diff --git a/.cursor/skills/alert-manager/SKILL.md b/.cursor/skills/alert-manager/SKILL.md
new file mode 100644
index 0000000000..3cece47ca5
--- /dev/null
+++ b/.cursor/skills/alert-manager/SKILL.md
@@ -0,0 +1,337 @@
+---
+name: alert-manager
+description: 'Configure SEO alerts for ranking drops, traffic changes, technical issues, competitor movements. SEO预警/排名监控'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when setting up monitoring alerts for rankings, traffic, backlinks, technical issues, or AI visibility changes."
+argument-hint: "<domain> [metric]"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "low"
+  tags:
+    - seo
+    - geo
+    - seo-alerts
+    - ranking-alerts
+    - traffic-monitoring
+    - competitor-alerts
+    - automated-monitoring
+    - anomaly-detection
+    - SEO预警
+    - SEOアラート
+    - SEO알림
+    - alertas-seo
+  triggers:
+    # EN-formal
+    - "set up SEO alerts"
+    - "monitor rankings"
+    - "ranking notifications"
+    - "traffic alerts"
+    - "competitor alerts"
+    - "automated monitoring"
+    # EN-casual
+    - "notify me when rankings drop"
+    - "alert me if rankings drop"
+    - "notify me of traffic changes"
+    - "watch competitor changes"
+    - "watch my keywords for changes"
+    - "alert me about changes"
+    # EN-question
+    - "how to monitor my rankings"
+    - "how to set up SEO alerts"
+    # ZH-pro
+    - "SEO预警"
+    - "排名监控"
+    - "流量报警"
+    - "竞品变动提醒"
+    # ZH-casual
+    - "排名掉了提醒我"
+    - "流量异常"
+    - "有变化通知我"
+    # JA
+    - "SEOアラート"
+    - "ランキング監視"
+    # KO
+    - "SEO 알림"
+    - "순위 모니터링"
+    # ES
+    - "alertas SEO"
+    - "monitoreo de rankings"
+    # PT
+    - "alertas de SEO"
+    # Misspellings
+    - "SEO allerts"
+---
+
+# Alert Manager
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This monitoring skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+Sets up proactive monitoring alerts for critical SEO and GEO metrics. Triggers notifications when rankings drop, traffic changes significantly, technical issues occur, or competitors make moves.
+
+**System role**: Monitoring layer skill. It turns performance changes into deltas, alerts, and next actions.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs time-aware change detection, escalation, or stakeholder-ready visibility.
+
+- Setting up SEO monitoring systems
+- Creating ranking drop alerts
+- Monitoring technical SEO health
+- Tracking competitor movements
+- Alerting on content performance changes
+- Monitoring GEO/AI visibility changes
+- Setting up brand mention alerts
+
+## What This Skill Does
+
+1. **Alert Configuration**: Sets up custom alert thresholds
+2. **Multi-Metric Monitoring**: Tracks rankings, traffic, technical issues
+3. **Threshold Management**: Defines when alerts trigger
+4. **Priority Classification**: Categorizes alerts by severity
+5. **Notification Setup**: Configures how alerts are delivered
+6. **Alert Response Plans**: Creates action plans for each alert type
+7. **Alert History**: Tracks alert patterns over time
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Set Up Alerts
+
+```
+Set up SEO monitoring alerts for [domain]
+```
+
+```
+Create ranking drop alerts for my top 20 keywords
+```
+
+### Configure Specific Alerts
+
+```
+Alert me when [specific condition]
+```
+
+```
+Set up competitor monitoring for [competitor domains]
+```
+
+### Review Alert System
+
+```
+Review and optimize my current SEO alerts
+```
+
+## Skill Contract
+
+**Expected output**: a delta summary, alert/report output, and a short handoff summary ready for `memory/monitoring/`.
+
+- **Reads**: current metrics, previous baselines, alert thresholds, and reporting context from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing monitoring deliverable plus a reusable summary that can be stored under `memory/monitoring/`.
+- **Promotes**: significant changes, confirmed anomalies, and follow-up actions to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when a change needs action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~search console + ~~web crawler connected:**
+Automatically monitor real-time metric feeds for ranking changes via ~~SEO tool API, indexing and coverage alerts from ~~search console, and technical health alerts from ~~web crawler. Set up automated threshold-based alerts with notification delivery.
+
+**With manual data only:**
+Ask the user to provide:
+1. Current baseline metrics for alert thresholds (rankings, traffic, backlinks)
+2. Critical keywords or pages to monitor
+3. Alert priority levels and notification preferences
+4. Historical data to understand normal fluctuation ranges
+5. Manual reporting on metric changes when they check their tools
+
+Proceed with the alert configuration using provided parameters. User will need to manually check metrics and report changes for alert triggers.
+
+## Instructions
+
+When a user requests alert setup:
+
+1. **Define Alert Categories**
+
+   ```markdown
+   ## SEO Alert System Configuration
+   
+   **Domain**: [domain]
+   **Configured Date**: [date]
+   
+   ### Alert Categories
+   
+   | Category | Description | Typical Urgency |
+   |----------|-------------|-----------------|
+   | Ranking Alerts | Keyword position changes | Medium-High |
+   | Traffic Alerts | Organic traffic fluctuations | High |
+   | Technical Alerts | Site health issues | Critical |
+   | Backlink Alerts | Link profile changes | Medium |
+   | Competitor Alerts | Competitor movements | Low-Medium |
+   | GEO Alerts | AI visibility changes | Medium |
+   | Brand Alerts | Brand mentions and reputation | Medium |
+   ```
+
+2. **Configure Alert Rules by Category**
+
+   For each relevant category (Rankings, Traffic, Technical, Backlinks, Competitors, GEO/AI, Brand), define alert name, trigger condition, threshold, and priority level.
+
+   > **Reference**: See [references/alert-configuration-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/alert-manager/references/alert-configuration-templates.md) for complete alert tables, threshold examples, and response plan templates for all 7 categories.
+
+3. **Define Alert Response Plans**
+
+   Map each priority level (Critical, High, Medium, Low) to a response time and immediate action steps.
+
+4. **Set Up Alert Delivery**
+
+   Configure notification channels (Email, SMS, Slack), recipient routing by role, suppression rules (duplicate cooldown, maintenance windows), and escalation paths.
+
+5. **Create Alert Summary**
+
+    ```markdown
+    # SEO Alert System Summary
+    
+    **Domain**: [domain]
+    **Configured**: [date]
+    **Total Active Alerts**: [X]
+    
+    ## Alert Count by Category
+    
+    | Category | Critical | High | Medium | Low | Total |
+    |----------|----------|------|--------|-----|-------|
+    | Rankings | [X] | [X] | [X] | [X] | [X] |
+    | Traffic | [X] | [X] | [X] | [X] | [X] |
+    | Technical | [X] | [X] | [X] | [X] | [X] |
+    | Backlinks | [X] | [X] | [X] | [X] | [X] |
+    | Competitors | [X] | [X] | [X] | [X] | [X] |
+    | GEO | [X] | [X] | [X] | [X] | [X] |
+    | **Total** | **[X]** | **[X]** | **[X]** | **[X]** | **[X]** |
+    
+    ## Quick Reference
+    
+    ### If You Get a Critical Alert
+    
+    1. Don't panic
+    2. Check alert details
+    3. Follow response plan
+    4. Document actions taken
+    5. Update stakeholders
+    
+    ### Weekly Alert Review Checklist
+    
+    - [ ] Review all alerts triggered
+    - [ ] Identify patterns
+    - [ ] Adjust thresholds if needed
+    - [ ] Update response plans
+    - [ ] Clean up false positives
+    ```
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Alert thresholds are based on realistic baseline data
+- [ ] Critical keywords and pages clearly identified
+- [ ] Response plans defined for each alert priority level
+- [ ] Notification channels configured with appropriate recipients
+
+### Output Validation
+- [ ] Every metric cites its data source and collection date
+- [ ] Alert thresholds account for normal metric fluctuations
+- [ ] Response plans are specific and time-bound
+- [ ] Source of each alert trigger clearly stated (~~SEO tool API alert, ~~search console notification, ~~web crawler alert, or manual user check)
+
+## Example
+
+**User**: "Set up ranking drop alerts for my top keywords"
+
+**Output**:
+
+```markdown
+## Ranking Alert Configuration
+
+### Critical Keywords (Immediate Alert)
+
+| Keyword | Current | Alert If | Priority |
+|---------|---------|----------|----------|
+| best project management software | 2 | Drops to 5+ | 🔴 Critical |
+| project management tools | 4 | Drops to 8+ | 🔴 Critical |
+| team collaboration software | 1 | Any drop | 🔴 Critical |
+
+### Important Keywords (Same-Day Alert)
+
+| Keyword | Current | Alert If | Priority |
+|---------|---------|----------|----------|
+| agile project management | 7 | Drops out of top 10 | 🔴 High |
+| kanban software | 9 | Drops out of top 10 | 🔴 High |
+
+### Alert Response Plan
+
+**If Critical Keyword Drops**:
+1. Check if page is still indexed (site:url)
+2. Look for algorithm update announcements
+3. Analyze what changed in SERP
+4. Review competitor ranking changes
+5. Check for technical issues on page
+6. Create recovery action plan within 24 hours
+
+**Notification**: Email + Slack to SEO team immediately
+```
+
+## Tips for Success
+
+1. **Start simple** - Don't create too many alerts initially
+2. **Tune thresholds** - Adjust based on normal fluctuations
+3. **Avoid alert fatigue** - Too many alerts = ignored alerts
+4. **Document response plans** - Know what to do when alerts fire
+5. **Review regularly** - Alerts need maintenance as your SEO matures
+6. **Include positive alerts** - Track wins, not just problems
+
+## Alert Threshold Quick Reference
+
+| Metric | Warning | Critical | Frequency |
+|--------|---------|----------|-----------|
+| Organic traffic | -15% WoW | -30% WoW | Daily |
+| Keyword positions | >3 position drop | >5 position drop | Daily |
+| Pages indexed | -5% change | -20% change | Weekly |
+| Crawl errors | >10 new/day | >50 new/day | Daily |
+| Core Web Vitals | "Needs Improvement" | "Poor" | Weekly |
+| Backlinks lost | >5% in 1 week | >15% in 1 week | Weekly |
+| AI citation loss | Any key query | >20% queries | Weekly |
+| Security issues | Any detected | Any detected | Daily |
+
+> **Reference**: See [references/alert-threshold-guide.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/alert-manager/references/alert-threshold-guide.md) for baseline establishment, threshold setting methodology, fatigue prevention, escalation paths, and response playbooks.
+
+
+### Save Results
+
+After delivering monitoring data or reports to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/monitoring/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding or status change
+- Top 3-5 actionable items
+- Open loops or anomalies requiring follow-up
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Alert Threshold Guide](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/alert-manager/references/alert-threshold-guide.md) — Recommended thresholds by metric, fatigue prevention strategies, and escalation path templates
+
+## Next Best Skill
+
+- **Primary**: [rank-tracker](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/rank-tracker/SKILL.md) — pair alerts with a baseline measurement workflow.
diff --git a/.cursor/skills/alert-manager/references/alert-configuration-templates.md b/.cursor/skills/alert-manager/references/alert-configuration-templates.md
new file mode 100644
index 0000000000..c962aed99a
--- /dev/null
+++ b/.cursor/skills/alert-manager/references/alert-configuration-templates.md
@@ -0,0 +1,293 @@
+# Alert Configuration Templates
+
+Detailed alert configuration templates for each alert category. Use these templates when setting up a new alert system for a domain.
+
+---
+
+## Ranking Alerts
+
+### Position Drop Alerts
+
+| Alert Name | Condition | Threshold | Priority | Action |
+|------------|-----------|-----------|----------|--------|
+| Critical Drop | Any top 3 keyword drops 5+ positions | Position change >=5 | Critical | Immediate investigation |
+| Major Drop | Top 10 keyword drops out of top 10 | Position >10 | High | Same-day review |
+| Moderate Drop | Any keyword drops 10+ positions | Position change >=10 | Medium | Weekly review |
+| Competitor Overtake | Competitor passes you for key term | Comp position < yours | Medium | Analysis needed |
+
+### Position Improvement Alerts
+
+| Alert Name | Condition | Threshold | Priority |
+|------------|-----------|-----------|----------|
+| New Top 3 | Keyword enters top 3 | Position <=3 | Positive |
+| Page 1 Entry | Keyword enters top 10 | Position <=10 | Positive |
+| Significant Climb | Keyword improves 10+ positions | Change >=+10 | Positive |
+
+### SERP Feature Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Snippet Lost | Lost featured snippet ownership | High |
+| Snippet Won | Won new featured snippet | Positive |
+| AI Overview Change | Appeared/disappeared in AI Overview | Medium |
+
+### Keywords to Monitor
+
+| Keyword | Current Rank | Alert Threshold | Priority |
+|---------|--------------|-----------------|----------|
+| [keyword 1] | [X] | Drop >=3 | Critical |
+| [keyword 2] | [X] | Drop >=5 | High |
+| [keyword 3] | [X] | Drop >=10 | Medium |
+
+---
+
+## Traffic Alerts
+
+### Traffic Decline Alerts
+
+| Alert Name | Condition | Threshold | Priority |
+|------------|-----------|-----------|----------|
+| Traffic Crash | Day-over-day decline | >=50% drop | Critical |
+| Significant Drop | Week-over-week decline | >=30% drop | High |
+| Moderate Decline | Month-over-month decline | >=20% drop | Medium |
+| Trend Warning | 3 consecutive weeks decline | Any decline | Medium |
+
+### Traffic Anomaly Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Traffic Spike | Unusual increase | Investigate |
+| Zero Traffic | Page receiving 0 visits | High |
+| Bot Traffic | Unusual traffic pattern | Medium |
+
+### Page-Level Alerts
+
+| Page Type | Alert Condition | Priority |
+|-----------|-----------------|----------|
+| Homepage | Any 20%+ decline | Critical |
+| Top 10 pages | Any 30%+ decline | High |
+| Conversion pages | Any 25%+ decline | High |
+| Blog posts | Any 40%+ decline | Medium |
+
+### Conversion Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Conversion Drop | Organic conversions down 30%+ | Critical |
+| CVR Decline | Conversion rate drops 20%+ | High |
+
+---
+
+## Technical SEO Alerts
+
+### Critical Technical Alerts
+
+| Alert Name | Condition | Priority | Response Time |
+|------------|-----------|----------|---------------|
+| Site Down | HTTP 5xx errors | Critical | Immediate |
+| SSL Expiry | Certificate expiring in 14 days | Critical | Same day |
+| Robots.txt Block | Important pages blocked | Critical | Same day |
+| Index Dropped | Pages dropping from index | Critical | Same day |
+
+### Crawl & Index Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Crawl Errors Spike | Errors increase 50%+ | High |
+| New 404 Pages | 404 errors on important pages | Medium |
+| Redirect Chains | 3+ redirect hops detected | Medium |
+| Duplicate Content | New duplicates detected | Medium |
+| Index Coverage Drop | Indexed pages decline 10%+ | High |
+
+### Performance Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Core Web Vitals Fail | CWV drops to "Poor" | High |
+| Page Speed Drop | Load time increases 50%+ | Medium |
+| Mobile Issues | Mobile usability errors | High |
+
+### Security Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Security Issue | GSC security warning | Critical |
+| Manual Action | Google manual action | Critical |
+| Malware Detected | Site flagged for malware | Critical |
+
+---
+
+## Backlink Alerts
+
+### Link Loss Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| High-Value Link Lost | DA 70+ link removed | High |
+| Multiple Links Lost | 10+ links lost in a day | Medium |
+| Referring Domain Lost | Lost entire domain's links | Medium |
+
+### Link Gain Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| High-Value Link | New DA 70+ link | Positive |
+| Suspicious Links | Many low-quality links | Review |
+| Negative SEO | Spam link attack pattern | High |
+
+### Link Profile Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Toxic Score Increase | Toxic score up 20%+ | High |
+| Anchor Over-Optimization | Exact match anchors >30% | Medium |
+
+---
+
+## Competitor Monitoring Alerts
+
+### Ranking Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Competitor Overtake | Competitor passes you | Medium |
+| Competitor Top 3 | Competitor enters top 3 on key term | Medium |
+| Competitor Content | Competitor publishes on your topic | Info |
+
+### Activity Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| New Backlinks | Competitor gains high-DA link | Info |
+| Content Update | Competitor updates ranking content | Info |
+| New Content | Competitor publishes new content | Info |
+
+### Competitors to Monitor
+
+| Competitor | Domain | Monitor Keywords | Alert Priority |
+|------------|--------|------------------|----------------|
+| [Competitor 1] | [domain] | [X] keywords | High |
+| [Competitor 2] | [domain] | [X] keywords | Medium |
+| [Competitor 3] | [domain] | [X] keywords | Low |
+
+---
+
+## GEO (AI Visibility) Alerts
+
+### AI Citation Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Citation Lost | Lost AI Overview citation | Medium |
+| Citation Won | New AI Overview citation | Positive |
+| Citation Position Drop | Dropped from 1st to 3rd+ source | Medium |
+| New AI Overview | AI Overview appears for tracked keyword | Info |
+
+### GEO Trend Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Citation Rate Drop | AI citation rate drops 20%+ | High |
+| GEO Competitor | Competitor cited where you're not | Medium |
+
+---
+
+## Brand Monitoring Alerts
+
+### Mention Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Brand Mention | New brand mention online | Info |
+| Negative Mention | Negative sentiment mention | High |
+| Review Alert | New review on key platforms | Medium |
+| Unlinked Mention | Brand mention without link | Opportunity |
+
+### Reputation Alerts
+
+| Alert Name | Condition | Priority |
+|------------|-----------|----------|
+| Review Rating Drop | Average rating drops | High |
+| Negative Press | Negative news article | High |
+| Competitor Comparison | Named in competitor comparison | Medium |
+
+---
+
+## Alert Response Plans
+
+### Critical Alert Response
+
+**Response Time**: Immediate (within 1 hour)
+
+| Alert Type | Immediate Actions |
+|------------|-------------------|
+| Site Down | 1. Check server status 2. Contact hosting 3. Check DNS |
+| Traffic Crash | 1. Check for algorithm update 2. Review GSC errors 3. Check competitors |
+| Manual Action | 1. Review GSC message 2. Identify issue 3. Begin remediation |
+| Critical Rank Drop | 1. Check if page indexed 2. Review SERP 3. Analyze competitors |
+
+### High Priority Response
+
+**Response Time**: Same day
+
+| Alert Type | Actions |
+|------------|---------|
+| Major Rank Drops | Analyze cause, create recovery plan |
+| Traffic Decline | Investigate source, check technical issues |
+| Backlink Loss | Attempt recovery outreach |
+| CWV Failure | Diagnose and fix performance issues |
+
+### Medium Priority Response
+
+**Response Time**: Within 48 hours
+
+| Alert Type | Actions |
+|------------|---------|
+| Moderate Rank Changes | Monitor trend, plan content updates |
+| Competitor Movement | Analyze competitor changes |
+| New 404s | Set up redirects, update internal links |
+
+### Low Priority
+
+**Response Time**: Weekly review
+
+| Alert Type | Actions |
+|------------|---------|
+| Positive Changes | Document wins, understand cause |
+| Info Alerts | Log for trend analysis |
+
+---
+
+## Alert Notification Setup
+
+### Notification Channels
+
+| Priority | Channels | Frequency |
+|----------|----------|-----------|
+| Critical | Email + SMS + Slack | Immediate |
+| High | Email + Slack | Immediate |
+| Medium | Email + Slack | Daily digest |
+| Low | Email | Weekly digest |
+
+### Alert Recipients
+
+| Role | Critical | High | Medium | Low |
+|------|----------|------|--------|-----|
+| SEO Manager | Yes | Yes | Yes | Yes |
+| Dev Team | Yes | Yes (tech only) | No | No |
+| Marketing Lead | Yes | Yes | No | No |
+| Executive | Yes | No | No | No |
+
+### Alert Suppression
+
+- Suppress duplicate alerts for 24 hours
+- Don't alert on known issues (maintenance windows)
+- Batch low-priority alerts into digests
+
+### Alert Escalation
+
+| If No Response In | Escalate To |
+|-------------------|-------------|
+| 1 hour (Critical) | SEO Manager -> Director |
+| 4 hours (High) | Team Lead -> Manager |
+| 24 hours (Medium) | Team -> Lead |
diff --git a/.cursor/skills/alert-manager/references/alert-threshold-guide.md b/.cursor/skills/alert-manager/references/alert-threshold-guide.md
new file mode 100644
index 0000000000..9b4099872e
--- /dev/null
+++ b/.cursor/skills/alert-manager/references/alert-threshold-guide.md
@@ -0,0 +1,377 @@
+# Alert Threshold Guide
+
+Complete reference for configuring SEO/GEO alert thresholds. Covers baseline establishment, threshold setting methodology, tuning process, alert routing configuration, notification channel setup, and response playbooks for each alert type.
+
+---
+
+## 1. Baseline Establishment Process
+
+Before setting any alert thresholds, you must establish a baseline that represents normal metric behavior for your site. Without a baseline, you will either set thresholds too tight (causing alert fatigue) or too loose (missing real problems).
+
+### Baseline Collection Timeline
+
+| Metric Category | Minimum Baseline Period | Ideal Baseline Period | Why |
+|----------------|------------------------|----------------------|-----|
+| Organic traffic | 4 weeks | 8-12 weeks | Accounts for weekly cycles and monthly patterns |
+| Keyword rankings | 2-4 weeks | 4-8 weeks | Rankings fluctuate daily; need to establish normal range |
+| Backlink metrics | 4 weeks | 8 weeks | Link acquisition is lumpy; need to see natural cadence |
+| Technical metrics | 2 weeks | 4 weeks | Most technical metrics are relatively stable |
+| Core Web Vitals | 4 weeks (28-day rolling) | 8 weeks | CrUX data is 28-day rolling average |
+| AI citations | 4 weeks | 8 weeks | AI answer composition changes frequently |
+
+### Baseline Data Collection Steps
+
+| Step | Action | Output |
+|------|--------|--------|
+| 1 | Record daily metric values for the baseline period | Raw data spreadsheet |
+| 2 | Calculate mean (average) for each metric | Central tendency |
+| 3 | Calculate standard deviation for each metric | Normal variation range |
+| 4 | Identify outliers (values > 2 standard deviations from mean) | Anomaly list |
+| 5 | Remove known outliers (holidays, outages, one-time events) | Clean baseline |
+| 6 | Recalculate mean and standard deviation on clean data | Final baseline values |
+| 7 | Document seasonal patterns if baseline covers enough time | Seasonal adjustment notes |
+
+### Baseline Metrics to Record
+
+| Metric | Daily | Weekly | Monthly |
+|--------|-------|--------|---------|
+| Organic sessions | Record | Calculate WoW % change | Calculate MoM % change |
+| Keyword positions (top 20) | Record | Calculate average movement | Calculate net position change |
+| Keywords in top 10 | Record | Calculate weekly count | Calculate monthly trend |
+| Crawl errors | Record | Calculate weekly new errors | Calculate monthly trend |
+| New backlinks | N/A | Record weekly count | Calculate monthly velocity |
+| Lost backlinks | N/A | Record weekly count | Calculate monthly velocity |
+| Core Web Vitals | N/A | Record from CrUX | Calculate monthly trend |
+| AI citations | N/A | Record weekly count | Calculate monthly trend |
+| Pages indexed | N/A | Record weekly count | Calculate monthly change |
+| Server response time | Record | Calculate weekly average | Calculate monthly average |
+
+---
+
+## 2. Threshold Setting Methodology
+
+### The Standard Deviation Method
+
+For most metrics, set thresholds based on standard deviations from your baseline mean.
+
+| Threshold Level | Formula | Meaning |
+|----------------|---------|---------|
+| **Info** | Mean +/- 1 standard deviation | Normal fluctuation range; log but do not alert |
+| **Warning** | Mean +/- 1.5 standard deviations | Unusual but not necessarily problematic |
+| **Critical** | Mean +/- 2 standard deviations | Statistically significant anomaly; investigate |
+| **Emergency** | Mean +/- 3 standard deviations | Extreme anomaly; immediate action required |
+
+**Example calculation:**
+
+```
+Metric: Daily organic sessions
+Baseline mean: 10,000 sessions/day
+Standard deviation: 800 sessions/day
+
+Info range:      8,200 - 11,800 (normal)
+Warning:         < 8,800 or > 11,200
+Critical:        < 8,400 or > 11,600
+Emergency:       < 7,600 or > 12,400
+```
+
+### The Percentage Method
+
+For metrics where standard deviation is not practical, use percentage-based thresholds.
+
+| Metric | Warning Threshold | Critical Threshold | Comparison Period |
+|--------|------------------|-------------------|-------------------|
+| Organic traffic | -15% vs. comparison | -30% vs. comparison | Week over week |
+| Keyword positions | >3 position average drop | >5 position average drop | Week over week |
+| Pages indexed | -5% change | -20% change | Week over week |
+| Referring domains | -5% loss | -15% loss | Month over month |
+| Crawl error rate | >2x baseline rate | >5x baseline rate | Day over day |
+| Conversion rate | -20% drop | -40% drop | Week over week |
+
+### The Absolute Value Method
+
+For binary or count-based metrics, use absolute thresholds.
+
+| Metric | Warning Threshold | Critical Threshold |
+|--------|------------------|-------------------|
+| New crawl errors | >10 new errors/day | >50 new errors/day |
+| Server 5xx errors | Any occurrence | >5 occurrences/hour |
+| Security issues | N/A | Any detection |
+| Manual penalties | N/A | Any notification |
+| SSL certificate expiry | <30 days to expiry | <7 days to expiry |
+| Robots.txt changes | Any unexpected change | Key pages blocked |
+
+---
+
+## 3. Threshold Configuration by Metric Category
+
+### Traffic Thresholds
+
+| Metric | Comparison | Warning | Critical | Emergency |
+|--------|-----------|---------|----------|-----------|
+| Total organic sessions | WoW | -15% | -30% | -50% |
+| Total organic sessions | DoD | -25% (weekday) | -40% | Site appears down |
+| Non-brand sessions | WoW | -20% | -35% | -50% |
+| Organic conversions | WoW | -20% | -40% | -60% |
+| Organic revenue | WoW | -15% | -30% | -50% |
+| Bounce rate | WoW | +10pp | +20pp | +30pp |
+| Page-level traffic (top 10 pages) | WoW | -25% | -40% | -60% |
+
+**Note:** Day-over-day traffic thresholds need day-of-week adjustment. Monday traffic typically differs from Saturday traffic. Compare Monday to Monday, not Monday to Sunday.
+
+### Ranking Thresholds
+
+| Metric | Scope | Warning | Critical |
+|--------|-------|---------|----------|
+| Position change (Tier 1 keywords) | Individual keyword | Drop >= 3 | Drop >= 5 |
+| Position change (Tier 2 keywords) | Individual keyword | Drop >= 5 | Drop >= 10 |
+| Position change (Tier 3 keywords) | Individual keyword | Drop >= 10 | Drop off page 3 |
+| Average position (all keywords) | Aggregate | +2.0 (worsening) | +5.0 (worsening) |
+| Keywords in top 10 | Count | -10% of count | -20% of count |
+| Keywords in top 3 | Count | Any decrease | -3 or more |
+| Brand keyword position | Individual | Any drop from #1 | Drops below #3 |
+| Featured snippet lost | Individual | Any loss | Loss of 3+ snippets |
+
+### Technical Thresholds
+
+| Metric | Warning | Critical | Emergency |
+|--------|---------|----------|-----------|
+| New 4xx errors | >5/day | >20/day | >100/day |
+| New 5xx errors | >1/day | >5/day | >20/day |
+| Crawl rate change | -30% vs. baseline | -60% vs. baseline | Near-zero crawl |
+| Index coverage drop | -5% | -15% | -30% |
+| Average server response time | >500ms | >1000ms | >2000ms |
+| LCP (mobile) | Moves to "Needs Improvement" | Moves to "Poor" | >6s |
+| CLS | >0.1 | >0.25 | >0.5 |
+| INP | >200ms | >500ms | >1000ms |
+| Robots.txt change | Any unexpected edit | Pages blocked | Entire site blocked |
+| Sitemap errors | New errors | Sitemap inaccessible | Sitemap returning 5xx |
+
+### Backlink Thresholds
+
+| Metric | Warning | Critical |
+|--------|---------|----------|
+| Referring domains lost (weekly) | >5% of total | >15% of total |
+| High-authority link lost (DR 60+) | Any loss | Loss of 3+ in one week |
+| Toxic link spike | >10 new toxic links/week | >50 new toxic links/week |
+| Anchor text over-optimization | Exact match reaches 20% | Exact match reaches 30% |
+| Negative SEO pattern | Unusual link velocity from low-DR sites | Massive spam link spike |
+
+### GEO / AI Visibility Thresholds
+
+| Metric | Warning | Critical |
+|--------|---------|----------|
+| AI citation rate | Drops 10+ percentage points | Drops below 10% |
+| Key query citation lost | Any Tier 1 query | 3+ Tier 1 queries |
+| Citation position degradation | Average position worsens by 2+ | Dropped from citations entirely |
+| Competitor gains citation you lost | 1 instance | Pattern across queries |
+
+---
+
+## 4. Alert Routing Configuration
+
+### Routing Matrix
+
+| Alert Category | P0 (Emergency) | P1 (Urgent) | P2 (Important) | P3 (Monitor) |
+|---------------|----------------|-------------|----------------|--------------|
+| **Traffic** | SEO Lead + Eng Manager + VP | SEO Lead + Marketing Mgr | SEO Team | Weekly digest |
+| **Rankings** | SEO Lead + Content Lead | SEO Team | SEO Team | Weekly digest |
+| **Technical** | SEO Lead + Eng Lead + DevOps | SEO Lead + Eng Team | SEO Team + Eng | Weekly digest |
+| **Backlinks** | SEO Lead | SEO Team | SEO Team | Weekly digest |
+| **Competitor** | N/A | SEO Lead | SEO Team | Weekly digest |
+| **GEO/AI** | SEO Lead + Content Lead | SEO Team | SEO Team | Weekly digest |
+| **Security** | SEO Lead + Eng Manager + VP + Legal | All above | N/A | N/A |
+
+### Role-Based Alert Filtering
+
+| Role | Receives | Does Not Receive |
+|------|---------|-----------------|
+| SEO Lead | All P0, P1, P2 alerts | P3 (weekly digest only) |
+| SEO Analyst | P1, P2 in their area | P0 (escalation only), other areas |
+| Content Lead | P0-P1 ranking + GEO alerts | Technical alerts, backlink alerts |
+| Engineering Lead | P0-P1 technical alerts | Ranking, content, backlink alerts |
+| Marketing VP | P0 only | P1-P3 (receives weekly summary) |
+| DevOps | P0 technical + security | All non-infrastructure alerts |
+
+---
+
+## 5. Notification Channel Setup
+
+### Channel Selection by Priority
+
+| Priority | Primary Channel | Secondary Channel | Escalation Channel |
+|----------|----------------|-------------------|-------------------|
+| P0 | SMS + Phone call | Slack (#seo-emergencies) | PagerDuty / on-call rotation |
+| P1 | Slack (#seo-alerts) | Email | SMS (if not acknowledged in 4h) |
+| P2 | Email | Slack (#seo-daily) | Auto-escalate to P1 after 1 week |
+| P3 | Weekly digest email | Dashboard | Auto-escalate to P2 after 1 month |
+
+### Notification Content Requirements
+
+Every alert notification should include:
+
+| Field | Required | Example |
+|-------|----------|---------|
+| Alert name | Yes | "Critical Ranking Drop" |
+| Priority level | Yes | "P0 — Emergency" |
+| Metric affected | Yes | "Position for 'project management software'" |
+| Current value | Yes | "Position 12" |
+| Previous value | Yes | "Position 3 (yesterday)" |
+| Threshold breached | Yes | "Dropped >5 positions" |
+| Timestamp | Yes | "2025-01-15 09:00 UTC" |
+| Affected URL | Yes (if applicable) | "yoursite.com/blog/pm-guide" |
+| Quick action link | Yes | Link to relevant tool/dashboard |
+| Suggested first step | Recommended | "Check if page is still indexed: site:yoursite.com/blog/pm-guide" |
+
+### Notification Suppression Rules
+
+| Rule | Configuration | Reason |
+|------|-------------|--------|
+| Duplicate cooldown | Do not re-alert on same metric for 24 hours | Prevent alert storms |
+| Maintenance window | Suppress non-security alerts during scheduled maintenance | Avoid known-cause alerts |
+| Weekend adjustment | Increase traffic thresholds by 20% on weekends | Weekend traffic naturally lower |
+| Holiday adjustment | Suppress traffic alerts on major holidays | Known seasonal impact |
+| Recovery auto-close | Auto-close alert if metric returns to normal within 48h | Reduce stale alerts |
+| Batch related alerts | Group multiple ranking drops into single "Ranking Alert" | Reduce notification volume |
+
+---
+
+## 6. Threshold Tuning Guide
+
+### When to Tune Thresholds
+
+| Signal | Action |
+|--------|--------|
+| Too many false positives (>30% of alerts are noise) | Widen thresholds by 0.5 standard deviations |
+| Missed a real problem | Tighten the specific threshold that should have caught it |
+| Seasonal change approaching | Adjust baselines for known seasonal patterns |
+| Major site change (redesign, migration) | Re-establish baseline from scratch (2-4 week observation) |
+| New competitor enters market | Add competitor monitoring, adjust ranking sensitivity |
+| After algorithm update | Let metrics stabilize for 2-4 weeks, then recalibrate |
+
+### Monthly Threshold Review Checklist
+
+| Check | Action |
+|-------|--------|
+| Review all alerts fired in the past month | Count true positives vs. false positives |
+| Calculate false positive rate | If >30%, thresholds are too tight |
+| Check for missed events | If a real issue was not alerted, threshold is too loose |
+| Review metric baselines | Recalculate mean and standard deviation with latest data |
+| Adjust seasonal baselines | Incorporate seasonal patterns from year-over-year data |
+| Update keyword tiers | Promote/demote keywords based on current business priority |
+| Verify notification routing | Confirm all recipients are still in the correct roles |
+| Test alert delivery | Send a test alert through each channel to verify delivery |
+
+### Threshold Evolution Over Time
+
+| Site Maturity | Threshold Approach | Rationale |
+|-------------|-------------------|-----------|
+| New site (0-6 months) | Wide thresholds, few alerts | Metrics are volatile; avoid noise |
+| Growing (6-18 months) | Moderate thresholds, expand coverage | Enough data for meaningful baselines |
+| Established (18+ months) | Tight thresholds, comprehensive | Stable baselines, can detect subtle changes |
+| Post-migration | Reset to wide, re-tighten over 4-8 weeks | Old baselines are invalid |
+
+---
+
+## 7. Playbook Templates by Alert Type
+
+### Playbook: Organic Traffic Emergency (P0)
+
+**Trigger:** Organic traffic drops >50% day-over-day
+
+| Step | Time | Action | Tool |
+|------|------|--------|------|
+| 1 | 0 min | Verify site is accessible from multiple locations | Manual browser check, uptime monitor |
+| 2 | 5 min | Check Google Search Status Dashboard for outages | Google Status Dashboard |
+| 3 | 10 min | Check Search Console for manual actions or security issues | ~~search console |
+| 4 | 15 min | Check robots.txt for accidental blocking | Direct URL check |
+| 5 | 20 min | Check for noindex tags added to key pages | Crawl or manual page inspection |
+| 6 | 30 min | Review recent deployments or CMS changes | Deploy log, git history |
+| 7 | 45 min | Check server logs for unusual patterns | Server access logs |
+| 8 | 60 min | If unresolved, escalate to Engineering Manager | Slack/phone |
+
+### Playbook: Security Alert (P0)
+
+**Trigger:** Google Search Console security issue or manual action
+
+| Step | Time | Action |
+|------|------|--------|
+| 1 | 0 min | Read the exact message in Search Console |
+| 2 | 5 min | Notify Engineering Manager and VP Marketing |
+| 3 | 15 min | Scan site for malware or injected content |
+| 4 | 30 min | If compromised: take affected pages offline, rotate all credentials |
+| 5 | 1 hour | Identify attack vector and patch vulnerability |
+| 6 | 2 hours | Clean all affected pages, submit for re-review |
+| 7 | 24 hours | Verify resolution in Search Console |
+| 8 | 1 week | Post-incident review and security hardening |
+
+### Playbook: Algorithm Update Impact (P1-P2)
+
+**Trigger:** Confirmed Google algorithm update + ranking/traffic changes
+
+| Step | Time | Action |
+|------|------|--------|
+| 1 | Day 0 | Confirm update via Google Search Status Dashboard or official channels |
+| 2 | Day 0 | Document pre-update baseline metrics (rankings, traffic, visibility) |
+| 3 | Day 1-3 | Monitor daily — do not make changes while update is rolling out |
+| 4 | Day 7 | First analysis: which pages/keywords improved, which declined |
+| 5 | Day 7 | Analyze pattern: content quality? link profile? technical? YMYL? |
+| 6 | Day 14 | Develop action plan based on analysis |
+| 7 | Day 14-60 | Implement improvements (content quality, E-E-A-T signals, technical fixes) |
+| 8 | Next update | Re-evaluate impact after next core update |
+
+### Playbook: Backlink Attack / Negative SEO (P1)
+
+**Trigger:** Unusual spike in low-quality backlinks (>100 new links from spam domains in one week)
+
+| Step | Time | Action |
+|------|------|--------|
+| 1 | Day 0 | Verify the spike in ~~link database |
+| 2 | Day 0 | Identify the pattern (same anchor text? same link network? same country?) |
+| 3 | Day 1 | Export all new toxic links |
+| 4 | Day 1 | Create disavow file with identified spam domains |
+| 5 | Day 2 | Upload disavow to Google Search Console |
+| 6 | Day 2 | Document the attack pattern for future reference |
+| 7 | Day 7 | Re-check for continued spam link activity |
+| 8 | Day 14 | Verify disavow processed, monitor rankings for impact |
+
+### Playbook: Core Web Vitals Degradation (P2)
+
+**Trigger:** Any CWV metric moves from "Good" to "Needs Improvement" or "Poor"
+
+| Step | Time | Action |
+|------|------|--------|
+| 1 | Day 0 | Identify which metric degraded and which page groups are affected |
+| 2 | Day 1 | Run PageSpeed Insights on representative pages |
+| 3 | Day 1 | Check recent deployments for potential cause (new scripts, images, layout changes) |
+| 4 | Day 2 | Create engineering ticket with diagnosis and fix recommendations |
+| 5 | Day 3-14 | Engineering implements fix |
+| 6 | Day 14 | Verify improvement in lab data (PageSpeed Insights) |
+| 7 | Day 42 | Verify improvement in field data (CrUX — 28-day rolling window) |
+
+---
+
+## 8. Alert System Maintenance
+
+### Quarterly System Review
+
+| Task | Frequency | Owner |
+|------|-----------|-------|
+| Recalculate all baselines with latest data | Quarterly | SEO Lead |
+| Review and update keyword tier assignments | Quarterly | SEO Team |
+| Audit notification routing (team changes, role changes) | Quarterly | SEO Lead |
+| Test all notification channels (SMS, Slack, email) | Quarterly | SEO Lead |
+| Review alert response times (are SLAs being met?) | Quarterly | SEO Lead |
+| Archive resolved alerts older than 90 days | Quarterly | SEO Analyst |
+| Update playbooks based on lessons learned | Quarterly | SEO Team |
+
+### Alert Effectiveness Metrics
+
+Track these metrics about your alerting system itself:
+
+| Metric | Target | Meaning |
+|--------|--------|---------|
+| False positive rate | <30% | % of alerts that were not actionable |
+| Mean time to acknowledge (MTTA) | P0: <15min, P1: <4h | Time from alert to first human response |
+| Mean time to resolve (MTTR) | P0: <2h, P1: <24h | Time from alert to resolution |
+| Missed incident rate | 0% | Real problems that were not alerted |
+| Alert volume per week | Manageable for team size | If overwhelming, thresholds need tuning |
diff --git a/.cursor/skills/backlink-analyzer/SKILL.md b/.cursor/skills/backlink-analyzer/SKILL.md
new file mode 100644
index 0000000000..ce0a2e7977
--- /dev/null
+++ b/.cursor/skills/backlink-analyzer/SKILL.md
@@ -0,0 +1,301 @@
+---
+name: backlink-analyzer
+description: 'Analyze backlink profiles: link authority, toxic links, building opportunities, competitor link gaps. 外链分析/反向链接'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when analyzing backlink profiles, link quality, toxic links, referring domains, or anchor text distribution."
+argument-hint: "<domain or URL>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "low"
+  tags:
+    - seo
+    - backlinks
+    - link-building
+    - link-profile
+    - toxic-links
+    - off-page-seo
+    - link-audit
+    - referring-domains
+    - disavow
+    - ahrefs-alternative
+    - 外链分析
+    - 被リンク
+    - 백링크
+    - backlinks-seo
+  triggers:
+    # EN-formal
+    - "analyze backlinks"
+    - "check link profile"
+    - "find toxic links"
+    - "link building opportunities"
+    - "link profile analysis"
+    - "backlink audit"
+    - "link quality"
+    # EN-casual
+    - "who links to me"
+    - "I have spammy links"
+    - "how do I get more backlinks"
+    - "how do I get more links"
+    - "disavow links"
+    - "link building outreach"
+    - "disavow file"
+    # EN-question
+    - "how to build backlinks"
+    - "how to find toxic backlinks"
+    # ZH-pro
+    - "外链分析"
+    - "反向链接"
+    - "有毒链接"
+    - "链接建设"
+    # ZH-casual
+    - "外链怎么做"
+    - "有垃圾外链"
+    - "谁链接到我"
+    - "友链"
+    - "互换友链"
+    - "外链建设"
+    # JA
+    - "被リンク分析"
+    - "バックリンク"
+    - "リンク構築"
+    # KO
+    - "백링크 분석"
+    - "링크 빌딩"
+    - "누가 내 사이트 링크해?"
+    - "백링크 어떻게 늘려?"
+    # ES
+    - "análisis de backlinks"
+    - "enlaces entrantes"
+    # PT
+    - "análise de backlinks"
+    # Misspellings
+    - "backlink anaylsis"
+    - "backlnk analysis"
+---
+
+# Backlink Analyzer
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This monitoring skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+Analyzes, monitors, and optimizes backlink profiles. Identifies link quality, discovers opportunities, and tracks competitor link building activities.
+
+**System role**: Monitoring layer skill. It turns performance changes into deltas, alerts, and next actions.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs time-aware change detection, escalation, or stakeholder-ready visibility.
+
+- Auditing your current backlink profile
+- Identifying toxic or harmful links
+- Discovering link building opportunities
+- Analyzing competitor backlink strategies
+- Monitoring new and lost links
+- Evaluating link quality for outreach
+- Preparing for link disavow
+
+## What This Skill Does
+
+1. **Profile Analysis**: Comprehensive backlink profile overview
+2. **Quality Assessment**: Evaluates link authority and relevance
+3. **Toxic Link Detection**: Identifies harmful links
+4. **Competitor Analysis**: Compares link profiles across competitors
+5. **Opportunity Discovery**: Finds link building prospects
+6. **Trend Monitoring**: Tracks link acquisition over time
+7. **Disavow Guidance**: Helps create disavow files
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Analyze Your Profile
+
+```
+Analyze backlink profile for [domain]
+```
+
+### Find Opportunities
+
+```
+Find link building opportunities by analyzing [competitor domains]
+```
+
+### Detect Issues
+
+```
+Check for toxic backlinks on [domain]
+```
+
+### Compare Profiles
+
+```
+Compare backlink profiles: [your domain] vs [competitor domains]
+```
+
+## Skill Contract
+
+**Expected output**: a delta summary, alert/report output, and a short handoff summary ready for `memory/monitoring/`.
+
+- **Reads**: current metrics, previous baselines, alert thresholds, and reporting context from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing monitoring deliverable plus a reusable summary that can be stored under `memory/monitoring/`.
+- **Promotes**: significant changes, confirmed anomalies, and follow-up actions to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when a change needs action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~link database + ~~SEO tool connected:**
+Automatically pull comprehensive backlink profiles including referring domains, anchor text distribution, link quality metrics (DA/DR), link velocity, and toxic link detection from ~~link database. Competitor backlink data from ~~SEO tool for gap analysis.
+
+**With manual data only:**
+Ask the user to provide:
+1. Backlink export CSV (with source domains, anchor text, link type)
+2. Referring domains list with authority metrics
+3. Competitor domains for comparison
+4. Recent link gains/losses if tracking changes
+5. Any known toxic or spammy links
+
+Proceed with the full analysis using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests backlink analysis:
+
+1. **Generate Profile Overview** -- Key metrics (total backlinks, referring domains, DA/DR, dofollow ratio), link velocity (30d/90d/year), authority distribution chart, profile health score.
+
+2. **Analyze Link Quality** -- Top quality backlinks table, link type distribution, anchor text analysis (brand/exact/partial/URL/generic), geographic distribution.
+
+3. **Identify Toxic Links** -- Toxic score, risk indicators by type (spam, PBN, link farms, irrelevant), high-risk links to review, disavow recommendations (domain-level and URL-level).
+
+4. **Compare Against Competitors** -- Profile comparison table (referring domains, DA/DR, velocity, avg link DA), unique referring domains, link intersection analysis, competitor content attracting most links.
+
+5. **Find Link Building Opportunities** -- Link intersection prospects, broken link opportunities, unlinked mentions, resource page opportunities, guest post prospects, priority matrix (effort vs impact).
+
+6. **Track Link Changes** -- New and lost links for last 30 days with DA, type, anchor, dates. Net change and links to recover.
+
+7. **Generate Backlink Report** -- Executive summary, strengths, concerns, opportunities, competitive position, recommended actions (immediate/short-term/long-term), KPIs to track.
+
+   > **Reference**: See [references/analysis-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/backlink-analyzer/references/analysis-templates.md) for complete output templates for all 7 steps above.
+
+### CITE Item Mapping
+
+When running `domain-authority-auditor` after this analysis, the following data feeds directly into CITE scoring:
+
+| Backlink Metric | CITE Item | Dimension |
+|----------------|-----------|-----------|
+| Referring domains count | C01 (Referring Domain Volume) | Citation |
+| Authority distribution (DA breakdown) | C02 (Referring Domains Quality) | Citation |
+| Link velocity | C04 (Link Velocity) | Citation |
+| Geographic distribution | C10 (Link Source Diversity) | Citation |
+| Dofollow/Nofollow ratio | T02 (Dofollow Ratio Normality) | Trust |
+| Toxic link analysis | T01 (Link Profile Naturalness), T03 (Link-Traffic Coherence) | Trust |
+| Competitive link intersection | T05 (Profile Uniqueness) | Trust |
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Target domain backlink data is complete and current
+- [ ] Competitor domains specified for comparison analysis
+- [ ] Backlink data includes necessary fields (source domain, anchor text, link type)
+- [ ] Authority metrics available (DA/DR or equivalent)
+
+### Output Validation
+- [ ] Every metric cites its data source and collection date
+- [ ] Toxic link assessments include risk justification
+- [ ] Link opportunity recommendations are specific and actionable
+- [ ] Source of each data point clearly stated (~~link database data, ~~SEO tool data, user-provided, or estimated)
+
+## Example
+
+**User**: "Find link building opportunities by analyzing HubSpot, Salesforce, and Mailchimp"
+
+**Output**:
+
+```markdown
+## Link Intersection Analysis
+
+### Sites linking to 2+ competitors (not you)
+
+| Domain | DA | HubSpot | Salesforce | Mailchimp | Opportunity |
+|--------|-----|---------|------------|-----------|-------------|
+| g2.com | 91 | ✅ | ✅ | ✅ | Get listed/reviewed |
+| capterra.com | 89 | ✅ | ✅ | ✅ | Submit for review |
+| entrepreneur.com | 92 | ✅ | ✅ | ❌ | Pitch guest post |
+| techcrunch.com | 94 | ✅ | ❌ | ✅ | PR/news pitch |
+
+### Top 5 Immediate Opportunities
+
+1. **G2.com** (DA 91) - All competitors listed
+   - Action: Create detailed G2 profile
+   - Effort: Low
+   - Impact: High authority + referral traffic
+
+2. **Entrepreneur.com** (DA 92) - 2 competitors have links
+   - Action: Pitch contributed article
+   - Effort: High
+   - Impact: High authority + brand exposure
+
+3. **MarketingProfs** (DA 75) - All competitors featured
+   - Action: Apply for expert contribution
+   - Effort: Medium
+   - Impact: Relevant audience + quality link
+
+### Estimated Impact
+
+If you acquire links from top 10 opportunities:
+- New referring domains: +10
+- Average DA of new links: 82
+- Estimated ranking impact: +2-5 positions for competitive keywords
+```
+
+## Tips for Success
+
+1. **Quality over quantity** - One DA 80 link beats ten DA 20 links
+2. **Monitor regularly** - Catch lost links and toxic links early
+3. **Study competitors** - Learn from their link building success
+4. **Diversify your profile** - Mix of link types and anchors
+5. **Disavow carefully** - Only disavow clearly toxic links
+
+## Link Quality and Strategy Reference
+
+> **Reference**: See [references/link-quality-rubric.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/backlink-analyzer/references/link-quality-rubric.md) for the complete link quality scoring matrix (6 weighted factors), toxic link identification criteria, link profile health benchmarks, and disavow file guidance.
+
+> **Reference**: See [references/outreach-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/backlink-analyzer/references/outreach-templates.md) for email outreach frameworks, subject line formulas, response rate benchmarks, follow-up sequences, and templates for each link building strategy.
+
+
+### Save Results
+
+After delivering monitoring data or reports to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/monitoring/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding or status change
+- Top 3-5 actionable items
+- Open loops or anomalies requiring follow-up
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+
+**Gate check recommended**: If toxic link ratio exceeds 15%, recommend running domain-authority-auditor to assess overall domain trust impact.
+
+## Reference Materials
+
+- [Link Quality Rubric](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/backlink-analyzer/references/link-quality-rubric.md) — Quality scoring matrix with weighted factors and toxic link identification criteria
+- [Outreach Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/backlink-analyzer/references/outreach-templates.md) — Email frameworks, subject line formulas, and response rate benchmarks
+
+## Next Best Skill
+
+- **Primary**: [domain-authority-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/domain-authority-auditor/SKILL.md) — translate link findings into a domain-level trust view.
diff --git a/.cursor/skills/backlink-analyzer/references/analysis-templates.md b/.cursor/skills/backlink-analyzer/references/analysis-templates.md
new file mode 100644
index 0000000000..d1809dc291
--- /dev/null
+++ b/.cursor/skills/backlink-analyzer/references/analysis-templates.md
@@ -0,0 +1,354 @@
+# Backlink Analysis Output Templates
+
+Detailed output templates for each step of the backlink analysis workflow. Use these templates when generating analysis deliverables.
+
+---
+
+## 1. Profile Overview Template
+
+```markdown
+## Backlink Profile Overview
+
+**Domain**: [domain]
+**Analysis Date**: [date]
+
+### Key Metrics
+
+| Metric | Value | Industry Avg | Status |
+|--------|-------|--------------|--------|
+| Total Backlinks | [X] | [Y] | [Above/Below avg] |
+| Referring Domains | [X] | [Y] | [status] |
+| Domain Authority | [X] | [Y] | [status] |
+| Domain Rating | [X] | [Y] | [status] |
+| Dofollow Links | [X] ([Y]%) | [Z]% | [status] |
+| Nofollow Links | [X] ([Y]%) | [Z]% | [status] |
+
+### Link Velocity
+
+| Period | New Links | Lost Links | Net Change |
+|--------|-----------|------------|------------|
+| Last 30 days | [X] | [Y] | [+/-Z] |
+| Last 90 days | [X] | [Y] | [+/-Z] |
+| Last year | [X] | [Y] | [+/-Z] |
+
+### Authority Distribution
+
+```
+DA 80-100: [X]%
+DA 60-79:  [X]%
+DA 40-59:  [X]%
+DA 20-39:  [X]%
+DA 0-19:   [X]%
+```
+
+**Profile Health Score**: [X]/100
+```
+
+---
+
+## 2. Link Quality Analysis Template
+
+```markdown
+## Link Quality Analysis
+
+### Top Quality Backlinks
+
+| Source Domain | DA | Link Type | Anchor | Target Page |
+|---------------|-----|-----------|--------|-------------|
+| [domain 1] | [DA] | Editorial | [anchor] | [page] |
+| [domain 2] | [DA] | Guest Post | [anchor] | [page] |
+| [domain 3] | [DA] | Resource | [anchor] | [page] |
+
+### Link Type Distribution
+
+| Type | Count | Percentage | Assessment |
+|------|-------|------------|------------|
+| Editorial | [X] | [Y]% | High quality |
+| Guest posts | [X] | [Y]% | Good |
+| Resource pages | [X] | [Y]% | Good |
+| Directory | [X] | [Y]% | Moderate |
+| Forum/Comments | [X] | [Y]% | Low quality |
+| Sponsored/Paid | [X] | [Y]% | Risky |
+
+### Anchor Text Analysis
+
+| Anchor Type | Count | Percentage | Status |
+|-------------|-------|------------|--------|
+| Brand name | [X] | [Y]% | Natural |
+| Exact match | [X] | [Y]% | [Warning if >30%] |
+| Partial match | [X] | [Y]% | Natural |
+| URL/Naked | [X] | [Y]% | Natural |
+| Generic | [X] | [Y]% | Natural |
+
+**Top Anchor Texts**:
+1. "[anchor 1]" - [X] links
+2. "[anchor 2]" - [X] links
+3. "[anchor 3]" - [X] links
+
+### Geographic Distribution
+
+| Country | Links | Percentage |
+|---------|-------|------------|
+| [Country 1] | [X] | [Y]% |
+| [Country 2] | [X] | [Y]% |
+| [Country 3] | [X] | [Y]% |
+```
+
+---
+
+## 3. Toxic Link Analysis Template
+
+```markdown
+## Toxic Link Analysis
+
+### Risk Summary
+
+**Toxic Score**: [X]/100
+**High Risk Links**: [X]
+**Medium Risk Links**: [X]
+**Action Required**: [Yes/No]
+
+### Toxic Link Indicators
+
+| Risk Type | Count | Examples |
+|-----------|-------|----------|
+| Spammy domains | [X] | [domains] |
+| Link farms | [X] | [domains] |
+| PBN suspected | [X] | [domains] |
+| Irrelevant sites | [X] | [domains] |
+| Foreign language spam | [X] | [domains] |
+| Penalized domains | [X] | [domains] |
+
+### High-Risk Links to Review
+
+| Source Domain | Risk Score | Issue | Recommendation |
+|---------------|------------|-------|----------------|
+| [domain 1] | 95/100 | Link farm | Disavow |
+| [domain 2] | 85/100 | Spam site | Disavow |
+| [domain 3] | 72/100 | PBN | Investigate |
+
+### Disavow Recommendations
+
+**Domains to disavow** ([X] total):
+```
+domain:[spam-site-1.com]
+domain:[spam-site-2.com]
+domain:[link-farm.com]
+```
+
+**Individual URLs to disavow** ([X] total):
+```
+[specific-url-1]
+[specific-url-2]
+```
+```
+
+---
+
+## 4. Competitive Backlink Analysis Template
+
+```markdown
+## Competitive Backlink Analysis
+
+### Profile Comparison
+
+| Metric | You | Competitor 1 | Competitor 2 | Competitor 3 |
+|--------|-----|--------------|--------------|--------------|
+| Referring Domains | [X] | [X] | [X] | [X] |
+| Domain Authority | [X] | [X] | [X] | [X] |
+| Domain Rating | [X] | [X] | [X] | [X] |
+| Link Velocity (30d) | [X] | [X] | [X] | [X] |
+| Avg Link DA | [X] | [X] | [X] | [X] |
+
+### Unique Referring Domains
+
+**Links only you have**: [X] domains
+**Links competitors share**: [X] domains
+**Links competitors have, you don't**: [X] domains -- Opportunity
+
+### Link Intersection Analysis
+
+**Sites linking to competitors but not you**:
+
+| Domain | DA | Links to Comp 1 | Comp 2 | Comp 3 | Opportunity |
+|--------|-----|-----------------|--------|--------|-------------|
+| [domain 1] | [DA] | Yes | Yes | Yes | High - All competitors |
+| [domain 2] | [DA] | Yes | Yes | No | High - 2 competitors |
+| [domain 3] | [DA] | Yes | No | No | Medium - 1 competitor |
+
+### Content Getting Most Links (Competitor Analysis)
+
+| Competitor | Content | Backlinks | Content Type |
+|------------|---------|-----------|--------------|
+| [Comp 1] | [Title/URL] | [X] | [Type] |
+| [Comp 2] | [Title/URL] | [X] | [Type] |
+| [Comp 3] | [Title/URL] | [X] | [Type] |
+
+**Insight**: [What content types attract most links in this niche]
+```
+
+---
+
+## 5. Link Building Opportunities Template
+
+```markdown
+## Link Building Opportunities
+
+### High-Priority Opportunities
+
+#### 1. Link Intersection Prospects
+
+Sites linking to multiple competitors but not you:
+
+| Domain | DA | Why Link | Contact Approach |
+|--------|-----|----------|------------------|
+| [domain 1] | [DA] | [resource page about X] | Suggest your resource |
+| [domain 2] | [DA] | [links to similar tools] | Pitch your tool |
+| [domain 3] | [DA] | [industry roundup] | Request inclusion |
+
+#### 2. Broken Link Opportunities
+
+| Source Page | Broken Link | Suggested Replacement |
+|-------------|-------------|----------------------|
+| [URL] | [broken URL] | [your relevant page] |
+
+#### 3. Unlinked Mentions
+
+| Site | Mention | Your Page to Link |
+|------|---------|-------------------|
+| [domain] | Mentioned your brand | [homepage] |
+| [domain] | Referenced your data | [research page] |
+
+#### 4. Resource Page Opportunities
+
+| Resource Page | Topic | Your Relevant Content |
+|---------------|-------|----------------------|
+| [URL] | [topic] | [your content] |
+
+#### 5. Guest Post Prospects
+
+| Site | DA | Topic Fit | Contact |
+|------|-----|-----------|---------|
+| [domain] | [DA] | [relevance] | [contact info/page] |
+
+### Link Building Priority Matrix
+
+| Opportunity Type | Effort | Impact | Priority |
+|------------------|--------|--------|----------|
+| Link intersection | Medium | High | Highest |
+| Broken links | Low | Medium | High |
+| Unlinked mentions | Low | Medium | High |
+| Resource pages | Medium | High | High |
+| Guest posts | High | High | Medium |
+```
+
+---
+
+## 6. Link Change Tracking Template
+
+```markdown
+## Link Change Tracking
+
+### New Links (Last 30 Days)
+
+| Source | DA | Type | Anchor | Date |
+|--------|-----|------|--------|------|
+| [domain 1] | [DA] | [type] | [anchor] | [date] |
+
+**Total new links**: [X]
+**Average DA of new links**: [X]
+**Best new link**: [domain] (DA [X])
+
+### Lost Links (Last 30 Days)
+
+| Source | DA | Reason | Action |
+|--------|-----|--------|--------|
+| [domain 1] | [DA] | Page removed | Reach out |
+| [domain 2] | [DA] | Link removed | Investigate |
+
+**Total lost links**: [X]
+**Net change**: [+/-X]
+
+### Links to Recover
+
+| Lost Link | Value | Recovery Strategy |
+|-----------|-------|-------------------|
+| [domain 1] | High | Contact webmaster |
+| [domain 2] | High | Update content they linked to |
+```
+
+---
+
+## 7. Backlink Report Template
+
+```markdown
+# Backlink Analysis Report
+
+**Domain**: [domain]
+**Report Date**: [date]
+**Period Analyzed**: [period]
+
+## Executive Summary
+
+Your backlink profile is [healthy/needs attention/concerning].
+
+**Key Stats**:
+- Referring domains: [X] ([+/-Y] vs last month)
+- Average link authority: [X] DA
+- Link velocity: [X] new links/month
+- Toxic link percentage: [X]%
+
+## Profile Strengths
+
+1. [Strength 1]
+2. [Strength 2]
+3. [Strength 3]
+
+## Areas of Concern
+
+1. [Concern 1]
+2. [Concern 2]
+
+## Opportunities Identified
+
+| Opportunity | Potential Links | Effort | Priority |
+|-------------|-----------------|--------|----------|
+| Link intersection | [X] sites | Medium | High |
+| Broken links | [X] sites | Low | High |
+| Resource pages | [X] sites | Medium | Medium |
+
+## Competitive Position
+
+Your referring domains rank #[X] among [Y] competitors.
+
+| Rank | Domain | Referring Domains |
+|------|--------|-------------------|
+| 1 | [domain] | [X] |
+| 2 | [domain] | [X] |
+| 3 | [domain] | [X] |
+
+## Recommended Actions
+
+### Immediate (This Week)
+- [ ] Disavow [X] toxic links identified
+- [ ] Reach out to [X] unlinked mentions
+
+### Short-term (This Month)
+- [ ] Pursue [X] link intersection opportunities
+- [ ] Fix [X] broken link opportunities
+- [ ] Recover [X] recently lost links
+
+### Long-term (This Quarter)
+- [ ] Create linkable asset targeting [topic]
+- [ ] Launch guest posting campaign
+- [ ] Build [X] resource page links
+
+## KPIs to Track
+
+| Metric | Current | 3-Month Target |
+|--------|---------|----------------|
+| Referring domains | [X] | [Y] |
+| Average DA of new links | [X] | [Y] |
+| Link velocity | [X]/mo | [Y]/mo |
+| Toxic link % | [X]% | <5% |
+```
diff --git a/.cursor/skills/backlink-analyzer/references/link-quality-rubric.md b/.cursor/skills/backlink-analyzer/references/link-quality-rubric.md
new file mode 100644
index 0000000000..03e801a492
--- /dev/null
+++ b/.cursor/skills/backlink-analyzer/references/link-quality-rubric.md
@@ -0,0 +1,292 @@
+# Link Quality Rubric
+
+Comprehensive reference for evaluating backlink quality. Use this rubric to assess individual links, audit entire link profiles, perform competitive link gap analysis, and prepare disavow files.
+
+---
+
+## 1. Individual Link Quality Evaluation
+
+### Scoring Methodology
+
+Evaluate each link across six factors. Multiply score (1-5) by factor weight to produce a weighted score. Sum all weighted scores for a final Link Quality Score (LQS).
+
+**Rating Scale:**
+- **LQS 4.0-5.0**: Premium link — high authority, topically relevant, editorial placement
+- **LQS 2.5-3.9**: Acceptable link — provides value, typical of healthy profiles
+- **LQS 1.0-2.4**: Low quality — minimal value, review for potential risk
+
+### Factor 1: Domain Authority (25% weight)
+
+| Score | DR / DA Range | Characteristics | Examples |
+|-------|-------------|-----------------|---------|
+| 5 | DR 70+ | Major publication, established authority | NYTimes, Forbes, BBC, major university sites |
+| 4 | DR 50-69 | Strong domain, recognized in industry | Industry publications, large blogs, government sites |
+| 3 | DR 30-49 | Moderate authority, established site | Mid-tier blogs, regional publications, niche authorities |
+| 2 | DR 15-29 | Low authority, newer or smaller site | Small blogs, newer companies, personal sites |
+| 1 | DR <15 | Very low authority | New sites, abandoned sites, thin content sites |
+
+**Notes:**
+- DR/DA is a proxy, not the sole indicator. A DR 30 site that is highly relevant to your niche may be more valuable than a DR 70 site in an unrelated field.
+- Check if the domain's authority is organic (earned over time) or inflated (bought links, PBN).
+
+### Factor 2: Topical Relevance (25% weight)
+
+| Score | Relevance Level | Description |
+|-------|----------------|-------------|
+| 5 | Exact match | Same niche, same subtopic. A link from a CRM review site to your CRM product. |
+| 4 | Closely related | Same industry, adjacent topic. A marketing blog linking to your email tool. |
+| 3 | Broadly related | Same general field. A business blog linking to your SaaS product. |
+| 2 | Tangentially related | Loose connection. A general news site mentioning your product in a tech roundup. |
+| 1 | Unrelated | No topical connection. A cooking blog linking to your B2B software. |
+
+**How to assess relevance:**
+1. Read the linking page content. Is it about your topic?
+2. Check the linking site's overall focus. Is it in your industry?
+3. Look at the surrounding content. Does the link make editorial sense?
+4. Check the site's other outbound links. Are they topically coherent?
+
+### Factor 3: Traffic to Linking Page (15% weight)
+
+| Score | Estimated Monthly Traffic | Characteristics |
+|-------|--------------------------|-----------------|
+| 5 | 10,000+ visits/month | High-traffic page, likely drives referral traffic |
+| 4 | 1,000-9,999 visits/month | Solid traffic, some referral value |
+| 3 | 100-999 visits/month | Moderate traffic, primarily SEO value |
+| 2 | 10-99 visits/month | Low traffic, SEO value only |
+| 1 | <10 visits/month | No meaningful traffic, minimal value |
+
+**Why traffic matters:**
+- Links from pages with real traffic are more likely to be genuine editorial placements.
+- Google likely weights links from pages that receive traffic more highly.
+- Referral traffic from the link provides direct business value beyond SEO.
+
+### Factor 4: Link Position (15% weight)
+
+| Score | Position | Description |
+|-------|----------|-------------|
+| 5 | In-content, editorial | Naturally placed within the article body as a citation or resource |
+| 4 | In-content, contextual | Within the body text but in a "resources" or "further reading" section |
+| 3 | Author bio or about section | Part of a contributor's bio or about page |
+| 2 | Sidebar or dedicated links section | Widget, blogroll, or sidebar placement |
+| 1 | Footer, sitewide, or hidden | Footer link, sitewide template link, or visually obscured |
+
+**Key principle:** Editorial in-content links carry the most weight because they represent a genuine endorsement. Footer and sitewide links are devalued by search engines.
+
+### Factor 5: Anchor Text (10% weight)
+
+| Score | Anchor Type | Example (for a CRM product) |
+|-------|------------|----------------------------|
+| 5 | Descriptive, natural | "this customer relationship management platform" |
+| 4 | Partial match, natural | "CRM tools for small businesses" |
+| 3 | Brand name | "Acme CRM" |
+| 2 | Naked URL | "https://acmecrm.com" |
+| 1 | Generic | "click here", "read more", "this website" |
+
+**Important nuance:** A natural link profile has a MIX of all anchor types. Too many exact-match anchors (score 5) can signal manipulation. The ideal distribution is:
+- Brand anchors: 30-40%
+- Naked URLs: 15-25%
+- Generic anchors: 10-20%
+- Descriptive/partial match: 15-25%
+- Exact match: 5-15%
+
+### Factor 6: Follow Status (10% weight)
+
+| Score | Status | Description |
+|-------|--------|-------------|
+| 5 | Dofollow, editorial | Standard followed link from editorial content |
+| 4 | Dofollow, non-editorial | Followed link from directory, profile, or user-generated content |
+| 3 | Sponsored (rel="sponsored") | Properly disclosed sponsored/paid link |
+| 2 | UGC (rel="ugc") | User-generated content link (forums, comments) |
+| 1 | Nofollow (rel="nofollow") | Explicitly nofollowed link |
+
+**Notes:**
+- Google treats nofollow as a "hint" rather than a directive since 2019.
+- Nofollow links from high-authority sites (e.g., Wikipedia) still provide brand value and referral traffic.
+- A healthy profile naturally includes a mix of followed and nofollowed links. Typical ratio: 60-80% dofollow, 20-40% nofollow.
+
+---
+
+## 2. Example Link Profile Assessments
+
+### Example A: Strong Link Profile
+
+| Characteristic | Value | Assessment |
+|---------------|-------|-----------|
+| Total referring domains | 1,200 | Healthy for a mid-size SaaS company |
+| Dofollow ratio | 72% | Natural distribution |
+| Average linking domain DR | 38 | Solid average authority |
+| Top anchor: brand name | 35% | Natural brand dominance |
+| Exact match anchors | 8% | Within safe range |
+| Topical relevance (sampled) | 75% related | Strong relevance signal |
+| Link velocity | +25/month net | Steady organic growth |
+| Toxic link estimate | 3% | Below 5% threshold — healthy |
+
+**Verdict:** Healthy profile with natural link distribution. Continue current strategy.
+
+### Example B: At-Risk Link Profile
+
+| Characteristic | Value | Assessment |
+|---------------|-------|-----------|
+| Total referring domains | 800 | Adequate but thin for competitive niche |
+| Dofollow ratio | 92% | Suspiciously high — may indicate link manipulation |
+| Average linking domain DR | 18 | Low average authority |
+| Top anchor: exact match keyword | 42% | Over-optimized — risk of penalty |
+| Exact match anchors | 42% | Far above safe threshold (>15%) |
+| Topical relevance (sampled) | 30% related | Many irrelevant links |
+| Link velocity | +80/month net | Unnaturally high — investigate |
+| Toxic link estimate | 18% | Above 10% threshold — action needed |
+
+**Verdict:** Profile shows signs of manipulation. Immediate actions needed: disavow toxic links, diversify anchor text, slow down link acquisition pace.
+
+### Example C: New Site Link Profile
+
+| Characteristic | Value | Assessment |
+|---------------|-------|-----------|
+| Total referring domains | 45 | Expected for a 6-month-old site |
+| Dofollow ratio | 65% | Natural |
+| Average linking domain DR | 28 | Reasonable for early-stage outreach |
+| Top anchor: brand name | 40% | Healthy |
+| Exact match anchors | 5% | Conservative and safe |
+| Topical relevance (sampled) | 80% related | Well-targeted outreach |
+| Link velocity | +8/month net | Appropriate for new site |
+| Toxic link estimate | 1% | Clean profile |
+
+**Verdict:** Healthy foundation. Focus on scaling link acquisition while maintaining quality standards.
+
+---
+
+## 3. Competitive Link Gap Analysis Methodology
+
+### Step-by-Step Process
+
+**Step 1: Identify competitors**
+Select 3-5 direct competitors who rank for your target keywords.
+
+**Step 2: Pull referring domain data**
+Export the full referring domain list for each competitor from ~~link database.
+
+**Step 3: Create intersection matrix**
+
+| Referring Domain | You | Comp 1 | Comp 2 | Comp 3 | Overlap Count |
+|-----------------|-----|--------|--------|--------|---------------|
+| example-a.com | No | Yes | Yes | Yes | 3 |
+| example-b.com | No | Yes | Yes | No | 2 |
+| example-c.com | No | Yes | No | No | 1 |
+| example-d.com | Yes | Yes | Yes | Yes | 3 (already have) |
+
+**Step 4: Prioritize opportunities**
+
+| Priority | Criteria | Rationale |
+|----------|---------|-----------|
+| Highest | Links to 3+ competitors, DR 50+, relevant | If all competitors have it, it is likely linkable |
+| High | Links to 2+ competitors, DR 30+, relevant | Strong signal of willingness to link in niche |
+| Medium | Links to 1 competitor, DR 50+, relevant | May be less accessible but high value |
+| Lower | Links to 1 competitor, DR <30, or low relevance | Diminishing returns |
+
+**Step 5: Analyze link context**
+For each high-priority opportunity, visit the actual linking page to understand:
+- Why did they link to your competitor? (resource page, mention, guest post, etc.)
+- What content on your site could replace or complement that link?
+- What outreach angle would work? (broken link, better resource, relationship)
+
+**Step 6: Create outreach plan**
+Build a prioritized list with contact information, outreach angle, and template selection.
+
+---
+
+## 4. Disavow File Format Guide
+
+### When to Disavow
+
+Only disavow links when you have clear evidence of risk. Unnecessary disavow can hurt your rankings.
+
+| Situation | Disavow? | Reasoning |
+|-----------|----------|-----------|
+| Obvious PBN links | Yes | Clear manipulation signal |
+| Paid links you cannot get removed | Yes | After attempting removal |
+| Spam attack (negative SEO) | Yes | Protect from third-party manipulation |
+| Low-quality directory links | Maybe | Only if pattern is excessive |
+| Foreign language spam | Yes | If clearly unnatural |
+| Low-DA sites with real content | No | Low quality is not toxic |
+| Nofollow links from any source | No | Already nofollowed; no risk |
+
+### Disavow File Format
+
+The disavow file is a plain text file (.txt) uploaded to Google Search Console.
+
+```
+# Disavow file for example.com
+# Generated: [date]
+# Reason: Toxic link cleanup
+
+# Individual URLs to disavow
+https://spam-site.com/page-with-link
+https://another-spam.com/toxic-page
+
+# Entire domains to disavow (use for sites with multiple toxic links)
+domain:link-farm-example.com
+domain:pbn-network-site.com
+domain:spam-directory.net
+```
+
+### Disavow File Best Practices
+
+| Practice | Why |
+|----------|-----|
+| Comment every entry or group | Future auditors need to understand why |
+| Use `domain:` for sites with multiple bad links | More thorough than individual URLs |
+| Use individual URLs when only one page is toxic | Avoid disavowing good links from the same domain |
+| Keep a changelog | Track what was added and when |
+| Review quarterly | Remove entries if domains have been cleaned up |
+| Never disavow your own domain | Common mistake that causes severe damage |
+| Back up before uploading | Keep previous version in case of errors |
+
+### Disavow Review Workflow
+
+| Step | Action | Tool |
+|------|--------|------|
+| 1 | Export full backlink profile | ~~link database |
+| 2 | Filter for known toxic patterns | Spam score, DR <10, foreign spam |
+| 3 | Manual review of flagged links | Visit each flagged domain |
+| 4 | Attempt removal via email first | Contact webmasters |
+| 5 | Wait 2 weeks for removal responses | Track outreach results |
+| 6 | Add non-removed toxic links to disavow | Format as .txt file |
+| 7 | Upload to Google Search Console | Disavow Links tool |
+| 8 | Document all actions and dates | Internal records |
+| 9 | Re-check in 4-6 weeks | Verify processing |
+
+---
+
+## 5. Link Profile Health Benchmarks
+
+### Healthy Profile Indicators
+
+| Metric | Healthy Range | Warning Sign | Critical |
+|--------|-------------|--------------|----------|
+| Dofollow ratio | 60-80% | >90% | >95% |
+| Exact match anchor % | <15% | 15-25% | >25% |
+| Brand anchor % | 25-45% | <15% | <5% |
+| Toxic link % | <5% | 5-10% | >10% |
+| Referring domain growth | Positive, steady | Flat | Declining |
+| Average linking DR | 25+ | 15-25 | <15 |
+| Link diversity (unique domains / total links) | >0.3 | 0.1-0.3 | <0.1 |
+| Topical relevance (sampled) | >60% | 40-60% | <40% |
+
+### Industry-Specific Benchmarks
+
+Authority expectations vary significantly by industry vertical.
+
+| Industry | Typical DR Range (Top 10 Sites) | Typical Referring Domains | Link Difficulty |
+|----------|-------------------------------|--------------------------|----------------|
+| Finance / Insurance | DR 60-90 | 5,000-50,000+ | Very High |
+| Health / Medical | DR 50-85 | 3,000-30,000+ | Very High |
+| Technology / SaaS | DR 40-80 | 1,000-20,000+ | High |
+| E-commerce (general) | DR 35-75 | 500-15,000+ | High |
+| Legal | DR 40-70 | 1,000-10,000+ | High |
+| Education | DR 50-90 | 2,000-25,000+ | Medium-High |
+| Local services | DR 15-45 | 50-500 | Medium |
+| B2B niche | DR 25-60 | 200-5,000+ | Medium |
+| Blog / Content site | DR 20-70 | 100-10,000+ | Medium |
+| New startup | DR 5-25 | 10-200 | Starting point |
+
+_Note: These are general ranges. Actual requirements depend on your specific keyword competition._
diff --git a/.cursor/skills/backlink-analyzer/references/outreach-templates.md b/.cursor/skills/backlink-analyzer/references/outreach-templates.md
new file mode 100644
index 0000000000..c6c25cf4b9
--- /dev/null
+++ b/.cursor/skills/backlink-analyzer/references/outreach-templates.md
@@ -0,0 +1,578 @@
+# Outreach Templates
+
+Copy-ready outreach email templates for each link building strategy. Includes subject line variations, follow-up sequences, personalization tips, and response handling scripts.
+
+---
+
+## General Outreach Principles
+
+Before using any template, follow these rules:
+
+| Principle | Why It Matters |
+|-----------|---------------|
+| **Personalize every email** | Generic emails get deleted. Reference specific content. |
+| **Lead with value** | Explain what is in it for them before making your ask. |
+| **Keep it short** | 100-150 words maximum for initial outreach. |
+| **One clear ask** | Do not combine multiple requests in one email. |
+| **Professional sender address** | Use name@yourdomain.com, not a free email provider. |
+| **No attachments on first email** | Attachments trigger spam filters and reduce trust. |
+| **Follow up (once or twice)** | Most responses come from follow-ups, not initial emails. |
+| **Track everything** | Log outreach in a spreadsheet or CRM for accountability. |
+
+---
+
+## 1. Broken Link Building Templates
+
+### Template 1A: Broken Link — Direct Approach
+
+**Subject:** Found a broken link on your [topic] page
+
+**Body:**
+
+Hi [Name],
+
+I was reading your article on [specific article title] — great resource on [brief compliment about content].
+
+I noticed the link to [description of broken link resource] in the [section name] section seems to be broken (returns a 404).
+
+I recently published a guide on [your topic] that covers similar ground: [Your URL]
+
+It might work as a replacement if you are updating the page. Either way, wanted to give you a heads-up about the broken link.
+
+Best,
+[Your Name]
+
+---
+
+### Template 1B: Broken Link — Roundup Page
+
+**Subject:** Quick heads-up about [their page title]
+
+**Body:**
+
+Hi [Name],
+
+Your [topic] resource page is one of my go-to references — really well curated.
+
+I noticed [X] links on the page are no longer working:
+- [Broken URL 1] — returns 404
+- [Broken URL 2] — domain expired
+
+If you are updating the page, I have a [content type] that covers [topic]: [Your URL]
+
+Happy to suggest other replacement resources too if that would help.
+
+Thanks for maintaining such a useful list,
+[Your Name]
+
+---
+
+### Follow-Up (5-7 days after initial email)
+
+**Subject:** Re: [original subject]
+
+**Body:**
+
+Hi [Name],
+
+Just bumping this up in case it got buried. I spotted a broken link on your [page title] and thought you would want to know.
+
+No worries if the page is staying as-is — just wanted to make sure you saw it.
+
+[Your Name]
+
+---
+
+## 2. Guest Post Pitching Templates
+
+### Template 2A: Guest Post — Topic Pitch
+
+**Subject:** Guest post idea: [proposed title]
+
+**Body:**
+
+Hi [Name],
+
+I have been following [their site name] for a while — your recent piece on [specific article] was especially [specific compliment].
+
+I would love to contribute a guest post on [proposed topic]. Here is what I have in mind:
+
+**Title:** [Proposed title]
+**Angle:** [1-2 sentences describing the unique angle]
+**Why your readers would care:** [1 sentence on value to their audience]
+
+I have written for [1-2 relevant publications] previously. Here are a couple of samples:
+- [Sample URL 1]
+- [Sample URL 2]
+
+Would this be a fit for [their site name]?
+
+Best,
+[Your Name]
+[Your title / credentials]
+
+---
+
+### Template 2B: Guest Post — Expertise Pitch
+
+**Subject:** [Your expertise area] contributor for [their site]
+
+**Body:**
+
+Hi [Name],
+
+I am [Your Name], a [your role/expertise] with [X years] experience in [field]. I noticed [their site] covers [topic area] extensively and thought I could contribute something useful.
+
+Three topic ideas that might work for your audience:
+
+1. [Title idea 1] — [one-line description]
+2. [Title idea 2] — [one-line description]
+3. [Title idea 3] — [one-line description]
+
+Happy to adjust angles or pitch different ideas. I can also share writing samples if helpful.
+
+Thanks,
+[Your Name]
+
+---
+
+### Follow-Up (7-10 days after initial email)
+
+**Subject:** Re: [original subject]
+
+**Body:**
+
+Hi [Name],
+
+Checking back on my guest post pitch. I know editorial calendars fill up fast, so no worries if the timing does not work.
+
+If any of the topics I suggested could work down the road, happy to keep in touch for a future slot.
+
+[Your Name]
+
+---
+
+## 3. Resource Page Outreach Templates
+
+### Template 3A: Resource Page — Direct Suggestion
+
+**Subject:** Resource for your [topic] page
+
+**Body:**
+
+Hi [Name],
+
+I found your [topic] resource page while researching [related topic] — it is a really well-organized collection.
+
+I thought [your resource title] might be a good addition to the [specific section] section. It covers [brief description of what it covers and why it is useful].
+
+Here is the link: [Your URL]
+
+No pressure at all — just thought it might be useful for your readers.
+
+Best,
+[Your Name]
+
+---
+
+### Template 3B: Resource Page — Value-Add Approach
+
+**Subject:** A few resources for your [topic] page
+
+**Body:**
+
+Hi [Name],
+
+I have been using your [topic] resource page as a reference and really appreciate the curation work.
+
+I wanted to suggest a few resources you might consider adding (not all mine):
+
+1. [External resource title] — [URL] — Great for [reason]
+2. [Your resource title] — [Your URL] — Covers [topic]
+3. [Another external resource] — [URL] — Useful for [reason]
+
+I included a couple of third-party resources alongside mine since they genuinely complement your page.
+
+Thanks for maintaining such a useful list,
+[Your Name]
+
+---
+
+## 4. Unlinked Mention Outreach Templates
+
+### Template 4A: Unlinked Brand Mention
+
+**Subject:** Thanks for mentioning [brand name]
+
+**Body:**
+
+Hi [Name],
+
+Just came across your article [article title] — thanks for mentioning [your brand/product]. Really appreciate the kind words about [specific thing they said].
+
+Quick request: would you be able to add a link to [Your URL] where you mention us? It would help your readers find us directly and help us out with attribution.
+
+Totally understand if it is not possible. Either way, glad you found [product/service] useful.
+
+Best,
+[Your Name]
+
+---
+
+### Template 4B: Unlinked Data/Research Mention
+
+**Subject:** Thanks for citing our [research/data]
+
+**Body:**
+
+Hi [Name],
+
+I noticed you referenced our [study/statistic/data point] in your article [article title]. Thanks for citing our research.
+
+If you are able to add a link to the original source, it would help your readers verify the data and access the full [study/report]: [Your URL]
+
+We actually just updated it with [new data point] that might be relevant to your piece as well.
+
+Thanks,
+[Your Name]
+
+---
+
+### Follow-Up (5-7 days)
+
+**Subject:** Re: [original subject]
+
+**Body:**
+
+Hi [Name],
+
+Just a quick follow-up on adding a link for the mention of [brand/research] in your article. I know updating published content can be low priority, so no rush.
+
+If it helps, the exact URL is: [Your URL]
+
+Thanks again for the mention,
+[Your Name]
+
+---
+
+## 5. Digital PR / Data-Driven Outreach Templates
+
+### Template 5A: Original Research Pitch
+
+**Subject:** New data: [key finding headline]
+
+**Body:**
+
+Hi [Name],
+
+We just published [study/survey/analysis] that found [most surprising or newsworthy finding].
+
+Key findings:
+- [Finding 1 — the headline stat]
+- [Finding 2 — supporting data]
+- [Finding 3 — counterintuitive insight]
+
+Full report: [Your URL]
+
+Given your coverage of [their beat/topic], I thought this might be worth a mention. Happy to provide additional data, quotes, or custom analysis for your audience.
+
+[Your Name]
+[Your title]
+
+---
+
+### Template 5B: Expert Commentary Pitch
+
+**Subject:** Expert source for your [topic] coverage
+
+**Body:**
+
+Hi [Name],
+
+I saw you are covering [topic/trend] and thought I could be a useful source.
+
+I am [Your Name], [your credentials — brief]. I have [specific experience relevant to their coverage].
+
+A couple of angles I could provide commentary on:
+- [Angle 1]
+- [Angle 2]
+
+Happy to jump on a quick call or provide written quotes. Available on short notice.
+
+[Your Name]
+[Your title, company]
+[Phone number for urgent requests]
+
+---
+
+## 6. HARO / Source Request Templates
+
+### Template 6A: HARO Response — Expert Source
+
+**Subject:** [HARO] Re: [original query title]
+
+**Body:**
+
+Hi [Journalist Name],
+
+Re: your query on [topic].
+
+**Source:** [Your Name], [Title] at [Company]
+**Credentials:** [1-2 sentences on relevant expertise]
+
+**Response:**
+
+[2-3 paragraphs directly answering their query. Be specific, quotable, and concise. Lead with the most valuable insight.]
+
+**Key quote:** "[One highly quotable sentence they can use directly]"
+
+Happy to elaborate or provide additional data. Available at [email] or [phone].
+
+[Your Name]
+
+---
+
+### Template 6B: HARO Response — Data Source
+
+**Subject:** [HARO] Re: [original query title] — with data
+
+**Body:**
+
+Hi [Journalist Name],
+
+For your piece on [topic], here is data from our [study/platform/analysis]:
+
+- [Statistic 1]: [data point with context]
+- [Statistic 2]: [data point with context]
+- [Statistic 3]: [data point with context]
+
+Source: [Your URL for attribution]
+
+**Expert quote from [Your Name], [Title]:**
+"[Quotable insight interpreting the data]"
+
+Full dataset available if needed for additional analysis.
+
+[Your Name]
+[Contact info]
+
+---
+
+## 7. Skyscraper Technique Templates
+
+### Template 7A: Skyscraper — Better Resource
+
+**Subject:** Updated resource on [topic]
+
+**Body:**
+
+Hi [Name],
+
+I noticed you linked to [competitor's article title] in your post on [their article title].
+
+We just published an updated version of that topic: [Your URL]
+
+It includes [specific improvements over the original]:
+- [Improvement 1 — e.g., "2025 data instead of 2022"]
+- [Improvement 2 — e.g., "interactive calculator included"]
+- [Improvement 3 — e.g., "covers 3 additional sub-topics"]
+
+If you think it would be a better resource for your readers, I would appreciate you considering swapping the link. No worries either way.
+
+Best,
+[Your Name]
+
+---
+
+## 8. Relationship Building Templates
+
+### Template 8A: Initial Relationship — No Ask
+
+**Subject:** Loved your piece on [topic]
+
+**Body:**
+
+Hi [Name],
+
+Just read your article on [specific article title] and wanted to say it was genuinely one of the best pieces I have read on [topic].
+
+The section on [specific section] was especially useful — I actually shared it with my team and we are implementing [specific takeaway].
+
+No ask here, just wanted to let you know your work is making an impact.
+
+[Your Name]
+
+---
+
+### Template 8B: Follow-Up After Relationship Building (2-4 weeks later)
+
+**Subject:** [New topic] — thought of your readers
+
+**Body:**
+
+Hi [Name],
+
+Following up on our exchange a few weeks back. I just published something your audience might find useful: [Your URL]
+
+It covers [brief description] with a focus on [specific angle].
+
+Would love to hear your thoughts if you get a chance to read it.
+
+Best,
+[Your Name]
+
+---
+
+## 9. Follow-Up Sequences
+
+### Standard Follow-Up Timeline
+
+| Email | Timing | Purpose | Tone |
+|-------|--------|---------|------|
+| Initial outreach | Day 0 | Make the ask | Professional, value-forward |
+| Follow-up 1 | Day 5-7 | Gentle reminder | Casual, brief |
+| Follow-up 2 | Day 14 | Final attempt | Short, no-pressure close |
+| Stop | After follow-up 2 | Do not send more | Move on |
+
+### Follow-Up 2 Template (Final Attempt)
+
+**Subject:** Re: [original subject]
+
+**Body:**
+
+Hi [Name],
+
+Last follow-up on this — I know you are busy.
+
+[One-sentence reminder of what you asked]
+
+If the timing is not right, no worries at all. Feel free to bookmark [Your URL] for whenever it might be useful.
+
+Best,
+[Your Name]
+
+---
+
+## 10. Response Handling Scripts
+
+### Positive Response — They Will Add the Link
+
+**Reply:**
+
+That is great, [Name] — really appreciate it.
+
+The exact URL is: [Your URL]
+Preferred anchor text (just a suggestion): [anchor text]
+
+Let me know if you need anything else. Happy to return the favor anytime.
+
+Best,
+[Your Name]
+
+---
+
+### Positive Response — They Want a Guest Post
+
+**Reply:**
+
+Wonderful, I would love to contribute. Here is what I am thinking:
+
+**Title:** [Proposed title]
+**Outline:**
+1. [Section 1]
+2. [Section 2]
+3. [Section 3]
+
+**Estimated length:** [word count]
+**Delivery timeline:** [date]
+
+Do you have any editorial guidelines or specific requirements I should follow?
+
+Best,
+[Your Name]
+
+---
+
+### Neutral Response — They Are Interested but Noncommittal
+
+**Reply:**
+
+Totally understand, [Name]. No rush at all.
+
+I will keep [Your URL] updated with [latest data/content], so it will be here whenever the timing works.
+
+Feel free to reach out if I can ever be a source or contributor for [their site].
+
+Best,
+[Your Name]
+
+---
+
+### Negative Response — They Decline
+
+**Reply:**
+
+Completely understand, [Name]. Thanks for letting me know.
+
+If anything changes or if I can ever be helpful as a source, do not hesitate to reach out.
+
+Best,
+[Your Name]
+
+_Note: Never argue, push back, or send additional follow-ups after a decline. Keep the relationship positive for potential future opportunities._
+
+---
+
+### No Response — After Full Sequence
+
+Do not send further emails. Add to a "re-engage in 3-6 months" list. When you re-engage, use a completely new angle or piece of content. Never reference the fact that they did not respond previously.
+
+---
+
+## 11. Personalization Tips
+
+### Research Checklist Before Sending
+
+| Check | Where to Find | How to Use |
+|-------|-------------|-----------|
+| Their name (correct spelling) | About page, LinkedIn, byline | Use in greeting and body |
+| Recent article they wrote | Their blog, Google "[name] + [site]" | Reference in opening line |
+| Their social media | Twitter/X, LinkedIn | Mention a recent post or shared interest |
+| Their publication's audience | About page, media kit | Tailor your value proposition |
+| Content they have linked to before | ~~link database, their recent articles | Match the type of content they prefer |
+| How they prefer to be contacted | Website contact page, social bio | Some prefer Twitter DMs over email |
+
+### Personalization Levels
+
+| Level | Effort | Response Rate Impact | When to Use |
+|-------|--------|---------------------|-------------|
+| **Zero** (template only) | Minimal | Baseline (lowest) | Never recommended |
+| **Basic** (name + site name) | Low | +20-30% | Bulk outreach to lower-priority targets |
+| **Moderate** (+ specific article reference) | Medium | +50-80% | Standard outreach to mid-tier targets |
+| **Deep** (+ shared interest, social reference, mutual connection) | High | +100-200% | High-priority targets (DR 60+ sites, key relationships) |
+
+---
+
+## 12. Subject Line Variations by Strategy
+
+### High-Performing Subject Line Patterns
+
+| Strategy | Subject Lines (pick one) |
+|----------|------------------------|
+| **Broken link** | "Found a broken link on your [topic] page" / "Quick heads-up about [page title]" / "Broken resource on [their site]" |
+| **Guest post** | "Guest post idea: [title]" / "[Topic] contributor for [their site]" / "Content pitch for [their site]" |
+| **Resource page** | "Resource for your [topic] page" / "Suggestion for your [topic] list" / "Addition for [page title]?" |
+| **Unlinked mention** | "Thanks for mentioning [brand]" / "Re: your mention of [brand/data]" / "Quick favor re: [their article]" |
+| **Digital PR** | "New data: [headline stat]" / "Exclusive: [finding]" / "[Topic] research for your coverage" |
+| **Skyscraper** | "Updated resource on [topic]" / "Better version of [competitor content]" / "Thought you'd want to see this" |
+
+### Subject Lines to Avoid
+
+| Avoid | Why |
+|-------|-----|
+| "Link exchange opportunity" | Screams spam and violates Google guidelines |
+| "SEO partnership" | Immediately signals manipulative intent |
+| ALL CAPS or excessive punctuation | Triggers spam filters, looks unprofessional |
+| "I'd love a backlink" | Too direct; leads with your need, not their value |
+| Misleading subjects ("Re:" when no prior conversation) | Damages trust immediately |
+| Long subjects (>60 characters) | Gets truncated in inbox, lower open rates |
diff --git a/.cursor/skills/competitor-analysis/SKILL.md b/.cursor/skills/competitor-analysis/SKILL.md
new file mode 100644
index 0000000000..11c1efc917
--- /dev/null
+++ b/.cursor/skills/competitor-analysis/SKILL.md
@@ -0,0 +1,309 @@
+---
+name: competitor-analysis
+description: 'Analyze competitor SEO/GEO: keywords, content, backlinks, AI citations, traffic share gaps. 竞品分析/竞争对手'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when analyzing competitor SEO strategy, comparing domains, benchmarking against competitors, or finding competitor keywords and content gaps."
+argument-hint: "<competitor URL or domain>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - geo
+    - competitor-analysis
+    - competitive-intelligence
+    - benchmarking
+    - competitor-keywords
+    - competitor-backlinks
+    - market-analysis
+    - spyfu-alternative
+    - 竞品分析
+    - 競合分析
+    - 경쟁분석
+    - analisis-competitivo
+  triggers:
+    # EN-formal
+    - "analyze competitors"
+    - "competitor SEO"
+    - "competitive analysis"
+    - "competitor keywords"
+    - "competitor backlinks"
+    - "market analysis"
+    - "competitive intelligence"
+    # EN-casual
+    - "what are my competitors doing"
+    - "what are they doing differently"
+    - "why do they rank higher"
+    - "spy on competitor SEO"
+    - "what are they doing better"
+    - "why do they outrank me"
+    # EN-question
+    - "who are my SEO competitors"
+    - "how do I beat my competitors"
+    - "why do competitors rank higher"
+    # EN-competitor
+    - "SpyFu alternative"
+    - "Semrush competitor analysis"
+    - "Ahrefs competitor tool"
+    # ZH-pro
+    - "竞品分析"
+    - "竞争对手分析"
+    - "竞品SEO"
+    - "对标分析"
+    - "竞争情报"
+    # ZH-casual
+    - "竞品怎么做的"
+    - "他们排名为什么比我高"
+    - "看看对手在干什么"
+    - "为什么他们排名好"
+    # JA
+    - "競合分析"
+    - "競合SEO分析"
+    - "ライバル分析"
+    # KO
+    - "경쟁 분석"
+    - "경쟁사 SEO"
+    - "경쟁사 키워드"
+    # ES
+    - "análisis de competidores"
+    - "análisis competitivo SEO"
+    # PT
+    - "análise de concorrentes"
+    # Misspellings
+    - "competitve analysis"
+    - "compeditor analysis"
+---
+
+# Competitor Analysis
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This research skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill provides comprehensive analysis of competitor SEO and GEO strategies, revealing what's working in your market and identifying opportunities to outperform the competition.
+
+**System role**: Research layer skill. It turns market signals into reusable strategic inputs for the rest of the library.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs reusable market intelligence that should influence strategy, not just an ad hoc answer.
+
+- Entering a new market or niche
+- Planning content strategy based on competitor success
+- Understanding why competitors rank higher
+- Finding backlink and partnership opportunities
+- Identifying content gaps competitors are missing
+- Analyzing competitor AI citation strategies
+- Benchmarking your SEO performance
+
+## What This Skill Does
+
+1. **Keyword Analysis**: Identifies keywords competitors rank for
+2. **Content Audit**: Analyzes competitor content strategies and formats
+3. **Backlink Profiling**: Reviews competitor link-building approaches
+4. **Technical Assessment**: Evaluates competitor site health
+5. **GEO Analysis**: Identifies how competitors appear in AI responses
+6. **Gap Identification**: Finds opportunities competitors miss
+7. **Strategy Extraction**: Reveals actionable insights from competitor success
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Basic Competitor Analysis
+
+```
+Analyze SEO strategy for [competitor URL]
+```
+
+```
+Compare my site [URL] against [competitor 1], [competitor 2], [competitor 3]
+```
+
+### Specific Analysis
+
+```
+What content is driving the most traffic for [competitor]?
+```
+
+```
+Analyze why [competitor] ranks #1 for [keyword]
+```
+
+### GEO-Focused Analysis
+
+```
+How is [competitor] getting cited in AI responses? What can I learn?
+```
+
+## Skill Contract
+
+**Expected output**: a prioritized research brief, evidence-backed findings, and a short handoff summary ready for `memory/research/`.
+
+- **Reads**: user goals, target market inputs, available tool data, and prior strategy from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing research deliverable plus a reusable summary that can be stored under `memory/research/`.
+- **Promotes**: durable keyword priorities, competitor facts, entity candidates, and strategic decisions to `CLAUDE.md`, `memory/decisions.md`, and `memory/research/`; hand canonical entity work to `entity-optimizer`.
+- **Next handoff**: use the `Next Best Skill` below when the findings are ready to drive action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~analytics + ~~AI monitor connected:**
+Automatically pull competitor keyword rankings, backlink profiles, top performing content, domain authority metrics from ~~SEO tool. Compare against your site's metrics from ~~analytics and ~~search console. Check AI citation patterns for both your site and competitors using ~~AI monitor.
+
+**With manual data only:**
+Ask the user to provide:
+1. Competitor URLs to analyze (2-5 recommended)
+2. Your own site URL and current metrics (traffic, rankings if known)
+3. Industry or niche context
+4. Specific aspects to focus on (keywords, content, backlinks, etc.)
+5. Any known competitor strengths or weaknesses
+
+Proceed with the full analysis using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests competitor analysis:
+
+1. **Identify Competitors**
+
+   If not specified, help identify competitors:
+   
+   ```markdown
+   ### Competitor Identification Framework
+   
+   **Direct Competitors** (same product/service)
+   - Search "[your main keyword]" and note top 5 organic results
+   - Check who's advertising for your keywords
+   - Ask: Who do customers compare you to?
+   
+   **Indirect Competitors** (different solution, same problem)
+   - Search problem-focused keywords
+   - Look at alternative solutions
+   
+   **Content Competitors** (compete for same keywords)
+   - May not sell same product
+   - Rank for your target keywords
+   - Include media sites, blogs, aggregators
+   ```
+
+2. **Gather Competitor Data**
+
+   Collect for each competitor: URL, domain age, estimated traffic, domain authority, business model, target audience, and key offerings.
+
+3. **Analyze Keyword Rankings**
+
+   Document total keywords ranking, top 10/top 3 counts, top performing keywords (with position, volume, traffic, page URL), keyword distribution by intent, and keyword gaps.
+
+4. **Audit Content Strategy**
+
+   Analyze content volume by type, top performing content, content patterns (word count, frequency, formats), content themes, and success factors.
+
+5. **Analyze Backlink Profile**
+
+   Review total backlinks, referring domains, link quality distribution, top linking domains, link acquisition patterns, and linkable assets.
+
+6. **Technical SEO Assessment**
+
+   Evaluate Core Web Vitals, mobile-friendliness, site architecture, internal linking quality, URL structure, and technical strengths/weaknesses.
+
+7. **GEO/AI Citation Analysis**
+
+   Test competitor content in AI systems: document which queries cite them, GEO strategies observed (definitions, statistics, Q&A, authority signals), and GEO opportunities they are missing.
+
+8. **Synthesize Competitive Intelligence**
+
+   Produce a final report with: Executive Summary, Competitive Landscape comparison table, CITE domain authority comparison, Strengths to Learn From, Weaknesses to Exploit, Keyword Opportunities, Content Strategy Recommendations, and Action Plan (Immediate / Short-term / Long-term).
+
+   > **Reference**: See [references/analysis-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/references/analysis-templates.md) for detailed templates for each step.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Competitor URLs verified as relevant to your niche
+- [ ] Analysis scope defined (comprehensive or specific focus area)
+- [ ] Your own site metrics available for comparison
+- [ ] Minimum 2-3 competitors identified for meaningful patterns
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] Competitor strengths backed by measurable evidence (metrics, rankings)
+- [ ] Opportunities based on identifiable gaps, not assumptions
+- [ ] Action plan items are specific and actionable (not vague strategies)
+- [ ] Source of each data point clearly stated (~~SEO tool data, ~~analytics data, ~~AI monitor data, user-provided, or estimated)
+
+## Example
+
+> **Reference**: See [references/example-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/references/example-report.md) for a complete example analyzing HubSpot's marketing keyword dominance.
+
+## Advanced Analysis Types
+
+### Content Gap Analysis
+
+```
+Show me content [competitor] has that I don't, sorted by traffic potential
+```
+
+### Link Intersection
+
+```
+Find sites linking to [competitor 1] AND [competitor 2] but not me
+```
+
+### SERP Feature Analysis
+
+```
+What SERP features do competitors win? (Featured snippets, PAA, etc.)
+```
+
+### Historical Tracking
+
+```
+How has [competitor]'s SEO strategy evolved over the past year?
+```
+
+## Tips for Success
+
+1. **Analyze 3-5 competitors** for comprehensive view
+2. **Include indirect competitors** - they often have innovative approaches
+3. **Look beyond rankings** - analyze content quality, user experience
+4. **Study their failures** - avoid their mistakes
+5. **Monitor regularly** - competitor strategies evolve
+6. **Focus on actionable insights** - what can you actually implement?
+
+
+
+### Save Results
+
+After delivering findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/research/competitor-analysis/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Analysis Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/references/analysis-templates.md) — Detailed templates for each analysis step (profile, keywords, content, backlinks, technical, GEO, synthesis)
+- [Battlecard Template](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/references/battlecard-template.md) — Quick-reference competitive battlecard for sales and marketing teams
+- [Positioning Frameworks](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/references/positioning-frameworks.md) — Positioning maps, messaging matrices, narrative analysis, and differentiation frameworks
+- [Example Report](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/references/example-report.md) — Complete example analyzing HubSpot's marketing keyword dominance
+
+## Next Best Skill
+
+- **Primary**: [content-gap-analysis](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/content-gap-analysis/SKILL.md) — turn competitor findings into a focused opportunity map.
diff --git a/.cursor/skills/competitor-analysis/references/analysis-templates.md b/.cursor/skills/competitor-analysis/references/analysis-templates.md
new file mode 100644
index 0000000000..1e1e157167
--- /dev/null
+++ b/.cursor/skills/competitor-analysis/references/analysis-templates.md
@@ -0,0 +1,285 @@
+# Competitor Analysis — Analysis Templates
+
+Templates for each step of the competitor analysis workflow. Use these to structure your output.
+
+## Competitor Profile Template
+
+```markdown
+## Competitor Profile: [Name]
+
+**Basic Info**
+- URL: [website]
+- Domain Age: [years]
+- Estimated Traffic: [monthly visits]
+- Domain Authority/Rating: [score]
+
+**Business Model**
+- Type: [SaaS/E-commerce/Content/etc.]
+- Target Audience: [description]
+- Key Offerings: [products/services]
+```
+
+## Keyword Analysis Template
+
+```markdown
+### Keyword Analysis: [Competitor]
+
+**Total Keywords Ranking**: [X]
+**Keywords in Top 10**: [X]
+**Keywords in Top 3**: [X]
+
+#### Top Performing Keywords
+
+| Keyword | Position | Volume | Traffic Est. | Page |
+|---------|----------|--------|--------------|------|
+| [kw 1] | [pos] | [vol] | [traffic] | [url] |
+| [kw 2] | [pos] | [vol] | [traffic] | [url] |
+
+#### Keyword Distribution by Intent
+
+- Informational: [X]% ([keywords])
+- Commercial: [X]% ([keywords])
+- Transactional: [X]% ([keywords])
+- Navigational: [X]% ([keywords])
+
+#### Keyword Gaps (They rank, you don't)
+
+| Keyword | Their Position | Volume | Opportunity |
+|---------|----------------|--------|-------------|
+| [kw 1] | [pos] | [vol] | [analysis] |
+```
+
+## Content Analysis Template
+
+```markdown
+### Content Analysis: [Competitor]
+
+**Content Volume**
+- Total Pages: [X]
+- Blog Posts: [X]
+- Landing Pages: [X]
+- Resource Pages: [X]
+
+**Content Performance**
+
+#### Top Performing Content
+
+| Title | URL | Est. Traffic | Keywords | Backlinks |
+|-------|-----|--------------|----------|-----------|
+| [title 1] | [url] | [traffic] | [X] | [X] |
+
+**Content Patterns**
+
+- Average word count: [X] words
+- Publishing frequency: [X] posts/month
+- Content formats used:
+  - Blog posts: [X]%
+  - Guides/tutorials: [X]%
+  - Case studies: [X]%
+  - Tools/calculators: [X]%
+  - Videos: [X]%
+
+**Content Themes**
+
+| Theme | # Articles | Combined Traffic |
+|-------|------------|------------------|
+| [theme 1] | [X] | [traffic] |
+| [theme 2] | [X] | [traffic] |
+
+**What Makes Their Content Successful**
+
+1. [Success factor 1 with example]
+2. [Success factor 2 with example]
+3. [Success factor 3 with example]
+```
+
+## Backlink Analysis Template
+
+```markdown
+### Backlink Analysis: [Competitor]
+
+**Overview**
+- Total Backlinks: [X]
+- Referring Domains: [X]
+- Domain Rating: [X]
+
+**Link Quality Distribution**
+- High Authority (DR 70+): [X]%
+- Medium Authority (DR 30-69): [X]%
+- Low Authority (DR <30): [X]%
+
+**Top Linking Domains**
+
+| Domain | DR | Link Type | Target Page |
+|--------|-----|-----------|-------------|
+| [domain 1] | [DR] | [type] | [page] |
+
+**Link Acquisition Patterns**
+
+- Guest posts: [X]%
+- Editorial/organic: [X]%
+- Resource pages: [X]%
+- Directories: [X]%
+- Other: [X]%
+
+**Linkable Assets (Content attracting links)**
+
+| Asset | Type | Backlinks | Why It Works |
+|-------|------|-----------|--------------|
+| [asset 1] | [type] | [X] | [reason] |
+```
+
+## Technical SEO Assessment Template
+
+```markdown
+### Technical Analysis: [Competitor]
+
+**Site Performance**
+- Core Web Vitals: [Pass/Fail]
+- LCP: [X]s
+- FID: [X]ms
+- CLS: [X]
+- Mobile-friendly: [Yes/No]
+
+**Site Structure**
+- Site architecture depth: [X] levels
+- Internal linking quality: [Rating]
+- URL structure: [Clean/Messy]
+- Sitemap present: [Yes/No]
+
+**Technical Strengths**
+1. [Strength 1]
+2. [Strength 2]
+
+**Technical Weaknesses**
+1. [Weakness 1]
+2. [Weakness 2]
+```
+
+## GEO/AI Citation Analysis Template
+
+```markdown
+### GEO Analysis: [Competitor]
+
+**AI Visibility Assessment**
+
+Test competitor content in AI systems for relevant queries:
+
+| Query | AI Mentions Competitor? | What's Cited | Why |
+|-------|------------------------|--------------|-----|
+| [query 1] | Yes/No | [content] | [reason] |
+| [query 2] | Yes/No | [content] | [reason] |
+
+**GEO Strategies Observed**
+
+1. **Clear Definitions**
+   - Example: [quote from their content]
+   - Effectiveness: [rating]
+
+2. **Quotable Statistics**
+   - Example: [quote from their content]
+   - Effectiveness: [rating]
+
+3. **Q&A Format Content**
+   - Examples found: [X] pages
+   - Topics covered: [list]
+
+4. **Authority Signals**
+   - Expert authorship: [Yes/No]
+   - Citations to sources: [Yes/No]
+   - Original research: [Yes/No]
+
+**GEO Opportunities They're Missing**
+
+| Topic | Why Missing | Your Opportunity |
+|-------|-------------|------------------|
+| [topic 1] | [reason] | [action] |
+```
+
+## Synthesis Report Template
+
+```markdown
+# Competitive Analysis Report
+
+**Analysis Date**: [Date]
+**Competitors Analyzed**: [List]
+**Your Site**: [URL]
+
+## Executive Summary
+
+[2-3 paragraph overview of key findings and recommendations]
+
+## Competitive Landscape
+
+| Metric | You | Competitor 1 | Competitor 2 | Competitor 3 |
+|--------|-----|--------------|--------------|--------------|
+| Domain Authority | [X] | [X] | [X] | [X] |
+| Organic Traffic | [X] | [X] | [X] | [X] |
+| Keywords Top 10 | [X] | [X] | [X] | [X] |
+| Backlinks | [X] | [X] | [X] | [X] |
+| Content Pages | [X] | [X] | [X] | [X] |
+
+**Domain Authority Comparison (Recommended)**
+
+When domain-level comparison is needed, run `domain-authority-auditor` for each competitor to get CITE scores:
+
+| Domain | CITE Score | C (Citation) | I (Identity) | T (Trust) | E (Eminence) | Veto |
+|--------|-----------|-------------|-------------|----------|-------------|------|
+| Your domain | [score] | [score] | [score] | [score] | [score] | [pass/fail] |
+| Competitor 1 | [score] | [score] | [score] | [score] | [score] | [pass/fail] |
+| Competitor 2 | [score] | [score] | [score] | [score] | [score] | [pass/fail] |
+
+This reveals domain authority gaps that inform link building and brand strategy beyond keyword-level competition.
+
+## Competitor Strengths to Learn From
+
+### [Competitor 1]
+- **Strength**: [description]
+- **Why It Works**: [analysis]
+- **How to Apply**: [action item]
+
+[Repeat for each competitor]
+
+## Competitor Weaknesses to Exploit
+
+### Gap 1: [Description]
+- Who's weak: [competitors]
+- Opportunity size: [estimate]
+- Recommended action: [specific steps]
+
+[Repeat for each gap]
+
+## Keyword Opportunities
+
+### Keywords to Target (Competitor overlap)
+| Keyword | Volume | Avg Position | Best Strategy |
+|---------|--------|--------------|---------------|
+| [kw] | [vol] | [pos] | [strategy] |
+
+### Untapped Keywords (No competitor coverage)
+| Keyword | Volume | Difficulty | Opportunity |
+|---------|--------|------------|-------------|
+| [kw] | [vol] | [diff] | [description] |
+
+## Content Strategy Recommendations
+
+Based on competitor analysis:
+
+1. **Create**: [Content type] about [topic] because [reason]
+2. **Improve**: [Existing content] to match/exceed [competitor content]
+3. **Promote**: [Content] to sites like [competitor's link sources]
+
+## Action Plan
+
+### Immediate (This Week)
+1. [Action item]
+2. [Action item]
+
+### Short-term (This Month)
+1. [Action item]
+2. [Action item]
+
+### Long-term (This Quarter)
+1. [Action item]
+2. [Action item]
+```
diff --git a/.cursor/skills/competitor-analysis/references/battlecard-template.md b/.cursor/skills/competitor-analysis/references/battlecard-template.md
new file mode 100644
index 0000000000..66407a53c3
--- /dev/null
+++ b/.cursor/skills/competitor-analysis/references/battlecard-template.md
@@ -0,0 +1,256 @@
+# Competitive Battlecard Template
+
+A fill-in-the-blank template for creating competitive battlecards that equip sales, marketing, and content teams with actionable intelligence. Maintain one battlecard per major competitor and review quarterly.
+
+## Overview
+
+A competitive battlecard is a concise reference document that summarizes a competitor's strengths, weaknesses, positioning, and how to compete against them. Battlecards are living documents that should be updated as competitors evolve.
+
+**Audience:** Sales teams, content strategists, marketing leadership
+**Update frequency:** Quarterly review minimum; update immediately on major competitor changes
+**Length target:** 2-3 pages per competitor (scannable, not exhaustive)
+
+---
+
+## Battlecard Template
+
+### Header
+
+```
+COMPETITIVE BATTLECARD: [Competitor Name]
+Last Updated: [Date]
+Updated By: [Name/Team]
+Confidence Level: [High/Medium/Low] — based on data recency and source quality
+```
+
+---
+
+### Section 1: Competitor Overview
+
+| Field | Details |
+|-------|---------|
+| **Company Name** | [Name] |
+| **Website** | [URL] |
+| **Founded** | [Year] |
+| **Headquarters** | [Location] |
+| **Company Size** | [Employees] |
+| **Funding/Revenue** | [Known funding rounds or estimated revenue] |
+| **Target Customer** | [Primary audience: SMB, mid-market, enterprise, etc.] |
+| **Pricing Model** | [Freemium, subscription, usage-based, custom, etc.] |
+| **Pricing Range** | [Entry price → Enterprise price] |
+
+**One-Sentence Summary:**
+> [Competitor] is a [category] that helps [target audience] to [primary benefit] by [mechanism].
+
+---
+
+### Section 2: Their Pitch
+
+**Tagline/Slogan:**
+> [Their exact tagline from website]
+
+**Top 3 Claimed Differentiators:**
+1. [Differentiator 1 — their claim, not your assessment]
+2. [Differentiator 2]
+3. [Differentiator 3]
+
+**Positioning Statement (Reverse-Engineered):**
+> For [their target audience], [product] is the [category] that [key benefit] because [reason to believe].
+
+**Key Messages They Repeat:**
+- [Message 1 — found in blog, ads, sales decks]
+- [Message 2]
+- [Message 3]
+
+---
+
+### Section 3: Strengths (Be Honest)
+
+Credibility depends on honestly assessing where competitors are genuinely strong. Sugarcoating weaknesses or ignoring strengths reduces trust in the battlecard.
+
+| Strength | Evidence | Impact on Deals |
+|----------|---------|----------------|
+| [Strength 1] | [Where you observed this — reviews, demos, customer feedback] | [How this affects your competitive win rate] |
+| [Strength 2] | [Evidence] | [Impact] |
+| [Strength 3] | [Evidence] | [Impact] |
+
+---
+
+### Section 4: Weaknesses
+
+| Weakness | Evidence | How to Exploit |
+|----------|---------|---------------|
+| [Weakness 1] | [Sources: G2 reviews, customer complaints, technical limitations] | [Talking point or demo moment that highlights this gap] |
+| [Weakness 2] | [Evidence] | [How to exploit] |
+| [Weakness 3] | [Evidence] | [How to exploit] |
+
+**Common Complaints (from review sites):**
+- "[Exact quote from G2/Capterra/TrustRadius]" — [Source, Date]
+- "[Exact quote]" — [Source, Date]
+- "[Exact quote]" — [Source, Date]
+
+---
+
+### Section 5: Your Differentiators
+
+For each differentiator, provide the claim AND the proof. Unsupported claims erode credibility.
+
+| Differentiator | Your Advantage | Proof Point | How to Demo/Show |
+|---------------|---------------|------------|-----------------|
+| [Differentiator 1] | [What you do better] | [Data, testimonial, or demo evidence] | [Specific demo step or slide] |
+| [Differentiator 2] | [What you do better] | [Proof] | [How to show] |
+| [Differentiator 3] | [What you do better] | [Proof] | [How to show] |
+
+---
+
+### Section 6: Feature Comparison
+
+| Feature/Capability | You | [Competitor] | Notes |
+|-------------------|-----|-------------|-------|
+| [Feature 1] | [Yes/No/Partial + detail] | [Yes/No/Partial + detail] | [Context] |
+| [Feature 2] | [Yes/No/Partial] | [Yes/No/Partial] | [Context] |
+| [Feature 3] | [Yes/No/Partial] | [Yes/No/Partial] | [Context] |
+| [Feature 4] | [Yes/No/Partial] | [Yes/No/Partial] | [Context] |
+| [Feature 5] | [Yes/No/Partial] | [Yes/No/Partial] | [Context] |
+| [Integration A] | [Yes/No] | [Yes/No] | [Context] |
+| [Integration B] | [Yes/No] | [Yes/No] | [Context] |
+
+---
+
+### Section 7: Pricing Comparison
+
+| Tier | You | [Competitor] | Advantage |
+|------|-----|-------------|-----------|
+| Entry/Free | [Price + what's included] | [Price + what's included] | [Who wins at this tier and why] |
+| Mid-tier | [Price + what's included] | [Price + what's included] | [Who wins] |
+| Enterprise | [Price + what's included] | [Price + what's included] | [Who wins] |
+
+**Hidden Costs to Highlight:**
+- [Competitor charges extra for X, which you include]
+- [Competitor requires Y add-on for common use case]
+- [Competitor's pricing scales poorly because Z]
+
+---
+
+### Section 8: Objection Handling
+
+| When They Say... | You Respond With... | Supporting Evidence |
+|-----------------|--------------------|--------------------|
+| "[Competitor] has more features" | "[Your response — focus on outcomes, not feature count]" | [Case study, data point, or demo] |
+| "[Competitor] is cheaper" | "[Your response — focus on total value, ROI, or hidden costs]" | [TCO comparison, customer quote] |
+| "[Competitor] is more established" | "[Your response — focus on innovation, agility, or support quality]" | [Growth metrics, customer satisfaction data] |
+| "We're already using [Competitor]" | "[Your response — focus on switching ease, quick wins, or pain points]" | [Migration case study, time-to-value data] |
+| "[Competitor] integrates with [tool]" | "[Your response — confirm your integration or alternative]" | [Integration docs, workaround, or roadmap] |
+
+---
+
+### Section 9: Landmine Questions
+
+Questions to ask prospects early in the sales process that highlight your advantages and surface competitor weaknesses. Use these in discovery calls.
+
+| Landmine Question | What It Exposes | Your Advantage |
+|------------------|----------------|---------------|
+| "How important is [capability you have, they don't] to your workflow?" | Creates need for your differentiator | [Your capability] |
+| "Have you experienced [common pain point with competitor]?" | Surfaces known competitor weakness | [Your solution to that pain] |
+| "What's your timeline for seeing ROI from this tool?" | Highlights time-to-value differences | [Your faster time-to-value] |
+| "How does your team currently handle [use case you excel at]?" | Opens discussion where you shine | [Your approach to this use case] |
+
+---
+
+### Section 10: Win/Loss Analysis
+
+| Theme | Win Reasons (Why We Won) | Loss Reasons (Why We Lost) |
+|-------|------------------------|---------------------------|
+| Product | [What product features drove the win] | [What product gaps caused the loss] |
+| Pricing | [Price advantage or value perception] | [Price disadvantage or perceived poor value] |
+| Relationship | [Sales process, support quality] | [Better existing relationship with competitor] |
+| Brand | [Brand trust, market perception] | [Competitor brand stronger in this segment] |
+
+**Recent Win Story:**
+> [Brief narrative: who was the customer, what were they comparing, why did they choose you]
+
+**Recent Loss Story:**
+> [Brief narrative: who was the customer, what were they comparing, why did they choose competitor]
+
+---
+
+### Section 11: SEO/Content Intelligence
+
+| Dimension | [Competitor] | Your Position |
+|-----------|-------------|--------------|
+| Domain Authority/Rating | [Score] | [Score] |
+| Organic traffic (est.) | [Monthly visits] | [Monthly visits] |
+| Keywords in top 10 | [Count] | [Count] |
+| Top-performing content | [URL + topic] | [Your equivalent or gap] |
+| Content publishing cadence | [Posts/month] | [Posts/month] |
+| Backlink count | [Count] | [Count] |
+| AI citation frequency | [High/Medium/Low] | [High/Medium/Low] |
+
+**Content Strategy Observations:**
+- [What content themes drive their traffic]
+- [What formats they use effectively]
+- [Where their content is weak or outdated]
+
+---
+
+## Battlecard Maintenance Guidelines
+
+### Update Triggers
+
+Update the battlecard immediately when any of these occur:
+
+| Trigger | What to Update |
+|---------|---------------|
+| Competitor launches new feature | Feature comparison, differentiators, landmine questions |
+| Competitor changes pricing | Pricing comparison, objection handling |
+| Competitor raises funding / IPO | Overview section, positioning |
+| Major review trends change | Weaknesses, objection handling, win/loss |
+| You launch competing feature | Differentiators, feature comparison |
+| Significant win or loss against them | Win/loss analysis, objection handling |
+| Competitor publishes major content | SEO/content intelligence |
+
+### Quarterly Review Checklist
+
+- [ ] Verify all pricing information is current
+- [ ] Check for new features launched by competitor
+- [ ] Review latest G2/Capterra/TrustRadius reviews for new themes
+- [ ] Update win/loss analysis with recent deals
+- [ ] Refresh SEO/content metrics
+- [ ] Confirm differentiators are still accurate
+- [ ] Test all landmine questions with sales team for relevance
+- [ ] Archive outdated information (don't delete -- move to "Historical" section)
+
+### Data Sources for Battlecard Updates
+
+| Source | What to Gather | Update Frequency |
+|--------|---------------|-----------------|
+| Competitor website | Pricing, features, messaging, positioning | Monthly |
+| G2/Capterra reviews | Strengths, weaknesses, customer sentiment | Quarterly |
+| LinkedIn/social media | Team changes, company news, content strategy | Monthly |
+| Industry press/blogs | Funding, partnerships, market perception | As published |
+| Sales team feedback | Win/loss themes, objection patterns | Monthly |
+| SEO tools | Traffic, rankings, backlinks, content changes | Monthly |
+| AI system queries | AI citation patterns, brand mentions | Quarterly |
+| Customer feedback | Competitive switching reasons | Quarterly |
+
+---
+
+## Example: Populated Battlecard Header
+
+```
+COMPETITIVE BATTLECARD: SEMrush
+Last Updated: 2026-02-01
+Updated By: Content Strategy Team
+Confidence Level: High
+
+One-Sentence Summary:
+SEMrush is an all-in-one SEO and digital marketing platform that helps marketing
+professionals manage SEO, PPC, content, and social media campaigns from a single dashboard.
+
+Tagline: "Online Marketing Can Be Easy"
+
+Positioning Statement:
+For digital marketing professionals and agencies, SEMrush is the all-in-one marketing
+toolkit that provides competitive intelligence and workflow automation because it combines
+55+ tools across SEO, PPC, content, and social in one platform.
+```
diff --git a/.cursor/skills/competitor-analysis/references/example-report.md b/.cursor/skills/competitor-analysis/references/example-report.md
new file mode 100644
index 0000000000..4ef522d40a
--- /dev/null
+++ b/.cursor/skills/competitor-analysis/references/example-report.md
@@ -0,0 +1,75 @@
+# Competitor Analysis — Example Report
+
+This is a complete example output showing the expected format for competitor analysis deliverables.
+
+## Prompt
+
+**User**: "Analyze why HubSpot ranks so well for marketing-related keywords"
+
+## Example Output
+
+```markdown
+# Competitive Analysis: HubSpot
+
+## SEO Strategy Overview
+
+HubSpot dominates marketing keywords through a combination of:
+1. **Massive content moat** - 10,000+ blog posts
+2. **Free tools as linkbait** - Website grader, email signature generator
+3. **Educational brand** - Academy, certifications, courses
+4. **Topic cluster model** - Pioneered the pillar/cluster approach
+
+## What Makes Them Successful
+
+### Content Strategy
+
+**Publishing Volume**: 50-100 posts/month
+**Average Word Count**: 2,500+ words
+**Content Types**:
+- In-depth guides (35%)
+- How-to tutorials (25%)
+- Templates & examples (20%)
+- Data/research (10%)
+- Tools & calculators (10%)
+
+**Top Performing Content Pattern**:
+1. Ultimate guides on broad topics
+2. Free templates with email gate
+3. Statistics roundup posts
+4. Definition posts ("What is [term]")
+
+### GEO Success Factors
+
+HubSpot appears in AI responses frequently because:
+
+1. **Clear definitions** at the start of every post
+   > "Inbound marketing is a business methodology that attracts customers by creating valuable content and experiences tailored to them."
+
+2. **Quotable statistics**
+   > "Companies that blog get 55% more website visitors"
+
+3. **Comprehensive coverage** - AI trusts their authority
+
+### Linkable Assets
+
+| Asset | Backlinks | Why It Works |
+|-------|-----------|--------------|
+| Website Grader | 45,000+ | Free, instant value |
+| Marketing Statistics | 12,000+ | Quotable reference |
+| Blog Ideas Generator | 8,500+ | Solves real problem |
+
+## Weaknesses to Exploit
+
+1. **Content becoming dated** - Many posts 3+ years old
+2. **Generic advice** - Lacks industry-specific depth
+3. **Enterprise focus** - Underserves solopreneurs
+4. **Slow innovation** - Same formats for years
+
+## Your Opportunities
+
+1. Create more specific, niche content they can't cover
+2. Target long-tail keywords they ignore
+3. Build interactive tools in emerging areas
+4. Add original research they don't have
+5. Focus on GEO-optimized definitions in your niche
+```
diff --git a/.cursor/skills/competitor-analysis/references/positioning-frameworks.md b/.cursor/skills/competitor-analysis/references/positioning-frameworks.md
new file mode 100644
index 0000000000..42cba0ac50
--- /dev/null
+++ b/.cursor/skills/competitor-analysis/references/positioning-frameworks.md
@@ -0,0 +1,320 @@
+# Positioning Frameworks
+
+Comprehensive frameworks for analyzing competitor positioning, developing your own positioning strategy, and identifying differentiation opportunities. Use these frameworks to map competitive landscapes, craft positioning statements, and find messaging vulnerabilities.
+
+## Overview
+
+Positioning is how your product or brand occupies a distinct place in the mind of your target audience relative to competitors. Strong positioning makes your value proposition clear, your differentiation obvious, and your target audience feel understood.
+
+This reference covers:
+1. Positioning statement templates
+2. 2x2 positioning map methodology
+3. Category strategy options
+4. Differentiation playbooks
+5. Messaging vulnerability analysis
+
+---
+
+## 1. Positioning Statement Templates
+
+### Classic Positioning Statement
+
+The foundational template used across industries:
+
+```
+For [target audience],
+[product/brand] is the [category]
+that [key benefit / point of difference]
+because [reason to believe / proof point].
+```
+
+**Example (SEO tool):**
+> For growth-stage SaaS marketers, Clearscope is the content optimization platform that ensures every article ranks on page one because it uses real-time SERP data and NLP to grade content against ranking competitors.
+
+### Extended Positioning Statement
+
+Adds competitive context and emotional resonance:
+
+```
+For [target audience] who [situation/need],
+[product/brand] is the [category]
+that [functional benefit]
+unlike [competitive alternative]
+because [unique capability / proof].
+This matters because [emotional benefit / outcome].
+```
+
+**Example:**
+> For content teams who struggle to consistently rank new articles, MarketMuse is the content planning platform that identifies exactly which topics to cover, unlike manual keyword research tools, because it uses AI-driven topic models trained on ranking content. This matters because teams stop guessing and start publishing with confidence.
+
+### Before/After/Bridge Positioning
+
+Focuses on transformation:
+
+```
+BEFORE: [Current painful state for target audience]
+AFTER:  [Desired improved state]
+BRIDGE: [Your product] makes this possible by [mechanism].
+```
+
+### Problem-Agitation-Solution Positioning
+
+Focuses on pain point amplification:
+
+```
+PROBLEM:    [What the target audience struggles with]
+AGITATION:  [Why this problem is worse than they think — consequences, hidden costs]
+SOLUTION:   [How your product solves it differently than alternatives]
+```
+
+---
+
+## 2. Positioning Map (2x2 Matrix) Methodology
+
+### How to Build a Positioning Map
+
+A positioning map plots competitors on two key dimensions to visualize the competitive landscape and identify open positioning space.
+
+#### Step 1: Choose Your Axes
+
+Select two dimensions that matter most to your target audience. The axes should be:
+- **Meaningful to buyers** (not internal metrics)
+- **Differentiating** (competitors should spread across the map)
+- **Independent** (the two dimensions should not be correlated)
+
+#### Common Axis Pairs for SEO/Marketing
+
+| Axis Pair | X-Axis | Y-Axis | Best For |
+|-----------|--------|--------|---------|
+| Value Positioning | Price (Low → High) | Capability (Basic → Advanced) | Understanding market tiers |
+| UX Positioning | Ease of Use (Complex → Simple) | Power (Limited → Comprehensive) | Evaluating UX/capability tradeoffs |
+| Audience Positioning | SMB Focus ← → Enterprise Focus | Point Solution ← → Full Platform | Identifying segment gaps |
+| Innovation Positioning | Established/Stable ← → Innovative/Cutting-Edge | Niche ← → Broad | Timing market positioning |
+| Content Positioning | Data-Driven ← → Opinion-Driven | Beginner ← → Expert | Content strategy differentiation |
+
+#### Step 2: Plot Competitors
+
+Place each competitor (including yourself) on the map based on objective assessment.
+
+```
+              HIGH CAPABILITY
+                    │
+         Enterprise │  All-in-One
+         Suites     │  Platforms
+                    │
+ LOW ───────────────┼─────────────── HIGH
+ PRICE              │                PRICE
+                    │
+         Free/Basic │  Premium
+         Tools      │  Specialists
+                    │
+              LOW CAPABILITY
+```
+
+#### Step 3: Identify White Space
+
+Look for quadrants or areas where:
+- **No competitor exists** (market opportunity)
+- **Only weak competitors exist** (displacement opportunity)
+- **Crowded clusters exist** (differentiation challenge -- avoid or reframe)
+
+#### Step 4: Choose Your Position
+
+Select a position that:
+- Aligns with your actual product capabilities
+- Serves an underserved audience segment
+- Is defensible (hard for competitors to copy)
+- Connects to a clear value narrative
+
+### Positioning Map Anti-Patterns
+
+| Anti-Pattern | Description | Fix |
+|-------------|------------|-----|
+| Aspiration mapping | Plotting yourself where you want to be, not where you are | Be honest about current position; create a roadmap to desired position |
+| Vanity axes | Choosing dimensions that make you look good but don't matter to buyers | Validate axes with customer research |
+| Missing competitors | Only plotting direct competitors, missing substitutes | Include indirect competitors and alternative solutions |
+| Static mapping | Building the map once and never updating | Refresh quarterly as market shifts |
+
+---
+
+## 3. Category Strategy Options
+
+How you define your category determines your competitive set and positioning flexibility.
+
+### Strategy 1: Win the Existing Category
+
+**When to use:** You have a genuinely superior product in an established category.
+**Approach:** Accept the existing category definition and compete on being the best.
+**Messaging:** "The best [category] for [audience]"
+**Risk:** Head-to-head competition with established players.
+
+**Example:** "The best SEO tool for content marketers" (competing within "SEO tools")
+
+### Strategy 2: Create a New Sub-Category
+
+**When to use:** You have a genuine differentiator that deserves its own label.
+**Approach:** Segment the existing category and own the new segment.
+**Messaging:** "The first [new sub-category]"
+**Risk:** Market may not recognize the sub-category.
+
+**Example:** "Content optimization platform" (sub-category of "SEO tools" focused on content)
+
+### Strategy 3: Create a New Category
+
+**When to use:** Your product genuinely doesn't fit existing categories.
+**Approach:** Define a new category and position yourself as the category creator.
+**Messaging:** "Introducing [new category]: [definition]"
+**Risk:** High education cost; market may not adopt the category.
+
+**Example:** "GEO optimization platform" (new category combining SEO + AI visibility)
+
+### Strategy 4: Reframe the Category
+
+**When to use:** The existing category frames competition in a way that disadvantages you.
+**Approach:** Change how buyers think about the problem, shifting the evaluation criteria.
+**Messaging:** "Stop thinking about [old category]. Start thinking about [new frame]."
+**Risk:** Confusing prospects who use existing category to search/evaluate.
+
+**Example:** "Not another SEO tool -- a revenue intelligence platform" (reframing from SEO to revenue)
+
+### Category Strategy Decision Matrix
+
+| Factor | Win Existing | New Sub-Category | New Category | Reframe |
+|--------|-------------|-----------------|-------------|---------|
+| Product differentiation | Low-Medium | Medium-High | Very High | Medium |
+| Market education cost | Low | Medium | Very High | High |
+| Competitive intensity | High | Medium | Low (initially) | Medium |
+| SEO/search difficulty | High (competitive terms) | Medium (new terms) | Low (no competition) | Medium |
+| Sales cycle impact | Short (known category) | Medium | Long (education needed) | Medium-Long |
+
+---
+
+## 4. Differentiation Playbooks
+
+### Types of Differentiation
+
+Not all differentiation is equal. Stronger forms are harder to copy.
+
+| Type | Strength | Durability | Examples |
+|------|---------|-----------|---------|
+| **Feature** | Low | Low (easily copied) | "We have feature X" |
+| **Integration** | Medium | Medium | "We connect to Y ecosystem" |
+| **Experience** | Medium-High | Medium-High | "Our UX is designed for Z workflow" |
+| **Data/Network** | High | High | "Our data from N users makes results better" |
+| **Methodology** | High | High | "We invented the X framework" |
+| **Mission/Values** | High | Very High | "We're the only [category] focused on [mission]" |
+
+### Differentiation Audit Template
+
+For each competitor, assess your differentiation strength:
+
+| Dimension | Your Approach | Competitor's Approach | Differentiation Strength | Defensibility |
+|-----------|-------------|---------------------|------------------------|--------------|
+| Core technology | [Your tech] | [Their tech] | [Weak/Medium/Strong] | [Easy/Hard to copy] |
+| Target audience | [Your audience] | [Their audience] | [Weak/Medium/Strong] | [Easy/Hard to copy] |
+| Pricing model | [Your model] | [Their model] | [Weak/Medium/Strong] | [Easy/Hard to copy] |
+| Content/education | [Your approach] | [Their approach] | [Weak/Medium/Strong] | [Easy/Hard to copy] |
+| Customer support | [Your approach] | [Their approach] | [Weak/Medium/Strong] | [Easy/Hard to copy] |
+| Data advantage | [Your data] | [Their data] | [Weak/Medium/Strong] | [Easy/Hard to copy] |
+| Brand/community | [Your brand] | [Their brand] | [Weak/Medium/Strong] | [Easy/Hard to copy] |
+
+### The "Only We" Test
+
+For each claimed differentiator, apply this test:
+
+> "Only [your company] [does X] because [unique reason]."
+
+If a competitor could truthfully make the same claim, it is not a true differentiator. Revise until the statement is uniquely yours.
+
+**Strong example:**
+> "Only Clearscope grades content against real-time SERP competitor analysis because we built a proprietary NLP model trained on ranking content patterns."
+
+**Weak example:**
+> "Only we offer great customer support." (Every company claims this.)
+
+### Differentiation Messaging Formula
+
+```
+We're the only [category] that [unique capability]
+which means [customer benefit]
+so you can [desired outcome].
+```
+
+---
+
+## 5. Messaging Vulnerability Analysis
+
+### Identifying Competitor Messaging Vulnerabilities
+
+A messaging vulnerability is a gap between what a competitor claims and what they actually deliver, or a message that sounds good but collapses under scrutiny.
+
+#### Vulnerability Types
+
+| Vulnerability Type | Description | How to Exploit |
+|-------------------|------------|---------------|
+| **Promise-Reality Gap** | They claim X but reviews say Y | Reference review data; offer proof of your delivery |
+| **Specificity Gap** | They use vague claims ("best-in-class") without proof | Be specific with your claims; use data |
+| **Audience Mismatch** | Their messaging targets one audience but product serves another | Speak directly to the underserved audience |
+| **Legacy Positioning** | Their positioning is outdated; product has evolved | Position against their OLD story, which is stuck in market perception |
+| **Feature Overload** | They list features without connecting to outcomes | Lead with outcomes, not features |
+| **Price Sensitivity** | They avoid discussing price, suggesting it's high or confusing | Be transparent about pricing; create TCO comparisons |
+
+#### Competitor Messaging Audit Worksheet
+
+For each competitor, analyze their messaging across touchpoints:
+
+| Touchpoint | Their Message | Vulnerability? | Your Counter-Message |
+|-----------|-------------|---------------|---------------------|
+| Homepage hero | [Their headline] | [Yes/No — what's weak?] | [Your alternative angle] |
+| Pricing page | [How they frame pricing] | [Yes/No — what's hidden?] | [Your transparency approach] |
+| Blog/content | [Content themes and tone] | [Yes/No — what's missing?] | [Your content differentiation] |
+| Social media | [Social messaging and engagement] | [Yes/No — what's performative?] | [Your authentic approach] |
+| Sales emails/outreach | [Known sales messaging] | [Yes/No — what's pushy/generic?] | [Your consultative approach] |
+| Customer reviews | [What customers actually say] | [Yes/No — gap from their claims?] | [Highlight alignment of your claims + reviews] |
+
+### Counter-Positioning Strategies
+
+| Strategy | When to Use | Messaging Pattern |
+|----------|-----------|-----------------|
+| **Contrast** | When competitor is dominant but has clear weakness | "Unlike [competitor], we [your strength]" |
+| **Flanking** | When competitor ignores a valuable segment | "Built specifically for [underserved segment]" |
+| **Reframing** | When competitor's strength is actually a weakness in disguise | "[Competitor's feature] sounds good until you realize [negative consequence]" |
+| **Elevation** | When competing on features is a losing game | "Stop comparing features. What matters is [higher-level outcome]" |
+| **Specificity** | When competitor uses vague claims | "[Specific metric] for [specific audience] in [specific timeframe]" |
+
+### Messaging Strength Test
+
+Rate each competitor's messaging on these dimensions (1-5):
+
+| Dimension | What to Evaluate | Score |
+|-----------|-----------------|-------|
+| Clarity | Can you understand their value prop in 5 seconds? | [1-5] |
+| Specificity | Do they use concrete numbers and claims? | [1-5] |
+| Differentiation | Is their message distinct from competitors? | [1-5] |
+| Proof | Do they back claims with evidence? | [1-5] |
+| Audience fit | Does the message resonate with their target? | [1-5] |
+| Consistency | Is messaging consistent across all touchpoints? | [1-5] |
+| Emotional resonance | Does it connect with real pain or aspiration? | [1-5] |
+
+**Total Score** = Sum / 35
+
+| Score Range | Assessment |
+|------------|-----------|
+| 0.8-1.0 | Very strong messaging -- find a niche angle to compete |
+| 0.6-0.79 | Solid messaging -- exploit specific gaps |
+| 0.4-0.59 | Average messaging -- multiple attack vectors available |
+| Below 0.4 | Weak messaging -- opportunity to dominate with clarity |
+
+---
+
+## Applying These Frameworks Together
+
+### Recommended Workflow
+
+1. **Start with Positioning Maps** to understand the landscape visually
+2. **Analyze Category Strategy** to decide how to frame your competitive set
+3. **Build Positioning Statement** using the templates
+4. **Audit Differentiation** to ensure claims are defensible
+5. **Identify Messaging Vulnerabilities** to find attack angles
+6. **Create Battlecards** (see [battlecard-template.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/references/battlecard-template.md)) incorporating all insights
+7. **Review quarterly** as the competitive landscape shifts
diff --git a/.cursor/skills/content-gap-analysis/SKILL.md b/.cursor/skills/content-gap-analysis/SKILL.md
new file mode 100644
index 0000000000..0e4e0e7883
--- /dev/null
+++ b/.cursor/skills/content-gap-analysis/SKILL.md
@@ -0,0 +1,287 @@
+---
+name: content-gap-analysis
+description: 'Find content gaps: topics and keywords competitors cover that you don''t, with editorial calendar. 内容缺口/选题规划'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when finding content gaps between two domains, discovering missing topics, or identifying coverage holes versus competitors."
+argument-hint: "<your domain> <competitor domain>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - geo
+    - content-gaps
+    - topic-analysis
+    - content-strategy
+    - editorial-calendar
+    - competitive-gap
+    - content-opportunities
+    - 内容缺口
+    - コンテンツギャップ
+    - 콘텐츠갭
+    - brechas-contenido
+  triggers:
+    # EN-formal
+    - "find content gaps"
+    - "content opportunities"
+    - "topic analysis"
+    - "content strategy gaps"
+    - "editorial calendar"
+    - "untapped topics"
+    # EN-casual
+    - "what am I missing"
+    - "topics to cover"
+    - "what do competitors write about"
+    - "what should I cover next"
+    - "topics I haven't written about"
+    - "they cover this but I don't"
+    # EN-question
+    - "what topics am I missing"
+    - "what content should I create"
+    # ZH-pro
+    - "内容缺口分析"
+    - "选题规划"
+    - "内容机会"
+    - "竞品话题"
+    # ZH-casual
+    - "缺什么内容"
+    - "竞品写了什么"
+    - "还应该写什么"
+    # JA
+    - "コンテンツギャップ"
+    - "コンテンツ機会"
+    # KO
+    - "콘텐츠 갭 분석"
+    - "콘텐츠 기회"
+    # ES
+    - "brechas de contenido"
+    - "oportunidades de contenido"
+    # PT
+    - "lacunas de conteúdo"
+    # Misspellings
+    - "content gab analysis"
+---
+
+# Content Gap Analysis
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This research skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+Identifies content opportunities by analyzing gaps between a site's content and competitors'. Surfaces missing topics, untapped keywords, and content formats worth creating.
+
+**System role**: Research layer skill. It turns market signals into reusable strategic inputs for the rest of the library.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs reusable market intelligence that should influence strategy, not just an ad hoc answer.
+
+- Planning content strategy and editorial calendar
+- Finding quick-win content opportunities
+- Understanding where competitors outperform you
+- Identifying underserved topics in your niche
+- Expanding into adjacent topic areas
+- Prioritizing content creation efforts
+- Finding GEO opportunities competitors miss
+
+## What This Skill Does
+
+1. **Keyword Gap Analysis**: Finds keywords competitors rank for that you don't
+2. **Topic Coverage Mapping**: Identifies topic areas needing more content
+3. **Content Format Gaps**: Reveals missing content types (videos, tools, guides)
+4. **Audience Need Mapping**: Matches gaps to audience journey stages
+5. **GEO Opportunity Detection**: Finds AI-answerable topics you're missing
+6. **Priority Scoring**: Ranks gaps by impact and effort
+7. **Content Calendar Creation**: Plans gap-filling content schedule
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Basic Gap Analysis
+
+```
+Find content gaps between my site [URL] and [competitor URLs]
+```
+
+```
+What content am I missing compared to my top 3 competitors?
+```
+
+### Topic-Specific Analysis
+
+```
+Find content gaps in [topic area] compared to industry leaders
+```
+
+```
+What [content type] do competitors have that I don't?
+```
+
+### Audience-Focused
+
+```
+What content gaps exist for [audience segment] in my niche?
+```
+
+## Skill Contract
+
+**Expected output**: a prioritized research brief, evidence-backed findings, and a short handoff summary ready for `memory/research/`.
+
+- **Reads**: user goals, target market inputs, available tool data, and prior strategy from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing research deliverable plus a reusable summary that can be stored under `memory/research/`.
+- **Promotes**: durable keyword priorities, competitor facts, entity candidates, and strategic decisions to `CLAUDE.md`, `memory/decisions.md`, and `memory/research/`; hand canonical entity work to `entity-optimizer`.
+- **Next handoff**: use the `Next Best Skill` below when the findings are ready to drive action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~search console + ~~analytics + ~~AI monitor connected:**
+Automatically pull your site's content inventory from ~~search console and ~~analytics (indexed pages, traffic per page, keywords ranking), competitor content data from ~~SEO tool (ranking keywords, top pages, backlink counts), and AI citation patterns from ~~AI monitor. Keyword overlap analysis and gap identification can be automated.
+
+**With manual data only:**
+Ask the user to provide:
+1. Your site URL and content inventory (list of published content with topics)
+2. Competitor URLs (3-5 sites)
+3. Your current traffic and keyword performance (if available)
+4. Known content strengths and weaknesses
+5. Industry context and business goals
+
+Proceed with the full analysis using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests content gap analysis:
+
+1. **Define Analysis Scope**
+
+   Clarify parameters:
+   
+   ```markdown
+   ### Analysis Parameters
+   
+   **Your Site**: [URL]
+   **Competitors to Analyze**: [URLs or "identify for me"]
+   **Topic Focus**: [specific area or "all"]
+   **Content Types**: [blogs, guides, tools, videos, or "all"]
+   **Audience**: [target audience]
+   **Business Goals**: [traffic, leads, authority, etc.]
+   ```
+
+2. **Audit Your Existing Content**
+
+   Document total indexed pages, content by type and topic cluster, top performing content, and content strengths/weaknesses.
+
+3. **Analyze Competitor Content**
+
+   For each competitor: document content volume, monthly traffic, content distribution by type, topic coverage vs. yours, and unique content they have.
+
+4. **Identify Keyword Gaps**
+
+   Find keywords competitors rank for that you do not. Categorize into High Priority (high volume, achievable difficulty), Quick Wins (lower volume, low difficulty), and Long-term (high volume, high difficulty). Include keyword overlap analysis.
+
+5. **Map Topic Gaps**
+
+   Create a topic coverage comparison matrix across all competitors. For each missing topic cluster, document business relevance, competitor coverage, opportunity size, sub-topics, and recommended pillar/cluster approach.
+
+6. **Identify Content Format Gaps**
+
+   Compare format distribution (guides, tutorials, comparisons, case studies, tools, templates, video, infographics, research) against competitors and industry averages. For each gap, assess effort and expected impact.
+
+7. **Analyze GEO/AI Gaps**
+
+   Identify topics where competitors get AI citations but you do not. Document missing Q&A content, definition/explanation content, and comparison content. Score each by traditional SEO value and GEO value.
+
+8. **Map to Audience Journey**
+
+   Compare funnel stage coverage (Awareness, Consideration, Decision, Retention) against competitor averages. Detail specific gaps at each stage.
+
+9. **Prioritize and Create Action Plan**
+
+   Produce a final report with: Executive Summary, Prioritized Gap List (Tier 1 Quick Wins, Tier 2 Strategic Builds, Tier 3 Long-term), Content Calendar, and Success Metrics.
+
+   > **Reference**: See [references/analysis-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/content-gap-analysis/references/analysis-templates.md) for detailed templates for each step.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Your content inventory is complete or representative sample provided
+- [ ] Competitor URLs identified (minimum 2-3 competitors)
+- [ ] Analysis scope defined (specific topics or comprehensive)
+- [ ] Business goals and priorities clarified
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] Gap analysis compares like-to-like content (topic clusters to topic clusters)
+- [ ] Priority scoring based on measurable criteria (volume, difficulty, business fit)
+- [ ] Content calendar maps gaps to realistic timeframes
+- [ ] Source of each data point clearly stated (~~SEO tool data, ~~analytics data, ~~AI monitor data, user-provided, or estimated)
+
+## Example
+
+> **Reference**: See [references/example-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/content-gap-analysis/references/example-report.md) for a complete example analyzing SaaS marketing blog gaps vs. HubSpot and Drift.
+
+## Advanced Analysis
+
+### Competitive Cluster Comparison
+
+```
+Compare our topic cluster coverage for [topic] vs top 5 competitors
+```
+
+### Temporal Gap Analysis
+
+```
+What content have competitors published in the last 6 months that we haven't covered?
+```
+
+### Intent-Based Gaps
+
+```
+Find gaps in our [commercial/informational] intent content
+```
+
+## Tips for Success
+
+1. **Focus on actionable gaps** - Not all gaps are worth filling
+2. **Consider your resources** - Prioritize based on ability to execute
+3. **Quality over quantity** - Better to fill 5 gaps well than 20 poorly
+4. **Track what works** - Measure gap-filling success
+5. **Update regularly** - Gaps change as competitors publish
+6. **Include GEO opportunities** - Don't just optimize for traditional search
+
+
+
+### Save Results
+
+After delivering findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/research/content-gap-analysis/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Analysis Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/content-gap-analysis/references/analysis-templates.md) — Detailed templates for each analysis step (inventory, competitor content, keyword gaps, topic gaps, format gaps, GEO gaps, journey, prioritized report)
+- [Gap Analysis Frameworks](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/content-gap-analysis/references/gap-analysis-frameworks.md) — Content audit matrices, funnel mapping, and gap prioritization scoring methodologies
+- [Example Report](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/content-gap-analysis/references/example-report.md) — Complete example analyzing SaaS marketing blog gaps vs. HubSpot and Drift
+
+## Next Best Skill
+
+- **Primary**: [seo-content-writer](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/seo-content-writer/SKILL.md) — turn missing topics into a draft or content roadmap.
diff --git a/.cursor/skills/content-gap-analysis/references/analysis-templates.md b/.cursor/skills/content-gap-analysis/references/analysis-templates.md
new file mode 100644
index 0000000000..487919a67e
--- /dev/null
+++ b/.cursor/skills/content-gap-analysis/references/analysis-templates.md
@@ -0,0 +1,335 @@
+# Content Gap Analysis — Analysis Templates
+
+Templates for each step of the content gap analysis workflow. Use these to structure your output.
+
+## Content Inventory Template
+
+```markdown
+## Your Content Inventory
+
+**Total Indexed Pages**: [X]
+**Content by Type**:
+- Blog posts: [X]
+- Landing pages: [X]
+- Resource pages: [X]
+- Tools/calculators: [X]
+- Case studies: [X]
+
+**Content by Topic Cluster**:
+
+| Topic | Articles | Keywords Ranking | Traffic |
+|-------|----------|------------------|---------|
+| [topic 1] | [X] | [X] | [X] |
+| [topic 2] | [X] | [X] | [X] |
+| [topic 3] | [X] | [X] | [X] |
+
+**Top Performing Content**:
+1. [Title] - [traffic] visits - [keywords] keywords
+2. [Title] - [traffic] visits - [keywords] keywords
+3. [Title] - [traffic] visits - [keywords] keywords
+
+**Content Strengths**:
+- [Strength 1]
+- [Strength 2]
+
+**Content Weaknesses**:
+- [Weakness 1]
+- [Weakness 2]
+```
+
+## Competitor Content Analysis Template
+
+```markdown
+## Competitor Content Analysis
+
+### Competitor 1: [Name/URL]
+
+**Content Volume**: [X] pages
+**Monthly Traffic**: [X] visits
+
+**Content Distribution**:
+| Type | Count | Est. Traffic |
+|------|-------|--------------|
+| Blog posts | [X] | [X] |
+| Guides | [X] | [X] |
+| Tools | [X] | [X] |
+| Videos | [X] | [X] |
+
+**Topic Coverage**:
+| Topic | Articles | Your Coverage |
+|-------|----------|---------------|
+| [topic] | [X] | [X or "None"] |
+
+**Unique Content They Have**:
+1. [Content piece] - [traffic] - [why it works]
+2. [Content piece] - [traffic] - [why it works]
+
+[Repeat for each competitor]
+```
+
+## Keyword Gap Analysis Template
+
+```markdown
+## Keyword Gap Analysis
+
+### Keywords Competitors Rank For (You Don't)
+
+**High Priority Gaps** (High volume, achievable difficulty)
+
+| Keyword | Volume | Difficulty | Competitor | Their Position |
+|---------|--------|------------|------------|----------------|
+| [kw 1] | [vol] | [diff] | [comp] | [pos] |
+| [kw 2] | [vol] | [diff] | [comp] | [pos] |
+| [kw 3] | [vol] | [diff] | [comp] | [pos] |
+
+**Quick Win Gaps** (Lower volume, low difficulty)
+
+| Keyword | Volume | Difficulty | Competitor | Their Position |
+|---------|--------|------------|------------|----------------|
+| [kw 1] | [vol] | [diff] | [comp] | [pos] |
+
+**Long-term Gaps** (High volume, high difficulty)
+
+| Keyword | Volume | Difficulty | Competitor | Their Position |
+|---------|--------|------------|------------|----------------|
+| [kw 1] | [vol] | [diff] | [comp] | [pos] |
+
+### Keyword Overlap Analysis
+
+```
+Venn Diagram Representation:
+
+       You          Competitor 1
+        ○               ○
+       / \             / \
+      /   \           /   \
+     /  A  \ B       / C   \
+    /       \       /       \
+   ○─────────○─────○─────────○
+             Competitor 2
+
+A: Keywords only you rank for: [X]
+B: Overlap with Comp 1: [X]
+C: Keywords all competitors share: [X]
+Gap: Keywords they all have, you don't: [X]
+```
+
+**Unique Keywords (Your Advantage)**:
+| Keyword | Your Position | Volume |
+|---------|---------------|--------|
+| [kw] | [pos] | [vol] |
+```
+
+## Topic Gap Analysis Template
+
+```markdown
+## Topic Gap Analysis
+
+### Topic Coverage Comparison
+
+| Topic Area | You | Comp 1 | Comp 2 | Comp 3 | Gap? |
+|------------|-----|--------|--------|--------|------|
+| [Topic 1] | ✅ [X] | ✅ [X] | ✅ [X] | ✅ [X] | No |
+| [Topic 2] | ❌ 0 | ✅ [X] | ✅ [X] | ✅ [X] | **Yes** |
+| [Topic 3] | ✅ [X] | ✅ [X] | ❌ 0 | ✅ [X] | Partial |
+| [Topic 4] | ❌ 0 | ✅ [X] | ✅ [X] | ❌ 0 | **Yes** |
+
+### Missing Topic Clusters
+
+#### Gap 1: [Topic Area]
+
+**Why it matters**: [Business relevance]
+**Competitor coverage**: [Who covers it and how]
+**Opportunity size**: [Traffic/keyword potential]
+
+**Sub-topics to cover**:
+1. [Sub-topic] - [X] search volume
+2. [Sub-topic] - [X] search volume
+3. [Sub-topic] - [X] search volume
+
+**Recommended approach**:
+- Pillar content: [topic]
+- Cluster articles: [list]
+- Supporting content: [list]
+```
+
+## Content Format Gap Template
+
+```markdown
+## Content Format Gap Analysis
+
+### Format Distribution Comparison
+
+| Format | You | Comp 1 | Comp 2 | Industry Avg |
+|--------|-----|--------|--------|--------------|
+| Long-form guides | [X] | [X] | [X] | [X] |
+| Tutorials | [X] | [X] | [X] | [X] |
+| Comparison posts | [X] | [X] | [X] | [X] |
+| Case studies | [X] | [X] | [X] | [X] |
+| Tools/calculators | [X] | [X] | [X] | [X] |
+| Templates | [X] | [X] | [X] | [X] |
+| Video content | [X] | [X] | [X] | [X] |
+| Infographics | [X] | [X] | [X] | [X] |
+| Original research | [X] | [X] | [X] | [X] |
+
+### Format Gaps to Fill
+
+#### Gap: [Format Type]
+
+**Current state**: You have [X], competitors average [Y]
+**Best examples**: [Competitor content examples]
+**Opportunity**: [Description]
+**Effort to create**: [Low/Medium/High]
+**Expected impact**: [Low/Medium/High]
+
+**Recommended first project**:
+[Specific content idea]
+```
+
+## GEO Content Gap Template
+
+```markdown
+## GEO Content Gap Analysis
+
+### AI-Answerable Topics Assessment
+
+**Topics where competitors get AI citations (you don't)**:
+
+| Topic | AI Cites | Why They're Cited | Your Gap |
+|-------|----------|-------------------|----------|
+| [topic 1] | [Comp] | [reason] | [what you need] |
+| [topic 2] | [Comp] | [reason] | [what you need] |
+
+### GEO-Optimized Content Gaps
+
+**Missing Q&A Content**:
+| Question | Search Volume | Currently Answered By |
+|----------|---------------|----------------------|
+| [question] | [vol] | [competitor] |
+
+**Missing Definition/Explanation Content**:
+| Term | Search Volume | Best Current Source |
+|------|---------------|---------------------|
+| [term] | [vol] | [source] |
+
+**Missing Comparison Content**:
+| Comparison | Search Volume | Best Current Source |
+|------------|---------------|---------------------|
+| [A vs B] | [vol] | [source] |
+
+### GEO Opportunity Score
+
+| Topic | Traditional SEO Value | GEO Value | Combined Priority |
+|-------|----------------------|-----------|-------------------|
+| [topic] | [score] | [score] | [priority] |
+```
+
+## Audience Journey Gap Template
+
+```markdown
+## Audience Journey Gap Analysis
+
+### Funnel Stage Coverage
+
+| Stage | Your Content | Competitor Avg | Gap |
+|-------|--------------|----------------|-----|
+| Awareness | [X] articles | [X] articles | [+/-X] |
+| Consideration | [X] articles | [X] articles | [+/-X] |
+| Decision | [X] articles | [X] articles | [+/-X] |
+| Retention | [X] articles | [X] articles | [+/-X] |
+
+### Journey Gap Details
+
+#### Awareness Stage Gaps
+- Missing: [topics/content]
+- Opportunity: [description]
+
+#### Consideration Stage Gaps
+- Missing: [topics/content]
+- Opportunity: [description]
+
+#### Decision Stage Gaps
+- Missing: [topics/content]
+- Opportunity: [description]
+```
+
+## Prioritized Report Template
+
+```markdown
+# Content Gap Analysis Report
+
+## Executive Summary
+
+**Analysis Date**: [Date]
+**Sites Analyzed**: [Your site] vs [Competitors]
+
+**Key Findings**:
+1. [Most significant gap]
+2. [Second significant gap]
+3. [Third significant gap]
+
+**Total Opportunity**:
+- Keywords gaps identified: [X]
+- Estimated traffic opportunity: [X]/month
+- Quick wins available: [X] pieces
+
+---
+
+## Prioritized Gap List
+
+### Tier 1: Quick Wins (Do Now)
+
+| Content to Create | Target Keyword | Volume | Difficulty | Impact |
+|-------------------|----------------|--------|------------|--------|
+| [Title idea] | [keyword] | [vol] | [diff] | High |
+| [Title idea] | [keyword] | [vol] | [diff] | High |
+
+**Why prioritize**: Low effort, immediate ranking potential
+
+### Tier 2: Strategic Builds (This Quarter)
+
+| Content to Create | Target Keyword | Volume | Difficulty | Impact |
+|-------------------|----------------|--------|------------|--------|
+| [Title idea] | [keyword] | [vol] | [diff] | High |
+
+**Why prioritize**: High value, requires more resources
+
+### Tier 3: Long-term Investments (This Year)
+
+| Content to Create | Target Keyword | Volume | Difficulty | Impact |
+|-------------------|----------------|--------|------------|--------|
+| [Title idea] | [keyword] | [vol] | [diff] | High |
+
+**Why prioritize**: Builds authority, competitive differentiator
+
+---
+
+## Content Calendar Recommendation
+
+### Month 1
+| Week | Content | Type | Target Keyword | Status |
+|------|---------|------|----------------|--------|
+| 1 | [Title] | [Type] | [Keyword] | Planned |
+| 2 | [Title] | [Type] | [Keyword] | Planned |
+| 3 | [Title] | [Type] | [Keyword] | Planned |
+| 4 | [Title] | [Type] | [Keyword] | Planned |
+
+### Month 2
+[Continue...]
+
+### Month 3
+[Continue...]
+
+---
+
+## Success Metrics
+
+Track these to measure gap-filling success:
+
+| Metric | Current | 3-Month Target | 6-Month Target |
+|--------|---------|----------------|----------------|
+| Keyword coverage | [X] | [X] | [X] |
+| Topic clusters complete | [X] | [X] | [X] |
+| Traffic from new content | [X] | [X] | [X] |
+| AI citations | [X] | [X] | [X] |
+```
diff --git a/.cursor/skills/content-gap-analysis/references/example-report.md b/.cursor/skills/content-gap-analysis/references/example-report.md
new file mode 100644
index 0000000000..0a4438dce6
--- /dev/null
+++ b/.cursor/skills/content-gap-analysis/references/example-report.md
@@ -0,0 +1,47 @@
+# Content Gap Analysis — Example Report
+
+This is a complete example output showing the expected format for content gap analysis deliverables.
+
+## Prompt
+
+**User**: "Find content gaps for my SaaS marketing blog compared to HubSpot and Drift"
+
+## Example Output
+
+```markdown
+# Content Gap Analysis: SaaS Marketing Blog
+
+## Executive Summary
+
+Compared to HubSpot and Drift, your blog has significant gaps in:
+1. **Interactive tools** - They have 15+, you have 0
+2. **Comparison content** - Missing "[Your Tool] vs [Competitor]" pages
+3. **GEO-optimized definitions** - No glossary or term definitions
+
+Total opportunity: ~25,000 monthly visits from 45 keyword gaps
+
+## Top Keyword Gaps
+
+### Quick Wins (Difficulty <40)
+
+| Keyword | Volume | Difficulty | Who Ranks |
+|---------|--------|------------|-----------|
+| saas marketing metrics | 1,200 | 32 | HubSpot #3 |
+| b2b email sequences | 890 | 28 | Drift #5 |
+| saas onboarding emails | 720 | 25 | Neither! |
+| marketing qualified lead definition | 1,800 | 35 | HubSpot #1 |
+
+### Content Format Gaps
+
+**You're missing**:
+- [ ] Interactive ROI calculator (HubSpot gets 15k visits/mo from theirs)
+- [ ] Email template library (Drift's gets 8k visits/mo)
+- [ ] Marketing glossary (HubSpot's definition pages rank for 500+ keywords)
+
+## Recommended Content Calendar
+
+**Week 1**: "SaaS Marketing Metrics: Complete Guide" (Quick win)
+**Week 2**: "What is a Marketing Qualified Lead?" (GEO opportunity)
+**Week 3**: "B2B Email Sequence Templates" (Format gap)
+**Week 4**: "[Your Tool] vs HubSpot" (Comparison gap)
+```
diff --git a/.cursor/skills/content-gap-analysis/references/gap-analysis-frameworks.md b/.cursor/skills/content-gap-analysis/references/gap-analysis-frameworks.md
new file mode 100644
index 0000000000..a9ebc5a462
--- /dev/null
+++ b/.cursor/skills/content-gap-analysis/references/gap-analysis-frameworks.md
@@ -0,0 +1,318 @@
+# Gap Analysis Frameworks
+
+Complete frameworks for systematic keyword gap identification, content format gap analysis, funnel stage gap mapping, opportunity scoring models, and content calendar integration. Use these frameworks to transform gap analysis from ad-hoc observation into repeatable process.
+
+## Overview
+
+Content gap analysis answers three questions:
+1. **What are competitors covering that you are not?** (Keyword and topic gaps)
+2. **What content formats are expected but missing?** (Format gaps)
+3. **Where in the buyer journey does your content fail?** (Funnel gaps)
+
+This reference provides structured methodologies for each question, plus scoring and prioritization systems.
+
+---
+
+## 1. Systematic Keyword Gap Methodology
+
+### Step-by-Step Process
+
+#### Step 1: Define Your Keyword Universe
+
+Before finding gaps, establish what you already rank for:
+
+| Data Source | What It Provides | How to Collect |
+|------------|-----------------|---------------|
+| ~~search console | Keywords driving impressions and clicks to your site | Export Performance report (last 6 months) |
+| ~~SEO tool | Full keyword profile including positions 1-100 | Domain keyword export |
+| Content audit | Topics covered by existing content | Manual or automated URL crawl + categorization |
+
+**Output:** A master list of all keywords you currently rank for (positions 1-100).
+
+#### Step 2: Build Competitor Keyword Profiles
+
+For each competitor (3-5 recommended):
+
+| Competitor | Total Keywords | Keywords in Top 10 | Keywords in Top 3 | Estimated Traffic |
+|-----------|---------------|-------------------|-------------------|------------------|
+| [Competitor A] | [Count] | [Count] | [Count] | [Monthly est.] |
+| [Competitor B] | [Count] | [Count] | [Count] | [Monthly est.] |
+| [Competitor C] | [Count] | [Count] | [Count] | [Monthly est.] |
+| **You** | [Count] | [Count] | [Count] | [Monthly est.] |
+
+#### Step 3: Calculate Keyword Overlap
+
+Identify four keyword segments:
+
+```
+┌──────────────────────────────────────────────────────────┐
+│                                                          │
+│  A: Keywords ONLY you rank for (your unique advantage)   │
+│                                                          │
+│  B: Keywords you AND competitors share (competitive)     │
+│                                                          │
+│  C: Keywords ONLY competitors rank for (your gaps)       │
+│                                                          │
+│  D: Keywords NO ONE ranks for (market opportunity)       │
+│                                                          │
+└──────────────────────────────────────────────────────────┘
+```
+
+| Segment | Size | Strategic Meaning | Action |
+|---------|------|------------------|--------|
+| A (Only you) | [Count] | Your content moat | Protect and strengthen |
+| B (Shared) | [Count] | Competitive battleground | Improve rankings |
+| C (Only them) | [Count] | Your content gaps | Prioritize and create |
+| D (No one) | [Count] | Untapped market | Evaluate and pioneer |
+
+#### Step 4: Filter and Categorize Gaps (Segment C)
+
+Not all gaps are worth filling. Filter by:
+
+| Filter | Threshold | Purpose |
+|--------|-----------|---------|
+| Minimum search volume | >100/month (adjust for niche) | Ensure traffic potential |
+| Maximum keyword difficulty | <your domain's competitive range | Ensure you can rank |
+| Business relevance | Must relate to your product/service | Ensure strategic fit |
+| Intent match | Must match content types you can create | Ensure execution capability |
+
+**After filtering, categorize remaining gaps:**
+
+| Category | Description | Example |
+|----------|-----------|---------|
+| Topic gaps | Entire topics you haven't covered | Competitor has 10 articles on "technical SEO"; you have zero |
+| Depth gaps | You cover the topic but competitor goes deeper | You have 1 blog post; competitor has a pillar page + 8 cluster articles |
+| Angle gaps | Same topic but different perspective or audience | Competitor targets enterprise; you could target SMB |
+| Format gaps | Same keyword but different content format | Competitor has a video tutorial; you only have text |
+| Freshness gaps | Same topic but their content is more current | Competitor updated in 2026; yours is from 2024 |
+
+#### Step 5: Prioritize Gaps
+
+Use the scoring model from Section 4 below.
+
+---
+
+## 2. Content Format Gap Analysis
+
+### Format Audit Methodology
+
+#### Step 1: Inventory Your Content Formats
+
+Categorize all your published content:
+
+| Format | Count | % of Total | Avg. Traffic/Page | Top Performer |
+|--------|-------|-----------|-------------------|--------------|
+| Blog posts (text) | [X] | [X]% | [X] | [URL] |
+| Long-form guides | [X] | [X]% | [X] | [URL] |
+| How-to tutorials | [X] | [X]% | [X] | [URL] |
+| Comparison/vs pages | [X] | [X]% | [X] | [URL] |
+| Case studies | [X] | [X]% | [X] | [URL] |
+| Templates/downloads | [X] | [X]% | [X] | [URL] |
+| Interactive tools | [X] | [X]% | [X] | [URL] |
+| Video content | [X] | [X]% | [X] | [URL] |
+| Infographics | [X] | [X]% | [X] | [URL] |
+| Original research/data | [X] | [X]% | [X] | [URL] |
+| Glossary/definitions | [X] | [X]% | [X] | [URL] |
+| Webinars/recordings | [X] | [X]% | [X] | [URL] |
+| Podcasts/audio | [X] | [X]% | [X] | [URL] |
+
+#### Step 2: Benchmark Against Competitors
+
+| Format | You | Comp A | Comp B | Comp C | Industry Benchmark |
+|--------|-----|--------|--------|--------|-------------------|
+| Blog posts | [X] | [X] | [X] | [X] | Table stakes |
+| Long-form guides | [X] | [X] | [X] | [X] | Expected for authority |
+| How-to tutorials | [X] | [X] | [X] | [X] | Expected for SEO |
+| Comparison pages | [X] | [X] | [X] | [X] | Required for commercial |
+| Case studies | [X] | [X] | [X] | [X] | Expected for B2B |
+| Templates | [X] | [X] | [X] | [X] | Lead gen standard |
+| Interactive tools | [X] | [X] | [X] | [X] | Differentiator |
+| Video | [X] | [X] | [X] | [X] | Growing expectation |
+| Original research | [X] | [X] | [X] | [X] | Linkbait standard |
+| Glossary | [X] | [X] | [X] | [X] | GEO opportunity |
+
+#### Step 3: Identify Format Gaps
+
+For each format you are missing or underrepresenting:
+
+| Missing Format | Competitor Proof | Estimated Effort | Expected Impact | Priority |
+|---------------|-----------------|-----------------|----------------|---------|
+| [Format 1] | [Which competitor succeeds + traffic est.] | [Low/Med/High] | [Low/Med/High] | [P0-P3] |
+| [Format 2] | [Evidence] | [Effort] | [Impact] | [Priority] |
+
+#### Step 4: Format-SERP Feature Alignment
+
+Certain formats unlock specific SERP features:
+
+| Content Format | Unlocks SERP Feature | Schema Required |
+|---------------|---------------------|----------------|
+| FAQ sections | FAQ rich results, PAA eligibility | FAQPage |
+| Step-by-step tutorials | How-To rich results, Featured Snippet (list) | HowTo |
+| Review/comparison content | Review stars, AI Overview citations | Review, AggregateRating |
+| Video content | Video carousel, Video rich result | VideoObject |
+| Product pages | Shopping results, Product rich results | Product |
+| Event pages | Event rich results | Event |
+| Glossary/definitions | Featured Snippet (paragraph), AI Overview | DefinedTerm (optional) |
+| Data tables | Featured Snippet (table), AI Overview | Table (optional) |
+
+---
+
+## 3. Funnel Stage Gap Identification
+
+### Funnel Coverage Mapping
+
+#### Step 1: Define Funnel Stages and Content Expectations
+
+| Funnel Stage | User Mindset | Content Need | Expected Formats | Typical Keywords |
+|-------------|-------------|-------------|-----------------|-----------------|
+| **Awareness** | "I have a problem/question" | Educational content | Blog posts, infographics, social content | "what is", "how to", "why does" |
+| **Interest** | "I want to learn more" | Deeper educational content | Guides, webinars, email courses | "guide to", "tutorial", "examples" |
+| **Consideration** | "What are my options?" | Comparison and evaluation | Comparison posts, reviews, case studies | "best", "vs", "review", "alternatives" |
+| **Intent** | "I'm narrowing my choices" | Decision-support content | Demos, free trials, pricing pages, ROI calculators | "pricing", "demo", "free trial" |
+| **Purchase** | "I'm ready to act" | Conversion content | Product pages, checkout, signup forms | "buy", "sign up", "get started" |
+| **Retention** | "I need help/value" | Support and engagement | Help docs, tutorials, community, newsletters | "[product] how to", "support" |
+| **Advocacy** | "I want to share/recommend" | Shareable and referral content | Case studies, review programs, referral incentives | "[product] review", "recommend" |
+
+#### Step 2: Audit Content by Funnel Stage
+
+For each stage, count your content and compare to competitors:
+
+| Funnel Stage | Your Content Count | Comp A | Comp B | Gap Size | Gap Severity |
+|-------------|-------------------|--------|--------|----------|-------------|
+| Awareness | [X] | [X] | [X] | [+/- X] | [Low/Med/High/Critical] |
+| Interest | [X] | [X] | [X] | [+/- X] | [Low/Med/High/Critical] |
+| Consideration | [X] | [X] | [X] | [+/- X] | [Low/Med/High/Critical] |
+| Intent | [X] | [X] | [X] | [+/- X] | [Low/Med/High/Critical] |
+| Purchase | [X] | [X] | [X] | [+/- X] | [Low/Med/High/Critical] |
+| Retention | [X] | [X] | [X] | [+/- X] | [Low/Med/High/Critical] |
+| Advocacy | [X] | [X] | [X] | [+/- X] | [Low/Med/High/Critical] |
+
+**Gap Severity Criteria:**
+
+| Severity | Definition |
+|----------|-----------|
+| Critical | Zero content at this stage; users drop off here |
+| High | Significantly less content than competitors; measurable conversion drop |
+| Medium | Somewhat less content; competitors have advantage but you are present |
+| Low | Roughly on par; minor opportunities for improvement |
+
+#### Step 3: Identify Drop-Off Points
+
+Map your analytics data to funnel stages to find where users leave:
+
+| Transition | Metric to Check | Drop-Off Signal | Content Gap Likely |
+|-----------|----------------|----------------|-------------------|
+| Awareness → Interest | Bounce rate on blog posts | >70% bounce rate | Missing "next step" content or CTAs |
+| Interest → Consideration | Pages per session | <2 pages/session | Missing comparison/evaluation content |
+| Consideration → Intent | Demo/trial requests | Low conversion from comparison pages | Missing trust content (case studies, reviews) |
+| Intent → Purchase | Cart/signup abandonment | High abandonment rate | Missing objection-handling content |
+| Purchase → Retention | Churn rate | High early churn | Missing onboarding/help content |
+
+---
+
+## 4. Opportunity Scoring Model
+
+### Multi-Factor Gap Scoring
+
+Score each identified gap on 5 factors (1-5 scale each):
+
+| Factor | Weight | Score 1 (Low) | Score 3 (Medium) | Score 5 (High) |
+|--------|--------|--------------|-----------------|----------------|
+| **Search Demand** | 25% | <100 monthly searches | 500-2,000 searches | >5,000 searches |
+| **Competitive Density** | 20% | All competitors cover it well | 1-2 competitors cover it | No competitor covers it |
+| **Business Relevance** | 25% | Tangential to your offering | Related to your offering | Core to your offering |
+| **Creation Effort** | 15% | Requires new capabilities | Moderate effort | Quick to create |
+| **Conversion Potential** | 15% | Pure awareness (top-funnel) | Consideration stage | Decision/transactional stage |
+
+**Gap Priority Score** = Σ (Factor Weight x Score)
+
+### Priority Tiers
+
+| Tier | Score Range | Timeline | Action |
+|------|-----------|----------|--------|
+| **P0 -- Fill Immediately** | 4.0-5.0 | This sprint / this week | High demand + low competition + high relevance |
+| **P1 -- Fill This Quarter** | 3.0-3.9 | Next 1-3 months | Good demand + some competition + relevant |
+| **P2 -- Plan for Future** | 2.0-2.9 | Next 3-6 months | Moderate opportunity, requires more resources |
+| **P3 -- Monitor** | 1.0-1.9 | Track quarterly | Low priority but may become relevant |
+
+### Quick-Win Identification
+
+A "quick win" gap scores high on Demand + Relevance and low on Effort + Competition:
+
+```
+Quick Win Score = (Search Demand + Business Relevance) - (Creation Effort + Competitive Density)
+                   Higher is better
+```
+
+| Quick Win Score | Assessment |
+|----------------|-----------|
+| 4+ | Strong quick win -- create this content first |
+| 2-3 | Moderate quick win -- include in next content sprint |
+| 0-1 | Not a quick win -- may still be strategically important |
+| Negative | Avoid -- high effort/competition relative to demand |
+
+---
+
+## 5. Content Calendar Integration
+
+### Gap-to-Calendar Workflow
+
+#### Step 1: Group Gaps by Topic Cluster
+
+Group related gaps into clusters rather than addressing them individually:
+
+| Cluster | Related Gaps | Combined Volume | Pillar Needed? | Cluster Pages Needed |
+|---------|-------------|----------------|---------------|---------------------|
+| [Cluster A] | Gap 1, Gap 3, Gap 7 | [Sum of volumes] | [Yes/No] | [Count] |
+| [Cluster B] | Gap 2, Gap 5 | [Sum of volumes] | [Yes/No] | [Count] |
+| [Cluster C] | Gap 4, Gap 6, Gap 8, Gap 9 | [Sum of volumes] | [Yes/No] | [Count] |
+
+#### Step 2: Sequence by Priority and Dependencies
+
+| Order | Content Piece | Gap(s) Addressed | Priority | Dependencies | Target Publish |
+|-------|-------------|-----------------|---------|-------------|---------------|
+| 1 | [Pillar page for Cluster A] | Gap 1, 3, 7 (overview) | P0 | None | Week 1 |
+| 2 | [Quick-win article] | Gap 2 | P0 | None | Week 2 |
+| 3 | [Cluster A subtopic 1] | Gap 3 (deep dive) | P0 | Pillar published | Week 3 |
+| 4 | [Cluster A subtopic 2] | Gap 7 (deep dive) | P1 | Pillar published | Week 4 |
+| 5 | [Comparison page] | Gap 5 | P1 | None | Week 5 |
+
+#### Step 3: Assign Resources and Track
+
+| Content Piece | Writer | Reviewer | Target Date | Status | Gap Closed? |
+|-------------|--------|---------|------------|--------|------------|
+| [Title] | [Name] | [Name] | [Date] | [Draft/Review/Published] | [Yes/No/Partial] |
+
+### Calendar Cadence by Team Size
+
+| Team Size | Monthly Output | Gap Fill Rate | Full Gap Close Timeline |
+|-----------|---------------|--------------|----------------------|
+| Solo | 4-6 pieces/month | 3-4 gaps/month | 6-12 months for 30 gaps |
+| Small (2-3) | 8-15 pieces/month | 6-10 gaps/month | 3-6 months for 30 gaps |
+| Medium (4-6) | 15-25 pieces/month | 10-20 gaps/month | 2-3 months for 30 gaps |
+| Large (7+) | 25+ pieces/month | 20+ gaps/month | 1-2 months for 30 gaps |
+
+### Post-Publication Gap Validation
+
+After publishing gap-filling content, validate that the gap is actually closed:
+
+| Checkpoint | Timeframe | What to Check | Success Criteria |
+|-----------|-----------|--------------|-----------------|
+| Indexing | 1-2 weeks | Is the page indexed? | Appears in Google index |
+| Initial ranking | 2-4 weeks | Is it ranking for target keyword? | Appears in top 100 |
+| Competitive ranking | 2-3 months | Has it reached competitive positions? | Top 20 or improving trend |
+| Traffic impact | 3-6 months | Is it driving meaningful traffic? | Meeting traffic projections |
+| Gap closure | 6 months | Has the gap been effectively closed? | Ranking comparable to competitors |
+
+---
+
+## Framework Selection Guide
+
+| Situation | Primary Framework | Secondary Framework |
+|-----------|------------------|-------------------|
+| "We need more traffic" | Keyword Gap Methodology | Content Calendar Integration |
+| "Competitors outrank us everywhere" | Keyword Gap + Format Gap | Opportunity Scoring |
+| "Our conversion rate is low" | Funnel Stage Gap | Format Gap (decision-stage formats) |
+| "We don't know what to write next" | All three gap types | Opportunity Scoring + Calendar |
+| "We have limited resources" | Opportunity Scoring (quick wins) | Keyword Gap (filtered by difficulty) |
+| "We're entering a new market" | Keyword Gap (comprehensive) | Format Gap + Funnel Gap |
diff --git a/.cursor/skills/content-quality-auditor/SKILL.md b/.cursor/skills/content-quality-auditor/SKILL.md
new file mode 100644
index 0000000000..c3679a8e27
--- /dev/null
+++ b/.cursor/skills/content-quality-auditor/SKILL.md
@@ -0,0 +1,403 @@
+---
+name: content-quality-auditor
+description: 'Publish-readiness gate: 80-item CORE-EEAT audit with weighted scoring, veto checks, and fix plan. 内容质量/EEAT评分'
+version: "6.0.0"
+license: Apache-2.0
+allowed-tools: WebFetch
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when auditing content quality before publishing. Runs CORE-EEAT 80-item scoring with veto checks. Also when the user asks for E-E-A-T analysis or publish readiness."
+argument-hint: "<URL or paste content> [keyword]"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "high"
+  tags:
+    - seo
+    - geo
+    - e-e-a-t
+    - core-eeat
+    - content-quality
+    - content-scoring
+    - helpful-content
+    - publish-readiness
+    - 内容质量
+    - コンテンツ品質
+    - 콘텐츠품질
+    - auditoria-eeat
+  triggers:
+    # EN-formal
+    - "audit content quality"
+    - "EEAT score"
+    - "CORE-EEAT audit"
+    - "content quality check"
+    - "content assessment"
+    - "quality score"
+    # EN-casual
+    - "is this ready to publish"
+    - "grade my article"
+    - "check before publishing"
+    - "how good is my content"
+    - "is my content good enough to rank"
+    - "rate my content quality"
+    # EN-question
+    - "is my content ready to publish"
+    - "how do I improve content quality"
+    # ZH-pro
+    - "内容质量审计"
+    - "EEAT评分"
+    - "内容评估"
+    # ZH-casual
+    - "文章能发吗"
+    - "内容打几分"
+    - "文章写得怎么样"
+    # JA
+    - "コンテンツ品質監査"
+    - "E-E-A-T評価"
+    # KO
+    - "콘텐츠 품질 감사"
+    - "EEAT 점수"
+    # ES
+    - "auditoría de calidad de contenido"
+    - "puntuación EEAT"
+    # PT
+    - "auditoria de qualidade"
+    # Misspellings
+    - "EEAT scroe"
+---
+
+# Content Quality Auditor
+
+> Based on [CORE-EEAT Content Benchmark](https://github.com/aaron-he-zhu/core-eeat-content-benchmark). Full benchmark reference: [references/core-eeat-benchmark.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md)
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This cross-cutting skill is part of the protocol layer and follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill evaluates content quality across 80 standardized criteria organized in 8 dimensions. It produces a comprehensive audit report with per-item scoring, dimension and system scores, weighted totals by content type, and a prioritized action plan.
+
+**System role**: Publish Readiness Gate. It decides whether content is ready to ship, what blocks publication, and what should be promoted into durable project memory.
+
+## When This Must Trigger
+
+Use this when content needs a quality check before publishing — even if the user doesn't use audit terminology:
+
+- User asks "is this ready to publish" or "how good is this"
+- User just finished writing with seo-content-writer or content-refresher
+- **PostToolUse hook auto-triggers**: after content is written or substantially edited, the hook recommends this audit. When hook-triggered, skip setup questions — audit the content that was just produced.
+- Auditing content quality before publishing
+- Evaluating existing content for improvement opportunities
+- Benchmarking content against CORE-EEAT standards
+- Comparing content quality against competitors
+- Assessing both GEO readiness (AI citation potential) and SEO strength (source credibility)
+- Running periodic content quality checks as part of a content maintenance program
+- After writing or optimizing content with seo-content-writer or geo-content-optimizer
+
+## What This Skill Does
+
+1. **Full 80-Item Audit**: Scores every CORE-EEAT check item as Pass/Partial/Fail
+2. **Dimension Scoring**: Calculates scores for all 8 dimensions (0-100 each)
+3. **System Scoring**: Computes GEO Score (CORE) and SEO Score (EEAT)
+4. **Weighted Totals**: Applies content-type-specific weights for final score
+5. **Veto Detection**: Flags critical trust violations (T04, C01, R10)
+6. **Priority Ranking**: Identifies Top 5 improvements sorted by impact
+7. **Action Plan**: Generates specific, actionable improvement steps
+
+## Quick Start
+
+Start with one of these prompts. Finish with a publish verdict and a handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Audit Content
+
+```
+Audit this content against CORE-EEAT: [content text or URL]
+```
+
+```
+Run a content quality audit on [URL] as a [content type]
+```
+
+### Audit with Content Type
+
+```
+CORE-EEAT audit for this product review: [content]
+```
+
+```
+Score this how-to guide against the 80-item benchmark: [content]
+```
+
+### Comparative Audit
+
+```
+Audit my content vs competitor: [your content] vs [competitor content]
+```
+
+## Skill Contract
+
+**Gate verdict**: **SHIP** (no veto items, dimension scores above threshold) / **FIX** (issues found but no veto) / **BLOCK** (veto item T04, C01, or R10 failed). Always state the verdict prominently at the top of the report.
+
+**Expected output**: a CORE-EEAT audit report, a publish-readiness verdict, and a short handoff summary ready for `memory/audits/content/`.
+
+- **Reads**: the target content, content type, supporting evidence, and any prior decisions from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing audit report plus a reusable summary that can be stored under `memory/audits/content/`.
+- **Promotes**: veto items and publish blockers to `memory/hot-cache.md` (auto-saved, no user confirmation needed). Top improvement priorities to `memory/open-loops.md`.
+- **Next handoff**: use the `Next Best Skill` below once the verdict is clear.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~web crawler + ~~SEO tool connected:**
+Automatically fetch page content, extract HTML structure, check schema markup, verify internal/external links, and pull competitor content for comparison.
+
+**With manual data only:**
+Ask the user to provide:
+1. Content text, URL, or file path
+2. Content type (if not auto-detectable): Product Review, How-to Guide, Comparison, Landing Page, Blog Post, FAQ Page, Alternative, Best-of, or Testimonial
+3. Optional: competitor content for benchmarking
+
+Proceed with the full 80-item audit using provided data. Note in the output which items could not be fully evaluated due to missing access (e.g., backlink data, schema markup, site-level signals).
+
+## Decision Gates
+
+When stopping to ask, always: (1) state the specific value and threshold, (2) offer numbered options with outcomes.
+
+**Stop and ask the user when:**
+- Content is under minimum word count for its type (blog/guide: 300 words; product/landing page: 150 words; FAQ: fewer than 3 entries with 50+ words each) — state the actual count and offer: (1) expand to minimum, (2) continue audit with Insufficient Data flags, (3) cancel
+- Content type cannot be auto-detected — state what you detected and ask to confirm before proceeding
+- Content is primarily media (video/image) with minimal text — ask whether to audit transcript, alt text, or skip
+- More than 50% of a dimension's items are N/A — name the dimension and ask: (1) provide supplementary data, (2) mark entire dimension as Insufficient Data
+- Any veto item triggers — flag it immediately with the item ID and ask: (1) stop for immediate fix, (2) continue full audit and flag in report
+
+**Continue silently (never stop for):**
+- Individual Partial scores within a dimension
+- Missing SEO tool data (mark items as N/A and continue)
+- Low overall score (the report is the deliverable, not a judgment call)
+- User not specifying content type (auto-detect and state your assumption)
+
+## Instructions
+
+When a user requests a content quality audit:
+
+### Step 1: Preparation
+
+```markdown
+### Audit Setup
+
+**Content**: [title or URL]
+**Content Type**: [auto-detected or user-specified]
+**Dimension Weights**: [loaded from content-type weight table]
+
+#### Veto Check (Emergency Brake)
+
+| Veto Item | Status | Action |
+|-----------|--------|--------|
+| T04: Disclosure Statements | ✅ Pass / ⚠️ VETO | [If VETO: "Add disclosure banner at page top immediately"] |
+| C01: Intent Alignment | ✅ Pass / ⚠️ VETO | [If VETO: "Rewrite title and first paragraph"] |
+| R10: Content Consistency | ✅ Pass / ⚠️ VETO | [If VETO: "Verify all data before publishing"] |
+```
+
+If any veto item triggers, flag it prominently at the top of the report and recommend immediate action before continuing the full audit.
+
+### Step 2: CORE Audit (40 items)
+
+Evaluate each item against the criteria in [references/core-eeat-benchmark.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md).
+
+Score each item:
+- **Pass** = 10 points (fully meets criteria)
+- **Partial** = 5 points (partially meets criteria)
+- **Fail** = 0 points (does not meet criteria)
+
+```markdown
+### C — Contextual Clarity
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| C01 | Intent Alignment | Pass/Partial/Fail | [specific observation] |
+| C02 | Direct Answer | Pass/Partial/Fail | [specific observation] |
+| ... | ... | ... | ... |
+| C10 | Semantic Closure | Pass/Partial/Fail | [specific observation] |
+
+**C Score**: [X]/100
+```
+
+Repeat the same table format for **O** (Organization), **R** (Referenceability), and **E** (Exclusivity), scoring all 10 items per dimension.
+
+### Step 3: EEAT Audit (40 items)
+
+```markdown
+### Exp — Experience
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| Exp01 | First-Person Narrative | Pass/Partial/Fail | [specific observation] |
+| ... | ... | ... | ... |
+
+**Exp Score**: [X]/100
+```
+
+Repeat the same table format for **Ept** (Expertise), **A** (Authority), and **T** (Trust), scoring all 10 items per dimension.
+
+See [references/item-reference.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/references/item-reference.md) for the complete 80-item ID lookup table and site-level item handling notes.
+
+### Step 4: Scoring & Report
+
+Calculate scores and generate the final report:
+
+```markdown
+## CORE-EEAT Audit Report
+
+### Overview
+
+- **Content**: [title]
+- **Content Type**: [type]
+- **Audit Date**: [date]
+- **Total Score**: [score]/100 ([rating])
+- **GEO Score**: [score]/100 | **SEO Score**: [score]/100
+- **Veto Status**: ✅ No triggers / ⚠️ [item] triggered
+
+### Dimension Scores
+
+| Dimension | Score | Rating | Weight | Weighted |
+|-----------|-------|--------|--------|----------|
+| C — Contextual Clarity | [X]/100 | [rating] | [X]% | [X] |
+| O — Organization | [X]/100 | [rating] | [X]% | [X] |
+| R — Referenceability | [X]/100 | [rating] | [X]% | [X] |
+| E — Exclusivity | [X]/100 | [rating] | [X]% | [X] |
+| Exp — Experience | [X]/100 | [rating] | [X]% | [X] |
+| Ept — Expertise | [X]/100 | [rating] | [X]% | [X] |
+| A — Authority | [X]/100 | [rating] | [X]% | [X] |
+| T — Trust | [X]/100 | [rating] | [X]% | [X] |
+| **Weighted Total** | | | | **[X]/100** |
+
+**Score Calculation**:
+- GEO Score = (C + O + R + E) / 4
+- SEO Score = (Exp + Ept + A + T) / 4
+- Weighted Score = Σ (dimension_score × content_type_weight)
+
+**Rating Scale**: 90-100 Excellent | 75-89 Good | 60-74 Medium | 40-59 Low | 0-39 Poor
+
+### N/A Item Handling
+
+When an item cannot be evaluated (e.g., A01 Backlink Profile requires site-level data not available):
+
+1. Mark the item as "N/A" with reason
+2. Exclude N/A items from the dimension score calculation
+3. Dimension Score = (sum of scored items) / (number of scored items x 10) x 100
+4. If more than 50% of a dimension's items are N/A, flag the dimension as "Insufficient Data" and exclude it from the weighted total
+5. Recalculate weighted total using only dimensions with sufficient data, re-normalizing weights to sum to 100%
+
+**Example**: Authority dimension with 8 N/A items and 2 scored items (A05=8, A07=5):
+- Dimension score = (8+5) / (2 x 10) x 100 = 65
+- But 8/10 items are N/A (>50%), so flag as "Insufficient Data -- Authority"
+- Exclude A dimension from weighted total; redistribute its weight proportionally to remaining dimensions
+
+### Per-Item Scores
+
+#### CORE — Content Body (40 Items)
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| C01 | Intent Alignment | [Pass/Partial/Fail] | [observation] |
+| C02 | Direct Answer | [Pass/Partial/Fail] | [observation] |
+| ... | ... | ... | ... |
+
+#### EEAT — Source Credibility (40 Items)
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| Exp01 | First-Person Narrative | [Pass/Partial/Fail] | [observation] |
+| ... | ... | ... | ... |
+
+### Top 5 Priority Improvements
+
+Sorted by: weight × points lost (highest impact first)
+
+1. **[ID] [Name]** — [specific modification suggestion]
+   - Current: [Fail/Partial] | Potential gain: [X] weighted points
+   - Action: [concrete step]
+
+2. **[ID] [Name]** — [specific modification suggestion]
+   - Current: [Fail/Partial] | Potential gain: [X] weighted points
+   - Action: [concrete step]
+
+3–5. [Same format]
+
+### Action Plan
+
+#### Quick Wins (< 30 minutes each)
+- [ ] [Action 1]
+- [ ] [Action 2]
+
+#### Medium Effort (1-2 hours)
+- [ ] [Action 3]
+- [ ] [Action 4]
+
+#### Strategic (Requires planning)
+- [ ] [Action 5]
+- [ ] [Action 6]
+
+### Recommended Next Steps
+
+- For full content rewrite: use `seo-content-writer` with CORE-EEAT constraints
+- For GEO optimization: use `geo-content-optimizer` targeting failed GEO-First items
+- For content refresh: use `content-refresher` with weak dimensions as focus
+- For technical fixes: run `/seo:check-technical` for site-level issues
+```
+
+### Save Results
+
+After delivering findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to the appropriate `memory/` path using filename `YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Content source identified (text, URL, or file path)
+- [ ] Content type confirmed (auto-detected or user-specified)
+- [ ] Content is substantial enough for meaningful audit (≥300 words)
+- [ ] If comparative audit, competitor content also provided
+
+### Output Validation
+- [ ] All 80 items scored (or marked N/A with reason)
+- [ ] All 8 dimension scores calculated correctly
+- [ ] Weighted total matches content-type weight configuration
+- [ ] Veto items checked and flagged if triggered
+- [ ] Top 5 improvements sorted by weighted impact, not arbitrary
+- [ ] Every recommendation is specific and actionable (not generic advice)
+- [ ] Action plan includes concrete steps with effort estimates
+
+## Example
+
+See [references/item-reference.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/references/item-reference.md) for a complete scored example showing the C dimension with all 10 items, priority improvements, and weighted scoring.
+
+## Tips for Success
+
+1. **Start with veto items** — T04, C01, R10 are deal-breakers regardless of total score
+   > These veto items are consistent with the CORE-EEAT benchmark (Section 3), which defines them as items that can override the overall score.
+2. **Focus on high-weight dimensions** — Different content types prioritize different dimensions
+3. **GEO-First items matter most for AI visibility** — Prioritize items tagged GEO 🎯 if AI citation is the goal
+4. **Some EEAT items need site-level data** — Don't penalize content for things only observable at the site level (backlinks, brand recognition)
+5. **Use the weighted score, not just the raw average** — A product review with strong Exclusivity matters more than strong Authority
+6. **Re-audit after improvements** — Run again to verify score improvements and catch regressions
+7. **Pair with CITE for domain-level context** — A high content score on a low-authority domain signals a different priority than the reverse; run [domain-authority-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/domain-authority-auditor/SKILL.md) for the full 120-item picture
+
+## Reference Materials
+
+- [CORE-EEAT Content Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md) — Full 80-item benchmark with dimension definitions, scoring criteria, and GEO-First item markers
+- [references/item-reference.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/references/item-reference.md) — All 80 item IDs in a compact lookup table + site-level item handling notes + scored example report
+
+## Next Best Skill
+
+- **Primary**: [content-refresher](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/SKILL.md) — turn failed checks into a concrete rewrite plan.
diff --git a/.cursor/skills/content-quality-auditor/references/item-reference.md b/.cursor/skills/content-quality-auditor/references/item-reference.md
new file mode 100644
index 0000000000..49bf51045a
--- /dev/null
+++ b/.cursor/skills/content-quality-auditor/references/item-reference.md
@@ -0,0 +1,99 @@
+# CORE-EEAT Item Reference
+
+Quick reference for all 80 CORE-EEAT audit items. Full scoring criteria in [core-eeat-benchmark.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md).
+
+## Complete Item Reference
+
+| ID | Item | ID | Item |
+|----|------|----|------|
+| C01 | Intent Alignment | Exp01 | First-Person Narrative |
+| C02 | Direct Answer | Exp02 | Sensory Details |
+| C03 | Query Coverage | Exp03 | Process Documentation |
+| C04 | Definition First | Exp04 | Tangible Proof |
+| C05 | Topic Scope | Exp05 | Usage Duration |
+| C06 | Audience Targeting | Exp06 | Problems Encountered |
+| C07 | Semantic Coherence | Exp07 | Before/After Comparison |
+| C08 | Use Case Mapping | Exp08 | Quantified Metrics |
+| C09 | FAQ Coverage | Exp09 | Repeated Testing |
+| C10 | Semantic Closure | Exp10 | Limitations Acknowledged |
+| O01 | Heading Hierarchy | Ept01 | Author Identity |
+| O02 | Summary Box | Ept02 | Credentials Display |
+| O03 | Data Tables | Ept03 | Professional Vocabulary |
+| O04 | List Formatting | Ept04 | Technical Depth |
+| O05 | Schema Markup | Ept05 | Methodology Rigor |
+| O06 | Section Chunking | Ept06 | Edge Case Awareness |
+| O07 | Visual Hierarchy | Ept07 | Historical Context |
+| O08 | Anchor Navigation | Ept08 | Reasoning Transparency |
+| O09 | Information Density | Ept09 | Cross-domain Integration |
+| O10 | Multimedia Structure | Ept10 | Editorial Process |
+| R01 | Data Precision | A01 | Backlink Profile |
+| R02 | Citation Density | A02 | Media Mentions |
+| R03 | Source Hierarchy | A03 | Industry Awards |
+| R04 | Evidence-Claim Mapping | A04 | Publishing Record |
+| R05 | Methodology Transparency | A05 | Brand Recognition |
+| R06 | Timestamp & Versioning | A06 | Social Proof |
+| R07 | Entity Precision | A07 | Knowledge Graph Presence |
+| R08 | Internal Link Graph | A08 | Entity Consistency |
+| R09 | HTML Semantics | A09 | Partnership Signals |
+| R10 | Content Consistency | A10 | Community Standing |
+| E01 | Original Data | T01 | Legal Compliance |
+| E02 | Novel Framework | T02 | Contact Transparency |
+| E03 | Primary Research | T03 | Security Standards |
+| E04 | Contrarian View | T04 | Disclosure Statements |
+| E05 | Proprietary Visuals | T05 | Editorial Policy |
+| E06 | Gap Filling | T06 | Correction & Update Policy |
+| E07 | Practical Tools | T07 | Ad Experience |
+| E08 | Depth Advantage | T08 | Risk Disclaimers |
+| E09 | Synthesis Value | T09 | Review Authenticity |
+| E10 | Forward Insights | T10 | Customer Support |
+
+**Note on site-level items**: Most Authority items (A01-A10) and several Trust items (T01-T03, T05, T07, T10) require site-level or organization-level data that may not be observable from a single page. When auditing a standalone page without site context, mark these as "N/A — requires site-level data" and exclude from the dimension average.
+
+## Example Audit Report
+
+**User**: "Audit this blog post against CORE-EEAT: [paste of 'Best Project Management Tools for Remote Teams 2025']"
+
+**Output** (partial — showing one dimension to demonstrate format):
+
+```markdown
+## CORE-EEAT Audit Report
+
+### Overview
+
+- **Content**: "Best Project Management Tools for Remote Teams 2025"
+- **Content Type**: Blog Post / Comparison
+- **Audit Date**: 2025-06-15
+- **Veto Status**: No triggers
+
+### C -- Contextual Clarity (scored dimension example)
+
+| ID  | Check Item         | Score   | Points | Notes                                                       |
+|-----|--------------------|---------|--------|-------------------------------------------------------------|
+| C01 | Intent Alignment   | Pass    | 10     | Matches "best X" comparison intent; title and body aligned  |
+| C02 | Direct Answer      | Partial | 5      | Answer appears in first 300 words but no summary box        |
+| C03 | Query Coverage     | Pass    | 10     | Covers "project management tools", "remote team software", "best PM tools" |
+| C04 | Definition First   | Pass    | 10     | Key terms ("PM tool", "async collaboration") defined on first use |
+| C05 | Topic Scope        | Partial | 5      | States what's covered but not what's excluded               |
+| C06 | Audience Targeting | Pass    | 10     | Explicitly targets "remote team leads and managers"         |
+| C07 | Semantic Coherence | Pass    | 10     | Logical flow: intro > criteria > tools > comparison > verdict |
+| C08 | Use Case Mapping   | Pass    | 10     | Decision matrix for team size, budget, and features         |
+| C09 | FAQ Coverage       | Fail    | 0      | No FAQ section despite long-tail potential ("free PM tools for small teams") |
+| C10 | Semantic Closure   | Partial | 5      | Conclusion present but doesn't loop back to opening promise |
+
+**C Dimension Score**: 75/100 (Good)
+**Blog Post weight for C**: 25%
+**Weighted contribution**: 18.75
+
+#### Priority Improvements from C Dimension
+
+1. **C09 FAQ Coverage** -- Add FAQ section with 3-5 long-tail questions
+   - Current: Fail (0) | Potential gain: 2.5 weighted points
+   - Action: Add FAQ with "Are there free PM tools for small remote teams?", "How to migrate between PM tools?", etc.
+
+2. **C02 Direct Answer** -- Add a summary box above the fold
+   - Current: Partial (5) | Potential gain: 1.25 weighted points
+   - Action: Insert a "Top 3 Picks" callout box in the first 150 words
+
+[... remaining 7 dimensions (O, R, E, Exp, Ept, A, T) follow the same per-item format ...]
+[... then: Dimension Scores table, Top 5 Priority Improvements, Action Plan, Recommended Next Steps ...]
+```
diff --git a/.cursor/skills/content-refresher/SKILL.md b/.cursor/skills/content-refresher/SKILL.md
new file mode 100644
index 0000000000..5c1967a909
--- /dev/null
+++ b/.cursor/skills/content-refresher/SKILL.md
@@ -0,0 +1,396 @@
+---
+name: content-refresher
+description: 'Refresh outdated posts with current stats, new sections, freshness signals to restore rankings. 内容更新/排名恢复'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when updating outdated content, refreshing old articles, improving declining pages, or adding new information to existing content."
+argument-hint: "<URL of outdated content>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - geo
+    - content-refresh
+    - content-update
+    - content-decay
+    - ranking-recovery
+    - evergreen-content
+    - content-lifecycle
+    - 内容更新
+    - コンテンツ更新
+    - 콘텐츠갱신
+    - actualizar-contenido
+  triggers:
+    # EN-formal
+    - "update old content"
+    - "refresh content"
+    - "content is outdated"
+    - "improve declining rankings"
+    - "revive old blog posts"
+    - "content decay"
+    - "content refresh strategy"
+    # EN-casual
+    - "traffic is dropping"
+    - "ranking dropped"
+    - "this post is outdated"
+    - "traffic is declining on this page"
+    - "rankings dropped for this article"
+    - "my old content needs updating"
+    # EN-question
+    - "how to fix declining traffic"
+    - "why is my content losing rankings"
+    - "how often should I update content"
+    # EN-competitor
+    - "Clearscope content refresh"
+    - "MarketMuse content update"
+    # ZH-pro
+    - "内容更新"
+    - "内容刷新"
+    - "排名恢复"
+    - "内容衰减"
+    - "内容生命周期"
+    # ZH-casual
+    - "排名下降了"
+    - "文章过时了"
+    - "流量掉了"
+    - "老文章怎么办"
+    # JA
+    - "コンテンツ更新"
+    - "コンテンツリフレッシュ"
+    - "記事更新"
+    # KO
+    - "콘텐츠 갱신"
+    - "콘텐츠 업데이트"
+    - "순위 하락"
+    - "순위 하락 원인"
+    - "오래된 글 어떻게 해?"
+    # ES
+    - "actualizar contenido"
+    - "refrescar contenido antiguo"
+    # PT
+    - "atualizar conteúdo"
+    # Misspellings
+    - "content refesh"
+    - "content refreshh"
+---
+
+# Content Refresher
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This optimization skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill helps identify and revitalize outdated content to reclaim lost rankings and traffic. It analyzes content freshness, identifies update opportunities, and guides the refresh process for maximum SEO and GEO impact.
+
+**System role**: Optimization layer skill. It turns weak pages, structures, and technical issues into prioritized repair work.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a diagnosis or repair plan that should feed directly into remediation work, not just a one-time opinion.
+
+- Content has lost rankings or traffic over time
+- Statistics and information are outdated
+- Competitors have published better content
+- Content needs updating for a new year
+- Industry changes require content updates
+- Adding new sections to existing content
+- Converting old content for GEO optimization
+
+## What This Skill Does
+
+1. **Freshness Analysis**: Identifies outdated content needing updates
+2. **Performance Tracking**: Finds content with declining traffic
+3. **Gap Identification**: Spots missing information competitors have
+4. **Update Prioritization**: Ranks content by refresh potential
+5. **Refresh Recommendations**: Provides specific update guidance
+6. **GEO Enhancement**: Updates content for AI citation potential
+7. **Republishing Strategy**: Advises on date and promotion tactics
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Identify Content to Refresh
+
+```
+Find content on [domain] that needs refreshing
+```
+
+```
+Which of my blog posts have lost the most traffic?
+```
+
+### Refresh Specific Content
+
+```
+Refresh this article for [current year]: [URL/content]
+```
+
+```
+Update this content to outrank [competitor URL]: [your URL]
+```
+
+### Content Refresh Strategy
+
+```
+Create a content refresh strategy for [domain/topic]
+```
+
+## Skill Contract
+
+**Expected output**: a scored diagnosis, prioritized repair plan, and a short handoff summary ready for `memory/audits/`.
+
+- **Reads**: the current page or site state, symptoms, prior audits, and current priorities from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing audit or optimization plan plus a reusable summary that can be stored under `memory/audits/`.
+- **Promotes**: blocking defects, repeated weaknesses, and fix priorities to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when the repair path is clear.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~analytics + ~~search console + ~~SEO tool connected:**
+Claude can automatically pull historical traffic trends from ~~analytics, fetch impression and ranking data from ~~search console, retrieve keyword position history from ~~SEO tool, and identify content with declining performance. This enables data-driven refresh prioritization.
+
+**With manual data only:**
+Ask the user to provide:
+1. Traffic data or screenshots showing performance trends
+2. Ranking screenshots or history for key pages
+3. Content publish dates and last update dates
+4. List of pages the user suspects need refreshing
+
+Proceed with the analysis using provided data. Note in the output which findings are from automated data vs. manual review.
+
+## Instructions
+
+When a user requests content refresh help:
+
+1. **CORE-EEAT Quick Score — Identify Weak Dimensions**
+
+   Before refreshing, run a quick CORE-EEAT assessment to focus effort on the weakest areas. Reference: [CORE-EEAT Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md)
+
+   ```markdown
+   ### CORE-EEAT Quick Assessment
+
+   **Content**: [title or URL]
+   **Content Type**: [type]
+
+   Rapidly score each dimension (estimate 0-100):
+
+   | Dimension | Quick Score | Key Weakness | Refresh Priority |
+   |-----------|-----------|--------------|-----------------|
+   | C — Contextual Clarity | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+   | O — Organization | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+   | R — Referenceability | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+   | E — Exclusivity | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+   | Exp — Experience | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+   | Ept — Expertise | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+   | A — Authority | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+   | T — Trust | [X]/100 | [main issue] | 🔴/🟡/🟢 |
+
+   **Weakest Dimensions** (focus refresh here):
+   1. [Dimension] — [what needs fixing]
+   2. [Dimension] — [what needs fixing]
+
+   **Refresh Strategy**: Focus on 🔴 dimensions first, then 🟡.
+
+   _For full 80-item audit, use [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md)_
+   ```
+
+2. **Identify Content Refresh Candidates**
+
+   ```markdown
+   ## Content Refresh Analysis
+   
+   ### Refresh Candidate Identification
+   
+   **Criteria for Content Refresh**:
+   - Published more than 6 months ago
+   - Contains dated information (years, statistics)
+   - Declining traffic trend
+   - Lost keyword rankings
+   - Outdated references or broken links
+   - Missing topics competitors now cover
+   - No GEO optimization
+   
+   ### Content Audit Results
+   
+   | Content | Published | Last Updated | Traffic Trend | Priority |
+   |---------|-----------|--------------|---------------|----------|
+   | [Title 1] | [date] | [date] | ↓ -45% | 🔴 High |
+   | [Title 2] | [date] | Never | ↓ -30% | 🔴 High |
+   | [Title 3] | [date] | [date] | ↓ -20% | 🟡 Medium |
+   | [Title 4] | [date] | [date] | → 0% | 🟡 Medium |
+   
+   ### Refresh Prioritization Matrix
+   
+   ```
+   High Traffic + High Decline = 🔴 Refresh Immediately
+   High Traffic + Low Decline = 🟡 Schedule Refresh
+   Low Traffic + High Decline = 🟡 Evaluate & Decide
+   Low Traffic + Low Decline = 🟢 Low Priority
+   ```
+   ```
+
+3. **Analyze Individual Content for Refresh**
+
+   ```markdown
+   ## Content Refresh Analysis: [Title]
+   
+   **URL**: [URL]
+   **Published**: [date]
+   **Last Updated**: [date]
+   **Word Count**: [X]
+   
+   ### Performance Metrics
+   
+   | Metric | 6 Mo Ago | Current | Change |
+   |--------|----------|---------|--------|
+   | Organic Traffic | [X]/mo | [X]/mo | [+/-X]% |
+   | Avg Position | [X] | [X] | [+/-X] |
+   | Impressions | [X] | [X] | [+/-X]% |
+   | CTR | [X]% | [X]% | [+/-X]% |
+   
+   ### Keywords Analysis
+   
+   | Keyword | Old Position | Current Position | Change |
+   |---------|--------------|------------------|--------|
+   | [kw 1] | [X] | [X] | ↓ [X] |
+   | [kw 2] | [X] | [X] | ↓ [X] |
+   | [kw 3] | [X] | [X] | ↓ [X] |
+   
+   ### Why This Content Needs Refresh
+   
+   1. **Outdated information**: [specific examples]
+   2. **Competitive gap**: [what competitors added]
+   3. **Missing topics**: [new subtopics to cover]
+   4. **SEO issues**: [current optimization problems]
+   5. **GEO potential**: [AI citation opportunities]
+   ```
+
+4. **Identify Specific Updates Needed**
+
+   ```markdown
+   ## Refresh Requirements
+   
+   ### Outdated Elements
+   
+   | Element | Current | Update Needed |
+   |---------|---------|---------------|
+   | Year references | "[old year]" | Update to [current year] |
+   | Statistics | "[old stat]" | Find current data |
+   | Tool mentions | "[old tool]" | Add newer tools |
+   | Links | [X] broken | Fix or replace |
+   | Screenshots | Outdated UI | Recapture |
+   
+   ### Missing Information
+   
+   **Topics competitors now cover that you don't**:
+   
+   | Topic | Competitor Coverage | Words Needed | Priority |
+   |-------|---------------------|--------------|----------|
+   | [Topic 1] | 3/5 competitors | ~300 words | High |
+   | [Topic 2] | 2/5 competitors | ~200 words | Medium |
+   | [Topic 3] | 4/5 competitors | ~400 words | High |
+   
+   ### SEO Updates Needed
+   
+   - [ ] Update title tag with current year
+   - [ ] Refresh meta description
+   - [ ] Add new H2 sections for [topics]
+   - [ ] Update internal links to newer content
+   - [ ] Add FAQ section for featured snippets
+   - [ ] Refresh images and add new alt text
+   
+   ### GEO Updates Needed
+   
+   - [ ] Add clear definition at start
+   - [ ] Include quotable statistics with sources
+   - [ ] Add Q&A formatted sections
+   - [ ] Update sources with current citations
+   - [ ] Create standalone factual statements
+   ```
+
+5. **Create Refresh Plan** — Structural changes, content additions, statistics/links/images to update
+
+   > **Reference**: See [references/refresh-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-templates.md) for the full refresh plan template (Step 5).
+
+6. **Write Refresh Content** — Updated introduction, new sections, refreshed statistics, new FAQ section
+
+   > **Reference**: See [references/refresh-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-templates.md) for the refresh content writing template (Step 6).
+
+7. **Optimize for GEO During Refresh** — Clear definitions, quotable statements, Q&A sections, updated citations
+
+   > **Reference**: See [references/refresh-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-templates.md) for the GEO enhancement template (Step 7).
+
+8. **Generate Republishing Strategy** — Date strategy (update/add "last updated"/keep original), technical implementation, promotion plan
+
+   > **Reference**: See [references/refresh-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-templates.md) for the republishing strategy template (Step 8).
+
+9. **Create Refresh Report** — Summary of changes, updates completed, expected outcomes, next review date
+
+   > **Reference**: See [references/refresh-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-templates.md) for the refresh report template (Step 9).
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Target content URL or title clearly identified
+- [ ] Historical performance data available (traffic trends, rankings)
+- [ ] Content publish/update dates known
+- [ ] If comparing to competitors, competitor URLs provided
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] Outdated elements identified with specific examples and replacement data
+- [ ] All suggested additions include word counts and section locations
+- [ ] Source of each data point clearly stated (~~analytics data, ~~search console, ~~SEO tool, user-provided, or estimated)
+
+## Example
+
+> **Reference**: See [references/refresh-example.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-example.md) for a full worked example (cloud hosting refresh) and the comprehensive content refresh checklist.
+
+## Tips for Success
+
+1. **Prioritize by ROI** - Refresh high-potential content first
+2. **Don't just add dates** - Make substantial improvements
+3. **Beat competitors** - Add what they have and more
+4. **Track results** - Monitor ranking changes post-refresh
+5. **Schedule regular audits** - Check content health quarterly
+6. **Optimize for GEO** - Every refresh is a GEO opportunity
+
+> **Reference data**: For content decay signal taxonomy, lifecycle stages, refresh vs. rewrite decision framework, and update strategy by content type, see [references/content-decay-signals.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/content-decay-signals.md).
+
+
+### Save Results
+
+After delivering audit or optimization findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/audits/content-refresher/YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+
+**Gate check recommended**: Run content-quality-auditor on refreshed content before republishing.
+
+## Reference Materials
+
+- [Content Decay Signals](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/content-decay-signals.md) — Decay indicators, lifecycle stages, and refresh triggers by content type
+- [Refresh Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-templates.md) — Detailed output templates for steps 5-9 (refresh plan, content writing, GEO enhancement, republishing, report)
+- [Refresh Example & Checklist](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/references/refresh-example.md) — Full worked example and pre/post-refresh checklist
+
+## Next Best Skill
+
+- **Primary**: [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md) — re-score the refreshed content before shipping.
diff --git a/.cursor/skills/content-refresher/references/content-decay-signals.md b/.cursor/skills/content-refresher/references/content-decay-signals.md
new file mode 100644
index 0000000000..9eb7dd3c55
--- /dev/null
+++ b/.cursor/skills/content-refresher/references/content-decay-signals.md
@@ -0,0 +1,404 @@
+# Content Decay Signals
+
+Comprehensive decay detection system with automated monitoring setup, severity scoring, refresh playbooks by content type, and ROI estimation for content refresh investments.
+
+## Decay Signal Detection System
+
+### Primary Signals (High Reliability)
+
+These signals directly indicate content performance decline and should trigger immediate investigation.
+
+#### 1. Organic Traffic Decline
+
+| Severity | Threshold | Detection Window | Action |
+|----------|-----------|-----------------|--------|
+| Watch | 10-20% decline | Month-over-month | Add to monitoring list |
+| Warning | 20-40% decline | Month-over-month | Schedule refresh within 2 weeks |
+| Critical | 40-60% decline | Month-over-month | Refresh this week |
+| Emergency | >60% decline | Month-over-month | Investigate immediately (may be technical issue) |
+
+**Detection method**: Compare current month's organic sessions to same month previous year (to account for seasonality) and to previous month (for trend detection).
+
+**False positive check**: Before attributing traffic decline to content decay, rule out:
+- Seasonal variations (compare year-over-year, not just month-over-month)
+- Algorithm updates (check if decline coincides with known Google updates)
+- Technical issues (crawl errors, indexation problems, site speed regression)
+- Tracking code changes (analytics misconfiguration)
+
+#### 2. Ranking Position Drops
+
+| Severity | Threshold | Detection Window | Action |
+|----------|-----------|-----------------|--------|
+| Watch | 1-3 positions lost | 2-week average | Monitor |
+| Warning | 3-5 positions lost | 2-week average | Investigate cause |
+| Critical | 5-10 positions lost | 2-week average | Schedule immediate refresh |
+| Emergency | Dropped off page 1 to page 3+ | Any timeframe | Priority refresh or rewrite |
+
+**Detection method**: Track primary keyword positions weekly. Use 2-week rolling averages to smooth daily fluctuations.
+
+#### 3. Click-Through Rate Decline
+
+| Severity | Threshold | Context | Action |
+|----------|-----------|---------|--------|
+| Watch | CTR below expected for position | Position stable, CTR dropping | Review title and meta description |
+| Warning | CTR dropped 20%+ vs. baseline | With stable impressions | Rewrite title tag and meta description |
+| Critical | CTR dropped 40%+ vs. baseline | May indicate stale SERP appearance | Full refresh of title, description, and structured data |
+
+**Expected CTR by position** (organic, desktop, approximate):
+
+| Position | Expected CTR Range | Below This = Investigate |
+|----------|--------------------|-------------------------|
+| 1 | 25-35% | <20% |
+| 2 | 12-18% | <10% |
+| 3 | 8-12% | <6% |
+| 4-5 | 5-8% | <4% |
+| 6-10 | 2-5% | <2% |
+
+---
+
+### Secondary Signals (Moderate Reliability)
+
+These signals suggest potential decay but may have other causes. Use them to corroborate primary signals.
+
+#### 4. Engagement Metric Decline
+
+| Metric | Decay Indicator | Possible Cause |
+|--------|----------------|---------------|
+| Bounce rate increase >15% | Content no longer satisfies intent | Outdated information, better competitor content |
+| Time on page decrease >20% | Users leaving faster | Content not comprehensive enough |
+| Scroll depth decrease | Users not reading full content | Front-loading outdated info, losing interest |
+| Pages per session decrease | Users not exploring further | Poor internal linking, irrelevant content |
+
+#### 5. Content Freshness Indicators
+
+| Indicator | Decay Risk | Detection |
+|-----------|-----------|-----------|
+| Published >12 months ago, never updated | High | CMS date audit |
+| Contains year references 2+ years old | High | Text search for year patterns |
+| Statistics from 3+ years ago | Medium | Manual review or text search for "20XX" |
+| Broken external links (>10% of total) | Medium | Monthly crawl report |
+| Screenshots of outdated UI | Medium | Manual visual review |
+| References to discontinued products/tools | High | Manual review |
+
+#### 6. Competitive Displacement Signals
+
+| Signal | Detection Method | Severity |
+|--------|-----------------|----------|
+| New competitor content ranking above you | SERP monitoring | High |
+| Competitor content is longer and more comprehensive | Manual comparison | Medium |
+| Competitor has more recent publication date displayed in SERP | SERP monitoring | Medium |
+| Featured snippet lost to competitor | SERP monitoring | High |
+| AI overview now answers query without click | SERP monitoring | High |
+
+---
+
+### Tertiary Signals (Low Reliability, Supporting Evidence)
+
+These signals alone do not indicate decay but strengthen the case when combined with primary or secondary signals.
+
+| Signal | What It Suggests |
+|--------|-----------------|
+| Fewer social shares over time | Content less share-worthy (may be stale) |
+| Decrease in backlink acquisition | Content no longer being cited as a resource |
+| Fewer comments or engagement | Community interest waning |
+| Content not appearing in AI responses | Not structured for GEO or information is outdated |
+
+---
+
+## Automated Monitoring Setup
+
+### Monitoring Dashboard Configuration
+
+Set up these automated checks to catch decay early.
+
+#### Weekly Checks
+
+| Check | Data Source | Alert Threshold |
+|-------|-----------|----------------|
+| Keyword position changes | Rank tracker | Any target keyword drops >3 positions |
+| Crawl errors on key pages | Search Console | Any new crawl error on monitored pages |
+| Index coverage changes | Search Console | Any page drops from index |
+
+#### Monthly Checks
+
+| Check | Data Source | Alert Threshold |
+|-------|-----------|----------------|
+| Traffic comparison (MoM) | Analytics | >15% decline on any monitored page |
+| CTR comparison | Search Console | >20% CTR decline for any target keyword |
+| Broken link scan | Crawler | Any new broken links on monitored pages |
+| Competitor SERP changes | SERP tracker | New competitor enters top 5 |
+
+#### Quarterly Checks
+
+| Check | Data Source | Process |
+|-------|-----------|---------|
+| Content freshness audit | CMS + manual | Review all content older than 6 months |
+| Statistics accuracy check | Manual | Verify top 20 pages have current data |
+| Engagement trend review | Analytics | Compare engagement metrics across quarters |
+| Full competitive content gap | SEO tool | Identify new competitor content opportunities |
+
+### Alert Priority Matrix
+
+When multiple signals fire simultaneously, use this matrix to determine response urgency.
+
+| Primary Signal + Secondary Signal | Priority | Response |
+|----------------------------------|----------|----------|
+| Traffic decline + Position drop | P1 (Critical) | Refresh within 48 hours |
+| Traffic decline + CTR decline | P1 (Critical) | Rewrite title/description immediately, schedule content refresh |
+| Position drop + Competitor displacement | P2 (High) | Refresh within 1 week |
+| Traffic decline + Engagement decline | P2 (High) | Refresh within 1 week |
+| CTR decline only | P3 (Medium) | Rewrite title and meta description this week |
+| Freshness indicators only | P3 (Medium) | Schedule refresh within 2 weeks |
+| Engagement decline only | P4 (Low) | Investigate and schedule if confirmed |
+
+---
+
+## Decay Severity Scoring
+
+### Composite Decay Score
+
+Calculate a 0-100 decay severity score by summing weighted signal scores.
+
+| Signal Category | Weight | Score Range |
+|----------------|--------|-------------|
+| Traffic decline | 30% | 0 = no decline, 100 = >60% decline |
+| Position drops | 25% | 0 = stable, 100 = dropped off page 1 |
+| CTR decline | 15% | 0 = stable, 100 = >40% decline |
+| Content freshness | 15% | 0 = updated this quarter, 100 = >2 years stale |
+| Competitive displacement | 15% | 0 = no new competitors, 100 = displaced from top 3 |
+
+### Score Interpretation
+
+| Composite Score | Decay Stage | Action |
+|----------------|-------------|--------|
+| 0-20 | Healthy | Continue monitoring |
+| 21-40 | Early decay | Add to refresh queue (next month) |
+| 41-60 | Active decay | Schedule refresh (this week) |
+| 61-80 | Significant decay | Immediate refresh or rewrite decision |
+| 81-100 | Terminal decay | Rewrite, redirect, or retire |
+
+---
+
+## Refresh Playbooks by Content Type
+
+### Blog Post / Article Refresh Playbook
+
+| Step | Action | Time Estimate |
+|------|--------|--------------|
+| 1 | Update title with current year or hook | 10 min |
+| 2 | Rewrite introduction with fresh angle | 20 min |
+| 3 | Update all statistics with current sources | 30-60 min |
+| 4 | Add 1-2 new sections covering gaps | 60-90 min |
+| 5 | Update screenshots and images | 30 min |
+| 6 | Add or update FAQ section | 20 min |
+| 7 | Refresh internal links | 15 min |
+| 8 | Update meta description | 5 min |
+| 9 | Add/update schema markup | 10 min |
+| 10 | Update dateModified and republish | 5 min |
+| **Total** | | **3-4 hours** |
+
+### Product/Service Page Refresh Playbook
+
+| Step | Action | Time Estimate |
+|------|--------|--------------|
+| 1 | Update pricing, features, specifications | 30 min |
+| 2 | Add new customer testimonials/reviews | 20 min |
+| 3 | Update product images | 30 min |
+| 4 | Refresh comparison tables | 20 min |
+| 5 | Update internal links to related products | 15 min |
+| 6 | Verify and update schema markup | 10 min |
+| **Total** | | **2-2.5 hours** |
+
+### Statistics/Data Roundup Refresh Playbook
+
+| Step | Action | Time Estimate |
+|------|--------|--------------|
+| 1 | Verify every statistic is still current | 60-90 min |
+| 2 | Replace outdated stats with current data | 60 min |
+| 3 | Add new statistics from recent studies | 30 min |
+| 4 | Update source links and citations | 30 min |
+| 5 | Update year references throughout | 15 min |
+| 6 | Add new visualization if data changed significantly | 30 min |
+| 7 | Update title, meta description with year | 10 min |
+| **Total** | | **4-5 hours** |
+
+### How-To Guide Refresh Playbook
+
+| Step | Action | Time Estimate |
+|------|--------|--------------|
+| 1 | Verify all steps are still accurate | 30 min |
+| 2 | Update screenshots for UI changes | 60 min |
+| 3 | Add new methods or alternative approaches | 30 min |
+| 4 | Update tool recommendations | 15 min |
+| 5 | Add troubleshooting section if missing | 20 min |
+| 6 | Update FAQ with new common questions | 15 min |
+| 7 | Test all links and embedded resources | 15 min |
+| **Total** | | **3-3.5 hours** |
+
+---
+
+## ROI Estimation for Content Refresh
+
+### Cost-Benefit Framework
+
+| Factor | Measurement |
+|--------|------------|
+| **Cost of refresh** | Writer hours x hourly rate + tool costs |
+| **Current monthly traffic value** | Organic sessions x conversion rate x avg order value |
+| **Projected traffic recovery** | Based on decay stage and content potential |
+| **Time to recover** | Typically 4-8 weeks for rankings to respond |
+
+### Traffic Recovery Benchmarks
+
+Based on industry data for content refreshes (not rewrites):
+
+| Decay Stage at Refresh | Typical Traffic Recovery | Recovery Timeline |
+|------------------------|------------------------|-------------------|
+| Early decay | 90-110% of peak (often exceeds) | 2-4 weeks |
+| Active decay | 70-90% of peak | 4-8 weeks |
+| Significant decay | 40-70% of peak | 6-12 weeks |
+| Terminal decay | 10-40% of peak (rewrite may be better) | 8-16 weeks |
+
+### ROI Calculation Template
+
+```
+Refresh Cost:
+  Writer time: [X hours] x [$Y/hour] = $[Z]
+  Tool costs: $[A] (one-time crawl, research tools)
+  Total cost: $[Z + A]
+
+Monthly Traffic Value (before decay):
+  Peak monthly organic sessions: [N]
+  Conversion rate: [X]%
+  Average conversion value: $[Y]
+  Peak monthly value: [N] x [X]% x $[Y] = $[V]
+
+Expected Recovery:
+  Projected recovery: [%] of peak = $[V x %] per month
+  Current monthly value: $[current]
+  Monthly value increase: $[V x % - current]
+
+ROI:
+  Payback period: $[total cost] / $[monthly value increase] = [months]
+  12-month ROI: ($[monthly value increase] x 12 - $[total cost]) / $[total cost] x 100 = [X]%
+```
+
+### Refresh Priority Scoring
+
+When choosing which content to refresh first, score each candidate:
+
+| Factor | Weight | Score (1-10) |
+|--------|--------|-------------|
+| Current traffic value | 25% | Higher traffic = higher score |
+| Decay severity | 20% | More decay = more urgency |
+| Competitive opportunity | 20% | Weaker competition = higher score |
+| Refresh difficulty | 15% | Easier refresh = higher score |
+| Strategic importance | 10% | Aligns with business goals = higher score |
+| Backlink equity | 10% | More backlinks = more worth preserving |
+
+**Priority formula**: Weighted score total. Refresh highest-scoring content first.
+
+---
+
+## Content Retirement Decision
+
+Not all decaying content should be refreshed. Use this checklist to decide when to retire content instead.
+
+### Retire When
+
+- [ ] Content targets a keyword with zero search volume
+- [ ] Topic is no longer relevant to your business
+- [ ] No backlinks worth preserving
+- [ ] Content never ranked well even when fresh
+- [ ] Cost to refresh exceeds projected 12-month value recovery
+- [ ] Content cannibalizes a better-performing page on the same topic
+
+### Retirement Options
+
+| Option | When to Use | Implementation |
+|--------|------------|---------------|
+| 301 redirect | Content has backlinks or residual traffic | Redirect to best related page |
+| Consolidate | Multiple weak pages on same topic | Merge into one strong page, redirect others |
+| Noindex | Page has internal utility but should not rank | Add noindex, keep page accessible |
+| Delete (410) | Content has no value, no links, no traffic | Return 410 Gone status |
+
+### Post-Retirement Monitoring
+
+After retiring content, monitor for 4 weeks:
+- Verify redirects are working (no 404 errors)
+- Check that target pages are receiving redirected traffic
+- Monitor rankings of consolidated/target pages
+- Ensure no orphan pages were created by removing internal links
+
+---
+
+## Content Decay Signal Taxonomy
+
+### Decay Indicators
+
+| Signal | Source | Severity | Detection Method |
+|--------|--------|----------|-----------------|
+| Traffic decline >20% MoM | Analytics | High | Monthly traffic comparison |
+| Position drop >5 positions | Rank tracker | High | Weekly rank monitoring |
+| Outdated statistics/dates | Manual review | Medium | Annual content audit |
+| Broken external links | Crawler | Medium | Monthly crawl reports |
+| Decreased CTR | Search Console | Medium | Quarterly CTR analysis |
+| Competitor new content | SERP monitoring | Medium | Monthly SERP checks |
+| User engagement drop | Analytics | Low | Quarterly engagement review |
+| Index coverage issues | Search Console | High | Weekly coverage monitoring |
+
+### Content Decay Stages
+
+| Stage | Symptoms | Urgency | Recommended Action |
+|-------|---------|---------|-------------------|
+| **Early decay** | Slight traffic/position dip | Low | Monitor for 2-4 weeks |
+| **Active decay** | Consistent decline across 2+ months | Medium | Schedule refresh within 2 weeks |
+| **Significant decay** | 50%+ traffic loss, page 2+ | High | Immediate refresh or rewrite |
+| **Terminal decay** | No organic traffic, deindexed | Critical | Rewrite, redirect, or retire |
+
+## Refresh vs. Rewrite Decision Framework
+
+| Factor | Refresh (Update) | Rewrite (New version) |
+|--------|-----------------|---------------------|
+| Content quality | Good foundation, needs updating | Fundamentally flawed or outdated approach |
+| Position | Was ranking well, now dropping | Never ranked well despite optimization |
+| URL age | 1+ years, has earned backlinks | Young URL with no backlink equity |
+| Backlinks | Has external links pointing to it | No backlinks worth preserving |
+| Scope of changes needed | <50% of content changing | >50% needs rewriting |
+| Search intent | Intent hasn't changed | Search intent has evolved |
+
+**Decision rule:** If the URL has backlinks and was ranking, REFRESH. If not, consider REWRITE at a new URL (with 301 redirect if old URL has any equity).
+
+## Content Lifecycle Model
+
+```
+CREATE → PROMOTE → MAINTAIN → REFRESH → [REFRESH again] or RETIRE
+  │         │          │          │                          │
+  │      Month 1    Month 2-6   Month 6-12              When terminal
+  │    Social,      Monitor     Update facts,            301 redirect
+  │    outreach,    rankings,   add new sections,         to related
+  │    email        fix issues  improve depth              content
+```
+
+### Lifecycle Actions by Phase
+
+| Phase | Duration | Key Actions | Metrics to Track |
+|-------|----------|------------|-----------------|
+| Create | Week 1 | Publish, submit to Search Console | Indexation |
+| Promote | Month 1 | Social shares, email, outreach | Referral traffic, backlinks |
+| Maintain | Months 2-6 | Monitor, fix broken links, respond to comments | Rankings, traffic trend |
+| Refresh | Months 6-12+ | Update data, add sections, improve structure | Traffic recovery, new keywords |
+| Retire | When terminal | 301 redirect to best alternative | Redirect traffic recovery |
+
+## Update Strategy by Content Type
+
+| Content Type | Refresh Frequency | Key Updates | Shelf Life |
+|-------------|-------------------|------------|-----------|
+| Statistics roundups | Every 6 months | Replace old stats, add new sources | 6-12 months |
+| Tool comparisons | Every 3-6 months | Update pricing, features, screenshots | 3-6 months |
+| How-to guides | Annually | Update steps, screenshots, links | 12-18 months |
+| Evergreen guides | Every 12-18 months | Add new sections, update examples | 18-24 months |
+| News/trend content | Don't refresh | Archive or redirect | 1-3 months |
+| Case studies | Rarely | Update results if available | 2-3 years |
+| Glossary/definitions | As needed | Update when definitions evolve | 2-5 years |
+
diff --git a/.cursor/skills/content-refresher/references/refresh-example.md b/.cursor/skills/content-refresher/references/refresh-example.md
new file mode 100644
index 0000000000..40dcc1887d
--- /dev/null
+++ b/.cursor/skills/content-refresher/references/refresh-example.md
@@ -0,0 +1,143 @@
+# Content Refresher — Worked Example & Checklist
+
+Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/SKILL.md).
+
+---
+
+## Worked Example
+
+**User**: "Refresh my blog post about 'best cloud hosting providers'"
+
+**Output**:
+
+```markdown
+### CORE-EEAT Quick Assessment
+
+**Content**: "Best Cloud Hosting Providers 2023"
+**Content Type**: Commercial comparison / listicle
+
+| Dimension | Quick Score | Key Weakness | Refresh Priority |
+|-----------|-----------|--------------|-----------------|
+| C — Contextual Clarity | 60/100 | Title says 2023 — stale intent signal | 🔴 |
+| O — Organization | 75/100 | Good comparison tables but no summary box | 🟡 |
+| R — Referenceability | 35/100 | Pricing from Q1 2023, 3 broken links | 🔴 |
+| E — Exclusivity | 50/100 | No original benchmarks or test data | 🟡 |
+| Exp — Experience | 30/100 | No first-person testing narrative | 🔴 |
+| Ept — Expertise | 65/100 | Author bio present but lacks credentials | 🟡 |
+| A — Authority | 55/100 | 12 backlinks, was ranking page 1 | 🟢 |
+| T — Trust | 60/100 | Affiliate links present but not disclosed | 🔴 |
+
+**Weakest Dimensions** (focus refresh here):
+1. **Experience** — Add hands-on testing results ("We migrated a test site to each provider")
+2. **Referenceability** — Replace all 2023 pricing/uptime data with current figures
+
+## Content Refresh Analysis: Best Cloud Hosting Providers 2023
+
+**URL**: cloudhosting.com/best-cloud-hosting
+**Published**: 2023-02-14
+**Last Updated**: Never
+**Word Count**: 2,100
+
+### Performance Metrics
+
+| Metric | 6 Mo Ago | Current | Change |
+|--------|----------|---------|--------|
+| Organic Traffic | 3,200/mo | 1,400/mo | -56% |
+| Avg Position | 4.2 | 14.8 | ↓ 10.6 |
+| Impressions | 18,000 | 9,500 | -47% |
+| CTR | 6.1% | 2.3% | -3.8% |
+
+### Content Decay Signals Identified
+
+1. **Outdated year in title and H1** — "2023" signals stale content to users and search engines
+2. **Pricing data 18+ months old** — AWS Lightsail listed at $3.50/mo (now $5/mo), DigitalOcean at $4/mo (now $6/mo)
+3. **Missing new entrants** — No mention of Hetzner Cloud or Vultr, which 4/5 top competitors now cover
+4. **3 broken outbound links** — Provider comparison pages that have moved or been retired
+
+### Refresh vs. Rewrite Decision
+
+| Factor | Assessment |
+|--------|-----------|
+| Content quality | Good structure, solid comparison tables — foundation is sound |
+| URL equity | 12 referring domains, 18 months old |
+| Scope of changes | ~40% of content needs updating |
+| Search intent | Unchanged — still commercial comparison |
+
+**Decision**: **REFRESH** — The URL has earned backlinks, the structure is solid, and less than 50% needs rewriting. Keep the URL, update in place.
+
+## Content Refresh Plan
+
+**Current Title**: "Best Cloud Hosting Providers 2023"
+**Refreshed Title**: "Best Cloud Hosting Providers 2024: 7 Platforms Tested & Compared"
+
+### Specific Refresh Actions
+
+1. **Update all pricing and specs** (~30 min)
+   - Replace 2023 pricing for all 5 listed providers with current data
+   - Add uptime stats from the last 12 months (source: UptimeRobot public status pages)
+   - Update feature comparison table with current plan tiers
+
+2. **Add 2 missing providers + testing narrative** (~600 words)
+   - Add Hetzner Cloud and Vultr sections with same comparison format
+   - Write intro paragraph: "We deployed a WordPress benchmark site to each provider and measured TTFB, uptime, and support response times over 30 days"
+
+3. **Add affiliate disclosure and FAQ section** (~200 words)
+   - Add disclosure statement below introduction: "This post contains affiliate links. See our editorial policy."
+   - Add FAQ with 4 questions targeting People Also Ask (e.g., "What is the cheapest cloud hosting?", "Is cloud hosting faster than shared hosting?")
+   - Implement FAQ schema markup for rich result eligibility
+
+4. **Fix broken links and update internal links** (~15 min)
+   - Replace 3 broken outbound links with current provider URLs
+   - Add internal links to cloudhosting.com/vps-vs-cloud and cloudhosting.com/hosting-speed-test
+
+### Republishing Strategy
+
+**Recommendation**: Update Published Date — this is a major overhaul (40%+ new content, new providers, fresh test data). Update `dateModified` in Article schema, resubmit URL in Search Console, and share on social as "Updated for 2024."
+
+### Expected Outcomes
+
+| Metric | Current | 30-Day Target | 90-Day Target |
+|--------|---------|---------------|---------------|
+| Avg Position | 14.8 | 8-10 | 3-6 |
+| Organic Traffic | 1,400/mo | 2,200/mo | 3,500/mo |
+| Featured Snippets | 0 | 1 (FAQ) | 2+ |
+```
+
+---
+
+## Content Refresh Checklist
+
+```markdown
+### Pre-Refresh
+- [ ] Analyze current performance metrics
+- [ ] Identify outdated information
+- [ ] Research competitor updates
+- [ ] Note missing topics
+
+### Content Updates
+- [ ] Update year references
+- [ ] Refresh statistics with sources
+- [ ] Add new examples and case studies
+- [ ] Expand thin sections
+- [ ] Add new relevant sections
+- [ ] Create FAQ section
+
+### SEO Updates
+- [ ] Update title tag
+- [ ] Refresh meta description
+- [ ] Optimize headers
+- [ ] Update internal links
+- [ ] Add new images with alt text
+
+### GEO Updates
+- [ ] Add clear definition
+- [ ] Include quotable statements
+- [ ] Add Q&A formatted content
+- [ ] Update source citations
+
+### Technical
+- [ ] Update schema dateModified
+- [ ] Clear page cache
+- [ ] Update sitemap
+- [ ] Test page speed
+```
diff --git a/.cursor/skills/content-refresher/references/refresh-templates.md b/.cursor/skills/content-refresher/references/refresh-templates.md
new file mode 100644
index 0000000000..8b9e52a826
--- /dev/null
+++ b/.cursor/skills/content-refresher/references/refresh-templates.md
@@ -0,0 +1,253 @@
+# Content Refresh Templates
+
+Detailed output templates for content-refresher steps 5-9. Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/SKILL.md).
+
+---
+
+## Step 5: Create Refresh Plan
+
+```markdown
+## Content Refresh Plan
+
+### Title/URL
+**Current**: [current title]
+**Refreshed**: [updated title with year/hook]
+
+### Structural Changes
+
+**Keep As-Is**:
+- [Section 1] - Still relevant and accurate
+- [Section 2] - Still relevant and accurate
+
+**Update/Expand**:
+- [Section 3] - Update statistics, add [X] words
+- [Section 4] - Add new examples from [current year]
+
+**Add New Sections**:
+- [New Section 1] - [description, ~X words]
+- [New Section 2] - [description, ~X words]
+- FAQ Section - [X questions for featured snippets]
+
+**Remove/Consolidate**:
+- [Section 5] - Outdated, remove or redirect topic
+
+### Content Additions
+
+**New Word Count Target**: [X] words (+[Y] from current)
+
+| Section | Current | After Refresh | Notes |
+|---------|---------|---------------|-------|
+| Introduction | [X] | [X] | Add hook, update context |
+| [Section 1] | [X] | [X] | Keep |
+| [Section 2] | [X] | [X] | Update stats |
+| [New Section] | 0 | [X] | Add entirely |
+| FAQ | 0 | [X] | Add for GEO |
+| Conclusion | [X] | [X] | Update CTA |
+
+### Specific Updates
+
+**Statistics to Update**:
+
+| Old Statistic | New Statistic | Source |
+|---------------|---------------|--------|
+| "[old stat]" | "[find current]" | [source] |
+| "[old stat]" | "[find current]" | [source] |
+
+**Links to Update**:
+
+| Anchor Text | Old URL | New URL | Reason |
+|-------------|---------|---------|--------|
+| "[anchor]" | [old] | [new] | Broken |
+| "[anchor]" | [old] | [new] | Better resource |
+
+**Images to Update**:
+
+| Image | Action | New Alt Text |
+|-------|--------|--------------|
+| [img 1] | Replace | "[keyword-rich alt]" |
+| [img 2] | Keep | Update alt text |
+```
+
+---
+
+## Step 6: Write Refresh Content
+
+```markdown
+## Refreshed Content Sections
+
+### Updated Introduction
+
+[Write new introduction with:]
+- Updated hook for current year
+- Fresh statistics
+- Clear value proposition
+- Primary keyword in first 100 words
+
+### New Section: [Title]
+
+[Write new section covering:]
+- [Topic competitors now cover]
+- Current information and examples
+- GEO-optimized with quotable statements
+
+### Updated Statistics Section
+
+**Replace**:
+> "[Old statement with outdated stat]"
+
+**With**:
+> "[New statement with current stat] (Source, [current year])"
+
+### New FAQ Section
+
+## Frequently Asked Questions
+
+### [Question matching PAA/common query]?
+
+[Direct answer in 40-60 words, optimized for featured snippets]
+
+### [Question 2]?
+
+[Direct answer]
+
+### [Question 3]?
+
+[Direct answer]
+```
+
+---
+
+## Step 7: Optimize for GEO During Refresh
+
+```markdown
+## GEO Enhancement Opportunities
+
+### Add Clear Definitions
+
+**Add at start of article**:
+> **[Topic]** is [clear, quotable definition in 40-60 words that
+> AI systems can cite directly].
+
+### Add Quotable Statements
+
+**Transform**:
+> "Email marketing is effective for businesses."
+
+**Into**:
+> "Email marketing delivers an average ROI of $42 for every $1
+> invested, making it the highest-ROI digital marketing channel
+> according to the Data & Marketing Association ([current year])."
+
+### Add Q&A Sections
+
+Structure content with questions AI might answer:
+- What is [topic]?
+- How does [topic] work?
+- Why is [topic] important?
+- What are the benefits of [topic]?
+
+### Update Citations
+
+- Add sources for all statistics
+- Link to authoritative references
+- Include publication dates
+- Use recent sources (last 2 years)
+```
+
+---
+
+## Step 8: Generate Republishing Strategy
+
+```markdown
+## Republishing Strategy
+
+### Date Strategy
+
+**Options**:
+
+1. **Update Published Date**
+   - Use when: Major overhaul (50%+ new content)
+   - Pros: Signals freshness to Google
+   - Cons: Loses "original" authority
+
+2. **Add "Last Updated" Date**
+   - Use when: Moderate updates (20-50% new)
+   - Pros: Shows both original and fresh
+   - Cons: Original date visible
+
+3. **Keep Original Date**
+   - Use when: Minor updates (<20% new)
+   - Pros: Maintains authority
+   - Cons: Doesn't signal update
+
+**Recommendation**: [Option X] because [reason]
+
+### Technical Implementation
+
+- [ ] Update `dateModified` in schema
+- [ ] Update sitemap lastmod
+- [ ] Clear cache after publishing
+- [ ] Resubmit to ~~search console
+
+### Promotion Strategy
+
+**Immediately after refresh**:
+- [ ] Share on social media as "updated for [current year]"
+- [ ] Send to email list if significant update
+- [ ] Update internal links with fresh anchors
+- [ ] Reach out for new backlinks
+
+**Track Results**:
+- [ ] Monitor rankings for 4-6 weeks
+- [ ] Track traffic changes
+- [ ] Watch for featured snippet capture
+- [ ] Check AI citation improvements
+```
+
+---
+
+## Step 9: Create Refresh Report
+
+```markdown
+# Content Refresh Report
+
+## Summary
+
+**Content**: [Title]
+**Refresh Date**: [Date]
+**Refresh Level**: [Major/Moderate/Minor]
+
+## Changes Made
+
+| Element | Before | After |
+|---------|--------|-------|
+| Word count | [X] | [Y] (+[Z]%) |
+| Sections | [X] | [Y] |
+| Statistics | [X] outdated | [Y] current |
+| Internal links | [X] | [Y] |
+| Images | [X] | [Y] |
+| FAQ questions | 0 | [X] |
+
+## Updates Completed
+
+- [x] Updated title with current year
+- [x] Refreshed meta description
+- [x] Added [X] new sections
+- [x] Updated [X] statistics with sources
+- [x] Fixed [X] broken links
+- [x] Added FAQ section for GEO
+- [x] Implemented FAQ schema
+- [x] Updated images and alt text
+
+## Expected Outcomes
+
+| Metric | Current | 30-Day Target | 90-Day Target |
+|--------|---------|---------------|---------------|
+| Avg Position | [X] | [Y] | [Z] |
+| Organic Traffic | [X]/mo | [Y]/mo | [Z]/mo |
+| Featured Snippets | 0 | 1+ | 2+ |
+
+## Next Review
+
+Schedule next refresh review: [Date - 6 months from now]
+```
diff --git a/.cursor/skills/domain-authority-auditor/SKILL.md b/.cursor/skills/domain-authority-auditor/SKILL.md
new file mode 100644
index 0000000000..1840cfd38e
--- /dev/null
+++ b/.cursor/skills/domain-authority-auditor/SKILL.md
@@ -0,0 +1,394 @@
+---
+name: domain-authority-auditor
+description: '40-item CITE domain audit: citation, impact, trust, entity scoring with veto checks. 域名权威/网站可信度'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when auditing domain trust and authority. Runs CITE 40-item scoring with veto checks. Also when the user asks about domain credibility or citation trustworthiness."
+argument-hint: "<domain>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - geo
+    - domain-authority
+    - domain-rating
+    - domain-trust
+    - cite-framework
+    - site-authority
+    - 域名权威
+    - ドメイン権威
+    - 도메인권위
+    - autoridad-dominio
+  triggers:
+    # EN-formal
+    - "audit domain authority"
+    - "CITE audit"
+    - "domain trust score"
+    - "domain credibility check"
+    - "domain rating"
+    - "site authority"
+    # EN-casual
+    - "how trustworthy is my site"
+    - "is my domain credible"
+    - "is my domain trustworthy"
+    - "domain credibility score"
+    - "Google penalty recovery"
+    - "my site got penalized"
+    # EN-question
+    - "how authoritative is my site"
+    - "what is my domain authority"
+    # ZH-pro
+    - "域名权威审计"
+    - "网站可信度"
+    - "域名评分"
+    # ZH-casual
+    - "域名可信吗"
+    - "权威度多少"
+    - "网站可信度怎么样"
+    # JA
+    - "ドメイン権威"
+    - "ドメイン評価"
+    # KO
+    - "도메인 권위"
+    - "도메인 신뢰도"
+    # ES
+    - "autoridad de dominio"
+    - "auditoría de dominio"
+    # PT
+    - "autoridade de domínio"
+    # Misspellings
+    - "domain autority"
+---
+
+# Domain Authority Auditor
+
+> Based on [CITE Domain Rating](https://github.com/aaron-he-zhu/cite-domain-rating). Full benchmark reference: [references/cite-domain-rating.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/cite-domain-rating.md)
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This cross-cutting skill is part of the protocol layer and follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill evaluates domain authority across 40 standardized criteria organized in 4 dimensions. It produces a comprehensive audit report with per-item scoring, dimension and weighted scores by domain type, veto item checks, and a prioritized action plan.
+
+**Sister skill**: [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md) evaluates content at the page level (80 items). This skill evaluates the domain behind the content (40 items). Together they provide a complete 120-item assessment.
+
+> **Namespace note**: CITE uses C01-C10 for Citation items; CORE-EEAT uses C01-C10 for Contextual Clarity items. In combined 120-item assessments, prefix with the framework name (e.g., CITE-C01 vs CORE-C01) to avoid confusion.
+
+**System role**: Citation Trust Gate. It decides whether a domain is credible enough to support ranking, citation, and brand authority work.
+
+## When This Must Trigger
+
+Use this when domain credibility or citation trustworthiness is in question — even if the user doesn't use audit terminology:
+
+- User asks "how trustworthy is my site" or "is my domain credible"
+- When backlink-analyzer finds toxic link ratio above 15%, its handoff summary recommends this gate check
+- Evaluating domain authority before a GEO campaign
+- Benchmarking your domain against competitors
+- Assessing whether a domain is trustworthy as a citation source
+- Running periodic domain health checks or after link building campaigns
+- Identifying manipulation red flags (PBNs, link farms, penalty history)
+- Cross-referencing with content-quality-auditor for full 120-item assessment
+
+## What This Skill Does
+
+1. **Full 40-Item Audit**: Scores every CITE check item as Pass/Partial/Fail
+2. **Dimension Scoring**: Calculates scores for all 4 dimensions (0-100 each)
+3. **Weighted Totals**: Applies domain-type-specific weights for CITE Score
+4. **Veto Detection**: Flags critical manipulation signals (T03, T05, T09)
+5. **Priority Ranking**: Identifies Top 5 improvements sorted by impact
+6. **Action Plan**: Generates specific, actionable improvement steps
+7. **Cross-Reference**: Optionally pairs with CORE-EEAT for combined diagnosis
+
+## Quick Start
+
+Start with one of these prompts. Finish with a citation-trust verdict and a handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Audit Your Domain
+
+```
+Audit domain authority for [domain]
+Run a CITE domain audit on [domain] as a [domain type]
+```
+
+### Audit with Domain Type
+
+```
+CITE audit for example.com as an e-commerce site
+Score this SaaS domain against the 40-item benchmark: [domain]
+```
+
+### Comparative Audit
+
+```
+Compare domain authority: [your domain] vs [competitor 1] vs [competitor 2]
+```
+
+### Combined Assessment
+
+```
+Run full 120-item assessment on [domain]: CITE domain audit + CORE-EEAT content audit on [sample pages]
+```
+
+## Skill Contract
+
+**Gate verdict**: **TRUSTED** (no veto items, scores above threshold) / **CAUTIOUS** (issues found but no veto) / **UNTRUSTED** (veto item T03, T05, or T09 failed). Always state the verdict prominently at the top of the report.
+
+**Expected output**: a CITE audit report, a citation-trust verdict, and a short handoff summary ready for `memory/audits/domain/`.
+
+- **Reads**: the target domain, supporting authority signals, comparison domains, and prior decisions from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing authority report plus a reusable summary that can be stored under `memory/audits/domain/`.
+- **Promotes**: veto items and domain risks to `memory/hot-cache.md` (auto-saved). Authority context to `memory/audits/domain/`. Results feed into entity-optimizer as authority input for brand's canonical profile.
+- **Next handoff**: use the `Next Best Skill` below once the trust picture is clear.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+**With ~~link database + ~~SEO tool + ~~AI monitor + ~~knowledge graph + ~~brand monitor connected:**
+Automatically pull backlink profiles and link quality metrics from ~~link database, domain authority scores and keyword rankings from ~~SEO tool, AI citation data from ~~AI monitor, entity presence from ~~knowledge graph, and brand mention data from ~~brand monitor.
+
+**With manual data only:**
+Ask the user to provide:
+1. Domain to evaluate
+2. Domain type (if not auto-detectable): Content Publisher, Product & Service, E-commerce, Community & UGC, Tool & Utility, or Authority & Institutional
+3. Backlink data: referring domains count, domain authority, top linking domains
+4. Traffic estimates (from any SEO tool or SimilarWeb)
+5. Competitor domains for comparison (optional)
+
+Proceed with the full 40-item audit using provided data. Note in the output which items could not be fully evaluated due to missing access (e.g., AI citation data, knowledge graph queries, WHOIS history).
+
+## Instructions
+
+When a user requests a domain authority audit:
+
+### Step 1: Preparation
+
+```markdown
+### Audit Setup
+
+**Domain**: [domain]
+**Domain Type**: [auto-detected or user-specified]
+**Dimension Weights**: [from domain-type weight table below]
+
+#### Domain-Type Weight Table
+
+> Canonical source: `references/cite-domain-rating.md`. This inline copy is for convenience.
+
+| Dim | Default | Content Publisher | Product & Service | E-commerce | Community & UGC | Tool & Utility | Authority & Institutional |
+|-----|:-------:|:-:|:-:|:-:|:-:|:-:|:-:|
+| C | 35% | **40%** | 25% | 20% | 35% | 25% | **45%** |
+| I | 20% | 15% | **30%** | 20% | 10% | **30%** | 20% |
+| T | 25% | 20% | 25% | **35%** | 25% | 25% | 20% |
+| E | 20% | 25% | 20% | 25% | **30%** | 20% | 15% |
+
+#### Veto Check (Emergency Brake)
+
+| Veto Item | Status | Action |
+|-----------|--------|--------|
+| T03: Link-Traffic Coherence | ✅ Pass / ⚠️ VETO | [If VETO: "Audit backlink profile; disavow toxic links"] |
+| T05: Backlink Profile Uniqueness | ✅ Pass / ⚠️ VETO | [If VETO: "Flag as manipulation network; investigate link sources"] |
+| T09: Penalty & Deindex History | ✅ Pass / ⚠️ VETO | [If VETO: "Address penalty first; all other optimization is futile"] |
+```
+
+If any veto item triggers, flag it prominently at the top of the report. CITE Score is capped at 39 (Poor) regardless of other scores.
+
+### Step 2: C + I Audit (20 items)
+
+Evaluate each item against the criteria in [references/cite-domain-rating.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/cite-domain-rating.md).
+
+Score each item:
+- **Pass** = 10 points (fully meets criteria)
+- **Partial** = 5 points (partially meets criteria)
+- **Fail** = 0 points (does not meet criteria)
+
+```markdown
+### C — Citation
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| C01 | Referring Domains Volume | Pass/Partial/Fail | [specific observation] |
+| C02 | Referring Domains Quality | Pass/Partial/Fail | [specific observation] |
+| ... | ... | ... | ... |
+| C10 | Link Source Diversity | Pass/Partial/Fail | [specific observation] |
+
+**C Score**: [X]/100
+
+### I — Identity
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| I01 | Knowledge Graph Presence | Pass/Partial/Fail | [specific observation] |
+| ... | ... | ... | ... |
+
+**I Score**: [X]/100
+```
+
+### Step 3: T + E Audit (20 items)
+
+Same format for Trust and Eminence dimensions.
+
+```markdown
+### T — Trust
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| T01 | Link Profile Naturalness | Pass/Partial/Fail | [specific observation] |
+| ... | ... | ... | ... |
+
+**T Score**: [X]/100
+
+### E — Eminence
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| E01 | Organic Search Visibility | Pass/Partial/Fail | [specific observation] |
+| ... | ... | ... | ... |
+
+**E Score**: [X]/100
+```
+
+**Note**: Some items require specialized data (C05-C08 AI citation data, I01 knowledge graph queries, T04-T05 IP/profile analysis). Score what is observable; mark unverifiable items as "N/A — requires [data source]" and exclude from dimension average.
+
+### Step 4: Scoring & Report
+
+Calculate scores and generate the final report:
+
+```markdown
+## CITE Domain Authority Report
+
+### Overview
+
+- **Domain**: [domain]
+- **Domain Type**: [type]
+- **Audit Date**: [date]
+- **CITE Score**: [score]/100 ([rating])
+- **Veto Status**: ✅ No triggers / ⚠️ [item] triggered — Score capped at 39
+
+### Dimension Scores
+
+| Dimension | Score | Rating | Weight | Weighted |
+|-----------|-------|--------|--------|----------|
+| C — Citation | [X]/100 | [rating] | [X]% | [X] |
+| I — Identity | [X]/100 | [rating] | [X]% | [X] |
+| T — Trust | [X]/100 | [rating] | [X]% | [X] |
+| E — Eminence | [X]/100 | [rating] | [X]% | [X] |
+| **CITE Score** | | | | **[X]/100** |
+
+**Score Calculation**: CITE Score = C × [w_C] + I × [w_I] + T × [w_T] + E × [w_E]
+
+**Rating Scale**: 90-100 Excellent | 75-89 Good | 60-74 Medium | 40-59 Low | 0-39 Poor
+
+### Per-Item Scores
+
+| ID | Check Item | Score | Notes |
+|----|-----------|-------|-------|
+| C01 | Referring Domains Volume | [Pass/Partial/Fail] | [observation] |
+| C02 | Referring Domains Quality | [Pass/Partial/Fail] | [observation] |
+| ... | ... | ... | ... |
+| E10 | Industry Share of Voice | [Pass/Partial/Fail] | [observation] |
+
+### Top 5 Priority Improvements
+
+Sorted by: weight × points lost (highest impact first)
+
+1. **[ID] [Name]** — [specific modification suggestion]
+   - Current: [Fail/Partial] | Potential gain: [X] weighted points
+   - Action: [concrete step]
+2. **[ID] [Name]** — [specific modification suggestion]
+   - Current: [Fail/Partial] | Potential gain: [X] weighted points
+   - Action: [concrete step]
+3–5. [Same format]
+
+### Action Plan
+
+#### Quick Wins (< 1 week)
+- [ ] [Action 1]
+- [ ] [Action 2]
+#### Medium Effort (1-4 weeks)
+- [ ] [Action 3]
+- [ ] [Action 4]
+#### Strategic (1-3 months)
+- [ ] [Action 5]
+- [ ] [Action 6]
+
+### Cross-Reference with CORE-EEAT
+
+For a complete assessment, pair this CITE audit with a CORE-EEAT content audit:
+
+| Assessment | Score | Rating |
+|-----------|-------|--------|
+| CITE (Domain) | [X]/100 | [rating] |
+| CORE-EEAT (Content) | [Run content-quality-auditor on sample pages] | — |
+
+**Diagnosis Matrix**:
+- High CITE + High CORE-EEAT → Maintain and expand
+- High CITE + Low CORE-EEAT → Prioritize content quality
+- Low CITE + High CORE-EEAT → Build domain authority
+- Low CITE + Low CORE-EEAT → Start with content, then domain
+
+### Recommended Next Steps
+
+- For domain authority building: focus on top 5 priorities above
+- For content improvement: use `content-quality-auditor` on key pages
+- For backlink strategy: use `backlink-analyzer` for detailed link analysis
+- For competitor benchmarking: use `competitor-analysis` with CITE scores
+- For tracking progress: run `/seo:report` with CITE score trends
+```
+
+### Save Results
+
+After delivering findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to the appropriate `memory/` path using filename `YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Domain identified and accessible
+- [ ] Domain type confirmed (auto-detected or user-specified)
+- [ ] Backlink data available (at minimum: referring domains count, DA/DR)
+- [ ] If comparative audit, competitor domains also specified
+
+### Output Validation
+- [ ] All 40 items scored (or marked N/A with reason)
+- [ ] All 4 dimension scores calculated correctly
+- [ ] Weighted CITE Score matches domain-type weight configuration
+- [ ] All 3 veto items checked first and flagged if triggered
+- [ ] Top 5 improvements sorted by weighted impact, not arbitrary
+- [ ] Every recommendation is specific and actionable (not generic advice)
+- [ ] Action plan includes concrete steps with effort estimates
+
+## Example
+
+See [references/example-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/domain-authority-auditor/references/example-report.md) for a complete CITE audit of cloudhosting.com showing veto check, dimension scores, top 5 improvements, action plan, and cross-reference with CORE-EEAT.
+
+## Tips for Success
+
+1. **Start with veto items** — T03, T05, T09 can invalidate the entire score
+2. **Identify domain type first** — Different types have very different weight profiles
+3. **AI citation items (C05-C08) matter most for GEO** — Test by querying AI engines with niche-relevant questions
+4. **Some items need specialized tools** — Knowledge graph queries, AI citation monitoring, and IP diversity analysis may require manual research if tools aren't connected
+5. **Pair with CORE-EEAT for full picture** — Domain authority without content quality (or vice versa) tells only half the story
+
+## Reference Materials
+
+- [CITE Domain Rating](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/cite-domain-rating.md) — Full 40-item benchmark with dimension definitions, scoring criteria, domain-type weight tables, and veto items
+- [references/example-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/domain-authority-auditor/references/example-report.md) — Complete CITE audit example with scored dimensions, top 5 improvements, action plan, and CORE-EEAT cross-reference
+
+## Next Best Skill
+
+- **Primary**: [backlink-analyzer](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/backlink-analyzer/SKILL.md) — turn trust or citation issues into link-level investigation.
diff --git a/.cursor/skills/domain-authority-auditor/references/example-report.md b/.cursor/skills/domain-authority-auditor/references/example-report.md
new file mode 100644
index 0000000000..fe20e30132
--- /dev/null
+++ b/.cursor/skills/domain-authority-auditor/references/example-report.md
@@ -0,0 +1,99 @@
+# CITE Domain Authority Report — Example
+
+Full example audit output for reference. See the [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/domain-authority-auditor/SKILL.md) for the complete workflow.
+
+## Example
+
+**User**: "Audit domain authority for cloudhosting.com as a content publisher"
+
+**Output**:
+
+```markdown
+## CITE Domain Authority Report
+
+### Overview
+
+- **Domain**: cloudhosting.com
+- **Domain Type**: Content Publisher
+- **Audit Date**: 2025-02-03
+- **CITE Score**: 69.9/100 (Medium)
+- **Veto Status**: ✅ No triggers
+
+#### Veto Check (Emergency Brake)
+
+| Veto Item | Status | Action |
+|-----------|--------|--------|
+| T03: Link-Traffic Coherence | ✅ Pass | Link growth correlates with traffic growth |
+| T05: Backlink Profile Uniqueness | ✅ Pass | No PBN patterns detected; diverse link sources |
+| T09: Penalty & Deindex History | ✅ Pass | No manual actions; clean penalty history |
+
+### Dimension Scores
+
+| Dimension | Score | Rating | Weight | Weighted |
+|-----------|-------|--------|--------|----------|
+| C — Citation | 72/100 | Medium | 40% | 28.8 |
+| I — Identity | 58/100 | Low | 15% | 8.7 |
+| T — Trust | 81/100 | Good | 20% | 16.2 |
+| E — Eminence | 65/100 | Medium | 25% | 16.25 |
+| **CITE Score** | | | | **69.9/100** |
+
+**Score Calculation**:
+- CITE Score = 72 × 0.40 + 58 × 0.15 + 81 × 0.20 + 65 × 0.25 = 69.9
+
+**Rating Scale**: 90-100 Excellent | 75-89 Good | 60-74 Medium | 40-59 Low | 0-39 Poor
+
+### Top 5 Priority Improvements
+
+Sorted by: weight × points lost (highest impact first)
+
+1. **I01 Knowledge Graph Presence** — Create entity entry in Google Knowledge Graph
+   - Current: Fail | Potential gain: 1.5 weighted points
+   - Action: Create Wikidata entry for CloudHost Inc. with P856 (website), P452 (industry), P571 (inception)
+
+2. **C05 AI Citation Volume** — Increase citations in AI-generated answers
+   - Current: Partial | Potential gain: 2.0 weighted points
+   - Action: Optimize top 10 pages for GEO; add definitive statements AI can quote directly
+
+3. **I03 Brand SERP Control** — Branded SERP shows only 4 of 10 results from owned properties
+   - Current: Partial | Potential gain: 0.75 weighted points
+   - Action: Claim Google Business Profile; build out social profiles; create CrunchBase entry
+
+4. **E04 Content Freshness Cadence** — 40% of content is >12 months without update
+   - Current: Partial | Potential gain: 1.25 weighted points
+   - Action: Establish monthly content refresh schedule; prioritize top 20 traffic pages
+
+5. **I05 Schema.org Completeness** — Organization schema missing sameAs, founder, foundingDate
+   - Current: Partial | Potential gain: 0.75 weighted points
+   - Action: Add complete Organization schema with sameAs links to Wikidata, LinkedIn, CrunchBase
+
+### Action Plan
+
+#### Quick Wins (< 1 week)
+- [ ] Add sameAs, founder, and foundingDate to Organization schema
+- [ ] Claim Google Business Profile for branded SERP control
+
+#### Medium Effort (1-4 weeks)
+- [ ] Create Wikidata entry with complete properties and references
+- [ ] Optimize top 10 pages with GEO-friendly definitive statements
+- [ ] Create or complete CrunchBase, LinkedIn company page profiles
+
+#### Strategic (1-3 months)
+- [ ] Launch monthly content refresh program targeting stale pages
+- [ ] Build topical authority through 3-4 pillar content clusters
+- [ ] Pursue digital PR to earn mentions on industry publications (TechCrunch, G2)
+
+### Cross-Reference with CORE-EEAT
+
+| Assessment | Score | Rating |
+|-----------|-------|--------|
+| CITE (Domain) | 69.9/100 | Medium |
+| CORE-EEAT (Content) | Run content-quality-auditor on sample pages | — |
+
+**Diagnosis**: Low CITE + unknown CORE-EEAT → Run `/seo:audit-page` on top 5 landing pages to determine whether to prioritize content quality or domain authority first.
+
+### Recommended Next Steps
+
+- For entity building: run `entity-optimizer` to strengthen I-dimension signals
+- For content audit: use `content-quality-auditor` on key pages
+- For tracking progress: run `/seo:report` with CITE score trends quarterly
+```
diff --git a/.cursor/skills/entity-optimizer/SKILL.md b/.cursor/skills/entity-optimizer/SKILL.md
new file mode 100644
index 0000000000..7a2a6c5029
--- /dev/null
+++ b/.cursor/skills/entity-optimizer/SKILL.md
@@ -0,0 +1,376 @@
+---
+name: entity-optimizer
+description: 'Build entity presence in Knowledge Graph, Wikidata, AI systems for brand recognition and citations. 实体优化/知识图谱'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when optimizing entity presence for Knowledge Graph, Wikidata, or AI engine disambiguation. Also for brand entity canonicalization."
+argument-hint: "<entity name or brand>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "high"
+  tags:
+    - seo
+    - geo
+    - entity-optimization
+    - knowledge-graph
+    - knowledge-panel
+    - brand-entity
+    - wikidata
+    - entity-disambiguation
+    - 实体优化
+    - エンティティ
+    - 엔티티
+    - entidad-seo
+  triggers:
+    # EN-formal
+    - "optimize entity presence"
+    - "build knowledge graph"
+    - "improve knowledge panel"
+    - "entity audit"
+    - "establish brand entity"
+    - "entity disambiguation"
+    # EN-casual
+    - "Google doesn't know my brand"
+    - "no knowledge panel"
+    - "establish my brand"
+    - "establish my brand as an entity"
+    - "get a Google knowledge card"
+    - "no Wikipedia entry"
+    # EN-question
+    - "how to get a knowledge panel"
+    - "how to build brand entity"
+    # ZH-pro
+    - "实体优化"
+    - "知识图谱"
+    - "品牌实体"
+    - "知识面板"
+    - "品牌词"
+    - "品牌词优化"
+    # ZH-casual
+    - "品牌搜不到"
+    - "没有知识面板"
+    - "Google不认识我的品牌"
+    # JA
+    - "エンティティ最適化"
+    - "ナレッジパネル"
+    # KO
+    - "엔티티 최적화"
+    - "지식 패널"
+    - "구글이 내 브랜드 모르는데?"
+    - "지식 패널 만들려면?"
+    # ES
+    - "optimización de entidad"
+    - "panel de conocimiento"
+    # PT
+    - "otimização de entidade"
+    # Misspellings
+    - "knowlege panel"
+    - "enity optimization"
+---
+
+# Entity Optimizer
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This cross-cutting skill is part of the protocol layer and follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+Audits, builds, and maintains entity identity across search engines and AI systems. Entities — the people, organizations, products, and concepts that search engines and AI systems recognize as distinct things — are the foundation of how both Google and LLMs decide *what a brand is* and *whether to cite it*.
+
+**Why entities matter for SEO + GEO:**
+
+- **SEO**: Google's Knowledge Graph powers Knowledge Panels, rich results, and entity-based ranking signals. A well-defined entity earns SERP real estate.
+- **GEO**: AI systems resolve queries to entities before generating answers. If an AI cannot identify an entity, it cannot cite it — no matter how good the content is.
+
+**System role**: Canonical Entity Profile. It acts as the source of truth for entity identity, associations, and disambiguation across the library.
+
+## When This Must Trigger
+
+Use this when brand or entity identity needs to be established or verified — even if the user doesn't use entity terminology:
+
+- User says "Google doesn't know my brand" or "no knowledge panel"
+- Auto-recommended when `memory/entities/candidates.md` accumulates 3 or more uncanonized entity candidates from other skills
+- Establishing a new brand/person/product as a recognized entity
+- Auditing current entity presence across Knowledge Graph, Wikidata, and AI systems
+- Improving or correcting a Knowledge Panel
+- Building entity associations (entity ↔ topic, entity ↔ industry)
+- Resolving entity disambiguation issues (your entity confused with another)
+- Strengthening entity signals for AI citation
+- After launching a new brand, product, or organization
+- Preparing for a site migration (preserving entity identity)
+- Running periodic entity health checks
+
+## What This Skill Does
+
+1. **Entity Audit**: Evaluates current entity presence across search and AI systems
+2. **Knowledge Graph Analysis**: Checks Google Knowledge Graph, Wikidata, and Wikipedia status
+3. **AI Entity Resolution Test**: Queries AI systems to see how they identify and describe the entity
+4. **Entity Signal Mapping**: Identifies all signals that establish entity identity
+5. **Gap Analysis**: Finds missing or weak entity signals
+6. **Entity Building Plan**: Creates actionable plan to establish or strengthen entity presence
+7. **Disambiguation Strategy**: Resolves confusion with similarly-named entities
+
+## Quick Start
+
+Start with one of these prompts. Finish with a canonical entity profile and a handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Entity Audit
+
+```
+Audit entity presence for [brand/person/organization]
+```
+
+```
+How well do search engines and AI systems recognize [entity name]?
+```
+
+### Build Entity Presence
+
+```
+Build entity presence for [new brand] in the [industry] space
+```
+
+```
+Establish [person name] as a recognized expert in [topic]
+```
+
+### Fix Entity Issues
+
+```
+My Knowledge Panel shows incorrect information — fix entity signals for [entity]
+```
+
+```
+AI systems confuse [my entity] with [other entity] — help me disambiguate
+```
+
+## Skill Contract
+
+**Expected output**: an entity audit, a canonical entity profile, and a short handoff summary ready for `memory/entities/`.
+
+- **Reads**: the entity name, primary domain, known profiles, topic associations, and prior brand context from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing entity report plus a reusable profile that can be stored under `memory/entities/`.
+- **Promotes**: canonical names, sameAs links, disambiguation notes, and entity gaps to `CLAUDE.md`, `memory/entities/`, and `memory/open-loops.md`.
+
+This skill is the sole writer of canonical entity profiles at `memory/entities/<name>.md`. Other skills write entity candidates to `memory/entities/candidates.md` only. When 3+ candidates accumulate, this skill should be recommended.
+
+- **Next handoff**: use the `Next Best Skill` below once the entity truth is clear.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~knowledge graph + ~~SEO tool + ~~AI monitor + ~~brand monitor connected:**
+Query Knowledge Graph API for entity status, pull branded search data from ~~SEO tool, test AI citation with ~~AI monitor, track brand mentions with ~~brand monitor.
+
+**With manual data only:**
+Ask the user to provide:
+1. Entity name, type (Person, Organization, Brand, Product, Creative Work, Event)
+2. Primary website / domain
+3. Known existing profiles (Wikipedia, Wikidata, social media, industry directories)
+4. Top 3-5 topics/industries the entity should be associated with
+5. Any known disambiguation issues (other entities with same/similar name)
+
+Without tools, Claude provides entity optimization strategy and recommendations based on information the user provides. The user must run search queries, check Knowledge Panels, and test AI responses to supply the raw data for analysis.
+
+Proceed with the audit using public search results, AI query testing, and SERP analysis. Note which items require tool access for full evaluation.
+
+## Instructions
+
+When a user requests entity optimization:
+
+### Step 1: Entity Discovery
+
+Establish the entity's current state across all systems.
+
+```markdown
+### Entity Profile
+
+**Entity Name**: [name]
+**Entity Type**: [Person / Organization / Brand / Product / Creative Work / Event]
+**Primary Domain**: [URL]
+**Target Topics**: [topic 1, topic 2, topic 3]
+
+#### Current Entity Presence
+
+| Platform | Status | Details |
+|----------|--------|---------|
+| Google Knowledge Panel | ✅ Present / ❌ Absent / ⚠️ Incorrect | [details] |
+| Wikidata | ✅ Listed / ❌ Not listed | [QID if exists] |
+| Wikipedia | ✅ Article / ⚠️ Mentioned only / ❌ Absent | [notability assessment] |
+| Google Knowledge Graph API | ✅ Entity found / ❌ Not found | [entity ID, types, score] |
+| Schema.org on site | ✅ Complete / ⚠️ Partial / ❌ Missing | [Organization/Person/Product schema] |
+
+#### AI Entity Resolution Test
+
+**Note**: Claude cannot directly query other AI systems or perform real-time web searches without tool access. When running without ~~AI monitor or ~~knowledge graph tools, ask the user to run these test queries and report the results, or use the user-provided information to assess entity presence.
+
+Test how AI systems identify this entity by querying:
+- "What is [entity name]?"
+- "Who founded [entity name]?" (for organizations)
+- "What does [entity name] do?"
+- "[entity name] vs [competitor]"
+
+| AI System | Recognizes Entity? | Description Accuracy | Cites Entity's Content? |
+|-----------|-------------------|---------------------|------------------------|
+| ChatGPT | ✅ / ⚠️ / ❌ | [accuracy notes] | [yes/no/partially] |
+| Claude | ✅ / ⚠️ / ❌ | [accuracy notes] | [yes/no/partially] |
+| Perplexity | ✅ / ⚠️ / ❌ | [accuracy notes] | [yes/no/partially] |
+| Google AI Overview | ✅ / ⚠️ / ❌ | [accuracy notes] | [yes/no/partially] |
+```
+
+### Step 2: Entity Signal Audit
+
+Evaluate entity signals across 6 categories. For the detailed 47-signal checklist with verification methods, see [references/entity-signal-checklist.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/entity-signal-checklist.md).
+
+Evaluate each signal as Pass / Fail / Partial with a specific action for each gap. The 6 categories are:
+
+1. **Structured Data Signals** -- Organization/Person schema, sameAs links, @id consistency, author schema
+2. **Knowledge Base Signals** -- Wikidata, Wikipedia, CrunchBase, industry directories
+3. **Consistent NAP+E Signals** -- Name/description/logo/social consistency across platforms
+4. **Content-Based Entity Signals** -- About page, author pages, topical authority, branded backlinks
+5. **Third-Party Entity Signals** -- Authoritative mentions, co-citation, reviews, press coverage
+6. **AI-Specific Entity Signals** -- Clear definitions, disambiguation, verifiable claims, crawlability
+
+> **Reference**: Use the audit template in [references/entity-signal-checklist.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/entity-signal-checklist.md) for the full 47-signal checklist with verification methods for each category.
+
+### Step 3: Report & Action Plan
+
+```markdown
+## Entity Optimization Report
+
+### Overview
+
+- **Entity**: [name]
+- **Entity Type**: [type]
+- **Audit Date**: [date]
+
+### Signal Category Summary
+
+| Category | Status | Key Findings |
+|----------|--------|-------------|
+| Structured Data | ✅ Strong / ⚠️ Gaps / ❌ Missing | [key findings] |
+| Knowledge Base | ✅ Strong / ⚠️ Gaps / ❌ Missing | [key findings] |
+| Consistency (NAP+E) | ✅ Strong / ⚠️ Gaps / ❌ Missing | [key findings] |
+| Content-Based | ✅ Strong / ⚠️ Gaps / ❌ Missing | [key findings] |
+| Third-Party | ✅ Strong / ⚠️ Gaps / ❌ Missing | [key findings] |
+| AI-Specific | ✅ Strong / ⚠️ Gaps / ❌ Missing | [key findings] |
+
+### Critical Issues
+
+[List any issues that severely impact entity recognition — disambiguation problems, incorrect Knowledge Panel, missing from Knowledge Graph entirely]
+
+### Top 5 Priority Actions
+
+Sorted by: impact on entity recognition × effort required
+
+1. **[Signal]** — [specific action]
+   - Impact: [High/Medium] | Effort: [Low/Medium/High]
+   - Why: [explanation of how this improves entity recognition]
+
+2. **[Signal]** — [specific action]
+   - Impact: [High/Medium] | Effort: [Low/Medium/High]
+   - Why: [explanation]
+
+3–5. [Same format]
+
+### Entity Building Roadmap
+
+#### Week 1-2: Foundation (Structured Data + Consistency)
+- [ ] Implement/fix Organization or Person schema with full properties
+- [ ] Add sameAs links to all authoritative profiles
+- [ ] Audit and fix NAP+E consistency across all platforms
+- [ ] Ensure About page is entity-rich and well-structured
+
+#### Month 1: Knowledge Bases
+- [ ] Create or update Wikidata entry with complete properties
+- [ ] Ensure CrunchBase / industry directory profiles are complete
+- [ ] Build Wikipedia notability (or plan path to notability)
+- [ ] Submit to relevant authoritative directories
+
+#### Month 2-3: Authority Building
+- [ ] Secure mentions on authoritative industry sites
+- [ ] Build co-citation signals with established entities
+- [ ] Create topical content clusters that reinforce entity-topic associations
+- [ ] Pursue PR opportunities that generate entity mentions
+
+#### Ongoing: AI-Specific Optimization
+- [ ] Test AI entity resolution quarterly
+- [ ] Update factual claims to remain current and verifiable
+- [ ] Monitor AI systems for incorrect entity information
+- [ ] Ensure new content reinforces entity identity signals
+
+### Cross-Reference
+
+- **CORE-EEAT relevance**: Items A07 (Knowledge Graph Presence) and A08 (Entity Consistency) directly overlap — entity optimization strengthens Authority dimension
+- **CITE relevance**: CITE I01-I10 (Identity dimension) measures entity signals at domain level — entity optimization feeds these scores
+- For content-level audit: `content-quality-auditor`
+- For domain-level audit: `domain-authority-auditor`
+```
+
+### Save Results
+
+After delivering findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to the appropriate `memory/` path using filename `YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Entity name and type identified
+- [ ] Primary domain/website confirmed
+- [ ] Target topics/industries specified
+- [ ] Disambiguation context provided (if entity name is common)
+
+### Output Validation
+- [ ] All 6 signal categories evaluated
+- [ ] AI entity resolution tested with at least 3 queries
+- [ ] Knowledge Panel status checked
+- [ ] Wikidata/Wikipedia status verified
+- [ ] Schema.org markup on primary site audited
+- [ ] Every recommendation is specific and actionable
+- [ ] Roadmap includes concrete steps with timeframes
+- [ ] Cross-reference with CORE-EEAT A07/A08 and CITE I01-I10 noted
+
+## Example
+
+> **Reference**: See [references/example-audit-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/example-audit-report.md) for a complete example entity audit report for a B2B SaaS company (CloudMetrics), including AI entity resolution test results, entity health summary, top 3 priority actions, and CORE-EEAT/CITE cross-references.
+
+## Tips for Success
+
+1. **Start with Wikidata** — It's the single most influential editable knowledge base; a complete Wikidata entry with references often triggers Knowledge Panel creation within weeks
+2. **sameAs is your most powerful Schema.org property** — It directly tells search engines "I am this entity in the Knowledge Graph"; always include Wikidata URL first
+3. **Test AI recognition before and after** — Query ChatGPT, Claude, Perplexity, and Google AI Overview before optimizing, then again after; this is the most direct GEO metric
+4. **Entity signals compound** — Unlike content SEO, entity signals from different sources reinforce each other; 5 weak signals together are stronger than 1 strong signal alone
+5. **Consistency beats completeness** — A consistent entity name and description across 10 platforms beats a perfect profile on just 2
+6. **Don't neglect disambiguation** — If your entity name is shared with anything else, disambiguation is the first priority; all other signals are wasted if they're attributed to the wrong entity
+7. **Pair with CITE I-dimension for domain context** — Entity audit tells you how well the entity is recognized; CITE Identity (I01-I10) tells you how well the domain represents that entity; use both together
+
+## Entity Type Reference
+
+> **Reference**: See [references/entity-type-reference.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/entity-type-reference.md) for entity types with key signals, schemas, and disambiguation strategies by situation.
+
+## Knowledge Panel & Wikidata Optimization
+
+> **Reference**: See [references/knowledge-panel-wikidata-guide.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/knowledge-panel-wikidata-guide.md) for Knowledge Panel claiming/editing, common issues and fixes, Wikidata entry creation, key properties by entity type, and AI entity resolution optimization.
+
+## Reference Materials
+
+Detailed guides for entity optimization:
+- [references/entity-signal-checklist.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/entity-signal-checklist.md) — Complete signal checklist with verification methods
+- [references/knowledge-graph-guide.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/knowledge-graph-guide.md) — Wikidata, Wikipedia, and Knowledge Graph optimization playbook
+
+## Next Best Skill
+
+- **Primary**: [schema-markup-generator](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/schema-markup-generator/SKILL.md) — turn entity truth into machine-readable implementation.
diff --git a/.cursor/skills/entity-optimizer/references/entity-signal-checklist.md b/.cursor/skills/entity-optimizer/references/entity-signal-checklist.md
new file mode 100644
index 0000000000..b6b772fe59
--- /dev/null
+++ b/.cursor/skills/entity-optimizer/references/entity-signal-checklist.md
@@ -0,0 +1,137 @@
+# Entity Signal Checklist
+
+> Part of [entity-optimizer](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/SKILL.md). See also: [knowledge-graph-guide.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/knowledge-graph-guide.md)
+
+Complete checklist of entity signals organized by priority and verification method. Use this as a systematic audit guide — work through each signal, verify its status, and note actions needed.
+
+## Priority 1: Foundation Signals (Must-Have)
+
+These signals form the minimum viable entity identity. Without them, search engines and AI systems cannot reliably identify the entity.
+
+### On-Site Structured Data
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 1 | Organization or Person schema on homepage | Run Google Rich Results Test on homepage | Schema present with name, url, logo, description |
+| 2 | sameAs property links to all authoritative profiles | Inspect schema markup | Links to Wikipedia, Wikidata, LinkedIn, social profiles |
+| 3 | Consistent @id used across all pages | Inspect schema on 5+ pages | Same @id (typically homepage URL + #organization) on every page |
+| 4 | About page exists with entity-rich content | Manual review | First paragraph defines entity clearly; includes founding date, key people, mission |
+| 5 | Contact page with verifiable information | Manual review | Physical address, phone, email — matches other directory listings |
+
+### Key External Profiles
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 6 | Wikidata entry exists | Search wikidata.org | Entry with label, description, key properties, and references |
+| 7 | Google Business Profile (if applicable) | Search "[entity] Google Business" | Claimed, verified, complete profile |
+| 8 | LinkedIn company/person page | Search LinkedIn | Complete profile matching entity name and description |
+| 9 | CrunchBase profile (for companies/products) | Search crunchbase.com | Entry with description, founding info, key people |
+| 10 | Primary industry directory listing | Search top 3 industry directories | Listed with correct entity information |
+
+### Branded Search Presence
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 11 | Branded search returns correct entity | Google "[entity name]" | Entity's website is #1; Knowledge Panel appears or SERP clearly identifies entity |
+| 12 | No disambiguation confusion | Google "[entity name]" | No other prominent entity dominates results for the same name |
+| 13 | Branded search volume exists | Check ~~SEO tool | Measurable branded search volume (any amount > 0) |
+
+## Priority 2: Authority Signals (Should-Have)
+
+These signals establish the entity as recognized and authoritative. They separate a "registered entity" from a "known entity."
+
+### Knowledge Graph Depth
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 14 | Google Knowledge Panel present | Google "[entity name]" | Knowledge Panel displayed with correct information |
+| 15 | Knowledge Panel attributes complete | Review Knowledge Panel | Key attributes filled (founded, CEO, location, industry, etc.) |
+| 16 | Knowledge Panel image correct | Review Knowledge Panel | Preferred image displayed |
+| 17 | Wikipedia article (or strong notability path) | Search Wikipedia | Article exists, or entity has 3+ independent reliable sources for future article |
+| 18 | Wikidata properties complete | Review Wikidata entry | 10+ properties with references |
+
+### Third-Party Validation
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 19 | Authoritative media mentions | Google News search for entity | 3+ mentions in recognized publications |
+| 20 | Industry awards or recognitions | Search "[entity] award" | At least 1 verifiable award or recognition |
+| 21 | Co-citation with established entities | Search for entity alongside competitors | Appears in "X vs Y" comparisons, listicles, or industry roundups |
+| 22 | Speaking engagements or publications | Search event/conference sites | Appears as speaker, author, or contributor |
+| 23 | Reviews on third-party platforms | Check G2, Trustpilot, Yelp, etc. | Reviews exist with reasonable volume and rating |
+
+### Content Authority
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 24 | Topical content depth in target areas | Site search for target topics | 10+ pages covering target topics in depth |
+| 25 | Author pages with credentials | Review author pages | Author schema, credentials, sameAs to external profiles |
+| 26 | Original research or data published | Review content | At least 1 piece of original data/research cited by others |
+| 27 | Entity mentioned in own content naturally | Search site for entity name | Entity name appears contextually (not just in header/footer) |
+
+## Priority 3: AI-Specific Signals (Must-Have for GEO)
+
+These signals specifically help AI systems recognize, understand, and cite the entity.
+
+### AI Recognition
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 28 | ChatGPT recognizes entity | Ask "What is [entity]?" | Correct description returned |
+| 29 | Perplexity recognizes entity | Ask "What is [entity]?" | Correct description with source citations |
+| 30 | Google AI Overview mentions entity | Search branded + topical queries | Entity appears in AI-generated overview |
+| 31 | AI description is accurate | Compare AI output to entity's self-description | No factual errors in AI's response |
+| 32 | AI associates entity with correct topics | Ask "[entity] expertise areas" | Correct topic associations returned |
+
+### AI Optimization
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 33 | Entity definition quotable in first paragraph | Review About page and key pages | Clear, factual, self-contained definition suitable for AI quotation |
+| 34 | Factual claims are verifiable | Cross-reference claims with external sources | All claims about entity can be verified via third-party sources |
+| 35 | Entity name used consistently | Audit all platforms | Identical name format everywhere (no abbreviations in some places, full name in others) |
+| 36 | Content is crawlable by AI systems | Check robots.txt for AI bot access | Not blocking GPTBot, ClaudeBot, or other AI crawlers (unless intentional) |
+| 37 | Fresh information available | Check update dates | Key entity pages updated within last 6 months |
+
+## Priority 4: Advanced Signals (Nice-to-Have)
+
+These signals provide marginal gains but demonstrate thoroughness and maturity.
+
+### Extended Knowledge Base Presence
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 38 | Multiple language entries in Wikidata | Check Wikidata labels | Labels and descriptions in languages matching target markets |
+| 39 | DBpedia entry | Search dbpedia.org | Entry exists (auto-generated from Wikipedia) |
+| 40 | Google Knowledge Graph ID known | Search Google Knowledge Graph API | Entity has a kg: identifier |
+| 41 | ISNI or VIAF identifier (for persons) | Search isni.org or viaf.org | Identifier exists and links correctly |
+
+### Social Entity Signals
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 42 | Social profiles bidirectionally linked | Check website links to social AND social links to website | Both directions verified on all platforms |
+| 43 | Consistent entity description across social | Compare bios on all platforms | Same core description, adapted for platform length limits |
+| 44 | Social engagement demonstrates real audience | Review engagement metrics | Engagement patterns consistent with genuine audience (not bot-like) |
+
+### Technical Entity Signals
+
+| # | Signal | Verification Method | Pass Criteria |
+|---|--------|-------------------|---------------|
+| 45 | Entity homepage has strong backlink profile | Check ~~link database | Homepage DR/DA above industry median |
+| 46 | Branded anchor text in backlinks | Analyze anchor text distribution | Entity name appears naturally in inbound link anchor text |
+| 47 | Entity subdomain consistency | Check all subdomains | Same entity schema and branding across all subdomains |
+
+## How to Use This Checklist
+
+Work through signals by priority tier. For each signal, mark status as ✅ (present and correct), ⚠️ (present but incomplete), or ❌ (absent). Focus on completing each priority tier before moving to the next.
+
+### Priority Action Matrix
+
+| Current State | Focus Area | Expected Timeline |
+|--------------|-----------|-------------------|
+| Most Priority 1 signals ❌ | Priority 1 foundation signals only | 2-4 weeks |
+| Priority 1 mostly ✅, Priority 2 mixed | Priority 2 authority signals | 1-2 months |
+| Priority 1-2 mostly ✅ | Priority 3 AI-specific signals | 2-3 months |
+| Priority 1-3 mostly ✅ | Selective Priority 4 for completeness | Ongoing |
+| All tiers mostly ✅ | Maintenance + quarterly re-audit | Quarterly review |
diff --git a/.cursor/skills/entity-optimizer/references/entity-type-reference.md b/.cursor/skills/entity-optimizer/references/entity-type-reference.md
new file mode 100644
index 0000000000..1bffe7d948
--- /dev/null
+++ b/.cursor/skills/entity-optimizer/references/entity-type-reference.md
@@ -0,0 +1,24 @@
+# Entity Type Reference
+
+Reference tables for entity types, key signals, and disambiguation strategies.
+
+## Entity Types and Key Signals
+
+| Entity Type | Primary Signals | Secondary Signals | Key Schema |
+|-------------|----------------|-------------------|------------|
+| **Person** | Author pages, social profiles, publication history | Speaking, awards, media mentions | Person, ProfilePage |
+| **Organization** | Registration records, Wikidata, industry listings | Press coverage, partnerships, awards | Organization, Corporation |
+| **Brand** | Trademark, branded search volume, social presence | Reviews, brand mentions, visual identity | Brand, Organization |
+| **Product** | Product pages, reviews, comparison mentions | Awards, expert endorsements, market share | Product, SoftwareApplication |
+| **Creative Work** | Publication record, citations, reviews | Awards, adaptations, cultural impact | CreativeWork, Book, Movie |
+| **Event** | Event listings, press coverage, social buzz | Sponsorships, speaker profiles, attendance | Event |
+
+## Disambiguation Strategy by Situation
+
+| Situation | Strategy |
+|-----------|----------|
+| **Common name, unique entity** | Strengthen all signals; let signal volume resolve ambiguity |
+| **Name collision with larger entity** | Add qualifier consistently (e.g., "Acme Software" not just "Acme"); use sameAs extensively; build topic-specific authority that differentiates |
+| **Name collision with similar entity** | Geographic, industry, or product qualifiers; ensure Schema @id is unique and consistent; prioritize Wikidata disambiguation |
+| **Abbreviation/acronym conflict** | Prefer full name in structured data; use abbreviation only in contexts where entity is already established |
+| **Merged or renamed entity** | Redirect old entity signals; update all structured data; create explicit "formerly known as" content; update Wikidata |
diff --git a/.cursor/skills/entity-optimizer/references/example-audit-report.md b/.cursor/skills/entity-optimizer/references/example-audit-report.md
new file mode 100644
index 0000000000..dc3648167f
--- /dev/null
+++ b/.cursor/skills/entity-optimizer/references/example-audit-report.md
@@ -0,0 +1,59 @@
+# Example: Entity Optimization Report
+
+Full example output for an entity audit request.
+
+**User**: "Audit entity presence for CloudMetrics, our B2B SaaS analytics platform at cloudmetrics.io"
+
+**Output**:
+
+```markdown
+## Entity Optimization Report
+
+### Entity Profile
+
+**Entity Name**: CloudMetrics
+**Entity Type**: Organization (B2B SaaS)
+**Primary Domain**: cloudmetrics.io
+**Target Topics**: analytics platform, business intelligence, enterprise analytics
+
+### AI Entity Resolution Test
+
+Queries tested with results reported by user:
+
+| Query | Result | Assessment |
+|-------|--------|------------|
+| "What is CloudMetrics?" | Described as "an analytics tool" with no further detail | Partial recognition -- generic description, no mention of B2B focus or key features |
+| "Best analytics platforms for enterprises" | CloudMetrics not mentioned in any AI response | Not recognized as a player in the enterprise analytics space |
+| "CloudMetrics vs Datadog" | Correctly identified as a competitor to Datadog, but feature comparison was incomplete and partially inaccurate | Partial -- entity is associated with the right category but attributes are thin |
+| "Who founded CloudMetrics?" | No answer found by any AI system tested | Entity leadership not present in AI knowledge bases |
+
+### Entity Health Summary
+
+| Signal Category | Status | Key Findings |
+|-----------------|--------|--------------|
+| Knowledge Graph | Missing | No Wikidata entry exists; no Google Knowledge Panel triggers for branded queries |
+| Structured Data | Partial | Organization schema present on homepage with name, url, and logo; missing Person schema for CEO and leadership team; no sameAs links to external profiles |
+| Web Presence | Strong | Consistent NAP across LinkedIn, Twitter/X, G2, and Crunchbase; social profiles link back to cloudmetrics.io; branded search returns owned properties in top 5 |
+| Content-Based | Partial | About page exists but opens with marketing copy rather than an entity-defining statement; no dedicated author pages for leadership |
+| Third-Party | Partial | Listed on G2 and Crunchbase; 2 industry publication mentions found; no awards or analyst coverage |
+| AI-Specific | Weak | AI systems have only surface-level awareness; entity definition is not quotable from any authoritative source |
+
+### Top 3 Priority Actions
+
+1. **Create Wikidata entry** with key properties: instance of (P31: business intelligence software company), official website (P856: cloudmetrics.io), inception (P571), country (P17)
+   - Impact: High | Effort: Low
+   - Why: Wikidata is the foundational knowledge base that feeds Google Knowledge Graph, Bing, and AI training pipelines; without it, the entity cannot be formally resolved
+
+2. **Add Person schema for leadership team** on the About/Team page, including name, jobTitle, sameAs links to LinkedIn profiles, and worksFor pointing to the Organization entity
+   - Impact: High | Effort: Low
+   - Why: Addresses the "Who founded CloudMetrics?" gap directly; Person schema for key people creates bidirectional entity associations that strengthen organizational identity
+
+3. **Build Wikipedia notability through independent press coverage** -- target 3-5 articles in industry publications (TechCrunch, VentureBeat, Analytics India Magazine) that mention CloudMetrics by name with verifiable claims
+   - Impact: High | Effort: High
+   - Why: Wikipedia notability requires coverage in independent reliable sources; press mentions simultaneously feed AI training data, build third-party entity signals, and create the citation foundation for a future Wikipedia article
+
+### Cross-Reference
+
+- **CORE-EEAT**: A07 (Knowledge Graph Presence) scored Fail, A08 (Entity Consistency) scored Pass -- entity optimization should focus on knowledge base gaps rather than consistency
+- **CITE**: I-dimension weakest area is I01 (Knowledge Graph Presence) -- completing Wikidata entry and earning Knowledge Panel directly improves domain identity score
+```
diff --git a/.cursor/skills/entity-optimizer/references/knowledge-graph-guide.md b/.cursor/skills/entity-optimizer/references/knowledge-graph-guide.md
new file mode 100644
index 0000000000..ed7ad0aeb4
--- /dev/null
+++ b/.cursor/skills/entity-optimizer/references/knowledge-graph-guide.md
@@ -0,0 +1,399 @@
+# Knowledge Graph Optimization Guide
+
+> Part of [entity-optimizer](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/SKILL.md). See also: [entity-signal-checklist.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/entity-optimizer/references/entity-signal-checklist.md)
+
+Comprehensive playbook for establishing and maintaining entity presence across Google Knowledge Graph, Wikidata, Wikipedia, and other knowledge bases.
+
+## How Knowledge Graphs Work
+
+### The Entity Web
+
+Knowledge graphs are interconnected databases of entities and their relationships. Search engines and AI systems use them as ground truth for entity understanding.
+
+```
+Your Entity
+├── is described by → Wikidata entry
+├── is described by → Wikipedia article
+├── is described by → Schema.org markup on your site
+├── is linked to → Social profiles (LinkedIn, X, etc.)
+├── is mentioned by → News articles, industry sites
+├── is associated with → Topics, industries, other entities
+└── is recognized by → Google Knowledge Graph, Bing Satori, AI training data
+```
+
+### Which Knowledge Graphs Matter
+
+| Knowledge Graph | Who Uses It | Impact |
+|----------------|-------------|--------|
+| **Google Knowledge Graph** | Google Search, Google AI | Powers Knowledge Panels, rich results, entity understanding in search |
+| **Wikidata** | Google, Bing, Apple, Amazon, AI systems | Open data feeds multiple knowledge graphs; primary structured data source |
+| **Wikipedia** | Google, all AI systems | Training data for every major LLM; Knowledge Panel descriptions often sourced here |
+| **Bing Satori** | Bing, Copilot | Powers Bing's entity understanding and Microsoft Copilot |
+| **Schema.org (your site)** | All search engines, AI crawlers | First-party structured data you control directly |
+| **DBpedia** | Research, some AI systems | Auto-extracted from Wikipedia; relevant for academic/research entities |
+
+### Data Flow
+
+```
+Your Website (Schema.org) ─┐
+Wikidata ──────────────────┤
+Wikipedia ─────────────────┼──→ Google Knowledge Graph ──→ Knowledge Panel
+Industry Directories ──────┤                              AI Search Results
+News/Media Mentions ───────┤                              Rich Results
+Social Profiles ───────────┘
+```
+
+Understanding this flow is key: you influence the Knowledge Graph by controlling the **source signals** that feed it.
+
+## Google Knowledge Graph
+
+### Getting Into the Knowledge Graph
+
+There is no "submit to Knowledge Graph" form. Google builds its Knowledge Graph from multiple sources. To get included:
+
+1. **Have a Wikidata entry** — This is the most direct path
+2. **Earn a Wikipedia article** — Strongest single signal
+3. **Implement Schema.org markup** — Provides structured self-description
+4. **Get mentioned on authoritative sites** — Third-party validation
+5. **Build branded search demand** — Signals that users look for your entity
+
+### Checking Your Knowledge Graph Status
+
+**Method 1: Google Search**
+Search for your entity name in quotes. If a Knowledge Panel appears on the right, you're in the Knowledge Graph.
+
+**Method 2: Knowledge Graph API**
+```
+GET https://kgsearch.googleapis.com/v1/entities:search?query=[entity]&key=[API_KEY]
+```
+
+Response includes:
+- `@id`: Your Knowledge Graph ID (e.g., `kg:/m/0wrt4g`)
+- `name`: Entity name as Google understands it
+- `description`: Short entity description
+- `detailedDescription`: Longer description (usually from Wikipedia)
+- `resultScore`: Confidence score (higher = more established entity)
+
+**Method 3: ~~knowledge graph**
+If connected, query directly for entity status and attributes.
+
+### Claiming Your Knowledge Panel
+
+1. Search for your entity on Google
+2. If Knowledge Panel appears, look for "Claim this knowledge panel" link at bottom
+3. Verify via official website, Search Console, YouTube, or other Google property
+4. Once claimed, you can suggest edits (but Google has final say)
+
+### Common Knowledge Panel Fixes
+
+| Problem | Solution |
+|---------|----------|
+| **No Knowledge Panel** | Build Wikidata entry + Schema.org + authoritative mentions. Timeline: 2-6 months. |
+| **Wrong image** | Update preferred image on: Wikidata (P18), About page, social profiles. Claim panel and suggest preferred image. |
+| **Wrong description** | Edit Wikidata description. Update first paragraph of About page and Wikipedia article. |
+| **Missing attributes** | Add properties to Wikidata and Schema.org. Claim panel and suggest additions. |
+| **Outdated information** | Update Wikidata, About page, Wikipedia, and social profiles. Request refresh via claimed panel. |
+| **Wrong entity shown** | Disambiguation needed. See Wikidata section below for disambiguation strategy. |
+
+## Wikidata
+
+### Why Wikidata Is Critical
+
+Wikidata is the **single most influential editable knowledge base** for entity optimization:
+- Google uses it as a primary source for Knowledge Panels
+- Bing uses it for Satori knowledge graph
+- AI systems reference it during entity resolution
+- It's open and you can edit it (within their guidelines)
+
+### Creating a Wikidata Entry
+
+#### Step 1: Check Eligibility
+
+Wikidata requires "notability" — the entity must be referenced in at least one external source. Unlike Wikipedia, the notability bar is lower: a company mentioned in a news article, a product with reviews, or a person with published work typically qualifies.
+
+#### Step 2: Create the Item
+
+1. Go to https://www.wikidata.org/wiki/Special:NewItem
+2. Fill in:
+   - **Label**: Official entity name
+   - **Description**: Short description (e.g., "American software company" or "SEO optimization tool")
+   - **Aliases**: Alternative names, abbreviations, former names
+
+#### Step 3: Add Core Statements
+
+Essential properties for each entity type:
+
+**Organizations:**
+| Property | Code | Example |
+|----------|------|---------|
+| instance of | P31 | business (Q4830453) or specific type |
+| official website | P856 | https://example.com |
+| inception | P571 | 2020-01-15 |
+| country | P17 | United States (Q30) |
+| headquarters location | P159 | San Francisco (Q62) |
+| industry | P452 | software industry (Q638608) |
+| founded by | P112 | [founder's Wikidata item] |
+| CEO | P169 | [CEO's Wikidata item] |
+
+**Persons:**
+| Property | Code | Example |
+|----------|------|---------|
+| instance of | P31 | human (Q5) |
+| occupation | P106 | software engineer (Q183888) |
+| employer | P108 | [company Wikidata item] |
+| educated at | P69 | [university Wikidata item] |
+| country of citizenship | P27 | [country item] |
+| official website | P856 | https://example.com |
+
+**Products/Software:**
+| Property | Code | Example |
+|----------|------|---------|
+| instance of | P31 | software (Q7397) or web application (Q189210) |
+| developer | P178 | [company Wikidata item] |
+| official website | P856 | https://example.com |
+| programming language | P277 | Python (Q28865) |
+| operating system | P306 | Linux (Q388) |
+| software license | P275 | Apache-2.0 (Q13785927) |
+| inception | P571 | 2023-06-01 |
+
+#### Step 4: Add External Identifiers
+
+These link your Wikidata item to other knowledge bases:
+
+| Identifier | Code | Purpose |
+|-----------|------|---------|
+| official website | P856 | Primary web presence |
+| X (Twitter) username | P2002 | Social presence |
+| LinkedIn organization ID | P4264 | Professional presence |
+| GitHub username | P2037 | Technical presence |
+| CrunchBase ID | P2087 | Business data |
+| Google Knowledge Graph ID | P2671 | Google entity link |
+| App Store ID | P3861 | Mobile presence |
+
+#### Step 5: Add References
+
+**Every statement must have a reference.** Unreferenced statements may be removed.
+
+Good reference sources:
+- Official website (for factual claims like founding date)
+- News articles (for events, milestones)
+- Industry reports (for market position)
+- Government registries (for legal entity information)
+
+### Wikidata Maintenance
+
+| Task | Frequency | Why |
+|------|-----------|-----|
+| Review existing statements | Quarterly | Ensure accuracy; update changed information |
+| Add new properties | When new information available | Keep entry comprehensive |
+| Check for vandalism | Monthly | Others can edit your entry |
+| Add new references | When new coverage appears | Strengthen statement credibility |
+| Update identifiers | When new profiles created | Keep links current |
+
+## Wikipedia
+
+### Notability Requirements
+
+Wikipedia requires entities to meet "general notability guidelines" (GNG):
+- **Significant coverage** in **reliable, independent sources**
+- Coverage must be **non-trivial** (not just a mention or directory listing)
+- Sources must be **independent** of the entity (not press releases, not entity's own content)
+
+### Building Toward Notability
+
+If the entity doesn't have a Wikipedia article yet:
+
+1. **Audit existing coverage**: Search Google News, academic databases, and industry publications for mentions
+2. **Identify gaps**: What kinds of coverage are missing?
+3. **Build coverage first, then article**: The article is the last step, not the first
+
+Coverage-building strategies:
+| Strategy | Timeline | Notability Impact |
+|----------|----------|-------------------|
+| Industry report mentions | 3-6 months | Medium — depends on report authority |
+| News article coverage | 1-3 months | High — especially from recognized publications |
+| Conference speaking + coverage | 3-12 months | Medium — needs post-event coverage |
+| Academic paper citations | 6-12+ months | High — very strong for GNG |
+| Award recognition | Variable | Medium — depends on award authority |
+| Book publication or feature | 6-12+ months | High — strong independent source |
+
+### Wikipedia Article Best Practices
+
+**DO:**
+- Write in neutral, encyclopedic tone
+- Use only independent, reliable sources as references
+- Follow Wikipedia's Manual of Style
+- Disclose any conflict of interest on your Talk page
+- Let the community review and improve the article
+
+**DO NOT:**
+- Write promotional content
+- Use the entity's own website as a primary source
+- Create the article from a company account without disclosure
+- Remove criticism or negative but sourced information
+- Pay someone to write the article without disclosure (violates Wikipedia policy)
+
+### Wikipedia's Impact on AI
+
+Wikipedia is disproportionately important for AI systems because:
+- It's in the training data of every major LLM
+- AI systems treat it as a high-trust source
+- Wikipedia's structured format makes it easy for AI to extract and cite
+- The first paragraph of a Wikipedia article often becomes the AI's entity definition
+
+This makes Wikipedia presence one of the highest-impact entity optimization actions for GEO.
+
+## Schema.org Entity Markup
+
+### Minimum Viable Entity Schema
+
+Every entity should have at minimum this markup on the homepage:
+
+**Organization:**
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Organization",
+  "@id": "https://example.com/#organization",
+  "name": "Example Corp",
+  "url": "https://example.com",
+  "logo": "https://example.com/logo.png",
+  "description": "Example Corp is a [what it is] that [what it does].",
+  "foundingDate": "2020-01-15",
+  "founder": {
+    "@type": "Person",
+    "name": "Jane Smith",
+    "@id": "https://example.com/about/jane-smith#person"
+  },
+  "sameAs": [
+    "https://www.wikidata.org/wiki/Q12345678",
+    "https://en.wikipedia.org/wiki/Example_Corp",
+    "https://www.linkedin.com/company/example-corp",
+    "https://x.com/examplecorp",
+    "https://www.crunchbase.com/organization/example-corp"
+  ]
+}
+```
+
+**Person:**
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Person",
+  "@id": "https://example.com/about/jane-smith#person",
+  "name": "Jane Smith",
+  "url": "https://example.com/about/jane-smith",
+  "image": "https://example.com/photos/jane-smith.jpg",
+  "jobTitle": "CEO",
+  "worksFor": {
+    "@type": "Organization",
+    "@id": "https://example.com/#organization"
+  },
+  "description": "Jane Smith is [who they are] specializing in [expertise areas].",
+  "sameAs": [
+    "https://www.wikidata.org/wiki/Q87654321",
+    "https://www.linkedin.com/in/janesmith",
+    "https://x.com/janesmith"
+  ]
+}
+```
+
+### sameAs Best Practices
+
+The `sameAs` property is the **primary entity disambiguation signal** in Schema.org. It tells search engines "this is the same entity as the one on these other platforms."
+
+**Must include (when available):**
+1. Wikidata URL (most important for Knowledge Graph)
+2. Wikipedia URL
+3. LinkedIn URL
+4. Official social media profiles
+
+**Include when relevant:**
+5. CrunchBase URL
+6. GitHub URL
+7. IMDb URL (for people in entertainment)
+8. Industry directory URLs
+
+**Common mistakes:**
+- Linking to generic pages instead of entity-specific URLs
+- Inconsistent: Schema says "Example Corp" but LinkedIn says "Example Corporation"
+- Missing Wikidata link (this is the single most impactful sameAs)
+- Including dead or redirecting URLs
+
+### Cross-Page Entity Consistency
+
+Every page on the site should reference the same entity with the same `@id`:
+
+```json
+{
+  "@type": "WebPage",
+  "publisher": {
+    "@type": "Organization",
+    "@id": "https://example.com/#organization"
+  }
+}
+```
+
+For articles:
+```json
+{
+  "@type": "Article",
+  "author": {
+    "@type": "Person",
+    "@id": "https://example.com/about/jane-smith#person"
+  },
+  "publisher": {
+    "@type": "Organization",
+    "@id": "https://example.com/#organization"
+  }
+}
+```
+
+This creates a consistent entity graph that search engines can confidently map to Knowledge Graph entries.
+
+## Monitoring Entity Health
+
+### Quarterly Entity Health Check
+
+| Check | How | What to Look For |
+|-------|-----|-----------------|
+| Knowledge Panel accuracy | Google entity name | Correct info, image, attributes |
+| Wikidata entry | Visit Wikidata page | No vandalism, info still current |
+| AI entity resolution | Query 3+ AI systems | Accurate recognition and description |
+| Schema.org validation | Google Rich Results Test | No errors, complete entity data |
+| Branded search SERP | Google "[entity name]" | Clean SERP, no disambiguation issues |
+| Social profile consistency | Visit all profiles | Same name, description, links |
+
+### Entity Health Metrics to Track
+
+| Metric | Tool | Target |
+|--------|------|--------|
+| Knowledge Panel presence | Google Search | Present and accurate |
+| Branded search CTR | ~~search console | > 50% for exact brand name |
+| AI recognition rate | Manual testing | Recognized by 3/3 major AI systems |
+| Wikidata completeness | Wikidata | 15+ properties with references |
+| Schema.org error count | Google Search Console | 0 errors |
+| Brand mention volume | ~~brand monitor | Stable or growing trend |
+
+### Recovery Playbooks
+
+**Entity disappeared from Knowledge Graph:**
+1. Check if Wikidata entry was deleted or merged
+2. Verify Schema.org markup hasn't changed
+3. Look for major algorithm updates that might have affected entity recognition
+4. Rebuild signals: start with Wikidata, then Schema.org, then external mentions
+5. Timeline: 2-8 weeks for recovery
+
+**AI systems giving incorrect entity info:**
+1. Identify which sources have incorrect information
+2. Correct information at source (Wikidata, Wikipedia, About page)
+3. AI systems will update over time (training data refresh + live search)
+4. For urgent issues, some AI systems have feedback mechanisms
+5. Timeline: weeks to months depending on AI system update cycles
+
+**Knowledge Panel showing wrong entity:**
+1. Claim the Knowledge Panel (if you haven't already)
+2. Strengthen disambiguation signals (see SKILL.md Disambiguation Strategy)
+3. Add qualifier to entity name if needed
+4. Build more unique entity signals (original content, specific topic associations)
+5. Timeline: 1-3 months
diff --git a/.cursor/skills/entity-optimizer/references/knowledge-panel-wikidata-guide.md b/.cursor/skills/entity-optimizer/references/knowledge-panel-wikidata-guide.md
new file mode 100644
index 0000000000..b718f4ef88
--- /dev/null
+++ b/.cursor/skills/entity-optimizer/references/knowledge-panel-wikidata-guide.md
@@ -0,0 +1,81 @@
+# Knowledge Panel & Wikidata Optimization Guide
+
+Detailed instructions for Knowledge Panel optimization, Wikidata entry management, and AI entity resolution.
+
+## Knowledge Panel Optimization
+
+### Claiming and Editing
+
+1. **Google Knowledge Panel**: Claim via Google's verification process (search for entity -> click "Claim this knowledge panel")
+2. **Bing Knowledge Panel**: Driven by Wikidata and LinkedIn -- update those sources
+3. **AI Knowledge**: Driven by training data -- ensure authoritative sources describe entity correctly
+
+### Common Knowledge Panel Issues
+
+| Issue | Root Cause | Fix |
+|-------|-----------|-----|
+| No panel appears | Entity not in Knowledge Graph | Build Wikidata entry + structured data + authoritative mentions |
+| Wrong image | Image sourced from incorrect page | Update Wikidata image; ensure preferred image on About page and social profiles |
+| Wrong description | Description pulled from wrong source | Edit Wikidata description; ensure About page has clear entity description in first paragraph |
+| Missing attributes | Incomplete structured data | Add properties to Schema.org markup and Wikidata entry |
+| Wrong entity shown | Disambiguation failure | Strengthen unique signals; add qualifiers; resolve Wikidata disambiguation |
+| Outdated info | Source data not updated | Update Wikidata, About page, and all profile pages |
+
+## Wikidata Best Practices
+
+### Creating a Wikidata Entry
+
+1. **Check notability**: Entity must have at least one authoritative reference
+2. **Create item**: Add label, description, and aliases in relevant languages
+3. **Add statements**: instance of, official website, social media links, founding date, founders, industry
+4. **Add identifiers**: official website (P856), social media IDs, CrunchBase ID, ISNI, VIAF
+5. **Add references**: Every statement should have a reference to an authoritative source
+
+**Important**: Wikipedia's Conflict of Interest (COI) policy prohibits individuals and organizations from creating or editing articles about themselves. Instead of directly editing Wikipedia: (1) Focus on building notability through independent reliable sources (press coverage, industry publications, academic citations); (2) If you believe a Wikipedia article is warranted, consider engaging an independent Wikipedia editor through the Requested Articles process; (3) Ensure all claims about the entity are verifiable through third-party sources before any Wikipedia involvement.
+
+### Key Wikidata Properties by Entity Type
+
+| Property | Code | Person | Org | Brand | Product |
+|----------|------|:------:|:---:|:-----:|:-------:|
+| instance of | P31 | human | organization type | brand | product type |
+| official website | P856 | yes | yes | yes | yes |
+| occupation / industry | P106/P452 | yes | yes | -- | -- |
+| founded by | P112 | -- | yes | yes | -- |
+| inception | P571 | -- | yes | yes | yes |
+| country | P17 | yes | yes | -- | -- |
+| social media | various | yes | yes | yes | yes |
+| employer | P108 | yes | -- | -- | -- |
+| developer | P178 | -- | -- | -- | yes |
+
+## AI Entity Optimization
+
+### How AI Systems Resolve Entities
+
+```
+User query -> Entity extraction -> Entity resolution -> Knowledge retrieval -> Answer generation
+```
+
+AI systems follow this pipeline:
+1. **Extract** entity mentions from the query
+2. **Resolve** each mention to a known entity (or fail -> "I'm not sure")
+3. **Retrieve** associated knowledge about the entity
+4. **Generate** response citing sources that confirmed the entity's attributes
+
+### Signals AI Systems Use for Entity Resolution
+
+| Signal Type | What AI Checks | How to Optimize |
+|-------------|---------------|-----------------|
+| **Training data presence** | Was entity in pre-training corpus? | Get mentioned in high-quality, widely-crawled sources |
+| **Retrieval augmentation** | Does entity appear in live search results? | Strong SEO presence for branded queries |
+| **Structured data** | Can entity be matched to Knowledge Graph? | Complete Wikidata + Schema.org |
+| **Contextual co-occurrence** | What topics/entities appear alongside? | Build consistent topic associations across content |
+| **Source authority** | Are sources about entity trustworthy? | Get mentioned by authoritative, well-known sources |
+| **Recency** | Is information current? | Keep all entity profiles and content updated |
+
+### Entity-Specific GEO Tactics
+
+1. **Define clearly**: First paragraph of About page and key pages should define the entity in a way AI can quote directly
+2. **Be consistent**: Use identical entity description across all platforms
+3. **Build associations**: Create content that explicitly connects entity to target topics
+4. **Earn mentions**: Third-party authoritative mentions are stronger entity signals than self-description
+5. **Stay current**: Outdated entity information causes AI to lose confidence and stop citing
diff --git a/.cursor/skills/geo-content-optimizer/SKILL.md b/.cursor/skills/geo-content-optimizer/SKILL.md
new file mode 100644
index 0000000000..5ee0a58407
--- /dev/null
+++ b/.cursor/skills/geo-content-optimizer/SKILL.md
@@ -0,0 +1,420 @@
+---
+name: geo-content-optimizer
+description: 'Optimize content for AI citations in ChatGPT, Perplexity, AI Overviews, Gemini, Claude. AI引用优化/GEO优化/AI搜索'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when optimizing content for AI engines like ChatGPT, Perplexity, AI Overviews, Gemini, Claude, or Copilot. Also for AI citation optimization and generative engine visibility."
+argument-hint: "<content URL or text> [target AI engine]"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "high"
+  tags:
+    - geo
+    - ai-citations
+    - chatgpt-optimization
+    - perplexity-optimization
+    - google-ai-overview
+    - gemini
+    - generative-engine-optimization
+    - llm-citations
+    - ai-seo
+    - quotable-content
+    - AI引用优化
+    - GEO优化
+    - AI最適化
+    - AI최적화
+    - optimizacion-ia
+  triggers:
+    # EN-formal
+    - "optimize for AI"
+    - "get cited by ChatGPT"
+    - "AI optimization"
+    - "appear in AI answers"
+    - "GEO optimization"
+    - "generative engine optimization"
+    - "AI-friendly content"
+    - "LLM citations"
+    # EN-casual
+    - "get cited by AI"
+    - "show up in ChatGPT answers"
+    - "AI doesn't mention my brand"
+    - "make content AI-quotable"
+    - "how do I get AI to mention me"
+    - "ChatGPT doesn't know my brand"
+    # EN-question
+    - "how to appear in AI answers"
+    - "how to get cited by AI"
+    - "why doesn't AI mention my brand"
+    # EN-competitor
+    - "Perplexity optimization"
+    - "how to rank in ChatGPT answers"
+    - "AI Overview optimization tool"
+    # ZH-pro
+    - "AI引用优化"
+    - "GEO优化"
+    - "生成式引擎优化"
+    - "AI搜索优化"
+    - "AI可引用内容"
+    # ZH-casual
+    - "让AI引用我"
+    - "ChatGPT搜不到我"
+    - "AI不提我的品牌"
+    - "怎么让AI推荐我"
+    - "AI搜不到"
+    # JA
+    - "AIコンテンツ最適化"
+    - "AI引用されるコンテンツ"
+    - "ChatGPT対策"
+    - "AI検索最適化"
+    # KO
+    - "AI 최적화"
+    - "AI 검색 최적화"
+    - "ChatGPT 인용"
+    # ES
+    - "optimización para IA"
+    - "aparecer en respuestas de IA"
+    - "citación de IA"
+    # PT
+    - "otimização para IA"
+    - "aparecer nas respostas da IA"
+    # Misspellings
+    - "generative engine optimisation"
+    - "GEO optimisation"
+---
+
+# GEO Content Optimizer
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This build skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill optimizes content to appear in AI-generated responses. As AI systems increasingly answer user queries directly, getting cited by these systems directly impacts visibility.
+
+**System role**: Build layer skill. It turns briefs and signals into assets that other skills can review, publish, and monitor.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a shippable asset or transformation that should feed directly into quality review, deployment, or monitoring.
+
+- Optimizing existing content for AI citations
+- Creating new content designed for both SEO and GEO
+- Improving chances of appearing in AI Overviews
+- Making content more quotable by AI systems
+- Adding authority signals that AI systems trust
+- Structuring content for AI comprehension
+- Competing for visibility in the AI-first search era
+
+## What This Skill Does
+
+1. **Citation Optimization**: Makes content more likely to be quoted by AI
+2. **Structure Enhancement**: Formats content for AI comprehension
+3. **Authority Building**: Adds signals that AI systems trust
+4. **Factual Enhancement**: Improves accuracy and verifiability
+5. **Quote Creation**: Creates memorable, citeable statements
+6. **Source Attribution**: Adds proper citations that AI can verify
+7. **GEO Scoring**: Evaluates content's AI-friendliness
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Optimize Existing Content
+
+```
+Optimize this content for GEO/AI citations: [content or URL]
+```
+
+```
+Make this article more likely to be cited by AI systems
+```
+
+### Create GEO-Optimized Content
+
+```
+Write content about [topic] optimized for both SEO and GEO
+```
+
+### GEO Audit
+
+```
+Audit this content for GEO readiness and suggest improvements
+```
+
+## Skill Contract
+
+**Expected output**: a ready-to-use asset or implementation-ready transformation plus a short handoff summary ready for `memory/content/`.
+
+- **Reads**: the brief, target keywords, entity inputs, quality constraints, and prior decisions from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing content, metadata, or schema deliverable plus a reusable summary that can be stored under `memory/content/`.
+- **Promotes**: approved angles, messaging choices, missing evidence, and publish blockers to `CLAUDE.md`, `memory/decisions.md`, and `memory/open-loops.md`.
+- **Next handoff**: use the `Next Best Skill` below when the asset is ready for review or deployment.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+**With ~~AI monitor + ~~SEO tool connected:**
+Automatically pull AI citation patterns (which content is being cited by ChatGPT, Claude, Perplexity), current AI visibility scores, competitor citation frequency, and AI Overview appearance tracking.
+
+**With manual data only:**
+Ask the user to provide:
+1. Target queries where they want AI citations
+2. Current content URL or full content text
+3. Any known instances where competitors are being cited by AI
+
+Proceed with the full workflow using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests GEO optimization:
+
+1. **Load CORE-EEAT GEO-First Optimization Targets**
+
+   Before optimizing, load GEO-critical items from the [CORE-EEAT Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md):
+
+   ```markdown
+   ### CORE-EEAT GEO-First Targets
+
+   These items have the highest impact on AI engine citation. Use as optimization checklist:
+
+   **Top 6 Priority Items**:
+   | Rank | ID | Standard | Why It Matters |
+   |------|----|----------|---------------|
+   | 1 | C02 | Direct Answer in first 150 words | All engines extract from first paragraph |
+   | 2 | C09 | Structured FAQ with Schema | Directly matches AI follow-up queries |
+   | 3 | O03 | Data in tables, not prose | Most extractable structured format |
+   | 4 | O05 | JSON-LD Schema Markup | Helps AI understand content type |
+   | 5 | E01 | Original first-party data | AI prefers exclusive, verifiable sources |
+   | 6 | O02 | Key Takeaways / Summary Box | First choice for AI summary citations |
+
+   **All GEO-First Items** (optimize for all when possible):
+   C02, C04, C05, C07, C08, C09 | O02, O03, O04, O05, O06, O09
+   R01, R02, R03, R04, R05, R07, R09 | E01, E02, E03, E04, E06, E08, E09, E10
+   Exp10 | Ept05, Ept08 | A08
+
+   **AI Engine Preferences**:
+   | Engine | Priority Items |
+   |--------|----------------|
+   | Google AI Overview | C02, O03, O05, C09 |
+   | ChatGPT Browse | C02, R01, R02, E01 |
+   | Perplexity AI | E01, R03, R05, Ept05 |
+   | Claude | R04, Ept08, Exp10, R03 |
+
+   _Full benchmark: [references/core-eeat-benchmark.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md)_
+   ```
+
+2. **Analyze Current Content**
+
+   ```markdown
+   ## GEO Analysis: [Content Title]
+   
+   ### Current State Assessment
+   
+   | GEO Factor | Current Score (1-10) | Notes |
+   |------------|---------------------|-------|
+   | Clear definitions | [X] | [notes] |
+   | Quotable statements | [X] | [notes] |
+   | Factual density | [X] | [notes] |
+   | Source citations | [X] | [notes] |
+   | Q&A format | [X] | [notes] |
+   | Authority signals | [X] | [notes] |
+   | Content freshness | [X] | [notes] |
+   | Structure clarity | [X] | [notes] |
+   | **GEO Readiness** | **[avg]/10** | **Average across factors** |
+   
+   **Primary Weaknesses**:
+   1. [Weakness 1]
+   2. [Weakness 2]
+   3. [Weakness 3]
+   
+   **Quick Wins**:
+   1. [Quick improvement 1]
+   2. [Quick improvement 2]
+   ```
+
+3. **Apply GEO Optimization Techniques**
+
+   > **GEO fundamentals**: AI systems prioritize content that is authoritative (expert credentials, proper citations), accurate (verifiable, up-to-date), clear (well-structured, unambiguous), and quotable (standalone answers, specific data). See [references/geo-optimization-techniques.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/geo-content-optimizer/references/geo-optimization-techniques.md) for details.
+
+   Apply the six core optimization techniques: definition optimization, quotable statement creation, authority signal enhancement, structure optimization, factual density improvement, and FAQ schema implementation.
+
+   > **Reference**: See [references/geo-optimization-techniques.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/geo-content-optimizer/references/geo-optimization-techniques.md) for detailed before/after examples, templates, and checklists for each technique.
+
+   Key principles:
+   - **Definitions**: 25-50 words, standalone, starting with the term
+   - **Quotable statements**: Specific statistics with sources, verifiable facts
+   - **Authority signals**: Expert quotes with credentials, proper source citations
+   - **Structure**: Q&A format, comparison tables, numbered lists
+   - **Factual density**: Replace vague claims with specific data points
+   - **FAQ schema**: JSON-LD FAQPage markup matching visible content
+
+4. **Generate GEO-Optimized Output**
+
+   ```markdown
+   ## GEO Optimization Report
+
+   ### Changes Made
+
+   **Definitions Added/Improved**:
+   1. [Definition 1] - [location in content]
+   2. [Definition 2] - [location in content]
+
+   **Quotable Statements Created**:
+   1. "[Statement 1]"
+   2. "[Statement 2]"
+
+   **Authority Signals Added**:
+   1. [Expert quote/citation]
+   2. [Source attribution]
+
+   **Structural Improvements**:
+   1. [Change 1]
+   2. [Change 2]
+
+   ### Before/After GEO Score
+
+   | GEO Factor | Before (1-10) | After (1-10) | Change |
+   |------------|---------------|--------------|--------|
+   | Clear definitions | [X] | [X] | +[X] |
+   | Quotable statements | [X] | [X] | +[X] |
+   | Factual density | [X] | [X] | +[X] |
+   | Source citations | [X] | [X] | +[X] |
+   | Q&A format | [X] | [X] | +[X] |
+   | Authority signals | [X] | [X] | +[X] |
+   | **Overall GEO Score** | **[avg]/10** | **[avg]/10** | **+[X]** |
+
+   ### AI Query Coverage
+
+   This content is now optimized to answer:
+   - "What is [topic]?" ✅
+   - "How does [topic] work?" ✅
+   - "Why is [topic] important?" ✅
+   - "[Topic] vs [alternative]" ✅
+   - "Best [topic] for [use case]" ✅
+   ```
+
+5. **CORE-EEAT GEO Self-Check**
+
+    After optimization, verify GEO-First items:
+
+    ```markdown
+    ### CORE-EEAT GEO Post-Optimization Check
+
+    | ID | Standard | Status | Notes |
+    |----|----------|--------|-------|
+    | C02 | Direct Answer in first 150 words | ✅/⚠️/❌ | [notes] |
+    | C04 | Key terms defined on first use | ✅/⚠️/❌ | [notes] |
+    | C09 | Structured FAQ with Schema | ✅/⚠️/❌ | [notes] |
+    | O02 | Summary Box / Key Takeaways | ✅/⚠️/❌ | [notes] |
+    | O03 | Comparisons in tables | ✅/⚠️/❌ | [notes] |
+    | O05 | JSON-LD Schema Markup | ✅/⚠️/❌ | [notes] |
+    | O06 | Section chunking (3–5 sentences) | ✅/⚠️/❌ | [notes] |
+    | R01 | ≥5 precise data points with units | ✅/⚠️/❌ | [notes] |
+    | R02 | ≥1 citation per 500 words | ✅/⚠️/❌ | [notes] |
+    | R04 | Claims backed by evidence | ✅/⚠️/❌ | [notes] |
+    | R07 | Full entity names | ✅/⚠️/❌ | [notes] |
+    | E01 | Original first-party data | ✅/⚠️/❌ | [notes] |
+    | Exp10 | Limitations acknowledged | ✅/⚠️/❌ | [notes] |
+    | Ept08 | Reasoning transparency | ✅/⚠️/❌ | [notes] |
+
+    **Items Needing Attention**: [list any ⚠️/❌ items]
+
+    _For full 80-item audit, use [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md)_
+    ```
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Content source identified (URL, full text, or content draft)
+- [ ] Target AI queries or topics clearly defined
+- [ ] Current GEO baseline assessed (if optimizing existing content)
+
+### Output Validation
+- [ ] At least 3 clear, quotable definitions added
+- [ ] Factual density improved with at least 5 verifiable statistics
+- [ ] All claims have source citations from authoritative sources
+- [ ] Q&A format sections cover top 5 user queries
+- [ ] GEO score improvement of at least 50% from baseline
+- [ ] Source of each data point clearly stated (~~AI monitor data, user-provided, or estimated)
+
+## Example
+
+**User**: "Optimize this paragraph for GEO: 'Email marketing is a good way to reach customers. It's been around for a while and many businesses use it. You can send newsletters, promotions, and other messages.'"
+
+**Output**:
+
+```markdown
+## GEO-Optimized Version
+
+**Email marketing** is a digital marketing strategy that uses email to promote 
+products, build customer relationships, and drive conversions. According to 
+the Data & Marketing Association, email marketing delivers an average ROI 
+of $42 for every $1 invested, making it the highest-performing digital 
+marketing channel.
+
+### Key email marketing formats include:
+
+1. **Newsletters**: Regular updates sharing company news, industry insights, 
+   and valuable content to maintain subscriber engagement
+2. **Promotional emails**: Time-sensitive offers, discounts, and product 
+   announcements designed to drive immediate sales
+3. **Transactional emails**: Order confirmations, shipping updates, and 
+   account notifications with 8x higher open rates than marketing emails
+4. **Automated sequences**: Pre-scheduled email series triggered by user 
+   actions, such as welcome series or abandoned cart reminders
+
+> **Key statistic**: 81% of small businesses rely on email as their primary 
+> customer acquisition channel (Emarsys, 2024).
+
+---
+
+### Changes Made:
+
+Changes: added clear definition, 2 stats with sources, structured list, DMA/Emarsys citations, 3 quotable facts. **GEO Score**: 1/10 → 8/10.
+```
+
+## GEO Optimization Checklist
+
+> **Reference**: See the GEO Readiness Checklist in [references/geo-optimization-techniques.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/geo-content-optimizer/references/geo-optimization-techniques.md) for the full checklist covering definitions, quotable content, authority, structure, and technical elements.
+
+## Tips for Success
+
+1. **Answer the question first** - Put the answer in the first sentence
+2. **Be specific** - Vague content doesn't get cited
+3. **Cite sources** - AI systems trust verifiable information
+4. **Stay current** - Update statistics and facts regularly
+5. **Match query format** - Questions deserve direct answers
+6. **Build authority** - Expert credentials increase citation likelihood
+
+
+### Save Results
+
+After delivering content or optimization output to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/content/YYYY-MM-DD-<topic>.md` containing:
+- One-line description of what was created
+- Target keyword and content type
+- Open loops or items needing review
+- Source data references
+
+**Gate check recommended**: Run content-quality-auditor before publishing (PostToolUse hook will remind automatically).
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [AI Citation Patterns](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/geo-content-optimizer/references/ai-citation-patterns.md) - How Google AI Overviews, ChatGPT, Perplexity, and Claude select and cite sources
+- [Quotable Content Examples](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/geo-content-optimizer/references/quotable-content-examples.md) - Before/after examples of content optimized for AI citation
+
+## Next Best Skill
+
+- **Primary**: [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md) — verify the optimized content is strong enough to ship and cite.
diff --git a/.cursor/skills/geo-content-optimizer/references/ai-citation-patterns.md b/.cursor/skills/geo-content-optimizer/references/ai-citation-patterns.md
new file mode 100644
index 0000000000..6430c20a9c
--- /dev/null
+++ b/.cursor/skills/geo-content-optimizer/references/ai-citation-patterns.md
@@ -0,0 +1,504 @@
+# AI Citation Patterns
+
+How different AI systems select and cite content. Understanding these patterns helps optimize content for AI visibility.
+
+## Google AI Overviews
+
+### Citation Behavior
+
+**Format preferences**:
+- Prefers structured, factual content
+- Cites multiple sources per overview
+- Shows source links as footnotes
+- Displays "Sources" section at bottom
+
+**What gets cited**:
+- Clear, direct answers to queries
+- Statistics with recent dates
+- Step-by-step instructions
+- Comparison tables
+- Definition blocks
+- List-formatted content
+
+**Content structure preferences**:
+- Short paragraphs (2-3 sentences)
+- Bullet points and numbered lists
+- Clear headings matching query intent
+- Tables for comparison data
+- FAQ formats
+
+**Authority signals**:
+- Domain authority (trusted sites favored)
+- E-E-A-T signals (expertise, authoritativeness, trustworthiness)
+- Recent publication/update dates
+- Author credentials visible
+- Citations to other authoritative sources
+
+**Citation frequency**: Typically cites 3-8 sources per AI Overview
+
+---
+
+## ChatGPT (with Browsing)
+
+### Citation Behavior
+
+**Format preferences**:
+- Inline citations with numbers [1], [2]
+- "Sources" list at end of response
+- Clickable source links
+- Sometimes quotes directly with quotation marks
+
+**What gets cited**:
+- Specific facts and statistics
+- Expert quotes
+- Technical explanations
+- Recent information (prioritizes freshness)
+- Authoritative domain content
+- Well-structured, scannable content
+
+**Source selection patterns**:
+- Favors .edu, .gov, .org domains
+- Prioritizes recognized brands/publishers
+- Values comprehensive content over thin pages
+- Prefers content with clear attribution
+- Looks for consensus across multiple sources
+
+**Quoting behavior**:
+- Pulls exact quotes when information is distinctive
+- Paraphrases general information
+- Combines information from multiple sources
+- Attributes specific claims to sources
+
+**Citation frequency**: 1-6 sources per response depending on complexity
+
+---
+
+## Perplexity AI
+
+### Citation Behavior
+
+**Format preferences**:
+- Superscript numbers [1] inline
+- Numbered source list with snippets
+- Shows brief excerpt from each source
+- Displays domain name and publish date
+
+**What gets cited**:
+- Recent content (strong freshness bias)
+- Authoritative sources
+- Content with clear, quotable statements
+- Statistical data with sources
+- Primary sources over secondary
+- Content matching query intent precisely
+
+**Content structure preferences**:
+- Extremely well-structured content
+- Clear topic sentences
+- Quotable, standalone statements
+- Factual density (stats, data, specifics)
+- Headings that match question formats
+
+**Authority signals**:
+- Domain credibility
+- Author expertise
+- Publication reputation
+- Recency of content
+- Depth of coverage
+
+**Citation frequency**: Typically 5-10 sources per response (more than others)
+
+**Unique behavior**: Often shows "Follow-up Questions" that can reveal additional citation opportunities
+
+---
+
+## Claude (Knowledge-Based Responses)
+
+### Citation Behavior
+
+**Note**: Claude typically relies on training data rather than live web access, but understanding preferences helps create citeable content.
+
+**Format preferences**:
+- When citing, uses clear attribution phrases
+- "According to [source]..."
+- "Research from [source] shows..."
+- May reference general knowledge without specific citations
+
+**What gets remembered/prioritized**:
+- Clear, authoritative definitions
+- Widely-accepted facts and statistics
+- Well-established methodologies
+- Consensus information
+- Content from recognized authorities
+
+**Content characteristics valued**:
+- Factual accuracy and precision
+- Logical structure and clarity
+- Comprehensive explanations
+- Technical accuracy
+- Unambiguous language
+
+---
+
+## Common Traits Across All AI Systems
+
+### Universal Citation Factors
+
+**Content quality**:
+- Factual accuracy (incorrect info won't be cited)
+- Clear, unambiguous language
+- Proper grammar and spelling
+- Comprehensive coverage
+- Up-to-date information
+
+**Structure**:
+- Scannable format (headings, lists, tables)
+- Logical organization
+- Clear topic segmentation
+- Short paragraphs
+- Visual hierarchy
+
+**Authority**:
+- Domain credibility
+- Author credentials
+- Source citations in content
+- Expertise signals
+- Editorial quality
+
+**Relevance**:
+- Precise match to query intent
+- Topic focus (not meandering)
+- Keyword-topic alignment
+- Depth of coverage on specific topic
+
+---
+
+## Optimal Content Structures for Citation
+
+### 1. Definition Blocks
+
+AI systems love clear, quotable definitions.
+
+**Structure**:
+```markdown
+**[Term]** is [clear category] that [primary function], [key characteristic].
+```
+
+**Example**:
+> **Search Engine Optimization (SEO)** is a digital marketing practice that improves website visibility in organic search results by optimizing content, technical elements, and authority signals.
+
+**Why it works**: Standalone, complete, unambiguous, proper scope.
+
+---
+
+### 2. Statistic Blocks
+
+Facts with sources are highly citeable.
+
+**Structure**:
+```markdown
+According to [Source], [specific statistic] as of [timeframe].
+```
+
+**Example**:
+> According to HubSpot's 2024 State of Marketing Report, 82% of marketers actively invest in content marketing, making it the most widely adopted digital marketing strategy.
+
+**Why it works**: Specific, attributed, recent, verifiable.
+
+---
+
+### 3. Q&A Pairs
+
+Question-answer formats match AI query patterns.
+
+**Structure**:
+```markdown
+### [Question matching common query]?
+
+[Direct answer in 40-60 words]
+
+[Optional supporting detail]
+```
+
+**Example**:
+> ### How long does SEO take to show results?
+>
+> SEO typically takes 3-6 months to show significant results for new websites, though this varies based on competition, domain authority, and strategy. Established sites may see improvements in 1-3 months for less competitive keywords.
+
+**Why it works**: Matches query format, provides concise answer, includes qualifiers.
+
+---
+
+### 4. Comparison Tables
+
+Structured comparisons are easy for AI to parse and cite.
+
+**Structure**:
+```markdown
+| Feature | Option A | Option B |
+|---------|----------|----------|
+| [Factor 1] | [Specific value] | [Specific value] |
+| [Factor 2] | [Specific value] | [Specific value] |
+| **Best for** | [Use case] | [Use case] |
+```
+
+**Example**:
+| Factor | Technical SEO | On-Page SEO |
+|--------|---------------|-------------|
+| Focus | Site infrastructure | Content optimization |
+| Timeframe | 1-3 months | Ongoing |
+| Complexity | High | Medium |
+| **Best for** | Site-wide issues | Individual page improvements |
+
+**Why it works**: Clear comparison, specific values, scannable format.
+
+---
+
+### 5. Step-by-Step Processes
+
+Numbered lists for "how to" queries.
+
+**Structure**:
+```markdown
+1. **[Action]** - [Brief explanation]
+2. **[Action]** - [Brief explanation]
+3. **[Action]** - [Brief explanation]
+```
+
+**Example**:
+> To conduct keyword research:
+> 1. **Identify seed keywords** - List 5-10 topics your audience searches for
+> 2. **Use keyword research tools** - Expand seed keywords into hundreds of variations
+> 3. **Analyze search intent** - Determine what content format each keyword requires
+> 4. **Evaluate competition** - Assess ranking difficulty for each keyword
+> 5. **Prioritize keywords** - Choose based on volume, difficulty, and relevance
+
+**Why it works**: Clear process, actionable steps, logical sequence.
+
+---
+
+### 6. List-Based Content
+
+Curated lists with brief explanations.
+
+**Structure**:
+```markdown
+**[Item name]**: [Clear description with key benefit]
+```
+
+**Example**:
+> Top on-page SEO factors:
+> - **Title tags**: Most important on-page element; include primary keyword within first 60 characters
+> - **Header tags**: Structure content hierarchically; use one H1, multiple H2s for main sections
+> - **Meta descriptions**: Don't directly impact rankings but affect CTR; keep under 160 characters
+> - **URL structure**: Use descriptive, keyword-rich URLs without unnecessary parameters
+
+**Why it works**: Scannable, specific, actionable.
+
+---
+
+### 7. Before/After Examples
+
+Concrete examples showing transformation.
+
+**Structure**:
+```markdown
+**Before**: [Weak example]
+**After**: [Strong example]
+**Why it's better**: [Explanation]
+```
+
+**Example**:
+> **Before**: "Email marketing is pretty effective."
+> **After**: "Email marketing delivers an average ROI of $42 for every $1 spent, according to the Data & Marketing Association."
+> **Why it's better**: Specific statistic, attributed source, quantifiable claim.
+
+**Why it works**: Shows concrete improvement, demonstrates principle.
+
+---
+
+### 8. Key Insight Callouts
+
+Highlighted important points.
+
+**Structure**:
+```markdown
+> **Key insight**: [Memorable, quotable statement]
+```
+
+**Example**:
+> **Key insight**: According to Google's John Mueller, internal linking is one of the most underutilized SEO tactics, with properly structured internal links often delivering faster ranking improvements than external link building.
+
+**Why it works**: Visually distinct, authoritative, quotable.
+
+---
+
+## Content Optimization by Query Type
+
+### Informational Queries ("What is...", "How does...", "Why...")
+
+**AI citation priorities**:
+1. Clear definitions
+2. Comprehensive explanations
+3. Expert perspectives
+4. Supporting statistics
+5. Real-world examples
+
+**Optimal structure**:
+- Definition in first paragraph
+- "Why it matters" section
+- How it works explanation
+- Common use cases
+- Expert quotes or citations
+
+---
+
+### Comparison Queries ("[A] vs [B]", "Best [category]")
+
+**AI citation priorities**:
+1. Comparison tables
+2. Clear pros/cons lists
+3. Use case recommendations
+4. Specific differentiators
+5. Verdict or recommendation
+
+**Optimal structure**:
+- Quick comparison table upfront
+- Individual descriptions
+- Feature-by-feature comparison
+- "Choose X if..." recommendations
+- Summary verdict
+
+---
+
+### How-To Queries ("How to...", "Steps to...")
+
+**AI citation priorities**:
+1. Numbered step-by-step processes
+2. Required tools/prerequisites
+3. Time estimates
+4. Success indicators
+5. Troubleshooting tips
+
+**Optimal structure**:
+- Prerequisites listed first
+- Clear numbered steps
+- Sub-steps where needed
+- Visual indicators of progress
+- Common problems and solutions
+
+---
+
+### Statistical Queries ("How much...", "How many...", "Statistics about...")
+
+**AI citation priorities**:
+1. Specific numbers with sources
+2. Recent data (within 1-2 years)
+3. Multiple data points
+4. Context for statistics
+5. Trend information
+
+**Optimal structure**:
+- Lead with key statistic
+- Source attribution immediately after
+- Context and interpretation
+- Related statistics
+- Takeaways from data
+
+---
+
+## Citation Likelihood Factors
+
+### High Citation Likelihood
+
+- [ ] Content from recognized authority domains
+- [ ] Published or updated within 12 months
+- [ ] Clear, standalone statements
+- [ ] Proper source attribution
+- [ ] Specific statistics with dates
+- [ ] Structured with headings/lists/tables
+- [ ] Comprehensive topic coverage
+- [ ] Author credentials visible
+- [ ] Technical accuracy verified
+- [ ] Consensus with other sources
+
+### Medium Citation Likelihood
+
+- [ ] Content from less-known but quality domains
+- [ ] Published 1-2 years ago
+- [ ] Clear but requires slight context
+- [ ] General industry claims
+- [ ] Good structure but less scannable
+- [ ] Moderate depth of coverage
+- [ ] No author listed but quality content
+- [ ] Some supporting evidence
+
+### Low Citation Likelihood
+
+- [ ] Content from unknown/low-authority domains
+- [ ] Published 3+ years ago without updates
+- [ ] Vague or ambiguous statements
+- [ ] No sources cited
+- [ ] Poor content structure (walls of text)
+- [ ] Thin or superficial coverage
+- [ ] Promotional or biased tone
+- [ ] Factual inconsistencies
+- [ ] No expertise signals
+
+---
+
+## AI System Comparison Summary
+
+| Factor | Google AI Overviews | ChatGPT | Perplexity | Claude |
+|--------|---------------------|---------|------------|--------|
+| **Freshness bias** | High | Medium | Very high | N/A (training data) |
+| **Authority weight** | Very high | High | High | High |
+| **Structure importance** | High | Medium | Very high | Medium |
+| **Citation count** | 3-8 | 1-6 | 5-10 | N/A |
+| **Quotable focus** | High | Medium | Very high | High |
+| **Domain trust** | Very high | High | Medium | High |
+| **Factual density** | High | High | Very high | Very high |
+
+---
+
+## Tracking AI Citations
+
+### Manual Monitoring
+
+**Check if your content appears in**:
+- Google AI Overviews for target keywords
+- ChatGPT responses (search your domain in ChatGPT)
+- Perplexity results for relevant queries
+- Other AI search engines
+
+**Test queries**:
+- Exact-match questions from your FAQ
+- Definitions of terms you've defined
+- Statistics you've cited with attribution
+- Processes you've documented
+
+### Indicators of AI Visibility
+
+- Increased direct traffic (AI users clicking sources)
+- Traffic spikes from unusual referrers
+- Engagement metrics: low bounce rate, high time-on-page
+- Return visitors (AI users coming back for more depth)
+
+---
+
+## Optimization Checklist for AI Citations
+
+Content ready for AI citation should have:
+
+- [ ] At least 3 clear, quotable definitions
+- [ ] 5+ specific statistics with sources and dates
+- [ ] Q&A format sections covering top queries
+- [ ] Comparison tables where relevant
+- [ ] Numbered lists for processes
+- [ ] Content published or updated within 12 months
+- [ ] Author credentials visible
+- [ ] External citations to authoritative sources
+- [ ] Structured with clear H2/H3 headings
+- [ ] Short paragraphs (2-4 sentences)
+- [ ] No promotional language
+- [ ] Technical accuracy verified
+- [ ] Mobile-friendly formatting
diff --git a/.cursor/skills/geo-content-optimizer/references/geo-optimization-techniques.md b/.cursor/skills/geo-content-optimizer/references/geo-optimization-techniques.md
new file mode 100644
index 0000000000..cfc343a748
--- /dev/null
+++ b/.cursor/skills/geo-content-optimizer/references/geo-optimization-techniques.md
@@ -0,0 +1,235 @@
+# GEO Optimization Techniques
+
+Detailed techniques for optimizing content for AI citation across Google AI Overviews, ChatGPT, Perplexity AI, Claude, and Gemini.
+
+## Definition Optimization
+
+AI systems love clear, quotable definitions.
+
+**Before** (Weak for GEO):
+> SEO is really important for businesses and involves various
+> techniques to improve visibility online through search engines.
+
+**After** (Strong for GEO):
+> **Search Engine Optimization (SEO)** is the practice of optimizing
+> websites and content to rank higher in search engine results pages
+> (SERPs), increasing organic traffic and visibility.
+
+**Definition Template**:
+"[Term] is [clear category/classification] that [primary function/purpose],
+[key characteristic or benefit]."
+
+**Checklist for GEO-Optimized Definitions**:
+- [ ] Starts with the term being defined
+- [ ] Provides clear category (what type of thing it is)
+- [ ] Explains primary function or purpose
+- [ ] Uses precise, unambiguous language
+- [ ] Can stand alone as a complete answer
+- [ ] Is 25-50 words for optimal citation length
+
+## Quotable Statement Optimization
+
+AI systems cite specific, standalone statements. Transform vague
+content into quotable facts.
+
+**Weak (Not quotable)**:
+> Email marketing is pretty effective and lots of companies use it.
+
+**Strong (Quotable)**:
+> Email marketing delivers an average ROI of $42 for every $1 spent,
+> making it one of the highest-performing digital marketing channels.
+
+**Types of Quotable Statements**:
+
+1. **Statistics**
+   - Include specific numbers
+   - Cite the source
+   - Add context (timeframe, comparison)
+
+   Example: "According to [Source], [specific statistic] as of [date]."
+
+2. **Facts**
+   - Verifiable information
+   - Unambiguous language
+   - Authoritative source
+
+   Example: "[Subject] was [fact], according to [authoritative source]."
+
+3. **Definitions** (covered above)
+
+4. **Comparisons**
+   - Clear comparison structure
+   - Specific differentiators
+
+   Example: "Unlike [A], [B] [specific difference], which means [implication]."
+
+5. **How-to Steps**
+   - Numbered, clear steps
+   - Action-oriented language
+
+   Example: "To [achieve goal], [step 1], then [step 2], and finally [step 3]."
+
+## Authority Signal Enhancement
+
+**Expert Attribution**
+
+Add expert quotes and credentials:
+
+> "AI will transform how we search for information," says Dr. Jane Smith,
+> AI Research Director at Stanford University.
+
+**Source Citations**
+
+Properly cite sources that AI can verify:
+
+Before:
+> Studies show that most people prefer video content.
+
+After:
+> According to Wyzowl's 2024 Video Marketing Statistics report,
+> 91% of consumers want to see more online video content from brands.
+
+**Authority Elements to Add**:
+- [ ] Author byline with credentials
+- [ ] Expert quotes with attribution
+- [ ] Citations to peer-reviewed research
+- [ ] References to recognized authorities
+- [ ] Original data or research
+- [ ] Case studies with named companies
+- [ ] Industry statistics with sources
+
+## Structure Optimization for GEO
+
+AI systems parse structured content more effectively.
+
+**Q&A Format**
+
+Transform content into question-answer pairs:
+
+```html
+<h2>What is [Topic]?</h2>
+<p>[Direct answer in 40-60 words]</p>
+
+<h2>How does [Topic] work?</h2>
+<p>[Clear explanation with steps if applicable]</p>
+
+<h2>Why is [Topic] important?</h2>
+<p>[Specific reasons with evidence]</p>
+```
+
+**Comparison Tables**
+
+For comparison queries, use clear tables:
+
+| Feature | Option A | Option B |
+|---------|----------|----------|
+| [Feature 1] | [Specific value] | [Specific value] |
+| [Feature 2] | [Specific value] | [Specific value] |
+| **Best for** | [Use case] | [Use case] |
+
+**Numbered Lists**
+
+For process or list queries:
+
+1. **Step 1: [Action]** - [Brief explanation]
+2. **Step 2: [Action]** - [Brief explanation]
+3. **Step 3: [Action]** - [Brief explanation]
+
+**Definition Boxes**
+
+Highlight key definitions:
+
+> **Key Definition**: [Term] refers to [clear definition].
+
+## Factual Density Improvement
+
+AI systems prefer fact-rich content over opinion-heavy content.
+
+**Content Transformation**:
+
+**Low factual density**:
+> Social media marketing is very popular nowadays. Many businesses
+> use it and find it helpful for reaching customers.
+
+**High factual density**:
+> Social media marketing reaches 4.9 billion users globally (Statista, 2024).
+> Businesses using social media marketing report 66% higher lead generation
+> rates compared to non-users (HubSpot State of Marketing Report, 2024).
+> The most effective platforms for B2B marketing are LinkedIn (96% usage),
+> Twitter (82%), and Facebook (80%).
+
+**Factual Enhancement Checklist**:
+- [ ] Add specific statistics with sources
+- [ ] Include exact dates, numbers, percentages
+- [ ] Replace vague claims with verified facts
+- [ ] Add recent data (within last 2 years)
+- [ ] Include multiple data points per section
+- [ ] Cross-reference with authoritative sources
+
+## FAQ Optimization for GEO
+
+FAQ sections are highly effective for GEO because:
+- They match question-based AI queries
+- They provide concise, structured answers
+- FAQ schema helps AI understand Q&A pairs
+
+**FAQ Structure**:
+
+## Frequently Asked Questions
+
+### [Question matching common query]?
+
+[Direct answer: 40-60 words]
+[Supporting detail or example]
+
+### [Question matching common query]?
+
+[Direct answer: 40-60 words]
+[Supporting detail or example]
+
+**FAQ Schema (JSON-LD)**:
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "FAQPage",
+  "mainEntity": [{
+    "@type": "Question",
+    "name": "[Question text]",
+    "acceptedAnswer": {
+      "@type": "Answer",
+      "text": "[Answer text]"
+    }
+  }]
+}
+```
+
+## GEO Readiness Checklist
+
+Use this checklist for any content:
+
+**Definitions & Clarity**
+- [ ] Key terms are clearly defined
+- [ ] Definitions can stand alone as answers
+- [ ] Language is precise and unambiguous
+
+**Quotable Content**
+- [ ] Specific statistics included
+- [ ] Facts have source citations
+- [ ] Memorable statements created
+
+**Authority**
+- [ ] Expert quotes or credentials present
+- [ ] Authoritative sources cited
+- [ ] Original data or research included
+
+**Structure**
+- [ ] Q&A format sections included
+- [ ] Clear headings match common queries
+- [ ] Comparison tables where relevant
+- [ ] Numbered lists for processes
+
+**Technical**
+- [ ] FAQ schema markup added
+- [ ] Content freshness indicated
+- [ ] Sources are verifiable
diff --git a/.cursor/skills/geo-content-optimizer/references/quotable-content-examples.md b/.cursor/skills/geo-content-optimizer/references/quotable-content-examples.md
new file mode 100644
index 0000000000..ddc73201f1
--- /dev/null
+++ b/.cursor/skills/geo-content-optimizer/references/quotable-content-examples.md
@@ -0,0 +1,484 @@
+# Quotable Content Examples
+
+Before/after examples of content optimized for AI citation. Each example shows the transformation from generic to quotable.
+
+## Example 1: Definition Block
+
+### Before (Generic, Not Quotable)
+
+> SEO is really important for businesses and involves various techniques to improve visibility online through search engines. It's been around for a while and many businesses use it to get more traffic to their websites.
+
+**Problems**:
+- Vague language ("really important", "various techniques")
+- No clear classification
+- Meandering structure
+- Not standalone
+
+**Citation likelihood**: 1/10
+
+---
+
+### After (Clear, Quotable)
+
+> **Search Engine Optimization (SEO)** is a digital marketing practice that improves website rankings in organic search results through content optimization, technical improvements, and authority building. According to BrightEdge research, organic search drives 53% of all website traffic, making SEO the highest-impact channel for sustainable online visibility.
+
+**Improvements**:
+- Clear term + category classification
+- Specific components listed
+- Authoritative statistic with source
+- Standalone statement
+- Precise language
+
+**Citation likelihood**: 9/10
+
+---
+
+## Example 2: Statistical Content
+
+### Before (Weak)
+
+> Email marketing is pretty effective and lots of companies use it. It has a good return on investment compared to other marketing channels.
+
+**Problems**:
+- No specific numbers
+- No source attribution
+- Vague comparisons ("pretty effective", "lots", "good")
+- Not verifiable
+
+**Citation likelihood**: 2/10
+
+---
+
+### After (Strong)
+
+> Email marketing delivers an average ROI of $42 for every $1 spent, according to the Data & Marketing Association's 2024 research. This 4,200% return makes email the highest-performing digital marketing channel, outperforming social media (28% ROI) and paid search (23% ROI) by significant margins.
+
+**Improvements**:
+- Specific statistic with exact numbers
+- Source attribution with year
+- Comparative data
+- Verifiable claim
+- Context provided
+
+**Citation likelihood**: 9/10
+
+---
+
+## Example 3: Process/How-To Content
+
+### Before (Unclear)
+
+> To do keyword research, you should think about what your customers might search for and then use some tools to find more keywords. Look at what your competitors are doing too. Then pick the best keywords for your content.
+
+**Problems**:
+- Vague steps ("think about", "some tools")
+- No clear structure
+- Missing specifics
+- No actionable guidance
+
+**Citation likelihood**: 2/10
+
+---
+
+### After (Clear Process)
+
+> To conduct effective keyword research:
+>
+> 1. **Identify seed keywords** - List 5-10 core topics your target audience searches for
+> 2. **Expand with research tools** - Use tools like Google Keyword Planner or Ahrefs to generate 100+ related keywords
+> 3. **Analyze search intent** - Categorize each keyword as informational, commercial, or transactional
+> 4. **Assess competition** - Evaluate ranking difficulty using domain authority and SERP analysis
+> 5. **Prioritize strategically** - Select 10-15 keywords balancing search volume (1,000+ monthly searches) with achievable competition
+>
+> This process typically takes 2-4 hours for a comprehensive initial keyword list.
+
+**Improvements**:
+- Numbered, sequential steps
+- Specific actions in each step
+- Concrete numbers and tools
+- Time estimate included
+- Actionable guidance
+
+**Citation likelihood**: 8/10
+
+---
+
+## Example 4: Comparison Content
+
+### Before (Vague Comparison)
+
+> WordPress and Shopify are both popular website builders. WordPress is more flexible while Shopify is easier to use. The choice depends on what you need.
+
+**Problems**:
+- No specific differentiators
+- Vague claims ("more flexible", "easier")
+- No clear recommendation criteria
+- Not helpful for decision-making
+
+**Citation likelihood**: 2/10
+
+---
+
+### After (Structured Comparison)
+
+> | Factor | WordPress | Shopify |
+> |--------|-----------|---------|
+> | **Best for** | Content-heavy sites, blogs, custom needs | E-commerce, quick setup |
+> | **Setup time** | 4-8 hours | 1-2 hours |
+> | **Monthly cost** | $10-50 (hosting + theme) | $29-299 (subscription) |
+> | **Customization** | Unlimited (50,000+ plugins) | Limited to Shopify apps |
+> | **E-commerce** | Requires WooCommerce plugin | Built-in, optimized |
+> | **SEO control** | Complete control | Good but limited |
+>
+> **Choose WordPress if**: You need maximum flexibility, run a content-first site, or have technical resources.
+>
+> **Choose Shopify if**: E-commerce is your primary goal, you want fast setup, or you lack technical expertise.
+
+**Improvements**:
+- Structured table format
+- Specific, comparable metrics
+- Clear use case recommendations
+- Quantifiable differences
+- Decision framework
+
+**Citation likelihood**: 9/10
+
+---
+
+## Example 5: Expert Content
+
+### Before (No Authority)
+
+> Many people think that social media is important for SEO. It can help you get more visibility and traffic.
+
+**Problems**:
+- No expert attribution
+- Vague claims ("many people think")
+- No specific mechanism explained
+- Weak authority signals
+
+**Citation likelihood**: 1/10
+
+---
+
+### After (Expert-Backed)
+
+> While social media doesn't directly impact search rankings, it influences SEO through indirect channels. "Social signals drive discovery, which leads to backlinks and brand searches—both powerful ranking factors," explains Rand Fishkin, founder of SparkToro and former CEO of Moz. Research from Hootsuite found that content promoted on social media earns 2.3x more backlinks than non-promoted content within the first 30 days of publication.
+
+**Improvements**:
+- Expert quote with credentials
+- Specific mechanism explained
+- Research data with source
+- Quantifiable impact (2.3x)
+- Authority signals
+
+**Citation likelihood**: 9/10
+
+---
+
+## Example 6: Q&A Content
+
+### Before (Incomplete Answer)
+
+> **How long does SEO take?**
+>
+> It depends on a lot of factors. Sometimes it's fast, sometimes it takes a while. New sites usually take longer than established ones.
+
+**Problems**:
+- No specific timeframe
+- "It depends" without explaining what it depends on
+- Vague language ("fast", "a while", "longer")
+- Not actionable
+
+**Citation likelihood**: 1/10
+
+---
+
+### After (Complete Answer)
+
+> **How long does SEO take to show results?**
+>
+> SEO typically takes 3-6 months to show significant results for new websites. Established sites with existing authority may see improvements in 1-3 months for less competitive keywords. Results depend on four key factors:
+>
+> 1. **Domain authority** - New domains take 6-12 months; established domains see faster results
+> 2. **Competition level** - Low-competition keywords rank in 1-3 months; high-competition may take 12+ months
+> 3. **Implementation quality** - Comprehensive optimization accelerates results
+> 4. **Existing backlink profile** - Sites with 20+ quality backlinks see 40% faster improvements (Ahrefs study)
+>
+> Most businesses should budget 6 months for measurable ROI from SEO efforts.
+
+**Improvements**:
+- Specific timeframes with ranges
+- Factors clearly explained
+- Differentiated by scenario
+- Supporting data with source
+- Actionable planning guidance
+
+**Citation likelihood**: 9/10
+
+---
+
+## Example 7: List Content
+
+### Before (Thin List)
+
+> Here are some important on-page SEO factors:
+> - Title tags
+> - Meta descriptions
+> - Headers
+> - Content
+> - Links
+> - Images
+
+**Problems**:
+- No explanations
+- No prioritization
+- Not actionable
+- Missing context
+
+**Citation likelihood**: 2/10
+
+---
+
+### After (Rich List)
+
+> Critical on-page SEO factors ranked by impact:
+>
+> 1. **Title tags** - Most important on-page element; include primary keyword within first 60 characters. Title tags directly influence both rankings and CTR.
+>
+> 2. **Content quality and depth** - Comprehensive content (1,500+ words for competitive keywords) outranks thin content. Focus on topical authority over keyword density.
+>
+> 3. **Header structure (H1-H6)** - Use one H1 with primary keyword, multiple H2s for main sections. Proper hierarchy helps search engines understand content structure.
+>
+> 4. **Internal linking** - Link to 3-5 related pages using descriptive anchor text. Internal links distribute page authority and help search engines discover content.
+>
+> 5. **Image optimization** - Compress images to under 100KB, use descriptive alt text with target keywords. Page speed and accessibility both impact rankings.
+>
+> 6. **URL structure** - Use descriptive, keyword-rich URLs (example.com/keyword-topic) avoiding parameters. Clean URLs improve click-through rates by 25% (Backlinko study).
+>
+> Optimize these six factors before moving to advanced tactics.
+
+**Improvements**:
+- Prioritized ranking
+- Specific guidance for each
+- Actionable recommendations
+- Supporting data
+- Clear next step
+
+**Citation likelihood**: 8/10
+
+---
+
+## Example 8: Trend/Prediction Content
+
+### Before (Speculation)
+
+> AI is going to change SEO a lot in the future. Things will be different and SEO professionals will need to adapt.
+
+**Problems**:
+- Vague predictions ("a lot", "different")
+- No specific changes identified
+- No timeframe
+- No evidence
+
+**Citation likelihood**: 1/10
+
+---
+
+### After (Evidence-Based Trend)
+
+> AI is fundamentally transforming SEO search behavior. According to Gartner's 2024 research, AI-powered search will reduce traditional search engine traffic by 25% by 2026, as users increasingly rely on AI chatbots for information.
+>
+> Three specific shifts SEO professionals must prepare for:
+>
+> 1. **Zero-click content consumption** - Users get answers without visiting websites, requiring optimization for AI citations rather than clicks
+>
+> 2. **Conversation-based queries** - Natural language questions replace keyword-based searches, demanding content that answers specific questions
+>
+> 3. **Source attribution value** - Being cited by AI systems (Generative Engine Optimization/GEO) becomes as important as ranking #1
+>
+> Forrester predicts that by 2025, 30% of organic traffic will come from AI system citations rather than traditional search results, making GEO optimization critical for visibility.
+
+**Improvements**:
+- Specific statistics with sources
+- Clear timeframes
+- Concrete changes identified
+- Multiple research citations
+- Actionable implications
+
+**Citation likelihood**: 9/10
+
+---
+
+## Example 9: Technical Explanation
+
+### Before (Complex, Unclear)
+
+> Core Web Vitals are metrics that Google uses to measure user experience. They look at how fast your site loads and how stable it is when people use it. You should try to improve them.
+
+**Problems**:
+- Oversimplified without specifics
+- No concrete metrics
+- Vague improvement guidance
+- Missing thresholds
+
+**Citation likelihood**: 2/10
+
+---
+
+### After (Clear, Specific)
+
+> Core Web Vitals are three specific metrics Google uses to measure page experience:
+>
+> | Metric | Measures | Good Score | Impact |
+> |--------|----------|------------|--------|
+> | **LCP (Largest Contentful Paint)** | Loading performance | <2.5 seconds | How quickly main content appears |
+> | **FID (First Input Delay)** | Interactivity | <100 milliseconds | How quickly page responds to user input |
+> | **CLS (Cumulative Layout Shift)** | Visual stability | <0.1 | How much content shifts during loading |
+>
+> According to Google's 2024 documentation, pages meeting all three thresholds rank higher and receive 24% more clicks than pages failing these metrics. Core Web Vitals became an official ranking factor in June 2021 as part of Google's Page Experience update.
+>
+> Test your Core Web Vitals with ~~page speed tool.
+
+**Improvements**:
+- Three specific metrics defined
+- Clear thresholds provided
+- Table format for scannability
+- Official data with source
+- Testing tool referenced
+
+**Citation likelihood**: 9/10
+
+---
+
+## Example 10: Case Study Content
+
+### Before (Vague Results)
+
+> We improved our client's SEO and they got more traffic. They were really happy with the results and saw an increase in their business.
+
+**Problems**:
+- No specific numbers
+- No timeframe
+- No methodology shared
+- Not reproducible
+
+**Citation likelihood**: 1/10
+
+---
+
+### After (Data-Rich Case Study)
+
+> **Case Study: 312% Organic Traffic Increase in 6 Months**
+>
+> **Client**: B2B SaaS company (project management software)
+>
+> **Challenge**: Only 450 monthly organic visitors, ranking #20+ for target keywords
+>
+> **Strategy implemented**:
+> 1. Created 24 pillar pages targeting high-intent keywords
+> 2. Built internal linking hub connecting 150 existing pages
+> 3. Optimized 80 existing pages for featured snippets
+> 4. Secured 45 high-authority backlinks through expert roundups
+>
+> **Results (6 months)**:
+> - Organic traffic: 450 → 1,854 monthly visitors (+312%)
+> - Keyword rankings: 3 keywords in top 10 → 47 keywords in top 10
+> - Featured snippets: 0 → 12 owned snippets
+> - Lead conversions: 8 → 41 monthly leads (+412%)
+>
+> **Key insight**: 80% of traffic growth came from optimizing existing content rather than creating new pages, demonstrating the power of content refreshing strategies.
+
+**Improvements**:
+- Specific metrics with percentages
+- Clear timeframe
+- Detailed methodology
+- Reproducible tactics
+- Key learning identified
+
+**Citation likelihood**: 9/10
+
+---
+
+## Transformation Checklist
+
+To make any content more quotable:
+
+**Definitions**:
+- [ ] Replace vague language with precise terms
+- [ ] Add clear category classification
+- [ ] Make statement standalone
+- [ ] Include scope and boundaries
+
+**Statistics**:
+- [ ] Add specific numbers
+- [ ] Include source attribution
+- [ ] Specify timeframe/date
+- [ ] Add context for interpretation
+- [ ] Verify accuracy
+
+**Processes**:
+- [ ] Number steps clearly
+- [ ] Make each step actionable
+- [ ] Add time estimates
+- [ ] Include specific tools/methods
+- [ ] Provide success criteria
+
+**Comparisons**:
+- [ ] Use table format
+- [ ] Include specific, comparable metrics
+- [ ] Add "choose if..." recommendations
+- [ ] Quantify differences
+- [ ] Cover key decision factors
+
+**Lists**:
+- [ ] Prioritize items
+- [ ] Add explanations to each
+- [ ] Include specific guidance
+- [ ] Provide examples
+- [ ] Make actionable
+
+**Expert content**:
+- [ ] Add attributed quotes
+- [ ] Include credentials
+- [ ] Reference studies/research
+- [ ] Cite authoritative sources
+- [ ] Quantify claims
+
+---
+
+## Content Optimization Pattern
+
+**Generic → Quotable transformation follows this pattern**:
+
+1. **Add specificity** - Replace "many" with "73%", "fast" with "under 2.5 seconds"
+2. **Include attribution** - Add sources for all claims
+3. **Structure clearly** - Use tables, lists, numbered steps
+4. **Quantify everything** - Numbers > adjectives
+5. **Make standalone** - Each statement should work in isolation
+6. **Add context** - Explain why, not just what
+7. **Use precise language** - Eliminate ambiguity
+8. **Include timeframes** - When was this measured/true?
+9. **Show authority** - Reference experts, studies, data
+10. **Enable verification** - Link to sources, cite methodology
+
+---
+
+## Quick Quotability Test
+
+Ask these questions about your content:
+
+1. **Can AI quote this without additional context?** - If no, add context
+2. **Does it include specific numbers?** - If no, add data
+3. **Is the source clear?** - If no, add attribution
+4. **Is language precise and unambiguous?** - If no, clarify
+5. **Would an expert approve this?** - If no, add authority signals
+6. **Is it scannable?** - If no, add structure
+7. **Is it up-to-date?** - If no, update or add date context
+8. **Can it be verified?** - If no, add sources
+9. **Is it specific to a use case?** - If no, add context
+10. **Does it answer a complete question?** - If no, expand
+
+If you answer "yes" to 8+ questions, content is highly quotable.
+If you answer "yes" to 5-7 questions, content is moderately quotable.
+If you answer "yes" to fewer than 5, content needs significant optimization.
diff --git a/.cursor/skills/improve-codebase-architecture/DEEPENING.md b/.cursor/skills/improve-codebase-architecture/DEEPENING.md
new file mode 100644
index 0000000000..ecaf5d7dcf
--- /dev/null
+++ b/.cursor/skills/improve-codebase-architecture/DEEPENING.md
@@ -0,0 +1,37 @@
+# Deepening
+
+How to deepen a cluster of shallow modules safely, given its dependencies. Assumes the vocabulary in [LANGUAGE.md](LANGUAGE.md) — **module**, **interface**, **seam**, **adapter**.
+
+## Dependency categories
+
+When assessing a candidate for deepening, classify its dependencies. The category determines how the deepened module is tested across its seam.
+
+### 1. In-process
+
+Pure computation, in-memory state, no I/O. Always deepenable — merge the modules and test through the new interface directly. No adapter needed.
+
+### 2. Local-substitutable
+
+Dependencies that have local test stand-ins (PGLite for Postgres, in-memory filesystem). Deepenable if the stand-in exists. The deepened module is tested with the stand-in running in the test suite. The seam is internal; no port at the module's external interface.
+
+### 3. Remote but owned (Ports & Adapters)
+
+Your own services across a network boundary (microservices, internal APIs). Define a **port** (interface) at the seam. The deep module owns the logic; the transport is injected as an **adapter**. Tests use an in-memory adapter. Production uses an HTTP/gRPC/queue adapter.
+
+Recommendation shape: *"Define a port at the seam, implement an HTTP adapter for production and an in-memory adapter for testing, so the logic sits in one deep module even though it's deployed across a network."*
+
+### 4. True external (Mock)
+
+Third-party services (Stripe, Twilio, etc.) you don't control. The deepened module takes the external dependency as an injected port; tests provide a mock adapter.
+
+## Seam discipline
+
+- **One adapter means a hypothetical seam. Two adapters means a real one.** Don't introduce a port unless at least two adapters are justified (typically production + test). A single-adapter seam is just indirection.
+- **Internal seams vs external seams.** A deep module can have internal seams (private to its implementation, used by its own tests) as well as the external seam at its interface. Don't expose internal seams through the interface just because tests use them.
+
+## Testing strategy: replace, don't layer
+
+- Old unit tests on shallow modules become waste once tests at the deepened module's interface exist — delete them.
+- Write new tests at the deepened module's interface. The **interface is the test surface**.
+- Tests assert on observable outcomes through the interface, not internal state.
+- Tests should survive internal refactors — they describe behaviour, not implementation. If a test has to change when the implementation changes, it's testing past the interface.
diff --git a/.cursor/skills/improve-codebase-architecture/INTERFACE-DESIGN.md b/.cursor/skills/improve-codebase-architecture/INTERFACE-DESIGN.md
new file mode 100644
index 0000000000..3197723a0d
--- /dev/null
+++ b/.cursor/skills/improve-codebase-architecture/INTERFACE-DESIGN.md
@@ -0,0 +1,44 @@
+# Interface Design
+
+When the user wants to explore alternative interfaces for a chosen deepening candidate, use this parallel sub-agent pattern. Based on "Design It Twice" (Ousterhout) — your first idea is unlikely to be the best.
+
+Uses the vocabulary in [LANGUAGE.md](LANGUAGE.md) — **module**, **interface**, **seam**, **adapter**, **leverage**.
+
+## Process
+
+### 1. Frame the problem space
+
+Before spawning sub-agents, write a user-facing explanation of the problem space for the chosen candidate:
+
+- The constraints any new interface would need to satisfy
+- The dependencies it would rely on, and which category they fall into (see [DEEPENING.md](DEEPENING.md))
+- A rough illustrative code sketch to ground the constraints — not a proposal, just a way to make the constraints concrete
+
+Show this to the user, then immediately proceed to Step 2. The user reads and thinks while the sub-agents work in parallel.
+
+### 2. Spawn sub-agents
+
+Spawn 3+ sub-agents in parallel using the Agent tool. Each must produce a **radically different** interface for the deepened module.
+
+Prompt each sub-agent with a separate technical brief (file paths, coupling details, dependency category from [DEEPENING.md](DEEPENING.md), what sits behind the seam). The brief is independent of the user-facing problem-space explanation in Step 1. Give each agent a different design constraint:
+
+- Agent 1: "Minimize the interface — aim for 1–3 entry points max. Maximise leverage per entry point."
+- Agent 2: "Maximise flexibility — support many use cases and extension."
+- Agent 3: "Optimise for the most common caller — make the default case trivial."
+- Agent 4 (if applicable): "Design around ports & adapters for cross-seam dependencies."
+
+Include both [LANGUAGE.md](LANGUAGE.md) vocabulary and CONTEXT.md vocabulary in the brief so each sub-agent names things consistently with the architecture language and the project's domain language.
+
+Each sub-agent outputs:
+
+1. Interface (types, methods, params — plus invariants, ordering, error modes)
+2. Usage example showing how callers use it
+3. What the implementation hides behind the seam
+4. Dependency strategy and adapters (see [DEEPENING.md](DEEPENING.md))
+5. Trade-offs — where leverage is high, where it's thin
+
+### 3. Present and compare
+
+Present designs sequentially so the user can absorb each one, then compare them in prose. Contrast by **depth** (leverage at the interface), **locality** (where change concentrates), and **seam placement**.
+
+After comparing, give your own recommendation: which design you think is strongest and why. If elements from different designs would combine well, propose a hybrid. Be opinionated — the user wants a strong read, not a menu.
diff --git a/.cursor/skills/improve-codebase-architecture/LANGUAGE.md b/.cursor/skills/improve-codebase-architecture/LANGUAGE.md
new file mode 100644
index 0000000000..530c27630a
--- /dev/null
+++ b/.cursor/skills/improve-codebase-architecture/LANGUAGE.md
@@ -0,0 +1,53 @@
+# Language
+
+Shared vocabulary for every suggestion this skill makes. Use these terms exactly — don't substitute "component," "service," "API," or "boundary." Consistent language is the whole point.
+
+## Terms
+
+**Module**
+Anything with an interface and an implementation. Deliberately scale-agnostic — applies equally to a function, class, package, or tier-spanning slice.
+_Avoid_: unit, component, service.
+
+**Interface**
+Everything a caller must know to use the module correctly. Includes the type signature, but also invariants, ordering constraints, error modes, required configuration, and performance characteristics.
+_Avoid_: API, signature (too narrow — those refer only to the type-level surface).
+
+**Implementation**
+What's inside a module — its body of code. Distinct from **Adapter**: a thing can be a small adapter with a large implementation (a Postgres repo) or a large adapter with a small implementation (an in-memory fake). Reach for "adapter" when the seam is the topic; "implementation" otherwise.
+
+**Depth**
+Leverage at the interface — the amount of behaviour a caller (or test) can exercise per unit of interface they have to learn. A module is **deep** when a large amount of behaviour sits behind a small interface. A module is **shallow** when the interface is nearly as complex as the implementation.
+
+**Seam** _(from Michael Feathers)_
+A place where you can alter behaviour without editing in that place. The *location* at which a module's interface lives. Choosing where to put the seam is its own design decision, distinct from what goes behind it.
+_Avoid_: boundary (overloaded with DDD's bounded context).
+
+**Adapter**
+A concrete thing that satisfies an interface at a seam. Describes *role* (what slot it fills), not substance (what's inside).
+
+**Leverage**
+What callers get from depth. More capability per unit of interface they have to learn. One implementation pays back across N call sites and M tests.
+
+**Locality**
+What maintainers get from depth. Change, bugs, knowledge, and verification concentrate at one place rather than spreading across callers. Fix once, fixed everywhere.
+
+## Principles
+
+- **Depth is a property of the interface, not the implementation.** A deep module can be internally composed of small, mockable, swappable parts — they just aren't part of the interface. A module can have **internal seams** (private to its implementation, used by its own tests) as well as the **external seam** at its interface.
+- **The deletion test.** Imagine deleting the module. If complexity vanishes, the module wasn't hiding anything (it was a pass-through). If complexity reappears across N callers, the module was earning its keep.
+- **The interface is the test surface.** Callers and tests cross the same seam. If you want to test *past* the interface, the module is probably the wrong shape.
+- **One adapter means a hypothetical seam. Two adapters means a real one.** Don't introduce a seam unless something actually varies across it.
+
+## Relationships
+
+- A **Module** has exactly one **Interface** (the surface it presents to callers and tests).
+- **Depth** is a property of a **Module**, measured against its **Interface**.
+- A **Seam** is where a **Module**'s **Interface** lives.
+- An **Adapter** sits at a **Seam** and satisfies the **Interface**.
+- **Depth** produces **Leverage** for callers and **Locality** for maintainers.
+
+## Rejected framings
+
+- **Depth as ratio of implementation-lines to interface-lines** (Ousterhout): rewards padding the implementation. We use depth-as-leverage instead.
+- **"Interface" as the TypeScript `interface` keyword or a class's public methods**: too narrow — interface here includes every fact a caller must know.
+- **"Boundary"**: overloaded with DDD's bounded context. Say **seam** or **interface**.
diff --git a/.cursor/skills/improve-codebase-architecture/SKILL.md b/.cursor/skills/improve-codebase-architecture/SKILL.md
new file mode 100644
index 0000000000..05984a6096
--- /dev/null
+++ b/.cursor/skills/improve-codebase-architecture/SKILL.md
@@ -0,0 +1,71 @@
+---
+name: improve-codebase-architecture
+description: Find deepening opportunities in a codebase, informed by the domain language in CONTEXT.md and the decisions in docs/adr/. Use when the user wants to improve architecture, find refactoring opportunities, consolidate tightly-coupled modules, or make a codebase more testable and AI-navigable.
+---
+
+# Improve Codebase Architecture
+
+Surface architectural friction and propose **deepening opportunities** — refactors that turn shallow modules into deep ones. The aim is testability and AI-navigability.
+
+## Glossary
+
+Use these terms exactly in every suggestion. Consistent language is the point — don't drift into "component," "service," "API," or "boundary." Full definitions in [LANGUAGE.md](LANGUAGE.md).
+
+- **Module** — anything with an interface and an implementation (function, class, package, slice).
+- **Interface** — everything a caller must know to use the module: types, invariants, error modes, ordering, config. Not just the type signature.
+- **Implementation** — the code inside.
+- **Depth** — leverage at the interface: a lot of behaviour behind a small interface. **Deep** = high leverage. **Shallow** = interface nearly as complex as the implementation.
+- **Seam** — where an interface lives; a place behaviour can be altered without editing in place. (Use this, not "boundary.")
+- **Adapter** — a concrete thing satisfying an interface at a seam.
+- **Leverage** — what callers get from depth.
+- **Locality** — what maintainers get from depth: change, bugs, knowledge concentrated in one place.
+
+Key principles (see [LANGUAGE.md](LANGUAGE.md) for the full list):
+
+- **Deletion test**: imagine deleting the module. If complexity vanishes, it was a pass-through. If complexity reappears across N callers, it was earning its keep.
+- **The interface is the test surface.**
+- **One adapter = hypothetical seam. Two adapters = real seam.**
+
+This skill is _informed_ by the project's domain model. The domain language gives names to good seams; ADRs record decisions the skill should not re-litigate.
+
+## Process
+
+### 1. Explore
+
+Read the project's domain glossary and any ADRs in the area you're touching first.
+
+Then use the Agent tool with `subagent_type=Explore` to walk the codebase. Don't follow rigid heuristics — explore organically and note where you experience friction:
+
+- Where does understanding one concept require bouncing between many small modules?
+- Where are modules **shallow** — interface nearly as complex as the implementation?
+- Where have pure functions been extracted just for testability, but the real bugs hide in how they're called (no **locality**)?
+- Where do tightly-coupled modules leak across their seams?
+- Which parts of the codebase are untested, or hard to test through their current interface?
+
+Apply the **deletion test** to anything you suspect is shallow: would deleting it concentrate complexity, or just move it? A "yes, concentrates" is the signal you want.
+
+### 2. Present candidates
+
+Present a numbered list of deepening opportunities. For each candidate:
+
+- **Files** — which files/modules are involved
+- **Problem** — why the current architecture is causing friction
+- **Solution** — plain English description of what would change
+- **Benefits** — explained in terms of locality and leverage, and also in how tests would improve
+
+**Use CONTEXT.md vocabulary for the domain, and [LANGUAGE.md](LANGUAGE.md) vocabulary for the architecture.** If `CONTEXT.md` defines "Order," talk about "the Order intake module" — not "the FooBarHandler," and not "the Order service."
+
+**ADR conflicts**: if a candidate contradicts an existing ADR, only surface it when the friction is real enough to warrant revisiting the ADR. Mark it clearly (e.g. _"contradicts ADR-0007 — but worth reopening because…"_). Don't list every theoretical refactor an ADR forbids.
+
+Do NOT propose interfaces yet. Ask the user: "Which of these would you like to explore?"
+
+### 3. Grilling loop
+
+Once the user picks a candidate, drop into a grilling conversation. Walk the design tree with them — constraints, dependencies, the shape of the deepened module, what sits behind the seam, what tests survive.
+
+Side effects happen inline as decisions crystallize:
+
+- **Naming a deepened module after a concept not in `CONTEXT.md`?** Add the term to `CONTEXT.md` — same discipline as `/grill-with-docs` (see [CONTEXT-FORMAT.md](../grill-with-docs/CONTEXT-FORMAT.md)). Create the file lazily if it doesn't exist.
+- **Sharpening a fuzzy term during the conversation?** Update `CONTEXT.md` right there.
+- **User rejects the candidate with a load-bearing reason?** Offer an ADR, framed as: _"Want me to record this as an ADR so future architecture reviews don't re-suggest it?"_ Only offer when the reason would actually be needed by a future explorer to avoid re-suggesting the same thing — skip ephemeral reasons ("not worth it right now") and self-evident ones. See [ADR-FORMAT.md](../grill-with-docs/ADR-FORMAT.md).
+- **Want to explore alternative interfaces for the deepened module?** See [INTERFACE-DESIGN.md](INTERFACE-DESIGN.md).
diff --git a/.cursor/skills/internal-linking-optimizer/SKILL.md b/.cursor/skills/internal-linking-optimizer/SKILL.md
new file mode 100644
index 0000000000..d83f3eb972
--- /dev/null
+++ b/.cursor/skills/internal-linking-optimizer/SKILL.md
@@ -0,0 +1,351 @@
+---
+name: internal-linking-optimizer
+description: 'Optimize internal links: site architecture, authority distribution, orphan pages, crawl depth analysis. 内链优化/站内架构'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when improving internal link structure, anchor text distribution, orphan pages, or site architecture."
+argument-hint: "<URL or sitemap>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "low"
+  tags:
+    - seo
+    - internal-linking
+    - site-architecture
+    - link-equity
+    - orphan-pages
+    - topical-authority
+    - crawl-depth
+    - 内链优化
+    - 内部リンク
+    - 내부링크
+    - enlaces-internos
+  triggers:
+    # EN-formal
+    - "fix internal links"
+    - "improve site architecture"
+    - "link structure"
+    - "distribute page authority"
+    - "internal linking strategy"
+    - "site navigation"
+    - "link equity"
+    # EN-casual
+    - "orphan pages"
+    - "site architecture is messy"
+    - "pages have no links pointing to them"
+    - "pages have no links"
+    - "site structure is messy"
+    # EN-question
+    - "how to improve internal linking"
+    - "how to fix orphan pages"
+    # ZH-pro
+    - "内链优化"
+    - "站内链接"
+    - "网站架构"
+    - "权重传递"
+    - "锚文本优化"
+    # ZH-casual
+    - "内链怎么做"
+    - "孤立页面"
+    - "网站结构乱"
+    # JA
+    - "内部リンク最適化"
+    - "サイト構造"
+    # KO
+    - "내부 링크 최적화"
+    - "사이트 구조"
+    # ES
+    - "enlaces internos"
+    - "arquitectura del sitio"
+    # PT
+    - "links internos"
+    # Misspellings
+    - "internal linkng"
+---
+
+# Internal Linking Optimizer
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This optimization skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill analyzes your site's internal link structure and provides recommendations to improve SEO through strategic internal linking. It helps distribute authority, establish topical relevance, and improve crawlability.
+
+**System role**: Optimization layer skill. It turns weak pages, structures, and technical issues into prioritized repair work.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a diagnosis or repair plan that should feed directly into remediation work, not just a one-time opinion.
+
+- Improving site architecture for SEO
+- Distributing authority to important pages
+- Fixing orphan pages with no internal links
+- Creating topic cluster internal link strategies
+- Optimizing anchor text for SEO
+- Recovering pages that have lost rankings
+- Planning internal links for new content
+
+## What This Skill Does
+
+1. **Link Structure Analysis**: Maps current internal linking patterns
+2. **Authority Flow Mapping**: Shows how PageRank flows through site
+3. **Orphan Page Detection**: Finds pages with no internal links
+4. **Anchor Text Optimization**: Improves anchor text diversity
+5. **Topic Cluster Linking**: Creates pillar-cluster link strategies
+6. **Link Opportunity Finding**: Identifies where to add links
+7. **Navigation Optimization**: Improves site-wide link elements
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Analyze Current Structure
+
+```
+Analyze internal linking structure for [domain/sitemap]
+```
+
+```
+Find internal linking opportunities for [URL]
+```
+
+### Create Linking Strategy
+
+```
+Create internal linking plan for topic cluster about [topic]
+```
+
+```
+Suggest internal links for this new article: [content/URL]
+```
+
+### Fix Issues
+
+```
+Find orphan pages on [domain]
+```
+
+```
+Optimize anchor text across the site
+```
+
+## Skill Contract
+
+**Expected output**: a scored diagnosis, prioritized repair plan, and a short handoff summary ready for `memory/audits/`.
+
+- **Reads**: the current page or site state, symptoms, prior audits, and current priorities from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing audit or optimization plan plus a reusable summary that can be stored under `memory/audits/`.
+- **Promotes**: blocking defects, repeated weaknesses, and fix priorities to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when the repair path is clear.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~web crawler + ~~analytics connected:**
+Claude can automatically perform a full site crawl via ~~web crawler to map the complete link graph, fetch page performance metrics from ~~analytics to identify high-value pages, and analyze link flow throughout the site. This enables data-driven internal linking strategies.
+
+**With manual data only:**
+Ask the user to provide:
+1. Sitemap URL or list of important pages
+2. Key page URLs that need more internal links
+3. Content categories or topic clusters
+4. Any existing link structure documentation
+
+Proceed with the analysis using provided data. Note in the output which findings are from automated crawl vs. manual review.
+
+## Instructions
+
+When a user requests internal linking optimization:
+
+1. **Analyze Current Internal Link Structure**
+
+   ```markdown
+   ## Internal Link Structure Analysis
+   
+   ### Overview
+   
+   **Domain**: [domain]
+   **Total Pages Analyzed**: [X]
+   **Total Internal Links**: [X]
+   **Average Links per Page**: [X]
+   
+   ### Link Distribution
+   
+   | Links per Page | Page Count | Percentage |
+   |----------------|------------|------------|
+   | 0 (Orphan) | [X] | [X]% |
+   | 1-5 | [X] | [X]% |
+   | 6-10 | [X] | [X]% |
+   | 11-20 | [X] | [X]% |
+   | 20+ | [X] | [X]% |
+   
+   ### Top Linked Pages
+   
+   | Page | Internal Links | Authority | Notes |
+   |------|----------------|-----------|-------|
+   | [URL 1] | [X] | High | [notes] |
+   | [URL 2] | [X] | High | [notes] |
+   | [URL 3] | [X] | Medium | [notes] |
+   
+   ### Under-Linked Important Pages
+   
+   | Page | Current Links | Traffic | Recommended Links |
+   |------|---------------|---------|-------------------|
+   | [URL 1] | [X] | [X]/mo | [X]+ |
+   | [URL 2] | [X] | [X]/mo | [X]+ |
+   
+   **Structure Score**: [X]/10
+   ```
+
+2. **Identify Orphan Pages**
+
+   ```markdown
+   ## Orphan Page Analysis
+   
+   ### Definition
+   Orphan pages have no internal links pointing to them, making them 
+   hard for users and search engines to discover.
+   
+   ### Orphan Pages Found: [X]
+   
+   | Page | Traffic | Priority | Recommended Action |
+   |------|---------|----------|-------------------|
+   | [URL 1] | [X]/mo | High | Link from [pages] |
+   | [URL 2] | [X]/mo | Medium | Add to navigation |
+   | [URL 3] | 0 | Low | Consider deleting/redirecting |
+   
+   ### Fix Strategy
+   
+   **High Priority Orphans** (have traffic/rankings):
+   1. [URL] - Add links from: [relevant pages]
+   2. [URL] - Add links from: [relevant pages]
+   
+   **Medium Priority Orphans** (potentially valuable):
+   1. [URL] - Add to category/tag page
+   2. [URL] - Link from related content
+   
+   **Low Priority Orphans** (consider removing):
+   1. [URL] - Redirect to [better page]
+   2. [URL] - Delete or noindex
+   ```
+
+3. **Analyze Anchor Text Distribution**
+
+   > **CORE-EEAT alignment**: Internal linking quality maps to R08 (Internal Link Graph) in the CORE-EEAT benchmark -- use descriptive anchors, ensure links support topical authority. See [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md) for full audit.
+
+   ```markdown
+   ## Anchor Text Analysis
+   
+   ### Current Anchor Text Patterns
+   
+   **Most Used Anchors**:
+   
+   | Anchor Text | Count | Target Pages | Assessment |
+   |-------------|-------|--------------|------------|
+   | "click here" | [X] | [X] pages | ❌ Not descriptive |
+   | "read more" | [X] | [X] pages | ❌ Not descriptive |
+   | "[exact keyword]" | [X] | [page] | ⚠️ May be over-optimized |
+   | "[descriptive phrase]" | [X] | [page] | ✅ Good |
+   
+   ### Anchor Text Distribution by Page
+   
+   **Page: [Important URL]**
+   
+   | Anchor Text | Source Page | Status |
+   |-------------|-------------|--------|
+   | "[anchor 1]" | [source URL] | ✅/⚠️/❌ |
+   | "[anchor 2]" | [source URL] | ✅/⚠️/❌ |
+   
+   **Issues Found**:
+   - Over-optimized anchors: [X] instances
+   - Generic anchors: [X] instances
+   - Same anchor to multiple pages: [X] instances
+   
+   ### Anchor Text Recommendations
+   
+   **For Page: [URL]**
+   
+   Current: "[current anchor]" used [X] times
+   
+   Recommended variety:
+   - "[variation 1]" - Use from [page type]
+   - "[variation 2]" - Use from [page type]
+   - "[variation 3]" - Use from [page type]
+   
+   **Anchor Score**: [X]/10
+   ```
+
+4. **Create Topic Cluster Link Strategy** — Map current pillar/cluster links, recommend link structure, list specific links to add
+
+   > **Reference**: See [references/linking-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/linking-templates.md) for the topic cluster link strategy template (Step 4).
+
+5. **Find Contextual Link Opportunities** — Analyze each page for topic-relevant link opportunities, prioritize high-impact additions
+
+   > **Reference**: See [references/linking-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/linking-templates.md) for the contextual link opportunities template (Step 5).
+
+6. **Optimize Navigation and Footer Links** — Analyze main/footer/sidebar/breadcrumb navigation, recommend pages to add or remove
+
+   > **Reference**: See [references/linking-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/linking-templates.md) for the navigation optimization template (Step 6).
+
+7. **Generate Link Implementation Plan** — Executive summary, current state metrics, phased priority actions (weeks 1-4+), implementation guide, tracking plan
+
+   > **Reference**: See [references/linking-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/linking-templates.md) for the full implementation plan template (Step 7).
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Site structure or sitemap provided (URL or file)
+- [ ] Target pages or topic clusters clearly defined
+- [ ] If optimizing specific page, page URL or content provided
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] All link suggestions include source page, target page, and recommended anchor text
+- [ ] Orphan page lists include URLs and recommended actions
+- [ ] Source of each data point clearly stated (~~web crawler data, ~~analytics, user-provided, or manual analysis)
+
+## Example
+
+> **Reference**: See [references/linking-example.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/linking-example.md) for a full worked example (email marketing best practices internal linking opportunities).
+
+## Tips for Success
+
+1. **Quality over quantity** - Add relevant links, not random ones
+2. **User-first thinking** - Links should help users navigate
+3. **Vary anchor text** - Avoid over-optimization
+4. **Link to important pages** - Distribute authority strategically
+5. **Regular audits** - Internal links need maintenance as content grows
+
+
+### Save Results
+
+After delivering audit or optimization findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/audits/internal-linking-optimizer/YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+## Reference Materials
+
+- [Link Architecture Patterns](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/link-architecture-patterns.md) — Architecture models (hub-and-spoke, silo, flat, pyramid, mesh), anchor text diversity framework, link equity flow model, and internal link audit checklist
+- [Linking Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/linking-templates.md) — Detailed output templates for steps 6-7 (navigation optimization, implementation plan)
+- [Linking Example](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/references/linking-example.md) — Full worked example for internal linking opportunities
+
+## Next Best Skill
+
+- **Primary**: [on-page-seo-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/SKILL.md) — verify that revised internal links support the page-level goals.
diff --git a/.cursor/skills/internal-linking-optimizer/references/link-architecture-patterns.md b/.cursor/skills/internal-linking-optimizer/references/link-architecture-patterns.md
new file mode 100644
index 0000000000..24d2f3de2f
--- /dev/null
+++ b/.cursor/skills/internal-linking-optimizer/references/link-architecture-patterns.md
@@ -0,0 +1,362 @@
+# Link Architecture Patterns
+
+Detailed architecture models with implementation guides, migration strategies, and measurement frameworks for internal linking optimization.
+
+## Architecture Model Deep Dives
+
+### 1. Hub-and-Spoke (Topic Cluster) Model
+
+#### Overview
+
+The hub-and-spoke model organizes content around central "pillar" pages (hubs) that link to and from related "cluster" articles (spokes). This is the most widely recommended architecture for content-driven sites targeting topical authority.
+
+#### Structure Diagram
+
+```
+                    ┌──────────────┐
+                    │   Homepage   │
+                    └──────┬───────┘
+                           │
+            ┌──────────────┼──────────────┐
+            │              │              │
+     ┌──────▼──────┐ ┌────▼────┐  ┌──────▼──────┐
+     │  Hub A      │ │  Hub B  │  │  Hub C      │
+     │ (Pillar)    │ │(Pillar) │  │ (Pillar)    │
+     └──┬───┬───┬──┘ └────┬────┘  └──┬───┬───┬──┘
+        │   │   │         │          │   │   │
+       A1  A2  A3       B1  B2     C1  C2  C3
+        └───┼───┘               └───┼───┘
+         cross-links             cross-links
+```
+
+#### Implementation Steps
+
+1. **Identify 3-7 core topics** that define your business expertise
+2. **Create pillar pages** (2,000-5,000 words) that broadly cover each core topic
+3. **Map cluster articles** (800-2,000 words) that dive deep into subtopics
+4. **Implement bidirectional links**: every cluster article links to its pillar, every pillar links to all its clusters
+5. **Add cross-links** between related cluster articles within the same hub
+6. **Add bridge links** between hubs where subtopics overlap
+
+#### Link Rules
+
+| Link Type | Direction | Anchor Text Strategy |
+|-----------|-----------|---------------------|
+| Pillar → Cluster | Pillar links to each cluster | Descriptive: "learn about [subtopic]" |
+| Cluster → Pillar | Every cluster links back to pillar | Partial match: "our complete [topic] guide" |
+| Cluster ↔ Cluster | Between related clusters in same hub | Natural: "as we covered in [related article]" |
+| Hub ↔ Hub (bridge) | Between related pillar pages | Branded/natural: "see also our [topic] resource" |
+
+#### When to Use
+- Content marketing sites and blogs
+- SaaS companies building topical authority
+- Publishers covering defined topic areas
+- Any site with 50-500 content pages
+
+#### Measurement
+
+| Metric | Target | Tool |
+|--------|--------|------|
+| Pillar page rankings for head terms | Top 10 | Rank tracker |
+| Cluster article rankings for long-tail | Top 20 | Rank tracker |
+| Internal links per cluster article | 3-5 minimum | Crawl report |
+| Click depth from homepage to cluster | ≤3 clicks | Crawl report |
+| Organic traffic to hub pages | Month-over-month growth | Analytics |
+
+---
+
+### 2. Silo Structure
+
+#### Overview
+
+The silo model creates strict vertical hierarchies where content is organized into isolated "silos" (categories). Links flow vertically within a silo but rarely cross between silos. This concentrates topical relevance within each silo.
+
+#### Structure Diagram
+
+```
+                    ┌──────────────┐
+                    │   Homepage   │
+                    └──────┬───────┘
+                           │
+         ┌─────────────────┼─────────────────┐
+         │                 │                 │
+    ┌────▼─────┐     ┌────▼─────┐     ┌────▼─────┐
+    │  Silo A  │     │  Silo B  │     │  Silo C  │
+    │ Category │     │ Category │     │ Category │
+    └────┬─────┘     └────┬─────┘     └────┬─────┘
+         │                │                │
+    ┌────▼─────┐     ┌────▼─────┐     ┌────▼─────┐
+    │ Sub-cat  │     │ Sub-cat  │     │ Sub-cat  │
+    └────┬─────┘     └────┬─────┘     └────┬─────┘
+         │                │                │
+    ┌────▼────┐      ┌────▼────┐      ┌────▼────┐
+    │  Pages  │      │  Pages  │      │  Pages  │
+    └─────────┘      └─────────┘      └─────────┘
+
+    No horizontal links between silos (strict model)
+```
+
+#### Implementation Steps
+
+1. **Define 5-15 top-level categories** (silos) based on your product/service taxonomy
+2. **Create category landing pages** with overview content and links to subcategories
+3. **Build subcategory pages** linking down to individual product/content pages
+4. **Enforce vertical linking**: pages link up to their parent and down to their children
+5. **Use breadcrumbs** to reinforce the hierarchy visually and structurally
+6. **Limit cross-silo links** to only the most relevant connections (strict model) or allow them strategically (modified model)
+
+#### Link Rules
+
+| Link Type | Direction | Allowed? |
+|-----------|-----------|----------|
+| Parent → Child | Downward within silo | Always |
+| Child → Parent | Upward within silo | Always |
+| Sibling ↔ Sibling | Horizontal within same parent | Yes |
+| Cross-silo | Between different silos | Strict: No. Modified: Sparingly |
+| All pages → Homepage | Upward to root | Yes (via navigation) |
+
+#### When to Use
+- Large e-commerce sites (100+ product categories)
+- Directory sites with clear taxonomy
+- Sites where categories are truly distinct topics
+- Enterprises with separate business lines
+
+#### Limitations
+- Overly strict silos can trap link equity in one branch
+- Cross-topic content becomes difficult to place
+- Users may need to navigate up and over to find related content
+- Modified silo (allowing some cross-links) often works better in practice
+
+---
+
+### 3. Flat Architecture
+
+#### Overview
+
+A flat architecture keeps all pages within 2-3 clicks of the homepage. There is minimal hierarchy; instead, pages are broadly interlinked. This maximizes crawlability and distributes link equity evenly.
+
+#### Structure Diagram
+
+```
+              ┌──────────┐
+              │ Homepage │
+              └────┬─────┘
+                   │
+    ┌──────────────┼──────────────┐
+    │    │    │    │    │    │    │
+   P1   P2   P3   P4   P5   P6   P7
+    └────┼────┼────┼────┼────┼────┘
+         └────┴────┴────┘
+         (cross-linked freely)
+```
+
+#### Implementation Steps
+
+1. **Link all key pages from the homepage** (directly or via a comprehensive sitemap page)
+2. **Keep URL structure shallow**: /category/page, not /category/subcategory/year/page
+3. **Cross-link freely** between related pages at the same level
+4. **Use comprehensive navigation** menus, footer links, or HTML sitemaps
+5. **Limit total pages** to keep the architecture manageable
+
+#### When to Use
+- Small sites with fewer than 100 pages
+- Portfolio sites
+- Small business brochure sites
+- Startups with limited content
+
+#### Scaling Limits
+
+| Site Size | Flat Architecture Feasibility |
+|-----------|------------------------------|
+| <50 pages | Ideal |
+| 50-100 pages | Manageable with good navigation |
+| 100-500 pages | Difficult; consider hub-and-spoke |
+| 500+ pages | Not recommended; switch to hierarchical model |
+
+---
+
+### 4. Pyramid Architecture
+
+#### Overview
+
+The pyramid model mirrors traditional website hierarchies: a single homepage at the top, branching into categories, subcategories, and finally individual pages. Authority flows from top to bottom, concentrating at higher levels.
+
+#### Structure Diagram
+
+```
+Level 0:              Homepage
+                      /      \
+Level 1:        Category A    Category B
+                /    \          /    \
+Level 2:    Sub A1   Sub A2  Sub B1  Sub B2
+            / \      / \      / \     / \
+Level 3:  P1  P2   P3  P4   P5  P6  P7  P8
+```
+
+#### Implementation Steps
+
+1. **Design a clear hierarchy** with 3-4 levels maximum
+2. **Homepage links to all top-level categories** prominently
+3. **Category pages link to all subcategories** within them
+4. **Subcategory pages link to all child pages**
+5. **Implement breadcrumbs** to support the hierarchy
+6. **Add "related content" cross-links** at the page level to offset authority concentration
+
+#### Authority Flow Considerations
+
+| Level | Typical Authority | Action to Improve |
+|-------|-------------------|-------------------|
+| Homepage | Highest | Ensure links to priority categories are prominent |
+| Categories | High | Link from blog content, not just navigation |
+| Subcategories | Medium | Add contextual links from other sections |
+| Individual pages | Lowest | Cross-link, feature in "popular posts" widgets |
+
+#### When to Use
+- News and media sites
+- Large blogs (500+ posts)
+- Corporate sites with many divisions
+- Government/educational sites
+
+---
+
+### 5. Mesh/Matrix Architecture
+
+#### Overview
+
+The mesh model allows free-form linking between any related pages, regardless of hierarchy. Every page can link to any other relevant page. This creates a dense web of connections, similar to Wikipedia's link structure.
+
+#### Structure Diagram
+
+```
+    P1 ←──→ P2 ←──→ P3
+    ↕  ╲    ↕    ╱  ↕
+    P4 ←──→ P5 ←──→ P6
+    ↕  ╱    ↕    ╲  ↕
+    P7 ←──→ P8 ←──→ P9
+```
+
+#### Implementation Steps
+
+1. **Set linking rules** to prevent chaos: link only when topically relevant
+2. **Use contextual anchors** that describe the destination page
+3. **Set a link budget** per page (5-15 contextual links per 1,000 words)
+4. **Review link density regularly** to prune irrelevant connections
+5. **Maintain a link map** (spreadsheet or tool) to track the network
+
+#### Governance Rules
+
+| Rule | Purpose |
+|------|---------|
+| Every link must have topical relevance | Prevents link dilution |
+| Maximum 15 contextual links per 1,000 words | Prevents link farms |
+| Review links quarterly | Prunes outdated connections |
+| Use descriptive anchor text only | Maintains semantic value |
+| No reciprocal link trading between unrelated pages | Prevents manipulation patterns |
+
+#### When to Use
+- Knowledge bases and documentation sites
+- Wikis and encyclopedias
+- Research repositories
+- FAQ/help center sites
+
+---
+
+## Migration Between Models
+
+### Common Migration Paths
+
+| From | To | Reason | Difficulty |
+|------|----|--------|-----------|
+| Flat → Hub-and-Spoke | Site grew beyond 100 pages | Medium |
+| Silo → Hub-and-Spoke | Silos too rigid, need cross-topic links | Medium |
+| Pyramid → Hub-and-Spoke | Want to build topical clusters | High |
+| No structure → Any model | Starting from disorganized state | High |
+| Hub-and-Spoke → Hybrid | Need both clusters and strict categories | Medium |
+
+### Migration Steps (General)
+
+1. **Audit current state**: Map all existing internal links using a crawler
+2. **Design target architecture**: Choose model, map pages to their new positions
+3. **Create a link change plan**: Document every link addition, removal, and anchor text change
+4. **Implement in phases**: Start with highest-priority cluster/silo, then expand
+5. **Preserve existing equity**: Do not remove links that pass significant value without replacement
+6. **Monitor impact**: Track rankings and traffic for 4-8 weeks after each phase
+7. **Iterate**: Adjust the plan based on measured results
+
+### Migration Risk Mitigation
+
+| Risk | Mitigation |
+|------|-----------|
+| Temporary ranking drops | Migrate one section at a time, not all at once |
+| Broken internal links | Run crawl after each phase to verify |
+| Lost link equity | Ensure no orphan pages created during migration |
+| Anchor text disruption | Change anchors gradually, not all at once |
+
+---
+
+## Measurement Framework
+
+### Key Metrics by Architecture Model
+
+| Metric | Hub-and-Spoke | Silo | Flat | Pyramid | Mesh |
+|--------|---------------|------|------|---------|------|
+| Avg click depth | ≤3 | ≤4 | ≤2 | ≤4 | ≤3 |
+| Orphan pages | 0 | 0 | 0 | 0 | 0 |
+| Avg internal links per page | 5-10 | 3-7 | 8-15 | 3-5 | 8-15 |
+| Cross-section links | Many | Few | N/A | Some | Many |
+| Authority concentration | Distributed to hubs | Concentrated in silo tops | Even | Top-heavy | Even |
+
+### Monthly Monitoring Checklist
+
+| Check | Tool | Action if Failing |
+|-------|------|-------------------|
+| Orphan pages count | Crawl report | Add internal links immediately |
+| Average click depth | Crawl report | Add shortcuts to deep pages |
+| Crawl depth distribution | Crawl report | Flatten deep branches |
+| Internal link count per page | Crawl report | Add links to under-linked pages |
+| Anchor text diversity | Manual audit | Vary anchors for over-optimized pages |
+| Broken internal links | Crawl report | Fix or remove broken links |
+| New content linked within 48 hours | Editorial process | Add to related pages upon publishing |
+
+### ROI Estimation
+
+| Architecture Change | Typical Impact | Timeline to See Results |
+|--------------------|---------------|----------------------|
+| Fix orphan pages | +15-30% traffic to those pages | 2-4 weeks |
+| Build first topic cluster | +10-25% traffic to cluster pages | 4-8 weeks |
+| Reduce click depth by 1 level | +5-15% crawl efficiency | 2-6 weeks |
+| Anchor text optimization | +5-10% ranking improvement for target terms | 4-12 weeks |
+| Full architecture migration | +20-50% overall organic traffic | 3-6 months |
+
+---
+
+## Hybrid Architecture Strategies
+
+Most real-world sites combine elements from multiple models. Common hybrid patterns:
+
+### Hub-and-Spoke + Silo (Recommended for Medium-Large Sites)
+
+```
+Homepage
+  ├── Category Silo A
+  │     ├── Hub A1 (pillar) ←→ Cluster articles
+  │     └── Hub A2 (pillar) ←→ Cluster articles
+  ├── Category Silo B
+  │     ├── Hub B1 (pillar) ←→ Cluster articles
+  │     └── Hub B2 (pillar) ←→ Cluster articles
+  └── Cross-category bridge links (A1 ↔ B2 where relevant)
+```
+
+- **Silos** provide category organization for navigation and URL structure
+- **Hubs** within each silo build topical authority for specific keyword clusters
+- **Bridge links** connect related content across silos where user intent overlaps
+
+### Implementation Priority Order
+
+1. Fix structural issues first (orphan pages, broken links)
+2. Implement primary architecture model
+3. Add cross-linking strategy
+4. Optimize anchor text
+5. Monitor and iterate
+
+This order ensures each phase builds on a solid foundation rather than optimizing details on a broken structure.
diff --git a/.cursor/skills/internal-linking-optimizer/references/linking-example.md b/.cursor/skills/internal-linking-optimizer/references/linking-example.md
new file mode 100644
index 0000000000..ef8703eea6
--- /dev/null
+++ b/.cursor/skills/internal-linking-optimizer/references/linking-example.md
@@ -0,0 +1,42 @@
+# Internal Linking Optimizer — Worked Example
+
+Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/SKILL.md).
+
+---
+
+## Worked Example
+
+**User**: "Find internal linking opportunities for my blog post about 'email marketing best practices'"
+
+**Output**:
+
+```markdown
+## Internal Linking Opportunities
+
+**Page**: /blog/email-marketing-best-practices/
+**Current Internal Links**: 2
+
+### Recommended Links to Add
+
+| Section | Text to Link | Target Page | Anchor |
+|---------|--------------|-------------|--------|
+| Para 2 | "building your email list" | /blog/grow-email-list/ | "building your email list" |
+| Para 5 | "subject lines" | /blog/email-subject-lines/ | "write compelling subject lines" |
+| Section on segmentation | "audience segments" | /blog/email-segmentation-guide/ | "segment your audience" |
+| CTA section | "marketing automation" | /services/email-automation/ | "email automation services" |
+| Conclusion | "email marketing tools" | /blog/best-email-tools/ | "top email marketing tools" |
+
+### Pages That Should Link TO This Article
+
+| Source Page | Location | Anchor Text |
+|-------------|----------|-------------|
+| /blog/digital-marketing-guide/ | Email section | "email marketing best practices" |
+| /services/marketing-services/ | Related content | "email marketing strategies" |
+| /blog/lead-generation-tips/ | Email mention | "email marketing techniques" |
+
+### Priority Actions
+
+1. Add 5 outbound internal links (listed above)
+2. Request 3 inbound links from related pages
+3. Add to "Marketing" category page
+```
diff --git a/.cursor/skills/internal-linking-optimizer/references/linking-templates.md b/.cursor/skills/internal-linking-optimizer/references/linking-templates.md
new file mode 100644
index 0000000000..ce5e1e4c40
--- /dev/null
+++ b/.cursor/skills/internal-linking-optimizer/references/linking-templates.md
@@ -0,0 +1,219 @@
+# Internal Linking Optimizer — Output Templates
+
+Detailed output templates for internal-linking-optimizer steps 4-7. Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/internal-linking-optimizer/SKILL.md).
+
+---
+
+## Step 4: Create Topic Cluster Link Strategy
+
+```markdown
+## Topic Cluster Internal Linking
+
+### Cluster: [Main Topic]
+
+**Pillar Page**: [URL]
+**Cluster Articles**: [X]
+
+### Current Link Map
+
+```
+[Pillar Page]
+   ├── [Cluster Article 1] ←→ [linked?]
+   ├── [Cluster Article 2] ←→ [linked?]
+   ├── [Cluster Article 3] ←→ [linked?]
+   └── [Cluster Article 4] ←→ [linked?]
+```
+
+### Recommended Link Structure
+
+```
+[Pillar Page]
+   ├── Links TO all cluster articles ✅
+   │
+   ├── [Cluster Article 1]
+   │   ├── Link TO pillar ✅
+   │   └── Link TO related cluster articles
+   │
+   ├── [Cluster Article 2]
+   │   ├── Link TO pillar ✅
+   │   └── Link TO related cluster articles
+   │
+   └── [etc.]
+```
+
+### Links to Add
+
+| From Page | To Page | Anchor Text | Location |
+|-----------|---------|-------------|----------|
+| [URL 1] | [URL 2] | "[anchor]" | [paragraph/section] |
+| [URL 2] | [URL 3] | "[anchor]" | [paragraph/section] |
+| [Pillar] | [Cluster 1] | "[anchor]" | [section] |
+```
+
+---
+
+## Step 5: Find Contextual Link Opportunities
+
+```markdown
+## Contextual Link Opportunities
+
+### Link Opportunity Analysis
+
+For each page, find relevant pages to link to based on:
+- Topic relevance
+- Keyword overlap
+- User journey logic
+- Authority distribution needs
+
+### Opportunities Found
+
+**Page: [URL 1]**
+**Topic**: [topic]
+**Current internal links**: [X]
+
+| Opportunity | Target Page | Anchor Text | Why Link |
+|-------------|-------------|-------------|----------|
+| Paragraph 2 mentions "[topic]" | [URL] | "[topic phrase]" | Topic match |
+| Section on "[subject]" | [URL] | "[anchor]" | Related guide |
+| CTA at end | [URL] | "[anchor]" | User journey |
+
+**Page: [URL 2]**
+[Continue for each page...]
+
+### Priority Link Additions
+
+**High Impact Links** (add these first):
+
+1. **From**: [Source URL]
+   **To**: [Target URL]
+   **Anchor**: "[anchor text]"
+   **Why**: [reason - e.g., "Target page needs authority boost"]
+   **Where to add**: [specific location in content]
+
+2. **From**: [Source URL]
+   **To**: [Target URL]
+   [etc.]
+```
+
+---
+
+## Step 6: Optimize Navigation and Footer Links
+
+```markdown
+## Site-Wide Link Optimization
+
+### Current Navigation Analysis
+
+**Main Navigation**:
+- Links present: [list]
+- Missing important pages: [list]
+- Too many links: [Yes/No]
+
+**Footer Navigation**:
+- Links present: [list]
+- SEO value: [assessment]
+
+### Navigation Recommendations
+
+| Element | Current | Recommended | Reason |
+|---------|---------|-------------|--------|
+| Main nav | [X] links | [Y] links | [reason] |
+| Footer | [X] links | [Y] links | [reason] |
+| Sidebar | [status] | [recommendation] | [reason] |
+| Breadcrumbs | [status] | [recommendation] | [reason] |
+
+### Pages to Add to Navigation
+
+1. [Page] - Add to [location] because [reason]
+2. [Page] - Add to [location] because [reason]
+
+### Pages to Remove from Navigation
+
+1. [Page] - Move to [footer/remove] because [reason]
+```
+
+---
+
+## Step 7: Generate Link Implementation Plan
+
+```markdown
+# Internal Linking Optimization Plan
+
+**Site**: [domain]
+**Analysis Date**: [date]
+
+## Executive Summary
+
+- Total link opportunities found: [X]
+- Orphan pages to fix: [X]
+- Estimated traffic impact: [+X%]
+- Priority actions: [X]
+
+## Current State
+
+| Metric | Current | Target | Gap |
+|--------|---------|--------|-----|
+| Avg links per page | [X] | [X] | [X] |
+| Orphan pages | [X] | 0 | [X] |
+| Over-optimized anchors | [X]% | <10% | [X]% |
+| Topic cluster coverage | [X]% | 100% | [X]% |
+
+## Priority Actions
+
+### Phase 1: Critical Fixes (Week 1)
+
+**Fix Orphan Pages**:
+- [ ] [URL] - Add links from [X] pages
+- [ ] [URL] - Add links from [X] pages
+
+**High-Value Link Additions**:
+- [ ] Link [Page A] to [Page B] with "[anchor]"
+- [ ] Link [Page A] to [Page C] with "[anchor]"
+
+### Phase 2: Topic Clusters (Week 2-3)
+
+**Cluster 1: [Topic]**
+- [ ] Ensure pillar links to all [X] cluster articles
+- [ ] Add [X] cross-links between cluster articles
+
+**Cluster 2: [Topic]**
+- [ ] [Tasks]
+
+### Phase 3: Optimization (Week 4+)
+
+**Anchor Text Diversity**:
+- [ ] Vary anchors for [Page] - currently [X]% exact match
+- [ ] [Additional tasks]
+
+**Navigation Updates**:
+- [ ] Add [Page] to main navigation
+- [ ] Update footer links
+
+## Implementation Guide
+
+### Adding Internal Links
+
+Best practices:
+1. Add links contextually within content
+2. Use descriptive anchor text (not "click here")
+3. Link to relevant, helpful pages
+4. Aim for 3-10 internal links per 1,000 words
+5. Vary anchor text for the same target
+
+### Anchor Text Guidelines
+
+| Type | Example | Usage |
+|------|---------|-------|
+| Exact match | "keyword research" | 10-20% |
+| Partial match | "tips for keyword research" | 30-40% |
+| Branded | "Brand's guide to..." | 10-20% |
+| Natural | "this article", "learn more" | 20-30% |
+
+## Tracking Success
+
+Monitor these metrics weekly:
+- [ ] Rankings for target keywords
+- [ ] Traffic to previously orphan pages
+- [ ] Crawl stats in ~~search console
+- [ ] Internal link distribution changes
+```
diff --git a/.cursor/skills/keyword-research/SKILL.md b/.cursor/skills/keyword-research/SKILL.md
new file mode 100644
index 0000000000..fdbb208b11
--- /dev/null
+++ b/.cursor/skills/keyword-research/SKILL.md
@@ -0,0 +1,435 @@
+---
+name: keyword-research
+description: 'Find high-value SEO keywords: search volume, difficulty, intent classification, topic clusters. 关键词研究/内容选题'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when starting keyword research for a new page, topic, or campaign. Also when the user asks about search volume, keyword difficulty, topic clusters, long-tail keywords, or what to write about."
+argument-hint: "<topic or seed keyword> [market/language]"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - geo
+    - keywords
+    - keyword-research
+    - search-volume
+    - keyword-difficulty
+    - topic-clusters
+    - long-tail-keywords
+    - search-intent
+    - content-calendar
+    - ahrefs
+    - semrush
+    - google-keyword-planner
+    - 关键词研究
+    - SEO关键词
+    - キーワード調査
+    - 키워드분석
+    - palabras-clave
+  triggers:
+    # EN-formal
+    - "keyword research"
+    - "find keywords"
+    - "keyword analysis"
+    - "keyword discovery"
+    - "search volume analysis"
+    - "keyword difficulty"
+    - "topic research"
+    - "identify ranking opportunities"
+    # EN-casual
+    - "what should I write about"
+    - "what are people searching for"
+    - "what are people googling"
+    - "find me topics to write"
+    - "give me keyword ideas"
+    - "which keywords should I target"
+    - "why is my traffic low"
+    - "I need content ideas"
+    # EN-question
+    - "how do I find good keywords"
+    - "what keywords should I target"
+    - "how competitive is this keyword"
+    # EN-competitor
+    - "Ahrefs keyword explorer alternative"
+    - "Semrush keyword magic tool"
+    - "Google Keyword Planner alternative"
+    - "Ubersuggest alternative"
+    # ZH-pro
+    - "关键词研究"
+    - "关键词分析"
+    - "搜索量查询"
+    - "关键词难度"
+    - "SEO关键词"
+    - "长尾关键词"
+    - "词库整理"
+    - "关键词布局"
+    - "关键词挖掘"
+    # ZH-casual
+    - "写什么内容好"
+    - "找选题"
+    - "帮我挖词"
+    - "不知道写什么"
+    - "查关键词"
+    - "选词"
+    - "帮我找词"
+    # JA
+    - "キーワード調査"
+    - "キーワードリサーチ"
+    - "SEOキーワード分析"
+    - "検索ボリューム"
+    - "ロングテールキーワード"
+    - "検索意図分析"
+    # KO
+    - "키워드 리서치"
+    - "키워드 분석"
+    - "검색량 분석"
+    - "키워드 어떻게 찾아요?"
+    - "검색어 분석"
+    - "경쟁도 낮은 키워드는?"
+    # ES
+    - "investigación de palabras clave"
+    - "análisis de palabras clave"
+    - "volumen de búsqueda"
+    - "posicionamiento web"
+    - "cómo encontrar palabras clave"
+    # PT
+    - "pesquisa de palavras-chave"
+    # Misspellings
+    - "keywrod research"
+    - "keywork research"
+---
+
+# Keyword Research
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This research skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+Discovers, analyzes, and prioritizes keywords for SEO and GEO content strategies. Identifies high-value opportunities based on search volume, competition, intent, and business relevance.
+
+**System role**: Research layer skill. It turns market signals into reusable strategic inputs for the rest of the library.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs reusable market intelligence that should influence strategy, not just an ad hoc answer.
+
+- Starting a new content strategy or campaign
+- Expanding into new topics or markets
+- Finding keywords for a specific product or service
+- Identifying long-tail keyword opportunities
+- Understanding search intent for your industry
+- Planning content calendars
+- Researching keywords for GEO optimization
+
+## What This Skill Does
+
+1. **Keyword Discovery**: Generates comprehensive keyword lists from seed terms
+2. **Intent Classification**: Categorizes keywords by user intent (informational, navigational, commercial, transactional)
+3. **Difficulty Assessment**: Evaluates competition level and ranking difficulty
+4. **Opportunity Scoring**: Prioritizes keywords by potential ROI
+5. **Clustering**: Groups related keywords into topic clusters
+6. **GEO Relevance**: Identifies keywords likely to trigger AI responses
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Basic Keyword Research
+
+```
+Research keywords for [topic/product/service]
+```
+
+```
+Find keyword opportunities for a [industry] business targeting [audience]
+```
+
+### With Specific Goals
+
+```
+Find low-competition keywords for [topic] with commercial intent
+```
+
+```
+Identify question-based keywords for [topic] that AI systems might answer
+```
+
+### Competitive Research
+
+```
+What keywords is [competitor URL] ranking for that I should target?
+```
+
+## Skill Contract
+
+**Expected output**: a prioritized research brief, evidence-backed findings, and a short handoff summary ready for `memory/research/`.
+
+- **Reads**: user goals, target market inputs, available tool data, and prior strategy from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing research deliverable plus a reusable summary that can be stored under `memory/research/`.
+- **Promotes**: durable keyword priorities, competitor facts, entity candidates, and strategic decisions to `CLAUDE.md`, `memory/decisions.md`, and `memory/research/`; hand canonical entity work to `entity-optimizer`.
+- **Next handoff**: use the `Next Best Skill` below when the findings are ready to drive action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~search console connected:**
+Automatically pull historical search volume data, keyword difficulty scores, SERP analysis, current rankings from ~~search console, and competitor keyword overlap. The skill will fetch seed keyword metrics, related keyword suggestions, and search trend data.
+
+**With manual data only:**
+Ask the user to provide:
+1. Seed keywords or topic description
+2. Target audience and geographic location
+3. Business goals (traffic, leads, sales)
+4. Current domain authority (if known) or site age
+5. Any known keyword performance data or search volume estimates
+
+Proceed with the full analysis using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests keyword research:
+
+At the start of each phase, announce: **[Phase X/8: Name]** so the user can track progress.
+
+### Phase 1/8: Scope
+
+   Ask clarifying questions if not provided:
+   - What is your product/service/topic?
+   - Who is your target audience?
+   - What is your business goal? (traffic, leads, sales)
+   - What is your current domain authority? (new site, established, etc.)
+   - Any specific geographic targeting?
+   - Preferred language?
+
+### Phase 2/8: Discover
+
+   Start with:
+   - Core product/service terms
+   - Problem-focused keywords (what issues do you solve?)
+   - Solution-focused keywords (how do you help?)
+   - Audience-specific terms
+   - Industry terminology
+
+### Phase 3/8: Variations
+
+   For each seed keyword, generate variations:
+   
+   ```markdown
+   ## Keyword Expansion Patterns
+   
+   ### Modifiers
+   - Best [keyword]
+   - Top [keyword]
+   - [keyword] for [audience]
+   - [keyword] near me
+   - [keyword] [year]
+   - How to [keyword]
+   - What is [keyword]
+   - [keyword] vs [alternative]
+   - [keyword] examples
+   - [keyword] tools
+   
+   ### Long-tail Variations
+   - [keyword] for beginners
+   - [keyword] for small business
+   - Free [keyword]
+   - [keyword] software/tool/service
+   - [keyword] template
+   - [keyword] checklist
+   - [keyword] guide
+   ```
+
+### Phase 4/8: Classify
+
+   Categorize each keyword:
+
+   | Intent | Signals | Example | Content Type |
+   |--------|---------|---------|--------------|
+   | Informational | what, how, why, guide, learn | "what is SEO" | Blog posts, guides |
+   | Navigational | brand names, specific sites | "google analytics login" | Homepage, product pages |
+   | Commercial | best, review, vs, compare | "best SEO tools [current year]" | Comparison posts, reviews |
+   | Transactional | buy, price, discount, order | "buy SEO software" | Product pages, pricing |
+
+### Phase 5/8: Score
+
+   Score each keyword (1-100 scale):
+
+   ```markdown
+   ### Difficulty Factors
+   
+   **High Difficulty (70-100)**
+   - Major brands ranking
+   - High domain authority competitors
+   - Established content (1000+ backlinks)
+   - Paid ads dominating SERP
+   
+   **Medium Difficulty (40-69)**
+   - Mix of authority and niche sites
+   - Some opportunities for quality content
+   - Moderate backlink requirements
+   
+   **Low Difficulty (1-39)**
+   - Few authoritative competitors
+   - Thin or outdated content ranking
+   - Long-tail variations
+   - New or emerging topics
+   ```
+
+#### Opportunity Score
+
+   Formula: `Opportunity = (Volume × Intent Value) / Difficulty`
+
+   **Intent Value** assigns a numeric weight by search intent:
+   - Informational = 1
+   - Navigational = 1
+   - Commercial = 2
+   - Transactional = 3
+
+   ```markdown
+   ### Opportunity Matrix
+   
+   | Scenario | Volume | Difficulty | Intent | Priority |
+   |----------|--------|------------|--------|----------|
+   | Quick Win | Low-Med | Low | High | ⭐⭐⭐⭐⭐ |
+   | Growth | High | Medium | High | ⭐⭐⭐⭐ |
+   | Long-term | High | High | High | ⭐⭐⭐ |
+   | Research | Low | Low | Low | ⭐⭐ |
+   ```
+
+### Phase 6/8: GEO-Check — AI Answer Overlap
+
+   Keywords likely to trigger AI responses:
+   
+   ```markdown
+   ### GEO-Relevant Keywords
+   
+   **High GEO Potential**
+   - Question formats: "What is...", "How does...", "Why is..."
+   - Definition queries: "[term] meaning", "[term] definition"
+   - Comparison queries: "[A] vs [B]", "difference between..."
+   - List queries: "best [category]", "top [number] [items]"
+   - How-to queries: "how to [action]", "steps to [goal]"
+   
+   **AI Answer Indicators**
+   - Query is factual/definitional
+   - Answer can be summarized concisely
+   - Topic is well-documented online
+   - Low commercial intent
+   ```
+
+### Phase 7/8: Cluster
+
+   Group keywords into content clusters:
+
+   ```markdown
+   ## Topic Cluster: [Main Topic]
+   
+   **Pillar Content**: [Primary keyword]
+   - Search volume: [X]
+   - Difficulty: [X]
+   - Content type: Comprehensive guide
+   
+   **Cluster Content**:
+   
+   ### Sub-topic 1: [Secondary keyword]
+   - Volume: [X]
+   - Difficulty: [X]
+   - Links to: Pillar
+   - Content type: [Blog post/Tutorial/etc.]
+   
+   ### Sub-topic 2: [Secondary keyword]
+   - Volume: [X]
+   - Difficulty: [X]
+   - Links to: Pillar + Sub-topic 1
+   - Content type: [Blog post/Tutorial/etc.]
+   
+   [Continue for all cluster keywords...]
+   ```
+
+### Phase 8/8: Deliver
+
+   Produce a report containing: Executive Summary, Top Keyword Opportunities (Quick Wins, Growth, GEO), Topic Clusters, Content Calendar, and Next Steps.
+
+   **Quality bar** — every recommendation must include at least one specific number. If it reads like the left column, rewrite it before including.
+
+   | ❌ Generic (rewrite before including) | ✅ Actionable |
+   |---|---|
+   | "Target long-tail keywords for better results" | "Target 'project management for nonprofits' (vol: 320, KD: 22) — no DR>40 sites in top 10" |
+   | "This keyword has good potential" | "Opportunity 8.4: vol 4,800, KD 28, transactional intent — gap analysis shows no content updated since 2023 in top 5" |
+   | "Consider creating content around this topic" | "Write '[Tool A] vs [Tool B] for small teams' — 1,200/mo searches, current #1 is a 2022 article with 12 backlinks" |
+   | "Optimize your page for this keyword" | "Add primary keyword to H1 (currently missing), write a 40-word direct answer in paragraph 1, add 3 internal links from your /blog/ cluster" |
+
+   > **Reference**: See [references/example-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/keyword-research/references/example-report.md) for the full report template and example.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Seed keywords or topic description clearly provided
+- [ ] Target audience and business goals specified
+- [ ] Geographic and language targeting confirmed
+- [ ] Domain authority or site maturity level established
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] Search volume and difficulty scores included for each keyword
+- [ ] Keywords grouped by intent and mapped to content types
+- [ ] Topic clusters show clear pillar-to-cluster relationships
+- [ ] Source of each data point clearly stated (~~SEO tool data, user-provided, or estimated)
+
+## Example
+
+> **Reference**: See [references/example-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/keyword-research/references/example-report.md) for a complete example report for "project management software for small businesses".
+
+### Advanced Usage
+
+- **Intent Mapping**: `Map all keywords for [topic] by search intent and funnel stage`
+- **Seasonal Analysis**: `Identify seasonal keyword trends for [industry]`
+- **Competitor Gap**: `What keywords do [competitor 1], [competitor 2] rank for that I'm missing?`
+- **Local Keywords**: `Research local keywords for [business type] in [city/region]`
+
+## Tips for Success
+
+1. **Start with seed keywords** that describe your core offering
+2. **Don't ignore long-tail** - they often have highest conversion rates
+3. **Match content to intent** - informational queries need guides, not sales pages
+4. **Group into clusters** for topical authority
+5. **Prioritize quick wins** to build momentum and credibility
+6. **Include GEO keywords** in your strategy for AI visibility
+7. **Review quarterly** - keyword dynamics change over time
+
+
+
+### Save Results
+
+After delivering findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/research/keyword-research/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Keyword Intent Taxonomy](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/keyword-research/references/keyword-intent-taxonomy.md) — Complete intent classification with signal words and content strategies
+- [Topic Cluster Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/keyword-research/references/topic-cluster-templates.md) — Hub-and-spoke architecture templates for pillar and cluster content
+- [Keyword Prioritization Framework](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/keyword-research/references/keyword-prioritization-framework.md) — Priority scoring matrix, categories, and seasonal keyword patterns
+- [Example Report](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/keyword-research/references/example-report.md) — Complete example keyword research report for project management software
+
+## Next Best Skill
+
+- **Primary**: [competitor-analysis](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/competitor-analysis/SKILL.md) — turn keyword opportunities into a competitive benchmark.
diff --git a/.cursor/skills/keyword-research/references/example-report.md b/.cursor/skills/keyword-research/references/example-report.md
new file mode 100644
index 0000000000..3c0a0f495a
--- /dev/null
+++ b/.cursor/skills/keyword-research/references/example-report.md
@@ -0,0 +1,96 @@
+# Keyword Research — Example Report
+
+This is a complete example output showing the expected format for keyword research deliverables.
+
+## Prompt
+
+**User**: "Research keywords for a project management software company targeting small businesses"
+
+## Example Output
+
+```markdown
+# Keyword Research Report: Project Management Software
+
+**Generated**: [current month and year]
+**Target Audience**: Small business owners and teams
+**Business Goal**: Software signups and trials
+
+## Executive Summary
+
+- Total keywords analyzed: 150+
+- High-priority opportunities: 23
+- Estimated traffic potential: 45,000/month
+- Recommended focus areas:
+  - Task management workflows
+  - Team collaboration
+  - Small business productivity
+
+## Top Keyword Opportunities
+
+### Quick Wins (Priority: Immediate)
+
+| Keyword | Volume | Difficulty | Intent | Score |
+|---------|--------|------------|--------|-------|
+| project management for small teams | 1,200 | 28 | Commercial | 92 |
+| simple task management software | 890 | 25 | Commercial | 89 |
+| best free project management tool | 2,400 | 35 | Commercial | 85 |
+| how to manage remote team projects | 720 | 22 | Informational | 82 |
+| project tracking spreadsheet alternative | 480 | 18 | Commercial | 80 |
+
+### Growth Keywords (Priority: 3-6 months)
+
+| Keyword | Volume | Difficulty | Intent | Score |
+|---------|--------|------------|--------|-------|
+| project management software | 18,000 | 72 | Commercial | 65 |
+| best project management tools [current year] | 8,500 | 65 | Commercial | 62 |
+| project management app | 12,000 | 68 | Commercial | 58 |
+
+### GEO Opportunities (AI-citation potential)
+
+| Keyword | Type | AI Potential | Recommended Format |
+|---------|------|--------------|-------------------|
+| what is project management | Definition | ⭐⭐⭐⭐⭐ | Clear definition + methodology |
+| agile vs waterfall | Comparison | ⭐⭐⭐⭐⭐ | Side-by-side comparison table |
+| project management methodologies | List | ⭐⭐⭐⭐ | Comprehensive list with pros/cons |
+| how to create a project plan | How-to | ⭐⭐⭐⭐ | Step-by-step guide |
+| project management best practices | List | ⭐⭐⭐⭐ | Numbered best practices |
+
+## Topic Clusters
+
+### Cluster 1: Project Management Fundamentals
+
+**Pillar**: "Complete Guide to Project Management" (8,500 volume)
+
+Cluster articles:
+1. What is project management? (2,200 volume)
+2. Project management methodologies explained (1,800 volume)
+3. How to create a project plan (1,400 volume)
+4. Project management best practices (1,200 volume)
+5. Project management roles and responsibilities (890 volume)
+
+### Cluster 2: Team Collaboration
+
+**Pillar**: "Team Collaboration Tools Guide" (4,200 volume)
+
+Cluster articles:
+1. How to improve team communication (1,600 volume)
+2. Remote team management tips (1,400 volume)
+3. Best practices for distributed teams (920 volume)
+4. Team productivity tools comparison (780 volume)
+
+## Content Calendar Recommendations
+
+| Month | Content | Target Keyword | Type |
+|-------|---------|----------------|------|
+| Week 1 | Simple Task Management Guide | simple task management software | Blog + Demo |
+| Week 2 | Project Management for Small Teams | project management for small teams | Pillar Page |
+| Week 3 | Agile vs Waterfall: Complete Comparison | agile vs waterfall | Comparison |
+| Week 4 | Free PM Tools Roundup | best free project management tool | Listicle |
+
+## Next Steps
+
+1. **Immediate**: Create landing pages for top 5 quick-win keywords
+2. **Week 1-2**: Write pillar content for "Project Management Fundamentals"
+3. **Week 3-4**: Build out cluster content with internal linking
+4. **Ongoing**: Track rankings and adjust strategy based on performance
+```
diff --git a/.cursor/skills/keyword-research/references/keyword-intent-taxonomy.md b/.cursor/skills/keyword-research/references/keyword-intent-taxonomy.md
new file mode 100644
index 0000000000..7e5f362506
--- /dev/null
+++ b/.cursor/skills/keyword-research/references/keyword-intent-taxonomy.md
@@ -0,0 +1,324 @@
+# Keyword Intent Taxonomy
+
+A comprehensive classification system for understanding search intent, mapping keywords to content strategies, and aligning with conversion funnels.
+
+## Overview
+
+Search intent (also called user intent or query intent) is the underlying goal a user has when typing a query into a search engine or AI system. Correctly classifying intent determines content format, tone, depth, and conversion strategy.
+
+This taxonomy defines **4 primary intent types**, each with **3-4 sub-categories**, along with signal words, SERP feature correlations, and content format recommendations.
+
+---
+
+## Primary Intent Types
+
+### 1. Informational Intent
+
+The user wants to **learn something**. They are seeking knowledge, not a product.
+
+#### Sub-categories
+
+| Sub-category | Description | Example Queries |
+|-------------|------------|----------------|
+| **Educational** | Broad learning about a topic | "what is SEO", "content marketing explained", "how search engines work" |
+| **Instructional** | Step-by-step guidance on completing a task | "how to set up Google Analytics", "how to write a meta description", "how to do keyword research" |
+| **Exploratory** | Open-ended research to understand options | "types of SEO strategies", "content marketing trends", "link building techniques" |
+| **Troubleshooting** | Diagnosing and fixing a problem | "why is my site not ranking", "Google indexing issues", "404 error fix" |
+
+#### Signal Words and Modifiers
+
+**Strong informational signals:**
+- what, what is, what are, what does
+- how, how to, how does, how do I
+- why, why is, why does, why do
+- when, when to, when should
+- who, who is, who does
+- where, where to, where can I
+- guide, tutorial, learn, explain
+- examples, tips, ideas, ways
+- definition, meaning, difference between
+- history of, basics of, introduction to
+- pros and cons, advantages, disadvantages
+- best practices, checklist, steps
+
+**Weak informational signals (may overlap with commercial):**
+- overview, comparison, review (can be commercial)
+- list of, types of, methods
+
+#### SERP Feature Correlation
+
+| SERP Feature | Frequency for Informational | Notes |
+|-------------|---------------------------|-------|
+| Featured Snippet | Very High | Especially for "what is" and "how to" queries |
+| People Also Ask | Very High | Almost always present for informational queries |
+| Knowledge Panel | High | For entity-related informational queries |
+| AI Overview | Very High | AI systems prioritize informational answers |
+| Video Carousel | Medium-High | Especially for instructional queries |
+| Image Pack | Medium | For visual/diagram-seeking queries |
+| Shopping Results | Very Low | Rarely appears for pure informational |
+
+#### Content Format Recommendations
+
+| Sub-category | Best Formats | Target Length | Key Elements |
+|-------------|-------------|--------------|-------------|
+| Educational | Definitive guides, explainers, glossary entries | 1,500-3,000 words | Clear definitions, examples, visuals |
+| Instructional | Step-by-step tutorials, how-to guides, video walkthroughs | 2,000-4,000 words | Numbered steps, screenshots, code samples |
+| Exploratory | Listicles, roundups, trend reports, comparison tables | 2,000-3,500 words | Categories, pros/cons, use cases |
+| Troubleshooting | FAQ pages, debug guides, knowledge base articles | 800-2,000 words | Problem-solution pairs, error codes, screenshots |
+
+---
+
+### 2. Navigational Intent
+
+The user wants to **find a specific website, page, or resource**. They already know where they want to go.
+
+#### Sub-categories
+
+| Sub-category | Description | Example Queries |
+|-------------|------------|----------------|
+| **Brand Search** | Looking for a specific company or product | "Ahrefs", "HubSpot CRM", "Moz Pro" |
+| **Product/Feature Search** | Looking for a specific feature or tool within a known brand | "Ahrefs keyword explorer", "Google Search Console coverage report" |
+| **Login/Access** | Trying to access an account or dashboard | "Ahrefs login", "Google Analytics sign in", "SEMrush dashboard" |
+| **Support/Docs** | Seeking help documentation for a known product | "Ahrefs API documentation", "Google Search Console help" |
+
+#### Signal Words and Modifiers
+
+**Strong navigational signals:**
+- [brand name] (standalone)
+- [brand name] login, sign in, sign up
+- [brand name] pricing, plans
+- [brand name] support, help, docs
+- [brand name] [feature name]
+- [brand name] download, app
+- official, website, site
+- [brand name] alternatives (borderline commercial)
+
+#### SERP Feature Correlation
+
+| SERP Feature | Frequency for Navigational | Notes |
+|-------------|---------------------------|-------|
+| Sitelinks | Very High | Brand queries almost always show sitelinks |
+| Knowledge Panel | High | For established brands with Google entity |
+| Featured Snippet | Low | Rarely for pure navigational |
+| People Also Ask | Medium | Often brand-related questions |
+| AI Overview | Low | AI usually defers to direct navigation |
+
+#### Content Format Recommendations
+
+| Sub-category | Best Formats | Key Elements |
+|-------------|-------------|-------------|
+| Brand Search | Homepage, about page, landing pages | Clear branding, value prop, navigation |
+| Product/Feature Search | Product pages, feature pages | Feature details, CTAs, screenshots |
+| Login/Access | Login page, getting started guide | Fast load, clear login form, password reset |
+| Support/Docs | Knowledge base, documentation hub | Search functionality, categorized articles |
+
+---
+
+### 3. Commercial Investigation Intent
+
+The user is **researching before making a purchase decision**. They are comparing options, reading reviews, and evaluating alternatives.
+
+#### Sub-categories
+
+| Sub-category | Description | Example Queries |
+|-------------|------------|----------------|
+| **Comparison** | Evaluating two or more specific options | "Ahrefs vs SEMrush", "WordPress vs Squarespace for SEO" |
+| **Review-seeking** | Looking for opinions and evaluations | "Ahrefs review 2026", "best SEO tool reviews", "is Moz worth it" |
+| **Best-of Research** | Seeking curated top options in a category | "best SEO tools", "top keyword research tools", "best free backlink checkers" |
+| **Evaluation** | Assessing fit for a specific need | "SEO tools for small business", "enterprise SEO platform features", "SEO tool for agencies" |
+
+#### Signal Words and Modifiers
+
+**Strong commercial signals:**
+- best, top, leading
+- vs, versus, compared to, comparison
+- review, reviews, rating, ratings
+- alternative, alternatives to
+- like [brand], similar to
+- for [audience], for [use case]
+- pros and cons (when about products)
+- worth it, is [product] good
+- features, capabilities, integrations
+- pricing, cost, how much
+- free vs paid, premium
+
+**Seasonal/temporal commercial signals:**
+- [product] [year] (e.g., "best SEO tools 2026")
+- new, latest, updated
+
+#### SERP Feature Correlation
+
+| SERP Feature | Frequency for Commercial | Notes |
+|-------------|------------------------|-------|
+| People Also Ask | Very High | Comparison and evaluation questions |
+| Featured Snippet | High | Especially for "best" and comparison queries |
+| AI Overview | High | AI often synthesizes comparisons |
+| Shopping Results | Medium | When products have direct purchase path |
+| Video Carousel | Medium | Review and comparison videos |
+| Ads (top) | Very High | Advertisers target commercial intent heavily |
+
+#### Content Format Recommendations
+
+| Sub-category | Best Formats | Target Length | Key Elements |
+|-------------|-------------|--------------|-------------|
+| Comparison | Side-by-side comparison pages, "X vs Y" articles | 2,500-4,000 words | Feature tables, pricing tables, verdict |
+| Review-seeking | In-depth reviews, user testimonials, case studies | 2,000-3,500 words | Honest pros/cons, screenshots, scoring |
+| Best-of Research | Ranked listicles, category roundups | 3,000-5,000 words | Criteria explanation, mini-reviews, comparison table |
+| Evaluation | Buying guides, use-case guides, ROI calculators | 2,000-3,500 words | Decision criteria, audience segmentation, recommendations |
+
+---
+
+### 4. Transactional Intent
+
+The user wants to **complete a specific action**, typically a purchase, signup, or download.
+
+#### Sub-categories
+
+| Sub-category | Description | Example Queries |
+|-------------|------------|----------------|
+| **Purchase** | Ready to buy a product or service | "buy Ahrefs subscription", "SEMrush pricing plans", "order SEO audit" |
+| **Signup/Trial** | Ready to create an account or start a trial | "Ahrefs free trial", "sign up for Moz", "create Google Search Console account" |
+| **Download** | Wants to download software, template, or resource | "download Screaming Frog", "SEO checklist PDF download", "free keyword template" |
+| **Hire/Engage** | Ready to hire a professional or agency | "hire SEO consultant", "SEO agency near me", "freelance SEO expert" |
+
+#### Signal Words and Modifiers
+
+**Strong transactional signals:**
+- buy, purchase, order, subscribe
+- price, pricing, cost, discount, coupon, deal
+- free trial, demo, signup, sign up, register
+- download, install, get
+- hire, book, schedule, request quote
+- near me, in [location]
+- cheap, affordable, budget
+- premium, pro, enterprise
+- [product] discount code, [product] coupon
+
+#### SERP Feature Correlation
+
+| SERP Feature | Frequency for Transactional | Notes |
+|-------------|---------------------------|-------|
+| Shopping Results | Very High | Dominant for product purchase queries |
+| Ads (top + bottom) | Very High | Maximum ad coverage |
+| Local Pack | High | For service-based transactional queries |
+| Sitelinks | Medium | For brand-specific transactional |
+| Featured Snippet | Low | Rarely for pure transactional |
+| AI Overview | Low-Medium | Sometimes for pricing/comparison |
+
+#### Content Format Recommendations
+
+| Sub-category | Best Formats | Key Elements |
+|-------------|-------------|-------------|
+| Purchase | Product pages, pricing pages, checkout | Clear pricing, CTAs, trust signals, reviews |
+| Signup/Trial | Landing pages, signup forms, onboarding | Minimal friction, benefits, social proof |
+| Download | Resource landing pages, gated content | Preview, form, instant delivery |
+| Hire/Engage | Service pages, contact forms, portfolios | Credentials, case studies, clear next step |
+
+---
+
+## Intent Mapping to Conversion Funnel
+
+### Full Funnel Alignment
+
+```
+FUNNEL STAGE        INTENT TYPE              CONTENT GOAL
+─────────────────────────────────────────────────────────────
+Awareness       →   Informational            Attract & educate
+                    (Educational, Exploratory)
+
+Interest        →   Informational            Deepen engagement
+                    (Instructional)
+
+Consideration   →   Commercial Investigation  Build preference
+                    (Best-of, Comparison)
+
+Evaluation      →   Commercial Investigation  Overcome objections
+                    (Review, Evaluation)
+
+Decision        →   Transactional            Convert
+                    (Purchase, Signup, Hire)
+
+Retention       →   Navigational             Support & retain
+                    (Support/Docs, Login)
+
+Advocacy        →   Informational            Enable sharing
+                    (Educational — about you)
+```
+
+### Conversion Potential by Intent
+
+| Intent Type | Avg. Conversion Rate | Typical CTA | Nurture Length |
+|------------|---------------------|-------------|---------------|
+| Informational | 0.5-2% | Newsletter signup, content download | Long (weeks-months) |
+| Navigational | N/A (already engaged) | Login, access feature | N/A |
+| Commercial Investigation | 2-5% | Free trial, demo request, comparison download | Medium (days-weeks) |
+| Transactional | 5-15% | Buy now, subscribe, hire | Short (immediate-days) |
+
+---
+
+## Mixed Intent Queries
+
+Many real-world queries blend multiple intent types. Recognize and address the dominant + secondary intent.
+
+### Common Mixed Intent Patterns
+
+| Query Pattern | Primary Intent | Secondary Intent | Content Strategy |
+|--------------|---------------|-----------------|-----------------|
+| "best [product] for [use case]" | Commercial | Informational | Buying guide with educational context |
+| "how to [task] with [product]" | Instructional | Navigational | Tutorial featuring the product |
+| "[product] review and pricing" | Commercial | Transactional | Review with clear pricing CTAs |
+| "what is [concept] tools" | Informational | Commercial | Explainer that transitions to tool recommendations |
+| "[product] free download" | Transactional | Navigational | Landing page with download + product info |
+
+### Handling Mixed Intent in Content
+
+1. **Lead with the primary intent** -- answer the dominant question first
+2. **Bridge to the secondary intent** -- naturally transition to the secondary need
+3. **Match SERP format** -- if SERPs show mixed results, your content should too
+4. **Segment CTAs** -- provide different CTAs for different intent segments within the page
+
+---
+
+## SEO-Specific Keyword Intent Examples
+
+### SEO Industry Keyword Classification
+
+| Keyword | Intent | Sub-category | Recommended Content |
+|---------|--------|-------------|-------------------|
+| what is SEO | Informational | Educational | Definitive guide with visuals |
+| how to do keyword research | Informational | Instructional | Step-by-step tutorial |
+| best SEO tools 2026 | Commercial | Best-of Research | Ranked listicle with mini-reviews |
+| Ahrefs vs SEMrush | Commercial | Comparison | Side-by-side feature comparison |
+| buy Ahrefs subscription | Transactional | Purchase | Pricing/product page |
+| Google Search Console login | Navigational | Login/Access | Login page or redirect |
+| why is my site not indexing | Informational | Troubleshooting | Debug guide with solutions |
+| SEO agency for ecommerce | Transactional | Hire/Engage | Service page with portfolio |
+| is Moz worth it | Commercial | Review-seeking | Honest review with alternatives |
+| SEO checklist template | Transactional | Download | Gated resource landing page |
+| types of backlinks | Informational | Exploratory | Categorized explainer |
+| SEMrush pricing | Commercial | Evaluation | Pricing breakdown page |
+
+---
+
+## Applying Intent Taxonomy in Keyword Research
+
+### Step-by-Step Classification Process
+
+1. **Read the query literally** -- what is the user asking for?
+2. **Check for signal words** -- match against the signal word tables above
+3. **Verify against SERP** -- what does Google actually show? The SERP is the ground truth
+4. **Identify sub-category** -- narrow from primary intent to sub-category
+5. **Check for mixed intent** -- does the query blend two types?
+6. **Map to content format** -- select format based on intent + sub-category
+7. **Assign funnel stage** -- determine where this fits in the buyer journey
+8. **Set conversion expectation** -- align CTA and goals with intent type
+
+### Common Classification Mistakes
+
+| Mistake | Example | Why It's Wrong | Correct Classification |
+|---------|---------|---------------|----------------------|
+| Treating all "best" queries as informational | "best CRM software" | User is comparing products to buy | Commercial Investigation |
+| Treating all questions as informational | "how much does Ahrefs cost" | User is evaluating a purchase | Commercial / Transactional |
+| Ignoring local intent | "SEO services" | May have strong local intent in many contexts | Transactional (local) |
+| Missing navigational intent | "HubSpot blog" | User wants a specific page | Navigational |
+| Assuming single intent | "SEO tools" | Could be informational OR commercial | Mixed -- check SERP |
diff --git a/.cursor/skills/keyword-research/references/keyword-prioritization-framework.md b/.cursor/skills/keyword-research/references/keyword-prioritization-framework.md
new file mode 100644
index 0000000000..e32b7dda99
--- /dev/null
+++ b/.cursor/skills/keyword-research/references/keyword-prioritization-framework.md
@@ -0,0 +1,41 @@
+# Keyword Prioritization Framework
+
+Systematic scoring methodology for ranking keywords by strategic value.
+
+## Relationship to Opportunity Score
+
+**Which score to use**: Use the Priority Score (below) for initial keyword triage and shortlisting. Use the Opportunity Score (Step 6 in the main workflow) for final content calendar prioritization, where the additional GEO and competitive factors provide more nuanced ranking.
+
+## Priority Scoring Matrix
+
+Score each keyword 1-5 on these factors, then calculate weighted total:
+
+| Factor | Weight | Score 1 (Low) | Score 5 (High) |
+|--------|--------|---------------|----------------|
+| Search Volume | 20% | <100/mo | >10,000/mo |
+| Keyword Difficulty | 25% | KD >80 (hard) | KD <20 (easy) |
+| Business Relevance | 30% | Tangential to offering | Core to offering |
+| Search Intent Match | 15% | Informational only | Transactional/commercial |
+| Trend Direction | 10% | Declining | Growing |
+
+**Priority Score** = Σ(Factor Weight × Score) / 5
+
+## Priority Categories
+
+| Priority | Score Range | Action |
+|----------|------------|--------|
+| P0 — Must Target | 4.0-5.0 | Create content immediately |
+| P1 — High Value | 3.0-3.9 | Queue for next content sprint |
+| P2 — Opportunity | 2.0-2.9 | Plan for future content calendar |
+| P3 — Monitor | 1.0-1.9 | Track but don't prioritize |
+
+## Seasonal Keyword Patterns
+
+### Seasonal Analysis Framework
+
+| Season Trigger | Example Keywords | Planning Lead Time | Content Strategy |
+|---------------|-----------------|-------------------|-----------------|
+| Calendar events | "Black Friday SEO", "New Year marketing plan" | 3-4 months ahead | Publish 6-8 weeks before peak |
+| Industry events | "[Conference] takeaways", "Google algorithm update" | 1-2 months / reactive | Pre-plan templates, react quickly |
+| Budget cycles | "marketing budget template Q1", "SEO ROI report" | 2-3 months ahead | Target planning season (Oct-Dec) |
+| Seasonal demand | "summer marketing ideas", "holiday email campaigns" | 2-3 months ahead | Refresh annually with new data |
diff --git a/.cursor/skills/keyword-research/references/topic-cluster-templates.md b/.cursor/skills/keyword-research/references/topic-cluster-templates.md
new file mode 100644
index 0000000000..da6ec594d2
--- /dev/null
+++ b/.cursor/skills/keyword-research/references/topic-cluster-templates.md
@@ -0,0 +1,286 @@
+# Topic Cluster Templates
+
+Planning worksheets, architecture patterns, and measurement frameworks for building topic clusters that establish topical authority and drive organic traffic.
+
+## Overview
+
+A topic cluster is a group of interlinked content pieces organized around a central pillar page. The pillar covers a broad topic comprehensively, while cluster pages dive deep into specific subtopics. Internal links bind the cluster together, signaling topical authority to search engines and AI systems.
+
+---
+
+## Topic Cluster Planning Worksheet
+
+### Step 1: Define the Pillar Topic
+
+Use this template to evaluate whether a topic deserves a full cluster:
+
+| Field | Your Input |
+|-------|-----------|
+| **Pillar Topic** | [Broad topic name] |
+| **Pillar Keyword** | [Head keyword, typically 1-3 words] |
+| **Monthly Search Volume** | [Volume] |
+| **Keyword Difficulty** | [KD score] |
+| **Business Relevance** | [1-5 scale: how core is this to your product/service?] |
+| **Current Ranking** | [Your current position, or "Not ranking"] |
+| **Competitor Coverage** | [How many competitors have pillar content on this?] |
+| **Estimated Cluster Size** | [How many subtopics can you identify?] |
+| **Content Assets Available** | [Existing content you can repurpose or link] |
+
+**Pillar viability checklist:**
+- [ ] At least 8-12 subtopics can be identified
+- [ ] Combined cluster keyword volume exceeds 5,000/month
+- [ ] Topic is directly relevant to your product or service
+- [ ] You can provide unique expertise or data on this topic
+- [ ] Competitors have not yet built a dominant cluster
+
+### Step 2: Map Cluster Subtopics
+
+| # | Subtopic | Target Keyword | Volume | KD | Intent | Content Format | Status |
+|---|---------|---------------|--------|-----|--------|---------------|--------|
+| 1 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+| 2 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+| 3 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+| 4 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+| 5 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+| 6 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+| 7 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+| 8 | [Subtopic name] | [Long-tail keyword] | [Vol] | [KD] | [I/N/C/T] | [Guide/Tutorial/List/etc.] | [Idea/Draft/Published] |
+
+### Step 3: Define Internal Linking Map
+
+| Source Page | Links To | Anchor Text Strategy |
+|------------|---------|---------------------|
+| Pillar | Cluster 1, 2, 3... (all) | Descriptive, keyword-relevant anchors |
+| Cluster 1 | Pillar + Cluster 2, 3 | Natural contextual links |
+| Cluster 2 | Pillar + Cluster 1, 4 | Natural contextual links |
+| Cluster 3 | Pillar + Cluster 1 | Natural contextual links |
+
+**Linking rules:**
+- Every cluster page MUST link to the pillar page
+- The pillar page MUST link to every cluster page
+- Cluster pages SHOULD link to 2-3 related cluster pages where contextually relevant
+- Use descriptive anchor text (not "click here" or bare URLs)
+- Link placement should be within body content, not just in a footer list
+
+---
+
+## Hub-and-Spoke Architecture Patterns
+
+### Pattern 1: Classic Hub-Spoke (Best for Educational Topics)
+
+```
+                         ┌──────────────────┐
+                    ┌────│ What is [Topic]?  │
+                    │    └──────────────────┘
+                    │    ┌──────────────────┐
+                    ├────│ [Topic] Benefits  │
+                    │    └──────────────────┘
+┌───────────────┐   │    ┌──────────────────┐
+│    PILLAR:    │───┼────│ [Topic] Examples  │
+│ Complete      │   │    └──────────────────┘
+│ Guide to      │   │    ┌──────────────────┐
+│ [Topic]       │   ├────│ [Topic] Tools     │
+└───────────────┘   │    └──────────────────┘
+                    │    ┌──────────────────┐
+                    ├────│ [Topic] Mistakes  │
+                    │    └──────────────────┘
+                    │    ┌──────────────────┐
+                    └────│ [Topic] Checklist │
+                         └──────────────────┘
+```
+
+**Best for:** Broad educational topics where subtopics don't overlap much.
+**Example:** "Content Marketing" pillar with spokes for strategy, types, examples, tools, metrics, mistakes.
+
+### Pattern 2: Layered Cluster (Best for Technical Topics)
+
+```
+                    ┌─────────────────────────────┐
+               ┌────│ BEGINNER CLUSTER             │
+               │    │ - Getting Started            │
+               │    │ - Basic Concepts             │
+               │    │ - First Steps Tutorial       │
+               │    └─────────────────────────────┘
+┌──────────┐   │    ┌─────────────────────────────┐
+│ PILLAR:  │───┼────│ INTERMEDIATE CLUSTER         │
+│ Complete │   │    │ - Advanced Techniques        │
+│ Guide    │   │    │ - Common Patterns            │
+└──────────┘   │    │ - Optimization Strategies    │
+               │    └─────────────────────────────┘
+               │    ┌─────────────────────────────┐
+               └────│ ADVANCED CLUSTER             │
+                    │ - Expert Strategies          │
+                    │ - Edge Cases                 │
+                    │ - Integration Guides         │
+                    └─────────────────────────────┘
+```
+
+**Best for:** Topics with clear skill progressions (programming, technical SEO, data analysis).
+**Example:** "Technical SEO" pillar with beginner (crawling basics), intermediate (JavaScript rendering), advanced (log file analysis) layers.
+
+### Pattern 3: Use-Case Cluster (Best for Product/Service Topics)
+
+```
+                    ┌──────────────────────┐
+               ┌────│ [Topic] for SMBs     │
+               │    └──────────────────────┘
+               │    ┌──────────────────────┐
+┌──────────┐   ├────│ [Topic] for Enterprise│
+│ PILLAR:  │───┤    └──────────────────────┘
+│ [Topic]  │   │    ┌──────────────────────┐
+│ Guide    │   ├────│ [Topic] for Agencies  │
+└──────────┘   │    └──────────────────────┘
+               │    ┌──────────────────────┐
+               ├────│ [Topic] for Ecommerce │
+               │    └──────────────────────┘
+               │    ┌──────────────────────┐
+               └────│ [Topic] for SaaS      │
+                    └──────────────────────┘
+```
+
+**Best for:** Products/services with distinct audience segments.
+**Example:** "SEO Strategy" pillar with spokes for different business types (ecommerce SEO, SaaS SEO, local SEO, B2B SEO).
+
+### Pattern 4: Process Cluster (Best for How-To Topics)
+
+```
+┌──────────┐     ┌─────────┐     ┌─────────┐     ┌─────────┐     ┌─────────┐
+│ PILLAR:  │────→│ Step 1:  │────→│ Step 2:  │────→│ Step 3:  │────→│ Step 4:  │
+│ How to   │     │ Research │     │ Plan     │     │ Execute  │     │ Measure  │
+│ [Process]│     └─────────┘     └─────────┘     └─────────┘     └─────────┘
+└──────────┘          │               │               │               │
+                 ┌─────────┐    ┌─────────┐    ┌─────────┐    ┌─────────┐
+                 │Sub-guide │    │Template  │    │Tutorial  │    │Dashboard │
+                 │  1a      │    │  2a      │    │  3a      │    │  Guide   │
+                 └─────────┘    └─────────┘    └─────────┘    └─────────┘
+```
+
+**Best for:** Multi-step processes where each step is complex enough for its own article.
+**Example:** "Link Building" pillar with sequential steps (prospecting, outreach, content creation, tracking).
+
+---
+
+## Internal Linking Patterns Within Clusters
+
+### Linking Density Guidelines
+
+| Cluster Size | Min Links Per Cluster Page | Max Links Per Cluster Page | Pillar Link Density |
+|-------------|--------------------------|--------------------------|-------------------|
+| 5-8 pages | 2-3 internal links | 5-6 internal links | Link to every cluster page |
+| 9-15 pages | 3-4 internal links | 6-8 internal links | Link to every cluster page |
+| 16+ pages | 4-5 internal links | 8-10 internal links | Link to top cluster pages, categorize rest |
+
+### Anchor Text Strategy
+
+| Link Type | Anchor Text Approach | Example |
+|-----------|---------------------|---------|
+| Cluster → Pillar | Broad keyword or branded | "our complete guide to keyword research" |
+| Pillar → Cluster | Specific keyword for that cluster page | "learn about long-tail keyword strategies" |
+| Cluster → Cluster | Contextual, conversational | "this connects to how you assess keyword difficulty" |
+
+### Linking Anti-Patterns to Avoid
+
+| Anti-Pattern | Why It's Bad | Fix |
+|-------------|-------------|-----|
+| Footer-only links to cluster pages | Low link equity, poor UX | Move links into body content |
+| Generic anchors ("click here", "read more") | No keyword signal | Use descriptive, keyword-relevant anchors |
+| Orphan cluster pages (no inbound links) | Search engines can't discover/value them | Add contextual links from pillar and related clusters |
+| Over-linking (20+ internal links per page) | Dilutes link equity, feels spammy | Keep to 5-10 relevant internal links |
+| Linking only to pillar (ignoring sibling clusters) | Misses cross-cluster relevance | Link to 2-3 related sibling pages |
+
+---
+
+## Content Calendar Integration
+
+### Cluster Build Sequence
+
+The order you publish cluster content matters. Follow this sequence for maximum impact:
+
+| Phase | What to Publish | Why This Order |
+|-------|----------------|---------------|
+| **Phase 1: Pillar** | Publish the pillar page first | Establishes the hub; cluster pages need something to link to |
+| **Phase 2: Quick-win clusters** | Publish 3-4 lowest-difficulty cluster pages | Build early traffic and indexing momentum |
+| **Phase 3: High-value clusters** | Publish highest-volume cluster pages | Leverage pillar authority for competitive terms |
+| **Phase 4: Long-tail clusters** | Publish remaining niche subtopics | Fill coverage gaps, capture long-tail traffic |
+| **Phase 5: Update cycle** | Refresh pillar with new links; update outdated clusters | Maintain freshness signals |
+
+### Sample Content Calendar for One Cluster
+
+| Week | Content Piece | Type | Target Keyword | Word Count | Dependencies |
+|------|--------------|------|---------------|-----------|-------------|
+| 1 | Complete Guide to [Pillar Topic] | Pillar | [Head keyword] | 3,500-5,000 | None |
+| 2 | What is [Subtopic A]? | Cluster | [Long-tail A] | 1,500-2,000 | Pillar published |
+| 3 | How to [Subtopic B] | Cluster | [Long-tail B] | 2,000-2,500 | Pillar published |
+| 4 | [Subtopic C] vs [Subtopic D] | Cluster | [Long-tail C] | 2,000-2,500 | Pillar published |
+| 5 | Best [Subtopic E] Tools | Cluster | [Long-tail E] | 2,500-3,000 | Pillar published |
+| 6 | [Subtopic F] for Beginners | Cluster | [Long-tail F] | 1,500-2,000 | Pillar published |
+| 7 | [Subtopic G] Checklist | Cluster | [Long-tail G] | 1,000-1,500 | Pillar published |
+| 8 | Update Pillar + add all internal links | Update | -- | -- | All clusters published |
+
+### Publishing Cadence Recommendations
+
+| Team Size | Cluster Build Time | Recommended Cadence |
+|-----------|-------------------|-------------------|
+| Solo content creator | 6-8 weeks per cluster | 1 cluster per quarter |
+| Small team (2-3 writers) | 3-4 weeks per cluster | 1 cluster per month |
+| Content team (4-6 writers) | 2-3 weeks per cluster | 2 clusters per month |
+| Large team (7+ writers) | 1-2 weeks per cluster | 1 cluster per week |
+
+---
+
+## Cluster Performance Measurement Framework
+
+### Key Metrics by Level
+
+#### Cluster-Level Metrics
+
+| Metric | What It Measures | Target | How to Track |
+|--------|-----------------|--------|-------------|
+| Total cluster traffic | Aggregate organic visits to all pages in cluster | Growing month-over-month | Analytics: filter by URL folder/tag |
+| Keyword coverage | Number of keywords cluster ranks for | 50+ keywords per mature cluster | SEO tool: filter by cluster URLs |
+| Average position | Mean ranking across all cluster keywords | Improving trend toward top 10 | SEO tool: average position report |
+| Internal link equity | PageRank flow within cluster | Pillar has highest internal links | Site audit tool: internal link report |
+| Cluster completeness | Percentage of planned subtopics published | 100% within planned timeframe | Content calendar tracking |
+
+#### Page-Level Metrics (Per Cluster Page)
+
+| Metric | Pillar Target | Cluster Page Target |
+|--------|-------------|-------------------|
+| Organic traffic | Highest in cluster | Proportional to keyword volume |
+| Keywords ranking | 50-200+ | 10-50 |
+| Backlinks | Attracts most links | Some organic links |
+| Avg. time on page | 4-8 minutes | 2-5 minutes |
+| Bounce rate | <60% | <70% |
+| Internal CTR | High clicks to cluster pages | Clicks to pillar + sibling pages |
+
+### Performance Review Cadence
+
+| Timeframe | What to Review | Action If Underperforming |
+|-----------|---------------|--------------------------|
+| 2 weeks post-publish | Indexing status, initial impressions | Fix indexing issues, check for crawl errors |
+| 1 month post-publish | Early ranking signals, traffic | Optimize titles/metas, add internal links |
+| 3 months post-publish | Ranking positions, traffic trends | Content refresh, add missing subtopics, build links |
+| 6 months post-publish | Full performance assessment | Major content update or strategic pivot |
+| Quarterly (ongoing) | Cluster-level aggregate trends | Identify declining pages, plan refreshes |
+
+### Cluster Health Scorecard
+
+Rate each cluster quarterly on these dimensions:
+
+| Dimension | Score 1 (Poor) | Score 3 (Average) | Score 5 (Excellent) |
+|-----------|---------------|-------------------|-------------------|
+| Traffic growth | Declining | Flat | Growing 10%+ MoM |
+| Keyword coverage | <20 keywords | 20-50 keywords | 50+ keywords |
+| Top 10 rankings | 0 keywords in top 10 | 1-5 in top 10 | 5+ in top 10 |
+| Content freshness | Not updated in 12+ months | Updated within 6 months | Updated within 3 months |
+| Internal linking | Missing links, orphan pages | Basic linking in place | Full cross-linking with relevant anchors |
+| Completeness | <50% of subtopics covered | 50-80% covered | 80-100% covered |
+
+**Cluster Health Score** = Average of all dimension scores
+
+| Score Range | Health Status | Action |
+|------------|--------------|--------|
+| 4.0-5.0 | Healthy | Maintain cadence, expand to adjacent clusters |
+| 3.0-3.9 | Needs attention | Refresh outdated content, fill subtopic gaps |
+| 2.0-2.9 | At risk | Major content update, link building campaign |
+| 1.0-1.9 | Failing | Strategic review -- consider merging, rewriting, or retiring |
diff --git a/.cursor/skills/memory-management/SKILL.md b/.cursor/skills/memory-management/SKILL.md
new file mode 100644
index 0000000000..c9ec3af3b5
--- /dev/null
+++ b/.cursor/skills/memory-management/SKILL.md
@@ -0,0 +1,342 @@
+---
+name: memory-management
+description: 'Persist SEO/GEO campaign context across Claude sessions with automatic hot-list, active work, and archive tiers. 项目记忆/跨会话'
+version: "7.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when reviewing, archiving, or cleaning up campaign memory. Also when the user asks to check saved findings, manage hot cache, or archive old data."
+argument-hint: "[review|archive|cleanup]"
+metadata:
+  author: aaron-he-zhu
+  version: "7.0.0"
+  geo-relevance: "low"
+  tags:
+    - seo
+    - geo
+    - project-memory
+    - context-management
+    - campaign-tracking
+    - session-context
+    - hot-cache
+    - 项目记忆
+    - プロジェクト記憶
+    - 프로젝트메모리
+    - memoria-proyecto
+  triggers:
+    # EN-formal
+    - "remember project context"
+    - "save SEO data"
+    - "track campaign progress"
+    - "store keyword data"
+    - "manage project memory"
+    - "project context"
+    - "refresh wiki index"
+    - "build wiki index"
+    - "wiki lint"
+    # EN-casual
+    - "remember this for next time"
+    - "save my keyword data"
+    - "keep track of this campaign"
+    - "what did we decide last time"
+    - "what do we know so far"
+    - "project status"
+    # EN-question
+    - "how to save project progress"
+    # ZH-pro
+    - "项目记忆管理"
+    - "SEO数据保存"
+    - "跨会话记忆"
+    - "刷新wiki索引"
+    - "项目状况"
+    # ZH-casual
+    - "保存进度"
+    - "上次说了什么"
+    - "记住这个"
+    # JA
+    - "プロジェクト記憶"
+    - "SEOデータ保存"
+    # KO
+    - "프로젝트 메모리"
+    - "데이터 저장"
+    # ES
+    - "memoria del proyecto"
+    - "guardar progreso"
+    # PT
+    - "memória do projeto"
+---
+
+# Memory Management
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This cross-cutting skill is part of the protocol layer and follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+This skill implements a three-tier memory system (HOT/WARM/COLD) for SEO and GEO projects. HOT memory (80 lines max) loads automatically every session via the SessionStart hook. WARM memory loads on demand per skill. COLD memory is archived data queried only when explicitly requested. The skill manages the full lifecycle: capture, promote, demote, and archive.
+
+**System role**: Campaign Memory Loop. It defines how project context is captured, promoted, archived, and handed off across sessions. It is the sole executor of WARM-to-COLD archival and the aggregator for cross-skill project status queries.
+
+## When This Must Trigger
+
+Use this whenever project state should survive the current session — even if the user doesn't use memory terminology:
+
+- User says "remember this", "save this", "keep track of this"
+- User asks "what did we decide", "what do we know", "project status"
+- Setting up memory structure for a new SEO project
+- After completing audits, ranking checks, or performance reports (Stop hook reminds automatically)
+- When project context needs updating (new keywords, competitors, priorities)
+- When you need to look up historical data or project-specific terminology
+- After 30+ days of work to clean up and archive stale data
+- When open-loops.md has items older than 7 days (SessionStart hook reminds automatically)
+
+## What This Skill Does
+
+1. **HOT Cache Management**: Maintains `memory/hot-cache.md` (80 lines max) — loaded automatically every session by SessionStart hook
+2. **WARM Storage**: Organizes dated findings in `memory/` subdirectories — loaded on demand by relevant skills
+3. **COLD Archive**: Moves stale data (90+ days unreferenced) to `memory/archive/` with date prefix
+4. **Promotion**: Elevates frequently-referenced findings from WARM to HOT (3+ refs in 7 days, or 2+ skill refs)
+5. **Demotion**: Moves unreferenced HOT items to WARM (30 days), WARM to COLD (90 days)
+6. **Cross-Skill Aggregation**: When user asks "what do we know", aggregates from all `memory/` subdirectories
+7. **Open Loop Tracking**: Maintains `memory/open-loops.md`, reminds user of stale items via SessionStart hook
+8. **Wiki Index Maintenance**: Compiles `memory/wiki/index.md` — a structured, auto-refreshed index of all WARM files with precise fields (score, 健康度, status, next_action, mtime) and best-effort summaries. Supports project isolation via `memory/wiki/<project>/index.md`. Auto-refreshed on PostToolUse; user confirmation not required (index is a fully rebuildable derived artifact). Delete `memory/wiki/` at any time to revert to pre-wiki behavior.
+9. **Wiki Compiled Pages** (Phase 2): Generates interlinked entity, keyword, and topic pages from WARM files with source hash tracking, contradiction detection, and confidence-labeled reconciliation. Requires user confirmation before writing.
+10. **Wiki Lint** (Phase 2): Detects contradictions, orphan pages, stale claims, missing pages, and source hash mismatches across wiki and WARM files via `/seo:wiki-lint`.
+11. **WARM Retirement Preview** (Phase 3): `wiki-lint --retire-preview` lists WARM files fully covered by wiki compiled pages as retirement candidates. Actual archival to COLD requires explicit user confirmation.
+
+## Quick Start
+
+Start with one of these prompts. Finish with a hot-cache update plan and a handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Initialize Wiki Index
+
+```
+Refresh wiki index
+```
+
+```
+Build wiki index for [project name]
+```
+
+Generates `memory/wiki/index.md` from existing WARM files. Required once to enable wiki features; subsequent refreshes happen automatically.
+
+### Initialize Memory Structure
+
+```
+Set up SEO memory for [project name]
+```
+
+```
+Initialize memory structure for a new [industry] website optimization project
+```
+
+### Update After Analysis
+
+```
+Update memory after ranking check for [keyword group]
+```
+
+```
+Refresh hot cache with latest competitor analysis findings
+```
+
+### Query Stored Context
+
+```
+What are our hero keywords?
+```
+
+```
+Show me the last ranking update date for [keyword category]
+```
+
+```
+Look up our primary competitors and their domain authority
+```
+
+### Promotion and Demotion
+
+```
+Promote [keyword] to hot cache
+```
+
+```
+Archive stale data that hasn't been referenced in 30+ days
+```
+
+### Glossary Management
+
+```
+Add [term] to project glossary: [definition]
+```
+
+```
+What does [internal jargon] mean in this project?
+```
+
+## Skill Contract
+
+**Expected output**: a memory update plan, hot-cache changes, and a short handoff summary.
+
+- **Reads**: current campaign facts, new findings from other skills, approved decisions, and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+- **Writes**: updates to `memory/hot-cache.md`, `memory/open-loops.md`, `memory/decisions.md`, and related `memory/` folders. Manages WARM-to-COLD archival in `memory/archive/`. Compiles `memory/wiki/index.md` (auto-refreshed) and wiki compiled pages (user-confirmed).
+- **Promotes**: durable strategy, blockers, terminology, entity candidates, and major deltas. Applies temperature lifecycle rules: promote to HOT on high reference frequency, demote on staleness.
+- **Next handoff**: use the `Next Best Skill` below when the project memory baseline is ready for active work.
+
+### Temperature Lifecycle Rules
+
+> See [references/promotion-demotion-rules.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/memory-management/references/promotion-demotion-rules.md) for the full promotion/demotion table and action procedures.
+
+### Hook Integration
+
+This skill's behavior is reinforced by the library's prompt-based hooks:
+- **SessionStart**: loads `memory/hot-cache.md`, reminds of stale open loops; loads `memory/wiki/<project>/index.md` (or global `index.md`) if it exists; provides light-user guidance based on Quick Status when `next_action` items are available
+- **PostToolUse**: after any WARM file write, silently refreshes `memory/wiki/index.md` (Phase 1); prompts to update compiled pages (Phase 2)
+- **Stop**: prompts to save session findings, auto-saves veto issues to hot-cache; appends changelog entry to index.md bottom
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~analytics + ~~search console connected:**
+Automatically populate memory from historical data: keyword rankings over time, competitor domain authority changes, traffic metrics, conversion data, backlink profile evolution. The skill will fetch current rankings, alert on significant changes, and update both hot cache and cold storage.
+
+**With manual data only:**
+Ask the user to provide:
+1. Current target keywords with priority levels
+2. Primary competitors (3-5 domains)
+3. Key performance metrics and last update date
+4. Active campaigns and their status
+5. Any project-specific terminology or abbreviations
+
+Proceed with memory structure creation using provided data. Note in CLAUDE.md which data requires manual updates vs. automated refresh.
+
+## Instructions
+
+When a user requests SEO memory management:
+
+### 1. Initialize Memory Structure
+
+For new projects, create the directory structure defined in the [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md). Key directories: `memory/` (decisions, open-loops, glossary, entities, research, content, audits, monitoring) plus `memory/wiki/` (auto-managed compiled index with optional per-project subdirectories).
+
+> **Templates**: [hot-cache-template.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/memory-management/references/hot-cache-template.md) · [glossary-template.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/memory-management/references/glossary-template.md) · [Wiki spec](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/proposal-wiki-layer-v3.md)
+
+### 2. Context Lookup Flow
+
+When a user references something unclear, follow this lookup sequence:
+
+**Step 1: Check CLAUDE.md (Hot Cache)**
+- Is it in active keywords?
+- Is it in primary competitors?
+- Is it in current priorities or campaigns?
+
+**Step 2: Check Wiki Index** (`memory/wiki/index.md` or project-level) — locate relevant WARM files
+
+**Step 3: Check memory/glossary.md**
+- Is it defined as project terminology?
+- Is it a custom segment or shorthand?
+
+**Step 4: Check Cold Storage**
+- Search memory/research/keywords/ for historical keyword context
+- Search memory/research/competitors/ for past analyses
+- Search memory/monitoring/reports/ for archived mentions
+
+**Step 5: Ask User**
+- If not found in any layer, ask for clarification
+- Log the new term in glossary if it's project-specific
+
+Example lookup:
+
+```markdown
+User: "Update rankings for our hero KWs"
+
+Step 1: Check CLAUDE.md → Found "Hero Keywords (Priority 1)" section
+Step 2: Extract keyword list from hot cache
+Step 3: Execute ranking check
+Step 4: Update both CLAUDE.md and memory/monitoring/rank-history/YYYY-MM-DD-ranks.csv
+```
+
+### 3. Promotion & Demotion Logic
+
+> **Reference**: See [references/promotion-demotion-rules.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/memory-management/references/promotion-demotion-rules.md) for detailed promotion/demotion triggers (keywords, competitors, metrics, campaigns) and the action procedures for each.
+
+### 4. Update Triggers, Archive Management & Cross-Skill Integration
+
+> **Reference**: See [references/update-triggers-integration.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/memory-management/references/update-triggers-integration.md) for the complete update procedures after ranking checks, competitor analyses, audits, and reports; monthly/quarterly archive routines; and integration points with all 8 connected skills (keyword-research, rank-tracker, competitor-analysis, content-gap-analysis, seo-content-writer, content-quality-auditor, domain-authority-auditor).
+
+### 5. Memory Hygiene Checks
+
+When invoked for review or cleanup:
+
+1. **Line count check**: Count lines in `memory/hot-cache.md`. If >80, list oldest entries for archival.
+2. **Byte check**: If hot-cache exceeds 25KB, warn and recommend trimming long entries.
+3. **Staleness scan**: List memory files older than 30 days that have not been referenced. Recommend archival for files >90 days.
+4. **Frontmatter audit**: Check that all memory files (except hot-cache.md) have `name`, `description`, and `type` in their frontmatter. Report any missing fields.
+
+### 6. Save Results
+
+After delivering any memory update or aggregation to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to the appropriate `memory/` path using filename `YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+## Validation Checkpoints
+
+### Structure Validation
+- [ ] memory/hot-cache.md exists and is under 80 lines
+- [ ] memory/ directory structure matches the shared state model
+- [ ] glossary.md exists and is populated with project basics
+- [ ] All historical data files include timestamps in filename or metadata
+
+### Content Validation
+- [ ] CLAUDE.md "Last Updated" date is current
+- [ ] Every keyword in hot cache has current rank, target rank, and status
+- [ ] Every competitor has domain authority and position assessment
+- [ ] Every active campaign has status percentage and expected completion date
+- [ ] Key Metrics Snapshot shows "Previous" values for comparison
+
+### Lookup Validation
+- [ ] Test lookup flow: reference a term → verify it finds it in correct layer
+- [ ] Test promotion: manually promote item → verify it appears in CLAUDE.md
+- [ ] Test demotion: manually archive item → verify removed from CLAUDE.md
+- [ ] Glossary contains all custom segments and shorthand used in CLAUDE.md
+
+### Update Validation
+- [ ] After ranking check, `memory/monitoring/rank-history/` has a dated snapshot or export
+- [ ] After competitor analysis, `memory/research/competitors/` has a dated file
+- [ ] After audit, top action items appear in CLAUDE.md priorities
+- [ ] After monthly report, metrics snapshot reflects new data
+
+## Examples
+
+> **Reference**: See [references/examples.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/memory-management/references/examples.md) for three complete examples: (1) updating hero keyword rankings with memory refresh, (2) glossary lookup flow, and (3) initializing memory for a new e-commerce project.
+
+## Advanced Features
+
+- **Smart Context Loading**: `Load full context for [campaign name]` — retrieves hot cache + all cold storage files for a campaign
+- **Memory Health Check**: `Run memory health check` — finds orphaned files, missing timestamps, stale items, broken references
+- **Bulk Promotion/Demotion**: `Promote all keywords ranking in top 10 to hot cache` / `Demote all completed campaigns from Q3`
+- **Memory Snapshot**: `Create memory snapshot for [date/milestone]` — point-in-time copy for major milestones
+- **Cross-Project Memory**: `Compare memory with [other project]` — keyword overlaps, competitor intersections across projects
+- **Wiki Lint**: `/seo:wiki-lint [--fix] [--project name] [--retire-preview]` — contradictions, orphans, stale claims, hash mismatches. See [commands/wiki-lint.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/commands/wiki-lint.md)
+
+## Practical Limitations
+
+- **Concurrent access**: Use timestamped filenames to avoid overwrites from parallel sessions.
+- **Cold storage retrieval**: WARM/COLD files only load on demand. Hot cache is the primary cross-session mechanism.
+- **Data freshness**: Stale data (>90 days) should be flagged for refresh. Wiki index `mtime` field helps detect staleness.
+- **Wiki compilation**: Index is best-effort for summaries; precise fields (score, status, mtime) are deterministic. Delete `memory/wiki/` anytime to revert.
+
+## Reference Materials
+
+- [CORE-EEAT Content Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md) — Content quality scoring stored in memory
+- [CITE Domain Rating](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/cite-domain-rating.md) — Domain authority scoring stored in memory
+
+## Next Best Skill
+
+- **Primary**: [keyword-research](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/keyword-research/SKILL.md) — seed or refresh campaign strategy with current demand signals.
diff --git a/.cursor/skills/memory-management/references/examples.md b/.cursor/skills/memory-management/references/examples.md
new file mode 100644
index 0000000000..c114b5ea53
--- /dev/null
+++ b/.cursor/skills/memory-management/references/examples.md
@@ -0,0 +1,138 @@
+# Memory Management Examples
+
+Detailed examples showing memory management in action.
+
+---
+
+## Example 1: Update Rankings for Hero Keywords
+
+**User**: "Update rankings for our hero keywords"
+
+**Assistant**:
+
+```markdown
+I'll update rankings for your hero keywords. Let me check the hot cache for the list.
+
+[Reads CLAUDE.md, finds Hero Keywords section:]
+- keyword research tools (currently #15, target #5)
+- best seo software (currently #23, target #10)
+- free backlink checker (currently #8, target #3)
+- content optimization guide (currently #12, target #5)
+- technical seo audit (currently #18, target #8)
+
+[If ~~SEO tool connected: runs automated ranking check]
+[If manual: asks user for current rankings]
+
+---
+
+## Ranking Update - December 15, 2024
+
+| Keyword | Previous | Current | Change | Target | Status |
+|---------|----------|---------|--------|--------|--------|
+| keyword research tools | #15 | #12 | up 3 | #5 | Improving |
+| best seo software | #23 | #24 | down 1 | #10 | Stable |
+| free backlink checker | #8 | #6 | up 2 | #3 | Near target |
+| content optimization guide | #12 | #11 | up 1 | #5 | Improving |
+| technical seo audit | #18 | #16 | up 2 | #8 | Improving |
+
+### Notable Changes
+- **free backlink checker** moved to #6 (only 3 positions from target!)
+- **keyword research tools** gained 3 positions (strong movement)
+- All other keywords showing stable or positive movement
+
+### Updated Files
+1. CLAUDE.md -> Hero Keywords table updated with current ranks
+2. memory/monitoring/rank-history/2024-12-15-ranks.csv -> New row added:
+   `2024-12-15,keyword research tools,12,best seo software,24,free backlink checker,6,...`
+
+### Recommendations
+1. Push additional content/backlinks to "free backlink checker" pages to hit #3 target
+2. Investigate why "keyword research tools" suddenly gained 3 positions (replicate success)
+3. Monitor "best seo software" for any negative trends
+
+Hot cache refreshed. Next ranking check recommended: December 22, 2024.
+```
+
+---
+
+## Example 2: Glossary Lookup
+
+**User**: "What does 'quick wins' mean in our project?"
+
+**Assistant**:
+
+```markdown
+Let me look that up in the project memory.
+
+[Step 1: Check CLAUDE.md -> not found]
+[Step 2: Check memory/glossary.md -> found!]
+
+From your project glossary:
+
+**Quick Wins**
+- **Definition**: Low-difficulty, high-value target keywords
+- **Keyword Count**: 25 keywords
+- **Context**: This segment includes keywords where you can achieve page 1 rankings within 30-60 days due to low competition, despite having decent search volume
+
+According to your memory files, your current quick wins are stored in:
+`memory/research/keywords/long-tail-keywords.md` (tagged with "quick-win" label)
+
+Would you like me to show you the current list of quick win keywords?
+```
+
+---
+
+## Example 3: Initialize Memory for New Project
+
+**User**: "Set up memory for a new e-commerce site selling outdoor gear"
+
+**Assistant**:
+
+```markdown
+I'll set up the two-layer memory system for your outdoor gear e-commerce site.
+
+## Creating Directory Structure
+
+[Creates: memory/glossary.md, memory/research/, memory/content/, memory/monitoring/, etc.]
+
+## CLAUDE.md (Hot Cache) - Initial Setup
+
+I'll need some information to populate the hot cache:
+
+1. **Target Keywords**: What are your top 10 hero keywords?
+   (e.g., "hiking backpacks", "camping tents", "outdoor clothing")
+
+2. **Primary Competitors**: Who are your top 3-5 competitors?
+   (e.g., rei.com, backcountry.com, moosejaw.com)
+
+3. **Current Metrics** (if available):
+   - Domain authority?
+   - Current organic traffic?
+   - Any existing rankings?
+
+4. **Active Initiatives**:
+   - Are you launching a new category?
+   - Running a content campaign?
+   - Focusing on any specific product line?
+
+5. **Project Terminology**:
+   - Any internal product names or abbreviations?
+   - How do you segment your keyword strategy?
+
+Please provide what you have, and I'll create your initial CLAUDE.md and glossary. We can start with basics and enrich as we gather more data.
+
+---
+
+Once you provide this information, I'll generate:
+- CLAUDE.md with your initial hot cache
+- memory/glossary.md with project terminology
+- memory/research/keywords/ structure with placeholder files
+- memory/research/competitors/ with initial competitor list
+- memory/content/calendar/active-calendar.md for Q1 2025
+
+After setup, you can use commands like:
+- "Update rankings for hero keywords"
+- "Add new competitor [domain]"
+- "Promote [keyword] to hot cache"
+- "Archive campaigns from Q3 2024"
+```
diff --git a/.cursor/skills/memory-management/references/glossary-template.md b/.cursor/skills/memory-management/references/glossary-template.md
new file mode 100644
index 0000000000..2e3edd309d
--- /dev/null
+++ b/.cursor/skills/memory-management/references/glossary-template.md
@@ -0,0 +1,64 @@
+# Glossary Template
+
+Use this template to create the project glossary at memory/glossary.md.
+
+### 3. Glossary Structure (memory/glossary.md)
+
+```markdown
+# [Project Name] - SEO Glossary
+
+**Purpose**: Project-specific terminology, brand terms, internal jargon, and abbreviations.
+
+**Last Updated**: [Date]
+
+## Brand Terms
+
+| Term | Full Name | Context |
+|------|-----------|---------|
+| [abbreviation] | [full name] | [when/how it's used] |
+
+## Product Names
+
+| Internal Name | Public Name | Category |
+|---------------|-------------|----------|
+| [code name] | [market name] | [product type] |
+
+## Custom Segments
+
+| Segment Name | Definition | Keyword Count |
+|--------------|------------|---------------|
+| Hero KWs | Top 10 revenue-driving keywords | 10 |
+| Quick Wins | Low-difficulty, high-value targets | 25 |
+| Brand Defense | Brand + modifier combinations | 15 |
+
+## Competitor Shorthand
+
+| Shorthand | Full Domain | Notes |
+|-----------|-------------|-------|
+| Comp A | competitor-a.com | Main competitor, [industry] |
+| Comp B | competitor-b.com | Niche competitor, [specialty] |
+
+## Project Acronyms
+
+| Acronym | Full Term | Definition |
+|---------|-----------|------------|
+| [ABC] | [Full term] | [What it means in this project] |
+
+## Internal Jargon
+
+| Phrase | Meaning | Example Usage |
+|--------|---------|---------------|
+| [phrase] | [definition] | "Check rankings for [phrase]" |
+
+## Historical Context
+
+- **Project started**: [Date]
+- **Major algorithm updates affecting us**: [List with dates]
+- **Site migrations/redesigns**: [List with dates]
+- **Key milestones**: [List with dates]
+
+---
+
+_This glossary helps decode project shorthand and maintain consistency across reports and communications._
+```
+
diff --git a/.cursor/skills/memory-management/references/hot-cache-template.md b/.cursor/skills/memory-management/references/hot-cache-template.md
new file mode 100644
index 0000000000..a57a46bd55
--- /dev/null
+++ b/.cursor/skills/memory-management/references/hot-cache-template.md
@@ -0,0 +1,96 @@
+# Hot Cache (CLAUDE.md) Template
+
+Use this template when initializing memory for a new SEO project.
+
+### 2. Hot Cache (CLAUDE.md) Structure
+
+Create or update CLAUDE.md with this template:
+
+```markdown
+# [Project Name] - SEO Memory (Hot Cache)
+
+**Last Updated**: [Date]
+**Project Status**: [Active/Maintenance/Growth]
+
+## Active Target Keywords (Top 10-20)
+
+### Hero Keywords (Priority 1)
+| Keyword | Current Rank | Target Rank | Volume | Status |
+|---------|--------------|-------------|--------|--------|
+| [keyword 1] | #[X] | #[Y] | [vol] | [tracking/optimizing/achieved] |
+| [keyword 2] | #[X] | #[Y] | [vol] | [tracking/optimizing/achieved] |
+
+### Secondary Keywords (Priority 2)
+| Keyword | Current Rank | Target Rank | Volume | Status |
+|---------|--------------|-------------|--------|--------|
+| [keyword 1] | #[X] | #[Y] | [vol] | [tracking/optimizing/achieved] |
+
+_Full keyword research: memory/research/keywords/_
+
+## Primary Competitors (Top 3-5)
+
+1. **[Competitor 1]** (DA: [X])
+   - Main strengths: [brief]
+   - Our position vs. them: [ahead/behind/competitive]
+
+2. **[Competitor 2]** (DA: [X])
+   - Main strengths: [brief]
+   - Our position vs. them: [ahead/behind/competitive]
+
+_Detailed analyses: memory/research/competitors/_
+
+## Current Optimization Priorities
+
+1. **[Priority 1]** - [Brief description]
+   - Status: [not started/in progress/completed]
+   - Expected impact: [high/medium/low]
+   - Deadline: [date]
+
+2. **[Priority 2]** - [Brief description]
+   - Status: [not started/in progress/completed]
+   - Expected impact: [high/medium/low]
+   - Deadline: [date]
+
+## Key Metrics Snapshot
+
+**Last Metrics Update**: [Date]
+
+| Metric | Current | Previous | Change | Target |
+|--------|---------|----------|--------|--------|
+| Organic Traffic | [X] | [X] | [+/-X%] | [target] |
+| Avg. Position | [X] | [X] | [+/-X] | [target] |
+| Total Keywords Ranking | [X] | [X] | [+/-X] | [target] |
+| Page 1 Rankings | [X] | [X] | [+/-X] | [target] |
+| Domain Authority | [X] | [X] | [+/-X] | [target] |
+| CITE Score | [X] | [X] | [+/-X] | [target] |
+| Last Content Audit Score | [score]/100 | ([rating]) | — [date] | [page audited] |
+| Total Backlinks | [X] | [X] | [+/-X%] | [target] |
+
+_Historical data: memory/monitoring/reports/ and memory/monitoring/rank-history/_
+
+## Active Campaigns
+
+### [Campaign Name 1]
+- **Duration**: [Start date] - [End date]
+- **Goal**: [Specific goal]
+- **Status**: [planning/active/completed]
+- **Progress**: [X]%
+- **Key activities**: [brief list]
+
+### [Campaign Name 2]
+- **Duration**: [Start date] - [End date]
+- **Goal**: [Specific goal]
+- **Status**: [planning/active/completed]
+- **Progress**: [X]%
+- **Key activities**: [brief list]
+
+_Campaign plans and archives: memory/content/calendar/_
+
+## Quick Reference Notes
+
+- [Important note 1]
+- [Important note 2]
+- [Important note 3]
+
+_For project terminology, see: memory/glossary.md_
+```
diff --git a/.cursor/skills/memory-management/references/promotion-demotion-rules.md b/.cursor/skills/memory-management/references/promotion-demotion-rules.md
new file mode 100644
index 0000000000..01332cb1b7
--- /dev/null
+++ b/.cursor/skills/memory-management/references/promotion-demotion-rules.md
@@ -0,0 +1,58 @@
+# Promotion and Demotion Rules
+
+Detailed triggers and actions for moving items between the hot cache (CLAUDE.md) and cold storage (memory/ subdirectories).
+
+## Promotion Logic
+
+**Promote to Hot Cache when:**
+
+### Keyword promotion triggers:
+- Keyword referenced in 3+ conversations within 7 days
+- Keyword showing significant movement (5+ position change)
+- Keyword targeted in new active campaign
+- Keyword suddenly driving traffic spike
+
+### Competitor promotion triggers:
+- Competitor mentioned in 3+ recent analyses
+- Competitor showing aggressive SEO activity
+- Competitor launching competing content
+
+### Metric promotion triggers:
+- Metric requested frequently for reporting
+- Metric showing anomalous behavior
+- Metric tied to active priority/campaign
+
+**Promotion action:**
+```markdown
+1. Add item to relevant section in CLAUDE.md
+2. Add note: "Promoted [date] - [reason]"
+3. Keep detailed data in cold storage
+4. Set reminder to review in 30 days
+```
+
+## Demotion Logic
+
+**Demote to Cold Storage when:**
+
+### Keyword demotion triggers:
+- Not referenced in 30+ days
+- Target rank achieved and stable for 60+ days
+- Keyword no longer in active strategy
+- Keyword replaced by higher-priority target
+
+### Competitor demotion triggers:
+- No significant activity in 60+ days
+- Fell out of top 5 competitors by relevance
+- No longer targeting same keywords
+
+### Campaign demotion triggers:
+- Campaign completed 30+ days ago
+- Campaign cancelled or postponed indefinitely
+
+**Demotion action:**
+```markdown
+1. Remove from CLAUDE.md
+2. Archive full data in memory/[category]/archive/
+3. Add line to CLAUDE.md: "Last reviewed [category]: [date]"
+4. Keep 1-line summary if historically significant
+```
diff --git a/.cursor/skills/memory-management/references/update-triggers-integration.md b/.cursor/skills/memory-management/references/update-triggers-integration.md
new file mode 100644
index 0000000000..00faa54c43
--- /dev/null
+++ b/.cursor/skills/memory-management/references/update-triggers-integration.md
@@ -0,0 +1,102 @@
+# Update Triggers & Cross-Skill Integration
+
+Systematic memory update procedures triggered by specific events, plus integration points with other SEO skills.
+
+## Update Triggers
+
+After specific events, update memory systematically:
+
+### After Ranking Check
+```markdown
+1. Update CLAUDE.md -> Hero Keywords table (current ranks)
+2. Save a dated snapshot to memory/monitoring/rank-history/YYYY-MM-DD-ranks.csv
+3. Note any keywords with significant movement
+4. Update "Last Metrics Update" date in CLAUDE.md
+5. If hero keyword moves +/-5 positions, create alert note
+```
+
+### After Competitor Analysis
+```markdown
+1. Update CLAUDE.md -> Primary Competitors section (DA, position vs. them)
+2. Save full report to memory/research/competitors/YYYY-MM-DD-analysis.md
+3. Update competitor overview notes in memory/research/competitors/
+4. Note new competitor strategies in hot cache
+```
+
+### After Audit (Technical/Content/Backlink)
+```markdown
+1. Save full report to memory/audits/[type]/YYYY-MM-DD-[audit-name].md
+2. Extract top 3-5 action items -> CLAUDE.md Current Optimization Priorities
+3. Update Key Metrics Snapshot if audit includes metrics
+4. Create campaign entry if audit spawns new initiative
+```
+
+### After Monthly/Quarterly Report
+```markdown
+1. Save report to memory/monitoring/reports/[period]/YYYY-MM-report.md
+2. Update all metrics in CLAUDE.md Key Metrics Snapshot
+3. Review hot cache -> demote stale items
+4. Update campaign statuses
+5. Archive completed campaigns
+```
+
+## Archive Management
+
+### Monthly Archive Routine
+```markdown
+1. Review CLAUDE.md for items not referenced in 30 days
+2. Move stale items to appropriate cold storage
+3. Create monthly snapshot: memory/monitoring/snapshots/YYYY-MM-CLAUDE.md
+4. Compress old rank-history exports (keep recent snapshots easiest to access)
+5. Update glossary with new terms from the month
+```
+
+### Quarterly Archive Routine
+```markdown
+1. Review entire cold storage structure
+2. Compress files older than 6 months
+3. Create quarterly summary report
+4. Update project timeline in glossary
+5. Audit all active campaigns -> archive completed ones
+```
+
+## Cross-Skill Memory Integration
+
+This skill coordinates with other SEO skills:
+
+### When keyword-research runs:
+- Add discovered keywords to memory/research/keywords/
+- Promote top opportunities to CLAUDE.md if high-value
+- Update glossary if new terminology emerges
+
+### When rank-tracker runs:
+- Update memory/monitoring/rank-history/
+- Refresh CLAUDE.md Hero Keywords table
+- Flag significant movements for hot cache notes
+
+### When competitor-analysis runs:
+- Update competitor files in memory/research/competitors/
+- Refresh CLAUDE.md Primary Competitors section
+- Add new competitors if they outrank current top 5
+
+### When content-gap-analysis runs:
+- Store full findings in memory/research/content-gaps/
+- Promote gap opportunities to CLAUDE.md priorities
+- Update memory/content/calendar/ with recommended topics
+
+### When seo-content-writer produces content:
+- Log to memory/content/published/YYYY-MM-DD-[slug].md
+- Track target keyword and publish date
+- Set reminder to check performance in 30 days
+
+### When content-quality-auditor runs:
+- Save full report to `memory/audits/content/YYYY-MM-DD-core-eeat-[page-slug].md`
+- Update CLAUDE.md Key Metrics with latest score
+- If score < 60 (Poor/Low), flag in Active Campaigns section
+- Track dimension scores for trend analysis
+
+### When domain-authority-auditor runs:
+- Save full report to memory/audits/domain/YYYY-MM-DD-cite-audit.md
+- Update CITE Score in CLAUDE.md Key Metrics Snapshot
+- Note veto item status and dimension scores
+- Compare against previous CITE audit if available
diff --git a/.cursor/skills/meta-tags-optimizer/SKILL.md b/.cursor/skills/meta-tags-optimizer/SKILL.md
new file mode 100644
index 0000000000..1e3a81c9f9
--- /dev/null
+++ b/.cursor/skills/meta-tags-optimizer/SKILL.md
@@ -0,0 +1,417 @@
+---
+name: meta-tags-optimizer
+description: 'Optimize title tags, meta descriptions, Open Graph, Twitter cards for maximum CTR with A/B variations. 标题优化/元描述/CTR'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when optimizing title tags, meta descriptions, Open Graph tags, or Twitter Cards for a page."
+argument-hint: "<page URL or content>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "low"
+  tags:
+    - seo
+    - meta-tags
+    - title-tag
+    - meta-description
+    - open-graph
+    - twitter-card
+    - ctr-optimization
+    - social-sharing
+    - 标题优化
+    - 元描述
+    - メタタグ
+    - 메타태그
+    - meta-tags-seo
+  triggers:
+    # EN-formal
+    - "optimize title tag"
+    - "write meta description"
+    - "improve CTR"
+    - "Open Graph tags"
+    - "social media preview"
+    - "title optimization"
+    - "meta tags"
+    - "Twitter cards"
+    # EN-casual
+    - "my title tag needs work"
+    - "low click-through rate"
+    - "fix my meta tags"
+    - "OG tags not showing"
+    - "social preview looks wrong"
+    - "my click rate is low"
+    # EN-question
+    - "how to write a good title tag"
+    - "how to improve click-through rate"
+    - "what is a good meta description"
+    # EN-competitor
+    - "Yoast SEO title tool"
+    - "RankMath title optimizer"
+    # ZH-pro
+    - "标题标签优化"
+    - "元描述优化"
+    - "OG标签"
+    - "点击率提升"
+    - "社交预览"
+    - "TDK优化"
+    # ZH-casual
+    - "标题不好"
+    - "点击率太低"
+    - "社交分享预览不对"
+    - "标题怎么写"
+    - "TDK怎么写"
+    # JA
+    - "メタタグ最適化"
+    - "タイトルタグ"
+    - "CTR改善"
+    # KO
+    - "메타 태그 최적화"
+    - "제목 태그"
+    - "클릭률 개선"
+    # ES
+    - "optimizar meta tags"
+    - "mejorar CTR"
+    - "etiquetas Open Graph"
+    # PT
+    - "otimizar meta tags"
+    # Misspellings
+    - "meta discription"
+    - "tittle tag"
+---
+
+# Meta Tags Optimizer
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This build skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill creates compelling, optimized meta tags that improve click-through rates from search results and enhance social media sharing. It covers title tags, meta descriptions, and social meta tags.
+
+**System role**: Build layer skill. It turns briefs and signals into assets that other skills can review, publish, and monitor.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a shippable asset or transformation that should feed directly into quality review, deployment, or monitoring.
+
+- Creating meta tags for new pages
+- Optimizing existing meta tags for better CTR
+- Preparing pages for social media sharing
+- Fixing duplicate or missing meta tags
+- A/B testing title and description variations
+- Optimizing for specific SERP features
+- Creating meta tags for different page types
+
+## What This Skill Does
+
+1. **Title Tag Creation**: Writes compelling, keyword-optimized titles
+2. **Meta Description Writing**: Creates click-worthy descriptions
+3. **Open Graph Optimization**: Prepares pages for social sharing
+4. **Twitter Card Setup**: Optimizes Twitter-specific meta tags
+5. **CTR Analysis**: Suggests improvements for better click rates
+6. **Character Counting**: Ensures proper length for SERP display
+7. **A/B Test Suggestions**: Provides variations for testing
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Create Meta Tags
+
+```
+Create meta tags for a page about [topic] targeting [keyword]
+```
+
+```
+Write title and meta description for this content: [content/URL]
+```
+
+### Optimize Existing Tags
+
+```
+Improve these meta tags for better CTR: [current tags]
+```
+
+### Social Media Tags
+
+```
+Create Open Graph and Twitter card tags for [page/URL]
+```
+
+## Skill Contract
+
+**Expected output**: a ready-to-use asset or implementation-ready transformation plus a short handoff summary ready for `memory/content/`.
+
+- **Reads**: the brief, target keywords, entity inputs, quality constraints, and prior decisions from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing content, metadata, or schema deliverable plus a reusable summary that can be stored under `memory/content/`.
+- **Promotes**: approved angles, messaging choices, missing evidence, and publish blockers to `CLAUDE.md`, `memory/decisions.md`, and `memory/open-loops.md`.
+- **Next handoff**: use the `Next Best Skill` below when the asset is ready for review or deployment.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~search console + ~~SEO tool connected:**
+Automatically pull current meta tags, CTR data by query, competitor title/description patterns, SERP preview data, and impression/click metrics to identify optimization opportunities.
+
+**With manual data only:**
+Ask the user to provide:
+1. Current title and meta description (if optimizing existing)
+2. Target primary keyword and 2-3 secondary keywords
+3. Page URL and main content/value proposition
+4. Competitor URLs or examples of well-performing titles in the SERP
+
+Proceed with the full workflow using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests meta tag optimization:
+
+1. **Gather Page Information**
+
+   ```markdown
+   ### Page Analysis
+   
+   **Page URL**: [URL]
+   **Page Type**: [blog/product/landing/service/homepage]
+   **Primary Keyword**: [keyword]
+   **Secondary Keywords**: [keywords]
+   **Target Audience**: [audience]
+   **Primary CTA**: [action you want users to take]
+   **Unique Value Prop**: [what makes this page special]
+   ```
+
+2. **Create Optimized Title Tag**
+
+   ```markdown
+   ### Title Tag Optimization
+   
+   **Requirements**:
+   - Length: 50-60 characters (displays fully in SERP)
+   - Include primary keyword (preferably near front)
+   - Make it compelling and click-worthy
+   - Match search intent
+   - Include brand name if appropriate
+   
+   **Title Tag Formula Options**:
+   
+   1. **Keyword | Benefit | Brand**
+      "[Primary Keyword]: [Benefit] | [Brand Name]"
+      
+   2. **Number + Keyword + Promise**
+      "[Number] [Keyword] That [Promise/Result]"
+      
+   3. **How-to Format**
+      "How to [Keyword]: [Benefit/Result]"
+      
+   4. **Question Format**
+      "What is [Keyword]? [Brief Answer/Hook]"
+      
+   5. **Year + Keyword**
+      "[Keyword] in [Year]: [Hook/Update]"
+   
+   **Generated Title Options**:
+   
+   | Option | Title | Length | Power Words | Keyword Position |
+   |--------|-------|--------|-------------|------------------|
+   | 1 | [Title] | [X] chars | [words] | [Front/Middle] |
+   | 2 | [Title] | [X] chars | [words] | [Front/Middle] |
+   | 3 | [Title] | [X] chars | [words] | [Front/Middle] |
+   
+   **Recommended**: Option [X]
+   **Reasoning**: [Why this option is best]
+   
+   **Title Tag Code**:
+   ```html
+   <title>[Selected Title]</title>
+   ```
+   ```
+
+3. **Write Meta Description**
+
+   ```markdown
+   ### Meta Description Optimization
+   
+   **Requirements**:
+   - Length: 150-160 characters (displays fully in SERP)
+   - Include primary keyword naturally
+   - Include clear call-to-action
+   - Match page content accurately
+   - Create urgency or curiosity
+   - Avoid duplicate descriptions
+   
+   **Meta Description Formula**:
+   
+   [What the page offers] + [Benefit to user] + [Call-to-action]
+   
+   **Power Elements to Include**:
+   - Numbers and statistics
+   - Current year
+   - Emotional triggers
+   - Action verbs
+   - Unique value proposition
+   
+   **Generated Description Options**:
+   
+   | Option | Description | Length | CTA | Emotional Trigger |
+   |--------|-------------|--------|-----|-------------------|
+   | 1 | [Description] | [X] chars | [CTA] | [Trigger] |
+   | 2 | [Description] | [X] chars | [CTA] | [Trigger] |
+   | 3 | [Description] | [X] chars | [CTA] | [Trigger] |
+   
+   **Recommended**: Option [X]
+   **Reasoning**: [Why this option is best]
+   
+   **Meta Description Code**:
+   ```html
+   <meta name="description" content="[Selected Description]">
+   ```
+   ```
+
+4. **Create Open Graph, Twitter Card, and Additional Meta Tags**
+
+   Generate OG tags (og:type, og:url, og:title, og:description, og:image), Twitter Card tags, canonical URL, robots, viewport, author, and article-specific tags. Then combine into a complete meta tag block.
+
+   > **Reference**: See [references/meta-tag-code-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/meta-tags-optimizer/references/meta-tag-code-templates.md) for OG type selection guide, Twitter card type selection, all HTML code templates, and the complete meta tag block template.
+
+5. **CORE-EEAT Alignment Check**
+
+   Verify meta tags align with content quality standards. Reference: [CORE-EEAT Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md)
+
+   ```markdown
+   ### CORE-EEAT Meta Tag Alignment
+
+   | Check | Status | Notes |
+   |-------|--------|-------|
+   | **C01 Intent Alignment**: Title promise matches actual content delivery | ✅/⚠️/❌ | [Does the title accurately represent what the page delivers?] |
+   | **C02 Direct Answer**: Meta description reflects the core answer available in first 150 words | ✅/⚠️/❌ | [Does the description preview the direct answer?] |
+
+   **If C01 fails**: Title is misleading — rewrite to match actual content.
+   **If C02 fails**: Content may need restructuring to front-load the answer, or description should better reflect available content.
+   ```
+
+9. **Provide CTR Optimization Tips**
+
+   ```markdown
+   ## CTR Optimization Analysis
+
+   ### Power Words Used
+   - [Word 1] - Creates [emotion/action]
+   - [Word 2] - Creates [emotion/action]
+
+   ### CTR Boosting Elements
+
+   | Element | Present | Impact |
+   |---------|---------|--------|
+   | Numbers | Yes/No | +20-30% CTR |
+   | Current Year | Yes/No | +15-20% CTR |
+   | Power Words | Yes/No | +10-15% CTR |
+   | Question | Yes/No | +10-15% CTR |
+   | Brackets | Yes/No | +10% CTR |
+
+   ### A/B Test Suggestions
+
+   Test these variations:
+
+   **Version A** (Current):
+   - Title: [Title]
+   - Description: [Description]
+
+   **Version B** (Test):
+   - Title: [Alternative title]
+   - Description: [Alternative description]
+   - Hypothesis: [Why this might perform better]
+   ```
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Primary keyword confirmed and matches page content
+- [ ] Page type identified (blog/product/landing/service/homepage)
+- [ ] Target audience and search intent clearly defined
+- [ ] Unique value proposition articulated
+
+### Output Validation
+- [ ] Title length 50-60 characters (displays fully in SERP)
+- [ ] Meta description length 150-160 characters
+- [ ] Primary keyword appears in both title and description
+- [ ] Open Graph image specified (1200x630px recommended)
+- [ ] All HTML syntax valid (no unclosed quotes or tags)
+- [ ] Source of each data point clearly stated (~~search console CTR data, ~~SEO tool competitor data, user-provided, or estimated)
+
+## Example
+
+**User**: "Create meta tags for a blog post about 'how to start a podcast in [current year]'"
+
+**Output**:
+
+```markdown
+## Meta Tags: How to Start a Podcast ([current year])
+
+### Title Tag
+```html
+<title>How to Start a Podcast in [current year]: Complete Beginner's Guide</title>
+```
+**Length**: ~55 characters ✅
+**Keyword**: "how to start a podcast" at front ✅
+**Power Words**: "Complete", "Beginner's" ✅
+
+### Meta Description
+```html
+<meta name="description" content="Learn how to start a podcast in [current year] with our step-by-step guide. Covers equipment, hosting, recording, and launching your first episode. Start podcasting today!">
+```
+**Length**: ~163 characters ✅
+**Keyword**: Included naturally ✅
+**CTA**: "Start podcasting today!" ✅
+
+_Complete meta tag block (with OG, Twitter, Article tags) generated using template from [references/meta-tag-code-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/meta-tags-optimizer/references/meta-tag-code-templates.md)._
+
+### A/B Test Variations
+
+**Title Variation B**:
+"Start a Podcast in [current year]: Step-by-Step Guide (+ Free Checklist)"
+
+**Title Variation C**:
+"How to Start a Podcast: [current year] Guide [Equipment + Software + Tips]"
+
+**Description Variation B**:
+"Want to start a podcast in [current year]? This guide covers everything: equipment ($100 budget option), best hosting platforms, recording tips, and how to get your first 1,000 listeners."
+```
+
+## Tips for Success
+
+1. **Front-load keywords** - Put important terms at the beginning
+2. **Match intent** - Description should preview what page delivers
+3. **Be specific** - Vague descriptions get ignored
+4. **Test variations** - Small changes can significantly impact CTR
+5. **Update regularly** - Add current year, refresh messaging
+6. **Check competitors** - See what's working in your SERP
+
+
+### Save Results
+
+After delivering content or optimization output to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/content/YYYY-MM-DD-<topic>.md` containing:
+- One-line description of what was created
+- Target keyword and content type
+- Open loops or items needing review
+- Source data references
+
+**Gate check recommended**: Run content-quality-auditor before publishing (PostToolUse hook will remind automatically).
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Meta Tag Formulas](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/meta-tags-optimizer/references/meta-tag-formulas.md) — Proven title and description formulas
+- [CTR and Social Reference](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/meta-tags-optimizer/references/ctr-and-social-reference.md) — Page-type templates, CTR data, OG best practices
+
+## Next Best Skill
+
+- **Primary**: [schema-markup-generator](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/schema-markup-generator/SKILL.md) — complete the SERP packaging with structured data.
diff --git a/.cursor/skills/meta-tags-optimizer/references/ctr-and-social-reference.md b/.cursor/skills/meta-tags-optimizer/references/ctr-and-social-reference.md
new file mode 100644
index 0000000000..8fc5205c80
--- /dev/null
+++ b/.cursor/skills/meta-tags-optimizer/references/ctr-and-social-reference.md
@@ -0,0 +1,149 @@
+# CTR Optimization, Social Tags, and Page-Type Templates
+
+Reference tables for click-through rate optimization, Open Graph best practices, meta description copywriting frameworks, and page-type meta tag templates.
+
+---
+
+## Page-Type Meta Tag Templates
+
+### Homepage
+
+```html
+<title>[Brand Name] - [Primary Value Proposition]</title>
+<meta name="description" content="[Brand] helps [audience] [achieve goal]. [Key feature/benefit]. [CTA]">
+```
+
+### Product Page
+
+```html
+<title>[Product Name] - [Key Benefit] | [Brand]</title>
+<meta name="description" content="[Product] [key features]. [Price/offer if applicable]. [Social proof]. [CTA]">
+```
+
+### Blog Post
+
+```html
+<title>[How to/What is/Number] [Keyword] [Benefit/Year]</title>
+<meta name="description" content="[What they'll learn]. [Key points covered]. [CTA]">
+```
+
+### Service Page
+
+```html
+<title>[Service] in [Location] - [Brand] | [Differentiator]</title>
+<meta name="description" content="[Service description]. [Experience/credentials]. [Key benefit]. [CTA]">
+```
+
+---
+
+## Title Tag Formula Patterns
+
+### Proven Title Formulas
+
+| Formula | Template | Example | Best For |
+|---------|----------|---------|---------|
+| How-To | How to [Achieve Result] ([Year]) | How to Improve SEO Rankings (2026) | Informational guides |
+| Number List | [Number] [Adjective] [Topic] for [Audience] | 15 Proven SEO Strategies for E-commerce | Listicles |
+| Question | [Question]? Here's [Answer Preview] | What Is Technical SEO? Complete Guide | Definition content |
+| Comparison | [Option A] vs [Option B]: [Differentiator] | Ahrefs vs SEMrush: Which SEO Tool Wins? | Comparison pages |
+| Year-Based | Best [Topic] in [Year] ([Tested/Ranked]) | Best SEO Tools in 2026 (Expert Tested) | Evergreen + fresh |
+| Benefit-Driven | [Benefit]: [How to Achieve It] | Rank #1 on Google: 10-Step SEO Checklist | High-CTR pages |
+| Parenthetical | [Topic] ([Modifier]) | Link Building Guide (With Templates) | Adding value signal |
+| Negative | [Number] [Topic] Mistakes [Consequence] | 7 SEO Mistakes That Kill Your Rankings | Problem-awareness content |
+
+### Title Tag Power Words
+
+| Category | Power Words | Use Sparingly |
+|----------|-----------|---------------|
+| Urgency | Now, Today, Quick, Fast, Instant, 2026 | Don't overuse; pair with substance |
+| Value | Free, Proven, Complete, Essential, Ultimate | Avoid hyperbole without backing |
+| Specificity | [Exact number], Step-by-Step, Checklist, Template | More specific = more clicks |
+| Curiosity | Secret, Little-Known, Surprising, Actually | Must deliver on the promise |
+| Authority | Expert, Research-Backed, Data-Driven, Tested | Only use when genuinely expert content |
+| Emotional | Best, Worst, Mistakes, Warning, Powerful | Balance emotion with credibility |
+
+### Title Tag Length Optimization
+
+| Length Range | SERP Behavior | Recommendation |
+|-------------|--------------|----------------|
+| <30 characters | May appear thin | Expand with modifier or brand |
+| 30-50 characters | Fully displayed on all devices | Ideal for mobile-first |
+| 50-60 characters | Full display on desktop, may truncate mobile | Sweet spot for desktop |
+| 60-65 characters | Truncated on some devices | Front-load keywords |
+| >65 characters | Truncated everywhere | Avoid — key info gets cut |
+
+---
+
+## Meta Description Copywriting Frameworks
+
+### AIDA Framework
+
+| Element | What It Does | Example |
+|---------|-------------|---------|
+| **A**ttention | Hook with a bold claim or question | "Want to double your organic traffic?" |
+| **I**nterest | Build relevance | "This guide covers 15 proven strategies..." |
+| **D**esire | Show benefit | "...used by top-ranking sites to increase traffic by 200%." |
+| **A**ction | CTA | "Read the full guide now." |
+
+**Full Example**: "Want to double your organic traffic? This guide covers 15 proven SEO strategies used by top-ranking sites to increase traffic by 200%. Read the full guide now." (158 chars)
+
+### PAS Framework
+
+| Element | What It Does | Example |
+|---------|-------------|---------|
+| **P**roblem | Identify pain point | "Struggling to rank on Google?" |
+| **A**gitate | Amplify the problem | "Most SEO guides are outdated and miss key ranking factors." |
+| **S**olution | Offer the solution | "Our 2026 guide covers what actually works. Read now." |
+
+### Benefit-Proof-CTA Framework
+
+| Element | Example |
+|---------|---------|
+| Benefit | "Learn the exact SEO process that grew our traffic 5x" |
+| Proof | "— with real data from 200+ audits." |
+| CTA | "Get the free checklist." |
+
+---
+
+## CTR Optimization Data
+
+### Factors That Increase Organic CTR
+
+| Factor | CTR Impact | Implementation |
+|--------|-----------|----------------|
+| Number in title | +20-30% | "7 Ways", "15 Tips", "2026" |
+| Question in title | +14% | Start with How, What, Why |
+| Emotional words | +7% | "Proven", "Essential", "Mistake" |
+| Brackets/parentheses | +38% | "[Guide]", "(With Examples)" |
+| Current year | +10-15% | "Best SEO Tools 2026" |
+| Power words | +12% | See Power Words table above |
+| Match search intent | +15-25% | Align title with SERP expectations |
+| Rich results (schema) | +30% | FAQ, HowTo, Review stars |
+
+---
+
+## Open Graph Best Practices by Platform
+
+### Platform-Specific OG Optimization
+
+| Platform | Image Size | Title Length | Description Length | Special Tags |
+|----------|-----------|-------------|-------------------|-------------|
+| Facebook | 1200x630px | 40-60 chars | 125-155 chars | og:type, og:locale |
+| Twitter/X | 1200x600px | 70 chars max | 200 chars | twitter:card, twitter:site |
+| LinkedIn | 1200x627px | 70 chars | 150 chars | Standard OG tags |
+| Pinterest | 1000x1500px (2:3) | 100 chars | 500 chars | og:type=article preferred |
+| Slack | 500x500px min | Full title | First ~300 chars | Standard OG tags |
+
+### OG Tag Checklist
+
+| Tag | Required? | Notes |
+|-----|----------|-------|
+| og:title | Yes | Can differ from HTML title; optimize for social sharing |
+| og:description | Yes | Social-optimized; can differ from meta description |
+| og:image | Yes | Must be absolute URL; minimum 200x200px, ideal 1200x630px |
+| og:url | Yes | Canonical URL |
+| og:type | Yes | "website" for homepage, "article" for blog posts |
+| og:site_name | Recommended | Your brand name |
+| og:locale | Recommended | e.g., "en_US" |
+| twitter:card | Recommended | "summary_large_image" for articles |
+| twitter:site | Recommended | @yourbrand |
diff --git a/.cursor/skills/meta-tags-optimizer/references/meta-tag-code-templates.md b/.cursor/skills/meta-tags-optimizer/references/meta-tag-code-templates.md
new file mode 100644
index 0000000000..88d5b62867
--- /dev/null
+++ b/.cursor/skills/meta-tags-optimizer/references/meta-tag-code-templates.md
@@ -0,0 +1,153 @@
+# Meta Tag Code Templates
+
+HTML code templates for Open Graph tags, Twitter cards, additional meta tags, and complete meta tag blocks.
+
+---
+
+## Open Graph Tags (Facebook, LinkedIn, etc.)
+
+**Required OG Tags**:
+
+```html
+<!-- Primary Open Graph Tags -->
+<meta property="og:type" content="[article/website/product]">
+<meta property="og:url" content="[Full canonical URL]">
+<meta property="og:title" content="[OG-optimized title - up to 60 chars]">
+<meta property="og:description" content="[OG description - up to 200 chars]">
+<meta property="og:image" content="[Image URL - 1200x630px recommended]">
+
+<!-- Optional but Recommended -->
+<meta property="og:site_name" content="[Website Name]">
+<meta property="og:locale" content="en_US">
+```
+
+**OG Type Selection Guide**:
+
+| Page Type | og:type |
+|-----------|---------|
+| Blog post | article |
+| Homepage | website |
+| Product | product |
+| Video | video.other |
+| Profile | profile |
+
+**OG Title Considerations**:
+- Can be different from title tag
+- Optimize for social sharing context
+- More conversational tone acceptable
+- Up to 60 characters ideal
+
+**OG Description Considerations**:
+- Can be longer than meta description (up to 200 chars)
+- Focus on shareability
+- What would make someone click when shared?
+
+**OG Image Requirements**:
+- Recommended size: 1200x630 pixels
+- Minimum size: 600x315 pixels
+- Format: JPG or PNG
+- Keep text to less than 20% of image
+- Include branding subtly
+
+---
+
+## Twitter Card Tags
+
+**Card Type Selection**:
+
+| Card Type | Best For | Image Size |
+|-----------|----------|------------|
+| summary | Articles, blogs | 144x144 min |
+| summary_large_image | Visual content | 300x157 min |
+| player | Video/audio | 640x360 min |
+| app | Mobile apps | 800x418 |
+
+**Twitter Card Code**:
+
+```html
+<!-- Twitter Card Tags -->
+<meta name="twitter:card" content="[summary_large_image/summary]">
+<meta name="twitter:site" content="@[YourTwitterHandle]">
+<meta name="twitter:creator" content="@[AuthorTwitterHandle]">
+<meta name="twitter:title" content="[Title - 70 chars max]">
+<meta name="twitter:description" content="[Description - 200 chars max]">
+<meta name="twitter:image" content="[Image URL]">
+<meta name="twitter:image:alt" content="[Image description for accessibility]">
+```
+
+**Twitter-Specific Considerations**:
+- Shorter titles work better (under 70 chars)
+- Include @mentions if relevant
+- Hashtag-relevant terms can help discovery
+- Test with Twitter Card Validator
+
+---
+
+## Additional Recommended Meta Tags
+
+**Canonical URL** (Prevent duplicates):
+```html
+<link rel="canonical" href="[Preferred URL]">
+```
+
+**Robots Tag** (Indexing control):
+```html
+<meta name="robots" content="index, follow">
+```
+
+**Viewport** (Mobile optimization):
+```html
+<meta name="viewport" content="width=device-width, initial-scale=1">
+```
+
+**Author** (For articles):
+```html
+<meta name="author" content="[Author Name]">
+```
+
+**Language**:
+```html
+<html lang="en">
+```
+
+**Article-Specific** (For blog posts):
+```html
+<meta property="article:published_time" content="[ISO 8601 date]">
+<meta property="article:modified_time" content="[ISO 8601 date]">
+<meta property="article:author" content="[Author URL]">
+<meta property="article:section" content="[Category]">
+<meta property="article:tag" content="[Tag 1]">
+```
+
+---
+
+## Complete Meta Tag Block Template
+
+Copy and paste this complete meta tag block:
+
+```html
+<!-- Primary Meta Tags -->
+<title>[Optimized Title]</title>
+<meta name="title" content="[Optimized Title]">
+<meta name="description" content="[Optimized Description]">
+<link rel="canonical" href="[Canonical URL]">
+
+<!-- Open Graph / Facebook -->
+<meta property="og:type" content="[type]">
+<meta property="og:url" content="[URL]">
+<meta property="og:title" content="[OG Title]">
+<meta property="og:description" content="[OG Description]">
+<meta property="og:image" content="[Image URL]">
+<meta property="og:site_name" content="[Site Name]">
+
+<!-- Twitter -->
+<meta name="twitter:card" content="summary_large_image">
+<meta name="twitter:url" content="[URL]">
+<meta name="twitter:title" content="[Twitter Title]">
+<meta name="twitter:description" content="[Twitter Description]">
+<meta name="twitter:image" content="[Image URL]">
+
+<!-- Additional -->
+<meta name="robots" content="index, follow">
+<meta name="author" content="[Author]">
+```
diff --git a/.cursor/skills/meta-tags-optimizer/references/meta-tag-formulas.md b/.cursor/skills/meta-tags-optimizer/references/meta-tag-formulas.md
new file mode 100644
index 0000000000..dd1399f2be
--- /dev/null
+++ b/.cursor/skills/meta-tags-optimizer/references/meta-tag-formulas.md
@@ -0,0 +1,355 @@
+# Meta Tag Formulas Reference
+
+Extended title tag formulas, meta description templates, A/B testing methodology, seasonal patterns, CTR benchmarks, and multi-language optimization guidance.
+
+---
+
+## Title Tag Formulas by Content Type
+
+### Informational Content
+
+| # | Formula | Template | Example |
+|---|---------|----------|---------|
+| 1 | How-To + Year | How to [Action] in [Year] | How to Build Backlinks in 2026 |
+| 2 | How-To + Result | How to [Action] ([Result]) | How to Write Meta Tags (Rank Higher) |
+| 3 | Definitive Guide | The Definitive Guide to [Topic] | The Definitive Guide to Technical SEO |
+| 4 | Complete Guide | [Topic]: The Complete Guide ([Year]) | Schema Markup: The Complete Guide (2026) |
+| 5 | Beginner's Guide | [Topic] for Beginners: [Promise] | SEO for Beginners: Rank in 30 Days |
+| 6 | What Is | What Is [Topic]? [Short Clarifier] | What Is E-E-A-T? Google's Quality Standard |
+| 7 | Everything You Need | Everything You Need to Know About [Topic] | Everything You Need to Know About Core Web Vitals |
+| 8 | Step-by-Step | [Topic]: A Step-by-Step Guide | Link Building: A Step-by-Step Guide |
+
+### Listicle Content
+
+| # | Formula | Template | Example |
+|---|---------|----------|---------|
+| 9 | Numbered Tips | [N] [Topic] Tips That [Result] | 12 On-Page SEO Tips That Boost Rankings |
+| 10 | Best Of | [N] Best [Items] in [Year] ([Qualifier]) | 9 Best SEO Tools in 2026 (Expert Tested) |
+| 11 | Ways To | [N] Ways to [Achieve Goal] [Timeframe] | 7 Ways to Increase Organic Traffic This Month |
+| 12 | Strategies | [N] [Adjective] [Topic] Strategies for [Audience] | 15 Proven Content Strategies for SaaS |
+| 13 | Examples | [N] [Topic] Examples [Value Add] | 20 Meta Description Examples (+ Templates) |
+| 14 | Mistakes | [N] [Topic] Mistakes [Negative Consequence] | 10 Title Tag Mistakes That Hurt Your CTR |
+| 15 | Secrets | [N] [Topic] Secrets [Experts/Pros] Use | 5 SEO Secrets Top Agencies Use |
+| 16 | Reasons | [N] Reasons [Why/To] [Statement] | 8 Reasons Your Site Isn't Ranking |
+
+### Comparison Content
+
+| # | Formula | Template | Example |
+|---|---------|----------|---------|
+| 17 | A vs B | [A] vs [B]: [Differentiator] | Ahrefs vs SEMrush: Which Tool Is Better? |
+| 18 | A vs B + Year | [A] vs [B] ([Year]): [Angle] | WordPress vs Webflow (2026): Honest Comparison |
+| 19 | A vs B + Winner | [A] vs [B]: Which [Category] Wins? | Yoast vs Rank Math: Which SEO Plugin Wins? |
+| 20 | Alternatives | [N] Best [Product] Alternatives in [Year] | 7 Best Ahrefs Alternatives in 2026 |
+
+### Commercial / Transactional Content
+
+| # | Formula | Template | Example |
+|---|---------|----------|---------|
+| 21 | Product + Benefit | [Product] - [Primary Benefit] \| [Brand] | SEO Audit Tool - Find Issues in Minutes \| SiteChecker |
+| 22 | Pricing | [Product] Pricing: [Value Statement] | Semrush Pricing: Plans from $119/mo (2026) |
+| 23 | Review | [Product] Review ([Year]): [Verdict] | Surfer SEO Review (2026): Worth the Price? |
+| 24 | Free Tool | Free [Tool Type]: [What It Does] | Free Meta Tag Generator: Preview Your SERP Listing |
+
+### Local SEO Content
+
+| # | Formula | Template | Example |
+|---|---------|----------|---------|
+| 25 | Service + Location | [Service] in [City] - [Brand] \| [Differentiator] | Plumber in Austin - FastFix \| Same-Day Service |
+| 26 | Best + Location | Best [Service] in [City] ([Year]) | Best Coffee Shops in Portland (2026) |
+| 27 | Near Me Style | [Service] Near [Location]: [Value] | SEO Agency Near Chicago: Free Site Audit |
+| 28 | Location Guide | [Topic] in [Location]: [Qualifier] Guide | Restaurants in Brooklyn: The Local's Guide |
+
+### Problem-Awareness Content
+
+| # | Formula | Template | Example |
+|---|---------|----------|---------|
+| 29 | Warning | Warning: [Problem] [Consequence] | Warning: These SEO Tactics Will Get You Penalized |
+| 30 | Why + Problem | Why [Problem Exists] (And How to Fix It) | Why Your Meta Descriptions Are Ignored (And How to Fix It) |
+| 31 | Stop Doing | Stop [Bad Practice]: [Better Alternative] | Stop Keyword Stuffing: What to Do Instead |
+| 32 | Truth About | The Truth About [Topic] [Revelation] | The Truth About SEO in 2026: What Changed |
+
+---
+
+## Meta Description Templates
+
+### By Content Type
+
+#### Blog Posts / Articles
+
+| # | Template | Character Count |
+|---|----------|-----------------|
+| 1 | Learn [topic] with our [qualifier] guide. Covers [point 1], [point 2], and [point 3]. [CTA]. | ~140-155 |
+| 2 | [Question]? This [year] guide explains [what], [why], and [how]. Get actionable tips now. | ~130-150 |
+| 3 | Discover [N] [adjective] [topic] strategies that [result]. Backed by [proof element]. Read the full guide. | ~145-160 |
+| 4 | [Bold claim about topic]. We tested [N] [methods/tools] and found what works. See the results. | ~130-150 |
+
+#### Product / Service Pages
+
+| # | Template | Character Count |
+|---|----------|-----------------|
+| 5 | [Product] helps you [benefit]. [Feature 1], [Feature 2], [Feature 3]. [Price/offer]. [CTA]. | ~140-155 |
+| 6 | Looking for [solution]? [Product] [key differentiator]. Trusted by [social proof]. [CTA]. | ~130-150 |
+| 7 | [Product] - [primary benefit] in [timeframe]. [Star rating] from [N]+ reviews. [CTA]. | ~130-145 |
+
+#### E-commerce / Category Pages
+
+| # | Template | Character Count |
+|---|----------|-----------------|
+| 8 | Shop [category] from [Brand]. [N]+ [items] starting at $[price]. Free shipping on orders over $[amount]. | ~145-160 |
+| 9 | Browse our [adjective] collection of [products]. [Unique selling point]. [Guarantee/offer]. Order today. | ~140-155 |
+
+#### Comparison Pages
+
+| # | Template | Character Count |
+|---|----------|-----------------|
+| 10 | [A] vs [B]: which is better for [use case]? We compared [criteria]. See the winner + detailed breakdown. | ~145-160 |
+| 11 | Comparing [A] and [B] on [criteria 1], [criteria 2], and price. Our [year] verdict inside. | ~130-145 |
+
+#### Local Business Pages
+
+| # | Template | Character Count |
+|---|----------|-----------------|
+| 12 | [Service] in [City] by [Brand]. [Experience] years of experience. [N]-star rated. [CTA] — call [phone] today. | ~150-160 |
+| 13 | Need [service] in [location]? [Brand] offers [differentiator]. [Guarantee]. Book your [offer] now. | ~140-155 |
+
+---
+
+## A/B Testing Methodology for Titles and Descriptions
+
+### Pre-Test Preparation
+
+1. **Baseline metrics** — Record current CTR, average position, impressions, and clicks for the target page (minimum 30 days of data from ~~search console).
+2. **Hypothesis** — State what you expect to change and why: "Adding a number to the title will increase CTR by X% because listicle titles outperform generic ones at this SERP position."
+3. **Single variable** — Change only ONE element per test (title OR description, not both).
+4. **Minimum sample** — Wait until the page has accumulated at least 1,000 impressions in the test period before drawing conclusions.
+
+### Test Execution Steps
+
+| Step | Action | Duration |
+|------|--------|----------|
+| 1 | Record baseline CTR for 30 days | 30 days |
+| 2 | Implement title/description change | Day 0 |
+| 3 | Allow Google to re-crawl and update SERP | 3-7 days |
+| 4 | Monitor new CTR (exclude first 7 days) | 30+ days |
+| 5 | Compare new CTR vs. baseline at same avg. position | — |
+| 6 | Decide: keep, revert, or iterate | — |
+
+### What to Test (Priority Order)
+
+| Priority | Element | Test Variable | Expected Impact |
+|----------|---------|--------------|-----------------|
+| 1 | Title tag | Add/remove number | +15-25% CTR |
+| 2 | Title tag | Add/remove year | +10-15% CTR |
+| 3 | Title tag | Add brackets/parentheses | +10-38% CTR |
+| 4 | Title tag | Change power word | +5-12% CTR |
+| 5 | Title tag | Reorder keyword placement | +5-10% CTR |
+| 6 | Meta description | Add CTA | +5-10% CTR |
+| 7 | Meta description | Add numbers/statistics | +5-15% CTR |
+| 8 | Meta description | Change emotional tone | +3-8% CTR |
+
+### Interpreting Results
+
+| Scenario | Interpretation | Action |
+|----------|---------------|--------|
+| CTR up, position stable | Title/description change is working | Keep the change |
+| CTR up, position up | Improved engagement signals helping rankings | Keep and document |
+| CTR down, position stable | Change was negative | Revert immediately |
+| CTR flat, position stable | Change had no impact | Try a bolder variation |
+| CTR up, position down | Higher CTR but lower impressions | Evaluate net click change |
+
+### Statistical Significance Checklist
+
+- [ ] Minimum 1,000 impressions in test period
+- [ ] Test ran for at least 30 days (after re-crawl)
+- [ ] Average position did not shift more than 2 positions
+- [ ] No major algorithm updates during test period
+- [ ] No seasonal bias (compare similar months year-over-year)
+- [ ] CTR change exceeds 10% relative difference
+
+---
+
+## Seasonal Title Patterns
+
+### Calendar-Driven Updates
+
+| Season/Event | Title Modifier | Example |
+|-------------|---------------|---------|
+| New Year (Jan) | [Year] Edition, Updated for [Year] | SEO Checklist: 2026 Edition |
+| Q1 Planning | [Year] Strategy, Q1 [Year] | Content Strategy for Q1 2026 |
+| Spring | Spring [Year], Fresh | Spring 2026 SEO Audit Checklist |
+| Back to School (Aug-Sep) | Fall [Year], New Season | Fall 2026 Marketing Trends |
+| Black Friday (Nov) | Black Friday [Year], Holiday | Black Friday SEO Prep Guide (2026) |
+| End of Year (Dec) | [Year] Wrap-Up, [Next Year] Preview | 2026 SEO Wrap-Up + 2027 Predictions |
+| Any month | [Month] [Year] | Best SEO Tools (February 2026) |
+
+### Evergreen + Fresh Hybrid Strategy
+
+Keep the core title evergreen but append a freshness signal:
+
+| Base Title (Evergreen) | Freshness Signal | Combined |
+|-----------------------|------------------|----------|
+| How to Start a Blog | (Updated Feb 2026) | How to Start a Blog (Updated Feb 2026) |
+| Best CRM Software | [2026 Rankings] | Best CRM Software [2026 Rankings] |
+| SEO Beginner's Guide | - Latest Edition | SEO Beginner's Guide - Latest Edition |
+
+**Update cadence**: Refresh the year/date modifier every 3-6 months for pages targeting "best", "top", or year-based queries.
+
+---
+
+## CTR Benchmarks by Industry
+
+### Average Organic CTR by SERP Position (All Industries)
+
+| Position | Desktop CTR | Mobile CTR | Combined Avg |
+|----------|------------|------------|-------------|
+| 1 | 31.7% | 26.9% | 29.3% |
+| 2 | 17.5% | 15.1% | 16.3% |
+| 3 | 11.3% | 10.2% | 10.8% |
+| 4 | 7.9% | 7.1% | 7.5% |
+| 5 | 5.8% | 5.2% | 5.5% |
+| 6 | 4.4% | 3.9% | 4.2% |
+| 7 | 3.5% | 3.1% | 3.3% |
+| 8 | 2.9% | 2.5% | 2.7% |
+| 9 | 2.5% | 2.1% | 2.3% |
+| 10 | 2.2% | 1.8% | 2.0% |
+
+### CTR Variation by Industry Vertical
+
+| Industry | Avg CTR (Pos 1) | Avg CTR (Pos 1-3) | Notes |
+|----------|----------------|-------------------|-------|
+| Technology / SaaS | 25-30% | 50-55% | High competition; rich results common |
+| E-commerce | 20-25% | 40-45% | Shopping results reduce organic CTR |
+| Healthcare | 28-35% | 55-60% | Users seek authoritative sources |
+| Finance | 22-28% | 45-50% | Featured snippets capture clicks |
+| Travel | 18-24% | 38-44% | Maps and hotel packs reduce CTR |
+| Legal | 30-38% | 58-65% | High intent; fewer SERP features |
+| Education | 26-32% | 52-58% | Informational queries; high trust needed |
+| Real Estate | 20-26% | 42-48% | Local packs reduce organic CTR |
+| Food / Recipes | 15-22% | 35-42% | Recipe carousels dominate top positions |
+| B2B Services | 28-34% | 55-62% | Less SERP competition; niche queries |
+
+### When Your CTR Is Below Benchmark
+
+| CTR vs. Benchmark | Likely Cause | Recommended Action |
+|-------------------|-------------|-------------------|
+| 50%+ below | Title does not match intent | Rewrite title to align with query intent |
+| 25-50% below | Weak title or strong SERP features | Add power words, numbers, or brackets |
+| 10-25% below | Competitors have better titles | Analyze top 3 competitor titles and differentiate |
+| At benchmark | Performing as expected | Test incremental improvements |
+| Above benchmark | Strong title; optimize further | Protect current title; test description instead |
+
+---
+
+## Multi-Language Title Optimization
+
+### General Principles
+
+1. **Do not translate literally** -- Adapt the title formula to the target language's natural phrasing and cultural norms.
+2. **Character limits vary** -- CJK characters are wider; effective SERP display length is shorter by character count.
+3. **Keyword research per locale** -- Search volume and intent differ across languages. Always do locale-specific keyword research.
+4. **Brand name placement** -- Some markets prefer brand-first; others prefer keyword-first.
+
+### Language-Specific Considerations
+
+| Language | Max Chars (SERP) | Key Differences | Example Adaptation |
+|----------|-----------------|-----------------|-------------------|
+| English | 55-60 | Standard baseline | How to Learn SEO (2026 Guide) |
+| Spanish | 55-60 | Longer phrases; use abbreviations | Como Aprender SEO: Guia Completa 2026 |
+| French | 55-60 | Accented characters count same | Guide SEO Complet : Comment se Classer en 2026 |
+| German | 50-55 | Compound words are long | SEO-Leitfaden 2026: Schritt-fur-Schritt |
+| Japanese | 30-35 | CJK characters take ~2x width | SEO対策ガイド2026【初心者向け】 |
+| Chinese (Simplified) | 28-32 | CJK characters; brackets common | SEO优化指南（2026完整版） |
+| Korean | 30-35 | CJK width; particles add length | SEO 최적화 가이드 2026 [초보자용] |
+| Arabic | 50-55 | RTL display; test SERP rendering | دليل تحسين محركات البحث 2026 |
+| Portuguese (BR) | 55-60 | Similar to Spanish | Guia de SEO 2026: Como Ranquear no Google |
+| Hindi | 45-50 | Devanagari takes more width | SEO कैसे करें: 2026 गाइड |
+
+### Multi-Language Meta Description Tips
+
+| Principle | Detail |
+|-----------|--------|
+| Localize the CTA | "Learn more" may not translate well; use culturally appropriate CTAs |
+| Adjust character targets | For CJK, aim for 70-80 characters; for Latin-script languages, 150-160 |
+| Use local proof points | Reference local statistics, regulations, or market leaders |
+| Test SERP rendering | Use Google's SERP preview in the target locale to check truncation |
+| Separate hreflang tags | Ensure each language version has the correct hreflang annotation |
+
+---
+
+## Title Tag Modifier Library
+
+### Modifiers by Purpose
+
+| Purpose | Modifiers |
+|---------|----------|
+| Freshness | 2026, Updated, Latest, New, This Month, This Week |
+| Comprehensiveness | Complete, Ultimate, Definitive, A-to-Z, Full, In-Depth |
+| Ease | Easy, Simple, Quick, Beginner, No-Code, DIY |
+| Speed | Fast, Instant, [N]-Minute, Today, Quick-Start |
+| Authority | Expert, Pro, Advanced, Research-Backed, Data-Driven |
+| Value | Free, Cheap, Budget, Affordable, Open-Source |
+| Format | Guide, Checklist, Template, Cheat Sheet, Playbook, Toolkit |
+| Exclusivity | Insider, Members-Only, Exclusive, First Look, Early Access |
+| Social Proof | Trusted, [N]+ Users, Award-Winning, #1 Rated |
+| Scarcity | Limited, Last Chance, Ending Soon, Only [N] Left |
+
+### Modifier Placement Rules
+
+| Position | When to Use | Example |
+|----------|------------|---------|
+| Front | When modifier IS the keyword ("Free", "Best") | Free SEO Audit Tool for Small Business |
+| Middle | When clarifying the topic | SEO Tools (Free) for Small Business |
+| End (parenthetical) | When adding value signal | SEO Tools for Small Business (Free) |
+| End (pipe/dash) | When adding brand | SEO Tools for Small Business \| Brand |
+
+---
+
+## Common Title Tag Mistakes
+
+| Mistake | Why It Hurts | Fix |
+|---------|-------------|-----|
+| Keyword stuffing | Google may rewrite your title | Use primary keyword once; vary phrasing |
+| Too generic ("Home", "Welcome") | Zero CTR differentiation | Write a benefit-driven title |
+| All caps | Looks spammy; may be rewritten | Use title case or sentence case |
+| Missing primary keyword | Misses ranking signal | Include target keyword naturally |
+| Duplicate titles across pages | Cannibalization; Google may pick wrong page | Unique title per page |
+| Brand name first (non-branded query) | Wastes prime keyword space | Put brand at end after pipe |
+| No emotional hook | Blends in with competitors | Add one power word or number |
+| Exceeding 60 characters | Key info truncated in SERP | Front-load; keep under 60 chars |
+| Misleading clickbait | High bounce rate hurts rankings | Title must match content delivery |
+| Ignoring SERP context | Title doesn't stand out among competitors | Check live SERP before finalizing |
+
+---
+
+## Quick-Reference: Title Formula Selection Flowchart
+
+```
+START: What type of content is this?
+|
++-- Informational (how-to, guide, tutorial)
+|   +-- Is it step-by-step? --> Use Formula #2 or #8
+|   +-- Is it a definition? --> Use Formula #6
+|   +-- Is it comprehensive? --> Use Formula #3 or #4
+|   +-- Is it for beginners? --> Use Formula #5
+|
++-- Listicle (tips, tools, examples)
+|   +-- Is it "best of"? --> Use Formula #10
+|   +-- Is it mistakes/warnings? --> Use Formula #14 or #29
+|   +-- Is it tips/strategies? --> Use Formula #9 or #12
+|   +-- Is it examples? --> Use Formula #13
+|
++-- Comparison
+|   +-- Two products/tools? --> Use Formula #17 or #18
+|   +-- Multiple alternatives? --> Use Formula #20
+|
++-- Commercial / Product
+|   +-- Product page? --> Use Formula #21
+|   +-- Pricing page? --> Use Formula #22
+|   +-- Review? --> Use Formula #23
+|   +-- Free tool? --> Use Formula #24
+|
++-- Local
+|   +-- Service page? --> Use Formula #25
+|   +-- Best-of local? --> Use Formula #26
+|   +-- Near me? --> Use Formula #27
+```
diff --git a/.cursor/skills/on-page-seo-auditor/SKILL.md b/.cursor/skills/on-page-seo-auditor/SKILL.md
new file mode 100644
index 0000000000..170bee1145
--- /dev/null
+++ b/.cursor/skills/on-page-seo-auditor/SKILL.md
@@ -0,0 +1,364 @@
+---
+name: on-page-seo-auditor
+description: 'Audit on-page SEO: titles, headers, images, links with scored report and fix priorities. 页面SEO审计/排名诊断'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when auditing a page's on-page SEO health, checking heading structure, keyword placement, image optimization, or content quality signals."
+argument-hint: "<URL> [keyword]"
+allowed-tools: WebFetch
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - on-page-audit
+    - page-optimization
+    - seo-score
+    - content-audit
+    - h1-optimization
+    - meta-audit
+    - seo-checklist
+    - yoast-alternative
+    - screaming-frog-alternative
+    - 页面SEO
+    - 网页优化
+    - ページSEO
+    - 페이지감사
+    - auditoria-seo
+  triggers:
+    # EN-formal
+    - "audit page SEO"
+    - "on-page SEO check"
+    - "SEO score"
+    - "page optimization"
+    - "on-page audit"
+    - "SEO page analysis"
+    - "content audit"
+    # EN-casual
+    - "what SEO issues does this page have"
+    - "check my page"
+    - "score my page"
+    - "why isn't this page ranking"
+    - "what's wrong with this page's SEO"
+    - "is my page optimized"
+    - "my rankings tanked"
+    - "why did my rankings drop"
+    # EN-question
+    - "why is my page not ranking"
+    - "how do I improve my page SEO"
+    - "what SEO problems does this page have"
+    # EN-competitor
+    - "Screaming Frog alternative"
+    - "Yoast SEO alternative"
+    # ZH-pro
+    - "页面SEO审计"
+    - "网页优化检查"
+    - "SEO评分"
+    - "页面诊断"
+    - "页面优化分析"
+    # ZH-casual
+    - "页面有什么问题"
+    - "为什么排不上去"
+    - "检查一下我的页面"
+    - "SEO打分"
+    - "排名上不去怎么办"
+    - "网页收录问题"
+    # JA
+    - "ページSEO監査"
+    - "オンページSEO"
+    - "ページ最適化"
+    - "SEOスコア"
+    # KO
+    - "페이지 SEO 감사"
+    - "온페이지 SEO"
+    - "SEO 점수"
+    - "이 페이지 뭐가 문제야?"
+    - "왜 순위가 안 올라가?"
+    - "SEO 점수 확인해줘"
+    # ES
+    - "auditoría SEO on-page"
+    - "análisis de página SEO"
+    - "puntuación SEO"
+    # PT
+    - "auditoria SEO on-page"
+    # Misspellings
+    - "on page SEO aduit"
+    - "SEO scroe"
+---
+
+# On-Page SEO Auditor
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This optimization skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill performs detailed on-page SEO audits to identify issues and optimization opportunities. It analyzes all on-page elements that affect search rankings and provides actionable recommendations.
+
+**System role**: Optimization layer skill. It turns weak pages, structures, and technical issues into prioritized repair work.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a diagnosis or repair plan that should feed directly into remediation work, not just a one-time opinion.
+
+- Auditing pages before or after publishing
+- Identifying why a page isn't ranking well
+- Optimizing existing content for better performance
+- Creating pre-publish SEO checklists
+- Comparing your on-page SEO to competitors
+- Systematic site-wide SEO improvements
+- Training team members on SEO best practices
+
+## What This Skill Does
+
+1. **Title Tag Analysis**: Evaluates title optimization and CTR potential
+2. **Meta Description Review**: Checks description quality and length
+3. **Header Structure Audit**: Analyzes H1-H6 hierarchy
+4. **Content Quality Assessment**: Reviews content depth and optimization
+5. **Keyword Usage Analysis**: Checks keyword placement and density
+6. **Internal Link Review**: Evaluates internal linking structure
+7. **Image Optimization Check**: Audits alt text and file optimization
+8. **Technical On-Page Review**: Checks URL, canonical, and mobile factors
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Audit a Single Page
+
+```
+Audit the on-page SEO of [URL]
+```
+
+```
+Check SEO issues on this page targeting [keyword]: [URL/content]
+```
+
+### Compare Against Competitors
+
+```
+Compare on-page SEO of [your URL] vs [competitor URL] for [keyword]
+```
+
+### Audit Content Before Publishing
+
+```
+Pre-publish SEO audit for this content targeting [keyword]: [content]
+```
+
+## Skill Contract
+
+**Expected output**: a scored diagnosis, prioritized repair plan, and a short handoff summary ready for `memory/audits/`.
+
+- **Reads**: the current page or site state, symptoms, prior audits, and current priorities from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing audit or optimization plan plus a reusable summary that can be stored under `memory/audits/`.
+- **Promotes**: blocking defects, repeated weaknesses, and fix priorities to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when the repair path is clear.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~web crawler connected:**
+Claude can automatically pull page HTML via ~~web crawler, fetch keyword search volume and difficulty from ~~SEO tool, retrieve click-through rate data from ~~search console, and download competitor pages for comparison. This enables fully automated audits with live data.
+
+**With manual data only:**
+Ask the user to provide:
+1. Page URL or complete HTML content
+2. Target primary and secondary keywords
+3. Competitor page URLs for comparison (optional)
+
+Proceed with the full audit using provided data. Note in the output which findings are from automated crawl vs. manual review.
+
+## Instructions
+
+When a user requests an on-page SEO audit:
+
+1. **Gather Page Information**
+
+   ```markdown
+   ### Audit Setup
+   
+   **Page URL**: [URL]
+   **Target Keyword**: [primary keyword]
+   **Secondary Keywords**: [additional keywords]
+   **Page Type**: [blog/product/landing/service]
+   **Business Goal**: [traffic/conversions/authority]
+   ```
+
+2. **Audit Title Tag**
+
+   ```markdown
+   ## Title Tag Analysis
+   
+   **Current Title**: [title]
+   **Character Count**: [X] characters
+   
+   | Criterion | Status | Notes |
+   |-----------|--------|-------|
+   | Length (50-60 chars) | ✅/⚠️/❌ | [notes] |
+   | Keyword included | ✅/⚠️/❌ | Position: [front/middle/end] |
+   | Keyword at front | ✅/⚠️/❌ | [notes] |
+   | Unique across site | ✅/⚠️/❌ | [notes] |
+   | Compelling/clickable | ✅/⚠️/❌ | [notes] |
+   | Matches intent | ✅/⚠️/❌ | [notes] |
+   
+   **Title Score**: [X]/10
+   
+   **Issues Found**:
+   - [Issue 1]
+   - [Issue 2]
+   
+   **Recommended Title**:
+   "[Optimized title suggestion]"
+   
+   **Why**: [Explanation of improvements]
+   ```
+
+3. **Audit Meta Description**
+
+   ```markdown
+   ## Meta Description Analysis
+   
+   **Current Description**: [description]
+   **Character Count**: [X] characters
+   
+   | Criterion | Status | Notes |
+   |-----------|--------|-------|
+   | Length (150-160 chars) | ✅/⚠️/❌ | [notes] |
+   | Keyword included | ✅/⚠️/❌ | [notes] |
+   | Call-to-action present | ✅/⚠️/❌ | [notes] |
+   | Unique across site | ✅/⚠️/❌ | [notes] |
+   | Accurately describes page | ✅/⚠️/❌ | [notes] |
+   | Compelling copy | ✅/⚠️/❌ | [notes] |
+   
+   **Description Score**: [X]/10
+   
+   **Issues Found**:
+   - [Issue 1]
+   
+   **Recommended Description**:
+   "[Optimized description suggestion]" ([X] chars)
+   ```
+
+4. **Audit Header Structure**
+
+   ```markdown
+   ## Header Structure Analysis
+   
+   ### Current Header Hierarchy
+   
+   ```
+   H1: [H1 text]
+     H2: [H2 text]
+       H3: [H3 text]
+       H3: [H3 text]
+     H2: [H2 text]
+       H3: [H3 text]
+     H2: [H2 text]
+   ```
+   
+   | Criterion | Status | Notes |
+   |-----------|--------|-------|
+   | Single H1 | ✅/⚠️/❌ | Found: [X] H1s |
+   | H1 includes keyword | ✅/⚠️/❌ | [notes] |
+   | Logical hierarchy | ✅/⚠️/❌ | [notes] |
+   | H2s include keywords | ✅/⚠️/❌ | [X]/[Y] contain keywords |
+   | No skipped levels | ✅/⚠️/❌ | [notes] |
+   | Descriptive headers | ✅/⚠️/❌ | [notes] |
+   
+   **Header Score**: [X]/10
+   
+   **Issues Found**:
+   - [Issue 1]
+   - [Issue 2]
+   
+   **Recommended Changes**:
+   - H1: [suggestion]
+   - H2s: [suggestions]
+   ```
+
+5. **Audit Content Quality** — Word count, reading level, comprehensiveness, formatting, E-E-A-T signals, content elements checklist, gap identification
+
+   > **Reference**: See [references/audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) for the content quality template (Step 5).
+
+6. **Audit Keyword Usage** — Primary/secondary keyword placement across all page elements, LSI/related terms, density analysis
+
+   > **Reference**: See [references/audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) for the keyword optimization template (Step 6).
+
+7. **Audit Internal Links** — Link count, anchor text relevance, broken links, recommended additions
+
+   > **Reference**: See [references/audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) for the internal linking template (Step 7).
+
+8. **Audit Images** — Alt text, file names, sizes, formats, lazy loading
+
+   > **Reference**: See [references/audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) for the image optimization template (Step 8).
+
+9. **Audit Technical On-Page Elements** — URL, canonical, mobile, speed, HTTPS, schema
+
+   > **Reference**: See [references/audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) for the technical on-page template (Step 9).
+
+10. **CORE-EEAT Content Quality Quick Scan** — 17 on-page-relevant items from the 80-item CORE-EEAT benchmark
+
+    > **Reference**: See [references/audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) for the CORE-EEAT quick scan template (Step 10). Full benchmark: [CORE-EEAT Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md).
+
+11. **Generate Audit Summary** — Overall score with visual breakdown, priority issues (critical/important/minor), quick wins, detailed recommendations, competitor comparison, action checklist, expected results
+
+    > **Reference**: See [references/audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) for the full audit summary template (Step 11).
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Target keyword(s) clearly specified by user
+- [ ] Page content accessible (either via URL or provided HTML)
+- [ ] If competitor comparison requested, competitor URL provided
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] Scores based on measurable criteria, not subjective opinion
+- [ ] All suggested changes include specific locations (title tag, H2 #3, paragraph 5, etc.)
+- [ ] Source of each data point clearly stated (~~SEO tool data, user-provided, ~~web crawler, or manual review)
+
+## Example
+
+> **Reference**: See [references/audit-example.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-example.md) for a full worked example (noise-cancelling headphones audit) and page-type checklists (blog post, product page, landing page).
+
+## Tips for Success
+
+1. **Prioritize issues by impact** - Fix critical issues first
+2. **Compare to competitors** - See what's working for top rankings
+3. **Balance optimization and readability** - Don't over-optimize
+4. **Audit regularly** - Content degrades over time
+5. **Test changes** - Track ranking changes after updates
+
+> **Scoring details**: For the complete weight distribution, scoring scale, issue resolution playbook, and industry benchmarks, see [references/scoring-rubric.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/scoring-rubric.md).
+
+
+### Save Results
+
+After delivering audit or optimization findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/audits/on-page-seo-auditor/YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+## Reference Materials
+
+- [Scoring Rubric](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/scoring-rubric.md) — Detailed scoring criteria, weight distribution, and grade boundaries for on-page audits
+- [Audit Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-templates.md) — Detailed output templates for steps 5-11 (content quality, keywords, links, images, technical, CORE-EEAT scan, audit summary)
+- [Audit Example & Checklists](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/references/audit-example.md) — Full worked example and page-type checklists (blog, product, landing page)
+
+## Next Best Skill
+
+- **Primary**: [content-refresher](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/content-refresher/SKILL.md) — turn page-level findings into concrete edits.
diff --git a/.cursor/skills/on-page-seo-auditor/references/audit-example.md b/.cursor/skills/on-page-seo-auditor/references/audit-example.md
new file mode 100644
index 0000000000..60b2b59882
--- /dev/null
+++ b/.cursor/skills/on-page-seo-auditor/references/audit-example.md
@@ -0,0 +1,109 @@
+# On-Page SEO Auditor — Worked Example & Page Type Checklists
+
+Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/SKILL.md).
+
+---
+
+## Worked Example
+
+**User**: "Audit the on-page SEO of https://example.com/best-noise-cancelling-headphones targeting 'best noise cancelling headphones'"
+
+**Output**:
+
+```markdown
+# On-Page SEO Audit Report
+
+**Page**: https://example.com/best-noise-cancelling-headphones
+**Target Keyword**: best noise cancelling headphones
+**Secondary Keywords**: wireless noise cancelling headphones, ANC headphones, noise cancelling headphones review
+**Page Type**: commercial (reviews/roundup)
+**Audit Date**: 2025-01-15
+
+## Summary
+
+| Audit Area | Score | Key Finding |
+|------------|-------|-------------|
+| Title Tag | 8/10 | Good keyword placement; slightly long at 63 chars |
+| Meta Description | 6/10 | Missing CTA; keyword present but generic copy |
+| Header Structure | 9/10 | Clean hierarchy; H2s cover all major products |
+| Content Quality | 7/10 | 2,400 words is solid; lacks original test data |
+| Keyword Optimization | 8/10 | Strong placement; density at 1.2% is healthy |
+| Internal Links | 5/10 | Only 2 internal links; missing links to brand pages |
+| Images | 6/10 | 3/8 images missing alt text; no WebP format |
+| Technical Elements | 7/10 | Missing Product schema; good URL and mobile |
+
+## Overall Score: 71/100
+
+Calculation: (8x0.15 + 6x0.05 + 9x0.10 + 7x0.25 + 8x0.15 + 5x0.10 + 6x0.10 + 7x0.10) x 10 = 71
+
+Score Breakdown:
+████████░░ Title Tag:        8/10  (15%)
+██████░░░░ Meta Description: 6/10  ( 5%)
+█████████░ Headers:          9/10  (10%)
+███████░░░ Content:          7/10  (25%)
+████████░░ Keywords:         8/10  (15%)
+█████░░░░░ Internal Links:   5/10  (10%)
+██████░░░░ Images:           6/10  (10%)
+███████░░░ Technical:        7/10  (10%)
+
+## Priority Issues
+
+### Critical
+1. **Internal linking severely underdeveloped** — Only 2 internal links found. Add links to individual headphone review pages (/sony-wh1000xm5-review, /bose-qc-ultra-review) and the headphones category page. Target 5-8 contextual internal links.
+2. **3 product images missing alt text** — Images for Sony WH-1000XM5, Bose QC Ultra, and Apple AirPods Max have empty alt attributes. Each missing alt tag is a lost ranking signal in Google Images.
+
+### Important
+1. **Meta description lacks call-to-action** — Current description states facts but does not compel clicks. Add "Compare prices and features" or "See our top picks" to drive CTR.
+
+## Quick Wins
+
+1. **Add alt text to 3 images** (5 min) — Use descriptive text like "Sony WH-1000XM5 noise cancelling headphones on desk" instead of empty attributes.
+2. **Rewrite meta description with CTA** (5 min) — Change to: "Compare the 10 best noise cancelling headphones for 2025. Expert-tested picks from Sony, Bose, and Apple with pros, cons, and pricing. See our top picks."
+3. **Add 4+ internal links** (10 min) — Link product names to their individual review pages and add a "See all headphones" link to the category hub.
+```
+
+---
+
+## Audit Checklists by Page Type
+
+### Blog Post Checklist
+
+```markdown
+- [ ] Title includes keyword and is compelling
+- [ ] Meta description has keyword and CTA
+- [ ] Single H1 with keyword
+- [ ] H2s cover main topics
+- [ ] Keyword in first 100 words
+- [ ] 1,500+ words for competitive topics
+- [ ] 3+ internal links with varied anchors
+- [ ] Images with descriptive alt text
+- [ ] FAQ section with schema
+- [ ] Author bio with credentials
+```
+
+### Product Page Checklist
+
+```markdown
+- [ ] Product name in title
+- [ ] Price and availability in description
+- [ ] H1 is product name
+- [ ] Product features in H2s
+- [ ] Multiple product images with alt text
+- [ ] Customer reviews visible
+- [ ] Product schema implemented
+- [ ] Related products linked
+- [ ] Clear CTA button
+```
+
+### Landing Page Checklist
+
+```markdown
+- [ ] Keyword-optimized title
+- [ ] Benefit-focused meta description
+- [ ] Clear H1 value proposition
+- [ ] Supporting H2 sections
+- [ ] Trust signals (testimonials, logos)
+- [ ] Single clear CTA
+- [ ] Fast page load speed
+- [ ] Mobile-optimized layout
+```
diff --git a/.cursor/skills/on-page-seo-auditor/references/audit-templates.md b/.cursor/skills/on-page-seo-auditor/references/audit-templates.md
new file mode 100644
index 0000000000..660944145d
--- /dev/null
+++ b/.cursor/skills/on-page-seo-auditor/references/audit-templates.md
@@ -0,0 +1,315 @@
+# On-Page SEO Auditor — Output Templates
+
+Detailed output templates for on-page-seo-auditor steps 5-11. Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/SKILL.md).
+
+---
+
+## Step 5: Audit Content Quality
+
+```markdown
+## Content Quality Analysis
+
+**Word Count**: [X] words
+**Reading Level**: [Grade level]
+**Estimated Read Time**: [X] minutes
+
+| Criterion | Status | Notes |
+|-----------|--------|-------|
+| Sufficient length | ✅/⚠️/❌ | [comparison to ranking content] |
+| Comprehensive coverage | ✅/⚠️/❌ | [notes] |
+| Unique value/insights | ✅/⚠️/❌ | [notes] |
+| Up-to-date information | ✅/⚠️/❌ | [notes] |
+| Proper formatting | ✅/⚠️/❌ | [notes] |
+| Readability | ✅/⚠️/❌ | [notes] |
+| E-E-A-T signals | ✅/⚠️/❌ | [notes] |
+
+**Content Elements Present**:
+- [ ] Introduction with keyword
+- [ ] Clear sections/structure
+- [ ] Bullet points/lists
+- [ ] Tables where appropriate
+- [ ] Images/visuals
+- [ ] Examples/case studies
+- [ ] Statistics with sources
+- [ ] Expert quotes
+- [ ] FAQ section
+- [ ] Conclusion with CTA
+
+**Content Score**: [X]/10
+
+**Gaps Identified**:
+- [Missing topic/section 1]
+- [Missing topic/section 2]
+
+**Recommendations**:
+1. [Specific improvement]
+2. [Specific improvement]
+```
+
+---
+
+## Step 6: Audit Keyword Usage
+
+```markdown
+## Keyword Optimization Analysis
+
+**Primary Keyword**: "[keyword]"
+**Keyword Density**: [X]%
+
+### Keyword Placement
+
+| Location | Present | Notes |
+|----------|---------|-------|
+| Title tag | ✅/❌ | Position: [X] |
+| Meta description | ✅/❌ | [notes] |
+| H1 | ✅/❌ | [notes] |
+| First 100 words | ✅/❌ | Word position: [X] |
+| H2 headings | ✅/❌ | In [X]/[Y] H2s |
+| Body content | ✅/❌ | [X] occurrences |
+| URL slug | ✅/❌ | [notes] |
+| Image alt text | ✅/❌ | In [X]/[Y] images |
+| Conclusion | ✅/❌ | [notes] |
+
+### Secondary Keywords
+
+| Keyword | Occurrences | Status |
+|---------|-------------|--------|
+| [keyword 1] | [X] | ✅/⚠️/❌ |
+| [keyword 2] | [X] | ✅/⚠️/❌ |
+
+### LSI/Related Terms
+
+**Present**: [list of related terms found]
+**Missing**: [important related terms not found]
+
+**Keyword Score**: [X]/10
+
+**Issues**:
+- [Issue 1]
+
+**Recommendations**:
+- [Suggestion 1]
+```
+
+---
+
+## Step 7: Audit Internal Links
+
+```markdown
+## Internal Linking Analysis
+
+**Total Internal Links**: [X]
+**Unique Internal Links**: [X]
+
+| Criterion | Status | Notes |
+|-----------|--------|-------|
+| Number of internal links | ✅/⚠️/❌ | [X] (recommend 3-5+) |
+| Relevant anchor text | ✅/⚠️/❌ | [notes] |
+| Links to related content | ✅/⚠️/❌ | [notes] |
+| Links to important pages | ✅/⚠️/❌ | [notes] |
+| No broken links | ✅/⚠️/❌ | [X] broken found |
+| Natural placement | ✅/⚠️/❌ | [notes] |
+
+**Current Internal Links**:
+1. "[Anchor text]" → [URL]
+2. "[Anchor text]" → [URL]
+3. "[Anchor text]" → [URL]
+
+**Internal Linking Score**: [X]/10
+
+**Recommended Additional Links**:
+1. Add link to "[Related page]" with anchor "[suggested anchor]"
+2. Add link to "[Related page]" with anchor "[suggested anchor]"
+
+**Anchor Text Improvements**:
+- Change "[current anchor]" to "[improved anchor]"
+```
+
+---
+
+## Step 8: Audit Images
+
+```markdown
+## Image Optimization Analysis
+
+**Total Images**: [X]
+
+### Image Audit Table
+
+| Image | Alt Text | File Name | Size | Status |
+|-------|----------|-----------|------|--------|
+| [img1] | [alt or "missing"] | [filename] | [KB] | ✅/⚠️/❌ |
+| [img2] | [alt or "missing"] | [filename] | [KB] | ✅/⚠️/❌ |
+
+| Criterion | Status | Notes |
+|-----------|--------|-------|
+| All images have alt text | ✅/⚠️/❌ | [X]/[Y] have alt |
+| Alt text includes keywords | ✅/⚠️/❌ | [notes] |
+| Descriptive file names | ✅/⚠️/❌ | [notes] |
+| Appropriate file sizes | ✅/⚠️/❌ | [notes] |
+| Modern formats (WebP) | ✅/⚠️/❌ | [notes] |
+| Lazy loading enabled | ✅/⚠️/❌ | [notes] |
+
+**Image Score**: [X]/10
+
+**Recommendations**:
+1. Add alt text to image [X]: "[suggested alt text]"
+2. Compress image [Y]: Currently [X]KB, should be under [Y]KB
+3. Rename [filename] to [better-filename]
+```
+
+---
+
+## Step 9: Audit Technical On-Page Elements
+
+```markdown
+## Technical On-Page Analysis
+
+| Element | Current Value | Status | Recommendation |
+|---------|---------------|--------|----------------|
+| URL | [URL] | ✅/⚠️/❌ | [notes] |
+| URL length | [X] chars | ✅/⚠️/❌ | [notes] |
+| URL keywords | [present/absent] | ✅/⚠️/❌ | [notes] |
+| Canonical tag | [URL or "missing"] | ✅/⚠️/❌ | [notes] |
+| Mobile-friendly | [yes/no] | ✅/⚠️/❌ | [notes] |
+| Page speed | [X]s | ✅/⚠️/❌ | [notes] |
+| HTTPS | [yes/no] | ✅/⚠️/❌ | [notes] |
+| Schema markup | [types or "none"] | ✅/⚠️/❌ | [notes] |
+
+**Technical Score**: [X]/10
+```
+
+---
+
+## Step 10: CORE-EEAT Content Quality Quick Scan
+
+Run a quick scan of on-page-relevant CORE-EEAT items. Reference: [CORE-EEAT Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md)
+
+```markdown
+## CORE-EEAT Quick Scan
+
+Content-relevant items from the 80-item benchmark:
+
+| ID | Check Item | Status | Notes |
+|----|-----------|--------|-------|
+| C01 | Intent Alignment | ✅/⚠️/❌ | Title promise = content delivery |
+| C02 | Direct Answer | ✅/⚠️/❌ | Core answer in first 150 words |
+| C09 | FAQ Coverage | ✅/⚠️/❌ | Structured FAQ present |
+| C10 | Semantic Closure | ✅/⚠️/❌ | Conclusion answers opening |
+| O01 | Heading Hierarchy | ✅/⚠️/❌ | H1→H2→H3, no skipping |
+| O02 | Summary Box | ✅/⚠️/❌ | TL;DR or Key Takeaways |
+| O03 | Data Tables | ✅/⚠️/❌ | Comparisons in tables |
+| O05 | Schema Markup | ✅/⚠️/❌ | Appropriate JSON-LD |
+| O06 | Section Chunking | ✅/⚠️/❌ | Single topic per section |
+| R01 | Data Precision | ✅/⚠️/❌ | ≥5 precise numbers |
+| R02 | Citation Density | ✅/⚠️/❌ | ≥1 per 500 words |
+| R06 | Timestamp | ✅/⚠️/❌ | Updated <1 year |
+| R08 | Internal Link Graph | ✅/⚠️/❌ | Descriptive anchors |
+| R10 | Content Consistency | ✅/⚠️/❌ | No contradictions |
+| Exp01 | First-Person Narrative | ✅/⚠️/❌ | "I tested" or "We found" |
+| Ept01 | Author Identity | ✅/⚠️/❌ | Byline + bio present |
+| T04 | Disclosure Statements | ✅/⚠️/❌ | Affiliate links disclosed |
+
+**CORE-EEAT Quick Score**: [X]/17 items passing
+
+> For a complete 80-item audit with weighted scoring, use `content-quality-auditor`.
+```
+
+---
+
+## Step 11: Generate Audit Summary
+
+```markdown
+# On-Page SEO Audit Report
+
+**Page**: [URL]
+**Target Keyword**: [keyword]
+**Audit Date**: [date]
+
+## Overall Score: [X]/100
+
+```
+Score Breakdown:
+████████░░ Title Tag: 8/10
+██████░░░░ Meta Description: 6/10
+█████████░ Headers: 9/10
+███████░░░ Content: 7/10
+██████░░░░ Keywords: 6/10
+█████░░░░░ Internal Links: 5/10
+████░░░░░░ Images: 4/10
+████████░░ Technical: 8/10
+```
+
+## Priority Issues
+
+### 🔴 Critical (Fix Immediately)
+1. [Critical issue 1]
+2. [Critical issue 2]
+
+### 🟡 Important (Fix Soon)
+1. [Important issue 1]
+2. [Important issue 2]
+
+### 🟢 Minor (Nice to Have)
+1. [Minor issue 1]
+2. [Minor issue 2]
+
+## Quick Wins
+
+These changes will have immediate impact:
+
+1. **[Change 1]**: [Why and how]
+2. **[Change 2]**: [Why and how]
+3. **[Change 3]**: [Why and how]
+
+## Detailed Recommendations
+
+### Title Tag
+- **Current**: [current title]
+- **Recommended**: [new title]
+- **Impact**: [expected improvement]
+
+### Meta Description
+- **Current**: [current description]
+- **Recommended**: [new description]
+- **Impact**: [expected improvement]
+
+### Content Improvements
+1. [Specific content change with location]
+2. [Specific content change with location]
+
+### Internal Linking
+1. Add link: "[anchor]" → [destination]
+2. Add link: "[anchor]" → [destination]
+
+### Image Optimization
+1. [Image 1]: [change needed]
+2. [Image 2]: [change needed]
+
+## Competitor Comparison
+
+| Element | Your Page | Top Competitor | Gap |
+|---------|-----------|----------------|-----|
+| Word count | [X] | [Y] | [+/-Z] |
+| Internal links | [X] | [Y] | [+/-Z] |
+| Images | [X] | [Y] | [+/-Z] |
+| H2 headings | [X] | [Y] | [+/-Z] |
+
+## Action Checklist
+
+- [ ] Update title tag
+- [ ] Rewrite meta description
+- [ ] Add keyword to H1
+- [ ] Add [X] more internal links
+- [ ] Add alt text to [X] images
+- [ ] Add [X] more content sections
+- [ ] Implement FAQ schema
+- [ ] [Additional action items]
+
+## Expected Results
+
+After implementing these changes:
+- Estimated ranking improvement: [X] positions
+- Estimated CTR improvement: [X]%
+- Estimated traffic increase: [X]%
+```
diff --git a/.cursor/skills/on-page-seo-auditor/references/scoring-rubric.md b/.cursor/skills/on-page-seo-auditor/references/scoring-rubric.md
new file mode 100644
index 0000000000..037f2a2106
--- /dev/null
+++ b/.cursor/skills/on-page-seo-auditor/references/scoring-rubric.md
@@ -0,0 +1,496 @@
+# On-Page SEO Scoring Rubric
+
+Detailed scoring criteria for each of the 8 audit sections. Use this rubric to ensure consistent, objective scoring across audits.
+
+## How to Use This Rubric
+
+1. Score each section independently using the criteria below
+2. Apply the section weight to calculate the weighted score
+3. Sum all weighted scores for the overall page score (out of 100)
+4. Use calibration examples to verify your scoring is consistent
+
+## Section 1: Title Tag (Weight: 15%, Max: 15 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| Keyword presence | 3 | Primary keyword appears in title |
+| Keyword position | 2 | Primary keyword in first half of title |
+| Length optimization | 2 | Between 50-60 characters |
+| Uniqueness | 2 | Title is unique across the site |
+| Compelling copy | 2 | Includes benefit, modifier, or hook |
+| Intent match | 2 | Title matches search intent accurately |
+| Brand inclusion | 1 | Brand name present (at end) |
+| No truncation risk | 1 | Displays fully in SERP without cutoff |
+
+### Calibration Examples
+
+**Score 15/15 (Excellent)**:
+- Title: "Keyword Research: 7 Proven Methods to Find Low-Competition Keywords | Brand"
+- 58 characters, keyword at front, benefit-driven, unique, brand at end
+
+**Score 11/15 (Good)**:
+- Title: "The Complete Guide to Keyword Research for Beginners"
+- 52 characters, keyword present but not at front, no brand, still compelling
+
+**Score 7/15 (Needs Work)**:
+- Title: "Keyword Research"
+- Too short (16 chars), no benefit, no brand, not compelling, generic
+
+**Score 3/15 (Poor)**:
+- Title: "Blog Post #47 - Untitled"
+- No keyword, no benefit, not descriptive, not unique
+
+**Score 0/15 (Missing)**:
+- No title tag present, or title tag is empty
+
+## Section 2: Meta Description (Weight: 5%, Max: 5 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| Keyword inclusion | 1 | Primary keyword appears naturally |
+| Length optimization | 1 | Between 150-160 characters |
+| Call-to-action | 1 | Contains explicit or implicit CTA |
+| Unique description | 1 | Not duplicated from other pages |
+| Accurate summary | 1 | Accurately describes page content |
+
+### Calibration Examples
+
+**Score 5/5 (Excellent)**:
+- Description: "Learn 7 proven keyword research methods that top SEOs use to find low-competition opportunities. Includes free templates, real examples, and step-by-step walkthroughs. Get started today."
+- 155 chars, keyword present, CTA ("Get started"), accurate, unique
+
+**Score 3/5 (Needs Work)**:
+- Description: "This page talks about keyword research and some tips you might find useful for SEO."
+- 83 chars (too short), weak copy, no CTA, vague
+
+**Score 0/5 (Missing)**:
+- No meta description set; search engine generates snippet automatically
+
+## Section 3: Header Structure (Weight: 10%, Max: 10 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| Single H1 present | 2 | Exactly one H1 on the page |
+| H1 contains keyword | 2 | Primary keyword in H1 text |
+| Logical hierarchy | 2 | No skipped levels (H1→H2→H3, not H1→H3) |
+| H2s cover key subtopics | 2 | H2s address main topic facets |
+| Descriptive headers | 1 | Headers describe section content clearly |
+| Keyword variations in H2s | 1 | Secondary keywords or LSI terms in subheadings |
+
+### Calibration Examples
+
+**Score 10/10 (Excellent)**:
+```
+H1: How to Do Keyword Research: A Step-by-Step Guide
+  H2: What Is Keyword Research and Why Does It Matter?
+  H2: 7 Keyword Research Methods That Work
+    H3: Method 1: Seed Keyword Brainstorming
+    H3: Method 2: Competitor Keyword Analysis
+    H3: Method 3: Google Autocomplete Mining
+  H2: Free Keyword Research Tools Compared
+  H2: How to Prioritize Keywords by Difficulty
+  H2: Keyword Research FAQ
+```
+- Single H1 with keyword, logical hierarchy, descriptive H2s with variations
+
+**Score 6/10 (Needs Work)**:
+```
+H1: Keyword Research
+H2: Introduction
+H2: Methods
+H2: Tools
+H2: Conclusion
+```
+- Single H1 but generic, H2s are vague, no keyword variations
+
+**Score 2/10 (Poor)**:
+```
+H1: Welcome to Our Blog
+H1: Keyword Research Guide
+H3: Some Tips
+H2: More Information
+```
+- Multiple H1s, skipped levels, generic, no keyword strategy
+
+## Section 4: Content Quality (Weight: 25%, Max: 25 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| Sufficient length | 4 | Meets minimum for query type (see benchmarks) |
+| Comprehensive coverage | 4 | Covers all major subtopics that top-ranking pages cover |
+| Unique value | 4 | Original insights, data, or perspective not found elsewhere |
+| Up-to-date information | 3 | Statistics, dates, and references are current |
+| Proper formatting | 3 | Uses lists, tables, bold, images to improve readability |
+| Readability | 3 | Appropriate reading level for target audience |
+| E-E-A-T signals | 4 | Author byline, credentials, first-person experience, cited sources |
+
+### Content Length Benchmarks (for "Sufficient length" criterion)
+
+| Query Type | Minimum for 4/4 | Minimum for 3/4 | Minimum for 2/4 | Below 1/4 |
+|-----------|-----------------|-----------------|-----------------|-----------|
+| Informational | 1,500+ words | 1,000-1,499 | 500-999 | <500 |
+| Commercial | 1,200+ words | 800-1,199 | 400-799 | <400 |
+| Transactional | 500+ words | 350-499 | 200-349 | <200 |
+| Local | 400+ words | 250-399 | 150-249 | <150 |
+
+### Calibration Examples
+
+**Score 25/25 (Excellent)**:
+- 2,400-word guide on keyword research
+- Covers all subtopics competitors cover plus unique methods
+- Contains original data from author's experiments
+- All statistics from the last 12 months
+- Well-formatted with tables, lists, images, code examples
+- Written at appropriate level for target audience
+- Author byline with SEO credentials, first-person "I tested" language
+
+**Score 16/25 (Good)**:
+- 1,600-word guide, covers main subtopics
+- Some unique insights but mostly curated information
+- Most statistics current, a few outdated
+- Good formatting but could use more visual elements
+- Author byline present but limited credentials
+
+**Score 8/25 (Needs Work)**:
+- 700-word article on an informational query (too thin)
+- Missing several subtopics competitors cover
+- No original data or insights
+- Several outdated statistics
+- Wall of text with minimal formatting
+- No author byline or E-E-A-T signals
+
+## Section 5: Keyword Optimization (Weight: 15%, Max: 15 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| Keyword in title | 2 | Primary keyword in title tag |
+| Keyword in H1 | 2 | Primary keyword in H1 |
+| Keyword in first 100 words | 2 | Primary keyword appears early in content |
+| Keyword density (0.5-2.5%) | 2 | Natural density, not stuffed, not too sparse |
+| Secondary keywords present | 2 | 2-3 secondary/related keywords used |
+| LSI/semantic terms | 2 | Related terms and synonyms present |
+| Keyword in URL | 1 | Primary keyword in URL slug |
+| Keyword in image alt text | 1 | At least one image alt contains keyword naturally |
+| Keyword in meta description | 1 | Primary keyword in meta description |
+
+### Keyword Density Guidelines
+
+| Density | Score Impact | Assessment |
+|---------|-------------|-----------|
+| 0.5-1.0% | Full points | Natural, well-integrated |
+| 1.0-2.0% | Full points | Acceptable, slightly keyword-focused |
+| 2.0-2.5% | -1 point | Borderline, review for naturalness |
+| 2.5-3.0% | -2 points | Over-optimized, needs reduction |
+| >3.0% | 0 points for density | Keyword stuffing, immediate fix needed |
+| <0.5% | -1 point | Under-optimized, add more natural usage |
+
+### Calibration Examples
+
+**Score 15/15 (Excellent)**:
+- "keyword research" in title (position 1), H1, first sentence, URL slug, 1 image alt
+- Density: 1.2% across 2,000 words (24 occurrences, well-distributed)
+- Secondary keywords: "keyword tool", "search volume", "keyword difficulty" all present
+- LSI terms: "SEO", "search terms", "Google", "organic traffic", "SERP" present
+
+**Score 9/15 (Needs Work)**:
+- Keyword in title and H1, but not in first 100 words
+- Density: 0.3% (too sparse for a 2,000-word article)
+- Missing secondary keywords
+- No LSI/semantic terms beyond the primary keyword
+
+## Section 6: Internal/External Links (Weight: 10%, Max: 10 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| Internal link count | 2 | 3-5+ contextual internal links (per 1,000 words) |
+| Internal link relevance | 2 | Links point to topically related pages |
+| Descriptive anchor text | 2 | Anchors describe destination, not "click here" |
+| External link quality | 2 | Links to authoritative, relevant external sources |
+| No broken links | 1 | All links return 200 status |
+| Link placement | 1 | Links placed naturally within content flow |
+
+### Internal Link Count Guidelines
+
+| Content Length | Minimum Links | Ideal Range | Too Many |
+|---------------|--------------|-------------|---------|
+| <500 words | 2 | 2-4 | >8 |
+| 500-1,000 words | 3 | 3-6 | >12 |
+| 1,000-2,000 words | 4 | 5-10 | >20 |
+| 2,000+ words | 5 | 8-15 | >25 |
+
+### Calibration Examples
+
+**Score 10/10 (Excellent)**:
+- 8 internal links in a 2,000-word article
+- All link to relevant related content
+- Anchor text like "learn about keyword difficulty scoring" (descriptive)
+- 3 external links to authoritative sources (Google documentation, industry studies)
+- Zero broken links
+
+**Score 5/10 (Needs Work)**:
+- 2 internal links in a 2,000-word article (too few)
+- One link uses "click here" anchor text
+- No external links to sources
+- One broken link found
+
+## Section 7: Image Optimization (Weight: 10%, Max: 10 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| All images have alt text | 3 | Every image has descriptive alt attribute |
+| Alt text includes keywords | 2 | At least 1 image alt naturally includes primary keyword |
+| Descriptive file names | 1 | Files named descriptively (keyword-research-tool.webp, not IMG_4532.jpg) |
+| Optimized file sizes | 2 | Images compressed appropriately (<200KB for photos, <50KB for graphics) |
+| Modern formats | 1 | Uses WebP or AVIF where supported |
+| Lazy loading | 1 | Below-fold images use loading="lazy" |
+
+### Image Size Guidelines
+
+| Image Type | Target Size | Format | Notes |
+|-----------|------------|--------|-------|
+| Hero/banner | <200KB | WebP | Resize to actual display dimensions |
+| Content photos | <150KB | WebP | Compress at 80% quality |
+| Screenshots | <100KB | WebP/PNG | Use PNG only if text clarity critical |
+| Icons/graphics | <30KB | SVG/WebP | SVG preferred for vector graphics |
+| Thumbnails | <50KB | WebP | Generate at actual thumbnail dimensions |
+
+### Calibration Examples
+
+**Score 10/10 (Excellent)**:
+- 5 images, all with descriptive alt text
+- Alt text: "screenshot of Ahrefs keyword research tool showing difficulty score"
+- Files named: keyword-research-tool-ahrefs.webp
+- All images <150KB, WebP format, lazy loading on below-fold images
+
+**Score 4/10 (Needs Work)**:
+- 3 images, 1 missing alt text
+- Alt text on others: "image1", "screenshot" (not descriptive)
+- Files named: IMG_3421.jpg, photo.png
+- One image is 1.2MB uncompressed JPEG
+
+## Section 8: Page-Level Technical (Weight: 10%, Max: 10 points)
+
+### Scoring Criteria
+
+| Criterion | Points | Requirement |
+|-----------|--------|-------------|
+| Clean URL structure | 2 | Short, descriptive, keyword-containing URL |
+| Correct canonical tag | 2 | Self-referencing canonical or appropriate cross-domain canonical |
+| Mobile-friendly | 2 | Passes mobile-friendly test, responsive layout |
+| Page speed (LCP) | 2 | LCP ≤2.5s on mobile |
+| HTTPS | 1 | Page served over HTTPS with valid certificate |
+| Schema markup | 1 | Appropriate schema type implemented (Article, FAQ, HowTo, etc.) |
+
+### URL Quality Guidelines
+
+| Aspect | Good | Bad |
+|--------|------|-----|
+| Length | /blog/keyword-research-guide | /blog/2024/01/15/the-complete-ultimate-guide-to-keyword-research-for-beginners-and-experts |
+| Keywords | /services/seo-audit | /services/page-id-4532 |
+| Characters | lowercase, hyphens | UPPERCASE, underscores, special chars |
+| Parameters | /products/shoes (clean) | /products?id=432&color=red&size=9 |
+
+### Calibration Examples
+
+**Score 10/10 (Excellent)**:
+- URL: /blog/keyword-research-guide (clean, keyword-present)
+- Self-referencing canonical tag present
+- Fully responsive, passes mobile-friendly test
+- LCP: 1.8s on mobile
+- HTTPS with valid certificate
+- Article + FAQ schema implemented
+
+**Score 5/10 (Needs Work)**:
+- URL: /blog/?p=4532 (not descriptive)
+- No canonical tag
+- Mobile-friendly but some tap targets too small
+- LCP: 3.2s (needs improvement)
+- HTTPS present
+- No schema markup
+
+## Overall Score Calculation
+
+### Formula
+
+```
+Overall Score = Sum of (Section Score / Section Max * Section Weight * 100)
+```
+
+### Example Calculation
+
+| Section | Raw Score | Max | Weight | Weighted Score |
+|---------|-----------|-----|--------|---------------|
+| Title Tag | 12 | 15 | 15% | 12.0 |
+| Meta Description | 4 | 5 | 5% | 4.0 |
+| Header Structure | 8 | 10 | 10% | 8.0 |
+| Content Quality | 20 | 25 | 25% | 20.0 |
+| Keyword Optimization | 11 | 15 | 15% | 11.0 |
+| Internal/External Links | 7 | 10 | 10% | 7.0 |
+| Image Optimization | 6 | 10 | 10% | 6.0 |
+| Page-Level Technical | 8 | 10 | 10% | 8.0 |
+| **Total** | | | **100%** | **76.0/100** |
+
+### Overall Score Interpretation
+
+| Score Range | Grade | Assessment |
+|------------|-------|-----------|
+| 90-100 | A+ | Exceptional — minor tweaks only |
+| 80-89 | A | Strong — a few optimization opportunities |
+| 70-79 | B | Good — several areas need attention |
+| 60-69 | C | Average — significant improvements needed |
+| 50-59 | D | Below average — major issues present |
+| <50 | F | Poor — comprehensive overhaul required |
+
+## Calibration Guidance
+
+### Avoiding Common Scoring Errors
+
+1. **Halo effect**: A great title does not mean great content. Score each section independently.
+2. **Recency bias**: Do not inflate scores for recently published content. Freshness is one factor among many.
+3. **Tool dependency**: If you cannot verify a metric (e.g., page speed without tool access), note it as "unverified" rather than guessing.
+4. **Competitor anchoring**: Score against the rubric criteria, not relative to competitors. Competitor comparison is a separate analysis step.
+5. **Intent mismatch**: A page with perfect technical SEO but wrong intent alignment should still score low on Content Quality.
+
+### When to Adjust Weights
+
+The default weights suit most content pages. Consider adjusting for:
+
+| Page Type | Increase Weight | Decrease Weight | Reason |
+|-----------|----------------|----------------|--------|
+| E-commerce product | Image Optimization, Technical | Content Quality | Products need visuals and speed more than long-form |
+| Long-form guide | Content Quality, Keywords | Image Optimization | Content depth matters most |
+| Landing page | Technical, Title | Content Quality | Speed and CTR drive conversions |
+| Local service page | Technical, Links | Keywords | NAP consistency and local signals matter more |
+
+Always document weight adjustments and the reasoning in the audit report.
+
+---
+
+## Scoring Rubric
+
+### Section Weight Distribution
+
+| Audit Section | Weight | Max Score | Rationale |
+|--------------|--------|-----------|-----------|
+| Title Tag | 15% | 15 | Strongest single ranking signal |
+| Meta Description | 5% | 5 | CTR impact, not direct ranking factor |
+| Header Structure | 10% | 10 | Content organization, semantic signals |
+| Content Quality | 25% | 25 | Strongest holistic ranking factor |
+| Keyword Optimization | 15% | 15 | Relevance signals |
+| Internal/External Links | 10% | 10 | Authority flow, context signals |
+| Image Optimization | 10% | 10 | Accessibility + image search opportunity |
+| Page-Level Technical | 10% | 10 | Core Web Vitals, mobile, security |
+
+### Scoring Scale per Factor
+
+| Score | Meaning | Action Required |
+|-------|---------|-----------------|
+| 10/10 | Excellent — follows all best practices | None |
+| 7-9/10 | Good — minor improvements possible | Optional optimization |
+| 4-6/10 | Needs work — notable issues | Fix within this week |
+| 1-3/10 | Poor — significant problems | Fix immediately (Critical) |
+| 0/10 | Missing or broken | Fix immediately (Blocking) |
+
+### Scoring Conversion Formula
+
+Each section is scored out of 10, then converted to the 100-point overall score using section weights:
+
+```
+Overall Score = Sum of (section_score x section_weight) x 10
+```
+
+Where section weights are: Title 0.15, Meta 0.05, Headers 0.10, Content 0.25, Keywords 0.15, Links 0.10, Images 0.10, Technical 0.10.
+
+**Worked example:**
+
+| Section | Score /10 | Weight | Weighted |
+|---------|-----------|--------|----------|
+| Title Tag | 8 | 0.15 | 1.20 |
+| Meta Description | 6 | 0.05 | 0.30 |
+| Header Structure | 9 | 0.10 | 0.90 |
+| Content Quality | 7 | 0.25 | 1.75 |
+| Keyword Optimization | 8 | 0.15 | 1.20 |
+| Internal/External Links | 5 | 0.10 | 0.50 |
+| Image Optimization | 6 | 0.10 | 0.60 |
+| Page-Level Technical | 7 | 0.10 | 0.70 |
+| **Total** | | **1.00** | **7.15** |
+
+**Overall Score** = 7.15 x 10 = **71 / 100**
+
+## Common Issue Resolution Playbook
+
+### Title Tag Issues
+
+| Issue | Impact | Quick Fix Template |
+|-------|--------|-------------------|
+| Missing title | Critical | Add: "[Primary Keyword]: [Benefit] | [Brand]" |
+| Too long (>60 chars) | Medium | Shorten: move brand to end, remove filler words |
+| Too short (<30 chars) | Medium | Expand: add modifier, benefit, or year |
+| Missing keyword | High | Rewrite to include primary keyword in first half |
+| Duplicate title | High | Make each page title unique; add page-specific modifier |
+
+### Meta Description Issues
+
+| Issue | Impact | Quick Fix Template |
+|-------|--------|-------------------|
+| Missing description | Medium | Write: "[What this page covers]. [Key benefit]. [CTA]." (150-160 chars) |
+| Too long (>160 chars) | Low | Trim from end; ensure core message fits in 150 chars |
+| Missing keyword | Low | Naturally incorporate primary keyword |
+| No CTA | Low | Add: "Learn more", "Discover", "Find out", "Get started" |
+| Duplicated across pages | Medium | Write unique description for each page |
+
+### Header Issues
+
+| Issue | Impact | Quick Fix |
+|-------|--------|-----------|
+| Missing H1 | Critical | Add one H1 per page containing primary keyword |
+| Multiple H1s | High | Keep one H1, convert others to H2 |
+| Skipped heading levels | Medium | Use sequential hierarchy: H1→H2→H3 |
+| Headers not descriptive | Medium | Rewrite to include keyword variations |
+| No H2s (single long block) | Medium | Break content into sections with descriptive H2s every 200-300 words |
+
+### Content Issues
+
+| Issue | Impact | Quick Fix |
+|-------|--------|-----------|
+| Thin content (<300 words) | Critical | Expand with subtopics, FAQ, examples |
+| Keyword stuffing (>3%) | High | Reduce usage, use synonyms and related terms |
+| No structured data | Medium | Add relevant schema (FAQ, HowTo, Article) |
+| Missing internal links | Medium | Add 3-5 contextual internal links |
+| No images | Low | Add 2-3 relevant images with alt text |
+
+## Industry Benchmark Data
+
+### Content Length Benchmarks by Query Type
+
+| Query Type | Top 10 Average Word Count | Recommended Minimum |
+|-----------|--------------------------|-------------------|
+| Informational (guides) | 2,200 words | 1,500 words |
+| Commercial (reviews) | 1,800 words | 1,200 words |
+| Transactional (product) | 800 words | 500 words |
+| Local (service pages) | 600 words | 400 words |
+| Definition queries | 1,200 words | 800 words |
+
+### Page Speed Benchmarks
+
+| Metric | Good | Needs Improvement | Poor |
+|--------|------|-------------------|------|
+| LCP | ≤2.5s | 2.5-4.0s | >4.0s |
+| FID/INP | ≤100ms/200ms | 100-300ms | >300ms |
+| CLS | ≤0.1 | 0.1-0.25 | >0.25 |
+| TTFB | ≤800ms | 800-1800ms | >1800ms |
+
diff --git a/.cursor/skills/performance-reporter/SKILL.md b/.cursor/skills/performance-reporter/SKILL.md
new file mode 100644
index 0000000000..27ae07d35e
--- /dev/null
+++ b/.cursor/skills/performance-reporter/SKILL.md
@@ -0,0 +1,263 @@
+---
+name: performance-reporter
+description: 'Generate SEO/GEO dashboards: rankings, traffic, backlinks, AI visibility for stakeholders. SEO报告/绩效仪表盘'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when generating SEO performance reports, traffic summaries, ranking reports, or stakeholder-facing dashboards."
+argument-hint: "<domain> [date range]"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - geo
+    - seo-reporting
+    - performance-report
+    - kpi-dashboard
+    - traffic-report
+    - monthly-report
+    - stakeholder-report
+    - SEO报告
+    - SEOレポート
+    - SEO리포트
+    - informe-seo
+  triggers:
+    # EN-formal
+    - "generate SEO report"
+    - "performance report"
+    - "traffic report"
+    - "SEO dashboard"
+    - "SEO analytics"
+    - "monthly report"
+    # EN-casual
+    - "report to stakeholders"
+    - "monthly SEO report"
+    - "show me my SEO results"
+    - "present SEO results to my boss"
+    - "report to my boss"
+    - "monthly SEO summary"
+    # EN-question
+    - "how are my SEO metrics"
+    - "how is my SEO performing"
+    # ZH-pro
+    - "SEO报告"
+    - "绩效仪表盘"
+    - "流量报告"
+    - "数据看板"
+    # ZH-casual
+    - "出SEO报告"
+    - "汇报给老板"
+    - "看看数据"
+    - "月报"
+    - "出月报"
+    - "周报"
+    # JA
+    - "SEOレポート"
+    - "パフォーマンスレポート"
+    # KO
+    - "SEO 리포트"
+    - "성과 보고서"
+    # ES
+    - "informe SEO"
+    - "reporte de rendimiento"
+    # PT
+    - "relatório SEO"
+    # Misspellings
+    - "SEO repoort"
+---
+
+# Performance Reporter
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This monitoring skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill creates comprehensive SEO and GEO performance reports that combine multiple metrics into actionable insights. It produces executive summaries, detailed analyses, and visual data presentations for stakeholder communication.
+
+**System role**: Monitoring layer skill. It turns performance changes into deltas, alerts, and next actions.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs time-aware change detection, escalation, or stakeholder-ready visibility.
+
+- Monthly/quarterly SEO reporting
+- Executive stakeholder updates
+- Client reporting for agencies
+- Tracking campaign performance
+- Combining multiple SEO metrics
+- Creating GEO visibility reports
+- Documenting ROI from SEO efforts
+
+## What This Skill Does
+
+1. **Data Aggregation**: Combines multiple SEO data sources
+2. **Trend Analysis**: Identifies patterns across metrics
+3. **Executive Summaries**: Creates high-level overviews
+4. **Visual Reports**: Presents data in clear formats
+5. **Benchmark Comparison**: Tracks against goals and competitors
+6. **Content Quality Tracking**: Integrates CORE-EEAT scores across audited pages
+7. **ROI Calculation**: Measures SEO investment returns
+8. **Recommendations**: Suggests actions based on data
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Generate Performance Report
+
+```
+Create an SEO performance report for [domain] for [time period]
+```
+
+### Executive Summary
+
+```
+Generate an executive summary of SEO performance for [month/quarter]
+```
+
+### Specific Report Types
+
+```
+Create a GEO visibility report for [domain]
+```
+
+```
+Generate a content performance report
+```
+
+## Skill Contract
+
+**Expected output**: a delta summary, alert/report output, and a short handoff summary ready for `memory/monitoring/`.
+
+- **Reads**: current metrics, previous baselines, alert thresholds, and reporting context from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing monitoring deliverable plus a reusable summary that can be stored under `memory/monitoring/`.
+- **Promotes**: significant changes, confirmed anomalies, and follow-up actions to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when a change needs action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~analytics + ~~search console + ~~SEO tool + ~~AI monitor connected:**
+Automatically aggregate traffic metrics from ~~analytics, search performance data from ~~search console, ranking and backlink data from ~~SEO tool, and GEO visibility metrics from ~~AI monitor. Creates comprehensive multi-source reports with historical trends.
+
+**With manual data only:**
+Ask the user to provide:
+1. Analytics screenshots or traffic data export (sessions, users, conversions)
+2. Search Console data (impressions, clicks, average position)
+3. Keyword ranking data for the reporting period
+4. Backlink metrics (referring domains, new/lost links)
+5. Key performance indicators and goals for comparison
+6. AI citation data if tracking GEO metrics
+
+Proceed with the full analysis using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests a performance report:
+
+1. **Define Report Parameters** -- Domain, report period, comparison period, report type (Monthly/Quarterly/Annual), audience (Executive/Technical/Client), focus areas.
+
+2. **Create Executive Summary** -- Overall performance rating, key wins/watch areas/action required, metrics at a glance table (traffic, rankings, conversions, DA, AI citations), SEO ROI calculation.
+
+3. **Report Organic Traffic Performance** -- Traffic overview (sessions, users, pageviews, bounce rate), traffic trend visualization, traffic by source/device, top performing pages.
+
+4. **Report Keyword Rankings** -- Rankings overview by position range, distribution change visualization, top improvements and declines, SERP feature performance.
+
+5. **Report GEO/AI Performance** -- AI citation overview, citations by topic, GEO wins, optimization opportunities.
+
+6. **Report Domain Authority (CITE Score)** -- If a CITE audit has been run, include CITE dimension scores (C/I/T/E) with period-over-period trends and veto status. If no audit exists, note as "Not yet evaluated."
+
+7. **Content Quality (CORE-EEAT Score)** -- If content-quality-auditor has been run, include average scores across all 8 CORE-EEAT dimensions with trends. If no audit exists, note as "Not yet evaluated."
+
+8. **Report Backlink Performance** -- Link profile summary, weekly link acquisition, notable new links, competitive position.
+
+9. **Report Content Performance** -- Publishing summary, top performing content, content needing attention, content ROI.
+
+10. **Generate Recommendations** -- Immediate/short-term/long-term actions with priority, expected impact, and owner. Goals for next period.
+
+11. **Compile Full Report** -- Combine all sections with table of contents, appendix (data sources, methodology, glossary).
+
+   > **Reference**: See [references/report-output-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/performance-reporter/references/report-output-templates.md) for complete output templates for all 11 report sections.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Reporting period clearly defined with comparison period
+- [ ] All required data sources available or alternatives noted
+- [ ] Target audience identified (executive/technical/client)
+- [ ] Performance goals and KPIs established for benchmarking
+
+### Output Validation
+- [ ] Every metric cites its data source and collection date
+- [ ] Trends include period-over-period comparisons
+- [ ] Recommendations are specific, prioritized, and actionable
+- [ ] Source of each data point clearly stated (~~analytics data, ~~search console data, ~~SEO tool data, user-provided, or estimated)
+
+## Example
+
+**User**: "Create a monthly SEO report for cloudhosting.com for January 2025"
+
+**Output** (abbreviated -- full report uses templates from all 11 steps):
+
+```markdown
+# CloudHosting SEO & GEO Performance Report — January 2025
+
+## Executive Summary — Overall Performance: Good
+
+| Metric | Jan 2025 | Dec 2024 | Change | Target | Status |
+|--------|----------|----------|--------|--------|--------|
+| Organic Traffic | 52,100 | 45,200 | +15.3% | 50,000 | On track |
+| Keywords Top 10 | 87 | 79 | +8 | 90 | Watch |
+| Organic Conversions | 684 | 612 | +11.8% | 700 | Watch |
+| Domain Rating | 54 | 53 | +1 | 55 | Watch |
+| AI Citations | 18 | 12 | +50.0% | 20 | Watch |
+
+**SEO ROI**: $8,200 invested / $41,040 organic revenue = 400%
+
+**Immediate**: Fix 37 crawl errors on /pricing/ pages
+**This Month**: Optimize mobile LCP; publish 3 AI Overview comparison pages
+**This Quarter**: Build Wikidata entry for CloudHost Inc.
+```
+
+## Tips for Success
+
+1. **Lead with insights** - Start with what matters, not raw data
+2. **Visualize data** - Charts and graphs improve comprehension
+3. **Compare periods** - Context makes data meaningful
+4. **Include actions** - Every report should drive decisions
+5. **Customize for audience** - Executives need different info than technical teams
+6. **Track GEO metrics** - AI visibility is increasingly important
+
+
+### Save Results
+
+After delivering monitoring data or reports to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/monitoring/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding or status change
+- Top 3-5 actionable items
+- Open loops or anomalies requiring follow-up
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Report Output Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/performance-reporter/references/report-output-templates.md) — Complete output templates for all 11 report sections
+- [KPI Definitions](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/performance-reporter/references/kpi-definitions.md) — SEO/GEO metric definitions with benchmarks, good ranges, warning thresholds, trend analysis, and attribution guidance
+- [Report Templates by Audience](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/performance-reporter/references/report-templates.md) — Copy-ready templates for executive, marketing, technical, and client audiences
+
+## Next Best Skill
+
+- **Primary**: [alert-manager](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/alert-manager/SKILL.md) — turn reporting insights into ongoing monitoring rules.
diff --git a/.cursor/skills/performance-reporter/references/kpi-definitions.md b/.cursor/skills/performance-reporter/references/kpi-definitions.md
new file mode 100644
index 0000000000..74632b0361
--- /dev/null
+++ b/.cursor/skills/performance-reporter/references/kpi-definitions.md
@@ -0,0 +1,524 @@
+# SEO/GEO KPI Definitions
+
+Complete glossary of SEO and GEO key performance indicators with calculation formulas, data sources, benchmark ranges by industry, and interpretation guidance.
+
+---
+
+## 1. Organic Search KPIs
+
+### Organic Sessions
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Number of visits to your site originating from organic (unpaid) search engine results |
+| **Formula** | Count of sessions where medium = "organic" |
+| **Data Source** | ~~analytics (Google Analytics, Adobe Analytics, or equivalent) |
+| **Good Range** | Growing month-over-month; 3-10% MoM growth is healthy |
+| **Warning** | Decline >10% MoM without known seasonal cause |
+| **Segmentation** | Always separate brand vs. non-brand organic sessions |
+
+**Interpretation:**
+- Growing organic sessions with stable conversion rate = SEO strategy is working.
+- Growing sessions but declining conversions = traffic quality issue; check keyword targeting.
+- Flat sessions despite new content = content not ranking or cannibalizing existing pages.
+
+---
+
+### Organic Click-Through Rate (CTR)
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of search impressions that result in a click to your site |
+| **Formula** | (Organic Clicks / Organic Impressions) x 100 |
+| **Data Source** | ~~search console |
+| **Good Range** | >3% overall; varies significantly by position and query type |
+| **Warning** | <1.5% overall or declining trend |
+| **Segmentation** | By query type (brand vs. non-brand), by page, by position range |
+
+**Benchmarks by Position:**
+
+| Position | Expected CTR Range |
+|----------|-------------------|
+| #1 | 25-35% |
+| #2 | 12-18% |
+| #3 | 8-13% |
+| #4-5 | 4-8% |
+| #6-10 | 2-5% |
+| #11-20 | 0.5-2% |
+
+**Interpretation:**
+- High impressions but low CTR = title tags and meta descriptions need optimization.
+- CTR declining for stable positions = SERP features (AI Overview, PAA) stealing clicks.
+- CTR higher than position benchmarks = strong brand recognition or compelling snippets.
+
+---
+
+### Average Position
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Mean ranking position across all tracked keywords or queries |
+| **Formula** | Sum of all positions / count of keywords |
+| **Data Source** | ~~search console (query-level), ~~SEO tool (keyword-level) |
+| **Good Range** | <20 for tracked keywords; improving trend |
+| **Warning** | >30 or rising (worsening) trend |
+| **Segmentation** | By keyword group, by page, by intent type |
+
+**Interpretation:**
+- Average position is a directional indicator, not an absolute measure. A few very low-ranking keywords can drag the average down significantly.
+- Always pair with keyword distribution (how many keywords in top 10, top 20, etc.) for a complete picture.
+
+---
+
+### Keyword Visibility Score
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Weighted score combining keyword positions and search volumes into a single index |
+| **Formula** | Sum of (estimated CTR at position x monthly search volume) for each keyword |
+| **Data Source** | ~~SEO tool |
+| **Good Range** | Growing over time; absolute value depends on niche |
+| **Warning** | Declining trend for 3+ consecutive weeks |
+| **Segmentation** | By topic cluster, by competitor |
+
+**Interpretation:**
+- Visibility score accounts for both ranking position and keyword importance (volume).
+- A single high-volume keyword moving from #1 to #5 can cause a larger visibility drop than 20 low-volume keywords dropping off page 1.
+
+---
+
+### Pages Indexed
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Number of your pages included in Google's search index |
+| **Formula** | Count of valid indexed pages in Index Coverage report |
+| **Data Source** | ~~search console (Index Coverage / Pages report) |
+| **Good Range** | Indexed count close to total intended indexable pages; growing with new content |
+| **Warning** | Indexed count dropping without intentional removal; large gap between submitted and indexed |
+| **Segmentation** | By sitemap, by content type, by subdirectory |
+
+**Interpretation:**
+- Indexed < submitted = quality or technical issues preventing indexing.
+- Sudden drop in indexed pages = possible noindex tag, robots.txt change, or manual action.
+- Indexed > intended = duplicate content, parameter URLs, or faceted navigation issues.
+
+---
+
+### Organic Conversion Rate
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of organic sessions that complete a defined conversion goal |
+| **Formula** | (Organic Conversions / Organic Sessions) x 100 |
+| **Data Source** | ~~analytics |
+| **Good Range** | >2% for lead generation; >1% for e-commerce (varies by industry) |
+| **Warning** | <0.5% or declining while traffic grows |
+| **Segmentation** | By landing page, by keyword intent, by device |
+
+**Industry Benchmarks:**
+
+| Industry | Typical Organic CVR |
+|----------|-------------------|
+| SaaS / Software | 2-5% |
+| E-commerce | 1-3% |
+| Finance | 3-6% |
+| Healthcare | 2-4% |
+| B2B Services | 2-5% |
+| Media / Publishing | 0.5-2% (ad-supported) |
+| Education | 2-5% |
+
+---
+
+### Non-Brand Organic Traffic Share
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of organic traffic coming from non-branded search queries |
+| **Formula** | (Organic sessions - brand query sessions) / Organic sessions x 100 |
+| **Data Source** | ~~search console + ~~analytics |
+| **Good Range** | >50% of total organic; growing |
+| **Warning** | <30% (over-reliance on brand awareness, not SEO) |
+| **Segmentation** | Trend over time |
+
+**Interpretation:**
+- High non-brand share = SEO is driving new audience discovery.
+- Low non-brand share = organic traffic is mostly people who already know your brand; SEO is underperforming for acquisition.
+
+---
+
+## 2. GEO / AI Visibility KPIs
+
+### AI Citation Rate
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of monitored queries where your content is cited in AI-generated answers |
+| **Formula** | (Queries where you are cited / Total monitored queries with AI answers) x 100 |
+| **Data Source** | ~~AI monitor |
+| **Good Range** | >20% of monitored queries |
+| **Warning** | <5% or declining trend |
+| **Segmentation** | By topic cluster, by content type |
+
+---
+
+### AI Citation Position
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Your average position among cited sources in AI-generated responses |
+| **Formula** | Sum of citation positions / count of citations |
+| **Data Source** | ~~AI monitor |
+| **Good Range** | Top 3 sources on average |
+| **Warning** | Not cited or consistently cited in position 5+ |
+| **Segmentation** | By query, by topic |
+
+---
+
+### AI Answer Coverage
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of your target topics that appear in AI-generated answers |
+| **Formula** | (Topics with AI answers / Total target topics) x 100 |
+| **Data Source** | ~~AI monitor |
+| **Good Range** | Growing over time as AI answers expand |
+| **Warning** | Declining coverage may indicate content quality issues |
+| **Segmentation** | By topic cluster |
+
+---
+
+### Brand Mention in AI Responses
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Number of times your brand is mentioned in AI-generated responses across monitored queries |
+| **Formula** | Count of AI responses containing your brand name |
+| **Data Source** | ~~AI monitor |
+| **Good Range** | Growing; present in responses for your key topics |
+| **Warning** | Zero mentions for topics where you are an authority |
+| **Segmentation** | By query category |
+
+---
+
+## 3. Domain Authority KPIs
+
+### Domain Rating / Domain Authority
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Proprietary metric estimating the overall strength of a domain's backlink profile (0-100 scale) |
+| **Formula** | Varies by tool (logarithmic scale based on backlink quantity and quality) |
+| **Data Source** | ~~SEO tool (Ahrefs DR, Moz DA, or equivalent) |
+| **Good Range** | Growing; competitive with top-ranking sites in your niche |
+| **Warning** | Declining or significantly below competitors |
+| **Segmentation** | Compare against competitors |
+
+**Benchmarks by Site Stage:**
+
+| Site Stage | Typical DR/DA |
+|-----------|--------------|
+| Brand new (0-6 months) | 0-15 |
+| Early growth (6-18 months) | 15-30 |
+| Established (18-36 months) | 25-50 |
+| Mature (3+ years) | 40-70+ |
+| Industry leader | 70-90+ |
+
+---
+
+### Referring Domains
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Count of unique domains that link to your site |
+| **Formula** | Count of distinct root domains with at least one dofollow or nofollow link |
+| **Data Source** | ~~link database |
+| **Good Range** | Growing MoM; higher than primary competitors |
+| **Warning** | Net loss of referring domains for 2+ consecutive months |
+| **Segmentation** | By authority tier (DR 0-20, 20-40, 40-60, 60+) |
+
+---
+
+### Backlink Growth Rate
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Net new backlinks acquired per month |
+| **Formula** | New backlinks gained - backlinks lost in the period |
+| **Data Source** | ~~link database |
+| **Good Range** | Positive and steady; proportional to content output |
+| **Warning** | Negative for 2+ months; sudden spikes (may indicate spam) |
+| **Segmentation** | By link quality tier |
+
+---
+
+### Toxic Link Ratio
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of your backlinks classified as toxic or spammy |
+| **Formula** | (Toxic backlinks / Total backlinks) x 100 |
+| **Data Source** | ~~link database (toxic score/spam score) |
+| **Good Range** | <5% |
+| **Warning** | 5-10% (monitor and clean up) |
+| **Critical** | >10% (immediate disavow action needed) |
+| **Segmentation** | By toxic type (PBN, spam, irrelevant) |
+
+---
+
+## 4. Technical SEO KPIs
+
+### Core Web Vitals
+
+| Metric | Definition | Good | Needs Improvement | Poor |
+|--------|-----------|------|-------------------|------|
+| **LCP** (Largest Contentful Paint) | Time to render largest content element | <=2.5s | 2.5-4.0s | >4.0s |
+| **CLS** (Cumulative Layout Shift) | Visual stability during page load | <=0.1 | 0.1-0.25 | >0.25 |
+| **INP** (Interaction to Next Paint) | Responsiveness to user interactions | <=200ms | 200-500ms | >500ms |
+
+**Data Source:** ~~search console (Core Web Vitals report), Chrome UX Report, PageSpeed Insights
+
+---
+
+### Crawl Budget Utilization
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | How efficiently search engine crawlers are spending their crawl budget on your site |
+| **Formula** | (Useful pages crawled / Total pages crawled) x 100 |
+| **Data Source** | ~~search console (Crawl Stats), server logs |
+| **Good Range** | >80% of crawled pages are indexable, valuable pages |
+| **Warning** | High crawl of non-indexable or low-value pages |
+| **Segmentation** | By content type, by HTTP status code |
+
+---
+
+### Index Coverage Rate
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of submitted pages that are successfully indexed |
+| **Formula** | (Indexed pages / Submitted pages) x 100 |
+| **Data Source** | ~~search console |
+| **Good Range** | >90% for sites with curated sitemaps |
+| **Warning** | <80% or declining |
+| **Segmentation** | By sitemap, by exclusion reason |
+
+---
+
+## 5. Content Performance KPIs
+
+### Content Efficiency Score
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Ratio of content investment to organic traffic generated |
+| **Formula** | Organic sessions per content piece / cost per content piece |
+| **Data Source** | ~~analytics + internal cost tracking |
+| **Good Range** | Improving over time; varies by content type |
+| **Warning** | Declining efficiency despite continued investment |
+| **Segmentation** | By content type, by topic, by author |
+
+---
+
+### Content Decay Rate
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of existing content losing organic traffic over a defined period |
+| **Formula** | (Pages with >20% traffic decline over 6 months / Total pages with traffic) x 100 |
+| **Data Source** | ~~analytics |
+| **Good Range** | <20% of pages decaying per 6-month period |
+| **Warning** | >30% of pages decaying |
+| **Segmentation** | By content age, by topic, by content type |
+
+---
+
+### Organic Revenue Per Session
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Average revenue generated per organic search session |
+| **Formula** | Total organic revenue / Total organic sessions |
+| **Data Source** | ~~analytics (e-commerce tracking or goal values) |
+| **Good Range** | Stable or growing; varies hugely by industry |
+| **Warning** | Declining while traffic grows (traffic quality deteriorating) |
+| **Segmentation** | By landing page, by keyword intent, by device |
+
+---
+
+## 6. Competitive KPIs
+
+### Share of Voice (SOV)
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Your visibility as a percentage of total visibility across tracked keywords |
+| **Formula** | (Your visibility score / Sum of all tracked competitors' visibility scores) x 100 |
+| **Data Source** | ~~SEO tool |
+| **Good Range** | Growing; leading in your core topic areas |
+| **Warning** | Declining for 3+ consecutive months |
+| **Segmentation** | By topic cluster, by competitor |
+
+---
+
+### Competitive Keyword Overlap
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Percentage of your tracked keywords where a specific competitor also ranks in the top 20 |
+| **Formula** | (Keywords where both rank in top 20 / Your total tracked keywords) x 100 |
+| **Data Source** | ~~SEO tool |
+| **Good Range** | Context-dependent; high overlap for direct competitors is expected |
+| **Warning** | New competitor appearing with high overlap indicates emerging threat |
+| **Segmentation** | By competitor, by keyword group |
+
+---
+
+## 7. ROI and Business Impact KPIs
+
+### SEO ROI
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Return on investment from SEO activities |
+| **Formula** | ((Organic Revenue - SEO Investment) / SEO Investment) x 100 |
+| **Data Source** | ~~analytics + internal cost tracking |
+| **Good Range** | >200% annually (SEO compounds over time) |
+| **Warning** | <100% after 12+ months of investment |
+| **Segmentation** | By content type, by campaign |
+
+**Note:** SEO ROI should be measured over 12+ month horizons. Short-term ROI calculations are misleading because SEO benefits compound over time.
+
+---
+
+### Organic Traffic Value
+
+| Attribute | Detail |
+|-----------|--------|
+| **Definition** | Estimated cost to acquire equivalent traffic through paid search |
+| **Formula** | Sum of (monthly organic clicks per keyword x CPC for that keyword) |
+| **Data Source** | ~~SEO tool (traffic value calculation) |
+| **Good Range** | Growing; significantly higher than SEO investment |
+| **Warning** | Declining traffic value despite stable traffic (keywords losing CPC value) |
+| **Segmentation** | By keyword group, by page |
+
+**Interpretation:**
+- Organic traffic value represents how much you would need to spend on PPC to get the same traffic.
+- Useful for communicating SEO value to stakeholders who understand paid media budgets.
+- A site with $50K/month organic traffic value that spends $10K/month on SEO is getting a 5:1 return.
+
+---
+
+## SEO/GEO Metric Definitions and Benchmarks
+
+### Organic Search Metrics
+
+| Metric | Definition | Good Range | Warning | Source |
+|--------|-----------|-----------|---------|--------|
+| Organic sessions | Visits from organic search | Growing MoM | >10% decline | ~~analytics |
+| Keyword visibility | % of target keywords in top 100 | >60% | <40% | ~~SEO tool |
+| Average position | Mean position across tracked keywords | <20 | >30 | ~~search console |
+| Organic CTR | Clicks / impressions from search | >3% | <1.5% | ~~search console |
+| Pages indexed | Pages in Google index | Growing | Dropping | ~~search console |
+| Organic conversion rate | Conversions / organic sessions | >2% | <0.5% | ~~analytics |
+| Non-brand organic traffic | Organic traffic minus brand searches | >50% of total organic | <30% | ~~analytics |
+
+### GEO/AI Visibility Metrics
+
+| Metric | Definition | Good Range | Warning | Source |
+|--------|-----------|-----------|---------|--------|
+| AI citation rate | % of monitored queries citing your content | >20% | <5% | ~~AI monitor |
+| AI citation position | Average position in AI response citations | Top 3 sources | Not cited | ~~AI monitor |
+| AI answer coverage | % of your topics appearing in AI answers | Growing | Declining | ~~AI monitor |
+| Brand mention in AI | Times your brand is mentioned in AI responses | Growing | Zero | ~~AI monitor |
+
+### Domain Authority Metrics
+
+| Metric | Definition | Good Range | Warning | Source |
+|--------|-----------|-----------|---------|--------|
+| Domain Rating/Authority | Overall domain strength | Growing | Declining | ~~SEO tool |
+| Referring domains | Unique domains linking to you | Growing MoM | Loss >10% MoM | ~~link database |
+| Backlink growth rate | Net new backlinks per month | Positive | Negative trend | ~~link database |
+| Toxic link ratio | Toxic links / total links | <5% | >10% | ~~link database |
+
+## Reporting Templates by Audience
+
+### Executive Report (C-Suite / Leadership)
+
+**Focus:** Business outcomes, ROI, competitive position
+**Length:** 1 page + appendix
+**Frequency:** Monthly or Quarterly
+
+| Section | Content |
+|---------|---------|
+| Traffic & Revenue | Organic traffic trend + attributed revenue |
+| Competitive Position | Visibility share vs. top 3 competitors |
+| AI Visibility | AI citation trend and coverage |
+| Key Wins | Top 3 achievements with business impact |
+| Risks | Top 3 concerns with proposed mitigation |
+| Investment Ask | Resources needed for next period |
+
+### Marketing Team Report
+
+**Focus:** Channel performance, content effectiveness, technical health
+**Length:** 2-3 pages
+**Frequency:** Monthly
+
+| Section | Content |
+|---------|---------|
+| Keyword Performance | Rankings gained/lost, new keywords discovered |
+| Content Performance | Top pages by traffic, engagement, conversions |
+| Technical Health | Crawl errors, speed scores, indexation |
+| Backlink Profile | New links, lost links, quality assessment |
+| GEO Performance | AI citation changes, new citations |
+| Action Items | P0-P3 prioritized task list |
+
+### Technical SEO Report
+
+**Focus:** Crawlability, indexation, speed, errors
+**Length:** Detailed
+**Frequency:** Weekly or Bi-weekly
+
+| Section | Content |
+|---------|---------|
+| Crawl Stats | Pages crawled, errors, crawl budget usage |
+| Index Coverage | Indexed/excluded/errored pages |
+| Core Web Vitals | LCP, CLS, INP trends |
+| Error Log | New 4xx/5xx errors with resolution status |
+| Schema Validation | New warnings, rich result eligibility |
+| Technical Debt | Outstanding issues by priority |
+
+## Trend Analysis Framework
+
+### Period-Over-Period Analysis
+
+| Comparison | Best For | Limitation |
+|-----------|---------|-----------|
+| Week over week (WoW) | Detecting sudden changes | Noisy, affected by day-of-week patterns |
+| Month over month (MoM) | Identifying trends | Seasonal bias |
+| Year over year (YoY) | Accounting for seasonality | Does not reflect recent trajectory |
+| Rolling 30-day average | Smoothing noise | Lags behind real changes |
+
+### Trend Interpretation Guidelines
+
+| Pattern | Likely Cause | Recommended Action |
+|---------|-------------|-------------------|
+| Steady growth | Strategy is working | Continue, optimize high performers |
+| Sudden spike then drop | Viral content or algorithm volatility | Investigate cause, build on if repeatable |
+| Gradual decline | Content decay, competition, technical debt | Comprehensive audit needed |
+| Flat line | Plateau — existing strategy maxed out | New content areas, new link strategies |
+| Seasonal pattern | Industry/demand cycles | Plan content calendar around peaks |
+
+## SEO Attribution Guidance
+
+### Attribution Challenges in SEO
+
+| Challenge | Impact | Mitigation |
+|----------|--------|-----------|
+| Long conversion paths | SEO rarely gets last-touch credit | Use assisted conversions report |
+| Brand vs. non-brand | Brand searches inflate organic metrics | Always separate brand/non-brand |
+| Cross-device journeys | Mobile search to desktop conversion | Enable cross-device tracking |
+| SEO + paid overlap | Cannibalization or lift? | Test turning off paid for branded terms |
+| Content assists sales | Hard to attribute | Track content touches in CRM |
+
diff --git a/.cursor/skills/performance-reporter/references/report-output-templates.md b/.cursor/skills/performance-reporter/references/report-output-templates.md
new file mode 100644
index 0000000000..fa0f0fdfaf
--- /dev/null
+++ b/.cursor/skills/performance-reporter/references/report-output-templates.md
@@ -0,0 +1,417 @@
+# Performance Report Output Templates
+
+Detailed output templates for each step of the performance reporting workflow. Use these templates when generating reports for stakeholders.
+
+---
+
+## 1. Report Configuration Template
+
+```markdown
+## Report Configuration
+
+**Domain**: [domain]
+**Report Period**: [start date] to [end date]
+**Comparison Period**: [previous period for comparison]
+**Report Type**: [Monthly/Quarterly/Annual/Custom]
+**Audience**: [Executive/Technical/Client]
+**Focus Areas**: [Rankings/Traffic/Content/Backlinks/GEO]
+```
+
+---
+
+## 2. Executive Summary Template
+
+```markdown
+# SEO Performance Report
+
+**Domain**: [domain]
+**Period**: [date range]
+**Prepared**: [date]
+
+---
+
+## Executive Summary
+
+### Overall Performance: [Excellent/Good/Needs Attention/Critical]
+
+**Key Highlights**:
+
+Wins:
+- [Win 1 - e.g., "Organic traffic increased 25%"]
+- [Win 2 - e.g., "3 new #1 rankings achieved"]
+- [Win 3 - e.g., "Conversion rate improved 15%"]
+
+Watch Areas:
+- [Area 1 - e.g., "Mobile rankings declining slightly"]
+- [Area 2 - e.g., "Competitor gaining ground on key terms"]
+
+Action Required:
+- [Issue 1 - e.g., "Technical SEO audit needed"]
+
+### Key Metrics at a Glance
+
+| Metric | This Period | Last Period | Change | Target | Status |
+|--------|-------------|-------------|--------|--------|--------|
+| Organic Traffic | [X] | [Y] | [+/-Z%] | [T] | [status] |
+| Keyword Rankings (Top 10) | [X] | [Y] | [+/-Z] | [T] | [status] |
+| Organic Conversions | [X] | [Y] | [+/-Z%] | [T] | [status] |
+| Domain Authority | [X] | [Y] | [+/-Z] | [T] | [status] |
+| AI Citations | [X] | [Y] | [+/-Z%] | [T] | [status] |
+
+### SEO ROI
+
+**Investment**: $[X] (content, tools, effort)
+**Organic Revenue**: $[Y]
+**ROI**: [Z]%
+```
+
+---
+
+## 3. Organic Traffic Analysis Template
+
+```markdown
+## Organic Traffic Analysis
+
+### Traffic Overview
+
+| Metric | This Period | vs Last Period | vs Last Year |
+|--------|-------------|----------------|--------------|
+| Sessions | [X] | [+/-Y%] | [+/-Z%] |
+| Users | [X] | [+/-Y%] | [+/-Z%] |
+| Pageviews | [X] | [+/-Y%] | [+/-Z%] |
+| Avg. Session Duration | [X] | [+/-Y%] | [+/-Z%] |
+| Bounce Rate | [X]% | [+/-Y%] | [+/-Z%] |
+| Pages per Session | [X] | [+/-Y] | [+/-Z] |
+
+### Traffic Trend
+
+```
+[Month 1]  ████████████████████ [X]
+[Month 2]  █████████████████████ [Y]
+[Month 3]  ███████████████████████ [Z]
+[Current]  ████████████████████████ [W]
+```
+
+### Traffic by Source
+
+| Channel | Sessions | % of Total | Change |
+|---------|----------|------------|--------|
+| Organic Search | [X] | [Y]% | [+/-Z%] |
+| Direct | [X] | [Y]% | [+/-Z%] |
+| Referral | [X] | [Y]% | [+/-Z%] |
+| Social | [X] | [Y]% | [+/-Z%] |
+
+### Top Performing Pages
+
+| Page | Sessions | Change | Conversions |
+|------|----------|--------|-------------|
+| [Page 1] | [X] | [+/-Y%] | [Z] |
+| [Page 2] | [X] | [+/-Y%] | [Z] |
+| [Page 3] | [X] | [+/-Y%] | [Z] |
+
+### Traffic by Device
+
+| Device | Sessions | Change | Conv. Rate |
+|--------|----------|--------|------------|
+| Desktop | [X] ([Y]%) | [+/-Z%] | [%] |
+| Mobile | [X] ([Y]%) | [+/-Z%] | [%] |
+| Tablet | [X] ([Y]%) | [+/-Z%] | [%] |
+```
+
+---
+
+## 4. Keyword Ranking Performance Template
+
+```markdown
+## Keyword Ranking Performance
+
+### Rankings Overview
+
+| Position Range | Keywords | Change | Traffic Impact |
+|----------------|----------|--------|----------------|
+| Position 1 | [X] | [+/-Y] | [Z] sessions |
+| Position 2-3 | [X] | [+/-Y] | [Z] sessions |
+| Position 4-10 | [X] | [+/-Y] | [Z] sessions |
+| Position 11-20 | [X] | [+/-Y] | [Z] sessions |
+| Position 21-50 | [X] | [+/-Y] | [Z] sessions |
+
+### Top Ranking Improvements
+
+| Keyword | Previous | Current | Change | Traffic |
+|---------|----------|---------|--------|---------|
+| [kw 1] | [X] | [Y] | +[Z] | [sessions] |
+
+### Rankings That Declined
+
+| Keyword | Previous | Current | Change | Impact | Action |
+|---------|----------|---------|--------|--------|--------|
+| [kw 1] | [X] | [Y] | -[Z] | -[sessions] | [action] |
+
+### SERP Feature Performance
+
+| Feature | Won | Lost | Opportunities |
+|---------|-----|------|---------------|
+| Featured Snippets | [X] | [Y] | [Z] |
+| People Also Ask | [X] | [Y] | [Z] |
+| Local Pack | [X] | [Y] | [Z] |
+```
+
+---
+
+## 5. GEO/AI Visibility Template
+
+```markdown
+## GEO (AI Visibility) Performance
+
+### AI Citation Overview
+
+| Metric | This Period | Last Period | Change |
+|--------|-------------|-------------|--------|
+| Keywords with AI Overview | [X]/[Y] | [X]/[Y] | [+/-Z] |
+| Your AI Citations | [X] | [Y] | [+/-Z%] |
+| Citation Rate | [X]% | [Y]% | [+/-Z%] |
+| Avg Citation Position | [X] | [Y] | [+/-Z] |
+
+### AI Citation by Topic
+
+| Topic Cluster | Opportunities | Citations | Rate |
+|---------------|---------------|-----------|------|
+| [Topic 1] | [X] | [Y] | [Z]% |
+
+### GEO Wins This Period
+
+| Query | Citation Status | Source Page | Impact |
+|-------|-----------------|-------------|--------|
+| [query 1] | New citation | [page] | High visibility |
+
+### GEO Optimization Opportunities
+
+| Query | AI Overview | You Cited? | Gap | Action |
+|-------|-------------|------------|-----|--------|
+| [query] | Yes | No | [gap] | [action] |
+```
+
+---
+
+## 6. Domain Authority (CITE Score) Template
+
+```markdown
+## Domain Authority (CITE Score)
+
+### CITE Score Summary
+
+| Metric | This Period | Last Period | Change |
+|--------|-------------|-------------|--------|
+| CITE Score | [X]/100 | [Y]/100 | [+/-Z] |
+| C -- Citation | [X]/100 | [Y]/100 | [+/-Z] |
+| I -- Identity | [X]/100 | [Y]/100 | [+/-Z] |
+| T -- Trust | [X]/100 | [Y]/100 | [+/-Z] |
+| E -- Eminence | [X]/100 | [Y]/100 | [+/-Z] |
+
+**Veto Status**: No triggers / [item] triggered
+
+### Key Changes
+
+- [Notable improvement or concern 1]
+- [Notable improvement or concern 2]
+
+_For full 40-item evaluation, run `/seo:audit-domain`_
+```
+
+**Note**: If no previous CITE audit exists, note this section as "Not yet evaluated -- run domain-authority-auditor for baseline" and skip.
+
+---
+
+## 7. Content Quality (CORE-EEAT Score) Template
+
+```markdown
+## Content Quality (CORE-EEAT Score)
+
+### Content Quality Summary
+
+| Metric | Value |
+|--------|-------|
+| Pages Audited | [count] |
+| Average CORE-EEAT Score | [score]/100 ([rating]) |
+| Average GEO Score (CORE) | [score]/100 |
+| Average SEO Score (EEAT) | [score]/100 |
+| Veto Items Triggered | [count] ([item IDs]) |
+
+### Dimension Averages Across Audited Pages
+
+| Dimension | Average Score | Trend |
+|-----------|--------------|-------|
+| C -- Contextual Clarity | [score] | [up/down/stable] |
+| O -- Organization | [score] | [up/down/stable] |
+| R -- Referenceability | [score] | [up/down/stable] |
+| E -- Exclusivity | [score] | [up/down/stable] |
+| Exp -- Experience | [score] | [up/down/stable] |
+| Ept -- Expertise | [score] | [up/down/stable] |
+| A -- Authority | [score] | [up/down/stable] |
+| T -- Trust | [score] | [up/down/stable] |
+
+### Key Content Quality Changes
+
+- [Notable score changes since last report]
+- [Pages with significant quality improvements/declines]
+
+_For full 80-item evaluation, run `/seo:audit-page` on individual pages._
+```
+
+**Note**: If no content quality audit exists, note this section as "Content quality not yet evaluated -- run `/seo:audit-page` on key landing pages to establish baseline" and skip.
+
+---
+
+## 8. Backlink Performance Template
+
+```markdown
+## Backlink Performance
+
+### Link Profile Summary
+
+| Metric | This Period | Last Period | Change |
+|--------|-------------|-------------|--------|
+| Total Backlinks | [X] | [Y] | [+/-Z] |
+| Referring Domains | [X] | [Y] | [+/-Z] |
+| Domain Authority | [X] | [Y] | [+/-Z] |
+| Avg. Link DA | [X] | [Y] | [+/-Z] |
+
+### Link Acquisition
+
+| Period | New Links | Lost Links | Net |
+|--------|-----------|------------|-----|
+| Week 1 | [X] | [Y] | [+/-Z] |
+| Week 2 | [X] | [Y] | [+/-Z] |
+| Week 3 | [X] | [Y] | [+/-Z] |
+| Week 4 | [X] | [Y] | [+/-Z] |
+| **Total** | **[X]** | **[Y]** | **[+/-Z]** |
+
+### Notable New Links
+
+| Source | DA | Type | Value |
+|--------|-----|------|-------|
+| [domain 1] | [DA] | [type] | High |
+
+### Competitive Position
+
+Your referring domains rank #[X] of [Y] competitors.
+```
+
+---
+
+## 9. Content Performance Template
+
+```markdown
+## Content Performance
+
+### Content Publishing Summary
+
+| Metric | This Period | Last Period | Target |
+|--------|-------------|-------------|--------|
+| New articles published | [X] | [Y] | [Z] |
+| Content updates | [X] | [Y] | [Z] |
+| Total word count | [X] | [Y] | - |
+
+### Top Performing Content
+
+| Content | Traffic | Rankings | Conversions | Status |
+|---------|---------|----------|-------------|--------|
+| [Title 1] | [X] | [Y] keywords | [Z] | Top performer |
+| [Title 2] | [X] | [Y] keywords | [Z] | Growing |
+
+### Content Needing Attention
+
+| Content | Issue | Traffic Change | Action |
+|---------|-------|----------------|--------|
+| [Title] | [issue] | -[X]% | [action] |
+
+### Content ROI
+
+| Content Piece | Investment | Traffic Value | ROI |
+|---------------|------------|---------------|-----|
+| [Title 1] | $[X] | $[Y] | [Z]% |
+```
+
+---
+
+## 10. Recommendations Template
+
+```markdown
+## Recommendations & Next Steps
+
+### Immediate Actions (This Week)
+
+| Priority | Action | Expected Impact | Owner |
+|----------|--------|-----------------|-------|
+| High | [Action 1] | [Impact] | [Owner] |
+
+### Short-term (This Month)
+
+| Priority | Action | Expected Impact | Owner |
+|----------|--------|-----------------|-------|
+| Medium | [Action 1] | [Impact] | [Owner] |
+
+### Long-term (This Quarter)
+
+| Priority | Action | Expected Impact | Owner |
+|----------|--------|-----------------|-------|
+| Planned | [Action 1] | [Impact] | [Owner] |
+
+### Goals for Next Period
+
+| Metric | Current | Target | Action to Achieve |
+|--------|---------|--------|-------------------|
+| Organic Traffic | [X] | [Y] | [action] |
+| Keywords Top 10 | [X] | [Y] | [action] |
+| AI Citations | [X] | [Y] | [action] |
+| Referring Domains | [X] | [Y] | [action] |
+```
+
+---
+
+## 11. Full Report Compilation Template
+
+```markdown
+# [Company] SEO & GEO Performance Report
+
+## [Month/Quarter] [Year]
+
+---
+
+### Table of Contents
+
+1. Executive Summary
+2. Organic Traffic Performance
+3. Keyword Rankings
+4. GEO/AI Visibility
+5. Domain Authority (CITE Score)
+6. Content Quality (CORE-EEAT Score)
+7. Backlink Analysis
+8. Content Performance
+9. Technical Health
+10. Competitive Landscape
+11. Recommendations
+12. Appendix
+
+---
+
+[Include all sections from above]
+
+---
+
+## Appendix
+
+### Data Sources
+- ~~analytics (traffic and conversion data)
+- ~~search console (search performance)
+- ~~SEO tool (rankings and backlinks)
+- ~~AI monitor (GEO metrics)
+
+### Methodology
+[Explain how metrics were calculated]
+
+### Glossary
+- **GEO**: Generative Engine Optimization
+- **DA**: Domain Authority
+- [Additional terms]
+```
diff --git a/.cursor/skills/performance-reporter/references/report-templates.md b/.cursor/skills/performance-reporter/references/report-templates.md
new file mode 100644
index 0000000000..2312194dbe
--- /dev/null
+++ b/.cursor/skills/performance-reporter/references/report-templates.md
@@ -0,0 +1,383 @@
+# SEO/GEO Report Templates
+
+Copy-ready report templates for executive, marketing, and technical audiences. Each template includes example data to illustrate proper formatting and data presentation.
+
+---
+
+## 1. Executive Report Template
+
+Use this template for C-suite, VP-level, or board reporting. Focuses on business impact, competitive position, and strategic recommendations. Keep to 1 page plus optional appendix.
+
+---
+
+```markdown
+# SEO & GEO Performance Summary
+
+**Period:** January 2025
+**Prepared for:** Leadership Team
+**Prepared by:** [Name], SEO Lead
+
+---
+
+## Performance at a Glance
+
+| KPI | Jan 2025 | Dec 2024 | MoM Change | YoY Change | Target | Status |
+|-----|----------|----------|------------|------------|--------|--------|
+| Organic Revenue | $142,000 | $128,000 | +10.9% | +34% | $130,000 | On track |
+| Organic Sessions | 285,000 | 261,000 | +9.2% | +28% | 270,000 | On track |
+| Keywords in Top 10 | 187 | 172 | +15 | +62 | 180 | On track |
+| AI Citations | 34 | 28 | +21.4% | N/A | 30 | On track |
+| Domain Rating | 52 | 51 | +1 | +8 | 55 | In progress |
+| Organic CVR | 2.8% | 2.6% | +0.2pp | +0.5pp | 2.5% | On track |
+
+## Competitive Position
+
+**Share of Voice Ranking:** #2 of 5 tracked competitors (up from #3 in Q3)
+
+| Rank | Competitor | Visibility Share |
+|------|-----------|-----------------|
+| 1 | CompetitorA.com | 28% |
+| **2** | **YourSite.com** | **24%** |
+| 3 | CompetitorB.com | 21% |
+| 4 | CompetitorC.com | 16% |
+| 5 | CompetitorD.com | 11% |
+
+## Key Wins
+
+1. **Achieved #1 ranking for "project management software"** — estimated $18K/month traffic value
+2. **AI citation rate increased to 34%** — up from 22% in Q3, now cited in AI responses for core product queries
+3. **Published research report generated 45 backlinks** — from sites including TechCrunch (DR 94) and Forbes (DR 95)
+
+## Risks & Mitigation
+
+1. **Google core update expected in February** — monitoring closely; content quality scores above benchmark
+2. **CompetitorA increased content production 2x** — accelerating our editorial calendar in response
+3. **Mobile CWV scores degraded** — engineering team addressing LCP issue, fix scheduled for Feb 15
+
+## Investment & ROI
+
+| Item | Jan Spend | Organic Revenue | Cumulative ROI (12mo) |
+|------|----------|----------------|----------------------|
+| SEO team | $12,000 | $142,000 | 418% |
+| Content production | $8,000 | (included above) | (included above) |
+| Tools & data | $2,500 | (included above) | (included above) |
+| **Total** | **$22,500** | **$142,000** | **418%** |
+
+## Ask for Next Period
+
+- Approve $5,000 budget increase for digital PR campaign targeting Q2 product launch
+- Engineering allocation: 20 hours for Core Web Vitals remediation
+```
+
+---
+
+## 2. Marketing Team Report Template
+
+Use this template for marketing managers, content teams, and channel leads. Provides detailed performance data with actionable insights.
+
+---
+
+```markdown
+# Monthly SEO & GEO Performance Report
+
+**Domain:** yoursite.com
+**Period:** January 1-31, 2025
+**Comparison:** vs. December 2024 and January 2024
+**Prepared:** February 3, 2025
+
+---
+
+## 1. Traffic Performance
+
+### Overview
+
+| Metric | Jan 2025 | Dec 2024 | MoM | Jan 2024 | YoY |
+|--------|----------|----------|-----|----------|-----|
+| Organic Sessions | 285,000 | 261,000 | +9.2% | 223,000 | +27.8% |
+| Organic Users | 198,000 | 182,000 | +8.8% | 158,000 | +25.3% |
+| Organic Pageviews | 412,000 | 378,000 | +9.0% | 318,000 | +29.6% |
+| Bounce Rate | 42% | 44% | -2pp | 48% | -6pp |
+| Avg Session Duration | 3:12 | 3:05 | +3.8% | 2:48 | +14.3% |
+| Pages/Session | 2.4 | 2.3 | +4.3% | 2.1 | +14.3% |
+
+### Traffic by Device
+
+| Device | Sessions | % of Total | MoM Change | CVR |
+|--------|----------|------------|------------|-----|
+| Desktop | 142,500 (50%) | 50% | +7.1% | 3.2% |
+| Mobile | 131,100 (46%) | 46% | +11.5% | 2.1% |
+| Tablet | 11,400 (4%) | 4% | +5.6% | 2.8% |
+
+### Non-Brand vs. Brand Split
+
+| Segment | Sessions | % of Organic | MoM Change |
+|---------|----------|-------------|------------|
+| Non-brand | 184,000 | 64.6% | +12.1% |
+| Brand | 101,000 | 35.4% | +4.1% |
+
+**Insight:** Non-brand growth outpacing brand growth indicates SEO is driving new audience discovery effectively.
+
+## 2. Keyword Rankings
+
+### Position Distribution
+
+| Range | Jan 2025 | Dec 2024 | Change |
+|-------|----------|----------|--------|
+| Position #1 | 23 | 19 | +4 |
+| Position #2-3 | 41 | 38 | +3 |
+| Position #4-10 | 123 | 115 | +8 |
+| Position #11-20 | 89 | 94 | -5 |
+| Position #21-50 | 156 | 162 | -6 |
+| Not ranking | 68 | 72 | -4 |
+| **Total tracked** | **500** | **500** | — |
+
+### Top 5 Ranking Improvements
+
+| Keyword | Volume | Old Rank | New Rank | Change | Est. Traffic |
+|---------|--------|----------|----------|--------|-------------|
+| project management software | 18,100 | 4 | 1 | +3 | +4,200/mo |
+| best CRM for small business | 8,200 | 12 | 6 | +6 | +380/mo |
+| team collaboration tools | 6,500 | 8 | 3 | +5 | +420/mo |
+| agile methodology guide | 4,400 | 22 | 9 | +13 | +180/mo |
+| remote work software | 5,100 | 15 | 7 | +8 | +210/mo |
+
+### Top 5 Ranking Declines
+
+| Keyword | Volume | Old Rank | New Rank | Change | Action |
+|---------|--------|----------|----------|--------|--------|
+| kanban board software | 3,600 | 5 | 11 | -6 | Content refresh scheduled Feb 10 |
+| project planning template | 2,900 | 7 | 14 | -7 | Competitor published superior template |
+| time tracking tools | 4,100 | 9 | 15 | -6 | New SERP features displacing organic |
+| workflow automation | 3,200 | 6 | 9 | -3 | Monitoring, within normal fluctuation |
+| sprint planning guide | 1,800 | 3 | 8 | -5 | Content update needed with 2025 examples |
+
+## 3. Content Performance
+
+### Top Performing Content (by Organic Traffic)
+
+| Page | Sessions | MoM | Rankings | Conversions |
+|------|----------|-----|----------|-------------|
+| /blog/project-management-guide | 18,400 | +22% | 12 KWs top 10 | 184 |
+| /product/features | 15,200 | +8% | 8 KWs top 10 | 456 |
+| /blog/crm-comparison | 12,100 | +15% | 9 KWs top 10 | 121 |
+| /resources/templates | 9,800 | +31% | 6 KWs top 10 | 98 |
+| /blog/remote-work-tips | 8,600 | +5% | 5 KWs top 10 | 43 |
+
+### Content Needing Refresh (Decaying Traffic)
+
+| Page | Sessions | MoM | Age | Action |
+|------|----------|-----|-----|--------|
+| /blog/2023-trends | 2,100 | -35% | 14mo | Rewrite as 2025 trends |
+| /blog/old-tool-comparison | 1,400 | -28% | 18mo | Update with current pricing/features |
+| /guides/beginner-pm | 3,200 | -18% | 10mo | Add interactive elements, update screenshots |
+
+## 4. GEO / AI Visibility
+
+| Metric | Jan 2025 | Dec 2024 | Change |
+|--------|----------|----------|--------|
+| Queries with AI Overview | 78/500 | 72/500 | +6 |
+| Your citations in AI | 34 | 28 | +6 |
+| Citation rate | 43.6% | 38.9% | +4.7pp |
+| Avg citation position | 2.1 | 2.4 | +0.3 (better) |
+
+### New AI Citations Won
+
+- "best project management software" — cited as #1 source
+- "how to create a project plan" — cited as #2 source
+- "agile vs waterfall" — cited as #3 source
+
+## 5. Backlink Performance
+
+| Metric | Jan 2025 | Dec 2024 | Change |
+|--------|----------|----------|--------|
+| Total referring domains | 1,847 | 1,812 | +35 |
+| New links acquired | 62 | 48 | +14 |
+| Links lost | 27 | 31 | -4 (fewer lost) |
+| Average new link DR | 41 | 38 | +3 |
+
+### Notable New Links
+
+| Source | DR | Type |
+|--------|-----|------|
+| TechCrunch | 94 | Press mention (research report) |
+| Forbes | 95 | Expert quote in roundup |
+| HubSpot Blog | 93 | Resource citation |
+
+## 6. Action Items
+
+| Priority | Action | Owner | Deadline |
+|----------|--------|-------|----------|
+| P0 | Fix mobile LCP issue (currently 3.8s) | Engineering | Feb 15 |
+| P1 | Refresh kanban board software article | Content | Feb 10 |
+| P1 | Publish 2025 trends article (replace 2023 version) | Content | Feb 14 |
+| P2 | Build links to CRM comparison page (target: 5 new) | SEO | Feb 28 |
+| P2 | Optimize 3 pages for AI citation (add structured data) | SEO | Feb 21 |
+| P3 | Research new keyword opportunities in "AI tools" space | SEO | Feb 28 |
+```
+
+---
+
+## 3. Technical SEO Report Template
+
+Use this template for engineering teams, dev leads, and technical stakeholders. Focuses on crawlability, indexation, performance, and error resolution.
+
+---
+
+```markdown
+# Technical SEO Health Report
+
+**Domain:** yoursite.com
+**Period:** January 2025
+**Prepared:** February 3, 2025
+
+---
+
+## 1. Crawl Health
+
+### Crawl Stats Summary
+
+| Metric | Jan 2025 | Dec 2024 | Change | Status |
+|--------|----------|----------|--------|--------|
+| Total pages crawled | 12,400 | 11,800 | +5.1% | Normal |
+| Avg crawl rate (pages/day) | 400 | 381 | +5.0% | Normal |
+| Avg response time | 320ms | 290ms | +10.3% | Monitor |
+| Crawl errors | 18 | 12 | +6 | Investigate |
+
+### Crawl Budget Efficiency
+
+| Page Type | Pages Crawled | % of Crawl | Should Crawl? |
+|-----------|-------------|-----------|---------------|
+| Blog posts | 4,200 | 33.9% | Yes |
+| Product pages | 3,100 | 25.0% | Yes |
+| Category pages | 2,400 | 19.4% | Yes |
+| Parameter URLs | 1,800 | 14.5% | No — block in robots.txt |
+| Paginated archives | 900 | 7.3% | Partially — consolidate |
+
+**Action:** Block parameter URLs via robots.txt to reclaim 14.5% of crawl budget.
+
+## 2. Index Coverage
+
+### Index Status
+
+| Status | Count | Change | Action |
+|--------|-------|--------|--------|
+| Valid (indexed) | 2,847 | +23 | None — healthy growth |
+| Valid with warnings | 42 | +3 | Review warnings |
+| Excluded (intentional) | 1,204 | +8 | Verify intentional |
+| Excluded (error) | 67 | +12 | Fix errors |
+
+### Exclusion Reasons (Top 5)
+
+| Reason | Count | Change | Priority |
+|--------|-------|--------|----------|
+| Crawled, not indexed | 34 | +8 | P1 — improve content quality |
+| Duplicate without canonical | 15 | +2 | P2 — add canonical tags |
+| Blocked by robots.txt | 12 | 0 | Verify intentional |
+| Soft 404 | 6 | +2 | P1 — fix or redirect |
+| Server error (5xx) | 0 | 0 | Clean |
+
+## 3. Core Web Vitals
+
+### Field Data (CrUX — 28-day rolling)
+
+| Metric | Mobile | Desktop | Status |
+|--------|--------|---------|--------|
+| LCP | 3.8s (Poor) | 2.1s (Good) | Mobile needs fix |
+| CLS | 0.08 (Good) | 0.05 (Good) | Passing |
+| INP | 180ms (Good) | 120ms (Good) | Passing |
+
+### LCP Diagnosis (Mobile)
+
+| Page Group | Mobile LCP | Issue | Fix |
+|-----------|-----------|-------|-----|
+| /blog/* | 4.2s | Hero images not optimized | Convert to WebP, add lazy loading |
+| /product/* | 3.5s | Third-party script blocking render | Defer non-critical scripts |
+| /resources/* | 3.1s | Large CSS bundle | Code-split CSS |
+
+**Estimated fix timeline:** 2 weeks (engineering sprint Feb 10-21)
+
+## 4. Error Log
+
+### New Errors This Period
+
+| Error Type | Count | Pages Affected | Priority |
+|-----------|-------|---------------|----------|
+| 404 Not Found | 8 | /blog/deleted-post-1, /old/page-2, +6 | P1 |
+| 500 Server Error | 3 | /api/webhook (intermittent) | P0 |
+| Redirect chain (3+ hops) | 5 | /old-url -> /new-url -> /final-url | P2 |
+| Mixed content warning | 2 | /resources/old-guide, /tools/calc | P2 |
+
+### Error Resolution Status
+
+| Error | Opened | Status | Owner |
+|-------|--------|--------|-------|
+| 404 on /blog/deleted-post-1 | Jan 5 | Redirect created Jan 8 | SEO |
+| 500 on /api/webhook | Jan 12 | Investigating | Backend |
+| Redirect chain on /old-url | Jan 15 | Pending | SEO |
+
+## 5. Schema & Rich Results
+
+### Schema Validation
+
+| Schema Type | Pages | Valid | Warnings | Errors |
+|------------|-------|-------|----------|--------|
+| Article | 342 | 338 | 4 | 0 |
+| Product | 89 | 87 | 0 | 2 |
+| FAQ | 56 | 56 | 0 | 0 |
+| HowTo | 23 | 23 | 0 | 0 |
+| BreadcrumbList | 2,847 | 2,847 | 0 | 0 |
+
+**Action:** Fix 2 Product schema errors (missing "price" property on clearance items).
+
+## 6. Technical Debt Tracker
+
+| Item | Priority | Effort | Impact | Status |
+|------|----------|--------|--------|--------|
+| Block parameter URLs in robots.txt | P1 | Low | Crawl budget +14% | Scheduled Feb 5 |
+| Fix mobile LCP (image optimization) | P0 | Medium | CWV pass rate | Sprint Feb 10-21 |
+| Consolidate duplicate pages (42 pages) | P2 | Medium | Index quality | Backlog |
+| Implement hreflang for EN-GB pages | P3 | High | International SEO | Q2 planned |
+| Migrate HTTP resources to HTTPS | P2 | Low | Security + trust | Scheduled Feb 12 |
+
+## 7. Recommendations for Engineering
+
+1. **Immediate:** Resolve intermittent 500 errors on /api/webhook endpoint
+2. **This sprint:** Optimize hero images on blog templates (convert to WebP, implement responsive sizes)
+3. **Next sprint:** Defer third-party scripts on product pages (move to async loading)
+4. **Backlog:** Implement edge caching for static blog content to improve TTFB
+```
+
+---
+
+## 4. Report Assembly Guidelines
+
+### How to Combine Templates
+
+| Audience | Include Sections | Exclude |
+|----------|-----------------|---------|
+| CEO / Board | Executive template only | All technical detail |
+| VP Marketing | Executive + Marketing sections 1-4, 6 | Crawl stats, schema details |
+| Marketing Manager | Full Marketing template | Deep technical debugging |
+| SEO Manager | Marketing + Technical templates | Executive ROI framing |
+| Engineering Lead | Technical template only | Revenue, keyword details |
+| Client (agency) | Executive + Marketing sections 1-3, 6 | Internal cost data, technical debt |
+
+### Data Freshness Requirements
+
+| Report Type | Maximum Data Age | Reason |
+|------------|-----------------|--------|
+| Executive monthly | 3 days after period end | Allow for data processing lag |
+| Marketing weekly | 1 day after period end | Timely action items |
+| Technical | Real-time where possible | Issues need immediate attention |
+| Quarterly review | 5 days after quarter end | Allow for data reconciliation |
+
+### Report Delivery Checklist
+
+- [ ] All data sources verified and dated
+- [ ] Period-over-period comparisons included
+- [ ] Competitor context provided where relevant
+- [ ] Action items are specific, assigned, and time-bound
+- [ ] Report tailored to audience (no jargon for executives, full detail for technical)
+- [ ] Visualizations are clear and labeled
+- [ ] Sources cited for all external benchmarks
+- [ ] Draft reviewed for accuracy before distribution
diff --git a/.cursor/skills/playwright-testing/SKILL.md b/.cursor/skills/playwright-testing/SKILL.md
new file mode 100644
index 0000000000..6512f12dc6
--- /dev/null
+++ b/.cursor/skills/playwright-testing/SKILL.md
@@ -0,0 +1,295 @@
+---
+name: playwright-testing
+description: Use when writing Playwright tests, fixing flaky tests, debugging failures, implementing Page Object Model, configuring CI/CD, optimizing performance, mocking APIs, handling authentication or OAuth, testing accessibility (axe-core), file uploads/downloads, date/time mocking, WebSockets, geolocation, permissions, multi-tab/popup flows, mobile/responsive layouts, touch gestures, GraphQL, error handling, offline mode, multi-user collaboration, third-party services (payments, email verification), console error monitoring, global setup/teardown, test annotations (skip, fixme, slow), test tags (@smoke, @fast, @critical, filtering with --grep), project dependencies, security testing (XSS, CSRF, auth), performance budgets (Web Vitals, Lighthouse), iframes, component testing, canvas/WebGL, service workers/PWA, test coverage, i18n/localization, Electron apps, or browser extension testing. Covers E2E, component, API, visual, accessibility, security, Electron, and extension testing.
+---
+
+# Playwright Testing
+
+This skill provides comprehensive guidance for all aspects of Playwright test development, from writing new tests to debugging and maintaining existing test suites.
+
+## Activity-Based Reference Guide
+
+Consult these references based on what you're doing:
+
+### Writing New Tests
+
+**When to use**: Creating new test files, writing test cases, implementing test scenarios
+
+| Activity                            | Reference Files                                                                                                                               |
+| ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Writing E2E tests**               | [test-suite-structure.md](core/test-suite-structure.md), [locators.md](core/locators.md), [assertions-waiting.md](core/assertions-waiting.md) |
+| **Writing component tests**         | [component-testing.md](testing-patterns/component-testing.md), [test-suite-structure.md](core/test-suite-structure.md)                        |
+| **Writing API tests**               | [api-testing.md](testing-patterns/api-testing.md), [test-suite-structure.md](core/test-suite-structure.md)                                    |
+| **Writing GraphQL tests**           | [graphql-testing.md](testing-patterns/graphql-testing.md), [api-testing.md](testing-patterns/api-testing.md)                                  |
+| **Writing visual regression tests** | [visual-regression.md](testing-patterns/visual-regression.md), [canvas-webgl.md](testing-patterns/canvas-webgl.md)                            |
+| **Structuring test code with POM**  | [page-object-model.md](core/page-object-model.md), [test-suite-structure.md](core/test-suite-structure.md)                                    |
+| **Setting up test data/fixtures**   | [fixtures-hooks.md](core/fixtures-hooks.md), [test-data.md](core/test-data.md)                                                                |
+| **Handling authentication**         | [authentication.md](advanced/authentication.md), [authentication-flows.md](advanced/authentication-flows.md)                                  |
+| **Testing date/time features**      | [clock-mocking.md](advanced/clock-mocking.md)                                                                                                 |
+| **Testing file upload/download**    | [file-operations.md](testing-patterns/file-operations.md), [file-upload-download.md](testing-patterns/file-upload-download.md)                |
+| **Testing forms/validation**        | [forms-validation.md](testing-patterns/forms-validation.md)                                                                                   |
+| **Testing drag and drop**           | [drag-drop.md](testing-patterns/drag-drop.md)                                                                                                 |
+| **Testing accessibility**           | [accessibility.md](testing-patterns/accessibility.md)                                                                                         |
+| **Testing security (XSS, CSRF)**    | [security-testing.md](testing-patterns/security-testing.md)                                                                                   |
+| **Using test annotations**          | [annotations.md](core/annotations.md)                                                                                                         |
+| **Using test tags**                 | [test-tags.md](core/test-tags.md)                                                                                                             |
+| **Testing iframes**                 | [iframes.md](browser-apis/iframes.md)                                                                                                         |
+| **Testing canvas/WebGL**            | [canvas-webgl.md](testing-patterns/canvas-webgl.md)                                                                                           |
+| **Internationalization (i18n)**     | [i18n.md](testing-patterns/i18n.md)                                                                                                           |
+| **Testing Electron apps**           | [electron.md](testing-patterns/electron.md)                                                                                                   |
+| **Testing browser extensions**      | [browser-extensions.md](testing-patterns/browser-extensions.md)                                                                               |
+
+### Mobile & Responsive Testing
+
+**When to use**: Testing mobile devices, touch interactions, responsive layouts
+
+| Activity                        | Reference Files                                                                  |
+| ------------------------------- | -------------------------------------------------------------------------------- |
+| **Device emulation**            | [mobile-testing.md](advanced/mobile-testing.md)                                  |
+| **Touch gestures (swipe, tap)** | [mobile-testing.md](advanced/mobile-testing.md)                                  |
+| **Viewport/breakpoint testing** | [mobile-testing.md](advanced/mobile-testing.md)                                  |
+| **Mobile-specific UI**          | [mobile-testing.md](advanced/mobile-testing.md), [locators.md](core/locators.md) |
+
+### Real-Time & Browser APIs
+
+**When to use**: Testing WebSockets, geolocation, permissions, multi-tab flows
+
+| Activity                        | Reference Files                                                                          |
+| ------------------------------- | ---------------------------------------------------------------------------------------- |
+| **WebSocket/real-time testing** | [websockets.md](browser-apis/websockets.md)                                              |
+| **Geolocation mocking**         | [browser-apis.md](browser-apis/browser-apis.md)                                          |
+| **Permission handling**         | [browser-apis.md](browser-apis/browser-apis.md)                                          |
+| **Clipboard testing**           | [browser-apis.md](browser-apis/browser-apis.md)                                          |
+| **Camera/microphone mocking**   | [browser-apis.md](browser-apis/browser-apis.md)                                          |
+| **Multi-tab/popup flows**       | [multi-context.md](advanced/multi-context.md)                                            |
+| **OAuth popup handling**        | [third-party.md](advanced/third-party.md), [multi-context.md](advanced/multi-context.md) |
+
+### Debugging & Troubleshooting
+
+**When to use**: Test failures, element not found, timeouts, unexpected behavior
+
+| Activity                                          | Reference Files                                                                                                                                |
+| ------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Debugging test failures**                       | [debugging.md](debugging/debugging.md), [assertions-waiting.md](core/assertions-waiting.md)                                                    |
+| **Fixing flaky tests**                            | [flaky-tests.md](debugging/flaky-tests.md), [debugging.md](debugging/debugging.md), [assertions-waiting.md](core/assertions-waiting.md)        |
+| **Debugging flaky parallel runs**                 | [flaky-tests.md](debugging/flaky-tests.md), [performance.md](infrastructure-ci-cd/performance.md), [fixtures-hooks.md](core/fixtures-hooks.md) |
+| **Ensuring test isolation / avoiding state leak** | [flaky-tests.md](debugging/flaky-tests.md), [fixtures-hooks.md](core/fixtures-hooks.md), [performance.md](infrastructure-ci-cd/performance.md) |
+| **Fixing selector issues**                        | [locators.md](core/locators.md), [debugging.md](debugging/debugging.md)                                                                        |
+| **Investigating timeout issues**                  | [assertions-waiting.md](core/assertions-waiting.md), [debugging.md](debugging/debugging.md)                                                    |
+| **Using trace viewer**                            | [debugging.md](debugging/debugging.md)                                                                                                         |
+| **Debugging race conditions**                     | [flaky-tests.md](debugging/flaky-tests.md), [debugging.md](debugging/debugging.md), [assertions-waiting.md](core/assertions-waiting.md)        |
+| **Debugging console/JS errors**                   | [console-errors.md](debugging/console-errors.md), [debugging.md](debugging/debugging.md)                                                       |
+
+### Error & Edge Case Testing
+
+**When to use**: Testing error states, offline mode, network failures, validation
+
+| Activity                       | Reference Files                                                                                       |
+| ------------------------------ | ----------------------------------------------------------------------------------------------------- |
+| **Error boundary testing**     | [error-testing.md](debugging/error-testing.md)                                                        |
+| **Network failure simulation** | [error-testing.md](debugging/error-testing.md), [network-advanced.md](advanced/network-advanced.md)   |
+| **Offline mode testing**       | [error-testing.md](debugging/error-testing.md), [service-workers.md](browser-apis/service-workers.md) |
+| **Service worker testing**     | [service-workers.md](browser-apis/service-workers.md)                                                 |
+| **Loading state testing**      | [error-testing.md](debugging/error-testing.md)                                                        |
+| **Form validation testing**    | [error-testing.md](debugging/error-testing.md)                                                        |
+
+### Multi-User & Collaboration Testing
+
+**When to use**: Testing features involving multiple users, roles, or real-time collaboration
+
+| Activity                       | Reference Files                                                                      |
+| ------------------------------ | ------------------------------------------------------------------------------------ |
+| **Multiple users in one test** | [multi-user.md](advanced/multi-user.md)                                              |
+| **Real-time collaboration**    | [multi-user.md](advanced/multi-user.md), [websockets.md](browser-apis/websockets.md) |
+| **Role-based access testing**  | [multi-user.md](advanced/multi-user.md)                                              |
+| **Concurrent action testing**  | [multi-user.md](advanced/multi-user.md)                                              |
+
+### Architecture Decisions
+
+**When to use**: Choosing test patterns, deciding between approaches, planning test architecture
+
+| Activity                     | Reference Files                                           |
+| ---------------------------- | --------------------------------------------------------- |
+| **POM vs fixtures decision** | [pom-vs-fixtures.md](architecture/pom-vs-fixtures.md)     |
+| **Test type selection**      | [test-architecture.md](architecture/test-architecture.md) |
+| **Mock vs real services**    | [when-to-mock.md](architecture/when-to-mock.md)           |
+| **Test suite structure**     | [test-suite-structure.md](core/test-suite-structure.md)   |
+
+### Framework-Specific Testing
+
+**When to use**: Testing React or Next.js applications
+
+| Activity                  | Reference Files                     |
+| ------------------------- | ----------------------------------- |
+| **Testing React apps**    | [react.md](frameworks/react.md)     |
+| **Testing Next.js apps**  | [nextjs.md](frameworks/nextjs.md)   |
+
+### Refactoring & Maintenance
+
+**When to use**: Improving existing tests, code review, reducing duplication
+
+| Activity                             | Reference Files                                                                                            |
+| ------------------------------------ | ---------------------------------------------------------------------------------------------------------- |
+| **Refactoring to Page Object Model** | [page-object-model.md](core/page-object-model.md), [test-suite-structure.md](core/test-suite-structure.md) |
+| **Improving test organization**      | [test-suite-structure.md](core/test-suite-structure.md), [page-object-model.md](core/page-object-model.md) |
+| **Extracting common setup/teardown** | [fixtures-hooks.md](core/fixtures-hooks.md)                                                                |
+| **Replacing brittle selectors**      | [locators.md](core/locators.md)                                                                            |
+| **Removing explicit waits**          | [assertions-waiting.md](core/assertions-waiting.md)                                                        |
+| **Creating test data factories**     | [test-data.md](core/test-data.md)                                                                          |
+| **Configuration setup**              | [configuration.md](core/configuration.md)                                                                  |
+
+### Infrastructure & Configuration
+
+**When to use**: Setting up projects, configuring CI/CD, optimizing performance
+
+| Activity                                | Reference Files                                                                                                          |
+| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| **Configuring Playwright project**      | [configuration.md](core/configuration.md), [projects-dependencies.md](core/projects-dependencies.md)                     |
+| **Setting up CI/CD pipelines**          | [ci-cd.md](infrastructure-ci-cd/ci-cd.md), [github-actions.md](infrastructure-ci-cd/github-actions.md)                   |
+| **GitHub Actions setup**                | [github-actions.md](infrastructure-ci-cd/github-actions.md)                                                              |
+| **GitLab CI setup**                     | [gitlab.md](infrastructure-ci-cd/gitlab.md)                                                                              |
+| **Other CI providers**                  | [other-providers.md](infrastructure-ci-cd/other-providers.md)                                                            |
+| **Docker/container setup**              | [docker.md](infrastructure-ci-cd/docker.md)                                                                              |
+| **Global setup & teardown**             | [global-setup.md](core/global-setup.md)                                                                                  |
+| **Project dependencies**                | [projects-dependencies.md](core/projects-dependencies.md)                                                                |
+| **Optimizing test performance**         | [performance.md](infrastructure-ci-cd/performance.md), [test-suite-structure.md](core/test-suite-structure.md)           |
+| **Configuring parallel execution**      | [parallel-sharding.md](infrastructure-ci-cd/parallel-sharding.md), [performance.md](infrastructure-ci-cd/performance.md) |
+| **Isolating test data between workers** | [fixtures-hooks.md](core/fixtures-hooks.md), [performance.md](infrastructure-ci-cd/performance.md)                       |
+| **Test coverage**                       | [test-coverage.md](infrastructure-ci-cd/test-coverage.md)                                                                |
+| **Test reporting/artifacts**            | [reporting.md](infrastructure-ci-cd/reporting.md)                                                                        |
+
+### Advanced Patterns
+
+**When to use**: Complex scenarios, API mocking, network interception
+
+| Activity                             | Reference Files                                                                                              |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------ |
+| **Mocking API responses**            | [test-suite-structure.md](core/test-suite-structure.md), [network-advanced.md](advanced/network-advanced.md) |
+| **Network interception**             | [network-advanced.md](advanced/network-advanced.md), [assertions-waiting.md](core/assertions-waiting.md)     |
+| **GraphQL mocking**                  | [network-advanced.md](advanced/network-advanced.md)                                                          |
+| **HAR recording/playback**           | [network-advanced.md](advanced/network-advanced.md)                                                          |
+| **Custom fixtures**                  | [fixtures-hooks.md](core/fixtures-hooks.md)                                                                  |
+| **Advanced waiting strategies**      | [assertions-waiting.md](core/assertions-waiting.md)                                                          |
+| **OAuth/SSO mocking**                | [third-party.md](advanced/third-party.md), [multi-context.md](advanced/multi-context.md)                     |
+| **Payment gateway mocking**          | [third-party.md](advanced/third-party.md)                                                                    |
+| **Email/SMS verification mocking**   | [third-party.md](advanced/third-party.md)                                                                    |
+| **Failing on console errors**        | [console-errors.md](debugging/console-errors.md)                                                             |
+| **Security testing (XSS, CSRF)**     | [security-testing.md](testing-patterns/security-testing.md)                                                  |
+| **Performance budgets & Web Vitals** | [performance-testing.md](testing-patterns/performance-testing.md)                                            |
+| **Lighthouse integration**           | [performance-testing.md](testing-patterns/performance-testing.md)                                            |
+| **Test annotations (skip, fixme)**   | [annotations.md](core/annotations.md)                                                                        |
+| **Test tags (@smoke, @fast)**        | [test-tags.md](core/test-tags.md)                                                                            |
+| **Test steps for reporting**         | [annotations.md](core/annotations.md)                                                                        |
+
+## Quick Decision Tree
+
+```
+What are you doing?
+│
+├─ Writing a new test?
+│  ├─ E2E test → core/test-suite-structure.md, core/locators.md, core/assertions-waiting.md
+│  ├─ Component test → testing-patterns/component-testing.md
+│  ├─ API test → testing-patterns/api-testing.md, core/test-suite-structure.md
+│  ├─ GraphQL test → testing-patterns/graphql-testing.md
+│  ├─ Visual regression → testing-patterns/visual-regression.md
+│  ├─ Visual/canvas test → testing-patterns/canvas-webgl.md, core/test-suite-structure.md
+│  ├─ Accessibility test → testing-patterns/accessibility.md
+│  ├─ Mobile/responsive test → advanced/mobile-testing.md
+│  ├─ i18n/locale test → testing-patterns/i18n.md
+│  ├─ Electron app test → testing-patterns/electron.md
+│  ├─ Browser extension test → testing-patterns/browser-extensions.md
+│  ├─ Multi-user test → advanced/multi-user.md
+│  ├─ Form validation test → testing-patterns/forms-validation.md
+│  └─ Drag and drop test → testing-patterns/drag-drop.md
+│
+├─ Testing specific features?
+│  ├─ File upload/download → testing-patterns/file-operations.md, testing-patterns/file-upload-download.md
+│  ├─ Date/time dependent → advanced/clock-mocking.md
+│  ├─ WebSocket/real-time → browser-apis/websockets.md
+│  ├─ Geolocation/permissions → browser-apis/browser-apis.md
+│  ├─ OAuth/SSO mocking → advanced/third-party.md, advanced/multi-context.md
+│  ├─ Payments/email/SMS → advanced/third-party.md
+│  ├─ iFrames → browser-apis/iframes.md
+│  ├─ Canvas/WebGL/charts → testing-patterns/canvas-webgl.md
+│  ├─ Service workers/PWA → browser-apis/service-workers.md
+│  ├─ i18n/localization → testing-patterns/i18n.md
+│  ├─ Security (XSS, CSRF) → testing-patterns/security-testing.md
+│  └─ Performance/Web Vitals → testing-patterns/performance-testing.md
+│
+├─ Architecture decisions?
+│  ├─ POM vs fixtures → architecture/pom-vs-fixtures.md
+│  ├─ Test type selection → architecture/test-architecture.md
+│  ├─ Mock vs real services → architecture/when-to-mock.md
+│  └─ Test suite structure → core/test-suite-structure.md
+│
+├─ Framework-specific testing?
+│  ├─ React app → frameworks/react.md
+│  └─ Next.js app → frameworks/nextjs.md
+│
+├─ Authentication testing?
+│  ├─ Basic auth patterns → advanced/authentication.md
+│  └─ Complex flows (MFA, reset) → advanced/authentication-flows.md
+│
+├─ Test is failing/flaky?
+│  ├─ Flaky test investigation → debugging/flaky-tests.md
+│  ├─ Element not found → core/locators.md, debugging/debugging.md
+│  ├─ Timeout issues → core/assertions-waiting.md, debugging/debugging.md
+│  ├─ Race conditions → debugging/flaky-tests.md, debugging/debugging.md
+│  ├─ Flaky only with multiple workers → debugging/flaky-tests.md, infrastructure-ci-cd/performance.md
+│  ├─ State leak / isolation → debugging/flaky-tests.md, core/fixtures-hooks.md
+│  ├─ Console/JS errors → debugging/console-errors.md, debugging/debugging.md
+│  └─ General debugging → debugging/debugging.md
+│
+├─ Testing error scenarios?
+│  ├─ Network failures → debugging/error-testing.md, advanced/network-advanced.md
+│  ├─ Offline (unexpected) → debugging/error-testing.md
+│  ├─ Offline-first/PWA → browser-apis/service-workers.md
+│  ├─ Error boundaries → debugging/error-testing.md
+│  └─ Form validation → testing-patterns/forms-validation.md, debugging/error-testing.md
+│
+├─ Refactoring existing code?
+│  ├─ Implementing POM → core/page-object-model.md
+│  ├─ Improving selectors → core/locators.md
+│  ├─ Extracting fixtures → core/fixtures-hooks.md
+│  ├─ Creating data factories → core/test-data.md
+│  └─ Configuration setup → core/configuration.md
+│
+├─ Setting up infrastructure?
+│  ├─ CI/CD → infrastructure-ci-cd/ci-cd.md
+│  ├─ GitHub Actions → infrastructure-ci-cd/github-actions.md
+│  ├─ GitLab CI → infrastructure-ci-cd/gitlab.md
+│  ├─ Other CI providers → infrastructure-ci-cd/other-providers.md
+│  ├─ Docker/containers → infrastructure-ci-cd/docker.md
+│  ├─ Sharding/parallel → infrastructure-ci-cd/parallel-sharding.md
+│  ├─ Reporting/artifacts → infrastructure-ci-cd/reporting.md
+│  ├─ Global setup/teardown → core/global-setup.md
+│  ├─ Project dependencies → core/projects-dependencies.md
+│  ├─ Test performance → infrastructure-ci-cd/performance.md
+│  ├─ Test coverage → infrastructure-ci-cd/test-coverage.md
+│  └─ Project config → core/configuration.md, core/projects-dependencies.md
+│
+├─ Organizing tests?
+│  ├─ Skip/fixme/slow tests → core/annotations.md
+│  ├─ Test tags (@smoke, @fast) → core/test-tags.md
+│  ├─ Filtering tests (--grep) → core/test-tags.md
+│  ├─ Test steps → core/annotations.md
+│  └─ Conditional execution → core/annotations.md
+│
+└─ Running subset of tests?
+   ├─ By tag (@smoke, @critical) → core/test-tags.md
+   ├─ Exclude slow/flaky tests → core/test-tags.md
+   ├─ PR vs nightly tests → core/test-tags.md, infrastructure-ci-cd/ci-cd.md
+   └─ Project-specific filtering → core/test-tags.md, core/configuration.md
+```
+
+## Test Validation Loop
+
+After writing or modifying tests:
+
+1. **Run tests**: `npx playwright test --reporter=list`
+2. **If tests fail**:
+   - Review error output and trace (`npx playwright show-trace`)
+   - Fix locators, waits, or assertions
+   - Re-run tests
+3. **Only proceed when all tests pass**
+4. **Run multiple times** for critical tests: `npx playwright test --repeat-each=5`
diff --git a/.cursor/skills/playwright-testing/advanced/authentication-flows.md b/.cursor/skills/playwright-testing/advanced/authentication-flows.md
new file mode 100644
index 0000000000..24ad08c177
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/authentication-flows.md
@@ -0,0 +1,360 @@
+# Complex Authentication Flow Patterns
+
+## Table of Contents
+
+1. [Email Verification Flows](#email-verification-flows)
+2. [Password Reset](#password-reset)
+3. [Session Timeout](#session-timeout)
+4. [Remember Me Persistence](#remember-me-persistence)
+5. [Logout Patterns](#logout-patterns)
+6. [Tips](#tips)
+7. [Related](#related)
+
+> **When to use**: Testing email verification, password reset, session timeout/expiration, or remember-me functionality. For basic auth setup (storage state, OAuth mocking, MFA, role-based access), see [authentication.md](authentication.md).
+
+---
+
+## Email Verification Flows
+
+### Capturing Verification Tokens
+
+Intercept API responses to capture verification tokens for testing:
+
+```typescript
+test('completes registration with email verification', async ({ page }) => {
+  let capturedToken = '';
+
+  await page.route('**/api/auth/register', async (route) => {
+    const response = await route.fetch();
+    const body = await response.json();
+    capturedToken = body.verificationToken;
+    await route.fulfill({ response });
+  });
+
+  await page.goto('/register');
+  await page.getByLabel('Name').fill('New User');
+  await page.getByLabel('Email').fill('newuser@test.com');
+  await page.getByLabel('Password', { exact: true }).fill('SecurePass!');
+  await page.getByLabel('Confirm password').fill('SecurePass!');
+  await page.getByRole('button', { name: 'Create account' }).click();
+
+  await expect(page.getByText('Check your inbox')).toBeVisible();
+
+  expect(capturedToken).toBeTruthy();
+  await page.goto(`/verify?token=${capturedToken}`);
+
+  await expect(page.getByText('Email confirmed')).toBeVisible();
+});
+```
+
+### Fully Mocked Verification
+
+```typescript
+test('verifies email with mocked endpoints', async ({ page }) => {
+  const mockToken = 'test-verification-abc123';
+
+  await page.route('**/api/auth/register', async (route) => {
+    await route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({ message: 'Verification sent', verificationToken: mockToken }),
+    });
+  });
+
+  await page.route(`**/api/auth/verify?token=${mockToken}`, async (route) => {
+    await route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({ verified: true }),
+    });
+  });
+
+  await page.goto('/register');
+  await page.getByLabel('Email').fill('test@example.com');
+  await page.getByLabel('Password', { exact: true }).fill('Password123!');
+  await page.getByRole('button', { name: 'Sign up' }).click();
+
+  await expect(page.getByText('Check your inbox')).toBeVisible();
+
+  await page.goto(`/verify?token=${mockToken}`);
+  await expect(page.getByText('Email confirmed')).toBeVisible();
+});
+```
+
+---
+
+## Password Reset
+
+### Complete Reset Flow
+
+```typescript
+test('resets password through email link', async ({ page }) => {
+  let resetToken = '';
+
+  await page.route('**/api/auth/forgot-password', async (route) => {
+    const response = await route.fetch();
+    const body = await response.json();
+    resetToken = body.resetToken;
+    await route.fulfill({ response });
+  });
+
+  await page.goto('/forgot-password');
+  await page.getByLabel('Email').fill('user@test.com');
+  await page.getByRole('button', { name: 'Send link' }).click();
+
+  await expect(page.getByText('Reset email sent')).toBeVisible();
+
+  expect(resetToken).toBeTruthy();
+  await page.goto(`/reset-password?token=${resetToken}`);
+
+  await page.getByLabel('New password', { exact: true }).fill('NewPassword456!');
+  await page.getByLabel('Confirm password').fill('NewPassword456!');
+  await page.getByRole('button', { name: 'Update password' }).click();
+
+  await expect(page.getByText('Password updated')).toBeVisible();
+});
+```
+
+### Expired Token Handling
+
+```typescript
+test('shows error for expired reset token', async ({ page }) => {
+  await page.goto('/reset-password?token=expired-token');
+
+  await page.getByLabel('New password', { exact: true }).fill('NewPass!');
+  await page.getByLabel('Confirm password').fill('NewPass!');
+  await page.getByRole('button', { name: 'Update password' }).click();
+
+  await expect(page.getByRole('alert')).toContainText(/expired|invalid/i);
+});
+```
+
+### Password Strength Validation
+
+```typescript
+test('enforces password requirements on reset', async ({ page }) => {
+  await page.goto('/reset-password?token=valid-token');
+
+  await page.getByLabel('New password', { exact: true }).fill('weak');
+  await page.getByLabel('Confirm password').fill('weak');
+  await page.getByRole('button', { name: 'Update password' }).click();
+
+  await expect(page.getByText(/at least 8 characters/i)).toBeVisible();
+});
+```
+
+---
+
+## Session Timeout
+
+### Detecting Expired Sessions
+
+```typescript
+test('redirects to signin after session expires', async ({ page, context }) => {
+  await page.goto('/signin');
+  await page.getByLabel('Email').fill('user@test.com');
+  await page.getByLabel('Password').fill('Password!');
+  await page.getByRole('button', { name: 'Sign in' }).click();
+  await expect(page).toHaveURL('/home');
+
+  const cookies = await context.cookies();
+  const sessionCookie = cookies.find((c) => c.name.includes('session'));
+
+  if (sessionCookie) {
+    await context.clearCookies({ name: sessionCookie.name });
+  }
+
+  await page.goto('/profile');
+  await expect(page).toHaveURL(/\/signin/);
+  await expect(page.getByText(/session.*expired|sign in again/i)).toBeVisible();
+});
+```
+
+### Session Extension Warning
+
+```typescript
+test('shows warning before session expires', async ({ page }) => {
+  await page.route('**/api/auth/session', async (route) => {
+    await route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({ valid: true, expiresIn: 60 }),
+    });
+  });
+
+  await page.goto('/home');
+
+  await expect(page.getByText(/session.*expir/i)).toBeVisible({ timeout: 10000 });
+  await expect(page.getByRole('button', { name: /extend|stay signed in/i })).toBeVisible();
+});
+```
+
+### Session Extension Action
+
+```typescript
+test('extends session when user clicks extend', async ({ page }) => {
+  let sessionExtended = false;
+
+  await page.route('**/api/auth/session', async (route) => {
+    await route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({ valid: true, expiresIn: 60 }),
+    });
+  });
+
+  await page.route('**/api/auth/refresh', async (route) => {
+    sessionExtended = true;
+    await route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({ valid: true, expiresIn: 3600 }),
+    });
+  });
+
+  await page.goto('/home');
+
+  await expect(page.getByRole('button', { name: /extend|stay signed in/i })).toBeVisible({
+    timeout: 10000,
+  });
+  await page.getByRole('button', { name: /extend|stay signed in/i }).click();
+
+  expect(sessionExtended).toBe(true);
+  await expect(page.getByText(/session.*expir/i)).not.toBeVisible();
+});
+```
+
+---
+
+## Remember Me Persistence
+
+### Persistent Session
+
+```typescript
+test('persists session with remember me enabled', async ({ browser }) => {
+  const ctx1 = await browser.newContext();
+  const page1 = await ctx1.newPage();
+
+  await page1.goto('/signin');
+  await page1.getByLabel('Email').fill('user@test.com');
+  await page1.getByLabel('Password').fill('Password!');
+  await page1.getByLabel('Keep me signed in').check();
+  await page1.getByRole('button', { name: 'Sign in' }).click();
+
+  await expect(page1).toHaveURL('/home');
+
+  const state = await ctx1.storageState();
+  await ctx1.close();
+
+  const ctx2 = await browser.newContext({ storageState: state });
+  const page2 = await ctx2.newPage();
+
+  await page2.goto('/home');
+  await expect(page2).toHaveURL('/home');
+  await expect(page2.getByText('Welcome')).toBeVisible();
+
+  await ctx2.close();
+});
+```
+
+### Session-Only Login
+
+```typescript
+test('session-only login does not persist across browser restarts', async ({ browser }) => {
+  const ctx1 = await browser.newContext();
+  const page1 = await ctx1.newPage();
+
+  await page1.goto('/signin');
+  await page1.getByLabel('Email').fill('user@test.com');
+  await page1.getByLabel('Password').fill('Password!');
+  // Leave "Remember me" unchecked
+  await expect(page1.getByLabel('Keep me signed in')).not.toBeChecked();
+  await page1.getByRole('button', { name: 'Sign in' }).click();
+
+  await expect(page1).toHaveURL('/home');
+
+  // Only keep persistent cookies (filter out session cookies)
+  const cookies = await ctx1.cookies();
+  await ctx1.close();
+
+  const persistentCookies = cookies.filter((c) => c.expires > 0);
+  const ctx2 = await browser.newContext();
+  await ctx2.addCookies(persistentCookies);
+  const page2 = await ctx2.newPage();
+
+  await page2.goto('/home');
+
+  // Should redirect to login since session was not persisted
+  await expect(page2).toHaveURL(/\/signin/);
+
+  await ctx2.close();
+});
+```
+
+---
+
+## Logout Patterns
+
+### Standard Logout with Session Cleanup
+
+```typescript
+test.use({ storageState: '.auth/user.json' });
+
+test('logs out and clears session', async ({ page, context }) => {
+  await page.goto('/home');
+
+  await page.getByRole('button', { name: /account|menu/i }).click();
+  await page.getByRole('menuitem', { name: 'Sign out' }).click();
+
+  await expect(page).toHaveURL('/signin');
+
+  const cookies = await context.cookies();
+  const sessionCookies = cookies.filter((c) => c.name.includes('session') || c.name.includes('token'));
+  expect(sessionCookies).toHaveLength(0);
+
+  await page.goto('/home');
+  await expect(page).toHaveURL(/\/signin/);
+});
+```
+
+### Logout from All Devices
+
+```typescript
+test('logs out from all devices', async ({ page }) => {
+  let logoutAllCalled = false;
+
+  await page.route('**/api/auth/logout-all', async (route) => {
+    logoutAllCalled = true;
+    await route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({ message: 'Logged out everywhere' }),
+    });
+  });
+
+  await page.goto('/settings/security');
+
+  await page.getByRole('button', { name: 'Sign out everywhere' }).click();
+  await page.getByRole('dialog').getByRole('button', { name: 'Confirm' }).click();
+
+  expect(logoutAllCalled).toBe(true);
+  await expect(page).toHaveURL(/\/signin/);
+});
+```
+
+---
+
+## Tips
+
+1. **Configure shorter session timeouts in test environments** — Enables testing timeout behavior without slow tests
+2. **Test token expiration edge cases** — Expired tokens, invalid tokens, already-used tokens
+3. **Verify cleanup on logout** — Check both cookies and localStorage are cleared
+4. **Test the full flow end-to-end** — Password reset should verify login with new password works
+
+---
+
+## Related
+
+- [authentication.md](authentication.md) — Storage state, OAuth mocking, MFA, role-based access, API login
+- [fixtures-hooks.md](../core/fixtures-hooks.md) — Creating auth fixtures
+- [third-party.md](./third-party.md) — Mocking external auth providers
diff --git a/.cursor/skills/playwright-testing/advanced/authentication.md b/.cursor/skills/playwright-testing/advanced/authentication.md
new file mode 100644
index 0000000000..02c2dd7b27
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/authentication.md
@@ -0,0 +1,871 @@
+# Authentication Testing
+
+## Table of Contents
+
+1. [Quick Reference](#quick-reference)
+2. [Patterns](#patterns)
+3. [Decision Guide](#decision-guide)
+4. [Anti-Patterns](#anti-patterns)
+5. [Troubleshooting](#troubleshooting)
+6. [Related](#related)
+
+> **When to use**: Apps with login, session management, or protected routes. Authentication is the most common source of slow test suites.
+
+## Quick Reference
+
+```typescript
+// Storage state reuse — the #1 pattern for fast auth
+await page.goto("/login");
+await page.getByLabel("Username").fill("testuser@example.com");
+await page.getByLabel("Password").fill("secretPass123");
+await page.getByRole("button", { name: "Log in" }).click();
+await page.context().storageState({ path: ".auth/session.json" });
+
+// Reuse in config — every test starts authenticated
+{
+  use: {
+    storageState: ".auth/session.json"
+  }
+}
+
+// API login — skip the UI entirely
+const context = await browser.newContext();
+const response = await context.request.post("/api/auth/login", {
+  data: { email: "testuser@example.com", password: "secretPass123" },
+});
+await context.storageState({ path: ".auth/session.json" });
+```
+
+## Patterns
+
+### Storage State Reuse
+
+**Use when**: You need authenticated tests and want to avoid logging in before every test.
+**Avoid when**: Tests require completely fresh sessions, or you are testing the login flow itself.
+
+`storageState` serializes cookies and localStorage to a JSON file. Load it in any browser context to start authenticated instantly.
+
+```typescript
+// scripts/generate-auth.ts — run once to generate the state file
+import { chromium } from "@playwright/test";
+
+async function generateAuthState() {
+  const browser = await chromium.launch();
+  const context = await browser.newContext();
+  const page = await context.newPage();
+
+  await page.goto("http://localhost:4000/login");
+  await page.getByLabel("Username").fill("testuser@example.com");
+  await page.getByLabel("Password").fill("secretPass123");
+  await page.getByRole("button", { name: "Log in" }).click();
+  await page.waitForURL("/home");
+
+  await context.storageState({ path: ".auth/session.json" });
+  await browser.close();
+}
+
+generateAuthState();
+```
+
+```typescript
+// playwright.config.ts — load saved state for all tests
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  use: {
+    baseURL: "http://localhost:4000",
+    storageState: ".auth/session.json",
+  },
+});
+```
+
+```typescript
+// tests/home.spec.ts — test starts already logged in
+import { test, expect } from "@playwright/test";
+
+test("authenticated user sees home page", async ({ page }) => {
+  await page.goto("/home");
+  await expect(page.getByRole("heading", { name: "Home" })).toBeVisible();
+});
+```
+
+### Global Setup Authentication
+
+**Use when**: You want to authenticate once before the entire test suite runs.
+**Avoid when**: Different tests need different users, or your tokens expire faster than your suite runs.
+
+```typescript
+// global-setup.ts
+import { chromium, type FullConfig } from "@playwright/test";
+
+async function globalSetup(config: FullConfig) {
+  const { baseURL } = config.projects[0].use;
+  const browser = await chromium.launch();
+  const context = await browser.newContext();
+  const page = await context.newPage();
+
+  await page.goto(`${baseURL}/login`);
+  await page.getByLabel("Username").fill(process.env.TEST_USER_EMAIL!);
+  await page.getByLabel("Password").fill(process.env.TEST_USER_PASSWORD!);
+  await page.getByRole("button", { name: "Log in" }).click();
+  await page.waitForURL("**/home");
+
+  await context.storageState({ path: ".auth/session.json" });
+  await browser.close();
+}
+
+export default globalSetup;
+```
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  globalSetup: require.resolve("./global-setup"),
+  use: {
+    baseURL: "http://localhost:4000",
+    storageState: ".auth/session.json",
+  },
+});
+```
+
+Add `.auth/` to `.gitignore`. Auth state files contain session tokens and should never be committed.
+
+### Per-Worker Authentication
+
+**Use when**: Each parallel worker needs its own authenticated session to avoid race conditions for tests that modify server-side state.
+**Avoid when**: Tests are read-only and a modifying shared session is safe, you can use a single shared account.
+
+> **Sharded runs**: `parallelIndex` resets per shard, so different shards can have workers with the same index. To avoid collisions, include the shard identifier in the username (e.g., `worker-${SHARD_INDEX}-${parallelIndex}@example.com`) by passing a `SHARD_INDEX` environment variable from your CI matrix.
+
+```typescript
+// fixtures/auth.ts
+import { test as base, type BrowserContext } from "@playwright/test";
+
+type AuthFixtures = {
+  authenticatedContext: BrowserContext;
+};
+
+export const test = base.extend<{}, AuthFixtures>({
+  authenticatedContext: [
+    async ({ browser }, use) => {
+      const context = await browser.newContext();
+      const page = await context.newPage();
+
+      await page.goto("/login");
+      await page
+        .getByLabel("Username")
+        .fill(`worker-${test.info().parallelIndex}@example.com`);
+      await page.getByLabel("Password").fill("secretPass123");
+      await page.getByRole("button", { name: "Log in" }).click();
+      await page.waitForURL("/home");
+      await page.close();
+
+      await use(context);
+      await context.close();
+    },
+    { scope: "worker" },
+  ],
+});
+
+export { expect } from "@playwright/test";
+```
+
+```typescript
+// tests/settings.spec.ts
+import { test, expect } from "../fixtures/auth";
+
+test("update display name", async ({ authenticatedContext }) => {
+  const page = await authenticatedContext.newPage();
+  await page.goto("/settings/profile");
+  await page.getByLabel("Display name").fill("Updated Name");
+  await page.getByRole("button", { name: "Save" }).click();
+  await expect(page.getByText("Profile saved")).toBeVisible();
+});
+```
+
+### Multiple Roles
+
+**Use when**: Your app has role-based access control and you need to test different permission levels.
+**Avoid when**: Your app has a single user role.
+
+```typescript
+// global-setup.ts — authenticate all roles
+import { chromium, type FullConfig } from "@playwright/test";
+
+const accounts = [
+  {
+    role: "admin",
+    email: "admin@example.com",
+    password: process.env.ADMIN_PASSWORD!,
+  },
+  {
+    role: "member",
+    email: "member@example.com",
+    password: process.env.MEMBER_PASSWORD!,
+  },
+  {
+    role: "guest",
+    email: "guest@example.com",
+    password: process.env.GUEST_PASSWORD!,
+  },
+];
+
+async function globalSetup(config: FullConfig) {
+  const { baseURL } = config.projects[0].use;
+
+  for (const { role, email, password } of accounts) {
+    const browser = await chromium.launch();
+    const context = await browser.newContext();
+    const page = await context.newPage();
+
+    await page.goto(`${baseURL}/login`);
+    await page.getByLabel("Username").fill(email);
+    await page.getByLabel("Password").fill(password);
+    await page.getByRole("button", { name: "Log in" }).click();
+    await page.waitForURL("**/home");
+
+    await context.storageState({ path: `.auth/${role}.json` });
+    await browser.close();
+  }
+}
+
+export default globalSetup;
+```
+
+```typescript
+// playwright.config.ts — one project per role
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  globalSetup: require.resolve("./global-setup"),
+  projects: [
+    {
+      name: "admin",
+      use: { storageState: ".auth/admin.json" },
+      testMatch: "**/*.admin.spec.ts",
+    },
+    {
+      name: "member",
+      use: { storageState: ".auth/member.json" },
+      testMatch: "**/*.member.spec.ts",
+    },
+    {
+      name: "guest",
+      use: { storageState: ".auth/guest.json" },
+      testMatch: "**/*.guest.spec.ts",
+    },
+    {
+      name: "anonymous",
+      use: { storageState: { cookies: [], origins: [] } },
+      testMatch: "**/*.anon.spec.ts",
+    },
+  ],
+});
+```
+
+```typescript
+// tests/admin-panel.admin.spec.ts
+import { test, expect } from "@playwright/test";
+
+test("admin can access user management", async ({ page }) => {
+  await page.goto("/admin/users");
+  await expect(
+    page.getByRole("heading", { name: "User Management" })
+  ).toBeVisible();
+  await expect(page.getByRole("button", { name: "Remove user" })).toBeEnabled();
+});
+```
+
+```typescript
+// tests/admin-panel.guest.spec.ts
+import { test, expect } from "@playwright/test";
+
+test("guest cannot access admin panel", async ({ page }) => {
+  await page.goto("/admin/users");
+  await expect(page.getByText("Access denied")).toBeVisible();
+});
+```
+
+**Alternative**: Use a fixture that accepts a role parameter when you need role switching within a single spec file.
+
+```typescript
+// fixtures/auth.ts — role-based fixture
+import { test as base, type Page } from "@playwright/test";
+import fs from "fs";
+
+type RoleFixtures = {
+  loginAs: (role: "admin" | "member" | "guest") => Promise<Page>;
+};
+
+export const test = base.extend<RoleFixtures>({
+  loginAs: async ({ browser }, use) => {
+    const pages: Page[] = [];
+
+    await use(async (role) => {
+      const statePath = `.auth/${role}.json`;
+      if (!fs.existsSync(statePath)) {
+        throw new Error(
+          `Auth state for role "${role}" not found at ${statePath}`
+        );
+      }
+      const context = await browser.newContext({ storageState: statePath });
+      const page = await context.newPage();
+      pages.push(page);
+      return page;
+    });
+
+    for (const page of pages) {
+      await page.context().close();
+    }
+  },
+});
+
+export { expect } from "@playwright/test";
+```
+
+```typescript
+// tests/role-comparison.spec.ts
+import { test, expect } from "../fixtures/auth";
+
+test("admin sees remove button, guest does not", async ({ loginAs }) => {
+  const adminPage = await loginAs("admin");
+  await adminPage.goto("/admin/users");
+  await expect(
+    adminPage.getByRole("button", { name: "Remove user" })
+  ).toBeVisible();
+
+  const guestPage = await loginAs("guest");
+  await guestPage.goto("/admin/users");
+  await expect(guestPage.getByText("Access denied")).toBeVisible();
+});
+```
+
+### OAuth/SSO Mocking
+
+**Use when**: Your app authenticates via a third-party OAuth provider and you cannot hit the real provider in tests.
+**Avoid when**: You have a dedicated test tenant on the OAuth provider.
+
+A typical OAuth flow works like this:
+
+1. User clicks "Sign in with Provider" → browser navigates to `https://accounts.provider.com/authorize?...`
+2. User authenticates on the provider's page → provider redirects back to your app's **callback route** (e.g. `http://localhost:4000/auth/callback?code=ABC&state=XYZ`)
+3. Your backend exchanges the `code` for an access token, creates a session, and redirects the user to a logged-in page
+
+In tests you can short-circuit step 2 with `page.route()`: intercept the outbound request to the provider and respond with a `302` redirect straight to your callback route, supplying a mock `code` and `state`. Your backend still executes its normal callback handler — the only part that's mocked is the provider's authorization page.
+
+For cases where you want to skip the browser redirect entirely, a second approach calls a **test-only API endpoint** that creates the session server-side and returns the session cookie directly.
+
+```typescript
+// tests/oauth-login.spec.ts — mock the callback route
+import { test, expect } from "@playwright/test";
+
+test("login via mocked OAuth flow", async ({ page }) => {
+  await page.route("https://accounts.provider.com/**", async (route) => {
+    const callbackUrl = new URL("http://localhost:4000/auth/callback");
+    callbackUrl.searchParams.set("code", "mock-auth-code-xyz");
+    callbackUrl.searchParams.set("state", "expected-state-value");
+    await route.fulfill({
+      status: 302,
+      headers: { location: callbackUrl.toString() },
+    });
+  });
+
+  await page.goto("/login");
+  await page.getByRole("button", { name: "Sign in with Provider" }).click();
+
+  await page.waitForURL("/home");
+  await expect(page.getByRole("heading", { name: "Home" })).toBeVisible();
+});
+```
+
+```typescript
+// tests/oauth-login.spec.ts — API-based session injection
+import { test, expect } from "@playwright/test";
+
+test("bypass OAuth entirely via API session injection", async ({
+  page,
+}) => {
+  // Call a test-only endpoint that creates a session without OAuth
+  const response = await page.request.post("/api/test/create-session", {
+    data: {
+      email: "oauth-user@example.com",
+      provider: "provider",
+      role: "member",
+    },
+  });
+  expect(response.ok()).toBeTruthy();
+
+  await page.context().storageState({ path: ".auth/oauth-user.json" });
+  await page.goto("/home");
+  await expect(page.getByRole("heading", { name: "Home" })).toBeVisible();
+});
+```
+
+**Backend requirement**: Your backend must expose a test-only session creation endpoint (guarded by `NODE_ENV=test`) or accept a known test OAuth code.
+
+### MFA Handling
+
+**Use when**: Your app requires two-factor authentication (TOTP, SMS, email codes).
+**Avoid when**: MFA is optional and you can disable it for test accounts.
+
+**Strategy 1**: Generate real TOTP codes from a shared secret.
+
+```typescript
+// helpers/totp.ts
+import * as OTPAuth from "otpauth";
+
+export function generateTOTP(secret: string): string {
+  const totp = new OTPAuth.TOTP({
+    secret: OTPAuth.Secret.fromBase32(secret),
+    digits: 6,
+    period: 30,
+    algorithm: "SHA1",
+  });
+  return totp.generate();
+}
+```
+
+```typescript
+// tests/mfa-login.spec.ts
+import { test, expect } from "@playwright/test";
+import { generateTOTP } from "../helpers/totp";
+
+test("login with TOTP two-factor auth", async ({ page }) => {
+  await page.goto("/login");
+  await page.getByLabel("Username").fill("mfa-user@example.com");
+  await page.getByLabel("Password").fill("secretPass123");
+  await page.getByRole("button", { name: "Log in" }).click();
+
+  await expect(page.getByText("Enter your authentication code")).toBeVisible();
+
+  const code = generateTOTP(process.env.MFA_TOTP_SECRET!);
+  await page.getByLabel("Authentication code").fill(code);
+  await page.getByRole("button", { name: "Verify" }).click();
+
+  await page.waitForURL("/home");
+  await expect(page.getByRole("heading", { name: "Home" })).toBeVisible();
+});
+```
+
+**Strategy 2**: Mock MFA at the backend level. Have your backend accept a known bypass code (e.g., `000000`) when `NODE_ENV=test`.
+
+**Strategy 3**: Disable MFA for test accounts at the infrastructure level.
+
+### Session Refresh
+
+**Use when**: Your tokens expire during long test runs.
+**Avoid when**: Your test suite runs quickly and tokens outlast the entire run.
+
+```typescript
+// fixtures/auth-with-refresh.ts
+import { test as base, type BrowserContext } from "@playwright/test";
+import fs from "fs";
+
+type AuthFixtures = {
+  authenticatedPage: import("@playwright/test").Page;
+};
+
+export const test = base.extend<AuthFixtures>({
+  authenticatedPage: async ({ browser }, use) => {
+    const statePath = ".auth/session.json";
+
+    let context: BrowserContext;
+    if (fs.existsSync(statePath)) {
+      context = await browser.newContext({ storageState: statePath });
+      const page = await context.newPage();
+
+      const response = await page.request.get("/api/auth/me");
+      if (response.ok()) {
+        await use(page);
+        await context.close();
+        return;
+      }
+      await context.close();
+    }
+
+    context = await browser.newContext();
+    const page = await context.newPage();
+    await page.goto("/login");
+    await page.getByLabel("Username").fill(process.env.TEST_USER_EMAIL!);
+    await page.getByLabel("Password").fill(process.env.TEST_USER_PASSWORD!);
+    await page.getByRole("button", { name: "Log in" }).click();
+    await page.waitForURL("/home");
+
+    await context.storageState({ path: statePath });
+
+    await use(page);
+    await context.close();
+  },
+});
+
+export { expect } from "@playwright/test";
+```
+
+### Login Page Object
+
+**Use when**: Multiple test files need to log in and you want consistent, maintainable login logic.
+**Avoid when**: You use `storageState` everywhere and never navigate through the login UI in tests.
+
+```typescript
+// page-objects/LoginPage.ts
+import { type Page, type Locator, expect } from "@playwright/test";
+
+export class LoginPage {
+  readonly page: Page;
+  readonly usernameInput: Locator;
+  readonly passwordInput: Locator;
+  readonly loginButton: Locator;
+  readonly errorMessage: Locator;
+  readonly forgotPasswordLink: Locator;
+
+  constructor(page: Page) {
+    this.page = page;
+    this.usernameInput = page.getByLabel("Username");
+    this.passwordInput = page.getByLabel("Password");
+    this.loginButton = page.getByRole("button", { name: "Log in" });
+    this.errorMessage = page.getByRole("alert");
+    this.forgotPasswordLink = page.getByRole("link", {
+      name: "Forgot password",
+    });
+  }
+
+  async goto() {
+    await this.page.goto("/login");
+    await expect(this.loginButton).toBeVisible();
+  }
+
+  async login(username: string, password: string) {
+    await this.usernameInput.fill(username);
+    await this.passwordInput.fill(password);
+    await this.loginButton.click();
+  }
+
+  async loginAndWaitForHome(username: string, password: string) {
+    await this.login(username, password);
+    await this.page.waitForURL("/home");
+  }
+
+  async expectError(message: string | RegExp) {
+    await expect(this.errorMessage).toContainText(message);
+  }
+
+  async expectFieldError(field: "username" | "password", message: string) {
+    const input =
+      field === "username" ? this.usernameInput : this.passwordInput;
+    await expect(input).toHaveAttribute("aria-invalid", "true");
+    const errorId = await input.getAttribute("aria-describedby");
+    if (errorId) {
+      await expect(this.page.locator(`#${errorId}`)).toContainText(message);
+    }
+  }
+}
+```
+
+```typescript
+// tests/login.spec.ts
+import { test, expect } from "@playwright/test";
+import { LoginPage } from "../page-objects/LoginPage";
+
+test.use({ storageState: { cookies: [], origins: [] } });
+
+test.describe("login page", () => {
+  let loginPage: LoginPage;
+
+  test.beforeEach(async ({ page }) => {
+    loginPage = new LoginPage(page);
+    await loginPage.goto();
+  });
+
+  test("successful login redirects to home", async ({ page }) => {
+    await loginPage.loginAndWaitForHome(
+      "testuser@example.com",
+      "secretPass123"
+    );
+    await expect(page.getByRole("heading", { name: "Home" })).toBeVisible();
+  });
+
+  test("wrong password shows error", async () => {
+    await loginPage.login("testuser@example.com", "wrong-password");
+    await loginPage.expectError("Invalid username or password");
+  });
+
+  test("empty fields show validation errors", async () => {
+    await loginPage.loginButton.click();
+    await loginPage.expectFieldError("username", "Username is required");
+  });
+
+  test("forgot password link navigates correctly", async ({ page }) => {
+    await loginPage.forgotPasswordLink.click();
+    await page.waitForURL("/forgot-password");
+    await expect(
+      page.getByRole("heading", { name: "Reset password" })
+    ).toBeVisible();
+  });
+});
+```
+
+### API-Based Login
+
+**Use when**: You want the fastest possible authentication without any browser interaction.
+**Avoid when**: You are specifically testing the login UI.
+
+API login is typically 5-10x faster than UI login.
+
+```typescript
+// global-setup.ts — API-based login (fastest)
+import { request, type FullConfig } from "@playwright/test";
+
+async function globalSetup(config: FullConfig) {
+  const { baseURL } = config.projects[0].use;
+
+  const requestContext = await request.newContext({ baseURL });
+
+  const response = await requestContext.post("/api/auth/login", {
+    data: {
+      email: process.env.TEST_USER_EMAIL!,
+      password: process.env.TEST_USER_PASSWORD!,
+    },
+  });
+
+  if (!response.ok()) {
+    throw new Error(
+      `API login failed: ${response.status()} ${await response.text()}`
+    );
+  }
+
+  await requestContext.storageState({ path: ".auth/session.json" });
+  await requestContext.dispose();
+}
+
+export default globalSetup;
+```
+
+```typescript
+// fixtures/api-auth.ts — fixture version for per-test authentication
+import { test as base } from "@playwright/test";
+
+export const test = base.extend({
+  authenticatedPage: async ({ browser, playwright }, use) => {
+    const apiContext = await playwright.request.newContext({
+      baseURL: "http://localhost:4000",
+    });
+
+    await apiContext.post("/api/auth/login", {
+      data: {
+        email: "testuser@example.com",
+        password: "secretPass123",
+      },
+    });
+
+    const state = await apiContext.storageState();
+    const context = await browser.newContext({ storageState: state });
+    const page = await context.newPage();
+
+    await use(page);
+
+    await context.close();
+    await apiContext.dispose();
+  },
+});
+
+export { expect } from "@playwright/test";
+```
+
+### Unauthenticated Tests
+
+**Use when**: Testing the login page, signup flow, password reset, public pages, or redirect behavior for unauthenticated users.
+**Avoid when**: The test requires a logged-in user.
+
+When your config sets a default `storageState`, you must explicitly clear it for unauthenticated tests.
+
+```typescript
+// tests/public-pages.spec.ts
+import { test, expect } from "@playwright/test";
+
+test.use({ storageState: { cookies: [], origins: [] } });
+
+test.describe("unauthenticated access", () => {
+  test("homepage is accessible without login", async ({ page }) => {
+    await page.goto("/");
+    await expect(page.getByRole("heading", { name: "Welcome" })).toBeVisible();
+    await expect(page.getByRole("link", { name: "Log in" })).toBeVisible();
+  });
+
+  test("protected route redirects to login", async ({ page }) => {
+    await page.goto("/home");
+    await page.waitForURL("**/login**");
+    expect(page.url()).toContain("redirect=%2Fhome");
+  });
+
+  test("expired session shows re-login prompt", async ({ page, context }) => {
+    await page.goto("/home");
+    await context.clearCookies();
+
+    await page.goto("/settings");
+    await page.waitForURL("**/login**");
+    await expect(page.getByText("Your session has expired")).toBeVisible();
+  });
+
+  test("signup flow creates account", async ({ page }) => {
+    await page.goto("/signup");
+    await page.getByLabel("Name").fill("New User");
+    await page.getByLabel("Email").fill(`test-${Date.now()}@example.com`);
+    await page.getByLabel("Password", { exact: true }).fill("secretPass123");
+    await page.getByLabel("Confirm password").fill("secretPass123");
+    await page.getByRole("button", { name: "Create account" }).click();
+
+    await page.waitForURL("/onboarding");
+    await expect(page.getByText("Welcome, New User")).toBeVisible();
+  });
+});
+```
+
+## Decision Guide
+
+| Scenario                         | Approach                       | Speed    | Isolation      | When to Choose                                                 |
+| -------------------------------- | ------------------------------ | -------- | -------------- | -------------------------------------------------------------- |
+| Most tests need auth             | Global setup + `storageState`  | Fastest  | Shared session | Default for nearly every project                               |
+| Tests modify user state          | Per-worker fixture             | Fast     | Per worker     | Tests update profile, change settings, or mutate data          |
+| Multiple user roles              | Per-project `storageState`     | Fastest  | Per role       | App has admin/member/guest roles                               |
+| Testing the login page           | No `storageState`              | N/A      | Full           | Use `test.use({ storageState: { cookies: [], origins: [] } })` |
+| OAuth/SSO provider               | Mock the callback              | Fast     | Per test       | Never hit real OAuth providers in CI                           |
+| MFA is required                  | TOTP generation or bypass      | Moderate | Per test       | Generate real TOTP codes or use a test-mode bypass             |
+| Token expires mid-suite          | Session refresh fixture        | Fast     | Per check      | Fixture validates the session before use                       |
+| Single test needs different user | `loginAs(role)` fixture        | Moderate | Per call       | Rare: prefer per-project roles                                 |
+| API-first app (no login UI)      | API login via `request.post()` | Fastest  | Per test       | No browser needed for auth                                     |
+
+### UI Login vs API Login vs Storage State
+
+```text
+Need to test the login page itself?
+├── Yes → UI login with LoginPage POM, no storageState
+└── No → Do you have a login API endpoint?
+    ├── Yes → API login in global setup, save storageState (fastest)
+    └── No → UI login in global setup, save storageState
+              └── Tokens expire quickly?
+                  ├── Yes → Add session refresh fixture
+                  └── No → Standard storageState reuse is fine
+```
+
+## Anti-Patterns
+
+| Don't Do This                                                             | Problem                                     | Do This Instead                                                           |
+| ------------------------------------------------------------------------- | ------------------------------------------- | ------------------------------------------------------------------------- |
+| Log in via UI before every test                                           | Adds 2-5 seconds per test                   | Use `storageState` to skip login entirely                                 |
+| Share a single auth state file across parallel workers that mutate state  | Race conditions                             | Use per-worker fixtures with `{ scope: 'worker' }`                        |
+| Hardcode credentials in test files                                        | Security risk                               | Use environment variables and `.env` files                                |
+| Ignore token expiration                                                   | Tests fail intermittently with 401 errors   | Add a session validity check in your auth fixture                         |
+| Hit real OAuth providers in CI                                            | Flaky: rate limits, CAPTCHA, network issues | Mock the OAuth callback or use API session injection                      |
+| Use `page.waitForTimeout(2000)` after login                               | Arbitrary delay                             | `await page.waitForURL('/home')` or `await expect(heading).toBeVisible()` |
+| Store `.auth/*.json` files in git                                         | Tokens in version control                   | Add `.auth/` to `.gitignore`                                              |
+| Create one "god" test account with all permissions                        | Cannot test role-based access control       | Create separate accounts per role                                         |
+| Use `browser.newContext()` without `storageState` for authenticated tests | Every context starts unauthenticated        | Pass `storageState` when creating the context                             |
+| Test MFA by disabling it everywhere                                       | You never test the MFA flow                 | Use TOTP generation for at least one test                                 |
+
+## Troubleshooting
+
+### Global setup fails with "Target page, context or browser has been closed"
+
+**Cause**: The login page redirected unexpectedly, or the browser closed before `storageState()` was called.
+
+**Fix**:
+
+- Add `await page.waitForURL()` after the login action
+- Check that `baseURL` in your config matches the actual server URL and protocol
+- Add error handling to global setup:
+
+```typescript
+const response = await page.waitForResponse("**/api/auth/**");
+if (!response.ok()) {
+  throw new Error(
+    `Login failed in global setup: ${response.status()} ${await response.text()}`
+  );
+}
+```
+
+### Tests fail with 401 Unauthorized after running for a while
+
+**Cause**: The session token saved in `storageState` has expired.
+
+**Fix**:
+
+- Use the session refresh fixture pattern
+- Increase token expiry in test environment configuration
+- Switch to API-based login in a worker-scoped fixture
+
+### `storageState` file is empty or contains no cookies
+
+**Cause**: `storageState()` was called before the login response set cookies.
+
+**Fix**:
+
+- Wait for the post-login page to load: `await page.waitForURL('/home')`
+- Verify cookies exist before saving:
+
+```typescript
+const cookies = await context.cookies();
+if (cookies.length === 0) {
+  throw new Error("No cookies found after login");
+}
+await context.storageState({ path: ".auth/session.json" });
+```
+
+### Different browsers get different cookies
+
+**Cause**: Some auth flows set cookies with `SameSite=Strict` or use browser-specific cookie behavior.
+
+**Fix**:
+
+- Generate separate auth state files per browser project
+- Check if your auth uses `SameSite=None; Secure` cookies that require HTTPS:
+
+```typescript
+projects: [
+  {
+    name: 'chromium',
+    use: { ...devices['Desktop Chrome'], storageState: '.auth/chromium-session.json' },
+  },
+  {
+    name: 'firefox',
+    use: { ...devices['Desktop Firefox'], storageState: '.auth/firefox-session.json' },
+  },
+],
+```
+
+### Parallel tests interfere with each other's sessions
+
+**Cause**: Multiple workers share the same test account and one worker's actions affect others.
+
+**Fix**:
+
+- Use per-worker test accounts: `worker-${test.info().parallelIndex}@example.com`
+- Use the per-worker authentication fixture pattern
+- Make tests idempotent
+
+### OAuth mock does not work — still redirects to real provider
+
+**Cause**: `page.route()` was registered after the navigation that triggers the OAuth redirect.
+
+**Fix**:
+
+- Register route handlers before any navigation: call `page.route()` before `page.goto()`
+- Log the actual redirect URL to verify the pattern:
+
+```typescript
+page.on("request", (req) => {
+  if (req.url().includes("oauth") || req.url().includes("accounts.provider")) {
+    console.log("OAuth request:", req.url());
+  }
+});
+```
+
+## Related
+
+- [fixtures-hooks.md](../core/fixtures-hooks.md) — custom fixtures for auth setup and teardown
+- [configuration.md](../core/configuration.md) — `storageState`, projects, and global setup configuration
+- [global-setup.md](../core/global-setup.md) — global setup patterns and project dependencies
+- [network-advanced.md](network-advanced.md) — route interception patterns used in OAuth mocking
+- [api-testing.md](../testing-patterns/api-testing.md) — API request context used in API-based login
+- [flaky-tests.md](../debugging/flaky-tests.md) — diagnosing auth-related flakiness
diff --git a/.cursor/skills/playwright-testing/advanced/clock-mocking.md b/.cursor/skills/playwright-testing/advanced/clock-mocking.md
new file mode 100644
index 0000000000..073d087055
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/clock-mocking.md
@@ -0,0 +1,364 @@
+# Date, Time & Clock Mocking
+
+## Table of Contents
+
+1. [Clock API Basics](#clock-api-basics)
+2. [Fixed Time Testing](#fixed-time-testing)
+3. [Time Advancement](#time-advancement)
+4. [Timezone Testing](#timezone-testing)
+5. [Timer Mocking](#timer-mocking)
+
+## Clock API Basics
+
+### Install Clock
+
+```typescript
+test("mock current time", async ({ page }) => {
+  // Install clock before navigating
+  await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+
+  await page.goto("/dashboard");
+
+  // Page sees January 15, 2025 as current date
+  await expect(page.getByText("January 15, 2025")).toBeVisible();
+});
+```
+
+### Clock with Fixture
+
+```typescript
+// fixtures/clock.fixture.ts
+import { test as base } from "@playwright/test";
+
+type ClockFixtures = {
+  mockTime: (date: Date | string) => Promise<void>;
+};
+
+export const test = base.extend<ClockFixtures>({
+  mockTime: async ({ page }, use) => {
+    await use(async (date) => {
+      const time = typeof date === "string" ? new Date(date) : date;
+      await page.clock.install({ time });
+    });
+  },
+});
+
+// Usage
+test("subscription expiry", async ({ page, mockTime }) => {
+  await mockTime("2025-12-31T23:59:00");
+  await page.goto("/subscription");
+
+  await expect(page.getByText("Expires today")).toBeVisible();
+});
+```
+
+## Fixed Time Testing
+
+### Test Date-Dependent Features
+
+```typescript
+test("show holiday banner in December", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-12-20T10:00:00") });
+
+  await page.goto("/");
+
+  await expect(page.getByRole("banner", { name: /holiday/i })).toBeVisible();
+});
+
+test("no holiday banner in January", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15T10:00:00") });
+
+  await page.goto("/");
+
+  await expect(page.getByRole("banner", { name: /holiday/i })).toBeHidden();
+});
+```
+
+### Test Relative Time Display
+
+```typescript
+test("shows relative time correctly", async ({ page }) => {
+  // Fix time to control "posted 2 hours ago" text
+  await page.clock.install({ time: new Date("2025-06-15T14:00:00") });
+
+  // Mock API to return post with known timestamp
+  await page.route("**/api/posts/1", (route) =>
+    route.fulfill({
+      json: {
+        id: 1,
+        title: "Test Post",
+        createdAt: "2025-06-15T12:00:00Z", // 2 hours before mock time
+      },
+    }),
+  );
+
+  await page.goto("/posts/1");
+
+  await expect(page.getByText("2 hours ago")).toBeVisible();
+});
+```
+
+### Test Date Boundaries
+
+```typescript
+test.describe("end of month billing", () => {
+  test("shows billing on last day of month", async ({ page }) => {
+    await page.clock.install({ time: new Date("2025-01-31T10:00:00") });
+    await page.goto("/billing");
+
+    await expect(page.getByText("Payment due today")).toBeVisible();
+  });
+
+  test("shows days remaining mid-month", async ({ page }) => {
+    await page.clock.install({ time: new Date("2025-01-15T10:00:00") });
+    await page.goto("/billing");
+
+    await expect(page.getByText("16 days until payment")).toBeVisible();
+  });
+});
+```
+
+## Time Advancement
+
+### Advance Time Manually
+
+```typescript
+test("session timeout warning", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+  await page.goto("/dashboard");
+
+  // Advance 25 minutes (session timeout at 30 min)
+  await page.clock.fastForward("25:00");
+
+  await expect(page.getByText("Session expires in 5 minutes")).toBeVisible();
+
+  // Advance 5 more minutes
+  await page.clock.fastForward("05:00");
+
+  await expect(page.getByText("Session expired")).toBeVisible();
+});
+```
+
+### Pause and Resume Time
+
+```typescript
+test("countdown timer", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+  await page.goto("/sale");
+
+  // Initial state
+  await expect(page.getByText("Sale ends in 2:00:00")).toBeVisible();
+
+  // Advance 1 hour
+  await page.clock.fastForward("01:00:00");
+
+  await expect(page.getByText("Sale ends in 1:00:00")).toBeVisible();
+
+  // Advance past end
+  await page.clock.fastForward("01:00:01");
+
+  await expect(page.getByText("Sale ended")).toBeVisible();
+});
+```
+
+### Run Pending Timers
+
+```typescript
+test("debounced search", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+  await page.goto("/search");
+
+  await page.getByLabel("Search").fill("playwright");
+
+  // Search is debounced by 300ms, won't fire yet
+  await expect(page.getByTestId("search-results")).toBeHidden();
+
+  // Fast forward past debounce
+  await page.clock.fastForward(300);
+
+  // Now search should execute
+  await expect(page.getByTestId("search-results")).toBeVisible();
+});
+```
+
+## Timezone Testing
+
+### Test Different Timezones
+
+```typescript
+test.describe("timezone display", () => {
+  test("shows correct time in PST", async ({ browser }) => {
+    const context = await browser.newContext({
+      timezoneId: "America/Los_Angeles",
+    });
+    const page = await context.newPage();
+
+    await page.clock.install({ time: new Date("2025-01-15T17:00:00Z") }); // 5 PM UTC
+
+    await page.goto("/schedule");
+
+    // Should show 9 AM PST
+    await expect(page.getByText("9:00 AM")).toBeVisible();
+
+    await context.close();
+  });
+
+  test("shows correct time in JST", async ({ browser }) => {
+    const context = await browser.newContext({
+      timezoneId: "Asia/Tokyo",
+    });
+    const page = await context.newPage();
+
+    await page.clock.install({ time: new Date("2025-01-15T17:00:00Z") }); // 5 PM UTC
+
+    await page.goto("/schedule");
+
+    // Should show 2 AM next day JST
+    await expect(page.getByText("2:00 AM")).toBeVisible();
+
+    await context.close();
+  });
+});
+```
+
+### Timezone Fixture
+
+```typescript
+// fixtures/timezone.fixture.ts
+import { test as base } from "@playwright/test";
+
+type TimezoneFixtures = {
+  pageInTimezone: (timezone: string) => Promise<Page>;
+};
+
+export const test = base.extend<TimezoneFixtures>({
+  pageInTimezone: async ({ browser }, use) => {
+    const pages: Page[] = [];
+
+    await use(async (timezone) => {
+      const context = await browser.newContext({ timezoneId: timezone });
+      const page = await context.newPage();
+      pages.push(page);
+      return page;
+    });
+
+    // Cleanup
+    for (const page of pages) {
+      await page.context().close();
+    }
+  },
+});
+```
+
+## Timer Mocking
+
+### Mock setInterval
+
+```typescript
+test("auto-refresh data", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+
+  let apiCalls = 0;
+  await page.route("**/api/data", (route) => {
+    apiCalls++;
+    route.fulfill({ json: { value: apiCalls } });
+  });
+
+  await page.goto("/live-data"); // Sets up 30s refresh interval
+
+  expect(apiCalls).toBe(1); // Initial load
+
+  // Advance 30 seconds
+  await page.clock.fastForward("00:30");
+  expect(apiCalls).toBe(2); // First refresh
+
+  // Advance another 30 seconds
+  await page.clock.fastForward("00:30");
+  expect(apiCalls).toBe(3); // Second refresh
+});
+```
+
+### Mock setTimeout Chains
+
+```typescript
+test("notification queue", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+  await page.goto("/notifications");
+
+  // Trigger 3 notifications that show sequentially
+  await page.getByRole("button", { name: "Show All" }).click();
+
+  // First notification appears immediately
+  await expect(page.getByText("Notification 1")).toBeVisible();
+
+  // Second appears after 2 seconds
+  await page.clock.fastForward("00:02");
+  await expect(page.getByText("Notification 2")).toBeVisible();
+
+  // Third appears after 2 more seconds
+  await page.clock.fastForward("00:02");
+  await expect(page.getByText("Notification 3")).toBeVisible();
+});
+```
+
+### Test Animation Frames
+
+```typescript
+test("animation completes", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+  await page.goto("/animation-demo");
+
+  await page.getByRole("button", { name: "Animate" }).click();
+
+  // Animation runs for 500ms
+  const element = page.getByTestId("animated-box");
+  await expect(element).toHaveCSS("opacity", "0");
+
+  // Fast forward through animation
+  await page.clock.fastForward(500);
+
+  await expect(element).toHaveCSS("opacity", "1");
+});
+```
+
+## Best Practices
+
+### Always Install Clock Before Navigation
+
+```typescript
+// Good
+test("date test", async ({ page }) => {
+  await page.clock.install({ time: new Date("2025-01-15") });
+  await page.goto("/"); // Page loads with mocked time
+});
+
+// Bad - time already captured by page
+test("date test", async ({ page }) => {
+  await page.goto("/");
+  await page.clock.install({ time: new Date("2025-01-15") }); // Too late!
+});
+```
+
+### Use ISO Strings for Clarity
+
+```typescript
+// Good - explicit timezone
+await page.clock.install({ time: new Date("2025-01-15T09:00:00Z") });
+
+// Ambiguous - uses local timezone
+await page.clock.install({ time: new Date("2025-01-15T09:00:00") });
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                             | Problem                         | Solution                               |
+| ---------------------------------------- | ------------------------------- | -------------------------------------- |
+| Installing clock after navigation        | Page already captured real time | Install clock before `goto()`          |
+| Hardcoded relative dates                 | Tests break over time           | Use fixed dates with clock mock        |
+| Not accounting for timezone              | Tests fail in different regions | Use explicit UTC times or set timezone |
+| Using `waitForTimeout` with mocked clock | Conflicts with mocked timers    | Use `fastForward` instead              |
+
+## Related References
+
+- **Assertions**: See [assertions-waiting.md](../core/assertions-waiting.md) for time-based assertions
+- **Fixtures**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for clock fixtures
diff --git a/.cursor/skills/playwright-testing/advanced/mobile-testing.md b/.cursor/skills/playwright-testing/advanced/mobile-testing.md
new file mode 100644
index 0000000000..e928bdee50
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/mobile-testing.md
@@ -0,0 +1,409 @@
+# Mobile & Responsive Testing
+
+## Table of Contents
+
+1. [Device Emulation](#device-emulation)
+2. [Touch Gestures](#touch-gestures)
+3. [Viewport Testing](#viewport-testing)
+4. [Mobile-Specific UI](#mobile-specific-ui)
+5. [Responsive Breakpoints](#responsive-breakpoints)
+
+## Device Emulation
+
+### Use Built-in Devices
+
+```typescript
+import { test, devices } from "@playwright/test";
+
+// Configure in playwright.config.ts
+export default defineConfig({
+  projects: [
+    { name: "Desktop Chrome", use: { ...devices["Desktop Chrome"] } },
+    { name: "Mobile Safari", use: { ...devices["iPhone 14"] } },
+    { name: "Mobile Chrome", use: { ...devices["Pixel 7"] } },
+    { name: "Tablet", use: { ...devices["iPad Pro 11"] } },
+  ],
+});
+```
+
+### Custom Device Configuration
+
+```typescript
+test.use({
+  viewport: { width: 390, height: 844 },
+  deviceScaleFactor: 3,
+  isMobile: true,
+  hasTouch: true,
+  userAgent:
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15",
+});
+
+test("custom mobile device", async ({ page }) => {
+  await page.goto("/");
+  // Test runs with custom device settings
+});
+```
+
+### Test Across Multiple Devices
+
+```typescript
+const mobileDevices = ["iPhone 14", "Pixel 7", "Galaxy S21"];
+
+for (const deviceName of mobileDevices) {
+  test(`checkout on ${deviceName}`, async ({ browser }) => {
+    const device = devices[deviceName];
+    const context = await browser.newContext({ ...device });
+    const page = await context.newPage();
+
+    await page.goto("/checkout");
+    await expect(page.getByRole("button", { name: "Pay" })).toBeVisible();
+
+    await context.close();
+  });
+}
+```
+
+## Touch Gestures
+
+### Tap
+
+```typescript
+test.use({ hasTouch: true });
+
+test("tap to interact", async ({ page }) => {
+  await page.goto("/gallery");
+
+  // Tap is like click but for touch devices
+  await page.getByRole("img", { name: "Photo 1" }).tap();
+
+  await expect(page.getByRole("dialog")).toBeVisible();
+});
+```
+
+### Swipe
+
+```typescript
+test("swipe carousel", async ({ page }) => {
+  await page.goto("/carousel");
+
+  const carousel = page.getByTestId("carousel");
+  const box = await carousel.boundingBox();
+
+  if (box) {
+    // Swipe left
+    await page.touchscreen.tap(box.x + box.width - 50, box.y + box.height / 2);
+    await page.mouse.move(box.x + 50, box.y + box.height / 2);
+
+    // Or use drag
+    await carousel.dragTo(carousel, {
+      sourcePosition: { x: box.width - 50, y: box.height / 2 },
+      targetPosition: { x: 50, y: box.height / 2 },
+    });
+  }
+
+  await expect(page.getByText("Slide 2")).toBeVisible();
+});
+```
+
+### Swipe Fixture
+
+```typescript
+// fixtures/touch.fixture.ts
+import { test as base, Page } from "@playwright/test";
+
+type TouchFixtures = {
+  swipe: (
+    element: Locator,
+    direction: "left" | "right" | "up" | "down",
+  ) => Promise<void>;
+};
+
+export const test = base.extend<TouchFixtures>({
+  swipe: async ({ page }, use) => {
+    await use(async (element, direction) => {
+      const box = await element.boundingBox();
+      if (!box) throw new Error("Element not visible");
+
+      const centerX = box.x + box.width / 2;
+      const centerY = box.y + box.height / 2;
+      const distance = 100;
+
+      const moves = {
+        left: {
+          startX: centerX + distance,
+          endX: centerX - distance,
+          y: centerY,
+        },
+        right: {
+          startX: centerX - distance,
+          endX: centerX + distance,
+          y: centerY,
+        },
+        up: {
+          startX: centerX,
+          endX: centerX,
+          startY: centerY + distance,
+          endY: centerY - distance,
+        },
+        down: {
+          startX: centerX,
+          endX: centerX,
+          startY: centerY - distance,
+          endY: centerY + distance,
+        },
+      };
+
+      const move = moves[direction];
+      await page.touchscreen.tap(move.startX, move.startY ?? move.y);
+      await page.mouse.move(move.endX, move.endY ?? move.y, { steps: 10 });
+      await page.mouse.up();
+    });
+  },
+});
+
+// Usage
+test("swipe to delete", async ({ page, swipe }) => {
+  await page.goto("/inbox");
+
+  const message = page.getByTestId("message-1");
+  await swipe(message, "left");
+
+  await expect(page.getByRole("button", { name: "Delete" })).toBeVisible();
+});
+```
+
+### Long Press
+
+```typescript
+test("long press for context menu", async ({ page }) => {
+  await page.goto("/files");
+
+  const file = page.getByText("document.pdf");
+  const box = await file.boundingBox();
+
+  if (box) {
+    // Touch down
+    await page.touchscreen.tap(box.x + box.width / 2, box.y + box.height / 2);
+
+    // Hold for 500ms
+    await page.waitForTimeout(500);
+
+    // Context menu should appear
+    await expect(page.getByRole("menu")).toBeVisible();
+  }
+});
+```
+
+### Pinch Zoom
+
+```typescript
+test("pinch to zoom image", async ({ page }) => {
+  await page.goto("/map");
+
+  // Pinch zoom requires two touch points
+  // Playwright doesn't have native pinch support, so we simulate via evaluate
+  await page.evaluate(() => {
+    const element = document.querySelector("#map");
+    if (element) {
+      // Simulate wheel event as fallback for zoom
+      element.dispatchEvent(
+        new WheelEvent("wheel", {
+          deltaY: -100, // Negative = zoom in
+          ctrlKey: true, // Ctrl+wheel = pinch on many apps
+        }),
+      );
+    }
+  });
+
+  // Or trigger the app's zoom function directly
+  await page.evaluate(() => {
+    (window as any).mapInstance?.setZoom(15);
+  });
+});
+```
+
+## Viewport Testing
+
+### Test Different Sizes
+
+```typescript
+const viewports = [
+  { name: "mobile", width: 375, height: 667 },
+  { name: "tablet", width: 768, height: 1024 },
+  { name: "desktop", width: 1920, height: 1080 },
+];
+
+for (const { name, width, height } of viewports) {
+  test(`navigation on ${name}`, async ({ page }) => {
+    await page.setViewportSize({ width, height });
+    await page.goto("/");
+
+    if (width < 768) {
+      // Mobile: should have hamburger menu
+      await expect(page.getByRole("button", { name: "Menu" })).toBeVisible();
+    } else {
+      // Desktop: should have visible nav links
+      await expect(page.getByRole("link", { name: "Products" })).toBeVisible();
+    }
+  });
+}
+```
+
+### Dynamic Viewport Changes
+
+```typescript
+test("responsive layout change", async ({ page }) => {
+  await page.setViewportSize({ width: 1200, height: 800 });
+  await page.goto("/dashboard");
+
+  // Desktop: sidebar visible
+  await expect(page.getByRole("complementary")).toBeVisible();
+
+  // Resize to mobile
+  await page.setViewportSize({ width: 375, height: 667 });
+
+  // Mobile: sidebar hidden, hamburger visible
+  await expect(page.getByRole("complementary")).toBeHidden();
+  await expect(page.getByRole("button", { name: "Menu" })).toBeVisible();
+});
+```
+
+## Mobile-Specific UI
+
+### Hamburger Menu
+
+```typescript
+test("mobile navigation", async ({ page }) => {
+  await page.setViewportSize({ width: 375, height: 667 });
+  await page.goto("/");
+
+  // Open hamburger menu
+  await page.getByRole("button", { name: "Menu" }).click();
+
+  // Navigation drawer should appear
+  const nav = page.getByRole("navigation");
+  await expect(nav).toBeVisible();
+
+  // Navigate via mobile menu
+  await nav.getByRole("link", { name: "Products" }).click();
+
+  await expect(page).toHaveURL("/products");
+  // Menu should close after navigation
+  await expect(nav).toBeHidden();
+});
+```
+
+### Bottom Sheet
+
+```typescript
+test("bottom sheet interaction", async ({ page }) => {
+  await page.setViewportSize({ width: 375, height: 667 });
+  await page.goto("/product/123");
+
+  await page.getByRole("button", { name: "Add to Cart" }).click();
+
+  // Bottom sheet appears
+  const sheet = page.getByRole("dialog");
+  await expect(sheet).toBeVisible();
+
+  // Select options
+  await sheet.getByRole("combobox", { name: "Size" }).selectOption("Large");
+  await sheet.getByRole("button", { name: "Confirm" }).click();
+
+  await expect(page.getByText("Added to cart")).toBeVisible();
+});
+```
+
+### Pull to Refresh
+
+```typescript
+test("pull to refresh", async ({ page }) => {
+  await page.goto("/feed");
+
+  const feed = page.getByTestId("feed");
+  const initialFirstItem = await feed.locator("> *").first().textContent();
+
+  // Simulate pull down
+  const box = await feed.boundingBox();
+  if (box) {
+    await page.touchscreen.tap(box.x + box.width / 2, box.y + 50);
+    await page.mouse.move(box.x + box.width / 2, box.y + 200, { steps: 20 });
+    await page.mouse.up();
+  }
+
+  // Wait for refresh
+  await expect(page.getByTestId("loading")).toBeVisible();
+  await expect(page.getByTestId("loading")).toBeHidden();
+
+  // Content should be updated (in a real app)
+});
+```
+
+## Responsive Breakpoints
+
+### Test All Breakpoints
+
+```typescript
+const breakpoints = {
+  xs: 320,
+  sm: 640,
+  md: 768,
+  lg: 1024,
+  xl: 1280,
+  "2xl": 1536,
+};
+
+test.describe("responsive header", () => {
+  for (const [name, width] of Object.entries(breakpoints)) {
+    test(`header at ${name} (${width}px)`, async ({ page }) => {
+      await page.setViewportSize({ width, height: 800 });
+      await page.goto("/");
+
+      if (width < 768) {
+        await expect(page.getByTestId("mobile-menu-button")).toBeVisible();
+        await expect(page.getByTestId("desktop-nav")).toBeHidden();
+      } else {
+        await expect(page.getByTestId("mobile-menu-button")).toBeHidden();
+        await expect(page.getByTestId("desktop-nav")).toBeVisible();
+      }
+    });
+  }
+});
+```
+
+### Visual Regression at Breakpoints
+
+```typescript
+test.describe("visual regression", () => {
+  const sizes = [
+    { width: 375, height: 667, name: "mobile" },
+    { width: 768, height: 1024, name: "tablet" },
+    { width: 1440, height: 900, name: "desktop" },
+  ];
+
+  for (const { width, height, name } of sizes) {
+    test(`homepage at ${name}`, async ({ page }) => {
+      await page.setViewportSize({ width, height });
+      await page.goto("/");
+
+      await expect(page).toHaveScreenshot(`homepage-${name}.png`);
+    });
+  }
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                | Problem                   | Solution                         |
+| --------------------------- | ------------------------- | -------------------------------- |
+| Only testing one viewport   | Misses responsive bugs    | Test multiple breakpoints        |
+| Ignoring touch events       | Features broken on mobile | Test tap, swipe, long press      |
+| Hardcoded viewport in tests | Can't test multiple sizes | Use `page.setViewportSize()`     |
+| Not testing orientation     | Landscape bugs missed     | Test both portrait and landscape |
+
+## Related References
+
+- **Visual Testing**: See [test-suite-structure.md](../core/test-suite-structure.md) for screenshot testing
+- **Locators**: See [locators.md](../core/locators.md) for mobile-friendly selectors
+- **Browser APIs**: See [browser-apis.md](../browser-apis/browser-apis.md) for permissions (camera, geolocation, notifications)
+- **Canvas/Touch**: See [canvas-webgl.md](../testing-patterns/canvas-webgl.md) for touch gestures on canvas elements
diff --git a/.cursor/skills/playwright-testing/advanced/multi-context.md b/.cursor/skills/playwright-testing/advanced/multi-context.md
new file mode 100644
index 0000000000..ed1cf8a07c
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/multi-context.md
@@ -0,0 +1,288 @@
+# Multi-Tab, Window & Popup Testing
+
+This file covers **single-user scenarios** with multiple browser tabs, windows, and popups. For **multi-user collaboration testing** (multiple users interacting simultaneously), see [multi-user.md](multi-user.md).
+
+## Table of Contents
+
+1. [Popup Handling](#popup-handling)
+2. [New Tab Navigation](#new-tab-navigation)
+3. [OAuth Flows](#oauth-flows)
+4. [Multiple Windows](#multiple-windows)
+5. [Tab Coordination](#tab-coordination)
+
+## Popup Handling
+
+### Basic Popup
+
+```typescript
+test("handle popup window", async ({ page }) => {
+  await page.goto("/");
+
+  // Start waiting for popup before triggering it
+  const popupPromise = page.waitForEvent("popup");
+  await page.getByRole("button", { name: "Open Support Chat" }).click();
+  const popup = await popupPromise;
+
+  // Wait for popup to load
+  await popup.waitForLoadState();
+
+  // Interact with popup
+  await popup.getByLabel("Message").fill("Need help");
+  await popup.getByRole("button", { name: "Send" }).click();
+
+  await expect(popup.getByText("Message sent")).toBeVisible();
+
+  // Close popup
+  await popup.close();
+});
+```
+
+### Popup with Authentication
+
+```typescript
+test("popup login flow", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  const popupPromise = page.waitForEvent("popup");
+  await page.getByRole("button", { name: "Connect Account" }).click();
+  const popup = await popupPromise;
+
+  await popup.waitForLoadState();
+
+  // Complete login in popup
+  await popup.getByLabel("Email").fill("user@example.com");
+  await popup.getByLabel("Password").fill("password123");
+  await popup.getByRole("button", { name: "Log In" }).click();
+
+  // Popup should close automatically after auth
+  await popup.waitForEvent("close");
+
+  // Main page should reflect connected state
+  await expect(page.getByText("Account connected")).toBeVisible();
+});
+```
+
+### Handle Blocked Popups
+
+```typescript
+test("handle popup blocker", async ({ page }) => {
+  await page.goto("/share");
+
+  // Listen for console messages about blocked popup
+  page.on("console", (msg) => {
+    if (msg.text().includes("popup blocked")) {
+      console.log("Popup was blocked");
+    }
+  });
+
+  const popupPromise = page.waitForEvent("popup").catch(() => null);
+  await page.getByRole("button", { name: "Share to Twitter" }).click();
+  const popup = await popupPromise;
+
+  if (!popup) {
+    // Popup blocked - app should show fallback
+    await expect(page.getByText("Copy share link instead")).toBeVisible();
+  }
+});
+```
+
+## New Tab Navigation
+
+### Link Opens in New Tab
+
+```typescript
+test("external link opens in new tab", async ({ page, context }) => {
+  await page.goto("/resources");
+
+  // Wait for new page in context
+  const pagePromise = context.waitForEvent("page");
+  await page.getByRole("link", { name: "Documentation" }).click();
+  const newPage = await pagePromise;
+
+  await newPage.waitForLoadState();
+
+  expect(newPage.url()).toContain("docs.example.com");
+  await expect(newPage.getByRole("heading", { level: 1 })).toBeVisible();
+
+  // Original page still there
+  expect(page.url()).toContain("/resources");
+
+  await newPage.close();
+});
+```
+
+### Intercept New Tab
+
+```typescript
+test("prevent new tab for testing", async ({ page }) => {
+  await page.goto("/links");
+
+  // Remove target="_blank" to keep navigation in same tab
+  await page.evaluate(() => {
+    document.querySelectorAll('a[target="_blank"]').forEach((a) => {
+      a.removeAttribute("target");
+    });
+  });
+
+  // Now link opens in same tab
+  await page.getByRole("link", { name: "External Site" }).click();
+
+  // Can test the destination page
+  await expect(page).toHaveURL(/external-site\.com/);
+});
+```
+
+## OAuth Flows
+
+### Google OAuth Popup
+
+```typescript
+test("Google OAuth login", async ({ page }) => {
+  await page.goto("/login");
+
+  const popupPromise = page.waitForEvent("popup");
+  await page.getByRole("button", { name: "Sign in with Google" }).click();
+  const popup = await popupPromise;
+
+  await popup.waitForLoadState();
+
+  // Handle Google's OAuth flow
+  await popup.getByLabel("Email or phone").fill("test@gmail.com");
+  await popup.getByRole("button", { name: "Next" }).click();
+
+  await popup.getByLabel("Enter your password").fill("password");
+  await popup.getByRole("button", { name: "Next" }).click();
+
+  // Wait for redirect back and popup close
+  await popup.waitForEvent("close");
+
+  // Verify logged in on main page
+  await expect(page.getByText("Welcome, Test User")).toBeVisible();
+});
+```
+
+### Mock OAuth (Recommended)
+
+```typescript
+test("mock OAuth flow", async ({ page, context }) => {
+  // Mock the OAuth callback instead of real flow
+  await page.route("**/auth/callback**", async (route) => {
+    // Simulate successful OAuth
+    const url = new URL(route.request().url());
+    url.searchParams.set("code", "mock-auth-code");
+    await route.fulfill({
+      status: 302,
+      headers: { Location: "/dashboard" },
+    });
+  });
+
+  // Mock token exchange
+  await page.route("**/api/auth/token", (route) =>
+    route.fulfill({
+      json: {
+        access_token: "mock-token",
+        user: { name: "Test User", email: "test@example.com" },
+      },
+    }),
+  );
+
+  await page.goto("/login");
+  await page.getByRole("button", { name: "Sign in with Google" }).click();
+
+  // Should redirect to dashboard without actual OAuth
+  await expect(page).toHaveURL("/dashboard");
+  await expect(page.getByText("Welcome, Test User")).toBeVisible();
+});
+```
+
+### OAuth Fixture
+
+> **For comprehensive OAuth mocking patterns** (fixtures, multiple providers, SAML SSO), see [third-party.md](third-party.md#oauthsso-mocking). This section focuses on popup window handling mechanics for OAuth flows.
+
+## Multiple Windows
+
+### Test Across Multiple Windows
+
+```typescript
+test("sync between windows", async ({ context }) => {
+  // Open two pages
+  const page1 = await context.newPage();
+  const page2 = await context.newPage();
+
+  await page1.goto("/dashboard");
+  await page2.goto("/dashboard");
+
+  // Make change in first window
+  await page1.getByRole("button", { name: "Add Item" }).click();
+  await page1.getByLabel("Name").fill("New Item");
+  await page1.getByRole("button", { name: "Save" }).click();
+
+  // Should sync to second window (if app supports real-time sync)
+  await expect(page2.getByText("New Item")).toBeVisible({ timeout: 10000 });
+});
+```
+
+### Different Users in Different Windows
+
+> **For multi-user collaboration patterns** (admin/user interactions, real-time collaboration, role-based testing, concurrent actions), see [multi-user.md](multi-user.md). This file focuses on single-user scenarios with multiple tabs/windows/popups.
+
+## Tab Coordination
+
+### Switch Between Tabs
+
+```typescript
+test("manage multiple tabs", async ({ context }) => {
+  const page1 = await context.newPage();
+  await page1.goto("/editor");
+
+  const page2 = await context.newPage();
+  await page2.goto("/preview");
+
+  // Edit in first tab
+  await page1.bringToFront();
+  await page1.getByLabel("Content").fill("Hello World");
+
+  // Check preview in second tab
+  await page2.bringToFront();
+  await page2.reload(); // If preview needs refresh
+  await expect(page2.getByText("Hello World")).toBeVisible();
+});
+```
+
+### Close All Tabs Except One
+
+```typescript
+test("cleanup tabs after test", async ({ context }) => {
+  const mainPage = await context.newPage();
+  await mainPage.goto("/");
+
+  // Open several popups during test
+  for (let i = 0; i < 3; i++) {
+    const popup = await context.newPage();
+    await popup.goto(`/popup/${i}`);
+  }
+
+  // Close all except main page
+  for (const page of context.pages()) {
+    if (page !== mainPage) {
+      await page.close();
+    }
+  }
+
+  expect(context.pages()).toHaveLength(1);
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern            | Problem                        | Solution                                   |
+| ----------------------- | ------------------------------ | ------------------------------------------ |
+| Not waiting for popup   | Race condition                 | Use `waitForEvent("popup")` before trigger |
+| Testing real OAuth      | Slow, flaky, needs credentials | Mock OAuth endpoints                       |
+| Assuming popup opens    | May be blocked                 | Handle both open and blocked cases         |
+| Not closing extra pages | Resource leak                  | Close pages in cleanup                     |
+
+## Related References
+
+- **Authentication**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for auth patterns
+- **Network**: See [network-advanced.md](network-advanced.md) for mocking OAuth
diff --git a/.cursor/skills/playwright-testing/advanced/multi-user.md b/.cursor/skills/playwright-testing/advanced/multi-user.md
new file mode 100644
index 0000000000..301e55c4fe
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/multi-user.md
@@ -0,0 +1,393 @@
+# Multi-User & Collaboration Testing
+
+## Table of Contents
+
+1. [Multiple Browser Contexts](#multiple-browser-contexts)
+2. [Real-Time Collaboration](#real-time-collaboration)
+3. [Role-Based Testing](#role-based-testing)
+4. [Concurrent Actions](#concurrent-actions)
+5. [Chat & Messaging](#chat--messaging)
+
+## Multiple Browser Contexts
+
+### Two Users in Same Test
+
+```typescript
+test("two users see each other's changes", async ({ browser }) => {
+  // Create two isolated contexts (like two browsers)
+  const userAContext = await browser.newContext();
+  const userBContext = await browser.newContext();
+
+  const userAPage = await userAContext.newPage();
+  const userBPage = await userBContext.newPage();
+
+  // Both users go to the same document
+  await userAPage.goto("/doc/shared-123");
+  await userBPage.goto("/doc/shared-123");
+
+  // User A types
+  await userAPage.getByLabel("Content").fill("Hello from User A");
+
+  // User B should see the change
+  await expect(userBPage.getByText("Hello from User A")).toBeVisible();
+
+  // Cleanup
+  await userAContext.close();
+  await userBContext.close();
+});
+```
+
+### Multiple Users with Auth States
+
+```typescript
+test("admin and user interaction", async ({ browser }) => {
+  // Load different auth states
+  const adminContext = await browser.newContext({
+    storageState: ".auth/admin.json",
+  });
+  const userContext = await browser.newContext({
+    storageState: ".auth/user.json",
+  });
+
+  const adminPage = await adminContext.newPage();
+  const userPage = await userContext.newPage();
+
+  // User submits request
+  await userPage.goto("/support");
+  await userPage.getByLabel("Message").fill("Need help!");
+  await userPage.getByRole("button", { name: "Submit" }).click();
+
+  // Admin sees and responds
+  await adminPage.goto("/admin/tickets");
+  await expect(adminPage.getByText("Need help!")).toBeVisible();
+  await adminPage.getByRole("button", { name: "Reply" }).click();
+  await adminPage.getByLabel("Response").fill("How can I help?");
+  await adminPage.getByRole("button", { name: "Send" }).click();
+
+  // User sees response
+  await expect(userPage.getByText("How can I help?")).toBeVisible();
+
+  await adminContext.close();
+  await userContext.close();
+});
+```
+
+### Multi-User Fixture
+
+```typescript
+// fixtures/multi-user.fixture.ts
+import { test as base, Browser, BrowserContext, Page } from "@playwright/test";
+
+type UserSession = {
+  context: BrowserContext;
+  page: Page;
+};
+
+type MultiUserFixtures = {
+  createUser: (authState?: string) => Promise<UserSession>;
+};
+
+export const test = base.extend<MultiUserFixtures>({
+  createUser: async ({ browser }, use) => {
+    const sessions: UserSession[] = [];
+
+    await use(async (authState) => {
+      const context = await browser.newContext({
+        storageState: authState,
+      });
+      const page = await context.newPage();
+      sessions.push({ context, page });
+      return { context, page };
+    });
+
+    // Cleanup all sessions
+    for (const session of sessions) {
+      await session.context.close();
+    }
+  },
+});
+
+// Usage
+test("3 users collaborate", async ({ createUser }) => {
+  const alice = await createUser(".auth/alice.json");
+  const bob = await createUser(".auth/bob.json");
+  const charlie = await createUser(".auth/charlie.json");
+
+  // All navigate to same room
+  await alice.page.goto("/room/123");
+  await bob.page.goto("/room/123");
+  await charlie.page.goto("/room/123");
+
+  // Test interactions...
+});
+```
+
+## Real-Time Collaboration
+
+### Collaborative Document
+
+```typescript
+test("real-time collaborative editing", async ({ browser }) => {
+  const user1 = await browser.newContext();
+  const user2 = await browser.newContext();
+
+  const page1 = await user1.newPage();
+  const page2 = await user2.newPage();
+
+  await page1.goto("/docs/shared");
+  await page2.goto("/docs/shared");
+
+  // User 1 types at the beginning
+  const editor1 = page1.getByRole("textbox");
+  await editor1.click();
+  await editor1.press("Home");
+  await editor1.type("User 1: ");
+
+  // User 2 types at the end
+  const editor2 = page2.getByRole("textbox");
+  await editor2.click();
+  await editor2.press("End");
+  await editor2.type(" - User 2");
+
+  // Both should see combined result
+  await expect(page1.getByRole("textbox")).toContainText("User 1:");
+  await expect(page1.getByRole("textbox")).toContainText("- User 2");
+  await expect(page2.getByRole("textbox")).toContainText("User 1:");
+  await expect(page2.getByRole("textbox")).toContainText("- User 2");
+
+  await user1.close();
+  await user2.close();
+});
+```
+
+### Cursor Presence
+
+```typescript
+test("shows other user cursors", async ({ browser }) => {
+  const ctx1 = await browser.newContext();
+  const ctx2 = await browser.newContext();
+
+  const page1 = await ctx1.newPage();
+  const page2 = await ctx2.newPage();
+
+  // Mock to identify users
+  await page1.route("**/api/me", (route) =>
+    route.fulfill({ json: { id: "user-1", name: "Alice" } }),
+  );
+  await page2.route("**/api/me", (route) =>
+    route.fulfill({ json: { id: "user-2", name: "Bob" } }),
+  );
+
+  await page1.goto("/whiteboard/123");
+  await page2.goto("/whiteboard/123");
+
+  // Move cursor on page1
+  await page1.mouse.move(200, 200);
+
+  // Page2 should see Alice's cursor
+  await expect(page2.getByTestId("cursor-user-1")).toBeVisible();
+  await expect(page2.getByText("Alice")).toBeVisible();
+
+  await ctx1.close();
+  await ctx2.close();
+});
+```
+
+## Role-Based Testing
+
+### Test RBAC
+
+```typescript
+const roles = [
+  { role: "admin", canDelete: true, canEdit: true, canView: true },
+  { role: "editor", canDelete: false, canEdit: true, canView: true },
+  { role: "viewer", canDelete: false, canEdit: false, canView: true },
+];
+
+for (const { role, canDelete, canEdit, canView } of roles) {
+  test(`${role} permissions`, async ({ browser }) => {
+    const context = await browser.newContext({
+      storageState: `.auth/${role}.json`,
+    });
+    const page = await context.newPage();
+
+    await page.goto("/document/123");
+
+    // Check view permission
+    if (canView) {
+      await expect(page.getByTestId("content")).toBeVisible();
+    } else {
+      await expect(page.getByText("Access denied")).toBeVisible();
+    }
+
+    // Check edit permission
+    const editButton = page.getByRole("button", { name: "Edit" });
+    if (canEdit) {
+      await expect(editButton).toBeEnabled();
+    } else {
+      await expect(editButton).toBeDisabled();
+    }
+
+    // Check delete permission
+    const deleteButton = page.getByRole("button", { name: "Delete" });
+    if (canDelete) {
+      await expect(deleteButton).toBeVisible();
+    } else {
+      await expect(deleteButton).toBeHidden();
+    }
+
+    await context.close();
+  });
+}
+```
+
+### Permission Escalation Test
+
+```typescript
+test("cannot access admin routes as user", async ({ browser }) => {
+  const userContext = await browser.newContext({
+    storageState: ".auth/user.json",
+  });
+  const page = await userContext.newPage();
+
+  // Try to access admin page directly
+  await page.goto("/admin/users");
+
+  // Should redirect or show error
+  await expect(page).not.toHaveURL("/admin/users");
+  await expect(page.getByText("Access denied")).toBeVisible();
+
+  await userContext.close();
+});
+```
+
+## Concurrent Actions
+
+### Race Condition Testing
+
+```typescript
+test("handles concurrent edits", async ({ browser }) => {
+  const ctx1 = await browser.newContext();
+  const ctx2 = await browser.newContext();
+
+  const page1 = await ctx1.newPage();
+  const page2 = await ctx2.newPage();
+
+  await page1.goto("/item/123");
+  await page2.goto("/item/123");
+
+  // Both click edit at the same time
+  await Promise.all([
+    page1.getByRole("button", { name: "Edit" }).click(),
+    page2.getByRole("button", { name: "Edit" }).click(),
+  ]);
+
+  // Both try to save different values
+  await page1.getByLabel("Name").fill("Value from User 1");
+  await page2.getByLabel("Name").fill("Value from User 2");
+
+  await Promise.all([
+    page1.getByRole("button", { name: "Save" }).click(),
+    page2.getByRole("button", { name: "Save" }).click(),
+  ]);
+
+  // One should succeed, one should get conflict error
+  const page1HasConflict = await page1.getByText("Conflict").isVisible();
+  const page2HasConflict = await page2.getByText("Conflict").isVisible();
+
+  // Exactly one should have conflict
+  expect(page1HasConflict || page2HasConflict).toBe(true);
+  expect(page1HasConflict && page2HasConflict).toBe(false);
+
+  await ctx1.close();
+  await ctx2.close();
+});
+```
+
+### Optimistic Locking Test
+
+```typescript
+test("optimistic locking prevents overwrites", async ({ browser }) => {
+  const ctx1 = await browser.newContext();
+  const ctx2 = await browser.newContext();
+
+  const page1 = await ctx1.newPage();
+  const page2 = await ctx2.newPage();
+
+  // Both load the same version
+  await page1.goto("/record/123");
+  await page2.goto("/record/123");
+
+  // User 1 edits and saves first
+  await page1.getByRole("button", { name: "Edit" }).click();
+  await page1.getByLabel("Value").fill("Updated by User 1");
+  await page1.getByRole("button", { name: "Save" }).click();
+  await expect(page1.getByText("Saved")).toBeVisible();
+
+  // User 2 tries to save with stale version
+  await page2.getByRole("button", { name: "Edit" }).click();
+  await page2.getByLabel("Value").fill("Updated by User 2");
+  await page2.getByRole("button", { name: "Save" }).click();
+
+  // Should fail with version conflict
+  await expect(page2.getByText("Someone else modified this")).toBeVisible();
+  await expect(page2.getByRole("button", { name: "Reload" })).toBeVisible();
+
+  await ctx1.close();
+  await ctx2.close();
+});
+```
+
+## Chat & Messaging
+
+### Real-Time Chat
+
+```typescript
+test("chat messages sync between users", async ({ browser }) => {
+  const aliceCtx = await browser.newContext();
+  const bobCtx = await browser.newContext();
+
+  const alicePage = await aliceCtx.newPage();
+  const bobPage = await bobCtx.newPage();
+
+  // Setup user identities
+  await alicePage.route("**/api/me", (r) =>
+    r.fulfill({ json: { name: "Alice" } }),
+  );
+  await bobPage.route("**/api/me", (r) => r.fulfill({ json: { name: "Bob" } }));
+
+  await alicePage.goto("/chat/room-1");
+  await bobPage.goto("/chat/room-1");
+
+  // Alice sends message
+  await alicePage.getByLabel("Message").fill("Hi Bob!");
+  await alicePage.getByRole("button", { name: "Send" }).click();
+
+  // Bob sees it
+  await expect(bobPage.getByText("Alice: Hi Bob!")).toBeVisible();
+
+  // Bob replies
+  await bobPage.getByLabel("Message").fill("Hey Alice!");
+  await bobPage.getByRole("button", { name: "Send" }).click();
+
+  // Alice sees it
+  await expect(alicePage.getByText("Bob: Hey Alice!")).toBeVisible();
+
+  await aliceCtx.close();
+  await bobCtx.close();
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                  | Problem                       | Solution                     |
+| ----------------------------- | ----------------------------- | ---------------------------- |
+| Sharing context between users | State leaks, not isolated     | Create separate contexts     |
+| Not closing contexts          | Memory leak, browser overload | Always close in cleanup      |
+| Hardcoded timing for sync     | Flaky tests                   | Use `expect().toBeVisible()` |
+| Testing only single user      | Misses collaboration bugs     | Test multi-user scenarios    |
+
+## Related References
+
+- **Authentication**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for auth setup
+- **WebSockets**: See [websockets.md](../browser-apis/websockets.md) for real-time mocking
diff --git a/.cursor/skills/playwright-testing/advanced/network-advanced.md b/.cursor/skills/playwright-testing/advanced/network-advanced.md
new file mode 100644
index 0000000000..fa017fe53f
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/network-advanced.md
@@ -0,0 +1,452 @@
+# Advanced Network Interception
+
+## Table of Contents
+
+1. [Request Modification](#request-modification)
+2. [GraphQL Mocking](#graphql-mocking)
+3. [HAR Recording & Playback](#har-recording--playback)
+4. [Conditional Mocking](#conditional-mocking)
+5. [Network Throttling](#network-throttling)
+
+## Request Modification
+
+### Modify Request Headers
+
+```typescript
+test("add auth header to requests", async ({ page }) => {
+  await page.route("**/api/**", (route) => {
+    const headers = {
+      ...route.request().headers(),
+      Authorization: "Bearer test-token",
+      "X-Test-Header": "test-value",
+    };
+    route.continue({ headers });
+  });
+
+  await page.goto("/dashboard");
+});
+```
+
+### Modify Request Body
+
+```typescript
+test("modify POST body", async ({ page }) => {
+  await page.route("**/api/orders", async (route) => {
+    if (route.request().method() === "POST") {
+      const postData = route.request().postDataJSON();
+
+      // Add test metadata
+      const modifiedData = {
+        ...postData,
+        testMode: true,
+        testTimestamp: Date.now(),
+      };
+
+      await route.continue({
+        postData: JSON.stringify(modifiedData),
+      });
+    } else {
+      await route.continue();
+    }
+  });
+
+  await page.goto("/checkout");
+  await page.getByRole("button", { name: "Place Order" }).click();
+});
+```
+
+### Transform Response
+
+```typescript
+test("modify API response", async ({ page }) => {
+  await page.route("**/api/products", async (route) => {
+    // Fetch real response
+    const response = await route.fetch();
+    const json = await response.json();
+
+    // Modify response
+    const modified = json.map((product: any) => ({
+      ...product,
+      price: product.price * 0.9, // 10% discount
+      testMode: true,
+    }));
+
+    await route.fulfill({
+      response,
+      json: modified,
+    });
+  });
+
+  await page.goto("/products");
+});
+```
+
+## GraphQL Mocking
+
+### Mock by Operation Name
+
+```typescript
+test("mock GraphQL query", async ({ page }) => {
+  await page.route("**/graphql", async (route) => {
+    const postData = route.request().postDataJSON();
+
+    if (postData.operationName === "GetUser") {
+      return route.fulfill({
+        json: {
+          data: {
+            user: {
+              id: "1",
+              name: "Test User",
+              email: "test@example.com",
+            },
+          },
+        },
+      });
+    }
+
+    if (postData.operationName === "GetProducts") {
+      return route.fulfill({
+        json: {
+          data: {
+            products: [
+              { id: "1", name: "Product A", price: 29.99 },
+              { id: "2", name: "Product B", price: 49.99 },
+            ],
+          },
+        },
+      });
+    }
+
+    // Pass through unmocked operations
+    return route.continue();
+  });
+
+  await page.goto("/dashboard");
+});
+```
+
+### GraphQL Mock Fixture
+
+```typescript
+// fixtures/graphql.fixture.ts
+type GraphQLMock = {
+  operation: string;
+  variables?: Record<string, any>;
+  response: { data?: any; errors?: any[] };
+};
+
+type GraphQLFixtures = {
+  mockGraphQL: (mocks: GraphQLMock[]) => Promise<void>;
+};
+
+export const test = base.extend<GraphQLFixtures>({
+  mockGraphQL: async ({ page }, use) => {
+    await use(async (mocks) => {
+      await page.route("**/graphql", async (route) => {
+        const postData = route.request().postDataJSON();
+
+        const mock = mocks.find((m) => {
+          if (m.operation !== postData.operationName) return false;
+
+          // Optionally match variables
+          if (m.variables) {
+            return (
+              JSON.stringify(m.variables) === JSON.stringify(postData.variables)
+            );
+          }
+          return true;
+        });
+
+        if (mock) {
+          return route.fulfill({ json: mock.response });
+        }
+
+        return route.continue();
+      });
+    });
+  },
+});
+
+// Usage
+test("dashboard with mocked GraphQL", async ({ page, mockGraphQL }) => {
+  await mockGraphQL([
+    {
+      operation: "GetDashboardStats",
+      response: {
+        data: { stats: { users: 100, revenue: 50000 } },
+      },
+    },
+    {
+      operation: "GetUser",
+      variables: { id: "1" },
+      response: {
+        data: { user: { id: "1", name: "John" } },
+      },
+    },
+  ]);
+
+  await page.goto("/dashboard");
+  await expect(page.getByText("100 users")).toBeVisible();
+});
+```
+
+### Mock GraphQL Mutations
+
+```typescript
+test("mock GraphQL mutation", async ({ page }) => {
+  await page.route("**/graphql", async (route) => {
+    const postData = route.request().postDataJSON();
+
+    if (postData.operationName === "CreateOrder") {
+      const { input } = postData.variables;
+
+      return route.fulfill({
+        json: {
+          data: {
+            createOrder: {
+              id: "order-123",
+              status: "PENDING",
+              items: input.items,
+              total: input.items.reduce(
+                (sum: number, item: any) => sum + item.price * item.quantity,
+                0,
+              ),
+            },
+          },
+        },
+      });
+    }
+
+    return route.continue();
+  });
+
+  await page.goto("/checkout");
+  await page.getByRole("button", { name: "Place Order" }).click();
+
+  await expect(page.getByText("Order #order-123")).toBeVisible();
+});
+```
+
+## HAR Recording & Playback
+
+### Record HAR File
+
+```typescript
+// Record network traffic
+test("record HAR", async ({ page, context }) => {
+  // Start recording
+  await context.routeFromHAR("./recordings/checkout.har", {
+    update: true, // Create/update HAR file
+    url: "**/api/**",
+  });
+
+  await page.goto("/checkout");
+  await page.getByRole("button", { name: "Place Order" }).click();
+
+  // HAR file is saved automatically
+});
+```
+
+### Playback HAR File
+
+```typescript
+// Use recorded HAR for offline testing
+test("playback HAR", async ({ page, context }) => {
+  await context.routeFromHAR("./recordings/checkout.har", {
+    url: "**/api/**",
+    update: false, // Don't update, just playback
+  });
+
+  await page.goto("/checkout");
+
+  // All API calls served from HAR file
+  await expect(page.getByText("Order confirmed")).toBeVisible();
+});
+```
+
+### HAR with Fallback
+
+```typescript
+test("HAR with live fallback", async ({ page, context }) => {
+  await context.routeFromHAR("./recordings/api.har", {
+    url: "**/api/**",
+    update: false,
+    notFound: "fallback", // Use real network if not in HAR
+  });
+
+  await page.goto("/dashboard");
+});
+```
+
+## Conditional Mocking
+
+### Mock Based on Request Body
+
+```typescript
+test("conditional mock by body", async ({ page }) => {
+  await page.route("**/api/search", async (route) => {
+    const body = route.request().postDataJSON();
+
+    if (body.query === "error") {
+      return route.fulfill({
+        status: 500,
+        json: { error: "Search failed" },
+      });
+    }
+
+    if (body.query === "empty") {
+      return route.fulfill({
+        json: { results: [] },
+      });
+    }
+
+    // Default response
+    return route.fulfill({
+      json: {
+        results: [{ id: 1, title: `Result for: ${body.query}` }],
+      },
+    });
+  });
+
+  await page.goto("/search");
+
+  // Test different scenarios
+  await page.getByLabel("Search").fill("error");
+  await page.getByLabel("Search").press("Enter");
+  await expect(page.getByText("Search failed")).toBeVisible();
+});
+```
+
+### Mock Nth Request
+
+```typescript
+test("different response on retry", async ({ page }) => {
+  let callCount = 0;
+
+  await page.route("**/api/status", (route) => {
+    callCount++;
+
+    if (callCount < 3) {
+      return route.fulfill({
+        status: 503,
+        json: { error: "Service unavailable" },
+      });
+    }
+
+    // Succeed on 3rd attempt
+    return route.fulfill({
+      json: { status: "ok" },
+    });
+  });
+
+  await page.goto("/dashboard");
+
+  // App should retry and eventually succeed
+  await expect(page.getByText("Connected")).toBeVisible();
+});
+```
+
+### Mock with Delay
+
+```typescript
+test("slow network simulation", async ({ page }) => {
+  await page.route("**/api/data", async (route) => {
+    // Simulate 2 second delay
+    await new Promise((resolve) => setTimeout(resolve, 2000));
+
+    return route.fulfill({
+      json: { data: "loaded" },
+    });
+  });
+
+  await page.goto("/dashboard");
+
+  // Loading state should appear
+  await expect(page.getByText("Loading...")).toBeVisible();
+
+  // Then data appears
+  await expect(page.getByText("loaded")).toBeVisible();
+});
+```
+
+## Network Throttling
+
+### Slow 3G Simulation
+
+```typescript
+test("slow network experience", async ({ page, context }) => {
+  // Create CDP session for network throttling
+  const client = await context.newCDPSession(page);
+
+  await client.send("Network.emulateNetworkConditions", {
+    offline: false,
+    downloadThroughput: (500 * 1024) / 8, // 500 Kbps
+    uploadThroughput: (500 * 1024) / 8,
+    latency: 400, // 400ms
+  });
+
+  await page.goto("/");
+
+  // Test loading states appear
+  await expect(page.getByTestId("skeleton-loader")).toBeVisible();
+});
+```
+
+### Offline Mode
+
+Use `context.setOffline(true/false)` to simulate network connectivity changes.
+
+> **For comprehensive offline testing patterns:**
+>
+> - **Network failure simulation** (error recovery, graceful degradation): See [error-testing.md](error-testing.md#offline-testing)
+> - **Offline-first/PWA testing** (service workers, caching, background sync): See [service-workers.md](service-workers.md#offline-testing)
+
+### Network Throttling Fixture
+
+```typescript
+// fixtures/network.fixture.ts
+type NetworkCondition = "slow3g" | "fast3g" | "offline";
+
+const conditions = {
+  slow3g: { downloadThroughput: 50000, uploadThroughput: 50000, latency: 2000 },
+  fast3g: { downloadThroughput: 180000, uploadThroughput: 75000, latency: 150 },
+};
+
+type NetworkFixtures = {
+  setNetworkCondition: (condition: NetworkCondition) => Promise<void>;
+};
+
+export const test = base.extend<NetworkFixtures>({
+  setNetworkCondition: async ({ page, context }, use) => {
+    const client = await context.newCDPSession(page);
+
+    await use(async (condition) => {
+      if (condition === "offline") {
+        await context.setOffline(true);
+      } else {
+        await client.send("Network.emulateNetworkConditions", {
+          offline: false,
+          ...conditions[condition],
+        });
+      }
+    });
+
+    // Reset
+    await context.setOffline(false);
+  },
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern             | Problem                        | Solution                         |
+| ------------------------ | ------------------------------ | -------------------------------- |
+| Mocking all requests     | Tests don't reflect reality    | Mock only what's necessary       |
+| No cleanup of routes     | Routes persist across tests    | Use fixtures with cleanup        |
+| Ignoring request method  | Mock applies to wrong requests | Check `route.request().method()` |
+| Hardcoded mock responses | Brittle, hard to maintain      | Use factories for mock data      |
+
+## Related References
+
+- **Basic Mocking**: See [test-suite-structure.md](../core/test-suite-structure.md) for simple mocking
+- **WebSockets**: See [websockets.md](../browser-apis/websockets.md) for real-time mocking
diff --git a/.cursor/skills/playwright-testing/advanced/third-party.md b/.cursor/skills/playwright-testing/advanced/third-party.md
new file mode 100644
index 0000000000..acf8ab8901
--- /dev/null
+++ b/.cursor/skills/playwright-testing/advanced/third-party.md
@@ -0,0 +1,464 @@
+# Third-Party Service Mocking
+
+## Table of Contents
+
+1. [OAuth/SSO Mocking](#oauthsso-mocking)
+2. [Payment Gateway Mocking](#payment-gateway-mocking)
+3. [Email Verification](#email-verification)
+4. [SMS Verification](#sms-verification)
+5. [Analytics & Tracking](#analytics--tracking)
+
+## OAuth/SSO Mocking
+
+### Mock Google OAuth
+
+```typescript
+test("Google OAuth login", async ({ page }) => {
+  // Mock the OAuth callback
+  await page.route("**/auth/google/callback**", (route) => {
+    const url = new URL(route.request().url());
+    // Simulate successful OAuth by redirecting with token
+    route.fulfill({
+      status: 302,
+      headers: {
+        Location: "/dashboard?token=mock-jwt-token",
+      },
+    });
+  });
+
+  // Mock the token verification endpoint
+  await page.route("**/api/auth/verify", (route) =>
+    route.fulfill({
+      json: {
+        valid: true,
+        user: {
+          id: "123",
+          email: "test@gmail.com",
+          name: "Test User",
+        },
+      },
+    }),
+  );
+
+  await page.goto("/login");
+  await page.getByRole("button", { name: "Sign in with Google" }).click();
+
+  await expect(page.getByText("Welcome, Test User")).toBeVisible();
+});
+```
+
+### OAuth Fixture
+
+```typescript
+// fixtures/oauth.fixture.ts
+type OAuthProvider = "google" | "github" | "microsoft";
+
+type OAuthUser = {
+  id: string;
+  email: string;
+  name: string;
+  avatar?: string;
+};
+
+type OAuthFixtures = {
+  mockOAuth: (provider: OAuthProvider, user: OAuthUser) => Promise<void>;
+};
+
+export const test = base.extend<OAuthFixtures>({
+  mockOAuth: async ({ page }, use) => {
+    await use(async (provider, user) => {
+      // Mock callback redirect
+      await page.route(`**/auth/${provider}/callback**`, (route) =>
+        route.fulfill({
+          status: 302,
+          headers: { Location: `/auth/success?provider=${provider}` },
+        }),
+      );
+
+      // Mock session/user endpoint
+      await page.route("**/api/auth/session", (route) =>
+        route.fulfill({
+          json: { user, provider, authenticated: true },
+        }),
+      );
+
+      // Mock user info endpoint
+      await page.route("**/api/me", (route) => route.fulfill({ json: user }));
+    });
+  },
+});
+
+// Usage
+test("login with GitHub", async ({ page, mockOAuth }) => {
+  await mockOAuth("github", {
+    id: "gh-123",
+    email: "dev@github.com",
+    name: "GitHub User",
+  });
+
+  await page.goto("/login");
+  await page.getByRole("button", { name: "Sign in with GitHub" }).click();
+
+  await expect(page.getByText("Welcome, GitHub User")).toBeVisible();
+});
+```
+
+### Mock SAML SSO
+
+```typescript
+test("SAML SSO login", async ({ page }) => {
+  // Mock SAML assertion consumer service
+  await page.route("**/saml/acs", async (route) => {
+    route.fulfill({
+      status: 302,
+      headers: {
+        Location: "/dashboard",
+        "Set-Cookie": "session=mock-saml-session; Path=/; HttpOnly",
+      },
+    });
+  });
+
+  // Mock session validation
+  await page.route("**/api/session", (route) =>
+    route.fulfill({
+      json: {
+        user: { email: "user@company.com", name: "SSO User" },
+        provider: "saml",
+      },
+    }),
+  );
+
+  await page.goto("/login");
+  await page.getByRole("button", { name: "SSO Login" }).click();
+
+  await expect(page).toHaveURL("/dashboard");
+});
+```
+
+## Payment Gateway Mocking
+
+### Mock Stripe
+
+```typescript
+test("Stripe checkout", async ({ page }) => {
+  // Mock Stripe.js
+  await page.addInitScript(() => {
+    (window as any).Stripe = () => ({
+      elements: () => ({
+        create: () => ({
+          mount: () => {},
+          on: () => {},
+          destroy: () => {},
+        }),
+      }),
+      confirmCardPayment: async () => ({
+        paymentIntent: { status: "succeeded", id: "pi_mock_123" },
+      }),
+      createPaymentMethod: async () => ({
+        paymentMethod: { id: "pm_mock_123" },
+      }),
+    });
+  });
+
+  // Mock backend payment endpoint
+  await page.route("**/api/create-payment-intent", (route) =>
+    route.fulfill({
+      json: { clientSecret: "pi_mock_123_secret_mock" },
+    }),
+  );
+
+  await page.route("**/api/confirm-payment", (route) =>
+    route.fulfill({
+      json: { success: true, orderId: "order-123" },
+    }),
+  );
+
+  await page.goto("/checkout");
+  await page.getByRole("button", { name: "Pay $99.99" }).click();
+
+  await expect(page.getByText("Payment successful")).toBeVisible();
+});
+```
+
+### Mock PayPal
+
+```typescript
+test("PayPal checkout", async ({ page }) => {
+  // Mock PayPal SDK
+  await page.addInitScript(() => {
+    (window as any).paypal = {
+      Buttons: () => ({
+        render: () => Promise.resolve(),
+        isEligible: () => true,
+      }),
+      FUNDING: { PAYPAL: "paypal", CARD: "card" },
+    };
+  });
+
+  // Mock PayPal order creation
+  await page.route("**/api/paypal/create-order", (route) =>
+    route.fulfill({
+      json: { orderId: "PAYPAL-ORDER-123" },
+    }),
+  );
+
+  // Mock PayPal capture
+  await page.route("**/api/paypal/capture", (route) =>
+    route.fulfill({
+      json: { success: true, transactionId: "TXN-123" },
+    }),
+  );
+
+  await page.goto("/checkout");
+
+  // Simulate PayPal approval callback
+  await page.evaluate(() => {
+    (window as any).onPayPalApprove?.({ orderID: "PAYPAL-ORDER-123" });
+  });
+
+  await expect(page.getByText("Order confirmed")).toBeVisible();
+});
+```
+
+### Payment Fixture
+
+```typescript
+// fixtures/payment.fixture.ts
+type PaymentFixtures = {
+  mockStripe: (options?: { failPayment?: boolean }) => Promise<void>;
+};
+
+export const test = base.extend<PaymentFixtures>({
+  mockStripe: async ({ page }, use) => {
+    await use(async (options = {}) => {
+      await page.addInitScript(
+        ([shouldFail]) => {
+          (window as any).Stripe = () => ({
+            elements: () => ({
+              create: () => ({
+                mount: () => {},
+                on: (event: string, handler: Function) => {
+                  if (event === "ready") setTimeout(handler, 100);
+                },
+                destroy: () => {},
+              }),
+            }),
+            confirmCardPayment: async () => {
+              if (shouldFail) {
+                return { error: { message: "Card declined" } };
+              }
+              return { paymentIntent: { status: "succeeded" } };
+            },
+          });
+        },
+        [options.failPayment],
+      );
+    });
+  },
+});
+
+// Usage
+test("handles declined card", async ({ page, mockStripe }) => {
+  await mockStripe({ failPayment: true });
+
+  await page.goto("/checkout");
+  await page.getByRole("button", { name: "Pay" }).click();
+
+  await expect(page.getByText("Card declined")).toBeVisible();
+});
+```
+
+## Email Verification
+
+### Mock Email API
+
+```typescript
+test("email verification flow", async ({ page, request }) => {
+  let verificationToken: string;
+
+  // Capture the verification email
+  await page.route("**/api/send-verification", async (route) => {
+    const body = route.request().postDataJSON();
+    verificationToken = `mock-token-${Date.now()}`;
+
+    // Don't actually send email, just store token
+    route.fulfill({
+      json: { sent: true, messageId: "msg-123" },
+    });
+  });
+
+  // Mock token verification
+  await page.route("**/api/verify-email**", (route) => {
+    const url = new URL(route.request().url());
+    const token = url.searchParams.get("token");
+
+    if (token === verificationToken) {
+      route.fulfill({ json: { verified: true } });
+    } else {
+      route.fulfill({ status: 400, json: { error: "Invalid token" } });
+    }
+  });
+
+  await page.goto("/signup");
+  await page.getByLabel("Email").fill("test@example.com");
+  await page.getByRole("button", { name: "Sign Up" }).click();
+
+  await expect(page.getByText("Check your email")).toBeVisible();
+
+  // Simulate clicking email link
+  await page.goto(`/verify?token=${verificationToken}`);
+
+  await expect(page.getByText("Email verified")).toBeVisible();
+});
+```
+
+### Use Mailinator/Temp Mail
+
+```typescript
+// fixtures/email.fixture.ts
+type EmailFixtures = {
+  getVerificationEmail: (inbox: string) => Promise<{ link: string }>;
+};
+
+export const test = base.extend<EmailFixtures>({
+  getVerificationEmail: async ({ request }, use) => {
+    await use(async (inbox) => {
+      // Poll Mailinator API for new email
+      const response = await request.get(
+        `https://api.mailinator.com/v2/domains/public/inboxes/${inbox}`,
+        {
+          headers: {
+            Authorization: `Bearer ${process.env.MAILINATOR_API_KEY}`,
+          },
+        },
+      );
+
+      const messages = await response.json();
+      const latest = messages.msgs[0];
+
+      // Get full message
+      const msgResponse = await request.get(
+        `https://api.mailinator.com/v2/domains/public/inboxes/${inbox}/messages/${latest.id}`,
+        {
+          headers: {
+            Authorization: `Bearer ${process.env.MAILINATOR_API_KEY}`,
+          },
+        },
+      );
+
+      const message = await msgResponse.json();
+
+      // Extract verification link from HTML
+      const linkMatch = message.parts[0].body.match(
+        /href="([^"]*verify[^"]*)"/,
+      );
+      return { link: linkMatch?.[1] || "" };
+    });
+  },
+});
+```
+
+## SMS Verification
+
+### Mock SMS API
+
+```typescript
+test("SMS verification", async ({ page }) => {
+  let smsCode: string;
+
+  // Capture SMS send
+  await page.route("**/api/send-sms", (route) => {
+    smsCode = Math.random().toString().slice(2, 8); // 6-digit code
+
+    route.fulfill({
+      json: { sent: true, messageId: "sms-123" },
+    });
+  });
+
+  // Mock code verification
+  await page.route("**/api/verify-sms", (route) => {
+    const body = route.request().postDataJSON();
+
+    if (body.code === smsCode) {
+      route.fulfill({ json: { verified: true } });
+    } else {
+      route.fulfill({ status: 400, json: { error: "Invalid code" } });
+    }
+  });
+
+  await page.goto("/verify-phone");
+  await page.getByLabel("Phone").fill("+1234567890");
+  await page.getByRole("button", { name: "Send Code" }).click();
+
+  // Enter the code
+  await page.getByLabel("Verification Code").fill(smsCode);
+  await page.getByRole("button", { name: "Verify" }).click();
+
+  await expect(page.getByText("Phone verified")).toBeVisible();
+});
+```
+
+## Analytics & Tracking
+
+### Block Analytics in Tests
+
+```typescript
+test.beforeEach(async ({ page }) => {
+  // Block all analytics/tracking
+  await page.route(
+    /google-analytics|googletagmanager|facebook|hotjar|segment|mixpanel|amplitude/,
+    (route) => route.abort(),
+  );
+});
+```
+
+### Mock Analytics for Verification
+
+```typescript
+test("tracks purchase event", async ({ page }) => {
+  const analyticsEvents: any[] = [];
+
+  // Capture analytics calls
+  await page.route("**/api/analytics/**", (route) => {
+    analyticsEvents.push(route.request().postDataJSON());
+    route.fulfill({ status: 200 });
+  });
+
+  // Mock analytics SDK
+  await page.addInitScript(() => {
+    (window as any).analytics = {
+      track: (event: string, props: any) => {
+        fetch("/api/analytics/track", {
+          method: "POST",
+          body: JSON.stringify({ event, props }),
+        });
+      },
+    };
+  });
+
+  await page.goto("/checkout");
+  await page.getByRole("button", { name: "Complete Purchase" }).click();
+
+  // Verify analytics event was sent
+  expect(analyticsEvents).toContainEqual(
+    expect.objectContaining({
+      event: "Purchase Completed",
+      props: expect.objectContaining({ amount: expect.any(Number) }),
+    }),
+  );
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern              | Problem                        | Solution                |
+| ------------------------- | ------------------------------ | ----------------------- |
+| Using real OAuth in tests | Slow, needs credentials, flaky | Mock OAuth endpoints    |
+| Real payment processing   | Charges real money, slow       | Use test mode or mock   |
+| Waiting for real emails   | Very slow, unreliable          | Mock email API          |
+| Not mocking analytics     | Pollutes analytics data        | Block or mock analytics |
+
+## Related References
+
+- **Network Mocking**: See [network-advanced.md](network-advanced.md) for route patterns
+- **Authentication**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for auth patterns
diff --git a/.cursor/skills/playwright-testing/architecture/pom-vs-fixtures.md b/.cursor/skills/playwright-testing/architecture/pom-vs-fixtures.md
new file mode 100644
index 0000000000..eafb06f767
--- /dev/null
+++ b/.cursor/skills/playwright-testing/architecture/pom-vs-fixtures.md
@@ -0,0 +1,363 @@
+# Organizing Reusable Test Code
+
+## Table of Contents
+
+1. [Pattern Comparison](#pattern-comparison)
+2. [Selection Flowchart](#selection-flowchart)
+3. [Page Objects](#page-objects)
+4. [Custom Fixtures](#custom-fixtures)
+5. [Helper Functions](#helper-functions)
+6. [Combined Project Structure](#combined-project-structure)
+7. [Anti-Patterns](#anti-patterns)
+
+Use all three patterns together. Most projects benefit from a hybrid approach:
+
+- **Page objects** for UI interaction (pages/components with 5+ interactions)
+- **Custom fixtures** for test infrastructure (auth state, database, API clients, anything with lifecycle)
+- **Helper functions** for stateless utilities (generate data, format values, simple waits)
+
+If only using one pattern, choose **custom fixtures** — they handle setup/teardown, compose well, and Playwright is built around them.
+
+## Pattern Comparison
+
+| Aspect | Page Objects | Custom Fixtures | Helper Functions |
+|---|---|---|---|
+| **Purpose** | Encapsulate UI interactions | Provide resources with setup/teardown | Stateless utilities |
+| **Lifecycle** | Manual (constructor/methods) | Built-in (`use()` with automatic teardown) | None |
+| **Composability** | Constructor injection or fixture wiring | Depend on other fixtures | Call other functions |
+| **Best for** | Pages with many reused interactions | Resources needing setup AND teardown | Simple logic with no side effects |
+
+## Selection Flowchart
+
+```text
+What kind of reusable code?
+|
++-- Interacts with browser page/component?
+|   |
+|   +-- Has 5+ interactions (fill, click, navigate, assert)?
+|   |   +-- YES: Used in 3+ test files?
+|   |   |   +-- YES --> PAGE OBJECT
+|   |   |   +-- NO --> Inline or small helper
+|   |   +-- NO --> HELPER FUNCTION
+|   |
+|   +-- Needs setup before AND cleanup after test?
+|       +-- YES --> CUSTOM FIXTURE
+|       +-- NO --> PAGE OBJECT method or HELPER
+|
++-- Manages resource with lifecycle (create/destroy)?
+|   +-- Examples: auth state, DB connection, API client, test user
+|   +-- YES --> CUSTOM FIXTURE (always)
+|
++-- Stateless utility? (no browser, no side effects)
+|   +-- Examples: random email, format date, build URL, parse response
+|   +-- YES --> HELPER FUNCTION
+|
++-- Not sure?
+    +-- Start with HELPER FUNCTION
+    +-- Promote to PAGE OBJECT when interactions grow
+    +-- Promote to FIXTURE when lifecycle needed
+```
+
+## Page Objects
+
+Best for pages/components with 5+ interactions appearing in 3+ test files.
+
+```typescript
+// page-objects/booking.page.ts
+import { type Page, type Locator, expect } from '@playwright/test';
+
+export class BookingPage {
+  readonly page: Page;
+  readonly dateField: Locator;
+  readonly guestCount: Locator;
+  readonly roomType: Locator;
+  readonly reserveBtn: Locator;
+  readonly totalPrice: Locator;
+
+  constructor(page: Page) {
+    this.page = page;
+    this.dateField = page.getByLabel('Check-in date');
+    this.guestCount = page.getByLabel('Guests');
+    this.roomType = page.getByLabel('Room type');
+    this.reserveBtn = page.getByRole('button', { name: 'Reserve' });
+    this.totalPrice = page.getByTestId('total-price');
+  }
+
+  async goto() {
+    await this.page.goto('/booking');
+  }
+
+  async fillDetails(opts: { date: string; guests: number; room: string }) {
+    await this.dateField.fill(opts.date);
+    await this.guestCount.fill(String(opts.guests));
+    await this.roomType.selectOption(opts.room);
+  }
+
+  async reserve() {
+    await this.reserveBtn.click();
+    await this.page.waitForURL('**/confirmation');
+  }
+
+  async expectPrice(amount: string) {
+    await expect(this.totalPrice).toHaveText(amount);
+  }
+}
+```
+
+```typescript
+// tests/booking/reservation.spec.ts
+import { test, expect } from '@playwright/test';
+import { BookingPage } from '../page-objects/booking.page';
+
+test('complete reservation with standard room', async ({ page }) => {
+  const booking = new BookingPage(page);
+  await booking.goto();
+  await booking.fillDetails({ date: '2026-03-15', guests: 2, room: 'standard' });
+  await booking.reserve();
+  await expect(page.getByText('Reservation confirmed')).toBeVisible();
+});
+```
+
+**Page object principles:**
+- One class per logical page/component, not per URL
+- Constructor takes `Page`
+- Locators as `readonly` properties in constructor
+- Methods represent user intent (`reserve`, `fillDetails`), not low-level clicks
+- Navigation methods (`goto`) belong on the page object
+
+## Custom Fixtures
+
+Best for resources needing setup before and teardown after tests — auth state, database connections, API clients, test users.
+
+```typescript
+// fixtures/base.fixture.ts
+import { test as base, expect } from '@playwright/test';
+import { BookingPage } from '../page-objects/booking.page';
+import { generateMember } from '../helpers/data';
+
+type Fixtures = {
+  bookingPage: BookingPage;
+  member: { email: string; password: string; id: string };
+  loggedInPage: import('@playwright/test').Page;
+};
+
+export const test = base.extend<Fixtures>({
+  bookingPage: async ({ page }, use) => {
+    await use(new BookingPage(page));
+  },
+
+  member: async ({ request }, use) => {
+    const data = generateMember();
+    const res = await request.post('/api/test/members', { data });
+    const member = await res.json();
+    await use(member);
+    await request.delete(`/api/test/members/${member.id}`);
+  },
+
+  loggedInPage: async ({ page, member }, use) => {
+    await page.goto('/login');
+    await page.getByLabel('Email').fill(member.email);
+    await page.getByLabel('Password').fill(member.password);
+    await page.getByRole('button', { name: 'Sign in' }).click();
+    await expect(page).toHaveURL('/dashboard');
+    await use(page);
+  },
+});
+
+export { expect } from '@playwright/test';
+```
+
+```typescript
+// tests/dashboard/overview.spec.ts
+import { test, expect } from '../../fixtures/base.fixture';
+
+test('member sees dashboard widgets', async ({ loggedInPage }) => {
+  await expect(loggedInPage.getByRole('heading', { name: 'Dashboard' })).toBeVisible();
+  await expect(loggedInPage.getByTestId('stats-widget')).toBeVisible();
+});
+
+test('new member sees welcome prompt', async ({ loggedInPage, member }) => {
+  await expect(loggedInPage.getByText(`Welcome, ${member.email}`)).toBeVisible();
+});
+```
+
+**Fixture principles:**
+- Use `test.extend()` — never module-level variables
+- `use()` callback separates setup from teardown
+- Teardown runs even if test fails
+- Fixtures compose: one can depend on another
+- Fixtures are lazy: created only when requested
+- Wrap page objects in fixtures for lifecycle management
+
+## Helper Functions
+
+Best for stateless utilities — generating test data, formatting values, building URLs, parsing responses.
+
+```typescript
+// helpers/data.ts
+import { randomUUID } from 'node:crypto';
+
+export function generateEmail(prefix = 'user'): string {
+  return `${prefix}-${Date.now()}-${randomUUID().slice(0, 8)}@test.local`;
+}
+
+export function generateMember(overrides: Partial<Member> = {}): Member {
+  return {
+    email: generateEmail(),
+    password: 'SecurePass456!',
+    name: 'Test Member',
+    ...overrides,
+  };
+}
+
+interface Member {
+  email: string;
+  password: string;
+  name: string;
+}
+
+export function formatPrice(cents: number): string {
+  return `$${(cents / 100).toFixed(2)}`;
+}
+```
+
+```typescript
+// helpers/assertions.ts
+import { type Page, expect } from '@playwright/test';
+
+export async function expectNotification(page: Page, message: string): Promise<void> {
+  const notification = page.getByRole('alert').filter({ hasText: message });
+  await expect(notification).toBeVisible();
+  await expect(notification).toBeHidden({ timeout: 10000 });
+}
+```
+
+```typescript
+// tests/settings/account.spec.ts
+import { test, expect } from '@playwright/test';
+import { generateEmail } from '../../helpers/data';
+import { expectNotification } from '../../helpers/assertions';
+
+test('update account email', async ({ page }) => {
+  const newEmail = generateEmail('updated');
+  await page.goto('/settings/account');
+  await page.getByLabel('Email').fill(newEmail);
+  await page.getByRole('button', { name: 'Save' }).click();
+  await expectNotification(page, 'Account updated');
+  await expect(page.getByLabel('Email')).toHaveValue(newEmail);
+});
+```
+
+**Helper principles:**
+- Pure functions with no side effects
+- No browser state — take `page` as parameter if needed
+- Promote to fixture if setup/teardown needed
+- Promote to page object if many page interactions grow
+- Keep small and focused
+
+## Combined Project Structure
+
+```text
+tests/
++-- fixtures/
+|   +-- auth.fixture.ts
+|   +-- db.fixture.ts
+|   +-- base.fixture.ts
++-- page-objects/
+|   +-- login.page.ts
+|   +-- booking.page.ts
+|   +-- components/
+|       +-- data-table.component.ts
++-- helpers/
+|   +-- data.ts
+|   +-- assertions.ts
++-- e2e/
+|   +-- auth/
+|   |   +-- login.spec.ts
+|   +-- booking/
+|       +-- reservation.spec.ts
+playwright.config.ts
+```
+
+**Layer responsibilities:**
+
+| Layer | Pattern | Responsibility |
+|---|---|---|
+| **Test file** | `test()` | Describes behavior, orchestrates layers |
+| **Fixtures** | `test.extend()` | Resource lifecycle — setup, provide, teardown |
+| **Page objects** | Classes | UI interaction — navigation, actions, locators |
+| **Helpers** | Functions | Utilities — data generation, formatting, assertions |
+
+## Anti-Patterns
+
+### Page object managing resources
+
+```typescript
+// BAD: page object handling API calls and database
+class LoginPage {
+  async createUser() { /* API call */ }
+  async deleteUser() { /* API call */ }
+  async signIn(email: string, password: string) { /* UI */ }
+}
+```
+
+Resource lifecycle belongs in fixtures where teardown is guaranteed. Keep only `signIn` in the page object.
+
+### Locator-only page objects
+
+```typescript
+// BAD: no methods, just locators
+class LoginPage {
+  emailInput = this.page.getByLabel('Email');
+  passwordInput = this.page.getByLabel('Password');
+  submitBtn = this.page.getByRole('button', { name: 'Sign in' });
+  constructor(private page: Page) {}
+}
+```
+
+Add intent-revealing methods or skip the page object entirely.
+
+### Monolithic fixtures
+
+```typescript
+// BAD: one fixture doing everything
+test.extend({
+  everything: async ({ page, request }, use) => {
+    const user = await createUser(request);
+    const products = await seedProducts(request, 50);
+    await setupPayment(request, user.id);
+    await page.goto('/dashboard');
+    await use({ user, products, page });
+    // massive teardown...
+  },
+});
+```
+
+Break into small, composable fixtures. Each fixture does one thing.
+
+### Helpers with side effects
+
+```typescript
+// BAD: module-level state
+let createdUserId: string;
+
+export async function createTestUser(request: APIRequestContext) {
+  const res = await request.post('/api/users', { data: { email: 'test@example.com' } });
+  const user = await res.json();
+  createdUserId = user.id; // shared across tests!
+  return user;
+}
+```
+
+Module-level state leaks between parallel tests. If it has side effects and needs cleanup, make it a fixture.
+
+### Over-abstracting simple operations
+
+```typescript
+// BAD: helper for one-liner
+export async function clickButton(page: Page, name: string) {
+  await page.getByRole('button', { name }).click();
+}
+```
+
+Only abstract when there is real duplication (3+ usages) or complexity (5+ interactions).
diff --git a/.cursor/skills/playwright-testing/architecture/test-architecture.md b/.cursor/skills/playwright-testing/architecture/test-architecture.md
new file mode 100644
index 0000000000..28b6f6c9c5
--- /dev/null
+++ b/.cursor/skills/playwright-testing/architecture/test-architecture.md
@@ -0,0 +1,369 @@
+# Choosing Test Types: E2E, Component, or API
+
+## Table of Contents
+
+1. [Decision Matrix](#decision-matrix)
+2. [API Tests](#api-tests)
+3. [Component Tests](#component-tests)
+4. [E2E Tests](#e2e-tests)
+5. [Layering Test Types](#layering-test-types)
+6. [Common Mistakes](#common-mistakes)
+7. [Related](#related)
+
+> **When to use**: Deciding which test type to write for a feature. Ask: "What's the cheapest test that gives confidence this works?"
+
+## Decision Matrix
+
+| Scenario                    | Recommended Type | Rationale                                     |
+| --------------------------- | ---------------- | --------------------------------------------- |
+| Login / auth flow           | E2E              | Cross-page, cookies, redirects, session state |
+| Form submission             | Component        | Isolated validation logic, error states       |
+| CRUD operations             | API              | Data integrity matters more than UI           |
+| Search with results UI      | Component + API  | API for query logic; component for rendering  |
+| Cross-page navigation       | E2E              | Routing, history, deep linking                |
+| API error handling          | API              | Status codes, error shapes, edge cases        |
+| UI error feedback           | Component        | Toast, banner, inline error rendering         |
+| Accessibility               | Component        | ARIA roles, keyboard nav per-component        |
+| Responsive layout           | Component        | Viewport-specific rendering without full app  |
+| API contract validation     | API              | Response shapes, headers, auth                |
+| WebSocket/real-time         | E2E              | Requires full browser environment             |
+| Payment / checkout          | E2E              | Multi-step, third-party iframes               |
+| Onboarding wizard           | E2E              | Multi-step, state persists across pages       |
+| Widget behavior             | Component        | Toggle, accordion, date picker, modal         |
+| Permissions / authorization | API              | Role-based access is backend logic            |
+
+## API Tests
+
+**Ideal for**:
+
+- CRUD operations (create, read, update, delete)
+- Input validation and error responses (400, 422)
+- Permission and authorization checks
+- Data integrity and business rules
+- API contract verification
+- Edge cases expensive to reproduce through UI
+- Test data setup/teardown for E2E tests
+
+**Avoid for**:
+
+- Testing how errors display to users
+- Browser-specific behavior (cookies, redirects)
+- Visual layout or responsive design
+- Flows requiring JavaScript execution or DOM interaction
+- Third-party iframe interactions
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+test.describe("Products API", () => {
+  let token: string;
+
+  test.beforeAll(async ({ request }) => {
+    const res = await request.post("/api/auth/token", {
+      data: { email: "manager@shop.io", password: "mgr-secret" },
+    });
+    token = (await res.json()).accessToken;
+  });
+
+  test("creates product with valid payload", async ({ request }) => {
+    const res = await request.post("/api/products", {
+      headers: { Authorization: `Bearer ${token}` },
+      data: { name: "Widget Pro", sku: "WGT-100", price: 29.99 },
+    });
+
+    expect(res.status()).toBe(201);
+    const product = await res.json();
+    expect(product).toMatchObject({ name: "Widget Pro", sku: "WGT-100" });
+    expect(product).toHaveProperty("id");
+  });
+
+  test("rejects duplicate SKU with 409", async ({ request }) => {
+    const res = await request.post("/api/products", {
+      headers: { Authorization: `Bearer ${token}` },
+      data: { name: "Duplicate", sku: "WGT-100", price: 19.99 },
+    });
+
+    expect(res.status()).toBe(409);
+    expect((await res.json()).message).toContain("already exists");
+  });
+
+  test("returns 422 for missing required fields", async ({ request }) => {
+    const res = await request.post("/api/products", {
+      headers: { Authorization: `Bearer ${token}` },
+      data: { name: "Incomplete" },
+    });
+
+    expect(res.status()).toBe(422);
+    const err = await res.json();
+    expect(err.errors).toContainEqual(
+      expect.objectContaining({ field: "sku" })
+    );
+  });
+
+  test("staff role cannot delete products", async ({ request }) => {
+    const staffLogin = await request.post("/api/auth/token", {
+      data: { email: "staff@shop.io", password: "staff-pass" },
+    });
+    const staffToken = (await staffLogin.json()).accessToken;
+
+    const res = await request.delete("/api/products/123", {
+      headers: { Authorization: `Bearer ${staffToken}` },
+    });
+
+    expect(res.status()).toBe(403);
+  });
+
+  test("lists products with pagination", async ({ request }) => {
+    const res = await request.get("/api/products", {
+      headers: { Authorization: `Bearer ${token}` },
+      params: { page: "1", limit: "20" },
+    });
+
+    expect(res.status()).toBe(200);
+    const body = await res.json();
+    expect(body.items).toBeInstanceOf(Array);
+    expect(body.items.length).toBeLessThanOrEqual(20);
+    expect(body).toHaveProperty("totalCount");
+  });
+});
+```
+
+## Component Tests
+
+**Ideal for**:
+
+- Form validation (required fields, format rules, error messages)
+- Interactive widgets (modals, dropdowns, accordions, date pickers)
+- Conditional rendering (show/hide, loading states, empty states)
+- Accessibility per-component (ARIA attributes, keyboard navigation)
+- Responsive layout at different viewports
+- Visual states (hover, focus, disabled, selected)
+
+**Avoid for**:
+
+- Testing routing or navigation between pages
+- Flows requiring real cookies, sessions, or server-side state
+- Data persistence or API contract validation
+- Third-party iframe interactions
+- Anything requiring multiple pages or browser contexts
+
+```typescript
+import { test, expect } from "@playwright/experimental-ct-react";
+import { ContactForm } from "../src/components/ContactForm";
+
+test.describe("ContactForm component", () => {
+  test("displays validation errors on empty submit", async ({ mount }) => {
+    const component = await mount(<ContactForm onSubmit={() => {}} />);
+
+    await component.getByRole("button", { name: "Send message" }).click();
+
+    await expect(component.getByText("Name is required")).toBeVisible();
+    await expect(component.getByText("Email is required")).toBeVisible();
+  });
+
+  test("rejects malformed email", async ({ mount }) => {
+    const component = await mount(<ContactForm onSubmit={() => {}} />);
+
+    await component.getByLabel("Name").fill("Alex");
+    await component.getByLabel("Email").fill("invalid-email");
+    await component.getByLabel("Message").fill("Hello");
+    await component.getByRole("button", { name: "Send message" }).click();
+
+    await expect(component.getByText("Enter a valid email")).toBeVisible();
+  });
+
+  test("invokes onSubmit with form data", async ({ mount }) => {
+    const submissions: Array<{ name: string; email: string; message: string }> =
+      [];
+    const component = await mount(
+      <ContactForm onSubmit={(data) => submissions.push(data)} />
+    );
+
+    await component.getByLabel("Name").fill("Alex");
+    await component.getByLabel("Email").fill("alex@company.org");
+    await component.getByLabel("Message").fill("Inquiry about pricing");
+    await component.getByRole("button", { name: "Send message" }).click();
+
+    expect(submissions).toHaveLength(1);
+    expect(submissions[0]).toEqual({
+      name: "Alex",
+      email: "alex@company.org",
+      message: "Inquiry about pricing",
+    });
+  });
+
+  test("disables button during submission", async ({ mount }) => {
+    const component = await mount(
+      <ContactForm onSubmit={() => {}} submitting={true} />
+    );
+
+    await expect(
+      component.getByRole("button", { name: "Sending..." })
+    ).toBeDisabled();
+  });
+
+  test("associates labels with inputs for accessibility", async ({ mount }) => {
+    const component = await mount(<ContactForm onSubmit={() => {}} />);
+
+    await expect(
+      component.getByRole("textbox", { name: "Name" })
+    ).toBeVisible();
+    await expect(
+      component.getByRole("textbox", { name: "Email" })
+    ).toBeVisible();
+  });
+});
+```
+
+## E2E Tests
+
+**Ideal for**:
+
+- Critical user flows that generate revenue (checkout, signup)
+- Authentication flows (login, SSO, MFA, password reset)
+- Multi-page workflows where state carries across navigation
+- Flows involving third-party iframes (payment widgets)
+- Smoke tests validating the entire stack
+- Real-time collaboration requiring multiple browser contexts
+
+**Avoid for**:
+
+- Testing every form validation permutation
+- CRUD operations where UI is a thin wrapper
+- Verifying individual component states
+- Testing API response shapes or error codes
+- Responsive layout at every breakpoint
+- Edge cases that only affect the backend
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+test.describe("subscription flow", () => {
+  test.beforeEach(async ({ page }) => {
+    await page.request.post("/api/test/seed-account", {
+      data: { plan: "free", email: "subscriber@demo.io" },
+    });
+    await page.goto("/account/upgrade");
+  });
+
+  test("upgrades to premium plan", async ({ page }) => {
+    await test.step("select plan", async () => {
+      await expect(
+        page.getByRole("heading", { name: "Choose Your Plan" })
+      ).toBeVisible();
+      await page.getByRole("button", { name: "Select Premium" }).click();
+    });
+
+    await test.step("enter billing details", async () => {
+      await page.getByLabel("Cardholder name").fill("Sam Johnson");
+      await page.getByLabel("Billing address").fill("456 Oak Ave");
+      await page.getByLabel("City").fill("Seattle");
+      await page.getByRole("combobox", { name: "State" }).selectOption("WA");
+      await page.getByLabel("Postal code").fill("98101");
+      await page.getByRole("button", { name: "Continue" }).click();
+    });
+
+    await test.step("complete payment", async () => {
+      const paymentFrame = page.frameLocator('iframe[title="Secure Payment"]');
+      await paymentFrame.getByLabel("Card number").fill("5555555555554444");
+      await paymentFrame.getByLabel("Expiry").fill("09/29");
+      await paymentFrame.getByLabel("CVV").fill("456");
+      await page.getByRole("button", { name: "Subscribe now" }).click();
+    });
+
+    await test.step("verify success", async () => {
+      await page.waitForURL("**/account/subscription/success**");
+      await expect(
+        page.getByRole("heading", { name: "Welcome to Premium" })
+      ).toBeVisible();
+      await expect(page.getByText(/Subscription #\d+/)).toBeVisible();
+    });
+  });
+});
+```
+
+## Layering Test Types
+
+Effective test suites combine all three types. Example for an "inventory management" feature:
+
+### API Layer (60% of tests)
+
+Cover every backend logic permutation. Cheap to run and maintain.
+
+```
+tests/api/inventory.spec.ts
+  - creates item with valid data (201)
+  - rejects duplicate SKU (409)
+  - rejects invalid quantity format (422)
+  - rejects missing required fields (422)
+  - warehouse-staff cannot delete items (403)
+  - unauthenticated request returns 401
+  - lists items with pagination
+  - filters items by category
+  - updates item stock level
+  - archives an item
+  - prevents archiving items with pending orders
+```
+
+### Component Layer (30% of tests)
+
+Cover every visual state and interaction.
+
+```
+tests/components/InventoryForm.spec.tsx
+  - shows validation errors on empty submit
+  - shows inline error for invalid SKU format
+  - disables submit while saving
+  - calls onSubmit with form data
+  - resets form after successful save
+
+tests/components/InventoryTable.spec.tsx
+  - renders item rows from props
+  - shows empty state when no items
+  - handles archive confirmation modal
+  - sorts by column header click
+  - shows stock level badges with correct colors
+```
+
+### E2E Layer (10% of tests)
+
+Cover only critical paths proving full stack works.
+
+```
+tests/e2e/inventory.spec.ts
+  - manager creates item and sees it in list
+  - manager updates item stock level
+  - warehouse-staff cannot access admin settings
+```
+
+### Execution Profile
+
+For this feature:
+
+- **11 API tests** — ~2 seconds total, no browser
+- **10 component tests** — ~5 seconds total, real browser but no server
+- **3 E2E tests** — ~15 seconds total, full stack
+
+Total: 24 tests, ~22 seconds. API tests catch most regressions. Component tests catch UI bugs. E2E tests prove wiring works. If E2E fails but API and component pass, the problem is in integration (routing, state management, API client).
+
+## Common Mistakes
+
+| Anti-Pattern                              | Problem                                                  | Better Approach                                                |
+| ----------------------------------------- | -------------------------------------------------------- | -------------------------------------------------------------- |
+| E2E for every validation rule             | 30-second browser test for something API covers in 200ms | API test for validation, one component test for error display  |
+| No API tests, all E2E                     | Slow suite, flaky from UI timing, hard to diagnose       | API tests for data/logic, E2E for critical paths only          |
+| Component tests mocking everything        | Tests pass but app broken because mocks drift            | Mock only external boundaries; API tests verify real contracts |
+| Same assertion in API, component, AND E2E | Triple maintenance cost                                  | Each layer tests what it uniquely verifies                     |
+| E2E creating test data via UI             | 2-minute test where 90 seconds is setup                  | Seed via API in `beforeEach`, test actual flow                 |
+| Testing third-party behavior              | Testing that Stripe validates cards (Stripe's job)       | Mock Stripe; trust their contract                              |
+| Skipping API layer                        | Can't tell if bug is frontend or backend                 | API tests isolate backend; component tests isolate frontend    |
+| One giant E2E for entire feature          | 5-minute test failing somewhere with no clear cause      | Focused E2E per critical path; use `test.step()`               |
+
+## Related
+
+- [test-suite-structure.md](../core/test-suite-structure.md) — file structure and naming
+- [api-testing.md](../testing-patterns/api-testing.md) — Playwright's `request` API for HTTP testing
+- [component-testing.md](../testing-patterns/component-testing.md) — setting up component tests
+- [authentication.md](../advanced/authentication.md) — auth flow patterns with `storageState`
+- [when-to-mock.md](when-to-mock.md) — when to mock vs hit real services
+- [pom-vs-fixtures.md](pom-vs-fixtures.md) — organizing shared test logic
diff --git a/.cursor/skills/playwright-testing/architecture/when-to-mock.md b/.cursor/skills/playwright-testing/architecture/when-to-mock.md
new file mode 100644
index 0000000000..d5d5705b1f
--- /dev/null
+++ b/.cursor/skills/playwright-testing/architecture/when-to-mock.md
@@ -0,0 +1,383 @@
+# Mocking Strategy: Real vs Mock Services
+
+## Table of Contents
+
+1. [Core Principle](#core-principle)
+2. [Decision Matrix](#decision-matrix)
+3. [Decision Flowchart](#decision-flowchart)
+4. [Mocking Techniques](#mocking-techniques)
+5. [Real Service Strategies](#real-service-strategies)
+6. [Hybrid Approach: Fixture-Based Mock Control](#hybrid-approach-fixture-based-mock-control)
+7. [Validating Mock Accuracy](#validating-mock-accuracy)
+8. [Anti-Patterns](#anti-patterns)
+
+> **When to use**: Deciding whether to mock API calls, intercept network requests, or hit real services in Playwright tests.
+
+## Core Principle
+
+**Mock at the boundary, test your stack end-to-end.** Mock third-party services you don't own (payment gateways, email providers, OAuth). Never mock your own frontend-to-backend communication. Tests should prove YOUR code works, not that third-party APIs are available.
+
+## Decision Matrix
+
+| Scenario | Mock? | Strategy |
+| --- | --- | --- |
+| Your own REST/GraphQL API | Never | Hit real API against staging or local dev |
+| Your database (through your API) | Never | Seed via API or fixtures |
+| Authentication (your auth system) | Mostly no | Use `storageState` to skip login in most tests |
+| Stripe / payment gateway | Always | `route.fulfill()` with expected responses |
+| SendGrid / email service | Always | Mock the API call, verify request payload |
+| OAuth providers (Google, GitHub) | Always | Mock token exchange, test your callback handler |
+| Analytics (Segment, Mixpanel) | Always | `route.abort()` or `route.fulfill()` |
+| Maps / geocoding APIs | Always | Mock with static responses |
+| Feature flags (LaunchDarkly) | Usually | Mock to force specific flag states |
+| CDN / static assets | Never | Let them load normally |
+| Flaky external dependency | CI: mock, local: real | Conditional mocking based on environment |
+| Slow external dependency | Dev: mock, nightly: real | Separate test projects in config |
+
+## Decision Flowchart
+
+```text
+Is this service part of YOUR codebase?
+├── YES → Do NOT mock. Test the real integration.
+│   ├── Is it slow? → Optimize the service, not the test.
+│   └── Is it flaky? → Fix the service. Flaky infra is a bug.
+└── NO → It's a third-party service.
+    ├── Is it paid per call? → ALWAYS mock.
+    ├── Is it rate-limited? → ALWAYS mock.
+    ├── Is it slow or unreliable? → ALWAYS mock.
+    └── Is it a complex multi-step flow? → Mock with HAR recording.
+```
+
+## Mocking Techniques
+
+### Blocking Unwanted Requests
+
+Block third-party scripts that slow tests and add no coverage:
+
+```typescript
+test.beforeEach(async ({ page }) => {
+  await page.route('**/{analytics,tracking,segment,hotjar}.{com,io}/**', (route) => {
+    route.abort();
+  });
+});
+
+test('dashboard renders without tracking scripts', async ({ page }) => {
+  await page.goto('/dashboard');
+  await expect(page.getByRole('heading', { name: 'Dashboard' })).toBeVisible();
+});
+```
+
+### Full Mock (route.fulfill)
+
+Completely replace a third-party API response:
+
+```typescript
+test('order flow with mocked payment service', async ({ page }) => {
+  await page.route('**/api/charge', (route) => {
+    route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({
+        transactionId: 'txn_mock_abc',
+        status: 'completed',
+      }),
+    });
+  });
+
+  await page.goto('/order/confirm');
+  await page.getByRole('button', { name: 'Complete Purchase' }).click();
+  await expect(page.getByText('Order confirmed')).toBeVisible();
+});
+
+test('display error on payment decline', async ({ page }) => {
+  await page.route('**/api/charge', (route) => {
+    route.fulfill({
+      status: 402,
+      contentType: 'application/json',
+      body: JSON.stringify({
+        error: { code: 'insufficient_funds', message: 'Card declined.' },
+      }),
+    });
+  });
+
+  await page.goto('/order/confirm');
+  await page.getByRole('button', { name: 'Complete Purchase' }).click();
+  await expect(page.getByRole('alert')).toContainText('Card declined');
+});
+```
+
+### Partial Mock (Modify Responses)
+
+Let the real API call happen but tweak the response:
+
+```typescript
+test('display low inventory warning', async ({ page }) => {
+  await page.route('**/api/inventory/*', async (route) => {
+    const response = await route.fetch();
+    const data = await response.json();
+
+    data.quantity = 1;
+    data.lowStock = true;
+
+    await route.fulfill({
+      response,
+      body: JSON.stringify(data),
+    });
+  });
+
+  await page.goto('/products/widget-pro');
+  await expect(page.getByText('Only 1 remaining')).toBeVisible();
+});
+
+test('inject test notification into real response', async ({ page }) => {
+  await page.route('**/api/alerts', async (route) => {
+    const response = await route.fetch();
+    const data = await response.json();
+
+    data.items.push({
+      id: 'test-alert',
+      text: 'Report generated',
+      category: 'info',
+    });
+
+    await route.fulfill({
+      response,
+      body: JSON.stringify(data),
+    });
+  });
+
+  await page.goto('/home');
+  await expect(page.getByText('Report generated')).toBeVisible();
+});
+```
+
+### Record and Replay (HAR Files)
+
+For complex API sequences (OAuth flows, multi-step wizards):
+
+**Recording:**
+
+```typescript
+test('capture API traffic for admin panel', async ({ page }) => {
+  await page.routeFromHAR('tests/fixtures/admin-panel.har', {
+    url: '**/api/**',
+    update: true,
+  });
+
+  await page.goto('/admin');
+  await page.getByRole('tab', { name: 'Reports' }).click();
+  await page.getByRole('tab', { name: 'Settings' }).click();
+});
+```
+
+**Replaying:**
+
+```typescript
+test('admin panel loads with recorded data', async ({ page }) => {
+  await page.routeFromHAR('tests/fixtures/admin-panel.har', {
+    url: '**/api/**',
+    update: false,
+  });
+
+  await page.goto('/admin');
+  await expect(page.getByRole('heading', { name: 'Reports' })).toBeVisible();
+});
+```
+
+**HAR maintenance:**
+
+- Record against a known-good staging environment
+- Commit `.har` files to version control
+- Re-record when APIs change
+- Scope HAR to specific URL patterns
+
+## Real Service Strategies
+
+### Local Dev Server
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  webServer: {
+    command: 'npm run dev',
+    url: 'http://localhost:3000',
+    reuseExistingServer: !process.env.CI,
+    timeout: 30_000,
+  },
+  use: {
+    baseURL: 'http://localhost:3000',
+  },
+});
+```
+
+### Staging Environment
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  use: {
+    baseURL: process.env.CI
+      ? 'https://staging.example.com'
+      : 'http://localhost:3000',
+  },
+});
+```
+
+### Test Containers
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  webServer: {
+    command: 'docker compose -f docker-compose.test.yml up --wait',
+    url: 'http://localhost:3000/health',
+    reuseExistingServer: !process.env.CI,
+    timeout: 120_000,
+  },
+  globalTeardown: './tests/global-teardown.ts',
+});
+```
+
+```typescript
+// tests/global-teardown.ts
+import { execSync } from 'child_process';
+
+export default function globalTeardown() {
+  if (process.env.CI) {
+    execSync('docker compose -f docker-compose.test.yml down -v');
+  }
+}
+```
+
+## Hybrid Approach: Fixture-Based Mock Control
+
+Create fixtures that let individual tests opt into mocking specific services:
+
+```typescript
+// tests/fixtures/service-mocks.ts
+import { test as base } from '@playwright/test';
+
+type MockConfig = {
+  mockPayments: boolean;
+  mockNotifications: boolean;
+  mockAnalytics: boolean;
+};
+
+export const test = base.extend<MockConfig>({
+  mockPayments: [true, { option: true }],
+  mockNotifications: [true, { option: true }],
+  mockAnalytics: [true, { option: true }],
+
+  page: async ({ page, mockPayments, mockNotifications, mockAnalytics }, use) => {
+    if (mockPayments) {
+      await page.route('**/api/billing/**', (route) => {
+        route.fulfill({
+          status: 200,
+          contentType: 'application/json',
+          body: JSON.stringify({ status: 'paid', id: 'inv_mock_789' }),
+        });
+      });
+    }
+
+    if (mockNotifications) {
+      await page.route('**/api/notify', (route) => {
+        route.fulfill({
+          status: 200,
+          contentType: 'application/json',
+          body: JSON.stringify({ delivered: true }),
+        });
+      });
+    }
+
+    if (mockAnalytics) {
+      await page.route('**/{segment,mixpanel,amplitude}.**/**', (route) => {
+        route.abort();
+      });
+    }
+
+    await use(page);
+  },
+});
+
+export { expect } from '@playwright/test';
+```
+
+```typescript
+// tests/billing.spec.ts
+import { test, expect } from './fixtures/service-mocks';
+
+test('subscription renewal sends notification', async ({ page }) => {
+  await page.goto('/account/billing');
+  await page.getByRole('button', { name: 'Renew Now' }).click();
+  await expect(page.getByText('Subscription renewed')).toBeVisible();
+});
+
+test.describe('integration suite', () => {
+  test.use({ mockPayments: false });
+
+  test('real billing flow against test gateway', async ({ page }) => {
+    await page.goto('/account/billing');
+    await page.getByRole('button', { name: 'Renew Now' }).click();
+    await expect(page.getByText('Subscription renewed')).toBeVisible();
+  });
+});
+```
+
+### Environment-Based Test Projects
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    {
+      name: 'ci-fast',
+      testMatch: '**/*.spec.ts',
+      use: { baseURL: 'http://localhost:3000' },
+    },
+    {
+      name: 'nightly-full',
+      testMatch: '**/*.integration.spec.ts',
+      use: { baseURL: 'https://staging.example.com' },
+      timeout: 120_000,
+    },
+  ],
+});
+```
+
+## Validating Mock Accuracy
+
+Guard against mock drift from real APIs:
+
+```typescript
+test.describe('contract validation', () => {
+  test('billing mock matches real API shape', async ({ request }) => {
+    const realResponse = await request.post('/api/billing/charge', {
+      data: { amount: 5000, currency: 'usd' },
+    });
+    const realBody = await realResponse.json();
+
+    const mockBody = {
+      status: 'paid',
+      id: 'inv_mock_789',
+    };
+
+    expect(Object.keys(mockBody).sort()).toEqual(Object.keys(realBody).sort());
+
+    for (const key of Object.keys(mockBody)) {
+      expect(typeof mockBody[key]).toBe(typeof realBody[key]);
+    }
+  });
+});
+```
+
+## Anti-Patterns
+
+| Don't Do This | Problem | Do This Instead |
+| --- | --- | --- |
+| Mock your own API | Tests pass, app breaks. Zero integration coverage. | Hit your real API. Mock only third-party services. |
+| Mock everything for speed | You test a fiction. Frontend and backend may be incompatible. | Mock only external boundaries. |
+| Never mock anything | Tests are slow, flaky, fail when third parties have outages. | Mock third-party services. |
+| Use outdated mocks | Mock returns different shape than real API. | Run contract validation tests. Re-record HAR files regularly. |
+| Mock with `page.evaluate()` to stub fetch | Fragile, doesn't survive navigation. | Use `page.route()` which intercepts at network layer. |
+| Copy-paste mocks across files | One API change requires updating many files. | Centralize mocks in fixtures. |
+| Block all network and whitelist | Extremely brittle. Every new endpoint requires update. | Allow all by default. Selectively mock third-party services. |
diff --git a/.cursor/skills/playwright-testing/browser-apis/browser-apis.md b/.cursor/skills/playwright-testing/browser-apis/browser-apis.md
new file mode 100644
index 0000000000..cc4c269e8f
--- /dev/null
+++ b/.cursor/skills/playwright-testing/browser-apis/browser-apis.md
@@ -0,0 +1,391 @@
+# Browser APIs: Geolocation, Permissions & More
+
+## Table of Contents
+
+1. [Geolocation](#geolocation)
+2. [Permissions](#permissions)
+3. [Clipboard](#clipboard)
+4. [Notifications](#notifications)
+5. [Camera & Microphone](#camera--microphone)
+
+## Geolocation
+
+### Mock Location
+
+```typescript
+test("shows nearby stores", async ({ context }) => {
+  // Grant permission and set location
+  await context.grantPermissions(["geolocation"]);
+  await context.setGeolocation({ latitude: 37.7749, longitude: -122.4194 }); // San Francisco
+
+  const page = await context.newPage();
+  await page.goto("/store-finder");
+  await page.getByRole("button", { name: "Find Nearby" }).click();
+
+  await expect(page.getByText("San Francisco")).toBeVisible();
+});
+```
+
+### Geolocation Fixture
+
+```typescript
+// fixtures/geolocation.fixture.ts
+import { test as base } from "@playwright/test";
+
+type Coordinates = { latitude: number; longitude: number; accuracy?: number };
+
+type GeoFixtures = {
+  setLocation: (coords: Coordinates) => Promise<void>;
+};
+
+export const test = base.extend<GeoFixtures>({
+  setLocation: async ({ context }, use) => {
+    await context.grantPermissions(["geolocation"]);
+
+    await use(async (coords) => {
+      await context.setGeolocation({
+        latitude: coords.latitude,
+        longitude: coords.longitude,
+        accuracy: coords.accuracy ?? 100,
+      });
+    });
+  },
+});
+
+// Usage
+test("delivery zone check", async ({ page, setLocation }) => {
+  await setLocation({ latitude: 40.7128, longitude: -74.006 }); // NYC
+
+  await page.goto("/delivery");
+
+  await expect(page.getByText("Delivery available")).toBeVisible();
+});
+```
+
+### Test Location Changes
+
+```typescript
+test("tracks location updates", async ({ context }) => {
+  await context.grantPermissions(["geolocation"]);
+
+  const page = await context.newPage();
+  await page.goto("/tracking");
+
+  // Initial location
+  await context.setGeolocation({ latitude: 37.7749, longitude: -122.4194 });
+  await page.getByRole("button", { name: "Start Tracking" }).click();
+
+  await expect(page.getByTestId("location")).toContainText("37.7749");
+
+  // Move to new location
+  await context.setGeolocation({ latitude: 37.8044, longitude: -122.2712 });
+
+  // Trigger location update
+  await page.evaluate(() => {
+    navigator.geolocation.getCurrentPosition(() => {});
+  });
+
+  await expect(page.getByTestId("location")).toContainText("37.8044");
+});
+```
+
+### Test Geolocation Denial
+
+```typescript
+test("handles location denied", async ({ browser }) => {
+  // Create context without geolocation permission
+  const context = await browser.newContext({
+    permissions: [], // No permissions
+  });
+
+  const page = await context.newPage();
+  await page.goto("/store-finder");
+  await page.getByRole("button", { name: "Find Nearby" }).click();
+
+  await expect(page.getByText("Location access denied")).toBeVisible();
+  await expect(page.getByLabel("Enter ZIP code")).toBeVisible();
+
+  await context.close();
+});
+```
+
+## Permissions
+
+### Grant Permissions
+
+```typescript
+test("notifications with permission", async ({ context }) => {
+  await context.grantPermissions(["notifications"]);
+
+  const page = await context.newPage();
+  await page.goto("/alerts");
+
+  // Notification API should work
+  const permission = await page.evaluate(() => Notification.permission);
+  expect(permission).toBe("granted");
+});
+```
+
+### Test Permission Denied
+
+```typescript
+test("handles notification permission denied", async ({ browser }) => {
+  const context = await browser.newContext({
+    permissions: [], // Deny all
+  });
+
+  const page = await context.newPage();
+  await page.goto("/notifications");
+
+  await page.getByRole("button", { name: "Enable Notifications" }).click();
+
+  await expect(page.getByText("Please enable notifications")).toBeVisible();
+
+  await context.close();
+});
+```
+
+### Multiple Permissions
+
+```typescript
+test("video call with permissions", async ({ context }) => {
+  await context.grantPermissions(["camera", "microphone", "notifications"]);
+
+  const page = await context.newPage();
+  await page.goto("/video-call");
+
+  // All permissions should be granted
+  const permissions = await page.evaluate(async () => ({
+    camera: await navigator.permissions.query({
+      name: "camera" as PermissionName,
+    }),
+    microphone: await navigator.permissions.query({
+      name: "microphone" as PermissionName,
+    }),
+  }));
+
+  expect(permissions.camera.state).toBe("granted");
+  expect(permissions.microphone.state).toBe("granted");
+});
+```
+
+## Clipboard
+
+### Test Copy to Clipboard
+
+```typescript
+test("copy button works", async ({ page, context }) => {
+  // Grant clipboard permissions
+  await context.grantPermissions(["clipboard-read", "clipboard-write"]);
+
+  await page.goto("/share");
+
+  await page.getByRole("button", { name: "Copy Link" }).click();
+
+  // Read clipboard content
+  const clipboardContent = await page.evaluate(() =>
+    navigator.clipboard.readText(),
+  );
+
+  expect(clipboardContent).toContain("https://example.com/share/");
+});
+```
+
+### Test Paste from Clipboard
+
+```typescript
+test("paste from clipboard", async ({ page, context }) => {
+  await context.grantPermissions(["clipboard-read", "clipboard-write"]);
+
+  await page.goto("/editor");
+
+  // Write to clipboard
+  await page.evaluate(() => navigator.clipboard.writeText("Pasted content"));
+
+  // Trigger paste
+  await page.getByLabel("Content").focus();
+  await page.keyboard.press("Control+V");
+
+  await expect(page.getByLabel("Content")).toHaveValue("Pasted content");
+});
+```
+
+### Clipboard Fixture
+
+```typescript
+// fixtures/clipboard.fixture.ts
+import { test as base } from "@playwright/test";
+
+type ClipboardFixtures = {
+  clipboard: {
+    write: (text: string) => Promise<void>;
+    read: () => Promise<string>;
+  };
+};
+
+export const test = base.extend<ClipboardFixtures>({
+  clipboard: async ({ page, context }, use) => {
+    await context.grantPermissions(["clipboard-read", "clipboard-write"]);
+
+    await use({
+      write: async (text) => {
+        await page.evaluate((t) => navigator.clipboard.writeText(t), text);
+      },
+      read: async () => {
+        return page.evaluate(() => navigator.clipboard.readText());
+      },
+    });
+  },
+});
+```
+
+## Notifications
+
+### Mock Notification API
+
+```typescript
+test("shows browser notification", async ({ page }) => {
+  const notifications: any[] = [];
+
+  // Mock Notification constructor
+  await page.addInitScript(() => {
+    (window as any).__notifications = [];
+    (window as any).Notification = class {
+      constructor(title: string, options?: NotificationOptions) {
+        (window as any).__notifications.push({ title, ...options });
+      }
+      static permission = "granted";
+      static requestPermission = async () => "granted";
+    };
+  });
+
+  await page.goto("/alerts");
+  await page.getByRole("button", { name: "Notify Me" }).click();
+
+  // Check notification was created
+  const created = await page.evaluate(() => (window as any).__notifications);
+  expect(created).toHaveLength(1);
+  expect(created[0].title).toBe("New Alert");
+});
+```
+
+### Test Notification Click
+
+```typescript
+test("notification click handler", async ({ page }) => {
+  await page.addInitScript(() => {
+    (window as any).Notification = class {
+      onclick: (() => void) | null = null;
+      constructor(title: string) {
+        // Simulate click after creation
+        setTimeout(() => this.onclick?.(), 100);
+      }
+      static permission = "granted";
+      static requestPermission = async () => "granted";
+    };
+  });
+
+  await page.goto("/messages");
+  await page.evaluate(() => {
+    new Notification("New Message");
+  });
+
+  // Should navigate to messages when notification clicked
+  await expect(page).toHaveURL(/\/messages/);
+});
+```
+
+## Camera & Microphone
+
+### Mock Media Devices
+
+```typescript
+test("video preview works", async ({ page, context }) => {
+  await context.grantPermissions(["camera"]);
+
+  // Mock getUserMedia
+  await page.addInitScript(() => {
+    navigator.mediaDevices.getUserMedia = async () => {
+      const canvas = document.createElement("canvas");
+      canvas.width = 640;
+      canvas.height = 480;
+      return canvas.captureStream();
+    };
+  });
+
+  await page.goto("/video-settings");
+  await page.getByRole("button", { name: "Start Camera" }).click();
+
+  await expect(page.getByTestId("video-preview")).toBeVisible();
+});
+```
+
+### Test Media Device Selection
+
+```typescript
+test("switch camera", async ({ page }) => {
+  await page.addInitScript(() => {
+    navigator.mediaDevices.enumerateDevices = async () =>
+      [
+        {
+          deviceId: "cam1",
+          kind: "videoinput",
+          label: "Front Camera",
+          groupId: "1",
+        },
+        {
+          deviceId: "cam2",
+          kind: "videoinput",
+          label: "Back Camera",
+          groupId: "2",
+        },
+      ] as MediaDeviceInfo[];
+
+    navigator.mediaDevices.getUserMedia = async () => {
+      const canvas = document.createElement("canvas");
+      return canvas.captureStream();
+    };
+  });
+
+  await page.goto("/camera");
+
+  // Should show camera options
+  await expect(page.getByRole("combobox", { name: "Camera" })).toBeVisible();
+  await expect(page.getByText("Front Camera")).toBeVisible();
+  await expect(page.getByText("Back Camera")).toBeVisible();
+});
+```
+
+### Test Media Errors
+
+```typescript
+test("handles camera access error", async ({ page }) => {
+  await page.addInitScript(() => {
+    navigator.mediaDevices.getUserMedia = async () => {
+      throw new DOMException("Permission denied", "NotAllowedError");
+    };
+  });
+
+  await page.goto("/video-call");
+  await page.getByRole("button", { name: "Join Call" }).click();
+
+  await expect(page.getByText("Camera access denied")).toBeVisible();
+  await expect(
+    page.getByRole("button", { name: "Join Audio Only" }),
+  ).toBeVisible();
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                  | Problem                           | Solution                            |
+| ----------------------------- | --------------------------------- | ----------------------------------- |
+| Not granting permissions      | Tests fail with permission errors | Use `context.grantPermissions()`    |
+| Testing real geolocation      | Flaky, environment-dependent      | Mock with `setGeolocation()`        |
+| Not testing permission denial | Misses error handling             | Test both granted and denied states |
+| Using real camera/mic         | CI has no devices                 | Mock `getUserMedia`                 |
+
+## Related References
+
+- **Fixtures**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for context fixtures
+- **Mobile**: See [mobile-testing.md](../advanced/mobile-testing.md) for device emulation
diff --git a/.cursor/skills/playwright-testing/browser-apis/iframes.md b/.cursor/skills/playwright-testing/browser-apis/iframes.md
new file mode 100644
index 0000000000..155cc1c1b4
--- /dev/null
+++ b/.cursor/skills/playwright-testing/browser-apis/iframes.md
@@ -0,0 +1,403 @@
+# iFrame Testing
+
+## Table of Contents
+
+1. [Basic iFrame Access](#basic-iframe-access)
+2. [Cross-Origin iFrames](#cross-origin-iframes)
+3. [Nested iFrames](#nested-iframes)
+4. [Dynamic iFrames](#dynamic-iframes)
+5. [iFrame Navigation](#iframe-navigation)
+6. [Common Patterns](#common-patterns)
+
+## Basic iFrame Access
+
+### Using frameLocator
+
+```typescript
+// Access iframe by selector
+const frame = page.frameLocator("iframe#payment");
+await frame.getByRole("button", { name: "Pay" }).click();
+
+// Access by name attribute
+const namedFrame = page.frameLocator('iframe[name="checkout"]');
+await namedFrame.getByLabel("Card number").fill("4242424242424242");
+
+// Access by title
+const titledFrame = page.frameLocator('iframe[title="Payment Form"]');
+
+// Access by src (partial match)
+const srcFrame = page.frameLocator('iframe[src*="stripe.com"]');
+```
+
+### Frame vs FrameLocator
+
+```typescript
+// frameLocator - for locator-based operations (recommended)
+const frameLocator = page.frameLocator("#my-iframe");
+await frameLocator.getByRole("button").click();
+
+// frame() - for Frame object operations (navigation, evaluation)
+const frame = page.frame({ name: "my-frame" });
+if (frame) {
+  await frame.goto("https://example.com");
+  const title = await frame.title();
+}
+
+// Get all frames
+const frames = page.frames();
+for (const f of frames) {
+  console.log("Frame URL:", f.url());
+}
+```
+
+### Waiting for iFrame Content
+
+```typescript
+// Wait for iframe to load
+const frame = page.frameLocator("#dynamic-iframe");
+
+// Wait for element inside iframe
+await expect(frame.getByRole("heading")).toBeVisible({ timeout: 10000 });
+
+// Wait for iframe src to change
+await page.waitForFunction(() => {
+  const iframe = document.querySelector("iframe#my-frame") as HTMLIFrameElement;
+  return iframe?.src.includes("loaded");
+});
+```
+
+## Cross-Origin iFrames
+
+### Accessing Cross-Origin Content
+
+```typescript
+// Cross-origin iframes work seamlessly with frameLocator
+const thirdPartyFrame = page.frameLocator('iframe[src*="third-party.com"]');
+
+// Interact with elements inside cross-origin iframe
+await thirdPartyFrame.getByRole("textbox").fill("test@example.com");
+await thirdPartyFrame.getByRole("button", { name: "Submit" }).click();
+
+// Wait for cross-origin iframe to be ready
+await expect(thirdPartyFrame.locator("body")).toBeVisible();
+```
+
+### Payment Provider iFrames (Stripe, PayPal)
+
+```typescript
+test("Stripe payment iframe", async ({ page }) => {
+  await page.goto("/checkout");
+
+  // Stripe uses multiple iframes for each field
+  const cardFrame = page
+    .frameLocator('iframe[name*="__privateStripeFrame"]')
+    .first();
+
+  // Wait for Stripe to initialize
+  await expect(cardFrame.locator('[placeholder="Card number"]')).toBeVisible({
+    timeout: 15000,
+  });
+
+  // Fill card details
+  await cardFrame
+    .locator('[placeholder="Card number"]')
+    .fill("4242424242424242");
+  await cardFrame.locator('[placeholder="MM / YY"]').fill("12/30");
+  await cardFrame.locator('[placeholder="CVC"]').fill("123");
+});
+```
+
+### Handling OAuth in iFrames
+
+```typescript
+test("OAuth iframe flow", async ({ page }) => {
+  await page.goto("/login");
+  await page.getByRole("button", { name: "Sign in with Google" }).click();
+
+  // If OAuth opens in iframe instead of popup
+  const oauthFrame = page.frameLocator('iframe[src*="accounts.google.com"]');
+
+  // Wait for OAuth form
+  await expect(oauthFrame.getByLabel("Email")).toBeVisible({ timeout: 10000 });
+  await oauthFrame.getByLabel("Email").fill("test@gmail.com");
+});
+```
+
+## Nested iFrames
+
+### Accessing Nested Frames
+
+```typescript
+// Parent iframe contains child iframe
+const parentFrame = page.frameLocator("#outer-frame");
+const childFrame = parentFrame.frameLocator("#inner-frame");
+
+// Interact with deeply nested content
+await childFrame.getByRole("button", { name: "Submit" }).click();
+
+// Multiple levels of nesting
+const level1 = page.frameLocator("#level1");
+const level2 = level1.frameLocator("#level2");
+const level3 = level2.frameLocator("#level3");
+await level3.getByText("Deep content").click();
+```
+
+### Finding Elements Across Frame Hierarchy
+
+```typescript
+// Helper to search all frames for an element
+async function findInAnyFrame(
+  page: Page,
+  selector: string,
+): Promise<Locator | null> {
+  // Check main page first
+  const mainCount = await page.locator(selector).count();
+  if (mainCount > 0) return page.locator(selector);
+
+  // Check all frames
+  for (const frame of page.frames()) {
+    const count = await frame.locator(selector).count();
+    if (count > 0) {
+      return frame.locator(selector);
+    }
+  }
+  return null;
+}
+
+test("find element in any frame", async ({ page }) => {
+  await page.goto("/complex-page");
+  const element = await findInAnyFrame(page, '[data-testid="submit-btn"]');
+  if (element) await element.click();
+});
+```
+
+## Dynamic iFrames
+
+### iFrames Created at Runtime
+
+```typescript
+test("handle dynamically created iframe", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  // Click button that creates iframe
+  await page.getByRole("button", { name: "Open Widget" }).click();
+
+  // Wait for iframe to appear in DOM
+  await page.waitForSelector("iframe#widget-frame");
+
+  // Now access the frame
+  const widgetFrame = page.frameLocator("#widget-frame");
+  await expect(widgetFrame.getByText("Widget Loaded")).toBeVisible();
+});
+```
+
+### iFrames with Changing src
+
+```typescript
+test("iframe src changes", async ({ page }) => {
+  await page.goto("/multi-step");
+
+  const frame = page.frameLocator("#step-frame");
+
+  // Step 1
+  await expect(frame.getByText("Step 1")).toBeVisible();
+  await frame.getByRole("button", { name: "Next" }).click();
+
+  // Wait for iframe to reload with new content
+  await expect(frame.getByText("Step 2")).toBeVisible({ timeout: 10000 });
+  await frame.getByRole("button", { name: "Next" }).click();
+
+  // Step 3
+  await expect(frame.getByText("Step 3")).toBeVisible({ timeout: 10000 });
+});
+```
+
+### Lazy-Loaded iFrames
+
+```typescript
+test("lazy loaded iframe", async ({ page }) => {
+  await page.goto("/page-with-lazy-iframe");
+
+  // Scroll to trigger lazy load
+  await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
+
+  // Wait for iframe to load
+  const lazyFrame = page.frameLocator("#lazy-iframe");
+  await expect(lazyFrame.locator("body")).not.toBeEmpty({ timeout: 15000 });
+
+  // Interact with content
+  await lazyFrame.getByRole("button").click();
+});
+```
+
+## iFrame Navigation
+
+### Navigating Within iFrame
+
+```typescript
+test("iframe internal navigation", async ({ page }) => {
+  await page.goto("/app");
+
+  // Get frame object for navigation control
+  const frame = page.frame({ name: "content-frame" });
+  if (!frame) throw new Error("Frame not found");
+
+  // Navigate within iframe
+  await frame.goto("https://embedded-app.com/page2");
+
+  // Wait for navigation
+  await frame.waitForURL("**/page2");
+
+  // Verify content
+  await expect(frame.getByRole("heading")).toHaveText("Page 2");
+});
+```
+
+### Handling Frame Navigation Events
+
+```typescript
+test("track iframe navigation", async ({ page }) => {
+  const navigations: string[] = [];
+
+  // Listen to frame navigation
+  page.on("framenavigated", (frame) => {
+    if (frame.parentFrame()) {
+      // This is an iframe navigation
+      navigations.push(frame.url());
+    }
+  });
+
+  await page.goto("/with-iframe");
+  await page
+    .frameLocator("#nav-frame")
+    .getByRole("link", { name: "Page 2" })
+    .click();
+
+  // Verify navigation occurred
+  expect(navigations.some((url) => url.includes("page2"))).toBe(true);
+});
+```
+
+## Common Patterns
+
+### iFrame Fixture
+
+```typescript
+// fixtures.ts
+import { test as base, FrameLocator } from "@playwright/test";
+
+export const test = base.extend<{ paymentFrame: FrameLocator }>({
+  paymentFrame: async ({ page }, use) => {
+    await page.goto("/checkout");
+
+    // Wait for payment iframe to be ready
+    const frame = page.frameLocator('iframe[src*="payment"]');
+    await expect(frame.locator("body")).toBeVisible({ timeout: 15000 });
+
+    await use(frame);
+  },
+});
+
+// test file
+test("complete payment", async ({ paymentFrame }) => {
+  await paymentFrame.getByLabel("Card").fill("4242424242424242");
+  await paymentFrame.getByRole("button", { name: "Pay" }).click();
+});
+```
+
+### Debugging iFrame Issues
+
+```typescript
+test("debug iframe content", async ({ page }) => {
+  await page.goto("/page-with-iframes");
+
+  // List all frames
+  console.log("All frames:");
+  for (const frame of page.frames()) {
+    console.log(`  - ${frame.name() || "(unnamed)"}: ${frame.url()}`);
+  }
+
+  // Screenshot specific iframe content
+  const frame = page.frame({ name: "target-frame" });
+  if (frame) {
+    const body = frame.locator("body");
+    await body.screenshot({ path: "iframe-content.png" });
+  }
+
+  // Get iframe HTML for debugging
+  const frameContent = page.frameLocator("#my-frame");
+  const html = await frameContent.locator("body").innerHTML();
+  console.log("iFrame HTML:", html.substring(0, 500));
+});
+```
+
+### Handling iFrame Load Failures
+
+```typescript
+test("handle iframe load failure", async ({ page }) => {
+  await page.goto("/page-with-unreliable-iframe");
+
+  const frame = page.frameLocator("#unreliable-frame");
+
+  try {
+    // Try to interact with iframe content
+    await expect(frame.getByRole("button")).toBeVisible({ timeout: 5000 });
+    await frame.getByRole("button").click();
+  } catch (error) {
+    // Fallback: refresh iframe
+    await page.evaluate(() => {
+      const iframe = document.querySelector(
+        "#unreliable-frame",
+      ) as HTMLIFrameElement;
+      if (iframe) iframe.src = iframe.src;
+    });
+
+    // Retry
+    await expect(frame.getByRole("button")).toBeVisible({ timeout: 10000 });
+    await frame.getByRole("button").click();
+  }
+});
+```
+
+### Mocking iFrame Content
+
+```typescript
+test("mock iframe response", async ({ page }) => {
+  // Intercept iframe src request
+  await page.route("**/embedded-widget**", (route) => {
+    route.fulfill({
+      contentType: "text/html",
+      body: `
+        <!DOCTYPE html>
+        <html>
+          <body>
+            <h1>Mocked Widget</h1>
+            <p>Mocked widget content</p>
+          </body>
+        </html>
+      `,
+    });
+  });
+
+  await page.goto("/page-with-widget");
+
+  const frame = page.frameLocator("#widget-frame");
+  await expect(frame.getByRole("heading")).toHaveText("Mocked Widget");
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                          | Problem                           | Solution                                           |
+| ------------------------------------- | --------------------------------- | -------------------------------------------------- |
+| Using `page.frame()` for interactions | Less reliable than frameLocator   | Use `page.frameLocator()` for element interactions |
+| Hardcoding iframe index               | Fragile if DOM order changes      | Use name, id, or src attribute selectors           |
+| Not waiting for iframe load           | Race conditions                   | Wait for element inside iframe to be visible       |
+| Assuming same-origin                  | Cross-origin has different timing | Always wait for iframe content explicitly          |
+| Ignoring nested iframes               | Element not found                 | Chain frameLocator calls for nested frames         |
+
+## Related References
+
+- **Locators**: See [locators.md](../core/locators.md) for selector strategies
+- **Third-party services**: See [third-party.md](../advanced/third-party.md) for payment iframe patterns
+- **Debugging**: See [debugging.md](../debugging/debugging.md) for troubleshooting iframe issues
diff --git a/.cursor/skills/playwright-testing/browser-apis/service-workers.md b/.cursor/skills/playwright-testing/browser-apis/service-workers.md
new file mode 100644
index 0000000000..7603de322e
--- /dev/null
+++ b/.cursor/skills/playwright-testing/browser-apis/service-workers.md
@@ -0,0 +1,504 @@
+# Service Worker Testing
+
+## Table of Contents
+
+1. [Service Worker Basics](#service-worker-basics)
+2. [Registration & Lifecycle](#registration--lifecycle)
+3. [Cache Testing](#cache-testing)
+4. [Offline Testing](#offline-testing)
+5. [Push Notifications](#push-notifications)
+6. [Background Sync](#background-sync)
+
+## Service Worker Basics
+
+### Waiting for Service Worker Registration
+
+```typescript
+test("service worker registers", async ({ page }) => {
+  await page.goto("/pwa-app");
+
+  // Wait for SW to register
+  const swRegistered = await page.evaluate(async () => {
+    if (!("serviceWorker" in navigator)) return false;
+
+    const registration = await navigator.serviceWorker.ready;
+    return !!registration.active;
+  });
+
+  expect(swRegistered).toBe(true);
+});
+```
+
+### Getting Service Worker State
+
+```typescript
+test("check SW state", async ({ page }) => {
+  await page.goto("/");
+
+  const swState = await page.evaluate(async () => {
+    const registration = await navigator.serviceWorker.getRegistration();
+    if (!registration) return null;
+
+    return {
+      installing: !!registration.installing,
+      waiting: !!registration.waiting,
+      active: !!registration.active,
+      scope: registration.scope,
+    };
+  });
+
+  expect(swState?.active).toBe(true);
+  expect(swState?.scope).toContain(page.url());
+});
+```
+
+### Service Worker Context
+
+```typescript
+test("access service worker", async ({ context, page }) => {
+  await page.goto("/pwa-app");
+
+  // Get all service workers in context
+  const workers = context.serviceWorkers();
+
+  // Wait for service worker if not yet available
+  if (workers.length === 0) {
+    await context.waitForEvent("serviceworker");
+  }
+
+  const sw = context.serviceWorkers()[0];
+  expect(sw.url()).toContain("sw.js");
+});
+```
+
+## Registration & Lifecycle
+
+### Testing SW Update Flow
+
+```typescript
+test("service worker updates", async ({ page }) => {
+  await page.goto("/pwa-app");
+
+  // Check for update
+  const hasUpdate = await page.evaluate(async () => {
+    const registration = await navigator.serviceWorker.ready;
+    await registration.update();
+
+    return new Promise<boolean>((resolve) => {
+      if (registration.waiting) {
+        resolve(true);
+      } else {
+        registration.addEventListener("updatefound", () => {
+          resolve(true);
+        });
+        // Timeout if no update
+        setTimeout(() => resolve(false), 5000);
+      }
+    });
+  });
+
+  // If update found, test skip waiting flow
+  if (hasUpdate) {
+    await page.evaluate(async () => {
+      const registration = await navigator.serviceWorker.ready;
+      registration.waiting?.postMessage({ type: "SKIP_WAITING" });
+    });
+
+    // Wait for controller change
+    await page.evaluate(() => {
+      return new Promise<void>((resolve) => {
+        navigator.serviceWorker.addEventListener("controllerchange", () => {
+          resolve();
+        });
+      });
+    });
+  }
+});
+```
+
+### Testing SW Installation
+
+```typescript
+test("verify SW install event", async ({ context, page }) => {
+  // Listen for service worker before navigating
+  const swPromise = context.waitForEvent("serviceworker");
+
+  await page.goto("/pwa-app");
+
+  const sw = await swPromise;
+
+  // Evaluate in SW context
+  const swVersion = await sw.evaluate(() => {
+    // Access SW globals
+    return (self as any).SW_VERSION || "unknown";
+  });
+
+  expect(swVersion).toBe("1.0.0");
+});
+```
+
+### Unregistering Service Workers
+
+```typescript
+test.beforeEach(async ({ page }) => {
+  await page.goto("/");
+
+  // Unregister all service workers for clean state
+  await page.evaluate(async () => {
+    const registrations = await navigator.serviceWorker.getRegistrations();
+    await Promise.all(registrations.map((r) => r.unregister()));
+  });
+
+  // Clear caches
+  await page.evaluate(async () => {
+    const cacheNames = await caches.keys();
+    await Promise.all(cacheNames.map((name) => caches.delete(name)));
+  });
+});
+```
+
+## Cache Testing
+
+### Verifying Cached Resources
+
+```typescript
+test("assets are cached", async ({ page }) => {
+  await page.goto("/pwa-app");
+
+  // Wait for SW to cache assets
+  await page.evaluate(async () => {
+    await navigator.serviceWorker.ready;
+  });
+
+  // Check cache contents
+  const cachedUrls = await page.evaluate(async () => {
+    const cache = await caches.open("app-cache-v1");
+    const requests = await cache.keys();
+    return requests.map((r) => r.url);
+  });
+
+  expect(cachedUrls).toContain(expect.stringContaining("/styles.css"));
+  expect(cachedUrls).toContain(expect.stringContaining("/app.js"));
+});
+```
+
+### Testing Cache Strategies
+
+```typescript
+test("cache-first strategy", async ({ page }) => {
+  await page.goto("/pwa-app");
+
+  // Wait for initial cache
+  await page.waitForFunction(async () => {
+    const cache = await caches.open("app-cache-v1");
+    const keys = await cache.keys();
+    return keys.length > 0;
+  });
+
+  // Block network for cached resources
+  await page.route("**/styles.css", (route) => route.abort());
+
+  // Reload - should work from cache
+  await page.reload();
+
+  // Verify page still styled (CSS loaded from cache)
+  const hasStyles = await page.evaluate(() => {
+    const body = document.body;
+    const styles = window.getComputedStyle(body);
+    return styles.fontFamily !== ""; // Has custom font from CSS
+  });
+
+  expect(hasStyles).toBe(true);
+});
+```
+
+### Testing Cache Updates
+
+```typescript
+test("cache updates on new version", async ({ page }) => {
+  await page.goto("/pwa-app");
+
+  // Get initial cache
+  const initialCacheKeys = await page.evaluate(async () => {
+    const cache = await caches.open("app-cache-v1");
+    const keys = await cache.keys();
+    return keys.map((r) => r.url);
+  });
+
+  // Simulate app update by mocking SW response
+  await page.route("**/sw.js", (route) => {
+    route.fulfill({
+      contentType: "application/javascript",
+      body: `
+        const VERSION = 'v2';
+        self.addEventListener('install', (e) => {
+          e.waitUntil(caches.open('app-cache-v2'));
+          self.skipWaiting();
+        });
+      `,
+    });
+  });
+
+  // Trigger update
+  await page.evaluate(async () => {
+    const reg = await navigator.serviceWorker.ready;
+    await reg.update();
+  });
+
+  // Verify new cache exists
+  await page.waitForFunction(async () => {
+    return await caches.has("app-cache-v2");
+  });
+});
+```
+
+## Offline Testing
+
+This section covers **offline-first apps (PWAs)** that are designed to work offline using service workers, caching, and background sync. For testing **unexpected network failures** (error recovery, graceful degradation), see [error-testing.md](error-testing.md#offline-testing).
+
+### Simulating Offline Mode
+
+```typescript
+test("app works offline", async ({ page, context }) => {
+  await page.goto("/pwa-app");
+
+  // Ensure SW is active and content cached
+  await page.evaluate(async () => {
+    await navigator.serviceWorker.ready;
+  });
+  await page.waitForTimeout(1000); // Allow caching to complete
+
+  // Go offline
+  await context.setOffline(true);
+
+  // Navigate to cached page
+  await page.reload();
+
+  // Verify content loads
+  await expect(page.getByRole("heading", { name: "Dashboard" })).toBeVisible();
+
+  // Verify offline indicator
+  await expect(page.locator(".offline-badge")).toBeVisible();
+
+  // Go back online
+  await context.setOffline(false);
+  await expect(page.locator(".offline-badge")).not.toBeVisible();
+});
+```
+
+### Testing Offline Fallback
+
+```typescript
+test("shows offline page for uncached routes", async ({ page, context }) => {
+  await page.goto("/pwa-app");
+  await page.evaluate(() => navigator.serviceWorker.ready);
+
+  // Go offline
+  await context.setOffline(true);
+
+  // Navigate to uncached page
+  await page.goto("/uncached-page");
+
+  // Should show offline fallback
+  await expect(page.getByText("You are offline")).toBeVisible();
+  await expect(page.getByRole("button", { name: "Retry" })).toBeVisible();
+});
+```
+
+### Testing Offline Form Submission
+
+```typescript
+test("queues form submission offline", async ({ page, context }) => {
+  await page.goto("/pwa-app/form");
+
+  // Go offline
+  await context.setOffline(true);
+
+  // Submit form
+  await page.getByLabel("Message").fill("Offline message");
+  await page.getByRole("button", { name: "Send" }).click();
+
+  // Should show queued status
+  await expect(page.getByText("Queued for sync")).toBeVisible();
+
+  // Go online
+  await context.setOffline(false);
+
+  // Trigger sync (or wait for automatic)
+  await page.evaluate(async () => {
+    const reg = await navigator.serviceWorker.ready;
+    // Manually trigger sync for testing
+    await (reg as any).sync?.register("form-sync");
+  });
+
+  // Verify submission completed
+  await expect(page.getByText("Message sent")).toBeVisible({ timeout: 10000 });
+});
+```
+
+## Push Notifications
+
+### Mocking Push Subscription
+
+```typescript
+test("handles push subscription", async ({ page, context }) => {
+  // Grant notification permission
+  await context.grantPermissions(["notifications"]);
+
+  await page.goto("/pwa-app");
+
+  // Subscribe to push
+  const subscription = await page.evaluate(async () => {
+    const reg = await navigator.serviceWorker.ready;
+    const sub = await reg.pushManager.subscribe({
+      userVisibleOnly: true,
+      applicationServerKey: "test-key",
+    });
+    return sub.toJSON();
+  });
+
+  expect(subscription.endpoint).toBeDefined();
+});
+```
+
+### Testing Push Message Handling
+
+```typescript
+test("handles push notification", async ({ context, page }) => {
+  await context.grantPermissions(["notifications"]);
+  await page.goto("/pwa-app");
+
+  // Wait for SW
+  const swPromise = context.waitForEvent("serviceworker");
+  const sw = await swPromise;
+
+  // Simulate push message to service worker
+  await sw.evaluate(async () => {
+    // Dispatch push event
+    const pushEvent = new PushEvent("push", {
+      data: new PushMessageData(
+        JSON.stringify({ title: "Test", body: "Push message" }),
+      ),
+    });
+    self.dispatchEvent(pushEvent);
+  });
+
+  // Note: Actual notification display testing is limited in Playwright
+  // Focus on verifying the SW handles the push correctly
+});
+```
+
+### Testing Notification Click
+
+```typescript
+test("notification click opens page", async ({ context, page }) => {
+  await context.grantPermissions(["notifications"]);
+  await page.goto("/pwa-app");
+
+  // Store notification URL target
+  let notificationUrl = "";
+
+  // Listen for new pages (notification click opens new page)
+  context.on("page", (newPage) => {
+    notificationUrl = newPage.url();
+  });
+
+  // Trigger notification via SW
+  await page.evaluate(async () => {
+    const reg = await navigator.serviceWorker.ready;
+    await reg.showNotification("Test", {
+      body: "Click me",
+      data: { url: "/notification-target" },
+    });
+  });
+
+  // Simulate clicking notification (via SW)
+  const sw = context.serviceWorkers()[0];
+  await sw.evaluate(() => {
+    self.dispatchEvent(
+      new NotificationEvent("notificationclick", {
+        notification: { data: { url: "/notification-target" } } as any,
+      }),
+    );
+  });
+
+  // Verify navigation occurred
+  await page.waitForTimeout(1000);
+  // Check if new page opened or current page navigated
+});
+```
+
+## Background Sync
+
+### Testing Background Sync Registration
+
+```typescript
+test("registers background sync", async ({ page }) => {
+  await page.goto("/pwa-app");
+
+  // Register sync
+  const syncRegistered = await page.evaluate(async () => {
+    const reg = await navigator.serviceWorker.ready;
+    if (!("sync" in reg)) return false;
+
+    await (reg as any).sync.register("my-sync");
+    return true;
+  });
+
+  expect(syncRegistered).toBe(true);
+});
+```
+
+### Testing Sync Event
+
+```typescript
+test("sync event fires when online", async ({ context, page }) => {
+  await page.goto("/pwa-app");
+
+  // Queue data while offline
+  await context.setOffline(true);
+
+  await page.evaluate(async () => {
+    // Store data in IndexedDB for sync
+    const db = await openDB();
+    await db.put("sync-queue", { id: 1, data: "test" });
+
+    // Register sync
+    const reg = await navigator.serviceWorker.ready;
+    await (reg as any).sync.register("data-sync");
+  });
+
+  // Track sync completion
+  await page.evaluate(() => {
+    window.syncCompleted = false;
+    navigator.serviceWorker.addEventListener("message", (e) => {
+      if (e.data.type === "SYNC_COMPLETE") {
+        window.syncCompleted = true;
+      }
+    });
+  });
+
+  // Go online
+  await context.setOffline(false);
+
+  // Wait for sync to complete
+  await page.waitForFunction(() => window.syncCompleted, { timeout: 10000 });
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                   | Problem                 | Solution                                     |
+| ------------------------------ | ----------------------- | -------------------------------------------- |
+| Not clearing SW between tests  | Tests affect each other | Unregister SW in beforeEach                  |
+| Not waiting for SW ready       | Race conditions         | Always await `navigator.serviceWorker.ready` |
+| Testing in isolation only      | Misses real SW behavior | Test with actual caching                     |
+| Hardcoded timeouts for caching | Flaky tests             | Wait for cache to populate                   |
+| Ignoring SW update cycle       | Missing update bugs     | Test install, activate, update flows         |
+
+## Related References
+
+- **Network Failures**: See [error-testing.md](error-testing.md#offline-testing) for unexpected network failure patterns
+- **Browser APIs**: See [browser-apis.md](browser-apis.md) for permissions
+- **Network Mocking**: See [network-advanced.md](../advanced/network-advanced.md) for network interception
+- **Browser Extensions**: See [browser-extensions.md](../testing-patterns/browser-extensions.md) for extension service worker patterns
diff --git a/.cursor/skills/playwright-testing/browser-apis/websockets.md b/.cursor/skills/playwright-testing/browser-apis/websockets.md
new file mode 100644
index 0000000000..075a997395
--- /dev/null
+++ b/.cursor/skills/playwright-testing/browser-apis/websockets.md
@@ -0,0 +1,403 @@
+# WebSocket & Real-Time Testing
+
+## Table of Contents
+
+1. [WebSocket Basics](#websocket-basics)
+2. [Mocking WebSocket Messages](#mocking-websocket-messages)
+3. [Testing Real-Time Features](#testing-real-time-features)
+4. [Server-Sent Events](#server-sent-events)
+5. [Reconnection Testing](#reconnection-testing)
+
+## WebSocket Basics
+
+### Wait for WebSocket Connection
+
+```typescript
+test("chat connects via websocket", async ({ page }) => {
+  // Listen for WebSocket connection
+  const wsPromise = page.waitForEvent("websocket");
+
+  await page.goto("/chat");
+
+  const ws = await wsPromise;
+  expect(ws.url()).toContain("/ws/chat");
+
+  // Wait for connection to be established
+  await ws.waitForEvent("framesent");
+});
+```
+
+### Monitor WebSocket Messages
+
+```typescript
+test("receives real-time updates", async ({ page }) => {
+  const messages: string[] = [];
+
+  // Set up listener before navigation
+  page.on("websocket", (ws) => {
+    ws.on("framereceived", (frame) => {
+      messages.push(frame.payload as string);
+    });
+  });
+
+  await page.goto("/dashboard");
+
+  // Wait for some messages
+  await expect.poll(() => messages.length).toBeGreaterThan(0);
+
+  // Verify message format
+  const data = JSON.parse(messages[0]);
+  expect(data).toHaveProperty("type");
+});
+```
+
+### Capture Sent Messages
+
+```typescript
+test("sends correct message format", async ({ page }) => {
+  const sentMessages: string[] = [];
+
+  page.on("websocket", (ws) => {
+    ws.on("framesent", (frame) => {
+      sentMessages.push(frame.payload as string);
+    });
+  });
+
+  await page.goto("/chat");
+  await page.getByLabel("Message").fill("Hello!");
+  await page.getByRole("button", { name: "Send" }).click();
+
+  // Verify sent message
+  await expect.poll(() => sentMessages.length).toBeGreaterThan(0);
+
+  const sent = JSON.parse(sentMessages[sentMessages.length - 1]);
+  expect(sent).toEqual({
+    type: "message",
+    content: "Hello!",
+  });
+});
+```
+
+## Mocking WebSocket Messages
+
+### Inject Messages via Page Evaluate
+
+```typescript
+test("displays incoming chat message", async ({ page }) => {
+  await page.goto("/chat");
+
+  // Wait for WebSocket to be ready
+  await page.waitForFunction(
+    () => (window as any).chatSocket?.readyState === 1,
+  );
+
+  // Simulate incoming message
+  await page.evaluate(() => {
+    const event = new MessageEvent("message", {
+      data: JSON.stringify({
+        type: "message",
+        from: "Alice",
+        content: "Hello there!",
+      }),
+    });
+    (window as any).chatSocket.dispatchEvent(event);
+  });
+
+  await expect(page.getByText("Alice: Hello there!")).toBeVisible();
+});
+```
+
+### Mock WebSocket with Route Handler
+
+```typescript
+test("mock websocket entirely", async ({ page, context }) => {
+  // Intercept the WebSocket upgrade
+  await context.route("**/ws/**", async (route) => {
+    // For WebSocket routes, we can't fulfill directly
+    // Instead, use page.evaluate to mock the client-side
+  });
+
+  // Alternative: Mock at application level
+  await page.addInitScript(() => {
+    const OriginalWebSocket = window.WebSocket;
+    (window as any).WebSocket = function (url: string) {
+      const ws = {
+        readyState: 1,
+        send: (data: string) => {
+          console.log("WS Send:", data);
+        },
+        close: () => {},
+        addEventListener: () => {},
+        removeEventListener: () => {},
+      };
+      setTimeout(() => ws.onopen?.(), 100);
+      return ws;
+    };
+  });
+
+  await page.goto("/chat");
+});
+```
+
+### WebSocket Mock Fixture
+
+```typescript
+// fixtures/websocket.fixture.ts
+import { test as base, Page } from "@playwright/test";
+
+type WsMessage = { type: string; [key: string]: any };
+
+type WebSocketFixtures = {
+  mockWebSocket: {
+    injectMessage: (message: WsMessage) => Promise<void>;
+    getSentMessages: () => Promise<WsMessage[]>;
+  };
+};
+
+export const test = base.extend<WebSocketFixtures>({
+  mockWebSocket: async ({ page }, use) => {
+    const sentMessages: WsMessage[] = [];
+
+    // Capture sent messages
+    await page.addInitScript(() => {
+      (window as any).__wsSent = [];
+      const OriginalWebSocket = window.WebSocket;
+      window.WebSocket = function (url: string) {
+        const ws = new OriginalWebSocket(url);
+        const originalSend = ws.send.bind(ws);
+        ws.send = (data: string) => {
+          (window as any).__wsSent.push(JSON.parse(data));
+          originalSend(data);
+        };
+        (window as any).__ws = ws;
+        return ws;
+      } as any;
+    });
+
+    await use({
+      injectMessage: async (message) => {
+        await page.evaluate((msg) => {
+          const event = new MessageEvent("message", {
+            data: JSON.stringify(msg),
+          });
+          (window as any).__ws?.dispatchEvent(event);
+        }, message);
+      },
+      getSentMessages: async () => {
+        return page.evaluate(() => (window as any).__wsSent || []);
+      },
+    });
+  },
+});
+
+// Usage
+test("chat with mocked websocket", async ({ page, mockWebSocket }) => {
+  await page.goto("/chat");
+
+  // Inject incoming message
+  await mockWebSocket.injectMessage({
+    type: "message",
+    from: "Bob",
+    content: "Hi!",
+  });
+
+  await expect(page.getByText("Bob: Hi!")).toBeVisible();
+
+  // Send a reply
+  await page.getByLabel("Message").fill("Hello Bob!");
+  await page.getByRole("button", { name: "Send" }).click();
+
+  // Verify sent message
+  const sent = await mockWebSocket.getSentMessages();
+  expect(sent).toContainEqual(
+    expect.objectContaining({ content: "Hello Bob!" }),
+  );
+});
+```
+
+## Testing Real-Time Features
+
+### Live Notifications
+
+```typescript
+test("displays live notification", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  // Simulate notification via WebSocket
+  await page.evaluate(() => {
+    const event = new MessageEvent("message", {
+      data: JSON.stringify({
+        type: "notification",
+        title: "New Order",
+        message: "Order #123 received",
+      }),
+    });
+    (window as any).notificationSocket.dispatchEvent(event);
+  });
+
+  await expect(page.getByRole("alert")).toContainText("Order #123 received");
+});
+```
+
+### Live Data Updates
+
+```typescript
+test("updates stock price in real-time", async ({ page }) => {
+  await page.goto("/stocks/AAPL");
+
+  const priceElement = page.getByTestId("stock-price");
+  const initialPrice = await priceElement.textContent();
+
+  // Simulate price update
+  await page.evaluate(() => {
+    const event = new MessageEvent("message", {
+      data: JSON.stringify({
+        type: "price_update",
+        symbol: "AAPL",
+        price: 150.25,
+      }),
+    });
+    (window as any).stockSocket.dispatchEvent(event);
+  });
+
+  await expect(priceElement).not.toHaveText(initialPrice!);
+  await expect(priceElement).toContainText("150.25");
+});
+```
+
+### Collaborative Editing
+
+```typescript
+test("shows collaborator cursor", async ({ page }) => {
+  await page.goto("/document/123");
+
+  // Simulate another user's cursor position
+  await page.evaluate(() => {
+    const event = new MessageEvent("message", {
+      data: JSON.stringify({
+        type: "cursor",
+        userId: "user-456",
+        userName: "Alice",
+        position: { x: 100, y: 200 },
+      }),
+    });
+    (window as any).docSocket.dispatchEvent(event);
+  });
+
+  await expect(page.getByTestId("cursor-user-456")).toBeVisible();
+  await expect(page.getByText("Alice")).toBeVisible();
+});
+```
+
+## Server-Sent Events
+
+### Test SSE Updates
+
+```typescript
+test("receives SSE updates", async ({ page }) => {
+  // Mock SSE endpoint
+  await page.route("**/api/events", (route) => {
+    route.fulfill({
+      status: 200,
+      headers: {
+        "Content-Type": "text/event-stream",
+        "Cache-Control": "no-cache",
+        Connection: "keep-alive",
+      },
+      body: `data: {"type":"update","value":42}\n\n`,
+    });
+  });
+
+  await page.goto("/live-data");
+
+  await expect(page.getByTestId("value")).toHaveText("42");
+});
+```
+
+### Simulate Multiple SSE Events
+
+```typescript
+test("handles multiple SSE events", async ({ page }) => {
+  await page.route("**/api/events", async (route) => {
+    const encoder = new TextEncoder();
+    const events = [
+      `data: {"count":1}\n\n`,
+      `data: {"count":2}\n\n`,
+      `data: {"count":3}\n\n`,
+    ];
+
+    route.fulfill({
+      status: 200,
+      headers: { "Content-Type": "text/event-stream" },
+      body: events.join(""),
+    });
+  });
+
+  await page.goto("/counter");
+
+  // Should receive all events
+  await expect(page.getByTestId("count")).toHaveText("3");
+});
+```
+
+## Reconnection Testing
+
+### Test Connection Loss
+
+```typescript
+test("handles connection loss gracefully", async ({ page }) => {
+  await page.goto("/chat");
+
+  // Simulate connection close
+  await page.evaluate(() => {
+    (window as any).chatSocket.close();
+  });
+
+  // Should show disconnected state
+  await expect(page.getByText("Reconnecting...")).toBeVisible();
+});
+```
+
+### Test Reconnection
+
+```typescript
+test("reconnects after connection loss", async ({ page }) => {
+  await page.goto("/chat");
+
+  // Simulate disconnect
+  await page.evaluate(() => {
+    (window as any).chatSocket.close();
+  });
+
+  await expect(page.getByText("Reconnecting...")).toBeVisible();
+
+  // Simulate reconnection
+  await page.evaluate(() => {
+    const event = new Event("open");
+    (window as any).chatSocket = { readyState: 1 };
+    (window as any).chatSocket.dispatchEvent?.(event);
+  });
+
+  // Force component to re-check connection
+  await page.evaluate(() => {
+    window.dispatchEvent(new Event("online"));
+  });
+
+  await expect(page.getByText("Connected")).toBeVisible();
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                          | Problem                       | Solution                           |
+| ------------------------------------- | ----------------------------- | ---------------------------------- |
+| Not waiting for WebSocket ready       | Messages sent too early       | Wait for `readyState === 1`        |
+| Testing against real WebSocket server | Flaky, timing-dependent       | Mock WebSocket messages            |
+| Ignoring connection state             | Tests pass but feature broken | Test connected/disconnected states |
+| No cleanup of listeners               | Memory leaks in tests         | Clean up event listeners           |
+
+## Related References
+
+- **Network**: See [network-advanced.md](../advanced/network-advanced.md) for HTTP mocking patterns
+- **Assertions**: See [assertions-waiting.md](../core/assertions-waiting.md) for polling patterns
+- **Multi-User**: See [multi-user.md](../advanced/multi-user.md) for real-time collaboration testing with multiple users
diff --git a/.cursor/skills/playwright-testing/core/annotations.md b/.cursor/skills/playwright-testing/core/annotations.md
new file mode 100644
index 0000000000..ac0f890a5e
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/annotations.md
@@ -0,0 +1,424 @@
+# Test Annotations & Organization
+
+## Table of Contents
+
+1. [Skip Annotations](#skip-annotations)
+2. [Fixme & Fail Annotations](#fixme--fail-annotations)
+3. [Slow Tests](#slow-tests)
+4. [Test Steps](#test-steps)
+5. [Custom Annotations](#custom-annotations)
+6. [Conditional Annotations](#conditional-annotations)
+
+## Skip Annotations
+
+### Basic Skip
+
+```typescript
+// Skip unconditionally
+test.skip("feature not implemented", async ({ page }) => {
+  // This test won't run
+});
+
+// Skip with reason
+test("payment flow", async ({ page }) => {
+  test.skip(true, "Payment gateway in maintenance");
+  // Test body won't execute
+});
+```
+
+### Conditional Skip
+
+```typescript
+test("webkit-specific feature", async ({ page, browserName }) => {
+  test.skip(browserName !== "webkit", "This feature only works in WebKit");
+
+  await page.goto("/webkit-feature");
+});
+
+test("production only", async ({ page }) => {
+  test.skip(process.env.ENV !== "production", "Only runs against production");
+
+  await page.goto("/prod-feature");
+});
+```
+
+### Skip by Platform
+
+```typescript
+test("windows-specific", async ({ page }) => {
+  test.skip(process.platform !== "win32", "Windows only");
+});
+
+test("not on CI", async ({ page }) => {
+  test.skip(!!process.env.CI, "Skipped in CI environment");
+});
+```
+
+### Skip Describe Block
+
+```typescript
+test.describe("Admin features", () => {
+  test.skip(
+    ({ browserName }) => browserName === "firefox",
+    "Firefox admin bug",
+  );
+
+  test("admin dashboard", async ({ page }) => {
+    // Skipped in Firefox
+  });
+
+  test("admin settings", async ({ page }) => {
+    // Skipped in Firefox
+  });
+});
+```
+
+## Fixme & Fail Annotations
+
+### Fixme - Known Issues
+
+```typescript
+// Mark test as needing fix (skips the test)
+test.fixme("broken after refactor", async ({ page }) => {
+  // Test won't run but is tracked
+});
+
+// Conditional fixme
+test("flaky on CI", async ({ page }) => {
+  test.fixme(!!process.env.CI, "Investigate CI flakiness - ticket #123");
+
+  await page.goto("/flaky-feature");
+});
+```
+
+### Fail - Expected Failures
+
+```typescript
+// Test is expected to fail (runs but expects failure)
+test("known bug", async ({ page }) => {
+  test.fail();
+
+  await page.goto("/buggy-page");
+  // If this passes, the test fails (bug was fixed!)
+  await expect(page.getByText("Working")).toBeVisible();
+});
+
+// Conditional fail
+test("fails on webkit", async ({ page, browserName }) => {
+  test.fail(browserName === "webkit", "WebKit rendering bug #456");
+
+  await page.goto("/render-test");
+  await expect(page.getByTestId("element")).toHaveCSS("width", "100px");
+});
+```
+
+### Difference Between Skip, Fixme, Fail
+
+| Annotation     | Runs? | Use Case                         |
+| -------------- | ----- | -------------------------------- |
+| `test.skip()`  | No    | Feature not applicable           |
+| `test.fixme()` | No    | Known bug, needs investigation   |
+| `test.fail()`  | Yes   | Expected to fail, tracking a bug |
+
+## Slow Tests
+
+### Mark Slow Tests
+
+```typescript
+// Triple the default timeout
+test("large data import", async ({ page }) => {
+  test.slow();
+
+  await page.goto("/import");
+  await page.setInputFiles("#file", "large-file.csv");
+  await page.getByRole("button", { name: "Import" }).click();
+
+  await expect(page.getByText("Import complete")).toBeVisible();
+});
+
+// Conditional slow
+test("video processing", async ({ page, browserName }) => {
+  test.slow(browserName === "webkit", "WebKit video processing is slow");
+
+  await page.goto("/video-editor");
+});
+```
+
+### Custom Timeout
+
+```typescript
+test("very long operation", async ({ page }) => {
+  // Set specific timeout (in milliseconds)
+  test.setTimeout(120000); // 2 minutes
+
+  await page.goto("/long-operation");
+});
+
+// Timeout for describe block
+test.describe("Integration tests", () => {
+  test.describe.configure({ timeout: 60000 });
+
+  test("test 1", async ({ page }) => {
+    // Has 60 second timeout
+  });
+});
+```
+
+## Test Steps
+
+### Basic Steps
+
+```typescript
+test("checkout flow", async ({ page }) => {
+  await test.step("Add item to cart", async () => {
+    await page.goto("/products");
+    await page.getByRole("button", { name: "Add to Cart" }).click();
+  });
+
+  await test.step("Go to checkout", async () => {
+    await page.getByRole("link", { name: "Cart" }).click();
+    await page.getByRole("button", { name: "Checkout" }).click();
+  });
+
+  await test.step("Fill shipping info", async () => {
+    await page.getByLabel("Address").fill("123 Test St");
+    await page.getByLabel("City").fill("Test City");
+  });
+
+  await test.step("Complete payment", async () => {
+    await page.getByLabel("Card").fill("4242424242424242");
+    await page.getByRole("button", { name: "Pay" }).click();
+  });
+
+  await expect(page.getByText("Order confirmed")).toBeVisible();
+});
+```
+
+### Nested Steps
+
+```typescript
+test("user registration", async ({ page }) => {
+  await test.step("Fill registration form", async () => {
+    await page.goto("/register");
+
+    await test.step("Personal info", async () => {
+      await page.getByLabel("Name").fill("John Doe");
+      await page.getByLabel("Email").fill("john@example.com");
+    });
+
+    await test.step("Security", async () => {
+      await page.getByLabel("Password").fill("SecurePass123");
+      await page.getByLabel("Confirm Password").fill("SecurePass123");
+    });
+  });
+
+  await test.step("Submit and verify", async () => {
+    await page.getByRole("button", { name: "Register" }).click();
+    await expect(page.getByText("Welcome")).toBeVisible();
+  });
+});
+```
+
+### Steps with Return Values
+
+```typescript
+test("verify order", async ({ page }) => {
+  const orderId = await test.step("Create order", async () => {
+    await page.goto("/checkout");
+    await page.getByRole("button", { name: "Place Order" }).click();
+
+    // Return value from step
+    return await page.getByTestId("order-id").textContent();
+  });
+
+  await test.step("Verify order details", async () => {
+    await page.goto(`/orders/${orderId}`);
+    await expect(page.getByText(`Order #${orderId}`)).toBeVisible();
+  });
+});
+```
+
+### Step in Page Object
+
+```typescript
+// pages/checkout.page.ts
+export class CheckoutPage {
+  async fillShippingInfo(address: string, city: string) {
+    await test.step("Fill shipping information", async () => {
+      await this.page.getByLabel("Address").fill(address);
+      await this.page.getByLabel("City").fill(city);
+    });
+  }
+
+  async completePayment(cardNumber: string) {
+    await test.step("Complete payment", async () => {
+      await this.page.getByLabel("Card").fill(cardNumber);
+      await this.page.getByRole("button", { name: "Pay" }).click();
+    });
+  }
+}
+```
+
+## Custom Annotations
+
+### Add Annotations
+
+```typescript
+test("important feature", async ({ page }, testInfo) => {
+  // Add custom annotation
+  testInfo.annotations.push({
+    type: "priority",
+    description: "high",
+  });
+
+  testInfo.annotations.push({
+    type: "ticket",
+    description: "JIRA-123",
+  });
+
+  await page.goto("/feature");
+});
+```
+
+### Annotation Fixture
+
+```typescript
+// fixtures/annotations.fixture.ts
+import { test as base, TestInfo } from "@playwright/test";
+
+type AnnotationFixtures = {
+  annotate: {
+    ticket: (id: string) => void;
+    priority: (level: "low" | "medium" | "high") => void;
+    owner: (name: string) => void;
+  };
+};
+
+export const test = base.extend<AnnotationFixtures>({
+  annotate: async ({}, use, testInfo) => {
+    await use({
+      ticket: (id) => {
+        testInfo.annotations.push({ type: "ticket", description: id });
+      },
+      priority: (level) => {
+        testInfo.annotations.push({ type: "priority", description: level });
+      },
+      owner: (name) => {
+        testInfo.annotations.push({ type: "owner", description: name });
+      },
+    });
+  },
+});
+
+// Usage
+test("critical feature", async ({ page, annotate }) => {
+  annotate.ticket("JIRA-456");
+  annotate.priority("high");
+  annotate.owner("Alice");
+
+  await page.goto("/critical");
+});
+```
+
+### Read Annotations in Reporter
+
+```typescript
+// reporters/annotation-reporter.ts
+import { Reporter, TestCase, TestResult } from "@playwright/test/reporter";
+
+class AnnotationReporter implements Reporter {
+  onTestEnd(test: TestCase, result: TestResult) {
+    const ticket = test.annotations.find((a) => a.type === "ticket");
+    const priority = test.annotations.find((a) => a.type === "priority");
+
+    if (ticket) {
+      console.log(`Test linked to: ${ticket.description}`);
+    }
+
+    if (priority?.description === "high" && result.status === "failed") {
+      console.log(`HIGH PRIORITY FAILURE: ${test.title}`);
+    }
+  }
+}
+
+export default AnnotationReporter;
+```
+
+## Conditional Annotations
+
+### Annotation Helper
+
+```typescript
+// helpers/test-annotations.ts
+import { test } from "@playwright/test";
+
+export function skipInCI(reason = "Skipped in CI") {
+  test.skip(!!process.env.CI, reason);
+}
+
+export function skipInBrowser(browser: string, reason: string) {
+  test.beforeEach(({ browserName }) => {
+    test.skip(browserName === browser, reason);
+  });
+}
+
+export function onlyInEnv(env: string) {
+  test.skip(process.env.ENV !== env, `Only runs in ${env}`);
+}
+```
+
+```typescript
+// tests/feature.spec.ts
+import { skipInCI, onlyInEnv } from "../helpers/test-annotations";
+
+test("local only feature", async ({ page }) => {
+  skipInCI("Uses local resources");
+
+  await page.goto("/local-feature");
+});
+
+test("production check", async ({ page }) => {
+  onlyInEnv("production");
+
+  await page.goto("/prod-only");
+});
+```
+
+### Describe-Level Conditions
+
+```typescript
+test.describe("Mobile features", () => {
+  test.beforeEach(({ isMobile }) => {
+    test.skip(!isMobile, "Mobile only tests");
+  });
+
+  test("touch gestures", async ({ page }) => {
+    // Only runs on mobile
+  });
+});
+
+test.describe("Desktop features", () => {
+  test.beforeEach(({ isMobile }) => {
+    test.skip(isMobile, "Desktop only tests");
+  });
+
+  test("hover interactions", async ({ page }) => {
+    // Only runs on desktop
+  });
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                | Problem                | Solution                         |
+| --------------------------- | ---------------------- | -------------------------------- |
+| Skipping without reason     | Hard to track why      | Always provide description       |
+| Too many skipped tests      | Test debt accumulates  | Review and clean up regularly    |
+| Using skip instead of fixme | Loses intent           | Use fixme for bugs, skip for N/A |
+| Not using steps             | Hard to debug failures | Group logical actions in steps   |
+
+## Related References
+
+- **Test Tags**: See [test-tags.md](test-tags.md) for tagging and filtering tests with `--grep`
+- **Test Organization**: See [test-suite-structure.md](test-suite-structure.md) for structuring tests
+- **Debugging**: See [debugging.md](../debugging/debugging.md) for troubleshooting
diff --git a/.cursor/skills/playwright-testing/core/assertions-waiting.md b/.cursor/skills/playwright-testing/core/assertions-waiting.md
new file mode 100644
index 0000000000..bd03dd8a4d
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/assertions-waiting.md
@@ -0,0 +1,361 @@
+# Assertions & Waiting
+
+## Table of Contents
+
+1. [Web-First Assertions](#web-first-assertions)
+2. [Generic Assertions](#generic-assertions)
+3. [Soft Assertions](#soft-assertions)
+4. [Waiting Strategies](#waiting-strategies)
+5. [Polling & Retrying](#polling--retrying)
+6. [Custom Matchers](#custom-matchers)
+
+## Web-First Assertions
+
+Auto-retry until condition is met or timeout. Always prefer these over generic assertions.
+
+### Locator Assertions
+
+```typescript
+import { expect } from "@playwright/test";
+
+// Visibility
+await expect(page.getByRole("button")).toBeVisible();
+await expect(page.getByRole("button")).toBeHidden();
+await expect(page.getByRole("button")).not.toBeVisible();
+
+// Enabled/Disabled
+await expect(page.getByRole("button")).toBeEnabled();
+await expect(page.getByRole("button")).toBeDisabled();
+
+// Text content
+await expect(page.getByRole("heading")).toHaveText("Welcome");
+await expect(page.getByRole("heading")).toHaveText(/welcome/i);
+await expect(page.getByRole("heading")).toContainText("Welcome");
+
+// Count
+await expect(page.getByRole("listitem")).toHaveCount(5);
+
+// Attributes
+await expect(page.getByRole("link")).toHaveAttribute("href", "/home");
+await expect(page.getByRole("img")).toHaveAttribute("alt", /logo/i);
+
+// CSS
+await expect(page.getByRole("button")).toHaveClass(/primary/);
+await expect(page.getByRole("button")).toHaveCSS("color", "rgb(0, 0, 255)");
+
+// Input values
+await expect(page.getByLabel("Email")).toHaveValue("user@example.com");
+await expect(page.getByLabel("Email")).toBeEmpty();
+
+// Focus
+await expect(page.getByLabel("Email")).toBeFocused();
+
+// Checked state
+await expect(page.getByRole("checkbox")).toBeChecked();
+await expect(page.getByRole("checkbox")).not.toBeChecked();
+
+// Editable state
+await expect(page.getByLabel("Name")).toBeEditable();
+```
+
+### Page Assertions
+
+```typescript
+// URL
+await expect(page).toHaveURL("/dashboard");
+await expect(page).toHaveURL(/\/dashboard/);
+
+// Title
+await expect(page).toHaveTitle("Dashboard - MyApp");
+await expect(page).toHaveTitle(/dashboard/i);
+```
+
+### Response Assertions
+
+```typescript
+const response = await page.request.get("/api/users");
+await expect(response).toBeOK();
+await expect(response).not.toBeOK();
+```
+
+## Generic Assertions
+
+Use for non-UI values. Do NOT retry - execute immediately.
+
+```typescript
+// Equality
+expect(value).toBe(5);
+expect(object).toEqual({ name: "Test" });
+expect(array).toContain("item");
+
+// Truthiness
+expect(value).toBeTruthy();
+expect(value).toBeFalsy();
+expect(value).toBeNull();
+expect(value).toBeUndefined();
+expect(value).toBeDefined();
+
+// Numbers
+expect(value).toBeGreaterThan(5);
+expect(value).toBeLessThanOrEqual(10);
+expect(value).toBeCloseTo(5.5, 1);
+
+// Strings
+expect(string).toMatch(/pattern/);
+expect(string).toContain("substring");
+
+// Arrays/Objects
+expect(array).toHaveLength(3);
+expect(object).toHaveProperty("key", "value");
+
+// Exceptions
+expect(() => fn()).toThrow();
+expect(() => fn()).toThrow("error message");
+await expect(asyncFn()).rejects.toThrow();
+```
+
+## Soft Assertions
+
+Continue test execution after failure, report all failures at end.
+
+```typescript
+test("check multiple elements", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  // Won't stop on first failure
+  await expect.soft(page.getByRole("heading")).toHaveText("Dashboard");
+  await expect.soft(page.getByRole("button", { name: "Save" })).toBeEnabled();
+  await expect.soft(page.getByText("Welcome")).toBeVisible();
+
+  // Test continues; all failures reported at end
+});
+```
+
+### Soft Assertions with Early Exit
+
+```typescript
+test("check form", async ({ page }) => {
+  await expect.soft(page.getByRole("form")).toBeVisible();
+
+  // Exit early if form not visible (pointless to check fields)
+  if (expect.soft.hasFailures()) {
+    return;
+  }
+
+  await expect.soft(page.getByLabel("Name")).toBeVisible();
+  await expect.soft(page.getByLabel("Email")).toBeVisible();
+});
+```
+
+## Waiting Strategies
+
+### Auto-Waiting (Default)
+
+Actions automatically wait for:
+
+- Element to be attached to DOM
+- Element to be visible
+- Element to be stable (no animations)
+- Element to be enabled
+- Element to receive events
+
+```typescript
+// These auto-wait
+await page.click("button");
+await page.fill("input", "text");
+await page.getByRole("button").click();
+```
+
+### Wait for Navigation
+
+```typescript
+// Wait for URL change
+await page.waitForURL("/dashboard");
+await page.waitForURL(/\/dashboard/);
+
+// Wait for navigation after action
+await Promise.all([
+  page.waitForURL("**/dashboard"),
+  page.click('a[href="/dashboard"]'),
+]);
+
+// Or without Promise.all
+const urlPromise = page.waitForURL("**/dashboard");
+await page.click("a");
+await urlPromise;
+```
+
+### Wait for Network
+
+```typescript
+// Wait for specific response
+const responsePromise = page.waitForResponse("**/api/users");
+await page.click("button");
+const response = await responsePromise;
+expect(response.status()).toBe(200);
+
+// Wait for request
+const requestPromise = page.waitForRequest("**/api/submit");
+await page.click("button");
+const request = await requestPromise;
+
+// Wait for no network activity
+await page.waitForLoadState("networkidle");
+```
+
+### Wait for Element State
+
+```typescript
+// Wait for element to appear
+await page.getByRole("dialog").waitFor({ state: "visible" });
+
+// Wait for element to disappear
+await page.getByText("Loading...").waitFor({ state: "hidden" });
+
+// Wait for element to be attached
+await page.getByTestId("result").waitFor({ state: "attached" });
+
+// Wait for element to be detached
+await page.getByTestId("modal").waitFor({ state: "detached" });
+```
+
+### Wait for Function
+
+```typescript
+// Wait for arbitrary condition
+await page.waitForFunction(() => {
+  return document.querySelector(".loaded") !== null;
+});
+
+// With arguments
+await page.waitForFunction(
+  (selector) => document.querySelector(selector)?.textContent === "Ready",
+  ".status",
+);
+```
+
+## Polling & Retrying
+
+### toPass() for Polling
+
+Retry until block passes or times out:
+
+```typescript
+await expect(async () => {
+  const response = await page.request.get("/api/status");
+  expect(response.status()).toBe(200);
+
+  const data = await response.json();
+  expect(data.ready).toBe(true);
+}).toPass({
+  intervals: [1000, 2000, 5000], // Retry intervals
+  timeout: 30000,
+});
+```
+
+### expect.poll()
+
+Poll a function until assertion passes:
+
+```typescript
+// Poll API until condition met
+await expect
+  .poll(
+    async () => {
+      const response = await page.request.get("/api/job/123");
+      return (await response.json()).status;
+    },
+    {
+      intervals: [1000, 2000, 5000],
+      timeout: 30000,
+    },
+  )
+  .toBe("completed");
+
+// Poll DOM value
+await expect.poll(() => page.getByTestId("counter").textContent()).toBe("10");
+```
+
+## Custom Matchers
+
+```typescript
+// playwright.config.ts or fixtures
+import { expect } from "@playwright/test";
+
+expect.extend({
+  async toHaveDataLoaded(page: Page) {
+    const locator = page.getByTestId("data-container");
+    let pass = false;
+    let message = "";
+
+    try {
+      await expect(locator).toBeVisible();
+      await expect(locator).not.toContainText("Loading");
+      pass = true;
+    } catch (e) {
+      message = `Expected data to be loaded but found loading state`;
+    }
+
+    return { pass, message: () => message };
+  },
+});
+
+// Extend TypeScript types
+declare global {
+  namespace PlaywrightTest {
+    interface Matchers<R> {
+      toHaveDataLoaded(): Promise<R>;
+    }
+  }
+}
+
+// Usage
+await expect(page).toHaveDataLoaded();
+```
+
+## Timeouts
+
+### Configure Timeouts
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  timeout: 30000, // Test timeout
+  expect: {
+    timeout: 5000, // Assertion timeout
+  },
+});
+
+// Per-test timeout
+test("long test", async ({ page }) => {
+  test.setTimeout(60000);
+  // ...
+});
+
+// Per-assertion timeout
+await expect(page.getByRole("button")).toBeVisible({ timeout: 10000 });
+```
+
+## Best Practices
+
+| Do                             | Don't                          |
+| ------------------------------ | ------------------------------ |
+| Use web-first assertions       | Use generic assertions for DOM |
+| Let auto-waiting work          | Add unnecessary explicit waits |
+| Use `toPass()` for polling     | Write manual retry loops       |
+| Configure appropriate timeouts | Use `waitForTimeout()`         |
+| Check specific conditions      | Wait for arbitrary time        |
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                                              | Problem                       | Solution                                     |
+| --------------------------------------------------------- | ----------------------------- | -------------------------------------------- |
+| `await page.waitForTimeout(5000)`                         | Slow, flaky, arbitrary timing | Use auto-waiting or `waitForResponse`        |
+| `await new Promise(resolve => setTimeout(resolve, 1000))` | Same as above                 | Use `waitForResponse` or element state waits |
+| Generic assertions on DOM elements                        | No auto-retry, flaky          | Use web-first assertions with `expect()`     |
+
+## Related References
+
+- **Debugging timeout issues**: See [debugging.md](../debugging/debugging.md) for troubleshooting
+- **Fixing flaky tests**: See [debugging.md](../debugging/debugging.md) for race condition solutions
+- **Network interception**: See [test-suite-structure.md](test-suite-structure.md) for API mocking
diff --git a/.cursor/skills/playwright-testing/core/configuration.md b/.cursor/skills/playwright-testing/core/configuration.md
new file mode 100644
index 0000000000..66b9d33849
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/configuration.md
@@ -0,0 +1,452 @@
+# Playwright Configuration
+
+## Table of Contents
+
+1. [CLI Quick Reference](#cli-quick-reference)
+2. [Decision Guide](#decision-guide)
+3. [Production-Ready Config](#production-ready-config)
+4. [Patterns](#patterns)
+5. [Anti-Patterns](#anti-patterns)
+6. [Troubleshooting](#troubleshooting)
+7. [Related](#related)
+
+> **When to use**: Setting up a new project, adjusting timeouts, adding browser targets, configuring CI behavior, or managing environment-specific settings.
+
+## CLI Quick Reference
+
+```bash
+npx playwright init                           # scaffold config + first test
+npx playwright test --config=custom.config.ts # use alternate config
+npx playwright test --project=chromium        # run single project
+npx playwright test --reporter=html           # override reporter
+npx playwright test --grep @smoke             # run tests tagged @smoke
+npx playwright test --grep-invert @slow       # exclude @slow tests
+npx playwright show-report                    # open last HTML report
+DEBUG=pw:api npx playwright test              # verbose logging
+```
+
+## Decision Guide
+
+### Timeout Selection
+
+| Symptom | Setting | Default | Recommended |
+|---------|---------|---------|-------------|
+| Test takes too long overall | `timeout` | 30s | 30-60s (max 120s) |
+| Assertion retries too long/short | `expect.timeout` | 5s | 5-10s |
+| `page.goto()` or `waitForURL()` times out | `navigationTimeout` | 30s | 10-30s |
+| `click()`, `fill()` time out | `actionTimeout` | 0 (unlimited) | 10-15s |
+| Dev server slow to start | `webServer.timeout` | 60s | 60-180s |
+
+### Server Management
+
+| Scenario | Approach |
+|----------|----------|
+| App in same repo | `webServer` with `reuseExistingServer: !process.env.CI` |
+| Separate repos | Manual start or Docker Compose |
+| Testing deployed environment | No `webServer`; set `baseURL` via env |
+| Multiple services | Array of `webServer` entries |
+
+### Single vs Multi-Project
+
+| Scenario | Approach |
+|----------|----------|
+| Early development | Single project (chromium only) |
+| Pre-release validation | Multi-project: chromium + firefox + webkit |
+| Mobile-responsive app | Add mobile projects alongside desktop |
+| Auth + non-auth tests | Setup project with dependencies |
+| Tight CI budget | Chromium on PRs; all browsers on main |
+
+### globalSetup vs Setup Projects vs Fixtures
+
+| Need | Use |
+|------|-----|
+| One-time DB seed | `globalSetup` |
+| Shared browser auth | Setup project with `dependencies` |
+| Per-test isolated state | Custom fixture via `test.extend()` |
+| Cleanup after all tests | `globalTeardown` |
+
+## Production-Ready Config
+
+```ts
+// playwright.config.ts
+import { defineConfig, devices } from '@playwright/test';
+import dotenv from 'dotenv';
+import path from 'path';
+
+dotenv.config({ path: path.resolve(__dirname, '.env') });
+
+export default defineConfig({
+  testDir: './e2e',
+  testMatch: '**/*.spec.ts',
+
+  fullyParallel: true,
+  forbidOnly: !!process.env.CI,
+  retries: process.env.CI ? 2 : 0,
+  workers: process.env.CI ? '50%' : undefined,
+
+  reporter: process.env.CI
+    ? [['html', { open: 'never' }], ['github']]
+    : [['html', { open: 'on-failure' }]],
+
+  timeout: 30_000,
+  expect: { timeout: 5_000 },
+
+  use: {
+    baseURL: process.env.BASE_URL || 'http://localhost:4000',
+    actionTimeout: 10_000,
+    navigationTimeout: 15_000,
+    trace: 'on-first-retry',
+    screenshot: 'only-on-failure',
+    video: 'retain-on-failure',
+    locale: 'en-US',
+    timezoneId: 'America/Los_Angeles',
+  },
+
+  projects: [
+    { name: 'chromium', use: { ...devices['Desktop Chrome'] } },
+    { name: 'firefox', use: { ...devices['Desktop Firefox'] } },
+    { name: 'webkit', use: { ...devices['Desktop Safari'] } },
+    { name: 'mobile-chrome', use: { ...devices['Pixel 7'] } },
+    { name: 'mobile-safari', use: { ...devices['iPhone 14'] } },
+  ],
+
+  webServer: {
+    command: 'npm run start',
+    url: 'http://localhost:4000',
+    reuseExistingServer: !process.env.CI,
+    timeout: 120_000,
+    stdout: 'pipe',
+    stderr: 'pipe',
+  },
+});
+```
+
+## Patterns
+
+### Environment-Specific Configuration
+
+**Use when**: Tests run against dev, staging, and production environments.
+
+```ts
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+import dotenv from 'dotenv';
+import path from 'path';
+
+const ENV = process.env.TEST_ENV || 'local';
+dotenv.config({ path: path.resolve(__dirname, `.env.${ENV}`) });
+
+const envConfig: Record<string, { baseURL: string; retries: number }> = {
+  local:   { baseURL: 'http://localhost:4000',      retries: 0 },
+  staging: { baseURL: 'https://staging.myapp.com',  retries: 2 },
+  prod:    { baseURL: 'https://myapp.com',          retries: 2 },
+};
+
+export default defineConfig({
+  testDir: './e2e',
+  retries: envConfig[ENV].retries,
+  use: { baseURL: envConfig[ENV].baseURL },
+});
+```
+
+```bash
+TEST_ENV=staging npx playwright test
+TEST_ENV=prod npx playwright test --grep @smoke
+```
+
+### Setup Project with Dependencies
+
+**Use when**: Tests need shared authentication state before running.
+
+```ts
+// playwright.config.ts
+import { defineConfig, devices } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './e2e',
+  projects: [
+    {
+      name: 'setup',
+      testMatch: /auth\.setup\.ts/,
+    },
+    {
+      name: 'chromium',
+      use: {
+        ...devices['Desktop Chrome'],
+        storageState: 'playwright/.auth/session.json',
+      },
+      dependencies: ['setup'],
+    },
+    {
+      name: 'firefox',
+      use: {
+        ...devices['Desktop Firefox'],
+        storageState: 'playwright/.auth/session.json',
+      },
+      dependencies: ['setup'],
+    },
+  ],
+});
+```
+
+```ts
+// e2e/auth.setup.ts
+import { test as setup, expect } from '@playwright/test';
+
+const authFile = 'playwright/.auth/session.json';
+
+setup('authenticate', async ({ page }) => {
+  await page.goto('/login');
+  await page.getByLabel('Username').fill('testuser@example.com');
+  await page.getByLabel('Password').fill(process.env.TEST_PASSWORD!);
+  await page.getByRole('button', { name: 'Log in' }).click();
+  await expect(page.getByRole('heading', { name: 'Home' })).toBeVisible();
+  await page.context().storageState({ path: authFile });
+});
+```
+
+### webServer with Build Step
+
+**Use when**: Tests need a running application server managed by Playwright.
+
+```ts
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './e2e',
+  use: { baseURL: 'http://localhost:4000' },
+  webServer: {
+    command: process.env.CI
+      ? 'npm run build && npm run preview'
+      : 'npm run dev',
+    url: 'http://localhost:4000',
+    reuseExistingServer: !process.env.CI,
+    timeout: 120_000,
+    env: {
+      NODE_ENV: 'test',
+      DB_URL: process.env.DB_URL || 'postgresql://localhost:5432/testdb',
+    },
+  },
+});
+```
+
+### globalSetup / globalTeardown
+
+**Use when**: One-time non-browser work like seeding a database. Runs once per test run.
+
+```ts
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './e2e',
+  globalSetup: './e2e/setup.ts',
+  globalTeardown: './e2e/teardown.ts',
+});
+```
+
+```ts
+// e2e/setup.ts
+import { FullConfig } from '@playwright/test';
+
+export default async function globalSetup(config: FullConfig) {
+  const { execSync } = await import('child_process');
+  execSync('npx prisma db seed', { stdio: 'inherit' });
+  process.env.TEST_RUN_ID = `run-${Date.now()}`;
+}
+```
+
+```ts
+// e2e/teardown.ts
+import { FullConfig } from '@playwright/test';
+
+export default async function globalTeardown(config: FullConfig) {
+  const { execSync } = await import('child_process');
+  execSync('npx prisma db push --force-reset', { stdio: 'inherit' });
+}
+```
+
+### Environment Variables with .env
+
+**Use when**: Managing secrets, URLs, or feature flags without hardcoding.
+
+```bash
+# .env.example (commit this)
+BASE_URL=http://localhost:4000
+TEST_PASSWORD=
+API_KEY=
+
+# .env.local (gitignored)
+BASE_URL=http://localhost:4000
+TEST_PASSWORD=secret123
+API_KEY=dev-key-abc
+
+# .env.staging (gitignored)
+BASE_URL=https://staging.myapp.com
+TEST_PASSWORD=staging-pass
+API_KEY=staging-key-xyz
+```
+
+```bash
+# .gitignore
+.env
+.env.local
+.env.staging
+.env.production
+playwright/.auth/
+```
+
+Install dotenv:
+
+```bash
+npm install -D dotenv
+```
+
+### Tag-Based Test Filtering
+
+**Use when**: Running subsets of tests in different CI stages (PR vs nightly).
+
+```ts
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './e2e',
+
+  // Filter by tags in CI
+  grep: process.env.CI ? /@smoke|@critical/ : undefined,
+  grepInvert: process.env.CI ? /@flaky/ : undefined,
+});
+```
+
+**Project-specific filtering:**
+
+```ts
+// playwright.config.ts
+import { defineConfig, devices } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './e2e',
+  projects: [
+    {
+      name: 'smoke',
+      grep: /@smoke/,
+      use: { ...devices['Desktop Chrome'] },
+    },
+    {
+      name: 'regression',
+      grepInvert: /@smoke/,
+      use: { ...devices['Desktop Chrome'] },
+    },
+    {
+      name: 'critical-only',
+      grep: /@critical/,
+      use: { ...devices['Desktop Chrome'] },
+    },
+  ],
+});
+```
+
+```bash
+# Run specific project
+npx playwright test --project=smoke
+npx playwright test --project=regression
+```
+
+### Artifact Collection Strategy
+
+| Setting | Local | CI | Reason |
+|---------|-------|-----|--------|
+| `trace` | `'off'` | `'on-first-retry'` | Traces are large; collect on failure only |
+| `screenshot` | `'off'` | `'only-on-failure'` | Useful for CI debugging |
+| `video` | `'off'` | `'retain-on-failure'` | Recording slows tests |
+
+```ts
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './e2e',
+  use: {
+    trace: process.env.CI ? 'on-first-retry' : 'off',
+    screenshot: process.env.CI ? 'only-on-failure' : 'off',
+    video: process.env.CI ? 'retain-on-failure' : 'off',
+  },
+});
+```
+
+## Anti-Patterns
+
+| Don't | Problem | Do Instead |
+|-------|---------|------------|
+| `timeout: 300_000` globally | Masks flaky tests; slow CI | Fix root cause; keep 30s default |
+| Hardcoded URLs: `page.goto('http://localhost:4000/login')` | Breaks in other environments | Use `baseURL` + relative paths |
+| All browsers on every PR | 3x CI time | Chromium on PRs; all on main |
+| `trace: 'on'` always | Huge artifacts, slow uploads | `trace: 'on-first-retry'` |
+| `video: 'on'` always | Massive storage; slow tests | `video: 'retain-on-failure'` |
+| Config in test files: `test.use({ viewport: {...} })` everywhere | Scattered, inconsistent | Define once in project config |
+| `retries: 3` locally | Hides flakiness | `retries: 0` local, `retries: 2` CI |
+| No `forbidOnly` in CI | Committed `test.only` runs single test | `forbidOnly: !!process.env.CI` |
+| `globalSetup` for browser auth | No browser context available | Use setup project with dependencies |
+| Committing `.env` with credentials | Security risk | Commit `.env.example` only |
+
+## Troubleshooting
+
+### baseURL Not Working
+
+**Cause**: Using absolute URL in `page.goto()` ignores `baseURL`.
+
+```ts
+// Wrong - ignores baseURL
+await page.goto('http://localhost:4000/dashboard');
+
+// Correct - uses baseURL
+await page.goto('/dashboard');
+```
+
+### webServer Starts But Tests Get Connection Refused
+
+**Cause**: `webServer.url` doesn't match actual server address or health check returns non-200.
+
+```ts
+webServer: {
+  command: 'npm run dev',
+  url: 'http://localhost:4000/api/health',  // use real endpoint
+  reuseExistingServer: !process.env.CI,
+  timeout: 120_000,
+},
+```
+
+### Tests Pass Locally But Timeout in CI
+
+**Cause**: CI machines are slower. Increase timeouts and reduce workers:
+
+```ts
+export default defineConfig({
+  workers: process.env.CI ? '50%' : undefined,
+  use: {
+    navigationTimeout: process.env.CI ? 30_000 : 15_000,
+    actionTimeout: process.env.CI ? 15_000 : 10_000,
+  },
+});
+```
+
+### "Target page, context or browser has been closed"
+
+**Cause**: Test exceeded `timeout` and Playwright tore down browser during action.
+
+**Fix**: Don't increase global timeout. Find slow step using trace:
+
+```bash
+npx playwright test --trace on
+npx playwright show-report
+```
+
+## Related
+
+- [test-tags.md](./test-tags.md) - tagging and filtering tests with `--grep`
+- [fixtures-hooks.md](./fixtures-hooks.md) - custom fixtures for per-test state
+- [test-suite-structure.md](test-suite-structure.md) - file structure and naming
+- [authentication.md](../advanced/authentication.md) - setup projects for shared auth
+- [projects-dependencies.md](./projects-dependencies.md) - advanced multi-project patterns
diff --git a/.cursor/skills/playwright-testing/core/fixtures-hooks.md b/.cursor/skills/playwright-testing/core/fixtures-hooks.md
new file mode 100644
index 0000000000..ff9dc93f71
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/fixtures-hooks.md
@@ -0,0 +1,417 @@
+# Fixtures & Hooks
+
+## Table of Contents
+
+1. [Built-in Fixtures](#built-in-fixtures)
+2. [Custom Fixtures](#custom-fixtures)
+3. [Fixture Scopes](#fixture-scopes)
+4. [Hooks](#hooks)
+5. [Authentication Patterns](#authentication-patterns)
+6. [Database Fixtures](#database-fixtures)
+
+## Built-in Fixtures
+
+### Core Fixtures
+
+```typescript
+test("example", async ({
+  page, // Isolated page instance
+  context, // Browser context (cookies, localStorage)
+  browser, // Browser instance
+  browserName, // 'chromium', 'firefox', or 'webkit'
+  request, // API request context
+}) => {
+  // Each test gets fresh instances
+});
+```
+
+### Request Fixture
+
+```typescript
+test("API call", async ({ request }) => {
+  const response = await request.get("/api/users");
+  await expect(response).toBeOK();
+
+  const users = await response.json();
+  expect(users).toHaveLength(5);
+});
+```
+
+## Custom Fixtures
+
+### Basic Custom Fixture
+
+```typescript
+// fixtures.ts
+import { test as base } from "@playwright/test";
+
+// Declare fixture types
+type MyFixtures = {
+  todoPage: TodoPage;
+  apiClient: ApiClient;
+};
+
+export const test = base.extend<MyFixtures>({
+  // Fixture with setup and teardown
+  todoPage: async ({ page }, use) => {
+    const todoPage = new TodoPage(page);
+    await todoPage.goto();
+
+    await use(todoPage); // Test runs here
+
+    // Teardown (optional)
+    await todoPage.clearTodos();
+  },
+
+  // Simple fixture
+  apiClient: async ({ request }, use) => {
+    await use(new ApiClient(request));
+  },
+});
+
+export { expect } from "@playwright/test";
+```
+
+### Fixture with Options
+
+```typescript
+type Options = {
+  defaultUser: { email: string; password: string };
+};
+
+type Fixtures = {
+  authenticatedPage: Page;
+};
+
+export const test = base.extend<Options & Fixtures>({
+  // Define option with default
+  defaultUser: [
+    { email: "test@example.com", password: "pass123" },
+    { option: true },
+  ],
+
+  // Use option in fixture
+  authenticatedPage: async ({ page, defaultUser }, use) => {
+    await page.goto("/login");
+    await page.getByLabel("Email").fill(defaultUser.email);
+    await page.getByLabel("Password").fill(defaultUser.password);
+    await page.getByRole("button", { name: "Sign in" }).click();
+    await use(page);
+  },
+});
+
+// Override in config
+export default defineConfig({
+  use: {
+    defaultUser: { email: "admin@example.com", password: "admin123" },
+  },
+});
+```
+
+### Automatic Fixtures
+
+```typescript
+export const test = base.extend<{}, { setupDb: void }>({
+  // Auto-fixture runs for every test without explicit usage
+  setupDb: [
+    async ({}, use) => {
+      await seedDatabase();
+      await use();
+      await cleanDatabase();
+    },
+    { auto: true },
+  ],
+});
+```
+
+## Fixture Scopes
+
+### Test Scope (Default)
+
+Created fresh for each test:
+
+```typescript
+test.extend({
+  page: async ({ browser }, use) => {
+    const page = await browser.newPage();
+    await use(page);
+    await page.close();
+  },
+});
+```
+
+### Worker Scope
+
+Shared across tests in the same worker (each worker gets its own instance; tests in different workers do not share it):
+
+```typescript
+type WorkerFixtures = {
+  sharedAccount: Account;
+};
+
+export const test = base.extend<{}, WorkerFixtures>({
+  sharedAccount: [
+    async ({ browser }, use) => {
+      // Expensive setup - runs once per worker
+      const account = await createTestAccount();
+      await use(account);
+      await deleteTestAccount(account);
+    },
+    { scope: "worker" },
+  ],
+});
+```
+
+### Isolate test data between parallel workers
+
+When tests in different workers touch the same backend or DB (e.g. same user, same tenant), they can collide and cause flaky failures. Use `testInfo.workerIndex` (or `process.env.TEST_WORKER_INDEX`) in a worker-scoped fixture to create unique data per worker:
+
+```typescript
+import { test as baseTest } from "@playwright/test";
+
+type WorkerFixtures = {
+  dbUserName: string;
+};
+
+export const test = baseTest.extend<{}, WorkerFixtures>({
+  dbUserName: [
+    async ({}, use, testInfo) => {
+      const userName = `user-${testInfo.workerIndex}`;
+      await createUserInTestDatabase(userName);
+      await use(userName);
+      await deleteUserFromTestDatabase(userName);
+    },
+    { scope: "worker" },
+  ],
+});
+```
+
+Then each worker uses a distinct user (e.g. `user-1`, `user-2`), so parallel workers do not overwrite each other’s data.
+
+## Hooks
+
+### beforeEach / afterEach
+
+```typescript
+test.beforeEach(async ({ page }) => {
+  // Runs before each test in file
+  await page.goto("/");
+});
+
+test.afterEach(async ({ page }, testInfo) => {
+  // Runs after each test
+  if (testInfo.status !== "passed") {
+    await page.screenshot({ path: `failed-${testInfo.title}.png` });
+  }
+});
+```
+
+### beforeAll / afterAll
+
+```typescript
+test.beforeAll(async ({ browser }) => {
+  // Runs once before all tests in file
+  // Note: Cannot use page fixture here
+});
+
+test.afterAll(async () => {
+  // Runs once after all tests in file
+});
+```
+
+### Describe-Level Hooks
+
+```typescript
+test.describe("User Management", () => {
+  test.beforeEach(async ({ page }) => {
+    await page.goto("/users");
+  });
+
+  test("can list users", async ({ page }) => {
+    // Starts at /users
+  });
+
+  test("can add user", async ({ page }) => {
+    // Starts at /users
+  });
+});
+```
+
+## Authentication Patterns
+
+### Global Setup with Storage State
+
+```typescript
+// auth.setup.ts
+import { test as setup, expect } from "@playwright/test";
+
+const authFile = ".auth/user.json";
+
+setup("authenticate", async ({ page }) => {
+  await page.goto("/login");
+  await page.getByLabel("Email").fill(process.env.TEST_EMAIL!);
+  await page.getByLabel("Password").fill(process.env.TEST_PASSWORD!);
+  await page.getByRole("button", { name: "Sign in" }).click();
+
+  await expect(page.getByRole("heading", { name: "Dashboard" })).toBeVisible();
+  await page.context().storageState({ path: authFile });
+});
+```
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    { name: "setup", testMatch: /.*\.setup\.ts/ },
+    {
+      name: "chromium",
+      use: {
+        ...devices["Desktop Chrome"],
+        storageState: ".auth/user.json",
+      },
+      dependencies: ["setup"],
+    },
+  ],
+});
+```
+
+### Multiple Auth States
+
+```typescript
+// auth.setup.ts
+setup("admin auth", async ({ page }) => {
+  await login(page, "admin@example.com", "adminpass");
+  await page.context().storageState({ path: ".auth/admin.json" });
+});
+
+setup("user auth", async ({ page }) => {
+  await login(page, "user@example.com", "userpass");
+  await page.context().storageState({ path: ".auth/user.json" });
+});
+```
+
+```typescript
+// playwright.config.ts
+projects: [
+  {
+    name: "admin tests",
+    testMatch: /.*admin.*\.spec\.ts/,
+    use: { storageState: ".auth/admin.json" },
+    dependencies: ["setup"],
+  },
+  {
+    name: "user tests",
+    testMatch: /.*user.*\.spec\.ts/,
+    use: { storageState: ".auth/user.json" },
+    dependencies: ["setup"],
+  },
+];
+```
+
+### Auth Fixture
+
+```typescript
+// fixtures/auth.fixture.ts
+export const test = base.extend<{ adminPage: Page; userPage: Page }>({
+  adminPage: async ({ browser }, use) => {
+    const context = await browser.newContext({
+      storageState: ".auth/admin.json",
+    });
+    const page = await context.newPage();
+    await use(page);
+    await context.close();
+  },
+
+  userPage: async ({ browser }, use) => {
+    const context = await browser.newContext({
+      storageState: ".auth/user.json",
+    });
+    const page = await context.newPage();
+    await use(page);
+    await context.close();
+  },
+});
+```
+
+## Database Fixtures
+
+This section covers **per-test database fixtures** (isolation, transaction rollback). For related topics:
+
+- **Test data factories** (builders, Faker): See [test-data.md](test-data.md)
+- **One-time database setup** (migrations, snapshots): See [global-setup.md](global-setup.md#database-patterns)
+
+### Transaction Rollback Pattern
+
+```typescript
+import { test as base } from "@playwright/test";
+import { db } from "../db";
+
+export const test = base.extend<{ dbTransaction: Transaction }>({
+  dbTransaction: async ({}, use) => {
+    const transaction = await db.beginTransaction();
+
+    await use(transaction);
+
+    await transaction.rollback(); // Clean slate for next test
+  },
+});
+```
+
+### Seed Data Fixture
+
+```typescript
+type TestData = {
+  testUser: User;
+  testProducts: Product[];
+};
+
+export const test = base.extend<TestData>({
+  testUser: async ({}, use) => {
+    const user = await db.users.create({
+      email: `test-${Date.now()}@example.com`,
+      name: "Test User",
+    });
+
+    await use(user);
+
+    await db.users.delete(user.id);
+  },
+
+  testProducts: async ({ testUser }, use) => {
+    const products = await db.products.createMany([
+      { name: "Product A", ownerId: testUser.id },
+      { name: "Product B", ownerId: testUser.id },
+    ]);
+
+    await use(products);
+
+    await db.products.deleteMany(products.map((p) => p.id));
+  },
+});
+```
+
+## Fixture Tips
+
+| Tip                | Explanation                                 |
+| ------------------ | ------------------------------------------- |
+| Fixtures are lazy  | Only created when used                      |
+| Compose fixtures   | Use other fixtures as dependencies          |
+| Keep setup minimal | Do heavy lifting in worker-scoped fixtures  |
+| Clean up resources | Use teardown in fixtures, not afterEach     |
+| Avoid shared state | Each fixture instance should be independent |
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                              | Problem                                                    | Solution                                                                                                                                                                              |
+| ----------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Shared mutable state between tests        | Race conditions, order dependencies                        | Use fixtures for isolation                                                                                                                                                            |
+| Global variables in tests                 | Tests depend on execution order                            | Use fixtures or beforeEach for setup                                                                                                                                                  |
+| Not cleaning up test data                 | Tests interfere with each other                            | Use fixtures with teardown or database transactions                                                                                                                                   |
+| Shared `page` or `context` in `beforeAll` | State leak between tests; flaky when tests run in parallel | Use default one-context-per-test, or `beforeEach` + fresh page; if serial is required, prefer `test.describe.configure({ mode: 'serial' })` and document that isolation is sacrificed |
+| Backend/DB state shared across workers    | Tests in different workers collide on same data            | Use worker-scoped fixture with `testInfo.workerIndex` to create unique data per worker                                                                                                |
+
+## Related References
+
+- **Page Objects with fixtures**: See [page-object-model.md](page-object-model.md) for POM patterns
+- **Test organization**: See [test-suite-structure.md](test-suite-structure.md) for test structure
+- **Debugging fixture issues**: See [debugging.md](../debugging/debugging.md) for troubleshooting
diff --git a/.cursor/skills/playwright-testing/core/global-setup.md b/.cursor/skills/playwright-testing/core/global-setup.md
new file mode 100644
index 0000000000..a0335226e7
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/global-setup.md
@@ -0,0 +1,434 @@
+# Global Setup & Teardown
+
+## Table of Contents
+
+1. [Global Setup](#global-setup)
+2. [Global Teardown](#global-teardown)
+3. [Database Patterns](#database-patterns)
+4. [Environment Provisioning](#environment-provisioning)
+5. [Setup Projects vs Global Setup](#setup-projects-vs-global-setup)
+6. [Parallel Execution Caveats](#parallel-execution-caveats)
+
+## Global Setup
+
+### Basic Global Setup
+
+```typescript
+// global-setup.ts
+import { FullConfig } from "@playwright/test";
+
+async function globalSetup(config: FullConfig) {
+  console.log("Running global setup...");
+  // Perform one-time setup: start services, run migrations, etc.
+}
+
+export default globalSetup;
+```
+
+### Configure Global Setup
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  globalSetup: require.resolve("./global-setup"),
+  globalTeardown: require.resolve("./global-teardown"),
+});
+```
+
+> **Authentication in Global Setup**: For authentication patterns using storage state in global setup, see [fixtures-hooks.md](fixtures-hooks.md#authentication-patterns). Setup projects are generally preferred for authentication as they provide access to Playwright fixtures.
+
+### Global Setup with Return Value
+
+```typescript
+// global-setup.ts
+async function globalSetup(config: FullConfig): Promise<() => Promise<void>> {
+  const server = await startTestServer();
+
+  // Return cleanup function (alternative to globalTeardown)
+  return async () => {
+    await server.stop();
+  };
+}
+
+export default globalSetup;
+```
+
+### Access Config in Global Setup
+
+```typescript
+// global-setup.ts
+import { FullConfig } from "@playwright/test";
+
+async function globalSetup(config: FullConfig) {
+  const { baseURL } = config.projects[0].use;
+  console.log(`Setting up for ${baseURL}`);
+
+  // Access custom config
+  const workers = config.workers;
+  const timeout = config.timeout;
+
+  // Access environment
+  const isCI = !!process.env.CI;
+}
+
+export default globalSetup;
+```
+
+## Global Teardown
+
+### Basic Global Teardown
+
+```typescript
+// global-teardown.ts
+import { FullConfig } from "@playwright/test";
+import fs from "fs";
+
+async function globalTeardown(config: FullConfig) {
+  console.log("Running global teardown...");
+
+  // Clean up auth files
+  if (fs.existsSync(".auth")) {
+    fs.rmSync(".auth", { recursive: true });
+  }
+
+  // Clean up test data
+  await cleanupTestDatabase();
+
+  // Stop services
+  await stopTestServices();
+}
+
+export default globalTeardown;
+```
+
+### Conditional Teardown
+
+```typescript
+// global-teardown.ts
+async function globalTeardown(config: FullConfig) {
+  // Skip cleanup in CI (containers are discarded anyway)
+  if (process.env.CI) {
+    console.log("Skipping teardown in CI");
+    return;
+  }
+
+  // Local cleanup
+  await cleanupLocalTestData();
+}
+
+export default globalTeardown;
+```
+
+## Database Patterns
+
+This section covers **one-time database setup** (migrations, snapshots, per-worker databases). For related topics:
+
+- **Per-test database fixtures** (isolation, transaction rollback): See [fixtures-hooks.md](fixtures-hooks.md#database-fixtures)
+- **Test data factories** (builders, Faker): See [test-data.md](test-data.md)
+
+### Database Migration in Setup
+
+```typescript
+// global-setup.ts
+import { execSync } from "child_process";
+
+async function globalSetup() {
+  console.log("Running database migrations...");
+
+  // Run migrations
+  execSync("npx prisma migrate deploy", { stdio: "inherit" });
+
+  // Seed test data
+  execSync("npx prisma db seed", { stdio: "inherit" });
+}
+
+export default globalSetup;
+```
+
+### Database Snapshot Pattern
+
+```typescript
+// global-setup.ts
+import { execSync } from "child_process";
+import fs from "fs";
+
+const SNAPSHOT_PATH = "./test-db-snapshot.sql";
+
+async function globalSetup() {
+  // Check if snapshot exists
+  if (fs.existsSync(SNAPSHOT_PATH)) {
+    console.log("Restoring database from snapshot...");
+    execSync(`psql $DATABASE_URL < ${SNAPSHOT_PATH}`, { stdio: "inherit" });
+    return;
+  }
+
+  // First run: migrate and create snapshot
+  console.log("Creating database snapshot...");
+  execSync("npx prisma migrate deploy", { stdio: "inherit" });
+  execSync("npx prisma db seed", { stdio: "inherit" });
+  execSync(`pg_dump $DATABASE_URL > ${SNAPSHOT_PATH}`, { stdio: "inherit" });
+}
+
+export default globalSetup;
+```
+
+### Test Database per Worker
+
+```typescript
+// global-setup.ts
+async function globalSetup(config: FullConfig) {
+  const workerCount = config.workers || 1;
+
+  // Create a database for each worker
+  for (let i = 0; i < workerCount; i++) {
+    const dbName = `test_db_worker_${i}`;
+    await createDatabase(dbName);
+    await runMigrations(dbName);
+    await seedDatabase(dbName);
+  }
+}
+
+// global-teardown.ts
+async function globalTeardown(config: FullConfig) {
+  const workerCount = config.workers || 1;
+
+  for (let i = 0; i < workerCount; i++) {
+    await dropDatabase(`test_db_worker_${i}`);
+  }
+}
+```
+
+## Environment Provisioning
+
+### Start Services in Setup
+
+```typescript
+// global-setup.ts
+import { execSync, spawn } from "child_process";
+
+let serverProcess: any;
+
+async function globalSetup() {
+  // Start backend server
+  serverProcess = spawn("npm", ["run", "start:test"], {
+    stdio: "pipe",
+    detached: true,
+  });
+
+  // Wait for server to be ready
+  await waitForServer("http://localhost:3000/health", 30000);
+
+  // Store PID for teardown
+  process.env.SERVER_PID = serverProcess.pid.toString();
+}
+
+async function waitForServer(url: string, timeout: number) {
+  const start = Date.now();
+
+  while (Date.now() - start < timeout) {
+    try {
+      const response = await fetch(url);
+      if (response.ok) return;
+    } catch {
+      // Server not ready yet
+    }
+    await new Promise((r) => setTimeout(r, 1000));
+  }
+
+  throw new Error(`Server did not start within ${timeout}ms`);
+}
+
+export default globalSetup;
+```
+
+### Docker Compose Setup
+
+```typescript
+// global-setup.ts
+import { execSync } from "child_process";
+
+async function globalSetup() {
+  console.log("Starting Docker services...");
+
+  execSync("docker-compose -f docker-compose.test.yml up -d", {
+    stdio: "inherit",
+  });
+
+  // Wait for services to be healthy
+  execSync("docker-compose -f docker-compose.test.yml exec -T db pg_isready", {
+    stdio: "inherit",
+  });
+}
+
+export default globalSetup;
+```
+
+```typescript
+// global-teardown.ts
+import { execSync } from "child_process";
+
+async function globalTeardown() {
+  console.log("Stopping Docker services...");
+
+  execSync("docker-compose -f docker-compose.test.yml down -v", {
+    stdio: "inherit",
+  });
+}
+
+export default globalTeardown;
+```
+
+### Environment Variables Setup
+
+```typescript
+// global-setup.ts
+import dotenv from "dotenv";
+import path from "path";
+
+async function globalSetup() {
+  // Load test-specific environment
+  const envFile = process.env.CI ? ".env.ci" : ".env.test";
+  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
+
+  // Validate required variables
+  const required = ["DATABASE_URL", "API_KEY", "TEST_EMAIL"];
+  for (const key of required) {
+    if (!process.env[key]) {
+      throw new Error(`Missing required environment variable: ${key}`);
+    }
+  }
+}
+
+export default globalSetup;
+```
+
+## Setup Projects vs Global Setup
+
+### When to Use Each
+
+| Use Global Setup                      | Use Setup Projects                       |
+| ------------------------------------- | ---------------------------------------- |
+| One-time setup (migrations, services) | Per-project setup (auth states)          |
+| No access to Playwright fixtures      | Need page, request fixtures              |
+| Runs once before all projects         | Can run per-project or have dependencies |
+| Shared across all workers             | Can be parallelized                      |
+
+### Setup Project Pattern
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    // Setup project
+    {
+      name: "setup",
+      testMatch: /.*\.setup\.ts/,
+    },
+    // Test projects depend on setup
+    {
+      name: "chromium",
+      use: { ...devices["Desktop Chrome"] },
+      dependencies: ["setup"],
+    },
+    {
+      name: "firefox",
+      use: { ...devices["Desktop Firefox"] },
+      dependencies: ["setup"],
+    },
+  ],
+});
+```
+
+> **For complete authentication setup patterns**, see [fixtures-hooks.md](fixtures-hooks.md#authentication-patterns).
+
+### Combining Both
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  // Global: Start services, run migrations
+  globalSetup: require.resolve("./global-setup"),
+  globalTeardown: require.resolve("./global-teardown"),
+
+  projects: [
+    // Setup project: Create auth states
+    { name: "setup", testMatch: /.*\.setup\.ts/ },
+    {
+      name: "chromium",
+      use: {
+        ...devices["Desktop Chrome"],
+        storageState: ".auth/user.json",
+      },
+      dependencies: ["setup"],
+    },
+  ],
+});
+```
+
+## Parallel Execution Caveats
+
+### Understanding Global Setup Execution
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│  globalSetup runs ONCE                                      │
+│  ↓                                                          │
+│  ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐        │
+│  │ Worker 1│  │ Worker 2│  │ Worker 3│  │ Worker 4│        │
+│  │ tests   │  │ tests   │  │ tests   │  │ tests   │        │
+│  └─────────┘  └─────────┘  └─────────┘  └─────────┘        │
+│  ↓                                                          │
+│  globalTeardown runs ONCE                                   │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Key implications:**
+
+- Global setup has **no access** to Playwright fixtures (`page`, `request`, `context`)
+- State created in global setup is **shared** across all workers
+- If tests **modify** shared state, they may conflict with parallel workers
+- Global setup **cannot** react to individual test needs
+
+### When to Prefer Worker-Scoped Fixtures
+
+Use **worker-scoped fixtures** instead of globalSetup when:
+
+| Scenario                             | Why Fixtures Are Better                              |
+| ------------------------------------ | ---------------------------------------------------- |
+| Each worker needs isolated resources | Fixtures can create per-worker databases, servers    |
+| Setup needs Playwright APIs          | Fixtures have access to `page`, `request`, `browser` |
+| Setup depends on test configuration  | Fixtures receive test context and options            |
+| Resources need cleanup per worker    | Worker fixtures auto-cleanup when worker exits       |
+
+### Common Parallel Pitfall
+
+```typescript
+// ❌ BAD: Global setup creates ONE user, all workers fight over it
+async function globalSetup() {
+  await createUser({ email: "test@example.com" }); // Shared!
+}
+
+// ✅ GOOD: Each worker gets its own user via worker-scoped fixture
+// Uses workerInfo.workerIndex to create unique data per worker
+```
+
+> **For worker-scoped fixture patterns** (per-worker databases, unique test data, `workerIndex` isolation), see [fixtures-hooks.md](fixtures-hooks.md#isolate-test-data-between-parallel-workers).
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                   | Problem                          | Solution                                   |
+| ------------------------------ | -------------------------------- | ------------------------------------------ |
+| Heavy setup in globalSetup     | Slow test startup                | Use setup projects for parallelizable work |
+| Not cleaning up in teardown    | Leaks resources, flaky CI        | Always clean up or use containers          |
+| Hardcoded URLs in setup        | Breaks in different environments | Use config.projects[0].use.baseURL         |
+| No timeout on service wait     | Hangs forever if service fails   | Add timeout with clear error               |
+| Shared mutable state           | Race conditions in parallel      | Use worker-scoped fixtures for isolation   |
+| Global setup for per-test data | Tests conflict                   | Use test-scoped fixtures                   |
+
+## Related References
+
+- **Fixtures & Auth**: See [fixtures-hooks.md](fixtures-hooks.md) for worker-scoped fixtures and auth patterns
+- **CI/CD**: See [ci-cd.md](../infrastructure-ci-cd/ci-cd.md) for CI setup patterns
+- **Projects**: See [projects-dependencies.md](projects-dependencies.md) for project configuration
diff --git a/.cursor/skills/playwright-testing/core/locators.md b/.cursor/skills/playwright-testing/core/locators.md
new file mode 100644
index 0000000000..afe3af3616
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/locators.md
@@ -0,0 +1,242 @@
+# Locator Strategies
+
+## Table of Contents
+
+1. [Priority Order](#priority-order)
+2. [User-Facing Locators](#user-facing-locators)
+3. [Filtering & Chaining](#filtering--chaining)
+4. [Dynamic Content](#dynamic-content)
+5. [Shadow DOM](#shadow-dom)
+6. [Iframes](#iframes)
+
+## Priority Order
+
+Use locators in this order of preference:
+
+1. **Role-based** (most resilient): `getByRole`
+2. **Label-based**: `getByLabel`, `getByPlaceholder`
+3. **Text-based**: `getByText`, `getByTitle`
+4. **Test IDs** (when semantic locators aren't possible): `getByTestId`
+5. **CSS/XPath** (last resort): `locator('css=...')`, `locator('xpath=...')`
+
+## User-Facing Locators
+
+### getByRole
+
+Most robust approach - matches how users and assistive technology perceive the page.
+
+```typescript
+// Buttons
+page.getByRole("button", { name: "Submit", exact: true }); // exact accessible name
+page.getByRole("button", { name: /submit/i }); // flexible case-insensitive match
+
+// Links
+page.getByRole("link", { name: "Home" });
+
+// Form elements
+page.getByRole("textbox", { name: "Email" });
+page.getByRole("checkbox", { name: "Remember me" });
+page.getByRole("combobox", { name: "Country" });
+page.getByRole("radio", { name: "Option A" });
+
+// Headings
+page.getByRole("heading", { name: "Welcome", level: 1 });
+
+// Lists & items
+page.getByRole("list").getByRole("listitem");
+
+// Navigation & regions
+page.getByRole("navigation");
+page.getByRole("main");
+page.getByRole("dialog");
+page.getByRole("alert");
+```
+
+### getByLabel
+
+For form elements with associated labels.
+
+```typescript
+// Input with <label for="email">
+page.getByLabel("Email address");
+
+// Input with aria-label
+page.getByLabel("Search");
+
+// Exact match
+page.getByLabel("Email", { exact: true });
+```
+
+### getByPlaceholder
+
+```typescript
+page.getByPlaceholder("Enter your email");
+page.getByPlaceholder(/email/i);
+```
+
+### getByText
+
+```typescript
+// Partial match (default)
+page.getByText("Welcome");
+
+// Exact match
+page.getByText("Welcome to our site", { exact: true });
+
+// Regex
+page.getByText(/welcome/i);
+```
+
+### getByTestId
+
+Configure custom test ID attribute in `playwright.config.ts`:
+
+```typescript
+use: {
+  testIdAttribute: "data-testid"; // default
+}
+```
+
+Usage:
+
+```typescript
+// React: <Button data-testid="submit-btn">Submit</Button>
+page.getByTestId("submit-btn");
+```
+
+## Filtering & Chaining
+
+### filter()
+
+Narrow down locators:
+
+```typescript
+// Filter by text
+page.getByRole("listitem").filter({ hasText: "Product" });
+
+// Filter by NOT having text
+page.getByRole("listitem").filter({ hasNotText: "Out of stock" });
+
+// Filter by child locator
+page.getByRole("listitem").filter({
+  has: page.getByRole("button", { name: "Buy" }),
+});
+
+// Combine filters
+page
+  .getByRole("listitem")
+  .filter({ hasText: "Product" })
+  .filter({ has: page.getByText("$9.99") });
+```
+
+### Chaining
+
+```typescript
+// Navigate down the DOM tree
+page.getByRole("article").getByRole("heading");
+
+// Get parent/ancestor
+page.getByText("Child").locator("..");
+page.getByText("Child").locator("xpath=ancestor::article");
+```
+
+### nth() and first()/last()
+
+```typescript
+page.getByRole("listitem").first();
+page.getByRole("listitem").last();
+page.getByRole("listitem").nth(2); // 0-indexed
+```
+
+## Dynamic Content
+
+### Waiting for Elements
+
+Locators auto-wait for actionability by default. For explicit state waiting:
+
+```typescript
+await page.getByRole("button").waitFor({ state: "visible" });
+await page.getByText("Loading").waitFor({ state: "hidden" });
+```
+
+> **For comprehensive waiting strategies** (element state, navigation, network, polling with `toPass()`), see [assertions-waiting.md](assertions-waiting.md#waiting-strategies).
+
+### Lists with Dynamic Items
+
+```typescript
+// Wait for specific count
+await expect(page.getByRole("listitem")).toHaveCount(5);
+
+// Get all matching elements
+const items = await page.getByRole("listitem").all();
+for (const item of items) {
+  await expect(item).toBeVisible();
+}
+```
+
+## Shadow DOM
+
+Playwright pierces shadow DOM by default:
+
+```typescript
+// Automatically finds elements inside shadow roots
+page.getByRole("button", { name: "Shadow Button" });
+
+// Explicit shadow DOM traversal (if needed)
+page.locator("my-component").locator("internal:shadow=button");
+```
+
+## Iframes
+
+```typescript
+// By frame name or URL
+const frame = page.frameLocator('iframe[name="content"]');
+await frame.getByRole("button").click();
+
+// By index
+const frame = page.frameLocator("iframe").first();
+
+// Nested iframes
+const nestedFrame = page.frameLocator("#outer").frameLocator("#inner");
+await nestedFrame.getByText("Content").click();
+```
+
+## Debugging Locators
+
+```typescript
+// Highlight element in headed mode
+await page.getByRole("button").highlight();
+
+// Count matches
+const count = await page.getByRole("listitem").count();
+
+// Check if exists without waiting
+const exists = (await page.getByRole("button").count()) > 0;
+
+// Use Playwright Inspector
+// PWDEBUG=1 npx playwright test
+```
+
+## Common Issues & Solutions
+
+| Issue                   | Solution                                         |
+| ----------------------- | ------------------------------------------------ |
+| Multiple elements match | Add filters or use `nth()`, `first()`, `last()`  |
+| Element not found       | Check visibility, wait for load, verify selector |
+| Stale element           | Locators are lazy; re-query if DOM changes       |
+| Dynamic IDs             | Use stable attributes like role, text, test-id   |
+| Hidden elements         | Use `{ force: true }` only when necessary        |
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                      | Problem                           | Solution                                          |
+| --------------------------------- | --------------------------------- | ------------------------------------------------- |
+| `page.locator('.btn-primary')`    | Brittle, implementation-dependent | `page.getByRole('button', { name: 'Submit' })`    |
+| `page.locator('#dynamic-id-123')` | Breaks when IDs change            | Use stable attributes like role, text, or test-id |
+| Testing implementation details    | Breaks on refactoring             | Test user-visible behavior                        |
+
+## Related References
+
+- **Debugging selector issues**: See [debugging.md](../debugging/debugging.md) for troubleshooting
+- **Waiting for elements**: See [assertions-waiting.md](assertions-waiting.md) for waiting strategies
+- **Using in Page Objects**: See [page-object-model.md](page-object-model.md) for organizing locators
diff --git a/.cursor/skills/playwright-testing/core/page-object-model.md b/.cursor/skills/playwright-testing/core/page-object-model.md
new file mode 100644
index 0000000000..0cdc6d642c
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/page-object-model.md
@@ -0,0 +1,315 @@
+# Page Object Model (POM)
+
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Basic Structure](#basic-structure)
+3. [Component Objects](#component-objects)
+4. [Composition Patterns](#composition-patterns)
+5. [Factory Functions](#factory-functions)
+6. [Best Practices](#best-practices)
+
+## Overview
+
+Page Object Model encapsulates page structure and interactions, providing:
+
+- **Maintainability**: Change selectors in one place
+- **Reusability**: Share page interactions across tests
+- **Readability**: Tests express intent, not implementation
+
+## Basic Structure
+
+### Page Class
+
+```typescript
+// pages/login.page.ts
+import { Page, Locator, expect } from "@playwright/test";
+
+export class LoginPage {
+  readonly page: Page;
+  readonly emailInput: Locator;
+  readonly passwordInput: Locator;
+  readonly submitButton: Locator;
+  readonly errorMessage: Locator;
+
+  constructor(page: Page) {
+    this.page = page;
+    this.emailInput = page.getByLabel("Email");
+    this.passwordInput = page.getByLabel("Password");
+    this.submitButton = page.getByRole("button", { name: "Sign in" });
+    this.errorMessage = page.getByRole("alert");
+  }
+
+  async goto() {
+    await this.page.goto("/login");
+  }
+
+  async login(email: string, password: string) {
+    await this.emailInput.fill(email);
+    await this.passwordInput.fill(password);
+    await this.submitButton.click();
+  }
+
+  async expectError(message: string) {
+    await expect(this.errorMessage).toContainText(message);
+  }
+}
+```
+
+### Usage in Tests
+
+```typescript
+// tests/login.spec.ts
+import { test, expect } from "@playwright/test";
+import { LoginPage } from "../pages/login.page";
+
+test.describe("Login", () => {
+  test("successful login redirects to dashboard", async ({ page }) => {
+    const loginPage = new LoginPage(page);
+    await loginPage.goto();
+    await loginPage.login("user@example.com", "password123");
+    await expect(page).toHaveURL("/dashboard");
+  });
+
+  test("shows error for invalid credentials", async ({ page }) => {
+    const loginPage = new LoginPage(page);
+    await loginPage.goto();
+    await loginPage.login("invalid@example.com", "wrong");
+    await loginPage.expectError("Invalid credentials");
+  });
+});
+```
+
+## Component Objects
+
+For reusable UI components:
+
+```typescript
+// components/navbar.component.ts
+import { Page, Locator } from "@playwright/test";
+
+export class NavbarComponent {
+  readonly container: Locator;
+  readonly logo: Locator;
+  readonly searchInput: Locator;
+  readonly userMenu: Locator;
+
+  constructor(page: Page) {
+    this.container = page.getByRole("navigation");
+    this.logo = this.container.getByRole("link", { name: "Home" });
+    this.searchInput = this.container.getByRole("searchbox");
+    this.userMenu = this.container.getByRole("button", { name: /user menu/i });
+  }
+
+  async search(query: string) {
+    await this.searchInput.fill(query);
+    await this.searchInput.press("Enter");
+  }
+
+  async openUserMenu() {
+    await this.userMenu.click();
+  }
+}
+```
+
+```typescript
+// components/modal.component.ts
+import { Locator, expect } from "@playwright/test";
+
+export class ModalComponent {
+  readonly container: Locator;
+  readonly title: Locator;
+  readonly closeButton: Locator;
+  readonly confirmButton: Locator;
+
+  constructor(container: Locator) {
+    this.container = container;
+    this.title = container.getByRole("heading");
+    this.closeButton = container.getByRole("button", { name: "Close" });
+    this.confirmButton = container.getByRole("button", { name: "Confirm" });
+  }
+
+  async expectTitle(title: string) {
+    await expect(this.title).toHaveText(title);
+  }
+
+  async close() {
+    await this.closeButton.click();
+  }
+
+  async confirm() {
+    await this.confirmButton.click();
+  }
+}
+```
+
+## Composition Patterns
+
+### Page with Components
+
+```typescript
+// pages/dashboard.page.ts
+import { Page, Locator } from "@playwright/test";
+import { NavbarComponent } from "../components/navbar.component";
+import { ModalComponent } from "../components/modal.component";
+
+export class DashboardPage {
+  readonly page: Page;
+  readonly navbar: NavbarComponent;
+  readonly newProjectButton: Locator;
+
+  constructor(page: Page) {
+    this.page = page;
+    this.navbar = new NavbarComponent(page);
+    this.newProjectButton = page.getByRole("button", { name: "New Project" });
+  }
+
+  async goto() {
+    await this.page.goto("/dashboard");
+  }
+
+  async createProject() {
+    await this.newProjectButton.click();
+    return new ModalComponent(this.page.getByRole("dialog"));
+  }
+}
+```
+
+### Page Navigation
+
+```typescript
+// pages/base.page.ts
+import { Page } from "@playwright/test";
+
+export abstract class BasePage {
+  constructor(readonly page: Page) {}
+
+  abstract goto(): Promise<void>;
+
+  async getTitle(): Promise<string> {
+    return this.page.title();
+  }
+}
+```
+
+```typescript
+// Return new page object on navigation
+export class LoginPage extends BasePage {
+  async login(email: string, password: string): Promise<DashboardPage> {
+    await this.emailInput.fill(email);
+    await this.passwordInput.fill(password);
+    await this.submitButton.click();
+    return new DashboardPage(this.page);
+  }
+}
+
+// Usage
+const loginPage = new LoginPage(page);
+await loginPage.goto();
+const dashboardPage = await loginPage.login("user@example.com", "pass");
+await dashboardPage.expectWelcomeMessage();
+```
+
+## Factory Functions
+
+Alternative to classes for simpler pages:
+
+```typescript
+// pages/login.page.ts
+import { Page } from "@playwright/test";
+
+export function createLoginPage(page: Page) {
+  const emailInput = page.getByLabel("Email");
+  const passwordInput = page.getByLabel("Password");
+  const submitButton = page.getByRole("button", { name: "Sign in" });
+
+  return {
+    goto: () => page.goto("/login"),
+    login: async (email: string, password: string) => {
+      await emailInput.fill(email);
+      await passwordInput.fill(password);
+      await submitButton.click();
+    },
+    emailInput,
+    passwordInput,
+    submitButton,
+  };
+}
+
+// Usage
+const loginPage = createLoginPage(page);
+await loginPage.goto();
+await loginPage.login("user@example.com", "password");
+```
+
+## Best Practices
+
+### Do
+
+- **Keep locators in page objects** - Single source of truth
+- **Return new page objects** when navigation occurs
+- **Expose elements** for custom assertions in tests
+- **Use descriptive method names** - `submitOrder()` not `clickButton()`
+- **Keep methods focused** - One action per method
+
+### Don't
+
+- **Don't include assertions in page methods** (usually) - Keep in tests
+- **Don't expose implementation details** - Hide complex interactions
+- **Don't make page objects too large** - Split into components
+- **Don't share state** between page object instances
+
+### Directory Structure
+
+```
+tests/
+├── pages/
+│   ├── base.page.ts
+│   ├── login.page.ts
+│   ├── dashboard.page.ts
+│   └── settings.page.ts
+├── components/
+│   ├── navbar.component.ts
+│   ├── modal.component.ts
+│   └── table.component.ts
+├── fixtures/
+│   └── pages.fixture.ts
+└── specs/
+    ├── login.spec.ts
+    └── dashboard.spec.ts
+```
+
+### Using with Fixtures
+
+```typescript
+// fixtures/pages.fixture.ts
+import { test as base } from "@playwright/test";
+import { LoginPage } from "../pages/login.page";
+import { DashboardPage } from "../pages/dashboard.page";
+
+type Pages = {
+  loginPage: LoginPage;
+  dashboardPage: DashboardPage;
+};
+
+export const test = base.extend<Pages>({
+  loginPage: async ({ page }, use) => {
+    await use(new LoginPage(page));
+  },
+  dashboardPage: async ({ page }, use) => {
+    await use(new DashboardPage(page));
+  },
+});
+
+// Usage in tests
+test("can login", async ({ loginPage }) => {
+  await loginPage.goto();
+  await loginPage.login("user@example.com", "password");
+});
+```
+
+## Related References
+
+- **Locator strategies**: See [locators.md](locators.md) for selecting elements
+- **Fixtures**: See [fixtures-hooks.md](fixtures-hooks.md) for advanced fixture patterns
+- **Test organization**: See [test-suite-structure.md](test-suite-structure.md) for structuring test suites
diff --git a/.cursor/skills/playwright-testing/core/projects-dependencies.md b/.cursor/skills/playwright-testing/core/projects-dependencies.md
new file mode 100644
index 0000000000..1005daeb20
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/projects-dependencies.md
@@ -0,0 +1,453 @@
+# Projects & Dependencies
+
+## Table of Contents
+
+1. [Project Configuration](#project-configuration)
+2. [Project Dependencies](#project-dependencies)
+3. [Setup Projects](#setup-projects)
+4. [Filtering & Running Projects](#filtering--running-projects)
+5. [Sharing Configuration](#sharing-configuration)
+6. [Advanced Patterns](#advanced-patterns)
+
+## Project Configuration
+
+### Basic Multi-Browser Setup
+
+```typescript
+// playwright.config.ts
+import { defineConfig, devices } from "@playwright/test";
+
+export default defineConfig({
+  projects: [
+    {
+      name: "chromium",
+      use: { ...devices["Desktop Chrome"] },
+    },
+    {
+      name: "firefox",
+      use: { ...devices["Desktop Firefox"] },
+    },
+    {
+      name: "webkit",
+      use: { ...devices["Desktop Safari"] },
+    },
+  ],
+});
+```
+
+### Environment-Based Projects
+
+```typescript
+export default defineConfig({
+  projects: [
+    {
+      name: "staging",
+      use: {
+        baseURL: "https://staging.example.com",
+      },
+    },
+    {
+      name: "production",
+      use: {
+        baseURL: "https://example.com",
+      },
+    },
+    {
+      name: "local",
+      use: {
+        baseURL: "http://localhost:3000",
+      },
+    },
+  ],
+});
+```
+
+### Test Type Projects
+
+```typescript
+export default defineConfig({
+  projects: [
+    {
+      name: "e2e",
+      testDir: "./tests/e2e",
+      use: { ...devices["Desktop Chrome"] },
+    },
+    {
+      name: "api",
+      testDir: "./tests/api",
+      use: { baseURL: "http://localhost:3000" },
+    },
+    {
+      name: "visual",
+      testDir: "./tests/visual",
+      use: {
+        ...devices["Desktop Chrome"],
+        viewport: { width: 1280, height: 720 },
+      },
+    },
+  ],
+});
+```
+
+## Project Dependencies
+
+### Setup Dependency
+
+```typescript
+export default defineConfig({
+  projects: [
+    // Setup project runs first
+    {
+      name: "setup",
+      testMatch: /.*\.setup\.ts/,
+    },
+
+    // Browser projects depend on setup
+    {
+      name: "chromium",
+      use: {
+        ...devices["Desktop Chrome"],
+        storageState: ".auth/user.json",
+      },
+      dependencies: ["setup"],
+    },
+    {
+      name: "firefox",
+      use: {
+        ...devices["Desktop Firefox"],
+        storageState: ".auth/user.json",
+      },
+      dependencies: ["setup"],
+    },
+  ],
+});
+```
+
+### Multiple Auth States
+
+```typescript
+export default defineConfig({
+  projects: [
+    // Auth setup projects
+    {
+      name: "setup-admin",
+      testMatch: /admin\.setup\.ts/,
+    },
+    {
+      name: "setup-user",
+      testMatch: /user\.setup\.ts/,
+    },
+
+    // Admin tests
+    {
+      name: "admin-tests",
+      testDir: "./tests/admin",
+      use: { storageState: ".auth/admin.json" },
+      dependencies: ["setup-admin"],
+    },
+
+    // User tests
+    {
+      name: "user-tests",
+      testDir: "./tests/user",
+      use: { storageState: ".auth/user.json" },
+      dependencies: ["setup-user"],
+    },
+
+    // Tests that need both
+    {
+      name: "integration-tests",
+      testDir: "./tests/integration",
+      dependencies: ["setup-admin", "setup-user"],
+    },
+  ],
+});
+```
+
+### Chained Dependencies
+
+```typescript
+export default defineConfig({
+  projects: [
+    // Step 1: Database setup
+    {
+      name: "db-setup",
+      testMatch: /db\.setup\.ts/,
+    },
+
+    // Step 2: Auth setup (needs DB)
+    {
+      name: "auth-setup",
+      testMatch: /auth\.setup\.ts/,
+      dependencies: ["db-setup"],
+    },
+
+    // Step 3: Seed data (needs auth)
+    {
+      name: "seed-setup",
+      testMatch: /seed\.setup\.ts/,
+      dependencies: ["auth-setup"],
+    },
+
+    // Tests (need everything)
+    {
+      name: "tests",
+      testDir: "./tests",
+      dependencies: ["seed-setup"],
+    },
+  ],
+});
+```
+
+## Setup Projects
+
+### Authentication Setup
+
+Setup projects are the recommended way to handle authentication. They run before your main test projects and can use Playwright fixtures.
+
+> **For complete authentication patterns** (storage state, multiple auth states, auth fixtures), see [fixtures-hooks.md](fixtures-hooks.md#authentication-patterns).
+
+### Data Seeding Setup
+
+```typescript
+// seed.setup.ts
+import { test as setup } from "@playwright/test";
+
+setup("seed test data", async ({ request }) => {
+  // Create test data via API
+  await request.post("/api/test/seed", {
+    data: {
+      users: 10,
+      products: 50,
+      orders: 100,
+    },
+  });
+});
+```
+
+### Cleanup Setup
+
+```typescript
+// cleanup.setup.ts
+import { test as setup } from "@playwright/test";
+
+setup("cleanup previous run", async ({ request }) => {
+  // Clean up data from previous test runs
+  await request.delete("/api/test/cleanup");
+});
+```
+
+## Filtering & Running Projects
+
+### Run Specific Project
+
+```bash
+# Run single project
+npx playwright test --project=chromium
+
+# Run multiple projects
+npx playwright test --project=chromium --project=firefox
+```
+
+### Run by Grep
+
+```bash
+# Run tests matching pattern
+npx playwright test --grep @smoke
+
+# Run project with grep
+npx playwright test --project=chromium --grep @critical
+
+# Exclude pattern
+npx playwright test --grep-invert @slow
+```
+
+### Project-Specific Grep
+
+```typescript
+export default defineConfig({
+  projects: [
+    {
+      name: "smoke",
+      grep: /@smoke/,
+      use: { ...devices["Desktop Chrome"] },
+    },
+    {
+      name: "regression",
+      grepInvert: /@smoke/,
+      use: { ...devices["Desktop Chrome"] },
+    },
+  ],
+});
+```
+
+## Sharing Configuration
+
+### Base Configuration
+
+```typescript
+// playwright.config.ts
+const baseConfig = {
+  timeout: 30000,
+  expect: { timeout: 5000 },
+  use: {
+    trace: "on-first-retry",
+    screenshot: "only-on-failure",
+  },
+};
+
+export default defineConfig({
+  ...baseConfig,
+  projects: [
+    {
+      name: "chromium",
+      use: {
+        ...baseConfig.use,
+        ...devices["Desktop Chrome"],
+      },
+    },
+    {
+      name: "firefox",
+      use: {
+        ...baseConfig.use,
+        ...devices["Desktop Firefox"],
+      },
+    },
+  ],
+});
+```
+
+### Shared Project Settings
+
+```typescript
+const sharedBrowserConfig = {
+  timeout: 60000,
+  retries: 2,
+  use: {
+    video: "on-first-retry",
+    trace: "on-first-retry",
+  },
+};
+
+export default defineConfig({
+  projects: [
+    {
+      name: "chromium",
+      ...sharedBrowserConfig,
+      use: {
+        ...sharedBrowserConfig.use,
+        ...devices["Desktop Chrome"],
+      },
+    },
+    {
+      name: "firefox",
+      ...sharedBrowserConfig,
+      use: {
+        ...sharedBrowserConfig.use,
+        ...devices["Desktop Firefox"],
+      },
+    },
+  ],
+});
+```
+
+## Advanced Patterns
+
+### Conditional Projects
+
+```typescript
+const projects = [
+  {
+    name: "chromium",
+    use: { ...devices["Desktop Chrome"] },
+  },
+];
+
+// Add Firefox only in CI
+if (process.env.CI) {
+  projects.push({
+    name: "firefox",
+    use: { ...devices["Desktop Firefox"] },
+  });
+}
+
+// Add mobile only for specific test dirs
+if (process.env.TEST_MOBILE) {
+  projects.push({
+    name: "mobile",
+    use: { ...devices["iPhone 14"] },
+  });
+}
+
+export default defineConfig({ projects });
+```
+
+### Project Metadata
+
+```typescript
+export default defineConfig({
+  projects: [
+    {
+      name: "chromium",
+      use: { ...devices["Desktop Chrome"] },
+      metadata: {
+        platform: "desktop",
+        browser: "chromium",
+        priority: "high",
+      },
+    },
+  ],
+});
+
+// Access in test
+test("example", async ({ page }, testInfo) => {
+  const { platform, priority } = testInfo.project.metadata;
+  console.log(`Running on ${platform} with ${priority} priority`);
+});
+```
+
+### Teardown Projects
+
+```typescript
+export default defineConfig({
+  projects: [
+    {
+      name: "setup",
+      testMatch: /.*\.setup\.ts/,
+      teardown: "teardown", // Run teardown after this completes
+    },
+    {
+      name: "teardown",
+      testMatch: /.*\.teardown\.ts/,
+    },
+    {
+      name: "tests",
+      dependencies: ["setup"],
+    },
+  ],
+});
+```
+
+```typescript
+// cleanup.teardown.ts
+import { test as teardown } from "@playwright/test";
+
+teardown("cleanup", async ({ request }) => {
+  await request.delete("/api/test/data");
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern               | Problem                | Solution                            |
+| -------------------------- | ---------------------- | ----------------------------------- |
+| Too many browser projects  | Slow CI, expensive     | Focus on critical browsers          |
+| Missing setup dependencies | Tests fail randomly    | Declare all dependencies explicitly |
+| Duplicated configuration   | Hard to maintain       | Extract shared config               |
+| Not using setup projects   | Repeated auth in tests | Use setup project + storageState    |
+
+## Related References
+
+- **Global Setup**: See [global-setup.md](global-setup.md) for globalSetup vs setup projects
+- **Fixtures**: See [fixtures-hooks.md](fixtures-hooks.md) for authentication patterns
+- **CI/CD**: See [ci-cd.md](../infrastructure-ci-cd/ci-cd.md) for running projects in CI
diff --git a/.cursor/skills/playwright-testing/core/test-data.md b/.cursor/skills/playwright-testing/core/test-data.md
new file mode 100644
index 0000000000..1c07af843e
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/test-data.md
@@ -0,0 +1,492 @@
+# Test Data Factories & Generators
+
+This file covers **reusable test data builders** (factories, Faker, data generators). For related topics:
+
+- **Per-test database fixtures** (isolation, transaction rollback): See [fixtures-hooks.md](fixtures-hooks.md#database-fixtures)
+- **One-time database setup** (migrations, snapshots): See [global-setup.md](global-setup.md#database-patterns)
+
+## Table of Contents
+
+1. [Factory Pattern](#factory-pattern)
+2. [Faker Integration](#faker-integration)
+3. [Data-Driven Testing](#data-driven-testing)
+4. [Test Data Fixtures](#test-data-fixtures)
+5. [Database Seeding](#database-seeding)
+
+## Factory Pattern
+
+### Basic Factory
+
+```typescript
+// factories/user.factory.ts
+interface User {
+  id: string;
+  email: string;
+  name: string;
+  role: "admin" | "user" | "guest";
+  createdAt: Date;
+}
+
+let userIdCounter = 0;
+
+export function createUser(overrides: Partial<User> = {}): User {
+  userIdCounter++;
+  return {
+    id: `user-${userIdCounter}`,
+    email: `user${userIdCounter}@test.com`,
+    name: `Test User ${userIdCounter}`,
+    role: "user",
+    createdAt: new Date(),
+    ...overrides,
+  };
+}
+
+// Usage
+const user = createUser();
+const admin = createUser({ role: "admin", name: "Admin User" });
+```
+
+### Factory with Traits
+
+```typescript
+// factories/product.factory.ts
+interface Product {
+  id: string;
+  name: string;
+  price: number;
+  stock: number;
+  category: string;
+  featured: boolean;
+}
+
+type ProductTrait = "outOfStock" | "featured" | "expensive" | "sale";
+
+const traits: Record<ProductTrait, Partial<Product>> = {
+  outOfStock: { stock: 0 },
+  featured: { featured: true },
+  expensive: { price: 999.99 },
+  sale: { price: 9.99 },
+};
+
+let productIdCounter = 0;
+
+export function createProduct(
+  overrides: Partial<Product> = {},
+  ...traitNames: ProductTrait[]
+): Product {
+  productIdCounter++;
+
+  const appliedTraits = traitNames.reduce(
+    (acc, trait) => ({ ...acc, ...traits[trait] }),
+    {},
+  );
+
+  return {
+    id: `prod-${productIdCounter}`,
+    name: `Product ${productIdCounter}`,
+    price: 29.99,
+    stock: 100,
+    category: "General",
+    featured: false,
+    ...appliedTraits,
+    ...overrides,
+  };
+}
+
+// Usage
+const product = createProduct();
+const featuredProduct = createProduct({}, "featured");
+const saleItem = createProduct({ name: "Sale Item" }, "sale", "featured");
+const soldOut = createProduct({}, "outOfStock");
+```
+
+### Factory with Relationships
+
+```typescript
+// factories/order.factory.ts
+import { createUser, User } from "./user.factory";
+import { createProduct, Product } from "./product.factory";
+
+interface OrderItem {
+  product: Product;
+  quantity: number;
+}
+
+interface Order {
+  id: string;
+  user: User;
+  items: OrderItem[];
+  total: number;
+  status: "pending" | "paid" | "shipped" | "delivered";
+}
+
+let orderIdCounter = 0;
+
+export function createOrder(overrides: Partial<Order> = {}): Order {
+  orderIdCounter++;
+
+  const user = overrides.user ?? createUser();
+  const items = overrides.items ?? [{ product: createProduct(), quantity: 1 }];
+  const total = items.reduce(
+    (sum, item) => sum + item.product.price * item.quantity,
+    0,
+  );
+
+  return {
+    id: `order-${orderIdCounter}`,
+    user,
+    items,
+    total,
+    status: "pending",
+    ...overrides,
+  };
+}
+
+// Usage
+const order = createOrder();
+const bigOrder = createOrder({
+  items: [
+    { product: createProduct({ price: 100 }), quantity: 5 },
+    { product: createProduct({ price: 50 }), quantity: 2 },
+  ],
+});
+```
+
+## Faker Integration
+
+### Setup Faker
+
+```bash
+npm install -D @faker-js/faker
+```
+
+```typescript
+// factories/faker-user.factory.ts
+import { faker } from "@faker-js/faker";
+
+interface User {
+  id: string;
+  email: string;
+  name: string;
+  avatar: string;
+  address: {
+    street: string;
+    city: string;
+    country: string;
+    zipCode: string;
+  };
+}
+
+export function createFakeUser(overrides: Partial<User> = {}): User {
+  return {
+    id: faker.string.uuid(),
+    email: faker.internet.email(),
+    name: faker.person.fullName(),
+    avatar: faker.image.avatar(),
+    address: {
+      street: faker.location.streetAddress(),
+      city: faker.location.city(),
+      country: faker.location.country(),
+      zipCode: faker.location.zipCode(),
+    },
+    ...overrides,
+  };
+}
+```
+
+### Seeded Faker for Reproducibility
+
+```typescript
+import { faker } from "@faker-js/faker";
+
+// Set seed for reproducible data
+faker.seed(12345);
+
+export function createDeterministicUser(): User {
+  return {
+    id: faker.string.uuid(),
+    email: faker.internet.email(),
+    name: faker.person.fullName(),
+    // Same seed = same data every time
+  };
+}
+
+// Or seed per test
+test("user profile", async ({ page }) => {
+  faker.seed(42); // Reset seed for this test
+  const user = createFakeUser();
+  // user will always have the same data
+});
+```
+
+### Faker Fixture
+
+```typescript
+// fixtures/faker.fixture.ts
+import { test as base } from "@playwright/test";
+import { faker } from "@faker-js/faker";
+
+type FakerFixtures = {
+  fake: typeof faker;
+};
+
+export const test = base.extend<FakerFixtures>({
+  fake: async ({}, use, testInfo) => {
+    // Seed based on test name for reproducibility
+    faker.seed(testInfo.title.length);
+    await use(faker);
+  },
+});
+
+// Usage
+test("create user with fake data", async ({ page, fake }) => {
+  await page.goto("/signup");
+
+  await page.getByLabel("Name").fill(fake.person.fullName());
+  await page.getByLabel("Email").fill(fake.internet.email());
+  await page.getByLabel("Password").fill(fake.internet.password());
+
+  await page.getByRole("button", { name: "Sign Up" }).click();
+});
+```
+
+## Data-Driven Testing
+
+### test.each with Arrays
+
+```typescript
+const loginScenarios = [
+  { email: "user@example.com", password: "pass123", expected: "Dashboard" },
+  { email: "admin@example.com", password: "admin123", expected: "Admin Panel" },
+  {
+    email: "invalid@example.com",
+    password: "wrong",
+    expected: "Invalid credentials",
+  },
+];
+
+for (const { email, password, expected } of loginScenarios) {
+  test(`login with ${email}`, async ({ page }) => {
+    await page.goto("/login");
+    await page.getByLabel("Email").fill(email);
+    await page.getByLabel("Password").fill(password);
+    await page.getByRole("button", { name: "Sign In" }).click();
+
+    await expect(page.getByText(expected)).toBeVisible();
+  });
+}
+```
+
+### Parameterized Tests
+
+```typescript
+// data/checkout-scenarios.ts
+export const checkoutScenarios = [
+  {
+    name: "standard shipping",
+    shipping: "standard",
+    expectedDays: "5-7 business days",
+    expectedCost: "$5.99",
+  },
+  {
+    name: "express shipping",
+    shipping: "express",
+    expectedDays: "2-3 business days",
+    expectedCost: "$14.99",
+  },
+  {
+    name: "overnight shipping",
+    shipping: "overnight",
+    expectedDays: "Next business day",
+    expectedCost: "$29.99",
+  },
+];
+```
+
+```typescript
+import { checkoutScenarios } from "./data/checkout-scenarios";
+
+test.describe("shipping options", () => {
+  for (const scenario of checkoutScenarios) {
+    test(`checkout with ${scenario.name}`, async ({ page }) => {
+      await page.goto("/checkout");
+
+      await page.getByLabel(scenario.shipping, { exact: false }).check();
+
+      await expect(page.getByText(scenario.expectedDays)).toBeVisible();
+      await expect(page.getByText(scenario.expectedCost)).toBeVisible();
+    });
+  }
+});
+```
+
+### CSV/JSON Data Source
+
+```typescript
+import fs from "fs";
+
+interface TestCase {
+  input: string;
+  expected: string;
+}
+
+// Load test data from JSON
+const testCases: TestCase[] = JSON.parse(
+  fs.readFileSync("./data/search-tests.json", "utf-8"),
+);
+
+test.describe("search functionality", () => {
+  for (const { input, expected } of testCases) {
+    test(`search for "${input}"`, async ({ page }) => {
+      await page.goto("/search");
+      await page.getByLabel("Search").fill(input);
+      await page.getByLabel("Search").press("Enter");
+
+      await expect(page.getByText(expected)).toBeVisible();
+    });
+  }
+});
+```
+
+## Test Data Fixtures
+
+### Fixture with Factory
+
+```typescript
+// fixtures/data.fixture.ts
+import { test as base } from "@playwright/test";
+import { createUser, User } from "../factories/user.factory";
+import { createProduct, Product } from "../factories/product.factory";
+
+type DataFixtures = {
+  testUser: User;
+  testProducts: Product[];
+};
+
+export const test = base.extend<DataFixtures>({
+  testUser: async ({}, use) => {
+    const user = createUser({ name: "E2E Test User" });
+    await use(user);
+  },
+
+  testProducts: async ({}, use) => {
+    const products = [
+      createProduct({ name: "Test Product 1" }),
+      createProduct({ name: "Test Product 2" }),
+      createProduct({ name: "Test Product 3" }),
+    ];
+    await use(products);
+  },
+});
+
+// Usage
+test("add product to cart", async ({ page, testUser, testProducts }) => {
+  // Mock API with test data
+  await page.route("**/api/user", (route) => route.fulfill({ json: testUser }));
+  await page.route("**/api/products", (route) =>
+    route.fulfill({ json: testProducts }),
+  );
+
+  await page.goto("/products");
+  await expect(page.getByText(testProducts[0].name)).toBeVisible();
+});
+```
+
+## Database Seeding
+
+### API-Based Seeding
+
+```typescript
+// fixtures/seed.fixture.ts
+import { test as base, APIRequestContext } from "@playwright/test";
+import { createUser } from "../factories/user.factory";
+
+type SeedFixtures = {
+  seedUser: (overrides?: Partial<User>) => Promise<User>;
+  cleanupUsers: string[];
+};
+
+export const test = base.extend<SeedFixtures>({
+  cleanupUsers: [],
+
+  seedUser: async ({ request, cleanupUsers }, use) => {
+    await use(async (overrides = {}) => {
+      const userData = createUser(overrides);
+
+      const response = await request.post("/api/test/users", {
+        data: userData,
+      });
+      const user = await response.json();
+
+      cleanupUsers.push(user.id);
+      return user;
+    });
+  },
+
+  // Cleanup after test
+  cleanupUsers: async ({ request }, use) => {
+    const userIds: string[] = [];
+    await use(userIds);
+
+    // Delete all created users
+    for (const id of userIds) {
+      await request.delete(`/api/test/users/${id}`);
+    }
+  },
+});
+
+// Usage
+test("user profile page", async ({ page, seedUser }) => {
+  const user = await seedUser({ name: "John Doe" });
+
+  await page.goto(`/users/${user.id}`);
+  await expect(page.getByText("John Doe")).toBeVisible();
+});
+```
+
+### Transaction Rollback Seeding
+
+```typescript
+// fixtures/db.fixture.ts
+export const test = base.extend<{}, { db: DbTransaction }>({
+  db: [
+    async ({}, use) => {
+      const client = await pool.connect();
+      await client.query("BEGIN");
+
+      await use({
+        query: (sql: string, params?: any[]) => client.query(sql, params),
+        seed: async (table: string, data: object) => {
+          const keys = Object.keys(data);
+          const values = Object.values(data);
+          const placeholders = keys.map((_, i) => `$${i + 1}`);
+
+          const result = await client.query(
+            `INSERT INTO ${table} (${keys.join(", ")}) VALUES (${placeholders.join(", ")}) RETURNING *`,
+            values,
+          );
+          return result.rows[0];
+        },
+      });
+
+      await client.query("ROLLBACK");
+      client.release();
+    },
+    { scope: "test" },
+  ],
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                    | Problem                         | Solution                   |
+| ------------------------------- | ------------------------------- | -------------------------- |
+| Hardcoded test data             | Brittle, repetitive             | Use factories              |
+| Random data without seed        | Non-reproducible failures       | Seed faker per test        |
+| Shared mutable test data        | Tests interfere with each other | Create fresh data per test |
+| Manual data creation everywhere | Duplication, maintenance burden | Centralize in factories    |
+
+## Related References
+
+- **Fixtures**: See [fixtures-hooks.md](fixtures-hooks.md) for fixture patterns
+- **API Testing**: See [test-suite-structure.md](test-suite-structure.md) for API mocking
diff --git a/.cursor/skills/playwright-testing/core/test-suite-structure.md b/.cursor/skills/playwright-testing/core/test-suite-structure.md
new file mode 100644
index 0000000000..547a78afe3
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/test-suite-structure.md
@@ -0,0 +1,361 @@
+# Test Suite Structure
+
+## Table of Contents
+
+1. [Configuration](#configuration)
+2. [E2E Tests](#e2e-tests)
+3. [Component Tests](#component-tests)
+4. [API Tests](#api-tests)
+5. [Visual Regression Tests](#visual-regression-tests)
+6. [Directory Structure](#directory-structure)
+7. [Tagging & Filtering](#tagging--filtering)
+
+### Project Setup
+
+```bash
+npm init playwright@latest
+```
+
+## Configuration
+
+### Essential Configuration
+
+```typescript
+// playwright.config.ts
+import { defineConfig, devices } from "@playwright/test";
+
+export default defineConfig({
+  testDir: "./tests",
+  fullyParallel: true,
+  forbidOnly: !!process.env.CI,
+  retries: process.env.CI ? 2 : 0,
+  workers: process.env.CI ? 1 : undefined,
+  reporter: [["html"], ["list"]],
+  use: {
+    baseURL: "http://localhost:3000",
+    trace: "on-first-retry",
+    screenshot: "only-on-failure",
+  },
+  projects: [
+    { name: "setup", testMatch: /.*\.setup\.ts/ },
+    {
+      name: "chromium",
+      use: { ...devices["Desktop Chrome"] },
+      dependencies: ["setup"],
+    },
+  ],
+  webServer: {
+    command: "npm run dev",
+    url: "http://localhost:3000",
+    reuseExistingServer: !process.env.CI,
+  },
+});
+```
+
+## E2E Tests
+
+Full user journey tests through the browser.
+
+### Structure
+
+```typescript
+// tests/e2e/checkout.spec.ts
+import { test, expect } from "@playwright/test";
+
+test.describe("Checkout Flow", () => {
+  test.beforeEach(async ({ page }) => {
+    await page.goto("/products");
+  });
+
+  test("complete purchase as guest", async ({ page }) => {
+    // Add to cart
+    await page.getByRole("button", { name: "Add to Cart" }).first().click();
+    await expect(page.getByTestId("cart-count")).toHaveText("1");
+
+    // Go to checkout
+    await page.getByRole("link", { name: "Cart" }).click();
+    await page.getByRole("button", { name: "Checkout" }).click();
+
+    // Fill shipping
+    await page.getByLabel("Email").fill("guest@example.com");
+    await page.getByLabel("Address").fill("123 Test St");
+    await page.getByRole("button", { name: "Continue" }).click();
+
+    // Payment
+    await page.getByLabel("Card Number").fill("4242424242424242");
+    await page.getByRole("button", { name: "Pay Now" }).click();
+
+    // Confirmation
+    await expect(page.getByRole("heading")).toHaveText("Order Confirmed");
+  });
+
+  test("apply discount code", async ({ page }) => {
+    await page.getByRole("button", { name: "Add to Cart" }).first().click();
+    await page.getByRole("link", { name: "Cart" }).click();
+
+    await page.getByLabel("Discount Code").fill("SAVE10");
+    await page.getByRole("button", { name: "Apply" }).click();
+
+    await expect(page.getByText("10% discount applied")).toBeVisible();
+  });
+});
+```
+
+### Best Practices
+
+- Test critical user journeys
+- Keep tests independent
+- Use realistic data
+- Clean up test data in teardown
+
+## Component Tests
+
+Test individual components in isolation using Playwright Component Testing.
+
+```bash
+npm init playwright@latest -- --ct
+```
+
+For comprehensive component testing patterns including mounting, props, events, slots, mocking, and framework-specific examples, see **[component-testing.md](../testing-patterns/component-testing.md)**.
+
+## API Tests
+
+Test backend APIs without browser.
+
+### API Mocking Patterns
+
+For E2E tests that need to mock API responses:
+
+```typescript
+// Mock single endpoint
+test("displays mocked users", async ({ page }) => {
+  await page.route("**/api/users", (route) =>
+    route.fulfill({
+      status: 200,
+      json: [{ id: 1, name: "Test User" }],
+    })
+  );
+
+  await page.goto("/users");
+  await expect(page.getByText("Test User")).toBeVisible();
+});
+
+// Mock with different responses
+test("handles API errors", async ({ page }) => {
+  await page.route("**/api/users", (route) =>
+    route.fulfill({
+      status: 500,
+      json: { error: "Server error" },
+    })
+  );
+
+  await page.goto("/users");
+  await expect(page.getByText("Server error")).toBeVisible();
+});
+
+// Conditional mocking
+test("mocks based on request", async ({ page }) => {
+  await page.route("**/api/users", (route, request) => {
+    if (request.method() === "GET") {
+      route.fulfill({ json: [{ id: 1, name: "User" }] });
+    } else {
+      route.continue();
+    }
+  });
+});
+
+// Mock with delay (simulate slow network)
+test("handles slow API", async ({ page }) => {
+  await page.route("**/api/data", (route) =>
+    route.fulfill({
+      json: { data: "test" },
+      delay: 2000, // 2 second delay
+    })
+  );
+
+  await page.goto("/dashboard");
+  await expect(page.getByText("Loading...")).toBeVisible();
+  await expect(page.getByText("test")).toBeVisible();
+});
+```
+
+For advanced patterns (GraphQL mocking, HAR recording, request modification, network throttling), see **[network-advanced.md](../advanced/network-advanced.md)**.
+
+## Visual Regression Tests
+
+Compare screenshots to detect visual changes.
+
+### Basic Visual Test
+
+```typescript
+// tests/visual/homepage.spec.ts
+import { test, expect } from "@playwright/test";
+
+test("homepage visual", async ({ page }) => {
+  await page.goto("/");
+  await expect(page).toHaveScreenshot("homepage.png");
+});
+
+test("component visual", async ({ page }) => {
+  await page.goto("/components");
+
+  const button = page.getByRole("button", { name: "Primary" });
+  await expect(button).toHaveScreenshot("primary-button.png");
+});
+```
+
+### Visual Test Options
+
+```typescript
+test("dashboard visual", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  await expect(page).toHaveScreenshot("dashboard.png", {
+    fullPage: true, // Capture entire scrollable page
+    maxDiffPixels: 100, // Allow up to 100 different pixels
+    maxDiffPixelRatio: 0.01, // Or 1% difference
+    threshold: 0.2, // Pixel comparison threshold
+    animations: "disabled", // Disable animations
+    mask: [page.getByTestId("date")], // Mask dynamic content
+  });
+});
+```
+
+### Handling Dynamic Content
+
+```typescript
+test("page with dynamic content", async ({ page }) => {
+  await page.goto("/profile");
+
+  // Mask elements that change
+  await expect(page).toHaveScreenshot("profile.png", {
+    mask: [
+      page.getByTestId("timestamp"),
+      page.getByTestId("avatar"),
+      page.getByRole("img"),
+    ],
+  });
+});
+
+// Or hide elements via CSS
+test("page hiding dynamic elements", async ({ page }) => {
+  await page.goto("/profile");
+
+  await page.addStyleTag({
+    content: `
+      .dynamic-content { visibility: hidden !important; }
+      [data-testid="ad-banner"] { display: none !important; }
+    `,
+  });
+
+  await expect(page).toHaveScreenshot("profile-stable.png");
+});
+```
+
+### Visual Test Configuration
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  expect: {
+    toHaveScreenshot: {
+      maxDiffPixels: 50,
+      animations: "disabled",
+    },
+  },
+  projects: [
+    {
+      name: "visual-chrome",
+      use: {
+        ...devices["Desktop Chrome"],
+        viewport: { width: 1280, height: 720 },
+      },
+      testMatch: /.*visual.*\.spec\.ts/,
+    },
+  ],
+});
+```
+
+### Update Snapshots
+
+```bash
+# Update all snapshots
+npx playwright test --update-snapshots
+
+# Update specific test
+npx playwright test homepage.spec.ts --update-snapshots
+```
+
+## Directory Structure
+
+```
+tests/
+├── e2e/                    # End-to-end tests
+│   ├── auth.spec.ts
+│   ├── checkout.spec.ts
+│   └── dashboard.spec.ts
+├── component/              # Component tests
+│   ├── Button.spec.tsx
+│   └── Modal.spec.tsx
+├── api/                    # API tests
+│   ├── users.spec.ts
+│   └── products.spec.ts
+├── visual/                 # Visual regression tests
+│   └── homepage.spec.ts
+├── fixtures/               # Custom fixtures
+│   ├── auth.fixture.ts
+│   └── api.fixture.ts
+└── pages/                  # Page objects
+    ├── login.page.ts
+    └── dashboard.page.ts
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                          | Problem                            | Solution                  |
+| ------------------------------------- | ---------------------------------- | ------------------------- |
+| Long test files                       | Hard to maintain, slow to navigate | Split by feature, use POM |
+| Tests depend on execution order       | Flaky, hard to debug               | Keep tests independent    |
+| Testing multiple features in one test | Hard to debug failures             | One feature per test      |
+
+## Related References
+
+- **Component Testing**: See [component-testing.md](../testing-patterns/component-testing.md) for comprehensive CT patterns
+- **Projects**: See [projects-dependencies.md](projects-dependencies.md) for project-based filtering
+- **Page Objects**: See [page-object-model.md](page-object-model.md) for organizing page interactions
+- **Test Data**: See [fixtures-hooks.md](fixtures-hooks.md) for managing test data
+
+## Tagging & Filtering
+
+### Using Tags
+
+```typescript
+test("user login @smoke @auth", async ({ page }) => {
+  // ...
+});
+
+test("checkout flow @e2e @critical", async ({ page }) => {
+  // ...
+});
+
+test.describe("API tests @api", () => {
+  test("create user", async ({ request }) => {
+    // ...
+  });
+});
+```
+
+### Running Tagged Tests
+
+```bash
+# Run smoke tests
+npx playwright test --grep @smoke
+
+# Run all except slow tests
+npx playwright test --grep-invert @slow
+
+# Combine tags
+npx playwright test --grep "@smoke|@critical"
+```
+
+For project-based filtering and advanced project configuration, see **[projects-dependencies.md](projects-dependencies.md)**.
diff --git a/.cursor/skills/playwright-testing/core/test-tags.md b/.cursor/skills/playwright-testing/core/test-tags.md
new file mode 100644
index 0000000000..6d81cd9649
--- /dev/null
+++ b/.cursor/skills/playwright-testing/core/test-tags.md
@@ -0,0 +1,298 @@
+# Test Tags
+
+## Table of Contents
+
+1. [Basic Tagging](#basic-tagging)
+2. [Tagging Describe Blocks](#tagging-describe-blocks)
+3. [Running Tagged Tests](#running-tagged-tests)
+4. [Filtering by Tags](#filtering-by-tags)
+5. [Configuration-Based Filtering](#configuration-based-filtering)
+6. [Tag Organization Patterns](#tag-organization-patterns)
+7. [Common Tag Categories](#common-tag-categories)
+8. [Anti-Patterns to Avoid](#anti-patterns-to-avoid)
+9. [Related References](#related-references)
+
+## Basic Tagging
+
+### Tag via Details Object
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+test(
+  "test login page",
+  {
+    tag: "@fast",
+  },
+  async ({ page }) => {
+    await page.goto("/login");
+    await expect(page.getByRole("heading")).toBeVisible();
+  }
+);
+
+test(
+  "test dashboard",
+  {
+    tag: "@slow",
+  },
+  async ({ page }) => {
+    await page.goto("/dashboard");
+    await expect(page.getByTestId("charts")).toBeVisible();
+  }
+);
+```
+
+### Tag via Title (not recommended)
+
+```typescript
+test("test full report @slow", async ({ page }) => {
+  await page.goto("/reports/full");
+  await expect(page.getByText("Report loaded")).toBeVisible();
+});
+
+test("quick validation @fast @smoke", async ({ page }) => {
+  await page.goto("/");
+  await expect(page.locator("body")).toBeVisible();
+});
+```
+
+## Tagging Describe Blocks
+
+### Tag All Tests in Group
+
+```typescript
+test.describe(
+  "report tests",
+  {
+    tag: "@report",
+  },
+  () => {
+    test("test report header", async ({ page }) => {
+      // Inherits @report tag
+    });
+
+    test("test report footer", async ({ page }) => {
+      // Inherits @report tag
+    });
+  }
+);
+```
+
+### Combine Group and Test Tags
+
+```typescript
+test.describe(
+  "admin features",
+  {
+    tag: "@admin",
+  },
+  () => {
+    test("admin dashboard", async ({ page }) => {
+      // Has @admin tag
+    });
+
+    test(
+      "admin settings",
+      {
+        tag: ["@slow", "@critical"],
+      },
+      async ({ page }) => {
+        // Has @admin, @slow, @critical tags
+      }
+    );
+  }
+);
+```
+
+## Running Tagged Tests
+
+### Run Tests with Specific Tag
+
+```bash
+# Run all @fast tests
+npx playwright test --grep @fast
+```
+
+### Exclude Tests with Tag
+
+```bash
+# Run all tests except @slow
+npx playwright test --grep-invert @slow
+```
+
+## Filtering by Tags
+
+### Logical OR (Either Tag)
+
+```bash
+# Run tests with @fast OR @smoke
+npx playwright test --grep "@fast|@smoke"
+```
+
+### Logical AND (Both Tags)
+
+```bash
+# Run tests with both @fast AND @critical
+npx playwright test --grep "(?=.*@fast)(?=.*@critical)"
+```
+
+### Complex Patterns
+
+```bash
+# Run @e2e tests that are also @critical
+npx playwright test --grep "(?=.*@e2e)(?=.*@critical)"
+
+# Run @api tests excluding @slow
+npx playwright test --grep "@api" --grep-invert "@slow"
+```
+
+## Configuration-Based Filtering
+
+### Filter in playwright.config.ts
+
+```typescript
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  grep: /@smoke/,
+  grepInvert: /@flaky/,
+});
+```
+
+### Project-Specific Tags
+
+```typescript
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  projects: [
+    {
+      name: "smoke",
+      grep: /@smoke/,
+    },
+    {
+      name: "regression",
+      grepInvert: /@smoke/,
+    },
+    {
+      name: "critical-only",
+      grep: /@critical/,
+    },
+  ],
+});
+```
+
+### Environment-Based Filtering
+
+```typescript
+import { defineConfig } from "@playwright/test";
+
+const isCI = !!process.env.CI;
+
+export default defineConfig({
+  grep: isCI ? /@smoke|@critical/ : undefined,
+  grepInvert: isCI ? /@flaky/ : undefined,
+});
+```
+
+## Tag Organization Patterns
+
+### By Test Type
+
+```typescript
+// Smoke tests - quick validation
+test("homepage loads", { tag: "@smoke" }, async ({ page }) => {});
+test("login works", { tag: "@smoke" }, async ({ page }) => {});
+
+// Regression tests - comprehensive
+test("full checkout flow", { tag: "@regression" }, async ({ page }) => {});
+test("all payment methods", { tag: "@regression" }, async ({ page }) => {});
+
+// E2E tests - user journeys
+test("complete user journey", { tag: "@e2e" }, async ({ page }) => {});
+```
+
+### By Priority
+
+```typescript
+test(
+  "payment processing",
+  {
+    tag: ["@critical", "@p0"],
+  },
+  async ({ page }) => {}
+);
+
+test(
+  "user preferences",
+  {
+    tag: ["@p1"],
+  },
+  async ({ page }) => {}
+);
+
+test(
+  "theme customization",
+  {
+    tag: ["@p2"],
+  },
+  async ({ page }) => {}
+);
+```
+
+### By Feature Area
+
+```typescript
+test.describe(
+  "authentication",
+  {
+    tag: "@auth",
+  },
+  () => {
+    test("login @smoke", async ({ page }) => {});
+    test("logout", async ({ page }) => {});
+    test("password reset @slow", async ({ page }) => {});
+  }
+);
+
+test.describe(
+  "payments",
+  {
+    tag: "@payments",
+  },
+  () => {
+    test("credit card @critical", async ({ page }) => {});
+    test("paypal @critical", async ({ page }) => {});
+  }
+);
+```
+
+## Common Tag Categories
+
+| Category        | Tags                                          | Purpose                       |
+| --------------- | --------------------------------------------- | ----------------------------- |
+| **Speed**       | `@fast`, `@slow`                              | Execution time classification |
+| **Priority**    | `@critical`, `@p0`, `@p1`, `@p2`              | Business importance           |
+| **Type**        | `@smoke`, `@regression`, `@e2e`               | Test suite categorization     |
+| **Feature**     | `@auth`, `@payments`, `@settings`             | Feature area grouping         |
+| **Pipeline**    | `@pr`, `@nightly`, `@release`                 | CI/CD execution timing        |
+| **Status**      | `@flaky`, `@wip`, `@quarantine`               | Test health tracking          |
+| **Environment** | `@local`, `@staging`, `@prod`                 | Target environment            |
+| **Team**        | `@team-frontend`, `@team-backend`, `@team-qa` | Team assignment               |
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern             | Problem                  | Solution                                       |
+| ------------------------ | ------------------------ | ---------------------------------------------- |
+| Too many tags per test   | Hard to maintain         | Limit to 2-3 relevant tags                     |
+| Inconsistent naming      | Confusing filtering      | Establish naming conventions                   |
+| Missing `@` prefix       | Tags won't match filters | Always prefix with `@`                         |
+| Overlapping tag meanings | Ambiguous categorization | Define clear tag semantics                     |
+| Not using tags           | Can't selectively run    | Tag by type, priority, or feature              |
+| Tags in test title       | Hard to parse/filter     | Use the details object for tags, not the title |
+
+## Related References
+
+- **Test Organization**: See [test-suite-structure.md](test-suite-structure.md) for structuring tests
+- **Annotations**: See [annotations.md](annotations.md) for skip, fixme, fail, slow
+- **CI/CD Integration**: See [ci-cd.md](../infrastructure-ci-cd/ci-cd.md) for pipeline setup
diff --git a/.cursor/skills/playwright-testing/debugging/console-errors.md b/.cursor/skills/playwright-testing/debugging/console-errors.md
new file mode 100644
index 0000000000..d1e9882c39
--- /dev/null
+++ b/.cursor/skills/playwright-testing/debugging/console-errors.md
@@ -0,0 +1,420 @@
+# Browser Console & JavaScript Error Handling
+
+## Table of Contents
+
+1. [Capturing Console Messages](#capturing-console-messages)
+2. [Failing on Console Errors](#failing-on-console-errors)
+3. [JavaScript Error Detection](#javascript-error-detection)
+4. [Monitoring Warnings](#monitoring-warnings)
+5. [Console Fixtures](#console-fixtures)
+
+## Capturing Console Messages
+
+### Basic Console Capture
+
+```typescript
+test("capture console logs", async ({ page }) => {
+  const logs: string[] = [];
+
+  page.on("console", (msg) => {
+    logs.push(`${msg.type()}: ${msg.text()}`);
+  });
+
+  await page.goto("/");
+
+  // Check what was logged
+  console.log("Captured logs:", logs);
+});
+```
+
+### Capture by Type
+
+```typescript
+test("capture specific console types", async ({ page }) => {
+  const errors: string[] = [];
+  const warnings: string[] = [];
+  const infos: string[] = [];
+
+  page.on("console", (msg) => {
+    switch (msg.type()) {
+      case "error":
+        errors.push(msg.text());
+        break;
+      case "warning":
+        warnings.push(msg.text());
+        break;
+      case "info":
+      case "log":
+        infos.push(msg.text());
+        break;
+    }
+  });
+
+  await page.goto("/dashboard");
+
+  expect(errors).toHaveLength(0);
+  console.log("Warnings:", warnings);
+});
+```
+
+### Capture with Stack Trace
+
+```typescript
+test("capture errors with location", async ({ page }) => {
+  const errors: { message: string; location?: string }[] = [];
+
+  page.on("console", async (msg) => {
+    if (msg.type() === "error") {
+      const location = msg.location();
+      errors.push({
+        message: msg.text(),
+        location: location
+          ? `${location.url}:${location.lineNumber}`
+          : undefined,
+      });
+    }
+  });
+
+  await page.goto("/buggy-page");
+
+  // Log errors with source location
+  errors.forEach((e) => {
+    console.log(`Error: ${e.message}`);
+    if (e.location) console.log(`  at ${e.location}`);
+  });
+});
+```
+
+## Failing on Console Errors
+
+### Fail Test on Any Error
+
+```typescript
+test("no console errors allowed", async ({ page }) => {
+  const errors: string[] = [];
+
+  page.on("console", (msg) => {
+    if (msg.type() === "error") {
+      errors.push(msg.text());
+    }
+  });
+
+  await page.goto("/");
+  await page.getByRole("button", { name: "Load Data" }).click();
+
+  // Fail if any console errors
+  expect(errors, `Console errors found:\n${errors.join("\n")}`).toHaveLength(0);
+});
+```
+
+### Fail with Allowed Exceptions
+
+```typescript
+test("no unexpected console errors", async ({ page }) => {
+  const allowedErrors = [
+    /Failed to load resource.*favicon/,
+    /ResizeObserver loop/,
+  ];
+
+  const unexpectedErrors: string[] = [];
+
+  page.on("console", (msg) => {
+    if (msg.type() === "error") {
+      const text = msg.text();
+      const isAllowed = allowedErrors.some((pattern) => pattern.test(text));
+      if (!isAllowed) {
+        unexpectedErrors.push(text);
+      }
+    }
+  });
+
+  await page.goto("/");
+
+  expect(
+    unexpectedErrors,
+    `Unexpected console errors:\n${unexpectedErrors.join("\n")}`,
+  ).toHaveLength(0);
+});
+```
+
+### Auto-Fail Fixture
+
+```typescript
+// fixtures/console.fixture.ts
+type ConsoleFixtures = {
+  failOnConsoleError: void;
+};
+
+export const test = base.extend<ConsoleFixtures>({
+  failOnConsoleError: [
+    async ({ page }, use, testInfo) => {
+      const errors: string[] = [];
+
+      page.on("console", (msg) => {
+        if (msg.type() === "error") {
+          errors.push(msg.text());
+        }
+      });
+
+      await use();
+
+      // After test, check for errors
+      if (errors.length > 0) {
+        testInfo.annotations.push({
+          type: "console-errors",
+          description: errors.join("\n"),
+        });
+        throw new Error(`Console errors detected:\n${errors.join("\n")}`);
+      }
+    },
+    { auto: true }, // Runs for every test
+  ],
+});
+```
+
+## JavaScript Error Detection
+
+### Catch Uncaught Exceptions
+
+```typescript
+test("no uncaught exceptions", async ({ page }) => {
+  const pageErrors: Error[] = [];
+
+  page.on("pageerror", (error) => {
+    pageErrors.push(error);
+  });
+
+  await page.goto("/");
+  await page.getByRole("button", { name: "Trigger Action" }).click();
+
+  expect(
+    pageErrors,
+    `Uncaught exceptions:\n${pageErrors.map((e) => e.message).join("\n")}`,
+  ).toHaveLength(0);
+});
+```
+
+### Capture Error Details
+
+```typescript
+test("capture JS error details", async ({ page }) => {
+  const errors: { message: string; stack?: string }[] = [];
+
+  page.on("pageerror", (error) => {
+    errors.push({
+      message: error.message,
+      stack: error.stack,
+    });
+  });
+
+  await page.goto("/error-page");
+
+  if (errors.length > 0) {
+    console.log("JavaScript errors:");
+    errors.forEach((e) => {
+      console.log(`  Message: ${e.message}`);
+      console.log(`  Stack: ${e.stack}`);
+    });
+  }
+});
+```
+
+### Test Error Boundary Triggers
+
+```typescript
+test("error boundary catches render error", async ({ page }) => {
+  let errorCaught = false;
+
+  page.on("pageerror", () => {
+    // Note: React error boundaries catch errors before they become pageerrors
+    // This would only fire for unhandled errors
+    errorCaught = true;
+  });
+
+  // Trigger component error via props
+  await page.route(
+    "**/api/data",
+    (route) => route.fulfill({ json: null }), // Will cause "cannot read property of null"
+  );
+
+  await page.goto("/dashboard");
+
+  // Error boundary should show fallback, not crash
+  await expect(page.getByText("Something went wrong")).toBeVisible();
+  expect(errorCaught).toBe(false); // Error was caught by boundary
+});
+```
+
+## Monitoring Warnings
+
+### Capture Deprecation Warnings
+
+```typescript
+test("no deprecation warnings", async ({ page }) => {
+  const deprecations: string[] = [];
+
+  page.on("console", (msg) => {
+    const text = msg.text();
+    if (
+      msg.type() === "warning" &&
+      (text.includes("deprecated") || text.includes("Deprecation"))
+    ) {
+      deprecations.push(text);
+    }
+  });
+
+  await page.goto("/");
+
+  if (deprecations.length > 0) {
+    console.warn("Deprecation warnings found:");
+    deprecations.forEach((d) => console.warn(`  - ${d}`));
+  }
+
+  // Optionally fail
+  // expect(deprecations).toHaveLength(0);
+});
+```
+
+### React Development Warnings
+
+```typescript
+test("no React warnings", async ({ page }) => {
+  const reactWarnings: string[] = [];
+
+  page.on("console", (msg) => {
+    const text = msg.text();
+    if (
+      msg.type() === "warning" &&
+      (text.includes("Warning:") || text.includes("React"))
+    ) {
+      reactWarnings.push(text);
+    }
+  });
+
+  await page.goto("/");
+
+  // Common React warnings to check
+  const criticalWarnings = reactWarnings.filter(
+    (w) =>
+      w.includes("Each child in a list should have a unique") ||
+      w.includes("Cannot update a component") ||
+      w.includes("Can't perform a React state update"),
+  );
+
+  expect(
+    criticalWarnings,
+    `React warnings:\n${criticalWarnings.join("\n")}`,
+  ).toHaveLength(0);
+});
+```
+
+## Console Fixtures
+
+### Comprehensive Console Fixture
+
+```typescript
+// fixtures/console.fixture.ts
+type ConsoleMessage = {
+  type: string;
+  text: string;
+  location?: { url: string; line: number };
+  timestamp: number;
+};
+
+type ConsoleFixtures = {
+  consoleMessages: ConsoleMessage[];
+  getConsoleErrors: () => ConsoleMessage[];
+  getConsoleWarnings: () => ConsoleMessage[];
+  assertNoErrors: (allowedPatterns?: RegExp[]) => void;
+};
+
+export const test = base.extend<ConsoleFixtures>({
+  consoleMessages: async ({ page }, use) => {
+    const messages: ConsoleMessage[] = [];
+
+    page.on("console", (msg) => {
+      const location = msg.location();
+      messages.push({
+        type: msg.type(),
+        text: msg.text(),
+        location: location
+          ? { url: location.url, line: location.lineNumber }
+          : undefined,
+        timestamp: Date.now(),
+      });
+    });
+
+    await use(messages);
+  },
+
+  getConsoleErrors: async ({ consoleMessages }, use) => {
+    await use(() => consoleMessages.filter((m) => m.type === "error"));
+  },
+
+  getConsoleWarnings: async ({ consoleMessages }, use) => {
+    await use(() => consoleMessages.filter((m) => m.type === "warning"));
+  },
+
+  assertNoErrors: async ({ getConsoleErrors }, use) => {
+    await use((allowedPatterns = []) => {
+      const errors = getConsoleErrors();
+      const unexpected = errors.filter(
+        (e) => !allowedPatterns.some((p) => p.test(e.text)),
+      );
+
+      if (unexpected.length > 0) {
+        throw new Error(
+          `Unexpected console errors:\n${unexpected.map((e) => e.text).join("\n")}`,
+        );
+      }
+    });
+  },
+});
+
+// Usage
+test("page loads without errors", async ({ page, assertNoErrors }) => {
+  await page.goto("/dashboard");
+  await page.getByRole("button", { name: "Load" }).click();
+
+  assertNoErrors([/favicon/]); // Allow favicon errors
+});
+```
+
+### Attach Console to Report
+
+```typescript
+test("capture console for debugging", async ({ page }, testInfo) => {
+  const logs: string[] = [];
+
+  page.on("console", (msg) => {
+    logs.push(`[${msg.type()}] ${msg.text()}`);
+  });
+
+  page.on("pageerror", (error) => {
+    logs.push(`[EXCEPTION] ${error.message}`);
+  });
+
+  await page.goto("/");
+  // ... test actions
+
+  // Attach console log to test report
+  await testInfo.attach("console-log", {
+    body: logs.join("\n"),
+    contentType: "text/plain",
+  });
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern               | Problem                    | Solution                    |
+| -------------------------- | -------------------------- | --------------------------- |
+| Ignoring console errors    | Bugs go unnoticed          | Check for errors in tests   |
+| Too strict error checking  | Tests fail on minor issues | Allow known/expected errors |
+| Not capturing stack traces | Hard to debug              | Include location info       |
+| Checking only at end       | Miss errors during actions | Capture continuously        |
+
+## Related References
+
+- **Debugging**: See [debugging.md](debugging.md) for troubleshooting
+- **Error Testing**: See [error-testing.md](error-testing.md) for error scenarios
diff --git a/.cursor/skills/playwright-testing/debugging/debugging.md b/.cursor/skills/playwright-testing/debugging/debugging.md
new file mode 100644
index 0000000000..187e4879e7
--- /dev/null
+++ b/.cursor/skills/playwright-testing/debugging/debugging.md
@@ -0,0 +1,504 @@
+# Debugging & Troubleshooting
+
+## Table of Contents
+
+1. [Debug Tools](#debug-tools)
+2. [Trace Viewer](#trace-viewer)
+3. [Identifying Flaky Tests](#identifying-flaky-tests)
+4. [Debugging Network Issues](#debugging-network-issues)
+5. [Debugging in CI](#debugging-in-ci)
+6. [Debugging Authentication](#debugging-authentication)
+7. [Debugging Screenshots](#debugging-screenshots)
+8. [Common Issues](#common-issues)
+9. [Logging](#logging)
+
+## Debug Tools
+
+### Playwright Inspector
+
+```bash
+# Run with inspector
+PWDEBUG=1 npx playwright test
+# Or specific test
+PWDEBUG=1 npx playwright test login.spec.ts
+```
+
+Features:
+
+- Step through test actions
+- Pick locators visually
+- Inspect DOM state
+- Edit and re-run
+
+### Headed Mode
+
+```bash
+# Run with visible browser
+npx playwright test --headed
+
+# Interactive debugging (headed, paused, step-through)
+npx playwright test --debug
+```
+
+You can also set `slowMo` to add an `N` ms delay per action, making test execution easier to follow while debugging.
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  use: {
+    launchOptions: {
+      slowMo: 500,
+    },
+  },
+});
+```
+
+### UI Mode
+
+```bash
+# Interactive test runner
+npx playwright test --ui
+```
+
+Features:
+
+- Watch mode
+- Test timeline
+- DOM snapshots
+- Network logs
+- Console logs
+
+### Debug in Code
+
+```typescript
+test("debug example", async ({ page }) => {
+  await page.goto("/");
+
+  // Pause and open inspector
+  await page.pause();
+
+  // Continue test...
+  await page.click("button");
+});
+```
+
+## Trace Viewer
+
+### Enable Traces
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  use: {
+    trace: "on-first-retry", // Record on retry
+    // trace: 'on',                 // Always record
+    // trace: 'retain-on-failure',  // Keep only failures
+  },
+});
+```
+
+### View Traces
+
+```bash
+# Open trace file
+npx playwright show-trace trace.zip
+
+# From test-results
+npx playwright show-trace test-results/test-name/trace.zip
+```
+
+### Trace Contents
+
+- Screenshots at each action
+- DOM snapshots
+- Network requests/responses
+- Console logs
+- Action timeline
+- Source code
+
+### Programmatic Traces
+
+```typescript
+test("manual trace", async ({ page, context }) => {
+  await context.tracing.start({ screenshots: true, snapshots: true });
+
+  await page.goto("/");
+  await page.click("button");
+
+  await context.tracing.stop({ path: "trace.zip" });
+});
+```
+
+## Identifying Flaky Tests
+
+If a test fails intermittently, it's likely flaky. Quick checks:
+
+| Behavior                               | Likely Cause                  | Next Step                              |
+| -------------------------------------- | ----------------------------- | -------------------------------------- |
+| Fails sometimes, passes other times    | Flaky - timing/race condition | [flaky-tests.md](flaky-tests.md)       |
+| Fails only with multiple workers       | Flaky - parallelism/isolation | [flaky-tests.md](flaky-tests.md)       |
+| Fails only in CI                       | Environment difference        | [CI Debugging](#debugging-in-ci) below |
+| Always fails                           | Bug in test or app            | Debug with tools above                 |
+| Always passes locally, always fails CI | CI-specific issue             | [ci-cd.md](../infrastructure-ci-cd/ci-cd.md)                   |
+
+> **For flaky test detection commands, root cause analysis, and fixing strategies**, see [flaky-tests.md](flaky-tests.md).
+
+## Debugging Network Issues
+
+### Monitor All Requests
+
+```typescript
+test("debug network", async ({ page }) => {
+  const requests: string[] = [];
+  const failures: string[] = [];
+
+  page.on("request", (req) => requests.push(`>> ${req.method()} ${req.url()}`));
+  page.on("requestfinished", (req) => {
+    const resp = req.response();
+    requests.push(`<< ${resp?.status()} ${req.url()}`);
+  });
+  page.on("requestfailed", (req) => {
+    failures.push(`FAILED: ${req.url()} - ${req.failure()?.errorText}`);
+  });
+
+  await page.goto("/dashboard");
+
+  // Log summary
+  console.log("Requests:", requests.length);
+  if (failures.length) console.log("Failures:", failures);
+});
+```
+
+### Wait for Specific API Response
+
+When debugging network-dependent issues, wait for specific API responses instead of arbitrary timeouts.
+
+```typescript
+// Start waiting BEFORE triggering the request
+const responsePromise = page.waitForResponse(
+  (resp) => resp.url().includes("/api/data") && resp.status() === 200,
+);
+await page.getByRole("button", { name: "Load" }).click();
+const response = await responsePromise;
+console.log("Status:", response.status());
+```
+
+> **For comprehensive waiting patterns** (navigation, element state, network, polling), see [assertions-waiting.md](../core/assertions-waiting.md#waiting-strategies).
+
+### Debug Slow Requests
+
+```typescript
+test("find slow requests", async ({ page }) => {
+  page.on("requestfinished", (request) => {
+    const timing = request.timing();
+    const total = timing.responseEnd - timing.requestStart;
+    if (total > 1000) {
+      console.log(`SLOW (${total}ms): ${request.url()}`);
+    }
+  });
+
+  await page.goto("/");
+});
+```
+
+## Debugging in CI
+
+### Simulate CI Locally
+
+```bash
+# Run in headless mode like CI
+CI=true npx playwright test
+
+# Match CI browser versions
+npx playwright install --with-deps
+
+# Run in Docker (same as CI)
+docker run --rm -v $(pwd):/work -w /work \
+  mcr.microsoft.com/playwright:v1.40.0-jammy \
+  npx playwright test
+```
+
+### CI-Specific Configuration
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  // More artifacts in CI for debugging
+  use: {
+    trace: process.env.CI ? "on-first-retry" : "off",
+    video: process.env.CI ? "retain-on-failure" : "off",
+    screenshot: process.env.CI ? "only-on-failure" : "off",
+  },
+
+  // More retries in CI (but investigate failures!)
+  retries: process.env.CI ? 2 : 0,
+});
+```
+
+### Debug CI Environment
+
+```typescript
+test("CI environment check", async ({ page }, testInfo) => {
+  console.log("CI:", process.env.CI);
+  console.log("Project:", testInfo.project.name);
+  console.log("Worker:", testInfo.workerIndex);
+  console.log("Retry:", testInfo.retry);
+  console.log("Base URL:", testInfo.project.use.baseURL);
+
+  // Check viewport
+  const viewport = page.viewportSize();
+  console.log("Viewport:", viewport);
+});
+```
+
+## Debugging Authentication
+
+```typescript
+test("debug auth", async ({ page, context }) => {
+  // Inspect current storage state
+  const storage = await context.storageState();
+  console.log(
+    "Cookies:",
+    storage.cookies.map((c) => c.name),
+  );
+
+  // Check if auth cookies are present
+  const cookies = await context.cookies();
+  const authCookie = cookies.find((c) => c.name.includes("session"));
+  console.log("Auth cookie:", authCookie ? "present" : "MISSING");
+
+  await page.goto("/protected");
+
+  // Check if redirected to login (auth failed)
+  if (page.url().includes("/login")) {
+    console.error("Auth failed - redirected to login");
+    // Save state for inspection
+    await context.storageState({ path: "debug-auth.json" });
+  }
+});
+```
+
+## Debugging Screenshots
+
+### Compare Visual State
+
+```typescript
+test("visual debug", async ({ page }, testInfo) => {
+  await page.goto("/");
+
+  // Screenshot before action
+  await page.screenshot({
+    path: testInfo.outputPath("before.png"),
+    fullPage: true,
+  });
+
+  await page.getByRole("button", { name: "Open Menu" }).click();
+
+  // Screenshot after action
+  await page.screenshot({
+    path: testInfo.outputPath("after.png"),
+    fullPage: true,
+  });
+
+  // Attach to report
+  await testInfo.attach("before", {
+    path: testInfo.outputPath("before.png"),
+    contentType: "image/png",
+  });
+});
+```
+
+### Screenshot Specific Element
+
+```typescript
+test("element screenshot", async ({ page }) => {
+  await page.goto("/");
+
+  const element = page.getByTestId("problem-area");
+
+  // Screenshot just the element
+  await element.screenshot({ path: "element-debug.png" });
+
+  // Highlight element in full page screenshot
+  await element.evaluate((el) => (el.style.border = "3px solid red"));
+  await page.screenshot({ path: "highlighted.png" });
+});
+```
+
+## Common Issues
+
+### Element Not Found
+
+```typescript
+// Debug: Check if element exists
+console.log(await page.getByRole("button").count());
+
+// Debug: Log all buttons
+const buttons = await page.getByRole("button").all();
+for (const button of buttons) {
+  console.log(await button.textContent());
+}
+
+// Debug: Screenshot before action
+await page.screenshot({ path: "debug.png" });
+await page.getByRole("button").click();
+```
+
+### Timeout Issues
+
+```typescript
+// Increase timeout for slow operations
+await expect(page.getByText("Loaded")).toBeVisible({ timeout: 30000 });
+
+// Global timeout increase
+test.setTimeout(60000);
+
+// Check what's blocking
+test("debug timeout", async ({ page }) => {
+  await page.goto("/slow-page");
+
+  // Log network activity
+  page.on("request", (request) => console.log(">>", request.url()));
+  page.on("response", (response) =>
+    console.log("<<", response.url(), response.status()),
+  );
+});
+```
+
+### Selector Issues
+
+```typescript
+// Debug: Highlight element
+await page.getByRole("button").highlight();
+
+// Debug: Evaluate selector in browser console
+// Run in Inspector console:
+// playwright.locator('button').first().highlight()
+
+// Debug: Get element info
+const element = page.getByRole("button");
+console.log("Count:", await element.count());
+console.log("Visible:", await element.isVisible());
+console.log("Enabled:", await element.isEnabled());
+```
+
+### Frame Issues
+
+```typescript
+// Debug: List all frames
+for (const frame of page.frames()) {
+  console.log("Frame:", frame.url());
+}
+
+// Debug: Check if element is in iframe
+const frame = page.frameLocator("iframe").first();
+console.log(await frame.getByRole("button").count());
+```
+
+## Logging
+
+### Capture Browser Console
+
+```typescript
+test("with logging", async ({ page }) => {
+  page.on("console", (msg) => console.log("Browser:", msg.text()));
+  page.on("pageerror", (error) => console.log("Page error:", error.message));
+  await page.goto("/");
+});
+```
+
+> **For comprehensive console error handling** (fail on errors, allowed patterns, fixtures), see [console-errors.md](console-errors.md).
+
+### Custom Test Attachments
+
+```typescript
+test("with attachments", async ({ page }, testInfo) => {
+  // Attach screenshot to report
+  const screenshot = await page.screenshot();
+  await testInfo.attach("screenshot", {
+    body: screenshot,
+    contentType: "image/png",
+  });
+
+  // Attach logs or data
+  await testInfo.attach("logs", {
+    body: "Custom log data",
+    contentType: "text/plain",
+  });
+
+  // Use testInfo for output paths
+  const outputPath = testInfo.outputPath("debug-file.json");
+});
+```
+
+## Troubleshooting Checklist
+
+### By Symptom
+
+| Symptom                                       | Common Causes                                                | Quick Fixes                                                         | Reference                                                                  |
+| --------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------------- | -------------------------------------------------------------------------- |
+| **Element not found**                         | Wrong selector, element not visible, in iframe, timing issue | Check locator with Inspector, wait for visibility, use frameLocator | [locators.md](../core/locators.md), [assertions-waiting.md](../core/assertions-waiting.md) |
+| **Timeout errors**                            | Slow network, heavy page load, waiting for wrong condition   | Increase timeout, wait for specific response, check network tab     | [assertions-waiting.md](../core/assertions-waiting.md)                             |
+| **Flaky tests**                               | Race conditions, shared state, timing dependencies           | See comprehensive flaky test guide                                  | [flaky-tests.md](flaky-tests.md)                                           |
+| **Tests pass locally, fail in CI**            | Environment differences, missing dependencies, timing        | Simulate CI locally, check CI logs, verify environment vars         | [ci-cd.md](../infrastructure-ci-cd/ci-cd.md), [flaky-tests.md](flaky-tests.md)                     |
+| **Slow test execution**                       | Not parallelized, heavy network calls, unnecessary waits     | Enable parallelization, mock APIs, optimize waits                   | [performance.md](../infrastructure-ci-cd/performance.md)                                           |
+| **Selector works in browser but not in test** | Element not attached, wrong context, dynamic content         | Use auto-waiting, check iframe, verify element state                | [locators.md](../core/locators.md)                                                 |
+| **Test fails on retry**                       | Non-deterministic data, external dependencies                | Use test data fixtures, mock external services                      | [fixtures-hooks.md](../core/fixtures-hooks.md)                                     |
+
+### Step-by-Step Debugging Process
+
+1. **Reproduce the issue**
+
+   ```bash
+   # Run with trace enabled
+   npx playwright test tests/failing.spec.ts --trace on
+
+   # If intermittent, run multiple times
+   npx playwright test --repeat-each=10
+   ```
+
+2. **Inspect the failure**
+
+   ```bash
+   # View trace
+   npx playwright show-trace test-results/path-to-trace.zip
+
+   # Run in headed mode to watch
+   npx playwright test --headed
+
+   # Use inspector for step-by-step
+   PWDEBUG=1 npx playwright test
+   ```
+
+3. **Isolate the problem**
+
+   ```typescript
+   // Add debugging points
+   await page.pause();
+
+   // Log element state
+   console.log("Element count:", await page.getByRole("button").count());
+   console.log("Element visible:", await page.getByRole("button").isVisible());
+
+   // Take screenshot at failure point
+   await page.screenshot({ path: "debug.png" });
+   ```
+
+4. **Check related areas**
+   - Network requests: Are API calls completing? (see [Debugging Network Issues](#debugging-network-issues))
+   - Timing: Is auto-waiting working correctly?
+   - State: Is the test isolated? (see [flaky-tests.md](flaky-tests.md))
+   - Environment: Does it work locally but fail in CI? (see [Debugging in CI](#debugging-in-ci))
+
+5. **Apply fix and verify**
+   - Fix the root cause (not just symptoms)
+   - Run multiple times to confirm stability: `--repeat-each=10`
+   - Check related tests aren't affected
+
+## Related References
+
+- **Flaky tests**: See [flaky-tests.md](flaky-tests.md) for comprehensive flaky test guide
+- **Locator issues**: See [locators.md](../core/locators.md) for selector strategies
+- **Waiting problems**: See [assertions-waiting.md](../core/assertions-waiting.md) for waiting patterns
+- **Test isolation**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for fixtures and isolation
+- **CI issues**: See [ci-cd.md](../infrastructure-ci-cd/ci-cd.md) for CI configuration
diff --git a/.cursor/skills/playwright-testing/debugging/error-testing.md b/.cursor/skills/playwright-testing/debugging/error-testing.md
new file mode 100644
index 0000000000..43b17b6b7f
--- /dev/null
+++ b/.cursor/skills/playwright-testing/debugging/error-testing.md
@@ -0,0 +1,360 @@
+# Error & Edge Case Testing
+
+## Table of Contents
+
+1. [Error Boundaries](#error-boundaries)
+2. [Network Failures](#network-failures)
+3. [Offline Testing](#offline-testing)
+4. [Loading States](#loading-states)
+5. [Form Validation](#form-validation)
+
+## Error Boundaries
+
+### Test Component Errors
+
+```typescript
+test("error boundary catches component error", async ({ page }) => {
+  // Trigger error via mock
+  await page.route("**/api/user", (route) => {
+    route.fulfill({
+      json: null, // Will cause component to throw
+    });
+  });
+
+  await page.goto("/profile");
+
+  // Error boundary should render fallback
+  await expect(page.getByText("Something went wrong")).toBeVisible();
+  await expect(page.getByRole("button", { name: "Try Again" })).toBeVisible();
+});
+```
+
+### Test Error Recovery
+
+```typescript
+test("recover from error state", async ({ page }) => {
+  let requestCount = 0;
+
+  await page.route("**/api/data", (route) => {
+    requestCount++;
+    if (requestCount === 1) {
+      return route.fulfill({ status: 500 });
+    }
+    return route.fulfill({
+      json: { data: "success" },
+    });
+  });
+
+  await page.goto("/dashboard");
+
+  // Error state
+  await expect(page.getByText("Failed to load")).toBeVisible();
+
+  // Retry
+  await page.getByRole("button", { name: "Retry" }).click();
+
+  // Success state
+  await expect(page.getByText("success")).toBeVisible();
+});
+```
+
+### Test JavaScript Errors
+
+```typescript
+test("handles runtime error gracefully", async ({ page }) => {
+  const errors: string[] = [];
+
+  page.on("pageerror", (error) => {
+    errors.push(error.message);
+  });
+
+  await page.goto("/buggy-page");
+
+  // App should still be functional despite error
+  await expect(page.getByRole("navigation")).toBeVisible();
+
+  // Error was logged
+  expect(errors.length).toBeGreaterThan(0);
+});
+```
+
+## Network Failures
+
+### Test API Errors
+
+```typescript
+test.describe("API error handling", () => {
+  const errorCodes = [400, 401, 403, 404, 500, 502, 503];
+
+  for (const status of errorCodes) {
+    test(`handles ${status} error`, async ({ page }) => {
+      await page.route("**/api/data", (route) =>
+        route.fulfill({
+          status,
+          json: { error: `Error ${status}` },
+        }),
+      );
+
+      await page.goto("/dashboard");
+
+      // Appropriate error message shown
+      await expect(page.getByRole("alert")).toBeVisible();
+    });
+  }
+});
+```
+
+### Test Timeout
+
+```typescript
+test("handles request timeout", async ({ page }) => {
+  await page.route("**/api/slow", async (route) => {
+    // Never respond - simulates timeout
+    await new Promise(() => {});
+  });
+
+  await page.goto("/slow-page");
+
+  // Should show timeout message (app should have its own timeout)
+  await expect(page.getByText("Request timed out")).toBeVisible({
+    timeout: 15000,
+  });
+});
+```
+
+### Test Connection Reset
+
+```typescript
+test("handles connection failure", async ({ page }) => {
+  await page.route("**/api/data", (route) => {
+    route.abort("connectionfailed");
+  });
+
+  await page.goto("/dashboard");
+
+  await expect(page.getByText("Connection failed")).toBeVisible();
+  await expect(page.getByRole("button", { name: "Retry" })).toBeVisible();
+});
+```
+
+### Test Mid-Request Failure
+
+```typescript
+test("handles failure during request", async ({ page }) => {
+  let requestStarted = false;
+
+  await page.route("**/api/upload", async (route) => {
+    requestStarted = true;
+    // Abort after small delay (mid-request)
+    await new Promise((resolve) => setTimeout(resolve, 500));
+    route.abort("failed");
+  });
+
+  await page.goto("/upload");
+  await page.getByLabel("File").setInputFiles("./fixtures/large-file.pdf");
+  await page.getByRole("button", { name: "Upload" }).click();
+
+  // Should show failure, not hang
+  await expect(page.getByText("Upload failed")).toBeVisible();
+  expect(requestStarted).toBe(true);
+});
+```
+
+## Offline Testing
+
+This section covers **unexpected network failures** and error recovery. For **offline-first apps (PWAs)** with service workers, caching, and background sync, see [service-workers.md](service-workers.md#offline-testing).
+
+### Go Offline During Session
+
+```typescript
+test("handles going offline", async ({ page, context }) => {
+  await page.goto("/dashboard");
+  await expect(page.getByTestId("data")).toBeVisible();
+
+  // Go offline unexpectedly
+  await context.setOffline(true);
+
+  // Try to refresh data
+  await page.getByRole("button", { name: "Refresh" }).click();
+
+  // Should show offline indicator
+  await expect(page.getByText("You're offline")).toBeVisible();
+
+  // Go back online
+  await context.setOffline(false);
+
+  // Should recover
+  await page.getByRole("button", { name: "Refresh" }).click();
+  await expect(page.getByText("You're offline")).toBeHidden();
+});
+```
+
+### Test Network Recovery
+
+```typescript
+test("recovers gracefully when connection returns", async ({
+  page,
+  context,
+}) => {
+  await page.goto("/dashboard");
+
+  // Simulate connection drop
+  await context.setOffline(true);
+
+  // App should show degraded state
+  await expect(page.getByRole("alert")).toContainText(/offline|connection/i);
+
+  // Connection restored
+  await context.setOffline(false);
+
+  // Retry should work
+  await page.getByRole("button", { name: "Retry" }).click();
+  await expect(page.getByTestId("data")).toBeVisible();
+});
+```
+
+## Loading States
+
+### Test Skeleton Loaders
+
+```typescript
+test("shows skeleton during load", async ({ page }) => {
+  // Add delay to API response
+  await page.route("**/api/posts", async (route) => {
+    await new Promise((resolve) => setTimeout(resolve, 1000));
+    route.fulfill({
+      json: [{ id: 1, title: "Post 1" }],
+    });
+  });
+
+  await page.goto("/posts");
+
+  // Skeleton should appear immediately
+  await expect(page.getByTestId("skeleton")).toBeVisible();
+
+  // Then content replaces skeleton
+  await expect(page.getByText("Post 1")).toBeVisible();
+  await expect(page.getByTestId("skeleton")).toBeHidden();
+});
+```
+
+### Test Loading Indicators
+
+```typescript
+test("shows loading state for actions", async ({ page }) => {
+  await page.route("**/api/save", async (route) => {
+    await new Promise((resolve) => setTimeout(resolve, 500));
+    route.fulfill({ json: { success: true } });
+  });
+
+  await page.goto("/editor");
+  await page.getByLabel("Content").fill("New content");
+
+  const saveButton = page.getByRole("button", { name: "Save" });
+  await saveButton.click();
+
+  // Button should show loading state
+  await expect(saveButton).toBeDisabled();
+  await expect(page.getByTestId("spinner")).toBeVisible();
+
+  // Then success state
+  await expect(saveButton).toBeEnabled();
+  await expect(page.getByText("Saved")).toBeVisible();
+});
+```
+
+### Test Empty States
+
+```typescript
+test("shows empty state when no data", async ({ page }) => {
+  await page.route("**/api/items", (route) => route.fulfill({ json: [] }));
+
+  await page.goto("/items");
+
+  await expect(page.getByText("No items yet")).toBeVisible();
+  await expect(
+    page.getByRole("button", { name: "Create First Item" }),
+  ).toBeVisible();
+});
+```
+
+## Form Validation
+
+### Test Client-Side Validation
+
+```typescript
+test("validates required fields", async ({ page }) => {
+  await page.goto("/signup");
+
+  // Submit empty form
+  await page.getByRole("button", { name: "Sign Up" }).click();
+
+  // Should show validation errors
+  await expect(page.getByText("Email is required")).toBeVisible();
+  await expect(page.getByText("Password is required")).toBeVisible();
+
+  // Form should not submit
+  await expect(page).toHaveURL("/signup");
+});
+```
+
+### Test Format Validation
+
+```typescript
+test("validates email format", async ({ page }) => {
+  await page.goto("/signup");
+
+  await page.getByLabel("Email").fill("invalid-email");
+  await page.getByLabel("Email").blur();
+
+  await expect(page.getByText("Invalid email address")).toBeVisible();
+
+  // Fix the error
+  await page.getByLabel("Email").fill("valid@email.com");
+  await page.getByLabel("Email").blur();
+
+  await expect(page.getByText("Invalid email address")).toBeHidden();
+});
+```
+
+### Test Server-Side Validation
+
+```typescript
+test("handles server validation errors", async ({ page }) => {
+  await page.route("**/api/register", (route) =>
+    route.fulfill({
+      status: 422,
+      json: {
+        errors: {
+          email: "Email already exists",
+          username: "Username is taken",
+        },
+      },
+    }),
+  );
+
+  await page.goto("/signup");
+  await page.getByLabel("Email").fill("taken@email.com");
+  await page.getByLabel("Username").fill("takenuser");
+  await page.getByLabel("Password").fill("password123");
+  await page.getByRole("button", { name: "Sign Up" }).click();
+
+  // Server errors should display
+  await expect(page.getByText("Email already exists")).toBeVisible();
+  await expect(page.getByText("Username is taken")).toBeVisible();
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern             | Problem                        | Solution                               |
+| ------------------------ | ------------------------------ | -------------------------------------- |
+| Only testing happy path  | Misses error handling bugs     | Test all error scenarios               |
+| No network failure tests | App crashes on poor connection | Test offline/slow/failed requests      |
+| Skipping loading states  | Janky UX not caught            | Assert loading UI appears              |
+| Ignoring validation      | Form bugs slip through         | Test both client and server validation |
+
+## Related References
+
+- **Network Mocking**: See [network-advanced.md](../advanced/network-advanced.md) for mock patterns
+- **Assertions**: See [assertions-waiting.md](../core/assertions-waiting.md) for error assertions
diff --git a/.cursor/skills/playwright-testing/debugging/flaky-tests.md b/.cursor/skills/playwright-testing/debugging/flaky-tests.md
new file mode 100644
index 0000000000..d081f21168
--- /dev/null
+++ b/.cursor/skills/playwright-testing/debugging/flaky-tests.md
@@ -0,0 +1,496 @@
+# Debugging and Managing Flaky Tests
+
+## Table of Contents
+
+1. [Understanding Flakiness Types](#understanding-flakiness-types)
+2. [Detection and Reproduction](#detection-and-reproduction)
+3. [Root Cause Analysis](#root-cause-analysis)
+4. [Fixing Strategies by Type](#fixing-strategies-by-type)
+5. [CI-Specific Flakiness](#ci-specific-flakiness)
+6. [Quarantine and Management](#quarantine-and-management)
+7. [Prevention Strategies](#prevention-strategies)
+
+## Understanding Flakiness Types
+
+### Categories of Flakiness
+
+Most flaky tests fall into distinct categories requiring different remediation:
+
+| Category                    | Symptoms                        | Common Causes                                          |
+| --------------------------- | ------------------------------- | ------------------------------------------------------ |
+| **UI-driven**               | Element not found, click missed | Missing waits, animations, dynamic rendering           |
+| **Environment-driven**      | CI-only failures                | Slower CPU, memory limits, cold browser starts         |
+| **Data/parallelism-driven** | Fails with multiple workers     | Shared backend data, reused accounts, state collisions |
+| **Test-suite-driven**       | Fails when run with other tests | Leaked state, shared fixtures, order dependencies      |
+
+### Flakiness Decision Tree
+
+```
+Test fails intermittently
+├─ Fails locally too?
+│  ├─ YES → Timing/async issue → Check waits and assertions
+│  └─ NO → CI-specific → Check environment differences
+│
+├─ Fails only with multiple workers?
+│  └─ YES → Parallelism issue → Check data isolation
+│
+├─ Fails only when run after specific tests?
+│  └─ YES → State leak → Check fixtures and cleanup
+│
+└─ Fails randomly regardless of conditions?
+   └─ External dependency → Check network/API stability
+```
+
+## Detection and Reproduction
+
+### Confirming Flakiness
+
+```bash
+# Run test multiple times to confirm instability
+npx playwright test tests/checkout.spec.ts --repeat-each=20
+
+# Run with single worker to isolate parallelism issues
+npx playwright test --workers=1
+
+# Run in CI-like conditions locally
+CI=true npx playwright test --repeat-each=10
+```
+
+### Reproduction Strategies
+
+```typescript
+// playwright.config.ts - Enable artifacts for flaky test investigation
+export default defineConfig({
+  retries: process.env.CI ? 2 : 0,
+  use: {
+    trace: "on-first-retry", // Capture trace on retry
+    video: "retain-on-failure",
+    screenshot: "only-on-failure",
+  },
+});
+```
+
+### Identify Flaky Tests Programmatically
+
+```typescript
+// Track test results across runs
+test.afterEach(async ({}, testInfo) => {
+  if (testInfo.retry > 0 && testInfo.status === "passed") {
+    console.warn(`FLAKY: ${testInfo.title} passed on retry ${testInfo.retry}`);
+    // Log to your tracking system
+  }
+});
+```
+
+## Root Cause Analysis
+
+### Event Logging for Race Conditions
+
+Add comprehensive event logging to expose timing issues:
+
+```typescript
+test.beforeEach(async ({ page }) => {
+  page.on("console", (msg) =>
+    console.log(`CONSOLE [${msg.type()}]:`, msg.text()),
+  );
+  page.on("pageerror", (err) => console.error("PAGE ERROR:", err.message));
+  page.on("requestfailed", (req) =>
+    console.error(`REQUEST FAILED: ${req.url()}`),
+  );
+});
+```
+
+> **For comprehensive console error handling** (fail on errors, allowed patterns, fixtures), see [console-errors.md](console-errors.md).
+
+### Network Timing Analysis
+
+```typescript
+// Capture slow or failed requests
+test.beforeEach(async ({ page }) => {
+  const slowRequests: string[] = [];
+
+  page.on("requestfinished", (request) => {
+    const timing = request.timing();
+    const duration = timing.responseEnd - timing.requestStart;
+    if (duration > 2000) {
+      slowRequests.push(`${request.url()} took ${duration}ms`);
+    }
+  });
+
+  page.on("requestfailed", (request) => {
+    console.error(`Failed: ${request.url()} - ${request.failure()?.errorText}`);
+  });
+});
+```
+
+### Trace Analysis
+
+```bash
+# View trace from failed CI run
+npx playwright show-trace path/to/trace.zip
+
+# Generate trace for specific test
+npx playwright test tests/flaky.spec.ts --trace on
+```
+
+## Fixing Strategies by Type
+
+### UI-Driven Flakiness
+
+**Problem: Element not ready when action executes**
+
+```typescript
+// ❌ BAD: No wait for element state
+await page.click("#submit");
+await page.fill("#username", "test"); // Element may not be ready
+
+// ✅ GOOD: Actions + assertions pattern (auto-waiting built-in)
+await page.getByRole("button", { name: "Submit" }).click();
+await expect(page.getByRole("heading", { name: "Dashboard" })).toBeVisible();
+```
+
+**Problem: Animations or transitions interfere**
+
+```typescript
+// ❌ BAD: Click during animation
+await page.click(".menu-item");
+
+// ✅ GOOD: Wait for animation to complete
+await page.getByRole("menuitem", { name: "Settings" }).click();
+await expect(page.getByRole("dialog")).toBeVisible();
+// Or disable animations entirely
+await page.emulateMedia({ reducedMotion: "reduce" });
+```
+
+**Problem: Brittle selectors**
+
+```typescript
+// ❌ BAD: Fragile CSS chain
+await page.click("div.container > div:nth-child(2) > button.btn-primary");
+
+// ✅ GOOD: Semantic selectors
+await page.getByRole("button", { name: "Continue" }).click();
+await page.getByTestId("checkout-button").click();
+await page.getByLabel("Email address").fill("test@example.com");
+```
+
+### Async/Timing Flakiness
+
+**Problem: Race between test and application**
+
+```typescript
+// ❌ BAD: Arbitrary sleep
+await page.click("#load-data");
+await page.waitForTimeout(3000); // Hope data loads in 3s
+
+// ✅ GOOD: Wait for specific condition
+await page.click("#load-data");
+await expect(page.locator(".data-row")).toHaveCount(10, { timeout: 10000 });
+
+// ✅ BETTER: Wait for network response, then assert
+const responsePromise = page.waitForResponse(
+  (r) =>
+    r.url().includes("/api/data") &&
+    r.request().method() === "GET" &&
+    r.ok(),
+);
+await page.click("#load-data");
+await responsePromise;
+await expect(page.locator(".data-row")).toHaveCount(10);
+```
+
+> **For comprehensive waiting strategies** (navigation, element state, network, polling with `toPass()`), see [assertions-waiting.md](assertions-waiting.md#waiting-strategies).
+
+**Problem: Complex async state**
+
+```typescript
+// Custom wait for application-specific conditions
+await page.waitForFunction(() => {
+  const app = (window as any).__APP_STATE__;
+  return app?.isReady && !app?.isLoading;
+});
+
+// Wait for multiple conditions
+await Promise.all([
+  page.waitForResponse("**/api/user"),
+  page.waitForResponse("**/api/settings"),
+  page.getByRole("button", { name: "Load" }).click(),
+]);
+```
+
+### Data/Parallelism-Driven Flakiness
+
+**Problem: Tests share backend data**
+
+```typescript
+// ❌ BAD: All workers use same user
+const testUser = { email: "test@example.com", password: "pass123" };
+
+// ✅ GOOD: Unique data per worker
+import { test as base } from "@playwright/test";
+
+export const test = base.extend<
+  {},
+  { testUser: { email: string; id: string } }
+>({
+  testUser: [
+    async ({}, use, workerInfo) => {
+      const email = `test-${workerInfo.workerIndex}-${Date.now()}@example.com`;
+      const user = await createTestUser(email);
+      await use(user);
+      await deleteTestUser(user.id);
+    },
+    { scope: "worker" },
+  ],
+});
+```
+
+**Problem: Shared storageState across workers**
+
+```typescript
+// ❌ BAD: All workers share same auth state
+use: {
+  storageState: '.auth/user.json',
+}
+
+// ✅ GOOD: Per-worker auth state
+export const test = base.extend<{}, { workerStorageState: string }>({
+  workerStorageState: [
+    async ({ browser }, use, workerInfo) => {
+      const id = workerInfo.workerIndex;
+      const fileName = `.auth/user-${id}.json`;
+
+      if (!fs.existsSync(fileName)) {
+        const page = await browser.newPage({ storageState: undefined });
+        await authenticateUser(page, `worker${id}@test.com`);
+        await page.context().storageState({ path: fileName });
+        await page.close();
+      }
+
+      await use(fileName);
+    },
+    { scope: "worker" },
+  ],
+});
+```
+
+### Test-Suite-Driven Flakiness (State Leaks)
+
+**Problem: Tests affect each other**
+
+```typescript
+// ❌ BAD: Module-level state persists across tests
+let sharedPage: Page;
+
+test.beforeAll(async ({ browser }) => {
+  sharedPage = await browser.newPage(); // Shared across tests!
+});
+
+// ✅ GOOD: Use Playwright's default isolation (fresh context per test)
+test("first test", async ({ page }) => {
+  // Fresh page for this test
+});
+
+test("second test", async ({ page }) => {
+  // Fresh page for this test
+});
+```
+
+**Problem: Fixture cleanup not happening**
+
+```typescript
+// ✅ GOOD: Proper fixture with cleanup
+export const test = base.extend<{ tempFile: string }>({
+  tempFile: async ({}, use) => {
+    const file = `/tmp/test-${Date.now()}.json`;
+    fs.writeFileSync(file, "{}");
+
+    await use(file);
+
+    // Cleanup always runs, even on failure
+    if (fs.existsSync(file)) {
+      fs.unlinkSync(file);
+    }
+  },
+});
+```
+
+## CI-Specific Flakiness
+
+### Why Tests Fail Only in CI
+
+| CI Condition       | Impact                                | Solution                                             |
+| ------------------ | ------------------------------------- | ---------------------------------------------------- |
+| Slower CPU         | Actions complete later than expected  | Use auto-waiting, not timeouts                       |
+| Cold browser start | No cached assets, slower initial load | Add explicit waits for first navigation              |
+| Headless mode      | Different rendering behavior          | Test locally in headless mode                        |
+| Shared runners     | Resource contention                   | Reduce parallelism or use dedicated runners          |
+| Network latency    | API calls slower                      | Mock external APIs, increase timeouts for real calls |
+
+### Simulating CI Locally
+
+```bash
+# Run headless with CI environment variable
+CI=true npx playwright test
+
+# Limit CPU (Linux/Mac)
+cpulimit -l 50 -- npx playwright test
+
+# Run in Docker matching CI environment
+docker run -it --rm \
+  -v $(pwd):/work \
+  -w /work \
+  mcr.microsoft.com/playwright:v1.40.0-jammy \
+  npx playwright test
+```
+
+### Consistent Viewport and Scale
+
+```typescript
+// playwright.config.ts - Match CI rendering exactly
+export default defineConfig({
+  use: {
+    viewport: { width: 1280, height: 720 },
+    deviceScaleFactor: 1,
+  },
+});
+```
+
+### Network Stubbing for External APIs
+
+```typescript
+// Eliminate external API flakiness
+test.beforeEach(async ({ page }) => {
+  // Stub unstable third-party APIs
+  await page.route("**/api.analytics.com/**", (route) =>
+    route.fulfill({ body: "" }),
+  );
+  await page.route("**/api.payment-provider.com/**", (route) =>
+    route.fulfill({ json: { status: "ok" } }),
+  );
+});
+
+// Test-specific stub
+test("checkout with payment", async ({ page }) => {
+  await page.route("**/api/payment", (route) =>
+    route.fulfill({ json: { success: true, transactionId: "test-123" } }),
+  );
+  // Test proceeds with deterministic response
+});
+```
+
+## Quarantine and Management
+
+### Quarantine Pattern
+
+```typescript
+// playwright.config.ts - Separate flaky tests
+export default defineConfig({
+  projects: [
+    {
+      name: "stable",
+      testIgnore: ["**/*.flaky.spec.ts"],
+    },
+    {
+      name: "quarantine",
+      testMatch: ["**/*.flaky.spec.ts"],
+      retries: 3,
+    },
+  ],
+});
+```
+
+### Annotation-Based Quarantine
+
+```typescript
+// Mark flaky tests with annotations
+test("intermittent checkout issue", async ({ page }, testInfo) => {
+  testInfo.annotations.push({
+    type: "flaky",
+    description: "Investigating payment API timing - JIRA-1234",
+  });
+
+  // Test implementation
+});
+
+// Skip flaky test conditionally
+test("known CI flaky", async ({ page }) => {
+  test.skip(!!process.env.CI, "Flaky in CI - investigating JIRA-5678");
+  // Test implementation
+});
+```
+
+## Prevention Strategies
+
+### Test Burn-In
+
+```bash
+# Run new tests many times before merging
+npx playwright test tests/new-feature.spec.ts --repeat-each=50
+
+# Run in parallel to expose race conditions
+npx playwright test tests/new-feature.spec.ts --repeat-each=20 --workers=4
+```
+
+### Isolation Checklist
+
+```typescript
+// ✅ Each test should be self-contained
+test.describe("User profile", () => {
+  test("can update name", async ({ page, testUser }) => {
+    // Uses unique testUser fixture
+    // No dependency on other tests
+    // Cleanup handled by fixture
+  });
+
+  test("can update email", async ({ page, testUser }) => {
+    // Independent of "can update name"
+    // Own testUser, own state
+  });
+});
+```
+
+### Defensive Assertions
+
+```typescript
+// ❌ BAD: Single point of failure
+await expect(page.locator(".items")).toHaveCount(5);
+
+// ✅ GOOD: Progressive assertions that help diagnose
+await expect(page.locator(".items-container")).toBeVisible();
+await expect(page.locator(".loading")).not.toBeVisible();
+await expect(page.locator(".items")).toHaveCount(5);
+```
+
+### Retry Budget
+
+```typescript
+// playwright.config.ts - Limit retries to avoid masking issues
+export default defineConfig({
+  retries: process.env.CI ? 2 : 0, // Only retry in CI
+  expect: {
+    timeout: 10000, // Reasonable assertion timeout
+  },
+  timeout: 60000, // Test timeout
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                              | Problem                             | Solution                                       |
+| ----------------------------------------- | ----------------------------------- | ---------------------------------------------- |
+| `waitForTimeout()` as primary wait        | Arbitrary, hides real timing issues | Use auto-waiting assertions                    |
+| Increasing global timeout to "fix" flakes | Masks root cause, slows all tests   | Find and fix actual timing issue               |
+| Retrying until pass                       | Hides systemic problems             | Fix root cause, use retries for diagnosis only |
+| Shared test data across workers           | Race conditions, collisions         | Isolate data per worker                        |
+| Testing real external APIs                | Network variability                 | Mock external dependencies                     |
+| Module-level mutable state                | Leaks between tests                 | Use fixtures with proper cleanup               |
+| Ignoring flaky tests                      | Problem compounds over time         | Quarantine and track for fixing                |
+
+## Related References
+
+- **Debugging**: See [debugging.md](debugging.md) for trace viewer and inspector
+- **Fixtures**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for worker-scoped isolation
+- **Performance**: See [performance.md](../infrastructure-ci-cd/performance.md) for parallel execution patterns
+- **Assertions**: See [assertions-waiting.md](../core/assertions-waiting.md) for auto-waiting patterns
+- **Global Setup**: See [global-setup.md](../core/global-setup.md) for setup vs fixtures decision
diff --git a/.cursor/skills/playwright-testing/frameworks/nextjs.md b/.cursor/skills/playwright-testing/frameworks/nextjs.md
new file mode 100644
index 0000000000..c16a9e9dfa
--- /dev/null
+++ b/.cursor/skills/playwright-testing/frameworks/nextjs.md
@@ -0,0 +1,469 @@
+# Next.js Testing Patterns
+
+## Table of Contents
+
+1. [Setup](#setup)
+2. [App Router Patterns](#app-router-patterns)
+3. [Pages Router Patterns](#pages-router-patterns)
+4. [Dynamic Routes](#dynamic-routes)
+5. [API Routes](#api-routes)
+6. [Middleware Testing](#middleware-testing)
+7. [Hydration Testing](#hydration-testing)
+8. [next/image Testing](#nextimage-testing)
+9. [NextAuth.js Authentication](#nextauthjs-authentication)
+10. [Tips](#tips)
+11. [Anti-Patterns](#anti-patterns)
+12. [Related](#related)
+
+> **When to use**: Testing Next.js applications with App Router, Pages Router, API routes, middleware, SSR, dynamic routes, and server components.
+> **Prerequisites**: [configuration.md](../core/configuration.md), [locators.md](../core/locators.md)
+
+## Setup
+
+### Configuration with webServer
+
+```typescript
+// playwright.config.ts
+import { defineConfig, devices } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './tests',
+  fullyParallel: true,
+  forbidOnly: !!process.env.CI,
+  retries: process.env.CI ? 2 : 0,
+  workers: process.env.CI ? '50%' : undefined,
+
+  use: {
+    baseURL: 'http://localhost:3000',
+    trace: 'on-first-retry',
+    screenshot: 'only-on-failure',
+  },
+
+  projects: [
+    { name: 'chromium', use: { ...devices['Desktop Chrome'] } },
+    { name: 'mobile', use: { ...devices['iPhone 14'] } },
+  ],
+
+  webServer: {
+    command: process.env.CI
+      ? 'npm run build && npm run start'
+      : 'npm run dev',
+    url: 'http://localhost:3000',
+    reuseExistingServer: !process.env.CI,
+    timeout: 120_000,
+    env: {
+      NODE_ENV: process.env.CI ? 'production' : 'test',
+    },
+  },
+});
+```
+
+### Environment Variables
+
+Next.js loads `.env.test` when `NODE_ENV=test`:
+
+```bash
+# .env.test (commit this)
+NEXT_PUBLIC_API_URL=http://localhost:3000/api
+DATABASE_URL=postgresql://localhost:5432/test_db
+
+# .env.test.local (gitignored)
+NEXTAUTH_SECRET=test-secret-local
+```
+
+## App Router Patterns
+
+### Server Component Content
+
+```typescript
+test('renders server component content', async ({ page }) => {
+  await page.goto('/');
+
+  await expect(page.getByRole('heading', { name: 'Welcome', level: 1 })).toBeVisible();
+  await expect(page.getByRole('navigation', { name: 'Main' })).toBeVisible();
+});
+```
+
+### Loading States with Streaming
+
+```typescript
+test('loading state during data streaming', async ({ page }) => {
+  await page.route('**/api/stats', async (route) => {
+    await new Promise((r) => setTimeout(r, 2000));
+    await route.continue();
+  });
+
+  await page.goto('/dashboard');
+
+  await expect(page.getByRole('progressbar')).toBeVisible();
+  await expect(page.getByRole('heading', { name: 'Dashboard' })).toBeVisible();
+  await expect(page.getByRole('progressbar')).toBeHidden();
+});
+```
+
+### Nested Layouts
+
+```typescript
+test('layouts persist across navigation', async ({ page }) => {
+  await page.goto('/dashboard/analytics');
+
+  const sidebar = page.getByRole('navigation', { name: 'Dashboard' });
+  await expect(sidebar).toBeVisible();
+
+  await sidebar.getByRole('link', { name: 'Settings' }).click();
+  await page.waitForURL('/dashboard/settings');
+
+  await expect(sidebar).toBeVisible();
+  await expect(page.getByRole('heading', { name: 'Settings' })).toBeVisible();
+});
+```
+
+## Pages Router Patterns
+
+### SSR with getServerSideProps
+
+```typescript
+test('page with getServerSideProps renders data', async ({ page }) => {
+  await page.goto('/blog');
+
+  await expect(page.getByRole('heading', { name: 'Blog', level: 1 })).toBeVisible();
+  await expect(page.getByRole('article')).toHaveCount(10);
+  await expect(page.getByRole('article').first()).toContainText(/\w+/);
+});
+```
+
+### Static Generation with getStaticProps
+
+```typescript
+test('static page shows pre-rendered content', async ({ page }) => {
+  await page.goto('/about');
+
+  await expect(page.getByRole('heading', { name: 'About Us' })).toBeVisible();
+  await expect(page.getByText('Founded in 2020')).toBeVisible();
+});
+```
+
+## Dynamic Routes
+
+### Slug Parameters
+
+```typescript
+test('dynamic [slug] renders correct content', async ({ page }) => {
+  await page.goto('/blog/testing-guide');
+
+  await expect(page.getByRole('heading', { level: 1 })).toContainText('Testing Guide');
+  await expect(page.getByText('Page not found')).toBeHidden();
+});
+
+test('non-existent slug shows 404', async ({ page }) => {
+  const response = await page.goto('/blog/nonexistent-post');
+
+  expect(response?.status()).toBe(404);
+  await expect(page.getByRole('heading', { name: '404' })).toBeVisible();
+});
+```
+
+### Catch-All Routes
+
+```typescript
+test('catch-all handles nested paths', async ({ page }) => {
+  await page.goto('/docs/getting-started/installation');
+  await expect(page.getByRole('heading', { name: 'Installation' })).toBeVisible();
+
+  await page.goto('/docs/api/configuration');
+  await expect(page.getByRole('heading', { name: 'Configuration' })).toBeVisible();
+});
+```
+
+### Query Parameters
+
+```typescript
+test('query parameters filter content', async ({ page }) => {
+  await page.goto('/products?category=electronics&sort=price-asc');
+
+  await expect(page.getByRole('heading', { name: 'Electronics' })).toBeVisible();
+
+  const prices = await page.getByTestId('product-price').allTextContents();
+  const numericPrices = prices.map((p) => parseFloat(p.replace('$', '')));
+  expect(numericPrices).toEqual([...numericPrices].sort((a, b) => a - b));
+});
+```
+
+## API Routes
+
+### Direct API Testing
+
+```typescript
+test('GET /api/products returns list', async ({ request }) => {
+  const response = await request.get('/api/products');
+
+  expect(response.ok()).toBeTruthy();
+  const body = await response.json();
+  expect(body.products).toBeInstanceOf(Array);
+  expect(body.products[0]).toHaveProperty('id');
+  expect(body.products[0]).toHaveProperty('name');
+});
+
+test('POST /api/products creates item', async ({ request }) => {
+  const response = await request.post('/api/products', {
+    data: { name: 'Test Product', price: 29.99 },
+  });
+
+  expect(response.status()).toBe(201);
+  const body = await response.json();
+  expect(body.product.name).toBe('Test Product');
+});
+
+test('POST /api/products validates fields', async ({ request }) => {
+  const response = await request.post('/api/products', {
+    data: { name: '' },
+  });
+
+  expect(response.status()).toBe(400);
+  const body = await response.json();
+  expect(body.error).toContainEqual(expect.objectContaining({ field: 'price' }));
+});
+```
+
+### API Through UI
+
+```typescript
+test('form submission calls API', async ({ page }) => {
+  await page.goto('/products/new');
+
+  await page.getByLabel('Product name').fill('Widget');
+  await page.getByLabel('Price').fill('19.99');
+  await page.getByRole('button', { name: 'Create product' }).click();
+
+  await expect(page.getByText('Product created successfully')).toBeVisible();
+  await page.waitForURL('/products/**');
+});
+```
+
+## Middleware Testing
+
+### Auth Redirects
+
+```typescript
+test('unauthenticated user redirected to login', async ({ page }) => {
+  await page.goto('/dashboard');
+
+  expect(page.url()).toContain('/login');
+  await expect(page.getByRole('heading', { name: 'Sign in' })).toBeVisible();
+});
+
+test('redirect preserves return URL', async ({ page }) => {
+  await page.goto('/dashboard/settings');
+
+  const url = new URL(page.url());
+  expect(url.pathname).toBe('/login');
+  expect(url.searchParams.get('callbackUrl') || url.searchParams.get('returnTo'))
+    .toContain('/dashboard/settings');
+});
+```
+
+### Security Headers
+
+```typescript
+test('middleware sets security headers', async ({ page }) => {
+  const response = await page.goto('/');
+
+  const headers = response!.headers();
+  expect(headers['x-frame-options']).toBe('DENY');
+  expect(headers['x-content-type-options']).toBe('nosniff');
+});
+```
+
+### Locale Rewrites
+
+```typescript
+test('middleware rewrites based on locale', async ({ page, context }) => {
+  await context.setExtraHTTPHeaders({
+    'Accept-Language': 'fr-FR,fr;q=0.9',
+  });
+
+  await page.goto('/');
+
+  await expect(page.getByText('Bienvenue')).toBeVisible();
+});
+```
+
+## Hydration Testing
+
+### Console Error Detection
+
+```typescript
+test('no hydration errors in console', async ({ page }) => {
+  const consoleErrors: string[] = [];
+  page.on('console', (msg) => {
+    if (msg.type() === 'error') {
+      consoleErrors.push(msg.text());
+    }
+  });
+
+  await page.goto('/');
+  await page.getByRole('button', { name: 'Get started' }).click();
+
+  const hydrationErrors = consoleErrors.filter(
+    (e) =>
+      e.includes('Hydration') ||
+      e.includes('hydration') ||
+      e.includes('did not match')
+  );
+  expect(hydrationErrors).toEqual([]);
+});
+```
+
+### Interactive Elements After Hydration
+
+```typescript
+test('interactive elements work after hydration', async ({ page }) => {
+  await page.goto('/');
+
+  const counter = page.getByTestId('counter-value');
+  await expect(counter).toHaveText('0');
+
+  await page.getByRole('button', { name: 'Increment' }).click();
+  await expect(counter).toHaveText('1');
+});
+```
+
+## next/image Testing
+
+```typescript
+test('hero image loads with srcset', async ({ page }) => {
+  await page.goto('/');
+
+  const heroImage = page.getByRole('img', { name: 'Hero banner' });
+  await expect(heroImage).toBeVisible();
+
+  const srcset = await heroImage.getAttribute('srcset');
+  expect(srcset).toBeTruthy();
+  expect(srcset).toContain('w=');
+
+  const loading = await heroImage.getAttribute('loading');
+  expect(loading).not.toBe('lazy');
+});
+
+test('offscreen images lazy load', async ({ page }) => {
+  await page.goto('/gallery');
+
+  const offscreenImage = page.getByRole('img', { name: 'Gallery item 20' });
+
+  await offscreenImage.scrollIntoViewIfNeeded();
+  await expect(offscreenImage).toBeVisible();
+
+  const naturalWidth = await offscreenImage.evaluate(
+    (img: HTMLImageElement) => img.naturalWidth
+  );
+  expect(naturalWidth).toBeGreaterThan(0);
+});
+```
+
+## NextAuth.js Authentication
+
+### Setup Project
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    { name: 'setup', testMatch: /auth\.setup\.ts/ },
+    {
+      name: 'authenticated',
+      use: { storageState: 'playwright/.auth/user.json' },
+      dependencies: ['setup'],
+    },
+    { name: 'unauthenticated', testMatch: '**/*.unauth.spec.ts' },
+  ],
+});
+```
+
+### Auth Setup
+
+```typescript
+// tests/auth.setup.ts
+import { test as setup, expect } from '@playwright/test';
+
+const authFile = 'playwright/.auth/user.json';
+
+setup('authenticate via credentials', async ({ page }) => {
+  await page.goto('/login');
+  await page.getByLabel('Email').fill('test@example.com');
+  await page.getByLabel('Password').fill(process.env.TEST_PASSWORD!);
+  await page.getByRole('button', { name: 'Sign in' }).click();
+
+  await page.waitForURL('/dashboard');
+  await expect(page.getByRole('heading', { name: 'Dashboard' })).toBeVisible();
+
+  await page.context().storageState({ path: authFile });
+});
+```
+
+### Authenticated Tests
+
+```typescript
+test('authenticated user sees dashboard', async ({ page }) => {
+  await page.goto('/dashboard');
+
+  await expect(page.getByRole('heading', { name: 'Dashboard' })).toBeVisible();
+  await expect(page.getByText('test@example.com')).toBeVisible();
+});
+```
+
+## Tips
+
+### Dev Server vs Production Build
+
+| Scenario | Command | Trade-off |
+|---|---|---|
+| Local development | `npm run dev` | Fast iteration, no production behavior |
+| CI pipeline | `npm run build && npm run start` | Tests real production bundle |
+
+### Turbopack
+
+```typescript
+webServer: {
+  command: process.env.CI
+    ? 'npm run build && npm run start'
+    : 'npx next dev --turbopack',
+  url: 'http://localhost:3000',
+  reuseExistingServer: !process.env.CI,
+},
+```
+
+### Multiple webServer Entries
+
+```typescript
+webServer: [
+  {
+    command: 'npm run dev:api',
+    url: 'http://localhost:4000/health',
+    reuseExistingServer: !process.env.CI,
+  },
+  {
+    command: 'npm run dev',
+    url: 'http://localhost:3000',
+    reuseExistingServer: !process.env.CI,
+  },
+],
+```
+
+## Anti-Patterns
+
+| Don't Do This | Problem | Do This Instead |
+|---|---|---|
+| `await page.waitForTimeout(3000)` | Arbitrary waits are fragile | `await page.waitForURL('/path')` or `await expect(locator).toBeVisible()` |
+| Test `getServerSideProps` directly | Depends on req/res context | Navigate to page and verify rendered output |
+| Mock your own API routes | Hides real API bugs | Let real API handle requests; mock only external services |
+| `page.goto('http://localhost:3000/path')` | Breaks when port changes | Use `page.goto('/path')` with `baseURL` |
+| Run `npm run build` locally for every test | Extremely slow | Use `npm run dev` locally with `reuseExistingServer: true` |
+| Test `next/image` by checking exact URLs | Paths change between dev/prod | Assert on `alt`, visibility, `naturalWidth > 0`, `srcset` |
+| Test server actions by calling as functions | Server actions need Next.js runtime | Trigger through UI (forms, buttons) |
+
+## Related
+
+- [configuration.md](../core/configuration.md) -- Playwright configuration including `webServer`
+- [authentication.md](../advanced/authentication.md) -- authentication setup and `storageState`
+- [api-testing.md](../testing-patterns/api-testing.md) -- testing API routes with `request` context
+- [react.md](react.md) -- React patterns for Next.js client components
diff --git a/.cursor/skills/playwright-testing/frameworks/react.md b/.cursor/skills/playwright-testing/frameworks/react.md
new file mode 100644
index 0000000000..20e3a1ae6e
--- /dev/null
+++ b/.cursor/skills/playwright-testing/frameworks/react.md
@@ -0,0 +1,531 @@
+# React Application Testing
+
+## Table of Contents
+
+1. [Patterns](#patterns)
+2. [Setup](#setup)
+3. [Framework Tips](#framework-tips)
+4. [Anti-Patterns](#anti-patterns)
+5. [Related](#related)
+
+> **When to use**: Testing React apps built with Vite, Create React App, or custom bundlers. Covers E2E testing, component testing, React Router navigation, form libraries, portals, error boundaries, and context/state verification.
+> **Prerequisites**: [configuration.md](../core/configuration.md), [locators.md](../core/locators.md)
+
+## Patterns
+
+### Testing Context and Global State
+
+**Use when**: Verifying React context (theme, auth, locale) and state management (Redux, Zustand) produce correct UI changes.
+**Avoid when**: You want to assert on raw state objects—test the UI, not internal state.
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test.describe('theme switching', () => {
+  test('toggle applies dark mode across pages', async ({ page }) => {
+    await page.goto('/preferences');
+
+    const root = page.locator('html');
+    await expect(root).not.toHaveClass(/dark-mode/);
+
+    await page.getByRole('switch', { name: 'Enable dark theme' }).click();
+    await expect(root).toHaveClass(/dark-mode/);
+
+    await page.getByRole('link', { name: 'Dashboard' }).click();
+    await expect(page.locator('html')).toHaveClass(/dark-mode/);
+  });
+});
+
+test.describe('cart state persistence', () => {
+  test('item count updates globally', async ({ page }) => {
+    await page.goto('/catalog');
+
+    const badge = page.getByTestId('cart-badge');
+
+    await page.getByRole('listitem')
+      .filter({ hasText: 'Wireless Headphones' })
+      .getByRole('button', { name: 'Add' })
+      .click();
+    await expect(badge).toHaveText('1');
+
+    await page.getByRole('link', { name: 'Contact' }).click();
+    await expect(badge).toHaveText('1');
+  });
+});
+
+test.describe('auth state', () => {
+  test('login updates header across components', async ({ page }) => {
+    await page.goto('/');
+
+    await expect(page.getByRole('link', { name: 'Login' })).toBeVisible();
+
+    await page.getByRole('link', { name: 'Login' }).click();
+    await page.getByLabel('Username').fill('testuser');
+    await page.getByLabel('Password').fill('secret123');
+    await page.getByRole('button', { name: 'Submit' }).click();
+
+    await expect(page.getByRole('link', { name: 'Login' })).toBeHidden();
+    await expect(page.getByText('testuser')).toBeVisible();
+  });
+});
+```
+
+### React Router Navigation
+
+**Use when**: Testing client-side routing with React Router v6+—route transitions, URL parameters, protected routes, browser history.
+**Avoid when**: Server-side routing (Next.js App Router—see [nextjs.md](nextjs.md)).
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test.describe('client routing', () => {
+  test('navigation preserves SPA state', async ({ page }) => {
+    await page.goto('/');
+
+    await page.evaluate(() => {
+      (window as any).__spaMarker = 'active';
+    });
+
+    await page.getByRole('link', { name: 'Inventory' }).click();
+    await page.waitForURL('/inventory');
+
+    const marker = await page.evaluate(() => (window as any).__spaMarker);
+    expect(marker).toBe('active');
+  });
+
+  test('query params filter content', async ({ page }) => {
+    await page.goto('/items?type=books');
+
+    await expect(page.getByRole('heading', { name: 'Books' })).toBeVisible();
+
+    await page.getByRole('link', { name: 'Music' }).click();
+    await page.waitForURL('/items?type=music');
+
+    await expect(page.getByRole('heading', { name: 'Music' })).toBeVisible();
+  });
+
+  test('nested routes render layouts', async ({ page }) => {
+    await page.goto('/account/security');
+
+    await expect(page.getByRole('heading', { name: 'Account' })).toBeVisible();
+    await expect(page.getByRole('heading', { name: 'Security', level: 2 })).toBeVisible();
+
+    await page.getByRole('link', { name: 'Privacy' }).click();
+    await page.waitForURL('/account/privacy');
+
+    await expect(page.getByRole('heading', { name: 'Account' })).toBeVisible();
+    await expect(page.getByRole('heading', { name: 'Privacy', level: 2 })).toBeVisible();
+  });
+
+  test('history navigation works', async ({ page }) => {
+    await page.goto('/');
+    await page.getByRole('link', { name: 'Inventory' }).click();
+    await page.waitForURL('/inventory');
+    await page.getByRole('link', { name: 'Help' }).click();
+    await page.waitForURL('/help');
+
+    await page.goBack();
+    await expect(page).toHaveURL(/\/inventory/);
+
+    await page.goBack();
+    await expect(page).toHaveURL(/\/$/);
+  });
+
+  test('protected route redirects', async ({ page }) => {
+    await page.goto('/admin/users');
+
+    await expect(page).toHaveURL(/\/login/);
+  });
+
+  test('unknown route shows 404', async ({ page }) => {
+    await page.goto('/nonexistent-path');
+
+    await expect(page.getByRole('heading', { name: 'Not Found' })).toBeVisible();
+  });
+});
+```
+
+### Testing Hooks Through UI
+
+**Use when**: Verifying custom hooks produce correct UI behavior—Playwright cannot call hooks directly.
+**Avoid when**: Hook logic is pure computation—use unit tests instead.
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test.describe('useDebounce via SearchBox', () => {
+  test('batches rapid input', async ({ page }) => {
+    await page.goto('/search');
+
+    const apiCalls: string[] = [];
+    await page.route('**/api/query*', async (route) => {
+      apiCalls.push(route.request().url());
+      await route.continue();
+    });
+
+    await page.getByRole('textbox', { name: 'Search' }).pressSequentially('testing', {
+      delay: 40,
+    });
+
+    await expect(page.getByRole('listitem')).toHaveCount(3);
+    expect(apiCalls.length).toBeLessThanOrEqual(2);
+  });
+});
+
+test.describe('usePagination via DataGrid', () => {
+  test('page controls work', async ({ page }) => {
+    await page.goto('/records');
+
+    await expect(page.getByText('Page 1 of 10')).toBeVisible();
+
+    await page.getByRole('button', { name: 'Next' }).click();
+    await expect(page.getByText('Page 2 of 10')).toBeVisible();
+
+    await page.getByRole('button', { name: 'Previous' }).click();
+    await expect(page.getByText('Page 1 of 10')).toBeVisible();
+    await expect(page.getByRole('button', { name: 'Previous' })).toBeDisabled();
+  });
+});
+```
+
+### Form Libraries (React Hook Form, Formik)
+
+**Use when**: Testing forms built with react-hook-form or Formik—Playwright interacts with DOM, form library is transparent.
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test.describe('signup form', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.goto('/signup');
+  });
+
+  test('validation on empty submit', async ({ page }) => {
+    await page.getByRole('button', { name: 'Register' }).click();
+
+    await expect(page.getByText('Email required')).toBeVisible();
+    await expect(page.getByText('Password required')).toBeVisible();
+  });
+
+  test('inline validation on blur', async ({ page }) => {
+    const email = page.getByLabel('Email');
+    await email.fill('invalid');
+    await email.blur();
+
+    await expect(page.getByText('Invalid email format')).toBeVisible();
+  });
+
+  test('password strength indicator', async ({ page }) => {
+    const pwd = page.getByLabel('Password', { exact: true });
+
+    await pwd.fill('weak');
+    await expect(page.getByText('Minimum 8 characters')).toHaveClass(/invalid/);
+
+    await pwd.fill('StrongPass1!');
+    await expect(page.getByText('Minimum 8 characters')).toHaveClass(/valid/);
+  });
+
+  test('successful submission redirects', async ({ page }) => {
+    await page.getByLabel('Name').fill('Alice');
+    await page.getByLabel('Email').fill('alice@test.com');
+    await page.getByLabel('Password', { exact: true }).fill('Secure123!');
+    await page.getByLabel('Confirm').fill('Secure123!');
+    await page.getByLabel('Accept terms').check();
+
+    await page.getByRole('button', { name: 'Register' }).click();
+
+    await page.waitForURL('/welcome');
+    await expect(page.getByText('Hello, Alice')).toBeVisible();
+  });
+
+  test('submit button disabled during request', async ({ page }) => {
+    await page.route('**/api/signup', async (route) => {
+      await new Promise((r) => setTimeout(r, 800));
+      await route.fulfill({ status: 201, json: { id: 1 } });
+    });
+
+    await page.getByLabel('Name').fill('Bob');
+    await page.getByLabel('Email').fill('bob@test.com');
+    await page.getByLabel('Password', { exact: true }).fill('Secure123!');
+    await page.getByLabel('Confirm').fill('Secure123!');
+    await page.getByLabel('Accept terms').check();
+
+    await page.getByRole('button', { name: 'Register' }).click();
+
+    await expect(page.getByRole('button', { name: /Registering|Loading/ })).toBeDisabled();
+  });
+});
+```
+
+### Portals (Modals, Tooltips, Dropdowns)
+
+**Use when**: Testing components rendered via `ReactDOM.createPortal()`—modals, dialogs, tooltips, menus. These render outside parent DOM but Playwright sees the full document.
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test.describe('portal components', () => {
+  test('modal interaction', async ({ page }) => {
+    await page.goto('/items');
+
+    await page.getByRole('button', { name: 'Remove' }).first().click();
+
+    const dialog = page.getByRole('dialog', { name: 'Confirm removal' });
+    await expect(dialog).toBeVisible();
+    await expect(dialog.getByRole('button', { name: 'Cancel' })).toBeFocused();
+
+    await dialog.getByRole('button', { name: 'Remove' }).click();
+    await expect(dialog).toBeHidden();
+  });
+
+  test('escape closes modal', async ({ page }) => {
+    await page.goto('/items');
+    await page.getByRole('button', { name: 'Remove' }).first().click();
+
+    const dialog = page.getByRole('dialog');
+    await expect(dialog).toBeVisible();
+
+    await page.keyboard.press('Escape');
+    await expect(dialog).toBeHidden();
+  });
+
+  test('tooltip on hover', async ({ page }) => {
+    await page.goto('/panel');
+
+    await page.getByRole('button', { name: 'Help' }).hover();
+    await expect(page.getByRole('tooltip')).toBeVisible();
+
+    await page.mouse.move(0, 0);
+    await expect(page.getByRole('tooltip')).toBeHidden();
+  });
+
+  test('dropdown menu', async ({ page }) => {
+    await page.goto('/panel');
+
+    await page.getByRole('button', { name: 'Actions' }).click();
+
+    const menu = page.getByRole('menu');
+    await expect(menu).toBeVisible();
+
+    await menu.getByRole('menuitem', { name: 'Rename' }).click();
+    await expect(menu).toBeHidden();
+  });
+
+  test('toast auto-dismisses', async ({ page }) => {
+    await page.goto('/preferences');
+
+    await page.getByRole('button', { name: 'Save' }).click();
+    await expect(page.getByText('Preferences saved')).toBeVisible();
+
+    await expect(page.getByText('Preferences saved')).toBeHidden({ timeout: 8000 });
+  });
+});
+```
+
+### Error Boundaries
+
+**Use when**: Verifying error boundaries catch rendering errors and show fallback UI.
+**Avoid when**: Testing error handling in event handlers or async code—error boundaries only catch render errors.
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test.describe('error boundary', () => {
+  test('shows fallback on crash', async ({ page }) => {
+    await page.route('**/api/widgets', (route) => {
+      route.fulfill({
+        status: 200,
+        json: { widgets: null },
+      });
+    });
+
+    await page.goto('/panel');
+
+    await expect(page.getByText('Something went wrong')).toBeVisible();
+    await expect(page.getByRole('button', { name: 'Retry' })).toBeVisible();
+    await expect(page.getByRole('navigation')).toBeVisible();
+  });
+
+  test('retry recovers component', async ({ page }) => {
+    let calls = 0;
+    await page.route('**/api/widgets', (route) => {
+      calls++;
+      if (calls === 1) {
+        route.fulfill({ status: 200, json: { widgets: null } });
+      } else {
+        route.fulfill({ status: 200, json: { widgets: [{ id: 1, name: 'Chart' }] } });
+      }
+    });
+
+    await page.goto('/panel');
+
+    await expect(page.getByText('Something went wrong')).toBeVisible();
+
+    await page.getByRole('button', { name: 'Retry' }).click();
+
+    await expect(page.getByText('Something went wrong')).toBeHidden();
+    await expect(page.getByText('Chart')).toBeVisible();
+  });
+});
+```
+
+### Component Testing (Experimental)
+
+**Use when**: Testing complex interactive components in isolation—data tables, form wizards, rich editors. Needs real browser but not full app.
+**Avoid when**: Component depends heavily on backend data or routing—use E2E instead.
+
+```typescript
+// playwright-ct.config.ts
+import { defineConfig, devices } from '@playwright/experimental-ct-react';
+
+export default defineConfig({
+  testDir: './tests/components',
+  testMatch: '**/*.ct.ts',
+  use: {
+    trace: 'on-first-retry',
+    ctPort: 3100,
+  },
+  projects: [
+    { name: 'chromium', use: { ...devices['Desktop Chrome'] } },
+  ],
+});
+```
+
+```typescript
+// tests/components/Stepper.ct.ts
+import { test, expect } from '@playwright/experimental-ct-react';
+import Stepper from '../../src/components/Stepper';
+
+test('increments on click', async ({ mount }) => {
+  const component = await mount(<Stepper initial={0} />);
+
+  await expect(component.getByText('Value: 0')).toBeVisible();
+  await component.getByRole('button', { name: '+' }).click();
+  await expect(component.getByText('Value: 1')).toBeVisible();
+});
+
+test('fires onChange callback', async ({ mount }) => {
+  const values: number[] = [];
+  const component = await mount(
+    <Stepper initial={0} onChange={(v) => values.push(v)} />
+  );
+
+  await component.getByRole('button', { name: '+' }).click();
+  await component.getByRole('button', { name: '+' }).click();
+
+  expect(values).toEqual([1, 2]);
+});
+
+test('respects min boundary', async ({ mount }) => {
+  const component = await mount(<Stepper initial={0} min={0} />);
+
+  await expect(component.getByRole('button', { name: '-' })).toBeDisabled();
+});
+```
+
+## Setup
+
+### E2E Config (Vite)
+
+```typescript
+// playwright.config.ts
+import { defineConfig, devices } from '@playwright/test';
+
+export default defineConfig({
+  testDir: './tests',
+  fullyParallel: true,
+  forbidOnly: !!process.env.CI,
+  retries: process.env.CI ? 2 : 0,
+  workers: process.env.CI ? '50%' : undefined,
+
+  use: {
+    baseURL: 'http://localhost:5173',
+    trace: 'on-first-retry',
+    screenshot: 'only-on-failure',
+  },
+
+  projects: [
+    { name: 'chromium', use: { ...devices['Desktop Chrome'] } },
+    { name: 'firefox', use: { ...devices['Desktop Firefox'] } },
+    { name: 'mobile', use: { ...devices['iPhone 14'] } },
+  ],
+
+  webServer: {
+    command: process.env.CI ? 'npm run build && npx vite preview --port 5173' : 'npm run dev',
+    url: 'http://localhost:5173',
+    reuseExistingServer: !process.env.CI,
+    timeout: 120_000,
+  },
+});
+```
+
+### CRA vs Vite Differences
+
+| Aspect | Create React App | Vite |
+|---|---|---|
+| Default port | `3000` | `5173` |
+| Build output | `build/` | `dist/` |
+| Serve production | `npx serve -s build -l 3000` | `npx vite preview --port 5173` |
+| Env var prefix | `REACT_APP_*` | `VITE_*` |
+
+## Framework Tips
+
+### Strict Mode Double Effects
+
+React Strict Mode runs effects twice in development. Tests should be resilient:
+
+- Don't assert exact API call counts in dev mode
+- Run against production build for call count assertions, or account for double invocations
+
+### Suspense and Lazy Components
+
+```typescript
+test('lazy route loads content', async ({ page }) => {
+  await page.goto('/');
+
+  await page.getByRole('link', { name: 'Analytics' }).click();
+
+  await expect(page.getByRole('heading', { name: 'Analytics' })).toBeVisible();
+});
+```
+
+### Detecting Memory Leaks
+
+```typescript
+test('no unmounted state warnings', async ({ page }) => {
+  const warnings: string[] = [];
+  page.on('console', (msg) => {
+    if (msg.type() === 'warning' && msg.text().includes('unmounted')) {
+      warnings.push(msg.text());
+    }
+  });
+
+  await page.goto('/panel');
+  await page.getByRole('link', { name: 'Settings' }).click();
+  await page.goBack();
+  await page.getByRole('link', { name: 'Profile' }).click();
+
+  expect(warnings).toEqual([]);
+});
+```
+
+## Anti-Patterns
+
+| Don't | Problem | Do Instead |
+|---|---|---|
+| `page.evaluate(() => store.getState())` | Couples tests to implementation | Assert on UI: `expect(badge).toHaveText('3')` |
+| Import components in E2E tests | E2E runs in Node, not browser | Use `@playwright/experimental-ct-react` for components |
+| `page.waitForTimeout(500)` after state changes | Timing varies across machines | `expect(locator).toHaveText('value')` auto-retries |
+| `page.locator('.MuiButton-root')` | Class names change between versions | `page.getByRole('button', { name: 'Submit' })` |
+| Test every component with CT | Overhead for simple components | CT for complex widgets, unit tests for logic, E2E for flows |
+| Skip keyboard navigation tests | Accessibility regressions common | Test Tab, Enter, Escape, Arrow interactions |
+| Assert on `__REACT_FIBER__` internals | Not stable across versions | Only interact with rendered DOM |
+
+## Related
+
+- [locators.md](../core/locators.md) — locator strategies for any React component library
+- [assertions-waiting.md](../core/assertions-waiting.md) — auto-waiting for React state changes
+- [forms-validation.md](../testing-patterns/forms-validation.md) — form testing patterns
+- [component-testing.md](../testing-patterns/component-testing.md) — in-depth component testing
+- [test-architecture.md](../architecture/test-architecture.md) — E2E vs component vs unit decisions
+- [nextjs.md](nextjs.md) — Next.js-specific patterns for SSR
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/ci-cd.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/ci-cd.md
new file mode 100644
index 0000000000..d5ad0d4c8f
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/ci-cd.md
@@ -0,0 +1,468 @@
+# CI/CD Integration
+
+## Table of Contents
+
+1. [GitHub Actions](#github-actions)
+2. [Docker](#docker)
+3. [Reporting](#reporting)
+4. [Sharding](#sharding)
+5. [Environment Management](#environment-management)
+6. [Caching](#caching)
+
+## GitHub Actions
+
+### Basic Workflow
+
+```yaml
+# .github/workflows/playwright.yml
+name: Playwright Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: "npm"
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Install Playwright browsers
+        run: npx playwright install --with-deps
+
+      - name: Run Playwright tests
+        run: npx playwright test
+
+      - uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: playwright-report
+          path: playwright-report/
+          retention-days: 30
+```
+
+### With Sharding
+
+```yaml
+name: Playwright Tests
+
+on:
+  push:
+    branches: [main]
+
+jobs:
+  test:
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        shardIndex: [1, 2, 3, 4]
+        shardTotal: [4]
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: "npm"
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Install Playwright browsers
+        run: npx playwright install --with-deps
+
+      - name: Run Playwright tests
+        run: npx playwright test --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }}
+
+      - name: Upload blob report
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: blob-report-${{ matrix.shardIndex }}
+          path: blob-report
+          retention-days: 1
+
+  merge-reports:
+    if: ${{ !cancelled() }}
+    needs: [test]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: "npm"
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Download blob reports
+        uses: actions/download-artifact@v4
+        with:
+          path: all-blob-reports
+          pattern: blob-report-*
+          merge-multiple: true
+
+      - name: Merge reports
+        run: npx playwright merge-reports --reporter html ./all-blob-reports
+
+      - name: Upload HTML report
+        uses: actions/upload-artifact@v4
+        with:
+          name: html-report
+          path: playwright-report
+          retention-days: 14
+```
+
+### With Container
+
+```yaml
+jobs:
+  test:
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    container:
+      # Use latest or more appropriate playwright version (match package.json)
+      image: mcr.microsoft.com/playwright:v1.40.0-jammy
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: "npm"
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Run tests
+        run: npx playwright test
+        env:
+          HOME: /root
+```
+
+## Docker
+
+### Dockerfile
+
+```dockerfile
+FROM mcr.microsoft.com/playwright:v1.40.0-jammy
+
+WORKDIR /app
+
+COPY package*.json ./
+RUN npm ci
+
+COPY . .
+
+CMD ["npx", "playwright", "test"]
+```
+
+### Docker Compose
+
+```yaml
+# docker-compose.yml
+version: "3.8"
+
+services:
+  playwright:
+    build: .
+    volumes:
+      - ./playwright-report:/app/playwright-report
+      - ./test-results:/app/test-results
+    environment:
+      - CI=true
+      - BASE_URL=http://app:3000
+    depends_on:
+      - app
+
+  app:
+    build: ./app
+    ports:
+      - "3000:3000"
+```
+
+### Run with Docker
+
+```bash
+# Build and run
+docker build -t playwright-tests .
+docker run --rm -v $(pwd)/playwright-report:/app/playwright-report playwright-tests
+
+# With docker-compose
+docker-compose run --rm playwright
+```
+
+## Reporting
+
+### Configuration
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  reporter: [
+    // Always generate
+    ["html", { outputFolder: "playwright-report" }],
+
+    // Console output
+    ["list"],
+
+    // CI-friendly
+    ["github"], // GitHub Actions annotations
+
+    // JUnit for CI integration
+    ["junit", { outputFile: "results.xml" }],
+
+    // JSON for custom processing
+    ["json", { outputFile: "results.json" }],
+
+    // Blob for merging shards
+    ["blob", { outputDir: "blob-report" }],
+  ],
+});
+```
+
+### CI-Specific Reporter
+
+```typescript
+export default defineConfig({
+  reporter: process.env.CI
+    ? [["github"], ["blob"], ["html"]]
+    : [["list"], ["html"]],
+});
+```
+
+## Sharding
+
+### Command Line
+
+```bash
+# Split into 4 shards, run shard 1
+npx playwright test --shard=1/4
+
+# Run shard 2
+npx playwright test --shard=2/4
+```
+
+### Configuration
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  // Evenly distribute tests across shards
+  fullyParallel: true,
+
+  // For blob reporter to merge later
+  reporter: process.env.CI ? [["blob"]] : [["html"]],
+});
+```
+
+### Merge Sharded Reports
+
+```bash
+# After all shards complete, merge blob reports
+npx playwright merge-reports --reporter html ./all-blob-reports
+```
+
+## Environment Management
+
+### Environment Variables
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+import dotenv from "dotenv";
+
+// Load env file based on environment
+dotenv.config({ path: `.env.${process.env.NODE_ENV || "development"}` });
+
+export default defineConfig({
+  use: {
+    baseURL: process.env.BASE_URL || "http://localhost:3000",
+  },
+});
+```
+
+### Multiple Environments
+
+```yaml
+# .github/workflows/playwright.yml
+jobs:
+  test:
+    strategy:
+      matrix:
+        environment: [staging, production]
+    steps:
+      - name: Run tests
+        run: npx playwright test
+        env:
+          BASE_URL: ${{ matrix.environment == 'staging' && 'https://staging.example.com' || 'https://example.com' }}
+          TEST_USER: ${{ secrets[format('TEST_USER_{0}', matrix.environment)] }}
+```
+
+### Secrets Management
+
+```yaml
+# GitHub Actions secrets
+- name: Run tests
+  run: npx playwright test
+  env:
+    TEST_EMAIL: ${{ secrets.TEST_EMAIL }}
+    TEST_PASSWORD: ${{ secrets.TEST_PASSWORD }}
+```
+
+```typescript
+// tests use environment variables
+test("login", async ({ page }) => {
+  await page.getByLabel("Email").fill(process.env.TEST_EMAIL!);
+  await page.getByLabel("Password").fill(process.env.TEST_PASSWORD!);
+});
+```
+
+## Caching
+
+### Cache Playwright Browsers
+
+```yaml
+- name: Cache Playwright browsers
+  uses: actions/cache@v4
+  id: playwright-cache
+  with:
+    path: ~/.cache/ms-playwright
+    key: playwright-${{ runner.os }}-${{ hashFiles('package-lock.json') }}
+
+- name: Install Playwright browsers
+  if: steps.playwright-cache.outputs.cache-hit != 'true'
+  run: npx playwright install --with-deps
+
+- name: Install system deps only
+  if: steps.playwright-cache.outputs.cache-hit == 'true'
+  run: npx playwright install-deps
+```
+
+### Cache Node Modules
+
+```yaml
+- uses: actions/setup-node@v4
+  with:
+    node-version: 22
+    cache: "npm"
+
+- name: Install dependencies
+  run: npm ci
+```
+
+## Tag-Based Test Filtering
+
+### Run Specific Tags in CI
+
+```yaml
+# Run smoke tests on PR
+- name: Run smoke tests
+  run: npx playwright test --grep @smoke
+
+# Run full regression nightly
+- name: Run regression
+  run: npx playwright test --grep @regression
+
+# Exclude flaky tests
+- name: Run stable tests
+  run: npx playwright test --grep-invert @flaky
+```
+
+### PR vs Nightly Strategy
+
+```yaml
+# .github/workflows/pr.yml - Fast feedback
+- name: Run critical tests
+  run: npx playwright test --grep "@smoke|@critical"
+
+# .github/workflows/nightly.yml - Full coverage
+- name: Run all tests
+  run: npx playwright test --grep-invert @flaky
+```
+
+### Tag Filtering in Config
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  grep: process.env.CI ? /@smoke|@critical/ : undefined,
+  grepInvert: process.env.CI ? /@flaky/ : undefined,
+});
+```
+
+### Project-Based Tag Filtering
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    {
+      name: "smoke",
+      grep: /@smoke/,
+    },
+    {
+      name: "regression",
+      grepInvert: /@smoke/,
+    },
+  ],
+});
+```
+
+## Best Practices
+
+| Practice                      | Benefit                   |
+| ----------------------------- | ------------------------- |
+| Use `npm ci`                  | Deterministic installs    |
+| Run headless in CI            | Faster, no display needed |
+| Set retries in CI only        | Handle flakiness          |
+| Upload artifacts on failure   | Debug failures            |
+| Use sharding for large suites | Faster execution          |
+| Cache browsers                | Faster setup              |
+| Use blob reporter for shards  | Merge reports correctly   |
+| Use tags for PR vs nightly    | Fast feedback + coverage  |
+| Exclude @flaky in CI          | Stable pipeline           |
+
+## CI Configuration Reference
+
+```typescript
+// playwright.config.ts - CI optimized
+export default defineConfig({
+  testDir: "./tests",
+  fullyParallel: true,
+  forbidOnly: !!process.env.CI,
+  retries: process.env.CI ? 2 : 0,
+  workers: process.env.CI ? 1 : undefined,
+  reporter: process.env.CI
+    ? [["github"], ["blob"], ["html"]]
+    : [["list"], ["html"]],
+  use: {
+    baseURL: process.env.BASE_URL || "http://localhost:3000",
+    trace: "on-first-retry",
+    screenshot: "only-on-failure",
+    video: "on-first-retry",
+  },
+});
+```
+
+## Related References
+
+- **Test tags**: See [test-tags.md](../core/test-tags.md) for tagging and filtering patterns
+- **Performance optimization**: See [performance.md](performance.md) for sharding and parallelization
+- **Debugging CI failures**: See [debugging.md](../debugging/debugging.md) for troubleshooting
+- **Test reporting**: See [debugging.md](../debugging/debugging.md) for trace viewer usage
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/docker.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/docker.md
new file mode 100644
index 0000000000..f20040b4ae
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/docker.md
@@ -0,0 +1,283 @@
+# Container-Based Testing
+
+## Table of Contents
+
+1. [Patterns](#patterns)
+2. [Decision Guide](#decision-guide)
+3. [Anti-Patterns](#anti-patterns)
+4. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Running tests in containers for reproducible environments, CI pipelines, or consistent browser versions across team machines.
+
+## Patterns
+
+### Official Image Usage
+
+Run tests without building a custom image:
+
+```bash
+docker run --rm \
+  -v $(pwd):/app \
+  -w /app \
+  -e CI=true \
+  -e BASE_URL=http://host.docker.internal:3000 \
+  mcr.microsoft.com/playwright:v1.48.0-noble \
+  bash -c "npm ci && npx playwright test"
+```
+
+Extract reports with bind mounts:
+
+```bash
+docker run --rm \
+  -v $(pwd):/app \
+  -v $(pwd)/playwright-report:/app/playwright-report \
+  -v $(pwd)/test-results:/app/test-results \
+  -w /app \
+  mcr.microsoft.com/playwright:v1.48.0-noble \
+  bash -c "npm ci && npx playwright test"
+```
+
+### Custom Dockerfile
+
+Build a custom image when you need additional dependencies or pre-installed packages:
+
+```dockerfile
+FROM mcr.microsoft.com/playwright:v1.48.0-noble
+
+WORKDIR /app
+
+COPY package.json package-lock.json ./
+RUN npm ci
+
+COPY . .
+
+CMD ["npx", "playwright", "test"]
+```
+
+Chromium-only slim image:
+
+```dockerfile
+FROM node:latest-slim
+
+RUN npx playwright install --with-deps chromium
+
+WORKDIR /app
+COPY package.json package-lock.json ./
+RUN npm ci
+COPY . .
+
+CMD ["npx", "playwright", "test", "--project=chromium"]
+```
+
+### Docker Compose Stack
+
+Full application stack with database, cache, and test runner:
+
+```yaml
+services:
+  app:
+    build: .
+    ports:
+      - "3000:3000"
+    environment:
+      - NODE_ENV=test
+      - DATABASE_URL=postgresql://postgres:postgres@db:5432/test
+    depends_on:
+      db:
+        condition: service_healthy
+
+  db:
+    image: postgres:latest-alpine
+    environment:
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: postgres
+      POSTGRES_DB: test
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+    tmpfs:
+      - /var/lib/postgresql/data
+
+  e2e:
+    image: mcr.microsoft.com/playwright:v1.48.0-noble
+    working_dir: /app
+    volumes:
+      - .:/app
+      - /app/node_modules
+    environment:
+      - CI=true
+      - BASE_URL=http://app:3000
+    depends_on:
+      - app
+    command: bash -c "npm ci && npx playwright test"
+    profiles:
+      - test
+```
+
+Run commands:
+
+```bash
+docker compose --profile test up --abort-on-container-exit --exit-code-from e2e
+
+docker compose --profile test down -v
+```
+
+### CI Container Jobs
+
+**GitHub Actions:**
+
+```yaml
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    container:
+      image: mcr.microsoft.com/playwright:v1.48.0-noble
+    steps:
+      - uses: actions/checkout@v4
+      - run: npm ci
+      - run: npx playwright test
+        env:
+          HOME: /root
+```
+
+**GitLab CI:**
+
+```yaml
+test:
+  image: mcr.microsoft.com/playwright:v1.48.0-noble
+  script:
+    - npm ci
+    - npx playwright test
+```
+
+**Jenkins:**
+
+```groovy
+pipeline {
+    agent {
+        docker {
+            image 'mcr.microsoft.com/playwright:v1.48.0-noble'
+            args '-u root'
+        }
+    }
+    stages {
+        stage('Test') {
+            steps {
+                sh 'npm ci'
+                sh 'npx playwright test'
+            }
+        }
+    }
+}
+```
+
+### Dev Container Setup
+
+VS Code Dev Container or GitHub Codespaces configuration:
+
+```json
+{
+  "name": "Playwright Dev",
+  "image": "mcr.microsoft.com/playwright:v1.48.0-noble",
+  "features": {
+    "ghcr.io/devcontainers/features/node:latest": {
+      "version": "20"
+    }
+  },
+  "postCreateCommand": "npm ci",
+  "customizations": {
+    "vscode": {
+      "extensions": ["ms-playwright.playwright"]
+    }
+  },
+  "forwardPorts": [3000, 9323],
+  "remoteUser": "root"
+}
+```
+
+## Decision Guide
+
+| Scenario | Approach |
+|---|---|
+| Simple CI pipeline | Official image as CI container |
+| Tests need database + cache | Docker Compose with app, db, e2e services |
+| Team needs identical environments | Dev Container or custom Dockerfile |
+| Only testing Chromium | Slim image with `install --with-deps chromium` |
+| Cross-browser testing | Official image (all browsers pre-installed) |
+| Local development | Run directly on host for faster iteration |
+
+## Anti-Patterns
+
+| Anti-Pattern | Problem | Solution |
+|---|---|---|
+| Installing browsers at runtime | Wastes 60-90 seconds per run | Use official image or bake browsers into custom image |
+| Running as non-root without sandbox config | Chromium sandbox permission errors | Run as root or disable sandbox |
+| Bind-mounting `node_modules` from host | Platform-specific binary crashes | Use anonymous volume: `-v /app/node_modules` |
+| No health checks on dependent services | Tests start before database ready | Add `healthcheck` with `depends_on: condition: service_healthy` |
+| Building application inside Playwright container | Large image, slow builds | Separate app and e2e containers |
+
+## Troubleshooting
+
+### "browserType.launch: Executable doesn't exist"
+
+Playwright version mismatch with Docker image. Ensure `@playwright/test` version matches image tag:
+
+```bash
+npm ls @playwright/test
+docker pull mcr.microsoft.com/playwright:v<matching-version>-noble
+```
+
+### "net::ERR_CONNECTION_REFUSED" in docker-compose
+
+Tests trying to reach `localhost` instead of service name. Configure `baseURL`:
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  use: {
+    baseURL: process.env.BASE_URL || 'http://localhost:3000',
+  },
+});
+```
+
+```yaml
+e2e:
+  environment:
+    - BASE_URL=http://app:3000
+```
+
+### Permission denied on mounted volumes
+
+Match user IDs or run as root:
+
+```bash
+docker run --rm -u $(id -u):$(id -g) \
+  -v $(pwd):/app -w /app \
+  mcr.microsoft.com/playwright:v1.48.0-noble \
+  npx playwright test
+```
+
+### Slow container tests on macOS/Windows
+
+Docker Desktop I/O overhead. Copy files instead of mounting:
+
+```dockerfile
+FROM mcr.microsoft.com/playwright:v1.48.0-noble
+WORKDIR /app
+COPY . .
+RUN npm ci
+CMD ["npx", "playwright", "test"]
+```
+
+Or use delegated mount:
+
+```bash
+docker run --rm \
+  -v $(pwd):/app:delegated \
+  -w /app \
+  mcr.microsoft.com/playwright:v1.48.0-noble \
+  bash -c "npm ci && npx playwright test"
+```
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/github-actions.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/github-actions.md
new file mode 100644
index 0000000000..0a8597550f
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/github-actions.md
@@ -0,0 +1,546 @@
+# GitHub Actions for Playwright
+
+## Table of Contents
+
+1. [CLI Commands](#cli-commands)
+2. [Workflow Patterns](#workflow-patterns)
+3. [Scenario Guide](#scenario-guide)
+4. [Common Mistakes](#common-mistakes)
+5. [Troubleshooting](#troubleshooting)
+6. [Related](#related)
+
+> **When to use**: Automating Playwright tests on pull requests, main branch merges, or scheduled runs.
+
+## CLI Commands
+
+```bash
+npx playwright install --with-deps    # browsers + OS dependencies
+npx playwright test --shard=1/4       # run shard 1 of 4
+npx playwright test --reporter=github # PR annotations
+npx playwright merge-reports ./blob-report  # combine shard reports
+```
+
+## Workflow Patterns
+
+### Basic Workflow
+
+**Use when**: Starting a new project or running a small test suite.
+
+```yaml
+# .github/workflows/e2e.yml
+name: E2E Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: e2e-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CI: true
+
+jobs:
+  test:
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - run: npm ci
+
+      - name: Cache browsers
+        id: browser-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: pw-${{ runner.os }}-${{ hashFiles('package-lock.json') }}
+
+      - name: Install browsers
+        if: steps.browser-cache.outputs.cache-hit != 'true'
+        run: npx playwright install --with-deps
+
+      - name: Install OS dependencies
+        if: steps.browser-cache.outputs.cache-hit == 'true'
+        run: npx playwright install-deps
+
+      - run: npx playwright test
+
+      - name: Upload report
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: test-report
+          path: playwright-report/
+          retention-days: 14
+
+      - name: Upload traces
+        uses: actions/upload-artifact@v4
+        if: failure()
+        with:
+          name: traces
+          path: test-results/
+          retention-days: 7
+```
+
+### Sharded Execution
+
+**Use when**: Test suite exceeds 10 minutes. Sharding cuts wall-clock time significantly.
+**Avoid when**: Suite runs under 5 minutes—sharding overhead negates benefits.
+
+```yaml
+# .github/workflows/e2e-sharded.yml
+name: E2E Tests (Sharded)
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: e2e-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CI: true
+
+jobs:
+  test:
+    timeout-minutes: 20
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        shard: [1/4, 2/4, 3/4, 4/4]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - run: npm ci
+
+      - name: Cache browsers
+        id: browser-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: pw-${{ runner.os }}-${{ hashFiles('package-lock.json') }}
+
+      - name: Install browsers
+        if: steps.browser-cache.outputs.cache-hit != 'true'
+        run: npx playwright install --with-deps
+
+      - name: Install OS dependencies
+        if: steps.browser-cache.outputs.cache-hit == 'true'
+        run: npx playwright install-deps
+
+      - name: Run tests (shard ${{ matrix.shard }})
+        run: npx playwright test --shard=${{ matrix.shard }}
+
+      - name: Upload blob report
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: blob-${{ strategy.job-index }}
+          path: blob-report/
+          retention-days: 1
+
+  merge:
+    if: ${{ !cancelled() }}
+    needs: test
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - run: npm ci
+
+      - name: Download blob reports
+        uses: actions/download-artifact@v4
+        with:
+          path: all-blobs
+          pattern: blob-*
+          merge-multiple: true
+
+      - name: Merge reports
+        run: npx playwright merge-reports --reporter=html ./all-blobs
+
+      - name: Upload merged report
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-report
+          path: playwright-report/
+          retention-days: 14
+```
+
+**Config for sharding**—enable blob reporter:
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: process.env.CI
+    ? [['blob'], ['github']]
+    : [['html', { open: 'on-failure' }]],
+});
+```
+
+### Container-Based Execution
+
+**Use when**: Reproducible environment matching local Docker setup, or runner OS dependencies cause issues.
+**Avoid when**: Standard `ubuntu-latest` with `--with-deps` works fine.
+
+```yaml
+# .github/workflows/e2e-container.yml
+name: E2E Tests (Container)
+
+on:
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+    container:
+      image: mcr.microsoft.com/playwright:v1.48.0-noble
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - run: npm ci
+
+      - name: Run tests
+        run: npx playwright test
+        env:
+          HOME: /root
+
+      - uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: test-report
+          path: playwright-report/
+          retention-days: 14
+```
+
+### Environment Secrets
+
+**Use when**: Tests target staging/production with credentials.
+**Avoid when**: Tests only run against local dev server.
+
+```yaml
+# .github/workflows/e2e-staging.yml
+name: Staging Tests
+
+on:
+  push:
+    branches: [main]
+  workflow_dispatch:
+
+jobs:
+  test:
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+    environment: staging
+
+    env:
+      CI: true
+      BASE_URL: ${{ vars.STAGING_URL }}
+      TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
+      API_TOKEN: ${{ secrets.API_TOKEN }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - run: npm ci
+
+      - name: Cache browsers
+        id: browser-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: pw-${{ runner.os }}-${{ hashFiles('package-lock.json') }}
+
+      - name: Install browsers
+        if: steps.browser-cache.outputs.cache-hit != 'true'
+        run: npx playwright install --with-deps
+
+      - name: Install OS dependencies
+        if: steps.browser-cache.outputs.cache-hit == 'true'
+        run: npx playwright install-deps
+
+      - name: Run smoke tests
+        run: npx playwright test --grep @smoke
+
+      - uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: staging-report
+          path: playwright-report/
+          retention-days: 14
+```
+
+### Scheduled Runs
+
+**Use when**: Full regression suite is too slow for every PR—run nightly instead.
+**Avoid when**: Suite runs under 15 minutes and can run on every PR.
+
+```yaml
+# .github/workflows/nightly.yml
+name: Nightly Regression
+
+on:
+  schedule:
+    - cron: '0 3 * * 1-5'
+  workflow_dispatch:
+
+jobs:
+  test:
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+
+    env:
+      CI: true
+      BASE_URL: ${{ vars.STAGING_URL }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - run: npm ci
+
+      - name: Install browsers
+        run: npx playwright install --with-deps
+
+      - name: Run full regression
+        run: npx playwright test --grep @regression
+
+      - uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: nightly-${{ github.run_number }}
+          path: playwright-report/
+          retention-days: 30
+
+      - name: Notify on failure
+        if: failure()
+        uses: slackapi/slack-github-action@latest
+        with:
+          payload: |
+            {
+              "text": "Nightly regression failed: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+            }
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+```
+
+### Reusable Workflow
+
+**Use when**: Multiple repositories share the same Playwright setup.
+**Avoid when**: Single repo with one workflow.
+
+```yaml
+# .github/workflows/pw-reusable.yml
+name: Playwright Reusable
+
+on:
+  workflow_call:
+    inputs:
+      node-version:
+        type: string
+        default: 'lts/*'
+      test-command:
+        type: string
+        default: 'npx playwright test'
+    secrets:
+      BASE_URL:
+        required: false
+      TEST_PASSWORD:
+        required: false
+
+jobs:
+  test:
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+
+    env:
+      CI: true
+      BASE_URL: ${{ secrets.BASE_URL }}
+      TEST_PASSWORD: ${{ secrets.TEST_PASSWORD }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: ${{ inputs.node-version }}
+          cache: npm
+
+      - run: npm ci
+
+      - name: Cache browsers
+        id: browser-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: pw-${{ runner.os }}-${{ hashFiles('package-lock.json') }}
+
+      - name: Install browsers
+        if: steps.browser-cache.outputs.cache-hit != 'true'
+        run: npx playwright install --with-deps
+
+      - name: Install OS dependencies
+        if: steps.browser-cache.outputs.cache-hit == 'true'
+        run: npx playwright install-deps
+
+      - name: Run tests
+        run: ${{ inputs.test-command }}
+
+      - uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: test-report
+          path: playwright-report/
+          retention-days: 14
+```
+
+**Calling the reusable workflow:**
+
+```yaml
+# .github/workflows/ci.yml
+name: CI
+on:
+  pull_request:
+    branches: [main]
+
+jobs:
+  e2e:
+    uses: ./.github/workflows/pw-reusable.yml
+    with:
+      node-version: 'lts/*'
+    secrets:
+      BASE_URL: ${{ secrets.STAGING_URL }}
+      TEST_PASSWORD: ${{ secrets.TEST_PASSWORD }}
+```
+
+## Scenario Guide
+
+| Scenario | Approach |
+|---|---|
+| Small suite (< 5 min) | Single job, no sharding |
+| Medium suite (5-20 min) | 2-4 shards with matrix |
+| Large suite (20+ min) | 4-8 shards + blob merge |
+| Cross-browser on PRs | Chromium only on PRs; all browsers on main |
+| Staging/prod smoke tests | Separate workflow with `environment:` |
+| Nightly full regression | `schedule` trigger + `workflow_dispatch` |
+| Multiple repos, same setup | Reusable workflow with `workflow_call` |
+| Reproducible env needed | Container job with Playwright image |
+
+## Common Mistakes
+
+| Mistake | Problem | Fix |
+|---|---|---|
+| No `concurrency` group | Duplicate runs waste minutes | Add `concurrency: { group: ..., cancel-in-progress: true }` |
+| `fail-fast: true` with sharding | One failure cancels others | Set `fail-fast: false` |
+| No browser caching | 60-90 seconds wasted per run | Cache `~/.cache/ms-playwright` |
+| No `timeout-minutes` | Stuck jobs run for 6 hours | Set explicit timeout: 20-30 minutes |
+| Artifacts only on failure | No report when tests pass | Use `if: ${{ !cancelled() }}` |
+| Hardcoded secrets | Security risk | Use GitHub Secrets and Environments |
+| All browsers on every PR | 3x CI cost | Chromium on PR; cross-browser on main |
+| No artifact retention | Default 90-day fills storage | Set `retention-days: 7-14` |
+| Missing `--with-deps` | Browser launch failures | Always use `npx playwright install --with-deps` |
+
+## Troubleshooting
+
+### Browser launch fails: "Missing dependencies"
+
+**Cause**: Browsers restored from cache but OS dependencies weren't cached.
+
+**Fix**: Run `npx playwright install-deps` on cache hit:
+
+```yaml
+- name: Install OS dependencies
+  if: steps.browser-cache.outputs.cache-hit == 'true'
+  run: npx playwright install-deps
+```
+
+### Tests pass locally but timeout in CI
+
+**Cause**: CI runners have fewer resources than dev machines.
+
+**Fix**: Reduce workers and increase timeouts:
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  workers: process.env.CI ? '50%' : undefined,
+  use: {
+    actionTimeout: process.env.CI ? 15_000 : 10_000,
+    navigationTimeout: process.env.CI ? 30_000 : 15_000,
+  },
+});
+```
+
+### Sharded reports incomplete
+
+**Cause**: Artifact names collide or `merge-multiple` not set.
+
+**Fix**: Unique names per shard and enable merge:
+
+```yaml
+# Upload in each shard
+- uses: actions/upload-artifact@v4
+  with:
+    name: blob-${{ strategy.job-index }}
+    path: blob-report/
+
+# Download in merge job
+- uses: actions/download-artifact@v4
+  with:
+    path: all-blobs
+    pattern: blob-*
+    merge-multiple: true
+```
+
+### `webServer` fails: "port already in use"
+
+**Cause**: Zombie process from previous run.
+
+**Fix**: Kill stale processes before starting:
+
+```yaml
+- name: Kill stale processes
+  run: lsof -ti:3000 | xargs kill -9 2>/dev/null || true
+```
+
+### No PR annotations
+
+**Cause**: `github` reporter not configured.
+
+**Fix**: Add `github` reporter for CI:
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: process.env.CI
+    ? [['html', { open: 'never' }], ['github']]
+    : [['html', { open: 'on-failure' }]],
+});
+```
+
+## Related
+
+- [test-tags.md](../core/test-tags.md) — tagging and filtering tests
+- [parallel-sharding.md](parallel-sharding.md) — sharding strategies
+- [reporting.md](reporting.md) — reporter configuration
+- [docker.md](docker.md) — container images
+- [gitlab.md](gitlab.md) — GitLab CI equivalent
+- [other-providers.md](other-providers.md) — CircleCI, Azure DevOps, Jenkins
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/gitlab.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/gitlab.md
new file mode 100644
index 0000000000..576692e36e
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/gitlab.md
@@ -0,0 +1,397 @@
+# GitLab CI/CD Configuration
+
+## Table of Contents
+
+1. [Key Commands](#key-commands)
+2. [Patterns](#patterns)
+3. [Decision Guide](#decision-guide)
+4. [Anti-Patterns](#anti-patterns)
+5. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Running Playwright tests in GitLab pipelines on merge requests, merges to main, or scheduled pipelines.
+
+## Key Commands
+
+```bash
+npx playwright install --with-deps    # install browsers + OS deps
+npx playwright test --shard=1/4       # run 1 of 4 parallel shards
+npx playwright merge-reports ./blob-report  # merge shard results
+npx playwright test --reporter=dot    # minimal output for CI logs
+```
+
+## Patterns
+
+### Basic Pipeline Configuration
+
+**Use when**: Any GitLab project with Playwright tests.
+
+```yaml
+# .gitlab-ci.yml
+image: mcr.microsoft.com/playwright:v1.48.0-noble
+
+stages:
+  - install
+  - test
+  - report
+
+variables:
+  CI: "true"
+  npm_config_cache: "$CI_PROJECT_DIR/.npm"
+
+cache:
+  key:
+    files:
+      - package-lock.json
+  paths:
+    - .npm/
+    - node_modules/
+
+setup:
+  stage: install
+  script:
+    - npm ci
+  artifacts:
+    paths:
+      - node_modules/
+    expire_in: 1 hour
+
+e2e:
+  stage: test
+  needs: [setup]
+  script:
+    - npx playwright test
+  artifacts:
+    when: always
+    paths:
+      - playwright-report/
+      - test-results/
+    expire_in: 14 days
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+```
+
+### Sharded Parallel Execution
+
+**Use when**: Test suite exceeds 10 minutes. GitLab's `parallel` keyword splits across jobs automatically.
+**Avoid when**: Suite runs under 5 minutes.
+
+```yaml
+image: mcr.microsoft.com/playwright:v1.48.0-noble
+
+stages:
+  - install
+  - test
+  - report
+
+variables:
+  CI: "true"
+  npm_config_cache: "$CI_PROJECT_DIR/.npm"
+
+cache:
+  key:
+    files:
+      - package-lock.json
+  paths:
+    - .npm/
+    - node_modules/
+
+setup:
+  stage: install
+  script:
+    - npm ci
+  artifacts:
+    paths:
+      - node_modules/
+    expire_in: 1 hour
+
+e2e:
+  stage: test
+  needs: [setup]
+  parallel: 4
+  script:
+    - npx playwright test --shard=$CI_NODE_INDEX/$CI_NODE_TOTAL
+  artifacts:
+    when: always
+    paths:
+      - blob-report/
+    expire_in: 1 hour
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+
+combine-reports:
+  stage: report
+  needs: [e2e]
+  when: always
+  script:
+    - npx playwright merge-reports --reporter=html ./blob-report
+  artifacts:
+    when: always
+    paths:
+      - playwright-report/
+    expire_in: 14 days
+```
+
+**Config for sharded pipelines:**
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  reporter: process.env.CI
+    ? [["blob"], ["dot"]]
+    : [["html", { open: "on-failure" }]],
+});
+```
+
+### Environment Variables and Secrets
+
+**Use when**: Tests need secrets (API keys, passwords) and should only run on merge requests or the default branch.
+
+```yaml
+image: mcr.microsoft.com/playwright:v1.48.0-noble
+
+stages:
+  - test
+
+variables:
+  CI: "true"
+
+e2e:staging:
+  stage: test
+  variables:
+    BASE_URL: $STAGING_URL
+    TEST_PASSWORD: $TEST_PASSWORD
+    API_KEY: $API_KEY
+  before_script:
+    - npm ci
+  script:
+    - npx playwright test
+  artifacts:
+    when: always
+    paths:
+      - playwright-report/
+      - test-results/
+    expire_in: 14 days
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+    - when: manual
+      allow_failure: true
+```
+
+**Setting variables in GitLab:**
+Navigate to **Settings > CI/CD > Variables** and add:
+
+- `STAGING_URL` -- not masked, not protected
+- `TEST_PASSWORD` -- masked, protected
+- `API_KEY` -- masked, protected
+
+### Multi-Browser Matrix
+
+**Use when**: Running Chromium on MRs and all browsers on the default branch.
+
+```yaml
+image: mcr.microsoft.com/playwright:v1.48.0-noble
+
+stages:
+  - install
+  - test
+
+variables:
+  CI: "true"
+
+setup:
+  stage: install
+  script:
+    - npm ci
+  artifacts:
+    paths:
+      - node_modules/
+    expire_in: 1 hour
+
+e2e:chromium:
+  stage: test
+  needs: [setup]
+  script:
+    - npx playwright test --project=chromium
+  artifacts:
+    when: always
+    paths:
+      - playwright-report/
+      - test-results/
+    expire_in: 14 days
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+
+e2e:all-browsers:
+  stage: test
+  needs: [setup]
+  parallel:
+    matrix:
+      - PROJECT: [chromium, firefox, webkit]
+  script:
+    - npx playwright test --project=$PROJECT
+  artifacts:
+    when: always
+    paths:
+      - playwright-report/
+      - test-results/
+    expire_in: 14 days
+  rules:
+    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+```
+
+### Services Integration (Database, Cache)
+
+**Use when**: Tests need the application running alongside Playwright, or you need external services.
+
+```yaml
+stages:
+  - test
+
+e2e:integration:
+  stage: test
+  image: mcr.microsoft.com/playwright:v1.48.0-noble
+  services:
+    - name: postgres:latest
+      alias: db
+    - name: redis:latest
+      alias: cache
+  variables:
+    CI: "true"
+    DATABASE_URL: "postgresql://postgres:postgres@db:5432/testdb"
+    REDIS_URL: "redis://cache:6379"
+    POSTGRES_PASSWORD: "postgres"
+    POSTGRES_DB: "testdb"
+  before_script:
+    - npm ci
+    - npx prisma db push
+    - npx prisma db seed
+  script:
+    - npx playwright test
+  artifacts:
+    when: always
+    paths:
+      - playwright-report/
+      - test-results/
+    expire_in: 14 days
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+```
+
+### Scheduled Nightly Regression
+
+**Use when**: Full regression is too slow for every MR.
+
+```yaml
+e2e:nightly:
+  stage: test
+  image: mcr.microsoft.com/playwright:v1.48.0-noble
+  before_script:
+    - npm ci
+  script:
+    - npx playwright test --grep @regression
+  artifacts:
+    when: always
+    paths:
+      - playwright-report/
+    expire_in: 30 days
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule"
+```
+
+Set up the schedule in **CI/CD > Schedules**: `0 3 * * 1-5` (3 AM UTC, weekdays).
+
+## Decision Guide
+
+| Scenario                             | Approach                                               | Why                                                 |
+| ------------------------------------ | ------------------------------------------------------ | --------------------------------------------------- |
+| Simple project, < 5 min suite        | Single `test` job using Playwright Docker image        | No sharding overhead; artifacts capture report      |
+| Suite > 10 min                       | `parallel: N` with `--shard`                           | GitLab auto-assigns `CI_NODE_INDEX`/`CI_NODE_TOTAL` |
+| Merge request fast feedback          | Chromium only on MRs; all browsers on main             | 3x fewer pipeline minutes on MRs                    |
+| External services needed (DB, Redis) | `services:` keyword with Postgres/Redis images         | GitLab manages service lifecycle                    |
+| Secrets for staging environment      | GitLab CI/CD Variables (masked + protected)            | Never hardcode secrets in `.gitlab-ci.yml`          |
+| Full nightly regression              | Pipeline schedule (`CI_PIPELINE_SOURCE == "schedule"`) | Avoids blocking MR pipelines                        |
+| Report browsing                      | `artifacts:` with `paths: [playwright-report/]`        | Browse directly in GitLab job artifacts UI          |
+
+## Anti-Patterns
+
+| Anti-Pattern                                         | Problem                                                            | Do This Instead                                                           |
+| ---------------------------------------------------- | ------------------------------------------------------------------ | ------------------------------------------------------------------------- |
+| Not using the Playwright Docker image                | Installing browsers every run adds 1-2 minutes                     | Use `mcr.microsoft.com/playwright:v1.48.0-noble` as base image                   |
+| `artifacts: when: on_failure` only                   | No report when tests pass; can't verify results                    | Use `when: always` to capture reports regardless                          |
+| No `expire_in` on artifacts                          | Artifacts accumulate and consume storage                           | Set `expire_in: 14 days` for reports, `1 hour` for intermediate artifacts |
+| Hardcoding `CI_NODE_TOTAL` in shard flag             | Breaks when you change `parallel:` value                           | Use `--shard=$CI_NODE_INDEX/$CI_NODE_TOTAL`                               |
+| Skipping `needs:` between stages                     | Jobs wait for all previous stage jobs, not just their dependencies | Use `needs:` for precise dependency graphs                                |
+| Large `cache:` including `node_modules/` without key | Stale cache causes version conflicts                               | Key cache on `package-lock.json` hash                                     |
+
+## Troubleshooting
+
+### Browser launch fails: "Failed to launch browser"
+
+**Cause**: Not using the Playwright Docker image, or using a version that doesn't match your `@playwright/test` version.
+
+**Fix**: Match the Docker image tag to your Playwright version:
+
+```yaml
+# Check your version: npm ls @playwright/test
+image: mcr.microsoft.com/playwright:v1.48.0-noble
+```
+
+### Tests hang in GitLab runner: "Navigation timeout exceeded"
+
+**Cause**: GitLab shared runners may have limited resources.
+
+**Fix**: Reduce workers and increase timeouts:
+
+```typescript
+export default defineConfig({
+  workers: process.env.CI ? 2 : undefined,
+  use: {
+    navigationTimeout: process.env.CI ? 30_000 : 15_000,
+  },
+});
+```
+
+### Pipeline runs on every push, not just merge requests
+
+**Cause**: Missing `rules:` configuration.
+
+**Fix**: Add explicit rules:
+
+```yaml
+rules:
+  - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+  - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+```
+
+### Services (Postgres/Redis) not reachable from tests
+
+**Cause**: Using `localhost` instead of the service alias.
+
+**Fix**: Use the service alias as hostname:
+
+```yaml
+services:
+  - name: postgres:latest
+    alias: db
+
+variables:
+  DATABASE_URL: "postgresql://postgres:postgres@db:5432/testdb"
+```
+
+### Merged report is empty after sharded run
+
+**Cause**: Each shard job needs the `blob` reporter, not `html`.
+
+**Fix**: Configure blob reporter for CI:
+
+```typescript
+export default defineConfig({
+  reporter: process.env.CI
+    ? [["blob"], ["dot"]]
+    : [["html", { open: "on-failure" }]],
+});
+```
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/other-providers.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/other-providers.md
new file mode 100644
index 0000000000..ca80859320
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/other-providers.md
@@ -0,0 +1,521 @@
+# CI: CircleCI, Azure DevOps, and Jenkins
+
+> **When to use**: Running Playwright tests in CI platforms other than GitHub Actions or GitLab.
+
+## Table of Contents
+
+1. [Common Commands](#common-commands)
+2. [Jenkins](#jenkins)
+3. [CircleCI](#circleci)
+4. [Azure DevOps](#azure-devops)
+5. [JUnit Reporter Config](#junit-reporter-config)
+6. [Platform Comparison](#platform-comparison)
+7. [Troubleshooting](#troubleshooting)
+8. [Anti-Patterns](#anti-patterns)
+
+---
+
+## Common Commands
+
+```bash
+npx playwright install --with-deps    # browsers + OS dependencies
+npx playwright test --shard=1/4       # parallel sharding
+npx playwright merge-reports ./blob-report  # combine shard results
+npx playwright test --reporter=dot,html     # multiple reporters
+```
+
+## Jenkins
+
+### Declarative Pipeline
+
+```groovy
+// Jenkinsfile
+pipeline {
+    agent {
+        docker {
+            image 'mcr.microsoft.com/playwright:v1.48.0-noble'
+            args '-u root'
+        }
+    }
+
+    environment {
+        CI = 'true'
+        HOME = '/root'
+        npm_config_cache = "${WORKSPACE}/.npm"
+    }
+
+    options {
+        timeout(time: 30, unit: 'MINUTES')
+        disableConcurrentBuilds()
+    }
+
+    stages {
+        stage('Install') {
+            steps {
+                sh 'npm ci'
+            }
+        }
+
+        stage('Test') {
+            steps {
+                sh 'npx playwright test'
+            }
+            post {
+                always {
+                    junit allowEmptyResults: true,
+                         testResults: 'results/junit.xml'
+                    archiveArtifacts artifacts: 'pw-report/**',
+                                     allowEmptyArchive: true
+                    archiveArtifacts artifacts: 'results/**',
+                                     allowEmptyArchive: true
+                }
+            }
+        }
+    }
+
+    post {
+        failure {
+            echo 'Tests failed!'
+        }
+        cleanup {
+            cleanWs()
+        }
+    }
+}
+```
+
+### Parallel Shards
+
+```groovy
+// Jenkinsfile (sharded)
+pipeline {
+    agent none
+
+    environment {
+        CI = 'true'
+        HOME = '/root'
+    }
+
+    options {
+        timeout(time: 30, unit: 'MINUTES')
+    }
+
+    stages {
+        stage('Test') {
+            parallel {
+                stage('Shard 1') {
+                    agent {
+                        docker {
+                            image 'mcr.microsoft.com/playwright:v1.48.0-noble'
+                            args '-u root'
+                        }
+                    }
+                    steps {
+                        sh 'npm ci'
+                        sh 'npx playwright test --shard=1/4'
+                    }
+                    post {
+                        always {
+                            archiveArtifacts artifacts: 'blob-report/**',
+                                             allowEmptyArchive: true
+                        }
+                    }
+                }
+                stage('Shard 2') {
+                    agent {
+                        docker {
+                            image 'mcr.microsoft.com/playwright:v1.48.0-noble'
+                            args '-u root'
+                        }
+                    }
+                    steps {
+                        sh 'npm ci'
+                        sh 'npx playwright test --shard=2/4'
+                    }
+                    post {
+                        always {
+                            archiveArtifacts artifacts: 'blob-report/**',
+                                             allowEmptyArchive: true
+                        }
+                    }
+                }
+                stage('Shard 3') {
+                    agent {
+                        docker {
+                            image 'mcr.microsoft.com/playwright:v1.48.0-noble'
+                            args '-u root'
+                        }
+                    }
+                    steps {
+                        sh 'npm ci'
+                        sh 'npx playwright test --shard=3/4'
+                    }
+                    post {
+                        always {
+                            archiveArtifacts artifacts: 'blob-report/**',
+                                             allowEmptyArchive: true
+                        }
+                    }
+                }
+                stage('Shard 4') {
+                    agent {
+                        docker {
+                            image 'mcr.microsoft.com/playwright:v1.48.0-noble'
+                            args '-u root'
+                        }
+                    }
+                    steps {
+                        sh 'npm ci'
+                        sh 'npx playwright test --shard=4/4'
+                    }
+                    post {
+                        always {
+                            archiveArtifacts artifacts: 'blob-report/**',
+                                             allowEmptyArchive: true
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+## CircleCI
+
+### Basic Pipeline
+
+```yaml
+# .circleci/config.yml
+version: 2.1
+
+executors:
+  pw:
+    docker:
+      - image: mcr.microsoft.com/playwright:v1.48.0-noble
+    working_directory: ~/app
+
+jobs:
+  install:
+    executor: pw
+    steps:
+      - checkout
+      - restore_cache:
+          keys:
+            - deps-{{ checksum "package-lock.json" }}
+      - run: npm ci
+      - save_cache:
+          key: deps-{{ checksum "package-lock.json" }}
+          paths:
+            - node_modules
+      - persist_to_workspace:
+          root: .
+          paths:
+            - node_modules
+
+  test:
+    executor: pw
+    parallelism: 4
+    steps:
+      - checkout
+      - attach_workspace:
+          at: .
+      - run:
+          name: Run tests
+          command: |
+            npx playwright test --shard=$((CIRCLE_NODE_INDEX + 1))/$CIRCLE_NODE_TOTAL
+      - store_artifacts:
+          path: pw-report
+          destination: pw-report
+      - store_artifacts:
+          path: results
+          destination: results
+      - store_test_results:
+          path: results/junit.xml
+
+workflows:
+  test:
+    jobs:
+      - install
+      - test:
+          requires:
+            - install
+```
+
+### Using Orbs
+
+```yaml
+# .circleci/config.yml
+version: 2.1
+
+orbs:
+  node: circleci/node@latest
+
+executors:
+  pw:
+    docker:
+      - image: mcr.microsoft.com/playwright:v1.48.0-noble
+
+jobs:
+  e2e:
+    executor: pw
+    parallelism: 4
+    steps:
+      - checkout
+      - node/install-packages
+      - run:
+          name: Run tests
+          command: npx playwright test --shard=$((CIRCLE_NODE_INDEX + 1))/$CIRCLE_NODE_TOTAL
+      - store_artifacts:
+          path: pw-report
+      - store_test_results:
+          path: results/junit.xml
+
+workflows:
+  main:
+    jobs:
+      - e2e
+```
+
+## Azure DevOps
+
+### Basic Pipeline
+
+```yaml
+# azure-pipelines.yml
+trigger:
+  branches:
+    include:
+      - main
+
+pr:
+  branches:
+    include:
+      - main
+
+pool:
+  vmImage: "ubuntu-latest"
+
+variables:
+  CI: "true"
+  npm_config_cache: $(Pipeline.Workspace)/.npm
+
+steps:
+  - task: NodeTool@0
+    inputs:
+      versionSpec: "20.x"
+    displayName: "Install Node.js"
+
+  - task: Cache@2
+    inputs:
+      key: 'npm | "$(Agent.OS)" | package-lock.json'
+      restoreKeys: |
+        npm | "$(Agent.OS)"
+      path: $(npm_config_cache)
+    displayName: "Cache npm"
+
+  - script: npm ci
+    displayName: "Install dependencies"
+
+  - script: npx playwright install --with-deps
+    displayName: "Install browsers"
+
+  - script: npx playwright test
+    displayName: "Run tests"
+
+  - task: PublishTestResults@2
+    condition: always()
+    inputs:
+      testResultsFormat: "JUnit"
+      testResultsFiles: "results/junit.xml"
+      mergeTestResults: true
+      testRunTitle: "E2E Tests"
+    displayName: "Publish results"
+
+  - task: PublishPipelineArtifact@1
+    condition: always()
+    inputs:
+      targetPath: pw-report
+      artifact: pw-report
+      publishLocation: "pipeline"
+    displayName: "Upload report"
+```
+
+### With Sharding
+
+```yaml
+# azure-pipelines.yml
+trigger:
+  branches:
+    include:
+      - main
+
+pr:
+  branches:
+    include:
+      - main
+
+variables:
+  CI: "true"
+
+stages:
+  - stage: Test
+    jobs:
+      - job: E2E
+        pool:
+          vmImage: "ubuntu-latest"
+        strategy:
+          matrix:
+            shard1:
+              SHARD: "1/4"
+            shard2:
+              SHARD: "2/4"
+            shard3:
+              SHARD: "3/4"
+            shard4:
+              SHARD: "4/4"
+        steps:
+          - task: NodeTool@0
+            inputs:
+              versionSpec: "20.x"
+
+          - script: npm ci
+            displayName: "Install dependencies"
+
+          - script: npx playwright install --with-deps
+            displayName: "Install browsers"
+
+          - script: npx playwright test --shard=$(SHARD)
+            displayName: "Run tests (shard $(SHARD))"
+
+          - task: PublishPipelineArtifact@1
+            condition: always()
+            inputs:
+              targetPath: blob-report
+              artifact: blob-report-$(System.JobPositionInPhase)
+            displayName: "Upload blob report"
+
+  - stage: Report
+    dependsOn: Test
+    condition: always()
+    jobs:
+      - job: MergeReports
+        pool:
+          vmImage: "ubuntu-latest"
+        steps:
+          - task: NodeTool@0
+            inputs:
+              versionSpec: "20.x"
+
+          - script: npm ci
+            displayName: "Install dependencies"
+
+          - task: DownloadPipelineArtifact@2
+            inputs:
+              patterns: "blob-report-*/**"
+              path: all-blob-reports
+            displayName: "Download blob reports"
+
+          - script: npx playwright merge-reports --reporter=html ./all-blob-reports
+            displayName: "Merge reports"
+
+          - task: PublishPipelineArtifact@1
+            inputs:
+              targetPath: pw-report
+              artifact: pw-report
+            displayName: "Upload merged report"
+```
+
+## JUnit Reporter Config
+
+All platforms benefit from JUnit output for native test result display:
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  reporter: process.env.CI
+    ? [
+        ["dot"],
+        ["html", { open: "never" }],
+        ["junit", { outputFile: "results/junit.xml" }],
+      ]
+    : [["html", { open: "on-failure" }]],
+});
+```
+
+## Platform Comparison
+
+| Feature           | CircleCI                                        | Azure DevOps                     | Jenkins                |
+| ----------------- | ----------------------------------------------- | -------------------------------- | ---------------------- |
+| Docker support    | `docker:` executor                              | `vmImage` or container jobs      | Docker Pipeline plugin |
+| Parallelism       | `parallelism: N` + `CIRCLE_NODE_INDEX`          | `strategy.matrix`                | `parallel` stages      |
+| Artifact upload   | `store_artifacts`                               | `PublishPipelineArtifact@1`      | `archiveArtifacts`     |
+| JUnit integration | `store_test_results`                            | `PublishTestResults@2`           | `junit` step           |
+| Shard variable    | `$((CIRCLE_NODE_INDEX + 1))/$CIRCLE_NODE_TOTAL` | Define in matrix: `SHARD: '1/4'` | Hardcode per stage     |
+| Cache key         | `checksum "package-lock.json"`                  | `Cache@2` with key template      | `stash`/`unstash`      |
+| Secrets           | Context + env variables                         | Variable groups                  | Credentials plugin     |
+
+## Troubleshooting
+
+### Jenkins: "Browser closed unexpectedly"
+
+Running as non-root in container causes sandbox issues.
+
+```groovy
+agent {
+    docker {
+        image 'mcr.microsoft.com/playwright:v1.48.0-noble'
+        args '-u root'
+    }
+}
+environment {
+    HOME = '/root'
+}
+```
+
+### CircleCI: "Executable doesn't exist"
+
+Image version mismatch with `@playwright/test` version. Use `latest` tag or match versions:
+
+```yaml
+docker:
+  - image: mcr.microsoft.com/playwright:v1.48.0-noble
+```
+
+### Azure DevOps: Test results not showing
+
+Missing JUnit reporter or `PublishTestResults@2` task:
+
+```typescript
+reporter: [['junit', { outputFile: 'results/junit.xml' }]],
+```
+
+```yaml
+- task: PublishTestResults@2
+  condition: always()
+  inputs:
+    testResultsFormat: "JUnit"
+    testResultsFiles: "results/junit.xml"
+```
+
+### Shard index off by one
+
+CircleCI's `CIRCLE_NODE_INDEX` is 0-based, Playwright's `--shard` is 1-based:
+
+```yaml
+# CircleCI - add 1
+command: npx playwright test --shard=$((CIRCLE_NODE_INDEX + 1))/$CIRCLE_NODE_TOTAL
+```
+
+## Anti-Patterns
+
+| Anti-Pattern                        | Problem                                   | Solution                                             |
+| ----------------------------------- | ----------------------------------------- | ---------------------------------------------------- |
+| Missing `--with-deps` on bare metal | OS libs missing, browser launch fails     | Use Playwright Docker image or `--with-deps`         |
+| No JUnit reporter                   | CI can't display test results             | Add `['junit', { outputFile: 'results/junit.xml' }]` |
+| No job timeout                      | Hung tests consume resources indefinitely | Set explicit timeout (20-30 min)                     |
+| No artifact upload on success       | Can't verify passing results              | Always upload reports (`condition: always()`)        |
+| Non-root in container without setup | Permission errors on browser binaries     | Run as root or configure permissions                 |
+| Hardcoded shard count               | Must update multiple places               | Use CI-native variables                              |
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/parallel-sharding.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/parallel-sharding.md
new file mode 100644
index 0000000000..898a425959
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/parallel-sharding.md
@@ -0,0 +1,371 @@
+# Sharding and Parallel Execution
+
+## Table of Contents
+
+1. [CLI Commands](#cli-commands)
+2. [Patterns](#patterns)
+3. [Decision Guide](#decision-guide)
+4. [Anti-Patterns](#anti-patterns)
+5. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Speeding up test suites by running tests concurrently on one machine (workers) or splitting across multiple CI jobs (sharding).
+
+## CLI Commands
+
+```bash
+# Parallelism within one machine
+npx playwright test --workers=4
+npx playwright test --workers=50%
+
+# Splitting across CI jobs
+npx playwright test --shard=1/4
+npx playwright test --shard=2/4
+
+# Merging shard outputs
+npx playwright merge-reports ./blob-report
+npx playwright merge-reports --reporter=html,json ./blob-report
+
+# Override config for single run
+npx playwright test --fully-parallel
+```
+
+## Patterns
+
+### Worker Configuration
+
+**Use when**: Controlling concurrent test execution on a single machine.
+
+```ts
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  // Tests WITHIN a file also run in parallel
+  fullyParallel: true,
+
+  // Worker count options:
+  // - undefined: auto-detect (half CPU cores)
+  // - number: fixed count
+  // - string: percentage of cores
+  workers: process.env.CI ? "50%" : undefined,
+});
+```
+
+**`fullyParallel` behavior:**
+
+| Setting                          | Files parallel | Tests in file parallel |
+| -------------------------------- | -------------- | ---------------------- |
+| `fullyParallel: false` (default) | Yes            | No (serial)            |
+| `fullyParallel: true`            | Yes            | Yes                    |
+
+**Serial execution for specific files:**
+
+```ts
+// tests/checkout-flow.spec.ts
+import { test, expect } from "@playwright/test";
+
+test.describe.configure({ mode: "serial" });
+
+test("add items to cart", async ({ page }) => {
+  // ...
+});
+
+test("complete payment", async ({ page }) => {
+  // ...
+});
+```
+
+### Sharding Across CI Machines
+
+**Use when**: Suite exceeds 5 minutes even with maximum workers.
+
+```bash
+# Job 1            Job 2            Job 3            Job 4
+--shard=1/4        --shard=2/4      --shard=3/4      --shard=4/4
+```
+
+**Config for sharded runs:**
+
+```ts
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  fullyParallel: true,
+  workers: process.env.CI ? "50%" : undefined,
+
+  reporter: process.env.CI
+    ? [["blob"], ["github"]]
+    : [["html", { open: "on-failure" }]],
+});
+```
+
+### Merging Shard Reports
+
+**Use when**: Combining blob reports from multiple shards into a unified report.
+
+```bash
+# Merge all blobs into HTML
+npx playwright merge-reports --reporter=html ./all-blob-reports
+
+# Multiple formats
+npx playwright merge-reports --reporter=html,json,junit ./all-blob-reports
+
+# Custom output location
+PLAYWRIGHT_HTML_REPORT=merged-report npx playwright merge-reports --reporter=html ./all-blob-reports
+```
+
+**GitHub Actions merge job:**
+
+```yaml
+merge-reports:
+  if: ${{ !cancelled() }}
+  needs: test
+  runs-on: ubuntu-latest
+  steps:
+    - uses: actions/checkout@v4
+    - run: npm ci
+
+    - uses: actions/download-artifact@v4
+      with:
+        path: all-blob-reports
+        pattern: blob-report-*
+        merge-multiple: true
+
+    - run: npx playwright merge-reports --reporter=html ./all-blob-reports
+
+    - uses: actions/upload-artifact@v4
+      with:
+        name: playwright-report
+        path: playwright-report/
+        retention-days: 14
+```
+
+### Worker-Scoped Fixtures
+
+**Use when**: Expensive resources (DB connections, auth tokens) should be created once per worker, not per test.
+
+```ts
+// fixtures.ts
+import { test as base } from "@playwright/test";
+
+type WorkerFixtures = {
+  dbClient: DatabaseClient;
+  apiToken: string;
+};
+
+export const test = base.extend<{}, WorkerFixtures>({
+  dbClient: [
+    async ({}, use) => {
+      const client = await DatabaseClient.connect(process.env.DB_URL!);
+      await use(client);
+      await client.disconnect();
+    },
+    { scope: "worker" },
+  ],
+
+  apiToken: [
+    async ({}, use, workerInfo) => {
+      const res = await fetch(`${process.env.API_URL}/auth`, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({
+          user: `test-user-${workerInfo.workerIndex}`,
+          password: process.env.TEST_PASSWORD,
+        }),
+      });
+      const { token } = await res.json();
+      await use(token);
+    },
+    { scope: "worker" },
+  ],
+});
+
+export { expect } from "@playwright/test";
+```
+
+### Test Isolation for Parallelism
+
+**Use when**: Preparing tests to run without interference.
+
+Each test must create its own state. No test should depend on or modify shared state.
+
+```ts
+// BAD: Shared user causes race conditions
+test("edit settings", async ({ page }) => {
+  await page.goto("/users/test-user/settings");
+  await page.getByLabel("Email").fill("new@example.com");
+  await page.getByRole("button", { name: "Save" }).click();
+});
+
+// GOOD: Unique user per test
+test("edit settings", async ({ page, request }) => {
+  const res = await request.post("/api/users", {
+    data: { name: `user-${Date.now()}`, email: `${Date.now()}@test.com` },
+  });
+  const user = await res.json();
+
+  await page.goto(`/users/${user.id}/settings`);
+  await page.getByLabel("Email").fill("updated@example.com");
+  await page.getByRole("button", { name: "Save" }).click();
+  await expect(page.getByLabel("Email")).toHaveValue("updated@example.com");
+
+  await request.delete(`/api/users/${user.id}`);
+});
+```
+
+**Using `testInfo` for unique identifiers:**
+
+```ts
+import { test, expect } from "@playwright/test";
+
+test("submit order", async ({ page }, testInfo) => {
+  const orderId = `order-${testInfo.workerIndex}-${Date.now()}`;
+  await page.goto(`/orders/new?ref=${orderId}`);
+  // ...
+});
+```
+
+### Dynamic Shard Count
+
+**Use when**: Automatically adjusting shards based on test count.
+
+```yaml
+# .github/workflows/playwright.yml
+jobs:
+  calculate-shards:
+    runs-on: ubuntu-latest
+    outputs:
+      shard-count: ${{ steps.calc.outputs.count }}
+      shard-matrix: ${{ steps.calc.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+      - run: npm ci
+      - id: calc
+        run: |
+          TEST_COUNT=$(npx playwright test --list --reporter=json 2>/dev/null | node -e "
+            const data = require('fs').readFileSync('/dev/stdin', 'utf8');
+            const parsed = JSON.parse(data);
+            console.log(parsed.suites?.reduce((acc, s) => acc + (s.specs?.length || 0), 0) || 0);
+          ")
+          # 1 shard per 20 tests, min 1, max 8
+          SHARDS=$(( (TEST_COUNT + 19) / 20 ))
+          SHARDS=$(( SHARDS > 8 ? 8 : SHARDS ))
+          SHARDS=$(( SHARDS < 1 ? 1 : SHARDS ))
+          MATRIX="["
+          for i in $(seq 1 $SHARDS); do
+            [ $i -gt 1 ] && MATRIX+=","
+            MATRIX+="\"$i/$SHARDS\""
+          done
+          MATRIX+="]"
+          echo "count=$SHARDS" >> $GITHUB_OUTPUT
+          echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
+
+  test:
+    needs: calculate-shards
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        shard: ${{ fromJson(needs.calculate-shards.outputs.shard-matrix) }}
+    steps:
+      - uses: actions/checkout@v4
+      - run: npm ci
+      - run: npx playwright install --with-deps
+      - run: npx playwright test --shard=${{ matrix.shard }}
+```
+
+## Decision Guide
+
+| Scenario                         | Workers        | Shards | Reason                                  |
+| -------------------------------- | -------------- | ------ | --------------------------------------- |
+| < 50 tests, < 5 min              | Auto (default) | None   | No optimization needed                  |
+| 50-200 tests, 5-15 min           | `'50%'` in CI  | 2-4    | Balance speed and cost                  |
+| 200+ tests, > 15 min             | `'50%'` in CI  | 4-8    | Keep feedback under 10 min              |
+| Flaky due to resource contention | Reduce to 2    | Keep   | Less CPU/memory pressure                |
+| Tests modify shared database     | 1 or isolate   | Useful | Sharding splits files; workers run them |
+| CI has limited resources         | 1 or `'25%'`   | More   | Compensate with more machines           |
+
+| Aspect         | Workers (in-process)      | Shards (across machines)   |
+| -------------- | ------------------------- | -------------------------- |
+| What it splits | Tests across CPU cores    | Test files across CI jobs  |
+| Controlled by  | Config or `--workers` CLI | `--shard=X/Y` CLI flag     |
+| Shares memory  | Yes                       | No                         |
+| Report merging | Not needed                | Required (`merge-reports`) |
+| Cost           | Free (same machine)       | More CI minutes            |
+
+## Anti-Patterns
+
+| Anti-Pattern                            | Problem                                  | Solution                                             |
+| --------------------------------------- | ---------------------------------------- | ---------------------------------------------------- |
+| `fullyParallel: false` without reason   | Tests in files run serially              | Set `fullyParallel: true` unless tests need serial   |
+| `workers: 1` in CI "for safety"         | Negates parallelism                      | Fix isolation issues; use `workers: '50%'`           |
+| Hardcoded shared user account           | Race conditions in parallel runs         | Each test creates unique data                        |
+| Sharding without blob reporter          | Each shard produces separate HTML report | Configure `reporter: [['blob']]` for CI              |
+| Sharding with 3 tests                   | Setup overhead exceeds time saved        | Only shard when suite > 5 minutes                    |
+| `test.describe.serial()` everywhere     | Kills parallelism, creates dependencies  | Use only when tests genuinely need prior state       |
+| Workers > CPU cores                     | Context switching overhead               | Use `'50%'` or auto-detect                           |
+| Missing `fail-fast: false` in CI matrix | One shard failure cancels others         | Always set `fail-fast: false` for sharded strategies |
+
+## Troubleshooting
+
+### Tests pass solo but fail together
+
+- **Shared state**. Make test data unique:
+  ```ts
+  test("create item", async ({ request }, ti) => {
+    await request.post("/api/items", {
+      data: { name: `Item-${ti.workerIndex}-${Date.now()}` },
+    });
+  });
+  ```
+
+### "No tests found" in some shards
+
+- **Too many shards**. Never exceed file count:
+  ```bash
+  npx playwright test --shard=1/10   # ok if 10 files
+  npx playwright test --shard=1/20   # too many, some shards empty
+  ```
+
+### Merged report missing results
+
+- **Blob reports collide**. Use unique names:
+  ```yaml
+  # Each shard
+  - uses: actions/upload-artifact@v4
+    with:
+      name: blob-report-${{ strategy.job-index }}
+      path: blob-report/
+  # Merge step
+  - uses: actions/download-artifact@v4
+    with:
+      pattern: blob-report-*
+      merge-multiple: true
+      path: all-blob-reports
+  ```
+
+### Worker-scoped fixture not working
+
+- **Missing `{ scope: 'worker' }`**. Fix:
+  ```ts
+  export const test = base.extend({
+    resource: [
+      async ({}, use) => {
+        const r = await Resource.create();
+        await use(r);
+        await r.destroy();
+      },
+      { scope: "worker" },
+    ],
+  });
+  ```
+
+### More workers = Slower
+
+- **Too many workers thrash**. Limit in CI:
+  ```ts
+  export default defineConfig({
+    workers: process.env.CI ? 2 : undefined,
+  });
+  ```
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/performance.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/performance.md
new file mode 100644
index 0000000000..bc03da8e92
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/performance.md
@@ -0,0 +1,453 @@
+# Performance & Parallelization
+
+## Table of Contents
+
+1. [Parallel Execution](#parallel-execution)
+2. [Sharding](#sharding)
+3. [Test Optimization](#test-optimization)
+4. [Network Optimization](#network-optimization)
+5. [Isolation and Parallel Execution](#isolation-and-parallel-execution)
+6. [Resource Management](#resource-management)
+7. [Benchmarking](#benchmarking)
+
+## Parallel Execution
+
+### Configuration
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  // Run test files in parallel
+  fullyParallel: true,
+
+  // Number of worker processes
+  workers: process.env.CI ? 1 : undefined, // undefined = half CPU cores
+
+  // Or explicit count
+  // workers: 4,
+  // workers: '50%', // Percentage of CPU cores
+});
+```
+
+### Serial Execution When Needed
+
+```typescript
+// Entire file serial
+test.describe.configure({ mode: "serial" });
+
+test.describe("Sequential Tests", () => {
+  test("first", async ({ page }) => {
+    // Runs first
+  });
+
+  test("second", async ({ page }) => {
+    // Runs after first
+  });
+});
+```
+
+```typescript
+// Single describe block serial
+test.describe("Parallel Tests", () => {
+  test("a", async () => {}); // Parallel
+  test("b", async () => {}); // Parallel
+});
+
+test.describe.serial("Serial Tests", () => {
+  test("c", async () => {}); // Serial
+  test("d", async () => {}); // Serial
+});
+```
+
+### Parallel Projects
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    { name: "chromium", use: { ...devices["Desktop Chrome"] } },
+    { name: "firefox", use: { ...devices["Desktop Firefox"] } },
+    { name: "webkit", use: { ...devices["Desktop Safari"] } },
+  ],
+});
+```
+
+```bash
+# Run all projects in parallel
+npx playwright test
+
+# Run specific project
+npx playwright test --project=chromium
+```
+
+## Sharding
+
+### Basic Sharding
+
+```bash
+# Split tests across 4 machines
+# Machine 1:
+npx playwright test --shard=1/4
+
+# Machine 2:
+npx playwright test --shard=2/4
+
+# Machine 3:
+npx playwright test --shard=3/4
+
+# Machine 4:
+npx playwright test --shard=4/4
+```
+
+### Sharding Strategy
+
+Tests are distributed evenly by file. For optimal sharding:
+
+- Keep test files similar in size
+- Use `fullyParallel: true` for even distribution
+- Balance slow tests across files
+
+### CI Sharding Pattern
+
+```yaml
+# GitHub Actions
+jobs:
+  test:
+    strategy:
+      matrix:
+        shard: [1, 2, 3, 4]
+    steps:
+      - run: npx playwright test --shard=${{ matrix.shard }}/4
+```
+
+> **For comprehensive CI sharding** (blob reports, merging sharded results, full workflows), see [ci-cd.md](ci-cd.md#sharding).
+
+## Test Optimization
+
+### Reuse Authentication
+
+Avoid logging in for every test. Use setup projects with storage state to authenticate once and reuse the session.
+
+> **For authentication patterns** (storage state, multiple auth states, setup projects), see [fixtures-hooks.md](fixtures-hooks.md#authentication-patterns).
+
+### Reuse Page State (serial only — trade-off with isolation)
+
+Sharing a single page/context across tests with `beforeAll`/`afterAll` is **not recommended** for most suites: it breaks test isolation, causes state leak between tests, and makes failures harder to debug. Prefer a fresh `page` per test (Playwright default). Use shared page only when you explicitly need serial execution and accept no isolation.
+
+```typescript
+// ⚠️ Serial only, no isolation: state from one test leaks into the next.
+// Prefer test.describe.configure({ mode: 'serial' }) + fresh page per test, or beforeEach + page.goto().
+test.describe.configure({ mode: "serial" });
+test.describe("Dashboard", () => {
+  let page: Page;
+
+  test.beforeAll(async ({ browser }) => {
+    const context = await browser.newContext({
+      storageState: ".auth/user.json",
+    });
+    page = await context.newPage();
+    await page.goto("/dashboard");
+  });
+
+  test.afterAll(async () => {
+    await page?.close();
+  });
+
+  test("shows stats", async () => {
+    await expect(page.getByTestId("stats")).toBeVisible();
+  });
+
+  test("shows chart", async () => {
+    await expect(page.getByTestId("chart")).toBeVisible();
+  });
+});
+```
+
+### Lazy Navigation
+
+```typescript
+// Bad: Navigate in every test
+test("check header", async ({ page }) => {
+  await page.goto("/products");
+  await expect(page.getByRole("heading")).toBeVisible();
+});
+
+test("check footer", async ({ page }) => {
+  await page.goto("/products");
+  await expect(page.getByRole("contentinfo")).toBeVisible();
+});
+
+// Good: Share navigation
+test.describe("Products Page", () => {
+  test.beforeEach(async ({ page }) => {
+    await page.goto("/products");
+  });
+
+  test("check header", async ({ page }) => {
+    await expect(page.getByRole("heading")).toBeVisible();
+  });
+
+  test("check footer", async ({ page }) => {
+    await expect(page.getByRole("contentinfo")).toBeVisible();
+  });
+});
+```
+
+### Skip Unnecessary Setup
+
+```typescript
+// Use test.skip for conditional execution
+test("admin feature", async ({ page }) => {
+  test.skip(!process.env.ADMIN_ENABLED, "Admin features disabled");
+  // ...
+});
+
+// Use test.fixme for known broken tests
+test.fixme("broken feature", async ({ page }) => {
+  // Skipped but tracked
+});
+```
+
+## Network Optimization
+
+### Mock APIs
+
+```typescript
+test.beforeEach(async ({ page }) => {
+  // Mock slow/heavy endpoints
+  await page.route("**/api/analytics", (route) =>
+    route.fulfill({ json: { views: 1000 } }),
+  );
+
+  await page.route("**/api/recommendations", (route) =>
+    route.fulfill({ json: [] }),
+  );
+});
+```
+
+### Block Unnecessary Resources
+
+```typescript
+test.beforeEach(async ({ page }) => {
+  // Block analytics, ads, tracking
+  await page.route("**/*", (route) => {
+    const url = route.request().url();
+    if (
+      url.includes("google-analytics") ||
+      url.includes("facebook") ||
+      url.includes("hotjar")
+    ) {
+      return route.abort();
+    }
+    return route.continue();
+  });
+});
+```
+
+### Block Resource Types
+
+```typescript
+// Block images and fonts for faster tests
+await page.route("**/*", (route) => {
+  const resourceType = route.request().resourceType();
+  if (["image", "font", "stylesheet"].includes(resourceType)) {
+    return route.abort();
+  }
+  return route.continue();
+});
+```
+
+### Cache API Responses
+
+```typescript
+const apiCache = new Map<string, object>();
+
+test.beforeEach(async ({ page }) => {
+  await page.route("**/api/**", async (route) => {
+    const url = route.request().url();
+
+    if (apiCache.has(url)) {
+      return route.fulfill({ json: apiCache.get(url) });
+    }
+
+    const response = await route.fetch();
+    const json = await response.json();
+    apiCache.set(url, json);
+    return route.fulfill({ json });
+  });
+});
+```
+
+## Isolation and Parallel Execution
+
+### Default: one context per test
+
+Playwright gives each test its own browser context (and page). That gives isolation: no shared cookies, storage, or DOM between tests, so failures don’t carry over and you can run tests in any order or in parallel. Keep this default unless you have a clear reason to share state.
+
+### Avoiding state leak in parallel runs
+
+- **Do not** rely on shared mutable state (e.g. a single `page` or `context` in `beforeAll`) when tests can run in parallel. State from one test can leak into another and cause flaky, order-dependent failures.
+- Use **fixtures** for setup/teardown and **`beforeEach`** for per-test navigation so each test gets a fresh page or a clean slate.
+- For **backend or DB state** shared across tests, isolate per worker so parallel workers don’t collide. Use a worker-scoped fixture and `testInfo.workerIndex` (or `process.env.TEST_WORKER_INDEX`) to create unique data per worker (e.g. unique user or DB prefix). See [fixtures-hooks.md](../core/fixtures-hooks.md) for worker-scoped fixtures and [debugging.md](../debugging/debugging.md) for debugging flaky parallel runs.
+
+### Debugging flaky parallel runs
+
+If a test is flaky only with multiple workers:
+
+1. **Reproduce**: Run with default workers and `--repeat-each=10` (or `--repeat-each=100 --max-failures=1`).
+2. **Confirm parallel-specific**: Run with `--workers=1`. If the failure disappears, the cause is likely shared state or non-isolated backend/DB data.
+3. **Fix**: Remove shared page/context; use per-test fixtures and `beforeEach`; isolate test data per worker with `workerIndex` in a worker-scoped fixture.
+
+Workers are restarted after a test failure so subsequent tests in that worker get a clean environment; fixing isolation still prevents the initial flakiness.
+
+## Resource Management
+
+### Browser Contexts
+
+```typescript
+// Recommended: One context per test (default) — full isolation
+test("isolated test", async ({ page }) => {
+  // Fresh context automatically
+});
+
+// Manual context for specific needs
+test("multiple tabs", async ({ browser }) => {
+  const context = await browser.newContext();
+  const page1 = await context.newPage();
+  const page2 = await context.newPage();
+
+  // Clean up
+  await context.close();
+});
+```
+
+### Memory Management
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  // Limit concurrent workers
+  workers: 2,
+
+  // Limit parallel tests per worker
+  use: {
+    // Lower memory usage
+    launchOptions: {
+      args: ["--disable-dev-shm-usage"],
+    },
+  },
+});
+```
+
+### Timeouts
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  // Global test timeout
+  timeout: 30000,
+
+  // Assertion timeout
+  expect: {
+    timeout: 5000,
+  },
+
+  // Navigation timeout
+  use: {
+    navigationTimeout: 15000,
+    actionTimeout: 10000,
+  },
+});
+```
+
+## Benchmarking
+
+### Measure Test Duration
+
+```typescript
+test("performance test", async ({ page }, testInfo) => {
+  const startTime = Date.now();
+
+  await page.goto("/");
+
+  const loadTime = Date.now() - startTime;
+  console.log(`Page load: ${loadTime}ms`);
+
+  // Add to test report
+  testInfo.annotations.push({
+    type: "performance",
+    description: `Load time: ${loadTime}ms`,
+  });
+});
+```
+
+### Performance Metrics
+
+```typescript
+test("collect metrics", async ({ page }) => {
+  await page.goto("/");
+
+  const metrics = await page.evaluate(() => ({
+    // Navigation timing
+    loadTime:
+      performance.timing.loadEventEnd - performance.timing.navigationStart,
+    domContentLoaded:
+      performance.timing.domContentLoadedEventEnd -
+      performance.timing.navigationStart,
+
+    // Performance entries
+    resources: performance.getEntriesByType("resource").length,
+
+    // Memory (Chrome only)
+    // @ts-ignore
+    memory: performance.memory?.usedJSHeapSize,
+  }));
+
+  console.log("Metrics:", metrics);
+  expect(metrics.loadTime).toBeLessThan(3000);
+});
+```
+
+### Lighthouse Integration
+
+```typescript
+import { playAudit } from "playwright-lighthouse";
+
+test("lighthouse audit", async ({ page }) => {
+  await page.goto("/");
+
+  const audit = await playAudit({
+    page,
+    thresholds: {
+      performance: 80,
+      accessibility: 90,
+      "best-practices": 80,
+      seo: 80,
+    },
+    port: 9222,
+  });
+
+  expect(audit.lhr.categories.performance.score * 100).toBeGreaterThanOrEqual(
+    80,
+  );
+});
+```
+
+## Performance Checklist
+
+| Optimization                   | Impact     |
+| ------------------------------ | ---------- |
+| Enable `fullyParallel`         | High       |
+| Reuse authentication           | High       |
+| Mock heavy APIs                | High       |
+| Block tracking scripts         | Medium     |
+| Use sharding in CI             | High       |
+| Reduce workers if memory-bound | Medium     |
+| Cache API responses            | Medium     |
+| Skip unnecessary tests         | Low-Medium |
+
+## Related References
+
+- **CI/CD sharding**: See [ci-cd.md](ci-cd.md) for CI configuration
+- **Test organization**: See [test-suite-structure.md](../core/test-suite-structure.md) for structuring tests
+- **Fixtures for reuse**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for authentication patterns
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/reporting.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/reporting.md
new file mode 100644
index 0000000000..55e1a9374c
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/reporting.md
@@ -0,0 +1,424 @@
+# Test Reports & Artifacts
+
+## Table of Contents
+
+1. [CLI Commands](#cli-commands)
+2. [Reporter Configuration](#reporter-configuration)
+3. [Custom Reporter](#custom-reporter)
+4. [Trace Configuration](#trace-configuration)
+5. [Screenshot & Video Settings](#screenshot--video-settings)
+6. [Artifact Directory Structure](#artifact-directory-structure)
+7. [CI Artifact Upload](#ci-artifact-upload)
+8. [Decision Guide](#decision-guide)
+9. [Anti-Patterns](#anti-patterns)
+10. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Configuring test output for debugging, CI dashboards, and team visibility.
+
+## CLI Commands
+
+```bash
+# Display last HTML report
+npx playwright show-report
+
+# Specify reporter
+npx playwright test --reporter=html
+npx playwright test --reporter=dot           # minimal CI output
+npx playwright test --reporter=line          # one line per test
+npx playwright test --reporter=json          # machine-readable
+npx playwright test --reporter=junit         # CI integration
+
+# Combine reporters
+npx playwright test --reporter=dot,html
+
+# Merge sharded reports
+npx playwright merge-reports --reporter=html ./blob-report
+```
+
+## Reporter Configuration
+
+### Environment-Based Setup
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: process.env.CI
+    ? [
+        ['dot'],
+        ['html', { open: 'never' }],
+        ['junit', { outputFile: 'results/junit.xml' }],
+        ['github'],
+      ]
+    : [
+        ['list'],
+        ['html', { open: 'on-failure' }],
+      ],
+});
+```
+
+### Reporter Types
+
+| Reporter | Output | Use Case |
+|---|---|---|
+| `list` | One line per test | Local development |
+| `line` | Single updating line | Local, less verbose |
+| `dot` | `.` pass, `F` fail | CI logs |
+| `html` | Interactive HTML page | Post-run analysis |
+| `json` | Machine-readable JSON | Custom tooling |
+| `junit` | JUnit XML | CI platforms |
+| `github` | PR annotations | GitHub Actions |
+| `blob` | Binary archive | Shard merging |
+
+### JSON Output to File
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: [
+    ['json', { outputFile: 'results/output.json' }],
+  ],
+});
+```
+
+### JUnit Customization
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: [
+    ['junit', {
+      outputFile: 'results/junit.xml',
+      stripANSIControlSequences: true,
+      includeProjectInTestName: true,
+    }],
+  ],
+});
+```
+
+## Custom Reporter
+
+Build custom reporters for Slack notifications, database logging, or dashboards.
+
+```typescript
+// reporters/notification-reporter.ts
+import type {
+  FullResult,
+  Reporter,
+  TestCase,
+  TestResult,
+} from '@playwright/test/reporter';
+
+class NotificationReporter implements Reporter {
+  private passed = 0;
+  private failed = 0;
+  private skipped = 0;
+  private failures: string[] = [];
+
+  onTestEnd(test: TestCase, result: TestResult) {
+    switch (result.status) {
+      case 'passed':
+        this.passed++;
+        break;
+      case 'failed':
+      case 'timedOut':
+        this.failed++;
+        this.failures.push(`${test.title}: ${result.error?.message?.split('\n')[0]}`);
+        break;
+      case 'skipped':
+        this.skipped++;
+        break;
+    }
+  }
+
+  async onEnd(result: FullResult) {
+    const total = this.passed + this.failed + this.skipped;
+    const status = this.failed > 0 ? 'FAILED' : 'PASSED';
+    const message = [
+      `Tests ${status}`,
+      `Passed: ${this.passed} | Failed: ${this.failed} | Skipped: ${this.skipped}`,
+      `Duration: ${(result.duration / 1000).toFixed(1)}s`,
+    ];
+
+    if (this.failures.length > 0) {
+      message.push('', 'Failures:');
+      this.failures.slice(0, 5).forEach((f) => message.push(`  - ${f}`));
+      if (this.failures.length > 5) {
+        message.push(`  ...and ${this.failures.length - 5} more`);
+      }
+    }
+
+    const webhookUrl = process.env.NOTIFICATION_WEBHOOK;
+    if (webhookUrl) {
+      const controller = new AbortController();
+      const timeout = setTimeout(() => controller.abort(), 5000);
+      try {
+        await fetch(webhookUrl, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ text: message.join('\n') }),
+          signal: controller.signal,
+        });
+      } catch (error) {
+        // Intentionally swallow notifier failures to avoid blocking test completion
+        console.warn('Webhook notification failed:', error.message);
+      } finally {
+        clearTimeout(timeout);
+      }
+    }
+  }
+}
+
+export default NotificationReporter;
+```
+
+**Register custom reporter:**
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: [
+    ['dot'],
+    ['html', { open: 'never' }],
+    ['./reporters/notification-reporter.ts'],
+  ],
+});
+```
+
+## Trace Configuration
+
+Traces capture actions, network requests, DOM snapshots, and console logs.
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  retries: process.env.CI ? 2 : 0,
+  use: {
+    trace: 'on-first-retry',
+  },
+});
+```
+
+### Trace Options
+
+| Value | Behavior | Overhead |
+|---|---|---|
+| `'off'` | Never records | None |
+| `'on'` | Every test | High |
+| `'on-first-retry'` | On first retry after failure | Minimal |
+| `'retain-on-failure'` | Records all, keeps failures | Medium |
+| `'retain-on-first-failure'` | Records all, keeps first failure | Medium |
+
+### Viewing Traces
+
+```bash
+# Local trace viewer
+npx playwright show-trace results/my-test/trace.zip
+
+# From HTML report (click Traces tab)
+npx playwright show-report
+
+# Online viewer: https://trace.playwright.dev
+```
+
+## Screenshot & Video Settings
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  use: {
+    screenshot: 'only-on-failure',
+    video: 'retain-on-failure',
+  },
+});
+```
+
+### Video with Custom Size
+
+```typescript
+use: {
+  video: {
+    mode: 'retain-on-failure',
+    size: { width: 1280, height: 720 },
+  },
+},
+```
+
+### Screenshot Options
+
+| Value | Captures | Disk Cost |
+|---|---|---|
+| `'off'` | Never | None |
+| `'on'` | Every test | High |
+| `'only-on-failure'` | Failed tests | Low |
+
+### Video Options
+
+| Value | Records | Keeps | Disk Cost |
+|---|---|---|---|
+| `'off'` | Never | — | None |
+| `'on'` | Every test | All | Very high |
+| `'on-first-retry'` | On retry | Retried | Low |
+| `'retain-on-failure'` | Every test | Failed | Medium |
+
+## Artifact Directory Structure
+
+```text
+test-results/
+├── checkout-test-chromium/
+│   ├── trace.zip
+│   ├── test-failed-1.png
+│   └── video.webm
+├── login-test-firefox/
+│   ├── trace.zip
+│   └── test-failed-1.png
+└── junit.xml
+
+playwright-report/
+├── index.html
+└── data/
+
+blob-report/
+└── report-1.zip
+```
+
+## CI Artifact Upload
+
+### GitHub Actions
+
+```yaml
+- uses: actions/upload-artifact@v4
+  if: ${{ !cancelled() }}
+  with:
+    name: playwright-report
+    path: playwright-report/
+    retention-days: 14
+
+- uses: actions/upload-artifact@v4
+  if: failure()
+  with:
+    name: test-traces
+    path: |
+      test-results/**/trace.zip
+      test-results/**/*.png
+      test-results/**/*.webm
+    retention-days: 7
+```
+
+## Decision Guide
+
+| Scenario | Reporter Configuration |
+|---|---|
+| Local development | `[['list'], ['html', { open: 'on-failure' }]]` |
+| GitHub Actions | `[['dot'], ['html'], ['github']]` |
+| GitLab CI | `[['dot'], ['html'], ['junit']]` |
+| Azure DevOps / Jenkins | `[['dot'], ['html'], ['junit']]` |
+| Sharded CI | `[['blob'], ['github']]` |
+| Custom dashboard | `[['json', { outputFile: '...' }]]` + custom reporter |
+
+| Artifact | When to Collect | Retention | Upload Condition |
+|---|---|---|---|
+| HTML report | Always | 14 days | `if: ${{ !cancelled() }}` |
+| Traces | On failure | 7 days | `if: failure()` |
+| Screenshots | On failure | 7 days | `if: failure()` |
+| Videos | On failure | 7 days | `if: failure()` |
+| JUnit XML | Always | 14 days | `if: ${{ !cancelled() }}` |
+| Blob report | Always (sharded) | 1 day | `if: ${{ !cancelled() }}` |
+
+## Anti-Patterns
+
+| Anti-Pattern | Problem | Solution |
+|---|---|---|
+| No reporter configured | Default `list` only; no persistent report | Configure `html` + CI reporter |
+| `trace: 'on'` in CI | Massive artifacts, slow uploads | Use `trace: 'on-first-retry'` |
+| `video: 'on'` in CI | Enormous storage, slower tests | Use `video: 'retain-on-failure'` |
+| Upload artifacts only on failure | No report when tests pass | Upload with `if: ${{ !cancelled() }}` |
+| No retention limits | CI storage fills quickly | Set `retention-days: 7-14` |
+| Only `dot` reporter | Cannot drill into failures | Pair `dot` with `html` |
+| JUnit to stdout | Interferes with console output | Write to file |
+| Blocking `onEnd` in custom reporter | Slow HTTP calls delay pipeline | Use `Promise.race` with timeout |
+
+## Troubleshooting
+
+### Empty HTML Report
+
+Check reporter config. HTML report defaults to `playwright-report/`:
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: [['html', { outputFolder: 'playwright-report', open: 'never' }]],
+});
+```
+
+### Traces Too Large
+
+Switch from `trace: 'on'` to `'on-first-retry'` with retries enabled:
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  retries: process.env.CI ? 2 : 0,
+  use: {
+    trace: 'on-first-retry',
+  },
+});
+```
+
+### JUnit XML Not Recognized
+
+Ensure path matches CI configuration:
+
+```typescript
+reporter: [['junit', { outputFile: 'results/junit.xml' }]],
+```
+
+```yaml
+# GitHub Actions
+- uses: dorny/test-reporter@latest
+  with:
+    path: results/junit.xml
+    reporter: java-junit
+
+# Azure DevOps
+- task: PublishTestResults@latest
+  inputs:
+    testResultsFiles: 'results/junit.xml'
+
+# Jenkins
+junit 'results/junit.xml'
+```
+
+### Empty Merged Report
+
+Use `blob` reporter for sharded runs (not `html`):
+
+```typescript
+import { defineConfig } from '@playwright/test';
+
+export default defineConfig({
+  reporter: process.env.CI
+    ? [['blob'], ['dot']]
+    : [['html', { open: 'on-failure' }]],
+});
+```
+
+### Missing Screenshots in Report
+
+Enable screenshots and keep both directories:
+
+```typescript
+use: {
+  screenshot: 'only-on-failure',
+},
+```
+
+The HTML report embeds screenshots from `test-results/`. Deleting that directory removes screenshots from the report.
diff --git a/.cursor/skills/playwright-testing/infrastructure-ci-cd/test-coverage.md b/.cursor/skills/playwright-testing/infrastructure-ci-cd/test-coverage.md
new file mode 100644
index 0000000000..9bc73aebe2
--- /dev/null
+++ b/.cursor/skills/playwright-testing/infrastructure-ci-cd/test-coverage.md
@@ -0,0 +1,497 @@
+# Test Coverage
+
+## Table of Contents
+
+1. [Coverage Setup](#coverage-setup)
+2. [Collecting Coverage](#collecting-coverage)
+3. [Coverage Reports](#coverage-reports)
+4. [Coverage Thresholds](#coverage-thresholds)
+5. [Advanced Patterns](#advanced-patterns)
+6. [CI Integration](#ci-integration)
+
+## Coverage Setup
+
+### Install Dependencies
+
+```bash
+# For V8 coverage (built into Playwright)
+# No additional dependencies needed
+
+# For Istanbul-based coverage (more features)
+npm install -D nyc @istanbuljs/nyc-config-typescript
+```
+
+### Basic Configuration
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  use: {
+    // Enable coverage collection
+    contextOptions: {
+      // V8 coverage is automatic with the API below
+    },
+  },
+});
+```
+
+### V8 Coverage Fixture
+
+```typescript
+// fixtures/coverage.ts
+import { test as base, expect } from "@playwright/test";
+import fs from "fs";
+import path from "path";
+import { randomUUID } from "crypto";
+
+export const test = base.extend<{}, { collectCoverage: void }>({
+  collectCoverage: [
+    async ({ browser }, use) => {
+      // Start coverage for all pages
+      const context = await browser.newContext();
+      const page = await context.newPage();
+
+      await page.coverage.startJSCoverage();
+      await page.coverage.startCSSCoverage();
+
+      await use();
+
+      // Collect coverage
+      const [jsCoverage, cssCoverage] = await Promise.all([
+        page.coverage.stopJSCoverage(),
+        page.coverage.stopCSSCoverage(),
+      ]);
+
+      // Save coverage data
+      const coverageDir = "./coverage";
+      if (!fs.existsSync(coverageDir)) {
+        fs.mkdirSync(coverageDir, { recursive: true });
+      }
+
+      fs.writeFileSync(
+        path.join(coverageDir, `coverage-${randomUUID()}.json`),
+        JSON.stringify([...jsCoverage, ...cssCoverage])
+      );
+
+      await context.close();
+    },
+    { scope: "worker", auto: true },
+  ],
+});
+```
+
+## Collecting Coverage
+
+### Per-Test Coverage
+
+```typescript
+test("collect coverage for single test", async ({ page }) => {
+  // Start coverage collection
+  await page.coverage.startJSCoverage({
+    resetOnNavigation: false,
+  });
+
+  // Run test
+  await page.goto("/app");
+  await page.getByRole("button", { name: "Submit" }).click();
+  await expect(page.getByText("Success")).toBeVisible();
+
+  // Stop and get coverage
+  const coverage = await page.coverage.stopJSCoverage();
+
+  // Filter to only your source files
+  const appCoverage = coverage.filter((entry) => entry.url.includes("/src/"));
+
+  console.log(`Covered ${appCoverage.length} source files`);
+});
+```
+
+### Coverage for Specific Files
+
+```typescript
+test("track specific module coverage", async ({ page }) => {
+  await page.coverage.startJSCoverage();
+
+  await page.goto("/checkout");
+  await page.getByRole("button", { name: "Pay" }).click();
+
+  const coverage = await page.coverage.stopJSCoverage();
+
+  // Find coverage for checkout module
+  const checkoutCoverage = coverage.find((c) => c.url.includes("checkout.js"));
+
+  if (checkoutCoverage) {
+    const totalBytes = checkoutCoverage.text?.length || 0;
+    const coveredBytes = checkoutCoverage.ranges.reduce(
+      (sum, range) => sum + (range.end - range.start),
+      0
+    );
+    const percentage = (coveredBytes / totalBytes) * 100;
+
+    console.log(`Checkout module: ${percentage.toFixed(1)}% covered`);
+    expect(percentage).toBeGreaterThan(80);
+  }
+});
+```
+
+### CSS Coverage
+
+```typescript
+test("collect CSS coverage", async ({ page }) => {
+  await page.coverage.startCSSCoverage();
+
+  await page.goto("/app");
+
+  // Interact to trigger different CSS states
+  await page.getByRole("button").hover();
+  await page.getByRole("dialog").waitFor();
+
+  const cssCoverage = await page.coverage.stopCSSCoverage();
+
+  // Find unused CSS
+  for (const entry of cssCoverage) {
+    const totalBytes = entry.text?.length || 0;
+    const usedBytes = entry.ranges.reduce(
+      (sum, range) => sum + (range.end - range.start),
+      0
+    );
+    const unusedPercentage = ((totalBytes - usedBytes) / totalBytes) * 100;
+
+    if (unusedPercentage > 50) {
+      console.warn(`${entry.url}: ${unusedPercentage.toFixed(1)}% unused CSS`);
+    }
+  }
+});
+```
+
+## Coverage Reports
+
+### Converting to Istanbul Format
+
+```typescript
+// scripts/convert-coverage.ts
+import { execSync } from "child_process";
+import fs from "fs";
+import path from "path";
+import v8ToIstanbul from "v8-to-istanbul";
+
+async function convertCoverage() {
+  const coverageDir = "./coverage";
+  const files = fs.readdirSync(coverageDir).filter((f) => f.endsWith(".json"));
+
+  const istanbulCoverage: any = {};
+
+  for (const file of files) {
+    const coverageData = JSON.parse(
+      fs.readFileSync(path.join(coverageDir, file), "utf-8")
+    );
+
+    for (const entry of coverageData) {
+      if (!entry.url.startsWith("file://")) continue;
+
+      const filePath = entry.url.replace("file://", "");
+      const converter = v8ToIstanbul(filePath);
+
+      await converter.load();
+      converter.applyCoverage(entry.functions || []);
+
+      const istanbul = converter.toIstanbul();
+      Object.assign(istanbulCoverage, istanbul);
+    }
+  }
+
+  fs.writeFileSync(
+    path.join(coverageDir, "coverage-final.json"),
+    JSON.stringify(istanbulCoverage)
+  );
+}
+
+convertCoverage();
+```
+
+### Generating HTML Report
+
+```bash
+# Using nyc to generate report
+npx nyc report --reporter=html --reporter=text --temp-dir=./coverage
+```
+
+```typescript
+// package.json scripts
+{
+  "scripts": {
+    "test": "playwright test",
+    "test:coverage": "playwright test && npm run coverage:report",
+    "coverage:report": "npx nyc report --reporter=html --reporter=lcov --temp-dir=./coverage"
+  }
+}
+```
+
+### Custom Coverage Reporter
+
+```typescript
+// reporters/coverage-reporter.ts
+import type { Reporter, FullResult } from "@playwright/test/reporter";
+import fs from "fs";
+import path from "path";
+
+class CoverageReporter implements Reporter {
+  private coverageData: any[] = [];
+
+  onEnd(result: FullResult) {
+    // Aggregate all coverage files
+    const coverageDir = "./coverage";
+    const files = fs
+      .readdirSync(coverageDir)
+      .filter((f) => f.endsWith(".json"));
+
+    for (const file of files) {
+      const data = JSON.parse(
+        fs.readFileSync(path.join(coverageDir, file), "utf-8")
+      );
+      this.coverageData.push(...data);
+    }
+
+    // Generate summary
+    const summary = this.generateSummary();
+    console.log("\n📊 Coverage Summary:");
+    console.log(`   Files: ${summary.totalFiles}`);
+    console.log(`   Lines: ${summary.lineCoverage.toFixed(1)}%`);
+    console.log(`   Bytes: ${summary.byteCoverage.toFixed(1)}%`);
+
+    if (summary.lineCoverage < 80) {
+      console.warn("⚠️  Coverage below 80% threshold!");
+    }
+  }
+
+  private generateSummary() {
+    let totalBytes = 0;
+    let coveredBytes = 0;
+    const files = new Set<string>();
+
+    for (const entry of this.coverageData) {
+      if (entry.url.includes("/src/")) {
+        files.add(entry.url);
+        totalBytes += entry.text?.length || 0;
+        coveredBytes += entry.ranges.reduce(
+          (sum: number, r: any) => sum + (r.end - r.start),
+          0
+        );
+      }
+    }
+
+    return {
+      totalFiles: files.size,
+      byteCoverage: (coveredBytes / totalBytes) * 100,
+      lineCoverage: (coveredBytes / totalBytes) * 100, // Simplified
+    };
+  }
+}
+
+export default CoverageReporter;
+```
+
+## Coverage Thresholds
+
+### Enforcing Minimum Coverage
+
+```typescript
+// tests/coverage.spec.ts
+import { test, expect } from "@playwright/test";
+import fs from "fs";
+import path from "path";
+
+test.afterAll(async () => {
+  const coverageDir = "./coverage";
+  const files = fs.readdirSync(coverageDir).filter((f) => f.endsWith(".json"));
+
+  let totalBytes = 0;
+  let coveredBytes = 0;
+
+  for (const file of files) {
+    const coverage = JSON.parse(
+      fs.readFileSync(path.join(coverageDir, file), "utf-8")
+    );
+
+    for (const entry of coverage) {
+      if (!entry.url.includes("/src/")) continue;
+      totalBytes += entry.text?.length || 0;
+      coveredBytes += entry.ranges.reduce(
+        (sum: number, r: any) => sum + (r.end - r.start),
+        0
+      );
+    }
+  }
+
+  const coveragePercent = (coveredBytes / totalBytes) * 100;
+
+  // Enforce threshold
+  expect(coveragePercent).toBeGreaterThan(80);
+});
+```
+
+### Per-Directory Thresholds
+
+```typescript
+// coverage-check.ts
+interface CoverageThreshold {
+  pattern: RegExp;
+  minCoverage: number;
+}
+
+const thresholds: CoverageThreshold[] = [
+  { pattern: /\/src\/core\//, minCoverage: 90 },
+  { pattern: /\/src\/utils\//, minCoverage: 85 },
+  { pattern: /\/src\/components\//, minCoverage: 70 },
+  { pattern: /\/src\/pages\//, minCoverage: 60 },
+];
+
+function checkThresholds(coverage: any[]): string[] {
+  const violations: string[] = [];
+
+  for (const threshold of thresholds) {
+    const matchingFiles = coverage.filter((c) => threshold.pattern.test(c.url));
+
+    let total = 0;
+    let covered = 0;
+
+    for (const file of matchingFiles) {
+      total += file.text?.length || 0;
+      covered += file.ranges.reduce(
+        (sum: number, r: any) => sum + (r.end - r.start),
+        0
+      );
+    }
+
+    const percent = total > 0 ? (covered / total) * 100 : 0;
+
+    if (percent < threshold.minCoverage) {
+      violations.push(
+        `${threshold.pattern}: ${percent.toFixed(1)}% < ${
+          threshold.minCoverage
+        }%`
+      );
+    }
+  }
+
+  return violations;
+}
+```
+
+## Advanced Patterns
+
+### Merging Coverage Across Shards
+
+```typescript
+// scripts/merge-coverage.ts
+import fs from "fs";
+import { glob } from "glob";
+
+async function mergeCoverage() {
+  const files = await glob("shard-*/coverage/*.json");
+  const merged = new Map<string, any>();
+
+  for (const file of files) {
+    const data = JSON.parse(fs.readFileSync(file, "utf-8"));
+    for (const entry of data) {
+      if (merged.has(entry.url)) {
+        const existing = merged.get(entry.url);
+        existing.ranges.push(...entry.ranges);
+      } else {
+        merged.set(entry.url, { ...entry });
+      }
+    }
+  }
+
+  fs.writeFileSync(
+    "./coverage/merged.json",
+    JSON.stringify([...merged.values()])
+  );
+}
+
+mergeCoverage();
+```
+
+### Incremental Coverage
+
+```typescript
+// Check coverage only for changed files in CI
+import { execSync } from "child_process";
+import fs from "fs";
+
+const changedFiles = execSync("git diff --name-only HEAD~1")
+  .toString()
+  .split("\n")
+  .filter((f) => f.endsWith(".ts"));
+
+const coverage = JSON.parse(fs.readFileSync("./coverage/merged.json", "utf-8"));
+
+for (const file of changedFiles) {
+  const entry = coverage.find((c: any) => c.url.includes(file));
+  if (entry) {
+    const percent =
+      (entry.ranges.reduce((s: number, r: any) => s + r.end - r.start, 0) /
+        (entry.text?.length || 1)) *
+      100;
+    console.log(`${file}: ${percent.toFixed(1)}%`);
+  }
+}
+```
+
+## CI Integration
+
+### GitHub Actions
+
+```yaml
+# .github/workflows/test.yml
+name: Tests with Coverage
+
+on: [push, pull_request]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - run: npm ci
+      - run: npx playwright install --with-deps
+
+      - name: Run tests with coverage
+        run: npm run test:coverage
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          files: ./coverage/lcov.info
+          fail_ci_if_error: true
+
+      - name: Check coverage threshold
+        run: |
+          COVERAGE=$(cat coverage/coverage-summary.json | jq '.total.lines.pct')
+          if (( $(echo "$COVERAGE < 80" | bc -l) )); then
+            echo "Coverage $COVERAGE% is below 80% threshold"
+            exit 1
+          fi
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                 | Problem                                | Solution                    |
+| ---------------------------- | -------------------------------------- | --------------------------- |
+| Coverage for coverage's sake | Gaming metrics                         | Focus on critical paths     |
+| 100% coverage target         | Diminishing returns, tests for getters | Set realistic thresholds    |
+| Ignoring coverage drops      | Technical debt                         | Enforce thresholds in CI    |
+| No source map support        | Wrong line numbers                     | Enable source maps in build |
+| Coverage only in CI          | Late feedback                          | Run locally too             |
+
+## Related References
+
+- **CI/CD**: See [ci-cd.md](ci-cd.md) for pipeline configuration
+- **Performance**: See [performance.md](performance.md) for optimizing coverage collection
diff --git a/.cursor/skills/playwright-testing/testing-patterns/accessibility.md b/.cursor/skills/playwright-testing/testing-patterns/accessibility.md
new file mode 100644
index 0000000000..1f13e21d94
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/accessibility.md
@@ -0,0 +1,359 @@
+# Accessibility Testing
+
+## Table of Contents
+
+1. [Axe-Core Integration](#axe-core-integration)
+2. [Keyboard Navigation](#keyboard-navigation)
+3. [ARIA Validation](#aria-validation)
+4. [Focus Management](#focus-management)
+5. [Color & Contrast](#color--contrast)
+
+## Axe-Core Integration
+
+### Setup
+
+```bash
+npm install -D @axe-core/playwright
+```
+
+### Basic A11y Test
+
+```typescript
+import { test, expect } from "@playwright/test";
+import AxeBuilder from "@axe-core/playwright";
+
+test("homepage should have no a11y violations", async ({ page }) => {
+  await page.goto("/");
+
+  const results = await new AxeBuilder({ page }).analyze();
+
+  expect(results.violations).toEqual([]);
+});
+```
+
+### Scoped Analysis
+
+```typescript
+test("form accessibility", async ({ page }) => {
+  await page.goto("/contact");
+
+  // Analyze only the form
+  const results = await new AxeBuilder({ page })
+    .include("#contact-form")
+    .analyze();
+
+  expect(results.violations).toEqual([]);
+});
+
+test("ignore known issues", async ({ page }) => {
+  await page.goto("/legacy-page");
+
+  const results = await new AxeBuilder({ page })
+    .exclude(".legacy-widget") // Skip legacy component
+    .disableRules(["color-contrast"]) // Disable specific rule
+    .analyze();
+
+  expect(results.violations).toEqual([]);
+});
+```
+
+### A11y Fixture
+
+```typescript
+// fixtures/a11y.fixture.ts
+import { test as base } from "@playwright/test";
+import AxeBuilder from "@axe-core/playwright";
+
+type A11yFixtures = {
+  makeAxeBuilder: () => AxeBuilder;
+};
+
+export const test = base.extend<A11yFixtures>({
+  makeAxeBuilder: async ({ page }, use) => {
+    await use(() =>
+      new AxeBuilder({ page }).withTags([
+        "wcag2a",
+        "wcag2aa",
+        "wcag21a",
+        "wcag21aa",
+      ]),
+    );
+  },
+});
+
+// Usage
+test("dashboard a11y", async ({ page, makeAxeBuilder }) => {
+  await page.goto("/dashboard");
+  const results = await makeAxeBuilder().analyze();
+  expect(results.violations).toEqual([]);
+});
+```
+
+### Detailed Violation Reporting
+
+```typescript
+test("report a11y issues", async ({ page }) => {
+  await page.goto("/");
+
+  const results = await new AxeBuilder({ page }).analyze();
+
+  // Custom failure message with details
+  const violations = results.violations.map((v) => ({
+    id: v.id,
+    impact: v.impact,
+    description: v.description,
+    nodes: v.nodes.map((n) => n.html),
+  }));
+
+  expect(violations, JSON.stringify(violations, null, 2)).toHaveLength(0);
+});
+```
+
+## Keyboard Navigation
+
+### Tab Order Testing
+
+```typescript
+test("correct tab order in form", async ({ page }) => {
+  await page.goto("/signup");
+
+  // Start from the beginning
+  await page.keyboard.press("Tab");
+  await expect(page.getByLabel("Email")).toBeFocused();
+
+  await page.keyboard.press("Tab");
+  await expect(page.getByLabel("Password")).toBeFocused();
+
+  await page.keyboard.press("Tab");
+  await expect(page.getByRole("button", { name: "Sign up" })).toBeFocused();
+});
+```
+
+### Keyboard-Only Interaction
+
+```typescript
+test("complete flow with keyboard only", async ({ page }) => {
+  await page.goto("/products");
+
+  // Navigate to product with keyboard
+  await page.keyboard.press("Tab"); // Skip to main content
+  await page.keyboard.press("Tab"); // First product
+  await page.keyboard.press("Enter"); // Open product
+
+  await expect(page).toHaveURL(/\/products\/\d+/);
+
+  // Add to cart with keyboard
+  await page.keyboard.press("Tab");
+  await page.keyboard.press("Tab"); // Navigate to "Add to Cart"
+  await page.keyboard.press("Enter");
+
+  await expect(page.getByRole("alert")).toContainText("Added to cart");
+});
+```
+
+### Skip Links
+
+```typescript
+test("skip link works", async ({ page }) => {
+  await page.goto("/");
+
+  await page.keyboard.press("Tab");
+  const skipLink = page.getByRole("link", { name: /skip to main/i });
+  await expect(skipLink).toBeFocused();
+
+  await page.keyboard.press("Enter");
+
+  // Focus should move to main content
+  await expect(page.getByRole("main")).toBeFocused();
+});
+```
+
+### Escape Key Handling
+
+```typescript
+test("escape closes modal", async ({ page }) => {
+  await page.goto("/dashboard");
+  await page.getByRole("button", { name: "Settings" }).click();
+
+  const modal = page.getByRole("dialog");
+  await expect(modal).toBeVisible();
+
+  await page.keyboard.press("Escape");
+
+  await expect(modal).toBeHidden();
+  // Focus should return to trigger
+  await expect(page.getByRole("button", { name: "Settings" })).toBeFocused();
+});
+```
+
+## ARIA Validation
+
+### Role Verification
+
+```typescript
+test("correct ARIA roles", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  // Verify landmark roles
+  await expect(page.getByRole("navigation")).toBeVisible();
+  await expect(page.getByRole("main")).toBeVisible();
+  await expect(page.getByRole("contentinfo")).toBeVisible(); // footer
+
+  // Verify interactive roles
+  await expect(page.getByRole("button", { name: "Menu" })).toBeVisible();
+  await expect(page.getByRole("search")).toBeVisible();
+});
+```
+
+### ARIA States
+
+```typescript
+test("aria-expanded updates correctly", async ({ page }) => {
+  await page.goto("/faq");
+
+  const accordion = page.getByRole("button", { name: "Shipping" });
+
+  // Initially collapsed
+  await expect(accordion).toHaveAttribute("aria-expanded", "false");
+
+  await accordion.click();
+
+  // Now expanded
+  await expect(accordion).toHaveAttribute("aria-expanded", "true");
+
+  // Content is visible
+  const panel = page.getByRole("region", { name: "Shipping" });
+  await expect(panel).toBeVisible();
+});
+```
+
+### Live Regions
+
+```typescript
+test("live region announces updates", async ({ page }) => {
+  await page.goto("/checkout");
+
+  // Find live region
+  const liveRegion = page.locator('[aria-live="polite"]');
+
+  await page.getByLabel("Quantity").fill("3");
+
+  // Live region should update with new total
+  await expect(liveRegion).toContainText("Total: $29.97");
+});
+```
+
+## Focus Management
+
+### Focus Trap in Modal
+
+```typescript
+test("focus trapped in modal", async ({ page }) => {
+  await page.goto("/");
+  await page.getByRole("button", { name: "Open Modal" }).click();
+
+  const modal = page.getByRole("dialog");
+  await expect(modal).toBeVisible();
+
+  // Get all focusable elements in modal
+  const focusableElements = modal.locator(
+    'button, [href], input, select, textarea, [tabindex]:not([tabindex="-1"])',
+  );
+  const count = await focusableElements.count();
+
+  // Tab through all elements, should stay in modal
+  for (let i = 0; i < count + 1; i++) {
+    await page.keyboard.press("Tab");
+    const focused = page.locator(":focus");
+    await expect(modal).toContainText((await focused.textContent()) || "");
+  }
+});
+```
+
+### Focus Restoration
+
+```typescript
+test("focus returns after modal close", async ({ page }) => {
+  await page.goto("/");
+
+  const trigger = page.getByRole("button", { name: "Delete Item" });
+  await trigger.click();
+
+  await page.getByRole("button", { name: "Cancel" }).click();
+
+  // Focus should return to the trigger
+  await expect(trigger).toBeFocused();
+});
+```
+
+## Color & Contrast
+
+### High Contrast Mode
+
+```typescript
+test("works in high contrast mode", async ({ page }) => {
+  await page.emulateMedia({ forcedColors: "active" });
+  await page.goto("/");
+
+  // Verify key elements are visible
+  await expect(page.getByRole("navigation")).toBeVisible();
+  await expect(page.getByRole("button", { name: "Sign In" })).toBeVisible();
+
+  // Take screenshot for visual verification
+  await expect(page).toHaveScreenshot("high-contrast.png");
+});
+```
+
+### Reduced Motion
+
+```typescript
+test("respects reduced motion preference", async ({ page }) => {
+  await page.emulateMedia({ reducedMotion: "reduce" });
+  await page.goto("/");
+
+  // Animations should be disabled
+  const hero = page.getByTestId("hero-animation");
+  const animation = await hero.evaluate(
+    (el) => getComputedStyle(el).animationDuration,
+  );
+
+  expect(animation).toBe("0s");
+});
+```
+
+## CI Integration
+
+### A11y as CI Gate
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    {
+      name: "a11y",
+      testMatch: /.*\.a11y\.spec\.ts/,
+      use: { ...devices["Desktop Chrome"] },
+    },
+  ],
+});
+```
+
+```yaml
+# .github/workflows/a11y.yml
+- name: Run accessibility tests
+  run: npx playwright test --project=a11y
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                  | Problem                      | Solution                                   |
+| ----------------------------- | ---------------------------- | ------------------------------------------ |
+| Testing a11y only on homepage | Misses issues on other pages | Test all critical user flows               |
+| Ignoring all violations       | No value from tests          | Address or explicitly exclude known issues |
+| Only automated testing        | Misses many a11y issues      | Combine with manual testing                |
+| Testing without screen reader | Misses interaction issues    | Test with VoiceOver/NVDA periodically      |
+
+## Related References
+
+- **Locators**: See [locators.md](../core/locators.md) for role-based selectors
+- **Visual testing**: See [test-suite-structure.md](../core/test-suite-structure.md) for screenshot comparison
diff --git a/.cursor/skills/playwright-testing/testing-patterns/api-testing.md b/.cursor/skills/playwright-testing/testing-patterns/api-testing.md
new file mode 100644
index 0000000000..e3b5fa2207
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/api-testing.md
@@ -0,0 +1,719 @@
+# API Testing
+
+## Table of Contents
+
+1. [Patterns](#patterns)
+2. [Decision Guide](#decision-guide)
+3. [Anti-Patterns](#anti-patterns)
+4. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Testing REST APIs directly — validating endpoints, seeding test data, or verifying backend behavior without browser overhead.
+> **See also**: [graphql-testing.md](graphql-testing.md) for GraphQL-specific patterns.
+
+## Patterns
+
+### Request Fixtures for Authenticated Clients
+
+**Use when**: Multiple tests need an authenticated API client with shared configuration.
+**Avoid when**: A single test makes one-off API calls — use the built-in `request` fixture directly.
+
+```typescript
+// fixtures/api-fixtures.ts
+import { test as base, expect, APIRequestContext } from "@playwright/test";
+
+type ApiFixtures = {
+  authApi: APIRequestContext;
+  adminApi: APIRequestContext;
+};
+
+export const test = base.extend<ApiFixtures>({
+  authApi: async ({ playwright }, use) => {
+    const ctx = await playwright.request.newContext({
+      baseURL: "https://api.myapp.io",
+      extraHTTPHeaders: {
+        Authorization: `Bearer ${process.env.API_TOKEN}`,
+        Accept: "application/json",
+      },
+    });
+    await use(ctx);
+    await ctx.dispose();
+  },
+
+  adminApi: async ({ playwright }, use) => {
+    const loginCtx = await playwright.request.newContext({
+      baseURL: "https://api.myapp.io",
+    });
+    const loginResp = await loginCtx.post("/auth/login", {
+      data: {
+        email: process.env.ADMIN_EMAIL,
+        password: process.env.ADMIN_PASSWORD,
+      },
+    });
+    expect(loginResp.ok()).toBeTruthy();
+    const { token } = await loginResp.json();
+    await loginCtx.dispose();
+
+    const ctx = await playwright.request.newContext({
+      baseURL: "https://api.myapp.io",
+      extraHTTPHeaders: {
+        Authorization: `Bearer ${token}`,
+        Accept: "application/json",
+      },
+    });
+    await use(ctx);
+    await ctx.dispose();
+  },
+});
+
+export { expect };
+```
+
+```typescript
+// tests/api/admin.spec.ts
+import { test, expect } from "../../fixtures/api-fixtures";
+
+test("admin retrieves all accounts", async ({ adminApi }) => {
+  const resp = await adminApi.get("/admin/accounts");
+  expect(resp.status()).toBe(200);
+  const body = await resp.json();
+  expect(body.accounts.length).toBeGreaterThan(0);
+});
+```
+
+### CRUD Operations
+
+**Use when**: Making HTTP requests — GET, POST, PUT, PATCH, DELETE with headers, query params, and bodies.
+**Avoid when**: You need to test browser-rendered responses (redirects, cookies with `HttpOnly`).
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+test("full CRUD cycle", async ({ request }) => {
+  // GET with query params
+  const listResp = await request.get("/api/items", {
+    params: { page: 1, limit: 10, category: "tools" },
+  });
+  expect(listResp.ok()).toBeTruthy();
+
+  // POST with JSON body
+  const createResp = await request.post("/api/items", {
+    data: {
+      title: "Hammer",
+      price: 19.99,
+      category: "tools",
+    },
+  });
+  expect(createResp.status()).toBe(201);
+  const created = await createResp.json();
+
+  // PUT — full replacement
+  const putResp = await request.put(`/api/items/${created.id}`, {
+    data: {
+      title: "Claw Hammer",
+      price: 24.99,
+      category: "tools",
+    },
+  });
+  expect(putResp.ok()).toBeTruthy();
+
+  // PATCH — partial update
+  const patchResp = await request.patch(`/api/items/${created.id}`, {
+    data: { price: 22.5 },
+  });
+  expect(patchResp.ok()).toBeTruthy();
+  const patched = await patchResp.json();
+  expect(patched.price).toBe(22.5);
+
+  // DELETE
+  const delResp = await request.delete(`/api/items/${created.id}`);
+  expect(delResp.status()).toBe(204);
+
+  // Verify deletion
+  const getDeleted = await request.get(`/api/items/${created.id}`);
+  expect(getDeleted.status()).toBe(404);
+});
+
+test("form-urlencoded body", async ({ request }) => {
+  const resp = await request.post("/oauth/token", {
+    form: {
+      grant_type: "client_credentials",
+      client_id: "my-client",
+      client_secret: "secret-value",
+    },
+  });
+  expect(resp.ok()).toBeTruthy();
+  const token = await resp.json();
+  expect(token).toHaveProperty("access_token");
+});
+```
+
+### Dedicated API Project Configuration
+
+**Use when**: Writing dedicated API test suites that do not need a browser.
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  projects: [
+    {
+      name: "api",
+      testDir: "./tests/api",
+      use: {
+        baseURL: "https://api.myapp.io",
+        extraHTTPHeaders: { Accept: "application/json" },
+      },
+    },
+    {
+      name: "e2e",
+      testDir: "./tests/e2e",
+      use: {
+        baseURL: "https://myapp.io",
+        browserName: "chromium",
+      },
+    },
+  ],
+});
+```
+
+### Response Assertions
+
+**Use when**: Validating response status, headers, and body structure.
+**Avoid when**: Never skip these — every API test should assert on status and body.
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+test("comprehensive response validation", async ({ request }) => {
+  const resp = await request.get("/api/items/101");
+
+  // Status code — always check first
+  expect(resp.status()).toBe(200);
+  expect(resp.ok()).toBeTruthy();
+
+  // Headers
+  expect(resp.headers()["content-type"]).toContain("application/json");
+  expect(resp.headers()["cache-control"]).toMatch(/max-age=\d+/);
+
+  const item = await resp.json();
+
+  // Exact match on known fields
+  expect(item.id).toBe(101);
+  expect(item.title).toBe("Widget");
+
+  // Partial match — ignore fields you don't care about
+  expect(item).toMatchObject({
+    id: 101,
+    title: "Widget",
+    status: expect.stringMatching(/^(active|inactive|archived)$/),
+  });
+
+  // Type checks
+  expect(item).toMatchObject({
+    id: expect.any(Number),
+    title: expect.any(String),
+    createdAt: expect.any(String),
+    tags: expect.any(Array),
+  });
+
+  // Array content
+  expect(item.tags).toEqual(expect.arrayContaining(["featured"]));
+  expect(item.tags).not.toContain("deprecated");
+
+  // Nested object
+  expect(item.metadata).toMatchObject({
+    views: expect.any(Number),
+    rating: expect.any(Number),
+  });
+
+  // Date format
+  expect(new Date(item.createdAt).toISOString()).toBe(item.createdAt);
+});
+
+test("list response structure", async ({ request }) => {
+  const resp = await request.get("/api/items");
+  const body = await resp.json();
+
+  expect(body.items).toHaveLength(10);
+
+  for (const item of body.items) {
+    expect(item).toMatchObject({
+      id: expect.any(Number),
+      title: expect.any(String),
+      price: expect.any(Number),
+    });
+  }
+
+  expect(body.pagination).toEqual({
+    page: 1,
+    limit: 10,
+    total: expect.any(Number),
+    totalPages: expect.any(Number),
+  });
+});
+```
+
+### API Data Seeding
+
+**Use when**: E2E tests need specific data to exist before running. API seeding is 10-100x faster than UI-based setup.
+**Avoid when**: The test specifically validates the creation flow through the UI.
+
+```typescript
+import { test as base, expect } from "@playwright/test";
+
+type SeedFixtures = {
+  seedAccount: { id: number; email: string; password: string };
+  seedWorkspace: { id: number; name: string };
+};
+
+export const test = base.extend<SeedFixtures>({
+  seedAccount: async ({ request }, use) => {
+    const email = `account-${Date.now()}@test.io`;
+    const password = "SecurePass123!";
+
+    const resp = await request.post("/api/accounts", {
+      data: { name: "Test Account", email, password },
+    });
+    expect(resp.ok()).toBeTruthy();
+    const account = await resp.json();
+
+    await use({ id: account.id, email, password });
+
+    // Cleanup
+    await request.delete(`/api/accounts/${account.id}`);
+  },
+
+  seedWorkspace: async ({ request, seedAccount }, use) => {
+    const resp = await request.post("/api/workspaces", {
+      data: { name: `Workspace ${Date.now()}`, ownerId: seedAccount.id },
+    });
+    expect(resp.ok()).toBeTruthy();
+    const workspace = await resp.json();
+
+    await use({ id: workspace.id, name: workspace.name });
+
+    await request.delete(`/api/workspaces/${workspace.id}`);
+  },
+});
+
+export { expect };
+```
+
+```typescript
+// tests/e2e/workspace-dashboard.spec.ts
+import { test, expect } from "../../fixtures/seed-fixtures";
+
+test("user sees workspace on dashboard", async ({
+  page,
+  seedAccount,
+  seedWorkspace,
+}) => {
+  await page.goto("/login");
+  await page.getByLabel("Email").fill(seedAccount.email);
+  await page.getByLabel("Password").fill(seedAccount.password);
+  await page.getByRole("button", { name: "Sign in" }).click();
+
+  await page.waitForURL("/dashboard");
+  await expect(
+    page.getByRole("heading", { name: seedWorkspace.name })
+  ).toBeVisible();
+});
+```
+
+### Error Response Testing
+
+**Use when**: Every API has error paths — test them. A missing 401 test today is a security hole tomorrow.
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+test.describe("Error responses", () => {
+  test("400 — validation error with details", async ({ request }) => {
+    const resp = await request.post("/api/items", {
+      data: { title: "", price: -5 },
+    });
+    expect(resp.status()).toBe(400);
+
+    const body = await resp.json();
+    expect(body).toMatchObject({
+      error: "Validation Error",
+      details: expect.any(Array),
+    });
+    expect(body.details).toEqual(
+      expect.arrayContaining([
+        expect.objectContaining({
+          field: "title",
+          message: expect.any(String),
+        }),
+        expect.objectContaining({
+          field: "price",
+          message: expect.any(String),
+        }),
+      ])
+    );
+  });
+
+  test("401 — missing authentication", async ({ request }) => {
+    const resp = await request.get("/api/protected/resource", {
+      headers: { Authorization: "" },
+    });
+    expect(resp.status()).toBe(401);
+    const body = await resp.json();
+    expect(body.error).toMatch(/unauthorized|unauthenticated/i);
+  });
+
+  test("403 — insufficient permissions", async ({ request }) => {
+    const resp = await request.delete("/api/admin/items/1");
+    expect(resp.status()).toBe(403);
+    const body = await resp.json();
+    expect(body.error).toMatch(/forbidden|insufficient permissions/i);
+  });
+
+  test("404 — resource not found", async ({ request }) => {
+    const resp = await request.get("/api/items/999999");
+    expect(resp.status()).toBe(404);
+    const body = await resp.json();
+    expect(body).toMatchObject({ error: expect.stringMatching(/not found/i) });
+  });
+
+  test("409 — conflict on duplicate", async ({ request }) => {
+    const sku = `SKU-${Date.now()}`;
+    await request.post("/api/items", { data: { title: "First", sku } });
+
+    const resp = await request.post("/api/items", {
+      data: { title: "Duplicate", sku },
+    });
+    expect(resp.status()).toBe(409);
+  });
+
+  test("422 — unprocessable entity", async ({ request }) => {
+    const resp = await request.post("/api/orders", {
+      data: { items: [] },
+    });
+    expect(resp.status()).toBe(422);
+    const body = await resp.json();
+    expect(body.error).toContain("at least one item");
+  });
+
+  test("429 — rate limiting", async ({ request }) => {
+    const responses = await Promise.all(
+      Array.from({ length: 50 }, () =>
+        request.get("/api/search", { params: { q: "test" } })
+      )
+    );
+    const rateLimited = responses.filter((r) => r.status() === 429);
+    expect(rateLimited.length).toBeGreaterThan(0);
+    expect(rateLimited[0].headers()["retry-after"]).toBeDefined();
+  });
+});
+```
+
+### File Upload via API
+
+**Use when**: Testing file upload endpoints with multipart form data.
+**Avoid when**: You need to test the browser file picker dialog — use `page.setInputFiles()` instead.
+
+```typescript
+import { test, expect } from "@playwright/test";
+import path from "path";
+import fs from "fs";
+
+test("upload file via multipart", async ({ request }) => {
+  const filePath = path.resolve("tests/fixtures/report.pdf");
+
+  const resp = await request.post("/api/documents/upload", {
+    multipart: {
+      file: {
+        name: "report.pdf",
+        mimeType: "application/pdf",
+        buffer: fs.readFileSync(filePath),
+      },
+      description: "Monthly report",
+      category: "reports",
+    },
+  });
+
+  expect(resp.status()).toBe(201);
+  const body = await resp.json();
+  expect(body).toMatchObject({
+    id: expect.any(String),
+    filename: "report.pdf",
+    mimeType: "application/pdf",
+    size: expect.any(Number),
+    url: expect.stringMatching(/^https:\/\//),
+  });
+});
+
+test("rejects oversized files", async ({ request }) => {
+  const largeBuffer = Buffer.alloc(11 * 1024 * 1024); // 11MB
+
+  const resp = await request.post("/api/documents/upload", {
+    multipart: {
+      file: {
+        name: "large-file.bin",
+        mimeType: "application/octet-stream",
+        buffer: largeBuffer,
+      },
+    },
+  });
+
+  expect(resp.status()).toBe(413);
+});
+```
+
+### Chained API Calls
+
+**Use when**: Testing multi-step workflows — create, read, update, delete sequences; order flows; state machine transitions.
+**Avoid when**: You can test each endpoint in isolation and the interactions are trivial.
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+test("complete order workflow", async ({ request }) => {
+  // Step 1: Create a product
+  const productResp = await request.post("/api/products", {
+    data: { name: "Gadget", price: 49.99, stock: 50 },
+  });
+  expect(productResp.status()).toBe(201);
+  const product = await productResp.json();
+
+  // Step 2: Create a cart
+  const cartResp = await request.post("/api/carts", {
+    data: { items: [{ productId: product.id, quantity: 3 }] },
+  });
+  expect(cartResp.status()).toBe(201);
+  const cart = await cartResp.json();
+  expect(cart.total).toBe(149.97);
+
+  // Step 3: Checkout
+  const orderResp = await request.post("/api/orders", {
+    data: {
+      cartId: cart.id,
+      shippingAddress: {
+        street: "456 Main Ave",
+        city: "Metropolis",
+        zip: "54321",
+      },
+    },
+  });
+  expect(orderResp.status()).toBe(201);
+  const order = await orderResp.json();
+  expect(order.status).toBe("pending");
+  expect(order.items).toHaveLength(1);
+
+  // Step 4: Verify order in list
+  const ordersResp = await request.get("/api/orders");
+  const orders = await ordersResp.json();
+  expect(orders.items.map((o: any) => o.id)).toContain(order.id);
+
+  // Step 5: Verify stock decreased
+  const updatedProduct = await (
+    await request.get(`/api/products/${product.id}`)
+  ).json();
+  expect(updatedProduct.stock).toBe(47);
+
+  // Cleanup
+  await request.delete(`/api/orders/${order.id}`);
+  await request.delete(`/api/products/${product.id}`);
+});
+
+test("state machine transitions — publish workflow", async ({ request }) => {
+  const createResp = await request.post("/api/articles", {
+    data: { title: "Draft Article", body: "Content here." },
+  });
+  const article = await createResp.json();
+  expect(article.status).toBe("draft");
+
+  // Submit for review
+  const reviewResp = await request.patch(`/api/articles/${article.id}/status`, {
+    data: { status: "in_review" },
+  });
+  expect(reviewResp.ok()).toBeTruthy();
+  expect((await reviewResp.json()).status).toBe("in_review");
+
+  // Approve
+  const approveResp = await request.patch(
+    `/api/articles/${article.id}/status`,
+    {
+      data: { status: "published" },
+    }
+  );
+  expect(approveResp.ok()).toBeTruthy();
+  expect((await approveResp.json()).status).toBe("published");
+
+  // Cannot revert to draft from published
+  const revertResp = await request.patch(`/api/articles/${article.id}/status`, {
+    data: { status: "draft" },
+  });
+  expect(revertResp.status()).toBe(422);
+
+  await request.delete(`/api/articles/${article.id}`);
+});
+
+test("API + E2E hybrid — seed via API, verify in browser", async ({
+  request,
+  page,
+}) => {
+  const resp = await request.post("/api/products", {
+    data: {
+      name: `Hybrid Product ${Date.now()}`,
+      price: 35.0,
+      published: true,
+    },
+  });
+  const product = await resp.json();
+
+  await page.goto("/products");
+  await expect(page.getByRole("heading", { name: product.name })).toBeVisible();
+  await expect(page.getByText("$35.00")).toBeVisible();
+
+  await request.delete(`/api/products/${product.id}`);
+});
+```
+
+### Schema Validation with Zod
+
+**Use when**: Verifying API responses match a contract — field types, required fields, value constraints.
+**Avoid when**: You only need to check one or two specific fields — use `toMatchObject` instead.
+
+```typescript
+import { test, expect } from "@playwright/test";
+import { z } from "zod";
+
+const ItemSchema = z.object({
+  id: z.number().positive(),
+  title: z.string().min(1),
+  price: z.number().nonnegative(),
+  status: z.enum(["active", "inactive", "archived"]),
+  createdAt: z.string().datetime(),
+  metadata: z.object({
+    views: z.number().int().nonnegative(),
+    rating: z.number().min(0).max(5).nullable(),
+  }),
+});
+
+const PaginatedItemsSchema = z.object({
+  items: z.array(ItemSchema),
+  pagination: z.object({
+    page: z.number().int().positive(),
+    limit: z.number().int().positive(),
+    total: z.number().int().nonnegative(),
+  }),
+});
+
+test("GET /api/items matches schema", async ({ request }) => {
+  const resp = await request.get("/api/items");
+  expect(resp.ok()).toBeTruthy();
+
+  const body = await resp.json();
+  const result = PaginatedItemsSchema.safeParse(body);
+
+  if (!result.success) {
+    throw new Error(
+      `Schema validation failed:\n${result.error.issues
+        .map((i) => `  ${i.path.join(".")}: ${i.message}`)
+        .join("\n")}`
+    );
+  }
+});
+```
+
+## Decision Guide
+
+| Scenario                                         | Use API Tests               | Use E2E Tests                  | Why                                                                |
+| ------------------------------------------------ | --------------------------- | ------------------------------ | ------------------------------------------------------------------ |
+| Validate response status/body/headers            | Yes                         | No                             | No browser needed; 10-100x faster                                  |
+| Test business logic (calculations, rules)        | Yes                         | No                             | API tests isolate backend logic from UI                            |
+| Verify form submission creates correct data      | Seed via API, submit via UI | Yes                            | UI test validates the form; API check confirms persistence         |
+| Test error messages shown to user                | No                          | Yes                            | Error rendering is a UI concern                                    |
+| Validate pagination, filtering, sorting          | Yes                         | Maybe both                     | API test for correctness; E2E test only if the UI logic is complex |
+| Seed test data for E2E tests                     | Yes (fixture)               | No                             | API seeding is fast and reliable                                   |
+| Test auth flows (login/logout/RBAC)              | Yes for token/session logic | Yes for UI flow                | Both matter: API protects resources, UI guides users               |
+| Verify file upload processing                    | Yes                         | Only if testing file picker UI | API test validates backend processing                              |
+| Contract/schema regression testing               | Yes                         | No                             | Schema tests run in milliseconds                                   |
+| Test third-party webhook handling                | Yes                         | No                             | Webhooks are API-to-API; no UI involved                            |
+| Verify redirect behavior after action            | No                          | Yes                            | Redirects are browser/navigation concerns                          |
+| Test real-time updates (WebSocket + API trigger) | API triggers                | E2E verifies                   | Seed via API, observe in browser                                   |
+
+## Anti-Patterns
+
+| Don't Do This                                        | Problem                                                                                | Do This Instead                                                   |
+| ---------------------------------------------------- | -------------------------------------------------------------------------------------- | ----------------------------------------------------------------- |
+| Use E2E tests to validate pure API responses         | Slow, flaky, launches a browser for no reason                                          | Use `request` fixture — no browser, direct HTTP                   |
+| Ignore `response.status()`                           | A 500 with a fallback body can pass all body assertions                                | Always assert status first: `expect(response.status()).toBe(200)` |
+| Skip response header checks                          | Missing `Content-Type`, `Cache-Control`, CORS headers cause production bugs            | Assert critical headers                                           |
+| Only test the happy path                             | Real users trigger 400, 401, 403, 404, 409, 422 — every one needs a test               | Dedicate a `describe` block to error responses                    |
+| Hardcode IDs in API tests                            | Tests break when database is reset or IDs are reassigned                               | Create resources in the test, use returned IDs                    |
+| Share mutable state between tests                    | Tests that depend on execution order are flaky and cannot run in parallel              | Each test creates and cleans up its own data                      |
+| Parse `response.text()` then `JSON.parse()` manually | Playwright's `response.json()` handles this and throws clear errors on non-JSON        | Use `await response.json()`                                       |
+| Forget cleanup after creating resources              | Test pollution: subsequent tests may see stale data or hit unique constraints          | Use fixtures with teardown or explicit `delete` calls             |
+| Use `page.request` when you don't need a page        | `page.request` shares cookies with the browser context, which may cause auth confusion | Use the standalone `request` fixture for pure API tests           |
+
+## Troubleshooting
+
+### "Request failed: connect ECONNREFUSED 127.0.0.1:3000"
+
+**Cause**: The API server is not running, or `baseURL` points to the wrong host/port.
+
+**Fix**: Verify the server is running before tests. Use `webServer` in config to start it automatically.
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  webServer: {
+    command: "npm run start:api",
+    url: "http://localhost:3000/api/health",
+    reuseExistingServer: !process.env.CI,
+  },
+  use: { baseURL: "http://localhost:3000" },
+});
+```
+
+### "response.json() failed — body is not valid JSON"
+
+**Cause**: The endpoint returned HTML (error page), plain text, or an empty body instead of JSON.
+
+**Fix**: Check `response.status()` first — a 500 or 302 often returns HTML. Log `await response.text()` to see the actual body. Verify the `Accept: application/json` header is set.
+
+```typescript
+const resp = await request.get("/api/endpoint");
+if (!resp.ok()) {
+  console.error(`Status: ${resp.status()}, Body: ${await resp.text()}`);
+}
+const body = await resp.json();
+```
+
+### "401 Unauthorized" when using `request` fixture
+
+**Cause**: The built-in `request` fixture does not carry browser cookies or auth tokens automatically.
+
+**Fix**: Set `extraHTTPHeaders` in config or create a custom authenticated fixture. If you need cookies from a browser login, use `page.request` instead.
+
+```typescript
+// Option A: config-level headers
+export default defineConfig({
+  use: {
+    extraHTTPHeaders: { Authorization: `Bearer ${process.env.API_TOKEN}` },
+  },
+});
+
+// Option B: per-request headers
+const resp = await request.get("/api/resource", {
+  headers: { Authorization: `Bearer ${token}` },
+});
+
+// Option C: use page.request to inherit browser cookies
+test("API call with browser auth", async ({ page }) => {
+  await page.goto("/login");
+  // ... login via UI ...
+  const resp = await page.request.get("/api/profile");
+  expect(resp.ok()).toBeTruthy();
+});
+```
+
+### Tests pass locally but fail in CI
+
+**Cause**: Different environments, database state, or missing environment variables.
+
+**Fix**: Use `process.env` for secrets and base URLs. Run database seeds or migrations in `globalSetup`. Use unique identifiers (timestamps, UUIDs) for test data. Check that the CI `baseURL` matches the deployed service.
diff --git a/.cursor/skills/playwright-testing/testing-patterns/browser-extensions.md b/.cursor/skills/playwright-testing/testing-patterns/browser-extensions.md
new file mode 100644
index 0000000000..f3991fbb96
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/browser-extensions.md
@@ -0,0 +1,506 @@
+# Browser Extension Testing
+
+## Table of Contents
+
+1. [Setup & Configuration](#setup--configuration)
+2. [Loading Extensions](#loading-extensions)
+3. [Popup Testing](#popup-testing)
+4. [Background Script Testing](#background-script-testing)
+5. [Content Script Testing](#content-script-testing)
+6. [Extension APIs](#extension-apis)
+7. [Cross-Browser Testing](#cross-browser-testing)
+
+## Setup & Configuration
+
+### Prerequisites
+
+```bash
+npm install -D @playwright/test
+npx playwright install chromium  # Extensions only work in Chromium
+```
+
+### Basic Configuration
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+import path from "path";
+
+export default defineConfig({
+  testDir: "./tests",
+  use: {
+    // Extensions require non-headless Chromium
+    headless: false,
+  },
+  projects: [
+    {
+      name: "chromium-extension",
+      use: {
+        browserName: "chromium",
+      },
+    },
+  ],
+});
+```
+
+### Extension Fixture
+
+```typescript
+// fixtures/extension.ts
+import { test as base, chromium, BrowserContext, Page } from "@playwright/test";
+import path from "path";
+
+type ExtensionFixtures = {
+  context: BrowserContext;
+  extensionId: string;
+  backgroundPage: Page;
+};
+
+export const test = base.extend<ExtensionFixtures>({
+  context: async ({}, use) => {
+    const pathToExtension = path.join(__dirname, "../extension");
+
+    const context = await chromium.launchPersistentContext("", {
+      headless: false,
+      args: [
+        `--disable-extensions-except=${pathToExtension}`,
+        `--load-extension=${pathToExtension}`,
+      ],
+    });
+
+    await use(context);
+    await context.close();
+  },
+
+  extensionId: async ({ context }, use) => {
+    // Get extension ID from service worker URL
+    let extensionId = "";
+
+    // Wait for service worker to be registered
+    const serviceWorker =
+      context.serviceWorkers()[0] ||
+      (await context.waitForEvent("serviceworker"));
+
+    extensionId = serviceWorker.url().split("/")[2];
+
+    await use(extensionId);
+  },
+
+  backgroundPage: async ({ context }, use) => {
+    // For Manifest V2 extensions
+    const backgroundPage =
+      context.backgroundPages()[0] ||
+      (await context.waitForEvent("backgroundpage"));
+
+    await use(backgroundPage);
+  },
+});
+
+export { expect } from "@playwright/test";
+```
+
+## Loading Extensions
+
+### Manifest V3 (Service Worker)
+
+```typescript
+test("load MV3 extension", async () => {
+  const pathToExtension = path.join(__dirname, "../my-extension");
+
+  const context = await chromium.launchPersistentContext("", {
+    headless: false,
+    args: [
+      `--disable-extensions-except=${pathToExtension}`,
+      `--load-extension=${pathToExtension}`,
+    ],
+  });
+
+  // Wait for service worker
+  const serviceWorker = await context.waitForEvent("serviceworker");
+  expect(serviceWorker.url()).toContain("chrome-extension://");
+
+  await context.close();
+});
+```
+
+### Manifest V2 (Background Page)
+
+```typescript
+test("load MV2 extension", async () => {
+  const pathToExtension = path.join(__dirname, "../my-extension-v2");
+
+  const context = await chromium.launchPersistentContext("", {
+    headless: false,
+    args: [
+      `--disable-extensions-except=${pathToExtension}`,
+      `--load-extension=${pathToExtension}`,
+    ],
+  });
+
+  // Wait for background page
+  const backgroundPage = await context.waitForEvent("backgroundpage");
+  expect(backgroundPage.url()).toContain("chrome-extension://");
+
+  await context.close();
+});
+```
+
+### Multiple Extensions
+
+```typescript
+test("load multiple extensions", async () => {
+  const extension1 = path.join(__dirname, "../extension1");
+  const extension2 = path.join(__dirname, "../extension2");
+
+  const context = await chromium.launchPersistentContext("", {
+    headless: false,
+    args: [
+      `--disable-extensions-except=${extension1},${extension2}`,
+      `--load-extension=${extension1},${extension2}`,
+    ],
+  });
+
+  // Both service workers should be available
+  await context.waitForEvent("serviceworker");
+  await context.waitForEvent("serviceworker");
+
+  expect(context.serviceWorkers().length).toBe(2);
+
+  await context.close();
+});
+```
+
+## Popup Testing
+
+### Opening Extension Popup
+
+```typescript
+test("test popup UI", async ({ context, extensionId }) => {
+  // Open popup directly by URL
+  const popupPage = await context.newPage();
+  await popupPage.goto(`chrome-extension://${extensionId}/popup.html`);
+
+  // Test popup interactions
+  await expect(popupPage.getByRole("heading")).toHaveText("My Extension");
+  await popupPage.getByRole("button", { name: "Enable" }).click();
+  await expect(popupPage.getByText("Enabled")).toBeVisible();
+});
+```
+
+### Popup State Persistence
+
+```typescript
+test("popup remembers state", async ({ context, extensionId }) => {
+  // First interaction
+  const popup1 = await context.newPage();
+  await popup1.goto(`chrome-extension://${extensionId}/popup.html`);
+  await popup1.getByRole("checkbox", { name: "Dark Mode" }).check();
+  await popup1.close();
+
+  // Reopen popup
+  const popup2 = await context.newPage();
+  await popup2.goto(`chrome-extension://${extensionId}/popup.html`);
+
+  // State should persist
+  await expect(
+    popup2.getByRole("checkbox", { name: "Dark Mode" }),
+  ).toBeChecked();
+});
+```
+
+### Popup Communication with Background
+
+```typescript
+test("popup sends message to background", async ({ context, extensionId }) => {
+  const popup = await context.newPage();
+  await popup.goto(`chrome-extension://${extensionId}/popup.html`);
+
+  // Set up listener for response
+  const responsePromise = popup.evaluate(() => {
+    return new Promise((resolve) => {
+      chrome.runtime.onMessage.addListener((message) => {
+        if (message.type === "RESPONSE") resolve(message.data);
+      });
+    });
+  });
+
+  // Click button that sends message
+  await popup.getByRole("button", { name: "Fetch Data" }).click();
+
+  // Verify response
+  const response = await responsePromise;
+  expect(response).toBeDefined();
+});
+```
+
+## Background Script Testing
+
+### Manifest V3 Service Worker
+
+```typescript
+test("service worker handles messages", async ({ context, extensionId }) => {
+  const page = await context.newPage();
+  await page.goto("https://example.com");
+
+  // Send message to service worker from page
+  const response = await page.evaluate(async (extId) => {
+    return new Promise((resolve) => {
+      chrome.runtime.sendMessage(extId, { type: "GET_STATUS" }, resolve);
+    });
+  }, extensionId);
+
+  expect(response).toEqual({ status: "active" });
+});
+```
+
+### Testing Background Logic
+
+```typescript
+test("background script logic", async ({ context }) => {
+  const serviceWorker =
+    context.serviceWorkers()[0] ||
+    (await context.waitForEvent("serviceworker"));
+
+  // Evaluate in service worker context
+  const result = await serviceWorker.evaluate(async () => {
+    // Access extension APIs
+    const storage = await chrome.storage.local.get("settings");
+    return storage;
+  });
+
+  expect(result.settings).toBeDefined();
+});
+```
+
+### Alarms and Timers
+
+```typescript
+test("alarm triggers correctly", async ({ context }) => {
+  const serviceWorker = await context.waitForEvent("serviceworker");
+
+  // Create alarm
+  await serviceWorker.evaluate(async () => {
+    await chrome.alarms.create("test-alarm", { delayInMinutes: 0.01 });
+  });
+
+  // Wait for alarm handler
+  await serviceWorker.evaluate(() => {
+    return new Promise<void>((resolve) => {
+      chrome.alarms.onAlarm.addListener((alarm) => {
+        if (alarm.name === "test-alarm") resolve();
+      });
+    });
+  });
+
+  // Verify alarm was handled (check side effects)
+  const wasHandled = await serviceWorker.evaluate(async () => {
+    const { alarmTriggered } = await chrome.storage.local.get("alarmTriggered");
+    return alarmTriggered;
+  });
+
+  expect(wasHandled).toBe(true);
+});
+```
+
+## Content Script Testing
+
+### Injected Content Script
+
+```typescript
+test("content script injects UI", async ({ context }) => {
+  const page = await context.newPage();
+  await page.goto("https://example.com");
+
+  // Wait for content script to inject elements
+  await expect(page.locator("#my-extension-widget")).toBeVisible();
+
+  // Interact with injected UI
+  await page.locator("#my-extension-widget button").click();
+  await expect(page.locator("#my-extension-widget .result")).toHaveText(
+    "Success",
+  );
+});
+```
+
+### Content Script Communication
+
+```typescript
+test("content script communicates with background", async ({
+  context,
+  extensionId,
+}) => {
+  const page = await context.newPage();
+  await page.goto("https://example.com");
+
+  // Trigger content script action
+  await page.locator("#my-extension-button").click();
+
+  // Wait for background response reflected in UI
+  await expect(page.locator("#my-extension-status")).toHaveText("Connected");
+});
+```
+
+### Page Modification Testing
+
+```typescript
+test("content script modifies page", async ({ context }) => {
+  const page = await context.newPage();
+  await page.goto("https://example.com");
+
+  // Verify content script modifications
+  const hasModification = await page.evaluate(() => {
+    // Check for injected styles
+    const styles = document.querySelectorAll('style[data-extension="my-ext"]');
+    return styles.length > 0;
+  });
+
+  expect(hasModification).toBe(true);
+
+  // Check DOM modifications
+  const modifiedElements = await page
+    .locator("[data-modified-by-extension]")
+    .count();
+  expect(modifiedElements).toBeGreaterThan(0);
+});
+```
+
+## Extension APIs
+
+### Storage API
+
+```typescript
+test("chrome.storage operations", async ({ context }) => {
+  const serviceWorker = await context.waitForEvent("serviceworker");
+
+  // Set storage
+  await serviceWorker.evaluate(async () => {
+    await chrome.storage.local.set({ key: "value", count: 42 });
+  });
+
+  // Get storage
+  const data = await serviceWorker.evaluate(async () => {
+    return await chrome.storage.local.get(["key", "count"]);
+  });
+
+  expect(data).toEqual({ key: "value", count: 42 });
+
+  // Test storage.sync
+  await serviceWorker.evaluate(async () => {
+    await chrome.storage.sync.set({ synced: true });
+  });
+
+  const syncData = await serviceWorker.evaluate(async () => {
+    return await chrome.storage.sync.get("synced");
+  });
+
+  expect(syncData.synced).toBe(true);
+});
+```
+
+### Tabs API
+
+```typescript
+test("chrome.tabs operations", async ({ context }) => {
+  const serviceWorker = await context.waitForEvent("serviceworker");
+
+  // Create a tab
+  const page = await context.newPage();
+  await page.goto("https://example.com");
+
+  // Query tabs from service worker
+  const tabs = await serviceWorker.evaluate(async () => {
+    return await chrome.tabs.query({ url: "*://example.com/*" });
+  });
+
+  expect(tabs.length).toBeGreaterThan(0);
+  expect(tabs[0].url).toContain("example.com");
+
+  // Send message to tab
+  await serviceWorker.evaluate(async (tabId) => {
+    await chrome.tabs.sendMessage(tabId, { type: "PING" });
+  }, tabs[0].id);
+});
+```
+
+### Context Menus
+
+```typescript
+test("context menu actions", async ({ context, extensionId }) => {
+  const serviceWorker = await context.waitForEvent("serviceworker");
+
+  // Create context menu
+  await serviceWorker.evaluate(async () => {
+    await chrome.contextMenus.create({
+      id: "test-menu",
+      title: "Test Action",
+      contexts: ["selection"],
+    });
+  });
+
+  // Simulate context menu click
+  const page = await context.newPage();
+  await page.goto("https://example.com");
+
+  // Select text
+  await page.evaluate(() => {
+    const range = document.createRange();
+    range.selectNodeContents(document.body.firstChild!);
+    window.getSelection()?.addRange(range);
+  });
+
+  // Trigger context menu action programmatically
+  await serviceWorker.evaluate(async () => {
+    // Simulate the click handler
+    chrome.contextMenus.onClicked.dispatch(
+      { menuItemId: "test-menu", selectionText: "selected text" },
+      { id: 1, url: "https://example.com" },
+    );
+  });
+});
+```
+
+### Permissions API
+
+```typescript
+test("request permissions", async ({ context, extensionId }) => {
+  const popup = await context.newPage();
+  await popup.goto(`chrome-extension://${extensionId}/popup.html`);
+
+  // Check current permissions
+  const hasPermission = await popup.evaluate(async () => {
+    return await chrome.permissions.contains({
+      origins: ["https://*.github.com/*"],
+    });
+  });
+
+  // Request new permission (will show prompt in real scenario)
+  // For testing, we check the request is made correctly
+  const permissionRequest = popup.evaluate(async () => {
+    try {
+      return await chrome.permissions.request({
+        origins: ["https://*.github.com/*"],
+      });
+    } catch (e) {
+      return false;
+    }
+  });
+
+  // In automated tests, permission prompts are typically auto-granted or mocked
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                   | Problem               | Solution                                 |
+| ------------------------------ | --------------------- | ---------------------------------------- |
+| Testing in headless mode       | Extensions don't load | Use `headless: false`                    |
+| Not waiting for service worker | Race conditions       | Wait for `serviceworker` event           |
+| Hardcoding extension ID        | ID changes on reload  | Extract ID from service worker URL       |
+| Testing packed extensions only | Slow iteration        | Test unpacked during development         |
+| Ignoring MV3 differences       | Breaking changes      | Test both MV2 and MV3 if supporting both |
+
+## Related References
+
+- **Service Workers**: See [service-workers.md](../browser-apis/service-workers.md) for SW testing patterns
+- **Multi-Context**: See [multi-context.md](../advanced/multi-context.md) for popup handling
+- **Browser APIs**: See [browser-apis.md](../browser-apis/browser-apis.md) for permissions testing
diff --git a/.cursor/skills/playwright-testing/testing-patterns/canvas-webgl.md b/.cursor/skills/playwright-testing/testing-patterns/canvas-webgl.md
new file mode 100644
index 0000000000..2d9aeb2e3b
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/canvas-webgl.md
@@ -0,0 +1,493 @@
+# Canvas & WebGL Testing
+
+## Table of Contents
+
+1. [Canvas Basics](#canvas-basics)
+2. [Visual Comparison](#visual-comparison)
+3. [Interaction Testing](#interaction-testing)
+4. [WebGL Testing](#webgl-testing)
+5. [Chart Libraries](#chart-libraries)
+6. [Game & Animation Testing](#game--animation-testing)
+
+## Canvas Basics
+
+### Locating Canvas Elements
+
+```typescript
+test("find canvas", async ({ page }) => {
+  await page.goto("/canvas-app");
+
+  // By tag
+  const canvas = page.locator("canvas");
+
+  // By ID or class
+  const gameCanvas = page.locator("canvas#game");
+  const chartCanvas = page.locator("canvas.chart-canvas");
+
+  // Verify canvas is present and visible
+  await expect(canvas).toBeVisible();
+
+  // Get canvas dimensions
+  const box = await canvas.boundingBox();
+  console.log(`Canvas size: ${box?.width}x${box?.height}`);
+});
+```
+
+### Canvas Screenshot Testing
+
+```typescript
+test("canvas renders correctly", async ({ page }) => {
+  await page.goto("/chart");
+
+  // Wait for canvas to be ready (check for specific content)
+  await page.waitForFunction(() => {
+    const canvas = document.querySelector("canvas");
+    const ctx = canvas?.getContext("2d");
+    // Check if canvas has been drawn to
+    return ctx && !isCanvasBlank(canvas);
+
+    function isCanvasBlank(canvas) {
+      const ctx = canvas.getContext("2d");
+      const data = ctx.getImageData(0, 0, canvas.width, canvas.height).data;
+      return !data.some((channel) => channel !== 0);
+    }
+  });
+
+  // Screenshot just the canvas
+  const canvas = page.locator("canvas");
+  await expect(canvas).toHaveScreenshot("chart.png");
+});
+```
+
+### Extracting Canvas Data
+
+```typescript
+test("verify canvas content", async ({ page }) => {
+  await page.goto("/drawing-app");
+
+  // Get canvas image data
+  const imageData = await page.evaluate(() => {
+    const canvas = document.querySelector("canvas") as HTMLCanvasElement;
+    return canvas.toDataURL("image/png");
+  });
+
+  // Verify it's not empty
+  expect(imageData).toMatch(/^data:image\/png;base64,.+/);
+
+  // Get pixel data at specific location
+  const pixelColor = await page.evaluate(() => {
+    const canvas = document.querySelector("canvas") as HTMLCanvasElement;
+    const ctx = canvas.getContext("2d")!;
+    const pixel = ctx.getImageData(100, 100, 1, 1).data;
+    return { r: pixel[0], g: pixel[1], b: pixel[2], a: pixel[3] };
+  });
+
+  // Verify specific pixel color
+  expect(pixelColor.r).toBeGreaterThan(200); // Expecting red-ish
+});
+```
+
+## Visual Comparison
+
+### Screenshot Assertions
+
+```typescript
+test("chart matches baseline", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  // Wait for chart animation to complete
+  await page.waitForTimeout(1000); // Or better: wait for specific state
+
+  // Full page screenshot
+  await expect(page).toHaveScreenshot("dashboard.png", {
+    maxDiffPixels: 100, // Allow small differences
+  });
+
+  // Just the canvas
+  const chart = page.locator("canvas#sales-chart");
+  await expect(chart).toHaveScreenshot("sales-chart.png", {
+    maxDiffPixelRatio: 0.01, // 1% difference allowed
+  });
+});
+```
+
+### Handling Animation
+
+```typescript
+test("animated canvas", async ({ page }) => {
+  await page.goto("/animated-chart");
+
+  // Pause animation before screenshot
+  await page.evaluate(() => {
+    // Common pattern: chart libraries expose pause method
+    window.chartInstance?.stop?.();
+
+    // Or override requestAnimationFrame
+    window.requestAnimationFrame = () => 0;
+  });
+
+  await expect(page.locator("canvas")).toHaveScreenshot();
+});
+
+test("wait for animation complete", async ({ page }) => {
+  await page.goto("/chart-with-animation");
+
+  // Wait for animation complete event
+  await page.evaluate(() => {
+    return new Promise<void>((resolve) => {
+      if (window.chart?.isAnimating === false) {
+        resolve();
+      } else {
+        window.chart?.on("animationComplete", resolve);
+      }
+    });
+  });
+
+  await expect(page.locator("canvas")).toHaveScreenshot();
+});
+```
+
+### Threshold Configuration
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  expect: {
+    toHaveScreenshot: {
+      // Increased threshold for canvas (anti-aliasing differences)
+      maxDiffPixelRatio: 0.02,
+      threshold: 0.3, // Per-pixel color threshold
+      animations: "disabled",
+    },
+  },
+});
+```
+
+## Interaction Testing
+
+### Click on Canvas
+
+```typescript
+test("click on canvas element", async ({ page }) => {
+  await page.goto("/interactive-map");
+
+  const canvas = page.locator("canvas");
+
+  // Click at specific coordinates
+  await canvas.click({ position: { x: 150, y: 200 } });
+
+  // Verify click was registered
+  await expect(page.locator("#info-panel")).toContainText("Location: Paris");
+});
+```
+
+### Drawing on Canvas
+
+```typescript
+test("draw on canvas", async ({ page }) => {
+  await page.goto("/whiteboard");
+
+  const canvas = page.locator("canvas");
+  const box = await canvas.boundingBox();
+
+  // Draw a line using mouse
+  await page.mouse.move(box!.x + 50, box!.y + 50);
+  await page.mouse.down();
+  await page.mouse.move(box!.x + 200, box!.y + 200, { steps: 10 });
+  await page.mouse.up();
+
+  // Verify something was drawn
+  const hasDrawing = await page.evaluate(() => {
+    const canvas = document.querySelector("canvas") as HTMLCanvasElement;
+    const ctx = canvas.getContext("2d")!;
+    const data = ctx.getImageData(0, 0, canvas.width, canvas.height).data;
+    return data.some((v, i) => i % 4 !== 3 && v !== 255); // Non-white pixels
+  });
+
+  expect(hasDrawing).toBe(true);
+});
+```
+
+### Drag and Drop
+
+```typescript
+test("drag canvas element", async ({ page }) => {
+  await page.goto("/diagram-editor");
+
+  const canvas = page.locator("canvas");
+  const box = await canvas.boundingBox();
+
+  // Drag shape from position A to B
+  await page.mouse.move(box!.x + 100, box!.y + 100);
+  await page.mouse.down();
+  await page.mouse.move(box!.x + 300, box!.y + 200, { steps: 20 });
+  await page.mouse.up();
+
+  // Verify via screenshot or state check
+  await expect(canvas).toHaveScreenshot("shape-moved.png");
+});
+```
+
+### Touch Gestures on Canvas
+
+```typescript
+test("pinch zoom on canvas", async ({ page }) => {
+  await page.goto("/map");
+
+  const canvas = page.locator("canvas");
+  const box = await canvas.boundingBox();
+  const centerX = box!.x + box!.width / 2;
+  const centerY = box!.y + box!.height / 2;
+
+  // Simulate pinch zoom using two touch points
+  await page.touchscreen.tap(centerX, centerY);
+
+  // Use evaluate for complex gestures
+  await page.evaluate(
+    async ({ x, y }) => {
+      const target = document.querySelector("canvas")!;
+
+      // Simulate pinch start
+      const touch1 = new Touch({
+        identifier: 1,
+        target,
+        clientX: x - 50,
+        clientY: y,
+      });
+      const touch2 = new Touch({
+        identifier: 2,
+        target,
+        clientX: x + 50,
+        clientY: y,
+      });
+
+      target.dispatchEvent(
+        new TouchEvent("touchstart", {
+          touches: [touch1, touch2],
+          targetTouches: [touch1, touch2],
+          bubbles: true,
+        }),
+      );
+
+      // Simulate pinch out
+      const touch1End = new Touch({
+        identifier: 1,
+        target,
+        clientX: x - 100,
+        clientY: y,
+      });
+      const touch2End = new Touch({
+        identifier: 2,
+        target,
+        clientX: x + 100,
+        clientY: y,
+      });
+
+      target.dispatchEvent(
+        new TouchEvent("touchmove", {
+          touches: [touch1End, touch2End],
+          targetTouches: [touch1End, touch2End],
+          bubbles: true,
+        }),
+      );
+
+      target.dispatchEvent(new TouchEvent("touchend", { bubbles: true }));
+    },
+    { x: centerX, y: centerY },
+  );
+
+  // Verify zoom level changed
+  const zoomLevel = await page.locator("#zoom-indicator").textContent();
+  expect(parseFloat(zoomLevel!)).toBeGreaterThan(1);
+});
+```
+
+## WebGL Testing
+
+### Checking WebGL Support
+
+```typescript
+test("WebGL is supported", async ({ page }) => {
+  await page.goto("/3d-viewer");
+
+  const hasWebGL = await page.evaluate(() => {
+    const canvas = document.createElement("canvas");
+    const gl =
+      canvas.getContext("webgl") || canvas.getContext("experimental-webgl");
+    return !!gl;
+  });
+
+  expect(hasWebGL).toBe(true);
+});
+```
+
+### WebGL Screenshot Testing
+
+```typescript
+test("3D scene renders", async ({ page }) => {
+  await page.goto("/3d-model-viewer");
+
+  // Wait for WebGL scene to render
+  await page.waitForFunction(() => {
+    const canvas = document.querySelector("canvas");
+    if (!canvas) return false;
+
+    const gl = canvas.getContext("webgl") || canvas.getContext("webgl2");
+    if (!gl) return false;
+
+    // Check if something has been drawn
+    const pixels = new Uint8Array(4);
+    gl.readPixels(
+      canvas.width / 2,
+      canvas.height / 2,
+      1,
+      1,
+      gl.RGBA,
+      gl.UNSIGNED_BYTE,
+      pixels,
+    );
+    return pixels.some((p) => p > 0);
+  });
+
+  // Screenshot comparison (higher threshold for WebGL)
+  await expect(page.locator("canvas")).toHaveScreenshot("3d-scene.png", {
+    maxDiffPixelRatio: 0.05, // WebGL can have more variation
+  });
+});
+```
+
+### Testing Three.js Applications
+
+```typescript
+test("Three.js scene interaction", async ({ page }) => {
+  await page.goto("/three-demo");
+
+  // Wait for scene to be ready
+  await page.waitForFunction(() => window.scene?.children?.length > 0);
+
+  // Interact with scene (orbit controls)
+  const canvas = page.locator("canvas");
+  const box = await canvas.boundingBox();
+
+  // Rotate camera by dragging
+  await page.mouse.move(box!.x + box!.width / 2, box!.y + box!.height / 2);
+  await page.mouse.down();
+  await page.mouse.move(
+    box!.x + box!.width / 2 + 100,
+    box!.y + box!.height / 2,
+    {
+      steps: 10,
+    },
+  );
+  await page.mouse.up();
+
+  // Verify camera position changed
+  const cameraRotation = await page.evaluate(() => {
+    return window.camera?.rotation?.y;
+  });
+
+  expect(cameraRotation).not.toBe(0);
+});
+```
+
+## Chart Libraries
+
+### Chart.js Testing
+
+```typescript
+test("Chart.js renders data", async ({ page }) => {
+  await page.goto("/chartjs-demo");
+
+  // Wait for Chart.js to initialize
+  await page.waitForFunction(() => {
+    return window.Chart && document.querySelector("canvas")?.__chart__;
+  });
+
+  // Get chart data via Chart.js API
+  const chartData = await page.evaluate(() => {
+    const canvas = document.querySelector("canvas") as any;
+    const chart = canvas.__chart__;
+    return chart.data.datasets[0].data;
+  });
+
+  expect(chartData).toEqual([12, 19, 3, 5, 2, 3]);
+
+  // Screenshot test
+  await expect(page.locator("canvas")).toHaveScreenshot();
+});
+```
+
+### D3.js / ECharts Testing
+
+```typescript
+test("chart library interaction", async ({ page }) => {
+  await page.goto("/chart-demo");
+
+  // Wait for chart to render
+  await page.waitForFunction(() => document.querySelector("canvas, svg.chart"));
+
+  // For SVG charts (D3)
+  const bars = page.locator("svg.chart rect.bar");
+  if ((await bars.count()) > 0) {
+    await bars.first().hover();
+    await expect(page.locator(".tooltip")).toBeVisible();
+  }
+
+  // For canvas charts (ECharts, Chart.js)
+  const canvas = page.locator("canvas");
+  await canvas.click({ position: { x: 200, y: 150 } });
+});
+```
+
+## Game & Animation Testing
+
+### Frame-by-Frame Testing
+
+```typescript
+test("game frame control", async ({ page }) => {
+  await page.goto("/game");
+
+  // Pause and step through frames
+  await page.evaluate(() => window.gameLoop?.pause());
+  await page.evaluate(() => window.gameLoop?.tick());
+  await expect(page.locator("canvas")).toHaveScreenshot("frame-1.png");
+
+  for (let i = 0; i < 10; i++) {
+    await page.evaluate(() => window.gameLoop?.tick());
+  }
+  await expect(page.locator("canvas")).toHaveScreenshot("frame-11.png");
+});
+```
+
+### Testing Game State
+
+```typescript
+test("game state changes", async ({ page }) => {
+  await page.goto("/game");
+
+  const initialScore = await page.evaluate(() => window.game?.score);
+  expect(initialScore).toBe(0);
+
+  await page.keyboard.press("Space"); // Action
+  await page.waitForTimeout(500);
+
+  const newScore = await page.evaluate(() => window.game?.score);
+  expect(newScore).toBeGreaterThan(0);
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern             | Problem                  | Solution                            |
+| ------------------------ | ------------------------ | ----------------------------------- |
+| Pixel-perfect assertions | Fails across browsers/OS | Use maxDiffPixelRatio threshold     |
+| Not waiting for render   | Blank canvas screenshots | Wait for draw completion            |
+| Testing raw pixel data   | Brittle and slow         | Use visual comparison               |
+| Ignoring animation       | Flaky screenshots        | Pause/disable animations            |
+| Hardcoded coordinates    | Breaks on resize         | Calculate relative to canvas bounds |
+
+## Related References
+
+- **Visual Testing**: See [test-suite-structure.md](../core/test-suite-structure.md) for visual regression setup
+- **Mobile Gestures**: See [mobile-testing.md](../advanced/mobile-testing.md) for touch interactions
+- **Performance**: See [performance-testing.md](performance-testing.md) for FPS monitoring
diff --git a/.cursor/skills/playwright-testing/testing-patterns/component-testing.md b/.cursor/skills/playwright-testing/testing-patterns/component-testing.md
new file mode 100644
index 0000000000..71c0dff0fc
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/component-testing.md
@@ -0,0 +1,438 @@
+# Component Testing
+
+## Table of Contents
+
+1. [Setup & Configuration](#setup--configuration)
+2. [Mounting Components](#mounting-components)
+3. [Props & State Testing](#props--state-testing)
+4. [Events & Interactions](#events--interactions)
+5. [Slots & Children](#slots--children)
+6. [Mocking Dependencies](#mocking-dependencies)
+7. [Framework-Specific Patterns](#framework-specific-patterns)
+
+## Setup & Configuration
+
+### Installation
+
+```bash
+# React
+npm init playwright@latest -- --ct
+```
+
+### Configuration
+
+```typescript
+// playwright-ct.config.ts
+import { defineConfig, devices } from "@playwright/experimental-ct-react";
+
+export default defineConfig({
+  testDir: "./tests/components",
+  snapshotDir: "./tests/components/__snapshots__",
+
+  use: {
+    ctPort: 3100,
+    ctViteConfig: {
+      resolve: {
+        alias: {
+          "@": "/src",
+        },
+      },
+    },
+  },
+
+  projects: [
+    { name: "chromium", use: { ...devices["Desktop Chrome"] } },
+    { name: "firefox", use: { ...devices["Desktop Firefox"] } },
+    { name: "webkit", use: { ...devices["Desktop Safari"] } },
+  ],
+});
+```
+
+### Project Structure
+
+```
+src/
+  components/
+    Button.tsx
+    Modal.tsx
+tests/
+  components/
+    Button.spec.tsx
+    Modal.spec.tsx
+playwright/
+  index.html    # CT entry point
+  index.tsx     # CT setup (providers, styles)
+```
+
+## Mounting Components
+
+### Basic Mount
+
+```tsx
+// Button.spec.tsx
+import { test, expect } from "@playwright/experimental-ct-react";
+import { Button } from "@/components/Button";
+
+test("renders button with text", async ({ mount }) => {
+  const component = await mount(<Button>Click me</Button>);
+
+  await expect(component).toContainText("Click me");
+  await expect(component).toBeVisible();
+});
+```
+
+### Mount with Props
+
+```tsx
+test("renders with all props", async ({ mount }) => {
+  const component = await mount(
+    <Button variant="primary" size="large" disabled={false} icon="check">
+      Submit
+    </Button>,
+  );
+
+  await expect(component).toHaveClass(/primary/);
+  await expect(component).toHaveClass(/large/);
+  await expect(component.locator("svg")).toBeVisible(); // icon
+});
+```
+
+### Mount with Wrapper/Provider
+
+```tsx
+// playwright/index.tsx - Global providers
+import { ThemeProvider } from "@/providers/theme";
+import { QueryClientProvider } from "@tanstack/react-query";
+import "@/styles/globals.css";
+
+export default function PlaywrightWrapper({ children }) {
+  return (
+    <QueryClientProvider client={queryClient}>
+      <ThemeProvider>{children}</ThemeProvider>
+    </QueryClientProvider>
+  );
+}
+```
+
+```tsx
+// Or per-test wrapper
+test("with custom provider", async ({ mount }) => {
+  const component = await mount(
+    <AuthProvider initialUser={{ name: "Test" }}>
+      <UserProfile />
+    </AuthProvider>,
+  );
+
+  await expect(component.getByText("Test")).toBeVisible();
+});
+```
+
+## Props & State Testing
+
+### Testing Prop Variations
+
+```tsx
+test.describe("Button variants", () => {
+  const variants = ["primary", "secondary", "danger", "ghost"] as const;
+
+  for (const variant of variants) {
+    test(`renders ${variant} variant`, async ({ mount }) => {
+      const component = await mount(<Button variant={variant}>Button</Button>);
+      await expect(component).toHaveClass(new RegExp(variant));
+    });
+  }
+});
+```
+
+### Updating Props
+
+```tsx
+test("responds to prop changes", async ({ mount }) => {
+  const component = await mount(<Counter initialCount={0} />);
+
+  await expect(component.getByTestId("count")).toHaveText("0");
+
+  // Update props
+  await component.update(<Counter initialCount={10} />);
+  await expect(component.getByTestId("count")).toHaveText("10");
+});
+```
+
+### Testing Controlled Components
+
+```tsx
+test("controlled input", async ({ mount }) => {
+  let externalValue = "";
+
+  const component = await mount(
+    <Input
+      value={externalValue}
+      onChange={(e) => {
+        externalValue = e.target.value;
+      }}
+    />,
+  );
+
+  await component.locator("input").fill("hello");
+
+  // For controlled components, update with new value
+  await component.update(
+    <Input value="hello" onChange={(e) => (externalValue = e.target.value)} />,
+  );
+
+  await expect(component.locator("input")).toHaveValue("hello");
+});
+```
+
+### Testing Internal State
+
+```tsx
+test("internal state updates", async ({ mount }) => {
+  const component = await mount(<Toggle defaultChecked={false} />);
+
+  // Initial state
+  await expect(component.locator('[role="switch"]')).toHaveAttribute(
+    "aria-checked",
+    "false",
+  );
+
+  // Trigger state change
+  await component.click();
+
+  // Verify state updated
+  await expect(component.locator('[role="switch"]')).toHaveAttribute(
+    "aria-checked",
+    "true",
+  );
+});
+```
+
+## Events & Interactions
+
+### Testing Click Events
+
+```tsx
+test("click event fires", async ({ mount }) => {
+  let clicked = false;
+
+  const component = await mount(
+    <Button onClick={() => (clicked = true)}>Click</Button>,
+  );
+
+  await component.click();
+
+  expect(clicked).toBe(true);
+});
+```
+
+### Testing Event Payloads
+
+```tsx
+test("onChange provides correct value", async ({ mount }) => {
+  const values: string[] = [];
+
+  const component = await mount(
+    <Select
+      options={["a", "b", "c"]}
+      onChange={(value) => values.push(value)}
+    />,
+  );
+
+  await component.getByRole("combobox").click();
+  await component.getByRole("option", { name: "b" }).click();
+
+  expect(values).toEqual(["b"]);
+});
+```
+
+### Testing Form Submission
+
+```tsx
+test("form submission", async ({ mount }) => {
+  let submittedData: FormData | null = null;
+
+  const component = await mount(
+    <LoginForm
+      onSubmit={(data) => {
+        submittedData = data;
+      }}
+    />,
+  );
+
+  await component.getByLabel("Email").fill("test@example.com");
+  await component.getByLabel("Password").fill("secret123");
+  await component.getByRole("button", { name: "Sign in" }).click();
+
+  expect(submittedData).toEqual({
+    email: "test@example.com",
+    password: "secret123",
+  });
+});
+```
+
+### Testing Keyboard Interactions
+
+```tsx
+test("keyboard navigation", async ({ mount }) => {
+  const component = await mount(
+    <Dropdown options={["Apple", "Banana", "Cherry"]} />,
+  );
+
+  // Open dropdown
+  await component.getByRole("button").click();
+
+  // Navigate with keyboard
+  await component.press("ArrowDown");
+  await component.press("ArrowDown");
+  await component.press("Enter");
+
+  await expect(component.getByRole("button")).toHaveText("Banana");
+});
+```
+
+## Slots & Children
+
+### Testing Children Content
+
+```tsx
+test("renders children", async ({ mount }) => {
+  const component = await mount(
+    <Card>
+      <h2>Title</h2>
+      <p>Description</p>
+    </Card>,
+  );
+
+  await expect(component.getByRole("heading")).toHaveText("Title");
+  await expect(component.getByText("Description")).toBeVisible();
+});
+```
+
+### Testing Render Props
+
+```tsx
+test("render prop pattern", async ({ mount }) => {
+  const component = await mount(
+    <DataFetcher url="/api/users">
+      {({ data, loading }) =>
+        loading ? <span>Loading...</span> : <span>{data.name}</span>
+      }
+    </DataFetcher>,
+  );
+
+  // Initially loading
+  await expect(component.getByText("Loading...")).toBeVisible();
+
+  // After data loads
+  await expect(component.getByText(/User/)).toBeVisible();
+});
+```
+
+## Mocking Dependencies
+
+### Mocking Imports
+
+```tsx
+// playwright/index.tsx - Mock at setup level
+import { beforeMount } from "@playwright/experimental-ct-react/hooks";
+
+beforeMount(async ({ hooksConfig }) => {
+  // Mock analytics
+  window.analytics = {
+    track: () => {},
+    identify: () => {},
+  };
+
+  // Mock feature flags
+  if (hooksConfig?.featureFlags) {
+    window.__FEATURE_FLAGS__ = hooksConfig.featureFlags;
+  }
+});
+```
+
+```tsx
+// Test with mocked config
+test("with feature flag", async ({ mount }) => {
+  const component = await mount(<FeatureComponent />, {
+    hooksConfig: {
+      featureFlags: { newFeature: true },
+    },
+  });
+
+  await expect(component.getByText("New Feature")).toBeVisible();
+});
+```
+
+### Mocking API Calls
+
+```tsx
+test("component with API", async ({ mount, page }) => {
+  // Mock API before mounting
+  await page.route("**/api/user", (route) => {
+    route.fulfill({
+      json: { id: 1, name: "Test User" },
+    });
+  });
+
+  const component = await mount(<UserProfile userId={1} />);
+
+  await expect(component.getByText("Test User")).toBeVisible();
+});
+```
+
+### Mocking Hooks
+
+```tsx
+// Mock custom hook via module mock
+test("with mocked hook", async ({ mount }) => {
+  const component = await mount(<Dashboard />, {
+    hooksConfig: {
+      mockAuth: { user: { name: "Admin" }, isAdmin: true },
+    },
+  });
+
+  await expect(component.getByText("Admin Panel")).toBeVisible();
+});
+```
+
+## Framework-Specific Patterns
+
+### React Testing
+
+```tsx
+// React with refs
+test("exposes ref methods", async ({ mount }) => {
+  let inputRef: HTMLInputElement | null = null;
+
+  const component = await mount(<Input ref={(el) => (inputRef = el)} />);
+
+  await component.locator("input").fill("test");
+  expect(inputRef?.value).toBe("test");
+});
+
+// React with context
+test("uses context", async ({ mount }) => {
+  const component = await mount(
+    <UserContext.Provider value={{ name: "Test" }}>
+      <UserGreeting />
+    </UserContext.Provider>,
+  );
+
+  await expect(component).toContainText("Hello, Test");
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                   | Problem             | Solution                          |
+| ------------------------------ | ------------------- | --------------------------------- |
+| Testing implementation details | Brittle tests       | Test behavior, not internal state |
+| Snapshot testing everything    | Maintenance burden  | Use for visual regression only    |
+| Not isolating components       | Hidden dependencies | Mock all external dependencies    |
+| Testing framework behavior     | Redundant           | Focus on your component logic     |
+| Skipping accessibility         | Misses real issues  | Include a11y checks in CT         |
+
+## Related References
+
+- **Accessibility**: See [accessibility.md](accessibility.md) for a11y testing in components
+- **Fixtures**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for shared test setup
diff --git a/.cursor/skills/playwright-testing/testing-patterns/drag-drop.md b/.cursor/skills/playwright-testing/testing-patterns/drag-drop.md
new file mode 100644
index 0000000000..4d4d0abc13
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/drag-drop.md
@@ -0,0 +1,576 @@
+# Drag and Drop Testing
+
+## Table of Contents
+
+1. [Kanban Board (Cross-Column Movement)](#kanban-board-cross-column-movement)
+2. [Sortable Lists (Reordering)](#sortable-lists-reordering)
+3. [Native HTML5 Drag and Drop](#native-html5-drag-and-drop)
+4. [File Drop Zone](#file-drop-zone)
+5. [Canvas Coordinate-Based Dragging](#canvas-coordinate-based-dragging)
+6. [Custom Drag Preview](#custom-drag-preview)
+7. [Variations](#variations)
+8. [Tips](#tips)
+
+> **When to use**: Testing drag-and-drop interactions — sortable lists, kanban boards, file drop zones, or repositionable elements.
+
+---
+
+## Kanban Board (Cross-Column Movement)
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test('moves card between columns', async ({ page }) => {
+  await page.goto('/board');
+
+  const backlog = page.locator('[data-column="backlog"]');
+  const active = page.locator('[data-column="active"]');
+
+  const ticket = backlog.getByText('Update API docs');
+  await expect(ticket).toBeVisible();
+
+  const backlogCountBefore = await backlog.getByRole('article').count();
+  const activeCountBefore = await active.getByRole('article').count();
+
+  await ticket.dragTo(active);
+
+  await expect(active.getByText('Update API docs')).toBeVisible();
+  await expect(backlog.getByText('Update API docs')).not.toBeVisible();
+
+  await expect(backlog.getByRole('article')).toHaveCount(backlogCountBefore - 1);
+  await expect(active.getByRole('article')).toHaveCount(activeCountBefore + 1);
+});
+
+test('progresses card through workflow stages', async ({ page }) => {
+  await page.goto('/board');
+
+  const cols = {
+    backlog: page.locator('[data-column="backlog"]'),
+    active: page.locator('[data-column="active"]'),
+    review: page.locator('[data-column="review"]'),
+    complete: page.locator('[data-column="complete"]'),
+  };
+
+  await cols.backlog.getByText('Update API docs').dragTo(cols.active);
+  await expect(cols.active.getByText('Update API docs')).toBeVisible();
+
+  await cols.active.getByText('Update API docs').dragTo(cols.review);
+  await expect(cols.review.getByText('Update API docs')).toBeVisible();
+
+  await cols.review.getByText('Update API docs').dragTo(cols.complete);
+  await expect(cols.complete.getByText('Update API docs')).toBeVisible();
+
+  await expect(cols.backlog.getByText('Update API docs')).not.toBeVisible();
+  await expect(cols.active.getByText('Update API docs')).not.toBeVisible();
+  await expect(cols.review.getByText('Update API docs')).not.toBeVisible();
+});
+
+test('reorders cards within same column', async ({ page }) => {
+  await page.goto('/board');
+
+  const backlog = page.locator('[data-column="backlog"]');
+
+  const itemX = backlog.getByRole('article').filter({ hasText: 'Item X' });
+  const itemZ = backlog.getByRole('article').filter({ hasText: 'Item Z' });
+
+  await itemZ.dragTo(itemX);
+
+  const cards = await backlog.getByRole('article').allTextContents();
+  expect(cards.indexOf('Item Z')).toBeLessThan(cards.indexOf('Item X'));
+});
+
+test('verifies drag persists via API', async ({ page }) => {
+  await page.goto('/board');
+
+  const backlog = page.locator('[data-column="backlog"]');
+  const active = page.locator('[data-column="active"]');
+
+  const responsePromise = page.waitForResponse(
+    (r) => r.url().includes('/api/tickets') && r.request().method() === 'PATCH'
+  );
+
+  await backlog.getByText('Update API docs').dragTo(active);
+
+  const response = await responsePromise;
+  expect(response.status()).toBe(200);
+
+  const body = await response.json();
+  expect(body.column).toBe('active');
+
+  await page.reload();
+  await expect(active.getByText('Update API docs')).toBeVisible();
+});
+```
+
+---
+
+## Sortable Lists (Reordering)
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test('reorders list items', async ({ page }) => {
+  await page.goto('/priorities');
+
+  const list = page.getByRole('list', { name: 'Priority list' });
+
+  const initial = await list.getByRole('listitem').allTextContents();
+  expect(initial[0]).toContain('Priority A');
+  expect(initial[1]).toContain('Priority B');
+  expect(initial[2]).toContain('Priority C');
+
+  const priorityC = list.getByRole('listitem').filter({ hasText: 'Priority C' });
+  const priorityA = list.getByRole('listitem').filter({ hasText: 'Priority A' });
+
+  await priorityC.dragTo(priorityA);
+
+  const reordered = await list.getByRole('listitem').allTextContents();
+  expect(reordered[0]).toContain('Priority C');
+  expect(reordered[1]).toContain('Priority A');
+  expect(reordered[2]).toContain('Priority B');
+});
+
+test('reorders via drag handle', async ({ page }) => {
+  await page.goto('/priorities');
+
+  const list = page.getByRole('list', { name: 'Priority list' });
+
+  const handle = list
+    .getByRole('listitem')
+    .filter({ hasText: 'Priority C' })
+    .getByRole('button', { name: /drag|reorder|grip/i });
+
+  const target = list.getByRole('listitem').filter({ hasText: 'Priority A' });
+
+  await handle.dragTo(target);
+
+  const items = await list.getByRole('listitem').allTextContents();
+  expect(items[0]).toContain('Priority C');
+});
+
+test('reorder persists after reload', async ({ page }) => {
+  await page.goto('/priorities');
+
+  const list = page.getByRole('list', { name: 'Priority list' });
+
+  const priorityC = list.getByRole('listitem').filter({ hasText: 'Priority C' });
+  const priorityA = list.getByRole('listitem').filter({ hasText: 'Priority A' });
+
+  await priorityC.dragTo(priorityA);
+
+  await page.waitForResponse((response) =>
+    response.url().includes('/api/priorities/reorder') && response.status() === 200
+  );
+
+  await page.reload();
+
+  const items = await list.getByRole('listitem').allTextContents();
+  expect(items[0]).toContain('Priority C');
+  expect(items[1]).toContain('Priority A');
+  expect(items[2]).toContain('Priority B');
+});
+```
+
+### Incremental Mouse Movement for Custom Libraries
+
+Some drag libraries (react-beautiful-dnd, dnd-kit) require incremental mouse movements:
+
+```typescript
+test('reorders with incremental mouse movements', async ({ page }) => {
+  await page.goto('/priorities');
+
+  const list = page.getByRole('list', { name: 'Priority list' });
+  const source = list.getByRole('listitem').filter({ hasText: 'Priority C' });
+  const target = list.getByRole('listitem').filter({ hasText: 'Priority A' });
+
+  const sourceBox = await source.boundingBox();
+  const targetBox = await target.boundingBox();
+
+  await source.hover();
+  await page.mouse.down();
+
+  const steps = 10;
+  for (let i = 1; i <= steps; i++) {
+    await page.mouse.move(
+      sourceBox!.x + sourceBox!.width / 2,
+      sourceBox!.y + (targetBox!.y - sourceBox!.y) * (i / steps),
+      { steps: 1 }
+    );
+  }
+
+  await page.mouse.up();
+
+  const items = await list.getByRole('listitem').allTextContents();
+  expect(items[0]).toContain('Priority C');
+});
+```
+
+---
+
+## Native HTML5 Drag and Drop
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test('drags item to drop zone', async ({ page }) => {
+  await page.goto('/drag-example');
+
+  const source = page.getByText('Movable Element');
+  const dropArea = page.locator('#target-zone');
+
+  await expect(source).toBeVisible();
+  await expect(dropArea).not.toContainText('Movable Element');
+
+  await source.dragTo(dropArea);
+
+  await expect(dropArea).toContainText('Movable Element');
+});
+
+test('drags between zones', async ({ page }) => {
+  await page.goto('/drag-example');
+
+  const item = page.locator('[data-testid="element-1"]');
+  const areaA = page.locator('[data-testid="area-a"]');
+  const areaB = page.locator('[data-testid="area-b"]');
+
+  await expect(areaA).toContainText('Element 1');
+
+  await item.dragTo(areaB);
+
+  await expect(areaB).toContainText('Element 1');
+  await expect(areaA).not.toContainText('Element 1');
+
+  await areaB.getByText('Element 1').dragTo(areaA);
+
+  await expect(areaA).toContainText('Element 1');
+  await expect(areaB).not.toContainText('Element 1');
+});
+
+test('verifies drag visual feedback', async ({ page }) => {
+  await page.goto('/drag-example');
+
+  const source = page.getByText('Movable Element');
+  const dropArea = page.locator('#target-zone');
+
+  await source.hover();
+  await page.mouse.down();
+
+  const dropBox = await dropArea.boundingBox();
+  await page.mouse.move(dropBox!.x + dropBox!.width / 2, dropBox!.y + dropBox!.height / 2);
+
+  await expect(dropArea).toHaveClass(/drag-over|highlight/);
+
+  await page.mouse.up();
+
+  await expect(dropArea).not.toHaveClass(/drag-over|highlight/);
+  await expect(dropArea).toContainText('Movable Element');
+});
+```
+
+---
+
+## File Drop Zone
+
+```typescript
+import { test, expect } from '@playwright/test';
+import path from 'path';
+
+test('uploads file via drop zone', async ({ page }) => {
+  await page.goto('/upload');
+
+  const dropZone = page.locator('[data-testid="file-drop-zone"]');
+
+  await expect(dropZone).toContainText('Drag files here');
+
+  const fileInput = page.locator('input[type="file"]');
+
+  await fileInput.setInputFiles(path.resolve(__dirname, '../fixtures/report.pdf'));
+
+  await expect(page.getByText('report.pdf')).toBeVisible();
+  await expect(page.getByText(/\d+ KB/)).toBeVisible();
+});
+
+test('simulates drag-over visual feedback', async ({ page }) => {
+  await page.goto('/upload');
+
+  const dropZone = page.locator('[data-testid="file-drop-zone"]');
+
+  await dropZone.dispatchEvent('dragenter', {
+    dataTransfer: { types: ['Files'] },
+  });
+
+  await expect(dropZone).toHaveClass(/drag-active|drop-highlight/);
+  await expect(dropZone).toContainText(/drop.*here|release.*upload/i);
+
+  await dropZone.dispatchEvent('dragleave');
+
+  await expect(dropZone).not.toHaveClass(/drag-active|drop-highlight/);
+});
+
+test('rejects invalid file types', async ({ page }) => {
+  await page.goto('/upload');
+
+  const fileInput = page.locator('input[type="file"]');
+
+  await fileInput.setInputFiles({
+    name: 'script.exe',
+    mimeType: 'application/x-msdownload',
+    buffer: Buffer.from('fake-content'),
+  });
+
+  await expect(page.getByRole('alert')).toContainText(/not allowed|invalid file type/i);
+  await expect(page.getByText('script.exe')).not.toBeVisible();
+});
+```
+
+---
+
+## Canvas Coordinate-Based Dragging
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test('drags element to specific coordinates', async ({ page }) => {
+  await page.goto('/design-tool');
+
+  const canvas = page.locator('#editor-canvas');
+  const shape = page.locator('[data-testid="shape-1"]');
+
+  const canvasBox = await canvas.boundingBox();
+  const targetX = canvasBox!.x + 300;
+  const targetY = canvasBox!.y + 200;
+
+  await shape.hover();
+  await page.mouse.down();
+  await page.mouse.move(targetX, targetY, { steps: 10 });
+  await page.mouse.up();
+
+  const newBox = await shape.boundingBox();
+  expect(newBox!.x).toBeCloseTo(targetX - newBox!.width / 2, -1);
+  expect(newBox!.y).toBeCloseTo(targetY - newBox!.height / 2, -1);
+});
+
+test('snaps element to grid', async ({ page }) => {
+  await page.goto('/design-tool');
+
+  const shape = page.locator('[data-testid="shape-1"]');
+  const canvas = page.locator('#editor-canvas');
+
+  const canvasBox = await canvas.boundingBox();
+
+  await shape.hover();
+  await page.mouse.down();
+  await page.mouse.move(canvasBox!.x + 147, canvasBox!.y + 83, { steps: 10 });
+  await page.mouse.up();
+
+  const snappedBox = await shape.boundingBox();
+  expect(snappedBox!.x % 20).toBeCloseTo(0, 0);
+  expect(snappedBox!.y % 20).toBeCloseTo(0, 0);
+});
+
+test('constrains drag within boundaries', async ({ page }) => {
+  await page.goto('/design-tool');
+
+  const shape = page.locator('[data-testid="bounded-shape"]');
+  const container = page.locator('#bounds-container');
+
+  const containerBox = await container.boundingBox();
+
+  await shape.hover();
+  await page.mouse.down();
+  await page.mouse.move(containerBox!.x + containerBox!.width + 500, containerBox!.y - 200, {
+    steps: 10,
+  });
+  await page.mouse.up();
+
+  const shapeBox = await shape.boundingBox();
+
+  expect(shapeBox!.x).toBeGreaterThanOrEqual(containerBox!.x);
+  expect(shapeBox!.y).toBeGreaterThanOrEqual(containerBox!.y);
+  expect(shapeBox!.x + shapeBox!.width).toBeLessThanOrEqual(
+    containerBox!.x + containerBox!.width
+  );
+  expect(shapeBox!.y + shapeBox!.height).toBeLessThanOrEqual(
+    containerBox!.y + containerBox!.height
+  );
+});
+
+test('resizes element via handle', async ({ page }) => {
+  await page.goto('/design-tool');
+
+  const shape = page.locator('[data-testid="shape-1"]');
+  await shape.click();
+
+  const resizeHandle = shape.locator('.resize-handle-se');
+  const handleBox = await resizeHandle.boundingBox();
+
+  const initialBox = await shape.boundingBox();
+
+  await resizeHandle.hover();
+  await page.mouse.down();
+  await page.mouse.move(handleBox!.x + 100, handleBox!.y + 80, { steps: 5 });
+  await page.mouse.up();
+
+  const newBox = await shape.boundingBox();
+  expect(newBox!.width).toBeCloseTo(initialBox!.width + 100, -1);
+  expect(newBox!.height).toBeCloseTo(initialBox!.height + 80, -1);
+});
+```
+
+---
+
+## Custom Drag Preview
+
+```typescript
+import { test, expect } from '@playwright/test';
+
+test('shows custom drag preview', async ({ page }) => {
+  await page.goto('/board');
+
+  const card = page.locator('[data-testid="ticket-1"]');
+  const targetCol = page.locator('[data-column="active"]');
+
+  const cardBox = await card.boundingBox();
+  const targetBox = await targetCol.boundingBox();
+
+  await card.hover();
+  await page.mouse.down();
+
+  const midX = (cardBox!.x + targetBox!.x) / 2;
+  const midY = (cardBox!.y + targetBox!.y) / 2;
+  await page.mouse.move(midX, midY, { steps: 5 });
+
+  await expect(page.locator('.drag-preview')).toBeVisible();
+  await expect(card).toHaveClass(/dragging|placeholder/);
+
+  await page.mouse.move(
+    targetBox!.x + targetBox!.width / 2,
+    targetBox!.y + targetBox!.height / 2,
+    { steps: 5 }
+  );
+  await page.mouse.up();
+
+  await expect(page.locator('.drag-preview')).not.toBeVisible();
+});
+
+test('multi-select drag shows item count', async ({ page }) => {
+  await page.goto('/board');
+
+  await page.locator('[data-testid="ticket-1"]').click();
+  await page.locator('[data-testid="ticket-2"]').click({ modifiers: ['Shift'] });
+  await page.locator('[data-testid="ticket-3"]').click({ modifiers: ['Shift'] });
+
+  const card = page.locator('[data-testid="ticket-1"]');
+  const targetCol = page.locator('[data-column="complete"]');
+
+  await card.hover();
+  await page.mouse.down();
+
+  const targetBox = await targetCol.boundingBox();
+  await page.mouse.move(targetBox!.x + 50, targetBox!.y + 50, { steps: 5 });
+
+  await expect(page.locator('.drag-preview')).toContainText('3 items');
+
+  await page.mouse.up();
+
+  await expect(targetCol.locator('[data-testid="ticket-1"]')).toBeVisible();
+  await expect(targetCol.locator('[data-testid="ticket-2"]')).toBeVisible();
+  await expect(targetCol.locator('[data-testid="ticket-3"]')).toBeVisible();
+});
+```
+
+---
+
+## Variations
+
+### Keyboard-Based Reordering
+
+```typescript
+test('reorders using keyboard', async ({ page }) => {
+  await page.goto('/priorities');
+
+  const list = page.getByRole('list', { name: 'Priority list' });
+  const priorityC = list.getByRole('listitem').filter({ hasText: 'Priority C' });
+
+  await priorityC.focus();
+  await page.keyboard.press('Space');
+
+  await page.keyboard.press('ArrowUp');
+  await page.keyboard.press('ArrowUp');
+
+  await page.keyboard.press('Space');
+
+  const items = await list.getByRole('listitem').allTextContents();
+  expect(items[0]).toContain('Priority C');
+});
+```
+
+### Cross-Frame Dragging
+
+```typescript
+test('drags between main page and iframe', async ({ page }) => {
+  await page.goto('/composer');
+
+  const sourceWidget = page.getByText('Component A');
+  const iframe = page.frameLocator('#preview-frame');
+  const iframeElement = page.locator('#preview-frame');
+
+  const sourceBox = await sourceWidget.boundingBox();
+  const iframeBox = await iframeElement.boundingBox();
+
+  const targetX = iframeBox!.x + 100;
+  const targetY = iframeBox!.y + 100;
+
+  await sourceWidget.hover();
+  await page.mouse.down();
+  await page.mouse.move(targetX, targetY, { steps: 20 });
+  await page.mouse.up();
+
+  await expect(iframe.getByText('Component A')).toBeVisible();
+});
+```
+
+### Touch-Based Drag on Mobile
+
+```typescript
+test('drags via touch events', async ({ page }) => {
+  await page.goto('/priorities');
+
+  const list = page.getByRole('list', { name: 'Priority list' });
+  const source = list.getByRole('listitem').filter({ hasText: 'Priority C' });
+  const target = list.getByRole('listitem').filter({ hasText: 'Priority A' });
+
+  const sourceBox = await source.boundingBox();
+  const targetBox = await target.boundingBox();
+
+  await source.dispatchEvent('touchstart', {
+    touches: [{ clientX: sourceBox!.x + 10, clientY: sourceBox!.y + 10 }],
+  });
+
+  for (let i = 1; i <= 5; i++) {
+    const y = sourceBox!.y + (targetBox!.y - sourceBox!.y) * (i / 5);
+    await source.dispatchEvent('touchmove', {
+      touches: [{ clientX: sourceBox!.x + 10, clientY: y }],
+    });
+  }
+
+  await source.dispatchEvent('touchend');
+
+  const items = await list.getByRole('listitem').allTextContents();
+  expect(items[0]).toContain('Priority C');
+});
+```
+
+---
+
+## Tips
+
+1. **Start with `dragTo()`, fall back to manual mouse events**. Playwright's `dragTo()` handles most HTML5 drag-and-drop. Use `page.mouse.down()` / `move()` / `up()` only for custom libraries (react-beautiful-dnd, dnd-kit, SortableJS) that need specific event sequences.
+
+2. **Add intermediate mouse steps for drag libraries**. Libraries like `react-beautiful-dnd` require multiple `mousemove` events. Use `{ steps: 10 }` or a manual loop — a single jump often fails silently.
+
+3. **Assert final state, not just the drop event**. Verify DOM reflects the change — item order, column contents, position coordinates. Visual feedback during drag is secondary to the persisted state.
+
+4. **Use `boundingBox()` for coordinate assertions**. For canvas editors or position-sensitive drops, capture bounding box after the operation and compare with `toBeCloseTo()` for tolerance.
+
+5. **Test undo after drag operations**. If your app supports Ctrl+Z, verify the drag is reversible — this catches state management bugs.
diff --git a/.cursor/skills/playwright-testing/testing-patterns/electron.md b/.cursor/skills/playwright-testing/testing-patterns/electron.md
new file mode 100644
index 0000000000..016f198a4a
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/electron.md
@@ -0,0 +1,509 @@
+# Electron Testing
+
+## Table of Contents
+
+1. [Setup & Configuration](#setup--configuration)
+2. [Launching Electron Apps](#launching-electron-apps)
+3. [Main Process Testing](#main-process-testing)
+4. [Renderer Process Testing](#renderer-process-testing)
+5. [IPC Communication](#ipc-communication)
+6. [Native Features](#native-features)
+7. [Packaging & Distribution](#packaging--distribution)
+
+## Setup & Configuration
+
+### Installation
+
+```bash
+npm install -D @playwright/test electron
+```
+
+### Basic Configuration
+
+```typescript
+// playwright.config.ts
+import { defineConfig } from "@playwright/test";
+
+export default defineConfig({
+  testDir: "./tests",
+  timeout: 30000,
+  use: {
+    trace: "on-first-retry",
+  },
+});
+```
+
+### Electron Test Fixture
+
+```typescript
+// fixtures/electron.ts
+import {
+  test as base,
+  _electron as electron,
+  ElectronApplication,
+  Page,
+} from "@playwright/test";
+
+type ElectronFixtures = {
+  electronApp: ElectronApplication;
+  window: Page;
+};
+
+export const test = base.extend<ElectronFixtures>({
+  electronApp: async ({}, use) => {
+    // Launch Electron app
+    const electronApp = await electron.launch({
+      args: [".", "--no-sandbox"],
+      env: {
+        ...process.env,
+        NODE_ENV: "test",
+      },
+    });
+
+    await use(electronApp);
+
+    // Cleanup
+    await electronApp.close();
+  },
+
+  window: async ({ electronApp }, use) => {
+    // Wait for first window
+    const window = await electronApp.firstWindow();
+
+    // Wait for app to be ready
+    await window.waitForLoadState("domcontentloaded");
+
+    await use(window);
+  },
+});
+
+export { expect } from "@playwright/test";
+```
+
+### Launch Options
+
+```typescript
+// Advanced launch configuration
+const electronApp = await electron.launch({
+  args: ["main.js", "--custom-flag"],
+  cwd: "/path/to/app",
+  env: {
+    ...process.env,
+    ELECTRON_ENABLE_LOGGING: "1",
+    NODE_ENV: "test",
+  },
+  timeout: 30000,
+  // For packaged apps
+  executablePath: "/path/to/MyApp.app/Contents/MacOS/MyApp",
+});
+```
+
+## Launching Electron Apps
+
+### Development Mode
+
+```typescript
+test("launch in dev mode", async () => {
+  const electronApp = await electron.launch({
+    args: ["."], // Points to package.json main
+  });
+
+  const window = await electronApp.firstWindow();
+  await expect(window.locator("h1")).toContainText("My App");
+
+  await electronApp.close();
+});
+```
+
+### Packaged Application
+
+```typescript
+test("launch packaged app", async () => {
+  const appPath =
+    process.platform === "darwin"
+      ? "/Applications/MyApp.app/Contents/MacOS/MyApp"
+      : process.platform === "win32"
+        ? "C:\\Program Files\\MyApp\\MyApp.exe"
+        : "/usr/bin/myapp";
+
+  const electronApp = await electron.launch({
+    executablePath: appPath,
+  });
+
+  const window = await electronApp.firstWindow();
+  await expect(window).toHaveTitle(/MyApp/);
+
+  await electronApp.close();
+});
+```
+
+### Multiple Windows
+
+```typescript
+test("handle multiple windows", async ({ electronApp }) => {
+  const mainWindow = await electronApp.firstWindow();
+
+  // Trigger new window
+  await mainWindow.getByRole("button", { name: "Open Settings" }).click();
+
+  // Wait for new window
+  const settingsWindow = await electronApp.waitForEvent("window");
+
+  // Both windows are now accessible
+  await expect(settingsWindow.locator("h1")).toHaveText("Settings");
+  await expect(mainWindow.locator("h1")).toHaveText("Main");
+
+  // Get all windows
+  const windows = electronApp.windows();
+  expect(windows.length).toBe(2);
+});
+```
+
+## Main Process Testing
+
+### Evaluate in Main Process
+
+```typescript
+test("access main process", async ({ electronApp }) => {
+  // Evaluate in main process context
+  const appPath = await electronApp.evaluate(async ({ app }) => {
+    return app.getAppPath();
+  });
+
+  expect(appPath).toContain("my-electron-app");
+});
+```
+
+### Access Electron APIs
+
+```typescript
+test("electron API access", async ({ electronApp }) => {
+  // Get app version
+  const version = await electronApp.evaluate(async ({ app }) => {
+    return app.getVersion();
+  });
+  expect(version).toMatch(/^\d+\.\d+\.\d+$/);
+
+  // Get platform info
+  const platform = await electronApp.evaluate(async ({ app }) => {
+    return process.platform;
+  });
+  expect(["darwin", "win32", "linux"]).toContain(platform);
+
+  // Check if app is ready
+  const isReady = await electronApp.evaluate(async ({ app }) => {
+    return app.isReady();
+  });
+  expect(isReady).toBe(true);
+});
+```
+
+### BrowserWindow Properties
+
+```typescript
+test("check window properties", async ({ electronApp, window }) => {
+  // Get BrowserWindow from main process
+  const windowBounds = await electronApp.evaluate(async ({ BrowserWindow }) => {
+    const win = BrowserWindow.getAllWindows()[0];
+    return win.getBounds();
+  });
+
+  expect(windowBounds.width).toBeGreaterThan(0);
+  expect(windowBounds.height).toBeGreaterThan(0);
+
+  // Check window state
+  const isMaximized = await electronApp.evaluate(async ({ BrowserWindow }) => {
+    const win = BrowserWindow.getAllWindows()[0];
+    return win.isMaximized();
+  });
+
+  // Check window title
+  const title = await electronApp.evaluate(async ({ BrowserWindow }) => {
+    const win = BrowserWindow.getAllWindows()[0];
+    return win.getTitle();
+  });
+  expect(title).toBeTruthy();
+});
+```
+
+## Renderer Process Testing
+
+### Standard Page Testing
+
+```typescript
+test("renderer interactions", async ({ window }) => {
+  // Standard Playwright page interactions
+  await window.getByRole("button", { name: "Click Me" }).click();
+  await expect(window.getByText("Clicked!")).toBeVisible();
+
+  // Fill forms
+  await window.getByLabel("Username").fill("testuser");
+  await window.getByLabel("Password").fill("password123");
+  await window.getByRole("button", { name: "Login" }).click();
+
+  // Verify navigation
+  await expect(window).toHaveURL(/dashboard/);
+});
+```
+
+### Access Node.js in Renderer
+
+```typescript
+test("node integration", async ({ window }) => {
+  // If nodeIntegration is enabled
+  const nodeVersion = await window.evaluate(() => {
+    return (window as any).process?.version;
+  });
+
+  // Check if Node APIs are available
+  const hasFs = await window.evaluate(() => {
+    return typeof (window as any).require === "function";
+  });
+});
+```
+
+### Context Isolation Testing
+
+```typescript
+test("context isolation", async ({ window }) => {
+  // Test preload script exposed APIs
+  const apiAvailable = await window.evaluate(() => {
+    return typeof (window as any).electronAPI !== "undefined";
+  });
+  expect(apiAvailable).toBe(true);
+
+  // Call exposed API
+  const result = await window.evaluate(async () => {
+    return await (window as any).electronAPI.getAppVersion();
+  });
+  expect(result).toMatch(/^\d+\.\d+\.\d+$/);
+});
+```
+
+## IPC Communication
+
+### Testing IPC from Renderer
+
+```typescript
+test("IPC invoke", async ({ window }) => {
+  // Test preload-exposed IPC call
+  const result = await window.evaluate(async () => {
+    return await (window as any).electronAPI.getData("user-settings");
+  });
+
+  expect(result).toHaveProperty("theme");
+});
+```
+
+### Testing IPC from Main Process
+
+```typescript
+test("main to renderer IPC", async ({ electronApp, window }) => {
+  // Set up listener in renderer
+  await window.evaluate(() => {
+    (window as any).receivedMessage = null;
+    (window as any).electronAPI.onMessage((msg: string) => {
+      (window as any).receivedMessage = msg;
+    });
+  });
+
+  // Send from main process
+  await electronApp.evaluate(async ({ BrowserWindow }) => {
+    const win = BrowserWindow.getAllWindows()[0];
+    win.webContents.send("message", "Hello from main!");
+  });
+
+  // Verify receipt
+  await window.waitForFunction(() => (window as any).receivedMessage !== null);
+  const message = await window.evaluate(() => (window as any).receivedMessage);
+  expect(message).toBe("Hello from main!");
+});
+```
+
+### Mock IPC Handlers
+
+```typescript
+// In test setup or fixture
+test("mock IPC handler", async ({ electronApp, window }) => {
+  // Override IPC handler in main process
+  await electronApp.evaluate(async ({ ipcMain }) => {
+    // Remove existing handler
+    ipcMain.removeHandler("fetch-data");
+
+    // Add mock handler
+    ipcMain.handle("fetch-data", async () => {
+      return { mocked: true, data: "test-data" };
+    });
+  });
+
+  // Test with mocked handler
+  const result = await window.evaluate(async () => {
+    return await (window as any).electronAPI.fetchData();
+  });
+
+  expect(result.mocked).toBe(true);
+});
+```
+
+## Native Features
+
+### File System Dialogs
+
+```typescript
+test("file dialog", async ({ electronApp, window }) => {
+  // Mock dialog response
+  await electronApp.evaluate(async ({ dialog }) => {
+    dialog.showOpenDialog = async () => ({
+      canceled: false,
+      filePaths: ["/mock/path/file.txt"],
+    });
+  });
+
+  // Trigger file open
+  await window.getByRole("button", { name: "Open File" }).click();
+
+  // Verify file was "opened"
+  await expect(window.getByText("file.txt")).toBeVisible();
+});
+
+test("save dialog", async ({ electronApp, window }) => {
+  await electronApp.evaluate(async ({ dialog }) => {
+    dialog.showSaveDialog = async () => ({
+      canceled: false,
+      filePath: "/mock/path/saved-file.txt",
+    });
+  });
+
+  await window.getByRole("button", { name: "Save" }).click();
+  await expect(window.getByText("Saved successfully")).toBeVisible();
+});
+```
+
+### Menu Testing
+
+```typescript
+test("application menu", async ({ electronApp }) => {
+  // Get menu structure
+  const menuLabels = await electronApp.evaluate(async ({ Menu }) => {
+    const menu = Menu.getApplicationMenu();
+    return menu?.items.map((item) => item.label) || [];
+  });
+
+  expect(menuLabels).toContain("File");
+  expect(menuLabels).toContain("Edit");
+
+  // Trigger menu action
+  await electronApp.evaluate(async ({ Menu }) => {
+    const menu = Menu.getApplicationMenu();
+    const fileMenu = menu?.items.find((item) => item.label === "File");
+    const newItem = fileMenu?.submenu?.items.find(
+      (item) => item.label === "New",
+    );
+    newItem?.click();
+  });
+});
+```
+
+### Native Notifications
+
+```typescript
+test("notifications", async ({ electronApp, window }) => {
+  // Mock Notification
+  let notificationShown = false;
+  await electronApp.evaluate(async ({ Notification }) => {
+    const OriginalNotification = Notification;
+    (global as any).Notification = class extends OriginalNotification {
+      constructor(options: any) {
+        super(options);
+        (global as any).lastNotification = options;
+      }
+    };
+  });
+
+  // Trigger notification
+  await window.getByRole("button", { name: "Notify" }).click();
+
+  // Verify notification was created
+  const notification = await electronApp.evaluate(async () => {
+    return (global as any).lastNotification;
+  });
+
+  expect(notification.title).toBe("New Message");
+});
+```
+
+### Clipboard
+
+```typescript
+test("clipboard operations", async ({ electronApp, window }) => {
+  // Write to clipboard
+  await electronApp.evaluate(async ({ clipboard }) => {
+    clipboard.writeText("Test clipboard content");
+  });
+
+  // Paste in app
+  await window.getByRole("textbox").focus();
+  await window.keyboard.press("ControlOrMeta+v");
+
+  // Read clipboard
+  const clipboardContent = await electronApp.evaluate(async ({ clipboard }) => {
+    return clipboard.readText();
+  });
+
+  expect(clipboardContent).toBe("Test clipboard content");
+});
+```
+
+## Packaging & Distribution
+
+### Testing Packaged Apps
+
+```typescript
+// fixtures/packaged-electron.ts
+import { test as base, _electron as electron } from "@playwright/test";
+import path from "path";
+import { execSync } from "child_process";
+
+export const test = base.extend({
+  electronApp: async ({}, use) => {
+    // Build the app first (or use pre-built)
+    const distPath = path.join(__dirname, "../dist");
+
+    let executablePath: string;
+    if (process.platform === "darwin") {
+      executablePath = path.join(
+        distPath,
+        "mac",
+        "MyApp.app",
+        "Contents",
+        "MacOS",
+        "MyApp",
+      );
+    } else if (process.platform === "win32") {
+      executablePath = path.join(distPath, "win-unpacked", "MyApp.exe");
+    } else {
+      executablePath = path.join(distPath, "linux-unpacked", "myapp");
+    }
+
+    const electronApp = await electron.launch({ executablePath });
+    await use(electronApp);
+    await electronApp.close();
+  },
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                          | Problem                      | Solution                                     |
+| ------------------------------------- | ---------------------------- | -------------------------------------------- |
+| Not closing ElectronApplication       | Resource leaks               | Always call `electronApp.close()` in cleanup |
+| Hardcoded executable paths            | Breaks cross-platform        | Use platform detection                       |
+| Testing packaged app without building | Outdated code                | Build before testing or test dev mode        |
+| Ignoring IPC in tests                 | Missing coverage             | Test IPC communication explicitly            |
+| Not mocking native dialogs            | Tests hang waiting for input | Mock dialog responses                        |
+
+## Related References
+
+- **Fixtures**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for custom fixture patterns
+- **Component Testing**: See [component-testing.md](component-testing.md) for renderer testing patterns
+- **Debugging**: See [debugging.md](../debugging/debugging.md) for troubleshooting
diff --git a/.cursor/skills/playwright-testing/testing-patterns/file-operations.md b/.cursor/skills/playwright-testing/testing-patterns/file-operations.md
new file mode 100644
index 0000000000..367b8b149c
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/file-operations.md
@@ -0,0 +1,377 @@
+# File Upload & Download Testing
+
+> For advanced patterns (progress tracking, cancellation, retry logic), see [file-upload-download.md](./file-upload-download.md)
+
+## Table of Contents
+
+1. [File Downloads](#file-downloads)
+2. [File Uploads](#file-uploads)
+3. [Drag and Drop](#drag-and-drop)
+4. [File Content Verification](#file-content-verification)
+
+## File Downloads
+
+### Basic Download
+
+```typescript
+test("download PDF report", async ({ page }) => {
+  await page.goto("/reports");
+
+  // Start waiting for download before clicking
+  const downloadPromise = page.waitForEvent("download");
+  await page.getByRole("button", { name: "Download PDF" }).click();
+  const download = await downloadPromise;
+
+  // Verify filename
+  expect(download.suggestedFilename()).toBe("report.pdf");
+
+  // Save to specific path
+  await download.saveAs("./downloads/report.pdf");
+});
+```
+
+### Download with Custom Path
+
+```typescript
+test("download to temp directory", async ({ page }, testInfo) => {
+  await page.goto("/exports");
+
+  const downloadPromise = page.waitForEvent("download");
+  await page.getByRole("link", { name: "Export CSV" }).click();
+  const download = await downloadPromise;
+
+  // Save to test output directory
+  const path = testInfo.outputPath(download.suggestedFilename());
+  await download.saveAs(path);
+
+  // Attach to test report
+  await testInfo.attach("downloaded-file", { path });
+});
+```
+
+### Verify Download Content
+
+```typescript
+import fs from "fs";
+import path from "path";
+
+test("verify CSV content", async ({ page }, testInfo) => {
+  await page.goto("/data");
+
+  const downloadPromise = page.waitForEvent("download");
+  await page.getByRole("button", { name: "Export" }).click();
+  const download = await downloadPromise;
+
+  const filePath = testInfo.outputPath("export.csv");
+  await download.saveAs(filePath);
+
+  // Read and verify content
+  const content = fs.readFileSync(filePath, "utf-8");
+  expect(content).toContain("Name,Email,Status");
+  expect(content).toContain("John Doe");
+
+  // Verify row count
+  const rows = content.trim().split("\n");
+  expect(rows.length).toBeGreaterThan(1);
+});
+```
+
+### Multiple Downloads
+
+```typescript
+test("download multiple files", async ({ page }) => {
+  await page.goto("/batch-export");
+
+  await page.getByRole("checkbox", { name: "Select All" }).check();
+
+  // Collect all downloads
+  const downloads: Download[] = [];
+  page.on("download", (download) => downloads.push(download));
+
+  await page.getByRole("button", { name: "Download Selected" }).click();
+
+  // Wait for all downloads
+  await expect.poll(() => downloads.length, { timeout: 30000 }).toBe(5);
+
+  // Verify each download
+  for (const download of downloads) {
+    expect(download.suggestedFilename()).toMatch(/\.pdf$/);
+  }
+});
+```
+
+### Download Fixture
+
+```typescript
+// fixtures/download.fixture.ts
+import { test as base, Download } from "@playwright/test";
+import fs from "fs";
+import path from "path";
+
+type DownloadFixtures = {
+  downloadDir: string;
+  downloadAndVerify: (
+    trigger: () => Promise<void>,
+    expectedFilename: string,
+  ) => Promise<string>;
+};
+
+export const test = base.extend<DownloadFixtures>({
+  downloadDir: async ({}, use, testInfo) => {
+    const dir = testInfo.outputPath("downloads");
+    fs.mkdirSync(dir, { recursive: true });
+    await use(dir);
+  },
+
+  downloadAndVerify: async ({ page, downloadDir }, use) => {
+    await use(async (trigger, expectedFilename) => {
+      const downloadPromise = page.waitForEvent("download");
+      await trigger();
+      const download = await downloadPromise;
+
+      expect(download.suggestedFilename()).toBe(expectedFilename);
+
+      const filePath = path.join(downloadDir, expectedFilename);
+      await download.saveAs(filePath);
+      return filePath;
+    });
+  },
+});
+```
+
+## File Uploads
+
+### Basic Upload
+
+```typescript
+test("upload profile picture", async ({ page }) => {
+  await page.goto("/settings/profile");
+
+  // Upload file
+  await page
+    .getByLabel("Profile Picture")
+    .setInputFiles("./fixtures/avatar.png");
+
+  // Verify preview
+  await expect(page.getByAltText("Profile preview")).toBeVisible();
+
+  await page.getByRole("button", { name: "Save" }).click();
+  await expect(page.getByText("Profile updated")).toBeVisible();
+});
+```
+
+### Multiple File Upload
+
+```typescript
+test("upload multiple documents", async ({ page }) => {
+  await page.goto("/documents/upload");
+
+  await page
+    .getByLabel("Documents")
+    .setInputFiles([
+      "./fixtures/doc1.pdf",
+      "./fixtures/doc2.pdf",
+      "./fixtures/doc3.pdf",
+    ]);
+
+  // Verify all files listed
+  await expect(page.getByText("doc1.pdf")).toBeVisible();
+  await expect(page.getByText("doc2.pdf")).toBeVisible();
+  await expect(page.getByText("doc3.pdf")).toBeVisible();
+
+  await page.getByRole("button", { name: "Upload All" }).click();
+  await expect(page.getByText("3 files uploaded")).toBeVisible();
+});
+```
+
+### Upload with File Chooser
+
+```typescript
+test("upload via file chooser dialog", async ({ page }) => {
+  await page.goto("/upload");
+
+  // Handle file chooser
+  const fileChooserPromise = page.waitForEvent("filechooser");
+  await page.getByRole("button", { name: "Choose File" }).click();
+  const fileChooser = await fileChooserPromise;
+
+  await fileChooser.setFiles("./fixtures/document.pdf");
+
+  await expect(page.getByText("document.pdf")).toBeVisible();
+});
+```
+
+### Clear and Re-upload
+
+```typescript
+test("replace uploaded file", async ({ page }) => {
+  await page.goto("/upload");
+
+  const input = page.getByLabel("Document");
+
+  // Upload first file
+  await input.setInputFiles("./fixtures/old.pdf");
+  await expect(page.getByText("old.pdf")).toBeVisible();
+
+  // Clear selection
+  await input.setInputFiles([]);
+
+  // Upload new file
+  await input.setInputFiles("./fixtures/new.pdf");
+  await expect(page.getByText("new.pdf")).toBeVisible();
+  await expect(page.getByText("old.pdf")).toBeHidden();
+});
+```
+
+### Upload from Buffer
+
+```typescript
+test("upload generated file", async ({ page }) => {
+  await page.goto("/upload");
+
+  // Create file content dynamically
+  const content = "Name,Email\nJohn,john@example.com";
+
+  await page.getByLabel("CSV File").setInputFiles({
+    name: "users.csv",
+    mimeType: "text/csv",
+    buffer: Buffer.from(content),
+  });
+
+  await expect(page.getByText("users.csv")).toBeVisible();
+});
+```
+
+## Drag and Drop
+
+### Drag and Drop Upload
+
+```typescript
+test("drag and drop file upload", async ({ page }) => {
+  await page.goto("/upload");
+
+  const dropzone = page.getByTestId("dropzone");
+
+  // Create a DataTransfer with the file
+  const dataTransfer = await page.evaluateHandle(() => new DataTransfer());
+
+  // Read file and add to DataTransfer
+  const buffer = fs.readFileSync("./fixtures/image.png");
+  await page.evaluate(
+    async ([dataTransfer, data]) => {
+      const file = new File([new Uint8Array(data)], "image.png", {
+        type: "image/png",
+      });
+      dataTransfer.items.add(file);
+    },
+    [dataTransfer, [...buffer]] as const,
+  );
+
+  // Dispatch drop event
+  await dropzone.dispatchEvent("drop", { dataTransfer });
+
+  await expect(page.getByText("image.png uploaded")).toBeVisible();
+});
+```
+
+### Simpler Drag and Drop
+
+```typescript
+test("drag and drop with setInputFiles", async ({ page }) => {
+  await page.goto("/upload");
+
+  // Most dropzones have a hidden file input
+  const input = page.locator('input[type="file"]');
+
+  // This works even if the input is hidden
+  await input.setInputFiles("./fixtures/document.pdf");
+
+  await expect(page.getByText("document.pdf")).toBeVisible();
+});
+```
+
+## File Content Verification
+
+### Verify PDF Content
+
+```typescript
+import pdf from "pdf-parse";
+
+test("verify PDF content", async ({ page }, testInfo) => {
+  await page.goto("/invoice/123");
+
+  const downloadPromise = page.waitForEvent("download");
+  await page.getByRole("button", { name: "Download Invoice" }).click();
+  const download = await downloadPromise;
+
+  const path = testInfo.outputPath("invoice.pdf");
+  await download.saveAs(path);
+
+  // Parse PDF
+  const dataBuffer = fs.readFileSync(path);
+  const data = await pdf(dataBuffer);
+
+  expect(data.text).toContain("Invoice #123");
+  expect(data.text).toContain("Total: $99.99");
+});
+```
+
+### Verify Excel Content
+
+```typescript
+import XLSX from "xlsx";
+
+test("verify Excel export", async ({ page }, testInfo) => {
+  await page.goto("/reports");
+
+  const downloadPromise = page.waitForEvent("download");
+  await page.getByRole("button", { name: "Export Excel" }).click();
+  const download = await downloadPromise;
+
+  const path = testInfo.outputPath("report.xlsx");
+  await download.saveAs(path);
+
+  // Parse Excel
+  const workbook = XLSX.readFile(path);
+  const sheet = workbook.Sheets[workbook.SheetNames[0]];
+  const data = XLSX.utils.sheet_to_json(sheet);
+
+  expect(data).toHaveLength(10);
+  expect(data[0]).toHaveProperty("Name");
+  expect(data[0]).toHaveProperty("Email");
+});
+```
+
+### Verify JSON Download
+
+```typescript
+test("verify JSON export", async ({ page }, testInfo) => {
+  await page.goto("/api-data");
+
+  const downloadPromise = page.waitForEvent("download");
+  await page.getByRole("button", { name: "Export JSON" }).click();
+  const download = await downloadPromise;
+
+  const path = testInfo.outputPath("data.json");
+  await download.saveAs(path);
+
+  const content = JSON.parse(fs.readFileSync(path, "utf-8"));
+
+  expect(content.users).toHaveLength(5);
+  expect(content.exportDate).toBeDefined();
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                          | Problem                         | Solution                                      |
+| ------------------------------------- | ------------------------------- | --------------------------------------------- |
+| Not waiting for download              | Race condition, test fails      | Always use `waitForEvent("download")`         |
+| Hardcoded download paths              | Conflicts in parallel runs      | Use `testInfo.outputPath()`                   |
+| Skipping content verification         | Download might be empty/corrupt | Verify file content when possible             |
+| Using `force: true` for hidden inputs | May not trigger proper events   | Use `setInputFiles` on hidden inputs directly |
+
+## Related References
+
+- **Fixtures**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for download fixture patterns
+- **Debugging**: See [debugging.md](../debugging/debugging.md) for troubleshooting download issues
diff --git a/.cursor/skills/playwright-testing/testing-patterns/file-upload-download.md b/.cursor/skills/playwright-testing/testing-patterns/file-upload-download.md
new file mode 100644
index 0000000000..75a43482b1
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/file-upload-download.md
@@ -0,0 +1,562 @@
+# File Upload and Download Testing
+
+> **When to use**: Testing file uploads (single, multiple, drag-and-drop), downloads (content verification, filename, type), upload progress indicators, or file type/size restrictions.
+
+## Table of Contents
+
+1. [Downloading Files](#downloading-files)
+2. [Single File Upload](#single-file-upload)
+3. [Multiple File Upload](#multiple-file-upload)
+4. [Drag-and-Drop Zones](#drag-and-drop-zones)
+5. [File Chooser Dialog](#file-chooser-dialog)
+6. [Upload Progress and Cancellation](#upload-progress-and-cancellation)
+7. [Retry After Failure](#retry-after-failure)
+8. [File Type and Size Restrictions](#file-type-and-size-restrictions)
+9. [Image Preview](#image-preview)
+10. [Authenticated Downloads](#authenticated-downloads)
+11. [Tips](#tips)
+
+---
+
+## Downloading Files
+
+### Capturing Downloads and Verifying Content
+
+```typescript
+import { test, expect } from '@playwright/test';
+import fs from 'fs';
+import path from 'path';
+
+test('verifies downloaded CSV content', async ({ page }) => {
+  await page.goto('/exports');
+
+  // Set up download listener BEFORE triggering the download
+  const downloadPromise = page.waitForEvent('download');
+  await page.getByRole('link', { name: 'transactions.csv' }).click();
+
+  const download = await downloadPromise;
+  const savePath = path.join(__dirname, '../tmp', download.suggestedFilename());
+  await download.saveAs(savePath);
+
+  const content = fs.readFileSync(savePath, 'utf-8');
+  expect(content).toContain('id,amount,date');
+  expect(content).toContain('1001,250.00,2025-01-15');
+
+  const rows = content.trim().split('\n');
+  expect(rows.length).toBeGreaterThan(1);
+
+  fs.unlinkSync(savePath);
+});
+
+test('reads download via stream without disk I/O', async ({ page }) => {
+  await page.goto('/exports');
+
+  const downloadPromise = page.waitForEvent('download');
+  await page.getByRole('link', { name: 'transactions.csv' }).click();
+
+  const download = await downloadPromise;
+  const readable = await download.createReadStream();
+  const chunks: Buffer[] = [];
+
+  for await (const chunk of readable!) {
+    chunks.push(Buffer.from(chunk));
+  }
+
+  const content = Buffer.concat(chunks).toString('utf-8');
+  expect(content).toContain('id,amount,date');
+});
+```
+
+### Verifying Filename and Format
+
+```typescript
+test('export filename matches selected format', async ({ page }) => {
+  await page.goto('/analytics');
+
+  const downloadPromise = page.waitForEvent('download');
+  await page.getByRole('button', { name: 'Export PDF' }).click();
+
+  const download = await downloadPromise;
+  expect(download.suggestedFilename()).toMatch(/^analytics-\d{4}-\d{2}-\d{2}\.pdf$/);
+});
+
+test('format selector changes output extension', async ({ page }) => {
+  await page.goto('/analytics');
+
+  await page.getByLabel('Format').selectOption('csv');
+  const csvDownload = page.waitForEvent('download');
+  await page.getByRole('button', { name: 'Download' }).click();
+  expect((await csvDownload).suggestedFilename()).toMatch(/\.csv$/);
+
+  await page.getByLabel('Format').selectOption('xlsx');
+  const xlsxDownload = page.waitForEvent('download');
+  await page.getByRole('button', { name: 'Download' }).click();
+  expect((await xlsxDownload).suggestedFilename()).toMatch(/\.xlsx$/);
+});
+```
+
+### Checking Response Headers
+
+```typescript
+test('download response has correct MIME type', async ({ page }) => {
+  await page.goto('/analytics');
+
+  const responsePromise = page.waitForResponse('**/api/analytics/export**');
+  const downloadPromise = page.waitForEvent('download');
+
+  await page.getByRole('button', { name: 'Export PDF' }).click();
+
+  const response = await responsePromise;
+  expect(response.headers()['content-type']).toContain('application/pdf');
+  expect(response.headers()['content-disposition']).toContain('attachment');
+
+  await downloadPromise;
+});
+```
+
+### Handling Download Failures
+
+```typescript
+test('shows error when download fails', async ({ page }) => {
+  await page.route('**/api/analytics/export**', async (route) => {
+    await route.fulfill({
+      status: 500,
+      contentType: 'application/json',
+      body: JSON.stringify({ error: 'Generation failed' }),
+    });
+  });
+
+  await page.goto('/analytics');
+  await page.getByRole('button', { name: 'Export PDF' }).click();
+
+  await expect(page.getByRole('alert')).toContainText(/failed|error/i);
+});
+```
+
+---
+
+## Single File Upload
+
+### From Fixture File
+
+```typescript
+import path from 'path';
+
+test('uploads document from fixture', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles(path.resolve(__dirname, '../fixtures/invoice.pdf'));
+
+  await expect(page.getByText('invoice.pdf')).toBeVisible();
+
+  await page.getByRole('button', { name: 'Upload' }).click();
+  await expect(page.getByRole('alert')).toContainText('uploaded successfully');
+  await expect(page.getByRole('link', { name: 'invoice.pdf' })).toBeVisible();
+});
+```
+
+### From In-Memory Buffer
+
+```typescript
+test('uploads in-memory CSV', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles({
+    name: 'contacts.csv',
+    mimeType: 'text/csv',
+    buffer: Buffer.from('name,email\nAlice,alice@acme.com\nBob,bob@acme.com'),
+  });
+
+  await expect(page.getByText('contacts.csv')).toBeVisible();
+  await page.getByRole('button', { name: 'Upload' }).click();
+  await expect(page.getByRole('alert')).toContainText('uploaded successfully');
+});
+```
+
+### Clearing Selection
+
+```typescript
+test('clears selected file', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles({
+    name: 'draft.txt',
+    mimeType: 'text/plain',
+    buffer: Buffer.from('draft content'),
+  });
+
+  await expect(page.getByText('draft.txt')).toBeVisible();
+
+  // Clear via API
+  await fileInput.setInputFiles([]);
+  await expect(page.getByText('draft.txt')).not.toBeVisible();
+});
+```
+
+---
+
+## Multiple File Upload
+
+```typescript
+test('uploads multiple files at once', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles([
+    { name: 'doc1.pdf', mimeType: 'application/pdf', buffer: Buffer.from('pdf1') },
+    { name: 'doc2.pdf', mimeType: 'application/pdf', buffer: Buffer.from('pdf2') },
+    { name: 'doc3.pdf', mimeType: 'application/pdf', buffer: Buffer.from('pdf3') },
+  ]);
+
+  await expect(page.getByText('doc1.pdf')).toBeVisible();
+  await expect(page.getByText('doc2.pdf')).toBeVisible();
+  await expect(page.getByText('doc3.pdf')).toBeVisible();
+  await expect(page.getByText('3 files selected')).toBeVisible();
+
+  await page.getByRole('button', { name: 'Upload all' }).click();
+  await expect(page.getByRole('alert')).toContainText('3 files uploaded');
+});
+
+test('removes one file from selection', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles([
+    { name: 'keep.txt', mimeType: 'text/plain', buffer: Buffer.from('keep') },
+    { name: 'discard.txt', mimeType: 'text/plain', buffer: Buffer.from('discard') },
+  ]);
+
+  const discardRow = page.getByText('discard.txt').locator('..');
+  await discardRow.getByRole('button', { name: /remove|delete|×/i }).click();
+
+  await expect(page.getByText('discard.txt')).not.toBeVisible();
+  await expect(page.getByText('keep.txt')).toBeVisible();
+});
+```
+
+---
+
+## Drag-and-Drop Zones
+
+Drop zones always have an underlying `input[type="file"]`—target it directly instead of simulating OS-level drag events.
+
+```typescript
+test('uploads via drop zone', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const dropZone = page.locator('[data-testid="drop-zone"]');
+  await expect(dropZone).toContainText(/drag.*here|drop.*files/i);
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles({
+    name: 'dropped.pdf',
+    mimeType: 'application/pdf',
+    buffer: Buffer.from('pdf-content'),
+  });
+
+  await expect(dropZone.getByText('dropped.pdf')).toBeVisible();
+  await page.getByRole('button', { name: 'Upload' }).click();
+  await expect(page.getByRole('alert')).toContainText('uploaded successfully');
+});
+
+test('shows visual feedback on drag-over', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const dropZone = page.locator('[data-testid="drop-zone"]');
+
+  await dropZone.dispatchEvent('dragenter', {
+    dataTransfer: { types: ['Files'], files: [] },
+  });
+
+  await expect(dropZone).toHaveClass(/active|highlight|drag-over/);
+  await expect(dropZone).toContainText(/release|drop now/i);
+
+  await dropZone.dispatchEvent('dragleave');
+  await expect(dropZone).not.toHaveClass(/active|highlight|drag-over/);
+});
+```
+
+---
+
+## File Chooser Dialog
+
+```typescript
+test('uploads via native file chooser', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileChooserPromise = page.waitForEvent('filechooser');
+  await page.getByRole('button', { name: 'Choose file' }).click();
+
+  const fileChooser = await fileChooserPromise;
+  expect(fileChooser.isMultiple()).toBe(false);
+
+  await fileChooser.setFiles({
+    name: 'selected.pdf',
+    mimeType: 'application/pdf',
+    buffer: Buffer.from('pdf-content'),
+  });
+
+  await expect(page.getByText('selected.pdf')).toBeVisible();
+});
+```
+
+---
+
+## Upload Progress and Cancellation
+
+```typescript
+test('displays upload progress for large file', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  const largeBuffer = Buffer.alloc(5 * 1024 * 1024, 'x');
+
+  await fileInput.setInputFiles({
+    name: 'dataset.bin',
+    mimeType: 'application/octet-stream',
+    buffer: largeBuffer,
+  });
+
+  await page.getByRole('button', { name: 'Upload' }).click();
+
+  const progressBar = page.getByRole('progressbar');
+  await expect(progressBar).toBeVisible();
+
+  await expect(async () => {
+    const value = await progressBar.getAttribute('aria-valuenow');
+    expect(Number(value)).toBeGreaterThan(0);
+  }).toPass({ timeout: 10000 });
+
+  await expect(progressBar).not.toBeVisible({ timeout: 60000 });
+  await expect(page.getByRole('alert')).toContainText('uploaded successfully');
+});
+
+test('cancels in-progress upload', async ({ page }) => {
+  await page.route('**/api/attachments/upload', async (route) => {
+    await new Promise((resolve) => setTimeout(resolve, 10000));
+    await route.continue();
+  });
+
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles({
+    name: 'large.bin',
+    mimeType: 'application/octet-stream',
+    buffer: Buffer.alloc(5 * 1024 * 1024, 'x'),
+  });
+
+  await page.getByRole('button', { name: 'Upload' }).click();
+  await expect(page.getByRole('progressbar')).toBeVisible();
+
+  await page.getByRole('button', { name: 'Cancel upload' }).click();
+
+  await expect(page.getByRole('progressbar')).not.toBeVisible();
+  await expect(page.getByText(/cancelled|aborted/i)).toBeVisible();
+  await expect(page.getByRole('link', { name: 'large.bin' })).not.toBeVisible();
+});
+```
+
+---
+
+## Retry After Failure
+
+```typescript
+test('retries failed upload', async ({ page }) => {
+  let attempt = 0;
+
+  await page.route('**/api/attachments/upload', async (route) => {
+    attempt++;
+    if (attempt === 1) {
+      await route.fulfill({
+        status: 500,
+        contentType: 'application/json',
+        body: JSON.stringify({ error: 'Server error' }),
+      });
+    } else {
+      await route.fulfill({
+        status: 200,
+        contentType: 'application/json',
+        body: JSON.stringify({ id: 'abc', name: 'data.csv' }),
+      });
+    }
+  });
+
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles({
+    name: 'data.csv',
+    mimeType: 'text/csv',
+    buffer: Buffer.from('col1,col2\nval1,val2'),
+  });
+
+  await page.getByRole('button', { name: 'Upload' }).click();
+  await expect(page.getByText(/upload failed|error/i)).toBeVisible();
+
+  await page.getByRole('button', { name: /retry/i }).click();
+  await expect(page.getByRole('alert')).toContainText('uploaded successfully');
+  expect(attempt).toBe(2);
+});
+```
+
+---
+
+## File Type and Size Restrictions
+
+### Validating Allowed Types
+
+```typescript
+test('accepts allowed file types', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  await expect(fileInput).toHaveAttribute('accept', /\.pdf|\.doc|\.docx|\.txt/);
+
+  await fileInput.setInputFiles({
+    name: 'report.pdf',
+    mimeType: 'application/pdf',
+    buffer: Buffer.from('pdf-content'),
+  });
+
+  await expect(page.getByText('report.pdf')).toBeVisible();
+  await expect(page.getByText(/not allowed|invalid/i)).not.toBeVisible();
+});
+
+test('rejects disallowed file types', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  // setInputFiles bypasses the accept attribute—tests JavaScript validation
+  await fileInput.setInputFiles({
+    name: 'malware.exe',
+    mimeType: 'application/x-msdownload',
+    buffer: Buffer.from('exe-content'),
+  });
+
+  await expect(page.getByRole('alert')).toContainText(
+    /not allowed|unsupported file type|only .pdf, .doc/i
+  );
+  await expect(page.getByText('malware.exe')).not.toBeVisible();
+});
+```
+
+### Enforcing Size Limits
+
+```typescript
+test('rejects oversized file', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  const oversizedBuffer = Buffer.alloc(11 * 1024 * 1024, 'x');
+
+  await fileInput.setInputFiles({
+    name: 'huge.pdf',
+    mimeType: 'application/pdf',
+    buffer: oversizedBuffer,
+  });
+
+  await expect(page.getByRole('alert')).toContainText(/file.*too large|exceeds.*10 ?MB/i);
+  await expect(page.getByText('huge.pdf')).not.toBeVisible();
+});
+```
+
+### Enforcing File Count Limits
+
+```typescript
+test('rejects too many files', async ({ page }) => {
+  await page.goto('/attachments');
+
+  const fileInput = page.locator('input[type="file"]');
+  const files = Array.from({ length: 6 }, (_, i) => ({
+    name: `file-${i + 1}.txt`,
+    mimeType: 'text/plain' as const,
+    buffer: Buffer.from(`content ${i + 1}`),
+  }));
+
+  await fileInput.setInputFiles(files);
+
+  await expect(page.getByRole('alert')).toContainText(/maximum.*5 files|too many files/i);
+});
+```
+
+### Validating Image Dimensions
+
+```typescript
+test('rejects image below minimum dimensions', async ({ page }) => {
+  await page.goto('/profile/avatar');
+
+  const fileInput = page.locator('input[type="file"]');
+  // Minimal 1x1 PNG
+  const tinyPng = Buffer.from(
+    'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
+    'base64'
+  );
+
+  await fileInput.setInputFiles({
+    name: 'tiny.png',
+    mimeType: 'image/png',
+    buffer: tinyPng,
+  });
+
+  await expect(page.getByRole('alert')).toContainText(/minimum.*dimensions|too small/i);
+});
+```
+
+---
+
+## Image Preview
+
+```typescript
+test('shows image preview after selection', async ({ page }) => {
+  await page.goto('/profile/avatar');
+
+  const fileInput = page.locator('input[type="file"]');
+  await fileInput.setInputFiles(path.resolve(__dirname, '../fixtures/photo.jpg'));
+
+  const preview = page.getByRole('img', { name: /preview|avatar/i });
+  await expect(preview).toBeVisible();
+
+  const src = await preview.getAttribute('src');
+  expect(src).toMatch(/^(blob:|data:image)/);
+});
+```
+
+---
+
+## Authenticated Downloads
+
+```typescript
+test('downloads file requiring authentication', async ({ page, request }) => {
+  await page.goto('/attachments');
+
+  // Browser download works because cookies are sent
+  const downloadPromise = page.waitForEvent('download');
+  await page.getByRole('link', { name: 'confidential.pdf' }).click();
+
+  const download = await downloadPromise;
+  expect(download.suggestedFilename()).toBe('confidential.pdf');
+
+  // Verify via API request (carries auth context)
+  const response = await request.get('/api/attachments/456/download');
+  expect(response.ok()).toBeTruthy();
+  expect(response.headers()['content-type']).toContain('application/pdf');
+});
+```
+
+---
+
+## Tips
+
+1. **Use `setInputFiles` for uploads**. Even drag-and-drop zones have an underlying `input[type="file"]`. Target it directly instead of simulating OS-level drag events.
+
+2. **Prefer in-memory buffers**. Creating files with `Buffer.from()` keeps tests self-contained. Use fixture files only when you need real content (e.g., a valid PDF your app parses).
+
+3. **Set up download listener before clicking**. Call `page.waitForEvent('download')` before the click that triggers the download—otherwise you may miss the event.
+
+4. **Use `createReadStream()` for content verification**. Reading directly from the stream avoids disk I/O and cleanup of temporary files.
+
+5. **Test both `accept` attribute and JavaScript validation**. The HTML `accept` attribute only filters the OS file dialog. `setInputFiles()` bypasses it, which is exactly what you need to test your app's JavaScript validation.
diff --git a/.cursor/skills/playwright-testing/testing-patterns/forms-validation.md b/.cursor/skills/playwright-testing/testing-patterns/forms-validation.md
new file mode 100644
index 0000000000..2cfdd8cdcd
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/forms-validation.md
@@ -0,0 +1,561 @@
+# Form Testing Patterns
+
+## Table of Contents
+
+1. [Quick Reference](#quick-reference)
+2. [Patterns](#patterns)
+3. [Decision Guide](#decision-guide)
+4. [Anti-Patterns](#anti-patterns)
+5. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Testing form filling, submission, validation messages, multi-step wizards, dynamic fields, and auto-complete interactions.
+
+## Quick Reference
+
+```typescript
+// Text input
+await page.getByLabel("Username").fill("john_doe");
+
+// Select dropdown
+await page.getByLabel("Region").selectOption("EU");
+await page.getByLabel("Region").selectOption({ label: "Europe" });
+
+// Checkbox and radio
+await page.getByLabel("Subscribe").check();
+await page.getByLabel("Priority shipping").click();
+
+// Date input
+await page.getByLabel("Departure").fill("2025-08-20");
+
+// Clear a field
+await page.getByLabel("Username").clear();
+
+// Submit
+await page.getByRole("button", { name: "Register" }).click();
+
+// Verify validation error
+await expect(page.getByText("Username is required")).toBeVisible();
+```
+
+## Patterns
+
+### Auto-Complete and Typeahead Fields
+
+**Use when**: Testing search fields, address lookups, mention pickers, or any input that shows suggestions as the user types.
+
+```typescript
+test("select from typeahead suggestions", async ({ page }) => {
+  await page.goto("/products");
+
+  const searchBox = page.getByRole("combobox", { name: "Find product" });
+  await searchBox.pressSequentially("lapt", { delay: 100 });
+
+  const suggestionList = page.getByRole("listbox");
+  await expect(suggestionList).toBeVisible();
+
+  await suggestionList.getByRole("option", { name: "Laptop Pro" }).click();
+  await expect(searchBox).toHaveValue("Laptop Pro");
+});
+
+test("typeahead with API-driven suggestions", async ({ page }) => {
+  await page.goto("/shipping");
+
+  const streetField = page.getByLabel("Street");
+  const responsePromise = page.waitForResponse("**/api/address-lookup*");
+  await streetField.pressSequentially("456 Elm", { delay: 50 });
+
+  await responsePromise;
+
+  await page.getByRole("option", { name: /456 Elm St/ }).click();
+
+  await expect(page.getByLabel("Town")).toHaveValue("Austin");
+  await expect(page.getByLabel("State")).toHaveValue("TX");
+  await expect(page.getByLabel("Postal code")).toHaveValue("78701");
+});
+
+test("dismiss suggestions and enter custom value", async ({ page }) => {
+  await page.goto("/labels");
+
+  const labelInput = page.getByLabel("New label");
+  await labelInput.pressSequentially("my-label");
+
+  await labelInput.press("Escape");
+  await expect(page.getByRole("listbox")).not.toBeVisible();
+
+  await labelInput.press("Enter");
+  await expect(page.getByText("my-label")).toBeVisible();
+});
+```
+
+### Dynamic Forms — Conditional Fields
+
+**Use when**: Form fields appear, disappear, or change based on the value of other fields.
+
+```typescript
+test("conditional fields appear based on selection", async ({ page }) => {
+  await page.goto("/loan/apply");
+
+  await page.getByLabel("Applicant type").selectOption("corporate");
+
+  await expect(page.getByLabel("Business name")).toBeVisible();
+  await expect(page.getByLabel("EIN")).toBeVisible();
+
+  await page.getByLabel("Business name").fill("TechCorp Inc");
+  await page.getByLabel("EIN").fill("98-7654321");
+
+  await page.getByLabel("Applicant type").selectOption("individual");
+  await expect(page.getByLabel("Business name")).not.toBeVisible();
+  await expect(page.getByLabel("EIN")).not.toBeVisible();
+});
+
+test("checkbox toggles additional section", async ({ page }) => {
+  await page.goto("/delivery");
+
+  await page.getByLabel("Separate invoice address").check();
+
+  const invoiceSection = page.getByRole("group", { name: "Invoice address" });
+  await expect(invoiceSection).toBeVisible();
+
+  await invoiceSection.getByLabel("Address").fill("789 Pine Rd");
+  await invoiceSection.getByLabel("City").fill("Denver");
+
+  await page.getByLabel("Separate invoice address").uncheck();
+  await expect(invoiceSection).not.toBeVisible();
+});
+
+test("dependent dropdown chains", async ({ page }) => {
+  await page.goto("/region-selector");
+
+  await page.getByLabel("Country").selectOption("CA");
+
+  const provinceDropdown = page.getByLabel("Province");
+  await expect(provinceDropdown.getByRole("option")).not.toHaveCount(0);
+
+  await provinceDropdown.selectOption("ON");
+
+  const cityDropdown = page.getByLabel("City");
+  await expect(cityDropdown.getByRole("option")).not.toHaveCount(0);
+
+  await cityDropdown.selectOption({ label: "Toronto" });
+});
+```
+
+### Multi-Step Forms and Wizards
+
+**Use when**: The form spans multiple pages or steps, with next/previous navigation and per-step validation.
+
+```typescript
+test("complete a multi-step booking wizard", async ({ page }) => {
+  await page.goto("/booking");
+
+  await test.step("enter guest information", async () => {
+    await expect(
+      page.getByRole("heading", { name: "Guest Info" }),
+    ).toBeVisible();
+
+    await page.getByLabel("Full name").fill("Alice Smith");
+    await page.getByLabel("Email").fill("alice@test.com");
+    await page.getByLabel("Phone").fill("555-1234");
+
+    await page.getByRole("button", { name: "Next" }).click();
+  });
+
+  await test.step("select room options", async () => {
+    await expect(
+      page.getByRole("heading", { name: "Room Selection" }),
+    ).toBeVisible();
+
+    await page.getByLabel("Room type").selectOption("suite");
+    await page.getByLabel("Check-in").fill("2025-09-01");
+    await page.getByLabel("Check-out").fill("2025-09-05");
+
+    await page.getByRole("button", { name: "Next" }).click();
+  });
+
+  await test.step("confirm booking", async () => {
+    await expect(
+      page.getByRole("heading", { name: "Confirmation" }),
+    ).toBeVisible();
+
+    await expect(page.getByText("Alice Smith")).toBeVisible();
+    await expect(page.getByText("suite")).toBeVisible();
+
+    await page.getByRole("button", { name: "Confirm booking" }).click();
+  });
+
+  await expect(
+    page.getByRole("heading", { name: "Booking complete" }),
+  ).toBeVisible();
+});
+
+test("wizard validates each step before proceeding", async ({ page }) => {
+  await page.goto("/booking");
+
+  await page.getByRole("button", { name: "Next" }).click();
+
+  await expect(page.getByRole("heading", { name: "Guest Info" })).toBeVisible();
+  await expect(page.getByText("Full name is required")).toBeVisible();
+});
+
+test("wizard supports going back without losing data", async ({ page }) => {
+  await page.goto("/booking");
+
+  await page.getByLabel("Full name").fill("Alice Smith");
+  await page.getByLabel("Email").fill("alice@test.com");
+  await page.getByLabel("Phone").fill("555-1234");
+  await page.getByRole("button", { name: "Next" }).click();
+
+  await page.getByRole("button", { name: "Previous" }).click();
+
+  await expect(page.getByLabel("Full name")).toHaveValue("Alice Smith");
+  await expect(page.getByLabel("Email")).toHaveValue("alice@test.com");
+});
+```
+
+### Form Submission and Response Handling
+
+**Use when**: Testing what happens after a form is submitted — success messages, redirects, error responses from the server, and loading states during submission.
+
+```typescript
+test("successful form submission shows confirmation", async ({ page }) => {
+  await page.goto("/feedback");
+
+  await page.getByLabel("Subject").fill("Feature request");
+  await page.getByLabel("Email").fill("user@test.com");
+  await page.getByLabel("Details").fill("Please add dark mode");
+
+  const responsePromise = page.waitForResponse("**/api/feedback");
+  await page.getByRole("button", { name: "Submit feedback" }).click();
+  const response = await responsePromise;
+
+  expect(response.status()).toBe(200);
+  await expect(page.getByText("Feedback received")).toBeVisible();
+});
+
+test("form submission shows server-side validation errors", async ({
+  page,
+}) => {
+  await page.goto("/signup");
+
+  await page.getByLabel("Email").fill("existing@test.com");
+  await page.getByLabel("Password", { exact: true }).fill("Secure1@pass");
+  await page.getByRole("button", { name: "Sign up" }).click();
+
+  await expect(
+    page.getByText("Email address already registered"),
+  ).toBeVisible();
+});
+
+test("form shows loading state during submission", async ({ page }) => {
+  await page.goto("/feedback");
+
+  await page.getByLabel("Subject").fill("Bug report");
+  await page.getByLabel("Email").fill("user@test.com");
+  await page.getByLabel("Details").fill("Found an issue");
+
+  const submit = page.getByRole("button", {
+    name: /Submit feedback|Submitting/,
+  });
+  await submit.click();
+
+  await expect(submit).toHaveText(/Submitting/);
+  await expect(submit).toBeDisabled();
+
+  await expect(submit).toHaveText("Submit feedback");
+  await expect(submit).toBeEnabled();
+});
+
+test("form redirects after successful submission", async ({ page }) => {
+  await page.goto("/auth/login");
+
+  await page.getByLabel("Email").fill("admin@test.com");
+  await page.getByLabel("Password").fill("admin123");
+  await page.getByRole("button", { name: "Log in" }).click();
+
+  await page.waitForURL("/home");
+  await expect(page.getByRole("heading", { name: "Welcome" })).toBeVisible();
+});
+```
+
+### Filling Basic Form Fields
+
+**Use when**: Testing any form with standard HTML inputs — text, email, password, number, textarea, select, checkbox, radio.
+
+```typescript
+test("fill and submit a signup form", async ({ page }) => {
+  await page.goto("/signup");
+
+  await page.getByLabel("First name").fill("Bob");
+  await page.getByLabel("Last name").fill("Wilson");
+  await page.getByLabel("Email").fill("bob@test.com");
+  await page.getByLabel("Password", { exact: true }).fill("P@ssw0rd!");
+  await page.getByLabel("Confirm password").fill("P@ssw0rd!");
+
+  await page.getByLabel("About you").fill("Developer with 5 years experience.");
+  await page.getByLabel("Years of experience").fill("5");
+
+  await page.getByLabel("Country").selectOption("UK");
+  await page.getByLabel("City").selectOption({ label: "London" });
+  await page
+    .getByLabel("Skills")
+    .selectOption(["typescript", "playwright", "nodejs"]);
+
+  await page.getByLabel("Accept terms").check();
+  await expect(page.getByLabel("Accept terms")).toBeChecked();
+
+  await page.getByLabel("Annual billing").check();
+  await expect(page.getByLabel("Annual billing")).toBeChecked();
+
+  await page.getByRole("button", { name: "Create account" }).click();
+  await expect(page.getByRole("heading", { name: "Welcome" })).toBeVisible();
+});
+```
+
+### Date and Time Inputs
+
+**Use when**: Testing native `<input type="date">`, `<input type="time">`, `<input type="datetime-local">`, or third-party date pickers.
+
+```typescript
+test("fill native date and time inputs", async ({ page }) => {
+  await page.goto("/reservation");
+
+  await page.getByLabel("Reservation date").fill("2025-07-10");
+  await expect(page.getByLabel("Reservation date")).toHaveValue("2025-07-10");
+
+  await page.getByLabel("Time slot").fill("18:00");
+  await page.getByLabel("Reminder").fill("2025-07-10T17:30");
+});
+
+test("interact with a third-party date picker", async ({ page }) => {
+  await page.goto("/reservation");
+
+  await page.getByLabel("Event date").click();
+  await page.getByRole("button", { name: "Next month" }).click();
+  await page.getByRole("gridcell", { name: "25" }).click();
+
+  await expect(page.getByLabel("Event date")).toHaveValue(/2025/);
+});
+```
+
+### Required Field Validation
+
+**Use when**: Testing that the form shows appropriate error messages when required fields are empty.
+
+```typescript
+test("shows validation errors for empty required fields", async ({ page }) => {
+  await page.goto("/inquiry");
+
+  await page.getByRole("button", { name: "Send inquiry" }).click();
+
+  await expect(page.getByText("Name is required")).toBeVisible();
+  await expect(page.getByText("Email is required")).toBeVisible();
+  await expect(page.getByText("Question is required")).toBeVisible();
+
+  await expect(page).toHaveURL(/\/inquiry/);
+});
+
+test("clears validation errors when fields are filled", async ({ page }) => {
+  await page.goto("/inquiry");
+
+  await page.getByRole("button", { name: "Send inquiry" }).click();
+  await expect(page.getByText("Name is required")).toBeVisible();
+
+  await page.getByLabel("Name").fill("Carol Brown");
+  await page.getByLabel("Email").focus();
+
+  await expect(page.getByText("Name is required")).not.toBeVisible();
+});
+
+test("native HTML5 validation with required attribute", async ({ page }) => {
+  await page.goto("/basic-form");
+
+  await page.getByRole("button", { name: "Submit" }).click();
+
+  const emailInput = page.getByLabel("Email");
+  const validationMessage = await emailInput.evaluate(
+    (el: HTMLInputElement) => el.validationMessage,
+  );
+  expect(validationMessage).toBeTruthy();
+});
+```
+
+### Format Validation and Custom Rules
+
+**Use when**: Testing email format, phone number format, password strength, and business-specific validation rules.
+
+```typescript
+test("validates email format", async ({ page }) => {
+  await page.goto("/signup");
+
+  const emailField = page.getByLabel("Email");
+
+  const invalidEmails = [
+    "invalid",
+    "missing@",
+    "@nodomain.com",
+    "has spaces@mail.com",
+  ];
+
+  for (const email of invalidEmails) {
+    await emailField.fill(email);
+    await emailField.blur();
+    await expect(page.getByText("Enter a valid email address")).toBeVisible();
+  }
+
+  await emailField.fill("correct@domain.com");
+  await emailField.blur();
+  await expect(page.getByText("Enter a valid email address")).not.toBeVisible();
+});
+
+test("validates password strength rules", async ({ page }) => {
+  await page.goto("/signup");
+
+  const passwordField = page.getByLabel("Password", { exact: true });
+
+  await passwordField.fill("Xy1!");
+  await passwordField.blur();
+  await expect(page.getByText("Minimum 8 characters")).toBeVisible();
+
+  await passwordField.fill("lowercase1!");
+  await passwordField.blur();
+  await expect(page.getByText("Include an uppercase letter")).toBeVisible();
+
+  await passwordField.fill("SecureP@ss1");
+  await passwordField.blur();
+  await expect(page.getByText(/Minimum|Include/)).not.toBeVisible();
+});
+
+test("validates custom business rule — minimum amount", async ({ page }) => {
+  await page.goto("/transfer");
+
+  await page.getByLabel("Amount").fill("5");
+  await page.getByLabel("Amount").blur();
+  await expect(page.getByText("Minimum transfer is $10")).toBeVisible();
+
+  await page.getByLabel("Amount").fill("1000000");
+  await page.getByLabel("Amount").blur();
+  await expect(page.getByText("Maximum transfer is $100,000")).toBeVisible();
+
+  await page.getByLabel("Amount").fill("500");
+  await page.getByLabel("Amount").blur();
+  await expect(page.getByText(/Minimum|Maximum/)).not.toBeVisible();
+});
+```
+
+### Form Reset Testing
+
+**Use when**: Testing "clear form" or "reset" functionality, verifying that fields return to their default values.
+
+```typescript
+test("reset button clears all fields to defaults", async ({ page }) => {
+  await page.goto("/preferences");
+
+  await page.getByLabel("Nickname").fill("CustomNick");
+  await page.getByLabel("Language").selectOption("es");
+  await page.getByLabel("Email alerts").uncheck();
+
+  await page.getByRole("button", { name: "Reset" }).click();
+
+  await expect(page.getByLabel("Nickname")).toHaveValue("");
+  await expect(page.getByLabel("Language")).toHaveValue("en");
+  await expect(page.getByLabel("Email alerts")).toBeChecked();
+});
+
+test("confirmation dialog before resetting a dirty form", async ({ page }) => {
+  await page.goto("/document");
+
+  await page.getByLabel("Document title").fill("Draft document");
+
+  page.on("dialog", (dialog) => dialog.accept());
+  await page.getByRole("button", { name: "Clear changes" }).click();
+
+  await expect(page.getByLabel("Document title")).toHaveValue("");
+});
+```
+
+## Decision Guide
+
+| Scenario                             | Approach                                               | Key API                                                      |
+| ------------------------------------ | ------------------------------------------------------ | ------------------------------------------------------------ |
+| Standard text input                  | `fill()` (clears, then types)                          | `page.getByLabel('Field').fill('value')`                     |
+| Need keystroke events (autocomplete) | `pressSequentially()` with delay                       | `locator.pressSequentially('text', { delay: 100 })`          |
+| Native `<select>` dropdown           | `selectOption()` by value or label                     | `locator.selectOption('US')` or `{ label: 'United States' }` |
+| Custom dropdown (ARIA listbox)       | Click trigger, then select option role                 | `getByRole('option', { name: '...' }).click()`               |
+| Checkbox                             | `check()` / `uncheck()` (idempotent)                   | `locator.check()` — safe to call even if already checked     |
+| Radio button                         | `check()` on the target radio                          | `page.getByLabel('Option').check()`                          |
+| Date input (native)                  | `fill()` with ISO format                               | `locator.fill('2025-03-15')`                                 |
+| Date picker (third-party)            | Click to open, navigate, select day                    | `getByRole('gridcell', { name: '15' }).click()`              |
+| Validation errors                    | Submit, then assert error text                         | `expect(page.getByText('Required')).toBeVisible()`           |
+| Multi-step wizard                    | `test.step()` per step, assert heading                 | `await test.step('Step 1', async () => { ... })`             |
+| Conditional/dynamic fields           | Change trigger field, assert new field visibility      | `expect(locator).toBeVisible()` / `.not.toBeVisible()`       |
+| Form submission                      | `waitForResponse` + click submit                       | Register response listener before click                      |
+| Auto-complete                        | `pressSequentially()`, wait for listbox, select option | `getByRole('option', { name }).click()`                      |
+| Form reset                           | Click reset, assert default values                     | `expect(locator).toHaveValue('')`                            |
+
+## Anti-Patterns
+
+| Don't Do This                                           | Problem                                                    | Do This Instead                                        |
+| ------------------------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------ |
+| `await page.getByLabel('Field').type('value')`          | `type()` appends to existing content; does not clear first | `await page.getByLabel('Field').fill('value')`         |
+| `await page.getByLabel('Option').click()`               | `click()` toggles — if already checked, it unchecks        | `await page.getByLabel('Option').check()`              |
+| `await page.fill('#email', 'test@test.com')`            | CSS selector is fragile                                    | `await page.getByLabel('Email').fill('test@test.com')` |
+| `await page.selectOption('select', 'US')` without label | Targets first `<select>` on page; ambiguous                | `await page.getByLabel('Country').selectOption('US')`  |
+| Testing every invalid input in one test                 | Test becomes huge, slow, and hard to debug                 | One test per validation rule or group related rules    |
+| `expect(await input.inputValue()).toBe('value')`        | Resolves once — no retry. Race condition.                  | `await expect(input).toHaveValue('value')`             |
+| Filling fields with `page.evaluate()`                   | Bypasses event handlers (no `input`, `change` events fire) | Use `fill()` or `pressSequentially()`                  |
+| Not waiting for conditional fields before filling       | `fill()` fails on hidden/detached elements                 | `await expect(field).toBeVisible()` first              |
+| Hardcoding wait after selecting a dropdown              | `waitForTimeout(500)` is flaky and slow                    | Wait for the dependent element to appear               |
+| Skipping server-side validation tests                   | Client-side validation can be bypassed                     | Test both client-side UX and server response           |
+
+## Troubleshooting
+
+### `fill()` does nothing or clears but doesn't type
+
+**Cause**: The input field uses a contenteditable div (rich text editors), not a real `<input>` or `<textarea>`.
+
+```typescript
+const isContentEditable = await page
+  .getByTestId("editor")
+  .evaluate((el) => el.getAttribute("contenteditable"));
+
+if (isContentEditable) {
+  await page.getByTestId("editor").click();
+  await page.getByTestId("editor").pressSequentially("Hello world");
+}
+```
+
+### Date picker does not accept `fill()` value
+
+**Cause**: Third-party date pickers often render custom UI over a hidden input. `fill()` sets the hidden input but the UI does not update.
+
+```typescript
+await page.getByLabel("Date").click();
+await page.getByRole("button", { name: "Next month" }).click();
+await page.getByRole("gridcell", { name: "15" }).click();
+
+// Alternatively, if the library reads from the input on change:
+await page.getByLabel("Date").fill("2025-06-15");
+await page.getByLabel("Date").dispatchEvent("change");
+```
+
+### `selectOption()` throws "not a select element"
+
+**Cause**: The dropdown is a custom component (ARIA listbox), not a native `<select>`.
+
+```typescript
+await page.getByRole("combobox", { name: "Country" }).click();
+await page.getByRole("option", { name: "United States" }).click();
+```
+
+### Validation errors do not appear after `fill()` and submit
+
+**Cause**: The validation triggers on `blur` (focus leaving the field), but `fill()` does not trigger blur automatically.
+
+```typescript
+await page.getByLabel("Email").fill("invalid");
+await page.getByLabel("Email").blur();
+await expect(page.getByText("Enter a valid email")).toBeVisible();
+
+// Or move focus to the next field
+await page.getByLabel("Password").focus();
+```
diff --git a/.cursor/skills/playwright-testing/testing-patterns/graphql-testing.md b/.cursor/skills/playwright-testing/testing-patterns/graphql-testing.md
new file mode 100644
index 0000000000..d2d35044d5
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/graphql-testing.md
@@ -0,0 +1,331 @@
+# GraphQL Testing
+
+## Table of Contents
+
+1. [Patterns](#patterns)
+2. [Anti-Patterns](#anti-patterns)
+3. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Testing GraphQL APIs — queries, mutations, variables, and error handling.
+
+## Patterns
+
+### Basic Query with Variables
+
+All GraphQL requests go through `POST` to a single endpoint. Send `query`, `variables`, and optionally `operationName` in the JSON body.
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+const GQL_ENDPOINT = "/graphql";
+
+test("query with variables", async ({ request }) => {
+  const resp = await request.post(GQL_ENDPOINT, {
+    data: {
+      query: `
+        query FetchItem($id: ID!) {
+          item(id: $id) {
+            id
+            title
+            price
+            reviews { id rating }
+          }
+        }
+      `,
+      variables: { id: "101" },
+    },
+  });
+
+  expect(resp.ok()).toBeTruthy();
+  const { data, errors } = await resp.json();
+
+  // GraphQL returns 200 even on errors — always check both
+  expect(errors).toBeUndefined();
+  expect(data.item).toMatchObject({
+    id: "101",
+    title: expect.any(String),
+    price: expect.any(Number),
+  });
+  expect(data.item.reviews).toEqual(
+    expect.arrayContaining([
+      expect.objectContaining({
+        id: expect.any(String),
+        rating: expect.any(Number),
+      }),
+    ])
+  );
+});
+```
+
+### Mutations
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+const GQL_ENDPOINT = "/graphql";
+
+test("mutation creates resource", async ({ request }) => {
+  const resp = await request.post(GQL_ENDPOINT, {
+    data: {
+      query: `
+        mutation AddItem($input: ItemInput!) {
+          addItem(input: $input) {
+            id
+            title
+            status
+          }
+        }
+      `,
+      variables: {
+        input: {
+          title: "New Widget",
+          price: 15.0,
+          status: "DRAFT",
+        },
+      },
+    },
+  });
+
+  const { data, errors } = await resp.json();
+  expect(errors).toBeUndefined();
+  expect(data.addItem).toMatchObject({
+    id: expect.any(String),
+    title: "New Widget",
+    status: "DRAFT",
+  });
+});
+```
+
+### Validation Errors
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+const GQL_ENDPOINT = "/graphql";
+
+test("handles validation errors", async ({ request }) => {
+  const resp = await request.post(GQL_ENDPOINT, {
+    data: {
+      query: `
+        mutation AddItem($input: ItemInput!) {
+          addItem(input: $input) { id }
+        }
+      `,
+      variables: { input: { title: "" } },
+    },
+  });
+
+  const { data, errors } = await resp.json();
+  expect(errors).toBeDefined();
+  expect(errors.length).toBeGreaterThan(0);
+  expect(errors[0].message).toContain("title");
+  expect(errors[0].extensions?.code).toBe("BAD_USER_INPUT");
+});
+```
+
+### Authorization Errors
+
+```typescript
+import { test, expect } from "@playwright/test";
+
+const GQL_ENDPOINT = "/graphql";
+
+test("handles authorization errors", async ({ request }) => {
+  const resp = await request.post(GQL_ENDPOINT, {
+    data: {
+      query: `
+        query AdminDashboard {
+          adminMetrics { revenue activeUsers }
+        }
+      `,
+    },
+  });
+
+  const { data, errors } = await resp.json();
+  expect(errors).toBeDefined();
+  expect(errors[0].extensions?.code).toBe("UNAUTHORIZED");
+  expect(data?.adminMetrics).toBeNull();
+});
+```
+
+### Authenticated GraphQL Fixture
+
+```typescript
+// fixtures/graphql-fixtures.ts
+import { test as base, expect, APIRequestContext } from "@playwright/test";
+
+type GraphQLFixtures = {
+  gqlClient: APIRequestContext;
+  adminGqlClient: APIRequestContext;
+};
+
+export const test = base.extend<GraphQLFixtures>({
+  gqlClient: async ({ playwright }, use) => {
+    const ctx = await playwright.request.newContext({
+      baseURL: "https://api.myapp.io",
+      extraHTTPHeaders: {
+        Authorization: `Bearer ${process.env.API_TOKEN}`,
+        "Content-Type": "application/json",
+      },
+    });
+    await use(ctx);
+    await ctx.dispose();
+  },
+
+  adminGqlClient: async ({ playwright }, use) => {
+    const loginCtx = await playwright.request.newContext({
+      baseURL: "https://api.myapp.io",
+    });
+    const loginResp = await loginCtx.post("/graphql", {
+      data: {
+        query: `
+          mutation Login($email: String!, $password: String!) {
+            login(email: $email, password: $password) { token }
+          }
+        `,
+        variables: {
+          email: process.env.ADMIN_EMAIL,
+          password: process.env.ADMIN_PASSWORD,
+        },
+      },
+    });
+    const { data } = await loginResp.json();
+    
+    if (!data?.login?.token) {
+      throw new Error(`Admin login failed: status ${loginResp.status()}, response: ${JSON.stringify(data)}`);
+    }
+    
+    await loginCtx.dispose();
+
+    const ctx = await playwright.request.newContext({
+      baseURL: "https://api.myapp.io",
+      extraHTTPHeaders: {
+        Authorization: `Bearer ${data.login.token}`,
+        "Content-Type": "application/json",
+      },
+    });
+    await use(ctx);
+    await ctx.dispose();
+  },
+});
+
+export { expect };
+```
+
+### GraphQL Helper Function
+
+```typescript
+// utils/graphql.ts
+import { APIRequestContext, expect } from "@playwright/test";
+
+export async function gqlQuery<T = any>(
+  request: APIRequestContext,
+  query: string,
+  variables?: Record<string, any>
+): Promise<{ data: T; errors?: any[] }> {
+  const resp = await request.post("/graphql", {
+    data: { query, variables },
+  });
+  expect(resp.ok()).toBeTruthy();
+  return resp.json();
+}
+
+export async function gqlMutation<T = any>(
+  request: APIRequestContext,
+  mutation: string,
+  variables?: Record<string, any>
+): Promise<{ data: T; errors?: any[] }> {
+  return gqlQuery<T>(request, mutation, variables);
+}
+```
+
+```typescript
+// tests/api/items.spec.ts
+import { test, expect } from "@playwright/test";
+import { gqlQuery, gqlMutation } from "../../utils/graphql";
+
+test("fetch and update item", async ({ request }) => {
+  const { data: fetchData } = await gqlQuery(
+    request,
+    `query GetItem($id: ID!) { item(id: $id) { id title } }`,
+    { id: "101" }
+  );
+  expect(fetchData.item.title).toBeDefined();
+
+  const { data: updateData, errors } = await gqlMutation(
+    request,
+    `mutation UpdateItem($id: ID!, $title: String!) {
+      updateItem(id: $id, title: $title) { id title }
+    }`,
+    { id: "101", title: "Updated Title" }
+  );
+  expect(errors).toBeUndefined();
+  expect(updateData.updateItem.title).toBe("Updated Title");
+});
+```
+
+## Anti-Patterns
+
+| Don't Do This | Problem | Do This Instead |
+| --- | --- | --- |
+| Check only `response.ok()` | GraphQL returns 200 even on errors — `errors` array is the real signal | Always check both `data` and `errors` in the response body |
+| Ignore `errors` array | Validation and auth errors appear in `errors`, not HTTP status | Destructure and assert: `expect(errors).toBeUndefined()` |
+| Hardcode query strings inline everywhere | Duplicated queries are hard to maintain | Extract queries to constants or use a helper function |
+| Skip variable validation | Invalid variables cause cryptic server errors | Validate input shape before sending |
+
+## Troubleshooting
+
+### GraphQL returns 200 but data is null
+
+**Cause**: GraphQL servers return HTTP 200 even when the query has errors. The actual error is in the `errors` array.
+
+**Fix**: Always destructure and check both `data` and `errors`.
+
+```typescript
+const { data, errors } = await resp.json();
+if (errors) {
+  console.error("GraphQL errors:", JSON.stringify(errors, null, 2));
+}
+expect(errors).toBeUndefined();
+expect(data.item).toBeDefined();
+```
+
+### "Cannot query field X on type Y"
+
+**Cause**: The field doesn't exist in the schema, or you're querying the wrong type.
+
+**Fix**: Verify the schema. Use introspection or check your GraphQL IDE for available fields.
+
+```typescript
+// Introspection query to debug schema
+const { data } = await request.post("/graphql", {
+  data: {
+    query: `{ __type(name: "Item") { fields { name type { name } } } }`,
+  },
+});
+console.log(data.__type.fields);
+```
+
+### Variables not being applied
+
+**Cause**: Variable names in the query don't match the `variables` object keys, or types don't match.
+
+**Fix**: Ensure variable names match exactly (case-sensitive) and types align with the schema.
+
+```typescript
+// Wrong: variable name mismatch
+const resp = await request.post("/graphql", {
+  data: {
+    query: `query GetItem($itemId: ID!) { item(id: $itemId) { id } }`,
+    variables: { id: "101" }, // Should be { itemId: "101" }
+  },
+});
+
+// Correct
+const resp = await request.post("/graphql", {
+  data: {
+    query: `query GetItem($itemId: ID!) { item(id: $itemId) { id } }`,
+    variables: { itemId: "101" },
+  },
+});
+```
diff --git a/.cursor/skills/playwright-testing/testing-patterns/i18n.md b/.cursor/skills/playwright-testing/testing-patterns/i18n.md
new file mode 100644
index 0000000000..686cdc33f9
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/i18n.md
@@ -0,0 +1,508 @@
+# Internationalization (i18n) Testing
+
+## Table of Contents
+
+1. [Locale Configuration](#locale-configuration)
+2. [Testing Multiple Locales](#testing-multiple-locales)
+3. [RTL Layout Testing](#rtl-layout-testing)
+4. [Date, Time & Number Formats](#date-time--number-formats)
+5. [Translation Verification](#translation-verification)
+6. [Visual Regression for i18n](#visual-regression-for-i18n)
+
+## Locale Configuration
+
+### Setting Browser Locale
+
+```typescript
+// playwright.config.ts
+import { defineConfig, devices } from "@playwright/test";
+
+export default defineConfig({
+  projects: [
+    {
+      name: "english",
+      use: {
+        ...devices["Desktop Chrome"],
+        locale: "en-US",
+        timezoneId: "America/New_York",
+      },
+    },
+    {
+      name: "german",
+      use: {
+        ...devices["Desktop Chrome"],
+        locale: "de-DE",
+        timezoneId: "Europe/Berlin",
+      },
+    },
+    {
+      name: "japanese",
+      use: {
+        ...devices["Desktop Chrome"],
+        locale: "ja-JP",
+        timezoneId: "Asia/Tokyo",
+      },
+    },
+    {
+      name: "arabic",
+      use: {
+        ...devices["Desktop Chrome"],
+        locale: "ar-SA",
+        timezoneId: "Asia/Riyadh",
+      },
+    },
+  ],
+});
+```
+
+### Per-Test Locale Override
+
+```typescript
+test("test in French locale", async ({ browser }) => {
+  const context = await browser.newContext({
+    locale: "fr-FR",
+    timezoneId: "Europe/Paris",
+  });
+
+  const page = await context.newPage();
+  await page.goto("/");
+
+  // Verify French content
+  await expect(page.getByRole("button", { name: "Connexion" })).toBeVisible();
+
+  await context.close();
+});
+```
+
+### Accept-Language Header
+
+```typescript
+test("server-side locale detection", async ({ browser }) => {
+  const context = await browser.newContext({
+    locale: "es-ES",
+    extraHTTPHeaders: {
+      "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
+    },
+  });
+
+  const page = await context.newPage();
+  await page.goto("/");
+
+  // Server should respond with Spanish content
+  await expect(page.locator("html")).toHaveAttribute("lang", "es");
+});
+```
+
+## Testing Multiple Locales
+
+### Parameterized Locale Tests
+
+```typescript
+const locales = [
+  { locale: "en-US", greeting: "Hello", button: "Sign In" },
+  { locale: "de-DE", greeting: "Hallo", button: "Anmelden" },
+  { locale: "fr-FR", greeting: "Bonjour", button: "Se connecter" },
+  { locale: "ja-JP", greeting: "こんにちは", button: "ログイン" },
+];
+
+for (const { locale, greeting, button } of locales) {
+  test(`login page in ${locale}`, async ({ browser }) => {
+    const context = await browser.newContext({ locale });
+    const page = await context.newPage();
+
+    await page.goto("/login");
+
+    await expect(page.getByText(greeting)).toBeVisible();
+    await expect(page.getByRole("button", { name: button })).toBeVisible();
+
+    await context.close();
+  });
+}
+```
+
+### Locale Fixture
+
+```typescript
+// fixtures/i18n.ts
+import { test as base } from "@playwright/test";
+
+type LocaleFixtures = {
+  localePage: (locale: string) => Promise<Page>;
+};
+
+export const test = base.extend<LocaleFixtures>({
+  localePage: async ({ browser }, use) => {
+    const pages: Page[] = [];
+
+    const createLocalePage = async (locale: string) => {
+      const context = await browser.newContext({ locale });
+      const page = await context.newPage();
+      pages.push(page);
+      return page;
+    };
+
+    await use(createLocalePage);
+
+    // Cleanup
+    for (const page of pages) {
+      await page.context().close();
+    }
+  },
+});
+
+// Usage
+test("compare locales", async ({ localePage }) => {
+  const enPage = await localePage("en-US");
+  const dePage = await localePage("de-DE");
+
+  await enPage.goto("/pricing");
+  await dePage.goto("/pricing");
+
+  const enPrice = await enPage.getByTestId("price").textContent();
+  const dePrice = await dePage.getByTestId("price").textContent();
+
+  expect(enPrice).toContain("$");
+  expect(dePrice).toContain("€");
+});
+```
+
+### Testing Locale Switching
+
+```typescript
+test("user can switch locale", async ({ page }) => {
+  await page.goto("/");
+
+  // Initial locale (from browser)
+  await expect(page.locator("html")).toHaveAttribute("lang", "en");
+
+  // Switch to German
+  await page.getByRole("button", { name: "Language" }).click();
+  await page.getByRole("menuitem", { name: "Deutsch" }).click();
+
+  // Verify switch
+  await expect(page.locator("html")).toHaveAttribute("lang", "de");
+  await expect(page.getByRole("heading", { level: 1 })).toContainText(
+    /Willkommen/,
+  );
+
+  // Verify persistence (reload)
+  await page.reload();
+  await expect(page.locator("html")).toHaveAttribute("lang", "de");
+});
+```
+
+## RTL Layout Testing
+
+### Setting Up RTL Tests
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    {
+      name: "rtl-arabic",
+      use: {
+        locale: "ar-SA",
+        // RTL is usually set by the app based on locale
+      },
+    },
+    {
+      name: "rtl-hebrew",
+      use: {
+        locale: "he-IL",
+      },
+    },
+  ],
+});
+```
+
+### Verifying RTL Direction
+
+```typescript
+test("RTL layout is applied", async ({ page }) => {
+  await page.goto("/");
+
+  // Check document direction
+  await expect(page.locator("html")).toHaveAttribute("dir", "rtl");
+
+  // Or check computed style
+  const direction = await page.evaluate(() => {
+    return window.getComputedStyle(document.body).direction;
+  });
+  expect(direction).toBe("rtl");
+});
+```
+
+### RTL-Specific Element Positioning
+
+```typescript
+test("sidebar is on the right in RTL", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  const sidebar = page.getByTestId("sidebar");
+  const main = page.getByTestId("main-content");
+
+  const sidebarBox = await sidebar.boundingBox();
+  const mainBox = await main.boundingBox();
+
+  // In RTL, sidebar should be to the right of main content
+  expect(sidebarBox!.x).toBeGreaterThan(mainBox!.x);
+});
+```
+
+### RTL Visual Regression
+
+```typescript
+test("RTL layout matches snapshot", async ({ page }) => {
+  await page.goto("/");
+
+  // Screenshot for RTL comparison
+  await expect(page).toHaveScreenshot("homepage-rtl.png", {
+    // Separate snapshots per locale/direction
+    fullPage: true,
+  });
+});
+
+// LTR comparison
+test("LTR layout matches snapshot", async ({ browser }) => {
+  const context = await browser.newContext({ locale: "en-US" });
+  const page = await context.newPage();
+
+  await page.goto("/");
+  await expect(page).toHaveScreenshot("homepage-ltr.png", { fullPage: true });
+});
+```
+
+### Testing Bidirectional Text
+
+```typescript
+test("bidirectional text renders correctly", async ({ page }) => {
+  await page.goto("/profile");
+
+  // Mixed LTR/RTL content
+  const nameField = page.getByTestId("full-name");
+
+  // Arabic name with English email
+  await expect(nameField).toContainText("محمد (mohammed@example.com)");
+
+  // Verify text doesn't overlap or break
+  const box = await nameField.boundingBox();
+  expect(box!.width).toBeGreaterThan(100); // Content not collapsed
+});
+```
+
+## Date, Time & Number Formats
+
+### Testing Date Formats
+
+```typescript
+test("dates are formatted per locale", async ({ browser }) => {
+  const testDate = new Date("2024-03-15");
+
+  const formats = [
+    { locale: "en-US", expected: "March 15, 2024" },
+    { locale: "en-GB", expected: "15 March 2024" },
+    { locale: "de-DE", expected: "15. März 2024" },
+    { locale: "ja-JP", expected: "2024年3月15日" },
+  ];
+
+  for (const { locale, expected } of formats) {
+    const context = await browser.newContext({ locale });
+    const page = await context.newPage();
+
+    await page.goto(`/event?date=${testDate.toISOString()}`);
+
+    const dateDisplay = page.getByTestId("event-date");
+    await expect(dateDisplay).toContainText(expected);
+
+    await context.close();
+  }
+});
+```
+
+### Testing Number Formats
+
+```typescript
+test("numbers are formatted per locale", async ({ browser }) => {
+  const testNumber = 1234567.89;
+
+  const formats = [
+    { locale: "en-US", expected: "1,234,567.89" },
+    { locale: "de-DE", expected: "1.234.567,89" },
+    { locale: "fr-FR", expected: "1 234 567,89" },
+  ];
+
+  for (const { locale, expected } of formats) {
+    const context = await browser.newContext({ locale });
+    const page = await context.newPage();
+
+    await page.goto(`/stats?value=${testNumber}`);
+
+    await expect(page.getByTestId("formatted-number")).toHaveText(expected);
+
+    await context.close();
+  }
+});
+```
+
+### Testing Currency Formats
+
+```typescript
+test("currency displays correctly", async ({ browser }) => {
+  const price = 99.99;
+
+  const currencies = [
+    { locale: "en-US", currency: "USD", expected: "$99.99" },
+    { locale: "de-DE", currency: "EUR", expected: "99,99 €" },
+    { locale: "ja-JP", currency: "JPY", expected: "￥100" }, // JPY has no decimals
+    { locale: "en-GB", currency: "GBP", expected: "£99.99" },
+  ];
+
+  for (const { locale, currency, expected } of currencies) {
+    const context = await browser.newContext({ locale });
+    const page = await context.newPage();
+
+    await page.goto(`/product?price=${price}&currency=${currency}`);
+
+    await expect(page.getByTestId("price")).toContainText(expected);
+
+    await context.close();
+  }
+});
+```
+
+## Translation Verification
+
+### Checking for Missing Translations
+
+```typescript
+test("no missing translations", async ({ page }) => {
+  await page.goto("/");
+
+  // Common patterns for missing translations
+  const missingPatterns = [
+    /\{\{.*\}\}/, // Handlebars-style
+    /\$\{.*\}/, // Template literal style
+    /t\(["'][\w.]+["']\)/, // i18n key exposed
+    /MISSING_TRANSLATION/, // Common placeholder
+    /\[UNTRANSLATED\]/, // Another placeholder
+  ];
+
+  const bodyText = await page.locator("body").textContent();
+
+  for (const pattern of missingPatterns) {
+    expect(bodyText).not.toMatch(pattern);
+  }
+});
+```
+
+### Detecting Text Overflow
+
+```typescript
+test("translations fit UI containers", async ({ browser }) => {
+  const locales = ["en-US", "de-DE", "fr-FR", "es-ES"];
+  const issues: string[] = [];
+
+  for (const locale of locales) {
+    const context = await browser.newContext({ locale });
+    const page = await context.newPage();
+    await page.goto("/");
+
+    const overflowing = await page.evaluate(() => {
+      const elements = document.querySelectorAll("button, .label, h1, h2, h3");
+      return Array.from(elements)
+        .filter(
+          (el) =>
+            (el as HTMLElement).scrollWidth > (el as HTMLElement).clientWidth,
+        )
+        .map((el) => `${el.tagName}: "${el.textContent?.substring(0, 20)}..."`);
+    });
+
+    if (overflowing.length > 0)
+      issues.push(`${locale}: ${overflowing.join(", ")}`);
+    await context.close();
+  }
+
+  expect(issues).toEqual([]);
+});
+```
+
+## Visual Regression for i18n
+
+### Locale-Specific Snapshots
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  snapshotPathTemplate:
+    "{testDir}/__snapshots__/{projectName}/{testFilePath}/{arg}{ext}",
+
+  projects: [
+    { name: "en-US", use: { locale: "en-US" } },
+    { name: "de-DE", use: { locale: "de-DE" } },
+    { name: "ja-JP", use: { locale: "ja-JP" } },
+    { name: "ar-SA", use: { locale: "ar-SA" } },
+  ],
+});
+```
+
+```typescript
+// test file
+test("homepage visual", async ({ page }) => {
+  await page.goto("/");
+
+  // Snapshot auto-saved to {projectName}/homepage.png
+  await expect(page).toHaveScreenshot("homepage.png");
+});
+```
+
+### Critical Element Screenshots
+
+```typescript
+test("navigation in all locales", async ({ page }) => {
+  await page.goto("/");
+
+  // Just the nav - catches overflow, truncation
+  const nav = page.getByRole("navigation");
+  await expect(nav).toHaveScreenshot("navigation.png");
+});
+
+test("buttons dont truncate", async ({ page }) => {
+  await page.goto("/checkout");
+
+  const ctaButton = page.getByRole("button", {
+    name: /checkout|kaufen|acheter/i,
+  });
+  await expect(ctaButton).toHaveScreenshot("checkout-button.png");
+});
+```
+
+### Font Loading for i18n
+
+```typescript
+test("wait for fonts before screenshot", async ({ page }) => {
+  await page.goto("/");
+
+  // Wait for fonts (important for CJK, Arabic)
+  await page.evaluate(() => document.fonts.ready);
+  await page.waitForFunction(() =>
+    document.fonts.check("16px 'Noto Sans Arabic'"),
+  );
+
+  await expect(page).toHaveScreenshot("with-fonts.png");
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern              | Problem                         | Solution                        |
+| ------------------------- | ------------------------------- | ------------------------------- |
+| Hardcoded text assertions | Breaks in other locales         | Use test IDs or parameterize    |
+| Single locale testing     | Misses i18n bugs                | Test multiple locales           |
+| Ignoring RTL              | Layout broken for RTL users     | Dedicated RTL project           |
+| No font wait              | Screenshots with fallback fonts | Wait for `document.fonts.ready` |
+
+## Related References
+
+- **Clock Mocking**: See [clock-mocking.md](../advanced/clock-mocking.md) for timezone testing
+- **Mobile Testing**: See [mobile-testing.md](../advanced/mobile-testing.md) for device-specific locales
diff --git a/.cursor/skills/playwright-testing/testing-patterns/performance-testing.md b/.cursor/skills/playwright-testing/testing-patterns/performance-testing.md
new file mode 100644
index 0000000000..50135b49db
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/performance-testing.md
@@ -0,0 +1,476 @@
+# Performance Testing & Web Vitals
+
+## Table of Contents
+
+1. [Core Web Vitals](#core-web-vitals)
+2. [Performance Metrics](#performance-metrics)
+3. [Performance Budgets](#performance-budgets)
+4. [Lighthouse Integration](#lighthouse-integration)
+5. [Performance Fixtures](#performance-fixtures)
+6. [CI Performance Monitoring](#ci-performance-monitoring)
+
+## Core Web Vitals
+
+### Measure LCP, FID, CLS
+
+```typescript
+test("core web vitals within thresholds", async ({ page }) => {
+  // Inject web-vitals library
+  await page.addInitScript(() => {
+    (window as any).__webVitals = {};
+
+    // Simplified web vitals collection
+    new PerformanceObserver((list) => {
+      for (const entry of list.getEntries()) {
+        if (entry.entryType === "largest-contentful-paint") {
+          (window as any).__webVitals.lcp = entry.startTime;
+        }
+      }
+    }).observe({ type: "largest-contentful-paint", buffered: true });
+
+    new PerformanceObserver((list) => {
+      let cls = 0;
+      for (const entry of list.getEntries() as any[]) {
+        if (!entry.hadRecentInput) {
+          cls += entry.value;
+        }
+      }
+      (window as any).__webVitals.cls = cls;
+    }).observe({ type: "layout-shift", buffered: true });
+  });
+
+  await page.goto("/");
+
+  // Wait for page to stabilize
+  await page.waitForLoadState("networkidle");
+
+  // Get metrics
+  const vitals = await page.evaluate(() => (window as any).__webVitals);
+
+  // Assert thresholds (Google's "good" thresholds)
+  expect(vitals.lcp).toBeLessThan(2500); // LCP < 2.5s
+  expect(vitals.cls).toBeLessThan(0.1); // CLS < 0.1
+});
+```
+
+### Using web-vitals Library
+
+```typescript
+test("web vitals with library", async ({ page }) => {
+  await page.addInitScript(() => {
+    (window as any).__vitals = {};
+  });
+
+  // Inject web-vitals after navigation
+  await page.goto("/");
+
+  await page.addScriptTag({
+    url: "https://unpkg.com/web-vitals@3/dist/web-vitals.iife.js",
+  });
+
+  await page.evaluate(() => {
+    const { onLCP, onFID, onCLS, onFCP, onTTFB } = (window as any).webVitals;
+
+    onLCP((metric: any) => ((window as any).__vitals.lcp = metric.value));
+    onFID((metric: any) => ((window as any).__vitals.fid = metric.value));
+    onCLS((metric: any) => ((window as any).__vitals.cls = metric.value));
+    onFCP((metric: any) => ((window as any).__vitals.fcp = metric.value));
+    onTTFB((metric: any) => ((window as any).__vitals.ttfb = metric.value));
+  });
+
+  // Trigger FID by clicking
+  await page.getByRole("button").first().click();
+
+  // Wait and collect
+  await page.waitForTimeout(1000);
+
+  const vitals = await page.evaluate(() => (window as any).__vitals);
+
+  console.log("Web Vitals:", vitals);
+
+  // Assertions
+  if (vitals.lcp) expect(vitals.lcp).toBeLessThan(2500);
+  if (vitals.fid) expect(vitals.fid).toBeLessThan(100);
+  if (vitals.cls) expect(vitals.cls).toBeLessThan(0.1);
+});
+```
+
+## Performance Metrics
+
+### Navigation Timing
+
+```typescript
+test("page load performance", async ({ page }) => {
+  await page.goto("/");
+
+  const timing = await page.evaluate(() => {
+    const nav = performance.getEntriesByType(
+      "navigation",
+    )[0] as PerformanceNavigationTiming;
+
+    return {
+      // Time to First Byte
+      ttfb: nav.responseStart - nav.requestStart,
+      // DOM Content Loaded
+      domContentLoaded: nav.domContentLoadedEventEnd - nav.startTime,
+      // Full page load
+      loadComplete: nav.loadEventEnd - nav.startTime,
+      // DNS lookup
+      dns: nav.domainLookupEnd - nav.domainLookupStart,
+      // Connection time
+      connection: nav.connectEnd - nav.connectStart,
+      // Download time
+      download: nav.responseEnd - nav.responseStart,
+      // DOM processing
+      domProcessing: nav.domComplete - nav.domInteractive,
+    };
+  });
+
+  console.log("Performance timing:", timing);
+
+  // Assertions
+  expect(timing.ttfb).toBeLessThan(600); // TTFB < 600ms
+  expect(timing.domContentLoaded).toBeLessThan(2000); // DCL < 2s
+  expect(timing.loadComplete).toBeLessThan(4000); // Load < 4s
+});
+```
+
+### Resource Timing
+
+```typescript
+test("resource loading performance", async ({ page }) => {
+  await page.goto("/");
+
+  const resources = await page.evaluate(() => {
+    return performance.getEntriesByType("resource").map((entry) => ({
+      name: entry.name.split("/").pop(),
+      type: (entry as PerformanceResourceTiming).initiatorType,
+      duration: entry.duration,
+      size: (entry as PerformanceResourceTiming).transferSize,
+    }));
+  });
+
+  // Find slow resources
+  const slowResources = resources.filter((r) => r.duration > 1000);
+
+  if (slowResources.length > 0) {
+    console.warn("Slow resources:", slowResources);
+  }
+
+  // Find large resources
+  const largeResources = resources.filter((r) => r.size > 500000); // > 500KB
+
+  expect(largeResources.length).toBe(0);
+});
+```
+
+### Memory Usage
+
+```typescript
+test("memory usage is reasonable", async ({ page }) => {
+  await page.goto("/dashboard");
+
+  // Check memory (Chrome only)
+  const memory = await page.evaluate(() => {
+    if ((performance as any).memory) {
+      return {
+        usedJSHeapSize: (performance as any).memory.usedJSHeapSize,
+        totalJSHeapSize: (performance as any).memory.totalJSHeapSize,
+      };
+    }
+    return null;
+  });
+
+  if (memory) {
+    const usedMB = memory.usedJSHeapSize / 1024 / 1024;
+    console.log(`Memory usage: ${usedMB.toFixed(2)} MB`);
+
+    // Assert reasonable memory usage
+    expect(usedMB).toBeLessThan(100); // < 100MB
+  }
+});
+```
+
+## Performance Budgets
+
+### Define Budgets
+
+```typescript
+// performance-budgets.ts
+export const budgets = {
+  homepage: {
+    lcp: 2500,
+    cls: 0.1,
+    fcp: 1800,
+    ttfb: 600,
+    totalSize: 1500000, // 1.5MB
+    jsSize: 500000, // 500KB
+    imageCount: 20,
+  },
+  dashboard: {
+    lcp: 3000,
+    cls: 0.1,
+    fcp: 2000,
+    ttfb: 800,
+    totalSize: 2000000,
+    jsSize: 800000,
+  },
+};
+```
+
+### Test Against Budgets
+
+```typescript
+import { budgets } from "./performance-budgets";
+
+test("homepage meets performance budget", async ({ page }) => {
+  const budget = budgets.homepage;
+
+  await page.goto("/");
+  await page.waitForLoadState("networkidle");
+
+  // Measure LCP
+  const lcp = await page.evaluate(() => {
+    return new Promise<number>((resolve) => {
+      new PerformanceObserver((list) => {
+        const entries = list.getEntries();
+        resolve(entries[entries.length - 1].startTime);
+      }).observe({ type: "largest-contentful-paint", buffered: true });
+    });
+  });
+
+  // Measure resources
+  const resources = await page.evaluate(() => {
+    const entries = performance.getEntriesByType(
+      "resource",
+    ) as PerformanceResourceTiming[];
+    return {
+      totalSize: entries.reduce((sum, e) => sum + (e.transferSize || 0), 0),
+      jsSize: entries
+        .filter((e) => e.initiatorType === "script")
+        .reduce((sum, e) => sum + (e.transferSize || 0), 0),
+      imageCount: entries.filter((e) => e.initiatorType === "img").length,
+    };
+  });
+
+  // Assert budgets
+  expect(lcp, "LCP exceeds budget").toBeLessThan(budget.lcp);
+  expect(resources.totalSize, "Total size exceeds budget").toBeLessThan(
+    budget.totalSize,
+  );
+  expect(resources.jsSize, "JS size exceeds budget").toBeLessThan(
+    budget.jsSize,
+  );
+  expect(resources.imageCount, "Too many images").toBeLessThanOrEqual(
+    budget.imageCount,
+  );
+});
+```
+
+### Budget Fixture
+
+```typescript
+// fixtures/performance.fixture.ts
+type PerformanceBudget = {
+  lcp?: number;
+  cls?: number;
+  ttfb?: number;
+  totalSize?: number;
+};
+
+type PerformanceFixtures = {
+  assertBudget: (budget: PerformanceBudget) => Promise<void>;
+};
+
+export const test = base.extend<PerformanceFixtures>({
+  assertBudget: async ({ page }, use) => {
+    await use(async (budget) => {
+      const metrics = await page.evaluate(() => {
+        const nav = performance.getEntriesByType(
+          "navigation",
+        )[0] as PerformanceNavigationTiming;
+        const resources = performance.getEntriesByType(
+          "resource",
+        ) as PerformanceResourceTiming[];
+
+        return {
+          ttfb: nav.responseStart - nav.requestStart,
+          totalSize: resources.reduce(
+            (sum, r) => sum + (r.transferSize || 0),
+            0,
+          ),
+        };
+      });
+
+      if (budget.ttfb) {
+        expect(
+          metrics.ttfb,
+          `TTFB ${metrics.ttfb}ms exceeds budget ${budget.ttfb}ms`,
+        ).toBeLessThan(budget.ttfb);
+      }
+
+      if (budget.totalSize) {
+        expect(metrics.totalSize, `Total size exceeds budget`).toBeLessThan(
+          budget.totalSize,
+        );
+      }
+    });
+  },
+});
+```
+
+## Lighthouse Integration
+
+### Using playwright-lighthouse
+
+```bash
+npm install -D playwright-lighthouse lighthouse
+```
+
+```typescript
+import { playAudit } from "playwright-lighthouse";
+
+test("lighthouse audit", async ({ page }) => {
+  await page.goto("/");
+
+  // Run Lighthouse
+  const audit = await playAudit({
+    page,
+    port: 9222, // Chrome debugging port
+    thresholds: {
+      performance: 80,
+      accessibility: 90,
+      "best-practices": 80,
+      seo: 80,
+    },
+  });
+
+  // Assertions
+  expect(audit.lhr.categories.performance.score * 100).toBeGreaterThanOrEqual(
+    80,
+  );
+  expect(audit.lhr.categories.accessibility.score * 100).toBeGreaterThanOrEqual(
+    90,
+  );
+});
+```
+
+### Lighthouse with Config
+
+```typescript
+test("lighthouse with custom config", async ({ page }, testInfo) => {
+  await page.goto("/");
+
+  const audit = await playAudit({
+    page,
+    port: 9222,
+    thresholds: {
+      performance: 70,
+    },
+    config: {
+      extends: "lighthouse:default",
+      settings: {
+        onlyCategories: ["performance"],
+        throttling: {
+          rttMs: 40,
+          throughputKbps: 10240,
+          cpuSlowdownMultiplier: 1,
+        },
+      },
+    },
+  });
+
+  // Save report
+  const reportPath = testInfo.outputPath("lighthouse-report.html");
+  // Save audit.report to file
+
+  // Attach to test report
+  await testInfo.attach("lighthouse", {
+    body: JSON.stringify(audit.lhr),
+    contentType: "application/json",
+  });
+});
+```
+
+## CI Performance Monitoring
+
+### Track Performance Over Time
+
+```typescript
+// reporters/perf-reporter.ts
+import { Reporter, TestResult } from "@playwright/test/reporter";
+
+class PerfReporter implements Reporter {
+  private metrics: any[] = [];
+
+  onTestEnd(test: any, result: TestResult) {
+    const perfAnnotation = test.annotations.find(
+      (a: any) => a.type === "performance",
+    );
+
+    if (perfAnnotation) {
+      this.metrics.push({
+        test: test.title,
+        ...JSON.parse(perfAnnotation.description),
+        timestamp: new Date().toISOString(),
+      });
+    }
+  }
+
+  async onEnd() {
+    // Send to metrics service
+    if (process.env.METRICS_ENDPOINT) {
+      await fetch(process.env.METRICS_ENDPOINT, {
+        method: "POST",
+        body: JSON.stringify({
+          commit: process.env.GITHUB_SHA,
+          branch: process.env.GITHUB_REF,
+          metrics: this.metrics,
+        }),
+      });
+    }
+  }
+}
+
+export default PerfReporter;
+```
+
+### Performance Regression Detection
+
+```typescript
+test("no performance regression", async ({ page }) => {
+  await page.goto("/");
+
+  const metrics = await page.evaluate(() => {
+    const nav = performance.getEntriesByType(
+      "navigation",
+    )[0] as PerformanceNavigationTiming;
+    return {
+      loadTime: nav.loadEventEnd - nav.startTime,
+    };
+  });
+
+  // Compare against baseline (could be from file or API)
+  const baseline = 2000; // ms
+  const threshold = 1.1; // 10% regression allowed
+
+  expect(
+    metrics.loadTime,
+    `Load time ${metrics.loadTime}ms is ${((metrics.loadTime / baseline - 1) * 100).toFixed(1)}% slower than baseline`,
+  ).toBeLessThan(baseline * threshold);
+});
+```
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern                | Problem                   | Solution                         |
+| --------------------------- | ------------------------- | -------------------------------- |
+| Testing only once           | Results vary              | Run multiple times, use averages |
+| Ignoring network conditions | Unrealistic results       | Test with throttling             |
+| No baseline comparison      | Can't detect regressions  | Track metrics over time          |
+| Testing in dev mode         | Slow, not production-like | Test production builds           |
+
+## Related References
+
+- **Performance Optimization**: See [performance.md](../infrastructure-ci-cd/performance.md) for test execution performance
+- **CI/CD**: See [ci-cd.md](../infrastructure-ci-cd/ci-cd.md) for CI integration
diff --git a/.cursor/skills/playwright-testing/testing-patterns/security-testing.md b/.cursor/skills/playwright-testing/testing-patterns/security-testing.md
new file mode 100644
index 0000000000..f1f622a0f1
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/security-testing.md
@@ -0,0 +1,430 @@
+# Security Testing Basics
+
+## Table of Contents
+
+1. [XSS Prevention](#xss-prevention)
+2. [CSRF Protection](#csrf-protection)
+3. [Authentication Security](#authentication-security)
+4. [Authorization Testing](#authorization-testing)
+5. [Input Validation](#input-validation)
+6. [Security Headers](#security-headers)
+
+## XSS Prevention
+
+### Test Reflected XSS
+
+```typescript
+test("input is properly escaped", async ({ page }) => {
+  const xssPayloads = [
+    '<script>alert("xss")</script>',
+    '<img src="x" onerror="alert(1)">',
+    '"><script>alert(1)</script>',
+    "javascript:alert(1)",
+    '<svg onload="alert(1)">',
+  ];
+
+  for (const payload of xssPayloads) {
+    await page.goto(`/search?q=${encodeURIComponent(payload)}`);
+
+    // Verify script didn't execute
+    const alertTriggered = await page.evaluate(() => {
+      return (window as any).__xssTriggered === true;
+    });
+    expect(alertTriggered).toBe(false);
+
+    // Verify payload is escaped in HTML
+    const content = await page.content();
+    expect(content).not.toContain("<script>alert");
+    expect(content).not.toContain("onerror=");
+  }
+});
+```
+
+### Test Stored XSS
+
+```typescript
+test("user content is sanitized", async ({ page }) => {
+  await page.goto("/create-post");
+
+  // Try to inject script via form
+  await page.getByLabel("Content").fill('<script>alert("xss")</script>Hello');
+  await page.getByRole("button", { name: "Submit" }).click();
+
+  // View the post
+  await page.goto("/posts/latest");
+
+  // Script should not be in page
+  const scripts = await page.locator("script").count();
+  const pageContent = await page.content();
+
+  // The script tag should be escaped or removed
+  expect(pageContent).not.toContain("<script>alert");
+
+  // Text should still be visible (just sanitized)
+  await expect(page.getByText("Hello")).toBeVisible();
+});
+```
+
+### Monitor for XSS Execution
+
+```typescript
+test("no XSS execution", async ({ page }) => {
+  // Set up XSS detection
+  await page.addInitScript(() => {
+    (window as any).__xssDetected = false;
+
+    // Override alert/confirm/prompt
+    window.alert = () => {
+      (window as any).__xssDetected = true;
+    };
+    window.confirm = () => {
+      (window as any).__xssDetected = true;
+      return false;
+    };
+    window.prompt = () => {
+      (window as any).__xssDetected = true;
+      return null;
+    };
+  });
+
+  // Perform test actions
+  await page.goto("/vulnerable-page");
+  await page.getByLabel("Search").fill('"><img src=x onerror=alert(1)>');
+  await page.getByLabel("Search").press("Enter");
+
+  // Check if XSS triggered
+  const xssDetected = await page.evaluate(() => (window as any).__xssDetected);
+  expect(xssDetected).toBe(false);
+});
+```
+
+## CSRF Protection
+
+### Verify CSRF Token Present
+
+```typescript
+test("forms include CSRF token", async ({ page }) => {
+  await page.goto("/settings");
+
+  // Check form has CSRF token
+  const csrfInput = page.locator(
+    'input[name="_csrf"], input[name="csrf_token"]',
+  );
+  await expect(csrfInput).toBeAttached();
+
+  const csrfValue = await csrfInput.getAttribute("value");
+  expect(csrfValue).toBeTruthy();
+  expect(csrfValue!.length).toBeGreaterThan(20);
+});
+```
+
+### Test CSRF Token Validation
+
+```typescript
+test("rejects requests without CSRF token", async ({ page, request }) => {
+  await page.goto("/settings");
+
+  // Try to submit without CSRF token
+  const response = await request.post("/api/settings", {
+    data: { theme: "dark" },
+    headers: {
+      "Content-Type": "application/json",
+    },
+  });
+
+  // Should be rejected
+  expect(response.status()).toBe(403);
+});
+
+test("rejects requests with invalid CSRF token", async ({ page, request }) => {
+  await page.goto("/settings");
+
+  const response = await request.post("/api/settings", {
+    data: { theme: "dark" },
+    headers: {
+      "X-CSRF-Token": "invalid-token",
+    },
+  });
+
+  expect(response.status()).toBe(403);
+});
+```
+
+### Test CSRF with Valid Token
+
+```typescript
+test("accepts requests with valid CSRF token", async ({ page }) => {
+  await page.goto("/settings");
+
+  // Get CSRF token from page
+  const csrfToken = await page
+    .locator('meta[name="csrf-token"]')
+    .getAttribute("content");
+
+  // Submit form normally
+  await page.getByLabel("Theme").selectOption("dark");
+  await page.getByRole("button", { name: "Save" }).click();
+
+  // Should succeed
+  await expect(page.getByText("Settings saved")).toBeVisible();
+});
+```
+
+## Authentication Security
+
+### Test Session Expiry
+
+```typescript
+test("session expires after timeout", async ({ page, context }) => {
+  await page.goto("/login");
+  await page.getByLabel("Email").fill("user@example.com");
+  await page.getByLabel("Password").fill("password");
+  await page.getByRole("button", { name: "Sign in" }).click();
+
+  await expect(page).toHaveURL("/dashboard");
+
+  // Simulate time passing (if using clock mocking)
+  await page.clock.fastForward("02:00:00"); // 2 hours
+
+  // Try to access protected page
+  await page.goto("/profile");
+
+  // Should redirect to login
+  await expect(page).toHaveURL(/\/login/);
+  await expect(page.getByText("Session expired")).toBeVisible();
+});
+```
+
+### Test Concurrent Sessions
+
+```typescript
+test("handles concurrent session limit", async ({ browser }) => {
+  // Login from first browser
+  const context1 = await browser.newContext();
+  const page1 = await context1.newPage();
+
+  await page1.goto("/login");
+  await page1.getByLabel("Email").fill("user@example.com");
+  await page1.getByLabel("Password").fill("password");
+  await page1.getByRole("button", { name: "Sign in" }).click();
+  await expect(page1).toHaveURL("/dashboard");
+
+  // Login from second browser (same user)
+  const context2 = await browser.newContext();
+  const page2 = await context2.newPage();
+
+  await page2.goto("/login");
+  await page2.getByLabel("Email").fill("user@example.com");
+  await page2.getByLabel("Password").fill("password");
+  await page2.getByRole("button", { name: "Sign in" }).click();
+
+  // First session should be invalidated (or warning shown)
+  await page1.reload();
+  await expect(
+    page1.getByText(/session.*another device|logged out/i),
+  ).toBeVisible();
+
+  await context1.close();
+  await context2.close();
+});
+```
+
+### Test Password Reset Security
+
+```typescript
+test("password reset token is single-use", async ({ page, request }) => {
+  // Request password reset
+  await page.goto("/forgot-password");
+  await page.getByLabel("Email").fill("user@example.com");
+  await page.getByRole("button", { name: "Reset" }).click();
+
+  // Get token (in test env, might be exposed or use email mock)
+  const resetToken = "mock-reset-token";
+
+  // Use token first time
+  await page.goto(`/reset-password?token=${resetToken}`);
+  await page.getByLabel("New Password").fill("NewPassword123");
+  await page.getByRole("button", { name: "Reset" }).click();
+
+  await expect(page.getByText("Password updated")).toBeVisible();
+
+  // Try to use same token again
+  await page.goto(`/reset-password?token=${resetToken}`);
+
+  await expect(page.getByText("Invalid or expired token")).toBeVisible();
+});
+```
+
+## Authorization Testing
+
+### Test Unauthorized Access
+
+```typescript
+test.describe("authorization", () => {
+  test("cannot access admin routes as user", async ({ browser }) => {
+    const context = await browser.newContext({
+      storageState: ".auth/user.json", // Regular user
+    });
+    const page = await context.newPage();
+
+    // Try to access admin page
+    await page.goto("/admin/users");
+
+    // Should be denied
+    await expect(page).not.toHaveURL("/admin/users");
+    expect(
+      (await page.getByText("Access denied").isVisible()) ||
+        (await page.url()).includes("/login") ||
+        (await page.url()).includes("/403"),
+    ).toBe(true);
+
+    await context.close();
+  });
+
+  test("cannot access other user's data", async ({ page }) => {
+    // Logged in as user 1, try to access user 2's profile
+    await page.goto("/users/other-user-id/settings");
+
+    await expect(page.getByText("Access denied")).toBeVisible();
+  });
+});
+```
+
+### Test IDOR (Insecure Direct Object Reference)
+
+```typescript
+test("cannot access other user resources by changing ID", async ({
+  page,
+  request,
+}) => {
+  // Get current user's order
+  await page.goto("/orders/my-order-123");
+  await expect(page.getByText("Order #my-order-123")).toBeVisible();
+
+  // Try to access another user's order
+  const response = await request.get("/api/orders/other-user-order-456");
+
+  // Should be forbidden
+  expect(response.status()).toBe(403);
+});
+```
+
+## Input Validation
+
+### Test SQL Injection Prevention
+
+```typescript
+test("SQL injection is prevented", async ({ page }) => {
+  const sqlPayloads = [
+    "'; DROP TABLE users; --",
+    "1' OR '1'='1",
+    "1; DELETE FROM orders",
+    "' UNION SELECT * FROM users --",
+  ];
+
+  for (const payload of sqlPayloads) {
+    await page.goto("/search");
+    await page.getByLabel("Search").fill(payload);
+    await page.getByRole("button", { name: "Search" }).click();
+
+    // Should not error (injection blocked/escaped)
+    await expect(page.getByText("Error")).not.toBeVisible();
+
+    // Should show no results or escaped text
+    const hasError = await page
+      .getByText(/database error|sql|syntax/i)
+      .isVisible();
+    expect(hasError).toBe(false);
+  }
+});
+```
+
+### Test Input Length Limits
+
+```typescript
+test("enforces input length limits", async ({ page }) => {
+  await page.goto("/profile");
+
+  // Try to submit very long input
+  const longString = "a".repeat(10000);
+
+  await page.getByLabel("Bio").fill(longString);
+  await page.getByRole("button", { name: "Save" }).click();
+
+  // Should show validation error or truncate
+  const bioValue = await page.getByLabel("Bio").inputValue();
+  expect(bioValue.length).toBeLessThanOrEqual(500); // Expected max
+});
+```
+
+## Security Headers
+
+### Verify Security Headers
+
+```typescript
+test("response includes security headers", async ({ page }) => {
+  const response = await page.goto("/");
+
+  const headers = response!.headers();
+
+  // Content Security Policy
+  expect(headers["content-security-policy"]).toBeTruthy();
+
+  // Prevent clickjacking
+  expect(headers["x-frame-options"]).toMatch(/DENY|SAMEORIGIN/);
+
+  // Prevent MIME type sniffing
+  expect(headers["x-content-type-options"]).toBe("nosniff");
+
+  // XSS Protection (legacy but good to have)
+  expect(headers["x-xss-protection"]).toBeTruthy();
+
+  // HTTPS enforcement
+  if (!page.url().includes("localhost")) {
+    expect(headers["strict-transport-security"]).toBeTruthy();
+  }
+});
+```
+
+### Test CSP Violations
+
+```typescript
+test("CSP blocks inline scripts", async ({ page }) => {
+  const cspViolations: string[] = [];
+
+  // Listen for CSP violations via console
+  page.on("console", (msg) => {
+    if (msg.text().includes("Content Security Policy")) {
+      cspViolations.push(msg.text());
+    }
+  });
+
+  await page.goto("/");
+
+  // Try to inject inline script - CSP should block it
+  await page.evaluate(() => {
+    const script = document.createElement("script");
+    script.textContent = 'console.log("injected")';
+    document.body.appendChild(script);
+  });
+
+  expect(cspViolations.length).toBeGreaterThan(0);
+});
+```
+
+> **For comprehensive console monitoring** (fixtures, allowed patterns, fail on errors), see [console-errors.md](../debugging/console-errors.md).
+
+## Anti-Patterns to Avoid
+
+| Anti-Pattern               | Problem               | Solution                      |
+| -------------------------- | --------------------- | ----------------------------- |
+| Testing only happy path    | Misses security holes | Test malicious inputs         |
+| Hardcoded test credentials | Security risk         | Use environment variables     |
+| Skipping auth tests in dev | Bugs reach production | Test auth in all environments |
+| Not testing authorization  | Access control bugs   | Test all role combinations    |
+
+## Related References
+
+- **Authentication**: See [fixtures-hooks.md](../core/fixtures-hooks.md) for auth fixtures
+- **Multi-User**: See [multi-user.md](../advanced/multi-user.md) for role-based testing
+- **Error Testing**: See [error-testing.md](../debugging/error-testing.md) for validation testing
diff --git a/.cursor/skills/playwright-testing/testing-patterns/visual-regression.md b/.cursor/skills/playwright-testing/testing-patterns/visual-regression.md
new file mode 100644
index 0000000000..14280f6c53
--- /dev/null
+++ b/.cursor/skills/playwright-testing/testing-patterns/visual-regression.md
@@ -0,0 +1,634 @@
+# Visual Regression Testing
+
+## Table of Contents
+
+1. [Quick Reference](#quick-reference)
+2. [Patterns](#patterns)
+3. [Decision Guide](#decision-guide)
+4. [Anti-Patterns](#anti-patterns)
+5. [Troubleshooting](#troubleshooting)
+
+> **When to use**: Detecting unintended visual changes—layout shifts, style regressions, broken responsive designs—that functional assertions miss.
+
+## Quick Reference
+
+```typescript
+// Element screenshot
+await expect(page.getByTestId('product-card')).toHaveScreenshot();
+
+// Full page screenshot
+await expect(page).toHaveScreenshot('landing-hero.png');
+
+// Threshold for minor pixel variance
+await expect(page).toHaveScreenshot({ maxDiffPixelRatio: 0.01 });
+
+// Mask volatile content
+await expect(page).toHaveScreenshot({
+  mask: [page.getByTestId('clock'), page.getByRole('img', { name: 'User photo' })],
+});
+
+// Disable CSS animations
+await expect(page).toHaveScreenshot({ animations: 'disabled' });
+
+// Update baselines
+npx playwright test --update-snapshots
+```
+
+## Patterns
+
+### Masking Volatile Content
+
+**Use when**: Page contains timestamps, avatars, ad slots, relative dates, random images, or A/B variants.
+
+The `mask` option overlays a solid box over specified locators before capturing.
+
+```typescript
+test('analytics panel with masked dynamic elements', async ({ page }) => {
+  await page.goto('/analytics');
+
+  await expect(page).toHaveScreenshot('analytics.png', {
+    mask: [
+      page.getByTestId('last-updated'),
+      page.getByTestId('profile-avatar'),
+      page.getByTestId('active-users'),
+      page.locator('.promo-banner'),
+    ],
+    maskColor: '#FF00FF',
+  });
+});
+
+test('activity stream with relative times', async ({ page }) => {
+  await page.goto('/activity');
+
+  await expect(page).toHaveScreenshot('activity.png', {
+    mask: [page.locator('time[datetime]')],
+  });
+});
+```
+
+**Alternative: freeze content with JavaScript** when masking affects layout:
+
+```typescript
+test('freeze timestamps before capture', async ({ page }) => {
+  await page.goto('/analytics');
+
+  await page.evaluate(() => {
+    document.querySelectorAll('[data-testid="time-display"]').forEach((el) => {
+      el.textContent = 'Jan 1, 2025 12:00 PM';
+    });
+  });
+
+  await expect(page).toHaveScreenshot('analytics-frozen.png');
+});
+```
+
+### Disabling Animations
+
+**Use when**: Always. CSS animations and transitions are the primary cause of flaky visual diffs.
+
+```typescript
+test('renders without animation interference', async ({ page }) => {
+  await page.goto('/');
+
+  await expect(page).toHaveScreenshot('home.png', {
+    animations: 'disabled',
+  });
+});
+```
+
+**Set globally** in config:
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  expect: {
+    toHaveScreenshot: {
+      animations: 'disabled',
+    },
+  },
+});
+```
+
+When `animations: 'disabled'` is set, Playwright injects CSS forcing animation/transition duration to 0s, waits for running animations to finish, then captures.
+
+For JavaScript-driven animations (GSAP, Framer Motion), wait for stability:
+
+```typescript
+test('page with JS animations', async ({ page }) => {
+  await page.goto('/animated-hero');
+
+  const heroBanner = page.getByTestId('hero-banner');
+  await heroBanner.waitFor({ state: 'visible' });
+  
+  // Wait for animation to complete by checking for stable state
+  await expect(heroBanner).not.toHaveClass(/animating/);
+
+  await expect(page).toHaveScreenshot('hero.png', {
+    animations: 'disabled',
+  });
+});
+```
+
+### Configuring Thresholds
+
+**Use when**: Minor rendering differences from anti-aliasing, font hinting, or sub-pixel rendering cause false failures.
+
+| Option | Controls | Typical Value |
+|---|---|---|
+| `maxDiffPixels` | Absolute pixel count that can differ | `100` for pages, `10` for components |
+| `maxDiffPixelRatio` | Fraction of total pixels (0-1) | `0.01` (1%) for pages |
+| `threshold` | Per-pixel color tolerance (0-1) | `0.2` for most UIs, `0.1` for design systems |
+
+```typescript
+test('control panel allows minor variance', async ({ page }) => {
+  await page.goto('/control-panel');
+
+  await expect(page).toHaveScreenshot('control-panel.png', {
+    maxDiffPixelRatio: 0.01,
+  });
+});
+
+test('brand logo renders pixel-perfect', async ({ page }) => {
+  await page.goto('/brand');
+
+  await expect(page.getByTestId('brand-logo')).toHaveScreenshot('brand-logo.png', {
+    maxDiffPixels: 0,
+    threshold: 0,
+  });
+});
+
+test('graph allows anti-aliasing differences', async ({ page }) => {
+  await page.goto('/reports');
+
+  await expect(page.getByTestId('sales-graph')).toHaveScreenshot('sales-graph.png', {
+    threshold: 0.3,
+    maxDiffPixels: 200,
+  });
+});
+```
+
+**Global thresholds** in config:
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  expect: {
+    toHaveScreenshot: {
+      maxDiffPixelRatio: 0.01,
+      threshold: 0.2,
+      animations: 'disabled',
+    },
+  },
+});
+```
+
+### CI Configuration
+
+**Use when**: Running visual tests in CI. Consistent rendering is critical—the same test must produce identical screenshots every time.
+
+**The problem**: Font rendering and anti-aliasing differ across operating systems. macOS snapshots won't match Linux.
+
+**The solution**: Run visual tests in Docker using the official Playwright container. Generate and update snapshots from the same container.
+
+**GitHub Actions with Docker**
+
+```yaml
+# .github/workflows/visual-tests.yml
+name: Visual Regression Tests
+on: [push, pull_request]
+
+jobs:
+  visual-tests:
+    runs-on: ubuntu-latest
+    container:
+      image: mcr.microsoft.com/playwright:v1.48.0-noble
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: lts/*
+          cache: npm
+
+      - run: npm ci
+
+      - name: Run visual tests
+        run: npx playwright test --project=visual
+        env:
+          HOME: /root
+
+      - uses: actions/upload-artifact@v4
+        if: failure()
+        with:
+          name: visual-test-report
+          path: playwright-report/
+          retention-days: 14
+```
+
+**Updating snapshots locally using Docker**:
+
+```bash
+docker run --rm -v $(pwd):/work -w /work \
+  mcr.microsoft.com/playwright:v1.48.0-noble \
+  npx playwright test --update-snapshots --project=visual
+```
+
+**Add script to `package.json`**:
+
+```json
+{
+  "scripts": {
+    "test:visual": "npx playwright test --project=visual",
+    "test:visual:update": "docker run --rm -v $(pwd):/work -w /work mcr.microsoft.com/playwright:v1.48.0-noble npx playwright test --update-snapshots --project=visual"
+  }
+}
+```
+
+**Platform-agnostic snapshots** (requires Docker for generation):
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  snapshotPathTemplate: '{testDir}/{testFileDir}/{testFileName}-snapshots/{arg}{-projectName}{ext}',
+  projects: [
+    {
+      name: 'visual',
+      testMatch: '**/*.visual.spec.ts',
+      use: { ...devices['Desktop Chrome'] },
+    },
+  ],
+});
+```
+
+### Full Page vs Element Screenshots
+
+**Use when**: Deciding scope. Full page catches layout shifts. Element screenshots isolate components and are more stable.
+
+```typescript
+test('full page captures layout shifts', async ({ page }) => {
+  await page.goto('/');
+
+  // Visible viewport
+  await expect(page).toHaveScreenshot('home-viewport.png');
+
+  // Entire scrollable page
+  await expect(page).toHaveScreenshot('home-full.png', {
+    fullPage: true,
+  });
+});
+
+test('element screenshot isolates component', async ({ page }) => {
+  await page.goto('/catalog');
+
+  await expect(page.getByRole('table')).toHaveScreenshot('catalog-table.png');
+  await expect(page.getByTestId('featured-item')).toHaveScreenshot('featured-item.png');
+});
+```
+
+**Rule of thumb**: Element screenshots for independently changing components. Full page screenshots for key layouts where spacing matters.
+
+### Responsive Visual Testing
+
+**Use when**: Application has responsive breakpoints requiring verification at different viewport sizes.
+
+```typescript
+const breakpoints = [
+  { name: 'phone', width: 375, height: 812 },
+  { name: 'tablet', width: 768, height: 1024 },
+  { name: 'desktop', width: 1440, height: 900 },
+];
+
+for (const bp of breakpoints) {
+  test(`landing at ${bp.name} (${bp.width}x${bp.height})`, async ({ page }) => {
+    await page.setViewportSize({ width: bp.width, height: bp.height });
+    await page.goto('/');
+
+    await expect(page).toHaveScreenshot(`landing-${bp.name}.png`, {
+      animations: 'disabled',
+      fullPage: true,
+    });
+  });
+}
+```
+
+**Alternative: use projects for responsive testing**:
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    {
+      name: 'desktop',
+      testMatch: '**/*.visual.spec.ts',
+      use: {
+        ...devices['Desktop Chrome'],
+        viewport: { width: 1440, height: 900 },
+      },
+    },
+    {
+      name: 'tablet',
+      testMatch: '**/*.visual.spec.ts',
+      use: { ...devices['iPad (gen 7)'] },
+    },
+    {
+      name: 'mobile',
+      testMatch: '**/*.visual.spec.ts',
+      use: { ...devices['iPhone 14'] },
+    },
+  ],
+});
+```
+
+### Component Visual Testing
+
+**Use when**: Testing individual UI components in isolation—buttons, cards, forms, modals. Faster and more stable than full-page screenshots.
+
+```typescript
+test.describe('Button visual states', () => {
+  test('primary button', async ({ page }) => {
+    await page.goto('/storybook/iframe.html?id=button--primary');
+    const btn = page.getByRole('button');
+    await expect(btn).toHaveScreenshot('btn-primary.png', {
+      animations: 'disabled',
+    });
+  });
+
+  test('primary button hover', async ({ page }) => {
+    await page.goto('/storybook/iframe.html?id=button--primary');
+    const btn = page.getByRole('button');
+    await btn.hover();
+    await expect(btn).toHaveScreenshot('btn-primary-hover.png', {
+      animations: 'disabled',
+    });
+  });
+
+  test('button sizes', async ({ page }) => {
+    for (const size of ['small', 'medium', 'large']) {
+      await page.goto(`/storybook/iframe.html?id=button--${size}`);
+      const btn = page.getByRole('button');
+      await expect(btn).toHaveScreenshot(`btn-${size}.png`, {
+        animations: 'disabled',
+      });
+    }
+  });
+});
+```
+
+**Using a dedicated test harness** instead of Storybook:
+
+```typescript
+test.describe('Card component', () => {
+  test.beforeEach(async ({ page }) => {
+    await page.goto('/test-harness/card');
+  });
+
+  test('default state', async ({ page }) => {
+    await expect(page.getByTestId('card')).toHaveScreenshot('card-default.png', {
+      animations: 'disabled',
+    });
+  });
+
+  test('truncates long content', async ({ page }) => {
+    await page.goto('/test-harness/card?content=long');
+    await expect(page.getByTestId('card')).toHaveScreenshot('card-long.png', {
+      animations: 'disabled',
+    });
+  });
+});
+```
+
+### Updating Snapshots
+
+**Use when**: Intentionally changed UI—design refresh, rebrand, new feature. Never update when diff is unexpected.
+
+```bash
+# Update all snapshots
+npx playwright test --update-snapshots
+
+# Update for specific file
+npx playwright test tests/landing.spec.ts --update-snapshots
+
+# Update for specific project
+npx playwright test --project=chromium --update-snapshots
+```
+
+**Workflow for reviewing changes:**
+
+1. Run tests and view failures in HTML report:
+   ```bash
+   npx playwright test
+   npx playwright show-report
+   ```
+   The report shows expected, actual, and diff images side-by-side.
+
+2. If changes are intentional, update:
+   ```bash
+   npx playwright test --update-snapshots
+   ```
+
+3. Review updated snapshots before committing:
+   ```bash
+   git diff --name-only
+   ```
+
+**Tag visual tests for selective updates:**
+
+```typescript
+test('landing visual @visual', async ({ page }) => {
+  await page.goto('/');
+  await expect(page).toHaveScreenshot('landing.png', {
+    animations: 'disabled',
+  });
+});
+```
+
+```bash
+npx playwright test --grep @visual --update-snapshots
+```
+
+### Cross-Browser Visual Testing
+
+**Use when**: Users span Chrome, Firefox, Safari and you need per-browser rendering verification.
+
+Playwright separates snapshots by project name automatically. Each browser gets its own baseline—browsers render fonts and shadows differently.
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  expect: {
+    toHaveScreenshot: {
+      animations: 'disabled',
+      maxDiffPixelRatio: 0.01,
+    },
+  },
+  projects: [
+    {
+      name: 'chromium',
+      use: { ...devices['Desktop Chrome'] },
+    },
+    {
+      name: 'firefox',
+      use: { ...devices['Desktop Firefox'] },
+    },
+    {
+      name: 'webkit',
+      use: { ...devices['Desktop Safari'] },
+    },
+  ],
+});
+```
+
+**Strategy**: Run visual tests in a single browser (Chromium on Linux in CI) to minimize snapshot count. Add other browsers only when you have actual cross-browser rendering bugs:
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  projects: [
+    {
+      name: 'visual',
+      testMatch: '**/*.visual.spec.ts',
+      use: { ...devices['Desktop Chrome'] },
+    },
+    {
+      name: 'chromium',
+      testIgnore: '**/*.visual.spec.ts',
+      use: { ...devices['Desktop Chrome'] },
+    },
+    {
+      name: 'firefox',
+      testIgnore: '**/*.visual.spec.ts',
+      use: { ...devices['Desktop Firefox'] },
+    },
+  ],
+});
+```
+
+## Decision Guide
+
+| Scenario | Approach | Rationale |
+|---|---|---|
+| Key landing/marketing pages | Full page, `fullPage: true` | Catches layout shifts, spacing, overall harmony |
+| Individual components | Element screenshot | Isolated, fast, immune to unrelated changes |
+| Page with dynamic content | Full page + `mask` | Covers layout while ignoring volatile content |
+| Design system library | Element per variant, zero threshold | Pixel-perfect enforcement |
+| Responsive verification | Screenshot per viewport | Catches breakpoint bugs |
+| Cross-browser consistency | Separate snapshots per browser | Browsers render differently |
+| CI pipeline | Docker container, Linux-only snapshots | Consistent rendering |
+| Threshold: design system | `threshold: 0`, `maxDiffPixels: 0` | Zero tolerance |
+| Threshold: content pages | `maxDiffPixelRatio: 0.01`, `threshold: 0.2` | Minor anti-aliasing variance |
+| Threshold: charts/graphs | `maxDiffPixels: 200`, `threshold: 0.3` | Anti-aliasing on curves varies |
+
+## Anti-Patterns
+
+| Don't | Problem | Do Instead |
+|---|---|---|
+| Visual test every page | Massive maintenance, constant false failures | Pick 5-10 key pages and critical components |
+| Skip masking dynamic content | Screenshots differ every run, permanently flaky | Use `mask` for all volatile elements |
+| Run across macOS, Linux, Windows | Font rendering differs, snapshots never match | Standardize on Linux via Docker |
+| Skip Docker in CI | OS updates shift rendering silently | Pin specific Playwright Docker image |
+| Blindly run `--update-snapshots` | Accepts unintentional regressions | Always review diff in HTML report first |
+| Skip `animations: 'disabled'` | CSS transitions create random diffs | Set globally in config |
+| Replace functional assertions with visual tests | Diffs don't tell you *what* broke | Visual tests complement, never replace |
+| Commit snapshots from different platforms | Tests fail for everyone | All team members use same Docker container |
+| Set threshold too high (`0.1`) | 10% pixel change passes, defeats purpose | Start with `0.01`, adjust per-test |
+| Full page on infinite scroll pages | Page height nondeterministic | Element screenshots on above-the-fold content |
+
+## Troubleshooting
+
+### "Screenshot comparison failed" on first CI run after local development
+
+**Cause**: Snapshots generated on macOS locally. CI runs on Linux. Font rendering differs.
+
+**Fix**: Generate snapshots using Docker:
+
+```bash
+docker run --rm -v $(pwd):/work -w /work \
+  mcr.microsoft.com/playwright:v1.48.0-noble \
+  npx playwright test --update-snapshots --project=visual
+```
+
+Commit Linux-generated snapshots.
+
+### "Expected screenshot to match but X pixels differ"
+
+**Cause**: Anti-aliasing, font hinting, sub-pixel rendering differences.
+
+**Fix**: Add tolerance:
+
+```typescript
+await expect(page).toHaveScreenshot('page.png', {
+  maxDiffPixelRatio: 0.01,
+  threshold: 0.2,
+});
+```
+
+Check HTML report diff image to determine if it's regression or noise.
+
+### Visual tests pass locally but fail in CI (even with Docker)
+
+**Cause**: Different Playwright versions locally vs CI.
+
+**Fix**: Ensure `package.json` version matches Docker image tag:
+
+```json
+{
+  "devDependencies": {
+    "@playwright/test": "latest"
+  }
+}
+```
+
+```yaml
+container:
+  image: mcr.microsoft.com/playwright:v1.48.0-noble
+```
+
+### Animations cause random diff failures
+
+**Cause**: CSS animations captured mid-frame.
+
+**Fix**: Set `animations: 'disabled'` globally:
+
+```typescript
+// playwright.config.ts
+export default defineConfig({
+  expect: {
+    toHaveScreenshot: {
+      animations: 'disabled',
+    },
+  },
+});
+```
+
+For JS animations, wait for stable state before capture.
+
+### Snapshot file names conflict between tests
+
+**Cause**: Two tests use same screenshot name without unique paths.
+
+**Fix**: Use explicit unique names:
+
+```typescript
+await expect(page).toHaveScreenshot('auth-home.png');
+await expect(page).toHaveScreenshot('public-home.png');
+```
+
+Or customize snapshot path template:
+
+```typescript
+export default defineConfig({
+  snapshotPathTemplate: '{testDir}/{testFileDir}/{testFileName}-snapshots/{arg}{-projectName}{ext}',
+});
+```
+
+### Too many snapshot files to maintain
+
+**Cause**: Visual tests for every page, browser, viewport.
+
+**Fix**: Be selective. Visual test only high-risk pages:
+- Landing and marketing pages
+- Design system components
+- Complex layouts (dashboards, data tables)
+- Pages after major refactor
+
+Skip pages where functional assertions cover key elements.
diff --git a/.cursor/skills/rank-tracker/SKILL.md b/.cursor/skills/rank-tracker/SKILL.md
new file mode 100644
index 0000000000..26020e9e33
--- /dev/null
+++ b/.cursor/skills/rank-tracker/SKILL.md
@@ -0,0 +1,275 @@
+---
+name: rank-tracker
+description: 'Track keyword rankings and SERP feature changes in traditional search and AI responses over time. 排名追踪/SERP监控'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when tracking keyword rankings, monitoring position changes, comparing ranking snapshots, or detecting ranking drops."
+argument-hint: "<domain> [keyword list]"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - geo
+    - rank-tracking
+    - keyword-rankings
+    - serp-positions
+    - ranking-changes
+    - position-tracking
+    - 排名追踪
+    - ランキング追跡
+    - 순위추적
+    - seguimiento-rankings
+  triggers:
+    # EN-formal
+    - "track rankings"
+    - "check keyword positions"
+    - "ranking changes"
+    - "monitor SERP positions"
+    - "keyword tracking"
+    - "position monitoring"
+    # EN-casual
+    - "how am I ranking"
+    - "where do I rank for this keyword"
+    - "did my rankings change"
+    - "where do I rank now"
+    - "check my positions"
+    # EN-question
+    - "what position am I ranking at"
+    - "how are my rankings doing"
+    # ZH-pro
+    - "排名追踪"
+    - "关键词排名"
+    - "SERP位置监控"
+    - "排名变化"
+    # ZH-casual
+    - "查排名"
+    - "排名变了吗"
+    - "我排第几"
+    # JA
+    - "ランキング追跡"
+    - "検索順位チェック"
+    - "順位変動"
+    - "キーワード順位確認"
+    # KO
+    - "순위 추적"
+    - "키워드 순위"
+    - "순위 확인"
+    - "내 순위 어떻게 됐어?"
+    # ES
+    - "seguimiento de rankings"
+    - "posición en buscadores"
+    - "posicionamiento SEO"
+    - "en qué posición estoy"
+    # PT
+    - "rastreamento de rankings"
+    - "monitoramento de posições"
+    - "posição no Google"
+    # Misspellings
+    - "rank trackng"
+---
+
+# Rank Tracker
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This monitoring skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+Tracks, analyzes, and reports on keyword ranking positions over time. Monitors both traditional SERP rankings and AI/GEO visibility to provide comprehensive search performance insights.
+
+**System role**: Monitoring layer skill. It turns performance changes into deltas, alerts, and next actions.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs time-aware change detection, escalation, or stakeholder-ready visibility.
+
+- Setting up ranking tracking for new campaigns
+- Monitoring keyword position changes
+- Analyzing ranking trends over time
+- Comparing rankings against competitors
+- Tracking SERP feature appearances
+- Monitoring AI Overview inclusions
+- Creating ranking reports for stakeholders
+
+## What This Skill Does
+
+1. **Position Tracking**: Records and tracks keyword rankings
+2. **Trend Analysis**: Identifies ranking patterns over time
+3. **Movement Detection**: Flags significant position changes
+4. **Competitor Comparison**: Benchmarks against competitors
+5. **SERP Feature Tracking**: Monitors featured snippets, PAA
+6. **GEO Visibility Tracking**: Tracks AI citation appearances
+7. **Report Generation**: Creates ranking performance reports
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Set Up Tracking
+
+```
+Set up rank tracking for [domain] targeting these keywords: [keyword list]
+```
+
+### Analyze Rankings
+
+```
+Analyze ranking changes for [domain] over the past [time period]
+```
+
+### Compare to Competitors
+
+```
+Compare my rankings to [competitor] for [keywords]
+```
+
+### Generate Reports
+
+```
+Create a ranking report for [domain/campaign]
+```
+
+## Skill Contract
+
+**Expected output**: a delta summary, alert/report output, and a short handoff summary ready for `memory/monitoring/`.
+
+- **Reads**: current metrics, previous baselines, alert thresholds, and reporting context from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing monitoring deliverable plus a reusable summary that can be stored under `memory/monitoring/`.
+- **Promotes**: significant changes, confirmed anomalies, and follow-up actions to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when a change needs action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~search console + ~~analytics + ~~AI monitor connected:**
+Automatically pull ranking positions from ~~SEO tool, search impressions/clicks from ~~search console, traffic data from ~~analytics, and AI Overview citation tracking from ~~AI monitor. Daily automated rank checks with historical trend data.
+
+**With manual data only:**
+Ask the user to provide:
+1. Keyword ranking positions (current and historical if available)
+2. Target keyword list with search volumes
+3. Competitor domains and their ranking positions for key terms
+4. SERP feature status (featured snippets, PAA appearances)
+5. AI Overview citation data (if tracking GEO metrics)
+
+Proceed with the full analysis using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests rank tracking or analysis:
+
+1. **Set Up Keyword Tracking** -- Configure domain, location, device, language, update frequency. Add keywords with volume, current rank, type, and priority. Set up competitor tracking and keyword categories (brand/product/informational/commercial).
+
+2. **Record Current Rankings** -- Ranking overview by position range (#1, #2-3, #4-10, #11-20, etc.), position distribution visualization, detailed rankings with URL, SERP features, and change.
+
+3. **Analyze Ranking Changes** -- Overall movement metrics, biggest improvements and declines with hypothesized causes, recommended recovery actions, stable keywords, new rankings, lost rankings.
+
+4. **Track SERP Features** -- Feature ownership comparison vs competitors (snippets, PAA, image/video pack, local pack), featured snippet status, PAA appearances.
+
+5. **Track GEO/AI Visibility** -- AI Overview presence per keyword, citation rate and position, GEO performance trend over time, improvement opportunities.
+
+6. **Compare Against Competitors** -- Share of voice table, head-to-head comparison per keyword, competitor movement alerts with threat level.
+
+7. **Generate Ranking Report** -- Executive summary with overall trend, position distribution, key highlights (wins/concerns/opportunities), detailed analysis, SERP feature report, GEO visibility, competitive position, recommendations.
+
+   > **Reference**: See [references/ranking-analysis-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/rank-tracker/references/ranking-analysis-templates.md) for complete output templates for all 7 steps.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Keywords list is complete with search volumes
+- [ ] Target domain and tracking location are specified
+- [ ] Competitor domains identified for comparison
+- [ ] Historical baseline data available or initial tracking period set
+
+### Output Validation
+- [ ] Every metric cites its data source and collection date
+- [ ] Ranking changes include context (vs. previous period)
+- [ ] Significant movements have explanations or investigation notes
+- [ ] Source of each data point clearly stated (~~SEO tool data, ~~search console data, user-provided, or estimated)
+
+## Example
+
+**User**: "Analyze my ranking changes for the past month"
+
+**Output**:
+
+```markdown
+# Ranking Analysis: [current month, year]
+
+## Summary
+
+Your average position improved from 15.3 to 12.8 (-2.5 positions = better)
+Keywords in top 10 increased from 12 to 17 (+5)
+
+## Biggest Wins
+
+| Keyword | Old | New | Change | Possible Cause |
+|---------|-----|-----|--------|----------------|
+| email marketing tips | 18 | 5 | +13 | Likely driven by content refresh |
+| best crm software | 24 | 11 | +13 | Correlates with new backlinks acquired |
+| sales automation | 15 | 7 | +8 | Correlates with schema markup addition |
+
+## Needs Attention
+
+| Keyword | Old | New | Change | Action |
+|---------|-----|-----|--------|--------|
+| marketing automation | 4 | 12 | -8 | Likely displaced by new HubSpot guide |
+
+**Recommended**: Update your marketing automation guide with [current year] statistics and examples.
+```
+
+## Tips for Success
+
+1. **Track consistently** - Same time, same device, same location
+2. **Include enough keywords** - 50-200 for meaningful data
+3. **Segment by intent** - Track brand, commercial, informational separately
+4. **Monitor competitors** - Context makes your data meaningful
+5. **Track SERP features** - Position 1 without snippet may lose to position 4 with snippet
+6. **Include GEO metrics** - AI visibility increasingly important
+
+## Rank Change Quick Reference
+
+### Response Protocol
+
+| Change | Timeframe | Action |
+|--------|-----------|--------|
+| Drop 1-3 positions | Wait 1-2 weeks | Monitor -- may be normal fluctuation |
+| Drop 3-5 positions | Investigate within 1 week | Check for technical issues, competitor changes |
+| Drop 5-10 positions | Investigate immediately | Full diagnostic: technical, content, links |
+| Drop off page 1 | Emergency response | Comprehensive audit + recovery plan |
+| Position gained | Document and learn | What worked? Can you replicate? |
+
+> **Reference**: See [references/tracking-setup-guide.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/rank-tracker/references/tracking-setup-guide.md) for root cause taxonomy, CTR benchmarks by position, SERP feature CTR impact, algorithm update assessment, tracking configuration best practices, keyword selection and grouping strategies, and data interpretation guidelines.
+
+
+### Save Results
+
+After delivering monitoring data or reports to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/monitoring/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding or status change
+- Top 3-5 actionable items
+- Open loops or anomalies requiring follow-up
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Tracking Setup Guide](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/rank-tracker/references/tracking-setup-guide.md) — Configuration best practices, device/location settings, and SERP feature tracking setup
+
+## Next Best Skill
+
+- **Primary**: [alert-manager](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/monitor/alert-manager/SKILL.md) — operationalize rank changes into thresholds and follow-ups.
diff --git a/.cursor/skills/rank-tracker/references/ranking-analysis-templates.md b/.cursor/skills/rank-tracker/references/ranking-analysis-templates.md
new file mode 100644
index 0000000000..e25f44e82b
--- /dev/null
+++ b/.cursor/skills/rank-tracker/references/ranking-analysis-templates.md
@@ -0,0 +1,308 @@
+# Ranking Analysis Output Templates
+
+Detailed output templates for each step of the rank tracking workflow. Use these templates when setting up tracking, recording snapshots, and generating reports.
+
+---
+
+## 1. Rank Tracking Setup Template
+
+```markdown
+## Rank Tracking Setup
+
+### Tracking Configuration
+
+**Domain**: [domain]
+**Tracking Location**: [country/city]
+**Device**: [Mobile/Desktop/Both]
+**Language**: [language]
+**Update Frequency**: [Daily/Weekly/Monthly]
+
+### Keywords to Track
+
+| Keyword | Volume | Current Rank | Type | Priority |
+|---------|--------|--------------|------|----------|
+| [keyword 1] | [vol] | [rank] | Primary | High |
+| [keyword 2] | [vol] | [rank] | Primary | High |
+| [keyword 3] | [vol] | [rank] | Secondary | Medium |
+| [keyword 4] | [vol] | [rank] | Long-tail | Medium |
+| [keyword 5] | [vol] | [rank] | Brand | High |
+
+### Competitor Tracking
+
+Track these competitors for benchmark:
+1. [Competitor 1] - [domain]
+2. [Competitor 2] - [domain]
+3. [Competitor 3] - [domain]
+
+### Tracking Categories
+
+| Category | Keywords | Description |
+|----------|----------|-------------|
+| Brand | [X] | Brand name variations |
+| Product | [X] | Product-related terms |
+| Informational | [X] | Educational queries |
+| Commercial | [X] | Buying intent terms |
+```
+
+---
+
+## 2. Current Ranking Snapshot Template
+
+```markdown
+## Current Ranking Snapshot
+
+**Date**: [date]
+**Domain**: [domain]
+
+### Ranking Overview
+
+| Position Range | Keyword Count | % of Total |
+|----------------|---------------|------------|
+| #1 | [X] | [X]% |
+| #2-3 | [X] | [X]% |
+| #4-10 | [X] | [X]% |
+| #11-20 | [X] | [X]% |
+| #21-50 | [X] | [X]% |
+| #51-100 | [X] | [X]% |
+| Not ranking | [X] | [X]% |
+
+### Position Distribution
+
+```
+Position 1:     [X] keywords
+Position 2-3:   [X] keywords
+Position 4-10:  [X] keywords
+Position 11-20: [X] keywords
+Position 21+:   [X] keywords
+```
+
+### Detailed Rankings
+
+| Keyword | Position | URL | SERP Features | Change |
+|---------|----------|-----|---------------|--------|
+| [kw 1] | 3 | [url] | Featured Snippet | +2 |
+| [kw 2] | 7 | [url] | PAA | -1 |
+| [kw 3] | 12 | [url] | None | New |
+| [kw 4] | 1 | [url] | Featured Snippet | Stable |
+```
+
+---
+
+## 3. Ranking Change Analysis Template
+
+```markdown
+## Ranking Change Analysis
+
+**Period**: [start date] to [end date]
+
+### Overall Movement
+
+| Metric | Start | End | Change |
+|--------|-------|-----|--------|
+| Avg Position | [X] | [Y] | [+/-Z] |
+| Keywords in Top 10 | [X] | [Y] | [+/-Z] |
+| Keywords in Top 3 | [X] | [Y] | [+/-Z] |
+| Keywords #1 | [X] | [Y] | [+/-Z] |
+
+### Biggest Improvements
+
+| Keyword | Old Rank | New Rank | Change | Est. Traffic Impact |
+|---------|----------|----------|--------|---------------------|
+| [kw 1] | 15 | 4 | +11 | +[X] visits/mo |
+| [kw 2] | 25 | 9 | +16 | +[X] visits/mo |
+
+**Possible causes**:
+- [kw 1]: [hypothesis - e.g., content refresh may have improved relevance]
+
+### Biggest Declines
+
+| Keyword | Old Rank | New Rank | Change | Est. Traffic Impact |
+|---------|----------|----------|--------|---------------------|
+| [kw 1] | 3 | 12 | -9 | -[X] visits/mo |
+
+**Likely factors**:
+- [kw 1]: [hypothesis - e.g., competitor may have published updated guide]
+
+> These are hypotheses based on available signals, not confirmed causes. Investigate each with the relevant skill to confirm.
+
+**Recommended actions**:
+- [kw 1]: [action to recover]
+
+### Stable Keywords
+
+[X] keywords remained within +/-3 positions (stable)
+
+### New Rankings
+
+| Keyword | Position | URL | Notes |
+|---------|----------|-----|-------|
+| [kw 1] | [pos] | [url] | [notes] |
+
+### Lost Rankings
+
+| Keyword | Last Position | URL | Action |
+|---------|---------------|-----|--------|
+| [kw 1] | [pos] | [url] | [investigate/refresh] |
+```
+
+---
+
+## 4. SERP Feature Tracking Template
+
+```markdown
+## SERP Feature Tracking
+
+### Feature Ownership
+
+| Feature | Your Count | Competitor Avg | Opportunity |
+|---------|------------|----------------|-------------|
+| Featured Snippets | [X] | [Y] | [+/-Z] |
+| People Also Ask | [X] | [Y] | [+/-Z] |
+| Image Pack | [X] | [Y] | [+/-Z] |
+| Video Results | [X] | [Y] | [+/-Z] |
+| Local Pack | [X] | [Y] | [+/-Z] |
+
+### Featured Snippet Status
+
+| Keyword | You Own? | Current Owner | Winnable? |
+|---------|----------|---------------|-----------|
+| [kw 1] | Yes | You | Maintain |
+| [kw 2] | No | [Competitor] | High |
+
+### PAA Appearances
+
+| Question | Your Answer? | Position | Action |
+|----------|--------------|----------|--------|
+| [Question 1] | Yes/No | [pos] | [action] |
+```
+
+---
+
+## 5. GEO/AI Visibility Tracking Template
+
+```markdown
+## AI/GEO Visibility Tracking
+
+### AI Overview Presence
+
+| Keyword | AI Overview | You Cited? | Citation Position |
+|---------|-------------|------------|-------------------|
+| [kw 1] | Yes | Yes | 1st source |
+| [kw 2] | Yes | Yes | 3rd source |
+| [kw 3] | Yes | No | Not cited |
+| [kw 4] | No | N/A | N/A |
+
+### AI Citation Rate
+
+| Metric | Value |
+|--------|-------|
+| Keywords with AI Overview | [X]/[Total] ([Y]%) |
+| Your citations in AI Overview | [X]/[Y] ([Z]%) |
+| Avg citation position | [X] |
+
+### GEO Performance Trend
+
+| Period | AI Overviews Tracked | Your Citations | Rate |
+|--------|---------------------|----------------|------|
+| Last week | [X] | [Y] | [Z]% |
+| 2 weeks ago | [X] | [Y] | [Z]% |
+| Month ago | [X] | [Y] | [Z]% |
+
+### GEO Improvement Opportunities
+
+| Keyword | Has AI Overview | You Cited? | Content Gap |
+|---------|-----------------|------------|-------------|
+| [kw 1] | Yes | No | Need clearer definition |
+| [kw 2] | Yes | No | Missing quotable stats |
+```
+
+---
+
+## 6. Competitor Ranking Comparison Template
+
+```markdown
+## Competitor Ranking Comparison
+
+### Share of Voice
+
+| Domain | Keywords Ranked | Avg Position | Visibility |
+|--------|-----------------|--------------|------------|
+| [Your site] | [X] | [Y] | [Z]% |
+| [Competitor 1] | [X] | [Y] | [Z]% |
+| [Competitor 2] | [X] | [Y] | [Z]% |
+
+### Head-to-Head Comparison
+
+**You vs [Competitor 1]**:
+
+| Keyword | Your Rank | Their Rank | Winner |
+|---------|-----------|------------|--------|
+| [kw 1] | 3 | 7 | You |
+| [kw 2] | 12 | 5 | Them |
+
+**Summary**: You win [X]/[Y] keywords vs [Competitor 1]
+
+### Competitor Movement Alerts
+
+| Competitor | Keyword | Their Change | Threat Level |
+|------------|---------|--------------|--------------|
+| [Comp 1] | [kw] | +15 positions | High |
+| [Comp 2] | [kw] | +8 positions | Medium |
+```
+
+---
+
+## 7. Ranking Performance Report Template
+
+```markdown
+# Ranking Performance Report
+
+**Domain**: [domain]
+**Report Period**: [start] to [end]
+**Generated**: [date]
+
+## Executive Summary
+
+**Overall Trend**: [Improving/Stable/Declining]
+
+| Metric | Value | vs Last Period | Status |
+|--------|-------|----------------|--------|
+| Total keywords tracked | [X] | [+/-Y] | [status] |
+| Keywords in top 10 | [X] | [+/-Y] | [status] |
+| Keywords in top 3 | [X] | [+/-Y] | [status] |
+| Average position | [X] | [+/-Y] | [status] |
+| Estimated traffic | [X] | [+/-Y]% | [status] |
+
+## Key Highlights
+
+### Wins
+- [Achievement 1]
+- [Achievement 2]
+
+### Concerns
+- [Issue 1]
+- [Issue 2]
+
+### Opportunities
+- [Opportunity 1]
+
+## Detailed Analysis
+
+[Include top performing keywords, keywords needing attention, SERP features, GEO visibility, competitive position]
+
+## Recommendations
+
+### Immediate Actions
+1. [Action] for [keyword] - [expected impact]
+
+### This Month
+1. [Action]
+
+### Next Quarter
+1. [Strategic action]
+
+## Next Report
+
+Scheduled: [date]
+Focus areas: [areas to monitor]
+```
diff --git a/.cursor/skills/rank-tracker/references/tracking-setup-guide.md b/.cursor/skills/rank-tracker/references/tracking-setup-guide.md
new file mode 100644
index 0000000000..3ce652ba7b
--- /dev/null
+++ b/.cursor/skills/rank-tracker/references/tracking-setup-guide.md
@@ -0,0 +1,329 @@
+# Rank Tracking Setup Guide
+
+Comprehensive reference for configuring rank tracking systems. Covers tool configuration, keyword selection, grouping strategies, alert thresholds, reporting cadences, and data interpretation.
+
+---
+
+## 1. Tracking Tool Configuration
+
+### Initial Setup Checklist
+
+| Step | Action | Notes |
+|------|--------|-------|
+| 1 | Select tracking tool | ~~SEO tool with rank tracking capability |
+| 2 | Add target domain | Primary domain + any subdomains |
+| 3 | Set tracking location | Country, state/region, or city level |
+| 4 | Configure device settings | Mobile, desktop, or both (recommended: both) |
+| 5 | Set search engine | Google (primary), Bing (optional), local engines |
+| 6 | Set language | Match target audience language |
+| 7 | Configure update frequency | Daily for priority keywords, weekly for long-tail |
+| 8 | Add competitor domains | 3-5 direct competitors |
+| 9 | Import keyword list | From keyword research or existing tracking |
+| 10 | Verify initial data pull | Confirm positions match manual spot checks |
+
+### Location Configuration
+
+Rank tracking results vary dramatically by location. Configure tracking to match your actual target market.
+
+| Scenario | Location Setting | Example |
+|----------|-----------------|---------|
+| National business | Country level | United States |
+| Regional business | State/region level | California, US |
+| Local business | City level | San Francisco, CA, US |
+| Multi-location | Separate project per location | NYC + LA + Chicago |
+| International | Separate project per country | US + UK + Canada |
+
+**Common mistake**: Tracking from a single location when your audience is distributed. If you serve multiple cities, track key terms in each city separately.
+
+### Device Configuration
+
+| Device | When to Track | Why |
+|--------|--------------|-----|
+| Desktop | Always | Baseline reference, still significant traffic |
+| Mobile | Always | 60%+ of searches are mobile; rankings differ from desktop |
+| Both separately | Recommended | Identify device-specific ranking issues |
+| Tablet | Optional | Usually mirrors desktop closely |
+
+### Update Frequency Recommendations
+
+| Keyword Tier | Frequency | Rationale |
+|-------------|-----------|-----------|
+| Top 20 revenue keywords | Daily | Catch drops immediately |
+| Brand keywords | Daily | Protect brand presence |
+| Page 1 keywords (21-50) | 2-3x per week | Monitor competitive positions |
+| Page 2 keywords (51-100) | Weekly | Track progress without excessive API usage |
+| Long-tail / monitoring (100+) | Weekly | Cost-efficient tracking |
+| New/experimental keywords | Daily for first 30 days, then adjust | Establish baseline quickly |
+
+---
+
+## 2. Keyword Selection for Tracking
+
+### How Many Keywords to Track
+
+| Site Size | Recommended Keywords | Breakdown |
+|-----------|---------------------|-----------|
+| Small (< 50 pages) | 50-100 | 10 brand + 20 primary + 20 secondary + rest long-tail |
+| Medium (50-500 pages) | 100-500 | 20 brand + 50 primary + 100 secondary + rest long-tail |
+| Large (500+ pages) | 500-2000+ | Scale proportionally; focus on revenue pages |
+| Enterprise | 2000-10000+ | Comprehensive coverage with automated management |
+
+### Keyword Selection Criteria
+
+Select keywords for tracking based on these factors:
+
+| Factor | Weight | Selection Rule |
+|--------|--------|---------------|
+| Revenue impact | High | Always track keywords that drive conversions |
+| Search volume | Medium | Track keywords with meaningful volume for your niche |
+| Current ranking | Medium | Track keywords where you rank (pages 1-3) plus targets |
+| Competitive value | Medium | Track keywords your competitors target |
+| Strategic importance | High | Track keywords aligned with business goals |
+| Content investment | Medium | Track keywords for pages you invested in creating |
+
+### Keyword Types to Include
+
+| Type | % of Tracked Keywords | Examples |
+|------|----------------------|---------|
+| Brand keywords | 5-10% | "[brand name]", "[brand] + product", "[brand] reviews" |
+| Primary commercial | 15-25% | "[product category]", "best [product]", "buy [product]" |
+| Secondary commercial | 15-25% | "[product] for [use case]", "[product] vs [competitor]" |
+| Informational | 20-30% | "how to [topic]", "what is [concept]", "[topic] guide" |
+| Long-tail | 15-25% | Specific queries with 3+ words, lower volume |
+| Local (if applicable) | 5-10% | "[service] near me", "[service] in [city]" |
+
+### Keywords You Should NOT Track
+
+| Skip These | Why |
+|-----------|-----|
+| Zero-volume keywords (unless strategic) | No measurable impact |
+| Keywords you have no content for | Track only when you create targeting content |
+| Extremely broad single-word terms | Too volatile, hard to rank, misleading data |
+| Misspellings (unless significant volume) | Clutters reporting |
+
+---
+
+## 3. Keyword Grouping Strategies
+
+Effective grouping transforms raw keyword data into strategic insights.
+
+### Grouping Dimensions
+
+| Dimension | Examples | Insight Gained |
+|-----------|---------|----------------|
+| **Topic cluster** | "email marketing", "email automation", "email templates" | Content hub performance |
+| **Search intent** | Informational, commercial, transactional, navigational | Funnel stage performance |
+| **Funnel stage** | Awareness, consideration, decision, retention | Buyer journey alignment |
+| **Product/service** | Product A, Product B, Service C | Product line performance |
+| **Content type** | Blog, landing page, product page, guide | Format effectiveness |
+| **Priority tier** | Tier 1 (revenue), Tier 2 (growth), Tier 3 (monitor) | Resource allocation |
+| **Page** | URL-level grouping | Page-specific performance |
+| **Competitor overlap** | Keywords where you compete with specific rivals | Competitive intelligence |
+
+### Recommended Group Hierarchy
+
+```
+Level 1: Business Unit / Product Line
+  └── Level 2: Topic Cluster / Category
+       └── Level 3: Search Intent
+            └── Level 4: Priority Tier
+```
+
+**Example:**
+
+```
+SaaS Product
+  └── Project Management
+       └── Commercial Intent
+            └── Tier 1: "project management software", "best PM tools"
+            └── Tier 2: "PM software for small teams", "agile PM tools"
+       └── Informational Intent
+            └── Tier 1: "what is project management", "PM methodologies"
+  └── Collaboration
+       └── Commercial Intent
+            └── Tier 1: "team collaboration software", "best collab tools"
+```
+
+### Group Naming Conventions
+
+Use consistent, descriptive names:
+
+| Pattern | Example | Benefit |
+|---------|---------|---------|
+| `[Category] - [Intent]` | "Email Marketing - Commercial" | Clear context |
+| `[Product] / [Feature]` | "CRM / Lead Scoring" | Product-line clarity |
+| `T1: [Topic]` | "T1: Core Product Terms" | Priority at a glance |
+
+---
+
+## 4. Alert Threshold Configuration
+
+### Setting Baseline Thresholds
+
+Before setting alerts, establish a baseline by tracking for 2-4 weeks without alerts to understand normal fluctuation ranges.
+
+| Metric | Baseline Period | Normal Fluctuation | Alert When |
+|--------|----------------|-------------------|------------|
+| Individual keyword position | 2-4 weeks | +/- 2-3 positions | Exceeds normal range |
+| Average position (all keywords) | 4 weeks | +/- 1-2 positions | Exceeds normal range |
+| Keywords in top 10 | 4 weeks | +/- 5% | Drops >10% |
+| Keywords in top 3 | 4 weeks | +/- 3% | Drops >5% |
+
+### Alert Configuration by Keyword Tier
+
+| Keyword Tier | Drop Alert | Gain Alert | Competitor Alert |
+|-------------|-----------|-----------|-----------------|
+| Tier 1 (revenue) | Drop >= 3 positions | Gain >= 3 positions | Competitor enters top 5 |
+| Tier 2 (growth) | Drop >= 5 positions | Enters top 10 | Competitor overtakes you |
+| Tier 3 (monitor) | Drop >= 10 positions | Enters top 20 | None |
+| Brand | Any drop from #1 | N/A | Competitor ranks for your brand |
+
+### Alert Delivery Preferences
+
+| Alert Type | Channel | Frequency |
+|-----------|---------|-----------|
+| Critical drops (Tier 1) | Email + Slack | Immediate |
+| Significant changes | Email | Daily digest |
+| Weekly summary | Email | Every Monday |
+| Monthly report | Email + dashboard | 1st of month |
+
+---
+
+## 5. Reporting Cadences
+
+### Recommended Report Schedule
+
+| Report Type | Audience | Frequency | Key Metrics |
+|------------|----------|-----------|-------------|
+| Quick pulse | SEO team | Daily | Major movements, alerts fired |
+| Weekly summary | Marketing team | Weekly | Position changes, trends, actions taken |
+| Monthly report | Stakeholders | Monthly | Full performance analysis, MoM comparisons |
+| Quarterly review | Leadership | Quarterly | Strategic trends, ROI, competitive position |
+| Annual review | Executive | Annually | YoY growth, strategic recommendations |
+
+### Report Content by Cadence
+
+**Daily Pulse (1-2 minutes to review):**
+- Keywords with biggest position changes
+- Alerts triggered
+- Competitor movements
+
+**Weekly Summary (5-10 minutes to review):**
+- Position distribution changes
+- Top 5 improvements and declines
+- SERP feature wins/losses
+- AI citation changes
+- Week-over-week trend
+
+**Monthly Report (15-30 minutes to review):**
+- Full position distribution analysis
+- Month-over-month trends
+- Competitor share of voice
+- SERP feature ownership
+- GEO visibility trends
+- Content performance by keyword group
+- Recommendations for next month
+
+**Quarterly Review (30-60 minutes to review):**
+- Quarter-over-quarter trends
+- Progress against annual goals
+- Competitive landscape shifts
+- Strategic keyword opportunities
+- Budget and resource recommendations
+
+---
+
+## 6. Data Interpretation Guidelines
+
+### Understanding Rank Fluctuations
+
+| Pattern | Meaning | Action |
+|---------|---------|--------|
+| Daily +/- 1-2 positions | Normal fluctuation | Ignore; track weekly trend |
+| Sudden drop 5+ positions, recovers in 2-3 days | Google testing / data center variation | Monitor, no action needed |
+| Steady decline over 2+ weeks | Real ranking loss | Investigate cause (see Root Cause Taxonomy) |
+| Sudden drop affecting many keywords | Algorithm update or technical issue | Check Search Status Dashboard + technical health |
+| One keyword drops, others stable | Page-specific or competitor-specific issue | Analyze that specific SERP |
+| All keywords for one URL drop | Page-level issue (noindex, 404, slow) | Check page technical health |
+| Gradual improvement over weeks | SEO efforts working | Document what worked, continue strategy |
+
+### Position vs. Traffic Relationship
+
+Position changes do not translate linearly to traffic changes.
+
+| Position Change | Estimated Traffic Impact |
+|----------------|------------------------|
+| #1 to #2 | -50% to -60% click loss |
+| #2 to #3 | -25% to -30% click loss |
+| #3 to #5 | -30% to -40% click loss |
+| #5 to #10 | -50% to -60% click loss |
+| #10 to #11 | -60% to -80% click loss (page 2 cliff) |
+| #11 to #20 | Minimal additional loss (already low) |
+
+**Key insight**: Moving from position #10 to #11 (page 1 to page 2) has a far greater impact than moving from #20 to #30. Prioritize keeping keywords on page 1.
+
+### Interpreting Visibility Score
+
+Visibility score combines position and search volume into a single metric.
+
+| Visibility Change | Interpretation |
+|-------------------|---------------|
+| Increasing visibility, stable positions | Seasonal search volume increase |
+| Decreasing visibility, stable positions | Seasonal search volume decrease |
+| Increasing visibility, improving positions | SEO strategy is working |
+| Decreasing visibility, declining positions | Ranking loss causing traffic decline |
+| Stable visibility, mixed position changes | Gains offsetting losses |
+
+### Comparing Against Competitors
+
+| Competitive Signal | Meaning | Response |
+|-------------------|---------|----------|
+| Competitor visibility rising, yours stable | They are gaining, you may lose ground | Analyze their strategy, accelerate efforts |
+| Your visibility rising faster than competitors | You are winning share of voice | Continue strategy, identify what is working |
+| All competitors dropping simultaneously | Algorithm update affecting the niche | Focus on quality, wait for dust to settle |
+| One competitor surging | They made a significant change | Analyze what they did (content? links? technical?) |
+
+### Data Quality Checks
+
+Always verify your tracking data quality.
+
+| Check | How | Frequency |
+|-------|-----|-----------|
+| Spot-check positions manually | Incognito search for 5-10 keywords | Weekly |
+| Compare with Search Console | Match tracked positions with GSC average position | Monthly |
+| Check for tracking errors | Look for keywords showing "not found" or position 0 | Weekly |
+| Verify competitor data | Spot-check competitor rankings manually | Monthly |
+| Confirm location accuracy | Search from target location (VPN if needed) | Quarterly |
+
+---
+
+## 7. Common Pitfalls and How to Avoid Them
+
+| Pitfall | Problem | Solution |
+|---------|---------|----------|
+| Tracking too many keywords | Dilutes focus, increases cost | Focus on keywords with business impact |
+| Checking rankings too frequently | Obsessing over daily noise | Focus on weekly and monthly trends |
+| Not segmenting data | Averages hide important patterns | Group by intent, topic, priority |
+| Ignoring SERP features | Position alone does not tell the whole story | Track featured snippets, AI Overviews, PAA |
+| Not tracking competitors | Your data lacks context | Always track 3-5 competitors |
+| Single location tracking | Misses local variations | Track each target market separately |
+| Forgetting mobile | Mobile and desktop rankings differ | Always track both devices |
+| Not documenting changes | Cannot correlate changes to actions | Log all content updates, technical changes, link building |
+
+---
+
+## 8. Migration and Tool Switching
+
+If you need to switch rank tracking tools, follow this process to preserve data continuity.
+
+| Step | Action |
+|------|--------|
+| 1 | Export all historical data from current tool |
+| 2 | Run both tools in parallel for 2-4 weeks |
+| 3 | Compare data between tools to understand differences |
+| 4 | Document any systematic position differences |
+| 5 | Import historical data into new tool if supported |
+| 6 | Reconfigure all alerts and reports in new tool |
+| 7 | Decommission old tool after confidence in new data |
+
+**Important**: Different tools may report slightly different positions due to data center sampling, timing, and methodology. A 1-2 position variance between tools is normal.
diff --git a/.cursor/skills/schema-markup-generator/SKILL.md b/.cursor/skills/schema-markup-generator/SKILL.md
new file mode 100644
index 0000000000..7aebdb85fa
--- /dev/null
+++ b/.cursor/skills/schema-markup-generator/SKILL.md
@@ -0,0 +1,402 @@
+---
+name: schema-markup-generator
+description: 'Generate JSON-LD structured data for FAQ, HowTo, Article, Product, LocalBusiness rich results. Schema标记/结构化数据'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when generating JSON-LD structured data, Schema.org markup, or rich snippet markup for a page."
+argument-hint: "<page URL or content type>"
+allowed-tools: WebFetch
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - structured-data
+    - json-ld
+    - rich-results
+    - faq-schema
+    - howto-schema
+    - product-schema
+    - article-schema
+    - schema-org
+    - 结构化数据
+    - 構造化データ
+    - 스키마마크업
+    - datos-estructurados
+  triggers:
+    # EN-formal
+    - "add schema markup"
+    - "generate structured data"
+    - "JSON-LD"
+    - "rich snippets"
+    - "FAQ schema"
+    - "schema.org"
+    - "structured data markup"
+    # EN-casual
+    - "add FAQ rich results"
+    - "I want star ratings in Google"
+    - "product markup"
+    - "recipe schema"
+    - "add structured data to my page"
+    # EN-question
+    - "how to add schema markup"
+    - "how to get rich snippets"
+    # ZH-pro
+    - "结构化数据"
+    - "Schema标记"
+    - "JSON-LD生成"
+    - "富摘要"
+    # ZH-casual
+    - "添加结构化数据"
+    - "要星级评分"
+    - "搜索结果要好看"
+    # JA
+    - "構造化データ"
+    - "スキーママークアップ"
+    - "リッチリザルト"
+    # KO
+    - "스키마 마크업"
+    - "구조화 데이터"
+    # ES
+    - "datos estructurados"
+    - "marcado schema"
+    # PT
+    - "dados estruturados"
+    # Misspellings
+    - "shema markup"
+    - "structred data"
+---
+
+# Schema Markup Generator
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This build skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill creates Schema.org structured data markup in JSON-LD format to help search engines understand your content and enable rich results in SERPs.
+
+**System role**: Build layer skill. It turns briefs and signals into assets that other skills can review, publish, and monitor.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a shippable asset or transformation that should feed directly into quality review, deployment, or monitoring.
+
+- Adding FAQ schema for expanded SERP presence
+- Creating How-To schema for step-by-step content
+- Adding Product schema for e-commerce pages
+- Implementing Article schema for blog posts
+- Adding Local Business schema for location pages
+- Creating Review/Rating schema
+- Implementing Organization schema for brand presence
+- Any page where rich results would improve visibility
+
+## What This Skill Does
+
+1. **Schema Type Selection**: Recommends appropriate schema types
+2. **JSON-LD Generation**: Creates valid structured data markup
+3. **Property Mapping**: Maps your content to schema properties
+4. **Validation Guidance**: Ensures schema meets requirements
+5. **Nested Schema**: Handles complex, multi-type schemas
+6. **Rich Result Eligibility**: Identifies which rich results you can target
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Generate Schema for Content
+
+```
+Generate schema markup for this [content type]: [content/URL]
+```
+
+```
+Create FAQ schema for these questions and answers: [Q&A list]
+```
+
+### Specific Schema Types
+
+```
+Create Product schema for [product name] with [details]
+```
+
+```
+Generate LocalBusiness schema for [business name and details]
+```
+
+### Audit Existing Schema
+
+```
+Review and improve this schema markup: [existing schema]
+```
+
+## Skill Contract
+
+**Expected output**: a ready-to-use asset or implementation-ready transformation plus a short handoff summary ready for `memory/content/`.
+
+- **Reads**: the brief, target keywords, entity inputs, quality constraints, and prior decisions from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing content, metadata, or schema deliverable plus a reusable summary that can be stored under `memory/content/`.
+- **Promotes**: approved angles, messaging choices, missing evidence, and publish blockers to `CLAUDE.md`, `memory/decisions.md`, and `memory/open-loops.md`.
+- **Next handoff**: use the `Next Best Skill` below when the asset is ready for review or deployment.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~web crawler connected:**
+Automatically crawl and extract page content (visible text, headings, lists, tables), existing schema markup, page metadata, and structured content elements that map to schema properties.
+
+**With manual data only:**
+Ask the user to provide:
+1. Page URL or full HTML content
+2. Page type (article, product, FAQ, how-to, local business, etc.)
+3. Specific data needed for schema (prices, dates, author info, Q&A pairs, etc.)
+4. Current schema markup (if optimizing existing)
+
+Proceed with the full workflow using provided data. Note in the output which data is from automated extraction vs. user-provided data.
+
+## Instructions
+
+When a user requests schema markup:
+
+1. **Identify Content Type and Rich Result Opportunity**
+
+   Reference the [CORE-EEAT Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md) item **O05 (Schema Markup)** for content-type to schema mapping:
+
+   ```markdown
+   ### CORE-EEAT Schema Mapping (O05)
+
+   | Content Type | Required Schema | Conditional Schema |
+   |-------------|----------------|--------------------|
+   | Blog (guides) | Article, Breadcrumb | FAQ, HowTo |
+   | Blog (tools) | Article, Breadcrumb | FAQ, Review |
+   | Blog (insights) | Article, Breadcrumb | FAQ |
+   | Alternative | Comparison*, Breadcrumb, FAQ | AggregateRating |
+   | Best-of | ItemList, Breadcrumb, FAQ | AggregateRating per tool |
+   | Use-case | WebPage, Breadcrumb, FAQ | — |
+   | FAQ | FAQPage, Breadcrumb | — |
+   | Landing | SoftwareApplication, Breadcrumb, FAQ | WebPage |
+   | Testimonial | Review, Breadcrumb | FAQ, Person |
+
+   *Use the mapping above to ensure schema type matches content type (CORE-EEAT O05: Pass criteria).*
+   ```
+
+   ```markdown
+   ### Schema Analysis
+
+   **Content Type**: [blog/product/FAQ/how-to/local business/etc.]
+   **Page URL**: [URL]
+
+   **Eligible Rich Results**:
+   
+   | Rich Result Type | Eligibility | Impact |
+   |------------------|-------------|--------|
+   | FAQ | ✅/❌ | High - Expands SERP presence |
+   | How-To | ✅/❌ | Medium - Shows steps in SERP |
+   | Product | ✅/❌ | High - Shows price, availability |
+   | Review | ✅/❌ | High - Shows star ratings |
+   | Article | ✅/❌ | Medium - Shows publish date, author |
+   | Breadcrumb | ✅/❌ | Medium - Shows navigation path |
+   | Video | ✅/❌ | High - Shows video thumbnail |
+   
+   **Recommended Schema Types**:
+   1. [Primary schema type] - [reason]
+   2. [Secondary schema type] - [reason]
+   ```
+
+2. **Generate Schema Markup**
+
+   Based on the identified content type, generate the appropriate JSON-LD schema. Supported types: FAQPage, HowTo, Article/BlogPosting/NewsArticle, Product, LocalBusiness, Organization, BreadcrumbList, Event, Recipe, and combined multi-type schemas.
+
+   > **Reference**: See [references/schema-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/schema-markup-generator/references/schema-templates.md) for complete, copy-ready JSON-LD templates for all schema types with required and optional properties.
+
+   For each schema generated, include:
+   - All required properties for the chosen type
+   - Rich result preview showing expected SERP appearance
+   - Notes on which properties are required vs. optional
+
+   When combining multiple schema types on one page, wrap them in a JSON array inside a single `<script type="application/ld+json">` tag.
+
+3. **Provide Implementation and Validation**
+
+    ```markdown
+    ## Implementation Guide
+
+    ### Adding Schema to Your Page
+
+    **Option 1: In HTML <head>**
+    ```html
+    <head>
+      <script type="application/ld+json">
+        [Your JSON-LD schema here]
+      </script>
+    </head>
+    ```
+
+    **Option 2: Before closing </body>**
+    ```html
+      <script type="application/ld+json">
+        [Your JSON-LD schema here]
+      </script>
+    </body>
+    ```
+
+    ### Validation Steps
+
+    1. **~~schema validator**
+       - Test your live URL or paste code
+       - Check for errors and warnings
+
+    2. **Schema.org Validator**
+       - URL: https://validator.schema.org/
+       - Validates against Schema.org spec
+
+    3. **~~search console**
+       - Monitor rich results in ~~search console
+       - Check Enhancements reports for issues
+
+    ### Validation Checklist
+
+    - [ ] JSON syntax is valid (no trailing commas)
+    - [ ] All required properties present
+    - [ ] URLs are absolute, not relative
+    - [ ] Dates are in ISO 8601 format
+    - [ ] Content matches visible page content
+    - [ ] No policy violations
+    ```
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Page URL or content provided
+- [ ] Schema type appropriate for content (Article for blog, Product for e-commerce, etc.)
+- [ ] All required data available (author, dates, prices, etc. depending on schema type)
+- [ ] Content eligibility for rich results confirmed
+
+### Output Validation
+- [ ] JSON syntax validates (no trailing commas, proper quotes)
+- [ ] All required properties present for chosen schema type
+- [ ] URLs are absolute, not relative
+- [ ] Dates in ISO 8601 format (YYYY-MM-DDTHH:MM:SS+00:00)
+- [ ] Schema content matches visible page content exactly
+- [ ] Passes ~~schema validator with no errors
+- [ ] Source of each data point clearly stated (~~web crawler extraction, user-provided, or manual entry)
+
+## Example
+
+**User**: "Generate FAQ schema for a page about SEO with 3 questions"
+
+**Output**:
+
+```markdown
+## FAQ Schema for SEO Page
+
+### Schema Markup
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "FAQPage",
+  "mainEntity": [
+    {
+      "@type": "Question",
+      "name": "What is SEO?",
+      "acceptedAnswer": {
+        "@type": "Answer",
+        "text": "SEO (Search Engine Optimization) is the practice of optimizing websites and content to rank higher in search engine results pages (SERPs). It involves technical optimizations, content creation, and link building to increase organic visibility and drive qualified traffic to your website."
+      }
+    },
+    {
+      "@type": "Question",
+      "name": "How long does SEO take to work?",
+      "acceptedAnswer": {
+        "@type": "Answer",
+        "text": "SEO typically takes 3-6 months to show significant results, though this varies based on competition, domain authority, and effort invested. New websites may take 6-12 months, while established sites with existing authority can see improvements in 1-3 months for less competitive keywords."
+      }
+    },
+    {
+      "@type": "Question",
+      "name": "Is SEO better than paid advertising?",
+      "acceptedAnswer": {
+        "@type": "Answer",
+        "text": "SEO and paid advertising serve different purposes. SEO provides sustainable, long-term traffic without per-click costs but takes time to build. Paid advertising delivers immediate results but stops when you stop paying. Most successful businesses use both: paid ads for immediate leads and SEO for long-term growth."
+      }
+    }
+  ]
+}
+```
+
+_Implementation: Wrap the above JSON-LD in `<script type="application/ld+json">...</script>` and place in `<head>` or before `</body>`. Test with ~~schema validator._
+
+### SERP Preview
+
+```
+SEO Guide: Complete Beginner's Tutorial
+yoursite.com/seo-guide/
+Learn SEO from scratch with our comprehensive guide...
+
+▼ What is SEO?
+  SEO (Search Engine Optimization) is the practice of optimizing...
+▼ How long does SEO take to work?
+  SEO typically takes 3-6 months to show significant results...
+▼ Is SEO better than paid advertising?
+  SEO and paid advertising serve different purposes...
+```
+```
+
+## Schema Type Quick Reference
+
+| Content Type | Schema Type | Key Properties |
+|--------------|-------------|----------------|
+| Blog Post | BlogPosting/Article | headline, datePublished, author |
+| Product | Product | name, price, availability |
+| FAQ | FAQPage | Question, Answer |
+| How-To | HowTo | step, totalTime |
+| Local Business | LocalBusiness | address, geo, openingHours |
+| Recipe | Recipe | ingredients, cookTime |
+| Event | Event | startDate, location |
+| Video | VideoObject | uploadDate, duration |
+| Course | Course | provider, name |
+| Review | Review | itemReviewed, ratingValue |
+
+## Tips for Success
+
+1. **Match visible content** - Schema must reflect what users see
+2. **Don't spam** - Only add schema for relevant content
+3. **Keep updated** - Update dates and prices when they change
+4. **Test thoroughly** - Validate before deploying
+5. **Monitor Search Console** - Watch for errors and warnings
+
+## Schema Type Decision Tree
+
+> **Reference**: See [references/schema-decision-tree.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/schema-markup-generator/references/schema-decision-tree.md) for the full decision tree (content-to-schema mapping), industry-specific recommendations, implementation priority tiers (P0-P4), and validation quick reference.
+
+
+### Save Results
+
+After delivering content or optimization output to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/content/YYYY-MM-DD-<topic>.md` containing:
+- One-line description of what was created
+- Target keyword and content type
+- Open loops or items needing review
+- Source data references
+
+**Gate check recommended**: Run content-quality-auditor before publishing (PostToolUse hook will remind automatically).
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Schema Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/schema-markup-generator/references/schema-templates.md) - Copy-ready JSON-LD templates for all schema types
+- [Validation Guide](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/schema-markup-generator/references/validation-guide.md) - Common errors, required properties, testing workflow
+
+## Next Best Skill
+
+- **Primary**: [technical-seo-checker](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/SKILL.md) — verify implementation quality and deployment readiness.
diff --git a/.cursor/skills/schema-markup-generator/references/schema-decision-tree.md b/.cursor/skills/schema-markup-generator/references/schema-decision-tree.md
new file mode 100644
index 0000000000..6ba8a175ea
--- /dev/null
+++ b/.cursor/skills/schema-markup-generator/references/schema-decision-tree.md
@@ -0,0 +1,65 @@
+# Schema Type Decision Tree
+
+Guidelines for selecting the right schema types based on content, industry, and implementation priority.
+
+---
+
+## When to Use Which Schema
+
+| Your Content | Primary Schema | Add If Applicable | Rich Result Eligibility |
+|-------------|---------------|-------------------|----------------------|
+| Blog post / article | Article | FAQ, HowTo, Speakable | Article carousel, FAQ rich result |
+| Product page | Product | Review, Offer, AggregateRating | Product snippet with price/rating |
+| Service page | Service | FAQ, LocalBusiness | Service snippet |
+| How-to guide | HowTo | Article, FAQ | How-to rich result with steps |
+| FAQ page | FAQPage | Article | FAQ accordion in SERP |
+| Recipe | Recipe | Video, AggregateRating | Recipe carousel |
+| Event | Event | Offer, Organization | Event snippet with date/location |
+| Video | VideoObject | Article | Video carousel, key moments |
+| Local business | LocalBusiness | Review, OpeningHoursSpecification | Local pack, knowledge panel |
+| Person/author | Person | Organization | Knowledge panel |
+| Organization | Organization | ContactPoint, Logo | Knowledge panel |
+| Course | Course | Organization | Course rich result |
+| Job posting | JobPosting | Organization | Google for Jobs listing |
+| Breadcrumb | BreadcrumbList | (Always add alongside other schema) | Breadcrumb trail in SERP |
+| Software/App | SoftwareApplication | Review, Offer | App snippet |
+
+---
+
+## Industry-Specific Schema Recommendations
+
+| Industry | Essential Schema | High-Value Additions |
+|----------|-----------------|---------------------|
+| E-commerce | Product, BreadcrumbList, Organization | AggregateRating, FAQ, Review |
+| SaaS | SoftwareApplication, FAQPage, Organization | HowTo, VideoObject, Review |
+| Local Services | LocalBusiness, Service | FAQ, Review, Event |
+| Publishing/Media | Article, Person, Organization | FAQ, Speakable, VideoObject |
+| Education | Course, Organization | FAQ, HowTo, Event |
+| Healthcare | MedicalWebPage, Organization | FAQ, Physician, MedicalClinic |
+| Real Estate | RealEstateListing, Organization | LocalBusiness, FAQ |
+| Restaurants | Restaurant, Menu | Review, Event, FAQ |
+
+---
+
+## Schema Implementation Priority
+
+| Priority | Schema Types | Why |
+|----------|-------------|-----|
+| P0 -- Always | Organization, BreadcrumbList, WebSite (SearchAction) | Foundation for all sites |
+| P1 -- Content | Article, FAQPage, HowTo | Direct rich result eligibility |
+| P2 -- Commercial | Product, Review, AggregateRating, Offer | Revenue-impacting rich results |
+| P3 -- Authority | Person, SameAs, Speakable | E-E-A-T signals, AI citation |
+| P4 -- Specialized | Industry-specific types | Niche rich results |
+
+---
+
+## Schema Validation Quick Reference
+
+| Issue | Impact | Fix |
+|-------|--------|-----|
+| Missing required property | Schema ignored by Google | Add all required fields (check schema.org) |
+| Invalid date format | Warning, may lose rich result | Use ISO 8601: "2026-02-11" |
+| Incorrect @type | Schema misinterpreted | Match @type exactly to schema.org |
+| Self-referencing sameAs | Warning | sameAs should link to EXTERNAL profiles |
+| Missing image for Article | Loses article rich result | Add image property with valid URL |
+| Review without itemReviewed | Review not connected | Nest review inside Product/Service/etc. |
diff --git a/.cursor/skills/schema-markup-generator/references/schema-templates.md b/.cursor/skills/schema-markup-generator/references/schema-templates.md
new file mode 100644
index 0000000000..bb4925cf5a
--- /dev/null
+++ b/.cursor/skills/schema-markup-generator/references/schema-templates.md
@@ -0,0 +1,631 @@
+# Schema.org JSON-LD Templates
+
+Complete, copy-ready structured data templates for all major schema types. Customize the bracketed values to match your content.
+
+## FAQPage Schema
+
+For pages with frequently asked questions. Minimum 2 Q&A pairs required.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "FAQPage",
+  "mainEntity": [
+    {
+      "@type": "Question",
+      "name": "[Question text - exactly as shown on page]",
+      "acceptedAnswer": {
+        "@type": "Answer",
+        "text": "[Complete answer text - must match visible content]"
+      }
+    },
+    {
+      "@type": "Question",
+      "name": "[Question 2]",
+      "acceptedAnswer": {
+        "@type": "Answer",
+        "text": "[Answer 2]"
+      }
+    },
+    {
+      "@type": "Question",
+      "name": "[Question 3]",
+      "acceptedAnswer": {
+        "@type": "Answer",
+        "text": "[Answer 3]"
+      }
+    }
+  ]
+}
+```
+
+**Requirements**: Questions must be complete questions, answers must be comprehensive, content must match visible page content.
+
+---
+
+## HowTo Schema
+
+For step-by-step instructional content.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "HowTo",
+  "name": "[How-to title - what will users learn]",
+  "description": "[Brief description of what this tutorial teaches]",
+  "image": {
+    "@type": "ImageObject",
+    "url": "[Main image URL]",
+    "height": "[height in pixels]",
+    "width": "[width in pixels]"
+  },
+  "totalTime": "PT[X]H[Y]M",
+  "estimatedCost": {
+    "@type": "MonetaryAmount",
+    "currency": "USD",
+    "value": "[estimated cost or 0]"
+  },
+  "supply": [
+    {
+      "@type": "HowToSupply",
+      "name": "[Supply item 1]"
+    },
+    {
+      "@type": "HowToSupply",
+      "name": "[Supply item 2]"
+    }
+  ],
+  "tool": [
+    {
+      "@type": "HowToTool",
+      "name": "[Tool 1]"
+    },
+    {
+      "@type": "HowToTool",
+      "name": "[Tool 2]"
+    }
+  ],
+  "step": [
+    {
+      "@type": "HowToStep",
+      "position": 1,
+      "name": "[Step 1 title]",
+      "text": "[Step 1 detailed instructions]",
+      "url": "[Page URL]#step1",
+      "image": "[Step 1 image URL - optional]"
+    },
+    {
+      "@type": "HowToStep",
+      "position": 2,
+      "name": "[Step 2 title]",
+      "text": "[Step 2 detailed instructions]",
+      "url": "[Page URL]#step2",
+      "image": "[Step 2 image URL - optional]"
+    },
+    {
+      "@type": "HowToStep",
+      "position": 3,
+      "name": "[Step 3 title]",
+      "text": "[Step 3 detailed instructions]",
+      "url": "[Page URL]#step3",
+      "image": "[Step 3 image URL - optional]"
+    }
+  ]
+}
+```
+
+**Time format**: PT[X]H[Y]M where X = hours, Y = minutes. Example: PT1H30M = 1 hour 30 minutes.
+
+---
+
+## Article / BlogPosting / NewsArticle Schema
+
+For blog posts, articles, and news content.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "BlogPosting",
+  "headline": "[Article title - max 110 characters for best display]",
+  "description": "[Article summary or excerpt]",
+  "image": [
+    "[Featured image URL - 1200px wide recommended]",
+    "[Alternative image URL - 4:3 ratio]",
+    "[Alternative image URL - 16:9 ratio]"
+  ],
+  "datePublished": "[ISO 8601 date: 2024-01-15T08:00:00+00:00]",
+  "dateModified": "[ISO 8601 date - same as published if never modified]",
+  "author": {
+    "@type": "Person",
+    "name": "[Author Full Name]",
+    "url": "[Author profile URL]",
+    "jobTitle": "[Author job title - optional]"
+  },
+  "publisher": {
+    "@type": "Organization",
+    "name": "[Publisher/Company Name]",
+    "logo": {
+      "@type": "ImageObject",
+      "url": "[Publisher logo URL - max 600px wide, 60px high]",
+      "width": "[width]",
+      "height": "[height]"
+    }
+  },
+  "mainEntityOfPage": {
+    "@type": "WebPage",
+    "@id": "[Canonical URL of this article]"
+  },
+  "articleBody": "[Full article text - optional but recommended]",
+  "wordCount": "[word count - optional]"
+}
+```
+
+**Type variants**: Use `Article` for general articles, `BlogPosting` for blog posts, `NewsArticle` for news content, `TechArticle` for technical documentation.
+
+---
+
+## Product Schema
+
+For e-commerce product pages.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Product",
+  "name": "[Product Name]",
+  "image": [
+    "[Product image URL 1]",
+    "[Product image URL 2]",
+    "[Product image URL 3]"
+  ],
+  "description": "[Product description]",
+  "sku": "[SKU code]",
+  "mpn": "[Manufacturer Part Number - optional]",
+  "brand": {
+    "@type": "Brand",
+    "name": "[Brand Name]"
+  },
+  "offers": {
+    "@type": "Offer",
+    "url": "[Product page URL]",
+    "priceCurrency": "USD",
+    "price": "[Price as number: 29.99]",
+    "priceValidUntil": "[Date price is valid until: 2024-12-31]",
+    "availability": "https://schema.org/InStock",
+    "seller": {
+      "@type": "Organization",
+      "name": "[Seller/Store Name]"
+    },
+    "shippingDetails": {
+      "@type": "OfferShippingDetails",
+      "shippingRate": {
+        "@type": "MonetaryAmount",
+        "value": "[shipping cost]",
+        "currency": "USD"
+      },
+      "shippingDestination": {
+        "@type": "DefinedRegion",
+        "addressCountry": "US"
+      }
+    }
+  },
+  "aggregateRating": {
+    "@type": "AggregateRating",
+    "ratingValue": "[4.5]",
+    "reviewCount": "[89]",
+    "bestRating": "5",
+    "worstRating": "1"
+  },
+  "review": [
+    {
+      "@type": "Review",
+      "reviewRating": {
+        "@type": "Rating",
+        "ratingValue": "[5]",
+        "bestRating": "5"
+      },
+      "author": {
+        "@type": "Person",
+        "name": "[Reviewer Name]"
+      },
+      "reviewBody": "[Review text]",
+      "datePublished": "[Review date: 2024-01-15]"
+    }
+  ]
+}
+```
+
+**Availability options**: `InStock`, `OutOfStock`, `PreOrder`, `Discontinued`, `LimitedAvailability`, `OnlineOnly`, `InStoreOnly`, `SoldOut`
+
+---
+
+## LocalBusiness Schema
+
+For local business pages with physical locations.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "LocalBusiness",
+  "name": "[Business Name]",
+  "image": "[Business image or logo URL]",
+  "@id": "[Business page URL]",
+  "url": "[Website URL]",
+  "telephone": "[Phone number: +1-555-555-5555]",
+  "priceRange": "[$$$ or price range like $10-$50]",
+  "address": {
+    "@type": "PostalAddress",
+    "streetAddress": "[Street address]",
+    "addressLocality": "[City]",
+    "addressRegion": "[State/Province]",
+    "postalCode": "[ZIP/Postal code]",
+    "addressCountry": "US"
+  },
+  "geo": {
+    "@type": "GeoCoordinates",
+    "latitude": "[latitude as number: 40.7128]",
+    "longitude": "[longitude as number: -74.0060]"
+  },
+  "openingHoursSpecification": [
+    {
+      "@type": "OpeningHoursSpecification",
+      "dayOfWeek": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"],
+      "opens": "09:00",
+      "closes": "17:00"
+    },
+    {
+      "@type": "OpeningHoursSpecification",
+      "dayOfWeek": "Saturday",
+      "opens": "10:00",
+      "closes": "15:00"
+    }
+  ],
+  "aggregateRating": {
+    "@type": "AggregateRating",
+    "ratingValue": "[4.5]",
+    "reviewCount": "[123]"
+  },
+  "servesCuisine": "[Cuisine type - for restaurants only]"
+}
+```
+
+**Type variants**: Use more specific types when applicable: `Restaurant`, `Store`, `AutoRepair`, `HealthAndBeautyBusiness`, `LegalService`, etc.
+
+---
+
+## Organization Schema
+
+For brand/company homepage.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Organization",
+  "name": "[Organization Name]",
+  "url": "[Website URL]",
+  "logo": "[Logo URL - recommended 112x112px or larger]",
+  "description": "[Company description]",
+  "sameAs": [
+    "[Facebook URL]",
+    "[Twitter URL]",
+    "[LinkedIn URL]",
+    "[Instagram URL]",
+    "[YouTube URL]"
+  ],
+  "contactPoint": {
+    "@type": "ContactPoint",
+    "telephone": "[Phone number]",
+    "contactType": "customer service",
+    "email": "[Email address]",
+    "availableLanguage": ["English", "Spanish"],
+    "areaServed": "US"
+  },
+  "founder": {
+    "@type": "Person",
+    "name": "[Founder name - optional]"
+  },
+  "foundingDate": "[YYYY-MM-DD - optional]",
+  "address": {
+    "@type": "PostalAddress",
+    "streetAddress": "[Street address]",
+    "addressLocality": "[City]",
+    "addressRegion": "[State]",
+    "postalCode": "[ZIP]",
+    "addressCountry": "US"
+  }
+}
+```
+
+---
+
+## BreadcrumbList Schema
+
+For navigation breadcrumbs.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "BreadcrumbList",
+  "itemListElement": [
+    {
+      "@type": "ListItem",
+      "position": 1,
+      "name": "Home",
+      "item": "[Homepage URL]"
+    },
+    {
+      "@type": "ListItem",
+      "position": 2,
+      "name": "[Category Name]",
+      "item": "[Category URL]"
+    },
+    {
+      "@type": "ListItem",
+      "position": 3,
+      "name": "[Subcategory Name]",
+      "item": "[Subcategory URL]"
+    },
+    {
+      "@type": "ListItem",
+      "position": 4,
+      "name": "[Current Page Name]",
+      "item": "[Current Page URL]"
+    }
+  ]
+}
+```
+
+**Important**: Position numbers must be sequential starting from 1. Last item should be the current page.
+
+---
+
+## VideoObject Schema
+
+For video content.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "VideoObject",
+  "name": "[Video title]",
+  "description": "[Video description]",
+  "thumbnailUrl": "[Video thumbnail URL - minimum 160x90px]",
+  "uploadDate": "[ISO 8601 date: 2024-01-15T08:00:00+00:00]",
+  "duration": "PT[X]M[Y]S",
+  "contentUrl": "[Video file URL]",
+  "embedUrl": "[Video embed URL]",
+  "interactionStatistic": {
+    "@type": "InteractionCounter",
+    "interactionType": { "@type": "WatchAction" },
+    "userInteractionCount": "[view count]"
+  }
+}
+```
+
+**Duration format**: PT[X]M[Y]S where X = minutes, Y = seconds. Example: PT5M30S = 5 minutes 30 seconds.
+
+---
+
+## Event Schema
+
+For events, conferences, concerts, etc.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Event",
+  "name": "[Event Name]",
+  "description": "[Event description]",
+  "image": "[Event image URL]",
+  "startDate": "[ISO 8601 date: 2024-06-15T19:00:00-05:00]",
+  "endDate": "[ISO 8601 date: 2024-06-15T22:00:00-05:00]",
+  "eventStatus": "https://schema.org/EventScheduled",
+  "eventAttendanceMode": "https://schema.org/OfflineEventAttendanceMode",
+  "location": {
+    "@type": "Place",
+    "name": "[Venue Name]",
+    "address": {
+      "@type": "PostalAddress",
+      "streetAddress": "[Street address]",
+      "addressLocality": "[City]",
+      "addressRegion": "[State]",
+      "postalCode": "[ZIP]",
+      "addressCountry": "US"
+    }
+  },
+  "offers": {
+    "@type": "Offer",
+    "url": "[Ticket purchase URL]",
+    "price": "[ticket price]",
+    "priceCurrency": "USD",
+    "availability": "https://schema.org/InStock",
+    "validFrom": "[Sale start date]"
+  },
+  "organizer": {
+    "@type": "Organization",
+    "name": "[Organizer name]",
+    "url": "[Organizer website]"
+  }
+}
+```
+
+**Event status options**: `EventScheduled`, `EventCancelled`, `EventPostponed`, `EventRescheduled`, `EventMovedOnline`
+
+**Attendance mode**: `OfflineEventAttendanceMode`, `OnlineEventAttendanceMode`, `MixedEventAttendanceMode`
+
+---
+
+## Course Schema
+
+For online courses and educational content.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Course",
+  "name": "[Course Name]",
+  "description": "[Course description]",
+  "provider": {
+    "@type": "Organization",
+    "name": "[Provider name]",
+    "sameAs": "[Provider URL]"
+  },
+  "offers": {
+    "@type": "Offer",
+    "category": "Paid",
+    "price": "[price]",
+    "priceCurrency": "USD"
+  },
+  "hasCourseInstance": {
+    "@type": "CourseInstance",
+    "courseMode": "online",
+    "courseWorkload": "PT[X]H",
+    "instructor": {
+      "@type": "Person",
+      "name": "[Instructor name]"
+    }
+  }
+}
+```
+
+---
+
+## Recipe Schema
+
+For cooking recipes.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Recipe",
+  "name": "[Recipe name]",
+  "image": "[Recipe image URL]",
+  "author": {
+    "@type": "Person",
+    "name": "[Author name]"
+  },
+  "datePublished": "[ISO 8601 date]",
+  "description": "[Recipe description]",
+  "prepTime": "PT[X]M",
+  "cookTime": "PT[X]M",
+  "totalTime": "PT[X]M",
+  "recipeYield": "[Servings: e.g., '4 servings']",
+  "recipeCategory": "[Category: e.g., 'Dinner']",
+  "recipeCuisine": "[Cuisine: e.g., 'Italian']",
+  "keywords": "[comma, separated, keywords]",
+  "nutrition": {
+    "@type": "NutritionInformation",
+    "calories": "[calories per serving]"
+  },
+  "recipeIngredient": [
+    "[Ingredient 1 with quantity]",
+    "[Ingredient 2 with quantity]",
+    "[Ingredient 3 with quantity]"
+  ],
+  "recipeInstructions": [
+    {
+      "@type": "HowToStep",
+      "text": "[Step 1 instructions]"
+    },
+    {
+      "@type": "HowToStep",
+      "text": "[Step 2 instructions]"
+    }
+  ],
+  "aggregateRating": {
+    "@type": "AggregateRating",
+    "ratingValue": "[4.5]",
+    "reviewCount": "[number of reviews]"
+  }
+}
+```
+
+---
+
+## SoftwareApplication Schema
+
+For software, apps, and tools.
+
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "SoftwareApplication",
+  "name": "[Software name]",
+  "operatingSystem": "[Windows, macOS, iOS, Android, Web]",
+  "applicationCategory": "BusinessApplication",
+  "offers": {
+    "@type": "Offer",
+    "price": "[price or 0 for free]",
+    "priceCurrency": "USD"
+  },
+  "aggregateRating": {
+    "@type": "AggregateRating",
+    "ratingValue": "[4.5]",
+    "reviewCount": "[number of reviews]"
+  },
+  "screenshot": "[Screenshot URL - optional]",
+  "softwareVersion": "[version number]",
+  "fileSize": "[file size with units: e.g., '50MB']",
+  "datePublished": "[Release date]",
+  "downloadUrl": "[Download URL - optional]"
+}
+```
+
+---
+
+## Multiple Schema Types (Combined Array)
+
+To include multiple schema types on one page, wrap them in an array:
+
+```html
+<script type="application/ld+json">
+[
+  {
+    "@context": "https://schema.org",
+    "@type": "Article",
+    "headline": "[Article title]",
+    "author": {
+      "@type": "Person",
+      "name": "[Author]"
+    }
+  },
+  {
+    "@context": "https://schema.org",
+    "@type": "FAQPage",
+    "mainEntity": [
+      {
+        "@type": "Question",
+        "name": "[Question]",
+        "acceptedAnswer": {
+          "@type": "Answer",
+          "text": "[Answer]"
+        }
+      }
+    ]
+  },
+  {
+    "@context": "https://schema.org",
+    "@type": "BreadcrumbList",
+    "itemListElement": [
+      {
+        "@type": "ListItem",
+        "position": 1,
+        "name": "Home",
+        "item": "[URL]"
+      }
+    ]
+  }
+]
+</script>
+```
+
+---
+
+## Implementation Notes
+
+- Always validate schema at https://validator.schema.org/ and https://search.google.com/test/rich-results
+- Remove bracketed placeholders and replace with actual content
+- Use absolute URLs, not relative paths
+- Dates must be in ISO 8601 format
+- Schema must match visible page content (Google policy requirement)
+- No trailing commas in JSON (invalid syntax)
diff --git a/.cursor/skills/schema-markup-generator/references/validation-guide.md b/.cursor/skills/schema-markup-generator/references/validation-guide.md
new file mode 100644
index 0000000000..d060eec21d
--- /dev/null
+++ b/.cursor/skills/schema-markup-generator/references/validation-guide.md
@@ -0,0 +1,444 @@
+# Schema Markup Validation Guide
+
+Complete reference for validating, testing, and troubleshooting structured data.
+
+## Validation Tools
+
+### Google Rich Results Test
+- **URL**: https://search.google.com/test/rich-results
+- **Purpose**: Check if your schema is eligible for Google rich results
+- **Tests**: Live URL or code snippet
+- **Output**: Errors, warnings, eligible rich result types
+
+### Schema.org Validator
+- **URL**: https://validator.schema.org/
+- **Purpose**: Validate against official Schema.org specification
+- **Tests**: URL, code snippet, or microdata
+- **Output**: Technical validation errors
+
+### Google Search Console
+- **Location**: Search Console → Enhancements section
+- **Purpose**: Monitor rich results performance and errors at scale
+- **Reports**: Rich results status, coverage, issues over time
+
+---
+
+## Common JSON-LD Syntax Errors
+
+### Trailing Commas
+
+**Error**: Invalid JSON syntax
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Article",
+  "headline": "Title",  ← Trailing comma here
+}
+```
+
+**Fix**: Remove the comma after the last property
+```json
+{
+  "@context": "https://schema.org",
+  "@type": "Article",
+  "headline": "Title"
+}
+```
+
+### Missing Required Quotes
+
+**Error**: Property names must be quoted
+```json
+{
+  @context: "https://schema.org"
+}
+```
+
+**Fix**: Quote all property names
+```json
+{
+  "@context": "https://schema.org"
+}
+```
+
+### Incorrect Date Format
+
+**Error**: Invalid date format
+```json
+{
+  "datePublished": "01/15/2024"
+}
+```
+
+**Fix**: Use ISO 8601 format
+```json
+{
+  "datePublished": "2024-01-15T08:00:00+00:00"
+}
+```
+
+### Relative URLs Instead of Absolute
+
+**Error**: Relative URLs are not allowed
+```json
+{
+  "image": "/images/photo.jpg"
+}
+```
+
+**Fix**: Use absolute URLs
+```json
+{
+  "image": "https://example.com/images/photo.jpg"
+}
+```
+
+### Incorrect Array Syntax
+
+**Error**: Multiple values not in array
+```json
+{
+  "image": "url1.jpg", "url2.jpg"
+}
+```
+
+**Fix**: Use array brackets for multiple values
+```json
+{
+  "image": ["url1.jpg", "url2.jpg"]
+}
+```
+
+---
+
+## Required vs Recommended Properties
+
+### FAQPage Schema
+
+| Property | Status | Notes |
+|----------|--------|-------|
+| @type | Required | Must be "FAQPage" |
+| mainEntity | Required | Array of Question objects |
+| Question.name | Required | The question text |
+| Answer.text | Required | The answer text |
+
+**Minimum**: 2 Q&A pairs
+
+### HowTo Schema
+
+| Property | Status | Notes |
+|----------|--------|-------|
+| @type | Required | Must be "HowTo" |
+| name | Required | Title of the how-to |
+| step | Required | Array of HowToStep objects |
+| step.text | Required | Step instructions |
+| image | Recommended | Improves visibility |
+| totalTime | Recommended | Shows duration in results |
+| supply | Recommended | Lists materials needed |
+| tool | Recommended | Lists tools needed |
+
+**Minimum**: 2 steps with text
+
+### Article Schema
+
+| Property | Status | Notes |
+|----------|--------|-------|
+| @type | Required | Article/BlogPosting/NewsArticle |
+| headline | Required | Max 110 characters |
+| image | Required | Minimum 1200px wide |
+| datePublished | Required | ISO 8601 format |
+| author | Required | Person or Organization |
+| publisher | Required | Organization with logo |
+| publisher.logo | Required | Max 600px wide, 60px high |
+| dateModified | Recommended | Update when content changes |
+| description | Recommended | Improves display |
+
+### Product Schema
+
+| Property | Status | Notes |
+|----------|--------|-------|
+| @type | Required | Must be "Product" |
+| name | Required | Product name |
+| image | Required | Product images |
+| description | Recommended | Product description |
+| offers | Recommended | Required for price display |
+| offers.price | Recommended | Required for price display |
+| offers.priceCurrency | Recommended | Required for price display |
+| offers.availability | Recommended | Stock status |
+| aggregateRating | Recommended | Required for star ratings |
+| review | Recommended | Individual reviews |
+| sku | Recommended | Product identifier |
+| brand | Recommended | Brand information |
+
+### LocalBusiness Schema
+
+| Property | Status | Notes |
+|----------|--------|-------|
+| @type | Required | LocalBusiness or subtype |
+| name | Required | Business name |
+| address | Required | PostalAddress object |
+| address.streetAddress | Required | Street address |
+| address.addressLocality | Required | City |
+| address.addressRegion | Required | State/province |
+| address.postalCode | Required | ZIP/postal code |
+| address.addressCountry | Required | Country code |
+| geo | Recommended | Latitude/longitude |
+| telephone | Recommended | Phone number |
+| openingHoursSpecification | Recommended | Business hours |
+| priceRange | Recommended | Price range indicator |
+| aggregateRating | Recommended | Customer ratings |
+
+### Organization Schema
+
+| Property | Status | Notes |
+|----------|--------|-------|
+| @type | Required | Must be "Organization" |
+| name | Required | Organization name |
+| url | Required | Website URL |
+| logo | Recommended | Brand logo |
+| sameAs | Recommended | Social media profiles |
+| contactPoint | Recommended | Contact information |
+
+---
+
+## Google Rich Result Eligibility Requirements
+
+### FAQ Rich Results
+
+**Eligibility checklist**:
+- [ ] Minimum 2 Q&A pairs
+- [ ] Questions are actual questions (contain "?")
+- [ ] Answers are complete and comprehensive
+- [ ] Content matches visible page content exactly
+- [ ] Not a forum or Q&A page where users can submit answers
+- [ ] Not advertising or promotional in nature
+- [ ] Not for medical, legal, or financial advice without proper E-E-A-T
+
+**Ineligible content**:
+- Medical advice without credentials
+- Legal advice
+- Product/service comparisons that are promotional
+- User-generated Q&A (use QAPage instead)
+
+### How-To Rich Results
+
+**Eligibility checklist**:
+- [ ] Minimum 2 steps with clear instructions
+- [ ] Complete process from start to finish
+- [ ] Each step has meaningful text (not just a title)
+- [ ] Not advertising or promotional
+- [ ] Not harmful or dangerous content
+- [ ] Steps are actionable and practical
+
+**Ineligible content**:
+- Single-step processes
+- Recipes (use Recipe schema instead)
+- Promotional tutorials
+
+### Product Rich Results
+
+**For price display**:
+- [ ] Valid Product schema
+- [ ] `offers` with `price` property
+- [ ] `priceCurrency` specified
+- [ ] `availability` status
+
+**For review stars**:
+- [ ] Valid `aggregateRating` OR individual `review`
+- [ ] Minimum 1 review for individual review display
+- [ ] Honest, unbiased reviews (not paid/incentivized)
+
+**For product markup**:
+- [ ] `name` property present
+- [ ] At least one `image`
+- [ ] Valid product type (not person, organization, etc.)
+
+### Article Rich Results
+
+**Eligibility checklist**:
+- [ ] Valid Article/BlogPosting/NewsArticle schema
+- [ ] High-quality, original content
+- [ ] Proper `publisher` with valid logo
+- [ ] Valid `author` information
+- [ ] Images meet size requirements (1200px wide)
+- [ ] Not short-form content (minimum ~300 words)
+
+---
+
+## Testing Workflow
+
+### Initial Implementation
+
+1. **Add schema to development/staging environment**
+2. **Validate syntax at validator.schema.org**
+   - Paste code or test URL
+   - Fix all errors before proceeding
+3. **Test at Google Rich Results Test**
+   - Check for Google-specific issues
+   - Verify eligible rich result types
+4. **Visual inspection**
+   - View page source to confirm schema is present
+   - Check JSON formatting in browser
+
+### Pre-Launch Testing
+
+1. **Test on staging URL with Rich Results Test**
+2. **Verify all required properties present**
+3. **Confirm content matches visible page content**
+4. **Check for policy violations**
+5. **Test multiple schema types if combining**
+6. **Validate images are accessible and meet size requirements**
+
+### Post-Launch Monitoring
+
+1. **Submit sitemap to Google Search Console**
+2. **Monitor Enhancements reports**
+   - Check for validation errors
+   - Watch for policy violations
+   - Track rich result impressions
+3. **Re-test pages if content changes**
+4. **Update `dateModified` when updating content**
+5. **Fix errors within 30 days to avoid rich result removal**
+
+---
+
+## Common Policy Violations
+
+### Content Mismatch
+
+**Violation**: Schema content doesn't match visible page content
+
+**Example**: FAQ schema includes Q&A pairs not visible on page
+
+**Fix**: Ensure all structured data reflects actual page content exactly
+
+### Deceptive Content
+
+**Violation**: Schema contains misleading information
+
+**Example**: Product reviews that are fake or incentivized
+
+**Fix**: Only include genuine, verifiable information
+
+### Spammy Markup
+
+**Violation**: Excessive or irrelevant schema
+
+**Example**: Adding Product schema to every blog post
+
+**Fix**: Only use schema types relevant to page content
+
+### Hidden Content
+
+**Violation**: Schema references content hidden from users
+
+**Example**: FAQ answers only in schema, not visible on page
+
+**Fix**: Make all schema content visible to users
+
+### Promotional Content in FAQ
+
+**Violation**: Using FAQ schema for promotional purposes
+
+**Example**: Questions like "Why is [Brand] the best?"
+
+**Fix**: Use neutral, informational questions
+
+---
+
+## Debugging Common Issues
+
+### Schema Not Appearing in Rich Results Test
+
+**Possible causes**:
+- JSON syntax error (validate at validator.schema.org)
+- Schema in incorrect location (should be in `<head>` or `<body>`)
+- Script tag missing `type="application/ld+json"`
+- Content served dynamically after page load (bot can't see it)
+
+**Debug steps**:
+1. View page source (not inspect element)
+2. Search for `"@type"`
+3. Copy JSON to validator.schema.org
+4. Fix syntax errors
+
+### Rich Results Not Showing in Search
+
+**Possible causes**:
+- Schema is new (can take days/weeks to appear)
+- Page not indexed by Google
+- Schema has errors in Search Console
+- Content doesn't meet quality guidelines
+- Competition for rich results is high
+
+**Debug steps**:
+1. Check Search Console → Enhancements
+2. Use URL Inspection tool to request indexing
+3. Verify schema passes Rich Results Test
+4. Check for manual actions
+
+### Warnings vs Errors
+
+**Errors** (must fix):
+- Invalid syntax
+- Missing required properties
+- Invalid property values
+- Schema type doesn't exist
+
+**Warnings** (should fix when possible):
+- Missing recommended properties
+- Suboptimal property values
+- Non-standard extensions
+- Property not recognized for this type
+
+---
+
+## Schema Maintenance Checklist
+
+### Monthly
+- [ ] Check Search Console for new errors
+- [ ] Verify rich results are still appearing
+- [ ] Update `dateModified` on changed content
+
+### Quarterly
+- [ ] Audit all schema implementations
+- [ ] Test key pages with Rich Results Test
+- [ ] Update any outdated information (prices, dates, etc.)
+- [ ] Check for new schema types relevant to your content
+
+### After Content Changes
+- [ ] Update schema to match new content
+- [ ] Update `dateModified` timestamp
+- [ ] Re-validate with Rich Results Test
+- [ ] Request re-indexing in Search Console if major changes
+
+### After Site Migration
+- [ ] Verify schema preserved on new URLs
+- [ ] Update all absolute URLs in schema
+- [ ] Submit new sitemap
+- [ ] Monitor for errors in new domain's Search Console
+
+---
+
+## Quick Reference: Error Messages and Fixes
+
+| Error Message | Cause | Fix |
+|---------------|-------|-----|
+| "Missing required field" | Required property not included | Add the required property |
+| "Invalid date format" | Date not in ISO 8601 | Use format: 2024-01-15T08:00:00+00:00 |
+| "URL is not absolute" | Relative URL used | Add full URL with https:// |
+| "Unexpected token" | JSON syntax error | Check for missing quotes, brackets, commas |
+| "This markup is not eligible for rich results" | Schema type or content doesn't qualify | Review eligibility requirements |
+| "Image too small" | Image doesn't meet size requirements | Use image at least 1200px wide |
+| "The attribute price is required" | Product missing price | Add offers.price property |
+| "Logo must be 600x60 or smaller" | Publisher logo too large | Resize logo to meet requirements |
+
+---
+
+## Resources
+
+- **Schema.org Documentation**: https://schema.org/
+- **Google Search Central**: https://developers.google.com/search/docs/appearance/structured-data
+- **Rich Results Test**: https://search.google.com/test/rich-results
+- **Schema Validator**: https://validator.schema.org/
+- **JSON-LD Playground**: https://json-ld.org/playground/
diff --git a/.cursor/skills/seo-content-writer/SKILL.md b/.cursor/skills/seo-content-writer/SKILL.md
new file mode 100644
index 0000000000..6478e4e6be
--- /dev/null
+++ b/.cursor/skills/seo-content-writer/SKILL.md
@@ -0,0 +1,456 @@
+---
+name: seo-content-writer
+description: 'Write SEO blog posts, articles, landing pages with keyword integration, header optimization, and snippet targeting. SEO文章写作/内容优化'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when writing SEO-optimized articles, blog posts, landing pages, or product descriptions. Also when the user asks to create content targeting a specific keyword."
+argument-hint: "<topic> <target keyword>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "medium"
+  tags:
+    - seo
+    - content-writing
+    - blog-writing
+    - seo-copywriting
+    - content-creation
+    - featured-snippet-optimization
+    - article-writing
+    - landing-page
+    - surferSEO-alternative
+    - clearscope-alternative
+    - SEO文章
+    - 博客写作
+    - SEOライティング
+    - SEO글쓰기
+    - redaccion-seo
+  triggers:
+    # EN-formal
+    - "write SEO content"
+    - "create blog post"
+    - "write an article"
+    - "content writing"
+    - "draft optimized content"
+    - "write for SEO"
+    - "SEO copywriting"
+    - "blog writing"
+    # EN-casual
+    - "write me a blog post"
+    - "write me an article"
+    - "help me write about"
+    - "draft an article about"
+    - "I need a blog post"
+    - "create content for my site"
+    # EN-question
+    - "how do I write content that ranks"
+    - "how to write SEO friendly content"
+    - "what makes content rank well"
+    # EN-competitor
+    - "SurferSEO alternative"
+    - "Clearscope alternative"
+    - "Jasper AI alternative for SEO"
+    # ZH-pro
+    - "SEO文章写作"
+    - "SEO内容创作"
+    - "博客写作"
+    - "内容优化"
+    - "内容创作"
+    # ZH-casual
+    - "帮我写文章"
+    - "写一篇博客"
+    - "排名上不去"
+    - "帮我写SEO文章"
+    - "写一篇SEO文章"
+    # JA
+    - "SEOライティング"
+    - "SEO記事作成"
+    - "ブログ記事作成"
+    - "SEOコンテンツ"
+    # KO
+    - "SEO 글쓰기"
+    - "블로그 작성"
+    - "SEO 콘텐츠 작성"
+    - "블로그 글 작성해줘"
+    - "이 주제로 글 써봐"
+    # ES
+    - "redacción SEO"
+    - "escribir artículo SEO"
+    - "contenido optimizado"
+    # PT
+    - "redação SEO"
+    - "escrever artigo SEO"
+    # Misspellings
+    - "SEO copywritting"
+    - "writting SEO content"
+---
+
+# SEO Content Writer
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This build skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill creates search-engine-optimized content that ranks well while providing genuine value to readers. It applies proven SEO copywriting techniques, proper keyword integration, and optimal content structure.
+
+**System role**: Build layer skill. It turns briefs and signals into assets that other skills can review, publish, and monitor.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a shippable asset or transformation that should feed directly into quality review, deployment, or monitoring.
+
+- Writing blog posts targeting specific keywords
+- Creating landing pages optimized for search
+- Developing pillar content for topic clusters
+- Writing product descriptions for e-commerce
+- Creating service pages for local SEO
+- Producing how-to guides and tutorials
+- Writing comparison and review articles
+
+## What This Skill Does
+
+1. **Keyword Integration**: Naturally incorporates target and related keywords
+2. **Structure Optimization**: Creates scannable, well-organized content
+3. **Title & Meta Creation**: Writes compelling, click-worthy titles
+4. **Header Optimization**: Uses strategic H1-H6 hierarchy
+5. **Internal Linking**: Suggests relevant internal link opportunities
+6. **Readability Enhancement**: Ensures content is accessible and engaging
+7. **Featured Snippet Optimization**: Formats for SERP feature opportunities
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Basic Content Creation
+
+```
+Write an SEO-optimized article about [topic] targeting the keyword [keyword]
+```
+
+```
+Create a blog post for [topic] with these keywords: [keyword list]
+```
+
+### With Specific Requirements
+
+```
+Write a 2,000-word guide about [topic] targeting [keyword],
+include FAQ section for featured snippets
+```
+
+### Content Briefs
+
+```
+Here's my content brief: [brief]. Write SEO-optimized content following this outline.
+```
+
+## Skill Contract
+
+**Expected output**: a ready-to-use asset or implementation-ready transformation plus a short handoff summary ready for `memory/content/`.
+
+- **Reads**: the brief, target keywords, entity inputs, quality constraints, and prior decisions from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing content, metadata, or schema deliverable plus a reusable summary that can be stored under `memory/content/`.
+- **Promotes**: approved angles, messaging choices, missing evidence, and publish blockers to `CLAUDE.md`, `memory/decisions.md`, and `memory/open-loops.md`.
+- **Next handoff**: use the `Next Best Skill` below when the asset is ready for review or deployment.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~search console connected:**
+Automatically pull keyword metrics (search volume, difficulty, CPC), competitor content analysis (top-ranking pages, content length, common topics), SERP features (featured snippets, PAA questions), and keyword opportunities (related keywords, question-based queries).
+
+**With manual data only:**
+Ask the user to provide:
+1. Target primary keyword and 3-5 secondary keywords
+2. Target audience and search intent (informational/commercial/transactional)
+3. Target word count and desired tone
+4. Any competitor URLs or content examples to reference
+
+Proceed with the full workflow using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests SEO content:
+
+1. **Gather Requirements**
+
+   Confirm or ask for:
+   
+   ```markdown
+   ### Content Requirements
+   
+   **Primary Keyword**: [main keyword]
+   **Secondary Keywords**: [2-5 related keywords]
+   **Target Word Count**: [length]
+   **Content Type**: [blog/guide/landing page/etc.]
+   **Target Audience**: [who is this for]
+   **Search Intent**: [informational/commercial/transactional]
+   **Tone**: [professional/casual/technical/friendly]
+   **CTA Goal**: [what action should readers take]
+   **Competitor URLs**: [top ranking content to beat]
+   ```
+
+2. **Load CORE-EEAT Quality Constraints**
+
+   Before writing, load content quality standards from the [CORE-EEAT Benchmark](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md):
+
+   ```markdown
+   ### CORE-EEAT Pre-Write Checklist
+
+   **Content Type**: [identified from requirements above]
+   **Loaded Constraints** (high-weight items for this content type):
+
+   Apply these standards while writing:
+
+   | ID | Standard | How to Apply |
+   |----|----------|-------------|
+   | C01 | Intent Alignment | Title promise must match content delivery |
+   | C02 | Direct Answer | Core answer in first 150 words |
+   | C06 | Audience Targeting | State "this article is for..." |
+   | C10 | Semantic Closure | Conclusion answers opening question + next steps |
+   | O01 | Heading Hierarchy | H1→H2→H3, no level skipping |
+   | O02 | Summary Box | Include TL;DR or Key Takeaways |
+   | O06 | Section Chunking | Each section single topic; paragraphs 3–5 sentences |
+   | O09 | Information Density | No filler; consistent terminology |
+   | R01 | Data Precision | ≥5 precise numbers with units |
+   | R02 | Citation Density | ≥1 external citation per 500 words |
+   | R04 | Evidence-Claim Mapping | Every claim backed by evidence |
+   | R07 | Entity Precision | Full names for people/orgs/products |
+   | C03 | Query Coverage | Cover ≥3 query variants (synonyms, long-tail) |
+   | O08 | Anchor Navigation | Table of contents with jump links |
+   | O10 | Multimedia Structure | Images/videos have captions and carry information |
+   | E07 | Practical Tools | Include downloadable templates, checklists, or calculators |
+
+   _These 16 items apply across all content types. For content-type-specific dimension weights, see the Content-Type Weight Table in [core-eeat-benchmark.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md)._
+   _Full 80-item benchmark: [references/core-eeat-benchmark.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/core-eeat-benchmark.md)_
+   _For complete content quality audit: use [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md)_
+   ```
+
+3. **Research and Plan**
+
+   Before writing:
+   
+   ```markdown
+   ### Content Research
+   
+   **SERP Analysis**:
+   - Top results format: [what's ranking]
+   - Average word count: [X] words
+   - Common sections: [list]
+   - SERP features: [snippets, PAA, etc.]
+   
+   **Keyword Map**:
+   - Primary: [keyword] - use in title, H1, intro, conclusion
+   - Secondary: [keywords] - use in H2s, body paragraphs
+   - LSI/Related: [terms] - sprinkle naturally throughout
+   - Questions: [PAA questions] - use as H2/H3s or FAQ
+   
+   **Content Angle**:
+   [What unique perspective or value will this content provide?]
+   ```
+
+4. **Create Optimized Title**
+
+   ```markdown
+   ### Title Optimization
+   
+   **Requirements**:
+   - Include primary keyword (preferably at start)
+   - Under 60 characters for full SERP display
+   - Compelling and click-worthy
+   - Match search intent
+   
+   **Title Options**:
+   
+   1. [Title option 1] ([X] chars)
+      - Keyword position: [front/middle]
+      - Power words: [list]
+   
+   2. [Title option 2] ([X] chars)
+      - Keyword position: [front/middle]
+      - Power words: [list]
+   
+   **Recommended**: [Best option with reasoning]
+   ```
+
+5. **Write Meta Description**
+
+   ```markdown
+   ### Meta Description
+   
+   **Requirements**:
+   - 150-160 characters
+   - Include primary keyword naturally
+   - Include call-to-action
+   - Compelling and specific
+   
+   **Meta Description**:
+   "[Description text]" ([X] characters)
+   
+   **Elements included**:
+   - ✅ Primary keyword
+   - ✅ Value proposition
+   - ✅ CTA or curiosity hook
+   ```
+
+6. **Structure Content and Write**
+
+   Structure: H1 (primary keyword, one per page) > Introduction (100-150 words, hook + promise + keyword in first 100 words) > H2 sections (secondary keywords/questions) > H3 sub-topics > FAQ section > Conclusion (summary + keyword + CTA).
+
+7. **Apply On-Page SEO Best Practices**
+
+   Follow the on-page SEO checklist (keyword placement, content quality, readability, technical elements) and the content writing template (H1 with keyword, hook, sections with H2/H3, FAQ, conclusion with CTA).
+
+   > **Reference**: See [references/seo-writing-checklist.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/seo-content-writer/references/seo-writing-checklist.md) for the full on-page SEO checklist, content writing template, and featured snippet optimization patterns.
+
+   Key requirements while writing:
+   - Primary keyword in title, H1, first 100 words, at least one H2, and conclusion
+   - Paragraphs of 3-5 sentences; varied sentence length; bullet points and bold key phrases
+   - Internal links (2-5) and external authoritative links (2-3)
+   - FAQ section with 40-60 word answers for featured snippet opportunity
+   - Optimize for definition, list, table, and how-to snippets where applicable
+
+8. **Add Internal/External Links**
+
+   ```markdown
+   ### Link Recommendations
+   
+   **Internal Links** (include 2-5):
+   1. "[anchor text]" → [/your-page-url] (relevant because: [reason])
+   2. "[anchor text]" → [/your-page-url] (relevant because: [reason])
+   
+   **External Links** (include 2-3 authoritative sources):
+   1. "[anchor text]" → [authoritative-source.com] (supports: [claim])
+   2. "[anchor text]" → [authoritative-source.com] (supports: [claim])
+   ```
+
+9. **Final SEO Review and CORE-EEAT Self-Check**
+
+    Score content across 10 SEO factors (title, meta description, H1, keyword placement, H2s, internal links, external links, FAQ, readability, word count) and produce an Overall SEO Score out of 10.
+
+    Then verify the 16 CORE-EEAT pre-write constraints (C01, C02, C06, C10, O01, O02, O06, O09, R01, R02, R04, R07, C03, O08, O10, E07) with pass/warning/fail status. List items needing attention.
+
+    _For full 80-item audit, use [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md)_
+
+#### Issue Classification
+
+When the self-check reveals issues, classify and handle them:
+
+**Auto-correct (fix silently, then document in a `### Changes Made` block after the final content):**
+- Meta description exceeds 160 characters → rewrite to fit while preserving primary keyword and CTA
+- Title tag exceeds 60 characters → shorten while preserving primary keyword
+- Missing alt text on images → generate descriptive alt text
+- Duplicate H2 headings → differentiate with modifiers
+- Keyword density above 2% → replace some instances with semantic variants
+- Missing table of contents → generate TOC with anchor links for articles with 3+ H2 sections
+- Paragraphs exceeding 5 sentences → split at the most natural break point
+
+Use this format for the Changes Made block:
+
+```markdown
+### Changes Made During Self-Check
+
+| Item | Original | Fixed |
+|------|----------|-------|
+| Meta description | 185 chars | 158 chars — removed non-essential qualifier |
+| Keyword density | 2.4% | 1.8% — replaced 3 instances with semantic variants |
+```
+
+**Needs your decision (ask before changing):**
+- H1 wording changes (may affect brand voice)
+- Keyword density below 0.5% (may need structural rewrite)
+- Tone adjustments (formal ↔ casual)
+- Claim strength (e.g., "best" → "top-rated" for compliance)
+- Content length significantly above/below target (±30%)
+- Removing/replacing external links
+- Statistics or data claims that cannot be verified against the cited source
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Primary keyword confirmed and matches search intent
+- [ ] Target word count specified (minimum 800 words for substantive content)
+- [ ] Content type and audience clearly defined
+- [ ] Competitor URLs reviewed or target SERP features identified
+
+### Output Validation
+- [ ] Keyword density within 1-2% for primary keyword (Note: Keyword density is a guideline, not a hard rule. Modern search engines prioritize semantic relevance and natural language over exact density targets. Focus on covering the topic comprehensively with semantic variants rather than hitting a specific percentage.)
+- [ ] All sections from outline covered completely
+- [ ] Internal links included (2-5 relevant links)
+- [ ] FAQ section present with at least 3 questions
+- [ ] Readability score appropriate for target audience
+- [ ] Source of each data point clearly stated (~~SEO tool data, user-provided, or estimated)
+
+## Example
+
+**User**: "Write an SEO-optimized article about 'email marketing best practices' targeting small businesses"
+
+> **Reference**: See [references/seo-writing-checklist.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/seo-content-writer/references/seo-writing-checklist.md) for the full example output demonstrating a complete SEO article with meta description, H1/H2/H3 hierarchy, statistics with citations, comparison tables, FAQ section, and conclusion with CTA.
+
+The example output demonstrates: keyword in H1 and first 100 words, statistics with sources (DMA, Emarsys), comparison tables, bullet-point lists, pro tips, FAQ section with 40-60 word answers, and a clear CTA in the conclusion.
+
+## Content Type Templates
+
+### How-To Guide
+
+```
+Write a how-to guide for [task] targeting [keyword]
+```
+
+### Comparison Article
+
+```
+Write a comparison article: [Option A] vs [Option B] for [keyword]
+```
+
+### Listicle
+
+```
+Write a list post: "X Best [Items] for [Audience/Purpose]" targeting [keyword]
+```
+
+### Ultimate Guide
+
+```
+Write an ultimate guide about [topic] (3,000+ words) targeting [keyword]
+```
+
+## Tips for Success
+
+1. **Match search intent** - Informational queries need guides, not sales pages
+2. **Front-load value** - Put key information early for readers and snippets
+3. **Use data and examples** - Specific beats generic every time
+4. **Write for humans first** - SEO optimization should feel natural
+5. **Include visual elements** - Break up text with images, tables, lists
+6. **Update regularly** - Fresh content signals to search engines
+
+
+### Save Results
+
+After delivering content or optimization output to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/content/YYYY-MM-DD-<topic>.md` containing:
+- One-line description of what was created
+- Target keyword and content type
+- Open loops or items needing review
+- Source data references
+
+**Gate check recommended**: Run content-quality-auditor before publishing (PostToolUse hook will remind automatically).
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Title Formulas](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/seo-content-writer/references/title-formulas.md) - Proven headline formulas, power words, CTR patterns
+- [Content Structure Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/seo-content-writer/references/content-structure-templates.md) - Templates for blog posts, comparisons, listicles, how-tos, pillar pages
+
+## Next Best Skill
+
+- **Primary**: [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md) — gate the draft before publishing or handing it off.
diff --git a/.cursor/skills/seo-content-writer/references/content-structure-templates.md b/.cursor/skills/seo-content-writer/references/content-structure-templates.md
new file mode 100644
index 0000000000..35a5e29ffa
--- /dev/null
+++ b/.cursor/skills/seo-content-writer/references/content-structure-templates.md
@@ -0,0 +1,875 @@
+# Content Structure Templates
+
+Markdown templates for common SEO content types. Customize section headings and content while maintaining the structural framework.
+
+## Blog Post (Informational)
+
+**Target word count**: 1,200-1,800 words
+**Primary goal**: Educate and inform, build authority, rank for informational queries
+
+```markdown
+# [Primary Keyword in H1] - [Benefit or Hook]
+
+[Opening hook - 1 compelling sentence that grabs attention]
+
+[Problem statement - describe the pain point or challenge your reader faces]
+
+[Promise - explain what readers will learn or gain from this post]
+
+In this guide, you'll discover:
+- [Key takeaway 1]
+- [Key takeaway 2]
+- [Key takeaway 3]
+
+## What Is [Topic]?
+
+[Clear definition in 40-60 words - GEO-optimized]
+
+[Expanded explanation with context]
+
+[Why this matters - relevance to reader]
+
+## Why [Topic] Matters
+
+[Benefit 1 with supporting evidence]
+
+[Benefit 2 with supporting evidence]
+
+[Benefit 3 with supporting evidence]
+
+> **Key Insight**: [Highlighted important point or statistic]
+
+## [Secondary Keyword or Main Section 1]
+
+[Introduction to this section]
+
+### [Sub-topic 1]
+
+[Detailed explanation]
+
+**Key points**:
+- Point 1
+- Point 2
+- Point 3
+
+### [Sub-topic 2]
+
+[Detailed explanation]
+
+[Example or case study]
+
+## [Secondary Keyword or Main Section 2]
+
+[Section introduction]
+
+| Factor | Description | Impact |
+|--------|-------------|--------|
+| [Factor 1] | [Description] | High/Medium/Low |
+| [Factor 2] | [Description] | High/Medium/Low |
+| [Factor 3] | [Description] | High/Medium/Low |
+
+[Additional explanation]
+
+## [Secondary Keyword or Main Section 3]
+
+[Content for this section]
+
+**Best practices**:
+1. [Practice 1 with explanation]
+2. [Practice 2 with explanation]
+3. [Practice 3 with explanation]
+
+## Common Mistakes to Avoid
+
+### [Mistake 1]
+
+[Why this is a mistake]
+[How to avoid it]
+
+### [Mistake 2]
+
+[Why this is a mistake]
+[How to avoid it]
+
+### [Mistake 3]
+
+[Why this is a mistake]
+[How to avoid it]
+
+## Frequently Asked Questions
+
+### [Question from PAA or common query]?
+
+[Direct answer in 40-60 words]
+
+[Optional: Additional context]
+
+### [Question 2]?
+
+[Direct answer]
+
+### [Question 3]?
+
+[Direct answer]
+
+## Conclusion
+
+[Recap key points - include primary keyword]
+
+[Final insight or takeaway]
+
+[Clear call-to-action: next step for reader]
+
+---
+
+**Related reading**: [Internal link 1] | [Internal link 2] | [Internal link 3]
+```
+
+**Internal links**: 3-5 contextual links to related content
+**External links**: 2-3 links to authoritative sources
+**Keywords**: Primary in H1, first 100 words, conclusion; secondary in H2s
+**GEO-optimization**: Definition block, FAQ section, quotable statistics
+
+---
+
+## Comparison Article ("[A] vs [B]")
+
+**Target word count**: 1,500-2,500 words
+**Primary goal**: Help readers choose between options, rank for comparison queries
+
+```markdown
+# [Option A] vs [Option B]: Which Is Better for [Use Case]?
+
+[Hook - present the comparison dilemma]
+
+[Context - why this comparison matters]
+
+[Promise - what readers will understand after reading]
+
+**Quick answer**: [Brief verdict for those who want immediate answer]
+
+## [Option A] vs [Option B]: Quick Comparison
+
+| Feature | [Option A] | [Option B] |
+|---------|-----------|-----------|
+| [Feature 1] | [Details] | [Details] |
+| [Feature 2] | [Details] | [Details] |
+| [Feature 3] | [Details] | [Details] |
+| [Feature 4] | [Details] | [Details] |
+| **Best for** | [Use case] | [Use case] |
+| **Starting price** | [Price] | [Price] |
+
+## What Is [Option A]?
+
+[Clear definition]
+
+[Key features and capabilities]
+
+**Pros**:
+- [Advantage 1]
+- [Advantage 2]
+- [Advantage 3]
+
+**Cons**:
+- [Disadvantage 1]
+- [Disadvantage 2]
+
+## What Is [Option B]?
+
+[Clear definition]
+
+[Key features and capabilities]
+
+**Pros**:
+- [Advantage 1]
+- [Advantage 2]
+- [Advantage 3]
+
+**Cons**:
+- [Disadvantage 1]
+- [Disadvantage 2]
+
+## [Option A] vs [Option B]: Detailed Comparison
+
+### [Feature Category 1]
+
+**[Option A]**: [Detailed explanation with examples]
+
+**[Option B]**: [Detailed explanation with examples]
+
+**Winner**: [A/B/Tie] - [Brief reasoning]
+
+### [Feature Category 2]
+
+**[Option A]**: [Details]
+
+**[Option B]**: [Details]
+
+**Winner**: [A/B/Tie] - [Brief reasoning]
+
+### [Feature Category 3]
+
+[Continue pattern for key comparison points]
+
+## Pricing: [Option A] vs [Option B]
+
+### [Option A] Pricing
+
+- [Tier 1]: $[X]/month - [What's included]
+- [Tier 2]: $[X]/month - [What's included]
+- [Tier 3]: $[X]/month - [What's included]
+
+### [Option B] Pricing
+
+- [Tier 1]: $[X]/month - [What's included]
+- [Tier 2]: $[X]/month - [What's included]
+- [Tier 3]: $[X]/month - [What's included]
+
+**Value winner**: [Which offers better value and why]
+
+## Which Should You Choose?
+
+### Choose [Option A] if:
+- [Criteria 1]
+- [Criteria 2]
+- [Criteria 3]
+
+### Choose [Option B] if:
+- [Criteria 1]
+- [Criteria 2]
+- [Criteria 3]
+
+## Frequently Asked Questions
+
+### Is [Option A] better than [Option B]?
+
+[Nuanced answer - depends on use case]
+
+### Can you use [Option A] and [Option B] together?
+
+[Answer with explanation]
+
+### Which is easier for beginners: [Option A] or [Option B]?
+
+[Answer with reasoning]
+
+## Final Verdict: [Option A] vs [Option B]
+
+[Balanced conclusion]
+
+[Recommendation based on different use cases]
+
+[Clear call-to-action]
+```
+
+**Internal links**: Links to detailed reviews of each option, related comparisons
+**External links**: Official websites of both options, third-party reviews
+**Keywords**: Both options mentioned throughout, comparison keywords in H2s
+
+---
+
+## Listicle ("Top N [Items]")
+
+**Target word count**: 1,500-2,500 words (depending on list length)
+**Primary goal**: Provide curated recommendations, rank for "best [topic]" queries
+
+```markdown
+# [Number] Best [Items] for [Audience/Purpose] ([Year])
+
+[Hook - establish why this list matters]
+
+[Context - what makes these the "best"]
+
+[Promise - what readers will gain]
+
+After testing [X] different [items], we've identified the [number] that deliver the best results for [audience/purpose].
+
+## Quick Summary: Top [Number] [Items]
+
+| Rank | [Item] | Best For | Price |
+|------|--------|----------|-------|
+| 1 | [Item 1] | [Use case] | [Price] |
+| 2 | [Item 2] | [Use case] | [Price] |
+| 3 | [Item 3] | [Use case] | [Price] |
+
+## How We Chose These [Items]
+
+Our selection criteria:
+- [Criterion 1] - [Why it matters]
+- [Criterion 2] - [Why it matters]
+- [Criterion 3] - [Why it matters]
+
+## 1. [Item Name] - Best for [Specific Use Case]
+
+![Alt text: [Item name] screenshot/image]
+
+**What it is**: [Brief description]
+
+**Why we love it**: [Key benefits and features]
+
+**Key features**:
+- [Feature 1]
+- [Feature 2]
+- [Feature 3]
+
+**Pricing**: [Price details]
+
+**Pros**:
+- [Pro 1]
+- [Pro 2]
+
+**Cons**:
+- [Con 1]
+- [Con 2]
+
+**Best for**: [Ideal user or use case]
+
+[Link to item]
+
+---
+
+## 2. [Item Name] - Best for [Specific Use Case]
+
+[Follow same structure as #1]
+
+---
+
+## 3. [Item Name] - Best for [Specific Use Case]
+
+[Continue pattern for all items in list]
+
+---
+
+## Comparison: Which [Item] Is Right for You?
+
+| Feature | [Item 1] | [Item 2] | [Item 3] |
+|---------|----------|----------|----------|
+| [Feature 1] | ✓/✗ | ✓/✗ | ✓/✗ |
+| [Feature 2] | ✓/✗ | ✓/✗ | ✓/✗ |
+| [Feature 3] | ✓/✗ | ✓/✗ | ✓/✗ |
+
+## How to Choose the Best [Item] for Your Needs
+
+### Consider [Factor 1]
+
+[Explanation of how this factor affects choice]
+
+### Consider [Factor 2]
+
+[Explanation]
+
+### Consider [Factor 3]
+
+[Explanation]
+
+## Frequently Asked Questions
+
+### What is the best [item] for beginners?
+
+[Answer with specific recommendation]
+
+### Which [item] offers the best value?
+
+[Answer with reasoning]
+
+### Are paid [items] better than free ones?
+
+[Balanced answer]
+
+## Conclusion: Our Top Pick
+
+[Restate #1 recommendation]
+
+[Why it's the best overall choice]
+
+[Call-to-action]
+```
+
+**Internal links**: Related guides, comparison articles, category pages
+**External links**: Links to each recommended item, review sources
+**Keywords**: "Best [topic]", individual item names, use case keywords
+
+---
+
+## How-To Guide (Step-by-Step)
+
+**Target word count**: 1,200-2,000 words
+**Primary goal**: Teach a specific process, rank for "how to [task]" queries
+
+```markdown
+# How to [Achieve Goal]: [Number]-Step Guide [for Audience]
+
+[Hook - present the outcome readers want]
+
+[Current problem - why this is challenging]
+
+[Promise - what readers will accomplish]
+
+By following this step-by-step guide, you'll be able to [specific outcome] in [timeframe].
+
+## What You'll Need
+
+**Tools**:
+- [Tool 1]
+- [Tool 2]
+- [Tool 3]
+
+**Time required**: [Estimated time]
+
+**Skill level**: [Beginner/Intermediate/Advanced]
+
+## Why [Task] Matters
+
+[Explain the benefits of completing this task]
+
+[Provide context and importance]
+
+## Step 1: [Action Title]
+
+![Alt text: Screenshot showing step 1]
+
+[Clear instruction for this step]
+
+[Why this step is important]
+
+**How to do it**:
+1. [Sub-step 1]
+2. [Sub-step 2]
+3. [Sub-step 3]
+
+> **Pro Tip**: [Helpful tip to make this step easier]
+
+**Common mistakes**:
+- [Mistake to avoid]
+- [Another mistake]
+
+---
+
+## Step 2: [Action Title]
+
+![Alt text: Screenshot showing step 2]
+
+[Instruction for this step]
+
+[Additional context or explanation]
+
+**Example**: [Concrete example showing this step]
+
+---
+
+## Step 3: [Action Title]
+
+[Continue pattern for all steps]
+
+---
+
+## Step [Final Number]: [Action Title]
+
+[Final step instruction]
+
+[What success looks like]
+
+**You'll know it's working when**: [Success indicators]
+
+---
+
+## What to Do If [Common Problem]
+
+### Problem: [Issue 1]
+
+**Solution**: [How to fix it]
+
+### Problem: [Issue 2]
+
+**Solution**: [How to fix it]
+
+## Advanced Tips
+
+Once you've mastered the basics:
+
+1. **[Advanced tip 1]** - [Explanation]
+2. **[Advanced tip 2]** - [Explanation]
+3. **[Advanced tip 3]** - [Explanation]
+
+## Frequently Asked Questions
+
+### How long does it take to [complete task]?
+
+[Realistic timeframe with variables]
+
+### Do I need [specific tool/skill] to [complete task]?
+
+[Answer with alternatives if applicable]
+
+### What if [specific concern]?
+
+[Address concern with solution]
+
+## Conclusion
+
+[Recap the process]
+
+[Encourage reader to take action]
+
+[Clear next step or CTA]
+
+**Next steps**: [Link to related guide or advanced tutorial]
+```
+
+**Internal links**: Links to tool reviews, prerequisite guides, related tutorials
+**External links**: Tool documentation, additional resources
+**Keywords**: "How to [task]", action words in each step heading
+
+---
+
+## Product Review
+
+**Target word count**: 1,500-2,000 words
+**Primary goal**: Help buyers make informed decisions, rank for "[product] review" queries
+
+```markdown
+# [Product Name] Review: Is It Worth It in [Year]?
+
+[Hook - establish the product's promise or popularity]
+
+[Context - who this review is for]
+
+After using [Product] for [timeframe], here's my honest assessment of whether it lives up to the hype.
+
+## [Product Name] Overview
+
+**What it is**: [Clear description]
+
+**Price**: [Pricing details]
+
+**Best for**: [Ideal user or use case]
+
+**Rating**: ★★★★☆ (4/5)
+
+## Pros and Cons at a Glance
+
+### Pros
+- [Major advantage 1]
+- [Major advantage 2]
+- [Major advantage 3]
+
+### Cons
+- [Main limitation 1]
+- [Main limitation 2]
+
+## What Is [Product Name]?
+
+[Detailed product description]
+
+[Who makes it and company background]
+
+[Product positioning and target market]
+
+## Key Features
+
+### [Feature 1]
+
+[Detailed explanation of feature]
+
+[How it works in practice]
+
+[Our experience with this feature]
+
+### [Feature 2]
+
+[Continue pattern for main features]
+
+## Performance Testing
+
+### [Test Category 1]
+
+**What we tested**: [Testing methodology]
+
+**Results**: [Specific results with data]
+
+**Verdict**: [Assessment]
+
+### [Test Category 2]
+
+[Continue testing pattern]
+
+## Pricing and Plans
+
+| Plan | Price | What's Included |
+|------|-------|-----------------|
+| [Tier 1] | $[X]/month | [Features] |
+| [Tier 2] | $[X]/month | [Features] |
+| [Tier 3] | $[X]/month | [Features] |
+
+**Value assessment**: [Whether pricing is justified]
+
+## [Product] vs Competitors
+
+| Feature | [Product] | [Competitor 1] | [Competitor 2] |
+|---------|-----------|----------------|----------------|
+| [Feature 1] | [Details] | [Details] | [Details] |
+| [Feature 2] | [Details] | [Details] | [Details] |
+| [Feature 3] | [Details] | [Details] | [Details] |
+
+## Who Should Buy [Product]?
+
+### Perfect for:
+- [User type 1] because [reason]
+- [User type 2] because [reason]
+
+### Not ideal for:
+- [User type] because [reason]
+- [User type] because [reason]
+
+## Frequently Asked Questions
+
+### Is [Product] worth the price?
+
+[Honest assessment with context]
+
+### How does [Product] compare to [main competitor]?
+
+[Balanced comparison]
+
+### What's the learning curve like?
+
+[Assessment with timeframe]
+
+## Final Verdict: Should You Buy [Product]?
+
+[Balanced conclusion]
+
+[Specific recommendation based on use case]
+
+**Our rating**: ★★★★☆ (4/5)
+
+[Link to product with disclosure if affiliate]
+```
+
+**Disclosure**: Include affiliate disclosure if applicable
+**Internal links**: Related product reviews, comparison articles, category pages
+**External links**: Product website, official documentation
+
+---
+
+## Pillar Page (Comprehensive Guide)
+
+**Target word count**: 3,000-5,000+ words
+**Primary goal**: Become definitive resource, rank for head terms, support topic cluster
+
+```markdown
+# [Topic]: The Complete Guide [for Audience] ([Year])
+
+[Compelling hook - establish the importance of this topic]
+
+[Problem - what challenges exist in this space]
+
+[Promise - what this comprehensive guide delivers]
+
+This is the most comprehensive guide to [topic] on the web. You'll learn everything from fundamentals to advanced strategies.
+
+## Table of Contents
+
+- [Chapter 1]
+- [Chapter 2]
+- [Chapter 3]
+- [Chapter 4]
+- [Chapter 5]
+
+## Chapter 1: [Topic] Fundamentals
+
+### What Is [Topic]?
+
+[Comprehensive definition - GEO-optimized]
+
+[History and evolution]
+
+[Current state and importance]
+
+### Why [Topic] Matters
+
+[Multiple benefits with evidence]
+
+[Industry statistics and trends]
+
+### Key Concepts and Terminology
+
+| Term | Definition |
+|------|------------|
+| [Term 1] | [Definition] |
+| [Term 2] | [Definition] |
+| [Term 3] | [Definition] |
+
+## Chapter 2: [Major Sub-Topic]
+
+[Comprehensive section on major aspect]
+
+[Multiple sub-sections with H3 headings]
+
+[Examples, case studies, data]
+
+[Internal links to detailed cluster content]
+
+> **Deep dive**: For more on [sub-topic], see our complete guide: [link]
+
+## Chapter 3: [Major Sub-Topic]
+
+[Continue pattern with substantial content sections]
+
+## Chapter 4: [Advanced/Tactical Section]
+
+[Strategic or tactical content]
+
+[Actionable frameworks and processes]
+
+## Chapter 5: [Implementation/Case Studies]
+
+[Real-world applications]
+
+[Success stories]
+
+[Implementation roadmap]
+
+## Frequently Asked Questions
+
+[Comprehensive FAQ section with 8-10 questions]
+
+## Key Takeaways
+
+[Summarize main points from each chapter]
+
+## What's Next?
+
+[Recommend next steps]
+
+[Links to related cluster content]
+
+## Resources
+
+**Recommended tools**:
+- [Tool 1] - [Use case]
+- [Tool 2] - [Use case]
+
+**Further reading**:
+- [Internal link to cluster content]
+- [Internal link to cluster content]
+- [External resource]
+```
+
+**Internal links**: 8-15 links to related cluster content
+**External links**: 5-10 authoritative sources
+**Update frequency**: Quarterly for pillar pages
+
+---
+
+## FAQ Page
+
+**Target word count**: 1,000-1,500 words
+**Primary goal**: Answer common questions, rank for question queries, enable FAQ rich results
+
+```markdown
+# [Topic]: Frequently Asked Questions
+
+[Introduction explaining what questions this page answers]
+
+[Who this FAQ is for]
+
+## General Questions
+
+### What is [topic]?
+
+[Comprehensive answer with definition]
+
+### How does [topic] work?
+
+[Clear explanation with steps if applicable]
+
+### Why is [topic] important?
+
+[Benefits and context]
+
+## Getting Started
+
+### How do I get started with [topic]?
+
+[Step-by-step guidance]
+
+### What do I need to [accomplish goal]?
+
+[Requirements and prerequisites]
+
+### How long does it take to [achieve result]?
+
+[Realistic timeframe]
+
+## Common Problems
+
+### Why isn't [expected result] happening?
+
+[Troubleshooting guidance]
+
+### What should I do if [problem occurs]?
+
+[Solution with steps]
+
+### How do I fix [specific error]?
+
+[Fix instructions]
+
+## Comparison Questions
+
+### [Option A] vs [Option B]: which is better?
+
+[Balanced comparison answer]
+
+### Should I [action 1] or [action 2]?
+
+[Guidance based on scenarios]
+
+## Pricing and Plans
+
+### How much does [topic] cost?
+
+[Pricing information with context]
+
+### Is there a free version?
+
+[Information about free options]
+
+## Advanced Questions
+
+### Can I [advanced action]?
+
+[Answer with technical details]
+
+### How do I [complex task]?
+
+[Guidance with link to detailed guide]
+
+## Still Have Questions?
+
+[Call-to-action for additional support]
+
+[Contact information or link to contact page]
+```
+
+**Schema markup**: Add FAQPage schema to enable rich results
+**Internal links**: Link to detailed guides for complex answers
+**Keywords**: Include question keywords naturally
+
+---
+
+## Implementation Checklist
+
+For any template:
+
+- [ ] Customize all [bracketed placeholders]
+- [ ] Include primary keyword in H1 and first 100 words
+- [ ] Place secondary keywords in H2/H3 headings
+- [ ] Add 3-5 internal links to related content
+- [ ] Include 2-3 external links to authoritative sources
+- [ ] Create GEO-optimized definition blocks
+- [ ] Add FAQ section for featured snippet opportunities
+- [ ] Include relevant images with descriptive alt text
+- [ ] Write compelling meta description (150-160 chars)
+- [ ] Optimize title tag (50-60 chars with primary keyword)
diff --git a/.cursor/skills/seo-content-writer/references/seo-writing-checklist.md b/.cursor/skills/seo-content-writer/references/seo-writing-checklist.md
new file mode 100644
index 0000000000..225e9fe350
--- /dev/null
+++ b/.cursor/skills/seo-content-writer/references/seo-writing-checklist.md
@@ -0,0 +1,250 @@
+# SEO Writing Checklist and Content Template
+
+On-page SEO checklist, content writing template, and featured snippet optimization patterns.
+
+---
+
+## On-Page SEO Checklist
+
+**Keyword Placement**:
+- [ ] Primary keyword in title
+- [ ] Primary keyword in H1
+- [ ] Primary keyword in first 100 words
+- [ ] Primary keyword in at least one H2
+- [ ] Primary keyword in conclusion
+- [ ] Primary keyword in meta description
+- [ ] Secondary keywords in H2s/H3s
+- [ ] Related terms throughout body
+
+**Content Quality**:
+- [ ] Comprehensive coverage of topic
+- [ ] Original insights or data
+- [ ] Actionable takeaways
+- [ ] Examples and illustrations
+- [ ] Expert quotes or citations (for E-E-A-T)
+
+**Readability**:
+- [ ] Paragraphs of 3-5 sentences (per CORE-EEAT O06 Section Chunking standard)
+- [ ] Varied sentence length
+- [ ] Bullet points and lists
+- [ ] Bold key phrases
+- [ ] Table of contents for long content
+
+**Technical**:
+- [ ] Internal links to relevant pages (2-5)
+- [ ] External links to authoritative sources (2-3)
+- [ ] Image alt text with keywords
+- [ ] URL slug includes keyword
+
+---
+
+## Content Writing Template
+
+```markdown
+# [H1 with Primary Keyword]
+
+[Hook sentence that grabs attention]
+
+[Problem statement or context - why this matters]
+
+[Promise - what the reader will learn/gain] [Include primary keyword naturally]
+
+[Brief overview of what's covered - can be bullet points for scanability]
+
+## [H2 - First Main Section with Secondary Keyword]
+
+[Introduction to section - 1-2 sentences]
+
+[Main content with valuable information]
+
+[Examples, data, or evidence to support points]
+
+[Transition to next section]
+
+### [H3 - Sub-section if needed]
+
+[Detailed content]
+
+[Key points in bullet format]:
+- Point 1
+- Point 2
+- Point 3
+
+## [H2 - Second Main Section]
+
+[Continue with valuable content...]
+
+> **Pro Tip**: [Highlighted tip or key insight]
+
+| Column 1 | Column 2 | Column 3 |
+|----------|----------|----------|
+| Data | Data | Data |
+
+## [H2 - Additional Sections as Needed]
+
+[Content...]
+
+## Frequently Asked Questions
+
+### [Question from PAA or common query]?
+
+[Direct, concise answer in 40-60 words for featured snippet opportunity]
+
+### [Question 2]?
+
+[Answer]
+
+### [Question 3]?
+
+[Answer]
+
+## Conclusion
+
+[Summary of key points - include primary keyword]
+
+[Final thought or insight]
+
+[Clear call-to-action: what should reader do next?]
+```
+
+---
+
+## Featured Snippet Optimization
+
+**For Definition Snippets**:
+"[Term] is [clear, concise definition in 40-60 words]"
+
+**For List Snippets**:
+Create clear, numbered or bulleted lists under H2s
+
+**For Table Snippets**:
+Use comparison tables with clear headers
+
+**For How-To Snippets**:
+Number each step clearly: "Step 1:", "Step 2:", etc.
+
+---
+
+## Example: Full SEO Article
+
+**User**: "Write an SEO-optimized article about 'email marketing best practices' targeting small businesses"
+
+**Output**:
+
+```markdown
+# Email Marketing Best Practices: A Complete Guide for Small Businesses
+
+**Meta Description**: Discover proven email marketing best practices that help small businesses increase open rates, boost engagement, and drive sales. Start improving your campaigns today. (158 chars)
+
+---
+
+Want to know the secret behind small businesses that consistently turn subscribers into loyal customers? It's not luck---it's strategic email marketing.
+
+Email marketing remains one of the most effective channels for small businesses, delivering an average ROI of $42 for every $1 spent. But with inboxes more crowded than ever, following email marketing best practices isn't optional---it's essential for survival.
+
+In this guide, you'll learn:
+- How to build a quality email list that converts
+- Proven strategies to increase open and click rates
+- Advanced personalization techniques that drive results
+- Common mistakes that kill email performance
+
+Let's dive into the strategies that will transform your email marketing.
+
+## Why Email Marketing Matters for Small Businesses
+
+Before we explore the best practices, let's understand why email deserves your attention.
+
+Unlike social media where algorithms control who sees your content, email gives you direct access to your audience. You own your email list---no platform can take it away.
+
+**Key email marketing statistics for small businesses**:
+- 81% of SMBs rely on email as their primary customer acquisition channel
+- Email subscribers are 3x more likely to share content on social media
+- Personalized emails generate 6x higher transaction rates
+
+## Building a High-Quality Email List
+
+### Use Strategic Opt-in Incentives
+
+The foundation of effective email marketing is a quality list. Here's how to grow yours:
+
+**Lead magnets that convert**:
+- Industry-specific templates
+- Exclusive discounts or early access
+- Free tools or calculators
+- Educational email courses
+
+> **Pro Tip**: The best lead magnets solve a specific, immediate problem for your target audience.
+
+### Implement Double Opt-in
+
+Double opt-in confirms subscriber intent and improves deliverability. Yes, you'll have fewer subscribers, but they'll be more engaged.
+
+| Single Opt-in | Double Opt-in |
+|---------------|---------------|
+| More subscribers | Fewer subscribers |
+| Higher bounce rates | Lower bounce rates |
+| Lower engagement | Higher engagement |
+| Spam risk | Better deliverability |
+
+## Crafting Emails That Get Opened
+
+### Write Compelling Subject Lines
+
+Your subject line determines whether emails get opened or ignored.
+
+**Subject line best practices**:
+1. Keep it under 50 characters
+2. Create urgency without being spammy
+3. Use numbers and specifics
+4. Personalize when possible
+5. A/B test consistently
+
+### Optimize Preview Text
+
+The preview text (preheader) is your second chance to convince someone to open. Use it to expand on your subject line, not repeat it.
+
+## Personalization Strategies That Work
+
+Generic emails perform poorly. Here's how to personalize effectively:
+
+### Segment Your Email List
+
+Divide your list based on:
+- Purchase history
+- Engagement level
+- Demographics
+- Interests or preferences
+- Customer lifecycle stage
+
+### Use Dynamic Content
+
+Send the same email but show different content blocks based on subscriber data.
+
+## Frequently Asked Questions
+
+### How often should small businesses send marketing emails?
+
+For most small businesses, sending 1-2 emails per week strikes the right balance. This keeps you top-of-mind without overwhelming subscribers. Monitor your unsubscribe rates---if they spike after increasing frequency, scale back.
+
+### What is a good email open rate for small businesses?
+
+A good email open rate for small businesses is 20-25%. However, this varies by industry. E-commerce averages 15-20%, while B2B services often see 25-30%. Focus on improving your own benchmarks rather than industry averages.
+
+### Should I buy email lists to grow faster?
+
+Never buy email lists. Purchased lists violate GDPR and CAN-SPAM regulations, damage your sender reputation, and result in poor engagement. Organic list growth, while slower, produces much better results.
+
+## Conclusion
+
+Implementing these email marketing best practices will help your small business build stronger customer relationships and drive consistent revenue. Remember: quality always beats quantity in email marketing.
+
+Start with one improvement today. Whether it's segmenting your list, testing subject lines, or cleaning inactive subscribers, small changes compound into significant results.
+
+**Ready to level up your email marketing?** Download our free Email Marketing Checklist to ensure every campaign you send follows these best practices.
+
+---
+
+*Further reading: [Internal link: "How to Write Email Copy That Converts"]*
+
+*Sources: [Litmus Email Marketing Statistics](https://litmus.com), [Campaign Monitor Email Benchmarks](https://campaignmonitor.com)*
+```
diff --git a/.cursor/skills/seo-content-writer/references/title-formulas.md b/.cursor/skills/seo-content-writer/references/title-formulas.md
new file mode 100644
index 0000000000..73017696a1
--- /dev/null
+++ b/.cursor/skills/seo-content-writer/references/title-formulas.md
@@ -0,0 +1,339 @@
+# Title and Headline Formulas
+
+Proven title structures that drive clicks and rankings. All formulas target 50-60 characters for optimal SERP display.
+
+## Numbered List Formulas
+
+**Pattern**: [Number] [Adjective] [Topic] [Qualifier/Benefit]
+
+### Examples
+- "7 Ways to Improve Your SEO Rankings in 2024"
+- "15 Content Marketing Strategies That Actually Work"
+- "10 Free Tools for Keyword Research (Beginner-Friendly)"
+- "21 Email Marketing Tips from Industry Experts"
+- "5 Quick Wins for Better Site Performance"
+
+**Why it works**: Numbers promise specific value, create curiosity, set clear expectations.
+
+**Best practices**:
+- Odd numbers (7, 9, 15) often outperform even numbers
+- Use specific numbers (13 vs "over 10")
+- Keep total under 30 for credibility
+- Match number to actual content sections
+
+---
+
+## How-To Formulas
+
+**Pattern 1**: How to [Achieve Goal] in [Timeframe]
+- "How to Rank on Google in 30 Days"
+- "How to Write SEO Content in Under 2 Hours"
+- "How to Build Backlinks in 2024"
+
+**Pattern 2**: How to [Achieve Goal] Without [Common Objection]
+- "How to Learn SEO Without Spending Money"
+- "How to Increase Traffic Without Paid Ads"
+- "How to Write Blog Posts Without Writer's Block"
+
+**Pattern 3**: How to [Achieve Goal] (Even If [Limitation])
+- "How to Rank #1 (Even If You're a New Website)"
+- "How to Get Backlinks (Even With Zero Audience)"
+- "How to Write SEO Content (Even If You Hate Writing)"
+
+**Pattern 4**: How to [Achieve Goal] Like [Authority/Expert]
+- "How to Write Email Copy Like a Professional Copywriter"
+- "How to Do Keyword Research Like an SEO Pro"
+- "How to Create Content Like Top Marketing Agencies"
+
+---
+
+## Question Formulas
+
+**Pattern 1**: What Is [Topic]? [Benefit/Hook]
+- "What Is Technical SEO? A Complete Guide"
+- "What Is GEO? Everything You Need to Know"
+- "What Is Content Marketing? (And Why It Matters)"
+
+**Pattern 2**: Why [Action] [Result]
+- "Why Most Content Marketing Strategies Fail"
+- "Why Your Website Isn't Ranking"
+- "Why Email Still Beats Social Media for ROI"
+
+**Pattern 3**: Should You [Action]? [Quick Answer]
+- "Should You Buy Backlinks? (Spoiler: No)"
+- "Should You Hire an SEO Agency? A Cost-Benefit Analysis"
+- "Should You Rewrite Old Content? When It Makes Sense"
+
+**Pattern 4**: When Should You [Action]
+- "When Should You Publish Blog Posts for Maximum Traffic"
+- "When Should You Update Your SEO Strategy"
+- "When Should You Remove Content from Your Website"
+
+---
+
+## Comparison Formulas
+
+**Pattern 1**: [Option A] vs [Option B]: Which Is Better?
+- "WordPress vs Webflow: Which Is Better for SEO?"
+- "Ahrefs vs SEMrush: Complete Comparison 2024"
+- "In-House SEO vs Agency: Which Should You Choose?"
+
+**Pattern 2**: [Option A] vs [Option B] (and the Winner Is...)
+- "Free Tools vs Paid SEO Software (and the Winner Is...)"
+- "Long-Form vs Short-Form Content (and the Winner Is...)"
+
+**Pattern 3**: [Option A] or [Option B] for [Audience/Goal]
+- "Shopify or WooCommerce for Small Business SEO"
+- "YouTube or TikTok for B2B Marketing"
+
+---
+
+## Ultimate Guide Formulas
+
+**Pattern 1**: The [Complete/Definitive] Guide to [Topic]
+- "The Complete Guide to Technical SEO"
+- "The Definitive Guide to Link Building in 2024"
+
+**Pattern 2**: [Topic]: The Ultimate Guide for [Audience]
+- "Content Marketing: The Ultimate Guide for Beginners"
+- "Local SEO: The Ultimate Guide for Small Businesses"
+
+**Pattern 3**: Everything You Need to Know About [Topic]
+- "Everything You Need to Know About Core Web Vitals"
+- "Everything You Need to Know About Schema Markup"
+
+---
+
+## Power Word Formulas
+
+**Pattern**: [Power Word] [Topic] [Qualifier]
+
+### Power Word Categories
+
+**Authority words**:
+- Proven, Tested, Verified, Certified, Official
+- Example: "Proven Strategies to Double Your Organic Traffic"
+
+**Urgency words**:
+- Now, Today, Fast, Quick, Instantly, Immediately
+- Example: "Quick Wins: Improve SEO Performance Today"
+
+**Value words**:
+- Free, Complete, Essential, Ultimate, Comprehensive
+- Example: "The Essential SEO Checklist for 2024"
+
+**Exclusivity words**:
+- Secret, Hidden, Little-Known, Insider, Exclusive
+- Example: "7 Little-Known SEO Tactics That Actually Work"
+
+**Emotional words**:
+- Amazing, Powerful, Effortless, Simple, Easy
+- Example: "Simple SEO Strategies for Busy Entrepreneurs"
+
+---
+
+## Before/After Formulas
+
+**Pattern 1**: From [Bad State] to [Good State]
+- "From Zero to 10K Monthly Visitors: A Case Study"
+- "From Page 5 to Position 1: How We Ranked a Competitive Keyword"
+
+**Pattern 2**: How I/We [Achieved Result] in [Timeframe]
+- "How I Increased Organic Traffic by 300% in 6 Months"
+- "How We Built 100 Backlinks in 30 Days"
+
+**Pattern 3**: [Bad State] → [Good State]: [Method]
+- "500 Visitors → 50K: Our Content Strategy Revealed"
+- "Struggling → Thriving: SEO Transformation Story"
+
+---
+
+## Mistake/Problem Formulas
+
+**Pattern 1**: [Number] [Topic] Mistakes [Qualifier]
+- "5 SEO Mistakes Killing Your Rankings"
+- "10 Content Marketing Mistakes to Avoid in 2024"
+- "3 Technical SEO Mistakes Even Pros Make"
+
+**Pattern 2**: Stop [Wrong Action]. Do [Right Action] Instead
+- "Stop Keyword Stuffing. Do This Instead"
+- "Stop Buying Backlinks. Try These 7 Tactics Instead"
+
+**Pattern 3**: Why Your [Topic] Isn't Working (And How to Fix It)
+- "Why Your Content Isn't Ranking (And How to Fix It)"
+- "Why Your SEO Strategy Isn't Working (And How to Fix It)"
+
+---
+
+## Checklist Formulas
+
+**Pattern 1**: [Topic] Checklist: [Number] Must-Have [Items]
+- "Technical SEO Checklist: 23 Must-Check Items"
+- "Blog Post Checklist: 15 Things to Check Before Publishing"
+
+**Pattern 2**: The [Timeframe] [Topic] Checklist
+- "The 2024 SEO Audit Checklist"
+- "The Pre-Launch Website Checklist"
+
+---
+
+## Curiosity Gap Formulas
+
+**Pattern 1**: The [Adjective] Reason [Unexpected Fact]
+- "The Surprising Reason Long Content Ranks Better"
+- "The Real Reason Your Blog Posts Aren't Getting Traffic"
+
+**Pattern 2**: [Number] Things About [Topic] That [Surprising Fact]
+- "7 Things About SEO That Google Doesn't Tell You"
+- "5 Things About Content Marketing That Changed in 2024"
+
+**Pattern 3**: What [Expert/Group] Know About [Topic] (That You Don't)
+- "What Top SEO Agencies Know About Link Building (That You Don't)"
+- "What Professional Copywriters Know About Headlines"
+
+---
+
+## Benefit-Driven Formulas
+
+**Pattern 1**: [Action] to [Specific Benefit]
+- "Optimize Your Title Tags to Double Your CTR"
+- "Write Better Meta Descriptions to Increase Organic Traffic"
+
+**Pattern 2**: [Action] for [Outcome] Without [Objection]
+- "Scale Content Production for Better Rankings Without Hiring Writers"
+- "Improve Site Speed for Higher Rankings Without Technical Skills"
+
+---
+
+## Authority/Credibility Formulas
+
+**Pattern 1**: [Expert/Authority] Says [Action/Tip]
+- "Google's John Mueller Says Internal Linking Is Underrated"
+- "SEO Experts Reveal Their Top Ranking Factor for 2024"
+
+**Pattern 2**: [Number] [Experts/Companies] Share [Topic]
+- "50 SEO Experts Share Their #1 Ranking Tip"
+- "10 Successful Brands Share Their Content Strategy"
+
+---
+
+## CTR Optimization Patterns
+
+### Elements That Increase CTR
+
+**Year/Timeframe** (signals freshness):
+- "SEO Trends for 2024"
+- "Best Practices [Updated January 2024]"
+
+**Brackets/Parentheticals** (adds extra context):
+- "Email Marketing Tips (That Actually Work)"
+- "SEO Tools [Free & Paid Options]"
+
+**Numbers** (promise specific value):
+- "7 Ways...", "15 Tools...", "3 Reasons..."
+
+**Power words** (emotional trigger):
+- Ultimate, Essential, Complete, Proven, Secret
+
+**Target audience** (relevance signal):
+- "...for Beginners", "...for Small Businesses", "...for B2B"
+
+### Elements That Decrease CTR
+
+- All caps (looks spammy)
+- Excessive punctuation (!!!, ???)
+- Clickbait without substance
+- Misleading promises
+- Generic titles ("SEO Tips", "Marketing Guide")
+
+---
+
+## Title Length Guidelines
+
+### Google SERP
+- **Optimal**: 50-60 characters
+- **Maximum**: ~70 characters before truncation
+- **Mobile**: ~55 characters (truncates earlier)
+
+### Social Media
+- **Twitter/X**: 70-100 characters for best engagement
+- **Facebook**: 40-50 characters (shorter performs better)
+- **LinkedIn**: 70-100 characters
+- **Pinterest**: 40-60 characters
+
+### Email Subject Lines
+- **Optimal**: 40-50 characters
+- **Mobile**: 30-40 characters (truncates earlier)
+- **Desktop**: Up to 60 characters
+
+---
+
+## Before/After Title Examples
+
+### Generic → Optimized
+
+**Before**: "SEO Tips"
+**After**: "7 SEO Tips That Increased Our Traffic by 300%"
+- Added: Number, specificity, proof
+
+**Before**: "How to Do Keyword Research"
+**After**: "How to Do Keyword Research (Free Tools & 5-Step Process)"
+- Added: Benefit, specificity, process indicator
+
+**Before**: "Content Marketing Guide"
+**After**: "The Complete Content Marketing Guide for Small Businesses [2024]"
+- Added: Completeness indicator, target audience, year
+
+**Before**: "What Is Technical SEO"
+**After**: "What Is Technical SEO? A Beginner's Guide to Better Rankings"
+- Added: Audience level, benefit
+
+**Before**: "WordPress vs Shopify"
+**After**: "WordPress vs Shopify: Which Is Better for SEO? (2024 Comparison)"
+- Added: Specific comparison angle, year
+
+**Before**: "Link Building Strategies"
+**After**: "15 White-Hat Link Building Strategies That Still Work in 2024"
+- Added: Number, quality signal, time relevance
+
+---
+
+## Title Writing Checklist
+
+Before finalizing any title, check:
+
+- [ ] Includes primary keyword naturally
+- [ ] Length is 50-60 characters
+- [ ] Matches search intent (informational/transactional/navigational)
+- [ ] Contains power word or emotional trigger
+- [ ] Specific and concrete (not vague)
+- [ ] Promises clear value or benefit
+- [ ] Accurate representation of content
+- [ ] No clickbait or misleading claims
+- [ ] Unique compared to competitors
+- [ ] Works when truncated on mobile
+
+---
+
+## Testing and Optimization
+
+### A/B Test These Variables
+- Number variations (7 vs 10 vs 15)
+- Power word placement (beginning vs end)
+- Question vs statement format
+- With/without year
+- With/without brackets
+- Different benefit framing
+
+### Track These Metrics
+- Click-through rate (CTR) in Search Console
+- Average position in SERPs
+- Time on page (engagement indicator)
+- Bounce rate (relevance indicator)
+
+### When to Rewrite Titles
+- CTR below 2% for top 3 positions
+- CTR below 5% for positions 4-10
+- Content is updated (add [Updated 2024])
+- Competitors have better titles
+- Ranking but not getting clicks
diff --git a/.cursor/skills/serp-analysis/SKILL.md b/.cursor/skills/serp-analysis/SKILL.md
new file mode 100644
index 0000000000..d9b35be636
--- /dev/null
+++ b/.cursor/skills/serp-analysis/SKILL.md
@@ -0,0 +1,284 @@
+---
+name: serp-analysis
+description: 'Analyze SERPs: ranking factors, features, intent patterns, AI overviews, featured snippets. SERP分析/搜索结果'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+allowed-tools: WebFetch
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when analyzing search engine results pages, SERP features, featured snippets, People Also Ask, or understanding ranking patterns for a query."
+argument-hint: "<keyword or query>"
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "high"
+  tags:
+    - seo
+    - geo
+    - serp-analysis
+    - serp-features
+    - featured-snippet
+    - ai-overview
+    - people-also-ask
+    - search-intent
+    - SERP分析
+    - 検索結果分析
+    - 검색결과
+    - analisis-serp
+  triggers:
+    # EN-formal
+    - "analyze search results"
+    - "SERP analysis"
+    - "what ranks for"
+    - "SERP features"
+    - "why does this page rank"
+    - "featured snippets"
+    - "AI overviews"
+    # EN-casual
+    - "what's on page one for this query"
+    - "who ranks for this keyword"
+    - "what does Google show for"
+    - "what shows up for this search"
+    - "who is on page one"
+    # EN-question
+    - "why does this page rank first"
+    - "what SERP features appear for"
+    # ZH-pro
+    - "SERP分析"
+    - "搜索结果分析"
+    - "精选摘要"
+    - "AI概览"
+    # ZH-casual
+    - "谁排第一"
+    - "搜索结果长什么样"
+    - "谁排在前面"
+    # JA
+    - "検索結果ページ分析"
+    - "検索結果分析"
+    - "強調スニペット"
+    # KO
+    - "검색 결과 분석"
+    - "SERP 분석"
+    # ES
+    - "análisis SERP"
+    - "análisis de resultados de búsqueda"
+    # PT
+    - "análise de SERP"
+    # Misspellings
+    - "serp anaylsis"
+---
+
+# SERP Analysis
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This research skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill analyzes Search Engine Results Pages to reveal what's working for ranking content, which SERP features appear, and what triggers AI-generated answers. Understand the battlefield before creating content.
+
+**System role**: Research layer skill. It turns market signals into reusable strategic inputs for the rest of the library.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs reusable market intelligence that should influence strategy, not just an ad hoc answer.
+
+- Before creating content for a target keyword
+- Understanding why certain pages rank #1
+- Identifying SERP feature opportunities (featured snippets, PAA)
+- Analyzing AI Overview/SGE patterns
+- Evaluating keyword difficulty more accurately
+- Planning content format based on what ranks
+- Identifying ranking factors for specific queries
+
+## What This Skill Does
+
+1. **SERP Composition Analysis**: Maps what appears on the results page
+2. **Ranking Factor Identification**: Reveals why top results rank
+3. **SERP Feature Mapping**: Identifies featured snippets, PAA, knowledge panels
+4. **AI Overview Analysis**: Examines when and how AI answers appear
+5. **Intent Signal Detection**: Confirms user intent from SERP composition
+6. **Content Format Recommendations**: Suggests optimal format based on SERP
+7. **Difficulty Assessment**: Evaluates realistic ranking potential
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Basic SERP Analysis
+
+```
+Analyze the SERP for [keyword]
+```
+
+```
+What does it take to rank for [keyword]?
+```
+
+### Feature-Specific Analysis
+
+```
+Analyze featured snippet opportunities for [keyword list]
+```
+
+```
+Which of these keywords trigger AI Overviews? [keyword list]
+```
+
+### Competitive SERP Analysis
+
+```
+Why does [URL] rank #1 for [keyword]?
+```
+
+## Skill Contract
+
+**Expected output**: a prioritized research brief, evidence-backed findings, and a short handoff summary ready for `memory/research/`.
+
+- **Reads**: user goals, target market inputs, available tool data, and prior strategy from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing research deliverable plus a reusable summary that can be stored under `memory/research/`.
+- **Promotes**: durable keyword priorities, competitor facts, entity candidates, and strategic decisions to `CLAUDE.md`, `memory/decisions.md`, and `memory/research/`; hand canonical entity work to `entity-optimizer`.
+- **Next handoff**: use the `Next Best Skill` below when the findings are ready to drive action.
+
+## Data Sources
+
+> **Note:** All integrations are optional. This skill works without any API keys — users provide data manually when no tools are connected.
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~SEO tool + ~~search console + ~~AI monitor connected:**
+Automatically fetch SERP snapshots for target keywords, extract ranking page metrics (domain authority, backlinks, content length), pull SERP feature data, and check AI Overview presence using ~~AI monitor. Historical SERP change data and mobile vs. desktop variations can be retrieved automatically.
+
+**With manual data only:**
+Ask the user to provide:
+1. Target keyword(s) to analyze
+2. SERP screenshots or detailed descriptions of search results
+3. URLs of top 10 ranking pages
+4. Search location and device type (mobile/desktop)
+5. Any observations about SERP features (featured snippets, PAA, AI Overviews)
+
+Proceed with the full analysis using provided data. Note in the output which metrics are from automated collection vs. user-provided data.
+
+## Instructions
+
+When a user requests SERP analysis:
+
+1. **Understand the Query**
+
+   Clarify if needed:
+   - Target keyword(s) to analyze
+   - Search location/language
+   - Device type (mobile/desktop)
+   - Specific questions about the SERP
+
+2. **Map SERP Composition**
+
+   Document all elements appearing on the results page: AI Overview, ads, featured snippet, organic results, PAA, knowledge panel, image pack, video results, local pack, shopping results, news results, sitelinks, and related searches.
+
+3. **Analyze Top Ranking Pages**
+
+   For each of the top 10 results, document: URL, domain, domain authority, content type, word count, publish/update dates, on-page factors (title, meta description, H1, URL structure), content structure (headings, media, tables, FAQ), estimated metrics (backlinks, referring domains), and why it ranks.
+
+4. **Identify Ranking Patterns**
+
+   Analyze common characteristics across top 5 results: word count, domain authority, backlinks, content freshness, HTTPS, mobile optimization. Document content format distribution, domain type distribution, and key success factors.
+
+5. **Analyze SERP Features**
+
+   For each present SERP feature: analyze the current holder, content format, and strategy to win. Cover Featured Snippet (type, content, winning strategy), PAA (questions, current answers, optimization approach), and AI Overview (sources cited, content patterns, citation strategy).
+
+6. **Determine Search Intent**
+
+   Confirm primary intent from SERP composition. Document evidence, intent breakdown percentages, and content format implications (format, tone, CTA).
+
+7. **Calculate True Difficulty**
+
+   Score overall difficulty (1-100) based on: top 10 domain authority, page authority, backlinks required, content quality bar, and SERP stability. Provide realistic assessments for new, growing, and established sites, plus easier alternatives.
+
+8. **Generate Recommendations**
+
+   Produce a summary with: Key Findings, Content Requirements to Rank (minimum requirements + differentiators), SERP Feature Strategy, Recommended Content Outline, and Next Steps.
+
+   > **Reference**: See [references/analysis-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/serp-analysis/references/analysis-templates.md) for detailed templates for each step.
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Target keyword(s) clearly specified
+- [ ] Search location and device type confirmed
+- [ ] SERP data is current (date confirmed)
+- [ ] Top 10 ranking URLs identified or provided
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] SERP composition mapped with all features documented
+- [ ] Ranking factors identified from actual top 10 analysis (not assumptions)
+- [ ] Content requirements based on observed patterns in current SERP
+- [ ] Source of each data point clearly stated (~~SEO tool data, ~~AI monitor data, user-provided, or manual observation)
+
+## Example
+
+> **Reference**: See [references/example-report.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/serp-analysis/references/example-report.md) for a complete example analyzing the SERP for "how to start a podcast".
+
+## Advanced Analysis
+
+### Multi-Keyword SERP Comparison
+
+```
+Compare SERPs for [keyword 1], [keyword 2], [keyword 3]
+```
+
+### Historical SERP Changes
+
+```
+How has the SERP for [keyword] changed over time?
+```
+
+### Local SERP Variations
+
+```
+Compare SERP for [keyword] in [location 1] vs [location 2]
+```
+
+### Mobile vs Desktop SERP
+
+```
+Analyze mobile vs desktop SERP differences for [keyword]
+```
+
+## Tips for Success
+
+1. **Always check SERP before writing** - Don't assume, verify
+2. **Match content format to SERP** - If lists rank, write lists
+3. **Identify SERP feature opportunities** - Lower competition than #1
+4. **Note SERP volatility** - Stable SERPs are harder to break into
+5. **Study the outliers** - Why does a weaker site rank? Opportunity!
+6. **Consider AI Overview optimization** - Growing importance
+
+
+
+### Save Results
+
+After delivering findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/research/serp-analysis/YYYY-MM-DD-<topic>.md` containing:
+- One-line headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any findings should influence ongoing strategy, recommend promoting key conclusions to `memory/hot-cache.md`.
+
+## Reference Materials
+
+- [Analysis Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/serp-analysis/references/analysis-templates.md) — Detailed templates for each analysis step (SERP composition, top results, ranking patterns, features, intent, difficulty, recommendations)
+- [SERP Feature Taxonomy](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/serp-analysis/references/serp-feature-taxonomy.md) — Complete taxonomy of SERP features with trigger conditions, AI overview framework, intent signals, and volatility assessment
+- [Example Report](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/research/serp-analysis/references/example-report.md) — Complete example analyzing the SERP for "how to start a podcast"
+
+## Next Best Skill
+
+- **Primary**: [seo-content-writer](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/build/seo-content-writer/SKILL.md) — turn SERP patterns into a content brief or page structure.
diff --git a/.cursor/skills/serp-analysis/references/analysis-templates.md b/.cursor/skills/serp-analysis/references/analysis-templates.md
new file mode 100644
index 0000000000..8f1a030b29
--- /dev/null
+++ b/.cursor/skills/serp-analysis/references/analysis-templates.md
@@ -0,0 +1,297 @@
+# SERP Analysis — Analysis Templates
+
+Templates for each step of the SERP analysis workflow. Use these to structure your output.
+
+## SERP Composition Template
+
+```markdown
+## SERP Analysis: "[keyword]"
+
+**Search Details**
+- Keyword: [keyword]
+- Location: [location]
+- Device: [mobile/desktop]
+- Date: [date]
+
+### SERP Layout Overview
+
+```
+┌─────────────────────────────────────────┐
+│ [AI Overview / SGE] (if present)        │
+├─────────────────────────────────────────┤
+│ [Ads] - [X] ads above fold              │
+├─────────────────────────────────────────┤
+│ [Featured Snippet] (if present)         │
+├─────────────────────────────────────────┤
+│ [Organic Result #1]                     │
+│ [Organic Result #2]                     │
+│ [People Also Ask] (if present)          │
+│ [Organic Result #3]                     │
+│ ...                                     │
+├─────────────────────────────────────────┤
+│ [Related Searches]                      │
+└─────────────────────────────────────────┘
+```
+
+### SERP Features Present
+
+| Feature | Present | Position | Opportunity |
+|---------|---------|----------|-------------|
+| AI Overview | Yes/No | Top | [analysis] |
+| Featured Snippet | Yes/No | [pos] | [analysis] |
+| People Also Ask | Yes/No | [pos] | [analysis] |
+| Knowledge Panel | Yes/No | Right | [analysis] |
+| Image Pack | Yes/No | [pos] | [analysis] |
+| Video Results | Yes/No | [pos] | [analysis] |
+| Local Pack | Yes/No | [pos] | [analysis] |
+| Shopping Results | Yes/No | [pos] | [analysis] |
+| News Results | Yes/No | [pos] | [analysis] |
+| Sitelinks | Yes/No | [pos] | [analysis] |
+```
+
+## Top Results Analysis Template
+
+```markdown
+### Top 10 Organic Results Analysis
+
+#### Position #1: [Title]
+
+**URL**: [url]
+**Domain**: [domain]
+**Domain Authority**: [DA]
+
+**Content Analysis**:
+- Type: [Blog/Product/Guide/etc.]
+- Word Count: [X] words
+- Publish Date: [date]
+- Last Updated: [date]
+
+**On-Page Factors**:
+- Title: [exact title]
+- Title contains keyword: Yes/No
+- Meta description: [description]
+- H1: [heading]
+- URL structure: [clean/keyword-rich/etc.]
+
+**Content Structure**:
+- Headings (H2s): [list key sections]
+- Media: [X] images, [X] videos
+- Tables/Lists: Yes/No
+- FAQ section: Yes/No
+
+**Estimated Metrics**:
+- Page backlinks: [X]
+- Referring domains: [X]
+- Social shares: [X]
+
+**Why It Ranks #1**:
+1. [Factor 1]
+2. [Factor 2]
+3. [Factor 3]
+
+[Repeat for positions #2-10]
+```
+
+## Ranking Patterns Template
+
+```markdown
+### Ranking Patterns Analysis
+
+**Common Characteristics of Top 5 Results**:
+
+| Factor | Avg/Common Value | Importance |
+|--------|-----------------|------------|
+| Word Count | [X] words | High/Med/Low |
+| Domain Authority | [X] | High/Med/Low |
+| Page Backlinks | [X] | High/Med/Low |
+| Content Freshness | [timeframe] | High/Med/Low |
+| HTTPS | [X]% | High/Med/Low |
+| Mobile Optimized | [X]% | High/Med/Low |
+
+**Content Format Distribution**:
+- How-to guides: [X]/10
+- Listicles: [X]/10
+- In-depth articles: [X]/10
+- Product pages: [X]/10
+- Other: [X]/10
+
+**Domain Type Distribution**:
+- Brand/Company sites: [X]/10
+- Media/News sites: [X]/10
+- Niche blogs: [X]/10
+- Aggregators: [X]/10
+
+**Key Success Factors Identified**:
+
+1. **[Factor 1]**: [Explanation + evidence]
+2. **[Factor 2]**: [Explanation + evidence]
+3. **[Factor 3]**: [Explanation + evidence]
+```
+
+## SERP Features Analysis Template
+
+```markdown
+### Featured Snippet Analysis
+
+**Current Snippet Holder**: [URL]
+**Snippet Type**: [Paragraph/List/Table/Video]
+**Snippet Content**:
+> [Exact text/description of snippet]
+
+**How to Win This Snippet**:
+1. [Strategy based on current snippet]
+2. [Content format recommendation]
+3. [Structure recommendation]
+
+---
+
+### People Also Ask (PAA) Analysis
+
+**Questions Appearing**:
+1. [Question 1] → Currently answered by: [URL]
+2. [Question 2] → Currently answered by: [URL]
+3. [Question 3] → Currently answered by: [URL]
+4. [Question 4] → Currently answered by: [URL]
+
+**PAA Optimization Strategy**:
+- Include these questions as H2/H3 headings
+- Provide direct, concise answers (40-60 words)
+- Use FAQ schema markup
+
+---
+
+### AI Overview Analysis
+
+**AI Overview Present**: Yes/No
+**AI Overview Type**: [Summary/List/Comparison/etc.]
+
+**Sources Cited in AI Overview**:
+1. [Source 1] - [Why cited]
+2. [Source 2] - [Why cited]
+3. [Source 3] - [Why cited]
+
+**AI Overview Content Patterns**:
+- Pulls definitions from: [source type]
+- Lists information as: [format]
+- Cites statistics from: [source type]
+
+**How to Get Cited in AI Overview**:
+1. [Specific recommendation]
+2. [Specific recommendation]
+3. [Specific recommendation]
+```
+
+## Search Intent Template
+
+```markdown
+### Search Intent Analysis
+
+**Primary Intent**: [Informational/Commercial/Transactional/Navigational]
+
+**Evidence**:
+- SERP features suggest: [analysis]
+- Top results are: [content types]
+- User likely wants: [description]
+
+**Intent Breakdown**:
+- Informational signals: [X]%
+- Commercial signals: [X]%
+- Transactional signals: [X]%
+
+**Content Format Implication**:
+Based on intent, your content should:
+- Format: [recommendation]
+- Tone: [recommendation]
+- CTA: [recommendation]
+```
+
+## Difficulty Assessment Template
+
+```markdown
+### Difficulty Assessment
+
+**Overall Difficulty Score**: [X]/100
+
+**Difficulty Factors**:
+
+| Factor | Score | Weight | Impact |
+|--------|-------|--------|--------|
+| Top 10 Domain Authority | [avg] | 25% | [High/Med/Low] |
+| Top 10 Page Authority | [avg] | 20% | [High/Med/Low] |
+| Backlinks Required | [est.] | 20% | [High/Med/Low] |
+| Content Quality Bar | [rating] | 20% | [High/Med/Low] |
+| SERP Stability | [rating] | 15% | [High/Med/Low] |
+
+**Realistic Assessment**:
+
+- **New site (DA <20)**: [Can rank?] [Timeframe]
+- **Growing site (DA 20-40)**: [Can rank?] [Timeframe]
+- **Established site (DA 40+)**: [Can rank?] [Timeframe]
+
+**Easier Alternatives**:
+If too difficult, consider:
+- [Alternative keyword 1] - Difficulty: [X]
+- [Alternative keyword 2] - Difficulty: [X]
+```
+
+## Recommendations Template
+
+```markdown
+## SERP Analysis Summary & Recommendations
+
+### Key Findings
+
+1. [Most important finding]
+2. [Second important finding]
+3. [Third important finding]
+
+### Content Requirements to Rank
+
+To compete for "[keyword]", you need:
+
+**Minimum Requirements**:
+- [ ] Word count: [X]+ words
+- [ ] Backlinks: [X]+ referring domains
+- [ ] Domain Authority: [X]+
+- [ ] Content format: [type]
+- [ ] Include: [specific elements]
+
+**Differentiators to Win**:
+- [ ] [Unique angle from analysis]
+- [ ] [Missing element in current results]
+- [ ] [SERP feature opportunity]
+
+### SERP Feature Strategy
+
+| Feature | Winnable? | Strategy |
+|---------|-----------|----------|
+| Featured Snippet | Yes/No | [strategy] |
+| PAA | Yes/No | [strategy] |
+| AI Overview | Yes/No | [strategy] |
+
+### Recommended Content Outline
+
+Based on SERP analysis:
+
+```
+Title: [Optimized title]
+
+H1: [Main heading]
+
+[Introduction - address intent immediately]
+
+H2: [Section based on PAA/top results]
+H2: [Section based on PAA/top results]
+H2: [Section based on PAA/top results]
+
+[FAQ section for PAA optimization]
+
+[Conclusion with CTA]
+```
+
+### Next Steps
+
+1. [Immediate action]
+2. [Content creation action]
+3. [Optimization action]
+```
diff --git a/.cursor/skills/serp-analysis/references/example-report.md b/.cursor/skills/serp-analysis/references/example-report.md
new file mode 100644
index 0000000000..772b5c8847
--- /dev/null
+++ b/.cursor/skills/serp-analysis/references/example-report.md
@@ -0,0 +1,85 @@
+# SERP Analysis — Example Report
+
+This is a complete example output showing the expected format for SERP analysis deliverables.
+
+## Prompt
+
+**User**: "Analyze the SERP for 'how to start a podcast'"
+
+## Example Output
+
+```markdown
+# SERP Analysis: "how to start a podcast"
+
+## SERP Overview
+
+**Intent**: Informational (100%)
+**Difficulty**: 72/100 (High)
+**AI Overview**: Yes - comprehensive guide format
+
+### SERP Features Present
+
+| Feature | Present | Analysis |
+|---------|---------|----------|
+| AI Overview | ✅ | Lists steps, cites 3 sources |
+| Featured Snippet | ✅ | Step-by-step list from Buzzsprout |
+| People Also Ask | ✅ | 4 questions visible, expandable |
+| Video Results | ✅ | 3 YouTube videos mid-page |
+| Image Pack | ❌ | |
+
+### Top 5 Results Analysis
+
+| Pos | Domain | DA | Word Count | Format | Backlinks |
+|-----|--------|-----|------------|--------|-----------|
+| 1 | Buzzsprout | 71 | 8,500 | Ultimate Guide | 2,400 |
+| 2 | Spotify | 93 | 3,200 | How-to Guide | 890 |
+| 3 | Podcastinsights | 58 | 12,000 | Mega Guide | 1,800 |
+| 4 | Transistor | 62 | 5,500 | Tutorial | 720 |
+| 5 | HubSpot | 91 | 6,200 | Complete Guide | 1,100 |
+
+### Why #1 Ranks First
+
+Buzzsprout's guide succeeds because:
+1. **Comprehensive** - Covers every step in detail
+2. **Updated** - Current year in title, recent updates
+3. **Structured** - Clear numbered steps (owns featured snippet)
+4. **Authoritative** - Podcast hosting company (topical authority)
+5. **Supporting content** - Links to detailed sub-guides
+
+### Featured Snippet Opportunity
+
+**Current format**: Ordered list (steps)
+**Current holder**: Buzzsprout
+
+**To win snippet**:
+- Create cleaner, more scannable list format
+- Keep steps to 8-10 items max
+- Start each step with action verb
+- Include "how to start a podcast" in H2
+
+### AI Overview Analysis
+
+**Sources cited**:
+1. Buzzsprout - "Choose your podcast topic"
+2. Spotify for Podcasters - "Record and edit"
+3. Wikipedia - Definition of podcasting
+
+**Pattern**: AI pulls step-by-step instructions from guides with clear structure
+
+### Content Requirements
+
+To rank on page 1:
+- **Word count**: 5,000+ words minimum
+- **Format**: Step-by-step ultimate guide
+- **Backlinks**: 500+ from relevant domains
+- **Updates**: Must show current year
+- **Unique angle**: Equipment comparisons, cost breakdowns, or specific niche focus
+
+### Recommended Strategy
+
+Given high difficulty, consider:
+1. Target long-tail: "how to start a podcast for free" (Difficulty: 45)
+2. Target niche: "how to start a podcast about [topic]" (Difficulty: 30)
+3. Create supporting video content for video carousel
+4. Focus on PAA optimization for quick wins
+```
diff --git a/.cursor/skills/serp-analysis/references/serp-feature-taxonomy.md b/.cursor/skills/serp-analysis/references/serp-feature-taxonomy.md
new file mode 100644
index 0000000000..15e16d8610
--- /dev/null
+++ b/.cursor/skills/serp-analysis/references/serp-feature-taxonomy.md
@@ -0,0 +1,404 @@
+# SERP Feature Taxonomy
+
+A comprehensive reference covering every SERP feature type, trigger conditions, optimization techniques, monitoring approaches, and AI Overview patterns. Use this to plan which SERP features to target and how to win them.
+
+## Overview
+
+Modern Search Engine Results Pages are far more than ten blue links. Google displays 20+ distinct feature types that can dramatically affect click-through rates, visibility, and traffic. Understanding which features appear for your target keywords -- and how to optimize for them -- is essential to any SEO or GEO strategy.
+
+---
+
+## SERP Feature Categories
+
+SERP features fall into five broad categories:
+
+| Category | Features | Controlled By |
+|----------|---------|--------------|
+| **Knowledge Features** | Knowledge Panel, AI Overview, Featured Snippet | Content quality + structured data |
+| **Engagement Features** | People Also Ask, Related Searches, Things to Know | Content relevance + question coverage |
+| **Rich Results** | FAQ, How-To, Review Stars, Recipe, Event, Product | Schema markup + content format |
+| **Media Features** | Image Pack, Video Carousel, Web Stories | Media optimization + hosting platform |
+| **Commerce Features** | Shopping Results, Local Pack, Ads | Merchant feeds + Google Business Profile + ad spend |
+
+---
+
+## Complete Feature Reference
+
+### 1. Featured Snippet
+
+**What it is:** An extracted answer displayed at Position 0 (above organic results) in a box.
+
+**Sub-types:**
+
+| Sub-type | Format | Typical Trigger | Example Query |
+|---------|--------|----------------|---------------|
+| Paragraph | 40-60 word text block | "What is", "Why is", definitions | "what is domain authority" |
+| Ordered List | Numbered steps | "How to", process queries | "how to submit a sitemap" |
+| Unordered List | Bulleted list | "Types of", "best", collections | "types of schema markup" |
+| Table | Data in rows/columns | Comparison, data, pricing | "HTTP status codes list" |
+| Video | YouTube clip with timestamp | "How to" with visual component | "how to use Google Search Console" |
+
+**Optimization Playbook:**
+
+1. **Identify snippet-eligible keywords** -- Check if a snippet already exists for your target keyword
+2. **Match the existing format** -- If current snippet is a list, create a list; if paragraph, write a concise paragraph
+3. **Place the answer immediately after the triggering heading** -- Use H2/H3 with the target question, then answer directly below
+4. **Keep paragraph snippets to 40-60 words** -- Concise, complete answers win
+5. **Use proper HTML structure** -- Ordered lists use `<ol>`, tables use `<table>`, not just visual formatting
+6. **Include the target query in the heading** -- The H2/H3 should closely match the search query
+7. **Provide context after the snippet answer** -- Elaborate below to demonstrate depth
+
+**Monitoring:**
+- Track featured snippet ownership weekly for target keywords
+- Monitor snippet format changes (Google may switch from paragraph to list)
+- Watch for snippet loss after content updates
+
+---
+
+### 2. People Also Ask (PAA)
+
+**What it is:** An expandable accordion of related questions with brief answers pulled from web pages.
+
+**Trigger conditions:**
+- Almost all informational queries
+- Many commercial investigation queries
+- Questions beget more questions -- clicking one PAA reveals additional questions
+
+**Optimization Playbook:**
+
+1. **Mine PAA questions for content ideas** -- Each PAA question is a validated search query
+2. **Answer PAA questions within your content** -- Use the exact question as an H2 or H3
+3. **Keep answers concise (40-60 words)** -- PAA answers are short excerpts
+4. **Use FAQ schema markup** -- Increases eligibility for PAA and FAQ rich results
+5. **Create dedicated FAQ sections** -- Group 5-10 related questions at the end of articles
+6. **Target the cascade** -- When users click one PAA, new questions appear; cover those too
+
+**PAA Mining Workflow:**
+1. Search your target keyword
+2. Note all visible PAA questions (4 initially)
+3. Click each one to reveal 2-4 more
+4. Repeat to collect 15-20 related questions
+5. Group questions by subtopic
+6. Create content addressing each cluster
+
+---
+
+### 3. AI Overview (formerly SGE)
+
+**What it is:** An AI-generated summary at the top of the SERP that synthesizes information from multiple sources, with cited links.
+
+**Trigger conditions:**
+- Informational queries (highest trigger rate)
+- Some commercial investigation queries
+- Question-format queries
+- Definitional and explanatory queries
+- Lower trigger rate for navigational and transactional queries
+
+**AI Overview Formats:**
+
+| Format | Description | Trigger Pattern |
+|--------|-----------|----------------|
+| Summary paragraph | Synthesized text answer | Definitional and explanatory queries |
+| Bulleted list | Key points extracted from sources | "Benefits of", "reasons for", multi-factor answers |
+| Step-by-step | Ordered process | "How to" queries |
+| Comparison | Side-by-side analysis | "X vs Y", "difference between" |
+| Table | Structured data comparison | Data comparison, pricing, specifications |
+
+**Optimization Playbook:**
+
+1. **Write clear, citable sentences** -- AI systems extract well-formed statements of fact
+2. **Front-load key information** -- Place the most important answer in the first 1-2 sentences of each section
+3. **Use structured data** -- Schema markup helps AI systems understand your content
+4. **Establish topical authority** -- AI overviews prefer citing authoritative sources on a topic
+5. **Include original data and statistics** -- Unique data points are highly citable
+6. **Create comparison content** -- AI loves to cite well-structured comparison tables
+7. **Update content regularly** -- Recency signals influence AI source selection
+8. **Use clear section headings** -- AI systems use headings to understand content structure
+
+**Source Citation Patterns:**
+
+| What Gets Cited | Why | How to Optimize |
+|----------------|-----|----------------|
+| Definitions | AI needs authoritative definitions | Write clear, complete definitions in first paragraph |
+| Statistics | AI cites specific data points | Include original research, cite sources |
+| Step-by-step processes | AI extracts structured sequences | Use numbered lists with clear step headers |
+| Comparison data | AI synthesizes multi-source comparisons | Create comparison tables with clear labels |
+| Expert quotes | AI values authoritative voices | Include expert attribution with credentials |
+
+---
+
+### 4. Knowledge Panel
+
+**What it is:** A large information box (typically right sidebar on desktop) showing structured entity information from Google's Knowledge Graph.
+
+**Trigger conditions:**
+- Brand/entity queries
+- Notable person queries
+- Place/organization queries
+- Product/service entities
+
+**Optimization Playbook:**
+
+1. **Establish a Google Knowledge Graph entity** -- Ensure your brand exists as a recognized entity
+2. **Claim and verify your Knowledge Panel** -- Use the "Claim this knowledge panel" option
+3. **Maintain consistent NAP** -- Name, Address, Phone across all web properties
+4. **Build Wikipedia presence** -- Knowledge Panels pull heavily from Wikipedia/Wikidata
+5. **Use Organization schema markup** -- Help Google understand your entity
+6. **Maintain active social profiles** -- Connected social accounts appear in Knowledge Panel
+7. **Get featured in authoritative sources** -- Mentions in news, industry publications, and databases
+
+---
+
+### 5. Image Pack
+
+**What it is:** A row of image thumbnails within organic results, linking to Google Images.
+
+**Trigger conditions:**
+- Visual queries ("what does X look like")
+- Product queries
+- Design/inspiration queries
+- Some informational queries with visual components
+
+**Optimization Playbook:**
+
+1. **Use descriptive file names** -- `seo-audit-checklist-template.png` not `IMG_4523.png`
+2. **Write complete alt text** -- Describe the image content and context accurately
+3. **Optimize image file size** -- Compress without losing quality (WebP format preferred)
+4. **Use original images** -- Stock photos rarely rank; original screenshots, diagrams, and photos perform better
+5. **Add image structured data** -- ImageObject schema when applicable
+6. **Place images near relevant text** -- Context from surrounding content helps ranking
+7. **Create image sitemaps** -- Help Google discover all your images
+8. **Use responsive images** -- Serve appropriate sizes for different devices
+
+---
+
+### 6. Video Carousel / Video Results
+
+**What it is:** A horizontal carousel of video thumbnails, typically from YouTube, or individual video results with thumbnails in organic listings.
+
+**Trigger conditions:**
+- "How to" queries
+- Tutorial and instructional queries
+- Entertainment queries
+- Review queries
+- Any query where video content provides superior user experience
+
+**Optimization Playbook:**
+
+1. **Host on YouTube** -- YouTube videos dominate video carousels
+2. **Optimize video title** -- Include target keyword naturally
+3. **Write detailed descriptions** -- First 2-3 lines appear in search; include keywords and summary
+4. **Add chapters/timestamps** -- Key Moments markup helps Google surface specific sections
+5. **Create transcripts** -- Closed captions and transcripts provide indexable text
+6. **Use VideoObject schema** -- On your own site pages embedding video
+7. **Design compelling thumbnails** -- Higher CTR from search results
+8. **Target video-intent keywords** -- "How to" and tutorial queries have highest video potential
+
+---
+
+### 7. Local Pack (Map Pack)
+
+**What it is:** A map with 3 local business listings showing name, rating, address, and hours.
+
+**Trigger conditions:**
+- "[service] near me" queries
+- "[service] in [location]" queries
+- Queries with implicit local intent
+- Service-based business queries
+
+**Optimization Playbook:**
+
+1. **Claim and optimize Google Business Profile** -- Complete every field
+2. **Build consistent local citations** -- NAP consistency across directories
+3. **Collect and respond to reviews** -- Volume and recency of reviews matter
+4. **Add photos regularly** -- Active profiles rank higher
+5. **Use local business schema** -- LocalBusiness structured data on website
+6. **Create location-specific pages** -- If multiple locations, each needs its own page
+7. **Build local backlinks** -- Local news, chambers of commerce, community sites
+8. **Post Google Business updates** -- Regular posts signal activity
+
+---
+
+### 8. Shopping Results
+
+**What it is:** Product listing ads and free product listings with images, prices, and store names.
+
+**Trigger conditions:**
+- Product purchase queries
+- Product name queries
+- "Buy [product]" queries
+- Price comparison queries
+
+**Optimization Playbook:**
+
+1. **Submit product feed to Google Merchant Center** -- Required for shopping results
+2. **Optimize product titles** -- Include key attributes (brand, color, size, model)
+3. **Use high-quality product images** -- White background, multiple angles
+4. **Implement Product schema** -- Structured data for price, availability, reviews
+5. **Keep pricing accurate** -- Mismatches between feed and landing page cause disapproval
+6. **Collect product reviews** -- Aggregate ratings appear in shopping results
+7. **Optimize landing pages** -- Fast, mobile-friendly, clear purchase path
+
+---
+
+### 9. Sitelinks
+
+**What it is:** Additional links beneath a search result that point to specific pages within the same domain.
+
+**Sub-types:**
+
+| Sub-type | Appearance | Trigger |
+|---------|-----------|---------|
+| Full sitelinks | 4-6 two-column links with descriptions | Brand/navigational queries for authoritative sites |
+| Inline sitelinks | 2-4 single-line links | Semi-navigational queries |
+| Search box sitelinks | Site-specific search box | Large, well-structured sites |
+
+**Optimization Playbook:**
+
+1. **Build clear site architecture** -- Logical hierarchy with descriptive navigation
+2. **Use descriptive page titles** -- Each page should have a unique, clear title
+3. **Implement breadcrumb schema** -- Helps Google understand site structure
+4. **Create a comprehensive sitemap** -- XML sitemap submitted to Search Console
+5. **Build internal links** -- Strong internal linking reinforces page importance
+6. **Use SearchAction schema** -- Enables the sitelinks search box
+
+---
+
+### 10. Rich Results (Schema-Dependent)
+
+These features depend on specific structured data markup:
+
+| Rich Result | Schema Required | Content Type | Visual Impact |
+|------------|----------------|-------------|--------------|
+| FAQ | FAQPage | FAQ sections on any page | Expandable Q&A below listing |
+| How-To | HowTo | Step-by-step instructions | Steps with optional images |
+| Review Stars | Review / AggregateRating | Product/service reviews | Star rating in snippet |
+| Recipe | Recipe | Food/cooking content | Image, cook time, calories |
+| Event | Event | Event listings | Date, location, price |
+| Job Posting | JobPosting | Job listings | Salary, location, company |
+| Course | Course | Educational content | Provider, description, rating |
+| Breadcrumb | BreadcrumbList | Any page with hierarchy | Path display replacing URL |
+
+**General Rich Result Optimization:**
+
+1. **Validate with Rich Results Test** -- Test every page before publishing
+2. **Follow Google's structured data guidelines** -- No cloaking or misleading markup
+3. **Keep markup accurate** -- Schema content must match visible page content
+4. **Monitor in Search Console** -- Check Enhancement reports for errors
+5. **Don't over-mark** -- Only add schema for content types genuinely on the page
+
+---
+
+### 11. Related Searches / People Also Search For
+
+**What it is:** Related query suggestions at the bottom of the SERP ("Related searches") or shown after a user clicks a result and returns ("People also search for").
+
+**Value for SEO:**
+- Keyword discovery -- reveals semantically related queries
+- Content gap identification -- topics users explore after your target query
+- Topic cluster planning -- natural subtopics to cover
+
+**How to Use:**
+1. Mine related searches for content ideas and internal linking opportunities
+2. Cover related topics within your content to demonstrate comprehensiveness
+3. Use related search terms as H2/H3 headings in long-form content
+
+---
+
+### 12. "Things to Know" / Key Moments
+
+**What it is:** Carousel cards showing key aspects of a topic, or key moments within a video.
+
+**Trigger conditions:**
+- Broad informational queries
+- Multi-faceted topics
+- Video content with chapters
+
+**Optimization:**
+- Cover multiple aspects of a topic comprehensively
+- Use clear section headings that match common subtopics
+- For video: add chapter markers with timestamps
+
+---
+
+## SERP Feature Prioritization Matrix
+
+Not all SERP features deserve equal attention. Prioritize based on your content type and goals:
+
+| SERP Feature | Traffic Impact | Effort to Win | Best For |
+|-------------|---------------|--------------|---------|
+| Featured Snippet | Very High | Medium | Informational content sites |
+| AI Overview citation | High (growing) | Medium-High | Authority/expertise sites |
+| People Also Ask | Medium-High | Low-Medium | FAQ-rich content |
+| Video Carousel | High | High (video production) | Tutorial/how-to content |
+| Local Pack | Very High (local) | Medium | Local businesses |
+| Rich Results (FAQ) | Medium | Low | Any content with Q&A |
+| Rich Results (Review) | Medium-High | Low-Medium | Product/service reviews |
+| Image Pack | Medium | Low-Medium | Visual content creators |
+| Shopping Results | Very High (ecommerce) | Medium | Product sellers |
+| Knowledge Panel | Medium (brand) | High (long-term) | Established brands |
+| Sitelinks | Low (brand already ranking) | Low (structural) | Large, structured sites |
+
+---
+
+## SERP Feature Monitoring Framework
+
+### What to Track
+
+| Metric | Frequency | Tool Category | Action Threshold |
+|--------|-----------|--------------|-----------------|
+| Featured snippet ownership | Weekly | ~~SEO tool | Lost snippet → investigate within 48 hours |
+| AI Overview citation rate | Weekly | ~~AI monitor | Citation loss → review content freshness |
+| PAA presence for target keywords | Monthly | ~~SEO tool | New PAA questions → create content |
+| SERP feature composition changes | Monthly | ~~SEO tool | New feature appearing → optimize for it |
+| Rich result errors | Weekly | Search Console | Any error → fix immediately |
+| Local Pack ranking | Weekly | ~~SEO tool | Drop below position 3 → investigate |
+
+### SERP Feature Change Analysis
+
+When SERP features change for your target keywords, investigate:
+
+| Change | Possible Causes | Recommended Action |
+|--------|----------------|-------------------|
+| Featured snippet disappeared | Google removed snippet for this query; competitor won it | Check if snippet still exists; create better snippet-targeted content |
+| AI Overview appeared (new) | Google expanded AI Overviews to this query type | Optimize content for AI citation |
+| AI Overview disappeared | Query type removed from AI Overview program | Refocus on traditional SERP features |
+| Video carousel appeared | Google detected video intent for this query | Create video content for the keyword |
+| Local Pack appeared | Google detected local intent shift | Consider local SEO if relevant |
+| Shopping results appeared | Google detected commercial intent shift | Consider product markup or adjust content angle |
+
+---
+
+## SERP Feature Combination Patterns
+
+Certain SERP feature combinations indicate specific opportunities:
+
+| SERP Combination | What It Signals | Opportunity |
+|-----------------|----------------|-------------|
+| AI Overview + Featured Snippet | Google sees this as high-information query | Optimize for both -- structured content with clear answers |
+| Video + PAA + Featured Snippet | Multi-format informational query | Create comprehensive guide with video and FAQ |
+| Shopping + Ads + Reviews | Strong commercial intent | Product optimization, review content |
+| Local Pack + Ads | Local commercial intent | Google Business Profile optimization |
+| No features (just blue links) | Low-feature query (or very new topic) | Potential early-mover advantage for rich results |
+| PAA only (no snippet) | Snippet opportunity not yet captured | Create snippet-optimized content |
+
+---
+
+## AI Overview vs. Traditional SERP Feature Strategy
+
+The rise of AI Overviews changes how to prioritize SERP features:
+
+| Scenario | Traditional Strategy | AI-Era Strategy |
+|---------|---------------------|----------------|
+| Informational query | Win featured snippet | Win AI Overview citation AND featured snippet |
+| Comparison query | Create comparison content | Create structured comparison tables (AI prefers these) |
+| Definition query | Write clear definition for snippet | Write authoritative, citable definition with evidence |
+| How-to query | Create step-by-step list | Create steps with unique insights AI can synthesize |
+| List query | Create comprehensive ranked list | Create list with original data/reasoning AI can cite |
+
+### Key Difference
+
+- **Traditional SERP features** reward **format optimization** (structure your content to match the feature)
+- **AI Overviews** reward **authority and uniqueness** (be the source AI trusts for accurate, original information)
+
+Optimizing for both requires content that is both structurally sound AND substantively authoritative.
diff --git a/.cursor/skills/technical-seo-checker/SKILL.md b/.cursor/skills/technical-seo-checker/SKILL.md
new file mode 100644
index 0000000000..5e0cc76da1
--- /dev/null
+++ b/.cursor/skills/technical-seo-checker/SKILL.md
@@ -0,0 +1,397 @@
+---
+name: technical-seo-checker
+description: 'Technical SEO audit: Core Web Vitals, crawl, indexing, mobile, speed, architecture, redirects. 技术SEO/网站速度'
+version: "6.0.0"
+license: Apache-2.0
+compatibility: "Claude Code ≥1.0, skills.sh marketplace, ClawHub marketplace, Vercel Labs skills ecosystem. No system packages required. Optional: MCP network access for SEO tool integrations."
+homepage: "https://github.com/aaron-he-zhu/seo-geo-claude-skills"
+when_to_use: "Use when checking technical SEO health: site speed, Core Web Vitals, indexing, crawlability, robots.txt, sitemaps, or canonical tags."
+argument-hint: "<URL or domain>"
+allowed-tools: WebFetch
+metadata:
+  author: aaron-he-zhu
+  version: "6.0.0"
+  geo-relevance: "low"
+  tags:
+    - seo
+    - technical-seo
+    - core-web-vitals
+    - page-speed
+    - crawlability
+    - indexability
+    - mobile-seo
+    - site-health
+    - lcp
+    - cls
+    - inp
+    - robots-txt
+    - xml-sitemap
+    - 技术SEO
+    - 网站速度
+    - テクニカルSEO
+    - 기술SEO
+    - seo-tecnico
+  triggers:
+    # EN-formal
+    - "technical SEO audit"
+    - "check page speed"
+    - "Core Web Vitals"
+    - "crawl issues"
+    - "site indexing problems"
+    - "canonical tag issues"
+    - "duplicate content"
+    - "mobile-friendly check"
+    - "site speed"
+    - "site health check"
+    # EN-casual
+    - "my site is slow"
+    - "Google can't crawl my site"
+    - "Google can't find my pages"
+    - "mobile issues"
+    - "indexing problems"
+    - "why is my site slow"
+    # EN-question
+    - "how do I fix my page speed"
+    - "why is my site not indexed"
+    - "how to improve Core Web Vitals"
+    - "why did my site disappear from Google"
+    # EN-competitor
+    - "PageSpeed Insights alternative"
+    - "GTmetrix alternative"
+    - "Sitebulb alternative"
+    # ZH-pro
+    - "技术SEO检查"
+    - "网站速度优化"
+    - "核心网页指标"
+    - "爬虫问题"
+    - "索引问题"
+    - "网站收录"
+    - "sitemap提交"
+    - "robots设置"
+    # ZH-casual
+    - "网站加载太慢"
+    - "网站太慢了"
+    - "Google找不到我的页面"
+    - "手机端有问题"
+    - "收录不了"
+    - "Google收录少"
+    # JA
+    - "テクニカルSEO"
+    - "サイト速度"
+    - "コアウェブバイタル"
+    - "クロール問題"
+    - "インデックス登録"
+    - "モバイル最適化"
+    # KO
+    - "기술 SEO"
+    - "사이트 속도"
+    - "코어 웹 바이탈"
+    - "크롤링 문제"
+    - "사이트 왜 이렇게 느려?"
+    # ES
+    - "auditoría SEO técnica"
+    - "velocidad del sitio"
+    - "problemas de indexación"
+    - "mi sitio no aparece en Google"
+    - "velocidad de carga"
+    # PT
+    - "auditoria SEO técnica"
+    - "meu site não aparece no Google"
+    - "velocidade de carregamento"
+    # Misspellings
+    - "techincal SEO"
+    - "core web vitalls"
+---
+
+# Technical SEO Checker
+
+
+> **[SEO & GEO Skills Library](https://github.com/aaron-he-zhu/seo-geo-claude-skills)** · 20 skills for SEO + GEO · [ClawHub](https://clawhub.ai/u/aaron-he-zhu) · [skills.sh](https://skills.sh/aaron-he-zhu/seo-geo-claude-skills)
+> **System Mode**: This optimization skill follows the shared [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md) and [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md).
+
+
+This skill performs comprehensive technical SEO audits to identify issues that may prevent search engines from properly crawling, indexing, and ranking your site.
+
+**System role**: Optimization layer skill. It turns weak pages, structures, and technical issues into prioritized repair work.
+
+## When This Must Trigger
+
+Use this when the conversation involves any of these situations — even if the user does not use SEO terminology:
+
+Use this whenever the task needs a diagnosis or repair plan that should feed directly into remediation work, not just a one-time opinion.
+
+- Launching a new website
+- Diagnosing ranking drops
+- Pre-migration SEO audits
+- Regular technical health checks
+- Identifying crawl and index issues
+- Improving site performance
+- Fixing Core Web Vitals issues
+
+## What This Skill Does
+
+1. **Crawlability Audit**: Checks robots.txt, sitemaps, crawl issues
+2. **Indexability Review**: Analyzes index status and blockers
+3. **Site Speed Analysis**: Evaluates Core Web Vitals and performance
+4. **Mobile-Friendliness**: Checks mobile optimization
+5. **Security Check**: Reviews HTTPS and security headers
+6. **Structured Data Audit**: Validates schema markup
+7. **URL Structure Analysis**: Reviews URL patterns and redirects
+8. **International SEO**: Checks hreflang and localization
+
+## Quick Start
+
+Start with one of these prompts. Finish with a short handoff summary using the repository format in [Skill Contract](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/skill-contract.md).
+
+### Full Technical Audit
+
+```
+Perform a technical SEO audit for [URL/domain]
+```
+
+### Specific Issue Check
+
+```
+Check Core Web Vitals for [URL]
+```
+
+```
+Audit crawlability and indexability for [domain]
+```
+
+### Pre-Migration Audit
+
+```
+Technical SEO checklist for migrating [old domain] to [new domain]
+```
+
+## Skill Contract
+
+**Expected output**: a scored diagnosis, prioritized repair plan, and a short handoff summary ready for `memory/audits/`.
+
+- **Reads**: the current page or site state, symptoms, prior audits, and current priorities from [CLAUDE.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CLAUDE.md) and the shared [State Model](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/references/state-model.md) when available.
+- **Writes**: a user-facing audit or optimization plan plus a reusable summary that can be stored under `memory/audits/`.
+- **Promotes**: blocking defects, repeated weaknesses, and fix priorities to `memory/open-loops.md` and `memory/decisions.md`.
+- **Next handoff**: use the `Next Best Skill` below when the repair path is clear.
+
+## Data Sources
+
+> See [CONNECTORS.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/CONNECTORS.md) for tool category placeholders.
+
+**With ~~web crawler + ~~page speed tool + ~~CDN connected:**
+Claude can automatically crawl the entire site structure via ~~web crawler, pull Core Web Vitals and performance metrics from ~~page speed tool, analyze caching headers from ~~CDN, and fetch mobile-friendliness data. This enables comprehensive automated technical audits.
+
+**With manual data only:**
+Ask the user to provide:
+1. Site URL(s) to audit
+2. PageSpeed Insights screenshots or reports
+3. robots.txt file content
+4. sitemap.xml URL or file
+
+Proceed with the full audit using provided data. Note in the output which findings are from automated crawl vs. manual review.
+
+## Instructions
+
+When a user requests a technical SEO audit:
+
+1. **Audit Crawlability**
+
+   ```markdown
+   ## Crawlability Analysis
+   
+   ### Robots.txt Review
+   
+   **URL**: [domain]/robots.txt
+   **Status**: [Found/Not Found/Error]
+   
+   **Current Content**:
+   ```
+   [robots.txt content]
+   ```
+   
+   | Check | Status | Notes |
+   |-------|--------|-------|
+   | File exists | ✅/❌ | [notes] |
+   | Valid syntax | ✅/⚠️/❌ | [errors found] |
+   | Sitemap declared | ✅/❌ | [sitemap URL] |
+   | Important pages blocked | ✅/⚠️/❌ | [blocked paths] |
+   | Assets blocked | ✅/⚠️/❌ | [CSS/JS blocked?] |
+   | Correct user-agents | ✅/⚠️/❌ | [notes] |
+   
+   **Issues Found**:
+   - [Issue 1]
+   - [Issue 2]
+   
+   **Recommended robots.txt**:
+   ```
+   User-agent: *
+   Allow: /
+   Disallow: /admin/
+   Disallow: /private/
+   
+   Sitemap: https://example.com/sitemap.xml
+   ```
+   
+   ---
+   
+   ### XML Sitemap Review
+   
+   **Sitemap URL**: [URL]
+   **Status**: [Found/Not Found/Error]
+   
+   | Check | Status | Notes |
+   |-------|--------|-------|
+   | Sitemap exists | ✅/❌ | [notes] |
+   | Valid XML format | ✅/⚠️/❌ | [errors] |
+   | In robots.txt | ✅/❌ | [notes] |
+   | Submitted to ~~search console | ✅/⚠️/❌ | [notes] |
+   | URLs count | [X] | [appropriate?] |
+   | Only indexable URLs | ✅/⚠️/❌ | [notes] |
+   | Includes priority | ✅/⚠️ | [notes] |
+   | Includes lastmod | ✅/⚠️ | [accurate?] |
+   
+   **Issues Found**:
+   - [Issue 1]
+   
+   ---
+   
+   ### Crawl Budget Analysis
+   
+   | Factor | Status | Impact |
+   |--------|--------|--------|
+   | Crawl errors | [X] errors | [Low/Med/High] |
+   | Duplicate content | [X] pages | [Low/Med/High] |
+   | Thin content | [X] pages | [Low/Med/High] |
+   | Redirect chains | [X] found | [Low/Med/High] |
+   | Orphan pages | [X] found | [Low/Med/High] |
+   
+   **Crawlability Score**: [X]/10
+   ```
+
+2. **Audit Indexability**
+
+   ```markdown
+   ## Indexability Analysis
+   
+   ### Index Status Overview
+   
+   | Metric | Count | Notes |
+   |--------|-------|-------|
+   | Pages in sitemap | [X] | |
+   | Pages indexed | [X] | [source: site: search] |
+   | Index coverage ratio | [X]% | [good if >90%] |
+   
+   ### Index Blockers Check
+   
+   | Blocker Type | Found | Pages Affected |
+   |--------------|-------|----------------|
+   | noindex meta tag | [X] | [list or "none"] |
+   | noindex X-Robots | [X] | [list or "none"] |
+   | Robots.txt blocked | [X] | [list or "none"] |
+   | Canonical to other | [X] | [list or "none"] |
+   | 4xx/5xx errors | [X] | [list or "none"] |
+   | Redirect loops | [X] | [list or "none"] |
+   
+   ### Canonical Tags Audit
+   
+   | Check | Status | Notes |
+   |-------|--------|-------|
+   | Canonicals present | ✅/⚠️/❌ | [X]% of pages |
+   | Self-referencing | ✅/⚠️/❌ | [notes] |
+   | Consistent (HTTP/HTTPS) | ✅/⚠️/❌ | [notes] |
+   | Consistent (www/non-www) | ✅/⚠️/❌ | [notes] |
+   | No conflicting signals | ✅/⚠️/❌ | [notes] |
+   
+   ### Duplicate Content Issues
+   
+   | Issue Type | Count | Examples |
+   |------------|-------|----------|
+   | Exact duplicates | [X] | [URLs] |
+   | Near duplicates | [X] | [URLs] |
+   | Parameter duplicates | [X] | [URLs] |
+   | WWW/non-WWW | [X] | [notes] |
+   | HTTP/HTTPS | [X] | [notes] |
+   
+   **Indexability Score**: [X]/10
+   ```
+
+3. **Audit Site Speed & Core Web Vitals** — CWV metrics (LCP/FID/CLS/INP), additional performance metrics (TTFB/FCP/Speed Index/TBT), resource loading breakdown, optimization recommendations
+
+   > **Reference**: See [references/technical-audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) for the performance analysis template (Step 3).
+
+4. **Audit Mobile-Friendliness** — Mobile-friendly test, responsive design check, mobile-first indexing verification
+
+   > **Reference**: See [references/technical-audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) for the mobile optimization template (Step 4).
+
+5. **Audit Security & HTTPS** — SSL certificate, HTTPS enforcement, mixed content, HSTS, security headers (CSP, X-Frame-Options, etc.)
+
+   > **Reference**: See [references/technical-audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) for the security analysis template (Step 5).
+
+6. **Audit URL Structure** — URL patterns, issues (dynamic params, session IDs, uppercase), redirect analysis (chains, loops, 302s)
+
+   > **Reference**: See [references/technical-audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) for the URL structure template (Step 6).
+
+7. **Audit Structured Data** — Schema markup validation, missing schema opportunities. CORE-EEAT alignment: maps to O05.
+
+   > **Reference**: See [references/technical-audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) for the structured data template (Step 7).
+
+8. **Audit International SEO (if applicable)** — Hreflang implementation, language/region targeting
+
+   > **Reference**: See [references/technical-audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) for the international SEO template (Step 8).
+
+9. **Generate Technical Audit Summary** — Overall health score with visual breakdown, critical/high/medium issues, quick wins, implementation roadmap (weeks 1-4+), monitoring recommendations
+
+   > **Reference**: See [references/technical-audit-templates.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) for the audit summary template (Step 9).
+
+## Validation Checkpoints
+
+### Input Validation
+- [ ] Site URL or domain clearly specified
+- [ ] Access to technical data (robots.txt, sitemap, or crawl results)
+- [ ] Performance metrics available (via ~~page speed tool or screenshots)
+
+### Output Validation
+- [ ] Every recommendation cites specific data points (not generic advice)
+- [ ] All issues include affected URLs or page counts
+- [ ] Performance metrics include actual numbers with units (seconds, KB, etc.)
+- [ ] Source of each data point clearly stated (~~web crawler data, ~~page speed tool, user-provided, or estimated)
+
+## Example
+
+> **Reference**: See [references/technical-audit-example.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-example.md) for a full worked example (cloudhosting.com technical audit) and the comprehensive technical SEO checklist.
+
+## Tips for Success
+
+1. **Prioritize by impact** - Fix critical issues first
+2. **Monitor continuously** - Use ~~search console alerts
+3. **Test changes** - Verify fixes work before deploying widely
+4. **Document everything** - Track changes for troubleshooting
+5. **Regular audits** - Schedule quarterly technical reviews
+
+> **Technical reference**: For issue severity framework, prioritization matrix, and Core Web Vitals optimization quick reference, see [references/http-status-codes.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/http-status-codes.md).
+
+
+### Save Results
+
+After delivering audit or optimization findings to the user, ask:
+
+> "Save these results for future sessions?"
+
+If yes, write a dated summary to `memory/audits/technical-seo-checker/YYYY-MM-DD-<topic>.md` containing:
+- One-line verdict or headline finding
+- Top 3-5 actionable items
+- Open loops or blockers
+- Source data references
+
+If any veto-level issue was found (CORE-EEAT T04, C01, R10 or CITE T03, T05, T09), also append a one-liner to `memory/hot-cache.md` without asking.
+
+## Reference Materials
+
+- [robots.txt Reference](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/robots-txt-reference.md) — Syntax guide, templates, common configurations
+- [HTTP Status Codes](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/http-status-codes.md) — SEO impact of each status code, redirect best practices
+- [Technical Audit Templates](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-templates.md) — Detailed output templates for steps 3-9 (CWV, mobile, security, URL structure, structured data, international, audit summary)
+- [Technical Audit Example & Checklist](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/references/technical-audit-example.md) — Full worked example and comprehensive technical SEO checklist
+
+## Next Best Skill
+
+- **Primary**: [on-page-seo-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/on-page-seo-auditor/SKILL.md) — continue from infrastructure issues into page-level remediation.
diff --git a/.cursor/skills/technical-seo-checker/references/http-status-codes.md b/.cursor/skills/technical-seo-checker/references/http-status-codes.md
new file mode 100644
index 0000000000..32fa482435
--- /dev/null
+++ b/.cursor/skills/technical-seo-checker/references/http-status-codes.md
@@ -0,0 +1,705 @@
+# HTTP Status Codes for SEO
+
+SEO-relevant HTTP status codes, their implications, and how to diagnose and fix issues.
+
+## Status Code Categories
+
+- **2xx**: Success - Request succeeded
+- **3xx**: Redirection - Further action needed
+- **4xx**: Client Error - Problem with the request
+- **5xx**: Server Error - Server failed to fulfill request
+
+---
+
+## 2xx Success Codes
+
+### 200 OK
+
+**What it means**: Request succeeded, content returned normally.
+
+**SEO impact**: Positive - page is accessible and indexable.
+
+**When to use**: Standard response for all working pages.
+
+**When it's a problem**: When different URLs return 200 for same content (should use 301 redirect).
+
+---
+
+### 204 No Content
+
+**What it means**: Request succeeded but no content to return.
+
+**SEO impact**: Neutral - rarely used for pages meant to be indexed.
+
+**Common use**: API responses, AJAX requests.
+
+---
+
+## 3xx Redirection Codes
+
+### 301 Moved Permanently
+
+**What it means**: Resource permanently moved to new URL. All link equity transfers.
+
+**SEO impact**: Positive when used correctly - passes 90-99% of link equity.
+
+**When to use**:
+- Permanently changing URL structure
+- Consolidating duplicate content
+- Moving to new domain
+- Changing HTTP to HTTPS
+- Changing www to non-www (or vice versa)
+
+**Example header**:
+```
+HTTP/1.1 301 Moved Permanently
+Location: https://example.com/new-page
+```
+
+**Common mistakes**:
+- Using 302 instead of 301 for permanent changes
+- Creating redirect chains (A→B→C)
+- Redirecting to irrelevant pages
+- Not redirecting HTTP to HTTPS
+
+**How to implement**:
+- **.htaccess** (Apache): `Redirect 301 /old-page /new-page`
+- **nginx**: `rewrite ^/old-page$ /new-page permanent;`
+- **Server-side**: Set Location header with 301 status
+
+---
+
+### 302 Found (Temporary Redirect)
+
+**What it means**: Resource temporarily at different URL. Original URL should still be used.
+
+**SEO impact**: Neutral to negative - does NOT pass full link equity. Search engines keep indexing original URL.
+
+**When to use**:
+- A/B testing
+- Temporary promotions
+- Maintenance redirects
+- Geolocation redirects (sometimes)
+
+**When NOT to use**: Permanent URL changes (use 301).
+
+**Warning**: Google may treat long-standing 302s as 301s, but better to be explicit.
+
+---
+
+### 303 See Other
+
+**What it means**: Response can be found at another URI using GET.
+
+**SEO impact**: Minimal - rarely used for SEO purposes.
+
+**Common use**: After form submissions, redirect to results page.
+
+---
+
+### 307 Temporary Redirect
+
+**What it means**: Temporary redirect that preserves request method (POST stays POST).
+
+**SEO impact**: Similar to 302 - temporary, doesn't pass full link equity.
+
+**Difference from 302**: Guarantees request method won't change (more precise than 302).
+
+**When to use**: Temporary redirects where HTTP method preservation matters.
+
+---
+
+### 308 Permanent Redirect
+
+**What it means**: Permanent redirect that preserves request method.
+
+**SEO impact**: Similar to 301 - passes link equity.
+
+**Difference from 301**: Guarantees request method won't change (POST stays POST).
+
+**When to use**: Permanent redirects where method preservation matters (rare for SEO).
+
+---
+
+### Redirect Chain Issues
+
+**Problem**: Multiple redirects before reaching final destination.
+
+**Example chain**:
+```
+http://example.com/page
+  → https://example.com/page (redirect 1)
+  → https://www.example.com/page (redirect 2)
+  → https://www.example.com/new-page (redirect 3)
+```
+
+**SEO impact**:
+- Slows page load (each redirect = new HTTP request)
+- Dilutes link equity with each hop
+- Wastes crawl budget
+- Poor user experience
+
+**How to fix**: Redirect directly from original URL to final destination.
+
+**Fixed version**:
+```
+http://example.com/page
+  → https://www.example.com/new-page (single redirect)
+```
+
+---
+
+### Redirect Loops
+
+**Problem**: Redirects create infinite loop.
+
+**Example**:
+```
+/page-a → /page-b
+/page-b → /page-a
+```
+
+**SEO impact**: Severe - page completely inaccessible.
+
+**Symptoms**:
+- Browser shows "Too many redirects" error
+- Page never loads
+- Search Console shows crawl errors
+
+**How to diagnose**:
+1. Use redirect checker tool
+2. Check .htaccess or nginx config for conflicting rules
+3. Review server-side redirect logic
+
+**How to fix**:
+1. Identify conflicting redirect rules
+2. Remove or correct the loop
+3. Test thoroughly
+4. Request recrawl in Search Console
+
+---
+
+## 4xx Client Error Codes
+
+### 404 Not Found
+
+**What it means**: Requested resource doesn't exist.
+
+**SEO impact**: Neutral to negative depending on context.
+
+**When 404s are OK**:
+- Legitimately deleted pages with no equivalent
+- Never-existed URLs from typos
+- Temporary content that expired (old promotions)
+- Intentionally removed low-quality content
+
+**When 404s are problems**:
+- Pages that should exist are returning 404
+- Previously working pages now broken
+- Important pages missing from navigation
+- High-traffic pages deleted without redirect
+
+**How to fix**:
+1. **If content moved**: Set up 301 redirect to new location
+2. **If content deleted**: Either keep 404 or redirect to relevant category
+3. **If never existed**: Leave as 404
+4. **If important**: Restore the page
+
+**Monitoring 404s**:
+- Check Search Console → Coverage → Not found (404)
+- Review referrer data to see what's linking to 404s
+- Fix high-value 404s first (most traffic/backlinks)
+
+**Soft 404s** (BAD):
+- Page returns 200 but shows "not found" message
+- Search engines may keep page indexed
+- Creates duplicate content issues
+- Fix: Return proper 404 status code
+
+---
+
+### 410 Gone
+
+**What it means**: Resource permanently deleted, never coming back.
+
+**SEO impact**: Stronger signal than 404 - tells search engines not to return.
+
+**When to use**:
+- Discontinued products
+- Expired promotions
+- Permanently removed content
+- Outdated information
+
+**Difference from 404**:
+- 404: "Not found" (might exist at another URL)
+- 410: "Gone forever" (don't look for it)
+
+**When to use 410 vs 301**:
+- Use 410: No equivalent replacement exists
+- Use 301: Relevant alternative exists
+
+**How search engines respond**:
+- Faster de-indexing than 404
+- Stop crawling sooner
+- Better for crawl budget
+
+---
+
+### 403 Forbidden
+
+**What it means**: Server understood request but refuses to authorize it.
+
+**SEO impact**: Negative - page inaccessible and won't be indexed.
+
+**Common causes**:
+- Permission restrictions
+- IP blocking
+- .htaccess restrictions
+- File permissions (chmod)
+- Authentication required
+
+**When it's intentional**:
+- Admin areas
+- Member-only content
+- Geographic restrictions
+
+**When it's a problem**:
+- Public pages returning 403
+- Search engine bots blocked
+- Accidental permission changes
+
+**How to diagnose**:
+1. Check .htaccess for IP restrictions
+2. Verify file permissions (should be 644 for files, 755 for directories)
+3. Check server-level access rules
+4. Test with different IPs/user-agents
+
+**How to fix**:
+1. Adjust file permissions: `chmod 644 filename`
+2. Remove blocking rules from .htaccess
+3. Whitelist search engine bots
+4. Review server firewall rules
+
+---
+
+### 401 Unauthorized
+
+**What it means**: Authentication required but not provided or failed.
+
+**SEO impact**: Negative - page won't be indexed.
+
+**Common causes**:
+- Password-protected pages
+- HTTP Basic Authentication
+- Expired sessions
+- Missing credentials
+
+**When it's intentional**: Member areas, staging sites, admin panels.
+
+**How to handle for SEO**:
+- Don't password-protect pages you want indexed
+- Use separate staging domain with 401
+- For members-only content, show teaser with meta robots noindex
+
+---
+
+### 429 Too Many Requests
+
+**What it means**: User/bot sent too many requests in given timeframe (rate limiting).
+
+**SEO impact**: Negative if search engines can't crawl.
+
+**Common causes**:
+- Aggressive crawling
+- DDoS protection triggered
+- API rate limits
+- Server throttling
+
+**How to handle**:
+1. Check Googlebot isn't being rate-limited (use Search Console)
+2. Whitelist verified search engine bots
+3. Configure rate limits appropriately
+4. Monitor crawl rate in Search Console
+
+---
+
+## 5xx Server Error Codes
+
+### 500 Internal Server Error
+
+**What it means**: Generic server error, something went wrong.
+
+**SEO impact**: Very negative if persistent - prevents indexing and ranking.
+
+**Common causes**:
+- PHP/code errors
+- Database connection issues
+- .htaccess syntax errors
+- Resource limits exceeded
+- Plugin/theme conflicts (WordPress)
+
+**How to diagnose**:
+1. Check server error logs
+2. Review recent code/config changes
+3. Test locally or on staging
+4. Disable plugins one by one (if CMS)
+5. Check .htaccess syntax
+
+**How to fix**:
+1. Review error logs for specific error
+2. Roll back recent changes
+3. Fix code errors
+4. Increase resource limits if needed
+5. Test thoroughly before re-deploying
+
+**Monitoring**: Set up alerts for 500 errors (sudden spike = problem).
+
+---
+
+### 502 Bad Gateway
+
+**What it means**: Server received invalid response from upstream server.
+
+**SEO impact**: Negative if persistent - prevents crawling/indexing.
+
+**Common causes**:
+- Proxy/load balancer issues
+- Upstream server down
+- Timeout issues
+- Firewall blocking
+
+**Common scenarios**:
+- CDN can't reach origin server
+- Application server crashed
+- Database server unresponsive
+
+**How to fix**:
+1. Check upstream server status
+2. Verify firewall rules
+3. Check timeout settings
+4. Restart proxy/load balancer if needed
+5. Review CDN configuration
+
+---
+
+### 503 Service Unavailable
+
+**What it means**: Server temporarily unable to handle request.
+
+**SEO impact**: Neutral if truly temporary with Retry-After header. Negative if prolonged.
+
+**Common causes**:
+- Maintenance mode
+- Server overload
+- Database down
+- Resource exhaustion
+
+**Proper use for maintenance**:
+```
+HTTP/1.1 503 Service Unavailable
+Retry-After: 3600
+```
+
+**Best practices for maintenance**:
+1. Use 503 (not 404 or 500)
+2. Include Retry-After header
+3. Keep maintenance brief (<24 hours)
+4. Schedule during low-traffic times
+5. Inform users with clear message
+
+**How search engines handle 503**:
+- Short-term (hours): Will retry, no ranking impact
+- Long-term (days+): May drop rankings, de-index pages
+
+---
+
+### 504 Gateway Timeout
+
+**What it means**: Server didn't receive timely response from upstream server.
+
+**SEO impact**: Negative - prevents crawling.
+
+**Common causes**:
+- Slow database queries
+- External API timeouts
+- Insufficient server resources
+- Network issues
+
+**How to fix**:
+1. Optimize slow queries
+2. Increase timeout limits
+3. Add caching
+4. Scale server resources
+5. Review external dependencies
+
+---
+
+## Status Code Decision Flowchart
+
+### Content Moved Permanently?
+→ YES: Use **301 redirect**
+→ NO: Continue
+
+### Content Moved Temporarily?
+→ YES: Use **302 redirect**
+→ NO: Continue
+
+### Content Deleted with No Replacement?
+→ YES: Use **404** (or **410** if permanently gone)
+→ NO: Continue
+
+### Content Exists at This URL?
+→ YES: Use **200 OK**
+→ NO: Use **404**
+
+### Need Authentication?
+→ YES: Use **401**
+→ NO: Continue
+
+### Access Forbidden?
+→ YES: Use **403**
+→ NO: Continue
+
+### Server Error?
+→ YES: Use **500**, **502**, **503**, or **504** depending on cause
+→ NO: Use **200 OK**
+
+---
+
+## Diagnosing Status Code Issues
+
+### Tools
+
+**Browser DevTools**:
+1. Open DevTools (F12)
+2. Go to Network tab
+3. Reload page
+4. Check status code in first request
+
+**cURL command**:
+```bash
+curl -I https://example.com/page
+```
+
+**Online checkers**:
+- httpstatus.io
+- redirect-checker.org
+- websiteplanet.com/webtools/redirects/
+
+**Google Search Console**:
+- Coverage report → Error/Excluded sections
+- URL Inspection tool → Check specific URLs
+
+---
+
+### Common Diagnostic Scenarios
+
+### "Page Won't Index"
+
+**Check**:
+1. Status code (should be 200)
+2. Redirects (shouldn't redirect away)
+3. 4xx/5xx errors
+4. robots.txt blocking
+5. noindex meta tag
+
+### "Page Disappeared from Results"
+
+**Check**:
+1. Returns 404/410/5xx
+2. Redirecting elsewhere (301/302)
+3. Changed to 403/401
+4. Server timing out (504)
+
+### "Traffic Dropped After Migration"
+
+**Check**:
+1. Old URLs return 404 (should be 301)
+2. Redirect chains (should be direct)
+3. Redirect loops
+4. Wrong redirect type (302 vs 301)
+5. Incorrect redirect targets
+
+---
+
+## Status Codes and Crawl Budget
+
+### Impact on Crawl Budget
+
+**Efficient (minimal impact)**:
+- 200 OK
+- 301 redirects (if minimal chains)
+- 410 Gone (removes from crawl queue)
+
+**Moderate impact**:
+- 302 redirects (search engine may keep checking)
+- 404 errors (search engines periodically recheck)
+- Redirect chains (multiple requests per URL)
+
+**High impact (wasteful)**:
+- 5xx errors (search engines retry frequently)
+- Redirect loops (waste crawl budget)
+- Soft 404s (search engine confused, keeps crawling)
+- 429 rate limiting (prevents efficient crawling)
+
+---
+
+## SEO Status Code Best Practices
+
+### For Migrations
+
+- [ ] Use 301 redirects for all permanently moved pages
+- [ ] Redirect directly to final destination (no chains)
+- [ ] Test all redirects before launching
+- [ ] Keep redirects in place for at least 1 year
+- [ ] Monitor 404 errors in Search Console post-launch
+- [ ] Map 1:1 where possible (old URL → equivalent new URL)
+
+### For Deleted Content
+
+- [ ] Use 301 if relevant replacement exists
+- [ ] Use 404 if no replacement and might return
+- [ ] Use 410 if permanently gone, never returning
+- [ ] Don't redirect to irrelevant pages (creates soft 404)
+- [ ] Create custom 404 page with search and navigation
+
+### For Maintenance
+
+- [ ] Use 503 with Retry-After header
+- [ ] Keep maintenance window brief (<24 hours)
+- [ ] Create user-friendly maintenance page
+- [ ] Inform users of expected downtime
+- [ ] Monitor Search Console for crawl issues
+
+### For Performance
+
+- [ ] Minimize redirect chains
+- [ ] Fix redirect loops immediately
+- [ ] Monitor 5xx errors closely
+- [ ] Set up alerts for sudden status code changes
+- [ ] Optimize to reduce 504 timeouts
+
+---
+
+## Status Code Monitoring
+
+### Key Metrics to Track
+
+**In Search Console**:
+- Crawl errors by type
+- Server errors (5xx) trend
+- Not found (404) trend
+- Redirect errors
+
+**In analytics**:
+- 404 page views
+- Entry pages with high exit rate (might be errors)
+- Sudden traffic drops (could indicate status code issues)
+
+**Server logs**:
+- Status code distribution
+- 5xx error frequency
+- Unusual patterns
+
+### Setting Up Alerts
+
+**Alert on**:
+- Sudden increase in 5xx errors
+- Increase in 404 errors
+- New redirect chains
+- Crawl error spikes in Search Console
+
+**Tools**:
+- Google Search Console email alerts
+- Server monitoring (UptimeRobot, Pingdom)
+- Log analysis tools
+- Custom scripts for log monitoring
+
+---
+
+## Quick Reference Table
+
+| Code | Name | SEO Impact | Use When | Passes Link Equity? |
+|------|------|------------|----------|---------------------|
+| 200 | OK | ✅ Positive | Page works normally | N/A (original URL) |
+| 301 | Moved Permanently | ✅ Positive | Permanent URL change | ✅ Yes (~90-99%) |
+| 302 | Found | ⚠️ Neutral | Temporary redirect | ❌ No |
+| 307 | Temporary Redirect | ⚠️ Neutral | Temporary (method preserved) | ❌ No |
+| 308 | Permanent Redirect | ✅ Positive | Permanent (method preserved) | ✅ Yes |
+| 404 | Not Found | ⚠️ Neutral | Content doesn't exist | N/A |
+| 410 | Gone | ⚠️ Neutral | Permanent deletion | N/A |
+| 403 | Forbidden | ❌ Negative | Access denied | N/A |
+| 401 | Unauthorized | ❌ Negative | Auth required | N/A |
+| 500 | Internal Server Error | ❌ Negative | Server error | N/A |
+| 502 | Bad Gateway | ❌ Negative | Upstream error | N/A |
+| 503 | Service Unavailable | ⚠️ Neutral | Temporary downtime | N/A |
+| 504 | Gateway Timeout | ❌ Negative | Timeout error | N/A |
+
+---
+
+## Status Code Testing Checklist
+
+Before launching site changes:
+
+- [ ] Test all redirects return correct status codes
+- [ ] Verify no redirect chains exist
+- [ ] Check no redirect loops present
+- [ ] Confirm important pages return 200
+- [ ] Ensure deleted pages return 404/410 (not 200)
+- [ ] Verify 301s point to correct destinations
+- [ ] Test with multiple user-agents
+- [ ] Check status codes in Search Console
+- [ ] Monitor server logs for unusual patterns
+- [ ] Set up alerts for error spikes
+
+---
+
+## Technical SEO Severity Framework
+
+### Issue Classification
+
+| Severity | Impact Description | Examples | Response Time |
+|----------|-------------------|---------|---------------|
+| **Critical** | Prevents indexation or causes site-wide issues | Robots.txt blocking site, noindex on key pages, site-wide 500 errors | Same day |
+| **High** | Significantly impacts rankings or user experience | Slow page speed, missing hreflang, duplicate content, redirect chains | Within 1 week |
+| **Medium** | Affects specific pages or has moderate impact | Missing schema, suboptimal canonicals, thin content pages | Within 1 month |
+| **Low** | Minor optimization opportunities | Image compression, minor CLS issues, non-essential schema missing | Next quarter |
+
+### Technical Debt Prioritization Matrix
+
+| Factor | Weight | Assessment |
+|--------|--------|-----------|
+| Pages affected | 30% | Site-wide > Section > Single page |
+| Revenue impact | 25% | Revenue pages > Blog > Utility pages |
+| Fix difficulty | 20% | Config change < Template change < Code rewrite |
+| Competitive impact | 15% | Competitors passing you > parity > you ahead |
+| Crawl budget waste | 10% | High waste > Moderate > Minimal |
+
+## Core Web Vitals Optimization Quick Reference
+
+### LCP (Largest Contentful Paint) Optimization
+
+| Root Cause | Detection | Fix |
+|-----------|-----------|-----|
+| Large hero image | PageSpeed Insights | Serve WebP, resize to container, add loading="lazy" |
+| Render-blocking CSS/JS | DevTools Coverage | Defer non-critical, inline critical CSS |
+| Slow server response | TTFB >800ms | CDN, server-side caching, upgrade hosting |
+| Third-party scripts | DevTools Network | Defer/async, use facade pattern |
+
+### CLS (Cumulative Layout Shift) Optimization
+
+| Root Cause | Detection | Fix |
+|-----------|-----------|-----|
+| Images without dimensions | DevTools | Add explicit width/height attributes |
+| Ads/embeds without reserved space | Visual inspection | Set min-height on containers |
+| Web fonts causing FOUT | DevTools | font-display: swap + preload fonts |
+| Dynamic content injection | Visual inspection | Reserve space with CSS |
+
+### INP (Interaction to Next Paint) Optimization
+
+| Root Cause | Detection | Fix |
+|-----------|-----------|-----|
+| Long JavaScript tasks | DevTools Performance | Break into smaller tasks, use requestIdleCallback |
+| Heavy event handlers | DevTools | Debounce/throttle, use passive listeners |
+| Main thread blocking | DevTools | Web workers for heavy computation |
+
diff --git a/.cursor/skills/technical-seo-checker/references/robots-txt-reference.md b/.cursor/skills/technical-seo-checker/references/robots-txt-reference.md
new file mode 100644
index 0000000000..589952dec0
--- /dev/null
+++ b/.cursor/skills/technical-seo-checker/references/robots-txt-reference.md
@@ -0,0 +1,717 @@
+# Robots.txt Reference Guide
+
+Complete reference for creating, testing, and troubleshooting robots.txt files.
+
+## Syntax Guide
+
+### Basic Structure
+
+```
+User-agent: [bot name]
+Disallow: [path to block]
+Allow: [path to allow]
+Sitemap: [sitemap URL]
+Crawl-delay: [seconds]
+```
+
+---
+
+## Core Directives
+
+### User-agent
+
+Specifies which bot the rules apply to.
+
+**Syntax**: `User-agent: [bot-name]`
+
+**Common user-agents**:
+```
+User-agent: *                    # All bots
+User-agent: Googlebot            # Google's crawler
+User-agent: Bingbot              # Bing's crawler
+User-agent: GPTBot               # OpenAI's crawler
+User-agent: CCBot                # Common Crawl bot
+User-agent: anthropic-ai         # Anthropic's crawler
+User-agent: PerplexityBot        # Perplexity AI crawler
+User-agent: ClaudeBot            # Claude's web crawler
+```
+
+**Multiple user-agents**: Group rules by leaving no blank lines between user-agent declarations.
+
+```
+User-agent: Googlebot
+User-agent: Bingbot
+Disallow: /admin/
+```
+
+---
+
+### Disallow
+
+Blocks bots from crawling specified paths.
+
+**Syntax**: `Disallow: [path]`
+
+**Examples**:
+```
+Disallow: /                      # Block entire site
+Disallow: /admin/                # Block admin directory
+Disallow: /private               # Block private directory (and subdirectories)
+Disallow: /*.pdf$                # Block all PDF files
+Disallow: /*?                    # Block all URLs with parameters
+Disallow:                        # Allow everything (empty disallow)
+```
+
+**Path matching**:
+- `/` at end = block directory and all subdirectories
+- Without `/` at end = block all paths starting with string
+- `*` = wildcard, matches any sequence
+- `$` = end of URL
+
+---
+
+### Allow
+
+Explicitly allows crawling (overrides Disallow).
+
+**Syntax**: `Allow: [path]`
+
+**Common use**: Allow specific subdirectories within blocked parent.
+
+```
+User-agent: *
+Disallow: /admin/
+Allow: /admin/public/
+```
+
+**Note**: Allow is not standard but supported by Google, Bing, and most major crawlers.
+
+---
+
+### Sitemap
+
+Specifies location of XML sitemap.
+
+**Syntax**: `Sitemap: [absolute URL]`
+
+**Examples**:
+```
+Sitemap: https://example.com/sitemap.xml
+Sitemap: https://example.com/sitemap_index.xml
+Sitemap: https://example.com/blog/sitemap.xml
+```
+
+**Best practices**:
+- Use absolute URLs (not relative)
+- Can include multiple Sitemap directives
+- Place at end of file
+- Submit same sitemap(s) to Google Search Console
+
+---
+
+### Crawl-delay
+
+Adds delay between requests (seconds).
+
+**Syntax**: `Crawl-delay: [seconds]`
+
+**Example**:
+```
+User-agent: *
+Crawl-delay: 10
+```
+
+**Warning**: Not supported by Googlebot (use Search Console rate limiting instead). Supported by Bing, Yandex, and others.
+
+---
+
+## Common Configurations
+
+### 1. Allow All Bots (Default)
+
+```
+User-agent: *
+Disallow:
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+Use when you want all bots to crawl entire site.
+
+---
+
+### 2. Block All Bots
+
+```
+User-agent: *
+Disallow: /
+```
+
+Use for development/staging sites or private content.
+
+---
+
+### 3. Block Specific Directories
+
+```
+User-agent: *
+Disallow: /admin/
+Disallow: /private/
+Disallow: /temp/
+Disallow: /cgi-bin/
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+Standard configuration blocking admin and utility directories.
+
+---
+
+### 4. Block All AI Crawlers
+
+```
+# Block OpenAI
+User-agent: GPTBot
+Disallow: /
+
+# Block Anthropic
+User-agent: anthropic-ai
+User-agent: ClaudeBot
+Disallow: /
+
+# Block Common Crawl
+User-agent: CCBot
+Disallow: /
+
+# Block Perplexity
+User-agent: PerplexityBot
+Disallow: /
+
+# Block Google-Extended (Bard training)
+User-agent: Google-Extended
+Disallow: /
+
+# Allow search engines
+User-agent: Googlebot
+Disallow:
+
+User-agent: Bingbot
+Disallow:
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+Use when you want search indexing but not AI training.
+
+---
+
+### 5. Allow Search Engines, Block Everything Else
+
+```
+# Block all by default
+User-agent: *
+Disallow: /
+
+# Allow Google
+User-agent: Googlebot
+Disallow:
+
+# Allow Bing
+User-agent: Bingbot
+Disallow:
+
+# Allow DuckDuckGo
+User-agent: DuckDuckBot
+Disallow:
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+---
+
+### 6. Block URL Parameters
+
+```
+User-agent: *
+Disallow: /*?                    # Block all URLs with parameters
+Allow: /?                        # Allow homepage with parameters
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+Prevents duplicate content from parameter variations.
+
+---
+
+### 7. Block File Types
+
+```
+User-agent: *
+Disallow: /*.pdf$
+Disallow: /*.doc$
+Disallow: /*.xls$
+Disallow: /*.zip$
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+---
+
+### 8. E-commerce Configuration
+
+```
+User-agent: *
+# Block search/filter pages
+Disallow: /*?q=
+Disallow: /*?sort=
+Disallow: /*?filter=
+
+# Block account pages
+Disallow: /account/
+Disallow: /cart/
+Disallow: /checkout/
+
+# Block admin
+Disallow: /admin/
+
+# Allow product pages
+Allow: /products/
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+---
+
+### 9. WordPress Configuration
+
+```
+User-agent: *
+# WordPress core
+Disallow: /wp-admin/
+Allow: /wp-admin/admin-ajax.php
+
+# WordPress directories
+Disallow: /wp-includes/
+Disallow: /wp-content/plugins/
+Disallow: /wp-content/themes/
+
+# Allow uploads
+Allow: /wp-content/uploads/
+
+# Block parameter pages
+Disallow: /?s=
+Disallow: /feed/
+Disallow: /trackback/
+
+Sitemap: https://example.com/sitemap_index.xml
+```
+
+---
+
+### 10. Shopify Configuration
+
+```
+User-agent: *
+# Block admin and account
+Disallow: /admin
+Disallow: /account
+Disallow: /cart
+Disallow: /checkout
+
+# Block search
+Disallow: /search
+
+# Block collections with filters
+Disallow: /collections/*+*
+Disallow: /collections/*?*
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+---
+
+## Platform-Specific Templates
+
+### Wix
+
+```
+User-agent: *
+Disallow: /_api/
+Disallow: /_partials/
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+### Squarespace
+
+```
+User-agent: *
+Disallow: /config/
+Disallow: /search
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+### Webflow
+
+```
+User-agent: *
+Allow: /
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+### Drupal
+
+```
+User-agent: *
+Disallow: /admin/
+Disallow: /user/
+Disallow: /node/add/
+Disallow: /?q=
+
+Sitemap: https://example.com/sitemap.xml
+```
+
+---
+
+## Testing and Validation
+
+### Google Search Console Robots.txt Tester
+
+1. Go to: Search Console → Settings → robots.txt
+2. View current robots.txt
+3. Test specific URLs
+4. See which user-agents are affected
+
+### Manual Testing
+
+Test URL pattern: `https://example.com/robots.txt`
+
+Check file is:
+- Accessible (returns 200 status)
+- Plain text format
+- UTF-8 encoded
+- Located at root domain
+- No more than 500KB (Google limit)
+
+### Common Testing Scenarios
+
+Test these URLs in tester:
+- Homepage: `/`
+- Product page: `/products/example`
+- Admin page: `/admin/`
+- Parameter page: `/search?q=test`
+- File: `/document.pdf`
+
+---
+
+## Common Mistakes and Fixes
+
+### Mistake 1: Blocking CSS/JS Files
+
+**Wrong**:
+```
+User-agent: *
+Disallow: /css/
+Disallow: /js/
+```
+
+**Why it's wrong**: Google needs CSS/JS to render pages properly.
+
+**Fix**:
+```
+User-agent: *
+Allow: /css/
+Allow: /js/
+```
+
+---
+
+### Mistake 2: Using Relative URLs for Sitemap
+
+**Wrong**:
+```
+Sitemap: /sitemap.xml
+```
+
+**Fix**:
+```
+Sitemap: https://example.com/sitemap.xml
+```
+
+---
+
+### Mistake 3: Spaces in Directives
+
+**Wrong**:
+```
+User-agent : Googlebot
+Disallow : /admin/
+```
+
+**Fix** (no spaces before colons):
+```
+User-agent: Googlebot
+Disallow: /admin/
+```
+
+---
+
+### Mistake 4: Forgetting Trailing Slash
+
+**Intention**: Block /admin directory
+
+**Wrong**:
+```
+Disallow: /admin
+```
+
+**Result**: Also blocks /admin-panel, /administrator, etc.
+
+**Fix**:
+```
+Disallow: /admin/
+```
+
+---
+
+### Mistake 5: Blocking Entire Site Accidentally
+
+**Wrong**:
+```
+User-agent: *
+Disallow: /
+Allow: /blog/
+```
+
+**Why it's wrong**: Many bots don't support Allow directive.
+
+**Fix**: Use noindex meta tags for pages you don't want indexed, not robots.txt.
+
+---
+
+### Mistake 6: Not Blocking Development Environments
+
+**Wrong**: No robots.txt on staging.example.com
+
+**Result**: Staging site gets indexed.
+
+**Fix**:
+```
+User-agent: *
+Disallow: /
+```
+
+On all non-production environments.
+
+---
+
+### Mistake 7: Case Sensitivity Errors
+
+**Note**: Directives are case-insensitive, but paths are case-sensitive.
+
+**Example**:
+```
+Disallow: /Admin/        # Blocks /Admin/ but not /admin/
+```
+
+**Fix**: Block both if needed:
+```
+Disallow: /admin/
+Disallow: /Admin/
+```
+
+---
+
+## Advanced Patterns
+
+### Wildcard Examples
+
+```
+# Block all PDFs
+Disallow: /*.pdf$
+
+# Block all URLs with parameters
+Disallow: /*?
+
+# Block all URLs ending in .php
+Disallow: /*.php$
+
+# Block all admin paths regardless of location
+Disallow: /*/admin/
+```
+
+### Multiple Sitemaps
+
+```
+Sitemap: https://example.com/sitemap-pages.xml
+Sitemap: https://example.com/sitemap-posts.xml
+Sitemap: https://example.com/sitemap-products.xml
+```
+
+### Bot-Specific Rules
+
+```
+# Aggressive bot - slow it down
+User-agent: BadBot
+Crawl-delay: 60
+Disallow: /
+
+# Good bots - full access
+User-agent: Googlebot
+User-agent: Bingbot
+Disallow:
+
+# Default for others
+User-agent: *
+Crawl-delay: 10
+Disallow: /admin/
+```
+
+---
+
+## Robots.txt vs Meta Robots vs X-Robots-Tag
+
+### When to use each:
+
+**Robots.txt**:
+- Block crawling of entire directories
+- Reduce crawl budget waste
+- Block parameter variations
+- Does NOT prevent indexing if page is linked from elsewhere
+
+**Meta robots tag**:
+- Prevent specific pages from being indexed
+- Control snippet display
+- Control following links
+- Example: `<meta name="robots" content="noindex,follow">`
+
+**X-Robots-Tag HTTP header**:
+- Control non-HTML files (PDFs, images)
+- Server-level control
+- Example: `X-Robots-Tag: noindex`
+
+**Important**: If you don't want a page indexed, use noindex (meta tag or header), NOT robots.txt.
+
+---
+
+## Monitoring and Maintenance
+
+### Regular Checks
+
+**Monthly**:
+- [ ] Verify robots.txt is accessible
+- [ ] Check Search Console for blocked URLs
+- [ ] Review crawl stats for blocked resources
+
+**Quarterly**:
+- [ ] Audit blocked paths - still relevant?
+- [ ] Check for new admin/private sections to block
+- [ ] Review AI crawler landscape (new bots?)
+
+**After site changes**:
+- [ ] Update robots.txt if URL structure changed
+- [ ] Test new sections (should they be blocked?)
+- [ ] Verify sitemaps still referenced
+
+### Search Console Monitoring
+
+Check these reports:
+- **Coverage** → Excluded by robots.txt
+- **Settings** → Crawl stats
+- **URL Inspection** → Test specific URLs
+
+---
+
+## Robots.txt Checklist
+
+Before deploying:
+
+- [ ] File is named exactly `robots.txt` (lowercase)
+- [ ] Located at root domain (`example.com/robots.txt`)
+- [ ] Plain text format (not HTML or PDF)
+- [ ] UTF-8 encoding
+- [ ] No HTML tags in file
+- [ ] All paths start with `/`
+- [ ] Sitemap URLs are absolute
+- [ ] No spaces before colons
+- [ ] Tested in Search Console robots.txt tester
+- [ ] Not blocking important CSS/JS/images
+- [ ] Not blocking content you want indexed
+- [ ] Trailing slashes used correctly for directories
+- [ ] Wildcard patterns tested
+- [ ] File size under 500KB
+
+---
+
+## Emergency Fixes
+
+### Accidentally Blocked Entire Site
+
+**Symptom**: All pages blocked in Search Console
+
+**Fix**:
+1. Edit robots.txt to:
+```
+User-agent: *
+Disallow:
+
+Sitemap: https://example.com/sitemap.xml
+```
+2. Test in Search Console
+3. Request urgent recrawl for key pages
+4. Monitor Coverage report for recovery
+
+**Recovery time**: 1-7 days
+
+---
+
+### Blocked CSS/JS Files
+
+**Symptom**: "Blocked by robots.txt" in Mobile-Friendly Test
+
+**Fix**:
+1. Add Allow directives:
+```
+User-agent: *
+Allow: /css/
+Allow: /js/
+Allow: /wp-content/uploads/
+```
+2. Test in robots.txt tester
+3. Request re-render in URL Inspection tool
+
+---
+
+### Staging Site Indexed
+
+**Symptom**: staging.example.com appears in search results
+
+**Fix**:
+1. Add to staging robots.txt:
+```
+User-agent: *
+Disallow: /
+```
+2. Add noindex meta tag to all staging pages
+3. Remove staging URLs in Search Console (Removals tool)
+
+---
+
+## Resources and Tools
+
+**Testing**:
+- Google Search Console robots.txt tester
+- Bing Webmaster Tools robots.txt analyzer
+- Technical SEO browser extensions
+
+**Validation**:
+- https://www.google.com/webmasters/tools/robots-testing-tool
+- https://en.ryte.com/free-tools/robots-txt/
+- https://technicalseo.com/tools/robots-txt/
+
+**Documentation**:
+- Google: https://developers.google.com/search/docs/crawling-indexing/robots/intro
+- Bing: https://www.bing.com/webmasters/help/robots-txt-validation
+- Robots.txt spec: https://www.robotstxt.org/
diff --git a/.cursor/skills/technical-seo-checker/references/technical-audit-example.md b/.cursor/skills/technical-seo-checker/references/technical-audit-example.md
new file mode 100644
index 0000000000..5f2d6ad434
--- /dev/null
+++ b/.cursor/skills/technical-seo-checker/references/technical-audit-example.md
@@ -0,0 +1,169 @@
+# Technical SEO Checker — Worked Example & Checklist
+
+Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/SKILL.md).
+
+---
+
+## Worked Example
+
+**User**: "Check the technical SEO of cloudhosting.com"
+
+**Output**:
+
+```markdown
+# Technical SEO Audit Report
+
+**Domain**: cloudhosting.com
+**Audit Date**: 2024-09-15
+**Pages Analyzed**: 312
+
+## Crawlability Analysis
+
+### Robots.txt Review
+
+**URL**: cloudhosting.com/robots.txt
+**Status**: Found
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| File exists | ✅ | 200 response |
+| Valid syntax | ⚠️ | Wildcard pattern `Disallow: /*?` too aggressive — blocks faceted pages |
+| Sitemap declared | ❌ | No Sitemap directive in robots.txt |
+| Important pages blocked | ⚠️ | /pricing/ blocked by `Disallow: /pricing` rule |
+| Assets blocked | ✅ | CSS/JS accessible |
+
+**Issues Found**:
+- Sitemap URL not declared in robots.txt
+- `/pricing/` inadvertently blocked — high-value commercial page
+
+### XML Sitemap Review
+
+**Sitemap URL**: cloudhosting.com/sitemap.xml
+**Status**: Found (not referenced in robots.txt)
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| Sitemap exists | ✅ | Valid XML, 287 URLs |
+| Only indexable URLs | ❌ | 23 noindex URLs included |
+| Includes lastmod | ⚠️ | All dates set to 2023-01-01 — not accurate |
+
+**Crawlability Score**: 5/10
+
+## Performance Analysis
+
+### Core Web Vitals
+
+| Metric | Mobile | Desktop | Target | Status |
+|--------|--------|---------|--------|--------|
+| LCP (Largest Contentful Paint) | 4.8s | 2.1s | <2.5s | ❌ Mobile / ✅ Desktop |
+| FID (First Input Delay) | 45ms | 12ms | <100ms | ✅ / ✅ |
+| CLS (Cumulative Layout Shift) | 0.24 | 0.08 | <0.1 | ❌ Mobile / ✅ Desktop |
+| INP (Interaction to Next Paint) | 380ms | 140ms | <200ms | ❌ Mobile / ✅ Desktop |
+
+### Additional Performance Metrics
+
+| Metric | Value | Status |
+|--------|-------|--------|
+| Time to First Byte (TTFB) | 1,240ms | ❌ |
+| Page Size | 3.8MB | ❌ |
+| Requests | 94 | ⚠️ |
+
+**LCP Issues**:
+- Uncompressed hero image (2.4MB PNG): Convert to WebP, est. save 1.9MB
+- No CDN detected: TTFB 1,240ms from origin server
+
+**CLS Issues**:
+- Ad banner at top of page injects without reserved height (0.18 shift contribution)
+
+**Performance Score**: 3/10
+
+## Security Analysis
+
+### HTTPS Status
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| SSL certificate valid | ✅ | Expires: 2025-03-22 |
+| HTTPS enforced | ⚠️ | http://cloudhosting.com returns 200 instead of 301 redirect |
+| Mixed content | ❌ | 7 images loaded over HTTP on /features/ page |
+| HSTS enabled | ❌ | Header not present |
+
+**Security Score**: 5/10
+
+## Structured Data Analysis
+
+### Schema Markup Found
+
+| Schema Type | Pages | Valid | Errors |
+|-------------|-------|-------|--------|
+| Organization | 1 (homepage) | ✅ | None |
+| Article | 0 | — | Missing on 48 blog posts |
+| Product | 0 | — | Missing on 5 plan pages |
+| FAQ | 0 | — | Missing on 12 pages with FAQ content |
+
+**Structured Data Score**: 3/10
+
+## Overall Technical Health: 42/100
+
+```
+Score Breakdown:
+█████░░░░░ Crawlability: 5/10
+██████░░░░ Indexability: 6/10
+███░░░░░░░ Performance: 3/10
+██████░░░░ Mobile: 6/10
+█████░░░░░ Security: 5/10
+██████░░░░ URL Structure: 6/10
+███░░░░░░░ Structured Data: 3/10
+```
+
+## Priority Issues
+
+### 🔴 Critical (Fix Immediately)
+1. **Mobile LCP 4.8s (target <2.5s)** — Compress hero image to WebP (est. save 1.9MB) and implement a CDN to reduce TTFB from 1,240ms to <400ms.
+
+### 🟡 Important (Fix Soon)
+2. **HTTP not redirecting to HTTPS** — Add 301 redirect from http:// to https:// and enable HSTS header. 7 mixed-content images on /features/ need URL updates.
+
+### 🟢 Minor (Optimize)
+3. **No Article/FAQ schema on blog posts** — Add Article schema to 48 blog posts and FAQ schema to 12 FAQ pages for rich result eligibility.
+```
+
+---
+
+## Technical SEO Checklist
+
+```markdown
+### Crawlability
+- [ ] robots.txt is valid and not blocking important content
+- [ ] XML sitemap exists and is submitted to ~~search console
+- [ ] No crawl errors in ~~search console
+- [ ] No redirect chains or loops
+
+### Indexability
+- [ ] Important pages are indexable
+- [ ] Canonical tags are correct
+- [ ] No duplicate content issues
+- [ ] Pagination is handled correctly
+
+### Performance
+- [ ] Core Web Vitals pass
+- [ ] Page speed under 3 seconds
+- [ ] Images are optimized
+- [ ] JS/CSS are minified
+
+### Mobile
+- [ ] Mobile-friendly test passes
+- [ ] Viewport is configured
+- [ ] Touch elements are properly sized
+
+### Security
+- [ ] HTTPS is enforced
+- [ ] SSL certificate is valid
+- [ ] No mixed content
+- [ ] Security headers present
+
+### Structure
+- [ ] URLs are clean and descriptive
+- [ ] Site architecture is logical
+- [ ] Internal linking is strong
+```
diff --git a/.cursor/skills/technical-seo-checker/references/technical-audit-templates.md b/.cursor/skills/technical-seo-checker/references/technical-audit-templates.md
new file mode 100644
index 0000000000..71af860ccf
--- /dev/null
+++ b/.cursor/skills/technical-seo-checker/references/technical-audit-templates.md
@@ -0,0 +1,311 @@
+# Technical SEO Checker — Output Templates
+
+Detailed output templates for technical-seo-checker steps 3-9. Referenced from [SKILL.md](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/optimize/technical-seo-checker/SKILL.md).
+
+---
+
+## Step 3: Audit Site Speed & Core Web Vitals
+
+```markdown
+## Performance Analysis
+
+### Core Web Vitals
+
+| Metric | Mobile | Desktop | Target | Status |
+|--------|--------|---------|--------|--------|
+| LCP (Largest Contentful Paint) | [X]s | [X]s | <2.5s | ✅/⚠️/❌ |
+| FID (First Input Delay) | [X]ms | [X]ms | <100ms | ✅/⚠️/❌ |
+| CLS (Cumulative Layout Shift) | [X] | [X] | <0.1 | ✅/⚠️/❌ |
+| INP (Interaction to Next Paint) | [X]ms | [X]ms | <200ms | ✅/⚠️/❌ |
+
+### Additional Performance Metrics
+
+| Metric | Value | Status |
+|--------|-------|--------|
+| Time to First Byte (TTFB) | [X]ms | ✅/⚠️/❌ |
+| First Contentful Paint (FCP) | [X]s | ✅/⚠️/❌ |
+| Speed Index | [X] | ✅/⚠️/❌ |
+| Total Blocking Time | [X]ms | ✅/⚠️/❌ |
+| Page Size | [X]MB | ✅/⚠️/❌ |
+| Requests | [X] | ✅/⚠️/❌ |
+
+### Performance Issues
+
+**LCP Issues**:
+- [Issue]: [Impact] - [Solution]
+- [Issue]: [Impact] - [Solution]
+
+**CLS Issues**:
+- [Issue]: [Impact] - [Solution]
+
+**Resource Loading**:
+| Resource Type | Count | Size | Issues |
+|---------------|-------|------|--------|
+| Images | [X] | [X]MB | [notes] |
+| JavaScript | [X] | [X]MB | [notes] |
+| CSS | [X] | [X]KB | [notes] |
+| Fonts | [X] | [X]KB | [notes] |
+
+### Optimization Recommendations
+
+**High Impact**:
+1. [Recommendation] - Est. improvement: [X]s
+2. [Recommendation] - Est. improvement: [X]s
+
+**Medium Impact**:
+1. [Recommendation]
+2. [Recommendation]
+
+**Performance Score**: [X]/10
+```
+
+---
+
+## Step 4: Audit Mobile-Friendliness
+
+```markdown
+## Mobile Optimization Analysis
+
+### Mobile-Friendly Test
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| Mobile-friendly overall | ✅/❌ | [notes] |
+| Viewport configured | ✅/❌ | [viewport tag] |
+| Text readable | ✅/⚠️/❌ | Font size: [X]px |
+| Tap targets sized | ✅/⚠️/❌ | [notes] |
+| Content fits viewport | ✅/❌ | [notes] |
+| No horizontal scroll | ✅/❌ | [notes] |
+
+### Responsive Design Check
+
+| Element | Desktop | Mobile | Issues |
+|---------|---------|--------|--------|
+| Navigation | [status] | [status] | [notes] |
+| Images | [status] | [status] | [notes] |
+| Forms | [status] | [status] | [notes] |
+| Tables | [status] | [status] | [notes] |
+| Videos | [status] | [status] | [notes] |
+
+### Mobile-First Indexing
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| Mobile version has all content | ✅/⚠️/❌ | [notes] |
+| Mobile has same structured data | ✅/⚠️/❌ | [notes] |
+| Mobile has same meta tags | ✅/⚠️/❌ | [notes] |
+| Mobile images have alt text | ✅/⚠️/❌ | [notes] |
+
+**Mobile Score**: [X]/10
+```
+
+---
+
+## Step 5: Audit Security & HTTPS
+
+```markdown
+## Security Analysis
+
+### HTTPS Status
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| SSL certificate valid | ✅/❌ | Expires: [date] |
+| HTTPS enforced | ✅/❌ | [redirects properly?] |
+| Mixed content | ✅/⚠️/❌ | [X] issues |
+| HSTS enabled | ✅/⚠️ | [notes] |
+| Certificate chain | ✅/⚠️/❌ | [notes] |
+
+### Security Headers
+
+| Header | Present | Value | Recommended |
+|--------|---------|-------|-------------|
+| Content-Security-Policy | ✅/❌ | [value] | [recommendation] |
+| X-Frame-Options | ✅/❌ | [value] | DENY or SAMEORIGIN |
+| X-Content-Type-Options | ✅/❌ | [value] | nosniff |
+| X-XSS-Protection | ✅/❌ | [value] | 1; mode=block |
+| Referrer-Policy | ✅/❌ | [value] | [recommendation] |
+
+**Security Score**: [X]/10
+```
+
+---
+
+## Step 6: Audit URL Structure
+
+```markdown
+## URL Structure Analysis
+
+### URL Pattern Review
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| HTTPS URLs | ✅/⚠️/❌ | [X]% HTTPS |
+| Lowercase URLs | ✅/⚠️/❌ | [notes] |
+| No special characters | ✅/⚠️/❌ | [notes] |
+| Readable/descriptive | ✅/⚠️/❌ | [notes] |
+| Appropriate length | ✅/⚠️/❌ | Avg: [X] chars |
+| Keywords in URLs | ✅/⚠️/❌ | [notes] |
+| Consistent structure | ✅/⚠️/❌ | [notes] |
+
+### URL Issues Found
+
+| Issue Type | Count | Examples |
+|------------|-------|----------|
+| Dynamic parameters | [X] | [URLs] |
+| Session IDs in URLs | [X] | [URLs] |
+| Uppercase characters | [X] | [URLs] |
+| Special characters | [X] | [URLs] |
+| Very long URLs (>100) | [X] | [URLs] |
+
+### Redirect Analysis
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| Redirect chains | [X] found | [max chain length] |
+| Redirect loops | [X] found | [URLs] |
+| 302 → 301 needed | [X] found | [URLs] |
+| Broken redirects | [X] found | [URLs] |
+
+**URL Score**: [X]/10
+```
+
+---
+
+## Step 7: Audit Structured Data
+
+> **CORE-EEAT alignment**: Schema markup quality maps to O05 (Schema Markup) in the CORE-EEAT benchmark. See [content-quality-auditor](https://github.com/aaron-he-zhu/seo-geo-claude-skills/blob/main/cross-cutting/content-quality-auditor/SKILL.md) for full content quality audit.
+
+```markdown
+## Structured Data Analysis
+
+### Schema Markup Found
+
+| Schema Type | Pages | Valid | Errors |
+|-------------|-------|-------|--------|
+| [Type 1] | [X] | ✅/❌ | [errors] |
+| [Type 2] | [X] | ✅/❌ | [errors] |
+
+### Validation Results
+
+**Errors**:
+- [Error 1]: [affected pages] - [solution]
+- [Error 2]: [affected pages] - [solution]
+
+**Warnings**:
+- [Warning 1]: [notes]
+
+### Missing Schema Opportunities
+
+| Page Type | Current Schema | Recommended |
+|-----------|----------------|-------------|
+| Blog posts | [current] | Article + FAQ |
+| Products | [current] | Product + Review |
+| Homepage | [current] | Organization |
+
+**Structured Data Score**: [X]/10
+```
+
+---
+
+## Step 8: Audit International SEO (if applicable)
+
+```markdown
+## International SEO Analysis
+
+### Hreflang Implementation
+
+| Check | Status | Notes |
+|-------|--------|-------|
+| Hreflang tags present | ✅/❌ | [notes] |
+| Self-referencing | ✅/⚠️/❌ | [notes] |
+| Return tags present | ✅/⚠️/❌ | [notes] |
+| Valid language codes | ✅/⚠️/❌ | [notes] |
+| x-default tag | ✅/⚠️ | [notes] |
+
+### Language/Region Targeting
+
+| Language | URL | Hreflang | Status |
+|----------|-----|----------|--------|
+| [en-US] | [URL] | [tag] | ✅/⚠️/❌ |
+| [es-ES] | [URL] | [tag] | ✅/⚠️/❌ |
+
+**International Score**: [X]/10
+```
+
+---
+
+## Step 9: Generate Technical Audit Summary
+
+```markdown
+# Technical SEO Audit Report
+
+**Domain**: [domain]
+**Audit Date**: [date]
+**Pages Analyzed**: [X]
+
+## Overall Technical Health: [X]/100
+
+```
+Score Breakdown:
+████████░░ Crawlability: 8/10
+███████░░░ Indexability: 7/10
+█████░░░░░ Performance: 5/10
+████████░░ Mobile: 8/10
+█████████░ Security: 9/10
+██████░░░░ URL Structure: 6/10
+█████░░░░░ Structured Data: 5/10
+```
+
+## Critical Issues (Fix Immediately)
+
+1. **[Issue]**: [Impact]
+   - Affected: [pages/scope]
+   - Solution: [specific fix]
+   - Priority: 🔴 Critical
+
+2. **[Issue]**: [Impact]
+   - Affected: [pages/scope]
+   - Solution: [specific fix]
+   - Priority: 🔴 Critical
+
+## High Priority Issues
+
+1. **[Issue]**: [Solution]
+2. **[Issue]**: [Solution]
+
+## Medium Priority Issues
+
+1. **[Issue]**: [Solution]
+2. **[Issue]**: [Solution]
+
+## Quick Wins
+
+These can be fixed quickly for immediate improvement:
+
+1. [Quick fix 1]
+2. [Quick fix 2]
+3. [Quick fix 3]
+
+## Implementation Roadmap
+
+### Week 1: Critical Fixes
+- [ ] [Task 1]
+- [ ] [Task 2]
+
+### Week 2-3: High Priority
+- [ ] [Task 1]
+- [ ] [Task 2]
+
+### Week 4+: Optimization
+- [ ] [Task 1]
+- [ ] [Task 2]
+
+## Monitoring Recommendations
+
+Set up alerts for:
+- Core Web Vitals drops
+- Crawl error spikes
+- Index coverage changes
+- Security issues
+```
diff --git a/.cursor/skills/ugc-brief/.skillfish.json b/.cursor/skills/ugc-brief/.skillfish.json
new file mode 100644
index 0000000000..426656d0a6
--- /dev/null
+++ b/.cursor/skills/ugc-brief/.skillfish.json
@@ -0,0 +1,10 @@
+{
+  "version": 2,
+  "name": "ugc-brief",
+  "owner": "walkerhi11",
+  "repo": "mediabuy-plugins",
+  "path": ".claude/skills/ugc-brief",
+  "branch": "main",
+  "sha": "98bdcf49353c883665432be36f81b94aee6f188d",
+  "source": "manual"
+}
\ No newline at end of file
diff --git a/.cursor/skills/ugc-brief/SKILL.md b/.cursor/skills/ugc-brief/SKILL.md
new file mode 100644
index 0000000000..35b1c6f24a
--- /dev/null
+++ b/.cursor/skills/ugc-brief/SKILL.md
@@ -0,0 +1,260 @@
+---
+name: ugc-brief
+description: Create briefs for UGC creators including product info, hook requirements, talking points, and authenticity guidelines. Use when working with UGC creators, preparing creator packages, or scaling content production with external talent.
+---
+
+# UGC Brief Generator
+
+Create creator-ready briefs for authentic UGC content.
+
+## Process
+
+### Step 1: Define Product & Key Messages
+
+**Product Overview:**
+- Product name and category
+- Key benefits (top 3)
+- Unique mechanism
+- Price point/offer
+- Target customer
+
+**Core Messages:**
+- Primary claim/promise
+- Secondary proof points
+- Must-mention features
+- Compliance requirements
+
+### Step 2: Specify Hook Requirements
+
+**Hook Type Needed:**
+- [ ] Greed (savings, value)
+- [ ] Relevancy (timely, seasonal)
+- [ ] Emotion (reaction, story)
+- [ ] Demographic (call out audience)
+- [ ] Cliffhanger (curiosity gap)
+
+**Hook Examples to Reference:**
+- "[Winning hook from brand]"
+- "[Competitor hook working well]"
+
+**Hook Don'ts:**
+- No scripted openings
+- No "Hey guys" generic starts
+- No product name in first 3 seconds
+
+### Step 3: List Talking Points
+
+**Required Elements:**
+1. Problem acknowledgment
+2. Discovery story or proof
+3. Product solution
+4. Specific benefit/result
+5. Call to action
+
+**Proof Points to Include:**
+- [ ] Personal result/experience
+- [ ] Specific numbers if applicable
+- [ ] Before/after if relevant
+- [ ] Endorsement of mechanism
+
+**Optional Elements:**
+- Unboxing moment
+- Application/usage demo
+- Comparison to alternative
+- Response to objection
+
+### Step 4: Include Authenticity Guidelines
+
+**DO:**
+- Speak naturally, not scripted
+- Use your own words
+- Show genuine reactions
+- Include imperfections
+- Film in natural lighting
+- Use phone (not professional camera)
+- Look at camera like talking to friend
+
+**DON'T:**
+- Read from script (bullet points OK)
+- Over-polish or heavily edit
+- Use professional lighting/studio
+- Add music or effects
+- Sound like a commercial
+- Fake enthusiasm
+- Use brand jargon unnaturally
+
+**Authenticity Test:**
+"Would you actually send this video to a friend recommending the product?"
+
+### Step 5: Output Creator Brief
+
+```
+## UGC BRIEF: [Product Name]
+
+### CREATOR DETAILS
+- Creator name: [If assigned]
+- Platform: [TikTok/IG/FB]
+- Format: [Vertical 9:16]
+- Length: [15-60 seconds]
+- Deliverable: [Raw footage / Edited]
+
+---
+
+### PRODUCT INFO
+
+**Product:** [Name]
+**What it does:** [One sentence]
+**Key benefit:** [Primary promise]
+**Price/Offer:** [Current deal]
+**Website:** [URL]
+
+**Unique selling point:**
+[What makes this different]
+
+---
+
+### YOUR ANGLE
+
+**You are:** [Avatar description]
+- Age range: [X-X]
+- Situation: [Context]
+- Problem you had: [Pain point]
+- How you found product: [Discovery]
+
+---
+
+### HOOK (First 3 seconds)
+
+**Type:** [Hook category]
+
+**Examples to inspire (don't copy exactly):**
+- "[Example hook 1]"
+- "[Example hook 2]"
+
+**Requirements:**
+- Stop the scroll immediately
+- Don't mention product name yet
+- Create curiosity or relate to problem
+
+---
+
+### TALKING POINTS (Middle)
+
+Cover these in your own words:
+
+1. **Problem:** [Describe the struggle]
+   - Your words: _______________
+
+2. **Discovery:** [How you found it]
+   - Your words: _______________
+
+3. **Experience:** [What happened when you tried it]
+   - Your words: _______________
+
+4. **Proof:** [Specific result or benefit]
+   - Your words: _______________
+
+**Must mention:**
+- [ ] [Required element 1]
+- [ ] [Required element 2]
+
+**Nice to include:**
+- [ ] [Optional element]
+
+---
+
+### CALL TO ACTION (Last 5 seconds)
+
+**Say something like:**
+- "Link is in my bio"
+- "Tap the link to try it"
+- "I put the link below"
+
+**Add urgency if natural:**
+- "They're running a sale right now"
+- "I heard they sell out fast"
+
+---
+
+### FILMING GUIDELINES
+
+**Setting:**
+- [ ] Natural environment (home, outside)
+- [ ] Good natural lighting (face a window)
+- [ ] Clean but not perfect background
+- [ ] Vertical format (phone camera)
+
+**Style:**
+- [ ] Talking to a friend, not performing
+- [ ] Eye contact with camera
+- [ ] Natural energy (match your personality)
+- [ ] OK to have mistakes/restarts in raw footage
+
+**Technical:**
+- [ ] Film in 1080p or 4K
+- [ ] Clear audio (quiet space)
+- [ ] Multiple takes encouraged
+- [ ] Send raw footage (we'll edit)
+
+---
+
+### DO NOT
+
+- Script word-for-word
+- Use professional equipment
+- Add filters or effects
+- Sound like an infomercial
+- Make claims we can't substantiate
+- Mention competitors by name
+
+---
+
+### DELIVERABLES
+
+**Send:**
+- [ ] Raw video file(s)
+- [ ] Multiple takes if available
+- [ ] Any B-roll of product
+
+**Format:** [MP4/MOV]
+**Due:** [Date]
+**Send to:** [Email/Platform]
+
+---
+
+### PAYMENT & RIGHTS
+
+**Compensation:** [Amount/Terms]
+**Usage rights:** [Perpetual/Limited]
+**Exclusivity:** [Yes/No/Terms]
+
+---
+
+### REFERENCE VIDEOS
+[Links to examples of style wanted]
+
+### QUESTIONS?
+Contact: [Name/Email]
+```
+
+## UGC Best Practices (LeadsIcon/Kamal)
+
+**Why Native Beats Polish:**
+- Audiences smell BS instantly
+- Platform algorithms favor native content
+- Trust comes from relatability
+- Perfect = ad = skip
+
+**Creator Selection:**
+- Match target demographic
+- Natural on-camera presence
+- Actually would use product
+- Not over-produced style
+
+**Volume Strategy:**
+- Multiple creators same brief
+- Each brings unique delivery
+- Mix and match best clips
+- Keep testing new faces
+
+Source: LeadsIcon, Kamal FounderAds
diff --git a/.cursor/skills/vercel-react-best-practices/AGENTS.md b/.cursor/skills/vercel-react-best-practices/AGENTS.md
index 94c3c84413..2b839ab516 100644
--- a/.cursor/skills/vercel-react-best-practices/AGENTS.md
+++ b/.cursor/skills/vercel-react-best-practices/AGENTS.md
@@ -549,6 +549,8 @@ Preload heavy bundles before they're needed to reduce perceived latency.
 **Example: preload on hover/focus**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function EditorButton({ onClick }: { onClick: () => void }) {
   const preload = () => {
     if (typeof window !== 'undefined') {
@@ -557,13 +559,13 @@ function EditorButton({ onClick }: { onClick: () => void }) {
   }
 
   return (
-    <button
+    <Button
       onMouseEnter={preload}
       onFocus={preload}
       onClick={onClick}
     >
       Open Editor
-    </button>
+    </Button>
   )
 }
 ```
@@ -1239,11 +1241,12 @@ function StaticContent() {
 **For mutations:**
 
 ```tsx
+import { Button } from '@/components/ui/button'
 import { useSWRMutation } from 'swr/mutation'
 
 function UpdateButton() {
   const { trigger } = useSWRMutation('/api/user', updateUser)
-  return <button onClick={() => trigger()}>Update</button>
+  return <Button onClick={() => trigger()}>Update</Button>
 }
 ```
 
@@ -1369,6 +1372,8 @@ Don't subscribe to dynamic state (searchParams, localStorage) if you only read i
 **Incorrect: subscribes to all searchParams changes**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function ShareButton({ chatId }: { chatId: string }) {
   const searchParams = useSearchParams()
 
@@ -1377,13 +1382,15 @@ function ShareButton({ chatId }: { chatId: string }) {
     shareChat(chatId, { ref })
   }
 
-  return <button onClick={handleShare}>Share</button>
+  return <Button onClick={handleShare}>Share</Button>
 }
 ```
 
 **Correct: reads on demand, no subscription**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function ShareButton({ chatId }: { chatId: string }) {
   const handleShare = () => {
     const params = new URLSearchParams(window.location.search)
@@ -1391,7 +1398,7 @@ function ShareButton({ chatId }: { chatId: string }) {
     shareChat(chatId, { ref })
   }
 
-  return <button onClick={handleShare}>Share</button>
+  return <Button onClick={handleShare}>Share</Button>
 }
 ```
 
@@ -1549,6 +1556,8 @@ If a side effect is triggered by a specific user action (submit, click, drag), r
 **Incorrect: event modeled as state + effect**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function Form() {
   const [submitted, setSubmitted] = useState(false)
   const theme = useContext(ThemeContext)
@@ -1560,13 +1569,15 @@ function Form() {
     }
   }, [submitted, theme])
 
-  return <button onClick={() => setSubmitted(true)}>Submit</button>
+  return <Button onClick={() => setSubmitted(true)}>Submit</Button>
 }
 ```
 
 **Correct: do it in the handler**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function Form() {
   const theme = useContext(ThemeContext)
 
@@ -1575,7 +1586,7 @@ function Form() {
     showToast('Registered', theme)
   }
 
-  return <button onClick={handleSubmit}>Submit</button>
+  return <Button onClick={handleSubmit}>Submit</Button>
 }
 ```
 
diff --git a/.cursor/skills/vercel-react-best-practices/rules/bundle-preload.md b/.cursor/skills/vercel-react-best-practices/rules/bundle-preload.md
index 7000504065..0662ef81bd 100644
--- a/.cursor/skills/vercel-react-best-practices/rules/bundle-preload.md
+++ b/.cursor/skills/vercel-react-best-practices/rules/bundle-preload.md
@@ -12,6 +12,8 @@ Preload heavy bundles before they're needed to reduce perceived latency.
 **Example (preload on hover/focus):**
 
 ```tsx
+import { Button } from "@/components/ui/button"
+
 function EditorButton({ onClick }: { onClick: () => void }) {
   const preload = () => {
     if (typeof window !== 'undefined') {
@@ -20,13 +22,13 @@ function EditorButton({ onClick }: { onClick: () => void }) {
   }
 
   return (
-    <button
+    <Button
       onMouseEnter={preload}
       onFocus={preload}
       onClick={onClick}
     >
       Open Editor
-    </button>
+    </Button>
   )
 }
 ```
diff --git a/.cursor/skills/vercel-react-best-practices/rules/client-swr-dedup.md b/.cursor/skills/vercel-react-best-practices/rules/client-swr-dedup.md
index 2a430f27fd..22d419bcaf 100644
--- a/.cursor/skills/vercel-react-best-practices/rules/client-swr-dedup.md
+++ b/.cursor/skills/vercel-react-best-practices/rules/client-swr-dedup.md
@@ -45,11 +45,12 @@ function StaticContent() {
 **For mutations:**
 
 ```tsx
+import { Button } from '@/components/ui/button'
 import { useSWRMutation } from 'swr/mutation'
 
 function UpdateButton() {
   const { trigger } = useSWRMutation('/api/user', updateUser)
-  return <button onClick={() => trigger()}>Update</button>
+  return <Button onClick={() => trigger()}>Update</Button>
 }
 ```
 
diff --git a/.cursor/skills/vercel-react-best-practices/rules/rerender-defer-reads.md b/.cursor/skills/vercel-react-best-practices/rules/rerender-defer-reads.md
index e867c95f02..94410bc5b9 100644
--- a/.cursor/skills/vercel-react-best-practices/rules/rerender-defer-reads.md
+++ b/.cursor/skills/vercel-react-best-practices/rules/rerender-defer-reads.md
@@ -12,6 +12,8 @@ Don't subscribe to dynamic state (searchParams, localStorage) if you only read i
 **Incorrect (subscribes to all searchParams changes):**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function ShareButton({ chatId }: { chatId: string }) {
   const searchParams = useSearchParams()
 
@@ -20,13 +22,15 @@ function ShareButton({ chatId }: { chatId: string }) {
     shareChat(chatId, { ref })
   }
 
-  return <button onClick={handleShare}>Share</button>
+  return <Button onClick={handleShare}>Share</Button>
 }
 ```
 
 **Correct (reads on demand, no subscription):**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function ShareButton({ chatId }: { chatId: string }) {
   const handleShare = () => {
     const params = new URLSearchParams(window.location.search)
@@ -34,6 +38,6 @@ function ShareButton({ chatId }: { chatId: string }) {
     shareChat(chatId, { ref })
   }
 
-  return <button onClick={handleShare}>Share</button>
+  return <Button onClick={handleShare}>Share</Button>
 }
 ```
diff --git a/.cursor/skills/vercel-react-best-practices/rules/rerender-move-effect-to-event.md b/.cursor/skills/vercel-react-best-practices/rules/rerender-move-effect-to-event.md
index dd58a1af09..299815d697 100644
--- a/.cursor/skills/vercel-react-best-practices/rules/rerender-move-effect-to-event.md
+++ b/.cursor/skills/vercel-react-best-practices/rules/rerender-move-effect-to-event.md
@@ -12,6 +12,8 @@ If a side effect is triggered by a specific user action (submit, click, drag), r
 **Incorrect (event modeled as state + effect):**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function Form() {
   const [submitted, setSubmitted] = useState(false)
   const theme = useContext(ThemeContext)
@@ -23,13 +25,15 @@ function Form() {
     }
   }, [submitted, theme])
 
-  return <button onClick={() => setSubmitted(true)}>Submit</button>
+  return <Button onClick={() => setSubmitted(true)}>Submit</Button>
 }
 ```
 
 **Correct (do it in the handler):**
 
 ```tsx
+import { Button } from '@/components/ui/button'
+
 function Form() {
   const theme = useContext(ThemeContext)
 
@@ -38,7 +42,7 @@ function Form() {
     showToast('Registered', theme)
   }
 
-  return <button onClick={handleSubmit}>Submit</button>
+  return <Button onClick={handleSubmit}>Submit</Button>
 }
 ```
 
diff --git a/.github/workflows/desktop-release.yml b/.github/workflows/desktop-release.yml
index b5536eb34d..ad1c128bcf 100644
--- a/.github/workflows/desktop-release.yml
+++ b/.github/workflows/desktop-release.yml
@@ -22,6 +22,7 @@ on:
 
 permissions:
   contents: write
+  id-token: write
 
 jobs:
   build:
@@ -58,6 +59,22 @@ jobs:
           fi
           echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT"
 
+      - name: Detect Windows signing eligibility
+        id: sign
+        shell: bash
+        run: |
+          # Sign Windows builds only on production v* tags (not beta-v*, not workflow_dispatch).
+          # This matches the single OIDC federated credential configured in Entra ID.
+          if [ "${{ matrix.os }}" = "windows-latest" ] \
+            && [ "${{ github.event_name }}" = "push" ] \
+            && [[ "$GITHUB_REF" == refs/tags/v* ]]; then
+            echo "enabled=true" >> "$GITHUB_OUTPUT"
+            echo "Windows signing: ENABLED (v* tag on windows-latest)"
+          else
+            echo "enabled=false" >> "$GITHUB_OUTPUT"
+            echo "Windows signing: skipped"
+          fi
+
       - name: Setup pnpm
         uses: pnpm/action-setup@v5
 
@@ -98,7 +115,44 @@ jobs:
 
       - name: Package & Publish
         shell: bash
-        run: pnpm exec electron-builder ${{ matrix.platform }} --config electron-builder.yml --publish ${{ inputs.publish || 'always' }} -c.extraMetadata.version=${{ steps.version.outputs.VERSION }}
+        run: |
+          CMD=(pnpm exec electron-builder ${{ matrix.platform }} \
+            --config electron-builder.yml \
+            --publish "${{ inputs.publish || 'always' }}" \
+            -c.extraMetadata.version="${{ steps.version.outputs.VERSION }}")
+
+          if [ "${{ steps.sign.outputs.enabled }}" = "true" ]; then
+            CMD+=(-c.win.azureSignOptions.publisherName="$WINDOWS_PUBLISHER_NAME")
+            CMD+=(-c.win.azureSignOptions.endpoint="$AZURE_CODESIGN_ENDPOINT")
+            CMD+=(-c.win.azureSignOptions.codeSigningAccountName="$AZURE_CODESIGN_ACCOUNT")
+            CMD+=(-c.win.azureSignOptions.certificateProfileName="$AZURE_CODESIGN_PROFILE")
+          fi
+
+          "${CMD[@]}"
         working-directory: surfsense_desktop
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WINDOWS_PUBLISHER_NAME: ${{ vars.WINDOWS_PUBLISHER_NAME }}
+          AZURE_CODESIGN_ENDPOINT: ${{ vars.AZURE_CODESIGN_ENDPOINT }}
+          AZURE_CODESIGN_ACCOUNT: ${{ vars.AZURE_CODESIGN_ACCOUNT }}
+          AZURE_CODESIGN_PROFILE: ${{ vars.AZURE_CODESIGN_PROFILE }}
+          # macOS Developer ID signing + notarization. Only the macos-latest runner
+          # consumes these; Windows/Linux runners ignore them. CSC_LINK accepts either
+          # a file path or a base64-encoded .p12 blob — electron-builder auto-detects.
+          CSC_LINK: ${{ secrets.MAC_CERT_P12_BASE64 }}
+          CSC_KEY_PASSWORD: ${{ secrets.MAC_CERT_PASSWORD }}
+          APPLE_ID: ${{ secrets.APPLE_ID }}
+          APPLE_APP_SPECIFIC_PASSWORD: ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }}
+          APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
+          # TEMP DEBUG — remove once the codesign hang on macos-latest is diagnosed.
+          # Surfaces the exact codesign / notarize commands electron-builder spawns,
+          # so we can see which subprocess hangs.
+          DEBUG: electron-builder,electron-osx-sign*,@electron/notarize*
+          ELECTRON_BUILDER_ALLOW_UNRESOLVED_DEPENDENCIES: "true"
+          # Service principal credentials for Azure.Identity EnvironmentCredential used by the
+          # TrustedSigning PowerShell module. Only populated when signing is enabled.
+          # electron-builder 26 does not yet support OIDC federated tokens for Azure signing,
+          # so we fall back to client-secret auth. Rotate AZURE_CLIENT_SECRET before expiry.
+          AZURE_TENANT_ID: ${{ steps.sign.outputs.enabled == 'true' && secrets.AZURE_TENANT_ID || '' }}
+          AZURE_CLIENT_ID: ${{ steps.sign.outputs.enabled == 'true' && secrets.AZURE_CLIENT_ID || '' }}
+          AZURE_CLIENT_SECRET: ${{ steps.sign.outputs.enabled == 'true' && secrets.AZURE_CLIENT_SECRET || '' }}
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index 9b3b3eb510..d5c0113e93 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -38,7 +38,7 @@ jobs:
       new_tag: ${{ steps.tag_version.outputs.next_version }}
     steps:
       - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
           ref: ${{ github.event.inputs.branch }}
@@ -115,16 +115,18 @@ jobs:
             name: surfsense-backend
             context: ./surfsense_backend
             file: ./surfsense_backend/Dockerfile
+            target: production
           - image: web
             name: surfsense-web
             context: ./surfsense_web
             file: ./surfsense_web/Dockerfile
+            target: runner
     env:
       REGISTRY_IMAGE: ghcr.io/${{ github.repository_owner }}/${{ matrix.name }}
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
 
       - name: Set lowercase image name
         id: image
@@ -132,19 +134,19 @@ jobs:
 
       - name: Docker meta
         id: meta
-        uses: docker/metadata-action@v5
+        uses: docker/metadata-action@v6
         with:
           images: ${{ steps.image.outputs.name }}
 
       - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
+        uses: docker/login-action@v4
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v4
 
       - name: Free up disk space
         run: |
@@ -156,10 +158,11 @@ jobs:
 
       - name: Build and push by digest ${{ matrix.name }} (${{ matrix.suffix }})
         id: build
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v7
         with:
           context: ${{ matrix.context }}
           file: ${{ matrix.file }}
+          target: ${{ matrix.target }}
           labels: ${{ steps.meta.outputs.labels }}
           tags: ${{ steps.image.outputs.name }}
           outputs: type=image,push-by-digest=true,name-canonical=true,push=true
@@ -181,7 +184,7 @@ jobs:
           touch "/tmp/digests/${digest#sha256:}"
 
       - name: Upload digest
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v7
         with:
           name: digests-${{ matrix.image }}-${{ matrix.suffix }}
           path: /tmp/digests/*
@@ -212,22 +215,22 @@ jobs:
         run: echo "name=${REGISTRY_IMAGE,,}" >> $GITHUB_OUTPUT
 
       - name: Download amd64 digest
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v8
         with:
           name: digests-${{ matrix.image }}-amd64
           path: /tmp/digests
 
       - name: Download arm64 digest
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v8
         with:
           name: digests-${{ matrix.image }}-arm64
           path: /tmp/digests
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v4
 
       - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
+        uses: docker/login-action@v4
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
@@ -246,7 +249,7 @@ jobs:
 
       - name: Docker meta
         id: meta
-        uses: docker/metadata-action@v5
+        uses: docker/metadata-action@v6
         with:
           images: ${{ steps.image.outputs.name }}
           tags: |
diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
new file mode 100644
index 0000000000..b87537dab3
--- /dev/null
+++ b/.github/workflows/e2e-tests.yml
@@ -0,0 +1,174 @@
+name: E2E Tests
+
+on:
+  pull_request:
+    branches: [main, dev]
+    types: [opened, synchronize, reopened, ready_for_review]
+    paths:
+      - 'surfsense_web/**'
+      - 'surfsense_backend/**'
+      - 'docker/docker-compose.e2e.yml'
+      - '.github/workflows/e2e-tests.yml'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    name: Journey
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.draft == false
+    timeout-minutes: 30
+
+    env:
+      # Test user that the backend creates via /auth/register before Playwright runs.
+      PLAYWRIGHT_TEST_EMAIL: e2e-test@surfsense.net
+      PLAYWRIGHT_TEST_PASSWORD: E2eTestPassword123!
+      # Frontend env: Playwright's webServer (surfsense_web/playwright.config.ts)
+      # spawns `pnpm build && pnpm start` in CI; these get baked into the build.
+      NEXT_PUBLIC_FASTAPI_BACKEND_URL: http://localhost:8000
+      NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: LOCAL
+      # Shared secret for the test-only POST /__e2e__/auth/token endpoint.
+      # Must match docker-compose.e2e.yml's backend env (x-backend-env).
+      E2E_MINT_SECRET: e2e-mint-secret-not-for-production
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v4
+
+      # ─── Backend stack ─────────────────────────────────────────────────
+      # Builds the e2e image (multi-stage, deps cached via GHA), brings up
+      # db + redis + backend + celery_worker, blocks until every healthcheck
+      # is green. No `uv` invocation on the runner; no PID files; no curl
+      # polling loops; readiness is gated by Docker healthchecks.
+      - name: Build & start backend stack
+        run: |
+          docker compose -f docker/docker-compose.e2e.yml \
+            up -d --build --wait --wait-timeout 300
+
+      - name: Show backend stack status
+        if: always()
+        run: docker compose -f docker/docker-compose.e2e.yml ps
+
+      - name: Register E2E test user
+        run: |
+          # 200/201 = created, 400 = already exists (idempotent across reruns).
+          STATUS=$(curl -s -o /tmp/register.json -w "%{http_code}" \
+            -X POST http://localhost:8000/auth/register \
+            -H "Content-Type: application/json" \
+            -d "{\"email\":\"${PLAYWRIGHT_TEST_EMAIL}\",\"password\":\"${PLAYWRIGHT_TEST_PASSWORD}\"}")
+          echo "Register status: ${STATUS}"
+          cat /tmp/register.json
+          if [ "${STATUS}" != "200" ] && [ "${STATUS}" != "201" ] && [ "${STATUS}" != "400" ]; then
+            echo "::error::Failed to register test user (status ${STATUS})"
+            exit 1
+          fi
+
+          # Flush auth rate-limit counters so Playwright starts clean.
+          docker compose -f docker/docker-compose.e2e.yml exec -T redis \
+            sh -c "redis-cli --scan --pattern 'surfsense:auth_rate_limit:*' \
+              | xargs -r redis-cli DEL" || true
+
+      # ─── Frontend (host-side) ──────────────────────────────────────────
+      # Playwright's webServer block in playwright.config.ts spawns
+      # `pnpm build && pnpm start` in CI mode and waits for :3000.
+      - uses: actions/setup-node@v6
+        with:
+          node-version: '20'
+
+      - uses: pnpm/action-setup@v6
+
+      - name: Get pnpm store directory
+        id: pnpm-cache
+        shell: bash
+        run: echo "STORE_PATH=$(pnpm store path --silent)" >> $GITHUB_OUTPUT
+
+      - name: Cache pnpm store
+        uses: actions/cache@v5
+        with:
+          path: ${{ steps.pnpm-cache.outputs.STORE_PATH }}
+          key: pnpm-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}
+          restore-keys: pnpm-${{ runner.os }}-
+
+      - name: Install web dependencies
+        working-directory: surfsense_web
+        run: pnpm install --frozen-lockfile
+
+      - name: Cache Playwright browsers
+        id: playwright-cache
+        uses: actions/cache@v5
+        with:
+          path: ~/.cache/ms-playwright
+          key: playwright-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}
+
+      - name: Install Playwright browsers
+        if: steps.playwright-cache.outputs.cache-hit != 'true'
+        working-directory: surfsense_web
+        run: pnpm exec playwright install --with-deps chromium
+
+      - name: Install Playwright system deps (cache hit)
+        if: steps.playwright-cache.outputs.cache-hit == 'true'
+        working-directory: surfsense_web
+        run: pnpm exec playwright install-deps chromium
+
+      - name: Cache Next.js build
+        uses: actions/cache@v5
+        with:
+          path: surfsense_web/.next/cache
+          key: nextjs-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}-${{ github.sha }}
+          restore-keys: |
+            nextjs-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}-
+            nextjs-${{ runner.os }}-
+
+      # ─── Tests ─────────────────────────────────────────────────────────
+      - name: Run Playwright tests
+        working-directory: surfsense_web
+        run: pnpm test:e2e:prod
+
+      # ─── Failure diagnostics ───────────────────────────────────────────
+      - name: Dump backend stack logs on failure
+        if: ${{ failure() || cancelled() }}
+        run: |
+          mkdir -p ./compose-logs
+          docker compose -f docker/docker-compose.e2e.yml logs --no-color --timestamps \
+            > ./compose-logs/all-services.log 2>&1 || true
+          for svc in db redis backend celery_worker; do
+            docker compose -f docker/docker-compose.e2e.yml logs --no-color --timestamps "$svc" \
+              > "./compose-logs/${svc}.log" 2>&1 || true
+          done
+          docker compose -f docker/docker-compose.e2e.yml ps \
+            > ./compose-logs/ps.txt 2>&1 || true
+
+      # ─── Artifacts ─────────────────────────────────────────────────────
+      - name: Upload Playwright HTML report
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: playwright-report
+          path: surfsense_web/playwright-report/
+          retention-days: 14
+
+      - name: Upload Playwright traces
+        if: failure()
+        uses: actions/upload-artifact@v7
+        with:
+          name: playwright-traces
+          path: surfsense_web/test-results/
+          retention-days: 14
+
+      - name: Upload backend stack logs
+        if: ${{ failure() || cancelled() }}
+        uses: actions/upload-artifact@v7
+        with:
+          name: backend-stack-logs
+          path: ./compose-logs/
+          retention-days: 7
+
+      # ─── Teardown ──────────────────────────────────────────────────────
+      - name: Tear down backend stack
+        if: always()
+        run: docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans
diff --git a/.github/workflows/notary-status.yml b/.github/workflows/notary-status.yml
new file mode 100644
index 0000000000..5c7c420386
--- /dev/null
+++ b/.github/workflows/notary-status.yml
@@ -0,0 +1,60 @@
+name: Notary status check
+
+# One-off diagnostic workflow. Queries Apple's notary service to see if your
+# submissions are queued, in progress, accepted, or rejected. Useful when a
+# notarization seems "hung" — most often the queue itself, especially on a
+# brand-new Apple Developer account.
+#
+# Run via: Actions tab -> "Notary status check" -> Run workflow.
+# Inputs are optional; if you provide a submission ID, it also fetches that
+# submission's full Apple log.
+#
+# Safe to delete after diagnosis.
+
+on:
+  workflow_dispatch:
+    inputs:
+      submission_id:
+        description: 'Optional: submission UUID to fetch full Apple log for'
+        required: false
+        default: ''
+
+jobs:
+  status:
+    runs-on: macos-latest
+    steps:
+      - name: List recent notarization submissions
+        env:
+          APPLE_ID: ${{ secrets.APPLE_ID }}
+          APPLE_APP_SPECIFIC_PASSWORD: ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }}
+          APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
+        run: |
+          set -euo pipefail
+          echo "::group::Submission history (most recent first)"
+          xcrun notarytool history \
+            --apple-id "$APPLE_ID" \
+            --password "$APPLE_APP_SPECIFIC_PASSWORD" \
+            --team-id "$APPLE_TEAM_ID"
+          echo "::endgroup::"
+
+      - name: Inspect specific submission (if id provided)
+        if: ${{ inputs.submission_id != '' }}
+        env:
+          APPLE_ID: ${{ secrets.APPLE_ID }}
+          APPLE_APP_SPECIFIC_PASSWORD: ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }}
+          APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
+          SUBMISSION_ID: ${{ inputs.submission_id }}
+        run: |
+          set -euo pipefail
+          echo "::group::Submission info"
+          xcrun notarytool info "$SUBMISSION_ID" \
+            --apple-id "$APPLE_ID" \
+            --password "$APPLE_APP_SPECIFIC_PASSWORD" \
+            --team-id "$APPLE_TEAM_ID"
+          echo "::endgroup::"
+          echo "::group::Apple's processing log for this submission"
+          xcrun notarytool log "$SUBMISSION_ID" \
+            --apple-id "$APPLE_ID" \
+            --password "$APPLE_APP_SPECIFIC_PASSWORD" \
+            --team-id "$APPLE_TEAM_ID" || true
+          echo "::endgroup::"
diff --git a/.github/workflows/obsidian-plugin-lint.yml b/.github/workflows/obsidian-plugin-lint.yml
new file mode 100644
index 0000000000..42bd099b12
--- /dev/null
+++ b/.github/workflows/obsidian-plugin-lint.yml
@@ -0,0 +1,39 @@
+name: Obsidian Plugin Lint
+
+# Lints + type-checks + builds the Obsidian plugin on every push/PR that
+# touches its sources. The official obsidian-sample-plugin template ships
+# its own ESLint+esbuild setup; we run that here instead of folding the
+# plugin into the monorepo's Biome-based code-quality.yml so the tooling
+# stays aligned with what `obsidianmd/eslint-plugin-obsidianmd` checks
+# against.
+
+on:
+  push:
+    branches: ["**"]
+    paths:
+      - "surfsense_obsidian/**"
+      - ".github/workflows/obsidian-plugin-lint.yml"
+  pull_request:
+    branches: ["**"]
+    paths:
+      - "surfsense_obsidian/**"
+      - ".github/workflows/obsidian-plugin-lint.yml"
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: surfsense_obsidian
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22.x
+          cache: npm
+          cache-dependency-path: surfsense_obsidian/package-lock.json
+
+      - run: npm ci
+      - run: npm run lint
+      - run: npm run build
diff --git a/.github/workflows/release-obsidian-plugin.yml b/.github/workflows/release-obsidian-plugin.yml
new file mode 100644
index 0000000000..dfe15e7d65
--- /dev/null
+++ b/.github/workflows/release-obsidian-plugin.yml
@@ -0,0 +1,119 @@
+name: Release Obsidian Plugin
+
+# Tag format: `obsidian-v<version>` and `<version>` must match `surfsense_obsidian/manifest.json` exactly.
+on:
+  push:
+    tags:
+      - "obsidian-v*"
+  workflow_dispatch:
+    inputs:
+      publish:
+        description: "Publish to GitHub Releases"
+        required: true
+        type: choice
+        options:
+          - never
+          - always
+        default: "never"
+
+permissions:
+  contents: write
+
+jobs:
+  build-and-release:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: surfsense_obsidian
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          # Need write access for the manifest/versions.json mirror commit
+          # back to main further down.
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22.x
+          cache: npm
+          cache-dependency-path: surfsense_obsidian/package-lock.json
+
+      - name: Resolve plugin version
+        id: version
+        run: |
+          manifest_version=$(node -p "require('./manifest.json').version")
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            # Manual runs derive the release version from manifest.json.
+            version="$manifest_version"
+            tag="obsidian-v$version"
+          else
+            tag="${GITHUB_REF_NAME}"
+            if [ -z "$tag" ] || [[ "$tag" != obsidian-v* ]]; then
+              echo "::error::Invalid tag '$tag'. Expected format: obsidian-v<version>"
+              exit 1
+            fi
+            version="${tag#obsidian-v}"
+            if [ "$version" != "$manifest_version" ]; then
+              echo "::error::Tag version '$version' does not match manifest version '$manifest_version'"
+              exit 1
+            fi
+          fi
+          echo "tag=$tag"            >> "$GITHUB_OUTPUT"
+          echo "version=$version"    >> "$GITHUB_OUTPUT"
+
+      - name: Resolve publish mode
+        id: release_mode
+        run: |
+          if [ "${{ github.event_name }}" = "push" ] || [ "${{ inputs.publish }}" = "always" ]; then
+            echo "should_publish=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "should_publish=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - run: npm ci
+
+      - run: npm run lint
+
+      - run: npm run build
+
+      - name: Verify build artifacts
+        run: |
+          for f in main.js manifest.json styles.css; do
+            test -f "$f" || (echo "::error::Missing release artifact: $f" && exit 1)
+          done
+
+      - name: Mirror manifest.json + versions.json to repo root
+        if: steps.release_mode.outputs.should_publish == 'true'
+        working-directory: ${{ github.workspace }}
+        run: |
+          cp surfsense_obsidian/manifest.json manifest.json
+          cp surfsense_obsidian/versions.json versions.json
+          if git diff --quiet manifest.json versions.json; then
+            echo "Root manifest/versions already up to date."
+            exit 0
+          fi
+          git config user.name  "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add manifest.json versions.json
+          git commit -m "chore(obsidian-plugin): mirror manifest+versions for ${{ steps.version.outputs.tag }}"
+          # Push to the default branch so Obsidian can fetch raw files from HEAD.
+          if ! git push origin HEAD:${{ github.event.repository.default_branch }}; then
+            echo "::warning::Failed to push mirrored manifest/versions to default branch (likely branch protection). Continuing release."
+          fi
+
+      # Publish release under bare `manifest.json` version (no `obsidian-v` prefix) for BRAT/store compatibility.
+      # `make_latest: "false"` keeps the desktop app's `v*` release headlined since Obsidian and BRAT resolve plugins via getReleaseByTag, not the latest flag.
+      - name: Create GitHub release
+        if: steps.release_mode.outputs.should_publish == 'true'
+        uses: softprops/action-gh-release@v3
+        with:
+          tag_name: ${{ steps.version.outputs.version }}
+          name: SurfSense Obsidian Plugin ${{ steps.version.outputs.version }}
+          generate_release_notes: true
+          make_latest: "false"
+          files: |
+            surfsense_obsidian/main.js
+            surfsense_obsidian/manifest.json
+            surfsense_obsidian/styles.css
diff --git a/.gitignore b/.gitignore
index 7a76816819..fd0d4da2a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,15 @@ node_modules/
 deepagents/.env.local
 .env.local
 .idea/
+debug.log
+
+references/
+references
+
+# Playwright (E2E test artifacts)
+surfsense_web/playwright/.auth/
+surfsense_web/playwright-report/
+surfsense_web/test-results/
+surfsense_web/blob-report/
+
+content_research/
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 029e7c6475..ad8f8f2a70 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -26,7 +26,16 @@
             "pythonArgs": [
                 "run",
                 "python"
-            ]
+            ],
+            // Mute LangGraph/Pydantic checkpoint serializer warnings
+            // (UserWarnings emitted from pydantic/main.py when the
+            // runtime snapshots a SurfSenseContextSchema into a field
+            // typed `None`) so the debugger's "Raised Exceptions"
+            // breakpoint doesn't pause on a known-harmless event.
+            // Production logs are unaffected.
+            "env": {
+                "PYTHONWARNINGS": "ignore::UserWarning:pydantic.main"
+            }
         },
         {
             "name": "Backend: FastAPI (No Reload)",
@@ -40,7 +49,10 @@
             "pythonArgs": [
                 "run",
                 "python"
-            ]
+            ],
+            "env": {
+                "PYTHONWARNINGS": "ignore::UserWarning:pydantic.main"
+            }
         },
         {
             "name": "Backend: FastAPI (main.py)",
@@ -54,7 +66,10 @@
             "pythonArgs": [
                 "run",
                 "python"
-            ]
+            ],
+            "env": {
+                "PYTHONWARNINGS": "ignore::UserWarning:pydantic.main"
+            }
         },
         {
             "name": "Frontend: Next.js",
@@ -104,7 +119,10 @@
             "pythonArgs": [
                 "run",
                 "python"
-            ]
+            ],
+            "env": {
+                "PYTHONWARNINGS": "ignore::UserWarning:pydantic.main"
+            }
         },
         {
             "name": "Celery: Beat Scheduler",
@@ -124,7 +142,10 @@
             "pythonArgs": [
                 "run",
                 "python"
-            ]
+            ],
+            "env": {
+                "PYTHONWARNINGS": "ignore::UserWarning:pydantic.main"
+            }
         }
     ],
     "compounds": [
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 05bd307022..7da4b54f83 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,4 +1,9 @@
 {
 	"biome.configurationPath": "./surfsense_web/biome.json",
-	"deepscan.ignoreConfirmWarning": true
+	"deepscan.ignoreConfirmWarning": true,
+	"python.defaultInterpreterPath": "${workspaceFolder}/surfsense_backend/.venv/bin/python",
+	"basedpyright.analysis.extraPaths": [
+		"${workspaceFolder}/surfsense_backend"
+	],
+	"python-envs.pythonProjects": []
 }
\ No newline at end of file
diff --git a/README.es.md b/README.es.md
index 299c6e95cc..dea86a7936 100644
--- a/README.es.md
+++ b/README.es.md
@@ -41,7 +41,7 @@ NotebookLM es una de las mejores y más útiles plataformas de IA que existen, p
 - **Sin Dependencia de Proveedores** - Configura cualquier modelo LLM, de imagen, TTS y STT.
 - **25+ Fuentes de Datos Externas** - Agrega tus fuentes desde Google Drive, OneDrive, Dropbox, Notion y muchos otros servicios externos.
 - **Soporte Multijugador en Tiempo Real** - Trabaja fácilmente con los miembros de tu equipo en un notebook compartido.
-- **Aplicación de Escritorio** - Obtén asistencia de IA en cualquier aplicación con Quick Assist, General Assist, Extreme Assist y sincronización de carpetas locales.
+- **Aplicación de Escritorio** - Obtén asistencia de IA en cualquier aplicación con Quick Assist, General Assist, Screenshot Assist y sincronización de carpetas locales.
 
 ...y más por venir.
 
@@ -84,9 +84,9 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
 
    <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/quick_assist.gif" alt="Quick Assist" /></p>
 
-   - Aplicación de Escritorio — Extreme Assist
+   - Aplicación de Escritorio — Screenshot Assist
 
-   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/extreme_assist.gif" alt="Extreme Assist" /></p>
+   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/screenshot_assist.gif" alt="Screenshot Assist" /></p>
 
    - Aplicación de Escritorio — Watch Local Folder
 
@@ -150,7 +150,7 @@ La aplicación de escritorio incluye estas potentes funciones:
 
 - **General Assist** — Lanza SurfSense al instante desde cualquier aplicación con un atajo global.
 - **Quick Assist** — Selecciona texto en cualquier lugar, luego pide a la IA que lo explique, reescriba o actúe sobre él.
-- **Extreme Assist** — Obtén sugerencias de escritura en línea impulsadas por tu base de conocimiento mientras escribes en cualquier aplicación.
+- **Screenshot Assist** — Selecciona una región de tu pantalla y adjúntala al chat para que las respuestas se basen en tu base de conocimiento.
 - **Watch Local Folder** — Vigila una carpeta local y sincroniza automáticamente los cambios de archivos con tu base de conocimiento. **Pro tip:** Apúntalo a tu bóveda de Obsidian para mantener tus notas buscables en SurfSense.
 
 Todas las funciones operan contra tu espacio de búsqueda elegido, por lo que tus respuestas siempre están basadas en tus propios datos.
@@ -199,14 +199,14 @@ Todas las funciones operan contra tu espacio de búsqueda elegido, por lo que tu
 | **Generación de Videos** | Resúmenes en video cinemáticos vía Veo 3 (solo Ultra) | Disponible (NotebookLM es mejor aquí, mejorando activamente) |
 | **Generación de Presentaciones** | Diapositivas más atractivas pero no editables | Crea presentaciones editables basadas en diapositivas |
 | **Generación de Podcasts** | Resúmenes de audio con hosts e idiomas personalizables | Disponible con múltiples proveedores TTS (NotebookLM es mejor aquí, mejorando activamente) |
-| **Aplicación de Escritorio** | No | Aplicación nativa con General Assist, Quick Assist, Extreme Assist y sincronización de carpetas locales |
+| **Aplicación de Escritorio** | No | Aplicación nativa con General Assist, Quick Assist, Screenshot Assist y sincronización de carpetas locales |
 | **Extensión de Navegador** | No | Extensión multi-navegador para guardar cualquier página web, incluyendo páginas protegidas por autenticación |
 
 <details>
 <summary><b>Lista completa de Fuentes Externas</b></summary>
 <a id="fuentes-externas"></a>
 
-Motores de Búsqueda (Tavily, LinkUp) · SearxNG · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · Videos de YouTube · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, y más por venir.
+Motores de Búsqueda (SearXNG, Tavily, LinkUp, Baidu Search) · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · Videos de YouTube · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, y más por venir.
 
 </details>
 
diff --git a/README.hi.md b/README.hi.md
index 11a25ee0d2..43e24c3ee6 100644
--- a/README.hi.md
+++ b/README.hi.md
@@ -41,7 +41,7 @@ NotebookLM वहाँ उपलब्ध सबसे अच्छे और 
 - **कोई विक्रेता लॉक-इन नहीं** - किसी भी LLM, इमेज, TTS और STT मॉडल को कॉन्फ़िगर करें।
 - **25+ बाहरी डेटा स्रोत** - Google Drive, OneDrive, Dropbox, Notion और कई अन्य बाहरी सेवाओं से अपने स्रोत जोड़ें।
 - **रीयल-टाइम मल्टीप्लेयर सपोर्ट** - एक साझा notebook में अपनी टीम के सदस्यों के साथ आसानी से काम करें।
-- **डेस्कटॉप ऐप** - Quick Assist, General Assist, Extreme Assist और लोकल फ़ोल्डर सिंक के साथ किसी भी एप्लिकेशन में AI सहायता प्राप्त करें।
+- **डेस्कटॉप ऐप** - Quick Assist, General Assist, Screenshot Assist और लोकल फ़ोल्डर सिंक के साथ किसी भी एप्लिकेशन में AI सहायता प्राप्त करें।
 
 ...और भी बहुत कुछ आने वाला है।
 
@@ -84,9 +84,9 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
 
    <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/quick_assist.gif" alt="Quick Assist" /></p>
 
-   - डेस्कटॉप ऐप — Extreme Assist
+   - डेस्कटॉप ऐप — Screenshot Assist
 
-   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/extreme_assist.gif" alt="Extreme Assist" /></p>
+   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/screenshot_assist.gif" alt="Screenshot Assist" /></p>
 
    - डेस्कटॉप ऐप — Watch Local Folder
 
@@ -150,7 +150,7 @@ SurfSense एक डेस्कटॉप ऐप भी प्रदान क
 
 - **General Assist** — एक ग्लोबल शॉर्टकट से किसी भी एप्लिकेशन से तुरंत SurfSense लॉन्च करें।
 - **Quick Assist** — कहीं भी टेक्स्ट चुनें, फिर AI से समझाने, फिर से लिखने या उस पर कार्रवाई करने को कहें।
-- **Extreme Assist** — किसी भी ऐप में टाइप करते समय अपनी नॉलेज बेस से संचालित इनलाइन लेखन सुझाव प्राप्त करें।
+- **Screenshot Assist** — स्क्रीन पर एक क्षेत्र चुनें और उसे चैट में जोड़ें, ताकि उत्तर आपकी नॉलेज बेस पर आधारित रहें।
 - **Watch Local Folder** — एक लोकल फ़ोल्डर को वॉच करें और फ़ाइल परिवर्तनों को स्वचालित रूप से अपनी नॉलेज बेस में सिंक करें। **Pro tip:** इसे अपने Obsidian vault पर पॉइंट करें ताकि आपके नोट्स SurfSense में सर्च करने योग्य रहें।
 
 सभी सुविधाएं आपके चुने हुए सर्च स्पेस पर काम करती हैं, ताकि आपके उत्तर हमेशा आपके अपने डेटा पर आधारित हों।
@@ -199,14 +199,14 @@ SurfSense एक डेस्कटॉप ऐप भी प्रदान क
 | **वीडियो जनरेशन** | Veo 3 के माध्यम से सिनेमैटिक वीडियो ओवरव्यू (केवल Ultra) | उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) |
 | **प्रेजेंटेशन जनरेशन** | बेहतर दिखने वाली स्लाइड्स लेकिन संपादन योग्य नहीं | संपादन योग्य, स्लाइड आधारित प्रेजेंटेशन बनाएं |
 | **पॉडकास्ट जनरेशन** | कस्टमाइज़ेबल होस्ट और भाषाओं के साथ ऑडियो ओवरव्यू | कई TTS प्रदाताओं के साथ उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) |
-| **डेस्कटॉप ऐप** | नहीं | General Assist, Quick Assist, Extreme Assist और लोकल फ़ोल्डर सिंक के साथ नेटिव ऐप |
+| **डेस्कटॉप ऐप** | नहीं | General Assist, Quick Assist, Screenshot Assist और लोकल फ़ोल्डर सिंक के साथ नेटिव ऐप |
 | **ब्राउज़र एक्सटेंशन** | नहीं | किसी भी वेबपेज को सहेजने के लिए क्रॉस-ब्राउज़र एक्सटेंशन, प्रमाणीकरण सुरक्षित पेज सहित |
 
 <details>
 <summary><b>बाहरी स्रोतों की पूरी सूची</b></summary>
 <a id="बाहरी-स्रोत"></a>
 
-सर्च इंजन (Tavily, LinkUp) · SearxNG · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · YouTube वीडियो · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, और भी बहुत कुछ आने वाला है।
+सर्च इंजन (SearXNG, Tavily, LinkUp, Baidu Search) · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · YouTube वीडियो · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, और भी बहुत कुछ आने वाला है।
 
 </details>
 
diff --git a/README.md b/README.md
index 035c4f5158..ab9f9e2211 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,8 @@ NotebookLM is one of the best and most useful AI platforms out there, but once y
 - **No Vendor Lock-in** - Configure any LLM, image, TTS, and STT models to use.
 - **25+ External Data Sources** - Add your sources from Google Drive, OneDrive, Dropbox, Notion, and many other external services.
 - **Real-Time Multiplayer Support** - Work easily with your team members in a shared notebook.
-- **Desktop App** - Get AI assistance in any application with Quick Assist, General Assist, Extreme Assist, and local folder sync.
+- **AI File Sorting** - Automatically organize your documents into a smart folder hierarchy using AI-powered categorization by source, date, and topic.
+- **Desktop App** - Get AI assistance in any application with Quick Assist, General Assist, Screenshot Assist, and local folder sync.
 
 ...and more to come.
 
@@ -84,9 +85,9 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
 
    <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/quick_assist.gif" alt="Quick Assist" /></p>
 
-   - Desktop App — Extreme Assist
+   - Desktop App — Screenshot Assist
 
-   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/extreme_assist.gif" alt="Extreme Assist" /></p>
+   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/screenshot_assist.gif" alt="Screenshot Assist" /></p>
 
    - Desktop App — Watch Local Folder
 
@@ -150,7 +151,7 @@ The desktop app includes these powerful features:
 
 - **General Assist** — Launch SurfSense instantly from any application with a global shortcut.
 - **Quick Assist** — Select text anywhere, then ask AI to explain, rewrite, or act on it.
-- **Extreme Assist** — Get inline writing suggestions powered by your knowledge base as you type in any app.
+- **Screenshot Assist** — Select a region on your screen and attach it to chat so answers stay grounded in your knowledge base.
 - **Watch Local Folder** — Watch a local folder and automatically sync file changes to your knowledge base. **Pro tip:** Point it at your Obsidian vault to keep your notes searchable in SurfSense.
 
 All features operate against your chosen search space, so your answers are always grounded in your own data.
@@ -199,14 +200,15 @@ All features operate against your chosen search space, so your answers are alway
 | **Video Generation** | Cinematic Video Overviews via Veo 3 (Ultra only) | Available (NotebookLM is better here, actively improving) |
 | **Presentation Generation** | Better looking slides but not editable | Create editable, slide-based presentations |
 | **Podcast Generation** | Audio Overviews with customizable hosts and languages | Available with multiple TTS providers (NotebookLM is better here, actively improving) |
-| **Desktop App** | No | Native app with General Assist, Quick Assist, Extreme Assist, and local folder sync |
+| **AI File Sorting** | No | LLM-powered auto-categorization into source, date, category, and subcategory folders |
+| **Desktop App** | No | Native app with General Assist, Quick Assist, Screenshot Assist, and local folder sync |
 | **Browser Extension** | No | Cross-browser extension to save any webpage, including auth-protected pages |
 
 <details>
 <summary><b>Full list of External Sources</b></summary>
 <a id="external-sources"></a>
 
-Search Engines (Tavily, LinkUp) · SearxNG · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · YouTube Videos · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, and more to come.
+Search Engines (SearXNG, Tavily, LinkUp, Baidu Search) · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · YouTube Videos · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, and more to come.
 
 </details>
 
diff --git a/README.pt-BR.md b/README.pt-BR.md
index 9323b2bcec..fcb004cd66 100644
--- a/README.pt-BR.md
+++ b/README.pt-BR.md
@@ -41,7 +41,7 @@ O NotebookLM é uma das melhores e mais úteis plataformas de IA disponíveis, m
 - **Sem Dependência de Fornecedor** - Configure qualquer modelo LLM, de imagem, TTS e STT.
 - **25+ Fontes de Dados Externas** - Adicione suas fontes do Google Drive, OneDrive, Dropbox, Notion e muitos outros serviços externos.
 - **Suporte Multiplayer em Tempo Real** - Trabalhe facilmente com os membros da sua equipe em um notebook compartilhado.
-- **Aplicativo Desktop** - Obtenha assistência de IA em qualquer aplicativo com Quick Assist, General Assist, Extreme Assist e sincronização de pastas locais.
+- **Aplicativo Desktop** - Obtenha assistência de IA em qualquer aplicativo com Quick Assist, General Assist, Screenshot Assist e sincronização de pastas locais.
 
 ...e mais por vir.
 
@@ -84,9 +84,9 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
 
    <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/quick_assist.gif" alt="Quick Assist" /></p>
 
-   - Aplicativo Desktop — Extreme Assist
+   - Aplicativo Desktop — Screenshot Assist
 
-   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/extreme_assist.gif" alt="Extreme Assist" /></p>
+   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/screenshot_assist.gif" alt="Screenshot Assist" /></p>
 
    - Aplicativo Desktop — Watch Local Folder
 
@@ -150,7 +150,7 @@ O aplicativo desktop inclui estes recursos poderosos:
 
 - **General Assist** — Abra o SurfSense instantaneamente de qualquer aplicativo com um atalho global.
 - **Quick Assist** — Selecione texto em qualquer lugar, depois peça à IA para explicar, reescrever ou agir sobre ele.
-- **Extreme Assist** — Receba sugestões de escrita em linha alimentadas pela sua base de conhecimento enquanto digita em qualquer aplicativo.
+- **Screenshot Assist** — Selecione uma região da tela e anexe ao chat para respostas fundamentadas na sua base de conhecimento.
 - **Watch Local Folder** — Monitore uma pasta local e sincronize automaticamente as alterações de arquivos com sua base de conhecimento. **Pro tip:** Aponte para seu cofre do Obsidian para manter suas notas pesquisáveis no SurfSense.
 
 Todos os recursos operam no espaço de busca escolhido, para que suas respostas sejam sempre baseadas nos seus próprios dados.
@@ -199,14 +199,14 @@ Todos os recursos operam no espaço de busca escolhido, para que suas respostas
 | **Geração de Vídeos** | Visões gerais cinemáticas via Veo 3 (apenas Ultra) | Disponível (NotebookLM é melhor aqui, melhorando ativamente) |
 | **Geração de Apresentações** | Slides mais bonitos mas não editáveis | Cria apresentações editáveis baseadas em slides |
 | **Geração de Podcasts** | Visões gerais em áudio com hosts e idiomas personalizáveis | Disponível com múltiplos provedores TTS (NotebookLM é melhor aqui, melhorando ativamente) |
-| **Aplicativo Desktop** | Não | Aplicativo nativo com General Assist, Quick Assist, Extreme Assist e sincronização de pastas locais |
+| **Aplicativo Desktop** | Não | Aplicativo nativo com General Assist, Quick Assist, Screenshot Assist e sincronização de pastas locais |
 | **Extensão de Navegador** | Não | Extensão multi-navegador para salvar qualquer página web, incluindo páginas protegidas por autenticação |
 
 <details>
 <summary><b>Lista completa de Fontes Externas</b></summary>
 <a id="fontes-externas"></a>
 
-Mecanismos de Busca (Tavily, LinkUp) · SearxNG · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · Vídeos do YouTube · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, e mais por vir.
+Mecanismos de Busca (SearXNG, Tavily, LinkUp, Baidu Search) · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · Vídeos do YouTube · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian, e mais por vir.
 
 </details>
 
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 29200243b5..a07f4afdcc 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -41,7 +41,7 @@ NotebookLM 是目前最好、最实用的 AI 平台之一，但当你开始经
 - **无供应商锁定** - 配置任何 LLM、图像、TTS 和 STT 模型。
 - **25+ 外部数据源** - 从 Google Drive、OneDrive、Dropbox、Notion 和许多其他外部服务添加你的来源。
 - **实时多人协作支持** - 在共享笔记本中轻松与团队成员协作。
-- **桌面应用** - 通过 Quick Assist、General Assist、Extreme Assist 和本地文件夹同步在任何应用程序中获得 AI 助手。
+- **桌面应用** - 通过 Quick Assist、General Assist、Screenshot Assist 和本地文件夹同步在任何应用程序中获得 AI 助手。
 
 ...更多功能即将推出。
 
@@ -84,9 +84,9 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
 
    <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/quick_assist.gif" alt="Quick Assist" /></p>
 
-   - 桌面应用 — Extreme Assist
+   - 桌面应用 — Screenshot Assist
 
-   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/extreme_assist.gif" alt="Extreme Assist" /></p>
+   <p align="center"><img src="surfsense_web/public/homepage/hero_tutorial/screenshot_assist.gif" alt="Screenshot Assist" /></p>
 
    - 桌面应用 — Watch Local Folder
 
@@ -150,7 +150,7 @@ SurfSense 还提供桌面应用，将 AI 助手带到您计算机上的每个应
 
 - **General Assist** — 通过全局快捷键从任何应用程序即时启动 SurfSense。
 - **Quick Assist** — 在任何位置选中文本，然后让 AI 解释、改写或对其执行操作。
-- **Extreme Assist** — 在任何应用中输入时，获得基于您知识库的内联写作建议。
+- **Screenshot Assist** — 在屏幕上框选区域并附加到聊天，让回复基于您的知识库。
 - **Watch Local Folder** — 监视本地文件夹，自动将文件更改同步到您的知识库。**Pro tip：** 将其指向您的 Obsidian vault，让笔记在 SurfSense 中随时可搜索。
 
 所有功能均基于您选择的搜索空间运行，确保回答始终以您自己的数据为依据。
@@ -199,14 +199,14 @@ SurfSense 还提供桌面应用，将 AI 助手带到您计算机上的每个应
 | **视频生成** | 通过 Veo 3 的电影级视频概览（仅 Ultra） | 可用（NotebookLM 在此方面更好，正在积极改进） |
 | **演示文稿生成** | 更美观的幻灯片但不可编辑 | 创建可编辑的幻灯片式演示文稿 |
 | **播客生成** | 可自定义主持人和语言的音频概览 | 可用，支持多种 TTS 提供商（NotebookLM 在此方面更好，正在积极改进） |
-| **桌面应用** | 否 | 原生应用，包含 General Assist、Quick Assist、Extreme Assist 和本地文件夹同步 |
+| **桌面应用** | 否 | 原生应用，包含 General Assist、Quick Assist、Screenshot Assist 和本地文件夹同步 |
 | **浏览器扩展** | 否 | 跨浏览器扩展，保存任何网页，包括需要身份验证的页面 |
 
 <details>
 <summary><b>外部数据源完整列表</b></summary>
 <a id="外部数据源"></a>
 
-搜索引擎（Tavily、LinkUp）· SearxNG · Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · YouTube 视频 · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian，更多即将推出。
+搜索引擎（SearXNG、Tavily、LinkUp、Baidu Search）· Google Drive · OneDrive · Dropbox · Slack · Microsoft Teams · Linear · Jira · ClickUp · Confluence · BookStack · Notion · Gmail · YouTube 视频 · GitHub · Discord · Airtable · Google Calendar · Luma · Circleback · Elasticsearch · Obsidian，更多即将推出。
 
 </details>
 
diff --git a/VERSION b/VERSION
index ceddfb28f4..b056f4120e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.0.15
+0.0.24
diff --git a/docker/.env.example b/docker/.env.example
index 4394929a70..4a396d2d0f 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -4,7 +4,7 @@
 # Database, Redis, and internal service wiring are handled automatically.
 # ==============================================================================
 
-# SurfSense version (use "latest", a clean version like "0.0.14", or a specific build like "0.0.14.1")
+# SurfSense version (use "latest" or a specific version like "0.0.14")
 SURFSENSE_VERSION=latest
 
 # ------------------------------------------------------------------------------
@@ -74,6 +74,7 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 # BACKEND_URL=https://api.yourdomain.com
 # NEXT_PUBLIC_FASTAPI_BACKEND_URL=https://api.yourdomain.com
 # NEXT_PUBLIC_ZERO_CACHE_URL=https://zero.yourdomain.com
+# FASTAPI_BACKEND_INTERNAL_URL=http://backend:8000
 
 # ------------------------------------------------------------------------------
 # Zero-cache (real-time sync)
@@ -161,6 +162,14 @@ STRIPE_PAGE_BUYING_ENABLED=FALSE
 # STRIPE_RECONCILIATION_LOOKBACK_MINUTES=10
 # STRIPE_RECONCILIATION_BATCH_SIZE=100
 
+# Premium credit purchases via Stripe ($1 buys 1_000_000 micro-USD of
+# credit; premium turns debit the actual per-call provider cost
+# reported by LiteLLM, so cheap and expensive models bill proportionally)
+# STRIPE_TOKEN_BUYING_ENABLED=FALSE
+# STRIPE_PREMIUM_TOKEN_PRICE_ID=price_...
+# STRIPE_CREDIT_MICROS_PER_UNIT=1000000
+# DEPRECATED — STRIPE_TOKENS_PER_UNIT=1000000
+
 # ------------------------------------------------------------------------------
 # TTS & STT (Text-to-Speech / Speech-to-Text)
 # ------------------------------------------------------------------------------
@@ -302,6 +311,23 @@ STT_SERVICE=local/base
 # Advanced (optional)
 # ------------------------------------------------------------------------------
 
+# New-chat agent feature flags
+SURFSENSE_ENABLE_CONTEXT_EDITING=true
+SURFSENSE_ENABLE_COMPACTION_V2=true
+SURFSENSE_ENABLE_RETRY_AFTER=true
+SURFSENSE_ENABLE_MODEL_FALLBACK=false
+SURFSENSE_ENABLE_MODEL_CALL_LIMIT=true
+SURFSENSE_ENABLE_TOOL_CALL_LIMIT=true
+SURFSENSE_ENABLE_TOOL_CALL_REPAIR=true
+SURFSENSE_ENABLE_BUSY_MUTEX=true
+SURFSENSE_ENABLE_SKILLS=true
+SURFSENSE_ENABLE_SPECIALIZED_SUBAGENTS=true
+SURFSENSE_ENABLE_KB_PLANNER_RUNNABLE=true
+SURFSENSE_ENABLE_ACTION_LOG=true
+SURFSENSE_ENABLE_REVERT_ROUTE=true
+SURFSENSE_ENABLE_PERMISSION=true
+SURFSENSE_ENABLE_DOOM_LOOP=true
+
 # Periodic connector sync interval (default: 5m)
 # SCHEDULE_CHECKER_INTERVAL=5m
 
@@ -312,6 +338,41 @@ STT_SERVICE=local/base
 # Pages limit per user for ETL (default: unlimited)
 # PAGES_LIMIT=500
 
+# Premium credit quota per registered user, in micro-USD (default: $5).
+# Premium turns are debited at the actual per-call provider cost reported
+# by LiteLLM. Only applies to models with billing_tier=premium.
+# PREMIUM_CREDIT_MICROS_LIMIT=5000000
+# DEPRECATED — PREMIUM_TOKEN_LIMIT=5000000
+
+# Safety ceiling on per-call premium reservation, in micro-USD ($1.00 default).
+# QUOTA_MAX_RESERVE_MICROS=1000000
+
+# Per-image reservation for POST /image-generations, in micro-USD ($0.05 default).
+# QUOTA_DEFAULT_IMAGE_RESERVE_MICROS=50000
+
+# Per-podcast reservation for the podcast Celery task ($0.20 default).
+# QUOTA_DEFAULT_PODCAST_RESERVE_MICROS=200000
+
+# Per-video-presentation reservation for the video Celery task ($1.00 default).
+# Override path bypasses QUOTA_MAX_RESERVE_MICROS clamp — raise with care.
+# QUOTA_DEFAULT_VIDEO_PRESENTATION_RESERVE_MICROS=1000000
+
+# No-login (anonymous) mode — public users can chat without an account
+# Set TRUE to enable /free pages and anonymous chat API
+NOLOGIN_MODE_ENABLED=FALSE
+# ANON_TOKEN_LIMIT=1000000
+# ANON_TOKEN_WARNING_THRESHOLD=800000
+# ANON_TOKEN_QUOTA_TTL_DAYS=30
+# ANON_MAX_UPLOAD_SIZE_MB=5
+# QUOTA_MAX_RESERVE_PER_CALL=8000
+# Abuse prevention: max concurrent anonymous streams per IP
+# ANON_MAX_CONCURRENT_STREAMS=2
+# Number of chat requests per IP before Turnstile CAPTCHA is required
+# ANON_CAPTCHA_REQUEST_THRESHOLD=5
+# Cloudflare Turnstile CAPTCHA (https://dash.cloudflare.com/ -> Turnstile)
+# TURNSTILE_ENABLED=FALSE
+# TURNSTILE_SECRET_KEY=
+
 # Connector indexing lock TTL in seconds (default: 28800 = 8 hours)
 # CONNECTOR_INDEXING_LOCK_TTL_SECONDS=28800
 
diff --git a/docker/docker-compose.deps-only.yml b/docker/docker-compose.deps-only.yml
new file mode 100644
index 0000000000..2be0bfe6e3
--- /dev/null
+++ b/docker/docker-compose.deps-only.yml
@@ -0,0 +1,139 @@
+# =============================================================================
+# SurfSense — Dependencies only (no backend / frontend / Celery images)
+# =============================================================================
+# Postgres, Redis, SearXNG, pgAdmin, Zero — run API + Next + Celery on the host.
+# Celery is not Dockerized here: use `uv run` from surfsense_backend/ (no extra
+# backend image build just for workers).
+#
+# From repo root (SurfSense/):
+#   docker compose -f docker/docker-compose.deps-only.yml up -d
+#
+# Compose variable substitution uses `docker/.env` (copy from .env.example).
+# Bind mounts use ./postgresql.conf and ./searxng in this directory.
+#
+# Local Celery (from surfsense_backend/, after Redis is up):
+#   uv run celery -A celery_worker.celery_app worker --loglevel=info --concurrency=1 --pool=solo --queues=surfsense,surfsense.connectors
+#   uv run celery -A celery_worker.celery_app beat --loglevel=info
+#
+# Host setup:
+#   - Backend .env: DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
+#   - Backend .env: SEARXNG_DEFAULT_HOST=http://localhost:${SEARXNG_PORT:-8888}
+#   - Backend .env: CELERY_BROKER_URL / REDIS_APP_URL → redis://localhost:6379/0
+#   - Web .env: NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:${ZERO_CACHE_PORT:-4848}
+#
+# IMPORTANT — schema migrations:
+#   This compose file does NOT build the backend image and therefore cannot
+#   run a `migrations` service. You MUST run alembic on the host before
+#   bringing zero-cache up, or zero-cache will crash-loop with
+#   `Unknown or invalid publications. Specified: [zero_publication]`.
+#
+#   First-time / after-pull:
+#     cd surfsense_backend && uv run alembic upgrade head
+#
+#   The other compose files (docker-compose.yml, docker-compose.dev.yml)
+#   handle this automatically via a dedicated `migrations` service.
+# =============================================================================
+
+name: surfsense-deps
+
+services:
+  db:
+    image: pgvector/pgvector:pg17
+    ports:
+      - "${POSTGRES_PORT:-5432}:5432"
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+      - ./postgresql.conf:/etc/postgresql/postgresql.conf:ro
+    environment:
+      - POSTGRES_USER=${DB_USER:-postgres}
+      - POSTGRES_PASSWORD=${DB_PASSWORD:-postgres}
+      - POSTGRES_DB=${DB_NAME:-surfsense}
+    command: postgres -c config_file=/etc/postgresql/postgresql.conf
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${DB_USER:-postgres} -d ${DB_NAME:-surfsense}"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  pgadmin:
+    image: dpage/pgadmin4
+    ports:
+      - "${PGADMIN_PORT:-5050}:80"
+    environment:
+      - PGADMIN_DEFAULT_EMAIL=${PGADMIN_DEFAULT_EMAIL:-admin@surfsense.com}
+      - PGADMIN_DEFAULT_PASSWORD=${PGADMIN_DEFAULT_PASSWORD:-surfsense}
+    volumes:
+      - pgadmin_data:/var/lib/pgadmin
+    depends_on:
+      - db
+
+  redis:
+    image: redis:8-alpine
+    ports:
+      - "${REDIS_PORT:-6379}:6379"
+    volumes:
+      - redis_data:/data
+    command: redis-server --appendonly yes
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  searxng:
+    image: searxng/searxng:2026.3.13-3c1f68c59
+    ports:
+      - "${SEARXNG_PORT:-8888}:8080"
+    volumes:
+      - ./searxng:/etc/searxng
+    environment:
+      - SEARXNG_SECRET=${SEARXNG_SECRET:-surfsense-searxng-secret}
+    healthcheck:
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:8080/healthz"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # NOTE: zero-cache requires the `zero_publication` Postgres publication to
+  # exist before it starts. In this deps-only stack there is no backend
+  # container to run migrations, so you must run `uv run alembic upgrade head`
+  # from `surfsense_backend/` on the host BEFORE `docker compose up -d`.
+  zero-cache:
+    image: rocicorp/zero:1.4.0
+    ports:
+      - "${ZERO_CACHE_PORT:-4848}:4848"
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    depends_on:
+      db:
+        condition: service_healthy
+    environment:
+      - ZERO_UPSTREAM_DB=postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@db:5432/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}
+      - ZERO_CVR_DB=postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@db:5432/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}
+      - ZERO_CHANGE_DB=postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@db:5432/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}
+      - ZERO_REPLICA_FILE=/data/zero.db
+      - ZERO_ADMIN_PASSWORD=${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin}
+      - ZERO_APP_PUBLICATIONS=${ZERO_APP_PUBLICATIONS:-zero_publication}
+      - ZERO_NUM_SYNC_WORKERS=${ZERO_NUM_SYNC_WORKERS:-4}
+      - ZERO_UPSTREAM_MAX_CONNS=${ZERO_UPSTREAM_MAX_CONNS:-20}
+      - ZERO_CVR_MAX_CONNS=${ZERO_CVR_MAX_CONNS:-30}
+      - ZERO_QUERY_URL=${ZERO_QUERY_URL:-http://host.docker.internal:3000/api/zero/query}
+      - ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://host.docker.internal:3000/api/zero/mutate}
+    volumes:
+      - zero_cache_data:/data
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+volumes:
+  postgres_data:
+    name: surfsense-deps-postgres
+  pgadmin_data:
+    name: surfsense-deps-pgadmin
+  redis_data:
+    name: surfsense-deps-redis
+  zero_cache_data:
+    name: surfsense-deps-zero-cache
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index 666b0c642e..913e3e3ceb 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -10,6 +10,11 @@
 
 name: surfsense-dev
 
+x-backend-build: &backend-build
+  context: ../surfsense_backend
+  args:
+    EMBEDDING_MODEL: ${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2}
+
 services:
   db:
     image: pgvector/pgvector:pg17
@@ -29,6 +34,25 @@ services:
       timeout: 5s
       retries: 5
 
+  # Short-lived schema runner; see docker/docker-compose.yml `migrations`
+  # service for the full rationale. Builds from the same backend context as
+  # the dev backend/celery services.
+  migrations:
+    build: *backend-build
+    env_file:
+      - ../surfsense_backend/.env
+    environment:
+      - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
+      - PYTHONPATH=/app
+      - SERVICE_ROLE=migrate
+      - MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900}
+    volumes:
+      - zero_init:/zero-init
+    depends_on:
+      db:
+        condition: service_healthy
+    restart: "no"
+
   pgadmin:
     image: dpage/pgadmin4
     ports:
@@ -69,7 +93,7 @@ services:
       retries: 5
 
   backend:
-    build: ../surfsense_backend
+    build: *backend-build
     ports:
       - "${BACKEND_PORT:-8000}:8000"
     volumes:
@@ -77,6 +101,8 @@ services:
       - shared_temp:/shared_tmp
     env_file:
       - ../surfsense_backend/.env
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
     environment:
       - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
       - CELERY_BROKER_URL=${REDIS_URL:-redis://redis:6379/0}
@@ -104,20 +130,24 @@ services:
         condition: service_healthy
       searxng:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
       interval: 15s
       timeout: 5s
       retries: 30
       start_period: 200s
 
   celery_worker:
-    build: ../surfsense_backend
+    build: *backend-build
     volumes:
       - ../surfsense_backend/app:/app/app
       - shared_temp:/shared_tmp
     env_file:
       - ../surfsense_backend/.env
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
     environment:
       - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
       - CELERY_BROKER_URL=${REDIS_URL:-redis://redis:6379/0}
@@ -132,11 +162,13 @@ services:
         condition: service_healthy
       redis:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
       backend:
         condition: service_healthy
 
   celery_beat:
-    build: ../surfsense_backend
+    build: *backend-build
     env_file:
       - ../surfsense_backend/.env
     environment:
@@ -151,11 +183,13 @@ services:
         condition: service_healthy
       redis:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
       celery_worker:
         condition: service_started
 
   # flower:
-  #   build: ../surfsense_backend
+  #   build: *backend-build
   #   ports:
   #     - "${FLOWER_PORT:-5555}:5555"
   #   env_file:
@@ -170,14 +204,16 @@ services:
   #     - celery_worker
 
   zero-cache:
-    image: rocicorp/zero:0.26.2
+    image: rocicorp/zero:1.4.0
     ports:
       - "${ZERO_CACHE_PORT:-4848}:4848"
     extra_hosts:
       - "host.docker.internal:host-gateway"
     depends_on:
-      backend:
+      db:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
     environment:
       - ZERO_UPSTREAM_DB=${ZERO_UPSTREAM_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
       - ZERO_CVR_DB=${ZERO_CVR_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
@@ -192,6 +228,12 @@ services:
       - ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
     volumes:
       - zero_cache_data:/data
+      - zero_init:/zero-init
+    # Wrapper: see docker/docker-compose.yml `zero-cache` for rationale.
+    entrypoint: ["sh", "-c"]
+    # Pass the script as a single list element so Compose does not tokenize it.
+    command:
+      - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
@@ -229,3 +271,5 @@ volumes:
     name: surfsense-dev-shared-temp
   zero_cache_data:
     name: surfsense-dev-zero-cache
+  zero_init:
+    name: surfsense-dev-zero-init
diff --git a/docker/docker-compose.e2e.yml b/docker/docker-compose.e2e.yml
new file mode 100644
index 0000000000..2d55595f7d
--- /dev/null
+++ b/docker/docker-compose.e2e.yml
@@ -0,0 +1,181 @@
+# =============================================================================
+# SurfSense — E2E Docker Compose stack
+# =============================================================================
+# Hermetic backend stack for Playwright E2E tests:
+#   - db / redis on an internal-only network (no internet egress)
+#   - backend (FastAPI) joins the internal network AND a separate ingress
+#     bridge so the host runner can reach :8000
+#   - celery_worker on the internal network only — zero egress surface
+#
+# The backend image is built from surfsense_backend/Dockerfile target=e2e,
+# which adds tests/ via the `tests-source` additional context (tests/ is
+# excluded from the main context by .dockerignore so production never ships
+# test fakes). See surfsense_backend/Dockerfile for stage layout.
+#
+# Usage from repo root:
+#   docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
+#   curl -X POST http://localhost:8000/auth/register ...
+#   ( run Playwright on host, pointing at localhost:8000 + localhost:3000 )
+#   docker compose -f docker/docker-compose.e2e.yml down -v
+# =============================================================================
+
+name: surfsense-e2e
+
+x-backend-env: &backend-env
+  DATABASE_URL: postgresql+asyncpg://postgres:postgres@db:5432/surfsense_e2e
+  CELERY_BROKER_URL: redis://redis:6379/0
+  CELERY_RESULT_BACKEND: redis://redis:6379/0
+  REDIS_APP_URL: redis://redis:6379/0
+  CELERY_TASK_DEFAULT_QUEUE: surfsense
+  SECRET_KEY: ci-test-secret-key-not-for-production
+  AUTH_TYPE: LOCAL
+  REGISTRATION_ENABLED: "TRUE"
+  ETL_SERVICE: DOCLING
+  EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2
+  NEXT_FRONTEND_URL: http://host.docker.internal:3000
+  # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+  COMPOSIO_API_KEY: e2e-deny-real-call-sentinel
+  COMPOSIO_ENABLED: "TRUE"
+  OPENAI_API_KEY: e2e-deny-real-call-sentinel
+  ANTHROPIC_API_KEY: e2e-deny-real-call-sentinel
+  LITELLM_API_KEY: e2e-deny-real-call-sentinel
+  MICROSOFT_CLIENT_ID: fake-microsoft-client-id
+  MICROSOFT_CLIENT_SECRET: fake-microsoft-client-secret
+  ONEDRIVE_REDIRECT_URI: http://localhost:8000/api/v1/auth/onedrive/connector/callback
+  DROPBOX_APP_KEY: fake-dropbox-app-key
+  DROPBOX_APP_SECRET: fake-dropbox-app-secret
+  DROPBOX_REDIRECT_URI: http://localhost:8000/api/v1/auth/dropbox/connector/callback
+  # Defense-in-depth: even though L3 egress is denied for the worker via
+  # `internal: true`, the backend still has a route via `ingress`. Setting
+  # HTTPS_PROXY to an unreachable port turns any leaked Python outbound HTTP
+  # call into a fast Connection refused. UNLIKE the old runner-shell setup,
+  # this proxy is set on the container env and `uv` is never invoked here,
+  # so there is no interaction with uv's implicit-sync behaviour.
+  HTTPS_PROXY: http://127.0.0.1:1
+  HTTP_PROXY: http://127.0.0.1:1
+  NO_PROXY: localhost,127.0.0.1,0.0.0.0,db,redis,host.docker.internal
+  HF_HUB_OFFLINE: "1"
+  TRANSFORMERS_OFFLINE: "1"
+  # Test-only token-mint endpoint secret (see tests/e2e/run_backend.py).
+  E2E_MINT_SECRET: e2e-mint-secret-not-for-production
+
+services:
+  db:
+    image: pgvector/pgvector:pg17
+    command: >
+      postgres
+        -c wal_level=logical
+        -c max_wal_senders=10
+        -c max_replication_slots=10
+    environment:
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: postgres
+      POSTGRES_DB: surfsense_e2e
+    # Ephemeral storage — every CI run gets a clean DB, no volume cleanup needed.
+    tmpfs:
+      - /var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres -d surfsense_e2e"]
+      interval: 2s
+      timeout: 3s
+      retries: 30
+    networks: [internal]
+
+  redis:
+    image: redis:8-alpine
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 2s
+      timeout: 3s
+      retries: 30
+    networks: [internal]
+
+  backend:
+    build:
+      context: ../surfsense_backend
+      dockerfile: Dockerfile
+      target: e2e
+      additional_contexts:
+        # tests/ is excluded from the main context by .dockerignore;
+        # the e2e stage's `COPY --from=tests-source` pulls it in here.
+        tests-source: ../surfsense_backend/tests
+      args:
+        EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2
+      cache_from:
+        - type=gha,scope=surfsense-e2e-backend
+      cache_to:
+        - type=gha,mode=max,scope=surfsense-e2e-backend
+    image: surfsense-e2e-backend:local
+    environment:
+      <<: *backend-env
+      SERVICE_ROLE: api
+    volumes:
+      - shared_temp:/shared_tmp
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    ports:
+      - "8000:8000"
+    depends_on:
+      db: { condition: service_healthy }
+      redis: { condition: service_healthy }
+    healthcheck:
+      # Use Python (already in the image) instead of curl/wget to avoid
+      # depending on either tool being installed in the runtime layers.
+      test:
+        - CMD
+        - python
+        - -c
+        - |
+          import sys, urllib.request
+          try:
+              r = urllib.request.urlopen("http://localhost:8000/openapi.json", timeout=2)
+              sys.exit(0 if r.status == 200 else 1)
+          except Exception:
+              sys.exit(1)
+      interval: 3s
+      timeout: 5s
+      retries: 60
+      start_period: 30s
+    networks:
+      - internal      # to reach db/redis
+      - ingress       # so host can reach :8000
+
+  celery_worker:
+    image: surfsense-e2e-backend:local
+    pull_policy: never
+    # No build: section — reuses the image built by the `backend` service.
+    # Compose v2 builds shared images exactly once across services that
+    # reference the same `image:` tag.
+    environment:
+      <<: *backend-env
+      SERVICE_ROLE: worker
+    volumes:
+      - shared_temp:/shared_tmp
+    depends_on:
+      backend: { condition: service_healthy }
+    healthcheck:
+      test:
+        - CMD-SHELL
+        - "celery -A app.celery_app inspect ping --timeout 2 | grep -q pong"
+      interval: 5s
+      timeout: 5s
+      retries: 12
+      start_period: 20s
+    networks: [internal]
+
+networks:
+  # Internal network: containers attached only to this network have NO route
+  # to the host or the internet. This is the L3 deny-egress mechanism that
+  # replaces the fragile HTTPS_PROXY-on-the-runner approach.
+  internal:
+    driver: bridge
+    internal: true
+
+  # Regular bridge network. Only the `backend` service joins it, solely so
+  # the host can reach :8000 via the published port. celery_worker / db /
+  # redis stay off this network entirely.
+  ingress:
+    driver: bridge
+
+volumes:
+  shared_temp:
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index c14cd8b091..8683c82585 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -27,6 +27,28 @@ services:
       timeout: 5s
       retries: 5
 
+  # Short-lived schema runner. Executes `alembic upgrade head` and verifies
+  # that the `zero_publication` Postgres logical-replication publication
+  # exists, then exits 0. Downstream services (backend, celery_*, zero-cache)
+  # gate on this with `condition: service_completed_successfully` so a failed
+  # migration halts the whole stack instead of silently producing a half-built
+  # system that crash-loops zero-cache on missing publications.
+  migrations:
+    image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}
+    env_file:
+      - .env
+    environment:
+      DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
+      PYTHONPATH: /app
+      SERVICE_ROLE: migrate
+      MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900}
+    volumes:
+      - zero_init:/zero-init
+    depends_on:
+      db:
+        condition: service_healthy
+    restart: "no"
+
   redis:
     image: redis:8-alpine
     volumes:
@@ -60,6 +82,8 @@ services:
       - shared_temp:/shared_tmp
     env_file:
       - .env
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
     environment:
       DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
       CELERY_BROKER_URL: ${REDIS_URL:-redis://redis:6379/0}
@@ -86,9 +110,11 @@ services:
         condition: service_healthy
       searxng:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
     restart: unless-stopped
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
       interval: 15s
       timeout: 5s
       retries: 30
@@ -100,6 +126,8 @@ services:
       - shared_temp:/shared_tmp
     env_file:
       - .env
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
     environment:
       DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
       CELERY_BROKER_URL: ${REDIS_URL:-redis://redis:6379/0}
@@ -114,6 +142,8 @@ services:
         condition: service_healthy
       redis:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
       backend:
         condition: service_healthy
     labels:
@@ -136,6 +166,8 @@ services:
         condition: service_healthy
       redis:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
       celery_worker:
         condition: service_started
     labels:
@@ -159,7 +191,7 @@ services:
   #   restart: unless-stopped
 
   zero-cache:
-    image: rocicorp/zero:0.26.2
+    image: rocicorp/zero:1.4.0
     ports:
       - "${ZERO_CACHE_PORT:-5929}:4848"
     extra_hosts:
@@ -178,10 +210,21 @@ services:
       ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
     volumes:
       - zero_cache_data:/data
+      - zero_init:/zero-init
+    # Wrapper: if the migrations service flagged a publication change via
+    # /zero-init/needs_reset, wipe the SQLite replica before starting so
+    # zero-cache does a clean initial sync. Recovers from the half-built
+    # replica state (`_zero.tableMetadata` missing) caused by earlier crashes.
+    entrypoint: ["sh", "-c"]
+    # Pass the script as a single list element so Compose does not tokenize it.
+    command:
+      - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
     restart: unless-stopped
     depends_on:
-      backend:
+      db:
         condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
       interval: 10s
@@ -198,6 +241,7 @@ services:
       NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${AUTH_TYPE:-SSO}
       NEXT_PUBLIC_ETL_SERVICE: ${ETL_SERVICE:-DOCLING}
       NEXT_PUBLIC_DEPLOYMENT_MODE: ${DEPLOYMENT_MODE:-self-hosted}
+      FASTAPI_BACKEND_INTERNAL_URL: ${FASTAPI_BACKEND_INTERNAL_URL:-http://backend:8000}
     labels:
       - "com.centurylinklabs.watchtower.enable=true"
     depends_on:
@@ -216,3 +260,5 @@ volumes:
     name: surfsense-shared-temp
   zero_cache_data:
     name: surfsense-zero-cache
+  zero_init:
+    name: surfsense-zero-init
diff --git a/docker/scripts/install.ps1 b/docker/scripts/install.ps1
index 0eb3886a2e..60c4fd5dfc 100644
--- a/docker/scripts/install.ps1
+++ b/docker/scripts/install.ps1
@@ -97,6 +97,161 @@ function Wait-ForPostgres {
     Write-Ok "PostgreSQL is ready."
 }
 
+# ── Stack health helpers ────────────────────────────────────────────────────
+
+function Get-ComposeServices {
+    Push-Location $InstallDir
+    try {
+        $raw = Invoke-NativeSafe { docker compose ps -a --format json 2>$null }
+    } finally {
+        Pop-Location
+    }
+    if ([string]::IsNullOrWhiteSpace($raw)) { return @() }
+
+    # Compose v2.21+ emits a JSON array; older versions emit one object per line.
+    try {
+        $parsed = $raw | ConvertFrom-Json
+        if ($parsed -is [System.Collections.IEnumerable] -and -not ($parsed -is [string])) {
+            return @($parsed)
+        }
+        return @($parsed)
+    } catch {
+        $services = @()
+        foreach ($line in ($raw -split "`r?`n")) {
+            $line = $line.Trim()
+            if (-not $line) { continue }
+            try { $services += ($line | ConvertFrom-Json) } catch { }
+        }
+        return $services
+    }
+}
+
+function Wait-StackHealthy {
+    param([int]$TimeoutSec = 300)
+
+    $deadline = (Get-Date).AddSeconds($TimeoutSec)
+    $lastReport = ""
+
+    while ((Get-Date) -lt $deadline) {
+        $services = Get-ComposeServices
+        if (-not $services -or $services.Count -eq 0) {
+            Start-Sleep -Seconds 3
+            continue
+        }
+
+        $bad = @()
+        $waiting = @()
+        $good = @()
+
+        foreach ($svc in $services) {
+            $name = $svc.Service
+            $state = $svc.State
+            $health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' }
+            $exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null }
+
+            if ($name -eq 'migrations') {
+                if ($state -eq 'exited' -and $exit -eq 0) { $good += $name }
+                elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" }
+                else { $waiting += "${name} (${state})" }
+                continue
+            }
+
+            if ($state -eq 'running') {
+                if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') {
+                    $good += $name
+                } elseif ($health -eq 'starting') {
+                    $waiting += "${name} (starting)"
+                } elseif ($health -eq 'unhealthy') {
+                    $bad += "${name} (unhealthy)"
+                } else {
+                    $waiting += "${name} (${health})"
+                }
+            } elseif ($state -eq 'restarting') {
+                $bad += "${name} (restarting)"
+            } elseif ($state -eq 'exited') {
+                $bad += "${name} (exited, code=${exit})"
+            } else {
+                $waiting += "${name} (${state})"
+            }
+        }
+
+        if ($bad.Count -gt 0) {
+            return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good }
+        }
+        if ($waiting.Count -eq 0) {
+            return @{ Ok = $true; Reason = 'all_healthy'; Good = $good }
+        }
+
+        $report = "Waiting on: " + ($waiting -join ', ')
+        if ($report -ne $lastReport) {
+            Write-Info $report
+            $lastReport = $report
+        }
+        Start-Sleep -Seconds 5
+    }
+
+    return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good }
+}
+
+function Test-StaleZeroCacheVolume {
+    $raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
+    if ([string]::IsNullOrWhiteSpace($raw)) { return $false }
+    $names = $raw -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ }
+    $hasZeroCache = $names -contains 'surfsense-zero-cache'
+    $hasZeroInit = $names -contains 'surfsense-zero-init'
+    # Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init.
+    # Such a volume may hold a half-initialized SQLite replica from an earlier
+    # crash-loop. Wiping it forces zero-cache to do a fresh initial sync.
+    return ($hasZeroCache -and -not $hasZeroInit)
+}
+
+function Invoke-StaleZeroCacheCleanup {
+    if (-not (Test-StaleZeroCacheVolume)) { return }
+
+    Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
+    Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
+    Write-Warn "SQLite replica that would block zero-cache from starting."
+    Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
+    Start-Sleep -Seconds 5
+
+    Push-Location $InstallDir
+    Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
+    Pop-Location
+    Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null
+    Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
+}
+
+function Write-Err-NoExit {
+    param([string]$Message)
+    Write-Host "[ERROR] $Message" -ForegroundColor Red
+}
+
+function Invoke-StackFailureReport {
+    param([hashtable]$Result)
+
+    Write-Host ""
+    Write-Err-NoExit "Stack did not reach a healthy state."
+    if ($Result.Bad.Count -gt 0) { Write-Host ("  Failed: " + ($Result.Bad -join ', ')) }
+    if ($Result.Waiting.Count -gt 0) { Write-Host ("  Stuck:  " + ($Result.Waiting -join ', ')) }
+
+    Write-Host ""
+    Write-Info "Recent logs from migrations / zero-cache / backend:"
+    Push-Location $InstallDir
+    try {
+        Invoke-NativeSafe { docker compose logs --tail=60 migrations zero-cache backend 2>&1 } | Write-Host
+    } finally {
+        Pop-Location
+    }
+
+    Write-Host ""
+    Write-Host "Recovery hints:" -ForegroundColor Yellow
+    Write-Host "  1. Inspect migrations:   cd $InstallDir; docker compose logs migrations"
+    Write-Host "  2. Verify publication:   cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
+    Write-Host "  3. Hard reset zero db:   cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d"
+    Write-Host ""
+    exit 1
+}
+
 # ── Download files ──────────────────────────────────────────────────────────
 
 Write-Step "Downloading SurfSense files"
@@ -191,6 +346,8 @@ if (-not (Test-Path $envPath)) {
 
 # ── Start containers ────────────────────────────────────────────────────────
 
+Invoke-StaleZeroCacheCleanup
+
 if ($MigrationMode) {
     $envContent = Get-Content $envPath
     $DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
@@ -251,7 +408,13 @@ if ($MigrationMode) {
     Push-Location $InstallDir
     Invoke-NativeSafe { docker compose up -d }
     Pop-Location
-    Write-Ok "All services started."
+    Write-Ok "All containers started; waiting for stack to become healthy..."
+
+    $waitResult = Wait-StackHealthy -TimeoutSec 300
+    if (-not $waitResult.Ok) {
+        Invoke-StackFailureReport -Result $waitResult
+    }
+    Write-Ok "All services healthy."
 
     Remove-Item $KeyFile -ErrorAction SilentlyContinue
 
@@ -260,7 +423,13 @@ if ($MigrationMode) {
     Push-Location $InstallDir
     Invoke-NativeSafe { docker compose up -d }
     Pop-Location
-    Write-Ok "All services started."
+    Write-Ok "All containers started; waiting for stack to become healthy..."
+
+    $waitResult = Wait-StackHealthy -TimeoutSec 300
+    if (-not $waitResult.Ok) {
+        Invoke-StackFailureReport -Result $waitResult
+    }
+    Write-Ok "All services healthy."
 }
 
 # ── Watchtower (auto-update) ────────────────────────────────────────────────
diff --git a/docker/scripts/install.sh b/docker/scripts/install.sh
index fcab4d55a3..db81f95eb3 100644
--- a/docker/scripts/install.sh
+++ b/docker/scripts/install.sh
@@ -97,6 +97,163 @@ wait_for_pg() {
     success "PostgreSQL is ready."
 }
 
+# ── Stack health helpers ─────────────────────────────────────────────────────
+
+# Enumerate compose services for project `surfsense` as `service|state|health|exitcode`
+# lines. Uses `docker inspect` so we don't depend on `jq`, `python3`, or the
+# exact ordering of fields in `docker compose ps --format json` output.
+get_compose_services() {
+    local containers
+    containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true
+    [[ -z "$containers" ]] && return 0
+
+    while IFS= read -r container; do
+        [[ -z "$container" ]] && continue
+        local svc state health code
+        svc=$(docker inspect -f '{{index .Config.Labels "com.docker.compose.service"}}' "$container" 2>/dev/null || echo "")
+        state=$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
+        health=$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{end}}' "$container" 2>/dev/null || echo "")
+        code=$(docker inspect -f '{{.State.ExitCode}}' "$container" 2>/dev/null || echo "")
+        [[ -z "$svc" ]] && continue
+        printf '%s|%s|%s|%s\n' "$svc" "$state" "$health" "$code"
+    done <<< "$containers"
+}
+
+# Globals populated by wait_stack_healthy / consumed by stack_failure_report.
+STACK_BAD=()
+STACK_WAITING=()
+STACK_GOOD=()
+STACK_TIMEOUT=false
+
+wait_stack_healthy() {
+    local timeout_sec=${1:-300}
+    local deadline=$(($(date +%s) + timeout_sec))
+    local last_report=""
+    local bad=()
+    local waiting=()
+    local good=()
+
+    while [[ $(date +%s) -lt $deadline ]]; do
+        local lines
+        lines=$(get_compose_services)
+        if [[ -z "$lines" ]]; then
+            sleep 3
+            continue
+        fi
+
+        bad=()
+        waiting=()
+        good=()
+
+        while IFS='|' read -r name state health code; do
+            [[ -z "$name" ]] && continue
+            if [[ "$name" == "migrations" ]]; then
+                if [[ "$state" == "exited" && "$code" == "0" ]]; then
+                    good+=("$name")
+                elif [[ "$state" == "exited" ]]; then
+                    bad+=("${name} (exit=${code})")
+                else
+                    waiting+=("${name} (${state})")
+                fi
+                continue
+            fi
+
+            if [[ "$state" == "running" ]]; then
+                if [[ -z "$health" || "$health" == "healthy" ]]; then
+                    good+=("$name")
+                elif [[ "$health" == "starting" ]]; then
+                    waiting+=("${name} (starting)")
+                elif [[ "$health" == "unhealthy" ]]; then
+                    bad+=("${name} (unhealthy)")
+                else
+                    waiting+=("${name} (${health})")
+                fi
+            elif [[ "$state" == "restarting" ]]; then
+                bad+=("${name} (restarting)")
+            elif [[ "$state" == "exited" ]]; then
+                bad+=("${name} (exited, code=${code})")
+            else
+                waiting+=("${name} (${state})")
+            fi
+        done <<< "$lines"
+
+        if (( ${#bad[@]} > 0 )); then
+            STACK_BAD=("${bad[@]}")
+            STACK_WAITING=("${waiting[@]}")
+            STACK_GOOD=("${good[@]}")
+            return 1
+        fi
+        if (( ${#waiting[@]} == 0 )); then
+            STACK_GOOD=("${good[@]}")
+            return 0
+        fi
+
+        local report="Waiting on: ${waiting[*]}"
+        if [[ "$report" != "$last_report" ]]; then
+            info "$report"
+            last_report="$report"
+        fi
+        sleep 5
+    done
+
+    # bad/waiting/good are declared at function scope so referencing them is
+    # safe even if the polling loop never executed its body.
+    STACK_BAD=()
+    [[ ${#bad[@]} -gt 0 ]] && STACK_BAD=("${bad[@]}")
+    STACK_WAITING=()
+    [[ ${#waiting[@]} -gt 0 ]] && STACK_WAITING=("${waiting[@]}")
+    STACK_GOOD=()
+    [[ ${#good[@]} -gt 0 ]] && STACK_GOOD=("${good[@]}")
+    STACK_TIMEOUT=true
+    return 1
+}
+
+stack_failure_report() {
+    echo ""
+    echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state."
+    if (( ${#STACK_BAD[@]} > 0 )) && [[ -n "${STACK_BAD[0]}" ]]; then
+        echo "  Failed: ${STACK_BAD[*]}"
+    fi
+    if (( ${#STACK_WAITING[@]} > 0 )) && [[ -n "${STACK_WAITING[0]}" ]]; then
+        echo "  Stuck:  ${STACK_WAITING[*]}"
+    fi
+    echo ""
+    info "Recent logs from migrations / zero-cache / backend:"
+    (cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true
+    echo ""
+    echo "Recovery hints:"
+    echo "  1. Inspect migrations:   cd ${INSTALL_DIR} && ${DC} logs migrations"
+    echo "  2. Verify publication:   cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
+    echo "  3. Hard reset zero db:   cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d"
+    echo ""
+    exit 1
+}
+
+# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
+# That signals an install that predates the migrations-service fix; the old
+# replica may be half-initialized and would block zero-cache on next start.
+test_stale_zero_cache_volume() {
+    local has_zc has_zi
+    has_zc=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-cache' || true)
+    has_zi=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-init' || true)
+    [[ -n "$has_zc" && -z "$has_zi" ]]
+}
+
+invoke_stale_zero_cache_cleanup() {
+    if ! test_stale_zero_cache_volume; then
+        return 0
+    fi
+    warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
+    warn "predates the migrations-service fix. It may contain a half-initialized"
+    warn "SQLite replica that would block zero-cache from starting."
+    warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
+    sleep 5
+
+    (cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true
+    docker volume rm surfsense-zero-cache 2>/dev/null || true
+    success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
+}
+
 # ── Download files ───────────────────────────────────────────────────────────
 
 step "Downloading SurfSense files"
@@ -186,6 +343,8 @@ fi
 
 # ── Start containers ─────────────────────────────────────────────────────────
 
+invoke_stale_zero_cache_cleanup
+
 if $MIGRATION_MODE; then
     # Read DB credentials from .env (fall back to defaults from docker-compose.yml)
     DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
@@ -243,7 +402,12 @@ if $MIGRATION_MODE; then
 
     step "Starting all SurfSense services"
     (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
-    success "All services started."
+    success "All containers started; waiting for stack to become healthy..."
+
+    if ! wait_stack_healthy 300; then
+        stack_failure_report
+    fi
+    success "All services healthy."
 
     # Key file is no longer needed — SECRET_KEY is now in .env
     rm -f "${KEY_FILE}"
@@ -251,7 +415,12 @@ if $MIGRATION_MODE; then
 else
     step "Starting SurfSense"
     (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
-    success "All services started."
+    success "All containers started; waiting for stack to become healthy..."
+
+    if ! wait_stack_healthy 300; then
+        stack_failure_report
+    fi
+    success "All services healthy."
 fi
 
 # ── Watchtower (auto-update) ─────────────────────────────────────────────────
diff --git a/manifest.json b/manifest.json
new file mode 100644
index 0000000000..d03a5b6500
--- /dev/null
+++ b/manifest.json
@@ -0,0 +1,10 @@
+{
+	"id": "surfsense-obsidian",
+	"name": "SurfSense",
+	"version": "0.1.0",
+	"minAppVersion": "1.5.4",
+	"description": "Turn your vault into a searchable second brain with SurfSense.",
+	"author": "SurfSense",
+	"authorUrl": "https://www.surfsense.com",
+	"isDesktopOnly": false
+}
diff --git a/package.json b/package.json
index 8a1a6add8e..1e45c17066 100644
--- a/package.json
+++ b/package.json
@@ -1,5 +1,5 @@
 {
   "name": "surfsense",
   "private": true,
-  "packageManager": "pnpm@10.24.0"
+  "packageManager": "pnpm@10.26.0"
 }
diff --git a/skills-lock.json b/skills-lock.json
index a1947ff7f2..f722ec0d37 100644
--- a/skills-lock.json
+++ b/skills-lock.json
@@ -1,11 +1,117 @@
 {
   "version": 1,
   "skills": {
+    "alert-manager": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "f96aa6f7d8dd982a866ac090c1770e39cd0a01c63c9bc315895d3c210f80b393"
+    },
+    "backlink-analyzer": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "25556853b9cd6434bbb1bcf6cd6794d84c05214c1f8564a85db46f73e5c438e7"
+    },
+    "competitor-analysis": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "ecee4d821c68ae0cdce088da9f688f2a1ed724ddad01854cceaac9e85109c6f1"
+    },
+    "content-gap-analysis": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "9979051baf6210013ba7c6df0229081c93882efe5791dfae9c0ede93349c2537"
+    },
+    "content-quality-auditor": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "02de7d19e7b4611bdb2f10a386ce7a6b9799a47bce3e964bd33b8f9f3e4599f4"
+    },
+    "content-refresher": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "282ee22b8cac4bc4ad369f9bbf581cef9770a5dfabadf0581f36c674973261ce"
+    },
+    "domain-authority-auditor": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "e97547ac36ea8e59a65bdfdda25b8d130cb7f5e2cf13dac56fd01f24e74718a5"
+    },
+    "entity-optimizer": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "58704efa3ce530dc504e5942e740339ec03d4f3f2419f9f6812c4eee73accb73"
+    },
+    "geo-content-optimizer": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "ddd61f32254be1303ce4b7be5d507c932de4af53489a0ebb1309bf61de99018c"
+    },
+    "improve-codebase-architecture": {
+      "source": "mattpocock/skills",
+      "sourceType": "github",
+      "skillPath": "skills/engineering/improve-codebase-architecture/SKILL.md",
+      "computedHash": "2da1d23b8f53cfe67f2e0b68924ab9f4ec400bb6480de097007eeaeb517d1722"
+    },
+    "internal-linking-optimizer": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "10c54a570eacd569bde0ff55d0cb9ecb89658ba92afceb264da59dd7f600801c"
+    },
+    "keyword-research": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "921a7e6e7ab45d1dfb4534d4ca445a94a95e7e01022b5f171690da2e40caf55f"
+    },
+    "memory-management": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "df7a35f2479812e4bf452e2c2bbb942c9ef163f8d7bd281a0e38765aa33ecff7"
+    },
+    "meta-tags-optimizer": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "4d88aa5adc17c1802f21c04162bbab6bd8c5965c35362a502ee6ccab76777a54"
+    },
+    "on-page-seo-auditor": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "12714ad503f3493a977f101d26d2b5f43991ef41f148332c22c9b29e9b7c9f23"
+    },
+    "performance-reporter": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "fb6454815439e5b9993d18aae71a92928ef36814f550a16f9ce8a8d8ac6901d3"
+    },
+    "rank-tracker": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "2d4c9a44056713d946481031cc00b3113b6055af6f4bea0cfd8dcb8a9d5abbf9"
+    },
+    "schema-markup-generator": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "8587b6f64de8274895fb97b94089ca48685e55694828e67f4a2cf2937d5d9e5d"
+    },
+    "seo-content-writer": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "0ac426e7f412aca94496e8dcc599ac11f15e04043605968a7c146293ecac54ca"
+    },
+    "serp-analysis": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "e3d18f4f1514bb1537e8429f7fe011c9f3e666e5a3dcbc1623d9f83e47de59a2"
+    },
     "shadcn": {
       "source": "shadcn/ui",
       "sourceType": "github",
       "computedHash": "b1cc1defe03f0e8f78652b7d20e4d4e6421babc2b45835d4f361afe684dc47a2"
     },
+    "technical-seo-checker": {
+      "source": "aaron-he-zhu/seo-geo-claude-skills",
+      "sourceType": "github",
+      "computedHash": "a9bf73872ed4ec682f7d91d0a6528bdc91806571d3c47b3c8046eee5106dc791"
+    },
     "vercel-react-best-practices": {
       "source": "vercel-labs/agent-skills",
       "sourceType": "github",
diff --git a/surfsense_backend/.dockerignore b/surfsense_backend/.dockerignore
index 8668ee72bf..2ecdb36889 100644
--- a/surfsense_backend/.dockerignore
+++ b/surfsense_backend/.dockerignore
@@ -24,4 +24,6 @@ wheels/
 *.egg
 .pytest_cache/
 .coverage
-htmlcov/ 
\ No newline at end of file
+htmlcov/
+
+tests/
\ No newline at end of file
diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example
index 3899c99f79..4b72e91536 100644
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@@ -53,6 +53,17 @@ STRIPE_PRICE_ID=price_...
 STRIPE_PAGES_PER_UNIT=1000
 # Set FALSE to disable new checkout session creation temporarily
 STRIPE_PAGE_BUYING_ENABLED=TRUE
+
+# Premium credit purchases via Stripe (for premium-tier model usage).
+# Each pack grants STRIPE_CREDIT_MICROS_PER_UNIT micro-USD of credit
+# (default 1_000_000 = $1.00). Premium turns are billed at the actual
+# per-call provider cost reported by LiteLLM.
+STRIPE_TOKEN_BUYING_ENABLED=FALSE
+STRIPE_PREMIUM_TOKEN_PRICE_ID=price_...
+STRIPE_CREDIT_MICROS_PER_UNIT=1000000
+# DEPRECATED — use STRIPE_CREDIT_MICROS_PER_UNIT (1:1 numerical mapping):
+# STRIPE_TOKENS_PER_UNIT=1000000
+
 # Periodic Stripe safety net for purchases left in PENDING (minutes old)
 STRIPE_RECONCILIATION_LOOKBACK_MINUTES=10
 # Max pending purchases to check per reconciliation run
@@ -183,6 +194,60 @@ VIDEO_PRESENTATION_DEFAULT_DURATION_IN_FRAMES=300
 # (Optional) Maximum pages limit per user for ETL services (default: `999999999` for unlimited in OSS version)  
 PAGES_LIMIT=500
 
+# Premium credit quota per registered user, in micro-USD
+# (default: 5,000,000 == $5.00 of credit). Premium turns are debited at the
+# actual per-call provider cost reported by LiteLLM, so cheap and expensive
+# models bill proportionally. Applies only to models with
+# billing_tier=premium in global_llm_config.yaml.
+PREMIUM_CREDIT_MICROS_LIMIT=5000000
+# DEPRECATED — use PREMIUM_CREDIT_MICROS_LIMIT (1:1 numerical mapping):
+# PREMIUM_TOKEN_LIMIT=5000000
+
+# Safety ceiling on per-call premium reservation, in micro-USD.
+# stream_new_chat estimates an upper-bound cost from the model's
+# litellm-published per-token rates × the config's quota_reserve_tokens
+# and clamps to this value so a misconfigured model can't lock the
+# user's whole balance on one call. Default $1.00.
+QUOTA_MAX_RESERVE_MICROS=1000000
+
+# Per-image reservation (in micro-USD) for the POST /image-generations
+# endpoint. Bypassed for free configs. Default $0.05.
+QUOTA_DEFAULT_IMAGE_RESERVE_MICROS=50000
+
+# Per-podcast reservation (in micro-USD) used by the podcast Celery task.
+# Single envelope covers one transcript-generation LLM call. Default $0.20.
+QUOTA_DEFAULT_PODCAST_RESERVE_MICROS=200000
+
+# Per-video-presentation reservation (in micro-USD) used by the video
+# presentation Celery task. Covers worst-case fan-out of N slide-scene
+# generations + refines. Default $1.00. NOTE: tasks using the override
+# path bypass the QUOTA_MAX_RESERVE_MICROS clamp — raise with care.
+QUOTA_DEFAULT_VIDEO_PRESENTATION_RESERVE_MICROS=1000000
+
+# No-login (anonymous) mode — allows public users to chat without an account
+# Set TRUE to enable /free pages and anonymous chat API
+NOLOGIN_MODE_ENABLED=FALSE
+# Total tokens allowed per anonymous session before requiring account creation
+ANON_TOKEN_LIMIT=500000
+# Token count at which the UI shows a soft warning
+ANON_TOKEN_WARNING_THRESHOLD=400000
+# Days before anonymous quota tracking expires in Redis
+ANON_TOKEN_QUOTA_TTL_DAYS=30
+# Max document upload size for anonymous users (MB)
+ANON_MAX_UPLOAD_SIZE_MB=5
+# Maximum tokens to reserve per LLM call for quota enforcement (safety cap)
+QUOTA_MAX_RESERVE_PER_CALL=8000
+
+# Abuse prevention: max concurrent anonymous streams per IP (default: 2)
+ANON_MAX_CONCURRENT_STREAMS=2
+# Number of chat requests per IP before Turnstile CAPTCHA is required (default: 5)
+ANON_CAPTCHA_REQUEST_THRESHOLD=5
+
+# Cloudflare Turnstile CAPTCHA (https://dash.cloudflare.com/ -> Turnstile)
+# Set TURNSTILE_ENABLED=TRUE and provide keys to activate CAPTCHA for anonymous chat
+TURNSTILE_ENABLED=FALSE
+TURNSTILE_SECRET_KEY=
+
 
 # Residential Proxy Configuration (anonymous-proxies.net)
 # Used for web crawling, link previews, and YouTube transcript fetching to avoid IP bans.
@@ -203,8 +268,87 @@ LLAMA_CLOUD_API_KEY=llx-nnn
 # AZURE_DI_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
 # AZURE_DI_KEY=your-key
 
+# Daytona Sandbox (isolated code execution)
+# DAYTONA_SANDBOX_ENABLED=FALSE
+# DAYTONA_API_KEY=your-daytona-api-key
+# DAYTONA_API_URL=https://app.daytona.io/api
+# DAYTONA_TARGET=us
+# DAYTONA_SNAPSHOT_ID=
+
+# Desktop local filesystem mode (chat file tools run against a local folder root)
+# ENABLE_DESKTOP_LOCAL_FILESYSTEM=FALSE
+
 # OPTIONAL: Add these for LangSmith Observability
 LANGSMITH_TRACING=true
 LANGSMITH_ENDPOINT=https://api.smith.langchain.com
 LANGSMITH_API_KEY=lsv2_pt_.....
 LANGSMITH_PROJECT=surfsense
+
+
+# =============================================================================
+# OPTIONAL: New-chat agent feature flags
+# =============================================================================
+# Multi-agent orchestrator switch for authenticated chat streaming.
+# MULTI_AGENT_CHAT_ENABLED=false
+
+# Master kill-switch — when true, every flag below is forced OFF.
+# SURFSENSE_DISABLE_NEW_AGENT_STACK=false
+
+# Agent quality
+# SURFSENSE_ENABLE_CONTEXT_EDITING=false
+# SURFSENSE_ENABLE_COMPACTION_V2=false
+# SURFSENSE_ENABLE_RETRY_AFTER=false
+# SURFSENSE_ENABLE_MODEL_FALLBACK=false
+# SURFSENSE_ENABLE_MODEL_CALL_LIMIT=false
+# SURFSENSE_ENABLE_TOOL_CALL_LIMIT=false
+# SURFSENSE_ENABLE_TOOL_CALL_REPAIR=false
+# SURFSENSE_ENABLE_DOOM_LOOP=false   # leave OFF until UI handles permission='doom_loop'
+
+# Safety
+# SURFSENSE_ENABLE_PERMISSION=false
+# SURFSENSE_ENABLE_BUSY_MUTEX=false
+# SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false   # adds a per-turn LLM call
+
+# Observability — OTel (also requires OTEL_EXPORTER_OTLP_ENDPOINT)
+# SURFSENSE_ENABLE_OTEL=false
+
+# Skills + subagents
+# SURFSENSE_ENABLE_SKILLS=false
+# SURFSENSE_ENABLE_SPECIALIZED_SUBAGENTS=false
+# SURFSENSE_ENABLE_KB_PLANNER_RUNNABLE=false
+
+# Snapshot / revert
+# SURFSENSE_ENABLE_ACTION_LOG=false
+# SURFSENSE_ENABLE_REVERT_ROUTE=false        # Backend-only; flip when UI ships
+
+# Plugins
+# SURFSENSE_ENABLE_PLUGIN_LOADER=false
+# Comma-separated allowlist of plugin entry-point names
+# SURFSENSE_ALLOWED_PLUGINS=year_substituter
+
+# -----------------------------------------------------------------------------
+# Compiled-agent cache (Phase 1 + 2 perf optimization, default ON)
+# -----------------------------------------------------------------------------
+# When ON, the per-turn LangGraph + middleware compile result (~3-5s of CPU
+# on a cold turn) is reused across subsequent turns on the same thread,
+# collapsing it to a microsecond hash lookup. All connector tools acquire
+# their own short-lived DB session per call (Phase 2 refactor) so a cached
+# closure is safe to share across requests. Flip OFF only as a last-resort
+# rollback if you suspect cache-related staleness.
+# SURFSENSE_ENABLE_AGENT_CACHE=true
+
+# Cache capacity (max number of compiled-agent entries kept in memory)
+# and TTL per entry (seconds). Working set is typically one entry per
+# active thread on this replica; tune up for very large deployments.
+# SURFSENSE_AGENT_CACHE_MAXSIZE=256
+# SURFSENSE_AGENT_CACHE_TTL_SECONDS=1800
+
+# -----------------------------------------------------------------------------
+# Connector discovery TTL cache (Phase 1.4 perf optimization)
+# -----------------------------------------------------------------------------
+# Caches the per-search-space "available connectors" + "available document
+# types" lookups that ``create_surfsense_deep_agent`` hits on every turn.
+# ORM event listeners auto-invalidate on connector / document inserts,
+# updates and deletes — the TTL only bounds staleness for bulk-import
+# paths that bypass the ORM. Set to 0 to disable the cache.
+# SURFSENSE_CONNECTOR_DISCOVERY_TTL_SECONDS=30
diff --git a/surfsense_backend/.gitignore b/surfsense_backend/.gitignore
index 1cd7fd32c5..47fd53aef1 100644
--- a/surfsense_backend/.gitignore
+++ b/surfsense_backend/.gitignore
@@ -13,5 +13,5 @@ celerybeat-schedule*
 celerybeat-schedule.*
 celerybeat-schedule.dir
 celerybeat-schedule.bak
-global_llm_config.yaml
+/app/config/global_llm_config.yaml
 app/templates/_generated/
\ No newline at end of file
diff --git a/surfsense_backend/Dockerfile b/surfsense_backend/Dockerfile
index 1222b36b67..0c783f403d 100644
--- a/surfsense_backend/Dockerfile
+++ b/surfsense_backend/Dockerfile
@@ -1,8 +1,16 @@
-FROM python:3.12-slim
+# =============================================================================
+# SurfSense Backend — Multi-stage Dockerfile
+# =============================================================================
+# Graph: base → deps → models → {e2e, production}
+#   e2e        — tests/ via additional_contexts (docker-compose.e2e.yml)
+#   production — published ghcr.io image (docker-build.yml pins target)
+# =============================================================================
+
+# ─── Stage 1: base (system deps, Pandoc, certificates) ──────────────────────
+FROM python:3.12-slim AS base
 
 WORKDIR /app
 
-# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
 RUN apt-get update && apt-get install -y --no-install-recommends \
     gcc \
     python3-dev \
@@ -11,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     wget \
     unzip \
     gnupg2 \
+    ffmpeg \
     espeak-ng \
     libsndfile1 \
     libgl1 \
@@ -22,71 +31,83 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
     && rm -rf /var/lib/apt/lists/*
 
-# Install Pandoc 3.x from GitHub as a fallback for Linux where pypandoc_binary
-# may not bundle pandoc (apt ships 2.17 which has broken table rendering).
-# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks this up.
+RUN which ffmpeg && ffmpeg -version
+
+# Pandoc 3.x from GitHub Releases — apt ships 2.17 which has broken table rendering.
+# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks up this binary.
 RUN ARCH=$(dpkg --print-architecture) && \
     wget -qO /tmp/pandoc.deb "https://github.com/jgm/pandoc/releases/download/3.9/pandoc-3.9-1-${ARCH}.deb" && \
     dpkg -i /tmp/pandoc.deb && \
     rm /tmp/pandoc.deb
 
-# Update certificates and install SSL tools
 RUN update-ca-certificates
 RUN pip install --upgrade certifi pip-system-certs
 
-# Copy requirements
-COPY pyproject.toml .
-COPY uv.lock .
-
-# Install PyTorch based on architecture
-RUN if [ "$(uname -m)" = "x86_64" ]; then \
-        pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \
-    else \
-        pip install --no-cache-dir torch torchvision torchaudio; \
-    fi
-
-# Install python dependencies
-RUN pip install --no-cache-dir uv && \
-    uv pip install --system --no-cache-dir -e .
-
-# Set SSL environment variables dynamically
-RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
-    echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
-    echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
-    echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
 ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
 ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+ENV SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD=FALSE
+
+
+# ─── Stage 2: deps (Python deps frozen from uv.lock) ────────────────────────
+FROM base AS deps
+
+COPY pyproject.toml uv.lock ./
+
+# Install all Python dependencies from uv.lock for deterministic builds.
+#
+# `uv pip install -e .` re-resolves from pyproject.toml and ignores uv.lock,
+# which lets prod silently drift to newer upstream versions on every rebuild
+# (e.g. deepagents 0.4.x -> 0.5.x breaking the FilesystemMiddleware imports).
+# Exporting the lock to requirements.txt and feeding it to `uv pip install`
+# pins every transitive package to the exact version captured in uv.lock.
+#
+# Note on torch/CUDA: we do NOT install torch from a separate cu* index here.
+# PyPI's torch wheels for Linux x86_64 already ship CUDA-enabled and pull
+# nvidia-cudnn-cu13, nvidia-nccl-cu13, triton, etc. as install deps (all
+# captured in uv.lock). If a specific CUDA version is needed, wire it through
+# [tool.uv.sources] in pyproject.toml so the lock stays the source of truth.
+RUN pip install --no-cache-dir uv && \
+    uv export --frozen --no-dev --no-hashes --no-emit-project \
+        --format requirements-txt -o /tmp/requirements.txt && \
+    uv pip install --system --no-cache-dir -r /tmp/requirements.txt && \
+    rm /tmp/requirements.txt
+
+
+# ─── Stage 3: models (pre-baked offline assets) ─────────────────────────────
+FROM deps AS models
 
 # Pre-download EasyOCR models to avoid runtime SSL issues
-RUN mkdir -p /root/.EasyOCR/model
-RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
-RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
-RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)
+RUN mkdir -p /root/.EasyOCR/model && \
+    wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip      -O /root/.EasyOCR/model/english_g2.zip      || true && \
+    wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true && \
+    cd /root/.EasyOCR/model && \
+    (unzip -o english_g2.zip || true) && \
+    (unzip -o craft_mlt_25k.zip || true)
 
 # Pre-download Docling models
-RUN python -c "try:\n    from docling.document_converter import DocumentConverter\n    conv = DocumentConverter()\nexcept:\n    pass" || true
+RUN printf '%s\n' \
+    'try:' \
+    '    from docling.document_converter import DocumentConverter' \
+    '    DocumentConverter()' \
+    'except Exception:' \
+    '    pass' \
+    | python || true
 
-# Install Playwright browsers for web scraping if needed
-RUN pip install playwright && \
-    playwright install chromium --with-deps
+ARG EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+RUN python -c "from chonkie import AutoEmbeddings; AutoEmbeddings.get_embeddings('${EMBEDDING_MODEL}')"
 
-# Copy source code
-COPY . .
-
-# Copy and set permissions for entrypoint script
-# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
-COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
-RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
+# Install Playwright browsers (the playwright python package itself is in deps)
+RUN playwright install chromium --with-deps
 
 # Shared temp directory for file uploads between API and Worker containers.
 # Python's tempfile module uses TMPDIR, so uploaded files land here.
 # Mount the SAME volume at /shared_tmp on both API and Worker in Coolify.
 RUN mkdir -p /shared_tmp
-ENV TMPDIR=/shared_tmp
 
-# Prevent uvloop compatibility issues
 ENV PYTHONPATH=/app
 ENV UVICORN_LOOP=asyncio
+ENV TMPDIR=/shared_tmp
+ENV PYTHONUNBUFFERED=1
 
 # Tune glibc malloc to return freed memory to the OS more aggressively.
 # Without these, Python's gc.collect() frees objects but the underlying
@@ -95,11 +116,65 @@ ENV MALLOC_MMAP_THRESHOLD_=65536
 ENV MALLOC_TRIM_THRESHOLD_=131072
 ENV MALLOC_MMAP_MAX_=65536
 
+
+# ─── Stage 4: e2e (production source + tests/ + e2e entrypoint) ─────────────
+# Built via `docker buildx build --target e2e`. The default build target is
+# `production` (the last stage), so this stage is opt-in for CI only.
+#
+# `tests/` is excluded from the main build context by .dockerignore (so prod
+# can never accidentally ship test fakes). The e2e stage receives tests/
+# through an "additional context" passed by docker-compose.e2e.yml — see
+# https://docs.docker.com/reference/compose-file/build/#additional_contexts
+FROM models AS e2e
+
+# Same source copy as production. .dockerignore filters out tests/.
+COPY . .
+
+# Bring tests/ in via the named additional build context. CI passes
+#   --build-context tests-source=./tests
+# (or the equivalent additional_contexts entry in docker-compose.e2e.yml).
+COPY --from=tests-source . ./tests/
+
+# Install the project itself in editable mode. Dependencies were already
+# installed deterministically from uv.lock above, so --no-deps prevents any
+# re-resolution that could pull newer versions.
+RUN uv pip install --system --no-cache-dir --no-deps -e .
+
+COPY scripts/docker/entrypoint.e2e.sh /app/scripts/docker/entrypoint.e2e.sh
+RUN dos2unix /app/scripts/docker/entrypoint.e2e.sh && chmod +x /app/scripts/docker/entrypoint.e2e.sh
+
+# SERVICE_ROLE is overridden per service in docker-compose.e2e.yml (api / worker).
+ENV SERVICE_ROLE=api
+
+EXPOSE 8000-8001
+CMD ["/app/scripts/docker/entrypoint.e2e.sh"]
+
+
+# ─── Stage 5: production (published ghcr.io image) ──────────────────────────
+# CI pins `target: production`; also the default for `docker build` / dev compose.
+FROM models AS production
+
+# Copy source code (tests/ excluded by .dockerignore — production never ships tests).
+COPY . .
+
+# Install the project itself in editable mode. Dependencies were already
+# installed deterministically from uv.lock above, so --no-deps prevents any
+# re-resolution that could pull newer versions.
+RUN uv pip install --system --no-cache-dir --no-deps -e .
+
+# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
+COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
+RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
+
 # SERVICE_ROLE controls which process this container runs:
-#   api     – FastAPI backend only (runs migrations on startup)
+#   migrate – Run alembic upgrade head, verify zero_publication exists, exit 0.
+#             Used by the dedicated `migrations` service in docker-compose.yml
+#             so downstream services gate on `service_completed_successfully`.
+#   api     – FastAPI backend only (does NOT run migrations)
 #   worker  – Celery worker only
 #   beat    – Celery beat scheduler only
-#   all     – All three (legacy / dev default)
+#   all     – migrations + api + worker + beat (legacy / dev default;
+#             fails fast on migration error)
 ENV SERVICE_ROLE=all
 
 # Celery worker tuning (only used when SERVICE_ROLE=worker or all)
@@ -112,6 +187,5 @@ ENV CELERY_MAX_TASKS_PER_CHILD=50
 #   ""                       – both queues (default, for single-worker setups)
 ENV CELERY_QUEUES=""
 
-# Run
 EXPOSE 8000-8001
-CMD ["/app/scripts/docker/entrypoint.sh"]
\ No newline at end of file
+CMD ["/app/scripts/docker/entrypoint.sh"]
diff --git a/surfsense_backend/alembic/env.py b/surfsense_backend/alembic/env.py
index bd8c203560..5354211aa7 100644
--- a/surfsense_backend/alembic/env.py
+++ b/surfsense_backend/alembic/env.py
@@ -67,7 +67,11 @@ def run_migrations_offline() -> None:
 
 
 def do_run_migrations(connection: Connection) -> None:
-    context.configure(connection=connection, target_metadata=target_metadata)
+    context.configure(
+        connection=connection,
+        target_metadata=target_metadata,
+        transaction_per_migration=True,
+    )
 
     with context.begin_transaction():
         context.run_migrations()
diff --git a/surfsense_backend/alembic/versions/116_create_zero_publication.py b/surfsense_backend/alembic/versions/116_create_zero_publication.py
index ff74952a96..927673c353 100644
--- a/surfsense_backend/alembic/versions/116_create_zero_publication.py
+++ b/surfsense_backend/alembic/versions/116_create_zero_publication.py
@@ -5,6 +5,17 @@
 
 See: https://zero.rocicorp.dev/docs/zero-cache-config#app-publications
 
+NOTE for future migration authors: this is the ONLY migration allowed
+to use bare ``CREATE PUBLICATION``. All subsequent mutations of
+``zero_publication`` MUST use the ``COMMENT ON PUBLICATION`` bookend
+pattern wrapping an ``ALTER PUBLICATION ... SET TABLE`` -- copy the
+``upgrade()`` function from migration
+``143_force_zero_publication_resync.py`` as your starting template.
+Raw ``DROP``/``CREATE PUBLICATION`` in new migrations would
+re-introduce bug #1355 (zero-cache stuck on a stale replica snapshot
+because Zero >= 1.0's change-streamer never sees the schema-change
+event).
+
 Revision ID: 116
 Revises: 115
 """
diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
index 78a26a381b..c21ed2bf0a 100644
--- a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
+++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
@@ -17,6 +17,16 @@
   3. Delete / reset the zero-cache data volume
   4. Restart zero-cache  (it will do a fresh initial sync)
 
+DO NOT COPY THIS PATTERN. The ``DROP PUBLICATION`` + ``CREATE
+PUBLICATION`` dance below is the pre-#1355 anti-pattern: on Zero >=
+1.0 it does not reliably wake the zero-cache change-streamer and can
+leave the replica pinned to a stale snapshot. This file is
+grandfathered in because it has already shipped to users; new
+publication mutations MUST use the ``COMMENT ON PUBLICATION`` bookend
+pattern wrapping an ``ALTER PUBLICATION ... SET TABLE`` -- copy the
+``upgrade()`` function from migration
+``143_force_zero_publication_resync.py`` as your starting template.
+
 Revision ID: 117
 Revises: 116
 """
@@ -79,40 +89,44 @@ def _terminate_blocked_pids(conn, table: str) -> None:
 
 def upgrade() -> None:
     conn = op.get_bind()
-
-    conn.execute(sa.text("SET lock_timeout = '10s'"))
-
-    for tbl in sorted(TABLES_WITH_FULL_IDENTITY):
-        _terminate_blocked_pids(conn, tbl)
-        conn.execute(sa.text(f'LOCK TABLE "{tbl}" IN ACCESS EXCLUSIVE MODE'))
-
-    for tbl in TABLES_WITH_FULL_IDENTITY:
-        conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT'))
-
-    conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
-
-    has_zero_ver = conn.execute(
-        sa.text(
-            "SELECT 1 FROM information_schema.columns "
-            "WHERE table_name = 'documents' AND column_name = '_0_version'"
+    # asyncpg requires LOCK TABLE inside a transaction block. Alembic already
+    # opened one via context.begin_transaction(), but the driver still errors
+    # unless we use an explicit SAVEPOINT (nested transaction) for this block.
+    tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
+    with tx:
+        conn.execute(sa.text("SET lock_timeout = '10s'"))
+
+        for tbl in sorted(TABLES_WITH_FULL_IDENTITY):
+            _terminate_blocked_pids(conn, tbl)
+            conn.execute(sa.text(f'LOCK TABLE "{tbl}" IN ACCESS EXCLUSIVE MODE'))
+
+        for tbl in TABLES_WITH_FULL_IDENTITY:
+            conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT'))
+
+        conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+
+        has_zero_ver = conn.execute(
+            sa.text(
+                "SELECT 1 FROM information_schema.columns "
+                "WHERE table_name = 'documents' AND column_name = '_0_version'"
+            )
+        ).fetchone()
+
+        cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else [])
+        col_list = ", ".join(cols)
+
+        conn.execute(
+            sa.text(
+                f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
+                f"notifications, "
+                f"documents ({col_list}), "
+                f"folders, "
+                f"search_source_connectors, "
+                f"new_chat_messages, "
+                f"chat_comments, "
+                f"chat_session_state"
+            )
         )
-    ).fetchone()
-
-    cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else [])
-    col_list = ", ".join(cols)
-
-    conn.execute(
-        sa.text(
-            f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
-            f"notifications, "
-            f"documents ({col_list}), "
-            f"folders, "
-            f"search_source_connectors, "
-            f"new_chat_messages, "
-            f"chat_comments, "
-            f"chat_session_state"
-        )
-    )
 
 
 def downgrade() -> None:
diff --git a/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py
index 1fef9fbcb6..1dce24e561 100644
--- a/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py
+++ b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py
@@ -1,5 +1,16 @@
 """Add LOCAL_FOLDER_FILE document type, folder metadata, and document_versions table
 
+DO NOT COPY THIS PATTERN. The bare ``ALTER PUBLICATION ... ADD/DROP
+TABLE`` calls below pre-date the ``COMMENT ON PUBLICATION`` bookend
+fix for bug #1355: on Zero >= 1.0 they do not reliably wake the
+zero-cache change-streamer and can leave the replica pinned to a
+stale snapshot. This file is grandfathered in because it has already
+shipped to users; new publication mutations MUST use the
+``COMMENT ON PUBLICATION`` bookend pattern wrapping an
+``ALTER PUBLICATION ... SET TABLE`` -- copy the ``upgrade()`` function
+from migration ``143_force_zero_publication_resync.py`` as your
+starting template.
+
 Revision ID: 118
 Revises: 117
 """
diff --git a/surfsense_backend/alembic/versions/121_add_memory_md_columns.py b/surfsense_backend/alembic/versions/121_add_memory_md_columns.py
new file mode 100644
index 0000000000..ac248dfcab
--- /dev/null
+++ b/surfsense_backend/alembic/versions/121_add_memory_md_columns.py
@@ -0,0 +1,50 @@
+"""Add memory_md columns to user and searchspaces tables
+
+Revision ID: 121
+Revises: 120
+
+Changes:
+1. Add memory_md TEXT column to user table (personal memory)
+2. Add shared_memory_md TEXT column to searchspaces table (team memory)
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "121"
+down_revision: str | None = "120"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    # Idempotent: column(s) may already exist after a failed run or manual DDL.
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_schema = 'public' AND table_name = 'user'
+                  AND column_name = 'memory_md'
+            ) THEN
+                ALTER TABLE "user" ADD COLUMN memory_md TEXT DEFAULT '';
+            END IF;
+            IF NOT EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_schema = 'public' AND table_name = 'searchspaces'
+                  AND column_name = 'shared_memory_md'
+            ) THEN
+                ALTER TABLE searchspaces ADD COLUMN shared_memory_md TEXT DEFAULT '';
+            END IF;
+        END$$;
+        """
+    )
+
+
+def downgrade() -> None:
+    op.execute("ALTER TABLE searchspaces DROP COLUMN IF EXISTS shared_memory_md")
+    op.execute('ALTER TABLE "user" DROP COLUMN IF EXISTS memory_md')
diff --git a/surfsense_backend/alembic/versions/122_migrate_and_drop_old_memory_tables.py b/surfsense_backend/alembic/versions/122_migrate_and_drop_old_memory_tables.py
new file mode 100644
index 0000000000..b8ddddb9cb
--- /dev/null
+++ b/surfsense_backend/alembic/versions/122_migrate_and_drop_old_memory_tables.py
@@ -0,0 +1,247 @@
+"""Migrate row-per-fact memories to markdown, then drop legacy tables
+
+Revision ID: 122
+Revises: 121
+
+Converts user_memories rows into per-user markdown documents stored in
+user.memory_md, and shared_memories rows into per-search-space markdown
+stored in searchspaces.shared_memory_md.  Then drops the old tables and
+the memorycategory enum.
+
+The markdown format matches the new memory system:
+  ## Heading
+  - (YYYY-MM-DD) [fact|pref|instr] memory text
+"""
+
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from collections.abc import Sequence
+from uuid import UUID
+
+import sqlalchemy as sa
+from sqlalchemy import inspect as sa_inspect
+
+from alembic import op
+from app.config import config
+
+logger = logging.getLogger(__name__)
+
+revision: str = "122"
+down_revision: str | None = "121"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+EMBEDDING_DIM = config.embedding_model_instance.dimension
+
+_CATEGORY_TO_MARKER = {
+    "fact": "fact",
+    "context": "fact",
+    "preference": "pref",
+    "instruction": "instr",
+}
+
+_CATEGORY_HEADING = {
+    "fact": "Facts",
+    "preference": "Preferences",
+    "instruction": "Instructions",
+    "context": "Context",
+}
+
+_HEADING_ORDER = ["fact", "preference", "instruction", "context"]
+
+
+def _build_markdown(rows: list[tuple]) -> str:
+    """Build a markdown document from (memory_text, category, created_at) rows."""
+    by_category: dict[str, list[str]] = defaultdict(list)
+
+    for memory_text, category, created_at in rows:
+        cat = str(category)
+        marker = _CATEGORY_TO_MARKER.get(cat, "fact")
+        date_str = created_at.strftime("%Y-%m-%d")
+        clean_text = str(memory_text).replace("\n", " ").strip()
+        bullet = f"- ({date_str}) [{marker}] {clean_text}"
+        by_category[cat].append(bullet)
+
+    sections: list[str] = []
+    for cat in _HEADING_ORDER:
+        if cat in by_category:
+            heading = _CATEGORY_HEADING[cat]
+            sections.append(f"## {heading}")
+            sections.extend(by_category[cat])
+            sections.append("")
+
+    return "\n".join(sections).strip() + "\n"
+
+
+def _migrate_user_memories(conn: sa.engine.Connection) -> None:
+    """Convert user_memories rows → user.memory_md grouped by user_id."""
+    rows = conn.execute(
+        sa.text(
+            "SELECT user_id, memory_text, category::text, created_at "
+            "FROM user_memories ORDER BY created_at"
+        )
+    ).fetchall()
+
+    if not rows:
+        logger.info("user_memories is empty, skipping data migration.")
+        return
+
+    by_user: dict[UUID, list[tuple]] = defaultdict(list)
+    for user_id, memory_text, category, created_at in rows:
+        by_user[user_id].append((memory_text, category, created_at))
+
+    migrated = 0
+    for uid, user_rows in by_user.items():
+        existing = conn.execute(
+            sa.text('SELECT memory_md FROM "user" WHERE id = :uid'),
+            {"uid": uid},
+        ).scalar()
+
+        if existing and existing.strip():
+            logger.info("User %s already has memory_md, skipping.", uid)
+            continue
+
+        markdown = _build_markdown(user_rows)
+        conn.execute(
+            sa.text('UPDATE "user" SET memory_md = :md WHERE id = :uid'),
+            {"md": markdown, "uid": uid},
+        )
+        migrated += 1
+
+    logger.info("Migrated user_memories for %d user(s).", migrated)
+
+
+def _migrate_shared_memories(conn: sa.engine.Connection) -> None:
+    """Convert shared_memories rows → searchspaces.shared_memory_md."""
+    rows = conn.execute(
+        sa.text(
+            "SELECT search_space_id, memory_text, category::text, created_at "
+            "FROM shared_memories ORDER BY created_at"
+        )
+    ).fetchall()
+
+    if not rows:
+        logger.info("shared_memories is empty, skipping data migration.")
+        return
+
+    by_space: dict[int, list[tuple]] = defaultdict(list)
+    for search_space_id, memory_text, category, created_at in rows:
+        by_space[search_space_id].append((memory_text, category, created_at))
+
+    migrated = 0
+    for space_id, space_rows in by_space.items():
+        existing = conn.execute(
+            sa.text("SELECT shared_memory_md FROM searchspaces WHERE id = :sid"),
+            {"sid": space_id},
+        ).scalar()
+
+        if existing and existing.strip():
+            logger.info(
+                "Search space %s already has shared_memory_md, skipping.", space_id
+            )
+            continue
+
+        markdown = _build_markdown(space_rows)
+        conn.execute(
+            sa.text("UPDATE searchspaces SET shared_memory_md = :md WHERE id = :sid"),
+            {"md": markdown, "sid": space_id},
+        )
+        migrated += 1
+
+    logger.info("Migrated shared_memories for %d search space(s).", migrated)
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    inspector = sa_inspect(conn)
+    tables = inspector.get_table_names()
+
+    if "user_memories" in tables:
+        _migrate_user_memories(conn)
+
+    if "shared_memories" in tables:
+        _migrate_shared_memories(conn)
+
+    op.execute("DROP TABLE IF EXISTS shared_memories CASCADE;")
+    op.execute("DROP TABLE IF EXISTS user_memories CASCADE;")
+    op.execute("DROP TYPE IF EXISTS memorycategory;")
+
+
+def downgrade() -> None:
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'memorycategory') THEN
+                CREATE TYPE memorycategory AS ENUM (
+                    'preference',
+                    'fact',
+                    'instruction',
+                    'context'
+                );
+            END IF;
+        END$$;
+        """
+    )
+
+    op.execute(
+        f"""
+        CREATE TABLE IF NOT EXISTS user_memories (
+            id SERIAL PRIMARY KEY,
+            created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+            user_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE,
+            search_space_id INTEGER REFERENCES searchspaces(id) ON DELETE CASCADE,
+            memory_text TEXT NOT NULL,
+            category memorycategory NOT NULL DEFAULT 'fact',
+            embedding vector({EMBEDDING_DIM}),
+            updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+        );
+        """
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_user_memories_user_id ON user_memories(user_id);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_user_memories_search_space_id ON user_memories(search_space_id);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_user_memories_updated_at ON user_memories(updated_at);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_user_memories_category ON user_memories(category);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_user_memories_user_search_space ON user_memories(user_id, search_space_id);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS user_memories_vector_index ON user_memories USING hnsw (embedding public.vector_cosine_ops);"
+    )
+
+    op.execute(
+        f"""
+        CREATE TABLE IF NOT EXISTS shared_memories (
+            id SERIAL PRIMARY KEY,
+            created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+            updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+            search_space_id INTEGER NOT NULL REFERENCES searchspaces(id) ON DELETE CASCADE,
+            created_by_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE,
+            memory_text TEXT NOT NULL,
+            category memorycategory NOT NULL DEFAULT 'fact',
+            embedding vector({EMBEDDING_DIM})
+        );
+        """
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_shared_memories_search_space_id ON shared_memories(search_space_id);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_shared_memories_updated_at ON shared_memories(updated_at);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_shared_memories_created_by_id ON shared_memories(created_by_id);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS shared_memories_vector_index ON shared_memories USING hnsw (embedding public.vector_cosine_ops);"
+    )
diff --git a/surfsense_backend/alembic/versions/123_add_enable_vision_llm_to_connectors.py b/surfsense_backend/alembic/versions/123_add_enable_vision_llm_to_connectors.py
new file mode 100644
index 0000000000..353e0680e8
--- /dev/null
+++ b/surfsense_backend/alembic/versions/123_add_enable_vision_llm_to_connectors.py
@@ -0,0 +1,45 @@
+"""123_add_enable_vision_llm_to_connectors
+
+Revision ID: 123
+Revises: 122
+Create Date: 2026-04-09
+
+Adds enable_vision_llm boolean column to search_source_connectors.
+Defaults to False so vision LLM image processing is opt-in.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "123"
+down_revision: str | None = "122"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    existing_columns = [
+        col["name"] for col in sa.inspect(conn).get_columns("search_source_connectors")
+    ]
+
+    if "enable_vision_llm" not in existing_columns:
+        op.add_column(
+            "search_source_connectors",
+            sa.Column(
+                "enable_vision_llm",
+                sa.Boolean(),
+                nullable=False,
+                server_default=sa.text("false"),
+            ),
+        )
+
+
+def downgrade() -> None:
+    op.drop_column("search_source_connectors", "enable_vision_llm")
diff --git a/surfsense_backend/alembic/versions/124_add_ai_file_sort_enabled.py b/surfsense_backend/alembic/versions/124_add_ai_file_sort_enabled.py
new file mode 100644
index 0000000000..b77eb9337f
--- /dev/null
+++ b/surfsense_backend/alembic/versions/124_add_ai_file_sort_enabled.py
@@ -0,0 +1,44 @@
+"""124_add_ai_file_sort_enabled
+
+Revision ID: 124
+Revises: 123
+Create Date: 2026-04-14
+
+Adds ai_file_sort_enabled boolean column to searchspaces.
+Defaults to False so AI file sorting is opt-in per search space.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "124"
+down_revision: str | None = "123"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    existing_columns = [
+        col["name"] for col in sa.inspect(conn).get_columns("searchspaces")
+    ]
+
+    if "ai_file_sort_enabled" not in existing_columns:
+        op.add_column(
+            "searchspaces",
+            sa.Column(
+                "ai_file_sort_enabled",
+                sa.Boolean(),
+                nullable=False,
+                server_default=sa.text("false"),
+            ),
+        )
+
+
+def downgrade() -> None:
+    op.drop_column("searchspaces", "ai_file_sort_enabled")
diff --git a/surfsense_backend/alembic/versions/125_add_token_usage_table.py b/surfsense_backend/alembic/versions/125_add_token_usage_table.py
new file mode 100644
index 0000000000..915561c8c3
--- /dev/null
+++ b/surfsense_backend/alembic/versions/125_add_token_usage_table.py
@@ -0,0 +1,85 @@
+"""125_add_token_usage_table
+
+Revision ID: 125
+Revises: 124
+Create Date: 2026-04-14
+
+Adds token_usage table for tracking LLM token consumption per message.
+Supports future extension via usage_type for indexing, image gen, etc.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import JSONB, UUID
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "125"
+down_revision: str | None = "124"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    if sa.inspect(conn).has_table("token_usage"):
+        return
+
+    op.create_table(
+        "token_usage",
+        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
+        sa.Column("prompt_tokens", sa.Integer(), nullable=False, server_default="0"),
+        sa.Column(
+            "completion_tokens", sa.Integer(), nullable=False, server_default="0"
+        ),
+        sa.Column("total_tokens", sa.Integer(), nullable=False, server_default="0"),
+        sa.Column("model_breakdown", JSONB, nullable=True),
+        sa.Column("call_details", JSONB, nullable=True),
+        sa.Column("usage_type", sa.String(50), nullable=False, server_default="chat"),
+        sa.Column(
+            "thread_id",
+            sa.Integer(),
+            sa.ForeignKey("new_chat_threads.id", ondelete="CASCADE"),
+            nullable=True,
+        ),
+        sa.Column(
+            "message_id",
+            sa.Integer(),
+            sa.ForeignKey("new_chat_messages.id", ondelete="SET NULL"),
+            nullable=True,
+        ),
+        sa.Column(
+            "search_space_id",
+            sa.Integer(),
+            sa.ForeignKey("searchspaces.id", ondelete="CASCADE"),
+            nullable=False,
+        ),
+        sa.Column(
+            "user_id",
+            UUID(as_uuid=True),
+            sa.ForeignKey("user.id", ondelete="CASCADE"),
+            nullable=False,
+        ),
+        sa.Column(
+            "created_at",
+            sa.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.func.now(),
+        ),
+    )
+
+    op.create_index("ix_token_usage_thread_id", "token_usage", ["thread_id"])
+    op.create_index("ix_token_usage_message_id", "token_usage", ["message_id"])
+    op.create_index(
+        "ix_token_usage_search_space_id", "token_usage", ["search_space_id"]
+    )
+    op.create_index("ix_token_usage_user_id", "token_usage", ["user_id"])
+    op.create_index("ix_token_usage_usage_type", "token_usage", ["usage_type"])
+
+
+def downgrade() -> None:
+    op.drop_table("token_usage")
diff --git a/surfsense_backend/alembic/versions/126_add_premium_token_quota.py b/surfsense_backend/alembic/versions/126_add_premium_token_quota.py
new file mode 100644
index 0000000000..c9ce7a5319
--- /dev/null
+++ b/surfsense_backend/alembic/versions/126_add_premium_token_quota.py
@@ -0,0 +1,166 @@
+"""add premium token quota columns and purchase table
+
+Revision ID: 126
+Revises: 125
+Create Date: 2026-04-15
+
+Adds premium_tokens_limit, premium_tokens_used, premium_tokens_reserved
+to the user table and creates the premium_token_purchases table.
+"""
+
+from __future__ import annotations
+
+import os
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from alembic import op
+
+revision: str = "126"
+down_revision: str | None = "125"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PREMIUM_TOKEN_LIMIT_DEFAULT = os.getenv("PREMIUM_TOKEN_LIMIT", "5000000")
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+
+    # --- User table: add premium token columns if missing ---
+    inspector = sa.inspect(conn)
+    user_columns = {c["name"] for c in inspector.get_columns("user")}
+
+    if "premium_tokens_limit" not in user_columns:
+        op.add_column(
+            "user",
+            sa.Column(
+                "premium_tokens_limit",
+                sa.BigInteger(),
+                nullable=False,
+                server_default=PREMIUM_TOKEN_LIMIT_DEFAULT,
+            ),
+        )
+    if "premium_tokens_used" not in user_columns:
+        op.add_column(
+            "user",
+            sa.Column(
+                "premium_tokens_used",
+                sa.BigInteger(),
+                nullable=False,
+                server_default="0",
+            ),
+        )
+    if "premium_tokens_reserved" not in user_columns:
+        op.add_column(
+            "user",
+            sa.Column(
+                "premium_tokens_reserved",
+                sa.BigInteger(),
+                nullable=False,
+                server_default="0",
+            ),
+        )
+
+    # --- PremiumTokenPurchase enum + table ---
+    enum_exists = conn.execute(
+        sa.text("SELECT 1 FROM pg_type WHERE typname = 'premiumtokenpurchasestatus'")
+    ).fetchone()
+    if not enum_exists:
+        purchase_status_enum = postgresql.ENUM(
+            "PENDING",
+            "COMPLETED",
+            "FAILED",
+            name="premiumtokenpurchasestatus",
+            create_type=False,
+        )
+        purchase_status_enum.create(conn, checkfirst=True)
+
+    if not inspector.has_table("premium_token_purchases"):
+        op.create_table(
+            "premium_token_purchases",
+            sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
+            sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
+            sa.Column(
+                "stripe_checkout_session_id",
+                sa.String(length=255),
+                nullable=False,
+            ),
+            sa.Column(
+                "stripe_payment_intent_id",
+                sa.String(length=255),
+                nullable=True,
+            ),
+            sa.Column("quantity", sa.Integer(), nullable=False),
+            sa.Column("tokens_granted", sa.BigInteger(), nullable=False),
+            sa.Column("amount_total", sa.Integer(), nullable=True),
+            sa.Column("currency", sa.String(length=10), nullable=True),
+            sa.Column(
+                "status",
+                postgresql.ENUM(
+                    "PENDING",
+                    "COMPLETED",
+                    "FAILED",
+                    name="premiumtokenpurchasestatus",
+                    create_type=False,
+                ),
+                nullable=False,
+                server_default=sa.text("'PENDING'::premiumtokenpurchasestatus"),
+            ),
+            sa.Column("completed_at", sa.TIMESTAMP(timezone=True), nullable=True),
+            sa.Column(
+                "created_at",
+                sa.TIMESTAMP(timezone=True),
+                server_default=sa.text("now()"),
+                nullable=False,
+            ),
+            sa.Column(
+                "updated_at",
+                sa.TIMESTAMP(timezone=True),
+                server_default=sa.text("now()"),
+                nullable=False,
+            ),
+            sa.ForeignKeyConstraint(
+                ["user_id"],
+                ["user.id"],
+                ondelete="CASCADE",
+            ),
+            sa.PrimaryKeyConstraint("id"),
+            sa.UniqueConstraint(
+                "stripe_checkout_session_id",
+                name="uq_premium_token_purchases_stripe_checkout_session_id",
+            ),
+        )
+
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_premium_token_purchases_user_id "
+        "ON premium_token_purchases (user_id)"
+    )
+    op.execute(
+        "CREATE UNIQUE INDEX IF NOT EXISTS ix_premium_token_purchases_stripe_session "
+        "ON premium_token_purchases (stripe_checkout_session_id)"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_premium_token_purchases_payment_intent "
+        "ON premium_token_purchases (stripe_payment_intent_id)"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_premium_token_purchases_status "
+        "ON premium_token_purchases (status)"
+    )
+
+
+def downgrade() -> None:
+    op.execute("DROP INDEX IF EXISTS ix_premium_token_purchases_status")
+    op.execute("DROP INDEX IF EXISTS ix_premium_token_purchases_payment_intent")
+    op.execute("DROP INDEX IF EXISTS ix_premium_token_purchases_stripe_session")
+    op.execute("DROP INDEX IF EXISTS ix_premium_token_purchases_user_id")
+    op.execute("DROP TABLE IF EXISTS premium_token_purchases")
+    postgresql.ENUM(name="premiumtokenpurchasestatus").drop(
+        op.get_bind(), checkfirst=True
+    )
+    op.drop_column("user", "premium_tokens_reserved")
+    op.drop_column("user", "premium_tokens_used")
+    op.drop_column("user", "premium_tokens_limit")
diff --git a/surfsense_backend/alembic/versions/127_add_report_content_type.py b/surfsense_backend/alembic/versions/127_add_report_content_type.py
new file mode 100644
index 0000000000..93bf471af9
--- /dev/null
+++ b/surfsense_backend/alembic/versions/127_add_report_content_type.py
@@ -0,0 +1,42 @@
+"""127_add_report_content_type
+
+Revision ID: 127
+Revises: 126
+Create Date: 2026-04-15
+
+Adds content_type column to reports table to distinguish between
+Markdown reports and Typst-based resumes.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "127"
+down_revision: str | None = "126"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    columns = [c["name"] for c in sa.inspect(conn).get_columns("reports")]
+    if "content_type" in columns:
+        return
+    op.add_column(
+        "reports",
+        sa.Column(
+            "content_type",
+            sa.String(20),
+            nullable=False,
+            server_default="markdown",
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("reports", "content_type")
diff --git a/surfsense_backend/alembic/versions/128_seed_build_resume_prompt.py b/surfsense_backend/alembic/versions/128_seed_build_resume_prompt.py
new file mode 100644
index 0000000000..886879a7b3
--- /dev/null
+++ b/surfsense_backend/alembic/versions/128_seed_build_resume_prompt.py
@@ -0,0 +1,43 @@
+"""128_seed_build_resume_prompt
+
+Revision ID: 128
+Revises: 127
+Create Date: 2026-04-15
+
+Seeds the 'Build Resume' default prompt for all existing users.
+New users get it automatically via SYSTEM_PROMPT_DEFAULTS on signup.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "128"
+down_revision: str | None = "127"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    conn.execute(
+        sa.text(
+            """
+            INSERT INTO prompts
+                (user_id, default_prompt_slug, name, prompt, mode, version, is_public, created_at)
+            SELECT u.id, 'build-resume', 'Build Resume',
+                   E'Build me a professional resume. Here is my information:\\n\\n{selection}',
+                   'explore'::prompt_mode, 1, false, now()
+            FROM "user" u
+            ON CONFLICT (user_id, default_prompt_slug) DO NOTHING
+            """
+        )
+    )
+
+
+def downgrade() -> None:
+    op.execute("DELETE FROM prompts WHERE default_prompt_slug = 'build-resume'")
diff --git a/surfsense_backend/alembic/versions/129_obsidian_plugin_vault_identity.py b/surfsense_backend/alembic/versions/129_obsidian_plugin_vault_identity.py
new file mode 100644
index 0000000000..0c0e3dbe56
--- /dev/null
+++ b/surfsense_backend/alembic/versions/129_obsidian_plugin_vault_identity.py
@@ -0,0 +1,106 @@
+"""129_obsidian_plugin_vault_identity
+
+Revision ID: 129
+Revises: 128
+Create Date: 2026-04-21
+
+Locks down vault identity for the Obsidian plugin connector:
+
+- Deactivates pre-plugin OBSIDIAN_CONNECTOR rows.
+- Partial unique index on ``(user_id, (config->>'vault_id'))`` for the
+  ``/obsidian/connect`` upsert fast path.
+- Partial unique index on ``(user_id, (config->>'vault_fingerprint'))``
+  so two devices observing the same vault content can never produce
+  two connector rows. Collisions are caught by the route handler and
+  routed through the merge path.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "129"
+down_revision: str | None = "128"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+
+    conn.execute(
+        sa.text(
+            """
+            UPDATE search_source_connectors
+            SET
+                is_indexable = false,
+                periodic_indexing_enabled = false,
+                next_scheduled_at = NULL,
+                config = COALESCE(config, '{}'::json)::jsonb
+                         || jsonb_build_object(
+                             'legacy', true,
+                             'deactivated_at', to_char(
+                                 now() AT TIME ZONE 'UTC',
+                                 'YYYY-MM-DD"T"HH24:MI:SS"Z"'
+                             )
+                         )
+            WHERE connector_type = 'OBSIDIAN_CONNECTOR'
+              AND COALESCE((config::jsonb)->>'source', '') <> 'plugin'
+            """
+        )
+    )
+
+    conn.execute(
+        sa.text(
+            """
+            CREATE UNIQUE INDEX IF NOT EXISTS
+                search_source_connectors_obsidian_plugin_vault_uniq
+            ON search_source_connectors (user_id, ((config->>'vault_id')))
+            WHERE connector_type = 'OBSIDIAN_CONNECTOR'
+              AND config->>'source' = 'plugin'
+              AND config->>'vault_id' IS NOT NULL
+            """
+        )
+    )
+
+    conn.execute(
+        sa.text(
+            """
+            CREATE UNIQUE INDEX IF NOT EXISTS
+                search_source_connectors_obsidian_plugin_fingerprint_uniq
+            ON search_source_connectors (user_id, ((config->>'vault_fingerprint')))
+            WHERE connector_type = 'OBSIDIAN_CONNECTOR'
+              AND config->>'source' = 'plugin'
+              AND config->>'vault_fingerprint' IS NOT NULL
+            """
+        )
+    )
+
+
+def downgrade() -> None:
+    conn = op.get_bind()
+    conn.execute(
+        sa.text(
+            "DROP INDEX IF EXISTS "
+            "search_source_connectors_obsidian_plugin_fingerprint_uniq"
+        )
+    )
+    conn.execute(
+        sa.text(
+            "DROP INDEX IF EXISTS search_source_connectors_obsidian_plugin_vault_uniq"
+        )
+    )
+    conn.execute(
+        sa.text(
+            """
+            UPDATE search_source_connectors
+            SET config = (config::jsonb - 'legacy' - 'deactivated_at')::json
+            WHERE connector_type = 'OBSIDIAN_CONNECTOR'
+              AND (config::jsonb) ? 'legacy'
+            """
+        )
+    )
diff --git a/surfsense_backend/alembic/versions/130_add_agent_action_log.py b/surfsense_backend/alembic/versions/130_add_agent_action_log.py
new file mode 100644
index 0000000000..5978848d07
--- /dev/null
+++ b/surfsense_backend/alembic/versions/130_add_agent_action_log.py
@@ -0,0 +1,98 @@
+"""130_add_agent_action_log
+
+Revision ID: 130
+Revises: 129
+Create Date: 2026-04-28
+
+Adds the append-only ``agent_action_log`` table that
+:class:`ActionLogMiddleware` writes to after every tool call. Each row
+optionally carries a ``reverse_descriptor`` payload used by
+``POST /api/threads/{thread_id}/revert/{action_id}`` to undo the action.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from alembic import op
+
+revision: str = "130"
+down_revision: str | None = "129"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+    if sa.inspect(bind).has_table("agent_action_log"):
+        return
+
+    op.create_table(
+        "agent_action_log",
+        sa.Column("id", sa.Integer(), primary_key=True, index=True),
+        sa.Column(
+            "thread_id",
+            sa.Integer(),
+            sa.ForeignKey("new_chat_threads.id", ondelete="CASCADE"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column(
+            "user_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey("user.id", ondelete="SET NULL"),
+            nullable=True,
+            index=True,
+        ),
+        sa.Column(
+            "search_space_id",
+            sa.Integer(),
+            sa.ForeignKey("searchspaces.id", ondelete="CASCADE"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column("turn_id", sa.String(length=64), nullable=True, index=True),
+        sa.Column("message_id", sa.String(length=128), nullable=True, index=True),
+        sa.Column("tool_name", sa.String(length=255), nullable=False, index=True),
+        sa.Column("args", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column("result_id", sa.String(length=255), nullable=True),
+        sa.Column(
+            "reversible",
+            sa.Boolean(),
+            nullable=False,
+            server_default=sa.text("false"),
+        ),
+        sa.Column(
+            "reverse_descriptor",
+            postgresql.JSONB(astext_type=sa.Text()),
+            nullable=True,
+        ),
+        sa.Column("error", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+        sa.Column(
+            "reverse_of",
+            sa.Integer(),
+            sa.ForeignKey("agent_action_log.id", ondelete="SET NULL"),
+            nullable=True,
+            index=True,
+        ),
+        sa.Column(
+            "created_at",
+            sa.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("(now() AT TIME ZONE 'utc')"),
+            index=True,
+        ),
+    )
+    op.create_index(
+        "ix_agent_action_log_thread_created",
+        "agent_action_log",
+        ["thread_id", "created_at"],
+    )
+
+
+def downgrade() -> None:
+    op.drop_index("ix_agent_action_log_thread_created", table_name="agent_action_log")
+    op.drop_table("agent_action_log")
diff --git a/surfsense_backend/alembic/versions/131_add_document_revisions.py b/surfsense_backend/alembic/versions/131_add_document_revisions.py
new file mode 100644
index 0000000000..c1e9b6068b
--- /dev/null
+++ b/surfsense_backend/alembic/versions/131_add_document_revisions.py
@@ -0,0 +1,136 @@
+"""131_add_document_revisions
+
+Revision ID: 131
+Revises: 130
+Create Date: 2026-04-28
+
+Adds two snapshot tables that back the per-action revert flow:
+
+* ``document_revisions``: pre-mutation snapshot of NOTE/FILE/EXTENSION docs.
+* ``folder_revisions``: pre-mutation snapshot of folder mkdir/move/delete.
+
+Both are written by :class:`KnowledgeBasePersistenceMiddleware` ahead of
+state-changing tool calls and consumed by ``revert_service.revert_action``.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from alembic import op
+
+revision: str = "131"
+down_revision: str | None = "130"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+
+    if inspector.has_table("document_revisions") and inspector.has_table(
+        "folder_revisions"
+    ):
+        return
+
+    if not inspector.has_table("document_revisions"):
+        _create_document_revisions()
+    if not inspector.has_table("folder_revisions"):
+        _create_folder_revisions()
+
+
+def _create_document_revisions() -> None:
+    op.create_table(
+        "document_revisions",
+        sa.Column("id", sa.Integer(), primary_key=True, index=True),
+        sa.Column(
+            "document_id",
+            sa.Integer(),
+            sa.ForeignKey("documents.id", ondelete="CASCADE"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column(
+            "search_space_id",
+            sa.Integer(),
+            sa.ForeignKey("searchspaces.id", ondelete="CASCADE"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column("content_before", sa.Text(), nullable=True),
+        sa.Column("title_before", sa.String(), nullable=True),
+        sa.Column("folder_id_before", sa.Integer(), nullable=True),
+        sa.Column(
+            "chunks_before", postgresql.JSONB(astext_type=sa.Text()), nullable=True
+        ),
+        sa.Column(
+            "metadata_before", postgresql.JSONB(astext_type=sa.Text()), nullable=True
+        ),
+        sa.Column(
+            "created_by_turn_id", sa.String(length=64), nullable=True, index=True
+        ),
+        sa.Column(
+            "agent_action_id",
+            sa.Integer(),
+            sa.ForeignKey("agent_action_log.id", ondelete="SET NULL"),
+            nullable=True,
+            index=True,
+        ),
+        sa.Column(
+            "created_at",
+            sa.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("(now() AT TIME ZONE 'utc')"),
+            index=True,
+        ),
+    )
+
+
+def _create_folder_revisions() -> None:
+    op.create_table(
+        "folder_revisions",
+        sa.Column("id", sa.Integer(), primary_key=True, index=True),
+        sa.Column(
+            "folder_id",
+            sa.Integer(),
+            sa.ForeignKey("folders.id", ondelete="CASCADE"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column(
+            "search_space_id",
+            sa.Integer(),
+            sa.ForeignKey("searchspaces.id", ondelete="CASCADE"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column("name_before", sa.String(length=255), nullable=True),
+        sa.Column("parent_id_before", sa.Integer(), nullable=True),
+        sa.Column("position_before", sa.String(length=50), nullable=True),
+        sa.Column(
+            "created_by_turn_id", sa.String(length=64), nullable=True, index=True
+        ),
+        sa.Column(
+            "agent_action_id",
+            sa.Integer(),
+            sa.ForeignKey("agent_action_log.id", ondelete="SET NULL"),
+            nullable=True,
+            index=True,
+        ),
+        sa.Column(
+            "created_at",
+            sa.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("(now() AT TIME ZONE 'utc')"),
+            index=True,
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_table("folder_revisions")
+    op.drop_table("document_revisions")
diff --git a/surfsense_backend/alembic/versions/132_add_agent_permission_rules.py b/surfsense_backend/alembic/versions/132_add_agent_permission_rules.py
new file mode 100644
index 0000000000..1ee3cd2f01
--- /dev/null
+++ b/surfsense_backend/alembic/versions/132_add_agent_permission_rules.py
@@ -0,0 +1,85 @@
+"""132_add_agent_permission_rules
+
+Revision ID: 132
+Revises: 131
+Create Date: 2026-04-28
+
+Adds the persistent ``agent_permission_rules`` table consumed by
+:class:`PermissionMiddleware` at agent build time. Rules can be scoped
+at search-space (``user_id`` / ``thread_id`` NULL), user-wide
+(``user_id`` set, ``thread_id`` NULL), or per-thread (``thread_id`` set).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from alembic import op
+
+revision: str = "132"
+down_revision: str | None = "131"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+    if sa.inspect(bind).has_table("agent_permission_rules"):
+        return
+
+    op.create_table(
+        "agent_permission_rules",
+        sa.Column("id", sa.Integer(), primary_key=True, index=True),
+        sa.Column(
+            "search_space_id",
+            sa.Integer(),
+            sa.ForeignKey("searchspaces.id", ondelete="CASCADE"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column(
+            "user_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey("user.id", ondelete="CASCADE"),
+            nullable=True,
+            index=True,
+        ),
+        sa.Column(
+            "thread_id",
+            sa.Integer(),
+            sa.ForeignKey("new_chat_threads.id", ondelete="CASCADE"),
+            nullable=True,
+            index=True,
+        ),
+        sa.Column("permission", sa.String(length=255), nullable=False),
+        sa.Column(
+            "pattern",
+            sa.String(length=255),
+            nullable=False,
+            server_default="*",
+        ),
+        sa.Column("action", sa.String(length=16), nullable=False),
+        sa.Column(
+            "created_at",
+            sa.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("(now() AT TIME ZONE 'utc')"),
+            index=True,
+        ),
+        sa.UniqueConstraint(
+            "search_space_id",
+            "user_id",
+            "thread_id",
+            "permission",
+            "pattern",
+            "action",
+            name="uq_agent_permission_rules_scope",
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_table("agent_permission_rules")
diff --git a/surfsense_backend/alembic/versions/133_drop_documents_content_hash_unique.py b/surfsense_backend/alembic/versions/133_drop_documents_content_hash_unique.py
new file mode 100644
index 0000000000..eec53ecb64
--- /dev/null
+++ b/surfsense_backend/alembic/versions/133_drop_documents_content_hash_unique.py
@@ -0,0 +1,105 @@
+"""133_drop_documents_content_hash_unique
+
+Revision ID: 133
+Revises: 132
+Create Date: 2026-04-29
+
+Drop the global UNIQUE constraint on ``documents.content_hash`` so the
+new-chat agent's ``write_file`` flow can persist legitimate file copies
+(two paths, identical content) without hitting a constraint that mirrors
+no real filesystem semantic.
+
+Path uniqueness still lives on ``documents.unique_identifier_hash`` (per
+search space), which is the right invariant — exactly like an inode at a
+given path on a POSIX filesystem.
+
+The non-unique INDEX on ``content_hash`` is preserved so connector
+indexers' "have we seen this content before?" lookup
+(:func:`app.tasks.document_processors.base.check_duplicate_document`,
+which already uses ``.scalars().first()`` and is therefore tolerant of
+duplicates) stays cheap.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from sqlalchemy import inspect
+
+from alembic import op
+
+revision: str = "133"
+down_revision: str | None = "132"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def _existing_constraint_names(bind, table: str) -> set[str]:
+    inspector = inspect(bind)
+    return {c["name"] for c in inspector.get_unique_constraints(table)}
+
+
+def _existing_index_names(bind, table: str) -> set[str]:
+    inspector = inspect(bind)
+    return {i["name"] for i in inspector.get_indexes(table)}
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+
+    # Both the named UniqueConstraint (added in revision 8) and the
+    # implicit-unique-index variant SQLAlchemy may emit need draining.
+    constraints = _existing_constraint_names(bind, "documents")
+    if "uq_documents_content_hash" in constraints:
+        op.drop_constraint("uq_documents_content_hash", "documents", type_="unique")
+
+    indexes = _existing_index_names(bind, "documents")
+    # Some Postgres versions surface the unique constraint via a unique
+    # index of the same name; check for that too.
+    for idx_name in ("uq_documents_content_hash",):
+        if idx_name in indexes:
+            op.drop_index(idx_name, table_name="documents")
+
+    # Ensure the non-unique index is present for fast lookups.
+    if "ix_documents_content_hash" not in indexes:
+        op.create_index(
+            "ix_documents_content_hash",
+            "documents",
+            ["content_hash"],
+            unique=False,
+        )
+
+
+def downgrade() -> None:
+    bind = op.get_bind()
+
+    # Re-applying UNIQUE is destructive: there may now be legitimate
+    # duplicates (e.g. two NOTE documents that share content because the
+    # user explicitly copied one to a new path). To avoid the migration
+    # silently deleting user data, we keep only the lowest-id row per
+    # content_hash — same strategy revision 8 used when first introducing
+    # the constraint.
+    op.execute(
+        """
+        DELETE FROM documents
+        WHERE id NOT IN (
+            SELECT MIN(id)
+            FROM documents
+            GROUP BY content_hash
+        )
+        """
+    )
+
+    indexes = _existing_index_names(bind, "documents")
+    if "ix_documents_content_hash" in indexes:
+        op.drop_index("ix_documents_content_hash", table_name="documents")
+
+    op.create_index(
+        "ix_documents_content_hash",
+        "documents",
+        ["content_hash"],
+        unique=False,
+    )
+    op.create_unique_constraint(
+        "uq_documents_content_hash", "documents", ["content_hash"]
+    )
diff --git a/surfsense_backend/alembic/versions/134_relax_revision_fks.py b/surfsense_backend/alembic/versions/134_relax_revision_fks.py
new file mode 100644
index 0000000000..99b665426a
--- /dev/null
+++ b/surfsense_backend/alembic/versions/134_relax_revision_fks.py
@@ -0,0 +1,139 @@
+"""134_relax_revision_fks
+
+Revision ID: 134
+Revises: 133
+Create Date: 2026-04-29
+
+Relax the parent FKs on ``document_revisions`` and ``folder_revisions`` so
+revisions survive the deletes they describe.
+
+Why: the snapshot/revert pipeline writes a ``DocumentRevision`` BEFORE
+hard-deleting a document via the ``rm`` tool (and likewise a
+``FolderRevision`` before ``rmdir``). If the FK is ``ON DELETE CASCADE``
+the snapshot row is wiped at the exact moment we need it most — revert
+then has nothing to read and the operation becomes irreversible.
+
+Migration:
+
+* ``document_revisions.document_id``: ``NOT NULL`` -> nullable; FK
+  ``ON DELETE CASCADE`` -> ``ON DELETE SET NULL``.
+* ``folder_revisions.folder_id``: same treatment.
+
+The ``search_space_id`` FK on both tables is left unchanged (still
+``ON DELETE CASCADE``). When a search space is deleted, all documents,
+folders, AND their revisions go together — that's the correct teardown
+story.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from sqlalchemy import inspect
+
+from alembic import op
+
+revision: str = "134"
+down_revision: str | None = "133"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def _fk_name(bind, table: str, column: str) -> str | None:
+    """Return the (single) FK constraint name on ``table.column``, if any."""
+    inspector = inspect(bind)
+    for fk in inspector.get_foreign_keys(table):
+        cols = fk.get("constrained_columns") or []
+        if cols == [column]:
+            return fk.get("name")
+    return None
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+
+    # --- document_revisions.document_id -> nullable + SET NULL ---------------
+    fk_name = _fk_name(bind, "document_revisions", "document_id")
+    if fk_name:
+        op.drop_constraint(fk_name, "document_revisions", type_="foreignkey")
+    op.alter_column(
+        "document_revisions",
+        "document_id",
+        existing_type=sa.Integer(),
+        nullable=True,
+    )
+    op.create_foreign_key(
+        "document_revisions_document_id_fkey",
+        "document_revisions",
+        "documents",
+        ["document_id"],
+        ["id"],
+        ondelete="SET NULL",
+    )
+
+    # --- folder_revisions.folder_id -> nullable + SET NULL -------------------
+    fk_name = _fk_name(bind, "folder_revisions", "folder_id")
+    if fk_name:
+        op.drop_constraint(fk_name, "folder_revisions", type_="foreignkey")
+    op.alter_column(
+        "folder_revisions",
+        "folder_id",
+        existing_type=sa.Integer(),
+        nullable=True,
+    )
+    op.create_foreign_key(
+        "folder_revisions_folder_id_fkey",
+        "folder_revisions",
+        "folders",
+        ["folder_id"],
+        ["id"],
+        ondelete="SET NULL",
+    )
+
+
+def downgrade() -> None:
+    bind = op.get_bind()
+
+    # Reinstating NOT NULL + CASCADE requires draining orphan rows first
+    # (any revision whose parent doc/folder has already been deleted).
+    op.execute("DELETE FROM document_revisions WHERE document_id IS NULL")
+    op.execute("DELETE FROM folder_revisions WHERE folder_id IS NULL")
+
+    # --- document_revisions.document_id -> NOT NULL + CASCADE ---------------
+    fk_name = _fk_name(bind, "document_revisions", "document_id")
+    if fk_name:
+        op.drop_constraint(fk_name, "document_revisions", type_="foreignkey")
+    op.alter_column(
+        "document_revisions",
+        "document_id",
+        existing_type=sa.Integer(),
+        nullable=False,
+    )
+    op.create_foreign_key(
+        "document_revisions_document_id_fkey",
+        "document_revisions",
+        "documents",
+        ["document_id"],
+        ["id"],
+        ondelete="CASCADE",
+    )
+
+    # --- folder_revisions.folder_id -> NOT NULL + CASCADE -------------------
+    fk_name = _fk_name(bind, "folder_revisions", "folder_id")
+    if fk_name:
+        op.drop_constraint(fk_name, "folder_revisions", type_="foreignkey")
+    op.alter_column(
+        "folder_revisions",
+        "folder_id",
+        existing_type=sa.Integer(),
+        nullable=False,
+    )
+    op.create_foreign_key(
+        "folder_revisions_folder_id_fkey",
+        "folder_revisions",
+        "folders",
+        ["folder_id"],
+        ["id"],
+        ondelete="CASCADE",
+    )
diff --git a/surfsense_backend/alembic/versions/135_action_log_correlation_ids.py b/surfsense_backend/alembic/versions/135_action_log_correlation_ids.py
new file mode 100644
index 0000000000..e40c4fb260
--- /dev/null
+++ b/surfsense_backend/alembic/versions/135_action_log_correlation_ids.py
@@ -0,0 +1,92 @@
+"""135_action_log_correlation_ids
+
+Revision ID: 135
+Revises: 134
+Create Date: 2026-04-29
+
+Action-log correlation-id cleanup.
+
+Background
+----------
+``agent_action_log.turn_id`` is misnamed. ``ActionLogMiddleware`` writes
+the LangChain ``tool_call.id`` into that column today (see
+``action_log.py:_resolve_turn_id``), and ``kb_persistence._find_action_ids_batch``
+joins on it as such. The real chat-turn id (``f"{chat_id}:{ms}"`` from
+``stream_new_chat.py``) lives in ``config.configurable.turn_id`` and was
+never persisted.
+
+This migration introduces two new, correctly-named columns:
+
+* ``tool_call_id`` (LangChain tool-call id, what ``turn_id`` actually held)
+* ``chat_turn_id`` (the per-turn correlation id from
+  ``configurable.turn_id`` — used by the per-turn ``revert-turn`` route).
+
+Backfill copies the current ``turn_id`` values into ``tool_call_id`` so
+existing joins keep working. The old ``turn_id`` column is left in place
+for one release as a deprecated alias to give safe rollback. ``ActionLogMiddleware``
+keeps writing it (= ``tool_call_id``) for the same reason.
+
+Indexes
+-------
+
+* ``ix_agent_action_log_tool_call_id`` — required by
+  ``_find_action_ids_batch`` (was on ``turn_id``).
+* ``ix_agent_action_log_chat_turn_id`` — required by the
+  ``revert-turn/{chat_turn_id}`` query.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "135"
+down_revision: str | None = "134"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+    columns = {c["name"] for c in inspector.get_columns("agent_action_log")}
+    indexes = {i["name"] for i in inspector.get_indexes("agent_action_log")}
+
+    if "tool_call_id" not in columns:
+        op.add_column(
+            "agent_action_log",
+            sa.Column("tool_call_id", sa.String(length=64), nullable=True),
+        )
+    if "chat_turn_id" not in columns:
+        op.add_column(
+            "agent_action_log",
+            sa.Column("chat_turn_id", sa.String(length=64), nullable=True),
+        )
+
+    if "ix_agent_action_log_tool_call_id" not in indexes:
+        op.create_index(
+            "ix_agent_action_log_tool_call_id",
+            "agent_action_log",
+            ["tool_call_id"],
+        )
+    if "ix_agent_action_log_chat_turn_id" not in indexes:
+        op.create_index(
+            "ix_agent_action_log_chat_turn_id",
+            "agent_action_log",
+            ["chat_turn_id"],
+        )
+
+    if "turn_id" in columns:
+        op.execute(
+            "UPDATE agent_action_log SET tool_call_id = turn_id WHERE tool_call_id IS NULL"
+        )
+
+
+def downgrade() -> None:
+    op.drop_index("ix_agent_action_log_chat_turn_id", table_name="agent_action_log")
+    op.drop_index("ix_agent_action_log_tool_call_id", table_name="agent_action_log")
+    op.drop_column("agent_action_log", "chat_turn_id")
+    op.drop_column("agent_action_log", "tool_call_id")
diff --git a/surfsense_backend/alembic/versions/136_new_chat_message_turn_id.py b/surfsense_backend/alembic/versions/136_new_chat_message_turn_id.py
new file mode 100644
index 0000000000..ee02e453ca
--- /dev/null
+++ b/surfsense_backend/alembic/versions/136_new_chat_message_turn_id.py
@@ -0,0 +1,59 @@
+"""136_new_chat_message_turn_id
+
+Revision ID: 136
+Revises: 135
+Create Date: 2026-04-29
+
+Persist the per-turn correlation id on each chat message.
+
+Background
+----------
+LangGraph's checkpointer stores user-provided ``configurable.turn_id``
+in checkpoint metadata (see
+``langgraph/checkpoint/base/__init__.py:get_checkpoint_metadata``). To
+support edit-from-arbitrary-position, the regenerate route needs to map
+a ``message_id`` -> ``turn_id`` -> checkpoint at request time. Without
+this column the mapping doesn't exist anywhere, so regenerate would
+have to hardcode the "last 2 messages" rewind heuristic.
+
+This migration adds a nullable ``turn_id`` column to ``new_chat_messages``
+plus an index. Legacy rows have NULL — the regenerate route degrades
+gracefully to the reload-last-two heuristic for those.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "136"
+down_revision: str | None = "135"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+    columns = {c["name"] for c in inspector.get_columns("new_chat_messages")}
+    indexes = {i["name"] for i in inspector.get_indexes("new_chat_messages")}
+
+    if "turn_id" not in columns:
+        op.add_column(
+            "new_chat_messages",
+            sa.Column("turn_id", sa.String(length=64), nullable=True),
+        )
+    if "ix_new_chat_messages_turn_id" not in indexes:
+        op.create_index(
+            "ix_new_chat_messages_turn_id",
+            "new_chat_messages",
+            ["turn_id"],
+        )
+
+
+def downgrade() -> None:
+    op.drop_index("ix_new_chat_messages_turn_id", table_name="new_chat_messages")
+    op.drop_column("new_chat_messages", "turn_id")
diff --git a/surfsense_backend/alembic/versions/137_unique_reverse_of_in_action_log.py b/surfsense_backend/alembic/versions/137_unique_reverse_of_in_action_log.py
new file mode 100644
index 0000000000..47421e7125
--- /dev/null
+++ b/surfsense_backend/alembic/versions/137_unique_reverse_of_in_action_log.py
@@ -0,0 +1,81 @@
+"""137_unique_reverse_of_in_action_log
+
+Revision ID: 137
+Revises: 136
+Create Date: 2026-04-29
+
+Protect ``agent_action_log.reverse_of`` against double inserts. Two
+concurrent revert calls (single-action route + the per-turn batch
+route, or two batch routes racing) both pass the
+``_was_already_reverted`` SELECT and both insert their own
+``_revert:*`` rows. The application-level idempotency check is racy
+because there's no DB constraint backing it.
+
+This migration adds a partial unique index on ``reverse_of`` (PostgreSQL
+``WHERE reverse_of IS NOT NULL``) so the second concurrent insert raises
+``IntegrityError`` and the route can translate it to ``"already_reverted"``
+deterministically.
+
+The plain ``UniqueConstraint`` flavour can't be used because most
+existing rows have ``reverse_of = NULL`` (only revert rows fill it),
+and Postgres does treat NULL as distinct in unique indexes — but a
+partial index is the cleanest expression of intent and works even on
+older Postgres releases that distinguish NULL handling.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "137"
+down_revision: str | None = "136"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+_INDEX_NAME = "ux_agent_action_log_reverse_of"
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+    indexes = {i["name"] for i in sa.inspect(bind).get_indexes("agent_action_log")}
+    if _INDEX_NAME in indexes:
+        return
+
+    # Defensively de-dup any pre-existing double-revert rows before
+    # adding the unique index. Keeps the OLDEST row (smallest id) and
+    # NULLs out the duplicates' ``reverse_of`` so they survive as audit
+    # trail but no longer claim to be the canonical revert. We do NOT
+    # delete them — operators can still inspect them via /actions.
+    op.execute(
+        """
+        WITH dups AS (
+            SELECT id,
+                   reverse_of,
+                   ROW_NUMBER() OVER (
+                       PARTITION BY reverse_of ORDER BY id ASC
+                   ) AS rn
+              FROM agent_action_log
+             WHERE reverse_of IS NOT NULL
+        )
+        UPDATE agent_action_log
+           SET reverse_of = NULL
+         WHERE id IN (SELECT id FROM dups WHERE rn > 1)
+        """
+    )
+
+    op.create_index(
+        _INDEX_NAME,
+        "agent_action_log",
+        ["reverse_of"],
+        unique=True,
+        postgresql_where="reverse_of IS NOT NULL",
+    )
+
+
+def downgrade() -> None:
+    op.drop_index(_INDEX_NAME, table_name="agent_action_log")
diff --git a/surfsense_backend/alembic/versions/138_add_thread_auto_model_pinning_fields.py b/surfsense_backend/alembic/versions/138_add_thread_auto_model_pinning_fields.py
new file mode 100644
index 0000000000..fba621a0cc
--- /dev/null
+++ b/surfsense_backend/alembic/versions/138_add_thread_auto_model_pinning_fields.py
@@ -0,0 +1,44 @@
+"""138_add_thread_auto_model_pinning_fields
+
+Revision ID: 138
+Revises: 137
+Create Date: 2026-04-30
+
+Add a single thread-level column to persist the Auto (Fastest) model pin:
+- pinned_llm_config_id: concrete resolved global LLM config id used for this
+  thread. NULL means "no pin; Auto will resolve on next turn".
+
+The column is unindexed: all reads are by new_chat_threads.id (primary key),
+so a secondary index would be dead write amplification.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "138"
+down_revision: str | None = "137"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    op.execute(
+        "ALTER TABLE new_chat_threads "
+        "ADD COLUMN IF NOT EXISTS pinned_llm_config_id INTEGER"
+    )
+
+
+def downgrade() -> None:
+    # Drop any shape the thread row may be carrying. The extra columns and
+    # indexes only exist on dev DBs that ran an earlier draft of 138; IF EXISTS
+    # makes each statement a safe no-op on the lean shape.
+    op.execute("DROP INDEX IF EXISTS ix_new_chat_threads_pinned_auto_mode")
+    op.execute("DROP INDEX IF EXISTS ix_new_chat_threads_pinned_llm_config_id")
+    op.execute("ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS pinned_at")
+    op.execute("ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS pinned_auto_mode")
+    op.execute(
+        "ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS pinned_llm_config_id"
+    )
diff --git a/surfsense_backend/alembic/versions/139_add_user_to_zero_publication.py b/surfsense_backend/alembic/versions/139_add_user_to_zero_publication.py
new file mode 100644
index 0000000000..646049e3c2
--- /dev/null
+++ b/surfsense_backend/alembic/versions/139_add_user_to_zero_publication.py
@@ -0,0 +1,170 @@
+"""add user table to zero_publication with column list
+
+Adds the "user" table to zero_publication with a column-list publication
+so that only the 5 fields driving the live usage meters are replicated
+through WAL -> zero-cache -> browser IndexedDB:
+
+    id, pages_limit, pages_used,
+    premium_tokens_limit, premium_tokens_used
+
+Sensitive columns (hashed_password, email, oauth_account, display_name,
+avatar_url, memory_md, refresh_tokens, last_login, etc.) are NOT
+included in the publication, so they never enter WAL replication.
+
+Also re-asserts REPLICA IDENTITY DEFAULT on "user" for idempotency
+(it is already DEFAULT today since "user" was never in the
+TABLES_WITH_FULL_IDENTITY list of migration 117).
+
+IMPORTANT - before AND after running this migration:
+  1. Stop zero-cache  (it holds replication locks that will deadlock DDL)
+  2. Run:  alembic upgrade head
+  3. Delete / reset the zero-cache data volume
+  4. Restart zero-cache  (it will do a fresh initial sync)
+
+DO NOT COPY THIS PATTERN. The ``DROP PUBLICATION`` + ``CREATE
+PUBLICATION`` dance below is the pre-#1355 anti-pattern: on Zero >=
+1.0 it does not reliably wake the zero-cache change-streamer and can
+leave the replica pinned to a stale snapshot. This file is
+grandfathered in because it has already shipped to users; new
+publication mutations MUST use the ``COMMENT ON PUBLICATION`` bookend
+pattern wrapping an ``ALTER PUBLICATION ... SET TABLE`` -- copy the
+``upgrade()`` function from migration
+``143_force_zero_publication_resync.py`` as your starting template.
+
+Revision ID: 139
+Revises: 138
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "139"
+down_revision: str | None = "138"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+
+# Document column list as left by migration 117. Must match exactly.
+DOCUMENT_COLS = [
+    "id",
+    "title",
+    "document_type",
+    "search_space_id",
+    "folder_id",
+    "created_by_id",
+    "status",
+    "created_at",
+    "updated_at",
+]
+
+# Five fields needed by the live usage meters (sidebar Tokens/Pages,
+# Buy Tokens content). Keep this list narrow on purpose: anything added
+# here flows into WAL and IndexedDB for every connected browser.
+USER_COLS = [
+    "id",
+    "pages_limit",
+    "pages_used",
+    "premium_tokens_limit",
+    "premium_tokens_used",
+]
+
+
+def _terminate_blocked_pids(conn, table: str) -> None:
+    """Kill backends whose locks on *table* would block our AccessExclusiveLock."""
+    conn.execute(
+        sa.text(
+            "SELECT pg_terminate_backend(l.pid) "
+            "FROM pg_locks l "
+            "JOIN pg_class c ON c.oid = l.relation "
+            "WHERE c.relname = :tbl "
+            "  AND l.pid != pg_backend_pid()"
+        ),
+        {"tbl": table},
+    )
+
+
+def _has_zero_version(conn, table: str) -> bool:
+    return (
+        conn.execute(
+            sa.text(
+                "SELECT 1 FROM information_schema.columns "
+                "WHERE table_name = :tbl AND column_name = '_0_version'"
+            ),
+            {"tbl": table},
+        ).fetchone()
+        is not None
+    )
+
+
+def _build_publication_ddl(
+    documents_has_zero_ver: bool, user_has_zero_ver: bool
+) -> str:
+    doc_cols = DOCUMENT_COLS + (['"_0_version"'] if documents_has_zero_ver else [])
+    user_cols = USER_COLS + (['"_0_version"'] if user_has_zero_ver else [])
+    doc_col_list = ", ".join(doc_cols)
+    user_col_list = ", ".join(user_cols)
+    return (
+        f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
+        f"notifications, "
+        f"documents ({doc_col_list}), "
+        f"folders, "
+        f"search_source_connectors, "
+        f"new_chat_messages, "
+        f"chat_comments, "
+        f"chat_session_state, "
+        f'"user" ({user_col_list})'
+    )
+
+
+def _build_publication_ddl_without_user(documents_has_zero_ver: bool) -> str:
+    doc_cols = DOCUMENT_COLS + (['"_0_version"'] if documents_has_zero_ver else [])
+    doc_col_list = ", ".join(doc_cols)
+    return (
+        f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
+        f"notifications, "
+        f"documents ({doc_col_list}), "
+        f"folders, "
+        f"search_source_connectors, "
+        f"new_chat_messages, "
+        f"chat_comments, "
+        f"chat_session_state"
+    )
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    # asyncpg requires LOCK TABLE inside a transaction block. Alembic already
+    # opened one via context.begin_transaction(), but the driver still errors
+    # unless we use an explicit SAVEPOINT (nested transaction) for this block.
+    tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
+    with tx:
+        conn.execute(sa.text("SET lock_timeout = '10s'"))
+
+        _terminate_blocked_pids(conn, "user")
+        conn.execute(sa.text('LOCK TABLE "user" IN ACCESS EXCLUSIVE MODE'))
+
+        # Idempotent: "user" was never in TABLES_WITH_FULL_IDENTITY of
+        # migration 117, so this is already DEFAULT. Re-assert anyway so
+        # the column-list publication stays valid (DEFAULT identity only
+        # requires the PK to be in the column list).
+        conn.execute(sa.text('ALTER TABLE "user" REPLICA IDENTITY DEFAULT'))
+
+        conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+
+        documents_has_zero_ver = _has_zero_version(conn, "documents")
+        user_has_zero_ver = _has_zero_version(conn, "user")
+
+        conn.execute(
+            sa.text(_build_publication_ddl(documents_has_zero_ver, user_has_zero_ver))
+        )
+
+
+def downgrade() -> None:
+    conn = op.get_bind()
+    conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+    documents_has_zero_ver = _has_zero_version(conn, "documents")
+    conn.execute(sa.text(_build_publication_ddl_without_user(documents_has_zero_ver)))
diff --git a/surfsense_backend/alembic/versions/140_premium_tokens_to_credit_micros.py b/surfsense_backend/alembic/versions/140_premium_tokens_to_credit_micros.py
new file mode 100644
index 0000000000..ff88ac34eb
--- /dev/null
+++ b/surfsense_backend/alembic/versions/140_premium_tokens_to_credit_micros.py
@@ -0,0 +1,301 @@
+"""rename premium token columns to credit micros and add cost_micros to token_usage
+
+Migrates the premium quota system from a flat token counter to a USD-cost
+based credit system, where 1 credit = 1 micro-USD ($0.000001).
+
+Column renames (1:1 numerical mapping — the prior $1 per 1M tokens Stripe
+price means every existing value is already correct in the new unit, no
+data transformation needed):
+
+    user.premium_tokens_limit       -> premium_credit_micros_limit
+    user.premium_tokens_used        -> premium_credit_micros_used
+    user.premium_tokens_reserved    -> premium_credit_micros_reserved
+
+    premium_token_purchases.tokens_granted -> credit_micros_granted
+
+New column for cost auditing per turn:
+
+    token_usage.cost_micros (BigInteger NOT NULL DEFAULT 0)
+
+The "user" table is in zero_publication's column list (added in 139), so
+this migration must drop and recreate the publication with the renamed
+column names, otherwise zero-cache will replicate stale column names and
+the FE Zero schema will fail to bind.
+
+IMPORTANT - before AND after running this migration:
+  1. Stop zero-cache  (it holds replication locks that will deadlock DDL)
+  2. Run:  alembic upgrade head
+  3. Delete / reset the zero-cache data volume
+  4. Restart zero-cache  (it will do a fresh initial sync)
+
+Skipping the zero-cache stop will deadlock at the ACCESS EXCLUSIVE LOCK on
+"user". Skipping the data-volume reset will leave IndexedDB clients seeing
+column-not-found errors from a stale catalog snapshot.
+
+DO NOT COPY THIS PATTERN. The ``DROP PUBLICATION`` + ``CREATE
+PUBLICATION`` dance below is the pre-#1355 anti-pattern: on Zero >=
+1.0 it does not reliably wake the zero-cache change-streamer and can
+leave the replica pinned to a stale snapshot. This file is
+grandfathered in because it has already shipped to users; new
+publication mutations MUST use the ``COMMENT ON PUBLICATION`` bookend
+pattern wrapping an ``ALTER PUBLICATION ... SET TABLE`` -- copy the
+``upgrade()`` function from migration
+``143_force_zero_publication_resync.py`` as your starting template.
+
+Revision ID: 140
+Revises: 139
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "140"
+down_revision: str | None = "139"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+
+# Replicates 139's document column list verbatim — must stay in sync.
+DOCUMENT_COLS = [
+    "id",
+    "title",
+    "document_type",
+    "search_space_id",
+    "folder_id",
+    "created_by_id",
+    "status",
+    "created_at",
+    "updated_at",
+]
+
+# Same five live-meter fields as 139, with the renamed column names.
+USER_COLS = [
+    "id",
+    "pages_limit",
+    "pages_used",
+    "premium_credit_micros_limit",
+    "premium_credit_micros_used",
+]
+
+
+def _terminate_blocked_pids(conn, table: str) -> None:
+    """Kill backends whose locks on *table* would block our AccessExclusiveLock."""
+    conn.execute(
+        sa.text(
+            "SELECT pg_terminate_backend(l.pid) "
+            "FROM pg_locks l "
+            "JOIN pg_class c ON c.oid = l.relation "
+            "WHERE c.relname = :tbl "
+            "  AND l.pid != pg_backend_pid()"
+        ),
+        {"tbl": table},
+    )
+
+
+def _has_zero_version(conn, table: str) -> bool:
+    return (
+        conn.execute(
+            sa.text(
+                "SELECT 1 FROM information_schema.columns "
+                "WHERE table_name = :tbl AND column_name = '_0_version'"
+            ),
+            {"tbl": table},
+        ).fetchone()
+        is not None
+    )
+
+
+def _column_exists(conn, table: str, column: str) -> bool:
+    return (
+        conn.execute(
+            sa.text(
+                "SELECT 1 FROM information_schema.columns "
+                "WHERE table_name = :tbl AND column_name = :col"
+            ),
+            {"tbl": table, "col": column},
+        ).fetchone()
+        is not None
+    )
+
+
+def _build_publication_ddl(
+    user_cols: list[str],
+    *,
+    documents_has_zero_ver: bool,
+    user_has_zero_ver: bool,
+) -> str:
+    doc_cols = DOCUMENT_COLS + (['"_0_version"'] if documents_has_zero_ver else [])
+    user_col_list_with_meta = user_cols + (
+        ['"_0_version"'] if user_has_zero_ver else []
+    )
+    doc_col_list = ", ".join(doc_cols)
+    user_col_list = ", ".join(user_col_list_with_meta)
+    return (
+        f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
+        f"notifications, "
+        f"documents ({doc_col_list}), "
+        f"folders, "
+        f"search_source_connectors, "
+        f"new_chat_messages, "
+        f"chat_comments, "
+        f"chat_session_state, "
+        f'"user" ({user_col_list})'
+    )
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+
+    # ------------------------------------------------------------------
+    # 1. Add cost_micros to token_usage. Idempotent guard so re-runs in
+    #    dev environments are safe.
+    # ------------------------------------------------------------------
+    if not _column_exists(conn, "token_usage", "cost_micros"):
+        op.add_column(
+            "token_usage",
+            sa.Column(
+                "cost_micros",
+                sa.BigInteger(),
+                nullable=False,
+                server_default="0",
+            ),
+        )
+
+    # ------------------------------------------------------------------
+    # 2. Rename premium_token_purchases.tokens_granted -> credit_micros_granted.
+    # ------------------------------------------------------------------
+    if _column_exists(
+        conn, "premium_token_purchases", "tokens_granted"
+    ) and not _column_exists(conn, "premium_token_purchases", "credit_micros_granted"):
+        op.alter_column(
+            "premium_token_purchases",
+            "tokens_granted",
+            new_column_name="credit_micros_granted",
+        )
+
+    # ------------------------------------------------------------------
+    # 3. Rename user.premium_tokens_* -> premium_credit_micros_*.
+    #
+    # We must drop the publication first (it references the old column
+    # names) and re-acquire the lock for DDL. asyncpg requires LOCK TABLE
+    # in a transaction block; alembic's outer transaction already holds
+    # one, but a SAVEPOINT keeps the LOCK + DDL atomic.
+    # ------------------------------------------------------------------
+    tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
+    with tx:
+        conn.execute(sa.text("SET lock_timeout = '10s'"))
+
+        _terminate_blocked_pids(conn, "user")
+        conn.execute(sa.text('LOCK TABLE "user" IN ACCESS EXCLUSIVE MODE'))
+
+        # Re-assert REPLICA IDENTITY DEFAULT for safety; column-list
+        # publications require at least the PK to be in the column list,
+        # which is true for both the old and new shape.
+        conn.execute(sa.text('ALTER TABLE "user" REPLICA IDENTITY DEFAULT'))
+
+        # Drop the publication BEFORE renaming columns, otherwise Postgres
+        # rejects the rename: "cannot drop column ... referenced by
+        # publication".
+        conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+
+        for old, new in (
+            ("premium_tokens_limit", "premium_credit_micros_limit"),
+            ("premium_tokens_used", "premium_credit_micros_used"),
+            ("premium_tokens_reserved", "premium_credit_micros_reserved"),
+        ):
+            if _column_exists(conn, "user", old) and not _column_exists(
+                conn, "user", new
+            ):
+                op.alter_column("user", old, new_column_name=new)
+
+        # Update the server_default on the renamed limit column so newly
+        # inserted users get $5 of credit (== 5_000_000 micros) by
+        # default. Existing rows are unaffected.
+        op.alter_column(
+            "user",
+            "premium_credit_micros_limit",
+            server_default="5000000",
+        )
+
+        # Recreate the publication with the new column names.
+        documents_has_zero_ver = _has_zero_version(conn, "documents")
+        user_has_zero_ver = _has_zero_version(conn, "user")
+        conn.execute(
+            sa.text(
+                _build_publication_ddl(
+                    USER_COLS,
+                    documents_has_zero_ver=documents_has_zero_ver,
+                    user_has_zero_ver=user_has_zero_ver,
+                )
+            )
+        )
+
+
+def downgrade() -> None:
+    """Revert the rename and drop ``cost_micros``.
+
+    Mirrors ``upgrade``: drop the publication, rename columns back, drop
+    the new column, recreate the publication with the old column list.
+    Same zero-cache stop/reset runbook applies in reverse.
+    """
+    conn = op.get_bind()
+
+    tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
+    with tx:
+        conn.execute(sa.text("SET lock_timeout = '10s'"))
+
+        _terminate_blocked_pids(conn, "user")
+        conn.execute(sa.text('LOCK TABLE "user" IN ACCESS EXCLUSIVE MODE'))
+
+        conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+
+        for new, old in (
+            ("premium_credit_micros_limit", "premium_tokens_limit"),
+            ("premium_credit_micros_used", "premium_tokens_used"),
+            ("premium_credit_micros_reserved", "premium_tokens_reserved"),
+        ):
+            if _column_exists(conn, "user", new) and not _column_exists(
+                conn, "user", old
+            ):
+                op.alter_column("user", new, new_column_name=old)
+
+        op.alter_column(
+            "user",
+            "premium_tokens_limit",
+            server_default="5000000",
+        )
+
+        legacy_user_cols = [
+            "id",
+            "pages_limit",
+            "pages_used",
+            "premium_tokens_limit",
+            "premium_tokens_used",
+        ]
+        documents_has_zero_ver = _has_zero_version(conn, "documents")
+        user_has_zero_ver = _has_zero_version(conn, "user")
+        conn.execute(
+            sa.text(
+                _build_publication_ddl(
+                    legacy_user_cols,
+                    documents_has_zero_ver=documents_has_zero_ver,
+                    user_has_zero_ver=user_has_zero_ver,
+                )
+            )
+        )
+
+    if _column_exists(
+        conn, "premium_token_purchases", "credit_micros_granted"
+    ) and not _column_exists(conn, "premium_token_purchases", "tokens_granted"):
+        op.alter_column(
+            "premium_token_purchases",
+            "credit_micros_granted",
+            new_column_name="tokens_granted",
+        )
+
+    if _column_exists(conn, "token_usage", "cost_micros"):
+        op.drop_column("token_usage", "cost_micros")
diff --git a/surfsense_backend/alembic/versions/141_unique_chat_message_turn_role.py b/surfsense_backend/alembic/versions/141_unique_chat_message_turn_role.py
new file mode 100644
index 0000000000..1226a59b45
--- /dev/null
+++ b/surfsense_backend/alembic/versions/141_unique_chat_message_turn_role.py
@@ -0,0 +1,71 @@
+"""141_unique_chat_message_turn_role
+
+Revision ID: 141
+Revises: 140
+Create Date: 2026-05-04
+
+Add a partial unique index on ``new_chat_messages(thread_id, turn_id, role)``
+where ``turn_id IS NOT NULL``.
+
+Why
+---
+The streaming chat path (`stream_new_chat` / `stream_resume_chat`) is being
+moved to write its own ``new_chat_messages`` rows server-side instead of
+relying on the frontend's later ``POST /threads/{id}/messages`` call. This
+closes the "ghost-thread" abuse vector where authenticated callers got free
+LLM completions while ``new_chat_messages`` stayed empty.
+
+For server-side and legacy frontend writes to coexist we need an idempotency
+key. The natural triple is ``(thread_id, turn_id, role)``: the server issues
+exactly one ``turn_id`` per turn, and a turn produces at most one user
+message and one assistant message. Whichever side wins the race writes the
+row; the loser hits ``IntegrityError`` and recovers gracefully.
+
+Partial — ``WHERE turn_id IS NOT NULL`` — so:
+
+  * Legacy rows that predate the ``turn_id`` column (migration 136) keep
+    co-existing without de-dup.
+  * Clone / snapshot inserts in
+    ``app/services/public_chat_service.py`` that build ``NewChatMessage``
+    without ``turn_id`` are unaffected (multiple snapshot copies of the same
+    user/assistant pair are intentional).
+
+This index coexists with the existing single-column ``ix_new_chat_messages_turn_id``
+from migration 136 — no collision.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "141"
+down_revision: str | None = "140"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+INDEX_NAME = "uq_new_chat_messages_thread_turn_role"
+TABLE_NAME = "new_chat_messages"
+
+
+def upgrade() -> None:
+    bind = op.get_bind()
+    indexes = {i["name"] for i in sa.inspect(bind).get_indexes(TABLE_NAME)}
+    if INDEX_NAME in indexes:
+        return
+
+    op.create_index(
+        INDEX_NAME,
+        TABLE_NAME,
+        ["thread_id", "turn_id", "role"],
+        unique=True,
+        postgresql_where=sa.text("turn_id IS NOT NULL"),
+    )
+
+
+def downgrade() -> None:
+    op.drop_index(INDEX_NAME, table_name=TABLE_NAME)
diff --git a/surfsense_backend/alembic/versions/142_token_usage_message_id_unique.py b/surfsense_backend/alembic/versions/142_token_usage_message_id_unique.py
new file mode 100644
index 0000000000..43b30a7566
--- /dev/null
+++ b/surfsense_backend/alembic/versions/142_token_usage_message_id_unique.py
@@ -0,0 +1,134 @@
+"""142_token_usage_message_id_unique
+
+Revision ID: 142
+Revises: 141
+Create Date: 2026-05-04
+
+Add a partial unique index on ``token_usage(message_id)`` where
+``message_id IS NOT NULL``.
+
+Why
+---
+Two writers can race on the same assistant turn's ``token_usage`` row:
+
+  * ``finalize_assistant_turn`` (server-side, called from the streaming
+    finally block in ``stream_new_chat`` / ``stream_resume_chat``)
+  * ``append_message``'s recovery branch in
+    ``app/routes/new_chat_routes.py`` (legacy frontend round-trip)
+
+Both currently use ``SELECT ... THEN INSERT`` in separate sessions, so a
+micro-second-aligned race could observe "no row" on each side and double
+INSERT, producing duplicate ``token_usage`` rows for the same
+``message_id``.
+
+A partial unique index on ``message_id`` (``WHERE message_id IS NOT NULL``)
+turns both writes into ``INSERT ... ON CONFLICT (message_id) DO NOTHING``
+no-ops for the loser, hard-eliminating the race at the DB level. Partial
+because non-chat usage rows (indexing, image generation, podcasts) keep
+``message_id`` NULL — they're per-event, no de-dup needed.
+
+Pre-flight
+----------
+Today's schema only has a non-unique index on ``message_id`` so a
+duplicate population could already exist from any past race. We:
+
+  * Detect duplicate ``message_id`` groups (``HAVING COUNT(*) > 1``).
+  * If the group count is at or below ``DUPLICATE_ABORT_THRESHOLD`` (50)
+    we dedupe by deleting all but the smallest ``id`` per group.
+  * If the count exceeds the threshold we abort with a descriptive
+    error rather than silently mutate prod data — operator must
+    investigate before retrying.
+
+Concurrency
+-----------
+``CREATE INDEX CONCURRENTLY`` is required on this hot table to avoid
+stalling production writes during deploy (a regular ``CREATE INDEX``
+holds an ACCESS EXCLUSIVE lock for the duration of the build, which
+would block ``token_usage`` INSERTs for every active streaming chat).
+The trade-off is a slower migration (CONCURRENTLY scans the table
+twice) and the ``CREATE`` statement cannot run inside alembic's default
+transaction wrapper — ``autocommit_block()`` handles that.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "142"
+down_revision: str | None = "141"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+INDEX_NAME = "uq_token_usage_message_id"
+TABLE_NAME = "token_usage"
+
+# Refuse to silently mutate prod data if the duplicate population is
+# unexpectedly large — operator should investigate the upstream cause
+# before retrying. 50 is comfortably above any plausible duplicate
+# count from the existing race window (the race is microseconds wide).
+DUPLICATE_ABORT_THRESHOLD = 50
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+
+    dup_groups = conn.execute(
+        sa.text(
+            "SELECT message_id, COUNT(*) AS n "
+            "FROM token_usage "
+            "WHERE message_id IS NOT NULL "
+            "GROUP BY message_id "
+            "HAVING COUNT(*) > 1"
+        )
+    ).fetchall()
+
+    if len(dup_groups) > DUPLICATE_ABORT_THRESHOLD:
+        raise RuntimeError(
+            f"token_usage has {len(dup_groups)} duplicate message_id groups "
+            f"(threshold={DUPLICATE_ABORT_THRESHOLD}). "
+            "Resolve the duplicates manually before re-running this migration."
+        )
+
+    if dup_groups:
+        # Delete all but the smallest-id row per duplicate group. The
+        # smallest id is by definition the earliest insert, so we keep
+        # the row most likely to reflect the actual stream's first
+        # successful write.
+        conn.execute(
+            sa.text(
+                """
+                DELETE FROM token_usage
+                WHERE id IN (
+                    SELECT id FROM (
+                        SELECT
+                            id,
+                            row_number() OVER (
+                                PARTITION BY message_id ORDER BY id ASC
+                            ) AS rn
+                        FROM token_usage
+                        WHERE message_id IS NOT NULL
+                    ) ranked
+                    WHERE rn > 1
+                )
+                """
+            )
+        )
+
+    # CREATE INDEX CONCURRENTLY cannot run inside a transaction. Drop
+    # alembic's auto-transaction for this op only.
+    with op.get_context().autocommit_block():
+        op.execute(
+            f"CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS {INDEX_NAME} "
+            f"ON {TABLE_NAME} (message_id) "
+            "WHERE message_id IS NOT NULL"
+        )
+
+
+def downgrade() -> None:
+    with op.get_context().autocommit_block():
+        op.execute(f"DROP INDEX CONCURRENTLY IF EXISTS {INDEX_NAME}")
diff --git a/surfsense_backend/alembic/versions/143_force_zero_publication_resync.py b/surfsense_backend/alembic/versions/143_force_zero_publication_resync.py
new file mode 100644
index 0000000000..147cbde56f
--- /dev/null
+++ b/surfsense_backend/alembic/versions/143_force_zero_publication_resync.py
@@ -0,0 +1,142 @@
+"""force zero-cache to resync after upgrading to Zero >= 1.0
+
+Re-emits the current ``zero_publication`` shape using
+``ALTER PUBLICATION ... SET TABLE`` wrapped in
+``COMMENT ON PUBLICATION`` bookends. This is the publication-change
+hook documented for Zero ``>=1.0``:
+
+    https://zero.rocicorp.dev/docs/connecting-to-postgres#publication-changes
+
+Background
+----------
+Migrations 117 / 139 / 140 mutated ``zero_publication`` using
+``DROP PUBLICATION`` + ``CREATE PUBLICATION``. On Zero 0.26.2 that
+sequence did not reliably wake the zero-cache change-streamer, so
+affected installs ended up with a SQLite replica file (in the
+``surfsense-zero-cache`` volume) that was snapshotted against the
+pre-``user`` publication. The frontend Zero schema includes a
+``userTable`` query, which then failed with
+``SchemaVersionNotSupported`` and triggered the default
+``onUpdateNeeded`` -> ``location.reload()`` every WebSocket keepalive
+interval (~60s). See bug #1355.
+
+This migration emits the canonical publication shape one more time,
+this time using a pattern that fires Postgres event triggers and
+Zero's schema-change hook. With ``ZERO_AUTO_RESET=true`` (the default)
+and Zero ``>=1.0``, zero-cache responds by wiping its replica and
+doing a fresh initial sync from the corrected publication.
+
+The publication shape itself is unchanged versus migration 140 -- on
+installs whose replica is already correct, this is a no-op aside
+from the harmless event-trigger fire.
+
+Revision ID: 143
+Revises: 142
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "143"
+down_revision: str | None = "142"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+
+# Must stay in sync with the column lists in migrations 117 / 139 / 140.
+DOCUMENT_COLS = [
+    "id",
+    "title",
+    "document_type",
+    "search_space_id",
+    "folder_id",
+    "created_by_id",
+    "status",
+    "created_at",
+    "updated_at",
+]
+
+USER_COLS = [
+    "id",
+    "pages_limit",
+    "pages_used",
+    "premium_credit_micros_limit",
+    "premium_credit_micros_used",
+]
+
+
+def _has_zero_version(conn, table: str) -> bool:
+    return (
+        conn.execute(
+            sa.text(
+                "SELECT 1 FROM information_schema.columns "
+                "WHERE table_name = :tbl AND column_name = '_0_version'"
+            ),
+            {"tbl": table},
+        ).fetchone()
+        is not None
+    )
+
+
+def _build_set_table_ddl(
+    *, documents_has_zero_ver: bool, user_has_zero_ver: bool
+) -> str:
+    doc_cols = DOCUMENT_COLS + (['"_0_version"'] if documents_has_zero_ver else [])
+    user_cols = USER_COLS + (['"_0_version"'] if user_has_zero_ver else [])
+    doc_col_list = ", ".join(doc_cols)
+    user_col_list = ", ".join(user_cols)
+    return (
+        f"ALTER PUBLICATION {PUBLICATION_NAME} SET TABLE "
+        f"notifications, "
+        f"documents ({doc_col_list}), "
+        f"folders, "
+        f"search_source_connectors, "
+        f"new_chat_messages, "
+        f"chat_comments, "
+        f"chat_session_state, "
+        f'"user" ({user_col_list})'
+    )
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+
+    exists = conn.execute(
+        sa.text("SELECT 1 FROM pg_publication WHERE pubname = :name"),
+        {"name": PUBLICATION_NAME},
+    ).fetchone()
+    if not exists:
+        return
+
+    documents_has_zero_ver = _has_zero_version(conn, "documents")
+    user_has_zero_ver = _has_zero_version(conn, "user")
+
+    # The COMMENT-ALTER-COMMENT trio MUST run in a single transaction so
+    # Zero observes them as one schema-change event. Alembic's outer
+    # transaction already covers us, but a SAVEPOINT keeps the trio
+    # atomic with asyncpg, matching the pattern used in migrations
+    # 117 / 139 / 140.
+    tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
+    with tx:
+        conn.execute(
+            sa.text(f"COMMENT ON PUBLICATION {PUBLICATION_NAME} IS 'pre-143-resync'")
+        )
+        conn.execute(
+            sa.text(
+                _build_set_table_ddl(
+                    documents_has_zero_ver=documents_has_zero_ver,
+                    user_has_zero_ver=user_has_zero_ver,
+                )
+            )
+        )
+        conn.execute(
+            sa.text(f"COMMENT ON PUBLICATION {PUBLICATION_NAME} IS 'post-143-resync'")
+        )
+
+
+def downgrade() -> None:
+    """No-op. The publication shape is unchanged versus migration 140."""
diff --git a/surfsense_backend/app/agents/autocomplete/__init__.py b/surfsense_backend/app/agents/autocomplete/__init__.py
deleted file mode 100644
index 55d7a692dd..0000000000
--- a/surfsense_backend/app/agents/autocomplete/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""Agent-based vision autocomplete with scoped filesystem exploration."""
-
-from app.agents.autocomplete.autocomplete_agent import (
-    create_autocomplete_agent,
-    stream_autocomplete_agent,
-)
-
-__all__ = [
-    "create_autocomplete_agent",
-    "stream_autocomplete_agent",
-]
diff --git a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py
index 2d8f05fd36..890b3e06ec 100644
--- a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py
+++ b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py
@@ -28,13 +28,76 @@
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import AIMessage, ToolMessage
 
+from app.agents.new_chat.document_xml import build_document_xml
 from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware
 from app.agents.new_chat.middleware.knowledge_search import (
-    build_scoped_filesystem,
     search_knowledge_base,
 )
+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    build_path_index,
+    doc_to_virtual_path,
+)
+from app.db import shielded_async_session
 from app.services.new_streaming_service import VercelStreamingService
 
+try:
+    from deepagents.backends.utils import create_file_data
+except Exception:  # pragma: no cover - defensive
+
+    def create_file_data(content: str) -> dict[str, Any]:
+        return {"content": content.split("\n")}
+
+
+async def _build_autocomplete_filesystem(
+    *,
+    documents: Any,
+    search_space_id: int,
+) -> tuple[dict[str, Any], dict[int, str]]:
+    """Build a ``state['files']``-shaped dict from KB search results.
+
+    This is the autocomplete-specific replacement for the previous
+    ``build_scoped_filesystem`` helper. It uses the canonical path resolver
+    so paths line up with the rest of the system, including collision
+    suffixes for duplicate titles.
+    """
+    files: dict[str, Any] = {}
+    doc_id_to_path: dict[int, str] = {}
+
+    if not documents:
+        return files, doc_id_to_path
+
+    async with shielded_async_session() as session:
+        index = await build_path_index(session, search_space_id)
+
+    for document in documents:
+        if not isinstance(document, dict):
+            continue
+        meta = document.get("document") or {}
+        doc_id = meta.get("id")
+        if not isinstance(doc_id, int):
+            continue
+        title = str(meta.get("title") or "untitled")
+        folder_id = meta.get("folder_id")
+        path = doc_to_virtual_path(
+            doc_id=doc_id, title=title, folder_id=folder_id, index=index
+        )
+        chunk_ids = document.get("matched_chunk_ids") or []
+        try:
+            matched_set = {int(c) for c in chunk_ids}
+        except (TypeError, ValueError):
+            matched_set = set()
+        xml = build_document_xml(document, matched_chunk_ids=matched_set)
+        files[path] = create_file_data(xml)
+        doc_id_to_path[doc_id] = path
+
+    if not files:
+        # Ensure the synthetic /documents folder is visible even when empty.
+        files.setdefault(f"{DOCUMENTS_ROOT}/.placeholder", create_file_data(""))
+
+    return files, doc_id_to_path
+
+
 logger = logging.getLogger(__name__)
 
 KB_TOP_K = 10
@@ -174,7 +237,7 @@ async def precompute_kb_filesystem(
         if not search_results:
             return _KBResult()
 
-        new_files, _ = await build_scoped_filesystem(
+        new_files, _ = await _build_autocomplete_filesystem(
             documents=search_results,
             search_space_id=search_space_id,
         )
@@ -215,13 +278,12 @@ async def precompute_kb_filesystem(
 class AutocompleteFilesystemMiddleware(SurfSenseFilesystemMiddleware):
     """Filesystem middleware for autocomplete — read-only exploration only.
 
-    Strips ``save_document`` (permanent KB persistence) and passes
-    ``search_space_id=None`` so ``write_file`` / ``edit_file`` stay ephemeral.
+    Passes ``search_space_id=None`` so the new persistence pipeline is
+    bypassed; the autocomplete flow only reads, never commits to Postgres.
     """
 
     def __init__(self) -> None:
         super().__init__(search_space_id=None, created_by_id=None)
-        self.tools = [t for t in self.tools if t.name != "save_document"]
 
 
 # ---------------------------------------------------------------------------
diff --git a/surfsense_backend/app/agents/multi_agent_chat/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/__init__.py
new file mode 100644
index 0000000000..6c7d79eb84
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/__init__.py
@@ -0,0 +1,7 @@
+"""Deepagents-backed routes: ``subagents/``; main-agent graph under ``main_agent/`` (SRP subpackages)."""
+
+from __future__ import annotations
+
+from .main_agent import create_multi_agent_chat_deep_agent
+
+__all__ = ["create_multi_agent_chat_deep_agent"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/constants.py b/surfsense_backend/app/agents/multi_agent_chat/constants.py
new file mode 100644
index 0000000000..7e40618131
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/constants.py
@@ -0,0 +1,44 @@
+"""Connector-type to subagent name; subagent name to availability tokens for build_subagents."""
+
+from __future__ import annotations
+
+CONNECTOR_TYPE_TO_CONNECTOR_AGENT_MAPS: dict[str, str] = {
+    "GOOGLE_GMAIL_CONNECTOR": "gmail",
+    "COMPOSIO_GMAIL_CONNECTOR": "gmail",
+    "GOOGLE_CALENDAR_CONNECTOR": "calendar",
+    "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "calendar",
+    "DISCORD_CONNECTOR": "discord",
+    "TEAMS_CONNECTOR": "teams",
+    "LUMA_CONNECTOR": "luma",
+    "LINEAR_CONNECTOR": "linear",
+    "JIRA_CONNECTOR": "jira",
+    "CLICKUP_CONNECTOR": "clickup",
+    "SLACK_CONNECTOR": "slack",
+    "AIRTABLE_CONNECTOR": "airtable",
+    "NOTION_CONNECTOR": "notion",
+    "CONFLUENCE_CONNECTOR": "confluence",
+    "GOOGLE_DRIVE_CONNECTOR": "google_drive",
+    "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "google_drive",
+    "DROPBOX_CONNECTOR": "dropbox",
+    "ONEDRIVE_CONNECTOR": "onedrive",
+}
+
+SUBAGENT_TO_REQUIRED_CONNECTOR_MAP: dict[str, frozenset[str]] = {
+    "deliverables": frozenset(),
+    "knowledge_base": frozenset(),
+    "airtable": frozenset({"AIRTABLE_CONNECTOR"}),
+    "calendar": frozenset({"GOOGLE_CALENDAR_CONNECTOR"}),
+    "clickup": frozenset({"CLICKUP_CONNECTOR"}),
+    "confluence": frozenset({"CONFLUENCE_CONNECTOR"}),
+    "discord": frozenset({"DISCORD_CONNECTOR"}),
+    "dropbox": frozenset({"DROPBOX_FILE"}),
+    "gmail": frozenset({"GOOGLE_GMAIL_CONNECTOR"}),
+    "google_drive": frozenset({"GOOGLE_DRIVE_FILE"}),
+    "jira": frozenset({"JIRA_CONNECTOR"}),
+    "linear": frozenset({"LINEAR_CONNECTOR"}),
+    "luma": frozenset({"LUMA_CONNECTOR"}),
+    "notion": frozenset({"NOTION_CONNECTOR"}),
+    "onedrive": frozenset({"ONEDRIVE_FILE"}),
+    "slack": frozenset({"SLACK_CONNECTOR"}),
+    "teams": frozenset({"TEAMS_CONNECTOR"}),
+}
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/__init__.py
new file mode 100644
index 0000000000..f74ca0cd0d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/__init__.py
@@ -0,0 +1,7 @@
+"""Main-agent deep agent: ``runtime/`` (factory), ``graph/`` (compile), ``system_prompt/``, etc."""
+
+from __future__ import annotations
+
+from .runtime import create_multi_agent_chat_deep_agent
+
+__all__ = ["create_multi_agent_chat_deep_agent"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/context_prune/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/context_prune/__init__.py
new file mode 100644
index 0000000000..550ba54c53
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/context_prune/__init__.py
@@ -0,0 +1,7 @@
+"""Tool-name pruning for context editing (exclude lists without dropping protected tools)."""
+
+from __future__ import annotations
+
+from .prune_tool_names import PRUNE_PROTECTED_TOOL_NAMES, safe_exclude_tools
+
+__all__ = ["PRUNE_PROTECTED_TOOL_NAMES", "safe_exclude_tools"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/context_prune/prune_tool_names.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/context_prune/prune_tool_names.py
new file mode 100644
index 0000000000..c8bf6d6e09
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/context_prune/prune_tool_names.py
@@ -0,0 +1,26 @@
+"""Tool names excluded from context-editing prune when bound."""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from langchain_core.tools import BaseTool
+
+PRUNE_PROTECTED_TOOL_NAMES: frozenset[str] = frozenset(
+    {
+        "generate_report",
+        "generate_resume",
+        "generate_podcast",
+        "generate_video_presentation",
+        "generate_image",
+        "read_email",
+        "search_emails",
+        "invalid",
+    },
+)
+
+
+def safe_exclude_tools(tools: Sequence[BaseTool]) -> tuple[str, ...]:
+    """Names from ``PRUNE_PROTECTED_TOOL_NAMES`` that appear in ``tools``."""
+    enabled = {t.name for t in tools}
+    return tuple(n for n in PRUNE_PROTECTED_TOOL_NAMES if n in enabled)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/graph/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/graph/__init__.py
new file mode 100644
index 0000000000..e12108484a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/graph/__init__.py
@@ -0,0 +1,7 @@
+"""Sync compile of the main-agent LangGraph graph (middleware + ``create_agent``)."""
+
+from __future__ import annotations
+
+from .compile_graph_sync import build_compiled_agent_graph_sync
+
+__all__ = ["build_compiled_agent_graph_sync"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/graph/compile_graph_sync.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/graph/compile_graph_sync.py
new file mode 100644
index 0000000000..b86da932a5
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/graph/compile_graph_sync.py
@@ -0,0 +1,83 @@
+"""Synchronous graph compile (middleware + ``create_agent``)."""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import Any
+
+from deepagents import __version__ as deepagents_version
+from langchain.agents import create_agent
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+from langgraph.types import Checkpointer
+
+from app.agents.multi_agent_chat.middleware.stack import (
+    build_main_agent_deepagent_middleware,
+)
+from app.agents.new_chat.context import SurfSenseContextSchema
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.db import ChatVisibility
+
+
+def build_compiled_agent_graph_sync(
+    *,
+    llm: BaseChatModel,
+    tools: Sequence[BaseTool],
+    final_system_prompt: str,
+    backend_resolver: Any,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+    user_id: str | None,
+    thread_id: int | None,
+    visibility: ChatVisibility,
+    anon_session_id: str | None,
+    available_connectors: list[str] | None,
+    available_document_types: list[str] | None,
+    mentioned_document_ids: list[int] | None,
+    max_input_tokens: int | None,
+    flags: AgentFeatureFlags,
+    checkpointer: Checkpointer,
+    subagent_dependencies: dict[str, Any],
+    mcp_tools_by_agent: dict[str, list[BaseTool]] | None = None,
+    disabled_tools: list[str] | None = None,
+):
+    """Sync compile: middleware + ``create_agent`` (run via ``asyncio.to_thread``)."""
+    main_agent_middleware = build_main_agent_deepagent_middleware(
+        llm=llm,
+        tools=tools,
+        backend_resolver=backend_resolver,
+        filesystem_mode=filesystem_mode,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        thread_id=thread_id,
+        visibility=visibility,
+        anon_session_id=anon_session_id,
+        available_connectors=available_connectors,
+        available_document_types=available_document_types,
+        mentioned_document_ids=mentioned_document_ids,
+        max_input_tokens=max_input_tokens,
+        flags=flags,
+        subagent_dependencies=subagent_dependencies,
+        checkpointer=checkpointer,
+        mcp_tools_by_agent=mcp_tools_by_agent,
+        disabled_tools=disabled_tools,
+    )
+
+    agent = create_agent(
+        llm,
+        system_prompt=final_system_prompt,
+        tools=list(tools),
+        middleware=main_agent_middleware,
+        context_schema=SurfSenseContextSchema,
+        checkpointer=checkpointer,
+    )
+    return agent.with_config(
+        {
+            "recursion_limit": 10_000,
+            "metadata": {
+                "ls_integration": "deepagents",
+                "versions": {"deepagents": deepagents_version},
+            },
+        }
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/__init__.py
new file mode 100644
index 0000000000..593e8da209
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/__init__.py
@@ -0,0 +1,7 @@
+"""Async factory: wiring tools, prompts, MCP buckets, then graph compile."""
+
+from __future__ import annotations
+
+from .factory import create_multi_agent_chat_deep_agent
+
+__all__ = ["create_multi_agent_chat_deep_agent"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/agent_cache.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/agent_cache.py
new file mode 100644
index 0000000000..03cf7acb83
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/agent_cache.py
@@ -0,0 +1,116 @@
+"""Compiled agent graph caching for the multi-agent path."""
+
+from __future__ import annotations
+
+import asyncio
+from collections.abc import Sequence
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+from langgraph.types import Checkpointer
+
+from app.agents.new_chat.agent_cache import (
+    flags_signature,
+    get_cache,
+    stable_hash,
+    system_prompt_hash,
+    tools_signature,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.db import ChatVisibility
+
+from ..graph.compile_graph_sync import build_compiled_agent_graph_sync
+
+
+def mcp_signature(mcp_tools_by_agent: dict[str, list[BaseTool]]) -> str:
+    """Hash the per-agent MCP tool surface so a change rotates the cache key."""
+    rows = []
+    for agent_name in sorted(mcp_tools_by_agent.keys()):
+        names = sorted(
+            getattr(t, "name", "") or "" for t in mcp_tools_by_agent[agent_name]
+        )
+        rows.append((agent_name, names))
+    return stable_hash(rows)
+
+
+async def build_agent_with_cache(
+    *,
+    llm: BaseChatModel,
+    tools: Sequence[BaseTool],
+    final_system_prompt: str,
+    backend_resolver: Any,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+    user_id: str | None,
+    thread_id: int | None,
+    visibility: ChatVisibility,
+    anon_session_id: str | None,
+    available_connectors: list[str],
+    available_document_types: list[str],
+    mentioned_document_ids: list[int] | None,
+    max_input_tokens: int | None,
+    flags: AgentFeatureFlags,
+    checkpointer: Checkpointer,
+    subagent_dependencies: dict[str, Any],
+    mcp_tools_by_agent: dict[str, list[BaseTool]],
+    disabled_tools: list[str] | None,
+    config_id: str | None,
+) -> Any:
+    """Compile the multi-agent graph, serving from cache when key components are stable."""
+
+    async def _build() -> Any:
+        return await asyncio.to_thread(
+            build_compiled_agent_graph_sync,
+            llm=llm,
+            tools=tools,
+            final_system_prompt=final_system_prompt,
+            backend_resolver=backend_resolver,
+            filesystem_mode=filesystem_mode,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            thread_id=thread_id,
+            visibility=visibility,
+            anon_session_id=anon_session_id,
+            available_connectors=available_connectors,
+            available_document_types=available_document_types,
+            mentioned_document_ids=mentioned_document_ids,
+            max_input_tokens=max_input_tokens,
+            flags=flags,
+            checkpointer=checkpointer,
+            subagent_dependencies=subagent_dependencies,
+            mcp_tools_by_agent=mcp_tools_by_agent,
+            disabled_tools=disabled_tools,
+        )
+
+    if not (flags.enable_agent_cache and not flags.disable_new_agent_stack):
+        return await _build()
+
+    # Every per-request value any middleware closes over at __init__ must be in
+    # the key, otherwise a hit will leak state across threads. Bump the schema
+    # version when the component list changes shape.
+    cache_key = stable_hash(
+        "multi-agent-v1",
+        config_id,
+        thread_id,
+        user_id,
+        search_space_id,
+        visibility,
+        filesystem_mode,
+        anon_session_id,
+        tools_signature(
+            tools,
+            available_connectors=available_connectors,
+            available_document_types=available_document_types,
+        ),
+        mcp_signature(mcp_tools_by_agent),
+        flags_signature(flags),
+        system_prompt_hash(final_system_prompt),
+        max_input_tokens,
+        sorted(disabled_tools) if disabled_tools else None,
+    )
+    return await get_cache().get_or_build(cache_key, builder=_build)
+
+
+__all__ = ["build_agent_with_cache", "mcp_signature"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/factory.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/factory.py
new file mode 100644
index 0000000000..8451b3b7da
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/factory.py
@@ -0,0 +1,298 @@
+"""Async factory: tools, system prompt, MCP buckets for subagents, then sync graph compile."""
+
+from __future__ import annotations
+
+import logging
+import time
+from collections.abc import Sequence
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+from langgraph.types import Checkpointer
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents import (
+    get_subagents_to_exclude,
+    main_prompt_registry_subagent_lines,
+)
+from app.agents.multi_agent_chat.subagents.mcp_tools.index import (
+    load_mcp_tools_by_connector,
+)
+from app.agents.new_chat.chat_deepagent import _map_connectors_to_searchable_types
+from app.agents.new_chat.feature_flags import AgentFeatureFlags, get_flags
+from app.agents.new_chat.filesystem_backends import build_backend_resolver
+from app.agents.new_chat.filesystem_selection import FilesystemMode, FilesystemSelection
+from app.agents.new_chat.llm_config import AgentConfig
+from app.agents.new_chat.prompt_caching import apply_litellm_prompt_caching
+from app.agents.new_chat.tools.invalid_tool import INVALID_TOOL_NAME, invalid_tool
+from app.agents.new_chat.tools.registry import build_tools_async
+from app.db import ChatVisibility
+from app.services.connector_service import ConnectorService
+from app.services.user_tool_allowlist import (
+    fetch_user_allowlist_rulesets,
+    make_trusted_tool_saver,
+)
+from app.utils.perf import get_perf_logger
+
+from ..system_prompt import build_main_agent_system_prompt
+from ..tools import (
+    MAIN_AGENT_SURFSENSE_TOOL_NAMES,
+    MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED,
+)
+from .agent_cache import build_agent_with_cache
+
+_perf_log = get_perf_logger()
+
+
+async def create_multi_agent_chat_deep_agent(
+    llm: BaseChatModel,
+    search_space_id: int,
+    db_session: AsyncSession,
+    connector_service: ConnectorService,
+    checkpointer: Checkpointer,
+    user_id: str | None = None,
+    thread_id: int | None = None,
+    agent_config: AgentConfig | None = None,
+    enabled_tools: list[str] | None = None,
+    disabled_tools: list[str] | None = None,
+    additional_tools: Sequence[BaseTool] | None = None,
+    firecrawl_api_key: str | None = None,
+    thread_visibility: ChatVisibility | None = None,
+    mentioned_document_ids: list[int] | None = None,
+    anon_session_id: str | None = None,
+    filesystem_selection: FilesystemSelection | None = None,
+):
+    """Deep agent with SurfSense tools/middleware; registry route subagents behind ``task`` when enabled."""
+    _t_agent_total = time.perf_counter()
+
+    apply_litellm_prompt_caching(llm, agent_config=agent_config, thread_id=thread_id)
+
+    filesystem_selection = filesystem_selection or FilesystemSelection()
+    backend_resolver = build_backend_resolver(
+        filesystem_selection,
+        search_space_id=search_space_id
+        if filesystem_selection.mode == FilesystemMode.CLOUD
+        else None,
+    )
+
+    available_connectors: list[str] | None = None
+    available_document_types: list[str] | None = None
+
+    _t0 = time.perf_counter()
+    try:
+        connector_types = await connector_service.get_available_connectors(
+            search_space_id
+        )
+        available_connectors = _map_connectors_to_searchable_types(connector_types)
+
+        available_document_types = await connector_service.get_available_document_types(
+            search_space_id
+        )
+
+    except Exception as e:
+        logging.warning(
+            "Connector/doc-type discovery failed; excluding connector subagents this turn: %s",
+            e,
+        )
+
+    # Fail closed: a None list short-circuits ``get_subagents_to_exclude`` to "exclude
+    # nothing", which would silently advertise every connector specialist on a flaky
+    # discovery call. Empty list excludes connector-gated subagents while keeping builtins.
+    if available_connectors is None:
+        available_connectors = []
+    if available_document_types is None:
+        available_document_types = []
+    _perf_log.info(
+        "[create_agent] Connector/doc-type discovery in %.3fs",
+        time.perf_counter() - _t0,
+    )
+
+    visibility = thread_visibility or ChatVisibility.PRIVATE
+
+    _model_profile = getattr(llm, "profile", None)
+    _max_input_tokens: int | None = (
+        _model_profile.get("max_input_tokens")
+        if isinstance(_model_profile, dict)
+        else None
+    )
+
+    dependencies: dict[str, Any] = {
+        "search_space_id": search_space_id,
+        "db_session": db_session,
+        "connector_service": connector_service,
+        "firecrawl_api_key": firecrawl_api_key,
+        "user_id": user_id,
+        "thread_id": thread_id,
+        "thread_visibility": visibility,
+        "available_connectors": available_connectors,
+        "available_document_types": available_document_types,
+        "max_input_tokens": _max_input_tokens,
+        "llm": llm,
+    }
+
+    _t0 = time.perf_counter()
+    try:
+        mcp_tools_by_agent = await load_mcp_tools_by_connector(
+            db_session, search_space_id
+        )
+    except Exception as e:
+        # Degrade to builtins-only rather than aborting the turn: a transient
+        # DB or MCP-server hiccup should not deny the user a response.
+        logging.warning(
+            "MCP tool discovery failed; subagents will run without MCP tools this turn: %s",
+            e,
+        )
+        mcp_tools_by_agent = {}
+    _perf_log.info(
+        "[create_agent] load_mcp_tools_by_connector in %.3fs (%d agents)",
+        time.perf_counter() - _t0,
+        len(mcp_tools_by_agent),
+    )
+
+    # User-scoped allow-list ("Always Allow" persisted to
+    # ``SearchSourceConnector.config.trusted_tools``). Layered last in each
+    # subagent's PermissionMiddleware so user ``allow`` overrides coded
+    # ``ask`` via last-match-wins. Anonymous turns and read failures both
+    # degrade to "no user rules" rather than blocking the turn.
+    user_allowlist_by_subagent: dict[str, Any] = {}
+    trusted_tool_saver = None
+    if user_id:
+        try:
+            import uuid as _uuid
+
+            user_uuid = _uuid.UUID(user_id)
+        except (TypeError, ValueError):
+            user_uuid = None
+
+        if user_uuid is not None:
+            _t0 = time.perf_counter()
+            try:
+                user_allowlist_by_subagent = await fetch_user_allowlist_rulesets(
+                    db_session,
+                    user_id=user_uuid,
+                    search_space_id=search_space_id,
+                )
+            except Exception as e:
+                logging.warning(
+                    "User allow-list fetch failed; subagents will run without user trust rules this turn: %s",
+                    e,
+                )
+                user_allowlist_by_subagent = {}
+            _perf_log.info(
+                "[create_agent] fetch_user_allowlist_rulesets in %.3fs (%d subagents have rules)",
+                time.perf_counter() - _t0,
+                len(user_allowlist_by_subagent),
+            )
+            trusted_tool_saver = make_trusted_tool_saver(user_uuid)
+    dependencies["user_allowlist_by_subagent"] = user_allowlist_by_subagent
+    dependencies["trusted_tool_saver"] = trusted_tool_saver
+
+    modified_disabled_tools = list(disabled_tools) if disabled_tools else []
+
+    if "search_knowledge_base" not in modified_disabled_tools:
+        modified_disabled_tools.append("search_knowledge_base")
+
+    if enabled_tools is not None:
+        main_agent_enabled_tools = [
+            n for n in enabled_tools if n in MAIN_AGENT_SURFSENSE_TOOL_NAMES
+        ]
+    else:
+        main_agent_enabled_tools = list(MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED)
+
+    _t0 = time.perf_counter()
+    tools = await build_tools_async(
+        dependencies=dependencies,
+        enabled_tools=main_agent_enabled_tools,
+        disabled_tools=modified_disabled_tools,
+        additional_tools=list(additional_tools) if additional_tools else None,
+        include_mcp_tools=False,
+    )
+
+    _flags: AgentFeatureFlags = get_flags()
+    if _flags.enable_tool_call_repair and INVALID_TOOL_NAME not in {
+        t.name for t in tools
+    }:
+        tools = [*list(tools), invalid_tool]
+    _perf_log.info(
+        "[create_agent] build_tools_async in %.3fs (%d tools)",
+        time.perf_counter() - _t0,
+        len(tools),
+    )
+
+    _t0 = time.perf_counter()
+    _enabled_tool_names = {t.name for t in tools}
+    _user_disabled_tool_names = set(disabled_tools) if disabled_tools else set()
+
+    _model_name: str | None = None
+    prof = getattr(llm, "model_name", None) or getattr(llm, "model", None)
+    if isinstance(prof, str):
+        _model_name = prof
+
+    _connector_exclude = get_subagents_to_exclude(available_connectors)
+    _registry_subagent_prompt_lines = main_prompt_registry_subagent_lines(
+        _connector_exclude
+    )
+
+    if agent_config is not None:
+        system_prompt = build_main_agent_system_prompt(
+            today=None,
+            thread_visibility=thread_visibility,
+            enabled_tool_names=_enabled_tool_names,
+            disabled_tool_names=_user_disabled_tool_names,
+            custom_system_instructions=agent_config.system_instructions,
+            use_default_system_instructions=agent_config.use_default_system_instructions,
+            citations_enabled=agent_config.citations_enabled,
+            model_name=_model_name or getattr(agent_config, "model_name", None),
+            registry_subagent_prompt_lines=_registry_subagent_prompt_lines,
+        )
+    else:
+        system_prompt = build_main_agent_system_prompt(
+            thread_visibility=thread_visibility,
+            enabled_tool_names=_enabled_tool_names,
+            disabled_tool_names=_user_disabled_tool_names,
+            citations_enabled=True,
+            model_name=_model_name,
+            registry_subagent_prompt_lines=_registry_subagent_prompt_lines,
+        )
+    _perf_log.info(
+        "[create_agent] System prompt built in %.3fs", time.perf_counter() - _t0
+    )
+
+    final_system_prompt = system_prompt
+
+    config_id = agent_config.config_id if agent_config is not None else None
+
+    _t0 = time.perf_counter()
+    agent = await build_agent_with_cache(
+        llm=llm,
+        tools=tools,
+        final_system_prompt=final_system_prompt,
+        backend_resolver=backend_resolver,
+        filesystem_mode=filesystem_selection.mode,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        thread_id=thread_id,
+        visibility=visibility,
+        anon_session_id=anon_session_id,
+        available_connectors=available_connectors,
+        available_document_types=available_document_types,
+        mentioned_document_ids=mentioned_document_ids,
+        max_input_tokens=_max_input_tokens,
+        flags=_flags,
+        checkpointer=checkpointer,
+        subagent_dependencies=dependencies,
+        mcp_tools_by_agent=mcp_tools_by_agent,
+        disabled_tools=disabled_tools,
+        config_id=config_id,
+    )
+    _perf_log.info(
+        "[create_agent] Middleware stack + graph compiled in %.3fs",
+        time.perf_counter() - _t0,
+    )
+
+    _perf_log.info(
+        "[create_agent] Total agent creation in %.3fs",
+        time.perf_counter() - _t_agent_total,
+    )
+    return agent
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/__init__.py
new file mode 100644
index 0000000000..d58aecdf43
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/__init__.py
@@ -0,0 +1,7 @@
+"""Main-agent system prompt — not shared verbatim with single-agent ``new_chat``."""
+
+from __future__ import annotations
+
+from .builder import build_main_agent_system_prompt
+
+__all__ = ["build_main_agent_system_prompt"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/__init__.py
new file mode 100644
index 0000000000..165235fc82
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/__init__.py
@@ -0,0 +1,7 @@
+"""Assemble the main-agent system prompt from ``prompts/`` fragments."""
+
+from __future__ import annotations
+
+from .compose import build_main_agent_system_prompt
+
+__all__ = ["build_main_agent_system_prompt"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/compose.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/compose.py
new file mode 100644
index 0000000000..fc341dce3b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/compose.py
@@ -0,0 +1,100 @@
+"""Assemble the main-agent system prompt from ``prompts/``.
+
+Section order (default flow)::
+
+    <agent_identity>
+    [user's custom_system_instructions, if any]
+    <core_behavior>                 # default body
+    <knowledge_base_first>          # default body
+    <dynamic_context>               # always
+    <routing>                       # default body
+    <specialists>                   # always (dynamic roster)
+    <tools>                         # always (vertical-slice)
+    <memory_protocol>               # default body
+    <citations>                     # always
+    <output_format>                 # always
+    <refusal_and_limits>            # always
+    <reminder>                      # always
+
+``custom_system_instructions`` is **additive**, not a replacement: it slots
+between identity and the default body so platform safety nets (KB-first,
+routing, citations, output formatting, refusal rules) always apply.
+
+``use_default_system_instructions=False`` skips the four "default body"
+sections but keeps all the always-on platform sections.
+"""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+
+from app.db import ChatVisibility
+
+from .load_md import read_prompt_md
+from .sections.citations import build_citations_section
+from .sections.dynamic_context import build_dynamic_context_section
+from .sections.identity import build_identity_section
+from .sections.memory_protocol import build_memory_protocol_section
+from .sections.specialists import build_specialists_section
+from .sections.tools import build_tools_section
+
+
+def build_main_agent_system_prompt(
+    *,
+    registry_subagent_prompt_lines: list[tuple[str, str]],
+    today: datetime | None = None,
+    thread_visibility: ChatVisibility | None = None,
+    enabled_tool_names: set[str] | None = None,
+    disabled_tool_names: set[str] | None = None,
+    custom_system_instructions: str | None = None,
+    use_default_system_instructions: bool = True,
+    citations_enabled: bool = True,
+    model_name: str | None = None,
+) -> str:
+    resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
+    visibility = thread_visibility or ChatVisibility.PRIVATE
+
+    parts: list[str] = []
+
+    parts.append(
+        build_identity_section(visibility=visibility, resolved_today=resolved_today)
+    )
+
+    if custom_system_instructions and custom_system_instructions.strip():
+        parts.append(
+            "\n"
+            + custom_system_instructions.format(resolved_today=resolved_today)
+            + "\n"
+        )
+
+    if use_default_system_instructions:
+        parts.append(_wrap(read_prompt_md("core_behavior.md")))
+        parts.append(_wrap(read_prompt_md("kb_first.md")))
+
+    parts.append(build_dynamic_context_section(visibility=visibility))
+
+    if use_default_system_instructions:
+        parts.append(_wrap(read_prompt_md("routing.md")))
+
+    parts.append(build_specialists_section(registry_subagent_prompt_lines))
+    parts.append(
+        build_tools_section(
+            visibility=visibility,
+            enabled_tool_names=enabled_tool_names,
+            disabled_tool_names=disabled_tool_names,
+        )
+    )
+
+    if use_default_system_instructions:
+        parts.append(build_memory_protocol_section(visibility=visibility))
+
+    parts.append(build_citations_section(citations_enabled=citations_enabled))
+    parts.append(_wrap(read_prompt_md("output_format.md")))
+    parts.append(_wrap(read_prompt_md("refusal_and_limits.md")))
+    parts.append(_wrap(read_prompt_md("reminder.md")))
+
+    return "".join(p for p in parts if p)
+
+
+def _wrap(fragment: str) -> str:
+    return f"\n{fragment}\n" if fragment else ""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/load_md.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/load_md.py
new file mode 100644
index 0000000000..61e30b1c74
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/load_md.py
@@ -0,0 +1,16 @@
+"""Load main-agent prompt fragments from ``system_prompt/prompts/``."""
+
+from __future__ import annotations
+
+from importlib import resources
+
+_PROMPTS_PACKAGE = "app.agents.multi_agent_chat.main_agent.system_prompt.prompts"
+
+
+def read_prompt_md(filename: str) -> str:
+    """Load ``prompts/{filename}`` (e.g. ``core_behavior.md`` or ``tools/web_search/description.md``)."""
+    ref = resources.files(_PROMPTS_PACKAGE).joinpath(filename)
+    if not ref.is_file():
+        return ""
+    text = ref.read_text(encoding="utf-8")
+    return text[:-1] if text.endswith("\n") else text
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/provider_hints.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/provider_hints.py
new file mode 100644
index 0000000000..78d7b08ec9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/provider_hints.py
@@ -0,0 +1,50 @@
+"""Provider-specific style hints from ``prompts/providers/`` (main agent only)."""
+
+from __future__ import annotations
+
+import re
+
+from .load_md import read_prompt_md
+
+ProviderVariant = str
+
+_OPENAI_CODEX_RE = re.compile(
+    r"\b(gpt-codex|codex-mini|gpt-[\d.]+-codex)\b", re.IGNORECASE
+)
+_OPENAI_REASONING_RE = re.compile(r"\b(gpt-5|o\d|o-)", re.IGNORECASE)
+_OPENAI_CLASSIC_RE = re.compile(r"\bgpt-4", re.IGNORECASE)
+_ANTHROPIC_RE = re.compile(r"\bclaude\b", re.IGNORECASE)
+_GOOGLE_RE = re.compile(r"\bgemini\b", re.IGNORECASE)
+_KIMI_RE = re.compile(r"\b(kimi[-\d.]*|moonshot)\b", re.IGNORECASE)
+_GROK_RE = re.compile(r"\bgrok\b", re.IGNORECASE)
+_DEEPSEEK_RE = re.compile(r"\bdeepseek\b", re.IGNORECASE)
+
+
+def detect_provider_variant(model_name: str | None) -> ProviderVariant:
+    if not model_name:
+        return "default"
+    name = model_name.strip()
+    if _OPENAI_CODEX_RE.search(name):
+        return "openai_codex"
+    if _OPENAI_REASONING_RE.search(name):
+        return "openai_reasoning"
+    if _OPENAI_CLASSIC_RE.search(name):
+        return "openai_classic"
+    if _ANTHROPIC_RE.search(name):
+        return "anthropic"
+    if _GOOGLE_RE.search(name):
+        return "google"
+    if _KIMI_RE.search(name):
+        return "kimi"
+    if _GROK_RE.search(name):
+        return "grok"
+    if _DEEPSEEK_RE.search(name):
+        return "deepseek"
+    return "default"
+
+
+def build_provider_hint_block(provider_variant: ProviderVariant) -> str:
+    if not provider_variant or provider_variant == "default":
+        return ""
+    text = read_prompt_md(f"providers/{provider_variant}.md")
+    return f"\n{text}\n" if text else ""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/__init__.py
new file mode 100644
index 0000000000..568b52bafb
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/__init__.py
@@ -0,0 +1 @@
+"""Rendered slices of the main-agent system prompt."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/citations.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/citations.py
new file mode 100644
index 0000000000..8e7641f8de
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/citations.py
@@ -0,0 +1,11 @@
+"""``<citations>`` section — on/off variant based on workspace configuration."""
+
+from __future__ import annotations
+
+from ..load_md import read_prompt_md
+
+
+def build_citations_section(*, citations_enabled: bool) -> str:
+    variant = "on" if citations_enabled else "off"
+    fragment = read_prompt_md(f"citations/{variant}.md")
+    return f"\n{fragment}\n" if fragment else ""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/dynamic_context.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/dynamic_context.py
new file mode 100644
index 0000000000..2a9554894d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/dynamic_context.py
@@ -0,0 +1,13 @@
+"""``<dynamic_context>`` section — visibility-aware (private vs team thread)."""
+
+from __future__ import annotations
+
+from app.db import ChatVisibility
+
+from ..load_md import read_prompt_md
+
+
+def build_dynamic_context_section(*, visibility: ChatVisibility) -> str:
+    variant = "team" if visibility == ChatVisibility.SEARCH_SPACE else "private"
+    fragment = read_prompt_md(f"dynamic_context/{variant}.md")
+    return f"\n{fragment}\n" if fragment else ""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/identity.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/identity.py
new file mode 100644
index 0000000000..82c826491d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/identity.py
@@ -0,0 +1,19 @@
+"""``<agent_identity>`` section — visibility-aware, with ``{resolved_today}`` injection."""
+
+from __future__ import annotations
+
+from app.db import ChatVisibility
+
+from ..load_md import read_prompt_md
+
+
+def build_identity_section(
+    *,
+    visibility: ChatVisibility,
+    resolved_today: str,
+) -> str:
+    variant = "team" if visibility == ChatVisibility.SEARCH_SPACE else "private"
+    fragment = read_prompt_md(f"identity/{variant}.md")
+    if not fragment:
+        return ""
+    return "\n" + fragment.format(resolved_today=resolved_today) + "\n"
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/memory_protocol.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/memory_protocol.py
new file mode 100644
index 0000000000..9d77189129
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/memory_protocol.py
@@ -0,0 +1,13 @@
+"""``<memory_protocol>`` section — visibility-aware (user vs team memory)."""
+
+from __future__ import annotations
+
+from app.db import ChatVisibility
+
+from ..load_md import read_prompt_md
+
+
+def build_memory_protocol_section(*, visibility: ChatVisibility) -> str:
+    variant = "team" if visibility == ChatVisibility.SEARCH_SPACE else "private"
+    fragment = read_prompt_md(f"memory_protocol/{variant}.md")
+    return f"\n{fragment}\n" if fragment else ""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/provider.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/provider.py
new file mode 100644
index 0000000000..7de7220804
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/provider.py
@@ -0,0 +1,9 @@
+"""Provider-specific style hints."""
+
+from __future__ import annotations
+
+from ..provider_hints import build_provider_hint_block, detect_provider_variant
+
+
+def build_provider_section(*, model_name: str | None) -> str:
+    return build_provider_hint_block(detect_provider_variant(model_name))
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/specialists.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/specialists.py
new file mode 100644
index 0000000000..a3455bd83c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/specialists.py
@@ -0,0 +1,15 @@
+"""``<specialists>`` section — live ``task`` roster for this workspace.
+
+The roster is non-empty by contract: ``deliverables`` and ``knowledge_base``
+both declare ``frozenset()`` in ``SUBAGENT_TO_REQUIRED_CONNECTOR_MAP``, so
+they survive every connector-based exclusion pass.
+"""
+
+from __future__ import annotations
+
+
+def build_specialists_section(
+    specialist_lines: list[tuple[str, str]],
+) -> str:
+    bullets = "\n".join(f"- **{name}** — {desc}" for name, desc in specialist_lines)
+    return f"\n<specialists>\n{bullets}\n</specialists>\n"
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/tools.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/tools.py
new file mode 100644
index 0000000000..caf741d45b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/sections/tools.py
@@ -0,0 +1,20 @@
+"""Main-agent ``<tools>`` block (memory + research builtins + ``task``)."""
+
+from __future__ import annotations
+
+from app.db import ChatVisibility
+
+from ..tool_instruction_block import build_tools_instruction_block
+
+
+def build_tools_section(
+    *,
+    visibility: ChatVisibility,
+    enabled_tool_names: set[str] | None,
+    disabled_tool_names: set[str] | None,
+) -> str:
+    return build_tools_instruction_block(
+        visibility=visibility,
+        enabled_tool_names=enabled_tool_names,
+        disabled_tool_names=disabled_tool_names,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/tool_instruction_block.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/tool_instruction_block.py
new file mode 100644
index 0000000000..cbc8728ca2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/builder/tool_instruction_block.py
@@ -0,0 +1,84 @@
+"""Compose the ``<tools>`` block from per-tool vertical-slice folders.
+
+Each tool lives in ``prompts/tools/<name>/`` with ``description.md`` and an
+``example.md``. Visibility variants live in ``{private,team}/`` subfolders.
+"""
+
+from __future__ import annotations
+
+from app.db import ChatVisibility
+
+from ...tools import MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED
+from .load_md import read_prompt_md
+
+_MEMORY_VARIANT_TOOLS: frozenset[str] = frozenset({"update_memory"})
+
+
+def _tool_fragment(tool_name: str, variant: str, leaf: str) -> str:
+    if tool_name in _MEMORY_VARIANT_TOOLS:
+        return read_prompt_md(f"tools/{tool_name}/{variant}/{leaf}")
+    return read_prompt_md(f"tools/{tool_name}/{leaf}")
+
+
+def _format_tool_label(tool_name: str) -> str:
+    return tool_name.replace("_", " ").title()
+
+
+def build_tools_instruction_block(
+    *,
+    visibility: ChatVisibility,
+    enabled_tool_names: set[str] | None,
+    disabled_tool_names: set[str] | None,
+) -> str:
+    """Render ``<tools>``. ``task`` is always included: at least ``deliverables``
+    and ``knowledge_base`` are always in ``<specialists>`` (see constants)."""
+    variant = "team" if visibility == ChatVisibility.SEARCH_SPACE else "private"
+
+    parts: list[str] = ["\n<tools>\n"]
+
+    for tool_name in MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED:
+        if enabled_tool_names is not None and tool_name not in enabled_tool_names:
+            continue
+
+        description = _tool_fragment(tool_name, variant, "description.md")
+        example = _tool_fragment(tool_name, variant, "example.md")
+
+        if not description and not example:
+            continue
+
+        if description:
+            parts.append(description + "\n")
+        if example:
+            parts.append("\n" + example + "\n")
+        parts.append("\n")
+
+    task_description = read_prompt_md("tools/task/description.md")
+    task_example = read_prompt_md("tools/task/example.md")
+    if task_description:
+        parts.append(task_description + "\n")
+    if task_example:
+        parts.append("\n" + task_example + "\n")
+    parts.append("\n")
+
+    known_disabled = (
+        set(disabled_tool_names) & set(MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED)
+        if disabled_tool_names
+        else set()
+    )
+    if known_disabled:
+        disabled_list = ", ".join(
+            _format_tool_label(n)
+            for n in MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED
+            if n in known_disabled
+        )
+        parts.append(
+            "<disabled_tools>\n"
+            f"Disabled for this session: {disabled_list}.\n"
+            "Don't claim you can use them. If the user needs that capability,\n"
+            "delegate with `task` when a specialist covers it; otherwise say\n"
+            "the tool is disabled.\n"
+            "</disabled_tools>\n"
+        )
+
+    parts.append("</tools>\n")
+    return "".join(parts)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/__init__.py
new file mode 100644
index 0000000000..b82bad81fd
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/__init__.py
@@ -0,0 +1 @@
+"""Main-agent prompt fragments loaded by :mod:`...system_prompt.builder.load_md`."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/__init__.py
new file mode 100644
index 0000000000..ef2bdad343
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/__init__.py
@@ -0,0 +1 @@
+"""``<citations>`` block — ``on`` (cite chunk ids) and ``off`` (hard suppression)."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md
new file mode 100644
index 0000000000..42cb099a67
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md
@@ -0,0 +1,12 @@
+<citations>
+Citation markers are **disabled** in this configuration.
+
+Do NOT include `[citation:…]` markers anywhere, even if tool descriptions or
+examples reference them. Ignore citation-format reminders elsewhere in this
+prompt when they conflict with this block.
+
+1. Answer in plain prose. Optional markdown links to public URLs when
+   sources are URLs.
+2. Do not expose raw chunk ids, document ids, or internal ids to the user.
+3. Present KB or docs facts naturally without attribution markers.
+</citations>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
new file mode 100644
index 0000000000..b200f7a9a6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
@@ -0,0 +1,11 @@
+<citations>
+Apply chunk citations only when the runtime injects `<document>` /
+`<chunk id='…'>` blocks.
+
+1. For each factual statement taken from those chunks, add
+   `[citation:chunk_id]` using the exact id from `<chunk id='…'>`.
+2. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated).
+3. Never invent or normalise ids; if unsure, omit.
+4. Plain brackets only — no markdown links, no footnote numbering.
+5. If no chunk-tagged documents appear this turn, do not fabricate citations.
+</citations>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/core_behavior.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/core_behavior.md
new file mode 100644
index 0000000000..c7be91cde8
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/core_behavior.md
@@ -0,0 +1,13 @@
+<core_behavior>
+- Be concise and direct. No preamble ("Sure!", "Great question!", "I'll now…").
+- Don't narrate intent — just act. State the outcome, not the plan.
+- If the request is ambiguous, ask before acting. If asked *how* to do
+  something, explain first, then act.
+- Prioritise accuracy over agreement. Disagree respectfully when the user is
+  wrong; avoid unnecessary superlatives or emotional validation.
+- Persist until the task is done or you are genuinely blocked. Don't stop
+  partway and describe what you *would* do.
+- For longer work, give brief progress updates only when they add new
+  information (a discovery, a tradeoff, a blocker, the start of a non-trivial
+  step). Don't narrate routine reads.
+</core_behavior>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/__init__.py
new file mode 100644
index 0000000000..2410538724
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/__init__.py
@@ -0,0 +1 @@
+"""``<dynamic_context>`` block — private and team variants."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md
new file mode 100644
index 0000000000..71c86be40a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md
@@ -0,0 +1,27 @@
+<dynamic_context>
+The runtime inserts these system messages each turn. They are authoritative
+for *this* turn only.
+
+`<user_memory>` carries the durable personal context the user has accumulated
+across sessions — role, interests, preferences, projects, background,
+standing instructions. It also reports current character usage versus the
+hard limit so you can manage the budget. Treat it as background colour for
+your answer, not as the task itself.
+
+`<priority_documents>` lists the workspace documents most relevant to the
+latest user message, ranked by relevance score, with `[USER-MENTIONED]`
+flagged on anything the user explicitly referenced. When the task is about
+workspace content, read these first; matched passages inside each document
+are flagged via `<chunk_index>` so you can jump straight to them.
+
+`<workspace_tree>` shows the full `/documents/` folder and file layout. Use
+it to resolve paths the user describes in natural language ("my Q2 roadmap",
+"last week's meeting notes") into concrete document references before
+delegating to a specialist.
+
+`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
+by KB search (from `search_surfsense_docs`, or backing `<priority_documents>`).
+Each chunk carries a stable `id` attribute.
+
+If a block doesn't appear this turn, work from the conversation alone.
+</dynamic_context>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md
new file mode 100644
index 0000000000..592c2ed9c9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md
@@ -0,0 +1,27 @@
+<dynamic_context>
+The runtime inserts these system messages each turn. They are authoritative
+for *this* turn only.
+
+`<team_memory>` carries the durable shared context this team has built up —
+decisions, conventions, architecture notes, processes, key facts. It also
+reports current character usage versus the hard limit so you can manage the
+budget. Treat it as background colour for your answer, not as the task itself.
+
+`<priority_documents>` lists the workspace documents most relevant to the
+latest user message, ranked by relevance score, with `[USER-MENTIONED]`
+flagged on anything someone in the thread explicitly referenced. When the
+task is about workspace content, read these first; matched passages inside
+each document are flagged via `<chunk_index>` so you can jump straight to
+them.
+
+`<workspace_tree>` shows the full `/documents/` folder and file layout. Use
+it to resolve paths described in natural language ("the Q2 roadmap", "last
+week's planning notes") into concrete document references before delegating
+to a specialist.
+
+`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
+by KB search (from `search_surfsense_docs`, or backing `<priority_documents>`).
+Each chunk carries a stable `id` attribute.
+
+If a block doesn't appear this turn, work from the conversation alone.
+</dynamic_context>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/__init__.py
new file mode 100644
index 0000000000..1e7bf4f216
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/__init__.py
@@ -0,0 +1 @@
+"""``<agent_identity>`` block — private and team variants."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/private.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/private.md
new file mode 100644
index 0000000000..38d33cab08
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/private.md
@@ -0,0 +1,8 @@
+<agent_identity>
+You are **SurfSense's main agent**. Your job is to answer the user using their
+knowledge base, lightweight web research, persistent memory, and **specialist
+subagents** invoked via the `task` tool. You are an orchestrator — most
+non-trivial work belongs on a specialist.
+
+Today (UTC): {resolved_today}
+</agent_identity>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/team.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/team.md
new file mode 100644
index 0000000000..b2d1e169fd
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/identity/team.md
@@ -0,0 +1,11 @@
+<agent_identity>
+You are **SurfSense's main agent**. Your job is to answer the user using their
+shared team knowledge base, lightweight web research, persistent memory, and
+**specialist subagents** invoked via the `task` tool. You are an orchestrator
+— most non-trivial work belongs on a specialist.
+
+Today (UTC): {resolved_today}
+
+You are in a **team thread**. Each message is prefixed with `[DisplayName]`.
+Attribute quotes and decisions to the named author when relevant.
+</agent_identity>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/kb_first.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/kb_first.md
new file mode 100644
index 0000000000..f06a52c1da
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/kb_first.md
@@ -0,0 +1,19 @@
+<knowledge_base_first>
+CRITICAL — ground factual answers in what you actually receive this turn:
+- injected workspace context (see `<dynamic_context>`),
+- results from your own tool calls (`search_surfsense_docs`, `web_search`,
+  `scrape_webpage`),
+- or substantive summaries returned by a `task` specialist you invoked.
+
+Do **not** answer factual or informational questions from general knowledge
+unless the user explicitly authorises it after you say you couldn't find
+enough in those sources. The flow when nothing is found:
+
+1. Say you couldn't find enough in their workspace, docs, or tool output.
+2. Ask: *"Would you like me to answer from my general knowledge instead?"*
+3. Only answer from general knowledge after a clear yes.
+
+This rule does NOT apply to: casual conversation · meta-questions about
+SurfSense ("what can you do?") · formatting or analysis of content already
+in chat · clear rewrite/edit instructions · lightweight web research.
+</knowledge_base_first>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/__init__.py
new file mode 100644
index 0000000000..f172bc1b45
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/__init__.py
@@ -0,0 +1 @@
+"""``<memory_protocol>`` block — private and team variants."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/private.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/private.md
new file mode 100644
index 0000000000..4dd511014b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/private.md
@@ -0,0 +1,9 @@
+<memory_protocol>
+After understanding each user message, check: does it reveal durable facts
+about the user — role, interests, preferences, projects, background, or
+standing instructions?
+
+If yes, call `update_memory` **alongside** your normal response — don't
+defer it to a later turn. Skip ephemeral chat noise (one-off Q/A, greetings,
+session logistics). Stay within the budget shown in `<user_memory>`.
+</memory_protocol>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/team.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/team.md
new file mode 100644
index 0000000000..decd23c4d3
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/memory_protocol/team.md
@@ -0,0 +1,9 @@
+<memory_protocol>
+After understanding each user message, check: does it reveal durable facts
+about the team — decisions, conventions, architecture notes, processes, or
+key facts?
+
+If yes, call `update_memory` **alongside** your normal response — don't
+defer it to a later turn. Skip ephemeral chat noise (one-off Q/A, greetings,
+session logistics). Stay within the budget shown in `<team_memory>`.
+</memory_protocol>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/output_format.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/output_format.md
new file mode 100644
index 0000000000..46f492b169
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/output_format.md
@@ -0,0 +1,7 @@
+<output_format>
+- Mathematical formulas: **always** LaTeX. Never backtick code spans or
+  Unicode symbols for math.
+- Never expose internal tool parameter names, backend IDs, or
+  implementation details. Use natural, user-friendly language.
+- External sources: markdown links `[label](url)`, never bare URLs.
+</output_format>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/anthropic.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/anthropic.md
new file mode 100644
index 0000000000..89154c443d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/anthropic.md
@@ -0,0 +1,16 @@
+<provider_hints>
+You are running on an Anthropic Claude model (SurfSense **main agent**).
+
+Structured reasoning:
+- For non-trivial work, `<thinking>` / short `<plan>` before tool calls is fine.
+
+Professional objectivity:
+- Accuracy over flattery; verify with **search_surfsense_docs**, **web_search**, **scrape_webpage**, or **task** when unsure — don’t invent connector access.
+
+Task management:
+- For 3+ steps, use todo tooling; update statuses promptly.
+
+Tool calls:
+- Parallelise independent calls; sequence only when outputs chain.
+- Never pretend you can run connector-specific tools directly — route through **task** when needed.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/deepseek.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/deepseek.md
new file mode 100644
index 0000000000..4254e9ed57
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/deepseek.md
@@ -0,0 +1,18 @@
+<provider_hints>
+You are running on a DeepSeek model (SurfSense **main agent**).
+
+Reasoning hygiene (R1-aware):
+- Keep internal scratch separate from the user-facing answer; don’t leak chain-of-thought into tool arguments.
+
+Output style:
+- Concise; lead with the answer or the next action; avoid sycophantic openers.
+
+Attribution:
+- When citations are **enabled** and facts come from chunk-tagged context, follow the citation block above.
+- When citations are **disabled**, do not use `[citation:…]`.
+
+Tool calls:
+- Parallelise independent calls.
+- Prefer **search_surfsense_docs** for SurfSense docs/product questions before **web_search** when that fits the ask.
+- Don’t invent paths, chunk ids, or URLs — only values from tools or the user.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/default.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/default.md
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/default.md
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/google.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/google.md
new file mode 100644
index 0000000000..dc50735380
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/google.md
@@ -0,0 +1,18 @@
+<provider_hints>
+You are running on a Google Gemini model (SurfSense **main agent**).
+
+Output style:
+- Concise & direct. Fewer than ~3 lines of prose when the task allows (excluding tool output and code).
+- No filler openers/closers — move straight to the answer or the tool call.
+- GitHub-flavoured Markdown; monospace-friendly.
+
+Workflow (Understand → Plan → Act → Verify):
+1. **Understand:** parse the ask; use **search_surfsense_docs** / injected workspace context before guessing.
+2. **Plan:** for multi-step work, a short plan first.
+3. **Act:** only with tools you actually have on this agent (see `<tools>` and `<tool_routing>`). Connector work → **task**.
+4. **Verify:** re-read or re-search only when it materially reduces risk.
+
+Discipline:
+- Do not imply access to connectors, MCP tools, or deliverable generators except via **task**.
+- Pass paths to **task(knowledge_base, …)** only when you saw them in `<workspace_tree>` or `<priority_documents>`. Otherwise describe the document in natural language and let the subagent resolve it.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md
new file mode 100644
index 0000000000..3219e10d38
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md
@@ -0,0 +1,16 @@
+<provider_hints>
+You are running on an xAI Grok model (SurfSense **main agent**).
+
+Maximum terseness:
+- Fewer than 4 lines unless detail is requested; skip preamble/postamble.
+
+Tool discipline:
+- Typically one investigative tool per turn unless several independent read-only queries are clearly needed; don’t repeat identical calls.
+
+Attribution:
+- When citations are **enabled** (see citation block above) and you answer from chunk-tagged documents, use `[citation:chunk_id]` exactly as specified there.
+- When citations are **disabled**, never emit `[citation:…]` — plain prose and links per tool guidance.
+
+Style:
+- No emojis unless asked; flat lists for short answers.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/kimi.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/kimi.md
new file mode 100644
index 0000000000..3fe07d1808
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/kimi.md
@@ -0,0 +1,21 @@
+<provider_hints>
+You are running on a Moonshot Kimi model (Kimi-K1.5 / Kimi-K2 / Kimi-K2.5+), SurfSense **main agent**.
+
+Action bias:
+- Default to taking action with tools rather than describing solutions in prose. If a tool can answer the question, call the tool.
+- Don't narrate routine reads, searches, or obvious next steps. Combine related progress into one short status line.
+- Be thorough in actions (test what you build, verify what you change). Be brief in explanations.
+
+Tool calls:
+- Output multiple non-interfering tool calls in a SINGLE response — parallelism is a major efficiency win on this model.
+- When the `task` tool is available, delegate focused subtasks to a subagent with full context (subagents don't inherit yours).
+- Don't apologise or pre-announce tool calls. The tool call itself is self-explanatory.
+
+Language:
+- Respond in the SAME language as the user's most recent turn unless explicitly instructed otherwise.
+
+Discipline:
+- Stay on track. Never give the user more than what they asked for.
+- Fact-check with tools; don’t fabricate chunk ids or connector outcomes.
+- Keep it stupidly simple. Don't overcomplicate.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_classic.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_classic.md
new file mode 100644
index 0000000000..7ff3ec912d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_classic.md
@@ -0,0 +1,20 @@
+<provider_hints>
+You are running on a classic OpenAI chat model (GPT-4 family), SurfSense **main agent**.
+
+Persistence:
+- Finish the user’s request in the same turn when tools allow — don’t stop at intent only.
+- If a tool errors, fix arguments and retry once before giving up.
+
+Planning:
+- For 3+ steps, use the todo / planning tool; mark `in_progress` / `completed` promptly.
+- One short sentence before non-trivial tool use is fine.
+
+Output style:
+- Conversational but professional; bullets for findings; fenced code with language tags when needed.
+- Summarize tool output — don’t paste walls of text.
+
+Tool calls:
+- Parallelise independent calls in one turn.
+- Prefer **search_surfsense_docs** for SurfSense-product questions, **web_search** / **scrape_webpage**
+  for fresh public facts; integrations and heavy workflows → **task**.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md
new file mode 100644
index 0000000000..aad52f9952
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md
@@ -0,0 +1,13 @@
+<provider_hints>
+You are running on an OpenAI Codex-class model (SurfSense **main agent**).
+
+Output style:
+- Concise; don’t paste huge fetch blobs — summarize.
+- When citations are **enabled** and you rely on chunk-tagged docs, references may use `[citation:chunk_id]` per the citation block above; when **disabled**, use prose and URLs only.
+- Numbered lists work well when the user should reply with a single option index.
+- No emojis; single-level bullets.
+
+Tool calls:
+- Parallelise independent calls; chain only when required.
+- Don’t ask permission for obvious safe defaults — state what you did.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_reasoning.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_reasoning.md
new file mode 100644
index 0000000000..6c8a34087b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_reasoning.md
@@ -0,0 +1,22 @@
+<provider_hints>
+You are running on an OpenAI reasoning model (GPT-5+ / o-series), SurfSense **main agent**.
+
+Output style:
+- Be terse and direct. Don't restate the user's request before answering.
+- Don't begin with conversational openers ("Done!", "Got it", "Great question", "Sure thing"). Get to the answer or the action.
+- Match response complexity to the task: simple questions → one-line answer; substantial work → lead with the outcome, then context, then any next steps.
+- No nested bullets — keep lists flat (single level). For options the user can pick by replying with a number, use `1.` `2.` `3.`.
+- Use inline backticks for paths/commands/identifiers; fenced code blocks (with language tags) for multi-line snippets.
+
+Channels (for clients that support them):
+- `commentary` — short progress updates only when they add genuinely new information (a discovery, a tradeoff, a blocker, the start of a non-trivial step). Don't narrate routine reads or obvious next steps.
+- `final` — the completed response. Keep it self-contained; no "see above" / "see below" cross-references.
+
+Tool calls:
+- Parallelise independent tool calls in a single response (`multi_tool_use.parallel` where supported). Only sequence when a later call needs an earlier one's output.
+- Connector or integration execution belongs in **task**, not invented main-agent tools.
+- Don't ask permission ("Should I proceed?", "Do you want me to…?"). Pick the most reasonable default, do it, and state what you did.
+
+Autonomy:
+- Persist until the task is fully resolved within the current turn whenever feasible — within tools you actually have; delegate the rest via **task**.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/refusal_and_limits.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/refusal_and_limits.md
new file mode 100644
index 0000000000..7a88d34783
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/refusal_and_limits.md
@@ -0,0 +1,12 @@
+<refusal_and_limits>
+- If a capability is not in `<tools>` and no entry in `<specialists>` covers
+  it, say so plainly and ask whether the user wants to proceed differently.
+  Don't pretend you can do it.
+- If a `task` call errors or the specialist is unavailable, surface that to
+  the user with a clear next step. Don't silently retry forever.
+- Disabled tools announced by the runtime are off-limits even if documented
+  elsewhere — say so and offer a `task` alternative if one exists.
+- Never claim filesystem access, connector access, or persistent storage you
+  don't have. The four direct tools and the `<specialists>` list are your
+  entire surface area.
+</refusal_and_limits>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/reminder.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/reminder.md
new file mode 100644
index 0000000000..b7ff348a65
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/reminder.md
@@ -0,0 +1,4 @@
+<reminder>
+Concise · KB-grounded · delegation-first · one `task` per turn · no direct
+filesystem · persist memory when durable facts appear.
+</reminder>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/routing.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/routing.md
new file mode 100644
index 0000000000..4e27381d32
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/routing.md
@@ -0,0 +1,96 @@
+<routing>
+You have two execution channels. Pick the one that owns the work — never
+simulate one with the other.
+
+### 1. Direct tools (you call them yourself)
+- `search_surfsense_docs` — SurfSense product docs (setup, configuration,
+  connector docs, feature behavior).
+- `web_search` — search the public web (anything outside SurfSense docs and
+  the workspace KB).
+- `scrape_webpage` — fetch the body of a specific public URL.
+- `update_memory` — curate persistent memory (see `<memory_protocol>`).
+- `write_todos` — maintain a structured plan when the turn series spans
+  multiple specialists or steps. Mark each item
+  `in_progress` **before** the `task` call that handles it, `completed`
+  once the call returns. Skip for single-step requests.
+
+**You have NO filesystem tools.** Any read, write, edit, move, rename, or
+search inside the user's workspace goes through `task(knowledge_base, …)` —
+never via `write_file`, `ls`, or any direct file operation.
+
+### 2. `task(<specialist>, …)` — specialist subagents
+Use `task` for anything beyond the direct tools above. See
+`<specialists>` for the live roster.
+
+Rules for `task`:
+- **One specialist per `task` call.** A single `task` invocation targets
+  exactly one specialist; that specialist only has tools for its own
+  domain, so any work outside that domain in the same prompt won't run.
+- **Parallelise independent specialist work.** When a turn needs multiple
+  `task` calls whose work doesn't depend on each other's results (e.g.
+  "create a ClickUp ticket AND a Linear ticket"), emit them as parallel
+  `task` calls. Two `task` calls are independent when:
+    - Neither's prompt references the other's output, and
+    - They target different specialists, OR the same specialist with
+      non-overlapping scopes (e.g. reading two unrelated paths).
+- **Serialise dependent work across turns.** If one specialist's output
+  must inform another's input (e.g. "find the roadmap in my KB, then
+  email it to Maya"), invoke them on consecutive turns — first finishes,
+  then you call the second with the first's result baked into its prompt.
+  Use `write_todos` to keep the plan alive across those turns.
+- Within a single specialist, bundle every related step into the same task
+  prompt (read + write + summary go together).
+- Put the **full instructions inside the task prompt** — the specialist
+  cannot see this thread.
+- Don't claim to already know what a specialist's source contains; invoke
+  the specialist and use what it returns.
+
+<example>
+user: "Save these meeting notes to my KB: …"
+→ task(knowledge_base, "Save the meeting notes below to a new document
+  under /documents/notes/. Pick a sensible title and folder; tell me the
+  path you used.\n\n<notes>…</notes>")
+</example>
+
+<example>
+user: "What did Maya say about the Q2 roadmap in Slack last week?"
+→ task(slack, "Find messages from Maya about the Q2 roadmap from the past
+  week. Return the most relevant quotes with channel and timestamp.")
+</example>
+
+<example>
+user: "What's the current USD/INR rate?"
+→ web_search(query="current USD to INR exchange rate")
+</example>
+
+<example>
+user: "Find my Q2 roadmap and summarise the milestones."
+→ task(knowledge_base, "Locate the Q2 roadmap document under /documents
+  and summarise its milestones. Use glob or grep if the path isn't
+  obvious from the workspace tree.")
+</example>
+
+<example>
+user: "Create a ClickUp ticket and a Linear ticket for the new feature flag."
+→ Independent work — call both specialists in parallel:
+    write_todos([
+      {content: "Create ClickUp ticket for feature flag rollout", status: "in_progress"},
+      {content: "Create Linear ticket for feature flag rollout",  status: "in_progress"},
+    ])
+    task(clickup, "Create a ClickUp ticket titled 'Feature flag rollout'
+      in the default list. Description: <…>. Tell me the ticket URL.")
+    task(linear, "Create a Linear ticket titled 'Feature flag rollout'
+      in the default team. Description: <…>. Tell me the ticket URL.")
+</example>
+
+<example>
+user: "Find my Q2 roadmap doc in the KB and email a summary to Maya."
+→ The email body depends on the doc's contents — serialise across turns.
+  This turn:
+    task(knowledge_base, "Find the Q2 roadmap document under /documents
+      and return its full text plus a 3-bullet summary.")
+  Next turn (with the returned summary in hand):
+    task(gmail, "Send an email to Maya with subject 'Q2 roadmap summary'
+      and the following body: <summary returned by knowledge_base>.")
+</example>
+</routing>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/__init__.py
new file mode 100644
index 0000000000..100dccf66f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/__init__.py
@@ -0,0 +1 @@
+"""``<tools>`` block — one vertical-slice subfolder per direct main-agent tool."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/__init__.py
new file mode 100644
index 0000000000..a101e7e1c1
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/__init__.py
@@ -0,0 +1 @@
+"""``scrape_webpage`` — description + few-shot examples."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/description.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/description.md
new file mode 100644
index 0000000000..d8f7313597
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/description.md
@@ -0,0 +1,11 @@
+- `scrape_webpage` — Fetch and extract readable content from a single URL.
+  - Use when the user wants the actual page body (article, table, dashboard
+    snapshot), not just search snippets.
+  - Try the tool when a URL is given or referenced; don't refuse without
+    attempting unless the URL is clearly unsafe or invalid.
+  - Public web only. For URLs behind a connector (Notion pages, Linear
+    issues, Confluence, anything that needs auth), use `task` with the
+    matching specialist instead.
+  - Args: `url`, `max_length` (default 50000).
+  - Returns title, metadata, and markdown-ish body. Summarise clearly and
+    link back with `[label](url)`.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/example.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/example.md
new file mode 100644
index 0000000000..977d40b6d1
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/scrape_webpage/example.md
@@ -0,0 +1,24 @@
+<example>
+user: "Check out https://dev.to/some-article"
+→ scrape_webpage(url="https://dev.to/some-article")
+(Respond with a structured analysis — key points, takeaways.)
+</example>
+
+<example>
+user: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
+→ scrape_webpage(url="https://example.com/blog/ai-trends")
+(Thorough summary using headings and bullets.)
+</example>
+
+<example>
+user: (after discussing https://example.com/stats) "Can you get the live data from that page?"
+→ scrape_webpage(url="https://example.com/stats")
+(Always attempt scraping first. Never refuse before trying.)
+</example>
+
+<example>
+user: "https://example.com/blog/weekend-recipes"
+→ scrape_webpage(url="https://example.com/blog/weekend-recipes")
+(When a user sends just a URL with no instructions, scrape it and provide
+a concise summary.)
+</example>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/__init__.py
new file mode 100644
index 0000000000..c2cda318e2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/__init__.py
@@ -0,0 +1 @@
+"""``search_surfsense_docs`` — description + few-shot examples."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/description.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/description.md
new file mode 100644
index 0000000000..256d3f3a46
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/description.md
@@ -0,0 +1,10 @@
+- `search_surfsense_docs` — Search official SurfSense documentation (product
+  help).
+  - Use when the user asks how SurfSense itself works — setup, configuration,
+    connector documentation, feature behavior, anything covered in the
+    product docs.
+  - Not a substitute for `task` when the user wants actions inside a
+    connected service (Gmail, Slack, Jira, Notion, etc.).
+  - Args: `query`, `top_k` (default 10).
+  - Returns doc excerpts; chunk ids may appear for attribution — see
+    `<citations>` for the contract.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/example.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/example.md
new file mode 100644
index 0000000000..d53ad8c916
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/search_surfsense_docs/example.md
@@ -0,0 +1,15 @@
+<example>
+user: "How do I install SurfSense?"
+→ search_surfsense_docs(query="installation setup")
+</example>
+
+<example>
+user: "What connectors does SurfSense support?"
+→ search_surfsense_docs(query="available connectors integrations")
+</example>
+
+<example>
+user: "How do I set up the Notion connector?"
+→ search_surfsense_docs(query="Notion connector setup configuration")
+(Changing data inside Notion itself → `task(notion, …)`, not this tool.)
+</example>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/__init__.py
new file mode 100644
index 0000000000..5eb371b75f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/__init__.py
@@ -0,0 +1 @@
+"""``task`` — description + few-shot examples for the specialist-delegation tool."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/description.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/description.md
new file mode 100644
index 0000000000..2f47d4df10
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/description.md
@@ -0,0 +1,15 @@
+- `task` — Invoke a specialist subagent.
+  - Specialists own workspace knowledge-base operations and connected
+    third-party services (Slack, Notion, Jira, Gmail, etc.). See
+    `<specialists>` for the live roster.
+  - Each subagent runs in isolation with its own tool stack and context,
+    and returns a single synthesized result.
+  - Args:
+    - `subagent_type` — name of the specialist to invoke (must match an
+      entry in `<specialists>`).
+    - `description` — the FULL task prompt. The specialist cannot see this
+      thread, so include all context and constraints, plus what you need
+      back. The specialist will respond in its own format — don't dictate
+      one.
+  - Routing rules (when to call, how often, how to scope) live in
+    `<routing>`.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/example.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/example.md
new file mode 100644
index 0000000000..87e5e1b6d7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/example.md
@@ -0,0 +1,20 @@
+<example>
+user: "Save these meeting notes to my KB: …"
+→ task(subagent_type="knowledge_base", description="Save the notes below to
+  a new document under /documents/notes/. Pick a sensible title and folder;
+  tell me the path you used.\n\n<notes>…</notes>")
+</example>
+
+<example>
+user: "What did Maya say about the Q2 roadmap in Slack last week?"
+→ task(subagent_type="slack", description="Find messages from Maya about
+  the Q2 roadmap from the past week. Return the most relevant quotes with
+  channel and timestamp.")
+</example>
+
+<example>
+user: "Find my Q2 roadmap and summarise the milestones."
+→ task(subagent_type="knowledge_base", description="Locate the Q2 roadmap
+  document under /documents and summarise its milestones. Use glob or grep
+  if the path isn't obvious from the workspace tree.")
+</example>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/__init__.py
new file mode 100644
index 0000000000..21f06de5be
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/__init__.py
@@ -0,0 +1 @@
+"""``update_memory`` — private and team visibility variants."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/__init__.py
new file mode 100644
index 0000000000..c25dc6336a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/__init__.py
@@ -0,0 +1 @@
+"""``update_memory`` (private variant) — description + few-shot examples."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/description.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/description.md
new file mode 100644
index 0000000000..e7fa842b1f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/description.md
@@ -0,0 +1,15 @@
+- `update_memory` — Curate the **personal** long-term memory document for
+  this user.
+  - The current memory (if any) appears in `<user_memory>` with usage vs limit.
+  - Call when the user asks to remember or forget something, or shares
+    durable facts, preferences, or instructions.
+  - Use the first name from `<user_name>` when writing entries — write
+    "Alex prefers…" not "The user prefers…". Don't store the name alone as a
+    memory entry.
+  - Skip ephemeral chat noise (one-off Q/A, greetings, session logistics).
+  - Args: `updated_memory` — FULL replacement markdown (merge and curate,
+    don't only append).
+  - Formatting: bullets `- (YYYY-MM-DD) [marker] text` with markers `[fact]`,
+    `[pref]`, `[instr]` (priority when trimming: `instr > pref > fact`).
+    Group bullets under short `##` headings; stay under the limit shown in
+    `<user_memory>`.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/example.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/example.md
new file mode 100644
index 0000000000..2505bdf87d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/private/example.md
@@ -0,0 +1,28 @@
+<example>
+<user_name>Alex</user_name>, <user_memory> is empty.
+user: "I'm a space enthusiast, explain astrophage to me"
+→ update_memory(updated_memory="## Interests & background\n- (2025-03-15) [fact] Alex is a space enthusiast\n")
+(Casual durable fact; use first name, neutral heading.)
+</example>
+
+<example>
+user: "Remember that I prefer concise answers over detailed explanations"
+→ update_memory(updated_memory="## Interests & background\n- (2025-03-15) [fact] Alex is a space enthusiast\n\n## Response style\n- (2025-03-15) [pref] Alex prefers concise answers over detailed explanations\n")
+(Durable preference; merge with existing memory.)
+</example>
+
+<example>
+user: "I actually moved to Tokyo last month"
+→ update_memory(updated_memory="...\n\n## Personal context\n- (2025-03-15) [fact] Alex lives in Tokyo (previously London)\n...")
+(Updated fact; date reflects when recorded.)
+</example>
+
+<example>
+user: "I'm a freelance photographer working on a nature documentary"
+→ update_memory(updated_memory="...\n\n## Current focus\n- (2025-03-15) [fact] Alex is a freelance photographer\n- (2025-03-15) [fact] Alex is working on a nature documentary\n")
+</example>
+
+<example>
+user: "Always respond in bullet points"
+→ update_memory(updated_memory="...\n\n## Response style\n- (2025-03-15) [instr] Always respond to Alex in bullet points\n")
+</example>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/__init__.py
new file mode 100644
index 0000000000..85fda97dec
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/__init__.py
@@ -0,0 +1 @@
+"""``update_memory`` (team variant) — description + few-shot examples."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/description.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/description.md
new file mode 100644
index 0000000000..13341a9100
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/description.md
@@ -0,0 +1,16 @@
+- `update_memory` — Curate the team's **shared** long-term memory document
+  for this search space.
+  - The current memory (if any) appears in `<team_memory>` with usage vs limit.
+  - Call when a team member asks to remember or forget something, or when
+    the conversation surfaces durable team decisions, conventions,
+    architecture notes, processes, or key facts.
+  - NEVER store personal memory in team memory (individual bios, personal
+    preferences, user-only standing instructions).
+  - Skip ephemeral chat noise (one-off Q/A, greetings, session logistics).
+  - Args: `updated_memory` — FULL replacement markdown (merge and curate,
+    don't only append).
+  - Formatting: bullets `- (YYYY-MM-DD) [fact] text`. Team memory uses ONLY
+    the `[fact]` marker (never `[pref]` or `[instr]`). Group bullets under
+    short `##` headings (2-3 words each); stay under the limit shown in
+    `<team_memory>`. When trimming, prioritise: decisions/conventions > key
+    facts > current priorities.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/example.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/example.md
new file mode 100644
index 0000000000..8bd8fcfe41
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/update_memory/team/example.md
@@ -0,0 +1,9 @@
+<example>
+user: "Let's remember that we decided to do weekly standup meetings on Mondays"
+→ update_memory(updated_memory="...\n\n## Team rituals\n- (2025-03-15) [fact] Weekly standup meetings on Mondays\n...")
+</example>
+
+<example>
+user: "Our office is in downtown Seattle, 5th floor"
+→ update_memory(updated_memory="...\n\n## Workspace\n- (2025-03-15) [fact] Office location: downtown Seattle, 5th floor\n...")
+</example>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/__init__.py
new file mode 100644
index 0000000000..95e4549b9b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/__init__.py
@@ -0,0 +1 @@
+"""``web_search`` — description + few-shot examples."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/description.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/description.md
new file mode 100644
index 0000000000..df15a62848
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/description.md
@@ -0,0 +1,10 @@
+- `web_search` — Search the public web.
+  - Use whenever an answer benefits from external sources — current events,
+    prices, weather, news, technical references, definitions, background
+    facts, anything outside SurfSense docs and the workspace KB. Reach for
+    it whenever freshness matters or you'd otherwise guess from memory.
+  - Don't refuse with "I lack network access" — call the tool.
+  - If results are thin, say so and offer to refine the query.
+  - Args: `query`, `top_k` (default 10, max 50).
+  - Follow up with `scrape_webpage` on the best URL when snippets are too
+    shallow. Present sources with `[label](url)` markdown links.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/example.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/example.md
new file mode 100644
index 0000000000..04f9e899c1
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/example.md
@@ -0,0 +1,15 @@
+<example>
+user: "What's the current USD to INR exchange rate?"
+→ web_search(query="current USD to INR exchange rate")
+(Answer from snippets; scrape a top URL if needed.)
+</example>
+
+<example>
+user: "What's the latest news about AI?"
+→ web_search(query="latest AI news today")
+</example>
+
+<example>
+user: "What's the weather in New York?"
+→ web_search(query="weather New York today")
+</example>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/tools/__init__.py
new file mode 100644
index 0000000000..80e86e5c8f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/tools/__init__.py
@@ -0,0 +1,10 @@
+"""Main-agent SurfSense tool allowlist."""
+
+from __future__ import annotations
+
+from .index import (
+    MAIN_AGENT_SURFSENSE_TOOL_NAMES,
+    MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED,
+)
+
+__all__ = ["MAIN_AGENT_SURFSENSE_TOOL_NAMES", "MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/tools/index.py
new file mode 100644
index 0000000000..5d309261cf
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/tools/index.py
@@ -0,0 +1,17 @@
+"""Main-agent SurfSense builtin tool names (not full ``new_chat``).
+
+Connector integrations, MCP, deliverables, etc. are delegated via ``task`` subagents.
+"""
+
+from __future__ import annotations
+
+MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED: tuple[str, ...] = (
+    "search_surfsense_docs",
+    "web_search",
+    "scrape_webpage",
+    "update_memory",
+)
+
+MAIN_AGENT_SURFSENSE_TOOL_NAMES: frozenset[str] = frozenset(
+    MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED,
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/action_log.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/action_log.py
new file mode 100644
index 0000000000..c9f893d974
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/action_log.py
@@ -0,0 +1,36 @@
+"""Audit row per tool call (reversibility metadata)."""
+
+from __future__ import annotations
+
+import logging
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import ActionLogMiddleware
+from app.agents.new_chat.tools.registry import BUILTIN_TOOLS
+
+from ..shared.flags import enabled
+
+
+def build_action_log_mw(
+    *,
+    flags: AgentFeatureFlags,
+    thread_id: int | None,
+    search_space_id: int,
+    user_id: str | None,
+) -> ActionLogMiddleware | None:
+    if not enabled(flags, "enable_action_log") or thread_id is None:
+        return None
+    try:
+        tool_defs_by_name = {td.name: td for td in BUILTIN_TOOLS}
+        return ActionLogMiddleware(
+            thread_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            tool_definitions=tool_defs_by_name,
+        )
+    except Exception:  # pragma: no cover - defensive
+        logging.warning(
+            "ActionLogMiddleware init failed; running without it.",
+            exc_info=True,
+        )
+        return None
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/anonymous_doc.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/anonymous_doc.py
new file mode 100644
index 0000000000..afd54a2d38
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/anonymous_doc.py
@@ -0,0 +1,16 @@
+"""Anonymous document hydration from Redis (cloud only)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware import AnonymousDocumentMiddleware
+
+
+def build_anonymous_doc_mw(
+    *,
+    filesystem_mode: FilesystemMode,
+    anon_session_id: str | None,
+) -> AnonymousDocumentMiddleware | None:
+    if filesystem_mode != FilesystemMode.CLOUD:
+        return None
+    return AnonymousDocumentMiddleware(anon_session_id=anon_session_id)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/busy_mutex.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/busy_mutex.py
new file mode 100644
index 0000000000..0ea53bf169
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/busy_mutex.py
@@ -0,0 +1,12 @@
+"""Per-thread cooperative lock around the whole turn."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import BusyMutexMiddleware
+
+from ..shared.flags import enabled
+
+
+def build_busy_mutex_mw(flags: AgentFeatureFlags) -> BusyMutexMiddleware | None:
+    return BusyMutexMiddleware() if enabled(flags, "enable_busy_mutex") else None
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/__init__.py
new file mode 100644
index 0000000000..f370a71c76
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/__init__.py
@@ -0,0 +1,32 @@
+"""SubAgent ``task`` tool wiring required for HITL inside subagents.
+
+Replaces upstream ``SubAgentMiddleware`` to:
+
+- share the parent's checkpointer with each subagent,
+- forward ``runtime.config`` (thread_id, recursion_limit, …) into nested invokes,
+- isolate each parallel ``task`` call in its own checkpoint slot via
+  per-call ``thread_id`` namespacing,
+- bridge ``Command(resume=...)`` from the parent into the subagent via the
+  ``config["configurable"]["surfsense_resume_value"]`` side-channel, keyed by
+  ``tool_call_id`` so parallel siblings never race on a shared scalar,
+- target the resume at the captured interrupt id so a follow-up
+  ``HumanInTheLoopMiddleware.after_model`` does not consume the same payload,
+- stamp each subagent's pending interrupt with the parent's ``tool_call_id``
+  so ``stream_resume_chat`` can route a flat ``decisions`` list back to the
+  right paused subagent.
+
+Module layout
+-------------
+
+- ``constants``     — shared keys / limits.
+- ``config``        — RunnableConfig + side-channel resume read + per-call ``thread_id``.
+- ``resume``        — pending-interrupt detection, fan-out, ``Command(resume=...)`` builder.
+- ``propagation``   — ``wrap_with_tool_call_id`` helper for stamping interrupt values.
+- ``resume_routing``— slice a flat decisions list to per-``tool_call_id`` payloads.
+- ``task_tool``     — the ``task`` tool factory (sync + async), and the catch-and-stamp chokepoint.
+- ``middleware``    — :class:`SurfSenseCheckpointedSubAgentMiddleware` itself.
+"""
+
+from .middleware import SurfSenseCheckpointedSubAgentMiddleware
+
+__all__ = ["SurfSenseCheckpointedSubAgentMiddleware"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/config.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/config.py
new file mode 100644
index 0000000000..ad5b586074
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/config.py
@@ -0,0 +1,125 @@
+"""RunnableConfig wiring for nested subagent invocations.
+
+Forwards the parent's ``runtime.config`` (thread_id, …) into the subagent and
+exposes the side-channel ``stream_resume_chat`` uses to ferry resume payloads.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from langchain.tools import ToolRuntime
+
+from .constants import DEFAULT_SUBAGENT_RECURSION_LIMIT
+
+logger = logging.getLogger(__name__)
+
+# langgraph stores the parent task's scratchpad under this configurable key;
+# subagents inherit the chain via ``parent_scratchpad`` fallback.
+_LANGGRAPH_SCRATCHPAD_KEY = "__pregel_scratchpad"
+
+
+def subagent_invoke_config(runtime: ToolRuntime) -> dict[str, Any]:
+    """RunnableConfig for the nested invoke; raises ``recursion_limit`` and isolates ``thread_id``.
+
+    Each parallel subagent invocation lands in its own checkpoint slot keyed
+    by an extended ``thread_id`` of the form ``{parent_thread}::task:{tool_call_id}``.
+    The same call across the resume cycle keeps reading from the same snapshot
+    (``tool_call_id`` is stable per LLM-emitted call).
+
+    We namespace via ``thread_id`` rather than ``checkpoint_ns`` because
+    langgraph's ``aget_state`` interprets a non-empty ``checkpoint_ns`` as a
+    subgraph path and raises ``ValueError("Subgraph X not found")``.
+    """
+    merged: dict[str, Any] = dict(runtime.config) if runtime.config else {}
+    current_limit = merged.get("recursion_limit")
+    try:
+        current_int = int(current_limit) if current_limit is not None else 0
+    except (TypeError, ValueError):
+        current_int = 0
+    if current_int < DEFAULT_SUBAGENT_RECURSION_LIMIT:
+        merged["recursion_limit"] = DEFAULT_SUBAGENT_RECURSION_LIMIT
+
+    configurable: dict[str, Any] = dict(merged.get("configurable") or {})
+    parent_thread_id = configurable.get("thread_id")
+    per_call_suffix = f"task:{runtime.tool_call_id}"
+    configurable["thread_id"] = (
+        f"{parent_thread_id}::{per_call_suffix}"
+        if parent_thread_id
+        else per_call_suffix
+    )
+    merged["configurable"] = configurable
+    return merged
+
+
+def consume_surfsense_resume(runtime: ToolRuntime) -> Any:
+    """Pop the resume payload for *this* call's ``tool_call_id``.
+
+    The configurable holds ``surfsense_resume_value: dict[tool_call_id, payload]``
+    so parallel sibling subagents (each with their own ``tool_call_id``) read
+    only their own decision and never race on a shared scalar.
+    """
+    cfg = runtime.config or {}
+    configurable = cfg.get("configurable") if isinstance(cfg, dict) else None
+    if not isinstance(configurable, dict):
+        return None
+    by_tcid = configurable.get("surfsense_resume_value")
+    if not isinstance(by_tcid, dict):
+        return None
+    payload = by_tcid.pop(runtime.tool_call_id, None)
+    if not by_tcid:
+        configurable.pop("surfsense_resume_value", None)
+    return payload
+
+
+def has_surfsense_resume(runtime: ToolRuntime) -> bool:
+    """True iff a resume payload for this call's ``tool_call_id`` is queued (non-destructive)."""
+    cfg = runtime.config or {}
+    configurable = cfg.get("configurable") if isinstance(cfg, dict) else None
+    if not isinstance(configurable, dict):
+        return False
+    by_tcid = configurable.get("surfsense_resume_value")
+    if not isinstance(by_tcid, dict):
+        return False
+    return runtime.tool_call_id in by_tcid
+
+
+def drain_parent_null_resume(runtime: ToolRuntime) -> None:
+    """Consume the parent's lingering ``NULL_TASK_ID/RESUME`` write before delegating.
+
+    ``stream_resume_chat`` wakes the main agent with
+    ``Command(resume={tool_call_id: {"decisions": [...]}})`` so the previously
+    propagated parent-level interrupt can return. langgraph stores that
+    payload as the parent task's ``null_resume`` pending write. The ``task``
+    tool then forwards this turn's slice into the subagent via its own
+    ``Command(resume=...)``. While the subagent is mid-execution, any *new*
+    ``interrupt()`` inside it (e.g. a follow-up tool call after a mixed
+    approve/reject) walks ``subagent_scratchpad → parent_scratchpad.get_null_resume``
+    and picks up the parent's still-live decisions — mismatching against a
+    different number of hanging tool calls and crashing
+    ``HumanInTheLoopMiddleware``.
+
+    Draining the write here closes that cross-graph leak so subagent
+    interrupts pause cleanly and bubble back up as a fresh approval card.
+    """
+    cfg = runtime.config or {}
+    configurable = cfg.get("configurable") if isinstance(cfg, dict) else None
+    if not isinstance(configurable, dict):
+        return
+    scratchpad = configurable.get(_LANGGRAPH_SCRATCHPAD_KEY)
+    if scratchpad is None:
+        return
+    consume = getattr(scratchpad, "get_null_resume", None)
+    if not callable(consume):
+        return
+    try:
+        consume(True)
+    except Exception:
+        # Defensive: if langgraph's internal scratchpad shape changes we don't
+        # want to break the resume path. Worst case the original ValueError
+        # still surfaces — same behavior as before this fix.
+        logger.debug(
+            "drain_parent_null_resume: scratchpad.get_null_resume raised",
+            exc_info=True,
+        )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/constants.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/constants.py
new file mode 100644
index 0000000000..6c4519f3aa
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/constants.py
@@ -0,0 +1,18 @@
+"""Constants shared by the checkpointed subagent middleware."""
+
+from __future__ import annotations
+
+# Mirror of deepagents.middleware.subagents._EXCLUDED_STATE_KEYS.
+EXCLUDED_STATE_KEYS = frozenset(
+    {
+        "messages",
+        "todos",
+        "structured_response",
+        "skills_metadata",
+        "memory_contents",
+    }
+)
+
+# Match the parent graph's budget; the LangGraph default of 25 trips on
+# multi-step subagent runs.
+DEFAULT_SUBAGENT_RECURSION_LIMIT = 10_000
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/middleware.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/middleware.py
new file mode 100644
index 0000000000..8f51ffed7e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/middleware.py
@@ -0,0 +1,98 @@
+"""SubAgent middleware that compiles each subagent against the parent checkpointer."""
+
+from __future__ import annotations
+
+from typing import Any, cast
+
+from deepagents.backends.protocol import BackendFactory, BackendProtocol
+from deepagents.middleware.subagents import (
+    TASK_SYSTEM_PROMPT,
+    CompiledSubAgent,
+    SubAgent,
+    SubAgentMiddleware,
+)
+from langchain.agents import create_agent
+from langchain.chat_models import init_chat_model
+from langgraph.types import Checkpointer
+
+from .task_tool import build_task_tool_with_parent_config
+
+
+class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware):
+    """``SubAgentMiddleware`` variant that compiles each subagent against the parent checkpointer."""
+
+    def __init__(
+        self,
+        *,
+        checkpointer: Checkpointer,
+        backend: BackendProtocol | BackendFactory,
+        subagents: list[SubAgent | CompiledSubAgent],
+        system_prompt: str | None = TASK_SYSTEM_PROMPT,
+        task_description: str | None = None,
+    ) -> None:
+        self._surf_checkpointer = checkpointer
+        super(SubAgentMiddleware, self).__init__()
+        if not subagents:
+            raise ValueError(
+                "At least one subagent must be specified when using the new API"
+            )
+        self._backend = backend
+        self._subagents = subagents
+        subagent_specs = self._surf_compile_subagent_graphs()
+        task_tool = build_task_tool_with_parent_config(subagent_specs, task_description)
+        if system_prompt and subagent_specs:
+            agents_desc = "\n".join(
+                f"- {s['name']}: {s['description']}" for s in subagent_specs
+            )
+            self.system_prompt = (
+                system_prompt + "\n\nAvailable subagent types:\n" + agents_desc
+            )
+        else:
+            self.system_prompt = system_prompt
+        self.tools = [task_tool]
+
+    def _surf_compile_subagent_graphs(self) -> list[dict[str, Any]]:
+        """Mirror of ``SubAgentMiddleware._get_subagents`` that threads the parent checkpointer."""
+        specs: list[dict[str, Any]] = []
+
+        for spec in self._subagents:
+            if "runnable" in spec:
+                compiled = cast(CompiledSubAgent, spec)
+                specs.append(
+                    {
+                        "name": compiled["name"],
+                        "description": compiled["description"],
+                        "runnable": compiled["runnable"],
+                    }
+                )
+                continue
+
+            if "model" not in spec:
+                msg = f"SubAgent '{spec['name']}' must specify 'model'"
+                raise ValueError(msg)
+            if "tools" not in spec:
+                msg = f"SubAgent '{spec['name']}' must specify 'tools'"
+                raise ValueError(msg)
+
+            model = spec["model"]
+            if isinstance(model, str):
+                model = init_chat_model(model)
+
+            middleware: list[Any] = list(spec.get("middleware", []))
+
+            specs.append(
+                {
+                    "name": spec["name"],
+                    "description": spec["description"],
+                    "runnable": create_agent(
+                        model,
+                        system_prompt=spec["system_prompt"],
+                        tools=spec["tools"],
+                        middleware=middleware,
+                        name=spec["name"],
+                        checkpointer=self._surf_checkpointer,
+                    ),
+                }
+            )
+
+        return specs
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/propagation.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/propagation.py
new file mode 100644
index 0000000000..cfebe1fd94
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/propagation.py
@@ -0,0 +1,38 @@
+"""Stamp the parent's ``tool_call_id`` onto a subagent's pending interrupt value.
+
+When a subagent (compiled as a langgraph subgraph and invoked from a parent
+tool node) hits an ``interrupt(...)`` from its HITL middleware, langgraph
+raises ``GraphInterrupt`` out of ``subagent.[a]invoke(...)``. The parent's
+``task`` tool catches that exception, stamps ``tool_call_id`` onto each
+``Interrupt.value`` using :func:`wrap_with_tool_call_id`, and re-raises a
+fresh ``GraphInterrupt`` whose values carry that stamp.
+
+``stream_resume_chat`` then reads ``parent.state.interrupts[*].value["tool_call_id"]``
+to route a flat ``decisions`` list back to the right paused subagent — without
+the stamp, parallel HITL across siblings would collapse into an ambiguous
+bucket and resume would fail.
+
+This module hosts only the stamping helper; the catch/re-raise lives in
+``task_tool.py`` since that's the single chokepoint where the raw exception
+is in our hands.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def wrap_with_tool_call_id(value: Any, tool_call_id: str) -> dict[str, Any]:
+    """Return a value dict that always carries the parent's ``tool_call_id``.
+
+    Dict values are shallow-copied with ``tool_call_id`` stamped on top, so
+    any value the subagent may already carry under that key (from a deeper
+    HITL level) is overwritten — the parent's call id is the only one
+    ``stream_resume_chat`` correlates against.
+
+    Non-dict values are wrapped as ``{"value": <original>, "tool_call_id": ...}``
+    so simple ``interrupt("approve?")`` patterns still propagate cleanly.
+    """
+    if isinstance(value, dict):
+        return {**value, "tool_call_id": tool_call_id}
+    return {"value": value, "tool_call_id": tool_call_id}
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/resume.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/resume.py
new file mode 100644
index 0000000000..0bb477b6b6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/resume.py
@@ -0,0 +1,76 @@
+"""Resume-payload shaping and pending-interrupt detection for subagents.
+
+Splits the work of "given a state snapshot and a parent-stashed resume value,
+produce the right ``Command(resume=...)`` for the subagent" into pure helpers.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langgraph.types import Command
+
+
+def hitlrequest_action_count(pending_value: Any) -> int:
+    """Bundle size for a LangChain ``HITLRequest`` payload; ``0`` for non-bundle interrupts."""
+    if not isinstance(pending_value, dict):
+        return 0
+    actions = pending_value.get("action_requests")
+    if isinstance(actions, list):
+        return len(actions)
+    return 0
+
+
+def fan_out_decisions_to_match(resume_value: Any, expected_count: int) -> Any:
+    """Legacy fallback: pad a 1-decision resume to N for an ``action_requests=N`` bundle.
+
+    Modern frontend submits N decisions per bundle (one per action_request) so
+    this is a no-op; kept for backwards compatibility with old in-flight
+    threads or non-bundle clients that send a single decision.
+    """
+    if expected_count <= 1:
+        return resume_value
+    if not isinstance(resume_value, dict):
+        return resume_value
+    decisions = resume_value.get("decisions")
+    if not isinstance(decisions, list) or len(decisions) >= expected_count:
+        return resume_value
+    if not decisions:
+        return resume_value
+    padded = list(decisions) + [decisions[-1]] * (expected_count - len(decisions))
+    return {**resume_value, "decisions": padded}
+
+
+def get_first_pending_subagent_interrupt(state: Any) -> tuple[str | None, Any]:
+    """First pending ``(interrupt_id, value)``; ``(None, None)`` if no interrupt.
+
+    Assumes at most one pending interrupt per snapshot (sequential tool nodes).
+    Parallel tool nodes would need an id-aware lookup instead of first-wins.
+    """
+    if state is None:
+        return None, None
+    for it in getattr(state, "interrupts", None) or ():
+        value = getattr(it, "value", None)
+        interrupt_id = getattr(it, "id", None)
+        if value is not None:
+            return (
+                interrupt_id if isinstance(interrupt_id, str) else None,
+                value,
+            )
+    for sub_task in getattr(state, "tasks", None) or ():
+        for it in getattr(sub_task, "interrupts", None) or ():
+            value = getattr(it, "value", None)
+            interrupt_id = getattr(it, "id", None)
+            if value is not None:
+                return (
+                    interrupt_id if isinstance(interrupt_id, str) else None,
+                    value,
+                )
+    return None, None
+
+
+def build_resume_command(resume_value: Any, pending_id: str | None) -> Command:
+    """``Command(resume={id: value})`` when ``id`` is known, else fall back to scalar."""
+    if pending_id is None:
+        return Command(resume=resume_value)
+    return Command(resume={pending_id: resume_value})
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/resume_routing.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/resume_routing.py
new file mode 100644
index 0000000000..37f45e42f1
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/resume_routing.py
@@ -0,0 +1,183 @@
+"""Route a flat ``decisions`` list to per-``tool_call_id`` resume payloads.
+
+The frontend submits decisions in the same order the SSE stream emitted
+approval cards. When multiple parallel subagents are paused, the backend uses
+this module to:
+
+1. Read ``state.interrupts`` from the parent's paused snapshot, extracting
+   ``[(tool_call_id, action_count), ...]`` from each interrupt's value.
+   The ``tool_call_id`` is stamped on by ``propagation.wrap_with_tool_call_id``
+   inside ``task_tool``'s catch-and-stamp block when a subagent's
+   ``GraphInterrupt`` bubbles up through ``[a]task``.
+2. Slice the flat ``decisions`` list against that ordered pending list to
+   produce the dict shape expected by ``consume_surfsense_resume``.
+3. Re-key those slices by ``Interrupt.id`` (langgraph's primitive) for use as
+   the parent-level ``Command(resume={interrupt_id: payload})`` input — the
+   only shape langgraph accepts when multiple interrupts are pending.
+
+All helpers are pure: callers own the state and the input decisions; we
+return new structures and never mutate.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def slice_decisions_by_tool_call(
+    decisions: list[dict[str, Any]],
+    pending: Iterable[tuple[str, int]],
+) -> dict[str, dict[str, Any]]:
+    """Slice ``decisions`` into ``{tool_call_id: {"decisions": <slice>}}``.
+
+    Args:
+        decisions: Flat list of decisions in the order the SSE stream rendered
+            them.
+        pending: Ordered ``(tool_call_id, action_count)`` pairs in the same
+            order. The slicer consumes ``decisions`` left-to-right.
+
+    Returns:
+        Per-``tool_call_id`` payload dict ready to be written to
+        ``configurable["surfsense_resume_value"]``.
+
+    Raises:
+        ValueError: When the total expected action count differs from the
+            number of decisions provided. We fail loud rather than silently
+            dropping or padding so a frontend/backend contract drift surfaces
+            immediately.
+    """
+    pending_list = list(pending)
+    expected = sum(count for _, count in pending_list)
+    if expected != len(decisions):
+        raise ValueError(
+            f"Decision count mismatch: pending tool calls expect "
+            f"{expected} actions but received {len(decisions)} decisions."
+        )
+
+    routed: dict[str, dict[str, Any]] = {}
+    cursor = 0
+    for tool_call_id, action_count in pending_list:
+        routed[tool_call_id] = {"decisions": decisions[cursor : cursor + action_count]}
+        cursor += action_count
+    return routed
+
+
+def collect_pending_tool_calls(state: Any) -> list[tuple[str, int]]:
+    """Extract ``[(tool_call_id, action_count), ...]`` from a paused parent state.
+
+    Reads ``state.interrupts`` (the bundle langgraph aggregated from each
+    paused subagent's propagated interrupt). Each interrupt value carries the
+    ``tool_call_id`` that the parent's ``task`` tool was processing — see
+    ``propagation.wrap_with_tool_call_id`` and ``task_tool``'s
+    ``except GraphInterrupt`` chokepoint.
+
+    Order is preserved from ``state.interrupts``, which is the order the SSE
+    stream emitted approval cards. The frontend submits decisions in that
+    same order, so the slicer can consume them left-to-right.
+
+    Interrupts without a ``tool_call_id`` are skipped — they were not
+    produced by our task-routing layer (e.g. parent-side HITL middleware on
+    a different tool); ``stream_resume_chat`` is not responsible for routing
+    those.
+
+    Args:
+        state: A langgraph ``StateSnapshot`` (or any object with an
+            ``interrupts`` attribute).
+
+    Returns:
+        Ordered list of ``(tool_call_id, action_count)``. ``action_count`` is
+        ``len(value["action_requests"])`` for HITL-bundle values, or ``1`` for
+        scalar-style ``interrupt("...")`` values that were wrapped as
+        ``{"value": ..., "tool_call_id": ...}``.
+
+    Raises:
+        ValueError: When an interrupt value carries a ``tool_call_id`` but
+            the action count cannot be determined (contract bug — every
+            propagated value should be either a HITL bundle or a wrapped
+            scalar).
+    """
+    pending: list[tuple[str, int]] = []
+    for idx, interrupt_obj in enumerate(getattr(state, "interrupts", ()) or ()):
+        value = getattr(interrupt_obj, "value", None)
+        if not isinstance(value, dict):
+            logger.warning(
+                "[hitl_route] interrupt[%d] skipped: value not a dict (type=%s)",
+                idx,
+                type(value).__name__,
+            )
+            continue
+        tool_call_id = value.get("tool_call_id")
+        if not isinstance(tool_call_id, str):
+            # Should not happen post-stamping; flag loudly if a regression
+            # ever lets an unstamped value reach the parent state.
+            logger.warning(
+                "[hitl_route] interrupt[%d] skipped: no tool_call_id stamp (keys=%s)",
+                idx,
+                sorted(value.keys()),
+            )
+            continue
+
+        action_requests = value.get("action_requests")
+        if isinstance(action_requests, list):
+            pending.append((tool_call_id, len(action_requests)))
+            continue
+        if "value" in value:
+            pending.append((tool_call_id, 1))
+            continue
+
+        raise ValueError(
+            f"Interrupt for tool_call_id={tool_call_id!r} has no "
+            "``action_requests`` list and is not a wrapped scalar value; "
+            "cannot determine action count for resume routing."
+        )
+
+    return pending
+
+
+def build_lg_resume_map(
+    state: Any, by_tool_call_id: dict[str, dict[str, Any]]
+) -> dict[str, dict[str, Any]]:
+    """Map ``Interrupt.id → resume_payload`` for langgraph's multi-interrupt resume.
+
+    ``stream_resume_chat`` builds ``by_tool_call_id`` via
+    :func:`slice_decisions_by_tool_call`. Langgraph's ``Command(resume=...)``
+    requires ``Interrupt.id`` keys (not our ``tool_call_id`` stamps) when the
+    parent state has multiple pending interrupts. This pure helper re-keys the
+    slice without mutating it, and skips entries that can't be paired (no
+    stamp, no slice) so contract drift surfaces as a count mismatch at the
+    call site instead of a silent mis-route.
+
+    The two key spaces serve two different consumers:
+    - ``surfsense_resume_value`` (keyed by ``tool_call_id``): read by the
+      subagent bridge inside ``task_tool``.
+    - ``Command(resume=...)`` (keyed by ``Interrupt.id``): read by langgraph's
+      pregel to wake each pending interrupt site.
+
+    Args:
+        state: A langgraph ``StateSnapshot`` (or any object with an
+            ``interrupts`` iterable).
+        by_tool_call_id: Output of :func:`slice_decisions_by_tool_call`.
+
+    Returns:
+        Dict ready to be passed as ``Command(resume=<this>)``.
+    """
+    out: dict[str, dict[str, Any]] = {}
+    for interrupt_obj in getattr(state, "interrupts", ()) or ():
+        value = getattr(interrupt_obj, "value", None)
+        if not isinstance(value, dict):
+            continue
+        tool_call_id = value.get("tool_call_id")
+        if not isinstance(tool_call_id, str):
+            continue
+        interrupt_id = getattr(interrupt_obj, "id", None)
+        if not isinstance(interrupt_id, str):
+            continue
+        payload = by_tool_call_id.get(tool_call_id)
+        if payload is None:
+            continue
+        out[interrupt_id] = payload
+    return out
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_description.py
new file mode 100644
index 0000000000..73afa68238
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_description.py
@@ -0,0 +1,15 @@
+"""Schema-level description for the ``task`` tool.
+
+Loaded from ``prompts/tools/task/description.md`` so the tool-schema text
+and the ``<tools>`` block render from the same source.
+"""
+
+from __future__ import annotations
+
+from app.agents.multi_agent_chat.main_agent.system_prompt.builder.load_md import (
+    read_prompt_md,
+)
+
+TASK_TOOL_DESCRIPTION: str = read_prompt_md("tools/task/description.md")
+
+__all__ = ["TASK_TOOL_DESCRIPTION"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
new file mode 100644
index 0000000000..f9b316e237
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
@@ -0,0 +1,283 @@
+"""Build the ``task`` tool that invokes subagents with HITL bridging.
+
+The tool's body is the only place where the parent and the subagent meet at
+runtime: it reads the parent's stashed resume value, decides whether to send
+fresh state or a targeted ``Command(resume=...)`` to the subagent, then
+re-raises any new pending interrupt back to the parent.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Annotated, Any, NoReturn
+
+from deepagents.middleware.subagents import TASK_TOOL_DESCRIPTION
+from langchain.tools import BaseTool, ToolRuntime
+from langchain_core.messages import HumanMessage, ToolMessage
+from langchain_core.runnables import Runnable
+from langchain_core.tools import StructuredTool
+from langgraph.errors import GraphInterrupt
+from langgraph.types import Command, Interrupt
+
+from .config import (
+    consume_surfsense_resume,
+    drain_parent_null_resume,
+    has_surfsense_resume,
+    subagent_invoke_config,
+)
+from .constants import EXCLUDED_STATE_KEYS
+from .propagation import wrap_with_tool_call_id
+from .resume import (
+    build_resume_command,
+    fan_out_decisions_to_match,
+    get_first_pending_subagent_interrupt,
+    hitlrequest_action_count,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _reraise_stamped_subagent_interrupt(
+    gi: GraphInterrupt, tool_call_id: str
+) -> NoReturn:
+    """Stamp ``tool_call_id`` onto each pending interrupt value and re-raise.
+
+    See :mod:`...propagation` for why this stamp is required for resume routing.
+    Chained via ``from gi`` so tracebacks point at the subagent's original
+    ``interrupt(...)`` site.
+    """
+    interrupts = gi.args[0] if gi.args else ()
+    stamped = tuple(
+        Interrupt(
+            value=wrap_with_tool_call_id(i.value, tool_call_id),
+            id=i.id,
+        )
+        for i in interrupts
+    )
+    logger.info(
+        "[hitl_route] stamped %d subagent interrupt(s) with tool_call_id=%s",
+        len(stamped),
+        tool_call_id,
+    )
+    raise GraphInterrupt(stamped) from gi
+
+
+def build_task_tool_with_parent_config(
+    subagents: list[dict[str, Any]],
+    task_description: str | None = None,
+) -> BaseTool:
+    """Upstream ``_build_task_tool`` + parent ``runtime.config`` propagation + resume bridging."""
+    subagent_graphs: dict[str, Runnable] = {
+        spec["name"]: spec["runnable"] for spec in subagents
+    }
+    subagent_description_str = "\n".join(
+        f"- {s['name']}: {s['description']}" for s in subagents
+    )
+
+    if task_description is None:
+        description = TASK_TOOL_DESCRIPTION.format(
+            available_agents=subagent_description_str
+        )
+    elif "{available_agents}" in task_description:
+        description = task_description.format(available_agents=subagent_description_str)
+    else:
+        description = task_description
+
+    def _return_command_with_state_update(result: dict, tool_call_id: str) -> Command:
+        if "messages" not in result:
+            msg = (
+                "CompiledSubAgent must return a state containing a 'messages' key. "
+                "Custom StateGraphs used with CompiledSubAgent should include 'messages' "
+                "in their state schema to communicate results back to the main agent."
+            )
+            raise ValueError(msg)
+
+        state_update = {k: v for k, v in result.items() if k not in EXCLUDED_STATE_KEYS}
+        messages = result["messages"]
+        if not messages:
+            msg = (
+                "CompiledSubAgent returned an empty 'messages' list. "
+                "Subagents must produce at least one message so the parent has "
+                "output to forward back to the user."
+            )
+            raise ValueError(msg)
+        last_text = getattr(messages[-1], "text", None) or ""
+        message_text = last_text.rstrip()
+        return Command(
+            update={
+                **state_update,
+                "messages": [ToolMessage(message_text, tool_call_id=tool_call_id)],
+            }
+        )
+
+    def _validate_and_prepare_state(
+        subagent_type: str, description: str, runtime: ToolRuntime
+    ) -> tuple[Runnable, dict]:
+        subagent = subagent_graphs[subagent_type]
+        subagent_state = {
+            k: v for k, v in runtime.state.items() if k not in EXCLUDED_STATE_KEYS
+        }
+        subagent_state["messages"] = [HumanMessage(content=description)]
+        return subagent, subagent_state
+
+    def task(
+        description: Annotated[
+            str,
+            "A detailed description of the task for the subagent to perform autonomously. Include all necessary context and specify the expected output format.",
+        ],
+        subagent_type: Annotated[
+            str,
+            "The type of subagent to use. Must be one of the available agent types listed in the tool description.",
+        ],
+        runtime: ToolRuntime,
+    ) -> str | Command:
+        if subagent_type not in subagent_graphs:
+            allowed_types = ", ".join([f"`{k}`" for k in subagent_graphs])
+            return (
+                f"We cannot invoke subagent {subagent_type} because it does not exist, "
+                f"the only allowed types are {allowed_types}"
+            )
+        if not runtime.tool_call_id:
+            raise ValueError("Tool call ID is required for subagent invocation")
+        subagent, subagent_state = _validate_and_prepare_state(
+            subagent_type, description, runtime
+        )
+        sub_config = subagent_invoke_config(runtime)
+
+        # Resume bridge: forward the parent's stashed decision into the
+        # subagent's pending ``interrupt()``, targeted by id.
+        pending_id: str | None = None
+        pending_value: Any = None
+        get_state = getattr(subagent, "get_state", None)
+        if callable(get_state):
+            try:
+                snapshot = get_state(sub_config)
+                pending_id, pending_value = get_first_pending_subagent_interrupt(
+                    snapshot
+                )
+            except Exception:
+                # Fail loud if a resume is queued: silent fallback would
+                # replay the original interrupt to the user.
+                if has_surfsense_resume(runtime):
+                    logger.exception(
+                        "Subagent %r get_state raised with resume queued; re-raising.",
+                        subagent_type,
+                    )
+                    raise
+                logger.debug(
+                    "Subagent get_state failed; falling back to fresh invoke",
+                    exc_info=True,
+                )
+
+        if pending_value is not None:
+            resume_value = consume_surfsense_resume(runtime)
+            if resume_value is None:
+                # Bridge invariant: a queued resume must accompany any pending
+                # subagent interrupt. Fall-through replay would silently re-prompt
+                # the user; raise so the streaming layer surfaces a clear error.
+                raise RuntimeError(
+                    f"Subagent {subagent_type!r} has a pending interrupt but no "
+                    "surfsense_resume_value on config; resume bridge is broken."
+                )
+            expected = hitlrequest_action_count(pending_value)
+            resume_value = fan_out_decisions_to_match(resume_value, expected)
+            # Prevent the parent's resume payload from leaking into subagent
+            # interrupts via langgraph's parent_scratchpad fallback.
+            drain_parent_null_resume(runtime)
+            try:
+                result = subagent.invoke(
+                    build_resume_command(resume_value, pending_id),
+                    config=sub_config,
+                )
+            except GraphInterrupt as gi:
+                _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
+        else:
+            try:
+                result = subagent.invoke(subagent_state, config=sub_config)
+            except GraphInterrupt as gi:
+                _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
+        return _return_command_with_state_update(result, runtime.tool_call_id)
+
+    async def atask(
+        description: Annotated[
+            str,
+            "A detailed description of the task for the subagent to perform autonomously. Include all necessary context and specify the expected output format.",
+        ],
+        subagent_type: Annotated[
+            str,
+            "The type of subagent to use. Must be one of the available agent types listed in the tool description.",
+        ],
+        runtime: ToolRuntime,
+    ) -> str | Command:
+        logger.info(
+            "[hitl_route] atask ENTRY: subagent_type=%r tool_call_id=%s",
+            subagent_type,
+            runtime.tool_call_id,
+        )
+        if subagent_type not in subagent_graphs:
+            allowed_types = ", ".join([f"`{k}`" for k in subagent_graphs])
+            return (
+                f"We cannot invoke subagent {subagent_type} because it does not exist, "
+                f"the only allowed types are {allowed_types}"
+            )
+        if not runtime.tool_call_id:
+            raise ValueError("Tool call ID is required for subagent invocation")
+        subagent, subagent_state = _validate_and_prepare_state(
+            subagent_type, description, runtime
+        )
+        sub_config = subagent_invoke_config(runtime)
+
+        # Resume bridge — see ``task`` above.
+        pending_id: str | None = None
+        pending_value: Any = None
+        aget_state = getattr(subagent, "aget_state", None)
+        if callable(aget_state):
+            try:
+                snapshot = await aget_state(sub_config)
+                pending_id, pending_value = get_first_pending_subagent_interrupt(
+                    snapshot
+                )
+            except Exception:
+                if has_surfsense_resume(runtime):
+                    logger.exception(
+                        "Subagent %r aget_state raised with resume queued; re-raising.",
+                        subagent_type,
+                    )
+                    raise
+                logger.debug(
+                    "Subagent aget_state failed; falling back to fresh ainvoke",
+                    exc_info=True,
+                )
+
+        if pending_value is not None:
+            resume_value = consume_surfsense_resume(runtime)
+            if resume_value is None:
+                raise RuntimeError(
+                    f"Subagent {subagent_type!r} has a pending interrupt but no "
+                    "surfsense_resume_value on config; resume bridge is broken."
+                )
+            expected = hitlrequest_action_count(pending_value)
+            resume_value = fan_out_decisions_to_match(resume_value, expected)
+            # Prevent the parent's resume payload from leaking into subagent
+            # interrupts via langgraph's parent_scratchpad fallback.
+            drain_parent_null_resume(runtime)
+            try:
+                result = await subagent.ainvoke(
+                    build_resume_command(resume_value, pending_id),
+                    config=sub_config,
+                )
+            except GraphInterrupt as gi:
+                _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
+        else:
+            try:
+                result = await subagent.ainvoke(subagent_state, config=sub_config)
+            except GraphInterrupt as gi:
+                _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
+        return _return_command_with_state_update(result, runtime.tool_call_id)
+
+    return StructuredTool.from_function(
+        name="task",
+        func=task,
+        coroutine=atask,
+        description=description,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/context_editing.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/context_editing.py
new file mode 100644
index 0000000000..e8f99933e6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/context_editing.py
@@ -0,0 +1,50 @@
+"""Spill + clear-tool-uses passes to keep payloads under budget."""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.main_agent.context_prune.prune_tool_names import (
+    safe_exclude_tools,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import (
+    ClearToolUsesEdit,
+    SpillingContextEditingMiddleware,
+    SpillToBackendEdit,
+)
+
+from ..shared.flags import enabled
+
+
+def build_context_editing_mw(
+    *,
+    flags: AgentFeatureFlags,
+    max_input_tokens: int | None,
+    tools: Sequence[BaseTool],
+    backend_resolver: Any,
+) -> SpillingContextEditingMiddleware | None:
+    if not enabled(flags, "enable_context_editing") or not max_input_tokens:
+        return None
+    spill_edit = SpillToBackendEdit(
+        trigger=int(max_input_tokens * 0.55),
+        clear_at_least=int(max_input_tokens * 0.15),
+        keep=5,
+        exclude_tools=safe_exclude_tools(tools),
+        clear_tool_inputs=True,
+    )
+    clear_edit = ClearToolUsesEdit(
+        trigger=int(max_input_tokens * 0.55),
+        clear_at_least=int(max_input_tokens * 0.15),
+        keep=5,
+        exclude_tools=safe_exclude_tools(tools),
+        clear_tool_inputs=True,
+        placeholder="[cleared - older tool output trimmed for context]",
+    )
+    return SpillingContextEditingMiddleware(
+        edits=[spill_edit, clear_edit],
+        backend_resolver=backend_resolver,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/dedup_hitl.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/dedup_hitl.py
new file mode 100644
index 0000000000..66cae300b1
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/dedup_hitl.py
@@ -0,0 +1,13 @@
+"""Drop duplicate HITL tool calls before execution."""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.middleware import DedupHITLToolCallsMiddleware
+
+
+def build_dedup_hitl_mw(tools: Sequence[BaseTool]) -> DedupHITLToolCallsMiddleware:
+    return DedupHITLToolCallsMiddleware(agent_tools=list(tools))
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/doom_loop.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/doom_loop.py
new file mode 100644
index 0000000000..d67b8d518b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/doom_loop.py
@@ -0,0 +1,14 @@
+"""Stop N identical tool calls in a row via interrupt."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import DoomLoopMiddleware
+
+from ..shared.flags import enabled
+
+
+def build_doom_loop_mw(flags: AgentFeatureFlags) -> DoomLoopMiddleware | None:
+    return (
+        DoomLoopMiddleware(threshold=3) if enabled(flags, "enable_doom_loop") else None
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/kb_persistence.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/kb_persistence.py
new file mode 100644
index 0000000000..4b27581e7e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/kb_persistence.py
@@ -0,0 +1,23 @@
+"""Commit staged cloud filesystem mutations to Postgres at end of turn."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware import KnowledgeBasePersistenceMiddleware
+
+
+def build_kb_persistence_mw(
+    *,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+    user_id: str | None,
+    thread_id: int | None,
+) -> KnowledgeBasePersistenceMiddleware | None:
+    if filesystem_mode != FilesystemMode.CLOUD:
+        return None
+    return KnowledgeBasePersistenceMiddleware(
+        search_space_id=search_space_id,
+        created_by_id=user_id,
+        filesystem_mode=filesystem_mode,
+        thread_id=thread_id,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
new file mode 100644
index 0000000000..fcdb1c61e8
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
@@ -0,0 +1,28 @@
+"""KB priority planner: <priority_documents> injection."""
+
+from __future__ import annotations
+
+from langchain_core.language_models import BaseChatModel
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware import KnowledgePriorityMiddleware
+
+
+def build_knowledge_priority_mw(
+    *,
+    llm: BaseChatModel,
+    search_space_id: int,
+    filesystem_mode: FilesystemMode,
+    available_connectors: list[str] | None,
+    available_document_types: list[str] | None,
+    mentioned_document_ids: list[int] | None,
+) -> KnowledgePriorityMiddleware:
+    return KnowledgePriorityMiddleware(
+        llm=llm,
+        search_space_id=search_space_id,
+        filesystem_mode=filesystem_mode,
+        available_connectors=available_connectors,
+        available_document_types=available_document_types,
+        mentioned_document_ids=mentioned_document_ids,
+        inject_system_message=False,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_tree.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_tree.py
new file mode 100644
index 0000000000..fb45110679
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_tree.py
@@ -0,0 +1,24 @@
+"""<workspace_tree> injection (cloud only)."""
+
+from __future__ import annotations
+
+from langchain_core.language_models import BaseChatModel
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware import KnowledgeTreeMiddleware
+
+
+def build_knowledge_tree_mw(
+    *,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+    llm: BaseChatModel,
+) -> KnowledgeTreeMiddleware | None:
+    if filesystem_mode != FilesystemMode.CLOUD:
+        return None
+    return KnowledgeTreeMiddleware(
+        search_space_id=search_space_id,
+        filesystem_mode=filesystem_mode,
+        llm=llm,
+        inject_system_message=False,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/noop_injection.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/noop_injection.py
new file mode 100644
index 0000000000..6e6467ad09
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/noop_injection.py
@@ -0,0 +1,12 @@
+"""Provider-compat: append a `_noop` tool when tools=[] but history has tool calls."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import NoopInjectionMiddleware
+
+from ..shared.flags import enabled
+
+
+def build_noop_injection_mw(flags: AgentFeatureFlags) -> NoopInjectionMiddleware | None:
+    return NoopInjectionMiddleware() if enabled(flags, "enable_compaction_v2") else None
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/otel.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/otel.py
new file mode 100644
index 0000000000..bd7516e655
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/otel.py
@@ -0,0 +1,12 @@
+"""OTel spans on model and tool calls."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import OtelSpanMiddleware
+
+from ..shared.flags import enabled
+
+
+def build_otel_mw(flags: AgentFeatureFlags) -> OtelSpanMiddleware | None:
+    return OtelSpanMiddleware() if enabled(flags, "enable_otel") else None
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/plugins.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/plugins.py
new file mode 100644
index 0000000000..4418e38068
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/plugins.py
@@ -0,0 +1,49 @@
+"""Tail-of-stack plugin slot driven by env allowlist."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.plugin_loader import (
+    PluginContext,
+    load_allowed_plugin_names_from_env,
+    load_plugin_middlewares,
+)
+from app.db import ChatVisibility
+
+from ..shared.flags import enabled
+
+
+def build_plugin_middlewares(
+    *,
+    flags: AgentFeatureFlags,
+    search_space_id: int,
+    user_id: str | None,
+    visibility: ChatVisibility,
+    llm: BaseChatModel,
+) -> list[Any]:
+    if not enabled(flags, "enable_plugin_loader"):
+        return []
+    try:
+        allowed_names = load_allowed_plugin_names_from_env()
+        if not allowed_names:
+            return []
+        return load_plugin_middlewares(
+            PluginContext.build(
+                search_space_id=search_space_id,
+                user_id=user_id,
+                thread_visibility=visibility,
+                llm=llm,
+            ),
+            allowed_plugin_names=allowed_names,
+        )
+    except Exception:  # pragma: no cover - defensive
+        logging.warning(
+            "Plugin loader failed; continuing without plugins.",
+            exc_info=True,
+        )
+        return []
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/repair.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/repair.py
new file mode 100644
index 0000000000..378b61be19
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/repair.py
@@ -0,0 +1,50 @@
+"""Repair miscased / unknown tool names to the registered set or invalid_tool."""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import ToolCallNameRepairMiddleware
+
+from ..shared.flags import enabled
+
+# deepagents-built-in tool names the repair pass treats as known.
+_DEEPAGENT_BUILTIN_TOOL_NAMES: frozenset[str] = frozenset(
+    {
+        "write_todos",
+        "ls",
+        "read_file",
+        "write_file",
+        "edit_file",
+        "glob",
+        "grep",
+        "execute",
+        "task",
+        "mkdir",
+        "cd",
+        "pwd",
+        "move_file",
+        "rm",
+        "rmdir",
+        "list_tree",
+        "execute_code",
+    }
+)
+
+
+def build_repair_mw(
+    *,
+    flags: AgentFeatureFlags,
+    tools: Sequence[BaseTool],
+) -> ToolCallNameRepairMiddleware | None:
+    if not enabled(flags, "enable_tool_call_repair"):
+        return None
+    registered_names: set[str] = {t.name for t in tools}
+    registered_names |= _DEEPAGENT_BUILTIN_TOOL_NAMES
+    return ToolCallNameRepairMiddleware(
+        registered_tool_names=registered_names,
+        fuzzy_match_threshold=None,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/skills.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/skills.py
new file mode 100644
index 0000000000..63a57c5a04
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/skills.py
@@ -0,0 +1,39 @@
+"""Skill discovery + injection."""
+
+from __future__ import annotations
+
+import logging
+
+from deepagents.middleware.skills import SkillsMiddleware
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware import (
+    build_skills_backend_factory,
+    default_skills_sources,
+)
+
+from ..shared.flags import enabled
+
+
+def build_skills_mw(
+    *,
+    flags: AgentFeatureFlags,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+) -> SkillsMiddleware | None:
+    if not enabled(flags, "enable_skills"):
+        return None
+    try:
+        skills_factory = build_skills_backend_factory(
+            search_space_id=search_space_id
+            if filesystem_mode == FilesystemMode.CLOUD
+            else None,
+        )
+        return SkillsMiddleware(
+            backend=skills_factory,
+            sources=default_skills_sources(),
+        )
+    except Exception as exc:  # pragma: no cover - defensive
+        logging.warning("SkillsMiddleware init failed; skipping: %s", exc)
+        return None
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/anthropic_cache.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/anthropic_cache.py
new file mode 100644
index 0000000000..f99fb9c7f2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/anthropic_cache.py
@@ -0,0 +1,9 @@
+"""Anthropic prompt caching annotations on system/tool/message blocks."""
+
+from __future__ import annotations
+
+from langchain_anthropic.middleware import AnthropicPromptCachingMiddleware
+
+
+def build_anthropic_cache_mw() -> AnthropicPromptCachingMiddleware:
+    return AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore")
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/compaction.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/compaction.py
new file mode 100644
index 0000000000..b59e7d2c4b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/compaction.py
@@ -0,0 +1,14 @@
+"""Context-window summarization with SurfSense protected sections."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from deepagents.backends import StateBackend
+from langchain_core.language_models import BaseChatModel
+
+from app.agents.new_chat.middleware import create_surfsense_compaction_middleware
+
+
+def build_compaction_mw(llm: BaseChatModel) -> Any:
+    return create_surfsense_compaction_middleware(llm, StateBackend)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/__init__.py
new file mode 100644
index 0000000000..0dcc567e0f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/__init__.py
@@ -0,0 +1,11 @@
+"""SurfSense filesystem middleware (multi-agent flavour)."""
+
+from __future__ import annotations
+
+from .index import build_filesystem_mw
+from .middleware import SurfSenseFilesystemMiddleware
+
+__all__ = [
+    "SurfSenseFilesystemMiddleware",
+    "build_filesystem_mw",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/index.py
new file mode 100644
index 0000000000..fb8dbe2092
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/index.py
@@ -0,0 +1,28 @@
+"""Public composition factory for the filesystem middleware."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+from .middleware import SurfSenseFilesystemMiddleware
+
+
+def build_filesystem_mw(
+    *,
+    backend_resolver: Any,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+    user_id: str | None,
+    thread_id: int | None,
+    read_only: bool = False,
+) -> SurfSenseFilesystemMiddleware:
+    return SurfSenseFilesystemMiddleware(
+        backend=backend_resolver,
+        filesystem_mode=filesystem_mode,
+        search_space_id=search_space_id,
+        created_by_id=user_id,
+        thread_id=thread_id,
+        read_only=read_only,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/__init__.py
new file mode 100644
index 0000000000..466672aa33
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/__init__.py
@@ -0,0 +1,33 @@
+"""SurfSense filesystem middleware: class + focused-responsibility helpers."""
+
+from __future__ import annotations
+
+from .index import (
+    SurfSenseFilesystemMiddleware,
+    check_cloud_write_namespace,
+    current_cwd,
+    default_cwd,
+    get_contract_suggested_path,
+    is_cloud,
+    normalize_local_mount_path,
+    resolve_list_target_path,
+    resolve_move_target_path,
+    resolve_relative,
+    resolve_write_target_path,
+    run_async_blocking,
+)
+
+__all__ = [
+    "SurfSenseFilesystemMiddleware",
+    "check_cloud_write_namespace",
+    "current_cwd",
+    "default_cwd",
+    "get_contract_suggested_path",
+    "is_cloud",
+    "normalize_local_mount_path",
+    "resolve_list_target_path",
+    "resolve_move_target_path",
+    "resolve_relative",
+    "resolve_write_target_path",
+    "run_async_blocking",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/async_dispatch.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/async_dispatch.py
new file mode 100644
index 0000000000..58a2d9168b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/async_dispatch.py
@@ -0,0 +1,22 @@
+"""Sync/async dispatcher: drive an async tool body from a sync entry-point."""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
+
+def run_async_blocking(coro: Any) -> Any:
+    """Run ``coro`` to completion, blocking the current thread.
+
+    Returns an error string instead of raising if the current thread is
+    already inside a running event loop — keeps sync tool entry-points
+    safe to call from any context.
+    """
+    try:
+        loop = asyncio.get_running_loop()
+        if loop.is_running():
+            return "Error: sync filesystem operation not supported inside an active event loop."
+    except RuntimeError:
+        pass
+    return asyncio.run(coro)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/index.py
new file mode 100644
index 0000000000..b5ceb31023
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/index.py
@@ -0,0 +1,32 @@
+"""Public surface of the middleware package: class + helpers used by tool factories."""
+
+from __future__ import annotations
+
+from .async_dispatch import run_async_blocking
+from .middleware import SurfSenseFilesystemMiddleware
+from .mode import default_cwd, is_cloud
+from .namespace_policy import check_cloud_write_namespace
+from .path_resolution import (
+    current_cwd,
+    get_contract_suggested_path,
+    normalize_local_mount_path,
+    resolve_list_target_path,
+    resolve_move_target_path,
+    resolve_relative,
+    resolve_write_target_path,
+)
+
+__all__ = [
+    "SurfSenseFilesystemMiddleware",
+    "check_cloud_write_namespace",
+    "current_cwd",
+    "default_cwd",
+    "get_contract_suggested_path",
+    "is_cloud",
+    "normalize_local_mount_path",
+    "resolve_list_target_path",
+    "resolve_move_target_path",
+    "resolve_relative",
+    "resolve_write_target_path",
+    "run_async_blocking",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/middleware.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/middleware.py
new file mode 100644
index 0000000000..c32e144388
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/middleware.py
@@ -0,0 +1,105 @@
+"""``SurfSenseFilesystemMiddleware``: per-session state + tool registration."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from deepagents import FilesystemMiddleware
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.sandbox import is_sandbox_enabled
+
+from ..system_prompt import build_system_prompt
+from ..tools import (
+    create_cd_tool,
+    create_edit_file_tool,
+    create_execute_code_tool,
+    create_list_tree_tool,
+    create_ls_tool,
+    create_mkdir_tool,
+    create_move_file_tool,
+    create_pwd_tool,
+    create_read_file_tool,
+    create_rm_tool,
+    create_rmdir_tool,
+    create_write_file_tool,
+)
+from ..tools.glob.description import select_description as glob_description
+from ..tools.grep.description import select_description as grep_description
+from .read_only_policy import READ_ONLY_TOOL_NAMES
+
+
+class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
+    """SurfSense-specific filesystem middleware (cloud + desktop)."""
+
+    state_schema = SurfSenseFilesystemState
+
+    def __init__(
+        self,
+        *,
+        backend: Any = None,
+        filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
+        search_space_id: int | None = None,
+        created_by_id: str | None = None,
+        thread_id: int | str | None = None,
+        tool_token_limit_before_evict: int | None = 20000,
+        read_only: bool = False,
+    ) -> None:
+        self._filesystem_mode = filesystem_mode
+        self._search_space_id = search_space_id
+        self._created_by_id = created_by_id
+        self._thread_id = thread_id
+        self._read_only = read_only
+        self._sandbox_available = (
+            is_sandbox_enabled() and thread_id is not None and not read_only
+        )
+
+        system_prompt = build_system_prompt(
+            filesystem_mode,
+            sandbox_available=self._sandbox_available,
+        )
+
+        super().__init__(
+            backend=backend,
+            system_prompt=system_prompt,
+            tool_token_limit_before_evict=tool_token_limit_before_evict,
+        )
+        self.tools = [t for t in self.tools if t.name != "execute"]
+        self.tools.append(create_mkdir_tool(self))
+        self.tools.append(create_cd_tool(self))
+        self.tools.append(create_pwd_tool(self))
+        self.tools.append(create_move_file_tool(self))
+        self.tools.append(create_rm_tool(self))
+        self.tools.append(create_rmdir_tool(self))
+        self.tools.append(create_list_tree_tool(self))
+        if self._sandbox_available:
+            self.tools.append(create_execute_code_tool(self))
+
+        if read_only:
+            self.tools = [t for t in self.tools if t.name in READ_ONLY_TOOL_NAMES]
+
+    # ----------------------------------------- base-class tool overrides
+
+    def _create_ls_tool(self) -> BaseTool:
+        return create_ls_tool(self)
+
+    def _create_read_file_tool(self) -> BaseTool:
+        return create_read_file_tool(self)
+
+    def _create_write_file_tool(self) -> BaseTool:
+        return create_write_file_tool(self)
+
+    def _create_edit_file_tool(self) -> BaseTool:
+        return create_edit_file_tool(self)
+
+    def _create_glob_tool(self) -> BaseTool:
+        tool = super()._create_glob_tool()
+        tool.description = glob_description(self._filesystem_mode).rstrip()
+        return tool
+
+    def _create_grep_tool(self) -> BaseTool:
+        tool = super()._create_grep_tool()
+        tool.description = grep_description(self._filesystem_mode).rstrip()
+        return tool
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/mode.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/mode.py
new file mode 100644
index 0000000000..a23d775354
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/mode.py
@@ -0,0 +1,15 @@
+"""Mode-derived facts: ``is_cloud`` and ``default_cwd``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+
+
+def is_cloud(mode: FilesystemMode) -> bool:
+    return mode == FilesystemMode.CLOUD
+
+
+def default_cwd(mode: FilesystemMode) -> str:
+    """``/documents`` on cloud; ``/`` on desktop (mounts are children of ``/``)."""
+    return DOCUMENTS_ROOT if is_cloud(mode) else "/"
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/namespace_policy.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/namespace_policy.py
new file mode 100644
index 0000000000..5390504141
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/namespace_policy.py
@@ -0,0 +1,51 @@
+"""Cloud-only write namespace policy.
+
+A write is allowed iff it lands under ``/documents/`` OR its basename uses
+the ``temp_`` scratch prefix. The anonymous uploaded document is read-only
+even when its path is under ``/documents/``.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from langchain.tools import ToolRuntime
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+
+from ..shared.paths import TEMP_PREFIX, basename
+from .mode import is_cloud
+
+if TYPE_CHECKING:
+    from .middleware import SurfSenseFilesystemMiddleware
+
+
+def check_cloud_write_namespace(
+    mw: SurfSenseFilesystemMiddleware,
+    path: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str | None:
+    """Return an error string if cloud writes to ``path`` are not allowed.
+
+    Order matters:
+    1. Reject writes to the anonymous read-only doc.
+    2. Allow ``/documents/*``.
+    3. Allow ``temp_*`` basename anywhere.
+    4. Reject everything else.
+    """
+    if not is_cloud(mw._filesystem_mode):
+        return None
+    anon = runtime.state.get("kb_anon_doc") or {}
+    if isinstance(anon, dict):
+        anon_path = str(anon.get("path") or "")
+        if anon_path and anon_path == path:
+            return "Error: the anonymous uploaded document is read-only."
+    if path.startswith(DOCUMENTS_ROOT + "/") or path == DOCUMENTS_ROOT:
+        return None
+    if basename(path).startswith(TEMP_PREFIX):
+        return None
+    return (
+        "Error: cloud writes must target /documents/<...> or use a 'temp_' "
+        f"basename for scratch (got '{path}')."
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/path_resolution.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/path_resolution.py
new file mode 100644
index 0000000000..2c8ec6b4d4
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/path_resolution.py
@@ -0,0 +1,174 @@
+"""Resolve user-supplied paths to absolute paths the backends accept."""
+
+from __future__ import annotations
+
+import posixpath
+from typing import TYPE_CHECKING
+
+from langchain.tools import ToolRuntime
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.multi_root_local_folder_backend import (
+    MultiRootLocalFolderBackend,
+)
+
+from ..shared.paths import (
+    extract_mount_from_path,
+    local_parent_path,
+    normalize_absolute_path,
+)
+from .mode import default_cwd
+
+if TYPE_CHECKING:
+    from .middleware import SurfSenseFilesystemMiddleware
+
+
+def current_cwd(
+    mw: SurfSenseFilesystemMiddleware,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str:
+    cwd = runtime.state.get("cwd") if hasattr(runtime, "state") else None
+    if isinstance(cwd, str) and cwd.startswith("/"):
+        return cwd
+    return default_cwd(mw._filesystem_mode)
+
+
+def get_contract_suggested_path(
+    mw: SurfSenseFilesystemMiddleware,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str:
+    """Read the planner's suggested write path; otherwise default to ``notes.md``."""
+    contract = runtime.state.get("file_operation_contract") or {}
+    suggested = contract.get("suggested_path")
+    if isinstance(suggested, str) and suggested.strip():
+        return normalize_absolute_path(suggested)
+    return default_cwd(mw._filesystem_mode).rstrip("/") + "/notes.md"
+
+
+def resolve_relative(
+    mw: SurfSenseFilesystemMiddleware,
+    path: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str:
+    """Resolve ``path`` against cwd (no-op if already absolute)."""
+    candidate = path.strip()
+    if not candidate:
+        return current_cwd(mw, runtime)
+    if candidate.startswith("/"):
+        return normalize_absolute_path(candidate)
+    cwd = current_cwd(mw, runtime)
+    joined = posixpath.normpath(posixpath.join(cwd, candidate))
+    return normalize_absolute_path(joined)
+
+
+def resolve_write_target_path(
+    mw: SurfSenseFilesystemMiddleware,
+    file_path: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str:
+    """Empty → contract suggestion; desktop → mount-prefix; cloud → cwd-relative."""
+    candidate = file_path.strip()
+    if not candidate:
+        return get_contract_suggested_path(mw, runtime)
+    if mw._filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER:
+        return normalize_local_mount_path(mw, candidate, runtime)
+    return resolve_relative(mw, candidate, runtime)
+
+
+def resolve_move_target_path(
+    mw: SurfSenseFilesystemMiddleware,
+    file_path: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str:
+    """Empty → empty (caller validates); desktop → mount-prefix; cloud → cwd-relative."""
+    candidate = file_path.strip()
+    if not candidate:
+        return ""
+    if mw._filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER:
+        return normalize_local_mount_path(mw, candidate, runtime)
+    return resolve_relative(mw, candidate, runtime)
+
+
+def resolve_list_target_path(
+    mw: SurfSenseFilesystemMiddleware,
+    path: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str:
+    """Root stays root; desktop → mount-prefix; cloud → cwd-relative."""
+    candidate = path.strip() or current_cwd(mw, runtime)
+    if candidate == "/":
+        return "/"
+    if mw._filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER:
+        return normalize_local_mount_path(mw, candidate, runtime)
+    return resolve_relative(mw, candidate, runtime)
+
+
+def normalize_local_mount_path(
+    mw: SurfSenseFilesystemMiddleware,
+    candidate: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+) -> str:
+    """Desktop only: prepend a mount prefix when the path doesn't already have one.
+
+    Resolution order: explicit mount prefix → single available mount →
+    contract-suggested mount → mount where the path exists → mount where the
+    parent exists → backend default mount.
+    """
+    normalized = normalize_absolute_path(candidate)
+    backend = mw._get_backend(runtime)
+    if not isinstance(backend, MultiRootLocalFolderBackend):
+        return normalized
+
+    mounts = backend.list_mounts()
+    explicit_mount = extract_mount_from_path(normalized, mounts)
+    if explicit_mount:
+        return normalized
+
+    if len(mounts) == 1:
+        return f"/{mounts[0]}{normalized}"
+
+    suggested_mount: str | None = None
+    contract = runtime.state.get("file_operation_contract") or {}
+    suggested_path = contract.get("suggested_path")
+    if isinstance(suggested_path, str) and suggested_path.strip():
+        normalized_suggested = normalize_absolute_path(suggested_path)
+        suggested_mount = extract_mount_from_path(normalized_suggested, mounts)
+
+    matching_mounts = [
+        mount
+        for mount in mounts
+        if _path_exists_under_mount(backend, mount, normalized)
+    ]
+    if len(matching_mounts) == 1:
+        return f"/{matching_mounts[0]}{normalized}"
+
+    parent_path = local_parent_path(normalized)
+    if parent_path != "/":
+        parent_matching_mounts = [
+            mount
+            for mount in mounts
+            if _path_exists_under_mount(backend, mount, parent_path)
+        ]
+        if len(parent_matching_mounts) == 1:
+            return f"/{parent_matching_mounts[0]}{normalized}"
+
+    if suggested_mount:
+        return f"/{suggested_mount}{normalized}"
+
+    return f"/{backend.default_mount()}{normalized}"
+
+
+def _path_exists_under_mount(
+    backend: MultiRootLocalFolderBackend,
+    mount: str,
+    local_path: str,
+) -> bool:
+    result = backend.list_tree(
+        f"/{mount}{local_path}",
+        max_depth=0,
+        page_size=1,
+        include_files=True,
+        include_dirs=True,
+    )
+    return not bool(result.get("error"))
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/read_only_policy.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/read_only_policy.py
new file mode 100644
index 0000000000..c1d8578734
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/middleware/read_only_policy.py
@@ -0,0 +1,7 @@
+"""Allowlist consulted by ``SurfSenseFilesystemMiddleware`` when ``read_only=True``."""
+
+from __future__ import annotations
+
+READ_ONLY_TOOL_NAMES = frozenset(
+    {"ls", "read_file", "glob", "grep", "list_tree", "pwd", "cd"}
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/shared/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/shared/__init__.py
new file mode 100644
index 0000000000..3c86805c16
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/shared/__init__.py
@@ -0,0 +1,21 @@
+"""Stateless utilities shared by the middleware and tool factories."""
+
+from __future__ import annotations
+
+from .paths import (
+    TEMP_PREFIX,
+    basename,
+    extract_mount_from_path,
+    is_ancestor_of,
+    local_parent_path,
+    normalize_absolute_path,
+)
+
+__all__ = [
+    "TEMP_PREFIX",
+    "basename",
+    "extract_mount_from_path",
+    "is_ancestor_of",
+    "local_parent_path",
+    "normalize_absolute_path",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/shared/paths.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/shared/paths.py
new file mode 100644
index 0000000000..39853c519b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/shared/paths.py
@@ -0,0 +1,51 @@
+"""Stateless path utilities shared by the middleware class and tool factories."""
+
+from __future__ import annotations
+
+import re
+
+TEMP_PREFIX = "temp_"
+
+
+def normalize_absolute_path(candidate: str) -> str:
+    """Collapse slashes / backslashes and force an absolute path."""
+    normalized = re.sub(r"/+", "/", candidate.strip().replace("\\", "/"))
+    if not normalized:
+        return "/"
+    if normalized.startswith("/"):
+        return normalized
+    return f"/{normalized.lstrip('/')}"
+
+
+def extract_mount_from_path(path: str, mounts: tuple[str, ...]) -> str | None:
+    """Return the leading mount segment if it's in ``mounts``, else None."""
+    rel = path.lstrip("/")
+    if not rel:
+        return None
+    mount, _, _ = rel.partition("/")
+    if mount in mounts:
+        return mount
+    return None
+
+
+def local_parent_path(path: str) -> str:
+    """Posix-style parent path (root = ``/``)."""
+    rel = path.lstrip("/")
+    if "/" not in rel:
+        return "/"
+    parent = rel.rsplit("/", 1)[0].strip("/")
+    if not parent:
+        return "/"
+    return f"/{parent}"
+
+
+def basename(path: str) -> str:
+    return path.rsplit("/", 1)[-1]
+
+
+def is_ancestor_of(candidate: str, target: str) -> bool:
+    """True iff ``candidate`` is a strict-or-equal ancestor of ``target``."""
+    if candidate == "/":
+        return target != "/"
+    cand = candidate.rstrip("/")
+    return target == cand or target.startswith(cand + "/")
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/__init__.py
new file mode 100644
index 0000000000..e191d70662
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/__init__.py
@@ -0,0 +1,7 @@
+"""Filesystem-middleware system prompt (cloud + desktop modes)."""
+
+from __future__ import annotations
+
+from .index import build_system_prompt
+
+__all__ = ["build_system_prompt"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/cloud.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/cloud.py
new file mode 100644
index 0000000000..98dbbaaabd
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/cloud.py
@@ -0,0 +1,71 @@
+"""Cloud-mode filesystem system prompt body."""
+
+from __future__ import annotations
+
+BODY = """
+## Filesystem Tools
+
+All file paths must start with `/`. Relative paths resolve against the
+current working directory (`cwd`, default `/documents`).
+
+- ls(path, offset=0, limit=200): list files and directories at the given path.
+- read_file(path, offset, limit): read a file (paginated) from the filesystem.
+- write_file(path, content): create a new text file in the workspace.
+- edit_file(path, old, new): exact string-replacement edit (lazy-loads KB
+  documents on first edit).
+- glob(pattern, path): find files matching a glob pattern.
+- grep(pattern, path, glob): substring search across files.
+- mkdir(path): create a folder under `/documents/` (committed at end of turn).
+- cd(path): change the current working directory.
+- pwd(): print the current working directory.
+- move_file(source, dest): move/rename a file under `/documents/`.
+- rm(path): delete a single file under `/documents/` (no `-r`).
+- rmdir(path): delete an empty directory under `/documents/`.
+- list_tree(path, max_depth, page_size): recursively list files/folders.
+
+## Persistence Rules
+
+- Files written under `/documents/<...>` are **persisted** at end of turn as
+  Documents in the user's knowledge base.
+- Files whose **basename** starts with `temp_` (e.g. `temp_plan.md` or
+  `/documents/temp_scratch.md`) are **discarded** at end of turn — use this
+  prefix for any scratch/working content you do NOT want saved.
+- All other paths (outside `/documents/` and not `temp_*`) are rejected.
+- mkdir/move_file/rm/rmdir are staged this turn and committed at end of
+  turn alongside any new/edited documents. Snapshot/revert is enabled
+  for every destructive operation when action logging is on.
+
+## Reading Documents Efficiently
+
+Documents are formatted as XML. Each document contains:
+- `<document_metadata>` — title, type, URL, etc.
+- `<chunk_index>` — a table of every chunk with its **line range** and a
+  `matched="true"` flag for chunks that matched the search query.
+- `<document_content>` — the actual chunks in original document order.
+
+**Workflow**: when reading a large document, read the first ~20 lines to see
+the `<chunk_index>`, identify chunks marked `matched="true"`, then use
+`read_file(path, offset=<start_line>, limit=<lines>)` to jump directly to
+those sections instead of reading the entire file sequentially.
+
+Use `<chunk id='...'>` values as citation IDs in your answers.
+
+## Priority List
+
+You receive a `<priority_documents>` system message each turn listing the
+top-K paths most relevant to the user's query (by hybrid search). Read those
+first — matched sections are flagged inside each document's `<chunk_index>`.
+
+## Workspace Tree
+
+You receive a `<workspace_tree>` system message each turn with the current
+folder/document layout. The tree may be truncated past a hard cap; in that
+case, drill into specific folders with `ls(...)` or `list_tree(...)`.
+
+## grep Line Numbers
+
+`grep` searches across both your in-memory edits and the indexed chunks in
+Postgres. State-cached files return real line numbers; database hits return
+`line=0` because their position depends on per-document XML layout — call
+`read_file(path)` to find the exact line.
+"""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/common.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/common.py
new file mode 100644
index 0000000000..81c2641182
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/common.py
@@ -0,0 +1,22 @@
+"""Mode-agnostic prompt fragments: header conventions + sandbox addendum."""
+
+from __future__ import annotations
+
+HEADER = """## Following Conventions
+
+- Read files before editing — understand existing content before making changes.
+- Mimic existing style, naming conventions, and patterns.
+- Never claim a file was created/updated unless filesystem tool output confirms success.
+- If a file write/edit fails, explicitly report the failure.
+"""
+
+SANDBOX_ADDENDUM = (
+    "\n- execute_code: run Python code in an isolated sandbox."
+    "\n\n## Code Execution"
+    "\n\nUse execute_code whenever a task benefits from running code."
+    " Never perform arithmetic manually."
+    "\n\nDocuments here are XML-wrapped markdown, not raw data files."
+    " To work with them programmatically, read the document first,"
+    " extract the data, write it as a clean file (CSV, JSON, etc.),"
+    " and then run your code against it."
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/desktop.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/desktop.py
new file mode 100644
index 0000000000..712b51c26d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/desktop.py
@@ -0,0 +1,49 @@
+"""Desktop-mode filesystem system prompt body."""
+
+from __future__ import annotations
+
+BODY = """
+## Local Folder Mode
+
+This chat operates directly on the user's local folders. Writes and edits
+hit disk immediately — there is no end-of-turn staging, no `/documents/`
+namespace, and no `temp_` semantics.
+
+## Filesystem Tools
+
+All file paths must start with `/` and use mount-prefixed absolute paths
+like `/<mount>/file.ext`. Relative paths resolve against the current working
+directory (`cwd`).
+
+- ls(path, offset=0, limit=200): list files and directories at the given path.
+- read_file(path, offset, limit): read a file (paginated) from disk.
+- write_file(path, content): write a file to disk.
+- edit_file(path, old, new): exact string-replacement edit on disk.
+- glob(pattern, path): find files matching a glob pattern.
+- grep(pattern, path, glob): substring search across files.
+- mkdir(path): create a directory on disk.
+- cd(path): change the current working directory.
+- pwd(): print the current working directory.
+- move_file(source, dest): move/rename a file.
+- rm(path): delete a single file from disk (no `-r`). NOT reversible.
+- rmdir(path): delete an empty directory from disk. NOT reversible.
+- list_tree(path, max_depth, page_size): recursively list files/folders.
+
+## Workflow Tips
+
+- If you are unsure which mounts are available, call `ls('/')` first.
+- For large trees, prefer `list_tree` then `grep` then `read_file` over
+  brute-force directory traversal.
+- Cross-mount moves are not supported.
+- Desktop deletes hit disk immediately and cannot be undone via the
+  agent's revert flow — confirm before calling `rm`/`rmdir`.
+
+## Priority List
+
+You may receive a `<priority_documents>` system message listing the top-K
+documents from the user's SurfSense knowledge base — these are cloud-ingested
+via connectors (Notion, Slack, etc.), not local files. Treat it as a hint:
+consult it when the task spans both local and cloud sources (e.g. drafting a
+local note from a Notion summary); skip when the task is purely about local
+files.
+"""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/index.py
new file mode 100644
index 0000000000..9d3cdbae39
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/system_prompt/index.py
@@ -0,0 +1,18 @@
+"""Public assembly of the FS system prompt for a given session."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+from .cloud import BODY as CLOUD_BODY
+from .common import HEADER, SANDBOX_ADDENDUM
+from .desktop import BODY as DESKTOP_BODY
+
+
+def build_system_prompt(mode: FilesystemMode, *, sandbox_available: bool) -> str:
+    """Assemble the FS prompt: common header + mode body + optional sandbox section."""
+    body = CLOUD_BODY if mode == FilesystemMode.CLOUD else DESKTOP_BODY
+    base = HEADER + body
+    if sandbox_available:
+        base += SANDBOX_ADDENDUM
+    return base
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/__init__.py
new file mode 100644
index 0000000000..6e490bb826
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/__init__.py
@@ -0,0 +1,31 @@
+"""Filesystem tool factories — one vertical slice per tool."""
+
+from __future__ import annotations
+
+from .cd import create_cd_tool
+from .edit_file import create_edit_file_tool
+from .execute_code import create_execute_code_tool
+from .list_tree import create_list_tree_tool
+from .ls import create_ls_tool
+from .mkdir import create_mkdir_tool
+from .move_file import create_move_file_tool
+from .pwd import create_pwd_tool
+from .read_file import create_read_file_tool
+from .rm import create_rm_tool
+from .rmdir import create_rmdir_tool
+from .write_file import create_write_file_tool
+
+__all__ = [
+    "create_cd_tool",
+    "create_edit_file_tool",
+    "create_execute_code_tool",
+    "create_list_tree_tool",
+    "create_ls_tool",
+    "create_mkdir_tool",
+    "create_move_file_tool",
+    "create_pwd_tool",
+    "create_read_file_tool",
+    "create_rm_tool",
+    "create_rmdir_tool",
+    "create_write_file_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/__init__.py
new file mode 100644
index 0000000000..4f7d3660a9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``cd`` — change the current working directory (cwd)."""
+
+from __future__ import annotations
+
+from .index import create_cd_tool
+
+__all__ = ["create_cd_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/description.py
new file mode 100644
index 0000000000..6d7b987c8c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/description.py
@@ -0,0 +1,19 @@
+"""Description string for ``cd`` (mode-agnostic)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_DESCRIPTION = """Changes the current working directory (cwd).
+
+Args:
+- path: absolute or relative directory path. Relative paths resolve against
+  the current cwd.
+
+The new cwd is used by other filesystem tools whenever a relative path is
+given. Returns the resolved cwd.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    return _DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/index.py
new file mode 100644
index 0000000000..8df6b9edb2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/cd/index.py
@@ -0,0 +1,80 @@
+"""``cd`` factory: resolve target, verify existence (staged + on-disk), update cwd."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated
+
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.path_resolution import resolve_relative
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_cd_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_cd(
+        path: Annotated[str, "Absolute or relative directory path to switch into."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        target = resolve_relative(mw, path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        backend = mw._get_backend(runtime)
+        try:
+            infos = await backend.als_info(validated)
+        except Exception as exc:  # pragma: no cover - defensive
+            return f"Error: {exc}"
+        staged_dirs = list(runtime.state.get("staged_dirs") or [])
+        files = runtime.state.get("files") or {}
+        cwd_exists = (
+            bool(infos)
+            or validated in staged_dirs
+            or any(p == validated for p in files)
+            or any(
+                isinstance(p, str) and p.startswith(validated.rstrip("/") + "/")
+                for p in files
+            )
+            or validated == "/"
+            or validated == DOCUMENTS_ROOT
+        )
+        if not cwd_exists:
+            return f"Error: directory '{validated}' not found."
+        return Command(
+            update={
+                "cwd": validated,
+                "messages": [
+                    ToolMessage(
+                        content=f"cwd changed to {validated}",
+                        tool_call_id=runtime.tool_call_id,
+                    )
+                ],
+            }
+        )
+
+    def sync_cd(
+        path: Annotated[str, "Absolute or relative directory path to switch into."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        return run_async_blocking(async_cd(path, runtime))
+
+    return StructuredTool.from_function(
+        name="cd",
+        description=description,
+        func=sync_cd,
+        coroutine=async_cd,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/__init__.py
new file mode 100644
index 0000000000..7413a69b9c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``edit_file`` — exact string replacement on a file."""
+
+from __future__ import annotations
+
+from .index import create_edit_file_tool
+
+__all__ = ["create_edit_file_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/description.py
new file mode 100644
index 0000000000..de2a476485
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/description.py
@@ -0,0 +1,28 @@
+"""Mode-specific description strings for ``edit_file``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Performs exact string replacements in files.
+
+IMPORTANT:
+- Read the file before editing.
+- Preserve exact indentation and formatting.
+- Edits to documents under `/documents/` are persisted at end of turn.
+- Edits to `temp_*` files are discarded at end of turn.
+"""
+
+_DESKTOP_DESCRIPTION = """Performs exact string replacements in files on disk.
+
+IMPORTANT:
+- Read the file before editing.
+- Preserve exact indentation and formatting.
+- Edits hit disk immediately.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/index.py
new file mode 100644
index 0000000000..324ef09b02
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/edit_file/index.py
@@ -0,0 +1,132 @@
+"""``edit_file`` factory: lazy-load KB doc, enforce cloud namespace, dispatch to backend."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated, Any
+
+from deepagents.backends.protocol import EditResult
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.mode import is_cloud
+from ...middleware.namespace_policy import check_cloud_write_namespace
+from ...middleware.path_resolution import resolve_relative
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_edit_file(
+        file_path: Annotated[
+            str,
+            "Absolute path to the file to edit. Relative paths resolve against the current cwd.",
+        ],
+        old_string: Annotated[
+            str,
+            "Exact text to replace. Must be unique unless replace_all is True.",
+        ],
+        new_string: Annotated[
+            str,
+            "Replacement text. Must differ from old_string.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        *,
+        replace_all: Annotated[
+            bool,
+            "If True, replace all occurrences of old_string. Defaults to False.",
+        ] = False,
+    ) -> Command | str:
+        target = resolve_relative(mw, file_path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        namespace_error = check_cloud_write_namespace(mw, validated, runtime)
+        if namespace_error:
+            return namespace_error
+
+        backend = mw._get_backend(runtime)
+        files_state = runtime.state.get("files") or {}
+        doc_id_to_attach: int | None = None
+
+        if (
+            is_cloud(mw._filesystem_mode)
+            and validated not in files_state
+            and isinstance(backend, KBPostgresBackend)
+        ):
+            loaded = await backend._load_file_data(validated)
+            if loaded is None:
+                return f"Error: File '{validated}' not found"
+            _, doc_id_to_attach = loaded
+
+        res: EditResult = await backend.aedit(
+            validated, old_string, new_string, replace_all=replace_all
+        )
+        if res.error:
+            return res.error
+
+        path = res.path or validated
+        files_update = res.files_update or {}
+        update: dict[str, Any] = {
+            "files": files_update,
+            "messages": [
+                ToolMessage(
+                    content=(
+                        f"Successfully replaced {res.occurrences} instance(s) "
+                        f"of the string in '{path}'"
+                    ),
+                    tool_call_id=runtime.tool_call_id,
+                )
+            ],
+        }
+        if is_cloud(mw._filesystem_mode):
+            update["dirty_paths"] = [path]
+            update["dirty_path_tool_calls"] = {path: runtime.tool_call_id}
+            if doc_id_to_attach is not None:
+                update["doc_id_by_path"] = {path: doc_id_to_attach}
+        return Command(update=update)
+
+    def sync_edit_file(
+        file_path: Annotated[
+            str,
+            "Absolute path to the file to edit. Relative paths resolve against the current cwd.",
+        ],
+        old_string: Annotated[
+            str,
+            "Exact text to replace. Must be unique unless replace_all is True.",
+        ],
+        new_string: Annotated[
+            str,
+            "Replacement text. Must differ from old_string.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        *,
+        replace_all: Annotated[
+            bool,
+            "If True, replace all occurrences of old_string. Defaults to False.",
+        ] = False,
+    ) -> Command | str:
+        return run_async_blocking(
+            async_edit_file(
+                file_path, old_string, new_string, runtime, replace_all=replace_all
+            )
+        )
+
+    return StructuredTool.from_function(
+        name="edit_file",
+        description=description,
+        func=sync_edit_file,
+        coroutine=async_edit_file,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/__init__.py
new file mode 100644
index 0000000000..8ce0af270e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``execute_code`` — run Python code in an isolated sandbox."""
+
+from __future__ import annotations
+
+from .index import create_execute_code_tool
+
+__all__ = ["create_execute_code_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/description.py
new file mode 100644
index 0000000000..89415c2f34
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/description.py
@@ -0,0 +1,21 @@
+"""Description string for ``execute_code`` (mode-agnostic)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_DESCRIPTION = """Executes Python code in an isolated sandbox environment.
+
+Common data-science packages are pre-installed (pandas, numpy, matplotlib,
+scipy, scikit-learn).
+
+Usage notes:
+- No outbound network access.
+- Returns combined stdout/stderr with exit code.
+- Use print() to produce output.
+- Use the optional timeout parameter to override the default timeout.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    return _DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/helpers.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/helpers.py
new file mode 100644
index 0000000000..cda9f535d6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/helpers.py
@@ -0,0 +1,87 @@
+"""Sandbox-execution helpers for ``execute_code``.
+
+Wraps user-supplied code in a heredoc and dispatches it to the Daytona
+sandbox associated with the current chat thread, with a single retry on
+sandbox failure.
+"""
+
+from __future__ import annotations
+
+import logging
+import secrets
+from typing import TYPE_CHECKING
+
+from daytona.common.errors import DaytonaError
+from langchain.tools import ToolRuntime
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.sandbox import (
+    _evict_sandbox_cache,
+    delete_sandbox,
+    get_or_create_sandbox,
+)
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+logger = logging.getLogger(__name__)
+
+MAX_EXECUTE_TIMEOUT = 300
+
+
+def wrap_as_python(code: str) -> str:
+    """Wrap ``code`` in a unique-sentinel heredoc for shell execution."""
+    sentinel = f"_PYEOF_{secrets.token_hex(8)}"
+    return f"python3 << '{sentinel}'\n{code}\n{sentinel}"
+
+
+async def execute_in_sandbox(
+    mw: SurfSenseFilesystemMiddleware,
+    command: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    timeout: int | None,
+) -> str:
+    """Top-level entry: wraps + retries once on sandbox failure."""
+    assert mw._thread_id is not None
+    command = wrap_as_python(command)
+    try:
+        return await _try_sandbox_execute(mw, command, runtime, timeout)
+    except (DaytonaError, Exception) as first_err:
+        logger.warning(
+            "Sandbox execute failed for thread %s, retrying: %s",
+            mw._thread_id,
+            first_err,
+        )
+        try:
+            await delete_sandbox(mw._thread_id)
+        except Exception:
+            _evict_sandbox_cache(mw._thread_id)
+        try:
+            return await _try_sandbox_execute(mw, command, runtime, timeout)
+        except Exception:
+            logger.exception("Sandbox retry also failed for thread %s", mw._thread_id)
+            return "Error: Code execution is temporarily unavailable. Please try again."
+
+
+async def _try_sandbox_execute(
+    mw: SurfSenseFilesystemMiddleware,
+    command: str,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    timeout: int | None,
+) -> str:
+    """One sandbox-execute attempt: get/create sandbox, run, format output."""
+    sandbox, _is_new = await get_or_create_sandbox(mw._thread_id)
+    result = await sandbox.aexecute(command, timeout=timeout)
+    output = (result.output or "").strip()
+    if not output and result.exit_code == 0:
+        return (
+            "[Code executed successfully but produced no output. "
+            "Use print() to display results, then try again.]"
+        )
+    parts = [result.output]
+    if result.exit_code is not None:
+        status = "succeeded" if result.exit_code == 0 else "failed"
+        parts.append(f"\n[Command {status} with exit code {result.exit_code}]")
+    if result.truncated:
+        parts.append("\n[Output was truncated due to size limits]")
+    return "".join(parts)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/index.py
new file mode 100644
index 0000000000..2711636e4e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/execute_code/index.py
@@ -0,0 +1,58 @@
+"""``execute_code`` factory: bounds-check timeout, dispatch to the sandbox."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated
+
+from langchain.tools import ToolRuntime
+from langchain_core.tools import BaseTool, StructuredTool
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+
+from ...middleware.async_dispatch import run_async_blocking
+from .description import select_description
+from .helpers import MAX_EXECUTE_TIMEOUT, execute_in_sandbox
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_execute_code_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    def sync_execute_code(
+        command: Annotated[str, "Python code to execute. Use print() to see output."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        timeout: Annotated[
+            int | None,
+            "Optional timeout in seconds.",
+        ] = None,
+    ) -> str:
+        if timeout is not None:
+            if timeout < 0:
+                return f"Error: timeout must be non-negative, got {timeout}."
+            if timeout > MAX_EXECUTE_TIMEOUT:
+                return f"Error: timeout {timeout}s exceeds maximum ({MAX_EXECUTE_TIMEOUT}s)."
+        return run_async_blocking(execute_in_sandbox(mw, command, runtime, timeout))
+
+    async def async_execute_code(
+        command: Annotated[str, "Python code to execute. Use print() to see output."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        timeout: Annotated[
+            int | None,
+            "Optional timeout in seconds.",
+        ] = None,
+    ) -> str:
+        if timeout is not None:
+            if timeout < 0:
+                return f"Error: timeout must be non-negative, got {timeout}."
+            if timeout > MAX_EXECUTE_TIMEOUT:
+                return f"Error: timeout {timeout}s exceeds maximum ({MAX_EXECUTE_TIMEOUT}s)."
+        return await execute_in_sandbox(mw, command, runtime, timeout)
+
+    return StructuredTool.from_function(
+        name="execute_code",
+        description=description,
+        func=sync_execute_code,
+        coroutine=async_execute_code,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/glob/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/glob/__init__.py
new file mode 100644
index 0000000000..7c8c387e15
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/glob/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``glob`` — description override (the tool comes from the base middleware)."""
+
+from __future__ import annotations
+
+from .description import select_description
+
+__all__ = ["select_description"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/glob/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/glob/description.py
new file mode 100644
index 0000000000..d022f9a7a7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/glob/description.py
@@ -0,0 +1,15 @@
+"""Description string for ``glob`` (mode-agnostic)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_DESCRIPTION = """Find files matching a glob pattern.
+
+Supports standard glob patterns: `*`, `**`, `?`.
+Returns absolute file paths.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    return _DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/grep/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/grep/__init__.py
new file mode 100644
index 0000000000..8bdb129fbe
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/grep/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``grep`` — description override (the tool comes from the base middleware)."""
+
+from __future__ import annotations
+
+from .description import select_description
+
+__all__ = ["select_description"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/grep/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/grep/description.py
new file mode 100644
index 0000000000..5d7c393a9e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/grep/description.py
@@ -0,0 +1,24 @@
+"""Mode-specific description strings for ``grep``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Search for a literal text pattern across files.
+
+Searches both your in-memory edits and the indexed chunks in Postgres.
+State-cached file matches include real line numbers; database hits return
+`line=0` because their position depends on per-document XML layout — call
+`read_file(path)` afterwards to find the exact line.
+"""
+
+_DESKTOP_DESCRIPTION = """Search for a literal text pattern across files.
+
+Searches files on disk and any in-memory edits. Returns real line numbers.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/__init__.py
new file mode 100644
index 0000000000..33b2a2b23d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``list_tree`` — recursively list files / folders in one bounded call."""
+
+from __future__ import annotations
+
+from .index import create_list_tree_tool
+
+__all__ = ["create_list_tree_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/description.py
new file mode 100644
index 0000000000..a24230fb01
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/description.py
@@ -0,0 +1,37 @@
+"""Mode-specific description strings for ``list_tree``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Lists files/folders recursively in a single bounded call.
+
+Args:
+- path: absolute path to start from. Defaults to `/documents`.
+- max_depth: recursion depth limit (default 8).
+- page_size: maximum number of entries returned (max 1000).
+- include_files / include_dirs: filter returned entry types.
+
+Returns JSON with:
+- entries: [{path, is_dir, size, modified_at, depth}]
+- truncated: true when additional entries were omitted due to page_size.
+"""
+
+_DESKTOP_DESCRIPTION = """Lists files/folders recursively in a single bounded call.
+
+Args:
+- path: absolute path to start from. Defaults to `/`.
+- max_depth: recursion depth limit (default 8).
+- page_size: maximum number of entries returned (max 1000).
+- include_files / include_dirs: filter returned entry types.
+
+Returns JSON with:
+- entries: [{path, is_dir, size, modified_at, depth}]
+- truncated: true when additional entries were omitted due to page_size.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/index.py
new file mode 100644
index 0000000000..8bad88a748
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/list_tree/index.py
@@ -0,0 +1,101 @@
+"""``list_tree`` factory: bounded recursive listing across cloud / desktop backends."""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Annotated
+
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.tools import BaseTool, StructuredTool
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.path_resolution import resolve_list_target_path
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_list_tree_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_list_tree(
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        path: Annotated[
+            str,
+            "Absolute path to start from. Defaults to /documents in cloud mode.",
+        ] = "",
+        max_depth: Annotated[int, "Recursion depth limit. Default 8."] = 8,
+        page_size: Annotated[int, "Maximum entries returned. Max 1000."] = 500,
+        include_files: Annotated[bool, "Include file entries."] = True,
+        include_dirs: Annotated[bool, "Include directory entries."] = True,
+    ) -> str:
+        if max_depth < 0:
+            return "Error: max_depth must be >= 0."
+        if page_size < 1:
+            return "Error: page_size must be >= 1."
+        if not include_files and not include_dirs:
+            return "Error: include_files and include_dirs cannot both be false."
+
+        target = resolve_list_target_path(mw, path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        backend = mw._get_backend(runtime)
+        if isinstance(backend, KBPostgresBackend):
+            result = await backend.alist_tree_listing(
+                validated,
+                max_depth=max_depth,
+                page_size=page_size,
+                include_files=include_files,
+                include_dirs=include_dirs,
+            )
+        elif hasattr(backend, "alist_tree"):
+            result = await backend.alist_tree(
+                validated,
+                max_depth=max_depth,
+                page_size=page_size,
+                include_files=include_files,
+                include_dirs=include_dirs,
+            )
+        else:
+            return "Error: list_tree is not supported by the active backend."
+
+        if isinstance(result, dict) and isinstance(result.get("error"), str):
+            return result["error"]
+        return json.dumps(result, ensure_ascii=True)
+
+    def sync_list_tree(
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        path: Annotated[
+            str,
+            "Absolute path to start from. Defaults to /documents in cloud mode.",
+        ] = "",
+        max_depth: Annotated[int, "Recursion depth limit. Default 8."] = 8,
+        page_size: Annotated[int, "Maximum entries returned. Max 1000."] = 500,
+        include_files: Annotated[bool, "Include file entries."] = True,
+        include_dirs: Annotated[bool, "Include directory entries."] = True,
+    ) -> str:
+        return run_async_blocking(
+            async_list_tree(
+                runtime,
+                path=path,
+                max_depth=max_depth,
+                page_size=page_size,
+                include_files=include_files,
+                include_dirs=include_dirs,
+            )
+        )
+
+    return StructuredTool.from_function(
+        name="list_tree",
+        description=description,
+        func=sync_list_tree,
+        coroutine=async_list_tree,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/__init__.py
new file mode 100644
index 0000000000..b409d34699
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``ls`` — list files and directories at a path."""
+
+from __future__ import annotations
+
+from .index import create_ls_tool
+
+__all__ = ["create_ls_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/description.py
new file mode 100644
index 0000000000..8c7e301dc0
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/description.py
@@ -0,0 +1,29 @@
+"""Mode-specific description strings for ``ls``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Lists files and directories at the given path.
+
+Usage:
+- Provide an absolute path under `/documents` (relative paths resolve under
+  the current cwd, which defaults to `/documents`).
+- For very large folders, use `offset` and `limit` to paginate the listing.
+- Returns one entry per line; directories end with a trailing `/`.
+"""
+
+_DESKTOP_DESCRIPTION = """Lists files and directories at the given path.
+
+Usage:
+- Provide an absolute path using a mount prefix (e.g. `/<mount>/sub/dir`).
+  Use `ls('/')` to discover available mounts.
+- For very large folders, use `offset` and `limit` to paginate the listing.
+- Returns one entry per line; directories end with a trailing `/`.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/index.py
new file mode 100644
index 0000000000..70f31dd044
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/ls/index.py
@@ -0,0 +1,96 @@
+"""``ls`` factory: resolve target, page through backend listing."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated
+
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.tools import BaseTool, StructuredTool
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import paginate_listing
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.path_resolution import resolve_list_target_path
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_ls_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_ls(
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        path: Annotated[
+            str,
+            "Absolute path to the directory to list. Relative paths resolve against the current cwd.",
+        ] = "",
+        offset: Annotated[
+            int,
+            "Number of entries to skip. Use for paginating large folders. Defaults to 0.",
+        ] = 0,
+        limit: Annotated[
+            int,
+            "Maximum number of entries to return. Defaults to 200.",
+        ] = 200,
+    ) -> str:
+        target = resolve_list_target_path(mw, path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+        if offset < 0:
+            offset = 0
+        if limit < 1:
+            limit = 1
+        backend = mw._get_backend(runtime)
+        infos = await backend.als_info(validated)
+        page = paginate_listing(infos, offset=offset, limit=limit)
+        paths = [
+            f"{fi.get('path', '')}/" if fi.get("is_dir") else fi.get("path", "")
+            for fi in page
+        ]
+        total = len(infos)
+        shown = len(page)
+        header = (
+            f"{validated} ({shown} of {total} entries"
+            f"{f', offset={offset}' if offset else ''})"
+        )
+        if not paths:
+            return f"{header}\n(empty)"
+        body = "\n".join(paths)
+        if total > offset + shown:
+            body += (
+                f"\n... {total - offset - shown} more — call ls("
+                f"'{validated}', offset={offset + shown}, limit={limit})"
+            )
+        return f"{header}\n{body}"
+
+    def sync_ls(
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        path: Annotated[
+            str,
+            "Absolute path to the directory to list. Relative paths resolve against the current cwd.",
+        ] = "",
+        offset: Annotated[
+            int,
+            "Number of entries to skip. Use for paginating large folders. Defaults to 0.",
+        ] = 0,
+        limit: Annotated[
+            int,
+            "Maximum number of entries to return. Defaults to 200.",
+        ] = 200,
+    ) -> str:
+        return run_async_blocking(
+            async_ls(runtime, path=path, offset=offset, limit=limit)
+        )
+
+    return StructuredTool.from_function(
+        name="ls",
+        description=description,
+        func=sync_ls,
+        coroutine=async_ls,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/__init__.py
new file mode 100644
index 0000000000..42149b7fd3
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``mkdir`` — create a directory."""
+
+from __future__ import annotations
+
+from .index import create_mkdir_tool
+
+__all__ = ["create_mkdir_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/description.py
new file mode 100644
index 0000000000..1c86e72f72
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/description.py
@@ -0,0 +1,33 @@
+"""Mode-specific description strings for ``mkdir``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Creates a directory under `/documents/`.
+
+Stages the folder for end-of-turn commit; the Folder row is inserted only
+after the agent's turn finishes successfully.
+
+Args:
+- path: absolute path of the new directory (must start with
+  `/documents/`).
+
+Notes:
+- Parent folders are created as needed.
+"""
+
+_DESKTOP_DESCRIPTION = """Creates a directory on disk.
+
+Args:
+- path: absolute mount-prefixed path of the new directory.
+
+Notes:
+- Parent folders are created as needed.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/index.py
new file mode 100644
index 0000000000..788381faaf
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/mkdir/index.py
@@ -0,0 +1,94 @@
+"""``mkdir`` factory: cloud stages for end-of-turn; desktop hits disk immediately."""
+
+from __future__ import annotations
+
+import asyncio
+from typing import TYPE_CHECKING, Annotated, Any
+
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.mode import is_cloud
+from ...middleware.path_resolution import resolve_relative
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_mkdir_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_mkdir(
+        path: Annotated[str, "Absolute or relative directory path to create."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        target = resolve_relative(mw, path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        if is_cloud(mw._filesystem_mode):
+            if not (
+                validated.startswith(DOCUMENTS_ROOT + "/")
+                or validated == DOCUMENTS_ROOT
+            ):
+                return (
+                    "Error: cloud mkdir must target a path under /documents/ "
+                    f"(got '{validated}')."
+                )
+            return Command(
+                update={
+                    "staged_dirs": [validated],
+                    "staged_dir_tool_calls": {
+                        validated: runtime.tool_call_id,
+                    },
+                    "messages": [
+                        ToolMessage(
+                            content=(
+                                f"Staged directory '{validated}' (will be created "
+                                "at end of turn)."
+                            ),
+                            tool_call_id=runtime.tool_call_id,
+                        )
+                    ],
+                }
+            )
+
+        backend = mw._get_backend(runtime)
+        local_method = getattr(backend, "amkdir", None) or getattr(
+            backend, "mkdir", None
+        )
+        if callable(local_method):
+            try:
+                res: Any = local_method(validated, parents=True, exist_ok=True)
+                if asyncio.iscoroutine(res):
+                    await res
+            except TypeError:
+                res = local_method(validated)
+                if asyncio.iscoroutine(res):
+                    await res
+            except Exception as exc:  # pragma: no cover
+                return f"Error: {exc}"
+        return f"Created directory {validated}"
+
+    def sync_mkdir(
+        path: Annotated[str, "Absolute or relative directory path to create."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        return run_async_blocking(async_mkdir(path, runtime))
+
+    return StructuredTool.from_function(
+        name="mkdir",
+        description=description,
+        func=sync_mkdir,
+        coroutine=async_mkdir,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/__init__.py
new file mode 100644
index 0000000000..307d863439
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``move_file`` — move or rename a file."""
+
+from __future__ import annotations
+
+from .index import create_move_file_tool
+
+__all__ = ["create_move_file_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/description.py
new file mode 100644
index 0000000000..fdba40b297
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/description.py
@@ -0,0 +1,33 @@
+"""Mode-specific description strings for ``move_file``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Moves or renames a file or folder.
+
+Use absolute paths for both source and destination.
+
+Notes:
+- `move_file` is staged this turn and committed at end of turn.
+- The agent cannot overwrite an existing destination — pass a fresh dest
+  path or move the existing destination away first.
+- The anonymous uploaded document is read-only and cannot be moved.
+- Rename is a special case of move (same folder, different filename).
+"""
+
+_DESKTOP_DESCRIPTION = """Moves or renames a file or folder on disk.
+
+Use mount-prefixed absolute paths for both source and destination
+(e.g. `/<mount>/old.txt` -> `/<mount>/new.txt`).
+
+Notes:
+- Cross-mount moves are not supported.
+- Rename is a special case of move (same folder, different filename).
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/helpers.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/helpers.py
new file mode 100644
index 0000000000..7613f62f10
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/helpers.py
@@ -0,0 +1,108 @@
+"""Cloud-mode move helper: stages source/dest into pending_moves + files."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+from app.agents.new_chat.state_reducers import _CLEAR
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+async def cloud_move_file(
+    mw: SurfSenseFilesystemMiddleware,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    source: str,
+    dest: str,
+    *,
+    overwrite: bool,
+) -> Command | str:
+    """Stage a source/dest move in cloud mode (commit at end of turn)."""
+    backend = mw._get_backend(runtime)
+    if not isinstance(backend, KBPostgresBackend):
+        return "Error: cloud move requires KBPostgresBackend."
+
+    if source == dest:
+        return f"Moved '{source}' to '{dest}' (no-op)"
+    if overwrite:
+        return (
+            "Error: overwrite=True is not supported in cloud mode. Move/edit "
+            "the destination doc explicitly first."
+        )
+    if not source.startswith(DOCUMENTS_ROOT + "/"):
+        return (
+            f"Error: cloud move_file source must be under /documents/ (got '{source}')."
+        )
+    if not dest.startswith(DOCUMENTS_ROOT + "/"):
+        return (
+            "Error: cloud move_file destination must be under /documents/ (got "
+            f"'{dest}')."
+        )
+    anon = runtime.state.get("kb_anon_doc") or {}
+    if isinstance(anon, dict):
+        anon_path = str(anon.get("path") or "")
+        if anon_path and (anon_path in (source, dest)):
+            return "Error: the anonymous uploaded document is read-only."
+
+    files = runtime.state.get("files") or {}
+    doc_id_by_path = runtime.state.get("doc_id_by_path") or {}
+    pending_moves = list(runtime.state.get("pending_moves") or [])
+
+    if dest in files:
+        return f"Error: destination '{dest}' already exists."
+    if any(move.get("dest") == dest for move in pending_moves):
+        return f"Error: destination '{dest}' already exists."
+    if dest != source:
+        existing_dest = await backend._load_file_data(dest)
+        if existing_dest is not None:
+            return f"Error: destination '{dest}' already exists."
+
+    source_file_data = files.get(source)
+    source_doc_id = doc_id_by_path.get(source)
+    if source_file_data is None:
+        loaded = await backend._load_file_data(source)
+        if loaded is None:
+            return f"Error: source '{source}' not found."
+        source_file_data, loaded_doc_id = loaded
+        if source_doc_id is None:
+            source_doc_id = loaded_doc_id
+
+    files_update: dict[str, Any] = {source: None, dest: source_file_data}
+    update: dict[str, Any] = {
+        "files": files_update,
+        "pending_moves": [
+            {
+                "source": source,
+                "dest": dest,
+                "overwrite": False,
+                "tool_call_id": runtime.tool_call_id,
+            }
+        ],
+        "messages": [
+            ToolMessage(
+                content=(f"Moved '{source}' to '{dest}' (will commit at end of turn)."),
+                tool_call_id=runtime.tool_call_id,
+            )
+        ],
+    }
+
+    doc_id_update: dict[str, int | None] = {source: None}
+    if source_doc_id is not None:
+        doc_id_update[dest] = source_doc_id
+    update["doc_id_by_path"] = doc_id_update
+
+    dirty_paths = list(runtime.state.get("dirty_paths") or [])
+    if source in dirty_paths:
+        new_dirty: list[Any] = [_CLEAR]
+        for entry in dirty_paths:
+            new_dirty.append(dest if entry == source else entry)
+        update["dirty_paths"] = new_dirty
+    return Command(update=update)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/index.py
new file mode 100644
index 0000000000..d905359901
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/move_file/index.py
@@ -0,0 +1,96 @@
+"""``move_file`` factory: dispatches cloud (staged) vs desktop (direct disk) moves."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated, Any
+
+from deepagents.backends.protocol import WriteResult
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.mode import is_cloud
+from ...middleware.path_resolution import resolve_move_target_path
+from .description import select_description
+from .helpers import cloud_move_file
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_move_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_move_file(
+        source_path: Annotated[str, "Absolute or relative source path."],
+        destination_path: Annotated[str, "Absolute or relative destination path."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        *,
+        overwrite: Annotated[
+            bool,
+            "If True, replace existing destination. Cloud mode rejects True. Defaults to False.",
+        ] = False,
+    ) -> Command | str:
+        if not source_path.strip() or not destination_path.strip():
+            return "Error: source_path and destination_path are required."
+
+        source = resolve_move_target_path(mw, source_path, runtime)
+        dest = resolve_move_target_path(mw, destination_path, runtime)
+        try:
+            validated_source = validate_path(source)
+            validated_dest = validate_path(dest)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        if is_cloud(mw._filesystem_mode):
+            return await cloud_move_file(
+                mw,
+                runtime,
+                validated_source,
+                validated_dest,
+                overwrite=overwrite,
+            )
+
+        backend = mw._get_backend(runtime)
+        res: WriteResult = await backend.amove(
+            validated_source, validated_dest, overwrite=overwrite
+        )
+        if res.error:
+            return res.error
+        update: dict[str, Any] = {
+            "messages": [
+                ToolMessage(
+                    content=f"Moved '{validated_source}' to '{res.path or validated_dest}'",
+                    tool_call_id=runtime.tool_call_id,
+                )
+            ],
+        }
+        if res.files_update is not None:
+            update["files"] = res.files_update
+        return Command(update=update)
+
+    def sync_move_file(
+        source_path: Annotated[str, "Absolute or relative source path."],
+        destination_path: Annotated[str, "Absolute or relative destination path."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        *,
+        overwrite: Annotated[
+            bool,
+            "If True, replace existing destination. Cloud mode rejects True. Defaults to False.",
+        ] = False,
+    ) -> Command | str:
+        return run_async_blocking(
+            async_move_file(source_path, destination_path, runtime, overwrite=overwrite)
+        )
+
+    return StructuredTool.from_function(
+        name="move_file",
+        description=description,
+        func=sync_move_file,
+        coroutine=async_move_file,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/__init__.py
new file mode 100644
index 0000000000..8be37c6683
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``pwd`` — print the current working directory."""
+
+from __future__ import annotations
+
+from .index import create_pwd_tool
+
+__all__ = ["create_pwd_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/description.py
new file mode 100644
index 0000000000..594a38843e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/description.py
@@ -0,0 +1,11 @@
+"""Description string for ``pwd`` (mode-agnostic)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_DESCRIPTION = """Prints the current working directory."""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    return _DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/index.py
new file mode 100644
index 0000000000..c15b67114b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/pwd/index.py
@@ -0,0 +1,37 @@
+"""``pwd`` factory: read the cwd from state."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from langchain.tools import ToolRuntime
+from langchain_core.tools import BaseTool, StructuredTool
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+
+from ...middleware.path_resolution import current_cwd
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_pwd_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    def sync_pwd(
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> str:
+        return current_cwd(mw, runtime)
+
+    async def async_pwd(
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> str:
+        return current_cwd(mw, runtime)
+
+    return StructuredTool.from_function(
+        name="pwd",
+        description=description,
+        func=sync_pwd,
+        coroutine=async_pwd,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/__init__.py
new file mode 100644
index 0000000000..681ac6f161
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``read_file`` — read a file (paginated) from the filesystem."""
+
+from __future__ import annotations
+
+from .index import create_read_file_tool
+
+__all__ = ["create_read_file_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/description.py
new file mode 100644
index 0000000000..9b5d7623f8
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/description.py
@@ -0,0 +1,22 @@
+"""Description string for ``read_file`` (mode-agnostic)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_DESCRIPTION = """Reads a file from the filesystem.
+
+Usage:
+- By default, reads up to 100 lines from the beginning.
+- Use `offset` and `limit` for pagination when files are large.
+- Results include line numbers.
+- Documents contain a `<chunk_index>` near the top listing every chunk with
+  its line range and a `matched="true"` flag for search-relevant chunks.
+  Read the index first, then jump to matched chunks with
+  `read_file(path, offset=<start_line>, limit=<num_lines>)`.
+- Use chunk IDs (`<chunk id='...'>`) as citations in answers.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    return _DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/index.py
new file mode 100644
index 0000000000..8b0a1a1c8d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/read_file/index.py
@@ -0,0 +1,100 @@
+"""``read_file`` factory: state-cache lookup, then lazy KB load, then disk read."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated, Any
+
+from deepagents.backends.utils import format_read_response, validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.path_resolution import resolve_relative
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_read_file(
+        file_path: Annotated[
+            str,
+            "Absolute path to the file to read. Relative paths resolve against the current cwd.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        offset: Annotated[
+            int,
+            "Line number to start reading from (0-indexed).",
+        ] = 0,
+        limit: Annotated[
+            int,
+            "Maximum number of lines to read.",
+        ] = 100,
+    ) -> Command | str:
+        target = resolve_relative(mw, file_path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        files = runtime.state.get("files") or {}
+        if validated in files:
+            return format_read_response(files[validated], offset, limit)
+
+        backend = mw._get_backend(runtime)
+        if isinstance(backend, KBPostgresBackend):
+            loaded = await backend._load_file_data(validated)
+            if loaded is None:
+                return f"Error: File '{validated}' not found"
+            file_data, doc_id = loaded
+            rendered = format_read_response(file_data, offset, limit)
+            update: dict[str, Any] = {
+                "files": {validated: file_data},
+                "messages": [
+                    ToolMessage(
+                        content=rendered,
+                        tool_call_id=runtime.tool_call_id,
+                    )
+                ],
+            }
+            if doc_id is not None:
+                update["doc_id_by_path"] = {validated: doc_id}
+            return Command(update=update)
+
+        try:
+            rendered = await backend.aread(validated, offset=offset, limit=limit)
+        except Exception as exc:  # pragma: no cover - defensive
+            return f"Error: {exc}"
+        return rendered
+
+    def sync_read_file(
+        file_path: Annotated[
+            str,
+            "Absolute path to the file to read. Relative paths resolve against the current cwd.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        offset: Annotated[
+            int,
+            "Line number to start reading from (0-indexed).",
+        ] = 0,
+        limit: Annotated[
+            int,
+            "Maximum number of lines to read.",
+        ] = 100,
+    ) -> Command | str:
+        return run_async_blocking(async_read_file(file_path, runtime, offset, limit))
+
+    return StructuredTool.from_function(
+        name="read_file",
+        description=description,
+        func=sync_read_file,
+        coroutine=async_read_file,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/__init__.py
new file mode 100644
index 0000000000..866a74ad8f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``rm`` — delete a single file."""
+
+from __future__ import annotations
+
+from .index import create_rm_tool
+
+__all__ = ["create_rm_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/description.py
new file mode 100644
index 0000000000..a9e120e7cf
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/description.py
@@ -0,0 +1,38 @@
+"""Mode-specific description strings for ``rm``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Deletes a single file under `/documents/`.
+
+Mirrors POSIX `rm path` (no `-r`, no glob expansion). Stages the deletion
+for end-of-turn commit; the row is removed only after the agent's turn
+finishes successfully.
+
+Args:
+- path: absolute or relative file path. Cannot point at a directory — use
+  `rmdir` for empty folders. Cannot target the root or `/documents`.
+
+Notes:
+- The action is reversible via the per-action revert flow when action
+  logging is enabled.
+- The anonymous uploaded document is read-only and cannot be deleted.
+"""
+
+_DESKTOP_DESCRIPTION = """Deletes a single file from disk.
+
+Mirrors POSIX `rm path` (no `-r`, no glob expansion). The deletion hits
+disk immediately. Desktop deletes are NOT reversible via the agent's
+revert flow.
+
+Args:
+- path: absolute mount-prefixed file path. Cannot point at a directory —
+  use `rmdir` for empty folders.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/helpers.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/helpers.py
new file mode 100644
index 0000000000..8a02544d83
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/helpers.py
@@ -0,0 +1,130 @@
+"""Cloud and desktop ``rm`` branches.
+
+Both branches receive an already-resolved + validated absolute path.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from deepagents.backends.protocol import WriteResult
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+from app.agents.new_chat.state_reducers import _CLEAR
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+async def cloud_rm(
+    mw: SurfSenseFilesystemMiddleware,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    validated: str,
+) -> Command | str:
+    """Stage a deletion in cloud mode (commit at end of turn)."""
+    if validated in ("/", DOCUMENTS_ROOT):
+        return f"Error: refusing to rm '{validated}'."
+    if not validated.startswith(DOCUMENTS_ROOT + "/"):
+        return (
+            f"Error: cloud rm must target a path under /documents/ (got '{validated}')."
+        )
+
+    anon = runtime.state.get("kb_anon_doc") or {}
+    if isinstance(anon, dict) and str(anon.get("path") or "") == validated:
+        return "Error: the anonymous uploaded document is read-only."
+
+    staged_dirs = list(runtime.state.get("staged_dirs") or [])
+    if validated in staged_dirs:
+        return f"Error: '{validated}' is a directory. Use rmdir for empty directories."
+    pending_dir_deletes = list(runtime.state.get("pending_dir_deletes") or [])
+    if any(
+        isinstance(d, dict) and d.get("path") == validated for d in pending_dir_deletes
+    ):
+        return f"Error: '{validated}' is already queued for rmdir."
+
+    backend = mw._get_backend(runtime)
+    if isinstance(backend, KBPostgresBackend):
+        children = await backend.als_info(validated)
+        if children:
+            return (
+                f"Error: '{validated}' is a directory. Use rmdir for empty directories."
+            )
+
+    pending_deletes = list(runtime.state.get("pending_deletes") or [])
+    if any(isinstance(d, dict) and d.get("path") == validated for d in pending_deletes):
+        return f"'{validated}' is already queued for deletion."
+
+    files_state = runtime.state.get("files") or {}
+    doc_id_by_path = runtime.state.get("doc_id_by_path") or {}
+    resolved_doc_id: int | None = doc_id_by_path.get(validated)
+    if (
+        validated not in files_state
+        and resolved_doc_id is None
+        and isinstance(backend, KBPostgresBackend)
+    ):
+        loaded = await backend._load_file_data(validated)
+        if loaded is None:
+            return f"Error: file '{validated}' not found."
+        _, resolved_doc_id = loaded
+
+    files_update: dict[str, Any] = {validated: None}
+    update: dict[str, Any] = {
+        "pending_deletes": [
+            {
+                "path": validated,
+                "tool_call_id": runtime.tool_call_id,
+            }
+        ],
+        "files": files_update,
+        "doc_id_by_path": {validated: None},
+        "messages": [
+            ToolMessage(
+                content=(
+                    f"Staged delete of '{validated}' (will commit at end of turn)."
+                ),
+                tool_call_id=runtime.tool_call_id,
+            )
+        ],
+    }
+
+    dirty_paths = list(runtime.state.get("dirty_paths") or [])
+    if validated in dirty_paths:
+        new_dirty: list[Any] = [_CLEAR]
+        for entry in dirty_paths:
+            if entry != validated:
+                new_dirty.append(entry)
+        update["dirty_paths"] = new_dirty
+        update["dirty_path_tool_calls"] = {validated: None}
+
+    return Command(update=update)
+
+
+async def desktop_rm(
+    mw: SurfSenseFilesystemMiddleware,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    validated: str,
+) -> Command | str:
+    """Hit disk immediately in desktop mode."""
+    backend = mw._get_backend(runtime)
+    adelete = getattr(backend, "adelete_file", None)
+    if not callable(adelete):
+        return "Error: rm is not supported by the active backend."
+    res: WriteResult = await adelete(validated)
+    if res.error:
+        return res.error
+    return Command(
+        update={
+            "files": {validated: None},
+            "messages": [
+                ToolMessage(
+                    content=f"Deleted file '{res.path or validated}'",
+                    tool_call_id=runtime.tool_call_id,
+                )
+            ],
+        }
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/index.py
new file mode 100644
index 0000000000..0c4e2fc719
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rm/index.py
@@ -0,0 +1,61 @@
+"""``rm`` factory: resolve + validate the path, then dispatch to cloud / desktop."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated
+
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.mode import is_cloud
+from ...middleware.path_resolution import resolve_relative
+from .description import select_description
+from .helpers import cloud_rm, desktop_rm
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_rm_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_rm(
+        path: Annotated[
+            str,
+            "Absolute or relative path to the file to delete.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        if not path or not path.strip():
+            return "Error: path is required."
+
+        target = resolve_relative(mw, path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        if is_cloud(mw._filesystem_mode):
+            return await cloud_rm(mw, runtime, validated)
+        return await desktop_rm(mw, runtime, validated)
+
+    def sync_rm(
+        path: Annotated[
+            str,
+            "Absolute or relative path to the file to delete.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        return run_async_blocking(async_rm(path, runtime))
+
+    return StructuredTool.from_function(
+        name="rm",
+        description=description,
+        func=sync_rm,
+        coroutine=async_rm,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/__init__.py
new file mode 100644
index 0000000000..24e5e1b21b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``rmdir`` — delete an empty directory."""
+
+from __future__ import annotations
+
+from .index import create_rmdir_tool
+
+__all__ = ["create_rmdir_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/description.py
new file mode 100644
index 0000000000..2b72f815b3
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/description.py
@@ -0,0 +1,42 @@
+"""Mode-specific description strings for ``rmdir``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Deletes an empty directory under `/documents/`.
+
+Mirrors POSIX `rmdir path`: refuses non-empty directories. Recursive
+deletion (`rm -r`) is intentionally NOT supported — clear contents with
+`rm` first.
+
+Args:
+- path: absolute or relative directory path. Cannot target the root,
+  `/documents`, the current cwd, or any ancestor of cwd (use `cd` to
+  move out first).
+
+Notes:
+- Emptiness is evaluated against the post-staged view, so a same-turn
+  `rm /a/x.md` followed by `rmdir /a` is fine.
+- If the directory was added in this same turn via `mkdir` and never
+  committed, the staged mkdir is dropped instead of issuing a delete.
+- The action is reversible via the per-action revert flow when action
+  logging is enabled.
+"""
+
+_DESKTOP_DESCRIPTION = """Deletes an empty directory from disk.
+
+Mirrors POSIX `rmdir path`: refuses non-empty directories. Recursive
+deletion is NOT supported. The deletion hits disk immediately and is
+NOT reversible via the agent's revert flow.
+
+Args:
+- path: absolute mount-prefixed directory path. Cannot target the mount
+  root or any directory containing files/subfolders.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/helpers.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/helpers.py
new file mode 100644
index 0000000000..de5afe722b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/helpers.py
@@ -0,0 +1,136 @@
+"""Cloud and desktop ``rmdir`` branches.
+
+Both branches receive an already-resolved + validated absolute path.
+"""
+
+from __future__ import annotations
+
+import posixpath
+from typing import TYPE_CHECKING
+
+from deepagents.backends.protocol import WriteResult
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+from app.agents.new_chat.state_reducers import _CLEAR
+
+from ...middleware.path_resolution import current_cwd
+from ...shared.paths import is_ancestor_of
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+async def cloud_rmdir(
+    mw: SurfSenseFilesystemMiddleware,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    validated: str,
+) -> Command | str:
+    """Stage an empty-folder delete in cloud mode (commit at end of turn)."""
+    if validated in ("/", DOCUMENTS_ROOT):
+        return f"Error: refusing to rmdir '{validated}'."
+    if not validated.startswith(DOCUMENTS_ROOT + "/"):
+        return (
+            "Error: cloud rmdir must target a path under /documents/ "
+            f"(got '{validated}')."
+        )
+
+    cwd = current_cwd(mw, runtime)
+    if validated == cwd or is_ancestor_of(validated, cwd):
+        return (
+            f"Error: cannot rmdir '{validated}' because the current "
+            "cwd is at or under it. cd out first."
+        )
+
+    staged_dirs = list(runtime.state.get("staged_dirs") or [])
+    pending_dir_deletes = list(runtime.state.get("pending_dir_deletes") or [])
+    if any(
+        isinstance(d, dict) and d.get("path") == validated for d in pending_dir_deletes
+    ):
+        return f"'{validated}' is already queued for deletion."
+
+    backend = mw._get_backend(runtime)
+
+    exists_in_staged = validated in staged_dirs
+    children: list = []
+    if isinstance(backend, KBPostgresBackend):
+        children = list(await backend.als_info(validated))
+
+    if isinstance(backend, KBPostgresBackend) and not children and not exists_in_staged:
+        loaded = await backend._load_file_data(validated)
+        if loaded is not None:
+            return f"Error: '{validated}' is a file. Use rm to delete files."
+        parent = posixpath.dirname(validated) or "/"
+        parent_listing = await backend.als_info(parent)
+        parent_has_dir = any(
+            info.get("path") == validated and info.get("is_dir")
+            for info in parent_listing
+        )
+        if not parent_has_dir:
+            return f"Error: directory '{validated}' not found."
+
+    if children:
+        return f"Error: directory '{validated}' is not empty. Remove contents first."
+
+    if exists_in_staged:
+        rest = [d for d in staged_dirs if d != validated]
+        return Command(
+            update={
+                "staged_dirs": [_CLEAR, *rest],
+                "staged_dir_tool_calls": {validated: None},
+                "messages": [
+                    ToolMessage(
+                        content=(f"Un-staged directory '{validated}'."),
+                        tool_call_id=runtime.tool_call_id,
+                    )
+                ],
+            }
+        )
+
+    return Command(
+        update={
+            "pending_dir_deletes": [
+                {
+                    "path": validated,
+                    "tool_call_id": runtime.tool_call_id,
+                }
+            ],
+            "messages": [
+                ToolMessage(
+                    content=(
+                        f"Staged rmdir of '{validated}' (will commit at end of turn)."
+                    ),
+                    tool_call_id=runtime.tool_call_id,
+                )
+            ],
+        }
+    )
+
+
+async def desktop_rmdir(
+    mw: SurfSenseFilesystemMiddleware,
+    runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    validated: str,
+) -> Command | str:
+    """Hit disk immediately in desktop mode."""
+    backend = mw._get_backend(runtime)
+    armdir = getattr(backend, "armdir", None)
+    if not callable(armdir):
+        return "Error: rmdir is not supported by the active backend."
+    res: WriteResult = await armdir(validated)
+    if res.error:
+        return res.error
+    return Command(
+        update={
+            "messages": [
+                ToolMessage(
+                    content=f"Deleted directory '{res.path or validated}'",
+                    tool_call_id=runtime.tool_call_id,
+                )
+            ],
+        }
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/index.py
new file mode 100644
index 0000000000..cdf0573533
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/rmdir/index.py
@@ -0,0 +1,61 @@
+"""``rmdir`` factory: resolve + validate the path, then dispatch to cloud / desktop."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated
+
+from deepagents.backends.utils import validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.mode import is_cloud
+from ...middleware.path_resolution import resolve_relative
+from .description import select_description
+from .helpers import cloud_rmdir, desktop_rmdir
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_rmdir_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_rmdir(
+        path: Annotated[
+            str,
+            "Absolute or relative path of the empty directory to delete.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        if not path or not path.strip():
+            return "Error: path is required."
+
+        target = resolve_relative(mw, path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        if is_cloud(mw._filesystem_mode):
+            return await cloud_rmdir(mw, runtime, validated)
+        return await desktop_rmdir(mw, runtime, validated)
+
+    def sync_rmdir(
+        path: Annotated[
+            str,
+            "Absolute or relative path of the empty directory to delete.",
+        ],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        return run_async_blocking(async_rmdir(path, runtime))
+
+    return StructuredTool.from_function(
+        name="rmdir",
+        description=description,
+        func=sync_rmdir,
+        coroutine=async_rmdir,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/__init__.py
new file mode 100644
index 0000000000..cf54fffe4c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/__init__.py
@@ -0,0 +1,7 @@
+"""Tool: ``write_file`` — create or overwrite a text file."""
+
+from __future__ import annotations
+
+from .index import create_write_file_tool
+
+__all__ = ["create_write_file_tool"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/description.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/description.py
new file mode 100644
index 0000000000..223cc3f265
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/description.py
@@ -0,0 +1,35 @@
+"""Mode-specific description strings for ``write_file``."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+_CLOUD_DESCRIPTION = """Writes a new text file to the workspace.
+
+Usage:
+- Files written under `/documents/<...>` are persisted as Documents at end
+  of turn.
+- Use a `temp_` filename prefix (e.g. `temp_plan.md` or `/documents/temp_x.md`)
+  for scratch/working files; they are automatically discarded at end of turn.
+- Writes outside `/documents/` are rejected unless the basename starts with
+  `temp_`.
+- Supported outputs include common LLM-friendly text formats like markdown,
+  json, yaml, csv, xml, html, css, sql, and code files.
+- Avoid placeholders; produce concrete and useful text.
+"""
+
+_DESKTOP_DESCRIPTION = """Writes a text file to disk.
+
+Usage:
+- Use mount-prefixed absolute paths like `/<mount>/sub/file.ext`.
+- Writes hit disk immediately. There is no end-of-turn staging.
+- Supported outputs include common LLM-friendly text formats like markdown,
+  json, yaml, csv, xml, html, css, sql, and code files.
+- Avoid placeholders; produce concrete and useful text.
+"""
+
+
+def select_description(mode: FilesystemMode) -> str:
+    if mode == FilesystemMode.CLOUD:
+        return _CLOUD_DESCRIPTION
+    return _DESKTOP_DESCRIPTION
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/index.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/index.py
new file mode 100644
index 0000000000..a42f7ed62f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/filesystem/tools/write_file/index.py
@@ -0,0 +1,83 @@
+"""``write_file`` factory: resolve target, enforce cloud namespace, dispatch to backend."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated, Any
+
+from deepagents.backends.protocol import WriteResult
+from deepagents.backends.utils import create_file_data, validate_path
+from langchain.tools import ToolRuntime
+from langchain_core.messages import ToolMessage
+from langchain_core.tools import BaseTool, StructuredTool
+from langgraph.types import Command
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+
+from ...middleware.async_dispatch import run_async_blocking
+from ...middleware.mode import is_cloud
+from ...middleware.namespace_policy import check_cloud_write_namespace
+from ...middleware.path_resolution import resolve_write_target_path
+from .description import select_description
+
+if TYPE_CHECKING:
+    from ...middleware import SurfSenseFilesystemMiddleware
+
+
+def create_write_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
+    description = select_description(mw._filesystem_mode)
+
+    async def async_write_file(
+        file_path: Annotated[
+            str,
+            "Absolute path where the file should be created. Relative paths resolve against the current cwd.",
+        ],
+        content: Annotated[str, "Text content to write to the file."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        target = resolve_write_target_path(mw, file_path, runtime)
+        try:
+            validated = validate_path(target)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        namespace_error = check_cloud_write_namespace(mw, validated, runtime)
+        if namespace_error:
+            return namespace_error
+
+        backend = mw._get_backend(runtime)
+        res: WriteResult = await backend.awrite(validated, content)
+        if res.error:
+            return res.error
+
+        path = res.path or validated
+        files_update = res.files_update or {path: create_file_data(content)}
+        update: dict[str, Any] = {
+            "files": files_update,
+            "messages": [
+                ToolMessage(
+                    content=f"Updated file {path}",
+                    tool_call_id=runtime.tool_call_id,
+                )
+            ],
+        }
+        if is_cloud(mw._filesystem_mode):
+            update["dirty_paths"] = [path]
+            update["dirty_path_tool_calls"] = {path: runtime.tool_call_id}
+        return Command(update=update)
+
+    def sync_write_file(
+        file_path: Annotated[
+            str,
+            "Absolute path where the file should be created. Relative paths resolve against the current cwd.",
+        ],
+        content: Annotated[str, "Text content to write to the file."],
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> Command | str:
+        return run_async_blocking(async_write_file(file_path, content, runtime))
+
+    return StructuredTool.from_function(
+        name="write_file",
+        description=description,
+        func=sync_write_file,
+        coroutine=async_write_file,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/flags.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/flags.py
new file mode 100644
index 0000000000..69994ae008
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/flags.py
@@ -0,0 +1,10 @@
+"""Single source of truth for the feature-flag predicate."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+
+
+def enabled(flags: AgentFeatureFlags, attr: str) -> bool:
+    """``flags.<attr>`` is on AND the new-agent-stack kill switch is off."""
+    return getattr(flags, attr) and not flags.disable_new_agent_stack
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py
new file mode 100644
index 0000000000..01c77bb844
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py
@@ -0,0 +1,54 @@
+"""Project ``workspace_tree_text`` + ``kb_priority`` from state into SystemMessages."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain.agents.middleware import AgentMiddleware, AgentState
+from langchain_core.messages import SystemMessage
+from langgraph.runtime import Runtime
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.knowledge_search import _render_priority_message
+
+
+class KbContextProjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Emit ``<workspace_tree>`` + ``<priority_documents>`` from shared state.
+
+    Read-only consumer: no DB, no LLM, no state writes. The orchestrator's
+    renderer middlewares populate the source fields; this projection lets any
+    agent (orchestrator or subagent) put the same content in front of its
+    own LLM call.
+    """
+
+    tools = ()
+    state_schema = SurfSenseFilesystemState
+
+    def before_agent(  # type: ignore[override]
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        del runtime
+        tree_text = state.get("workspace_tree_text")
+        priority = state.get("kb_priority")
+        if not tree_text and not priority:
+            return None
+
+        messages = list(state.get("messages") or [])
+        insert_at = max(len(messages) - 1, 0)
+        if tree_text:
+            messages.insert(insert_at, SystemMessage(content=tree_text))
+        if priority:
+            messages.insert(insert_at, _render_priority_message(priority))
+        return {"messages": messages}
+
+
+def build_kb_context_projection_mw() -> KbContextProjectionMiddleware:
+    return KbContextProjectionMiddleware()
+
+
+__all__ = [
+    "KbContextProjectionMiddleware",
+    "build_kb_context_projection_mw",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/memory.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/memory.py
new file mode 100644
index 0000000000..9316b3e21c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/memory.py
@@ -0,0 +1,19 @@
+"""User/team memory injection prepended to the conversation."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.middleware import MemoryInjectionMiddleware
+from app.db import ChatVisibility
+
+
+def build_memory_mw(
+    *,
+    user_id: str | None,
+    search_space_id: int,
+    visibility: ChatVisibility,
+) -> MemoryInjectionMiddleware:
+    return MemoryInjectionMiddleware(
+        user_id=user_id,
+        search_space_id=search_space_id,
+        thread_visibility=visibility,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/patch_tool_calls.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/patch_tool_calls.py
new file mode 100644
index 0000000000..50036dbbed
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/patch_tool_calls.py
@@ -0,0 +1,9 @@
+"""Repair dangling tool-call sequences before each agent turn."""
+
+from __future__ import annotations
+
+from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+
+
+def build_patch_tool_calls_mw() -> PatchToolCallsMiddleware:
+    return PatchToolCallsMiddleware()
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/__init__.py
new file mode 100644
index 0000000000..c25c2b2816
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/__init__.py
@@ -0,0 +1,11 @@
+"""Pattern-based allow/deny/ask middleware with HITL fallback (vertical slice).
+
+Public surface (one entry point only — every other symbol is an internal of
+the rule engine and stays inside ``middleware/``, ``ask/``, or ``deny.py``):
+
+- :func:`build_permission_mw` — construction recipe shared by every stack.
+"""
+
+from .middleware.factory import build_permission_mw
+
+__all__ = ["build_permission_mw"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/decision.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/decision.py
new file mode 100644
index 0000000000..f507e85ff5
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/decision.py
@@ -0,0 +1,74 @@
+"""Translate the unified langchain HITL envelope into permission-domain semantics.
+
+``PermissionMiddleware`` works with the canonical shape
+``{decision_type: "once" | "approve_always" | "reject", feedback?: str, edited_args?: dict}``.
+The wire envelope arriving from langgraph already lives in the LC HITL shape
+(parsed once in :mod:`hitl_wire.decision`); this module performs the small
+domain mapping (``approve|edit`` → ``once``, ``approve_always`` →
+``approve_always``, anything else → ``reject``) without re-implementing the
+envelope walk.
+
+Failing closed: any unrecognised decision becomes ``reject`` (with a warning)
+so the middleware never proceeds on ambiguous input.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.wire import (
+    LC_DECISION_APPROVE,
+    LC_DECISION_EDIT,
+    LC_DECISION_REJECT,
+    SURFSENSE_DECISION_APPROVE_ALWAYS,
+    parse_lc_envelope,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ``approve`` and ``edit`` both mean "let this call go through this once". The
+# legacy SurfSense bare-scalar values (``once`` / ``approve_always`` / ``reject``)
+# pass through unchanged so historical resume payloads still work.
+_LC_TO_PERMISSION: dict[str, str] = {
+    LC_DECISION_APPROVE: "once",
+    LC_DECISION_EDIT: "once",
+    SURFSENSE_DECISION_APPROVE_ALWAYS: "approve_always",
+    LC_DECISION_REJECT: "reject",
+    "once": "once",
+    "approve_always": "approve_always",
+    "reject": "reject",
+}
+
+
+def normalize_permission_decision(envelope: Any) -> dict[str, Any]:
+    """Project the user's reply into the canonical permission decision shape.
+
+    Args:
+        envelope: The raw resume value from langgraph (LC HITL envelope, a
+            bare scalar string, or a pre-canonical dict).
+
+    Returns:
+        ``{"decision_type": "once"|"approve_always"|"reject"}`` plus optional
+        ``feedback`` (``reject`` with a user message) and ``edited_args``
+        (``edit`` reply with non-empty arg overrides).
+    """
+    parsed = parse_lc_envelope(envelope)
+    mapped = _LC_TO_PERMISSION.get(parsed.decision_type)
+    if mapped is None:
+        logger.warning(
+            "Unknown permission decision %r; treating as reject",
+            parsed.decision_type,
+        )
+        mapped = "reject"
+
+    out: dict[str, Any] = {"decision_type": mapped}
+    if parsed.message:
+        out["feedback"] = parsed.message
+    if parsed.edited_args:
+        out["edited_args"] = parsed.edited_args
+    return out
+
+
+__all__ = ["normalize_permission_decision"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/edit/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/edit/__init__.py
new file mode 100644
index 0000000000..2921cbe700
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/edit/__init__.py
@@ -0,0 +1,10 @@
+"""Apply ``edit`` permission decisions to tool calls.
+
+Edited-arg extraction now lives in :mod:`hitl_wire.decision` (single parser
+for all approval paths); this module owns the merge step that produces a
+fresh tool-call dict for the orchestrator.
+"""
+
+from .merge import merge_edited_args
+
+__all__ = ["merge_edited_args"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/edit/merge.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/edit/merge.py
new file mode 100644
index 0000000000..21474ad529
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/edit/merge.py
@@ -0,0 +1,22 @@
+"""Apply edited args to a tool call (shallow merge, no mutation).
+
+Edited values override originals; keys absent from ``edited_args`` keep
+their original values, so partial edits are safe. Returns a NEW tool-call
+dict so the caller can swap it into ``AIMessage.tool_calls`` without
+aliasing the live message object.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def merge_edited_args(
+    tool_call: dict[str, Any], edited_args: dict[str, Any]
+) -> dict[str, Any]:
+    original_args = tool_call.get("args") or {}
+    merged_args = {**original_args, **edited_args}
+    return {**tool_call, "args": merged_args}
+
+
+__all__ = ["merge_edited_args"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/payload.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/payload.py
new file mode 100644
index 0000000000..6c5d011dfe
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/payload.py
@@ -0,0 +1,89 @@
+"""Build the permission-ask interrupt payload (LC HITL wire + SurfSense context)."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.wire import (
+    LC_DECISION_APPROVE,
+    LC_DECISION_EDIT,
+    LC_DECISION_REJECT,
+    SURFSENSE_DECISION_APPROVE_ALWAYS,
+    build_lc_hitl_payload,
+)
+from app.agents.new_chat.permissions import Rule
+
+PERMISSION_ASK_INTERRUPT_TYPE = "permission_ask"
+
+_BASE_PERMISSION_ASK_DECISIONS: list[str] = [
+    LC_DECISION_APPROVE,
+    LC_DECISION_REJECT,
+    LC_DECISION_EDIT,
+]
+
+
+def _is_mcp_tool(tool: BaseTool | None) -> bool:
+    """An MCP tool advertises a connector id in its langchain metadata."""
+    if tool is None:
+        return False
+    metadata = getattr(tool, "metadata", None) or {}
+    return metadata.get("mcp_connector_id") is not None
+
+
+def _card_fields_from_tool(tool: BaseTool | None) -> dict[str, Any]:
+    """Project the FE card's tool-scoped fields out of a BaseTool."""
+    if tool is None:
+        return {}
+    metadata = getattr(tool, "metadata", None) or {}
+    fields: dict[str, Any] = {}
+    connector_id = metadata.get("mcp_connector_id")
+    if connector_id is not None:
+        fields["mcp_connector_id"] = connector_id
+    connector_name = metadata.get("mcp_connector_name")
+    if connector_name:
+        fields["mcp_server"] = connector_name
+    if tool.description:
+        fields["tool_description"] = tool.description
+    return fields
+
+
+def build_permission_ask_payload(
+    *,
+    tool_name: str,
+    args: dict[str, Any],
+    patterns: list[str],
+    rules: list[Rule],
+    tool: BaseTool | None = None,
+) -> dict[str, Any]:
+    """Build the permission-ask interrupt payload.
+
+    ``approve_always`` is added to the palette only for MCP tools, since that
+    is the only case where the user's choice can persist beyond the current
+    agent instance (saved to the connector's trusted-tools list). Native
+    tools fall back to the once/reject/edit triad.
+    """
+    allowed_decisions = list(_BASE_PERMISSION_ASK_DECISIONS)
+    if _is_mcp_tool(tool):
+        allowed_decisions.append(SURFSENSE_DECISION_APPROVE_ALWAYS)
+
+    context: dict[str, Any] = {
+        "patterns": patterns,
+        "rules": [
+            {"permission": r.permission, "pattern": r.pattern, "action": r.action}
+            for r in rules
+        ],
+        "always": patterns,
+        **_card_fields_from_tool(tool),
+    }
+    return build_lc_hitl_payload(
+        tool_name=tool_name,
+        args=args,
+        allowed_decisions=allowed_decisions,
+        interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE,
+        context=context,
+    )
+
+
+__all__ = ["PERMISSION_ASK_INTERRUPT_TYPE", "build_permission_ask_payload"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py
new file mode 100644
index 0000000000..d61d38f345
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py
@@ -0,0 +1,59 @@
+"""Side-effectful entry point: pause the graph and return the permission decision.
+
+Wraps :func:`langgraph.types.interrupt` with the OTel spans the SurfSense
+dashboard expects, then projects the resume value through
+:func:`normalize_permission_decision` so the middleware downstream only
+sees the canonical permission-domain shape.
+
+When ``emit_interrupt`` is ``False`` the call short-circuits to ``reject``;
+this is used by non-interactive deployments where ``ask`` must not block.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+from langgraph.types import interrupt
+
+from app.agents.new_chat.permissions import Rule
+from app.observability import otel as ot
+
+from .decision import normalize_permission_decision
+from .payload import PERMISSION_ASK_INTERRUPT_TYPE, build_permission_ask_payload
+
+
+def request_permission_decision(
+    *,
+    tool_name: str,
+    args: dict[str, Any],
+    patterns: list[str],
+    rules: list[Rule],
+    emit_interrupt: bool,
+    tool: BaseTool | None = None,
+) -> dict[str, Any]:
+    """Pause for an ``ask`` decision; return the canonical permission decision dict."""
+    if not emit_interrupt:
+        return {"decision_type": "reject"}
+
+    payload = build_permission_ask_payload(
+        tool_name=tool_name,
+        args=args,
+        patterns=patterns,
+        rules=rules,
+        tool=tool,
+    )
+
+    with (
+        ot.permission_asked_span(
+            permission=tool_name,
+            pattern=patterns[0] if patterns else None,
+            extra={"permission.patterns": list(patterns)},
+        ),
+        ot.interrupt_span(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE),
+    ):
+        decision = interrupt(payload)
+    return normalize_permission_decision(decision)
+
+
+__all__ = ["request_permission_decision"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/deny.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/deny.py
new file mode 100644
index 0000000000..196c4040e2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/deny.py
@@ -0,0 +1,39 @@
+"""Synthesise a ``ToolMessage`` for a denied tool call.
+
+The denied call is replaced with this message so the model sees a typed
+``permission_denied`` error in ``ToolMessage.additional_kwargs["error"]``
+and can adjust its plan without retrying the same forbidden call.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.messages import ToolMessage
+
+from app.agents.new_chat.errors import StreamingError
+from app.agents.new_chat.permissions import Rule
+
+
+def build_deny_message(tool_call: dict[str, Any], rule: Rule) -> ToolMessage:
+    err = StreamingError(
+        code="permission_denied",
+        retryable=False,
+        suggestion=(
+            f"rule permission={rule.permission!r} pattern={rule.pattern!r} "
+            f"blocked this call"
+        ),
+    )
+    return ToolMessage(
+        content=(
+            f"Permission denied: rule {rule.permission}/{rule.pattern} "
+            f"blocked tool {tool_call.get('name')!r}."
+        ),
+        tool_call_id=tool_call.get("id") or "",
+        name=tool_call.get("name"),
+        status="error",
+        additional_kwargs={"error": err.model_dump()},
+    )
+
+
+__all__ = ["build_deny_message"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/__init__.py
new file mode 100644
index 0000000000..81cf9d7ca6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/__init__.py
@@ -0,0 +1,13 @@
+"""The orchestrator class plus its evaluation and ruleset-view helpers."""
+
+from .core import PermissionMiddleware
+from .evaluation import evaluate_tool_call, resolve_patterns
+from .ruleset_view import all_rulesets, globally_denied
+
+__all__ = [
+    "PermissionMiddleware",
+    "all_rulesets",
+    "evaluate_tool_call",
+    "globally_denied",
+    "resolve_patterns",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/core.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/core.py
new file mode 100644
index 0000000000..d2950c5b4b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/core.py
@@ -0,0 +1,225 @@
+"""``PermissionMiddleware`` — pattern-based allow/deny/ask with HITL fallback.
+
+LangChain's :class:`HumanInTheLoopMiddleware` only supports a static
+"this tool always asks" decision per tool. There's no rule-based
+allow/deny/ask, no glob patterns, no per-space/per-thread overrides, and
+no auto-deny synthesis.
+
+This middleware layers OpenCode's wildcard-ruleset model on top of the
+unified langchain HITL wire format (see :mod:`hitl_wire`), so it sits
+beside ``HumanInTheLoopMiddleware`` and self-gated approvals on a single
+parallel-HITL routing layer in ``task_tool`` + ``resume_routing``.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+)
+from langchain_core.messages import AIMessage, ToolMessage
+from langchain_core.tools import BaseTool
+from langgraph.runtime import Runtime
+
+from app.agents.new_chat.errors import CorrectedError, RejectedError
+from app.agents.new_chat.permissions import Ruleset
+from app.services.user_tool_allowlist import TrustedToolSaver
+
+from ..ask.edit import merge_edited_args
+from ..ask.request import request_permission_decision
+from ..deny import build_deny_message
+from .evaluation import evaluate_tool_call
+from .pattern_resolver import PatternResolver
+from .ruleset_view import all_rulesets
+from .runtime_promote import persist_always
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class _AlwaysPromotion:
+    """A pending request to save an ``approve_always`` decision to the user's trust list."""
+
+    connector_id: int
+    tool_name: str
+
+
+class PermissionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Allow/deny/ask layer over the agent's tool calls.
+
+    Args:
+        rulesets: Layered rulesets to evaluate (earliest-to-latest wins).
+            Typical layering: ``defaults < global < space < thread < runtime_approved``.
+        pattern_resolvers: Optional per-tool callables that map ``args``
+            to wildcard patterns. Tools without an entry use the bare
+            tool name as the only pattern.
+        runtime_ruleset: Mutable :class:`Ruleset` extended in-place when
+            the user replies ``"approve_always"``. Reused across calls in
+            the same agent instance so newly-allowed rules apply downstream.
+        always_emit_interrupt_payload: Set ``False`` to make ``ask``
+            collapse to ``deny`` (for non-interactive deployments).
+        tools_by_name: Map from tool name to :class:`BaseTool`, used to
+            decorate ``ask`` interrupts with the tool's description and
+            MCP metadata for the FE card.
+        trusted_tool_saver: Async callback invoked on ``approve_always``
+            decisions for MCP tools (those whose ``metadata`` carries an
+            ``mcp_connector_id``). Without it the promotion only lives
+            in-memory for the current agent instance.
+    """
+
+    tools = ()
+
+    def __init__(
+        self,
+        *,
+        rulesets: list[Ruleset] | None = None,
+        pattern_resolvers: dict[str, PatternResolver] | None = None,
+        runtime_ruleset: Ruleset | None = None,
+        always_emit_interrupt_payload: bool = True,
+        tools_by_name: dict[str, BaseTool] | None = None,
+        trusted_tool_saver: TrustedToolSaver | None = None,
+    ) -> None:
+        super().__init__()
+        self._static_rulesets: list[Ruleset] = list(rulesets or [])
+        self._pattern_resolvers: dict[str, PatternResolver] = dict(
+            pattern_resolvers or {}
+        )
+        self._runtime_ruleset: Ruleset = runtime_ruleset or Ruleset(
+            origin="runtime_approved"
+        )
+        self._emit_interrupt = always_emit_interrupt_payload
+        self._tools_by_name: dict[str, BaseTool] = dict(tools_by_name or {})
+        self._trusted_tool_saver: TrustedToolSaver | None = trusted_tool_saver
+
+    def _process(
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> tuple[dict[str, Any] | None, list[_AlwaysPromotion]]:
+        """Pure decision pass: returns ``(state_update, pending_promotions)``.
+
+        Side effects performed here are in-memory only (rule promotion
+        into ``runtime_ruleset``). DB writes for ``approve_always``
+        decisions are queued as ``_AlwaysPromotion`` and flushed by the
+        async hook.
+        """
+        del runtime
+        messages = state.get("messages") or []
+        if not messages:
+            return None, []
+        last = messages[-1]
+        if not isinstance(last, AIMessage) or not last.tool_calls:
+            return None, []
+
+        rulesets = all_rulesets(self._static_rulesets, self._runtime_ruleset)
+        deny_messages: list[ToolMessage] = []
+        kept_calls: list[dict[str, Any]] = []
+        promotions: list[_AlwaysPromotion] = []
+        any_change = False
+
+        for raw in last.tool_calls:
+            call = (
+                dict(raw)
+                if isinstance(raw, dict)
+                else {
+                    "name": getattr(raw, "name", None),
+                    "args": getattr(raw, "args", {}),
+                    "id": getattr(raw, "id", None),
+                    "type": "tool_call",
+                }
+            )
+            name = call.get("name") or ""
+            args = call.get("args") or {}
+            action, patterns, rules = evaluate_tool_call(
+                name, args, self._pattern_resolvers, rulesets
+            )
+
+            if action == "deny":
+                deny_rule = next((r for r in rules if r.action == "deny"), rules[0])
+                deny_messages.append(build_deny_message(call, deny_rule))
+                any_change = True
+                continue
+
+            if action == "ask":
+                decision = request_permission_decision(
+                    tool_name=name,
+                    args=args,
+                    patterns=patterns,
+                    rules=rules,
+                    emit_interrupt=self._emit_interrupt,
+                    tool=self._tools_by_name.get(name),
+                )
+                kind = str(decision.get("decision_type") or "reject").lower()
+                edited_args = decision.get("edited_args")
+                if kind in ("once", "approve_always"):
+                    final_call = (
+                        merge_edited_args(call, edited_args)
+                        if isinstance(edited_args, dict) and edited_args
+                        else call
+                    )
+                    if final_call is not call:
+                        any_change = True
+                    if kind == "approve_always":
+                        persist_always(self._runtime_ruleset, name, patterns)
+                        promotion = self._build_always_promotion(name)
+                        if promotion is not None:
+                            promotions.append(promotion)
+                    kept_calls.append(final_call)
+                elif kind == "reject":
+                    feedback = decision.get("feedback")
+                    if isinstance(feedback, str) and feedback.strip():
+                        raise CorrectedError(feedback, tool=name)
+                    raise RejectedError(
+                        tool=name, pattern=patterns[0] if patterns else None
+                    )
+                else:
+                    logger.warning(
+                        "Unknown permission decision %r; treating as reject", kind
+                    )
+                    raise RejectedError(tool=name)
+                continue
+
+            kept_calls.append(call)
+
+        if not any_change and len(kept_calls) == len(last.tool_calls):
+            return None, promotions
+
+        updated = last.model_copy(update={"tool_calls": kept_calls})
+        result_messages: list[Any] = [updated]
+        if deny_messages:
+            result_messages.extend(deny_messages)
+        return {"messages": result_messages}, promotions
+
+    def _build_always_promotion(self, tool_name: str) -> _AlwaysPromotion | None:
+        """Return a save request iff the tool exposes an ``mcp_connector_id``."""
+        tool = self._tools_by_name.get(tool_name)
+        metadata = getattr(tool, "metadata", None) or {}
+        connector_id = metadata.get("mcp_connector_id")
+        if not isinstance(connector_id, int):
+            return None
+        return _AlwaysPromotion(connector_id=connector_id, tool_name=tool_name)
+
+    def after_model(  # type: ignore[override]
+        self, state: AgentState, runtime: Runtime[ContextT]
+    ) -> dict[str, Any] | None:
+        update, _ = self._process(state, runtime)
+        return update
+
+    async def aafter_model(  # type: ignore[override]
+        self, state: AgentState, runtime: Runtime[ContextT]
+    ) -> dict[str, Any] | None:
+        update, promotions = self._process(state, runtime)
+        if self._trusted_tool_saver is not None:
+            for promotion in promotions:
+                await self._trusted_tool_saver(
+                    promotion.connector_id, promotion.tool_name
+                )
+        return update
+
+
+__all__ = ["PermissionMiddleware"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/evaluation.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/evaluation.py
new file mode 100644
index 0000000000..51531c4eb8
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/evaluation.py
@@ -0,0 +1,60 @@
+"""Resolve patterns for a tool call and aggregate the resulting rules.
+
+Two stages run on every tool call:
+
+1. :func:`resolve_patterns` asks the tool's resolver (or the default) for
+   the wildcard patterns the rule engine should evaluate. Resolver
+   failures fall back to the bare tool name so a buggy resolver can't
+   cascade into permission decisions.
+2. :func:`evaluate_tool_call` runs the rule engine against those patterns
+   and collapses the per-pattern rules into a single action
+   (``deny`` > ``ask`` > ``allow``).
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from app.agents.new_chat.permissions import (
+    Rule,
+    RuleAction,
+    Ruleset,
+    aggregate_action,
+    evaluate_many,
+)
+
+from .pattern_resolver import PatternResolver, default_pattern_resolver
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_patterns(
+    tool_name: str,
+    args: dict[str, Any],
+    pattern_resolvers: dict[str, PatternResolver],
+) -> list[str]:
+    resolver = pattern_resolvers.get(tool_name, default_pattern_resolver(tool_name))
+    try:
+        patterns = resolver(args or {})
+    except Exception:
+        logger.exception("Pattern resolver for %s raised; using bare name", tool_name)
+        patterns = [tool_name]
+    if not patterns:
+        patterns = [tool_name]
+    return patterns
+
+
+def evaluate_tool_call(
+    tool_name: str,
+    args: dict[str, Any],
+    pattern_resolvers: dict[str, PatternResolver],
+    rulesets: list[Ruleset],
+) -> tuple[RuleAction, list[str], list[Rule]]:
+    patterns = resolve_patterns(tool_name, args, pattern_resolvers)
+    rules = evaluate_many(tool_name, patterns, *rulesets)
+    action = aggregate_action(rules)
+    return action, patterns, rules
+
+
+__all__ = ["evaluate_tool_call", "resolve_patterns"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/factory.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/factory.py
new file mode 100644
index 0000000000..3c061ded62
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/factory.py
@@ -0,0 +1,88 @@
+"""Construction recipe for :class:`PermissionMiddleware` shared across stacks.
+
+Single source of truth used by both the main-agent stack and every subagent
+stack. Rule layers are evaluated earliest-to-latest (last match wins,
+matching OpenCode's ``permission/index.ts`` evaluation order):
+
+1. ``surfsense_defaults`` — single ``allow */*`` rule. Connector tools
+   already self-gate via :func:`request_approval`, so the rule engine only
+   needs to *deny* what the user has explicitly forbidden; the default
+   ``ask`` fallback would otherwise double-prompt every safe read-only
+   call.
+2. ``subagent_rulesets`` — caller-supplied rulesets contributed by the
+   consuming subagent. Each subagent passes its coded rules (KB:
+   destructive-FS ``ask`` rules; connectors: per-tool ``allow``/``ask``)
+   plus, when present, the user's persisted allow-list for that subagent.
+
+Connector deny synthesis from ``new_chat._synthesize_connector_deny_rules``
+is intentionally NOT replicated: the multi-agent orchestrator already
+excludes entire subagents whose required connectors are missing
+(``SUBAGENT_TO_REQUIRED_CONNECTOR_MAP``), so the per-tool deny pass is
+redundant here.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.permissions import Rule, Ruleset
+from app.services.user_tool_allowlist import TrustedToolSaver
+
+from .core import PermissionMiddleware
+
+_SURFSENSE_DEFAULTS = Ruleset(
+    rules=[Rule(permission="*", pattern="*", action="allow")],
+    origin="surfsense_defaults",
+)
+
+
+def build_permission_mw(
+    *,
+    flags: AgentFeatureFlags,
+    subagent_rulesets: list[Ruleset] | None = None,
+    tools: Sequence[BaseTool] | None = None,
+    trusted_tool_saver: TrustedToolSaver | None = None,
+) -> PermissionMiddleware | None:
+    """Return a configured :class:`PermissionMiddleware` or ``None`` when no work is needed.
+
+    Args:
+        flags: Feature toggles. ``enable_permission`` switches the engine on;
+            ``disable_new_agent_stack`` overrides everything for safety.
+        subagent_rulesets: Caller-supplied rulesets layered after the
+            defaults. Subagents pass their own coded ruleset here (and,
+            when present, the user's persisted allow-list for that
+            subagent) so each subagent owns its own rule surface without
+            aliasing a shared engine. Presence of any subagent ruleset
+            forces the middleware on regardless of ``enable_permission`` —
+            an explicit ``ask`` rule always asks.
+        tools: Subagent tools used to decorate ``ask`` interrupts with
+            FE-card metadata (description, MCP connector). Optional.
+        trusted_tool_saver: Async callback invoked when an MCP tool's
+            ``always`` decision lands; persists the user's preference to
+            ``connector.config['trusted_tools']``. Optional.
+
+    Returns:
+        ``None`` when the engine has no rules to enforce
+        (``enable_permission=False`` and no subagent rulesets); a
+        configured middleware otherwise.
+    """
+    permission_enabled = flags.enable_permission and not flags.disable_new_agent_stack
+    has_subagent_rulesets = bool(subagent_rulesets)
+    if not (permission_enabled or has_subagent_rulesets):
+        return None
+
+    rulesets: list[Ruleset] = [_SURFSENSE_DEFAULTS]
+    if subagent_rulesets:
+        rulesets.extend(subagent_rulesets)
+    tools_by_name = {t.name: t for t in (tools or [])}
+    return PermissionMiddleware(
+        rulesets=rulesets,
+        tools_by_name=tools_by_name,
+        trusted_tool_saver=trusted_tool_saver,
+    )
+
+
+__all__ = ["build_permission_mw"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/pattern_resolver.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/pattern_resolver.py
new file mode 100644
index 0000000000..32da90302e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/pattern_resolver.py
@@ -0,0 +1,28 @@
+"""Per-tool pattern resolution.
+
+A :data:`PatternResolver` turns a tool's ``args`` dict into a list of
+wildcard patterns evaluated against the layered rulesets. The first
+pattern is conventionally the bare tool name (catch-all); later entries
+narrow down to specific resources (file paths, ids, etc.).
+
+Tools without a custom resolver fall back to :func:`default_pattern_resolver`,
+which yields only the bare tool name.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any
+
+PatternResolver = Callable[[dict[str, Any]], list[str]]
+
+
+def default_pattern_resolver(name: str) -> PatternResolver:
+    def _resolve(args: dict[str, Any]) -> list[str]:
+        del args
+        return [name]
+
+    return _resolve
+
+
+__all__ = ["PatternResolver", "default_pattern_resolver"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/ruleset_view.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/ruleset_view.py
new file mode 100644
index 0000000000..fbb66d4555
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/ruleset_view.py
@@ -0,0 +1,27 @@
+"""Combined view over static + runtime rulesets.
+
+Static rulesets come from the agent factory (defaults, space-scoped,
+thread-scoped, etc.). The runtime ruleset is the in-memory one that
+:func:`runtime_promote.persist_always` extends when the user replies
+``"approve_always"``. Evaluators always see them merged in this order so
+newly-promoted rules apply to subsequent calls.
+"""
+
+from __future__ import annotations
+
+from app.agents.new_chat.permissions import Ruleset, aggregate_action, evaluate_many
+
+
+def all_rulesets(
+    static_rulesets: list[Ruleset], runtime_ruleset: Ruleset
+) -> list[Ruleset]:
+    return [*static_rulesets, runtime_ruleset]
+
+
+def globally_denied(tool_name: str, rulesets: list[Ruleset]) -> bool:
+    """True if an unconditional deny rule blocks every invocation of ``tool_name``."""
+    rules = evaluate_many(tool_name, ["*"], *rulesets)
+    return aggregate_action(rules) == "deny"
+
+
+__all__ = ["all_rulesets", "globally_denied"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/runtime_promote.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/runtime_promote.py
new file mode 100644
index 0000000000..afc65fdc04
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/middleware/runtime_promote.py
@@ -0,0 +1,22 @@
+"""Promote an ``"approve_always"`` reply into in-memory allow rules.
+
+Subsequent calls within the same agent instance match these new rules and
+proceed without prompting. Durable persistence (to ``agent_permission_rules``)
+is the streaming layer's job — this module keeps the in-memory copy only.
+"""
+
+from __future__ import annotations
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+
+def persist_always(
+    runtime_ruleset: Ruleset, tool_name: str, patterns: list[str]
+) -> None:
+    for pattern in patterns:
+        runtime_ruleset.rules.append(
+            Rule(permission=tool_name, pattern=pattern, action="allow")
+        )
+
+
+__all__ = ["persist_always"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/__init__.py
new file mode 100644
index 0000000000..9f26ffe498
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/__init__.py
@@ -0,0 +1,7 @@
+"""Resilience middleware shared as the same instances across parent / registry."""
+
+from __future__ import annotations
+
+from .bundle import ResilienceMiddlewares, build_resilience_middlewares
+
+__all__ = ["ResilienceMiddlewares", "build_resilience_middlewares"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/bundle.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/bundle.py
new file mode 100644
index 0000000000..1112447846
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/bundle.py
@@ -0,0 +1,53 @@
+"""Construct each resilience middleware once; same instances flow into every consumer."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from langchain.agents.middleware import (
+    ModelCallLimitMiddleware,
+    ToolCallLimitMiddleware,
+)
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import RetryAfterMiddleware
+from app.agents.new_chat.middleware.scoped_model_fallback import (
+    ScopedModelFallbackMiddleware,
+)
+
+from .fallback import build_fallback_mw
+from .model_call_limit import build_model_call_limit_mw
+from .retry import build_retry_mw
+from .tool_call_limit import build_tool_call_limit_mw
+
+
+@dataclass(frozen=True)
+class ResilienceMiddlewares:
+    """The four resilience middleware instances, any of which may be ``None`` when disabled by flags."""
+
+    retry: RetryAfterMiddleware | None
+    fallback: ScopedModelFallbackMiddleware | None
+    model_call_limit: ModelCallLimitMiddleware | None
+    tool_call_limit: ToolCallLimitMiddleware | None
+
+    def as_list(self) -> list[Any]:
+        return [
+            m
+            for m in (
+                self.retry,
+                self.fallback,
+                self.model_call_limit,
+                self.tool_call_limit,
+            )
+            if m is not None
+        ]
+
+
+def build_resilience_middlewares(flags: AgentFeatureFlags) -> ResilienceMiddlewares:
+    return ResilienceMiddlewares(
+        retry=build_retry_mw(flags),
+        fallback=build_fallback_mw(flags),
+        model_call_limit=build_model_call_limit_mw(flags),
+        tool_call_limit=build_tool_call_limit_mw(flags),
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/fallback.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/fallback.py
new file mode 100644
index 0000000000..ea68a764eb
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/fallback.py
@@ -0,0 +1,27 @@
+"""Switch to a fallback model on provider/network errors only."""
+
+from __future__ import annotations
+
+import logging
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware.scoped_model_fallback import (
+    ScopedModelFallbackMiddleware,
+)
+
+from ..flags import enabled
+
+
+def build_fallback_mw(
+    flags: AgentFeatureFlags,
+) -> ScopedModelFallbackMiddleware | None:
+    if not enabled(flags, "enable_model_fallback"):
+        return None
+    try:
+        return ScopedModelFallbackMiddleware(
+            "openai:gpt-4o-mini",
+            "anthropic:claude-3-5-haiku-20241022",
+        )
+    except Exception:
+        logging.warning("ScopedModelFallbackMiddleware init failed; skipping.")
+        return None
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/model_call_limit.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/model_call_limit.py
new file mode 100644
index 0000000000..85707a385e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/model_call_limit.py
@@ -0,0 +1,21 @@
+"""Cap model calls per thread / per run to prevent runaway cost."""
+
+from __future__ import annotations
+
+from langchain.agents.middleware import ModelCallLimitMiddleware
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+
+from ..flags import enabled
+
+
+def build_model_call_limit_mw(
+    flags: AgentFeatureFlags,
+) -> ModelCallLimitMiddleware | None:
+    if not enabled(flags, "enable_model_call_limit"):
+        return None
+    return ModelCallLimitMiddleware(
+        thread_limit=120,
+        run_limit=80,
+        exit_behavior="end",
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/retry.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/retry.py
new file mode 100644
index 0000000000..c98fc40839
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/retry.py
@@ -0,0 +1,16 @@
+"""Retry on transient model errors (e.g. Retry-After-bearing 429s)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware import RetryAfterMiddleware
+
+from ..flags import enabled
+
+
+def build_retry_mw(flags: AgentFeatureFlags) -> RetryAfterMiddleware | None:
+    return (
+        RetryAfterMiddleware(max_retries=3)
+        if enabled(flags, "enable_retry_after")
+        else None
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/tool_call_limit.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/tool_call_limit.py
new file mode 100644
index 0000000000..dcde81f373
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/resilience/tool_call_limit.py
@@ -0,0 +1,21 @@
+"""Cap tool calls per thread / per run to bound infinite-loop blast radius."""
+
+from __future__ import annotations
+
+from langchain.agents.middleware import ToolCallLimitMiddleware
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+
+from ..flags import enabled
+
+
+def build_tool_call_limit_mw(
+    flags: AgentFeatureFlags,
+) -> ToolCallLimitMiddleware | None:
+    if not enabled(flags, "enable_tool_call_limit"):
+        return None
+    return ToolCallLimitMiddleware(
+        thread_limit=300,
+        run_limit=80,
+        exit_behavior="continue",
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/todos.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/todos.py
new file mode 100644
index 0000000000..dac1496275
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/todos.py
@@ -0,0 +1,12 @@
+"""Todo-list middleware (each consumer needs its own instance)."""
+
+from __future__ import annotations
+
+from langchain.agents.middleware import TodoListMiddleware
+
+
+def build_todos_mw(*, system_prompt: str | None = None) -> TodoListMiddleware:
+    """Pass ``system_prompt=""`` to suppress the upstream prompt append. We use a custom system prompt in the main agent."""
+    if system_prompt is None:
+        return TodoListMiddleware()
+    return TodoListMiddleware(system_prompt=system_prompt)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/stack.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/stack.py
new file mode 100644
index 0000000000..c1ebe31caf
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/stack.py
@@ -0,0 +1,209 @@
+"""Main-agent middleware list assembly: one line per slot.
+
+The main agent is a pure router — filesystem reads/writes are owned by the
+``knowledge_base`` subagent and delegated via the ``task`` tool. The stack
+here only renders KB context (workspace tree + priority docs), projects it
+into system messages, and commits any subagent-side staged writes at end of
+turn (cloud mode).
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Sequence
+from typing import Any
+
+from deepagents import SubAgent
+from deepagents.backends import StateBackend
+from langchain.agents import create_agent
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+from langgraph.types import Checkpointer
+
+from app.agents.multi_agent_chat.subagents import (
+    build_subagents,
+    get_subagents_to_exclude,
+)
+from app.agents.multi_agent_chat.subagents.builtins.knowledge_base.agent import (
+    READONLY_NAME as KB_READONLY_NAME,
+    build_readonly_subagent as build_kb_readonly_subagent,
+)
+from app.agents.multi_agent_chat.subagents.builtins.knowledge_base.ask_knowledge_base_tool import (
+    build_ask_knowledge_base_tool,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.db import ChatVisibility
+
+from .main_agent.action_log import build_action_log_mw
+from .main_agent.anonymous_doc import build_anonymous_doc_mw
+from .main_agent.busy_mutex import build_busy_mutex_mw
+from .main_agent.checkpointed_subagent_middleware import (
+    SurfSenseCheckpointedSubAgentMiddleware,
+)
+from .main_agent.checkpointed_subagent_middleware.task_description import (
+    TASK_TOOL_DESCRIPTION,
+)
+from .main_agent.context_editing import build_context_editing_mw
+from .main_agent.dedup_hitl import build_dedup_hitl_mw
+from .main_agent.doom_loop import build_doom_loop_mw
+from .main_agent.kb_persistence import build_kb_persistence_mw
+from .main_agent.knowledge_priority import build_knowledge_priority_mw
+from .main_agent.knowledge_tree import build_knowledge_tree_mw
+from .main_agent.noop_injection import build_noop_injection_mw
+from .main_agent.otel import build_otel_mw
+from .main_agent.plugins import build_plugin_middlewares
+from .main_agent.repair import build_repair_mw
+from .main_agent.skills import build_skills_mw
+from .shared.anthropic_cache import build_anthropic_cache_mw
+from .shared.compaction import build_compaction_mw
+from .shared.kb_context_projection import build_kb_context_projection_mw
+from .shared.memory import build_memory_mw
+from .shared.patch_tool_calls import build_patch_tool_calls_mw
+from .shared.permissions import build_permission_mw
+from .shared.resilience import build_resilience_middlewares
+from .shared.todos import build_todos_mw
+from .subagent.middleware_stack import build_subagent_middleware_stack
+
+
+def build_main_agent_deepagent_middleware(
+    *,
+    llm: BaseChatModel,
+    tools: Sequence[BaseTool],
+    backend_resolver: Any,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+    user_id: str | None,
+    thread_id: int | None,
+    visibility: ChatVisibility,
+    anon_session_id: str | None,
+    available_connectors: list[str] | None,
+    available_document_types: list[str] | None,
+    mentioned_document_ids: list[int] | None,
+    max_input_tokens: int | None,
+    flags: AgentFeatureFlags,
+    subagent_dependencies: dict[str, Any],
+    checkpointer: Checkpointer,
+    mcp_tools_by_agent: dict[str, list[BaseTool]] | None = None,
+    disabled_tools: list[str] | None = None,
+) -> list[Any]:
+    """Ordered middleware for ``create_agent`` (None entries already stripped)."""
+    resilience = build_resilience_middlewares(flags)
+
+    memory_mw = build_memory_mw(
+        user_id=user_id,
+        search_space_id=search_space_id,
+        visibility=visibility,
+    )
+
+    subagent_dependencies = {
+        **subagent_dependencies,
+        "backend_resolver": backend_resolver,
+        "filesystem_mode": filesystem_mode,
+        "flags": flags,
+    }
+    shared_subagent_middleware = build_subagent_middleware_stack(
+        resilience=resilience,
+        flags=flags,
+    )
+
+    kb_readonly = build_kb_readonly_subagent(
+        dependencies=subagent_dependencies,
+        model=llm,
+        middleware_stack=shared_subagent_middleware,
+    )
+    kb_readonly_spec = kb_readonly.spec
+    kb_readonly_runnable = create_agent(
+        llm,
+        system_prompt=kb_readonly_spec["system_prompt"],
+        tools=kb_readonly_spec["tools"],
+        middleware=kb_readonly_spec["middleware"],
+        name=KB_READONLY_NAME,
+        checkpointer=checkpointer,
+    )
+    ask_kb_tool = build_ask_knowledge_base_tool(kb_readonly_runnable)
+
+    subagents: list[SubAgent] = build_subagents(
+        dependencies=subagent_dependencies,
+        model=llm,
+        middleware_stack=shared_subagent_middleware,
+        mcp_tools_by_agent=mcp_tools_by_agent or {},
+        exclude=get_subagents_to_exclude(available_connectors),
+        disabled_tools=disabled_tools,
+        ask_kb_tool=ask_kb_tool,
+    )
+    logging.debug("Subagents registry: %s", [s["name"] for s in subagents])
+
+    stack: list[Any] = [
+        build_busy_mutex_mw(flags),
+        build_otel_mw(flags),
+        build_todos_mw(system_prompt=""),
+        memory_mw,
+        build_anonymous_doc_mw(
+            filesystem_mode=filesystem_mode, anon_session_id=anon_session_id
+        ),
+        build_knowledge_tree_mw(
+            filesystem_mode=filesystem_mode,
+            search_space_id=search_space_id,
+            llm=llm,
+        ),
+        build_knowledge_priority_mw(
+            llm=llm,
+            search_space_id=search_space_id,
+            filesystem_mode=filesystem_mode,
+            available_connectors=available_connectors,
+            available_document_types=available_document_types,
+            mentioned_document_ids=mentioned_document_ids,
+        ),
+        build_kb_context_projection_mw(),
+        build_kb_persistence_mw(
+            filesystem_mode=filesystem_mode,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            thread_id=thread_id,
+        ),
+        build_skills_mw(
+            flags=flags,
+            filesystem_mode=filesystem_mode,
+            search_space_id=search_space_id,
+        ),
+        SurfSenseCheckpointedSubAgentMiddleware(
+            checkpointer=checkpointer,
+            backend=StateBackend,
+            subagents=subagents,
+            system_prompt=None,
+            task_description=TASK_TOOL_DESCRIPTION,
+        ),
+        resilience.model_call_limit,
+        resilience.tool_call_limit,
+        build_context_editing_mw(
+            flags=flags,
+            max_input_tokens=max_input_tokens,
+            tools=tools,
+            backend_resolver=backend_resolver,
+        ),
+        build_compaction_mw(llm),
+        build_noop_injection_mw(flags),
+        resilience.retry,
+        resilience.fallback,
+        build_repair_mw(flags=flags, tools=tools),
+        build_permission_mw(flags=flags),
+        build_doom_loop_mw(flags),
+        build_action_log_mw(
+            flags=flags,
+            thread_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+        ),
+        build_patch_tool_calls_mw(),
+        build_dedup_hitl_mw(tools),
+        *build_plugin_middlewares(
+            flags=flags,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            visibility=visibility,
+            llm=llm,
+        ),
+        build_anthropic_cache_mw(),
+    ]
+    return [m for m in stack if m is not None]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/subagent/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/subagent/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/subagent/middleware_stack.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/subagent/middleware_stack.py
new file mode 100644
index 0000000000..aa6211fcc2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/subagent/middleware_stack.py
@@ -0,0 +1,49 @@
+"""Shared middleware stack threaded into every subagent.
+
+Mirrors ``middleware/stack.py`` (the orchestrator's middleware stack) but
+exposes its contents as a dict keyed by purpose so specialists can pick
+the entries they need and decide ordering. The default consumer
+(:func:`pack_subagent`) prepends every non-``None`` value in insertion
+order, so ``None`` slots are silently skipped.
+
+Registry subagents never touch the SurfSense filesystem — that capability
+belongs to ``knowledge_base`` — so no FS middleware is exposed here.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+
+from ..shared.permissions import build_permission_mw
+from ..shared.resilience import ResilienceMiddlewares
+from ..shared.todos import build_todos_mw
+
+
+def build_subagent_middleware_stack(
+    *,
+    resilience: ResilienceMiddlewares,
+    flags: AgentFeatureFlags | None = None,
+) -> dict[str, Any]:
+    """Assemble the dict of middlewares prepended to every subagent's stack.
+
+    Args:
+        resilience: Pre-built retry / fallback / call-limit middlewares
+            (shared with the orchestrator stack to keep behaviour symmetric).
+        flags: Feature flags driving optional layers. ``None`` disables the
+            permission layer (used in tests that only need todos+resilience).
+
+    Returns:
+        Insertion-ordered dict; ``None`` values are tolerated and dropped by
+        the consumer so callers can flip slots on/off without reshaping.
+    """
+    permission = build_permission_mw(flags=flags) if flags is not None else None
+    return {
+        "todos": build_todos_mw(),
+        "permission": permission,
+        "retry": resilience.retry,
+        "fallback": resilience.fallback,
+        "model_call_limit": resilience.model_call_limit,
+        "tool_call_limit": resilience.tool_call_limit,
+    }
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/__init__.py
new file mode 100644
index 0000000000..ca9e4aa3e5
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/__init__.py
@@ -0,0 +1,19 @@
+"""Registry-backed subagent builders and helpers."""
+
+from __future__ import annotations
+
+from .registry import (
+    SUBAGENT_BUILDERS_BY_NAME,
+    SubagentBuilder,
+    build_subagents,
+    get_subagents_to_exclude,
+    main_prompt_registry_subagent_lines,
+)
+
+__all__ = [
+    "SUBAGENT_BUILDERS_BY_NAME",
+    "SubagentBuilder",
+    "build_subagents",
+    "get_subagents_to_exclude",
+    "main_prompt_registry_subagent_lines",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/agent.py
new file mode 100644
index 0000000000..396e0ec797
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/agent.py
@@ -0,0 +1,44 @@
+"""``deliverables`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles deliverables tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/description.md
new file mode 100644
index 0000000000..fcebdcaf02
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/description.md
@@ -0,0 +1,2 @@
+Specialist for producing long-form deliverables: reports, podcasts, video presentations, resumes, and generated images.
+Use proactively when the user wants one of these artifacts produced.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/system_prompt.md
new file mode 100644
index 0000000000..c44f131bbc
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/system_prompt.md
@@ -0,0 +1,55 @@
+You are the SurfSense deliverables operations sub-agent.
+You receive delegated instructions from a supervisor agent and return structured results for supervisor synthesis.
+
+<goal>
+Produce **deliverables**: shareable **artifacts** the user keeps (reports, slide-style video presentations, podcasts, resumes, images). Use explicit constraints and reliable proof of what was generated.
+</goal>
+
+<available_tools>
+- `generate_report`
+- `generate_podcast`
+- `generate_video_presentation`
+- `generate_resume`
+- `generate_image`
+</available_tools>
+
+<tool_policy>
+- Use only tools in `<available_tools>`.
+- Require essential generation constraints (audience, format, tone, core content).
+- If critical constraints are missing, return `status=blocked` with `missing_fields`.
+- Never claim artifact generation success without tool confirmation.
+</tool_policy>
+
+<out_of_scope>
+- Do not perform connector data mutations unrelated to artifact generation.
+</out_of_scope>
+
+<safety>
+- Avoid generating artifacts with missing critical constraints.
+- Prefer one complete artifact over partial multi-artifact output.
+</safety>
+
+<failure_policy>
+- On generation failure, return `status=error` with best retry guidance.
+- On missing constraints, return `status=blocked` with required fields.
+</failure_policy>
+
+<output_contract>
+Return **only** one JSON object (no markdown/prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "artifact_type": "report" | "podcast" | "video_presentation" | "resume" | "image" | null,
+    "artifact_id": string | null,
+    "artifact_location": string | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` -> `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` -> `next_step` must be non-null.
+- `status=blocked` due to missing required inputs -> `missing_fields` must be non-null.
+</output_contract>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/__init__.py
new file mode 100644
index 0000000000..d0fe942177
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/__init__.py
@@ -0,0 +1,15 @@
+"""Deliverable generators: reports, podcasts, video decks, resumes, images."""
+
+from .generate_image import create_generate_image_tool
+from .podcast import create_generate_podcast_tool
+from .report import create_generate_report_tool
+from .resume import create_generate_resume_tool
+from .video_presentation import create_generate_video_presentation_tool
+
+__all__ = [
+    "create_generate_image_tool",
+    "create_generate_podcast_tool",
+    "create_generate_report_tool",
+    "create_generate_resume_tool",
+    "create_generate_video_presentation_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py
new file mode 100644
index 0000000000..ab9dbc0ea9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py
@@ -0,0 +1,247 @@
+"""Image generation via litellm; resolves model config from the search space and returns UI-ready payloads."""
+
+import hashlib
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from litellm import aimage_generation
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.db import (
+    ImageGeneration,
+    ImageGenerationConfig,
+    SearchSpace,
+    shielded_async_session,
+)
+from app.services.image_gen_router_service import (
+    IMAGE_GEN_AUTO_MODE_ID,
+    ImageGenRouterService,
+    is_image_gen_auto_mode,
+)
+from app.utils.signed_image_urls import generate_image_token
+
+logger = logging.getLogger(__name__)
+
+# Provider mapping (same as routes)
+_PROVIDER_MAP = {
+    "OPENAI": "openai",
+    "AZURE_OPENAI": "azure",
+    "GOOGLE": "gemini",
+    "VERTEX_AI": "vertex_ai",
+    "BEDROCK": "bedrock",
+    "RECRAFT": "recraft",
+    "OPENROUTER": "openrouter",
+    "XINFERENCE": "xinference",
+    "NSCALE": "nscale",
+}
+
+
+def _build_model_string(
+    provider: str, model_name: str, custom_provider: str | None
+) -> str:
+    if custom_provider:
+        return f"{custom_provider}/{model_name}"
+    prefix = _PROVIDER_MAP.get(provider.upper(), provider.lower())
+    return f"{prefix}/{model_name}"
+
+
+def _get_global_image_gen_config(config_id: int) -> dict | None:
+    """Get a global image gen config by negative ID."""
+    for cfg in config.GLOBAL_IMAGE_GEN_CONFIGS:
+        if cfg.get("id") == config_id:
+            return cfg
+    return None
+
+
+def create_generate_image_tool(
+    search_space_id: int,
+    db_session: AsyncSession,
+):
+    """Create ``generate_image`` with bound search space; DB work uses a per-call session."""
+    del db_session  # use a fresh per-call session, see below
+
+    @tool
+    async def generate_image(
+        prompt: str,
+        n: int = 1,
+    ) -> dict[str, Any]:
+        """
+        Generate an image from a text description using AI image models.
+
+        Use this tool when the user asks you to create, generate, draw, or make an image.
+        The generated image will be displayed directly in the chat.
+
+        Args:
+            prompt: A detailed text description of the image to generate.
+                    Be specific about subject, style, colors, composition, and mood.
+            n: Number of images to generate (1-4). Default: 1
+
+        Returns:
+            A dictionary containing the generated image(s) for display in the chat.
+        """
+        try:
+            # Use a per-call session so concurrent tool calls don't share an
+            # AsyncSession (which is not concurrency-safe). The streaming
+            # task's session is shared across every tool; without isolation,
+            # autoflushes from a concurrent writer poison this tool too.
+            async with shielded_async_session() as session:
+                result = await session.execute(
+                    select(SearchSpace).filter(SearchSpace.id == search_space_id)
+                )
+                search_space = result.scalars().first()
+                if not search_space:
+                    return {"error": "Search space not found"}
+
+                config_id = (
+                    search_space.image_generation_config_id or IMAGE_GEN_AUTO_MODE_ID
+                )
+
+                # Build generation kwargs
+                # NOTE: size, quality, and style are intentionally NOT passed.
+                # Different models support different values for these params
+                # (e.g. DALL-E 3 wants "hd"/"standard" for quality while
+                # gpt-image-1 wants "high"/"medium"/"low"; size options also
+                # differ). Letting the model use its own defaults avoids errors.
+                gen_kwargs: dict[str, Any] = {}
+                if n is not None and n > 1:
+                    gen_kwargs["n"] = n
+
+                # Call litellm based on config type
+                if is_image_gen_auto_mode(config_id):
+                    if not ImageGenRouterService.is_initialized():
+                        return {
+                            "error": "No image generation models configured. "
+                            "Please add an image model in Settings > Image Models."
+                        }
+                    response = await ImageGenRouterService.aimage_generation(
+                        prompt=prompt, model="auto", **gen_kwargs
+                    )
+                elif config_id < 0:
+                    cfg = _get_global_image_gen_config(config_id)
+                    if not cfg:
+                        return {
+                            "error": f"Image generation config {config_id} not found"
+                        }
+
+                    model_string = _build_model_string(
+                        cfg.get("provider", ""),
+                        cfg["model_name"],
+                        cfg.get("custom_provider"),
+                    )
+                    gen_kwargs["api_key"] = cfg.get("api_key")
+                    if cfg.get("api_base"):
+                        gen_kwargs["api_base"] = cfg["api_base"]
+                    if cfg.get("api_version"):
+                        gen_kwargs["api_version"] = cfg["api_version"]
+                    if cfg.get("litellm_params"):
+                        gen_kwargs.update(cfg["litellm_params"])
+
+                    response = await aimage_generation(
+                        prompt=prompt, model=model_string, **gen_kwargs
+                    )
+                else:
+                    # Positive ID = user-created ImageGenerationConfig
+                    cfg_result = await session.execute(
+                        select(ImageGenerationConfig).filter(
+                            ImageGenerationConfig.id == config_id
+                        )
+                    )
+                    db_cfg = cfg_result.scalars().first()
+                    if not db_cfg:
+                        return {
+                            "error": f"Image generation config {config_id} not found"
+                        }
+
+                    model_string = _build_model_string(
+                        db_cfg.provider.value,
+                        db_cfg.model_name,
+                        db_cfg.custom_provider,
+                    )
+                    gen_kwargs["api_key"] = db_cfg.api_key
+                    if db_cfg.api_base:
+                        gen_kwargs["api_base"] = db_cfg.api_base
+                    if db_cfg.api_version:
+                        gen_kwargs["api_version"] = db_cfg.api_version
+                    if db_cfg.litellm_params:
+                        gen_kwargs.update(db_cfg.litellm_params)
+
+                    response = await aimage_generation(
+                        prompt=prompt, model=model_string, **gen_kwargs
+                    )
+
+                # Parse the response and store in DB
+                response_dict = (
+                    response.model_dump()
+                    if hasattr(response, "model_dump")
+                    else dict(response)
+                )
+
+                # Generate a random access token for this image
+                access_token = generate_image_token()
+
+                # Save to image_generations table for history
+                db_image_gen = ImageGeneration(
+                    prompt=prompt,
+                    model=getattr(response, "_hidden_params", {}).get("model"),
+                    n=n,
+                    image_generation_config_id=config_id,
+                    response_data=response_dict,
+                    search_space_id=search_space_id,
+                    access_token=access_token,
+                )
+                session.add(db_image_gen)
+                await session.commit()
+                await session.refresh(db_image_gen)
+                db_image_gen_id = db_image_gen.id
+
+            # Extract image URLs from response
+            images = response_dict.get("data", [])
+            if not images:
+                return {"error": "No images were generated"}
+
+            first_image = images[0]
+            revised_prompt = first_image.get("revised_prompt", prompt)
+
+            # Resolve image URL:
+            # - If the API returned a URL, use it directly.
+            # - If the API returned b64_json (e.g. gpt-image-1), serve the
+            #   image through our backend endpoint to avoid bloating the
+            #   LLM context with megabytes of base64 data.
+            if first_image.get("url"):
+                image_url = first_image["url"]
+            elif first_image.get("b64_json"):
+                backend_url = config.BACKEND_URL or "http://localhost:8000"
+                image_url = (
+                    f"{backend_url}/api/v1/image-generations/"
+                    f"{db_image_gen_id}/image?token={access_token}"
+                )
+            else:
+                return {"error": "No displayable image data in the response"}
+
+            image_id = f"image-{hashlib.md5(image_url.encode()).hexdigest()[:12]}"
+
+            return {
+                "id": image_id,
+                "assetId": image_url,
+                "src": image_url,
+                "alt": revised_prompt or prompt,
+                "title": "Generated Image",
+                "description": revised_prompt if revised_prompt != prompt else None,
+                "domain": "ai-generated",
+                "ratio": "auto",
+                "generated": True,
+                "prompt": prompt,
+                "image_count": len(images),
+            }
+
+        except Exception as e:
+            logger.exception("Image generation failed in tool")
+            return {
+                "error": f"Image generation failed: {e!s}",
+                "prompt": prompt,
+            }
+
+    return generate_image
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/index.py
new file mode 100644
index 0000000000..5f76f1d52d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/index.py
@@ -0,0 +1,55 @@
+"""``deliverables`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .generate_image import create_generate_image_tool
+from .podcast import create_generate_podcast_tool
+from .report import create_generate_report_tool
+from .resume import create_generate_resume_tool
+from .video_presentation import create_generate_video_presentation_tool
+
+NAME = "deliverables"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    return [
+        create_generate_podcast_tool(
+            search_space_id=d["search_space_id"],
+            db_session=d["db_session"],
+            thread_id=d["thread_id"],
+        ),
+        create_generate_video_presentation_tool(
+            search_space_id=d["search_space_id"],
+            db_session=d["db_session"],
+            thread_id=d["thread_id"],
+        ),
+        create_generate_report_tool(
+            search_space_id=d["search_space_id"],
+            thread_id=d["thread_id"],
+            connector_service=d.get("connector_service"),
+            available_connectors=d.get("available_connectors"),
+            available_document_types=d.get("available_document_types"),
+        ),
+        create_generate_resume_tool(
+            search_space_id=d["search_space_id"],
+            thread_id=d["thread_id"],
+        ),
+        create_generate_image_tool(
+            search_space_id=d["search_space_id"],
+            db_session=d["db_session"],
+        ),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/podcast.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/podcast.py
new file mode 100644
index 0000000000..55d9b35654
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/podcast.py
@@ -0,0 +1,92 @@
+"""Factory for a podcast-generation tool that queues background work and returns an ID for polling."""
+
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Podcast, PodcastStatus, shielded_async_session
+
+
+def create_generate_podcast_tool(
+    search_space_id: int,
+    db_session: AsyncSession,
+    thread_id: int | None = None,
+):
+    """Create ``generate_podcast`` with bound search space and thread; DB writes use a tool-local session."""
+    del db_session  # writes use a fresh tool-local session, see below
+
+    @tool
+    async def generate_podcast(
+        source_content: str,
+        podcast_title: str = "SurfSense Podcast",
+        user_prompt: str | None = None,
+    ) -> dict[str, Any]:
+        """
+        Generate a podcast from the provided content.
+
+        Use this tool when the user asks to create, generate, or make a podcast.
+        Common triggers include phrases like:
+        - "Give me a podcast about this"
+        - "Create a podcast from this conversation"
+        - "Generate a podcast summary"
+        - "Make a podcast about..."
+        - "Turn this into a podcast"
+
+        Args:
+            source_content: The text content to convert into a podcast.
+            podcast_title: Title for the podcast (default: "SurfSense Podcast")
+            user_prompt: Optional instructions for podcast style, tone, or format.
+
+        Returns:
+            A dictionary containing:
+            - status: PodcastStatus value (pending, generating, or failed)
+            - podcast_id: The podcast ID for polling (when status is pending or generating)
+            - title: The podcast title
+            - message: Status message (or "error" field if status is failed)
+        """
+        try:
+            # One DB session per tool call so parallel invocations never share an AsyncSession.
+            async with shielded_async_session() as session:
+                podcast = Podcast(
+                    title=podcast_title,
+                    status=PodcastStatus.PENDING,
+                    search_space_id=search_space_id,
+                    thread_id=thread_id,
+                )
+                session.add(podcast)
+                await session.commit()
+                await session.refresh(podcast)
+                podcast_id = podcast.id
+
+            from app.tasks.celery_tasks.podcast_tasks import (
+                generate_content_podcast_task,
+            )
+
+            task = generate_content_podcast_task.delay(
+                podcast_id=podcast_id,
+                source_content=source_content,
+                search_space_id=search_space_id,
+                user_prompt=user_prompt,
+            )
+
+            print(f"[generate_podcast] Created podcast {podcast_id}, task: {task.id}")
+
+            return {
+                "status": PodcastStatus.PENDING.value,
+                "podcast_id": podcast_id,
+                "title": podcast_title,
+                "message": "Podcast generation started. This may take a few minutes.",
+            }
+
+        except Exception as e:
+            error_message = str(e)
+            print(f"[generate_podcast] Error: {error_message}")
+            return {
+                "status": PodcastStatus.FAILED.value,
+                "error": error_message,
+                "title": podcast_title,
+                "podcast_id": None,
+            }
+
+    return generate_podcast
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/report.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/report.py
new file mode 100644
index 0000000000..385100c62b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/report.py
@@ -0,0 +1,1061 @@
+"""Factory for inline Markdown reports: optional KB sourcing, section-aware revision, short-lived DB sessions."""
+
+import asyncio
+import json
+import logging
+import re
+from typing import Any
+
+from langchain_core.callbacks import dispatch_custom_event
+from langchain_core.messages import HumanMessage
+from langchain_core.tools import tool
+
+from app.db import Report, shielded_async_session
+from app.services.connector_service import ConnectorService
+from app.services.llm_service import get_document_summary_llm
+
+logger = logging.getLogger(__name__)
+
+# ─── Shared Formatting Rules ────────────────────────────────────────────────
+# Reusable formatting instructions appended to section-level and review prompts.
+
+_FORMATTING_RULES = """\
+- IMPORTANT: Output raw Markdown directly. Do NOT wrap the entire output in a \
+code fence (e.g. ```markdown, ````markdown, or any backtick fence). Individual \
+code examples and diagrams inside the report should still use fenced code blocks, \
+but the report itself must NOT be enclosed in one.
+- Maintain proper Markdown formatting throughout.
+- When including code examples, ALWAYS format them as proper fenced code blocks \
+with the correct language identifier (e.g. ```java, ```python). Code inside code \
+blocks MUST have proper line breaks and indentation — NEVER put multiple statements \
+on a single line. Each statement, brace, and logical block must be on its own line \
+with correct indentation.
+- When including Mermaid diagrams, use ```mermaid fenced code blocks. Each Mermaid \
+statement MUST be on its own line — NEVER use semicolons to join multiple statements \
+on one line. For line breaks inside node labels, use <br> (NOT <br/>).
+- When including mathematical formulas or equations, ALWAYS use LaTeX notation. \
+NEVER use backtick code spans or Unicode symbols for math."""
+
+# ─── Standard Report Footer ─────────────────────────────────────────────────
+# Appended to every generated report after content generation.
+
+_REPORT_FOOTER = "Powered by SurfSense AI."
+
+# ─── Prompt: Single-Shot Report Generation ───────────────────────────────────
+
+_REPORT_PROMPT = """You are an expert report writer. Generate a comprehensive Markdown report.
+
+**Topic:** {topic}
+**Report Style:** {report_style}
+{user_instructions_section}
+{previous_version_section}
+
+**Source Content:**
+{source_content}
+
+---
+
+{length_instruction}
+
+Write a well-structured Markdown report with a # title, executive summary, organized sections, and conclusion. Cite facts from the source content. Be thorough and professional.
+
+{formatting_rules}
+"""
+
+# ─── Prompt: Full-Document Revision (fallback when section-level fails) ──────
+
+_REVISION_PROMPT = """You are an expert report editor. Apply ONLY the requested changes — do NOT rewrite from scratch.
+
+**Topic:** {topic}
+**Report Style:** {report_style}
+**Modification Instructions:** {user_instructions_section}
+
+**Source Content (use if relevant):**
+{source_content}
+
+---
+
+**EXISTING REPORT:**
+
+{previous_report_content}
+
+---
+
+{length_instruction}
+
+Preserve all structure and content not affected by the modification.
+
+{formatting_rules}
+"""
+
+# ─── Prompt: Section-Level Revision — Identify Affected Sections ─────────────
+
+_IDENTIFY_SECTIONS_PROMPT = """You are analyzing a Markdown report to determine which sections need modification based on the user's request.
+
+**User's Modification Request:** {user_instructions}
+
+**Report Sections (indexed starting at 0):**
+{sections_listing}
+
+---
+
+Determine which sections need to be modified, added, or removed to fulfill the user's request.
+
+Return ONLY a JSON object with these fields:
+- "modify": Array of section indices (0-based) that need content changes
+- "add": Array of objects like {{"after_index": 2, "heading": "## New Section Title", "description": "What this section should cover"}} for new sections to insert
+- "remove": Array of section indices to remove entirely (use sparingly)
+- "reasoning": A brief explanation of your decisions
+
+Guidelines:
+- If the change is GLOBAL (e.g., "change the tone", "make the whole report shorter", "translate to Spanish"), include ALL section indices in "modify".
+- If the change is TARGETED (e.g., "expand the budget section", "fix the conclusion"), include ONLY the affected section indices.
+- For "add a section about X", use the "add" field with the appropriate insertion point.
+- Prefer modifying over removing+adding when possible.
+
+Return ONLY valid JSON, no markdown fences:
+"""
+
+# ─── Prompt: Section-Level Revision — Revise a Single Section ────────────────
+
+_REVISE_SECTION_PROMPT = """Revise ONLY this section based on the instructions. If the instructions don't apply, return it UNCHANGED.
+
+**Modification Instructions:** {user_instructions}
+
+**Current Section:**
+{section_content}
+
+**Context (surrounding sections — for coherence only, do NOT output them):**
+{context_sections}
+
+**Source Content:**
+{source_content}
+
+---
+
+Keep the same heading and heading level. Preserve content not affected by the modification.
+{formatting_rules}
+"""
+
+# ─── Prompt: New Section Generation (for section-level add) ─────────────────
+
+_NEW_SECTION_PROMPT = """You are an expert report writer. Write a new section to be inserted into an existing report.
+
+**Report Topic:** {topic}
+**Report Style:** {report_style}
+**Section Heading:** {heading}
+**Section Goal:** {description}
+**User Instructions:** {user_instructions}
+
+**Surrounding Context:**
+{context_sections}
+
+**Source Content:**
+{source_content}
+
+---
+
+**Rules:**
+1. Write ONLY this section, starting with the heading "{heading}".
+2. Ensure the section flows naturally with the surrounding context.
+3. Be comprehensive — cover the topic described above.
+{formatting_rules}
+
+Write the new section now:
+"""
+
+
+# ─── Utility Functions ──────────────────────────────────────────────────────
+
+
+def _strip_wrapping_code_fences(text: str) -> str:
+    """Remove wrapping code fences that LLMs often add around Markdown output.
+
+    Handles patterns like:
+        ```markdown\\n...content...\\n```
+        ````markdown\\n...content...\\n````
+        ```md\\n...content...\\n```
+        ```\\n...content...\\n```
+        ```json\\n...content...\\n```
+    Supports 3 or more backticks (LLMs escalate when content has triple-backtick blocks).
+    """
+    stripped = text.strip()
+    # Match opening fence with 3+ backticks and optional language tag
+    m = re.match(r"^(`{3,})(?:markdown|md|json)?\s*\n", stripped)
+    if m:
+        fence = m.group(1)  # e.g. "```" or "````"
+        if stripped.endswith(fence):
+            stripped = stripped[m.end() :]  # remove opening fence
+            stripped = stripped[: -len(fence)].rstrip()  # remove closing fence
+    return stripped
+
+
+def _extract_metadata(content: str) -> dict[str, Any]:
+    """Extract metadata from generated Markdown content."""
+    # Count section headings
+    headings = re.findall(r"^(#{1,6})\s+(.+)$", content, re.MULTILINE)
+
+    # Word count
+    word_count = len(content.split())
+
+    # Character count
+    char_count = len(content)
+
+    return {
+        "status": "ready",
+        "word_count": word_count,
+        "char_count": char_count,
+        "section_count": len(headings),
+    }
+
+
+def _parse_sections(content: str) -> list[dict[str, str]]:
+    """Parse Markdown content into sections split by # and ## headings.
+
+    Returns a list of dicts: [{"heading": "## Title", "body": "content..."}, ...]
+    Content before the first heading is captured with heading="".
+    ### and deeper headings are kept inside their parent ## section's body.
+    """
+    lines = content.split("\n")
+    sections: list[dict[str, str]] = []
+    current_heading = ""
+    current_body_lines: list[str] = []
+    in_code_block = False
+
+    for line in lines:
+        # Track code blocks to avoid matching headings inside them
+        stripped = line.strip()
+        if stripped.startswith("```"):
+            in_code_block = not in_code_block
+
+        # Only split on # or ## headings (not ### or deeper) and only outside code blocks
+        is_section_heading = (
+            not in_code_block
+            and re.match(r"^#{1,2}\s+", line)
+            and not re.match(r"^#{3,}\s+", line)
+        )
+
+        if is_section_heading:
+            # Save previous section
+            if current_heading or current_body_lines:
+                sections.append(
+                    {
+                        "heading": current_heading,
+                        "body": "\n".join(current_body_lines).strip(),
+                    }
+                )
+            current_heading = line.strip()
+            current_body_lines = []
+        else:
+            current_body_lines.append(line)
+
+    # Save last section
+    if current_heading or current_body_lines:
+        sections.append(
+            {
+                "heading": current_heading,
+                "body": "\n".join(current_body_lines).strip(),
+            }
+        )
+
+    return sections
+
+
+def _stitch_sections(sections: list[dict[str, str]]) -> str:
+    """Stitch parsed sections back into a single Markdown string."""
+    parts = []
+    for section in sections:
+        if section["heading"]:
+            parts.append(section["heading"])
+        if section["body"]:
+            parts.append(section["body"])
+    return "\n\n".join(parts)
+
+
+# ─── Async Generation Helpers ───────────────────────────────────────────────
+
+
+async def _revise_with_sections(
+    llm: Any,
+    parent_content: str,
+    user_instructions: str,
+    source_content: str,
+    topic: str,
+    report_style: str,
+) -> str | None:
+    """Section-level revision: identify affected sections and revise only those.
+
+    Unchanged sections are kept byte-for-byte identical.
+    Returns the revised content, or None to trigger full-document revision fallback.
+    """
+    # Parse report into sections
+    sections = _parse_sections(parent_content)
+    if len(sections) < 2:
+        logger.info(
+            "[generate_report] Too few sections for section-level revision, using full revision"
+        )
+        return None
+
+    # Build a sections listing for the LLM
+    sections_listing = ""
+    for i, sec in enumerate(sections):
+        heading = sec["heading"] or "(preamble — content before first heading)"
+        body_preview = (
+            sec["body"][:200] + "..." if len(sec["body"]) > 200 else sec["body"]
+        )
+        sections_listing += f"\n[{i}] {heading}\n    Preview: {body_preview}\n"
+
+    # Step 1: Ask LLM which sections need modification
+    identify_prompt = _IDENTIFY_SECTIONS_PROMPT.format(
+        user_instructions=user_instructions,
+        sections_listing=sections_listing,
+    )
+
+    try:
+        response = await llm.ainvoke([HumanMessage(content=identify_prompt)])
+        raw = response.content
+        if not raw or not isinstance(raw, str):
+            return None
+
+        raw = _strip_wrapping_code_fences(raw).strip()
+        json_match = re.search(r"\{[\s\S]*\}", raw)
+        if json_match:
+            raw = json_match.group(0)
+
+        plan = json.loads(raw)
+        modify_indices: list[int] = plan.get("modify", [])
+        add_sections: list[dict[str, Any]] = plan.get("add", [])
+        remove_indices: list[int] = plan.get("remove", [])
+        reasoning = plan.get("reasoning", "")
+
+        logger.info(
+            f"[generate_report] Section-level revision plan: "
+            f"modify={modify_indices}, add={len(add_sections)}, "
+            f"remove={remove_indices}, reasoning={reasoning}"
+        )
+    except Exception:
+        logger.warning(
+            "[generate_report] Failed to identify sections for revision, "
+            "falling back to full revision",
+            exc_info=True,
+        )
+        return None
+
+    # If ALL sections need modification, full revision is more efficient and coherent
+    if len(modify_indices) >= len(sections):
+        logger.info(
+            "[generate_report] All sections need modification, deferring to full revision"
+        )
+        return None
+
+    # Compute total operations for progress tracking
+    total_ops = len(modify_indices) + len(add_sections)
+    current_op = 0
+
+    # Emit plan summary
+    parts = []
+    if modify_indices:
+        parts.append(
+            f"modifying {len(modify_indices)} section{'s' if len(modify_indices) > 1 else ''}"
+        )
+    if add_sections:
+        parts.append(
+            f"adding {len(add_sections)} new section{'s' if len(add_sections) > 1 else ''}"
+        )
+    if remove_indices:
+        parts.append(
+            f"removing {len(remove_indices)} section{'s' if len(remove_indices) > 1 else ''}"
+        )
+    plan_summary = ", ".join(parts) if parts else "no changes needed"
+
+    dispatch_custom_event(
+        "report_progress",
+        {
+            "phase": "revision_plan",
+            "message": plan_summary.capitalize(),
+            "modify_count": len(modify_indices),
+            "add_count": len(add_sections),
+            "remove_count": len(remove_indices),
+            "total_ops": total_ops,
+        },
+    )
+
+    # Step 2: Revise only the affected sections
+    revised_sections = list(sections)  # shallow copy — unmodified sections stay as-is
+
+    for idx in modify_indices:
+        if idx < 0 or idx >= len(sections):
+            continue
+
+        current_op += 1
+        sec = sections[idx]
+
+        # Extract plain section name (strip markdown heading markers)
+        section_name = (
+            re.sub(r"^#+\s*", "", sec["heading"]).strip()
+            if sec["heading"]
+            else "Preamble"
+        )
+        dispatch_custom_event(
+            "report_progress",
+            {
+                "phase": "revising_section",
+                "message": f"Revising: {section_name} ({current_op}/{total_ops})...",
+            },
+        )
+
+        section_content = (
+            f"{sec['heading']}\n\n{sec['body']}" if sec["heading"] else sec["body"]
+        )
+
+        # Build context from surrounding sections
+        context_parts = []
+        if idx > 0:
+            prev = sections[idx - 1]
+            prev_preview = prev["body"][:300] + (
+                "..." if len(prev["body"]) > 300 else ""
+            )
+            context_parts.append(
+                f"**Previous section:** {prev['heading']}\n{prev_preview}"
+            )
+        if idx < len(sections) - 1:
+            nxt = sections[idx + 1]
+            nxt_preview = nxt["body"][:300] + ("..." if len(nxt["body"]) > 300 else "")
+            context_parts.append(f"**Next section:** {nxt['heading']}\n{nxt_preview}")
+        context = (
+            "\n\n".join(context_parts) if context_parts else "(No surrounding sections)"
+        )
+
+        revise_prompt = _REVISE_SECTION_PROMPT.format(
+            user_instructions=user_instructions,
+            section_content=section_content,
+            context_sections=context,
+            source_content=source_content[:40000],
+            formatting_rules=_FORMATTING_RULES,
+        )
+
+        resp = await llm.ainvoke([HumanMessage(content=revise_prompt)])
+        revised_text = resp.content
+        if revised_text and isinstance(revised_text, str):
+            revised_text = _strip_wrapping_code_fences(revised_text).strip()
+            # Parse the LLM output back into heading + body
+            revised_parsed = _parse_sections(revised_text)
+            if revised_parsed:
+                revised_sections[idx] = revised_parsed[0]
+            else:
+                revised_sections[idx] = {
+                    "heading": sec["heading"],
+                    "body": revised_text,
+                }
+
+        logger.info(f"[generate_report] Revised section [{idx}]: {sec['heading']}")
+
+    # Step 3: Handle new section additions (insert in reverse order to preserve indices)
+    for add_info in sorted(
+        add_sections,
+        key=lambda x: x.get("after_index", len(revised_sections) - 1),
+        reverse=True,
+    ):
+        current_op += 1
+        after_idx = add_info.get("after_index", len(revised_sections) - 1)
+        heading = add_info.get("heading", "## New Section")
+        description = add_info.get("description", "")
+
+        # Extract plain section name for progress display
+        plain_heading = re.sub(r"^#+\s*", "", heading).strip()
+        dispatch_custom_event(
+            "report_progress",
+            {
+                "phase": "adding_section",
+                "message": f"Adding: {plain_heading} ({current_op}/{total_ops})...",
+            },
+        )
+
+        # Build context from the surrounding sections at the insertion point
+        ctx_parts = []
+        if 0 <= after_idx < len(revised_sections):
+            before_sec = revised_sections[after_idx]
+            ctx_parts.append(
+                f"**Section before:** {before_sec['heading']}\n{before_sec['body'][:300]}"
+            )
+        insert_idx = min(after_idx + 1, len(revised_sections))
+        if insert_idx < len(revised_sections):
+            after_sec = revised_sections[insert_idx]
+            ctx_parts.append(
+                f"**Section after:** {after_sec['heading']}\n{after_sec['body'][:300]}"
+            )
+
+        new_prompt = _NEW_SECTION_PROMPT.format(
+            topic=topic,
+            report_style=report_style,
+            heading=heading,
+            description=description,
+            user_instructions=user_instructions,
+            context_sections="\n\n".join(ctx_parts) if ctx_parts else "(None)",
+            source_content=source_content[:30000],
+            formatting_rules=_FORMATTING_RULES,
+        )
+
+        resp = await llm.ainvoke([HumanMessage(content=new_prompt)])
+        new_content = resp.content
+        if new_content and isinstance(new_content, str):
+            new_content = _strip_wrapping_code_fences(new_content).strip()
+            new_parsed = _parse_sections(new_content)
+            if new_parsed:
+                revised_sections.insert(insert_idx, new_parsed[0])
+            else:
+                revised_sections.insert(
+                    insert_idx,
+                    {
+                        "heading": heading,
+                        "body": new_content,
+                    },
+                )
+
+        logger.info(
+            f"[generate_report] Added new section after [{after_idx}]: {heading}"
+        )
+
+    # Step 4: Handle removals (reverse order to preserve indices)
+    for idx in sorted(remove_indices, reverse=True):
+        if 0 <= idx < len(revised_sections):
+            logger.info(
+                f"[generate_report] Removed section [{idx}]: "
+                f"{revised_sections[idx]['heading']}"
+            )
+            revised_sections.pop(idx)
+
+    return _stitch_sections(revised_sections)
+
+
+# ─── Tool Factory ───────────────────────────────────────────────────────────
+
+
+def create_generate_report_tool(
+    search_space_id: int,
+    thread_id: int | None = None,
+    connector_service: ConnectorService | None = None,
+    available_connectors: list[str] | None = None,
+    available_document_types: list[str] | None = None,
+):
+    """
+    Factory function to create the generate_report tool with injected dependencies.
+
+    The tool generates a Markdown report inline using the search space's
+    document summary LLM, saves it to the database, and returns immediately.
+
+    Uses short-lived database sessions for each DB operation so no connection
+    is held during the long LLM API call.
+
+    Generation strategies:
+      - New reports: single-shot generation (1 LLM call)
+      - Revisions (targeted edits): section-level (unchanged sections preserved)
+      - Revisions (global changes): full-document revision fallback
+
+    Source strategies:
+      - "provided"/"conversation": use only the supplied source_content
+      - "kb_search": search the knowledge base internally using targeted queries
+      - "auto": use source_content if sufficient, otherwise fall back to KB search
+
+    Args:
+        search_space_id: The user's search space ID
+        thread_id: The chat thread ID for associating the report
+        connector_service: Optional connector service for internal KB search.
+            When provided, the tool can search the knowledge base internally
+            (used by the "kb_search" and "auto" source strategies).
+        available_connectors: Optional list of connector types available in the
+            search space (used to scope internal KB searches).
+
+    Returns:
+        A configured tool function for generating reports
+    """
+
+    @tool
+    async def generate_report(
+        topic: str,
+        source_content: str = "",
+        source_strategy: str = "provided",
+        search_queries: list[str] | None = None,
+        report_style: str = "detailed",
+        user_instructions: str | None = None,
+        parent_report_id: int | None = None,
+    ) -> dict[str, Any]:
+        """
+        Generate a structured Markdown report artifact from provided content.
+
+        Use this tool when the user asks to create, generate, write, produce,
+        draft, or summarize into a report-style deliverable.
+
+        Trigger classes include:
+        - Direct trigger words WITH creation/modification verb: report,
+          document, memo, letter, template, article, guide, blog post,
+          one-pager, briefing, comprehensive guide.
+        - Creation-intent phrases: "write a report", "generate a document",
+          "draft a summary", "create an executive summary".
+        - Modification-intent phrases: "revise the report", "update the
+          report", "make it shorter", "add a section about X", "expand the
+          budget section", "rewrite in formal tone".
+
+        IMPORTANT — what does NOT count as "asking for a report":
+        - Questions or discussion about a report or its topic are NOT report
+          requests. Respond to these conversationally in chat.
+          Examples: "What other examples to put there?", "What else could be
+          added?", "Can you explain section 2?", "Is the data accurate?",
+          "What's missing?", "How could this be improved?", "What other
+          topics are related?"
+        - Quick summary requests, explanations, or follow-up questions.
+        - The test: Does the message contain a creation/modification VERB
+          (write, create, generate, draft, add, revise, update, expand,
+          rewrite, make) directed at producing a deliverable? If no verb
+          → answer in chat.
+
+        FORMAT/EXPORT RULE:
+        - Always generate the report content in Markdown.
+        - If the user requests DOCX/Word/PDF or another file format, export
+          from the generated Markdown report.
+
+        SOURCE STRATEGY (how to collect source material):
+        - source_strategy="conversation" — The conversation already has
+          enough context (prior Q&A, filesystem exploration, pasted text,
+          uploaded files, scraped webpages). Pass a thorough summary as
+          source_content.
+        - source_strategy="kb_search" — Search the knowledge base
+          internally. Provide 1-5 targeted search_queries. The tool
+          handles searching internally — do NOT manually read and dump
+          /documents/ files into source_content.
+        - source_strategy="provided" — Use only what is in source_content
+          (default, backward-compatible).
+        - source_strategy="auto" — Use source_content if it has enough
+          material; otherwise fall back to internal KB search using
+          search_queries.
+
+        CONVERSATION REUSE (HIGH PRIORITY):
+        - If the user has been asking questions in this chat and the
+          conversation contains substantive answers/discussion on the
+          topic, prefer source_strategy="conversation" with a thorough
+          summary of the full chat history as source_content.
+        - The user's prior questions and your answers ARE the source
+          material. Do NOT redundantly search the knowledge base for
+          information that is already in the chat.
+
+        VERSIONING — parent_report_id:
+        - Set parent_report_id when the user wants to MODIFY, REVISE,
+          IMPROVE, UPDATE, EXPAND, or ADD CONTENT TO an existing report
+          that was already generated in this conversation.
+        - This includes both explicit AND implicit modification requests.
+          If the user references the existing report using words like "it",
+          "this", "here", "the report", or clearly refers to a previously
+          generated report, treat it as a revision request.
+        - The value must be the report_id from a previous generate_report
+          result in this same conversation.
+        - Do NOT set parent_report_id when:
+          * The user asks for a report on a completely NEW/DIFFERENT topic
+          * The user says "generate another report" (new report, not revision)
+          * There is no prior report to reference
+
+        Examples of when to SET parent_report_id:
+          User: "Make that report shorter" → parent_report_id = <previous report_id>
+          User: "Add a cost analysis section to the report" → parent_report_id = <previous report_id>
+          User: "Rewrite the report in a more formal tone" → parent_report_id = <previous report_id>
+          User: "I want more details about pricing in here" → parent_report_id = <previous report_id>
+          User: "Include more examples" → parent_report_id = <previous report_id>
+          User: "Can you also cover nutrition in this?" → parent_report_id = <previous report_id>
+          User: "Make it more detailed" → parent_report_id = <previous report_id>
+          User: "Not bad, but expand on the budget section" → parent_report_id = <previous report_id>
+          User: "Also mention the competitor landscape" → parent_report_id = <previous report_id>
+
+        Examples of when to LEAVE parent_report_id as None:
+          User: "Generate a report on climate change" → None (new topic)
+          User: "Write me a report about the budget" → None (new topic)
+          User: "Create another report, this time about marketing" → None
+          User: "Now write one about travel trends in Europe" → None (new topic)
+
+        Args:
+            topic: Short title for the report (max ~8 words).
+            source_content: Text to base the report on. Can be empty when
+                using source_strategy="kb_search".
+            source_strategy: How to collect source material. One of
+                "provided", "conversation", "kb_search", or "auto".
+            search_queries: When source_strategy is "kb_search" or "auto",
+                provide 1-5 targeted search queries for the knowledge base.
+                These should be specific, not just the topic repeated.
+            report_style: "detailed", "deep_research", or "brief".
+            user_instructions: Optional focus or modification instructions.
+                When revising (parent_report_id set), describe WHAT TO CHANGE.
+            parent_report_id: ID of a previous report to revise (creates new
+                version in the same version group).
+
+        Returns:
+            Dict with status, report_id, title, word_count, and message.
+        """
+        # Initialize version tracking variables (used by _save_failed_report closure)
+        parent_report_content: str | None = None
+        report_group_id: int | None = None
+
+        async def _save_failed_report(error_msg: str) -> int | None:
+            """Persist a failed report row using a short-lived session."""
+            try:
+                async with shielded_async_session() as session:
+                    failed_report = Report(
+                        title=topic,
+                        content=None,
+                        report_metadata={
+                            "status": "failed",
+                            "error_message": error_msg,
+                        },
+                        report_style=report_style,
+                        search_space_id=search_space_id,
+                        thread_id=thread_id,
+                        report_group_id=report_group_id,
+                    )
+                    session.add(failed_report)
+                    await session.commit()
+                    await session.refresh(failed_report)
+                    # If this is a new group (v1 failed), set group to self
+                    if not failed_report.report_group_id:
+                        failed_report.report_group_id = failed_report.id
+                        await session.commit()
+                    logger.info(
+                        f"[generate_report] Saved failed report {failed_report.id}: {error_msg}"
+                    )
+                    return failed_report.id
+            except Exception:
+                logger.exception(
+                    "[generate_report] Could not persist failed report row"
+                )
+                return None
+
+        try:
+            # ── Phase 1: READ (short-lived session) ──────────────────────
+            # Fetch parent report and LLM config, then close the session
+            # so no DB connection is held during the long LLM call.
+            async with shielded_async_session() as read_session:
+                if parent_report_id:
+                    parent_report = await read_session.get(Report, parent_report_id)
+                    if parent_report:
+                        report_group_id = parent_report.report_group_id
+                        parent_report_content = parent_report.content
+                        logger.info(
+                            f"[generate_report] Creating new version from parent {parent_report_id} "
+                            f"(group {report_group_id})"
+                        )
+                    else:
+                        logger.warning(
+                            f"[generate_report] parent_report_id={parent_report_id} not found, "
+                            "creating standalone report"
+                        )
+
+                llm = await get_document_summary_llm(read_session, search_space_id)
+            # read_session closed — connection returned to pool
+
+            if not llm:
+                error_msg = (
+                    "No LLM configured. Please configure a language model in Settings."
+                )
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": topic,
+                }
+
+            # Build the user instructions string
+            user_instructions_section = ""
+            if user_instructions:
+                user_instructions_section = (
+                    f"**Additional Instructions:** {user_instructions}"
+                )
+
+            # ── Phase 1b: SOURCE COLLECTION (smart KB search) ────────────
+            # Decide whether to augment source_content with KB search results.
+            effective_source = source_content or ""
+
+            strategy = (source_strategy or "provided").lower().strip()
+
+            needs_kb_search = False
+            if strategy == "kb_search":
+                needs_kb_search = True
+            elif strategy == "auto":
+                # Heuristic: if source_content has fewer than 200 words,
+                # it's likely insufficient — augment with KB search.
+                word_count_estimate = len(effective_source.split())
+                if word_count_estimate < 200:
+                    needs_kb_search = True
+                    logger.info(
+                        f"[generate_report] auto strategy: source has ~{word_count_estimate} words, "
+                        "triggering KB search"
+                    )
+            # "provided" and "conversation" → use source_content as-is
+
+            if needs_kb_search and connector_service and search_queries:
+                query_count = min(len(search_queries), 5)
+                dispatch_custom_event(
+                    "report_progress",
+                    {
+                        "phase": "kb_search",
+                        "message": f"Searching knowledge base ({query_count} queries)...",
+                    },
+                )
+                logger.info(
+                    f"[generate_report] Running internal KB search with "
+                    f"{query_count} queries: {search_queries[:5]}"
+                )
+                try:
+                    from .knowledge_base import search_knowledge_base_async
+
+                    # Run all queries in parallel, each with its own session
+                    async def _run_single_query(q: str) -> str:
+                        async with shielded_async_session() as kb_session:
+                            kb_connector_svc = ConnectorService(
+                                kb_session, search_space_id
+                            )
+                            return await search_knowledge_base_async(
+                                query=q,
+                                search_space_id=search_space_id,
+                                db_session=kb_session,
+                                connector_service=kb_connector_svc,
+                                top_k=10,
+                                available_connectors=available_connectors,
+                                available_document_types=available_document_types,
+                            )
+
+                    kb_results = await asyncio.gather(
+                        *[_run_single_query(q) for q in search_queries[:5]]
+                    )
+
+                    # Merge non-empty results into source_content
+                    kb_text_parts = [r for r in kb_results if r and r.strip()]
+                    if kb_text_parts:
+                        kb_combined = "\n\n---\n\n".join(kb_text_parts)
+                        if effective_source.strip():
+                            effective_source = (
+                                effective_source
+                                + "\n\n--- Knowledge Base Search Results ---\n\n"
+                                + kb_combined
+                            )
+                        else:
+                            effective_source = kb_combined
+
+                        # Count docs found (rough: count <document> tags)
+                        doc_count = kb_combined.count("<document>")
+                        dispatch_custom_event(
+                            "report_progress",
+                            {
+                                "phase": "kb_search_done",
+                                "message": f"Found {doc_count} relevant documents"
+                                if doc_count
+                                else f"Found results from {len(kb_text_parts)} queries",
+                            },
+                        )
+                        logger.info(
+                            f"[generate_report] KB search added ~{len(kb_combined)} chars "
+                            f"from {len(kb_text_parts)} queries"
+                        )
+                    else:
+                        dispatch_custom_event(
+                            "report_progress",
+                            {
+                                "phase": "kb_search_done",
+                                "message": "No results found in knowledge base",
+                            },
+                        )
+                        logger.info("[generate_report] KB search returned no results")
+
+                except Exception as e:
+                    logger.warning(
+                        f"[generate_report] Internal KB search failed: {e}. "
+                        "Proceeding with existing source_content."
+                    )
+            elif needs_kb_search and not connector_service:
+                logger.warning(
+                    "[generate_report] KB search requested but connector_service "
+                    "not available. Using source_content as-is."
+                )
+            elif needs_kb_search and not search_queries:
+                logger.warning(
+                    "[generate_report] KB search requested but no search_queries "
+                    "provided. Using source_content as-is."
+                )
+
+            capped_source = effective_source[:100000]  # Cap source content
+
+            # Length constraint — only when user explicitly asks for brevity
+            length_instruction = ""
+            if report_style == "brief":
+                length_instruction = (
+                    "**LENGTH CONSTRAINT (MANDATORY):** The user wants a SHORT report. "
+                    "Keep it concise — aim for ~400 words (~1 page) unless a different "
+                    "length is specified in the Additional Instructions above. "
+                    "Prioritize brevity over thoroughness. Do NOT write a long report."
+                )
+
+            # ── Phase 2: LLM GENERATION (no DB connection held) ──────────
+
+            report_content: str | None = None
+
+            if parent_report_content:
+                # ─── REVISION MODE ───────────────────────────────────────
+                # Strategy: Try section-level revision first (preserves
+                # unchanged sections byte-for-byte). Falls back to full-
+                # document revision if section identification fails or if
+                # all sections need changes.
+                dispatch_custom_event(
+                    "report_progress",
+                    {
+                        "phase": "revision_start",
+                        "message": "Analyzing sections to modify...",
+                    },
+                )
+                logger.info(
+                    "[generate_report] Revision mode — attempting section-level revision"
+                )
+                report_content = await _revise_with_sections(
+                    llm=llm,
+                    parent_content=parent_report_content,
+                    user_instructions=user_instructions
+                    or "Improve and refine the report.",
+                    source_content=capped_source,
+                    topic=topic,
+                    report_style=report_style,
+                )
+
+                if report_content is None:
+                    # Fallback: full-document revision
+                    dispatch_custom_event(
+                        "report_progress",
+                        {"phase": "writing", "message": "Rewriting your full report"},
+                    )
+                    logger.info(
+                        "[generate_report] Section-level revision deferred, "
+                        "using full-document revision"
+                    )
+                    prompt = _REVISION_PROMPT.format(
+                        topic=topic,
+                        report_style=report_style,
+                        user_instructions_section=user_instructions_section
+                        or "Improve and refine the report.",
+                        source_content=capped_source,
+                        previous_report_content=parent_report_content,
+                        length_instruction=length_instruction,
+                        formatting_rules=_FORMATTING_RULES,
+                    )
+                    response = await llm.ainvoke([HumanMessage(content=prompt)])
+                    report_content = response.content
+
+            else:
+                # ─── NEW REPORT MODE ─────────────────────────────────────
+                # Single-shot generation: one LLM call produces the full
+                # report. Fast, globally coherent, and cost-efficient.
+                dispatch_custom_event(
+                    "report_progress",
+                    {"phase": "writing", "message": "Writing your report"},
+                )
+                logger.info(
+                    "[generate_report] New report — using single-shot generation"
+                )
+                prompt = _REPORT_PROMPT.format(
+                    topic=topic,
+                    report_style=report_style,
+                    user_instructions_section=user_instructions_section,
+                    previous_version_section="",
+                    source_content=capped_source,
+                    length_instruction=length_instruction,
+                    formatting_rules=_FORMATTING_RULES,
+                )
+                response = await llm.ainvoke([HumanMessage(content=prompt)])
+                report_content = response.content
+
+            # ── Validate LLM output ──────────────────────────────────────
+
+            if not report_content or not isinstance(report_content, str):
+                error_msg = "LLM returned empty or invalid content"
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": topic,
+                }
+
+            # LLMs often wrap output in ```markdown ... ``` fences — strip them
+            report_content = _strip_wrapping_code_fences(report_content)
+
+            if not report_content:
+                error_msg = "LLM returned empty or invalid content"
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": topic,
+                }
+
+            # Strip any existing footer(s) carried over from parent version(s)
+            while report_content.rstrip().endswith(_REPORT_FOOTER):
+                idx = report_content.rstrip().rfind(_REPORT_FOOTER)
+                report_content = report_content[:idx].rstrip()
+                if report_content.rstrip().endswith("---"):
+                    report_content = report_content.rstrip()[:-3].rstrip()
+
+            # Append exactly one standard disclaimer
+            report_content += "\n\n---\n\n" + _REPORT_FOOTER
+
+            # Extract metadata (includes "status": "ready")
+            metadata = _extract_metadata(report_content)
+
+            # ── Phase 3: WRITE (short-lived session) ─────────────────────
+            # Save the report to the database, then close the session.
+            async with shielded_async_session() as write_session:
+                report = Report(
+                    title=topic,
+                    content=report_content,
+                    report_metadata=metadata,
+                    report_style=report_style,
+                    search_space_id=search_space_id,
+                    thread_id=thread_id,
+                    report_group_id=report_group_id,
+                )
+                write_session.add(report)
+                await write_session.commit()
+                await write_session.refresh(report)
+
+                # If this is a brand-new report (v1), set report_group_id = own id
+                if not report.report_group_id:
+                    report.report_group_id = report.id
+                    await write_session.commit()
+
+                saved_report_id = report.id
+                saved_group_id = report.report_group_id
+            # write_session closed — connection returned to pool
+
+            logger.info(
+                f"[generate_report] Created report {saved_report_id} "
+                f"(group={saved_group_id}): "
+                f"{metadata.get('word_count', 0)} words, "
+                f"{metadata.get('section_count', 0)} sections"
+            )
+
+            return {
+                "status": "ready",
+                "report_id": saved_report_id,
+                "title": topic,
+                "word_count": metadata.get("word_count", 0),
+                "is_revision": bool(parent_report_content),
+                "report_markdown": report_content,
+                "message": f"Report generated successfully: {topic}",
+            }
+
+        except Exception as e:
+            error_message = str(e)
+            logger.exception(f"[generate_report] Error: {error_message}")
+            report_id = await _save_failed_report(error_message)
+
+            return {
+                "status": "failed",
+                "error": error_message,
+                "report_id": report_id,
+                "title": topic,
+            }
+
+    return generate_report
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/resume.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/resume.py
new file mode 100644
index 0000000000..ece3ce241b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/resume.py
@@ -0,0 +1,799 @@
+"""Resume as Typst: LLM fills the body; backend prepends a template from ``_TEMPLATES`` and compiles."""
+
+import io
+import logging
+import re
+from datetime import UTC, datetime
+from typing import Any
+
+import pypdf
+import typst
+from langchain_core.callbacks import dispatch_custom_event
+from langchain_core.messages import HumanMessage
+from langchain_core.tools import tool
+
+from app.db import Report, shielded_async_session
+from app.services.llm_service import get_document_summary_llm
+
+logger = logging.getLogger(__name__)
+
+
+# ─── Template Registry ───────────────────────────────────────────────────────
+# Each template defines:
+#   header              - Typst import + show rule with {name}, {year}, {month}, {day} placeholders
+#   component_reference - component docs shown to the LLM
+#   rules               - generation rules for the LLM
+
+_TEMPLATES: dict[str, dict[str, str]] = {
+    "classic": {
+        "header": """\
+#import "@preview/rendercv:0.3.0": *
+
+#show: rendercv.with(
+  name: "{name}",
+  title: "{name} - Resume",
+  footer: context {{ [#emph[{name} -- #str(here().page())\\/#str(counter(page).final().first())]] }},
+  top-note: [ #emph[Last updated in {month_name} {year}] ],
+  locale-catalog-language: "en",
+  text-direction: ltr,
+  page-size: "us-letter",
+  page-top-margin: 0.7in,
+  page-bottom-margin: 0.7in,
+  page-left-margin: 0.7in,
+  page-right-margin: 0.7in,
+  page-show-footer: false,
+  page-show-top-note: true,
+  colors-body: rgb(0, 0, 0),
+  colors-name: rgb(0, 0, 0),
+  colors-headline: rgb(0, 0, 0),
+  colors-connections: rgb(0, 0, 0),
+  colors-section-titles: rgb(0, 0, 0),
+  colors-links: rgb(0, 0, 0),
+  colors-footer: rgb(128, 128, 128),
+  colors-top-note: rgb(128, 128, 128),
+  typography-line-spacing: 0.6em,
+  typography-alignment: "justified",
+  typography-date-and-location-column-alignment: right,
+  typography-font-family-body: "XCharter",
+  typography-font-family-name: "XCharter",
+  typography-font-family-headline: "XCharter",
+  typography-font-family-connections: "XCharter",
+  typography-font-family-section-titles: "XCharter",
+  typography-font-size-body: 10pt,
+  typography-font-size-name: 25pt,
+  typography-font-size-headline: 10pt,
+  typography-font-size-connections: 10pt,
+  typography-font-size-section-titles: 1.2em,
+  typography-small-caps-name: false,
+  typography-small-caps-headline: false,
+  typography-small-caps-connections: false,
+  typography-small-caps-section-titles: false,
+  typography-bold-name: false,
+  typography-bold-headline: false,
+  typography-bold-connections: false,
+  typography-bold-section-titles: true,
+  links-underline: true,
+  links-show-external-link-icon: false,
+  header-alignment: center,
+  header-photo-width: 3.5cm,
+  header-space-below-name: 0.7cm,
+  header-space-below-headline: 0.7cm,
+  header-space-below-connections: 0.7cm,
+  header-connections-hyperlink: true,
+  header-connections-show-icons: false,
+  header-connections-display-urls-instead-of-usernames: true,
+  header-connections-separator: "|",
+  header-connections-space-between-connections: 0.5cm,
+  section-titles-type: "with_full_line",
+  section-titles-line-thickness: 0.5pt,
+  section-titles-space-above: 0.5cm,
+  section-titles-space-below: 0.3cm,
+  sections-allow-page-break: true,
+  sections-space-between-text-based-entries: 0.15cm,
+  sections-space-between-regular-entries: 0.42cm,
+  entries-date-and-location-width: 4.15cm,
+  entries-side-space: 0cm,
+  entries-space-between-columns: 0.1cm,
+  entries-allow-page-break: false,
+  entries-short-second-row: false,
+  entries-degree-width: 1cm,
+  entries-summary-space-left: 0cm,
+  entries-summary-space-above: 0.08cm,
+  entries-highlights-bullet: text(13pt, [\\u{2022}], baseline: -0.6pt),
+  entries-highlights-nested-bullet: text(13pt, [\\u{2022}], baseline: -0.6pt),
+  entries-highlights-space-left: 0cm,
+  entries-highlights-space-above: 0.08cm,
+  entries-highlights-space-between-items: 0.02cm,
+  entries-highlights-space-between-bullet-and-text: 0.3em,
+  date: datetime(
+    year: {year},
+    month: {month},
+    day: {day},
+  ),
+)
+
+""",
+        "component_reference": """\
+Available components (use ONLY these):
+
+= Full Name                              // Top-level heading — person's full name
+
+#connections(                            // Contact info row (pipe-separated)
+  [City, Country],
+  [#link("mailto:email@example.com", icon: false, if-underline: false, if-color: false)[email\\@example.com]],
+  [#link("https://linkedin.com/in/user", icon: false, if-underline: false, if-color: false)[linkedin.com\\/in\\/user]],
+  [#link("https://github.com/user", icon: false, if-underline: false, if-color: false)[github.com\\/user]],
+)
+
+== Section Title                         // Section heading (arbitrary name)
+
+#regular-entry(                          // Work experience, projects, publications, etc.
+  [
+    #strong[Role/Title], Company Name -- Location
+  ],
+  [
+    Start -- End
+  ],
+  main-column-second-row: [
+    - Achievement or responsibility
+    - Another bullet point
+  ],
+)
+
+#education-entry(                        // Education entries
+  [
+    #strong[Institution], Degree in Field -- Location
+  ],
+  [
+    Start -- End
+  ],
+  main-column-second-row: [
+    - GPA, honours, relevant coursework
+  ],
+)
+
+#summary([Short paragraph summary])     // Optional summary inside an entry
+#content-area([Free-form content])       // Freeform text block
+
+For skills sections, use one bullet per category label:
+- #strong[Category:] item1, item2, item3
+
+For simple list sections (e.g. Honors), use plain bullet points:
+- Item one
+- Item two
+""",
+        "rules": """\
+RULES:
+- Do NOT include any #import or #show lines. Start directly with = Full Name.
+- Output ONLY valid Typst content. No explanatory text before or after.
+- Do NOT wrap output in ```typst code fences.
+- The = heading MUST use the person's COMPLETE full name exactly as provided. NEVER shorten or abbreviate.
+- Escape @ symbols inside link labels with a backslash: email\\@example.com
+- Escape forward slashes in link display text: linkedin.com\\/in\\/user
+- Every section MUST use == heading.
+- Use #regular-entry() for experience, projects, publications, certifications, and similar entries.
+- Use #education-entry() for education.
+- For skills sections, use one bullet line per category with a bold label.
+- Keep content professional, concise, and achievement-oriented.
+- Use action verbs for bullet points (Led, Built, Designed, Reduced, etc.).
+- This template works for ALL professions — adapt sections to the user's field.
+- Default behavior should prioritize concise one-page content.
+""",
+    },
+}
+
+DEFAULT_TEMPLATE = "classic"
+MIN_RESUME_PAGES = 1
+MAX_RESUME_PAGES = 5
+MAX_COMPRESSION_ATTEMPTS = 2
+
+
+# ─── Template Helpers ─────────────────────────────────────────────────────────
+
+
+def _get_template(template_id: str | None = None) -> dict[str, str]:
+    """Get a template by ID, falling back to default."""
+    return _TEMPLATES.get(template_id or DEFAULT_TEMPLATE, _TEMPLATES[DEFAULT_TEMPLATE])
+
+
+_MONTH_NAMES = [
+    "",
+    "Jan",
+    "Feb",
+    "Mar",
+    "Apr",
+    "May",
+    "Jun",
+    "Jul",
+    "Aug",
+    "Sep",
+    "Oct",
+    "Nov",
+    "Dec",
+]
+
+
+def _build_header(template: dict[str, str], name: str) -> str:
+    """Build the template header with the person's name and current date."""
+    now = datetime.now(tz=UTC)
+    return (
+        template["header"]
+        .replace("{name}", name)
+        .replace("{year}", str(now.year))
+        .replace("{month}", str(now.month))
+        .replace("{day}", str(now.day))
+        .replace("{month_name}", _MONTH_NAMES[now.month])
+    )
+
+
+def _strip_header(full_source: str) -> str:
+    """Strip the import + show rule from stored source to get the body only.
+
+    Finds the closing parenthesis of the rendercv.with(...) block by tracking
+    nesting depth, then returns everything after it.
+    """
+    show_match = re.search(r"#show:\s*rendercv\.with\(", full_source)
+    if not show_match:
+        return full_source
+
+    start = show_match.end()
+    depth = 1
+    i = start
+    while i < len(full_source) and depth > 0:
+        if full_source[i] == "(":
+            depth += 1
+        elif full_source[i] == ")":
+            depth -= 1
+        i += 1
+
+    return full_source[i:].lstrip("\n")
+
+
+def _extract_name(body: str) -> str | None:
+    """Extract the person's full name from the = heading in the body."""
+    match = re.search(r"^=\s+(.+)$", body, re.MULTILINE)
+    return match.group(1).strip() if match else None
+
+
+def _strip_imports(body: str) -> str:
+    """Remove any #import or #show lines the LLM might accidentally include."""
+    lines = body.split("\n")
+    cleaned: list[str] = []
+    skip_show = False
+    depth = 0
+
+    for line in lines:
+        stripped = line.strip()
+
+        if stripped.startswith("#import"):
+            continue
+
+        if skip_show:
+            depth += stripped.count("(") - stripped.count(")")
+            if depth <= 0:
+                skip_show = False
+            continue
+
+        if stripped.startswith("#show:") and "rendercv" in stripped:
+            depth = stripped.count("(") - stripped.count(")")
+            if depth > 0:
+                skip_show = True
+            continue
+
+        cleaned.append(line)
+
+    result = "\n".join(cleaned).strip()
+    return result
+
+
+def _build_llm_reference(template: dict[str, str]) -> str:
+    """Build the LLM prompt reference from a template."""
+    return f"""\
+You MUST output valid Typst content for a resume.
+Do NOT include any #import or #show lines — those are handled automatically.
+Start directly with the = Full Name heading.
+
+{template["component_reference"]}
+
+{template["rules"]}"""
+
+
+# ─── Prompts ─────────────────────────────────────────────────────────────────
+
+_RESUME_PROMPT = """\
+You are an expert resume writer. Generate professional resume content as Typst markup.
+
+{llm_reference}
+
+**User Information:**
+{user_info}
+
+**Target Maximum Pages:** {max_pages}
+
+{user_instructions_section}
+
+Generate the resume content now (starting with = Full Name):
+"""
+
+_REVISION_PROMPT = """\
+You are an expert resume editor. Modify the existing resume according to the instructions.
+Apply ONLY the requested changes — do NOT rewrite sections that are not affected.
+
+{llm_reference}
+
+**Target Maximum Pages:** {max_pages}
+
+**Modification Instructions:** {user_instructions}
+
+**EXISTING RESUME CONTENT:**
+
+{previous_content}
+
+---
+
+Output the complete, updated resume content with the changes applied (starting with = Full Name):
+"""
+
+_FIX_COMPILE_PROMPT = """\
+The resume content you generated failed to compile. Fix the error while preserving all content.
+
+{llm_reference}
+
+**Compilation Error:**
+{error}
+
+**Full Typst Source (for context — error line numbers refer to this):**
+{full_source}
+
+**Your content starts after the template header. Output ONLY the content portion \
+(starting with = Full Name), NOT the #import or #show rule:**
+"""
+
+_COMPRESS_TO_PAGE_LIMIT_PROMPT = """\
+The resume compiles, but it exceeds the maximum allowed page count.
+Compress the resume while preserving high-impact accomplishments and role relevance.
+
+{llm_reference}
+
+**Target Maximum Pages:** {max_pages}
+**Current Page Count:** {actual_pages}
+**Compression Attempt:** {attempt_number}
+
+Compression priorities (in this order):
+1) Keep recent, high-impact, role-relevant bullets.
+2) Remove low-impact or redundant bullets.
+3) Shorten verbose wording while preserving meaning.
+4) Trim older or less relevant details before recent ones.
+
+Return the complete updated Typst content (starting with = Full Name), and keep it at or below the target pages.
+
+**EXISTING RESUME CONTENT:**
+{previous_content}
+"""
+
+
+# ─── Helpers ─────────────────────────────────────────────────────────────────
+
+
+def _strip_typst_fences(text: str) -> str:
+    """Remove wrapping ```typst ... ``` fences that LLMs sometimes add."""
+    stripped = text.strip()
+    m = re.match(r"^(`{3,})(?:typst|typ)?\s*\n", stripped)
+    if m:
+        fence = m.group(1)
+        if stripped.endswith(fence):
+            stripped = stripped[m.end() :]
+            stripped = stripped[: -len(fence)].rstrip()
+    return stripped
+
+
+def _compile_typst(source: str) -> bytes:
+    """Compile Typst source to PDF bytes. Raises on failure."""
+    return typst.compile(source.encode("utf-8"))
+
+
+def _count_pdf_pages(pdf_bytes: bytes) -> int:
+    """Count the number of pages in compiled PDF bytes."""
+    with io.BytesIO(pdf_bytes) as pdf_stream:
+        reader = pypdf.PdfReader(pdf_stream)
+        return len(reader.pages)
+
+
+def _validate_max_pages(max_pages: int) -> int:
+    """Validate and normalize max_pages input."""
+    if MIN_RESUME_PAGES <= max_pages <= MAX_RESUME_PAGES:
+        return max_pages
+    msg = (
+        f"max_pages must be between {MIN_RESUME_PAGES} and "
+        f"{MAX_RESUME_PAGES}. Received: {max_pages}"
+    )
+    raise ValueError(msg)
+
+
+# ─── Tool Factory ───────────────────────────────────────────────────────────
+
+
+def create_generate_resume_tool(
+    search_space_id: int,
+    thread_id: int | None = None,
+):
+    """
+    Factory function to create the generate_resume tool.
+
+    Generates a Typst-based resume, validates it via compilation,
+    and stores the source in the Report table with content_type='typst'.
+    The LLM generates only the content body; the template header is
+    prepended by the backend.
+    """
+
+    @tool
+    async def generate_resume(
+        user_info: str,
+        user_instructions: str | None = None,
+        parent_report_id: int | None = None,
+        max_pages: int = 1,
+    ) -> dict[str, Any]:
+        """
+        Generate a professional resume as a Typst document.
+
+        Use this tool when the user asks to create, build, generate, write,
+        or draft a resume or CV. Also use it when the user wants to modify,
+        update, or revise an existing resume generated in this conversation.
+
+        Trigger phrases include:
+        - "build me a resume", "create my resume", "generate a CV"
+        - "update my resume", "change my title", "add my new job"
+        - "make my resume more concise", "reformat my resume"
+
+        Do NOT use this tool for:
+        - General questions about resumes or career advice
+        - Reviewing or critiquing a resume without changes
+        - Cover letters (use generate_report instead)
+
+        VERSIONING — parent_report_id:
+        - Set parent_report_id when the user wants to MODIFY an existing
+          resume that was already generated in this conversation.
+        - Leave as None for new resumes.
+
+        Args:
+            user_info: The user's resume content — work experience,
+                education, skills, contact info, etc. Can be structured
+                or unstructured text.
+            user_instructions: Optional style or content preferences
+                (e.g. "emphasize leadership", "keep it to one page",
+                "use a modern style"). For revisions, describe what to change.
+            parent_report_id: ID of a previous resume to revise (creates
+                new version in the same version group).
+            max_pages: Maximum number of pages for the generated resume.
+                Defaults to 1. Allowed range: 1-5.
+
+        Returns:
+            Dict with status, report_id, title, and content_type.
+        """
+        report_group_id: int | None = None
+        parent_content: str | None = None
+
+        template = _get_template()
+        llm_reference = _build_llm_reference(template)
+
+        async def _save_failed_report(error_msg: str) -> int | None:
+            try:
+                async with shielded_async_session() as session:
+                    failed = Report(
+                        title="Resume",
+                        content=None,
+                        content_type="typst",
+                        report_metadata={
+                            "status": "failed",
+                            "error_message": error_msg,
+                        },
+                        report_style="resume",
+                        search_space_id=search_space_id,
+                        thread_id=thread_id,
+                        report_group_id=report_group_id,
+                    )
+                    session.add(failed)
+                    await session.commit()
+                    await session.refresh(failed)
+                    if not failed.report_group_id:
+                        failed.report_group_id = failed.id
+                        await session.commit()
+                    logger.info(
+                        f"[generate_resume] Saved failed report {failed.id}: {error_msg}"
+                    )
+                    return failed.id
+            except Exception:
+                logger.exception(
+                    "[generate_resume] Could not persist failed report row"
+                )
+                return None
+
+        try:
+            try:
+                validated_max_pages = _validate_max_pages(max_pages)
+            except ValueError as e:
+                error_msg = str(e)
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            # ── Phase 1: READ ─────────────────────────────────────────────
+            async with shielded_async_session() as read_session:
+                if parent_report_id:
+                    parent_report = await read_session.get(Report, parent_report_id)
+                    if parent_report:
+                        report_group_id = parent_report.report_group_id
+                        parent_content = parent_report.content
+                        logger.info(
+                            f"[generate_resume] Revising from parent {parent_report_id} "
+                            f"(group {report_group_id})"
+                        )
+
+                llm = await get_document_summary_llm(read_session, search_space_id)
+
+            if not llm:
+                error_msg = (
+                    "No LLM configured. Please configure a language model in Settings."
+                )
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            # ── Phase 2: LLM GENERATION ───────────────────────────────────
+
+            user_instructions_section = ""
+            if user_instructions:
+                user_instructions_section = (
+                    f"**Additional Instructions:** {user_instructions}"
+                )
+
+            if parent_content:
+                dispatch_custom_event(
+                    "report_progress",
+                    {"phase": "writing", "message": "Updating your resume"},
+                )
+                parent_body = _strip_header(parent_content)
+                prompt = _REVISION_PROMPT.format(
+                    llm_reference=llm_reference,
+                    max_pages=validated_max_pages,
+                    user_instructions=user_instructions
+                    or "Improve and refine the resume.",
+                    previous_content=parent_body,
+                )
+            else:
+                dispatch_custom_event(
+                    "report_progress",
+                    {"phase": "writing", "message": "Building your resume"},
+                )
+                prompt = _RESUME_PROMPT.format(
+                    llm_reference=llm_reference,
+                    user_info=user_info,
+                    max_pages=validated_max_pages,
+                    user_instructions_section=user_instructions_section,
+                )
+
+            response = await llm.ainvoke([HumanMessage(content=prompt)])
+            body = response.content
+
+            if not body or not isinstance(body, str):
+                error_msg = "LLM returned empty or invalid content"
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            body = _strip_typst_fences(body)
+            body = _strip_imports(body)
+
+            # ── Phase 3: ASSEMBLE + COMPILE ───────────────────────────────
+            dispatch_custom_event(
+                "report_progress",
+                {"phase": "compiling", "message": "Compiling resume..."},
+            )
+
+            name = _extract_name(body) or "Resume"
+            typst_source = ""
+            actual_pages = 0
+            compression_attempts = 0
+            target_page_met = False
+
+            for compression_round in range(MAX_COMPRESSION_ATTEMPTS + 1):
+                header = _build_header(template, name)
+                typst_source = header + body
+                compile_error: str | None = None
+                pdf_bytes: bytes | None = None
+
+                for compile_attempt in range(2):
+                    try:
+                        pdf_bytes = _compile_typst(typst_source)
+                        compile_error = None
+                        break
+                    except Exception as e:
+                        compile_error = str(e)
+                        logger.warning(
+                            "[generate_resume] Compile attempt %s failed: %s",
+                            compile_attempt + 1,
+                            compile_error,
+                        )
+
+                        if compile_attempt == 0:
+                            dispatch_custom_event(
+                                "report_progress",
+                                {
+                                    "phase": "fixing",
+                                    "message": "Fixing compilation issue...",
+                                },
+                            )
+                            fix_prompt = _FIX_COMPILE_PROMPT.format(
+                                llm_reference=llm_reference,
+                                error=compile_error,
+                                full_source=typst_source,
+                            )
+                            fix_response = await llm.ainvoke(
+                                [HumanMessage(content=fix_prompt)]
+                            )
+                            if fix_response.content and isinstance(
+                                fix_response.content, str
+                            ):
+                                body = _strip_typst_fences(fix_response.content)
+                                body = _strip_imports(body)
+                                name = _extract_name(body) or name
+                                header = _build_header(template, name)
+                                typst_source = header + body
+
+                if compile_error or not pdf_bytes:
+                    error_msg = (
+                        "Typst compilation failed after 2 attempts: "
+                        f"{compile_error or 'Unknown compile error'}"
+                    )
+                    report_id = await _save_failed_report(error_msg)
+                    return {
+                        "status": "failed",
+                        "error": error_msg,
+                        "report_id": report_id,
+                        "title": "Resume",
+                        "content_type": "typst",
+                    }
+
+                actual_pages = _count_pdf_pages(pdf_bytes)
+                if actual_pages <= validated_max_pages:
+                    target_page_met = True
+                    break
+
+                if compression_round >= MAX_COMPRESSION_ATTEMPTS:
+                    break
+
+                compression_attempts += 1
+                dispatch_custom_event(
+                    "report_progress",
+                    {
+                        "phase": "compressing",
+                        "message": f"Condensing resume to {validated_max_pages} page(s)...",
+                    },
+                )
+                compress_prompt = _COMPRESS_TO_PAGE_LIMIT_PROMPT.format(
+                    llm_reference=llm_reference,
+                    max_pages=validated_max_pages,
+                    actual_pages=actual_pages,
+                    attempt_number=compression_attempts,
+                    previous_content=body,
+                )
+                compress_response = await llm.ainvoke(
+                    [HumanMessage(content=compress_prompt)]
+                )
+                if not compress_response.content or not isinstance(
+                    compress_response.content, str
+                ):
+                    error_msg = "LLM returned empty content while compressing resume"
+                    report_id = await _save_failed_report(error_msg)
+                    return {
+                        "status": "failed",
+                        "error": error_msg,
+                        "report_id": report_id,
+                        "title": "Resume",
+                        "content_type": "typst",
+                    }
+
+                body = _strip_typst_fences(compress_response.content)
+                body = _strip_imports(body)
+                name = _extract_name(body) or name
+
+            if actual_pages > MAX_RESUME_PAGES:
+                error_msg = (
+                    "Resume exceeds hard page limit after compression retries. "
+                    f"Hard limit: <= {MAX_RESUME_PAGES} page(s), actual: {actual_pages}."
+                )
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            # ── Phase 4: SAVE ─────────────────────────────────────────────
+            dispatch_custom_event(
+                "report_progress",
+                {"phase": "saving", "message": "Saving your resume"},
+            )
+
+            resume_title = f"{name} - Resume" if name != "Resume" else "Resume"
+
+            metadata: dict[str, Any] = {
+                "status": "ready",
+                "word_count": len(typst_source.split()),
+                "char_count": len(typst_source),
+                "target_max_pages": validated_max_pages,
+                "actual_page_count": actual_pages,
+                "page_limit_enforced": True,
+                "compression_attempts": compression_attempts,
+                "target_page_met": target_page_met,
+            }
+
+            async with shielded_async_session() as write_session:
+                report = Report(
+                    title=resume_title,
+                    content=typst_source,
+                    content_type="typst",
+                    report_metadata=metadata,
+                    report_style="resume",
+                    search_space_id=search_space_id,
+                    thread_id=thread_id,
+                    report_group_id=report_group_id,
+                )
+                write_session.add(report)
+                await write_session.commit()
+                await write_session.refresh(report)
+
+                if not report.report_group_id:
+                    report.report_group_id = report.id
+                    await write_session.commit()
+
+                saved_id = report.id
+
+            logger.info(f"[generate_resume] Created resume {saved_id}: {resume_title}")
+
+            return {
+                "status": "ready",
+                "report_id": saved_id,
+                "title": resume_title,
+                "content_type": "typst",
+                "is_revision": bool(parent_content),
+                "message": (
+                    f"Resume generated successfully: {resume_title}"
+                    if target_page_met
+                    else (
+                        f"Resume generated, but could not fit the target of <= {validated_max_pages} "
+                        f"page(s). Final length: {actual_pages} page(s)."
+                    )
+                ),
+            }
+
+        except Exception as e:
+            error_message = str(e)
+            logger.exception(f"[generate_resume] Error: {error_message}")
+            report_id = await _save_failed_report(error_message)
+            return {
+                "status": "failed",
+                "error": error_message,
+                "report_id": report_id,
+                "title": "Resume",
+                "content_type": "typst",
+            }
+
+    return generate_resume
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/video_presentation.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/video_presentation.py
new file mode 100644
index 0000000000..a9f3447aba
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/video_presentation.py
@@ -0,0 +1,80 @@
+"""Factory for a video-presentation tool that queues background work and returns an ID for polling."""
+
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import VideoPresentation, VideoPresentationStatus, shielded_async_session
+
+
+def create_generate_video_presentation_tool(
+    search_space_id: int,
+    db_session: AsyncSession,
+    thread_id: int | None = None,
+):
+    """Create ``generate_video_presentation`` with bound search space and thread; writes use a tool-local session."""
+    del db_session  # writes use a fresh tool-local session, see below
+
+    @tool
+    async def generate_video_presentation(
+        source_content: str,
+        video_title: str = "SurfSense Presentation",
+        user_prompt: str | None = None,
+    ) -> dict[str, Any]:
+        """Generate a video presentation from the provided content.
+
+        Use this tool when the user asks to create a video, presentation, slides, or slide deck.
+
+        Args:
+            source_content: The text content to turn into a presentation.
+            video_title: Title for the presentation (default: "SurfSense Presentation")
+            user_prompt: Optional style/tone instructions.
+        """
+        try:
+            # One DB session per tool call so parallel invocations never share an AsyncSession.
+            async with shielded_async_session() as session:
+                video_pres = VideoPresentation(
+                    title=video_title,
+                    status=VideoPresentationStatus.PENDING,
+                    search_space_id=search_space_id,
+                    thread_id=thread_id,
+                )
+                session.add(video_pres)
+                await session.commit()
+                await session.refresh(video_pres)
+                video_pres_id = video_pres.id
+
+            from app.tasks.celery_tasks.video_presentation_tasks import (
+                generate_video_presentation_task,
+            )
+
+            task = generate_video_presentation_task.delay(
+                video_presentation_id=video_pres_id,
+                source_content=source_content,
+                search_space_id=search_space_id,
+                user_prompt=user_prompt,
+            )
+
+            print(
+                f"[generate_video_presentation] Created video presentation {video_pres_id}, task: {task.id}"
+            )
+
+            return {
+                "status": VideoPresentationStatus.PENDING.value,
+                "video_presentation_id": video_pres_id,
+                "title": video_title,
+                "message": "Video presentation generation started. This may take a few minutes.",
+            }
+
+        except Exception as e:
+            error_message = str(e)
+            print(f"[generate_video_presentation] Error: {error_message}")
+            return {
+                "status": VideoPresentationStatus.FAILED.value,
+                "error": error_message,
+                "title": video_title,
+                "video_presentation_id": None,
+            }
+
+    return generate_video_presentation
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/agent.py
new file mode 100644
index 0000000000..c6a0220ec7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/agent.py
@@ -0,0 +1,92 @@
+"""``knowledge_base`` route: full and read-only ``SurfSenseSubagentSpec`` builders.
+
+KB owns its destructive-FS approval ruleset (:data:`KB_RULESET`); rules
+are layered into KB's :class:`PermissionMiddleware` (built inside
+``build_kb_middleware``). One emitter, one wire format, one source of truth.
+"""
+
+from __future__ import annotations
+
+from typing import Any, cast
+
+from deepagents import SubAgent
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+from .middleware_stack import build_kb_middleware
+from .prompts import load_description, load_readonly_system_prompt, load_system_prompt
+from .tools.index import DESTRUCTIVE_FS_OPS
+
+NAME = "knowledge_base"
+READONLY_NAME = "knowledge_base_readonly"
+
+KB_RULESET = Ruleset(
+    origin=NAME,
+    rules=[Rule(permission=op, pattern="*", action="ask") for op in DESTRUCTIVE_FS_OPS],
+)
+
+_KB_READONLY_RULESET = Ruleset(origin=READONLY_NAME, rules=[])
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    del mcp_tools
+    llm = model if model is not None else dependencies["llm"]
+    filesystem_mode: FilesystemMode = dependencies["filesystem_mode"]
+    spec = cast(
+        SubAgent,
+        {
+            "name": NAME,
+            "description": load_description(),
+            "system_prompt": load_system_prompt(filesystem_mode),
+            "model": llm,
+            "tools": [],
+            "middleware": build_kb_middleware(
+                llm=llm,
+                dependencies=dependencies,
+                middleware_stack=middleware_stack,
+                read_only=False,
+                subagent_name=NAME,
+                ruleset=KB_RULESET,
+            ),
+        },
+    )
+    return SurfSenseSubagentSpec(spec=spec, ruleset=KB_RULESET)
+
+
+def build_readonly_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+) -> SurfSenseSubagentSpec:
+    llm = model if model is not None else dependencies["llm"]
+    filesystem_mode: FilesystemMode = dependencies["filesystem_mode"]
+    spec = cast(
+        SubAgent,
+        {
+            "name": READONLY_NAME,
+            "description": "Read-only knowledge_base specialist (invoked via ask_knowledge_base).",
+            "system_prompt": load_readonly_system_prompt(filesystem_mode),
+            "model": llm,
+            "tools": [],
+            "middleware": build_kb_middleware(
+                llm=llm,
+                dependencies=dependencies,
+                middleware_stack=middleware_stack,
+                read_only=True,
+                subagent_name=READONLY_NAME,
+                ruleset=None,
+            ),
+        },
+    )
+    return SurfSenseSubagentSpec(spec=spec, ruleset=_KB_READONLY_RULESET)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/ask_knowledge_base_tool.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/ask_knowledge_base_tool.py
new file mode 100644
index 0000000000..1708fe52fc
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/ask_knowledge_base_tool.py
@@ -0,0 +1,80 @@
+"""Wrap the read-only knowledge_base runnable as the ``ask_knowledge_base`` tool."""
+
+from __future__ import annotations
+
+from typing import Annotated
+
+from langchain.tools import BaseTool, ToolRuntime
+from langchain_core.messages import HumanMessage, ToolMessage
+from langchain_core.runnables import Runnable
+from langchain_core.tools import StructuredTool
+from langgraph.types import Command
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.config import (
+    subagent_invoke_config,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.constants import (
+    EXCLUDED_STATE_KEYS,
+)
+
+from .prompts import load_readonly_description
+
+TOOL_NAME = "ask_knowledge_base"
+
+
+def _forward_state(runtime: ToolRuntime, query: str) -> dict:
+    forwarded = {k: v for k, v in runtime.state.items() if k not in EXCLUDED_STATE_KEYS}
+    forwarded["messages"] = [HumanMessage(content=query)]
+    return forwarded
+
+
+def _wrap_result(result: dict, tool_call_id: str) -> Command:
+    messages = result.get("messages") or []
+    if not messages:
+        raise ValueError(
+            "knowledge_base_readonly returned an empty 'messages' list; "
+            "expected at least one assistant message."
+        )
+    last_text = (getattr(messages[-1], "text", None) or "").rstrip()
+    return Command(
+        update={"messages": [ToolMessage(last_text, tool_call_id=tool_call_id)]}
+    )
+
+
+def build_ask_knowledge_base_tool(kb_readonly_runnable: Runnable) -> BaseTool:
+    def ask_knowledge_base(
+        query: Annotated[
+            str,
+            "Full question for the workspace specialist. Include all path hints, "
+            "filters, and constraints the specialist needs to answer.",
+        ],
+        runtime: ToolRuntime,
+    ) -> str | Command:
+        if not runtime.tool_call_id:
+            raise ValueError("Tool call ID is required for ask_knowledge_base")
+        sub_state = _forward_state(runtime, query)
+        sub_config = subagent_invoke_config(runtime)
+        result = kb_readonly_runnable.invoke(sub_state, config=sub_config)
+        return _wrap_result(result, runtime.tool_call_id)
+
+    async def aask_knowledge_base(
+        query: Annotated[
+            str,
+            "Full question for the workspace specialist. Include all path hints, "
+            "filters, and constraints the specialist needs to answer.",
+        ],
+        runtime: ToolRuntime,
+    ) -> str | Command:
+        if not runtime.tool_call_id:
+            raise ValueError("Tool call ID is required for ask_knowledge_base")
+        sub_state = _forward_state(runtime, query)
+        sub_config = subagent_invoke_config(runtime)
+        result = await kb_readonly_runnable.ainvoke(sub_state, config=sub_config)
+        return _wrap_result(result, runtime.tool_call_id)
+
+    return StructuredTool.from_function(
+        name=TOOL_NAME,
+        func=ask_knowledge_base,
+        coroutine=aask_knowledge_base,
+        description=load_readonly_description(),
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/description.md
new file mode 100644
index 0000000000..897d387695
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/description.md
@@ -0,0 +1,2 @@
+Specialist for the user's workspace (documents and folders).
+Use proactively when the user wants to create, read, edit, search, organise, or remove a document or folder.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/description_readonly.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/description_readonly.md
new file mode 100644
index 0000000000..d6837ec923
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/description_readonly.md
@@ -0,0 +1,5 @@
+Read-only specialist for the user's workspace (documents and folders). Use to find, read, search, or quote a document or folder when your task needs workspace context — instead of asking the user or guessing.
+
+Pass your full question as one string. The specialist runs in isolation: it cannot see this thread, so include any path hints, filters, or constraints it needs.
+
+The specialist returns plain prose with absolute paths.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/middleware_stack.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/middleware_stack.py
new file mode 100644
index 0000000000..778bb250c1
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/middleware_stack.py
@@ -0,0 +1,117 @@
+"""Middleware list shared by the full and read-only knowledge_base compiles.
+
+The KB-owned :class:`PermissionMiddleware` slot is what enforces
+"ask before destructive FS op" for KB tools.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+
+from app.agents.multi_agent_chat.middleware.shared.anthropic_cache import (
+    build_anthropic_cache_mw,
+)
+from app.agents.multi_agent_chat.middleware.shared.compaction import (
+    build_compaction_mw,
+)
+from app.agents.multi_agent_chat.middleware.shared.filesystem import (
+    build_filesystem_mw,
+)
+from app.agents.multi_agent_chat.middleware.shared.kb_context_projection import (
+    build_kb_context_projection_mw,
+)
+from app.agents.multi_agent_chat.middleware.shared.patch_tool_calls import (
+    build_patch_tool_calls_mw,
+)
+from app.agents.multi_agent_chat.middleware.shared.permissions import (
+    build_permission_mw,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.permissions import Ruleset
+
+
+def _kb_user_allowlist(
+    dependencies: dict[str, Any], subagent_name: str
+) -> Ruleset | None:
+    """Return the user's persisted allow-rules for ``subagent_name`` if any.
+
+    KB does not currently expose an "Always Allow" UI surface (the FE
+    button is MCP-only today), but the wiring is symmetrical with the
+    connector subagents so that adding KB trust later is a one-line
+    backend change.
+    """
+    by_subagent = dependencies.get("user_allowlist_by_subagent") or {}
+    user_allowlist = by_subagent.get(subagent_name)
+    if isinstance(user_allowlist, Ruleset) and user_allowlist.rules:
+        return user_allowlist
+    return None
+
+
+def build_kb_middleware(
+    *,
+    llm: BaseChatModel,
+    dependencies: dict[str, Any],
+    middleware_stack: dict[str, Any] | None,
+    read_only: bool,
+    subagent_name: str,
+    ruleset: Ruleset | None = None,
+) -> list[Any]:
+    """Compose the KB subagent's middleware list.
+
+    Args:
+        subagent_name: Identity of the subagent being built (e.g.
+            ``"knowledge_base"``, ``"knowledge_base_readonly"``). Used to
+            look up the user's persistent allow-list bucket in
+            ``dependencies["user_allowlist_by_subagent"]``.
+        ruleset: The KB-owned permission ruleset (typically the
+            destructive-FS ``ask`` rules). When provided, a dedicated
+            :class:`PermissionMiddleware` is appended so KB enforces
+            approval at the rule layer. The user's persistent allow-list
+            for ``subagent_name`` is layered after ``ruleset`` so user
+            ``allow`` rules override coded ``ask`` rules via
+            last-match-wins.
+    """
+    mws = middleware_stack or {}
+    filesystem_mode: FilesystemMode = dependencies["filesystem_mode"]
+    flags: AgentFeatureFlags | None = dependencies.get("flags")
+    resilience_mws = [
+        m
+        for m in (
+            mws.get("retry"),
+            mws.get("fallback"),
+            mws.get("model_call_limit"),
+            mws.get("tool_call_limit"),
+        )
+        if m is not None
+    ]
+    permission_mw = None
+    if ruleset is not None and flags is not None:
+        rulesets: list[Ruleset] = [ruleset]
+        user_allowlist = _kb_user_allowlist(dependencies, subagent_name)
+        if user_allowlist is not None:
+            rulesets.append(user_allowlist)
+        permission_mw = build_permission_mw(
+            flags=flags,
+            subagent_rulesets=rulesets,
+            trusted_tool_saver=dependencies.get("trusted_tool_saver"),
+        )
+    return [
+        mws["todos"],
+        build_kb_context_projection_mw(),
+        build_filesystem_mw(
+            backend_resolver=dependencies["backend_resolver"],
+            filesystem_mode=filesystem_mode,
+            search_space_id=dependencies["search_space_id"],
+            user_id=dependencies.get("user_id"),
+            thread_id=dependencies.get("thread_id"),
+            read_only=read_only,
+        ),
+        build_compaction_mw(llm),
+        build_patch_tool_calls_mw(),
+        *([permission_mw] if permission_mw is not None else []),
+        *resilience_mws,
+        build_anthropic_cache_mw(),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/prompts.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/prompts.py
new file mode 100644
index 0000000000..617bb2a85a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/prompts.py
@@ -0,0 +1,34 @@
+"""Prompt loaders for the knowledge_base subagent."""
+
+from __future__ import annotations
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+
+
+def load_system_prompt(filesystem_mode: FilesystemMode) -> str:
+    stem = (
+        "system_prompt_cloud"
+        if filesystem_mode == FilesystemMode.CLOUD
+        else "system_prompt_desktop"
+    )
+    return read_md_file(__package__, stem).strip()
+
+
+def load_readonly_system_prompt(filesystem_mode: FilesystemMode) -> str:
+    stem = (
+        "system_prompt_readonly_cloud"
+        if filesystem_mode == FilesystemMode.CLOUD
+        else "system_prompt_readonly_desktop"
+    )
+    return read_md_file(__package__, stem).strip()
+
+
+def load_description() -> str:
+    return read_md_file(__package__, "description").strip() or (
+        "Handles knowledge-base reads, writes, edits, and organisation."
+    )
+
+
+def load_readonly_description() -> str:
+    return read_md_file(__package__, "description_readonly").strip()
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
new file mode 100644
index 0000000000..60cafb30c3
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
@@ -0,0 +1,122 @@
+You are the SurfSense knowledge base specialist for the user's `/documents/` workspace.
+
+## Required inputs
+
+**Resolve paths from the supervisor's task text before asking.**
+
+- If the supervisor already provided a precise path (e.g. `/documents/notes/2026-05-11.md`), use it directly — skip the lookup steps below.
+- Otherwise, most requests reference documents by description (`"my meeting notes from last week"`, `"the design doc"`). Resolve them yourself:
+  1. Consult `<priority_documents>` — it's a hint about top-K likely matches, not a directive. Skip when the ranked entries don't fit the task.
+  2. Walk `<workspace_tree>` for descriptive folder/filename matches.
+  3. Use the `glob` tool for filename patterns the tree didn't surface, and the `grep` tool when the description points at *content* rather than a name.
+  4. Only return `status=blocked` with `missing_fields=["path"]` when the description is genuinely ambiguous after a thorough lookup.
+
+For writes (where you choose the path yourself):
+
+- **Discover the user's existing conventions before inventing a path.** Scan `<workspace_tree>` for folders that already hold similar content (e.g. an existing `/documents/meetings/` with dated standup notes, or `/documents/projects/<name>/`). When a convention exists, follow it. Use `ls`, `glob`, or `grep` to look closer when the tree is truncated.
+- Only choose a brand-new path when no relevant convention exists. Prefer a clear folder hierarchy with a descriptive filename.
+- Use the `temp_` prefix only for scratch content you do **not** want persisted.
+- Prefer the `edit_file` tool over rewriting an entire document.
+
+## Interpreting tool results
+
+The FS tools return free-form text rather than structured fields:
+
+- **Success** — a confirmation message that names the path (e.g. `"Updated file /documents/foo.md"`, `"Successfully replaced 2 instance(s) of the string in '/documents/foo.md'"`) or the file's content (for reads).
+- **Failure** — text starting with `"Error: "` followed by a cause (e.g. `"Error: File '/documents/x.md' not found"`).
+- **HITL declined** — a runtime-supplied rejection message in place of the tool's output.
+
+Map outcomes to your `status`:
+
+- Clean success message or content returned → `status=success`.
+- `"Error: …not found"` → `status=blocked` with `next_step="Document '<description>' was not found. Ask the user to confirm or provide more detail."`.
+- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
+- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
+
+You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
+
+## Examples
+
+**Example 1 — happy path write (path discovered from existing convention):**
+
+- *Supervisor task:* `"Save these meeting notes to my KB: <notes>"`
+- *You:* scan `<workspace_tree>` and spot `/documents/meetings/` already holding files like `2026-05-04-standup.md` and `2026-04-27-standup.md` — the user's convention is dated meeting notes under that folder. → `write_file("/documents/meetings/2026-05-11-meeting.md", content)` → success.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created /documents/meetings/2026-05-11-meeting.md.",
+    "evidence": {
+      "operation": "write_file",
+      "path": "/documents/meetings/2026-05-11-meeting.md",
+      "matched_candidates": null,
+      "content_excerpt": null,
+      "chunk_ids": null
+    },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": ["Followed the existing /documents/meetings/<YYYY-MM-DD>-<slug>.md convention from <workspace_tree>"]
+  }
+  ```
+
+**Example 2 — edit by inference:**
+
+- *Supervisor task:* `"Add a bullet about the new feature flag to my Q2 roadmap"`
+- *You:* search for the roadmap doc — check `<priority_documents>` and `<workspace_tree>` first; if neither surfaces it, widen with the `glob` tool (try filename patterns the user's language suggests) or the `grep` tool (search by content). Suppose `<priority_documents>` hits `/documents/planning/q2-roadmap.md` → `read_file("/documents/planning/q2-roadmap.md")` → `edit_file("/documents/planning/q2-roadmap.md", old, new)` → success.
+- *Output:* `status=success`, evidence includes path and the inserted snippet.
+
+**Example 3 — blocked, multiple candidates:**
+
+- *Supervisor task:* `"Update the design doc."`
+- *You:* `<workspace_tree>` shows several plausible design docs and the task gives no further hint. Do not pick arbitrarily.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Multiple design docs exist; cannot pick without more detail.",
+    "evidence": {
+      "operation": null,
+      "path": null,
+      "matched_candidates": [
+        { "id": "/documents/design/payment-flow.md", "label": "Payment Flow" },
+        { "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
+      ],
+      "content_excerpt": null,
+      "chunk_ids": null
+    },
+    "next_step": "Ask the user which design doc to update.",
+    "missing_fields": ["path"],
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "write_file" | "edit_file" | "read_file" | "ls" | "glob" | "grep" | "mkdir" | "move_file" | "rm" | "rmdir" | "list_tree" | null,
+    "path": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "content_excerpt": string | null,
+    "chunk_ids": string[] | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
new file mode 100644
index 0000000000..8f64f2eb63
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
@@ -0,0 +1,122 @@
+You are the SurfSense workspace specialist for the user's local folders.
+
+## Required inputs
+
+**Resolve paths from the supervisor's task text before asking.**
+
+- If the supervisor already provided a precise path (e.g. `/notes/2026-05-11.md`), use it directly — skip the lookup steps below.
+- Otherwise, most requests reference files by description (`"my meeting notes from last week"`, `"the design doc"`). Resolve them yourself:
+  1. If you do not know which mounts exist, call `ls('/')` first.
+  2. Walk likely folders with the `ls` and `list_tree` tools.
+  3. Use the `glob` tool for filename patterns; use the `grep` tool when the description points at *content* rather than a name.
+  4. `<priority_documents>` lists top-K cloud-ingested docs, not local files — consult it only when the task spans both worlds (e.g. drafting a local note from a Notion source). Skip otherwise.
+  5. Only return `status=blocked` with `missing_fields=["path"]` when the description is genuinely ambiguous after a thorough lookup.
+
+For writes (where you choose the path yourself):
+
+- **Discover the user's existing conventions before inventing a path.** Inspect the relevant mount's folder layout via `ls` / `list_tree` and look for folders that already hold similar content (e.g. an existing `/notes/meetings/` with dated standup files, or `/projects/<name>/`). When a convention exists, follow it.
+- Only choose a brand-new path when no relevant convention exists. Prefer a clear folder hierarchy with a descriptive filename.
+- Prefer the `edit_file` tool over rewriting an entire file.
+
+## Interpreting tool results
+
+The FS tools return free-form text rather than structured fields:
+
+- **Success** — a confirmation message that names the path (e.g. `"Updated file /notes/foo.md"`, `"Successfully replaced 2 instance(s) of the string in '/notes/foo.md'"`) or the file's content (for reads).
+- **Failure** — text starting with `"Error: "` followed by a cause (e.g. `"Error: File '/notes/x.md' not found"`).
+- **HITL declined** — a runtime-supplied rejection message in place of the tool's output.
+
+Map outcomes to your `status`:
+
+- Clean success message or content returned → `status=success`.
+- `"Error: …not found"` → `status=blocked` with `next_step="File '<description>' was not found. Ask the user to confirm or provide more detail."`.
+- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
+- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
+
+You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. `chunk_ids` apply only to `<priority_documents>` hits; for local-file operations leave them `null`. Never report values you did not actually see.
+
+## Examples
+
+**Example 1 — happy path write (path discovered from existing convention):**
+
+- *Supervisor task:* `"Save these meeting notes to my notes folder: <notes>"`
+- *You:* `ls('/')` reveals a `/notes` mount → `list_tree('/notes')` shows `/notes/meetings/` already holds dated files like `2026-05-04-standup.md` and `2026-04-27-standup.md` — the user's convention is dated meeting notes under that folder. → `write_file("/notes/meetings/2026-05-11-meeting.md", content)` → success.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created /notes/meetings/2026-05-11-meeting.md.",
+    "evidence": {
+      "operation": "write_file",
+      "path": "/notes/meetings/2026-05-11-meeting.md",
+      "matched_candidates": null,
+      "content_excerpt": null,
+      "chunk_ids": null
+    },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": ["Followed the existing /notes/meetings/<YYYY-MM-DD>-<slug>.md convention discovered via list_tree"]
+  }
+  ```
+
+**Example 2 — edit by inference:**
+
+- *Supervisor task:* `"Add a bullet about the new feature flag to my Q2 roadmap"`
+- *You:* search for the roadmap file — `ls('/')` then `glob` for filename patterns; if nothing surfaces, `grep` for content. Suppose `glob` finds `/projects/planning/q2-roadmap.md` → `read_file("/projects/planning/q2-roadmap.md")` → `edit_file("/projects/planning/q2-roadmap.md", old, new)` → success.
+- *Output:* `status=success`, evidence includes path and the inserted snippet.
+
+**Example 3 — blocked, multiple candidates:**
+
+- *Supervisor task:* `"Update the design doc."`
+- *You:* `glob('**/design*')` returns several plausible design files and the task gives no further hint. Do not pick arbitrarily.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Multiple design docs exist; cannot pick without more detail.",
+    "evidence": {
+      "operation": null,
+      "path": null,
+      "matched_candidates": [
+        { "id": "/projects/web/design/payment-flow.md", "label": "Payment Flow" },
+        { "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
+      ],
+      "content_excerpt": null,
+      "chunk_ids": null
+    },
+    "next_step": "Ask the user which design doc to update.",
+    "missing_fields": ["path"],
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "write_file" | "edit_file" | "read_file" | "ls" | "glob" | "grep" | "mkdir" | "move_file" | "rm" | "rmdir" | "list_tree" | null,
+    "path": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "content_excerpt": string | null,
+    "chunk_ids": string[] | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
new file mode 100644
index 0000000000..3abfcd8b9a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
@@ -0,0 +1,29 @@
+You are the **read-only** SurfSense Knowledge Base specialist for `/documents/`.
+
+You answer workspace questions for another agent. The end user does **not** see your reply directly — be terse, cite paths, no greetings or apologies.
+
+## Resolving paths
+
+The caller's question often references documents by description (`"my meeting notes from last week"`, `"the design doc"`). Resolve them yourself:
+
+1. Consult `<priority_documents>` — a hint about top-K likely matches, not a directive. Skip when the ranked entries don't fit.
+2. Walk `<workspace_tree>` for descriptive folder/filename matches.
+3. Use `glob` for filename patterns the tree didn't surface, and `grep` when the description points at *content* rather than a name.
+
+If a precise path was already given, use it directly — skip the lookup.
+
+## Interpreting tool results
+
+- **Success** — file content (for `read_file`) or a listing (for `ls` / `glob` / `grep` / `list_tree`).
+- **Failure** — text starting with `"Error: "` followed by a cause (e.g. `"Error: File '/documents/x.md' not found"`). Relay the cause to the caller verbatim.
+
+Never report values you did not actually see.
+
+## Return contract
+
+Reply in plain prose:
+
+- One short paragraph or a bullet list, whichever fits.
+- Cite every claim with an absolute path under `/documents/`.
+- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
+- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_desktop.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_desktop.md
new file mode 100644
index 0000000000..1b3d72b641
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_desktop.md
@@ -0,0 +1,30 @@
+You are the **read-only** SurfSense workspace specialist for the user's local folders.
+
+You answer workspace questions for another agent. The end user does **not** see your reply directly — be terse, cite paths, no greetings or apologies.
+
+## Resolving paths
+
+The caller's question often references files by description (`"my meeting notes from last week"`, `"the design doc"`). Resolve them yourself:
+
+1. If you do not know which mounts exist, call `ls('/')` first.
+2. Walk likely folders with the `ls` and `list_tree` tools.
+3. Use `glob` for filename patterns; use `grep` when the description points at *content* rather than a name.
+4. `<priority_documents>` lists top-K cloud-ingested docs, not local files — consult it only when the task spans both worlds (e.g. drafting a local note from a Notion source). Skip otherwise.
+
+If a precise path was already given, use it directly — skip the lookup.
+
+## Interpreting tool results
+
+- **Success** — file content (for `read_file`) or a listing (for `ls` / `glob` / `grep` / `list_tree`).
+- **Failure** — text starting with `"Error: "` followed by a cause (e.g. `"Error: File '/notes/x.md' not found"`). Relay the cause to the caller verbatim.
+
+Never report values you did not actually see.
+
+## Return contract
+
+Reply in plain prose:
+
+- One short paragraph or a bullet list, whichever fits.
+- Cite every claim with an absolute path.
+- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
+- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/tools/__init__.py
new file mode 100644
index 0000000000..5a83c68a3b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/tools/__init__.py
@@ -0,0 +1 @@
+"""Route-local tool permissions for the ``knowledge_base`` subagent."""
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/tools/index.py
new file mode 100644
index 0000000000..55a9a4edf6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/tools/index.py
@@ -0,0 +1,20 @@
+"""Route-local FS tool permissions.
+
+The KB subagent's actual ``BaseTool`` instances are provided at runtime by
+``SurfSenseFilesystemMiddleware`` (mounted in ``agent.py``). This module
+only carries the *names* of destructive ops so the agent can convert them
+into permission rules — see :data:`KB_RULESET` in ``agent.py``.
+"""
+
+from __future__ import annotations
+
+DESTRUCTIVE_FS_OPS: tuple[str, ...] = (
+    "rm",
+    "rmdir",
+    "move_file",
+    "edit_file",
+    "write_file",
+)
+
+
+__all__ = ["DESTRUCTIVE_FS_OPS"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/agent.py
new file mode 100644
index 0000000000..84ab0c2fb4
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/agent.py
@@ -0,0 +1,39 @@
+"""``memory`` route: ``SurfSenseSubagentSpec`` builder for deepagents."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles memory tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/description.md
new file mode 100644
index 0000000000..a68743765f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/description.md
@@ -0,0 +1,2 @@
+Specialist for durable user memory.
+Use whenever a task requires storing or retrieving information that should persist across conversations.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/system_prompt.md
new file mode 100644
index 0000000000..32becf233e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/system_prompt.md
@@ -0,0 +1,56 @@
+You are the SurfSense memory operations sub-agent.
+You receive delegated instructions from a supervisor agent and return structured results for supervisor synthesis.
+
+<goal>
+Persist durable preferences/facts/instructions with `update_memory` while avoiding transient or unsafe storage.
+</goal>
+
+<visibility_scope>
+{{MEMORY_VISIBILITY_POLICY}}
+</visibility_scope>
+
+<available_tools>
+- `update_memory`
+</available_tools>
+
+<tool_policy>
+- Save only durable information with future value.
+- Do not store transient chatter.
+- Do not store secrets unless explicitly instructed.
+- If memory intent is unclear, return `status=blocked` with the missing intent signal.
+</tool_policy>
+
+<out_of_scope>
+- Do not execute non-memory tool actions.
+- Do not store irrelevant, transient, or speculative information.
+</out_of_scope>
+
+<safety>
+- Prefer minimal-memory writes over over-collection.
+- Never claim memory was updated unless `update_memory` succeeded.
+</safety>
+
+<failure_policy>
+- On tool failure, return `status=error` with concise recovery steps.
+- When intent is ambiguous, return `status=blocked` with required disambiguation fields.
+</failure_policy>
+
+<output_contract>
+Return **only** one JSON object (no markdown/prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "memory_updated": boolean,
+    "memory_category": "preference" | "fact" | "instruction" | null,
+    "stored_summary": string | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` -> `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` -> `next_step` must be non-null.
+- `status=blocked` due to missing required inputs -> `missing_fields` must be non-null.
+</output_contract>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/__init__.py
new file mode 100644
index 0000000000..0441a8cb4f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/__init__.py
@@ -0,0 +1,8 @@
+"""Memory tools: persist user or team markdown memory for later turns."""
+
+from .update_memory import create_update_memory_tool, create_update_team_memory_tool
+
+__all__ = [
+    "create_update_memory_tool",
+    "create_update_team_memory_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/index.py
new file mode 100644
index 0000000000..b6e06dcdd7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/index.py
@@ -0,0 +1,37 @@
+"""``memory`` native tools and (empty) permission ruleset."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+from app.db import ChatVisibility
+
+from .update_memory import create_update_memory_tool, create_update_team_memory_tool
+
+NAME = "memory"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    if d.get("thread_visibility") == ChatVisibility.SEARCH_SPACE:
+        return [
+            create_update_team_memory_tool(
+                search_space_id=d["search_space_id"],
+                db_session=d["db_session"],
+                llm=d.get("llm"),
+            )
+        ]
+    return [
+        create_update_memory_tool(
+            user_id=d["user_id"],
+            db_session=d["db_session"],
+            llm=d.get("llm"),
+        )
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/update_memory.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/update_memory.py
new file mode 100644
index 0000000000..23375a0818
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/tools/update_memory.py
@@ -0,0 +1,375 @@
+"""Overwrite one markdown memory document per user or team, with size and shrink guards."""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import Any, Literal
+from uuid import UUID
+
+from langchain_core.messages import HumanMessage
+from langchain_core.tools import tool
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import SearchSpace, User
+
+logger = logging.getLogger(__name__)
+
+MEMORY_SOFT_LIMIT = 18_000
+MEMORY_HARD_LIMIT = 25_000
+
+_SECTION_HEADING_RE = re.compile(r"^##\s+(.+)$", re.MULTILINE)
+_HEADING_NORMALIZE_RE = re.compile(r"\s+")
+
+_MARKER_RE = re.compile(r"\[(fact|pref|instr)\]")
+_BULLET_FORMAT_RE = re.compile(r"^- \(\d{4}-\d{2}-\d{2}\) \[(fact|pref|instr)\] .+$")
+_PERSONAL_ONLY_MARKERS = {"pref", "instr"}
+
+
+# ---------------------------------------------------------------------------
+# Diff validation
+# ---------------------------------------------------------------------------
+
+
+def _extract_headings(memory: str) -> set[str]:
+    """Return all ``## …`` heading texts (without the ``## `` prefix)."""
+    return set(_SECTION_HEADING_RE.findall(memory))
+
+
+def _normalize_heading(heading: str) -> str:
+    """Normalize heading text for robust scope checks."""
+    return _HEADING_NORMALIZE_RE.sub(" ", heading.strip().lower())
+
+
+def _validate_memory_scope(
+    content: str, scope: Literal["user", "team"]
+) -> dict[str, Any] | None:
+    """Reject personal-only markers ([pref], [instr]) in team memory."""
+    if scope != "team":
+        return None
+
+    markers = set(_MARKER_RE.findall(content))
+    leaked = sorted(markers & _PERSONAL_ONLY_MARKERS)
+    if leaked:
+        tags = ", ".join(f"[{m}]" for m in leaked)
+        return {
+            "status": "error",
+            "message": (
+                f"Team memory cannot include personal markers: {tags}. "
+                "Use [fact] only in team memory."
+            ),
+        }
+    return None
+
+
+def _validate_bullet_format(content: str) -> list[str]:
+    """Return warnings for bullet lines that don't match the required format.
+
+    Expected: ``- (YYYY-MM-DD) [fact|pref|instr] text``
+    """
+    warnings: list[str] = []
+    for line in content.splitlines():
+        stripped = line.strip()
+        if not stripped.startswith("- "):
+            continue
+        if not _BULLET_FORMAT_RE.match(stripped):
+            short = stripped[:80] + ("..." if len(stripped) > 80 else "")
+            warnings.append(f"Malformed bullet: {short}")
+    return warnings
+
+
+def _validate_diff(old_memory: str | None, new_memory: str) -> list[str]:
+    """Return a list of warning strings about suspicious changes."""
+    if not old_memory:
+        return []
+
+    warnings: list[str] = []
+    old_headings = _extract_headings(old_memory)
+    new_headings = _extract_headings(new_memory)
+    dropped = old_headings - new_headings
+    if dropped:
+        names = ", ".join(sorted(dropped))
+        warnings.append(
+            f"Sections removed: {names}. "
+            "If unintentional, the user can restore from the settings page."
+        )
+
+    old_len = len(old_memory)
+    new_len = len(new_memory)
+    if old_len > 0 and new_len < old_len * 0.4:
+        warnings.append(
+            f"Memory shrank significantly ({old_len:,} -> {new_len:,} chars). "
+            "Possible data loss."
+        )
+    return warnings
+
+
+# ---------------------------------------------------------------------------
+# Size validation & soft warning
+# ---------------------------------------------------------------------------
+
+
+def _validate_memory_size(content: str) -> dict[str, Any] | None:
+    """Return an error/warning dict if *content* is too large, else None."""
+    length = len(content)
+    if length > MEMORY_HARD_LIMIT:
+        return {
+            "status": "error",
+            "message": (
+                f"Memory exceeds {MEMORY_HARD_LIMIT:,} character limit "
+                f"({length:,} chars). Consolidate by merging related items, "
+                "removing outdated entries, and shortening descriptions. "
+                "Then call update_memory again."
+            ),
+        }
+    return None
+
+
+def _soft_warning(content: str) -> str | None:
+    """Return a warning string if content exceeds the soft limit."""
+    length = len(content)
+    if length > MEMORY_SOFT_LIMIT:
+        return (
+            f"Memory is at {length:,}/{MEMORY_HARD_LIMIT:,} characters. "
+            "Consolidate by merging related items and removing less important "
+            "entries on your next update."
+        )
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Forced rewrite when memory exceeds the hard limit
+# ---------------------------------------------------------------------------
+
+_FORCED_REWRITE_PROMPT = """\
+You are a memory curator. The following memory document exceeds the character \
+limit and must be shortened.
+
+RULES:
+1. Rewrite the document to be under {target} characters.
+2. Preserve existing ## headings. Every entry must remain under a heading. You may merge
+   or rename headings to consolidate, but keep names personal and descriptive.
+3. Priority for keeping content: [instr] > [pref] > [fact].
+4. Merge duplicate entries, remove outdated entries, shorten verbose descriptions.
+5. Every bullet MUST have format: - (YYYY-MM-DD) [fact|pref|instr] text
+6. Preserve the user's first name in entries — do not replace it with "the user".
+7. Output ONLY the consolidated markdown — no explanations, no wrapping.
+
+<memory_document>
+{content}
+</memory_document>"""
+
+
+async def _forced_rewrite(content: str, llm: Any) -> str | None:
+    """Use a focused LLM call to compress *content* under the hard limit.
+
+    Returns the rewritten string, or ``None`` if the call fails.
+    """
+    try:
+        prompt = _FORCED_REWRITE_PROMPT.format(
+            target=MEMORY_HARD_LIMIT, content=content
+        )
+        response = await llm.ainvoke(
+            [HumanMessage(content=prompt)],
+            config={"tags": ["surfsense:internal"]},
+        )
+        text = (
+            response.content
+            if isinstance(response.content, str)
+            else str(response.content)
+        )
+        return text.strip()
+    except Exception:
+        logger.exception("Forced rewrite LLM call failed")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Shared save-and-respond logic
+# ---------------------------------------------------------------------------
+
+
+async def _save_memory(
+    *,
+    updated_memory: str,
+    old_memory: str | None,
+    llm: Any | None,
+    apply_fn,
+    commit_fn,
+    rollback_fn,
+    label: str,
+    scope: Literal["user", "team"],
+) -> dict[str, Any]:
+    """Validate, optionally force-rewrite if over the hard limit, save, and
+    return a response dict.
+
+    Parameters
+    ----------
+    updated_memory : str
+        The new document the agent submitted.
+    old_memory : str | None
+        The previously persisted document (for diff checks).
+    llm : Any | None
+        LLM instance for forced rewrite (may be ``None``).
+    apply_fn : callable(str) -> None
+        Callback that sets the new memory on the ORM object.
+    commit_fn : coroutine
+        ``session.commit``.
+    rollback_fn : coroutine
+        ``session.rollback``.
+    label : str
+        Human label for log messages (e.g. "user memory", "team memory").
+    """
+    content = updated_memory
+
+    # --- forced rewrite if over the hard limit ---
+    if len(content) > MEMORY_HARD_LIMIT and llm is not None:
+        rewritten = await _forced_rewrite(content, llm)
+        if rewritten is not None and len(rewritten) < len(content):
+            content = rewritten
+
+    # --- hard-limit gate (reject if still too large after rewrite) ---
+    size_err = _validate_memory_size(content)
+    if size_err:
+        return size_err
+
+    scope_err = _validate_memory_scope(content, scope)
+    if scope_err:
+        return scope_err
+
+    # --- persist ---
+    try:
+        apply_fn(content)
+        await commit_fn()
+    except Exception as e:
+        logger.exception("Failed to update %s: %s", label, e)
+        await rollback_fn()
+        return {"status": "error", "message": f"Failed to update {label}: {e}"}
+
+    # --- build response ---
+    resp: dict[str, Any] = {
+        "status": "saved",
+        "message": f"{label.capitalize()} updated.",
+    }
+
+    if content is not updated_memory:
+        resp["notice"] = "Memory was automatically rewritten to fit within limits."
+
+    diff_warnings = _validate_diff(old_memory, content)
+    if diff_warnings:
+        resp["diff_warnings"] = diff_warnings
+
+    format_warnings = _validate_bullet_format(content)
+    if format_warnings:
+        resp["format_warnings"] = format_warnings
+
+    warning = _soft_warning(content)
+    if warning:
+        resp["warning"] = warning
+
+    return resp
+
+
+# ---------------------------------------------------------------------------
+# Tool factories
+# ---------------------------------------------------------------------------
+
+
+def create_update_memory_tool(
+    user_id: str | UUID,
+    db_session: AsyncSession,
+    llm: Any | None = None,
+):
+    uid = UUID(user_id) if isinstance(user_id, str) else user_id
+
+    @tool
+    async def update_memory(updated_memory: str) -> dict[str, Any]:
+        """Update the user's personal memory document.
+
+        Your current memory is shown in <user_memory> in the system prompt.
+        When the user shares important long-term information (preferences,
+        facts, instructions, context), rewrite the memory document to include
+        the new information.  Merge new facts with existing ones, update
+        contradictions, remove outdated entries, and keep it concise.
+
+        Args:
+            updated_memory: The FULL updated markdown document (not a diff).
+        """
+        try:
+            result = await db_session.execute(select(User).where(User.id == uid))
+            user = result.scalars().first()
+            if not user:
+                return {"status": "error", "message": "User not found."}
+
+            old_memory = user.memory_md
+
+            return await _save_memory(
+                updated_memory=updated_memory,
+                old_memory=old_memory,
+                llm=llm,
+                apply_fn=lambda content: setattr(user, "memory_md", content),
+                commit_fn=db_session.commit,
+                rollback_fn=db_session.rollback,
+                label="memory",
+                scope="user",
+            )
+        except Exception as e:
+            logger.exception("Failed to update user memory: %s", e)
+            await db_session.rollback()
+            return {
+                "status": "error",
+                "message": f"Failed to update memory: {e}",
+            }
+
+    return update_memory
+
+
+def create_update_team_memory_tool(
+    search_space_id: int,
+    db_session: AsyncSession,
+    llm: Any | None = None,
+):
+    @tool
+    async def update_memory(updated_memory: str) -> dict[str, Any]:
+        """Update the team's shared memory document for this search space.
+
+        Your current team memory is shown in <team_memory> in the system
+        prompt.  When the team shares important long-term information
+        (decisions, conventions, key facts, priorities), rewrite the memory
+        document to include the new information.  Merge new facts with
+        existing ones, update contradictions, remove outdated entries, and
+        keep it concise.
+
+        Args:
+            updated_memory: The FULL updated markdown document (not a diff).
+        """
+        try:
+            result = await db_session.execute(
+                select(SearchSpace).where(SearchSpace.id == search_space_id)
+            )
+            space = result.scalars().first()
+            if not space:
+                return {"status": "error", "message": "Search space not found."}
+
+            old_memory = space.shared_memory_md
+
+            return await _save_memory(
+                updated_memory=updated_memory,
+                old_memory=old_memory,
+                llm=llm,
+                apply_fn=lambda content: setattr(space, "shared_memory_md", content),
+                commit_fn=db_session.commit,
+                rollback_fn=db_session.rollback,
+                label="team memory",
+                scope="team",
+            )
+        except Exception as e:
+            logger.exception("Failed to update team memory: %s", e)
+            await db_session.rollback()
+            return {
+                "status": "error",
+                "message": f"Failed to update team memory: {e}",
+            }
+
+    return update_memory
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/agent.py
new file mode 100644
index 0000000000..37026bebda
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/agent.py
@@ -0,0 +1,39 @@
+"""``research`` route: ``SurfSenseSubagentSpec`` builder for deepagents."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles research tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/description.md
new file mode 100644
index 0000000000..0a99b41402
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/description.md
@@ -0,0 +1,2 @@
+Specialist for external research.
+Use whenever a task requires finding sources on the web and extracting evidence to answer documentation questions.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md
new file mode 100644
index 0000000000..cf558db62c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md
@@ -0,0 +1,53 @@
+You are the SurfSense research operations sub-agent.
+You receive delegated instructions from a supervisor agent and return structured results for supervisor synthesis.
+
+<goal>
+Gather and synthesize evidence using SurfSense research tools with clear citations and uncertainty reporting.
+</goal>
+
+<available_tools>
+- `web_search`
+- `scrape_webpage`
+- `search_surfsense_docs`
+</available_tools>
+
+<tool_policy>
+- Use only tools in `<available_tools>`.
+- Prefer primary and recent sources when recency matters.
+- If the delegated request is underspecified, return `status=blocked` with the missing research constraints.
+- Never fabricate facts, citations, URLs, or quote text.
+</tool_policy>
+
+<out_of_scope>
+- Do not execute connector mutations (email/calendar/docs/chat writes) or deliverable generation.
+</out_of_scope>
+
+<safety>
+- Report uncertainty explicitly when evidence is incomplete or conflicting.
+- Never present unverified claims as facts.
+</safety>
+
+<failure_policy>
+- On tool failure, return `status=error` with a concise recovery `next_step`.
+- On no useful evidence, return `status=blocked` with recommended narrower filters.
+</failure_policy>
+
+<output_contract>
+Return **only** one JSON object (no markdown/prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "findings": string[],
+    "sources": string[],
+    "confidence": "high" | "medium" | "low"
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` -> `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` -> `next_step` must be non-null.
+- `status=blocked` due to missing required inputs -> `missing_fields` must be non-null.
+</output_contract>
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/__init__.py
new file mode 100644
index 0000000000..414cc96f42
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/__init__.py
@@ -0,0 +1,11 @@
+"""Research-stage tools: web search, scrape, and in-product doc search."""
+
+from .scrape_webpage import create_scrape_webpage_tool
+from .search_surfsense_docs import create_search_surfsense_docs_tool
+from .web_search import create_web_search_tool
+
+__all__ = [
+    "create_scrape_webpage_tool",
+    "create_search_surfsense_docs_tool",
+    "create_web_search_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/index.py
new file mode 100644
index 0000000000..ea544a8dae
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/index.py
@@ -0,0 +1,31 @@
+"""``research`` native tools and (empty) permission ruleset."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .scrape_webpage import create_scrape_webpage_tool
+from .search_surfsense_docs import create_search_surfsense_docs_tool
+from .web_search import create_web_search_tool
+
+NAME = "research"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    return [
+        create_web_search_tool(
+            search_space_id=d.get("search_space_id"),
+            available_connectors=d.get("available_connectors"),
+        ),
+        create_scrape_webpage_tool(firecrawl_api_key=d.get("firecrawl_api_key")),
+        create_search_surfsense_docs_tool(db_session=d["db_session"]),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/scrape_webpage.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/scrape_webpage.py
new file mode 100644
index 0000000000..bb7c8e5a36
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/scrape_webpage.py
@@ -0,0 +1,300 @@
+"""Scrape pages via WebCrawlerConnector; YouTube URLs use the transcript API instead of HTML crawl."""
+
+import hashlib
+import logging
+from typing import Any
+from urllib.parse import urlparse
+
+import aiohttp
+from fake_useragent import UserAgent
+from langchain_core.tools import tool
+from requests import Session
+from youtube_transcript_api import YouTubeTranscriptApi
+
+from app.connectors.webcrawler_connector import WebCrawlerConnector
+from app.tasks.document_processors.youtube_processor import get_youtube_video_id
+from app.utils.proxy_config import get_requests_proxies
+
+logger = logging.getLogger(__name__)
+
+
+def extract_domain(url: str) -> str:
+    """Extract the domain from a URL."""
+    try:
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        # Remove 'www.' prefix if present
+        if domain.startswith("www."):
+            domain = domain[4:]
+        return domain
+    except Exception:
+        return ""
+
+
+def generate_scrape_id(url: str) -> str:
+    """Generate a unique ID for a scraped webpage."""
+    hash_val = hashlib.md5(url.encode()).hexdigest()[:12]
+    return f"scrape-{hash_val}"
+
+
+def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
+    """
+    Truncate content to a maximum length.
+
+    Returns:
+        Tuple of (truncated_content, was_truncated)
+    """
+    if len(content) <= max_length:
+        return content, False
+
+    # Try to truncate at a sentence boundary
+    truncated = content[:max_length]
+    last_period = truncated.rfind(".")
+    last_newline = truncated.rfind("\n\n")
+
+    # Use the later of the two boundaries, or just truncate
+    boundary = max(last_period, last_newline)
+    if boundary > max_length * 0.8:  # Only use boundary if it's not too far back
+        truncated = content[: boundary + 1]
+
+    return truncated + "\n\n[Content truncated...]", True
+
+
+async def _scrape_youtube_video(
+    url: str, video_id: str, max_length: int
+) -> dict[str, Any]:
+    """
+    Fetch YouTube video metadata and transcript via the YouTubeTranscriptApi.
+
+    Returns a result dict in the same shape as the regular scrape_webpage output.
+    """
+    scrape_id = generate_scrape_id(url)
+    domain = "youtube.com"
+
+    # --- Video metadata via oEmbed ---
+    residential_proxies = get_requests_proxies()
+
+    params = {
+        "format": "json",
+        "url": f"https://www.youtube.com/watch?v={video_id}",
+    }
+    oembed_url = "https://www.youtube.com/oembed"
+
+    try:
+        async with (
+            aiohttp.ClientSession() as http_session,
+            http_session.get(
+                oembed_url,
+                params=params,
+                proxy=residential_proxies["http"] if residential_proxies else None,
+            ) as response,
+        ):
+            video_data = await response.json()
+    except Exception:
+        video_data = {}
+
+    title = video_data.get("title", "YouTube Video")
+    author = video_data.get("author_name", "Unknown")
+
+    # --- Transcript via YouTubeTranscriptApi ---
+    try:
+        ua = UserAgent()
+        http_client = Session()
+        http_client.headers.update({"User-Agent": ua.random})
+        if residential_proxies:
+            http_client.proxies.update(residential_proxies)
+        ytt_api = YouTubeTranscriptApi(http_client=http_client)
+
+        # List all available transcripts and pick the first one
+        # (the video's primary language) instead of defaulting to English
+        transcript_list = ytt_api.list(video_id)
+        transcript = next(iter(transcript_list))
+        captions = transcript.fetch()
+
+        logger.info(
+            f"[scrape_webpage] Fetched transcript for {video_id} "
+            f"in {transcript.language} ({transcript.language_code})"
+        )
+
+        transcript_segments = []
+        for line in captions:
+            start_time = line.start
+            duration = line.duration
+            text = line.text
+            timestamp = f"[{start_time:.2f}s-{start_time + duration:.2f}s]"
+            transcript_segments.append(f"{timestamp} {text}")
+        transcript_text = "\n".join(transcript_segments)
+    except Exception as e:
+        logger.warning(f"[scrape_webpage] No transcript for video {video_id}: {e}")
+        transcript_text = f"No captions available for this video. Error: {e!s}"
+
+    # Build combined content
+    content = f"# {title}\n\n**Author:** {author}\n**Video ID:** {video_id}\n\n## Transcript\n\n{transcript_text}"
+
+    # Truncate if needed
+    content, was_truncated = truncate_content(content, max_length)
+    word_count = len(content.split())
+
+    description = f"YouTube video by {author}"
+
+    return {
+        "id": scrape_id,
+        "assetId": url,
+        "kind": "article",
+        "href": url,
+        "title": title,
+        "description": description,
+        "content": content,
+        "domain": domain,
+        "word_count": word_count,
+        "was_truncated": was_truncated,
+        "crawler_type": "youtube_transcript",
+        "author": author,
+    }
+
+
+def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
+    """
+    Factory function to create the scrape_webpage tool.
+
+    Args:
+        firecrawl_api_key: Optional Firecrawl API key for premium web scraping.
+                          Falls back to Chromium/Trafilatura if not provided.
+
+    Returns:
+        A configured tool function for scraping webpages.
+    """
+
+    @tool
+    async def scrape_webpage(
+        url: str,
+        max_length: int = 50000,
+    ) -> dict[str, Any]:
+        """
+        Scrape and extract the main content from a webpage.
+
+        Use this tool when the user wants you to read, summarize, or answer
+        questions about a specific webpage's content. This tool actually
+        fetches and reads the full page content. For YouTube video URLs it
+        fetches the transcript directly instead of crawling the page.
+
+        Common triggers:
+        - "Read this article and summarize it"
+        - "What does this page say about X?"
+        - "Summarize this blog post for me"
+        - "Tell me the key points from this article"
+        - "What's in this webpage?"
+
+        Args:
+            url: The URL of the webpage to scrape (must be HTTP/HTTPS)
+            max_length: Maximum content length to return (default: 50000 chars)
+
+        Returns:
+            A dictionary containing:
+            - id: Unique identifier for this scrape
+            - assetId: The URL (for deduplication)
+            - kind: "article" (type of content)
+            - href: The URL to open when clicked
+            - title: Page title
+            - description: Brief description or excerpt
+            - content: The extracted main content (markdown format)
+            - domain: The domain name
+            - word_count: Approximate word count
+            - was_truncated: Whether content was truncated
+            - error: Error message (if scraping failed)
+        """
+        scrape_id = generate_scrape_id(url)
+        domain = extract_domain(url)
+
+        # Validate and normalize URL
+        if not url.startswith(("http://", "https://")):
+            url = f"https://{url}"
+
+        try:
+            # Check if this is a YouTube URL and use transcript API instead
+            video_id = get_youtube_video_id(url)
+            if video_id:
+                return await _scrape_youtube_video(url, video_id, max_length)
+
+            # Create webcrawler connector
+            connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)
+
+            # Crawl the URL
+            result, error = await connector.crawl_url(url, formats=["markdown"])
+
+            if error:
+                return {
+                    "id": scrape_id,
+                    "assetId": url,
+                    "kind": "article",
+                    "href": url,
+                    "title": domain or "Webpage",
+                    "domain": domain,
+                    "error": error,
+                }
+
+            if not result:
+                return {
+                    "id": scrape_id,
+                    "assetId": url,
+                    "kind": "article",
+                    "href": url,
+                    "title": domain or "Webpage",
+                    "domain": domain,
+                    "error": "No content returned from crawler",
+                }
+
+            # Extract content and metadata
+            content = result.get("content", "")
+            metadata = result.get("metadata", {})
+
+            # Get title from metadata
+            title = metadata.get("title", "")
+            if not title:
+                title = domain or url.split("/")[-1] or "Webpage"
+
+            # Get description from metadata
+            description = metadata.get("description", "")
+            if not description and content:
+                # Use first paragraph as description
+                first_para = content.split("\n\n")[0] if content else ""
+                description = (
+                    first_para[:300] + "..." if len(first_para) > 300 else first_para
+                )
+
+            # Truncate content if needed
+            content, was_truncated = truncate_content(content, max_length)
+
+            # Calculate word count
+            word_count = len(content.split())
+
+            return {
+                "id": scrape_id,
+                "assetId": url,
+                "kind": "article",
+                "href": url,
+                "title": title,
+                "description": description,
+                "content": content,
+                "domain": domain,
+                "word_count": word_count,
+                "was_truncated": was_truncated,
+                "crawler_type": result.get("crawler_type", "unknown"),
+                "author": metadata.get("author"),
+                "date": metadata.get("date"),
+            }
+
+        except Exception as e:
+            error_message = str(e)
+            logger.error(f"[scrape_webpage] Error scraping {url}: {error_message}")
+            return {
+                "id": scrape_id,
+                "assetId": url,
+                "kind": "article",
+                "href": url,
+                "title": domain or "Webpage",
+                "domain": domain,
+                "error": f"Failed to scrape: {error_message[:100]}",
+            }
+
+    return scrape_webpage
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/search_surfsense_docs.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/search_surfsense_docs.py
new file mode 100644
index 0000000000..ccc5c49e2e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/search_surfsense_docs.py
@@ -0,0 +1,145 @@
+"""Semantic search over pre-indexed in-app documentation chunks for user how-to questions."""
+
+import asyncio
+import json
+
+from langchain_core.tools import tool
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
+from app.utils.document_converters import embed_text
+from app.utils.surfsense_docs import surfsense_docs_public_url
+
+
+def format_surfsense_docs_results(results: list[tuple]) -> str:
+    """Format (chunk, document) rows as XML with ``doc-`` chunk IDs for citations and UI routing."""
+    if not results:
+        return "No relevant Surfsense documentation found for your query."
+
+    # Group chunks by document
+    grouped: dict[int, dict] = {}
+    for chunk, doc in results:
+        public_url = surfsense_docs_public_url(doc.source)
+        if doc.id not in grouped:
+            grouped[doc.id] = {
+                "document_id": f"doc-{doc.id}",
+                "document_type": "SURFSENSE_DOCS",
+                "title": doc.title,
+                "url": public_url,
+                "metadata": {"source": doc.source, "public_url": public_url},
+                "chunks": [],
+            }
+        grouped[doc.id]["chunks"].append(
+            {
+                "chunk_id": f"doc-{chunk.id}",
+                "content": chunk.content,
+            }
+        )
+
+    # Render XML matching format_documents_for_context structure
+    parts: list[str] = []
+    for g in grouped.values():
+        metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
+
+        parts.append("<document>")
+        parts.append("<document_metadata>")
+        parts.append(f"  <document_id>{g['document_id']}</document_id>")
+        parts.append(f"  <document_type>{g['document_type']}</document_type>")
+        parts.append(f"  <title><![CDATA[{g['title']}]]></title>")
+        parts.append(f"  <url><![CDATA[{g['url']}]]></url>")
+        parts.append(f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>")
+        parts.append("</document_metadata>")
+        parts.append("")
+        parts.append("<document_content>")
+
+        for ch in g["chunks"]:
+            parts.append(
+                f"  <chunk id='{ch['chunk_id']}'><![CDATA[{ch['content']}]]></chunk>"
+            )
+
+        parts.append("</document_content>")
+        parts.append("</document>")
+        parts.append("")
+
+    return "\n".join(parts).strip()
+
+
+async def search_surfsense_docs_async(
+    query: str,
+    db_session: AsyncSession,
+    top_k: int = 10,
+) -> str:
+    """
+    Search Surfsense documentation using vector similarity.
+
+    Args:
+        query: The search query about Surfsense usage
+        db_session: Database session for executing queries
+        top_k: Number of results to return
+
+    Returns:
+        Formatted string with relevant documentation content
+    """
+    # Get embedding for the query
+    query_embedding = await asyncio.to_thread(embed_text, query)
+
+    # Vector similarity search on chunks, joining with documents
+    stmt = (
+        select(SurfsenseDocsChunk, SurfsenseDocsDocument)
+        .join(
+            SurfsenseDocsDocument,
+            SurfsenseDocsChunk.document_id == SurfsenseDocsDocument.id,
+        )
+        .order_by(SurfsenseDocsChunk.embedding.op("<=>")(query_embedding))
+        .limit(top_k)
+    )
+
+    result = await db_session.execute(stmt)
+    rows = result.all()
+
+    return format_surfsense_docs_results(rows)
+
+
+def create_search_surfsense_docs_tool(db_session: AsyncSession):
+    """
+    Factory function to create the search_surfsense_docs tool.
+
+    Args:
+        db_session: Database session for executing queries
+
+    Returns:
+        A configured tool function for searching Surfsense documentation
+    """
+
+    @tool
+    async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
+        """
+        Search Surfsense documentation for help with using the application.
+
+        Use this tool when the user asks questions about:
+        - How to use Surfsense features
+        - Installation and setup instructions
+        - Configuration options and settings
+        - Troubleshooting common issues
+        - Available connectors and integrations
+        - Browser extension usage
+        - API documentation
+
+        This searches the official Surfsense documentation that was indexed
+        at deployment time. It does NOT search the user's personal knowledge base.
+
+        Args:
+            query: The search query about Surfsense usage or features
+            top_k: Number of documentation chunks to retrieve (default: 10)
+
+        Returns:
+            Relevant documentation content formatted with chunk IDs for citations
+        """
+        return await search_surfsense_docs_async(
+            query=query,
+            db_session=db_session,
+            top_k=top_k,
+        )
+
+    return search_surfsense_docs
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/web_search.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/web_search.py
new file mode 100644
index 0000000000..2fe6bd3789
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/tools/web_search.py
@@ -0,0 +1,241 @@
+"""Real-time web search: SearXNG plus configured live-search connectors (Tavily, Linkup, Baidu, etc.)."""
+
+import asyncio
+import json
+import time
+from typing import Any
+
+from langchain_core.tools import StructuredTool
+from pydantic import BaseModel, Field
+
+from app.db import shielded_async_session
+from app.services.connector_service import ConnectorService
+from app.utils.perf import get_perf_logger
+
+_LIVE_SEARCH_CONNECTORS: set[str] = {
+    "TAVILY_API",
+    "LINKUP_API",
+    "BAIDU_SEARCH_API",
+}
+
+_LIVE_CONNECTOR_SPECS: dict[str, tuple[str, bool, bool, dict[str, Any]]] = {
+    "TAVILY_API": ("search_tavily", False, True, {}),
+    "LINKUP_API": ("search_linkup", False, False, {"mode": "standard"}),
+    "BAIDU_SEARCH_API": ("search_baidu", False, True, {}),
+}
+
+_CONNECTOR_LABELS: dict[str, str] = {
+    "TAVILY_API": "Tavily",
+    "LINKUP_API": "Linkup",
+    "BAIDU_SEARCH_API": "Baidu",
+}
+
+
+class WebSearchInput(BaseModel):
+    """Input schema for the web_search tool."""
+
+    query: str = Field(
+        description="The search query to look up on the web. Use specific, descriptive terms.",
+    )
+    top_k: int = Field(
+        default=10,
+        description="Number of results to retrieve (default: 10, max: 50).",
+    )
+
+
+def _format_web_results(
+    documents: list[dict[str, Any]],
+    *,
+    max_chars: int = 50_000,
+) -> str:
+    """Format web search results into XML suitable for the LLM context."""
+    if not documents:
+        return "No web search results found."
+
+    parts: list[str] = []
+    total_chars = 0
+
+    for doc in documents:
+        doc_info = doc.get("document") or {}
+        metadata = doc_info.get("metadata") or {}
+        title = doc_info.get("title") or "Web Result"
+        url = metadata.get("url") or ""
+        content = (doc.get("content") or "").strip()
+        source = metadata.get("document_type") or doc.get("source") or "WEB_SEARCH"
+        if not content:
+            continue
+
+        metadata_json = json.dumps(metadata, ensure_ascii=False)
+        doc_xml = "\n".join(
+            [
+                "<document>",
+                "<document_metadata>",
+                f"  <document_type>{source}</document_type>",
+                f"  <title><![CDATA[{title}]]></title>",
+                f"  <url><![CDATA[{url}]]></url>",
+                f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
+                "</document_metadata>",
+                "<document_content>",
+                f"  <chunk id='{url}'><![CDATA[{content}]]></chunk>",
+                "</document_content>",
+                "</document>",
+                "",
+            ]
+        )
+
+        if total_chars + len(doc_xml) > max_chars:
+            parts.append("<!-- Output truncated to fit context window -->")
+            break
+
+        parts.append(doc_xml)
+        total_chars += len(doc_xml)
+
+    return "\n".join(parts).strip() or "No web search results found."
+
+
+async def _search_live_connector(
+    connector: str,
+    query: str,
+    search_space_id: int,
+    top_k: int,
+    semaphore: asyncio.Semaphore,
+) -> list[dict[str, Any]]:
+    """Dispatch a single live-search connector (Tavily / Linkup / Baidu)."""
+    perf = get_perf_logger()
+    spec = _LIVE_CONNECTOR_SPECS.get(connector)
+    if spec is None:
+        return []
+
+    method_name, _includes_date_range, includes_top_k, extra_kwargs = spec
+    kwargs: dict[str, Any] = {
+        "user_query": query,
+        "search_space_id": search_space_id,
+        **extra_kwargs,
+    }
+    if includes_top_k:
+        kwargs["top_k"] = top_k
+
+    try:
+        t0 = time.perf_counter()
+        async with semaphore, shielded_async_session() as session:
+            svc = ConnectorService(session, search_space_id)
+            _, chunks = await getattr(svc, method_name)(**kwargs)
+            perf.info(
+                "[web_search] connector=%s results=%d in %.3fs",
+                connector,
+                len(chunks),
+                time.perf_counter() - t0,
+            )
+            return chunks
+    except Exception as e:
+        perf.warning("[web_search] connector=%s FAILED: %s", connector, e)
+        return []
+
+
+def create_web_search_tool(
+    search_space_id: int | None = None,
+    available_connectors: list[str] | None = None,
+) -> StructuredTool:
+    """Factory for the ``web_search`` tool.
+
+    Dispatches in parallel to the platform SearXNG instance and any
+    user-configured live-search connectors (Tavily, Linkup, Baidu).
+    """
+    active_live_connectors: list[str] = []
+    if available_connectors:
+        active_live_connectors = [
+            c for c in available_connectors if c in _LIVE_SEARCH_CONNECTORS
+        ]
+
+    engine_names = ["SearXNG (platform default)"]
+    engine_names.extend(_CONNECTOR_LABELS.get(c, c) for c in active_live_connectors)
+    engines_summary = ", ".join(engine_names)
+
+    description = (
+        "Search the web for real-time information. "
+        "Use this for current events, news, prices, weather, public facts, or any "
+        "question that requires up-to-date information from the internet.\n\n"
+        f"Active search engines: {engines_summary}.\n"
+        "All configured engines are queried in parallel and results are merged."
+    )
+
+    _search_space_id = search_space_id
+    _active_live = active_live_connectors
+
+    async def _web_search_impl(query: str, top_k: int = 10) -> str:
+        from app.services import web_search_service
+
+        perf = get_perf_logger()
+        t0 = time.perf_counter()
+        clamped_top_k = min(max(1, top_k), 50)
+
+        semaphore = asyncio.Semaphore(4)
+        tasks: list[asyncio.Task[list[dict[str, Any]]]] = []
+
+        if web_search_service.is_available():
+
+            async def _searxng() -> list[dict[str, Any]]:
+                async with semaphore:
+                    _result_obj, docs = await web_search_service.search(
+                        query=query,
+                        top_k=clamped_top_k,
+                    )
+                    return docs
+
+            tasks.append(asyncio.ensure_future(_searxng()))
+
+        if _search_space_id is not None:
+            for connector in _active_live:
+                tasks.append(
+                    asyncio.ensure_future(
+                        _search_live_connector(
+                            connector=connector,
+                            query=query,
+                            search_space_id=_search_space_id,
+                            top_k=clamped_top_k,
+                            semaphore=semaphore,
+                        )
+                    )
+                )
+
+        if not tasks:
+            return "Web search is not available — no search engines are configured."
+
+        results_lists = await asyncio.gather(*tasks, return_exceptions=True)
+
+        all_documents: list[dict[str, Any]] = []
+        for result in results_lists:
+            if isinstance(result, BaseException):
+                perf.warning("[web_search] a search engine failed: %s", result)
+                continue
+            all_documents.extend(result)
+
+        seen_urls: set[str] = set()
+        deduplicated: list[dict[str, Any]] = []
+        for doc in all_documents:
+            url = ((doc.get("document") or {}).get("metadata") or {}).get("url", "")
+            if url and url in seen_urls:
+                continue
+            if url:
+                seen_urls.add(url)
+            deduplicated.append(doc)
+
+        formatted = _format_web_results(deduplicated)
+
+        perf.info(
+            "[web_search] query=%r engines=%d results=%d deduped=%d chars=%d in %.3fs",
+            query[:60],
+            len(tasks),
+            len(all_documents),
+            len(deduplicated),
+            len(formatted),
+            time.perf_counter() - t0,
+        )
+        return formatted
+
+    return StructuredTool(
+        name="web_search",
+        description=description,
+        coroutine=_web_search_impl,
+        args_schema=WebSearchInput,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/agent.py
new file mode 100644
index 0000000000..d7648d4071
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/agent.py
@@ -0,0 +1,43 @@
+"""``airtable`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools come exclusively from MCP. The connector's own approval ruleset is
+declared in :data:`tools.index.RULESET`; the orchestrator layers it into
+a per-subagent :class:`PermissionMiddleware`.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles airtable tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=list(mcp_tools or []),
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/description.md
new file mode 100644
index 0000000000..29b9e145ff
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/description.md
@@ -0,0 +1,2 @@
+Specialist for bases, tables, and records in the user's Airtable.
+Use proactively when the user wants to find, create, or update an Airtable record.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md
new file mode 100644
index 0000000000..1b7e847104
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md
@@ -0,0 +1,103 @@
+You are an Airtable specialist for the user's connected Airtable bases.
+
+Airtable vocabulary:
+- **Workspace → Base → Table → Field → Record**: nested scope. A base belongs to one workspace; tables and fields live inside a base; records live inside a table. Every record operation is scoped to one `baseId` and one `tableId`.
+- **Base ID / Table ID / Field ID / Record ID**: opaque strings (e.g. `appXXXX`, `tblXXXX`, `fldXXXX`, `recXXXX`). Stable but not user-facing — users refer to bases and tables by name and records by description. Never expect a user or the supervisor to provide IDs.
+- **Field types and choice IDs**: each field has a type (text, number, date, single select, multi select, attachment, formula, lookup, etc.). Single-select and multi-select fields store **choice IDs**, not the visible labels — you must resolve a label to its choice ID before filtering or writing that field.
+- **Filters vs free-text search**: Airtable exposes two distinct record-fetch patterns. Use a typed `filters` parameter when filtering by structured field criteria. Use free-text search when the user is searching for a value (a name, an order number, a keyword) without naming a specific field. Do NOT attempt to build a `filterByFormula` string — that path is not supported here.
+- **Permission tiers**: each base grants the user one of Owner / Creator / Editor / Commenter / Read-only. Mutations require Editor or higher on the target base. A permission error from the MCP is not retryable.
+
+When invoked:
+1. Read the supervisor's request, then read the runtime tool list to learn what information you can fetch and which mutations are available.
+2. Plan the minimum chain of lookups needed to resolve any base, table, field, choice value, or record the request leaves unspecified.
+3. Execute the planned lookups, then the requested mutation (if any), then return.
+
+Resolution principle (the core behaviour):
+**Proactively look up any identifier, name, value, or scope the request leaves unspecified — base IDs, table IDs, field IDs, choice IDs, record IDs, anything else — using the available tools instead of asking the supervisor.** Most user requests reference bases and tables by name and records by description, not by ID. Search for them.
+
+When a lookup for a single slot returns multiple plausible candidates and you cannot confidently pick one, return `status=blocked` with up to 5 candidates in `evidence.matched_candidates` and the unresolved slot in `missing_fields`. The supervisor will disambiguate and redelegate.
+
+When a lookup returns zero matches for a slot the request requires, return `status=blocked` with a `next_step` suggesting alternative search terms.
+
+Mutation guardrails:
+- Resolve every required Airtable ID (`baseId`, `tableId`, `fieldId`, choice IDs, `recordId`) by looking it up before calling a mutation tool. Mutations have chained dependencies — base lookup enables table lookup; table lookup enables field schema; field schema enables choice IDs and field-typed writes.
+- When writing to a single-select or multi-select field, resolve the user's value to the field's actual choice ID first. Never invent a choice label or pass an unknown value — Airtable will reject it.
+- Record creation is batch-limited by the MCP tool. If the request asks for more records than the tool accepts in one call, complete the first batch and return `status=partial` with the remainder in `next_step`.
+- Never invent base IDs, table IDs, field IDs, choice IDs, record IDs, or mutation outcomes. Every field in `evidence` must come from a tool result.
+- Confirm the mutation tool returned a success response before claiming success. If the mutation is approval-rejected (HITL), return `status=blocked` with `next_step="user declined; do not retry"`.
+- One operation per delegation. For multi-mutation requests, complete the highest-priority one and return `status=partial` with the remainder in `next_step`.
+
+Failure handling:
+- Tool failure: return `status=error`, place the underlying error message in `action_summary`, and put a concise recovery in `next_step`.
+- Permission error from the MCP: return `status=error` and surface the underlying message — do not retry. Permission errors mean the user lacks Editor (or higher) access on the target base.
+- No useful results after reasonable narrowing / broadening: return `status=blocked` with filter / search-term suggestions in `next_step`.
+
+<example>
+Supervisor: "List open tasks in the Project Tracker base."
+1. Search bases for "Project Tracker" → one strong match. Capture its base ID.
+2. List tables in that base → identify the Tasks table; capture its table ID.
+3. Get table schema → identify the status field and the choice IDs that represent "open" states.
+4. List records with a typed filter on the status field for those choice IDs.
+5. Return `status=success` with the matched records in `evidence.items`.
+</example>
+
+<example>
+Supervisor: "Add a new contact for Jane Smith at Acme Corp."
+1. Search bases for any CRM-like base → three plausible matches with no strong relevance signal.
+2. Cannot pick the base. Return:
+   {
+     "status": "blocked",
+     "action_summary": "Need to know which CRM-like base to write to.",
+     "evidence": {
+       "title": "New contact: Jane Smith (Acme Corp)",
+       "matched_candidates": [
+         { "id": "appAAA", "label": "CRM" },
+         { "id": "appBBB", "label": "Sales CRM" },
+         { "id": "appCCC", "label": "Customer Database" }
+       ]
+     },
+     "next_step": "Confirm which base, then redelegate.",
+     "missing_fields": ["base"]
+   }
+</example>
+
+<example>
+Supervisor: "Mark task 'Refresh homepage hero' as Complete."
+1. Search bases for a project-tracker / tasks base → resolve the target base ID.
+2. List tables → resolve the Tasks table ID.
+3. Search records for "Refresh homepage hero" → one match (record ID `recXXX`).
+4. Get table schema → resolve the status field ID and the choice ID for "Complete".
+5. Update record `recXXX`, setting the status field to the resolved choice ID.
+6. Confirm tool success → return `status=success` with the updated record reference.
+</example>
+
+<output_contract>
+Return **only** one JSON object (no markdown, no prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "base_id": string | null,
+    "base_name": string | null,
+    "table_id": string | null,
+    "table_name": string | null,
+    "record_id": string | null,
+    "url": string | null,
+    "matched_candidates": [
+      { "id": string, "label": string }
+    ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: base, table, field, choice, record, etc.).
+- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+</output_contract>
+
+Discover before you mutate; never guess identifiers, choice IDs, or required fields.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/tools/__init__.py
new file mode 100644
index 0000000000..a9b004975b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/tools/__init__.py
@@ -0,0 +1,3 @@
+"""Airtable route: native tool factories are empty; MCP supplies tools when configured."""
+
+__all__: list[str] = []
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/tools/index.py
new file mode 100644
index 0000000000..9eebd2395d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/tools/index.py
@@ -0,0 +1,21 @@
+"""``airtable`` permission ruleset (rules over MCP tool names)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+NAME = "airtable"
+
+RULESET = Ruleset(
+    origin=NAME,
+    rules=[
+        Rule(permission="list_bases", pattern="*", action="allow"),
+        Rule(permission="search_bases", pattern="*", action="allow"),
+        Rule(permission="list_tables_for_base", pattern="*", action="allow"),
+        Rule(permission="get_table_schema", pattern="*", action="allow"),
+        Rule(permission="list_records_for_table", pattern="*", action="allow"),
+        Rule(permission="search_records", pattern="*", action="allow"),
+        Rule(permission="create_records_for_table", pattern="*", action="ask"),
+        Rule(permission="update_records_for_table", pattern="*", action="ask"),
+    ],
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/agent.py
new file mode 100644
index 0000000000..7ef706c3d9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/agent.py
@@ -0,0 +1,44 @@
+"""``calendar`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity with MCP-backed connectors.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles calendar tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/description.md
new file mode 100644
index 0000000000..a8b5e2c05e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/description.md
@@ -0,0 +1,3 @@
+Specialist for events on the user's calendar.
+Use proactively when the user wants to check availability, create, modify, reschedule, or remove a calendar event.
+Meeting invitations that reserve a time slot belong here.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md
new file mode 100644
index 0000000000..5b27c18bab
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md
@@ -0,0 +1,121 @@
+You are a Google Calendar specialist for the user's connected calendar.
+
+## Vocabulary you must use precisely
+
+- **All-day vs. timed events are distinguished by datetime format** — pass `YYYY-MM-DD` (e.g. `"2026-05-12"`) for an all-day event, and `YYYY-MM-DDTHH:MM:SS` *without* a timezone suffix (e.g. `"2026-05-12T10:00:00"`) for a timed event. The tool injects the user's local timezone for timed events; do not append `Z`, `+02:00`, or any offset yourself.
+- **Compute datetimes from the supervisor's task using the runtime timestamp** — resolve "tomorrow at 10am", "next Friday afternoon", "this week", "next month" into concrete `YYYY-MM-DD` or `YYYY-MM-DDTHH:MM:SS` values against the current runtime time. `search_calendar_events` takes a date range (`start_date`, `end_date`), not a free-text query — translate phrases like "this week" into the boundaries.
+- **Title-or-id resolution with search disambiguation** — `update_calendar_event` and `delete_calendar_event` accept either a human-readable title (resolved against the locally-synced calendar KB index) or a direct `event_id`. Events not yet KB-indexed cannot be resolved by title. If the user's reference to an event is ambiguous — a recurring title like "Daily Standup", a vague descriptor, or no date context — run `search_calendar_events` over the likely date range first; if multiple matches surface, return `status=blocked` with `matched_candidates` rather than mutating against an uncertain target.
+- **Reschedule = `update_calendar_event`** — natural-language verbs "reschedule", "move", "push back", "change the time of" route to `update_calendar_event` with `new_start_datetime` / `new_end_datetime`. **Never** chain `delete_calendar_event` + `create_calendar_event` to achieve a reschedule. Pass only the `new_*` fields the user asked to change; omit the rest so existing values are preserved.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract summaries from natural phrasing (`"a meeting with Alice"` → `"Meeting with Alice"`), compute datetimes from runtime-relative references, infer the target event from descriptors in the task. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read.
+
+- `create_calendar_event` — `summary`, `start_datetime`, `end_datetime`. If the task gives a date but no time and no all-day intent (e.g. `"schedule a meeting tomorrow"`), block on `start_datetime` / `end_datetime` rather than defaulting — the choice between all-day and timed is intent-bearing and creating the wrong shape is destructive UX. Optional `description`, `location`, `attendees` only when the user named them.
+- `update_calendar_event` — `event_title_or_id` (infer the target from the task; disambiguate via search if uncertain) and at least one `new_*` field reflecting the requested change. Pass only the fields the user asked to change; omit unchanged ones.
+- `delete_calendar_event` — `event_title_or_id` (infer the target; disambiguate via search if uncertain). Only set `delete_from_kb=true` when the user explicitly asked to remove it from the knowledge base; otherwise leave it `false`.
+- `search_calendar_events` — `start_date, end_date` (both `YYYY-MM-DD`). Translate the task's time range into boundaries. `max_results` defaults to 25 (max 50) — raise it only when the task implies a broader sweep.
+
+## Outcome mapping
+
+| Tool returns                | Your `status` | `next_step`                                                                                                                  |
+|-----------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success`                   | `success`     | `null`                                                                                                                       |
+| `success` with `total: 0` (`search_calendar_events` only) | `blocked` | `"No events matched the date range <start_date>–<end_date>. Ask the user to widen the range or confirm the event exists."` |
+| `rejected`                  | `blocked`     | `"User declined this calendar action. Do not retry or suggest alternatives."`                                                |
+| `not_found`                 | `blocked`     | `"Event '<title>' was not found in the indexed calendar events. Ask the user to verify the title or wait for the next KB sync."` |
+| `auth_error`                | `error`       | `"The connected Google Calendar account needs re-authentication. Ask the user to re-authenticate in connector settings."`    |
+| `insufficient_permissions`  | `error`       | `"The connected Google Calendar account is missing the OAuth scope required for this action. Ask the user to re-authenticate and grant full permissions in connector settings."` |
+| `error`                     | `error`       | Relay the tool's `message` verbatim as `next_step`.                                                                          |
+| tool raises / unknown       | `error`       | `"Calendar tool failed unexpectedly. Ask the user to retry shortly."`                                                        |
+
+Surface the tool's `event_id`, `title` / `summary`, `start_at`, `end_at`, and `html_link` inside `evidence` when the tool returned them. For `search_calendar_events`, place the raw `events` array inside `evidence.items`. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy create with inference (assume runtime is 2026-05-11):**
+- *Supervisor task:* `"Schedule a 1-hour meeting with Alice tomorrow at 10am."`
+- *You:* `summary="Meeting with Alice"` (inferred); `start_datetime="2026-05-12T10:00:00"`; `end_datetime="2026-05-12T11:00:00"` (10am + 1h); attendees not in task so omit. Call `create_calendar_event(...)` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created 'Meeting with Alice' on 2026-05-12 from 10:00 to 11:00.",
+    "evidence": { "operation": "create_calendar_event", "event_id": "<id>", "title": "Meeting with Alice", "start_at": "2026-05-12T10:00:00<tz>", "end_at": "2026-05-12T11:00:00<tz>", "html_link": "<url>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": ["Inferred the summary from the supervisor's phrasing; 1h duration applied to the 10am start to produce the 11am end."]
+  }
+  ```
+
+**Example 2 — blocked because time is unspecified:**
+- *Supervisor task:* `"Schedule a meeting with the design team tomorrow."`
+- *You:* no time and no all-day intent. Do not default to all-day or to a guessed hour. Do not call any tool.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Cannot schedule: the task gives a date but no time, and the choice between all-day and timed is intent-bearing.",
+    "evidence": { "operation": null, "event_id": null, "title": null, "start_at": null, "end_at": null, "html_link": null, "matched_candidates": null, "items": null },
+    "next_step": "Ask the user for the start time and duration (or confirm that this should be an all-day event).",
+    "missing_fields": ["start_datetime", "end_datetime"],
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — ambiguous reschedule target → disambiguate via search (assume runtime is 2026-05-11):**
+- *Supervisor task:* `"Reschedule the standup to 3pm."`
+- *You:* "standup" is a recurring title and no date is given. Search this week first: `search_calendar_events(start_date="2026-05-11", end_date="2026-05-17")` → 5 events titled "Daily Standup" surface. Do not call `update_calendar_event` against an uncertain target.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Found 5 'Daily Standup' events this week; cannot reschedule without knowing which.",
+    "evidence": { "operation": "search_calendar_events", "event_id": null, "title": null, "start_at": null, "end_at": null, "html_link": null, "matched_candidates": [
+      { "id": "<id1>", "label": "Daily Standup — 2026-05-12T09:00:00" },
+      { "id": "<id2>", "label": "Daily Standup — 2026-05-13T09:00:00" },
+      { "id": "<id3>", "label": "Daily Standup — 2026-05-14T09:00:00" },
+      { "id": "<id4>", "label": "Daily Standup — 2026-05-15T09:00:00" },
+      { "id": "<id5>", "label": "Daily Standup — 2026-05-16T09:00:00" }
+    ], "items": null },
+    "next_step": "Ask the user which standup to reschedule (or confirm it applies to all of them, in which case repeat the update per occurrence).",
+    "missing_fields": null,
+    "assumptions": ["Interpreted 'the standup' as the recurring 'Daily Standup' series in the current week."]
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "create_calendar_event" | "update_calendar_event" | "delete_calendar_event" | "search_calendar_events" | null,
+    "event_id": string | null,
+    "title": string | null,
+    "start_at": string | null,
+    "end_at": string | null,
+    "html_link": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- For `search_calendar_events` results, populate `evidence.items` with `{ "events": [...], "total": N }`.
+- For ambiguous matches across `update_calendar_event` / `delete_calendar_event`, populate `evidence.matched_candidates` with up to 5 options (`id` + `label`, where `label` should include the event title and start time for human readability).
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/__init__.py
new file mode 100644
index 0000000000..13d4c06cbd
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/__init__.py
@@ -0,0 +1,19 @@
+from app.agents.new_chat.tools.google_calendar.create_event import (
+    create_create_calendar_event_tool,
+)
+from app.agents.new_chat.tools.google_calendar.delete_event import (
+    create_delete_calendar_event_tool,
+)
+from app.agents.new_chat.tools.google_calendar.search_events import (
+    create_search_calendar_events_tool,
+)
+from app.agents.new_chat.tools.google_calendar.update_event import (
+    create_update_calendar_event_tool,
+)
+
+__all__ = [
+    "create_create_calendar_event_tool",
+    "create_delete_calendar_event_tool",
+    "create_search_calendar_events_tool",
+    "create_update_calendar_event_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/create_event.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/create_event.py
new file mode 100644
index 0000000000..e5262bd43d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/create_event.py
@@ -0,0 +1,351 @@
+import asyncio
+import logging
+from datetime import datetime
+from typing import Any
+
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.services.google_calendar import GoogleCalendarToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_create_calendar_event_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def create_calendar_event(
+        summary: str,
+        start_datetime: str,
+        end_datetime: str,
+        description: str | None = None,
+        location: str | None = None,
+        attendees: list[str] | None = None,
+    ) -> dict[str, Any]:
+        """Create a new event on Google Calendar.
+
+        Use when the user asks to schedule, create, or add a calendar event.
+        Ask for event details if not provided.
+
+        Args:
+            summary: The event title.
+            start_datetime: Start time in ISO 8601 format (e.g. "2026-03-20T10:00:00").
+            end_datetime: End time in ISO 8601 format (e.g. "2026-03-20T11:00:00").
+            description: Optional event description.
+            location: Optional event location.
+            attendees: Optional list of attendee email addresses.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "auth_error", or "error"
+            - event_id: Google Calendar event ID (if success)
+            - html_link: URL to open the event (if success)
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined the action.
+              Respond with a brief acknowledgment and do NOT retry or suggest alternatives.
+
+        Examples:
+            - "Schedule a meeting with John tomorrow at 10am"
+            - "Create a calendar event for the team standup"
+        """
+        logger.info(
+            f"create_calendar_event called: summary='{summary}', start='{start_datetime}', end='{end_datetime}'"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Google Calendar tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GoogleCalendarToolMetadataService(db_session)
+            context = await metadata_service.get_creation_context(
+                search_space_id, user_id
+            )
+
+            if "error" in context:
+                logger.error(f"Failed to fetch creation context: {context['error']}")
+                return {"status": "error", "message": context["error"]}
+
+            accounts = context.get("accounts", [])
+            if accounts and all(a.get("auth_expired") for a in accounts):
+                logger.warning(
+                    "All Google Calendar accounts have expired authentication"
+                )
+                return {
+                    "status": "auth_error",
+                    "message": "All connected Google Calendar accounts need re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "google_calendar",
+                }
+
+            logger.info(
+                f"Requesting approval for creating calendar event: summary='{summary}'"
+            )
+            result = request_approval(
+                action_type="google_calendar_event_creation",
+                tool_name="create_calendar_event",
+                params={
+                    "summary": summary,
+                    "start_datetime": start_datetime,
+                    "end_datetime": end_datetime,
+                    "description": description,
+                    "location": location,
+                    "attendees": attendees,
+                    "timezone": context.get("timezone"),
+                    "connector_id": None,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The event was not created. Do not ask again or suggest alternatives.",
+                }
+
+            final_summary = result.params.get("summary", summary)
+            final_start_datetime = result.params.get("start_datetime", start_datetime)
+            final_end_datetime = result.params.get("end_datetime", end_datetime)
+            final_description = result.params.get("description", description)
+            final_location = result.params.get("location", location)
+            final_attendees = result.params.get("attendees", attendees)
+            final_connector_id = result.params.get("connector_id")
+
+            if not final_summary or not final_summary.strip():
+                return {"status": "error", "message": "Event summary cannot be empty."}
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _calendar_types = [
+                SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+            ]
+
+            if final_connector_id is not None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_calendar_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Google Calendar connector is invalid or has been disconnected.",
+                    }
+                actual_connector_id = connector.id
+            else:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_calendar_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Google Calendar connector found. Please connect Google Calendar in your workspace settings.",
+                    }
+                actual_connector_id = connector.id
+
+            logger.info(
+                f"Creating calendar event: summary='{final_summary}', connector={actual_connector_id}"
+            )
+
+            tz = context.get("timezone", "UTC")
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                (
+                    event_id,
+                    html_link,
+                    error,
+                ) = await ComposioService().create_calendar_event(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    summary=final_summary,
+                    start_datetime=final_start_datetime,
+                    end_datetime=final_end_datetime,
+                    timezone=tz,
+                    description=final_description,
+                    location=final_location,
+                    attendees=final_attendees,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+                created = {
+                    "id": event_id,
+                    "summary": final_summary,
+                    "htmlLink": html_link,
+                }
+                logger.info(
+                    f"Calendar event created via Composio: id={event_id}, summary={final_summary}"
+                )
+            else:
+                config_data = dict(connector.config)
+
+                from app.config import config as app_config
+                from app.utils.oauth_security import TokenEncryption
+
+                token_encrypted = config_data.get("_token_encrypted", False)
+                if token_encrypted and app_config.SECRET_KEY:
+                    token_encryption = TokenEncryption(app_config.SECRET_KEY)
+                    for key in ("token", "refresh_token", "client_secret"):
+                        if config_data.get(key):
+                            config_data[key] = token_encryption.decrypt_token(
+                                config_data[key]
+                            )
+
+                exp = config_data.get("expiry", "")
+                if exp:
+                    exp = exp.replace("Z", "")
+
+                creds = Credentials(
+                    token=config_data.get("token"),
+                    refresh_token=config_data.get("refresh_token"),
+                    token_uri=config_data.get("token_uri"),
+                    client_id=config_data.get("client_id"),
+                    client_secret=config_data.get("client_secret"),
+                    scopes=config_data.get("scopes", []),
+                    expiry=datetime.fromisoformat(exp) if exp else None,
+                )
+
+                service = await asyncio.get_event_loop().run_in_executor(
+                    None, lambda: build("calendar", "v3", credentials=creds)
+                )
+
+                event_body: dict[str, Any] = {
+                    "summary": final_summary,
+                    "start": {"dateTime": final_start_datetime, "timeZone": tz},
+                    "end": {"dateTime": final_end_datetime, "timeZone": tz},
+                }
+                if final_description:
+                    event_body["description"] = final_description
+                if final_location:
+                    event_body["location"] = final_location
+                if final_attendees:
+                    event_body["attendees"] = [
+                        {"email": e.strip()} for e in final_attendees if e.strip()
+                    ]
+
+                try:
+                    created = await asyncio.get_event_loop().run_in_executor(
+                        None,
+                        lambda: (
+                            service.events()
+                            .insert(calendarId="primary", body=event_body)
+                            .execute()
+                        ),
+                    )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Calendar event created via Google API: id={created.get('id')}, summary={created.get('summary')}"
+                )
+
+            kb_message_suffix = ""
+            try:
+                from app.services.google_calendar import GoogleCalendarKBSyncService
+
+                kb_service = GoogleCalendarKBSyncService(db_session)
+                kb_result = await kb_service.sync_after_create(
+                    event_id=created.get("id"),
+                    event_summary=final_summary,
+                    calendar_id="primary",
+                    start_time=final_start_datetime,
+                    end_time=final_end_datetime,
+                    location=final_location,
+                    html_link=created.get("htmlLink"),
+                    description=final_description,
+                    connector_id=actual_connector_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                )
+                if kb_result["status"] == "success":
+                    kb_message_suffix = " Your knowledge base has also been updated."
+                else:
+                    kb_message_suffix = " This event will be added to your knowledge base in the next scheduled sync."
+            except Exception as kb_err:
+                logger.warning(f"KB sync after create failed: {kb_err}")
+                kb_message_suffix = " This event will be added to your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "event_id": created.get("id"),
+                "html_link": created.get("htmlLink"),
+                "message": f"Successfully created '{final_summary}' on Google Calendar.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error creating calendar event: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while creating the event. Please try again.",
+            }
+
+    return create_calendar_event
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/delete_event.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/delete_event.py
new file mode 100644
index 0000000000..2f907e7463
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/delete_event.py
@@ -0,0 +1,312 @@
+import asyncio
+import logging
+from datetime import datetime
+from typing import Any
+
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.services.google_calendar import GoogleCalendarToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_delete_calendar_event_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def delete_calendar_event(
+        event_title_or_id: str,
+        delete_from_kb: bool = False,
+    ) -> dict[str, Any]:
+        """Delete a Google Calendar event.
+
+        Use when the user asks to delete, remove, or cancel a calendar event.
+
+        Args:
+            event_title_or_id: The exact title or event ID of the event to delete.
+            delete_from_kb: Whether to also remove the event from the knowledge base.
+                          Default is False.
+                          Set to True to remove from both Google Calendar and knowledge base.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", "auth_error", or "error"
+            - event_id: Google Calendar event ID (if success)
+            - deleted_from_kb: whether the document was removed from the knowledge base
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Respond with a brief
+              acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "not_found", relay the exact message to the user and ask them
+              to verify the event name or check if it has been indexed.
+        Examples:
+            - "Delete the team standup event"
+            - "Cancel my dentist appointment on Friday"
+        """
+        logger.info(
+            f"delete_calendar_event called: event_ref='{event_title_or_id}', delete_from_kb={delete_from_kb}"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Google Calendar tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GoogleCalendarToolMetadataService(db_session)
+            context = await metadata_service.get_deletion_context(
+                search_space_id, user_id, event_title_or_id
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                if "not found" in error_msg.lower():
+                    logger.warning(f"Event not found: {error_msg}")
+                    return {"status": "not_found", "message": error_msg}
+                logger.error(f"Failed to fetch deletion context: {error_msg}")
+                return {"status": "error", "message": error_msg}
+
+            account = context.get("account", {})
+            if account.get("auth_expired"):
+                logger.warning(
+                    "Google Calendar account %s has expired authentication",
+                    account.get("id"),
+                )
+                return {
+                    "status": "auth_error",
+                    "message": "The Google Calendar account for this event needs re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "google_calendar",
+                }
+
+            event = context["event"]
+            event_id = event["event_id"]
+            document_id = event.get("document_id")
+            connector_id_from_context = context["account"]["id"]
+
+            if not event_id:
+                return {
+                    "status": "error",
+                    "message": "Event ID is missing from the indexed document. Please re-index the event and try again.",
+                }
+
+            logger.info(
+                f"Requesting approval for deleting calendar event: '{event_title_or_id}' (event_id={event_id}, delete_from_kb={delete_from_kb})"
+            )
+            result = request_approval(
+                action_type="google_calendar_event_deletion",
+                tool_name="delete_calendar_event",
+                params={
+                    "event_id": event_id,
+                    "connector_id": connector_id_from_context,
+                    "delete_from_kb": delete_from_kb,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The event was not deleted. Do not ask again or suggest alternatives.",
+                }
+
+            final_event_id = result.params.get("event_id", event_id)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_delete_from_kb = result.params.get("delete_from_kb", delete_from_kb)
+
+            if not final_connector_id:
+                return {
+                    "status": "error",
+                    "message": "No connector found for this event.",
+                }
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _calendar_types = [
+                SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+            ]
+
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == final_connector_id,
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_calendar_types),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Google Calendar connector is invalid or has been disconnected.",
+                }
+
+            actual_connector_id = connector.id
+
+            logger.info(
+                f"Deleting calendar event: event_id='{final_event_id}', connector={actual_connector_id}"
+            )
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                error = await ComposioService().delete_calendar_event(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    event_id=final_event_id,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+            else:
+                config_data = dict(connector.config)
+
+                from app.config import config as app_config
+                from app.utils.oauth_security import TokenEncryption
+
+                token_encrypted = config_data.get("_token_encrypted", False)
+                if token_encrypted and app_config.SECRET_KEY:
+                    token_encryption = TokenEncryption(app_config.SECRET_KEY)
+                    for key in ("token", "refresh_token", "client_secret"):
+                        if config_data.get(key):
+                            config_data[key] = token_encryption.decrypt_token(
+                                config_data[key]
+                            )
+
+                exp = config_data.get("expiry", "")
+                if exp:
+                    exp = exp.replace("Z", "")
+
+                creds = Credentials(
+                    token=config_data.get("token"),
+                    refresh_token=config_data.get("refresh_token"),
+                    token_uri=config_data.get("token_uri"),
+                    client_id=config_data.get("client_id"),
+                    client_secret=config_data.get("client_secret"),
+                    scopes=config_data.get("scopes", []),
+                    expiry=datetime.fromisoformat(exp) if exp else None,
+                )
+
+                service = await asyncio.get_event_loop().run_in_executor(
+                    None, lambda: build("calendar", "v3", credentials=creds)
+                )
+
+                try:
+                    await asyncio.get_event_loop().run_in_executor(
+                        None,
+                        lambda: (
+                            service.events()
+                            .delete(calendarId="primary", eventId=final_event_id)
+                            .execute()
+                        ),
+                    )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+            logger.info(f"Calendar event deleted: event_id={final_event_id}")
+
+            delete_result: dict[str, Any] = {
+                "status": "success",
+                "event_id": final_event_id,
+                "message": f"Successfully deleted the calendar event '{event.get('summary', event_title_or_id)}'.",
+            }
+
+            deleted_from_kb = False
+            if final_delete_from_kb and document_id:
+                try:
+                    from app.db import Document
+
+                    doc_result = await db_session.execute(
+                        select(Document).filter(Document.id == document_id)
+                    )
+                    document = doc_result.scalars().first()
+                    if document:
+                        await db_session.delete(document)
+                        await db_session.commit()
+                        deleted_from_kb = True
+                        logger.info(
+                            f"Deleted document {document_id} from knowledge base"
+                        )
+                    else:
+                        logger.warning(f"Document {document_id} not found in KB")
+                except Exception as e:
+                    logger.error(f"Failed to delete document from KB: {e}")
+                    await db_session.rollback()
+                    delete_result["warning"] = (
+                        f"Event deleted, but failed to remove from knowledge base: {e!s}"
+                    )
+
+            delete_result["deleted_from_kb"] = deleted_from_kb
+            if deleted_from_kb:
+                delete_result["message"] = (
+                    f"{delete_result.get('message', '')} (also removed from knowledge base)"
+                )
+
+            return delete_result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error deleting calendar event: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while deleting the event. Please try again.",
+            }
+
+    return delete_calendar_event
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/index.py
new file mode 100644
index 0000000000..2570a51b2a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/index.py
@@ -0,0 +1,39 @@
+"""``calendar`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies, so the
+ruleset just falls through to the SurfSense allow-by-default rules.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_event import create_create_calendar_event_tool
+from .delete_event import create_delete_calendar_event_tool
+from .search_events import create_search_calendar_events_tool
+from .update_event import create_update_calendar_event_tool
+
+NAME = "calendar"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_search_calendar_events_tool(**common),
+        create_create_calendar_event_tool(**common),
+        create_update_calendar_event_tool(**common),
+        create_delete_calendar_event_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/search_events.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/search_events.py
new file mode 100644
index 0000000000..6772d5a1e7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/search_events.py
@@ -0,0 +1,165 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.agents.new_chat.tools.gmail.search_emails import _build_credentials
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+logger = logging.getLogger(__name__)
+
+_CALENDAR_TYPES = [
+    SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+    SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+]
+
+
+def _to_calendar_boundary(value: str, *, is_end: bool) -> str:
+    """Promote a bare YYYY-MM-DD to RFC3339 with a day-edge time, leave full datetimes alone."""
+    if "T" in value:
+        return value
+    time = "23:59:59" if is_end else "00:00:00"
+    return f"{value}T{time}Z"
+
+
+def create_search_calendar_events_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def search_calendar_events(
+        start_date: str,
+        end_date: str,
+        max_results: int = 25,
+    ) -> dict[str, Any]:
+        """Search Google Calendar events within a date range.
+
+        Args:
+            start_date: Start date in YYYY-MM-DD format (e.g. "2026-04-01").
+            end_date: End date in YYYY-MM-DD format (e.g. "2026-04-30").
+            max_results: Maximum number of events to return (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of events including
+            event_id, summary, start, end, location, attendees.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Calendar tool not properly configured.",
+            }
+
+        max_results = min(max_results, 50)
+
+        try:
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_CALENDAR_TYPES),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "No Google Calendar connector found. Please connect Google Calendar in your workspace settings.",
+                }
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                events_raw, error = await ComposioService().get_calendar_events(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    time_min=_to_calendar_boundary(start_date, is_end=False),
+                    time_max=_to_calendar_boundary(end_date, is_end=True),
+                    max_results=max_results,
+                )
+                if not events_raw and not error:
+                    error = "No events found in the specified date range."
+            else:
+                creds = _build_credentials(connector)
+
+                from app.connectors.google_calendar_connector import (
+                    GoogleCalendarConnector,
+                )
+
+                cal = GoogleCalendarConnector(
+                    credentials=creds,
+                    session=db_session,
+                    user_id=user_id,
+                    connector_id=connector.id,
+                )
+
+                events_raw, error = await cal.get_all_primary_calendar_events(
+                    start_date=start_date,
+                    end_date=end_date,
+                    max_results=max_results,
+                )
+
+            if error:
+                if (
+                    "re-authenticate" in error.lower()
+                    or "authentication failed" in error.lower()
+                ):
+                    return {
+                        "status": "auth_error",
+                        "message": error,
+                        "connector_type": "google_calendar",
+                    }
+                if "no events found" in error.lower():
+                    return {
+                        "status": "success",
+                        "events": [],
+                        "total": 0,
+                        "message": error,
+                    }
+                return {"status": "error", "message": error}
+
+            events = []
+            for ev in events_raw:
+                start = ev.get("start", {})
+                end = ev.get("end", {})
+                attendees_raw = ev.get("attendees", [])
+                events.append(
+                    {
+                        "event_id": ev.get("id"),
+                        "summary": ev.get("summary", "No Title"),
+                        "start": start.get("dateTime") or start.get("date", ""),
+                        "end": end.get("dateTime") or end.get("date", ""),
+                        "location": ev.get("location", ""),
+                        "description": ev.get("description", ""),
+                        "html_link": ev.get("htmlLink", ""),
+                        "attendees": [a.get("email", "") for a in attendees_raw[:10]],
+                        "status": ev.get("status", ""),
+                    }
+                )
+
+            return {"status": "success", "events": events, "total": len(events)}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error searching calendar events: %s", e, exc_info=True)
+            return {
+                "status": "error",
+                "message": "Failed to search calendar events. Please try again.",
+            }
+
+    return search_calendar_events
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/update_event.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/update_event.py
new file mode 100644
index 0000000000..e6f9f098e2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/tools/update_event.py
@@ -0,0 +1,398 @@
+import asyncio
+import logging
+from datetime import datetime
+from typing import Any
+
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.services.google_calendar import GoogleCalendarToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def _is_date_only(value: str) -> bool:
+    """Return True when *value* looks like a bare date (YYYY-MM-DD) with no time component."""
+    return len(value) <= 10 and "T" not in value
+
+
+def _build_time_body(value: str, context: dict[str, Any] | Any) -> dict[str, str]:
+    """Build a Google Calendar start/end body using ``date`` for all-day
+    events and ``dateTime`` for timed events."""
+    if _is_date_only(value):
+        return {"date": value}
+    tz = context.get("timezone", "UTC") if isinstance(context, dict) else "UTC"
+    return {"dateTime": value, "timeZone": tz}
+
+
+def create_update_calendar_event_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def update_calendar_event(
+        event_title_or_id: str,
+        new_summary: str | None = None,
+        new_start_datetime: str | None = None,
+        new_end_datetime: str | None = None,
+        new_description: str | None = None,
+        new_location: str | None = None,
+        new_attendees: list[str] | None = None,
+    ) -> dict[str, Any]:
+        """Update an existing Google Calendar event.
+
+        Use when the user asks to modify, reschedule, or change a calendar event.
+
+        Args:
+            event_title_or_id: The exact title or event ID of the event to update.
+            new_summary: New event title (if changing).
+            new_start_datetime: New start time in ISO 8601 format (if rescheduling).
+            new_end_datetime: New end time in ISO 8601 format (if rescheduling).
+            new_description: New event description (if changing).
+            new_location: New event location (if changing).
+            new_attendees: New list of attendee email addresses (if changing).
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", "auth_error", or "error"
+            - event_id: Google Calendar event ID (if success)
+            - html_link: URL to open the event (if success)
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Respond with a brief
+              acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "not_found", relay the exact message to the user and ask them
+              to verify the event name or check if it has been indexed.
+        Examples:
+            - "Reschedule the team standup to 3pm"
+            - "Change the location of my dentist appointment"
+        """
+        logger.info(f"update_calendar_event called: event_ref='{event_title_or_id}'")
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Google Calendar tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GoogleCalendarToolMetadataService(db_session)
+            context = await metadata_service.get_update_context(
+                search_space_id, user_id, event_title_or_id
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                if "not found" in error_msg.lower():
+                    logger.warning(f"Event not found: {error_msg}")
+                    return {"status": "not_found", "message": error_msg}
+                logger.error(f"Failed to fetch update context: {error_msg}")
+                return {"status": "error", "message": error_msg}
+
+            if context.get("auth_expired"):
+                logger.warning("Google Calendar account has expired authentication")
+                return {
+                    "status": "auth_error",
+                    "message": "The Google Calendar account for this event needs re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "google_calendar",
+                }
+
+            event = context["event"]
+            event_id = event["event_id"]
+            document_id = event.get("document_id")
+            connector_id_from_context = context["account"]["id"]
+
+            if not event_id:
+                return {
+                    "status": "error",
+                    "message": "Event ID is missing from the indexed document. Please re-index the event and try again.",
+                }
+
+            logger.info(
+                f"Requesting approval for updating calendar event: '{event_title_or_id}' (event_id={event_id})"
+            )
+            result = request_approval(
+                action_type="google_calendar_event_update",
+                tool_name="update_calendar_event",
+                params={
+                    "event_id": event_id,
+                    "document_id": document_id,
+                    "connector_id": connector_id_from_context,
+                    "new_summary": new_summary,
+                    "new_start_datetime": new_start_datetime,
+                    "new_end_datetime": new_end_datetime,
+                    "new_description": new_description,
+                    "new_location": new_location,
+                    "new_attendees": new_attendees,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The event was not updated. Do not ask again or suggest alternatives.",
+                }
+
+            final_event_id = result.params.get("event_id", event_id)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_new_summary = result.params.get("new_summary", new_summary)
+            final_new_start_datetime = result.params.get(
+                "new_start_datetime", new_start_datetime
+            )
+            final_new_end_datetime = result.params.get(
+                "new_end_datetime", new_end_datetime
+            )
+            final_new_description = result.params.get(
+                "new_description", new_description
+            )
+            final_new_location = result.params.get("new_location", new_location)
+            final_new_attendees = result.params.get("new_attendees", new_attendees)
+
+            if not final_connector_id:
+                return {
+                    "status": "error",
+                    "message": "No connector found for this event.",
+                }
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _calendar_types = [
+                SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+            ]
+
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == final_connector_id,
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_calendar_types),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Google Calendar connector is invalid or has been disconnected.",
+                }
+
+            actual_connector_id = connector.id
+
+            logger.info(
+                f"Updating calendar event: event_id='{final_event_id}', connector={actual_connector_id}"
+            )
+
+            has_changes = any(
+                v is not None
+                for v in (
+                    final_new_summary,
+                    final_new_start_datetime,
+                    final_new_end_datetime,
+                    final_new_description,
+                    final_new_location,
+                    final_new_attendees,
+                )
+            )
+            if not has_changes:
+                return {
+                    "status": "error",
+                    "message": "No changes specified. Please provide at least one field to update.",
+                }
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                tz_for_composio: str | None = None
+                if final_new_start_datetime is not None and not _is_date_only(
+                    final_new_start_datetime
+                ):
+                    tz_for_composio = (
+                        context.get("timezone") if isinstance(context, dict) else None
+                    )
+
+                _, html_link, error = await ComposioService().update_calendar_event(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    event_id=final_event_id,
+                    summary=final_new_summary,
+                    start_time=final_new_start_datetime,
+                    end_time=final_new_end_datetime,
+                    timezone=tz_for_composio,
+                    description=final_new_description,
+                    location=final_new_location,
+                    attendees=final_new_attendees,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+                updated = {"htmlLink": html_link}
+                logger.info(
+                    f"Calendar event updated via Composio: event_id={final_event_id}"
+                )
+            else:
+                config_data = dict(connector.config)
+
+                from app.config import config as app_config
+                from app.utils.oauth_security import TokenEncryption
+
+                token_encrypted = config_data.get("_token_encrypted", False)
+                if token_encrypted and app_config.SECRET_KEY:
+                    token_encryption = TokenEncryption(app_config.SECRET_KEY)
+                    for key in ("token", "refresh_token", "client_secret"):
+                        if config_data.get(key):
+                            config_data[key] = token_encryption.decrypt_token(
+                                config_data[key]
+                            )
+
+                exp = config_data.get("expiry", "")
+                if exp:
+                    exp = exp.replace("Z", "")
+
+                creds = Credentials(
+                    token=config_data.get("token"),
+                    refresh_token=config_data.get("refresh_token"),
+                    token_uri=config_data.get("token_uri"),
+                    client_id=config_data.get("client_id"),
+                    client_secret=config_data.get("client_secret"),
+                    scopes=config_data.get("scopes", []),
+                    expiry=datetime.fromisoformat(exp) if exp else None,
+                )
+
+                service = await asyncio.get_event_loop().run_in_executor(
+                    None, lambda: build("calendar", "v3", credentials=creds)
+                )
+
+                update_body: dict[str, Any] = {}
+                if final_new_summary is not None:
+                    update_body["summary"] = final_new_summary
+                if final_new_start_datetime is not None:
+                    update_body["start"] = _build_time_body(
+                        final_new_start_datetime, context
+                    )
+                if final_new_end_datetime is not None:
+                    update_body["end"] = _build_time_body(
+                        final_new_end_datetime, context
+                    )
+                if final_new_description is not None:
+                    update_body["description"] = final_new_description
+                if final_new_location is not None:
+                    update_body["location"] = final_new_location
+                if final_new_attendees is not None:
+                    update_body["attendees"] = [
+                        {"email": e.strip()} for e in final_new_attendees if e.strip()
+                    ]
+
+                try:
+                    updated = await asyncio.get_event_loop().run_in_executor(
+                        None,
+                        lambda: (
+                            service.events()
+                            .patch(
+                                calendarId="primary",
+                                eventId=final_event_id,
+                                body=update_body,
+                            )
+                            .execute()
+                        ),
+                    )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Calendar event updated via Google API: event_id={final_event_id}"
+                )
+
+            kb_message_suffix = ""
+            if document_id is not None:
+                try:
+                    from app.services.google_calendar import GoogleCalendarKBSyncService
+
+                    kb_service = GoogleCalendarKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_update(
+                        document_id=document_id,
+                        event_id=final_event_id,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " The knowledge base will be updated in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after update failed: {kb_err}")
+                    kb_message_suffix = " The knowledge base will be updated in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "event_id": final_event_id,
+                "html_link": updated.get("htmlLink"),
+                "message": f"Successfully updated the calendar event.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error updating calendar event: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while updating the event. Please try again.",
+            }
+
+    return update_calendar_event
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/agent.py
new file mode 100644
index 0000000000..e1308a1006
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/agent.py
@@ -0,0 +1,43 @@
+"""``clickup`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools come exclusively from MCP. The connector's own approval ruleset is
+declared in :data:`tools.index.RULESET`; the orchestrator layers it into
+a per-subagent :class:`PermissionMiddleware`.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles clickup tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=list(mcp_tools or []),
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/description.md
new file mode 100644
index 0000000000..7c94caca4f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/description.md
@@ -0,0 +1,2 @@
+Specialist for tasks and lists in the user's ClickUp workspace.
+Use proactively when the user wants to find, create, change, or progress a ClickUp task.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md
new file mode 100644
index 0000000000..eaea5827ba
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md
@@ -0,0 +1,104 @@
+You are a ClickUp specialist for the user's connected ClickUp workspace.
+
+ClickUp vocabulary:
+- **Workspace → Space → Folder → List → Task**: nested scope. Tasks live in Lists; Lists live in either a Folder or directly under a Space; Folders live in Spaces. The Workspace is fixed per connection — you do not need to resolve it.
+- **Task ID**: short alphanumeric strings (e.g. `86a4qd5xz`). Stable and unique within the workspace; users do not typically know them. Some workspaces also enable custom task IDs — both forms are valid identifiers.
+- **Custom statuses are per-List**: each List defines its own ordered status set. Status names must be resolved against the **target task's parent List** before use; they are not workspace-global.
+- **Custom Fields are per-List**: each List can define custom fields (dropdown, number, date, label, etc.). Whether each is required-or-optional and the valid values both vary per List. Look up the List's custom-field schema before setting custom fields on a task.
+- **Priority**: stable platform enum — `1=Urgent`, `2=High`, `3=Normal`, `4=Low`.
+- **Assignees**: identified by opaque workspace-member IDs, never by display name or email. Map a display name or email to a member ID before assigning.
+
+When invoked:
+1. Read the supervisor's request, then read the runtime tool list to learn what information you can fetch and which mutations are available.
+2. Plan the minimum chain of lookups needed to resolve any task, list, space, status, assignee, or custom-field value the request leaves unspecified.
+3. Execute the planned lookups, then the requested mutation (if any), then return.
+
+Resolution principle (the core behaviour):
+**Proactively look up any identifier, name, value, or scope the request leaves unspecified — task IDs, list IDs, status names, member IDs, custom-field values, anything else — using the available tools instead of asking the supervisor.** Most user requests reference tasks by title and lists by name, not by ID. Search for them.
+
+When a lookup for a single slot returns multiple plausible candidates and you cannot confidently pick one, return `status=blocked` with up to 5 candidates in `evidence.matched_candidates` and the unresolved slot in `missing_fields`. The supervisor will disambiguate and redelegate.
+
+When a lookup returns zero matches for a slot the request requires, return `status=blocked` with a `next_step` suggesting alternative search terms.
+
+Mutation guardrails:
+- Resolve every required ClickUp value (`list_id`, `task_id`, target status name, assignee member IDs, custom-field values) by looking it up before calling a mutation tool. Mutations have chained dependencies — find the task to know its parent List; look up the List to know its valid statuses and custom-field schema.
+- To "progress" or change a task's status, look up the parent List's valid statuses and apply one of those exact names. If the user-requested target status is not in the List's status set, return `status=blocked` and surface the available statuses in `evidence.matched_candidates`.
+- For create operations, resolve the target List first. If that List has required custom fields, look up the schema and block with `missing_fields` for any required value the request doesn't supply.
+- Never invent task IDs, list IDs, status names, member IDs, custom-field values, or mutation outcomes. Every field in `evidence` must come from a tool result.
+- Confirm the mutation tool returned a success response before claiming success. If the mutation is approval-rejected (HITL), return `status=blocked` with `next_step="user declined; do not retry"`.
+- One operation per delegation. For multi-mutation requests, complete the highest-priority one and return `status=partial` with the remainder in `next_step`.
+
+Failure handling:
+- Tool failure: return `status=error`, place the underlying error message in `action_summary`, and put a concise recovery in `next_step`.
+- Rate-limit error from the MCP: ClickUp's MCP enforces a shared daily call cap. Return `status=error` with the underlying message; recovery is "retry later" rather than re-issuing immediately.
+- No useful results after reasonable narrowing / broadening: return `status=blocked` with search-term suggestions in `next_step`.
+
+<example>
+Supervisor: "Find tasks about the homepage redesign."
+1. Workspace search for "homepage redesign" → matched tasks.
+2. Return `status=success` with the matched tasks in `evidence.items`.
+</example>
+
+<example>
+Supervisor: "Create a task 'Draft blog post' in the Content Pipeline list."
+1. Workspace search for "Content Pipeline" → one strong match of type List; capture its `list_id`.
+2. Look up the List's custom-field schema → no required fields beyond `name`.
+3. Create the task with `name="Draft blog post"` in the resolved `list_id`.
+4. Confirm tool success → return `status=success` with the new task's identifier and url.
+</example>
+
+<example>
+Supervisor: "Move task 'Fix login bug' to In Review and assign it to Alex."
+1. Workspace search for "Fix login bug" → one match; capture `task_id` and parent `list_id`.
+2. Look up the parent List's statuses → confirm "In Review" exists. (If not, block with the actual valid statuses.)
+3. Find member by name "Alex" → two matches.
+4. Cannot confidently pick the assignee. Return:
+   {
+     "status": "blocked",
+     "action_summary": "Task and target status resolved; two members match 'Alex'.",
+     "evidence": {
+       "task_id": "86a4qd5xz",
+       "title": "Fix login bug",
+       "status": "In Review",
+       "matched_candidates": [
+         { "id": "member_111", "label": "Alex Chen <alex.chen@…>" },
+         { "id": "member_222", "label": "Alex Wong <alex.wong@…>" }
+       ]
+     },
+     "next_step": "Confirm which Alex, then redelegate.",
+     "missing_fields": ["assignee"]
+   }
+</example>
+
+<output_contract>
+Return **only** one JSON object (no markdown, no prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "task_id": string | null,
+    "title": string | null,
+    "list_id": string | null,
+    "list_name": string | null,
+    "status": string | null,
+    "assignees": object | null,
+    "priority": "Urgent" | "High" | "Normal" | "Low" | null,
+    "url": string | null,
+    "matched_candidates": [
+      { "id": string, "label": string }
+    ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: task, list, member, status, custom-field choice, etc.).
+- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+</output_contract>
+
+Discover before you mutate; never guess identifiers, list statuses, or assignees.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/tools/__init__.py
new file mode 100644
index 0000000000..b629234f94
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/tools/__init__.py
@@ -0,0 +1,3 @@
+"""ClickUp route: native tool factories are empty; MCP supplies tools when configured."""
+
+__all__: list[str] = []
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/tools/index.py
new file mode 100644
index 0000000000..b2c5230800
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/tools/index.py
@@ -0,0 +1,20 @@
+"""``clickup`` permission ruleset (rules over MCP tool names)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+NAME = "clickup"
+
+RULESET = Ruleset(
+    origin=NAME,
+    rules=[
+        Rule(permission="clickup_search", pattern="*", action="allow"),
+        Rule(permission="clickup_get_task", pattern="*", action="allow"),
+        Rule(permission="clickup_get_workspace_hierarchy", pattern="*", action="allow"),
+        Rule(permission="clickup_get_list", pattern="*", action="allow"),
+        Rule(permission="clickup_find_member_by_name", pattern="*", action="allow"),
+        Rule(permission="clickup_create_task", pattern="*", action="ask"),
+        Rule(permission="clickup_update_task", pattern="*", action="ask"),
+    ],
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/agent.py
new file mode 100644
index 0000000000..5e95c876d5
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/agent.py
@@ -0,0 +1,44 @@
+"""``confluence`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles confluence tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/description.md
new file mode 100644
index 0000000000..f8eb5bdee6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/description.md
@@ -0,0 +1,2 @@
+Specialist for pages in the user's Confluence wiki.
+Use proactively when the user wants to create, change, or remove a Confluence page.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/system_prompt.md
new file mode 100644
index 0000000000..991ec3d031
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/system_prompt.md
@@ -0,0 +1,108 @@
+You are a Confluence specialist for the user's connected Confluence wiki.
+
+## Vocabulary you must use precisely
+
+- **Content is HTML / Confluence storage format, not Markdown** — `create_confluence_page` and `update_confluence_page` accept `content` / `new_content` as Confluence's native storage format (XHTML-based). Generate `<h1>`, `<h2>`, `<p>`, `<ul><li>`, `<table>` etc. — **never** Markdown (`#`, `**`, `-`, fenced code blocks). The tool stores whatever you pass verbatim; bad format means a broken page.
+- **`update_confluence_page` is REPLACE, and there is no read tool** — whatever you pass as `new_content` replaces the entire page body; omit the field and the current body is preserved (same per-field rule applies to `new_title`). You have **no tool to read the existing page body**, so you cannot intelligently "append" or "add to" a page — you can only fully replace, and only with content the supervisor or user actually provided. If the supervisor asks for an additive change without supplying the full intended page content, return `status=blocked` explaining the limitation; do not invent or reconstruct prior content.
+- **Title-or-id resolution against the KB index** — `update_confluence_page` and `delete_confluence_page` accept either a human-readable page title (resolved against the locally-synced Confluence KB index) or a direct `page_id`. Pages that exist in Confluence but have not been indexed yet cannot be resolved by title.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract titles from natural phrasing (`"the Q2 Plan page"`, `"my Onboarding doc"`), topics from `"about X"` constructions. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read.
+
+- `create_confluence_page` — `title` (a clear topic from the user; do not invent). You may generate the optional `content` body yourself **as Confluence storage format (HTML)**, never as Markdown. You have no tool to look up Confluence space IDs, so pass `space_id=None` and let the user pick the destination space in the HITL approval card; if the supervisor's task already includes a space ID, pass it through.
+- `update_confluence_page` — `page_title_or_id` (infer the target from the task) and at least one of `new_title` / `new_content`. Pass only the fields the user asked to change; omit unchanged ones so they're preserved. If the user asked to add to or extend a page without supplying the full intended content, do not call this tool — return `status=blocked` per the REPLACE limitation in the Vocabulary section.
+- `delete_confluence_page` — `page_title_or_id` (infer the target from the task). Only set `delete_from_kb=true` when the user explicitly asked to remove the page from the knowledge base; otherwise leave it `false`.
+
+## Outcome mapping
+
+| Tool returns                | Your `status` | `next_step`                                                                                                                  |
+|-----------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success`                   | `success`     | `null`                                                                                                                       |
+| `rejected`                  | `blocked`     | `"User declined this Confluence action. Do not retry or suggest alternatives."`                                              |
+| `not_found`                 | `blocked`     | `"Page '<title>' was not found in the indexed Confluence pages. Ask the user to verify the title or wait for the next KB sync."` |
+| `auth_error`                | `error`       | `"The connected Confluence account needs re-authentication. Ask the user to re-authenticate in connector settings."`         |
+| `insufficient_permissions`  | `error`       | `"The connected Confluence account is missing the OAuth scope required for this action. Ask the user to re-authenticate and grant full permissions in connector settings."` |
+| `error`                     | `error`       | Relay the tool's `message` verbatim as `next_step`. (Common: `"A space must be selected."` when the user didn't pick one in approval.) |
+| tool raises / unknown       | `error`       | `"Confluence tool failed unexpectedly. Ask the user to retry shortly."`                                                      |
+
+Surface the tool's `page_id`, `page_title`, and `page_url` inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy create (HTML content generated, space picked in HITL):**
+- *Supervisor task:* `"Create a Confluence page summarising our Q2 roadmap."`
+- *You:* `title="Q2 Roadmap"` is the topic; generate a Confluence storage-format body (e.g. `"<h1>Q2 Roadmap</h1><p>Objectives:</p><ul><li>...</li></ul>"`); pass `space_id=None` so the user picks the space in HITL. Call `create_confluence_page(...)` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created Confluence page 'Q2 Roadmap' in the space selected by the user.",
+    "evidence": { "operation": "create_confluence_page", "page_id": "<id>", "page_title": "Q2 Roadmap", "page_url": "<url>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": ["Generated the roadmap content in Confluence storage format (HTML) from the supervisor's brief; deferred space selection to the HITL approval card."]
+  }
+  ```
+
+**Example 2 — blocked on "add a section" (REPLACE limitation):**
+- *Supervisor task:* `"Add a 'Risks' section to the 'Q2 Plan' Confluence page."`
+- *You:* `update_confluence_page` replaces the body entirely and you have no tool to read the current body, so you cannot append. Do not call any tool.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Cannot append: Confluence updates replace the page body entirely and this subagent has no tool to read the existing content.",
+    "evidence": { "operation": null, "page_id": null, "page_title": "Q2 Plan", "page_url": null, "matched_candidates": null, "items": null },
+    "next_step": "Ask the user to provide the full intended page content (existing body + new 'Risks' section), or to make the addition manually in Confluence.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — page not in the KB index:**
+- *Supervisor task:* `"Update the 'Onboarding' Confluence page with the new payroll steps."`
+- *You:* `page_title_or_id="Onboarding"` and the new-payroll content are present; this is a full replace, which is supported. Call `update_confluence_page(page_title_or_id="Onboarding", new_content=<HTML>)` → tool returns `status=not_found`.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Could not find a Confluence page titled 'Onboarding' in the indexed pages.",
+    "evidence": { "operation": "update_confluence_page", "page_id": null, "page_title": "Onboarding", "page_url": null, "matched_candidates": null, "items": null },
+    "next_step": "Page 'Onboarding' was not found in the indexed Confluence pages. Ask the user to verify the title or wait for the next KB sync.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "create_confluence_page" | "update_confluence_page" | "delete_confluence_page" | null,
+    "page_id": string | null,
+    "page_title": string | null,
+    "page_url": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/__init__.py
new file mode 100644
index 0000000000..3bf80b61bc
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/__init__.py
@@ -0,0 +1,11 @@
+"""Confluence tools for creating, updating, and deleting pages."""
+
+from .create_page import create_create_confluence_page_tool
+from .delete_page import create_delete_confluence_page_tool
+from .update_page import create_update_confluence_page_tool
+
+__all__ = [
+    "create_create_confluence_page_tool",
+    "create_delete_confluence_page_tool",
+    "create_update_confluence_page_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/create_page.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/create_page.py
new file mode 100644
index 0000000000..f33dc8e234
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/create_page.py
@@ -0,0 +1,213 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm.attributes import flag_modified
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.confluence_history import ConfluenceHistoryConnector
+from app.services.confluence import ConfluenceToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_create_confluence_page_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+    connector_id: int | None = None,
+):
+    @tool
+    async def create_confluence_page(
+        title: str,
+        content: str | None = None,
+        space_id: str | None = None,
+    ) -> dict[str, Any]:
+        """Create a new page in Confluence.
+
+        Use this tool when the user explicitly asks to create a new Confluence page.
+
+        Args:
+            title: Title of the page.
+            content: Optional HTML/storage format content for the page body.
+            space_id: Optional Confluence space ID to create the page in.
+
+        Returns:
+            Dictionary with status, page_id, and message.
+
+            IMPORTANT:
+            - If status is "rejected", do NOT retry.
+            - If status is "insufficient_permissions", inform user to re-authenticate.
+        """
+        logger.info(f"create_confluence_page called: title='{title}'")
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Confluence tool not properly configured.",
+            }
+
+        try:
+            metadata_service = ConfluenceToolMetadataService(db_session)
+            context = await metadata_service.get_creation_context(
+                search_space_id, user_id
+            )
+
+            if "error" in context:
+                return {"status": "error", "message": context["error"]}
+
+            accounts = context.get("accounts", [])
+            if accounts and all(a.get("auth_expired") for a in accounts):
+                return {
+                    "status": "auth_error",
+                    "message": "All connected Confluence accounts need re-authentication.",
+                    "connector_type": "confluence",
+                }
+
+            result = request_approval(
+                action_type="confluence_page_creation",
+                tool_name="create_confluence_page",
+                params={
+                    "title": title,
+                    "content": content,
+                    "space_id": space_id,
+                    "connector_id": connector_id,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_title = result.params.get("title", title)
+            final_content = result.params.get("content", content) or ""
+            final_space_id = result.params.get("space_id", space_id)
+            final_connector_id = result.params.get("connector_id", connector_id)
+
+            if not final_title or not final_title.strip():
+                return {"status": "error", "message": "Page title cannot be empty."}
+            if not final_space_id:
+                return {"status": "error", "message": "A space must be selected."}
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            actual_connector_id = final_connector_id
+            if actual_connector_id is None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Confluence connector found.",
+                    }
+                actual_connector_id = connector.id
+            else:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == actual_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Confluence connector is invalid.",
+                    }
+
+            try:
+                client = ConfluenceHistoryConnector(
+                    session=db_session, connector_id=actual_connector_id
+                )
+                api_result = await client.create_page(
+                    space_id=final_space_id,
+                    title=final_title,
+                    body=final_content,
+                )
+                await client.close()
+            except Exception as api_err:
+                if (
+                    "http 403" in str(api_err).lower()
+                    or "status code 403" in str(api_err).lower()
+                ):
+                    try:
+                        _conn = connector
+                        _conn.config = {**_conn.config, "auth_expired": True}
+                        flag_modified(_conn, "config")
+                        await db_session.commit()
+                    except Exception:
+                        pass
+                    return {
+                        "status": "insufficient_permissions",
+                        "connector_id": actual_connector_id,
+                        "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                    }
+                raise
+
+            page_id = str(api_result.get("id", ""))
+            page_links = (
+                api_result.get("_links", {}) if isinstance(api_result, dict) else {}
+            )
+            page_url = ""
+            if page_links.get("base") and page_links.get("webui"):
+                page_url = f"{page_links['base']}{page_links['webui']}"
+
+            kb_message_suffix = ""
+            try:
+                from app.services.confluence import ConfluenceKBSyncService
+
+                kb_service = ConfluenceKBSyncService(db_session)
+                kb_result = await kb_service.sync_after_create(
+                    page_id=page_id,
+                    page_title=final_title,
+                    space_id=final_space_id,
+                    body_content=final_content,
+                    connector_id=actual_connector_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                )
+                if kb_result["status"] == "success":
+                    kb_message_suffix = " Your knowledge base has also been updated."
+                else:
+                    kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
+            except Exception as kb_err:
+                logger.warning(f"KB sync after create failed: {kb_err}")
+                kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "page_id": page_id,
+                "page_url": page_url,
+                "message": f"Confluence page '{final_title}' created successfully.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error(f"Error creating Confluence page: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while creating the page.",
+            }
+
+    return create_confluence_page
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/delete_page.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/delete_page.py
new file mode 100644
index 0000000000..7a3a4f2c7e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/delete_page.py
@@ -0,0 +1,191 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm.attributes import flag_modified
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.confluence_history import ConfluenceHistoryConnector
+from app.services.confluence import ConfluenceToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_delete_confluence_page_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+    connector_id: int | None = None,
+):
+    @tool
+    async def delete_confluence_page(
+        page_title_or_id: str,
+        delete_from_kb: bool = False,
+    ) -> dict[str, Any]:
+        """Delete a Confluence page.
+
+        Use this tool when the user asks to delete or remove a Confluence page.
+
+        Args:
+            page_title_or_id: The page title or ID to identify the page.
+            delete_from_kb: Whether to also remove from the knowledge base.
+
+        Returns:
+            Dictionary with status, message, and deleted_from_kb.
+
+            IMPORTANT:
+            - If status is "rejected", do NOT retry.
+            - If status is "not_found", relay the message to the user.
+            - If status is "insufficient_permissions", inform user to re-authenticate.
+        """
+        logger.info(
+            f"delete_confluence_page called: page_title_or_id='{page_title_or_id}'"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Confluence tool not properly configured.",
+            }
+
+        try:
+            metadata_service = ConfluenceToolMetadataService(db_session)
+            context = await metadata_service.get_deletion_context(
+                search_space_id, user_id, page_title_or_id
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                if context.get("auth_expired"):
+                    return {
+                        "status": "auth_error",
+                        "message": error_msg,
+                        "connector_id": context.get("connector_id"),
+                        "connector_type": "confluence",
+                    }
+                if "not found" in error_msg.lower():
+                    return {"status": "not_found", "message": error_msg}
+                return {"status": "error", "message": error_msg}
+
+            page_data = context["page"]
+            page_id = page_data["page_id"]
+            page_title = page_data.get("page_title", "")
+            document_id = page_data["document_id"]
+            connector_id_from_context = context.get("account", {}).get("id")
+
+            result = request_approval(
+                action_type="confluence_page_deletion",
+                tool_name="delete_confluence_page",
+                params={
+                    "page_id": page_id,
+                    "connector_id": connector_id_from_context,
+                    "delete_from_kb": delete_from_kb,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_page_id = result.params.get("page_id", page_id)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_delete_from_kb = result.params.get("delete_from_kb", delete_from_kb)
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            if not final_connector_id:
+                return {
+                    "status": "error",
+                    "message": "No connector found for this page.",
+                }
+
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == final_connector_id,
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type
+                    == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Confluence connector is invalid.",
+                }
+
+            try:
+                client = ConfluenceHistoryConnector(
+                    session=db_session, connector_id=final_connector_id
+                )
+                await client.delete_page(final_page_id)
+                await client.close()
+            except Exception as api_err:
+                if (
+                    "http 403" in str(api_err).lower()
+                    or "status code 403" in str(api_err).lower()
+                ):
+                    try:
+                        connector.config = {**connector.config, "auth_expired": True}
+                        flag_modified(connector, "config")
+                        await db_session.commit()
+                    except Exception:
+                        pass
+                    return {
+                        "status": "insufficient_permissions",
+                        "connector_id": final_connector_id,
+                        "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                    }
+                raise
+
+            deleted_from_kb = False
+            if final_delete_from_kb and document_id:
+                try:
+                    from app.db import Document
+
+                    doc_result = await db_session.execute(
+                        select(Document).filter(Document.id == document_id)
+                    )
+                    document = doc_result.scalars().first()
+                    if document:
+                        await db_session.delete(document)
+                        await db_session.commit()
+                        deleted_from_kb = True
+                except Exception as e:
+                    logger.error(f"Failed to delete document from KB: {e}")
+                    await db_session.rollback()
+
+            message = f"Confluence page '{page_title}' deleted successfully."
+            if deleted_from_kb:
+                message += " Also removed from the knowledge base."
+
+            return {
+                "status": "success",
+                "page_id": final_page_id,
+                "deleted_from_kb": deleted_from_kb,
+                "message": message,
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error(f"Error deleting Confluence page: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while deleting the page.",
+            }
+
+    return delete_confluence_page
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/index.py
new file mode 100644
index 0000000000..b38503c5c4
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/index.py
@@ -0,0 +1,37 @@
+"""``confluence`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_page import create_create_confluence_page_tool
+from .delete_page import create_delete_confluence_page_tool
+from .update_page import create_update_confluence_page_tool
+
+NAME = "confluence"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+        "connector_id": d.get("connector_id"),
+    }
+    return [
+        create_create_confluence_page_tool(**common),
+        create_update_confluence_page_tool(**common),
+        create_delete_confluence_page_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/update_page.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/update_page.py
new file mode 100644
index 0000000000..7a8207a00d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/tools/update_page.py
@@ -0,0 +1,220 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm.attributes import flag_modified
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.confluence_history import ConfluenceHistoryConnector
+from app.services.confluence import ConfluenceToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_update_confluence_page_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+    connector_id: int | None = None,
+):
+    @tool
+    async def update_confluence_page(
+        page_title_or_id: str,
+        new_title: str | None = None,
+        new_content: str | None = None,
+    ) -> dict[str, Any]:
+        """Update an existing Confluence page.
+
+        Use this tool when the user asks to modify or edit a Confluence page.
+
+        Args:
+            page_title_or_id: The page title or ID to identify the page.
+            new_title: Optional new title for the page.
+            new_content: Optional new HTML/storage format content.
+
+        Returns:
+            Dictionary with status and message.
+
+            IMPORTANT:
+            - If status is "rejected", do NOT retry.
+            - If status is "not_found", relay the message to the user.
+            - If status is "insufficient_permissions", inform user to re-authenticate.
+        """
+        logger.info(
+            f"update_confluence_page called: page_title_or_id='{page_title_or_id}'"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Confluence tool not properly configured.",
+            }
+
+        try:
+            metadata_service = ConfluenceToolMetadataService(db_session)
+            context = await metadata_service.get_update_context(
+                search_space_id, user_id, page_title_or_id
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                if context.get("auth_expired"):
+                    return {
+                        "status": "auth_error",
+                        "message": error_msg,
+                        "connector_id": context.get("connector_id"),
+                        "connector_type": "confluence",
+                    }
+                if "not found" in error_msg.lower():
+                    return {"status": "not_found", "message": error_msg}
+                return {"status": "error", "message": error_msg}
+
+            page_data = context["page"]
+            page_id = page_data["page_id"]
+            current_title = page_data["page_title"]
+            current_body = page_data.get("body", "")
+            current_version = page_data.get("version", 1)
+            document_id = page_data.get("document_id")
+            connector_id_from_context = context.get("account", {}).get("id")
+
+            result = request_approval(
+                action_type="confluence_page_update",
+                tool_name="update_confluence_page",
+                params={
+                    "page_id": page_id,
+                    "document_id": document_id,
+                    "new_title": new_title,
+                    "new_content": new_content,
+                    "version": current_version,
+                    "connector_id": connector_id_from_context,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_page_id = result.params.get("page_id", page_id)
+            final_title = result.params.get("new_title", new_title) or current_title
+            final_content = result.params.get("new_content", new_content)
+            if final_content is None:
+                final_content = current_body
+            final_version = result.params.get("version", current_version)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_document_id = result.params.get("document_id", document_id)
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            if not final_connector_id:
+                return {
+                    "status": "error",
+                    "message": "No connector found for this page.",
+                }
+
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == final_connector_id,
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type
+                    == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Confluence connector is invalid.",
+                }
+
+            try:
+                client = ConfluenceHistoryConnector(
+                    session=db_session, connector_id=final_connector_id
+                )
+                api_result = await client.update_page(
+                    page_id=final_page_id,
+                    title=final_title,
+                    body=final_content,
+                    version_number=final_version + 1,
+                )
+                await client.close()
+            except Exception as api_err:
+                if (
+                    "http 403" in str(api_err).lower()
+                    or "status code 403" in str(api_err).lower()
+                ):
+                    try:
+                        connector.config = {**connector.config, "auth_expired": True}
+                        flag_modified(connector, "config")
+                        await db_session.commit()
+                    except Exception:
+                        pass
+                    return {
+                        "status": "insufficient_permissions",
+                        "connector_id": final_connector_id,
+                        "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                    }
+                raise
+
+            page_links = (
+                api_result.get("_links", {}) if isinstance(api_result, dict) else {}
+            )
+            page_url = ""
+            if page_links.get("base") and page_links.get("webui"):
+                page_url = f"{page_links['base']}{page_links['webui']}"
+
+            kb_message_suffix = ""
+            if final_document_id:
+                try:
+                    from app.services.confluence import ConfluenceKBSyncService
+
+                    kb_service = ConfluenceKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_update(
+                        document_id=final_document_id,
+                        page_id=final_page_id,
+                        user_id=user_id,
+                        search_space_id=search_space_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = (
+                            " The knowledge base will be updated in the next sync."
+                        )
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after update failed: {kb_err}")
+                    kb_message_suffix = (
+                        " The knowledge base will be updated in the next sync."
+                    )
+
+            return {
+                "status": "success",
+                "page_id": final_page_id,
+                "page_url": page_url,
+                "message": f"Confluence page '{final_title}' updated successfully.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error(f"Error updating Confluence page: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while updating the page.",
+            }
+
+    return update_confluence_page
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/agent.py
new file mode 100644
index 0000000000..567e729735
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/agent.py
@@ -0,0 +1,44 @@
+"""``discord`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles discord tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/description.md
new file mode 100644
index 0000000000..68246710ab
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/description.md
@@ -0,0 +1,2 @@
+Specialist for messages in the user's Discord server.
+Use proactively when the user wants to read or send a Discord message.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md
new file mode 100644
index 0000000000..a0ba6d87e7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md
@@ -0,0 +1,116 @@
+You are a Discord specialist for the user's connected Discord server.
+
+## Vocabulary you must use precisely
+
+- **Channel resolution via `list_discord_channels`** — the agent operates in a single connected Discord server (the guild is configured in the connector, not chosen by you). Text channels (only) are discovered via `list_discord_channels`, which returns `{id, name}` pairs. Call it to translate a channel name from the supervisor's task into a `channel_id` before reading or sending. Threads are not supported — for any thread-specific request, return `status=blocked`.
+- **Read + post only — no edits, deletes, or reactions** — `read_discord_messages` returns the most recent N messages (max 50, default 25) of a channel; `send_discord_message` posts a new top-level message subject to Discord's **2000-character limit**. Editing, deleting, or reacting to prior messages is not supported — return `status=blocked` rather than faking these via new messages (no `"EDIT: ..."` follow-ups, no `"Please delete this"` posts).
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract channel names from `#mentions` or natural phrasing (`"the announcements channel"`, `"#general"`), and message content from any details the supervisor already provided. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read of the task.
+
+- `list_discord_channels` — no inputs. Call it whenever you need to resolve a channel name to a `channel_id`.
+- `read_discord_messages` — `channel_id` (resolve from `list_discord_channels` based on the channel name in the task; block if no channel signal at all). Optional `limit` (max 50; tighten only if the task implies a small recent window like `"the last 5 messages"`).
+- `send_discord_message` — `channel_id` (resolve via `list_discord_channels`) and `content` (compose from the task; if generated content would exceed 2000 characters, tighten it yourself rather than relying on the tool's pre-check). Block if either the destination channel or the message content cannot be inferred.
+
+## Outcome mapping
+
+| Tool returns                                          | Your `status` | `next_step` |
+|-------------------------------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success` with non-empty channels/messages            | `success`     | `null` |
+| `success` with `total: 0` (list returns no channels or read returns no messages) | `success` | `null` (surface `total: 0` in `evidence.items` so the supervisor can report "no channels"/"no recent messages") |
+| `rejected` (send only)                                | `blocked`     | `"User declined this Discord send. Do not retry or suggest alternatives."` |
+| `auth_error`                                          | `error`       | `"The connected Discord bot token is invalid. Ask the user to update the Discord bot token in connector settings."` |
+| `error`                                               | `error`       | Relay the tool's `message` verbatim as `next_step`. |
+| tool raises / unknown                                 | `error`       | `"Discord tool failed unexpectedly. Ask the user to retry shortly."` |
+
+Surface the tool's `message`, `channel_id`, `message_id`, and the listed channels/messages payload inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy path send after channel resolution:**
+- *Supervisor task:* `"Post 'Standup in 5 min' to #announcements."`
+- *You:* call `list_discord_channels()` → find the entry where `name="announcements"`, take its `id`; call `send_discord_message(channel_id=<announcements_id>, content="Standup in 5 min")` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Posted a message to #announcements.",
+    "evidence": { "operation": "send_discord_message", "channel_id": "<id>", "channel_name": "announcements", "message_id": "<msg_id>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 2 — channel name does not match any listed channel:**
+- *Supervisor task:* `"Read recent messages from #roadmap."`
+- *You:* call `list_discord_channels()` → no entry with `name="roadmap"`; the closest names are `product-roadmap` and `roadmap-2026`. Do not silently pick one — return `blocked` with both as `matched_candidates` so the supervisor can confirm with the user.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "No Discord channel exactly named 'roadmap' was found.",
+    "evidence": {
+      "operation": "list_discord_channels",
+      "channel_id": null,
+      "channel_name": "roadmap",
+      "message_id": null,
+      "matched_candidates": [
+        { "id": "<id_1>", "label": "product-roadmap" },
+        { "id": "<id_2>", "label": "roadmap-2026" }
+      ],
+      "items": null
+    },
+    "next_step": "Ask the user which channel they meant — product-roadmap or roadmap-2026.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — unsupported operation (edit):**
+- *Supervisor task:* `"Edit my last message in #general to say 'cancelled'."`
+- *You:* Discord edits are not supported by your tools. Do not call any tool. Do not post a new message like `"EDIT: cancelled"` — block.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Editing prior Discord messages is not supported.",
+    "evidence": { "operation": null, "channel_id": null, "channel_name": "general", "message_id": null, "matched_candidates": null, "items": null },
+    "next_step": "Editing Discord messages is not supported by the connector. Ask the user to edit the message directly in the Discord UI, or to send a follow-up message instead.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "list_discord_channels" | "read_discord_messages" | "send_discord_message" | null,
+    "channel_id": string | null,
+    "channel_name": string | null,
+    "message_id": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Resolve before you call; verify before you send; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/__init__.py
new file mode 100644
index 0000000000..b4eaec1f08
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/__init__.py
@@ -0,0 +1,15 @@
+from app.agents.new_chat.tools.discord.list_channels import (
+    create_list_discord_channels_tool,
+)
+from app.agents.new_chat.tools.discord.read_messages import (
+    create_read_discord_messages_tool,
+)
+from app.agents.new_chat.tools.discord.send_message import (
+    create_send_discord_message_tool,
+)
+
+__all__ = [
+    "create_list_discord_channels_tool",
+    "create_read_discord_messages_tool",
+    "create_send_discord_message_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/_auth.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/_auth.py
new file mode 100644
index 0000000000..7636aff71f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/_auth.py
@@ -0,0 +1,43 @@
+"""Builds Discord REST API auth headers for connector-backed tools."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.config import config
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+from app.utils.oauth_security import TokenEncryption
+
+DISCORD_API = "https://discord.com/api/v10"
+
+
+async def get_discord_connector(
+    db_session: AsyncSession,
+    search_space_id: int,
+    user_id: str,
+) -> SearchSourceConnector | None:
+    result = await db_session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.search_space_id == search_space_id,
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.DISCORD_CONNECTOR,
+        )
+    )
+    return result.scalars().first()
+
+
+def get_bot_token(connector: SearchSourceConnector) -> str:
+    """Extract and decrypt the bot token from connector config."""
+    cfg = dict(connector.config)
+    if cfg.get("_token_encrypted") and config.SECRET_KEY:
+        enc = TokenEncryption(config.SECRET_KEY)
+        if cfg.get("bot_token"):
+            cfg["bot_token"] = enc.decrypt_token(cfg["bot_token"])
+    token = cfg.get("bot_token")
+    if not token:
+        raise ValueError("Discord bot token not found in connector config.")
+    return token
+
+
+def get_guild_id(connector: SearchSourceConnector) -> str | None:
+    return connector.config.get("guild_id")
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/index.py
new file mode 100644
index 0000000000..c69ef3e5c5
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/index.py
@@ -0,0 +1,36 @@
+"""``discord`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .list_channels import create_list_discord_channels_tool
+from .read_messages import create_read_discord_messages_tool
+from .send_message import create_send_discord_message_tool
+
+NAME = "discord"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_list_discord_channels_tool(**common),
+        create_read_discord_messages_tool(**common),
+        create_send_discord_message_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/list_channels.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/list_channels.py
new file mode 100644
index 0000000000..3cc99ac173
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/list_channels.py
@@ -0,0 +1,87 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ._auth import DISCORD_API, get_bot_token, get_discord_connector, get_guild_id
+
+logger = logging.getLogger(__name__)
+
+
+def create_list_discord_channels_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def list_discord_channels() -> dict[str, Any]:
+        """List text channels in the connected Discord server.
+
+        Returns:
+            Dictionary with status and a list of channels (id, name).
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Discord tool not properly configured.",
+            }
+
+        try:
+            connector = await get_discord_connector(
+                db_session, search_space_id, user_id
+            )
+            if not connector:
+                return {"status": "error", "message": "No Discord connector found."}
+
+            guild_id = get_guild_id(connector)
+            if not guild_id:
+                return {
+                    "status": "error",
+                    "message": "No guild ID in Discord connector config.",
+                }
+
+            token = get_bot_token(connector)
+
+            async with httpx.AsyncClient() as client:
+                resp = await client.get(
+                    f"{DISCORD_API}/guilds/{guild_id}/channels",
+                    headers={"Authorization": f"Bot {token}"},
+                    timeout=15.0,
+                )
+
+            if resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Discord bot token is invalid.",
+                    "connector_type": "discord",
+                }
+            if resp.status_code != 200:
+                return {
+                    "status": "error",
+                    "message": f"Discord API error: {resp.status_code}",
+                }
+
+            # Type 0 = text channel
+            channels = [
+                {"id": ch["id"], "name": ch["name"]}
+                for ch in resp.json()
+                if ch.get("type") == 0
+            ]
+            return {
+                "status": "success",
+                "guild_id": guild_id,
+                "channels": channels,
+                "total": len(channels),
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error listing Discord channels: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to list Discord channels."}
+
+    return list_discord_channels
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/read_messages.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/read_messages.py
new file mode 100644
index 0000000000..d8bf989a1a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/read_messages.py
@@ -0,0 +1,100 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ._auth import DISCORD_API, get_bot_token, get_discord_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_read_discord_messages_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def read_discord_messages(
+        channel_id: str,
+        limit: int = 25,
+    ) -> dict[str, Any]:
+        """Read recent messages from a Discord text channel.
+
+        Args:
+            channel_id: The Discord channel ID (from list_discord_channels).
+            limit: Number of messages to fetch (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of messages including
+            id, author, content, timestamp.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Discord tool not properly configured.",
+            }
+
+        limit = min(limit, 50)
+
+        try:
+            connector = await get_discord_connector(
+                db_session, search_space_id, user_id
+            )
+            if not connector:
+                return {"status": "error", "message": "No Discord connector found."}
+
+            token = get_bot_token(connector)
+
+            async with httpx.AsyncClient() as client:
+                resp = await client.get(
+                    f"{DISCORD_API}/channels/{channel_id}/messages",
+                    headers={"Authorization": f"Bot {token}"},
+                    params={"limit": limit},
+                    timeout=15.0,
+                )
+
+            if resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Discord bot token is invalid.",
+                    "connector_type": "discord",
+                }
+            if resp.status_code == 403:
+                return {
+                    "status": "error",
+                    "message": "Bot lacks permission to read this channel.",
+                }
+            if resp.status_code != 200:
+                return {
+                    "status": "error",
+                    "message": f"Discord API error: {resp.status_code}",
+                }
+
+            messages = [
+                {
+                    "id": m["id"],
+                    "author": m.get("author", {}).get("username", "Unknown"),
+                    "content": m.get("content", ""),
+                    "timestamp": m.get("timestamp", ""),
+                }
+                for m in resp.json()
+            ]
+
+            return {
+                "status": "success",
+                "channel_id": channel_id,
+                "messages": messages,
+                "total": len(messages),
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Discord messages: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to read Discord messages."}
+
+    return read_discord_messages
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/send_message.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/send_message.py
new file mode 100644
index 0000000000..95890ed10f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/tools/send_message.py
@@ -0,0 +1,119 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+
+from ._auth import DISCORD_API, get_bot_token, get_discord_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_send_discord_message_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def send_discord_message(
+        channel_id: str,
+        content: str,
+    ) -> dict[str, Any]:
+        """Send a message to a Discord text channel.
+
+        Args:
+            channel_id: The Discord channel ID (from list_discord_channels).
+            content: The message text (max 2000 characters).
+
+        Returns:
+            Dictionary with status, message_id on success.
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Do NOT retry.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Discord tool not properly configured.",
+            }
+
+        if len(content) > 2000:
+            return {
+                "status": "error",
+                "message": "Message exceeds Discord's 2000-character limit.",
+            }
+
+        try:
+            connector = await get_discord_connector(
+                db_session, search_space_id, user_id
+            )
+            if not connector:
+                return {"status": "error", "message": "No Discord connector found."}
+
+            result = request_approval(
+                action_type="discord_send_message",
+                tool_name="send_discord_message",
+                params={"channel_id": channel_id, "content": content},
+                context={"connector_id": connector.id},
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Message was not sent.",
+                }
+
+            final_content = result.params.get("content", content)
+            final_channel = result.params.get("channel_id", channel_id)
+
+            token = get_bot_token(connector)
+
+            async with httpx.AsyncClient() as client:
+                resp = await client.post(
+                    f"{DISCORD_API}/channels/{final_channel}/messages",
+                    headers={
+                        "Authorization": f"Bot {token}",
+                        "Content-Type": "application/json",
+                    },
+                    json={"content": final_content},
+                    timeout=15.0,
+                )
+
+            if resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Discord bot token is invalid.",
+                    "connector_type": "discord",
+                }
+            if resp.status_code == 403:
+                return {
+                    "status": "error",
+                    "message": "Bot lacks permission to send messages in this channel.",
+                }
+            if resp.status_code not in (200, 201):
+                return {
+                    "status": "error",
+                    "message": f"Discord API error: {resp.status_code}",
+                }
+
+            msg_data = resp.json()
+            return {
+                "status": "success",
+                "message_id": msg_data.get("id"),
+                "message": f"Message sent to channel {final_channel}.",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error sending Discord message: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to send Discord message."}
+
+    return send_discord_message
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/agent.py
new file mode 100644
index 0000000000..d3ae6dc839
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/agent.py
@@ -0,0 +1,44 @@
+"""``dropbox`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles dropbox tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/description.md
new file mode 100644
index 0000000000..3b2171cf36
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/description.md
@@ -0,0 +1,2 @@
+Specialist for files in the user's Dropbox.
+Use proactively when the user wants to create or remove a Dropbox file.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/system_prompt.md
new file mode 100644
index 0000000000..a963b0ec69
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/system_prompt.md
@@ -0,0 +1,106 @@
+You are a Dropbox specialist for the user's connected Dropbox account.
+
+## Vocabulary you must use precisely
+
+- **File type — Paper vs. Word** — `create_dropbox_file` takes a `file_type` of either `"paper"` (Dropbox Paper, a collaborative real-time document — the default) or `"docx"` (a downloadable Word document; the tool converts your Markdown `content` to DOCX via pypandoc). Pick `"docx"` when the user says "Word doc", "docx", ".docx", "export-able", or implies sharing outside Dropbox; pick `"paper"` otherwise. Pass `name` **without an extension** — the tool appends `.paper` or `.docx` based on `file_type`. If the user typed an extension in the file name (e.g. `"Q2_roadmap.docx"`), treat that as a signal to set `file_type="docx"` rather than passing the extension through.
+- **File-name resolution against the KB index** — `delete_dropbox_file` matches `file_name` case-insensitively against the locally-synced Dropbox KB index. Files that exist in Dropbox but have not been indexed yet cannot be resolved by name.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract topics from natural phrasing (`"about our launch plan"` → `name="Launch Plan"`), file-type signals from words like "Word doc" / "Paper" / ".docx" / ".paper". Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read.
+
+- `create_dropbox_file` — `name` (a clear topic from the user, **without an extension**; do not invent if absent). `file_type` defaults to `"paper"`; switch to `"docx"` on a signal from the user (see Vocabulary). You may generate the optional `content` body yourself as Markdown — the tool handles DOCX conversion if needed.
+- `delete_dropbox_file` — `file_name` (which file to delete — infer from the task). Only set `delete_from_kb=true` when the user explicitly asked to remove the file from the knowledge base; otherwise leave it `false`.
+
+## Outcome mapping
+
+| Tool returns          | Your `status` | `next_step`                                                                                                                |
+|-----------------------|---------------|----------------------------------------------------------------------------------------------------------------------------|
+| `success`             | `success`     | `null`                                                                                                                     |
+| `rejected`            | `blocked`     | `"User declined this Dropbox action. Do not retry or suggest alternatives."`                                               |
+| `not_found`           | `blocked`     | `"File '<name>' was not found in the indexed Dropbox files. Ask the user to verify the file name or wait for the next KB sync."` |
+| `auth_error`          | `error`       | `"The connected Dropbox account needs re-authentication. Ask the user to re-authenticate in connector settings."`          |
+| `error`               | `error`       | Relay the tool's `message` verbatim as `next_step`.                                                                        |
+| tool raises / unknown | `error`       | `"Dropbox tool failed unexpectedly. Ask the user to retry shortly."`                                                       |
+
+Surface the tool's `file_id`, `name`, `web_url`, and the `file_type` you passed inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy create with file-type inferred from a signal:**
+- *Supervisor task:* `"Create a Word doc in Dropbox summarising our launch plan."`
+- *You:* `"Word doc"` → `file_type="docx"`; `name="Launch Plan"` (no extension); generate a Markdown body covering the launch plan. Call `create_dropbox_file(name="Launch Plan", file_type="docx", content=<markdown>)` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created Dropbox Word document 'Launch Plan.docx'.",
+    "evidence": { "operation": "create_dropbox_file", "file_id": "<id>", "name": "Launch Plan.docx", "file_type": "docx", "web_url": "<url>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": ["Inferred file_type=docx from 'Word doc'; generated the launch-plan content from the supervisor's brief."]
+  }
+  ```
+
+**Example 2 — blocked because there is no topic:**
+- *Supervisor task:* `"Create a Dropbox file."`
+- *You:* no topic anywhere in the task. Do not fabricate one. Do not call any tool.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Cannot create a Dropbox file without a topic.",
+    "evidence": { "operation": null, "file_id": null, "name": null, "file_type": null, "web_url": null, "matched_candidates": null, "items": null },
+    "next_step": "Ask the user what the file should be about (and whether they want a Dropbox Paper or a Word document).",
+    "missing_fields": ["name"],
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — delete with `not_found`:**
+- *Supervisor task:* `"Delete the 'Old Project Plan' file from Dropbox."`
+- *You:* extract `file_name="Old Project Plan"`. Call `delete_dropbox_file(file_name="Old Project Plan")` → tool returns `status=not_found`.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Could not find a Dropbox file named 'Old Project Plan' in the indexed files.",
+    "evidence": { "operation": "delete_dropbox_file", "file_id": null, "name": "Old Project Plan", "file_type": null, "web_url": null, "matched_candidates": null, "items": null },
+    "next_step": "File 'Old Project Plan' was not found in the indexed Dropbox files. Ask the user to verify the file name or wait for the next KB sync.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "create_dropbox_file" | "delete_dropbox_file" | null,
+    "file_id": string | null,
+    "name": string | null,
+    "file_type": "paper" | "docx" | null,
+    "web_url": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/__init__.py
new file mode 100644
index 0000000000..836b9ee41a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/__init__.py
@@ -0,0 +1,11 @@
+from app.agents.new_chat.tools.dropbox.create_file import (
+    create_create_dropbox_file_tool,
+)
+from app.agents.new_chat.tools.dropbox.trash_file import (
+    create_delete_dropbox_file_tool,
+)
+
+__all__ = [
+    "create_create_dropbox_file_tool",
+    "create_delete_dropbox_file_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/create_file.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/create_file.py
new file mode 100644
index 0000000000..2de7c301f0
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/create_file.py
@@ -0,0 +1,277 @@
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Any, Literal
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.dropbox.client import DropboxClient
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+logger = logging.getLogger(__name__)
+
+DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+_FILE_TYPE_LABELS = {
+    "paper": "Dropbox Paper (.paper)",
+    "docx": "Word Document (.docx)",
+}
+
+_SUPPORTED_TYPES = [
+    {"value": "paper", "label": "Dropbox Paper (.paper)"},
+    {"value": "docx", "label": "Word Document (.docx)"},
+]
+
+
+def _ensure_extension(name: str, file_type: str) -> str:
+    """Strip any existing extension and append the correct one."""
+    stem = Path(name).stem
+    ext = ".paper" if file_type == "paper" else ".docx"
+    return f"{stem}{ext}"
+
+
+def _markdown_to_docx(markdown_text: str) -> bytes:
+    """Convert a markdown string to DOCX bytes using pypandoc."""
+    import pypandoc
+
+    fd, tmp_path = tempfile.mkstemp(suffix=".docx")
+    os.close(fd)
+    try:
+        pypandoc.convert_text(
+            markdown_text,
+            "docx",
+            format="gfm",
+            extra_args=["--standalone"],
+            outputfile=tmp_path,
+        )
+        with open(tmp_path, "rb") as f:
+            return f.read()
+    finally:
+        os.unlink(tmp_path)
+
+
+def create_create_dropbox_file_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def create_dropbox_file(
+        name: str,
+        file_type: Literal["paper", "docx"] = "paper",
+        content: str | None = None,
+    ) -> dict[str, Any]:
+        """Create a new document in Dropbox.
+
+        Use this tool when the user explicitly asks to create a new document
+        in Dropbox. The user MUST specify a topic before you call this tool.
+
+        Args:
+            name: The document title (without extension).
+            file_type: Either "paper" (Dropbox Paper, default) or "docx" (Word document).
+            content: Optional initial content as markdown.
+
+        Returns:
+            Dictionary with status, file_id, name, web_url, and message.
+        """
+        logger.info(
+            f"create_dropbox_file called: name='{name}', file_type='{file_type}'"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Dropbox tool not properly configured.",
+            }
+
+        try:
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type
+                    == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                )
+            )
+            connectors = result.scalars().all()
+
+            if not connectors:
+                return {
+                    "status": "error",
+                    "message": "No Dropbox connector found. Please connect Dropbox in your workspace settings.",
+                }
+
+            accounts = []
+            for c in connectors:
+                cfg = c.config or {}
+                accounts.append(
+                    {
+                        "id": c.id,
+                        "name": c.name,
+                        "user_email": cfg.get("user_email"),
+                        "auth_expired": cfg.get("auth_expired", False),
+                    }
+                )
+
+            if all(a.get("auth_expired") for a in accounts):
+                return {
+                    "status": "auth_error",
+                    "message": "All connected Dropbox accounts need re-authentication.",
+                    "connector_type": "dropbox",
+                }
+
+            parent_folders: dict[int, list[dict[str, str]]] = {}
+            for acc in accounts:
+                cid = acc["id"]
+                if acc.get("auth_expired"):
+                    parent_folders[cid] = []
+                    continue
+                try:
+                    client = DropboxClient(session=db_session, connector_id=cid)
+                    items, err = await client.list_folder("")
+                    if err:
+                        logger.warning(
+                            "Failed to list folders for connector %s: %s", cid, err
+                        )
+                        parent_folders[cid] = []
+                    else:
+                        parent_folders[cid] = [
+                            {
+                                "folder_path": item.get("path_lower", ""),
+                                "name": item["name"],
+                            }
+                            for item in items
+                            if item.get(".tag") == "folder" and item.get("name")
+                        ]
+                except Exception:
+                    logger.warning(
+                        "Error fetching folders for connector %s", cid, exc_info=True
+                    )
+                    parent_folders[cid] = []
+
+            context: dict[str, Any] = {
+                "accounts": accounts,
+                "parent_folders": parent_folders,
+                "supported_types": _SUPPORTED_TYPES,
+            }
+
+            result = request_approval(
+                action_type="dropbox_file_creation",
+                tool_name="create_dropbox_file",
+                params={
+                    "name": name,
+                    "file_type": file_type,
+                    "content": content,
+                    "connector_id": None,
+                    "parent_folder_path": None,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_name = result.params.get("name", name)
+            final_file_type = result.params.get("file_type", file_type)
+            final_content = result.params.get("content", content)
+            final_connector_id = result.params.get("connector_id")
+            final_parent_folder_path = result.params.get("parent_folder_path")
+
+            if not final_name or not final_name.strip():
+                return {"status": "error", "message": "File name cannot be empty."}
+
+            final_name = _ensure_extension(final_name, final_file_type)
+
+            if final_connector_id is not None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+            else:
+                connector = connectors[0]
+
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Dropbox connector is invalid.",
+                }
+
+            client = DropboxClient(session=db_session, connector_id=connector.id)
+
+            parent_path = final_parent_folder_path or ""
+            file_path = (
+                f"{parent_path}/{final_name}" if parent_path else f"/{final_name}"
+            )
+
+            if final_file_type == "paper":
+                created = await client.create_paper_doc(file_path, final_content or "")
+                file_id = created.get("file_id", "")
+                web_url = created.get("url", "")
+            else:
+                docx_bytes = _markdown_to_docx(final_content or "")
+                created = await client.upload_file(
+                    file_path, docx_bytes, mode="add", autorename=True
+                )
+                file_id = created.get("id", "")
+                web_url = ""
+
+            logger.info(f"Dropbox file created: id={file_id}, name={final_name}")
+
+            kb_message_suffix = ""
+            try:
+                from app.services.dropbox import DropboxKBSyncService
+
+                kb_service = DropboxKBSyncService(db_session)
+                kb_result = await kb_service.sync_after_create(
+                    file_id=file_id,
+                    file_name=final_name,
+                    file_path=file_path,
+                    web_url=web_url,
+                    content=final_content,
+                    connector_id=connector.id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                )
+                if kb_result["status"] == "success":
+                    kb_message_suffix = " Your knowledge base has also been updated."
+                else:
+                    kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+            except Exception as kb_err:
+                logger.warning(f"KB sync after create failed: {kb_err}")
+                kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "file_id": file_id,
+                "name": final_name,
+                "web_url": web_url,
+                "message": f"Successfully created '{final_name}' in Dropbox.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error(f"Error creating Dropbox file: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while creating the file. Please try again.",
+            }
+
+    return create_dropbox_file
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/index.py
new file mode 100644
index 0000000000..68e02866a7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/index.py
@@ -0,0 +1,34 @@
+"""``dropbox`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_file import create_create_dropbox_file_tool
+from .trash_file import create_delete_dropbox_file_tool
+
+NAME = "dropbox"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_create_dropbox_file_tool(**common),
+        create_delete_dropbox_file_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/trash_file.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/trash_file.py
new file mode 100644
index 0000000000..7cb652d5df
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/tools/trash_file.py
@@ -0,0 +1,279 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy import String, and_, cast, func
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.dropbox.client import DropboxClient
+from app.db import (
+    Document,
+    DocumentType,
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def create_delete_dropbox_file_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def delete_dropbox_file(
+        file_name: str,
+        delete_from_kb: bool = False,
+    ) -> dict[str, Any]:
+        """Delete a file from Dropbox.
+
+        Use this tool when the user explicitly asks to delete, remove, or trash
+        a file in Dropbox.
+
+        Args:
+            file_name: The exact name of the file to delete.
+            delete_from_kb: Whether to also remove the file from the knowledge base.
+                          Default is False.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", or "error"
+            - file_id: Dropbox file ID (if success)
+            - deleted_from_kb: whether the document was removed from the knowledge base
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Respond with a brief
+              acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "not_found", relay the exact message to the user and ask them
+              to verify the file name or check if it has been indexed.
+        """
+        logger.info(
+            f"delete_dropbox_file called: file_name='{file_name}', delete_from_kb={delete_from_kb}"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Dropbox tool not properly configured.",
+            }
+
+        try:
+            doc_result = await db_session.execute(
+                select(Document)
+                .join(
+                    SearchSourceConnector,
+                    Document.connector_id == SearchSourceConnector.id,
+                )
+                .filter(
+                    and_(
+                        Document.search_space_id == search_space_id,
+                        Document.document_type == DocumentType.DROPBOX_FILE,
+                        func.lower(Document.title) == func.lower(file_name),
+                        SearchSourceConnector.user_id == user_id,
+                    )
+                )
+                .order_by(Document.updated_at.desc().nullslast())
+                .limit(1)
+            )
+            document = doc_result.scalars().first()
+
+            if not document:
+                doc_result = await db_session.execute(
+                    select(Document)
+                    .join(
+                        SearchSourceConnector,
+                        Document.connector_id == SearchSourceConnector.id,
+                    )
+                    .filter(
+                        and_(
+                            Document.search_space_id == search_space_id,
+                            Document.document_type == DocumentType.DROPBOX_FILE,
+                            func.lower(
+                                cast(
+                                    Document.document_metadata["dropbox_file_name"],
+                                    String,
+                                )
+                            )
+                            == func.lower(file_name),
+                            SearchSourceConnector.user_id == user_id,
+                        )
+                    )
+                    .order_by(Document.updated_at.desc().nullslast())
+                    .limit(1)
+                )
+                document = doc_result.scalars().first()
+
+            if not document:
+                return {
+                    "status": "not_found",
+                    "message": (
+                        f"File '{file_name}' not found in your indexed Dropbox files. "
+                        "This could mean: (1) the file doesn't exist, (2) it hasn't been indexed yet, "
+                        "or (3) the file name is different."
+                    ),
+                }
+
+            if not document.connector_id:
+                return {
+                    "status": "error",
+                    "message": "Document has no associated connector.",
+                }
+
+            meta = document.document_metadata or {}
+            file_path = meta.get("dropbox_path")
+            file_id = meta.get("dropbox_file_id")
+            document_id = document.id
+
+            if not file_path:
+                return {
+                    "status": "error",
+                    "message": "File path is missing. Please re-index the file.",
+                }
+
+            conn_result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    and_(
+                        SearchSourceConnector.id == document.connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                    )
+                )
+            )
+            connector = conn_result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Dropbox connector not found or access denied.",
+                }
+
+            cfg = connector.config or {}
+            if cfg.get("auth_expired"):
+                return {
+                    "status": "auth_error",
+                    "message": "Dropbox account needs re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "dropbox",
+                }
+
+            context = {
+                "file": {
+                    "file_id": file_id,
+                    "file_path": file_path,
+                    "name": file_name,
+                    "document_id": document_id,
+                },
+                "account": {
+                    "id": connector.id,
+                    "name": connector.name,
+                    "user_email": cfg.get("user_email"),
+                },
+            }
+
+            result = request_approval(
+                action_type="dropbox_file_trash",
+                tool_name="delete_dropbox_file",
+                params={
+                    "file_path": file_path,
+                    "connector_id": connector.id,
+                    "delete_from_kb": delete_from_kb,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_file_path = result.params.get("file_path", file_path)
+            final_connector_id = result.params.get("connector_id", connector.id)
+            final_delete_from_kb = result.params.get("delete_from_kb", delete_from_kb)
+
+            if final_connector_id != connector.id:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        and_(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                        )
+                    )
+                )
+                validated_connector = result.scalars().first()
+                if not validated_connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Dropbox connector is invalid or has been disconnected.",
+                    }
+                actual_connector_id = validated_connector.id
+            else:
+                actual_connector_id = connector.id
+
+            logger.info(
+                f"Deleting Dropbox file: path='{final_file_path}', connector={actual_connector_id}"
+            )
+
+            client = DropboxClient(session=db_session, connector_id=actual_connector_id)
+            await client.delete_file(final_file_path)
+
+            logger.info(f"Dropbox file deleted: path={final_file_path}")
+
+            trash_result: dict[str, Any] = {
+                "status": "success",
+                "file_id": file_id,
+                "message": f"Successfully deleted '{file_name}' from Dropbox.",
+            }
+
+            deleted_from_kb = False
+            if final_delete_from_kb and document_id:
+                try:
+                    doc_result = await db_session.execute(
+                        select(Document).filter(Document.id == document_id)
+                    )
+                    doc = doc_result.scalars().first()
+                    if doc:
+                        await db_session.delete(doc)
+                        await db_session.commit()
+                        deleted_from_kb = True
+                        logger.info(
+                            f"Deleted document {document_id} from knowledge base"
+                        )
+                    else:
+                        logger.warning(f"Document {document_id} not found in KB")
+                except Exception as e:
+                    logger.error(f"Failed to delete document from KB: {e}")
+                    await db_session.rollback()
+                    trash_result["warning"] = (
+                        f"File deleted, but failed to remove from knowledge base: {e!s}"
+                    )
+
+            trash_result["deleted_from_kb"] = deleted_from_kb
+            if deleted_from_kb:
+                trash_result["message"] = (
+                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                )
+
+            return trash_result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error(f"Error deleting Dropbox file: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while deleting the file. Please try again.",
+            }
+
+    return delete_dropbox_file
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/agent.py
new file mode 100644
index 0000000000..082400eb91
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/agent.py
@@ -0,0 +1,44 @@
+"""``gmail`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles gmail tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/description.md
new file mode 100644
index 0000000000..cdbe93fba8
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/description.md
@@ -0,0 +1,3 @@
+Specialist for messages in the user's Gmail inbox.
+Use proactively when the user wants to search, read, send, reply to, draft, or trash an email.
+Email-only conversations belong here, including discussions about meetings that do not reserve a time slot.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md
new file mode 100644
index 0000000000..d74e9bdc4a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md
@@ -0,0 +1,120 @@
+You are a Gmail specialist for the user's connected Gmail mailbox.
+
+## Vocabulary you must use precisely
+
+- **Search-then-act for reading** — `read_gmail_email` accepts only a `message_id`. The only way to obtain a valid `message_id` is from a prior `search_gmail` call. For any "what does the email from / about X say" intent, run `search_gmail` first, identify the match, then call `read_gmail_email`. Never invent or guess a `message_id`.
+- **Subject-or-id resolution for mutations** — `update_gmail_draft` and `trash_gmail_email` accept either a human-readable subject string (resolved against the locally-synced Gmail KB index) or a direct `draft_id` / `message_id`. Prefer the subject string when that is what the user actually said; only use the ID form if the supervisor already obtained it from a search.
+- **Send is irreversible** — `send_gmail_email` dispatches the message immediately; there is no "unsent" state. `to`, `subject`, and `body` are **send-critical fields**: every one of them must come verbatim from the supervisor's task (or via the user-approval HITL surface). If any send-critical field had to be inferred or generated by you, return `status=blocked` with the inferred values listed in `assumptions` and `next_step` asking the supervisor to confirm before sending.
+- **Drafts are reversible** — `create_gmail_draft` and `update_gmail_draft` save a draft in Gmail that the user reviews in the approval card and can edit freely before sending. Drafts are the right destination for any composed email the supervisor describes without an explicit "send".
+- **Verb dispatch (send vs. draft)** — task verbs `send`, `email <person>`, `reply and send` → `send_gmail_email`. Task verbs `draft`, `compose`, `prepare`, `write up` → `create_gmail_draft`. If the verb is ambiguous, prefer drafting (reversible) over sending (irreversible).
+- **Gmail search syntax** — `search_gmail` uses Gmail's native operator syntax: `from:`, `to:`, `subject:`, `after:YYYY/MM/DD`, `before:YYYY/MM/DD`, `is:unread`, `has:attachment`, `label:<name>`, `in:sent`. Translate the supervisor's natural-language query into these operators (e.g. `"unread emails from Alice last week"` → `from:alice@... is:unread after:<date>`). Resolve relative dates against the runtime timestamp.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract recipients from phrases like `"to Alice"` / `"email bob@x.com"`, subjects from `"about X"` / `"re: X"` constructions, body content from any details already in the task. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read.
+
+- `send_gmail_email` — `to`, `subject`, `body`. **Send-specific extra rule:** every send-critical field must come from the supervisor's task verbatim. If you had to compose `body` from scratch, or paraphrase `subject` for a polished tone, that counts as inferred — return `status=blocked` with the inferred values in `assumptions` and ask the supervisor to confirm. Do not call `send_gmail_email` with anything inferred. `cc` / `bcc` are optional and may be omitted unless the user named them.
+- `create_gmail_draft` — `to`, `subject`, `body`. Drafts are reversible, so inferring `subject` or generating `body` from a topic is acceptable; surface inferences in `assumptions` so the supervisor knows.
+- `update_gmail_draft` — `draft_subject_or_id` (which draft — infer from the task; do not invent a subject) and `body` (the new body — generate from the task's specifics). Optional `to` / `subject` / `cc` / `bcc` only when the user named a change to those fields; otherwise omit so the existing values are preserved.
+- `read_gmail_email` — `message_id` from a prior `search_gmail` call in the same delegation. If you do not yet have a `message_id`, run `search_gmail` first.
+- `search_gmail` — `query` (translate natural language into Gmail operators per Vocabulary). `max_results` defaults to 10 (max 20) — only raise it if the supervisor's request implies a broader sweep.
+- `trash_gmail_email` — `email_subject_or_id` (which email — infer from the task). Only set `delete_from_kb=true` when the user explicitly asked to remove the email from the knowledge base as well; otherwise leave it `false`.
+
+## Outcome mapping
+
+| Tool returns                | Your `status` | `next_step`                                                                                                                  |
+|-----------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success`                   | `success`     | `null`                                                                                                                       |
+| `success` with `total: 0` (`search_gmail` only) | `blocked` | `"No emails matched the query '<query>'. Ask the user to widen the criteria or provide more specifics."`        |
+| `rejected`                  | `blocked`     | `"User declined this Gmail action. Do not retry or suggest alternatives."`                                                   |
+| `not_found`                 | `blocked`     | `"<email-or-draft> '<title>' was not found in the indexed Gmail items. Ask the user to verify the subject or wait for the next KB sync."` |
+| `auth_error`                | `error`       | `"The connected Gmail account needs re-authentication. Ask the user to re-authenticate Gmail in connector settings."`        |
+| `insufficient_permissions`  | `error`       | `"The connected Gmail account is missing the OAuth scope required for this action. Ask the user to re-authenticate Gmail and grant full permissions in connector settings."` |
+| `error`                     | `error`       | Relay the tool's `message` verbatim as `next_step`.                                                                          |
+| tool raises / unknown       | `error`       | `"Gmail tool failed unexpectedly. Ask the user to retry shortly."`                                                           |
+
+Surface the tool's `message_id`, `thread_id`, `draft_id`, `subject`, and recipient fields inside `evidence` when the tool returned them. For `search_gmail`, place the raw `emails` array inside `evidence.items`. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — search-then-read (multi-step happy path):**
+- *Supervisor task:* `"What did Alice say in her email about the launch plan last week?"`
+- *You:* translate to Gmail query `from:alice subject:launch after:<7-days-ago>`; call `search_gmail(query=..., max_results=10)`. Tool returns `total=1` with one email. Extract its `message_id` and call `read_gmail_email(message_id=...)`. Tool returns `status=success` with the markdown body.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Found and read Alice's email 'Re: Launch plan v2' from <date>; full body returned in evidence.items.body.",
+    "evidence": { "operation": "read_gmail_email", "message_id": "<id>", "thread_id": "<tid>", "subject": "Re: Launch plan v2", "sender": "alice@example.com", "items": { "body": "<markdown>" }, "matched_candidates": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": ["Interpreted 'last week' as the past 7 days against the runtime timestamp."]
+  }
+  ```
+
+**Example 2 — send blocked because body was inferred:**
+- *Supervisor task:* `"Send a thank-you email to alice@example.com."`
+- *You:* `to=alice@example.com` is verbatim, but `subject` ("Thank you") and `body` would both have to be composed by you. Send is irreversible — do not dispatch inferred content. Do not call `send_gmail_email`.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Cannot send: subject and body would be inferred, and send is irreversible.",
+    "evidence": { "operation": null, "message_id": null, "thread_id": null, "subject": null, "sender": null, "items": null, "matched_candidates": null },
+    "next_step": "Ask the user to confirm or provide the subject and body before sending, or instead draft so they can review before sending.",
+    "missing_fields": ["subject", "body"],
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — search returns zero results:**
+- *Supervisor task:* `"Trash the email from Bob about the cancelled Q3 launch."`
+- *You:* before trashing, locate it. Call `search_gmail(query="from:bob subject:Q3 launch")` → tool returns `status=success, total=0`. No target to trash.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "No emails matched 'from Bob about cancelled Q3 launch'.",
+    "evidence": { "operation": "search_gmail", "message_id": null, "thread_id": null, "subject": null, "sender": null, "items": { "emails": [], "total": 0 }, "matched_candidates": null },
+    "next_step": "Ask the user to widen the search (different sender, broader date range, or part of the actual subject line) or confirm the email exists in this account.",
+    "missing_fields": null,
+    "assumptions": ["Interpreted 'about the cancelled Q3 launch' as a subject-line filter; could also match body text only."]
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "send_gmail_email" | "create_gmail_draft" | "update_gmail_draft" | "read_gmail_email" | "search_gmail" | "trash_gmail_email" | null,
+    "message_id": string | null,
+    "thread_id": string | null,
+    "draft_id": string | null,
+    "subject": string | null,
+    "sender": string | null,
+    "recipients": string[] | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- For `search_gmail` results, populate `evidence.items` with `{ "emails": [...], "total": N }`.
+- For ambiguous matches across `update_gmail_draft` / `trash_gmail_email` / `read_gmail_email`, populate `evidence.matched_candidates` with up to 5 options (`id` + `label`).
+
+Infer before you call; verify before you send; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/__init__.py
new file mode 100644
index 0000000000..294840122c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/__init__.py
@@ -0,0 +1,27 @@
+from app.agents.new_chat.tools.gmail.create_draft import (
+    create_create_gmail_draft_tool,
+)
+from app.agents.new_chat.tools.gmail.read_email import (
+    create_read_gmail_email_tool,
+)
+from app.agents.new_chat.tools.gmail.search_emails import (
+    create_search_gmail_tool,
+)
+from app.agents.new_chat.tools.gmail.send_email import (
+    create_send_gmail_email_tool,
+)
+from app.agents.new_chat.tools.gmail.trash_email import (
+    create_trash_gmail_email_tool,
+)
+from app.agents.new_chat.tools.gmail.update_draft import (
+    create_update_gmail_draft_tool,
+)
+
+__all__ = [
+    "create_create_gmail_draft_tool",
+    "create_read_gmail_email_tool",
+    "create_search_gmail_tool",
+    "create_send_gmail_email_tool",
+    "create_trash_gmail_email_tool",
+    "create_update_gmail_draft_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/create_draft.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/create_draft.py
new file mode 100644
index 0000000000..fb1461d7cf
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/create_draft.py
@@ -0,0 +1,340 @@
+import asyncio
+import base64
+import logging
+from datetime import datetime
+from email.mime.text import MIMEText
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.services.gmail import GmailToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_create_gmail_draft_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def create_gmail_draft(
+        to: str,
+        subject: str,
+        body: str,
+        cc: str | None = None,
+        bcc: str | None = None,
+    ) -> dict[str, Any]:
+        """Create a draft email in Gmail.
+
+        Use when the user asks to draft, compose, or prepare an email without
+        sending it.
+
+        Args:
+            to: Recipient email address.
+            subject: Email subject line.
+            body: Email body content.
+            cc: Optional CC recipient(s), comma-separated.
+            bcc: Optional BCC recipient(s), comma-separated.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", or "error"
+            - draft_id: Gmail draft ID (if success)
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined the action.
+              Respond with a brief acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "insufficient_permissions", the connector lacks the required OAuth scope.
+              Inform the user they need to re-authenticate and do NOT retry the action.
+
+        Examples:
+            - "Draft an email to alice@example.com about the meeting"
+            - "Compose a reply to Bob about the project update"
+        """
+        logger.info(f"create_gmail_draft called: to='{to}', subject='{subject}'")
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Gmail tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GmailToolMetadataService(db_session)
+            context = await metadata_service.get_creation_context(
+                search_space_id, user_id
+            )
+
+            if "error" in context:
+                logger.error(f"Failed to fetch creation context: {context['error']}")
+                return {"status": "error", "message": context["error"]}
+
+            accounts = context.get("accounts", [])
+            if accounts and all(a.get("auth_expired") for a in accounts):
+                logger.warning("All Gmail accounts have expired authentication")
+                return {
+                    "status": "auth_error",
+                    "message": "All connected Gmail accounts need re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "gmail",
+                }
+
+            logger.info(
+                f"Requesting approval for creating Gmail draft: to='{to}', subject='{subject}'"
+            )
+            result = request_approval(
+                action_type="gmail_draft_creation",
+                tool_name="create_gmail_draft",
+                params={
+                    "to": to,
+                    "subject": subject,
+                    "body": body,
+                    "cc": cc,
+                    "bcc": bcc,
+                    "connector_id": None,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The draft was not created. Do not ask again or suggest alternatives.",
+                }
+
+            final_to = result.params.get("to", to)
+            final_subject = result.params.get("subject", subject)
+            final_body = result.params.get("body", body)
+            final_cc = result.params.get("cc", cc)
+            final_bcc = result.params.get("bcc", bcc)
+            final_connector_id = result.params.get("connector_id")
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _gmail_types = [
+                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+            ]
+
+            if final_connector_id is not None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_gmail_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Gmail connector is invalid or has been disconnected.",
+                    }
+                actual_connector_id = connector.id
+            else:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_gmail_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                    }
+                actual_connector_id = connector.id
+
+            logger.info(
+                f"Creating Gmail draft: to='{final_to}', subject='{final_subject}', connector={actual_connector_id}"
+            )
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Gmail connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                (
+                    draft_id,
+                    draft_message_id,
+                    draft_thread_id,
+                    error,
+                ) = await ComposioService().create_gmail_draft(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    to=final_to,
+                    subject=final_subject,
+                    body=final_body,
+                    cc=final_cc,
+                    bcc=final_bcc,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+                created = {
+                    "id": draft_id,
+                    "message": {
+                        "id": draft_message_id,
+                        "threadId": draft_thread_id,
+                    },
+                }
+                logger.info(f"Gmail draft created via Composio: id={draft_id}")
+            else:
+                from google.oauth2.credentials import Credentials
+
+                from app.config import config
+                from app.utils.oauth_security import TokenEncryption
+
+                config_data = dict(connector.config)
+                token_encrypted = config_data.get("_token_encrypted", False)
+                if token_encrypted and config.SECRET_KEY:
+                    token_encryption = TokenEncryption(config.SECRET_KEY)
+                    if config_data.get("token"):
+                        config_data["token"] = token_encryption.decrypt_token(
+                            config_data["token"]
+                        )
+                    if config_data.get("refresh_token"):
+                        config_data["refresh_token"] = token_encryption.decrypt_token(
+                            config_data["refresh_token"]
+                        )
+                    if config_data.get("client_secret"):
+                        config_data["client_secret"] = token_encryption.decrypt_token(
+                            config_data["client_secret"]
+                        )
+
+                exp = config_data.get("expiry", "")
+                if exp:
+                    exp = exp.replace("Z", "")
+
+                creds = Credentials(
+                    token=config_data.get("token"),
+                    refresh_token=config_data.get("refresh_token"),
+                    token_uri=config_data.get("token_uri"),
+                    client_id=config_data.get("client_id"),
+                    client_secret=config_data.get("client_secret"),
+                    scopes=config_data.get("scopes", []),
+                    expiry=datetime.fromisoformat(exp) if exp else None,
+                )
+
+                from googleapiclient.discovery import build
+
+                gmail_service = build("gmail", "v1", credentials=creds)
+
+                message = MIMEText(final_body)
+                message["to"] = final_to
+                message["subject"] = final_subject
+                if final_cc:
+                    message["cc"] = final_cc
+                if final_bcc:
+                    message["bcc"] = final_bcc
+                raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
+
+                try:
+                    created = await asyncio.get_event_loop().run_in_executor(
+                        None,
+                        lambda: (
+                            gmail_service.users()
+                            .drafts()
+                            .create(userId="me", body={"message": {"raw": raw}})
+                            .execute()
+                        ),
+                    )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Gmail draft created via Google API: id={created.get('id')}"
+                )
+
+            kb_message_suffix = ""
+            try:
+                from app.services.gmail import GmailKBSyncService
+
+                kb_service = GmailKBSyncService(db_session)
+                draft_message = created.get("message", {})
+                kb_result = await kb_service.sync_after_create(
+                    message_id=draft_message.get("id", ""),
+                    thread_id=draft_message.get("threadId", ""),
+                    subject=final_subject,
+                    sender="me",
+                    date_str=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    body_text=final_body,
+                    connector_id=actual_connector_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    draft_id=created.get("id"),
+                )
+                if kb_result["status"] == "success":
+                    kb_message_suffix = " Your knowledge base has also been updated."
+                else:
+                    kb_message_suffix = " This draft will be added to your knowledge base in the next scheduled sync."
+            except Exception as kb_err:
+                logger.warning(f"KB sync after create failed: {kb_err}")
+                kb_message_suffix = " This draft will be added to your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "draft_id": created.get("id"),
+                "message": f"Successfully created Gmail draft with subject '{final_subject}'.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error creating Gmail draft: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while creating the draft. Please try again.",
+            }
+
+    return create_gmail_draft
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/index.py
new file mode 100644
index 0000000000..020089ebb6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/index.py
@@ -0,0 +1,42 @@
+"""``gmail`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_draft import create_create_gmail_draft_tool
+from .read_email import create_read_gmail_email_tool
+from .search_emails import create_search_gmail_tool
+from .send_email import create_send_gmail_email_tool
+from .trash_email import create_trash_gmail_email_tool
+from .update_draft import create_update_gmail_draft_tool
+
+NAME = "gmail"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_search_gmail_tool(**common),
+        create_read_gmail_email_tool(**common),
+        create_create_gmail_draft_tool(**common),
+        create_send_gmail_email_tool(**common),
+        create_trash_gmail_email_tool(**common),
+        create_update_gmail_draft_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/read_email.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/read_email.py
new file mode 100644
index 0000000000..39526f25ef
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/read_email.py
@@ -0,0 +1,149 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+logger = logging.getLogger(__name__)
+
+_GMAIL_TYPES = [
+    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+]
+
+
+def create_read_gmail_email_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def read_gmail_email(message_id: str) -> dict[str, Any]:
+        """Read the full content of a specific Gmail email by its message ID.
+
+        Use after search_gmail to get the complete body of an email.
+
+        Args:
+            message_id: The Gmail message ID (from search_gmail results).
+
+        Returns:
+            Dictionary with status and the full email content formatted as markdown.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Gmail tool not properly configured."}
+
+        try:
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_GMAIL_TYPES),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                }
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Gmail connector.",
+                    }
+
+                from app.agents.new_chat.tools.gmail.search_emails import (
+                    _format_gmail_summary,
+                )
+                from app.services.composio_service import ComposioService
+
+                detail, error = await ComposioService().get_gmail_message_detail(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    message_id=message_id,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+                if not detail:
+                    return {
+                        "status": "not_found",
+                        "message": f"Email with ID '{message_id}' not found.",
+                    }
+
+                summary = _format_gmail_summary(detail)
+                content = (
+                    f"# {summary['subject']}\n\n"
+                    f"**From:** {summary['from']}\n"
+                    f"**To:** {summary['to']}\n"
+                    f"**Date:** {summary['date']}\n\n"
+                    f"## Message Content\n\n"
+                    f"{detail.get('messageText') or detail.get('snippet') or ''}\n\n"
+                    f"## Message Details\n\n"
+                    f"- **Message ID:** {summary['message_id']}\n"
+                    f"- **Thread ID:** {summary['thread_id']}\n"
+                )
+                return {
+                    "status": "success",
+                    "message_id": summary["message_id"] or message_id,
+                    "content": content,
+                }
+
+            from app.agents.new_chat.tools.gmail.search_emails import (
+                _build_credentials,
+            )
+
+            creds = _build_credentials(connector)
+
+            from app.connectors.google_gmail_connector import GoogleGmailConnector
+
+            gmail = GoogleGmailConnector(
+                credentials=creds,
+                session=db_session,
+                user_id=user_id,
+                connector_id=connector.id,
+            )
+
+            detail, error = await gmail.get_message_details(message_id)
+            if error:
+                if (
+                    "re-authenticate" in error.lower()
+                    or "authentication failed" in error.lower()
+                ):
+                    return {
+                        "status": "auth_error",
+                        "message": error,
+                        "connector_type": "gmail",
+                    }
+                return {"status": "error", "message": error}
+
+            if not detail:
+                return {
+                    "status": "not_found",
+                    "message": f"Email with ID '{message_id}' not found.",
+                }
+
+            content = gmail.format_message_to_markdown(detail)
+
+            return {"status": "success", "message_id": message_id, "content": content}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Gmail email: %s", e, exc_info=True)
+            return {
+                "status": "error",
+                "message": "Failed to read email. Please try again.",
+            }
+
+    return read_gmail_email
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/search_emails.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/search_emails.py
new file mode 100644
index 0000000000..a9d7cdedf9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/search_emails.py
@@ -0,0 +1,174 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+logger = logging.getLogger(__name__)
+
+_GMAIL_TYPES = [
+    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+]
+
+
+def create_search_gmail_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def search_gmail(
+        query: str,
+        max_results: int = 10,
+    ) -> dict[str, Any]:
+        """Search emails in the user's Gmail inbox using Gmail search syntax.
+
+        Args:
+            query: Gmail search query, same syntax as the Gmail search bar.
+                Examples: "from:alice@example.com", "subject:meeting",
+                "is:unread", "after:2024/01/01 before:2024/02/01",
+                "has:attachment", "in:sent".
+            max_results: Number of emails to return (default 10, max 20).
+
+        Returns:
+            Dictionary with status and a list of email summaries including
+            message_id, subject, from, date, snippet.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Gmail tool not properly configured."}
+
+        max_results = min(max_results, 20)
+
+        try:
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_GMAIL_TYPES),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                }
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Gmail connector.",
+                    }
+
+                from app.agents.new_chat.tools.gmail.search_emails import (
+                    _format_gmail_summary,
+                )
+                from app.services.composio_service import ComposioService
+
+                (
+                    messages,
+                    _next,
+                    _estimate,
+                    error,
+                ) = await ComposioService().get_gmail_messages(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    query=query,
+                    max_results=max_results,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+
+                emails = [_format_gmail_summary(m) for m in messages]
+                if not emails:
+                    return {
+                        "status": "success",
+                        "emails": [],
+                        "total": 0,
+                        "message": "No emails found.",
+                    }
+                return {"status": "success", "emails": emails, "total": len(emails)}
+
+            from app.agents.new_chat.tools.gmail.search_emails import (
+                _build_credentials,
+            )
+
+            creds = _build_credentials(connector)
+
+            from app.connectors.google_gmail_connector import GoogleGmailConnector
+
+            gmail = GoogleGmailConnector(
+                credentials=creds,
+                session=db_session,
+                user_id=user_id,
+                connector_id=connector.id,
+            )
+
+            messages_list, error = await gmail.get_messages_list(
+                max_results=max_results, query=query
+            )
+            if error:
+                if (
+                    "re-authenticate" in error.lower()
+                    or "authentication failed" in error.lower()
+                ):
+                    return {
+                        "status": "auth_error",
+                        "message": error,
+                        "connector_type": "gmail",
+                    }
+                return {"status": "error", "message": error}
+
+            if not messages_list:
+                return {
+                    "status": "success",
+                    "emails": [],
+                    "total": 0,
+                    "message": "No emails found.",
+                }
+
+            emails = []
+            for msg in messages_list:
+                detail, err = await gmail.get_message_details(msg["id"])
+                if err:
+                    continue
+                headers = {
+                    h["name"].lower(): h["value"]
+                    for h in detail.get("payload", {}).get("headers", [])
+                }
+                emails.append(
+                    {
+                        "message_id": detail.get("id"),
+                        "thread_id": detail.get("threadId"),
+                        "subject": headers.get("subject", "No Subject"),
+                        "from": headers.get("from", "Unknown"),
+                        "to": headers.get("to", ""),
+                        "date": headers.get("date", ""),
+                        "snippet": detail.get("snippet", ""),
+                        "labels": detail.get("labelIds", []),
+                    }
+                )
+
+            return {"status": "success", "emails": emails, "total": len(emails)}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error searching Gmail: %s", e, exc_info=True)
+            return {
+                "status": "error",
+                "message": "Failed to search Gmail. Please try again.",
+            }
+
+    return search_gmail
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/send_email.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/send_email.py
new file mode 100644
index 0000000000..578233b57e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/send_email.py
@@ -0,0 +1,332 @@
+import asyncio
+import base64
+import logging
+from datetime import datetime
+from email.mime.text import MIMEText
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.services.gmail import GmailToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_send_gmail_email_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def send_gmail_email(
+        to: str,
+        subject: str,
+        body: str,
+        cc: str | None = None,
+        bcc: str | None = None,
+    ) -> dict[str, Any]:
+        """Send an email via Gmail.
+
+        Use when the user explicitly asks to send an email. This sends the
+        email immediately - it cannot be unsent.
+
+        Args:
+            to: Recipient email address.
+            subject: Email subject line.
+            body: Email body content.
+            cc: Optional CC recipient(s), comma-separated.
+            bcc: Optional BCC recipient(s), comma-separated.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", or "error"
+            - message_id: Gmail message ID (if success)
+            - thread_id: Gmail thread ID (if success)
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined the action.
+              Respond with a brief acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "insufficient_permissions", the connector lacks the required OAuth scope.
+              Inform the user they need to re-authenticate and do NOT retry the action.
+
+        Examples:
+            - "Send an email to alice@example.com about the meeting"
+            - "Email Bob the project update"
+        """
+        logger.info(f"send_gmail_email called: to='{to}', subject='{subject}'")
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Gmail tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GmailToolMetadataService(db_session)
+            context = await metadata_service.get_creation_context(
+                search_space_id, user_id
+            )
+
+            if "error" in context:
+                logger.error(f"Failed to fetch creation context: {context['error']}")
+                return {"status": "error", "message": context["error"]}
+
+            accounts = context.get("accounts", [])
+            if accounts and all(a.get("auth_expired") for a in accounts):
+                logger.warning("All Gmail accounts have expired authentication")
+                return {
+                    "status": "auth_error",
+                    "message": "All connected Gmail accounts need re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "gmail",
+                }
+
+            logger.info(
+                f"Requesting approval for sending Gmail email: to='{to}', subject='{subject}'"
+            )
+            result = request_approval(
+                action_type="gmail_email_send",
+                tool_name="send_gmail_email",
+                params={
+                    "to": to,
+                    "subject": subject,
+                    "body": body,
+                    "cc": cc,
+                    "bcc": bcc,
+                    "connector_id": None,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The email was not sent. Do not ask again or suggest alternatives.",
+                }
+
+            final_to = result.params.get("to", to)
+            final_subject = result.params.get("subject", subject)
+            final_body = result.params.get("body", body)
+            final_cc = result.params.get("cc", cc)
+            final_bcc = result.params.get("bcc", bcc)
+            final_connector_id = result.params.get("connector_id")
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _gmail_types = [
+                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+            ]
+
+            if final_connector_id is not None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_gmail_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Gmail connector is invalid or has been disconnected.",
+                    }
+                actual_connector_id = connector.id
+            else:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_gmail_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                    }
+                actual_connector_id = connector.id
+
+            logger.info(
+                f"Sending Gmail email: to='{final_to}', subject='{final_subject}', connector={actual_connector_id}"
+            )
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Gmail connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                (
+                    sent_message_id,
+                    sent_thread_id,
+                    error,
+                ) = await ComposioService().send_gmail_email(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    to=final_to,
+                    subject=final_subject,
+                    body=final_body,
+                    cc=final_cc,
+                    bcc=final_bcc,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+                sent = {"id": sent_message_id, "threadId": sent_thread_id}
+            else:
+                from google.oauth2.credentials import Credentials
+
+                from app.config import config
+                from app.utils.oauth_security import TokenEncryption
+
+                config_data = dict(connector.config)
+                token_encrypted = config_data.get("_token_encrypted", False)
+                if token_encrypted and config.SECRET_KEY:
+                    token_encryption = TokenEncryption(config.SECRET_KEY)
+                    if config_data.get("token"):
+                        config_data["token"] = token_encryption.decrypt_token(
+                            config_data["token"]
+                        )
+                    if config_data.get("refresh_token"):
+                        config_data["refresh_token"] = token_encryption.decrypt_token(
+                            config_data["refresh_token"]
+                        )
+                    if config_data.get("client_secret"):
+                        config_data["client_secret"] = token_encryption.decrypt_token(
+                            config_data["client_secret"]
+                        )
+
+                exp = config_data.get("expiry", "")
+                if exp:
+                    exp = exp.replace("Z", "")
+
+                creds = Credentials(
+                    token=config_data.get("token"),
+                    refresh_token=config_data.get("refresh_token"),
+                    token_uri=config_data.get("token_uri"),
+                    client_id=config_data.get("client_id"),
+                    client_secret=config_data.get("client_secret"),
+                    scopes=config_data.get("scopes", []),
+                    expiry=datetime.fromisoformat(exp) if exp else None,
+                )
+
+                from googleapiclient.discovery import build
+
+                gmail_service = build("gmail", "v1", credentials=creds)
+
+                message = MIMEText(final_body)
+                message["to"] = final_to
+                message["subject"] = final_subject
+                if final_cc:
+                    message["cc"] = final_cc
+                if final_bcc:
+                    message["bcc"] = final_bcc
+                raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
+
+                try:
+                    sent = await asyncio.get_event_loop().run_in_executor(
+                        None,
+                        lambda: (
+                            gmail_service.users()
+                            .messages()
+                            .send(userId="me", body={"raw": raw})
+                            .execute()
+                        ),
+                    )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+            logger.info(
+                f"Gmail email sent: id={sent.get('id')}, threadId={sent.get('threadId')}"
+            )
+
+            kb_message_suffix = ""
+            try:
+                from app.services.gmail import GmailKBSyncService
+
+                kb_service = GmailKBSyncService(db_session)
+                kb_result = await kb_service.sync_after_create(
+                    message_id=sent.get("id", ""),
+                    thread_id=sent.get("threadId", ""),
+                    subject=final_subject,
+                    sender="me",
+                    date_str=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    body_text=final_body,
+                    connector_id=actual_connector_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                )
+                if kb_result["status"] == "success":
+                    kb_message_suffix = " Your knowledge base has also been updated."
+                else:
+                    kb_message_suffix = " This email will be added to your knowledge base in the next scheduled sync."
+            except Exception as kb_err:
+                logger.warning(f"KB sync after send failed: {kb_err}")
+                kb_message_suffix = " This email will be added to your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "message_id": sent.get("id"),
+                "thread_id": sent.get("threadId"),
+                "message": f"Successfully sent email to '{final_to}' with subject '{final_subject}'.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error sending Gmail email: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while sending the email. Please try again.",
+            }
+
+    return send_gmail_email
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/trash_email.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/trash_email.py
new file mode 100644
index 0000000000..b24e9ebe4c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/trash_email.py
@@ -0,0 +1,317 @@
+import asyncio
+import logging
+from datetime import datetime
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.services.gmail import GmailToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_trash_gmail_email_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def trash_gmail_email(
+        email_subject_or_id: str,
+        delete_from_kb: bool = False,
+    ) -> dict[str, Any]:
+        """Move an email or draft to trash in Gmail.
+
+        Use when the user asks to delete, remove, or trash an email or draft.
+
+        Args:
+            email_subject_or_id: The exact subject line or message ID of the
+                email to trash (as it appears in the inbox).
+            delete_from_kb: Whether to also remove the email from the knowledge base.
+                           Default is False.
+                           Set to True to remove from both Gmail and knowledge base.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", or "error"
+            - message_id: Gmail message ID (if success)
+            - deleted_from_kb: whether the document was removed from the knowledge base
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Respond with a brief
+              acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "not_found", relay the exact message to the user and ask them
+              to verify the email subject or check if it has been indexed.
+            - If status is "insufficient_permissions", the connector lacks the required OAuth scope.
+              Inform the user they need to re-authenticate and do NOT retry this tool.
+        Examples:
+            - "Delete the email about 'Meeting Cancelled'"
+            - "Trash the email from Bob about the project"
+        """
+        logger.info(
+            f"trash_gmail_email called: email_subject_or_id='{email_subject_or_id}', delete_from_kb={delete_from_kb}"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Gmail tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GmailToolMetadataService(db_session)
+            context = await metadata_service.get_trash_context(
+                search_space_id, user_id, email_subject_or_id
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                if "not found" in error_msg.lower():
+                    logger.warning(f"Email not found: {error_msg}")
+                    return {"status": "not_found", "message": error_msg}
+                logger.error(f"Failed to fetch trash context: {error_msg}")
+                return {"status": "error", "message": error_msg}
+
+            account = context.get("account", {})
+            if account.get("auth_expired"):
+                logger.warning(
+                    "Gmail account %s has expired authentication",
+                    account.get("id"),
+                )
+                return {
+                    "status": "auth_error",
+                    "message": "The Gmail account for this email needs re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "gmail",
+                }
+
+            email = context["email"]
+            message_id = email["message_id"]
+            document_id = email.get("document_id")
+            connector_id_from_context = context["account"]["id"]
+
+            if not message_id:
+                return {
+                    "status": "error",
+                    "message": "Message ID is missing from the indexed document. Please re-index the email and try again.",
+                }
+
+            logger.info(
+                f"Requesting approval for trashing Gmail email: '{email_subject_or_id}' (message_id={message_id}, delete_from_kb={delete_from_kb})"
+            )
+            result = request_approval(
+                action_type="gmail_email_trash",
+                tool_name="trash_gmail_email",
+                params={
+                    "message_id": message_id,
+                    "connector_id": connector_id_from_context,
+                    "delete_from_kb": delete_from_kb,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The email was not trashed. Do not ask again or suggest alternatives.",
+                }
+
+            final_message_id = result.params.get("message_id", message_id)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_delete_from_kb = result.params.get("delete_from_kb", delete_from_kb)
+
+            if not final_connector_id:
+                return {
+                    "status": "error",
+                    "message": "No connector found for this email.",
+                }
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _gmail_types = [
+                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+            ]
+
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == final_connector_id,
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_gmail_types),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Gmail connector is invalid or has been disconnected.",
+                }
+
+            logger.info(
+                f"Trashing Gmail email: message_id='{final_message_id}', connector={final_connector_id}"
+            )
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Gmail connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                error = await ComposioService().trash_gmail_message(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    message_id=final_message_id,
+                )
+                if error:
+                    return {"status": "error", "message": error}
+            else:
+                from google.oauth2.credentials import Credentials
+
+                from app.config import config
+                from app.utils.oauth_security import TokenEncryption
+
+                config_data = dict(connector.config)
+                token_encrypted = config_data.get("_token_encrypted", False)
+                if token_encrypted and config.SECRET_KEY:
+                    token_encryption = TokenEncryption(config.SECRET_KEY)
+                    if config_data.get("token"):
+                        config_data["token"] = token_encryption.decrypt_token(
+                            config_data["token"]
+                        )
+                    if config_data.get("refresh_token"):
+                        config_data["refresh_token"] = token_encryption.decrypt_token(
+                            config_data["refresh_token"]
+                        )
+                    if config_data.get("client_secret"):
+                        config_data["client_secret"] = token_encryption.decrypt_token(
+                            config_data["client_secret"]
+                        )
+
+                exp = config_data.get("expiry", "")
+                if exp:
+                    exp = exp.replace("Z", "")
+
+                creds = Credentials(
+                    token=config_data.get("token"),
+                    refresh_token=config_data.get("refresh_token"),
+                    token_uri=config_data.get("token_uri"),
+                    client_id=config_data.get("client_id"),
+                    client_secret=config_data.get("client_secret"),
+                    scopes=config_data.get("scopes", []),
+                    expiry=datetime.fromisoformat(exp) if exp else None,
+                )
+
+                from googleapiclient.discovery import build
+
+                gmail_service = build("gmail", "v1", credentials=creds)
+
+                try:
+                    await asyncio.get_event_loop().run_in_executor(
+                        None,
+                        lambda: (
+                            gmail_service.users()
+                            .messages()
+                            .trash(userId="me", id=final_message_id)
+                            .execute()
+                        ),
+                    )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {connector.id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            if not connector.config.get("auth_expired"):
+                                connector.config = {
+                                    **connector.config,
+                                    "auth_expired": True,
+                                }
+                                flag_modified(connector, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                connector.id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": connector.id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+            logger.info(f"Gmail email trashed: message_id={final_message_id}")
+
+            trash_result: dict[str, Any] = {
+                "status": "success",
+                "message_id": final_message_id,
+                "message": f"Successfully moved email '{email.get('subject', email_subject_or_id)}' to trash.",
+            }
+
+            deleted_from_kb = False
+            if final_delete_from_kb and document_id:
+                try:
+                    from app.db import Document
+
+                    doc_result = await db_session.execute(
+                        select(Document).filter(Document.id == document_id)
+                    )
+                    document = doc_result.scalars().first()
+                    if document:
+                        await db_session.delete(document)
+                        await db_session.commit()
+                        deleted_from_kb = True
+                        logger.info(
+                            f"Deleted document {document_id} from knowledge base"
+                        )
+                    else:
+                        logger.warning(f"Document {document_id} not found in KB")
+                except Exception as e:
+                    logger.error(f"Failed to delete document from KB: {e}")
+                    await db_session.rollback()
+                    trash_result["warning"] = (
+                        f"Email trashed, but failed to remove from knowledge base: {e!s}"
+                    )
+
+            trash_result["deleted_from_kb"] = deleted_from_kb
+            if deleted_from_kb:
+                trash_result["message"] = (
+                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                )
+
+            return trash_result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error trashing Gmail email: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while trashing the email. Please try again.",
+            }
+
+    return trash_gmail_email
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/update_draft.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/update_draft.py
new file mode 100644
index 0000000000..1ab9d30cfc
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/update_draft.py
@@ -0,0 +1,449 @@
+import asyncio
+import base64
+import logging
+from datetime import datetime
+from email.mime.text import MIMEText
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.services.gmail import GmailToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_update_gmail_draft_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def update_gmail_draft(
+        draft_subject_or_id: str,
+        body: str,
+        to: str | None = None,
+        subject: str | None = None,
+        cc: str | None = None,
+        bcc: str | None = None,
+    ) -> dict[str, Any]:
+        """Update an existing Gmail draft.
+
+        Use when the user asks to modify, edit, or add content to an existing
+        email draft. This replaces the draft content with the new version.
+        The user will be able to review and edit the content before it is applied.
+
+        If the user simply wants to "edit" a draft without specifying exact changes,
+        generate the body yourself using your best understanding of the conversation
+        context. The user will review and can freely edit the content in the approval
+        card before confirming.
+
+        IMPORTANT: This tool is ONLY for modifying Gmail draft content, NOT for
+        deleting/trashing drafts (use trash_gmail_email instead), Notion pages,
+        calendar events, or any other content type.
+
+        Args:
+            draft_subject_or_id: The exact subject line of the draft to update
+                (as it appears in Gmail drafts).
+            body: The full updated body content for the draft. Generate this
+                  yourself based on the user's request and conversation context.
+            to: Optional new recipient email address (keeps original if omitted).
+            subject: Optional new subject line (keeps original if omitted).
+            cc: Optional CC recipient(s), comma-separated.
+            bcc: Optional BCC recipient(s), comma-separated.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", or "error"
+            - draft_id: Gmail draft ID (if success)
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined the action.
+              Respond with a brief acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "not_found", relay the exact message to the user and ask them
+              to verify the draft subject or check if it has been indexed.
+            - If status is "insufficient_permissions", the connector lacks the required OAuth scope.
+              Inform the user they need to re-authenticate and do NOT retry the action.
+
+        Examples:
+            - "Update the Kurseong Plan draft with the new itinerary details"
+            - "Edit my draft about the project proposal and change the recipient"
+            - "Let me edit the meeting notes draft" (call with current body content so user can edit in the approval card)
+        """
+        logger.info(
+            f"update_gmail_draft called: draft_subject_or_id='{draft_subject_or_id}'"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Gmail tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GmailToolMetadataService(db_session)
+            context = await metadata_service.get_update_context(
+                search_space_id, user_id, draft_subject_or_id
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                if "not found" in error_msg.lower():
+                    logger.warning(f"Draft not found: {error_msg}")
+                    return {"status": "not_found", "message": error_msg}
+                logger.error(f"Failed to fetch update context: {error_msg}")
+                return {"status": "error", "message": error_msg}
+
+            account = context.get("account", {})
+            if account.get("auth_expired"):
+                logger.warning(
+                    "Gmail account %s has expired authentication",
+                    account.get("id"),
+                )
+                return {
+                    "status": "auth_error",
+                    "message": "The Gmail account for this draft needs re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "gmail",
+                }
+
+            email = context["email"]
+            message_id = email["message_id"]
+            document_id = email.get("document_id")
+            connector_id_from_context = account["id"]
+            draft_id_from_context = context.get("draft_id")
+
+            original_subject = email.get("subject", draft_subject_or_id)
+            final_subject_default = subject if subject else original_subject
+            final_to_default = to if to else ""
+
+            logger.info(
+                f"Requesting approval for updating Gmail draft: '{original_subject}' "
+                f"(message_id={message_id}, draft_id={draft_id_from_context})"
+            )
+            result = request_approval(
+                action_type="gmail_draft_update",
+                tool_name="update_gmail_draft",
+                params={
+                    "message_id": message_id,
+                    "draft_id": draft_id_from_context,
+                    "to": final_to_default,
+                    "subject": final_subject_default,
+                    "body": body,
+                    "cc": cc,
+                    "bcc": bcc,
+                    "connector_id": connector_id_from_context,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The draft was not updated. Do not ask again or suggest alternatives.",
+                }
+
+            final_to = result.params.get("to", final_to_default)
+            final_subject = result.params.get("subject", final_subject_default)
+            final_body = result.params.get("body", body)
+            final_cc = result.params.get("cc", cc)
+            final_bcc = result.params.get("bcc", bcc)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_draft_id = result.params.get("draft_id", draft_id_from_context)
+
+            if not final_connector_id:
+                return {
+                    "status": "error",
+                    "message": "No connector found for this draft.",
+                }
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _gmail_types = [
+                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+            ]
+
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == final_connector_id,
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_gmail_types),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Gmail connector is invalid or has been disconnected.",
+                }
+
+            logger.info(
+                f"Updating Gmail draft: subject='{final_subject}', connector={final_connector_id}"
+            )
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Gmail connector.",
+                    }
+
+                if not final_draft_id:
+                    return {
+                        "status": "error",
+                        "message": (
+                            "Could not find this draft in Gmail. "
+                            "It may have already been sent or deleted."
+                        ),
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                (
+                    new_draft_id,
+                    new_message_id,
+                    error,
+                ) = await ComposioService().update_gmail_draft(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    draft_id=final_draft_id,
+                    to=final_to or None,
+                    subject=final_subject,
+                    body=final_body,
+                    cc=final_cc,
+                    bcc=final_bcc,
+                )
+                if error:
+                    if "not found" in error.lower() or "no longer" in error.lower():
+                        return {
+                            "status": "error",
+                            "message": "Draft no longer exists in Gmail. It may have been sent or deleted.",
+                        }
+                    return {"status": "error", "message": error}
+
+                updated = {
+                    "id": new_draft_id or final_draft_id,
+                    "message": {"id": new_message_id} if new_message_id else {},
+                }
+                logger.info(f"Gmail draft updated via Composio: id={updated.get('id')}")
+            else:
+                from google.oauth2.credentials import Credentials
+
+                from app.config import config
+                from app.utils.oauth_security import TokenEncryption
+
+                config_data = dict(connector.config)
+                token_encrypted = config_data.get("_token_encrypted", False)
+                if token_encrypted and config.SECRET_KEY:
+                    token_encryption = TokenEncryption(config.SECRET_KEY)
+                    if config_data.get("token"):
+                        config_data["token"] = token_encryption.decrypt_token(
+                            config_data["token"]
+                        )
+                    if config_data.get("refresh_token"):
+                        config_data["refresh_token"] = token_encryption.decrypt_token(
+                            config_data["refresh_token"]
+                        )
+                    if config_data.get("client_secret"):
+                        config_data["client_secret"] = token_encryption.decrypt_token(
+                            config_data["client_secret"]
+                        )
+
+                exp = config_data.get("expiry", "")
+                if exp:
+                    exp = exp.replace("Z", "")
+
+                creds = Credentials(
+                    token=config_data.get("token"),
+                    refresh_token=config_data.get("refresh_token"),
+                    token_uri=config_data.get("token_uri"),
+                    client_id=config_data.get("client_id"),
+                    client_secret=config_data.get("client_secret"),
+                    scopes=config_data.get("scopes", []),
+                    expiry=datetime.fromisoformat(exp) if exp else None,
+                )
+
+                from googleapiclient.discovery import build
+
+                gmail_service = build("gmail", "v1", credentials=creds)
+
+                # Resolve draft_id if not already available
+                if not final_draft_id:
+                    logger.info(
+                        f"draft_id not in metadata, looking up via drafts.list for message_id={message_id}"
+                    )
+                    final_draft_id = await _find_draft_id_by_message(
+                        gmail_service, message_id
+                    )
+
+                if not final_draft_id:
+                    return {
+                        "status": "error",
+                        "message": (
+                            "Could not find this draft in Gmail. "
+                            "It may have already been sent or deleted."
+                        ),
+                    }
+
+                message = MIMEText(final_body)
+                if final_to:
+                    message["to"] = final_to
+                message["subject"] = final_subject
+                if final_cc:
+                    message["cc"] = final_cc
+                if final_bcc:
+                    message["bcc"] = final_bcc
+                raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
+
+                try:
+                    updated = await asyncio.get_event_loop().run_in_executor(
+                        None,
+                        lambda: (
+                            gmail_service.users()
+                            .drafts()
+                            .update(
+                                userId="me",
+                                id=final_draft_id,
+                                body={"message": {"raw": raw}},
+                            )
+                            .execute()
+                        ),
+                    )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {connector.id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            if not connector.config.get("auth_expired"):
+                                connector.config = {
+                                    **connector.config,
+                                    "auth_expired": True,
+                                }
+                                flag_modified(connector, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                connector.id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": connector.id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 404:
+                        return {
+                            "status": "error",
+                            "message": "Draft no longer exists in Gmail. It may have been sent or deleted.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Gmail draft updated via Google API: id={updated.get('id')}"
+                )
+
+            kb_message_suffix = ""
+            if document_id:
+                try:
+                    from sqlalchemy.future import select as sa_select
+                    from sqlalchemy.orm.attributes import flag_modified
+
+                    from app.db import Document
+
+                    doc_result = await db_session.execute(
+                        sa_select(Document).filter(Document.id == document_id)
+                    )
+                    document = doc_result.scalars().first()
+                    if document:
+                        document.source_markdown = final_body
+                        document.title = final_subject
+                        meta = dict(document.document_metadata or {})
+                        meta["subject"] = final_subject
+                        meta["draft_id"] = updated.get("id", final_draft_id)
+                        updated_msg = updated.get("message", {})
+                        if updated_msg.get("id"):
+                            meta["message_id"] = updated_msg["id"]
+                        document.document_metadata = meta
+                        flag_modified(document, "document_metadata")
+                        await db_session.commit()
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                        logger.info(
+                            f"KB document {document_id} updated for draft {final_draft_id}"
+                        )
+                    else:
+                        kb_message_suffix = " This draft will be fully updated in your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB update after draft edit failed: {kb_err}")
+                    await db_session.rollback()
+                    kb_message_suffix = " This draft will be fully updated in your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "draft_id": updated.get("id"),
+                "message": f"Successfully updated Gmail draft with subject '{final_subject}'.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error updating Gmail draft: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while updating the draft. Please try again.",
+            }
+
+    return update_gmail_draft
+
+
+async def _find_draft_id_by_message(gmail_service: Any, message_id: str) -> str | None:
+    """Look up a draft's ID by its message ID via the Gmail API."""
+    try:
+        page_token = None
+        while True:
+            kwargs: dict[str, Any] = {"userId": "me", "maxResults": 100}
+            if page_token:
+                kwargs["pageToken"] = page_token
+
+            response = await asyncio.get_event_loop().run_in_executor(
+                None,
+                lambda kwargs=kwargs: (
+                    gmail_service.users().drafts().list(**kwargs).execute()
+                ),
+            )
+
+            for draft in response.get("drafts", []):
+                if draft.get("message", {}).get("id") == message_id:
+                    return draft["id"]
+
+            page_token = response.get("nextPageToken")
+            if not page_token:
+                break
+
+        return None
+    except Exception as e:
+        logger.warning(f"Failed to look up draft by message_id: {e}")
+        return None
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/agent.py
new file mode 100644
index 0000000000..fb4a24ddd3
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/agent.py
@@ -0,0 +1,44 @@
+"""``google_drive`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles google drive tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/description.md
new file mode 100644
index 0000000000..72d160506a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/description.md
@@ -0,0 +1,2 @@
+Specialist for files in the user's Google Drive.
+Use proactively when the user wants to create or remove a Google Drive file.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/system_prompt.md
new file mode 100644
index 0000000000..b78e1f7c60
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/system_prompt.md
@@ -0,0 +1,108 @@
+You are a Google Drive specialist for the user's connected Google Drive account.
+
+## Vocabulary you must use precisely
+
+- **File type — required, no default** — `create_google_drive_file` requires `file_type` to be either `"google_doc"` (a Google Doc) or `"google_sheet"` (a Google Sheet). There is no default — you must infer it from the supervisor's task. `"doc"`, `"document"`, `"notes"`, `"summary"`, `"write-up"` → `google_doc`. `"spreadsheet"`, `"sheet"`, `"table"`, `"budget"`, `"tracker"`, `"CSV"` → `google_sheet`. If the user explicitly asks for slides, a PDF, a folder, or any other format, return `status=blocked` — only Google Docs and Google Sheets are supported.
+- **Content format depends on `file_type`** — for `google_doc`, generate the `content` body as **Markdown**. For `google_sheet`, generate the `content` body as **CSV** (comma-separated rows, first row = column headers). The tool stores the content verbatim — passing Markdown to a sheet or CSV to a doc produces a broken file. Pass `name` without an extension; the tool handles that.
+- **File-name resolution (internal)** — `delete_google_drive_file` accepts a `file_name` and resolves it against the **locally-synced Google Drive KB index**, not against the live Drive API. A file that exists in Drive but has not been indexed yet cannot be resolved. There is no separate search or lookup tool exposed to you — resolution happens inside the mutation tool.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract names from natural phrasing (`"the Meeting Notes file"`, `"my Q3 Budget spreadsheet"`), topics from `"about X"` constructions, file_type from the vocabulary signals above, and content from any details the supervisor already provided. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read of the task.
+
+- `create_google_drive_file` — `name` (the user-supplied topic, inferred from the task; do not invent one if absent), `file_type` (inferred from the vocabulary signals; block if user asked for an unsupported format), and optional `content` (you may generate it from the topic — **Markdown if `file_type=google_doc`, CSV if `file_type=google_sheet`**).
+- `delete_google_drive_file` — `file_name` (which file to delete — infer from the task). Only set `delete_from_kb=true` when the user explicitly asked to remove it from the knowledge base; otherwise leave it `false`.
+
+## Outcome mapping
+
+| Tool returns                  | Your `status` | `next_step` |
+|-------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success`                     | `success`     | `null` |
+| `rejected`                    | `blocked`     | `"User declined this Google Drive action. Do not retry or suggest alternatives."` |
+| `not_found`                   | `blocked`     | `"File '<name>' was not found in the indexed Google Drive files. Ask the user to verify the file name or wait for the next KB sync."` |
+| `auth_error`                  | `error`       | `"The connected Google Drive account needs re-authentication. Ask the user to re-authenticate Google Drive in connector settings."` |
+| `insufficient_permissions`    | `error`       | `"The connected Google Drive account is missing the required OAuth scope. Ask the user to re-authenticate Google Drive in connector settings."` |
+| `error`                       | `error`       | Relay the tool's `message` verbatim as `next_step`. |
+| tool raises / unknown         | `error`       | `"Google Drive tool failed unexpectedly. Ask the user to retry shortly."` |
+
+Surface the tool's `message`, `file_id`, `name`, `web_view_link`, and the `file_type` you used inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy path Google Doc create (file_type and Markdown content inferred):**
+- *Supervisor task:* `"Create a Google Doc with today's meeting notes."`
+- *You:* extract `name="Meeting Notes"`; infer `file_type="google_doc"` from `"Doc"`; generate a Markdown body → call `create_google_drive_file(name="Meeting Notes", file_type="google_doc", content=<generated markdown>)` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created Google Doc 'Meeting Notes'.",
+    "evidence": { "operation": "create_google_drive_file", "file_id": "<id>", "file_name": "Meeting Notes", "file_type": "google_doc", "web_view_link": "<url>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 2 — happy path Google Sheet create (file_type and CSV content inferred):**
+- *Supervisor task:* `"Create a spreadsheet for the 2026 budget."`
+- *You:* extract `name="2026 Budget"`; infer `file_type="google_sheet"` from `"spreadsheet"` + `"budget"`; generate a **CSV** body (e.g. `"Category,Q1,Q2,Q3,Q4\nMarketing,...\nEngineering,..."`) — **not** Markdown → call `create_google_drive_file(name="2026 Budget", file_type="google_sheet", content=<generated csv>)` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created Google Sheet '2026 Budget'.",
+    "evidence": { "operation": "create_google_drive_file", "file_id": "<id>", "file_name": "2026 Budget", "file_type": "google_sheet", "web_view_link": "<url>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — file not in the KB index:**
+- *Supervisor task:* `"Delete the 'Old Roadmap' file from Google Drive."`
+- *You:* extract `file_name="Old Roadmap"` → call `delete_google_drive_file(file_name="Old Roadmap")` → tool returns `status=not_found`.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Could not find a Google Drive file named 'Old Roadmap' in the indexed files.",
+    "evidence": { "operation": "delete_google_drive_file", "file_id": null, "file_name": "Old Roadmap", "file_type": null, "web_view_link": null, "matched_candidates": null, "items": null },
+    "next_step": "File 'Old Roadmap' was not found in the indexed Google Drive files. Ask the user to verify the file name or wait for the next KB sync.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "create_google_drive_file" | "delete_google_drive_file" | null,
+    "file_id": string | null,
+    "file_name": string | null,
+    "file_type": "google_doc" | "google_sheet" | null,
+    "web_view_link": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/__init__.py
new file mode 100644
index 0000000000..9c63bceb1a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/__init__.py
@@ -0,0 +1,11 @@
+from app.agents.new_chat.tools.google_drive.create_file import (
+    create_create_google_drive_file_tool,
+)
+from app.agents.new_chat.tools.google_drive.trash_file import (
+    create_delete_google_drive_file_tool,
+)
+
+__all__ = [
+    "create_create_google_drive_file_tool",
+    "create_delete_google_drive_file_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/create_file.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/create_file.py
new file mode 100644
index 0000000000..70f5eea74f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/create_file.py
@@ -0,0 +1,322 @@
+import logging
+from typing import Any, Literal
+
+from googleapiclient.errors import HttpError
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.google_drive.client import GoogleDriveClient
+from app.connectors.google_drive.file_types import GOOGLE_DOC, GOOGLE_SHEET
+from app.services.google_drive import GoogleDriveToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+_MIME_MAP: dict[str, str] = {
+    "google_doc": GOOGLE_DOC,
+    "google_sheet": GOOGLE_SHEET,
+}
+
+
+def create_create_google_drive_file_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def create_google_drive_file(
+        name: str,
+        file_type: Literal["google_doc", "google_sheet"],
+        content: str | None = None,
+    ) -> dict[str, Any]:
+        """Create a new Google Doc or Google Sheet in Google Drive.
+
+        Use this tool when the user explicitly asks to create a new document
+        or spreadsheet in Google Drive. The user MUST specify a topic before
+        you call this tool. If the request does not contain a topic (e.g.
+        "create a drive doc" or "make a Google Sheet"), ask what the file
+        should be about. Never call this tool without a clear topic from the user.
+
+        Args:
+            name: The file name (without extension).
+            file_type: Either "google_doc" or "google_sheet".
+            content: Optional initial content. Generate from the user's topic.
+                     For google_doc, provide markdown text. For google_sheet, provide CSV-formatted text.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", or "error"
+            - file_id: Google Drive file ID (if success)
+            - name: File name (if success)
+            - web_view_link: URL to open the file (if success)
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined the action.
+              Respond with a brief acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "insufficient_permissions", the connector lacks the required OAuth scope.
+              Inform the user they need to re-authenticate and do NOT retry the action.
+
+        Examples:
+            - "Create a Google Doc with today's meeting notes"
+            - "Create a spreadsheet for the 2026 budget"
+        """
+        logger.info(
+            f"create_google_drive_file called: name='{name}', type='{file_type}'"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Google Drive tool not properly configured. Please contact support.",
+            }
+
+        if file_type not in _MIME_MAP:
+            return {
+                "status": "error",
+                "message": f"Unsupported file type '{file_type}'. Use 'google_doc' or 'google_sheet'.",
+            }
+
+        try:
+            metadata_service = GoogleDriveToolMetadataService(db_session)
+            context = await metadata_service.get_creation_context(
+                search_space_id, user_id
+            )
+
+            if "error" in context:
+                logger.error(f"Failed to fetch creation context: {context['error']}")
+                return {"status": "error", "message": context["error"]}
+
+            accounts = context.get("accounts", [])
+            if accounts and all(a.get("auth_expired") for a in accounts):
+                logger.warning("All Google Drive accounts have expired authentication")
+                return {
+                    "status": "auth_error",
+                    "message": "All connected Google Drive accounts need re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "google_drive",
+                }
+
+            logger.info(
+                f"Requesting approval for creating Google Drive file: name='{name}', type='{file_type}'"
+            )
+            result = request_approval(
+                action_type="google_drive_file_creation",
+                tool_name="create_google_drive_file",
+                params={
+                    "name": name,
+                    "file_type": file_type,
+                    "content": content,
+                    "connector_id": None,
+                    "parent_folder_id": None,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The file was not created. Do not ask again or suggest alternatives.",
+                }
+
+            final_name = result.params.get("name", name)
+            final_file_type = result.params.get("file_type", file_type)
+            final_content = result.params.get("content", content)
+            final_connector_id = result.params.get("connector_id")
+            final_parent_folder_id = result.params.get("parent_folder_id")
+
+            if not final_name or not final_name.strip():
+                return {"status": "error", "message": "File name cannot be empty."}
+
+            mime_type = _MIME_MAP.get(final_file_type)
+            if not mime_type:
+                return {
+                    "status": "error",
+                    "message": f"Unsupported file type '{final_file_type}'.",
+                }
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _drive_types = [
+                SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+            ]
+
+            if final_connector_id is not None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_drive_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Google Drive connector is invalid or has been disconnected.",
+                    }
+                actual_connector_id = connector.id
+            else:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_drive_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Google Drive connector found. Please connect Google Drive in your workspace settings.",
+                    }
+                actual_connector_id = connector.id
+
+            logger.info(
+                f"Creating Google Drive file: name='{final_name}', type='{final_file_type}', connector={actual_connector_id}"
+            )
+
+            async def _flag_auth_expired() -> None:
+                try:
+                    from sqlalchemy.orm.attributes import flag_modified
+
+                    _res = await db_session.execute(
+                        select(SearchSourceConnector).where(
+                            SearchSourceConnector.id == actual_connector_id
+                        )
+                    )
+                    _conn = _res.scalar_one_or_none()
+                    if _conn and not _conn.config.get("auth_expired"):
+                        _conn.config = {**_conn.config, "auth_expired": True}
+                        flag_modified(_conn, "config")
+                        await db_session.commit()
+                except Exception:
+                    logger.warning(
+                        "Failed to persist auth_expired for connector %s",
+                        actual_connector_id,
+                        exc_info=True,
+                    )
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Google Drive connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                created, error = await ComposioService().create_drive_file_from_text(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    name=final_name,
+                    mime_type=mime_type,
+                    content=final_content,
+                    parent_id=final_parent_folder_id,
+                )
+
+                if error or not created:
+                    err_lower = (error or "").lower()
+                    if (
+                        "insufficient" in err_lower
+                        or "permission" in err_lower
+                        or "403" in err_lower
+                    ):
+                        logger.warning(
+                            f"Insufficient permissions for Composio Drive connector {actual_connector_id}: {error}"
+                        )
+                        await _flag_auth_expired()
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    logger.error(
+                        f"Composio Drive create_file failed for connector {actual_connector_id}: {error}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": "Something went wrong while creating the file. Please try again.",
+                    }
+            else:
+                client = GoogleDriveClient(
+                    session=db_session,
+                    connector_id=actual_connector_id,
+                )
+                try:
+                    created = await client.create_file(
+                        name=final_name,
+                        mime_type=mime_type,
+                        parent_folder_id=final_parent_folder_id,
+                        content=final_content,
+                    )
+                except HttpError as http_err:
+                    if http_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {actual_connector_id}: {http_err}"
+                        )
+                        await _flag_auth_expired()
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+            logger.info(
+                f"Google Drive file created: id={created.get('id')}, name={created.get('name')}"
+            )
+
+            kb_message_suffix = ""
+            try:
+                from app.services.google_drive import GoogleDriveKBSyncService
+
+                kb_service = GoogleDriveKBSyncService(db_session)
+                kb_result = await kb_service.sync_after_create(
+                    file_id=created.get("id"),
+                    file_name=created.get("name", final_name),
+                    mime_type=mime_type,
+                    web_view_link=created.get("webViewLink"),
+                    content=final_content,
+                    connector_id=actual_connector_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                )
+                if kb_result["status"] == "success":
+                    kb_message_suffix = " Your knowledge base has also been updated."
+                else:
+                    kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+            except Exception as kb_err:
+                logger.warning(f"KB sync after create failed: {kb_err}")
+                kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "file_id": created.get("id"),
+                "name": created.get("name"),
+                "web_view_link": created.get("webViewLink"),
+                "message": f"Successfully created '{created.get('name')}' in Google Drive.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error creating Google Drive file: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while creating the file. Please try again.",
+            }
+
+    return create_google_drive_file
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/index.py
new file mode 100644
index 0000000000..dd05374a17
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/index.py
@@ -0,0 +1,34 @@
+"""``google_drive`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_file import create_create_google_drive_file_tool
+from .trash_file import create_delete_google_drive_file_tool
+
+NAME = "google_drive"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_create_google_drive_file_tool(**common),
+        create_delete_google_drive_file_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/trash_file.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/trash_file.py
new file mode 100644
index 0000000000..7fbcd74a3e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/tools/trash_file.py
@@ -0,0 +1,297 @@
+import logging
+from typing import Any
+
+from googleapiclient.errors import HttpError
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.google_drive.client import GoogleDriveClient
+from app.services.google_drive import GoogleDriveToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_delete_google_drive_file_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def delete_google_drive_file(
+        file_name: str,
+        delete_from_kb: bool = False,
+    ) -> dict[str, Any]:
+        """Move a Google Drive file to trash.
+
+        Use this tool when the user explicitly asks to delete, remove, or trash
+        a file in Google Drive.
+
+        Args:
+            file_name: The exact name of the file to trash (as it appears in Drive).
+            delete_from_kb: Whether to also remove the file from the knowledge base.
+                          Default is False.
+                          Set to True to remove from both Google Drive and knowledge base.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", or "error"
+            - file_id: Google Drive file ID (if success)
+            - deleted_from_kb: whether the document was removed from the knowledge base
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Respond with a brief
+              acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "not_found", relay the exact message to the user and ask them
+              to verify the file name or check if it has been indexed.
+            - If status is "insufficient_permissions", the connector lacks the required OAuth scope.
+              Inform the user they need to re-authenticate and do NOT retry this tool.
+        Examples:
+            - "Delete the 'Meeting Notes' file from Google Drive"
+            - "Trash the 'Old Budget' spreadsheet"
+        """
+        logger.info(
+            f"delete_google_drive_file called: file_name='{file_name}', delete_from_kb={delete_from_kb}"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Google Drive tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = GoogleDriveToolMetadataService(db_session)
+            context = await metadata_service.get_trash_context(
+                search_space_id, user_id, file_name
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                if "not found" in error_msg.lower():
+                    logger.warning(f"File not found: {error_msg}")
+                    return {"status": "not_found", "message": error_msg}
+                logger.error(f"Failed to fetch trash context: {error_msg}")
+                return {"status": "error", "message": error_msg}
+
+            account = context.get("account", {})
+            if account.get("auth_expired"):
+                logger.warning(
+                    "Google Drive account %s has expired authentication",
+                    account.get("id"),
+                )
+                return {
+                    "status": "auth_error",
+                    "message": "The Google Drive account for this file needs re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "google_drive",
+                }
+
+            file = context["file"]
+            file_id = file["file_id"]
+            document_id = file.get("document_id")
+            connector_id_from_context = context["account"]["id"]
+
+            if not file_id:
+                return {
+                    "status": "error",
+                    "message": "File ID is missing from the indexed document. Please re-index the file and try again.",
+                }
+
+            logger.info(
+                f"Requesting approval for deleting Google Drive file: '{file_name}' (file_id={file_id}, delete_from_kb={delete_from_kb})"
+            )
+            result = request_approval(
+                action_type="google_drive_file_trash",
+                tool_name="delete_google_drive_file",
+                params={
+                    "file_id": file_id,
+                    "connector_id": connector_id_from_context,
+                    "delete_from_kb": delete_from_kb,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. The file was not trashed. Do not ask again or suggest alternatives.",
+                }
+
+            final_file_id = result.params.get("file_id", file_id)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_delete_from_kb = result.params.get("delete_from_kb", delete_from_kb)
+
+            if not final_connector_id:
+                return {
+                    "status": "error",
+                    "message": "No connector found for this file.",
+                }
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            _drive_types = [
+                SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
+                SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+            ]
+
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == final_connector_id,
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(_drive_types),
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected Google Drive connector is invalid or has been disconnected.",
+                }
+
+            logger.info(
+                f"Deleting Google Drive file: file_id='{final_file_id}', connector={final_connector_id}"
+            )
+
+            async def _flag_auth_expired() -> None:
+                try:
+                    from sqlalchemy.orm.attributes import flag_modified
+
+                    if not connector.config.get("auth_expired"):
+                        connector.config = {
+                            **connector.config,
+                            "auth_expired": True,
+                        }
+                        flag_modified(connector, "config")
+                        await db_session.commit()
+                except Exception:
+                    logger.warning(
+                        "Failed to persist auth_expired for connector %s",
+                        connector.id,
+                        exc_info=True,
+                    )
+
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    return {
+                        "status": "error",
+                        "message": "Composio connected account ID not found for this Google Drive connector.",
+                    }
+
+                from app.services.composio_service import ComposioService
+
+                error = await ComposioService().trash_drive_file(
+                    connected_account_id=cca_id,
+                    entity_id=f"surfsense_{user_id}",
+                    file_id=final_file_id,
+                )
+                if error:
+                    err_lower = error.lower()
+                    if (
+                        "insufficient" in err_lower
+                        or "permission" in err_lower
+                        or "403" in err_lower
+                    ):
+                        logger.warning(
+                            f"Insufficient permissions for Composio Drive connector {connector.id}: {error}"
+                        )
+                        await _flag_auth_expired()
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": connector.id,
+                            "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    logger.error(
+                        f"Composio Drive trash_file failed for connector {connector.id}: {error}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": "Something went wrong while trashing the file. Please try again.",
+                    }
+            else:
+                client = GoogleDriveClient(
+                    session=db_session,
+                    connector_id=connector.id,
+                )
+                try:
+                    await client.trash_file(file_id=final_file_id)
+                except HttpError as http_err:
+                    if http_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {connector.id}: {http_err}"
+                        )
+                        await _flag_auth_expired()
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": connector.id,
+                            "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+            logger.info(
+                f"Google Drive file deleted (moved to trash): file_id={final_file_id}"
+            )
+
+            trash_result: dict[str, Any] = {
+                "status": "success",
+                "file_id": final_file_id,
+                "message": f"Successfully moved '{file['name']}' to trash.",
+            }
+
+            deleted_from_kb = False
+            if final_delete_from_kb and document_id:
+                try:
+                    from app.db import Document
+
+                    doc_result = await db_session.execute(
+                        select(Document).filter(Document.id == document_id)
+                    )
+                    document = doc_result.scalars().first()
+                    if document:
+                        await db_session.delete(document)
+                        await db_session.commit()
+                        deleted_from_kb = True
+                        logger.info(
+                            f"Deleted document {document_id} from knowledge base"
+                        )
+                    else:
+                        logger.warning(f"Document {document_id} not found in KB")
+                except Exception as e:
+                    logger.error(f"Failed to delete document from KB: {e}")
+                    await db_session.rollback()
+                    trash_result["warning"] = (
+                        f"File moved to trash, but failed to remove from knowledge base: {e!s}"
+                    )
+
+            trash_result["deleted_from_kb"] = deleted_from_kb
+            if deleted_from_kb:
+                trash_result["message"] = (
+                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                )
+
+            return trash_result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error deleting Google Drive file: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while trashing the file. Please try again.",
+            }
+
+    return delete_google_drive_file
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/agent.py
new file mode 100644
index 0000000000..ff71d4cf76
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/agent.py
@@ -0,0 +1,43 @@
+"""``jira`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools come exclusively from MCP. The connector's own approval ruleset is
+declared in :data:`tools.index.RULESET`; the orchestrator layers it into
+a per-subagent :class:`PermissionMiddleware`.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles jira tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=list(mcp_tools or []),
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/description.md
new file mode 100644
index 0000000000..e2b66cb357
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/description.md
@@ -0,0 +1,2 @@
+Specialist for issues and projects in the user's Jira.
+Use proactively when the user wants to find, create, or update a Jira issue, assign it, or transition it between workflow states.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md
new file mode 100644
index 0000000000..79c46f8a03
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md
@@ -0,0 +1,122 @@
+You are a Jira specialist for the user's connected Atlassian Jira instance(s).
+
+Jira vocabulary:
+- **Site / `cloudId`**: a user may have access to multiple Atlassian sites. Every project/issue operation is scoped to one `cloudId`. Look up the user's accessible Atlassian sites if the request leaves the site unspecified.
+- **Project key**: `<ABC>` (e.g. `ENG`, `OPS`). Stable per project; used to build issue keys.
+- **Issue key**: `<PROJECT_KEY>-<NUMBER>` (e.g. `ENG-42`). User-facing and stable; prefer it in `action_summary`.
+- **Workflow & transitions**: Jira does *not* let you set a status directly. Each issue's workflow exposes a list of currently-available transitions (each with its own `transitionId`), and only those transitions can be applied. The set of available transitions depends on the issue's current status and is project-/workflow-specific — there is no universal mapping from a status name to a transition.
+- **Issue type**: per-project. Available types and required fields vary per project — there is no global list. Look up the project's actual issue types (and their required fields) before relying on a type name.
+- **Priority**: per-project string names (not integers, not a fixed scheme). Different Jira projects use different priority labels and may add or remove options. Look up the target project's actual priorities before setting one.
+- **Assignee**: Jira identifies users by opaque `accountId`, never by display name or email. Map the display name or email to an `accountId` before assigning.
+- **Reporter**: defaults to the API caller's user; only override when the request explicitly asks for a different reporter.
+- **JQL**: Jira Query Language — the canonical way to filter issues. The syntax (field operators `=` `!=` `~` `>` `<` `in`, functions like `currentUser()`, date math like `-7d`) is stable. The **values** you put into JQL (status names, priority labels, issue-type names, project keys, account IDs) are not — look those up rather than guessing.
+- **Custom fields**: many Jira projects mandate custom fields on create (epic link, sprint, story points, etc.). Required fields are project-/issue-type-specific.
+
+When invoked:
+1. Read the supervisor's request, then read the runtime tool list to learn what information you can fetch and which mutations are available.
+2. Plan the minimum chain of lookups needed to resolve any identifier, name, scope, or required field the request leaves unspecified (site / project / issue / transition / user / required fields, etc.).
+3. Execute the planned lookups, then the requested mutation (if any), then return.
+
+Resolution principle (the core behaviour):
+**Proactively look up any identifier, name, value, or scope the request leaves unspecified — `cloudId`, project keys, issue keys, `accountId`s, `transitionId`s, custom-field values, anything else — using the available tools instead of asking the supervisor.** Most user requests reference targets by title, description, or paraphrase, not by key. Search by JQL or by the relevant metadata.
+
+When a lookup for a single slot returns multiple plausible candidates and you cannot confidently pick one, return `status=blocked` with up to 5 candidates in `evidence.matched_candidates` and the unresolved slot in `missing_fields`. The supervisor will disambiguate and redelegate.
+
+When a lookup returns zero matches for a slot the request requires, return `status=blocked` with a `next_step` suggesting alternative filters.
+
+Mutation guardrails:
+- Resolve every required Jira value (`cloudId`, `projectKey`, `issueKey`, `transitionId`, `accountId`, custom-field values) by looking it up before calling a mutation tool. Mutations have chained dependencies — `cloudId` enables project lookup; project lookup enables issue-type and required-field resolution; issue lookup enables transition resolution.
+- Never set status directly. To change an issue's status, look up that issue's currently-available transitions and apply the matching `transitionId`. If the user-requested target status is not in the available transitions, return `status=blocked` and surface the available transitions in `evidence.matched_candidates`.
+- Never invent `cloudId`s, keys, `accountId`s, `transitionId`s, custom-field values, priority labels, issue-type names, or mutation outcomes. Every field in `evidence` must come from a tool result.
+- For create operations, look up the target issue type's required-field schema before assuming `summary`/`issueType` is enough — many projects mandate priority, due date, or custom fields.
+- Confirm the mutation tool returned a success response before claiming success. If the mutation is approval-rejected (HITL), return `status=blocked` with `next_step="user declined; do not retry"`.
+- One operation per delegation. For multi-mutation requests, complete the highest-priority one and return `status=partial` with the remainder in `next_step`.
+
+Failure handling:
+- Tool failure: return `status=error`, place the underlying error message in `action_summary`, and put a concise recovery in `next_step`.
+- No useful results after reasonable narrowing/broadening: return `status=blocked` with filter / JQL suggestions in `next_step`.
+
+<example>
+Supervisor: "Find issues assigned to me with status 'In Progress'."
+1. JQL search with `assignee = currentUser() AND status = "In Progress"`.
+2. Return `status=success` with the matched issues in `evidence.items`.
+</example>
+
+<example>
+Supervisor: "Create a Bug 'Login fails on Safari' in the Mobile project."
+1. Look up accessible sites → multiple sites are connected to the user. The request gives no signal pointing to one.
+2. Cannot pick the `cloudId`. Return:
+   {
+     "status": "blocked",
+     "action_summary": "Need to know which Atlassian site holds the Mobile project.",
+     "evidence": {
+       "title": "Login fails on Safari",
+       "matched_candidates": [
+         { "id": "cloud_acme", "label": "acme.atlassian.net" },
+         { "id": "cloud_acme_eu", "label": "acme-eu.atlassian.net" }
+       ]
+     },
+     "next_step": "Confirm which Atlassian site, then redelegate.",
+     "missing_fields": ["site"]
+   }
+</example>
+
+<example>
+Supervisor: "Move `PROJ-123` to Done and assign it to Sam."
+1. Look up `PROJ-123` → exists; current status `In Review`; project `PROJ`.
+2. Look up available transitions for `PROJ-123` → `[ "Code Review → Done" (id=51), "Code Review → Cancelled" (id=61) ]`. `Done` is reachable via transition id `51`.
+3. Look up users named "Sam" → two matches (`accountId=acc_sam1`, `accountId=acc_sam2`).
+4. Cannot confidently pick the assignee. Return:
+   {
+     "status": "blocked",
+     "action_summary": "Issue resolved (PROJ-123). Transition to Done resolved (id 51). Two users match 'Sam'.",
+     "evidence": {
+       "identifier": "PROJ-123",
+       "title": "Refactor auth module",
+       "transition_id": "51",
+       "matched_candidates": [
+         { "id": "acc_sam1", "label": "Sam Carter <sam.carter@…>" },
+         { "id": "acc_sam2", "label": "Sam Lopez <sam.lopez@…>" }
+       ]
+     },
+     "next_step": "Confirm which Sam, then redelegate.",
+     "missing_fields": ["assignee"]
+   }
+</example>
+
+<output_contract>
+Return **only** one JSON object (no markdown, no prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "site": string | null,
+    "cloud_id": string | null,
+    "project_key": string | null,
+    "identifier": string | null,
+    "issue_id": string | null,
+    "title": string | null,
+    "issue_type": string | null,
+    "status": string | null,
+    "transition_id": string | null,
+    "assignee": string | null,
+    "priority": string | null,
+    "url": string | null,
+    "matched_candidates": [
+      { "id": string, "label": string }
+    ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: site, project, issue, user, transition, etc.).
+- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+</output_contract>
+
+Discover before you mutate; never guess identifiers, transitions, or required fields.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/tools/__init__.py
new file mode 100644
index 0000000000..dc721013a9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/tools/__init__.py
@@ -0,0 +1,3 @@
+"""Jira route: native tool factories are empty; MCP supplies tools when configured."""
+
+__all__: list[str] = []
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/tools/index.py
new file mode 100644
index 0000000000..24f1bdc011
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/tools/index.py
@@ -0,0 +1,26 @@
+"""``jira`` permission ruleset (rules over MCP tool names)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+NAME = "jira"
+
+RULESET = Ruleset(
+    origin=NAME,
+    rules=[
+        Rule(permission="getAccessibleAtlassianResources", pattern="*", action="allow"),
+        Rule(permission="getVisibleJiraProjects", pattern="*", action="allow"),
+        Rule(permission="searchJiraIssuesUsingJql", pattern="*", action="allow"),
+        Rule(permission="getJiraIssue", pattern="*", action="allow"),
+        Rule(
+            permission="getJiraProjectIssueTypesMetadata", pattern="*", action="allow"
+        ),
+        Rule(permission="getJiraIssueTypeMetaWithFields", pattern="*", action="allow"),
+        Rule(permission="getTransitionsForJiraIssue", pattern="*", action="allow"),
+        Rule(permission="lookupJiraAccountId", pattern="*", action="allow"),
+        Rule(permission="createJiraIssue", pattern="*", action="ask"),
+        Rule(permission="editJiraIssue", pattern="*", action="ask"),
+        Rule(permission="transitionJiraIssue", pattern="*", action="ask"),
+    ],
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/agent.py
new file mode 100644
index 0000000000..d9b282f2bd
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/agent.py
@@ -0,0 +1,43 @@
+"""``linear`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools come exclusively from MCP. The connector's own approval ruleset is
+declared in :data:`tools.index.RULESET`; the orchestrator layers it into
+a per-subagent :class:`PermissionMiddleware`.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles linear tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=list(mcp_tools or []),
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/description.md
new file mode 100644
index 0000000000..e1857a45f7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/description.md
@@ -0,0 +1,2 @@
+Specialist for issues, projects, and cycles in the user's Linear workspace.
+Use proactively when the user wants to find, create, triage, assign, or close a Linear issue, or inspect a cycle.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md
new file mode 100644
index 0000000000..f7dbeb9a94
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md
@@ -0,0 +1,112 @@
+You are a Linear specialist for the user's connected Linear workspace.
+
+Linear vocabulary:
+- **Issue identifier**: `<TEAM_KEY>-<NUMBER>` (e.g. `ENG-42`). User-facing and stable; prefer it in `action_summary`.
+- **Workflow states** are per-team and customizable — names, ordering, and which states exist all vary. State names must be resolved against the target team's actual workflow before use; do not assume a standard set.
+- **Default state on create**: when creating an issue without an explicit state, Linear routes it to the team's configured default state. Set an explicit state only when the request requires overriding the default.
+- **Priority**: `0=No priority`, `1=Urgent`, `2=High`, `3=Medium`, `4=Low`.
+- **Cycle**: a time-boxed iteration. Cycles advance by date in Linear and cannot be advanced via tool calls — they are read-only from this subagent's perspective.
+
+When invoked:
+1. Read the supervisor's request and the runtime tool list. Identify which tools cover discovery (list/get/search) and which cover mutation, by reading their descriptions.
+2. Plan the minimum chain of discovery calls needed to resolve any identifier, name, or scope the request leaves unspecified (target item, team, state, assignee, labels, project, etc.).
+3. Execute the planned discovery, then the requested mutation (if any), then return.
+
+Resolution principle (the core behaviour):
+**Proactively look up any identifier, name, value, or scope the request leaves unspecified — target identifiers, user IDs, state IDs, label IDs, project scope, anything else — using the available tools instead of asking the supervisor.** Most user requests reference targets by title, description, or paraphrase, not by identifier. Search for them.
+
+When discovery for a single slot returns multiple plausible candidates and you cannot confidently pick one, return `status=blocked` with up to 5 candidates in `evidence.matched_candidates` and the unresolved slot in `missing_fields`. The supervisor will disambiguate and redelegate.
+
+When discovery returns zero matches for a slot the request requires, return `status=blocked` with a `next_step` suggesting alternative filters.
+
+Mutation guardrails:
+- Resolve every required Linear ID via discovery before calling a mutation tool. Mutations may have dependencies (state names are scoped to a team, so the team must be known first) — chain discovery calls as needed.
+- Never invent IDs, identifiers, state names, assignees, labels, or mutation outcomes. Every field in `evidence` must come from a tool result.
+- Confirm the mutation tool returned a success response before claiming success. If the mutation is approval-rejected (HITL), return `status=blocked` with `next_step="user declined; do not retry"`.
+- One operation per delegation. For multi-mutation requests, complete the highest-priority one and return `status=partial` with the remainder in `next_step`.
+
+Failure handling:
+- Tool failure: return `status=error`, place the underlying error message in `action_summary`, and put a concise recovery in `next_step`.
+- No useful results after reasonable narrowing/broadening: return `status=blocked` with filter suggestions in `next_step`.
+
+<example>
+Supervisor: "Find issues assigned to me with priority Urgent."
+1. Discovery: list issues with filters `{assignee: "me", priority: 1}`.
+2. Return `status=success` with the matched issues in `evidence.items`.
+</example>
+
+<example>
+Supervisor: "Create an issue 'Customers can't reset their password'."
+1. Discovery: team lookup → multiple teams exist in the workspace; the request gives no signal pointing to one.
+2. Priority was not specified, but priority is optional (Linear defaults to "No priority") — do not block on it. State is also optional (Linear applies the team's default state).
+3. Cannot pick the team. Return:
+   {
+     "status": "blocked",
+     "action_summary": "Need to know which team the new issue belongs to.",
+     "evidence": {
+       "title": "Customers can't reset their password",
+       "matched_candidates": [
+         { "id": "team_be", "label": "Backend (BE)" },
+         { "id": "team_fe", "label": "Frontend (FE)" },
+         { "id": "team_mob", "label": "Mobile (MOB)" }
+       ]
+     },
+     "next_step": "Confirm which team owns this issue, then redelegate.",
+     "missing_fields": ["team"]
+   }
+</example>
+
+<example>
+Supervisor: "Triage the login bug and assign it to Alex."
+1. Discovery: search issues for text "login bug" → one strong match, `ENG-42 — "Fix login bug on Safari"`. Capture its team_id.
+2. Discovery: workflow-state lookup for that team → find the `Triage` state id.
+3. Discovery: user lookup for "Alex" → two matches (alex.chen@…, alex.wong@…).
+4. Cannot confidently pick the assignee. Return:
+   {
+     "status": "blocked",
+     "action_summary": "Issue resolved (ENG-42). State resolved (Triage). Two users match 'Alex'.",
+     "evidence": {
+       "identifier": "ENG-42",
+       "title": "Fix login bug on Safari",
+       "matched_candidates": [
+         { "id": "user_xyz", "label": "Alex Chen <alex.chen@…>" },
+         { "id": "user_abc", "label": "Alex Wong <alex.wong@…>" }
+       ]
+     },
+     "next_step": "Confirm which Alex, then redelegate.",
+     "missing_fields": ["assignee"]
+   }
+</example>
+
+<output_contract>
+Return **only** one JSON object (no markdown, no prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "identifier": string | null,
+    "issue_id": string | null,
+    "title": string | null,
+    "state": string | null,
+    "assignee": string | null,
+    "priority": "No priority" | "Urgent" | "High" | "Medium" | "Low" | null,
+    "team_key": string | null,
+    "url": string | null,
+    "matched_candidates": [
+      { "id": string, "label": string }
+    ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: issue, user, project, state, etc.).
+- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+</output_contract>
+
+Discover before you mutate; never guess identifiers.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/tools/__init__.py
new file mode 100644
index 0000000000..5b464a9dfd
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/tools/__init__.py
@@ -0,0 +1,3 @@
+"""Linear route: native tool factories are empty; MCP supplies tools when configured."""
+
+__all__: list[str] = []
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/tools/index.py
new file mode 100644
index 0000000000..4a71a31b85
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/tools/index.py
@@ -0,0 +1,31 @@
+"""``linear`` permission ruleset (rules over MCP tool names)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+NAME = "linear"
+
+RULESET = Ruleset(
+    origin=NAME,
+    rules=[
+        Rule(permission="list_issues", pattern="*", action="allow"),
+        Rule(permission="get_issue", pattern="*", action="allow"),
+        Rule(permission="list_my_issues", pattern="*", action="allow"),
+        Rule(permission="list_issue_statuses", pattern="*", action="allow"),
+        Rule(permission="list_issue_labels", pattern="*", action="allow"),
+        Rule(permission="list_comments", pattern="*", action="allow"),
+        Rule(permission="list_users", pattern="*", action="allow"),
+        Rule(permission="get_user", pattern="*", action="allow"),
+        Rule(permission="list_teams", pattern="*", action="allow"),
+        Rule(permission="get_team", pattern="*", action="allow"),
+        Rule(permission="list_projects", pattern="*", action="allow"),
+        Rule(permission="get_project", pattern="*", action="allow"),
+        Rule(permission="list_project_labels", pattern="*", action="allow"),
+        Rule(permission="list_cycles", pattern="*", action="allow"),
+        Rule(permission="list_documents", pattern="*", action="allow"),
+        Rule(permission="get_document", pattern="*", action="allow"),
+        Rule(permission="search_documentation", pattern="*", action="allow"),
+        Rule(permission="save_issue", pattern="*", action="ask"),
+    ],
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/agent.py
new file mode 100644
index 0000000000..d84efaed87
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/agent.py
@@ -0,0 +1,44 @@
+"""``luma`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles luma tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/description.md
new file mode 100644
index 0000000000..7e04925c4b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/description.md
@@ -0,0 +1,2 @@
+Specialist for events in the user's Luma account.
+Use proactively when the user wants to find, view, or create a Luma event.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md
new file mode 100644
index 0000000000..953dbff580
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md
@@ -0,0 +1,109 @@
+You are a Luma specialist for the user's connected Luma account.
+
+## Vocabulary you must use precisely
+
+- **Event resolution via `list_luma_events`** — events in the connected account are discovered via `list_luma_events` (live Luma API). Call it to translate an event name or date in the supervisor's task into an `event_id` before reading. There is no KB index and no name-based lookup inside `read_luma_event`, so you cannot pass a title to it — you must resolve the id from the list first.
+- **Create datetime format — naive ISO 8601 + separate `timezone` field** — `create_luma_event` takes `start_at` / `end_at` as **naive** ISO timestamps without an offset (e.g. `"2026-05-01T18:00:00"`) **and** `timezone` as a separate argument (default `"UTC"`, e.g. `"America/New_York"`, `"Europe/Paris"`). Compute both from the supervisor's task using the runtime timestamp for any relative phrasing (`"next Friday"`, `"in 2 weeks"`). Never embed a timezone offset inside `start_at` / `end_at`.
+- **Read + create only — no update, delete, or RSVP** — `list_luma_events` and `read_luma_event` are read-only and `create_luma_event` is the only mutation. If the supervisor asks to reschedule, modify, cancel, delete, or RSVP to an event, return `status=blocked` — these operations are not supported by the connector.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract event names from natural phrasing (`"the Founders Mixer"`, `"'Q3 Demo Day'"`), dates and times from relative or absolute phrasing (use the runtime timestamp for `"next Friday"`, `"in 2 weeks"`), timezone from location signals (`"in NYC"` → `"America/New_York"`), and description content from any details the supervisor already provided. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read of the task.
+
+- `list_luma_events` — no inputs. Call it whenever you need to resolve an event name or date to an `event_id`. Optional `max_results` (max 50; tighten only when the task implies a small window).
+- `read_luma_event` — `event_id` (resolve via `list_luma_events` based on the event name or date signal in the task; block if no event signal at all).
+- `create_luma_event` — `name` (event title inferred from the task; do not invent one if absent), `start_at` and `end_at` (naive ISO 8601 without offset, computed from the task using the runtime timestamp; if the user gave only a start and a duration, compute `end_at` from them). Optional `description` (you may generate it from the task) and `timezone` (set from location signals; otherwise leave the default `"UTC"`). Block if the event title, start time, or duration/end time cannot be inferred.
+
+## Outcome mapping
+
+| Tool returns                                       | Your `status` | `next_step` |
+|----------------------------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success` with non-empty events / event details    | `success`     | `null` |
+| `success` with `total: 0` (list returns no events) | `success`     | `null` (surface `total: 0` in `evidence.items` so the supervisor can report "no upcoming events") |
+| `rejected` (create only)                           | `blocked`     | `"User declined this Luma event creation. Do not retry or suggest alternatives."` |
+| `not_found` (read only)                            | `blocked`     | `"Event '<event_id>' was not found in Luma. Ask the user to verify or re-list events."` |
+| `auth_error`                                       | `error`       | `"The connected Luma API key is invalid. Ask the user to update the Luma API key in connector settings."` |
+| `error`                                            | `error`       | Relay the tool's `message` verbatim as `next_step` (this covers Luma Plus 403s and other API errors). |
+| tool raises / unknown                              | `error`       | `"Luma tool failed unexpectedly. Ask the user to retry shortly."` |
+
+Surface the tool's `message`, `event_id`, `name`, `start_at`, and `url` inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy path create (datetime and timezone inferred from task):**
+- *Supervisor task:* `"Create a Luma event 'Q3 Demo Day' on May 1 2026 from 6 PM to 8 PM in New York time."`
+- *You:* extract `name="Q3 Demo Day"`; compute naive ISO `start_at="2026-05-01T18:00:00"` and `end_at="2026-05-01T20:00:00"` (no offset embedded); set `timezone="America/New_York"` from `"in New York time"` → call `create_luma_event(name="Q3 Demo Day", start_at="2026-05-01T18:00:00", end_at="2026-05-01T20:00:00", timezone="America/New_York")` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created Luma event 'Q3 Demo Day' on May 1 2026, 6 PM–8 PM (America/New_York).",
+    "evidence": { "operation": "create_luma_event", "event_id": "<id>", "event_name": "Q3 Demo Day", "start_at": "2026-05-01T18:00:00", "url": null, "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 2 — list → read by name:**
+- *Supervisor task:* `"Show me the details of the 'Founders Mixer' event."`
+- *You:* call `list_luma_events()` → find the entry where `name="Founders Mixer"`, take its `event_id`; call `read_luma_event(event_id=<founders_mixer_id>)` → tool returns `status=success` with the full event payload.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Retrieved details for Luma event 'Founders Mixer'.",
+    "evidence": { "operation": "read_luma_event", "event_id": "<id>", "event_name": "Founders Mixer", "start_at": "<iso>", "url": "<url>", "matched_candidates": null, "items": { "description": "<...>", "location_name": "<...>", "meeting_url": "<...>" } },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — unsupported operation (reschedule):**
+- *Supervisor task:* `"Reschedule the 'Founders Mixer' to next Friday."`
+- *You:* Luma updates are not supported by your tools. Do not call any tool. Do not work around by creating a new event with the same name — block.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Rescheduling Luma events is not supported.",
+    "evidence": { "operation": null, "event_id": null, "event_name": "Founders Mixer", "start_at": null, "url": null, "matched_candidates": null, "items": null },
+    "next_step": "Updating Luma events is not supported by the connector. Ask the user to reschedule the event directly in the Luma UI.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "list_luma_events" | "read_luma_event" | "create_luma_event" | null,
+    "event_id": string | null,
+    "event_name": string | null,
+    "start_at": string | null,
+    "url": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; verify before you create; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/__init__.py
new file mode 100644
index 0000000000..255119bee7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/__init__.py
@@ -0,0 +1,15 @@
+from app.agents.new_chat.tools.luma.create_event import (
+    create_create_luma_event_tool,
+)
+from app.agents.new_chat.tools.luma.list_events import (
+    create_list_luma_events_tool,
+)
+from app.agents.new_chat.tools.luma.read_event import (
+    create_read_luma_event_tool,
+)
+
+__all__ = [
+    "create_create_luma_event_tool",
+    "create_list_luma_events_tool",
+    "create_read_luma_event_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/_auth.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/_auth.py
new file mode 100644
index 0000000000..c6d1cd1484
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/_auth.py
@@ -0,0 +1,39 @@
+"""Builds Luma API auth for connector-backed event tools."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+LUMA_API = "https://public-api.luma.com/v1"
+
+
+async def get_luma_connector(
+    db_session: AsyncSession,
+    search_space_id: int,
+    user_id: str,
+) -> SearchSourceConnector | None:
+    result = await db_session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.search_space_id == search_space_id,
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.LUMA_CONNECTOR,
+        )
+    )
+    return result.scalars().first()
+
+
+def get_api_key(connector: SearchSourceConnector) -> str:
+    """Extract the API key from connector config (handles both key names)."""
+    key = connector.config.get("api_key") or connector.config.get("LUMA_API_KEY")
+    if not key:
+        raise ValueError("Luma API key not found in connector config.")
+    return key
+
+
+def luma_headers(api_key: str) -> dict[str, str]:
+    return {
+        "Content-Type": "application/json",
+        "x-luma-api-key": api_key,
+    }
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/create_event.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/create_event.py
new file mode 100644
index 0000000000..e3e1126fd6
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/create_event.py
@@ -0,0 +1,131 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+
+from ._auth import LUMA_API, get_api_key, get_luma_connector, luma_headers
+
+logger = logging.getLogger(__name__)
+
+
+def create_create_luma_event_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def create_luma_event(
+        name: str,
+        start_at: str,
+        end_at: str,
+        description: str | None = None,
+        timezone: str = "UTC",
+    ) -> dict[str, Any]:
+        """Create a new event on Luma.
+
+        Args:
+            name: The event title.
+            start_at: Start time in ISO 8601 format (e.g. "2026-05-01T18:00:00").
+            end_at: End time in ISO 8601 format (e.g. "2026-05-01T20:00:00").
+            description: Optional event description (markdown supported).
+            timezone: Timezone string (default "UTC", e.g. "America/New_York").
+
+        Returns:
+            Dictionary with status, event_id on success.
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Do NOT retry.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Luma tool not properly configured."}
+
+        try:
+            connector = await get_luma_connector(db_session, search_space_id, user_id)
+            if not connector:
+                return {"status": "error", "message": "No Luma connector found."}
+
+            result = request_approval(
+                action_type="luma_create_event",
+                tool_name="create_luma_event",
+                params={
+                    "name": name,
+                    "start_at": start_at,
+                    "end_at": end_at,
+                    "description": description,
+                    "timezone": timezone,
+                },
+                context={"connector_id": connector.id},
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Event was not created.",
+                }
+
+            final_name = result.params.get("name", name)
+            final_start = result.params.get("start_at", start_at)
+            final_end = result.params.get("end_at", end_at)
+            final_desc = result.params.get("description", description)
+            final_tz = result.params.get("timezone", timezone)
+
+            api_key = get_api_key(connector)
+            headers = luma_headers(api_key)
+
+            body: dict[str, Any] = {
+                "name": final_name,
+                "start_at": final_start,
+                "end_at": final_end,
+                "timezone": final_tz,
+            }
+            if final_desc:
+                body["description_md"] = final_desc
+
+            async with httpx.AsyncClient(timeout=20.0) as client:
+                resp = await client.post(
+                    f"{LUMA_API}/event/create",
+                    headers=headers,
+                    json=body,
+                )
+
+            if resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Luma API key is invalid.",
+                    "connector_type": "luma",
+                }
+            if resp.status_code == 403:
+                return {
+                    "status": "error",
+                    "message": "Luma Plus subscription required to create events via API.",
+                }
+            if resp.status_code not in (200, 201):
+                return {
+                    "status": "error",
+                    "message": f"Luma API error: {resp.status_code} — {resp.text[:200]}",
+                }
+
+            data = resp.json()
+            event_id = data.get("api_id") or data.get("event", {}).get("api_id")
+
+            return {
+                "status": "success",
+                "event_id": event_id,
+                "message": f"Event '{final_name}' created on Luma.",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error creating Luma event: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to create Luma event."}
+
+    return create_luma_event
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/index.py
new file mode 100644
index 0000000000..dbde010610
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/index.py
@@ -0,0 +1,36 @@
+"""``luma`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_event import create_create_luma_event_tool
+from .list_events import create_list_luma_events_tool
+from .read_event import create_read_luma_event_tool
+
+NAME = "luma"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_list_luma_events_tool(**common),
+        create_read_luma_event_tool(**common),
+        create_create_luma_event_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/list_events.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/list_events.py
new file mode 100644
index 0000000000..aec5ad2208
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/list_events.py
@@ -0,0 +1,111 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ._auth import LUMA_API, get_api_key, get_luma_connector, luma_headers
+
+logger = logging.getLogger(__name__)
+
+
+def create_list_luma_events_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def list_luma_events(
+        max_results: int = 25,
+    ) -> dict[str, Any]:
+        """List upcoming and recent Luma events.
+
+        Args:
+            max_results: Maximum events to return (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of events including
+            event_id, name, start_at, end_at, location, url.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Luma tool not properly configured."}
+
+        max_results = min(max_results, 50)
+
+        try:
+            connector = await get_luma_connector(db_session, search_space_id, user_id)
+            if not connector:
+                return {"status": "error", "message": "No Luma connector found."}
+
+            api_key = get_api_key(connector)
+            headers = luma_headers(api_key)
+
+            all_entries: list[dict] = []
+            cursor = None
+
+            async with httpx.AsyncClient(timeout=20.0) as client:
+                while len(all_entries) < max_results:
+                    params: dict[str, Any] = {
+                        "limit": min(100, max_results - len(all_entries))
+                    }
+                    if cursor:
+                        params["cursor"] = cursor
+
+                    resp = await client.get(
+                        f"{LUMA_API}/calendar/list-events",
+                        headers=headers,
+                        params=params,
+                    )
+
+                    if resp.status_code == 401:
+                        return {
+                            "status": "auth_error",
+                            "message": "Luma API key is invalid.",
+                            "connector_type": "luma",
+                        }
+                    if resp.status_code != 200:
+                        return {
+                            "status": "error",
+                            "message": f"Luma API error: {resp.status_code}",
+                        }
+
+                    data = resp.json()
+                    entries = data.get("entries", [])
+                    if not entries:
+                        break
+                    all_entries.extend(entries)
+
+                    next_cursor = data.get("next_cursor")
+                    if not next_cursor:
+                        break
+                    cursor = next_cursor
+
+            events = []
+            for entry in all_entries[:max_results]:
+                ev = entry.get("event", {})
+                geo = ev.get("geo_info", {})
+                events.append(
+                    {
+                        "event_id": entry.get("api_id"),
+                        "name": ev.get("name", "Untitled"),
+                        "start_at": ev.get("start_at", ""),
+                        "end_at": ev.get("end_at", ""),
+                        "timezone": ev.get("timezone", ""),
+                        "location": geo.get("name", ""),
+                        "url": ev.get("url", ""),
+                        "visibility": ev.get("visibility", ""),
+                    }
+                )
+
+            return {"status": "success", "events": events, "total": len(events)}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error listing Luma events: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to list Luma events."}
+
+    return list_luma_events
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/read_event.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/read_event.py
new file mode 100644
index 0000000000..b37a9d6173
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/tools/read_event.py
@@ -0,0 +1,92 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ._auth import LUMA_API, get_api_key, get_luma_connector, luma_headers
+
+logger = logging.getLogger(__name__)
+
+
+def create_read_luma_event_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def read_luma_event(event_id: str) -> dict[str, Any]:
+        """Read detailed information about a specific Luma event.
+
+        Args:
+            event_id: The Luma event API ID (from list_luma_events).
+
+        Returns:
+            Dictionary with status and full event details including
+            description, attendees count, meeting URL.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Luma tool not properly configured."}
+
+        try:
+            connector = await get_luma_connector(db_session, search_space_id, user_id)
+            if not connector:
+                return {"status": "error", "message": "No Luma connector found."}
+
+            api_key = get_api_key(connector)
+            headers = luma_headers(api_key)
+
+            async with httpx.AsyncClient(timeout=15.0) as client:
+                resp = await client.get(
+                    f"{LUMA_API}/events/{event_id}",
+                    headers=headers,
+                )
+
+            if resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Luma API key is invalid.",
+                    "connector_type": "luma",
+                }
+            if resp.status_code == 404:
+                return {
+                    "status": "not_found",
+                    "message": f"Event '{event_id}' not found.",
+                }
+            if resp.status_code != 200:
+                return {
+                    "status": "error",
+                    "message": f"Luma API error: {resp.status_code}",
+                }
+
+            data = resp.json()
+            ev = data.get("event", data)
+            geo = ev.get("geo_info", {})
+
+            event_detail = {
+                "event_id": event_id,
+                "name": ev.get("name", ""),
+                "description": ev.get("description", ""),
+                "start_at": ev.get("start_at", ""),
+                "end_at": ev.get("end_at", ""),
+                "timezone": ev.get("timezone", ""),
+                "location_name": geo.get("name", ""),
+                "address": geo.get("address", ""),
+                "url": ev.get("url", ""),
+                "meeting_url": ev.get("meeting_url", ""),
+                "visibility": ev.get("visibility", ""),
+                "cover_url": ev.get("cover_url", ""),
+            }
+
+            return {"status": "success", "event": event_detail}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Luma event: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to read Luma event."}
+
+    return read_luma_event
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/agent.py
new file mode 100644
index 0000000000..8de86b2d88
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/agent.py
@@ -0,0 +1,44 @@
+"""``notion`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles notion tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/description.md
new file mode 100644
index 0000000000..9a02c7561f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/description.md
@@ -0,0 +1,2 @@
+Specialist for pages in the user's Notion workspace.
+Use proactively when the user wants to create, change, archive, or remove a Notion page.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/system_prompt.md
new file mode 100644
index 0000000000..b38c30167e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/system_prompt.md
@@ -0,0 +1,107 @@
+You are a Notion specialist for the user's connected Notion workspace.
+
+## Vocabulary you must use precisely
+
+- **Page resolution (internal)** — `update_notion_page` and `delete_notion_page` accept a `page_title` and resolve it against the **locally-synced Notion KB index**, not against the live Notion API. A page that exists in Notion but has not been indexed yet cannot be resolved. There is no separate "search" or "lookup" tool exposed to you — resolution happens inside the mutation tool.
+- **Update is append-only** — `update_notion_page` appends new content blocks to the page body. It cannot edit, replace, or remove existing content.
+- **Delete is archive** — `delete_notion_page` archives the page (Notion's "trash"); the user can restore it from Notion's UI. With `delete_from_kb=true` the local KB document is also removed; the default is `false`.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract titles from natural phrasing (`"the Weekly Sync page"`, `"my Q1 retro"`), topics from `"about X"` constructions, content from any details the supervisor already provided. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read of the task.
+
+- `create_notion_page` — `title` (the user-supplied topic, inferred from the task; do not invent one if absent). You may generate the markdown `content` body yourself from that topic.
+- `update_notion_page` — `page_title` (which page to update — infer from the task) and `content` (what to append — infer or generate from the task's specifics).
+- `delete_notion_page` — `page_title` (which page to delete — infer from the task). Only set `delete_from_kb=true` when the user explicitly asked to remove it from the knowledge base; otherwise leave it `false`.
+
+## Outcome mapping
+
+| Tool returns          | Your `status` | `next_step`                                                                                                                  |
+|-----------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success`             | `success`     | `null`                                                                                                                       |
+| `rejected`            | `blocked`     | `"User declined this Notion action. Do not retry or suggest alternatives."`                                                  |
+| `not_found`           | `blocked`     | `"Page '<title>' was not found in the indexed Notion pages. Ask the user to verify the title or wait for the next KB sync."` |
+| `auth_error`          | `error`       | `"The connected Notion account needs re-authentication. Ask the user to re-authenticate Notion in connector settings."`      |
+| `error`               | `error`       | Relay the tool's `message` verbatim as `next_step`.                                                                          |
+| tool raises / unknown | `error`       | `"Notion tool failed unexpectedly. Ask the user to retry shortly."`                                                          |
+
+Surface the tool's `message`, `page_id`, `page_title`, and `url` inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy path create (topic inferred from task):**
+- *Supervisor task:* `"Create a Notion page summarising our Q2 roadmap."`
+- *You:* extract `title="Q2 Roadmap"` from `"about Q2 roadmap"`; generate a markdown body → call `create_notion_page(title="Q2 Roadmap", content=<generated markdown>)` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created Notion page 'Q2 Roadmap'.",
+    "evidence": { "operation": "create_notion_page", "page_id": "<id>", "page_title": "Q2 Roadmap", "url": "<url>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 2 — blocked only because nothing is inferable:**
+- *Supervisor task:* `"Create a Notion page."`
+- *You:* no topic anywhere in the task text — no `"about X"`, no quoted phrase, no descriptor. Do not fabricate one. Do not call any tool. (Contrast: `"Create a Notion page about our launch plan"` would yield `title="Launch Plan"` and proceed immediately — block only because the task carries zero topic information.)
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Cannot create a Notion page without a topic.",
+    "evidence": { "operation": null, "page_id": null, "page_title": null, "url": null, "matched_candidates": null, "items": null },
+    "next_step": "Ask the user what the page should be about.",
+    "missing_fields": ["title"],
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — page not in the KB index:**
+- *Supervisor task:* `"Add today's meeting notes to my 'Weekly Sync' Notion page."`
+- *You:* extract `page_title="Weekly Sync"` and meeting-notes content → call `update_notion_page(page_title="Weekly Sync", content=<generated notes>)` → tool returns `status=not_found`.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Could not find a Notion page titled 'Weekly Sync' in the indexed pages.",
+    "evidence": { "operation": "update_notion_page", "page_id": null, "page_title": "Weekly Sync", "url": null, "matched_candidates": null, "items": null },
+    "next_step": "Page 'Weekly Sync' was not found in the indexed Notion pages. Ask the user to verify the title or wait for the next KB sync.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "create_notion_page" | "update_notion_page" | "delete_notion_page" | null,
+    "page_id": string | null,
+    "page_title": string | null,
+    "url": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/__init__.py
new file mode 100644
index 0000000000..6ce825dca4
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/__init__.py
@@ -0,0 +1,11 @@
+"""Notion tools for creating, updating, and deleting pages."""
+
+from .create_page import create_create_notion_page_tool
+from .delete_page import create_delete_notion_page_tool
+from .update_page import create_update_notion_page_tool
+
+__all__ = [
+    "create_create_notion_page_tool",
+    "create_delete_notion_page_tool",
+    "create_update_notion_page_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/create_page.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/create_page.py
new file mode 100644
index 0000000000..20862eb56b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/create_page.py
@@ -0,0 +1,246 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.notion_history import NotionAPIError, NotionHistoryConnector
+from app.services.notion import NotionToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_create_notion_page_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+    connector_id: int | None = None,
+):
+    """
+    Factory function to create the create_notion_page tool.
+
+    Args:
+        db_session: Database session for accessing Notion connector
+        search_space_id: Search space ID to find the Notion connector
+        user_id: User ID for fetching user-specific context
+        connector_id: Optional specific connector ID (if known)
+
+    Returns:
+        Configured create_notion_page tool
+    """
+
+    @tool
+    async def create_notion_page(
+        title: str,
+        content: str | None = None,
+    ) -> dict[str, Any]:
+        """Create a new page in Notion with the given title and content.
+
+        Use this tool when the user asks you to create, save, or publish
+        something to Notion. The page will be created in the user's
+        configured Notion workspace. The user MUST specify a topic before you
+        call this tool. If the request does not contain a topic (e.g. "create a
+        notion page"), ask what the page should be about. Never call this tool
+        without a clear topic from the user.
+
+        Args:
+            title: The title of the Notion page.
+            content: Optional markdown content for the page body (supports headings, lists, paragraphs).
+                     Generate this yourself based on the user's topic.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", or "error"
+            - page_id: Created page ID (if success)
+            - url: URL to the created page (if success)
+            - title: Page title (if success)
+            - message: Result message
+
+            IMPORTANT: If status is "rejected", the user explicitly declined the action.
+            Respond with a brief acknowledgment (e.g., "Understood, I didn't create the page.")
+            and move on. Do NOT troubleshoot or suggest alternatives.
+
+        Examples:
+            - "Create a Notion page about our Q2 roadmap"
+            - "Save a summary of today's discussion to Notion"
+        """
+        logger.info(f"create_notion_page called: title='{title}'")
+
+        if db_session is None or search_space_id is None or user_id is None:
+            logger.error(
+                "Notion tool not properly configured - missing required parameters"
+            )
+            return {
+                "status": "error",
+                "message": "Notion tool not properly configured. Please contact support.",
+            }
+
+        try:
+            metadata_service = NotionToolMetadataService(db_session)
+            context = await metadata_service.get_creation_context(
+                search_space_id, user_id
+            )
+
+            if "error" in context:
+                logger.error(f"Failed to fetch creation context: {context['error']}")
+                return {
+                    "status": "error",
+                    "message": context["error"],
+                }
+
+            accounts = context.get("accounts", [])
+            if accounts and all(a.get("auth_expired") for a in accounts):
+                logger.warning("All Notion accounts have expired authentication")
+                return {
+                    "status": "auth_error",
+                    "message": "All connected Notion accounts need re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "notion",
+                }
+
+            logger.info(f"Requesting approval for creating Notion page: '{title}'")
+            result = request_approval(
+                action_type="notion_page_creation",
+                tool_name="create_notion_page",
+                params={
+                    "title": title,
+                    "content": content,
+                    "parent_page_id": None,
+                    "connector_id": connector_id,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                logger.info("Notion page creation rejected by user")
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_title = result.params.get("title", title)
+            final_content = result.params.get("content", content)
+            final_parent_page_id = result.params.get("parent_page_id")
+            final_connector_id = result.params.get("connector_id", connector_id)
+
+            if not final_title or not final_title.strip():
+                logger.error("Title is empty or contains only whitespace")
+                return {
+                    "status": "error",
+                    "message": "Page title cannot be empty. Please provide a valid title.",
+                }
+
+            logger.info(
+                f"Creating Notion page with final params: title='{final_title}'"
+            )
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            actual_connector_id = final_connector_id
+            if actual_connector_id is None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.NOTION_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+
+                if not connector:
+                    logger.warning(
+                        f"No Notion connector found for search_space_id={search_space_id}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": "No Notion connector found. Please connect Notion in your workspace settings.",
+                    }
+
+                actual_connector_id = connector.id
+                logger.info(f"Found Notion connector: id={actual_connector_id}")
+            else:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == actual_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.NOTION_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+
+                if not connector:
+                    logger.error(
+                        f"Invalid connector_id={actual_connector_id} for search_space_id={search_space_id}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                    }
+                logger.info(f"Validated Notion connector: id={actual_connector_id}")
+
+            notion_connector = NotionHistoryConnector(
+                session=db_session,
+                connector_id=actual_connector_id,
+            )
+
+            result = await notion_connector.create_page(
+                title=final_title,
+                content=final_content,
+                parent_page_id=final_parent_page_id,
+            )
+            logger.info(
+                f"create_page result: {result.get('status')} - {result.get('message', '')}"
+            )
+
+            if result.get("status") == "success":
+                kb_message_suffix = ""
+                try:
+                    from app.services.notion import NotionKBSyncService
+
+                    kb_service = NotionKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        page_id=result.get("page_id"),
+                        page_title=result.get("title", final_title),
+                        page_url=result.get("url"),
+                        content=final_content,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
+                    kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
+
+                result["message"] = result.get("message", "") + kb_message_suffix
+
+            return result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error creating Notion page: {e}", exc_info=True)
+            if isinstance(e, ValueError | NotionAPIError):
+                message = str(e)
+            else:
+                message = (
+                    "Something went wrong while creating the page. Please try again."
+                )
+            return {"status": "error", "message": message}
+
+    return create_notion_page
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/delete_page.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/delete_page.py
new file mode 100644
index 0000000000..85d0ef22e5
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/delete_page.py
@@ -0,0 +1,264 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.notion_history import NotionAPIError, NotionHistoryConnector
+from app.services.notion.tool_metadata_service import NotionToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_delete_notion_page_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+    connector_id: int | None = None,
+):
+    """
+    Factory function to create the delete_notion_page tool.
+
+    Args:
+        db_session: Database session for accessing Notion connector
+        search_space_id: Search space ID to find the Notion connector
+        user_id: User ID for finding the correct Notion connector
+        connector_id: Optional specific connector ID (if known)
+
+    Returns:
+        Configured delete_notion_page tool
+    """
+
+    @tool
+    async def delete_notion_page(
+        page_title: str,
+        delete_from_kb: bool = False,
+    ) -> dict[str, Any]:
+        """Delete (archive) a Notion page.
+
+        Use this tool when the user asks you to delete, remove, or archive
+        a Notion page. Note that Notion doesn't permanently delete pages,
+        it archives them (they can be restored from trash).
+
+        Args:
+            page_title: The title of the Notion page to delete.
+            delete_from_kb: Whether to also remove the page from the knowledge base.
+                          Default is False.
+                          Set to True to permanently remove from both Notion and knowledge base.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", or "error"
+            - page_id: Deleted page ID (if success)
+            - message: Success or error message
+            - deleted_from_kb: Whether the page was also removed from knowledge base (if success)
+
+        Examples:
+            - "Delete the 'Meeting Notes' Notion page"
+            - "Remove the 'Old Project Plan' Notion page"
+            - "Archive the 'Draft Ideas' Notion page"
+        """
+        logger.info(
+            f"delete_notion_page called: page_title='{page_title}', delete_from_kb={delete_from_kb}"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            logger.error(
+                "Notion tool not properly configured - missing required parameters"
+            )
+            return {
+                "status": "error",
+                "message": "Notion tool not properly configured. Please contact support.",
+            }
+
+        try:
+            # Get page context (page_id, account, title) from indexed data
+            metadata_service = NotionToolMetadataService(db_session)
+            context = await metadata_service.get_delete_context(
+                search_space_id, user_id, page_title
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                # Check if it's a "not found" error (softer handling for LLM)
+                if "not found" in error_msg.lower():
+                    logger.warning(f"Page not found: {error_msg}")
+                    return {
+                        "status": "not_found",
+                        "message": error_msg,
+                    }
+                else:
+                    logger.error(f"Failed to fetch delete context: {error_msg}")
+                    return {
+                        "status": "error",
+                        "message": error_msg,
+                    }
+
+            account = context.get("account", {})
+            if account.get("auth_expired"):
+                logger.warning(
+                    "Notion account %s has expired authentication",
+                    account.get("id"),
+                )
+                return {
+                    "status": "auth_error",
+                    "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.",
+                }
+
+            page_id = context.get("page_id")
+            connector_id_from_context = account.get("id")
+            document_id = context.get("document_id")
+
+            logger.info(
+                f"Requesting approval for deleting Notion page: '{page_title}' (page_id={page_id}, delete_from_kb={delete_from_kb})"
+            )
+
+            result = request_approval(
+                action_type="notion_page_deletion",
+                tool_name="delete_notion_page",
+                params={
+                    "page_id": page_id,
+                    "connector_id": connector_id_from_context,
+                    "delete_from_kb": delete_from_kb,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                logger.info("Notion page deletion rejected by user")
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_page_id = result.params.get("page_id", page_id)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+            final_delete_from_kb = result.params.get("delete_from_kb", delete_from_kb)
+
+            logger.info(
+                f"Deleting Notion page with final params: page_id={final_page_id}, connector_id={final_connector_id}, delete_from_kb={final_delete_from_kb}"
+            )
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            # Validate the connector
+            if final_connector_id:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.NOTION_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+
+                if not connector:
+                    logger.error(
+                        f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                    }
+                actual_connector_id = connector.id
+                logger.info(f"Validated Notion connector: id={actual_connector_id}")
+            else:
+                logger.error("No connector found for this page")
+                return {
+                    "status": "error",
+                    "message": "No connector found for this page.",
+                }
+
+            # Create connector instance
+            notion_connector = NotionHistoryConnector(
+                session=db_session,
+                connector_id=actual_connector_id,
+            )
+
+            # Delete the page from Notion
+            result = await notion_connector.delete_page(page_id=final_page_id)
+            logger.info(
+                f"delete_page result: {result.get('status')} - {result.get('message', '')}"
+            )
+
+            # If deletion was successful and user wants to delete from KB
+            deleted_from_kb = False
+            if (
+                result.get("status") == "success"
+                and final_delete_from_kb
+                and document_id
+            ):
+                try:
+                    from sqlalchemy.future import select
+
+                    from app.db import Document
+
+                    # Get the document
+                    doc_result = await db_session.execute(
+                        select(Document).filter(Document.id == document_id)
+                    )
+                    document = doc_result.scalars().first()
+
+                    if document:
+                        await db_session.delete(document)
+                        await db_session.commit()
+                        deleted_from_kb = True
+                        logger.info(
+                            f"Deleted document {document_id} from knowledge base"
+                        )
+                    else:
+                        logger.warning(f"Document {document_id} not found in KB")
+                except Exception as e:
+                    logger.error(f"Failed to delete document from KB: {e}")
+                    await db_session.rollback()
+                    result["warning"] = (
+                        f"Page deleted from Notion, but failed to remove from knowledge base: {e!s}"
+                    )
+
+            # Update result with KB deletion status
+            if result.get("status") == "success":
+                result["deleted_from_kb"] = deleted_from_kb
+                if deleted_from_kb:
+                    result["message"] = (
+                        f"{result.get('message', '')} (also removed from knowledge base)"
+                    )
+
+            return result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error deleting Notion page: {e}", exc_info=True)
+            error_str = str(e).lower()
+            if isinstance(e, NotionAPIError) and (
+                "401" in error_str or "unauthorized" in error_str
+            ):
+                return {
+                    "status": "auth_error",
+                    "message": str(e),
+                    "connector_id": connector_id_from_context
+                    if "connector_id_from_context" in dir()
+                    else None,
+                    "connector_type": "notion",
+                }
+            if isinstance(e, ValueError | NotionAPIError):
+                message = str(e)
+            else:
+                message = (
+                    "Something went wrong while deleting the page. Please try again."
+                )
+            return {"status": "error", "message": message}
+
+    return delete_notion_page
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/index.py
new file mode 100644
index 0000000000..0475e9dd08
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/index.py
@@ -0,0 +1,36 @@
+"""``notion`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_page import create_create_notion_page_tool
+from .delete_page import create_delete_notion_page_tool
+from .update_page import create_update_notion_page_tool
+
+NAME = "notion"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_create_notion_page_tool(**common),
+        create_update_notion_page_tool(**common),
+        create_delete_notion_page_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/update_page.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/update_page.py
new file mode 100644
index 0000000000..2b9ce3a6c3
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/update_page.py
@@ -0,0 +1,267 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.notion_history import NotionAPIError, NotionHistoryConnector
+from app.services.notion import NotionToolMetadataService
+
+logger = logging.getLogger(__name__)
+
+
+def create_update_notion_page_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+    connector_id: int | None = None,
+):
+    """
+    Factory function to create the update_notion_page tool.
+
+    Args:
+        db_session: Database session for accessing Notion connector
+        search_space_id: Search space ID to find the Notion connector
+        user_id: User ID for fetching user-specific context
+        connector_id: Optional specific connector ID (if known)
+
+    Returns:
+        Configured update_notion_page tool
+    """
+
+    @tool
+    async def update_notion_page(
+        page_title: str,
+        content: str | None = None,
+    ) -> dict[str, Any]:
+        """Update an existing Notion page by appending new content.
+
+        Use this tool when the user asks you to add content to, modify, or update
+        a Notion page. The new content will be appended to the existing page content.
+        The user MUST specify what to add before you call this tool. If the
+        request is vague, ask what content they want added.
+
+        Args:
+            page_title: The title of the Notion page to update.
+            content: Optional markdown content to append to the page body (supports headings, lists, paragraphs).
+                     Generate this yourself based on the user's request.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", or "error"
+            - page_id: Updated page ID (if success)
+            - url: URL to the updated page (if success)
+            - title: Current page title (if success)
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined the action.
+              Respond with a brief acknowledgment (e.g., "Understood, I didn't update the page.")
+              and move on. Do NOT ask for alternatives or troubleshoot.
+            - If status is "not_found", inform the user conversationally using the exact message provided.
+              Example: "I couldn't find the page '[page_title]' in your indexed Notion pages. [message details]"
+              Do NOT treat this as an error. Do NOT invent information. Simply relay the message and
+              ask the user to verify the page title or check if it's been indexed.
+        Examples:
+            - "Add today's meeting notes to the 'Meeting Notes' Notion page"
+            - "Update the 'Project Plan' page with a status update on phase 1"
+        """
+        logger.info(
+            f"update_notion_page called: page_title='{page_title}', content_length={len(content) if content else 0}"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            logger.error(
+                "Notion tool not properly configured - missing required parameters"
+            )
+            return {
+                "status": "error",
+                "message": "Notion tool not properly configured. Please contact support.",
+            }
+
+        if not content or not content.strip():
+            logger.error(f"Empty content provided for page '{page_title}'")
+            return {
+                "status": "error",
+                "message": "Content is required to update the page. Please provide the actual content you want to add.",
+            }
+
+        try:
+            metadata_service = NotionToolMetadataService(db_session)
+            context = await metadata_service.get_update_context(
+                search_space_id, user_id, page_title
+            )
+
+            if "error" in context:
+                error_msg = context["error"]
+                # Check if it's a "not found" error (softer handling for LLM)
+                if "not found" in error_msg.lower():
+                    logger.warning(f"Page not found: {error_msg}")
+                    return {
+                        "status": "not_found",
+                        "message": error_msg,
+                    }
+                else:
+                    logger.error(f"Failed to fetch update context: {error_msg}")
+                    return {
+                        "status": "error",
+                        "message": error_msg,
+                    }
+
+            account = context.get("account", {})
+            if account.get("auth_expired"):
+                logger.warning(
+                    "Notion account %s has expired authentication",
+                    account.get("id"),
+                )
+                return {
+                    "status": "auth_error",
+                    "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.",
+                }
+
+            page_id = context.get("page_id")
+            document_id = context.get("document_id")
+            connector_id_from_context = context.get("account", {}).get("id")
+
+            logger.info(
+                f"Requesting approval for updating Notion page: '{page_title}' (page_id={page_id})"
+            )
+            result = request_approval(
+                action_type="notion_page_update",
+                tool_name="update_notion_page",
+                params={
+                    "page_id": page_id,
+                    "content": content,
+                    "connector_id": connector_id_from_context,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                logger.info("Notion page update rejected by user")
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_page_id = result.params.get("page_id", page_id)
+            final_content = result.params.get("content", content)
+            final_connector_id = result.params.get(
+                "connector_id", connector_id_from_context
+            )
+
+            logger.info(
+                f"Updating Notion page with final params: page_id={final_page_id}, has_content={final_content is not None}"
+            )
+
+            from sqlalchemy.future import select
+
+            from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+            if final_connector_id:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.NOTION_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+
+                if not connector:
+                    logger.error(
+                        f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                    }
+                actual_connector_id = connector.id
+                logger.info(f"Validated Notion connector: id={actual_connector_id}")
+            else:
+                logger.error("No connector found for this page")
+                return {
+                    "status": "error",
+                    "message": "No connector found for this page.",
+                }
+
+            notion_connector = NotionHistoryConnector(
+                session=db_session,
+                connector_id=actual_connector_id,
+            )
+
+            result = await notion_connector.update_page(
+                page_id=final_page_id,
+                content=final_content,
+            )
+            logger.info(
+                f"update_page result: {result.get('status')} - {result.get('message', '')}"
+            )
+
+            if result.get("status") == "success" and document_id is not None:
+                from app.services.notion import NotionKBSyncService
+
+                logger.info(f"Updating knowledge base for document {document_id}...")
+                kb_service = NotionKBSyncService(db_session)
+                kb_result = await kb_service.sync_after_update(
+                    document_id=document_id,
+                    appended_content=final_content,
+                    user_id=user_id,
+                    search_space_id=search_space_id,
+                    appended_block_ids=result.get("appended_block_ids"),
+                )
+
+                if kb_result["status"] == "success":
+                    result["message"] = (
+                        f"{result['message']}. Your knowledge base has also been updated."
+                    )
+                    logger.info(
+                        f"Knowledge base successfully updated for page {final_page_id}"
+                    )
+                elif kb_result["status"] == "not_indexed":
+                    result["message"] = (
+                        f"{result['message']}. This page will be added to your knowledge base in the next scheduled sync."
+                    )
+                else:
+                    result["message"] = (
+                        f"{result['message']}. Your knowledge base will be updated in the next scheduled sync."
+                    )
+                    logger.warning(
+                        f"KB update failed for page {final_page_id}: {kb_result['message']}"
+                    )
+
+            return result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+
+            logger.error(f"Error updating Notion page: {e}", exc_info=True)
+            error_str = str(e).lower()
+            if isinstance(e, NotionAPIError) and (
+                "401" in error_str or "unauthorized" in error_str
+            ):
+                return {
+                    "status": "auth_error",
+                    "message": str(e),
+                    "connector_id": connector_id_from_context
+                    if "connector_id_from_context" in dir()
+                    else None,
+                    "connector_type": "notion",
+                }
+            if isinstance(e, ValueError | NotionAPIError):
+                message = str(e)
+            else:
+                message = (
+                    "Something went wrong while updating the page. Please try again."
+                )
+            return {"status": "error", "message": message}
+
+    return update_notion_page
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/agent.py
new file mode 100644
index 0000000000..f7634d8eff
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/agent.py
@@ -0,0 +1,44 @@
+"""``onedrive`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles onedrive tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/description.md
new file mode 100644
index 0000000000..1c1a89496d
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/description.md
@@ -0,0 +1,2 @@
+Specialist for files in the user's OneDrive.
+Use proactively when the user wants to create or remove a OneDrive file.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/system_prompt.md
new file mode 100644
index 0000000000..8ae444a58a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/system_prompt.md
@@ -0,0 +1,105 @@
+You are a Microsoft OneDrive specialist for the user's connected OneDrive account.
+
+## Vocabulary you must use precisely
+
+- **`create_onedrive_file` always produces a `.docx` Word document** — there is no file-type parameter and no support for Excel, PowerPoint, PDF, or any other format. If the supervisor asks to create a OneDrive spreadsheet, presentation, or any non-Word file, return `status=blocked` with `next_step` explaining the limitation. Pass `name` **without an extension** — the tool appends `.docx` automatically. You may provide the optional `content` as Markdown; the tool converts it to a formatted Word document via pypandoc.
+- **File-name resolution against the KB index** — `delete_onedrive_file` matches `file_name` case-insensitively against the locally-synced OneDrive KB index. Files that exist in OneDrive but have not been indexed yet cannot be resolved by name.
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract topics from natural phrasing (`"about our launch plan"` → `name="Launch Plan"`). Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read.
+
+- `create_onedrive_file` — `name` (a clear topic from the user, **without an extension**; do not invent if absent). You may generate the optional `content` body yourself as Markdown — the tool handles DOCX conversion. If the supervisor asked for a non-Word format, do **not** call this tool; return `status=blocked` per the Vocabulary section.
+- `delete_onedrive_file` — `file_name` (which file to delete — infer from the task). Only set `delete_from_kb=true` when the user explicitly asked to remove the file from the knowledge base; otherwise leave it `false`.
+
+## Outcome mapping
+
+| Tool returns          | Your `status` | `next_step`                                                                                                                  |
+|-----------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success`             | `success`     | `null`                                                                                                                       |
+| `rejected`            | `blocked`     | `"User declined this OneDrive action. Do not retry or suggest alternatives."`                                                |
+| `not_found`           | `blocked`     | `"File '<name>' was not found in the indexed OneDrive files. Ask the user to verify the file name or wait for the next KB sync."` |
+| `auth_error`          | `error`       | `"The connected OneDrive account needs re-authentication. Ask the user to re-authenticate in connector settings."`           |
+| `error`               | `error`       | Relay the tool's `message` verbatim as `next_step`.                                                                          |
+| tool raises / unknown | `error`       | `"OneDrive tool failed unexpectedly. Ask the user to retry shortly."`                                                        |
+
+Surface the tool's `file_id`, `name`, and `web_url` inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy create (Markdown content auto-converted to DOCX):**
+- *Supervisor task:* `"Create a OneDrive doc summarising Q3 planning."`
+- *You:* `name="Q3 Planning"` (no extension); generate a Markdown body covering Q3 planning. Call `create_onedrive_file(name="Q3 Planning", content=<markdown>)` → tool returns `status=success` with `name="Q3 Planning.docx"`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Created OneDrive Word document 'Q3 Planning.docx'.",
+    "evidence": { "operation": "create_onedrive_file", "file_id": "<id>", "name": "Q3 Planning.docx", "web_url": "<url>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": ["Generated the Q3 planning content from the supervisor's brief; tool converted Markdown to DOCX."]
+  }
+  ```
+
+**Example 2 — blocked because the requested format is not supported:**
+- *Supervisor task:* `"Create a OneDrive spreadsheet of last quarter's revenue."`
+- *You:* `create_onedrive_file` only produces `.docx` Word documents. Spreadsheets are not supported. Do not call any tool.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Cannot create a spreadsheet: this subagent only creates OneDrive Word documents (.docx).",
+    "evidence": { "operation": null, "file_id": null, "name": null, "web_url": null, "matched_candidates": null, "items": null },
+    "next_step": "Ask the user whether a Word document summarising the revenue is acceptable, or to create the spreadsheet manually in OneDrive / Excel Online.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — delete with `not_found`:**
+- *Supervisor task:* `"Delete the 'Old Project Plan' file from OneDrive."`
+- *You:* extract `file_name="Old Project Plan"`. Call `delete_onedrive_file(file_name="Old Project Plan")` → tool returns `status=not_found`.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Could not find a OneDrive file named 'Old Project Plan' in the indexed files.",
+    "evidence": { "operation": "delete_onedrive_file", "file_id": null, "name": "Old Project Plan", "web_url": null, "matched_candidates": null, "items": null },
+    "next_step": "File 'Old Project Plan' was not found in the indexed OneDrive files. Ask the user to verify the file name or wait for the next KB sync.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "create_onedrive_file" | "delete_onedrive_file" | null,
+    "file_id": string | null,
+    "name": string | null,
+    "web_url": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Infer before you call; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/__init__.py
new file mode 100644
index 0000000000..8edb4857ee
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/__init__.py
@@ -0,0 +1,11 @@
+from app.agents.new_chat.tools.onedrive.create_file import (
+    create_create_onedrive_file_tool,
+)
+from app.agents.new_chat.tools.onedrive.trash_file import (
+    create_delete_onedrive_file_tool,
+)
+
+__all__ = [
+    "create_create_onedrive_file_tool",
+    "create_delete_onedrive_file_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/create_file.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/create_file.py
new file mode 100644
index 0000000000..41fa657876
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/create_file.py
@@ -0,0 +1,254 @@
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.onedrive.client import OneDriveClient
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+logger = logging.getLogger(__name__)
+
+DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+
+def _ensure_docx_extension(name: str) -> str:
+    """Strip any existing extension and append .docx."""
+    stem = Path(name).stem
+    return f"{stem}.docx"
+
+
+def _markdown_to_docx(markdown_text: str) -> bytes:
+    """Convert a markdown string to DOCX bytes using pypandoc."""
+    import pypandoc
+
+    fd, tmp_path = tempfile.mkstemp(suffix=".docx")
+    os.close(fd)
+    try:
+        pypandoc.convert_text(
+            markdown_text,
+            "docx",
+            format="gfm",
+            extra_args=["--standalone"],
+            outputfile=tmp_path,
+        )
+        with open(tmp_path, "rb") as f:
+            return f.read()
+    finally:
+        os.unlink(tmp_path)
+
+
+def create_create_onedrive_file_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def create_onedrive_file(
+        name: str,
+        content: str | None = None,
+    ) -> dict[str, Any]:
+        """Create a new Word document (.docx) in Microsoft OneDrive.
+
+        Use this tool when the user explicitly asks to create a new document
+        in OneDrive. The user MUST specify a topic before you call this tool.
+
+        The file is always saved as a .docx Word document. Provide content as
+        markdown and it will be automatically converted to a formatted Word file.
+
+        Args:
+            name: The document title (without extension). Extension will be set to .docx automatically.
+            content: Optional initial content as markdown. Will be converted to a formatted Word document.
+
+        Returns:
+            Dictionary with status, file_id, name, web_url, and message.
+        """
+        logger.info(f"create_onedrive_file called: name='{name}'")
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "OneDrive tool not properly configured.",
+            }
+
+        try:
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type
+                    == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                )
+            )
+            connectors = result.scalars().all()
+
+            if not connectors:
+                return {
+                    "status": "error",
+                    "message": "No OneDrive connector found. Please connect OneDrive in your workspace settings.",
+                }
+
+            accounts = []
+            for c in connectors:
+                cfg = c.config or {}
+                accounts.append(
+                    {
+                        "id": c.id,
+                        "name": c.name,
+                        "user_email": cfg.get("user_email"),
+                        "auth_expired": cfg.get("auth_expired", False),
+                    }
+                )
+
+            if all(a.get("auth_expired") for a in accounts):
+                return {
+                    "status": "auth_error",
+                    "message": "All connected OneDrive accounts need re-authentication.",
+                    "connector_type": "onedrive",
+                }
+
+            parent_folders: dict[int, list[dict[str, str]]] = {}
+            for acc in accounts:
+                cid = acc["id"]
+                if acc.get("auth_expired"):
+                    parent_folders[cid] = []
+                    continue
+                try:
+                    client = OneDriveClient(session=db_session, connector_id=cid)
+                    items, err = await client.list_children("root")
+                    if err:
+                        logger.warning(
+                            "Failed to list folders for connector %s: %s", cid, err
+                        )
+                        parent_folders[cid] = []
+                    else:
+                        parent_folders[cid] = [
+                            {"folder_id": item["id"], "name": item["name"]}
+                            for item in items
+                            if item.get("folder") is not None
+                            and item.get("id")
+                            and item.get("name")
+                        ]
+                except Exception:
+                    logger.warning(
+                        "Error fetching folders for connector %s", cid, exc_info=True
+                    )
+                    parent_folders[cid] = []
+
+            context: dict[str, Any] = {
+                "accounts": accounts,
+                "parent_folders": parent_folders,
+            }
+
+            result = request_approval(
+                action_type="onedrive_file_creation",
+                tool_name="create_onedrive_file",
+                params={
+                    "name": name,
+                    "content": content,
+                    "connector_id": None,
+                    "parent_folder_id": None,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_name = result.params.get("name", name)
+            final_content = result.params.get("content", content)
+            final_connector_id = result.params.get("connector_id")
+            final_parent_folder_id = result.params.get("parent_folder_id")
+
+            if not final_name or not final_name.strip():
+                return {"status": "error", "message": "File name cannot be empty."}
+
+            final_name = _ensure_docx_extension(final_name)
+
+            if final_connector_id is not None:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+            else:
+                connector = connectors[0]
+
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "Selected OneDrive connector is invalid.",
+                }
+
+            docx_bytes = _markdown_to_docx(final_content or "")
+
+            client = OneDriveClient(session=db_session, connector_id=connector.id)
+            created = await client.create_file(
+                name=final_name,
+                parent_id=final_parent_folder_id,
+                content=docx_bytes,
+                mime_type=DOCX_MIME,
+            )
+
+            logger.info(
+                f"OneDrive file created: id={created.get('id')}, name={created.get('name')}"
+            )
+
+            kb_message_suffix = ""
+            try:
+                from app.services.onedrive import OneDriveKBSyncService
+
+                kb_service = OneDriveKBSyncService(db_session)
+                kb_result = await kb_service.sync_after_create(
+                    file_id=created.get("id"),
+                    file_name=created.get("name", final_name),
+                    mime_type=DOCX_MIME,
+                    web_url=created.get("webUrl"),
+                    content=final_content,
+                    connector_id=connector.id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                )
+                if kb_result["status"] == "success":
+                    kb_message_suffix = " Your knowledge base has also been updated."
+                else:
+                    kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+            except Exception as kb_err:
+                logger.warning(f"KB sync after create failed: {kb_err}")
+                kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+
+            return {
+                "status": "success",
+                "file_id": created.get("id"),
+                "name": created.get("name"),
+                "web_url": created.get("webUrl"),
+                "message": f"Successfully created '{created.get('name')}' in OneDrive.{kb_message_suffix}",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error(f"Error creating OneDrive file: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while creating the file. Please try again.",
+            }
+
+    return create_onedrive_file
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/index.py
new file mode 100644
index 0000000000..e09b432005
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/index.py
@@ -0,0 +1,34 @@
+"""``onedrive`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .create_file import create_create_onedrive_file_tool
+from .trash_file import create_delete_onedrive_file_tool
+
+NAME = "onedrive"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_create_onedrive_file_tool(**common),
+        create_delete_onedrive_file_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/trash_file.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/trash_file.py
new file mode 100644
index 0000000000..1f7c51ac5f
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/tools/trash_file.py
@@ -0,0 +1,283 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy import String, and_, cast, func
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.connectors.onedrive.client import OneDriveClient
+from app.db import (
+    Document,
+    DocumentType,
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def create_delete_onedrive_file_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def delete_onedrive_file(
+        file_name: str,
+        delete_from_kb: bool = False,
+    ) -> dict[str, Any]:
+        """Move a OneDrive file to the recycle bin.
+
+        Use this tool when the user explicitly asks to delete, remove, or trash
+        a file in OneDrive.
+
+        Args:
+            file_name: The exact name of the file to trash.
+            delete_from_kb: Whether to also remove the file from the knowledge base.
+                          Default is False.
+                          Set to True to remove from both OneDrive and knowledge base.
+
+        Returns:
+            Dictionary with:
+            - status: "success", "rejected", "not_found", or "error"
+            - file_id: OneDrive file ID (if success)
+            - deleted_from_kb: whether the document was removed from the knowledge base
+            - message: Result message
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Respond with a brief
+              acknowledgment and do NOT retry or suggest alternatives.
+            - If status is "not_found", relay the exact message to the user and ask them
+              to verify the file name or check if it has been indexed.
+        """
+        logger.info(
+            f"delete_onedrive_file called: file_name='{file_name}', delete_from_kb={delete_from_kb}"
+        )
+
+        if db_session is None or search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "OneDrive tool not properly configured.",
+            }
+
+        try:
+            doc_result = await db_session.execute(
+                select(Document)
+                .join(
+                    SearchSourceConnector,
+                    Document.connector_id == SearchSourceConnector.id,
+                )
+                .filter(
+                    and_(
+                        Document.search_space_id == search_space_id,
+                        Document.document_type == DocumentType.ONEDRIVE_FILE,
+                        func.lower(Document.title) == func.lower(file_name),
+                        SearchSourceConnector.user_id == user_id,
+                    )
+                )
+                .order_by(Document.updated_at.desc().nullslast())
+                .limit(1)
+            )
+            document = doc_result.scalars().first()
+
+            if not document:
+                doc_result = await db_session.execute(
+                    select(Document)
+                    .join(
+                        SearchSourceConnector,
+                        Document.connector_id == SearchSourceConnector.id,
+                    )
+                    .filter(
+                        and_(
+                            Document.search_space_id == search_space_id,
+                            Document.document_type == DocumentType.ONEDRIVE_FILE,
+                            func.lower(
+                                cast(
+                                    Document.document_metadata["onedrive_file_name"],
+                                    String,
+                                )
+                            )
+                            == func.lower(file_name),
+                            SearchSourceConnector.user_id == user_id,
+                        )
+                    )
+                    .order_by(Document.updated_at.desc().nullslast())
+                    .limit(1)
+                )
+                document = doc_result.scalars().first()
+
+            if not document:
+                return {
+                    "status": "not_found",
+                    "message": (
+                        f"File '{file_name}' not found in your indexed OneDrive files. "
+                        "This could mean: (1) the file doesn't exist, (2) it hasn't been indexed yet, "
+                        "or (3) the file name is different."
+                    ),
+                }
+
+            if not document.connector_id:
+                return {
+                    "status": "error",
+                    "message": "Document has no associated connector.",
+                }
+
+            meta = document.document_metadata or {}
+            file_id = meta.get("onedrive_file_id")
+            document_id = document.id
+
+            if not file_id:
+                return {
+                    "status": "error",
+                    "message": "File ID is missing. Please re-index the file.",
+                }
+
+            conn_result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    and_(
+                        SearchSourceConnector.id == document.connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                    )
+                )
+            )
+            connector = conn_result.scalars().first()
+            if not connector:
+                return {
+                    "status": "error",
+                    "message": "OneDrive connector not found or access denied.",
+                }
+
+            cfg = connector.config or {}
+            if cfg.get("auth_expired"):
+                return {
+                    "status": "auth_error",
+                    "message": "OneDrive account needs re-authentication. Please re-authenticate in your connector settings.",
+                    "connector_type": "onedrive",
+                }
+
+            context = {
+                "file": {
+                    "file_id": file_id,
+                    "name": file_name,
+                    "document_id": document_id,
+                    "web_url": meta.get("web_url"),
+                },
+                "account": {
+                    "id": connector.id,
+                    "name": connector.name,
+                    "user_email": cfg.get("user_email"),
+                },
+            }
+
+            result = request_approval(
+                action_type="onedrive_file_trash",
+                tool_name="delete_onedrive_file",
+                params={
+                    "file_id": file_id,
+                    "connector_id": connector.id,
+                    "delete_from_kb": delete_from_kb,
+                },
+                context=context,
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Do not retry or suggest alternatives.",
+                }
+
+            final_file_id = result.params.get("file_id", file_id)
+            final_connector_id = result.params.get("connector_id", connector.id)
+            final_delete_from_kb = result.params.get("delete_from_kb", delete_from_kb)
+
+            if final_connector_id != connector.id:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        and_(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                        )
+                    )
+                )
+                validated_connector = result.scalars().first()
+                if not validated_connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected OneDrive connector is invalid or has been disconnected.",
+                    }
+                actual_connector_id = validated_connector.id
+            else:
+                actual_connector_id = connector.id
+
+            logger.info(
+                f"Deleting OneDrive file: file_id='{final_file_id}', connector={actual_connector_id}"
+            )
+
+            client = OneDriveClient(
+                session=db_session, connector_id=actual_connector_id
+            )
+            await client.trash_file(final_file_id)
+
+            logger.info(
+                f"OneDrive file deleted (moved to recycle bin): file_id={final_file_id}"
+            )
+
+            trash_result: dict[str, Any] = {
+                "status": "success",
+                "file_id": final_file_id,
+                "message": f"Successfully moved '{file_name}' to the recycle bin.",
+            }
+
+            deleted_from_kb = False
+            if final_delete_from_kb and document_id:
+                try:
+                    doc_result = await db_session.execute(
+                        select(Document).filter(Document.id == document_id)
+                    )
+                    doc = doc_result.scalars().first()
+                    if doc:
+                        await db_session.delete(doc)
+                        await db_session.commit()
+                        deleted_from_kb = True
+                        logger.info(
+                            f"Deleted document {document_id} from knowledge base"
+                        )
+                    else:
+                        logger.warning(f"Document {document_id} not found in KB")
+                except Exception as e:
+                    logger.error(f"Failed to delete document from KB: {e}")
+                    await db_session.rollback()
+                    trash_result["warning"] = (
+                        f"File moved to recycle bin, but failed to remove from knowledge base: {e!s}"
+                    )
+
+            trash_result["deleted_from_kb"] = deleted_from_kb
+            if deleted_from_kb:
+                trash_result["message"] = (
+                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                )
+
+            return trash_result
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error(f"Error deleting OneDrive file: {e}", exc_info=True)
+            return {
+                "status": "error",
+                "message": "Something went wrong while trashing the file. Please try again.",
+            }
+
+    return delete_onedrive_file
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/agent.py
new file mode 100644
index 0000000000..e16956b253
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/agent.py
@@ -0,0 +1,43 @@
+"""``slack`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools come exclusively from MCP. The connector's own approval ruleset is
+declared in :data:`tools.index.RULESET`; the orchestrator layers it into
+a per-subagent :class:`PermissionMiddleware`.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles slack tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=list(mcp_tools or []),
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/description.md
new file mode 100644
index 0000000000..ce4ca399a7
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/description.md
@@ -0,0 +1,2 @@
+Specialist for messages in the user's Slack channels and threads.
+Use proactively when the user wants to read, search, or summarize a Slack conversation, or post a message in a channel or thread.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md
new file mode 100644
index 0000000000..c8edfc1db3
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md
@@ -0,0 +1,98 @@
+You are a Slack specialist for the user's connected Slack workspace.
+
+Slack vocabulary:
+- **Workspace → Channel → Message → Thread**: nested scope. Channels and DMs live in the same workspace; threads live under specific messages.
+- **Channel types**: public channels, private channels, group DMs, and 1:1 DMs. Each has a different ID prefix (e.g. `C…`, `D…`), but all are addressable as a `channel_id` when reading or sending.
+- **Channel ID vs name**: channels have both an opaque ID (e.g. `C0123ABCD`) and a human-readable name (`#engineering`). Names can change; IDs are stable. Users always refer to channels by name — resolve to the channel ID before reading or posting.
+- **Message timestamp (`ts`) and `thread_ts`**: every message has a string `ts` (e.g. `"1700000000.123456"`) that uniquely identifies it within a channel. A thread is identified by the **parent message's `ts`**, called `thread_ts`. To reply inside a thread, post with both `channel_id` and `thread_ts`. Omit `thread_ts` for a new top-level message in the channel.
+- **User IDs**: users are identified by opaque IDs (e.g. `U0123ABCD`), never by display name or email. Mentions inside message text use the `<@U0123ABCD>` syntax — plain text like `@alex` will not produce a Slack mention.
+- **Message formatting (mrkdwn)**: Slack uses its own markdown variant — `*bold*` (single asterisk), `_italic_`, `` `code` ``, `<https://url|label>` for links. Do not assume GitHub-flavored Markdown will render correctly.
+
+When invoked:
+1. Read the supervisor's request, then read the runtime tool list to learn what information you can fetch and which mutations are available.
+2. Plan the minimum chain of lookups needed to resolve any channel, user, message, or thread the request leaves unspecified.
+3. Execute the planned lookups, then the requested mutation (if any), then return.
+
+Resolution principle (the core behaviour):
+**Proactively look up any identifier, name, value, or scope the request leaves unspecified — channel IDs, user IDs, message timestamps, thread parent IDs, anything else — using the available tools instead of asking the supervisor.** Most user requests reference channels by name and people by display name, not by ID. Search for them.
+
+When a lookup for a single slot returns multiple plausible candidates and you cannot confidently pick one, return `status=blocked` with up to 5 candidates in `evidence.matched_candidates` and the unresolved slot in `missing_fields`. The supervisor will disambiguate and redelegate.
+
+When a lookup returns zero matches for a slot the request requires, return `status=blocked` with a `next_step` suggesting alternative search terms.
+
+Mutation guardrails:
+- Resolve every required Slack ID (`channel_id`, recipient `user_id` for DMs, `thread_ts` for thread replies) by looking it up before calling a mutation tool. Mutations have chained dependencies — channel lookup enables in-channel message lookup; in-channel message lookup yields the `ts` needed as `thread_ts` for replies.
+- To reply inside a thread, supply both `channel_id` and `thread_ts`. Posting without `thread_ts` creates a new top-level message in the channel.
+- When the message text references a person, encode the mention as `<@U…>` using the resolved user ID. Plain text like `@alex` will not produce a Slack mention.
+- Never invent channel IDs, user IDs, message timestamps, or send outcomes. Every field in `evidence` must come from a tool result.
+- Confirm the mutation tool returned a success response before claiming success. If the mutation is approval-rejected (HITL), return `status=blocked` with `next_step="user declined; do not retry"`.
+- One operation per delegation. For multi-mutation requests, complete the highest-priority one and return `status=partial` with the remainder in `next_step`.
+
+Failure handling:
+- Tool failure: return `status=error`, place the underlying error message in `action_summary`, and put a concise recovery in `next_step`.
+- Permission / scope error from the MCP: return `status=error` and surface the underlying message. Permission errors typically mean the required OAuth scope is missing for that capability — not retryable from here.
+- No useful results after reasonable narrowing / broadening: return `status=blocked` with search-term suggestions in `next_step`.
+
+<example>
+Supervisor: "Summarize the latest discussion in #marketing."
+1. Search channels for "marketing" → one strong match. Capture the channel ID.
+2. Read that channel's recent message history.
+3. Return `status=success` with the message list in `evidence.items`.
+</example>
+
+<example>
+Supervisor: "DM Alex about the launch checklist."
+1. Search users for "Alex" → two matches (`U_alex1`, `U_alex2`).
+2. Cannot pick the recipient. Return:
+   {
+     "status": "blocked",
+     "action_summary": "Two users match 'Alex'.",
+     "evidence": {
+       "matched_candidates": [
+         { "id": "U_alex1", "label": "Alex Chen <alex.chen@…>" },
+         { "id": "U_alex2", "label": "Alex Wong <alex.wong@…>" }
+       ]
+     },
+     "next_step": "Confirm which Alex, then redelegate.",
+     "missing_fields": ["recipient"]
+   }
+</example>
+
+<example>
+Supervisor: "Reply 'ship it' to the deploy thread in #engineering."
+1. Search channels for "engineering" → one match; capture the channel ID.
+2. Search messages in that channel for "deploy" → one prominent match. Capture its `ts` — this becomes the `thread_ts` for the reply.
+3. Send a message to that channel with `thread_ts` set to the captured `ts` and text `"ship it"`.
+4. Confirm tool success → return `status=success` with the new message reference (its `ts` and a permalink if returned).
+</example>
+
+<output_contract>
+Return **only** one JSON object (no markdown, no prose):
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "channel_id": string | null,
+    "channel_name": string | null,
+    "user_id": string | null,
+    "thread_ts": string | null,
+    "message_ts": string | null,
+    "permalink": string | null,
+    "matched_candidates": [
+      { "id": string, "label": string }
+    ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: channel, user, message, thread).
+- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+</output_contract>
+
+Discover before you post; never guess channel, user, or thread targets.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/tools/__init__.py
new file mode 100644
index 0000000000..f60078771b
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/tools/__init__.py
@@ -0,0 +1,3 @@
+"""Slack route: native tool factories are empty; MCP supplies tools when configured."""
+
+__all__: list[str] = []
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/tools/index.py
new file mode 100644
index 0000000000..44b96661cd
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/tools/index.py
@@ -0,0 +1,19 @@
+"""``slack`` permission ruleset (rules over MCP tool names)."""
+
+from __future__ import annotations
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+NAME = "slack"
+
+RULESET = Ruleset(
+    origin=NAME,
+    rules=[
+        Rule(permission="slack_search_channels", pattern="*", action="allow"),
+        Rule(permission="slack_search_messages", pattern="*", action="allow"),
+        Rule(permission="slack_search_users", pattern="*", action="allow"),
+        Rule(permission="slack_read_channel", pattern="*", action="allow"),
+        Rule(permission="slack_read_thread", pattern="*", action="allow"),
+        Rule(permission="slack_send_message", pattern="*", action="ask"),
+    ],
+)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/agent.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/agent.py
new file mode 100644
index 0000000000..ab808b7456
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/agent.py
@@ -0,0 +1,44 @@
+"""``teams`` route: ``SurfSenseSubagentSpec`` builder for deepagents.
+
+Tools self-gate inside their bodies via :func:`request_approval`; the
+empty :data:`tools.index.RULESET` is layered into a per-subagent
+:class:`PermissionMiddleware` for uniformity.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import read_md_file
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import pack_subagent
+
+from .tools.index import NAME, RULESET, load_tools
+
+
+def build_subagent(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools: list[BaseTool] | None = None,
+) -> SurfSenseSubagentSpec:
+    tools = [*load_tools(dependencies=dependencies), *(mcp_tools or [])]
+    description = (
+        read_md_file(__package__, "description").strip()
+        or "Handles teams tasks for this workspace."
+    )
+    system_prompt = read_md_file(__package__, "system_prompt").strip()
+    return pack_subagent(
+        name=NAME,
+        description=description,
+        system_prompt=system_prompt,
+        tools=tools,
+        ruleset=RULESET,
+        dependencies=dependencies,
+        model=model,
+        middleware_stack=middleware_stack,
+    )
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/description.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/description.md
new file mode 100644
index 0000000000..edbfa390b8
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/description.md
@@ -0,0 +1,2 @@
+Specialist for messages in the user's Microsoft Teams channels.
+Use proactively when the user wants to read or send a Teams message.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md
new file mode 100644
index 0000000000..b79c59f907
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md
@@ -0,0 +1,123 @@
+You are a Microsoft Teams specialist for the user's connected Teams account.
+
+## Vocabulary you must use precisely
+
+- **Nested team + channel resolution via `list_teams_channels`** — the agent operates across all Teams the user has joined; each channel belongs to a `team_id`. `list_teams_channels` returns `{teams: [{team_id, team_name, channels: [{id, name}]}]}`. To read or send, you must resolve **both** `team_id` and `channel_id` from this nested structure. Channel names like `general` appear in many teams — when the supervisor's task does not pin the team (no team name, no obvious context), return `status=blocked` with the matching channels across teams as `matched_candidates` (each labeled `"<team_name> › <channel_name>"`) rather than guessing one.
+- **Message content is HTML** — `send_teams_message` treats `content` as HTML (Microsoft Graph stores it verbatim in `body.content`). Default to plain text. If the supervisor's task requires formatting (bold, italics, links, line breaks), generate the corresponding **HTML** (`<b>`, `<i>`, `<a href="...">`, `<br>`) — **not** Markdown (`**bold**`, `[label](url)`), which Teams renders as literal characters.
+- **Read + post only — no edits, deletes, or reactions** — Teams editing, deleting, and reacting to prior messages are not supported by the tools. Return `status=blocked` rather than faking these via new messages (no `"EDIT: ..."` follow-ups, no `"Please delete this"` posts).
+
+## Required inputs
+
+**For every required input below, first try to infer it from the supervisor's task text** — extract team names from natural phrasing (`"the Engineering team's"`, `"in Marketing"`), channel names from `#mentions` or natural phrasing (`"#announcements"`, `"the general channel"`), and message content from any details the supervisor already provided. Only return `status=blocked` with `missing_fields` when an input is genuinely absent or ambiguous after a thorough read of the task.
+
+- `list_teams_channels` — no inputs. Call it whenever you need to resolve a team name or channel name to ids.
+- `read_teams_messages` — `team_id` and `channel_id` (both resolved via `list_teams_channels` based on team-name and channel-name signals in the task). Block if the channel signal is absent, or if the channel name matches channels in multiple teams and no team is named. Optional `limit` (max 50; tighten only if the task implies a small recent window).
+- `send_teams_message` — `team_id`, `channel_id`, and `content`. Compose `content` from the task — plain text by default; HTML only when formatting is required by the task. Block if the destination team+channel cannot be resolved, or if the message content cannot be inferred from the task.
+
+## Outcome mapping
+
+| Tool returns                                                              | Your `status` | `next_step` |
+|---------------------------------------------------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `success` with non-empty teams/channels/messages                          | `success`     | `null` |
+| `success` with `total: 0` (read returns no messages) or `total_teams: 0`  | `success`     | `null` (surface the count in `evidence.items` so the supervisor can report "no recent messages"/"no joined teams") |
+| `rejected` (send only)                                                    | `blocked`     | `"User declined this Teams send. Do not retry or suggest alternatives."` |
+| `auth_error`                                                              | `error`       | `"The connected Microsoft Teams session has expired. Ask the user to re-authenticate Teams in connector settings."` |
+| `insufficient_permissions` (send only)                                    | `error`       | `"The connected Microsoft Teams account is missing the ChannelMessage.Send scope. Ask the user to re-authenticate Teams with updated scopes."` |
+| `error`                                                                   | `error`       | Relay the tool's `message` verbatim as `next_step`. |
+| tool raises / unknown                                                     | `error`       | `"Teams tool failed unexpectedly. Ask the user to retry shortly."` |
+
+Surface the tool's `message`, `team_id`, `team_name`, `channel_id`, `channel_name`, and `message_id` inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+
+## Examples
+
+**Example 1 — happy path send after nested resolution (team specified, plain text):**
+- *Supervisor task:* `"Post 'Standup in 5 min' to the Engineering team's #general."`
+- *You:* call `list_teams_channels()` → find the team where `team_name="Engineering"`, take its `team_id`; inside that team's channels, find the entry where `name="general"`, take its `id` as `channel_id`; call `send_teams_message(team_id=<eng_id>, channel_id=<general_id>, content="Standup in 5 min")` → tool returns `status=success`.
+- *Output:*
+
+  ```json
+  {
+    "status": "success",
+    "action_summary": "Posted a message to Engineering › general.",
+    "evidence": { "operation": "send_teams_message", "team_id": "<id>", "team_name": "Engineering", "channel_id": "<id>", "channel_name": "general", "message_id": "<msg_id>", "matched_candidates": null, "items": null },
+    "next_step": null,
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 2 — cross-team channel ambiguity:**
+- *Supervisor task:* `"Post 'Standup in 5 min' to #general."`
+- *You:* call `list_teams_channels()` → find `general` channels in multiple teams (Engineering, Marketing, Operations). The supervisor did not pin a team. Do not silently pick one — return `blocked` with all matching channels as `matched_candidates` so the supervisor can confirm with the user.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Multiple teams have a 'general' channel; cannot disambiguate.",
+    "evidence": {
+      "operation": "list_teams_channels",
+      "team_id": null,
+      "team_name": null,
+      "channel_id": null,
+      "channel_name": "general",
+      "message_id": null,
+      "matched_candidates": [
+        { "id": "<channel_id_1>", "label": "Engineering › general" },
+        { "id": "<channel_id_2>", "label": "Marketing › general" },
+        { "id": "<channel_id_3>", "label": "Operations › general" }
+      ],
+      "items": null
+    },
+    "next_step": "Ask the user which team's #general they meant — Engineering, Marketing, or Operations.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+**Example 3 — unsupported operation (edit):**
+- *Supervisor task:* `"Edit my last Teams message in the Engineering team's #general to say 'cancelled'."`
+- *You:* Teams edits are not supported by your tools. Do not call any tool. Do not post a new message like `"EDIT: cancelled"` — block.
+- *Output:*
+
+  ```json
+  {
+    "status": "blocked",
+    "action_summary": "Editing prior Teams messages is not supported.",
+    "evidence": { "operation": null, "team_id": null, "team_name": "Engineering", "channel_id": null, "channel_name": "general", "message_id": null, "matched_candidates": null, "items": null },
+    "next_step": "Editing Teams messages is not supported by the connector. Ask the user to edit the message directly in the Teams UI, or to send a follow-up message instead.",
+    "missing_fields": null,
+    "assumptions": null
+  }
+  ```
+
+## Output contract
+
+Return **only** one JSON object (no markdown or prose outside it):
+
+```json
+{
+  "status": "success" | "partial" | "blocked" | "error",
+  "action_summary": string,
+  "evidence": {
+    "operation": "list_teams_channels" | "read_teams_messages" | "send_teams_message" | null,
+    "team_id": string | null,
+    "team_name": string | null,
+    "channel_id": string | null,
+    "channel_name": string | null,
+    "message_id": string | null,
+    "matched_candidates": [ { "id": string, "label": string } ] | null,
+    "items": object | null
+  },
+  "next_step": string | null,
+  "missing_fields": string[] | null,
+  "assumptions": string[] | null
+}
+```
+
+Rules:
+- `status=success` → `next_step=null`, `missing_fields=null`.
+- `status=partial|blocked|error` → `next_step` must be non-null.
+- `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+
+Resolve before you call; verify before you send; map every tool outcome faithfully.
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/__init__.py
new file mode 100644
index 0000000000..60e2add495
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/__init__.py
@@ -0,0 +1,15 @@
+from app.agents.new_chat.tools.teams.list_channels import (
+    create_list_teams_channels_tool,
+)
+from app.agents.new_chat.tools.teams.read_messages import (
+    create_read_teams_messages_tool,
+)
+from app.agents.new_chat.tools.teams.send_message import (
+    create_send_teams_message_tool,
+)
+
+__all__ = [
+    "create_list_teams_channels_tool",
+    "create_read_teams_messages_tool",
+    "create_send_teams_message_tool",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/_auth.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/_auth.py
new file mode 100644
index 0000000000..7cdbeb8194
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/_auth.py
@@ -0,0 +1,38 @@
+"""Builds Microsoft Graph auth headers for Teams connector tools."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+GRAPH_API = "https://graph.microsoft.com/v1.0"
+
+
+async def get_teams_connector(
+    db_session: AsyncSession,
+    search_space_id: int,
+    user_id: str,
+) -> SearchSourceConnector | None:
+    result = await db_session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.search_space_id == search_space_id,
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.TEAMS_CONNECTOR,
+        )
+    )
+    return result.scalars().first()
+
+
+async def get_access_token(
+    db_session: AsyncSession,
+    connector: SearchSourceConnector,
+) -> str:
+    """Get a valid Microsoft Graph access token, refreshing if expired."""
+    from app.connectors.teams_connector import TeamsConnector
+
+    tc = TeamsConnector(
+        session=db_session,
+        connector_id=connector.id,
+    )
+    return await tc._get_valid_token()
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/index.py
new file mode 100644
index 0000000000..41661651f2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/index.py
@@ -0,0 +1,36 @@
+"""``teams`` native tools and (empty) permission ruleset.
+
+Tools self-gate via :func:`request_approval` in their bodies.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from langchain_core.tools import BaseTool
+
+from app.agents.new_chat.permissions import Ruleset
+
+from .list_channels import create_list_teams_channels_tool
+from .read_messages import create_read_teams_messages_tool
+from .send_message import create_send_teams_message_tool
+
+NAME = "teams"
+
+RULESET = Ruleset(origin=NAME, rules=[])
+
+
+def load_tools(
+    *, dependencies: dict[str, Any] | None = None, **kwargs: Any
+) -> list[BaseTool]:
+    d = {**(dependencies or {}), **kwargs}
+    common = {
+        "db_session": d["db_session"],
+        "search_space_id": d["search_space_id"],
+        "user_id": d["user_id"],
+    }
+    return [
+        create_list_teams_channels_tool(**common),
+        create_read_teams_messages_tool(**common),
+        create_send_teams_message_tool(**common),
+    ]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/list_channels.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/list_channels.py
new file mode 100644
index 0000000000..d7b0008531
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/list_channels.py
@@ -0,0 +1,92 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ._auth import GRAPH_API, get_access_token, get_teams_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_list_teams_channels_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def list_teams_channels() -> dict[str, Any]:
+        """List all Microsoft Teams and their channels the user has access to.
+
+        Returns:
+            Dictionary with status and a list of teams, each containing
+            team_id, team_name, and a list of channels (id, name).
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Teams tool not properly configured."}
+
+        try:
+            connector = await get_teams_connector(db_session, search_space_id, user_id)
+            if not connector:
+                return {"status": "error", "message": "No Teams connector found."}
+
+            token = await get_access_token(db_session, connector)
+            headers = {"Authorization": f"Bearer {token}"}
+
+            async with httpx.AsyncClient(timeout=20.0) as client:
+                teams_resp = await client.get(
+                    f"{GRAPH_API}/me/joinedTeams", headers=headers
+                )
+
+            if teams_resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Teams token expired. Please re-authenticate.",
+                    "connector_type": "teams",
+                }
+            if teams_resp.status_code != 200:
+                return {
+                    "status": "error",
+                    "message": f"Graph API error: {teams_resp.status_code}",
+                }
+
+            teams_data = teams_resp.json().get("value", [])
+            result_teams = []
+
+            async with httpx.AsyncClient(timeout=20.0) as client:
+                for team in teams_data:
+                    team_id = team["id"]
+                    ch_resp = await client.get(
+                        f"{GRAPH_API}/teams/{team_id}/channels",
+                        headers=headers,
+                    )
+                    channels = []
+                    if ch_resp.status_code == 200:
+                        channels = [
+                            {"id": ch["id"], "name": ch.get("displayName", "")}
+                            for ch in ch_resp.json().get("value", [])
+                        ]
+                    result_teams.append(
+                        {
+                            "team_id": team_id,
+                            "team_name": team.get("displayName", ""),
+                            "channels": channels,
+                        }
+                    )
+
+            return {
+                "status": "success",
+                "teams": result_teams,
+                "total_teams": len(result_teams),
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error listing Teams channels: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to list Teams channels."}
+
+    return list_teams_channels
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/read_messages.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/read_messages.py
new file mode 100644
index 0000000000..d24a7e4d3e
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/read_messages.py
@@ -0,0 +1,103 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ._auth import GRAPH_API, get_access_token, get_teams_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_read_teams_messages_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def read_teams_messages(
+        team_id: str,
+        channel_id: str,
+        limit: int = 25,
+    ) -> dict[str, Any]:
+        """Read recent messages from a Microsoft Teams channel.
+
+        Args:
+            team_id: The team ID (from list_teams_channels).
+            channel_id: The channel ID (from list_teams_channels).
+            limit: Number of messages to fetch (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of messages including
+            id, sender, content, timestamp.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Teams tool not properly configured."}
+
+        limit = min(limit, 50)
+
+        try:
+            connector = await get_teams_connector(db_session, search_space_id, user_id)
+            if not connector:
+                return {"status": "error", "message": "No Teams connector found."}
+
+            token = await get_access_token(db_session, connector)
+
+            async with httpx.AsyncClient(timeout=20.0) as client:
+                resp = await client.get(
+                    f"{GRAPH_API}/teams/{team_id}/channels/{channel_id}/messages",
+                    headers={"Authorization": f"Bearer {token}"},
+                    params={"$top": limit},
+                )
+
+            if resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Teams token expired. Please re-authenticate.",
+                    "connector_type": "teams",
+                }
+            if resp.status_code == 403:
+                return {
+                    "status": "error",
+                    "message": "Insufficient permissions to read this channel.",
+                }
+            if resp.status_code != 200:
+                return {
+                    "status": "error",
+                    "message": f"Graph API error: {resp.status_code}",
+                }
+
+            raw_msgs = resp.json().get("value", [])
+            messages = []
+            for m in raw_msgs:
+                sender = m.get("from", {})
+                user_info = sender.get("user", {}) if sender else {}
+                body = m.get("body", {})
+                messages.append(
+                    {
+                        "id": m.get("id"),
+                        "sender": user_info.get("displayName", "Unknown"),
+                        "content": body.get("content", ""),
+                        "content_type": body.get("contentType", "text"),
+                        "timestamp": m.get("createdDateTime", ""),
+                    }
+                )
+
+            return {
+                "status": "success",
+                "team_id": team_id,
+                "channel_id": channel_id,
+                "messages": messages,
+                "total": len(messages),
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Teams messages: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to read Teams messages."}
+
+    return read_teams_messages
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/send_message.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/send_message.py
new file mode 100644
index 0000000000..f1469e3e1a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/tools/send_message.py
@@ -0,0 +1,117 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+
+from ._auth import GRAPH_API, get_access_token, get_teams_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_send_teams_message_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    @tool
+    async def send_teams_message(
+        team_id: str,
+        channel_id: str,
+        content: str,
+    ) -> dict[str, Any]:
+        """Send a message to a Microsoft Teams channel.
+
+        Requires the ChannelMessage.Send OAuth scope. If the user gets a
+        permission error, they may need to re-authenticate with updated scopes.
+
+        Args:
+            team_id: The team ID (from list_teams_channels).
+            channel_id: The channel ID (from list_teams_channels).
+            content: The message text (HTML supported).
+
+        Returns:
+            Dictionary with status, message_id on success.
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Do NOT retry.
+        """
+        if db_session is None or search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Teams tool not properly configured."}
+
+        try:
+            connector = await get_teams_connector(db_session, search_space_id, user_id)
+            if not connector:
+                return {"status": "error", "message": "No Teams connector found."}
+
+            result = request_approval(
+                action_type="teams_send_message",
+                tool_name="send_teams_message",
+                params={
+                    "team_id": team_id,
+                    "channel_id": channel_id,
+                    "content": content,
+                },
+                context={"connector_id": connector.id},
+            )
+
+            if result.rejected:
+                return {
+                    "status": "rejected",
+                    "message": "User declined. Message was not sent.",
+                }
+
+            final_content = result.params.get("content", content)
+            final_team = result.params.get("team_id", team_id)
+            final_channel = result.params.get("channel_id", channel_id)
+
+            token = await get_access_token(db_session, connector)
+
+            async with httpx.AsyncClient(timeout=20.0) as client:
+                resp = await client.post(
+                    f"{GRAPH_API}/teams/{final_team}/channels/{final_channel}/messages",
+                    headers={
+                        "Authorization": f"Bearer {token}",
+                        "Content-Type": "application/json",
+                    },
+                    json={"body": {"content": final_content}},
+                )
+
+            if resp.status_code == 401:
+                return {
+                    "status": "auth_error",
+                    "message": "Teams token expired. Please re-authenticate.",
+                    "connector_type": "teams",
+                }
+            if resp.status_code == 403:
+                return {
+                    "status": "insufficient_permissions",
+                    "message": "Missing ChannelMessage.Send permission. Please re-authenticate with updated scopes.",
+                }
+            if resp.status_code not in (200, 201):
+                return {
+                    "status": "error",
+                    "message": f"Graph API error: {resp.status_code} — {resp.text[:200]}",
+                }
+
+            msg_data = resp.json()
+            return {
+                "status": "success",
+                "message_id": msg_data.get("id"),
+                "message": "Message sent to Teams channel.",
+            }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error sending Teams message: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to send Teams message."}
+
+    return send_teams_message
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/mcp_tools/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/mcp_tools/__init__.py
new file mode 100644
index 0000000000..5d07076107
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/mcp_tools/__init__.py
@@ -0,0 +1,15 @@
+"""Load MCP tools and partition them by connector agent."""
+
+from __future__ import annotations
+
+from .index import (
+    fetch_mcp_connector_metadata_maps,
+    load_mcp_tools_by_connector,
+    partition_mcp_tools_by_connector,
+)
+
+__all__ = [
+    "fetch_mcp_connector_metadata_maps",
+    "load_mcp_tools_by_connector",
+    "partition_mcp_tools_by_connector",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/mcp_tools/index.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/mcp_tools/index.py
new file mode 100644
index 0000000000..16dc09ac54
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/mcp_tools/index.py
@@ -0,0 +1,113 @@
+"""Discover MCP tools and bucket them by connector agent.
+
+Tool gating is no longer the loader's concern: each subagent declares its
+own :class:`Ruleset` and the per-subagent :class:`PermissionMiddleware`
+enforces it at runtime. This module just routes flat ``BaseTool`` lists
+to the right subagents.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from collections.abc import Sequence
+from typing import Any
+
+from langchain_core.tools import BaseTool
+from sqlalchemy import cast, select
+from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.multi_agent_chat.constants import (
+    CONNECTOR_TYPE_TO_CONNECTOR_AGENT_MAPS,
+)
+from app.agents.new_chat.tools.mcp_tool import load_mcp_tools
+from app.db import SearchSourceConnector
+
+logger = logging.getLogger(__name__)
+
+
+async def fetch_mcp_connector_metadata_maps(
+    session: AsyncSession,
+    search_space_id: int,
+) -> tuple[dict[int, str], dict[str, str]]:
+    """Resolve connector id and display name to connector type for MCP tool routing."""
+    result = await session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.search_space_id == search_space_id,
+            cast(SearchSourceConnector.config, JSONB).has_key("server_config"),
+        ),
+    )
+    id_to_type: dict[int, str] = {}
+    name_to_type: dict[str, str] = {}
+    for connector in result.scalars():
+        ct = (
+            connector.connector_type.value
+            if hasattr(connector.connector_type, "value")
+            else str(connector.connector_type)
+        )
+        id_to_type[connector.id] = ct
+        if connector.name:
+            name_to_type[connector.name] = ct
+    return id_to_type, name_to_type
+
+
+def partition_mcp_tools_by_connector(
+    tools: Sequence[BaseTool],
+    connector_id_to_type: dict[int, str],
+    connector_name_to_type: dict[str, str],
+) -> dict[str, list[BaseTool]]:
+    """Assign each MCP tool to one connector-agent bucket from connector metadata."""
+    buckets: dict[str, list[BaseTool]] = defaultdict(list)
+
+    for tool in tools:
+        meta: dict[str, Any] = getattr(tool, "metadata", None) or {}
+        connector_type: str | None = None
+
+        cid = meta.get("mcp_connector_id")
+        if cid is not None:
+            try:
+                cid_int = int(cid)
+            except (TypeError, ValueError):
+                cid_int = None
+            if cid_int is not None:
+                connector_type = connector_id_to_type.get(cid_int)
+
+        if connector_type is None and meta.get("mcp_transport") == "stdio":
+            cname = meta.get("mcp_connector_name")
+            if cname:
+                connector_type = connector_name_to_type.get(str(cname))
+
+        if connector_type is None:
+            logger.debug(
+                "Skipping MCP tool %r — could not resolve connector type from metadata",
+                getattr(tool, "name", None),
+            )
+            continue
+
+        connector_agent = CONNECTOR_TYPE_TO_CONNECTOR_AGENT_MAPS.get(connector_type)
+        if connector_agent is None:
+            logger.warning(
+                "MCP tool %r has unmapped connector type %s — skipped",
+                getattr(tool, "name", None),
+                connector_type,
+            )
+            continue
+
+        buckets[connector_agent].append(tool)
+
+    return dict(buckets)
+
+
+async def load_mcp_tools_by_connector(
+    session: AsyncSession,
+    search_space_id: int,
+) -> dict[str, list[BaseTool]]:
+    """Load MCP tools and route them to each subagent as a flat list.
+
+    ``bypass_internal_hitl=True`` is set so tool gating is uniformly the
+    consuming subagent's :class:`PermissionMiddleware` responsibility.
+    """
+    flat = await load_mcp_tools(session, search_space_id, bypass_internal_hitl=True)
+    id_map, name_map = await fetch_mcp_connector_metadata_maps(session, search_space_id)
+    return partition_mcp_tools_by_connector(flat, id_map, name_map)
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/registry.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/registry.py
new file mode 100644
index 0000000000..27c1476729
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/registry.py
@@ -0,0 +1,210 @@
+"""Central registry of route ``build_subagent`` callables (keyed by ``NAME``)."""
+
+from __future__ import annotations
+
+from typing import Any, Protocol
+
+from deepagents import SubAgent
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.constants import (
+    SUBAGENT_TO_REQUIRED_CONNECTOR_MAP,
+)
+from app.agents.multi_agent_chat.subagents.builtins.deliverables.agent import (
+    build_subagent as build_deliverables_subagent,
+)
+from app.agents.multi_agent_chat.subagents.builtins.knowledge_base.agent import (
+    build_subagent as build_knowledge_base_subagent,
+)
+from app.agents.multi_agent_chat.subagents.builtins.memory.agent import (
+    build_subagent as build_memory_subagent,
+)
+from app.agents.multi_agent_chat.subagents.builtins.research.agent import (
+    build_subagent as build_research_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.airtable.agent import (
+    build_subagent as build_airtable_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.calendar.agent import (
+    build_subagent as build_calendar_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.clickup.agent import (
+    build_subagent as build_clickup_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.confluence.agent import (
+    build_subagent as build_confluence_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.discord.agent import (
+    build_subagent as build_discord_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.dropbox.agent import (
+    build_subagent as build_dropbox_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.gmail.agent import (
+    build_subagent as build_gmail_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.google_drive.agent import (
+    build_subagent as build_google_drive_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.jira.agent import (
+    build_subagent as build_jira_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.linear.agent import (
+    build_subagent as build_linear_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.luma.agent import (
+    build_subagent as build_luma_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.notion.agent import (
+    build_subagent as build_notion_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.onedrive.agent import (
+    build_subagent as build_onedrive_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.slack.agent import (
+    build_subagent as build_slack_subagent,
+)
+from app.agents.multi_agent_chat.subagents.connectors.teams.agent import (
+    build_subagent as build_teams_subagent,
+)
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import (
+    read_md_file,
+)
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+
+
+class SubagentBuilder(Protocol):
+    def __call__(
+        self,
+        *,
+        dependencies: dict[str, Any],
+        model: BaseChatModel | None = None,
+        middleware_stack: dict[str, Any] | None = None,
+        mcp_tools: list[BaseTool] | None = None,
+    ) -> SurfSenseSubagentSpec: ...
+
+
+SUBAGENT_BUILDERS_BY_NAME: dict[str, SubagentBuilder] = {
+    "airtable": build_airtable_subagent,
+    "calendar": build_calendar_subagent,
+    "clickup": build_clickup_subagent,
+    "confluence": build_confluence_subagent,
+    "deliverables": build_deliverables_subagent,
+    "discord": build_discord_subagent,
+    "dropbox": build_dropbox_subagent,
+    "gmail": build_gmail_subagent,
+    "google_drive": build_google_drive_subagent,
+    "jira": build_jira_subagent,
+    "knowledge_base": build_knowledge_base_subagent,
+    "linear": build_linear_subagent,
+    "luma": build_luma_subagent,
+    "memory": build_memory_subagent,
+    "notion": build_notion_subagent,
+    "onedrive": build_onedrive_subagent,
+    "research": build_research_subagent,
+    "slack": build_slack_subagent,
+    "teams": build_teams_subagent,
+}
+
+
+def _route_resource_package(builder: SubagentBuilder) -> str:
+    mod = builder.__module__
+    return mod[: -len(".agent")] if mod.endswith(".agent") else mod.rsplit(".", 1)[0]
+
+
+def main_prompt_registry_subagent_lines(exclude: list[str]) -> list[tuple[str, str]]:
+    """(name, description) for registry specialists included for **task** (same rules as ``build_subagents``)."""
+    banned = frozenset(("memory", "research")) | frozenset(exclude)
+    rows: list[tuple[str, str]] = []
+    for name in sorted(SUBAGENT_BUILDERS_BY_NAME):
+        if name in banned:
+            continue
+        builder = SUBAGENT_BUILDERS_BY_NAME[name]
+        pkg = _route_resource_package(builder)
+        blurb = read_md_file(pkg, "description").strip()
+        if not blurb:
+            blurb = name.replace("_", " ").title()
+        rows.append((name, blurb))
+    return rows
+
+
+def get_subagents_to_exclude(
+    available_connectors: list[str] | None,
+) -> list[str]:
+    if available_connectors is None:
+        return []
+    available_tokens = frozenset(available_connectors)
+    excluded_names: set[str] = set()
+    for builder_name in SUBAGENT_BUILDERS_BY_NAME:
+        required_tokens = SUBAGENT_TO_REQUIRED_CONNECTOR_MAP.get(builder_name)
+        if required_tokens is None:
+            excluded_names.add(builder_name)
+            continue
+        if not required_tokens:
+            continue
+        if not (required_tokens & available_tokens):
+            excluded_names.add(builder_name)
+    return sorted(excluded_names)
+
+
+def _filter_disabled_tools_in_place(
+    spec: SubAgent,
+    disabled_names: frozenset[str],
+) -> None:
+    """Drop UI-disabled tools from ``spec["tools"]``."""
+    if not disabled_names:
+        return
+    tools = spec.get("tools")  # type: ignore[typeddict-item]
+    if isinstance(tools, list):
+        spec["tools"] = [  # type: ignore[typeddict-unknown-key]
+            t for t in tools if getattr(t, "name", None) not in disabled_names
+        ]
+
+
+def _inject_ask_kb_tool_in_place(spec: SubAgent, ask_kb_tool: BaseTool) -> None:
+    """Append ``ask_knowledge_base`` to every non-KB spec (skips a self-call)."""
+    if spec["name"] == "knowledge_base":
+        return
+    tools = spec.get("tools")  # type: ignore[typeddict-item]
+    if not isinstance(tools, list):
+        spec["tools"] = [ask_kb_tool]  # type: ignore[typeddict-unknown-key]
+        return
+    if any(getattr(t, "name", None) == ask_kb_tool.name for t in tools):
+        return
+    tools.append(ask_kb_tool)
+
+
+def build_subagents(
+    *,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+    mcp_tools_by_agent: dict[str, list[BaseTool]] | None = None,
+    exclude: list[str] | None = None,
+    disabled_tools: list[str] | None = None,
+    ask_kb_tool: BaseTool | None = None,
+) -> list[SubAgent]:
+    """Build registry subagents; skip memory/research; skip names in exclude."""
+    mcp = mcp_tools_by_agent or {}
+    specs: list[SubAgent] = []
+    excluded = ["memory", "research"]
+    if exclude:
+        excluded.extend(exclude)
+    disabled_names = frozenset(disabled_tools or ())
+    for name in sorted(SUBAGENT_BUILDERS_BY_NAME):
+        if name in excluded:
+            continue
+        builder = SUBAGENT_BUILDERS_BY_NAME[name]
+        result = builder(
+            dependencies=dependencies,
+            model=model,
+            middleware_stack=middleware_stack,
+            mcp_tools=mcp.get(name),
+        )
+        spec = result.spec
+        _filter_disabled_tools_in_place(spec, disabled_names)
+        if ask_kb_tool is not None:
+            _inject_ask_kb_tool_in_place(spec, ask_kb_tool)
+        specs.append(spec)
+    return specs
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/__init__.py
new file mode 100644
index 0000000000..70d3dfe392
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/__init__.py
@@ -0,0 +1,17 @@
+"""Cross-slice helpers for route subagents."""
+
+from __future__ import annotations
+
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import (
+    read_md_file,
+)
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import (
+    pack_subagent,
+)
+
+__all__ = [
+    "SurfSenseSubagentSpec",
+    "pack_subagent",
+    "read_md_file",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/__init__.py
new file mode 100644
index 0000000000..038ec5652a
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/__init__.py
@@ -0,0 +1,17 @@
+"""Self-gated approval primitive — tools that pause from inside their own body.
+
+Public surface:
+- :func:`request_approval` — entry point for sensitive tool bodies.
+- :class:`HITLResult` — outcome contract.
+- ``DEFAULT_AUTO_APPROVED_TOOLS`` — safe-by-construction allowlist.
+"""
+
+from .auto_approved import DEFAULT_AUTO_APPROVED_TOOLS
+from .request import request_approval
+from .result import HITLResult
+
+__all__ = [
+    "DEFAULT_AUTO_APPROVED_TOOLS",
+    "HITLResult",
+    "request_approval",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/auto_approved.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/auto_approved.py
new file mode 100644
index 0000000000..b99b26f3a5
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/auto_approved.py
@@ -0,0 +1,35 @@
+"""Default safe-by-construction allowlist for self-gated approvals.
+
+Tools listed here mirror the safety profile of ``write_file`` against the
+SurfSense KB: each call creates exactly one artifact in the user's own
+workspace with no external visibility (drafts aren't sent; new files aren't
+shared unless the user shares them later). Auto-approving them lets the agent
+seed scratch artifacts without firing a popup on every call.
+
+Members still flow through :func:`request_approval` — the function returns
+immediately with ``decision_type="auto_approved"`` and the original params
+untouched. This keeps tool bodies (logging, metadata fetches, account
+fallbacks) symmetrical with the prompted path; the only behavior change is
+"no interrupt fires".
+
+Per-search-space ``agent_permission_rules`` (when wired) take precedence and
+can re-enable prompting for any of these.
+"""
+
+from __future__ import annotations
+
+DEFAULT_AUTO_APPROVED_TOOLS: frozenset[str] = frozenset(
+    {
+        "create_gmail_draft",
+        "update_gmail_draft",
+        "create_calendar_event",
+        "create_notion_page",
+        "create_confluence_page",
+        "create_google_drive_file",
+        "create_dropbox_file",
+        "create_onedrive_file",
+    }
+)
+
+
+__all__ = ["DEFAULT_AUTO_APPROVED_TOOLS"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/request.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/request.py
new file mode 100644
index 0000000000..8729ea85b9
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/request.py
@@ -0,0 +1,119 @@
+"""Self-gated approval entry point — pause from inside a tool body.
+
+Sensitive connector tools (Gmail send, Notion delete, Linear issue create…)
+call :func:`request_approval` to ask the user before performing the side
+effect. The function emits the unified langchain HITL wire payload (so the
+parallel-HITL routing layer in ``task_tool`` and ``resume_routing`` sees the
+same shape it sees for middleware-gated approvals) and returns a typed
+:class:`HITLResult`.
+
+Synchronous on purpose: ``langgraph.types.interrupt`` raises ``GraphInterrupt``
+inline; the langgraph runtime catches it. Making this ``async`` would only
+move the throw site without changing semantics.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from langgraph.types import interrupt
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.wire import (
+    LC_DECISION_APPROVE,
+    LC_DECISION_EDIT,
+    LC_DECISION_REJECT,
+    build_lc_hitl_payload,
+    parse_lc_envelope,
+)
+
+from .auto_approved import DEFAULT_AUTO_APPROVED_TOOLS
+from .result import HITLResult
+
+logger = logging.getLogger(__name__)
+
+# Decisions a self-gated card may carry back. ``"always"`` is reserved for
+# permission-rule promotion (middleware-gated path) and intentionally absent
+# here.
+_SELF_GATED_DECISIONS: list[str] = [
+    LC_DECISION_APPROVE,
+    LC_DECISION_REJECT,
+    LC_DECISION_EDIT,
+]
+
+
+def request_approval(
+    *,
+    action_type: str,
+    tool_name: str,
+    params: dict[str, Any],
+    context: dict[str, Any] | None = None,
+    trusted_tools: list[str] | None = None,
+) -> HITLResult:
+    """Pause the graph for user approval and return the user's decision.
+
+    Args:
+        action_type: FE card discriminator (``"gmail_email_send"``,
+            ``"mcp_tool_call"``…). Forwarded as ``interrupt_type`` on the
+            wire so the FE can mount the right card variant.
+        tool_name: Registered langchain tool name (``"send_gmail_email"``…)
+            shown in the card header and used for trust-list lookups.
+        params: Original tool arguments. Rendered to the user and used as
+            defaults when no edits are made.
+        context: Rich metadata (account info, folder lists, MCP server name…)
+            forwarded verbatim to the FE for richer card chrome.
+        trusted_tools: Per-session allowlist; when ``tool_name`` is in it the
+            interrupt is skipped and the tool runs immediately.
+
+    Returns:
+        :class:`HITLResult` with ``rejected=True`` if the user declined or
+        the resume envelope was unparseable; otherwise ``params`` carries
+        the original args (or args shallow-merged with the user's edits on
+        ``"edit"``).
+    """
+    if trusted_tools and tool_name in trusted_tools:
+        logger.info("Tool %r is user-trusted — skipping HITL", tool_name)
+        return HITLResult(rejected=False, decision_type="trusted", params=dict(params))
+
+    if tool_name in DEFAULT_AUTO_APPROVED_TOOLS:
+        logger.info(
+            "Tool %r is in DEFAULT_AUTO_APPROVED_TOOLS — skipping HITL", tool_name
+        )
+        return HITLResult(
+            rejected=False, decision_type="auto_approved", params=dict(params)
+        )
+
+    payload = build_lc_hitl_payload(
+        tool_name=tool_name,
+        args=params,
+        allowed_decisions=_SELF_GATED_DECISIONS,
+        interrupt_type=action_type,
+        context=context,
+    )
+    approval = interrupt(payload)
+
+    parsed = parse_lc_envelope(approval)
+    logger.info("User decision for %r: %s", tool_name, parsed.decision_type)
+
+    if parsed.decision_type == LC_DECISION_REJECT:
+        return HITLResult(rejected=True, decision_type="reject", params=dict(params))
+
+    # Anything outside approve/edit at this point is unexpected — fail closed
+    # so a malformed FE envelope can't smuggle a side effect through.
+    if parsed.decision_type not in (LC_DECISION_APPROVE, LC_DECISION_EDIT):
+        logger.warning(
+            "Unrecognized decision %r for %r — rejecting for safety",
+            parsed.decision_type,
+            tool_name,
+        )
+        return HITLResult(rejected=True, decision_type="error", params=dict(params))
+
+    final_params = (
+        {**params, **parsed.edited_args} if parsed.edited_args else dict(params)
+    )
+    return HITLResult(
+        rejected=False, decision_type=parsed.decision_type, params=final_params
+    )
+
+
+__all__ = ["request_approval"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/result.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/result.py
new file mode 100644
index 0000000000..645e6d47e0
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/result.py
@@ -0,0 +1,34 @@
+"""Outcome contract returned by :func:`request_approval`.
+
+Lives in its own file so callers that only need the type for annotations don't
+drag in ``langgraph`` imports through the entry-point module.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass(frozen=True, slots=True)
+class HITLResult:
+    """Outcome of a self-gated human-in-the-loop approval request.
+
+    Attributes:
+        rejected: ``True`` when the tool MUST NOT execute (user said no, or
+            the wire envelope was unparseable). Always check this first.
+        decision_type: Reason tag for logging / metrics —
+            ``"approve" | "edit" | "reject" | "trusted" | "auto_approved"
+            | "error"``. Callers shouldn't branch on this for control flow;
+            use ``rejected`` for that.
+        params: Final parameters to pass to the underlying tool. On
+            ``"edit"`` this is the original ``params`` shallow-merged with
+            the user's edits; otherwise it's a copy of the originals.
+    """
+
+    rejected: bool
+    decision_type: str
+    params: dict[str, Any] = field(default_factory=dict)
+
+
+__all__ = ["HITLResult"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/__init__.py
new file mode 100644
index 0000000000..2d35ac0564
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/__init__.py
@@ -0,0 +1,26 @@
+"""Single source of truth for the langchain HITL wire format used by every approval path.
+
+Public surface:
+- :func:`build_lc_hitl_payload` — outbound (interrupt argument).
+- :func:`parse_lc_envelope` + :class:`ParsedLcDecision` — inbound (resume value).
+- Decision-type constants for callers that care about identity rather than literals.
+"""
+
+from .decision import ParsedLcDecision, parse_lc_envelope
+from .payload import (
+    LC_DECISION_APPROVE,
+    LC_DECISION_EDIT,
+    LC_DECISION_REJECT,
+    SURFSENSE_DECISION_APPROVE_ALWAYS,
+    build_lc_hitl_payload,
+)
+
+__all__ = [
+    "LC_DECISION_APPROVE",
+    "LC_DECISION_EDIT",
+    "LC_DECISION_REJECT",
+    "SURFSENSE_DECISION_APPROVE_ALWAYS",
+    "ParsedLcDecision",
+    "build_lc_hitl_payload",
+    "parse_lc_envelope",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/decision.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/decision.py
new file mode 100644
index 0000000000..43fd0382c2
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/decision.py
@@ -0,0 +1,121 @@
+"""Parse the langchain HITL resume envelope into a typed decision.
+
+Both self-gated approvals (``request_approval``) and middleware-gated paths
+(``PermissionMiddleware``) receive the user's reply through langgraph's
+``Command(resume=...)`` channel as ``{"decisions": [{"type": ..., ...}]}``.
+This module owns the decoding so the wire-shape knowledge lives in exactly
+one place; callers project the parsed values into their own domain decisions
+(``HITLResult`` for self-gated, ``decision_type`` for permissions) without
+re-implementing the envelope walk.
+
+Failing closed: any unrecognized envelope shape collapses to
+``decision_type="reject"`` (with a warning) so callers never proceed on
+ambiguous input.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True, slots=True)
+class ParsedLcDecision:
+    """Decoded resume reply with the fields callers actually need.
+
+    Attributes:
+        decision_type: Lower-cased decision identifier — ``"approve"``,
+            ``"reject"``, ``"edit"``, ``"approve_always"``, or any custom value
+            the FE may emit. Callers map this to their own domain semantics.
+        edited_args: Populated only on ``"edit"`` replies that actually carry
+            args; ``None`` otherwise so callers can use truthiness directly.
+        message: Free-form user feedback (typically attached to ``"reject"``).
+            ``None`` when absent or when the value isn't a non-empty string.
+    """
+
+    decision_type: str
+    edited_args: dict[str, Any] | None = None
+    message: str | None = None
+
+
+def parse_lc_envelope(envelope: Any) -> ParsedLcDecision:
+    """Extract a typed decision from a langgraph resume envelope.
+
+    Accepts:
+
+    - ``{"decisions": [{"type": "approve" | "reject" | "edit", ...}]}`` — the
+      langchain HITL standard envelope.
+    - A bare scalar string (``"once"``, ``"approve_always"``, ``"reject"``) —
+      used by the legacy SurfSense permission wire. We tolerate it so the
+      parser can sit behind both call sites without a second adapter.
+
+    Edit args are read from the standard ``edited_action.args`` first, then
+    fall back to a flat ``args`` field for legacy compatibility — both shapes
+    are produced by the FE depending on which card variant was rendered.
+
+    Args:
+        envelope: The raw resume value as it arrived from langgraph.
+
+    Returns:
+        A :class:`ParsedLcDecision` describing the user's intent.
+    """
+    if isinstance(envelope, str):
+        return ParsedLcDecision(decision_type=envelope.lower())
+
+    if not isinstance(envelope, dict):
+        logger.warning(
+            "Resume envelope is not a dict (got %s); treating as reject",
+            type(envelope).__name__,
+        )
+        return ParsedLcDecision(decision_type="reject")
+
+    payload: dict[str, Any] = envelope
+    decisions = envelope.get("decisions")
+    if isinstance(decisions, list) and decisions:
+        first = decisions[0]
+        if isinstance(first, dict):
+            payload = first
+
+    raw_type = payload.get("type") or payload.get("decision_type")
+    if not raw_type:
+        logger.warning(
+            "Resume payload missing decision type (keys=%s); treating as reject",
+            list(payload.keys()),
+        )
+        return ParsedLcDecision(decision_type="reject")
+
+    decision_type = str(raw_type).lower()
+    edited_args = _extract_edited_args(payload) if decision_type == "edit" else None
+    message = _extract_message(payload)
+    return ParsedLcDecision(
+        decision_type=decision_type,
+        edited_args=edited_args,
+        message=message,
+    )
+
+
+def _extract_edited_args(payload: dict[str, Any]) -> dict[str, Any] | None:
+    """Pull non-empty edited args from either the LC nested or flat shape."""
+    edited_action = payload.get("edited_action")
+    if isinstance(edited_action, dict):
+        nested = edited_action.get("args")
+        if isinstance(nested, dict) and nested:
+            return nested
+    flat = payload.get("args")
+    if isinstance(flat, dict) and flat:
+        return flat
+    return None
+
+
+def _extract_message(payload: dict[str, Any]) -> str | None:
+    """Pull a non-empty user-feedback string, accepting either field name."""
+    raw = payload.get("feedback") or payload.get("message")
+    if isinstance(raw, str) and raw.strip():
+        return raw
+    return None
+
+
+__all__ = ["ParsedLcDecision", "parse_lc_envelope"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/payload.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/payload.py
new file mode 100644
index 0000000000..bac4a6677c
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/hitl/wire/payload.py
@@ -0,0 +1,86 @@
+"""Build the langchain HITL ``interrupt(...)`` payload — single source of truth.
+
+Every approval path in the multi-agent stack — self-gated tool bodies that call
+``request_approval``, and middleware-gated paths (``HumanInTheLoopMiddleware``,
+``PermissionMiddleware``) — emits the SAME wire shape from this module so the
+parallel-HITL routing layer (``task_tool``, ``resume_routing``) only ever sees
+one format. SurfSense-specific extras (FE card discriminator, structured
+context) ride alongside the langchain standard fields without colliding with
+them.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+LC_DECISION_APPROVE = "approve"
+LC_DECISION_REJECT = "reject"
+LC_DECISION_EDIT = "edit"
+
+# ``approve_always`` is a SurfSense extension surfaced by ``PermissionMiddleware``
+# so a single click can promote the matched pattern to a runtime allow rule and
+# (for MCP tools) save it to the user's trusted-tools list. The FE renders an
+# extra button when it appears in ``allowed_decisions``.
+SURFSENSE_DECISION_APPROVE_ALWAYS = "approve_always"
+
+
+def build_lc_hitl_payload(
+    *,
+    tool_name: str,
+    args: dict[str, Any],
+    allowed_decisions: list[str],
+    interrupt_type: str,
+    description: str | None = None,
+    context: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Build the unified langchain HITL interrupt payload.
+
+    Args:
+        tool_name: The langchain tool's registered name (drives both the action
+            request and the review config so the FE can pair them).
+        args: Tool call arguments shown to the user. ``None`` is normalized to
+            an empty dict so the FE always has a stable shape to render.
+        allowed_decisions: Subset of
+            ``[LC_DECISION_APPROVE, LC_DECISION_REJECT, LC_DECISION_EDIT,
+            SURFSENSE_DECISION_APPROVE_ALWAYS]``. Other values are passed through
+            but the FE may not render a control for them.
+        interrupt_type: SurfSense card discriminator (``"gmail_email_send"``,
+            ``"permission_ask"``, etc.); the FE keys off this to mount the
+            right card.
+        description: Optional human-readable line shown above the args block.
+        context: Optional structured metadata (account info, matched permission
+            rules, etc.) forwarded verbatim for richer card chrome.
+
+    Returns:
+        A dict suitable for ``langgraph.types.interrupt(...)``. Top-level
+        ``action_requests`` and ``review_configs`` are what
+        ``collect_pending_tool_calls`` reads at the routing layer; the
+        SurfSense extensions (``interrupt_type``, ``context``) sit alongside
+        them — langchain ignores unknown keys, so the contract stays clean.
+    """
+    request: dict[str, Any] = {"name": tool_name, "args": args or {}}
+    if description:
+        request["description"] = description
+
+    payload: dict[str, Any] = {
+        "action_requests": [request],
+        "review_configs": [
+            {
+                "action_name": tool_name,
+                "allowed_decisions": list(allowed_decisions),
+            }
+        ],
+        "interrupt_type": interrupt_type,
+    }
+    if context:
+        payload["context"] = context
+    return payload
+
+
+__all__ = [
+    "LC_DECISION_APPROVE",
+    "LC_DECISION_EDIT",
+    "LC_DECISION_REJECT",
+    "SURFSENSE_DECISION_APPROVE_ALWAYS",
+    "build_lc_hitl_payload",
+]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/md_file_reader.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/md_file_reader.py
new file mode 100644
index 0000000000..2fce413a60
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/md_file_reader.py
@@ -0,0 +1,14 @@
+"""Load markdown files shipped alongside a route package."""
+
+from __future__ import annotations
+
+from importlib import resources
+
+
+def read_md_file(package: str, stem: str) -> str:
+    """Load ``{stem}.md`` from ``package`` via importlib resources, or return empty."""
+    ref = resources.files(package).joinpath(f"{stem}.md")
+    if not ref.is_file():
+        return ""
+    text = ref.read_text(encoding="utf-8")
+    return text.rstrip("\n")
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/spec.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/spec.py
new file mode 100644
index 0000000000..797ab535b1
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/spec.py
@@ -0,0 +1,29 @@
+"""SurfSense's subagent contribution: deepagents spec + permission ruleset."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from deepagents import SubAgent
+
+from app.agents.new_chat.permissions import Ruleset
+
+
+@dataclass(frozen=True, slots=True)
+class SurfSenseSubagentSpec:
+    """A subagent contribution from a SurfSense route.
+
+    Attributes:
+        spec: The deepagents-shaped dict handed to ``create_agent``. Holds
+            only fields ``deepagents.SubAgent`` recognises.
+        ruleset: Permission rules this subagent contributes. The orchestrator
+            layers them into the subagent's :class:`PermissionMiddleware`,
+            so each subagent owns its own ruleset without aliasing the
+            shared rule engine.
+    """
+
+    spec: SubAgent
+    ruleset: Ruleset
+
+
+__all__ = ["SurfSenseSubagentSpec"]
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/subagent_builder.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/subagent_builder.py
new file mode 100644
index 0000000000..7173901f93
--- /dev/null
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/subagent_builder.py
@@ -0,0 +1,102 @@
+"""Build delegated sub-agent specs from route-local pieces."""
+
+from __future__ import annotations
+
+from typing import Any, cast
+
+from deepagents import SubAgent
+from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import BaseTool
+
+from app.agents.multi_agent_chat.middleware.shared.permissions import (
+    build_permission_mw,
+)
+from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec
+from app.agents.new_chat.permissions import Ruleset
+
+
+def _user_allowlist_for(
+    dependencies: dict[str, Any], subagent_name: str
+) -> Ruleset | None:
+    """Return the user's persisted allow-rules for ``subagent_name`` if any.
+
+    Populated by the agent factory from
+    :func:`app.services.user_tool_allowlist.fetch_user_allowlist_rulesets`.
+    Returning ``None`` is the common case (fresh accounts, non-MCP
+    subagents, or no "Always Allow" interactions yet).
+    """
+    by_subagent = dependencies.get("user_allowlist_by_subagent") or {}
+    user_allowlist = by_subagent.get(subagent_name)
+    if isinstance(user_allowlist, Ruleset) and user_allowlist.rules:
+        return user_allowlist
+    return None
+
+
+def pack_subagent(
+    *,
+    name: str,
+    description: str,
+    system_prompt: str,
+    tools: list[BaseTool],
+    ruleset: Ruleset,
+    dependencies: dict[str, Any],
+    model: BaseChatModel | None = None,
+    middleware_stack: dict[str, Any] | None = None,
+) -> SurfSenseSubagentSpec:
+    """Pack the route-local pieces into one sub-agent spec + its Ruleset.
+
+    Tool gating is uniformly performed by a per-subagent
+    :class:`PermissionMiddleware`. Three rule layers are evaluated
+    earliest-to-latest (last match wins):
+
+    1. SurfSense defaults — single ``allow */*`` rule (added by
+       :func:`build_permission_mw`).
+    2. ``ruleset`` — the subagent's coded approval rules (e.g. KB's
+       destructive-FS ``ask`` rules, connector ``ask`` writes).
+    3. The user's persisted allow-list for this subagent — pulled from
+       ``dependencies['user_allowlist_by_subagent'][name]``. User
+       ``allow`` rules layered last override coded ``ask`` rules,
+       implementing the "Always Allow" UX without re-asking on the
+       next turn.
+
+    The shared ``permission`` slot from ``middleware_stack`` is dropped
+    so each subagent owns its own rule surface and cannot accidentally
+    share state with the main agent's permission middleware.
+    """
+    if not system_prompt.strip():
+        msg = f"Subagent {name!r}: system_prompt is empty"
+        raise ValueError(msg)
+
+    flags = dependencies["flags"]
+    user_allowlist = _user_allowlist_for(dependencies, name)
+    subagent_rulesets: list[Ruleset] = [ruleset]
+    if user_allowlist is not None:
+        subagent_rulesets.append(user_allowlist)
+    per_subagent_perm = build_permission_mw(
+        flags=flags,
+        subagent_rulesets=subagent_rulesets,
+        tools=tools,
+        trusted_tool_saver=dependencies.get("trusted_tool_saver"),
+    )
+
+    prepended: list[Any] = []
+    for slot, mw in (middleware_stack or {}).items():
+        if mw is None:
+            continue
+        if slot == "permission":
+            continue
+        prepended.append(mw)
+    if per_subagent_perm is not None:
+        prepended.append(per_subagent_perm)
+    middleware: list[Any] = [*prepended, PatchToolCallsMiddleware()]
+    spec_dict: dict[str, Any] = {
+        "name": name,
+        "description": description,
+        "system_prompt": system_prompt,
+        "tools": tools,
+        "middleware": middleware,
+    }
+    if model is not None:
+        spec_dict["model"] = model
+    return SurfSenseSubagentSpec(spec=cast(SubAgent, spec_dict), ruleset=ruleset)
diff --git a/surfsense_backend/app/agents/new_chat/__init__.py b/surfsense_backend/app/agents/new_chat/__init__.py
index 3dd9b6df4f..4b2eb89eb4 100644
--- a/surfsense_backend/app/agents/new_chat/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/__init__.py
@@ -22,7 +22,11 @@
 from .context import SurfSenseContextSchema
 
 # LLM config
-from .llm_config import create_chat_litellm_from_config, load_llm_config_from_yaml
+from .llm_config import (
+    create_chat_litellm_from_config,
+    load_global_llm_config_by_id,
+    load_llm_config_from_yaml,
+)
 
 # Middleware
 from .middleware import (
@@ -81,6 +85,7 @@
     "get_all_tool_names",
     "get_default_enabled_tools",
     "get_tool_by_name",
+    "load_global_llm_config_by_id",
     "load_llm_config_from_yaml",
     "search_knowledge_base_async",
 ]
diff --git a/surfsense_backend/app/agents/new_chat/agent_cache.py b/surfsense_backend/app/agents/new_chat/agent_cache.py
new file mode 100644
index 0000000000..fa8e6fb720
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/agent_cache.py
@@ -0,0 +1,357 @@
+"""TTL-LRU cache for compiled SurfSense deep agents.
+
+Why this exists
+---------------
+
+``create_surfsense_deep_agent`` runs a 4-5 second pipeline on EVERY chat
+turn:
+
+1. Discover connectors & document types from Postgres (~50-200ms)
+2. Build the tool list (built-in + MCP) (~200ms-1.7s)
+3. Compose the system prompt
+4. Construct ~15 middleware instances (CPU)
+5. Eagerly compile the general-purpose subagent
+   (``SubAgentMiddleware.__init__`` calls ``create_agent`` synchronously,
+   which builds a second LangGraph + Pydantic schemas — ~1.5-2s of pure
+   CPU work)
+6. Compile the outer LangGraph
+
+For a single thread, all six steps produce the SAME object on every turn
+unless the user has changed their LLM config, toggled a feature flag,
+added a connector, etc. The right answer is to compile ONCE per
+"agent shape" and reuse the resulting :class:`CompiledStateGraph` for
+every subsequent turn on the same thread.
+
+Why a per-thread key (not a global pool)
+----------------------------------------
+
+Most middleware in the SurfSense stack captures per-thread state in
+``__init__`` closures (``thread_id``, ``user_id``, ``search_space_id``,
+``filesystem_mode``, ``mentioned_document_ids``). Cross-thread reuse
+would silently leak state across users and threads. Keying the cache on
+``(llm_config_id, thread_id, ...)`` gives us safe reuse for repeated
+turns on the same thread without changing any middleware's behavior.
+
+Phase 2 will move those captured fields onto :class:`SurfSenseContextSchema`
+(read via ``runtime.context``) so the cache can collapse to a single
+``(llm_config_id, search_space_id, ...)`` key shared across threads. Until
+then, per-thread keying is the only safe option.
+
+Cache shape
+-----------
+
+* TTL-LRU: entries auto-expire after ``ttl_seconds`` (default 1800s, 30
+  minutes — matches a typical chat session). ``maxsize`` (default 256)
+  caps memory; LRU evicts least-recently-used on overflow.
+* In-flight de-duplication: per-key :class:`asyncio.Lock` so concurrent
+  cold misses on the same key wait for the first build instead of
+  building N times.
+* Process-local: this is an in-memory cache. Multi-replica deployments
+  pay the build cost once per replica per key. That's fine; the working
+  set per replica is small (one entry per active thread on that replica).
+
+Telemetry
+---------
+
+Every lookup logs ``[agent_cache]`` lines through ``surfsense.perf``:
+
+  * ``hit`` — cache hit, microseconds-fast
+  * ``miss`` — first build for this key, includes build duration
+  * ``stale`` — entry was found but expired; rebuilt
+  * ``evict`` — LRU eviction (size-limited)
+  * ``size`` — current cache occupancy at lookup time
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import logging
+import os
+import time
+from collections import OrderedDict
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from typing import Any
+
+from app.utils.perf import get_perf_logger
+
+logger = logging.getLogger(__name__)
+_perf_log = get_perf_logger()
+
+
+# ---------------------------------------------------------------------------
+# Public API: signature helpers (cache key components)
+# ---------------------------------------------------------------------------
+
+
+def stable_hash(*parts: Any) -> str:
+    """Compute a deterministic SHA1 of the str repr of ``parts``.
+
+    Used for cache key components that need a fixed-width representation
+    (system prompt, tool list, etc.). SHA1 is fine here — this is not a
+    security boundary, just a content fingerprint.
+    """
+    h = hashlib.sha1(usedforsecurity=False)
+    for p in parts:
+        h.update(repr(p).encode("utf-8", errors="replace"))
+        h.update(b"\x1f")  # ASCII unit separator between parts
+    return h.hexdigest()
+
+
+def tools_signature(
+    tools: list[Any] | tuple[Any, ...],
+    *,
+    available_connectors: list[str] | None,
+    available_document_types: list[str] | None,
+) -> str:
+    """Hash the bound-tool surface for cache-key purposes.
+
+    The signature changes whenever:
+
+    * A tool is added or removed from the bound list (built-in toggles,
+      MCP tools loaded for the user changes, gating rules flip, etc.).
+    * The available connectors / document types for the search space
+      change (new connector added, last connector removed, new document
+      type indexed). Because :func:`get_connector_gated_tools` derives
+      ``modified_disabled_tools`` from ``available_connectors``, the
+      tool surface is technically already covered — but we hash the
+      connector list separately so an empty-list "no tools changed"
+      situation still rotates the key when, say, the user re-adds a
+      connector that gates a tool we were already not exposing.
+
+    Stays stable across:
+
+    * Process restarts (tool names + descriptions are static).
+    * Different replicas (everyone gets the same hash for the same
+      inputs).
+    """
+    tool_descriptors = sorted(
+        (getattr(t, "name", repr(t)), getattr(t, "description", "")) for t in tools
+    )
+    connectors = sorted(available_connectors or [])
+    doc_types = sorted(available_document_types or [])
+    return stable_hash(tool_descriptors, connectors, doc_types)
+
+
+def flags_signature(flags: Any) -> str:
+    """Hash the resolved :class:`AgentFeatureFlags` dataclass.
+
+    Frozen dataclasses are deterministically reprable, so a SHA1 of their
+    repr is a stable fingerprint. Restart safe (flags are read once at
+    process boot).
+    """
+    return stable_hash(repr(flags))
+
+
+def system_prompt_hash(system_prompt: str) -> str:
+    """Hash a system prompt string. Cheap, ~30µs for typical prompts."""
+    return hashlib.sha1(
+        system_prompt.encode("utf-8", errors="replace"),
+        usedforsecurity=False,
+    ).hexdigest()
+
+
+# ---------------------------------------------------------------------------
+# Cache implementation
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _Entry:
+    value: Any
+    created_at: float
+    last_used_at: float
+
+
+class _AgentCache:
+    """In-process TTL-LRU cache with per-key in-flight de-duplication.
+
+    NOT THREAD-SAFE in the multithreading sense — designed for a single
+    asyncio event loop. Uvicorn runs one event loop per worker process,
+    so this is fine; multi-worker deployments simply each maintain their
+    own cache.
+    """
+
+    def __init__(self, *, maxsize: int, ttl_seconds: float) -> None:
+        self._maxsize = maxsize
+        self._ttl = ttl_seconds
+        self._entries: OrderedDict[str, _Entry] = OrderedDict()
+        # One lock per key — guards "build" so concurrent cold misses on
+        # the same key wait for the first build instead of all racing.
+        self._locks: dict[str, asyncio.Lock] = {}
+
+    def _now(self) -> float:
+        return time.monotonic()
+
+    def _is_fresh(self, entry: _Entry) -> bool:
+        return (self._now() - entry.created_at) < self._ttl
+
+    def _evict_if_full(self) -> None:
+        while len(self._entries) >= self._maxsize:
+            evicted_key, _ = self._entries.popitem(last=False)
+            self._locks.pop(evicted_key, None)
+            _perf_log.info(
+                "[agent_cache] evict key=%s reason=lru size=%d",
+                _short(evicted_key),
+                len(self._entries),
+            )
+
+    def _touch(self, key: str, entry: _Entry) -> None:
+        entry.last_used_at = self._now()
+        self._entries.move_to_end(key, last=True)
+
+    async def get_or_build(
+        self,
+        key: str,
+        *,
+        builder: Callable[[], Awaitable[Any]],
+    ) -> Any:
+        """Return the cached value for ``key`` or call ``builder()`` to make it.
+
+        ``builder`` MUST be idempotent — concurrent cold misses on the
+        same key collapse to a single ``builder()`` call (the others
+        wait on the in-flight lock and observe the populated entry on
+        wake).
+        """
+        # Fast path: hot hit.
+        entry = self._entries.get(key)
+        if entry is not None and self._is_fresh(entry):
+            self._touch(key, entry)
+            _perf_log.info(
+                "[agent_cache] hit key=%s age=%.1fs size=%d",
+                _short(key),
+                self._now() - entry.created_at,
+                len(self._entries),
+            )
+            return entry.value
+
+        # Stale entry — drop it; rebuild below.
+        if entry is not None and not self._is_fresh(entry):
+            _perf_log.info(
+                "[agent_cache] stale key=%s age=%.1fs ttl=%.0fs",
+                _short(key),
+                self._now() - entry.created_at,
+                self._ttl,
+            )
+            self._entries.pop(key, None)
+
+        # Slow path: serialize concurrent misses for the same key.
+        lock = self._locks.setdefault(key, asyncio.Lock())
+        async with lock:
+            # Double-check after acquiring the lock — another waiter may
+            # have populated the entry while we slept.
+            entry = self._entries.get(key)
+            if entry is not None and self._is_fresh(entry):
+                self._touch(key, entry)
+                _perf_log.info(
+                    "[agent_cache] hit key=%s age=%.1fs size=%d coalesced=true",
+                    _short(key),
+                    self._now() - entry.created_at,
+                    len(self._entries),
+                )
+                return entry.value
+
+            t0 = time.perf_counter()
+            try:
+                value = await builder()
+            except BaseException:
+                # Don't cache failed builds; let the next caller retry.
+                _perf_log.warning(
+                    "[agent_cache] build_failed key=%s elapsed=%.3fs",
+                    _short(key),
+                    time.perf_counter() - t0,
+                )
+                raise
+            elapsed = time.perf_counter() - t0
+
+            # Insert + evict.
+            self._evict_if_full()
+            now = self._now()
+            self._entries[key] = _Entry(value=value, created_at=now, last_used_at=now)
+            self._entries.move_to_end(key, last=True)
+            _perf_log.info(
+                "[agent_cache] miss key=%s build=%.3fs size=%d",
+                _short(key),
+                elapsed,
+                len(self._entries),
+            )
+            return value
+
+    def invalidate(self, key: str) -> bool:
+        """Drop a single entry; return True if anything was removed."""
+        removed = self._entries.pop(key, None) is not None
+        self._locks.pop(key, None)
+        if removed:
+            _perf_log.info(
+                "[agent_cache] invalidate key=%s size=%d",
+                _short(key),
+                len(self._entries),
+            )
+        return removed
+
+    def invalidate_prefix(self, prefix: str) -> int:
+        """Drop every entry whose key starts with ``prefix``. Returns count."""
+        keys = [k for k in self._entries if k.startswith(prefix)]
+        for k in keys:
+            self._entries.pop(k, None)
+            self._locks.pop(k, None)
+        if keys:
+            _perf_log.info(
+                "[agent_cache] invalidate_prefix prefix=%s removed=%d size=%d",
+                _short(prefix),
+                len(keys),
+                len(self._entries),
+            )
+        return len(keys)
+
+    def clear(self) -> None:
+        n = len(self._entries)
+        self._entries.clear()
+        self._locks.clear()
+        if n:
+            _perf_log.info("[agent_cache] clear removed=%d", n)
+
+    def stats(self) -> dict[str, Any]:
+        return {
+            "size": len(self._entries),
+            "maxsize": self._maxsize,
+            "ttl_seconds": self._ttl,
+        }
+
+
+def _short(key: str, n: int = 16) -> str:
+    """Truncate keys for log lines so they don't blow up log volume."""
+    return key if len(key) <= n else f"{key[:n]}..."
+
+
+# ---------------------------------------------------------------------------
+# Module-level singleton
+# ---------------------------------------------------------------------------
+
+_DEFAULT_MAXSIZE = int(os.getenv("SURFSENSE_AGENT_CACHE_MAXSIZE", "256"))
+_DEFAULT_TTL = float(os.getenv("SURFSENSE_AGENT_CACHE_TTL_SECONDS", "1800"))
+
+_cache: _AgentCache = _AgentCache(maxsize=_DEFAULT_MAXSIZE, ttl_seconds=_DEFAULT_TTL)
+
+
+def get_cache() -> _AgentCache:
+    """Return the process-wide compiled-agent cache singleton."""
+    return _cache
+
+
+def reload_for_tests(*, maxsize: int = 256, ttl_seconds: float = 1800.0) -> _AgentCache:
+    """Replace the singleton with a fresh cache. Tests only."""
+    global _cache
+    _cache = _AgentCache(maxsize=maxsize, ttl_seconds=ttl_seconds)
+    return _cache
+
+
+__all__ = [
+    "flags_signature",
+    "get_cache",
+    "reload_for_tests",
+    "stable_hash",
+    "system_prompt_hash",
+    "tools_signature",
+]
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index fc1e80d28e..605c314166 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -10,7 +10,9 @@
 This lets us swap in ``SurfSenseFilesystemMiddleware`` — a customisable
 subclass of the default ``FilesystemMiddleware`` — while preserving every
 other behaviour that ``create_deep_agent`` provides (todo-list, subagents,
-summarisation, prompt-caching, etc.).
+summarisation, etc.). Prompt caching is configured at LLM-build time via
+``apply_litellm_prompt_caching`` (LiteLLM-native, multi-provider) rather
+than as a middleware.
 """
 
 import asyncio
@@ -23,34 +25,120 @@
 from deepagents.backends import StateBackend
 from deepagents.graph import BASE_AGENT_PROMPT
 from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+from deepagents.middleware.skills import SkillsMiddleware
 from deepagents.middleware.subagents import GENERAL_PURPOSE_SUBAGENT
-from deepagents.middleware.summarization import create_summarization_middleware
 from langchain.agents import create_agent
-from langchain.agents.middleware import TodoListMiddleware
-from langchain_anthropic.middleware import AnthropicPromptCachingMiddleware
+from langchain.agents.middleware import (
+    LLMToolSelectorMiddleware,
+    ModelCallLimitMiddleware,
+    TodoListMiddleware,
+    ToolCallLimitMiddleware,
+)
 from langchain_core.language_models import BaseChatModel
 from langchain_core.tools import BaseTool
 from langgraph.types import Checkpointer
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.agent_cache import (
+    flags_signature,
+    get_cache,
+    stable_hash,
+    system_prompt_hash,
+    tools_signature,
+)
 from app.agents.new_chat.context import SurfSenseContextSchema
+from app.agents.new_chat.feature_flags import AgentFeatureFlags, get_flags
+from app.agents.new_chat.filesystem_backends import build_backend_resolver
+from app.agents.new_chat.filesystem_selection import FilesystemMode, FilesystemSelection
 from app.agents.new_chat.llm_config import AgentConfig
 from app.agents.new_chat.middleware import (
+    ActionLogMiddleware,
+    AnonymousDocumentMiddleware,
+    BusyMutexMiddleware,
+    ClearToolUsesEdit,
     DedupHITLToolCallsMiddleware,
-    KnowledgeBaseSearchMiddleware,
+    DoomLoopMiddleware,
+    FileIntentMiddleware,
+    FlattenSystemMessageMiddleware,
+    KnowledgeBasePersistenceMiddleware,
+    KnowledgePriorityMiddleware,
+    KnowledgeTreeMiddleware,
+    MemoryInjectionMiddleware,
+    NoopInjectionMiddleware,
+    OtelSpanMiddleware,
+    PermissionMiddleware,
+    RetryAfterMiddleware,
+    SpillingContextEditingMiddleware,
+    SpillToBackendEdit,
     SurfSenseFilesystemMiddleware,
+    ToolCallNameRepairMiddleware,
+    build_skills_backend_factory,
+    create_surfsense_compaction_middleware,
+    default_skills_sources,
+)
+from app.agents.new_chat.middleware.scoped_model_fallback import (
+    ScopedModelFallbackMiddleware,
 )
+from app.agents.new_chat.permissions import Rule, Ruleset
+from app.agents.new_chat.plugin_loader import (
+    PluginContext,
+    load_allowed_plugin_names_from_env,
+    load_plugin_middlewares,
+)
+from app.agents.new_chat.prompt_caching import apply_litellm_prompt_caching
+from app.agents.new_chat.subagents import build_specialized_subagents
 from app.agents.new_chat.system_prompt import (
     build_configurable_system_prompt,
     build_surfsense_system_prompt,
 )
-from app.agents.new_chat.tools.registry import build_tools_async
+from app.agents.new_chat.tools.invalid_tool import (
+    INVALID_TOOL_NAME,
+    invalid_tool,
+)
+from app.agents.new_chat.tools.registry import (
+    BUILTIN_TOOLS,
+    build_tools_async,
+    get_connector_gated_tools,
+)
 from app.db import ChatVisibility
 from app.services.connector_service import ConnectorService
 from app.utils.perf import get_perf_logger
 
 _perf_log = get_perf_logger()
 
+
+def _resolve_prompt_model_name(
+    agent_config: AgentConfig | None,
+    llm: BaseChatModel,
+) -> str | None:
+    """Resolve the model id to feed to provider-variant detection.
+
+    Preference order (matches the established idiom in
+    ``llm_router_service.py`` — see ``params.get("base_model") or
+    params.get("model", "")`` usages there):
+
+    1. ``agent_config.litellm_params["base_model"]`` — required for Azure
+       deployments where ``model_name`` is the deployment slug, not the
+       underlying family. Without this, a deployment named e.g.
+       ``"prod-chat-001"`` would silently miss every provider regex.
+    2. ``agent_config.model_name`` — the user's configured model id.
+    3. ``getattr(llm, "model", None)`` — fallback for direct callers that
+       don't supply an ``AgentConfig`` (currently a defensive path; all
+       production callers pass ``agent_config``).
+
+    Returns ``None`` when nothing is available; ``compose_system_prompt``
+    treats that as the ``"default"`` variant (no provider block emitted).
+    """
+    if agent_config is not None:
+        params = agent_config.litellm_params or {}
+        base_model = params.get("base_model")
+        if isinstance(base_model, str) and base_model.strip():
+            return base_model
+        if agent_config.model_name:
+            return agent_config.model_name
+    return getattr(llm, "model", None)
+
+
 # =============================================================================
 # Connector Type Mapping
 # =============================================================================
@@ -160,6 +248,8 @@ async def create_surfsense_deep_agent(
     firecrawl_api_key: str | None = None,
     thread_visibility: ChatVisibility | None = None,
     mentioned_document_ids: list[int] | None = None,
+    anon_session_id: str | None = None,
+    filesystem_selection: FilesystemSelection | None = None,
 ):
     """
     Create a SurfSense deep agent with configurable tools and prompts.
@@ -168,8 +258,7 @@ async def create_surfsense_deep_agent(
     - generate_podcast: Generate audio podcasts from content
     - generate_image: Generate images from text descriptions using AI models
     - scrape_webpage: Extract content from webpages
-    - save_memory: Store facts/preferences about the user
-    - recall_memory: Retrieve relevant user memories
+    - update_memory: Update the user's personal or team memory document
 
     The agent also includes TodoListMiddleware by default (via create_deep_agent) which provides:
     - write_todos: Create and update planning/todo lists for complex tasks
@@ -236,23 +325,54 @@ async def create_surfsense_deep_agent(
     """
     _t_agent_total = time.perf_counter()
 
-    # Discover available connectors and document types for this search space
+    # Layer thread-aware prompt caching onto the LLM. Idempotent with the
+    # build-time call in ``llm_config.py``; this run merely adds
+    # ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` for OpenAI-family
+    # configs now that ``thread_id`` is known. No-op when ``thread_id`` is
+    # None or the provider is non-OpenAI-family.
+    apply_litellm_prompt_caching(llm, agent_config=agent_config, thread_id=thread_id)
+
+    filesystem_selection = filesystem_selection or FilesystemSelection()
+    backend_resolver = build_backend_resolver(
+        filesystem_selection,
+        search_space_id=search_space_id
+        if filesystem_selection.mode == FilesystemMode.CLOUD
+        else None,
+    )
+
+    # Discover available connectors and document types for this search space.
+    #
+    # NOTE: These two calls cannot be parallelized via ``asyncio.gather``.
+    # ``ConnectorService`` shares a single ``AsyncSession`` (``self.session``);
+    # SQLAlchemy explicitly forbids concurrent operations on the same session
+    # ("This session is provisioning a new connection; concurrent operations
+    # are not permitted on the same session"). The Phase 1.4 in-process TTL
+    # cache in ``connector_service`` already collapses the warm path to a
+    # near-zero pair of dict lookups, so sequential awaits cost nothing in
+    # the common case while remaining correct on cold cache misses.
     available_connectors: list[str] | None = None
     available_document_types: list[str] | None = None
 
     _t0 = time.perf_counter()
     try:
-        connector_types = await connector_service.get_available_connectors(
-            search_space_id
-        )
-        if connector_types:
-            available_connectors = _map_connectors_to_searchable_types(connector_types)
-
-        available_document_types = await connector_service.get_available_document_types(
-            search_space_id
-        )
+        try:
+            connector_types_result = await connector_service.get_available_connectors(
+                search_space_id
+            )
+            if connector_types_result:
+                available_connectors = _map_connectors_to_searchable_types(
+                    connector_types_result
+                )
+        except Exception as e:
+            logging.warning("Failed to discover available connectors: %s", e)
 
-    except Exception as e:
+        try:
+            available_document_types = (
+                await connector_service.get_available_document_types(search_space_id)
+            )
+        except Exception as e:
+            logging.warning("Failed to discover available document types: %s", e)
+    except Exception as e:  # pragma: no cover - defensive outer guard
         logging.warning(f"Failed to discover available connectors/document types: {e}")
     _perf_log.info(
         "[create_agent] Connector/doc-type discovery in %.3fs",
@@ -281,109 +401,15 @@ async def create_surfsense_deep_agent(
         "available_connectors": available_connectors,
         "available_document_types": available_document_types,
         "max_input_tokens": _max_input_tokens,
+        "llm": llm,
     }
 
-    # Disable Notion action tools if no Notion connector is configured
     modified_disabled_tools = list(disabled_tools) if disabled_tools else []
-    has_notion_connector = (
-        available_connectors is not None and "NOTION_CONNECTOR" in available_connectors
-    )
-    if not has_notion_connector:
-        notion_tools = [
-            "create_notion_page",
-            "update_notion_page",
-            "delete_notion_page",
-        ]
-        modified_disabled_tools.extend(notion_tools)
+    modified_disabled_tools.extend(get_connector_gated_tools(available_connectors))
 
-    # Disable Linear action tools if no Linear connector is configured
-    has_linear_connector = (
-        available_connectors is not None and "LINEAR_CONNECTOR" in available_connectors
-    )
-    if not has_linear_connector:
-        linear_tools = [
-            "create_linear_issue",
-            "update_linear_issue",
-            "delete_linear_issue",
-        ]
-        modified_disabled_tools.extend(linear_tools)
-
-    # Disable Google Drive action tools if no Google Drive connector is configured
-    has_google_drive_connector = (
-        available_connectors is not None and "GOOGLE_DRIVE_FILE" in available_connectors
-    )
-    if not has_google_drive_connector:
-        google_drive_tools = [
-            "create_google_drive_file",
-            "delete_google_drive_file",
-        ]
-        modified_disabled_tools.extend(google_drive_tools)
-
-    has_dropbox_connector = (
-        available_connectors is not None and "DROPBOX_FILE" in available_connectors
-    )
-    if not has_dropbox_connector:
-        modified_disabled_tools.extend(["create_dropbox_file", "delete_dropbox_file"])
-
-    has_onedrive_connector = (
-        available_connectors is not None and "ONEDRIVE_FILE" in available_connectors
-    )
-    if not has_onedrive_connector:
-        modified_disabled_tools.extend(["create_onedrive_file", "delete_onedrive_file"])
-
-    # Disable Google Calendar action tools if no Google Calendar connector is configured
-    has_google_calendar_connector = (
-        available_connectors is not None
-        and "GOOGLE_CALENDAR_CONNECTOR" in available_connectors
-    )
-    if not has_google_calendar_connector:
-        calendar_tools = [
-            "create_calendar_event",
-            "update_calendar_event",
-            "delete_calendar_event",
-        ]
-        modified_disabled_tools.extend(calendar_tools)
-
-    # Disable Gmail action tools if no Gmail connector is configured
-    has_gmail_connector = (
-        available_connectors is not None
-        and "GOOGLE_GMAIL_CONNECTOR" in available_connectors
-    )
-    if not has_gmail_connector:
-        gmail_tools = [
-            "create_gmail_draft",
-            "update_gmail_draft",
-            "send_gmail_email",
-            "trash_gmail_email",
-        ]
-        modified_disabled_tools.extend(gmail_tools)
-
-    # Disable Jira action tools if no Jira connector is configured
-    has_jira_connector = (
-        available_connectors is not None and "JIRA_CONNECTOR" in available_connectors
-    )
-    if not has_jira_connector:
-        jira_tools = [
-            "create_jira_issue",
-            "update_jira_issue",
-            "delete_jira_issue",
-        ]
-        modified_disabled_tools.extend(jira_tools)
-
-    # Disable Confluence action tools if no Confluence connector is configured
-    has_confluence_connector = (
-        available_connectors is not None
-        and "CONFLUENCE_CONNECTOR" in available_connectors
-    )
-    if not has_confluence_connector:
-        confluence_tools = [
-            "create_confluence_page",
-            "update_confluence_page",
-            "delete_confluence_page",
-        ]
-        modified_disabled_tools.extend(confluence_tools)
-
-    # Remove direct KB search tool; we now pre-seed a scoped filesystem via middleware.
+    # Remove direct KB search tool; KnowledgePriorityMiddleware now runs hybrid
+    # search per turn and surfaces hits as a <priority_documents> hint plus
+    # `<chunk_index matched="true">` markers inside lazy-loaded XML.
     if "search_knowledge_base" not in modified_disabled_tools:
         modified_disabled_tools.append("search_knowledge_base")
 
@@ -395,6 +421,18 @@ async def create_surfsense_deep_agent(
         disabled_tools=modified_disabled_tools,
         additional_tools=list(additional_tools) if additional_tools else None,
     )
+
+    # Register the ``invalid`` tool only when tool-call repair is on. It
+    # is dispatched only when :class:`ToolCallNameRepairMiddleware`
+    # rewrites a malformed call. We intentionally append it AFTER
+    # ``build_tools_async`` so it never appears in the system-prompt
+    # tool list (which is built from the registry, not the bound tool
+    # list).
+    _flags: AgentFeatureFlags = get_flags()
+    if _flags.enable_tool_call_repair and INVALID_TOOL_NAME not in {
+        t.name for t in tools
+    }:
+        tools = [*list(tools), invalid_tool]
     _perf_log.info(
         "[create_agent] build_tools_async in %.3fs (%d tools)",
         time.perf_counter() - _t0,
@@ -405,6 +443,21 @@ async def create_surfsense_deep_agent(
     _t0 = time.perf_counter()
     _enabled_tool_names = {t.name for t in tools}
     _user_disabled_tool_names = set(disabled_tools) if disabled_tools else set()
+
+    # Collect generic MCP connector info so the system prompt can route queries
+    # to their tools instead of falling back to "not in knowledge base".
+    _mcp_connector_tools: dict[str, list[str]] = {}
+    for t in tools:
+        meta = getattr(t, "metadata", None) or {}
+        if meta.get("mcp_is_generic") and meta.get("mcp_connector_name"):
+            _mcp_connector_tools.setdefault(
+                meta["mcp_connector_name"],
+                [],
+            ).append(t.name)
+
+    if _mcp_connector_tools:
+        _perf_log.info("MCP connector tool routing: %s", _mcp_connector_tools)
+
     if agent_config is not None:
         system_prompt = build_configurable_system_prompt(
             custom_system_instructions=agent_config.system_instructions,
@@ -413,28 +466,227 @@ async def create_surfsense_deep_agent(
             thread_visibility=thread_visibility,
             enabled_tool_names=_enabled_tool_names,
             disabled_tool_names=_user_disabled_tool_names,
+            mcp_connector_tools=_mcp_connector_tools,
+            model_name=_resolve_prompt_model_name(agent_config, llm),
         )
     else:
         system_prompt = build_surfsense_system_prompt(
             thread_visibility=thread_visibility,
             enabled_tool_names=_enabled_tool_names,
             disabled_tool_names=_user_disabled_tool_names,
+            mcp_connector_tools=_mcp_connector_tools,
+            model_name=_resolve_prompt_model_name(agent_config, llm),
         )
     _perf_log.info(
         "[create_agent] System prompt built in %.3fs", time.perf_counter() - _t0
     )
 
-    # -- Build the middleware stack (mirrors create_deep_agent internals) ------
+    # Combine system_prompt with BASE_AGENT_PROMPT (same as create_deep_agent)
+    final_system_prompt = system_prompt + "\n\n" + BASE_AGENT_PROMPT
+
+    # The middleware stack — and especially ``SubAgentMiddleware`` — is *not*
+    # cheap to build. ``SubAgentMiddleware.__init__`` calls ``create_agent``
+    # synchronously to compile the general-purpose subagent's full state graph
+    # (every tool + every middleware → pydantic schemas + langgraph compile).
+    # On gpt-5.x agents that's roughly 1.5-2s of pure CPU work. If we run it
+    # directly here it blocks the asyncio event loop for the whole streaming
+    # task (and any other coroutine sharing this loop), which is why
+    # "agent creation" wall-clock time used to stretch to ~3-4s. Move the
+    # entire middleware build + main-graph compile into a single
+    # ``asyncio.to_thread`` so the heavy CPU work runs off-loop and the
+    # event loop stays responsive.
+    #
+    # PHASE 1: cache the resulting compiled graph. ``agent_cache`` is keyed
+    # on every per-request value that any middleware in the stack closes
+    # over in ``__init__`` — drop one and you risk leaking state across
+    # threads. Hits collapse this whole block to a microsecond lookup;
+    # misses pay the original CPU cost AND populate the cache.
+    config_id = agent_config.config_id if agent_config is not None else None
+
+    async def _build_agent() -> Any:
+        return await asyncio.to_thread(
+            _build_compiled_agent_blocking,
+            llm=llm,
+            tools=tools,
+            final_system_prompt=final_system_prompt,
+            backend_resolver=backend_resolver,
+            filesystem_mode=filesystem_selection.mode,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            thread_id=thread_id,
+            visibility=visibility,
+            anon_session_id=anon_session_id,
+            available_connectors=available_connectors,
+            available_document_types=available_document_types,
+            # ``mentioned_document_ids`` is consumed by
+            # ``KnowledgePriorityMiddleware`` per turn via
+            # ``runtime.context`` (Phase 1.5). We still pass the
+            # caller-provided list here for the legacy fallback path
+            # (cache disabled / context not propagated) — the middleware
+            # drains its own copy after the first read so a cached graph
+            # never replays stale mentions.
+            mentioned_document_ids=mentioned_document_ids,
+            max_input_tokens=_max_input_tokens,
+            flags=_flags,
+            checkpointer=checkpointer,
+        )
+
+    _t0 = time.perf_counter()
+    if _flags.enable_agent_cache and not _flags.disable_new_agent_stack:
+        # Cache key components — order matters only for human readability;
+        # the resulting hash is what's stored. Every component must
+        # rotate on a real shape change AND stay stable across identical
+        # invocations.
+        cache_key = stable_hash(
+            "v1",  # schema version of the key — bump if components change
+            config_id,
+            thread_id,
+            user_id,
+            search_space_id,
+            visibility,
+            filesystem_selection.mode,
+            anon_session_id,
+            tools_signature(
+                tools,
+                available_connectors=available_connectors,
+                available_document_types=available_document_types,
+            ),
+            flags_signature(_flags),
+            system_prompt_hash(final_system_prompt),
+            _max_input_tokens,
+            # ``mentioned_document_ids`` deliberately omitted — middleware
+            # reads it from ``runtime.context`` (Phase 1.5).
+        )
+        agent = await get_cache().get_or_build(cache_key, builder=_build_agent)
+    else:
+        agent = await _build_agent()
+    _perf_log.info(
+        "[create_agent] Middleware stack + graph compiled in %.3fs (cache=%s)",
+        time.perf_counter() - _t0,
+        "on"
+        if _flags.enable_agent_cache and not _flags.disable_new_agent_stack
+        else "off",
+    )
+
+    _perf_log.info(
+        "[create_agent] Total agent creation in %.3fs",
+        time.perf_counter() - _t_agent_total,
+    )
+    return agent
+
+
+# Tools whose output is too costly / lossy to discard. Keep this
+# conservative — anything listed here is *never* pruned by
+# :class:`ContextEditingMiddleware`. The list is filtered against
+# actually-bound tool names so disabled connectors don't show up here.
+_PRUNE_PROTECTED_TOOL_NAMES: frozenset[str] = frozenset(
+    {
+        "generate_report",
+        "generate_resume",
+        "generate_podcast",
+        "generate_video_presentation",
+        "generate_image",
+        # Read-heavy connector reads — recomputing them is expensive
+        "read_email",
+        "search_emails",
+        # The fallback for malformed tool calls — keep its replies visible
+        "invalid",
+    }
+)
+
+
+def _safe_exclude_tools(tools: Sequence[BaseTool]) -> tuple[str, ...]:
+    """Return ``exclude_tools`` derived from the actually-bound tool list.
+
+    Filters :data:`_PRUNE_PROTECTED_TOOL_NAMES` against the bound tools
+    so we never list tools that don't exist (would be a silent no-op).
+    """
+    enabled = {t.name for t in tools}
+    return tuple(name for name in _PRUNE_PROTECTED_TOOL_NAMES if name in enabled)
+
+
+# Connector gating: any tool whose ``ToolDefinition.required_connector``
+# isn't actually wired up gets a synthesized permission deny rule so
+# execution attempts short-circuit with ``permission_denied`` instead of
+# bubbling up provider-specific 401/404 errors. Mirrors OpenCode's
+# ``Permission.disabled`` (declarative, per-tool gating) — replaces the
+# legacy binary ``_CONNECTOR_TYPE_TO_SEARCHABLE`` substring-heuristic.
+def _synthesize_connector_deny_rules(
+    *,
+    available_connectors: list[str] | None,
+    enabled_tool_names: set[str],
+) -> list[Rule]:
+    """Build deny rules for tools whose required connector is not enabled.
+
+    Source of truth is ``ToolDefinition.required_connector`` in
+    :data:`BUILTIN_TOOLS`. A tool only gets a deny rule when:
+
+    1. It is currently bound (``enabled_tool_names``).
+    2. It declares a ``required_connector``.
+    3. That connector is *not* in ``available_connectors``.
+    """
+    available = set(available_connectors or [])
+    deny: list[Rule] = []
+    for tool_def in BUILTIN_TOOLS:
+        if tool_def.name not in enabled_tool_names:
+            continue
+        rc = tool_def.required_connector
+        if rc and rc not in available:
+            deny.append(Rule(permission=tool_def.name, pattern="*", action="deny"))
+    return deny
+
+
+def _build_compiled_agent_blocking(
+    *,
+    llm: BaseChatModel,
+    tools: Sequence[BaseTool],
+    final_system_prompt: str,
+    backend_resolver: Any,
+    filesystem_mode: FilesystemMode,
+    search_space_id: int,
+    user_id: str | None,
+    thread_id: int | None,
+    visibility: ChatVisibility,
+    anon_session_id: str | None,
+    available_connectors: list[str] | None,
+    available_document_types: list[str] | None,
+    mentioned_document_ids: list[int] | None,
+    max_input_tokens: int | None,
+    flags: AgentFeatureFlags,
+    checkpointer: Checkpointer,
+):
+    """Build the middleware stack and compile the agent graph synchronously.
+
+    Runs in a worker thread (see ``asyncio.to_thread`` call site) so the heavy
+    CPU work — most notably ``SubAgentMiddleware.__init__`` eagerly calling
+    ``create_agent`` to compile the general-purpose subagent — does not block
+    the event loop.
+    """
+    _memory_middleware = MemoryInjectionMiddleware(
+        user_id=user_id,
+        search_space_id=search_space_id,
+        thread_visibility=visibility,
+    )
+
     # General-purpose subagent middleware
+    # Subagent omits AnonymousDocumentMiddleware, KnowledgeTreeMiddleware,
+    # KnowledgePriorityMiddleware, and KnowledgeBasePersistenceMiddleware - it
+    # inherits state and tools from the parent, but should not (a) re-load
+    # anon docs / re-render the tree / re-run hybrid search, or (b) commit at
+    # its own completion (only the top-level agent's aafter_agent commits).
     gp_middleware = [
         TodoListMiddleware(),
+        _memory_middleware,
+        FileIntentMiddleware(llm=llm),
         SurfSenseFilesystemMiddleware(
+            backend=backend_resolver,
+            filesystem_mode=filesystem_mode,
             search_space_id=search_space_id,
             created_by_id=user_id,
+            thread_id=thread_id,
         ),
-        create_summarization_middleware(llm, StateBackend),
+        create_surfsense_compaction_middleware(llm, StateBackend),
         PatchToolCallsMiddleware(),
-        AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore"),
     ]
 
     general_purpose_spec: SubAgent = {  # type: ignore[typeddict-unknown-key]
@@ -444,41 +696,464 @@ async def create_surfsense_deep_agent(
         "middleware": gp_middleware,
     }
 
+    # Specialized user-facing subagents (explore, report_writer,
+    # connector_negotiator). Registered through SubAgentMiddleware alongside
+    # the general-purpose spec so the parent's `task` tool can address them
+    # by name. Off by default until the flag flips so existing deployments
+    # don't see new agent types in the task tool description.
+    specialized_subagents: list[SubAgent] = []
+    if flags.enable_specialized_subagents and not flags.disable_new_agent_stack:
+        try:
+            # Specialized subagents share the parent's filesystem +
+            # todo view so their system prompts (which promise
+            # ``read_file``, ``ls``, ``grep``, ``glob``, ``write_todos``)
+            # actually match runtime behavior. Build *fresh* instances
+            # rather than aliasing the parent's GP middleware to avoid
+            # subtle state coupling across compiled graphs.
+            subagent_extra_middleware: list = [
+                TodoListMiddleware(),
+                SurfSenseFilesystemMiddleware(
+                    backend=backend_resolver,
+                    filesystem_mode=filesystem_mode,
+                    search_space_id=search_space_id,
+                    created_by_id=user_id,
+                    thread_id=thread_id,
+                ),
+            ]
+            specialized_subagents = build_specialized_subagents(
+                tools=tools,
+                model=llm,
+                extra_middleware=subagent_extra_middleware,
+            )
+            logging.info(
+                "Specialized subagents registered for task tool: %s",
+                [s["name"] for s in specialized_subagents],
+            )
+        except Exception as exc:  # pragma: no cover - defensive
+            logging.warning(
+                "Specialized subagent build failed; running without them: %s",
+                exc,
+            )
+            specialized_subagents = []
+
+    subagent_specs: list[SubAgent] = [general_purpose_spec, *specialized_subagents]
+
     # Main agent middleware
+    # Order: AnonDoc -> Tree -> Priority -> FileIntent -> Filesystem -> Persistence -> ...
+    # before_agent hooks run in declared order; later injections sit closer to
+    # the latest human turn. Tree (large + cacheable) is injected earliest so
+    # provider-side prefix caching has more material to hit; FileIntent (most
+    # actionable per-turn contract) is injected closest to the user message.
+    #
+    # ``wrap_model_call`` ordering: the FIRST middleware in the list is the
+    # OUTERMOST wrapper. To ensure prune executes before summarization,
+    # place ``SpillingContextEditingMiddleware`` before
+    # ``SurfSenseCompactionMiddleware``. Compaction is the canonical
+    # token-budget defense; the Bedrock buffer-empty defense is folded
+    # into ``SurfSenseCompactionMiddleware``.
+    summarization_mw = create_surfsense_compaction_middleware(llm, StateBackend)
+    _ = flags.enable_compaction_v2  # historical flag; retained for telemetry parity
+
+    # ContextEditing prune. Trigger at 55% of ``max_input_tokens``,
+    # earlier than summarization (~85%). When disabled, no edit runs.
+    context_edit_mw = None
+    if (
+        flags.enable_context_editing
+        and not flags.disable_new_agent_stack
+        and max_input_tokens
+    ):
+        spill_edit = SpillToBackendEdit(
+            trigger=int(max_input_tokens * 0.55),
+            clear_at_least=int(max_input_tokens * 0.15),
+            keep=5,
+            exclude_tools=_safe_exclude_tools(tools),
+            clear_tool_inputs=True,
+        )
+        clear_edit = ClearToolUsesEdit(
+            trigger=int(max_input_tokens * 0.55),
+            clear_at_least=int(max_input_tokens * 0.15),
+            keep=5,
+            exclude_tools=_safe_exclude_tools(tools),
+            clear_tool_inputs=True,
+            placeholder="[cleared - older tool output trimmed for context]",
+        )
+        context_edit_mw = SpillingContextEditingMiddleware(
+            edits=[spill_edit, clear_edit],
+            backend_resolver=backend_resolver,
+        )
+
+    # Resilience knobs: header-aware retry, model fallback, and
+    # per-thread / per-run call-count limits. The fallback / limit
+    # middlewares are vanilla LangChain primitives; ``RetryAfter`` is
+    # SurfSense's header-aware variant (see its module docstring).
+    retry_mw = (
+        RetryAfterMiddleware(max_retries=3)
+        if flags.enable_retry_after and not flags.disable_new_agent_stack
+        else None
+    )
+    # Fallback chain — primary is the agent's own model; we add cheap
+    # alternatives. Off by default; only the first call site that
+    # configures the chain via env should enable it.
+    fallback_mw: ScopedModelFallbackMiddleware | None = None
+    if flags.enable_model_fallback and not flags.disable_new_agent_stack:
+        try:
+            fallback_mw = ScopedModelFallbackMiddleware(
+                "openai:gpt-4o-mini",
+                "anthropic:claude-3-5-haiku-20241022",
+            )
+        except Exception:
+            logging.warning("ScopedModelFallbackMiddleware init failed; skipping.")
+            fallback_mw = None
+    model_call_limit_mw = (
+        ModelCallLimitMiddleware(
+            thread_limit=120,
+            run_limit=80,
+            exit_behavior="end",
+        )
+        if flags.enable_model_call_limit and not flags.disable_new_agent_stack
+        else None
+    )
+    tool_call_limit_mw = (
+        ToolCallLimitMiddleware(
+            thread_limit=300, run_limit=80, exit_behavior="continue"
+        )
+        if flags.enable_tool_call_limit and not flags.disable_new_agent_stack
+        else None
+    )
+
+    # Provider-compat ``_noop`` injection (mirrors OpenCode's
+    # ``llm.ts`` workaround for providers that reject empty assistant
+    # turns or alternating-role constraints).
+    noop_mw = (
+        NoopInjectionMiddleware()
+        if flags.enable_compaction_v2 and not flags.disable_new_agent_stack
+        else None
+    )
+
+    # Tool-call name repair (lowercase + ``invalid`` fallback).
+    #
+    # ``registered_tool_names`` MUST cover every tool the model can legitimately
+    # call. That includes the bound ``tools`` list AND every tool provided by
+    # middleware in the stack — ``FilesystemMiddleware`` (read_file, ls, grep,
+    # glob, edit_file, write_file, execute), ``TodoListMiddleware``
+    # (write_todos), ``SubAgentMiddleware`` (task), ``SkillsMiddleware`` (skill
+    # loaders), etc. If we only inspect ``tools`` here, every call to
+    # ``read_file`` / ``ls`` / ``grep`` from the model will be rewritten to
+    # ``invalid`` because the repair middleware doesn't recognize them. The
+    # built-in deepagents middleware aren't in scope yet at this point of the
+    # function but they're added unconditionally below, so we hard-code their
+    # canonical names alongside the dynamic ``tools`` set.
+    repair_mw = None
+    if flags.enable_tool_call_repair and not flags.disable_new_agent_stack:
+        registered_names: set[str] = {t.name for t in tools}
+        # Tools owned by the standard deepagents middleware stack and the
+        # SurfSense filesystem extension.
+        registered_names |= {
+            "write_todos",
+            "ls",
+            "read_file",
+            "write_file",
+            "edit_file",
+            "glob",
+            "grep",
+            "execute",
+            "task",
+            "mkdir",
+            "cd",
+            "pwd",
+            "move_file",
+            "rm",
+            "rmdir",
+            "list_tree",
+            "execute_code",
+        }
+        repair_mw = ToolCallNameRepairMiddleware(
+            registered_tool_names=registered_names,
+            # Disable fuzzy matching to avoid silent rewrites; the
+            # lowercase + ``invalid`` fallback alone covers >95% of
+            # observed model errors.
+            fuzzy_match_threshold=None,
+        )
+
+    # Doom-loop detector. Off by default until the frontend handles
+    # ``permission == "doom_loop"`` interrupts.
+    doom_loop_mw = (
+        DoomLoopMiddleware(threshold=3)
+        if flags.enable_doom_loop and not flags.disable_new_agent_stack
+        else None
+    )
+
+    # PermissionMiddleware. Layers, earliest -> latest (last match wins,
+    # same evaluation order as OpenCode's ``permission/index.ts``):
+    #
+    # 1. ``surfsense_defaults`` — single ``allow */*`` rule. SurfSense
+    #    already runs per-tool HITL (see ``tools/hitl.py``) for mutating
+    #    connector tools, so we only want PermissionMiddleware to *deny*
+    #    things the user has gated off; the default fallback in
+    #    ``permissions.evaluate`` is ``ask``, which would double-prompt
+    #    on every safe read-only call (``ls``, ``read_file``, ``grep``,
+    #    ``glob``, ``web_search`` …) and, on resume, replay the previous
+    #    reject decision into innocent calls.
+    # 2. ``desktop_safety`` — ``ask`` for destructive filesystem ops when
+    #    the agent is operating against the user's real disk. Cloud mode
+    #    has full revision-based revert via ``revert_service``, but
+    #    desktop mode hits disk immediately with no undo, so an
+    #    accidental ``rm`` / ``rmdir`` / ``move_file`` / ``edit_file`` /
+    #    ``write_file`` is unrecoverable. This layer is forced on in
+    #    desktop mode regardless of ``enable_permission`` because the
+    #    safety net is non-negotiable.
+    # 3. ``connector_synthesized`` — deny rules for tools whose required
+    #    connector is not connected to this space. Overrides #1/#2.
+    # 4. (future) user-defined rules from ``agent_permission_rules`` table
+    #    via the Agent Permissions UI. Loaded last so they override all.
+    permission_mw: PermissionMiddleware | None = None
+    is_desktop_fs = filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER
+    permission_enabled = flags.enable_permission and not flags.disable_new_agent_stack
+    # Build the middleware whenever it has work to do: either the user
+    # opted into the rule engine, OR we're in desktop mode and need the
+    # safety rules unconditionally.
+    if permission_enabled or is_desktop_fs:
+        rulesets: list[Ruleset] = [
+            Ruleset(
+                rules=[Rule(permission="*", pattern="*", action="allow")],
+                origin="surfsense_defaults",
+            ),
+        ]
+        if is_desktop_fs:
+            rulesets.append(
+                Ruleset(
+                    rules=[
+                        Rule(permission="rm", pattern="*", action="ask"),
+                        Rule(permission="rmdir", pattern="*", action="ask"),
+                        Rule(permission="move_file", pattern="*", action="ask"),
+                        Rule(permission="edit_file", pattern="*", action="ask"),
+                        Rule(permission="write_file", pattern="*", action="ask"),
+                    ],
+                    origin="desktop_safety",
+                )
+            )
+        if permission_enabled:
+            synthesized = _synthesize_connector_deny_rules(
+                available_connectors=available_connectors,
+                enabled_tool_names={t.name for t in tools},
+            )
+            rulesets.append(Ruleset(rules=synthesized, origin="connector_synthesized"))
+        permission_mw = PermissionMiddleware(rulesets=rulesets)
+
+    # ActionLogMiddleware. Off by default until the ``agent_action_log``
+    # table is migrated. When enabled, persists one row per tool call
+    # with optional reverse_descriptor for
+    # ``POST /api/threads/{thread_id}/revert/{action_id}``. Sits inside
+    # ``permission`` so denied calls aren't logged as completions.
+    action_log_mw: ActionLogMiddleware | None = None
+    if (
+        flags.enable_action_log
+        and not flags.disable_new_agent_stack
+        and thread_id is not None
+    ):
+        try:
+            tool_defs_by_name = {td.name: td for td in BUILTIN_TOOLS}
+            action_log_mw = ActionLogMiddleware(
+                thread_id=thread_id,
+                search_space_id=search_space_id,
+                user_id=user_id,
+                tool_definitions=tool_defs_by_name,
+            )
+        except Exception:  # pragma: no cover - defensive
+            logging.warning(
+                "ActionLogMiddleware init failed; running without it.",
+                exc_info=True,
+            )
+            action_log_mw = None
+
+    # Per-thread busy mutex (refuse a second concurrent turn on the same
+    # thread; see :class:`BusyMutexMiddleware` docstring).
+    busy_mutex_mw: BusyMutexMiddleware | None = (
+        BusyMutexMiddleware()
+        if flags.enable_busy_mutex and not flags.disable_new_agent_stack
+        else None
+    )
+
+    # OpenTelemetry spans (model.call + tool.call). Lives just inside
+    # BusyMutex so it spans every retry/fallback attempt of the current
+    # turn but never wraps a queued/blocked turn.
+    otel_mw: OtelSpanMiddleware | None = (
+        OtelSpanMiddleware()
+        if flags.enable_otel and not flags.disable_new_agent_stack
+        else None
+    )
+
+    # Plugin entry-point loader. Off by default; opt-in via the
+    # ``SURFSENSE_ENABLE_PLUGIN_LOADER`` flag. The allowlist is read from
+    # the ``SURFSENSE_ALLOWED_PLUGINS`` env var (comma-separated). A future
+    # PR can wire it through ``global_llm_config.yaml``.
+    plugin_middlewares: list[Any] = []
+    if flags.enable_plugin_loader and not flags.disable_new_agent_stack:
+        try:
+            allowed_names = load_allowed_plugin_names_from_env()
+            if allowed_names:
+                plugin_middlewares = load_plugin_middlewares(
+                    PluginContext.build(
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                        thread_visibility=visibility,
+                        llm=llm,
+                    ),
+                    allowed_plugin_names=allowed_names,
+                )
+        except Exception:  # pragma: no cover - defensive
+            logging.warning(
+                "Plugin loader failed; continuing without plugins.",
+                exc_info=True,
+            )
+            plugin_middlewares = []
+
+    # SkillsMiddleware (deepagents) loads built-in + space-authored
+    # skills via a CompositeBackend. Sources are layered: built-in first,
+    # space last, so a search-space-authored skill of the same name
+    # overrides the bundled one.
+    skills_mw: SkillsMiddleware | None = None
+    if flags.enable_skills and not flags.disable_new_agent_stack:
+        try:
+            skills_factory = build_skills_backend_factory(
+                search_space_id=search_space_id
+                if filesystem_mode == FilesystemMode.CLOUD
+                else None,
+            )
+            skills_mw = SkillsMiddleware(
+                backend=skills_factory,
+                sources=default_skills_sources(),
+            )
+        except Exception as exc:  # pragma: no cover - defensive
+            logging.warning("SkillsMiddleware init failed; skipping: %s", exc)
+            skills_mw = None
+
+    # LangChain's LLM-driven tool selection — only enabled for stacks
+    # large enough to need narrowing (>30 tools).
+    selector_mw: LLMToolSelectorMiddleware | None = None
+    if (
+        flags.enable_llm_tool_selector
+        and not flags.disable_new_agent_stack
+        and len(tools) > 30
+    ):
+        try:
+            selector_mw = LLMToolSelectorMiddleware(
+                model="openai:gpt-4o-mini",
+                max_tools=12,
+                always_include=[
+                    name
+                    for name in (
+                        "update_memory",
+                        "get_connected_accounts",
+                        "scrape_webpage",
+                    )
+                    if name in {t.name for t in tools}
+                ],
+            )
+        except Exception:
+            logging.warning("LLMToolSelectorMiddleware init failed; skipping.")
+            selector_mw = None
+
     deepagent_middleware = [
+        # BusyMutex is OUTERMOST: it must wrap the entire stream so no
+        # other turn can sneak in while this one is mid-flight.
+        busy_mutex_mw,
+        # OTel spans sit just inside BusyMutex so each retry attempt
+        # gets its own model.call / tool.call span.
+        otel_mw,
         TodoListMiddleware(),
-        KnowledgeBaseSearchMiddleware(
+        _memory_middleware,
+        AnonymousDocumentMiddleware(
+            anon_session_id=anon_session_id,
+        )
+        if filesystem_mode == FilesystemMode.CLOUD
+        else None,
+        KnowledgeTreeMiddleware(
+            search_space_id=search_space_id,
+            filesystem_mode=filesystem_mode,
+            llm=llm,
+        )
+        if filesystem_mode == FilesystemMode.CLOUD
+        else None,
+        KnowledgePriorityMiddleware(
             llm=llm,
             search_space_id=search_space_id,
+            filesystem_mode=filesystem_mode,
             available_connectors=available_connectors,
             available_document_types=available_document_types,
             mentioned_document_ids=mentioned_document_ids,
         ),
+        FileIntentMiddleware(llm=llm),
         SurfSenseFilesystemMiddleware(
+            backend=backend_resolver,
+            filesystem_mode=filesystem_mode,
             search_space_id=search_space_id,
             created_by_id=user_id,
+            thread_id=thread_id,
         ),
-        SubAgentMiddleware(backend=StateBackend, subagents=[general_purpose_spec]),
-        create_summarization_middleware(llm, StateBackend),
+        KnowledgeBasePersistenceMiddleware(
+            search_space_id=search_space_id,
+            created_by_id=user_id,
+            filesystem_mode=filesystem_mode,
+            thread_id=thread_id,
+        )
+        if filesystem_mode == FilesystemMode.CLOUD
+        else None,
+        # Skill loader. Placed before SubAgentMiddleware so subagents
+        # inherit the same skill metadata (subagent specs reference the
+        # same source paths via ``default_skills_sources()``).
+        skills_mw,
+        SubAgentMiddleware(backend=StateBackend, subagents=subagent_specs),
+        # Tool selection (only when >30 tools and flag on).
+        selector_mw,
+        # Defensive caps, then prune, then summarize.
+        model_call_limit_mw,
+        tool_call_limit_mw,
+        context_edit_mw,
+        summarization_mw,
+        # Provider compatibility + retry chain — placed after prune/compact
+        # so retries happen on the already-trimmed payload.
+        noop_mw,
+        retry_mw,
+        fallback_mw,
+        # Coalesce a multi-text-block system message into one block
+        # immediately before the model call. Sits innermost on the
+        # system-message-mutation chain so it observes every appender
+        # (todo / filesystem / skills / subagents …) and prevents
+        # OpenRouter→Anthropic from redistributing ``cache_control``
+        # across N blocks and tripping Anthropic's 4-breakpoint cap.
+        # See ``middleware/flatten_system.py`` for full rationale.
+        FlattenSystemMessageMiddleware(),
+        # Tool-call repair must run after model emits but before
+        # permission / dedup / doom-loop interpret the calls.
+        repair_mw,
+        # Permission deny/ask BEFORE the calls are forwarded to tool nodes.
+        permission_mw,
+        doom_loop_mw,
+        # Action log sits inside permission so denied calls don't appear
+        # as completions, and outside dedup so each unique tool invocation
+        # gets its own row.
+        action_log_mw,
         PatchToolCallsMiddleware(),
-        DedupHITLToolCallsMiddleware(),
-        AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore"),
+        DedupHITLToolCallsMiddleware(agent_tools=list(tools)),
+        # Plugin slot — sits at the tail so plugin-side transforms see the
+        # final tool result. Prompt caching is now applied at LLM build time
+        # via ``apply_litellm_prompt_caching`` (see prompt_caching.py), so no
+        # caching middleware is needed here. Multiple plugins run in declared
+        # order; loader filtered by the admin allowlist already.
+        *plugin_middlewares,
     ]
+    deepagent_middleware = [m for m in deepagent_middleware if m is not None]
 
-    # Combine system_prompt with BASE_AGENT_PROMPT (same as create_deep_agent)
-    final_system_prompt = system_prompt + "\n\n" + BASE_AGENT_PROMPT
-
-    _t0 = time.perf_counter()
-    agent = await asyncio.to_thread(
-        create_agent,
+    agent = create_agent(
         llm,
         system_prompt=final_system_prompt,
-        tools=tools,
+        tools=list(tools),
         middleware=deepagent_middleware,
         context_schema=SurfSenseContextSchema,
         checkpointer=checkpointer,
     )
-    agent = agent.with_config(
+    return agent.with_config(
         {
             "recursion_limit": 10_000,
             "metadata": {
@@ -487,13 +1162,3 @@ async def create_surfsense_deep_agent(
             },
         }
     )
-    _perf_log.info(
-        "[create_agent] Graph compiled (create_agent) in %.3fs",
-        time.perf_counter() - _t0,
-    )
-
-    _perf_log.info(
-        "[create_agent] Total agent creation in %.3fs",
-        time.perf_counter() - _t_agent_total,
-    )
-    return agent
diff --git a/surfsense_backend/app/agents/new_chat/context.py b/surfsense_backend/app/agents/new_chat/context.py
index da113adf49..a20a43a669 100644
--- a/surfsense_backend/app/agents/new_chat/context.py
+++ b/surfsense_backend/app/agents/new_chat/context.py
@@ -1,28 +1,69 @@
 """
 Context schema definitions for SurfSense agents.
 
-This module defines the custom state schema used by the SurfSense deep agent.
+This module defines the per-invocation context object passed to the SurfSense
+deep agent via ``agent.astream_events(..., context=ctx)`` (LangGraph >= 0.6).
+
+The agent's compiled graph is the same across invocations (and cached by
+``agent_cache``), so anything that varies per turn — the user mentions a
+specific document, the front-end issues a unique ``request_id``, etc. —
+MUST live on this context object instead of being captured into a
+middleware ``__init__`` closure. Middlewares read fields back via
+``runtime.context.<field>``; tools read them via ``runtime.context``.
+
+This object is read inside both ``KnowledgePriorityMiddleware`` (for
+``mentioned_document_ids``) and any future middleware that needs
+per-request state without invalidating the compiled-agent cache.
 """
 
+from __future__ import annotations
+
+from dataclasses import dataclass, field
 from typing import TypedDict
 
 
-class SurfSenseContextSchema(TypedDict):
+class FileOperationContractState(TypedDict):
+    intent: str
+    confidence: float
+    suggested_path: str
+    timestamp: str
+    turn_id: str
+
+
+@dataclass
+class SurfSenseContextSchema:
     """
-    Custom state schema for the SurfSense deep agent.
-
-    This extends the default agent state with custom fields.
-    The default state already includes:
-    - messages: Conversation history
-    - todos: Task list from TodoListMiddleware
-    - files: Virtual filesystem from FilesystemMiddleware
-
-    We're adding fields needed for knowledge base search:
-    - search_space_id: The user's search space ID
-    - db_session: Database session (injected at runtime)
-    - connector_service: Connector service instance (injected at runtime)
+    Per-invocation context for the SurfSense deep agent.
+
+    Defaults are chosen so the dataclass can be safely default-constructed
+    (LangGraph's ``Runtime.context`` itself defaults to ``None`` if no
+    context is supplied — see ``langgraph.runtime.Runtime``). All fields
+    are optional; consumers must None-check before reading.
+
+    Phase 1.5 fields:
+        search_space_id: Search space the request is scoped to.
+        mentioned_document_ids: KB documents the user @-mentioned this turn.
+            Read by ``KnowledgePriorityMiddleware`` to seed its priority
+            list. Stays out of the compiled-agent cache key — that's the
+            whole point of putting it here.
+        mentioned_folder_ids: KB folders the user @-mentioned this turn
+            (cloud filesystem mode). Surfaced as ``[USER-MENTIONED]``
+            entries in ``<priority_documents>`` so the agent prioritises
+            walking those folders with ``ls`` / ``find_documents``.
+        file_operation_contract: One-shot file operation contract emitted
+            by ``FileIntentMiddleware`` for the upcoming turn.
+        turn_id / request_id: Correlation IDs surfaced by the streaming
+            task; populated for telemetry.
+
+    Phase 2 will extend with: thread_id, user_id, visibility,
+    filesystem_mode, anon_session_id, available_connectors,
+    available_document_types, created_by_id (everything currently captured
+    by middleware ``__init__`` closures).
     """
 
-    search_space_id: int
-    # These are runtime-injected and won't be serialized
-    # db_session and connector_service are passed when invoking the agent
+    search_space_id: int | None = None
+    mentioned_document_ids: list[int] = field(default_factory=list)
+    mentioned_folder_ids: list[int] = field(default_factory=list)
+    file_operation_contract: FileOperationContractState | None = None
+    turn_id: str | None = None
+    request_id: str | None = None
diff --git a/surfsense_backend/app/agents/new_chat/document_xml.py b/surfsense_backend/app/agents/new_chat/document_xml.py
new file mode 100644
index 0000000000..60e586ae14
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/document_xml.py
@@ -0,0 +1,103 @@
+"""Shared XML builder for KB documents.
+
+Produces the citation-friendly XML used by every read of a knowledge-base
+document (lazy-loaded by :class:`KBPostgresBackend` and synthetic anonymous
+files). The XML carries a ``<chunk_index>`` near the top so the LLM can jump
+directly to matched-chunk line ranges via ``read_file(offset=…, limit=…)``.
+
+Extracted from the original ``knowledge_search.py`` so the backend, the
+priority middleware, and any future renderer share a single implementation.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def build_document_xml(
+    document: dict[str, Any],
+    matched_chunk_ids: set[int] | None = None,
+) -> str:
+    """Build citation-friendly XML with a ``<chunk_index>`` for smart seeking.
+
+    Args:
+        document: Dict shape produced by hybrid search / lazy-load helpers.
+            Expected keys: ``document`` (with ``id``, ``title``,
+            ``document_type``, ``metadata``) and ``chunks``
+            (list of ``{chunk_id, content}``).
+        matched_chunk_ids: Optional set of chunk IDs to flag as
+            ``matched="true"`` in the chunk index.
+    """
+    matched = matched_chunk_ids or set()
+
+    doc_meta = document.get("document") or {}
+    metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {}
+    document_id = doc_meta.get("id", document.get("document_id", "unknown"))
+    document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN"))
+    title = doc_meta.get("title") or metadata.get("title") or "Untitled Document"
+    url = (
+        metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
+    )
+    metadata_json = json.dumps(metadata, ensure_ascii=False)
+
+    metadata_lines: list[str] = [
+        "<document>",
+        "<document_metadata>",
+        f"  <document_id>{document_id}</document_id>",
+        f"  <document_type>{document_type}</document_type>",
+        f"  <title><![CDATA[{title}]]></title>",
+        f"  <url><![CDATA[{url}]]></url>",
+        f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
+        "</document_metadata>",
+        "",
+    ]
+
+    chunks = document.get("chunks") or []
+    chunk_entries: list[tuple[int | None, str]] = []
+    if isinstance(chunks, list):
+        for chunk in chunks:
+            if not isinstance(chunk, dict):
+                continue
+            chunk_id = chunk.get("chunk_id") or chunk.get("id")
+            chunk_content = str(chunk.get("content", "")).strip()
+            if not chunk_content:
+                continue
+            if chunk_id is None:
+                xml = f"  <chunk><![CDATA[{chunk_content}]]></chunk>"
+            else:
+                xml = f"  <chunk id='{chunk_id}'><![CDATA[{chunk_content}]]></chunk>"
+            chunk_entries.append((chunk_id, xml))
+
+    index_overhead = 1 + len(chunk_entries) + 1 + 1 + 1
+    first_chunk_line = len(metadata_lines) + index_overhead + 1
+
+    current_line = first_chunk_line
+    index_entry_lines: list[str] = []
+    for cid, xml_str in chunk_entries:
+        num_lines = xml_str.count("\n") + 1
+        end_line = current_line + num_lines - 1
+        matched_attr = ' matched="true"' if cid is not None and cid in matched else ""
+        if cid is not None:
+            index_entry_lines.append(
+                f'  <entry chunk_id="{cid}" lines="{current_line}-{end_line}"{matched_attr}/>'
+            )
+        else:
+            index_entry_lines.append(
+                f'  <entry lines="{current_line}-{end_line}"{matched_attr}/>'
+            )
+        current_line = end_line + 1
+
+    lines = metadata_lines.copy()
+    lines.append("<chunk_index>")
+    lines.extend(index_entry_lines)
+    lines.append("</chunk_index>")
+    lines.append("")
+    lines.append("<document_content>")
+    for _, xml_str in chunk_entries:
+        lines.append(xml_str)
+    lines.extend(["</document_content>", "</document>"])
+    return "\n".join(lines)
+
+
+__all__ = ["build_document_xml"]
diff --git a/surfsense_backend/app/agents/new_chat/errors.py b/surfsense_backend/app/agents/new_chat/errors.py
new file mode 100644
index 0000000000..a17333acc8
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/errors.py
@@ -0,0 +1,95 @@
+"""
+Typed error taxonomy for the SurfSense agent stack.
+
+Used by:
+- :class:`RetryAfterMiddleware` — its ``retry_on`` callable consults
+  the error code to decide whether a retry is appropriate.
+- :class:`PermissionMiddleware` — emits ``code="permission_denied"``
+  errors when a deny rule trips.
+- All tools — return :class:`StreamingError` payloads in
+  ``ToolMessage.additional_kwargs["error"]`` so the model and the
+  retry/permission layers share a contract.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+ErrorCode = Literal[
+    "rate_limit",
+    "auth",
+    "tool_validation",
+    "tool_runtime",
+    "context_overflow",
+    "provider",
+    "permission_denied",
+    "doom_loop",
+    "busy",
+    "cancelled",
+]
+
+
+class StreamingError(BaseModel):
+    """Structured error payload attached to ``ToolMessage.additional_kwargs["error"]``.
+
+    Tools and middleware emit this so retry, permission, and routing
+    layers can decide what to do without parsing free-form strings.
+    """
+
+    code: ErrorCode
+    retryable: bool = False
+    suggestion: str | None = None
+    correlation_id: str | None = None
+    detail: str | None = Field(
+        default=None,
+        description="Free-form additional context. Not surfaced to the model.",
+    )
+
+    class Config:
+        frozen = True
+
+
+class RejectedError(Exception):
+    """Raised when the user rejects a permission ask without feedback.
+
+    Caught by :class:`PermissionMiddleware`; the agent stops the current
+    tool fan-out and surfaces a user-facing rejection.
+    """
+
+    def __init__(self, *, tool: str | None = None, pattern: str | None = None) -> None:
+        super().__init__(f"Permission rejected for tool {tool!r}, pattern {pattern!r}")
+        self.tool = tool
+        self.pattern = pattern
+
+
+class CorrectedError(Exception):
+    """Raised when the user rejects a permission ask *with* feedback.
+
+    The :class:`PermissionMiddleware` translates the feedback into a
+    synthetic ``ToolMessage`` so the model sees the user's correction
+    and can retry the request differently.
+    """
+
+    def __init__(self, feedback: str, *, tool: str | None = None) -> None:
+        super().__init__(feedback)
+        self.feedback = feedback
+        self.tool = tool
+
+
+class BusyError(Exception):
+    """Raised when a second prompt arrives while the same thread is mid-stream."""
+
+    def __init__(self, request_id: str | None = None) -> None:
+        super().__init__("Thread is busy with another request")
+        self.request_id = request_id
+
+
+__all__ = [
+    "BusyError",
+    "CorrectedError",
+    "ErrorCode",
+    "RejectedError",
+    "StreamingError",
+]
diff --git a/surfsense_backend/app/agents/new_chat/feature_flags.py b/surfsense_backend/app/agents/new_chat/feature_flags.py
new file mode 100644
index 0000000000..3cea051ef5
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/feature_flags.py
@@ -0,0 +1,257 @@
+"""
+Feature flags for the SurfSense new_chat agent stack.
+
+These flags gate the newer agent middleware (some ported from OpenCode,
+some sourced from ``langchain.agents.middleware`` / ``deepagents``, some
+SurfSense-native). Most shipped agent-stack upgrades default ON so Docker
+image updates work even when older installs do not have newly introduced
+environment variables. Risky/experimental integrations stay default OFF,
+and the master kill-switch can still disable everything new.
+
+All new middleware checks its flag at agent build time. If the master
+kill-switch ``SURFSENSE_DISABLE_NEW_AGENT_STACK`` is set, every new
+middleware is disabled regardless of its individual flag. This gives
+operators a single switch to revert to pre-port behavior.
+
+Examples
+--------
+
+Defaults:
+
+    SURFSENSE_ENABLE_CONTEXT_EDITING=true
+    SURFSENSE_ENABLE_COMPACTION_V2=true
+    SURFSENSE_ENABLE_RETRY_AFTER=true
+    SURFSENSE_ENABLE_MODEL_FALLBACK=false
+    SURFSENSE_ENABLE_MODEL_CALL_LIMIT=true
+    SURFSENSE_ENABLE_TOOL_CALL_LIMIT=true
+    SURFSENSE_ENABLE_TOOL_CALL_REPAIR=true
+    SURFSENSE_ENABLE_PERMISSION=true
+    SURFSENSE_ENABLE_DOOM_LOOP=true
+    SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false  # adds a per-turn LLM call
+
+Master kill-switch (overrides everything else):
+
+    SURFSENSE_DISABLE_NEW_AGENT_STACK=true
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+def _env_bool(name: str, default: bool) -> bool:
+    """Parse a boolean env var. Accepts ``1``/``true``/``yes``/``on`` (case-insensitive)."""
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() in ("1", "true", "yes", "on")
+
+
+@dataclass(frozen=True)
+class AgentFeatureFlags:
+    """Resolved feature-flag state for one agent build.
+
+    Constructed via :meth:`from_env`. The dataclass is frozen so it can be
+    safely shared across coroutines.
+    """
+
+    # Master kill-switch — when true, every flag below resolves to False
+    # regardless of its env value. Used for rapid rollback.
+    disable_new_agent_stack: bool = False
+
+    # Agent quality — context budget, retry/limits, name-repair, doom-loop
+    enable_context_editing: bool = True
+    enable_compaction_v2: bool = True
+    enable_retry_after: bool = True
+    enable_model_fallback: bool = False
+    enable_model_call_limit: bool = True
+    enable_tool_call_limit: bool = True
+    enable_tool_call_repair: bool = True
+    enable_doom_loop: bool = True
+
+    # Safety — permissions, concurrency, tool-set narrowing
+    enable_permission: bool = True
+    enable_busy_mutex: bool = True
+    enable_llm_tool_selector: bool = False  # Default OFF — adds per-turn LLM cost
+
+    # Skills + subagents
+    enable_skills: bool = True
+    enable_specialized_subagents: bool = True
+    enable_kb_planner_runnable: bool = True
+
+    # Snapshot / revert
+    enable_action_log: bool = True
+    enable_revert_route: bool = True
+
+    # Plugins
+    enable_plugin_loader: bool = False
+
+    # Observability — OTel (orthogonal; also requires OTEL_EXPORTER_OTLP_ENDPOINT)
+    enable_otel: bool = False
+
+    # Performance — compiled-agent cache (Phase 1 + Phase 2).
+    # When ON, ``create_surfsense_deep_agent`` reuses a previously-compiled
+    # graph if the cache key matches (LLM config + thread + tool surface +
+    # flags + system prompt + filesystem mode). Cuts per-turn agent-build
+    # wall clock from ~4-5s to <50µs on cache hits.
+    #
+    # SAFETY (Phase 2 unblocked this default-on):
+    # All connector mutation tools (``tools/notion``, ``tools/gmail``,
+    # ``tools/google_drive``, ``tools/dropbox``, ``tools/onedrive``,
+    # ``tools/google_calendar``, ``tools/confluence``, ``tools/discord``,
+    # ``tools/teams``, ``tools/luma``, ``connected_accounts``,
+    # ``update_memory``, ``search_surfsense_docs``) now acquire fresh
+    # short-lived ``AsyncSession`` instances per call via
+    # :data:`async_session_maker`. The factory still accepts ``db_session``
+    # for registry compatibility but ``del``'s it immediately — see any
+    # of those files' factory docstrings for the rationale. The ``llm``
+    # closure is per-(provider, model, config_id) which is already in
+    # the cache key, so the LLM is safe to share across cached hits of
+    # the same key. The KB priority middleware reads
+    # ``mentioned_document_ids`` from ``runtime.context`` (Phase 1.5),
+    # not its constructor closure, so the same compiled agent serves
+    # turns with different mention lists correctly.
+    #
+    # Rollback: set ``SURFSENSE_ENABLE_AGENT_CACHE=false`` in the
+    # environment if a regression surfaces. The path is exercised by
+    # the ``tests/unit/agents/new_chat/test_agent_cache_*`` suite.
+    enable_agent_cache: bool = True
+    # Phase 1 (deferred — measure first): pre-build & share the
+    # general-purpose subagent ``CompiledSubAgent`` across cold-cache
+    # misses. Only helps when the outer cache MISSES (cache hits already
+    # reuse the entire SubAgentMiddleware-compiled graph). Off by default
+    # until we have data showing cold misses are frequent enough to
+    # justify the extra global state.
+    enable_agent_cache_share_gp_subagent: bool = False
+
+    @classmethod
+    def from_env(cls) -> AgentFeatureFlags:
+        """Read flags from environment.
+
+        Master kill-switch is evaluated first; when set, all other flags
+        force to False.
+        """
+        master_off = _env_bool("SURFSENSE_DISABLE_NEW_AGENT_STACK", False)
+        if master_off:
+            logger.info(
+                "SURFSENSE_DISABLE_NEW_AGENT_STACK is set: every new agent "
+                "middleware is forced OFF for this build."
+            )
+            return cls(
+                disable_new_agent_stack=True,
+                enable_context_editing=False,
+                enable_compaction_v2=False,
+                enable_retry_after=False,
+                enable_model_fallback=False,
+                enable_model_call_limit=False,
+                enable_tool_call_limit=False,
+                enable_tool_call_repair=False,
+                enable_doom_loop=False,
+                enable_permission=False,
+                enable_busy_mutex=False,
+                enable_llm_tool_selector=False,
+                enable_skills=False,
+                enable_specialized_subagents=False,
+                enable_kb_planner_runnable=False,
+                enable_action_log=False,
+                enable_revert_route=False,
+                enable_plugin_loader=False,
+                enable_otel=False,
+                enable_agent_cache=False,
+                enable_agent_cache_share_gp_subagent=False,
+            )
+
+        return cls(
+            disable_new_agent_stack=False,
+            # Agent quality
+            enable_context_editing=_env_bool("SURFSENSE_ENABLE_CONTEXT_EDITING", True),
+            enable_compaction_v2=_env_bool("SURFSENSE_ENABLE_COMPACTION_V2", True),
+            enable_retry_after=_env_bool("SURFSENSE_ENABLE_RETRY_AFTER", True),
+            enable_model_fallback=_env_bool("SURFSENSE_ENABLE_MODEL_FALLBACK", False),
+            enable_model_call_limit=_env_bool(
+                "SURFSENSE_ENABLE_MODEL_CALL_LIMIT", True
+            ),
+            enable_tool_call_limit=_env_bool("SURFSENSE_ENABLE_TOOL_CALL_LIMIT", True),
+            enable_tool_call_repair=_env_bool(
+                "SURFSENSE_ENABLE_TOOL_CALL_REPAIR", True
+            ),
+            enable_doom_loop=_env_bool("SURFSENSE_ENABLE_DOOM_LOOP", True),
+            # Safety
+            enable_permission=_env_bool("SURFSENSE_ENABLE_PERMISSION", True),
+            enable_busy_mutex=_env_bool("SURFSENSE_ENABLE_BUSY_MUTEX", True),
+            enable_llm_tool_selector=_env_bool(
+                "SURFSENSE_ENABLE_LLM_TOOL_SELECTOR", False
+            ),
+            # Skills + subagents
+            enable_skills=_env_bool("SURFSENSE_ENABLE_SKILLS", True),
+            enable_specialized_subagents=_env_bool(
+                "SURFSENSE_ENABLE_SPECIALIZED_SUBAGENTS", True
+            ),
+            enable_kb_planner_runnable=_env_bool(
+                "SURFSENSE_ENABLE_KB_PLANNER_RUNNABLE", True
+            ),
+            # Snapshot / revert
+            enable_action_log=_env_bool("SURFSENSE_ENABLE_ACTION_LOG", True),
+            enable_revert_route=_env_bool("SURFSENSE_ENABLE_REVERT_ROUTE", True),
+            # Plugins
+            enable_plugin_loader=_env_bool("SURFSENSE_ENABLE_PLUGIN_LOADER", False),
+            # Observability
+            enable_otel=_env_bool("SURFSENSE_ENABLE_OTEL", False),
+            # Performance
+            enable_agent_cache=_env_bool("SURFSENSE_ENABLE_AGENT_CACHE", True),
+            enable_agent_cache_share_gp_subagent=_env_bool(
+                "SURFSENSE_ENABLE_AGENT_CACHE_SHARE_GP_SUBAGENT", False
+            ),
+        )
+
+    def any_new_middleware_enabled(self) -> bool:
+        """Return True if any new middleware flag is on."""
+        if self.disable_new_agent_stack:
+            return False
+        return any(
+            (
+                self.enable_context_editing,
+                self.enable_compaction_v2,
+                self.enable_retry_after,
+                self.enable_model_fallback,
+                self.enable_model_call_limit,
+                self.enable_tool_call_limit,
+                self.enable_tool_call_repair,
+                self.enable_doom_loop,
+                self.enable_permission,
+                self.enable_busy_mutex,
+                self.enable_llm_tool_selector,
+                self.enable_skills,
+                self.enable_specialized_subagents,
+                self.enable_kb_planner_runnable,
+                self.enable_action_log,
+                self.enable_revert_route,
+                self.enable_plugin_loader,
+            )
+        )
+
+
+def get_flags() -> AgentFeatureFlags:
+    """Return the resolved feature-flag state from the **current** process environment.
+
+    Intentionally **not** cached: ``load_dotenv`` and operator edits to env vars
+    must affect the next agent build without requiring a full process restart.
+    Cost is negligible (reads ``os.environ`` once per call).
+    """
+    return AgentFeatureFlags.from_env()
+
+
+def reload_for_tests() -> AgentFeatureFlags:
+    """Compatibility helper for tests; equivalent to :func:`get_flags`."""
+    return AgentFeatureFlags.from_env()
+
+
+__all__ = [
+    "AgentFeatureFlags",
+    "get_flags",
+    "reload_for_tests",
+]
diff --git a/surfsense_backend/app/agents/new_chat/filesystem_backends.py b/surfsense_backend/app/agents/new_chat/filesystem_backends.py
new file mode 100644
index 0000000000..c8288be719
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/filesystem_backends.py
@@ -0,0 +1,63 @@
+"""Filesystem backend resolver for cloud and desktop-local modes."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from functools import lru_cache
+
+from deepagents.backends.protocol import BackendProtocol
+from deepagents.backends.state import StateBackend
+from langgraph.prebuilt.tool_node import ToolRuntime
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode, FilesystemSelection
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+from app.agents.new_chat.middleware.multi_root_local_folder_backend import (
+    MultiRootLocalFolderBackend,
+)
+
+
+@lru_cache(maxsize=64)
+def _cached_multi_root_backend(
+    mounts: tuple[tuple[str, str], ...],
+) -> MultiRootLocalFolderBackend:
+    return MultiRootLocalFolderBackend(mounts)
+
+
+def build_backend_resolver(
+    selection: FilesystemSelection,
+    *,
+    search_space_id: int | None = None,
+) -> Callable[[ToolRuntime], BackendProtocol]:
+    """Create deepagents backend resolver for the selected filesystem mode.
+
+    In cloud mode the resolver returns a fresh :class:`KBPostgresBackend`
+    bound to the current ``runtime`` so the backend can read staging state
+    (``staged_dirs``, ``pending_moves``, ``files`` cache, ``kb_anon_doc``,
+    ``kb_matched_chunk_ids``) for each tool call. When no ``search_space_id``
+    is provided, the resolver falls back to :class:`StateBackend` (used by
+    sub-agents and tests that don't need DB-backed reads).
+
+    Desktop-local mode unchanged.
+    """
+
+    if selection.mode == FilesystemMode.DESKTOP_LOCAL_FOLDER and selection.local_mounts:
+
+        def _resolve_local(_runtime: ToolRuntime) -> MultiRootLocalFolderBackend:
+            mounts = tuple(
+                (entry.mount_id, entry.root_path) for entry in selection.local_mounts
+            )
+            return _cached_multi_root_backend(mounts)
+
+        return _resolve_local
+
+    if search_space_id is not None:
+
+        def _resolve_kb(runtime: ToolRuntime) -> BackendProtocol:
+            return KBPostgresBackend(search_space_id, runtime)
+
+        return _resolve_kb
+
+    def _resolve_state(runtime: ToolRuntime) -> StateBackend:
+        return StateBackend(runtime)
+
+    return _resolve_state
diff --git a/surfsense_backend/app/agents/new_chat/filesystem_selection.py b/surfsense_backend/app/agents/new_chat/filesystem_selection.py
new file mode 100644
index 0000000000..bf0497d260
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/filesystem_selection.py
@@ -0,0 +1,41 @@
+"""Filesystem mode contracts and selection helpers for chat sessions."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import StrEnum
+
+
+class FilesystemMode(StrEnum):
+    """Supported filesystem backends for agent tool execution."""
+
+    CLOUD = "cloud"
+    DESKTOP_LOCAL_FOLDER = "desktop_local_folder"
+
+
+class ClientPlatform(StrEnum):
+    """Client runtime reported by the caller."""
+
+    WEB = "web"
+    DESKTOP = "desktop"
+
+
+@dataclass(slots=True)
+class LocalFilesystemMount:
+    """Canonical mount mapping provided by desktop runtime."""
+
+    mount_id: str
+    root_path: str
+
+
+@dataclass(slots=True)
+class FilesystemSelection:
+    """Resolved filesystem selection for a single chat request."""
+
+    mode: FilesystemMode = FilesystemMode.CLOUD
+    client_platform: ClientPlatform = ClientPlatform.WEB
+    local_mounts: tuple[LocalFilesystemMount, ...] = ()
+
+    @property
+    def is_local_mode(self) -> bool:
+        return self.mode == FilesystemMode.DESKTOP_LOCAL_FOLDER
diff --git a/surfsense_backend/app/agents/new_chat/filesystem_state.py b/surfsense_backend/app/agents/new_chat/filesystem_state.py
new file mode 100644
index 0000000000..cc674be768
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/filesystem_state.py
@@ -0,0 +1,182 @@
+"""LangGraph state schema additions used by the SurfSense filesystem agent.
+
+This schema extends deepagents' upstream :class:`FilesystemState` with the
+extra fields needed to implement Postgres-backed virtual filesystem semantics:
+
+* ``cwd`` — current working directory (per-thread checkpointed).
+* ``staged_dirs`` — pending mkdir requests (cloud only).
+* ``staged_dir_tool_calls`` — sidecar map ``path -> tool_call_id`` for staged dirs.
+* ``pending_moves`` — pending move_file requests (cloud only).
+* ``pending_deletes`` — pending ``rm`` requests (cloud only).
+* ``pending_dir_deletes`` — pending ``rmdir`` requests (cloud only).
+* ``doc_id_by_path`` — virtual_path -> Document.id, populated by lazy reads.
+* ``dirty_paths`` — paths whose state file content differs from DB.
+* ``dirty_path_tool_calls`` — sidecar map ``path -> latest tool_call_id`` for
+  dirty paths; used to bind the per-path snapshot to an action_id.
+* ``kb_priority`` — top-K priority hints rendered into a system message.
+* ``kb_matched_chunk_ids`` — internal hand-off for matched-chunk highlighting.
+* ``kb_anon_doc`` — Redis-loaded anonymous document (if any).
+* ``tree_version`` — bumped by persistence; invalidates the tree render cache.
+* ``workspace_tree_text`` — pre-rendered ``<workspace_tree>`` body for the turn.
+
+Tools mutate these fields ONLY via ``Command(update=...)`` returns; the
+reducers in :mod:`app.agents.new_chat.state_reducers` handle merging.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Any, NotRequired
+
+from deepagents.middleware.filesystem import FilesystemState
+from typing_extensions import TypedDict
+
+from app.agents.new_chat.state_reducers import (
+    _add_unique_reducer,
+    _dict_merge_with_tombstones_reducer,
+    _list_append_reducer,
+    _replace_reducer,
+)
+
+
+class PendingMove(TypedDict, total=False):
+    """A staged move_file operation pending end-of-turn commit.
+
+    ``tool_call_id`` is optional for backward compatibility with checkpoints
+    written before the snapshot/revert pipeline was wired up; new entries
+    always include it so the persistence body can resolve an action_id.
+    """
+
+    source: str
+    dest: str
+    overwrite: bool
+    tool_call_id: str
+
+
+class PendingDelete(TypedDict, total=False):
+    """A staged ``rm`` or ``rmdir`` operation pending end-of-turn commit.
+
+    ``tool_call_id`` is required for new entries (it's the binding key used
+    by :class:`KnowledgeBasePersistenceMiddleware` to find the matching
+    :class:`AgentActionLog` row and bind the snapshot to it). Marked
+    ``total=False`` only to tolerate older checkpoint payloads.
+    """
+
+    path: str
+    tool_call_id: str
+
+
+class KbPriorityEntry(TypedDict, total=False):
+    path: str
+    score: float
+    document_id: int | None
+    title: str
+    mentioned: bool
+
+
+class KbAnonDoc(TypedDict, total=False):
+    """In-memory anonymous-session document loaded from Redis."""
+
+    path: str
+    title: str
+    content: str
+    chunks: list[dict[str, Any]]
+
+
+class SurfSenseFilesystemState(FilesystemState):
+    """Filesystem state used by the SurfSense agent (cloud + desktop).
+
+    Extends deepagents' :class:`FilesystemState` (which provides ``files``)
+    with cloud-mode staging fields and search-priority hints. All extra fields
+    are reducer-backed so that ``Command(update=...)`` payloads merge cleanly
+    across agent steps and across checkpoints.
+    """
+
+    cwd: NotRequired[Annotated[str, _replace_reducer]]
+    """Current working directory.
+
+    Defaults to ``"/documents"`` in cloud mode and ``"/"`` (or first mount) in
+    desktop mode. Initialized once per thread by ``KnowledgeTreeMiddleware``.
+    """
+
+    staged_dirs: NotRequired[Annotated[list[str], _add_unique_reducer]]
+    """mkdir paths staged for end-of-turn folder creation (cloud only)."""
+
+    staged_dir_tool_calls: NotRequired[
+        Annotated[dict[str, str], _dict_merge_with_tombstones_reducer]
+    ]
+    """``path -> tool_call_id`` sidecar for ``staged_dirs``.
+
+    Used by :class:`KnowledgeBasePersistenceMiddleware` to bind the
+    :class:`FolderRevision` snapshot to the originating ``mkdir`` action.
+    Kept separate from ``staged_dirs`` (which stays a unique-string list)
+    to avoid breaking ``_add_unique_reducer`` semantics.
+    """
+
+    pending_moves: NotRequired[Annotated[list[PendingMove], _list_append_reducer]]
+    """move_file ops staged for end-of-turn commit (cloud only)."""
+
+    pending_deletes: NotRequired[Annotated[list[PendingDelete], _list_append_reducer]]
+    """``rm`` ops staged for end-of-turn ``DELETE FROM documents`` (cloud only).
+
+    Each entry is a dict ``{"path": ..., "tool_call_id": ...}``. Per-path
+    uniqueness is enforced inside the commit body, not the reducer (we keep
+    ``tool_call_id`` per occurrence so snapshot binding works).
+    """
+
+    pending_dir_deletes: NotRequired[
+        Annotated[list[PendingDelete], _list_append_reducer]
+    ]
+    """``rmdir`` ops staged for end-of-turn ``DELETE FROM folders`` (cloud only).
+
+    Same shape as :data:`pending_deletes`. Commit body re-verifies the
+    folder is empty (in-DB AND with this turn's pending changes accounted
+    for) before issuing the DELETE.
+    """
+
+    doc_id_by_path: NotRequired[
+        Annotated[dict[str, int], _dict_merge_with_tombstones_reducer]
+    ]
+    """virtual_path -> ``Document.id`` for lazily loaded files.
+
+    Populated on first read of a KB document. Used by edit_file/move_file/
+    aafter_agent to map paths back to a real DB row. ``None`` values delete
+    the key (tombstones).
+    """
+
+    dirty_paths: NotRequired[Annotated[list[str], _add_unique_reducer]]
+    """Paths whose ``state["files"]`` content has been modified this turn."""
+
+    dirty_path_tool_calls: NotRequired[
+        Annotated[dict[str, str], _dict_merge_with_tombstones_reducer]
+    ]
+    """``path -> latest tool_call_id`` sidecar for ``dirty_paths``.
+
+    The persistence body coalesces multiple writes/edits to the same path
+    into one snapshot per turn. This map captures the most-recent
+    ``tool_call_id`` so the resulting :class:`DocumentRevision` is bound
+    to the latest action_id (the one the user is most likely to revert).
+    """
+
+    kb_priority: NotRequired[Annotated[list[KbPriorityEntry], _replace_reducer]]
+    """Top-K priority hints rendered as a system message before the user turn."""
+
+    kb_matched_chunk_ids: NotRequired[Annotated[dict[int, list[int]], _replace_reducer]]
+    """Internal: ``Document.id`` -> list of matched chunk IDs from hybrid search."""
+
+    kb_anon_doc: NotRequired[Annotated[KbAnonDoc | None, _replace_reducer]]
+    """Anonymous-session document loaded from Redis (read-only, no DB row)."""
+
+    tree_version: NotRequired[Annotated[int, _replace_reducer]]
+    """Monotonically increasing counter; bumped when commits change the KB tree."""
+
+    workspace_tree_text: NotRequired[Annotated[str, _replace_reducer]]
+    """Pre-rendered ``<workspace_tree>`` body; shared with subagents to skip re-render."""
+
+
+__all__ = [
+    "KbAnonDoc",
+    "KbPriorityEntry",
+    "PendingDelete",
+    "PendingMove",
+    "SurfSenseFilesystemState",
+]
diff --git a/surfsense_backend/app/agents/new_chat/llm_config.py b/surfsense_backend/app/agents/new_chat/llm_config.py
index 60cd2a452a..bc37bf1c43 100644
--- a/surfsense_backend/app/agents/new_chat/llm_config.py
+++ b/surfsense_backend/app/agents/new_chat/llm_config.py
@@ -10,58 +10,98 @@
 managing prompt configurations.
 """
 
+from collections.abc import AsyncIterator
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Any
 
 import yaml
+from langchain_core.callbacks import (
+    AsyncCallbackManagerForLLMRun,
+    CallbackManagerForLLMRun,
+)
+from langchain_core.messages import AIMessage, BaseMessage
+from langchain_core.outputs import ChatGenerationChunk, ChatResult
 from langchain_litellm import ChatLiteLLM
 from litellm import get_model_info
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.prompt_caching import apply_litellm_prompt_caching
 from app.services.llm_router_service import (
     AUTO_MODE_ID,
     ChatLiteLLMRouter,
     LLMRouterService,
+    _sanitize_content,
     get_auto_mode_llm,
     is_auto_mode,
 )
 
-# Provider mapping for LiteLLM model string construction
-PROVIDER_MAP = {
-    "OPENAI": "openai",
-    "ANTHROPIC": "anthropic",
-    "GROQ": "groq",
-    "COHERE": "cohere",
-    "GOOGLE": "gemini",
-    "OLLAMA": "ollama_chat",
-    "MISTRAL": "mistral",
-    "AZURE_OPENAI": "azure",
-    "OPENROUTER": "openrouter",
-    "XAI": "xai",
-    "BEDROCK": "bedrock",
-    "VERTEX_AI": "vertex_ai",
-    "TOGETHER_AI": "together_ai",
-    "FIREWORKS_AI": "fireworks_ai",
-    "DEEPSEEK": "openai",
-    "ALIBABA_QWEN": "openai",
-    "MOONSHOT": "openai",
-    "ZHIPU": "openai",
-    "GITHUB_MODELS": "github",
-    "REPLICATE": "replicate",
-    "PERPLEXITY": "perplexity",
-    "ANYSCALE": "anyscale",
-    "DEEPINFRA": "deepinfra",
-    "CEREBRAS": "cerebras",
-    "SAMBANOVA": "sambanova",
-    "AI21": "ai21",
-    "CLOUDFLARE": "cloudflare",
-    "DATABRICKS": "databricks",
-    "COMETAPI": "cometapi",
-    "HUGGINGFACE": "huggingface",
-    "MINIMAX": "openai",
-    "CUSTOM": "custom",
-}
+
+def _sanitize_messages(messages: list[BaseMessage]) -> list[BaseMessage]:
+    """Sanitize content on every message so it is safe for any provider.
+
+    Handles three cross-provider incompatibilities:
+    - List content with provider-specific blocks (e.g. ``thinking``)
+    - List content with bare strings or empty text blocks
+    - AI messages with empty content + tool calls: some providers (Bedrock)
+      convert ``""`` to ``[{"type":"text","text":""}]`` server-side then
+      reject the blank text.  The OpenAI spec says ``content`` should be
+      ``null`` when an assistant message only carries tool calls.
+    """
+    for msg in messages:
+        if isinstance(msg.content, list):
+            msg.content = _sanitize_content(msg.content)
+        if (
+            isinstance(msg, AIMessage)
+            and (not msg.content or msg.content == "")
+            and getattr(msg, "tool_calls", None)
+        ):
+            msg.content = None  # type: ignore[assignment]
+    return messages
+
+
+class SanitizedChatLiteLLM(ChatLiteLLM):
+    """ChatLiteLLM subclass that strips provider-specific content blocks
+    (e.g. ``thinking`` from reasoning models) and normalises bare strings
+    in content arrays before forwarding to the underlying provider."""
+
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: CallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        return super()._generate(
+            _sanitize_messages(messages), stop, run_manager, **kwargs
+        )
+
+    async def _astream(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: AsyncCallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> AsyncIterator[ChatGenerationChunk]:
+        async for chunk in super()._astream(
+            _sanitize_messages(messages), stop, run_manager, **kwargs
+        ):
+            yield chunk
+
+
+# Provider mapping for LiteLLM model string construction.
+#
+# Single source of truth lives in
+# :mod:`app.services.provider_capabilities` so the YAML loader (which
+# runs during ``app.config`` class-body init) can resolve provider
+# prefixes without dragging the agent / tools tree into module load
+# order. Re-exported here under the historical ``PROVIDER_MAP`` name
+# so existing callers (``llm_router_service``, ``image_gen_router_service``,
+# tests) keep working unchanged.
+from app.services.provider_capabilities import (  # noqa: E402
+    _PROVIDER_PREFIX_MAP as PROVIDER_MAP,
+)
 
 
 def _attach_model_profile(llm: ChatLiteLLM, model_string: str) -> None:
@@ -109,6 +149,23 @@ class AgentConfig:
     # Auto mode flag
     is_auto_mode: bool = False
 
+    # Token quota and policy
+    billing_tier: str = "free"
+    is_premium: bool = False
+    anonymous_enabled: bool = False
+    quota_reserve_tokens: int | None = None
+
+    # Capability flag: best-effort True for the chat selector / catalog.
+    # Resolved via :func:`provider_capabilities.derive_supports_image_input`
+    # which prefers OpenRouter's ``architecture.input_modalities`` and
+    # otherwise consults LiteLLM's authoritative model map. Default True
+    # is the conservative-allow stance — the streaming-task safety net
+    # (``is_known_text_only_chat_model``) is the *only* place a False
+    # actually blocks a request. Setting this to False here without an
+    # authoritative source would silently hide vision-capable models
+    # (the regression we're fixing).
+    supports_image_input: bool = True
+
     @classmethod
     def from_auto_mode(cls) -> "AgentConfig":
         """
@@ -130,6 +187,16 @@ def from_auto_mode(cls) -> "AgentConfig":
             config_id=AUTO_MODE_ID,
             config_name="Auto (Fastest)",
             is_auto_mode=True,
+            billing_tier="free",
+            is_premium=False,
+            anonymous_enabled=False,
+            quota_reserve_tokens=None,
+            # Auto routes across the configured pool, which usually
+            # contains at least one vision-capable deployment; the router
+            # will surface a 404 from a non-vision deployment as a normal
+            # ``allowed_fails`` event and fail over rather than blocking
+            # the request outright.
+            supports_image_input=True,
         )
 
     @classmethod
@@ -143,10 +210,24 @@ def from_new_llm_config(cls, config) -> "AgentConfig":
         Returns:
             AgentConfig instance
         """
-        return cls(
-            provider=config.provider.value
+        # Lazy import to avoid pulling provider_capabilities (and its
+        # transitive litellm import) into module-init order.
+        from app.services.provider_capabilities import derive_supports_image_input
+
+        provider_value = (
+            config.provider.value
             if hasattr(config.provider, "value")
-            else str(config.provider),
+            else str(config.provider)
+        )
+        litellm_params = config.litellm_params or {}
+        base_model = (
+            litellm_params.get("base_model")
+            if isinstance(litellm_params, dict)
+            else None
+        )
+
+        return cls(
+            provider=provider_value,
             model_name=config.model_name,
             api_key=config.api_key,
             api_base=config.api_base,
@@ -158,6 +239,20 @@ def from_new_llm_config(cls, config) -> "AgentConfig":
             config_id=config.id,
             config_name=config.name,
             is_auto_mode=False,
+            billing_tier="free",
+            is_premium=False,
+            anonymous_enabled=False,
+            quota_reserve_tokens=None,
+            # BYOK rows have no operator-curated capability flag, so we
+            # ask LiteLLM (default-allow on unknown). The streaming
+            # safety net still blocks if the model is *explicitly*
+            # marked text-only.
+            supports_image_input=derive_supports_image_input(
+                provider=provider_value,
+                model_name=config.model_name,
+                base_model=base_model,
+                custom_provider=config.custom_provider,
+            ),
         )
 
     @classmethod
@@ -176,15 +271,46 @@ def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig":
         Returns:
             AgentConfig instance
         """
+        # Lazy import to avoid pulling provider_capabilities (and its
+        # transitive litellm import) into module-init order.
+        from app.services.provider_capabilities import derive_supports_image_input
+
         # Get system instructions from YAML, default to empty string
         system_instructions = yaml_config.get("system_instructions", "")
 
+        provider = yaml_config.get("provider", "").upper()
+        model_name = yaml_config.get("model_name", "")
+        custom_provider = yaml_config.get("custom_provider")
+        litellm_params = yaml_config.get("litellm_params") or {}
+        base_model = (
+            litellm_params.get("base_model")
+            if isinstance(litellm_params, dict)
+            else None
+        )
+
+        # Explicit YAML override wins; otherwise derive from LiteLLM /
+        # OpenRouter modalities. The YAML loader already populates this
+        # field, but this method is also called from
+        # ``load_global_llm_config_by_id``'s file fallback (hot reload),
+        # so we re-derive here for safety. The bool() coercion preserves
+        # the loader's behaviour for explicit ``true`` / ``false``
+        # strings that PyYAML may surface.
+        if "supports_image_input" in yaml_config:
+            supports_image_input = bool(yaml_config.get("supports_image_input"))
+        else:
+            supports_image_input = derive_supports_image_input(
+                provider=provider,
+                model_name=model_name,
+                base_model=base_model,
+                custom_provider=custom_provider,
+            )
+
         return cls(
-            provider=yaml_config.get("provider", "").upper(),
-            model_name=yaml_config.get("model_name", ""),
+            provider=provider,
+            model_name=model_name,
             api_key=yaml_config.get("api_key", ""),
             api_base=yaml_config.get("api_base"),
-            custom_provider=yaml_config.get("custom_provider"),
+            custom_provider=custom_provider,
             litellm_params=yaml_config.get("litellm_params"),
             # Prompt configuration from YAML (with defaults for backwards compatibility)
             system_instructions=system_instructions if system_instructions else None,
@@ -195,6 +321,11 @@ def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig":
             config_id=yaml_config.get("id"),
             config_name=yaml_config.get("name"),
             is_auto_mode=False,
+            billing_tier=yaml_config.get("billing_tier", "free"),
+            is_premium=yaml_config.get("billing_tier", "free") == "premium",
+            anonymous_enabled=yaml_config.get("anonymous_enabled", False),
+            quota_reserve_tokens=yaml_config.get("quota_reserve_tokens"),
+            supports_image_input=supports_image_input,
         )
 
 
@@ -234,6 +365,28 @@ def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None:
         return None
 
 
+def load_global_llm_config_by_id(llm_config_id: int) -> dict | None:
+    """
+    Load a global LLM config by ID, checking in-memory configs first.
+
+    This handles both static YAML configs and dynamically injected configs
+    (e.g. OpenRouter integration models that only exist in memory).
+
+    Args:
+        llm_config_id: The negative ID of the global config to load
+
+    Returns:
+        LLM config dict or None if not found
+    """
+    from app.config import config as app_config
+
+    for cfg in app_config.GLOBAL_LLM_CONFIGS:
+        if cfg.get("id") == llm_config_id:
+            return cfg
+    # Fallback to YAML file read (covers edge cases like hot-reload)
+    return load_llm_config_from_yaml(llm_config_id)
+
+
 async def load_new_llm_config_from_db(
     session: AsyncSession,
     config_id: int,
@@ -341,7 +494,13 @@ async def load_agent_config(
         return AgentConfig.from_auto_mode()
 
     if config_id < 0:
-        # Load from YAML (global configs have negative IDs)
+        # Check in-memory configs first (includes static YAML + dynamic OpenRouter)
+        from app.config import config as app_config
+
+        for cfg in app_config.GLOBAL_LLM_CONFIGS:
+            if cfg.get("id") == config_id:
+                return AgentConfig.from_yaml_config(cfg)
+        # Fallback to YAML file read for safety
         yaml_config = load_llm_config_from_yaml(config_id)
         if yaml_config:
             return AgentConfig.from_yaml_config(yaml_config)
@@ -384,8 +543,13 @@ def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
     if llm_config.get("litellm_params"):
         litellm_kwargs.update(llm_config["litellm_params"])
 
-    llm = ChatLiteLLM(**litellm_kwargs)
+    llm = SanitizedChatLiteLLM(**litellm_kwargs)
     _attach_model_profile(llm, model_string)
+    # Configure LiteLLM-native prompt caching (cache_control_injection_points
+    # for Anthropic/Bedrock/Vertex/Gemini/Azure-AI/OpenRouter/Databricks/etc.).
+    # ``agent_config=None`` here — the YAML path doesn't have provider intent
+    # in a structured form, so we set only the universal injection points.
+    apply_litellm_prompt_caching(llm)
     return llm
 
 
@@ -410,7 +574,16 @@ def create_chat_litellm_from_agent_config(
             print("Error: Auto mode requested but LLM Router not initialized")
             return None
         try:
-            return get_auto_mode_llm()
+            router_llm = get_auto_mode_llm()
+            if router_llm is not None:
+                # Universal cache_control_injection_points only — auto-mode
+                # fans out across providers, so OpenAI-only kwargs (e.g.
+                # ``prompt_cache_key``) are left off here. ``drop_params``
+                # would strip them at the provider boundary anyway, but
+                # there's no point setting them when we don't know the
+                # destination.
+                apply_litellm_prompt_caching(router_llm, agent_config=agent_config)
+            return router_llm
         except Exception as e:
             print(f"Error creating ChatLiteLLMRouter: {e}")
             return None
@@ -439,6 +612,11 @@ def create_chat_litellm_from_agent_config(
     if agent_config.litellm_params:
         litellm_kwargs.update(agent_config.litellm_params)
 
-    llm = ChatLiteLLM(**litellm_kwargs)
+    llm = SanitizedChatLiteLLM(**litellm_kwargs)
     _attach_model_profile(llm, model_string)
+    # Build-time prompt caching: sets ``cache_control_injection_points`` for
+    # all providers and (for OpenAI/DeepSeek/xAI) ``prompt_cache_retention``.
+    # Per-thread ``prompt_cache_key`` is layered on later in
+    # ``create_surfsense_deep_agent`` once ``thread_id`` is known.
+    apply_litellm_prompt_caching(llm, agent_config=agent_config)
     return llm
diff --git a/surfsense_backend/app/agents/new_chat/memory_extraction.py b/surfsense_backend/app/agents/new_chat/memory_extraction.py
new file mode 100644
index 0000000000..e31774a7c5
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/memory_extraction.py
@@ -0,0 +1,232 @@
+"""Background memory extraction for the SurfSense agent.
+
+After each agent response, if the agent did not call ``update_memory`` during
+the turn, this module can run a lightweight LLM call to decide whether the
+latest message contains long-term information worth persisting.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+from uuid import UUID
+
+from langchain_core.messages import HumanMessage
+from sqlalchemy import select
+
+from app.agents.new_chat.tools.update_memory import _save_memory
+from app.db import SearchSpace, User, shielded_async_session
+from app.utils.content_utils import extract_text_content
+
+logger = logging.getLogger(__name__)
+
+_MEMORY_EXTRACT_PROMPT = """\
+You are a memory extraction assistant. Analyze the user's message and decide \
+if it contains any long-term information worth persisting to memory.
+
+Worth remembering: preferences, background/identity, goals, projects, \
+instructions, tools/languages they use, decisions, expertise, workplace — \
+durable facts that will matter in future conversations.
+
+NOT worth remembering: greetings, one-off factual questions, session \
+logistics, ephemeral requests, follow-up clarifications with no new personal \
+info, things that only matter for the current task.
+
+If the message contains memorizable information, output the FULL updated \
+memory document with the new facts merged into the existing content. Follow \
+these rules:
+- Every entry MUST be under a ## heading. Preserve existing headings; create new ones
+  freely. Keep heading names short (2-3 words) and natural. Do NOT include the user's
+  name in headings.
+- Keep entries as single bullet points. Be descriptive but concise — include relevant
+  details and context rather than just a few words.
+- Every bullet MUST use format: - (YYYY-MM-DD) [fact|pref|instr] text
+  [fact] = durable facts, [pref] = preferences, [instr] = standing instructions.
+- Use the user's first name (from <user_name>) in entry text, not "the user".
+- If a new fact contradicts an existing entry, update the existing entry.
+- Do not duplicate information that is already present.
+
+If nothing is worth remembering, output exactly: NO_UPDATE
+
+<user_name>{user_name}</user_name>
+
+<current_memory>
+{current_memory}
+</current_memory>
+
+<user_message>
+{user_message}
+</user_message>"""
+
+_TEAM_MEMORY_EXTRACT_PROMPT = """\
+You are a team-memory extraction assistant. Analyze the latest message and \
+decide if it contains durable TEAM-level information worth persisting.
+
+Decision policy:
+- Prioritize recall for durable team context, while avoiding personal-only facts.
+- Do NOT require explicit consensus language. A direct team-level statement can
+  be stored if it is stable and broadly useful for future team chats.
+- If evidence is weak or clearly tentative, output NO_UPDATE.
+
+Worth remembering (team-level only):
+- Decisions and defaults that guide future team work
+- Team conventions/standards (naming, review policy, coding norms)
+- Stable org/project facts (locations, ownership, constraints)
+- Long-lived architecture/process facts
+- Ongoing priorities that are likely relevant beyond this turn
+
+NOT worth remembering:
+- Personal preferences or biography of one person
+- Questions, brainstorming, tentative ideas, or speculation
+- One-off requests, status updates, TODOs, logistics for this session
+- Information scoped only to a single ephemeral task
+
+If the message contains memorizable team information, output the FULL updated \
+team memory document with new facts merged into existing content. Follow rules:
+- Every entry MUST be under a ## heading. Preserve existing headings; create new ones
+  freely. Keep heading names short (2-3 words) and natural.
+- Keep entries as single bullet points. Be descriptive but concise — include relevant
+  details and context rather than just a few words.
+- Every bullet MUST use format: - (YYYY-MM-DD) [fact] text
+  Team memory uses ONLY the [fact] marker. Never use [pref] or [instr].
+- If a new fact contradicts an existing entry, update the existing entry.
+- Do not duplicate existing information.
+- Preserve neutral team phrasing; avoid person-specific memory unless role-anchored.
+
+If nothing is worth remembering, output exactly: NO_UPDATE
+
+<current_team_memory>
+{current_memory}
+</current_team_memory>
+
+<latest_message_author>
+{author}
+</latest_message_author>
+
+<latest_message>
+{user_message}
+</latest_message>"""
+
+
+async def extract_and_save_memory(
+    *,
+    user_message: str,
+    user_id: str | None,
+    llm: Any,
+) -> None:
+    """Background task: extract memorizable info and persist it.
+
+    Designed to be fire-and-forget — catches all exceptions internally.
+    """
+    if not user_id:
+        return
+
+    try:
+        uid = UUID(user_id) if isinstance(user_id, str) else user_id
+
+        async with shielded_async_session() as session:
+            result = await session.execute(select(User).where(User.id == uid))
+            user = result.scalars().first()
+            if not user:
+                return
+
+            old_memory = user.memory_md
+            first_name = (
+                user.display_name.strip().split()[0]
+                if user.display_name and user.display_name.strip()
+                else "The user"
+            )
+            prompt = _MEMORY_EXTRACT_PROMPT.format(
+                current_memory=old_memory or "(empty)",
+                user_message=user_message,
+                user_name=first_name,
+            )
+            response = await llm.ainvoke(
+                [HumanMessage(content=prompt)],
+                config={"tags": ["surfsense:internal", "memory-extraction"]},
+            )
+            text = extract_text_content(response.content).strip()
+
+            if text == "NO_UPDATE" or not text:
+                logger.debug("Memory extraction: no update needed (user %s)", uid)
+                return
+
+            save_result = await _save_memory(
+                updated_memory=text,
+                old_memory=old_memory,
+                llm=llm,
+                apply_fn=lambda content: setattr(user, "memory_md", content),
+                commit_fn=session.commit,
+                rollback_fn=session.rollback,
+                label="memory",
+                scope="user",
+            )
+            logger.info(
+                "Background memory extraction for user %s: %s",
+                uid,
+                save_result.get("status"),
+            )
+    except Exception:
+        logger.exception("Background user memory extraction failed")
+
+
+async def extract_and_save_team_memory(
+    *,
+    user_message: str,
+    search_space_id: int | None,
+    llm: Any,
+    author_display_name: str | None = None,
+) -> None:
+    """Background task: extract team-level memory and persist it.
+
+    Runs only for shared threads. Designed to be fire-and-forget and catches
+    exceptions internally.
+    """
+    if not search_space_id:
+        return
+
+    try:
+        async with shielded_async_session() as session:
+            result = await session.execute(
+                select(SearchSpace).where(SearchSpace.id == search_space_id)
+            )
+            space = result.scalars().first()
+            if not space:
+                return
+
+            old_memory = space.shared_memory_md
+            prompt = _TEAM_MEMORY_EXTRACT_PROMPT.format(
+                current_memory=old_memory or "(empty)",
+                author=author_display_name or "Unknown team member",
+                user_message=user_message,
+            )
+            response = await llm.ainvoke(
+                [HumanMessage(content=prompt)],
+                config={"tags": ["surfsense:internal", "team-memory-extraction"]},
+            )
+            text = extract_text_content(response.content).strip()
+
+            if text == "NO_UPDATE" or not text:
+                logger.debug(
+                    "Team memory extraction: no update needed (space %s)",
+                    search_space_id,
+                )
+                return
+
+            save_result = await _save_memory(
+                updated_memory=text,
+                old_memory=old_memory,
+                llm=llm,
+                apply_fn=lambda content: setattr(space, "shared_memory_md", content),
+                commit_fn=session.commit,
+                rollback_fn=session.rollback,
+                label="team memory",
+                scope="team",
+            )
+            logger.info(
+                "Background team memory extraction for space %s: %s",
+                search_space_id,
+                save_result.get("status"),
+            )
+    except Exception:
+        logger.exception("Background team memory extraction failed")
diff --git a/surfsense_backend/app/agents/new_chat/mention_resolver.py b/surfsense_backend/app/agents/new_chat/mention_resolver.py
new file mode 100644
index 0000000000..00bb7e71f4
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/mention_resolver.py
@@ -0,0 +1,281 @@
+"""Resolve @-mention chips to canonical virtual paths and substitute the
+user-visible ``@title`` tokens with backtick-wrapped paths in the prompt
+the agent sees.
+
+The frontend's mention seam is a single discriminated-union list of
+``{kind: "doc" | "folder", id, title, document_type?}`` chips (see
+``surfsense_web/atoms/chat/mentioned-documents.atom.ts``). When a turn
+reaches the backend stream task we have three needs that this module
+centralises:
+
+1. Map each chip to its canonical virtual path
+   (``/documents/.../file.xml`` for docs, ``/documents/MyFolder/`` for
+   folders) so the agent sees concrete filesystem locations instead of
+   ambiguous ``@``-titles.
+2. Substitute ``@title`` tokens in the user-typed text with backtick-
+   wrapped paths so the path becomes part of the ``HumanMessage`` body
+   the LLM consumes — without rewriting the persisted user message
+   text (which keeps ``@title`` so chip rendering on reload is
+   unchanged).
+3. Surface the resolved id sets (docs + folders) to the priority
+   middleware so it can render ``[USER-MENTIONED]`` priority entries
+   without re-doing path resolution.
+
+This is intentionally one module — see the architectural note in
+``mention-paths-and-folders`` plan: previously the doc-resolution lived
+inline in ``stream_new_chat`` and the folder mention had no resolution
+at all. Centralising both behind a single ``resolve_mentions`` call
+turns a leaky multi-field seam into a single deeper interface.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    build_path_index,
+    doc_to_virtual_path,
+)
+from app.db import Document, Folder
+from app.schemas.new_chat import MentionedDocumentInfo
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class ResolvedMention:
+    """Canonical view of a single @-mention chip.
+
+    ``virtual_path`` is the path the agent will see (no trailing slash
+    for documents, trailing ``/`` for folders to match the convention
+    used by ``KnowledgeTreeMiddleware``).
+    """
+
+    kind: str  # "doc" | "folder"
+    id: int
+    title: str
+    virtual_path: str
+
+
+@dataclass
+class ResolvedMentionSet:
+    """Aggregate result of resolving a turn's mention chips.
+
+    ``token_to_path`` maps ``@title`` (the literal token the user typed
+    and the editor emitted) to the canonical virtual path for that
+    chip. It is produced longest-token-first so substitution mirrors
+    ``parseMentionSegments`` on the frontend (a longer title like
+    ``@Project Roadmap`` is never shadowed by a shorter prefix
+    ``@Project``).
+
+    ``mentioned_document_ids`` collapses doc + surfsense_doc chips into
+    a single ordered, deduped list because the priority middleware
+    treats them uniformly downstream — see
+    ``KnowledgePriorityMiddleware._compute_priority_paths``.
+    """
+
+    mentions: list[ResolvedMention] = field(default_factory=list)
+    token_to_path: list[tuple[str, str]] = field(default_factory=list)
+    mentioned_document_ids: list[int] = field(default_factory=list)
+    mentioned_folder_ids: list[int] = field(default_factory=list)
+
+
+def _folder_virtual_path(folder_id: int, folder_paths: dict[int, str]) -> str:
+    """Return ``/documents/Folder/Sub/`` for a folder id.
+
+    Falls back to the documents root when the folder is missing from
+    the index (deleted or in a different search space). Trailing slash
+    matches ``KnowledgeTreeMiddleware`` (``/documents/MyFolder/``) so
+    the agent's ``ls`` can dispatch on it as a directory.
+    """
+    base = folder_paths.get(folder_id, DOCUMENTS_ROOT)
+    return f"{base}/" if not base.endswith("/") else base
+
+
+async def resolve_mentions(
+    session: AsyncSession,
+    *,
+    search_space_id: int,
+    mentioned_documents: list[MentionedDocumentInfo] | None,
+    mentioned_document_ids: list[int] | None = None,
+    mentioned_surfsense_doc_ids: list[int] | None = None,
+    mentioned_folder_ids: list[int] | None = None,
+) -> ResolvedMentionSet:
+    """Resolve every @-mention chip on a turn into virtual paths.
+
+    The function takes both the ``mentioned_documents`` discriminated
+    list (chip metadata used for substitution + persistence) and the
+    parallel id arrays (``mentioned_document_ids``,
+    ``mentioned_surfsense_doc_ids``, ``mentioned_folder_ids``) for two
+    reasons:
+
+    * Legacy clients that haven't migrated to the unified chip list
+      still send the id arrays — we treat the union as authoritative.
+    * The id arrays are the canonical input to
+      ``KnowledgePriorityMiddleware`` (via ``SurfSenseContextSchema``);
+      returning the deduped, validated lists lets the route forward
+      them unchanged.
+
+    Resolution is best-effort: a chip whose id no longer exists (e.g.
+    document was deleted between mention and submit) is silently
+    dropped. The agent still sees the user's original text, just
+    without a backtick-path substitution for that chip.
+    """
+    chip_doc_ids: list[int] = []
+    chip_folder_ids: list[int] = []
+    chip_titles_by_id: dict[tuple[str, int], str] = {}
+    if mentioned_documents:
+        for chip in mentioned_documents:
+            kind = chip.kind
+            if kind == "folder":
+                chip_folder_ids.append(chip.id)
+            else:
+                chip_doc_ids.append(chip.id)
+            chip_titles_by_id[(kind, chip.id)] = chip.title
+
+    doc_id_pool: list[int] = list(
+        dict.fromkeys(
+            [
+                *(mentioned_document_ids or []),
+                *(mentioned_surfsense_doc_ids or []),
+                *chip_doc_ids,
+            ]
+        )
+    )
+    folder_id_pool: list[int] = list(
+        dict.fromkeys([*(mentioned_folder_ids or []), *chip_folder_ids])
+    )
+
+    if not doc_id_pool and not folder_id_pool:
+        return ResolvedMentionSet()
+
+    index = await build_path_index(session, search_space_id)
+
+    doc_rows: dict[int, Document] = {}
+    if doc_id_pool:
+        result = await session.execute(
+            select(Document).where(
+                Document.search_space_id == search_space_id,
+                Document.id.in_(doc_id_pool),
+            )
+        )
+        for row in result.scalars().all():
+            doc_rows[row.id] = row
+
+    folder_rows: dict[int, Folder] = {}
+    if folder_id_pool:
+        result = await session.execute(
+            select(Folder).where(
+                Folder.search_space_id == search_space_id,
+                Folder.id.in_(folder_id_pool),
+            )
+        )
+        for row in result.scalars().all():
+            folder_rows[row.id] = row
+
+    resolved: list[ResolvedMention] = []
+    accepted_doc_ids: list[int] = []
+    accepted_folder_ids: list[int] = []
+
+    for doc_id in doc_id_pool:
+        row = doc_rows.get(doc_id)
+        if row is None:
+            logger.debug(
+                "mention_resolver: dropping doc id=%s (not found in space=%s)",
+                doc_id,
+                search_space_id,
+            )
+            continue
+        title = chip_titles_by_id.get(("doc", doc_id), str(row.title or ""))
+        path = doc_to_virtual_path(
+            doc_id=row.id,
+            title=str(row.title or "untitled"),
+            folder_id=row.folder_id,
+            index=index,
+        )
+        resolved.append(
+            ResolvedMention(kind="doc", id=row.id, title=title, virtual_path=path)
+        )
+        accepted_doc_ids.append(row.id)
+
+    for folder_id in folder_id_pool:
+        row = folder_rows.get(folder_id)
+        if row is None:
+            logger.debug(
+                "mention_resolver: dropping folder id=%s (not found in space=%s)",
+                folder_id,
+                search_space_id,
+            )
+            continue
+        title = chip_titles_by_id.get(("folder", folder_id), str(row.name or ""))
+        path = _folder_virtual_path(row.id, index.folder_paths)
+        resolved.append(
+            ResolvedMention(kind="folder", id=row.id, title=title, virtual_path=path)
+        )
+        accepted_folder_ids.append(row.id)
+
+    token_to_path: list[tuple[str, str]] = []
+    seen_tokens: set[str] = set()
+    for mention in resolved:
+        if not mention.title:
+            continue
+        token = f"@{mention.title}"
+        if token in seen_tokens:
+            continue
+        seen_tokens.add(token)
+        token_to_path.append((token, mention.virtual_path))
+    token_to_path.sort(key=lambda pair: len(pair[0]), reverse=True)
+
+    return ResolvedMentionSet(
+        mentions=resolved,
+        token_to_path=token_to_path,
+        mentioned_document_ids=accepted_doc_ids,
+        mentioned_folder_ids=accepted_folder_ids,
+    )
+
+
+def substitute_in_text(text: str, token_to_path: list[tuple[str, str]]) -> str:
+    """Replace each ``@title`` token with a backtick-wrapped virtual path.
+
+    Mirrors ``parseMentionSegments`` on the frontend: longest token
+    first, single forward pass, no regex (titles can contain regex
+    metacharacters). The substitution is idempotent for already-
+    substituted text because the backtick-wrapped path no longer
+    starts with ``@``.
+
+    Empty / no-op cases short-circuit so callers can pass this through
+    unconditionally without paying for a scan.
+    """
+    if not text or not token_to_path:
+        return text
+
+    out: list[str] = []
+    i = 0
+    n = len(text)
+    while i < n:
+        matched: tuple[str, str] | None = None
+        for token, path in token_to_path:
+            if text.startswith(token, i):
+                matched = (token, path)
+                break
+        if matched is None:
+            out.append(text[i])
+            i += 1
+            continue
+        token, path = matched
+        out.append(f"`{path}`")
+        i += len(token)
+    return "".join(out)
+
+
+__all__ = [
+    "ResolvedMention",
+    "ResolvedMentionSet",
+    "resolve_mentions",
+    "substitute_in_text",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/__init__.py b/surfsense_backend/app/agents/new_chat/middleware/__init__.py
index 91996f702c..6742bd8de0 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/__init__.py
@@ -1,17 +1,87 @@
 """Middleware components for the SurfSense new chat agent."""
 
+from app.agents.new_chat.middleware.action_log import ActionLogMiddleware
+from app.agents.new_chat.middleware.anonymous_document import (
+    AnonymousDocumentMiddleware,
+)
+from app.agents.new_chat.middleware.busy_mutex import BusyMutexMiddleware
+from app.agents.new_chat.middleware.compaction import (
+    SurfSenseCompactionMiddleware,
+    create_surfsense_compaction_middleware,
+)
+from app.agents.new_chat.middleware.context_editing import (
+    ClearToolUsesEdit,
+    SpillingContextEditingMiddleware,
+    SpillToBackendEdit,
+)
 from app.agents.new_chat.middleware.dedup_tool_calls import (
     DedupHITLToolCallsMiddleware,
 )
+from app.agents.new_chat.middleware.doom_loop import DoomLoopMiddleware
+from app.agents.new_chat.middleware.file_intent import (
+    FileIntentMiddleware,
+)
 from app.agents.new_chat.middleware.filesystem import (
     SurfSenseFilesystemMiddleware,
 )
+from app.agents.new_chat.middleware.flatten_system import (
+    FlattenSystemMessageMiddleware,
+)
+from app.agents.new_chat.middleware.kb_persistence import (
+    KnowledgeBasePersistenceMiddleware,
+    commit_staged_filesystem_state,
+)
 from app.agents.new_chat.middleware.knowledge_search import (
     KnowledgeBaseSearchMiddleware,
+    KnowledgePriorityMiddleware,
+)
+from app.agents.new_chat.middleware.knowledge_tree import (
+    KnowledgeTreeMiddleware,
+)
+from app.agents.new_chat.middleware.memory_injection import (
+    MemoryInjectionMiddleware,
+)
+from app.agents.new_chat.middleware.noop_injection import NoopInjectionMiddleware
+from app.agents.new_chat.middleware.otel_span import OtelSpanMiddleware
+from app.agents.new_chat.middleware.permission import PermissionMiddleware
+from app.agents.new_chat.middleware.retry_after import RetryAfterMiddleware
+from app.agents.new_chat.middleware.skills_backends import (
+    BuiltinSkillsBackend,
+    SearchSpaceSkillsBackend,
+    build_skills_backend_factory,
+    default_skills_sources,
+)
+from app.agents.new_chat.middleware.tool_call_repair import (
+    ToolCallNameRepairMiddleware,
 )
 
 __all__ = [
+    "ActionLogMiddleware",
+    "AnonymousDocumentMiddleware",
+    "BuiltinSkillsBackend",
+    "BusyMutexMiddleware",
+    "ClearToolUsesEdit",
     "DedupHITLToolCallsMiddleware",
+    "DoomLoopMiddleware",
+    "FileIntentMiddleware",
+    "FlattenSystemMessageMiddleware",
+    "KnowledgeBasePersistenceMiddleware",
     "KnowledgeBaseSearchMiddleware",
+    "KnowledgePriorityMiddleware",
+    "KnowledgeTreeMiddleware",
+    "MemoryInjectionMiddleware",
+    "NoopInjectionMiddleware",
+    "OtelSpanMiddleware",
+    "PermissionMiddleware",
+    "RetryAfterMiddleware",
+    "SearchSpaceSkillsBackend",
+    "SpillToBackendEdit",
+    "SpillingContextEditingMiddleware",
+    "SurfSenseCompactionMiddleware",
     "SurfSenseFilesystemMiddleware",
+    "ToolCallNameRepairMiddleware",
+    "build_skills_backend_factory",
+    "commit_staged_filesystem_state",
+    "create_surfsense_compaction_middleware",
+    "default_skills_sources",
 ]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/action_log.py b/surfsense_backend/app/agents/new_chat/middleware/action_log.py
new file mode 100644
index 0000000000..716a1616c1
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/action_log.py
@@ -0,0 +1,363 @@
+"""Append-only action-log middleware for the SurfSense agent.
+
+Wraps every tool call via :meth:`AgentMiddleware.awrap_tool_call` and writes
+a row to :class:`~app.db.AgentActionLog` after the tool returns. Tools opt
+into reversibility by declaring a ``reverse`` callable on their
+:class:`~app.agents.new_chat.tools.registry.ToolDefinition`; the rendered
+descriptor is persisted in ``reverse_descriptor`` for use by
+``/api/threads/{thread_id}/revert/{action_id}``.
+
+Design points:
+
+* **Defensive.** Logging never blocks the agent. We catch every exception
+  on the DB write path and emit a warning; the tool's ``ToolMessage``
+  result is always returned untouched.
+* **Lightweight payload.** Only the tool ``name`` + ``args`` (capped) +
+  ``result_id`` + ``reverse_descriptor`` are stored. Tool output text
+  remains in the LangGraph checkpoint / spilled tool-output files.
+* **Best-effort reversibility.** We invoke ``reverse(args, result_obj)``
+  with the parsed JSON result when the tool's content is a JSON object;
+  otherwise the raw text is passed. Exceptions in the reverse callable
+  are swallowed and logged — a failed descriptor render simply means the
+  action is NOT marked reversible.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from collections.abc import Awaitable, Callable
+from typing import TYPE_CHECKING, Any
+
+from langchain.agents.middleware import AgentMiddleware
+from langchain_core.callbacks import adispatch_custom_event
+from langchain_core.messages import ToolMessage
+
+from app.agents.new_chat.feature_flags import get_flags
+from app.agents.new_chat.tools.registry import ToolDefinition
+
+if TYPE_CHECKING:  # pragma: no cover - type-only
+    from langchain.agents.middleware.types import ToolCallRequest
+    from langgraph.types import Command
+
+
+logger = logging.getLogger(__name__)
+
+
+# Cap for the persisted ``args`` JSON to avoid bloating the action log with
+# accidentally-huge inputs. Values are truncated and a flag is set in the
+# stored payload so consumers can detect truncation.
+_MAX_ARGS_PERSIST_BYTES = 32 * 1024  # 32KB
+
+
+class ActionLogMiddleware(AgentMiddleware):
+    """Persist a row in :class:`AgentActionLog` after every tool call.
+
+    Should be placed near the OUTERMOST end of the tool-call wrapping stack
+    so that it sees the *final* :class:`ToolMessage` after all retries,
+    permission checks, and dedup logic have run. In practice that means
+    placing it just inside :class:`PermissionMiddleware` and outside
+    :class:`DedupHITLToolCallsMiddleware`.
+
+    The middleware is fully a no-op when:
+
+    * the master kill-switch ``SURFSENSE_DISABLE_NEW_AGENT_STACK`` is set
+      (checked via :func:`get_flags`),
+    * the per-feature flag ``enable_action_log`` is off, or
+    * persistence raises (defensive: tool-call dispatch always succeeds).
+
+    Args:
+        thread_id: The current chat thread's primary-key id. Required to
+            persist a row; if ``None`` the middleware silently no-ops.
+        search_space_id: Search-space id for cascade-on-delete safety.
+        user_id: UUID string of the user driving this turn (nullable in
+            anonymous mode).
+        tool_definitions: Optional mapping of tool name -> :class:`ToolDefinition`
+            so the middleware can look up the tool's ``reverse`` callable.
+            When omitted, no actions are marked reversible.
+    """
+
+    tools = ()
+
+    def __init__(
+        self,
+        *,
+        thread_id: int | None,
+        search_space_id: int,
+        user_id: str | None,
+        tool_definitions: dict[str, ToolDefinition] | None = None,
+    ) -> None:
+        super().__init__()
+        self._thread_id = thread_id
+        self._search_space_id = search_space_id
+        self._user_id = user_id
+        self._tool_definitions = dict(tool_definitions or {})
+
+    def _enabled(self) -> bool:
+        flags = get_flags()
+        if flags.disable_new_agent_stack:
+            return False
+        return bool(flags.enable_action_log) and self._thread_id is not None
+
+    async def awrap_tool_call(
+        self,
+        request: ToolCallRequest,
+        handler: Callable[[ToolCallRequest], Awaitable[ToolMessage | Command[Any]]],
+    ) -> ToolMessage | Command[Any]:
+        if not self._enabled():
+            return await handler(request)
+
+        result: ToolMessage | Command[Any]
+        error_payload: dict[str, Any] | None = None
+        try:
+            result = await handler(request)
+        except Exception as exc:
+            # Persist the failure too so revert/audit can see it, then
+            # re-raise so downstream middleware (RetryAfter, etc.) handles it.
+            error_payload = {"type": type(exc).__name__, "message": str(exc)}
+            await self._record(
+                request=request,
+                result=None,
+                error_payload=error_payload,
+            )
+            raise
+
+        await self._record(request=request, result=result, error_payload=None)
+        return result
+
+    async def _record(
+        self,
+        *,
+        request: ToolCallRequest,
+        result: ToolMessage | Command[Any] | None,
+        error_payload: dict[str, Any] | None,
+    ) -> None:
+        """Persist one ``agent_action_log`` row. Defensive: never raises."""
+        try:
+            from app.db import AgentActionLog, shielded_async_session
+
+            tool_name = _resolve_tool_name(request)
+            args_payload = _resolve_args_payload(request)
+            result_id = _resolve_result_id(result)
+            reverse_descriptor, reversible = self._render_reverse(
+                tool_name=tool_name,
+                args=_resolve_args_dict(request),
+                result=result,
+            )
+
+            tool_call_id = _resolve_tool_call_id(request)
+            chat_turn_id = _resolve_chat_turn_id(request)
+
+            row = AgentActionLog(
+                thread_id=self._thread_id,
+                user_id=self._user_id,
+                search_space_id=self._search_space_id,
+                # ``turn_id`` is the deprecated alias of ``tool_call_id``
+                # kept for one release for safe rollback. New consumers
+                # should read ``tool_call_id`` directly.
+                turn_id=tool_call_id,
+                tool_call_id=tool_call_id,
+                chat_turn_id=chat_turn_id,
+                message_id=_resolve_message_id(request),
+                tool_name=tool_name,
+                args=args_payload,
+                result_id=result_id,
+                reversible=reversible,
+                reverse_descriptor=reverse_descriptor,
+                error=error_payload,
+            )
+            async with shielded_async_session() as session:
+                session.add(row)
+                await session.commit()
+                row_id = int(row.id) if row.id is not None else None
+                row_created_at = row.created_at
+        except Exception:
+            logger.warning(
+                "ActionLogMiddleware failed to persist action log row",
+                exc_info=True,
+            )
+            return
+
+        # Surface a side-channel SSE event so the chat tool card can
+        # render a Revert button immediately after the row is durable.
+        # ``stream_new_chat`` translates this into a
+        # ``data-action-log`` SSE event. We DO NOT include the
+        # ``reverse_descriptor`` payload here; only a presence flag.
+        try:
+            await adispatch_custom_event(
+                "action_log",
+                {
+                    "id": row_id,
+                    "lc_tool_call_id": tool_call_id,
+                    "chat_turn_id": chat_turn_id,
+                    "tool_name": tool_name,
+                    "reversible": bool(reversible),
+                    "reverse_descriptor_present": reverse_descriptor is not None,
+                    "created_at": row_created_at.isoformat()
+                    if row_created_at
+                    else None,
+                    "error": error_payload is not None,
+                },
+            )
+        except Exception:
+            logger.debug(
+                "ActionLogMiddleware failed to dispatch action_log event",
+                exc_info=True,
+            )
+
+    def _render_reverse(
+        self,
+        *,
+        tool_name: str,
+        args: dict[str, Any] | None,
+        result: ToolMessage | Command[Any] | None,
+    ) -> tuple[dict[str, Any] | None, bool]:
+        """Run the tool's ``reverse`` callable and return its descriptor.
+
+        Returns a tuple of ``(descriptor_or_None, reversible_bool)``. When
+        the tool has no ``reverse`` callable, or when the callable raises,
+        the action is marked non-reversible.
+        """
+        if not result or not isinstance(result, ToolMessage):
+            return None, False
+        if args is None:
+            return None, False
+        tool_def = self._tool_definitions.get(tool_name)
+        if tool_def is None or tool_def.reverse is None:
+            return None, False
+        try:
+            parsed_result = _parse_tool_result_content(result)
+            descriptor = tool_def.reverse(args, parsed_result)
+        except Exception:
+            logger.warning(
+                "Reverse descriptor render failed for tool %s",
+                tool_name,
+                exc_info=True,
+            )
+            return None, False
+        if not isinstance(descriptor, dict):
+            return None, False
+        return descriptor, True
+
+
+# ---------------------------------------------------------------------------
+# Resolution helpers — defensive against tool_call request shape variation.
+# ---------------------------------------------------------------------------
+
+
+def _resolve_tool_name(request: Any) -> str:
+    try:
+        tool = getattr(request, "tool", None)
+        if tool is not None:
+            name = getattr(tool, "name", None)
+            if isinstance(name, str) and name:
+                return name
+        call = getattr(request, "tool_call", None) or {}
+        if isinstance(call, dict):
+            name = call.get("name")
+            if isinstance(name, str) and name:
+                return name
+    except Exception:  # pragma: no cover - defensive
+        pass
+    return "unknown"
+
+
+def _resolve_args_dict(request: Any) -> dict[str, Any] | None:
+    try:
+        call = getattr(request, "tool_call", None)
+        if not isinstance(call, dict):
+            return None
+        args = call.get("args")
+        if isinstance(args, dict):
+            return args
+        return None
+    except Exception:  # pragma: no cover - defensive
+        return None
+
+
+def _resolve_args_payload(request: Any) -> dict[str, Any] | None:
+    """Return a JSON-serializable args dict, truncated if too big."""
+    args = _resolve_args_dict(request)
+    if args is None:
+        return None
+    try:
+        encoded = json.dumps(args, default=str)
+    except Exception:
+        return {"_repr": repr(args)[:_MAX_ARGS_PERSIST_BYTES]}
+    if len(encoded) <= _MAX_ARGS_PERSIST_BYTES:
+        return args
+    return {
+        "_truncated": True,
+        "_size": len(encoded),
+        "_preview": encoded[:_MAX_ARGS_PERSIST_BYTES],
+    }
+
+
+def _resolve_tool_call_id(request: Any) -> str | None:
+    """Return the LangChain ``tool_call.id`` for this request, if any."""
+    try:
+        call = getattr(request, "tool_call", None) or {}
+        if isinstance(call, dict):
+            tid = call.get("id")
+            if isinstance(tid, str):
+                return tid
+    except Exception:  # pragma: no cover
+        pass
+    return None
+
+
+# Deprecated alias kept for one release. Old callers and tests treated
+# ``turn_id`` as if it carried the LangChain tool_call id; the new column
+# lives under ``tool_call_id``. Both resolve to the same value today.
+_resolve_turn_id = _resolve_tool_call_id
+
+
+def _resolve_chat_turn_id(request: Any) -> str | None:
+    """Return ``configurable.turn_id`` for this request, if accessible.
+
+    ``ToolRuntime.config`` is exposed by LangGraph (see
+    ``langgraph/prebuilt/tool_node.py``); the chat-turn correlation id
+    lives at ``runtime.config["configurable"]["turn_id"]``.
+    """
+    try:
+        runtime = getattr(request, "runtime", None)
+        if runtime is None:
+            return None
+        config = getattr(runtime, "config", None)
+        if not isinstance(config, dict):
+            return None
+        configurable = config.get("configurable")
+        if not isinstance(configurable, dict):
+            return None
+        value = configurable.get("turn_id")
+        if isinstance(value, str) and value:
+            return value
+    except Exception:  # pragma: no cover - defensive
+        pass
+    return None
+
+
+def _resolve_message_id(request: Any) -> str | None:
+    """Tool-call IDs serve as best-available message correlator at this layer."""
+    return _resolve_tool_call_id(request)
+
+
+def _resolve_result_id(result: Any) -> str | None:
+    if isinstance(result, ToolMessage):
+        msg_id = getattr(result, "id", None)
+        if isinstance(msg_id, str):
+            return msg_id
+    return None
+
+
+def _parse_tool_result_content(result: ToolMessage) -> Any:
+    content = result.content
+    if isinstance(content, str):
+        try:
+            return json.loads(content)
+        except (json.JSONDecodeError, ValueError):
+            return content
+    return content
+
+
+__all__ = ["ActionLogMiddleware"]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/anonymous_document.py b/surfsense_backend/app/agents/new_chat/middleware/anonymous_document.py
new file mode 100644
index 0000000000..2893d2e119
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/anonymous_document.py
@@ -0,0 +1,91 @@
+"""Lightweight middleware that loads the anonymous-session document into state.
+
+Anonymous chats receive a single uploaded document via Redis (no DB row,
+read-only). This middleware loads it once on the first turn into
+``state['kb_anon_doc']`` so:
+
+* :class:`KnowledgeTreeMiddleware` can render the synthetic ``/documents``
+  view without touching the DB.
+* :class:`KnowledgePriorityMiddleware` skips hybrid search and emits a
+  degenerate priority list.
+* :class:`KBPostgresBackend` (``als_info`` / ``aread`` / ``_load_file_data``)
+  recognises the synthetic path.
+
+The middleware is a no-op when ``anon_session_id`` is not provided or when
+the document is already cached in state.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+from langchain.agents.middleware import AgentMiddleware, AgentState
+from langgraph.runtime import Runtime
+
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT, safe_filename
+
+logger = logging.getLogger(__name__)
+
+
+class AnonymousDocumentMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Load the anonymous user's uploaded document from Redis into state."""
+
+    tools = ()
+    state_schema = SurfSenseFilesystemState
+
+    def __init__(self, *, anon_session_id: str | None) -> None:
+        self.anon_session_id = anon_session_id
+
+    async def abefore_agent(  # type: ignore[override]
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        del runtime
+        if not self.anon_session_id:
+            return None
+        if state.get("kb_anon_doc"):
+            return None
+
+        anon_doc = await self._load_anon_document()
+        if anon_doc is None:
+            return None
+        return {"kb_anon_doc": anon_doc}
+
+    async def _load_anon_document(self) -> dict[str, Any] | None:
+        """Read ``anon:doc:<session_id>`` from Redis."""
+        try:
+            import redis.asyncio as aioredis  # local import to keep cold paths cheap
+
+            from app.config import config
+
+            redis_client = aioredis.from_url(
+                config.REDIS_APP_URL, decode_responses=True
+            )
+            try:
+                redis_key = f"anon:doc:{self.anon_session_id}"
+                data = await redis_client.get(redis_key)
+                if not data:
+                    return None
+                payload = json.loads(data)
+            finally:
+                await redis_client.aclose()
+        except Exception as exc:
+            logger.warning("Failed to load anonymous document from Redis: %s", exc)
+            return None
+
+        title = str(payload.get("filename") or "uploaded_document")
+        content = str(payload.get("content") or "")
+        path = f"{DOCUMENTS_ROOT}/{safe_filename(title)}"
+        return {
+            "path": path,
+            "title": title,
+            "content": content,
+            "chunks": [{"chunk_id": -1, "content": content}] if content else [],
+        }
+
+
+__all__ = ["AnonymousDocumentMiddleware"]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/busy_mutex.py b/surfsense_backend/app/agents/new_chat/middleware/busy_mutex.py
new file mode 100644
index 0000000000..e7d9b8f75a
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/busy_mutex.py
@@ -0,0 +1,328 @@
+"""
+BusyMutexMiddleware — per-thread asyncio lock + cancel token.
+
+LangChain has no built-in concept of "this thread is already running a
+turn — refuse the second concurrent request". Without it, a user
+double-clicking "send" or refreshing the page mid-stream can spawn two
+turns racing on the same checkpoint, producing duplicated tool calls
+and mangled state.
+
+Ported from OpenCode's ``Stream.scoped(AbortController)`` pattern: a
+single-process, in-memory lock + cooperative cancellation token keyed by
+``thread_id``. For multi-worker deployments a distributed lock backend
+(Redis or PostgreSQL advisory locks) is a phase-2 follow-up.
+
+What this provides:
+- A ``WeakValueDictionary[str, asyncio.Lock]`` keyed by ``thread_id``;
+  acquiring the lock during ``before_agent`` blocks any concurrent
+  prompt on the same thread until release.
+- A per-thread ``asyncio.Event`` (``cancel_event``) that long-running
+  tools can poll to abort cooperatively. The event is reset between
+  turns. Tools should check ``runtime.context.cancel_event.is_set()``
+  in tight inner loops.
+- A typed :class:`~app.agents.new_chat.errors.BusyError` raised when a
+  second turn arrives while the lock is held.
+
+Note: SurfSense's ``stream_new_chat`` is the call site that should
+acquire/release. Wiring this as middleware means the contract is
+explicit and the lock manager is shared with subagents that compile
+their own ``create_agent`` runnables.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import time
+import weakref
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+    ResponseT,
+)
+from langgraph.config import get_config
+from langgraph.runtime import Runtime
+
+from app.agents.new_chat.errors import BusyError
+
+logger = logging.getLogger(__name__)
+
+
+class _ThreadLockManager:
+    """Process-local registry of per-thread asyncio locks + cancel events."""
+
+    def __init__(self) -> None:
+        self._locks: weakref.WeakValueDictionary[str, asyncio.Lock] = (
+            weakref.WeakValueDictionary()
+        )
+        self._cancel_events: dict[str, asyncio.Event] = {}
+        self._cancel_requested_at_ms: dict[str, int] = {}
+        self._cancel_attempt_count: dict[str, int] = {}
+        # Monotonic per-thread epoch used to prevent stale middleware
+        # teardown from releasing a newer turn's lock.
+        self._turn_epoch: dict[str, int] = {}
+
+    def lock_for(self, thread_id: str) -> asyncio.Lock:
+        lock = self._locks.get(thread_id)
+        if lock is None:
+            lock = asyncio.Lock()
+            self._locks[thread_id] = lock
+        return lock
+
+    def cancel_event(self, thread_id: str) -> asyncio.Event:
+        event = self._cancel_events.get(thread_id)
+        if event is None:
+            event = asyncio.Event()
+            self._cancel_events[thread_id] = event
+        return event
+
+    def request_cancel(self, thread_id: str) -> bool:
+        event = self._cancel_events.get(thread_id)
+        if event is None:
+            event = asyncio.Event()
+            self._cancel_events[thread_id] = event
+        event.set()
+        now_ms = int(time.time() * 1000)
+        self._cancel_requested_at_ms[thread_id] = now_ms
+        self._cancel_attempt_count[thread_id] = (
+            self._cancel_attempt_count.get(thread_id, 0) + 1
+        )
+        return True
+
+    def is_cancel_requested(self, thread_id: str) -> bool:
+        event = self._cancel_events.get(thread_id)
+        return bool(event and event.is_set())
+
+    def cancel_state(self, thread_id: str) -> tuple[int, int] | None:
+        if not self.is_cancel_requested(thread_id):
+            return None
+        attempts = self._cancel_attempt_count.get(thread_id, 1)
+        requested_at_ms = self._cancel_requested_at_ms.get(thread_id, 0)
+        return attempts, requested_at_ms
+
+    def reset(self, thread_id: str) -> None:
+        event = self._cancel_events.get(thread_id)
+        if event is not None:
+            event.clear()
+        self._cancel_requested_at_ms.pop(thread_id, None)
+        self._cancel_attempt_count.pop(thread_id, None)
+
+    def bump_turn_epoch(self, thread_id: str) -> int:
+        epoch = self._turn_epoch.get(thread_id, 0) + 1
+        self._turn_epoch[thread_id] = epoch
+        return epoch
+
+    def current_turn_epoch(self, thread_id: str) -> int:
+        return self._turn_epoch.get(thread_id, 0)
+
+    def end_turn(self, thread_id: str) -> None:
+        """Best-effort terminal cleanup for a thread turn.
+
+        This is intentionally idempotent and safe to call from outer stream
+        finally-blocks where middleware teardown might be skipped due to abort
+        or disconnect edge-cases.
+        """
+        # Invalidate any in-flight middleware holder first. This guarantees a
+        # stale ``aafter_agent`` from an older attempt cannot unlock a newer
+        # retry that already acquired the lock for the same thread.
+        self.bump_turn_epoch(thread_id)
+        lock = self._locks.get(thread_id)
+        if lock is not None and lock.locked():
+            lock.release()
+        self.reset(thread_id)
+
+    def release(self, thread_id: str) -> bool:
+        """Force-release the per-thread lock; safety-net for turns that end before ``__end__``.
+
+        ``BusyMutexMiddleware.aafter_agent`` only releases on graph completion, so
+        an ``interrupt()`` pause or an early streaming bail-out would otherwise
+        leak the lock and block the next request with :class:`BusyError`. Returns
+        ``True`` when a held lock was released, ``False`` otherwise.
+        """
+        lock = self._locks.get(thread_id)
+        if lock is None or not lock.locked():
+            return False
+        try:
+            lock.release()
+        except RuntimeError:
+            return False
+        return True
+
+
+# Module-level singleton — process-local but reused across all agent
+# instances built in this process. Subagents created in nested
+# ``create_agent`` calls also get this so locks are coherent.
+manager = _ThreadLockManager()
+
+
+def get_cancel_event(thread_id: str) -> asyncio.Event:
+    """Public accessor used by long-running tools to poll cancellation."""
+    return manager.cancel_event(thread_id)
+
+
+def request_cancel(thread_id: str) -> bool:
+    """Trip the cancel event for ``thread_id``. Always returns True."""
+    return manager.request_cancel(thread_id)
+
+
+def is_cancel_requested(thread_id: str) -> bool:
+    """Return whether ``thread_id`` currently has a pending cancel signal."""
+    return manager.is_cancel_requested(thread_id)
+
+
+def get_cancel_state(thread_id: str) -> tuple[int, int] | None:
+    """Return ``(attempt_count, requested_at_ms)`` for pending cancel state."""
+    return manager.cancel_state(thread_id)
+
+
+def reset_cancel(thread_id: str) -> None:
+    """Reset the cancel event for ``thread_id`` (called between turns)."""
+    manager.reset(thread_id)
+
+
+def end_turn(thread_id: str) -> None:
+    """Force end-of-turn cleanup for lock + cancel state."""
+    manager.end_turn(thread_id)
+
+
+class BusyMutexMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, ResponseT]):
+    """Block concurrent prompts on the same thread.
+
+    Acquires the thread's lock in ``abefore_agent`` and releases in
+    ``aafter_agent``. If the lock is held, raises :class:`BusyError`
+    so the caller can emit a ``surfsense.busy`` SSE event with the
+    in-flight request id.
+
+    Args:
+        require_thread_id: When True, raise :class:`BusyError` if no
+            ``thread_id`` can be resolved from the active
+            ``RunnableConfig``. Default is False — we treat a missing
+            thread_id as "this turn has nothing to lock against" and
+            no-op the mutex. Set True only when you trust the call
+            site to always provide ``configurable.thread_id`` (e.g.
+            in production where ``stream_new_chat`` always does).
+    """
+
+    def __init__(self, *, require_thread_id: bool = False) -> None:
+        super().__init__()
+        self._require_thread_id = require_thread_id
+        self.tools = []
+        # Per-call lock ownership tracked as (lock, epoch). ``aafter_agent``
+        # only releases when its epoch still matches the manager's current
+        # epoch for the thread, preventing stale unlock races.
+        self._held_locks: dict[str, tuple[asyncio.Lock, int]] = {}
+
+    @staticmethod
+    def _thread_id(runtime: Runtime[ContextT]) -> str | None:
+        """Extract ``thread_id`` from the active LangGraph ``RunnableConfig``.
+
+        ``langgraph.runtime.Runtime`` deliberately does NOT expose ``config``.
+        The runnable config (where ``configurable.thread_id`` lives) must be
+        fetched via :func:`langgraph.config.get_config` from inside a node /
+        middleware. We fall back to ``getattr(runtime, "config", None)`` for
+        unit tests / legacy runtimes that synthesize a config-bearing stub.
+        """
+
+        def _from_dict(cfg: Any) -> str | None:
+            if not isinstance(cfg, dict):
+                return None
+            tid = (cfg.get("configurable") or {}).get("thread_id")
+            return str(tid) if tid is not None else None
+
+        # Preferred path: real LangGraph runtime context.
+        try:
+            tid = _from_dict(get_config())
+        except Exception:
+            tid = None
+        if tid is not None:
+            return tid
+
+        # Fallback for tests and any runtime that surfaces a config dict
+        # directly on the runtime instance.
+        return _from_dict(getattr(runtime, "config", None))
+
+    async def abefore_agent(  # type: ignore[override]
+        self,
+        state: AgentState[Any],
+        runtime: Runtime[ContextT],
+    ) -> dict[str, Any] | None:
+        del state
+        thread_id = self._thread_id(runtime)
+        if thread_id is None:
+            if self._require_thread_id:
+                raise BusyError("no thread_id configured")
+            logger.debug(
+                "BusyMutexMiddleware: no thread_id resolved from RunnableConfig; "
+                "skipping per-thread lock for this turn."
+            )
+            return None
+
+        lock = manager.lock_for(thread_id)
+        if lock.locked():
+            raise BusyError(request_id=thread_id)
+        await lock.acquire()
+        epoch = manager.bump_turn_epoch(thread_id)
+        self._held_locks[thread_id] = (lock, epoch)
+        # Reset the cancel event so this turn starts fresh
+        reset_cancel(thread_id)
+        return None
+
+    async def aafter_agent(  # type: ignore[override]
+        self,
+        state: AgentState[Any],
+        runtime: Runtime[ContextT],
+    ) -> dict[str, Any] | None:
+        del state
+        thread_id = self._thread_id(runtime)
+        if thread_id is None:
+            return None
+        held = self._held_locks.pop(thread_id, None)
+        if held is None:
+            return None
+        lock, held_epoch = held
+        if held_epoch != manager.current_turn_epoch(thread_id):
+            # Stale teardown from an older attempt (e.g. runtime-recovery path
+            # already advanced epoch). Do not touch current lock/cancel state.
+            return None
+        if lock.locked():
+            lock.release()
+        # Always clear cancel event between turns so a stale signal
+        # doesn't leak into the next request.
+        reset_cancel(thread_id)
+        return None
+
+    # Provide sync no-ops because the middleware base class allows them
+    def before_agent(  # type: ignore[override]
+        self, state: AgentState[Any], runtime: Runtime[ContextT]
+    ) -> dict[str, Any] | None:
+        # Sync path: no asyncio.Lock to acquire. Best we can do is reject
+        # if anyone else is in flight.
+        thread_id = self._thread_id(runtime)
+        if thread_id is None:
+            if self._require_thread_id:
+                raise BusyError("no thread_id configured")
+            return None
+        lock = manager.lock_for(thread_id)
+        if lock.locked():
+            raise BusyError(request_id=thread_id)
+        return None
+
+    def after_agent(  # type: ignore[override]
+        self, state: AgentState[Any], runtime: Runtime[ContextT]
+    ) -> dict[str, Any] | None:
+        return None
+
+
+__all__ = [
+    "BusyMutexMiddleware",
+    "end_turn",
+    "get_cancel_event",
+    "get_cancel_state",
+    "is_cancel_requested",
+    "manager",
+    "request_cancel",
+    "reset_cancel",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/compaction.py b/surfsense_backend/app/agents/new_chat/middleware/compaction.py
new file mode 100644
index 0000000000..16361e16b3
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/compaction.py
@@ -0,0 +1,254 @@
+"""
+SurfSense compaction middleware.
+
+Subclasses :class:`deepagents.middleware.summarization.SummarizationMiddleware`
+to add SurfSense-specific behavior:
+
+1. **Structured summary template** (OpenCode-style ``## Goal / Constraints /
+   Progress / Key Decisions / Next Steps / Critical Context / Relevant Files``)
+   — see :data:`SURFSENSE_SUMMARY_PROMPT` below. The base
+   ``SummarizationMiddleware`` only ships a freeform "summarize this"
+   prompt; the structured template is ported from OpenCode's
+   ``compaction.ts``.
+2. **Protect SurfSense-specific SystemMessages** so injected hints
+   (``<priority_documents>``, ``<workspace_tree>``, ``<file_operation_contract>``,
+   ``<user_memory>``, ``<team_memory>``, ``<user_name>``, ``<memory_warning>``)
+   are *not* summarized away and are kept verbatim in the post-summary
+   message list. Mirrors OpenCode's ``PRUNE_PROTECTED_TOOLS`` philosophy
+   (some message types are part of the agent's contract and must survive
+   compaction unchanged).
+3. **Sanitize ``content=None``** when feeding messages into ``get_buffer_string``
+   (Azure OpenAI / LiteLLM defense — when a provider streams an AIMessage
+   containing only tool_calls and no text, ``content`` can be ``None`` and
+   ``get_buffer_string`` crashes iterating over ``None``). SurfSense-specific.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+from deepagents.middleware.summarization import (
+    SummarizationMiddleware,
+    compute_summarization_defaults,
+)
+from langchain_core.messages import SystemMessage
+
+from app.observability import otel as ot
+
+if TYPE_CHECKING:
+    from deepagents.backends.protocol import BACKEND_TYPES
+    from langchain_core.language_models import BaseChatModel
+    from langchain_core.messages import AnyMessage
+
+logger = logging.getLogger(__name__)
+
+# Structured summary template ported from OpenCode's
+# ``opencode/packages/opencode/src/session/compaction.ts:40-75``. Kept as a
+# module-level constant so unit tests can assert on its sections.
+SURFSENSE_SUMMARY_PROMPT = """<role>
+SurfSense Conversation Compaction Assistant
+</role>
+
+<primary_objective>
+Extract the most important context from the conversation history below into a structured summary that will replace the older messages.
+</primary_objective>
+
+<instructions>
+You are running because the conversation has grown beyond the model's input window. The conversation history below will be summarized and replaced with your output. Use the structured template that follows; keep each section concise but comprehensive enough that the agent can resume work without losing context. Each section is a checklist — populate it with relevant content or write "None" if there is nothing to report.
+
+## Goal
+What is the user's primary goal or request? State it in one or two sentences.
+
+## Constraints
+What boundaries must the agent respect (citations rules, visibility scope, allowed tools, user-imposed style, deadlines, deny-listed topics)?
+
+## Progress
+What has the agent already accomplished? List each completed step succinctly. Do not reproduce tool output; just record the conclusion.
+
+## Key Decisions
+What choices were made and why? Include rejected alternatives and the reasoning behind selecting the current path.
+
+## Next Steps
+What specific tasks remain to achieve the goal? Order them by dependency.
+
+## Critical Context
+What facts, IDs, document titles, query keywords, error messages, or partial answers must persist into the next turn? Include verbatim quotes only when the exact wording matters (e.g. a precise filter clause or a literal name).
+
+## Relevant Files
+What documents or paths in the SurfSense knowledge base are in play? Use ``/documents/...`` paths exactly as they appeared in the workspace tree.
+</instructions>
+
+<messages>
+Messages to summarize:
+{messages}
+</messages>
+
+Respond ONLY with the structured summary. Do not include any text before or after.
+"""
+
+# SystemMessage prefixes that must NOT be summarized away. They are
+# re-injected on every turn by the corresponding middleware, but the
+# compaction step happens *before* re-injection in some paths, so we
+# must preserve them verbatim across the cutoff.
+PROTECTED_SYSTEM_PREFIXES: tuple[str, ...] = (
+    "<priority_documents>",  # KnowledgePriorityMiddleware
+    "<workspace_tree>",  # KnowledgeTreeMiddleware
+    "<file_operation_contract>",  # FileIntentMiddleware
+    "<user_memory>",  # MemoryInjectionMiddleware
+    "<team_memory>",  # MemoryInjectionMiddleware
+    "<user_name>",  # MemoryInjectionMiddleware
+    "<memory_warning>",  # MemoryInjectionMiddleware
+)
+
+
+def _is_protected_system_message(msg: AnyMessage) -> bool:
+    """Return True if ``msg`` is a SystemMessage we must not summarize."""
+    if not isinstance(msg, SystemMessage):
+        return False
+    content = msg.content
+    if not isinstance(content, str):
+        return False
+    stripped = content.lstrip()
+    return any(stripped.startswith(prefix) for prefix in PROTECTED_SYSTEM_PREFIXES)
+
+
+def _sanitize_message_content(msg: AnyMessage) -> AnyMessage:
+    """Return ``msg`` with ``content=None`` coerced to ``""``.
+
+    Folds in the historical defense from ``safe_summarization.py`` —
+    ``get_buffer_string`` reads ``m.text`` which iterates ``self.content``,
+    so a ``None`` content (Azure OpenAI / LiteLLM streaming a tool-only
+    AIMessage) explodes. We return a copy with empty string content so
+    downstream consumers see an empty body without mutating the original.
+    """
+    if getattr(msg, "content", "not-missing") is not None:
+        return msg
+    try:
+        return msg.model_copy(update={"content": ""})
+    except AttributeError:
+        import copy
+
+        new_msg = copy.copy(msg)
+        try:
+            new_msg.content = ""
+        except Exception:
+            logger.debug(
+                "Could not sanitize content=None on message of type %s",
+                type(msg).__name__,
+            )
+            return msg
+        return new_msg
+
+
+class SurfSenseCompactionMiddleware(SummarizationMiddleware):
+    """SummarizationMiddleware tuned for SurfSense.
+
+    Notes
+    -----
+    - Overrides :meth:`_partition_messages` so protected SystemMessages
+      survive into the ``preserved_messages`` half regardless of cutoff.
+    - Overrides :meth:`_filter_summary_messages` so the buffer-string path
+      never iterates ``None`` content.
+    - Inherits everything else (auto-trigger, backend offload,
+      ``_summarization_event`` plumbing, ``ContextOverflowError`` fallback).
+    """
+
+    def _partition_messages(  # type: ignore[override]
+        self,
+        conversation_messages: list[AnyMessage],
+        cutoff_index: int,
+    ) -> tuple[list[AnyMessage], list[AnyMessage]]:
+        """Split messages but always preserve SurfSense protected SystemMessages.
+
+        Mirrors OpenCode's ``PRUNE_PROTECTED_TOOLS`` philosophy
+        (``opencode/packages/opencode/src/session/compaction.ts``): some
+        message types are always kept verbatim because they are part of the
+        agent's working contract, not transient output.
+
+        Also opens a ``compaction.run`` OTel span (no-op when OTel is off)
+        so dashboards can count compaction events and message-volume
+        without having to instrument upstream callers.
+        """
+        # Opening a span here is appropriate because partitioning is the
+        # first call SummarizationMiddleware makes when it has decided to
+        # summarize; we record the volume and then close as a normal span.
+        with ot.compaction_span(
+            reason="auto",
+            messages_in=len(conversation_messages),
+            extra={"compaction.cutoff_index": int(cutoff_index)},
+        ):
+            messages_to_summarize, preserved_messages = super()._partition_messages(
+                conversation_messages, cutoff_index
+            )
+
+            protected: list[AnyMessage] = []
+            kept_for_summary: list[AnyMessage] = []
+            for msg in messages_to_summarize:
+                if _is_protected_system_message(msg):
+                    protected.append(msg)
+                else:
+                    kept_for_summary.append(msg)
+
+            # Place protected blocks at the *front* of preserved_messages so
+            # they keep their original ordering relative to the summary
+            # HumanMessage that precedes the rest of the preserved tail.
+            return kept_for_summary, [*protected, *preserved_messages]
+
+    def _filter_summary_messages(  # type: ignore[override]
+        self, messages: list[AnyMessage]
+    ) -> list[AnyMessage]:
+        """Filter previous summaries AND sanitize ``content=None``.
+
+        Folds the ``safe_summarization.py`` defense in: when the buffer
+        builder iterates ``m.text`` over ``None`` it explodes; sanitizing
+        here covers both the sync and async offload paths.
+        """
+        filtered = super()._filter_summary_messages(messages)
+        return [_sanitize_message_content(m) for m in filtered]
+
+
+def create_surfsense_compaction_middleware(
+    model: BaseChatModel,
+    backend: BACKEND_TYPES,
+    *,
+    summary_prompt: str | None = None,
+    history_path_prefix: str = "/conversation_history",
+    **overrides: Any,
+) -> SurfSenseCompactionMiddleware:
+    """Build a :class:`SurfSenseCompactionMiddleware` with sensible defaults.
+
+    Pulls profile-aware ``trigger`` / ``keep`` / ``truncate_args_settings``
+    via :func:`deepagents.middleware.summarization.compute_summarization_defaults`
+    so callers get the same behavior as ``create_summarization_middleware``
+    plus our overrides.
+
+    Args:
+        model: Chat model to call for summary generation.
+        backend: Backend instance or factory for offloading conversation history.
+        summary_prompt: Optional override; defaults to :data:`SURFSENSE_SUMMARY_PROMPT`.
+        history_path_prefix: Path prefix for offloaded conversation history.
+        **overrides: Forwarded to :class:`SurfSenseCompactionMiddleware`.
+    """
+    defaults = compute_summarization_defaults(model)
+    return SurfSenseCompactionMiddleware(
+        model=model,
+        backend=backend,
+        trigger=overrides.pop("trigger", defaults["trigger"]),
+        keep=overrides.pop("keep", defaults["keep"]),
+        trim_tokens_to_summarize=overrides.pop("trim_tokens_to_summarize", None),
+        truncate_args_settings=overrides.pop(
+            "truncate_args_settings", defaults["truncate_args_settings"]
+        ),
+        summary_prompt=summary_prompt or SURFSENSE_SUMMARY_PROMPT,
+        history_path_prefix=history_path_prefix,
+        **overrides,
+    )
+
+
+__all__ = [
+    "PROTECTED_SYSTEM_PREFIXES",
+    "SURFSENSE_SUMMARY_PROMPT",
+    "SurfSenseCompactionMiddleware",
+    "create_surfsense_compaction_middleware",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/context_editing.py b/surfsense_backend/app/agents/new_chat/middleware/context_editing.py
new file mode 100644
index 0000000000..39bc57c8b7
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/context_editing.py
@@ -0,0 +1,350 @@
+"""
+SpillToBackendEdit + SpillingContextEditingMiddleware.
+
+LangChain's :class:`ClearToolUsesEdit` discards old ``ToolMessage.content``
+when the context-editing budget triggers, replacing the body with a fixed
+placeholder. That's lossy: anything the agent might want to revisit is
+gone. The spill-to-disk pattern (originally from OpenCode's
+``opencode/packages/opencode/src/tool/truncate.ts``) keeps the prune
+behavior but writes the full original payload to the runtime backend
+under ``/tool_outputs/{thread_id}/{message_id}.txt`` first. The
+placeholder is then upgraded to point at the spill path so the agent
+(or a subagent) can read it back on demand.
+
+Why this is a middleware subclass instead of a plain ``ContextEdit``:
+``ContextEdit.apply`` is sync, but writing to the backend is async. We
+capture the spill payloads inside ``apply`` and flush them via
+``await backend.aupload_files(...)`` from ``awrap_model_call`` *before*
+delegating to the handler, so the explore subagent can always read what
+the placeholder advertises.
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+from collections.abc import Awaitable, Callable, Sequence
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+from langchain.agents.middleware.context_editing import (
+    ClearToolUsesEdit,
+    ContextEdit,
+    ContextEditingMiddleware,
+    TokenCounter,
+)
+from langchain_core.messages import (
+    AIMessage,
+    AnyMessage,
+    BaseMessage,
+    ToolMessage,
+)
+from langchain_core.messages.utils import count_tokens_approximately
+from langgraph.config import get_config
+
+if TYPE_CHECKING:
+    from deepagents.backends.protocol import BackendProtocol
+    from langchain.agents.middleware.types import (
+        ModelRequest,
+        ModelResponse,
+    )
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SPILL_PREFIX = "/tool_outputs"
+
+
+def _build_spill_placeholder(spill_path: str) -> str:
+    """Build the user-facing placeholder text shown to the model."""
+    return (
+        f"[cleared — full output at {spill_path}; ask the explore subagent to read it]"
+    )
+
+
+def _get_thread_id_or_session() -> str:
+    """Best-effort thread_id discovery for the spill path.
+
+    Falls back to a process-stable string if no LangGraph config is
+    available (e.g. unit tests). The exact value doesn't matter as long
+    as it's stable within one stream so the placeholder paths line up
+    with the actual upload path.
+    """
+    try:
+        config = get_config()
+        thread_id = config.get("configurable", {}).get("thread_id")
+        if thread_id is not None:
+            return str(thread_id)
+    except RuntimeError:
+        pass
+    return "no_thread"
+
+
+@dataclass(slots=True)
+class SpillToBackendEdit(ContextEdit):
+    """Capture-and-replace context edit that spills full tool output to the backend.
+
+    Behaves like :class:`ClearToolUsesEdit` (same trigger / keep / exclude
+    semantics) **and** records the original ``ToolMessage.content`` in
+    :attr:`pending_spills` so the wrapping middleware can flush them
+    before the model call.
+
+    Args:
+        trigger: Token threshold above which the edit fires.
+        clear_at_least: Minimum number of tokens to reclaim (best effort).
+        keep: Number of most-recent ``ToolMessage`` instances to leave
+            untouched.
+        exclude_tools: Names of tools whose output is NOT spilled.
+        clear_tool_inputs: Also clear the originating ``AIMessage.tool_calls``
+            args when their pair is cleared.
+        path_prefix: Path under the backend where spills are written.
+            Default ``"/tool_outputs"``.
+    """
+
+    trigger: int = 100_000
+    clear_at_least: int = 0
+    keep: int = 3
+    clear_tool_inputs: bool = False
+    exclude_tools: Sequence[str] = ()
+    path_prefix: str = DEFAULT_SPILL_PREFIX
+
+    pending_spills: list[tuple[str, bytes]] = field(default_factory=list)
+    _lock: threading.Lock = field(default_factory=threading.Lock)
+
+    def drain_pending(self) -> list[tuple[str, bytes]]:
+        """Return and clear the pending-spill list atomically."""
+        with self._lock:
+            out = list(self.pending_spills)
+            self.pending_spills.clear()
+        return out
+
+    def apply(
+        self,
+        messages: list[AnyMessage],
+        *,
+        count_tokens: TokenCounter,
+    ) -> None:
+        """Mirror ``ClearToolUsesEdit.apply`` but capture originals first."""
+        tokens = count_tokens(messages)
+        if tokens <= self.trigger:
+            return
+
+        candidates = [
+            (idx, msg)
+            for idx, msg in enumerate(messages)
+            if isinstance(msg, ToolMessage)
+        ]
+        if self.keep >= len(candidates):
+            return
+        if self.keep:
+            candidates = candidates[: -self.keep]
+
+        thread_id = _get_thread_id_or_session()
+        excluded_tools = set(self.exclude_tools)
+
+        for idx, tool_message in candidates:
+            if tool_message.response_metadata.get("context_editing", {}).get("cleared"):
+                continue
+
+            ai_message = next(
+                (m for m in reversed(messages[:idx]) if isinstance(m, AIMessage)),
+                None,
+            )
+            if ai_message is None:
+                continue
+
+            tool_call = next(
+                (
+                    call
+                    for call in ai_message.tool_calls
+                    if call.get("id") == tool_message.tool_call_id
+                ),
+                None,
+            )
+            if tool_call is None:
+                continue
+
+            tool_name = tool_message.name or tool_call["name"]
+            if tool_name in excluded_tools:
+                continue
+
+            message_id = tool_message.id or tool_message.tool_call_id or "unknown"
+            spill_path = f"{self.path_prefix}/{thread_id}/{message_id}.txt"
+
+            original = tool_message.content
+            payload = self._encode_payload(original)
+            with self._lock:
+                self.pending_spills.append((spill_path, payload))
+
+            messages[idx] = tool_message.model_copy(
+                update={
+                    "artifact": None,
+                    "content": _build_spill_placeholder(spill_path),
+                    "response_metadata": {
+                        **tool_message.response_metadata,
+                        "context_editing": {
+                            "cleared": True,
+                            "strategy": "spill_to_backend",
+                            "spill_path": spill_path,
+                        },
+                    },
+                }
+            )
+
+            if self.clear_tool_inputs:
+                ai_idx = messages.index(ai_message)
+                messages[ai_idx] = self._clear_input_args(
+                    ai_message, tool_message.tool_call_id or ""
+                )
+
+            if self.clear_at_least > 0:
+                new_token_count = count_tokens(messages)
+                cleared_tokens = max(0, tokens - new_token_count)
+                if cleared_tokens >= self.clear_at_least:
+                    break
+
+    @staticmethod
+    def _encode_payload(content: Any) -> bytes:
+        """Serialize ``ToolMessage.content`` to bytes for upload."""
+        if isinstance(content, bytes):
+            return content
+        if isinstance(content, str):
+            return content.encode("utf-8")
+        try:
+            import json
+
+            return json.dumps(content, default=str).encode("utf-8")
+        except Exception:
+            return str(content).encode("utf-8")
+
+    @staticmethod
+    def _clear_input_args(message: AIMessage, tool_call_id: str) -> AIMessage:
+        updated_tool_calls: list[dict[str, Any]] = []
+        cleared_any = False
+        for tool_call in message.tool_calls:
+            updated = dict(tool_call)
+            if updated.get("id") == tool_call_id:
+                updated["args"] = {}
+                cleared_any = True
+            updated_tool_calls.append(updated)
+
+        metadata = dict(getattr(message, "response_metadata", {}))
+        if cleared_any:
+            ctx = dict(metadata.get("context_editing", {}))
+            ids = set(ctx.get("cleared_tool_inputs", []))
+            ids.add(tool_call_id)
+            ctx["cleared_tool_inputs"] = sorted(ids)
+            metadata["context_editing"] = ctx
+        return message.model_copy(
+            update={
+                "tool_calls": updated_tool_calls,
+                "response_metadata": metadata,
+            }
+        )
+
+
+BackendResolver = "Callable[[Any], BackendProtocol] | BackendProtocol"
+
+
+class SpillingContextEditingMiddleware(ContextEditingMiddleware):
+    """:class:`ContextEditingMiddleware` that flushes :class:`SpillToBackendEdit` writes.
+
+    Runs the configured edits as the parent does, then flushes any
+    pending spills via the supplied backend resolver before delegating
+    to the model handler. Spill failures are logged but never abort the
+    model call — the placeholder text is already in the message, so the
+    worst case is the agent gets a placeholder it cannot follow up on.
+    """
+
+    def __init__(
+        self,
+        *,
+        edits: Sequence[ContextEdit],
+        backend_resolver: BackendResolver | None = None,
+        token_count_method: str = "approximate",
+    ) -> None:
+        super().__init__(edits=list(edits), token_count_method=token_count_method)  # type: ignore[arg-type]
+        self._backend_resolver = backend_resolver
+
+    def _resolve_backend(self, request: ModelRequest) -> BackendProtocol | None:
+        if self._backend_resolver is None:
+            return None
+        if callable(self._backend_resolver):
+            try:
+                from langchain.tools import ToolRuntime
+
+                tool_runtime = ToolRuntime(
+                    state=getattr(request, "state", {}),
+                    context=getattr(request.runtime, "context", None),
+                    stream_writer=getattr(request.runtime, "stream_writer", None),
+                    store=getattr(request.runtime, "store", None),
+                    config=getattr(request.runtime, "config", None) or {},
+                    tool_call_id=None,
+                )
+                return self._backend_resolver(tool_runtime)
+            except Exception:
+                logger.exception("Failed to resolve spill backend")
+                return None
+        return self._backend_resolver  # type: ignore[return-value]
+
+    def _collect_pending(self) -> list[tuple[str, bytes]]:
+        out: list[tuple[str, bytes]] = []
+        for edit in self.edits:
+            if isinstance(edit, SpillToBackendEdit):
+                out.extend(edit.drain_pending())
+        return out
+
+    async def awrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest,
+        handler: Callable[[ModelRequest], Awaitable[ModelResponse]],
+    ) -> Any:
+        if not request.messages:
+            return await handler(request)
+
+        if self.token_count_method == "approximate":
+
+            def count_tokens(messages: Sequence[BaseMessage]) -> int:
+                return count_tokens_approximately(messages)
+
+        else:
+            system_msg = [request.system_message] if request.system_message else []
+
+            def count_tokens(messages: Sequence[BaseMessage]) -> int:
+                return request.model.get_num_tokens_from_messages(
+                    system_msg + list(messages), request.tools
+                )
+
+        edited_messages = deepcopy(list(request.messages))
+        for edit in self.edits:
+            edit.apply(edited_messages, count_tokens=count_tokens)
+
+        pending = self._collect_pending()
+        if pending:
+            backend = self._resolve_backend(request)
+            if backend is not None:
+                try:
+                    await backend.aupload_files(pending)
+                except Exception:
+                    logger.exception(
+                        "Spill-to-backend upload failed (%d files); placeholders "
+                        "remain in messages but content is unrecoverable",
+                        len(pending),
+                    )
+            else:
+                logger.warning(
+                    "SpillToBackendEdit produced %d pending spills but no backend "
+                    "resolver was configured; content is unrecoverable",
+                    len(pending),
+                )
+
+        return await handler(request.override(messages=edited_messages))
+
+
+__all__ = [
+    "DEFAULT_SPILL_PREFIX",
+    "ClearToolUsesEdit",
+    "SpillToBackendEdit",
+    "SpillingContextEditingMiddleware",
+    "_build_spill_placeholder",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/dedup_tool_calls.py b/surfsense_backend/app/agents/new_chat/middleware/dedup_tool_calls.py
index f5e8f12355..a6d2ce3108 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/dedup_tool_calls.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/dedup_tool_calls.py
@@ -2,17 +2,28 @@
 
 When the LLM emits multiple calls to the same HITL tool with the same
 primary argument (e.g. two ``delete_calendar_event("Doctor Appointment")``),
-only the first call is kept.  Non-HITL tools are never touched.
+only the first call is kept. Non-HITL tools are never touched.
 
 This runs in the ``after_model`` hook — **before** any tool executes — so
 the duplicate call is stripped from the AIMessage that gets checkpointed.
 That means it is also safe across LangGraph ``interrupt()`` boundaries:
 the removed call will never appear on graph resume.
+
+Dedup-key resolution order:
+
+1. :class:`ToolDefinition.dedup_key` — callable provided by the registry
+   entry. This is the canonical mechanism.
+2. ``tool.metadata["hitl_dedup_key"]`` — string with a primary arg name;
+   used by MCP / Composio tools whose schemas the registry doesn't see.
+
+A tool with no resolver from either path simply opts out of dedup.
 """
 
 from __future__ import annotations
 
+import json
 import logging
+from collections.abc import Callable
 from typing import Any
 
 from langchain.agents.middleware import AgentMiddleware, AgentState
@@ -20,45 +31,97 @@
 
 logger = logging.getLogger(__name__)
 
-_HITL_TOOL_DEDUP_KEYS: dict[str, str] = {
-    "delete_calendar_event": "event_title_or_id",
-    "update_calendar_event": "event_title_or_id",
-    "trash_gmail_email": "email_subject_or_id",
-    "update_gmail_draft": "draft_subject_or_id",
-    "delete_google_drive_file": "file_name",
-    "delete_onedrive_file": "file_name",
-    "delete_notion_page": "page_title",
-    "update_notion_page": "page_title",
-    "delete_linear_issue": "issue_ref",
-    "update_linear_issue": "issue_ref",
-    "update_jira_issue": "issue_title_or_key",
-    "delete_jira_issue": "issue_title_or_key",
-    "update_confluence_page": "page_title_or_id",
-    "delete_confluence_page": "page_title_or_id",
-}
+# Resolver type — given the tool ``args`` dict returns a stable
+# string used to dedupe consecutive calls. ``None`` means no dedup.
+DedupResolver = Callable[[dict[str, Any]], str]
+
+
+def wrap_dedup_key_by_arg_name(arg_name: str) -> DedupResolver:
+    """Adapt a string-arg name into a :data:`DedupResolver`.
+
+    Convenience helper used by registry entries that just want to dedupe
+    on a single arg's lowercased value (the most common case for native
+    HITL tools like ``send_gmail_email`` keyed on ``subject``).
+
+    Example::
+
+        ToolDefinition(
+            name="send_gmail_email",
+            ...,
+            dedup_key=wrap_dedup_key_by_arg_name("subject"),
+        )
+    """
+
+    def _resolver(args: dict[str, Any]) -> str:
+        return str(args.get(arg_name, "")).lower()
+
+    return _resolver
+
+
+def dedup_key_full_args(args: dict[str, Any]) -> str:
+    """Resolver that collapses calls only when **every** argument is identical.
+
+    Safe default for tools where no single field uniquely identifies a call
+    (e.g. MCP tools whose first required field is a shared workspace id).
+    """
+
+    try:
+        return json.dumps(args, sort_keys=True, default=str)
+    except (TypeError, ValueError):
+        return repr(sorted(args.items())) if isinstance(args, dict) else repr(args)
+
+
+# Backwards-compatible alias for code that imported the original
+# private name. New callers should use :func:`wrap_dedup_key_by_arg_name`.
+_wrap_string_key = wrap_dedup_key_by_arg_name
 
 
 class DedupHITLToolCallsMiddleware(AgentMiddleware):  # type: ignore[type-arg]
     """Remove duplicate HITL tool calls from a single LLM response.
 
-    Only the **first** occurrence of each (tool-name, primary-arg-value)
+    Only the **first** occurrence of each ``(tool-name, dedup_key)``
     pair is kept; subsequent duplicates are silently dropped.
+
+    The dedup-resolver map is built from two sources, in priority order:
+
+    1. ``tool.metadata["dedup_key"]`` — callable provided by the registry's
+       ``ToolDefinition.dedup_key``. Receives the args dict and returns
+       a string signature. This is the canonical mechanism.
+    2. ``tool.metadata["hitl_dedup_key"]`` — string with a primary arg
+       name; primarily used by MCP / Composio tools.
     """
 
     tools = ()
 
+    def __init__(self, *, agent_tools: list[Any] | None = None) -> None:
+        self._resolvers: dict[str, DedupResolver] = {}
+
+        for t in agent_tools or []:
+            meta = getattr(t, "metadata", None) or {}
+            callable_key = meta.get("dedup_key")
+            if callable(callable_key):
+                self._resolvers[t.name] = callable_key
+                continue
+            if meta.get("hitl") and meta.get("hitl_dedup_key"):
+                self._resolvers[t.name] = wrap_dedup_key_by_arg_name(
+                    meta["hitl_dedup_key"]
+                )
+
     def after_model(
         self, state: AgentState, runtime: Runtime[Any]
     ) -> dict[str, Any] | None:
-        return self._dedup(state)
+        return self._dedup(state, self._resolvers)
 
     async def aafter_model(
         self, state: AgentState, runtime: Runtime[Any]
     ) -> dict[str, Any] | None:
-        return self._dedup(state)
+        return self._dedup(state, self._resolvers)
 
     @staticmethod
-    def _dedup(state: AgentState) -> dict[str, Any] | None:  # type: ignore[type-arg]
+    def _dedup(
+        state: AgentState,
+        resolvers: dict[str, DedupResolver],
+    ) -> dict[str, Any] | None:
         messages = state.get("messages")
         if not messages:
             return None
@@ -73,9 +136,16 @@ def _dedup(state: AgentState) -> dict[str, Any] | None:  # type: ignore[type-arg
 
         for tc in tool_calls:
             name = tc.get("name", "")
-            dedup_key_arg = _HITL_TOOL_DEDUP_KEYS.get(name)
-            if dedup_key_arg is not None:
-                arg_val = str(tc.get("args", {}).get(dedup_key_arg, "")).lower()
+            resolver = resolvers.get(name)
+            if resolver is not None:
+                try:
+                    arg_val = resolver(tc.get("args", {}) or {})
+                except Exception:
+                    logger.exception(
+                        "Dedup resolver for tool %s raised; keeping call", name
+                    )
+                    deduped.append(tc)
+                    continue
                 key = (name, arg_val)
                 if key in seen:
                     logger.info(
diff --git a/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py
new file mode 100644
index 0000000000..850ecd1d2d
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py
@@ -0,0 +1,237 @@
+"""
+DoomLoopMiddleware — pattern-based detector for repeated identical tool calls.
+
+LangChain has :class:`ToolCallLimitMiddleware` which caps the *total* number
+of tool calls per turn — but it can't tell apart "10 distinct, useful
+calls" from "the same call 10 times in a row". This middleware fills that
+gap with a sliding-window check on tool-call signatures, ported from
+OpenCode's ``packages/opencode/src/session/processor.ts``.
+
+When the same tool with the same arguments is called N times in a row,
+the agent has likely entered an infinite loop. We surface this to the
+user as an interrupt with ``permission="doom_loop"`` so the UI can
+render an "Are you stuck? Continue / cancel?" affordance.
+
+This ships **OFF by default** until the frontend explicitly handles
+``context.permission == "doom_loop"`` interrupts.
+
+Wire format: uses SurfSense's existing ``interrupt()`` payload shape
+(see ``app/agents/new_chat/tools/hitl.py``):
+
+    {
+        "type": "permission_ask",
+        "action": {"tool": <name>, "params": <args>},
+        "context": {"permission": "doom_loop", "recent_signatures": [...]},
+    }
+
+so the frontend that already handles HITL prompts can render this with
+no changes beyond a string check.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+from collections import deque
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+    ResponseT,
+)
+from langchain_core.messages import AIMessage
+from langgraph.config import get_config
+from langgraph.runtime import Runtime
+from langgraph.types import interrupt
+
+from app.observability import otel as ot
+
+logger = logging.getLogger(__name__)
+
+
+def _signature(name: str, args: Any) -> str:
+    """Hash a tool call ``(name, args)`` to a short signature."""
+    try:
+        canonical = json.dumps(args, sort_keys=True, default=str)
+    except (TypeError, ValueError):
+        canonical = repr(args)
+    digest = hashlib.sha1(f"{name}::{canonical}".encode()).hexdigest()
+    return digest[:16]
+
+
+class DoomLoopMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, ResponseT]):
+    """Detect repeated identical tool calls and prompt the user.
+
+    Tracks a sliding window of the most-recent ``threshold`` tool-call
+    signatures across the live request. When all entries match, raise
+    a SurfSense-style HITL interrupt with ``permission="doom_loop"``.
+
+    Args:
+        threshold: How many consecutive identical signatures count as a
+            doom loop. Default 3 (matches OpenCode's processor.ts).
+    """
+
+    def __init__(self, *, threshold: int = 3) -> None:
+        super().__init__()
+        if threshold < 2:
+            raise ValueError("DoomLoopMiddleware threshold must be >= 2")
+        self._threshold = threshold
+        self.tools = []
+        # Per-thread sliding windows. We can't put this in graph state
+        # without state-schema gymnastics; for one process-lifetime it's
+        # fine to keep an in-memory map keyed by thread_id.
+        self._windows: dict[str, deque[str]] = {}
+
+    @staticmethod
+    def _thread_id_from_runtime(runtime: Runtime[ContextT]) -> str:
+        """Resolve the thread id for sliding-window keying.
+
+        Prefer LangGraph's ``get_config()`` (the only way to read
+        ``RunnableConfig`` inside a node — :class:`Runtime` does NOT carry
+        a ``config`` attribute). Fall back to ``runtime.config`` for unit
+        tests that synthesize a config-bearing stub. Default
+        ``"no_thread"`` is intentionally only used when both lookups fail
+        — it would collapse all threads into one window so we keep the
+        debug log loud.
+        """
+
+        def _from_dict(cfg: Any) -> str | None:
+            if not isinstance(cfg, dict):
+                return None
+            tid = (cfg.get("configurable") or {}).get("thread_id")
+            return str(tid) if tid is not None else None
+
+        try:
+            tid = _from_dict(get_config())
+        except Exception:
+            tid = None
+        if tid is not None:
+            return tid
+
+        tid = _from_dict(getattr(runtime, "config", None))
+        if tid is not None:
+            return tid
+
+        logger.debug(
+            "DoomLoopMiddleware: no thread_id resolved from RunnableConfig; "
+            "falling back to shared 'no_thread' window."
+        )
+        return "no_thread"
+
+    def _window(self, thread_id: str) -> deque[str]:
+        win = self._windows.get(thread_id)
+        if win is None:
+            win = deque(maxlen=self._threshold)
+            self._windows[thread_id] = win
+        return win
+
+    def _detect(
+        self, message: AIMessage, runtime: Runtime[ContextT]
+    ) -> tuple[bool, list[str], dict[str, Any] | None]:
+        if not message.tool_calls:
+            return False, [], None
+
+        thread_id = self._thread_id_from_runtime(runtime)
+        window = self._window(thread_id)
+
+        triggered_call: dict[str, Any] | None = None
+        for call in message.tool_calls:
+            name = (
+                call.get("name")
+                if isinstance(call, dict)
+                else getattr(call, "name", None)
+            )
+            args = (
+                call.get("args")
+                if isinstance(call, dict)
+                else getattr(call, "args", {})
+            )
+            if not isinstance(name, str):
+                continue
+            sig = _signature(name, args)
+            window.append(sig)
+            if len(window) >= self._threshold and len(set(window)) == 1:
+                triggered_call = {"name": name, "params": args or {}}
+                break
+
+        if triggered_call is None:
+            return False, list(window), None
+        return True, list(window), triggered_call
+
+    def after_model(  # type: ignore[override]
+        self,
+        state: AgentState[ResponseT],
+        runtime: Runtime[ContextT],
+    ) -> dict[str, Any] | None:
+        messages = state.get("messages") or []
+        if not messages:
+            return None
+        last = messages[-1]
+        if not isinstance(last, AIMessage):
+            return None
+
+        triggered, signatures, action = self._detect(last, runtime)
+        if not triggered:
+            return None
+
+        logger.warning(
+            "Doom loop detected: tool %s called %d times in a row (sig=%s)",
+            action["name"] if action else "<unknown>",
+            self._threshold,
+            signatures[-1] if signatures else "<empty>",
+        )
+
+        # Open an interrupt.raised span with permission=doom_loop attribute
+        # so dashboards can break out doom-loop interrupts from regular
+        # permission asks via the ``interrupt.permission`` attribute.
+        with ot.interrupt_span(
+            interrupt_type="permission_ask",
+            extra={
+                "interrupt.permission": "doom_loop",
+                "interrupt.threshold": self._threshold,
+                "interrupt.tool": (action or {}).get("tool", "<unknown>"),
+            },
+        ):
+            decision = interrupt(
+                {
+                    "type": "permission_ask",
+                    "action": action or {"tool": "<unknown>", "params": {}},
+                    "context": {
+                        "permission": "doom_loop",
+                        "recent_signatures": signatures,
+                        "threshold": self._threshold,
+                    },
+                }
+            )
+
+        # Reset window so the next decision (continue/cancel) starts fresh.
+        thread_id = self._thread_id_from_runtime(runtime)
+        self._windows.pop(thread_id, None)
+
+        # Decision shape mirrors ``tools/hitl.py``: {"decision_type": "..."}
+        # If the user cancelled, jump to end. Otherwise return ``None`` so the
+        # tool call proceeds. The frontend's exact reply names may differ —
+        # we tolerate any shape that contains a string with "reject"/"cancel".
+        if isinstance(decision, dict):
+            kind = str(
+                decision.get("decision_type") or decision.get("type") or ""
+            ).lower()
+            if "reject" in kind or "cancel" in kind:
+                return {"jump_to": "end"}
+        return None
+
+    async def aafter_model(  # type: ignore[override]
+        self,
+        state: AgentState[ResponseT],
+        runtime: Runtime[ContextT],
+    ) -> dict[str, Any] | None:
+        return self.after_model(state, runtime)
+
+
+__all__ = [
+    "DoomLoopMiddleware",
+    "_signature",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/file_intent.py b/surfsense_backend/app/agents/new_chat/middleware/file_intent.py
new file mode 100644
index 0000000000..7897e13d69
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/file_intent.py
@@ -0,0 +1,334 @@
+"""Semantic file-intent routing middleware for new chat turns.
+
+This middleware classifies the latest human turn into a small intent set:
+- chat_only
+- file_write
+- file_read
+
+For ``file_write`` turns it injects a strict system contract so the model
+uses filesystem tools before claiming success, and provides a deterministic
+fallback path when no filename is specified by the user.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from datetime import UTC, datetime
+from enum import StrEnum
+from typing import Any
+
+from langchain.agents.middleware import AgentMiddleware, AgentState
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
+from langgraph.runtime import Runtime
+from pydantic import BaseModel, Field, ValidationError
+
+logger = logging.getLogger(__name__)
+
+
+class FileOperationIntent(StrEnum):
+    CHAT_ONLY = "chat_only"
+    FILE_WRITE = "file_write"
+    FILE_READ = "file_read"
+
+
+class FileIntentPlan(BaseModel):
+    intent: FileOperationIntent = Field(
+        description="Primary user intent for this turn."
+    )
+    confidence: float = Field(
+        ge=0.0,
+        le=1.0,
+        default=0.5,
+        description="Model confidence in the selected intent.",
+    )
+    suggested_filename: str | None = Field(
+        default=None,
+        description="Optional filename (e.g. notes.md) inferred from user request.",
+    )
+    suggested_directory: str | None = Field(
+        default=None,
+        description=(
+            "Optional directory path (e.g. /reports/q2 or reports/q2) inferred from "
+            "user request."
+        ),
+    )
+    suggested_path: str | None = Field(
+        default=None,
+        description=(
+            "Optional full file path (e.g. /reports/q2/summary.md). If present, this "
+            "takes precedence over suggested_directory + suggested_filename."
+        ),
+    )
+
+
+def _extract_text_from_message(message: BaseMessage) -> str:
+    content = getattr(message, "content", "")
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        parts: list[str] = []
+        for item in content:
+            if isinstance(item, str):
+                parts.append(item)
+            elif isinstance(item, dict) and item.get("type") == "text":
+                parts.append(str(item.get("text", "")))
+        return "\n".join(part for part in parts if part)
+    return str(content)
+
+
+def _extract_json_payload(text: str) -> str:
+    stripped = text.strip()
+    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", stripped, re.DOTALL)
+    if fenced:
+        return fenced.group(1)
+    start = stripped.find("{")
+    end = stripped.rfind("}")
+    if start != -1 and end != -1 and end > start:
+        return stripped[start : end + 1]
+    return stripped
+
+
+def _sanitize_filename(value: str) -> str:
+    name = re.sub(r"[\\/:*?\"<>|]+", "_", value).strip()
+    name = re.sub(r"\s+", "-", name)
+    name = name.strip("._-")
+    if not name:
+        name = "note"
+    if len(name) > 80:
+        name = name[:80].rstrip("-_.")
+    return name
+
+
+def _sanitize_path_segment(value: str) -> str:
+    segment = re.sub(r"[\\/:*?\"<>|]+", "_", value).strip()
+    segment = re.sub(r"\s+", "_", segment)
+    segment = segment.strip("._-")
+    return segment
+
+
+def _normalize_directory(value: str) -> str:
+    raw = value.strip().replace("\\", "/")
+    raw = raw.strip("/")
+    if not raw:
+        return ""
+    parts = [_sanitize_path_segment(part) for part in raw.split("/") if part.strip()]
+    parts = [part for part in parts if part]
+    return "/".join(parts)
+
+
+def _normalize_file_path(value: str) -> str:
+    raw = value.strip().replace("\\", "/").strip()
+    if not raw:
+        return ""
+    had_trailing_slash = raw.endswith("/")
+    raw = raw.strip("/")
+    if not raw:
+        return ""
+    parts = [_sanitize_path_segment(part) for part in raw.split("/") if part.strip()]
+    parts = [part for part in parts if part]
+    if not parts:
+        return ""
+    if had_trailing_slash:
+        return f"/{'/'.join(parts)}/"
+    return f"/{'/'.join(parts)}"
+
+
+def _infer_directory_from_user_text(user_text: str) -> str | None:
+    patterns = (
+        r"\b(?:in|inside|under)\s+(?:the\s+)?([a-zA-Z0-9 _\-/]+?)\s+folder\b",
+        r"\b(?:in|inside|under)\s+([a-zA-Z0-9 _\-/]+?)\b",
+    )
+    lowered = user_text.lower()
+    for pattern in patterns:
+        match = re.search(pattern, lowered, flags=re.IGNORECASE)
+        if not match:
+            continue
+        candidate = match.group(1).strip()
+        if candidate in {"the", "a", "an"}:
+            continue
+        normalized = _normalize_directory(candidate)
+        if normalized:
+            return normalized
+    return None
+
+
+def _fallback_path(
+    suggested_filename: str | None,
+    *,
+    suggested_directory: str | None = None,
+    suggested_path: str | None = None,
+    user_text: str,
+) -> str:
+    inferred_dir = _infer_directory_from_user_text(user_text)
+
+    sanitized_filename = ""
+    if suggested_filename:
+        sanitized_filename = _sanitize_filename(suggested_filename)
+        if sanitized_filename.lower().endswith(".txt"):
+            sanitized_filename = f"{sanitized_filename[:-4]}.md"
+    if not sanitized_filename:
+        sanitized_filename = "notes.md"
+    elif "." not in sanitized_filename:
+        sanitized_filename = f"{sanitized_filename}.md"
+
+    normalized_suggested_path = (
+        _normalize_file_path(suggested_path) if suggested_path else ""
+    )
+    if normalized_suggested_path:
+        if normalized_suggested_path.endswith("/"):
+            return f"{normalized_suggested_path.rstrip('/')}/{sanitized_filename}"
+        return normalized_suggested_path
+
+    directory = _normalize_directory(suggested_directory or "")
+    if not directory and inferred_dir:
+        directory = inferred_dir
+    if directory:
+        return f"/{directory}/{sanitized_filename}"
+
+    return f"/{sanitized_filename}"
+
+
+def _build_classifier_prompt(*, recent_conversation: str, user_text: str) -> str:
+    return (
+        "Classify the latest user request into a filesystem intent for an AI agent.\n"
+        "Return JSON only with this exact schema:\n"
+        '{"intent":"chat_only|file_write|file_read","confidence":0.0,"suggested_filename":"string or null","suggested_directory":"string or null","suggested_path":"string or null"}\n\n'
+        "Rules:\n"
+        "- Use semantic intent, not literal keywords.\n"
+        "- file_write: user asks to create/save/write/update/edit content as a file.\n"
+        "- file_read: user asks to open/read/list/search existing files.\n"
+        "- chat_only: conversational/analysis responses without required file operations.\n"
+        "- For file_write, choose a concise semantic suggested_filename and match the requested format.\n"
+        "- If the user mentions a folder/directory, populate suggested_directory.\n"
+        "- If user specifies an explicit full path, populate suggested_path.\n"
+        "- Use extensions that match user intent (e.g. .md, .json, .yaml, .csv, .py, .ts, .js, .html, .css, .sql).\n"
+        "- Do not use .txt; prefer .md for generic text notes.\n"
+        "- Do not include dates or timestamps in suggested_filename unless explicitly requested.\n"
+        "- Never include markdown or explanation.\n\n"
+        f"Recent conversation:\n{recent_conversation or '(none)'}\n\n"
+        f"Latest user message:\n{user_text}"
+    )
+
+
+def _build_recent_conversation(
+    messages: list[BaseMessage], *, max_messages: int = 6
+) -> str:
+    rows: list[str] = []
+    filtered: list[tuple[str, BaseMessage]] = []
+    for msg in messages:
+        role: str | None = None
+        if isinstance(msg, HumanMessage):
+            role = "user"
+        elif isinstance(msg, AIMessage):
+            if getattr(msg, "tool_calls", None):
+                continue
+            role = "assistant"
+        else:
+            continue
+        filtered.append((role, msg))
+    for role, msg in filtered[-max_messages:]:
+        text = re.sub(r"\s+", " ", _extract_text_from_message(msg)).strip()
+        if text:
+            rows.append(f"{role}: {text[:280]}")
+    return "\n".join(rows)
+
+
+class FileIntentMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Classify file intent and inject a strict file-write contract."""
+
+    tools = ()
+
+    def __init__(self, *, llm: BaseChatModel | None = None) -> None:
+        self.llm = llm
+
+    async def _classify_intent(
+        self, *, messages: list[BaseMessage], user_text: str
+    ) -> FileIntentPlan:
+        if self.llm is None:
+            return FileIntentPlan(intent=FileOperationIntent.CHAT_ONLY, confidence=0.0)
+
+        prompt = _build_classifier_prompt(
+            recent_conversation=_build_recent_conversation(messages),
+            user_text=user_text,
+        )
+        try:
+            response = await self.llm.ainvoke(
+                [HumanMessage(content=prompt)],
+                config={"tags": ["surfsense:internal"]},
+            )
+            payload = json.loads(
+                _extract_json_payload(_extract_text_from_message(response))
+            )
+            plan = FileIntentPlan.model_validate(payload)
+            return plan
+        except (json.JSONDecodeError, ValidationError, ValueError) as exc:
+            logger.warning("File intent classifier returned invalid output: %s", exc)
+        except Exception as exc:  # pragma: no cover - defensive fallback
+            logger.warning("File intent classifier failed: %s", exc)
+
+        return FileIntentPlan(intent=FileOperationIntent.CHAT_ONLY, confidence=0.0)
+
+    async def abefore_agent(  # type: ignore[override]
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        del runtime
+        messages = state.get("messages") or []
+        if not messages:
+            return None
+
+        last_human: HumanMessage | None = None
+        for msg in reversed(messages):
+            if isinstance(msg, HumanMessage):
+                last_human = msg
+                break
+        if last_human is None:
+            return None
+
+        user_text = _extract_text_from_message(last_human).strip()
+        if not user_text:
+            return None
+
+        plan = await self._classify_intent(messages=messages, user_text=user_text)
+        suggested_path = _fallback_path(
+            plan.suggested_filename,
+            suggested_directory=plan.suggested_directory,
+            suggested_path=plan.suggested_path,
+            user_text=user_text,
+        )
+        contract = {
+            "intent": plan.intent.value,
+            "confidence": plan.confidence,
+            "suggested_path": suggested_path,
+            "timestamp": datetime.now(UTC).isoformat(),
+            "turn_id": state.get("turn_id", ""),
+        }
+
+        if plan.intent != FileOperationIntent.FILE_WRITE:
+            return {"file_operation_contract": contract}
+
+        contract_msg = SystemMessage(
+            content=(
+                "<file_operation_contract>\n"
+                "This turn intent is file_write.\n"
+                f"Suggested default path: {suggested_path}\n"
+                "Rules:\n"
+                "- You MUST call write_file or edit_file before claiming success.\n"
+                "- If no path is provided by the user, use the suggested default path.\n"
+                "- Do not claim a file was created/updated unless tool output confirms it.\n"
+                "- If the write/edit fails, clearly report failure instead of success.\n"
+                "- Do not include timestamps or dates in generated file content unless the user explicitly asks for them.\n"
+                "- For open-ended requests (e.g., random note), generate useful concrete content, not placeholders.\n"
+                "</file_operation_contract>"
+            )
+        )
+
+        # Insert just before the latest human turn so it applies to this request.
+        new_messages = list(messages)
+        insert_at = max(len(new_messages) - 1, 0)
+        new_messages.insert(insert_at, contract_msg)
+        return {"messages": new_messages, "file_operation_contract": contract}
diff --git a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
index d7697ef15a..c46eb98a56 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
@@ -1,56 +1,122 @@
 """Custom filesystem middleware for the SurfSense agent.
 
-This middleware customizes prompts and persists write/edit operations for
-`/documents/*` files into SurfSense's `Document`/`Chunk` tables.
+This middleware fully overrides every deepagents filesystem tool so that the
+``Command(update=...)`` payload can carry SurfSense-specific state fields
+(``cwd``, ``staged_dirs``, ``pending_moves``, ``doc_id_by_path``,
+``dirty_paths``) atomically alongside the standard ``files`` update.
+
+In CLOUD mode the backend is :class:`KBPostgresBackend` (lazy DB reads, no DB
+writes). End-of-turn persistence is handled by
+:class:`KnowledgeBasePersistenceMiddleware`. In DESKTOP_LOCAL_FOLDER mode the
+backend is :class:`MultiRootLocalFolderBackend` and writes go straight to disk.
+
+New tools introduced here:
+
+* ``mkdir`` — cloud-only stages folder paths to ``state['staged_dirs']``;
+  desktop creates real directories.
+* ``cd`` / ``pwd`` — manage ``state['cwd']`` (per-thread).
+* ``move_file`` — staged commit in cloud, real disk move in desktop.
+* ``list_tree`` — works in both modes (cloud uses
+  :func:`KBPostgresBackend.alist_tree_listing`).
+
+The middleware no longer ships ``save_document``; persistence is inferred
+from ``write_file`` / ``edit_file`` against ``/documents/*`` paths.
 """
 
 from __future__ import annotations
 
 import asyncio
+import json
+import logging
+import posixpath
 import re
-from datetime import UTC, datetime
+import secrets
 from typing import Annotated, Any
 
+from daytona.common.errors import DaytonaError
 from deepagents import FilesystemMiddleware
 from deepagents.backends.protocol import EditResult, WriteResult
-from deepagents.backends.utils import validate_path
-from deepagents.middleware.filesystem import FilesystemState
-from fractional_indexing import generate_key_between
+from deepagents.backends.utils import (
+    create_file_data,
+    format_read_response,
+    validate_path,
+)
 from langchain.tools import ToolRuntime
-from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.messages import ToolMessage
 from langchain_core.tools import BaseTool, StructuredTool
 from langgraph.types import Command
-from sqlalchemy import delete, select
-
-from app.db import Chunk, Document, DocumentType, Folder, shielded_async_session
-from app.indexing_pipeline.document_chunker import chunk_text
-from app.utils.document_converters import (
-    embed_texts,
-    generate_content_hash,
-    generate_unique_identifier_hash,
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.middleware.kb_postgres_backend import (
+    KBPostgresBackend,
+    paginate_listing,
+)
+from app.agents.new_chat.middleware.multi_root_local_folder_backend import (
+    MultiRootLocalFolderBackend,
+)
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+from app.agents.new_chat.sandbox import (
+    _evict_sandbox_cache,
+    delete_sandbox,
+    get_or_create_sandbox,
+    is_sandbox_enabled,
 )
+from app.agents.new_chat.state_reducers import _CLEAR
+
+logger = logging.getLogger(__name__)
+
 
 # =============================================================================
-# System Prompt (injected into every model call by wrap_model_call)
+# System Prompt (built per-session based on filesystem_mode)
 # =============================================================================
+#
+# Each chat session runs in exactly one filesystem mode. Including rules for
+# the OTHER mode just wastes tokens and confuses the model, so we build the
+# prompt + tool descriptions for the active mode only.
 
-SURFSENSE_FILESYSTEM_SYSTEM_PROMPT = """## Following Conventions
+_COMMON_PROMPT_HEADER = """## Following Conventions
 
 - Read files before editing — understand existing content before making changes.
 - Mimic existing style, naming conventions, and patterns.
+- Never claim a file was created/updated unless filesystem tool output confirms success.
+- If a file write/edit fails, explicitly report the failure.
+"""
 
-## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep`, `save_document`
-
-All file paths must start with a `/`.
-- ls: list files and directories at a given path.
-- read_file: read a file from the filesystem.
-- write_file: create a temporary file in the session (not persisted).
-- edit_file: edit a file in the session (not persisted for /documents/ files).
-- glob: find files matching a pattern (e.g., "**/*.xml").
-- grep: search for text within files.
-- save_document: **permanently** save a new document to the user's knowledge
-  base. Use only when the user explicitly asks to save/create a document.
+_CLOUD_SYSTEM_PROMPT = (
+    _COMMON_PROMPT_HEADER
+    + """
+## Filesystem Tools
+
+All file paths must start with `/`. Relative paths resolve against the
+current working directory (`cwd`, default `/documents`).
+
+- ls(path, offset=0, limit=200): list files and directories at the given path.
+- read_file(path, offset, limit): read a file (paginated) from the filesystem.
+- write_file(path, content): create a new text file in the workspace.
+- edit_file(path, old, new): exact string-replacement edit (lazy-loads KB
+  documents on first edit).
+- glob(pattern, path): find files matching a glob pattern.
+- grep(pattern, path, glob): substring search across files.
+- mkdir(path): create a folder under `/documents/` (committed at end of turn).
+- cd(path): change the current working directory.
+- pwd(): print the current working directory.
+- move_file(source, dest): move/rename a file under `/documents/`.
+- rm(path): delete a single file under `/documents/` (no `-r`).
+- rmdir(path): delete an empty directory under `/documents/`.
+- list_tree(path, max_depth, page_size): recursively list files/folders.
+
+## Persistence Rules
+
+- Files written under `/documents/<...>` are **persisted** at end of turn as
+  Documents in the user's knowledge base.
+- Files whose **basename** starts with `temp_` (e.g. `temp_plan.md` or
+  `/documents/temp_scratch.md`) are **discarded** at end of turn — use this
+  prefix for any scratch/working content you do NOT want saved.
+- All other paths (outside `/documents/` and not `temp_*`) are rejected.
+- mkdir/move_file/rm/rmdir are staged this turn and committed at end of
+  turn alongside any new/edited documents. Snapshot/revert is enabled
+  for every destructive operation when action logging is on.
 
 ## Reading Documents Efficiently
 
@@ -67,23 +133,111 @@
 
 Use `<chunk id='...'>` values as citation IDs in your answers.
 
-## User-Mentioned Documents
+## Priority List
 
-When the `ls` output tags a file with `[MENTIONED BY USER — read deeply]`,
-the user **explicitly selected** that document. These files are your highest-
-priority sources:
-1. **Always read them thoroughly** — scan the full `<chunk_index>`, then read
-   all major sections, not just matched chunks.
-2. **Prefer their content** over other search results when answering.
-3. **Cite from them first** whenever applicable.
+You receive a `<priority_documents>` system message each turn listing the
+top-K paths most relevant to the user's query (by hybrid search). Read those
+first — matched sections are flagged inside each document's `<chunk_index>`.
+
+## Workspace Tree
+
+You receive a `<workspace_tree>` system message each turn with the current
+folder/document layout. The tree may be truncated past a hard cap; in that
+case, drill into specific folders with `ls(...)` or `list_tree(...)`.
+
+## grep Line Numbers
+
+`grep` searches across both your in-memory edits and the indexed chunks in
+Postgres. State-cached files return real line numbers; database hits return
+`line=0` because their position depends on per-document XML layout — call
+`read_file(path)` to find the exact line.
 """
+)
+
+_DESKTOP_SYSTEM_PROMPT = (
+    _COMMON_PROMPT_HEADER
+    + """
+## Local Folder Mode
+
+This chat operates directly on the user's local folders. Writes and edits
+hit disk immediately — there is no end-of-turn staging, no `/documents/`
+namespace, and no `temp_` semantics.
+
+## Filesystem Tools
+
+All file paths must start with `/` and use mount-prefixed absolute paths
+like `/<mount>/file.ext`. Relative paths resolve against the current working
+directory (`cwd`).
+
+- ls(path, offset=0, limit=200): list files and directories at the given path.
+- read_file(path, offset, limit): read a file (paginated) from disk.
+- write_file(path, content): write a file to disk.
+- edit_file(path, old, new): exact string-replacement edit on disk.
+- glob(pattern, path): find files matching a glob pattern.
+- grep(pattern, path, glob): substring search across files.
+- mkdir(path): create a directory on disk.
+- cd(path): change the current working directory.
+- pwd(): print the current working directory.
+- move_file(source, dest): move/rename a file.
+- rm(path): delete a single file from disk (no `-r`). NOT reversible.
+- rmdir(path): delete an empty directory from disk. NOT reversible.
+- list_tree(path, max_depth, page_size): recursively list files/folders.
+
+## Workflow Tips
+
+- If you are unsure which mounts are available, call `ls('/')` first.
+- For large trees, prefer `list_tree` then `grep` then `read_file` over
+  brute-force directory traversal.
+- Cross-mount moves are not supported.
+- Desktop deletes hit disk immediately and cannot be undone via the
+  agent's revert flow — confirm before calling `rm`/`rmdir`.
+"""
+)
+
+_SANDBOX_PROMPT_ADDENDUM = (
+    "\n- execute_code: run Python code in an isolated sandbox."
+    "\n\n## Code Execution"
+    "\n\nUse execute_code whenever a task benefits from running code."
+    " Never perform arithmetic manually."
+    "\n\nDocuments here are XML-wrapped markdown, not raw data files."
+    " To work with them programmatically, read the document first,"
+    " extract the data, write it as a clean file (CSV, JSON, etc.),"
+    " and then run your code against it."
+)
+
+
+def _build_filesystem_system_prompt(
+    filesystem_mode: FilesystemMode,
+    *,
+    sandbox_available: bool,
+) -> str:
+    """Build the filesystem system prompt for a given session mode.
+
+    The prompt only describes rules and tools that actually apply in the
+    chosen mode — there is no cross-mode noise.
+    """
+    base = (
+        _CLOUD_SYSTEM_PROMPT
+        if filesystem_mode == FilesystemMode.CLOUD
+        else _DESKTOP_SYSTEM_PROMPT
+    )
+    if sandbox_available:
+        base += _SANDBOX_PROMPT_ADDENDUM
+    return base
+
+
+# Backwards-compatible alias retained for any external imports.
+SURFSENSE_FILESYSTEM_SYSTEM_PROMPT = _CLOUD_SYSTEM_PROMPT
 
 # =============================================================================
 # Per-Tool Descriptions (shown to the LLM as the tool's docstring)
 # =============================================================================
 
-SURFSENSE_LIST_FILES_TOOL_DESCRIPTION = """Lists files and directories at the given path.
-"""
+# =============================================================================
+# Per-Tool Descriptions (mode-specific; injected as the tool's docstring)
+# =============================================================================
+
+# --- mode-agnostic ---------------------------------------------------------
 
 SURFSENSE_READ_FILE_TOOL_DESCRIPTION = """Reads a file from the filesystem.
 
@@ -98,458 +252,795 @@
 - Use chunk IDs (`<chunk id='...'>`) as citations in answers.
 """
 
-SURFSENSE_WRITE_FILE_TOOL_DESCRIPTION = """Writes a new file to the in-memory filesystem (session-only).
+SURFSENSE_GLOB_TOOL_DESCRIPTION = """Find files matching a glob pattern.
 
-Use this to create scratch/working files during the conversation. Files created
-here are ephemeral and will not be saved to the user's knowledge base.
+Supports standard glob patterns: `*`, `**`, `?`.
+Returns absolute file paths.
+"""
 
-To permanently save a document to the user's knowledge base, use the
-`save_document` tool instead.
+SURFSENSE_CD_TOOL_DESCRIPTION = """Changes the current working directory (cwd).
+
+Args:
+- path: absolute or relative directory path. Relative paths resolve against
+  the current cwd.
+
+The new cwd is used by other filesystem tools whenever a relative path is
+given. Returns the resolved cwd.
+"""
+
+SURFSENSE_PWD_TOOL_DESCRIPTION = """Prints the current working directory."""
+
+SURFSENSE_EXECUTE_CODE_TOOL_DESCRIPTION = """Executes Python code in an isolated sandbox environment.
+
+Common data-science packages are pre-installed (pandas, numpy, matplotlib,
+scipy, scikit-learn).
+
+Usage notes:
+- No outbound network access.
+- Returns combined stdout/stderr with exit code.
+- Use print() to produce output.
+- Use the optional timeout parameter to override the default timeout.
+"""
+
+# --- cloud-only ------------------------------------------------------------
+
+_CLOUD_LIST_FILES_TOOL_DESCRIPTION = """Lists files and directories at the given path.
+
+Usage:
+- Provide an absolute path under `/documents` (relative paths resolve under
+  the current cwd, which defaults to `/documents`).
+- For very large folders, use `offset` and `limit` to paginate the listing.
+- Returns one entry per line; directories end with a trailing `/`.
+"""
+
+_CLOUD_WRITE_FILE_TOOL_DESCRIPTION = """Writes a new text file to the workspace.
+
+Usage:
+- Files written under `/documents/<...>` are persisted as Documents at end
+  of turn.
+- Use a `temp_` filename prefix (e.g. `temp_plan.md` or `/documents/temp_x.md`)
+  for scratch/working files; they are automatically discarded at end of turn.
+- Writes outside `/documents/` are rejected unless the basename starts with
+  `temp_`.
+- Supported outputs include common LLM-friendly text formats like markdown,
+  json, yaml, csv, xml, html, css, sql, and code files.
+- Avoid placeholders; produce concrete and useful text.
 """
 
-SURFSENSE_EDIT_FILE_TOOL_DESCRIPTION = """Performs exact string replacements in files.
+_CLOUD_EDIT_FILE_TOOL_DESCRIPTION = """Performs exact string replacements in files.
 
 IMPORTANT:
 - Read the file before editing.
 - Preserve exact indentation and formatting.
-- Edits to documents under `/documents/` are session-only (not persisted to the
-  database) because those files use an XML citation wrapper around the original
-  content.
+- Edits to documents under `/documents/` are persisted at end of turn.
+- Edits to `temp_*` files are discarded at end of turn.
 """
 
-SURFSENSE_GLOB_TOOL_DESCRIPTION = """Find files matching a glob pattern.
+_CLOUD_MOVE_FILE_TOOL_DESCRIPTION = """Moves or renames a file or folder.
 
-Supports standard glob patterns: `*`, `**`, `?`.
-Returns absolute file paths.
+Use absolute paths for both source and destination.
+
+Notes:
+- `move_file` is staged this turn and committed at end of turn.
+- The agent cannot overwrite an existing destination — pass a fresh dest
+  path or move the existing destination away first.
+- The anonymous uploaded document is read-only and cannot be moved.
+- Rename is a special case of move (same folder, different filename).
 """
 
-SURFSENSE_GREP_TOOL_DESCRIPTION = """Search for a literal text pattern across files.
+_CLOUD_LIST_TREE_TOOL_DESCRIPTION = """Lists files/folders recursively in a single bounded call.
 
-Use this to locate relevant document files/chunks before reading full files.
+Args:
+- path: absolute path to start from. Defaults to `/documents`.
+- max_depth: recursion depth limit (default 8).
+- page_size: maximum number of entries returned (max 1000).
+- include_files / include_dirs: filter returned entry types.
+
+Returns JSON with:
+- entries: [{path, is_dir, size, modified_at, depth}]
+- truncated: true when additional entries were omitted due to page_size.
 """
 
-SURFSENSE_SAVE_DOCUMENT_TOOL_DESCRIPTION = """Permanently saves a document to the user's knowledge base.
+_CLOUD_GREP_TOOL_DESCRIPTION = """Search for a literal text pattern across files.
 
-This is an expensive operation — it creates a new Document record in the
-database, chunks the content, and generates embeddings for search.
+Searches both your in-memory edits and the indexed chunks in Postgres.
+State-cached file matches include real line numbers; database hits return
+`line=0` because their position depends on per-document XML layout — call
+`read_file(path)` afterwards to find the exact line.
+"""
 
-Use ONLY when the user explicitly asks to save/create/store a document.
-Do NOT use this for scratch work; use `write_file` for temporary files.
+_CLOUD_MKDIR_TOOL_DESCRIPTION = """Creates a directory under `/documents/`.
+
+Stages the folder for end-of-turn commit; the Folder row is inserted only
+after the agent's turn finishes successfully.
 
 Args:
-  title: The document title (e.g., "Meeting Notes 2025-06-01").
-  content: The plain-text or markdown content to save. Do NOT include XML
-           citation wrappers — pass only the actual document text.
-  folder_path: Optional folder path under /documents/ (e.g., "Work/Notes").
-               Folders are created automatically if they don't exist.
+- path: absolute path of the new directory (must start with
+  `/documents/`).
+
+Notes:
+- Parent folders are created as needed.
 """
 
+_CLOUD_RM_TOOL_DESCRIPTION = """Deletes a single file under `/documents/`.
+
+Mirrors POSIX `rm path` (no `-r`, no glob expansion). Stages the deletion
+for end-of-turn commit; the row is removed only after the agent's turn
+finishes successfully.
+
+Args:
+- path: absolute or relative file path. Cannot point at a directory — use
+  `rmdir` for empty folders. Cannot target the root or `/documents`.
+
+Notes:
+- The action is reversible via the per-action revert flow when action
+  logging is enabled.
+- The anonymous uploaded document is read-only and cannot be deleted.
+"""
+
+_CLOUD_RMDIR_TOOL_DESCRIPTION = """Deletes an empty directory under `/documents/`.
+
+Mirrors POSIX `rmdir path`: refuses non-empty directories. Recursive
+deletion (`rm -r`) is intentionally NOT supported — clear contents with
+`rm` first.
+
+Args:
+- path: absolute or relative directory path. Cannot target the root,
+  `/documents`, the current cwd, or any ancestor of cwd (use `cd` to
+  move out first).
+
+Notes:
+- Emptiness is evaluated against the post-staged view, so a same-turn
+  `rm /a/x.md` followed by `rmdir /a` is fine.
+- If the directory was added in this same turn via `mkdir` and never
+  committed, the staged mkdir is dropped instead of issuing a delete.
+- The action is reversible via the per-action revert flow when action
+  logging is enabled.
+"""
+
+# --- desktop-only ----------------------------------------------------------
+
+_DESKTOP_LIST_FILES_TOOL_DESCRIPTION = """Lists files and directories at the given path.
+
+Usage:
+- Provide an absolute path using a mount prefix (e.g. `/<mount>/sub/dir`).
+  Use `ls('/')` to discover available mounts.
+- For very large folders, use `offset` and `limit` to paginate the listing.
+- Returns one entry per line; directories end with a trailing `/`.
+"""
+
+_DESKTOP_WRITE_FILE_TOOL_DESCRIPTION = """Writes a text file to disk.
+
+Usage:
+- Use mount-prefixed absolute paths like `/<mount>/sub/file.ext`.
+- Writes hit disk immediately. There is no end-of-turn staging.
+- Supported outputs include common LLM-friendly text formats like markdown,
+  json, yaml, csv, xml, html, css, sql, and code files.
+- Avoid placeholders; produce concrete and useful text.
+"""
+
+_DESKTOP_EDIT_FILE_TOOL_DESCRIPTION = """Performs exact string replacements in files on disk.
+
+IMPORTANT:
+- Read the file before editing.
+- Preserve exact indentation and formatting.
+- Edits hit disk immediately.
+"""
+
+_DESKTOP_MOVE_FILE_TOOL_DESCRIPTION = """Moves or renames a file or folder on disk.
+
+Use mount-prefixed absolute paths for both source and destination
+(e.g. `/<mount>/old.txt` -> `/<mount>/new.txt`).
+
+Notes:
+- Cross-mount moves are not supported.
+- Rename is a special case of move (same folder, different filename).
+"""
+
+_DESKTOP_LIST_TREE_TOOL_DESCRIPTION = """Lists files/folders recursively in a single bounded call.
+
+Args:
+- path: absolute path to start from. Defaults to `/`.
+- max_depth: recursion depth limit (default 8).
+- page_size: maximum number of entries returned (max 1000).
+- include_files / include_dirs: filter returned entry types.
+
+Returns JSON with:
+- entries: [{path, is_dir, size, modified_at, depth}]
+- truncated: true when additional entries were omitted due to page_size.
+"""
+
+_DESKTOP_GREP_TOOL_DESCRIPTION = """Search for a literal text pattern across files.
+
+Searches files on disk and any in-memory edits. Returns real line numbers.
+"""
+
+_DESKTOP_MKDIR_TOOL_DESCRIPTION = """Creates a directory on disk.
+
+Args:
+- path: absolute mount-prefixed path of the new directory.
+
+Notes:
+- Parent folders are created as needed.
+"""
+
+_DESKTOP_RM_TOOL_DESCRIPTION = """Deletes a single file from disk.
+
+Mirrors POSIX `rm path` (no `-r`, no glob expansion). The deletion hits
+disk immediately. Desktop deletes are NOT reversible via the agent's
+revert flow.
+
+Args:
+- path: absolute mount-prefixed file path. Cannot point at a directory —
+  use `rmdir` for empty folders.
+"""
+
+_DESKTOP_RMDIR_TOOL_DESCRIPTION = """Deletes an empty directory from disk.
+
+Mirrors POSIX `rmdir path`: refuses non-empty directories. Recursive
+deletion is NOT supported. The deletion hits disk immediately and is
+NOT reversible via the agent's revert flow.
+
+Args:
+- path: absolute mount-prefixed directory path. Cannot target the mount
+  root or any directory containing files/subfolders.
+"""
+
+
+def _build_tool_descriptions(filesystem_mode: FilesystemMode) -> dict[str, str]:
+    """Pick the active-mode description for every filesystem tool."""
+    if filesystem_mode == FilesystemMode.CLOUD:
+        return {
+            "ls": _CLOUD_LIST_FILES_TOOL_DESCRIPTION,
+            "read_file": SURFSENSE_READ_FILE_TOOL_DESCRIPTION,
+            "write_file": _CLOUD_WRITE_FILE_TOOL_DESCRIPTION,
+            "edit_file": _CLOUD_EDIT_FILE_TOOL_DESCRIPTION,
+            "move_file": _CLOUD_MOVE_FILE_TOOL_DESCRIPTION,
+            "list_tree": _CLOUD_LIST_TREE_TOOL_DESCRIPTION,
+            "glob": SURFSENSE_GLOB_TOOL_DESCRIPTION,
+            "grep": _CLOUD_GREP_TOOL_DESCRIPTION,
+            "mkdir": _CLOUD_MKDIR_TOOL_DESCRIPTION,
+            "cd": SURFSENSE_CD_TOOL_DESCRIPTION,
+            "pwd": SURFSENSE_PWD_TOOL_DESCRIPTION,
+            "rm": _CLOUD_RM_TOOL_DESCRIPTION,
+            "rmdir": _CLOUD_RMDIR_TOOL_DESCRIPTION,
+        }
+    return {
+        "ls": _DESKTOP_LIST_FILES_TOOL_DESCRIPTION,
+        "read_file": SURFSENSE_READ_FILE_TOOL_DESCRIPTION,
+        "write_file": _DESKTOP_WRITE_FILE_TOOL_DESCRIPTION,
+        "edit_file": _DESKTOP_EDIT_FILE_TOOL_DESCRIPTION,
+        "move_file": _DESKTOP_MOVE_FILE_TOOL_DESCRIPTION,
+        "list_tree": _DESKTOP_LIST_TREE_TOOL_DESCRIPTION,
+        "glob": SURFSENSE_GLOB_TOOL_DESCRIPTION,
+        "grep": _DESKTOP_GREP_TOOL_DESCRIPTION,
+        "mkdir": _DESKTOP_MKDIR_TOOL_DESCRIPTION,
+        "cd": SURFSENSE_CD_TOOL_DESCRIPTION,
+        "pwd": SURFSENSE_PWD_TOOL_DESCRIPTION,
+        "rm": _DESKTOP_RM_TOOL_DESCRIPTION,
+        "rmdir": _DESKTOP_RMDIR_TOOL_DESCRIPTION,
+    }
+
+
+# Backwards-compatible aliases retained for any external imports/tests that
+# referenced the original CLOUD-flavoured constants.
+SURFSENSE_LIST_FILES_TOOL_DESCRIPTION = _CLOUD_LIST_FILES_TOOL_DESCRIPTION
+SURFSENSE_WRITE_FILE_TOOL_DESCRIPTION = _CLOUD_WRITE_FILE_TOOL_DESCRIPTION
+SURFSENSE_EDIT_FILE_TOOL_DESCRIPTION = _CLOUD_EDIT_FILE_TOOL_DESCRIPTION
+SURFSENSE_MOVE_FILE_TOOL_DESCRIPTION = _CLOUD_MOVE_FILE_TOOL_DESCRIPTION
+SURFSENSE_LIST_TREE_TOOL_DESCRIPTION = _CLOUD_LIST_TREE_TOOL_DESCRIPTION
+SURFSENSE_GREP_TOOL_DESCRIPTION = _CLOUD_GREP_TOOL_DESCRIPTION
+SURFSENSE_MKDIR_TOOL_DESCRIPTION = _CLOUD_MKDIR_TOOL_DESCRIPTION
+
+
+# =============================================================================
+# Helpers
+# =============================================================================
+
+
+_TEMP_PREFIX = "temp_"
+
+
+def _basename(path: str) -> str:
+    return path.rsplit("/", 1)[-1]
+
+
+def _is_ancestor_of(candidate: str, target: str) -> bool:
+    """True iff ``candidate`` is a strict ancestor directory of ``target``.
+
+    ``target`` itself is NOT considered an ancestor (use equality for that).
+    Both paths are assumed to be canonicalised, absolute, and free of
+    trailing slashes (except the root ``/``).
+    """
+    if not candidate.startswith("/") or not target.startswith("/"):
+        return False
+    if candidate == target:
+        return False
+    prefix = candidate.rstrip("/") + "/"
+    return target.startswith(prefix)
+
 
 class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
-    """SurfSense-specific filesystem middleware with DB persistence for docs."""
+    """SurfSense-specific filesystem middleware (cloud + desktop)."""
+
+    state_schema = SurfSenseFilesystemState
+
+    _MAX_EXECUTE_TIMEOUT = 300
 
     def __init__(
         self,
         *,
+        backend: Any = None,
+        filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
         search_space_id: int | None = None,
         created_by_id: str | None = None,
+        thread_id: int | str | None = None,
         tool_token_limit_before_evict: int | None = 20000,
     ) -> None:
+        self._filesystem_mode = filesystem_mode
         self._search_space_id = search_space_id
         self._created_by_id = created_by_id
+        self._thread_id = thread_id
+        self._sandbox_available = is_sandbox_enabled() and thread_id is not None
+
+        # Build the prompt + tool descriptions for the active mode only —
+        # mixing both modes wastes tokens and confuses the model with rules
+        # it can't actually use this session.
+        system_prompt = _build_filesystem_system_prompt(
+            filesystem_mode,
+            sandbox_available=self._sandbox_available,
+        )
+
         super().__init__(
-            system_prompt=SURFSENSE_FILESYSTEM_SYSTEM_PROMPT,
-            custom_tool_descriptions={
-                "ls": SURFSENSE_LIST_FILES_TOOL_DESCRIPTION,
-                "read_file": SURFSENSE_READ_FILE_TOOL_DESCRIPTION,
-                "write_file": SURFSENSE_WRITE_FILE_TOOL_DESCRIPTION,
-                "edit_file": SURFSENSE_EDIT_FILE_TOOL_DESCRIPTION,
-                "glob": SURFSENSE_GLOB_TOOL_DESCRIPTION,
-                "grep": SURFSENSE_GREP_TOOL_DESCRIPTION,
-            },
+            backend=backend,
+            system_prompt=system_prompt,
+            custom_tool_descriptions=_build_tool_descriptions(filesystem_mode),
             tool_token_limit_before_evict=tool_token_limit_before_evict,
+            max_execute_timeout=self._MAX_EXECUTE_TIMEOUT,
         )
-        # Remove the execute tool (no sandbox backend)
         self.tools = [t for t in self.tools if t.name != "execute"]
-        self.tools.append(self._create_save_document_tool())
+        self.tools.append(self._create_mkdir_tool())
+        self.tools.append(self._create_cd_tool())
+        self.tools.append(self._create_pwd_tool())
+        self.tools.append(self._create_move_file_tool())
+        self.tools.append(self._create_rm_tool())
+        self.tools.append(self._create_rmdir_tool())
+        self.tools.append(self._create_list_tree_tool())
+        if self._sandbox_available:
+            self.tools.append(self._create_execute_code_tool())
+
+    # ------------------------------------------------------------------ helpers
+
+    def _is_cloud(self) -> bool:
+        return self._filesystem_mode == FilesystemMode.CLOUD
 
     @staticmethod
     def _run_async_blocking(coro: Any) -> Any:
-        """Run async coroutine from sync code path when no event loop is running."""
         try:
             loop = asyncio.get_running_loop()
             if loop.is_running():
-                return "Error: sync filesystem persistence not supported inside an active event loop."
+                return "Error: sync filesystem operation not supported inside an active event loop."
         except RuntimeError:
             pass
         return asyncio.run(coro)
 
     @staticmethod
-    def _parse_virtual_path(file_path: str) -> tuple[list[str], str]:
-        """Parse /documents/... path into folder parts and a document title."""
-        if not file_path.startswith("/documents/"):
-            return [], ""
-        rel = file_path[len("/documents/") :].strip("/")
-        if not rel:
-            return [], ""
-        parts = [part for part in rel.split("/") if part]
-        file_name = parts[-1]
-        title = file_name[:-4] if file_name.lower().endswith(".xml") else file_name
-        return parts[:-1], title
+    def _normalize_absolute_path(candidate: str) -> str:
+        normalized = re.sub(r"/+", "/", candidate.strip().replace("\\", "/"))
+        if not normalized:
+            return "/"
+        if normalized.startswith("/"):
+            return normalized
+        return f"/{normalized.lstrip('/')}"
 
-    async def _ensure_folder_hierarchy(
-        self,
-        *,
-        folder_parts: list[str],
-        search_space_id: int,
-    ) -> int | None:
-        """Ensure folder hierarchy exists and return leaf folder ID."""
-        if not folder_parts:
+    @staticmethod
+    def _extract_mount_from_path(path: str, mounts: tuple[str, ...]) -> str | None:
+        rel = path.lstrip("/")
+        if not rel:
             return None
-        async with shielded_async_session() as session:
-            parent_id: int | None = None
-            for name in folder_parts:
-                result = await session.execute(
-                    select(Folder).where(
-                        Folder.search_space_id == search_space_id,
-                        Folder.parent_id == parent_id
-                        if parent_id is not None
-                        else Folder.parent_id.is_(None),
-                        Folder.name == name,
-                    )
-                )
-                folder = result.scalar_one_or_none()
-                if folder is None:
-                    sibling_result = await session.execute(
-                        select(Folder.position)
-                        .where(
-                            Folder.search_space_id == search_space_id,
-                            Folder.parent_id == parent_id
-                            if parent_id is not None
-                            else Folder.parent_id.is_(None),
-                        )
-                        .order_by(Folder.position.desc())
-                        .limit(1)
-                    )
-                    last_position = sibling_result.scalar_one_or_none()
-                    folder = Folder(
-                        name=name,
-                        position=generate_key_between(last_position, None),
-                        parent_id=parent_id,
-                        search_space_id=search_space_id,
-                        created_by_id=self._created_by_id,
-                        updated_at=datetime.now(UTC),
-                    )
-                    session.add(folder)
-                    await session.flush()
-                parent_id = folder.id
-            await session.commit()
-            return parent_id
-
-    async def _persist_new_document(
-        self, *, file_path: str, content: str
-    ) -> dict[str, Any] | str:
-        """Persist a new NOTE document from a newly written file.
-
-        Returns a dict with document metadata on success, or an error string.
-        """
-        if self._search_space_id is None:
-            return {}
-        folder_parts, title = self._parse_virtual_path(file_path)
-        if not title:
-            return "Error: write_file for document persistence requires path under /documents/<name>.xml"
-        folder_id = await self._ensure_folder_hierarchy(
-            folder_parts=folder_parts,
-            search_space_id=self._search_space_id,
+        mount, _, _ = rel.partition("/")
+        if mount in mounts:
+            return mount
+        return None
+
+    @staticmethod
+    def _local_parent_path(path: str) -> str:
+        rel = path.lstrip("/")
+        if "/" not in rel:
+            return "/"
+        parent = rel.rsplit("/", 1)[0].strip("/")
+        if not parent:
+            return "/"
+        return f"/{parent}"
+
+    @staticmethod
+    def _path_exists_under_mount(
+        backend: MultiRootLocalFolderBackend,
+        mount: str,
+        local_path: str,
+    ) -> bool:
+        result = backend.list_tree(
+            f"/{mount}{local_path}",
+            max_depth=0,
+            page_size=1,
+            include_files=True,
+            include_dirs=True,
         )
-        async with shielded_async_session() as session:
-            content_hash = generate_content_hash(content, self._search_space_id)
-            existing = await session.execute(
-                select(Document.id).where(Document.content_hash == content_hash)
-            )
-            if existing.scalar_one_or_none() is not None:
-                return "Error: A document with identical content already exists."
-            unique_identifier_hash = generate_unique_identifier_hash(
-                DocumentType.NOTE,
-                file_path,
-                self._search_space_id,
-            )
-            doc = Document(
-                title=title,
-                document_type=DocumentType.NOTE,
-                document_metadata={"virtual_path": file_path},
-                content=content,
-                content_hash=content_hash,
-                unique_identifier_hash=unique_identifier_hash,
-                source_markdown=content,
-                search_space_id=self._search_space_id,
-                folder_id=folder_id,
-                created_by_id=self._created_by_id,
-                updated_at=datetime.now(UTC),
+        return not bool(result.get("error"))
+
+    def _normalize_local_mount_path(
+        self,
+        candidate: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> str:
+        normalized = self._normalize_absolute_path(candidate)
+        backend = self._get_backend(runtime)
+        if not isinstance(backend, MultiRootLocalFolderBackend):
+            return normalized
+
+        mounts = backend.list_mounts()
+        explicit_mount = self._extract_mount_from_path(normalized, mounts)
+        if explicit_mount:
+            return normalized
+
+        if len(mounts) == 1:
+            return f"/{mounts[0]}{normalized}"
+
+        suggested_mount: str | None = None
+        contract = runtime.state.get("file_operation_contract") or {}
+        suggested_path = contract.get("suggested_path")
+        if isinstance(suggested_path, str) and suggested_path.strip():
+            normalized_suggested = self._normalize_absolute_path(suggested_path)
+            suggested_mount = self._extract_mount_from_path(
+                normalized_suggested, mounts
             )
-            session.add(doc)
-            await session.flush()
-
-            summary_embedding = embed_texts([content])[0]
-            doc.embedding = summary_embedding
-            chunk_texts = chunk_text(content)
-            if chunk_texts:
-                chunk_embeddings = embed_texts(chunk_texts)
-                chunks = [
-                    Chunk(document_id=doc.id, content=text, embedding=embedding)
-                    for text, embedding in zip(
-                        chunk_texts, chunk_embeddings, strict=True
-                    )
-                ]
-                session.add_all(chunks)
-            await session.commit()
-
-            return {
-                "id": doc.id,
-                "title": title,
-                "documentType": DocumentType.NOTE.value,
-                "searchSpaceId": self._search_space_id,
-                "folderId": folder_id,
-                "createdById": str(self._created_by_id)
-                if self._created_by_id
-                else None,
-            }
 
-    async def _persist_edited_document(
-        self, *, file_path: str, updated_content: str
+        matching_mounts = [
+            mount
+            for mount in mounts
+            if self._path_exists_under_mount(backend, mount, normalized)
+        ]
+        if len(matching_mounts) == 1:
+            return f"/{matching_mounts[0]}{normalized}"
+
+        parent_path = self._local_parent_path(normalized)
+        if parent_path != "/":
+            parent_matching_mounts = [
+                mount
+                for mount in mounts
+                if self._path_exists_under_mount(backend, mount, parent_path)
+            ]
+            if len(parent_matching_mounts) == 1:
+                return f"/{parent_matching_mounts[0]}{normalized}"
+
+        if suggested_mount:
+            return f"/{suggested_mount}{normalized}"
+
+        return f"/{backend.default_mount()}{normalized}"
+
+    def _default_cwd(self) -> str:
+        return DOCUMENTS_ROOT if self._is_cloud() else "/"
+
+    def _current_cwd(self, runtime: ToolRuntime[None, SurfSenseFilesystemState]) -> str:
+        cwd = runtime.state.get("cwd") if hasattr(runtime, "state") else None
+        if isinstance(cwd, str) and cwd.startswith("/"):
+            return cwd
+        return self._default_cwd()
+
+    def _get_contract_suggested_path(
+        self, runtime: ToolRuntime[None, SurfSenseFilesystemState]
+    ) -> str:
+        contract = runtime.state.get("file_operation_contract") or {}
+        suggested = contract.get("suggested_path")
+        if isinstance(suggested, str) and suggested.strip():
+            return self._normalize_absolute_path(suggested)
+        return self._default_cwd().rstrip("/") + "/notes.md"
+
+    def _resolve_relative(
+        self,
+        path: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> str:
+        candidate = path.strip()
+        if not candidate:
+            return self._current_cwd(runtime)
+        if candidate.startswith("/"):
+            return self._normalize_absolute_path(candidate)
+        cwd = self._current_cwd(runtime)
+        joined = posixpath.normpath(posixpath.join(cwd, candidate))
+        return self._normalize_absolute_path(joined)
+
+    def _resolve_write_target_path(
+        self,
+        file_path: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> str:
+        candidate = file_path.strip()
+        if not candidate:
+            return self._get_contract_suggested_path(runtime)
+        if self._filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER:
+            return self._normalize_local_mount_path(candidate, runtime)
+        return self._resolve_relative(candidate, runtime)
+
+    def _resolve_move_target_path(
+        self,
+        file_path: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> str:
+        candidate = file_path.strip()
+        if not candidate:
+            return ""
+        if self._filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER:
+            return self._normalize_local_mount_path(candidate, runtime)
+        return self._resolve_relative(candidate, runtime)
+
+    def _resolve_list_target_path(
+        self,
+        path: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+    ) -> str:
+        candidate = path.strip() or self._current_cwd(runtime)
+        if candidate == "/":
+            return "/"
+        if self._filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER:
+            return self._normalize_local_mount_path(candidate, runtime)
+        return self._resolve_relative(candidate, runtime)
+
+    # ------------------------------------------------------------------ namespace policy
+
+    def _check_cloud_write_namespace(
+        self,
+        path: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
     ) -> str | None:
-        """Persist edits for an existing NOTE document and recreate chunks."""
-        if self._search_space_id is None:
+        """Return an error string if cloud writes to ``path`` are not allowed.
+
+        Order matters:
+        1. Reject writes to the anonymous read-only doc.
+        2. Allow ``/documents/*``.
+        3. Allow ``temp_*`` basename anywhere.
+        4. Reject everything else.
+        """
+        if not self._is_cloud():
+            return None
+        anon = runtime.state.get("kb_anon_doc") or {}
+        if isinstance(anon, dict):
+            anon_path = str(anon.get("path") or "")
+            if anon_path and anon_path == path:
+                return "Error: the anonymous uploaded document is read-only."
+        if path.startswith(DOCUMENTS_ROOT + "/") or path == DOCUMENTS_ROOT:
             return None
-        unique_identifier_hash = generate_unique_identifier_hash(
-            DocumentType.NOTE,
-            file_path,
-            self._search_space_id,
+        if _basename(path).startswith(_TEMP_PREFIX):
+            return None
+        return (
+            "Error: cloud writes must target /documents/<...> or use a 'temp_' "
+            f"basename for scratch (got '{path}')."
         )
-        doc_id_from_xml: int | None = None
-        match = re.search(r"<document_id>\s*(\d+)\s*</document_id>", updated_content)
-        if match:
-            doc_id_from_xml = int(match.group(1))
-        async with shielded_async_session() as session:
-            doc_result = await session.execute(
-                select(Document).where(
-                    Document.search_space_id == self._search_space_id,
-                    Document.unique_identifier_hash == unique_identifier_hash,
-                )
-            )
-            document = doc_result.scalar_one_or_none()
-            if document is None and doc_id_from_xml is not None:
-                by_id_result = await session.execute(
-                    select(Document).where(
-                        Document.search_space_id == self._search_space_id,
-                        Document.id == doc_id_from_xml,
-                    )
-                )
-                document = by_id_result.scalar_one_or_none()
-            if document is None:
-                return "Error: Could not map edited file to an existing document."
-
-            document.content = updated_content
-            document.source_markdown = updated_content
-            document.content_hash = generate_content_hash(
-                updated_content, self._search_space_id
-            )
-            document.updated_at = datetime.now(UTC)
-            if not document.document_metadata:
-                document.document_metadata = {}
-            document.document_metadata["virtual_path"] = file_path
-
-            summary_embedding = embed_texts([updated_content])[0]
-            document.embedding = summary_embedding
-
-            await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
-            chunk_texts = chunk_text(updated_content)
-            if chunk_texts:
-                chunk_embeddings = embed_texts(chunk_texts)
-                session.add_all(
-                    [
-                        Chunk(
-                            document_id=document.id, content=text, embedding=embedding
-                        )
-                        for text, embedding in zip(
-                            chunk_texts, chunk_embeddings, strict=True
-                        )
-                    ]
-                )
-            await session.commit()
-        return None
 
-    def _create_save_document_tool(self) -> BaseTool:
-        """Create save_document tool that persists a new document to the KB."""
+    # ------------------------------------------------------------------ tool: ls
 
-        def sync_save_document(
-            title: Annotated[str, "Title for the new document."],
-            content: Annotated[
-                str,
-                "Plain-text or markdown content to save. Do NOT include XML wrappers.",
-            ],
-            runtime: ToolRuntime[None, FilesystemState],
-            folder_path: Annotated[
+    def _create_ls_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("ls")
+            or SURFSENSE_LIST_FILES_TOOL_DESCRIPTION
+        )
+
+        def sync_ls(
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            path: Annotated[
                 str,
-                "Optional folder path under /documents/ (e.g. 'Work/Notes'). Created automatically.",
+                "Absolute path to the directory to list. Relative paths resolve against the current cwd.",
             ] = "",
-        ) -> Command | str:
-            if not content.strip():
-                return "Error: content cannot be empty."
-            file_name = re.sub(r'[\\/:*?"<>|]+', "_", title).strip() or "untitled"
-            if not file_name.lower().endswith(".xml"):
-                file_name = f"{file_name}.xml"
-            folder = folder_path.strip().strip("/") if folder_path else ""
-            virtual_path = (
-                f"/documents/{folder}/{file_name}"
-                if folder
-                else f"/documents/{file_name}"
+            offset: Annotated[
+                int,
+                "Number of entries to skip. Use for paginating large folders. Defaults to 0.",
+            ] = 0,
+            limit: Annotated[
+                int,
+                "Maximum number of entries to return. Defaults to 200.",
+            ] = 200,
+        ) -> str:
+            return self._run_async_blocking(
+                async_ls(runtime, path=path, offset=offset, limit=limit)
             )
 
-            persist_result = self._run_async_blocking(
-                self._persist_new_document(file_path=virtual_path, content=content)
-            )
-            if isinstance(persist_result, str):
-                return persist_result
-            if isinstance(persist_result, dict) and persist_result.get("id"):
-                dispatch_custom_event("document_created", persist_result)
-            return f"Document '{title}' saved to knowledge base (path: {virtual_path})."
-
-        async def async_save_document(
-            title: Annotated[str, "Title for the new document."],
-            content: Annotated[
+        async def async_ls(
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            path: Annotated[
                 str,
-                "Plain-text or markdown content to save. Do NOT include XML wrappers.",
-            ],
-            runtime: ToolRuntime[None, FilesystemState],
-            folder_path: Annotated[
-                str,
-                "Optional folder path under /documents/ (e.g. 'Work/Notes'). Created automatically.",
+                "Absolute path to the directory to list. Relative paths resolve against the current cwd.",
             ] = "",
-        ) -> Command | str:
-            if not content.strip():
-                return "Error: content cannot be empty."
-            file_name = re.sub(r'[\\/:*?"<>|]+', "_", title).strip() or "untitled"
-            if not file_name.lower().endswith(".xml"):
-                file_name = f"{file_name}.xml"
-            folder = folder_path.strip().strip("/") if folder_path else ""
-            virtual_path = (
-                f"/documents/{folder}/{file_name}"
-                if folder
-                else f"/documents/{file_name}"
-            )
-
-            persist_result = await self._persist_new_document(
-                file_path=virtual_path, content=content
+            offset: Annotated[
+                int,
+                "Number of entries to skip. Use for paginating large folders. Defaults to 0.",
+            ] = 0,
+            limit: Annotated[
+                int,
+                "Maximum number of entries to return. Defaults to 200.",
+            ] = 200,
+        ) -> str:
+            target = self._resolve_list_target_path(path, runtime)
+            try:
+                validated = validate_path(target)
+            except ValueError as exc:
+                return f"Error: {exc}"
+            if offset < 0:
+                offset = 0
+            if limit < 1:
+                limit = 1
+            backend = self._get_backend(runtime)
+            infos = await backend.als_info(validated)
+            page = paginate_listing(infos, offset=offset, limit=limit)
+            paths = [
+                f"{fi.get('path', '')}/" if fi.get("is_dir") else fi.get("path", "")
+                for fi in page
+            ]
+            total = len(infos)
+            shown = len(page)
+            header = (
+                f"{validated} ({shown} of {total} entries"
+                f"{f', offset={offset}' if offset else ''})"
             )
-            if isinstance(persist_result, str):
-                return persist_result
-            if isinstance(persist_result, dict) and persist_result.get("id"):
-                dispatch_custom_event("document_created", persist_result)
-            return f"Document '{title}' saved to knowledge base (path: {virtual_path})."
+            if not paths:
+                return f"{header}\n(empty)"
+            body = "\n".join(paths)
+            if total > offset + shown:
+                body += (
+                    f"\n... {total - offset - shown} more — call ls("
+                    f"'{validated}', offset={offset + shown}, limit={limit})"
+                )
+            return f"{header}\n{body}"
 
         return StructuredTool.from_function(
-            name="save_document",
-            description=SURFSENSE_SAVE_DOCUMENT_TOOL_DESCRIPTION,
-            func=sync_save_document,
-            coroutine=async_save_document,
+            name="ls",
+            description=tool_description,
+            func=sync_ls,
+            coroutine=async_ls,
         )
 
-    def _create_write_file_tool(self) -> BaseTool:
-        """Create write_file — ephemeral for /documents/*, persisted otherwise."""
+    # ------------------------------------------------------------------ tool: read_file
+
+    def _create_read_file_tool(self) -> BaseTool:
         tool_description = (
-            self._custom_tool_descriptions.get("write_file")
-            or SURFSENSE_WRITE_FILE_TOOL_DESCRIPTION
+            self._custom_tool_descriptions.get("read_file")
+            or SURFSENSE_READ_FILE_TOOL_DESCRIPTION
         )
 
-        def sync_write_file(
+        async def async_read_file(
             file_path: Annotated[
                 str,
-                "Absolute path where the file should be created. Must be absolute, not relative.",
+                "Absolute path to the file to read. Relative paths resolve against the current cwd.",
             ],
-            content: Annotated[
-                str,
-                "The text content to write to the file. This parameter is required.",
-            ],
-            runtime: ToolRuntime[None, FilesystemState],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            offset: Annotated[
+                int,
+                "Line number to start reading from (0-indexed).",
+            ] = 0,
+            limit: Annotated[
+                int,
+                "Maximum number of lines to read.",
+            ] = 100,
         ) -> Command | str:
-            resolved_backend = self._get_backend(runtime)
+            target = self._resolve_relative(file_path, runtime)
             try:
-                validated_path = validate_path(file_path)
+                validated = validate_path(target)
             except ValueError as exc:
                 return f"Error: {exc}"
-            res: WriteResult = resolved_backend.write(validated_path, content)
-            if res.error:
-                return res.error
 
-            if not self._is_kb_document(validated_path):
-                persist_result = self._run_async_blocking(
-                    self._persist_new_document(
-                        file_path=validated_path, content=content
-                    )
-                )
-                if isinstance(persist_result, str):
-                    return persist_result
-                if isinstance(persist_result, dict) and persist_result.get("id"):
-                    dispatch_custom_event("document_created", persist_result)
+            files = runtime.state.get("files") or {}
+            if validated in files:
+                return format_read_response(files[validated], offset, limit)
+
+            backend = self._get_backend(runtime)
+            if isinstance(backend, KBPostgresBackend):
+                loaded = await backend._load_file_data(validated)
+                if loaded is None:
+                    return f"Error: File '{validated}' not found"
+                file_data, doc_id = loaded
+                rendered = format_read_response(file_data, offset, limit)
+                update: dict[str, Any] = {
+                    "files": {validated: file_data},
+                    "messages": [
+                        ToolMessage(
+                            content=rendered,
+                            tool_call_id=runtime.tool_call_id,
+                        )
+                    ],
+                }
+                if doc_id is not None:
+                    update["doc_id_by_path"] = {validated: doc_id}
+                return Command(update=update)
 
-            if res.files_update is not None:
-                return Command(
-                    update={
-                        "files": res.files_update,
-                        "messages": [
-                            ToolMessage(
-                                content=f"Updated file {res.path}",
-                                tool_call_id=runtime.tool_call_id,
-                            )
-                        ],
-                    }
-                )
-            return f"Updated file {res.path}"
+            try:
+                rendered = await backend.aread(validated, offset=offset, limit=limit)
+            except Exception as exc:  # pragma: no cover - defensive
+                return f"Error: {exc}"
+            return rendered
 
-        async def async_write_file(
+        def sync_read_file(
             file_path: Annotated[
                 str,
-                "Absolute path where the file should be created. Must be absolute, not relative.",
+                "Absolute path to the file to read. Relative paths resolve against the current cwd.",
             ],
-            content: Annotated[
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            offset: Annotated[
+                int,
+                "Line number to start reading from (0-indexed).",
+            ] = 0,
+            limit: Annotated[
+                int,
+                "Maximum number of lines to read.",
+            ] = 100,
+        ) -> Command | str:
+            return self._run_async_blocking(
+                async_read_file(file_path, runtime, offset, limit)
+            )
+
+        return StructuredTool.from_function(
+            name="read_file",
+            description=tool_description,
+            func=sync_read_file,
+            coroutine=async_read_file,
+        )
+
+    # ------------------------------------------------------------------ tool: write_file
+
+    def _create_write_file_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("write_file")
+            or SURFSENSE_WRITE_FILE_TOOL_DESCRIPTION
+        )
+
+        async def async_write_file(
+            file_path: Annotated[
                 str,
-                "The text content to write to the file. This parameter is required.",
+                "Absolute path where the file should be created. Relative paths resolve against the current cwd.",
             ],
-            runtime: ToolRuntime[None, FilesystemState],
+            content: Annotated[str, "Text content to write to the file."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
         ) -> Command | str:
-            resolved_backend = self._get_backend(runtime)
+            target = self._resolve_write_target_path(file_path, runtime)
             try:
-                validated_path = validate_path(file_path)
+                validated = validate_path(target)
             except ValueError as exc:
                 return f"Error: {exc}"
-            res: WriteResult = await resolved_backend.awrite(validated_path, content)
+
+            namespace_error = self._check_cloud_write_namespace(validated, runtime)
+            if namespace_error:
+                return namespace_error
+
+            backend = self._get_backend(runtime)
+            res: WriteResult = await backend.awrite(validated, content)
             if res.error:
                 return res.error
 
-            if not self._is_kb_document(validated_path):
-                persist_result = await self._persist_new_document(
-                    file_path=validated_path,
-                    content=content,
-                )
-                if isinstance(persist_result, str):
-                    return persist_result
-                if isinstance(persist_result, dict) and persist_result.get("id"):
-                    dispatch_custom_event("document_created", persist_result)
+            path = res.path or validated
+            files_update = res.files_update or {path: create_file_data(content)}
+            update: dict[str, Any] = {
+                "files": files_update,
+                "messages": [
+                    ToolMessage(
+                        content=f"Updated file {path}",
+                        tool_call_id=runtime.tool_call_id,
+                    )
+                ],
+            }
+            if self._is_cloud():
+                update["dirty_paths"] = [path]
+                update["dirty_path_tool_calls"] = {path: runtime.tool_call_id}
+            return Command(update=update)
 
-            if res.files_update is not None:
-                return Command(
-                    update={
-                        "files": res.files_update,
-                        "messages": [
-                            ToolMessage(
-                                content=f"Updated file {res.path}",
-                                tool_call_id=runtime.tool_call_id,
-                            )
-                        ],
-                    }
-                )
-            return f"Updated file {res.path}"
+        def sync_write_file(
+            file_path: Annotated[
+                str,
+                "Absolute path where the file should be created. Relative paths resolve against the current cwd.",
+            ],
+            content: Annotated[str, "Text content to write to the file."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            return self._run_async_blocking(
+                async_write_file(file_path, content, runtime)
+            )
 
         return StructuredTool.from_function(
             name="write_file",
@@ -558,147 +1049,950 @@ async def async_write_file(
             coroutine=async_write_file,
         )
 
-    @staticmethod
-    def _is_kb_document(path: str) -> bool:
-        """Return True for paths under /documents/ (KB-sourced, XML-wrapped)."""
-        return path.startswith("/documents/")
+    # ------------------------------------------------------------------ tool: edit_file
 
     def _create_edit_file_tool(self) -> BaseTool:
-        """Create edit_file with DB persistence (skipped for KB documents)."""
         tool_description = (
             self._custom_tool_descriptions.get("edit_file")
             or SURFSENSE_EDIT_FILE_TOOL_DESCRIPTION
         )
 
-        def sync_edit_file(
+        async def async_edit_file(
             file_path: Annotated[
                 str,
-                "Absolute path to the file to edit. Must be absolute, not relative.",
+                "Absolute path to the file to edit. Relative paths resolve against the current cwd.",
             ],
             old_string: Annotated[
                 str,
-                "The exact text to find and replace. Must be unique in the file unless replace_all is True.",
+                "Exact text to replace. Must be unique unless replace_all is True.",
             ],
             new_string: Annotated[
                 str,
-                "The text to replace old_string with. Must be different from old_string.",
+                "Replacement text. Must differ from old_string.",
             ],
-            runtime: ToolRuntime[None, FilesystemState],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
             *,
             replace_all: Annotated[
                 bool,
-                "If True, replace all occurrences of old_string. If False (default), old_string must be unique.",
+                "If True, replace all occurrences of old_string. Defaults to False.",
             ] = False,
         ) -> Command | str:
-            resolved_backend = self._get_backend(runtime)
+            target = self._resolve_relative(file_path, runtime)
             try:
-                validated_path = validate_path(file_path)
+                validated = validate_path(target)
             except ValueError as exc:
                 return f"Error: {exc}"
-            res: EditResult = resolved_backend.edit(
-                validated_path,
-                old_string,
-                new_string,
-                replace_all=replace_all,
+
+            namespace_error = self._check_cloud_write_namespace(validated, runtime)
+            if namespace_error:
+                return namespace_error
+
+            backend = self._get_backend(runtime)
+            files_state = runtime.state.get("files") or {}
+            doc_id_to_attach: int | None = None
+
+            if (
+                self._is_cloud()
+                and validated not in files_state
+                and isinstance(backend, KBPostgresBackend)
+            ):
+                loaded = await backend._load_file_data(validated)
+                if loaded is None:
+                    return f"Error: File '{validated}' not found"
+                _, doc_id_to_attach = loaded
+
+            res: EditResult = await backend.aedit(
+                validated, old_string, new_string, replace_all=replace_all
             )
             if res.error:
                 return res.error
 
-            if not self._is_kb_document(validated_path):
-                read_result = resolved_backend.read(
-                    validated_path, offset=0, limit=200000
-                )
-                if read_result.error or read_result.file_data is None:
-                    return f"Error: could not reload edited file '{validated_path}' for persistence."
-                updated_content = read_result.file_data["content"]
-                persist_result = self._run_async_blocking(
-                    self._persist_edited_document(
-                        file_path=validated_path,
-                        updated_content=updated_content,
+            path = res.path or validated
+            files_update = res.files_update or {}
+            update: dict[str, Any] = {
+                "files": files_update,
+                "messages": [
+                    ToolMessage(
+                        content=(
+                            f"Successfully replaced {res.occurrences} instance(s) "
+                            f"of the string in '{path}'"
+                        ),
+                        tool_call_id=runtime.tool_call_id,
                     )
+                ],
+            }
+            if self._is_cloud():
+                update["dirty_paths"] = [path]
+                update["dirty_path_tool_calls"] = {path: runtime.tool_call_id}
+                if doc_id_to_attach is not None:
+                    update["doc_id_by_path"] = {path: doc_id_to_attach}
+            return Command(update=update)
+
+        def sync_edit_file(
+            file_path: Annotated[
+                str,
+                "Absolute path to the file to edit. Relative paths resolve against the current cwd.",
+            ],
+            old_string: Annotated[
+                str,
+                "Exact text to replace. Must be unique unless replace_all is True.",
+            ],
+            new_string: Annotated[
+                str,
+                "Replacement text. Must differ from old_string.",
+            ],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            *,
+            replace_all: Annotated[
+                bool,
+                "If True, replace all occurrences of old_string. Defaults to False.",
+            ] = False,
+        ) -> Command | str:
+            return self._run_async_blocking(
+                async_edit_file(
+                    file_path, old_string, new_string, runtime, replace_all=replace_all
                 )
-                if isinstance(persist_result, str):
-                    return persist_result
+            )
 
-            if res.files_update is not None:
+        return StructuredTool.from_function(
+            name="edit_file",
+            description=tool_description,
+            func=sync_edit_file,
+            coroutine=async_edit_file,
+        )
+
+    # ------------------------------------------------------------------ tool: mkdir
+
+    def _create_mkdir_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("mkdir")
+            or SURFSENSE_MKDIR_TOOL_DESCRIPTION
+        )
+
+        async def async_mkdir(
+            path: Annotated[str, "Absolute or relative directory path to create."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            target = self._resolve_relative(path, runtime)
+            try:
+                validated = validate_path(target)
+            except ValueError as exc:
+                return f"Error: {exc}"
+
+            if self._is_cloud():
+                if not (
+                    validated.startswith(DOCUMENTS_ROOT + "/")
+                    or validated == DOCUMENTS_ROOT
+                ):
+                    return (
+                        "Error: cloud mkdir must target a path under /documents/ "
+                        f"(got '{validated}')."
+                    )
                 return Command(
                     update={
-                        "files": res.files_update,
+                        "staged_dirs": [validated],
+                        "staged_dir_tool_calls": {
+                            validated: runtime.tool_call_id,
+                        },
                         "messages": [
                             ToolMessage(
-                                content=f"Successfully replaced {res.occurrences} instance(s) of the string in '{res.path}'",
+                                content=(
+                                    f"Staged directory '{validated}' (will be created "
+                                    "at end of turn)."
+                                ),
                                 tool_call_id=runtime.tool_call_id,
                             )
                         ],
                     }
                 )
-            return f"Successfully replaced {res.occurrences} instance(s) of the string in '{res.path}'"
 
-        async def async_edit_file(
-            file_path: Annotated[
-                str,
-                "Absolute path to the file to edit. Must be absolute, not relative.",
-            ],
-            old_string: Annotated[
-                str,
-                "The exact text to find and replace. Must be unique in the file unless replace_all is True.",
-            ],
-            new_string: Annotated[
-                str,
-                "The text to replace old_string with. Must be different from old_string.",
-            ],
-            runtime: ToolRuntime[None, FilesystemState],
+            backend = self._get_backend(runtime)
+            local_method = getattr(backend, "amkdir", None) or getattr(
+                backend, "mkdir", None
+            )
+            if callable(local_method):
+                try:
+                    res = local_method(validated, parents=True, exist_ok=True)
+                    if asyncio.iscoroutine(res):
+                        await res
+                except TypeError:
+                    res = local_method(validated)
+                    if asyncio.iscoroutine(res):
+                        await res
+                except Exception as exc:  # pragma: no cover
+                    return f"Error: {exc}"
+            return f"Created directory {validated}"
+
+        def sync_mkdir(
+            path: Annotated[str, "Absolute or relative directory path to create."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            return self._run_async_blocking(async_mkdir(path, runtime))
+
+        return StructuredTool.from_function(
+            name="mkdir",
+            description=tool_description,
+            func=sync_mkdir,
+            coroutine=async_mkdir,
+        )
+
+    # ------------------------------------------------------------------ tool: cd
+
+    def _create_cd_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("cd") or SURFSENSE_CD_TOOL_DESCRIPTION
+        )
+
+        async def async_cd(
+            path: Annotated[str, "Absolute or relative directory path to switch into."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            target = self._resolve_relative(path, runtime)
+            try:
+                validated = validate_path(target)
+            except ValueError as exc:
+                return f"Error: {exc}"
+
+            backend = self._get_backend(runtime)
+            try:
+                infos = await backend.als_info(validated)
+            except Exception as exc:  # pragma: no cover - defensive
+                return f"Error: {exc}"
+            staged_dirs = list(runtime.state.get("staged_dirs") or [])
+            files = runtime.state.get("files") or {}
+            cwd_exists = (
+                bool(infos)
+                or validated in staged_dirs
+                or any(p == validated for p in files)
+                or any(
+                    isinstance(p, str) and p.startswith(validated.rstrip("/") + "/")
+                    for p in files
+                )
+                or validated == "/"
+                or validated == DOCUMENTS_ROOT
+            )
+            if not cwd_exists:
+                return f"Error: directory '{validated}' not found."
+            return Command(
+                update={
+                    "cwd": validated,
+                    "messages": [
+                        ToolMessage(
+                            content=f"cwd changed to {validated}",
+                            tool_call_id=runtime.tool_call_id,
+                        )
+                    ],
+                }
+            )
+
+        def sync_cd(
+            path: Annotated[str, "Absolute or relative directory path to switch into."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            return self._run_async_blocking(async_cd(path, runtime))
+
+        return StructuredTool.from_function(
+            name="cd",
+            description=tool_description,
+            func=sync_cd,
+            coroutine=async_cd,
+        )
+
+    # ------------------------------------------------------------------ tool: pwd
+
+    def _create_pwd_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("pwd") or SURFSENSE_PWD_TOOL_DESCRIPTION
+        )
+
+        def sync_pwd(
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> str:
+            return self._current_cwd(runtime)
+
+        async def async_pwd(
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> str:
+            return self._current_cwd(runtime)
+
+        return StructuredTool.from_function(
+            name="pwd",
+            description=tool_description,
+            func=sync_pwd,
+            coroutine=async_pwd,
+        )
+
+    # ------------------------------------------------------------------ tool: move_file
+
+    def _create_move_file_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("move_file")
+            or SURFSENSE_MOVE_FILE_TOOL_DESCRIPTION
+        )
+
+        async def async_move_file(
+            source_path: Annotated[str, "Absolute or relative source path."],
+            destination_path: Annotated[str, "Absolute or relative destination path."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
             *,
-            replace_all: Annotated[
+            overwrite: Annotated[
                 bool,
-                "If True, replace all occurrences of old_string. If False (default), old_string must be unique.",
+                "If True, replace existing destination. Cloud mode rejects True. Defaults to False.",
             ] = False,
         ) -> Command | str:
-            resolved_backend = self._get_backend(runtime)
+            if not source_path.strip() or not destination_path.strip():
+                return "Error: source_path and destination_path are required."
+
+            source = self._resolve_move_target_path(source_path, runtime)
+            dest = self._resolve_move_target_path(destination_path, runtime)
             try:
-                validated_path = validate_path(file_path)
+                validated_source = validate_path(source)
+                validated_dest = validate_path(dest)
             except ValueError as exc:
                 return f"Error: {exc}"
-            res: EditResult = await resolved_backend.aedit(
-                validated_path,
-                old_string,
-                new_string,
-                replace_all=replace_all,
+
+            if self._is_cloud():
+                return await self._cloud_move_file(
+                    runtime,
+                    validated_source,
+                    validated_dest,
+                    overwrite=overwrite,
+                )
+
+            backend = self._get_backend(runtime)
+            res: WriteResult = await backend.amove(
+                validated_source, validated_dest, overwrite=overwrite
             )
             if res.error:
                 return res.error
+            update: dict[str, Any] = {
+                "messages": [
+                    ToolMessage(
+                        content=f"Moved '{validated_source}' to '{res.path or validated_dest}'",
+                        tool_call_id=runtime.tool_call_id,
+                    )
+                ],
+            }
+            if res.files_update is not None:
+                update["files"] = res.files_update
+            return Command(update=update)
 
-            if not self._is_kb_document(validated_path):
-                read_result = await resolved_backend.aread(
-                    validated_path, offset=0, limit=200000
+        def sync_move_file(
+            source_path: Annotated[str, "Absolute or relative source path."],
+            destination_path: Annotated[str, "Absolute or relative destination path."],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            *,
+            overwrite: Annotated[
+                bool,
+                "If True, replace existing destination. Cloud mode rejects True. Defaults to False.",
+            ] = False,
+        ) -> Command | str:
+            return self._run_async_blocking(
+                async_move_file(
+                    source_path, destination_path, runtime, overwrite=overwrite
                 )
-                if read_result.error or read_result.file_data is None:
-                    return f"Error: could not reload edited file '{validated_path}' for persistence."
-                updated_content = read_result.file_data["content"]
-                persist_error = await self._persist_edited_document(
-                    file_path=validated_path,
-                    updated_content=updated_content,
+            )
+
+        return StructuredTool.from_function(
+            name="move_file",
+            description=tool_description,
+            func=sync_move_file,
+            coroutine=async_move_file,
+        )
+
+    async def _cloud_move_file(
+        self,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        source: str,
+        dest: str,
+        *,
+        overwrite: bool,
+    ) -> Command | str:
+        backend = self._get_backend(runtime)
+        if not isinstance(backend, KBPostgresBackend):
+            return "Error: cloud move requires KBPostgresBackend."
+
+        if source == dest:
+            return f"Moved '{source}' to '{dest}' (no-op)"
+        if overwrite:
+            return (
+                "Error: overwrite=True is not supported in cloud mode. Move/edit "
+                "the destination doc explicitly first."
+            )
+        if not source.startswith(DOCUMENTS_ROOT + "/"):
+            return (
+                "Error: cloud move_file source must be under /documents/ (got "
+                f"'{source}')."
+            )
+        if not dest.startswith(DOCUMENTS_ROOT + "/"):
+            return (
+                "Error: cloud move_file destination must be under /documents/ (got "
+                f"'{dest}')."
+            )
+        anon = runtime.state.get("kb_anon_doc") or {}
+        if isinstance(anon, dict):
+            anon_path = str(anon.get("path") or "")
+            if anon_path and (anon_path in (source, dest)):
+                return "Error: the anonymous uploaded document is read-only."
+
+        files = runtime.state.get("files") or {}
+        doc_id_by_path = runtime.state.get("doc_id_by_path") or {}
+        pending_moves = list(runtime.state.get("pending_moves") or [])
+
+        # Dest collision: occupied in state, in pending moves, or in DB.
+        if dest in files:
+            return f"Error: destination '{dest}' already exists."
+        if any(move.get("dest") == dest for move in pending_moves):
+            return f"Error: destination '{dest}' already exists."
+        if dest != source:
+            existing_dest = await backend._load_file_data(dest)
+            if existing_dest is not None:
+                return f"Error: destination '{dest}' already exists."
+
+        # Source materialization: lazy load if not in state.
+        source_file_data = files.get(source)
+        source_doc_id = doc_id_by_path.get(source)
+        if source_file_data is None:
+            loaded = await backend._load_file_data(source)
+            if loaded is None:
+                return f"Error: source '{source}' not found."
+            source_file_data, loaded_doc_id = loaded
+            if source_doc_id is None:
+                source_doc_id = loaded_doc_id
+
+        files_update: dict[str, Any] = {source: None, dest: source_file_data}
+        update: dict[str, Any] = {
+            "files": files_update,
+            "pending_moves": [
+                {
+                    "source": source,
+                    "dest": dest,
+                    "overwrite": False,
+                    "tool_call_id": runtime.tool_call_id,
+                }
+            ],
+            "messages": [
+                ToolMessage(
+                    content=(
+                        f"Moved '{source}' to '{dest}' (will commit at end of turn)."
+                    ),
+                    tool_call_id=runtime.tool_call_id,
                 )
-                if persist_error:
-                    return persist_error
+            ],
+        }
+
+        doc_id_update: dict[str, int | None] = {source: None}
+        if source_doc_id is not None:
+            doc_id_update[dest] = source_doc_id
+        update["doc_id_by_path"] = doc_id_update
+
+        dirty_paths = list(runtime.state.get("dirty_paths") or [])
+        if source in dirty_paths:
+            new_dirty: list[Any] = [_CLEAR]
+            for entry in dirty_paths:
+                new_dirty.append(dest if entry == source else entry)
+            update["dirty_paths"] = new_dirty
+        return Command(update=update)
+
+    # ------------------------------------------------------------------ tool: rm
+
+    def _create_rm_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("rm") or _CLOUD_RM_TOOL_DESCRIPTION
+        )
+
+        async def async_rm(
+            path: Annotated[
+                str,
+                "Absolute or relative path to the file to delete.",
+            ],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            if not path or not path.strip():
+                return "Error: path is required."
+
+            target = self._resolve_relative(path, runtime)
+            try:
+                validated = validate_path(target)
+            except ValueError as exc:
+                return f"Error: {exc}"
+
+            if self._is_cloud():
+                if validated in ("/", DOCUMENTS_ROOT):
+                    return f"Error: refusing to rm '{validated}'."
+                if not validated.startswith(DOCUMENTS_ROOT + "/"):
+                    return (
+                        "Error: cloud rm must target a path under /documents/ "
+                        f"(got '{validated}')."
+                    )
+
+                anon = runtime.state.get("kb_anon_doc") or {}
+                if isinstance(anon, dict) and str(anon.get("path") or "") == validated:
+                    return "Error: the anonymous uploaded document is read-only."
+
+                # Refuse if the path looks like a directory.
+                staged_dirs = list(runtime.state.get("staged_dirs") or [])
+                if validated in staged_dirs:
+                    return (
+                        f"Error: '{validated}' is a directory. Use rmdir for "
+                        "empty directories."
+                    )
+                pending_dir_deletes = list(
+                    runtime.state.get("pending_dir_deletes") or []
+                )
+                if any(
+                    isinstance(d, dict) and d.get("path") == validated
+                    for d in pending_dir_deletes
+                ):
+                    return f"Error: '{validated}' is already queued for rmdir."
+
+                backend = self._get_backend(runtime)
+                if isinstance(backend, KBPostgresBackend):
+                    # Detect "is a directory" via `ls`: if the path lists
+                    # children we know it's a folder. Otherwise we still
+                    # need to confirm it's a real file before staging.
+                    children = await backend.als_info(validated)
+                    if children:
+                        return (
+                            f"Error: '{validated}' is a directory. Use rmdir for "
+                            "empty directories."
+                        )
+
+                # Already queued for delete this turn?
+                pending_deletes = list(runtime.state.get("pending_deletes") or [])
+                if any(
+                    isinstance(d, dict) and d.get("path") == validated
+                    for d in pending_deletes
+                ):
+                    return f"'{validated}' is already queued for deletion."
+
+                # Resolve doc_id (best-effort): file in state or DB.
+                files_state = runtime.state.get("files") or {}
+                doc_id_by_path = runtime.state.get("doc_id_by_path") or {}
+                resolved_doc_id: int | None = doc_id_by_path.get(validated)
+                if (
+                    validated not in files_state
+                    and resolved_doc_id is None
+                    and isinstance(backend, KBPostgresBackend)
+                ):
+                    loaded = await backend._load_file_data(validated)
+                    if loaded is None:
+                        return f"Error: file '{validated}' not found."
+                    _, resolved_doc_id = loaded
+
+                files_update: dict[str, Any] = {validated: None}
+                update: dict[str, Any] = {
+                    "pending_deletes": [
+                        {
+                            "path": validated,
+                            "tool_call_id": runtime.tool_call_id,
+                        }
+                    ],
+                    "files": files_update,
+                    "doc_id_by_path": {validated: None},
+                    "messages": [
+                        ToolMessage(
+                            content=(
+                                f"Staged delete of '{validated}' (will commit at "
+                                "end of turn)."
+                            ),
+                            tool_call_id=runtime.tool_call_id,
+                        )
+                    ],
+                }
+
+                # Drop the path from dirty_paths so a same-turn write+rm
+                # doesn't recreate the doc at commit time.
+                dirty_paths = list(runtime.state.get("dirty_paths") or [])
+                if validated in dirty_paths:
+                    new_dirty: list[Any] = [_CLEAR]
+                    for entry in dirty_paths:
+                        if entry != validated:
+                            new_dirty.append(entry)
+                    update["dirty_paths"] = new_dirty
+                    update["dirty_path_tool_calls"] = {validated: None}
+
+                return Command(update=update)
+
+            # Desktop mode — hit disk immediately.
+            backend = self._get_backend(runtime)
+            adelete = getattr(backend, "adelete_file", None)
+            if not callable(adelete):
+                return "Error: rm is not supported by the active backend."
+            res: WriteResult = await adelete(validated)
+            if res.error:
+                return res.error
+            update_desktop: dict[str, Any] = {
+                "files": {validated: None},
+                "messages": [
+                    ToolMessage(
+                        content=f"Deleted file '{res.path or validated}'",
+                        tool_call_id=runtime.tool_call_id,
+                    )
+                ],
+            }
+            return Command(update=update_desktop)
+
+        def sync_rm(
+            path: Annotated[
+                str,
+                "Absolute or relative path to the file to delete.",
+            ],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            return self._run_async_blocking(async_rm(path, runtime))
+
+        return StructuredTool.from_function(
+            name="rm",
+            description=tool_description,
+            func=sync_rm,
+            coroutine=async_rm,
+        )
+
+    # ------------------------------------------------------------------ tool: rmdir
+
+    def _create_rmdir_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("rmdir") or _CLOUD_RMDIR_TOOL_DESCRIPTION
+        )
+
+        async def async_rmdir(
+            path: Annotated[
+                str,
+                "Absolute or relative path of the empty directory to delete.",
+            ],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            if not path or not path.strip():
+                return "Error: path is required."
+
+            target = self._resolve_relative(path, runtime)
+            try:
+                validated = validate_path(target)
+            except ValueError as exc:
+                return f"Error: {exc}"
+
+            if self._is_cloud():
+                if validated in ("/", DOCUMENTS_ROOT):
+                    return f"Error: refusing to rmdir '{validated}'."
+                if not validated.startswith(DOCUMENTS_ROOT + "/"):
+                    return (
+                        "Error: cloud rmdir must target a path under /documents/ "
+                        f"(got '{validated}')."
+                    )
+
+                cwd = self._current_cwd(runtime)
+                if validated == cwd or _is_ancestor_of(validated, cwd):
+                    return (
+                        f"Error: cannot rmdir '{validated}' because the current "
+                        "cwd is at or under it. cd out first."
+                    )
+
+                staged_dirs = list(runtime.state.get("staged_dirs") or [])
+                pending_dir_deletes = list(
+                    runtime.state.get("pending_dir_deletes") or []
+                )
+                if any(
+                    isinstance(d, dict) and d.get("path") == validated
+                    for d in pending_dir_deletes
+                ):
+                    return f"'{validated}' is already queued for deletion."
+
+                backend = self._get_backend(runtime)
+
+                # The path must currently exist either in DB folder paths or
+                # in staged_dirs. We rely on KBPostgresBackend.als_info (which
+                # already accounts for pending deletes/moves) to evaluate
+                # both existence and emptiness against the post-staged view.
+                exists_in_staged = validated in staged_dirs
+                children: list[Any] = []
+                if isinstance(backend, KBPostgresBackend):
+                    children = list(await backend.als_info(validated))
+
+                # Detect "is a file" — if als_info returns no children but
+                # the path is actually a file, we should reject. We use
+                # _load_file_data to disambiguate file vs missing folder.
+                if (
+                    isinstance(backend, KBPostgresBackend)
+                    and not children
+                    and not exists_in_staged
+                ):
+                    loaded = await backend._load_file_data(validated)
+                    if loaded is not None:
+                        return (
+                            f"Error: '{validated}' is a file. Use rm to delete files."
+                        )
+                    # Confirm folder exists in DB by checking the parent listing.
+                    parent = posixpath.dirname(validated) or "/"
+                    parent_listing = await backend.als_info(parent)
+                    parent_has_dir = any(
+                        info.get("path") == validated and info.get("is_dir")
+                        for info in parent_listing
+                    )
+                    if not parent_has_dir:
+                        return f"Error: directory '{validated}' not found."
+
+                if children:
+                    return (
+                        f"Error: directory '{validated}' is not empty. "
+                        "Remove contents first."
+                    )
+
+                # Same-turn mkdir un-stage: drop the staged_dirs entry
+                # entirely and skip queuing a DB delete (nothing was ever
+                # committed).
+                if exists_in_staged:
+                    rest = [d for d in staged_dirs if d != validated]
+                    return Command(
+                        update={
+                            "staged_dirs": [_CLEAR, *rest],
+                            "staged_dir_tool_calls": {validated: None},
+                            "messages": [
+                                ToolMessage(
+                                    content=(f"Un-staged directory '{validated}'."),
+                                    tool_call_id=runtime.tool_call_id,
+                                )
+                            ],
+                        }
+                    )
 
-            if res.files_update is not None:
                 return Command(
                     update={
-                        "files": res.files_update,
+                        "pending_dir_deletes": [
+                            {
+                                "path": validated,
+                                "tool_call_id": runtime.tool_call_id,
+                            }
+                        ],
                         "messages": [
                             ToolMessage(
-                                content=f"Successfully replaced {res.occurrences} instance(s) of the string in '{res.path}'",
+                                content=(
+                                    f"Staged rmdir of '{validated}' (will commit "
+                                    "at end of turn)."
+                                ),
                                 tool_call_id=runtime.tool_call_id,
                             )
                         ],
                     }
                 )
-            return f"Successfully replaced {res.occurrences} instance(s) of the string in '{res.path}'"
+
+            # Desktop mode — hit disk immediately.
+            backend = self._get_backend(runtime)
+            armdir = getattr(backend, "armdir", None)
+            if not callable(armdir):
+                return "Error: rmdir is not supported by the active backend."
+            res: WriteResult = await armdir(validated)
+            if res.error:
+                return res.error
+            return Command(
+                update={
+                    "messages": [
+                        ToolMessage(
+                            content=f"Deleted directory '{res.path or validated}'",
+                            tool_call_id=runtime.tool_call_id,
+                        )
+                    ],
+                }
+            )
+
+        def sync_rmdir(
+            path: Annotated[
+                str,
+                "Absolute or relative path of the empty directory to delete.",
+            ],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        ) -> Command | str:
+            return self._run_async_blocking(async_rmdir(path, runtime))
 
         return StructuredTool.from_function(
-            name="edit_file",
+            name="rmdir",
             description=tool_description,
-            func=sync_edit_file,
-            coroutine=async_edit_file,
+            func=sync_rmdir,
+            coroutine=async_rmdir,
+        )
+
+    # ------------------------------------------------------------------ tool: list_tree
+
+    def _create_list_tree_tool(self) -> BaseTool:
+        tool_description = (
+            self._custom_tool_descriptions.get("list_tree")
+            or SURFSENSE_LIST_TREE_TOOL_DESCRIPTION
+        )
+
+        async def async_list_tree(
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            path: Annotated[
+                str,
+                "Absolute path to start from. Defaults to /documents in cloud mode.",
+            ] = "",
+            max_depth: Annotated[int, "Recursion depth limit. Default 8."] = 8,
+            page_size: Annotated[int, "Maximum entries returned. Max 1000."] = 500,
+            include_files: Annotated[bool, "Include file entries."] = True,
+            include_dirs: Annotated[bool, "Include directory entries."] = True,
+        ) -> str:
+            if max_depth < 0:
+                return "Error: max_depth must be >= 0."
+            if page_size < 1:
+                return "Error: page_size must be >= 1."
+            if not include_files and not include_dirs:
+                return "Error: include_files and include_dirs cannot both be false."
+
+            target = self._resolve_list_target_path(path, runtime)
+            try:
+                validated = validate_path(target)
+            except ValueError as exc:
+                return f"Error: {exc}"
+
+            backend = self._get_backend(runtime)
+            if isinstance(backend, KBPostgresBackend):
+                result = await backend.alist_tree_listing(
+                    validated,
+                    max_depth=max_depth,
+                    page_size=page_size,
+                    include_files=include_files,
+                    include_dirs=include_dirs,
+                )
+            elif hasattr(backend, "alist_tree"):
+                result = await backend.alist_tree(
+                    validated,
+                    max_depth=max_depth,
+                    page_size=page_size,
+                    include_files=include_files,
+                    include_dirs=include_dirs,
+                )
+            else:
+                return "Error: list_tree is not supported by the active backend."
+
+            if isinstance(result, dict) and isinstance(result.get("error"), str):
+                return result["error"]
+            return json.dumps(result, ensure_ascii=True)
+
+        def sync_list_tree(
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            path: Annotated[
+                str,
+                "Absolute path to start from. Defaults to /documents in cloud mode.",
+            ] = "",
+            max_depth: Annotated[int, "Recursion depth limit. Default 8."] = 8,
+            page_size: Annotated[int, "Maximum entries returned. Max 1000."] = 500,
+            include_files: Annotated[bool, "Include file entries."] = True,
+            include_dirs: Annotated[bool, "Include directory entries."] = True,
+        ) -> str:
+            return self._run_async_blocking(
+                async_list_tree(
+                    runtime,
+                    path=path,
+                    max_depth=max_depth,
+                    page_size=page_size,
+                    include_files=include_files,
+                    include_dirs=include_dirs,
+                )
+            )
+
+        return StructuredTool.from_function(
+            name="list_tree",
+            description=tool_description,
+            func=sync_list_tree,
+            coroutine=async_list_tree,
+        )
+
+    # ------------------------------------------------------------------ tool: execute_code (sandbox)
+
+    def _create_execute_code_tool(self) -> BaseTool:
+        def sync_execute_code(
+            command: Annotated[
+                str, "Python code to execute. Use print() to see output."
+            ],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            timeout: Annotated[
+                int | None,
+                "Optional timeout in seconds.",
+            ] = None,
+        ) -> str:
+            if timeout is not None:
+                if timeout < 0:
+                    return f"Error: timeout must be non-negative, got {timeout}."
+                if timeout > self._MAX_EXECUTE_TIMEOUT:
+                    return f"Error: timeout {timeout}s exceeds maximum ({self._MAX_EXECUTE_TIMEOUT}s)."
+            return self._run_async_blocking(
+                self._execute_in_sandbox(command, runtime, timeout)
+            )
+
+        async def async_execute_code(
+            command: Annotated[
+                str, "Python code to execute. Use print() to see output."
+            ],
+            runtime: ToolRuntime[None, SurfSenseFilesystemState],
+            timeout: Annotated[
+                int | None,
+                "Optional timeout in seconds.",
+            ] = None,
+        ) -> str:
+            if timeout is not None:
+                if timeout < 0:
+                    return f"Error: timeout must be non-negative, got {timeout}."
+                if timeout > self._MAX_EXECUTE_TIMEOUT:
+                    return f"Error: timeout {timeout}s exceeds maximum ({self._MAX_EXECUTE_TIMEOUT}s)."
+            return await self._execute_in_sandbox(command, runtime, timeout)
+
+        return StructuredTool.from_function(
+            name="execute_code",
+            description=SURFSENSE_EXECUTE_CODE_TOOL_DESCRIPTION,
+            func=sync_execute_code,
+            coroutine=async_execute_code,
         )
+
+    @staticmethod
+    def _wrap_as_python(code: str) -> str:
+        sentinel = f"_PYEOF_{secrets.token_hex(8)}"
+        return f"python3 << '{sentinel}'\n{code}\n{sentinel}"
+
+    async def _execute_in_sandbox(
+        self,
+        command: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        timeout: int | None,
+    ) -> str:
+        assert self._thread_id is not None
+        command = self._wrap_as_python(command)
+        try:
+            return await self._try_sandbox_execute(command, runtime, timeout)
+        except (DaytonaError, Exception) as first_err:
+            logger.warning(
+                "Sandbox execute failed for thread %s, retrying: %s",
+                self._thread_id,
+                first_err,
+            )
+            try:
+                await delete_sandbox(self._thread_id)
+            except Exception:
+                _evict_sandbox_cache(self._thread_id)
+            try:
+                return await self._try_sandbox_execute(command, runtime, timeout)
+            except Exception:
+                logger.exception(
+                    "Sandbox retry also failed for thread %s", self._thread_id
+                )
+                return "Error: Code execution is temporarily unavailable. Please try again."
+
+    async def _try_sandbox_execute(
+        self,
+        command: str,
+        runtime: ToolRuntime[None, SurfSenseFilesystemState],
+        timeout: int | None,
+    ) -> str:
+        sandbox, _is_new = await get_or_create_sandbox(self._thread_id)
+        result = await sandbox.aexecute(command, timeout=timeout)
+        output = (result.output or "").strip()
+        if not output and result.exit_code == 0:
+            return (
+                "[Code executed successfully but produced no output. "
+                "Use print() to display results, then try again.]"
+            )
+        parts = [result.output]
+        if result.exit_code is not None:
+            status = "succeeded" if result.exit_code == 0 else "failed"
+            parts.append(f"\n[Command {status} with exit code {result.exit_code}]")
+        if result.truncated:
+            parts.append("\n[Output was truncated due to size limits]")
+        return "".join(parts)
diff --git a/surfsense_backend/app/agents/new_chat/middleware/flatten_system.py b/surfsense_backend/app/agents/new_chat/middleware/flatten_system.py
new file mode 100644
index 0000000000..29cd57aa0f
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/flatten_system.py
@@ -0,0 +1,233 @@
+r"""Coalesce multi-block system messages into a single text block.
+
+Several middlewares in our deepagent stack each call
+``append_to_system_message`` on the way down to the model
+(``TodoListMiddleware``, ``SurfSenseFilesystemMiddleware``,
+``SkillsMiddleware``, ``SubAgentMiddleware`` …). By the time the
+request reaches the LLM, the system message has 5+ separate text blocks.
+
+Anthropic enforces a hard cap of **4 ``cache_control`` blocks per
+request**, and we configure 2 injection points
+(``index: 0`` + ``index: -1``). With ``index: 0`` always targeting
+the prepended ``request.system_message``, this middleware is the
+defensive partner: it guarantees that "the system block" is *one*
+content block, so LiteLLM's ``AnthropicCacheControlHook`` and any
+OpenRouter→Anthropic transformer can never multiply our budget into
+several breakpoints by spreading ``cache_control`` across multiple
+text blocks of a multi-block system content.
+
+Without flattening we used to see::
+
+    OpenrouterException - {"error":{"message":"Provider returned error",
+    "code":400,"metadata":{"raw":"...A maximum of 4 blocks with
+    cache_control may be provided. Found 5."}}}
+
+(Same error class documented in
+https://github.com/BerriAI/litellm/issues/15696 and
+https://github.com/BerriAI/litellm/issues/20485 — the litellm-side fix
+in PR #15395 covers the litellm transformer but does not protect us
+when the OpenRouter SaaS itself does the redistribution.)
+
+A separate fix in :mod:`app.agents.new_chat.prompt_caching` (switching
+the first injection point from ``role: system`` to ``index: 0``)
+neutralises the *primary* cause of the same 400 — multiple
+``SystemMessage``\ s injected by ``before_agent`` middlewares
+(priority/tree/memory/file-intent/anonymous-doc) accumulating across
+turns, each tagged with ``cache_control`` by the ``role: system``
+matcher. This middleware remains useful as defence-in-depth against
+the multi-block redistribution path.
+
+Placement: innermost on the system-message-mutation chain, after every
+appender (``todo``/``filesystem``/``skills``/``subagents``) and after
+summarization, but before ``noop``/``retry``/``fallback`` so each retry
+attempt sees a flattened payload. See ``chat_deepagent.py``.
+
+Idempotent: a string-content system message is left untouched. A list
+that contains anything other than plain text blocks (e.g. an image) is
+also left untouched — those are rare on system messages and we'd lose
+the non-text payload by joining.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Awaitable, Callable
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+    ModelRequest,
+    ModelResponse,
+    ResponseT,
+)
+from langchain_core.messages import SystemMessage
+
+logger = logging.getLogger(__name__)
+
+
+def _flatten_text_blocks(content: list[Any]) -> str | None:
+    """Return joined text if every block is a plain ``{"type": "text"}``.
+
+    Returns ``None`` when the list contains anything that isn't a text
+    block we can safely concatenate (image, audio, file, non-standard
+    blocks, dicts with extra non-cache_control fields). The caller
+    leaves the original content untouched in that case rather than
+    silently dropping payload.
+
+    ``cache_control`` on individual blocks is intentionally discarded —
+    the whole point of flattening is to let LiteLLM's
+    ``cache_control_injection_points`` re-place a single breakpoint on
+    the resulting one-block system content.
+    """
+    chunks: list[str] = []
+    for block in content:
+        if isinstance(block, str):
+            chunks.append(block)
+            continue
+        if not isinstance(block, dict):
+            return None
+        if block.get("type") != "text":
+            return None
+        text = block.get("text")
+        if not isinstance(text, str):
+            return None
+        chunks.append(text)
+    return "\n\n".join(chunks)
+
+
+def _flattened_request(
+    request: ModelRequest[ContextT],
+) -> ModelRequest[ContextT] | None:
+    """Return a request with system_message flattened, or ``None`` for no-op."""
+    sys_msg = request.system_message
+    if sys_msg is None:
+        return None
+    content = sys_msg.content
+    if not isinstance(content, list) or len(content) <= 1:
+        return None
+
+    flattened = _flatten_text_blocks(content)
+    if flattened is None:
+        return None
+
+    new_sys = SystemMessage(
+        content=flattened,
+        additional_kwargs=dict(sys_msg.additional_kwargs),
+        response_metadata=dict(sys_msg.response_metadata),
+    )
+    if sys_msg.id is not None:
+        new_sys.id = sys_msg.id
+    return request.override(system_message=new_sys)
+
+
+def _diagnostic_summary(request: ModelRequest[Any]) -> str:
+    """One-line dump of cache_control-relevant request shape.
+
+    Temporary diagnostic to prove where the ``Found N`` cache_control
+    breakpoints are coming from when Anthropic 400s. Removed once the
+    root cause is confirmed and a fix is in place.
+    """
+    sys_msg = request.system_message
+    if sys_msg is None:
+        sys_shape = "none"
+    elif isinstance(sys_msg.content, str):
+        sys_shape = f"str(len={len(sys_msg.content)})"
+    elif isinstance(sys_msg.content, list):
+        sys_shape = f"list(blocks={len(sys_msg.content)})"
+    else:
+        sys_shape = f"other({type(sys_msg.content).__name__})"
+
+    role_hist: list[str] = []
+    multi_block_msgs = 0
+    msgs_with_cc = 0
+    sys_msgs_in_history = 0
+    for m in request.messages:
+        mtype = getattr(m, "type", type(m).__name__)
+        role_hist.append(mtype)
+        if isinstance(m, SystemMessage):
+            sys_msgs_in_history += 1
+        c = getattr(m, "content", None)
+        if isinstance(c, list):
+            multi_block_msgs += 1
+            for blk in c:
+                if isinstance(blk, dict) and "cache_control" in blk:
+                    msgs_with_cc += 1
+                    break
+        if "cache_control" in getattr(m, "additional_kwargs", {}) or {}:
+            msgs_with_cc += 1
+
+    tools = request.tools or []
+    tools_with_cc = 0
+    for t in tools:
+        if isinstance(t, dict) and (
+            "cache_control" in t or "cache_control" in t.get("function", {})
+        ):
+            tools_with_cc += 1
+
+    return (
+        f"sys={sys_shape} msgs={len(request.messages)} "
+        f"sys_msgs_in_history={sys_msgs_in_history} "
+        f"multi_block_msgs={multi_block_msgs} pre_existing_msg_cc={msgs_with_cc} "
+        f"tools={len(tools)} pre_existing_tool_cc={tools_with_cc} "
+        f"roles={role_hist[-8:]}"
+    )
+
+
+class FlattenSystemMessageMiddleware(
+    AgentMiddleware[AgentState[ResponseT], ContextT, ResponseT]
+):
+    """Collapse a multi-text-block system message to a single string.
+
+    Sits innermost on the system-message-mutation chain so it observes
+    every middleware's contribution. Has no other side effect — the
+    body of every block is preserved, just joined with ``"\\n\\n"``.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.tools = []
+
+    def wrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[ContextT],
+        handler: Callable[[ModelRequest[ContextT]], ModelResponse[ResponseT]],
+    ) -> Any:
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug("[flatten_system_diag] %s", _diagnostic_summary(request))
+        flattened = _flattened_request(request)
+        if flattened is not None:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug(
+                    "[flatten_system] collapsed %d system blocks to one",
+                    len(request.system_message.content),  # type: ignore[arg-type, union-attr]
+                )
+            return handler(flattened)
+        return handler(request)
+
+    async def awrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[ContextT],
+        handler: Callable[
+            [ModelRequest[ContextT]], Awaitable[ModelResponse[ResponseT]]
+        ],
+    ) -> Any:
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug("[flatten_system_diag] %s", _diagnostic_summary(request))
+        flattened = _flattened_request(request)
+        if flattened is not None:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug(
+                    "[flatten_system] collapsed %d system blocks to one",
+                    len(request.system_message.content),  # type: ignore[arg-type, union-attr]
+                )
+            return await handler(flattened)
+        return await handler(request)
+
+
+__all__ = [
+    "FlattenSystemMessageMiddleware",
+    "_flatten_text_blocks",
+    "_flattened_request",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py b/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py
new file mode 100644
index 0000000000..d577441ddc
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py
@@ -0,0 +1,1469 @@
+"""End-of-turn persistence for the cloud-mode SurfSense filesystem.
+
+This middleware runs ``aafter_agent`` once per turn (cloud only). It commits
+all staged folder creations, file moves, content writes/edits, file deletes
+(``rm``), and directory deletes (``rmdir``) to Postgres in a single ordered
+pass:
+
+1. Materialize ``staged_dirs`` into ``Folder`` rows.
+2. Apply ``pending_moves`` in order (chained moves resolved via
+   ``doc_id_by_path``).
+3. Normalize ``dirty_paths`` through ``pending_moves`` so write-then-move
+   sequences commit at the final path. Paths queued for ``rm`` this turn
+   are dropped here so a write+rm sequence doesn't recreate the doc.
+4. Commit content writes / edits for ``/documents/*`` paths, skipping
+   ``temp_*`` basenames.
+5. Apply ``pending_deletes`` (``rm``) — file deletes run BEFORE directory
+   deletes so a same-turn ``rm /a/x.md`` + ``rmdir /a`` sequence works.
+6. Apply ``pending_dir_deletes`` (``rmdir``); re-verifies emptiness against
+   the post-step-5 DB state.
+
+When ``flags.enable_action_log`` is on every destructive op also writes a
+``DocumentRevision`` / ``FolderRevision`` snapshot bound to the
+originating ``AgentActionLog`` row via ``tool_call_id``. ``rm``/``rmdir``
+share a single ``SAVEPOINT`` with their snapshot — if the snapshot fails
+the DELETE rolls back and we surface the error rather than silently
+making the data irreversible.
+
+The commit body is exposed as a free function ``commit_staged_filesystem_state``
+so the optional stream-task fallback (``stream_new_chat.py``) can call the
+exact same routine when ``aafter_agent`` was skipped (e.g. client disconnect).
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+from typing import Any
+
+from fractional_indexing import generate_key_between
+from langchain.agents.middleware import AgentMiddleware, AgentState
+from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event
+from langgraph.runtime import Runtime
+from sqlalchemy import delete, select, update
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.feature_flags import get_flags
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    parse_documents_path,
+    safe_folder_segment,
+    virtual_path_to_doc,
+)
+from app.agents.new_chat.state_reducers import _CLEAR
+from app.db import (
+    AgentActionLog,
+    Chunk,
+    Document,
+    DocumentRevision,
+    DocumentType,
+    Folder,
+    FolderRevision,
+    shielded_async_session,
+)
+from app.indexing_pipeline.document_chunker import chunk_text
+from app.utils.document_converters import (
+    embed_texts,
+    generate_content_hash,
+    generate_unique_identifier_hash,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_TEMP_PREFIX = "temp_"
+
+
+def _basename(path: str) -> str:
+    return path.rsplit("/", 1)[-1]
+
+
+# ---------------------------------------------------------------------------
+# Folder helpers
+# ---------------------------------------------------------------------------
+
+
+async def _ensure_folder_hierarchy(
+    session: AsyncSession,
+    *,
+    search_space_id: int,
+    created_by_id: str | None,
+    folder_parts: list[str],
+) -> int | None:
+    """Ensure a chain of folder names exists under the search space.
+
+    Returns the leaf folder id, or ``None`` if ``folder_parts`` is empty
+    (i.e. a document directly under ``/documents/``).
+    """
+    if not folder_parts:
+        return None
+    parent_id: int | None = None
+    for raw in folder_parts:
+        name = safe_folder_segment(str(raw))
+        query = select(Folder).where(
+            Folder.search_space_id == search_space_id,
+            Folder.name == name,
+        )
+        if parent_id is None:
+            query = query.where(Folder.parent_id.is_(None))
+        else:
+            query = query.where(Folder.parent_id == parent_id)
+        result = await session.execute(query)
+        folder = result.scalar_one_or_none()
+        if folder is None:
+            sibling_query = (
+                select(Folder.position).order_by(Folder.position.desc()).limit(1)
+            )
+            sibling_query = sibling_query.where(
+                Folder.search_space_id == search_space_id
+            )
+            if parent_id is None:
+                sibling_query = sibling_query.where(Folder.parent_id.is_(None))
+            else:
+                sibling_query = sibling_query.where(Folder.parent_id == parent_id)
+            sibling_result = await session.execute(sibling_query)
+            last_position = sibling_result.scalar_one_or_none()
+            folder = Folder(
+                name=name,
+                position=generate_key_between(last_position, None),
+                parent_id=parent_id,
+                search_space_id=search_space_id,
+                created_by_id=created_by_id,
+                updated_at=datetime.now(UTC),
+            )
+            session.add(folder)
+            await session.flush()
+        parent_id = folder.id
+    return parent_id
+
+
+async def _resolve_folder_id(
+    session: AsyncSession,
+    *,
+    search_space_id: int,
+    folder_parts: list[str],
+) -> int | None:
+    """Look up an existing folder chain without creating anything.
+
+    Returns ``None`` if any segment is missing. Used by ``rmdir`` snapshot
+    capture and by parent-folder lookup at ``rmdir`` commit time.
+    """
+    if not folder_parts:
+        return None
+    parent_id: int | None = None
+    for raw in folder_parts:
+        name = safe_folder_segment(str(raw))
+        query = select(Folder).where(
+            Folder.search_space_id == search_space_id,
+            Folder.name == name,
+        )
+        query = (
+            query.where(Folder.parent_id.is_(None))
+            if parent_id is None
+            else query.where(Folder.parent_id == parent_id)
+        )
+        result = await session.execute(query)
+        folder = result.scalar_one_or_none()
+        if folder is None:
+            return None
+        parent_id = folder.id
+    return parent_id
+
+
+def _split_folder_path(folder_path: str) -> list[str]:
+    """Return the folder segments under ``/documents/`` for a path."""
+    if not folder_path.startswith(DOCUMENTS_ROOT):
+        return []
+    rel = folder_path[len(DOCUMENTS_ROOT) :].strip("/")
+    return [p for p in rel.split("/") if p]
+
+
+# ---------------------------------------------------------------------------
+# Document helpers
+# ---------------------------------------------------------------------------
+
+
+async def _create_document(
+    session: AsyncSession,
+    *,
+    virtual_path: str,
+    content: str,
+    search_space_id: int,
+    created_by_id: str | None,
+) -> Document:
+    """Create a NOTE Document + Chunks for ``virtual_path``."""
+    folder_parts, title = parse_documents_path(virtual_path)
+    if not title:
+        raise ValueError(f"invalid /documents path '{virtual_path}'")
+    folder_id = await _ensure_folder_hierarchy(
+        session,
+        search_space_id=search_space_id,
+        created_by_id=created_by_id,
+        folder_parts=folder_parts,
+    )
+    unique_identifier_hash = generate_unique_identifier_hash(
+        DocumentType.NOTE,
+        virtual_path,
+        search_space_id,
+    )
+    # Filesystem-parity invariant: the only thing that *must* be unique is
+    # the path. Two notes can legitimately share content (e.g. ``cp a b``).
+    # Guard against the path-derived ``unique_identifier_hash`` constraint
+    # so we surface a clean ValueError instead of letting the INSERT poison
+    # the session with an IntegrityError.
+    path_collision = await session.execute(
+        select(Document.id).where(
+            Document.search_space_id == search_space_id,
+            Document.unique_identifier_hash == unique_identifier_hash,
+        )
+    )
+    if path_collision.scalar_one_or_none() is not None:
+        raise ValueError(
+            f"a document already exists at path '{virtual_path}' "
+            "(unique_identifier_hash collision)"
+        )
+    # ``content_hash`` is intentionally NOT checked for uniqueness here.
+    # In a real filesystem two files at different paths can hold identical
+    # bytes, and the agent's ``write_file`` path needs that semantic to
+    # support copy/duplicate operations. The hash remains useful as a
+    # change-detection hint for connector indexers, which still consult it
+    # via :func:`check_duplicate_document` but do so with a non-unique
+    # lookup (``.first()``).
+    content_hash = generate_content_hash(content, search_space_id)
+    doc = Document(
+        title=title,
+        document_type=DocumentType.NOTE,
+        document_metadata={"virtual_path": virtual_path},
+        content=content,
+        content_hash=content_hash,
+        unique_identifier_hash=unique_identifier_hash,
+        source_markdown=content,
+        search_space_id=search_space_id,
+        folder_id=folder_id,
+        created_by_id=created_by_id,
+        updated_at=datetime.now(UTC),
+    )
+    session.add(doc)
+    await session.flush()
+
+    summary_embedding = embed_texts([content])[0]
+    doc.embedding = summary_embedding
+    chunks = chunk_text(content)
+    if chunks:
+        chunk_embeddings = embed_texts(chunks)
+        session.add_all(
+            [
+                Chunk(document_id=doc.id, content=text, embedding=embedding)
+                for text, embedding in zip(chunks, chunk_embeddings, strict=True)
+            ]
+        )
+    return doc
+
+
+async def _update_document(
+    session: AsyncSession,
+    *,
+    doc_id: int,
+    content: str,
+    virtual_path: str,
+    search_space_id: int,
+) -> Document | None:
+    """Update an existing Document's content + chunks."""
+    result = await session.execute(
+        select(Document).where(
+            Document.id == doc_id,
+            Document.search_space_id == search_space_id,
+        )
+    )
+    document = result.scalar_one_or_none()
+    if document is None:
+        return None
+
+    document.content = content
+    document.source_markdown = content
+    document.content_hash = generate_content_hash(content, search_space_id)
+    document.updated_at = datetime.now(UTC)
+    metadata = dict(document.document_metadata or {})
+    metadata["virtual_path"] = virtual_path
+    document.document_metadata = metadata
+    document.unique_identifier_hash = generate_unique_identifier_hash(
+        DocumentType.NOTE,
+        virtual_path,
+        search_space_id,
+    )
+
+    summary_embedding = embed_texts([content])[0]
+    document.embedding = summary_embedding
+
+    await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
+    chunks = chunk_text(content)
+    if chunks:
+        chunk_embeddings = embed_texts(chunks)
+        session.add_all(
+            [
+                Chunk(document_id=document.id, content=text, embedding=embedding)
+                for text, embedding in zip(chunks, chunk_embeddings, strict=True)
+            ]
+        )
+    return document
+
+
+# ---------------------------------------------------------------------------
+# Move helpers
+# ---------------------------------------------------------------------------
+
+
+async def _apply_move(
+    session: AsyncSession,
+    *,
+    search_space_id: int,
+    created_by_id: str | None,
+    move: dict[str, Any],
+    doc_id_by_path: dict[str, int],
+    doc_id_path_tombstones: dict[str, int | None],
+) -> dict[str, Any] | None:
+    """Apply a single staged move; updates the in-memory mapping for chain resolution."""
+    source = str(move.get("source") or "")
+    dest = str(move.get("dest") or "")
+    if not source or not dest or source == dest:
+        return None
+
+    if not source.startswith(DOCUMENTS_ROOT + "/") or not dest.startswith(
+        DOCUMENTS_ROOT + "/"
+    ):
+        return None
+
+    doc_id: int | None = doc_id_by_path.get(source)
+    document: Document | None = None
+    if doc_id is not None:
+        result = await session.execute(
+            select(Document).where(
+                Document.id == doc_id,
+                Document.search_space_id == search_space_id,
+            )
+        )
+        document = result.scalar_one_or_none()
+    if document is None:
+        document = await virtual_path_to_doc(
+            session,
+            search_space_id=search_space_id,
+            virtual_path=source,
+        )
+    if document is None:
+        logger.info(
+            "kb_persistence: skipping move %s -> %s (source not found)",
+            source,
+            dest,
+        )
+        return None
+
+    folder_parts, new_title = parse_documents_path(dest)
+    if not new_title:
+        return None
+    folder_id = await _ensure_folder_hierarchy(
+        session,
+        search_space_id=search_space_id,
+        created_by_id=created_by_id,
+        folder_parts=folder_parts,
+    )
+
+    document.title = new_title
+    document.folder_id = folder_id
+    metadata = dict(document.document_metadata or {})
+    metadata["virtual_path"] = dest
+    document.document_metadata = metadata
+    document.unique_identifier_hash = generate_unique_identifier_hash(
+        DocumentType.NOTE,
+        dest,
+        search_space_id,
+    )
+    document.updated_at = datetime.now(UTC)
+
+    doc_id_by_path.pop(source, None)
+    doc_id_by_path[dest] = document.id
+    doc_id_path_tombstones[source] = None
+    doc_id_path_tombstones[dest] = document.id
+    return {"id": document.id, "source": source, "dest": dest, "title": new_title}
+
+
+# ---------------------------------------------------------------------------
+# Action log binding helpers
+# ---------------------------------------------------------------------------
+
+
+async def _find_action_ids_batch(
+    session: AsyncSession,
+    *,
+    thread_id: int | None,
+    tool_call_ids: set[str],
+) -> dict[str, int]:
+    """Resolve ``tool_call_id -> AgentActionLog.id`` in a single query.
+
+    Returns an empty dict when ``thread_id`` or ``tool_call_ids`` are
+    missing — callers treat that as "no binding available" and write the
+    revision with ``agent_action_id = NULL``.
+    """
+    if thread_id is None or not tool_call_ids:
+        return {}
+    rows = await session.execute(
+        select(AgentActionLog.id, AgentActionLog.tool_call_id).where(
+            AgentActionLog.thread_id == thread_id,
+            AgentActionLog.tool_call_id.in_(list(tool_call_ids)),
+        )
+    )
+    mapping: dict[str, int] = {}
+    for row in rows.all():
+        if row.tool_call_id and row.id:
+            mapping[str(row.tool_call_id)] = int(row.id)
+    return mapping
+
+
+async def _mark_action_reversible(
+    session: AsyncSession,
+    *,
+    action_id: int | None,
+) -> None:
+    """Flip ``agent_action_log.reversible = TRUE`` for ``action_id``.
+
+    Best-effort: caller may invoke from inside a SAVEPOINT and treat
+    failure as a soft demotion (snapshot persists, just no Revert button).
+
+    Callers should also call ``_dispatch_reversibility_update`` (defined
+    below) AFTER the enclosing SAVEPOINT block exits successfully so the
+    chat tool card can light up its Revert button without
+    re-fetching ``GET /threads/.../actions``. Dispatching from inside the
+    SAVEPOINT would risk emitting "reversible=true" for rows whose
+    update gets rolled back if the surrounding destructive op fails.
+    """
+    if action_id is None:
+        return
+    await session.execute(
+        update(AgentActionLog)
+        .where(AgentActionLog.id == action_id)
+        .values(reversible=True)
+    )
+
+
+async def _dispatch_reversibility_update(action_id: int | None) -> None:
+    """Best-effort dispatch of an ``action_log_updated`` custom event.
+
+    Surfaces the post-SAVEPOINT reversibility flip to the SSE layer so
+    the chat tool card can flip its Revert button live. Defensive:
+    failures are logged at debug level and swallowed; the
+    REST endpoint ``GET /threads/.../actions`` is still authoritative.
+
+    .. warning::
+        Inside :func:`commit_staged_filesystem_state` we DEFER all
+        dispatches until the outer ``session.commit()`` succeeds — see
+        the ``deferred_dispatches`` queue in that function. Dispatching
+        from inside a SAVEPOINT block while the outer transaction is
+        still pending would emit ``reversible=true`` for rows whose
+        snapshots get rolled back if the outer commit fails. Direct
+        callers (e.g. the optional stream-task fallback) that own the
+        full session lifetime can still call this helper inline.
+    """
+    if action_id is None:
+        return
+    try:
+        await adispatch_custom_event(
+            "action_log_updated",
+            {"id": int(action_id), "reversible": True},
+        )
+    except Exception:
+        logger.debug(
+            "kb_persistence.aafter_agent failed to dispatch action_log_updated",
+            exc_info=True,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Snapshot helpers
+# ---------------------------------------------------------------------------
+#
+# Best-effort helpers swallow + log so a snapshot failure can never break
+# the destructive op for non-destructive tools (write/edit/move/mkdir).
+# Strict helpers run inside the SAME ``begin_nested()`` SAVEPOINT as the
+# destructive DELETE — failure aborts the savepoint and leaves the doc /
+# folder intact, so revertable ops never become irreversible silently.
+
+
+def _doc_revision_payload(
+    doc: Document,
+    *,
+    chunks_before: list[dict[str, str]] | None = None,
+) -> dict[str, Any]:
+    """Pre-mutation field map for ``DocumentRevision``."""
+    metadata = dict(doc.document_metadata or {})
+    return {
+        "content_before": doc.content,
+        "title_before": doc.title,
+        "folder_id_before": doc.folder_id,
+        "chunks_before": chunks_before,
+        "metadata_before": metadata or None,
+    }
+
+
+async def _load_chunks_for_snapshot(
+    session: AsyncSession, *, doc_id: int
+) -> list[dict[str, str]]:
+    rows = await session.execute(
+        select(Chunk.content).where(Chunk.document_id == doc_id).order_by(Chunk.id)
+    )
+    return [{"content": row.content} for row in rows.all() if row.content is not None]
+
+
+async def _snapshot_document_pre_write(
+    session: AsyncSession,
+    *,
+    doc: Document,
+    action_id: int | None,
+    search_space_id: int,
+    turn_id: str | None = None,
+    deferred_dispatches: list[int] | None = None,
+) -> int | None:
+    """Best-effort snapshot ahead of an in-place ``write_file``/``edit_file``.
+
+    When ``deferred_dispatches`` is provided, on success the action id
+    is APPENDED to it and the SSE dispatch is left to the caller (so it
+    can be flushed only after the outer ``session.commit()`` succeeds).
+    """
+    try:
+        async with session.begin_nested():
+            chunks = await _load_chunks_for_snapshot(session, doc_id=doc.id)
+            payload = _doc_revision_payload(doc, chunks_before=chunks)
+            rev = DocumentRevision(
+                document_id=doc.id,
+                search_space_id=search_space_id,
+                created_by_turn_id=turn_id,
+                agent_action_id=action_id,
+                **payload,
+            )
+            session.add(rev)
+            await session.flush()
+            await _mark_action_reversible(session, action_id=action_id)
+            rev_id = rev.id
+        if deferred_dispatches is None:
+            await _dispatch_reversibility_update(action_id)
+        elif action_id is not None:
+            deferred_dispatches.append(int(action_id))
+        return rev_id
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.warning(
+            "kb_persistence: pre-write snapshot for doc=%s failed: %s",
+            doc.id,
+            exc,
+        )
+        return None
+
+
+async def _snapshot_document_pre_create(
+    session: AsyncSession,
+    *,
+    action_id: int | None,
+    search_space_id: int,
+    turn_id: str | None = None,
+    deferred_dispatches: list[int] | None = None,
+) -> int | None:
+    """Best-effort placeholder revision for a fresh ``write_file`` create.
+
+    ``document_id`` is patched in by the caller after the new doc is
+    flushed and gets an ID; the placeholder lets us bind the action_id
+    even though no parent row exists yet.
+    """
+    try:
+        async with session.begin_nested():
+            rev = DocumentRevision(
+                document_id=None,
+                search_space_id=search_space_id,
+                content_before=None,
+                title_before=None,
+                folder_id_before=None,
+                chunks_before=None,
+                metadata_before=None,
+                created_by_turn_id=turn_id,
+                agent_action_id=action_id,
+            )
+            session.add(rev)
+            await session.flush()
+            await _mark_action_reversible(session, action_id=action_id)
+            rev_id = rev.id
+        if deferred_dispatches is None:
+            await _dispatch_reversibility_update(action_id)
+        elif action_id is not None:
+            deferred_dispatches.append(int(action_id))
+        return rev_id
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.warning("kb_persistence: pre-create snapshot failed: %s", exc)
+        return None
+
+
+async def _snapshot_document_pre_move(
+    session: AsyncSession,
+    *,
+    doc: Document,
+    action_id: int | None,
+    search_space_id: int,
+    turn_id: str | None = None,
+    deferred_dispatches: list[int] | None = None,
+) -> int | None:
+    """Best-effort snapshot ahead of a ``move_file``."""
+    try:
+        async with session.begin_nested():
+            payload = _doc_revision_payload(doc, chunks_before=None)
+            rev = DocumentRevision(
+                document_id=doc.id,
+                search_space_id=search_space_id,
+                created_by_turn_id=turn_id,
+                agent_action_id=action_id,
+                **payload,
+            )
+            session.add(rev)
+            await session.flush()
+            await _mark_action_reversible(session, action_id=action_id)
+            rev_id = rev.id
+        if deferred_dispatches is None:
+            await _dispatch_reversibility_update(action_id)
+        elif action_id is not None:
+            deferred_dispatches.append(int(action_id))
+        return rev_id
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.warning(
+            "kb_persistence: pre-move snapshot for doc=%s failed: %s",
+            doc.id,
+            exc,
+        )
+        return None
+
+
+async def _snapshot_folder_pre_mkdir(
+    session: AsyncSession,
+    *,
+    folder: Folder,
+    action_id: int | None,
+    search_space_id: int,
+    turn_id: str | None = None,
+    deferred_dispatches: list[int] | None = None,
+) -> int | None:
+    """Best-effort placeholder for an ``mkdir`` (revert deletes the folder).
+
+    The "before" state is "did not exist", so all ``*_before`` fields are
+    NULL — revert routes by ``tool_name == "mkdir"`` and DELETEs.
+    """
+    try:
+        async with session.begin_nested():
+            rev = FolderRevision(
+                folder_id=folder.id,
+                search_space_id=search_space_id,
+                name_before=None,
+                parent_id_before=None,
+                position_before=None,
+                created_by_turn_id=turn_id,
+                agent_action_id=action_id,
+            )
+            session.add(rev)
+            await session.flush()
+            await _mark_action_reversible(session, action_id=action_id)
+            rev_id = rev.id
+        if deferred_dispatches is None:
+            await _dispatch_reversibility_update(action_id)
+        elif action_id is not None:
+            deferred_dispatches.append(int(action_id))
+        return rev_id
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.warning(
+            "kb_persistence: pre-mkdir snapshot for folder=%s failed: %s",
+            folder.id,
+            exc,
+        )
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Commit body
+# ---------------------------------------------------------------------------
+
+
+async def commit_staged_filesystem_state(
+    state: dict[str, Any] | AgentState,
+    *,
+    search_space_id: int,
+    created_by_id: str | None,
+    filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
+    thread_id: int | None = None,
+    dispatch_events: bool = True,
+) -> dict[str, Any] | None:
+    """Commit all staged filesystem changes; return the state delta for reducers.
+
+    Shared between :class:`KnowledgeBasePersistenceMiddleware.aafter_agent`
+    and the optional stream-task fallback.
+
+    When ``flags.enable_action_log`` is on every destructive op also writes
+    a ``DocumentRevision`` / ``FolderRevision`` snapshot bound to the
+    originating ``AgentActionLog`` row via ``tool_call_id``. Snapshot
+    durability is best-effort for non-destructive ops and STRICT for
+    ``rm``/``rmdir`` (snapshot + DELETE share a SAVEPOINT — snapshot
+    failure aborts the delete).
+    """
+    if filesystem_mode != FilesystemMode.CLOUD:
+        return None
+
+    state_dict: dict[str, Any] = (
+        dict(state)
+        if isinstance(state, dict)
+        else dict(getattr(state, "values", {}) or {})
+    )
+
+    files: dict[str, Any] = state_dict.get("files") or {}
+    staged_dirs: list[str] = list(state_dict.get("staged_dirs") or [])
+    staged_dir_tool_calls: dict[str, str] = dict(
+        state_dict.get("staged_dir_tool_calls") or {}
+    )
+    pending_moves: list[dict[str, Any]] = list(state_dict.get("pending_moves") or [])
+    pending_deletes: list[dict[str, Any]] = list(
+        state_dict.get("pending_deletes") or []
+    )
+    pending_dir_deletes: list[dict[str, Any]] = list(
+        state_dict.get("pending_dir_deletes") or []
+    )
+    dirty_paths: list[str] = list(state_dict.get("dirty_paths") or [])
+    dirty_path_tool_calls: dict[str, str] = dict(
+        state_dict.get("dirty_path_tool_calls") or {}
+    )
+    doc_id_by_path: dict[str, int] = dict(state_dict.get("doc_id_by_path") or {})
+    kb_anon_doc = state_dict.get("kb_anon_doc")
+
+    if kb_anon_doc:
+        temp_paths = [
+            p
+            for p in files
+            if isinstance(p, str) and _basename(p).startswith(_TEMP_PREFIX)
+        ]
+        return {
+            "dirty_paths": [_CLEAR],
+            "staged_dirs": [_CLEAR],
+            "staged_dir_tool_calls": {_CLEAR: True},
+            "pending_moves": [_CLEAR],
+            "pending_deletes": [_CLEAR],
+            "pending_dir_deletes": [_CLEAR],
+            "dirty_path_tool_calls": {_CLEAR: True},
+            "files": dict.fromkeys(temp_paths),
+        }
+
+    if not (
+        staged_dirs
+        or pending_moves
+        or dirty_paths
+        or pending_deletes
+        or pending_dir_deletes
+    ):
+        return None
+
+    flags = get_flags()
+    snapshot_enabled = flags.enable_action_log
+
+    # De-duplicate pending deletes per-path while preserving the latest
+    # tool_call_id (the one the user is most likely to revert via the UI).
+    file_delete_paths: dict[str, str] = {}
+    for entry in pending_deletes:
+        if not isinstance(entry, dict):
+            continue
+        path = str(entry.get("path") or "")
+        if path:
+            file_delete_paths[path] = str(entry.get("tool_call_id") or "")
+    dir_delete_paths: dict[str, str] = {}
+    for entry in pending_dir_deletes:
+        if not isinstance(entry, dict):
+            continue
+        path = str(entry.get("path") or "")
+        if path:
+            dir_delete_paths[path] = str(entry.get("tool_call_id") or "")
+
+    committed_creates: list[dict[str, Any]] = []
+    committed_updates: list[dict[str, Any]] = []
+    committed_deletes: list[dict[str, Any]] = []
+    committed_folder_deletes: list[dict[str, Any]] = []
+    discarded: list[str] = []
+    applied_moves: list[dict[str, Any]] = []
+    doc_id_path_tombstones: dict[str, int | None] = {}
+    tree_changed = False
+    # Reversibility-flip dispatches are deferred until AFTER the outer
+    # ``session.commit()`` succeeds. Dispatching from inside the
+    # SAVEPOINT chain while the outer transaction is still pending
+    # would emit ``reversible=true`` for rows whose snapshots get rolled
+    # back if the final commit raises. Snapshot helpers append on
+    # success; we drain this list after commit and silently abandon it
+    # on rollback so the UI stays consistent with durable state.
+    deferred_dispatches: list[int] = []
+
+    try:
+        async with shielded_async_session() as session:
+            # ------------------------------------------------------------------
+            # Resolve action-id bindings up front. One SELECT per turn for all
+            # tool_call_ids, NOT one per op — important because a turn that
+            # touches 50 paths would otherwise issue 50 lookups.
+            # ------------------------------------------------------------------
+            action_id_by_call: dict[str, int] = {}
+            if snapshot_enabled and thread_id is not None:
+                tool_call_ids: set[str] = set()
+                tool_call_ids.update(
+                    tcid for tcid in staged_dir_tool_calls.values() if tcid
+                )
+                for move in pending_moves:
+                    tcid = str(move.get("tool_call_id") or "")
+                    if tcid:
+                        tool_call_ids.add(tcid)
+                tool_call_ids.update(
+                    tcid for tcid in dirty_path_tool_calls.values() if tcid
+                )
+                tool_call_ids.update(
+                    tcid for tcid in file_delete_paths.values() if tcid
+                )
+                tool_call_ids.update(tcid for tcid in dir_delete_paths.values() if tcid)
+                action_id_by_call = await _find_action_ids_batch(
+                    session,
+                    thread_id=thread_id,
+                    tool_call_ids=tool_call_ids,
+                )
+
+            def _action_id_for(tool_call_id: str | None) -> int | None:
+                if not snapshot_enabled or not tool_call_id:
+                    return None
+                return action_id_by_call.get(str(tool_call_id))
+
+            turn_id_for_revision = (
+                next(iter(action_id_by_call), None) if action_id_by_call else None
+            )
+
+            # ------------------------------------------------------------------
+            # 1. staged_dirs -> Folder rows. Snapshot post-flush so the new
+            # folder_id is available for the FK.
+            # ------------------------------------------------------------------
+            for folder_path in staged_dirs:
+                if not isinstance(folder_path, str):
+                    continue
+                if not folder_path.startswith(DOCUMENTS_ROOT):
+                    continue
+                folder_parts_full = _split_folder_path(folder_path)
+                if not folder_parts_full:
+                    continue
+                folder_id = await _ensure_folder_hierarchy(
+                    session,
+                    search_space_id=search_space_id,
+                    created_by_id=created_by_id,
+                    folder_parts=folder_parts_full,
+                )
+                tree_changed = True
+
+                if snapshot_enabled and folder_id is not None:
+                    tcid = staged_dir_tool_calls.get(folder_path)
+                    action_id = _action_id_for(tcid)
+                    if action_id is not None:
+                        # Re-read the folder for the snapshot.
+                        result = await session.execute(
+                            select(Folder).where(Folder.id == folder_id)
+                        )
+                        folder_row = result.scalar_one_or_none()
+                        if folder_row is not None:
+                            await _snapshot_folder_pre_mkdir(
+                                session,
+                                folder=folder_row,
+                                action_id=action_id,
+                                search_space_id=search_space_id,
+                                turn_id=tcid,
+                                deferred_dispatches=deferred_dispatches,
+                            )
+
+            # ------------------------------------------------------------------
+            # 2. pending_moves. Snapshot pre-move (in-place restore on revert).
+            # ------------------------------------------------------------------
+            for move in pending_moves:
+                source = str(move.get("source") or "")
+                if snapshot_enabled and source:
+                    tcid = str(move.get("tool_call_id") or "")
+                    action_id = _action_id_for(tcid)
+                    if action_id is not None:
+                        # Resolve the doc to snapshot BEFORE we mutate it.
+                        doc_id_pre = doc_id_by_path.get(source)
+                        document_pre: Document | None = None
+                        if doc_id_pre is not None:
+                            res_pre = await session.execute(
+                                select(Document).where(
+                                    Document.id == doc_id_pre,
+                                    Document.search_space_id == search_space_id,
+                                )
+                            )
+                            document_pre = res_pre.scalar_one_or_none()
+                        if document_pre is None:
+                            document_pre = await virtual_path_to_doc(
+                                session,
+                                search_space_id=search_space_id,
+                                virtual_path=source,
+                            )
+                        if document_pre is not None:
+                            await _snapshot_document_pre_move(
+                                session,
+                                doc=document_pre,
+                                action_id=action_id,
+                                search_space_id=search_space_id,
+                                turn_id=tcid,
+                                deferred_dispatches=deferred_dispatches,
+                            )
+
+                applied = await _apply_move(
+                    session,
+                    search_space_id=search_space_id,
+                    created_by_id=created_by_id,
+                    move=move,
+                    doc_id_by_path=doc_id_by_path,
+                    doc_id_path_tombstones=doc_id_path_tombstones,
+                )
+                if applied:
+                    applied_moves.append(applied)
+                    tree_changed = True
+
+            move_alias = {
+                m["source"]: m["dest"] for m in pending_moves if m.get("source")
+            }
+
+            def _final_path(path: str) -> str:
+                seen: set[str] = set()
+                while path in move_alias and path not in seen:
+                    seen.add(path)
+                    path = move_alias[path]
+                return path
+
+            # ------------------------------------------------------------------
+            # 3. dirty_paths -> writes/edits. Skip any path queued for ``rm``
+            # this turn so a write+rm sequence doesn't recreate the doc.
+            # ------------------------------------------------------------------
+            kb_dirty_seen: set[str] = set()
+            kb_dirty: list[str] = []
+            kb_dirty_origin: dict[str, str] = {}
+            for raw in dirty_paths:
+                if not isinstance(raw, str):
+                    continue
+                final = _final_path(raw)
+                if not final.startswith(DOCUMENTS_ROOT + "/"):
+                    continue
+                if final in kb_dirty_seen:
+                    continue
+                if final in file_delete_paths:
+                    discarded.append(final)
+                    continue
+                kb_dirty_seen.add(final)
+                kb_dirty.append(final)
+                kb_dirty_origin[final] = raw
+
+            for path in kb_dirty:
+                basename = _basename(path)
+                if basename.startswith(_TEMP_PREFIX):
+                    discarded.append(path)
+                    continue
+                file_data = files.get(path)
+                if not isinstance(file_data, dict):
+                    continue
+                content = "\n".join(file_data.get("content") or [])
+                doc_id = doc_id_by_path.get(path)
+                # Path ↔ tool_call_id binding: the dirty_paths list dedupes via
+                # _add_unique_reducer, so we look up the latest tool_call_id by
+                # path (or by the un-renamed origin).
+                origin = kb_dirty_origin.get(path, path)
+                tcid = dirty_path_tool_calls.get(path) or dirty_path_tool_calls.get(
+                    origin
+                )
+                action_id = _action_id_for(tcid)
+
+                if doc_id is None:
+                    # The in-memory ``doc_id_by_path`` is per-thread and starts
+                    # empty in every new chat. If the agent writes to a path
+                    # that already exists in the DB (e.g. a previous chat's
+                    # ``notes.md``), we must NOT try to INSERT — it would hit
+                    # ``unique_identifier_hash`` (path-derived). Look up the
+                    # existing doc and update it in place instead.
+                    existing = await virtual_path_to_doc(
+                        session,
+                        search_space_id=search_space_id,
+                        virtual_path=path,
+                    )
+                    if existing is not None:
+                        doc_id = existing.id
+                        doc_id_by_path[path] = existing.id
+                if doc_id is not None:
+                    if snapshot_enabled and action_id is not None:
+                        result_doc = await session.execute(
+                            select(Document).where(
+                                Document.id == doc_id,
+                                Document.search_space_id == search_space_id,
+                            )
+                        )
+                        existing_doc = result_doc.scalar_one_or_none()
+                        if existing_doc is not None:
+                            await _snapshot_document_pre_write(
+                                session,
+                                doc=existing_doc,
+                                action_id=action_id,
+                                search_space_id=search_space_id,
+                                turn_id=tcid,
+                                deferred_dispatches=deferred_dispatches,
+                            )
+                    updated = await _update_document(
+                        session,
+                        doc_id=doc_id,
+                        content=content,
+                        virtual_path=path,
+                        search_space_id=search_space_id,
+                    )
+                    if updated is not None:
+                        committed_updates.append(
+                            {
+                                "id": updated.id,
+                                "title": updated.title,
+                                "documentType": DocumentType.NOTE.value,
+                                "searchSpaceId": search_space_id,
+                                "folderId": updated.folder_id,
+                                "createdById": str(created_by_id)
+                                if created_by_id
+                                else None,
+                                "virtualPath": path,
+                            }
+                        )
+                else:
+                    # Fresh create. Wrap each create in a SAVEPOINT so a
+                    # residual ``IntegrityError`` (e.g. a deployment that
+                    # hasn't run migration 133 yet, where
+                    # ``documents.content_hash`` still carries its legacy
+                    # global UNIQUE constraint) rolls back only this one
+                    # create instead of poisoning the whole turn.
+                    placeholder_revision_id: int | None = None
+                    if snapshot_enabled and action_id is not None:
+                        placeholder_revision_id = await _snapshot_document_pre_create(
+                            session,
+                            action_id=action_id,
+                            search_space_id=search_space_id,
+                            turn_id=tcid,
+                            deferred_dispatches=deferred_dispatches,
+                        )
+                    try:
+                        async with session.begin_nested():
+                            new_doc = await _create_document(
+                                session,
+                                virtual_path=path,
+                                content=content,
+                                search_space_id=search_space_id,
+                                created_by_id=created_by_id,
+                            )
+                    except ValueError as exc:
+                        logger.warning(
+                            "kb_persistence: skipping %s create: %s", path, exc
+                        )
+                        # Roll back the placeholder revision since the create
+                        # never happened.
+                        if placeholder_revision_id is not None:
+                            await session.execute(
+                                delete(DocumentRevision).where(
+                                    DocumentRevision.id == placeholder_revision_id
+                                )
+                            )
+                        continue
+                    except IntegrityError as exc:
+                        msg = str(exc.orig) if exc.orig is not None else str(exc)
+                        logger.error(
+                            "kb_persistence: IntegrityError creating %s: %s. "
+                            "If this mentions content_hash, run alembic "
+                            "upgrade to apply migration 133 which drops the "
+                            "global UNIQUE constraint on documents.content_hash.",
+                            path,
+                            msg,
+                        )
+                        if placeholder_revision_id is not None:
+                            await session.execute(
+                                delete(DocumentRevision).where(
+                                    DocumentRevision.id == placeholder_revision_id
+                                )
+                            )
+                        continue
+                    doc_id_by_path[path] = new_doc.id
+                    if placeholder_revision_id is not None:
+                        await session.execute(
+                            update(DocumentRevision)
+                            .where(DocumentRevision.id == placeholder_revision_id)
+                            .values(document_id=new_doc.id)
+                        )
+                    committed_creates.append(
+                        {
+                            "id": new_doc.id,
+                            "title": new_doc.title,
+                            "documentType": DocumentType.NOTE.value,
+                            "searchSpaceId": search_space_id,
+                            "folderId": new_doc.folder_id,
+                            "createdById": str(created_by_id)
+                            if created_by_id
+                            else None,
+                            "virtualPath": path,
+                        }
+                    )
+                    tree_changed = True
+
+            # ------------------------------------------------------------------
+            # 4. pending_deletes -> ``rm``. STRICT durability: snapshot + DELETE
+            # share a SAVEPOINT. If the snapshot insert fails, the DELETE
+            # rolls back too and we surface the error rather than silently
+            # making the data irreversible.
+            # ------------------------------------------------------------------
+            for raw_path, tcid in file_delete_paths.items():
+                final = _final_path(raw_path)
+                if not final.startswith(DOCUMENTS_ROOT + "/"):
+                    continue
+                action_id = _action_id_for(tcid)
+
+                # Resolve the doc.
+                doc_id_for_delete = doc_id_by_path.get(final)
+                document_to_delete: Document | None = None
+                if doc_id_for_delete is not None:
+                    result = await session.execute(
+                        select(Document).where(
+                            Document.id == doc_id_for_delete,
+                            Document.search_space_id == search_space_id,
+                        )
+                    )
+                    document_to_delete = result.scalar_one_or_none()
+                if document_to_delete is None:
+                    document_to_delete = await virtual_path_to_doc(
+                        session,
+                        search_space_id=search_space_id,
+                        virtual_path=final,
+                    )
+                if document_to_delete is None:
+                    logger.info(
+                        "kb_persistence: skipping rm %s (target not found)", final
+                    )
+                    continue
+
+                doc_pk = document_to_delete.id
+                doc_title = document_to_delete.title
+                doc_folder_id = document_to_delete.folder_id
+
+                try:
+                    async with session.begin_nested():
+                        # Strict: snapshot first; failure aborts the delete.
+                        if snapshot_enabled and action_id is not None:
+                            chunks = await _load_chunks_for_snapshot(
+                                session, doc_id=doc_pk
+                            )
+                            payload = _doc_revision_payload(
+                                document_to_delete, chunks_before=chunks
+                            )
+                            rev = DocumentRevision(
+                                document_id=doc_pk,
+                                search_space_id=search_space_id,
+                                created_by_turn_id=tcid,
+                                agent_action_id=action_id,
+                                **payload,
+                            )
+                            session.add(rev)
+                            await session.flush()
+                            await _mark_action_reversible(session, action_id=action_id)
+                        await session.execute(
+                            delete(Document).where(Document.id == doc_pk)
+                        )
+                except Exception as exc:
+                    logger.exception(
+                        "kb_persistence: strict rm SAVEPOINT for path=%s failed: %s",
+                        final,
+                        exc,
+                    )
+                    continue
+
+                # B1 — SAVEPOINT released. Defer the reversibility-flip
+                # dispatch until AFTER the outer commit succeeds so we
+                # never tell the UI a row is reversible if its snapshot
+                # gets rolled back.
+                if snapshot_enabled and action_id is not None:
+                    deferred_dispatches.append(int(action_id))
+
+                doc_id_by_path.pop(final, None)
+                doc_id_path_tombstones[final] = None
+                committed_deletes.append(
+                    {
+                        "id": doc_pk,
+                        "title": doc_title,
+                        "documentType": DocumentType.NOTE.value,
+                        "searchSpaceId": search_space_id,
+                        "folderId": doc_folder_id,
+                        "createdById": str(created_by_id) if created_by_id else None,
+                        "virtualPath": final,
+                    }
+                )
+                tree_changed = True
+
+            # ------------------------------------------------------------------
+            # 5. pending_dir_deletes -> ``rmdir``. STRICT durability + final
+            # emptiness check (after step 4's deletes have run, an "empty
+            # mid-turn" directory really IS empty in DB now).
+            # ------------------------------------------------------------------
+            for raw_path, tcid in dir_delete_paths.items():
+                final = _final_path(raw_path)
+                if not final.startswith(DOCUMENTS_ROOT + "/"):
+                    continue
+                action_id = _action_id_for(tcid)
+
+                folder_parts = _split_folder_path(final)
+                if not folder_parts:
+                    continue
+                folder_id = await _resolve_folder_id(
+                    session,
+                    search_space_id=search_space_id,
+                    folder_parts=folder_parts,
+                )
+                if folder_id is None:
+                    logger.info(
+                        "kb_persistence: skipping rmdir %s (folder not found)", final
+                    )
+                    continue
+
+                # Re-check emptiness against in-DB state.
+                docs_in_folder = await session.execute(
+                    select(Document.id)
+                    .where(Document.folder_id == folder_id)
+                    .where(Document.search_space_id == search_space_id)
+                    .limit(1)
+                )
+                if docs_in_folder.scalar_one_or_none() is not None:
+                    logger.warning(
+                        "kb_persistence: refusing rmdir %s — non-empty at commit time",
+                        final,
+                    )
+                    continue
+                child_folders = await session.execute(
+                    select(Folder.id)
+                    .where(Folder.parent_id == folder_id)
+                    .where(Folder.search_space_id == search_space_id)
+                    .limit(1)
+                )
+                if child_folders.scalar_one_or_none() is not None:
+                    logger.warning(
+                        "kb_persistence: refusing rmdir %s — has child folders "
+                        "at commit time",
+                        final,
+                    )
+                    continue
+
+                folder_to_delete_res = await session.execute(
+                    select(Folder).where(Folder.id == folder_id)
+                )
+                folder_to_delete = folder_to_delete_res.scalar_one_or_none()
+                if folder_to_delete is None:
+                    continue
+
+                folder_pk = folder_to_delete.id
+                folder_name = folder_to_delete.name
+                folder_parent_id = folder_to_delete.parent_id
+                folder_position = folder_to_delete.position
+
+                try:
+                    async with session.begin_nested():
+                        if snapshot_enabled and action_id is not None:
+                            rev = FolderRevision(
+                                folder_id=folder_pk,
+                                search_space_id=search_space_id,
+                                name_before=folder_name,
+                                parent_id_before=folder_parent_id,
+                                position_before=folder_position,
+                                created_by_turn_id=tcid,
+                                agent_action_id=action_id,
+                            )
+                            session.add(rev)
+                            await session.flush()
+                            await _mark_action_reversible(session, action_id=action_id)
+                        await session.execute(
+                            delete(Folder).where(Folder.id == folder_pk)
+                        )
+                except Exception as exc:
+                    logger.exception(
+                        "kb_persistence: strict rmdir SAVEPOINT for path=%s failed: %s",
+                        final,
+                        exc,
+                    )
+                    continue
+
+                # B1 — SAVEPOINT released. Defer the reversibility-flip
+                # dispatch until AFTER the outer commit succeeds so we
+                # never tell the UI a row is reversible if its snapshot
+                # gets rolled back.
+                if snapshot_enabled and action_id is not None:
+                    deferred_dispatches.append(int(action_id))
+
+                committed_folder_deletes.append(
+                    {
+                        "id": folder_pk,
+                        "name": folder_name,
+                        "searchSpaceId": search_space_id,
+                        "parentId": folder_parent_id,
+                        "virtualPath": final,
+                    }
+                )
+                tree_changed = True
+
+            await session.commit()
+    except Exception:  # pragma: no cover - rollback safety net
+        logger.exception(
+            "kb_persistence: commit failed (search_space=%s)", search_space_id
+        )
+        # Outer commit raised — every SAVEPOINT-released change above
+        # (snapshots + reversibility flips) is now rolled back. Drop
+        # the deferred SSE dispatches so the UI stays consistent with
+        # durable state.
+        deferred_dispatches.clear()
+        return None
+
+    # Outer commit succeeded; flush deferred reversibility-flip
+    # dispatches now so the chat tool card can light up its Revert
+    # button without re-fetching ``GET /threads/.../actions``. De-dup
+    # to avoid emitting the same id twice (e.g. write-then-rm in the
+    # same turn dispatches once for each snapshot site).
+    if deferred_dispatches and dispatch_events:
+        for action_id in dict.fromkeys(deferred_dispatches):
+            try:
+                await _dispatch_reversibility_update(action_id)
+            except Exception:
+                logger.debug(
+                    "kb_persistence: deferred reversibility dispatch failed for action_id=%s",
+                    action_id,
+                    exc_info=True,
+                )
+
+    if dispatch_events:
+        for payload in committed_creates:
+            try:
+                dispatch_custom_event("document_created", payload)
+            except Exception:
+                logger.exception(
+                    "kb_persistence: failed to dispatch document_created event"
+                )
+        for payload in committed_updates:
+            try:
+                dispatch_custom_event("document_updated", payload)
+            except Exception:
+                logger.exception(
+                    "kb_persistence: failed to dispatch document_updated event"
+                )
+        for payload in committed_deletes:
+            try:
+                dispatch_custom_event("document_deleted", payload)
+            except Exception:
+                logger.exception(
+                    "kb_persistence: failed to dispatch document_deleted event"
+                )
+        for payload in committed_folder_deletes:
+            try:
+                dispatch_custom_event("folder_deleted", payload)
+            except Exception:
+                logger.exception(
+                    "kb_persistence: failed to dispatch folder_deleted event"
+                )
+
+    temp_paths = [
+        p for p in files if isinstance(p, str) and _basename(p).startswith(_TEMP_PREFIX)
+    ]
+
+    # Tombstone every committed-delete path so a stale ``state["files"]`` entry
+    # (which als_info would otherwise interpret as content) cannot survive into
+    # the next turn and make a now-empty folder look non-empty.
+    deleted_file_paths = [
+        str(payload.get("virtualPath") or "")
+        for payload in committed_deletes
+        if payload.get("virtualPath")
+    ]
+
+    doc_id_update: dict[str, int | None] = {**doc_id_path_tombstones}
+    for payload in committed_creates:
+        doc_id_update[str(payload.get("virtualPath") or "")] = int(payload["id"])
+
+    delta: dict[str, Any] = {
+        "dirty_paths": [_CLEAR],
+        "staged_dirs": [_CLEAR],
+        "staged_dir_tool_calls": {_CLEAR: True},
+        "pending_moves": [_CLEAR],
+        "pending_deletes": [_CLEAR],
+        "pending_dir_deletes": [_CLEAR],
+        "dirty_path_tool_calls": {_CLEAR: True},
+    }
+    files_delta: dict[str, Any] = {}
+    if temp_paths:
+        files_delta.update(dict.fromkeys(temp_paths))
+    for path in deleted_file_paths:
+        files_delta[path] = None
+    if files_delta:
+        delta["files"] = files_delta
+    if doc_id_update:
+        delta["doc_id_by_path"] = doc_id_update
+    if tree_changed:
+        delta["tree_version"] = int(state_dict.get("tree_version") or 0) + 1
+
+    # Avoid 'unused' lint when turn_id_for_revision was only useful for
+    # diagnostic purposes inside the SAVEPOINT chain above.
+    _ = turn_id_for_revision
+
+    logger.info(
+        "kb_persistence: commit (search_space=%s) creates=%d updates=%d "
+        "moves=%d staged_dirs=%d deletes=%d folder_deletes=%d discarded=%d",
+        search_space_id,
+        len(committed_creates),
+        len(committed_updates),
+        len(applied_moves),
+        len(staged_dirs),
+        len(committed_deletes),
+        len(committed_folder_deletes),
+        len(discarded),
+    )
+    return delta
+
+
+# ---------------------------------------------------------------------------
+# Middleware
+# ---------------------------------------------------------------------------
+
+
+class KnowledgeBasePersistenceMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """End-of-turn cloud persistence for the SurfSense filesystem agent."""
+
+    tools = ()
+    state_schema = SurfSenseFilesystemState
+
+    def __init__(
+        self,
+        *,
+        search_space_id: int,
+        created_by_id: str | None,
+        filesystem_mode: FilesystemMode,
+        thread_id: int | None = None,
+    ) -> None:
+        self.search_space_id = search_space_id
+        self.created_by_id = created_by_id
+        self.filesystem_mode = filesystem_mode
+        self.thread_id = thread_id
+
+    async def aafter_agent(  # type: ignore[override]
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        del runtime
+        if self.filesystem_mode != FilesystemMode.CLOUD:
+            return None
+        return await commit_staged_filesystem_state(
+            state,
+            search_space_id=self.search_space_id,
+            created_by_id=self.created_by_id,
+            filesystem_mode=self.filesystem_mode,
+            thread_id=self.thread_id,
+        )
+
+
+__all__ = [
+    "KnowledgeBasePersistenceMiddleware",
+    "commit_staged_filesystem_state",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/kb_postgres_backend.py b/surfsense_backend/app/agents/new_chat/middleware/kb_postgres_backend.py
new file mode 100644
index 0000000000..7cf3bf8cda
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/kb_postgres_backend.py
@@ -0,0 +1,1038 @@
+"""Postgres-backed virtual filesystem for the SurfSense agent (cloud mode).
+
+The backend is **strictly conforming** to deepagents'
+:class:`BackendProtocol`. It returns ``WriteResult`` / ``EditResult`` / list
+shapes exactly as upstream expects (no extra fields). All side-state
+plumbing — ``dirty_paths``, ``doc_id_by_path``, ``staged_dirs``,
+``pending_moves``, ``files`` cache — is appended by the overridden tool
+wrappers in :class:`SurfSenseFilesystemMiddleware` via ``Command.update``.
+
+The backend never writes to Postgres. End-of-turn persistence is handled by
+:class:`KnowledgeBasePersistenceMiddleware`. This module is purely a
+read-side and a state-merging helper.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import fnmatch
+import logging
+import re
+from datetime import UTC
+from typing import Any
+
+from deepagents.backends.protocol import (
+    BackendProtocol,
+    EditResult,
+    FileDownloadResponse,
+    FileInfo,
+    FileUploadResponse,
+    GrepMatch,
+    WriteResult,
+)
+from deepagents.backends.utils import (
+    create_file_data,
+    file_data_to_string,
+    format_read_response,
+    perform_string_replacement,
+    update_file_data,
+)
+from langchain.tools import ToolRuntime
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.document_xml import build_document_xml
+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    build_path_index,
+    doc_to_virtual_path,
+    virtual_path_to_doc,
+)
+from app.db import Chunk, Document, shielded_async_session
+
+logger = logging.getLogger(__name__)
+
+_TEMP_PREFIX = "temp_"
+_GREP_MAX_TOTAL_MATCHES = 50
+_GREP_MAX_PER_DOC = 5
+
+
+def _basename(path: str) -> str:
+    return path.rsplit("/", 1)[-1]
+
+
+def _is_under(child: str, parent: str) -> bool:
+    """Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
+    if parent == "/":
+        return child.startswith("/")
+    return child == parent or child.startswith(parent.rstrip("/") + "/")
+
+
+def paginate_listing(
+    infos: list[FileInfo],
+    *,
+    offset: int = 0,
+    limit: int | None = None,
+) -> list[FileInfo]:
+    """Paginate a listing produced by :meth:`KBPostgresBackend.als_info`."""
+    if offset < 0:
+        offset = 0
+    end: int | None
+    end = None if limit is None or limit < 0 else offset + limit
+    return list(infos[offset:end])
+
+
+class KBPostgresBackend(BackendProtocol):
+    """Lazy, read-only Postgres view for ``/documents/*`` virtual paths.
+
+    The backend exposes a virtual ``/documents/`` namespace mirroring the
+    ``Folder``/``Document`` graph. Reads materialize XML on first access and
+    cache it via the overriding tool wrappers (NOT here). Writes never touch
+    the DB — they return ``files_update`` deltas that the wrappers turn into
+    Command updates, and the persistence middleware commits them at end of
+    turn.
+    """
+
+    _IMAGE_EXTENSIONS = frozenset({".png", ".jpg", ".jpeg", ".gif", ".webp"})
+
+    def __init__(self, search_space_id: int, runtime: ToolRuntime) -> None:
+        self.search_space_id = search_space_id
+        self.runtime = runtime
+
+    @property
+    def state(self) -> dict[str, Any]:
+        return getattr(self.runtime, "state", {}) or {}
+
+    # ------------------------------------------------------------------ helpers
+
+    def _state_files(self) -> dict[str, Any]:
+        return dict(self.state.get("files") or {})
+
+    def _staged_dirs(self) -> list[str]:
+        return list(self.state.get("staged_dirs") or [])
+
+    def _pending_moves(self) -> list[dict[str, Any]]:
+        return list(self.state.get("pending_moves") or [])
+
+    def _pending_deletes(self) -> list[dict[str, Any]]:
+        return list(self.state.get("pending_deletes") or [])
+
+    def _pending_dir_deletes(self) -> list[dict[str, Any]]:
+        return list(self.state.get("pending_dir_deletes") or [])
+
+    def _kb_anon_doc(self) -> dict[str, Any] | None:
+        anon = self.state.get("kb_anon_doc")
+        return anon if isinstance(anon, dict) else None
+
+    def _matched_chunk_ids(self, doc_id: int) -> set[int]:
+        mapping = self.state.get("kb_matched_chunk_ids") or {}
+        try:
+            return set(mapping.get(doc_id, []) or [])
+        except TypeError:
+            return set()
+
+    @staticmethod
+    def _file_data_size(file_data: dict[str, Any]) -> int:
+        try:
+            return len("\n".join(file_data.get("content") or []))
+        except Exception:
+            return 0
+
+    def _normalize_listing_path(self, path: str) -> str:
+        if not path:
+            return DOCUMENTS_ROOT
+        if path == "/":
+            return path
+        return path.rstrip("/") if path != "/" else path
+
+    def _pending_filesystem_view(
+        self,
+        existing: dict[str, dict[str, Any]],
+    ) -> tuple[set[str], dict[str, str], set[str]]:
+        """Compute removed/aliased/dir-suppressed paths from staged ops.
+
+        Returns ``(removed, alias, deleted_dirs)`` where:
+
+        * ``removed`` — paths to drop from listings (sources of pending moves
+          AND paths queued for ``rm``).
+        * ``alias`` — ``{source: dest}`` for pending moves; the dest should
+          appear as a virtual entry even when no DB row is at that path yet.
+        * ``deleted_dirs`` — folder paths queued for ``rmdir``; their entire
+          subtree (descendants) is suppressed from listings/glob/grep.
+
+        Entries in ``existing`` (the ``files`` state cache) keyed by a
+        removed path are popped so a same-turn delete-after-write doesn't
+        leave a stale virtual file in listings.
+        """
+        removed: set[str] = set()
+        alias: dict[str, str] = {}
+        deleted_dirs: set[str] = set()
+        for move in self._pending_moves():
+            src = move.get("source")
+            dst = move.get("dest")
+            if not src or not dst:
+                continue
+            removed.add(src)
+            alias[src] = dst
+            existing.pop(src, None)
+        for entry in self._pending_deletes():
+            path = entry.get("path") if isinstance(entry, dict) else None
+            if not path:
+                continue
+            removed.add(path)
+            existing.pop(path, None)
+        for entry in self._pending_dir_deletes():
+            path = entry.get("path") if isinstance(entry, dict) else None
+            if not path:
+                continue
+            deleted_dirs.add(path)
+        return removed, alias, deleted_dirs
+
+    @staticmethod
+    def _is_dir_suppressed(path: str, deleted_dirs: set[str]) -> bool:
+        """Return True iff ``path`` is at-or-under any directory in ``deleted_dirs``."""
+        return any(path == d or _is_under(path, d) for d in deleted_dirs)
+
+    # ------------------------------------------------------------------ ls/read
+
+    async def als_info(self, path: str) -> list[FileInfo]:  # type: ignore[override]
+        normalized = self._normalize_listing_path(path)
+        infos: list[FileInfo] = []
+        seen: set[str] = set()
+
+        anon = self._kb_anon_doc()
+        if anon:
+            anon_path = str(anon.get("path") or "")
+            if (
+                anon_path
+                and _is_under(anon_path, normalized)
+                and anon_path != normalized
+                and anon_path not in seen
+            ):
+                infos.append(
+                    FileInfo(
+                        path=anon_path,
+                        is_dir=False,
+                        size=len(str(anon.get("content") or "")),
+                        modified_at="",
+                    )
+                )
+                seen.add(anon_path)
+
+        files = self._state_files()
+        moved_removed, moved_alias, deleted_dirs = self._pending_filesystem_view(files)
+
+        if normalized.startswith(DOCUMENTS_ROOT) or normalized == "/":
+            try:
+                async with shielded_async_session() as session:
+                    db_infos, subdir_paths = await self._list_db_directory(
+                        session, normalized
+                    )
+            except Exception as exc:  # pragma: no cover - defensive
+                logger.warning("KBPostgresBackend.als_info DB error: %s", exc)
+                db_infos, subdir_paths = [], set()
+
+            for info in db_infos:
+                p = info.get("path", "")
+                if (
+                    not p
+                    or p in seen
+                    or p in moved_removed
+                    or self._is_dir_suppressed(p, deleted_dirs)
+                ):
+                    continue
+                infos.append(info)
+                seen.add(p)
+
+            for src, dst in moved_alias.items():
+                if src not in seen:
+                    if not _is_under(dst, normalized):
+                        continue
+                    if self._is_dir_suppressed(dst, deleted_dirs):
+                        continue
+                    rel = (
+                        dst[len(normalized) :].lstrip("/")
+                        if normalized != "/"
+                        else dst.lstrip("/")
+                    )
+                    if "/" in rel:
+                        subdir_paths.add(
+                            (normalized.rstrip("/") + "/" + rel.split("/", 1)[0])
+                            if normalized != "/"
+                            else "/" + rel.split("/", 1)[0]
+                        )
+                        continue
+                    if dst in seen:
+                        continue
+                    fd = files.get(dst)
+                    size = self._file_data_size(fd) if isinstance(fd, dict) else 0
+                    infos.append(
+                        FileInfo(
+                            path=dst,
+                            is_dir=False,
+                            size=int(size),
+                            modified_at=fd.get("modified_at", "")
+                            if isinstance(fd, dict)
+                            else "",
+                        )
+                    )
+                    seen.add(dst)
+
+            for staged in self._staged_dirs():
+                if not staged or not staged.startswith(DOCUMENTS_ROOT):
+                    continue
+                if staged == normalized:
+                    continue
+                if not _is_under(staged, normalized):
+                    continue
+                if self._is_dir_suppressed(staged, deleted_dirs):
+                    continue
+                rel = (
+                    staged[len(normalized) :].lstrip("/")
+                    if normalized != "/"
+                    else staged.lstrip("/")
+                )
+                if not rel:
+                    continue
+                first = rel.split("/", 1)[0]
+                immediate = (
+                    normalized.rstrip("/") + "/" + first
+                    if normalized != "/"
+                    else "/" + first
+                )
+                subdir_paths.add(immediate)
+
+            for sub in sorted(subdir_paths):
+                if sub in seen:
+                    continue
+                if self._is_dir_suppressed(sub, deleted_dirs):
+                    continue
+                infos.append(FileInfo(path=sub, is_dir=True, size=0, modified_at=""))
+                seen.add(sub)
+
+        for path_key, fd in files.items():
+            if not isinstance(path_key, str) or path_key in seen:
+                continue
+            # Tombstones (None values) are deletion markers from `rm`. The
+            # deepagents reducer normally pops them, but a stale tombstone
+            # surviving a checkpoint must NOT be reported as a child here —
+            # otherwise rmdir mistakenly sees the deleted file as content.
+            if fd is None:
+                continue
+            if not _is_under(path_key, normalized) or path_key == normalized:
+                continue
+            if path_key in moved_removed or self._is_dir_suppressed(
+                path_key, deleted_dirs
+            ):
+                continue
+            if normalized == "/":
+                rel = path_key.lstrip("/")
+            else:
+                rel = path_key[len(normalized) :].lstrip("/")
+            if not rel:
+                continue
+            if "/" in rel:
+                first = rel.split("/", 1)[0]
+                immediate = (
+                    normalized.rstrip("/") + "/" + first
+                    if normalized != "/"
+                    else "/" + first
+                )
+                if immediate not in seen:
+                    infos.append(
+                        FileInfo(path=immediate, is_dir=True, size=0, modified_at="")
+                    )
+                    seen.add(immediate)
+                continue
+            include = path_key.startswith(DOCUMENTS_ROOT) or _basename(
+                path_key
+            ).startswith(_TEMP_PREFIX)
+            if not include:
+                continue
+            size = self._file_data_size(fd) if isinstance(fd, dict) else 0
+            infos.append(
+                FileInfo(
+                    path=path_key,
+                    is_dir=False,
+                    size=int(size),
+                    modified_at=fd.get("modified_at", "")
+                    if isinstance(fd, dict)
+                    else "",
+                )
+            )
+            seen.add(path_key)
+
+        infos.sort(key=lambda fi: (not fi.get("is_dir", False), fi.get("path", "")))
+        return infos
+
+    def ls_info(self, path: str) -> list[FileInfo]:  # type: ignore[override]
+        return asyncio.run(self.als_info(path))
+
+    async def _list_db_directory(
+        self,
+        session: AsyncSession,
+        normalized_path: str,
+    ) -> tuple[list[FileInfo], set[str]]:
+        """List immediate Folders + Documents at ``normalized_path``.
+
+        Returns ``(file_infos, subdirectory_paths)``. ``normalized_path`` may
+        be ``/`` (synthesizes ``/documents``) or a path under ``/documents``.
+        """
+        if normalized_path == "/":
+            return (
+                [],
+                {DOCUMENTS_ROOT},
+            )
+
+        if not normalized_path.startswith(DOCUMENTS_ROOT):
+            return [], set()
+
+        index = await build_path_index(session, self.search_space_id)
+        target_folder_id: int | None = None
+        if normalized_path != DOCUMENTS_ROOT:
+            target_path = normalized_path
+            matches = [
+                fid for fid, fpath in index.folder_paths.items() if fpath == target_path
+            ]
+            if not matches:
+                return [], set()
+            target_folder_id = matches[0]
+
+        result = await session.execute(
+            select(Document.id, Document.title, Document.folder_id, Document.updated_at)
+            .where(Document.search_space_id == self.search_space_id)
+            .where(
+                Document.folder_id == target_folder_id
+                if target_folder_id is not None
+                else Document.folder_id.is_(None)
+            )
+        )
+        rows = result.all()
+
+        file_infos: list[FileInfo] = []
+        for row in rows:
+            path = doc_to_virtual_path(
+                doc_id=row.id,
+                title=str(row.title or "untitled"),
+                folder_id=row.folder_id,
+                index=index,
+            )
+            modified = ""
+            if row.updated_at is not None:
+                with contextlib.suppress(Exception):
+                    modified = row.updated_at.astimezone(UTC).isoformat()
+            file_infos.append(
+                FileInfo(
+                    path=path,
+                    is_dir=False,
+                    size=0,
+                    modified_at=modified,
+                )
+            )
+
+        subdirs: set[str] = set()
+        for _fid, fpath in index.folder_paths.items():
+            if fpath == normalized_path:
+                continue
+            base = normalized_path.rstrip("/")
+            if not fpath.startswith(base + "/"):
+                continue
+            rel = fpath[len(base) + 1 :]
+            if "/" in rel:
+                continue
+            subdirs.add(base + "/" + rel)
+        return file_infos, subdirs
+
+    async def aread(  # type: ignore[override]
+        self,
+        file_path: str,
+        offset: int = 0,
+        limit: int = 2000,
+    ) -> str:
+        files = self._state_files()
+        file_data = files.get(file_path)
+        if file_data is not None:
+            return format_read_response(file_data, offset, limit)
+
+        loaded = await self._load_file_data(file_path)
+        if loaded is None:
+            return f"Error: File '{file_path}' not found"
+        file_data, _ = loaded
+        return format_read_response(file_data, offset, limit)
+
+    def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str:  # type: ignore[override]
+        return asyncio.run(self.aread(file_path, offset, limit))
+
+    async def _load_file_data(
+        self,
+        path: str,
+    ) -> tuple[dict[str, Any], int | None] | None:
+        """Lazy-load a virtual KB document into a deepagents ``FileData``.
+
+        Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
+        to any known document. ``doc_id`` is ``None`` for the synthetic
+        anonymous document so the caller doesn't track it as a DB-backed file.
+        """
+        anon = self._kb_anon_doc()
+        if anon and str(anon.get("path") or "") == path:
+            doc_payload = {
+                "document_id": -1,
+                "chunks": list(anon.get("chunks") or []),
+                "matched_chunk_ids": [],
+                "document": {
+                    "id": -1,
+                    "title": anon.get("title") or "uploaded_document",
+                    "document_type": "FILE",
+                    "metadata": {"source": "anonymous_upload"},
+                },
+                "source": "FILE",
+            }
+            xml = build_document_xml(doc_payload, matched_chunk_ids=set())
+            file_data = create_file_data(xml)
+            return file_data, None
+
+        if not path.startswith(DOCUMENTS_ROOT):
+            return None
+
+        async with shielded_async_session() as session:
+            document = await virtual_path_to_doc(
+                session,
+                search_space_id=self.search_space_id,
+                virtual_path=path,
+            )
+            if document is None:
+                return None
+            chunk_rows = await session.execute(
+                select(Chunk.id, Chunk.content)
+                .where(Chunk.document_id == document.id)
+                .order_by(Chunk.id)
+            )
+            chunks = [
+                {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
+            ]
+
+        doc_payload = {
+            "document_id": document.id,
+            "chunks": chunks,
+            "matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
+            "document": {
+                "id": document.id,
+                "title": document.title,
+                "document_type": (
+                    document.document_type.value
+                    if getattr(document, "document_type", None) is not None
+                    else "UNKNOWN"
+                ),
+                "metadata": dict(document.document_metadata or {}),
+            },
+            "source": (
+                document.document_type.value
+                if getattr(document, "document_type", None) is not None
+                else "UNKNOWN"
+            ),
+        }
+        xml = build_document_xml(
+            doc_payload,
+            matched_chunk_ids=self._matched_chunk_ids(document.id),
+        )
+        file_data = create_file_data(xml)
+        return file_data, document.id
+
+    # ------------------------------------------------------------------ writes
+
+    async def awrite(self, file_path: str, content: str) -> WriteResult:  # type: ignore[override]
+        files = self._state_files()
+        if file_path in files:
+            return WriteResult(
+                error=(
+                    f"Cannot write to {file_path} because it already exists. "
+                    "Read and then make an edit, or write to a new path."
+                )
+            )
+        new_file_data = create_file_data(content)
+        return WriteResult(path=file_path, files_update={file_path: new_file_data})
+
+    def write(self, file_path: str, content: str) -> WriteResult:  # type: ignore[override]
+        return asyncio.run(self.awrite(file_path, content))
+
+    async def aedit(  # type: ignore[override]
+        self,
+        file_path: str,
+        old_string: str,
+        new_string: str,
+        replace_all: bool = False,
+    ) -> EditResult:
+        files = self._state_files()
+        file_data = files.get(file_path)
+        if file_data is None:
+            loaded = await self._load_file_data(file_path)
+            if loaded is None:
+                return EditResult(error=f"Error: File '{file_path}' not found")
+            file_data, _ = loaded
+
+        content = file_data_to_string(file_data)
+        result = perform_string_replacement(
+            content, old_string, new_string, replace_all
+        )
+        if isinstance(result, str):
+            return EditResult(error=result)
+
+        new_content, occurrences = result
+        new_file_data = update_file_data(file_data, new_content)
+        return EditResult(
+            path=file_path,
+            files_update={file_path: new_file_data},
+            occurrences=int(occurrences),
+        )
+
+    def edit(  # type: ignore[override]
+        self,
+        file_path: str,
+        old_string: str,
+        new_string: str,
+        replace_all: bool = False,
+    ) -> EditResult:
+        return asyncio.run(self.aedit(file_path, old_string, new_string, replace_all))
+
+    # ------------------------------------------------------------------ glob/grep
+
+    async def aglob_info(self, pattern: str, path: str = "/") -> list[FileInfo]:  # type: ignore[override]
+        normalized = self._normalize_listing_path(path)
+        results: list[FileInfo] = []
+        seen: set[str] = set()
+
+        files = self._state_files()
+        moved_removed, _, deleted_dirs = self._pending_filesystem_view(files)
+        regex = re.compile(fnmatch.translate(pattern))
+        for path_key, fd in files.items():
+            if path_key in moved_removed or self._is_dir_suppressed(
+                path_key, deleted_dirs
+            ):
+                continue
+            if not _is_under(path_key, normalized):
+                continue
+            rel = (
+                path_key[len(normalized) :].lstrip("/")
+                if normalized != "/"
+                else path_key.lstrip("/")
+            )
+            if not regex.match(rel) and not regex.match(path_key):
+                continue
+            if path_key in seen:
+                continue
+            size = self._file_data_size(fd) if isinstance(fd, dict) else 0
+            results.append(
+                FileInfo(
+                    path=path_key,
+                    is_dir=False,
+                    size=int(size),
+                    modified_at=fd.get("modified_at", "")
+                    if isinstance(fd, dict)
+                    else "",
+                )
+            )
+            seen.add(path_key)
+
+        if normalized.startswith(DOCUMENTS_ROOT) or normalized == "/":
+            try:
+                async with shielded_async_session() as session:
+                    index = await build_path_index(session, self.search_space_id)
+                    rows = await session.execute(
+                        select(Document.id, Document.title, Document.folder_id).where(
+                            Document.search_space_id == self.search_space_id
+                        )
+                    )
+                    for row in rows.all():
+                        candidate = doc_to_virtual_path(
+                            doc_id=row.id,
+                            title=str(row.title or "untitled"),
+                            folder_id=row.folder_id,
+                            index=index,
+                        )
+                        if (
+                            candidate in seen
+                            or candidate in moved_removed
+                            or self._is_dir_suppressed(candidate, deleted_dirs)
+                        ):
+                            continue
+                        if not _is_under(candidate, normalized):
+                            continue
+                        rel = (
+                            candidate[len(normalized) :].lstrip("/")
+                            if normalized != "/"
+                            else candidate.lstrip("/")
+                        )
+                        if not regex.match(rel) and not regex.match(candidate):
+                            continue
+                        results.append(
+                            FileInfo(
+                                path=candidate, is_dir=False, size=0, modified_at=""
+                            )
+                        )
+                        seen.add(candidate)
+            except Exception as exc:  # pragma: no cover - defensive
+                logger.warning("KBPostgresBackend.aglob_info DB error: %s", exc)
+
+        results.sort(key=lambda fi: fi.get("path", ""))
+        return results
+
+    def glob_info(self, pattern: str, path: str = "/") -> list[FileInfo]:  # type: ignore[override]
+        return asyncio.run(self.aglob_info(pattern, path))
+
+    async def agrep_raw(  # type: ignore[override]
+        self,
+        pattern: str,
+        path: str | None = None,
+        glob: str | None = None,
+    ) -> list[GrepMatch] | str:
+        if not pattern:
+            return "Error: pattern cannot be empty"
+
+        normalized = self._normalize_listing_path(path or "/")
+        matches: list[GrepMatch] = []
+
+        files = self._state_files()
+        moved_removed, _, deleted_dirs = self._pending_filesystem_view(files)
+        glob_re = re.compile(fnmatch.translate(glob)) if glob else None
+        for path_key, fd in files.items():
+            if path_key in moved_removed or self._is_dir_suppressed(
+                path_key, deleted_dirs
+            ):
+                continue
+            if not _is_under(path_key, normalized):
+                continue
+            if glob_re is not None and not glob_re.match(_basename(path_key)):
+                continue
+            if not isinstance(fd, dict):
+                continue
+            for line_no, line in enumerate(fd.get("content") or [], 1):
+                if pattern in line:
+                    matches.append(
+                        GrepMatch(path=path_key, line=int(line_no), text=str(line))
+                    )
+                    if len(matches) >= _GREP_MAX_TOTAL_MATCHES:
+                        return matches
+
+        if normalized.startswith(DOCUMENTS_ROOT) or normalized == "/":
+            try:
+                async with shielded_async_session() as session:
+                    index = await build_path_index(session, self.search_space_id)
+                    sub = (
+                        select(Chunk.document_id, Chunk.id, Chunk.content)
+                        .join(Document, Document.id == Chunk.document_id)
+                        .where(Document.search_space_id == self.search_space_id)
+                        .where(Chunk.content.ilike(f"%{pattern}%"))
+                        .order_by(Chunk.document_id, Chunk.id)
+                    )
+                    chunk_rows = await session.execute(sub)
+                    per_doc: dict[int, int] = {}
+                    doc_id_to_path: dict[int, str] = {}
+                    needed_doc_ids: set[int] = set()
+                    chunk_buffer: list[tuple[int, int, str]] = []
+                    for row in chunk_rows.all():
+                        per_doc.setdefault(row.document_id, 0)
+                        if per_doc[row.document_id] >= _GREP_MAX_PER_DOC:
+                            continue
+                        per_doc[row.document_id] += 1
+                        chunk_buffer.append((row.document_id, row.id, row.content))
+                        needed_doc_ids.add(row.document_id)
+                        if sum(per_doc.values()) >= _GREP_MAX_TOTAL_MATCHES - len(
+                            matches
+                        ):
+                            break
+                    if needed_doc_ids:
+                        doc_rows = await session.execute(
+                            select(
+                                Document.id, Document.title, Document.folder_id
+                            ).where(Document.id.in_(list(needed_doc_ids)))
+                        )
+                        for row in doc_rows.all():
+                            doc_id_to_path[row.id] = doc_to_virtual_path(
+                                doc_id=row.id,
+                                title=str(row.title or "untitled"),
+                                folder_id=row.folder_id,
+                                index=index,
+                            )
+                    for doc_id, chunk_id, content in chunk_buffer:
+                        candidate = doc_id_to_path.get(doc_id)
+                        if (
+                            not candidate
+                            or candidate in moved_removed
+                            or self._is_dir_suppressed(candidate, deleted_dirs)
+                        ):
+                            continue
+                        if not _is_under(candidate, normalized):
+                            continue
+                        if glob_re is not None and not glob_re.match(
+                            _basename(candidate)
+                        ):
+                            continue
+                        snippet = " ".join(str(content).split())[:240]
+                        matches.append(
+                            GrepMatch(
+                                path=candidate,
+                                line=0,
+                                text=(
+                                    f"<chunk-match in {candidate} chunk_id={chunk_id}>: "
+                                    f"{snippet}"
+                                ),
+                            )
+                        )
+                        if len(matches) >= _GREP_MAX_TOTAL_MATCHES:
+                            break
+            except Exception as exc:  # pragma: no cover - defensive
+                logger.warning("KBPostgresBackend.agrep_raw DB error: %s", exc)
+
+        return matches
+
+    def grep_raw(  # type: ignore[override]
+        self,
+        pattern: str,
+        path: str | None = None,
+        glob: str | None = None,
+    ) -> list[GrepMatch] | str:
+        return asyncio.run(self.agrep_raw(pattern, path, glob))
+
+    # ------------------------------------------------------------------ list_tree (helper)
+
+    async def alist_tree_listing(
+        self,
+        path: str = DOCUMENTS_ROOT,
+        *,
+        max_depth: int | None = 8,
+        page_size: int = 500,
+        include_files: bool = True,
+        include_dirs: bool = True,
+    ) -> dict[str, Any]:
+        """Recursive tree listing for cloud mode.
+
+        Mirrors the shape returned by :class:`MultiRootLocalFolderBackend.list_tree`:
+        ``{"entries": [{path, is_dir, size, modified_at, depth}, ...], "truncated": bool}``.
+        """
+        normalized = self._normalize_listing_path(path or DOCUMENTS_ROOT)
+        if not normalized.startswith(DOCUMENTS_ROOT) and normalized != "/":
+            return {"error": "Error: path must be under /documents/"}
+
+        entries: list[dict[str, Any]] = []
+        truncated = False
+
+        try:
+            async with shielded_async_session() as session:
+                index = await build_path_index(session, self.search_space_id)
+                doc_rows_raw = await session.execute(
+                    select(
+                        Document.id,
+                        Document.title,
+                        Document.folder_id,
+                        Document.updated_at,
+                    ).where(Document.search_space_id == self.search_space_id)
+                )
+                doc_rows = list(doc_rows_raw.all())
+        except Exception as exc:  # pragma: no cover
+            logger.warning("KBPostgresBackend.alist_tree_listing DB error: %s", exc)
+            return {"entries": [], "truncated": False}
+
+        files = self._state_files()
+        moved_removed, _, deleted_dirs = self._pending_filesystem_view(files)
+        anon = self._kb_anon_doc()
+        anon_path = str(anon.get("path") or "") if anon else ""
+
+        def _depth_of(p: str) -> int:
+            if p == DOCUMENTS_ROOT:
+                return 0
+            rel_root = (
+                p[len(DOCUMENTS_ROOT) :].lstrip("/")
+                if normalized.startswith(DOCUMENTS_ROOT)
+                else p.lstrip("/")
+            )
+            return len([part for part in rel_root.split("/") if part])
+
+        def _add_entry(entry: dict[str, Any]) -> bool:
+            nonlocal truncated
+            if len(entries) >= page_size:
+                truncated = True
+                return False
+            entries.append(entry)
+            return True
+
+        if include_dirs:
+            for _fid, fpath in sorted(index.folder_paths.items(), key=lambda kv: kv[1]):
+                if not _is_under(fpath, normalized):
+                    continue
+                if self._is_dir_suppressed(fpath, deleted_dirs):
+                    continue
+                depth = _depth_of(fpath)
+                if max_depth is not None and depth > max_depth:
+                    continue
+                if not _add_entry(
+                    {
+                        "path": fpath,
+                        "is_dir": True,
+                        "size": 0,
+                        "modified_at": "",
+                        "depth": depth,
+                    }
+                ):
+                    return {"entries": entries, "truncated": True}
+            for staged in self._staged_dirs():
+                if not _is_under(staged, normalized):
+                    continue
+                if self._is_dir_suppressed(staged, deleted_dirs):
+                    continue
+                depth = _depth_of(staged)
+                if max_depth is not None and depth > max_depth:
+                    continue
+                if any(e["path"] == staged for e in entries):
+                    continue
+                if not _add_entry(
+                    {
+                        "path": staged,
+                        "is_dir": True,
+                        "size": 0,
+                        "modified_at": "",
+                        "depth": depth,
+                    }
+                ):
+                    return {"entries": entries, "truncated": True}
+
+        if include_files:
+            for row in sorted(doc_rows, key=lambda r: str(r.title or "")):
+                candidate = doc_to_virtual_path(
+                    doc_id=row.id,
+                    title=str(row.title or "untitled"),
+                    folder_id=row.folder_id,
+                    index=index,
+                )
+                if candidate in moved_removed or self._is_dir_suppressed(
+                    candidate, deleted_dirs
+                ):
+                    continue
+                if not _is_under(candidate, normalized):
+                    continue
+                depth = _depth_of(candidate)
+                if max_depth is not None and depth > max_depth:
+                    continue
+                modified = ""
+                if row.updated_at is not None:
+                    with contextlib.suppress(Exception):
+                        modified = row.updated_at.astimezone(UTC).isoformat()
+                if not _add_entry(
+                    {
+                        "path": candidate,
+                        "is_dir": False,
+                        "size": 0,
+                        "modified_at": modified,
+                        "depth": depth,
+                    }
+                ):
+                    return {"entries": entries, "truncated": True}
+
+            if anon_path and _is_under(anon_path, normalized):
+                depth = _depth_of(anon_path)
+                if (max_depth is None or depth <= max_depth) and not _add_entry(
+                    {
+                        "path": anon_path,
+                        "is_dir": False,
+                        "size": len(str(anon.get("content") or "")),
+                        "modified_at": "",
+                        "depth": depth,
+                    }
+                ):
+                    return {"entries": entries, "truncated": True}
+
+            for path_key, fd in files.items():
+                if not isinstance(path_key, str):
+                    continue
+                if not _is_under(path_key, normalized):
+                    continue
+                if path_key in moved_removed or self._is_dir_suppressed(
+                    path_key, deleted_dirs
+                ):
+                    continue
+                if any(e["path"] == path_key for e in entries):
+                    continue
+                if not (
+                    path_key.startswith(DOCUMENTS_ROOT)
+                    or _basename(path_key).startswith(_TEMP_PREFIX)
+                ):
+                    continue
+                depth = _depth_of(path_key)
+                if max_depth is not None and depth > max_depth:
+                    continue
+                size = self._file_data_size(fd) if isinstance(fd, dict) else 0
+                if not _add_entry(
+                    {
+                        "path": path_key,
+                        "is_dir": False,
+                        "size": int(size),
+                        "modified_at": fd.get("modified_at", "")
+                        if isinstance(fd, dict)
+                        else "",
+                        "depth": depth,
+                    }
+                ):
+                    return {"entries": entries, "truncated": True}
+
+        return {"entries": entries, "truncated": truncated}
+
+    # ------------------------------------------------------------------ uploads (unsupported)
+
+    def upload_files(  # type: ignore[override]
+        self, files: list[tuple[str, bytes]]
+    ) -> list[FileUploadResponse]:
+        msg = "KBPostgresBackend does not support upload_files."
+        raise NotImplementedError(msg)
+
+    def download_files(  # type: ignore[override]
+        self, paths: list[str]
+    ) -> list[FileDownloadResponse]:
+        responses: list[FileDownloadResponse] = []
+        files = self._state_files()
+        for path in paths:
+            fd = files.get(path)
+            if fd is None:
+                responses.append(
+                    FileDownloadResponse(
+                        path=path, content=None, error="file_not_found"
+                    )
+                )
+                continue
+            content_str = file_data_to_string(fd)
+            responses.append(
+                FileDownloadResponse(
+                    path=path,
+                    content=content_str.encode("utf-8"),
+                    error=None,
+                )
+            )
+        return responses
+
+
+# --- module-level small helpers ---------------------------------------------
+
+
+async def list_tree_listing(
+    backend: KBPostgresBackend,
+    path: str,
+    *,
+    max_depth: int | None = 8,
+    page_size: int = 500,
+    include_files: bool = True,
+    include_dirs: bool = True,
+) -> dict[str, Any]:
+    """Async helper used by the overridden ``list_tree`` tool wrapper."""
+    return await backend.alist_tree_listing(
+        path,
+        max_depth=max_depth,
+        page_size=page_size,
+        include_files=include_files,
+        include_dirs=include_dirs,
+    )
+
+
+__all__ = [
+    "KBPostgresBackend",
+    "list_tree_listing",
+    "paginate_listing",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
index 7b0dd2f717..dc06f87631 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
@@ -1,10 +1,24 @@
-"""Knowledge-base pre-search middleware for the SurfSense new chat agent.
-
-This middleware runs before the main agent loop and seeds a virtual filesystem
-(`files` state) with relevant documents retrieved via hybrid search.  On each
-turn the filesystem is *expanded* — new results merge with documents loaded
-during prior turns — and a synthetic ``ls`` result is injected into the message
-history so the LLM is immediately aware of the current filesystem structure.
+"""Hybrid-search priority middleware for the SurfSense new chat agent.
+
+This middleware runs ``before_agent`` on every turn and writes:
+
+* ``state["kb_priority"]`` — the top-K most relevant documents for the
+  current user message, used to render a ``<priority_documents>`` system
+  message immediately before the user turn.
+* ``state["kb_matched_chunk_ids"]`` — internal hand-off mapping
+  (``Document.id`` → matched chunk IDs) consumed by
+  :class:`KBPostgresBackend._load_file_data` when the agent first reads each
+  document, so the XML wrapper can flag matched sections in
+  ``<chunk_index>``.
+
+The previous "scoped filesystem" behaviour (synthetic ``ls`` + state
+``files`` seeding) is intentionally removed: documents are now lazy-loaded
+from Postgres on demand, with the full workspace tree rendered separately
+by :class:`KnowledgeTreeMiddleware`.
+
+In anonymous mode the middleware skips hybrid search entirely and emits a
+single-entry priority list pointing at the Redis-loaded document
+(``state["kb_anon_doc"]``).
 """
 
 from __future__ import annotations
@@ -13,20 +27,28 @@
 import json
 import logging
 import re
-import uuid
 from collections.abc import Sequence
 from datetime import UTC, datetime
 from typing import Any
 
+from langchain.agents import create_agent
 from langchain.agents.middleware import AgentMiddleware, AgentState
 from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, ToolMessage
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
+from langchain_core.runnables import Runnable
 from langgraph.runtime import Runtime
 from litellm import token_counter
 from pydantic import BaseModel, Field, ValidationError
 from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.feature_flags import get_flags
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.path_resolver import (
+    PathIndex,
+    build_path_index,
+    doc_to_virtual_path,
+)
 from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range
 from app.db import (
     NATIVE_TO_LEGACY_DOCTYPE,
@@ -58,10 +80,17 @@ class KBSearchPlan(BaseModel):
         default=None,
         description="Optional ISO end date or datetime for KB search filtering.",
     )
+    is_recency_query: bool = Field(
+        default=False,
+        description=(
+            "True when the user's intent is primarily about recency or temporal "
+            "ordering (e.g. 'latest', 'newest', 'most recent', 'last uploaded') "
+            "rather than topical relevance."
+        ),
+    )
 
 
 def _extract_text_from_message(message: BaseMessage) -> str:
-    """Extract plain text from a message content."""
     content = getattr(message, "content", "")
     if isinstance(content, str):
         return content
@@ -76,19 +105,6 @@ def _extract_text_from_message(message: BaseMessage) -> str:
     return str(content)
 
 
-def _safe_filename(value: str, *, fallback: str = "untitled.xml") -> str:
-    """Convert arbitrary text into a filesystem-safe filename."""
-    name = re.sub(r"[\\/:*?\"<>|]+", "_", value).strip()
-    name = re.sub(r"\s+", " ", name)
-    if not name:
-        name = fallback
-    if len(name) > 180:
-        name = name[:180].rstrip()
-    if not name.lower().endswith(".xml"):
-        name = f"{name}.xml"
-    return name
-
-
 def _render_recent_conversation(
     messages: Sequence[BaseMessage],
     *,
@@ -98,10 +114,9 @@ def _render_recent_conversation(
 ) -> str:
     """Render recent dialogue for internal planning under a token budget.
 
-    Prefers the latest messages and uses the project's existing model-aware
-    token budgeting hooks when available on the LLM (`_count_tokens`,
-    `_get_max_input_tokens`). Falls back to the prior fixed-message heuristic
-    if token counting is unavailable.
+    Filters to ``HumanMessage`` and ``AIMessage`` (without tool_calls) so that
+    injected ``SystemMessage`` artefacts (priority list, workspace tree,
+    file-write contract) don't pollute the planner prompt.
     """
     rendered: list[tuple[str, str]] = []
     for message in messages:
@@ -124,8 +139,6 @@ def _render_recent_conversation(
     if not rendered:
         return ""
 
-    # Exclude the latest user message from "recent conversation" because it is
-    # already passed separately as "Latest user message" in the planner prompt.
     if rendered and rendered[-1][0] == "user" and rendered[-1][1] == user_text.strip():
         rendered = rendered[:-1]
 
@@ -207,8 +220,6 @@ def _count_prompt_tokens(conversation_text: str) -> int | None:
             selected_lines = candidate_lines
             continue
 
-        # If the full message does not fit, keep as much of this most-recent
-        # older message as possible via binary search.
         lo, hi = 1, len(text)
         best_line: str | None = None
         while lo <= hi:
@@ -240,12 +251,11 @@ def _build_kb_planner_prompt(
     recent_conversation: str,
     user_text: str,
 ) -> str:
-    """Build a compact internal prompt for KB query rewriting and date scoping."""
     today = datetime.now(UTC).date().isoformat()
     return (
         "You optimize internal knowledge-base search inputs for document retrieval.\n"
         "Return JSON only with this exact shape:\n"
-        '{"optimized_query":"string","start_date":"ISO string or null","end_date":"ISO string or null"}\n\n'
+        '{"optimized_query":"string","start_date":"ISO string or null","end_date":"ISO string or null","is_recency_query":bool}\n\n'
         "Rules:\n"
         "- Preserve the user's intent.\n"
         "- Rewrite the query to improve retrieval using concrete entities, acronyms, projects, tools, people, and document-specific terms when helpful.\n"
@@ -253,6 +263,11 @@ def _build_kb_planner_prompt(
         "- Only use date filters when the latest user request or recent dialogue clearly implies a time range.\n"
         "- If you use date filters, prefer returning both bounds.\n"
         "- If no date filter is useful, return null for both dates.\n"
+        '- Set "is_recency_query" to true ONLY when the user\'s primary intent is about '
+        "recency or temporal ordering rather than topical relevance. Examples: "
+        '"latest file", "newest upload", "most recent document", "what did I save last", '
+        '"show me files from today", "last thing I added". '
+        "When true, results will be sorted by date instead of relevance.\n"
         "- Do not include markdown, prose, or explanations.\n\n"
         f"Today's UTC date: {today}\n\n"
         f"Recent conversation:\n{recent_conversation or '(none)'}\n\n"
@@ -261,12 +276,10 @@ def _build_kb_planner_prompt(
 
 
 def _extract_json_payload(text: str) -> str:
-    """Extract a JSON object from a raw LLM response."""
     stripped = text.strip()
     fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", stripped, re.DOTALL)
     if fenced:
         return fenced.group(1)
-
     start = stripped.find("{")
     end = stripped.rfind("}")
     if start != -1 and end != -1 and end > start:
@@ -275,7 +288,6 @@ def _extract_json_payload(text: str) -> str:
 
 
 def _parse_kb_search_plan_response(response_text: str) -> KBSearchPlan:
-    """Parse and validate the planner's JSON response."""
     payload = json.loads(_extract_json_payload(response_text))
     return KBSearchPlan.model_validate(payload)
 
@@ -284,212 +296,19 @@ def _normalize_optional_date_range(
     start_date: str | None,
     end_date: str | None,
 ) -> tuple[datetime | None, datetime | None]:
-    """Normalize optional planner dates into a UTC datetime range."""
     parsed_start = parse_date_or_datetime(start_date) if start_date else None
     parsed_end = parse_date_or_datetime(end_date) if end_date else None
 
     if parsed_start is None and parsed_end is None:
         return None, None
 
-    resolved_start, resolved_end = resolve_date_range(parsed_start, parsed_end)
-    return resolved_start, resolved_end
-
-
-def _build_document_xml(
-    document: dict[str, Any],
-    matched_chunk_ids: set[int] | None = None,
-) -> str:
-    """Build citation-friendly XML with a ``<chunk_index>`` for smart seeking.
-
-    The ``<chunk_index>`` at the top of each document lists every chunk with its
-    line range inside ``<document_content>`` and flags chunks that directly
-    matched the search query (``matched="true"``).  This lets the LLM jump
-    straight to the most relevant section via ``read_file(offset=…, limit=…)``
-    instead of reading sequentially from the start.
-    """
-    matched = matched_chunk_ids or set()
-
-    doc_meta = document.get("document") or {}
-    metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {}
-    document_id = doc_meta.get("id", document.get("document_id", "unknown"))
-    document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN"))
-    title = doc_meta.get("title") or metadata.get("title") or "Untitled Document"
-    url = (
-        metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
-    )
-    metadata_json = json.dumps(metadata, ensure_ascii=False)
-
-    # --- 1. Metadata header (fixed structure) ---
-    metadata_lines: list[str] = [
-        "<document>",
-        "<document_metadata>",
-        f"  <document_id>{document_id}</document_id>",
-        f"  <document_type>{document_type}</document_type>",
-        f"  <title><![CDATA[{title}]]></title>",
-        f"  <url><![CDATA[{url}]]></url>",
-        f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
-        "</document_metadata>",
-        "",
-    ]
-
-    # --- 2. Pre-build chunk XML strings to compute line counts ---
-    chunks = document.get("chunks") or []
-    chunk_entries: list[tuple[int | None, str]] = []  # (chunk_id, xml_string)
-    if isinstance(chunks, list):
-        for chunk in chunks:
-            if not isinstance(chunk, dict):
-                continue
-            chunk_id = chunk.get("chunk_id") or chunk.get("id")
-            chunk_content = str(chunk.get("content", "")).strip()
-            if not chunk_content:
-                continue
-            if chunk_id is None:
-                xml = f"  <chunk><![CDATA[{chunk_content}]]></chunk>"
-            else:
-                xml = f"  <chunk id='{chunk_id}'><![CDATA[{chunk_content}]]></chunk>"
-            chunk_entries.append((chunk_id, xml))
-
-    # --- 3. Compute line numbers for every chunk ---
-    # Layout (1-indexed lines for read_file):
-    #   metadata_lines          -> len(metadata_lines) lines
-    #   <chunk_index>           -> 1 line
-    #   index entries           -> len(chunk_entries) lines
-    #   </chunk_index>          -> 1 line
-    #   (empty line)            -> 1 line
-    #   <document_content>      -> 1 line
-    #   chunk xml lines…
-    #   </document_content>     -> 1 line
-    #   </document>             -> 1 line
-    index_overhead = (
-        1 + len(chunk_entries) + 1 + 1 + 1
-    )  # tags + empty + <document_content>
-    first_chunk_line = len(metadata_lines) + index_overhead + 1  # 1-indexed
-
-    current_line = first_chunk_line
-    index_entry_lines: list[str] = []
-    for cid, xml_str in chunk_entries:
-        num_lines = xml_str.count("\n") + 1
-        end_line = current_line + num_lines - 1
-        matched_attr = ' matched="true"' if cid is not None and cid in matched else ""
-        if cid is not None:
-            index_entry_lines.append(
-                f'  <entry chunk_id="{cid}" lines="{current_line}-{end_line}"{matched_attr}/>'
-            )
-        else:
-            index_entry_lines.append(
-                f'  <entry lines="{current_line}-{end_line}"{matched_attr}/>'
-            )
-        current_line = end_line + 1
-
-    # --- 4. Assemble final XML ---
-    lines = metadata_lines.copy()
-    lines.append("<chunk_index>")
-    lines.extend(index_entry_lines)
-    lines.append("</chunk_index>")
-    lines.append("")
-    lines.append("<document_content>")
-    for _, xml_str in chunk_entries:
-        lines.append(xml_str)
-    lines.extend(["</document_content>", "</document>"])
-    return "\n".join(lines)
-
-
-async def _get_folder_paths(
-    session: AsyncSession, search_space_id: int
-) -> dict[int, str]:
-    """Return a map of folder_id -> virtual folder path under /documents."""
-    result = await session.execute(
-        select(Folder.id, Folder.name, Folder.parent_id).where(
-            Folder.search_space_id == search_space_id
-        )
-    )
-    rows = result.all()
-    by_id = {row.id: {"name": row.name, "parent_id": row.parent_id} for row in rows}
-
-    cache: dict[int, str] = {}
-
-    def resolve_path(folder_id: int) -> str:
-        if folder_id in cache:
-            return cache[folder_id]
-        parts: list[str] = []
-        cursor: int | None = folder_id
-        visited: set[int] = set()
-        while cursor is not None and cursor in by_id and cursor not in visited:
-            visited.add(cursor)
-            entry = by_id[cursor]
-            parts.append(
-                _safe_filename(str(entry["name"]), fallback="folder").removesuffix(
-                    ".xml"
-                )
-            )
-            cursor = entry["parent_id"]
-        parts.reverse()
-        path = "/documents/" + "/".join(parts) if parts else "/documents"
-        cache[folder_id] = path
-        return path
-
-    for folder_id in by_id:
-        resolve_path(folder_id)
-    return cache
-
-
-def _build_synthetic_ls(
-    existing_files: dict[str, Any] | None,
-    new_files: dict[str, Any],
-    *,
-    mentioned_paths: set[str] | None = None,
-) -> tuple[AIMessage, ToolMessage]:
-    """Build a synthetic ls("/documents") tool-call + result for the LLM context.
-
-    Mentioned files are listed first.  A separate header tells the LLM which
-    files the user explicitly selected; the path list itself stays clean so
-    paths can be passed directly to ``read_file`` without stripping tags.
-    """
-    _mentioned = mentioned_paths or set()
-    merged: dict[str, Any] = {**(existing_files or {}), **new_files}
-    doc_paths = [
-        p for p, v in merged.items() if p.startswith("/documents/") and v is not None
-    ]
-
-    new_set = set(new_files)
-    mentioned_list = [p for p in doc_paths if p in _mentioned]
-    new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned]
-    old_paths = [p for p in doc_paths if p not in new_set]
-    ordered = mentioned_list + new_non_mentioned + old_paths
-
-    parts: list[str] = []
-    if mentioned_list:
-        parts.append(
-            "USER-MENTIONED documents (read these thoroughly before answering):"
-        )
-        for p in mentioned_list:
-            parts.append(f"  {p}")
-        parts.append("")
-    parts.append(str(ordered) if ordered else "No documents found.")
-
-    tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}"
-    ai_msg = AIMessage(
-        content="",
-        tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}],
-    )
-    tool_msg = ToolMessage(
-        content="\n".join(parts),
-        tool_call_id=tool_call_id,
-    )
-    return ai_msg, tool_msg
+    return resolve_date_range(parsed_start, parsed_end)
 
 
 def _resolve_search_types(
     available_connectors: list[str] | None,
     available_document_types: list[str] | None,
 ) -> list[str] | None:
-    """Build a flat list of document-type strings for the chunk retriever.
-
-    Includes legacy equivalents from ``NATIVE_TO_LEGACY_DOCTYPE`` so that
-    old documents indexed under Composio names are still found.
-
-    Returns ``None`` when no filtering is desired (search all types).
-    """
     types: set[str] = set()
     if available_document_types:
         types.update(available_document_types)
@@ -506,6 +325,124 @@ def _resolve_search_types(
     return list(expanded) if expanded else None
 
 
+_RECENCY_MAX_CHUNKS_PER_DOC = 5
+
+
+async def browse_recent_documents(
+    *,
+    search_space_id: int,
+    document_type: list[str] | None = None,
+    top_k: int = 10,
+    start_date: datetime | None = None,
+    end_date: datetime | None = None,
+) -> list[dict[str, Any]]:
+    """Return documents ordered by recency (newest first), no relevance ranking."""
+    from sqlalchemy import func
+
+    from app.db import DocumentType
+
+    async with shielded_async_session() as session:
+        base_conditions = [
+            Document.search_space_id == search_space_id,
+            func.coalesce(Document.status["state"].astext, "ready") != "deleting",
+        ]
+
+        if document_type is not None:
+            import contextlib
+
+            doc_type_enums = []
+            for dt in document_type:
+                if isinstance(dt, str):
+                    with contextlib.suppress(KeyError):
+                        doc_type_enums.append(DocumentType[dt])
+                else:
+                    doc_type_enums.append(dt)
+            if doc_type_enums:
+                if len(doc_type_enums) == 1:
+                    base_conditions.append(Document.document_type == doc_type_enums[0])
+                else:
+                    base_conditions.append(Document.document_type.in_(doc_type_enums))
+
+        if start_date is not None:
+            base_conditions.append(Document.updated_at >= start_date)
+        if end_date is not None:
+            base_conditions.append(Document.updated_at <= end_date)
+
+        doc_query = (
+            select(Document)
+            .where(*base_conditions)
+            .order_by(Document.updated_at.desc())
+            .limit(top_k)
+        )
+        result = await session.execute(doc_query)
+        documents = result.scalars().unique().all()
+
+        if not documents:
+            return []
+
+        doc_ids = [d.id for d in documents]
+        numbered = (
+            select(
+                Chunk.id.label("chunk_id"),
+                Chunk.document_id,
+                Chunk.content,
+                func.row_number()
+                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .label("rn"),
+            )
+            .where(Chunk.document_id.in_(doc_ids))
+            .subquery("numbered")
+        )
+
+        chunk_query = (
+            select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content)
+            .where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC)
+            .order_by(numbered.c.document_id, numbered.c.chunk_id)
+        )
+        chunk_result = await session.execute(chunk_query)
+        fetched_chunks = chunk_result.all()
+
+    doc_chunks: dict[int, list[dict[str, Any]]] = {d.id: [] for d in documents}
+    for row in fetched_chunks:
+        if row.document_id in doc_chunks:
+            doc_chunks[row.document_id].append(
+                {"chunk_id": row.chunk_id, "content": row.content}
+            )
+
+    results: list[dict[str, Any]] = []
+    for doc in documents:
+        chunks_list = doc_chunks.get(doc.id, [])
+        metadata = doc.document_metadata or {}
+        results.append(
+            {
+                "document_id": doc.id,
+                "content": "\n\n".join(
+                    c["content"] for c in chunks_list if c.get("content")
+                ),
+                "score": 0.0,
+                "chunks": chunks_list,
+                "matched_chunk_ids": [],
+                "document": {
+                    "id": doc.id,
+                    "title": doc.title,
+                    "document_type": (
+                        doc.document_type.value
+                        if getattr(doc, "document_type", None)
+                        else None
+                    ),
+                    "metadata": metadata,
+                    "folder_id": getattr(doc, "folder_id", None),
+                },
+                "source": (
+                    doc.document_type.value
+                    if getattr(doc, "document_type", None)
+                    else None
+                ),
+            }
+        )
+    return results
+
+
 async def search_knowledge_base(
     *,
     query: str,
@@ -516,17 +453,11 @@ async def search_knowledge_base(
     start_date: datetime | None = None,
     end_date: datetime | None = None,
 ) -> list[dict[str, Any]]:
-    """Run a single unified hybrid search against the knowledge base.
-
-    Uses one ``ChucksHybridSearchRetriever`` call across all document types
-    instead of fanning out per-connector.  This reduces the number of DB
-    queries from ~10 to 2 (one RRF query + one chunk fetch).
-    """
+    """Run a single unified hybrid search against the knowledge base."""
     if not query:
         return []
 
     [embedding] = embed_texts([query])
-
     doc_types = _resolve_search_types(available_connectors, available_document_types)
     retriever_top_k = min(top_k * 3, 30)
 
@@ -550,14 +481,7 @@ async def fetch_mentioned_documents(
     document_ids: list[int],
     search_space_id: int,
 ) -> list[dict[str, Any]]:
-    """Fetch explicitly mentioned documents with *all* their chunks.
-
-    Returns the same dict structure as ``search_knowledge_base`` so results
-    can be merged directly into ``build_scoped_filesystem``.  Unlike search
-    results, every chunk is included (no top-K limiting) and none are marked
-    as ``matched`` since the entire document is relevant by virtue of the
-    user's explicit mention.
-    """
+    """Fetch explicitly mentioned documents."""
     if not document_ids:
         return []
 
@@ -607,6 +531,7 @@ async def fetch_mentioned_documents(
                         else None
                     ),
                     "metadata": metadata,
+                    "folder_id": getattr(doc, "folder_id", None),
                 },
                 "source": (
                     doc.document_type.value
@@ -619,95 +544,111 @@ async def fetch_mentioned_documents(
     return results
 
 
-async def build_scoped_filesystem(
-    *,
-    documents: Sequence[dict[str, Any]],
-    search_space_id: int,
-) -> tuple[dict[str, dict[str, str]], dict[int, str]]:
-    """Build a StateBackend-compatible files dict from search results.
-
-    Returns ``(files, doc_id_to_path)`` so callers can reliably map a
-    document id back to its filesystem path without guessing by title.
-    Paths are collision-proof: when two documents resolve to the same
-    path the doc-id is appended to disambiguate.
-    """
-    async with shielded_async_session() as session:
-        folder_paths = await _get_folder_paths(session, search_space_id)
-        doc_ids = [
-            (doc.get("document") or {}).get("id")
-            for doc in documents
-            if isinstance(doc, dict)
-        ]
-        doc_ids = [doc_id for doc_id in doc_ids if isinstance(doc_id, int)]
-        folder_by_doc_id: dict[int, int | None] = {}
-        if doc_ids:
-            doc_rows = await session.execute(
-                select(Document.id, Document.folder_id).where(
-                    Document.search_space_id == search_space_id,
-                    Document.id.in_(doc_ids),
-                )
-            )
-            folder_by_doc_id = {
-                row.id: row.folder_id for row in doc_rows.all() if row.id is not None
-            }
-
-    files: dict[str, dict[str, str]] = {}
-    doc_id_to_path: dict[int, str] = {}
-    for document in documents:
-        doc_meta = document.get("document") or {}
-        title = str(doc_meta.get("title") or "untitled")
-        doc_id = doc_meta.get("id")
-        folder_id = folder_by_doc_id.get(doc_id) if isinstance(doc_id, int) else None
-        base_folder = folder_paths.get(folder_id, "/documents")
-        file_name = _safe_filename(title)
-        path = f"{base_folder}/{file_name}"
-        if path in files:
-            stem = file_name.removesuffix(".xml")
-            path = f"{base_folder}/{stem} ({doc_id}).xml"
-        matched_ids = set(document.get("matched_chunk_ids") or [])
-        xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids)
-        files[path] = {
-            "content": xml_content.split("\n"),
-            "encoding": "utf-8",
-            "created_at": "",
-            "modified_at": "",
-        }
-        if isinstance(doc_id, int):
-            doc_id_to_path[doc_id] = path
-    return files, doc_id_to_path
+def _render_priority_message(priority: list[dict[str, Any]]) -> SystemMessage:
+    """Render the priority list as a single ``<priority_documents>`` system message."""
+    if not priority:
+        body = "(no priority documents for this turn)"
+    else:
+        lines: list[str] = []
+        for entry in priority:
+            score = entry.get("score")
+            mentioned = entry.get("mentioned")
+            score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
+            mark = " [USER-MENTIONED]" if mentioned else ""
+            lines.append(f"- {entry.get('path', '')} (score={score_str}){mark}")
+        body = "\n".join(lines)
+    return SystemMessage(
+        content=(
+            "<priority_documents>\n"
+            "These documents are most relevant to the latest user message; "
+            "read them first. Matched sections are flagged inside each "
+            "document's <chunk_index>.\n"
+            f"{body}\n"
+            "</priority_documents>"
+        )
+    )
 
 
-class KnowledgeBaseSearchMiddleware(AgentMiddleware):  # type: ignore[type-arg]
-    """Pre-agent middleware that always searches the KB and seeds a scoped filesystem."""
+class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Compute hybrid-search priority hints for the current turn."""
 
     tools = ()
+    state_schema = SurfSenseFilesystemState
 
     def __init__(
         self,
         *,
         llm: BaseChatModel | None = None,
         search_space_id: int,
+        filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
         available_connectors: list[str] | None = None,
         available_document_types: list[str] | None = None,
         top_k: int = 10,
         mentioned_document_ids: list[int] | None = None,
+        inject_system_message: bool = True,  # For backwards compatibility
     ) -> None:
         self.llm = llm
         self.search_space_id = search_space_id
+        self.filesystem_mode = filesystem_mode
         self.available_connectors = available_connectors
         self.available_document_types = available_document_types
         self.top_k = top_k
         self.mentioned_document_ids = mentioned_document_ids or []
+        self.inject_system_message = inject_system_message
+        # Build the kb-planner private Runnable ONCE here so we don't pay
+        # the ``create_agent`` compile cost (50-200ms) on every turn.
+        # Disabled by default behind ``enable_kb_planner_runnable``; when
+        # off the planner falls back to the legacy ``self.llm.ainvoke``
+        # path.
+        self._planner: Runnable | None = None
+        self._planner_compile_failed = False
+
+    def _build_kb_planner_runnable(self) -> Runnable | None:
+        """Compile the kb-planner private :class:`Runnable` once.
+
+        Returns ``None`` when the feature flag is disabled, when the LLM is
+        unavailable, or when ``create_agent`` raises (we fall back to the
+        legacy ``self.llm.ainvoke`` path in that case). Compilation happens
+        lazily on first call, then memoized via ``self._planner``.
+
+        The compiled agent is constructed without tools — the planner's
+        contract is "answer with structured JSON" — but it inherits the
+        :class:`RetryAfterMiddleware` so transient rate-limit errors
+        from the planner LLM call don't fail the whole turn.
+        """
+        if self._planner is not None or self._planner_compile_failed:
+            return self._planner
+        if self.llm is None:
+            return None
+        flags = get_flags()
+        if not flags.enable_kb_planner_runnable or flags.disable_new_agent_stack:
+            return None
+
+        from app.agents.new_chat.middleware.retry_after import RetryAfterMiddleware
+
+        try:
+            self._planner = create_agent(
+                self.llm,
+                tools=[],
+                middleware=[RetryAfterMiddleware(max_retries=2)],
+            )
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.warning(
+                "kb-planner Runnable compile failed; falling back to llm.ainvoke: %s",
+                exc,
+            )
+            self._planner_compile_failed = True
+            self._planner = None
+        return self._planner
 
     async def _plan_search_inputs(
         self,
         *,
         messages: Sequence[BaseMessage],
         user_text: str,
-    ) -> tuple[str, datetime | None, datetime | None]:
-        """Rewrite the KB query and infer optional date filters with the LLM."""
+    ) -> tuple[str, datetime | None, datetime | None, bool]:
         if self.llm is None:
-            return user_text, None, None
+            return user_text, None, None, False
 
         recent_conversation = _render_recent_conversation(
             messages,
@@ -721,11 +662,32 @@ async def _plan_search_inputs(
         loop = asyncio.get_running_loop()
         t0 = loop.time()
 
+        # Prefer the compiled-once planner Runnable when enabled; otherwise
+        # fall back to ``self.llm.ainvoke``. The ``surfsense:internal`` tag
+        # is preserved on both paths so ``_stream_agent_events`` still
+        # suppresses the planner's intermediate events from the UI.
+        planner = self._build_kb_planner_runnable()
         try:
-            response = await self.llm.ainvoke(
-                [HumanMessage(content=prompt)],
-                config={"tags": ["surfsense:internal"]},
-            )
+            if planner is not None:
+                planner_state = await planner.ainvoke(
+                    {"messages": [HumanMessage(content=prompt)]},
+                    config={"tags": ["surfsense:internal"]},
+                )
+                response_messages = (
+                    planner_state.get("messages", [])
+                    if isinstance(planner_state, dict)
+                    else []
+                )
+                response = (
+                    response_messages[-1]
+                    if response_messages
+                    else AIMessage(content="")
+                )
+            else:
+                response = await self.llm.ainvoke(
+                    [HumanMessage(content=prompt)],
+                    config={"tags": ["surfsense:internal"]},
+                )
             plan = _parse_kb_search_plan_response(_extract_text_from_message(response))
             optimized_query = (
                 re.sub(r"\s+", " ", plan.optimized_query).strip() or user_text
@@ -734,15 +696,18 @@ async def _plan_search_inputs(
                 plan.start_date,
                 plan.end_date,
             )
+            is_recency = plan.is_recency_query
             _perf_log.info(
-                "[kb_fs_middleware] planner in %.3fs query=%r optimized=%r start=%s end=%s",
+                "[kb_priority] planner in %.3fs query=%r optimized=%r "
+                "start=%s end=%s recency=%s",
                 loop.time() - t0,
                 user_text[:80],
                 optimized_query[:120],
                 start_date.isoformat() if start_date else None,
                 end_date.isoformat() if end_date else None,
+                is_recency,
             )
-            return optimized_query, start_date, end_date
+            return optimized_query, start_date, end_date, is_recency
         except (json.JSONDecodeError, ValidationError, ValueError) as exc:
             logger.warning(
                 "KB planner returned invalid output, using raw query: %s", exc
@@ -750,7 +715,7 @@ async def _plan_search_inputs(
         except Exception as exc:  # pragma: no cover - defensive fallback
             logger.warning("KB planner failed, using raw query: %s", exc)
 
-        return user_text, None, None
+        return user_text, None, None, False
 
     def before_agent(  # type: ignore[override]
         self,
@@ -770,91 +735,313 @@ async def abefore_agent(  # type: ignore[override]
         state: AgentState,
         runtime: Runtime[Any],
     ) -> dict[str, Any] | None:
-        del runtime
+        if self.filesystem_mode != FilesystemMode.CLOUD:
+            return None
+
         messages = state.get("messages") or []
         if not messages:
             return None
-        last_message = messages[-1]
-        if not isinstance(last_message, HumanMessage):
-            return None
 
-        user_text = _extract_text_from_message(last_message).strip()
+        last_human: HumanMessage | None = None
+        for msg in reversed(messages):
+            if isinstance(msg, HumanMessage):
+                last_human = msg
+                break
+        if last_human is None:
+            return None
+        user_text = _extract_text_from_message(last_human).strip()
         if not user_text:
             return None
 
-        t0 = _perf_log and asyncio.get_event_loop().time()
-        existing_files = state.get("files")
-        planned_query, start_date, end_date = await self._plan_search_inputs(
+        anon_doc = state.get("kb_anon_doc")
+        if anon_doc:
+            return self._anon_priority(state, anon_doc)
+
+        return await self._authenticated_priority(state, messages, user_text, runtime)
+
+    def _anon_priority(
+        self,
+        state: AgentState,
+        anon_doc: dict[str, Any],
+    ) -> dict[str, Any]:
+        path = str(anon_doc.get("path") or "")
+        title = str(anon_doc.get("title") or "uploaded_document")
+        priority = [
+            {
+                "path": path,
+                "score": 1.0,
+                "document_id": None,
+                "title": title,
+                "mentioned": True,
+            }
+        ]
+        update: dict[str, Any] = {
+            "kb_priority": priority,
+            "kb_matched_chunk_ids": {},
+        }
+        if self.inject_system_message:
+            new_messages = list(state.get("messages") or [])
+            insert_at = max(len(new_messages) - 1, 0)
+            new_messages.insert(insert_at, _render_priority_message(priority))
+            update["messages"] = new_messages
+        return update
+
+    async def _authenticated_priority(
+        self,
+        state: AgentState,
+        messages: Sequence[BaseMessage],
+        user_text: str,
+        runtime: Runtime[Any] | None = None,
+    ) -> dict[str, Any]:
+        t0 = asyncio.get_event_loop().time()
+        (
+            planned_query,
+            start_date,
+            end_date,
+            is_recency,
+        ) = await self._plan_search_inputs(
             messages=messages,
             user_text=user_text,
         )
 
-        # --- 1. Fetch mentioned documents (user-selected, all chunks) ---
+        # Per-turn ``mentioned_document_ids`` flow:
+        # 1. Preferred path (Phase 1.5+): read from ``runtime.context`` — the
+        #    streaming task supplies a fresh :class:`SurfSenseContextSchema`
+        #    on every ``astream_events`` call, so this list is naturally
+        #    scoped to the current turn. Allows cross-turn graph reuse via
+        #    ``agent_cache``.
+        # 2. Legacy fallback (cache disabled / context not propagated): the
+        #    constructor-injected ``self.mentioned_document_ids`` list. We
+        #    drain it after the first read so a cached graph (no Phase 1.5
+        #    wiring) doesn't keep replaying the same mentions on every
+        #    turn.
+        #
+        # CRITICAL: distinguish "context absent" (legacy caller, no field at
+        # all) from "context provided but empty" (turn with no mentions).
+        # ``ctx_mentions`` is a ``list[int]``; an empty list is falsy in
+        # Python, so a naive ``if ctx_mentions:`` would fall through to the
+        # legacy closure on every no-mention follow-up turn — replaying the
+        # mentions baked in by turn 1's cache-miss build. Always drain the
+        # closure once the runtime path has fired so a cached middleware
+        # instance can never resurrect stale state.
+        mention_ids: list[int] = []
+        ctx = getattr(runtime, "context", None) if runtime is not None else None
+        ctx_mentions = getattr(ctx, "mentioned_document_ids", None) if ctx else None
+        if ctx_mentions is not None:
+            # Runtime path is authoritative — even an empty list means
+            # "this turn has no mentions", NOT "look at the closure".
+            mention_ids = list(ctx_mentions)
+            if self.mentioned_document_ids:
+                self.mentioned_document_ids = []
+        elif self.mentioned_document_ids:
+            mention_ids = list(self.mentioned_document_ids)
+            self.mentioned_document_ids = []
+
+        # Folder mentions live alongside doc mentions on the runtime
+        # context. They never feed hybrid search (folders aren't
+        # embedded) — they're surfaced purely as ``[USER-MENTIONED]``
+        # priority entries so the agent walks the folder with ``ls`` /
+        # ``find_documents`` instead of ignoring it. Cloud filesystem
+        # mode only.
+        folder_mention_ids: list[int] = []
+        if (
+            ctx is not None
+            and getattr(self, "filesystem_mode", FilesystemMode.CLOUD)
+            == FilesystemMode.CLOUD
+        ):
+            ctx_folders = getattr(ctx, "mentioned_folder_ids", None)
+            if ctx_folders:
+                folder_mention_ids = list(ctx_folders)
+
         mentioned_results: list[dict[str, Any]] = []
-        if self.mentioned_document_ids:
+        if mention_ids:
             mentioned_results = await fetch_mentioned_documents(
-                document_ids=self.mentioned_document_ids,
+                document_ids=mention_ids,
                 search_space_id=self.search_space_id,
             )
-            # Clear after first turn so they are not re-fetched on subsequent
-            # messages within the same agent instance.
-            self.mentioned_document_ids = []
 
-        # --- 2. Run KB hybrid search ---
-        search_results = await search_knowledge_base(
-            query=planned_query,
-            search_space_id=self.search_space_id,
-            available_connectors=self.available_connectors,
-            available_document_types=self.available_document_types,
-            top_k=self.top_k,
-            start_date=start_date,
-            end_date=end_date,
-        )
+        if is_recency:
+            doc_types = _resolve_search_types(
+                self.available_connectors, self.available_document_types
+            )
+            search_results = await browse_recent_documents(
+                search_space_id=self.search_space_id,
+                document_type=doc_types,
+                top_k=self.top_k,
+                start_date=start_date,
+                end_date=end_date,
+            )
+        else:
+            search_results = await search_knowledge_base(
+                query=planned_query,
+                search_space_id=self.search_space_id,
+                available_connectors=self.available_connectors,
+                available_document_types=self.available_document_types,
+                top_k=self.top_k,
+                start_date=start_date,
+                end_date=end_date,
+            )
 
-        # --- 3. Merge: mentioned first, then search (dedup by doc id) ---
         seen_doc_ids: set[int] = set()
         merged: list[dict[str, Any]] = []
         for doc in mentioned_results:
             doc_id = (doc.get("document") or {}).get("id")
-            if doc_id is not None:
+            if isinstance(doc_id, int):
                 seen_doc_ids.add(doc_id)
             merged.append(doc)
         for doc in search_results:
             doc_id = (doc.get("document") or {}).get("id")
-            if doc_id is not None and doc_id in seen_doc_ids:
+            if isinstance(doc_id, int) and doc_id in seen_doc_ids:
                 continue
             merged.append(doc)
 
-        # --- 4. Build scoped filesystem ---
-        new_files, doc_id_to_path = await build_scoped_filesystem(
-            documents=merged,
-            search_space_id=self.search_space_id,
-        )
+        priority, matched_chunk_ids = await self._materialize_priority(merged)
 
-        # Identify which paths belong to user-mentioned documents using
-        # the authoritative doc_id -> path mapping (no title guessing).
-        mentioned_doc_ids = {
-            (d.get("document") or {}).get("id") for d in mentioned_results
-        }
-        mentioned_paths = {
-            doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path
-        }
+        if folder_mention_ids:
+            folder_entries = await self._materialize_folder_priority(folder_mention_ids)
+            priority = folder_entries + priority
 
-        ai_msg, tool_msg = _build_synthetic_ls(
-            existing_files,
-            new_files,
-            mentioned_paths=mentioned_paths,
+        _perf_log.info(
+            "[kb_priority] completed in %.3fs query=%r priority=%d mentioned=%d folders=%d",
+            asyncio.get_event_loop().time() - t0,
+            user_text[:80],
+            len(priority),
+            len(mentioned_results),
+            len(folder_mention_ids),
         )
 
-        if t0 is not None:
-            _perf_log.info(
-                "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r "
-                "mentioned=%d new_files=%d total=%d",
-                asyncio.get_event_loop().time() - t0,
-                user_text[:80],
-                planned_query[:120],
-                len(mentioned_results),
-                len(new_files),
-                len(new_files) + len(existing_files or {}),
+        update: dict[str, Any] = {
+            "kb_priority": priority,
+            "kb_matched_chunk_ids": matched_chunk_ids,
+        }
+        if self.inject_system_message:
+            new_messages = list(messages)
+            insert_at = max(len(new_messages) - 1, 0)
+            new_messages.insert(insert_at, _render_priority_message(priority))
+            update["messages"] = new_messages
+        return update
+
+    async def _materialize_folder_priority(
+        self, folder_ids: list[int]
+    ) -> list[dict[str, Any]]:
+        """Resolve user-mentioned folder ids to ``<priority_documents>`` entries.
+
+        Each entry uses the canonical ``/documents/Folder/Sub/`` virtual
+        path (matching ``KnowledgeTreeMiddleware`` and the agent's
+        ``ls`` adapter) and is flagged ``mentioned=True`` so the
+        rendered line carries ``[USER-MENTIONED]``. ``score`` is left
+        ``None`` so the renderer prints ``n/a`` — folders aren't
+        ranked, the agent decides which children to read.
+        """
+        if not folder_ids:
+            return []
+        async with shielded_async_session() as session:
+            index: PathIndex = await build_path_index(session, self.search_space_id)
+            folder_rows = await session.execute(
+                select(Folder.id, Folder.name).where(
+                    Folder.search_space_id == self.search_space_id,
+                    Folder.id.in_(folder_ids),
+                )
+            )
+            folder_titles: dict[int, str] = {
+                row.id: row.name for row in folder_rows.all()
+            }
+
+        entries: list[dict[str, Any]] = []
+        seen: set[int] = set()
+        for folder_id in folder_ids:
+            if folder_id in seen:
+                continue
+            seen.add(folder_id)
+            base = index.folder_paths.get(folder_id)
+            if base is None:
+                logger.debug(
+                    "kb_priority: dropping folder id=%s (missing from path index)",
+                    folder_id,
+                )
+                continue
+            path = base if base.endswith("/") else f"{base}/"
+            entries.append(
+                {
+                    "path": path,
+                    "score": None,
+                    "document_id": None,
+                    "folder_id": folder_id,
+                    "title": folder_titles.get(folder_id, ""),
+                    "mentioned": True,
+                }
+            )
+        return entries
+
+    async def _materialize_priority(
+        self, merged: list[dict[str, Any]]
+    ) -> tuple[list[dict[str, Any]], dict[int, list[int]]]:
+        """Resolve canonical paths and matched chunk ids for the priority list."""
+        priority: list[dict[str, Any]] = []
+        matched_chunk_ids: dict[int, list[int]] = {}
+
+        if not merged:
+            return priority, matched_chunk_ids
+
+        async with shielded_async_session() as session:
+            index: PathIndex = await build_path_index(session, self.search_space_id)
+            doc_ids = [
+                (doc.get("document") or {}).get("id")
+                for doc in merged
+                if isinstance(doc, dict)
+            ]
+            doc_ids = [doc_id for doc_id in doc_ids if isinstance(doc_id, int)]
+            folder_by_doc_id: dict[int, int | None] = {}
+            if doc_ids:
+                folder_rows = await session.execute(
+                    select(Document.id, Document.folder_id).where(
+                        Document.search_space_id == self.search_space_id,
+                        Document.id.in_(doc_ids),
+                    )
+                )
+                folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
+
+        for doc in merged:
+            doc_meta = doc.get("document") or {}
+            doc_id = doc_meta.get("id")
+            title = doc_meta.get("title") or "untitled"
+            folder_id = (
+                folder_by_doc_id.get(doc_id)
+                if isinstance(doc_id, int)
+                else doc_meta.get("folder_id")
+            )
+            path = doc_to_virtual_path(
+                doc_id=doc_id if isinstance(doc_id, int) else None,
+                title=str(title),
+                folder_id=folder_id if isinstance(folder_id, int) else None,
+                index=index,
+            )
+            priority.append(
+                {
+                    "path": path,
+                    "score": float(doc.get("score") or 0.0),
+                    "document_id": doc_id if isinstance(doc_id, int) else None,
+                    "title": str(title),
+                    "mentioned": bool(doc.get("_user_mentioned")),
+                }
             )
-        return {"files": new_files, "messages": [ai_msg, tool_msg]}
+            if isinstance(doc_id, int):
+                chunk_ids = doc.get("matched_chunk_ids") or []
+                if chunk_ids:
+                    matched_chunk_ids[doc_id] = [
+                        int(cid) for cid in chunk_ids if isinstance(cid, int | str)
+                    ]
+        return priority, matched_chunk_ids
+
+
+# Backwards-compatible alias for any external imports.
+KnowledgeBaseSearchMiddleware = KnowledgePriorityMiddleware
+
+
+__all__ = [
+    "KnowledgeBaseSearchMiddleware",
+    "KnowledgePriorityMiddleware",
+    "browse_recent_documents",
+    "fetch_mentioned_documents",
+    "search_knowledge_base",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_tree.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_tree.py
new file mode 100644
index 0000000000..4ed1e53f4d
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_tree.py
@@ -0,0 +1,315 @@
+"""Workspace-tree middleware for the SurfSense agent.
+
+Renders the full ``Folder``+``Document`` tree under ``/documents/`` once per
+turn (cloud only), caches it by ``(search_space_id, tree_version)``, and
+injects the result as a ``<workspace_tree>`` system message immediately
+before the latest human turn.
+
+The render is bounded by two truncation layers:
+
+1. **Entry cap** — at most ``MAX_TREE_ENTRIES`` lines. The remainder is
+   replaced with a "use ls" hint.
+2. **Token cap** — at most ``MAX_TREE_TOKENS`` tokens (using the LLM's
+   token-count profile when available). If the entry-truncated tree still
+   exceeds the token cap we fall back to a root-only summary.
+
+Anonymous mode renders only ``state['kb_anon_doc']`` (no DB calls).
+
+This middleware also performs a one-time initialization of ``state['cwd']``
+to ``"/documents"`` so subsequent middlewares and tools always see a valid
+cwd in cloud mode.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any
+
+from langchain.agents.middleware import AgentMiddleware, AgentState
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import SystemMessage
+from langgraph.runtime import Runtime
+from sqlalchemy import select
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    PathIndex,
+    build_path_index,
+    doc_to_virtual_path,
+)
+from app.db import Document, shielded_async_session
+
+try:
+    from litellm import token_counter
+except Exception:  # pragma: no cover - optional dep
+    token_counter = None  # type: ignore[assignment]
+
+logger = logging.getLogger(__name__)
+
+
+MAX_TREE_ENTRIES = 500
+MAX_TREE_TOKENS = 4000
+
+
+def _approx_tokens(text: str) -> int:
+    """Cheap fallback token estimate (1 token ~= 4 chars)."""
+    return max(1, (len(text) + 3) // 4)
+
+
+def _count_tokens(text: str, *, llm: BaseChatModel | None) -> int:
+    if llm is None:
+        return _approx_tokens(text)
+    count_fn = getattr(llm, "_count_tokens", None)
+    if callable(count_fn):
+        try:
+            return int(count_fn([{"role": "user", "content": text}]))
+        except Exception:
+            pass
+    profile = getattr(llm, "profile", None)
+    model_names: list[str] = []
+    if isinstance(profile, dict):
+        tcms = profile.get("token_count_models")
+        if isinstance(tcms, list):
+            model_names.extend(name for name in tcms if isinstance(name, str) and name)
+        tcm = profile.get("token_count_model")
+        if isinstance(tcm, str) and tcm and tcm not in model_names:
+            model_names.append(tcm)
+    model_name = model_names[0] if model_names else getattr(llm, "model", None)
+    if not isinstance(model_name, str) or not model_name or token_counter is None:
+        return _approx_tokens(text)
+    try:
+        return int(
+            token_counter(
+                messages=[{"role": "user", "content": text}],
+                model=model_name,
+            )
+        )
+    except Exception:
+        return _approx_tokens(text)
+
+
+class KnowledgeTreeMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Inject the workspace folder/document tree into the agent's context."""
+
+    tools = ()
+    state_schema = SurfSenseFilesystemState
+
+    def __init__(
+        self,
+        *,
+        search_space_id: int,
+        filesystem_mode: FilesystemMode,
+        llm: BaseChatModel | None = None,
+        max_entries: int = MAX_TREE_ENTRIES,
+        max_tokens: int = MAX_TREE_TOKENS,
+        inject_system_message: bool = True,  # For backwards compatibility
+    ) -> None:
+        self.search_space_id = search_space_id
+        self.filesystem_mode = filesystem_mode
+        self.llm = llm
+        self.max_entries = max_entries
+        self.max_tokens = max_tokens
+        self.inject_system_message = inject_system_message
+        self._cache: dict[tuple[int, int, bool], str] = {}
+
+    async def abefore_agent(  # type: ignore[override]
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        del runtime
+        if self.filesystem_mode != FilesystemMode.CLOUD:
+            return None
+
+        update: dict[str, Any] = {}
+        if not state.get("cwd"):
+            update["cwd"] = DOCUMENTS_ROOT
+
+        anon_doc = state.get("kb_anon_doc")
+        if anon_doc:
+            tree_msg = self._render_anon_tree(anon_doc)
+        else:
+            tree_msg = await self._render_kb_tree(state)
+
+        update["workspace_tree_text"] = tree_msg
+
+        if self.inject_system_message:
+            messages = list(state.get("messages") or [])
+            insert_at = max(len(messages) - 1, 0)
+            messages.insert(insert_at, SystemMessage(content=tree_msg))
+            update["messages"] = messages
+        return update
+
+    def before_agent(  # type: ignore[override]
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        try:
+            loop = asyncio.get_running_loop()
+            if loop.is_running():
+                return None
+        except RuntimeError:
+            pass
+        return asyncio.run(self.abefore_agent(state, runtime))
+
+    # ------------------------------------------------------------------ render
+
+    def _render_anon_tree(self, anon_doc: dict[str, Any]) -> str:
+        path = str(anon_doc.get("path") or "")
+        title = str(anon_doc.get("title") or "uploaded_document")
+        return (
+            "<workspace_tree>\n"
+            "Anonymous session — only one read-only document is available.\n"
+            f"{DOCUMENTS_ROOT}/\n"
+            f"  {path} — {title}\n"
+            "</workspace_tree>"
+        )
+
+    async def _render_kb_tree(self, state: AgentState) -> str:
+        version = int(state.get("tree_version") or 0)
+        cache_key = (self.search_space_id, version, False)
+        cached = self._cache.get(cache_key)
+        if cached is not None:
+            return cached
+
+        try:
+            async with shielded_async_session() as session:
+                index = await build_path_index(session, self.search_space_id)
+                doc_rows = await session.execute(
+                    select(Document.id, Document.title, Document.folder_id).where(
+                        Document.search_space_id == self.search_space_id
+                    )
+                )
+                docs = list(doc_rows.all())
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.warning("knowledge_tree: DB error %s", exc)
+            return "<workspace_tree>\n(unavailable)\n</workspace_tree>"
+
+        rendered = self._format_tree(index, docs)
+        self._cache[cache_key] = rendered
+        return rendered
+
+    def _format_tree(self, index: PathIndex, docs: list[Any]) -> str:
+        folder_paths = sorted(set(index.folder_paths.values()))
+        doc_paths = sorted(
+            doc_to_virtual_path(
+                doc_id=row.id,
+                title=str(row.title or "untitled"),
+                folder_id=row.folder_id,
+                index=index,
+            )
+            for row in docs
+        )
+        all_paths = sorted(set(folder_paths + doc_paths + [DOCUMENTS_ROOT]))
+
+        # Pre-compute which folders have at least one descendant (folder or doc).
+        # A folder is "empty" iff no path in `all_paths` is strictly under it.
+        # Used to emit an explicit "(empty)" marker so the LLM doesn't have to
+        # infer emptiness from indentation alone.
+        non_empty_folders = self._compute_non_empty_folders(folder_paths, doc_paths)
+
+        lines: list[str] = []
+        for path in all_paths:
+            depth = (
+                0
+                if path == DOCUMENTS_ROOT
+                else len([p for p in path[len(DOCUMENTS_ROOT) :].split("/") if p])
+            )
+            indent = "  " * depth
+            is_dir = path == DOCUMENTS_ROOT or path in folder_paths
+            display = (
+                path.rsplit("/", 1)[-1] if path != DOCUMENTS_ROOT else "/documents"
+            )
+            if is_dir:
+                if path != DOCUMENTS_ROOT and path not in non_empty_folders:
+                    lines.append(f"{indent}{display}/ (empty)")
+                else:
+                    lines.append(f"{indent}{display}/")
+            else:
+                lines.append(f"{indent}{display}")
+            if len(lines) >= self.max_entries:
+                remaining = len(all_paths) - len(lines)
+                if remaining > 0:
+                    lines.append(
+                        f"... {remaining} more entries — use "
+                        "ls('/documents/<folder>', offset, limit) to expand"
+                    )
+                break
+
+        body = "\n".join(lines)
+        rendered = f"<workspace_tree>\n{body}\n</workspace_tree>"
+
+        token_count = _count_tokens(rendered, llm=self.llm)
+        if token_count <= self.max_tokens:
+            return rendered
+
+        return self._format_root_summary(folder_paths, doc_paths)
+
+    @staticmethod
+    def _compute_non_empty_folders(
+        folder_paths: list[str], doc_paths: list[str]
+    ) -> set[str]:
+        """Return the set of folder paths that contain at least one descendant.
+
+        A folder is "non-empty" if any document path or any other folder path
+        is strictly under it. Documents propagate emptiness up to every
+        ancestor folder, while a sub-folder only marks its direct ancestors
+        non-empty (so a chain of empty folders all read ``(empty)``).
+        """
+        non_empty: set[str] = set()
+        folder_set = set(folder_paths)
+
+        for doc_path in doc_paths:
+            parent = doc_path.rsplit("/", 1)[0]
+            while parent and parent != DOCUMENTS_ROOT:
+                if parent in folder_set:
+                    non_empty.add(parent)
+                parent = parent.rsplit("/", 1)[0]
+
+        for child in folder_paths:
+            parent = child.rsplit("/", 1)[0]
+            while parent and parent != DOCUMENTS_ROOT and parent in folder_set:
+                non_empty.add(parent)
+                parent = parent.rsplit("/", 1)[0]
+
+        return non_empty
+
+    def _format_root_summary(
+        self, folder_paths: list[str], doc_paths: list[str]
+    ) -> str:
+        top_level: dict[str, int] = {}
+        loose_docs = 0
+        for path in doc_paths:
+            rel = path[len(DOCUMENTS_ROOT) :].lstrip("/")
+            if "/" in rel:
+                top = rel.split("/", 1)[0]
+                top_level[top] = top_level.get(top, 0) + 1
+            else:
+                loose_docs += 1
+        for path in folder_paths:
+            rel = path[len(DOCUMENTS_ROOT) :].lstrip("/")
+            if not rel:
+                continue
+            top = rel.split("/", 1)[0]
+            top_level.setdefault(top, 0)
+
+        lines = [DOCUMENTS_ROOT + "/"]
+        for name in sorted(top_level):
+            count = top_level[name]
+            lines.append(f"  {name}/ ({count} document{'s' if count != 1 else ''})")
+        if loose_docs:
+            lines.append(
+                f"  ({loose_docs} loose document{'s' if loose_docs != 1 else ''})"
+            )
+        lines.append(
+            "Tree is large; use list_tree('/documents/<folder>') to drill in "
+            "or ls('/documents/<folder>', offset, limit) for paginated listings."
+        )
+        return "<workspace_tree>\n" + "\n".join(lines) + "\n</workspace_tree>"
+
+
+__all__ = ["KnowledgeTreeMiddleware"]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/local_folder_backend.py b/surfsense_backend/app/agents/new_chat/middleware/local_folder_backend.py
new file mode 100644
index 0000000000..4db9943cbf
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/local_folder_backend.py
@@ -0,0 +1,613 @@
+"""Desktop local-folder filesystem backend for deepagents tools."""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import os
+import threading
+from collections import deque
+from contextlib import ExitStack
+from pathlib import Path
+from typing import Any
+
+from deepagents.backends.protocol import (
+    EditResult,
+    FileDownloadResponse,
+    FileInfo,
+    FileUploadResponse,
+    GrepMatch,
+    WriteResult,
+)
+from deepagents.backends.utils import (
+    create_file_data,
+    format_read_response,
+    perform_string_replacement,
+)
+
+_INVALID_PATH = "invalid_path"
+_FILE_NOT_FOUND = "file_not_found"
+_IS_DIRECTORY = "is_directory"
+
+
+class LocalFolderBackend:
+    """Filesystem backend rooted to a single local folder."""
+
+    def __init__(self, root_path: str) -> None:
+        root = Path(root_path).expanduser().resolve()
+        if not root.exists() or not root.is_dir():
+            msg = f"Local filesystem root does not exist or is not a directory: {root_path}"
+            raise ValueError(msg)
+        self._root = root
+        self._locks: dict[str, threading.Lock] = {}
+        self._locks_mu = threading.Lock()
+
+    def _lock_for(self, path: str) -> threading.Lock:
+        with self._locks_mu:
+            if path not in self._locks:
+                self._locks[path] = threading.Lock()
+            return self._locks[path]
+
+    def _resolve_virtual(self, virtual_path: str, *, allow_root: bool = False) -> Path:
+        if not virtual_path.startswith("/"):
+            msg = f"Invalid path (must be absolute): {virtual_path}"
+            raise ValueError(msg)
+        rel = virtual_path.lstrip("/")
+        candidate = self._root if rel == "" else (self._root / rel)
+        resolved = candidate.resolve()
+        if not allow_root and resolved == self._root:
+            msg = "Path must refer to a file or child directory under root"
+            raise ValueError(msg)
+        if not resolved.is_relative_to(self._root):
+            msg = f"Path escapes local filesystem root: {virtual_path}"
+            raise ValueError(msg)
+        return resolved
+
+    @staticmethod
+    def _to_virtual(path: Path, root: Path) -> str:
+        rel = path.relative_to(root).as_posix()
+        return "/" if rel == "." else f"/{rel}"
+
+    def _write_text_atomic(self, path: Path, content: str) -> None:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        temp_path = path.with_suffix(f"{path.suffix}.tmp")
+        temp_path.write_text(content, encoding="utf-8")
+        os.replace(temp_path, path)
+
+    def _acquire_path_locks(self, *paths: str) -> ExitStack:
+        ordered_paths = sorted(set(paths))
+        stack = ExitStack()
+        for path in ordered_paths:
+            stack.enter_context(self._lock_for(path))
+        return stack
+
+    @staticmethod
+    def _clamp_page_size(page_size: int) -> int:
+        return max(1, min(page_size, 1000))
+
+    def _read_dir_entries(self, directory_path: str) -> list[dict[str, Any]]:
+        directory = Path(directory_path)
+        try:
+            children = sorted(
+                directory.iterdir(),
+                key=lambda p: (not p.is_dir(), p.name.lower()),
+            )
+        except OSError:
+            return []
+
+        entries: list[dict[str, Any]] = []
+        for child in children:
+            try:
+                stat_result = child.stat()
+            except OSError:
+                continue
+            entries.append(
+                {
+                    "path": self._to_virtual(child, self._root),
+                    "is_dir": child.is_dir(),
+                    "size": stat_result.st_size if child.is_file() else 0,
+                    "modified_at": str(stat_result.st_mtime),
+                    "absolute_path": str(child),
+                }
+            )
+        return entries
+
+    def ls_info(self, path: str) -> list[FileInfo]:
+        try:
+            target = self._resolve_virtual(path, allow_root=True)
+        except ValueError:
+            return []
+        if not target.exists() or not target.is_dir():
+            return []
+        infos: list[FileInfo] = []
+        for child in sorted(
+            target.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower())
+        ):
+            infos.append(
+                FileInfo(
+                    path=self._to_virtual(child, self._root),
+                    is_dir=child.is_dir(),
+                    size=child.stat().st_size if child.is_file() else 0,
+                    modified_at=str(child.stat().st_mtime),
+                )
+            )
+        return infos
+
+    async def als_info(self, path: str) -> list[FileInfo]:
+        return await asyncio.to_thread(self.ls_info, path)
+
+    def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str:
+        try:
+            path = self._resolve_virtual(file_path)
+        except ValueError:
+            return f"Error: Invalid path '{file_path}'"
+        if not path.exists():
+            return f"Error: File '{file_path}' not found"
+        if not path.is_file():
+            return f"Error: Path '{file_path}' is not a file"
+        content = path.read_text(encoding="utf-8", errors="replace")
+        file_data = create_file_data(content)
+        return format_read_response(file_data, offset, limit)
+
+    async def aread(self, file_path: str, offset: int = 0, limit: int = 2000) -> str:
+        return await asyncio.to_thread(self.read, file_path, offset, limit)
+
+    def read_raw(self, file_path: str) -> str:
+        """Read raw file text without line-number formatting."""
+        try:
+            path = self._resolve_virtual(file_path)
+        except ValueError:
+            return f"Error: Invalid path '{file_path}'"
+        if not path.exists():
+            return f"Error: File '{file_path}' not found"
+        if not path.is_file():
+            return f"Error: Path '{file_path}' is not a file"
+        return path.read_text(encoding="utf-8", errors="replace")
+
+    async def aread_raw(self, file_path: str) -> str:
+        """Async variant of read_raw."""
+        return await asyncio.to_thread(self.read_raw, file_path)
+
+    def write(self, file_path: str, content: str) -> WriteResult:
+        try:
+            path = self._resolve_virtual(file_path)
+        except ValueError:
+            return WriteResult(error=f"Error: Invalid path '{file_path}'")
+        lock = self._lock_for(file_path)
+        with lock:
+            if path.exists():
+                return WriteResult(
+                    error=(
+                        f"Cannot write to {file_path} because it already exists. "
+                        "Read and then make an edit, or write to a new path."
+                    )
+                )
+            parent = path.parent
+            if not parent.exists() or not parent.is_dir():
+                return WriteResult(
+                    error=(
+                        f"Error: parent directory for '{file_path}' does not exist. "
+                        "Create the folder first or write to an existing directory."
+                    )
+                )
+            self._write_text_atomic(path, content)
+        return WriteResult(path=file_path, files_update=None)
+
+    async def awrite(self, file_path: str, content: str) -> WriteResult:
+        return await asyncio.to_thread(self.write, file_path, content)
+
+    def list_tree(
+        self,
+        path: str = "/",
+        *,
+        max_depth: int | None = 8,
+        page_size: int = 500,
+        include_files: bool = True,
+        include_dirs: bool = True,
+    ) -> dict[str, Any]:
+        if not include_files and not include_dirs:
+            return {
+                "entries": [],
+                "truncated": False,
+            }
+
+        normalized_depth = None if max_depth is None else max(0, int(max_depth))
+        page_limit = self._clamp_page_size(int(page_size))
+        try:
+            start = self._resolve_virtual(path, allow_root=True)
+        except ValueError:
+            return {"error": f"Error: invalid path '{path}'"}
+        if not start.exists():
+            return {"error": f"Error: path '{path}' not found"}
+        if start.is_file():
+            stat_result = start.stat()
+            if include_files:
+                return {
+                    "entries": [
+                        {
+                            "path": self._to_virtual(start, self._root),
+                            "is_dir": False,
+                            "size": stat_result.st_size,
+                            "modified_at": str(stat_result.st_mtime),
+                            "depth": 0,
+                        }
+                    ],
+                    "truncated": False,
+                }
+            return {
+                "entries": [],
+                "truncated": False,
+            }
+
+        pending_dirs: deque[tuple[str, int]] = deque([(str(start), 0)])
+        entries: list[dict[str, Any]] = []
+        truncated = False
+        while pending_dirs and not truncated:
+            next_dir_path, next_depth = pending_dirs.popleft()
+            active_entries = self._read_dir_entries(next_dir_path)
+            for item in active_entries:
+                item_depth = next_depth + 1
+                if normalized_depth is not None and item_depth > normalized_depth:
+                    continue
+                if item["is_dir"]:
+                    if normalized_depth is None or item_depth <= normalized_depth:
+                        pending_dirs.append((item["absolute_path"], item_depth))
+                    if include_dirs:
+                        entries.append(
+                            {
+                                "path": item["path"],
+                                "is_dir": True,
+                                "size": 0,
+                                "modified_at": item["modified_at"],
+                                "depth": item_depth,
+                            }
+                        )
+                elif include_files:
+                    entries.append(
+                        {
+                            "path": item["path"],
+                            "is_dir": False,
+                            "size": item["size"],
+                            "modified_at": item["modified_at"],
+                            "depth": item_depth,
+                        }
+                    )
+                if len(entries) >= page_limit:
+                    truncated = True
+                    break
+
+        return {
+            "entries": entries,
+            "truncated": truncated,
+        }
+
+    async def alist_tree(
+        self,
+        path: str = "/",
+        *,
+        max_depth: int | None = 8,
+        page_size: int = 500,
+        include_files: bool = True,
+        include_dirs: bool = True,
+    ) -> dict[str, Any]:
+        return await asyncio.to_thread(
+            self.list_tree,
+            path,
+            max_depth=max_depth,
+            page_size=page_size,
+            include_files=include_files,
+            include_dirs=include_dirs,
+        )
+
+    def move(
+        self,
+        source_path: str,
+        destination_path: str,
+        overwrite: bool = False,
+    ) -> WriteResult:
+        try:
+            source = self._resolve_virtual(source_path)
+            destination = self._resolve_virtual(destination_path)
+        except ValueError:
+            return WriteResult(
+                error=(
+                    f"Error: invalid source '{source_path}' or destination "
+                    f"'{destination_path}' path"
+                )
+            )
+        if source == destination:
+            return WriteResult(error="Error: source and destination paths are the same")
+        with self._acquire_path_locks(source_path, destination_path):
+            if not source.exists():
+                return WriteResult(
+                    error=f"Error: source path '{source_path}' not found"
+                )
+            if destination.exists():
+                if not overwrite:
+                    return WriteResult(
+                        error=(
+                            f"Error: destination path '{destination_path}' already exists. "
+                            "Set overwrite=True to replace files."
+                        )
+                    )
+                if source.is_dir() or destination.is_dir():
+                    return WriteResult(
+                        error=(
+                            "Error: overwrite=True is only supported for file-to-file moves."
+                        )
+                    )
+            destination.parent.mkdir(parents=True, exist_ok=True)
+            try:
+                if overwrite:
+                    os.replace(source, destination)
+                else:
+                    source.rename(destination)
+            except OSError as exc:
+                return WriteResult(
+                    error=f"Error: failed to move '{source_path}': {exc}"
+                )
+        return WriteResult(
+            path=self._to_virtual(destination, self._root), files_update=None
+        )
+
+    async def amove(
+        self,
+        source_path: str,
+        destination_path: str,
+        overwrite: bool = False,
+    ) -> WriteResult:
+        return await asyncio.to_thread(
+            self.move, source_path, destination_path, overwrite
+        )
+
+    def delete_file(self, file_path: str) -> WriteResult:
+        """Hard-delete a single file under root.
+
+        Refuses directories, root, and missing paths. Roughly mirrors POSIX
+        ``rm path``; ``-r`` recursion and glob expansion are explicitly
+        out of scope.
+        """
+        try:
+            path = self._resolve_virtual(file_path)
+        except ValueError:
+            return WriteResult(error=f"Error: Invalid path '{file_path}'")
+        with self._lock_for(file_path):
+            if not path.exists():
+                return WriteResult(error=f"Error: File '{file_path}' not found")
+            if path.is_dir():
+                return WriteResult(
+                    error=(
+                        f"Error: '{file_path}' is a directory. "
+                        "Use rmdir for empty directories."
+                    )
+                )
+            try:
+                os.unlink(path)
+            except OSError as exc:
+                return WriteResult(
+                    error=f"Error: failed to delete '{file_path}': {exc}"
+                )
+        return WriteResult(path=file_path, files_update=None)
+
+    async def adelete_file(self, file_path: str) -> WriteResult:
+        return await asyncio.to_thread(self.delete_file, file_path)
+
+    def rmdir(self, dir_path: str) -> WriteResult:
+        """Hard-delete an empty directory under root.
+
+        Refuses files, root, missing paths, and non-empty directories.
+        ``os.rmdir`` is naturally empty-only; we pre-check so the error is
+        clearer for the agent.
+        """
+        try:
+            path = self._resolve_virtual(dir_path)
+        except ValueError:
+            return WriteResult(error=f"Error: Invalid path '{dir_path}'")
+        with self._lock_for(dir_path):
+            if not path.exists():
+                return WriteResult(error=f"Error: Directory '{dir_path}' not found")
+            if not path.is_dir():
+                return WriteResult(error=f"Error: '{dir_path}' is not a directory")
+            try:
+                next(path.iterdir())
+            except StopIteration:
+                pass
+            else:
+                return WriteResult(
+                    error=(
+                        f"Error: directory '{dir_path}' is not empty. "
+                        "Remove its contents first."
+                    )
+                )
+            try:
+                os.rmdir(path)
+            except OSError as exc:
+                return WriteResult(error=f"Error: failed to rmdir '{dir_path}': {exc}")
+        return WriteResult(path=dir_path, files_update=None)
+
+    async def armdir(self, dir_path: str) -> WriteResult:
+        return await asyncio.to_thread(self.rmdir, dir_path)
+
+    def edit(
+        self,
+        file_path: str,
+        old_string: str,
+        new_string: str,
+        replace_all: bool = False,
+    ) -> EditResult:
+        try:
+            path = self._resolve_virtual(file_path)
+        except ValueError:
+            return EditResult(error=f"Error: Invalid path '{file_path}'")
+        lock = self._lock_for(file_path)
+        with lock:
+            if not path.exists() or not path.is_file():
+                return EditResult(error=f"Error: File '{file_path}' not found")
+            content = path.read_text(encoding="utf-8", errors="replace")
+            result = perform_string_replacement(
+                content, old_string, new_string, replace_all
+            )
+            if isinstance(result, str):
+                return EditResult(error=result)
+            updated_content, occurrences = result
+            self._write_text_atomic(path, updated_content)
+        return EditResult(
+            path=file_path, files_update=None, occurrences=int(occurrences)
+        )
+
+    async def aedit(
+        self,
+        file_path: str,
+        old_string: str,
+        new_string: str,
+        replace_all: bool = False,
+    ) -> EditResult:
+        return await asyncio.to_thread(
+            self.edit, file_path, old_string, new_string, replace_all
+        )
+
+    def glob_info(self, pattern: str, path: str = "/") -> list[FileInfo]:
+        try:
+            base = self._resolve_virtual(path, allow_root=True)
+        except ValueError:
+            return []
+
+        if pattern.startswith("/"):
+            search_base = self._root
+            normalized_pattern = pattern.lstrip("/")
+        else:
+            search_base = base
+            normalized_pattern = pattern
+
+        matches: list[FileInfo] = []
+        for hit in search_base.glob(normalized_pattern):
+            try:
+                resolved = hit.resolve()
+                if not resolved.is_relative_to(self._root):
+                    continue
+            except Exception:
+                continue
+            matches.append(
+                FileInfo(
+                    path=self._to_virtual(resolved, self._root),
+                    is_dir=resolved.is_dir(),
+                    size=resolved.stat().st_size if resolved.is_file() else 0,
+                    modified_at=str(resolved.stat().st_mtime),
+                )
+            )
+        return matches
+
+    async def aglob_info(self, pattern: str, path: str = "/") -> list[FileInfo]:
+        return await asyncio.to_thread(self.glob_info, pattern, path)
+
+    def _iter_candidate_files(self, path: str | None, glob: str | None) -> list[Path]:
+        base_virtual = path or "/"
+        try:
+            base = self._resolve_virtual(base_virtual, allow_root=True)
+        except ValueError:
+            return []
+        if not base.exists():
+            return []
+
+        candidates = [p for p in base.rglob("*") if p.is_file()]
+        if glob:
+            candidates = [
+                p
+                for p in candidates
+                if fnmatch.fnmatch(self._to_virtual(p, self._root), glob)
+                or fnmatch.fnmatch(p.name, glob)
+            ]
+        return candidates
+
+    def grep_raw(
+        self, pattern: str, path: str | None = None, glob: str | None = None
+    ) -> list[GrepMatch] | str:
+        if not pattern:
+            return "Error: pattern cannot be empty"
+        matches: list[GrepMatch] = []
+        for file_path in self._iter_candidate_files(path, glob):
+            try:
+                lines = file_path.read_text(
+                    encoding="utf-8", errors="replace"
+                ).splitlines()
+            except Exception:
+                continue
+            for idx, line in enumerate(lines, start=1):
+                if pattern in line:
+                    matches.append(
+                        GrepMatch(
+                            path=self._to_virtual(file_path, self._root),
+                            line=idx,
+                            text=line,
+                        )
+                    )
+        return matches
+
+    async def agrep_raw(
+        self, pattern: str, path: str | None = None, glob: str | None = None
+    ) -> list[GrepMatch] | str:
+        return await asyncio.to_thread(self.grep_raw, pattern, path, glob)
+
+    def upload_files(self, files: list[tuple[str, bytes]]) -> list[FileUploadResponse]:
+        responses: list[FileUploadResponse] = []
+        for virtual_path, content in files:
+            try:
+                target = self._resolve_virtual(virtual_path)
+                target.parent.mkdir(parents=True, exist_ok=True)
+                temp_path = target.with_suffix(f"{target.suffix}.tmp")
+                temp_path.write_bytes(content)
+                os.replace(temp_path, target)
+                responses.append(FileUploadResponse(path=virtual_path, error=None))
+            except FileNotFoundError:
+                responses.append(
+                    FileUploadResponse(path=virtual_path, error=_FILE_NOT_FOUND)
+                )
+            except IsADirectoryError:
+                responses.append(
+                    FileUploadResponse(path=virtual_path, error=_IS_DIRECTORY)
+                )
+            except Exception:
+                responses.append(
+                    FileUploadResponse(path=virtual_path, error=_INVALID_PATH)
+                )
+        return responses
+
+    async def aupload_files(
+        self, files: list[tuple[str, bytes]]
+    ) -> list[FileUploadResponse]:
+        return await asyncio.to_thread(self.upload_files, files)
+
+    def download_files(self, paths: list[str]) -> list[FileDownloadResponse]:
+        responses: list[FileDownloadResponse] = []
+        for virtual_path in paths:
+            try:
+                target = self._resolve_virtual(virtual_path)
+                if not target.exists():
+                    responses.append(
+                        FileDownloadResponse(
+                            path=virtual_path, content=None, error=_FILE_NOT_FOUND
+                        )
+                    )
+                    continue
+                if target.is_dir():
+                    responses.append(
+                        FileDownloadResponse(
+                            path=virtual_path, content=None, error=_IS_DIRECTORY
+                        )
+                    )
+                    continue
+                responses.append(
+                    FileDownloadResponse(
+                        path=virtual_path, content=target.read_bytes(), error=None
+                    )
+                )
+            except Exception:
+                responses.append(
+                    FileDownloadResponse(
+                        path=virtual_path, content=None, error=_INVALID_PATH
+                    )
+                )
+        return responses
+
+    async def adownload_files(self, paths: list[str]) -> list[FileDownloadResponse]:
+        return await asyncio.to_thread(self.download_files, paths)
diff --git a/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py b/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py
new file mode 100644
index 0000000000..6179adccdc
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py
@@ -0,0 +1,138 @@
+"""Memory injection middleware for the SurfSense agent.
+
+Injects memory markdown into the system prompt on every turn:
+- Private threads: only personal memory (<user_memory>)
+- Shared threads: only team memory (<team_memory>)
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+from uuid import UUID
+
+from langchain.agents.middleware import AgentMiddleware, AgentState
+from langchain_core.messages import HumanMessage, SystemMessage
+from langgraph.runtime import Runtime
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.tools.update_memory import MEMORY_HARD_LIMIT, MEMORY_SOFT_LIMIT
+from app.db import ChatVisibility, SearchSpace, User, shielded_async_session
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryInjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Injects memory markdown into the conversation on every turn."""
+
+    tools = ()
+
+    def __init__(
+        self,
+        *,
+        user_id: str | UUID | None,
+        search_space_id: int,
+        thread_visibility: ChatVisibility | None = None,
+    ) -> None:
+        self.user_id = UUID(user_id) if isinstance(user_id, str) else user_id
+        self.search_space_id = search_space_id
+        self.visibility = thread_visibility or ChatVisibility.PRIVATE
+
+    async def abefore_agent(  # type: ignore[override]
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        del runtime
+        messages = state.get("messages") or []
+        if not messages:
+            return None
+
+        last_message = messages[-1]
+        if not isinstance(last_message, HumanMessage):
+            return None
+
+        memory_blocks: list[str] = []
+
+        async with shielded_async_session() as session:
+            if self.visibility == ChatVisibility.SEARCH_SPACE:
+                team_memory = await self._load_team_memory(session)
+                if team_memory:
+                    chars = len(team_memory)
+                    memory_blocks.append(
+                        f'<team_memory chars="{chars}" limit="{MEMORY_HARD_LIMIT}">\n'
+                        f"{team_memory}\n"
+                        f"</team_memory>"
+                    )
+                    if chars > MEMORY_SOFT_LIMIT:
+                        memory_blocks.append(
+                            f"<memory_warning>Team memory is at "
+                            f"{chars:,}/{MEMORY_HARD_LIMIT:,} characters and approaching "
+                            f"the hard limit. On your next update_memory call, consolidate "
+                            f"by merging duplicates, removing outdated entries, and "
+                            f"shortening descriptions before adding anything new."
+                            f"</memory_warning>"
+                        )
+            elif self.user_id is not None:
+                user_memory, display_name = await self._load_user_memory(session)
+                if display_name and display_name.strip():
+                    first_name = display_name.strip().split()[0]
+                    memory_blocks.append(f"<user_name>{first_name}</user_name>")
+                if user_memory:
+                    chars = len(user_memory)
+                    memory_blocks.append(
+                        f'<user_memory chars="{chars}" limit="{MEMORY_HARD_LIMIT}">\n'
+                        f"{user_memory}\n"
+                        f"</user_memory>"
+                    )
+                    if chars > MEMORY_SOFT_LIMIT:
+                        memory_blocks.append(
+                            f"<memory_warning>Your personal memory is at "
+                            f"{chars:,}/{MEMORY_HARD_LIMIT:,} characters and approaching "
+                            f"the hard limit. On your next update_memory call, consolidate "
+                            f"by merging duplicates, removing outdated entries, and "
+                            f"shortening descriptions before adding anything new."
+                            f"</memory_warning>"
+                        )
+
+        if not memory_blocks:
+            return None
+
+        memory_text = "\n\n".join(memory_blocks)
+        memory_msg = SystemMessage(content=memory_text)
+
+        new_messages = list(messages)
+        insert_idx = 1 if len(new_messages) > 1 else 0
+        new_messages.insert(insert_idx, memory_msg)
+
+        return {"messages": new_messages}
+
+    async def _load_user_memory(
+        self, session: AsyncSession
+    ) -> tuple[str | None, str | None]:
+        """Return (memory_content, display_name)."""
+        try:
+            result = await session.execute(
+                select(User.memory_md, User.display_name).where(User.id == self.user_id)
+            )
+            row = result.one_or_none()
+            if row is None:
+                return None, None
+            return row.memory_md or None, row.display_name
+        except Exception:
+            logger.exception("Failed to load user memory")
+            return None, None
+
+    async def _load_team_memory(self, session: AsyncSession) -> str | None:
+        try:
+            result = await session.execute(
+                select(SearchSpace.shared_memory_md).where(
+                    SearchSpace.id == self.search_space_id
+                )
+            )
+            row = result.scalar_one_or_none()
+            return row if row else None
+        except Exception:
+            logger.exception("Failed to load team memory")
+            return None
diff --git a/surfsense_backend/app/agents/new_chat/middleware/multi_root_local_folder_backend.py b/surfsense_backend/app/agents/new_chat/middleware/multi_root_local_folder_backend.py
new file mode 100644
index 0000000000..a5add62487
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/multi_root_local_folder_backend.py
@@ -0,0 +1,489 @@
+"""Aggregate multiple LocalFolderBackend roots behind mount-prefixed virtual paths."""
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+from typing import Any
+
+from deepagents.backends.protocol import (
+    EditResult,
+    FileDownloadResponse,
+    FileInfo,
+    FileUploadResponse,
+    GrepMatch,
+    WriteResult,
+)
+
+from app.agents.new_chat.middleware.local_folder_backend import LocalFolderBackend
+
+_INVALID_PATH = "invalid_path"
+_FILE_NOT_FOUND = "file_not_found"
+_IS_DIRECTORY = "is_directory"
+
+
+class MultiRootLocalFolderBackend:
+    """Route filesystem operations to one of several mounted local roots.
+
+    Virtual paths are namespaced as:
+    - `/<mount>/...`
+    where `<mount>` is derived from each selected root folder name.
+    """
+
+    def __init__(self, mounts: tuple[tuple[str, str], ...]) -> None:
+        if not mounts:
+            msg = "At least one local mount is required"
+            raise ValueError(msg)
+        self._mount_to_backend: dict[str, LocalFolderBackend] = {}
+        for raw_mount, raw_root in mounts:
+            mount = raw_mount.strip()
+            if not mount:
+                msg = "Mount id cannot be empty"
+                raise ValueError(msg)
+            if mount in self._mount_to_backend:
+                msg = f"Duplicate mount id: {mount}"
+                raise ValueError(msg)
+            normalized_root = str(Path(raw_root).expanduser().resolve())
+            self._mount_to_backend[mount] = LocalFolderBackend(normalized_root)
+        self._mount_order = tuple(self._mount_to_backend.keys())
+
+    def list_mounts(self) -> tuple[str, ...]:
+        return self._mount_order
+
+    def default_mount(self) -> str:
+        return self._mount_order[0]
+
+    def _mount_error(self) -> str:
+        mounts = ", ".join(f"/{mount}" for mount in self._mount_order)
+        return (
+            "Path must start with one of the selected folders: "
+            f"{mounts}. Example: /{self._mount_order[0]}/file.txt"
+        )
+
+    def _split_mount_path(self, virtual_path: str) -> tuple[str, str]:
+        if not virtual_path.startswith("/"):
+            msg = f"Invalid path (must be absolute): {virtual_path}"
+            raise ValueError(msg)
+        rel = virtual_path.lstrip("/")
+        if not rel:
+            raise ValueError(self._mount_error())
+        mount, _, remainder = rel.partition("/")
+        backend = self._mount_to_backend.get(mount)
+        if backend is None:
+            raise ValueError(self._mount_error())
+        local_path = f"/{remainder}" if remainder else "/"
+        return mount, local_path
+
+    @staticmethod
+    def _prefix_mount_path(mount: str, local_path: str) -> str:
+        if local_path == "/":
+            return f"/{mount}"
+        return f"/{mount}{local_path}"
+
+    @staticmethod
+    def _get_value(item: Any, key: str) -> Any:
+        if isinstance(item, dict):
+            return item.get(key)
+        return getattr(item, key, None)
+
+    @classmethod
+    def _get_str(cls, item: Any, key: str) -> str:
+        value = cls._get_value(item, key)
+        return value if isinstance(value, str) else ""
+
+    @classmethod
+    def _get_int(cls, item: Any, key: str) -> int:
+        value = cls._get_value(item, key)
+        return int(value) if isinstance(value, int | float) else 0
+
+    @classmethod
+    def _get_bool(cls, item: Any, key: str) -> bool:
+        value = cls._get_value(item, key)
+        return bool(value)
+
+    def _list_mount_roots(self) -> list[FileInfo]:
+        return [
+            FileInfo(path=f"/{mount}", is_dir=True, size=0, modified_at="0")
+            for mount in self._mount_order
+        ]
+
+    def _transform_infos(self, mount: str, infos: list[FileInfo]) -> list[FileInfo]:
+        transformed: list[FileInfo] = []
+        for info in infos:
+            transformed.append(
+                FileInfo(
+                    path=self._prefix_mount_path(mount, self._get_str(info, "path")),
+                    is_dir=self._get_bool(info, "is_dir"),
+                    size=self._get_int(info, "size"),
+                    modified_at=self._get_str(info, "modified_at"),
+                )
+            )
+        return transformed
+
+    def ls_info(self, path: str) -> list[FileInfo]:
+        if path == "/":
+            return self._list_mount_roots()
+        try:
+            mount, local_path = self._split_mount_path(path)
+        except ValueError:
+            return []
+        return self._transform_infos(
+            mount, self._mount_to_backend[mount].ls_info(local_path)
+        )
+
+    async def als_info(self, path: str) -> list[FileInfo]:
+        return await asyncio.to_thread(self.ls_info, path)
+
+    def list_tree(
+        self,
+        path: str = "/",
+        *,
+        max_depth: int | None = 8,
+        page_size: int = 500,
+        include_files: bool = True,
+        include_dirs: bool = True,
+    ) -> dict[str, Any]:
+        if path == "/":
+            entries = [
+                {
+                    "path": f"/{mount}",
+                    "is_dir": True,
+                    "size": 0,
+                    "modified_at": "0",
+                    "depth": 0,
+                }
+                for mount in self._mount_order
+            ]
+            return {
+                "entries": entries if include_dirs else [],
+                "truncated": False,
+            }
+
+        try:
+            mount, local_path = self._split_mount_path(path)
+        except ValueError as exc:
+            return {"error": f"Error: {exc}"}
+
+        result = self._mount_to_backend[mount].list_tree(
+            local_path,
+            max_depth=max_depth,
+            page_size=page_size,
+            include_files=include_files,
+            include_dirs=include_dirs,
+        )
+        if result.get("error"):
+            return result
+
+        entries: list[dict[str, Any]] = []
+        for entry in result.get("entries", []):
+            raw_path = self._get_str(entry, "path")
+            entries.append(
+                {
+                    "path": self._prefix_mount_path(mount, raw_path),
+                    "is_dir": self._get_bool(entry, "is_dir"),
+                    "size": self._get_int(entry, "size"),
+                    "modified_at": self._get_str(entry, "modified_at"),
+                    "depth": self._get_int(entry, "depth"),
+                }
+            )
+
+        return {
+            "entries": entries,
+            "truncated": self._get_bool(result, "truncated"),
+        }
+
+    async def alist_tree(
+        self,
+        path: str = "/",
+        *,
+        max_depth: int | None = 8,
+        page_size: int = 500,
+        include_files: bool = True,
+        include_dirs: bool = True,
+    ) -> dict[str, Any]:
+        return await asyncio.to_thread(
+            self.list_tree,
+            path,
+            max_depth=max_depth,
+            page_size=page_size,
+            include_files=include_files,
+            include_dirs=include_dirs,
+        )
+
+    def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str:
+        try:
+            mount, local_path = self._split_mount_path(file_path)
+        except ValueError as exc:
+            return f"Error: {exc}"
+        return self._mount_to_backend[mount].read(local_path, offset, limit)
+
+    async def aread(self, file_path: str, offset: int = 0, limit: int = 2000) -> str:
+        return await asyncio.to_thread(self.read, file_path, offset, limit)
+
+    def read_raw(self, file_path: str) -> str:
+        try:
+            mount, local_path = self._split_mount_path(file_path)
+        except ValueError as exc:
+            return f"Error: {exc}"
+        return self._mount_to_backend[mount].read_raw(local_path)
+
+    async def aread_raw(self, file_path: str) -> str:
+        return await asyncio.to_thread(self.read_raw, file_path)
+
+    def write(self, file_path: str, content: str) -> WriteResult:
+        try:
+            mount, local_path = self._split_mount_path(file_path)
+        except ValueError as exc:
+            return WriteResult(error=f"Error: {exc}")
+        result = self._mount_to_backend[mount].write(local_path, content)
+        if result.path:
+            result.path = self._prefix_mount_path(mount, result.path)
+        return result
+
+    async def awrite(self, file_path: str, content: str) -> WriteResult:
+        return await asyncio.to_thread(self.write, file_path, content)
+
+    def move(
+        self,
+        source_path: str,
+        destination_path: str,
+        overwrite: bool = False,
+    ) -> WriteResult:
+        try:
+            source_mount, source_local_path = self._split_mount_path(source_path)
+            destination_mount, destination_local_path = self._split_mount_path(
+                destination_path
+            )
+        except ValueError as exc:
+            return WriteResult(error=f"Error: {exc}")
+        if source_mount != destination_mount:
+            return WriteResult(
+                error=(
+                    "Error: cross-mount moves are not supported. "
+                    "Source and destination must be under the same mounted root."
+                )
+            )
+        result = self._mount_to_backend[source_mount].move(
+            source_local_path,
+            destination_local_path,
+            overwrite=overwrite,
+        )
+        if result.path:
+            result.path = self._prefix_mount_path(source_mount, result.path)
+        return result
+
+    async def amove(
+        self,
+        source_path: str,
+        destination_path: str,
+        overwrite: bool = False,
+    ) -> WriteResult:
+        return await asyncio.to_thread(
+            self.move,
+            source_path,
+            destination_path,
+            overwrite,
+        )
+
+    def delete_file(self, file_path: str) -> WriteResult:
+        try:
+            mount, local_path = self._split_mount_path(file_path)
+        except ValueError as exc:
+            return WriteResult(error=f"Error: {exc}")
+        result = self._mount_to_backend[mount].delete_file(local_path)
+        if result.path:
+            result.path = self._prefix_mount_path(mount, result.path)
+        return result
+
+    async def adelete_file(self, file_path: str) -> WriteResult:
+        return await asyncio.to_thread(self.delete_file, file_path)
+
+    def rmdir(self, dir_path: str) -> WriteResult:
+        try:
+            mount, local_path = self._split_mount_path(dir_path)
+        except ValueError as exc:
+            return WriteResult(error=f"Error: {exc}")
+        if local_path == "/":
+            return WriteResult(error=f"Error: cannot rmdir mount root '{dir_path}'")
+        result = self._mount_to_backend[mount].rmdir(local_path)
+        if result.path:
+            result.path = self._prefix_mount_path(mount, result.path)
+        return result
+
+    async def armdir(self, dir_path: str) -> WriteResult:
+        return await asyncio.to_thread(self.rmdir, dir_path)
+
+    def edit(
+        self,
+        file_path: str,
+        old_string: str,
+        new_string: str,
+        replace_all: bool = False,
+    ) -> EditResult:
+        try:
+            mount, local_path = self._split_mount_path(file_path)
+        except ValueError as exc:
+            return EditResult(error=f"Error: {exc}")
+        result = self._mount_to_backend[mount].edit(
+            local_path, old_string, new_string, replace_all
+        )
+        if result.path:
+            result.path = self._prefix_mount_path(mount, result.path)
+        return result
+
+    async def aedit(
+        self,
+        file_path: str,
+        old_string: str,
+        new_string: str,
+        replace_all: bool = False,
+    ) -> EditResult:
+        return await asyncio.to_thread(
+            self.edit, file_path, old_string, new_string, replace_all
+        )
+
+    def glob_info(self, pattern: str, path: str = "/") -> list[FileInfo]:
+        if path == "/":
+            prefixed_results: list[FileInfo] = []
+            if pattern.startswith("/"):
+                mount, _, remainder = pattern.lstrip("/").partition("/")
+                backend = self._mount_to_backend.get(mount)
+                if not backend:
+                    return []
+                local_pattern = f"/{remainder}" if remainder else "/"
+                return self._transform_infos(
+                    mount, backend.glob_info(local_pattern, path="/")
+                )
+            for mount, backend in self._mount_to_backend.items():
+                prefixed_results.extend(
+                    self._transform_infos(mount, backend.glob_info(pattern, path="/"))
+                )
+            return prefixed_results
+
+        try:
+            mount, local_path = self._split_mount_path(path)
+        except ValueError:
+            return []
+        return self._transform_infos(
+            mount, self._mount_to_backend[mount].glob_info(pattern, path=local_path)
+        )
+
+    async def aglob_info(self, pattern: str, path: str = "/") -> list[FileInfo]:
+        return await asyncio.to_thread(self.glob_info, pattern, path)
+
+    def grep_raw(
+        self, pattern: str, path: str | None = None, glob: str | None = None
+    ) -> list[GrepMatch] | str:
+        if not pattern:
+            return "Error: pattern cannot be empty"
+        if path is None or path == "/":
+            all_matches: list[GrepMatch] = []
+            for mount, backend in self._mount_to_backend.items():
+                result = backend.grep_raw(pattern, path="/", glob=glob)
+                if isinstance(result, str):
+                    return result
+                all_matches.extend(
+                    [
+                        GrepMatch(
+                            path=self._prefix_mount_path(
+                                mount, self._get_str(match, "path")
+                            ),
+                            line=self._get_int(match, "line"),
+                            text=self._get_str(match, "text"),
+                        )
+                        for match in result
+                    ]
+                )
+            return all_matches
+        try:
+            mount, local_path = self._split_mount_path(path)
+        except ValueError as exc:
+            return f"Error: {exc}"
+
+        result = self._mount_to_backend[mount].grep_raw(
+            pattern, path=local_path, glob=glob
+        )
+        if isinstance(result, str):
+            return result
+        return [
+            GrepMatch(
+                path=self._prefix_mount_path(mount, self._get_str(match, "path")),
+                line=self._get_int(match, "line"),
+                text=self._get_str(match, "text"),
+            )
+            for match in result
+        ]
+
+    async def agrep_raw(
+        self, pattern: str, path: str | None = None, glob: str | None = None
+    ) -> list[GrepMatch] | str:
+        return await asyncio.to_thread(self.grep_raw, pattern, path, glob)
+
+    def upload_files(self, files: list[tuple[str, bytes]]) -> list[FileUploadResponse]:
+        grouped: dict[str, list[tuple[str, bytes]]] = {}
+        invalid: list[FileUploadResponse] = []
+        for virtual_path, content in files:
+            try:
+                mount, local_path = self._split_mount_path(virtual_path)
+            except ValueError:
+                invalid.append(
+                    FileUploadResponse(path=virtual_path, error=_INVALID_PATH)
+                )
+                continue
+            grouped.setdefault(mount, []).append((local_path, content))
+
+        responses = list(invalid)
+        for mount, mount_files in grouped.items():
+            result = self._mount_to_backend[mount].upload_files(mount_files)
+            responses.extend(
+                [
+                    FileUploadResponse(
+                        path=self._prefix_mount_path(
+                            mount, self._get_str(item, "path")
+                        ),
+                        error=self._get_str(item, "error") or None,
+                    )
+                    for item in result
+                ]
+            )
+        return responses
+
+    async def aupload_files(
+        self, files: list[tuple[str, bytes]]
+    ) -> list[FileUploadResponse]:
+        return await asyncio.to_thread(self.upload_files, files)
+
+    def download_files(self, paths: list[str]) -> list[FileDownloadResponse]:
+        grouped: dict[str, list[str]] = {}
+        invalid: list[FileDownloadResponse] = []
+        for virtual_path in paths:
+            try:
+                mount, local_path = self._split_mount_path(virtual_path)
+            except ValueError:
+                invalid.append(
+                    FileDownloadResponse(
+                        path=virtual_path, content=None, error=_INVALID_PATH
+                    )
+                )
+                continue
+            grouped.setdefault(mount, []).append(local_path)
+
+        responses = list(invalid)
+        for mount, mount_paths in grouped.items():
+            result = self._mount_to_backend[mount].download_files(mount_paths)
+            responses.extend(
+                [
+                    FileDownloadResponse(
+                        path=self._prefix_mount_path(
+                            mount, self._get_str(item, "path")
+                        ),
+                        content=self._get_value(item, "content"),
+                        error=self._get_str(item, "error") or None,
+                    )
+                    for item in result
+                ]
+            )
+        return responses
+
+    async def adownload_files(self, paths: list[str]) -> list[FileDownloadResponse]:
+        return await asyncio.to_thread(self.download_files, paths)
diff --git a/surfsense_backend/app/agents/new_chat/middleware/noop_injection.py b/surfsense_backend/app/agents/new_chat/middleware/noop_injection.py
new file mode 100644
index 0000000000..503c73ccc9
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/noop_injection.py
@@ -0,0 +1,141 @@
+"""
+``_noop`` provider-compatibility tool + injection middleware.
+
+Some providers (LiteLLM, Bedrock, Copilot) 400 when a model call has
+empty ``tools`` but the message history includes prior ``tool_calls`` —
+they treat that shape as malformed even though it's perfectly valid
+LangChain. SurfSense hits this on the compaction summarize call (no
+tools, history full of tool calls).
+
+Ported from OpenCode's ``packages/opencode/src/session/llm.ts:209-228``,
+which discovered and codified the workaround: inject a no-op tool *only*
+on those provider shapes so the request validates without ever being
+called.
+
+Operation: a :class:`NoopInjectionMiddleware` ``wrap_model_call`` checks
+if the request has zero tools but the last AI message in history includes
+``tool_calls``. If yes, it injects the ``_noop`` tool only — never
+globally — mirroring OpenCode's gating exactly. The :func:`noop_tool`
+returns empty content when called (which it should never be in
+practice).
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Awaitable, Callable
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+    ModelRequest,
+    ModelResponse,
+    ResponseT,
+)
+from langchain_core.messages import AIMessage
+from langchain_core.tools import tool
+
+logger = logging.getLogger(__name__)
+
+NOOP_TOOL_NAME = "_noop"
+NOOP_TOOL_DESCRIPTION = "Do not call this tool. It exists only for API compatibility."
+
+
+@tool(name_or_callable=NOOP_TOOL_NAME, description=NOOP_TOOL_DESCRIPTION)
+def noop_tool() -> str:
+    """Return empty content. Never expected to be called."""
+    return ""
+
+
+# Provider markers that benefit from ``_noop`` injection. These match
+# OpenCode's gating list (``llm.ts:209-228``). We also accept any string
+# containing one of these substrings so e.g. ``litellm`` matches
+# ``ChatLiteLLM``.
+_NOOP_NEEDED_PROVIDERS: tuple[str, ...] = (
+    "litellm",
+    "bedrock",
+    "copilot",
+)
+
+
+def _provider_needs_noop(model: Any) -> bool:
+    """Heuristic: does this model's provider need the _noop injection?"""
+    try:
+        ls_params = model._get_ls_params()
+        provider = str(ls_params.get("ls_provider", "")).lower()
+    except Exception:
+        provider = ""
+
+    if not provider:
+        cls_name = type(model).__name__.lower()
+        provider = cls_name
+
+    return any(needle in provider for needle in _NOOP_NEEDED_PROVIDERS)
+
+
+def _last_ai_has_tool_calls(messages: list[Any]) -> bool:
+    for msg in reversed(messages):
+        if isinstance(msg, AIMessage):
+            return bool(msg.tool_calls)
+    return False
+
+
+class NoopInjectionMiddleware(
+    AgentMiddleware[AgentState[ResponseT], ContextT, ResponseT]
+):
+    """Inject the ``_noop`` tool only when the provider would otherwise 400.
+
+    The check fires per model call, not at agent build time, because the
+    summarization path generates a no-tool subcall at runtime. The
+    extra tool is appended to ``request.tools`` as an instance — the
+    actual ``langchain_core.tools.BaseTool`` is bound on every call site
+    that creates the agent.
+    """
+
+    def __init__(self, *, noop_tool_instance: Any | None = None) -> None:
+        super().__init__()
+        self._noop_tool = noop_tool_instance or noop_tool
+        self.tools = []
+
+    def _should_inject(self, request: ModelRequest[ContextT]) -> bool:
+        if request.tools:
+            return False
+        if not _last_ai_has_tool_calls(request.messages):
+            return False
+        return _provider_needs_noop(request.model)
+
+    def _augmented(self, request: ModelRequest[ContextT]) -> ModelRequest[ContextT]:
+        return request.override(tools=[self._noop_tool])
+
+    def wrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[ContextT],
+        handler: Callable[[ModelRequest[ContextT]], ModelResponse[ResponseT]],
+    ) -> Any:
+        if self._should_inject(request):
+            logger.debug("Injecting _noop tool for provider compatibility")
+            return handler(self._augmented(request))
+        return handler(request)
+
+    async def awrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[ContextT],
+        handler: Callable[
+            [ModelRequest[ContextT]], Awaitable[ModelResponse[ResponseT]]
+        ],
+    ) -> Any:
+        if self._should_inject(request):
+            logger.debug("Injecting _noop tool for provider compatibility")
+            return await handler(self._augmented(request))
+        return await handler(request)
+
+
+__all__ = [
+    "NOOP_TOOL_DESCRIPTION",
+    "NOOP_TOOL_NAME",
+    "NoopInjectionMiddleware",
+    "_provider_needs_noop",
+    "noop_tool",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/otel_span.py b/surfsense_backend/app/agents/new_chat/middleware/otel_span.py
new file mode 100644
index 0000000000..cfe1edae4b
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/otel_span.py
@@ -0,0 +1,202 @@
+"""
+OpenTelemetry span middleware for the SurfSense ``new_chat`` agent.
+
+Wraps both ``model.call`` (LLM invocations) and ``tool.call`` (tool
+executions) with OTel spans, attaching low-cardinality span names and
+high-cardinality identifiers as attributes.
+
+This middleware is intentionally a thin adapter over
+:mod:`app.observability.otel`; when OTel is not configured all spans
+collapse to no-ops and the wrapper adds <1µs overhead per call. When
+OTel **is** configured (``OTEL_EXPORTER_OTLP_ENDPOINT`` set), every
+model and tool call gets a span with the standard attributes our
+dashboards expect.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Awaitable, Callable
+from typing import TYPE_CHECKING, Any
+
+from langchain.agents.middleware import AgentMiddleware
+from langchain_core.messages import AIMessage, ToolMessage
+
+from app.observability import otel as ot
+
+if TYPE_CHECKING:  # pragma: no cover — type-only
+    from langchain.agents.middleware.types import (
+        ModelRequest,
+        ModelResponse,
+        ToolCallRequest,
+    )
+    from langgraph.types import Command
+
+logger = logging.getLogger(__name__)
+
+
+class OtelSpanMiddleware(AgentMiddleware):
+    """Emit ``model.call`` and ``tool.call`` OTel spans for every invocation.
+
+    Should be placed near the **outer** end of the middleware list so
+    that the spans encompass retry/fallback wrapper effects (i.e. ``N``
+    model.call spans for ``N`` retry attempts) but inside any concurrency/
+    auth gate. Empirically this means **between** ``BusyMutex`` and
+    ``RetryAfter``.
+    """
+
+    def __init__(self, *, instrumentation_name: str = "surfsense.new_chat") -> None:
+        super().__init__()
+        self._instrumentation_name = instrumentation_name
+
+    # ------------------------------------------------------------------
+    # Model call spans
+    # ------------------------------------------------------------------
+
+    async def awrap_model_call(
+        self,
+        request: ModelRequest,
+        handler: Callable[[ModelRequest], Awaitable[ModelResponse | AIMessage | Any]],
+    ) -> ModelResponse | AIMessage | Any:
+        if not ot.is_enabled():
+            return await handler(request)
+
+        model_id, provider = _resolve_model_attrs(request)
+        with ot.model_call_span(model_id=model_id, provider=provider) as sp:
+            try:
+                result = await handler(request)
+            except Exception:
+                # span context manager records + re-raises
+                raise
+            else:
+                _annotate_model_response(sp, result)
+                return result
+
+    # ------------------------------------------------------------------
+    # Tool call spans
+    # ------------------------------------------------------------------
+
+    async def awrap_tool_call(
+        self,
+        request: ToolCallRequest,
+        handler: Callable[[ToolCallRequest], Awaitable[ToolMessage | Command[Any]]],
+    ) -> ToolMessage | Command[Any]:
+        if not ot.is_enabled():
+            return await handler(request)
+
+        tool_name = _resolve_tool_name(request)
+        input_size = _resolve_input_size(request)
+
+        with ot.tool_call_span(tool_name, input_size=input_size) as sp:
+            result = await handler(request)
+            _annotate_tool_result(sp, result)
+            return result
+
+
+# ---------------------------------------------------------------------------
+# Attribute helpers (kept defensive; we never want OTel bookkeeping to break
+# a real model/tool call).
+# ---------------------------------------------------------------------------
+
+
+def _resolve_model_attrs(request: Any) -> tuple[str | None, str | None]:
+    """Extract ``model.id`` and ``model.provider`` from a ``ModelRequest``."""
+    model_id: str | None = None
+    provider: str | None = None
+    try:
+        model = getattr(request, "model", None)
+        if model is None:
+            return None, None
+        # langchain BaseChatModel exposes a few different identifiers
+        for attr in ("model_name", "model", "model_id"):
+            value = getattr(model, attr, None)
+            if value:
+                model_id = str(value)
+                break
+        # provider sometimes lives on ``_llm_type`` (legacy) or ``provider``
+        for attr in ("provider", "_llm_type"):
+            value = getattr(model, attr, None)
+            if value:
+                provider = str(value)
+                break
+    except Exception:  # pragma: no cover — defensive
+        pass
+    return model_id, provider
+
+
+def _resolve_tool_name(request: Any) -> str:
+    try:
+        tool = getattr(request, "tool", None)
+        if tool is not None:
+            name = getattr(tool, "name", None)
+            if isinstance(name, str) and name:
+                return name
+        # Fall back to the tool_call dict
+        call = getattr(request, "tool_call", None) or {}
+        name = call.get("name") if isinstance(call, dict) else None
+        if isinstance(name, str) and name:
+            return name
+    except Exception:  # pragma: no cover — defensive
+        pass
+    return "unknown"
+
+
+def _resolve_input_size(request: Any) -> int | None:
+    try:
+        call = getattr(request, "tool_call", None)
+        if not isinstance(call, dict) or not call:
+            return None
+        args = call.get("args")
+        if args is None:
+            return None
+        return len(repr(args))
+    except Exception:  # pragma: no cover — defensive
+        return None
+
+
+def _annotate_model_response(span: Any, result: Any) -> None:
+    """Best-effort: attach prompt/completion token counts when available."""
+    try:
+        # ModelResponse may be a dataclass with .result containing AIMessage
+        msg: Any
+        if isinstance(result, AIMessage):
+            msg = result
+        else:
+            inner = getattr(result, "result", None)
+            msg = inner[-1] if isinstance(inner, list) and inner else inner
+        if msg is None:
+            return
+        usage = getattr(msg, "usage_metadata", None) or {}
+        if isinstance(usage, dict):
+            if (n := usage.get("input_tokens")) is not None:
+                span.set_attribute("tokens.prompt", int(n))
+            if (n := usage.get("output_tokens")) is not None:
+                span.set_attribute("tokens.completion", int(n))
+            if (n := usage.get("total_tokens")) is not None:
+                span.set_attribute("tokens.total", int(n))
+        tool_calls = getattr(msg, "tool_calls", None) or []
+        span.set_attribute("model.tool_calls", len(tool_calls))
+    except Exception:  # pragma: no cover — defensive
+        pass
+
+
+def _annotate_tool_result(span: Any, result: Any) -> None:
+    try:
+        if isinstance(result, ToolMessage):
+            content = (
+                result.content
+                if isinstance(result.content, str)
+                else repr(result.content)
+            )
+            span.set_attribute("tool.output.size", len(content))
+            status = getattr(result, "status", None)
+            if isinstance(status, str):
+                span.set_attribute("tool.status", status)
+            kwargs = getattr(result, "additional_kwargs", None) or {}
+            if isinstance(kwargs, dict) and kwargs.get("error"):
+                span.set_attribute("tool.error", True)
+    except Exception:  # pragma: no cover — defensive
+        pass
+
+
+__all__ = ["OtelSpanMiddleware"]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/permission.py b/surfsense_backend/app/agents/new_chat/middleware/permission.py
new file mode 100644
index 0000000000..f77b7e3877
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/permission.py
@@ -0,0 +1,425 @@
+"""
+PermissionMiddleware — pattern-based allow/deny/ask with HITL fallback.
+
+LangChain's :class:`HumanInTheLoopMiddleware` only supports a static
+"this tool always asks" decision per tool. There's no rule-based
+allow/deny/ask layered ruleset, no glob patterns, no per-search-space or
+per-thread overrides, and no auto-deny synthesis.
+
+This middleware ports OpenCode's ``packages/opencode/src/permission/index.ts``
+ruleset model on top of SurfSense's existing ``interrupt({type, action,
+context})`` payload shape (see ``app/agents/new_chat/tools/hitl.py``) so
+the frontend keeps working unchanged.
+
+Operation:
+1. ``aafter_model`` inspects the latest ``AIMessage.tool_calls``.
+2. For each call, the middleware builds a list of ``patterns`` (the
+   tool name plus any tool-specific patterns from the resolver). It
+   evaluates each pattern against the layered rulesets and aggregates
+   the results: ``deny`` > ``ask`` > ``allow``.
+3. On ``deny``: replaces the call with a synthetic ``ToolMessage``
+   containing a :class:`StreamingError`.
+4. On ``ask``: raises a SurfSense-style ``interrupt(...)``. Both the legacy
+   SurfSense shape and LangChain HITL ``{"decisions": [{"type": ...}]}``
+   replies are accepted via :func:`_normalize_permission_decision`.
+   - ``once``: proceed.
+   - ``approve_always``: also persist allow rules for ``request.always`` patterns.
+   - ``reject`` w/o feedback: raise :class:`RejectedError`.
+   - ``reject`` w/  feedback: raise :class:`CorrectedError`.
+5. On ``allow``: proceed unchanged.
+
+The middleware also performs a *pre-model* tool-filter step (the
+``before_model`` hook) so globally denied tools are stripped from the
+exposed tool list before the model gets to see them. This mirrors
+OpenCode's ``Permission.disabled`` and dramatically reduces the chance
+the model emits a deny-only call.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Callable
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+)
+from langchain_core.messages import AIMessage, ToolMessage
+from langgraph.runtime import Runtime
+from langgraph.types import interrupt
+
+from app.agents.new_chat.errors import (
+    CorrectedError,
+    RejectedError,
+    StreamingError,
+)
+from app.agents.new_chat.permissions import (
+    Rule,
+    Ruleset,
+    aggregate_action,
+    evaluate_many,
+)
+from app.observability import otel as ot
+
+logger = logging.getLogger(__name__)
+
+
+# Mapping ``tool_name -> resolver`` that converts ``args`` to a list of
+# patterns to evaluate. The first pattern is conventionally the bare
+# tool name; later entries narrow down to specific resources.
+PatternResolver = Callable[[dict[str, Any]], list[str]]
+
+
+def _default_pattern_resolver(name: str) -> PatternResolver:
+    def _resolve(args: dict[str, Any]) -> list[str]:
+        # Bare name covers the default catch-all; primary-arg fallbacks
+        # are best added per-tool by callers.
+        del args
+        return [name]
+
+    return _resolve
+
+
+# Translation from the LangChain HITL envelope (what ``stream_resume_chat``
+# sends) to SurfSense's legacy ``decision_type`` shape. ``edit`` keeps the
+# original tool args — tools needing argument edits should use
+# ``request_approval`` from ``app/agents/new_chat/tools/hitl.py``.
+_LC_TYPE_TO_PERMISSION_DECISION: dict[str, str] = {
+    "approve": "once",
+    "reject": "reject",
+    "edit": "once",
+    "approve_always": "approve_always",
+}
+
+
+def _normalize_permission_decision(decision: Any) -> dict[str, Any]:
+    """Coerce any accepted reply shape into ``{"decision_type": ..., "feedback"?}``.
+
+    Falls back to ``reject`` (with a warning) on unrecognized payloads so the
+    middleware fails closed.
+    """
+    if isinstance(decision, str):
+        return {"decision_type": decision}
+    if not isinstance(decision, dict):
+        logger.warning(
+            "Unrecognized permission resume value (%s); treating as reject",
+            type(decision).__name__,
+        )
+        return {"decision_type": "reject"}
+
+    if decision.get("decision_type"):
+        return decision
+
+    payload: dict[str, Any] = decision
+    decisions = decision.get("decisions")
+    if isinstance(decisions, list) and decisions:
+        first = decisions[0]
+        if isinstance(first, dict):
+            payload = first
+
+    raw_type = payload.get("type") or payload.get("decision_type")
+    if not raw_type:
+        logger.warning(
+            "Permission resume missing decision type (keys=%s); treating as reject",
+            list(payload.keys()),
+        )
+        return {"decision_type": "reject"}
+
+    raw_type = str(raw_type).lower()
+    mapped = _LC_TYPE_TO_PERMISSION_DECISION.get(raw_type)
+    if mapped is None:
+        # Tolerate legacy values arriving without ``decision_type`` wrapping.
+        if raw_type in {"once", "approve_always", "reject"}:
+            mapped = raw_type
+        else:
+            logger.warning(
+                "Unknown permission decision type %r; treating as reject", raw_type
+            )
+            mapped = "reject"
+
+    if raw_type == "edit":
+        logger.warning(
+            "Permission middleware received an 'edit' decision; original args "
+            "kept (edits not merged here)."
+        )
+
+    out: dict[str, Any] = {"decision_type": mapped}
+    feedback = payload.get("feedback") or payload.get("message")
+    if isinstance(feedback, str) and feedback.strip():
+        out["feedback"] = feedback
+    return out
+
+
+class PermissionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
+    """Allow/deny/ask layer over the agent's tool calls.
+
+    Args:
+        rulesets: Layered rulesets to evaluate. Earlier entries are
+            overridden by later ones (last-match-wins). Typical layering:
+            ``defaults < global < space < thread < runtime_approved``.
+        pattern_resolvers: Optional per-tool callables that return a list
+            of patterns to evaluate. When a tool isn't listed, the bare
+            tool name is used as the only pattern.
+        runtime_ruleset: Mutable :class:`Ruleset` that the middleware
+            extends in-place when the user replies ``"approve_always"`` to
+            an ask interrupt. Reused across all calls in the same agent
+            instance so newly-allowed rules apply to subsequent calls.
+        always_emit_interrupt_payload: If True, every ask uses the
+            SurfSense interrupt wire format (default). Set False to
+            disable interrupts and treat ``ask`` as ``deny`` for
+            non-interactive deployments.
+    """
+
+    tools = ()
+
+    def __init__(
+        self,
+        *,
+        rulesets: list[Ruleset] | None = None,
+        pattern_resolvers: dict[str, PatternResolver] | None = None,
+        runtime_ruleset: Ruleset | None = None,
+        always_emit_interrupt_payload: bool = True,
+    ) -> None:
+        super().__init__()
+        self._static_rulesets: list[Ruleset] = list(rulesets or [])
+        self._pattern_resolvers: dict[str, PatternResolver] = dict(
+            pattern_resolvers or {}
+        )
+        self._runtime_ruleset: Ruleset = runtime_ruleset or Ruleset(
+            origin="runtime_approved"
+        )
+        self._emit_interrupt = always_emit_interrupt_payload
+
+    # ------------------------------------------------------------------
+    # Tool-filter step (mirrors OpenCode's ``Permission.disabled``)
+    # ------------------------------------------------------------------
+
+    def _globally_denied(self, tool_name: str) -> bool:
+        """Return True if a deny rule with no narrowing pattern matches."""
+        rules = evaluate_many(tool_name, ["*"], *self._all_rulesets())
+        return aggregate_action(rules) == "deny"
+
+    def _all_rulesets(self) -> list[Ruleset]:
+        return [*self._static_rulesets, self._runtime_ruleset]
+
+    # NOTE: ``before_model`` filtering of the tools list is left to the
+    # agent factory. This middleware only blocks at execution time — and
+    # only via the rule-evaluator path, not by mutating ``request.tools``.
+    # Mutating ``request.tools`` per-call would invalidate provider
+    # prompt-cache prefixes (see Operational risks: prompt-cache regression).
+
+    # ------------------------------------------------------------------
+    # Tool-call evaluation
+    # ------------------------------------------------------------------
+
+    def _resolve_patterns(self, tool_name: str, args: dict[str, Any]) -> list[str]:
+        resolver = self._pattern_resolvers.get(
+            tool_name, _default_pattern_resolver(tool_name)
+        )
+        try:
+            patterns = resolver(args or {})
+        except Exception:
+            logger.exception(
+                "Pattern resolver for %s raised; using bare name", tool_name
+            )
+            patterns = [tool_name]
+        if not patterns:
+            patterns = [tool_name]
+        return patterns
+
+    def _evaluate(
+        self, tool_name: str, args: dict[str, Any]
+    ) -> tuple[str, list[str], list[Rule]]:
+        patterns = self._resolve_patterns(tool_name, args)
+        rules = evaluate_many(tool_name, patterns, *self._all_rulesets())
+        action = aggregate_action(rules)
+        return action, patterns, rules
+
+    # ------------------------------------------------------------------
+    # HITL ask flow — SurfSense wire format
+    # ------------------------------------------------------------------
+
+    def _raise_interrupt(
+        self,
+        *,
+        tool_name: str,
+        args: dict[str, Any],
+        patterns: list[str],
+        rules: list[Rule],
+    ) -> dict[str, Any]:
+        """Block on user approval via SurfSense's ``interrupt`` shape."""
+        if not self._emit_interrupt:
+            return {"decision_type": "reject"}
+
+        # ``params`` (NOT ``args``) is what SurfSense's streaming
+        # normalizer forwards. Other fields move into ``context``.
+        payload = {
+            "type": "permission_ask",
+            "action": {"tool": tool_name, "params": args or {}},
+            "context": {
+                "patterns": patterns,
+                "rules": [
+                    {
+                        "permission": r.permission,
+                        "pattern": r.pattern,
+                        "action": r.action,
+                    }
+                    for r in rules
+                ],
+                # Rules of thumb for the frontend: surface the patterns
+                # the user can promote to "approve_always" with a single reply.
+                "always": patterns,
+            },
+        }
+        # Open ``permission.asked`` + ``interrupt.raised`` OTel spans
+        # (no-op when OTel is disabled) so dashboards can correlate
+        # "we asked X" with "interrupt was actually delivered".
+        with (
+            ot.permission_asked_span(
+                permission=tool_name,
+                pattern=patterns[0] if patterns else None,
+                extra={"permission.patterns": list(patterns)},
+            ),
+            ot.interrupt_span(interrupt_type="permission_ask"),
+        ):
+            decision = interrupt(payload)
+        return _normalize_permission_decision(decision)
+
+    def _persist_always(self, tool_name: str, patterns: list[str]) -> None:
+        """Promote ``approve_always`` reply into runtime allow rules.
+
+        Persistence to ``agent_permission_rules`` is done by the
+        streaming layer (``stream_new_chat``) once it observes the
+        ``approve_always`` reply — the middleware just keeps an
+        in-memory copy so subsequent calls in the same stream see the rule.
+        """
+        for pattern in patterns:
+            self._runtime_ruleset.rules.append(
+                Rule(permission=tool_name, pattern=pattern, action="allow")
+            )
+
+    # ------------------------------------------------------------------
+    # Synthesizing deny -> ToolMessage
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _deny_message(
+        tool_call: dict[str, Any],
+        rule: Rule,
+    ) -> ToolMessage:
+        err = StreamingError(
+            code="permission_denied",
+            retryable=False,
+            suggestion=(
+                f"rule permission={rule.permission!r} pattern={rule.pattern!r} "
+                f"blocked this call"
+            ),
+        )
+        return ToolMessage(
+            content=(
+                f"Permission denied: rule {rule.permission}/{rule.pattern} "
+                f"blocked tool {tool_call.get('name')!r}."
+            ),
+            tool_call_id=tool_call.get("id") or "",
+            name=tool_call.get("name"),
+            status="error",
+            additional_kwargs={"error": err.model_dump()},
+        )
+
+    # ------------------------------------------------------------------
+    # The hook: aafter_model
+    # ------------------------------------------------------------------
+
+    def _process(
+        self,
+        state: AgentState,
+        runtime: Runtime[Any],
+    ) -> dict[str, Any] | None:
+        del runtime  # unused
+        messages = state.get("messages") or []
+        if not messages:
+            return None
+        last = messages[-1]
+        if not isinstance(last, AIMessage) or not last.tool_calls:
+            return None
+
+        deny_messages: list[ToolMessage] = []
+        kept_calls: list[dict[str, Any]] = []
+        any_change = False
+
+        for raw in last.tool_calls:
+            call = (
+                dict(raw)
+                if isinstance(raw, dict)
+                else {
+                    "name": getattr(raw, "name", None),
+                    "args": getattr(raw, "args", {}),
+                    "id": getattr(raw, "id", None),
+                    "type": "tool_call",
+                }
+            )
+            name = call.get("name") or ""
+            args = call.get("args") or {}
+            action, patterns, rules = self._evaluate(name, args)
+
+            if action == "deny":
+                # Find the deny rule for the suggestion text
+                deny_rule = next((r for r in rules if r.action == "deny"), rules[0])
+                deny_messages.append(self._deny_message(call, deny_rule))
+                any_change = True
+                continue
+
+            if action == "ask":
+                decision = self._raise_interrupt(
+                    tool_name=name, args=args, patterns=patterns, rules=rules
+                )
+                kind = str(decision.get("decision_type") or "reject").lower()
+                if kind == "once":
+                    kept_calls.append(call)
+                elif kind == "approve_always":
+                    self._persist_always(name, patterns)
+                    kept_calls.append(call)
+                elif kind == "reject":
+                    feedback = decision.get("feedback")
+                    if isinstance(feedback, str) and feedback.strip():
+                        raise CorrectedError(feedback, tool=name)
+                    raise RejectedError(
+                        tool=name, pattern=patterns[0] if patterns else None
+                    )
+                else:
+                    logger.warning(
+                        "Unknown permission decision %r; treating as reject", kind
+                    )
+                    raise RejectedError(tool=name)
+                continue
+
+            # allow
+            kept_calls.append(call)
+
+        if not any_change and len(kept_calls) == len(last.tool_calls):
+            return None
+
+        updated = last.model_copy(update={"tool_calls": kept_calls})
+        result_messages: list[Any] = [updated]
+        if deny_messages:
+            result_messages.extend(deny_messages)
+        return {"messages": result_messages}
+
+    def after_model(  # type: ignore[override]
+        self, state: AgentState, runtime: Runtime[ContextT]
+    ) -> dict[str, Any] | None:
+        return self._process(state, runtime)
+
+    async def aafter_model(  # type: ignore[override]
+        self, state: AgentState, runtime: Runtime[ContextT]
+    ) -> dict[str, Any] | None:
+        return self._process(state, runtime)
+
+
+__all__ = [
+    "PatternResolver",
+    "PermissionMiddleware",
+    "_normalize_permission_decision",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/retry_after.py b/surfsense_backend/app/agents/new_chat/middleware/retry_after.py
new file mode 100644
index 0000000000..0c3d3d017d
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/retry_after.py
@@ -0,0 +1,257 @@
+"""
+RetryAfterMiddleware — Header-aware retry with custom backoff and SSE eventing.
+
+LangChain's :class:`ModelRetryMiddleware` retries on exceptions but ignores
+the ``Retry-After`` HTTP header — it just runs its own exponential backoff.
+That wastes time when a provider has explicitly told us how long to wait.
+This middleware honors the header (mirroring OpenCode's
+``packages/opencode/src/session/llm.ts`` retry pathway) and emits an SSE
+event so the UI can show "rate-limited, retrying in Ns".
+
+We can't subclass ``ModelRetryMiddleware`` cleanly because its loop calls a
+module-level ``calculate_delay`` inline (no overridable
+``_calculate_delay`` hook), so this is a standalone implementation.
+
+Behaviour:
+- Extracts ``Retry-After`` / ``retry-after-ms`` from
+  ``litellm.exceptions.RateLimitError.response.headers`` (or any exception
+  exposing a similar shape).
+- Sleeps ``max(exponential_backoff, header_delay)`` between retries.
+- Returns ``False`` from ``retry_on`` for ``ContextWindowExceededError`` /
+  ``ContextOverflowError`` so :class:`SurfSenseCompactionMiddleware` (or
+  the LangChain summarization fallback path) handles those instead.
+- Emits ``surfsense.retrying`` via ``adispatch_custom_event`` on each retry
+  so ``stream_new_chat`` can forward it to clients as an SSE event.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import random
+import re
+import time
+from collections.abc import Awaitable, Callable
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+    ModelRequest,
+    ModelResponse,
+    ResponseT,
+)
+from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event
+from langchain_core.messages import AIMessage
+
+logger = logging.getLogger(__name__)
+
+# Names of exception classes for which a retry would not help — context
+# overflow needs compaction, auth needs human intervention, etc. Detected
+# by class-name substring so we don't have to import LiteLLM/Anthropic
+# here (which would tie this module to optional deps).
+_NON_RETRYABLE_NAME_HINTS: tuple[str, ...] = (
+    "ContextWindowExceeded",
+    "ContextOverflow",
+    "AuthenticationError",
+    "InvalidRequestError",
+    "PermissionDenied",
+    "InvalidApiKey",
+    "ContextLimit",
+)
+
+
+def _is_non_retryable(exc: BaseException) -> bool:
+    name = type(exc).__name__
+    return any(hint in name for hint in _NON_RETRYABLE_NAME_HINTS)
+
+
+def _extract_retry_after_seconds(exc: BaseException) -> float | None:
+    """Return seconds-to-wait suggested by the provider, if any.
+
+    Looks at ``exc.response.headers`` or ``exc.headers`` for the standard
+    HTTP ``Retry-After`` header (in seconds) or its millisecond cousin
+    ``retry-after-ms`` (sometimes used by Anthropic / OpenAI). Falls back
+    to a regex on the exception message for shapes like
+    ``"Please retry after 30s"``.
+    """
+    headers: dict[str, Any] | None = None
+    response = getattr(exc, "response", None)
+    if response is not None:
+        headers = getattr(response, "headers", None)
+    if headers is None:
+        headers = getattr(exc, "headers", None)
+
+    if isinstance(headers, dict):
+        # Normalize keys to lowercase for case-insensitive matching
+        norm = {str(k).lower(): v for k, v in headers.items()}
+        ms = norm.get("retry-after-ms")
+        if ms is not None:
+            try:
+                return float(ms) / 1000.0
+            except (TypeError, ValueError):
+                pass
+        seconds = norm.get("retry-after")
+        if seconds is not None:
+            try:
+                return float(seconds)
+            except (TypeError, ValueError):
+                pass
+
+    # Last resort: scan the message for "retry after Xs" or "X seconds"
+    msg = str(exc)
+    match = re.search(r"retry\s+after\s+([0-9]+(?:\.[0-9]+)?)", msg, re.IGNORECASE)
+    if match:
+        try:
+            return float(match.group(1))
+        except ValueError:
+            return None
+    return None
+
+
+def _exponential_delay(
+    attempt: int,
+    *,
+    initial_delay: float,
+    backoff_factor: float,
+    max_delay: float,
+    jitter: bool,
+) -> float:
+    """Compute an exponential-backoff delay with optional ±25% jitter."""
+    delay = (
+        initial_delay * (backoff_factor**attempt) if backoff_factor else initial_delay
+    )
+    delay = min(delay, max_delay)
+    if jitter and delay > 0:
+        delay *= 1 + random.uniform(-0.25, 0.25)
+    return max(delay, 0.0)
+
+
+class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, ResponseT]):
+    """Retry middleware that honors provider-issued Retry-After hints.
+
+    Drop-in replacement for :class:`langchain.agents.middleware.ModelRetryMiddleware`
+    when working with LiteLLM/Anthropic/OpenAI providers that surface
+    rate-limit hints in headers. Always emits ``surfsense.retrying`` SSE
+    events so the UI can show a friendly "rate limited, retrying in Xs"
+    indicator.
+
+    Args:
+        max_retries: Maximum retries after the initial attempt (default 3).
+        initial_delay: Initial backoff delay in seconds.
+        backoff_factor: Exponential growth factor for backoff.
+        max_delay: Cap on per-attempt delay in seconds.
+        jitter: Whether to add ±25% jitter.
+        retry_on: Optional callable that returns True for retryable
+            exceptions. The default retries everything except known
+            non-retryable classes (context overflow, auth, etc.).
+    """
+
+    def __init__(
+        self,
+        *,
+        max_retries: int = 3,
+        initial_delay: float = 1.0,
+        backoff_factor: float = 2.0,
+        max_delay: float = 60.0,
+        jitter: bool = True,
+        retry_on: Callable[[BaseException], bool] | None = None,
+    ) -> None:
+        super().__init__()
+        self.max_retries = max_retries
+        self.initial_delay = initial_delay
+        self.backoff_factor = backoff_factor
+        self.max_delay = max_delay
+        self.jitter = jitter
+        self._retry_on: Callable[[BaseException], bool] = retry_on or (
+            lambda exc: not _is_non_retryable(exc)
+        )
+
+    def _should_retry(self, exc: BaseException) -> bool:
+        try:
+            return bool(self._retry_on(exc))
+        except Exception:
+            logger.exception("retry_on callable raised; defaulting to False")
+            return False
+
+    def _delay_for_attempt(self, attempt: int, exc: BaseException) -> float:
+        backoff = _exponential_delay(
+            attempt,
+            initial_delay=self.initial_delay,
+            backoff_factor=self.backoff_factor,
+            max_delay=self.max_delay,
+            jitter=self.jitter,
+        )
+        header = _extract_retry_after_seconds(exc) or 0.0
+        return max(backoff, header)
+
+    def wrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[ContextT],
+        handler: Callable[[ModelRequest[ContextT]], ModelResponse[ResponseT]],
+    ) -> ModelResponse[ResponseT] | AIMessage:
+        for attempt in range(self.max_retries + 1):
+            try:
+                return handler(request)
+            except Exception as exc:
+                if not self._should_retry(exc) or attempt >= self.max_retries:
+                    raise
+                delay = self._delay_for_attempt(attempt, exc)
+                try:
+                    dispatch_custom_event(
+                        "surfsense.retrying",
+                        {
+                            "attempt": attempt + 1,
+                            "max_retries": self.max_retries,
+                            "delay_ms": int(delay * 1000),
+                            "reason": type(exc).__name__,
+                        },
+                    )
+                except Exception:
+                    logger.debug(
+                        "dispatch_custom_event failed; suppressed", exc_info=True
+                    )
+                if delay > 0:
+                    time.sleep(delay)
+        # Unreachable
+        raise RuntimeError("RetryAfterMiddleware: retry loop exited without resolution")
+
+    async def awrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[ContextT],
+        handler: Callable[
+            [ModelRequest[ContextT]], Awaitable[ModelResponse[ResponseT]]
+        ],
+    ) -> ModelResponse[ResponseT] | AIMessage:
+        for attempt in range(self.max_retries + 1):
+            try:
+                return await handler(request)
+            except Exception as exc:
+                if not self._should_retry(exc) or attempt >= self.max_retries:
+                    raise
+                delay = self._delay_for_attempt(attempt, exc)
+                try:
+                    await adispatch_custom_event(
+                        "surfsense.retrying",
+                        {
+                            "attempt": attempt + 1,
+                            "max_retries": self.max_retries,
+                            "delay_ms": int(delay * 1000),
+                            "reason": type(exc).__name__,
+                        },
+                    )
+                except Exception:
+                    logger.debug(
+                        "adispatch_custom_event failed; suppressed", exc_info=True
+                    )
+                if delay > 0:
+                    await asyncio.sleep(delay)
+        raise RuntimeError("RetryAfterMiddleware: retry loop exited without resolution")
+
+
+__all__ = [
+    "RetryAfterMiddleware",
+    "_extract_retry_after_seconds",
+    "_is_non_retryable",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py b/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py
new file mode 100644
index 0000000000..99eb2d74a9
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py
@@ -0,0 +1,91 @@
+"""Fallback only on provider/network errors; let programming bugs raise."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from langchain.agents.middleware import ModelFallbackMiddleware
+
+if TYPE_CHECKING:
+    from collections.abc import Awaitable, Callable
+
+    from langchain.agents.middleware.types import ModelRequest, ModelResponse
+    from langchain_core.messages import AIMessage
+
+
+# Matched by class name across the MRO so we don't have to import every
+# provider SDK (openai/anthropic/google/...). Extend as new providers ship.
+_FALLBACK_ELIGIBLE_NAMES: frozenset[str] = frozenset(
+    {
+        "RateLimitError",
+        "APIStatusError",
+        "InternalServerError",
+        "ServiceUnavailableError",
+        "BadGatewayError",
+        "GatewayTimeoutError",
+        "APIConnectionError",
+        "APITimeoutError",
+        "ConnectError",
+        "ConnectTimeout",
+        "ReadTimeout",
+        "RemoteProtocolError",
+        "TimeoutError",
+        "TimeoutException",
+    }
+)
+
+
+def _is_fallback_eligible(exc: BaseException) -> bool:
+    return any(cls.__name__ in _FALLBACK_ELIGIBLE_NAMES for cls in type(exc).__mro__)
+
+
+class ScopedModelFallbackMiddleware(ModelFallbackMiddleware):
+    """Re-raise non-provider exceptions instead of walking the fallback chain."""
+
+    def wrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[Any],
+        handler: Callable[[ModelRequest[Any]], ModelResponse[Any]],
+    ) -> ModelResponse[Any] | AIMessage:
+        last_exception: Exception
+        try:
+            return handler(request)
+        except Exception as e:
+            if not _is_fallback_eligible(e):
+                raise
+            last_exception = e
+
+        for fallback_model in self.models:
+            try:
+                return handler(request.override(model=fallback_model))
+            except Exception as e:
+                if not _is_fallback_eligible(e):
+                    raise
+                last_exception = e
+                continue
+
+        raise last_exception
+
+    async def awrap_model_call(  # type: ignore[override]
+        self,
+        request: ModelRequest[Any],
+        handler: Callable[[ModelRequest[Any]], Awaitable[ModelResponse[Any]]],
+    ) -> ModelResponse[Any] | AIMessage:
+        last_exception: Exception
+        try:
+            return await handler(request)
+        except Exception as e:
+            if not _is_fallback_eligible(e):
+                raise
+            last_exception = e
+
+        for fallback_model in self.models:
+            try:
+                return await handler(request.override(model=fallback_model))
+            except Exception as e:
+                if not _is_fallback_eligible(e):
+                    raise
+                last_exception = e
+                continue
+
+        raise last_exception
diff --git a/surfsense_backend/app/agents/new_chat/middleware/skills_backends.py b/surfsense_backend/app/agents/new_chat/middleware/skills_backends.py
new file mode 100644
index 0000000000..072d73401f
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/skills_backends.py
@@ -0,0 +1,337 @@
+"""Skills backends for SurfSense.
+
+Implements two minimal :class:`deepagents.backends.protocol.BackendProtocol`
+subclasses tailored for use with :class:`deepagents.middleware.skills.SkillsMiddleware`.
+
+The middleware only needs four methods to load skills from a backend:
+
+* ``ls_info`` / ``als_info`` — list directories under a source path.
+* ``download_files`` / ``adownload_files`` — fetch ``SKILL.md`` bytes.
+
+Other ``BackendProtocol`` methods (``read``/``write``/``edit``/``grep_raw`` …)
+default to ``NotImplementedError`` from the base class. They are never reached
+by the skills middleware because skill content is rendered into the system
+prompt at agent build time, not edited at runtime.
+
+Two backends are provided:
+
+* :class:`BuiltinSkillsBackend` — disk-backed read of bundled skills from
+  ``app/agents/new_chat/skills/builtin/``.
+* :class:`SearchSpaceSkillsBackend` — a thin read-only wrapper over
+  :class:`KBPostgresBackend` that filters notes under the privileged folder
+  ``/documents/_skills/``.
+
+Both backends are intentionally read-only: skill authoring happens out of band
+(via filesystem or a search-space-admin route), so we never expose
+``write`` / ``edit`` / ``upload_files``. The base class' ``NotImplementedError``
+gives a clean failure mode if anything tries.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+from collections.abc import Callable
+from dataclasses import replace
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from deepagents.backends.composite import CompositeBackend
+from deepagents.backends.protocol import (
+    BackendProtocol,
+    FileDownloadResponse,
+    FileInfo,
+)
+from deepagents.backends.state import StateBackend
+
+if TYPE_CHECKING:
+    from langchain.tools import ToolRuntime
+
+    from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+
+logger = logging.getLogger(__name__)
+
+
+# Limit per Agent Skills spec; matches deepagents.middleware.skills.MAX_SKILL_FILE_SIZE.
+_MAX_SKILL_FILE_SIZE = 10 * 1024 * 1024
+
+
+def _default_builtin_root() -> Path:
+    """Return the absolute path to the bundled builtin skills directory.
+
+    Located at ``app/agents/new_chat/skills/builtin/`` relative to this module.
+    """
+    return (Path(__file__).resolve().parent.parent / "skills" / "builtin").resolve()
+
+
+class BuiltinSkillsBackend(BackendProtocol):
+    """Read-only disk-backed skills source.
+
+    Maps a virtual ``/skills/builtin/`` namespace onto a directory on local disk,
+    where each skill is its own subdirectory containing a ``SKILL.md`` file::
+
+        <root>/<skill-name>/SKILL.md
+
+    The middleware calls :meth:`als_info` with the source path and expects a
+    ``list[FileInfo]`` whose ``is_dir=True`` entries are descended into. Then it
+    calls :meth:`adownload_files` with the synthesized ``SKILL.md`` paths and
+    parses YAML frontmatter from the returned ``content`` bytes.
+
+    Mounting under :class:`~deepagents.backends.composite.CompositeBackend` at
+    prefix ``/skills/builtin/`` means the middleware can issue paths like
+    ``/skills/builtin/kb-research/SKILL.md`` which the composite strips down to
+    ``/kb-research/SKILL.md`` before forwarding here. We treat any leading
+    slash as anchoring at :attr:`root`.
+    """
+
+    def __init__(self, root: Path | str | None = None) -> None:
+        self.root: Path = Path(root).resolve() if root else _default_builtin_root()
+        if not self.root.exists():
+            logger.info(
+                "BuiltinSkillsBackend root %s does not exist; skills will be empty.",
+                self.root,
+            )
+
+    def _resolve(self, path: str) -> Path:
+        """Resolve a virtual posix path under :attr:`root`, refusing escapes."""
+        bare = path.lstrip("/")
+        candidate = (self.root / bare).resolve() if bare else self.root
+        # Refuse symlink/.. traversal that escapes the root.
+        try:
+            candidate.relative_to(self.root)
+        except ValueError as exc:
+            raise ValueError(f"path {path!r} escapes builtin skills root") from exc
+        return candidate
+
+    def ls_info(self, path: str) -> list[FileInfo]:
+        try:
+            target = self._resolve(path)
+        except ValueError as exc:
+            logger.warning("BuiltinSkillsBackend.ls_info refused: %s", exc)
+            return []
+        if not target.exists() or not target.is_dir():
+            return []
+
+        infos: list[FileInfo] = []
+        # Build virtual paths anchored at "/" because CompositeBackend already
+        # stripped the route prefix before calling us.
+        target_virtual = (
+            "/"
+            if target == self.root
+            else ("/" + str(target.relative_to(self.root)).replace("\\", "/"))
+        )
+        for child in sorted(target.iterdir()):
+            child_virtual = (
+                target_virtual.rstrip("/") + "/" + child.name
+                if target_virtual != "/"
+                else "/" + child.name
+            )
+            info: FileInfo = {
+                "path": child_virtual,
+                "is_dir": child.is_dir(),
+            }
+            if child.is_file():
+                with contextlib.suppress(OSError):  # pragma: no cover - defensive
+                    info["size"] = child.stat().st_size
+            infos.append(info)
+        return infos
+
+    def download_files(self, paths: list[str]) -> list[FileDownloadResponse]:
+        responses: list[FileDownloadResponse] = []
+        for p in paths:
+            try:
+                target = self._resolve(p)
+            except ValueError:
+                responses.append(FileDownloadResponse(path=p, error="invalid_path"))
+                continue
+            if not target.exists():
+                responses.append(FileDownloadResponse(path=p, error="file_not_found"))
+                continue
+            if target.is_dir():
+                responses.append(FileDownloadResponse(path=p, error="is_directory"))
+                continue
+            try:
+                # Hard cap to avoid loading rogue mega-files into memory.
+                size = target.stat().st_size
+                if size > _MAX_SKILL_FILE_SIZE:
+                    logger.warning(
+                        "Builtin skill file %s exceeds %d bytes; truncating.",
+                        target,
+                        _MAX_SKILL_FILE_SIZE,
+                    )
+                    with target.open("rb") as fh:
+                        content = fh.read(_MAX_SKILL_FILE_SIZE)
+                else:
+                    content = target.read_bytes()
+            except PermissionError:
+                responses.append(
+                    FileDownloadResponse(path=p, error="permission_denied")
+                )
+                continue
+            except OSError as exc:  # pragma: no cover - defensive
+                logger.warning("Builtin skill read failed %s: %s", target, exc)
+                responses.append(FileDownloadResponse(path=p, error="file_not_found"))
+                continue
+            responses.append(FileDownloadResponse(path=p, content=content, error=None))
+        return responses
+
+
+class SearchSpaceSkillsBackend(BackendProtocol):
+    """Read-only view of search-space-authored skills.
+
+    Wraps a :class:`KBPostgresBackend` and only ever reads under the privileged
+    folder ``/documents/_skills/`` (configurable). The folder is intended to be
+    writable only by search-space admins; this backend never writes.
+
+    The skills middleware expects a layout like::
+
+        /<source_root>/<skill-name>/SKILL.md
+
+    But the KB stores documents like ``/documents/_skills/<name>/SKILL.md``.
+    We expose the inner namespace by remapping each path. When mounted under
+    :class:`CompositeBackend` at prefix ``/skills/space/`` the paths the
+    middleware sees become ``/skills/space/<name>/SKILL.md``; the composite
+    strips ``/skills/space/`` and hands us ``/<name>/SKILL.md``, which we
+    rewrite to ``/documents/_skills/<name>/SKILL.md`` before forwarding to the
+    KB.
+
+    No new database table is needed: the privileged folder convention is
+    enforced server-side outside of this class. We intentionally swallow any
+    write/edit attempts (the base class raises ``NotImplementedError``).
+    """
+
+    DEFAULT_KB_ROOT: str = "/documents/_skills"
+
+    def __init__(
+        self,
+        kb_backend: KBPostgresBackend,
+        *,
+        kb_root: str = DEFAULT_KB_ROOT,
+    ) -> None:
+        self._kb = kb_backend
+        # Normalize trailing slash off so we can join cleanly.
+        self._kb_root = kb_root.rstrip("/") or "/"
+
+    def _to_kb(self, path: str) -> str:
+        """Rewrite a virtual path into the underlying KB namespace."""
+        bare = path.lstrip("/")
+        if not bare:
+            return self._kb_root
+        return f"{self._kb_root}/{bare}"
+
+    def _from_kb(self, kb_path: str) -> str:
+        """Rewrite a KB path back into our virtual namespace."""
+        if not kb_path.startswith(self._kb_root):
+            return kb_path  # pragma: no cover - defensive
+        rel = kb_path[len(self._kb_root) :]
+        return rel if rel.startswith("/") else "/" + rel
+
+    def ls_info(self, path: str) -> list[FileInfo]:
+        # KBPostgresBackend exposes only the async API meaningfully; the sync
+        # path falls back to ``asyncio.to_thread(...)`` in the base class. We
+        # keep this stub to satisfy abstract resolution; the middleware calls
+        # ``als_info``.
+        raise NotImplementedError("SearchSpaceSkillsBackend is async-only")
+
+    async def als_info(self, path: str) -> list[FileInfo]:
+        kb_path = self._to_kb(path)
+        try:
+            infos = await self._kb.als_info(kb_path)
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.warning("SearchSpaceSkillsBackend.als_info failed: %s", exc)
+            return []
+        remapped: list[FileInfo] = []
+        for info in infos:
+            kb_p = info.get("path", "")
+            if not kb_p.startswith(self._kb_root):
+                continue
+            remapped.append({**info, "path": self._from_kb(kb_p)})
+        return remapped
+
+    def download_files(self, paths: list[str]) -> list[FileDownloadResponse]:
+        raise NotImplementedError("SearchSpaceSkillsBackend is async-only")
+
+    async def adownload_files(self, paths: list[str]) -> list[FileDownloadResponse]:
+        kb_paths = [self._to_kb(p) for p in paths]
+        responses = await self._kb.adownload_files(kb_paths)
+        # Re-map response paths back to the virtual namespace so the middleware
+        # correlates them to the input list correctly.
+        remapped: list[FileDownloadResponse] = []
+        for original, resp in zip(paths, responses, strict=True):
+            remapped.append(replace(resp, path=original))
+        return remapped
+
+
+SKILLS_BUILTIN_PREFIX = "/skills/builtin/"
+SKILLS_SPACE_PREFIX = "/skills/space/"
+
+
+def build_skills_backend_factory(
+    *,
+    builtin_root: Path | str | None = None,
+    search_space_id: int | None = None,
+) -> Callable[[ToolRuntime], BackendProtocol]:
+    """Return a runtime-aware factory for the skills :class:`CompositeBackend`.
+
+    When ``search_space_id`` is provided the composite includes a
+    :class:`SearchSpaceSkillsBackend` route at ``/skills/space/`` over a fresh
+    per-runtime :class:`KBPostgresBackend`, mirroring how
+    :func:`build_backend_resolver` constructs the main filesystem backend.
+
+    When ``search_space_id`` is ``None`` (e.g., desktop-local mode or unit
+    tests) only the bundled :class:`BuiltinSkillsBackend` is exposed.
+
+    Returning a factory rather than a fixed instance is intentional: the
+    underlying KB backend depends on per-call ``ToolRuntime`` state
+    (``staged_dirs``, ``files`` cache, runtime config), so a single shared
+    instance cannot serve multiple concurrent agent runs.
+    """
+    builtin = BuiltinSkillsBackend(builtin_root)
+
+    if search_space_id is None:
+
+        def _factory_builtin_only(runtime: ToolRuntime) -> BackendProtocol:
+            # Default StateBackend is intentionally inert: any path outside the
+            # ``/skills/builtin/`` route resolves to an empty per-runtime state
+            # so the SkillsMiddleware can iterate sources without raising.
+            return CompositeBackend(
+                default=StateBackend(runtime),
+                routes={SKILLS_BUILTIN_PREFIX: builtin},
+            )
+
+        return _factory_builtin_only
+
+    def _factory_with_space(runtime: ToolRuntime) -> BackendProtocol:
+        # Imported lazily to avoid a hard dependency at module import time:
+        # ``KBPostgresBackend`` pulls in DB models, which are unnecessary for
+        # the unit-tested builtin path.
+        from app.agents.new_chat.middleware.kb_postgres_backend import (
+            KBPostgresBackend,
+        )
+
+        kb = KBPostgresBackend(search_space_id, runtime)
+        space = SearchSpaceSkillsBackend(kb)
+        return CompositeBackend(
+            default=StateBackend(runtime),
+            routes={
+                SKILLS_BUILTIN_PREFIX: builtin,
+                SKILLS_SPACE_PREFIX: space,
+            },
+        )
+
+    return _factory_with_space
+
+
+def default_skills_sources() -> list[str]:
+    """Return the canonical source list for SkillsMiddleware (built-in then space)."""
+    return [SKILLS_BUILTIN_PREFIX, SKILLS_SPACE_PREFIX]
+
+
+__all__ = [
+    "SKILLS_BUILTIN_PREFIX",
+    "SKILLS_SPACE_PREFIX",
+    "BuiltinSkillsBackend",
+    "SearchSpaceSkillsBackend",
+    "build_skills_backend_factory",
+    "default_skills_sources",
+]
diff --git a/surfsense_backend/app/agents/new_chat/middleware/tool_call_repair.py b/surfsense_backend/app/agents/new_chat/middleware/tool_call_repair.py
new file mode 100644
index 0000000000..9f81a168b6
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/middleware/tool_call_repair.py
@@ -0,0 +1,193 @@
+"""
+ToolCallNameRepairMiddleware — two-stage tool-name repair.
+
+Operation:
+1. **Stage 1 — lowercase repair:** if a tool call's ``name`` is not in
+   the registry but ``name.lower()`` is, rewrite in place. Catches
+   models that emit ``Search`` instead of ``search``.
+2. **Stage 2 — invalid fallback:** if still unmatched, rewrite the call
+   to ``invalid`` with ``args={"tool": original_name, "error": <error>}``
+   so the registered :func:`invalid_tool` returns the error to the model
+   for self-correction.
+
+Ported from OpenCode's ``packages/opencode/src/session/llm.ts:339-358``
++ ``packages/opencode/src/tool/invalid.ts``. LangChain has no equivalent:
+:class:`deepagents.middleware.PatchToolCallsMiddleware` patches
+*dangling* tool calls (no matching ToolMessage) but does nothing about
+wrong names, and the model framework's default behavior on an unknown
+name is to crash the turn rather than route to a self-correction
+fallback.
+"""
+
+from __future__ import annotations
+
+import difflib
+import logging
+from typing import Any
+
+from langchain.agents.middleware.types import (
+    AgentMiddleware,
+    AgentState,
+    ContextT,
+    ResponseT,
+)
+from langchain_core.messages import AIMessage
+from langgraph.runtime import Runtime
+
+from app.agents.new_chat.tools.invalid_tool import INVALID_TOOL_NAME
+
+logger = logging.getLogger(__name__)
+
+
+def _coerce_existing_tool_call(call: Any) -> dict[str, Any]:
+    """Normalize a tool call entry to a mutable dict."""
+    if isinstance(call, dict):
+        return dict(call)
+    return {
+        "name": getattr(call, "name", None),
+        "args": getattr(call, "args", {}),
+        "id": getattr(call, "id", None),
+        "type": "tool_call",
+    }
+
+
+class ToolCallNameRepairMiddleware(
+    AgentMiddleware[AgentState[ResponseT], ContextT, ResponseT]
+):
+    """Two-stage tool-name repair on the most recent ``AIMessage``.
+
+    Args:
+        registered_tool_names: Set of canonically-registered tool names.
+            ``invalid`` should be in this set so the fallback dispatches.
+        fuzzy_match_threshold: Optional ``difflib`` ratio (0-1) for the
+            fuzzy-match step that runs *between* lowercase and invalid.
+            Set to ``None`` to disable fuzzy matching (default in
+            OpenCode; we mirror that to avoid silent rewrites).
+    """
+
+    def __init__(
+        self,
+        *,
+        registered_tool_names: set[str],
+        fuzzy_match_threshold: float | None = 0.85,
+    ) -> None:
+        super().__init__()
+        self._registered = set(registered_tool_names)
+        self._registered_lower = {name.lower(): name for name in self._registered}
+        self._fuzzy_threshold = fuzzy_match_threshold
+        self.tools = []
+
+    def _registered_for_runtime(self, runtime: Runtime[ContextT]) -> set[str]:
+        """Allow runtime overrides to expand the set (e.g. dynamic MCP tools)."""
+        ctx_tools = getattr(runtime.context, "registered_tool_names", None)
+        if isinstance(ctx_tools, set | frozenset):
+            return self._registered | set(ctx_tools)
+        if isinstance(ctx_tools, list | tuple):
+            return self._registered | set(ctx_tools)
+        return self._registered
+
+    def _repair_one(
+        self,
+        call: dict[str, Any],
+        registered: set[str],
+    ) -> dict[str, Any]:
+        name = call.get("name")
+        if not isinstance(name, str):
+            return call
+
+        if name in registered:
+            return call
+
+        # Stage 1 — lowercase
+        lowered = name.lower()
+        if lowered in registered:
+            call["name"] = lowered
+            metadata = dict(call.get("response_metadata") or {})
+            metadata.setdefault("repair", "lowercase")
+            call["response_metadata"] = metadata
+            return call
+
+        # Optional fuzzy step (off by default — see class docstring)
+        if self._fuzzy_threshold is not None:
+            close = difflib.get_close_matches(
+                name, registered, n=1, cutoff=self._fuzzy_threshold
+            )
+            if close:
+                call["name"] = close[0]
+                metadata = dict(call.get("response_metadata") or {})
+                metadata.setdefault("repair", f"fuzzy:{name}->{close[0]}")
+                call["response_metadata"] = metadata
+                return call
+
+        # Stage 2 — invalid fallback
+        if INVALID_TOOL_NAME in registered:
+            original_args = call.get("args") or {}
+            error_msg = (
+                f"Tool name '{name}' is not registered. "
+                f"Original arguments were: {original_args!r}."
+            )
+            call["name"] = INVALID_TOOL_NAME
+            call["args"] = {"tool": name, "error": error_msg}
+            metadata = dict(call.get("response_metadata") or {})
+            metadata.setdefault("repair", f"invalid_fallback:{name}")
+            call["response_metadata"] = metadata
+        else:
+            logger.warning(
+                "Could not repair unknown tool call %r; 'invalid' tool not registered",
+                name,
+            )
+        return call
+
+    def _maybe_repair(
+        self,
+        message: AIMessage,
+        registered: set[str],
+    ) -> AIMessage | None:
+        if not message.tool_calls:
+            return None
+
+        new_calls: list[dict[str, Any]] = []
+        any_changed = False
+        for raw in message.tool_calls:
+            call = _coerce_existing_tool_call(raw)
+            before = (call.get("name"), call.get("args"))
+            repaired = self._repair_one(call, registered)
+            after = (repaired.get("name"), repaired.get("args"))
+            if before != after:
+                any_changed = True
+            new_calls.append(repaired)
+
+        if not any_changed:
+            return None
+
+        return message.model_copy(update={"tool_calls": new_calls})
+
+    def after_model(  # type: ignore[override]
+        self,
+        state: AgentState[ResponseT],
+        runtime: Runtime[ContextT],
+    ) -> dict[str, Any] | None:
+        messages = state.get("messages") or []
+        if not messages:
+            return None
+        last = messages[-1]
+        if not isinstance(last, AIMessage):
+            return None
+
+        registered = self._registered_for_runtime(runtime)
+        repaired = self._maybe_repair(last, registered)
+        if repaired is None:
+            return None
+        return {"messages": [repaired]}
+
+    async def aafter_model(  # type: ignore[override]
+        self,
+        state: AgentState[ResponseT],
+        runtime: Runtime[ContextT],
+    ) -> dict[str, Any] | None:
+        return self.after_model(state, runtime)
+
+
+__all__ = [
+    "ToolCallNameRepairMiddleware",
+]
diff --git a/surfsense_backend/app/agents/new_chat/path_resolver.py b/surfsense_backend/app/agents/new_chat/path_resolver.py
new file mode 100644
index 0000000000..861f48ee74
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/path_resolver.py
@@ -0,0 +1,351 @@
+"""Canonical virtual-path resolver for SurfSense knowledge-base documents.
+
+This module is the single source of truth for mapping ``Document`` rows to
+virtual paths under ``/documents/`` and back. It is used by:
+
+* :class:`KnowledgeTreeMiddleware` (rendering the workspace tree)
+* :class:`KnowledgePriorityMiddleware` (computing priority paths)
+* :class:`KBPostgresBackend` (``als_info`` / ``aread`` / move operations)
+* :class:`KnowledgeBasePersistenceMiddleware` (resolving moves and creates)
+
+Centralising the logic ensures that title-collision suffixes, folder paths,
+and ``unique_identifier_hash`` lookups never drift between renders and
+commits.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentType, Folder
+from app.utils.document_converters import generate_unique_identifier_hash
+
+DOCUMENTS_ROOT = "/documents"
+"""Root virtual folder for all KB documents."""
+
+_INVALID_FILENAME_CHARS = re.compile(r"[\\/:*?\"<>|]+")
+_WHITESPACE_RUN = re.compile(r"\s+")
+
+
+def safe_filename(value: str, *, fallback: str = "untitled.xml") -> str:
+    """Convert arbitrary text into a filesystem-safe ``.xml`` filename."""
+    name = _INVALID_FILENAME_CHARS.sub("_", value).strip()
+    name = _WHITESPACE_RUN.sub(" ", name)
+    if not name:
+        name = fallback
+    if len(name) > 180:
+        name = name[:180].rstrip()
+    if not name.lower().endswith(".xml"):
+        name = f"{name}.xml"
+    return name
+
+
+def safe_folder_segment(value: str, *, fallback: str = "folder") -> str:
+    """Sanitize a single folder name into a path-safe segment."""
+    name = _INVALID_FILENAME_CHARS.sub("_", value).strip()
+    name = _WHITESPACE_RUN.sub(" ", name)
+    if not name:
+        return fallback
+    if len(name) > 180:
+        name = name[:180].rstrip()
+    return name
+
+
+def _suffix_with_doc_id(filename: str, doc_id: int | None) -> str:
+    if doc_id is None:
+        return filename
+    if not filename.lower().endswith(".xml"):
+        return f"{filename} ({doc_id}).xml"
+    stem = filename[:-4]
+    return f"{stem} ({doc_id}).xml"
+
+
+_SUFFIX_PATTERN = re.compile(r"\s\((\d+)\)\.xml$", re.IGNORECASE)
+
+
+def parse_doc_id_suffix(filename: str) -> tuple[str, int | None]:
+    """Strip a trailing ``" (<doc_id>).xml"`` suffix; return ``(stem, doc_id)``.
+
+    If no suffix is present, returns ``(stem_without_xml_extension, None)``.
+    """
+    match = _SUFFIX_PATTERN.search(filename)
+    if match:
+        doc_id = int(match.group(1))
+        stem = filename[: match.start()]
+        return stem, doc_id
+    if filename.lower().endswith(".xml"):
+        return filename[:-4], None
+    return filename, None
+
+
+@dataclass
+class PathIndex:
+    """In-memory occupancy snapshot used by :func:`doc_to_virtual_path`.
+
+    Built once per call site so collision handling is deterministic and so
+    we don't perform N folder lookups per render.
+    """
+
+    folder_paths: dict[int, str] = field(default_factory=dict)
+    """``Folder.id`` -> absolute virtual folder path under ``/documents``."""
+
+    occupants: dict[str, int] = field(default_factory=dict)
+    """virtual path -> ``Document.id`` already occupying that path (this render)."""
+
+
+async def _build_folder_paths(
+    session: AsyncSession,
+    search_space_id: int,
+) -> dict[int, str]:
+    """Compute ``Folder.id`` -> absolute virtual path under ``/documents``."""
+    result = await session.execute(
+        select(Folder.id, Folder.name, Folder.parent_id).where(
+            Folder.search_space_id == search_space_id
+        )
+    )
+    rows = result.all()
+    by_id = {row.id: {"name": row.name, "parent_id": row.parent_id} for row in rows}
+    cache: dict[int, str] = {}
+
+    def resolve(folder_id: int) -> str:
+        if folder_id in cache:
+            return cache[folder_id]
+        parts: list[str] = []
+        cursor: int | None = folder_id
+        visited: set[int] = set()
+        while cursor is not None and cursor in by_id and cursor not in visited:
+            visited.add(cursor)
+            entry = by_id[cursor]
+            parts.append(safe_folder_segment(str(entry["name"])))
+            cursor = entry["parent_id"]
+        parts.reverse()
+        path = f"{DOCUMENTS_ROOT}/" + "/".join(parts) if parts else DOCUMENTS_ROOT
+        cache[folder_id] = path
+        return path
+
+    for folder_id in by_id:
+        resolve(folder_id)
+    return cache
+
+
+async def build_path_index(
+    session: AsyncSession,
+    search_space_id: int,
+    *,
+    populate_occupants: bool = True,
+) -> PathIndex:
+    """Build a :class:`PathIndex` for a search space.
+
+    ``populate_occupants`` controls whether the occupancy map is pre-seeded
+    from existing ``Document`` rows. Most callers want this so that
+    :func:`doc_to_virtual_path` can detect collisions across the whole space;
+    the persistence middleware sets this to ``False`` when it is iterating to
+    decide where to place fresh documents.
+    """
+    folder_paths = await _build_folder_paths(session, search_space_id)
+    occupants: dict[str, int] = {}
+    if populate_occupants:
+        rows = await session.execute(
+            select(Document.id, Document.title, Document.folder_id).where(
+                Document.search_space_id == search_space_id,
+            )
+        )
+        for row in rows.all():
+            base = folder_paths.get(row.folder_id, DOCUMENTS_ROOT)
+            filename = safe_filename(str(row.title or "untitled"))
+            path = f"{base}/{filename}"
+            if path in occupants and occupants[path] != row.id:
+                path = f"{base}/{_suffix_with_doc_id(filename, row.id)}"
+            occupants[path] = row.id
+    return PathIndex(folder_paths=folder_paths, occupants=occupants)
+
+
+def doc_to_virtual_path(
+    *,
+    doc_id: int | None,
+    title: str,
+    folder_id: int | None,
+    index: PathIndex,
+) -> str:
+    """Return the canonical virtual path for a document.
+
+    Mutates ``index.occupants`` so subsequent calls see this assignment and
+    deterministically pick a different suffix for the next colliding doc.
+    """
+    base = index.folder_paths.get(folder_id, DOCUMENTS_ROOT)
+    filename = safe_filename(str(title or "untitled"))
+    path = f"{base}/{filename}"
+    occupant = index.occupants.get(path)
+    if occupant is not None and occupant != doc_id:
+        path = f"{base}/{_suffix_with_doc_id(filename, doc_id)}"
+    if doc_id is not None:
+        index.occupants[path] = doc_id
+    return path
+
+
+async def virtual_path_to_doc(
+    session: AsyncSession,
+    *,
+    search_space_id: int,
+    virtual_path: str,
+) -> Document | None:
+    """Resolve a virtual path back to a ``Document`` row.
+
+    Resolution order:
+    1. ``Document.unique_identifier_hash`` lookup (fast path for paths created
+       by SurfSense itself — every NOTE write goes through this hash).
+    2. If the basename carries a ``" (<doc_id>).xml"`` disambiguation suffix,
+       try a direct id lookup constrained to the search space.
+    3. Title-from-basename + folder-resolution lookup as a last resort.
+    """
+    if not virtual_path or not virtual_path.startswith(DOCUMENTS_ROOT):
+        return None
+
+    unique_hash = generate_unique_identifier_hash(
+        DocumentType.NOTE,
+        virtual_path,
+        search_space_id,
+    )
+    result = await session.execute(
+        select(Document).where(
+            Document.search_space_id == search_space_id,
+            Document.unique_identifier_hash == unique_hash,
+        )
+    )
+    document = result.scalar_one_or_none()
+    if document is not None:
+        return document
+
+    rel = virtual_path[len(DOCUMENTS_ROOT) :].lstrip("/")
+    if not rel:
+        return None
+    parts = [p for p in rel.split("/") if p]
+    if not parts:
+        return None
+    basename = parts[-1]
+    folder_parts = parts[:-1]
+
+    stem, suffix_doc_id = parse_doc_id_suffix(basename)
+    if suffix_doc_id is not None:
+        result = await session.execute(
+            select(Document).where(
+                Document.search_space_id == search_space_id,
+                Document.id == suffix_doc_id,
+            )
+        )
+        document = result.scalar_one_or_none()
+        if document is not None:
+            return document
+
+    folder_id = await _resolve_folder_id(
+        session, search_space_id=search_space_id, folder_parts=folder_parts
+    )
+    title_candidates: list[str] = []
+    raw_title = stem
+    title_candidates.append(raw_title)
+    if raw_title.endswith(".xml"):
+        title_candidates.append(raw_title[:-4])
+
+    for candidate in dict.fromkeys(title_candidates):
+        if not candidate:
+            continue
+        query = select(Document).where(
+            Document.search_space_id == search_space_id,
+            Document.title == candidate,
+        )
+        if folder_id is None:
+            query = query.where(Document.folder_id.is_(None))
+        else:
+            query = query.where(Document.folder_id == folder_id)
+        result = await session.execute(query)
+        document = result.scalars().first()
+        if document is not None:
+            return document
+
+    # Fallback: title-as-string lookup misses when the real DB title contains
+    # characters that ``safe_filename`` lossily replaces (``:``, ``/``, ``*``,
+    # etc.) — common for connector-imported docs (Google Calendar/Drive etc.).
+    # The workspace tree shows the lossy filename, so the agent passes that
+    # filename back here. Scan all documents in the resolved folder and match
+    # by ``safe_filename(title)`` to recover the original document.
+    folder_scan = select(Document).where(
+        Document.search_space_id == search_space_id,
+    )
+    if folder_id is None:
+        folder_scan = folder_scan.where(Document.folder_id.is_(None))
+    else:
+        folder_scan = folder_scan.where(Document.folder_id == folder_id)
+    result = await session.execute(folder_scan)
+    for candidate_doc in result.scalars().all():
+        encoded = safe_filename(str(candidate_doc.title or "untitled"))
+        if encoded == basename:
+            return candidate_doc
+    return None
+
+
+async def _resolve_folder_id(
+    session: AsyncSession,
+    *,
+    search_space_id: int,
+    folder_parts: list[str],
+) -> int | None:
+    """Look up the leaf folder id for a chain of folder names; return ``None`` if missing."""
+    if not folder_parts:
+        return None
+    parent_id: int | None = None
+    for raw in folder_parts:
+        name = safe_folder_segment(raw)
+        query = select(Folder.id).where(
+            Folder.search_space_id == search_space_id,
+            Folder.name == name,
+        )
+        if parent_id is None:
+            query = query.where(Folder.parent_id.is_(None))
+        else:
+            query = query.where(Folder.parent_id == parent_id)
+        result = await session.execute(query)
+        row = result.first()
+        if row is None:
+            return None
+        parent_id = row[0]
+    return parent_id
+
+
+def parse_documents_path(virtual_path: str) -> tuple[list[str], str]:
+    """Parse a ``/documents/...`` path into ``(folder_parts, document_title)``.
+
+    The title has any ``.xml`` extension and trailing ``" (<doc_id>)"``
+    disambiguation suffix stripped.
+    """
+    if not virtual_path or not virtual_path.startswith(DOCUMENTS_ROOT):
+        return [], ""
+    rel = virtual_path[len(DOCUMENTS_ROOT) :].strip("/")
+    if not rel:
+        return [], ""
+    parts = [p for p in rel.split("/") if p]
+    if not parts:
+        return [], ""
+    folder_parts = parts[:-1]
+    basename = parts[-1]
+    stem, _ = parse_doc_id_suffix(basename)
+    title = stem
+    if title.endswith(".xml"):
+        title = title[:-4]
+    return folder_parts, title
+
+
+__all__ = [
+    "DOCUMENTS_ROOT",
+    "PathIndex",
+    "build_path_index",
+    "doc_to_virtual_path",
+    "parse_doc_id_suffix",
+    "parse_documents_path",
+    "safe_filename",
+    "safe_folder_segment",
+    "virtual_path_to_doc",
+]
diff --git a/surfsense_backend/app/agents/new_chat/permissions.py b/surfsense_backend/app/agents/new_chat/permissions.py
new file mode 100644
index 0000000000..523deb11f9
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/permissions.py
@@ -0,0 +1,203 @@
+"""
+Wildcard pattern matching + rule evaluation for the SurfSense permission system.
+
+Ported from OpenCode's ``packages/opencode/src/permission/evaluate.ts`` and
+``packages/opencode/src/util/wildcard.ts``. LangChain has no rule-based
+permission evaluator, so we keep OpenCode's semantics intact:
+
+- ``Wildcard.match`` matches both the ``permission`` and the ``pattern``
+  fields of a rule against the requested ``(permission, pattern)`` pair.
+  ``*`` matches any segment, ``**`` matches across separators.
+- The evaluator runs ``findLast`` over the **flattened** list of rules
+  from all rulesets — last matching rule wins.
+- The default fallback is ``ask`` (NOT deny), matching OpenCode.
+- Multi-pattern requests AND together: if ANY pattern resolves to
+  ``deny``, the whole request is denied; if ANY needs ``ask``, an
+  interrupt is raised; only when all patterns ``allow`` does the
+  request proceed.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from typing import Literal
+
+RuleAction = Literal["allow", "deny", "ask"]
+
+
+@dataclass(frozen=True)
+class Rule:
+    """A single permission rule.
+
+    Attributes:
+        permission: A wildcard-matched permission identifier
+            (e.g. ``"edit"``, ``"linear_*"``, ``"mcp:*"``,
+            ``"doom_loop"``). Anchored at start AND end of the input.
+        pattern: A wildcard-matched pattern over the request payload
+            (e.g. ``"/documents/secrets/**"``, ``"page_id=123"``,
+            ``"*"``). Anchored at start AND end.
+        action: One of ``"allow"`` / ``"deny"`` / ``"ask"``.
+    """
+
+    permission: str
+    pattern: str
+    action: RuleAction
+
+
+@dataclass
+class Ruleset:
+    """A list of rules with an associated origin used for debugging."""
+
+    rules: list[Rule] = field(default_factory=list)
+    origin: str = "unknown"  # e.g. "defaults", "global", "space", "thread", "runtime"
+
+
+# -----------------------------------------------------------------------------
+# Wildcard matcher
+# -----------------------------------------------------------------------------
+
+
+_GLOB_TOKEN = re.compile(r"\*\*|\*|[^*]+")
+
+
+def _wildcard_to_regex(pattern: str) -> re.Pattern[str]:
+    """Translate an opencode-style wildcard pattern to a compiled regex.
+
+    Rules:
+    - ``**`` matches any sequence of any characters (including separators).
+    - ``*`` matches any sequence of characters that does **not** include
+      the path separator ``/`` — same as glob.
+    - All other characters match literally.
+    - The pattern is anchored at both ends (``^...$``).
+    """
+    parts: list[str] = ["^"]
+    for token in _GLOB_TOKEN.findall(pattern):
+        if token == "**":
+            parts.append(r".*")
+        elif token == "*":
+            parts.append(r"[^/]*")
+        else:
+            parts.append(re.escape(token))
+    parts.append("$")
+    return re.compile("".join(parts))
+
+
+_REGEX_CACHE: dict[str, re.Pattern[str]] = {}
+
+
+def wildcard_match(value: str, pattern: str) -> bool:
+    """Return True if ``value`` matches the wildcard ``pattern``.
+
+    Special case: a bare ``"*"`` pattern matches any value, including
+    those containing ``/`` separators. This mirrors opencode's
+    ``Wildcard.match`` short-circuit and matches the convention that
+    ``pattern="*"`` means "any pattern" in permission rules.
+    """
+    if pattern == "*":
+        return True
+    compiled = _REGEX_CACHE.get(pattern)
+    if compiled is None:
+        compiled = _wildcard_to_regex(pattern)
+        _REGEX_CACHE[pattern] = compiled
+    return compiled.match(value) is not None
+
+
+# -----------------------------------------------------------------------------
+# Evaluator
+# -----------------------------------------------------------------------------
+
+
+def evaluate(
+    permission: str,
+    pattern: str,
+    *rulesets: Ruleset | Iterable[Rule],
+) -> Rule:
+    """Find the last rule matching ``(permission, pattern)`` from ``rulesets``.
+
+    Mirrors opencode ``permission/evaluate.ts:9-15`` precisely:
+    - Flatten rulesets in argument order.
+    - Walk the flat list **in reverse**.
+    - First reverse-match wins (i.e. the last specified rule wins).
+    - When no rule matches, default to ``Rule(permission, "*", "ask")``.
+
+    Args:
+        permission: The permission identifier being requested
+            (e.g. tool name, ``"edit"``, ``"doom_loop"``).
+        pattern: The request-specific pattern (e.g. file path,
+            primary arg value). Use ``"*"`` when no specific pattern
+            applies.
+        *rulesets: Layered rulesets, applied earliest to latest. Later
+            rulesets override earlier ones.
+
+    Returns:
+        The matched :class:`Rule`, or the default ask fallback.
+    """
+    flat: list[Rule] = []
+    for rs in rulesets:
+        if isinstance(rs, Ruleset):
+            flat.extend(rs.rules)
+        else:
+            flat.extend(rs)
+
+    for rule in reversed(flat):
+        if wildcard_match(permission, rule.permission) and wildcard_match(
+            pattern, rule.pattern
+        ):
+            return rule
+
+    return Rule(permission=permission, pattern="*", action="ask")
+
+
+def evaluate_many(
+    permission: str,
+    patterns: Iterable[str],
+    *rulesets: Ruleset | Iterable[Rule],
+) -> list[Rule]:
+    """Evaluate ``permission`` against each of ``patterns`` (multi-pattern AND).
+
+    Returns the list of resolved rules in the same order as ``patterns``.
+    The caller is responsible for combining the results — opencode-style
+    multi-pattern AND collapses ``deny`` first, then ``ask``, then
+    ``allow``.
+    """
+    return [evaluate(permission, p, *rulesets) for p in patterns]
+
+
+def aggregate_action(rules: Iterable[Rule]) -> RuleAction:
+    """Collapse a list of per-pattern rules into one action.
+
+    Order:
+    1. If any rule is ``deny`` -> ``deny``.
+    2. Else if any rule is ``ask`` -> ``ask``.
+    3. Else if at least one rule is ``allow`` -> ``allow``.
+    4. Else (empty input) -> ``ask`` (safe default mirroring ``evaluate``).
+
+    Mirrors opencode's behavior in ``permission/index.ts:180-272``.
+    """
+    saw_ask = False
+    saw_allow = False
+    for rule in rules:
+        if rule.action == "deny":
+            return "deny"
+        if rule.action == "ask":
+            saw_ask = True
+        elif rule.action == "allow":
+            saw_allow = True
+    if saw_ask:
+        return "ask"
+    if saw_allow:
+        return "allow"
+    return "ask"
+
+
+__all__ = [
+    "Rule",
+    "RuleAction",
+    "Ruleset",
+    "aggregate_action",
+    "evaluate",
+    "evaluate_many",
+    "wildcard_match",
+]
diff --git a/surfsense_backend/app/agents/new_chat/plugin_loader.py b/surfsense_backend/app/agents/new_chat/plugin_loader.py
new file mode 100644
index 0000000000..c52620d40c
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/plugin_loader.py
@@ -0,0 +1,158 @@
+"""Entry-point based plugin loader for SurfSense agent middleware.
+
+LangChain's :class:`AgentMiddleware` ABC already covers the practical
+surface most plugins need (``before_agent`` / ``before_model`` /
+``wrap_tool_call`` / their async counterparts), so a SurfSense-specific
+plugin protocol would be redundant. We just need a way to discover and
+admit third-party middleware safely.
+
+A plugin is therefore just an installable Python package that registers a
+factory callable under the ``surfsense.plugins`` entry-point group:
+
+.. code-block:: toml
+
+    # in a plugin package's pyproject.toml
+    [project.entry-points."surfsense.plugins"]
+    year_substituter = "my_plugin:make_middleware"
+
+The factory has the signature ``Callable[[PluginContext], AgentMiddleware]``.
+It receives a small, sanitized :class:`PluginContext` with the IDs and the
+LLM the plugin is allowed to talk to — and **never** raw secrets, DB
+sessions, or other connectors.
+
+## Trust model
+
+Plugins are loaded **only if** their entry-point ``name`` appears in
+``allowed_plugins`` (admin-controlled, sourced from
+``global_llm_config.yaml`` or :func:`load_allowed_plugin_names_from_env`).
+There is **no env-driven auto-load**. A plugin failure is logged and
+isolated; it does not break agent construction.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from collections.abc import Iterable
+from importlib.metadata import entry_points
+from typing import TYPE_CHECKING
+
+from langchain.agents.middleware import AgentMiddleware
+
+if TYPE_CHECKING:  # pragma: no cover - type-only
+    from langchain_core.language_models import BaseChatModel
+
+    from app.db import ChatVisibility
+
+
+logger = logging.getLogger(__name__)
+
+
+PLUGIN_ENTRY_POINT_GROUP = "surfsense.plugins"
+
+
+class PluginContext(dict):
+    """Sanitized DI bag handed to each plugin factory.
+
+    Backed by ``dict`` so plugins can inspect the keys they care about
+    without coupling to a concrete dataclass shape. Required keys:
+
+    * ``search_space_id`` (int)
+    * ``user_id`` (str | None)
+    * ``thread_visibility`` (:class:`app.db.ChatVisibility`)
+    * ``llm`` (:class:`langchain_core.language_models.BaseChatModel`)
+
+    The context **never** carries DB sessions, raw secrets, or other
+    connectors. If a future plugin genuinely needs DB access, that
+    integration goes through a rate-limited service interface, not
+    through this bag.
+    """
+
+    @classmethod
+    def build(
+        cls,
+        *,
+        search_space_id: int,
+        user_id: str | None,
+        thread_visibility: ChatVisibility,
+        llm: BaseChatModel,
+    ) -> PluginContext:
+        return cls(
+            search_space_id=search_space_id,
+            user_id=user_id,
+            thread_visibility=thread_visibility,
+            llm=llm,
+        )
+
+
+def load_plugin_middlewares(
+    ctx: PluginContext,
+    allowed_plugin_names: Iterable[str],
+) -> list[AgentMiddleware]:
+    """Discover, allowlist-filter, and instantiate plugin middleware.
+
+    For each entry-point in :data:`PLUGIN_ENTRY_POINT_GROUP` whose name is
+    in ``allowed_plugin_names``, load the factory and call it with ``ctx``.
+    The factory's return value must be an :class:`AgentMiddleware` instance;
+    anything else is logged and skipped.
+
+    Errors are isolated — a plugin that raises during ``ep.load()`` or
+    factory invocation is logged at ``ERROR`` and ignored. Agent
+    construction continues with whatever plugins did succeed.
+    """
+    allowed = {name for name in allowed_plugin_names if name}
+    if not allowed:
+        return []
+
+    out: list[AgentMiddleware] = []
+    try:
+        eps = entry_points(group=PLUGIN_ENTRY_POINT_GROUP)
+    except Exception:  # pragma: no cover - defensive (entry_points is robust)
+        logger.exception("Failed to enumerate plugin entry points")
+        return []
+
+    for ep in eps:
+        if ep.name not in allowed:
+            logger.info("Skipping non-allowlisted plugin %s", ep.name)
+            continue
+        try:
+            factory = ep.load()
+        except Exception:
+            logger.exception("Failed to load plugin %s", ep.name)
+            continue
+        try:
+            mw = factory(ctx)
+        except Exception:
+            logger.exception("Plugin %s factory raised", ep.name)
+            continue
+        if not isinstance(mw, AgentMiddleware):
+            logger.warning(
+                "Plugin %s returned %s, expected AgentMiddleware; skipping",
+                ep.name,
+                type(mw).__name__,
+            )
+            continue
+        out.append(mw)
+        logger.info("Loaded plugin %s as %s", ep.name, type(mw).__name__)
+    return out
+
+
+def load_allowed_plugin_names_from_env() -> set[str]:
+    """Read ``SURFSENSE_ALLOWED_PLUGINS`` (comma-separated) into a set.
+
+    Provided as a thin convenience for deployments that don't surface plugins
+    through ``global_llm_config.yaml`` yet. Whitespace is stripped and empty
+    entries are dropped.
+    """
+    raw = os.environ.get("SURFSENSE_ALLOWED_PLUGINS", "").strip()
+    if not raw:
+        return set()
+    return {token.strip() for token in raw.split(",") if token.strip()}
+
+
+__all__ = [
+    "PLUGIN_ENTRY_POINT_GROUP",
+    "PluginContext",
+    "load_allowed_plugin_names_from_env",
+    "load_plugin_middlewares",
+]
diff --git a/surfsense_backend/app/agents/new_chat/plugins/__init__.py b/surfsense_backend/app/agents/new_chat/plugins/__init__.py
new file mode 100644
index 0000000000..cef6bd367c
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/plugins/__init__.py
@@ -0,0 +1,6 @@
+"""Reference plugins bundled with SurfSense.
+
+These plugins are intentionally small and demonstrative. They are NOT
+auto-loaded — they ship as examples that a deployment can opt into via
+``global_llm_config.yaml`` or ``SURFSENSE_ALLOWED_PLUGINS``.
+"""
diff --git a/surfsense_backend/app/agents/new_chat/plugins/year_substituter.py b/surfsense_backend/app/agents/new_chat/plugins/year_substituter.py
new file mode 100644
index 0000000000..2b7781b909
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/plugins/year_substituter.py
@@ -0,0 +1,88 @@
+"""Reference plugin: substitute ``{{year}}`` in tool descriptions.
+
+Demonstrates the :meth:`AgentMiddleware.awrap_tool_call` hook -- the
+plugin sees every tool invocation and can rewrite the request *or* the
+result. This particular plugin is read-only and only transforms the
+*description* the user might see in error messages (no request
+mutation).
+
+The plugin is built as a factory function so the entry-point loader can
+inject :class:`PluginContext` (containing the agent's LLM, search-space
+ID, etc.). The factory signature
+``Callable[[PluginContext], AgentMiddleware]`` is the only contract --
+SurfSense doesn't define a custom plugin protocol on top of LangChain's
+:class:`AgentMiddleware`.
+
+Wire-up in ``pyproject.toml`` (illustrative; the in-repo plugin doesn't
+need this -- it's already on the import path)::
+
+    [project.entry-points."surfsense.plugins"]
+    year_substituter = "app.agents.new_chat.plugins.year_substituter:make_middleware"
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Awaitable, Callable
+from datetime import UTC, datetime
+from typing import TYPE_CHECKING, Any
+
+from langchain.agents.middleware import AgentMiddleware
+
+if TYPE_CHECKING:  # pragma: no cover - type-only
+    from langchain.agents.middleware.types import ToolCallRequest
+    from langchain_core.messages import ToolMessage
+    from langgraph.types import Command
+
+    from app.agents.new_chat.plugin_loader import PluginContext
+
+
+logger = logging.getLogger(__name__)
+
+
+class _YearSubstituterMiddleware(AgentMiddleware):
+    """Replace ``{{year}}`` in the result text with the current UTC year."""
+
+    tools = ()
+
+    def __init__(self, year: int | None = None) -> None:
+        super().__init__()
+        self._year = str(year if year is not None else datetime.now(UTC).year)
+
+    async def awrap_tool_call(
+        self,
+        request: ToolCallRequest,
+        handler: Callable[[ToolCallRequest], Awaitable[ToolMessage | Command[Any]]],
+    ) -> ToolMessage | Command[Any]:
+        result = await handler(request)
+        try:
+            from langchain_core.messages import ToolMessage
+
+            if (
+                isinstance(result, ToolMessage)
+                and isinstance(result.content, str)
+                and "{{year}}" in result.content
+            ):
+                new_text = result.content.replace("{{year}}", self._year)
+                result = ToolMessage(
+                    content=new_text,
+                    tool_call_id=result.tool_call_id,
+                    id=result.id,
+                    name=result.name,
+                    status=result.status,
+                    artifact=result.artifact,
+                )
+        except Exception:  # pragma: no cover - defensive
+            logger.exception("year_substituter plugin failed; passing original result")
+        return result
+
+
+def make_middleware(ctx: PluginContext) -> AgentMiddleware:
+    """Plugin factory used by :func:`load_plugin_middlewares`."""
+    # Plugin is intentionally small so it has no state to threading-protect
+    # and ignores ``ctx`` beyond demonstrating that the loader passes it in.
+    _ = ctx
+    return _YearSubstituterMiddleware()
+
+
+__all__ = ["make_middleware"]
diff --git a/surfsense_backend/app/agents/new_chat/prompt_caching.py b/surfsense_backend/app/agents/new_chat/prompt_caching.py
new file mode 100644
index 0000000000..9fe47cdace
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompt_caching.py
@@ -0,0 +1,188 @@
+r"""LiteLLM-native prompt caching configuration for SurfSense agents.
+
+Replaces the legacy ``AnthropicPromptCachingMiddleware`` (which never
+activated for our LiteLLM-based stack — its ``isinstance(model, ChatAnthropic)``
+gate always failed) with LiteLLM's universal caching mechanism.
+
+Coverage:
+
+- Marker-based providers (need ``cache_control`` injection, which LiteLLM
+  performs automatically when ``cache_control_injection_points`` is set):
+  ``anthropic/``, ``bedrock/``, ``vertex_ai/``, ``gemini/``, ``azure_ai/``,
+  ``openrouter/`` (Claude/Gemini/MiniMax/GLM/z-ai routes), ``databricks/``
+  (Claude), ``dashscope/`` (Qwen), ``minimax/``, ``zai/`` (GLM).
+- Auto-cached (LiteLLM strips the marker silently): ``openai/``,
+  ``deepseek/``, ``xai/`` — these caches automatically for prompts ≥1024
+  tokens and surface ``prompt_cache_key`` / ``prompt_cache_retention``.
+
+We inject **two** breakpoints per request:
+
+- ``index: 0`` — pins the SurfSense system prompt at the head of the
+  request (provider variant, citation rules, tool catalog, KB tree,
+  skills metadata). The langchain agent factory always prepends
+  ``request.system_message`` at index 0 (see ``factory.py``
+  ``_execute_model_async``), so this targets exactly the main system
+  prompt regardless of how many other ``SystemMessage``\ s the
+  ``before_agent`` injectors (priority, tree, memory, file-intent,
+  anonymous-doc) have inserted into ``state["messages"]``. Using
+  ``role: system`` here would apply ``cache_control`` to **every**
+  system-role message and trip Anthropic's hard cap of 4 cache
+  breakpoints per request once the conversation accumulates enough
+  injected system messages — which surfaces as the upstream 400
+  ``A maximum of 4 blocks with cache_control may be provided. Found N``
+  via OpenRouter→Anthropic.
+- ``index: -1`` — pins the latest message so multi-turn savings compound:
+  Anthropic-family providers use longest-matching-prefix lookup, so turn
+  N+1 still reads turn N's cache up to the shared prefix.
+
+For OpenAI-family configs we additionally pass:
+
+- ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that
+  raises hit rate by sending requests with a shared prefix to the same
+  backend.
+- ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default
+  5-10 min in-memory cache.
+
+Safety net: ``litellm.drop_params=True`` is set globally in
+``app.services.llm_service`` at module-load time. Any kwarg the destination
+provider doesn't recognise is auto-stripped at the provider transformer
+layer, so an OpenAI→Bedrock auto-mode fallback can't 400 on
+``prompt_cache_key`` etc.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+from langchain_core.language_models import BaseChatModel
+
+if TYPE_CHECKING:
+    from app.agents.new_chat.llm_config import AgentConfig
+
+logger = logging.getLogger(__name__)
+
+
+# Two-breakpoint policy: head-of-request + latest message. See module
+# docstring for rationale. Anthropic caps requests at 4 ``cache_control``
+# blocks; we use 2 here, leaving headroom for Phase-2 tool caching.
+#
+# IMPORTANT: ``index: 0`` (not ``role: system``). The deepagent stack's
+# ``before_agent`` middlewares (priority, tree, memory, file-intent,
+# anonymous-doc) insert ``SystemMessage`` instances into
+# ``state["messages"]`` that accumulate across turns. With
+# ``role: system`` the LiteLLM hook would tag *every* one of them with
+# ``cache_control`` and overflow Anthropic's 4-block limit. ``index: 0``
+# always targets the langchain-prepended ``request.system_message``
+# (which our ``FlattenSystemMessageMiddleware`` reduces to a single text
+# block), giving us exactly one stable cache breakpoint.
+_DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = (
+    {"location": "message", "index": 0},
+    {"location": "message", "index": -1},
+)
+
+# Providers (uppercase ``AgentConfig.provider`` values) that natively expose
+# OpenAI-style automatic prompt caching with ``prompt_cache_key`` and
+# ``prompt_cache_retention`` kwargs. Strict whitelist — many other providers
+# in ``PROVIDER_MAP`` route through litellm's ``openai`` prefix without
+# implementing the OpenAI prompt-cache surface (e.g. MOONSHOT, ZHIPU,
+# MINIMAX), so we can't infer family from the litellm prefix alone.
+_OPENAI_FAMILY_PROVIDERS: frozenset[str] = frozenset({"OPENAI", "DEEPSEEK", "XAI"})
+
+
+def _is_router_llm(llm: BaseChatModel) -> bool:
+    """Detect ``ChatLiteLLMRouter`` (auto-mode) without an eager import.
+
+    Importing ``app.services.llm_router_service`` at module-load time would
+    create a cycle via ``llm_config -> prompt_caching -> llm_router_service``.
+    Class-name comparison is sufficient since the class is defined in a
+    single place.
+    """
+    return type(llm).__name__ == "ChatLiteLLMRouter"
+
+
+def _is_openai_family_config(agent_config: AgentConfig | None) -> bool:
+    """Whether the config targets an OpenAI-style prompt-cache surface.
+
+    Strict — only returns True when the user explicitly chose OPENAI,
+    DEEPSEEK, or XAI as the provider in their ``NewLLMConfig`` /
+    ``YAMLConfig``. Auto-mode and custom providers return False because
+    we can't statically know the destination.
+    """
+    if agent_config is None or not agent_config.provider:
+        return False
+    if agent_config.is_auto_mode:
+        return False
+    if agent_config.custom_provider:
+        return False
+    return agent_config.provider.upper() in _OPENAI_FAMILY_PROVIDERS
+
+
+def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None:
+    """Return ``llm.model_kwargs`` as a writable dict, or ``None`` to bail.
+
+    Initialises the field to ``{}`` when present-but-None on a Pydantic v2
+    model. Returns ``None`` if the LLM type doesn't expose a writable
+    ``model_kwargs`` attribute (caller should treat as no-op).
+    """
+    model_kwargs = getattr(llm, "model_kwargs", None)
+    if isinstance(model_kwargs, dict):
+        return model_kwargs
+    try:
+        llm.model_kwargs = {}  # type: ignore[attr-defined]
+    except Exception:
+        return None
+    refreshed = getattr(llm, "model_kwargs", None)
+    return refreshed if isinstance(refreshed, dict) else None
+
+
+def apply_litellm_prompt_caching(
+    llm: BaseChatModel,
+    *,
+    agent_config: AgentConfig | None = None,
+    thread_id: int | None = None,
+) -> None:
+    """Configure LiteLLM prompt caching on a ChatLiteLLM/ChatLiteLLMRouter.
+
+    Idempotent — values already present in ``llm.model_kwargs`` (e.g. from
+    ``agent_config.litellm_params`` overrides) are preserved. Mutates
+    ``llm.model_kwargs`` in place; the kwargs flow to ``litellm.completion``
+    via ``ChatLiteLLM._default_params`` and via ``self.model_kwargs`` merge
+    in our custom ``ChatLiteLLMRouter``.
+
+    Args:
+        llm: ChatLiteLLM, SanitizedChatLiteLLM, or ChatLiteLLMRouter instance.
+        agent_config: Optional ``AgentConfig`` driving provider-specific
+            behaviour. When omitted (or auto-mode), only the universal
+            ``cache_control_injection_points`` are set.
+        thread_id: Optional thread id used to construct a per-thread
+            ``prompt_cache_key`` for OpenAI-family providers. Caching still
+            works without it (server-side automatic), but the key improves
+            backend routing affinity and therefore hit rate.
+    """
+    model_kwargs = _get_or_init_model_kwargs(llm)
+    if model_kwargs is None:
+        logger.debug(
+            "apply_litellm_prompt_caching: %s exposes no writable model_kwargs; skipping",
+            type(llm).__name__,
+        )
+        return
+
+    if "cache_control_injection_points" not in model_kwargs:
+        model_kwargs["cache_control_injection_points"] = [
+            dict(point) for point in _DEFAULT_INJECTION_POINTS
+        ]
+
+    # OpenAI-family extras only when we statically know the destination is
+    # OpenAI / DeepSeek / xAI. Auto-mode router fans out across providers
+    # so we can't safely set OpenAI-only kwargs there (drop_params would
+    # strip them but it's wasteful to set them in the first place).
+    if _is_router_llm(llm):
+        return
+    if not _is_openai_family_config(agent_config):
+        return
+
+    if thread_id is not None and "prompt_cache_key" not in model_kwargs:
+        model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}"
+    if "prompt_cache_retention" not in model_kwargs:
+        model_kwargs["prompt_cache_retention"] = "24h"
diff --git a/surfsense_backend/app/agents/new_chat/prompts/__init__.py b/surfsense_backend/app/agents/new_chat/prompts/__init__.py
new file mode 100644
index 0000000000..c91bb8a0bc
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/__init__.py
@@ -0,0 +1,7 @@
+"""SurfSense agent prompt fragments.
+
+The prompt is composed at runtime by :mod:`composer` from the markdown
+fragments under ``base/``, ``providers/``, ``tools/``, ``examples/``, and
+``routing/``. ``system_prompt.py`` is now a thin wrapper that delegates
+to :func:`composer.compose_system_prompt`.
+"""
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/__init__.py b/surfsense_backend/app/agents/new_chat/prompts/base/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/agent_private.md b/surfsense_backend/app/agents/new_chat/prompts/base/agent_private.md
new file mode 100644
index 0000000000..88554ad4e6
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/agent_private.md
@@ -0,0 +1,7 @@
+You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
+
+Today's date (UTC): {resolved_today}
+
+When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
+
+NEVER expose internal tool parameter names, backend IDs, or implementation details to the user. Always use natural, user-friendly language instead.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/agent_team.md b/surfsense_backend/app/agents/new_chat/prompts/base/agent_team.md
new file mode 100644
index 0000000000..5fd56ae1b3
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/agent_team.md
@@ -0,0 +1,9 @@
+You are SurfSense, a reasoning and acting AI agent designed to answer questions in this team space using the team's shared knowledge base.
+
+In this team thread, each message is prefixed with **[DisplayName of the author]**. Use this to attribute and reference the author of anything in the discussion (who asked a question, made a suggestion, or contributed an idea) and to cite who said what in your answers.
+
+Today's date (UTC): {resolved_today}
+
+When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
+
+NEVER expose internal tool parameter names, backend IDs, or implementation details to the user. Always use natural, user-friendly language instead.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/citations_off.md b/surfsense_backend/app/agents/new_chat/prompts/base/citations_off.md
new file mode 100644
index 0000000000..8288886e99
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/citations_off.md
@@ -0,0 +1,16 @@
+<citation_instructions>
+IMPORTANT: Citations are DISABLED for this configuration.
+
+DO NOT include any citations in your responses. Specifically:
+1. Do NOT use the [citation:chunk_id] format anywhere in your response.
+2. Do NOT reference document IDs, chunk IDs, or source IDs.
+3. Simply provide the information naturally without any citation markers.
+4. Write your response as if you're having a normal conversation, incorporating the information from your knowledge seamlessly.
+
+When answering questions based on documents from the knowledge base:
+- Present the information directly and confidently
+- Do not mention that information comes from specific documents or chunks
+- Integrate facts naturally into your response without attribution markers
+
+Your goal is to provide helpful, informative answers in a clean, readable format without any citation notation.
+</citation_instructions>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/citations_on.md b/surfsense_backend/app/agents/new_chat/prompts/base/citations_on.md
new file mode 100644
index 0000000000..56291bf3ed
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/citations_on.md
@@ -0,0 +1,90 @@
+<citation_instructions>
+CRITICAL CITATION REQUIREMENTS:
+
+1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
+2. Make sure ALL factual statements from the documents have proper citations.
+3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
+4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
+5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
+6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
+7. Do not return citations as clickable links.
+8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
+9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
+10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
+11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
+
+<document_structure_example>
+The documents you receive are structured like this:
+
+**Knowledge base documents (numeric chunk IDs):**
+<document>
+<document_metadata>
+  <document_id>42</document_id>
+  <document_type>GITHUB_CONNECTOR</document_type>
+  <title><![CDATA[Some repo / file / issue title]]></title>
+  <url><![CDATA[https://example.com]]></url>
+  <metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
+</document_metadata>
+
+<document_content>
+  <chunk id='123'><![CDATA[First chunk text...]]></chunk>
+  <chunk id='124'><![CDATA[Second chunk text...]]></chunk>
+</document_content>
+</document>
+
+**Web search results (URL chunk IDs):**
+<document>
+<document_metadata>
+  <document_type>WEB_SEARCH</document_type>
+  <title><![CDATA[Some web search result]]></title>
+  <url><![CDATA[https://example.com/article]]></url>
+</document_metadata>
+
+<document_content>
+  <chunk id='https://example.com/article'><![CDATA[Content from web search...]]></chunk>
+</document_content>
+</document>
+
+IMPORTANT: You MUST cite using the EXACT chunk ids from the `<chunk id='...'>` tags.
+- For knowledge base documents, chunk ids are numeric (e.g. 123, 124) or prefixed (e.g. doc-45).
+- For live web search results, chunk ids are URLs (e.g. https://example.com/article).
+Do NOT cite document_id. Always use the chunk id.
+</document_structure_example>
+
+<citation_format>
+- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
+- Citations should appear at the end of the sentence containing the information they support
+- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
+- No need to return references section. Just citations in answer.
+- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
+- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
+- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
+- Copy the EXACT chunk id from the XML - if it says `<chunk id='doc-123'>`, use [citation:doc-123]
+- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
+</citation_format>
+
+<citation_examples>
+CORRECT citation formats:
+- [citation:5] (numeric chunk ID from knowledge base)
+- [citation:doc-123] (for Surfsense documentation chunks)
+- [citation:https://example.com/article] (URL chunk ID from web search results)
+- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)
+
+INCORRECT citation formats (DO NOT use):
+- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
+- Using parentheses around brackets: ([citation:5])
+- Using hyperlinked text: [link to source 5](https://example.com)
+- Using footnote style: ... library¹
+- Making up source IDs when source_id is unknown
+- Using old IEEE format: [1], [2], [3]
+- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
+</citation_examples>
+
+<citation_output_example>
+Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
+
+According to web search results, the key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:https://docs.python.org/3/library/asyncio.html]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
+
+However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
+</citation_output_example>
+</citation_instructions>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/kb_only_policy_private.md b/surfsense_backend/app/agents/new_chat/prompts/base/kb_only_policy_private.md
new file mode 100644
index 0000000000..9cc767e7ed
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/kb_only_policy_private.md
@@ -0,0 +1,15 @@
+<knowledge_base_only_policy>
+CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
+- You MUST answer questions ONLY using information retrieved from the user's knowledge base, web search results, scraped webpages, or other tool outputs.
+- You MUST NOT answer factual or informational questions from your own training data or general knowledge unless the user explicitly grants permission.
+- If the knowledge base search returns no relevant results AND no other tool provides the answer, you MUST:
+  1. Inform the user that you could not find relevant information in their knowledge base.
+  2. Ask the user: "Would you like me to answer from my general knowledge instead?"
+  3. ONLY provide a general-knowledge answer AFTER the user explicitly says yes.
+- This policy does NOT apply to:
+  * Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
+  * Formatting, summarization, or analysis of content already present in the conversation
+  * Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
+  * Tool-usage actions like generating reports, podcasts, images, or scraping webpages
+  * Queries about services that have direct tools (Linear, ClickUp, Jira, Slack, Airtable) — see <tool_routing> below
+</knowledge_base_only_policy>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/kb_only_policy_team.md b/surfsense_backend/app/agents/new_chat/prompts/base/kb_only_policy_team.md
new file mode 100644
index 0000000000..1d806dbaeb
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/kb_only_policy_team.md
@@ -0,0 +1,15 @@
+<knowledge_base_only_policy>
+CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
+- You MUST answer questions ONLY using information retrieved from the team's shared knowledge base, web search results, scraped webpages, or other tool outputs.
+- You MUST NOT answer factual or informational questions from your own training data or general knowledge unless a team member explicitly grants permission.
+- If the knowledge base search returns no relevant results AND no other tool provides the answer, you MUST:
+  1. Inform the team that you could not find relevant information in the shared knowledge base.
+  2. Ask: "Would you like me to answer from my general knowledge instead?"
+  3. ONLY provide a general-knowledge answer AFTER a team member explicitly says yes.
+- This policy does NOT apply to:
+  * Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
+  * Formatting, summarization, or analysis of content already present in the conversation
+  * Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
+  * Tool-usage actions like generating reports, podcasts, images, or scraping webpages
+  * Queries about services that have direct tools (Linear, ClickUp, Jira, Slack, Airtable) — see <tool_routing> below
+</knowledge_base_only_policy>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/memory_protocol_private.md b/surfsense_backend/app/agents/new_chat/prompts/base/memory_protocol_private.md
new file mode 100644
index 0000000000..8f7da14f8e
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/memory_protocol_private.md
@@ -0,0 +1,6 @@
+<memory_protocol>
+IMPORTANT — After understanding each user message, ALWAYS check: does this message
+reveal durable facts about the user (role, interests, preferences, projects,
+background, or standing instructions)? If yes, you MUST call update_memory
+alongside your normal response — do not defer this to a later turn.
+</memory_protocol>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/memory_protocol_team.md b/surfsense_backend/app/agents/new_chat/prompts/base/memory_protocol_team.md
new file mode 100644
index 0000000000..61d89cc5de
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/memory_protocol_team.md
@@ -0,0 +1,6 @@
+<memory_protocol>
+IMPORTANT — After understanding each user message, ALWAYS check: does this message
+reveal durable facts about the team (decisions, conventions, architecture, processes,
+or key facts)? If yes, you MUST call update_memory alongside your normal response —
+do not defer this to a later turn.
+</memory_protocol>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/parameter_resolution.md b/surfsense_backend/app/agents/new_chat/prompts/base/parameter_resolution.md
new file mode 100644
index 0000000000..77be4d87c9
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/parameter_resolution.md
@@ -0,0 +1,39 @@
+<parameter_resolution>
+Some service tools require identifiers or context you do not have (account IDs,
+workspace names, channel IDs, project keys, etc.). NEVER ask the user for raw
+IDs or technical identifiers — they cannot memorise them.
+
+Instead, follow this discovery pattern:
+1. Call a listing/discovery tool to find available options.
+2. ONE result → use it silently, no question to the user.
+3. MULTIPLE results → present the options by their display names and let the
+   user choose. Never show raw UUIDs — always use friendly names.
+
+Discovery tools by level:
+- Which account/workspace? → get_connected_accounts("<service>")
+- Which Jira site (cloudId)? → getAccessibleAtlassianResources
+- Which Jira project?  → getVisibleJiraProjects (after resolving cloudId)
+- Which Jira issue type? → getJiraProjectIssueTypesMetadata (after resolving project)
+- Which channel?  → slack_search_channels
+- Which base?     → list_bases
+- Which table?    → list_tables_for_base (after resolving baseId)
+- Which task?     → clickup_search
+- Which issue?    → list_issues (Linear) or searchJiraIssuesUsingJql (Jira)
+
+For Jira specifically: ALWAYS call getAccessibleAtlassianResources first to
+obtain the cloudId, then pass it to other Jira tools. When creating an issue,
+chain: getAccessibleAtlassianResources → getVisibleJiraProjects → createJiraIssue.
+If there is only one option at each step, use it silently. If multiple, present
+friendly names.
+
+Chain discovery when needed — e.g. for Airtable records: list_bases → pick
+base → list_tables_for_base → pick table → list_records_for_table.
+
+MULTI-ACCOUNT TOOL NAMING: When the user has multiple accounts connected for
+the same service, tool names are prefixed to avoid collisions — e.g.
+linear_25_list_issues and linear_30_list_issues instead of two list_issues.
+Each prefixed tool's description starts with [Account: <display_name>] so you
+know which account it targets. Use get_connected_accounts("<service>") to see
+the full list of accounts with their connector IDs and display names.
+When only one account is connected, tools have their normal unprefixed names.
+</parameter_resolution>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/tool_routing_private.md b/surfsense_backend/app/agents/new_chat/prompts/base/tool_routing_private.md
new file mode 100644
index 0000000000..b8bb069e25
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/tool_routing_private.md
@@ -0,0 +1,23 @@
+<tool_routing>
+CRITICAL — You have direct tools for these services: Linear, ClickUp, Jira, Slack, Airtable.
+Their data is NEVER in the knowledge base. You MUST call their tools immediately — never
+say "I don't see it in the knowledge base" or ask the user if they want you to check.
+Ignore any knowledge base results for these services.
+
+When to use which tool:
+- Linear (issues, teams, users, projects when MCP exposes them) → hosted Linear MCP read tools (e.g. `list_issues`, `get_issue`, `list_teams`, `list_users`, …) and `save_issue` for create/update; native SurfSense Linear issue tools when present. For **multi-step Linear-only** work (several reads, structured evidence), delegate with the `task` tool to subagent **`linear_specialist`** instead of mixing unrelated tools.
+- ClickUp (tasks) → clickup_search, clickup_get_task
+- Jira (issues) → getAccessibleAtlassianResources (cloudId discovery), getVisibleJiraProjects (project discovery), getJiraProjectIssueTypesMetadata (issue type discovery), searchJiraIssuesUsingJql, createJiraIssue, editJiraIssue
+- Slack (messages, channels) → `slack_search_channels`, `slack_read_channel`, `slack_read_thread`, and other `slack_*` tools when connected. For **multi-step Slack-only** work, delegate with `task` to **`slack_specialist`**.
+- Airtable (bases, tables, records) → list_bases, list_tables_for_base, list_records_for_table
+- Knowledge base content (Notion, GitHub, files, notes) → automatically searched
+- Real-time public web data → call web_search
+- Reading a specific webpage → call scrape_webpage
+
+**`task` subagents (when to delegate):**
+- **`linear_specialist`** — Linear-only investigations and tool use.
+- **`slack_specialist`** — Slack-only investigations and tool use.
+- **`connector_negotiator`** — **Cross-connector** chains (e.g. data from Slack then action in Linear).
+- **`explore`** — Read-only KB + web research with citations.
+- **`report_writer`** — Single `generate_report` deliverable.
+</tool_routing>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/base/tool_routing_team.md b/surfsense_backend/app/agents/new_chat/prompts/base/tool_routing_team.md
new file mode 100644
index 0000000000..b081a21234
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/base/tool_routing_team.md
@@ -0,0 +1,23 @@
+<tool_routing>
+CRITICAL — You have direct tools for these services: Linear, ClickUp, Jira, Slack, Airtable.
+Their data is NEVER in the knowledge base. You MUST call their tools immediately — never
+say "I don't see it in the knowledge base" or ask if they want you to check.
+Ignore any knowledge base results for these services.
+
+When to use which tool:
+- Linear (issues, teams, users, projects when MCP exposes them) → hosted Linear MCP read tools (e.g. `list_issues`, `get_issue`, `list_teams`, `list_users`, …) and `save_issue` for create/update; native SurfSense Linear issue tools when present. For **multi-step Linear-only** work (several reads, structured evidence), delegate with the `task` tool to subagent **`linear_specialist`** instead of mixing unrelated tools.
+- ClickUp (tasks) → clickup_search, clickup_get_task
+- Jira (issues) → getAccessibleAtlassianResources (cloudId discovery), getVisibleJiraProjects (project discovery), getJiraProjectIssueTypesMetadata (issue type discovery), searchJiraIssuesUsingJql, createJiraIssue, editJiraIssue
+- Slack (messages, channels) → `slack_search_channels`, `slack_read_channel`, `slack_read_thread`, and other `slack_*` tools when connected. For **multi-step Slack-only** work, delegate with `task` to **`slack_specialist`**.
+- Airtable (bases, tables, records) → list_bases, list_tables_for_base, list_records_for_table
+- Knowledge base content (Notion, GitHub, files, notes) → automatically searched
+- Real-time public web data → call web_search
+- Reading a specific webpage → call scrape_webpage
+
+**`task` subagents (when to delegate):**
+- **`linear_specialist`** — Linear-only investigations and tool use.
+- **`slack_specialist`** — Slack-only investigations and tool use.
+- **`connector_negotiator`** — **Cross-connector** chains (e.g. data from Slack then action in Linear).
+- **`explore`** — Read-only KB + web research with citations.
+- **`report_writer`** — Single `generate_report` deliverable.
+</tool_routing>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/composer.py b/surfsense_backend/app/agents/new_chat/prompts/composer.py
new file mode 100644
index 0000000000..42f8303e69
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/composer.py
@@ -0,0 +1,405 @@
+"""
+Prompt composer for the SurfSense ``new_chat`` agent.
+
+This module assembles the agent's system prompt from the markdown fragments
+under :mod:`app.agents.new_chat.prompts`. It replaces the monolithic
+``system_prompt.py`` with a clean, fragment-based composition:
+
+::
+
+    prompts/
+      base/                  # agent identity, KB policy, tool routing, …
+      providers/             # provider-specific tweaks (anthropic, gpt5, …)
+      tools/                 # one ``<name>.md`` per tool
+      examples/              # one ``<name>.md`` per tool with call examples
+      routing/               # connector-specific routing notes (linear, slack, …)
+
+The model-family dispatch step (see :func:`detect_provider_variant`)
+mirrors OpenCode's ``packages/opencode/src/session/system.ts`` — different
+model families respond best to differently-styled prompts (Claude likes
+XML/narrative, GPT-5 wants channel-aware pragmatic, Codex needs
+terse/file:line, Gemini wants formal numbered steps, etc.). LangChain's
+``dynamic_prompt`` helper supports per-call prompt swaps but ships no
+out-of-the-box family classifier, so we keep our own.
+
+Backwards compatibility
+=======================
+
+``system_prompt.py`` re-exports :func:`compose_system_prompt` and wraps it
+in functions with the same signatures as the legacy
+``build_surfsense_system_prompt`` / ``build_configurable_system_prompt`` so
+existing call sites do not change.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable
+from datetime import UTC, datetime
+from importlib import resources
+
+from app.db import ChatVisibility
+
+# -----------------------------------------------------------------------------
+# Provider variant detection
+# -----------------------------------------------------------------------------
+
+# String literal alias for the supported provider-specific prompt variants.
+# When adding a new variant, also drop a matching ``providers/<variant>.md``
+# file in this package and (if appropriate) extend the regex matchers below.
+#
+# Stylistic clusters: each variant is a focused style nudge, NOT a full
+# system prompt — the main prompt is already assembled from base/ +
+# tools/ + routing/. The clustering itself (which models map to which
+# style) follows OpenCode's ``system.ts`` family table; see the module
+# docstring for credits.
+ProviderVariant = str
+# Known values:
+#   "anthropic"        — Claude family (XML-friendly, narrative todos)
+#   "openai_reasoning" — GPT-5 / o-series (channel-aware pragmatic)
+#   "openai_classic"   — GPT-4 family (autonomous persistence)
+#   "openai_codex"     — gpt-*-codex (code-purist, terse, file:line refs)
+#   "google"           — Gemini (formal, <3-line, numbered workflow)
+#   "kimi"             — Moonshot Kimi-K* (action-bias, parallel tools)
+#   "grok"             — xAI Grok (extreme-terse, one-word ok)
+#   "deepseek"         — DeepSeek V3 / R1 (terse, R1-aware reasoning)
+#   "default"          — fallback, no provider-specific block emitted
+
+# IMPORTANT: order of evaluation matters in :func:`detect_provider_variant`.
+# More specific patterns must come first (e.g. ``codex`` before
+# ``openai_reasoning`` because codex model ids contain ``gpt``).
+
+_OPENAI_CODEX_RE = re.compile(
+    r"\b(gpt-codex|codex-mini|gpt-[\d.]+-codex)\b", re.IGNORECASE
+)
+_OPENAI_REASONING_RE = re.compile(r"\b(gpt-5|o\d|o-)", re.IGNORECASE)
+_OPENAI_CLASSIC_RE = re.compile(r"\bgpt-4", re.IGNORECASE)
+_ANTHROPIC_RE = re.compile(r"\bclaude\b", re.IGNORECASE)
+_GOOGLE_RE = re.compile(r"\bgemini\b", re.IGNORECASE)
+_KIMI_RE = re.compile(r"\b(kimi[-\d.]*|moonshot)\b", re.IGNORECASE)
+_GROK_RE = re.compile(r"\bgrok\b", re.IGNORECASE)
+_DEEPSEEK_RE = re.compile(r"\bdeepseek\b", re.IGNORECASE)
+
+
+def detect_provider_variant(model_name: str | None) -> ProviderVariant:
+    """Pick a provider-specific prompt variant from a model id string.
+
+    Heuristic match on the model id; returns ``"default"`` when nothing
+    matches so the composer can fall back to the empty placeholder file.
+
+    Order is significant: more-specific patterns are tried first so
+    ``gpt-5-codex`` routes to ``"openai_codex"`` rather than
+    ``"openai_reasoning"`` — same dispatch order as OpenCode's
+    ``packages/opencode/src/session/system.ts``.
+    """
+    if not model_name:
+        return "default"
+    name = model_name.strip()
+    if _OPENAI_CODEX_RE.search(name):
+        return "openai_codex"
+    if _OPENAI_REASONING_RE.search(name):
+        return "openai_reasoning"
+    if _OPENAI_CLASSIC_RE.search(name):
+        return "openai_classic"
+    if _ANTHROPIC_RE.search(name):
+        return "anthropic"
+    if _GOOGLE_RE.search(name):
+        return "google"
+    if _KIMI_RE.search(name):
+        return "kimi"
+    if _GROK_RE.search(name):
+        return "grok"
+    if _DEEPSEEK_RE.search(name):
+        return "deepseek"
+    return "default"
+
+
+# -----------------------------------------------------------------------------
+# Fragment loading
+# -----------------------------------------------------------------------------
+
+
+_PROMPTS_PACKAGE = "app.agents.new_chat.prompts"
+
+
+def _read_fragment(subpath: str) -> str:
+    """Read a fragment file from the ``prompts/`` resource tree.
+
+    Returns the raw contents stripped of any single trailing newline so
+    composition can append explicit separators without compounding blank
+    lines. Missing files return an empty string so optional fragments
+    (e.g. provider hints) act as no-ops.
+    """
+    parts = subpath.split("/")
+    try:
+        ref = resources.files(_PROMPTS_PACKAGE).joinpath(*parts)
+        if not ref.is_file():
+            return ""
+        text = ref.read_text(encoding="utf-8")
+    except (FileNotFoundError, ModuleNotFoundError):
+        return ""
+    if text.endswith("\n"):
+        text = text[:-1]
+    return text
+
+
+# -----------------------------------------------------------------------------
+# Tool ordering + memory variant resolution
+# -----------------------------------------------------------------------------
+
+
+# Ordered for reading flow: fundamentals first, then artifact generators,
+# then memory at the end (mirrors the legacy ``_ALL_TOOL_NAMES_ORDERED``).
+ALL_TOOL_NAMES_ORDERED: tuple[str, ...] = (
+    "search_surfsense_docs",
+    "web_search",
+    "generate_podcast",
+    "generate_video_presentation",
+    "generate_report",
+    "generate_resume",
+    "generate_image",
+    "scrape_webpage",
+    "update_memory",
+)
+
+
+_MEMORY_VARIANT_TOOLS: frozenset[str] = frozenset({"update_memory"})
+
+
+def _tool_fragment_path(tool_name: str, variant: str) -> str:
+    """Resolve a tool's instruction fragment path.
+
+    Tools listed in :data:`_MEMORY_VARIANT_TOOLS` switch on the conversation
+    visibility and load ``tools/<name>_<variant>.md``; everything else
+    falls back to ``tools/<name>.md``.
+    """
+    if tool_name in _MEMORY_VARIANT_TOOLS:
+        return f"tools/{tool_name}_{variant}.md"
+    return f"tools/{tool_name}.md"
+
+
+def _example_fragment_path(tool_name: str, variant: str) -> str:
+    if tool_name in _MEMORY_VARIANT_TOOLS:
+        return f"examples/{tool_name}_{variant}.md"
+    return f"examples/{tool_name}.md"
+
+
+def _format_tool_label(tool_name: str) -> str:
+    return tool_name.replace("_", " ").title()
+
+
+# -----------------------------------------------------------------------------
+# Section builders
+# -----------------------------------------------------------------------------
+
+
+def _build_system_instructions(
+    *,
+    visibility: ChatVisibility,
+    resolved_today: str,
+) -> str:
+    """Reconstruct the legacy ``<system_instruction>`` block from fragments."""
+    variant = "team" if visibility == ChatVisibility.SEARCH_SPACE else "private"
+
+    sections = [
+        _read_fragment(f"base/agent_{variant}.md"),
+        _read_fragment(f"base/kb_only_policy_{variant}.md"),
+        _read_fragment(f"base/tool_routing_{variant}.md"),
+        _read_fragment("base/parameter_resolution.md"),
+        _read_fragment(f"base/memory_protocol_{variant}.md"),
+    ]
+    body = "\n\n".join(s for s in sections if s)
+    block = f"\n<system_instruction>\n{body}\n\n</system_instruction>\n"
+    return block.format(resolved_today=resolved_today)
+
+
+def _build_mcp_routing_block(
+    mcp_connector_tools: dict[str, list[str]] | None,
+) -> str:
+    """Emit the ``<mcp_tool_routing>`` block when at least one MCP server is wired."""
+    if not mcp_connector_tools:
+        return ""
+    lines: list[str] = [
+        "\n<mcp_tool_routing>",
+        "You also have direct tools from these user-connected MCP servers.",
+        "Their data is NEVER in the knowledge base — call their tools directly.",
+        "",
+    ]
+    for server_name, tool_names in mcp_connector_tools.items():
+        lines.append(f"- {server_name} → {', '.join(tool_names)}")
+    lines.append("</mcp_tool_routing>\n")
+    return "\n".join(lines)
+
+
+def _build_tools_section(
+    *,
+    visibility: ChatVisibility,
+    enabled_tool_names: set[str] | None,
+    disabled_tool_names: set[str] | None,
+) -> str:
+    """Reconstruct the ``<tools>`` block + ``<tool_call_examples>`` block."""
+    variant = "team" if visibility == ChatVisibility.SEARCH_SPACE else "private"
+
+    parts: list[str] = []
+    preamble = _read_fragment("tools/_preamble.md")
+    if preamble:
+        parts.append(preamble + "\n")
+
+    examples: list[str] = []
+
+    for tool_name in ALL_TOOL_NAMES_ORDERED:
+        if enabled_tool_names is not None and tool_name not in enabled_tool_names:
+            continue
+
+        instruction = _read_fragment(_tool_fragment_path(tool_name, variant))
+        if instruction:
+            parts.append(instruction + "\n")
+
+        example = _read_fragment(_example_fragment_path(tool_name, variant))
+        if example:
+            examples.append(example + "\n")
+
+    known_disabled = (
+        set(disabled_tool_names) & set(ALL_TOOL_NAMES_ORDERED)
+        if disabled_tool_names
+        else set()
+    )
+    if known_disabled:
+        disabled_list = ", ".join(
+            _format_tool_label(n) for n in ALL_TOOL_NAMES_ORDERED if n in known_disabled
+        )
+        parts.append(
+            "\n"
+            "DISABLED TOOLS (by user):\n"
+            f"The following tools are available in SurfSense but have been disabled by the user for this session: {disabled_list}.\n"
+            "You do NOT have access to these tools and MUST NOT claim you can use them.\n"
+            "If the user asks about a capability provided by a disabled tool, let them know the relevant tool\n"
+            "is currently disabled and they can re-enable it.\n"
+        )
+
+    parts.append("\n</tools>\n")
+
+    if examples:
+        parts.append("<tool_call_examples>")
+        parts.extend(examples)
+        parts.append("</tool_call_examples>\n")
+
+    return "".join(parts)
+
+
+def _build_provider_block(provider_variant: ProviderVariant) -> str:
+    """Optional provider-tuned hints. Empty for ``"default"``."""
+    if not provider_variant or provider_variant == "default":
+        return ""
+    text = _read_fragment(f"providers/{provider_variant}.md")
+    return f"\n{text}\n" if text else ""
+
+
+def _build_routing_block(connector_routing: Iterable[str] | None) -> str:
+    if not connector_routing:
+        return ""
+    fragments: list[str] = []
+    for name in connector_routing:
+        text = _read_fragment(f"routing/{name}.md")
+        if text:
+            fragments.append(text)
+    if not fragments:
+        return ""
+    return "\n" + "\n\n".join(fragments) + "\n"
+
+
+def _build_citation_block(citations_enabled: bool) -> str:
+    fragment = (
+        _read_fragment("base/citations_on.md")
+        if citations_enabled
+        else _read_fragment("base/citations_off.md")
+    )
+    return f"\n{fragment}\n" if fragment else ""
+
+
+# -----------------------------------------------------------------------------
+# Public API
+# -----------------------------------------------------------------------------
+
+
+def compose_system_prompt(
+    *,
+    today: datetime | None = None,
+    thread_visibility: ChatVisibility | None = None,
+    enabled_tool_names: set[str] | None = None,
+    disabled_tool_names: set[str] | None = None,
+    mcp_connector_tools: dict[str, list[str]] | None = None,
+    custom_system_instructions: str | None = None,
+    use_default_system_instructions: bool = True,
+    citations_enabled: bool = True,
+    provider_variant: ProviderVariant | None = None,
+    model_name: str | None = None,
+    connector_routing: Iterable[str] | None = None,
+) -> str:
+    """Assemble the SurfSense system prompt from disk fragments.
+
+    Args:
+        today: Optional clock injection for tests.
+        thread_visibility: Private vs shared (team) — drives memory wording
+            and a few base block variants.
+        enabled_tool_names: When provided, only these tools' instructions
+            are included; ``None`` keeps the legacy "include everything"
+            behavior.
+        disabled_tool_names: User-disabled tools (note appended to prompt).
+        mcp_connector_tools: ``{server_name: [tool_names...]}`` to inject
+            an explicit MCP routing block.
+        custom_system_instructions: Free-form instructions that override
+            the default ``<system_instruction>`` block (legacy support
+            for ``NewLLMConfig.system_instructions``).
+        use_default_system_instructions: When ``custom_system_instructions``
+            is empty/None, fall back to defaults (legacy semantics).
+        citations_enabled: Include ``citations_on.md`` (true) or
+            ``citations_off.md`` (false).
+        provider_variant: Explicit provider variant override
+            (``"anthropic" | "openai_reasoning" | "openai_classic" | "google" | "default"``).
+            When ``None``, falls back to :func:`detect_provider_variant`
+            on ``model_name``.
+        model_name: Used to auto-detect ``provider_variant`` when not
+            provided explicitly.
+        connector_routing: Optional list of routing fragment names
+            (``["linear", "slack", ...]``) to include from
+            ``prompts/routing/``.
+
+    Returns:
+        The fully composed system prompt string.
+    """
+    resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
+    visibility = thread_visibility or ChatVisibility.PRIVATE
+
+    if custom_system_instructions and custom_system_instructions.strip():
+        sys_block = custom_system_instructions.format(resolved_today=resolved_today)
+    elif use_default_system_instructions:
+        sys_block = _build_system_instructions(
+            visibility=visibility, resolved_today=resolved_today
+        )
+    else:
+        sys_block = ""
+
+    sys_block += _build_mcp_routing_block(mcp_connector_tools)
+
+    if provider_variant is None:
+        provider_variant = detect_provider_variant(model_name)
+    sys_block += _build_provider_block(provider_variant)
+    sys_block += _build_routing_block(connector_routing)
+
+    tools_block = _build_tools_section(
+        visibility=visibility,
+        enabled_tool_names=enabled_tool_names,
+        disabled_tool_names=disabled_tool_names,
+    )
+    citation_block = _build_citation_block(citations_enabled)
+
+    return sys_block + tools_block + citation_block
+
+
+__all__ = [
+    "ALL_TOOL_NAMES_ORDERED",
+    "ProviderVariant",
+    "compose_system_prompt",
+    "detect_provider_variant",
+]
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/__init__.py b/surfsense_backend/app/agents/new_chat/prompts/examples/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/generate_image.md b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_image.md
new file mode 100644
index 0000000000..216c2926a7
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_image.md
@@ -0,0 +1,12 @@
+
+- User: "Generate an image of a cat"
+  - Call: `generate_image(prompt="A fluffy orange tabby cat sitting on a windowsill, bathed in warm golden sunlight, soft bokeh background with green houseplants, photorealistic style, cozy atmosphere")`
+  - The generated image will automatically be displayed in the chat.
+- User: "Draw me a logo for a coffee shop called Bean Dream"
+  - Call: `generate_image(prompt="Minimalist modern logo design for a coffee shop called 'Bean Dream', featuring a stylized coffee bean with dream-like swirls of steam, clean vector style, warm brown and cream color palette, white background, professional branding")`
+  - The generated image will automatically be displayed in the chat.
+- User: "Show me this image: https://example.com/image.png"
+  - Simply include it in your response using markdown: `![Image](https://example.com/image.png)`
+- User uploads an image file and asks: "What is this image about?"
+  - The user's uploaded image is already visible in the chat.
+  - Simply analyze the image content and respond directly.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/generate_podcast.md b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_podcast.md
new file mode 100644
index 0000000000..aabf8ce7a6
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_podcast.md
@@ -0,0 +1,7 @@
+
+- User: "Give me a podcast about AI trends based on what we discussed"
+  - First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`
+- User: "Create a podcast summary of this conversation"
+  - Call: `generate_podcast(source_content="Complete conversation summary:\n\nUser asked about [topic 1]:\n[Your detailed response]\n\nUser then asked about [topic 2]:\n[Your detailed response]\n\n[Continue for all exchanges in the conversation]", podcast_title="Conversation Summary")`
+- User: "Make a podcast about quantum computing"
+  - First explore `/documents/` (ls/glob/grep/read_file), then: `generate_podcast(source_content="Key insights about quantum computing from retrieved files:\n\n[Comprehensive summary of findings]", podcast_title="Quantum Computing Explained")`
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/generate_report.md b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_report.md
new file mode 100644
index 0000000000..7e9d0a5957
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_report.md
@@ -0,0 +1,13 @@
+
+- User: "Generate a report about AI trends"
+  - Call: `generate_report(topic="AI Trends Report", source_strategy="kb_search", search_queries=["AI trends recent developments", "artificial intelligence industry trends", "AI market growth and predictions"], report_style="detailed")`
+  - WHY: Has creation verb "generate" → call the tool. No prior discussion → use kb_search.
+- User: "Write a research report from this conversation"
+  - Call: `generate_report(topic="Research Report", source_strategy="conversation", source_content="Complete conversation summary:\n\n...", report_style="deep_research")`
+  - WHY: Has creation verb "write" → call the tool. Conversation has the content → use source_strategy="conversation".
+- User: (after a report on Climate Change was generated) "Add a section about carbon capture technologies"
+  - Call: `generate_report(topic="Climate Crisis: Causes, Impacts, and Solutions", source_strategy="conversation", source_content="[summary of conversation context if any]", parent_report_id=<previous_report_id>, user_instructions="Add a new section about carbon capture technologies")`
+  - WHY: Has modification verb "add" + specific deliverable target → call the tool with parent_report_id.
+- User: (after a report was generated) "What else could we add to have more depth?"
+  - Do NOT call generate_report. Answer in chat with suggestions.
+  - WHY: No creation/modification verb directed at producing a deliverable.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/generate_resume.md b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_resume.md
new file mode 100644
index 0000000000..d8a6c381ee
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_resume.md
@@ -0,0 +1,19 @@
+
+- User: "Build me a resume. I'm John Doe, engineer at Acme Corp..."
+  - Call: `generate_resume(user_info="John Doe, engineer at Acme Corp...", max_pages=1)`
+  - WHY: Has creation verb "build" + resume → call the tool.
+- User: "Create my CV with this info: [experience, education, skills]"
+  - Call: `generate_resume(user_info="[experience, education, skills]", max_pages=1)`
+- User: "Build me a resume" (and there is a resume/CV document in the conversation context)
+  - Extract the FULL content from the document in context, then call:
+    `generate_resume(user_info="Name: John Doe\nEmail: john@example.com\n\nExperience:\n- Senior Engineer at Acme Corp (2020-2024)\n  Led team of 5...\n\nEducation:\n- BS Computer Science, MIT (2016-2020)\n\nSkills: Python, TypeScript, AWS...", max_pages=1)`
+  - WHY: Document content is available in context — extract ALL of it into user_info. Do NOT ignore referenced documents.
+- User: (after resume generated) "Change my title to Senior Engineer"
+  - Call: `generate_resume(user_info="", user_instructions="Change the job title to Senior Engineer", parent_report_id=<previous_report_id>, max_pages=1)`
+  - WHY: Modification verb "change" + refers to existing resume → set parent_report_id.
+- User: (after resume generated) "Make this 2 pages and expand projects"
+  - Call: `generate_resume(user_info="", user_instructions="Expand projects and keep this to at most 2 pages", parent_report_id=<previous_report_id>, max_pages=2)`
+  - WHY: Explicit page increase request → set max_pages to 2.
+- User: "How should I structure my resume?"
+  - Do NOT call generate_resume. Answer in chat with advice.
+  - WHY: No creation/modification verb.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/generate_video_presentation.md b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_video_presentation.md
new file mode 100644
index 0000000000..257ec86cf1
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/generate_video_presentation.md
@@ -0,0 +1,7 @@
+
+- User: "Give me a presentation about AI trends based on what we discussed"
+  - First search for relevant content, then call: `generate_video_presentation(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", video_title="AI Trends Presentation")`
+- User: "Create slides summarizing this conversation"
+  - Call: `generate_video_presentation(source_content="Complete conversation summary:\n\nUser asked about [topic 1]:\n[Your detailed response]\n\nUser then asked about [topic 2]:\n[Your detailed response]\n\n[Continue for all exchanges in the conversation]", video_title="Conversation Summary")`
+- User: "Make a video presentation about quantum computing"
+  - First explore `/documents/` (ls/glob/grep/read_file), then: `generate_video_presentation(source_content="Key insights about quantum computing from retrieved files:\n\n[Comprehensive summary of findings]", video_title="Quantum Computing Explained")`
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/scrape_webpage.md b/surfsense_backend/app/agents/new_chat/prompts/examples/scrape_webpage.md
new file mode 100644
index 0000000000..0f156bf24b
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/scrape_webpage.md
@@ -0,0 +1,13 @@
+
+- User: "Check out https://dev.to/some-article"
+  - Call: `scrape_webpage(url="https://dev.to/some-article")`
+  - Respond with a structured analysis — key points, takeaways.
+- User: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
+  - Call: `scrape_webpage(url="https://example.com/blog/ai-trends")`
+  - Respond with a thorough summary using headings and bullet points.
+- User: (after discussing https://example.com/stats) "Can you get the live data from that page?"
+  - Call: `scrape_webpage(url="https://example.com/stats")`
+  - IMPORTANT: Always attempt scraping first. Never refuse before trying the tool.
+- User: "https://example.com/blog/weekend-recipes"
+  - Call: `scrape_webpage(url="https://example.com/blog/weekend-recipes")`
+  - When a user sends just a URL with no instructions, scrape it and provide a concise summary of the content.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/search_surfsense_docs.md b/surfsense_backend/app/agents/new_chat/prompts/examples/search_surfsense_docs.md
new file mode 100644
index 0000000000..b90f2b7a7e
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/search_surfsense_docs.md
@@ -0,0 +1,9 @@
+
+- User: "How do I install SurfSense?"
+  - Call: `search_surfsense_docs(query="installation setup")`
+- User: "What connectors does SurfSense support?"
+  - Call: `search_surfsense_docs(query="available connectors integrations")`
+- User: "How do I set up the Notion connector?"
+  - Call: `search_surfsense_docs(query="Notion connector setup configuration")`
+- User: "How do I use Docker to run SurfSense?"
+  - Call: `search_surfsense_docs(query="Docker installation setup")`
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/update_memory_private.md b/surfsense_backend/app/agents/new_chat/prompts/examples/update_memory_private.md
new file mode 100644
index 0000000000..f83fe40b43
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/update_memory_private.md
@@ -0,0 +1,16 @@
+
+- <user_name>Alex</user_name>, <user_memory> is empty. User: "I'm a space enthusiast, explain astrophage to me"
+  - The user casually shared a durable fact. Use their first name in the entry, short neutral heading:
+    update_memory(updated_memory="## Interests & background\n- (2025-03-15) [fact] Alex is a space enthusiast\n")
+- User: "Remember that I prefer concise answers over detailed explanations"
+  - Durable preference. Merge with existing memory, add a new heading:
+    update_memory(updated_memory="## Interests & background\n- (2025-03-15) [fact] Alex is a space enthusiast\n\n## Response style\n- (2025-03-15) [pref] Alex prefers concise answers over detailed explanations\n")
+- User: "I actually moved to Tokyo last month"
+  - Updated fact, date prefix reflects when recorded:
+    update_memory(updated_memory="## Interests & background\n...\n\n## Personal context\n- (2025-03-15) [fact] Alex lives in Tokyo (previously London)\n...")
+- User: "I'm a freelance photographer working on a nature documentary"
+  - Durable background info under a fitting heading:
+    update_memory(updated_memory="...\n\n## Current focus\n- (2025-03-15) [fact] Alex is a freelance photographer\n- (2025-03-15) [fact] Alex is working on a nature documentary\n")
+- User: "Always respond in bullet points"
+  - Standing instruction:
+    update_memory(updated_memory="...\n\n## Response style\n- (2025-03-15) [instr] Always respond to Alex in bullet points\n")
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/update_memory_team.md b/surfsense_backend/app/agents/new_chat/prompts/examples/update_memory_team.md
new file mode 100644
index 0000000000..1c74fdf6e6
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/update_memory_team.md
@@ -0,0 +1,7 @@
+
+- User: "Let's remember that we decided to do weekly standup meetings on Mondays"
+  - Durable team decision:
+    update_memory(updated_memory="- (2025-03-15) [fact] Weekly standup meetings on Mondays\n...")
+- User: "Our office is in downtown Seattle, 5th floor"
+  - Durable team fact:
+    update_memory(updated_memory="- (2025-03-15) [fact] Office location: downtown Seattle, 5th floor\n...")
diff --git a/surfsense_backend/app/agents/new_chat/prompts/examples/web_search.md b/surfsense_backend/app/agents/new_chat/prompts/examples/web_search.md
new file mode 100644
index 0000000000..6b9828ac79
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/examples/web_search.md
@@ -0,0 +1,8 @@
+
+- User: "What's the current USD to INR exchange rate?"
+  - Call: `web_search(query="current USD to INR exchange rate")`
+  - Then answer using the returned web results with citations.
+- User: "What's the latest news about AI?"
+  - Call: `web_search(query="latest AI news today")`
+- User: "What's the weather in New York?"
+  - Call: `web_search(query="weather New York today")`
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/__init__.py b/surfsense_backend/app/agents/new_chat/prompts/providers/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/anthropic.md b/surfsense_backend/app/agents/new_chat/prompts/providers/anthropic.md
new file mode 100644
index 0000000000..f574da5410
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/anthropic.md
@@ -0,0 +1,20 @@
+<provider_hints>
+You are running on an Anthropic Claude model.
+
+Structured reasoning:
+- Use XML tags liberally to organise intermediate reasoning when a task is non-trivial. `<thinking>...</thinking>` blocks are encouraged before tool calls or before producing a complex final answer.
+- For multi-step requests, briefly outline a plan inside a `<plan>` block before issuing the first tool call.
+
+Professional objectivity:
+- Prioritise technical accuracy over validating the user's beliefs. Provide direct, factual guidance without unnecessary superlatives, praise, or emotional validation.
+- When uncertain, investigate (search the KB, fetch the page) rather than confirming the user's assumption.
+- Disagree with the user when the evidence warrants it; respectful correction beats false agreement.
+
+Task management:
+- For tasks with 3+ distinct steps use the todo / planning tool aggressively. Mark items in_progress before starting, completed immediately when finished — do not batch completions.
+- Narrate progress through the todo list itself, not through chatty status lines.
+
+Tool calls:
+- Run independent tool calls in parallel within one response. Sequence them only when a later call genuinely needs an earlier one's output.
+- Never chain bash-like commands with `;` or `&&` to "narrate" — use prose between tool calls instead.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/deepseek.md b/surfsense_backend/app/agents/new_chat/prompts/providers/deepseek.md
new file mode 100644
index 0000000000..8acf008ca5
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/deepseek.md
@@ -0,0 +1,18 @@
+<provider_hints>
+You are running on a DeepSeek model (DeepSeek-V3 chat / DeepSeek-R1 reasoning).
+
+Reasoning hygiene (R1-aware):
+- If the model surfaces explicit `<think>` blocks, keep that internal scratch focused — do NOT restate the user's question inside it; jump straight to the analysis.
+- Never paste the contents of `<think>` into your final answer. Final answer should reflect only the conclusion, citations, and any user-facing rationale.
+- Do not let chain-of-thought leak into tool-call arguments — keep tool inputs minimal and structural.
+
+Output style:
+- Be concise. Default to a one-paragraph answer; expand only when the user asks for detail.
+- Don't open with sycophantic phrasing ("Great question", "Sure, here you go"). Lead with the answer or the next action.
+- For factual answers, cite once with `[citation:chunk_id]` and stop.
+
+Tool calls:
+- Issue independent tool calls in parallel within a single turn.
+- Prefer the knowledge-base search tools before any web-search; this model has strong recall but stale training data.
+- Don't fabricate file paths, chunk ids, or URLs — only use values returned by tools or provided by the user.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/default.md b/surfsense_backend/app/agents/new_chat/prompts/providers/default.md
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/default.md
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/google.md b/surfsense_backend/app/agents/new_chat/prompts/providers/google.md
new file mode 100644
index 0000000000..cac3b328b8
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/google.md
@@ -0,0 +1,20 @@
+<provider_hints>
+You are running on a Google Gemini model.
+
+Output style:
+- Concise & direct. Aim for fewer than 3 lines of prose (excluding tool output, citations, and code/snippets) when the task allows.
+- No conversational filler — skip openers like "Okay, I will now…" and closers like "I have finished the changes…". Get straight to the action or answer.
+- Format with GitHub-flavoured Markdown; assume monospace rendering.
+- For one-line factual answers, just answer. No headers, no bullets.
+
+Workflow for non-trivial tasks (Understand → Plan → Act → Verify):
+1. **Understand:** read the user's request and the relevant KB / connector context. Use search and read tools (in parallel when independent) before assuming anything.
+2. **Plan:** when the task touches multiple steps, share an extremely concise plan first.
+3. **Act:** call the appropriate tools, strictly adhering to the prompts/routing already established for this agent.
+4. **Verify:** confirm with a follow-up read or search where it materially de-risks the answer.
+
+Discipline:
+- Do not take significant actions beyond the clear scope of the user's request without confirming first.
+- Do not assume a connector / tool / file exists — check (e.g. via `get_connected_accounts`) before referencing it.
+- Path arguments must be the exact strings returned by tools; do not synthesise file paths.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/grok.md b/surfsense_backend/app/agents/new_chat/prompts/providers/grok.md
new file mode 100644
index 0000000000..95b8fcc14a
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/grok.md
@@ -0,0 +1,17 @@
+<provider_hints>
+You are running on an xAI Grok model.
+
+Maximum terseness:
+- Answer in fewer than 4 lines unless the user asks for detail. One-word answers are best when they suffice.
+- No preamble ("The answer is", "Here's what I'll do"), no postamble ("Hope that helps", "Let me know"). Get straight to the answer.
+- Avoid restating the user's question.
+- For factual lookups inside the knowledge base, give the answer with a single `[citation:chunk_id]` and stop.
+
+Tool discipline:
+- Use exactly ONE tool per assistant turn when investigating; wait for the result before deciding the next call. Do not loop on the same tool with the same arguments — pick a result and act.
+- For obviously parallelizable read-only batches (multiple independent searches), one turn with several tool calls is fine — but never chain into a fishing expedition.
+
+Style:
+- No emojis unless the user asked. No nested bullets, no headers for short answers.
+- If you can't help, say so in 1-2 sentences without explaining "why this could lead to…".
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/kimi.md b/surfsense_backend/app/agents/new_chat/prompts/providers/kimi.md
new file mode 100644
index 0000000000..c3c11ad5e5
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/kimi.md
@@ -0,0 +1,21 @@
+<provider_hints>
+You are running on a Moonshot Kimi model (Kimi-K1.5 / Kimi-K2 / Kimi-K2.5+).
+
+Action bias:
+- Default to taking action with tools rather than describing solutions in prose. If a tool can answer the question, call the tool.
+- Don't narrate routine reads, searches, or obvious next steps. Combine related progress into one short status line.
+- Be thorough in actions (test what you build, verify what you change). Be brief in explanations.
+
+Tool calls:
+- Output multiple non-interfering tool calls in a SINGLE response — parallelism is a major efficiency win on this model.
+- When the `task` tool is available, delegate focused subtasks to a subagent with full context (subagents don't inherit yours).
+- Don't apologise or pre-announce tool calls. The tool call itself is self-explanatory.
+
+Language:
+- Respond in the SAME language as the user's most recent turn unless explicitly instructed otherwise.
+
+Discipline:
+- Stay on track. Never give the user more than what they asked for.
+- Fact-check before stating anything as factual; don't fabricate citations.
+- Keep it stupidly simple. Don't overcomplicate.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/openai_classic.md b/surfsense_backend/app/agents/new_chat/prompts/providers/openai_classic.md
new file mode 100644
index 0000000000..9128609e06
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/openai_classic.md
@@ -0,0 +1,21 @@
+<provider_hints>
+You are running on a classic OpenAI chat model (GPT-4 family).
+
+Persistence:
+- Keep going until the user's query is completely resolved before yielding back. Don't end the turn at "I would do X" — actually do X.
+- When you say "Next I will…" or "Now I will…", you MUST actually take that action in the same turn.
+- If a tool call fails, diagnose and try again with corrected arguments; do not surface the raw error and stop.
+
+Planning:
+- Plan extensively before each tool call and reflect briefly on the result of the previous call. For tasks with 3+ steps, use the todo / planning tool and mark items as `in_progress` / `completed` as you go.
+- Always announce the next action in ONE concise sentence before making a non-trivial tool call ("I'll search the KB for the migration spec.").
+
+Output style:
+- Conversational but professional. Plain prose for explanations, bullet points for findings, fenced code blocks (with language tags) for code.
+- Don't dump tool output verbatim — summarise the relevant lines.
+- Don't add a closing recap unless the user asked for one. After completing the work, just stop.
+
+Tool calls:
+- Issue independent tool calls in parallel within one response.
+- Use specialised tools over generic ones (e.g. KB search before web search; named connectors over MCP fallback).
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/openai_codex.md b/surfsense_backend/app/agents/new_chat/prompts/providers/openai_codex.md
new file mode 100644
index 0000000000..6167d4b069
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/openai_codex.md
@@ -0,0 +1,19 @@
+<provider_hints>
+You are running on an OpenAI Codex-class model (gpt-codex / codex-mini / gpt-*-codex).
+
+Output style:
+- Be concise. Don't dump fetched/searched content back at the user — reference paths or chunk ids instead.
+- Reference sources as `path:line` (or `chunk:<id>`) so they're clickable. Stand-alone paths per reference, even when repeated.
+- Prefer numbered lists (`1.`, `2.`, `3.`) when offering options the user can pick by replying with a single number.
+- Skip headers and heavy formatting for simple confirmations.
+- No emojis, no em-dashes, no nested bullets. Single-level lists only.
+
+Code & structured-output tasks:
+- Lead with a one-sentence explanation of the change before context. Don't open with "Summary:" — jump in.
+- Suggest natural next steps (run tests, diff review, commit) only when they're genuinely the next move.
+- For multi-line snippets use fenced code blocks with a language tag.
+
+Tool calls:
+- Run independent tool calls in parallel; chain only when later calls need earlier results.
+- Don't ask permission ("Should I proceed?") — proceed with the most reasonable default and state what you did.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/providers/openai_reasoning.md b/surfsense_backend/app/agents/new_chat/prompts/providers/openai_reasoning.md
new file mode 100644
index 0000000000..dd7a61536e
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/providers/openai_reasoning.md
@@ -0,0 +1,21 @@
+<provider_hints>
+You are running on an OpenAI reasoning model (GPT-5+ / o-series).
+
+Output style:
+- Be terse and direct. Don't restate the user's request before answering.
+- Don't begin with conversational openers ("Done!", "Got it", "Great question", "Sure thing"). Get to the answer or the action.
+- Match response complexity to the task: simple questions → one-line answer; substantial work → lead with the outcome, then context, then any next steps.
+- No nested bullets — keep lists flat (single level). For options the user can pick by replying with a number, use `1.` `2.` `3.`.
+- Use inline backticks for paths/commands/identifiers; fenced code blocks (with language tags) for multi-line snippets.
+
+Channels (for clients that support them):
+- `commentary` — short progress updates only when they add genuinely new information (a discovery, a tradeoff, a blocker, the start of a non-trivial step). Don't narrate routine reads or obvious next steps.
+- `final` — the completed response. Keep it self-contained; no "see above" / "see below" cross-references.
+
+Tool calls:
+- Parallelise independent tool calls in a single response (`multi_tool_use.parallel` where supported). Only sequence when a later call needs an earlier one's output.
+- Don't ask permission ("Should I proceed?", "Do you want me to…?"). Pick the most reasonable default, do it, and state what you did.
+
+Autonomy:
+- Persist until the task is fully resolved within the current turn whenever feasible. Don't stop at analysis when the user clearly wants the change applied.
+</provider_hints>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/routing/__init__.py b/surfsense_backend/app/agents/new_chat/prompts/routing/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/routing/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/routing/jira.md b/surfsense_backend/app/agents/new_chat/prompts/routing/jira.md
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/routing/jira.md
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/routing/linear.md b/surfsense_backend/app/agents/new_chat/prompts/routing/linear.md
new file mode 100644
index 0000000000..2f1bfacd90
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/routing/linear.md
@@ -0,0 +1,3 @@
+<linear_routing>
+**Linear:** Prefer the `task` tool with subagent **`linear_specialist`** when the user’s request is **only about Linear** and may need several tool calls (list issues, inspect one issue, teams, users, statuses, comments, documents). Use **`connector_negotiator`** when Linear is one hop in a **multi-connector** workflow. Call Linear MCP tools directly from the parent when a **single** quick call is enough.
+</linear_routing>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/routing/slack.md b/surfsense_backend/app/agents/new_chat/prompts/routing/slack.md
new file mode 100644
index 0000000000..4b5d07a9a8
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/routing/slack.md
@@ -0,0 +1,3 @@
+<slack_routing>
+**Slack:** Prefer `task` with **`slack_specialist`** for **Slack-only** multi-step work (channels, threads, reads, writes that need approval in the specialist). Use **`connector_negotiator`** when Slack feeds another connector in one chain. Use direct `slack_*` tools from the parent for a **single** quick read or write when appropriate.
+</slack_routing>
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/__init__.py b/surfsense_backend/app/agents/new_chat/prompts/tools/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/_preamble.md b/surfsense_backend/app/agents/new_chat/prompts/tools/_preamble.md
new file mode 100644
index 0000000000..2c169e0151
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/_preamble.md
@@ -0,0 +1,6 @@
+<tools>
+You have access to the following tools:
+
+IMPORTANT: You can ONLY use the tools listed below. If a capability is not listed here, you do NOT have it.
+Do NOT claim you can do something if the corresponding tool is not listed.
+
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/generate_image.md b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_image.md
new file mode 100644
index 0000000000..8bde13f224
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_image.md
@@ -0,0 +1,11 @@
+
+- generate_image: Generate images from text descriptions using AI image models.
+  - Use this when the user asks you to create, generate, draw, design, or make an image.
+  - Trigger phrases: "generate an image of", "create a picture of", "draw me", "make an image", "design a logo", "create artwork"
+  - Args:
+    - prompt: A detailed text description of the image to generate. Be specific about subject, style, colors, composition, and mood.
+    - n: Number of images to generate (1-4, default: 1)
+  - Returns: A dictionary with the generated image metadata. The image will automatically be displayed in the chat.
+  - IMPORTANT: Write a detailed, descriptive prompt for best results. Don't just pass the user's words verbatim -
+    expand and improve the prompt with specific details about style, lighting, composition, and mood.
+  - If the user's request is vague (e.g., "make me an image of a cat"), enhance the prompt with artistic details.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/generate_podcast.md b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_podcast.md
new file mode 100644
index 0000000000..58be143d78
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_podcast.md
@@ -0,0 +1,15 @@
+
+- generate_podcast: Generate an audio podcast from provided content.
+  - Use this when the user asks to create, generate, or make a podcast.
+  - Trigger phrases: "give me a podcast about", "create a podcast", "generate a podcast", "make a podcast", "turn this into a podcast"
+  - Args:
+    - source_content: The text content to convert into a podcast. This MUST be comprehensive and include:
+      * If discussing the current conversation: Include a detailed summary of the FULL chat history (all user questions and your responses)
+      * If based on knowledge base search: Include the key findings and insights from the search results
+      * You can combine both: conversation context + search results for richer podcasts
+      * The more detailed the source_content, the better the podcast quality
+    - podcast_title: Optional title for the podcast (default: "SurfSense Podcast")
+    - user_prompt: Optional instructions for podcast style/format (e.g., "Make it casual and fun")
+  - Returns: A task_id for tracking. The podcast will be generated in the background.
+  - IMPORTANT: Only one podcast can be generated at a time. If a podcast is already being generated, the tool will return status "already_generating".
+  - After calling this tool, inform the user that podcast generation has started and they will see the player when it's ready (takes 3-5 minutes).
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/generate_report.md b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_report.md
new file mode 100644
index 0000000000..8a285a4331
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_report.md
@@ -0,0 +1,39 @@
+
+- generate_report: Generate or revise a structured Markdown report artifact.
+  - WHEN TO CALL THIS TOOL — the message must contain a creation or modification VERB directed at producing a deliverable:
+    * Creation verbs: write, create, generate, draft, produce, summarize into, turn into, make
+    * Modification verbs: revise, update, expand, add (a section), rewrite, make (it shorter/longer/formal)
+    * Example triggers: "generate a report about...", "write a document on...", "add a section about budget", "make the report shorter", "rewrite in formal tone"
+  - WHEN NOT TO CALL THIS TOOL (answer in chat instead):
+    * Questions or discussion about the report: "What can we add?", "What's missing?", "Is the data accurate?", "How could this be improved?"
+    * Suggestions or brainstorming: "What other topics could be covered?", "What else could be added?", "What would make this better?"
+    * Asking for explanations: "Can you explain section 2?", "Why did you include that?", "What does this part mean?"
+    * Quick follow-ups or critiques: "Is the conclusion strong enough?", "Are there any gaps?", "What about the competitors?"
+    * THE TEST: Does the message contain a creation/modification VERB (from the list above) directed at producing or changing a deliverable? If NO verb → answer conversationally in chat. Do NOT assume the user wants a revision just because a report exists in the conversation.
+  - IMPORTANT FORMAT RULE: Reports are ALWAYS generated in Markdown.
+  - Args:
+    - topic: Short title for the report (max ~8 words).
+    - source_content: The text content to base the report on.
+      * For source_strategy="conversation" or "provided": Include a comprehensive summary of the relevant content.
+      * For source_strategy="kb_search": Can be empty or minimal — the tool handles searching internally.
+      * For source_strategy="auto": Include what you have; the tool searches KB if it's not enough.
+    - source_strategy: Controls how the tool collects source material. One of:
+      * "conversation" — The conversation already contains enough context (prior Q&A, discussion, pasted text, scraped pages). Pass a thorough summary as source_content.
+      * "kb_search" — The tool will search the knowledge base internally. Provide search_queries with 1-5 targeted queries.
+      * "auto" — Use source_content if sufficient, otherwise fall back to internal KB search using search_queries.
+      * "provided" — Use only what is in source_content (default, backward-compatible).
+    - search_queries: When source_strategy is "kb_search" or "auto", provide 1-5 specific search queries for the knowledge base. These should be precise, not just the topic name repeated.
+    - report_style: Controls report depth. Options: "detailed" (DEFAULT), "deep_research", "brief".
+      Use "brief" ONLY when the user explicitly asks for a short/concise/one-page report (e.g., "one page", "keep it short", "brief report", "500 words"). Default to "detailed" for all other requests.
+    - user_instructions: Optional specific instructions (e.g., "focus on financial impacts", "include recommendations"). When revising (parent_report_id set), describe WHAT TO CHANGE. If the user mentions a length preference (e.g., "one page", "500 words", "2 pages"), include that VERBATIM here AND set report_style="brief".
+    - parent_report_id: Set this to the report_id from a previous generate_report result when the user wants to MODIFY an existing report. Do NOT set it for new reports or questions about reports.
+  - Returns: A dictionary with status "ready" or "failed", report_id, title, and word_count.
+  - The report is generated immediately in Markdown and displayed inline in the chat.
+  - Export/download formats (PDF, DOCX, HTML, LaTeX, EPUB, ODT, plain text) are produced from the generated Markdown report.
+  - SOURCE STRATEGY DECISION (HIGH PRIORITY — follow this exactly):
+    * If the conversation already has substantive Q&A / discussion on the topic → use source_strategy="conversation" with a comprehensive summary as source_content.
+    * If the user wants a report on a topic not yet discussed → use source_strategy="kb_search" with targeted search_queries.
+    * If you have some content but might need more → use source_strategy="auto" with both source_content and search_queries.
+    * When revising an existing report (parent_report_id set) and the conversation has relevant context → use source_strategy="conversation". The revision will use the previous report content plus your source_content.
+    * NEVER run a separate KB lookup step and then pass those results to generate_report. The tool handles KB search internally.
+  - AFTER CALLING THIS TOOL: Do NOT repeat, summarize, or reproduce the report content in the chat. The report is already displayed as an interactive card that the user can open, read, copy, and export. Simply confirm that the report was generated (e.g., "I've generated your report on [topic]. You can view the Markdown report now, and export it in various formats from the card."). NEVER write out the report text in the chat.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/generate_resume.md b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_resume.md
new file mode 100644
index 0000000000..321ea90c90
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_resume.md
@@ -0,0 +1,30 @@
+
+- generate_resume: Generate or revise a professional resume as a Typst document.
+  - WHEN TO CALL: The user asks to create, build, generate, write, or draft a resume or CV.
+    Also when they ask to modify, update, or revise an existing resume from this conversation.
+  - WHEN NOT TO CALL: General career advice, resume tips, cover letters, or reviewing
+    a resume without making changes. For cover letters, use generate_report instead.
+  - The tool produces Typst source code that is compiled to a PDF preview automatically.
+  - PAGE POLICY:
+    - Default behavior is ONE PAGE. For new resume creation, set max_pages=1 unless the user explicitly asks for more.
+    - If the user requests a longer resume (e.g., "make it 2 pages"), set max_pages to that value.
+  - Args:
+    - user_info: The user's resume content — work experience, education, skills, contact
+      info, etc. Can be structured or unstructured text.
+      CRITICAL: user_info must be COMPREHENSIVE. Do NOT just pass the user's raw message.
+      You MUST gather and consolidate ALL available information:
+        * Content from referenced/mentioned documents (e.g., uploaded resumes, CVs, LinkedIn profiles)
+          that appear in the conversation context — extract and include their FULL content.
+        * Information the user shared across multiple messages in the conversation.
+        * Any relevant details from knowledge base search results in the context.
+      The more complete the user_info, the better the resume. Include names, contact info,
+      work experience with dates, education, skills, projects, certifications — everything available.
+    - user_instructions: Optional style or content preferences (e.g. "emphasize leadership",
+      "keep it to one page"). For revisions, describe what to change.
+    - parent_report_id: Set this when the user wants to MODIFY an existing resume from
+      this conversation. Use the report_id from a previous generate_resume result.
+    - max_pages: Maximum resume length in pages (integer 1-5). Default is 1.
+  - Returns: Dict with status, report_id, title, and content_type.
+  - After calling: Give a brief confirmation. Do NOT paste resume content in chat. Do NOT mention report_id or any internal IDs — the resume card is shown automatically.
+  - VERSIONING: Same rules as generate_report — set parent_report_id for modifications
+    of an existing resume, leave as None for new resumes.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/generate_video_presentation.md b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_video_presentation.md
new file mode 100644
index 0000000000..c3def88f2d
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/generate_video_presentation.md
@@ -0,0 +1,9 @@
+
+- generate_video_presentation: Generate a video presentation from provided content.
+  - Use this when the user asks to create a video, presentation, slides, or slide deck.
+  - Trigger phrases: "give me a presentation", "create slides", "generate a video", "make a slide deck", "turn this into a presentation"
+  - Args:
+    - source_content: The text content to turn into a presentation. The more detailed, the better.
+    - video_title: Optional title (default: "SurfSense Presentation")
+    - user_prompt: Optional style instructions (e.g., "Make it technical and detailed")
+  - After calling this tool, inform the user that generation has started and they will see the presentation when it's ready.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/scrape_webpage.md b/surfsense_backend/app/agents/new_chat/prompts/tools/scrape_webpage.md
new file mode 100644
index 0000000000..46e299392f
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/scrape_webpage.md
@@ -0,0 +1,30 @@
+
+- scrape_webpage: Scrape and extract the main content from a webpage.
+  - Use this when the user wants you to READ and UNDERSTAND the actual content of a webpage.
+  - CRITICAL — WHEN TO USE (always attempt scraping, never refuse before trying):
+    * When a user asks to "get", "fetch", "pull", "grab", "scrape", or "read" content from a URL
+    * When the user wants live/dynamic data from a specific webpage (e.g., tables, scores, stats, prices)
+    * When a URL was mentioned earlier in the conversation and the user asks for its actual content
+    * When `/documents/` knowledge-base data is insufficient and the user wants more
+  - Trigger scenarios:
+    * "Read this article and summarize it"
+    * "What does this page say about X?"
+    * "Summarize this blog post for me"
+    * "Tell me the key points from this article"
+    * "What's in this webpage?"
+    * "Can you analyze this article?"
+    * "Can you get the live table/data from [URL]?"
+    * "Scrape it" / "Can you scrape that?" (referring to a previously mentioned URL)
+    * "Fetch the content from [URL]"
+    * "Pull the data from that page"
+  - Args:
+    - url: The URL of the webpage to scrape (must be HTTP/HTTPS)
+    - max_length: Maximum content length to return (default: 50000 chars)
+  - Returns: The page title, description, full content (in markdown), word count, and metadata
+  - After scraping, provide a comprehensive, well-structured summary with key takeaways using headings or bullet points.
+  - Reference the source using markdown links [descriptive text](url) — never bare URLs.
+  - IMAGES: The scraped content may contain image URLs in markdown format like `![alt text](image_url)`.
+    * When you find relevant/important images in the scraped content, include them in your response using standard markdown image syntax: `![alt text](image_url)`.
+    * This makes your response more visual and engaging.
+    * Prioritize showing: diagrams, charts, infographics, key illustrations, or images that help explain the content.
+    * Don't show every image - just the most relevant 1-3 images that enhance understanding.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/search_surfsense_docs.md b/surfsense_backend/app/agents/new_chat/prompts/tools/search_surfsense_docs.md
new file mode 100644
index 0000000000..133717fec0
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/search_surfsense_docs.md
@@ -0,0 +1,7 @@
+
+- search_surfsense_docs: Search the official SurfSense documentation.
+  - Use this tool when the user asks anything about SurfSense itself (the application they are using).
+  - Args:
+    - query: The search query about SurfSense
+    - top_k: Number of documentation chunks to retrieve (default: 10)
+  - Returns: Documentation content with chunk IDs for citations (prefixed with 'doc-', e.g., [citation:doc-123])
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/update_memory_private.md b/surfsense_backend/app/agents/new_chat/prompts/tools/update_memory_private.md
new file mode 100644
index 0000000000..184013804b
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/update_memory_private.md
@@ -0,0 +1,31 @@
+
+- update_memory: Update your personal memory document about the user.
+  - Your current memory is already in <user_memory> in your context.  The `chars` and
+    `limit` attributes show your current usage and the maximum allowed size.
+  - This is your curated long-term memory — the distilled essence of what you know about
+    the user, not raw conversation logs.
+  - Call update_memory when:
+    * The user explicitly asks to remember or forget something
+    * The user shares durable facts or preferences that will matter in future conversations
+  - The user's first name is provided in <user_name>. Use it in memory entries
+    instead of "the user" (e.g. "{name} works at..." not "The user works at...").
+    Do not store the name itself as a separate memory entry.
+  - Do not store short-lived or ephemeral info: one-off questions, greetings,
+    session logistics, or things that only matter for the current task.
+  - Args:
+    - updated_memory: The FULL updated markdown document (not a diff).
+      Merge new facts with existing ones, update contradictions, remove outdated entries.
+      Treat every update as a curation pass — consolidate, don't just append.
+  - Every bullet MUST use this format: - (YYYY-MM-DD) [marker] text
+    Markers:
+      [fact]  — durable facts (role, background, projects, tools, expertise)
+      [pref]  — preferences (response style, languages, formats, tools)
+      [instr] — standing instructions (always/never do, response rules)
+  - Keep it concise and well under the character limit shown in <user_memory>.
+  - Every entry MUST be under a `##` heading. Keep heading names short (2-3 words) and
+    natural. Do NOT include the user's name in headings. Organize by context — e.g.
+    who they are, what they're focused on, how they prefer things. Create, split, or
+    merge headings freely as the memory grows.
+  - Each entry MUST be a single bullet point. Be descriptive but concise — include relevant
+    details and context rather than just a few words.
+  - During consolidation, prioritize keeping: [instr] > [pref] > [fact].
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/update_memory_team.md b/surfsense_backend/app/agents/new_chat/prompts/tools/update_memory_team.md
new file mode 100644
index 0000000000..7eaca88184
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/update_memory_team.md
@@ -0,0 +1,26 @@
+
+- update_memory: Update the team's shared memory document for this search space.
+  - Your current team memory is already in <team_memory> in your context.  The `chars`
+    and `limit` attributes show current usage and the maximum allowed size.
+  - This is the team's curated long-term memory — decisions, conventions, key facts.
+  - NEVER store personal memory in team memory (e.g. personal bio, individual
+    preferences, or user-only standing instructions).
+  - Call update_memory when:
+    * A team member explicitly asks to remember or forget something
+    * The conversation surfaces durable team decisions, conventions, or facts
+      that will matter in future conversations
+  - Do not store short-lived or ephemeral info: one-off questions, greetings,
+    session logistics, or things that only matter for the current task.
+  - Args:
+    - updated_memory: The FULL updated markdown document (not a diff).
+      Merge new facts with existing ones, update contradictions, remove outdated entries.
+      Treat every update as a curation pass — consolidate, don't just append.
+  - Every bullet MUST use this format: - (YYYY-MM-DD) [fact] text
+    Team memory uses ONLY the [fact] marker. Never use [pref] or [instr] in team memory.
+  - Keep it concise and well under the character limit shown in <team_memory>.
+  - Every entry MUST be under a `##` heading. Keep heading names short (2-3 words) and
+    natural. Organize by context — e.g. what the team decided, current architecture,
+    active processes. Create, split, or merge headings freely as the memory grows.
+  - Each entry MUST be a single bullet point. Be descriptive but concise — include relevant
+    details and context rather than just a few words.
+  - During consolidation, prioritize keeping: decisions/conventions > key facts > current priorities.
diff --git a/surfsense_backend/app/agents/new_chat/prompts/tools/web_search.md b/surfsense_backend/app/agents/new_chat/prompts/tools/web_search.md
new file mode 100644
index 0000000000..7ed7c332d2
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/prompts/tools/web_search.md
@@ -0,0 +1,18 @@
+
+- web_search: Search the web for real-time information using all configured search engines.
+  - Use this for current events, news, prices, weather, public facts, or any question requiring
+    up-to-date information from the internet.
+  - This tool dispatches to all configured search engines (SearXNG, Tavily, Linkup, Baidu) in
+    parallel and merges the results.
+  - IMPORTANT (REAL-TIME / PUBLIC WEB QUERIES): For questions that require current public web data
+    (e.g., live exchange rates, stock prices, breaking news, weather, current events), you MUST call
+    `web_search` instead of answering from memory.
+  - For these real-time/public web queries, DO NOT answer from memory and DO NOT say you lack internet
+    access before attempting a web search.
+  - If the search returns no relevant results, explain that web sources did not return enough
+    data and ask the user if they want you to retry with a refined query.
+  - Args:
+    - query: The search query - use specific, descriptive terms
+    - top_k: Number of results to retrieve (default: 10, max: 50)
+  - If search snippets are insufficient for the user's question, use `scrape_webpage` on the most relevant result URL for full content.
+  - When presenting results, reference sources as markdown links [descriptive text](url) — never bare URLs.
diff --git a/surfsense_backend/app/agents/new_chat/sandbox.py b/surfsense_backend/app/agents/new_chat/sandbox.py
index 8b634993bc..efac7aae83 100644
--- a/surfsense_backend/app/agents/new_chat/sandbox.py
+++ b/surfsense_backend/app/agents/new_chat/sandbox.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import shutil
+import threading
 from pathlib import Path
 
 from daytona import (
@@ -42,7 +43,7 @@ class _TimeoutAwareSandbox(DaytonaSandbox):
     """
 
     def execute(self, command: str, *, timeout: int | None = None) -> ExecuteResponse:
-        t = timeout if timeout is not None else self._timeout
+        t = timeout if timeout is not None else self._default_timeout
         result = self._sandbox.process.exec(command, timeout=t)
         return ExecuteResponse(
             output=result.result,
@@ -55,11 +56,20 @@ async def aexecute(
     ) -> ExecuteResponse:  # type: ignore[override]
         return await asyncio.to_thread(self.execute, command, timeout=timeout)
 
+    def download_file(self, path: str) -> bytes:
+        """Download a file from the sandbox filesystem."""
+        return self._sandbox.fs.download_file(path)
+
 
 _daytona_client: Daytona | None = None
+_client_lock = threading.Lock()
 _sandbox_cache: dict[str, _TimeoutAwareSandbox] = {}
+_sandbox_locks: dict[str, asyncio.Lock] = {}
+_sandbox_locks_mu = asyncio.Lock()
+_seeded_files: dict[str, dict[str, str]] = {}
 _SANDBOX_CACHE_MAX_SIZE = 20
 THREAD_LABEL_KEY = "surfsense_thread"
+SANDBOX_DOCUMENTS_ROOT = "/home/daytona/documents"
 
 
 def is_sandbox_enabled() -> bool:
@@ -68,24 +78,40 @@ def is_sandbox_enabled() -> bool:
 
 def _get_client() -> Daytona:
     global _daytona_client
-    if _daytona_client is None:
-        config = DaytonaConfig(
-            api_key=os.environ.get("DAYTONA_API_KEY", ""),
-            api_url=os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api"),
-            target=os.environ.get("DAYTONA_TARGET", "us"),
-        )
-        _daytona_client = Daytona(config)
-    return _daytona_client
+    with _client_lock:
+        if _daytona_client is None:
+            config = DaytonaConfig(
+                api_key=os.environ.get("DAYTONA_API_KEY", ""),
+                api_url=os.environ.get("DAYTONA_API_URL", "https://app.daytona.io/api"),
+                target=os.environ.get("DAYTONA_TARGET", "us"),
+            )
+            _daytona_client = Daytona(config)
+        return _daytona_client
+
+
+def _sandbox_create_params(
+    labels: dict[str, str],
+) -> CreateSandboxFromSnapshotParams:
+    snapshot_id = os.environ.get("DAYTONA_SNAPSHOT_ID") or None
+    return CreateSandboxFromSnapshotParams(
+        language="python",
+        labels=labels,
+        snapshot=snapshot_id,
+        network_block_all=True,
+        auto_stop_interval=10,
+        auto_delete_interval=60,
+    )
 
 
-def _find_or_create(thread_id: str) -> _TimeoutAwareSandbox:
+def _find_or_create(thread_id: str) -> tuple[_TimeoutAwareSandbox, bool]:
     """Find an existing sandbox for *thread_id*, or create a new one.
 
-    If an existing sandbox is found but is stopped/archived, it will be
-    restarted automatically before returning.
+    Returns a tuple of (sandbox, is_new) where *is_new* is True when a
+    fresh sandbox was created (first time or replacement after failure).
     """
     client = _get_client()
     labels = {THREAD_LABEL_KEY: thread_id}
+    is_new = False
 
     try:
         sandbox = client.find_one(labels=labels)
@@ -109,54 +135,143 @@ def _find_or_create(thread_id: str) -> _TimeoutAwareSandbox:
                 sandbox.id,
                 sandbox.state,
             )
-            sandbox = client.create(
-                CreateSandboxFromSnapshotParams(language="python", labels=labels)
-            )
+            try:
+                client.delete(sandbox)
+            except Exception:
+                logger.debug(
+                    "Could not delete broken sandbox %s", sandbox.id, exc_info=True
+                )
+            sandbox = client.create(_sandbox_create_params(labels))
+            is_new = True
             logger.info("Created replacement sandbox: %s", sandbox.id)
         elif sandbox.state != SandboxState.STARTED:
             sandbox.wait_for_sandbox_start(timeout=60)
 
-    except Exception:
+    except DaytonaError:
         logger.info("No existing sandbox for thread %s — creating one", thread_id)
-        sandbox = client.create(
-            CreateSandboxFromSnapshotParams(language="python", labels=labels)
-        )
+        sandbox = client.create(_sandbox_create_params(labels))
+        is_new = True
         logger.info("Created new sandbox: %s", sandbox.id)
 
-    return _TimeoutAwareSandbox(sandbox=sandbox)
+    return _TimeoutAwareSandbox(sandbox=sandbox), is_new
 
 
-async def get_or_create_sandbox(thread_id: int | str) -> _TimeoutAwareSandbox:
+async def _get_thread_lock(key: str) -> asyncio.Lock:
+    """Return a per-thread asyncio lock, creating one if needed."""
+    async with _sandbox_locks_mu:
+        lock = _sandbox_locks.get(key)
+        if lock is None:
+            lock = asyncio.Lock()
+            _sandbox_locks[key] = lock
+        return lock
+
+
+async def get_or_create_sandbox(
+    thread_id: int | str,
+) -> tuple[_TimeoutAwareSandbox, bool]:
     """Get or create a sandbox for a conversation thread.
 
     Uses an in-process cache keyed by thread_id so subsequent messages
     in the same conversation reuse the sandbox object without an API call.
-
-    Args:
-        thread_id: The conversation thread identifier.
+    A per-thread async lock prevents duplicate sandbox creation from
+    concurrent requests.
 
     Returns:
-        DaytonaSandbox connected to the sandbox.
+        Tuple of (sandbox, is_new). *is_new* is True when a fresh sandbox
+        was created, signalling that file tracking should be reset.
     """
     key = str(thread_id)
-    cached = _sandbox_cache.get(key)
-    if cached is not None:
-        logger.info("Reusing cached sandbox for thread %s", key)
-        return cached
-    sandbox = await asyncio.to_thread(_find_or_create, key)
-    _sandbox_cache[key] = sandbox
+    lock = await _get_thread_lock(key)
+
+    async with lock:
+        cached = _sandbox_cache.get(key)
+        if cached is not None:
+            logger.info("Reusing cached sandbox for thread %s", key)
+            return cached, False
+        sandbox, is_new = await asyncio.to_thread(_find_or_create, key)
+        _sandbox_cache[key] = sandbox
 
-    if len(_sandbox_cache) > _SANDBOX_CACHE_MAX_SIZE:
-        oldest_key = next(iter(_sandbox_cache))
-        _sandbox_cache.pop(oldest_key, None)
-        logger.debug("Evicted oldest sandbox cache entry: %s", oldest_key)
+        if len(_sandbox_cache) > _SANDBOX_CACHE_MAX_SIZE:
+            oldest_key = next(iter(_sandbox_cache))
+            if oldest_key != key:
+                evicted = _sandbox_cache.pop(oldest_key, None)
+                _seeded_files.pop(oldest_key, None)
+                logger.debug("Evicted sandbox cache entry: %s", oldest_key)
+                if evicted is not None:
+                    _schedule_sandbox_delete(evicted)
 
-    return sandbox
+        return sandbox, is_new
+
+
+def _schedule_sandbox_delete(sandbox: _TimeoutAwareSandbox) -> None:
+    """Best-effort background deletion of an evicted sandbox."""
+
+    def _delete() -> None:
+        try:
+            client = _get_client()
+            client.delete(sandbox._sandbox)
+            logger.info("Deleted evicted sandbox: %s", sandbox._sandbox.id)
+        except Exception:
+            logger.debug("Could not delete evicted sandbox", exc_info=True)
+
+    try:
+        loop = asyncio.get_running_loop()
+        loop.run_in_executor(None, _delete)
+    except RuntimeError:
+        pass
+
+
+async def sync_files_to_sandbox(
+    thread_id: int | str,
+    files: dict[str, dict],
+    sandbox: _TimeoutAwareSandbox,
+    is_new: bool,
+) -> None:
+    """Upload new or changed virtual-filesystem files to the sandbox.
+
+    Compares *files* (from ``state["files"]``) against the ``_seeded_files``
+    tracking dict and uploads only what has changed.  When *is_new* is True
+    the tracking is reset so every file is re-uploaded.
+    """
+    key = str(thread_id)
+    if is_new:
+        _seeded_files.pop(key, None)
+
+    tracked = _seeded_files.get(key, {})
+    to_upload: list[tuple[str, bytes]] = []
+
+    for vpath, fdata in files.items():
+        modified_at = fdata.get("modified_at", "")
+        if tracked.get(vpath) == modified_at:
+            continue
+        content = "\n".join(fdata.get("content", []))
+        sandbox_path = f"{SANDBOX_DOCUMENTS_ROOT}{vpath}"
+        to_upload.append((sandbox_path, content.encode("utf-8")))
+
+    if not to_upload:
+        return
+
+    def _upload() -> None:
+        sandbox.upload_files(to_upload)
+
+    await asyncio.to_thread(_upload)
+
+    new_tracked = dict(tracked)
+    for vpath, fdata in files.items():
+        new_tracked[vpath] = fdata.get("modified_at", "")
+    _seeded_files[key] = new_tracked
+    logger.info("Synced %d file(s) to sandbox for thread %s", len(to_upload), key)
+
+
+def _evict_sandbox_cache(thread_id: int | str) -> None:
+    key = str(thread_id)
+    _sandbox_cache.pop(key, None)
+    _seeded_files.pop(key, None)
 
 
 async def delete_sandbox(thread_id: int | str) -> None:
     """Delete the sandbox for a conversation thread."""
-    _sandbox_cache.pop(str(thread_id), None)
+    _evict_sandbox_cache(thread_id)
 
     def _delete() -> None:
         client = _get_client()
@@ -193,7 +308,11 @@ def _get_sandbox_files_dir() -> Path:
 def _local_path_for(thread_id: int | str, sandbox_path: str) -> Path:
     """Map a sandbox-internal absolute path to a local filesystem path."""
     relative = sandbox_path.lstrip("/")
-    return _get_sandbox_files_dir() / str(thread_id) / relative
+    base = (_get_sandbox_files_dir() / str(thread_id)).resolve()
+    target = (base / relative).resolve()
+    if not target.is_relative_to(base):
+        raise ValueError(f"Path traversal blocked: {sandbox_path}")
+    return target
 
 
 def get_local_sandbox_file(thread_id: int | str, sandbox_path: str) -> bytes | None:
@@ -226,7 +345,7 @@ async def persist_and_delete_sandbox(
     Per-file errors are logged but do **not** prevent the sandbox from
     being deleted — freeing Daytona storage is the priority.
     """
-    _sandbox_cache.pop(str(thread_id), None)
+    _evict_sandbox_cache(thread_id)
 
     def _persist_and_delete() -> None:
         client = _get_client()
diff --git a/surfsense_backend/app/agents/new_chat/skills/__init__.py b/surfsense_backend/app/agents/new_chat/skills/__init__.py
new file mode 100644
index 0000000000..bb7ac055c1
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/skills/__init__.py
@@ -0,0 +1,7 @@
+"""SurfSense built-in agent skills (Anthropic Skills format).
+
+Each subdirectory corresponds to one skill and contains a ``SKILL.md`` file
+with YAML frontmatter (name, description, allowed_tools) plus markdown
+instructions. The :class:`BuiltinSkillsBackend` exposes them to the
+deepagents :class:`SkillsMiddleware`.
+"""
diff --git a/surfsense_backend/app/agents/new_chat/skills/builtin/__init__.py b/surfsense_backend/app/agents/new_chat/skills/builtin/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/skills/builtin/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/app/agents/new_chat/skills/builtin/email-drafting/SKILL.md b/surfsense_backend/app/agents/new_chat/skills/builtin/email-drafting/SKILL.md
new file mode 100644
index 0000000000..32e599e98a
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/skills/builtin/email-drafting/SKILL.md
@@ -0,0 +1,25 @@
+---
+name: email-drafting
+description: Draft an email matching the user's voice, with structured intent and CTA
+allowed-tools: search_surfsense_docs
+---
+
+# Email drafting
+
+## When to use this skill
+"Draft an email to ...", "reply to this thread", "write a follow-up to X". Plain "summarize the email" is **not** in scope — that's a comprehension task.
+
+## Voice
+Search the KB for prior emails from the user to similar audiences (same recipient, same topic class). Mirror tone, opening style, sign-off, and length distribution. If there is no precedent, default to: warm, direct, no filler, short paragraphs, one clear ask.
+
+## Required structure
+Every draft includes, in this order:
+
+1. **Subject line** — concrete, ≤ 8 words, no clickbait, no `Re:` unless replying.
+2. **Opening (1 sentence)** — context the recipient already shares; never restate what they wrote unless the thread is long.
+3. **Body** — the actual point in one short paragraph. Bullets only if there are >3 discrete items.
+4. **Single explicit CTA** — what you want the recipient to do, with a soft deadline if relevant.
+5. **Sign-off** — match the user's prior closing style.
+
+## Always offer alternatives
+End your message with: "Want me to make it shorter, more formal, or add a different angle?" — give the user one obvious next step.
diff --git a/surfsense_backend/app/agents/new_chat/skills/builtin/kb-research/SKILL.md b/surfsense_backend/app/agents/new_chat/skills/builtin/kb-research/SKILL.md
new file mode 100644
index 0000000000..c268278ab7
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/skills/builtin/kb-research/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: kb-research
+description: Structured approach to finding and synthesizing information from the user's knowledge base
+allowed-tools: search_surfsense_docs, scrape_webpage, read_file, ls_tree, grep, web_search
+---
+
+# Knowledge-base research
+
+## When to use this skill
+- The user asks "find/look up/research" something specifically inside their knowledge base.
+- The user references documents, notes, repos, or connector data they expect to exist already.
+- A multi-document synthesis is required (e.g., "summarize what we've discussed about X across all my notes").
+
+## Plan
+1. Decompose the user's question into 2-4 specific, citation-worthy sub-questions.
+2. For each sub-question, run **one** targeted KB search (focused on terms the user would have written, not synonyms). Open the most relevant 2-3 documents fully via `read_file` if their excerpts are too short.
+3. Use `grep` to find supporting passages in long files instead of re-reading them end to end.
+4. Cite every claim with `[citation:chunk_id]` exactly as the chunk tag specifies.
+
+## What good output looks like
+- Short paragraphs with inline citations.
+- Quoted phrases when wording matters.
+- An explicit "Not found in your knowledge base" callout when a sub-question has no support — never fabricate.
diff --git a/surfsense_backend/app/agents/new_chat/skills/builtin/meeting-prep/SKILL.md b/surfsense_backend/app/agents/new_chat/skills/builtin/meeting-prep/SKILL.md
new file mode 100644
index 0000000000..9657eb0786
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/skills/builtin/meeting-prep/SKILL.md
@@ -0,0 +1,22 @@
+---
+name: meeting-prep
+description: Pull together briefing materials before a scheduled meeting
+allowed-tools: search_surfsense_docs, web_search, scrape_webpage, read_file
+---
+
+# Meeting preparation
+
+## When to use this skill
+The user mentions an upcoming meeting, call, or interview and asks you to "prep", "brief me", "pull background", or "what do I need to know about X before tomorrow".
+
+## Output structure
+Always produce these sections (omit any with no signal — don't pad):
+
+1. **Attendees & context** — who's in the room, their roles, what they care about. Pull from KB notes about prior interactions; supplement with public profile facts via `web_search` when names or companies are unfamiliar.
+2. **Open threads** — outstanding action items, unresolved decisions, last-mentioned blockers from prior conversation history.
+3. **Recent moves** — within the last 30 days: relevant launches, hires, news. Cite KB chunks when present, otherwise external sources.
+4. **Suggested questions** — 3-5 questions the user could ask, tailored to the open threads and the attendees' likely priorities.
+
+## Source ordering
+- Always check the user's KB **first** for prior meeting notes, internal docs, or Slack threads about these attendees.
+- Only fall back to `web_search` for *publicly verifiable* facts — never to fabricate a participant's preferences or relationships.
diff --git a/surfsense_backend/app/agents/new_chat/skills/builtin/report-writing/SKILL.md b/surfsense_backend/app/agents/new_chat/skills/builtin/report-writing/SKILL.md
new file mode 100644
index 0000000000..17ac2f391e
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/skills/builtin/report-writing/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: report-writing
+description: How to scope, draft, and revise a Markdown report artifact via generate_report
+allowed-tools: generate_report, search_surfsense_docs, read_file
+---
+
+# Report writing
+
+## When to use this skill
+The user explicitly requests a deliverable: "write a report on …", "draft a memo", "produce a brief", "expand the previous report". A creation or modification verb pointed at an artifact is required (see `generate_report`'s when-to-call rules).
+
+## Decision flow
+1. **Source strategy.** Decide which `source_strategy` fits:
+   - `conversation` — substantive Q&A on the topic already in chat.
+   - `kb_search` — fresh topic; supply 1–5 precise `search_queries`.
+   - `auto` — partial conversation context; let the tool fall back.
+   - `provided` — verbatim source text only.
+2. **Style.** Default to `report_style="detailed"` unless the user explicitly asks for "brief", "one page", "500 words".
+3. **Revisions.** When modifying an existing report from this conversation, set `parent_report_id` and put the change list in `user_instructions` ("add carbon-capture section", "tighten conclusion").
+4. **Never paste the report back into chat** after `generate_report` returns — confirm and let the artifact card render itself.
+
+## Hooks for KB-only mode
+If `kb_search`/`auto` returns no results, do **not** silently switch to general knowledge. Surface the gap in your confirmation message.
diff --git a/surfsense_backend/app/agents/new_chat/skills/builtin/slack-summary/SKILL.md b/surfsense_backend/app/agents/new_chat/skills/builtin/slack-summary/SKILL.md
new file mode 100644
index 0000000000..33b9e72a21
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/skills/builtin/slack-summary/SKILL.md
@@ -0,0 +1,26 @@
+---
+name: slack-summary
+description: Distill a Slack channel or thread into actionable summary
+allowed-tools: search_surfsense_docs
+---
+
+# Slack summarization
+
+## When to use this skill
+The user asks to summarize Slack ("what happened in #eng-platform this week", "what did Alice say about the launch", "catch me up on the design channel").
+
+## Required inputs
+Confirm before searching:
+- **Which channel(s) or thread(s)?** Don't guess if ambiguous.
+- **What time window?** Default to the last 7 days when not specified, but say so.
+
+## Output shape
+Produce three concise sections:
+1. **Key decisions** — explicit choices that were made, with the deciding message cited.
+2. **Open questions** — things asked but not answered, with the asking message cited.
+3. **Action items** — `@mention` who owes what by when, *only if explicitly stated*. Don't invent assignees.
+
+## What not to do
+- Never produce a chronological play-by-play of every message — distill.
+- Never quote private messages without flagging them as such.
+- If the channel was empty in the time window, say so — don't fabricate filler.
diff --git a/surfsense_backend/app/agents/new_chat/state_reducers.py b/surfsense_backend/app/agents/new_chat/state_reducers.py
new file mode 100644
index 0000000000..89fc863676
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/state_reducers.py
@@ -0,0 +1,205 @@
+"""Reducers and sentinels for SurfSense filesystem state.
+
+These reducers back the extra state fields used by the cloud-mode filesystem
+agent (`cwd`, `staged_dirs`, `pending_moves`, `dirty_paths`, `doc_id_by_path`,
+`kb_priority`, `kb_matched_chunk_ids`, `kb_anon_doc`, `tree_version`).
+
+Tools mutate these fields ONLY via `Command(update={...})` returns; the
+reducers are responsible for merging successive updates atomically and for
+honouring an explicit reset sentinel (`_CLEAR`) so that a single update can
+both reset and reseed a list (used by `move_file` / `aafter_agent`).
+
+The sentinel is intentionally a plain string constant rather than a custom
+object so that LangGraph's checkpointer (which serializes raw `Command.update`
+deltas via ``ormsgpack`` BEFORE reducers are applied) can round-trip writes
+that contain it. The token uses a NUL-bracketed form that cannot collide with
+any real virtual path, document title, or dict key produced by the agent.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Final, TypeVar
+
+_CLEAR: Final[str] = "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00"
+"""Reset sentinel; pass it inside a list/dict update to request a reset.
+
+For list reducers: ``[_CLEAR, *items]`` resets the field then appends ``items``.
+For dict reducers: ``{_CLEAR: True, **items}`` resets the field then merges ``items``.
+
+Because the value is a plain string with embedded NUL bytes, it is natively
+serializable by ``ormsgpack`` (used by LangGraph's PostgreSQL checkpointer)
+yet still distinct from any real path / key produced by application code.
+"""
+
+
+T = TypeVar("T")
+
+
+def _replace_reducer[T](left: T | None, right: T | None) -> T | None:
+    """Replace `left` outright with `right`. ``None`` on the right is honored as a reset."""
+    return right
+
+
+def _is_clear(value: Any) -> bool:
+    return isinstance(value, str) and value == _CLEAR
+
+
+def _add_unique_reducer(
+    left: list[Any] | None,
+    right: list[Any] | None,
+) -> list[Any]:
+    """Append items from ``right`` to ``left`` while preserving uniqueness.
+
+    Semantics:
+    - If ``right`` is ``None`` or empty, return ``left`` unchanged.
+    - If ``right`` contains the ``_CLEAR`` sentinel anywhere, the result is
+      reseeded with only the items that appear AFTER the LAST occurrence of
+      ``_CLEAR`` (deduplicated, preserving first-seen order). This gives a
+      single-update "reset and reseed" capability.
+    - Otherwise, items from ``right`` are appended to ``left`` (order preserved
+      from first seen) while skipping values that are already present.
+    """
+    if right is None:
+        return list(left or [])
+    if not right:
+        return list(left or [])
+
+    last_clear = -1
+    for index, item in enumerate(right):
+        if _is_clear(item):
+            last_clear = index
+
+    if last_clear >= 0:
+        seed: list[Any] = []
+        seen: set[Any] = set()
+        for item in right[last_clear + 1 :]:
+            if _is_clear(item):
+                continue
+            try:
+                if item in seen:
+                    continue
+                seen.add(item)
+            except TypeError:
+                if item in seed:
+                    continue
+            seed.append(item)
+        return seed
+
+    base = list(left or [])
+    try:
+        seen: set[Any] = set(base)
+    except TypeError:
+        seen = set()
+    for item in right:
+        if _is_clear(item):
+            continue
+        try:
+            if item in seen:
+                continue
+            seen.add(item)
+        except TypeError:
+            if item in base:
+                continue
+        base.append(item)
+    return base
+
+
+def _list_append_reducer(
+    left: list[Any] | None,
+    right: list[Any] | None,
+) -> list[Any]:
+    """Append items from ``right`` to ``left`` preserving order and duplicates.
+
+    Honours the ``_CLEAR`` sentinel exactly like :func:`_add_unique_reducer`,
+    but does NOT deduplicate. Used for queues whose ordering and duplicate
+    occurrences matter (e.g. ``pending_moves``).
+    """
+    if right is None:
+        return list(left or [])
+    if not right:
+        return list(left or [])
+
+    last_clear = -1
+    for index, item in enumerate(right):
+        if _is_clear(item):
+            last_clear = index
+
+    if last_clear >= 0:
+        return [item for item in right[last_clear + 1 :] if not _is_clear(item)]
+
+    base = list(left or [])
+    base.extend(item for item in right if not _is_clear(item))
+    return base
+
+
+def _dict_merge_with_tombstones_reducer(
+    left: dict[Any, Any] | None,
+    right: dict[Any, Any] | None,
+) -> dict[Any, Any]:
+    """Merge ``right`` into ``left`` with two extra capabilities:
+
+    * Keys whose value is ``None`` are removed from the merged result
+      (tombstone semantics, matching the deepagents file-data reducer).
+    * The special key ``_CLEAR`` (with any truthy value) resets ``left`` to
+      ``{}`` before merging the remaining keys from ``right``. This makes it
+      possible to atomically clear and reseed the dictionary in a single
+      update.
+    """
+    if right is None:
+        return dict(left or {})
+
+    if _CLEAR in right or any(_is_clear(k) for k in right):
+        result: dict[Any, Any] = {}
+        for key, value in right.items():
+            if _is_clear(key):
+                continue
+            if value is None:
+                result.pop(key, None)
+                continue
+            result[key] = value
+        return result
+
+    if left is None:
+        return {key: value for key, value in right.items() if value is not None}
+
+    result = dict(left)
+    for key, value in right.items():
+        if value is None:
+            result.pop(key, None)
+        else:
+            result[key] = value
+    return result
+
+
+def _initial_filesystem_state() -> dict[str, Any]:
+    """Default empty values for SurfSense filesystem state fields.
+
+    Consumers should always treat these fields as ``state.get(key) or
+    DEFAULT`` so that fresh threads (without checkpointed state) work
+    correctly.
+    """
+    return {
+        "cwd": "/documents",
+        "staged_dirs": [],
+        "staged_dir_tool_calls": {},
+        "pending_moves": [],
+        "pending_deletes": [],
+        "pending_dir_deletes": [],
+        "doc_id_by_path": {},
+        "dirty_paths": [],
+        "dirty_path_tool_calls": {},
+        "kb_priority": [],
+        "kb_matched_chunk_ids": {},
+        "kb_anon_doc": None,
+        "tree_version": 0,
+    }
+
+
+__all__ = [
+    "_CLEAR",
+    "_add_unique_reducer",
+    "_dict_merge_with_tombstones_reducer",
+    "_initial_filesystem_state",
+    "_list_append_reducer",
+    "_replace_reducer",
+]
diff --git a/surfsense_backend/app/agents/new_chat/subagents/__init__.py b/surfsense_backend/app/agents/new_chat/subagents/__init__.py
new file mode 100644
index 0000000000..bd1823b57f
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/subagents/__init__.py
@@ -0,0 +1,33 @@
+"""Specialized user-facing subagents for the SurfSense agent.
+
+The :class:`deepagents.SubAgentMiddleware` already provides the
+materialization machinery (each :class:`deepagents.SubAgent` typed-dict
+spec is compiled into an ephemeral runnable invoked via the ``task``
+tool); what's specific to SurfSense is the *seeding* of those subagents
+with declarative deny rules.
+
+Per-subagent permission rules are injected as a
+:class:`PermissionMiddleware` entry inside the subagent's ``middleware``
+field. The auto-deny pattern (e.g. forbid ``task``/``todowrite``
+recursion, block write tools for read-only research roles) is borrowed
+from OpenCode's ``packages/opencode/src/tool/task.ts``, which has
+analogous logic for restricting child sessions.
+"""
+
+from .config import (
+    build_connector_negotiator_subagent,
+    build_explore_subagent,
+    build_report_writer_subagent,
+    build_specialized_subagents,
+)
+from .providers.linear import build_linear_specialist_subagent
+from .providers.slack import build_slack_specialist_subagent
+
+__all__ = [
+    "build_connector_negotiator_subagent",
+    "build_explore_subagent",
+    "build_linear_specialist_subagent",
+    "build_report_writer_subagent",
+    "build_slack_specialist_subagent",
+    "build_specialized_subagents",
+]
diff --git a/surfsense_backend/app/agents/new_chat/subagents/config.py b/surfsense_backend/app/agents/new_chat/subagents/config.py
new file mode 100644
index 0000000000..b993d2b063
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/subagents/config.py
@@ -0,0 +1,439 @@
+"""Builders for specialized SurfSense subagents.
+
+Each subagent is built from three pieces:
+
+1. A name + description + system prompt (the user-facing contract for
+   when ``task`` should delegate to this role).
+2. A filtered tool list (subset of the parent's bound tools).
+3. A :class:`PermissionMiddleware` instance carrying a deny ruleset that
+   prevents the subagent from acting outside its scope (e.g. an
+   explore-only role cannot mutate state).
+
+Skill sources (``/skills/builtin/`` + ``/skills/space/``) are inherited
+from the parent unconditionally — every subagent benefits from the same
+authored guidance documents.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable, Sequence
+from typing import TYPE_CHECKING, Any
+
+from app.agents.new_chat.middleware.skills_backends import default_skills_sources
+from app.agents.new_chat.permissions import Rule, Ruleset
+from app.agents.new_chat.subagents.providers.linear import (
+    build_linear_specialist_subagent,
+)
+from app.agents.new_chat.subagents.providers.slack import (
+    build_slack_specialist_subagent,
+)
+
+if TYPE_CHECKING:
+    from deepagents import SubAgent
+    from langchain_core.language_models import BaseChatModel
+    from langchain_core.tools import BaseTool
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Tool name constants
+# ---------------------------------------------------------------------------
+
+# Read-only tools that ``explore`` is permitted to use. Names match the
+# tools provided by the deepagents ``FilesystemMiddleware`` (``ls``, ``read_file``,
+# ``glob``, ``grep``) plus the SurfSense-side read tools.
+EXPLORE_READ_TOOLS: frozenset[str] = frozenset(
+    {
+        "search_surfsense_docs",
+        "web_search",
+        "scrape_webpage",
+        "read_file",
+        "ls",
+        "glob",
+        "grep",
+    }
+)
+
+# Tools ``report_writer`` may call. The set is intentionally narrow so the
+# subagent doesn't drift into tangential research; if richer source-gathering
+# is needed, the parent should hand off to ``explore`` first.
+REPORT_WRITER_TOOLS: frozenset[str] = frozenset(
+    {
+        "search_surfsense_docs",
+        "read_file",
+        "generate_report",
+    }
+)
+
+# Wildcard patterns that match write tools we deny by default in read-only
+# subagents. Anchored at start AND end via :func:`Rule` semantics. We use
+# substring-style ``*verb*`` patterns because connector tool names typically
+# put the verb in the middle (``linear_create_issue``, ``slack_send_message``,
+# ``notion_update_page``); strict suffix patterns (``*_create``) miss those.
+#
+# A handful of canonical exact-match names is appended so that bare verbs
+# (``edit``, ``write``) are also blocked even when a connector dropped the
+# usual prefix.
+WRITE_TOOL_DENY_PATTERNS: tuple[str, ...] = (
+    "*create*",
+    "*update*",
+    "*delete*",
+    "*send*",
+    "*write*",
+    "*edit*",
+    "*move*",
+    "*mkdir*",
+    "*upload*",
+    "edit_file",
+    "write_file",
+    "move_file",
+    "mkdir",
+    "rm",
+    "rmdir",
+    "update_memory",
+    "update_memory_team",
+    "update_memory_private",
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+# Tool names that are NOT in the registry's ``tools`` list because they
+# are provided dynamically by middleware at compile time. We don't pass
+# them through ``_filter_tools`` (the actual ``BaseTool`` instances live
+# inside the middleware), but we do exempt them from the "missing" warning
+# below — operators were seeing spurious noise like
+# ``missing: ['glob', 'grep', 'ls', 'read_file']`` even though those
+# tools are reachable via :class:`SurfSenseFilesystemMiddleware` once the
+# subagent is compiled.
+_MIDDLEWARE_PROVIDED_TOOL_NAMES: frozenset[str] = frozenset(
+    {
+        "ls",
+        "read_file",
+        "write_file",
+        "edit_file",
+        "glob",
+        "grep",
+        "execute",
+        "write_todos",
+        "task",
+    }
+)
+
+
+def _filter_tools(
+    tools: Sequence[BaseTool],
+    allowed_names: Iterable[str],
+) -> list[BaseTool]:
+    """Return only tools whose ``name`` appears in ``allowed_names``.
+
+    Tools are looked up by exact name. Names matching
+    :data:`_MIDDLEWARE_PROVIDED_TOOL_NAMES` are intentionally absent from
+    ``tools`` (they're injected by middleware at compile time) and are
+    silently excluded from the "missing" warning so operators don't see
+    false positives every build.
+    """
+    allowed = set(allowed_names)
+    selected = [t for t in tools if t.name in allowed]
+    missing = sorted(
+        (allowed - {t.name for t in selected}) - _MIDDLEWARE_PROVIDED_TOOL_NAMES
+    )
+    if missing:
+        logger.info(
+            "Subagent build: %d/%d registry tools available; missing: %s",
+            len(selected),
+            len(allowed - _MIDDLEWARE_PROVIDED_TOOL_NAMES),
+            missing,
+        )
+    return selected
+
+
+def _read_only_deny_rules() -> list[Rule]:
+    """Synthesize a list of deny rules covering common write-tool patterns."""
+    return [
+        Rule(permission=pattern, pattern="*", action="deny")
+        for pattern in WRITE_TOOL_DENY_PATTERNS
+    ]
+
+
+def _build_permission_middleware(deny_rules: list[Rule], origin: str):
+    """Construct a :class:`PermissionMiddleware` seeded with ``deny_rules``.
+
+    Imported lazily because the middleware module pulls in interrupt/HITL
+    machinery we don't want at import time of this config file.
+    """
+    from app.agents.new_chat.middleware.permission import PermissionMiddleware
+
+    return PermissionMiddleware(
+        rulesets=[Ruleset(rules=deny_rules, origin=origin)],
+    )
+
+
+def _wrap_with_subagent_essentials(
+    custom_middleware: list,
+    *,
+    agent_tools: Sequence[BaseTool],
+    extra_middleware: Sequence[Any] | None = None,
+):
+    """Compose the final middleware list for a specialized subagent.
+
+    Order, outer to inner:
+
+    1. ``extra_middleware`` — provided by the caller (typically the parent
+       agent's ``SurfSenseFilesystemMiddleware`` and ``TodoListMiddleware``)
+       so the subagent inherits the parent's filesystem/todo view. These
+       run **before** the subagent-local middleware so their tools are
+       wired up before permissioning kicks in.
+    2. ``custom_middleware`` — subagent-local rules (e.g. permission deny
+       lists).
+    3. :class:`PatchToolCallsMiddleware` — normalizes tool-call shapes.
+    4. :class:`DedupHITLToolCallsMiddleware` — collapses duplicate HITL
+       calls using metadata declared at registry time.
+
+    Without ``extra_middleware`` the subagent will only have the registry
+    tools listed in its ``tools`` field — meaning ``read_file``, ``ls``,
+    ``grep``, etc. won't exist. Always pass ``extra_middleware`` from the
+    parent unless you specifically want a sandboxed subagent.
+    """
+    from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+
+    from app.agents.new_chat.middleware import DedupHITLToolCallsMiddleware
+
+    return [
+        *(extra_middleware or []),
+        *custom_middleware,
+        PatchToolCallsMiddleware(),
+        DedupHITLToolCallsMiddleware(agent_tools=list(agent_tools)),
+    ]
+
+
+# ---------------------------------------------------------------------------
+# System prompts
+# ---------------------------------------------------------------------------
+
+EXPLORE_SYSTEM_PROMPT = """You are the **explore** subagent for SurfSense.
+
+## Your job
+Conduct read-only research across the user's knowledge base, the web, and any documents the parent agent has surfaced. Return a synthesized answer with explicit citations — never speculate beyond the sources you have actually inspected.
+
+## Tools available
+- `search_surfsense_docs` — fast hybrid search over the user's knowledge base.
+- `web_search` — only when the user's KB clearly does not contain the answer.
+- `scrape_webpage` — to read a URL the user or the search results provided.
+- `read_file`, `ls`, `glob`, `grep` — to inspect specific documents or trees the parent has flagged.
+
+## Rules
+- Read-only. You cannot create, edit, delete, send, or move anything.
+- Cite every claim. Use `[citation:chunk_id]` exactly as the chunk tag specifies.
+- If a sub-question has no support in the inspected sources, say so explicitly. Do not fabricate.
+- Return the most useful synthesis in your single final message. The parent agent will not be able to follow up.
+"""
+
+
+REPORT_WRITER_SYSTEM_PROMPT = """You are the **report_writer** subagent for SurfSense.
+
+## Your job
+Produce a single high-quality report deliverable using `generate_report`. The parent has already gathered (or knows where to gather) the underlying sources.
+
+## Workflow
+1. **Outline first.** Before calling `generate_report`, write a one-paragraph outline of the sections you plan to produce. Confirm the outline reflects the parent's instructions.
+2. **Source resolution.** Decide whether to call `search_surfsense_docs` and `read_file` for any final-checks, or whether the parent's earlier tool calls already cover the source set.
+3. **One report.** Call `generate_report` exactly once with `source_strategy` chosen per the topic and chat history (see the `report-writing` skill).
+4. **Confirm.** End with a one-sentence summary in your final message — never paste the report back into chat; the artifact card renders itself.
+"""
+
+
+CONNECTOR_NEGOTIATOR_SYSTEM_PROMPT = """You are the **connector_negotiator** subagent for SurfSense.
+
+## Your job
+Coordinate cross-connector workflows: chains where the result of one service's tool feeds into another's. Common shapes include "find Linear issues mentioned in last week's Slack messages", "draft a Gmail reply citing a Notion doc", or "list Linear tickets opened by the same person who filed Jira FOO-123".
+
+## Workflow
+1. **Plan.** Identify the connector hops needed and the order they should run in. Write a short plan in your first message.
+2. **Verify access.** Use `get_connected_accounts` to confirm the relevant connectors are actually wired up before issuing tool calls. If a connector is missing, stop and report — do not fabricate.
+3. **Execute.** Run each hop, citing IDs (issue keys, message ts, page IDs) in your scratch notes so the parent can audit.
+4. **Hand back.** Return a structured summary with the final answer plus the chain of evidence (issue → message → page, etc.).
+
+## Caveats
+- If a hop fails, do not retry blindly — return the partial result and explain.
+- Mutating tools (create, update, delete, send) require parent permission; you are NOT cleared to call them on your own.
+"""
+
+
+# ---------------------------------------------------------------------------
+# Subagent builders
+# ---------------------------------------------------------------------------
+
+
+def build_explore_subagent(
+    *,
+    tools: Sequence[BaseTool],
+    model: BaseChatModel | None = None,
+    extra_middleware: Sequence[Any] | None = None,
+) -> SubAgent:
+    """Build the read-only ``explore`` subagent spec.
+
+    Pass ``extra_middleware`` (typically the parent's filesystem + todo
+    middleware) so the subagent can actually use ``read_file``, ``ls``,
+    ``grep``, ``glob`` — which its system prompt promises but which only
+    exist when their middleware is mounted.
+    """
+    from deepagents import SubAgent  # noqa: F401  (TypedDict for type clarity)
+
+    selected_tools = _filter_tools(tools, EXPLORE_READ_TOOLS)
+    deny_rules = _read_only_deny_rules()
+    permission_mw = _build_permission_middleware(deny_rules, origin="subagent_explore")
+
+    spec: dict = {
+        "name": "explore",
+        "description": (
+            "Read-only research across the user's knowledge base and the web. "
+            "Use when the parent needs deeply-cited synthesis without "
+            "modifying anything."
+        ),
+        "system_prompt": EXPLORE_SYSTEM_PROMPT,
+        "tools": selected_tools,
+        "middleware": _wrap_with_subagent_essentials(
+            [permission_mw],
+            agent_tools=selected_tools,
+            extra_middleware=extra_middleware,
+        ),
+        "skills": default_skills_sources(),
+    }
+    if model is not None:
+        spec["model"] = model
+    return spec  # type: ignore[return-value]
+
+
+def build_report_writer_subagent(
+    *,
+    tools: Sequence[BaseTool],
+    model: BaseChatModel | None = None,
+    extra_middleware: Sequence[Any] | None = None,
+) -> SubAgent:
+    """Build the ``report_writer`` subagent spec.
+
+    Read-only deny ruleset still applies — the subagent should call
+    ``generate_report`` and nothing else mutating. ``generate_report``
+    creates a report artifact via a backend service and is intentionally
+    **not** denied.
+
+    Pass ``extra_middleware`` (typically the parent's filesystem + todo
+    middleware) so the subagent can run ``read_file`` for source-checks
+    before calling ``generate_report``.
+    """
+    selected_tools = _filter_tools(tools, REPORT_WRITER_TOOLS)
+    deny_rules = _read_only_deny_rules()
+    permission_mw = _build_permission_middleware(
+        deny_rules, origin="subagent_report_writer"
+    )
+
+    spec: dict = {
+        "name": "report_writer",
+        "description": (
+            "Produce a single Markdown report artifact via generate_report, "
+            "using the outline-then-fill protocol. Use when the parent has "
+            "decided a deliverable is needed."
+        ),
+        "system_prompt": REPORT_WRITER_SYSTEM_PROMPT,
+        "tools": selected_tools,
+        "middleware": _wrap_with_subagent_essentials(
+            [permission_mw],
+            agent_tools=selected_tools,
+            extra_middleware=extra_middleware,
+        ),
+        "skills": default_skills_sources(),
+    }
+    if model is not None:
+        spec["model"] = model
+    return spec  # type: ignore[return-value]
+
+
+def build_connector_negotiator_subagent(
+    *,
+    tools: Sequence[BaseTool],
+    model: BaseChatModel | None = None,
+    extra_middleware: Sequence[Any] | None = None,
+) -> SubAgent:
+    """Build the ``connector_negotiator`` subagent spec.
+
+    Inherits all MCP / connector tools the parent has plus
+    ``get_connected_accounts``. Read-only by default; permission rules deny
+    write/mutation patterns. The parent agent re-asks for permission if a
+    connector mutation is genuinely needed.
+
+    Pass ``extra_middleware`` (typically the parent's filesystem + todo
+    middleware) so this subagent shares the parent's filesystem view when
+    citing evidence across hops.
+    """
+    parent_tool_names = {t.name for t in tools}
+    allowed: set[str] = set()
+    if "get_connected_accounts" in parent_tool_names:
+        allowed.add("get_connected_accounts")
+    # Inherit anything that smells connector- or MCP-related but is not a
+    # bulk-write API. Heuristic: keep all parent tools; rely on the deny
+    # ruleset to block mutation patterns. This mirrors the plan: "all
+    # MCP/connector tools the parent has".
+    for name in parent_tool_names:
+        allowed.add(name)
+    selected_tools = _filter_tools(tools, allowed)
+
+    deny_rules = _read_only_deny_rules()
+    permission_mw = _build_permission_middleware(
+        deny_rules, origin="subagent_connector_negotiator"
+    )
+
+    spec: dict = {
+        "name": "connector_negotiator",
+        "description": (
+            "Coordinate read-only chains across connectors (Slack → Linear, "
+            "Notion → Gmail, etc.). Returns a structured summary with the "
+            "evidence chain. Cannot mutate connector state."
+        ),
+        "system_prompt": CONNECTOR_NEGOTIATOR_SYSTEM_PROMPT,
+        "tools": selected_tools,
+        "middleware": _wrap_with_subagent_essentials(
+            [permission_mw],
+            agent_tools=selected_tools,
+            extra_middleware=extra_middleware,
+        ),
+        "skills": default_skills_sources(),
+    }
+    if model is not None:
+        spec["model"] = model
+    return spec  # type: ignore[return-value]
+
+
+def build_specialized_subagents(
+    *,
+    tools: Sequence[BaseTool],
+    model: BaseChatModel | None = None,
+    extra_middleware: Sequence[Any] | None = None,
+) -> list[SubAgent]:
+    """Return the canonical list of specialized subagents to register.
+
+    Order matters only for the order they appear in the ``task`` tool
+    description — most useful first.
+    """
+    return [
+        build_explore_subagent(
+            tools=tools, model=model, extra_middleware=extra_middleware
+        ),
+        build_report_writer_subagent(
+            tools=tools, model=model, extra_middleware=extra_middleware
+        ),
+        build_linear_specialist_subagent(
+            tools=tools, model=model, extra_middleware=extra_middleware
+        ),
+        build_slack_specialist_subagent(
+            tools=tools, model=model, extra_middleware=extra_middleware
+        ),
+        build_connector_negotiator_subagent(
+            tools=tools, model=model, extra_middleware=extra_middleware
+        ),
+    ]
diff --git a/surfsense_backend/app/agents/new_chat/subagents/constants.py b/surfsense_backend/app/agents/new_chat/subagents/constants.py
new file mode 100644
index 0000000000..cb1da499bb
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/subagents/constants.py
@@ -0,0 +1,35 @@
+"""Shared constants for provider subagent safety policies."""
+
+from __future__ import annotations
+
+# Generic mutation-deny patterns for read-only specialist roles.
+WRITE_TOOL_DENY_PATTERNS: tuple[str, ...] = (
+    "*create*",
+    "*update*",
+    "*delete*",
+    "*send*",
+    "*write*",
+    "*edit*",
+    "*move*",
+    "*mkdir*",
+    "*upload*",
+    "edit_file",
+    "write_file",
+    "move_file",
+    "mkdir",
+    "update_memory",
+    "update_memory_team",
+    "update_memory_private",
+)
+
+# Tools that mutate virtual KB filesystem or parent/global chat state.
+# Provider specialists should not mutate these surfaces directly.
+NON_PROVIDER_STATE_MUTATION_DENY: frozenset[str] = frozenset(
+    {
+        # Exact tool names from shared deny patterns.
+        *{name for name in WRITE_TOOL_DENY_PATTERNS if "*" not in name},
+        # Additional non-provider state mutation controls.
+        "write_todos",
+        "task",
+    }
+)
diff --git a/surfsense_backend/app/agents/new_chat/subagents/providers/linear.py b/surfsense_backend/app/agents/new_chat/subagents/providers/linear.py
new file mode 100644
index 0000000000..da332fe28d
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/subagents/providers/linear.py
@@ -0,0 +1,162 @@
+"""Linear provider specialist subagent.
+
+This file is intentionally standalone so provider specialists can be reviewed
+and evolved independently (one provider per file).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+from app.agents.new_chat.subagents.constants import NON_PROVIDER_STATE_MUTATION_DENY
+from app.services.mcp_oauth.registry import (
+    LINEAR_MCP_READONLY_TOOL_NAMES,
+    linear_mcp_original_tool_name,
+)
+
+if TYPE_CHECKING:
+    from deepagents import SubAgent
+    from langchain_core.language_models import BaseChatModel
+    from langchain_core.tools import BaseTool
+
+
+# Read vs write Linear MCP tools are defined in
+# ``app.services.mcp_oauth.registry`` (``LINEAR_MCP_READONLY_TOOL_NAMES`` /
+# ``LINEAR_MCP_WRITE_TOOL_NAMES``). Any other Linear-domain tool requires approval.
+
+LINEAR_SYSTEM_PROMPT = """You are the linear_specialist subagent for SurfSense.
+
+Role:
+- You are the Linear domain specialist. Handle Linear-only requests accurately.
+
+Primary objective:
+- Resolve the user's Linear task and return a concise, auditable result.
+
+Routing boundary:
+- Use this subagent for Linear-domain tasks (issues, status, assignees, labels,
+  teams, and project references).
+- If the task is primarily non-Linear or cross-connector orchestration, return
+  status=needs_input and hand control back to the parent with the exact next hop.
+
+Execution steps:
+1) Verify Linear access first (use get_connected_accounts if needed).
+2) Prefer read/list tools first to gather current issue facts before concluding.
+3) Track key identifiers in your reasoning: issue ID, issue key, team ID, label ID.
+4) If required identifiers are missing, ask the parent for exactly what is missing.
+5) Return a compact result with findings + evidence references.
+
+Output format:
+- status: success | needs_input | blocked | error
+- summary: one short paragraph
+- evidence: bullet list of concrete IDs / issue keys used
+- next_step: one sentence (only when blocked or needs_input)
+
+Constraints:
+- Do not invent issue keys, IDs, or workflow state names.
+- Mutating Linear operations are allowed only with explicit approval.
+- If Linear connector access is unavailable, stop and return status=blocked.
+"""
+
+
+def _select_linear_tools(tools: Sequence[BaseTool]) -> list[BaseTool]:
+    """Keep Linear tools plus minimal shared read utilities."""
+    allowed_exact = {
+        "get_connected_accounts",
+        "read_file",
+        "ls",
+        "glob",
+        "grep",
+    }
+    selected: list[BaseTool] = []
+    for tool in tools:
+        if tool.name in allowed_exact:
+            selected.append(tool)
+            continue
+        if linear_mcp_original_tool_name(tool.name) is not None:
+            selected.append(tool)
+            continue
+        if tool.name.startswith("linear_") or tool.name.endswith("_linear_issue"):
+            selected.append(tool)
+    return selected
+
+
+def _is_linear_readonly_tool_name(name: str) -> bool:
+    """Return True when a tool name maps to a read-only Linear MCP operation."""
+    base = linear_mcp_original_tool_name(name)
+    return base is not None and base in LINEAR_MCP_READONLY_TOOL_NAMES
+
+
+def _is_linear_domain_tool_name(name: str) -> bool:
+    """Return True for Linear-domain tools handled by this specialist."""
+    if linear_mcp_original_tool_name(name) is not None:
+        return True
+    return name.startswith("linear_") or name.endswith("_linear_issue")
+
+
+def _permission_middleware(*, selected_tools: Sequence[BaseTool]) -> Any:
+    """Permission policy for Linear specialist."""
+    from app.agents.new_chat.middleware.permission import PermissionMiddleware
+
+    ask_tools = sorted(
+        {
+            tool.name
+            for tool in selected_tools
+            if _is_linear_domain_tool_name(tool.name)
+            and not _is_linear_readonly_tool_name(tool.name)
+        }
+    )
+    rules: list[Rule] = [Rule(permission="*", pattern="*", action="allow")]
+    rules.extend(
+        Rule(permission=name, pattern="*", action="deny")
+        for name in NON_PROVIDER_STATE_MUTATION_DENY
+    )
+    rules.extend(Rule(permission=name, pattern="*", action="ask") for name in ask_tools)
+    return PermissionMiddleware(
+        rulesets=[Ruleset(rules=rules, origin="subagent_linear_specialist")]
+    )
+
+
+def _wrap_subagent_middleware(
+    *,
+    selected_tools: Sequence[BaseTool],
+    extra_middleware: Sequence[Any] | None,
+) -> list[Any]:
+    """Apply standard middleware chain used by other subagents."""
+    from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+
+    from app.agents.new_chat.middleware import DedupHITLToolCallsMiddleware
+
+    return [
+        *(extra_middleware or []),
+        _permission_middleware(selected_tools=selected_tools),
+        PatchToolCallsMiddleware(),
+        DedupHITLToolCallsMiddleware(agent_tools=list(selected_tools)),
+    ]
+
+
+def build_linear_specialist_subagent(
+    *,
+    tools: Sequence[BaseTool],
+    model: BaseChatModel | None = None,
+    extra_middleware: Sequence[Any] | None = None,
+) -> SubAgent:
+    """Build the ``linear_specialist`` provider subagent spec."""
+    selected_tools = _select_linear_tools(tools)
+    spec: dict[str, Any] = {
+        "name": "linear_specialist",
+        "description": (
+            "Linear operations specialist for issue and workflow requests, "
+            "with strict evidence tracking and approval-gated mutating operations."
+        ),
+        "system_prompt": LINEAR_SYSTEM_PROMPT,
+        "tools": selected_tools,
+        "middleware": _wrap_subagent_middleware(
+            selected_tools=selected_tools,
+            extra_middleware=extra_middleware,
+        ),
+    }
+    if model is not None:
+        spec["model"] = model
+    return spec  # type: ignore[return-value]
diff --git a/surfsense_backend/app/agents/new_chat/subagents/providers/slack.py b/surfsense_backend/app/agents/new_chat/subagents/providers/slack.py
new file mode 100644
index 0000000000..90ca801522
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/subagents/providers/slack.py
@@ -0,0 +1,170 @@
+"""Slack provider specialist subagent.
+
+This file is intentionally standalone so provider specialists can be reviewed
+and evolved independently (one provider per file).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any
+
+from app.agents.new_chat.permissions import Rule, Ruleset
+from app.agents.new_chat.subagents.constants import NON_PROVIDER_STATE_MUTATION_DENY
+
+if TYPE_CHECKING:
+    from deepagents import SubAgent
+    from langchain_core.language_models import BaseChatModel
+    from langchain_core.tools import BaseTool
+
+
+# Official references:
+# - https://docs.slack.dev/ai/slack-mcp-server
+# - https://www.npmjs.com/package/@modelcontextprotocol/server-slack
+#
+# Policy: only known read-only Slack tools are auto-allowed. Any other
+# ``slack_*`` tool is treated as mutating and requires explicit approval.
+SLACK_READONLY_TOOL_NAMES: frozenset[str] = frozenset(
+    {
+        # Slack-hosted MCP read tools
+        "slack_search_channels",
+        "slack_read_channel",
+        "slack_read_thread",
+        "slack_read_canvas",
+        "slack_read_user_profile",
+        # modelcontextprotocol/server-slack read tools
+        "slack_list_channels",
+        "slack_get_channel_history",
+        "slack_get_thread_replies",
+        "slack_get_users",
+        "slack_get_user_profile",
+    }
+)
+
+SLACK_SYSTEM_PROMPT = """You are the slack_specialist subagent for SurfSense.
+
+Role:
+- You are the Slack domain specialist. Handle Slack-only requests accurately.
+
+Primary objective:
+- Resolve the user's Slack task and return a concise, auditable result.
+
+Routing boundary:
+- Use this subagent for Slack-domain tasks (channels, threads, users, messages,
+  and Slack canvases).
+- If the task is primarily non-Slack or cross-connector orchestration, return
+  status=needs_input and hand control back to the parent with the exact next hop.
+
+Execution steps:
+1) Verify Slack access first (use get_connected_accounts if needed).
+2) Prefer read/list tools first to gather facts before concluding.
+3) Track key identifiers in your reasoning: channel ID, message ts, thread ts, user ID.
+4) If required identifiers are missing, ask the parent for exactly what is missing.
+5) Return a compact result with findings + evidence references.
+
+Output format:
+- status: success | needs_input | blocked | error
+- summary: one short paragraph
+- evidence: bullet list of concrete IDs / timestamps used
+- next_step: one sentence (only when blocked or needs_input)
+
+Constraints:
+- Do not invent Slack IDs, channels, users, or message content.
+- Mutating Slack operations are allowed only with explicit approval.
+- If Slack connector access is unavailable, stop and return status=blocked.
+"""
+
+
+def _select_slack_tools(tools: Sequence[BaseTool]) -> list[BaseTool]:
+    """Keep Slack tools plus minimal shared read utilities."""
+    allowed_exact = {
+        "get_connected_accounts",
+        "read_file",
+        "ls",
+        "glob",
+        "grep",
+    }
+    slack_prefix = "slack_"
+    selected: list[BaseTool] = []
+    for tool in tools:
+        if tool.name in allowed_exact:
+            selected.append(tool)
+            continue
+        if tool.name.startswith(slack_prefix):
+            selected.append(tool)
+    return selected
+
+
+def _permission_middleware(*, selected_tools: Sequence[BaseTool]) -> Any:
+    """Permission policy for Slack specialist.
+
+    Intent:
+    - Allow Slack-domain operations by default.
+    - Gate Slack mutating operations behind approval (`ask`).
+    - Hard-deny non-Slack state mutations, especially KB virtual filesystem
+      mutation and parent-context mutation tools.
+    """
+    from app.agents.new_chat.middleware.permission import PermissionMiddleware
+
+    ask_tools = sorted(
+        {
+            tool.name
+            for tool in selected_tools
+            if tool.name.startswith("slack_")
+            and tool.name not in SLACK_READONLY_TOOL_NAMES
+        }
+    )
+    rules: list[Rule] = [Rule(permission="*", pattern="*", action="allow")]
+    rules.extend(
+        Rule(permission=name, pattern="*", action="deny")
+        for name in NON_PROVIDER_STATE_MUTATION_DENY
+    )
+    rules.extend(Rule(permission=name, pattern="*", action="ask") for name in ask_tools)
+    return PermissionMiddleware(
+        rulesets=[Ruleset(rules=rules, origin="subagent_slack_specialist")]
+    )
+
+
+def _wrap_subagent_middleware(
+    *,
+    selected_tools: Sequence[BaseTool],
+    extra_middleware: Sequence[Any] | None,
+) -> list[Any]:
+    """Apply standard middleware chain used by other subagents."""
+    from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+
+    from app.agents.new_chat.middleware import DedupHITLToolCallsMiddleware
+
+    return [
+        *(extra_middleware or []),
+        _permission_middleware(selected_tools=selected_tools),
+        PatchToolCallsMiddleware(),
+        DedupHITLToolCallsMiddleware(agent_tools=list(selected_tools)),
+    ]
+
+
+def build_slack_specialist_subagent(
+    *,
+    tools: Sequence[BaseTool],
+    model: BaseChatModel | None = None,
+    extra_middleware: Sequence[Any] | None = None,
+) -> SubAgent:
+    """Build the ``slack_specialist`` provider subagent spec."""
+    selected_tools = _select_slack_tools(tools)
+    spec: dict[str, Any] = {
+        "name": "slack_specialist",
+        "description": (
+            "Slack operations specialist for any Slack-domain request "
+            "(channels, threads, users, and messages), with strict evidence "
+            "tracking and approval-gated mutating operations."
+        ),
+        "system_prompt": SLACK_SYSTEM_PROMPT,
+        "tools": selected_tools,
+        "middleware": _wrap_subagent_middleware(
+            selected_tools=selected_tools,
+            extra_middleware=extra_middleware,
+        ),
+    }
+    if model is not None:
+        spec["model"] = model
+    return spec  # type: ignore[return-value]
diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py
index 3d2442be84..70634c65de 100644
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@@ -1,653 +1,47 @@
 """
-System prompt building for SurfSense agents.
+Thin compatibility wrapper around :mod:`app.agents.new_chat.prompts.composer`.
 
-This module provides functions and constants for building the SurfSense system prompt
-with configurable user instructions and citation support.
+The composer split the previous monolithic prompt string into a fragment
+tree under ``prompts/`` plus a model-family dispatch step (see the
+composer module docstring for credits). This module preserves the public
+function surface (``build_surfsense_system_prompt`` /
+``build_configurable_system_prompt`` /
+``get_default_system_instructions`` / ``SURFSENSE_SYSTEM_PROMPT``) so
+that existing call sites — `chat_deepagent.py`, anonymous chat routes,
+and the configurable-prompt admin path — keep working without churn.
 
-The prompt is composed of three parts:
-1. System Instructions (configurable via NewLLMConfig)
-2. Tools Instructions (always included, not configurable)
-3. Citation Instructions (toggleable via NewLLMConfig.citations_enabled)
+For new call sites prefer importing ``compose_system_prompt`` directly
+from :mod:`app.agents.new_chat.prompts.composer`.
 """
 
+from __future__ import annotations
+
 from datetime import UTC, datetime
 
 from app.db import ChatVisibility
 
-# Default system instructions - can be overridden via NewLLMConfig.system_instructions
-SURFSENSE_SYSTEM_INSTRUCTIONS = """
-<system_instruction>
-You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
-
-Today's date (UTC): {resolved_today}
-
-When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
-
-NEVER expose internal tool parameter names, backend IDs, or implementation details to the user. Always use natural, user-friendly language instead.
-
-<knowledge_base_only_policy>
-CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
-- You MUST answer questions ONLY using information retrieved from the user's knowledge base, web search results, scraped webpages, or other tool outputs.
-- You MUST NOT answer factual or informational questions from your own training data or general knowledge unless the user explicitly grants permission.
-- If the knowledge base search returns no relevant results AND no other tool provides the answer, you MUST:
-  1. Inform the user that you could not find relevant information in their knowledge base.
-  2. Ask the user: "Would you like me to answer from my general knowledge instead?"
-  3. ONLY provide a general-knowledge answer AFTER the user explicitly says yes.
-- This policy does NOT apply to:
-  * Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
-  * Formatting, summarization, or analysis of content already present in the conversation
-  * Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
-  * Tool-usage actions like generating reports, podcasts, images, or scraping webpages
-</knowledge_base_only_policy>
-
-</system_instruction>
-"""
-
-# Default system instructions for shared (team) threads: team context + message format for attribution
-_SYSTEM_INSTRUCTIONS_SHARED = """
-<system_instruction>
-You are SurfSense, a reasoning and acting AI agent designed to answer questions in this team space using the team's shared knowledge base.
-
-In this team thread, each message is prefixed with **[DisplayName of the author]**. Use this to attribute and reference the author of anything in the discussion (who asked a question, made a suggestion, or contributed an idea) and to cite who said what in your answers.
-
-Today's date (UTC): {resolved_today}
-
-When writing mathematical formulas or equations, ALWAYS use LaTeX notation. NEVER use backtick code spans or Unicode symbols for math.
-
-NEVER expose internal tool parameter names, backend IDs, or implementation details to the user. Always use natural, user-friendly language instead.
-
-<knowledge_base_only_policy>
-CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
-- You MUST answer questions ONLY using information retrieved from the team's shared knowledge base, web search results, scraped webpages, or other tool outputs.
-- You MUST NOT answer factual or informational questions from your own training data or general knowledge unless a team member explicitly grants permission.
-- If the knowledge base search returns no relevant results AND no other tool provides the answer, you MUST:
-  1. Inform the team that you could not find relevant information in the shared knowledge base.
-  2. Ask: "Would you like me to answer from my general knowledge instead?"
-  3. ONLY provide a general-knowledge answer AFTER a team member explicitly says yes.
-- This policy does NOT apply to:
-  * Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
-  * Formatting, summarization, or analysis of content already present in the conversation
-  * Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
-  * Tool-usage actions like generating reports, podcasts, images, or scraping webpages
-</knowledge_base_only_policy>
-
-</system_instruction>
-"""
-
-
-def _get_system_instructions(
-    thread_visibility: ChatVisibility | None = None, today: datetime | None = None
-) -> str:
-    """Build system instructions based on thread visibility (private vs shared)."""
-
-    resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
-    visibility = thread_visibility or ChatVisibility.PRIVATE
-    if visibility == ChatVisibility.SEARCH_SPACE:
-        return _SYSTEM_INSTRUCTIONS_SHARED.format(resolved_today=resolved_today)
-    else:
-        return SURFSENSE_SYSTEM_INSTRUCTIONS.format(resolved_today=resolved_today)
+from .prompts.composer import (
+    _read_fragment,
+    compose_system_prompt,
+    detect_provider_variant,
+)
 
+# Optional routing fragments under ``prompts/routing/`` (see composer).
+_DEFAULT_CONNECTOR_ROUTING: tuple[str, ...] = ("linear", "slack")
 
-# =============================================================================
-# Per-tool prompt instructions keyed by registry tool name.
-# Only tools present in the enabled set will be included in the system prompt.
-# =============================================================================
-
-_TOOLS_PREAMBLE = """
-<tools>
-You have access to the following tools:
-
-IMPORTANT: You can ONLY use the tools listed below. If a capability is not listed here, you do NOT have it.
-Do NOT claim you can do something if the corresponding tool is not listed.
-
-"""
+# Public re-exports for backwards compatibility (some legacy code reads the
+# raw default-instructions text directly).
+SURFSENSE_SYSTEM_INSTRUCTIONS_TEMPLATE = (
+    "<system_instruction>\nDefault SurfSense agent system instructions are now\n"
+    "composed from prompts/base/*.md. See compose_system_prompt() for details.\n"
+    "</system_instruction>"
+)
 
-_TOOL_INSTRUCTIONS: dict[str, str] = {}
-
-_TOOL_INSTRUCTIONS["search_surfsense_docs"] = """
-- search_surfsense_docs: Search the official SurfSense documentation.
-  - Use this tool when the user asks anything about SurfSense itself (the application they are using).
-  - Args:
-    - query: The search query about SurfSense
-    - top_k: Number of documentation chunks to retrieve (default: 10)
-  - Returns: Documentation content with chunk IDs for citations (prefixed with 'doc-', e.g., [citation:doc-123])
-"""
-
-_TOOL_INSTRUCTIONS["generate_podcast"] = """
-- generate_podcast: Generate an audio podcast from provided content.
-  - Use this when the user asks to create, generate, or make a podcast.
-  - Trigger phrases: "give me a podcast about", "create a podcast", "generate a podcast", "make a podcast", "turn this into a podcast"
-  - Args:
-    - source_content: The text content to convert into a podcast. This MUST be comprehensive and include:
-      * If discussing the current conversation: Include a detailed summary of the FULL chat history (all user questions and your responses)
-      * If based on knowledge base search: Include the key findings and insights from the search results
-      * You can combine both: conversation context + search results for richer podcasts
-      * The more detailed the source_content, the better the podcast quality
-    - podcast_title: Optional title for the podcast (default: "SurfSense Podcast")
-    - user_prompt: Optional instructions for podcast style/format (e.g., "Make it casual and fun")
-  - Returns: A task_id for tracking. The podcast will be generated in the background.
-  - IMPORTANT: Only one podcast can be generated at a time. If a podcast is already being generated, the tool will return status "already_generating".
-  - After calling this tool, inform the user that podcast generation has started and they will see the player when it's ready (takes 3-5 minutes).
-"""
-
-_TOOL_INSTRUCTIONS["generate_video_presentation"] = """
-- generate_video_presentation: Generate a video presentation from provided content.
-  - Use this when the user asks to create a video, presentation, slides, or slide deck.
-  - Trigger phrases: "give me a presentation", "create slides", "generate a video", "make a slide deck", "turn this into a presentation"
-  - Args:
-    - source_content: The text content to turn into a presentation. The more detailed, the better.
-    - video_title: Optional title (default: "SurfSense Presentation")
-    - user_prompt: Optional style instructions (e.g., "Make it technical and detailed")
-  - After calling this tool, inform the user that generation has started and they will see the presentation when it's ready.
-"""
-
-_TOOL_INSTRUCTIONS["generate_report"] = """
-- generate_report: Generate or revise a structured Markdown report artifact.
-  - WHEN TO CALL THIS TOOL — the message must contain a creation or modification VERB directed at producing a deliverable:
-    * Creation verbs: write, create, generate, draft, produce, summarize into, turn into, make
-    * Modification verbs: revise, update, expand, add (a section), rewrite, make (it shorter/longer/formal)
-    * Example triggers: "generate a report about...", "write a document on...", "add a section about budget", "make the report shorter", "rewrite in formal tone"
-  - WHEN NOT TO CALL THIS TOOL (answer in chat instead):
-    * Questions or discussion about the report: "What can we add?", "What's missing?", "Is the data accurate?", "How could this be improved?"
-    * Suggestions or brainstorming: "What other topics could be covered?", "What else could be added?", "What would make this better?"
-    * Asking for explanations: "Can you explain section 2?", "Why did you include that?", "What does this part mean?"
-    * Quick follow-ups or critiques: "Is the conclusion strong enough?", "Are there any gaps?", "What about the competitors?"
-    * THE TEST: Does the message contain a creation/modification VERB (from the list above) directed at producing or changing a deliverable? If NO verb → answer conversationally in chat. Do NOT assume the user wants a revision just because a report exists in the conversation.
-  - IMPORTANT FORMAT RULE: Reports are ALWAYS generated in Markdown.
-  - Args:
-    - topic: Short title for the report (max ~8 words).
-    - source_content: The text content to base the report on.
-      * For source_strategy="conversation" or "provided": Include a comprehensive summary of the relevant content.
-      * For source_strategy="kb_search": Can be empty or minimal — the tool handles searching internally.
-      * For source_strategy="auto": Include what you have; the tool searches KB if it's not enough.
-    - source_strategy: Controls how the tool collects source material. One of:
-      * "conversation" — The conversation already contains enough context (prior Q&A, discussion, pasted text, scraped pages). Pass a thorough summary as source_content.
-      * "kb_search" — The tool will search the knowledge base internally. Provide search_queries with 1-5 targeted queries.
-      * "auto" — Use source_content if sufficient, otherwise fall back to internal KB search using search_queries.
-      * "provided" — Use only what is in source_content (default, backward-compatible).
-    - search_queries: When source_strategy is "kb_search" or "auto", provide 1-5 specific search queries for the knowledge base. These should be precise, not just the topic name repeated.
-    - report_style: Controls report depth. Options: "detailed" (DEFAULT), "deep_research", "brief".
-      Use "brief" ONLY when the user explicitly asks for a short/concise/one-page report (e.g., "one page", "keep it short", "brief report", "500 words"). Default to "detailed" for all other requests.
-    - user_instructions: Optional specific instructions (e.g., "focus on financial impacts", "include recommendations"). When revising (parent_report_id set), describe WHAT TO CHANGE. If the user mentions a length preference (e.g., "one page", "500 words", "2 pages"), include that VERBATIM here AND set report_style="brief".
-    - parent_report_id: Set this to the report_id from a previous generate_report result when the user wants to MODIFY an existing report. Do NOT set it for new reports or questions about reports.
-  - Returns: A dictionary with status "ready" or "failed", report_id, title, and word_count.
-  - The report is generated immediately in Markdown and displayed inline in the chat.
-  - Export/download formats (PDF, DOCX, HTML, LaTeX, EPUB, ODT, plain text) are produced from the generated Markdown report.
-  - SOURCE STRATEGY DECISION (HIGH PRIORITY — follow this exactly):
-    * If the conversation already has substantive Q&A / discussion on the topic → use source_strategy="conversation" with a comprehensive summary as source_content.
-    * If the user wants a report on a topic not yet discussed → use source_strategy="kb_search" with targeted search_queries.
-    * If you have some content but might need more → use source_strategy="auto" with both source_content and search_queries.
-    * When revising an existing report (parent_report_id set) and the conversation has relevant context → use source_strategy="conversation". The revision will use the previous report content plus your source_content.
-    * NEVER run a separate KB lookup step and then pass those results to generate_report. The tool handles KB search internally.
-  - AFTER CALLING THIS TOOL: Do NOT repeat, summarize, or reproduce the report content in the chat. The report is already displayed as an interactive card that the user can open, read, copy, and export. Simply confirm that the report was generated (e.g., "I've generated your report on [topic]. You can view the Markdown report now, and export it in various formats from the card."). NEVER write out the report text in the chat.
-"""
-
-_TOOL_INSTRUCTIONS["generate_image"] = """
-- generate_image: Generate images from text descriptions using AI image models.
-  - Use this when the user asks you to create, generate, draw, design, or make an image.
-  - Trigger phrases: "generate an image of", "create a picture of", "draw me", "make an image", "design a logo", "create artwork"
-  - Args:
-    - prompt: A detailed text description of the image to generate. Be specific about subject, style, colors, composition, and mood.
-    - n: Number of images to generate (1-4, default: 1)
-  - Returns: A dictionary with the generated image metadata. The image will automatically be displayed in the chat.
-  - IMPORTANT: Write a detailed, descriptive prompt for best results. Don't just pass the user's words verbatim -
-    expand and improve the prompt with specific details about style, lighting, composition, and mood.
-  - If the user's request is vague (e.g., "make me an image of a cat"), enhance the prompt with artistic details.
-"""
-
-_TOOL_INSTRUCTIONS["scrape_webpage"] = """
-- scrape_webpage: Scrape and extract the main content from a webpage.
-  - Use this when the user wants you to READ and UNDERSTAND the actual content of a webpage.
-  - CRITICAL — WHEN TO USE (always attempt scraping, never refuse before trying):
-    * When a user asks to "get", "fetch", "pull", "grab", "scrape", or "read" content from a URL
-    * When the user wants live/dynamic data from a specific webpage (e.g., tables, scores, stats, prices)
-    * When a URL was mentioned earlier in the conversation and the user asks for its actual content
-    * When preloaded `/documents/` data is insufficient and the user wants more
-  - Trigger scenarios:
-    * "Read this article and summarize it"
-    * "What does this page say about X?"
-    * "Summarize this blog post for me"
-    * "Tell me the key points from this article"
-    * "What's in this webpage?"
-    * "Can you analyze this article?"
-    * "Can you get the live table/data from [URL]?"
-    * "Scrape it" / "Can you scrape that?" (referring to a previously mentioned URL)
-    * "Fetch the content from [URL]"
-    * "Pull the data from that page"
-  - Args:
-    - url: The URL of the webpage to scrape (must be HTTP/HTTPS)
-    - max_length: Maximum content length to return (default: 50000 chars)
-  - Returns: The page title, description, full content (in markdown), word count, and metadata
-  - After scraping, provide a comprehensive, well-structured summary with key takeaways using headings or bullet points.
-  - Reference the source using markdown links [descriptive text](url) — never bare URLs.
-  - IMAGES: The scraped content may contain image URLs in markdown format like `![alt text](image_url)`.
-    * When you find relevant/important images in the scraped content, include them in your response using standard markdown image syntax: `![alt text](image_url)`.
-    * This makes your response more visual and engaging.
-    * Prioritize showing: diagrams, charts, infographics, key illustrations, or images that help explain the content.
-    * Don't show every image - just the most relevant 1-3 images that enhance understanding.
-"""
-
-_TOOL_INSTRUCTIONS["web_search"] = """
-- web_search: Search the web for real-time information using all configured search engines.
-  - Use this for current events, news, prices, weather, public facts, or any question requiring
-    up-to-date information from the internet.
-  - This tool dispatches to all configured search engines (SearXNG, Tavily, Linkup, Baidu) in
-    parallel and merges the results.
-  - IMPORTANT (REAL-TIME / PUBLIC WEB QUERIES): For questions that require current public web data
-    (e.g., live exchange rates, stock prices, breaking news, weather, current events), you MUST call
-    `web_search` instead of answering from memory.
-  - For these real-time/public web queries, DO NOT answer from memory and DO NOT say you lack internet
-    access before attempting a web search.
-  - If the search returns no relevant results, explain that web sources did not return enough
-    data and ask the user if they want you to retry with a refined query.
-  - Args:
-    - query: The search query - use specific, descriptive terms
-    - top_k: Number of results to retrieve (default: 10, max: 50)
-  - If search snippets are insufficient for the user's question, use `scrape_webpage` on the most relevant result URL for full content.
-  - When presenting results, reference sources as markdown links [descriptive text](url) — never bare URLs.
-"""
-
-# Memory tool instructions have private and shared variants.
-# We store them keyed as "save_memory" / "recall_memory" with sub-keys.
-_MEMORY_TOOL_INSTRUCTIONS: dict[str, dict[str, str]] = {
-    "save_memory": {
-        "private": """
-- save_memory: Save facts, preferences, or context for personalized responses.
-  - Use this when the user explicitly or implicitly shares information worth remembering.
-  - Trigger scenarios:
-    * User says "remember this", "keep this in mind", "note that", or similar
-    * User shares personal preferences (e.g., "I prefer Python over JavaScript")
-    * User shares facts about themselves (e.g., "I'm a senior developer at Company X")
-    * User gives standing instructions (e.g., "always respond in bullet points")
-    * User shares project context (e.g., "I'm working on migrating our codebase to TypeScript")
-  - Args:
-    - content: The fact/preference to remember. Phrase it clearly:
-      * "User prefers dark mode for all interfaces"
-      * "User is a senior Python developer"
-      * "User wants responses in bullet point format"
-      * "User is working on project called ProjectX"
-    - category: Type of memory:
-      * "preference": User preferences (coding style, tools, formats)
-      * "fact": Facts about the user (role, expertise, background)
-      * "instruction": Standing instructions (response format, communication style)
-      * "context": Current context (ongoing projects, goals, challenges)
-  - Returns: Confirmation of saved memory
-  - IMPORTANT: Only save information that would be genuinely useful for future conversations.
-    Don't save trivial or temporary information.
-""",
-        "shared": """
-- save_memory: Save a fact, preference, or context to the team's shared memory for future reference.
-  - Use this when the user or a team member says "remember this", "keep this in mind", or similar in this shared chat.
-  - Use when the team agrees on something to remember (e.g., decisions, conventions).
-  - Someone shares a preference or fact that should be visible to the whole team.
-  - The saved information will be available in future shared conversations in this space.
-  - Args:
-    - content: The fact/preference/context to remember. Phrase it clearly, e.g. "API keys are stored in Vault", "The team prefers weekly demos on Fridays"
-    - category: Type of memory. One of:
-      * "preference": Team or workspace preferences
-      * "fact": Facts the team agreed on (e.g., processes, locations)
-      * "instruction": Standing instructions for the team
-      * "context": Current context (e.g., ongoing projects, goals)
-  - Returns: Confirmation of saved memory; returned context may include who added it (added_by).
-  - IMPORTANT: Only save information that would be genuinely useful for future team conversations in this space.
-""",
-    },
-    "recall_memory": {
-        "private": """
-- recall_memory: Retrieve relevant memories about the user for personalized responses.
-  - Use this to access stored information about the user.
-  - Trigger scenarios:
-    * You need user context to give a better, more personalized answer
-    * User references something they mentioned before
-    * User asks "what do you know about me?" or similar
-    * Personalization would significantly improve response quality
-    * Before making recommendations that should consider user preferences
-  - Args:
-    - query: Optional search query to find specific memories (e.g., "programming preferences")
-    - category: Optional filter by category ("preference", "fact", "instruction", "context")
-    - top_k: Number of memories to retrieve (default: 5)
-  - Returns: Relevant memories formatted as context
-  - IMPORTANT: Use the recalled memories naturally in your response without explicitly
-    stating "Based on your memory..." - integrate the context seamlessly.
-""",
-        "shared": """
-- recall_memory: Recall relevant team memories for this space to provide contextual responses.
-  - Use when you need team context to answer (e.g., "where do we store X?", "what did we decide about Y?").
-  - Use when someone asks about something the team agreed to remember.
-  - Use when team preferences or conventions would improve the response.
-  - Args:
-    - query: Optional search query to find specific memories. If not provided, returns the most recent memories.
-    - category: Optional filter by category ("preference", "fact", "instruction", "context")
-    - top_k: Number of memories to retrieve (default: 5, max: 20)
-  - Returns: Relevant team memories and formatted context (may include added_by). Integrate naturally without saying "Based on team memory...".
-""",
-    },
-}
-
-_MEMORY_TOOL_EXAMPLES: dict[str, dict[str, str]] = {
-    "save_memory": {
-        "private": """
-- User: "Remember that I prefer TypeScript over JavaScript"
-  - Call: `save_memory(content="User prefers TypeScript over JavaScript for development", category="preference")`
-- User: "I'm a data scientist working on ML pipelines"
-  - Call: `save_memory(content="User is a data scientist working on ML pipelines", category="fact")`
-- User: "Always give me code examples in Python"
-  - Call: `save_memory(content="User wants code examples to be written in Python", category="instruction")`
-""",
-        "shared": """
-- User: "Remember that API keys are stored in Vault"
-  - Call: `save_memory(content="API keys are stored in Vault", category="fact")`
-- User: "Let's remember that the team prefers weekly demos on Fridays"
-  - Call: `save_memory(content="The team prefers weekly demos on Fridays", category="preference")`
-""",
-    },
-    "recall_memory": {
-        "private": """
-- User: "What programming language should I use for this project?"
-  - First recall: `recall_memory(query="programming language preferences")`
-  - Then provide a personalized recommendation based on their preferences
-- User: "What do you know about me?"
-  - Call: `recall_memory(top_k=10)`
-  - Then summarize the stored memories
-""",
-        "shared": """
-- User: "What did we decide about the release date?"
-  - First recall: `recall_memory(query="release date decision")`
-  - Then answer based on the team memories
-- User: "Where do we document onboarding?"
-  - Call: `recall_memory(query="onboarding documentation")`
-  - Then answer using the recalled team context
-""",
-    },
-}
-
-# Per-tool examples keyed by tool name. Only examples for enabled tools are included.
-_TOOL_EXAMPLES: dict[str, str] = {}
-
-_TOOL_EXAMPLES["search_surfsense_docs"] = """
-- User: "How do I install SurfSense?"
-  - Call: `search_surfsense_docs(query="installation setup")`
-- User: "What connectors does SurfSense support?"
-  - Call: `search_surfsense_docs(query="available connectors integrations")`
-- User: "How do I set up the Notion connector?"
-  - Call: `search_surfsense_docs(query="Notion connector setup configuration")`
-- User: "How do I use Docker to run SurfSense?"
-  - Call: `search_surfsense_docs(query="Docker installation setup")`
-"""
-
-_TOOL_EXAMPLES["generate_podcast"] = """
-- User: "Give me a podcast about AI trends based on what we discussed"
-  - First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`
-- User: "Create a podcast summary of this conversation"
-  - Call: `generate_podcast(source_content="Complete conversation summary:\\n\\nUser asked about [topic 1]:\\n[Your detailed response]\\n\\nUser then asked about [topic 2]:\\n[Your detailed response]\\n\\n[Continue for all exchanges in the conversation]", podcast_title="Conversation Summary")`
-- User: "Make a podcast about quantum computing"
-  - First explore `/documents/` (ls/glob/grep/read_file), then: `generate_podcast(source_content="Key insights about quantum computing from retrieved files:\\n\\n[Comprehensive summary of findings]", podcast_title="Quantum Computing Explained")`
-"""
-
-_TOOL_EXAMPLES["generate_video_presentation"] = """
-- User: "Give me a presentation about AI trends based on what we discussed"
-  - First search for relevant content, then call: `generate_video_presentation(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", video_title="AI Trends Presentation")`
-- User: "Create slides summarizing this conversation"
-  - Call: `generate_video_presentation(source_content="Complete conversation summary:\\n\\nUser asked about [topic 1]:\\n[Your detailed response]\\n\\nUser then asked about [topic 2]:\\n[Your detailed response]\\n\\n[Continue for all exchanges in the conversation]", video_title="Conversation Summary")`
-- User: "Make a video presentation about quantum computing"
-  - First explore `/documents/` (ls/glob/grep/read_file), then: `generate_video_presentation(source_content="Key insights about quantum computing from retrieved files:\\n\\n[Comprehensive summary of findings]", video_title="Quantum Computing Explained")`
-"""
-
-_TOOL_EXAMPLES["generate_report"] = """
-- User: "Generate a report about AI trends"
-  - Call: `generate_report(topic="AI Trends Report", source_strategy="kb_search", search_queries=["AI trends recent developments", "artificial intelligence industry trends", "AI market growth and predictions"], report_style="detailed")`
-  - WHY: Has creation verb "generate" → call the tool. No prior discussion → use kb_search.
-- User: "Write a research report from this conversation"
-  - Call: `generate_report(topic="Research Report", source_strategy="conversation", source_content="Complete conversation summary:\\n\\n...", report_style="deep_research")`
-  - WHY: Has creation verb "write" → call the tool. Conversation has the content → use source_strategy="conversation".
-- User: (after a report on Climate Change was generated) "Add a section about carbon capture technologies"
-  - Call: `generate_report(topic="Climate Crisis: Causes, Impacts, and Solutions", source_strategy="conversation", source_content="[summary of conversation context if any]", parent_report_id=<previous_report_id>, user_instructions="Add a new section about carbon capture technologies")`
-  - WHY: Has modification verb "add" + specific deliverable target → call the tool with parent_report_id.
-- User: (after a report was generated) "What else could we add to have more depth?"
-  - Do NOT call generate_report. Answer in chat with suggestions.
-  - WHY: No creation/modification verb directed at producing a deliverable.
-"""
-
-_TOOL_EXAMPLES["scrape_webpage"] = """
-- User: "Check out https://dev.to/some-article"
-  - Call: `scrape_webpage(url="https://dev.to/some-article")`
-  - Respond with a structured analysis — key points, takeaways.
-- User: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
-  - Call: `scrape_webpage(url="https://example.com/blog/ai-trends")`
-  - Respond with a thorough summary using headings and bullet points.
-- User: (after discussing https://example.com/stats) "Can you get the live data from that page?"
-  - Call: `scrape_webpage(url="https://example.com/stats")`
-  - IMPORTANT: Always attempt scraping first. Never refuse before trying the tool.
-- User: "https://example.com/blog/weekend-recipes"
-  - Call: `scrape_webpage(url="https://example.com/blog/weekend-recipes")`
-  - When a user sends just a URL with no instructions, scrape it and provide a concise summary of the content.
-"""
-
-_TOOL_EXAMPLES["generate_image"] = """
-- User: "Generate an image of a cat"
-  - Call: `generate_image(prompt="A fluffy orange tabby cat sitting on a windowsill, bathed in warm golden sunlight, soft bokeh background with green houseplants, photorealistic style, cozy atmosphere")`
-  - The generated image will automatically be displayed in the chat.
-- User: "Draw me a logo for a coffee shop called Bean Dream"
-  - Call: `generate_image(prompt="Minimalist modern logo design for a coffee shop called 'Bean Dream', featuring a stylized coffee bean with dream-like swirls of steam, clean vector style, warm brown and cream color palette, white background, professional branding")`
-  - The generated image will automatically be displayed in the chat.
-- User: "Show me this image: https://example.com/image.png"
-  - Simply include it in your response using markdown: `![Image](https://example.com/image.png)`
-- User uploads an image file and asks: "What is this image about?"
-  - The user's uploaded image is already visible in the chat.
-  - Simply analyze the image content and respond directly.
-"""
-
-_TOOL_EXAMPLES["web_search"] = """
-- User: "What's the current USD to INR exchange rate?"
-  - Call: `web_search(query="current USD to INR exchange rate")`
-  - Then answer using the returned web results with citations.
-- User: "What's the latest news about AI?"
-  - Call: `web_search(query="latest AI news today")`
-- User: "What's the weather in New York?"
-  - Call: `web_search(query="weather New York today")`
-"""
-
-# All tool names that have prompt instructions (order matters for prompt readability)
-_ALL_TOOL_NAMES_ORDERED = [
-    "search_surfsense_docs",
-    "web_search",
-    "generate_podcast",
-    "generate_video_presentation",
-    "generate_report",
-    "generate_image",
-    "scrape_webpage",
-    "save_memory",
-    "recall_memory",
-]
-
-
-def _format_tool_name(name: str) -> str:
-    """Convert snake_case tool name to a human-readable label."""
-    return name.replace("_", " ").title()
-
-
-def _get_tools_instructions(
-    thread_visibility: ChatVisibility | None = None,
-    enabled_tool_names: set[str] | None = None,
-    disabled_tool_names: set[str] | None = None,
-) -> str:
-    """Build tools instructions containing only the enabled tools.
-
-    Args:
-        thread_visibility: Private vs shared — affects memory tool wording.
-        enabled_tool_names: Set of tool names that are actually bound to the agent.
-            When None, all tools are included (backward-compatible default).
-        disabled_tool_names: Set of tool names that the user explicitly disabled.
-            When provided, a note is appended telling the model about these tools
-            so it can inform the user they can re-enable them.
-    """
-    visibility = thread_visibility or ChatVisibility.PRIVATE
-    memory_variant = (
-        "shared" if visibility == ChatVisibility.SEARCH_SPACE else "private"
-    )
-
-    parts: list[str] = [_TOOLS_PREAMBLE]
-    examples: list[str] = []
-
-    for tool_name in _ALL_TOOL_NAMES_ORDERED:
-        if enabled_tool_names is not None and tool_name not in enabled_tool_names:
-            continue
-
-        if tool_name in _TOOL_INSTRUCTIONS:
-            parts.append(_TOOL_INSTRUCTIONS[tool_name])
-        elif tool_name in _MEMORY_TOOL_INSTRUCTIONS:
-            parts.append(_MEMORY_TOOL_INSTRUCTIONS[tool_name][memory_variant])
-
-        if tool_name in _TOOL_EXAMPLES:
-            examples.append(_TOOL_EXAMPLES[tool_name])
-        elif tool_name in _MEMORY_TOOL_EXAMPLES:
-            examples.append(_MEMORY_TOOL_EXAMPLES[tool_name][memory_variant])
-
-    # Append a note about user-disabled tools so the model can inform the user
-    known_disabled = (
-        disabled_tool_names & set(_ALL_TOOL_NAMES_ORDERED)
-        if disabled_tool_names
-        else set()
-    )
-    if known_disabled:
-        disabled_list = ", ".join(
-            _format_tool_name(n) for n in _ALL_TOOL_NAMES_ORDERED if n in known_disabled
-        )
-        parts.append(f"""
-DISABLED TOOLS (by user):
-The following tools are available in SurfSense but have been disabled by the user for this session: {disabled_list}.
-You do NOT have access to these tools and MUST NOT claim you can use them.
-If the user asks about a capability provided by a disabled tool, let them know the relevant tool
-is currently disabled and they can re-enable it.
-""")
-
-    parts.append("\n</tools>\n")
-
-    if examples:
-        parts.append("<tool_call_examples>")
-        parts.extend(examples)
-        parts.append("</tool_call_examples>\n")
-
-    return "".join(parts)
-
-
-# Backward-compatible constant: all tools included (private memory variant)
-SURFSENSE_TOOLS_INSTRUCTIONS = _get_tools_instructions()
-
-
-SURFSENSE_CITATION_INSTRUCTIONS = """
-<citation_instructions>
-CRITICAL CITATION REQUIREMENTS:
-
-1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
-2. Make sure ALL factual statements from the documents have proper citations.
-3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
-4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
-5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
-6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
-7. Do not return citations as clickable links.
-8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
-9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
-10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
-11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
-
-<document_structure_example>
-The documents you receive are structured like this:
-
-**Knowledge base documents (numeric chunk IDs):**
-<document>
-<document_metadata>
-  <document_id>42</document_id>
-  <document_type>GITHUB_CONNECTOR</document_type>
-  <title><![CDATA[Some repo / file / issue title]]></title>
-  <url><![CDATA[https://example.com]]></url>
-  <metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
-</document_metadata>
-
-<document_content>
-  <chunk id='123'><![CDATA[First chunk text...]]></chunk>
-  <chunk id='124'><![CDATA[Second chunk text...]]></chunk>
-</document_content>
-</document>
-
-**Web search results (URL chunk IDs):**
-<document>
-<document_metadata>
-  <document_type>WEB_SEARCH</document_type>
-  <title><![CDATA[Some web search result]]></title>
-  <url><![CDATA[https://example.com/article]]></url>
-</document_metadata>
-
-<document_content>
-  <chunk id='https://example.com/article'><![CDATA[Content from web search...]]></chunk>
-</document_content>
-</document>
-
-IMPORTANT: You MUST cite using the EXACT chunk ids from the `<chunk id='...'>` tags.
-- For knowledge base documents, chunk ids are numeric (e.g. 123, 124) or prefixed (e.g. doc-45).
-- For live web search results, chunk ids are URLs (e.g. https://example.com/article).
-Do NOT cite document_id. Always use the chunk id.
-</document_structure_example>
-
-<citation_format>
-- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
-- Citations should appear at the end of the sentence containing the information they support
-- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
-- No need to return references section. Just citations in answer.
-- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
-- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
-- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
-- Copy the EXACT chunk id from the XML - if it says `<chunk id='doc-123'>`, use [citation:doc-123]
-- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
-</citation_format>
-
-<citation_examples>
-CORRECT citation formats:
-- [citation:5] (numeric chunk ID from knowledge base)
-- [citation:doc-123] (for Surfsense documentation chunks)
-- [citation:https://example.com/article] (URL chunk ID from web search results)
-- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)
-
-INCORRECT citation formats (DO NOT use):
-- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
-- Using parentheses around brackets: ([citation:5])
-- Using hyperlinked text: [link to source 5](https://example.com)
-- Using footnote style: ... library¹
-- Making up source IDs when source_id is unknown
-- Using old IEEE format: [1], [2], [3]
-- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
-</citation_examples>
-
-<citation_output_example>
-Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
-
-According to web search results, the key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:https://docs.python.org/3/library/asyncio.html]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
-
-However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
-</citation_output_example>
-</citation_instructions>
-"""
-
-# Anti-citation prompt - used when citations are disabled
-# This explicitly tells the model NOT to include citations
-SURFSENSE_NO_CITATION_INSTRUCTIONS = """
-<citation_instructions>
-IMPORTANT: Citations are DISABLED for this configuration.
-
-DO NOT include any citations in your responses. Specifically:
-1. Do NOT use the [citation:chunk_id] format anywhere in your response.
-2. Do NOT reference document IDs, chunk IDs, or source IDs.
-3. Simply provide the information naturally without any citation markers.
-4. Write your response as if you're having a normal conversation, incorporating the information from your knowledge seamlessly.
-
-When answering questions based on documents from the knowledge base:
-- Present the information directly and confidently
-- Do not mention that information comes from specific documents or chunks
-- Integrate facts naturally into your response without attribution markers
-
-Your goal is to provide helpful, informative answers in a clean, readable format without any citation notation.
-</citation_instructions>
-"""
+# Citation block re-exposed for legacy importers that referenced this constant
+# directly. The composer is the canonical source; this is a frozen snapshot
+# loaded at module-init time.
+SURFSENSE_CITATION_INSTRUCTIONS = _read_fragment("base/citations_on.md")
+SURFSENSE_NO_CITATION_INSTRUCTIONS = _read_fragment("base/citations_off.md")
 
 
 def build_surfsense_system_prompt(
@@ -655,32 +49,25 @@ def build_surfsense_system_prompt(
     thread_visibility: ChatVisibility | None = None,
     enabled_tool_names: set[str] | None = None,
     disabled_tool_names: set[str] | None = None,
+    mcp_connector_tools: dict[str, list[str]] | None = None,
+    *,
+    model_name: str | None = None,
 ) -> str:
-    """
-    Build the SurfSense system prompt with default settings.
+    """Build the default SurfSense system prompt (citations on, defaults).
 
-    This is a convenience function that builds the prompt with:
-    - Default system instructions
-    - Tools instructions (only for enabled tools)
-    - Citation instructions enabled
-
-    Args:
-        today: Optional datetime for today's date (defaults to current UTC date)
-        thread_visibility: Optional; when provided, used for conditional prompt (e.g. private vs shared memory wording). Defaults to private behavior when None.
-        enabled_tool_names: Set of tool names actually bound to the agent. When None all tools are included.
-        disabled_tool_names: Set of tool names the user explicitly disabled. Included as a note so the model can inform the user.
-
-    Returns:
-        Complete system prompt string
+    See :func:`app.agents.new_chat.prompts.composer.compose_system_prompt`
+    for full parameter docs.
     """
-
-    visibility = thread_visibility or ChatVisibility.PRIVATE
-    system_instructions = _get_system_instructions(visibility, today)
-    tools_instructions = _get_tools_instructions(
-        visibility, enabled_tool_names, disabled_tool_names
+    return compose_system_prompt(
+        today=today,
+        thread_visibility=thread_visibility,
+        enabled_tool_names=enabled_tool_names,
+        disabled_tool_names=disabled_tool_names,
+        mcp_connector_tools=mcp_connector_tools,
+        citations_enabled=True,
+        model_name=model_name,
+        connector_routing=_DEFAULT_CONNECTOR_ROUTING,
     )
-    citation_instructions = SURFSENSE_CITATION_INSTRUCTIONS
-    return system_instructions + tools_instructions + citation_instructions
 
 
 def build_configurable_system_prompt(
@@ -691,70 +78,56 @@ def build_configurable_system_prompt(
     thread_visibility: ChatVisibility | None = None,
     enabled_tool_names: set[str] | None = None,
     disabled_tool_names: set[str] | None = None,
+    mcp_connector_tools: dict[str, list[str]] | None = None,
+    *,
+    model_name: str | None = None,
 ) -> str:
-    """
-    Build a configurable SurfSense system prompt based on NewLLMConfig settings.
-
-    The prompt is composed of three parts:
-    1. System Instructions - either custom or default SURFSENSE_SYSTEM_INSTRUCTIONS
-    2. Tools Instructions - only for enabled tools, with a note about disabled ones
-    3. Citation Instructions - either SURFSENSE_CITATION_INSTRUCTIONS or SURFSENSE_NO_CITATION_INSTRUCTIONS
+    """Build a configurable SurfSense system prompt (NewLLMConfig path).
 
-    Args:
-        custom_system_instructions: Custom system instructions to use. If empty/None and
-                                   use_default_system_instructions is True, defaults to
-                                   SURFSENSE_SYSTEM_INSTRUCTIONS.
-        use_default_system_instructions: Whether to use default instructions when
-                                        custom_system_instructions is empty/None.
-        citations_enabled: Whether to include citation instructions (True) or
-                          anti-citation instructions (False).
-        today: Optional datetime for today's date (defaults to current UTC date)
-        thread_visibility: Optional; when provided, used for conditional prompt (e.g. private vs shared memory wording). Defaults to private behavior when None.
-        enabled_tool_names: Set of tool names actually bound to the agent. When None all tools are included.
-        disabled_tool_names: Set of tool names the user explicitly disabled. Included as a note so the model can inform the user.
-
-    Returns:
-        Complete system prompt string
+    See :func:`app.agents.new_chat.prompts.composer.compose_system_prompt`
+    for full parameter docs.
     """
-    resolved_today = (today or datetime.now(UTC)).astimezone(UTC).date().isoformat()
-
-    # Determine system instructions
-    if custom_system_instructions and custom_system_instructions.strip():
-        system_instructions = custom_system_instructions.format(
-            resolved_today=resolved_today
-        )
-    elif use_default_system_instructions:
-        visibility = thread_visibility or ChatVisibility.PRIVATE
-        system_instructions = _get_system_instructions(visibility, today)
-    else:
-        system_instructions = ""
-
-    # Tools instructions: only include enabled tools, note disabled ones
-    tools_instructions = _get_tools_instructions(
-        thread_visibility, enabled_tool_names, disabled_tool_names
-    )
-
-    # Citation instructions based on toggle
-    citation_instructions = (
-        SURFSENSE_CITATION_INSTRUCTIONS
-        if citations_enabled
-        else SURFSENSE_NO_CITATION_INSTRUCTIONS
+    return compose_system_prompt(
+        today=today,
+        thread_visibility=thread_visibility,
+        enabled_tool_names=enabled_tool_names,
+        disabled_tool_names=disabled_tool_names,
+        mcp_connector_tools=mcp_connector_tools,
+        custom_system_instructions=custom_system_instructions,
+        use_default_system_instructions=use_default_system_instructions,
+        citations_enabled=citations_enabled,
+        model_name=model_name,
+        connector_routing=_DEFAULT_CONNECTOR_ROUTING,
     )
 
-    return system_instructions + tools_instructions + citation_instructions
-
 
 def get_default_system_instructions() -> str:
-    """
-    Get the default system instructions template.
+    """Return the default ``<system_instruction>`` block (no tools / citations).
 
-    This is useful for populating the UI with the default value when
-    creating a new NewLLMConfig.
-
-    Returns:
-        Default system instructions string (with {resolved_today} placeholder)
+    Useful for populating the UI when seeding ``NewLLMConfig.system_instructions``.
+    The output reflects the current fragment tree, not a baked-in constant.
     """
-    return SURFSENSE_SYSTEM_INSTRUCTIONS.strip()
+    resolved_today = datetime.now(UTC).date().isoformat()
+    from .prompts.composer import _build_system_instructions  # local import
+
+    return _build_system_instructions(
+        visibility=ChatVisibility.PRIVATE,
+        resolved_today=resolved_today,
+    ).strip()
 
 
+# Backwards compatibility — some modules import the constant directly.
 SURFSENSE_SYSTEM_PROMPT = build_surfsense_system_prompt()
+
+
+__all__ = [
+    "SURFSENSE_CITATION_INSTRUCTIONS",
+    "SURFSENSE_NO_CITATION_INSTRUCTIONS",
+    "SURFSENSE_SYSTEM_INSTRUCTIONS_TEMPLATE",
+    "SURFSENSE_SYSTEM_PROMPT",
+    "build_configurable_system_prompt",
+    "build_surfsense_system_prompt",
+    "compose_system_prompt",
+    "detect_provider_variant",
+    "get_default_system_instructions",
+]
diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py
index a46fadf112..bc444b0c06 100644
--- a/surfsense_backend/app/agents/new_chat/tools/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py
@@ -10,8 +10,7 @@
 - generate_video_presentation: Generate video presentations with slides and narration
 - generate_image: Generate images from text descriptions using AI models
 - scrape_webpage: Extract content from webpages
-- save_memory: Store facts/preferences about the user
-- recall_memory: Retrieve relevant user memories
+- update_memory: Update the user's / team's memory document
 """
 
 # Registry exports
@@ -33,7 +32,7 @@
 )
 from .scrape_webpage import create_scrape_webpage_tool
 from .search_surfsense_docs import create_search_surfsense_docs_tool
-from .user_memory import create_recall_memory_tool, create_save_memory_tool
+from .update_memory import create_update_memory_tool, create_update_team_memory_tool
 from .video_presentation import create_generate_video_presentation_tool
 
 __all__ = [
@@ -47,10 +46,10 @@
     "create_generate_image_tool",
     "create_generate_podcast_tool",
     "create_generate_video_presentation_tool",
-    "create_recall_memory_tool",
-    "create_save_memory_tool",
     "create_scrape_webpage_tool",
     "create_search_surfsense_docs_tool",
+    "create_update_memory_tool",
+    "create_update_team_memory_tool",
     "format_documents_for_context",
     "get_all_tool_names",
     "get_default_enabled_tools",
diff --git a/surfsense_backend/app/agents/new_chat/tools/confluence/create_page.py b/surfsense_backend/app/agents/new_chat/tools/confluence/create_page.py
index b4d532b766..c56db15285 100644
--- a/surfsense_backend/app/agents/new_chat/tools/confluence/create_page.py
+++ b/surfsense_backend/app/agents/new_chat/tools/confluence/create_page.py
@@ -2,11 +2,12 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.confluence_history import ConfluenceHistoryConnector
+from app.db import async_session_maker
 from app.services.confluence import ConfluenceToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -18,6 +19,23 @@ def create_create_confluence_page_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
+    """
+    Factory function to create the create_confluence_page tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured create_confluence_page tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def create_confluence_page(
         title: str,
@@ -42,186 +60,163 @@ async def create_confluence_page(
         """
         logger.info(f"create_confluence_page called: title='{title}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Confluence tool not properly configured.",
             }
 
         try:
-            metadata_service = ConfluenceToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
-
-            if "error" in context:
-                return {"status": "error", "message": context["error"]}
+            async with async_session_maker() as db_session:
+                metadata_service = ConfluenceToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
+                )
 
-            accounts = context.get("accounts", [])
-            if accounts and all(a.get("auth_expired") for a in accounts):
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Confluence accounts need re-authentication.",
-                    "connector_type": "confluence",
-                }
+                if "error" in context:
+                    return {"status": "error", "message": context["error"]}
 
-            approval = interrupt(
-                {
-                    "type": "confluence_page_creation",
-                    "action": {
-                        "tool": "create_confluence_page",
-                        "params": {
-                            "title": title,
-                            "content": content,
-                            "space_id": space_id,
-                            "connector_id": connector_id,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The page was not created.",
-                }
-
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_title = final_params.get("title", title)
-            final_content = final_params.get("content", content) or ""
-            final_space_id = final_params.get("space_id", space_id)
-            final_connector_id = final_params.get("connector_id", connector_id)
-
-            if not final_title or not final_title.strip():
-                return {"status": "error", "message": "Page title cannot be empty."}
-            if not final_space_id:
-                return {"status": "error", "message": "A space must be selected."}
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            actual_connector_id = final_connector_id
-            if actual_connector_id is None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
-                    )
-                )
-                connector = result.scalars().first()
-                if not connector:
-                    return {
-                        "status": "error",
-                        "message": "No Confluence connector found.",
-                    }
-                actual_connector_id = connector.id
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == actual_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
-                    )
-                )
-                connector = result.scalars().first()
-                if not connector:
+                accounts = context.get("accounts", [])
+                if accounts and all(a.get("auth_expired") for a in accounts):
                     return {
-                        "status": "error",
-                        "message": "Selected Confluence connector is invalid.",
+                        "status": "auth_error",
+                        "message": "All connected Confluence accounts need re-authentication.",
+                        "connector_type": "confluence",
                     }
 
-            try:
-                client = ConfluenceHistoryConnector(
-                    session=db_session, connector_id=actual_connector_id
-                )
-                api_result = await client.create_page(
-                    space_id=final_space_id,
-                    title=final_title,
-                    body=final_content,
+                result = request_approval(
+                    action_type="confluence_page_creation",
+                    tool_name="create_confluence_page",
+                    params={
+                        "title": title,
+                        "content": content,
+                        "space_id": space_id,
+                        "connector_id": connector_id,
+                    },
+                    context=context,
                 )
-                await client.close()
-            except Exception as api_err:
-                if (
-                    "http 403" in str(api_err).lower()
-                    or "status code 403" in str(api_err).lower()
-                ):
-                    try:
-                        _conn = connector
-                        _conn.config = {**_conn.config, "auth_expired": True}
-                        flag_modified(_conn, "config")
-                        await db_session.commit()
-                    except Exception:
-                        pass
+
+                if result.rejected:
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
                     }
-                raise
 
-            page_id = str(api_result.get("id", ""))
-            page_links = (
-                api_result.get("_links", {}) if isinstance(api_result, dict) else {}
-            )
-            page_url = ""
-            if page_links.get("base") and page_links.get("webui"):
-                page_url = f"{page_links['base']}{page_links['webui']}"
-
-            kb_message_suffix = ""
-            try:
-                from app.services.confluence import ConfluenceKBSyncService
-
-                kb_service = ConfluenceKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    page_id=page_id,
-                    page_title=final_title,
-                    space_id=final_space_id,
-                    body_content=final_content,
-                    connector_id=actual_connector_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
-                )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
+                final_title = result.params.get("title", title)
+                final_content = result.params.get("content", content) or ""
+                final_space_id = result.params.get("space_id", space_id)
+                final_connector_id = result.params.get("connector_id", connector_id)
+
+                if not final_title or not final_title.strip():
+                    return {"status": "error", "message": "Page title cannot be empty."}
+                if not final_space_id:
+                    return {"status": "error", "message": "A space must be selected."}
+
+                from sqlalchemy.future import select
+
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+                actual_connector_id = final_connector_id
+                if actual_connector_id is None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "No Confluence connector found.",
+                        }
+                    actual_connector_id = connector.id
                 else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == actual_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Confluence connector is invalid.",
+                        }
+
+                try:
+                    client = ConfluenceHistoryConnector(
+                        session=db_session, connector_id=actual_connector_id
+                    )
+                    api_result = await client.create_page(
+                        space_id=final_space_id,
+                        title=final_title,
+                        body=final_content,
+                    )
+                    await client.close()
+                except Exception as api_err:
+                    if (
+                        "http 403" in str(api_err).lower()
+                        or "status code 403" in str(api_err).lower()
+                    ):
+                        try:
+                            _conn = connector
+                            _conn.config = {**_conn.config, "auth_expired": True}
+                            flag_modified(_conn, "config")
+                            await db_session.commit()
+                        except Exception:
+                            pass
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                page_id = str(api_result.get("id", ""))
+                page_links = (
+                    api_result.get("_links", {}) if isinstance(api_result, dict) else {}
+                )
+                page_url = ""
+                if page_links.get("base") and page_links.get("webui"):
+                    page_url = f"{page_links['base']}{page_links['webui']}"
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.confluence import ConfluenceKBSyncService
+
+                    kb_service = ConfluenceKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        page_id=page_id,
+                        page_title=final_title,
+                        space_id=final_space_id,
+                        body_content=final_content,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "page_id": page_id,
-                "page_url": page_url,
-                "message": f"Confluence page '{final_title}' created successfully.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "page_id": page_id,
+                    "page_url": page_url,
+                    "message": f"Confluence page '{final_title}' created successfully.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/confluence/delete_page.py b/surfsense_backend/app/agents/new_chat/tools/confluence/delete_page.py
index ba1dae6534..d4cd5032f7 100644
--- a/surfsense_backend/app/agents/new_chat/tools/confluence/delete_page.py
+++ b/surfsense_backend/app/agents/new_chat/tools/confluence/delete_page.py
@@ -2,11 +2,12 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.confluence_history import ConfluenceHistoryConnector
+from app.db import async_session_maker
 from app.services.confluence import ConfluenceToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -18,6 +19,23 @@ def create_delete_confluence_page_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
+    """
+    Factory function to create the delete_confluence_page tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured delete_confluence_page tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def delete_confluence_page(
         page_title_or_id: str,
@@ -43,163 +61,143 @@ async def delete_confluence_page(
             f"delete_confluence_page called: page_title_or_id='{page_title_or_id}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Confluence tool not properly configured.",
             }
 
         try:
-            metadata_service = ConfluenceToolMetadataService(db_session)
-            context = await metadata_service.get_deletion_context(
-                search_space_id, user_id, page_title_or_id
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if context.get("auth_expired"):
-                    return {
-                        "status": "auth_error",
-                        "message": error_msg,
-                        "connector_id": context.get("connector_id"),
-                        "connector_type": "confluence",
-                    }
-                if "not found" in error_msg.lower():
-                    return {"status": "not_found", "message": error_msg}
-                return {"status": "error", "message": error_msg}
-
-            page_data = context["page"]
-            page_id = page_data["page_id"]
-            page_title = page_data.get("page_title", "")
-            document_id = page_data["document_id"]
-            connector_id_from_context = context.get("account", {}).get("id")
-
-            approval = interrupt(
-                {
-                    "type": "confluence_page_deletion",
-                    "action": {
-                        "tool": "delete_confluence_page",
-                        "params": {
-                            "page_id": page_id,
-                            "connector_id": connector_id_from_context,
-                            "delete_from_kb": delete_from_kb,
-                        },
+            async with async_session_maker() as db_session:
+                metadata_service = ConfluenceToolMetadataService(db_session)
+                context = await metadata_service.get_deletion_context(
+                    search_space_id, user_id, page_title_or_id
+                )
+
+                if "error" in context:
+                    error_msg = context["error"]
+                    if context.get("auth_expired"):
+                        return {
+                            "status": "auth_error",
+                            "message": error_msg,
+                            "connector_id": context.get("connector_id"),
+                            "connector_type": "confluence",
+                        }
+                    if "not found" in error_msg.lower():
+                        return {"status": "not_found", "message": error_msg}
+                    return {"status": "error", "message": error_msg}
+
+                page_data = context["page"]
+                page_id = page_data["page_id"]
+                page_title = page_data.get("page_title", "")
+                document_id = page_data["document_id"]
+                connector_id_from_context = context.get("account", {}).get("id")
+
+                result = request_approval(
+                    action_type="confluence_page_deletion",
+                    tool_name="delete_confluence_page",
+                    params={
+                        "page_id": page_id,
+                        "connector_id": connector_id_from_context,
+                        "delete_from_kb": delete_from_kb,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The page was not deleted.",
-                }
+                    context=context,
+                )
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            final_page_id = final_params.get("page_id", page_id)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
+                final_page_id = result.params.get("page_id", page_id)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
+                )
 
-            from sqlalchemy.future import select
+                from sqlalchemy.future import select
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this page.",
-                }
-
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type
-                    == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
-                )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Confluence connector is invalid.",
-                }
+                if not final_connector_id:
+                    return {
+                        "status": "error",
+                        "message": "No connector found for this page.",
+                    }
 
-            try:
-                client = ConfluenceHistoryConnector(
-                    session=db_session, connector_id=final_connector_id
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                    )
                 )
-                await client.delete_page(final_page_id)
-                await client.close()
-            except Exception as api_err:
-                if (
-                    "http 403" in str(api_err).lower()
-                    or "status code 403" in str(api_err).lower()
-                ):
-                    try:
-                        connector.config = {**connector.config, "auth_expired": True}
-                        flag_modified(connector, "config")
-                        await db_session.commit()
-                    except Exception:
-                        pass
+                connector = result.scalars().first()
+                if not connector:
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": final_connector_id,
-                        "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "error",
+                        "message": "Selected Confluence connector is invalid.",
                     }
-                raise
 
-            deleted_from_kb = False
-            if final_delete_from_kb and document_id:
                 try:
-                    from app.db import Document
-
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
+                    client = ConfluenceHistoryConnector(
+                        session=db_session, connector_id=final_connector_id
                     )
-                    document = doc_result.scalars().first()
-                    if document:
-                        await db_session.delete(document)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-
-            message = f"Confluence page '{page_title}' deleted successfully."
-            if deleted_from_kb:
-                message += " Also removed from the knowledge base."
+                    await client.delete_page(final_page_id)
+                    await client.close()
+                except Exception as api_err:
+                    if (
+                        "http 403" in str(api_err).lower()
+                        or "status code 403" in str(api_err).lower()
+                    ):
+                        try:
+                            connector.config = {
+                                **connector.config,
+                                "auth_expired": True,
+                            }
+                            flag_modified(connector, "config")
+                            await db_session.commit()
+                        except Exception:
+                            pass
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": final_connector_id,
+                            "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                deleted_from_kb = False
+                if final_delete_from_kb and document_id:
+                    try:
+                        from app.db import Document
+
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        document = doc_result.scalars().first()
+                        if document:
+                            await db_session.delete(document)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+
+                message = f"Confluence page '{page_title}' deleted successfully."
+                if deleted_from_kb:
+                    message += " Also removed from the knowledge base."
 
-            return {
-                "status": "success",
-                "page_id": final_page_id,
-                "deleted_from_kb": deleted_from_kb,
-                "message": message,
-            }
+                return {
+                    "status": "success",
+                    "page_id": final_page_id,
+                    "deleted_from_kb": deleted_from_kb,
+                    "message": message,
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/confluence/update_page.py b/surfsense_backend/app/agents/new_chat/tools/confluence/update_page.py
index 913896f839..51c205e005 100644
--- a/surfsense_backend/app/agents/new_chat/tools/confluence/update_page.py
+++ b/surfsense_backend/app/agents/new_chat/tools/confluence/update_page.py
@@ -2,11 +2,12 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.confluence_history import ConfluenceHistoryConnector
+from app.db import async_session_maker
 from app.services.confluence import ConfluenceToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -18,6 +19,23 @@ def create_update_confluence_page_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
+    """
+    Factory function to create the update_confluence_page tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured update_confluence_page tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def update_confluence_page(
         page_title_or_id: str,
@@ -45,190 +63,168 @@ async def update_confluence_page(
             f"update_confluence_page called: page_title_or_id='{page_title_or_id}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Confluence tool not properly configured.",
             }
 
         try:
-            metadata_service = ConfluenceToolMetadataService(db_session)
-            context = await metadata_service.get_update_context(
-                search_space_id, user_id, page_title_or_id
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if context.get("auth_expired"):
-                    return {
-                        "status": "auth_error",
-                        "message": error_msg,
-                        "connector_id": context.get("connector_id"),
-                        "connector_type": "confluence",
-                    }
-                if "not found" in error_msg.lower():
-                    return {"status": "not_found", "message": error_msg}
-                return {"status": "error", "message": error_msg}
-
-            page_data = context["page"]
-            page_id = page_data["page_id"]
-            current_title = page_data["page_title"]
-            current_body = page_data.get("body", "")
-            current_version = page_data.get("version", 1)
-            document_id = page_data.get("document_id")
-            connector_id_from_context = context.get("account", {}).get("id")
-
-            approval = interrupt(
-                {
-                    "type": "confluence_page_update",
-                    "action": {
-                        "tool": "update_confluence_page",
-                        "params": {
-                            "page_id": page_id,
-                            "document_id": document_id,
-                            "new_title": new_title,
-                            "new_content": new_content,
-                            "version": current_version,
-                            "connector_id": connector_id_from_context,
-                        },
+            async with async_session_maker() as db_session:
+                metadata_service = ConfluenceToolMetadataService(db_session)
+                context = await metadata_service.get_update_context(
+                    search_space_id, user_id, page_title_or_id
+                )
+
+                if "error" in context:
+                    error_msg = context["error"]
+                    if context.get("auth_expired"):
+                        return {
+                            "status": "auth_error",
+                            "message": error_msg,
+                            "connector_id": context.get("connector_id"),
+                            "connector_type": "confluence",
+                        }
+                    if "not found" in error_msg.lower():
+                        return {"status": "not_found", "message": error_msg}
+                    return {"status": "error", "message": error_msg}
+
+                page_data = context["page"]
+                page_id = page_data["page_id"]
+                current_title = page_data["page_title"]
+                current_body = page_data.get("body", "")
+                current_version = page_data.get("version", 1)
+                document_id = page_data.get("document_id")
+                connector_id_from_context = context.get("account", {}).get("id")
+
+                result = request_approval(
+                    action_type="confluence_page_update",
+                    tool_name="update_confluence_page",
+                    params={
+                        "page_id": page_id,
+                        "document_id": document_id,
+                        "new_title": new_title,
+                        "new_content": new_content,
+                        "version": current_version,
+                        "connector_id": connector_id_from_context,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The page was not updated.",
-                }
+                    context=context,
+                )
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_page_id = final_params.get("page_id", page_id)
-            final_title = final_params.get("new_title", new_title) or current_title
-            final_content = final_params.get("new_content", new_content)
-            if final_content is None:
-                final_content = current_body
-            final_version = final_params.get("version", current_version)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_document_id = final_params.get("document_id", document_id)
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this page.",
-                }
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type
-                    == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                final_page_id = result.params.get("page_id", page_id)
+                final_title = result.params.get("new_title", new_title) or current_title
+                final_content = result.params.get("new_content", new_content)
+                if final_content is None:
+                    final_content = current_body
+                final_version = result.params.get("version", current_version)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Confluence connector is invalid.",
-                }
+                final_document_id = result.params.get("document_id", document_id)
 
-            try:
-                client = ConfluenceHistoryConnector(
-                    session=db_session, connector_id=final_connector_id
-                )
-                api_result = await client.update_page(
-                    page_id=final_page_id,
-                    title=final_title,
-                    body=final_content,
-                    version_number=final_version + 1,
-                )
-                await client.close()
-            except Exception as api_err:
-                if (
-                    "http 403" in str(api_err).lower()
-                    or "status code 403" in str(api_err).lower()
-                ):
-                    try:
-                        connector.config = {**connector.config, "auth_expired": True}
-                        flag_modified(connector, "config")
-                        await db_session.commit()
-                    except Exception:
-                        pass
+                from sqlalchemy.future import select
+
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+                if not final_connector_id:
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": final_connector_id,
-                        "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "error",
+                        "message": "No connector found for this page.",
                     }
-                raise
 
-            page_links = (
-                api_result.get("_links", {}) if isinstance(api_result, dict) else {}
-            )
-            page_url = ""
-            if page_links.get("base") and page_links.get("webui"):
-                page_url = f"{page_links['base']}{page_links['webui']}"
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.CONFLUENCE_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Confluence connector is invalid.",
+                    }
 
-            kb_message_suffix = ""
-            if final_document_id:
                 try:
-                    from app.services.confluence import ConfluenceKBSyncService
-
-                    kb_service = ConfluenceKBSyncService(db_session)
-                    kb_result = await kb_service.sync_after_update(
-                        document_id=final_document_id,
+                    client = ConfluenceHistoryConnector(
+                        session=db_session, connector_id=final_connector_id
+                    )
+                    api_result = await client.update_page(
                         page_id=final_page_id,
-                        user_id=user_id,
-                        search_space_id=search_space_id,
+                        title=final_title,
+                        body=final_content,
+                        version_number=final_version + 1,
                     )
-                    if kb_result["status"] == "success":
-                        kb_message_suffix = (
-                            " Your knowledge base has also been updated."
+                    await client.close()
+                except Exception as api_err:
+                    if (
+                        "http 403" in str(api_err).lower()
+                        or "status code 403" in str(api_err).lower()
+                    ):
+                        try:
+                            connector.config = {
+                                **connector.config,
+                                "auth_expired": True,
+                            }
+                            flag_modified(connector, "config")
+                            await db_session.commit()
+                        except Exception:
+                            pass
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": final_connector_id,
+                            "message": "This Confluence account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                page_links = (
+                    api_result.get("_links", {}) if isinstance(api_result, dict) else {}
+                )
+                page_url = ""
+                if page_links.get("base") and page_links.get("webui"):
+                    page_url = f"{page_links['base']}{page_links['webui']}"
+
+                kb_message_suffix = ""
+                if final_document_id:
+                    try:
+                        from app.services.confluence import ConfluenceKBSyncService
+
+                        kb_service = ConfluenceKBSyncService(db_session)
+                        kb_result = await kb_service.sync_after_update(
+                            document_id=final_document_id,
+                            page_id=final_page_id,
+                            user_id=user_id,
+                            search_space_id=search_space_id,
                         )
-                    else:
+                        if kb_result["status"] == "success":
+                            kb_message_suffix = (
+                                " Your knowledge base has also been updated."
+                            )
+                        else:
+                            kb_message_suffix = (
+                                " The knowledge base will be updated in the next sync."
+                            )
+                    except Exception as kb_err:
+                        logger.warning(f"KB sync after update failed: {kb_err}")
                         kb_message_suffix = (
                             " The knowledge base will be updated in the next sync."
                         )
-                except Exception as kb_err:
-                    logger.warning(f"KB sync after update failed: {kb_err}")
-                    kb_message_suffix = (
-                        " The knowledge base will be updated in the next sync."
-                    )
 
-            return {
-                "status": "success",
-                "page_id": final_page_id,
-                "page_url": page_url,
-                "message": f"Confluence page '{final_title}' updated successfully.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "page_id": final_page_id,
+                    "page_url": page_url,
+                    "message": f"Confluence page '{final_title}' updated successfully.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/connected_accounts.py b/surfsense_backend/app/agents/new_chat/tools/connected_accounts.py
new file mode 100644
index 0000000000..6420a90e6d
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/connected_accounts.py
@@ -0,0 +1,135 @@
+"""Connected-accounts discovery tool.
+
+Lets the LLM discover which accounts are connected for a given service
+(e.g. "jira", "linear", "slack") and retrieve the metadata it needs to
+call action tools — such as Jira's ``cloudId``.
+
+The tool returns **only** non-sensitive fields explicitly listed in the
+service's ``account_metadata_keys`` (see ``registry.py``), plus the
+always-present ``display_name`` and ``connector_id``.
+"""
+
+import logging
+from typing import Any
+
+from langchain_core.tools import StructuredTool
+from pydantic import BaseModel, Field
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType, async_session_maker
+from app.services.mcp_oauth.registry import MCP_SERVICES
+
+logger = logging.getLogger(__name__)
+
+_SERVICE_KEY_BY_CONNECTOR_TYPE: dict[str, str] = {
+    cfg.connector_type: key for key, cfg in MCP_SERVICES.items()
+}
+
+
+class GetConnectedAccountsInput(BaseModel):
+    service: str = Field(
+        description=(
+            "Service key to look up connected accounts for. "
+            "Valid values: " + ", ".join(sorted(MCP_SERVICES.keys()))
+        ),
+    )
+
+
+def _extract_display_name(connector: SearchSourceConnector) -> str:
+    """Best-effort human-readable label for a connector."""
+    cfg = connector.config or {}
+    if cfg.get("display_name"):
+        return cfg["display_name"]
+    if cfg.get("base_url"):
+        return f"{connector.name} ({cfg['base_url']})"
+    if cfg.get("organization_name"):
+        return f"{connector.name} ({cfg['organization_name']})"
+    return connector.name
+
+
+def create_get_connected_accounts_tool(
+    db_session: AsyncSession,
+    search_space_id: int,
+    user_id: str,
+) -> StructuredTool:
+    """Factory function to create the get_connected_accounts tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        search_space_id: Search space ID to scope account discovery to.
+        user_id: User ID to scope account discovery to.
+
+    Returns:
+        Configured StructuredTool for connected-accounts discovery.
+    """
+    del db_session  # per-call session — see docstring
+
+    async def _run(service: str) -> list[dict[str, Any]]:
+        svc_cfg = MCP_SERVICES.get(service)
+        if not svc_cfg:
+            return [
+                {
+                    "error": f"Unknown service '{service}'. Valid: {', '.join(sorted(MCP_SERVICES.keys()))}"
+                }
+            ]
+
+        try:
+            connector_type = SearchSourceConnectorType(svc_cfg.connector_type)
+        except ValueError:
+            return [{"error": f"Connector type '{svc_cfg.connector_type}' not found."}]
+
+        async with async_session_maker() as db_session:
+            result = await db_session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type == connector_type,
+                )
+            )
+            connectors = result.scalars().all()
+
+            if not connectors:
+                return [
+                    {
+                        "error": f"No {svc_cfg.name} accounts connected. Ask the user to connect one in settings."
+                    }
+                ]
+
+            is_multi = len(connectors) > 1
+
+            accounts: list[dict[str, Any]] = []
+            for conn in connectors:
+                cfg = conn.config or {}
+                entry: dict[str, Any] = {
+                    "connector_id": conn.id,
+                    "display_name": _extract_display_name(conn),
+                    "service": service,
+                }
+                if is_multi:
+                    entry["tool_prefix"] = f"{service}_{conn.id}"
+                for key in svc_cfg.account_metadata_keys:
+                    if key in cfg:
+                        entry[key] = cfg[key]
+                accounts.append(entry)
+
+            return accounts
+
+    return StructuredTool(
+        name="get_connected_accounts",
+        description=(
+            "Discover which accounts are connected for a service (e.g. jira, linear, slack, clickup, airtable). "
+            "Returns display names and service-specific metadata the action tools need "
+            "(e.g. Jira's cloudId). Call this BEFORE using a service's action tools when "
+            "you need an account identifier or are unsure which account to use."
+        ),
+        coroutine=_run,
+        args_schema=GetConnectedAccountsInput,
+        metadata={"hitl": False},
+    )
diff --git a/surfsense_backend/app/agents/new_chat/tools/discord/__init__.py b/surfsense_backend/app/agents/new_chat/tools/discord/__init__.py
new file mode 100644
index 0000000000..b4eaec1f08
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/discord/__init__.py
@@ -0,0 +1,15 @@
+from app.agents.new_chat.tools.discord.list_channels import (
+    create_list_discord_channels_tool,
+)
+from app.agents.new_chat.tools.discord.read_messages import (
+    create_read_discord_messages_tool,
+)
+from app.agents.new_chat.tools.discord.send_message import (
+    create_send_discord_message_tool,
+)
+
+__all__ = [
+    "create_list_discord_channels_tool",
+    "create_read_discord_messages_tool",
+    "create_send_discord_message_tool",
+]
diff --git a/surfsense_backend/app/agents/new_chat/tools/discord/_auth.py b/surfsense_backend/app/agents/new_chat/tools/discord/_auth.py
new file mode 100644
index 0000000000..c345f8a5e4
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/discord/_auth.py
@@ -0,0 +1,43 @@
+"""Shared auth helper for Discord agent tools (REST API, not gateway bot)."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.config import config
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+from app.utils.oauth_security import TokenEncryption
+
+DISCORD_API = "https://discord.com/api/v10"
+
+
+async def get_discord_connector(
+    db_session: AsyncSession,
+    search_space_id: int,
+    user_id: str,
+) -> SearchSourceConnector | None:
+    result = await db_session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.search_space_id == search_space_id,
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.DISCORD_CONNECTOR,
+        )
+    )
+    return result.scalars().first()
+
+
+def get_bot_token(connector: SearchSourceConnector) -> str:
+    """Extract and decrypt the bot token from connector config."""
+    cfg = dict(connector.config)
+    if cfg.get("_token_encrypted") and config.SECRET_KEY:
+        enc = TokenEncryption(config.SECRET_KEY)
+        if cfg.get("bot_token"):
+            cfg["bot_token"] = enc.decrypt_token(cfg["bot_token"])
+    token = cfg.get("bot_token")
+    if not token:
+        raise ValueError("Discord bot token not found in connector config.")
+    return token
+
+
+def get_guild_id(connector: SearchSourceConnector) -> str | None:
+    return connector.config.get("guild_id")
diff --git a/surfsense_backend/app/agents/new_chat/tools/discord/list_channels.py b/surfsense_backend/app/agents/new_chat/tools/discord/list_channels.py
new file mode 100644
index 0000000000..01159a261f
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/discord/list_channels.py
@@ -0,0 +1,107 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import async_session_maker
+
+from ._auth import DISCORD_API, get_bot_token, get_discord_connector, get_guild_id
+
+logger = logging.getLogger(__name__)
+
+
+def create_list_discord_channels_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the list_discord_channels tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured list_discord_channels tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def list_discord_channels() -> dict[str, Any]:
+        """List text channels in the connected Discord server.
+
+        Returns:
+            Dictionary with status and a list of channels (id, name).
+        """
+        if search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Discord tool not properly configured.",
+            }
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_discord_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Discord connector found."}
+
+                guild_id = get_guild_id(connector)
+                if not guild_id:
+                    return {
+                        "status": "error",
+                        "message": "No guild ID in Discord connector config.",
+                    }
+
+                token = get_bot_token(connector)
+
+                async with httpx.AsyncClient() as client:
+                    resp = await client.get(
+                        f"{DISCORD_API}/guilds/{guild_id}/channels",
+                        headers={"Authorization": f"Bot {token}"},
+                        timeout=15.0,
+                    )
+
+                if resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Discord bot token is invalid.",
+                        "connector_type": "discord",
+                    }
+                if resp.status_code != 200:
+                    return {
+                        "status": "error",
+                        "message": f"Discord API error: {resp.status_code}",
+                    }
+
+                # Type 0 = text channel
+                channels = [
+                    {"id": ch["id"], "name": ch["name"]}
+                    for ch in resp.json()
+                    if ch.get("type") == 0
+                ]
+                return {
+                    "status": "success",
+                    "guild_id": guild_id,
+                    "channels": channels,
+                    "total": len(channels),
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error listing Discord channels: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to list Discord channels."}
+
+    return list_discord_channels
diff --git a/surfsense_backend/app/agents/new_chat/tools/discord/read_messages.py b/surfsense_backend/app/agents/new_chat/tools/discord/read_messages.py
new file mode 100644
index 0000000000..88d6cdd499
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/discord/read_messages.py
@@ -0,0 +1,120 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import async_session_maker
+
+from ._auth import DISCORD_API, get_bot_token, get_discord_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_read_discord_messages_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the read_discord_messages tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured read_discord_messages tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def read_discord_messages(
+        channel_id: str,
+        limit: int = 25,
+    ) -> dict[str, Any]:
+        """Read recent messages from a Discord text channel.
+
+        Args:
+            channel_id: The Discord channel ID (from list_discord_channels).
+            limit: Number of messages to fetch (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of messages including
+            id, author, content, timestamp.
+        """
+        if search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Discord tool not properly configured.",
+            }
+
+        limit = min(limit, 50)
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_discord_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Discord connector found."}
+
+                token = get_bot_token(connector)
+
+                async with httpx.AsyncClient() as client:
+                    resp = await client.get(
+                        f"{DISCORD_API}/channels/{channel_id}/messages",
+                        headers={"Authorization": f"Bot {token}"},
+                        params={"limit": limit},
+                        timeout=15.0,
+                    )
+
+                if resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Discord bot token is invalid.",
+                        "connector_type": "discord",
+                    }
+                if resp.status_code == 403:
+                    return {
+                        "status": "error",
+                        "message": "Bot lacks permission to read this channel.",
+                    }
+                if resp.status_code != 200:
+                    return {
+                        "status": "error",
+                        "message": f"Discord API error: {resp.status_code}",
+                    }
+
+                messages = [
+                    {
+                        "id": m["id"],
+                        "author": m.get("author", {}).get("username", "Unknown"),
+                        "content": m.get("content", ""),
+                        "timestamp": m.get("timestamp", ""),
+                    }
+                    for m in resp.json()
+                ]
+
+                return {
+                    "status": "success",
+                    "channel_id": channel_id,
+                    "messages": messages,
+                    "total": len(messages),
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Discord messages: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to read Discord messages."}
+
+    return read_discord_messages
diff --git a/surfsense_backend/app/agents/new_chat/tools/discord/send_message.py b/surfsense_backend/app/agents/new_chat/tools/discord/send_message.py
new file mode 100644
index 0000000000..5fe6fde356
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/discord/send_message.py
@@ -0,0 +1,136 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
+
+from ._auth import DISCORD_API, get_bot_token, get_discord_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_send_discord_message_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the send_discord_message tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured send_discord_message tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def send_discord_message(
+        channel_id: str,
+        content: str,
+    ) -> dict[str, Any]:
+        """Send a message to a Discord text channel.
+
+        Args:
+            channel_id: The Discord channel ID (from list_discord_channels).
+            content: The message text (max 2000 characters).
+
+        Returns:
+            Dictionary with status, message_id on success.
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Do NOT retry.
+        """
+        if search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Discord tool not properly configured.",
+            }
+
+        if len(content) > 2000:
+            return {
+                "status": "error",
+                "message": "Message exceeds Discord's 2000-character limit.",
+            }
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_discord_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Discord connector found."}
+
+                result = request_approval(
+                    action_type="discord_send_message",
+                    tool_name="send_discord_message",
+                    params={"channel_id": channel_id, "content": content},
+                    context={"connector_id": connector.id},
+                )
+
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Message was not sent.",
+                    }
+
+                final_content = result.params.get("content", content)
+                final_channel = result.params.get("channel_id", channel_id)
+
+                token = get_bot_token(connector)
+
+                async with httpx.AsyncClient() as client:
+                    resp = await client.post(
+                        f"{DISCORD_API}/channels/{final_channel}/messages",
+                        headers={
+                            "Authorization": f"Bot {token}",
+                            "Content-Type": "application/json",
+                        },
+                        json={"content": final_content},
+                        timeout=15.0,
+                    )
+
+                if resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Discord bot token is invalid.",
+                        "connector_type": "discord",
+                    }
+                if resp.status_code == 403:
+                    return {
+                        "status": "error",
+                        "message": "Bot lacks permission to send messages in this channel.",
+                    }
+                if resp.status_code not in (200, 201):
+                    return {
+                        "status": "error",
+                        "message": f"Discord API error: {resp.status_code}",
+                    }
+
+                msg_data = resp.json()
+                return {
+                    "status": "success",
+                    "message_id": msg_data.get("id"),
+                    "message": f"Message sent to channel {final_channel}.",
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error sending Discord message: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to send Discord message."}
+
+    return send_discord_message
diff --git a/surfsense_backend/app/agents/new_chat/tools/dropbox/create_file.py b/surfsense_backend/app/agents/new_chat/tools/dropbox/create_file.py
index ed8034861c..7aae034ccd 100644
--- a/surfsense_backend/app/agents/new_chat/tools/dropbox/create_file.py
+++ b/surfsense_backend/app/agents/new_chat/tools/dropbox/create_file.py
@@ -5,12 +5,12 @@
 from typing import Any, Literal
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.dropbox.client import DropboxClient
-from app.db import SearchSourceConnector, SearchSourceConnectorType
+from app.db import SearchSourceConnector, SearchSourceConnectorType, async_session_maker
 
 logger = logging.getLogger(__name__)
 
@@ -59,6 +59,23 @@ def create_create_dropbox_file_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the create_dropbox_file tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured create_dropbox_file tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def create_dropbox_file(
         name: str,
@@ -82,210 +99,191 @@ async def create_dropbox_file(
             f"create_dropbox_file called: name='{name}', file_type='{file_type}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Dropbox tool not properly configured.",
             }
 
         try:
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type
-                    == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+            async with async_session_maker() as db_session:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                    )
                 )
-            )
-            connectors = result.scalars().all()
+                connectors = result.scalars().all()
 
-            if not connectors:
-                return {
-                    "status": "error",
-                    "message": "No Dropbox connector found. Please connect Dropbox in your workspace settings.",
-                }
-
-            accounts = []
-            for c in connectors:
-                cfg = c.config or {}
-                accounts.append(
-                    {
-                        "id": c.id,
-                        "name": c.name,
-                        "user_email": cfg.get("user_email"),
-                        "auth_expired": cfg.get("auth_expired", False),
+                if not connectors:
+                    return {
+                        "status": "error",
+                        "message": "No Dropbox connector found. Please connect Dropbox in your workspace settings.",
                     }
-                )
 
-            if all(a.get("auth_expired") for a in accounts):
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Dropbox accounts need re-authentication.",
-                    "connector_type": "dropbox",
-                }
+                accounts = []
+                for c in connectors:
+                    cfg = c.config or {}
+                    accounts.append(
+                        {
+                            "id": c.id,
+                            "name": c.name,
+                            "user_email": cfg.get("user_email"),
+                            "auth_expired": cfg.get("auth_expired", False),
+                        }
+                    )
 
-            parent_folders: dict[int, list[dict[str, str]]] = {}
-            for acc in accounts:
-                cid = acc["id"]
-                if acc.get("auth_expired"):
-                    parent_folders[cid] = []
-                    continue
-                try:
-                    client = DropboxClient(session=db_session, connector_id=cid)
-                    items, err = await client.list_folder("")
-                    if err:
+                if all(a.get("auth_expired") for a in accounts):
+                    return {
+                        "status": "auth_error",
+                        "message": "All connected Dropbox accounts need re-authentication.",
+                        "connector_type": "dropbox",
+                    }
+
+                parent_folders: dict[int, list[dict[str, str]]] = {}
+                for acc in accounts:
+                    cid = acc["id"]
+                    if acc.get("auth_expired"):
+                        parent_folders[cid] = []
+                        continue
+                    try:
+                        client = DropboxClient(session=db_session, connector_id=cid)
+                        items, err = await client.list_folder("")
+                        if err:
+                            logger.warning(
+                                "Failed to list folders for connector %s: %s", cid, err
+                            )
+                            parent_folders[cid] = []
+                        else:
+                            parent_folders[cid] = [
+                                {
+                                    "folder_path": item.get("path_lower", ""),
+                                    "name": item["name"],
+                                }
+                                for item in items
+                                if item.get(".tag") == "folder" and item.get("name")
+                            ]
+                    except Exception:
                         logger.warning(
-                            "Failed to list folders for connector %s: %s", cid, err
+                            "Error fetching folders for connector %s",
+                            cid,
+                            exc_info=True,
                         )
                         parent_folders[cid] = []
-                    else:
-                        parent_folders[cid] = [
-                            {
-                                "folder_path": item.get("path_lower", ""),
-                                "name": item["name"],
-                            }
-                            for item in items
-                            if item.get(".tag") == "folder" and item.get("name")
-                        ]
-                except Exception:
-                    logger.warning(
-                        "Error fetching folders for connector %s", cid, exc_info=True
-                    )
-                    parent_folders[cid] = []
 
-            context: dict[str, Any] = {
-                "accounts": accounts,
-                "parent_folders": parent_folders,
-                "supported_types": _SUPPORTED_TYPES,
-            }
+                context: dict[str, Any] = {
+                    "accounts": accounts,
+                    "parent_folders": parent_folders,
+                    "supported_types": _SUPPORTED_TYPES,
+                }
 
-            approval = interrupt(
-                {
-                    "type": "dropbox_file_creation",
-                    "action": {
-                        "tool": "create_dropbox_file",
-                        "params": {
-                            "name": name,
-                            "file_type": file_type,
-                            "content": content,
-                            "connector_id": None,
-                            "parent_folder_path": None,
-                        },
+                result = request_approval(
+                    action_type="dropbox_file_creation",
+                    tool_name="create_dropbox_file",
+                    params={
+                        "name": name,
+                        "file_type": file_type,
+                        "content": content,
+                        "connector_id": None,
+                        "parent_folder_path": None,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The file was not created.",
-                }
+                    context=context,
+                )
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            final_name = final_params.get("name", name)
-            final_file_type = final_params.get("file_type", file_type)
-            final_content = final_params.get("content", content)
-            final_connector_id = final_params.get("connector_id")
-            final_parent_folder_path = final_params.get("parent_folder_path")
+                final_name = result.params.get("name", name)
+                final_file_type = result.params.get("file_type", file_type)
+                final_content = result.params.get("content", content)
+                final_connector_id = result.params.get("connector_id")
+                final_parent_folder_path = result.params.get("parent_folder_path")
+
+                if not final_name or not final_name.strip():
+                    return {"status": "error", "message": "File name cannot be empty."}
+
+                final_name = _ensure_extension(final_name, final_file_type)
+
+                if final_connector_id is not None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+                else:
+                    connector = connectors[0]
 
-            if not final_name or not final_name.strip():
-                return {"status": "error", "message": "File name cannot be empty."}
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Dropbox connector is invalid.",
+                    }
 
-            final_name = _ensure_extension(final_name, final_file_type)
+                client = DropboxClient(session=db_session, connector_id=connector.id)
 
-            if final_connector_id is not None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.DROPBOX_CONNECTOR,
-                    )
+                parent_path = final_parent_folder_path or ""
+                file_path = (
+                    f"{parent_path}/{final_name}" if parent_path else f"/{final_name}"
                 )
-                connector = result.scalars().first()
-            else:
-                connector = connectors[0]
-
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Dropbox connector is invalid.",
-                }
 
-            client = DropboxClient(session=db_session, connector_id=connector.id)
-
-            parent_path = final_parent_folder_path or ""
-            file_path = (
-                f"{parent_path}/{final_name}" if parent_path else f"/{final_name}"
-            )
-
-            if final_file_type == "paper":
-                created = await client.create_paper_doc(file_path, final_content or "")
-                file_id = created.get("file_id", "")
-                web_url = created.get("url", "")
-            else:
-                docx_bytes = _markdown_to_docx(final_content or "")
-                created = await client.upload_file(
-                    file_path, docx_bytes, mode="add", autorename=True
-                )
-                file_id = created.get("id", "")
-                web_url = ""
-
-            logger.info(f"Dropbox file created: id={file_id}, name={final_name}")
-
-            kb_message_suffix = ""
-            try:
-                from app.services.dropbox import DropboxKBSyncService
-
-                kb_service = DropboxKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    file_id=file_id,
-                    file_name=final_name,
-                    file_path=file_path,
-                    web_url=web_url,
-                    content=final_content,
-                    connector_id=connector.id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
-                )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
+                if final_file_type == "paper":
+                    created = await client.create_paper_doc(
+                        file_path, final_content or ""
+                    )
+                    file_id = created.get("file_id", "")
+                    web_url = created.get("url", "")
                 else:
+                    docx_bytes = _markdown_to_docx(final_content or "")
+                    created = await client.upload_file(
+                        file_path, docx_bytes, mode="add", autorename=True
+                    )
+                    file_id = created.get("id", "")
+                    web_url = ""
+
+                logger.info(f"Dropbox file created: id={file_id}, name={final_name}")
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.dropbox import DropboxKBSyncService
+
+                    kb_service = DropboxKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        file_id=file_id,
+                        file_name=final_name,
+                        file_path=file_path,
+                        web_url=web_url,
+                        content=final_content,
+                        connector_id=connector.id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "file_id": file_id,
-                "name": final_name,
-                "web_url": web_url,
-                "message": f"Successfully created '{final_name}' in Dropbox.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "file_id": file_id,
+                    "name": final_name,
+                    "web_url": web_url,
+                    "message": f"Successfully created '{final_name}' in Dropbox.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/dropbox/trash_file.py b/surfsense_backend/app/agents/new_chat/tools/dropbox/trash_file.py
index e15dc30927..0e59e49db9 100644
--- a/surfsense_backend/app/agents/new_chat/tools/dropbox/trash_file.py
+++ b/surfsense_backend/app/agents/new_chat/tools/dropbox/trash_file.py
@@ -2,17 +2,18 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy import String, and_, cast, func
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.dropbox.client import DropboxClient
 from app.db import (
     Document,
     DocumentType,
     SearchSourceConnector,
     SearchSourceConnectorType,
+    async_session_maker,
 )
 
 logger = logging.getLogger(__name__)
@@ -23,6 +24,23 @@ def create_delete_dropbox_file_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the delete_dropbox_file tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured delete_dropbox_file tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def delete_dropbox_file(
         file_name: str,
@@ -55,33 +73,14 @@ async def delete_dropbox_file(
             f"delete_dropbox_file called: file_name='{file_name}', delete_from_kb={delete_from_kb}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Dropbox tool not properly configured.",
             }
 
         try:
-            doc_result = await db_session.execute(
-                select(Document)
-                .join(
-                    SearchSourceConnector,
-                    Document.connector_id == SearchSourceConnector.id,
-                )
-                .filter(
-                    and_(
-                        Document.search_space_id == search_space_id,
-                        Document.document_type == DocumentType.DROPBOX_FILE,
-                        func.lower(Document.title) == func.lower(file_name),
-                        SearchSourceConnector.user_id == user_id,
-                    )
-                )
-                .order_by(Document.updated_at.desc().nullslast())
-                .limit(1)
-            )
-            document = doc_result.scalars().first()
-
-            if not document:
+            async with async_session_maker() as db_session:
                 doc_result = await db_session.execute(
                     select(Document)
                     .join(
@@ -92,13 +91,7 @@ async def delete_dropbox_file(
                         and_(
                             Document.search_space_id == search_space_id,
                             Document.document_type == DocumentType.DROPBOX_FILE,
-                            func.lower(
-                                cast(
-                                    Document.document_metadata["dropbox_file_name"],
-                                    String,
-                                )
-                            )
-                            == func.lower(file_name),
+                            func.lower(Document.title) == func.lower(file_name),
                             SearchSourceConnector.user_id == user_id,
                         )
                     )
@@ -107,126 +100,63 @@ async def delete_dropbox_file(
                 )
                 document = doc_result.scalars().first()
 
-            if not document:
-                return {
-                    "status": "not_found",
-                    "message": (
-                        f"File '{file_name}' not found in your indexed Dropbox files. "
-                        "This could mean: (1) the file doesn't exist, (2) it hasn't been indexed yet, "
-                        "or (3) the file name is different."
-                    ),
-                }
-
-            if not document.connector_id:
-                return {
-                    "status": "error",
-                    "message": "Document has no associated connector.",
-                }
-
-            meta = document.document_metadata or {}
-            file_path = meta.get("dropbox_path")
-            file_id = meta.get("dropbox_file_id")
-            document_id = document.id
-
-            if not file_path:
-                return {
-                    "status": "error",
-                    "message": "File path is missing. Please re-index the file.",
-                }
-
-            conn_result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    and_(
-                        SearchSourceConnector.id == document.connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                if not document:
+                    doc_result = await db_session.execute(
+                        select(Document)
+                        .join(
+                            SearchSourceConnector,
+                            Document.connector_id == SearchSourceConnector.id,
+                        )
+                        .filter(
+                            and_(
+                                Document.search_space_id == search_space_id,
+                                Document.document_type == DocumentType.DROPBOX_FILE,
+                                func.lower(
+                                    cast(
+                                        Document.document_metadata["dropbox_file_name"],
+                                        String,
+                                    )
+                                )
+                                == func.lower(file_name),
+                                SearchSourceConnector.user_id == user_id,
+                            )
+                        )
+                        .order_by(Document.updated_at.desc().nullslast())
+                        .limit(1)
                     )
-                )
-            )
-            connector = conn_result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Dropbox connector not found or access denied.",
-                }
+                    document = doc_result.scalars().first()
 
-            cfg = connector.config or {}
-            if cfg.get("auth_expired"):
-                return {
-                    "status": "auth_error",
-                    "message": "Dropbox account needs re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "dropbox",
-                }
+                if not document:
+                    return {
+                        "status": "not_found",
+                        "message": (
+                            f"File '{file_name}' not found in your indexed Dropbox files. "
+                            "This could mean: (1) the file doesn't exist, (2) it hasn't been indexed yet, "
+                            "or (3) the file name is different."
+                        ),
+                    }
 
-            context = {
-                "file": {
-                    "file_id": file_id,
-                    "file_path": file_path,
-                    "name": file_name,
-                    "document_id": document_id,
-                },
-                "account": {
-                    "id": connector.id,
-                    "name": connector.name,
-                    "user_email": cfg.get("user_email"),
-                },
-            }
+                if not document.connector_id:
+                    return {
+                        "status": "error",
+                        "message": "Document has no associated connector.",
+                    }
 
-            approval = interrupt(
-                {
-                    "type": "dropbox_file_trash",
-                    "action": {
-                        "tool": "delete_dropbox_file",
-                        "params": {
-                            "file_path": file_path,
-                            "connector_id": connector.id,
-                            "delete_from_kb": delete_from_kb,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The file was not deleted. Do not ask again or suggest alternatives.",
-                }
+                meta = document.document_metadata or {}
+                file_path = meta.get("dropbox_path")
+                file_id = meta.get("dropbox_file_id")
+                document_id = document.id
+
+                if not file_path:
+                    return {
+                        "status": "error",
+                        "message": "File path is missing. Please re-index the file.",
+                    }
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_file_path = final_params.get("file_path", file_path)
-            final_connector_id = final_params.get("connector_id", connector.id)
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
-
-            if final_connector_id != connector.id:
-                result = await db_session.execute(
+                conn_result = await db_session.execute(
                     select(SearchSourceConnector).filter(
                         and_(
-                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.id == document.connector_id,
                             SearchSourceConnector.search_space_id == search_space_id,
                             SearchSourceConnector.user_id == user_id,
                             SearchSourceConnector.connector_type
@@ -234,61 +164,128 @@ async def delete_dropbox_file(
                         )
                     )
                 )
-                validated_connector = result.scalars().first()
-                if not validated_connector:
+                connector = conn_result.scalars().first()
+                if not connector:
                     return {
                         "status": "error",
-                        "message": "Selected Dropbox connector is invalid or has been disconnected.",
+                        "message": "Dropbox connector not found or access denied.",
                     }
-                actual_connector_id = validated_connector.id
-            else:
-                actual_connector_id = connector.id
 
-            logger.info(
-                f"Deleting Dropbox file: path='{final_file_path}', connector={actual_connector_id}"
-            )
+                cfg = connector.config or {}
+                if cfg.get("auth_expired"):
+                    return {
+                        "status": "auth_error",
+                        "message": "Dropbox account needs re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "dropbox",
+                    }
 
-            client = DropboxClient(session=db_session, connector_id=actual_connector_id)
-            await client.delete_file(final_file_path)
+                context = {
+                    "file": {
+                        "file_id": file_id,
+                        "file_path": file_path,
+                        "name": file_name,
+                        "document_id": document_id,
+                    },
+                    "account": {
+                        "id": connector.id,
+                        "name": connector.name,
+                        "user_email": cfg.get("user_email"),
+                    },
+                }
 
-            logger.info(f"Dropbox file deleted: path={final_file_path}")
+                result = request_approval(
+                    action_type="dropbox_file_trash",
+                    tool_name="delete_dropbox_file",
+                    params={
+                        "file_path": file_path,
+                        "connector_id": connector.id,
+                        "delete_from_kb": delete_from_kb,
+                    },
+                    context=context,
+                )
 
-            trash_result: dict[str, Any] = {
-                "status": "success",
-                "file_id": file_id,
-                "message": f"Successfully deleted '{file_name}' from Dropbox.",
-            }
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            deleted_from_kb = False
-            if final_delete_from_kb and document_id:
-                try:
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
-                    )
-                    doc = doc_result.scalars().first()
-                    if doc:
-                        await db_session.delete(doc)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                        logger.info(
-                            f"Deleted document {document_id} from knowledge base"
+                final_file_path = result.params.get("file_path", file_path)
+                final_connector_id = result.params.get("connector_id", connector.id)
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
+                )
+
+                if final_connector_id != connector.id:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            and_(
+                                SearchSourceConnector.id == final_connector_id,
+                                SearchSourceConnector.search_space_id
+                                == search_space_id,
+                                SearchSourceConnector.user_id == user_id,
+                                SearchSourceConnector.connector_type
+                                == SearchSourceConnectorType.DROPBOX_CONNECTOR,
+                            )
                         )
-                    else:
-                        logger.warning(f"Document {document_id} not found in KB")
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-                    trash_result["warning"] = (
-                        f"File deleted, but failed to remove from knowledge base: {e!s}"
                     )
+                    validated_connector = result.scalars().first()
+                    if not validated_connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Dropbox connector is invalid or has been disconnected.",
+                        }
+                    actual_connector_id = validated_connector.id
+                else:
+                    actual_connector_id = connector.id
+
+                logger.info(
+                    f"Deleting Dropbox file: path='{final_file_path}', connector={actual_connector_id}"
+                )
 
-            trash_result["deleted_from_kb"] = deleted_from_kb
-            if deleted_from_kb:
-                trash_result["message"] = (
-                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                client = DropboxClient(
+                    session=db_session, connector_id=actual_connector_id
                 )
+                await client.delete_file(final_file_path)
+
+                logger.info(f"Dropbox file deleted: path={final_file_path}")
+
+                trash_result: dict[str, Any] = {
+                    "status": "success",
+                    "file_id": file_id,
+                    "message": f"Successfully deleted '{file_name}' from Dropbox.",
+                }
+
+                deleted_from_kb = False
+                if final_delete_from_kb and document_id:
+                    try:
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        doc = doc_result.scalars().first()
+                        if doc:
+                            await db_session.delete(doc)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                            logger.info(
+                                f"Deleted document {document_id} from knowledge base"
+                            )
+                        else:
+                            logger.warning(f"Document {document_id} not found in KB")
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+                        trash_result["warning"] = (
+                            f"File deleted, but failed to remove from knowledge base: {e!s}"
+                        )
+
+                trash_result["deleted_from_kb"] = deleted_from_kb
+                if deleted_from_kb:
+                    trash_result["message"] = (
+                        f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                    )
 
-            return trash_result
+                return trash_result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/generate_image.py b/surfsense_backend/app/agents/new_chat/tools/generate_image.py
index d94d55b1a0..9e287ac519 100644
--- a/surfsense_backend/app/agents/new_chat/tools/generate_image.py
+++ b/surfsense_backend/app/agents/new_chat/tools/generate_image.py
@@ -20,12 +20,18 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.config import config
-from app.db import ImageGeneration, ImageGenerationConfig, SearchSpace
+from app.db import (
+    ImageGeneration,
+    ImageGenerationConfig,
+    SearchSpace,
+    shielded_async_session,
+)
 from app.services.image_gen_router_service import (
     IMAGE_GEN_AUTO_MODE_ID,
     ImageGenRouterService,
     is_image_gen_auto_mode,
 )
+from app.services.provider_api_base import resolve_api_base
 from app.utils.signed_image_urls import generate_image_token
 
 logger = logging.getLogger(__name__)
@@ -44,12 +50,16 @@
 }
 
 
+def _resolve_provider_prefix(provider: str, custom_provider: str | None) -> str:
+    if custom_provider:
+        return custom_provider
+    return _PROVIDER_MAP.get(provider.upper(), provider.lower())
+
+
 def _build_model_string(
     provider: str, model_name: str, custom_provider: str | None
 ) -> str:
-    if custom_provider:
-        return f"{custom_provider}/{model_name}"
-    prefix = _PROVIDER_MAP.get(provider.upper(), provider.lower())
+    prefix = _resolve_provider_prefix(provider, custom_provider)
     return f"{prefix}/{model_name}"
 
 
@@ -70,8 +80,13 @@ def create_generate_image_tool(
 
     Args:
         search_space_id: The search space ID (for config resolution)
-        db_session: Async database session
+        db_session: Reserved for compatibility with the tool registry.
+            The streaming task's ``AsyncSession`` is shared by every tool;
+            because AsyncSession is not concurrency-safe, parallel tool calls
+            would interleave flushes (e.g. podcast + image in the same step)
+            and poison the transaction. This tool opens its own session.
     """
+    del db_session  # use a fresh per-call session, see below
 
     @tool
     async def generate_image(
@@ -93,110 +108,127 @@ async def generate_image(
             A dictionary containing the generated image(s) for display in the chat.
         """
         try:
-            # Resolve the image generation config from the search space preference
-            result = await db_session.execute(
-                select(SearchSpace).filter(SearchSpace.id == search_space_id)
-            )
-            search_space = result.scalars().first()
-            if not search_space:
-                return {"error": "Search space not found"}
-
-            config_id = (
-                search_space.image_generation_config_id or IMAGE_GEN_AUTO_MODE_ID
-            )
-
-            # Build generation kwargs
-            # NOTE: size, quality, and style are intentionally NOT passed.
-            # Different models support different values for these params
-            # (e.g. DALL-E 3 wants "hd"/"standard" for quality while
-            # gpt-image-1 wants "high"/"medium"/"low"; size options also
-            # differ). Letting the model use its own defaults avoids errors.
-            gen_kwargs: dict[str, Any] = {}
-            if n is not None and n > 1:
-                gen_kwargs["n"] = n
-
-            # Call litellm based on config type
-            if is_image_gen_auto_mode(config_id):
-                if not ImageGenRouterService.is_initialized():
-                    return {
-                        "error": "No image generation models configured. "
-                        "Please add an image model in Settings > Image Models."
-                    }
-                response = await ImageGenRouterService.aimage_generation(
-                    prompt=prompt, model="auto", **gen_kwargs
+            # Use a per-call session so concurrent tool calls don't share an
+            # AsyncSession (which is not concurrency-safe). The streaming
+            # task's session is shared across every tool; without isolation,
+            # autoflushes from a concurrent writer poison this tool too.
+            async with shielded_async_session() as session:
+                result = await session.execute(
+                    select(SearchSpace).filter(SearchSpace.id == search_space_id)
                 )
-            elif config_id < 0:
-                cfg = _get_global_image_gen_config(config_id)
-                if not cfg:
-                    return {"error": f"Image generation config {config_id} not found"}
-
-                model_string = _build_model_string(
-                    cfg.get("provider", ""),
-                    cfg["model_name"],
-                    cfg.get("custom_provider"),
-                )
-                gen_kwargs["api_key"] = cfg.get("api_key")
-                if cfg.get("api_base"):
-                    gen_kwargs["api_base"] = cfg["api_base"]
-                if cfg.get("api_version"):
-                    gen_kwargs["api_version"] = cfg["api_version"]
-                if cfg.get("litellm_params"):
-                    gen_kwargs.update(cfg["litellm_params"])
-
-                response = await aimage_generation(
-                    prompt=prompt, model=model_string, **gen_kwargs
+                search_space = result.scalars().first()
+                if not search_space:
+                    return {"error": "Search space not found"}
+
+                config_id = (
+                    search_space.image_generation_config_id or IMAGE_GEN_AUTO_MODE_ID
                 )
-            else:
-                # Positive ID = user-created ImageGenerationConfig
-                cfg_result = await db_session.execute(
-                    select(ImageGenerationConfig).filter(
-                        ImageGenerationConfig.id == config_id
+
+                # Build generation kwargs
+                # NOTE: size, quality, and style are intentionally NOT passed.
+                # Different models support different values for these params
+                # (e.g. DALL-E 3 wants "hd"/"standard" for quality while
+                # gpt-image-1 wants "high"/"medium"/"low"; size options also
+                # differ). Letting the model use its own defaults avoids errors.
+                gen_kwargs: dict[str, Any] = {}
+                if n is not None and n > 1:
+                    gen_kwargs["n"] = n
+
+                # Call litellm based on config type
+                if is_image_gen_auto_mode(config_id):
+                    if not ImageGenRouterService.is_initialized():
+                        return {
+                            "error": "No image generation models configured. "
+                            "Please add an image model in Settings > Image Models."
+                        }
+                    response = await ImageGenRouterService.aimage_generation(
+                        prompt=prompt, model="auto", **gen_kwargs
                     )
-                )
-                db_cfg = cfg_result.scalars().first()
-                if not db_cfg:
-                    return {"error": f"Image generation config {config_id} not found"}
-
-                model_string = _build_model_string(
-                    db_cfg.provider.value,
-                    db_cfg.model_name,
-                    db_cfg.custom_provider,
-                )
-                gen_kwargs["api_key"] = db_cfg.api_key
-                if db_cfg.api_base:
-                    gen_kwargs["api_base"] = db_cfg.api_base
-                if db_cfg.api_version:
-                    gen_kwargs["api_version"] = db_cfg.api_version
-                if db_cfg.litellm_params:
-                    gen_kwargs.update(db_cfg.litellm_params)
-
-                response = await aimage_generation(
-                    prompt=prompt, model=model_string, **gen_kwargs
+                elif config_id < 0:
+                    cfg = _get_global_image_gen_config(config_id)
+                    if not cfg:
+                        return {
+                            "error": f"Image generation config {config_id} not found"
+                        }
+
+                    provider_prefix = _resolve_provider_prefix(
+                        cfg.get("provider", ""), cfg.get("custom_provider")
+                    )
+                    model_string = f"{provider_prefix}/{cfg['model_name']}"
+                    gen_kwargs["api_key"] = cfg.get("api_key")
+                    api_base = resolve_api_base(
+                        provider=cfg.get("provider"),
+                        provider_prefix=provider_prefix,
+                        config_api_base=cfg.get("api_base"),
+                    )
+                    if api_base:
+                        gen_kwargs["api_base"] = api_base
+                    if cfg.get("api_version"):
+                        gen_kwargs["api_version"] = cfg["api_version"]
+                    if cfg.get("litellm_params"):
+                        gen_kwargs.update(cfg["litellm_params"])
+
+                    response = await aimage_generation(
+                        prompt=prompt, model=model_string, **gen_kwargs
+                    )
+                else:
+                    # Positive ID = user-created ImageGenerationConfig
+                    cfg_result = await session.execute(
+                        select(ImageGenerationConfig).filter(
+                            ImageGenerationConfig.id == config_id
+                        )
+                    )
+                    db_cfg = cfg_result.scalars().first()
+                    if not db_cfg:
+                        return {
+                            "error": f"Image generation config {config_id} not found"
+                        }
+
+                    provider_prefix = _resolve_provider_prefix(
+                        db_cfg.provider.value, db_cfg.custom_provider
+                    )
+                    model_string = f"{provider_prefix}/{db_cfg.model_name}"
+                    gen_kwargs["api_key"] = db_cfg.api_key
+                    api_base = resolve_api_base(
+                        provider=db_cfg.provider.value,
+                        provider_prefix=provider_prefix,
+                        config_api_base=db_cfg.api_base,
+                    )
+                    if api_base:
+                        gen_kwargs["api_base"] = api_base
+                    if db_cfg.api_version:
+                        gen_kwargs["api_version"] = db_cfg.api_version
+                    if db_cfg.litellm_params:
+                        gen_kwargs.update(db_cfg.litellm_params)
+
+                    response = await aimage_generation(
+                        prompt=prompt, model=model_string, **gen_kwargs
+                    )
+
+                # Parse the response and store in DB
+                response_dict = (
+                    response.model_dump()
+                    if hasattr(response, "model_dump")
+                    else dict(response)
                 )
 
-            # Parse the response and store in DB
-            response_dict = (
-                response.model_dump()
-                if hasattr(response, "model_dump")
-                else dict(response)
-            )
-
-            # Generate a random access token for this image
-            access_token = generate_image_token()
-
-            # Save to image_generations table for history
-            db_image_gen = ImageGeneration(
-                prompt=prompt,
-                model=getattr(response, "_hidden_params", {}).get("model"),
-                n=n,
-                image_generation_config_id=config_id,
-                response_data=response_dict,
-                search_space_id=search_space_id,
-                access_token=access_token,
-            )
-            db_session.add(db_image_gen)
-            await db_session.commit()
-            await db_session.refresh(db_image_gen)
+                # Generate a random access token for this image
+                access_token = generate_image_token()
+
+                # Save to image_generations table for history
+                db_image_gen = ImageGeneration(
+                    prompt=prompt,
+                    model=getattr(response, "_hidden_params", {}).get("model"),
+                    n=n,
+                    image_generation_config_id=config_id,
+                    response_data=response_dict,
+                    search_space_id=search_space_id,
+                    access_token=access_token,
+                )
+                session.add(db_image_gen)
+                await session.commit()
+                await session.refresh(db_image_gen)
+                db_image_gen_id = db_image_gen.id
 
             # Extract image URLs from response
             images = response_dict.get("data", [])
@@ -217,7 +249,7 @@ async def generate_image(
                 backend_url = config.BACKEND_URL or "http://localhost:8000"
                 image_url = (
                     f"{backend_url}/api/v1/image-generations/"
-                    f"{db_image_gen.id}/image?token={access_token}"
+                    f"{db_image_gen_id}/image?token={access_token}"
                 )
             else:
                 return {"error": "No displayable image data in the response"}
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/__init__.py b/surfsense_backend/app/agents/new_chat/tools/gmail/__init__.py
index efb2fb0fab..294840122c 100644
--- a/surfsense_backend/app/agents/new_chat/tools/gmail/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/__init__.py
@@ -1,6 +1,12 @@
 from app.agents.new_chat.tools.gmail.create_draft import (
     create_create_gmail_draft_tool,
 )
+from app.agents.new_chat.tools.gmail.read_email import (
+    create_read_gmail_email_tool,
+)
+from app.agents.new_chat.tools.gmail.search_emails import (
+    create_search_gmail_tool,
+)
 from app.agents.new_chat.tools.gmail.send_email import (
     create_send_gmail_email_tool,
 )
@@ -13,6 +19,8 @@
 
 __all__ = [
     "create_create_gmail_draft_tool",
+    "create_read_gmail_email_tool",
+    "create_search_gmail_tool",
     "create_send_gmail_email_tool",
     "create_trash_gmail_email_tool",
     "create_update_gmail_draft_tool",
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/composio_helpers.py b/surfsense_backend/app/agents/new_chat/tools/gmail/composio_helpers.py
new file mode 100644
index 0000000000..0ca1191a42
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/composio_helpers.py
@@ -0,0 +1,41 @@
+from typing import Any
+
+from app.db import SearchSourceConnector
+from app.services.composio_service import ComposioService
+
+
+def split_recipients(value: str | None) -> list[str]:
+    if not value:
+        return []
+    return [recipient.strip() for recipient in value.split(",") if recipient.strip()]
+
+
+def unwrap_composio_data(data: Any) -> Any:
+    if isinstance(data, dict):
+        inner = data.get("data", data)
+        if isinstance(inner, dict):
+            return inner.get("response_data", inner)
+        return inner
+    return data
+
+
+async def execute_composio_gmail_tool(
+    connector: SearchSourceConnector,
+    user_id: str,
+    tool_name: str,
+    params: dict[str, Any],
+) -> tuple[Any, str | None]:
+    cca_id = connector.config.get("composio_connected_account_id")
+    if not cca_id:
+        return None, "Composio connected account ID not found for this Gmail connector."
+
+    result = await ComposioService().execute_tool(
+        connected_account_id=cca_id,
+        tool_name=tool_name,
+        params=params,
+        entity_id=f"surfsense_{user_id}",
+    )
+    if not result.get("success"):
+        return None, result.get("error", "Unknown Composio Gmail error")
+
+    return unwrap_composio_data(result.get("data")), None
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/create_draft.py b/surfsense_backend/app/agents/new_chat/tools/gmail/create_draft.py
index a812f621a9..c88b48d2d0 100644
--- a/surfsense_backend/app/agents/new_chat/tools/gmail/create_draft.py
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/create_draft.py
@@ -6,9 +6,10 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
 from app.services.gmail import GmailToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,23 @@ def create_create_gmail_draft_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the create_gmail_draft tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured create_gmail_draft tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def create_gmail_draft(
         to: str,
@@ -57,274 +75,276 @@ async def create_gmail_draft(
         """
         logger.info(f"create_gmail_draft called: to='{to}', subject='{subject}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Gmail tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GmailToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
-
-            if "error" in context:
-                logger.error(f"Failed to fetch creation context: {context['error']}")
-                return {"status": "error", "message": context["error"]}
-
-            accounts = context.get("accounts", [])
-            if accounts and all(a.get("auth_expired") for a in accounts):
-                logger.warning("All Gmail accounts have expired authentication")
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Gmail accounts need re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "gmail",
-                }
-
-            logger.info(
-                f"Requesting approval for creating Gmail draft: to='{to}', subject='{subject}'"
-            )
-            approval = interrupt(
-                {
-                    "type": "gmail_draft_creation",
-                    "action": {
-                        "tool": "create_gmail_draft",
-                        "params": {
-                            "to": to,
-                            "subject": subject,
-                            "body": body,
-                            "cc": cc,
-                            "bcc": bcc,
-                            "connector_id": None,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The draft was not created. Do not ask again or suggest alternatives.",
-                }
-
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_to = final_params.get("to", to)
-            final_subject = final_params.get("subject", subject)
-            final_body = final_params.get("body", body)
-            final_cc = final_params.get("cc", cc)
-            final_bcc = final_params.get("bcc", bcc)
-            final_connector_id = final_params.get("connector_id")
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            _gmail_types = [
-                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
-            ]
-
-            if final_connector_id is not None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_gmail_types),
-                    )
+            async with async_session_maker() as db_session:
+                metadata_service = GmailToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
                 )
-                connector = result.scalars().first()
-                if not connector:
-                    return {
-                        "status": "error",
-                        "message": "Selected Gmail connector is invalid or has been disconnected.",
-                    }
-                actual_connector_id = connector.id
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_gmail_types),
+
+                if "error" in context:
+                    logger.error(
+                        f"Failed to fetch creation context: {context['error']}"
                     )
-                )
-                connector = result.scalars().first()
-                if not connector:
+                    return {"status": "error", "message": context["error"]}
+
+                accounts = context.get("accounts", [])
+                if accounts and all(a.get("auth_expired") for a in accounts):
+                    logger.warning("All Gmail accounts have expired authentication")
                     return {
-                        "status": "error",
-                        "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                        "status": "auth_error",
+                        "message": "All connected Gmail accounts need re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "gmail",
                     }
-                actual_connector_id = connector.id
-
-            logger.info(
-                f"Creating Gmail draft: to='{final_to}', subject='{final_subject}', connector={actual_connector_id}"
-            )
 
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
+                logger.info(
+                    f"Requesting approval for creating Gmail draft: to='{to}', subject='{subject}'"
+                )
+                result = request_approval(
+                    action_type="gmail_draft_creation",
+                    tool_name="create_gmail_draft",
+                    params={
+                        "to": to,
+                        "subject": subject,
+                        "body": body,
+                        "cc": cc,
+                        "bcc": bcc,
+                        "connector_id": None,
+                    },
+                    context=context,
+                )
 
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    creds = build_composio_credentials(cca_id)
-                else:
+                if result.rejected:
                     return {
-                        "status": "error",
-                        "message": "Composio connected account ID not found for this Gmail connector.",
+                        "status": "rejected",
+                        "message": "User declined. The draft was not created. Do not ask again or suggest alternatives.",
                     }
-            else:
-                from google.oauth2.credentials import Credentials
-
-                from app.config import config
-                from app.utils.oauth_security import TokenEncryption
-
-                config_data = dict(connector.config)
-                token_encrypted = config_data.get("_token_encrypted", False)
-                if token_encrypted and config.SECRET_KEY:
-                    token_encryption = TokenEncryption(config.SECRET_KEY)
-                    if config_data.get("token"):
-                        config_data["token"] = token_encryption.decrypt_token(
-                            config_data["token"]
-                        )
-                    if config_data.get("refresh_token"):
-                        config_data["refresh_token"] = token_encryption.decrypt_token(
-                            config_data["refresh_token"]
+
+                final_to = result.params.get("to", to)
+                final_subject = result.params.get("subject", subject)
+                final_body = result.params.get("body", body)
+                final_cc = result.params.get("cc", cc)
+                final_bcc = result.params.get("bcc", bcc)
+                final_connector_id = result.params.get("connector_id")
+
+                from sqlalchemy.future import select
+
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+                _gmail_types = [
+                    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+                ]
+
+                if final_connector_id is not None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_gmail_types),
                         )
-                    if config_data.get("client_secret"):
-                        config_data["client_secret"] = token_encryption.decrypt_token(
-                            config_data["client_secret"]
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Gmail connector is invalid or has been disconnected.",
+                        }
+                    actual_connector_id = connector.id
+                else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_gmail_types),
                         )
-
-                exp = config_data.get("expiry", "")
-                if exp:
-                    exp = exp.replace("Z", "")
-
-                creds = Credentials(
-                    token=config_data.get("token"),
-                    refresh_token=config_data.get("refresh_token"),
-                    token_uri=config_data.get("token_uri"),
-                    client_id=config_data.get("client_id"),
-                    client_secret=config_data.get("client_secret"),
-                    scopes=config_data.get("scopes", []),
-                    expiry=datetime.fromisoformat(exp) if exp else None,
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                        }
+                    actual_connector_id = connector.id
+
+                logger.info(
+                    f"Creating Gmail draft: to='{final_to}', subject='{final_subject}', connector={actual_connector_id}"
                 )
 
-            from googleapiclient.discovery import build
-
-            gmail_service = build("gmail", "v1", credentials=creds)
-
-            message = MIMEText(final_body)
-            message["to"] = final_to
-            message["subject"] = final_subject
-            if final_cc:
-                message["cc"] = final_cc
-            if final_bcc:
-                message["bcc"] = final_bcc
-            raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
-
-            try:
-                created = await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    lambda: (
-                        gmail_service.users()
-                        .drafts()
-                        .create(userId="me", body={"message": {"raw": raw}})
-                        .execute()
-                    ),
+                is_composio_gmail = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
                 )
-            except Exception as api_err:
-                from googleapiclient.errors import HttpError
+                if is_composio_gmail:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this Gmail connector.",
+                        }
+                else:
+                    from google.oauth2.credentials import Credentials
+
+                    from app.config import config
+                    from app.utils.oauth_security import TokenEncryption
+
+                    config_data = dict(connector.config)
+                    token_encrypted = config_data.get("_token_encrypted", False)
+                    if token_encrypted and config.SECRET_KEY:
+                        token_encryption = TokenEncryption(config.SECRET_KEY)
+                        if config_data.get("token"):
+                            config_data["token"] = token_encryption.decrypt_token(
+                                config_data["token"]
+                            )
+                        if config_data.get("refresh_token"):
+                            config_data["refresh_token"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["refresh_token"]
+                                )
+                            )
+                        if config_data.get("client_secret"):
+                            config_data["client_secret"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["client_secret"]
+                                )
+                            )
 
-                if isinstance(api_err, HttpError) and api_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                    exp = config_data.get("expiry", "")
+                    if exp:
+                        exp = exp.replace("Z", "")
+
+                    creds = Credentials(
+                        token=config_data.get("token"),
+                        refresh_token=config_data.get("refresh_token"),
+                        token_uri=config_data.get("token_uri"),
+                        client_id=config_data.get("client_id"),
+                        client_secret=config_data.get("client_secret"),
+                        scopes=config_data.get("scopes", []),
+                        expiry=datetime.fromisoformat(exp) if exp else None,
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
 
-                        _res = await db_session.execute(
-                            select(SearchSourceConnector).where(
-                                SearchSourceConnector.id == actual_connector_id
-                            )
+                message = MIMEText(final_body)
+                message["to"] = final_to
+                message["subject"] = final_subject
+                if final_cc:
+                    message["cc"] = final_cc
+                if final_bcc:
+                    message["bcc"] = final_bcc
+                raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
+
+                try:
+                    if is_composio_gmail:
+                        from app.agents.new_chat.tools.gmail.composio_helpers import (
+                            execute_composio_gmail_tool,
+                            split_recipients,
+                        )
+
+                        created, error = await execute_composio_gmail_tool(
+                            connector,
+                            user_id,
+                            "GMAIL_CREATE_EMAIL_DRAFT",
+                            {
+                                "user_id": "me",
+                                "recipient_email": final_to,
+                                "subject": final_subject,
+                                "body": final_body,
+                                "cc": split_recipients(final_cc),
+                                "bcc": split_recipients(final_bcc),
+                                "is_html": False,
+                            },
                         )
-                        _conn = _res.scalar_one_or_none()
-                        if _conn and not _conn.config.get("auth_expired"):
-                            _conn.config = {**_conn.config, "auth_expired": True}
-                            flag_modified(_conn, "config")
-                            await db_session.commit()
-                    except Exception:
+                        if error:
+                            raise RuntimeError(error)
+                        if not isinstance(created, dict):
+                            created = {}
+                    else:
+                        from googleapiclient.discovery import build
+
+                        gmail_service = build("gmail", "v1", credentials=creds)
+                        created = await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: (
+                                gmail_service.users()
+                                .drafts()
+                                .create(userId="me", body={"message": {"raw": raw}})
+                                .execute()
+                            ),
+                        )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
                         logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            actual_connector_id,
-                            exc_info=True,
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
                         )
-                    return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
-                    }
-                raise
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
 
-            logger.info(f"Gmail draft created: id={created.get('id')}")
-
-            kb_message_suffix = ""
-            try:
-                from app.services.gmail import GmailKBSyncService
-
-                kb_service = GmailKBSyncService(db_session)
-                draft_message = created.get("message", {})
-                kb_result = await kb_service.sync_after_create(
-                    message_id=draft_message.get("id", ""),
-                    thread_id=draft_message.get("threadId", ""),
-                    subject=final_subject,
-                    sender="me",
-                    date_str=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                    body_text=final_body,
-                    connector_id=actual_connector_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
-                    draft_id=created.get("id"),
-                )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
-                else:
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(f"Gmail draft created: id={created.get('id')}")
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.gmail import GmailKBSyncService
+
+                    kb_service = GmailKBSyncService(db_session)
+                    draft_message = created.get("message", {})
+                    kb_result = await kb_service.sync_after_create(
+                        message_id=draft_message.get("id", ""),
+                        thread_id=draft_message.get("threadId", ""),
+                        subject=final_subject,
+                        sender="me",
+                        date_str=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                        body_text=final_body,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                        draft_id=created.get("id"),
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This draft will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This draft will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This draft will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "draft_id": created.get("id"),
-                "message": f"Successfully created Gmail draft with subject '{final_subject}'.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "draft_id": created.get("id"),
+                    "message": f"Successfully created Gmail draft with subject '{final_subject}'.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/read_email.py b/surfsense_backend/app/agents/new_chat/tools/gmail/read_email.py
new file mode 100644
index 0000000000..4647135918
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/read_email.py
@@ -0,0 +1,172 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType, async_session_maker
+
+logger = logging.getLogger(__name__)
+
+_GMAIL_TYPES = [
+    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+]
+
+
+def create_read_gmail_email_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the read_gmail_email tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured read_gmail_email tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def read_gmail_email(message_id: str) -> dict[str, Any]:
+        """Read the full content of a specific Gmail email by its message ID.
+
+        Use after search_gmail to get the complete body of an email.
+
+        Args:
+            message_id: The Gmail message ID (from search_gmail results).
+
+        Returns:
+            Dictionary with status and the full email content formatted as markdown.
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Gmail tool not properly configured."}
+
+        try:
+            async with async_session_maker() as db_session:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_GMAIL_TYPES),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                    }
+
+                if (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+                ):
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found.",
+                        }
+
+                    from app.agents.new_chat.tools.gmail.search_emails import (
+                        _format_gmail_summary,
+                    )
+                    from app.services.composio_service import ComposioService
+
+                    service = ComposioService()
+                    detail, error = await service.get_gmail_message_detail(
+                        connected_account_id=cca_id,
+                        entity_id=f"surfsense_{user_id}",
+                        message_id=message_id,
+                    )
+                    if error:
+                        return {"status": "error", "message": error}
+                    if not detail:
+                        return {
+                            "status": "not_found",
+                            "message": f"Email with ID '{message_id}' not found.",
+                        }
+
+                    summary = _format_gmail_summary(detail)
+                    content = (
+                        f"# {summary['subject']}\n\n"
+                        f"**From:** {summary['from']}\n"
+                        f"**To:** {summary['to']}\n"
+                        f"**Date:** {summary['date']}\n\n"
+                        f"## Message Content\n\n"
+                        f"{detail.get('messageText') or detail.get('snippet') or ''}\n\n"
+                        f"## Message Details\n\n"
+                        f"- **Message ID:** {summary['message_id']}\n"
+                        f"- **Thread ID:** {summary['thread_id']}\n"
+                    )
+                    return {
+                        "status": "success",
+                        "message_id": summary["message_id"] or message_id,
+                        "content": content,
+                    }
+
+                from app.agents.new_chat.tools.gmail.search_emails import (
+                    _build_credentials,
+                )
+
+                creds = _build_credentials(connector)
+
+                from app.connectors.google_gmail_connector import GoogleGmailConnector
+
+                gmail = GoogleGmailConnector(
+                    credentials=creds,
+                    session=db_session,
+                    user_id=user_id,
+                    connector_id=connector.id,
+                )
+
+                detail, error = await gmail.get_message_details(message_id)
+                if error:
+                    if (
+                        "re-authenticate" in error.lower()
+                        or "authentication failed" in error.lower()
+                    ):
+                        return {
+                            "status": "auth_error",
+                            "message": error,
+                            "connector_type": "gmail",
+                        }
+                    return {"status": "error", "message": error}
+
+                if not detail:
+                    return {
+                        "status": "not_found",
+                        "message": f"Email with ID '{message_id}' not found.",
+                    }
+
+                content = gmail.format_message_to_markdown(detail)
+
+                return {
+                    "status": "success",
+                    "message_id": message_id,
+                    "content": content,
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Gmail email: %s", e, exc_info=True)
+            return {
+                "status": "error",
+                "message": "Failed to read email. Please try again.",
+            }
+
+    return read_gmail_email
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/search_emails.py b/surfsense_backend/app/agents/new_chat/tools/gmail/search_emails.py
new file mode 100644
index 0000000000..3ce154c53c
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/search_emails.py
@@ -0,0 +1,260 @@
+import logging
+from datetime import datetime
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType, async_session_maker
+
+logger = logging.getLogger(__name__)
+
+_GMAIL_TYPES = [
+    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+]
+
+_token_encryption_cache: object | None = None
+
+
+def _get_token_encryption():
+    global _token_encryption_cache
+    if _token_encryption_cache is None:
+        from app.config import config
+        from app.utils.oauth_security import TokenEncryption
+
+        if not config.SECRET_KEY:
+            raise RuntimeError("SECRET_KEY not configured for token decryption.")
+        _token_encryption_cache = TokenEncryption(config.SECRET_KEY)
+    return _token_encryption_cache
+
+
+def _build_credentials(connector: SearchSourceConnector):
+    """Build Google OAuth Credentials from a connector's stored config.
+
+    Handles both native OAuth connectors (with encrypted tokens) and
+    Composio-backed connectors. Shared by Gmail and Calendar tools.
+    """
+    from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
+
+    if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
+        raise ValueError("Composio connectors must use Composio tool execution.")
+
+    from google.oauth2.credentials import Credentials
+
+    cfg = dict(connector.config)
+    if cfg.get("_token_encrypted"):
+        enc = _get_token_encryption()
+        for key in ("token", "refresh_token", "client_secret"):
+            if cfg.get(key):
+                cfg[key] = enc.decrypt_token(cfg[key])
+
+    exp = (cfg.get("expiry") or "").replace("Z", "")
+    return Credentials(
+        token=cfg.get("token"),
+        refresh_token=cfg.get("refresh_token"),
+        token_uri=cfg.get("token_uri"),
+        client_id=cfg.get("client_id"),
+        client_secret=cfg.get("client_secret"),
+        scopes=cfg.get("scopes", []),
+        expiry=datetime.fromisoformat(exp) if exp else None,
+    )
+
+
+def _gmail_headers(message: dict[str, Any]) -> dict[str, str]:
+    headers = message.get("payload", {}).get("headers", [])
+    return {
+        header.get("name", "").lower(): header.get("value", "")
+        for header in headers
+        if isinstance(header, dict)
+    }
+
+
+def _format_gmail_summary(message: dict[str, Any]) -> dict[str, Any]:
+    headers = _gmail_headers(message)
+    return {
+        "message_id": message.get("id") or message.get("messageId"),
+        "thread_id": message.get("threadId"),
+        "subject": message.get("subject") or headers.get("subject", "No Subject"),
+        "from": message.get("sender") or headers.get("from", "Unknown"),
+        "to": message.get("to") or headers.get("to", ""),
+        "date": message.get("messageTimestamp") or headers.get("date", ""),
+        "snippet": message.get("snippet") or message.get("messageText", "")[:300],
+        "labels": message.get("labelIds", []),
+    }
+
+
+async def _search_composio_gmail(
+    connector: SearchSourceConnector,
+    user_id: str,
+    query: str,
+    max_results: int,
+) -> dict[str, Any]:
+    cca_id = connector.config.get("composio_connected_account_id")
+    if not cca_id:
+        return {
+            "status": "error",
+            "message": "Composio connected account ID not found.",
+        }
+
+    from app.services.composio_service import ComposioService
+
+    service = ComposioService()
+    messages, _next_token, _estimate, error = await service.get_gmail_messages(
+        connected_account_id=cca_id,
+        entity_id=f"surfsense_{user_id}",
+        query=query,
+        max_results=max_results,
+    )
+    if error:
+        return {"status": "error", "message": error}
+
+    emails = [_format_gmail_summary(message) for message in messages]
+    return {
+        "status": "success",
+        "emails": emails,
+        "total": len(emails),
+        "message": "No emails found." if not emails else None,
+    }
+
+
+def create_search_gmail_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the search_gmail tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured search_gmail tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def search_gmail(
+        query: str,
+        max_results: int = 10,
+    ) -> dict[str, Any]:
+        """Search emails in the user's Gmail inbox using Gmail search syntax.
+
+        Args:
+            query: Gmail search query, same syntax as the Gmail search bar.
+                Examples: "from:alice@example.com", "subject:meeting",
+                "is:unread", "after:2024/01/01 before:2024/02/01",
+                "has:attachment", "in:sent".
+            max_results: Number of emails to return (default 10, max 20).
+
+        Returns:
+            Dictionary with status and a list of email summaries including
+            message_id, subject, from, date, snippet.
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Gmail tool not properly configured."}
+
+        max_results = min(max_results, 20)
+
+        try:
+            async with async_session_maker() as db_session:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_GMAIL_TYPES),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                    }
+
+                if (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+                ):
+                    return await _search_composio_gmail(
+                        connector, str(user_id), query, max_results
+                    )
+
+                creds = _build_credentials(connector)
+
+                from app.connectors.google_gmail_connector import GoogleGmailConnector
+
+                gmail = GoogleGmailConnector(
+                    credentials=creds,
+                    session=db_session,
+                    user_id=user_id,
+                    connector_id=connector.id,
+                )
+
+                messages_list, error = await gmail.get_messages_list(
+                    max_results=max_results, query=query
+                )
+                if error:
+                    if (
+                        "re-authenticate" in error.lower()
+                        or "authentication failed" in error.lower()
+                    ):
+                        return {
+                            "status": "auth_error",
+                            "message": error,
+                            "connector_type": "gmail",
+                        }
+                    return {"status": "error", "message": error}
+
+                if not messages_list:
+                    return {
+                        "status": "success",
+                        "emails": [],
+                        "total": 0,
+                        "message": "No emails found.",
+                    }
+
+                emails = []
+                for msg in messages_list:
+                    detail, err = await gmail.get_message_details(msg["id"])
+                    if err:
+                        continue
+                    headers = {
+                        h["name"].lower(): h["value"]
+                        for h in detail.get("payload", {}).get("headers", [])
+                    }
+                    emails.append(
+                        {
+                            "message_id": detail.get("id"),
+                            "thread_id": detail.get("threadId"),
+                            "subject": headers.get("subject", "No Subject"),
+                            "from": headers.get("from", "Unknown"),
+                            "to": headers.get("to", ""),
+                            "date": headers.get("date", ""),
+                            "snippet": detail.get("snippet", ""),
+                            "labels": detail.get("labelIds", []),
+                        }
+                    )
+
+                return {"status": "success", "emails": emails, "total": len(emails)}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error searching Gmail: %s", e, exc_info=True)
+            return {
+                "status": "error",
+                "message": "Failed to search Gmail. Please try again.",
+            }
+
+    return search_gmail
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/send_email.py b/surfsense_backend/app/agents/new_chat/tools/gmail/send_email.py
index 2599578bd9..4d5aa3bcce 100644
--- a/surfsense_backend/app/agents/new_chat/tools/gmail/send_email.py
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/send_email.py
@@ -6,9 +6,10 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
 from app.services.gmail import GmailToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,23 @@ def create_send_gmail_email_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the send_gmail_email tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured send_gmail_email tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def send_gmail_email(
         to: str,
@@ -58,275 +76,277 @@ async def send_gmail_email(
         """
         logger.info(f"send_gmail_email called: to='{to}', subject='{subject}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Gmail tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GmailToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
-
-            if "error" in context:
-                logger.error(f"Failed to fetch creation context: {context['error']}")
-                return {"status": "error", "message": context["error"]}
-
-            accounts = context.get("accounts", [])
-            if accounts and all(a.get("auth_expired") for a in accounts):
-                logger.warning("All Gmail accounts have expired authentication")
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Gmail accounts need re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "gmail",
-                }
-
-            logger.info(
-                f"Requesting approval for sending Gmail email: to='{to}', subject='{subject}'"
-            )
-            approval = interrupt(
-                {
-                    "type": "gmail_email_send",
-                    "action": {
-                        "tool": "send_gmail_email",
-                        "params": {
-                            "to": to,
-                            "subject": subject,
-                            "body": body,
-                            "cc": cc,
-                            "bcc": bcc,
-                            "connector_id": None,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The email was not sent. Do not ask again or suggest alternatives.",
-                }
-
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_to = final_params.get("to", to)
-            final_subject = final_params.get("subject", subject)
-            final_body = final_params.get("body", body)
-            final_cc = final_params.get("cc", cc)
-            final_bcc = final_params.get("bcc", bcc)
-            final_connector_id = final_params.get("connector_id")
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            _gmail_types = [
-                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
-            ]
-
-            if final_connector_id is not None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_gmail_types),
-                    )
+            async with async_session_maker() as db_session:
+                metadata_service = GmailToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
                 )
-                connector = result.scalars().first()
-                if not connector:
-                    return {
-                        "status": "error",
-                        "message": "Selected Gmail connector is invalid or has been disconnected.",
-                    }
-                actual_connector_id = connector.id
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_gmail_types),
+
+                if "error" in context:
+                    logger.error(
+                        f"Failed to fetch creation context: {context['error']}"
                     )
-                )
-                connector = result.scalars().first()
-                if not connector:
+                    return {"status": "error", "message": context["error"]}
+
+                accounts = context.get("accounts", [])
+                if accounts and all(a.get("auth_expired") for a in accounts):
+                    logger.warning("All Gmail accounts have expired authentication")
                     return {
-                        "status": "error",
-                        "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                        "status": "auth_error",
+                        "message": "All connected Gmail accounts need re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "gmail",
                     }
-                actual_connector_id = connector.id
-
-            logger.info(
-                f"Sending Gmail email: to='{final_to}', subject='{final_subject}', connector={actual_connector_id}"
-            )
 
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
+                logger.info(
+                    f"Requesting approval for sending Gmail email: to='{to}', subject='{subject}'"
+                )
+                result = request_approval(
+                    action_type="gmail_email_send",
+                    tool_name="send_gmail_email",
+                    params={
+                        "to": to,
+                        "subject": subject,
+                        "body": body,
+                        "cc": cc,
+                        "bcc": bcc,
+                        "connector_id": None,
+                    },
+                    context=context,
+                )
 
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    creds = build_composio_credentials(cca_id)
-                else:
+                if result.rejected:
                     return {
-                        "status": "error",
-                        "message": "Composio connected account ID not found for this Gmail connector.",
+                        "status": "rejected",
+                        "message": "User declined. The email was not sent. Do not ask again or suggest alternatives.",
                     }
-            else:
-                from google.oauth2.credentials import Credentials
-
-                from app.config import config
-                from app.utils.oauth_security import TokenEncryption
-
-                config_data = dict(connector.config)
-                token_encrypted = config_data.get("_token_encrypted", False)
-                if token_encrypted and config.SECRET_KEY:
-                    token_encryption = TokenEncryption(config.SECRET_KEY)
-                    if config_data.get("token"):
-                        config_data["token"] = token_encryption.decrypt_token(
-                            config_data["token"]
-                        )
-                    if config_data.get("refresh_token"):
-                        config_data["refresh_token"] = token_encryption.decrypt_token(
-                            config_data["refresh_token"]
+
+                final_to = result.params.get("to", to)
+                final_subject = result.params.get("subject", subject)
+                final_body = result.params.get("body", body)
+                final_cc = result.params.get("cc", cc)
+                final_bcc = result.params.get("bcc", bcc)
+                final_connector_id = result.params.get("connector_id")
+
+                from sqlalchemy.future import select
+
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+                _gmail_types = [
+                    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+                ]
+
+                if final_connector_id is not None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_gmail_types),
                         )
-                    if config_data.get("client_secret"):
-                        config_data["client_secret"] = token_encryption.decrypt_token(
-                            config_data["client_secret"]
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Gmail connector is invalid or has been disconnected.",
+                        }
+                    actual_connector_id = connector.id
+                else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_gmail_types),
                         )
-
-                exp = config_data.get("expiry", "")
-                if exp:
-                    exp = exp.replace("Z", "")
-
-                creds = Credentials(
-                    token=config_data.get("token"),
-                    refresh_token=config_data.get("refresh_token"),
-                    token_uri=config_data.get("token_uri"),
-                    client_id=config_data.get("client_id"),
-                    client_secret=config_data.get("client_secret"),
-                    scopes=config_data.get("scopes", []),
-                    expiry=datetime.fromisoformat(exp) if exp else None,
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "No Gmail connector found. Please connect Gmail in your workspace settings.",
+                        }
+                    actual_connector_id = connector.id
+
+                logger.info(
+                    f"Sending Gmail email: to='{final_to}', subject='{final_subject}', connector={actual_connector_id}"
                 )
 
-            from googleapiclient.discovery import build
-
-            gmail_service = build("gmail", "v1", credentials=creds)
-
-            message = MIMEText(final_body)
-            message["to"] = final_to
-            message["subject"] = final_subject
-            if final_cc:
-                message["cc"] = final_cc
-            if final_bcc:
-                message["bcc"] = final_bcc
-            raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
-
-            try:
-                sent = await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    lambda: (
-                        gmail_service.users()
-                        .messages()
-                        .send(userId="me", body={"raw": raw})
-                        .execute()
-                    ),
+                is_composio_gmail = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
                 )
-            except Exception as api_err:
-                from googleapiclient.errors import HttpError
+                if is_composio_gmail:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this Gmail connector.",
+                        }
+                else:
+                    from google.oauth2.credentials import Credentials
+
+                    from app.config import config
+                    from app.utils.oauth_security import TokenEncryption
+
+                    config_data = dict(connector.config)
+                    token_encrypted = config_data.get("_token_encrypted", False)
+                    if token_encrypted and config.SECRET_KEY:
+                        token_encryption = TokenEncryption(config.SECRET_KEY)
+                        if config_data.get("token"):
+                            config_data["token"] = token_encryption.decrypt_token(
+                                config_data["token"]
+                            )
+                        if config_data.get("refresh_token"):
+                            config_data["refresh_token"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["refresh_token"]
+                                )
+                            )
+                        if config_data.get("client_secret"):
+                            config_data["client_secret"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["client_secret"]
+                                )
+                            )
 
-                if isinstance(api_err, HttpError) and api_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                    exp = config_data.get("expiry", "")
+                    if exp:
+                        exp = exp.replace("Z", "")
+
+                    creds = Credentials(
+                        token=config_data.get("token"),
+                        refresh_token=config_data.get("refresh_token"),
+                        token_uri=config_data.get("token_uri"),
+                        client_id=config_data.get("client_id"),
+                        client_secret=config_data.get("client_secret"),
+                        scopes=config_data.get("scopes", []),
+                        expiry=datetime.fromisoformat(exp) if exp else None,
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
 
-                        _res = await db_session.execute(
-                            select(SearchSourceConnector).where(
-                                SearchSourceConnector.id == actual_connector_id
-                            )
+                message = MIMEText(final_body)
+                message["to"] = final_to
+                message["subject"] = final_subject
+                if final_cc:
+                    message["cc"] = final_cc
+                if final_bcc:
+                    message["bcc"] = final_bcc
+                raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
+
+                try:
+                    if is_composio_gmail:
+                        from app.agents.new_chat.tools.gmail.composio_helpers import (
+                            execute_composio_gmail_tool,
+                            split_recipients,
+                        )
+
+                        sent, error = await execute_composio_gmail_tool(
+                            connector,
+                            user_id,
+                            "GMAIL_SEND_EMAIL",
+                            {
+                                "user_id": "me",
+                                "recipient_email": final_to,
+                                "subject": final_subject,
+                                "body": final_body,
+                                "cc": split_recipients(final_cc),
+                                "bcc": split_recipients(final_bcc),
+                                "is_html": False,
+                            },
                         )
-                        _conn = _res.scalar_one_or_none()
-                        if _conn and not _conn.config.get("auth_expired"):
-                            _conn.config = {**_conn.config, "auth_expired": True}
-                            flag_modified(_conn, "config")
-                            await db_session.commit()
-                    except Exception:
+                        if error:
+                            raise RuntimeError(error)
+                        if not isinstance(sent, dict):
+                            sent = {}
+                    else:
+                        from googleapiclient.discovery import build
+
+                        gmail_service = build("gmail", "v1", credentials=creds)
+                        sent = await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: (
+                                gmail_service.users()
+                                .messages()
+                                .send(userId="me", body={"raw": raw})
+                                .execute()
+                            ),
+                        )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
                         logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            actual_connector_id,
-                            exc_info=True,
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
                         )
-                    return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
-                    }
-                raise
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
 
-            logger.info(
-                f"Gmail email sent: id={sent.get('id')}, threadId={sent.get('threadId')}"
-            )
-
-            kb_message_suffix = ""
-            try:
-                from app.services.gmail import GmailKBSyncService
-
-                kb_service = GmailKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    message_id=sent.get("id", ""),
-                    thread_id=sent.get("threadId", ""),
-                    subject=final_subject,
-                    sender="me",
-                    date_str=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                    body_text=final_body,
-                    connector_id=actual_connector_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Gmail email sent: id={sent.get('id')}, threadId={sent.get('threadId')}"
                 )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
-                else:
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.gmail import GmailKBSyncService
+
+                    kb_service = GmailKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        message_id=sent.get("id", ""),
+                        thread_id=sent.get("threadId", ""),
+                        subject=final_subject,
+                        sender="me",
+                        date_str=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                        body_text=final_body,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This email will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after send failed: {kb_err}")
                     kb_message_suffix = " This email will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after send failed: {kb_err}")
-                kb_message_suffix = " This email will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "message_id": sent.get("id"),
-                "thread_id": sent.get("threadId"),
-                "message": f"Successfully sent email to '{final_to}' with subject '{final_subject}'.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "message_id": sent.get("id"),
+                    "thread_id": sent.get("threadId"),
+                    "message": f"Successfully sent email to '{final_to}' with subject '{final_subject}'.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/trash_email.py b/surfsense_backend/app/agents/new_chat/tools/gmail/trash_email.py
index 1460208458..95f5b4e6c5 100644
--- a/surfsense_backend/app/agents/new_chat/tools/gmail/trash_email.py
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/trash_email.py
@@ -4,9 +4,10 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
 from app.services.gmail import GmailToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -17,6 +18,23 @@ def create_trash_gmail_email_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the trash_gmail_email tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured trash_gmail_email tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def trash_gmail_email(
         email_subject_or_id: str,
@@ -55,272 +73,261 @@ async def trash_gmail_email(
             f"trash_gmail_email called: email_subject_or_id='{email_subject_or_id}', delete_from_kb={delete_from_kb}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Gmail tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GmailToolMetadataService(db_session)
-            context = await metadata_service.get_trash_context(
-                search_space_id, user_id, email_subject_or_id
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Email not found: {error_msg}")
-                    return {"status": "not_found", "message": error_msg}
-                logger.error(f"Failed to fetch trash context: {error_msg}")
-                return {"status": "error", "message": error_msg}
-
-            account = context.get("account", {})
-            if account.get("auth_expired"):
-                logger.warning(
-                    "Gmail account %s has expired authentication",
-                    account.get("id"),
+            async with async_session_maker() as db_session:
+                metadata_service = GmailToolMetadataService(db_session)
+                context = await metadata_service.get_trash_context(
+                    search_space_id, user_id, email_subject_or_id
                 )
-                return {
-                    "status": "auth_error",
-                    "message": "The Gmail account for this email needs re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "gmail",
-                }
-
-            email = context["email"]
-            message_id = email["message_id"]
-            document_id = email.get("document_id")
-            connector_id_from_context = context["account"]["id"]
-
-            if not message_id:
-                return {
-                    "status": "error",
-                    "message": "Message ID is missing from the indexed document. Please re-index the email and try again.",
-                }
 
-            logger.info(
-                f"Requesting approval for trashing Gmail email: '{email_subject_or_id}' (message_id={message_id}, delete_from_kb={delete_from_kb})"
-            )
-            approval = interrupt(
-                {
-                    "type": "gmail_email_trash",
-                    "action": {
-                        "tool": "trash_gmail_email",
-                        "params": {
-                            "message_id": message_id,
-                            "connector_id": connector_id_from_context,
-                            "delete_from_kb": delete_from_kb,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The email was not trashed. Do not ask again or suggest alternatives.",
-                }
-
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_message_id = final_params.get("message_id", message_id)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
-
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this email.",
-                }
+                if "error" in context:
+                    error_msg = context["error"]
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Email not found: {error_msg}")
+                        return {"status": "not_found", "message": error_msg}
+                    logger.error(f"Failed to fetch trash context: {error_msg}")
+                    return {"status": "error", "message": error_msg}
 
-            from sqlalchemy.future import select
+                account = context.get("account", {})
+                if account.get("auth_expired"):
+                    logger.warning(
+                        "Gmail account %s has expired authentication",
+                        account.get("id"),
+                    )
+                    return {
+                        "status": "auth_error",
+                        "message": "The Gmail account for this email needs re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "gmail",
+                    }
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                email = context["email"]
+                message_id = email["message_id"]
+                document_id = email.get("document_id")
+                connector_id_from_context = context["account"]["id"]
 
-            _gmail_types = [
-                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
-            ]
+                if not message_id:
+                    return {
+                        "status": "error",
+                        "message": "Message ID is missing from the indexed document. Please re-index the email and try again.",
+                    }
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type.in_(_gmail_types),
+                logger.info(
+                    f"Requesting approval for trashing Gmail email: '{email_subject_or_id}' (message_id={message_id}, delete_from_kb={delete_from_kb})"
+                )
+                result = request_approval(
+                    action_type="gmail_email_trash",
+                    tool_name="trash_gmail_email",
+                    params={
+                        "message_id": message_id,
+                        "connector_id": connector_id_from_context,
+                        "delete_from_kb": delete_from_kb,
+                    },
+                    context=context,
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Gmail connector is invalid or has been disconnected.",
-                }
 
-            logger.info(
-                f"Trashing Gmail email: message_id='{final_message_id}', connector={final_connector_id}"
-            )
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. The email was not trashed. Do not ask again or suggest alternatives.",
+                    }
 
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
+                final_message_id = result.params.get("message_id", message_id)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
+                )
 
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    creds = build_composio_credentials(cca_id)
-                else:
+                if not final_connector_id:
                     return {
                         "status": "error",
-                        "message": "Composio connected account ID not found for this Gmail connector.",
+                        "message": "No connector found for this email.",
                     }
-            else:
-                from google.oauth2.credentials import Credentials
-
-                from app.config import config
-                from app.utils.oauth_security import TokenEncryption
-
-                config_data = dict(connector.config)
-                token_encrypted = config_data.get("_token_encrypted", False)
-                if token_encrypted and config.SECRET_KEY:
-                    token_encryption = TokenEncryption(config.SECRET_KEY)
-                    if config_data.get("token"):
-                        config_data["token"] = token_encryption.decrypt_token(
-                            config_data["token"]
-                        )
-                    if config_data.get("refresh_token"):
-                        config_data["refresh_token"] = token_encryption.decrypt_token(
-                            config_data["refresh_token"]
-                        )
-                    if config_data.get("client_secret"):
-                        config_data["client_secret"] = token_encryption.decrypt_token(
-                            config_data["client_secret"]
-                        )
 
-                exp = config_data.get("expiry", "")
-                if exp:
-                    exp = exp.replace("Z", "")
-
-                creds = Credentials(
-                    token=config_data.get("token"),
-                    refresh_token=config_data.get("refresh_token"),
-                    token_uri=config_data.get("token_uri"),
-                    client_id=config_data.get("client_id"),
-                    client_secret=config_data.get("client_secret"),
-                    scopes=config_data.get("scopes", []),
-                    expiry=datetime.fromisoformat(exp) if exp else None,
-                )
+                from sqlalchemy.future import select
 
-            from googleapiclient.discovery import build
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            gmail_service = build("gmail", "v1", credentials=creds)
+                _gmail_types = [
+                    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+                ]
 
-            try:
-                await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    lambda: (
-                        gmail_service.users()
-                        .messages()
-                        .trash(userId="me", id=final_message_id)
-                        .execute()
-                    ),
-                )
-            except Exception as api_err:
-                from googleapiclient.errors import HttpError
-
-                if isinstance(api_err, HttpError) and api_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {connector.id}: {api_err}"
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_gmail_types),
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
-
-                        if not connector.config.get("auth_expired"):
-                            connector.config = {
-                                **connector.config,
-                                "auth_expired": True,
-                            }
-                            flag_modified(connector, "config")
-                            await db_session.commit()
-                    except Exception:
-                        logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            connector.id,
-                            exc_info=True,
-                        )
+                )
+                connector = result.scalars().first()
+                if not connector:
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": connector.id,
-                        "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "error",
+                        "message": "Selected Gmail connector is invalid or has been disconnected.",
                     }
-                raise
 
-            logger.info(f"Gmail email trashed: message_id={final_message_id}")
+                logger.info(
+                    f"Trashing Gmail email: message_id='{final_message_id}', connector={final_connector_id}"
+                )
 
-            trash_result: dict[str, Any] = {
-                "status": "success",
-                "message_id": final_message_id,
-                "message": f"Successfully moved email '{email.get('subject', email_subject_or_id)}' to trash.",
-            }
+                is_composio_gmail = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
+                )
+                if is_composio_gmail:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this Gmail connector.",
+                        }
+                else:
+                    from google.oauth2.credentials import Credentials
+
+                    from app.config import config
+                    from app.utils.oauth_security import TokenEncryption
+
+                    config_data = dict(connector.config)
+                    token_encrypted = config_data.get("_token_encrypted", False)
+                    if token_encrypted and config.SECRET_KEY:
+                        token_encryption = TokenEncryption(config.SECRET_KEY)
+                        if config_data.get("token"):
+                            config_data["token"] = token_encryption.decrypt_token(
+                                config_data["token"]
+                            )
+                        if config_data.get("refresh_token"):
+                            config_data["refresh_token"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["refresh_token"]
+                                )
+                            )
+                        if config_data.get("client_secret"):
+                            config_data["client_secret"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["client_secret"]
+                                )
+                            )
+
+                    exp = config_data.get("expiry", "")
+                    if exp:
+                        exp = exp.replace("Z", "")
+
+                    creds = Credentials(
+                        token=config_data.get("token"),
+                        refresh_token=config_data.get("refresh_token"),
+                        token_uri=config_data.get("token_uri"),
+                        client_id=config_data.get("client_id"),
+                        client_secret=config_data.get("client_secret"),
+                        scopes=config_data.get("scopes", []),
+                        expiry=datetime.fromisoformat(exp) if exp else None,
+                    )
 
-            deleted_from_kb = False
-            if final_delete_from_kb and document_id:
                 try:
-                    from app.db import Document
+                    if is_composio_gmail:
+                        from app.agents.new_chat.tools.gmail.composio_helpers import (
+                            execute_composio_gmail_tool,
+                        )
 
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
-                    )
-                    document = doc_result.scalars().first()
-                    if document:
-                        await db_session.delete(document)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                        logger.info(
-                            f"Deleted document {document_id} from knowledge base"
+                        _trashed, error = await execute_composio_gmail_tool(
+                            connector,
+                            user_id,
+                            "GMAIL_MOVE_TO_TRASH",
+                            {"user_id": "me", "message_id": final_message_id},
                         )
+                        if error:
+                            raise RuntimeError(error)
                     else:
-                        logger.warning(f"Document {document_id} not found in KB")
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-                    trash_result["warning"] = (
-                        f"Email trashed, but failed to remove from knowledge base: {e!s}"
-                    )
+                        from googleapiclient.discovery import build
+
+                        gmail_service = build("gmail", "v1", credentials=creds)
+                        await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: (
+                                gmail_service.users()
+                                .messages()
+                                .trash(userId="me", id=final_message_id)
+                                .execute()
+                            ),
+                        )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
 
-            trash_result["deleted_from_kb"] = deleted_from_kb
-            if deleted_from_kb:
-                trash_result["message"] = (
-                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
-                )
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {connector.id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            if not connector.config.get("auth_expired"):
+                                connector.config = {
+                                    **connector.config,
+                                    "auth_expired": True,
+                                }
+                                flag_modified(connector, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                connector.id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": connector.id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(f"Gmail email trashed: message_id={final_message_id}")
+
+                trash_result: dict[str, Any] = {
+                    "status": "success",
+                    "message_id": final_message_id,
+                    "message": f"Successfully moved email '{email.get('subject', email_subject_or_id)}' to trash.",
+                }
+
+                deleted_from_kb = False
+                if final_delete_from_kb and document_id:
+                    try:
+                        from app.db import Document
+
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        document = doc_result.scalars().first()
+                        if document:
+                            await db_session.delete(document)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                            logger.info(
+                                f"Deleted document {document_id} from knowledge base"
+                            )
+                        else:
+                            logger.warning(f"Document {document_id} not found in KB")
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+                        trash_result["warning"] = (
+                            f"Email trashed, but failed to remove from knowledge base: {e!s}"
+                        )
+
+                trash_result["deleted_from_kb"] = deleted_from_kb
+                if deleted_from_kb:
+                    trash_result["message"] = (
+                        f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                    )
 
-            return trash_result
+                return trash_result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/gmail/update_draft.py b/surfsense_backend/app/agents/new_chat/tools/gmail/update_draft.py
index 28deec2b43..129b7defb5 100644
--- a/surfsense_backend/app/agents/new_chat/tools/gmail/update_draft.py
+++ b/surfsense_backend/app/agents/new_chat/tools/gmail/update_draft.py
@@ -6,9 +6,10 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
 from app.services.gmail import GmailToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,23 @@ def create_update_gmail_draft_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the update_gmail_draft tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured update_gmail_draft tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def update_gmail_draft(
         draft_subject_or_id: str,
@@ -76,322 +94,329 @@ async def update_gmail_draft(
             f"update_gmail_draft called: draft_subject_or_id='{draft_subject_or_id}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Gmail tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GmailToolMetadataService(db_session)
-            context = await metadata_service.get_update_context(
-                search_space_id, user_id, draft_subject_or_id
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Draft not found: {error_msg}")
-                    return {"status": "not_found", "message": error_msg}
-                logger.error(f"Failed to fetch update context: {error_msg}")
-                return {"status": "error", "message": error_msg}
-
-            account = context.get("account", {})
-            if account.get("auth_expired"):
-                logger.warning(
-                    "Gmail account %s has expired authentication",
-                    account.get("id"),
+            async with async_session_maker() as db_session:
+                metadata_service = GmailToolMetadataService(db_session)
+                context = await metadata_service.get_update_context(
+                    search_space_id, user_id, draft_subject_or_id
                 )
-                return {
-                    "status": "auth_error",
-                    "message": "The Gmail account for this draft needs re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "gmail",
-                }
 
-            email = context["email"]
-            message_id = email["message_id"]
-            document_id = email.get("document_id")
-            connector_id_from_context = account["id"]
-            draft_id_from_context = context.get("draft_id")
+                if "error" in context:
+                    error_msg = context["error"]
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Draft not found: {error_msg}")
+                        return {"status": "not_found", "message": error_msg}
+                    logger.error(f"Failed to fetch update context: {error_msg}")
+                    return {"status": "error", "message": error_msg}
 
-            original_subject = email.get("subject", draft_subject_or_id)
-            final_subject_default = subject if subject else original_subject
-            final_to_default = to if to else ""
+                account = context.get("account", {})
+                if account.get("auth_expired"):
+                    logger.warning(
+                        "Gmail account %s has expired authentication",
+                        account.get("id"),
+                    )
+                    return {
+                        "status": "auth_error",
+                        "message": "The Gmail account for this draft needs re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "gmail",
+                    }
 
-            logger.info(
-                f"Requesting approval for updating Gmail draft: '{original_subject}' "
-                f"(message_id={message_id}, draft_id={draft_id_from_context})"
-            )
-            approval = interrupt(
-                {
-                    "type": "gmail_draft_update",
-                    "action": {
-                        "tool": "update_gmail_draft",
-                        "params": {
-                            "message_id": message_id,
-                            "draft_id": draft_id_from_context,
-                            "to": final_to_default,
-                            "subject": final_subject_default,
-                            "body": body,
-                            "cc": cc,
-                            "bcc": bcc,
-                            "connector_id": connector_id_from_context,
-                        },
-                    },
-                    "context": context,
-                }
-            )
+                email = context["email"]
+                message_id = email["message_id"]
+                document_id = email.get("document_id")
+                connector_id_from_context = account["id"]
+                draft_id_from_context = context.get("draft_id")
 
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
+                original_subject = email.get("subject", draft_subject_or_id)
+                final_subject_default = subject if subject else original_subject
+                final_to_default = to if to else ""
 
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
+                logger.info(
+                    f"Requesting approval for updating Gmail draft: '{original_subject}' "
+                    f"(message_id={message_id}, draft_id={draft_id_from_context})"
+                )
+                result = request_approval(
+                    action_type="gmail_draft_update",
+                    tool_name="update_gmail_draft",
+                    params={
+                        "message_id": message_id,
+                        "draft_id": draft_id_from_context,
+                        "to": final_to_default,
+                        "subject": final_subject_default,
+                        "body": body,
+                        "cc": cc,
+                        "bcc": bcc,
+                        "connector_id": connector_id_from_context,
+                    },
+                    context=context,
+                )
 
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The draft was not updated. Do not ask again or suggest alternatives.",
-                }
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. The draft was not updated. Do not ask again or suggest alternatives.",
+                    }
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_to = final_params.get("to", final_to_default)
-            final_subject = final_params.get("subject", final_subject_default)
-            final_body = final_params.get("body", body)
-            final_cc = final_params.get("cc", cc)
-            final_bcc = final_params.get("bcc", bcc)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_draft_id = final_params.get("draft_id", draft_id_from_context)
+                final_to = result.params.get("to", final_to_default)
+                final_subject = result.params.get("subject", final_subject_default)
+                final_body = result.params.get("body", body)
+                final_cc = result.params.get("cc", cc)
+                final_bcc = result.params.get("bcc", bcc)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_draft_id = result.params.get("draft_id", draft_id_from_context)
 
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this draft.",
-                }
+                if not final_connector_id:
+                    return {
+                        "status": "error",
+                        "message": "No connector found for this draft.",
+                    }
 
-            from sqlalchemy.future import select
+                from sqlalchemy.future import select
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            _gmail_types = [
-                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
-            ]
+                _gmail_types = [
+                    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+                ]
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type.in_(_gmail_types),
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_gmail_types),
+                    )
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Gmail connector is invalid or has been disconnected.",
-                }
-
-            logger.info(
-                f"Updating Gmail draft: subject='{final_subject}', connector={final_connector_id}"
-            )
-
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
-
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    creds = build_composio_credentials(cca_id)
-                else:
+                connector = result.scalars().first()
+                if not connector:
                     return {
                         "status": "error",
-                        "message": "Composio connected account ID not found for this Gmail connector.",
+                        "message": "Selected Gmail connector is invalid or has been disconnected.",
                     }
-            else:
-                from google.oauth2.credentials import Credentials
-
-                from app.config import config
-                from app.utils.oauth_security import TokenEncryption
-
-                config_data = dict(connector.config)
-                token_encrypted = config_data.get("_token_encrypted", False)
-                if token_encrypted and config.SECRET_KEY:
-                    token_encryption = TokenEncryption(config.SECRET_KEY)
-                    if config_data.get("token"):
-                        config_data["token"] = token_encryption.decrypt_token(
-                            config_data["token"]
-                        )
-                    if config_data.get("refresh_token"):
-                        config_data["refresh_token"] = token_encryption.decrypt_token(
-                            config_data["refresh_token"]
-                        )
-                    if config_data.get("client_secret"):
-                        config_data["client_secret"] = token_encryption.decrypt_token(
-                            config_data["client_secret"]
-                        )
-
-                exp = config_data.get("expiry", "")
-                if exp:
-                    exp = exp.replace("Z", "")
-
-                creds = Credentials(
-                    token=config_data.get("token"),
-                    refresh_token=config_data.get("refresh_token"),
-                    token_uri=config_data.get("token_uri"),
-                    client_id=config_data.get("client_id"),
-                    client_secret=config_data.get("client_secret"),
-                    scopes=config_data.get("scopes", []),
-                    expiry=datetime.fromisoformat(exp) if exp else None,
-                )
 
-            from googleapiclient.discovery import build
-
-            gmail_service = build("gmail", "v1", credentials=creds)
-
-            # Resolve draft_id if not already available
-            if not final_draft_id:
                 logger.info(
-                    f"draft_id not in metadata, looking up via drafts.list for message_id={message_id}"
-                )
-                final_draft_id = await _find_draft_id_by_message(
-                    gmail_service, message_id
+                    f"Updating Gmail draft: subject='{final_subject}', connector={final_connector_id}"
                 )
 
-            if not final_draft_id:
-                return {
-                    "status": "error",
-                    "message": (
-                        "Could not find this draft in Gmail. "
-                        "It may have already been sent or deleted."
-                    ),
-                }
-
-            message = MIMEText(final_body)
-            if final_to:
-                message["to"] = final_to
-            message["subject"] = final_subject
-            if final_cc:
-                message["cc"] = final_cc
-            if final_bcc:
-                message["bcc"] = final_bcc
-            raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
-
-            try:
-                updated = await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    lambda: (
-                        gmail_service.users()
-                        .drafts()
-                        .update(
-                            userId="me",
-                            id=final_draft_id,
-                            body={"message": {"raw": raw}},
-                        )
-                        .execute()
-                    ),
+                is_composio_gmail = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
                 )
-            except Exception as api_err:
-                from googleapiclient.errors import HttpError
+                if is_composio_gmail:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this Gmail connector.",
+                        }
+                else:
+                    from google.oauth2.credentials import Credentials
+
+                    from app.config import config
+                    from app.utils.oauth_security import TokenEncryption
+
+                    config_data = dict(connector.config)
+                    token_encrypted = config_data.get("_token_encrypted", False)
+                    if token_encrypted and config.SECRET_KEY:
+                        token_encryption = TokenEncryption(config.SECRET_KEY)
+                        if config_data.get("token"):
+                            config_data["token"] = token_encryption.decrypt_token(
+                                config_data["token"]
+                            )
+                        if config_data.get("refresh_token"):
+                            config_data["refresh_token"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["refresh_token"]
+                                )
+                            )
+                        if config_data.get("client_secret"):
+                            config_data["client_secret"] = (
+                                token_encryption.decrypt_token(
+                                    config_data["client_secret"]
+                                )
+                            )
+
+                    exp = config_data.get("expiry", "")
+                    if exp:
+                        exp = exp.replace("Z", "")
+
+                    creds = Credentials(
+                        token=config_data.get("token"),
+                        refresh_token=config_data.get("refresh_token"),
+                        token_uri=config_data.get("token_uri"),
+                        client_id=config_data.get("client_id"),
+                        client_secret=config_data.get("client_secret"),
+                        scopes=config_data.get("scopes", []),
+                        expiry=datetime.fromisoformat(exp) if exp else None,
+                    )
 
-                if isinstance(api_err, HttpError) and api_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {connector.id}: {api_err}"
+                # Resolve draft_id if not already available
+                if not final_draft_id:
+                    logger.info(
+                        f"draft_id not in metadata, looking up via drafts.list for message_id={message_id}"
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
+                    if is_composio_gmail:
+                        final_draft_id = await _find_composio_draft_id_by_message(
+                            connector, user_id, message_id
+                        )
+                    else:
+                        from googleapiclient.discovery import build
 
-                        if not connector.config.get("auth_expired"):
-                            connector.config = {
-                                **connector.config,
-                                "auth_expired": True,
-                            }
-                            flag_modified(connector, "config")
-                            await db_session.commit()
-                    except Exception:
-                        logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            connector.id,
-                            exc_info=True,
+                        gmail_service = build("gmail", "v1", credentials=creds)
+                        final_draft_id = await _find_draft_id_by_message(
+                            gmail_service, message_id
                         )
-                    return {
-                        "status": "insufficient_permissions",
-                        "connector_id": connector.id,
-                        "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
-                    }
-                if isinstance(api_err, HttpError) and api_err.resp.status == 404:
+
+                if not final_draft_id:
                     return {
                         "status": "error",
-                        "message": "Draft no longer exists in Gmail. It may have been sent or deleted.",
+                        "message": (
+                            "Could not find this draft in Gmail. "
+                            "It may have already been sent or deleted."
+                        ),
                     }
-                raise
 
-            logger.info(f"Gmail draft updated: id={updated.get('id')}")
+                message = MIMEText(final_body)
+                if final_to:
+                    message["to"] = final_to
+                message["subject"] = final_subject
+                if final_cc:
+                    message["cc"] = final_cc
+                if final_bcc:
+                    message["bcc"] = final_bcc
+                raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
 
-            kb_message_suffix = ""
-            if document_id:
                 try:
-                    from sqlalchemy.future import select as sa_select
-                    from sqlalchemy.orm.attributes import flag_modified
+                    if is_composio_gmail:
+                        from app.agents.new_chat.tools.gmail.composio_helpers import (
+                            execute_composio_gmail_tool,
+                            split_recipients,
+                        )
 
-                    from app.db import Document
+                        updated, error = await execute_composio_gmail_tool(
+                            connector,
+                            user_id,
+                            "GMAIL_UPDATE_DRAFT",
+                            {
+                                "user_id": "me",
+                                "draft_id": final_draft_id,
+                                "recipient_email": final_to,
+                                "subject": final_subject,
+                                "body": final_body,
+                                "cc": split_recipients(final_cc),
+                                "bcc": split_recipients(final_bcc),
+                                "is_html": False,
+                            },
+                        )
+                        if error:
+                            raise RuntimeError(error)
+                        if not isinstance(updated, dict):
+                            updated = {}
+                    else:
+                        from googleapiclient.discovery import build
+
+                        gmail_service = build("gmail", "v1", credentials=creds)
+                        updated = await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: (
+                                gmail_service.users()
+                                .drafts()
+                                .update(
+                                    userId="me",
+                                    id=final_draft_id,
+                                    body={"message": {"raw": raw}},
+                                )
+                                .execute()
+                            ),
+                        )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
 
-                    doc_result = await db_session.execute(
-                        sa_select(Document).filter(Document.id == document_id)
-                    )
-                    document = doc_result.scalars().first()
-                    if document:
-                        document.source_markdown = final_body
-                        document.title = final_subject
-                        meta = dict(document.document_metadata or {})
-                        meta["subject"] = final_subject
-                        meta["draft_id"] = updated.get("id", final_draft_id)
-                        updated_msg = updated.get("message", {})
-                        if updated_msg.get("id"):
-                            meta["message_id"] = updated_msg["id"]
-                        document.document_metadata = meta
-                        flag_modified(document, "document_metadata")
-                        await db_session.commit()
-                        kb_message_suffix = (
-                            " Your knowledge base has also been updated."
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {connector.id}: {api_err}"
                         )
-                        logger.info(
-                            f"KB document {document_id} updated for draft {final_draft_id}"
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            if not connector.config.get("auth_expired"):
+                                connector.config = {
+                                    **connector.config,
+                                    "auth_expired": True,
+                                }
+                                flag_modified(connector, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                connector.id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": connector.id,
+                            "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 404:
+                        return {
+                            "status": "error",
+                            "message": "Draft no longer exists in Gmail. It may have been sent or deleted.",
+                        }
+                    raise
+
+                logger.info(f"Gmail draft updated: id={updated.get('id')}")
+
+                kb_message_suffix = ""
+                if document_id:
+                    try:
+                        from sqlalchemy.future import select as sa_select
+                        from sqlalchemy.orm.attributes import flag_modified
+
+                        from app.db import Document
+
+                        doc_result = await db_session.execute(
+                            sa_select(Document).filter(Document.id == document_id)
                         )
-                    else:
+                        document = doc_result.scalars().first()
+                        if document:
+                            document.source_markdown = final_body
+                            document.title = final_subject
+                            meta = dict(document.document_metadata or {})
+                            meta["subject"] = final_subject
+                            meta["draft_id"] = updated.get("id", final_draft_id)
+                            updated_msg = updated.get("message", {})
+                            if updated_msg.get("id"):
+                                meta["message_id"] = updated_msg["id"]
+                            document.document_metadata = meta
+                            flag_modified(document, "document_metadata")
+                            await db_session.commit()
+                            kb_message_suffix = (
+                                " Your knowledge base has also been updated."
+                            )
+                            logger.info(
+                                f"KB document {document_id} updated for draft {final_draft_id}"
+                            )
+                        else:
+                            kb_message_suffix = " This draft will be fully updated in your knowledge base in the next scheduled sync."
+                    except Exception as kb_err:
+                        logger.warning(f"KB update after draft edit failed: {kb_err}")
+                        await db_session.rollback()
                         kb_message_suffix = " This draft will be fully updated in your knowledge base in the next scheduled sync."
-                except Exception as kb_err:
-                    logger.warning(f"KB update after draft edit failed: {kb_err}")
-                    await db_session.rollback()
-                    kb_message_suffix = " This draft will be fully updated in your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "draft_id": updated.get("id"),
-                "message": f"Successfully updated Gmail draft with subject '{final_subject}'.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "draft_id": updated.get("id"),
+                    "message": f"Successfully updated Gmail draft with subject '{final_subject}'.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
@@ -436,3 +461,35 @@ async def _find_draft_id_by_message(gmail_service: Any, message_id: str) -> str
     except Exception as e:
         logger.warning(f"Failed to look up draft by message_id: {e}")
         return None
+
+
+async def _find_composio_draft_id_by_message(
+    connector: Any, user_id: str, message_id: str
+) -> str | None:
+    from app.agents.new_chat.tools.gmail.composio_helpers import (
+        execute_composio_gmail_tool,
+    )
+
+    page_token = ""
+    while True:
+        params: dict[str, Any] = {
+            "user_id": "me",
+            "max_results": 100,
+            "verbose": False,
+        }
+        if page_token:
+            params["page_token"] = page_token
+
+        data, error = await execute_composio_gmail_tool(
+            connector, user_id, "GMAIL_LIST_DRAFTS", params
+        )
+        if error or not isinstance(data, dict):
+            return None
+
+        for draft in data.get("drafts", []):
+            if draft.get("message", {}).get("id") == message_id:
+                return draft.get("id")
+
+        page_token = data.get("nextPageToken") or data.get("next_page_token") or ""
+        if not page_token:
+            return None
diff --git a/surfsense_backend/app/agents/new_chat/tools/google_calendar/__init__.py b/surfsense_backend/app/agents/new_chat/tools/google_calendar/__init__.py
index d1ce4e795b..13d4c06cbd 100644
--- a/surfsense_backend/app/agents/new_chat/tools/google_calendar/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/tools/google_calendar/__init__.py
@@ -4,6 +4,9 @@
 from app.agents.new_chat.tools.google_calendar.delete_event import (
     create_delete_calendar_event_tool,
 )
+from app.agents.new_chat.tools.google_calendar.search_events import (
+    create_search_calendar_events_tool,
+)
 from app.agents.new_chat.tools.google_calendar.update_event import (
     create_update_calendar_event_tool,
 )
@@ -11,5 +14,6 @@
 __all__ = [
     "create_create_calendar_event_tool",
     "create_delete_calendar_event_tool",
+    "create_search_calendar_events_tool",
     "create_update_calendar_event_tool",
 ]
diff --git a/surfsense_backend/app/agents/new_chat/tools/google_calendar/create_event.py b/surfsense_backend/app/agents/new_chat/tools/google_calendar/create_event.py
index 592ced5ecc..dec92cc8bd 100644
--- a/surfsense_backend/app/agents/new_chat/tools/google_calendar/create_event.py
+++ b/surfsense_backend/app/agents/new_chat/tools/google_calendar/create_event.py
@@ -6,9 +6,10 @@
 from google.oauth2.credentials import Credentials
 from googleapiclient.discovery import build
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
 from app.services.google_calendar import GoogleCalendarToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,23 @@ def create_create_calendar_event_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the create_calendar_event tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured create_calendar_event tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def create_calendar_event(
         summary: str,
@@ -60,282 +78,294 @@ async def create_calendar_event(
             f"create_calendar_event called: summary='{summary}', start='{start_datetime}', end='{end_datetime}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Google Calendar tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GoogleCalendarToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
-
-            if "error" in context:
-                logger.error(f"Failed to fetch creation context: {context['error']}")
-                return {"status": "error", "message": context["error"]}
-
-            accounts = context.get("accounts", [])
-            if accounts and all(a.get("auth_expired") for a in accounts):
-                logger.warning(
-                    "All Google Calendar accounts have expired authentication"
+            async with async_session_maker() as db_session:
+                metadata_service = GoogleCalendarToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
                 )
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Google Calendar accounts need re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "google_calendar",
-                }
 
-            logger.info(
-                f"Requesting approval for creating calendar event: summary='{summary}'"
-            )
-            approval = interrupt(
-                {
-                    "type": "google_calendar_event_creation",
-                    "action": {
-                        "tool": "create_calendar_event",
-                        "params": {
-                            "summary": summary,
-                            "start_datetime": start_datetime,
-                            "end_datetime": end_datetime,
-                            "description": description,
-                            "location": location,
-                            "attendees": attendees,
-                            "timezone": context.get("timezone"),
-                            "connector_id": None,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The event was not created. Do not ask again or suggest alternatives.",
-                }
+                if "error" in context:
+                    logger.error(
+                        f"Failed to fetch creation context: {context['error']}"
+                    )
+                    return {"status": "error", "message": context["error"]}
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_summary = final_params.get("summary", summary)
-            final_start_datetime = final_params.get("start_datetime", start_datetime)
-            final_end_datetime = final_params.get("end_datetime", end_datetime)
-            final_description = final_params.get("description", description)
-            final_location = final_params.get("location", location)
-            final_attendees = final_params.get("attendees", attendees)
-            final_connector_id = final_params.get("connector_id")
-
-            if not final_summary or not final_summary.strip():
-                return {"status": "error", "message": "Event summary cannot be empty."}
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            _calendar_types = [
-                SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
-            ]
-
-            if final_connector_id is not None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_calendar_types),
+                accounts = context.get("accounts", [])
+                if accounts and all(a.get("auth_expired") for a in accounts):
+                    logger.warning(
+                        "All Google Calendar accounts have expired authentication"
                     )
-                )
-                connector = result.scalars().first()
-                if not connector:
                     return {
-                        "status": "error",
-                        "message": "Selected Google Calendar connector is invalid or has been disconnected.",
+                        "status": "auth_error",
+                        "message": "All connected Google Calendar accounts need re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "google_calendar",
                     }
-                actual_connector_id = connector.id
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_calendar_types),
-                    )
+
+                logger.info(
+                    f"Requesting approval for creating calendar event: summary='{summary}'"
                 )
-                connector = result.scalars().first()
-                if not connector:
+                result = request_approval(
+                    action_type="google_calendar_event_creation",
+                    tool_name="create_calendar_event",
+                    params={
+                        "summary": summary,
+                        "start_datetime": start_datetime,
+                        "end_datetime": end_datetime,
+                        "description": description,
+                        "location": location,
+                        "attendees": attendees,
+                        "timezone": context.get("timezone"),
+                        "connector_id": None,
+                    },
+                    context=context,
+                )
+
+                if result.rejected:
                     return {
-                        "status": "error",
-                        "message": "No Google Calendar connector found. Please connect Google Calendar in your workspace settings.",
+                        "status": "rejected",
+                        "message": "User declined. The event was not created. Do not ask again or suggest alternatives.",
                     }
-                actual_connector_id = connector.id
 
-            logger.info(
-                f"Creating calendar event: summary='{final_summary}', connector={actual_connector_id}"
-            )
-
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
+                final_summary = result.params.get("summary", summary)
+                final_start_datetime = result.params.get(
+                    "start_datetime", start_datetime
+                )
+                final_end_datetime = result.params.get("end_datetime", end_datetime)
+                final_description = result.params.get("description", description)
+                final_location = result.params.get("location", location)
+                final_attendees = result.params.get("attendees", attendees)
+                final_connector_id = result.params.get("connector_id")
 
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    creds = build_composio_credentials(cca_id)
-                else:
+                if not final_summary or not final_summary.strip():
                     return {
                         "status": "error",
-                        "message": "Composio connected account ID not found for this connector.",
+                        "message": "Event summary cannot be empty.",
                     }
-            else:
-                config_data = dict(connector.config)
-
-                from app.config import config as app_config
-                from app.utils.oauth_security import TokenEncryption
-
-                token_encrypted = config_data.get("_token_encrypted", False)
-                if token_encrypted and app_config.SECRET_KEY:
-                    token_encryption = TokenEncryption(app_config.SECRET_KEY)
-                    for key in ("token", "refresh_token", "client_secret"):
-                        if config_data.get(key):
-                            config_data[key] = token_encryption.decrypt_token(
-                                config_data[key]
-                            )
 
-                exp = config_data.get("expiry", "")
-                if exp:
-                    exp = exp.replace("Z", "")
-
-                creds = Credentials(
-                    token=config_data.get("token"),
-                    refresh_token=config_data.get("refresh_token"),
-                    token_uri=config_data.get("token_uri"),
-                    client_id=config_data.get("client_id"),
-                    client_secret=config_data.get("client_secret"),
-                    scopes=config_data.get("scopes", []),
-                    expiry=datetime.fromisoformat(exp) if exp else None,
-                )
+                from sqlalchemy.future import select
 
-            service = await asyncio.get_event_loop().run_in_executor(
-                None, lambda: build("calendar", "v3", credentials=creds)
-            )
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            tz = context.get("timezone", "UTC")
-            event_body: dict[str, Any] = {
-                "summary": final_summary,
-                "start": {"dateTime": final_start_datetime, "timeZone": tz},
-                "end": {"dateTime": final_end_datetime, "timeZone": tz},
-            }
-            if final_description:
-                event_body["description"] = final_description
-            if final_location:
-                event_body["location"] = final_location
-            if final_attendees:
-                event_body["attendees"] = [
-                    {"email": e.strip()} for e in final_attendees if e.strip()
+                _calendar_types = [
+                    SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
                 ]
 
-            try:
-                created = await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    lambda: (
-                        service.events()
-                        .insert(calendarId="primary", body=event_body)
-                        .execute()
-                    ),
+                if final_connector_id is not None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_calendar_types),
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Google Calendar connector is invalid or has been disconnected.",
+                        }
+                    actual_connector_id = connector.id
+                else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_calendar_types),
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "No Google Calendar connector found. Please connect Google Calendar in your workspace settings.",
+                        }
+                    actual_connector_id = connector.id
+
+                logger.info(
+                    f"Creating calendar event: summary='{final_summary}', connector={actual_connector_id}"
                 )
-            except Exception as api_err:
-                from googleapiclient.errors import HttpError
 
-                if isinstance(api_err, HttpError) and api_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                is_composio_calendar = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+                )
+                if is_composio_calendar:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this connector.",
+                        }
+                else:
+                    config_data = dict(connector.config)
+
+                    from app.config import config as app_config
+                    from app.utils.oauth_security import TokenEncryption
+
+                    token_encrypted = config_data.get("_token_encrypted", False)
+                    if token_encrypted and app_config.SECRET_KEY:
+                        token_encryption = TokenEncryption(app_config.SECRET_KEY)
+                        for key in ("token", "refresh_token", "client_secret"):
+                            if config_data.get(key):
+                                config_data[key] = token_encryption.decrypt_token(
+                                    config_data[key]
+                                )
+
+                    exp = config_data.get("expiry", "")
+                    if exp:
+                        exp = exp.replace("Z", "")
+
+                    creds = Credentials(
+                        token=config_data.get("token"),
+                        refresh_token=config_data.get("refresh_token"),
+                        token_uri=config_data.get("token_uri"),
+                        client_id=config_data.get("client_id"),
+                        client_secret=config_data.get("client_secret"),
+                        scopes=config_data.get("scopes", []),
+                        expiry=datetime.fromisoformat(exp) if exp else None,
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
 
-                        _res = await db_session.execute(
-                            select(SearchSourceConnector).where(
-                                SearchSourceConnector.id == actual_connector_id
+                tz = context.get("timezone", "UTC")
+                event_body: dict[str, Any] = {
+                    "summary": final_summary,
+                    "start": {"dateTime": final_start_datetime, "timeZone": tz},
+                    "end": {"dateTime": final_end_datetime, "timeZone": tz},
+                }
+                if final_description:
+                    event_body["description"] = final_description
+                if final_location:
+                    event_body["location"] = final_location
+                if final_attendees:
+                    event_body["attendees"] = [
+                        {"email": e.strip()} for e in final_attendees if e.strip()
+                    ]
+
+                try:
+                    if is_composio_calendar:
+                        from app.services.composio_service import ComposioService
+
+                        composio_params = {
+                            "calendar_id": "primary",
+                            "summary": final_summary,
+                            "start_datetime": final_start_datetime,
+                            "end_datetime": final_end_datetime,
+                            "timezone": tz,
+                            "attendees": final_attendees or [],
+                        }
+                        if final_description:
+                            composio_params["description"] = final_description
+                        if final_location:
+                            composio_params["location"] = final_location
+
+                        composio_result = await ComposioService().execute_tool(
+                            connected_account_id=cca_id,
+                            tool_name="GOOGLECALENDAR_CREATE_EVENT",
+                            params=composio_params,
+                            entity_id=f"surfsense_{user_id}",
+                        )
+                        if not composio_result.get("success"):
+                            raise RuntimeError(
+                                composio_result.get(
+                                    "error", "Unknown Composio Calendar error"
+                                )
                             )
+                        created = composio_result.get("data", {})
+                        if isinstance(created, dict):
+                            created = created.get("data", created)
+                            if isinstance(created, dict):
+                                created = created.get("response_data", created)
+                    else:
+                        service = await asyncio.get_event_loop().run_in_executor(
+                            None, lambda: build("calendar", "v3", credentials=creds)
                         )
-                        _conn = _res.scalar_one_or_none()
-                        if _conn and not _conn.config.get("auth_expired"):
-                            _conn.config = {**_conn.config, "auth_expired": True}
-                            flag_modified(_conn, "config")
-                            await db_session.commit()
-                    except Exception:
+                        created = await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: (
+                                service.events()
+                                .insert(calendarId="primary", body=event_body)
+                                .execute()
+                            ),
+                        )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
                         logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            actual_connector_id,
-                            exc_info=True,
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
                         )
-                    return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
-                    }
-                raise
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
 
-            logger.info(
-                f"Calendar event created: id={created.get('id')}, summary={created.get('summary')}"
-            )
-
-            kb_message_suffix = ""
-            try:
-                from app.services.google_calendar import GoogleCalendarKBSyncService
-
-                kb_service = GoogleCalendarKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    event_id=created.get("id"),
-                    event_summary=final_summary,
-                    calendar_id="primary",
-                    start_time=final_start_datetime,
-                    end_time=final_end_datetime,
-                    location=final_location,
-                    html_link=created.get("htmlLink"),
-                    description=final_description,
-                    connector_id=actual_connector_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Calendar event created: id={created.get('id')}, summary={created.get('summary')}"
                 )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
-                else:
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.google_calendar import GoogleCalendarKBSyncService
+
+                    kb_service = GoogleCalendarKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        event_id=created.get("id"),
+                        event_summary=final_summary,
+                        calendar_id="primary",
+                        start_time=final_start_datetime,
+                        end_time=final_end_datetime,
+                        location=final_location,
+                        html_link=created.get("htmlLink"),
+                        description=final_description,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This event will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This event will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This event will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "event_id": created.get("id"),
-                "html_link": created.get("htmlLink"),
-                "message": f"Successfully created '{final_summary}' on Google Calendar.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "event_id": created.get("id"),
+                    "html_link": created.get("htmlLink"),
+                    "message": f"Successfully created '{final_summary}' on Google Calendar.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/google_calendar/delete_event.py b/surfsense_backend/app/agents/new_chat/tools/google_calendar/delete_event.py
index 8b088487c9..e7e891b084 100644
--- a/surfsense_backend/app/agents/new_chat/tools/google_calendar/delete_event.py
+++ b/surfsense_backend/app/agents/new_chat/tools/google_calendar/delete_event.py
@@ -6,9 +6,10 @@
 from google.oauth2.credentials import Credentials
 from googleapiclient.discovery import build
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
 from app.services.google_calendar import GoogleCalendarToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,23 @@ def create_delete_calendar_event_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the delete_calendar_event tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured delete_calendar_event tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def delete_calendar_event(
         event_title_or_id: str,
@@ -54,268 +72,258 @@ async def delete_calendar_event(
             f"delete_calendar_event called: event_ref='{event_title_or_id}', delete_from_kb={delete_from_kb}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Google Calendar tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GoogleCalendarToolMetadataService(db_session)
-            context = await metadata_service.get_deletion_context(
-                search_space_id, user_id, event_title_or_id
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Event not found: {error_msg}")
-                    return {"status": "not_found", "message": error_msg}
-                logger.error(f"Failed to fetch deletion context: {error_msg}")
-                return {"status": "error", "message": error_msg}
-
-            account = context.get("account", {})
-            if account.get("auth_expired"):
-                logger.warning(
-                    "Google Calendar account %s has expired authentication",
-                    account.get("id"),
+            async with async_session_maker() as db_session:
+                metadata_service = GoogleCalendarToolMetadataService(db_session)
+                context = await metadata_service.get_deletion_context(
+                    search_space_id, user_id, event_title_or_id
                 )
-                return {
-                    "status": "auth_error",
-                    "message": "The Google Calendar account for this event needs re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "google_calendar",
-                }
-
-            event = context["event"]
-            event_id = event["event_id"]
-            document_id = event.get("document_id")
-            connector_id_from_context = context["account"]["id"]
 
-            if not event_id:
-                return {
-                    "status": "error",
-                    "message": "Event ID is missing from the indexed document. Please re-index the event and try again.",
-                }
+                if "error" in context:
+                    error_msg = context["error"]
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Event not found: {error_msg}")
+                        return {"status": "not_found", "message": error_msg}
+                    logger.error(f"Failed to fetch deletion context: {error_msg}")
+                    return {"status": "error", "message": error_msg}
 
-            logger.info(
-                f"Requesting approval for deleting calendar event: '{event_title_or_id}' (event_id={event_id}, delete_from_kb={delete_from_kb})"
-            )
-            approval = interrupt(
-                {
-                    "type": "google_calendar_event_deletion",
-                    "action": {
-                        "tool": "delete_calendar_event",
-                        "params": {
-                            "event_id": event_id,
-                            "connector_id": connector_id_from_context,
-                            "delete_from_kb": delete_from_kb,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The event was not deleted. Do not ask again or suggest alternatives.",
-                }
+                account = context.get("account", {})
+                if account.get("auth_expired"):
+                    logger.warning(
+                        "Google Calendar account %s has expired authentication",
+                        account.get("id"),
+                    )
+                    return {
+                        "status": "auth_error",
+                        "message": "The Google Calendar account for this event needs re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "google_calendar",
+                    }
 
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_event_id = final_params.get("event_id", event_id)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
-
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this event.",
-                }
+                event = context["event"]
+                event_id = event["event_id"]
+                document_id = event.get("document_id")
+                connector_id_from_context = context["account"]["id"]
 
-            from sqlalchemy.future import select
+                if not event_id:
+                    return {
+                        "status": "error",
+                        "message": "Event ID is missing from the indexed document. Please re-index the event and try again.",
+                    }
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                logger.info(
+                    f"Requesting approval for deleting calendar event: '{event_title_or_id}' (event_id={event_id}, delete_from_kb={delete_from_kb})"
+                )
+                result = request_approval(
+                    action_type="google_calendar_event_deletion",
+                    tool_name="delete_calendar_event",
+                    params={
+                        "event_id": event_id,
+                        "connector_id": connector_id_from_context,
+                        "delete_from_kb": delete_from_kb,
+                    },
+                    context=context,
+                )
 
-            _calendar_types = [
-                SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
-            ]
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. The event was not deleted. Do not ask again or suggest alternatives.",
+                    }
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type.in_(_calendar_types),
+                final_event_id = result.params.get("event_id", event_id)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Google Calendar connector is invalid or has been disconnected.",
-                }
 
-            actual_connector_id = connector.id
+                if not final_connector_id:
+                    return {
+                        "status": "error",
+                        "message": "No connector found for this event.",
+                    }
 
-            logger.info(
-                f"Deleting calendar event: event_id='{final_event_id}', connector={actual_connector_id}"
-            )
+                from sqlalchemy.future import select
 
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    creds = build_composio_credentials(cca_id)
-                else:
+                _calendar_types = [
+                    SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+                ]
+
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_calendar_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
                     return {
                         "status": "error",
-                        "message": "Composio connected account ID not found for this connector.",
+                        "message": "Selected Google Calendar connector is invalid or has been disconnected.",
                     }
-            else:
-                config_data = dict(connector.config)
-
-                from app.config import config as app_config
-                from app.utils.oauth_security import TokenEncryption
-
-                token_encrypted = config_data.get("_token_encrypted", False)
-                if token_encrypted and app_config.SECRET_KEY:
-                    token_encryption = TokenEncryption(app_config.SECRET_KEY)
-                    for key in ("token", "refresh_token", "client_secret"):
-                        if config_data.get(key):
-                            config_data[key] = token_encryption.decrypt_token(
-                                config_data[key]
-                            )
 
-                exp = config_data.get("expiry", "")
-                if exp:
-                    exp = exp.replace("Z", "")
-
-                creds = Credentials(
-                    token=config_data.get("token"),
-                    refresh_token=config_data.get("refresh_token"),
-                    token_uri=config_data.get("token_uri"),
-                    client_id=config_data.get("client_id"),
-                    client_secret=config_data.get("client_secret"),
-                    scopes=config_data.get("scopes", []),
-                    expiry=datetime.fromisoformat(exp) if exp else None,
-                )
+                actual_connector_id = connector.id
 
-            service = await asyncio.get_event_loop().run_in_executor(
-                None, lambda: build("calendar", "v3", credentials=creds)
-            )
-
-            try:
-                await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    lambda: (
-                        service.events()
-                        .delete(calendarId="primary", eventId=final_event_id)
-                        .execute()
-                    ),
+                logger.info(
+                    f"Deleting calendar event: event_id='{final_event_id}', connector={actual_connector_id}"
                 )
-            except Exception as api_err:
-                from googleapiclient.errors import HttpError
 
-                if isinstance(api_err, HttpError) and api_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                is_composio_calendar = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+                )
+                if is_composio_calendar:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this connector.",
+                        }
+                else:
+                    config_data = dict(connector.config)
+
+                    from app.config import config as app_config
+                    from app.utils.oauth_security import TokenEncryption
+
+                    token_encrypted = config_data.get("_token_encrypted", False)
+                    if token_encrypted and app_config.SECRET_KEY:
+                        token_encryption = TokenEncryption(app_config.SECRET_KEY)
+                        for key in ("token", "refresh_token", "client_secret"):
+                            if config_data.get(key):
+                                config_data[key] = token_encryption.decrypt_token(
+                                    config_data[key]
+                                )
+
+                    exp = config_data.get("expiry", "")
+                    if exp:
+                        exp = exp.replace("Z", "")
+
+                    creds = Credentials(
+                        token=config_data.get("token"),
+                        refresh_token=config_data.get("refresh_token"),
+                        token_uri=config_data.get("token_uri"),
+                        client_id=config_data.get("client_id"),
+                        client_secret=config_data.get("client_secret"),
+                        scopes=config_data.get("scopes", []),
+                        expiry=datetime.fromisoformat(exp) if exp else None,
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
 
-                        _res = await db_session.execute(
-                            select(SearchSourceConnector).where(
-                                SearchSourceConnector.id == actual_connector_id
+                try:
+                    if is_composio_calendar:
+                        from app.services.composio_service import ComposioService
+
+                        composio_result = await ComposioService().execute_tool(
+                            connected_account_id=cca_id,
+                            tool_name="GOOGLECALENDAR_DELETE_EVENT",
+                            params={
+                                "calendar_id": "primary",
+                                "event_id": final_event_id,
+                            },
+                            entity_id=f"surfsense_{user_id}",
+                        )
+                        if not composio_result.get("success"):
+                            raise RuntimeError(
+                                composio_result.get(
+                                    "error", "Unknown Composio Calendar error"
+                                )
                             )
+                    else:
+                        service = await asyncio.get_event_loop().run_in_executor(
+                            None, lambda: build("calendar", "v3", credentials=creds)
                         )
-                        _conn = _res.scalar_one_or_none()
-                        if _conn and not _conn.config.get("auth_expired"):
-                            _conn.config = {**_conn.config, "auth_expired": True}
-                            flag_modified(_conn, "config")
-                            await db_session.commit()
-                    except Exception:
-                        logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            actual_connector_id,
-                            exc_info=True,
+                        await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: (
+                                service.events()
+                                .delete(calendarId="primary", eventId=final_event_id)
+                                .execute()
+                            ),
                         )
-                    return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
-                    }
-                raise
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
 
-            logger.info(f"Calendar event deleted: event_id={final_event_id}")
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
 
-            delete_result: dict[str, Any] = {
-                "status": "success",
-                "event_id": final_event_id,
-                "message": f"Successfully deleted the calendar event '{event.get('summary', event_title_or_id)}'.",
-            }
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(f"Calendar event deleted: event_id={final_event_id}")
+
+                delete_result: dict[str, Any] = {
+                    "status": "success",
+                    "event_id": final_event_id,
+                    "message": f"Successfully deleted the calendar event '{event.get('summary', event_title_or_id)}'.",
+                }
 
-            deleted_from_kb = False
-            if final_delete_from_kb and document_id:
-                try:
-                    from app.db import Document
+                deleted_from_kb = False
+                if final_delete_from_kb and document_id:
+                    try:
+                        from app.db import Document
 
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
-                    )
-                    document = doc_result.scalars().first()
-                    if document:
-                        await db_session.delete(document)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                        logger.info(
-                            f"Deleted document {document_id} from knowledge base"
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        document = doc_result.scalars().first()
+                        if document:
+                            await db_session.delete(document)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                            logger.info(
+                                f"Deleted document {document_id} from knowledge base"
+                            )
+                        else:
+                            logger.warning(f"Document {document_id} not found in KB")
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+                        delete_result["warning"] = (
+                            f"Event deleted, but failed to remove from knowledge base: {e!s}"
                         )
-                    else:
-                        logger.warning(f"Document {document_id} not found in KB")
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-                    delete_result["warning"] = (
-                        f"Event deleted, but failed to remove from knowledge base: {e!s}"
-                    )
 
-            delete_result["deleted_from_kb"] = deleted_from_kb
-            if deleted_from_kb:
-                delete_result["message"] = (
-                    f"{delete_result.get('message', '')} (also removed from knowledge base)"
-                )
+                delete_result["deleted_from_kb"] = deleted_from_kb
+                if deleted_from_kb:
+                    delete_result["message"] = (
+                        f"{delete_result.get('message', '')} (also removed from knowledge base)"
+                    )
 
-            return delete_result
+                return delete_result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/google_calendar/search_events.py b/surfsense_backend/app/agents/new_chat/tools/google_calendar/search_events.py
new file mode 100644
index 0000000000..e5f18f6755
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/google_calendar/search_events.py
@@ -0,0 +1,187 @@
+import logging
+from typing import Any
+
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.agents.new_chat.tools.gmail.search_emails import _build_credentials
+from app.db import SearchSourceConnector, SearchSourceConnectorType, async_session_maker
+
+logger = logging.getLogger(__name__)
+
+_CALENDAR_TYPES = [
+    SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+    SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+]
+
+
+def _to_calendar_boundary(value: str, *, is_end: bool) -> str:
+    if "T" in value:
+        return value
+    time = "23:59:59" if is_end else "00:00:00"
+    return f"{value}T{time}Z"
+
+
+def _format_calendar_events(events_raw: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    events = []
+    for ev in events_raw:
+        start = ev.get("start", {})
+        end = ev.get("end", {})
+        attendees_raw = ev.get("attendees", [])
+        events.append(
+            {
+                "event_id": ev.get("id"),
+                "summary": ev.get("summary", "No Title"),
+                "start": start.get("dateTime") or start.get("date", ""),
+                "end": end.get("dateTime") or end.get("date", ""),
+                "location": ev.get("location", ""),
+                "description": ev.get("description", ""),
+                "html_link": ev.get("htmlLink", ""),
+                "attendees": [a.get("email", "") for a in attendees_raw[:10]],
+                "status": ev.get("status", ""),
+            }
+        )
+    return events
+
+
+def create_search_calendar_events_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the search_calendar_events tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured search_calendar_events tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def search_calendar_events(
+        start_date: str,
+        end_date: str,
+        max_results: int = 25,
+    ) -> dict[str, Any]:
+        """Search Google Calendar events within a date range.
+
+        Args:
+            start_date: Start date in YYYY-MM-DD format (e.g. "2026-04-01").
+            end_date: End date in YYYY-MM-DD format (e.g. "2026-04-30").
+            max_results: Maximum number of events to return (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of events including
+            event_id, summary, start, end, location, attendees.
+        """
+        if search_space_id is None or user_id is None:
+            return {
+                "status": "error",
+                "message": "Calendar tool not properly configured.",
+            }
+
+        max_results = min(max_results, 50)
+
+        try:
+            async with async_session_maker() as db_session:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_CALENDAR_TYPES),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "No Google Calendar connector found. Please connect Google Calendar in your workspace settings.",
+                    }
+
+                if (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+                ):
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this connector.",
+                        }
+
+                    from app.services.composio_service import ComposioService
+
+                    events_raw, error = await ComposioService().get_calendar_events(
+                        connected_account_id=cca_id,
+                        entity_id=f"surfsense_{user_id}",
+                        time_min=_to_calendar_boundary(start_date, is_end=False),
+                        time_max=_to_calendar_boundary(end_date, is_end=True),
+                        max_results=max_results,
+                    )
+                    if not events_raw and not error:
+                        error = "No events found in the specified date range."
+                else:
+                    creds = _build_credentials(connector)
+
+                    from app.connectors.google_calendar_connector import (
+                        GoogleCalendarConnector,
+                    )
+
+                    cal = GoogleCalendarConnector(
+                        credentials=creds,
+                        session=db_session,
+                        user_id=user_id,
+                        connector_id=connector.id,
+                    )
+
+                    events_raw, error = await cal.get_all_primary_calendar_events(
+                        start_date=start_date,
+                        end_date=end_date,
+                        max_results=max_results,
+                    )
+
+                if error:
+                    if (
+                        "re-authenticate" in error.lower()
+                        or "authentication failed" in error.lower()
+                    ):
+                        return {
+                            "status": "auth_error",
+                            "message": error,
+                            "connector_type": "google_calendar",
+                        }
+                    if "no events found" in error.lower():
+                        return {
+                            "status": "success",
+                            "events": [],
+                            "total": 0,
+                            "message": error,
+                        }
+                    return {"status": "error", "message": error}
+
+                events = _format_calendar_events(events_raw)
+
+                return {"status": "success", "events": events, "total": len(events)}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error searching calendar events: %s", e, exc_info=True)
+            return {
+                "status": "error",
+                "message": "Failed to search calendar events. Please try again.",
+            }
+
+    return search_calendar_events
diff --git a/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py b/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py
index ed826f1b8e..b8561fee60 100644
--- a/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py
+++ b/surfsense_backend/app/agents/new_chat/tools/google_calendar/update_event.py
@@ -6,9 +6,10 @@
 from google.oauth2.credentials import Credentials
 from googleapiclient.discovery import build
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
 from app.services.google_calendar import GoogleCalendarToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -33,6 +34,23 @@ def create_update_calendar_event_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the update_calendar_event tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured update_calendar_event tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def update_calendar_event(
         event_title_or_id: str,
@@ -74,298 +92,317 @@ async def update_calendar_event(
         """
         logger.info(f"update_calendar_event called: event_ref='{event_title_or_id}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Google Calendar tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GoogleCalendarToolMetadataService(db_session)
-            context = await metadata_service.get_update_context(
-                search_space_id, user_id, event_title_or_id
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Event not found: {error_msg}")
-                    return {"status": "not_found", "message": error_msg}
-                logger.error(f"Failed to fetch update context: {error_msg}")
-                return {"status": "error", "message": error_msg}
-
-            if context.get("auth_expired"):
-                logger.warning("Google Calendar account has expired authentication")
-                return {
-                    "status": "auth_error",
-                    "message": "The Google Calendar account for this event needs re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "google_calendar",
-                }
-
-            event = context["event"]
-            event_id = event["event_id"]
-            document_id = event.get("document_id")
-            connector_id_from_context = context["account"]["id"]
+            async with async_session_maker() as db_session:
+                metadata_service = GoogleCalendarToolMetadataService(db_session)
+                context = await metadata_service.get_update_context(
+                    search_space_id, user_id, event_title_or_id
+                )
 
-            if not event_id:
-                return {
-                    "status": "error",
-                    "message": "Event ID is missing from the indexed document. Please re-index the event and try again.",
-                }
+                if "error" in context:
+                    error_msg = context["error"]
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Event not found: {error_msg}")
+                        return {"status": "not_found", "message": error_msg}
+                    logger.error(f"Failed to fetch update context: {error_msg}")
+                    return {"status": "error", "message": error_msg}
 
-            logger.info(
-                f"Requesting approval for updating calendar event: '{event_title_or_id}' (event_id={event_id})"
-            )
-            approval = interrupt(
-                {
-                    "type": "google_calendar_event_update",
-                    "action": {
-                        "tool": "update_calendar_event",
-                        "params": {
-                            "event_id": event_id,
-                            "document_id": document_id,
-                            "connector_id": connector_id_from_context,
-                            "new_summary": new_summary,
-                            "new_start_datetime": new_start_datetime,
-                            "new_end_datetime": new_end_datetime,
-                            "new_description": new_description,
-                            "new_location": new_location,
-                            "new_attendees": new_attendees,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The event was not updated. Do not ask again or suggest alternatives.",
-                }
+                if context.get("auth_expired"):
+                    logger.warning("Google Calendar account has expired authentication")
+                    return {
+                        "status": "auth_error",
+                        "message": "The Google Calendar account for this event needs re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "google_calendar",
+                    }
 
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_event_id = final_params.get("event_id", event_id)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_new_summary = final_params.get("new_summary", new_summary)
-            final_new_start_datetime = final_params.get(
-                "new_start_datetime", new_start_datetime
-            )
-            final_new_end_datetime = final_params.get(
-                "new_end_datetime", new_end_datetime
-            )
-            final_new_description = final_params.get("new_description", new_description)
-            final_new_location = final_params.get("new_location", new_location)
-            final_new_attendees = final_params.get("new_attendees", new_attendees)
-
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this event.",
-                }
+                event = context["event"]
+                event_id = event["event_id"]
+                document_id = event.get("document_id")
+                connector_id_from_context = context["account"]["id"]
 
-            from sqlalchemy.future import select
+                if not event_id:
+                    return {
+                        "status": "error",
+                        "message": "Event ID is missing from the indexed document. Please re-index the event and try again.",
+                    }
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                logger.info(
+                    f"Requesting approval for updating calendar event: '{event_title_or_id}' (event_id={event_id})"
+                )
+                result = request_approval(
+                    action_type="google_calendar_event_update",
+                    tool_name="update_calendar_event",
+                    params={
+                        "event_id": event_id,
+                        "document_id": document_id,
+                        "connector_id": connector_id_from_context,
+                        "new_summary": new_summary,
+                        "new_start_datetime": new_start_datetime,
+                        "new_end_datetime": new_end_datetime,
+                        "new_description": new_description,
+                        "new_location": new_location,
+                        "new_attendees": new_attendees,
+                    },
+                    context=context,
+                )
 
-            _calendar_types = [
-                SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
-            ]
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. The event was not updated. Do not ask again or suggest alternatives.",
+                    }
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type.in_(_calendar_types),
+                final_event_id = result.params.get("event_id", event_id)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Google Calendar connector is invalid or has been disconnected.",
-                }
+                final_new_summary = result.params.get("new_summary", new_summary)
+                final_new_start_datetime = result.params.get(
+                    "new_start_datetime", new_start_datetime
+                )
+                final_new_end_datetime = result.params.get(
+                    "new_end_datetime", new_end_datetime
+                )
+                final_new_description = result.params.get(
+                    "new_description", new_description
+                )
+                final_new_location = result.params.get("new_location", new_location)
+                final_new_attendees = result.params.get("new_attendees", new_attendees)
 
-            actual_connector_id = connector.id
+                if not final_connector_id:
+                    return {
+                        "status": "error",
+                        "message": "No connector found for this event.",
+                    }
 
-            logger.info(
-                f"Updating calendar event: event_id='{final_event_id}', connector={actual_connector_id}"
-            )
+                from sqlalchemy.future import select
 
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    creds = build_composio_credentials(cca_id)
-                else:
+                _calendar_types = [
+                    SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+                ]
+
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_calendar_types),
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
                     return {
                         "status": "error",
-                        "message": "Composio connected account ID not found for this connector.",
+                        "message": "Selected Google Calendar connector is invalid or has been disconnected.",
                     }
-            else:
-                config_data = dict(connector.config)
-
-                from app.config import config as app_config
-                from app.utils.oauth_security import TokenEncryption
-
-                token_encrypted = config_data.get("_token_encrypted", False)
-                if token_encrypted and app_config.SECRET_KEY:
-                    token_encryption = TokenEncryption(app_config.SECRET_KEY)
-                    for key in ("token", "refresh_token", "client_secret"):
-                        if config_data.get(key):
-                            config_data[key] = token_encryption.decrypt_token(
-                                config_data[key]
-                            )
 
-                exp = config_data.get("expiry", "")
-                if exp:
-                    exp = exp.replace("Z", "")
-
-                creds = Credentials(
-                    token=config_data.get("token"),
-                    refresh_token=config_data.get("refresh_token"),
-                    token_uri=config_data.get("token_uri"),
-                    client_id=config_data.get("client_id"),
-                    client_secret=config_data.get("client_secret"),
-                    scopes=config_data.get("scopes", []),
-                    expiry=datetime.fromisoformat(exp) if exp else None,
-                )
+                actual_connector_id = connector.id
 
-            service = await asyncio.get_event_loop().run_in_executor(
-                None, lambda: build("calendar", "v3", credentials=creds)
-            )
-
-            update_body: dict[str, Any] = {}
-            if final_new_summary is not None:
-                update_body["summary"] = final_new_summary
-            if final_new_start_datetime is not None:
-                update_body["start"] = _build_time_body(
-                    final_new_start_datetime, context
+                logger.info(
+                    f"Updating calendar event: event_id='{final_event_id}', connector={actual_connector_id}"
                 )
-            if final_new_end_datetime is not None:
-                update_body["end"] = _build_time_body(final_new_end_datetime, context)
-            if final_new_description is not None:
-                update_body["description"] = final_new_description
-            if final_new_location is not None:
-                update_body["location"] = final_new_location
-            if final_new_attendees is not None:
-                update_body["attendees"] = [
-                    {"email": e.strip()} for e in final_new_attendees if e.strip()
-                ]
 
-            if not update_body:
-                return {
-                    "status": "error",
-                    "message": "No changes specified. Please provide at least one field to update.",
-                }
-
-            try:
-                updated = await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    lambda: (
-                        service.events()
-                        .patch(
-                            calendarId="primary",
-                            eventId=final_event_id,
-                            body=update_body,
-                        )
-                        .execute()
-                    ),
+                is_composio_calendar = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
                 )
-            except Exception as api_err:
-                from googleapiclient.errors import HttpError
+                if is_composio_calendar:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this connector.",
+                        }
+                else:
+                    config_data = dict(connector.config)
+
+                    from app.config import config as app_config
+                    from app.utils.oauth_security import TokenEncryption
+
+                    token_encrypted = config_data.get("_token_encrypted", False)
+                    if token_encrypted and app_config.SECRET_KEY:
+                        token_encryption = TokenEncryption(app_config.SECRET_KEY)
+                        for key in ("token", "refresh_token", "client_secret"):
+                            if config_data.get(key):
+                                config_data[key] = token_encryption.decrypt_token(
+                                    config_data[key]
+                                )
+
+                    exp = config_data.get("expiry", "")
+                    if exp:
+                        exp = exp.replace("Z", "")
+
+                    creds = Credentials(
+                        token=config_data.get("token"),
+                        refresh_token=config_data.get("refresh_token"),
+                        token_uri=config_data.get("token_uri"),
+                        client_id=config_data.get("client_id"),
+                        client_secret=config_data.get("client_secret"),
+                        scopes=config_data.get("scopes", []),
+                        expiry=datetime.fromisoformat(exp) if exp else None,
+                    )
 
-                if isinstance(api_err, HttpError) and api_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
+                update_body: dict[str, Any] = {}
+                if final_new_summary is not None:
+                    update_body["summary"] = final_new_summary
+                if final_new_start_datetime is not None:
+                    update_body["start"] = _build_time_body(
+                        final_new_start_datetime, context
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
+                if final_new_end_datetime is not None:
+                    update_body["end"] = _build_time_body(
+                        final_new_end_datetime, context
+                    )
+                if final_new_description is not None:
+                    update_body["description"] = final_new_description
+                if final_new_location is not None:
+                    update_body["location"] = final_new_location
+                if final_new_attendees is not None:
+                    update_body["attendees"] = [
+                        {"email": e.strip()} for e in final_new_attendees if e.strip()
+                    ]
+
+                if not update_body:
+                    return {
+                        "status": "error",
+                        "message": "No changes specified. Please provide at least one field to update.",
+                    }
 
-                        _res = await db_session.execute(
-                            select(SearchSourceConnector).where(
-                                SearchSourceConnector.id == actual_connector_id
+                try:
+                    if is_composio_calendar:
+                        from app.services.composio_service import ComposioService
+
+                        composio_params: dict[str, Any] = {
+                            "calendar_id": "primary",
+                            "event_id": final_event_id,
+                        }
+                        if final_new_summary is not None:
+                            composio_params["summary"] = final_new_summary
+                        if final_new_start_datetime is not None:
+                            composio_params["start_time"] = final_new_start_datetime
+                        if final_new_end_datetime is not None:
+                            composio_params["end_time"] = final_new_end_datetime
+                        if final_new_description is not None:
+                            composio_params["description"] = final_new_description
+                        if final_new_location is not None:
+                            composio_params["location"] = final_new_location
+                        if final_new_attendees is not None:
+                            composio_params["attendees"] = [
+                                e.strip() for e in final_new_attendees if e.strip()
+                            ]
+                        if not _is_date_only(
+                            final_new_start_datetime or final_new_end_datetime or ""
+                        ):
+                            composio_params["timezone"] = context.get("timezone", "UTC")
+
+                        composio_result = await ComposioService().execute_tool(
+                            connected_account_id=cca_id,
+                            tool_name="GOOGLECALENDAR_PATCH_EVENT",
+                            params=composio_params,
+                            entity_id=f"surfsense_{user_id}",
+                        )
+                        if not composio_result.get("success"):
+                            raise RuntimeError(
+                                composio_result.get(
+                                    "error", "Unknown Composio Calendar error"
+                                )
                             )
+                        updated = composio_result.get("data", {})
+                        if isinstance(updated, dict):
+                            updated = updated.get("data", updated)
+                            if isinstance(updated, dict):
+                                updated = updated.get("response_data", updated)
+                    else:
+                        service = await asyncio.get_event_loop().run_in_executor(
+                            None, lambda: build("calendar", "v3", credentials=creds)
                         )
-                        _conn = _res.scalar_one_or_none()
-                        if _conn and not _conn.config.get("auth_expired"):
-                            _conn.config = {**_conn.config, "auth_expired": True}
-                            flag_modified(_conn, "config")
-                            await db_session.commit()
-                    except Exception:
+                        updated = await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: (
+                                service.events()
+                                .patch(
+                                    calendarId="primary",
+                                    eventId=final_event_id,
+                                    body=update_body,
+                                )
+                                .execute()
+                            ),
+                        )
+                except Exception as api_err:
+                    from googleapiclient.errors import HttpError
+
+                    if isinstance(api_err, HttpError) and api_err.resp.status == 403:
                         logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            actual_connector_id,
-                            exc_info=True,
+                            f"Insufficient permissions for connector {actual_connector_id}: {api_err}"
                         )
-                    return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
-                    }
-                raise
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
 
-            logger.info(f"Calendar event updated: event_id={final_event_id}")
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Calendar account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
 
-            kb_message_suffix = ""
-            if document_id is not None:
-                try:
-                    from app.services.google_calendar import GoogleCalendarKBSyncService
-
-                    kb_service = GoogleCalendarKBSyncService(db_session)
-                    kb_result = await kb_service.sync_after_update(
-                        document_id=document_id,
-                        event_id=final_event_id,
-                        connector_id=actual_connector_id,
-                        search_space_id=search_space_id,
-                        user_id=user_id,
-                    )
-                    if kb_result["status"] == "success":
-                        kb_message_suffix = (
-                            " Your knowledge base has also been updated."
+                logger.info(f"Calendar event updated: event_id={final_event_id}")
+
+                kb_message_suffix = ""
+                if document_id is not None:
+                    try:
+                        from app.services.google_calendar import (
+                            GoogleCalendarKBSyncService,
                         )
-                    else:
+
+                        kb_service = GoogleCalendarKBSyncService(db_session)
+                        kb_result = await kb_service.sync_after_update(
+                            document_id=document_id,
+                            event_id=final_event_id,
+                            connector_id=actual_connector_id,
+                            search_space_id=search_space_id,
+                            user_id=user_id,
+                        )
+                        if kb_result["status"] == "success":
+                            kb_message_suffix = (
+                                " Your knowledge base has also been updated."
+                            )
+                        else:
+                            kb_message_suffix = " The knowledge base will be updated in the next scheduled sync."
+                    except Exception as kb_err:
+                        logger.warning(f"KB sync after update failed: {kb_err}")
                         kb_message_suffix = " The knowledge base will be updated in the next scheduled sync."
-                except Exception as kb_err:
-                    logger.warning(f"KB sync after update failed: {kb_err}")
-                    kb_message_suffix = " The knowledge base will be updated in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "event_id": final_event_id,
-                "html_link": updated.get("htmlLink"),
-                "message": f"Successfully updated the calendar event.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "event_id": final_event_id,
+                    "html_link": updated.get("htmlLink"),
+                    "message": f"Successfully updated the calendar event.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/google_drive/create_file.py b/surfsense_backend/app/agents/new_chat/tools/google_drive/create_file.py
index a4fee09654..66199ca670 100644
--- a/surfsense_backend/app/agents/new_chat/tools/google_drive/create_file.py
+++ b/surfsense_backend/app/agents/new_chat/tools/google_drive/create_file.py
@@ -3,11 +3,12 @@
 
 from googleapiclient.errors import HttpError
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.google_drive.client import GoogleDriveClient
 from app.connectors.google_drive.file_types import GOOGLE_DOC, GOOGLE_SHEET
+from app.db import async_session_maker
 from app.services.google_drive import GoogleDriveToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -23,6 +24,25 @@ def create_create_google_drive_file_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the create_google_drive_file tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        search_space_id: Search space ID to find the Google Drive connector
+        user_id: User ID for fetching user-specific context
+
+    Returns:
+        Configured create_google_drive_file tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def create_google_drive_file(
         name: str,
@@ -65,7 +85,7 @@ async def create_google_drive_file(
             f"create_google_drive_file called: name='{name}', type='{file_type}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Google Drive tool not properly configured. Please contact support.",
@@ -78,223 +98,232 @@ async def create_google_drive_file(
             }
 
         try:
-            metadata_service = GoogleDriveToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
-
-            if "error" in context:
-                logger.error(f"Failed to fetch creation context: {context['error']}")
-                return {"status": "error", "message": context["error"]}
-
-            accounts = context.get("accounts", [])
-            if accounts and all(a.get("auth_expired") for a in accounts):
-                logger.warning("All Google Drive accounts have expired authentication")
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Google Drive accounts need re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "google_drive",
-                }
+            async with async_session_maker() as db_session:
+                metadata_service = GoogleDriveToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
+                )
 
-            logger.info(
-                f"Requesting approval for creating Google Drive file: name='{name}', type='{file_type}'"
-            )
-            approval = interrupt(
-                {
-                    "type": "google_drive_file_creation",
-                    "action": {
-                        "tool": "create_google_drive_file",
-                        "params": {
-                            "name": name,
-                            "file_type": file_type,
-                            "content": content,
-                            "connector_id": None,
-                            "parent_folder_id": None,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The file was not created. Do not ask again or suggest alternatives.",
-                }
+                if "error" in context:
+                    logger.error(
+                        f"Failed to fetch creation context: {context['error']}"
+                    )
+                    return {"status": "error", "message": context["error"]}
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_name = final_params.get("name", name)
-            final_file_type = final_params.get("file_type", file_type)
-            final_content = final_params.get("content", content)
-            final_connector_id = final_params.get("connector_id")
-            final_parent_folder_id = final_params.get("parent_folder_id")
-
-            if not final_name or not final_name.strip():
-                return {"status": "error", "message": "File name cannot be empty."}
-
-            mime_type = _MIME_MAP.get(final_file_type)
-            if not mime_type:
-                return {
-                    "status": "error",
-                    "message": f"Unsupported file type '{final_file_type}'.",
-                }
+                accounts = context.get("accounts", [])
+                if accounts and all(a.get("auth_expired") for a in accounts):
+                    logger.warning(
+                        "All Google Drive accounts have expired authentication"
+                    )
+                    return {
+                        "status": "auth_error",
+                        "message": "All connected Google Drive accounts need re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "google_drive",
+                    }
+
+                logger.info(
+                    f"Requesting approval for creating Google Drive file: name='{name}', type='{file_type}'"
+                )
+                result = request_approval(
+                    action_type="google_drive_file_creation",
+                    tool_name="create_google_drive_file",
+                    params={
+                        "name": name,
+                        "file_type": file_type,
+                        "content": content,
+                        "connector_id": None,
+                        "parent_folder_id": None,
+                    },
+                    context=context,
+                )
 
-            from sqlalchemy.future import select
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. The file was not created. Do not ask again or suggest alternatives.",
+                    }
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                final_name = result.params.get("name", name)
+                final_file_type = result.params.get("file_type", file_type)
+                final_content = result.params.get("content", content)
+                final_connector_id = result.params.get("connector_id")
+                final_parent_folder_id = result.params.get("parent_folder_id")
 
-            _drive_types = [
-                SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
-            ]
+                if not final_name or not final_name.strip():
+                    return {"status": "error", "message": "File name cannot be empty."}
 
-            if final_connector_id is not None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_drive_types),
-                    )
-                )
-                connector = result.scalars().first()
-                if not connector:
+                mime_type = _MIME_MAP.get(final_file_type)
+                if not mime_type:
                     return {
                         "status": "error",
-                        "message": "Selected Google Drive connector is invalid or has been disconnected.",
+                        "message": f"Unsupported file type '{final_file_type}'.",
                     }
-                actual_connector_id = connector.id
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type.in_(_drive_types),
+
+                from sqlalchemy.future import select
+
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+                _drive_types = [
+                    SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+                ]
+
+                if final_connector_id is not None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_drive_types),
+                        )
                     )
-                )
-                connector = result.scalars().first()
-                if not connector:
-                    return {
-                        "status": "error",
-                        "message": "No Google Drive connector found. Please connect Google Drive in your workspace settings.",
-                    }
-                actual_connector_id = connector.id
-
-            logger.info(
-                f"Creating Google Drive file: name='{final_name}', type='{final_file_type}', connector={actual_connector_id}"
-            )
-
-            pre_built_creds = None
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
-
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    pre_built_creds = build_composio_credentials(cca_id)
-
-            client = GoogleDriveClient(
-                session=db_session,
-                connector_id=actual_connector_id,
-                credentials=pre_built_creds,
-            )
-            try:
-                created = await client.create_file(
-                    name=final_name,
-                    mime_type=mime_type,
-                    parent_folder_id=final_parent_folder_id,
-                    content=final_content,
-                )
-            except HttpError as http_err:
-                if http_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {actual_connector_id}: {http_err}"
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Google Drive connector is invalid or has been disconnected.",
+                        }
+                    actual_connector_id = connector.id
+                else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type.in_(_drive_types),
+                        )
                     )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "No Google Drive connector found. Please connect Google Drive in your workspace settings.",
+                        }
+                    actual_connector_id = connector.id
+
+                logger.info(
+                    f"Creating Google Drive file: name='{final_name}', type='{final_file_type}', connector={actual_connector_id}"
+                )
 
-                        _res = await db_session.execute(
-                            select(SearchSourceConnector).where(
-                                SearchSourceConnector.id == actual_connector_id
+                is_composio_drive = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
+                )
+                if is_composio_drive:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this Drive connector.",
+                        }
+                client = GoogleDriveClient(
+                    session=db_session,
+                    connector_id=actual_connector_id,
+                )
+                try:
+                    if is_composio_drive:
+                        from app.services.composio_service import ComposioService
+
+                        params: dict[str, Any] = {
+                            "name": final_name,
+                            "mimeType": mime_type,
+                            "fields": "id,name,webViewLink,mimeType",
+                        }
+                        if final_parent_folder_id:
+                            params["parents"] = [final_parent_folder_id]
+                        if final_content:
+                            params["description"] = final_content[:4096]
+
+                        result = await ComposioService().execute_tool(
+                            connected_account_id=cca_id,
+                            tool_name="GOOGLEDRIVE_CREATE_FILE",
+                            params=params,
+                            entity_id=f"surfsense_{user_id}",
+                        )
+                        if not result.get("success"):
+                            raise RuntimeError(
+                                result.get("error", "Unknown Composio Drive error")
                             )
+                        created = result.get("data", {})
+                        if isinstance(created, dict):
+                            created = created.get("data", created)
+                            if isinstance(created, dict):
+                                created = created.get("response_data", created)
+                        if not isinstance(created, dict):
+                            created = {}
+                    else:
+                        created = await client.create_file(
+                            name=final_name,
+                            mime_type=mime_type,
+                            parent_folder_id=final_parent_folder_id,
+                            content=final_content,
                         )
-                        _conn = _res.scalar_one_or_none()
-                        if _conn and not _conn.config.get("auth_expired"):
-                            _conn.config = {**_conn.config, "auth_expired": True}
-                            flag_modified(_conn, "config")
-                            await db_session.commit()
-                    except Exception:
+                except HttpError as http_err:
+                    if http_err.resp.status == 403:
                         logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            actual_connector_id,
-                            exc_info=True,
+                            f"Insufficient permissions for connector {actual_connector_id}: {http_err}"
                         )
-                    return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
-                    }
-                raise
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
 
-            logger.info(
-                f"Google Drive file created: id={created.get('id')}, name={created.get('name')}"
-            )
-
-            kb_message_suffix = ""
-            try:
-                from app.services.google_drive import GoogleDriveKBSyncService
-
-                kb_service = GoogleDriveKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    file_id=created.get("id"),
-                    file_name=created.get("name", final_name),
-                    mime_type=mime_type,
-                    web_view_link=created.get("webViewLink"),
-                    content=final_content,
-                    connector_id=actual_connector_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
+                            _res = await db_session.execute(
+                                select(SearchSourceConnector).where(
+                                    SearchSourceConnector.id == actual_connector_id
+                                )
+                            )
+                            _conn = _res.scalar_one_or_none()
+                            if _conn and not _conn.config.get("auth_expired"):
+                                _conn.config = {**_conn.config, "auth_expired": True}
+                                flag_modified(_conn, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                actual_connector_id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Google Drive file created: id={created.get('id')}, name={created.get('name')}"
                 )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
-                else:
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.google_drive import GoogleDriveKBSyncService
+
+                    kb_service = GoogleDriveKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        file_id=created.get("id"),
+                        file_name=created.get("name", final_name),
+                        mime_type=mime_type,
+                        web_view_link=created.get("webViewLink"),
+                        content=final_content,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "file_id": created.get("id"),
-                "name": created.get("name"),
-                "web_view_link": created.get("webViewLink"),
-                "message": f"Successfully created '{created.get('name')}' in Google Drive.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "file_id": created.get("id"),
+                    "name": created.get("name"),
+                    "web_view_link": created.get("webViewLink"),
+                    "message": f"Successfully created '{created.get('name')}' in Google Drive.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/google_drive/trash_file.py b/surfsense_backend/app/agents/new_chat/tools/google_drive/trash_file.py
index fdf7f9cd3f..b3c9240d8a 100644
--- a/surfsense_backend/app/agents/new_chat/tools/google_drive/trash_file.py
+++ b/surfsense_backend/app/agents/new_chat/tools/google_drive/trash_file.py
@@ -3,10 +3,11 @@
 
 from googleapiclient.errors import HttpError
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.google_drive.client import GoogleDriveClient
+from app.db import async_session_maker
 from app.services.google_drive import GoogleDriveToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -17,6 +18,25 @@ def create_delete_google_drive_file_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the delete_google_drive_file tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        search_space_id: Search space ID to find the Google Drive connector
+        user_id: User ID for fetching user-specific context
+
+    Returns:
+        Configured delete_google_drive_file tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def delete_google_drive_file(
         file_name: str,
@@ -55,225 +75,214 @@ async def delete_google_drive_file(
             f"delete_google_drive_file called: file_name='{file_name}', delete_from_kb={delete_from_kb}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "Google Drive tool not properly configured. Please contact support.",
             }
 
         try:
-            metadata_service = GoogleDriveToolMetadataService(db_session)
-            context = await metadata_service.get_trash_context(
-                search_space_id, user_id, file_name
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if "not found" in error_msg.lower():
-                    logger.warning(f"File not found: {error_msg}")
-                    return {"status": "not_found", "message": error_msg}
-                logger.error(f"Failed to fetch trash context: {error_msg}")
-                return {"status": "error", "message": error_msg}
-
-            account = context.get("account", {})
-            if account.get("auth_expired"):
-                logger.warning(
-                    "Google Drive account %s has expired authentication",
-                    account.get("id"),
+            async with async_session_maker() as db_session:
+                metadata_service = GoogleDriveToolMetadataService(db_session)
+                context = await metadata_service.get_trash_context(
+                    search_space_id, user_id, file_name
                 )
-                return {
-                    "status": "auth_error",
-                    "message": "The Google Drive account for this file needs re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "google_drive",
-                }
-
-            file = context["file"]
-            file_id = file["file_id"]
-            document_id = file.get("document_id")
-            connector_id_from_context = context["account"]["id"]
 
-            if not file_id:
-                return {
-                    "status": "error",
-                    "message": "File ID is missing from the indexed document. Please re-index the file and try again.",
-                }
+                if "error" in context:
+                    error_msg = context["error"]
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"File not found: {error_msg}")
+                        return {"status": "not_found", "message": error_msg}
+                    logger.error(f"Failed to fetch trash context: {error_msg}")
+                    return {"status": "error", "message": error_msg}
 
-            logger.info(
-                f"Requesting approval for deleting Google Drive file: '{file_name}' (file_id={file_id}, delete_from_kb={delete_from_kb})"
-            )
-            approval = interrupt(
-                {
-                    "type": "google_drive_file_trash",
-                    "action": {
-                        "tool": "delete_google_drive_file",
-                        "params": {
-                            "file_id": file_id,
-                            "connector_id": connector_id_from_context,
-                            "delete_from_kb": delete_from_kb,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The file was not trashed. Do not ask again or suggest alternatives.",
-                }
+                account = context.get("account", {})
+                if account.get("auth_expired"):
+                    logger.warning(
+                        "Google Drive account %s has expired authentication",
+                        account.get("id"),
+                    )
+                    return {
+                        "status": "auth_error",
+                        "message": "The Google Drive account for this file needs re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "google_drive",
+                    }
 
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_file_id = final_params.get("file_id", file_id)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
-
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this file.",
-                }
+                file = context["file"]
+                file_id = file["file_id"]
+                document_id = file.get("document_id")
+                connector_id_from_context = context["account"]["id"]
 
-            from sqlalchemy.future import select
+                if not file_id:
+                    return {
+                        "status": "error",
+                        "message": "File ID is missing from the indexed document. Please re-index the file and try again.",
+                    }
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                logger.info(
+                    f"Requesting approval for deleting Google Drive file: '{file_name}' (file_id={file_id}, delete_from_kb={delete_from_kb})"
+                )
+                result = request_approval(
+                    action_type="google_drive_file_trash",
+                    tool_name="delete_google_drive_file",
+                    params={
+                        "file_id": file_id,
+                        "connector_id": connector_id_from_context,
+                        "delete_from_kb": delete_from_kb,
+                    },
+                    context=context,
+                )
 
-            _drive_types = [
-                SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
-                SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
-            ]
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. The file was not trashed. Do not ask again or suggest alternatives.",
+                    }
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type.in_(_drive_types),
+                final_file_id = result.params.get("file_id", file_id)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Google Drive connector is invalid or has been disconnected.",
-                }
 
-            logger.info(
-                f"Deleting Google Drive file: file_id='{final_file_id}', connector={final_connector_id}"
-            )
-
-            pre_built_creds = None
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
-            ):
-                from app.utils.google_credentials import build_composio_credentials
-
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    pre_built_creds = build_composio_credentials(cca_id)
-
-            client = GoogleDriveClient(
-                session=db_session,
-                connector_id=connector.id,
-                credentials=pre_built_creds,
-            )
-            try:
-                await client.trash_file(file_id=final_file_id)
-            except HttpError as http_err:
-                if http_err.resp.status == 403:
-                    logger.warning(
-                        f"Insufficient permissions for connector {connector.id}: {http_err}"
-                    )
-                    try:
-                        from sqlalchemy.orm.attributes import flag_modified
-
-                        if not connector.config.get("auth_expired"):
-                            connector.config = {
-                                **connector.config,
-                                "auth_expired": True,
-                            }
-                            flag_modified(connector, "config")
-                            await db_session.commit()
-                    except Exception:
-                        logger.warning(
-                            "Failed to persist auth_expired for connector %s",
-                            connector.id,
-                            exc_info=True,
-                        )
+                if not final_connector_id:
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": connector.id,
-                        "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "error",
+                        "message": "No connector found for this file.",
                     }
-                raise
 
-            logger.info(
-                f"Google Drive file deleted (moved to trash): file_id={final_file_id}"
-            )
+                from sqlalchemy.future import select
 
-            trash_result: dict[str, Any] = {
-                "status": "success",
-                "file_id": final_file_id,
-                "message": f"Successfully moved '{file['name']}' to trash.",
-            }
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            deleted_from_kb = False
-            if final_delete_from_kb and document_id:
-                try:
-                    from app.db import Document
+                _drive_types = [
+                    SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
+                    SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+                ]
 
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type.in_(_drive_types),
                     )
-                    document = doc_result.scalars().first()
-                    if document:
-                        await db_session.delete(document)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                        logger.info(
-                            f"Deleted document {document_id} from knowledge base"
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected Google Drive connector is invalid or has been disconnected.",
+                    }
+
+                logger.info(
+                    f"Deleting Google Drive file: file_id='{final_file_id}', connector={final_connector_id}"
+                )
+
+                is_composio_drive = (
+                    connector.connector_type
+                    == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
+                )
+                if is_composio_drive:
+                    cca_id = connector.config.get("composio_connected_account_id")
+                    if not cca_id:
+                        return {
+                            "status": "error",
+                            "message": "Composio connected account ID not found for this Drive connector.",
+                        }
+
+                client = GoogleDriveClient(
+                    session=db_session,
+                    connector_id=connector.id,
+                )
+                try:
+                    if is_composio_drive:
+                        from app.services.composio_service import ComposioService
+
+                        result = await ComposioService().execute_tool(
+                            connected_account_id=cca_id,
+                            tool_name="GOOGLEDRIVE_TRASH_FILE",
+                            params={"file_id": final_file_id},
+                            entity_id=f"surfsense_{user_id}",
                         )
+                        if not result.get("success"):
+                            raise RuntimeError(
+                                result.get("error", "Unknown Composio Drive error")
+                            )
                     else:
-                        logger.warning(f"Document {document_id} not found in KB")
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-                    trash_result["warning"] = (
-                        f"File moved to trash, but failed to remove from knowledge base: {e!s}"
-                    )
-
-            trash_result["deleted_from_kb"] = deleted_from_kb
-            if deleted_from_kb:
-                trash_result["message"] = (
-                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                        await client.trash_file(file_id=final_file_id)
+                except HttpError as http_err:
+                    if http_err.resp.status == 403:
+                        logger.warning(
+                            f"Insufficient permissions for connector {connector.id}: {http_err}"
+                        )
+                        try:
+                            from sqlalchemy.orm.attributes import flag_modified
+
+                            if not connector.config.get("auth_expired"):
+                                connector.config = {
+                                    **connector.config,
+                                    "auth_expired": True,
+                                }
+                                flag_modified(connector, "config")
+                                await db_session.commit()
+                        except Exception:
+                            logger.warning(
+                                "Failed to persist auth_expired for connector %s",
+                                connector.id,
+                                exc_info=True,
+                            )
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": connector.id,
+                            "message": "This Google Drive account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                logger.info(
+                    f"Google Drive file deleted (moved to trash): file_id={final_file_id}"
                 )
 
-            return trash_result
+                trash_result: dict[str, Any] = {
+                    "status": "success",
+                    "file_id": final_file_id,
+                    "message": f"Successfully moved '{file['name']}' to trash.",
+                }
+
+                deleted_from_kb = False
+                if final_delete_from_kb and document_id:
+                    try:
+                        from app.db import Document
+
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        document = doc_result.scalars().first()
+                        if document:
+                            await db_session.delete(document)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                            logger.info(
+                                f"Deleted document {document_id} from knowledge base"
+                            )
+                        else:
+                            logger.warning(f"Document {document_id} not found in KB")
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+                        trash_result["warning"] = (
+                            f"File moved to trash, but failed to remove from knowledge base: {e!s}"
+                        )
+
+                trash_result["deleted_from_kb"] = deleted_from_kb
+                if deleted_from_kb:
+                    trash_result["message"] = (
+                        f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                    )
+
+                return trash_result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/hitl.py b/surfsense_backend/app/agents/new_chat/tools/hitl.py
new file mode 100644
index 0000000000..5b64929de0
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/hitl.py
@@ -0,0 +1,187 @@
+"""Unified HITL (Human-in-the-Loop) approval utility.
+
+Provides a single ``request_approval()`` function that encapsulates the
+interrupt payload creation, decision parsing, and parameter merging logic
+shared by every sensitive tool (native connectors and MCP tools alike).
+
+Usage inside a tool::
+
+    from app.agents.new_chat.tools.hitl import request_approval
+
+    result = request_approval(
+        action_type="gmail_email_send",
+        tool_name="send_gmail_email",
+        params={"to": to, "subject": subject, "body": body},
+        context=context,
+    )
+    if result.rejected:
+        return {"status": "rejected", "message": "User declined."}
+    # result.params contains the final (possibly edited) parameters
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+from langgraph.types import interrupt
+
+logger = logging.getLogger(__name__)
+
+
+# Tools that mirror the safety profile of ``write_file`` against the
+# SurfSense KB: each call creates ONE artifact in the user's own workspace
+# with no external visibility (drafts aren't sent; new files aren't shared
+# unless the user shares them later). These are auto-approved by default
+# so the agent can compose drafts and seed scratch files without a popup
+# on every call.
+#
+# Members of this set still call ``request_approval`` exactly as before;
+# the function returns immediately with ``decision_type="auto_approved"``
+# and the original params untouched. This preserves the call-site shape
+# (logging, metadata fetching, account fallbacks) so the only behavior
+# change is "no interrupt fires".
+#
+# To re-enable prompting, the future per-search-space rules table
+# (``agent_permission_rules``) takes precedence — see the ``# (future)``
+# layer-3 comment in :mod:`app.agents.new_chat.chat_deepagent`.
+DEFAULT_AUTO_APPROVED_TOOLS: frozenset[str] = frozenset(
+    {
+        "create_gmail_draft",
+        "update_gmail_draft",
+        "create_calendar_event",
+        "create_notion_page",
+        "create_confluence_page",
+        "create_google_drive_file",
+        "create_dropbox_file",
+        "create_onedrive_file",
+    }
+)
+
+
+@dataclass(frozen=True, slots=True)
+class HITLResult:
+    """Outcome of a human-in-the-loop approval request."""
+
+    rejected: bool
+    decision_type: str
+    params: dict[str, Any] = field(default_factory=dict)
+
+
+def _parse_decision(approval: Any) -> tuple[str, dict[str, Any]]:
+    """Extract the first valid decision and its edited parameters.
+
+    Returns:
+        (decision_type, edited_params) where *decision_type* is one of
+        ``"approve"``, ``"edit"``, or ``"reject"`` and *edited_params* is
+        the dict of user-modified arguments (empty when there are none).
+
+    Raises:
+        ValueError: when no usable decision dict can be found.
+    """
+    decisions_raw = approval.get("decisions", []) if isinstance(approval, dict) else []
+    decisions = decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
+    decisions = [d for d in decisions if isinstance(d, dict)]
+
+    if not decisions:
+        raise ValueError("No approval decision received")
+
+    decision = decisions[0]
+    decision_type: str = (
+        decision.get("type") or decision.get("decision_type") or "approve"
+    )
+
+    edited_params: dict[str, Any] = {}
+    edited_action = decision.get("edited_action")
+    if isinstance(edited_action, dict):
+        edited_args = edited_action.get("args")
+        if isinstance(edited_args, dict):
+            edited_params = edited_args
+    elif isinstance(decision.get("args"), dict):
+        edited_params = decision["args"]
+
+    return decision_type, edited_params
+
+
+def request_approval(
+    *,
+    action_type: str,
+    tool_name: str,
+    params: dict[str, Any],
+    context: dict[str, Any] | None = None,
+    trusted_tools: list[str] | None = None,
+) -> HITLResult:
+    """Pause the graph for user approval and return the decision.
+
+    This is a **synchronous** helper (not ``async``) because
+    ``langgraph.types.interrupt`` is itself synchronous — it raises a
+    ``GraphInterrupt`` exception that the LangGraph runtime catches.
+
+    Parameters
+    ----------
+    action_type:
+        A label that the frontend uses to select the correct approval card
+        (e.g. ``"gmail_email_send"``, ``"mcp_tool_call"``).
+    tool_name:
+        The registered LangChain tool name (e.g. ``"send_gmail_email"``).
+    params:
+        The original tool arguments.  These are shown in the approval card
+        and used as defaults when the user does not edit anything.
+    context:
+        Rich metadata from a ``*ToolMetadataService`` (accounts, folders,
+        labels, etc.).  For MCP tools this can hold the server name and
+        tool description.
+    trusted_tools:
+        An allow-list of tool names the user has previously marked as
+        "Always Allow".  If *tool_name* appears in this list, HITL is
+        skipped and the tool executes immediately.
+
+    Returns
+    -------
+    HITLResult
+        ``result.rejected`` is ``True`` when the user chose to deny the
+        action.  Otherwise ``result.params`` contains the final parameter
+        dict — either the originals or the user-edited version merged on
+        top.
+    """
+    if trusted_tools and tool_name in trusted_tools:
+        logger.info("Tool '%s' is user-trusted — skipping HITL", tool_name)
+        return HITLResult(rejected=False, decision_type="trusted", params=dict(params))
+
+    if tool_name in DEFAULT_AUTO_APPROVED_TOOLS:
+        # Default policy: low-stakes creation tools (drafts + new-file
+        # creates) skip HITL because they're as recoverable as a local
+        # ``write_file`` against the SurfSense KB. The user can still
+        # delete the artifact in <30s if it's wrong.
+        logger.info(
+            "Tool '%s' is in DEFAULT_AUTO_APPROVED_TOOLS — skipping HITL",
+            tool_name,
+        )
+        return HITLResult(
+            rejected=False, decision_type="auto_approved", params=dict(params)
+        )
+
+    approval = interrupt(
+        {
+            "type": action_type,
+            "action": {"tool": tool_name, "params": params},
+            "context": context or {},
+        }
+    )
+
+    try:
+        decision_type, edited_params = _parse_decision(approval)
+    except ValueError:
+        logger.warning(
+            "No approval decision received for %s — rejecting for safety", tool_name
+        )
+        return HITLResult(rejected=True, decision_type="error", params=params)
+
+    logger.info("User decision for %s: %s", tool_name, decision_type)
+
+    if decision_type == "reject":
+        return HITLResult(rejected=True, decision_type="reject", params=params)
+
+    final_params = {**params, **edited_params} if edited_params else dict(params)
+    return HITLResult(rejected=False, decision_type=decision_type, params=final_params)
diff --git a/surfsense_backend/app/agents/new_chat/tools/invalid_tool.py b/surfsense_backend/app/agents/new_chat/tools/invalid_tool.py
new file mode 100644
index 0000000000..ea4bc0bc17
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/invalid_tool.py
@@ -0,0 +1,53 @@
+"""
+The ``invalid`` fallback tool.
+
+When the model emits a tool call whose name doesn't match any registered
+tool, :class:`ToolCallNameRepairMiddleware` rewrites the call to ``invalid``
+with the original name and a parser/validation error string. This tool's
+execution then returns that error to the model so it can self-correct.
+
+Ported from OpenCode's ``packages/opencode/src/tool/invalid.ts`` —
+LangChain has no equivalent fallback path; the default behavior on an
+unknown tool name is a hard ``ToolNotFoundError`` which kills the turn.
+
+Critically, the :class:`ToolDefinition` for this tool is **excluded** from
+the system-prompt tool list and from ``LLMToolSelectorMiddleware`` selection
+(see ``ToolDefinition.always_include`` filtering in the registry) — the
+model never advertises ``invalid`` as a callable. It only ever shows up
+in the tool registry so LangGraph can dispatch the rewritten call.
+"""
+
+from __future__ import annotations
+
+from langchain_core.tools import tool
+
+INVALID_TOOL_NAME = "invalid"
+INVALID_TOOL_DESCRIPTION = "Do not use"
+
+
+def _format_invalid_message(tool: str | None, error: str | None) -> str:
+    """Return the user-visible error string. Mirrors ``invalid.ts``."""
+    name = tool or "<unknown>"
+    detail = error or "(no error message provided)"
+    return (
+        f"The arguments provided to the tool `{name}` are invalid: {detail}\n"
+        f"Read the tool's docstring carefully and try again with valid arguments."
+    )
+
+
+@tool(name_or_callable=INVALID_TOOL_NAME, description=INVALID_TOOL_DESCRIPTION)
+def invalid_tool(tool: str | None = None, error: str | None = None) -> str:
+    """Return a human-readable explanation of a tool-call validation failure.
+
+    Activated only when :class:`ToolCallNameRepairMiddleware` rewrites a
+    failed tool call to ``invalid`` with the original tool name and the
+    error message produced during validation.
+    """
+    return _format_invalid_message(tool, error)
+
+
+__all__ = [
+    "INVALID_TOOL_DESCRIPTION",
+    "INVALID_TOOL_NAME",
+    "invalid_tool",
+]
diff --git a/surfsense_backend/app/agents/new_chat/tools/jira/create_issue.py b/surfsense_backend/app/agents/new_chat/tools/jira/create_issue.py
index d441c49f3d..0b04f16426 100644
--- a/surfsense_backend/app/agents/new_chat/tools/jira/create_issue.py
+++ b/surfsense_backend/app/agents/new_chat/tools/jira/create_issue.py
@@ -3,11 +3,12 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.jira_history import JiraHistoryConnector
+from app.db import async_session_maker
 from app.services.jira import JiraToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,28 @@ def create_create_jira_issue_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
+    """Factory function to create the create_jira_issue tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker`. This is critical for the compiled-agent
+    cache: the compiled graph (and therefore this closure) is reused
+    across HTTP requests, so capturing a per-request session here would
+    surface stale/closed sessions on cache hits. Per-call sessions also
+    keep the request's outer transaction free of long-running Jira API
+    blocking.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        search_space_id: Search space ID to find the Jira connector
+        user_id: User ID for fetching user-specific context
+        connector_id: Optional specific connector ID (if known)
+
+    Returns:
+        Configured create_jira_issue tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def create_jira_issue(
         project_key: str,
@@ -49,184 +72,167 @@ async def create_jira_issue(
             f"create_jira_issue called: project_key='{project_key}', summary='{summary}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {"status": "error", "message": "Jira tool not properly configured."}
 
         try:
-            metadata_service = JiraToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
+            async with async_session_maker() as db_session:
+                metadata_service = JiraToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
+                )
 
-            if "error" in context:
-                return {"status": "error", "message": context["error"]}
+                if "error" in context:
+                    return {"status": "error", "message": context["error"]}
 
-            accounts = context.get("accounts", [])
-            if accounts and all(a.get("auth_expired") for a in accounts):
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Jira accounts need re-authentication.",
-                    "connector_type": "jira",
-                }
+                accounts = context.get("accounts", [])
+                if accounts and all(a.get("auth_expired") for a in accounts):
+                    return {
+                        "status": "auth_error",
+                        "message": "All connected Jira accounts need re-authentication.",
+                        "connector_type": "jira",
+                    }
 
-            approval = interrupt(
-                {
-                    "type": "jira_issue_creation",
-                    "action": {
-                        "tool": "create_jira_issue",
-                        "params": {
-                            "project_key": project_key,
-                            "summary": summary,
-                            "issue_type": issue_type,
-                            "description": description,
-                            "priority": priority,
-                            "connector_id": connector_id,
-                        },
+                result = request_approval(
+                    action_type="jira_issue_creation",
+                    tool_name="create_jira_issue",
+                    params={
+                        "project_key": project_key,
+                        "summary": summary,
+                        "issue_type": issue_type,
+                        "description": description,
+                        "priority": priority,
+                        "connector_id": connector_id,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The issue was not created.",
-                }
-
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_project_key = final_params.get("project_key", project_key)
-            final_summary = final_params.get("summary", summary)
-            final_issue_type = final_params.get("issue_type", issue_type)
-            final_description = final_params.get("description", description)
-            final_priority = final_params.get("priority", priority)
-            final_connector_id = final_params.get("connector_id", connector_id)
-
-            if not final_summary or not final_summary.strip():
-                return {"status": "error", "message": "Issue summary cannot be empty."}
-            if not final_project_key:
-                return {"status": "error", "message": "A project must be selected."}
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            actual_connector_id = final_connector_id
-            if actual_connector_id is None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.JIRA_CONNECTOR,
-                    )
-                )
-                connector = result.scalars().first()
-                if not connector:
-                    return {"status": "error", "message": "No Jira connector found."}
-                actual_connector_id = connector.id
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == actual_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.JIRA_CONNECTOR,
-                    )
+                    context=context,
                 )
-                connector = result.scalars().first()
-                if not connector:
+
+                if result.rejected:
                     return {
-                        "status": "error",
-                        "message": "Selected Jira connector is invalid.",
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
                     }
 
-            try:
-                jira_history = JiraHistoryConnector(
-                    session=db_session, connector_id=actual_connector_id
-                )
-                jira_client = await jira_history._get_jira_client()
-                api_result = await asyncio.to_thread(
-                    jira_client.create_issue,
-                    project_key=final_project_key,
-                    summary=final_summary,
-                    issue_type=final_issue_type,
-                    description=final_description,
-                    priority=final_priority,
-                )
-            except Exception as api_err:
-                if "status code 403" in str(api_err).lower():
-                    try:
-                        _conn = connector
-                        _conn.config = {**_conn.config, "auth_expired": True}
-                        flag_modified(_conn, "config")
-                        await db_session.commit()
-                    except Exception:
-                        pass
+                final_project_key = result.params.get("project_key", project_key)
+                final_summary = result.params.get("summary", summary)
+                final_issue_type = result.params.get("issue_type", issue_type)
+                final_description = result.params.get("description", description)
+                final_priority = result.params.get("priority", priority)
+                final_connector_id = result.params.get("connector_id", connector_id)
+
+                if not final_summary or not final_summary.strip():
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": actual_connector_id,
-                        "message": "This Jira account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "error",
+                        "message": "Issue summary cannot be empty.",
                     }
-                raise
-
-            issue_key = api_result.get("key", "")
-            issue_url = (
-                f"{jira_history._base_url}/browse/{issue_key}"
-                if jira_history._base_url and issue_key
-                else ""
-            )
-
-            kb_message_suffix = ""
-            try:
-                from app.services.jira import JiraKBSyncService
-
-                kb_service = JiraKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    issue_id=issue_key,
-                    issue_identifier=issue_key,
-                    issue_title=final_summary,
-                    description=final_description,
-                    state="To Do",
-                    connector_id=actual_connector_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
-                )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
+                if not final_project_key:
+                    return {"status": "error", "message": "A project must be selected."}
+
+                from sqlalchemy.future import select
+
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+                actual_connector_id = final_connector_id
+                if actual_connector_id is None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.JIRA_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "No Jira connector found.",
+                        }
+                    actual_connector_id = connector.id
                 else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == actual_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.JIRA_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Jira connector is invalid.",
+                        }
+
+                try:
+                    jira_history = JiraHistoryConnector(
+                        session=db_session, connector_id=actual_connector_id
+                    )
+                    jira_client = await jira_history._get_jira_client()
+                    api_result = await asyncio.to_thread(
+                        jira_client.create_issue,
+                        project_key=final_project_key,
+                        summary=final_summary,
+                        issue_type=final_issue_type,
+                        description=final_description,
+                        priority=final_priority,
+                    )
+                except Exception as api_err:
+                    if "status code 403" in str(api_err).lower():
+                        try:
+                            _conn = connector
+                            _conn.config = {**_conn.config, "auth_expired": True}
+                            flag_modified(_conn, "config")
+                            await db_session.commit()
+                        except Exception:
+                            pass
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": actual_connector_id,
+                            "message": "This Jira account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                issue_key = api_result.get("key", "")
+                issue_url = (
+                    f"{jira_history._base_url}/browse/{issue_key}"
+                    if jira_history._base_url and issue_key
+                    else ""
+                )
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.jira import JiraKBSyncService
+
+                    kb_service = JiraKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        issue_id=issue_key,
+                        issue_identifier=issue_key,
+                        issue_title=final_summary,
+                        description=final_description,
+                        state="To Do",
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This issue will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This issue will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This issue will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "issue_key": issue_key,
-                "issue_url": issue_url,
-                "message": f"Jira issue {issue_key} created successfully.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "issue_key": issue_key,
+                    "issue_url": issue_url,
+                    "message": f"Jira issue {issue_key} created successfully.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/jira/delete_issue.py b/surfsense_backend/app/agents/new_chat/tools/jira/delete_issue.py
index 2f8c370adc..c41aedad98 100644
--- a/surfsense_backend/app/agents/new_chat/tools/jira/delete_issue.py
+++ b/surfsense_backend/app/agents/new_chat/tools/jira/delete_issue.py
@@ -3,11 +3,12 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.jira_history import JiraHistoryConnector
+from app.db import async_session_maker
 from app.services.jira import JiraToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,26 @@ def create_delete_jira_issue_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
+    """Factory function to create the delete_jira_issue tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker`. This is critical for the compiled-agent
+    cache: the compiled graph (and therefore this closure) is reused
+    across HTTP requests, so capturing a per-request session here would
+    surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        search_space_id: Search space ID to find the Jira connector
+        user_id: User ID for fetching user-specific context
+        connector_id: Optional specific connector ID (if known)
+
+    Returns:
+        Configured delete_jira_issue tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def delete_jira_issue(
         issue_title_or_key: str,
@@ -44,156 +65,136 @@ async def delete_jira_issue(
             f"delete_jira_issue called: issue_title_or_key='{issue_title_or_key}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {"status": "error", "message": "Jira tool not properly configured."}
 
         try:
-            metadata_service = JiraToolMetadataService(db_session)
-            context = await metadata_service.get_deletion_context(
-                search_space_id, user_id, issue_title_or_key
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if context.get("auth_expired"):
-                    return {
-                        "status": "auth_error",
-                        "message": error_msg,
-                        "connector_id": context.get("connector_id"),
-                        "connector_type": "jira",
-                    }
-                if "not found" in error_msg.lower():
-                    return {"status": "not_found", "message": error_msg}
-                return {"status": "error", "message": error_msg}
-
-            issue_data = context["issue"]
-            issue_key = issue_data["issue_id"]
-            document_id = issue_data["document_id"]
-            connector_id_from_context = context.get("account", {}).get("id")
-
-            approval = interrupt(
-                {
-                    "type": "jira_issue_deletion",
-                    "action": {
-                        "tool": "delete_jira_issue",
-                        "params": {
-                            "issue_key": issue_key,
-                            "connector_id": connector_id_from_context,
-                            "delete_from_kb": delete_from_kb,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The issue was not deleted.",
-                }
+            async with async_session_maker() as db_session:
+                metadata_service = JiraToolMetadataService(db_session)
+                context = await metadata_service.get_deletion_context(
+                    search_space_id, user_id, issue_title_or_key
+                )
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
+                if "error" in context:
+                    error_msg = context["error"]
+                    if context.get("auth_expired"):
+                        return {
+                            "status": "auth_error",
+                            "message": error_msg,
+                            "connector_id": context.get("connector_id"),
+                            "connector_type": "jira",
+                        }
+                    if "not found" in error_msg.lower():
+                        return {"status": "not_found", "message": error_msg}
+                    return {"status": "error", "message": error_msg}
+
+                issue_data = context["issue"]
+                issue_key = issue_data["issue_id"]
+                document_id = issue_data["document_id"]
+                connector_id_from_context = context.get("account", {}).get("id")
+
+                result = request_approval(
+                    action_type="jira_issue_deletion",
+                    tool_name="delete_jira_issue",
+                    params={
+                        "issue_key": issue_key,
+                        "connector_id": connector_id_from_context,
+                        "delete_from_kb": delete_from_kb,
+                    },
+                    context=context,
+                )
 
-            final_issue_key = final_params.get("issue_key", issue_key)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            from sqlalchemy.future import select
+                final_issue_key = result.params.get("issue_key", issue_key)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
+                )
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                from sqlalchemy.future import select
 
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this issue.",
-                }
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type
-                    == SearchSourceConnectorType.JIRA_CONNECTOR,
-                )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Jira connector is invalid.",
-                }
+                if not final_connector_id:
+                    return {
+                        "status": "error",
+                        "message": "No connector found for this issue.",
+                    }
 
-            try:
-                jira_history = JiraHistoryConnector(
-                    session=db_session, connector_id=final_connector_id
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.JIRA_CONNECTOR,
+                    )
                 )
-                jira_client = await jira_history._get_jira_client()
-                await asyncio.to_thread(jira_client.delete_issue, final_issue_key)
-            except Exception as api_err:
-                if "status code 403" in str(api_err).lower():
-                    try:
-                        connector.config = {**connector.config, "auth_expired": True}
-                        flag_modified(connector, "config")
-                        await db_session.commit()
-                    except Exception:
-                        pass
+                connector = result.scalars().first()
+                if not connector:
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": final_connector_id,
-                        "message": "This Jira account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "error",
+                        "message": "Selected Jira connector is invalid.",
                     }
-                raise
 
-            deleted_from_kb = False
-            if final_delete_from_kb and document_id:
                 try:
-                    from app.db import Document
-
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
+                    jira_history = JiraHistoryConnector(
+                        session=db_session, connector_id=final_connector_id
                     )
-                    document = doc_result.scalars().first()
-                    if document:
-                        await db_session.delete(document)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-
-            message = f"Jira issue {final_issue_key} deleted successfully."
-            if deleted_from_kb:
-                message += " Also removed from the knowledge base."
+                    jira_client = await jira_history._get_jira_client()
+                    await asyncio.to_thread(jira_client.delete_issue, final_issue_key)
+                except Exception as api_err:
+                    if "status code 403" in str(api_err).lower():
+                        try:
+                            connector.config = {
+                                **connector.config,
+                                "auth_expired": True,
+                            }
+                            flag_modified(connector, "config")
+                            await db_session.commit()
+                        except Exception:
+                            pass
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": final_connector_id,
+                            "message": "This Jira account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                deleted_from_kb = False
+                if final_delete_from_kb and document_id:
+                    try:
+                        from app.db import Document
+
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        document = doc_result.scalars().first()
+                        if document:
+                            await db_session.delete(document)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+
+                message = f"Jira issue {final_issue_key} deleted successfully."
+                if deleted_from_kb:
+                    message += " Also removed from the knowledge base."
 
-            return {
-                "status": "success",
-                "issue_key": final_issue_key,
-                "deleted_from_kb": deleted_from_kb,
-                "message": message,
-            }
+                return {
+                    "status": "success",
+                    "issue_key": final_issue_key,
+                    "deleted_from_kb": deleted_from_kb,
+                    "message": message,
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/jira/update_issue.py b/surfsense_backend/app/agents/new_chat/tools/jira/update_issue.py
index c2b948ae32..0fd7b28b3e 100644
--- a/surfsense_backend/app/agents/new_chat/tools/jira/update_issue.py
+++ b/surfsense_backend/app/agents/new_chat/tools/jira/update_issue.py
@@ -3,11 +3,12 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.jira_history import JiraHistoryConnector
+from app.db import async_session_maker
 from app.services.jira import JiraToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -19,6 +20,26 @@ def create_update_jira_issue_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
+    """Factory function to create the update_jira_issue tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker`. This is critical for the compiled-agent
+    cache: the compiled graph (and therefore this closure) is reused
+    across HTTP requests, so capturing a per-request session here would
+    surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        search_space_id: Search space ID to find the Jira connector
+        user_id: User ID for fetching user-specific context
+        connector_id: Optional specific connector ID (if known)
+
+    Returns:
+        Configured update_jira_issue tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def update_jira_issue(
         issue_title_or_key: str,
@@ -48,195 +69,177 @@ async def update_jira_issue(
             f"update_jira_issue called: issue_title_or_key='{issue_title_or_key}'"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {"status": "error", "message": "Jira tool not properly configured."}
 
         try:
-            metadata_service = JiraToolMetadataService(db_session)
-            context = await metadata_service.get_update_context(
-                search_space_id, user_id, issue_title_or_key
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if context.get("auth_expired"):
-                    return {
-                        "status": "auth_error",
-                        "message": error_msg,
-                        "connector_id": context.get("connector_id"),
-                        "connector_type": "jira",
-                    }
-                if "not found" in error_msg.lower():
-                    return {"status": "not_found", "message": error_msg}
-                return {"status": "error", "message": error_msg}
-
-            issue_data = context["issue"]
-            issue_key = issue_data["issue_id"]
-            document_id = issue_data.get("document_id")
-            connector_id_from_context = context.get("account", {}).get("id")
-
-            approval = interrupt(
-                {
-                    "type": "jira_issue_update",
-                    "action": {
-                        "tool": "update_jira_issue",
-                        "params": {
-                            "issue_key": issue_key,
-                            "document_id": document_id,
-                            "new_summary": new_summary,
-                            "new_description": new_description,
-                            "new_priority": new_priority,
-                            "connector_id": connector_id_from_context,
-                        },
+            async with async_session_maker() as db_session:
+                metadata_service = JiraToolMetadataService(db_session)
+                context = await metadata_service.get_update_context(
+                    search_space_id, user_id, issue_title_or_key
+                )
+
+                if "error" in context:
+                    error_msg = context["error"]
+                    if context.get("auth_expired"):
+                        return {
+                            "status": "auth_error",
+                            "message": error_msg,
+                            "connector_id": context.get("connector_id"),
+                            "connector_type": "jira",
+                        }
+                    if "not found" in error_msg.lower():
+                        return {"status": "not_found", "message": error_msg}
+                    return {"status": "error", "message": error_msg}
+
+                issue_data = context["issue"]
+                issue_key = issue_data["issue_id"]
+                document_id = issue_data.get("document_id")
+                connector_id_from_context = context.get("account", {}).get("id")
+
+                result = request_approval(
+                    action_type="jira_issue_update",
+                    tool_name="update_jira_issue",
+                    params={
+                        "issue_key": issue_key,
+                        "document_id": document_id,
+                        "new_summary": new_summary,
+                        "new_description": new_description,
+                        "new_priority": new_priority,
+                        "connector_id": connector_id_from_context,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The issue was not updated.",
-                }
+                    context=context,
+                )
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_issue_key = final_params.get("issue_key", issue_key)
-            final_summary = final_params.get("new_summary", new_summary)
-            final_description = final_params.get("new_description", new_description)
-            final_priority = final_params.get("new_priority", new_priority)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_document_id = final_params.get("document_id", document_id)
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            if not final_connector_id:
-                return {
-                    "status": "error",
-                    "message": "No connector found for this issue.",
-                }
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type
-                    == SearchSourceConnectorType.JIRA_CONNECTOR,
+                final_issue_key = result.params.get("issue_key", issue_key)
+                final_summary = result.params.get("new_summary", new_summary)
+                final_description = result.params.get(
+                    "new_description", new_description
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected Jira connector is invalid.",
-                }
+                final_priority = result.params.get("new_priority", new_priority)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_document_id = result.params.get("document_id", document_id)
 
-            fields: dict[str, Any] = {}
-            if final_summary:
-                fields["summary"] = final_summary
-            if final_description is not None:
-                fields["description"] = {
-                    "type": "doc",
-                    "version": 1,
-                    "content": [
-                        {
-                            "type": "paragraph",
-                            "content": [{"type": "text", "text": final_description}],
-                        }
-                    ],
-                }
-            if final_priority:
-                fields["priority"] = {"name": final_priority}
+                from sqlalchemy.future import select
 
-            if not fields:
-                return {"status": "error", "message": "No changes specified."}
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            try:
-                jira_history = JiraHistoryConnector(
-                    session=db_session, connector_id=final_connector_id
-                )
-                jira_client = await jira_history._get_jira_client()
-                await asyncio.to_thread(
-                    jira_client.update_issue, final_issue_key, fields
+                if not final_connector_id:
+                    return {
+                        "status": "error",
+                        "message": "No connector found for this issue.",
+                    }
+
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.JIRA_CONNECTOR,
+                    )
                 )
-            except Exception as api_err:
-                if "status code 403" in str(api_err).lower():
-                    try:
-                        connector.config = {**connector.config, "auth_expired": True}
-                        flag_modified(connector, "config")
-                        await db_session.commit()
-                    except Exception:
-                        pass
+                connector = result.scalars().first()
+                if not connector:
                     return {
-                        "status": "insufficient_permissions",
-                        "connector_id": final_connector_id,
-                        "message": "This Jira account needs additional permissions. Please re-authenticate in connector settings.",
+                        "status": "error",
+                        "message": "Selected Jira connector is invalid.",
                     }
-                raise
 
-            issue_url = (
-                f"{jira_history._base_url}/browse/{final_issue_key}"
-                if jira_history._base_url and final_issue_key
-                else ""
-            )
+                fields: dict[str, Any] = {}
+                if final_summary:
+                    fields["summary"] = final_summary
+                if final_description is not None:
+                    fields["description"] = {
+                        "type": "doc",
+                        "version": 1,
+                        "content": [
+                            {
+                                "type": "paragraph",
+                                "content": [
+                                    {"type": "text", "text": final_description}
+                                ],
+                            }
+                        ],
+                    }
+                if final_priority:
+                    fields["priority"] = {"name": final_priority}
+
+                if not fields:
+                    return {"status": "error", "message": "No changes specified."}
 
-            kb_message_suffix = ""
-            if final_document_id:
                 try:
-                    from app.services.jira import JiraKBSyncService
-
-                    kb_service = JiraKBSyncService(db_session)
-                    kb_result = await kb_service.sync_after_update(
-                        document_id=final_document_id,
-                        issue_id=final_issue_key,
-                        user_id=user_id,
-                        search_space_id=search_space_id,
+                    jira_history = JiraHistoryConnector(
+                        session=db_session, connector_id=final_connector_id
                     )
-                    if kb_result["status"] == "success":
-                        kb_message_suffix = (
-                            " Your knowledge base has also been updated."
+                    jira_client = await jira_history._get_jira_client()
+                    await asyncio.to_thread(
+                        jira_client.update_issue, final_issue_key, fields
+                    )
+                except Exception as api_err:
+                    if "status code 403" in str(api_err).lower():
+                        try:
+                            connector.config = {
+                                **connector.config,
+                                "auth_expired": True,
+                            }
+                            flag_modified(connector, "config")
+                            await db_session.commit()
+                        except Exception:
+                            pass
+                        return {
+                            "status": "insufficient_permissions",
+                            "connector_id": final_connector_id,
+                            "message": "This Jira account needs additional permissions. Please re-authenticate in connector settings.",
+                        }
+                    raise
+
+                issue_url = (
+                    f"{jira_history._base_url}/browse/{final_issue_key}"
+                    if jira_history._base_url and final_issue_key
+                    else ""
+                )
+
+                kb_message_suffix = ""
+                if final_document_id:
+                    try:
+                        from app.services.jira import JiraKBSyncService
+
+                        kb_service = JiraKBSyncService(db_session)
+                        kb_result = await kb_service.sync_after_update(
+                            document_id=final_document_id,
+                            issue_id=final_issue_key,
+                            user_id=user_id,
+                            search_space_id=search_space_id,
                         )
-                    else:
+                        if kb_result["status"] == "success":
+                            kb_message_suffix = (
+                                " Your knowledge base has also been updated."
+                            )
+                        else:
+                            kb_message_suffix = (
+                                " The knowledge base will be updated in the next sync."
+                            )
+                    except Exception as kb_err:
+                        logger.warning(f"KB sync after update failed: {kb_err}")
                         kb_message_suffix = (
                             " The knowledge base will be updated in the next sync."
                         )
-                except Exception as kb_err:
-                    logger.warning(f"KB sync after update failed: {kb_err}")
-                    kb_message_suffix = (
-                        " The knowledge base will be updated in the next sync."
-                    )
 
-            return {
-                "status": "success",
-                "issue_key": final_issue_key,
-                "issue_url": issue_url,
-                "message": f"Jira issue {final_issue_key} updated successfully.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "issue_key": final_issue_key,
+                    "issue_url": issue_url,
+                    "message": f"Jira issue {final_issue_key} updated successfully.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/linear/create_issue.py b/surfsense_backend/app/agents/new_chat/tools/linear/create_issue.py
index 2b5d379036..f897bee7a4 100644
--- a/surfsense_backend/app/agents/new_chat/tools/linear/create_issue.py
+++ b/surfsense_backend/app/agents/new_chat/tools/linear/create_issue.py
@@ -2,10 +2,11 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.linear_connector import LinearAPIError, LinearConnector
+from app.db import async_session_maker
 from app.services.linear import LinearToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -17,11 +18,17 @@ def create_create_linear_issue_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
-    """
-    Factory function to create the create_linear_issue tool.
+    """Factory function to create the create_linear_issue tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker`. This is critical for the compiled-agent
+    cache: the compiled graph (and therefore this closure) is reused
+    across HTTP requests, so capturing a per-request session here would
+    surface stale/closed sessions on cache hits.
 
     Args:
-        db_session: Database session for accessing the Linear connector
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
         search_space_id: Search space ID to find the Linear connector
         user_id: User ID for fetching user-specific context
         connector_id: Optional specific connector ID (if known)
@@ -29,6 +36,7 @@ def create_create_linear_issue_tool(
     Returns:
         Configured create_linear_issue tool
     """
+    del db_session  # per-call session — see docstring
 
     @tool
     async def create_linear_issue(
@@ -65,7 +73,7 @@ async def create_linear_issue(
         """
         logger.info(f"create_linear_issue called: title='{title}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             logger.error(
                 "Linear tool not properly configured - missing required parameters"
             )
@@ -75,188 +83,170 @@ async def create_linear_issue(
             }
 
         try:
-            metadata_service = LinearToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
+            async with async_session_maker() as db_session:
+                metadata_service = LinearToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
+                )
 
-            if "error" in context:
-                logger.error(f"Failed to fetch creation context: {context['error']}")
-                return {"status": "error", "message": context["error"]}
+                if "error" in context:
+                    logger.error(
+                        f"Failed to fetch creation context: {context['error']}"
+                    )
+                    return {"status": "error", "message": context["error"]}
 
-            workspaces = context.get("workspaces", [])
-            if workspaces and all(w.get("auth_expired") for w in workspaces):
-                logger.warning("All Linear accounts have expired authentication")
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Linear accounts need re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "linear",
-                }
+                workspaces = context.get("workspaces", [])
+                if workspaces and all(w.get("auth_expired") for w in workspaces):
+                    logger.warning("All Linear accounts have expired authentication")
+                    return {
+                        "status": "auth_error",
+                        "message": "All connected Linear accounts need re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "linear",
+                    }
 
-            logger.info(f"Requesting approval for creating Linear issue: '{title}'")
-            approval = interrupt(
-                {
-                    "type": "linear_issue_creation",
-                    "action": {
-                        "tool": "create_linear_issue",
-                        "params": {
-                            "title": title,
-                            "description": description,
-                            "team_id": None,
-                            "state_id": None,
-                            "assignee_id": None,
-                            "priority": None,
-                            "label_ids": [],
-                            "connector_id": connector_id,
-                        },
+                logger.info(f"Requesting approval for creating Linear issue: '{title}'")
+                result = request_approval(
+                    action_type="linear_issue_creation",
+                    tool_name="create_linear_issue",
+                    params={
+                        "title": title,
+                        "description": description,
+                        "team_id": None,
+                        "state_id": None,
+                        "assignee_id": None,
+                        "priority": None,
+                        "label_ids": [],
+                        "connector_id": connector_id,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                logger.info("Linear issue creation rejected by user")
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The issue was not created. Do not ask again or suggest alternatives.",
-                }
-
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_title = final_params.get("title", title)
-            final_description = final_params.get("description", description)
-            final_team_id = final_params.get("team_id")
-            final_state_id = final_params.get("state_id")
-            final_assignee_id = final_params.get("assignee_id")
-            final_priority = final_params.get("priority")
-            final_label_ids = final_params.get("label_ids") or []
-            final_connector_id = final_params.get("connector_id", connector_id)
-
-            if not final_title or not final_title.strip():
-                logger.error("Title is empty or contains only whitespace")
-                return {"status": "error", "message": "Issue title cannot be empty."}
-            if not final_team_id:
-                return {
-                    "status": "error",
-                    "message": "A team must be selected to create an issue.",
-                }
-
-            from sqlalchemy.future import select
+                    context=context,
+                )
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                if result.rejected:
+                    logger.info("Linear issue creation rejected by user")
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            actual_connector_id = final_connector_id
-            if actual_connector_id is None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.LINEAR_CONNECTOR,
-                    )
-                )
-                connector = result.scalars().first()
-                if not connector:
+                final_title = result.params.get("title", title)
+                final_description = result.params.get("description", description)
+                final_team_id = result.params.get("team_id")
+                final_state_id = result.params.get("state_id")
+                final_assignee_id = result.params.get("assignee_id")
+                final_priority = result.params.get("priority")
+                final_label_ids = result.params.get("label_ids") or []
+                final_connector_id = result.params.get("connector_id", connector_id)
+
+                if not final_title or not final_title.strip():
+                    logger.error("Title is empty or contains only whitespace")
                     return {
                         "status": "error",
-                        "message": "No Linear connector found. Please connect Linear in your workspace settings.",
+                        "message": "Issue title cannot be empty.",
                     }
-                actual_connector_id = connector.id
-                logger.info(f"Found Linear connector: id={actual_connector_id}")
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == actual_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.LINEAR_CONNECTOR,
-                    )
-                )
-                connector = result.scalars().first()
-                if not connector:
+                if not final_team_id:
                     return {
                         "status": "error",
-                        "message": "Selected Linear connector is invalid or has been disconnected.",
+                        "message": "A team must be selected to create an issue.",
                     }
-                logger.info(f"Validated Linear connector: id={actual_connector_id}")
 
-            logger.info(
-                f"Creating Linear issue with final params: title='{final_title}'"
-            )
-            linear_client = LinearConnector(
-                session=db_session, connector_id=actual_connector_id
-            )
-            result = await linear_client.create_issue(
-                team_id=final_team_id,
-                title=final_title,
-                description=final_description,
-                state_id=final_state_id,
-                assignee_id=final_assignee_id,
-                priority=final_priority,
-                label_ids=final_label_ids if final_label_ids else None,
-            )
+                from sqlalchemy.future import select
 
-            if result.get("status") == "error":
-                logger.error(f"Failed to create Linear issue: {result.get('message')}")
-                return {"status": "error", "message": result.get("message")}
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            logger.info(
-                f"Linear issue created: {result.get('identifier')} - {result.get('title')}"
-            )
+                actual_connector_id = final_connector_id
+                if actual_connector_id is None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.LINEAR_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "No Linear connector found. Please connect Linear in your workspace settings.",
+                        }
+                    actual_connector_id = connector.id
+                    logger.info(f"Found Linear connector: id={actual_connector_id}")
+                else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == actual_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.LINEAR_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+                    if not connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected Linear connector is invalid or has been disconnected.",
+                        }
+                    logger.info(f"Validated Linear connector: id={actual_connector_id}")
+
+                logger.info(
+                    f"Creating Linear issue with final params: title='{final_title}'"
+                )
+                linear_client = LinearConnector(
+                    session=db_session, connector_id=actual_connector_id
+                )
+                result = await linear_client.create_issue(
+                    team_id=final_team_id,
+                    title=final_title,
+                    description=final_description,
+                    state_id=final_state_id,
+                    assignee_id=final_assignee_id,
+                    priority=final_priority,
+                    label_ids=final_label_ids if final_label_ids else None,
+                )
 
-            kb_message_suffix = ""
-            try:
-                from app.services.linear import LinearKBSyncService
+                if result.get("status") == "error":
+                    logger.error(
+                        f"Failed to create Linear issue: {result.get('message')}"
+                    )
+                    return {"status": "error", "message": result.get("message")}
 
-                kb_service = LinearKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    issue_id=result.get("id"),
-                    issue_identifier=result.get("identifier", ""),
-                    issue_title=result.get("title", final_title),
-                    issue_url=result.get("url"),
-                    description=final_description,
-                    connector_id=actual_connector_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
+                logger.info(
+                    f"Linear issue created: {result.get('identifier')} - {result.get('title')}"
                 )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
-                else:
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.linear import LinearKBSyncService
+
+                    kb_service = LinearKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        issue_id=result.get("id"),
+                        issue_identifier=result.get("identifier", ""),
+                        issue_title=result.get("title", final_title),
+                        issue_url=result.get("url"),
+                        description=final_description,
+                        connector_id=actual_connector_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This issue will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This issue will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This issue will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "issue_id": result.get("id"),
-                "identifier": result.get("identifier"),
-                "url": result.get("url"),
-                "message": (result.get("message", "") + kb_message_suffix),
-            }
+                return {
+                    "status": "success",
+                    "issue_id": result.get("id"),
+                    "identifier": result.get("identifier"),
+                    "url": result.get("url"),
+                    "message": (result.get("message", "") + kb_message_suffix),
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/linear/delete_issue.py b/surfsense_backend/app/agents/new_chat/tools/linear/delete_issue.py
index 9f4a60953e..c5039a8eb4 100644
--- a/surfsense_backend/app/agents/new_chat/tools/linear/delete_issue.py
+++ b/surfsense_backend/app/agents/new_chat/tools/linear/delete_issue.py
@@ -2,10 +2,11 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.linear_connector import LinearAPIError, LinearConnector
+from app.db import async_session_maker
 from app.services.linear import LinearToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -17,11 +18,17 @@ def create_delete_linear_issue_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
-    """
-    Factory function to create the delete_linear_issue tool.
+    """Factory function to create the delete_linear_issue tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker`. This is critical for the compiled-agent
+    cache: the compiled graph (and therefore this closure) is reused
+    across HTTP requests, so capturing a per-request session here would
+    surface stale/closed sessions on cache hits.
 
     Args:
-        db_session: Database session for accessing the Linear connector
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
         search_space_id: Search space ID to find the Linear connector
         user_id: User ID for finding the correct Linear connector
         connector_id: Optional specific connector ID (if known)
@@ -29,6 +36,7 @@ def create_delete_linear_issue_tool(
     Returns:
         Configured delete_linear_issue tool
     """
+    del db_session  # per-call session — see docstring
 
     @tool
     async def delete_linear_issue(
@@ -73,7 +81,7 @@ async def delete_linear_issue(
             f"delete_linear_issue called: issue_ref='{issue_ref}', delete_from_kb={delete_from_kb}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             logger.error(
                 "Linear tool not properly configured - missing required parameters"
             )
@@ -83,177 +91,152 @@ async def delete_linear_issue(
             }
 
         try:
-            metadata_service = LinearToolMetadataService(db_session)
-            context = await metadata_service.get_delete_context(
-                search_space_id, user_id, issue_ref
-            )
+            async with async_session_maker() as db_session:
+                metadata_service = LinearToolMetadataService(db_session)
+                context = await metadata_service.get_delete_context(
+                    search_space_id, user_id, issue_ref
+                )
 
-            if "error" in context:
-                error_msg = context["error"]
-                if context.get("auth_expired"):
-                    logger.warning(f"Auth expired for delete context: {error_msg}")
-                    return {
-                        "status": "auth_error",
-                        "message": error_msg,
-                        "connector_id": context.get("connector_id"),
-                        "connector_type": "linear",
-                    }
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Issue not found: {error_msg}")
-                    return {"status": "not_found", "message": error_msg}
-                else:
-                    logger.error(f"Failed to fetch delete context: {error_msg}")
-                    return {"status": "error", "message": error_msg}
+                if "error" in context:
+                    error_msg = context["error"]
+                    if context.get("auth_expired"):
+                        logger.warning(f"Auth expired for delete context: {error_msg}")
+                        return {
+                            "status": "auth_error",
+                            "message": error_msg,
+                            "connector_id": context.get("connector_id"),
+                            "connector_type": "linear",
+                        }
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Issue not found: {error_msg}")
+                        return {"status": "not_found", "message": error_msg}
+                    else:
+                        logger.error(f"Failed to fetch delete context: {error_msg}")
+                        return {"status": "error", "message": error_msg}
 
-            issue_id = context["issue"]["id"]
-            issue_identifier = context["issue"].get("identifier", "")
-            document_id = context["issue"]["document_id"]
-            connector_id_from_context = context.get("workspace", {}).get("id")
+                issue_id = context["issue"]["id"]
+                issue_identifier = context["issue"].get("identifier", "")
+                document_id = context["issue"]["document_id"]
+                connector_id_from_context = context.get("workspace", {}).get("id")
 
-            logger.info(
-                f"Requesting approval for deleting Linear issue: '{issue_ref}' "
-                f"(id={issue_id}, delete_from_kb={delete_from_kb})"
-            )
-            approval = interrupt(
-                {
-                    "type": "linear_issue_deletion",
-                    "action": {
-                        "tool": "delete_linear_issue",
-                        "params": {
-                            "issue_id": issue_id,
-                            "connector_id": connector_id_from_context,
-                            "delete_from_kb": delete_from_kb,
-                        },
+                logger.info(
+                    f"Requesting approval for deleting Linear issue: '{issue_ref}' "
+                    f"(id={issue_id}, delete_from_kb={delete_from_kb})"
+                )
+                result = request_approval(
+                    action_type="linear_issue_deletion",
+                    tool_name="delete_linear_issue",
+                    params={
+                        "issue_id": issue_id,
+                        "connector_id": connector_id_from_context,
+                        "delete_from_kb": delete_from_kb,
                     },
-                    "context": context,
-                }
-            )
+                    context=context,
+                )
 
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                logger.info("Linear issue deletion rejected by user")
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The issue was not deleted. Do not ask again or suggest alternatives.",
-                }
-
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_issue_id = final_params.get("issue_id", issue_id)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
+                if result.rejected:
+                    logger.info("Linear issue deletion rejected by user")
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            logger.info(
-                f"Deleting Linear issue with final params: issue_id={final_issue_id}, "
-                f"connector_id={final_connector_id}, delete_from_kb={final_delete_from_kb}"
-            )
+                final_issue_id = result.params.get("issue_id", issue_id)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
+                )
 
-            from sqlalchemy.future import select
+                logger.info(
+                    f"Deleting Linear issue with final params: issue_id={final_issue_id}, "
+                    f"connector_id={final_connector_id}, delete_from_kb={final_delete_from_kb}"
+                )
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                from sqlalchemy.future import select
 
-            if final_connector_id:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.LINEAR_CONNECTOR,
-                    )
-                )
-                connector = result.scalars().first()
-                if not connector:
-                    logger.error(
-                        f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+                if final_connector_id:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.LINEAR_CONNECTOR,
+                        )
                     )
+                    connector = result.scalars().first()
+                    if not connector:
+                        logger.error(
+                            f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                        )
+                        return {
+                            "status": "error",
+                            "message": "Selected Linear connector is invalid or has been disconnected.",
+                        }
+                    actual_connector_id = connector.id
+                    logger.info(f"Validated Linear connector: id={actual_connector_id}")
+                else:
+                    logger.error("No connector found for this issue")
                     return {
                         "status": "error",
-                        "message": "Selected Linear connector is invalid or has been disconnected.",
+                        "message": "No connector found for this issue.",
                     }
-                actual_connector_id = connector.id
-                logger.info(f"Validated Linear connector: id={actual_connector_id}")
-            else:
-                logger.error("No connector found for this issue")
-                return {
-                    "status": "error",
-                    "message": "No connector found for this issue.",
-                }
-
-            linear_client = LinearConnector(
-                session=db_session, connector_id=actual_connector_id
-            )
 
-            result = await linear_client.archive_issue(issue_id=final_issue_id)
+                linear_client = LinearConnector(
+                    session=db_session, connector_id=actual_connector_id
+                )
 
-            logger.info(
-                f"archive_issue result: {result.get('status')} - {result.get('message', '')}"
-            )
+                result = await linear_client.archive_issue(issue_id=final_issue_id)
 
-            deleted_from_kb = False
-            if (
-                result.get("status") == "success"
-                and final_delete_from_kb
-                and document_id
-            ):
-                try:
-                    from app.db import Document
-
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
-                    )
-                    document = doc_result.scalars().first()
-                    if document:
-                        await db_session.delete(document)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                        logger.info(
-                            f"Deleted document {document_id} from knowledge base"
+                logger.info(
+                    f"archive_issue result: {result.get('status')} - {result.get('message', '')}"
+                )
+
+                deleted_from_kb = False
+                if (
+                    result.get("status") == "success"
+                    and final_delete_from_kb
+                    and document_id
+                ):
+                    try:
+                        from app.db import Document
+
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        document = doc_result.scalars().first()
+                        if document:
+                            await db_session.delete(document)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                            logger.info(
+                                f"Deleted document {document_id} from knowledge base"
+                            )
+                        else:
+                            logger.warning(f"Document {document_id} not found in KB")
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+                        result["warning"] = (
+                            f"Issue archived in Linear, but failed to remove from knowledge base: {e!s}"
                         )
-                    else:
-                        logger.warning(f"Document {document_id} not found in KB")
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-                    result["warning"] = (
-                        f"Issue archived in Linear, but failed to remove from knowledge base: {e!s}"
-                    )
 
-            if result.get("status") == "success":
-                result["deleted_from_kb"] = deleted_from_kb
-                if issue_identifier:
-                    result["message"] = (
-                        f"Issue {issue_identifier} archived successfully."
-                    )
-                if deleted_from_kb:
-                    result["message"] = (
-                        f"{result.get('message', '')} Also removed from the knowledge base."
-                    )
+                if result.get("status") == "success":
+                    result["deleted_from_kb"] = deleted_from_kb
+                    if issue_identifier:
+                        result["message"] = (
+                            f"Issue {issue_identifier} archived successfully."
+                        )
+                    if deleted_from_kb:
+                        result["message"] = (
+                            f"{result.get('message', '')} Also removed from the knowledge base."
+                        )
 
-            return result
+                return result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/linear/update_issue.py b/surfsense_backend/app/agents/new_chat/tools/linear/update_issue.py
index 19af851c12..d610ce2b73 100644
--- a/surfsense_backend/app/agents/new_chat/tools/linear/update_issue.py
+++ b/surfsense_backend/app/agents/new_chat/tools/linear/update_issue.py
@@ -2,10 +2,11 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.linear_connector import LinearAPIError, LinearConnector
+from app.db import async_session_maker
 from app.services.linear import LinearKBSyncService, LinearToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -17,11 +18,17 @@ def create_update_linear_issue_tool(
     user_id: str | None = None,
     connector_id: int | None = None,
 ):
-    """
-    Factory function to create the update_linear_issue tool.
+    """Factory function to create the update_linear_issue tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker`. This is critical for the compiled-agent
+    cache: the compiled graph (and therefore this closure) is reused
+    across HTTP requests, so capturing a per-request session here would
+    surface stale/closed sessions on cache hits.
 
     Args:
-        db_session: Database session for accessing the Linear connector
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
         search_space_id: Search space ID to find the Linear connector
         user_id: User ID for fetching user-specific context
         connector_id: Optional specific connector ID (if known)
@@ -29,6 +36,7 @@ def create_update_linear_issue_tool(
     Returns:
         Configured update_linear_issue tool
     """
+    del db_session  # per-call session — see docstring
 
     @tool
     async def update_linear_issue(
@@ -86,7 +94,7 @@ async def update_linear_issue(
         """
         logger.info(f"update_linear_issue called: issue_ref='{issue_ref}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             logger.error(
                 "Linear tool not properly configured - missing required parameters"
             )
@@ -96,200 +104,177 @@ async def update_linear_issue(
             }
 
         try:
-            metadata_service = LinearToolMetadataService(db_session)
-            context = await metadata_service.get_update_context(
-                search_space_id, user_id, issue_ref
-            )
-
-            if "error" in context:
-                error_msg = context["error"]
-                if context.get("auth_expired"):
-                    logger.warning(f"Auth expired for update context: {error_msg}")
-                    return {
-                        "status": "auth_error",
-                        "message": error_msg,
-                        "connector_id": context.get("connector_id"),
-                        "connector_type": "linear",
-                    }
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Issue not found: {error_msg}")
-                    return {"status": "not_found", "message": error_msg}
-                else:
-                    logger.error(f"Failed to fetch update context: {error_msg}")
-                    return {"status": "error", "message": error_msg}
-
-            issue_id = context["issue"]["id"]
-            document_id = context["issue"]["document_id"]
-            connector_id_from_context = context.get("workspace", {}).get("id")
+            async with async_session_maker() as db_session:
+                metadata_service = LinearToolMetadataService(db_session)
+                context = await metadata_service.get_update_context(
+                    search_space_id, user_id, issue_ref
+                )
 
-            team = context.get("team", {})
-            new_state_id = _resolve_state(team, new_state_name)
-            new_assignee_id = _resolve_assignee(team, new_assignee_email)
-            new_label_ids = _resolve_labels(team, new_label_names)
+                if "error" in context:
+                    error_msg = context["error"]
+                    if context.get("auth_expired"):
+                        logger.warning(f"Auth expired for update context: {error_msg}")
+                        return {
+                            "status": "auth_error",
+                            "message": error_msg,
+                            "connector_id": context.get("connector_id"),
+                            "connector_type": "linear",
+                        }
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Issue not found: {error_msg}")
+                        return {"status": "not_found", "message": error_msg}
+                    else:
+                        logger.error(f"Failed to fetch update context: {error_msg}")
+                        return {"status": "error", "message": error_msg}
+
+                issue_id = context["issue"]["id"]
+                document_id = context["issue"]["document_id"]
+                connector_id_from_context = context.get("workspace", {}).get("id")
+
+                team = context.get("team", {})
+                new_state_id = _resolve_state(team, new_state_name)
+                new_assignee_id = _resolve_assignee(team, new_assignee_email)
+                new_label_ids = _resolve_labels(team, new_label_names)
 
-            logger.info(
-                f"Requesting approval for updating Linear issue: '{issue_ref}' (id={issue_id})"
-            )
-            approval = interrupt(
-                {
-                    "type": "linear_issue_update",
-                    "action": {
-                        "tool": "update_linear_issue",
-                        "params": {
-                            "issue_id": issue_id,
-                            "document_id": document_id,
-                            "new_title": new_title,
-                            "new_description": new_description,
-                            "new_state_id": new_state_id,
-                            "new_assignee_id": new_assignee_id,
-                            "new_priority": new_priority,
-                            "new_label_ids": new_label_ids,
-                            "connector_id": connector_id_from_context,
-                        },
+                logger.info(
+                    f"Requesting approval for updating Linear issue: '{issue_ref}' (id={issue_id})"
+                )
+                result = request_approval(
+                    action_type="linear_issue_update",
+                    tool_name="update_linear_issue",
+                    params={
+                        "issue_id": issue_id,
+                        "document_id": document_id,
+                        "new_title": new_title,
+                        "new_description": new_description,
+                        "new_state_id": new_state_id,
+                        "new_assignee_id": new_assignee_id,
+                        "new_priority": new_priority,
+                        "new_label_ids": new_label_ids,
+                        "connector_id": connector_id_from_context,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {"status": "error", "message": "No approval decision received"}
+                    context=context,
+                )
 
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
+                if result.rejected:
+                    logger.info("Linear issue update rejected by user")
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            if decision_type == "reject":
-                logger.info("Linear issue update rejected by user")
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The issue was not updated. Do not ask again or suggest alternatives.",
-                }
+                final_issue_id = result.params.get("issue_id", issue_id)
+                final_document_id = result.params.get("document_id", document_id)
+                final_new_title = result.params.get("new_title", new_title)
+                final_new_description = result.params.get(
+                    "new_description", new_description
+                )
+                final_new_state_id = result.params.get("new_state_id", new_state_id)
+                final_new_assignee_id = result.params.get(
+                    "new_assignee_id", new_assignee_id
+                )
+                final_new_priority = result.params.get("new_priority", new_priority)
+                final_new_label_ids: list[str] | None = result.params.get(
+                    "new_label_ids", new_label_ids
+                )
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
 
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_issue_id = final_params.get("issue_id", issue_id)
-            final_document_id = final_params.get("document_id", document_id)
-            final_new_title = final_params.get("new_title", new_title)
-            final_new_description = final_params.get("new_description", new_description)
-            final_new_state_id = final_params.get("new_state_id", new_state_id)
-            final_new_assignee_id = final_params.get("new_assignee_id", new_assignee_id)
-            final_new_priority = final_params.get("new_priority", new_priority)
-            final_new_label_ids: list[str] | None = final_params.get(
-                "new_label_ids", new_label_ids
-            )
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
+                if not final_connector_id:
+                    logger.error("No connector found for this issue")
+                    return {
+                        "status": "error",
+                        "message": "No connector found for this issue.",
+                    }
 
-            if not final_connector_id:
-                logger.error("No connector found for this issue")
-                return {
-                    "status": "error",
-                    "message": "No connector found for this issue.",
-                }
+                from sqlalchemy.future import select
 
-            from sqlalchemy.future import select
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == final_connector_id,
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.LINEAR_CONNECTOR,
+                    )
+                )
+                connector = result.scalars().first()
+                if not connector:
+                    logger.error(
+                        f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": "Selected Linear connector is invalid or has been disconnected.",
+                    }
+                logger.info(f"Validated Linear connector: id={final_connector_id}")
 
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.id == final_connector_id,
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type
-                    == SearchSourceConnectorType.LINEAR_CONNECTOR,
+                logger.info(
+                    f"Updating Linear issue with final params: issue_id={final_issue_id}"
                 )
-            )
-            connector = result.scalars().first()
-            if not connector:
-                logger.error(
-                    f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                linear_client = LinearConnector(
+                    session=db_session, connector_id=final_connector_id
                 )
-                return {
-                    "status": "error",
-                    "message": "Selected Linear connector is invalid or has been disconnected.",
-                }
-            logger.info(f"Validated Linear connector: id={final_connector_id}")
-
-            logger.info(
-                f"Updating Linear issue with final params: issue_id={final_issue_id}"
-            )
-            linear_client = LinearConnector(
-                session=db_session, connector_id=final_connector_id
-            )
-            updated_issue = await linear_client.update_issue(
-                issue_id=final_issue_id,
-                title=final_new_title,
-                description=final_new_description,
-                state_id=final_new_state_id,
-                assignee_id=final_new_assignee_id,
-                priority=final_new_priority,
-                label_ids=final_new_label_ids,
-            )
-
-            if updated_issue.get("status") == "error":
-                logger.error(
-                    f"Failed to update Linear issue: {updated_issue.get('message')}"
+                updated_issue = await linear_client.update_issue(
+                    issue_id=final_issue_id,
+                    title=final_new_title,
+                    description=final_new_description,
+                    state_id=final_new_state_id,
+                    assignee_id=final_new_assignee_id,
+                    priority=final_new_priority,
+                    label_ids=final_new_label_ids,
                 )
-                return {
-                    "status": "error",
-                    "message": updated_issue.get("message"),
-                }
 
-            logger.info(
-                f"update_issue result: {updated_issue.get('identifier')} - {updated_issue.get('title')}"
-            )
+                if updated_issue.get("status") == "error":
+                    logger.error(
+                        f"Failed to update Linear issue: {updated_issue.get('message')}"
+                    )
+                    return {
+                        "status": "error",
+                        "message": updated_issue.get("message"),
+                    }
 
-            if final_document_id is not None:
                 logger.info(
-                    f"Updating knowledge base for document {final_document_id}..."
-                )
-                kb_service = LinearKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_update(
-                    document_id=final_document_id,
-                    issue_id=final_issue_id,
-                    user_id=user_id,
-                    search_space_id=search_space_id,
+                    f"update_issue result: {updated_issue.get('identifier')} - {updated_issue.get('title')}"
                 )
-                if kb_result["status"] == "success":
+
+                if final_document_id is not None:
                     logger.info(
-                        f"Knowledge base successfully updated for issue {final_issue_id}"
+                        f"Updating knowledge base for document {final_document_id}..."
                     )
-                    kb_message = " Your knowledge base has also been updated."
-                elif kb_result["status"] == "not_indexed":
-                    kb_message = " This issue will be added to your knowledge base in the next scheduled sync."
-                else:
-                    logger.warning(
-                        f"KB update failed for issue {final_issue_id}: {kb_result.get('message')}"
+                    kb_service = LinearKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_update(
+                        document_id=final_document_id,
+                        issue_id=final_issue_id,
+                        user_id=user_id,
+                        search_space_id=search_space_id,
                     )
-                    kb_message = " Your knowledge base will be updated in the next scheduled sync."
-            else:
-                kb_message = ""
+                    if kb_result["status"] == "success":
+                        logger.info(
+                            f"Knowledge base successfully updated for issue {final_issue_id}"
+                        )
+                        kb_message = " Your knowledge base has also been updated."
+                    elif kb_result["status"] == "not_indexed":
+                        kb_message = " This issue will be added to your knowledge base in the next scheduled sync."
+                    else:
+                        logger.warning(
+                            f"KB update failed for issue {final_issue_id}: {kb_result.get('message')}"
+                        )
+                        kb_message = " Your knowledge base will be updated in the next scheduled sync."
+                else:
+                    kb_message = ""
 
-            identifier = updated_issue.get("identifier")
-            default_msg = f"Issue {identifier} updated successfully."
-            return {
-                "status": "success",
-                "identifier": identifier,
-                "url": updated_issue.get("url"),
-                "message": f"{updated_issue.get('message', default_msg)}{kb_message}",
-            }
+                identifier = updated_issue.get("identifier")
+                default_msg = f"Issue {identifier} updated successfully."
+                return {
+                    "status": "success",
+                    "identifier": identifier,
+                    "url": updated_issue.get("url"),
+                    "message": f"{updated_issue.get('message', default_msg)}{kb_message}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/luma/__init__.py b/surfsense_backend/app/agents/new_chat/tools/luma/__init__.py
new file mode 100644
index 0000000000..255119bee7
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/luma/__init__.py
@@ -0,0 +1,15 @@
+from app.agents.new_chat.tools.luma.create_event import (
+    create_create_luma_event_tool,
+)
+from app.agents.new_chat.tools.luma.list_events import (
+    create_list_luma_events_tool,
+)
+from app.agents.new_chat.tools.luma.read_event import (
+    create_read_luma_event_tool,
+)
+
+__all__ = [
+    "create_create_luma_event_tool",
+    "create_list_luma_events_tool",
+    "create_read_luma_event_tool",
+]
diff --git a/surfsense_backend/app/agents/new_chat/tools/luma/_auth.py b/surfsense_backend/app/agents/new_chat/tools/luma/_auth.py
new file mode 100644
index 0000000000..37deb15252
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/luma/_auth.py
@@ -0,0 +1,39 @@
+"""Shared auth helper for Luma agent tools."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+LUMA_API = "https://public-api.luma.com/v1"
+
+
+async def get_luma_connector(
+    db_session: AsyncSession,
+    search_space_id: int,
+    user_id: str,
+) -> SearchSourceConnector | None:
+    result = await db_session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.search_space_id == search_space_id,
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.LUMA_CONNECTOR,
+        )
+    )
+    return result.scalars().first()
+
+
+def get_api_key(connector: SearchSourceConnector) -> str:
+    """Extract the API key from connector config (handles both key names)."""
+    key = connector.config.get("api_key") or connector.config.get("LUMA_API_KEY")
+    if not key:
+        raise ValueError("Luma API key not found in connector config.")
+    return key
+
+
+def luma_headers(api_key: str) -> dict[str, str]:
+    return {
+        "Content-Type": "application/json",
+        "x-luma-api-key": api_key,
+    }
diff --git a/surfsense_backend/app/agents/new_chat/tools/luma/create_event.py b/surfsense_backend/app/agents/new_chat/tools/luma/create_event.py
new file mode 100644
index 0000000000..65c177d7ab
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/luma/create_event.py
@@ -0,0 +1,150 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
+
+from ._auth import LUMA_API, get_api_key, get_luma_connector, luma_headers
+
+logger = logging.getLogger(__name__)
+
+
+def create_create_luma_event_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the create_luma_event tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured create_luma_event tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def create_luma_event(
+        name: str,
+        start_at: str,
+        end_at: str,
+        description: str | None = None,
+        timezone: str = "UTC",
+    ) -> dict[str, Any]:
+        """Create a new event on Luma.
+
+        Args:
+            name: The event title.
+            start_at: Start time in ISO 8601 format (e.g. "2026-05-01T18:00:00").
+            end_at: End time in ISO 8601 format (e.g. "2026-05-01T20:00:00").
+            description: Optional event description (markdown supported).
+            timezone: Timezone string (default "UTC", e.g. "America/New_York").
+
+        Returns:
+            Dictionary with status, event_id on success.
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Do NOT retry.
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Luma tool not properly configured."}
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_luma_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Luma connector found."}
+
+                result = request_approval(
+                    action_type="luma_create_event",
+                    tool_name="create_luma_event",
+                    params={
+                        "name": name,
+                        "start_at": start_at,
+                        "end_at": end_at,
+                        "description": description,
+                        "timezone": timezone,
+                    },
+                    context={"connector_id": connector.id},
+                )
+
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Event was not created.",
+                    }
+
+                final_name = result.params.get("name", name)
+                final_start = result.params.get("start_at", start_at)
+                final_end = result.params.get("end_at", end_at)
+                final_desc = result.params.get("description", description)
+                final_tz = result.params.get("timezone", timezone)
+
+                api_key = get_api_key(connector)
+                headers = luma_headers(api_key)
+
+                body: dict[str, Any] = {
+                    "name": final_name,
+                    "start_at": final_start,
+                    "end_at": final_end,
+                    "timezone": final_tz,
+                }
+                if final_desc:
+                    body["description_md"] = final_desc
+
+                async with httpx.AsyncClient(timeout=20.0) as client:
+                    resp = await client.post(
+                        f"{LUMA_API}/event/create",
+                        headers=headers,
+                        json=body,
+                    )
+
+                if resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Luma API key is invalid.",
+                        "connector_type": "luma",
+                    }
+                if resp.status_code == 403:
+                    return {
+                        "status": "error",
+                        "message": "Luma Plus subscription required to create events via API.",
+                    }
+                if resp.status_code not in (200, 201):
+                    return {
+                        "status": "error",
+                        "message": f"Luma API error: {resp.status_code} — {resp.text[:200]}",
+                    }
+
+                data = resp.json()
+                event_id = data.get("api_id") or data.get("event", {}).get("api_id")
+
+                return {
+                    "status": "success",
+                    "event_id": event_id,
+                    "message": f"Event '{final_name}' created on Luma.",
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error creating Luma event: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to create Luma event."}
+
+    return create_luma_event
diff --git a/surfsense_backend/app/agents/new_chat/tools/luma/list_events.py b/surfsense_backend/app/agents/new_chat/tools/luma/list_events.py
new file mode 100644
index 0000000000..6885c2049e
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/luma/list_events.py
@@ -0,0 +1,133 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import async_session_maker
+
+from ._auth import LUMA_API, get_api_key, get_luma_connector, luma_headers
+
+logger = logging.getLogger(__name__)
+
+
+def create_list_luma_events_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the list_luma_events tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured list_luma_events tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def list_luma_events(
+        max_results: int = 25,
+    ) -> dict[str, Any]:
+        """List upcoming and recent Luma events.
+
+        Args:
+            max_results: Maximum events to return (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of events including
+            event_id, name, start_at, end_at, location, url.
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Luma tool not properly configured."}
+
+        max_results = min(max_results, 50)
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_luma_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Luma connector found."}
+
+                api_key = get_api_key(connector)
+                headers = luma_headers(api_key)
+
+                all_entries: list[dict] = []
+                cursor = None
+
+                async with httpx.AsyncClient(timeout=20.0) as client:
+                    while len(all_entries) < max_results:
+                        params: dict[str, Any] = {
+                            "limit": min(100, max_results - len(all_entries))
+                        }
+                        if cursor:
+                            params["cursor"] = cursor
+
+                        resp = await client.get(
+                            f"{LUMA_API}/calendar/list-events",
+                            headers=headers,
+                            params=params,
+                        )
+
+                        if resp.status_code == 401:
+                            return {
+                                "status": "auth_error",
+                                "message": "Luma API key is invalid.",
+                                "connector_type": "luma",
+                            }
+                        if resp.status_code != 200:
+                            return {
+                                "status": "error",
+                                "message": f"Luma API error: {resp.status_code}",
+                            }
+
+                        data = resp.json()
+                        entries = data.get("entries", [])
+                        if not entries:
+                            break
+                        all_entries.extend(entries)
+
+                        next_cursor = data.get("next_cursor")
+                        if not next_cursor:
+                            break
+                        cursor = next_cursor
+
+                events = []
+                for entry in all_entries[:max_results]:
+                    ev = entry.get("event", {})
+                    geo = ev.get("geo_info", {})
+                    events.append(
+                        {
+                            "event_id": entry.get("api_id"),
+                            "name": ev.get("name", "Untitled"),
+                            "start_at": ev.get("start_at", ""),
+                            "end_at": ev.get("end_at", ""),
+                            "timezone": ev.get("timezone", ""),
+                            "location": geo.get("name", ""),
+                            "url": ev.get("url", ""),
+                            "visibility": ev.get("visibility", ""),
+                        }
+                    )
+
+                return {"status": "success", "events": events, "total": len(events)}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error listing Luma events: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to list Luma events."}
+
+    return list_luma_events
diff --git a/surfsense_backend/app/agents/new_chat/tools/luma/read_event.py b/surfsense_backend/app/agents/new_chat/tools/luma/read_event.py
new file mode 100644
index 0000000000..a8484e9c0f
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/luma/read_event.py
@@ -0,0 +1,114 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import async_session_maker
+
+from ._auth import LUMA_API, get_api_key, get_luma_connector, luma_headers
+
+logger = logging.getLogger(__name__)
+
+
+def create_read_luma_event_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the read_luma_event tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured read_luma_event tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def read_luma_event(event_id: str) -> dict[str, Any]:
+        """Read detailed information about a specific Luma event.
+
+        Args:
+            event_id: The Luma event API ID (from list_luma_events).
+
+        Returns:
+            Dictionary with status and full event details including
+            description, attendees count, meeting URL.
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Luma tool not properly configured."}
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_luma_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Luma connector found."}
+
+                api_key = get_api_key(connector)
+                headers = luma_headers(api_key)
+
+                async with httpx.AsyncClient(timeout=15.0) as client:
+                    resp = await client.get(
+                        f"{LUMA_API}/events/{event_id}",
+                        headers=headers,
+                    )
+
+                if resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Luma API key is invalid.",
+                        "connector_type": "luma",
+                    }
+                if resp.status_code == 404:
+                    return {
+                        "status": "not_found",
+                        "message": f"Event '{event_id}' not found.",
+                    }
+                if resp.status_code != 200:
+                    return {
+                        "status": "error",
+                        "message": f"Luma API error: {resp.status_code}",
+                    }
+
+                data = resp.json()
+                ev = data.get("event", data)
+                geo = ev.get("geo_info", {})
+
+                event_detail = {
+                    "event_id": event_id,
+                    "name": ev.get("name", ""),
+                    "description": ev.get("description", ""),
+                    "start_at": ev.get("start_at", ""),
+                    "end_at": ev.get("end_at", ""),
+                    "timezone": ev.get("timezone", ""),
+                    "location_name": geo.get("name", ""),
+                    "address": geo.get("address", ""),
+                    "url": ev.get("url", ""),
+                    "meeting_url": ev.get("meeting_url", ""),
+                    "visibility": ev.get("visibility", ""),
+                    "cover_url": ev.get("cover_url", ""),
+                }
+
+                return {"status": "success", "event": event_detail}
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Luma event: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to read Luma event."}
+
+    return read_luma_event
diff --git a/surfsense_backend/app/agents/new_chat/tools/mcp_client.py b/surfsense_backend/app/agents/new_chat/tools/mcp_client.py
index 44c48344ca..e28ac8bda8 100644
--- a/surfsense_backend/app/agents/new_chat/tools/mcp_client.py
+++ b/surfsense_backend/app/agents/new_chat/tools/mcp_client.py
@@ -45,6 +45,18 @@ def __init__(
     async def connect(self, max_retries: int = MAX_RETRIES):
         """Connect to the MCP server and manage its lifecycle.
 
+        Retries only apply to the **connection** phase (spawning the process,
+        initialising the session).  Once the session is yielded to the caller,
+        any exception raised by the caller propagates normally -- the context
+        manager will NOT retry after ``yield``.
+
+        Previous implementation wrapped both connection AND yield inside the
+        retry loop.  Because ``@asynccontextmanager`` only allows a single
+        ``yield``, a failure after yield caused the generator to attempt a
+        second yield on retry, triggering
+        ``RuntimeError("generator didn't stop after athrow()")`` and orphaning
+        the stdio subprocess.
+
         Args:
             max_retries: Maximum number of connection retry attempts
 
@@ -57,26 +69,22 @@ async def connect(self, max_retries: int = MAX_RETRIES):
         """
         last_error = None
         delay = RETRY_DELAY
+        connected = False
 
         for attempt in range(max_retries):
             try:
-                # Merge env vars with current environment
                 server_env = os.environ.copy()
                 server_env.update(self.env)
 
-                # Create server parameters with env
                 server_params = StdioServerParameters(
                     command=self.command, args=self.args, env=server_env
                 )
 
-                # Spawn server process and create session
-                # Note: Cannot combine these context managers because ClientSession
-                # needs the read/write streams from stdio_client
                 async with stdio_client(server=server_params) as (read, write):  # noqa: SIM117
                     async with ClientSession(read, write) as session:
-                        # Initialize the connection
                         await session.initialize()
                         self.session = session
+                        connected = True
 
                         if attempt > 0:
                             logger.info(
@@ -91,10 +99,16 @@ async def connect(self, max_retries: int = MAX_RETRIES):
                                 self.command,
                                 " ".join(self.args),
                             )
-                        yield session
-                        return  # Success, exit retry loop
+                        try:
+                            yield session
+                        finally:
+                            self.session = None
+                        return
 
             except Exception as e:
+                self.session = None
+                if connected:
+                    raise
                 last_error = e
                 if attempt < max_retries - 1:
                     logger.warning(
@@ -105,7 +119,7 @@ async def connect(self, max_retries: int = MAX_RETRIES):
                         delay,
                     )
                     await asyncio.sleep(delay)
-                    delay *= RETRY_BACKOFF  # Exponential backoff
+                    delay *= RETRY_BACKOFF
                 else:
                     logger.error(
                         "Failed to connect to MCP server after %d attempts: %s",
@@ -113,10 +127,7 @@ async def connect(self, max_retries: int = MAX_RETRIES):
                         e,
                         exc_info=True,
                     )
-            finally:
-                self.session = None
 
-        # All retries exhausted
         error_msg = f"Failed to connect to MCP server '{self.command}' after {max_retries} attempts"
         if last_error:
             error_msg += f": {last_error}"
@@ -161,12 +172,18 @@ async def list_tools(self) -> list[dict[str, Any]]:
             logger.error("Failed to list tools from MCP server: %s", e, exc_info=True)
             raise
 
-    async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
+    async def call_tool(
+        self,
+        tool_name: str,
+        arguments: dict[str, Any],
+        timeout: float = 60.0,
+    ) -> Any:
         """Call a tool on the MCP server.
 
         Args:
             tool_name: Name of the tool to call
             arguments: Arguments to pass to the tool
+            timeout: Maximum seconds to wait for the tool to respond
 
         Returns:
             Tool execution result
@@ -185,10 +202,11 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
                 "Calling MCP tool '%s' with arguments: %s", tool_name, arguments
             )
 
-            # Call tools/call RPC method
-            response = await self.session.call_tool(tool_name, arguments=arguments)
+            response = await asyncio.wait_for(
+                self.session.call_tool(tool_name, arguments=arguments),
+                timeout=timeout,
+            )
 
-            # Extract content from response
             result = []
             for content in response.content:
                 if hasattr(content, "text"):
@@ -202,15 +220,15 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
             logger.info("MCP tool '%s' succeeded: %s", tool_name, result_str[:200])
             return result_str
 
+        except TimeoutError:
+            logger.error("MCP tool '%s' timed out after %.0fs", tool_name, timeout)
+            return f"Error: MCP tool '{tool_name}' timed out after {timeout:.0f}s"
         except RuntimeError as e:
-            # Handle validation errors from MCP server responses
-            # Some MCP servers (like server-memory) return extra fields not in their schema
             if "Invalid structured content" in str(e):
                 logger.warning(
                     "MCP server returned data not matching its schema, but continuing: %s",
                     e,
                 )
-                # Try to extract result from error message or return a success message
                 return "Operation completed (server returned unexpected format)"
             raise
         except (ValueError, TypeError, AttributeError, KeyError) as e:
diff --git a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
index 2fb7ffb06d..64368a8786 100644
--- a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
+++ b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
@@ -7,28 +7,49 @@
 - stdio: Local process-based MCP servers (command, args, env)
 - streamable-http/http/sse: Remote HTTP-based MCP servers (url, headers)
 
-This implements real MCP protocol support similar to Cursor's implementation.
+All MCP tools are unconditionally gated by HITL (Human-in-the-Loop) approval.
+Per the MCP spec: "Clients MUST consider tool annotations to be untrusted unless
+they come from trusted servers."  Users can bypass HITL for specific tools by
+clicking "Always Allow", which adds the tool name to the connector's
+``config.trusted_tools`` allow-list.
 """
 
+from __future__ import annotations
+
+import asyncio
 import logging
 import time
-from typing import Any
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from app.utils.oauth_security import TokenEncryption
 
 from langchain_core.tools import StructuredTool
 from mcp import ClientSession
 from mcp.client.streamable_http import streamablehttp_client
-from pydantic import BaseModel, create_model
-from sqlalchemy import select
+from pydantic import BaseModel, ConfigDict, Field, create_model
+from sqlalchemy import cast, select
+from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.middleware.dedup_tool_calls import dedup_key_full_args
+from app.agents.new_chat.tools.hitl import request_approval
 from app.agents.new_chat.tools.mcp_client import MCPClient
-from app.db import SearchSourceConnector, SearchSourceConnectorType
+from app.db import SearchSourceConnector
+from app.services.mcp_oauth.registry import MCP_SERVICES, get_service_by_connector_type
 
 logger = logging.getLogger(__name__)
 
 _MCP_CACHE_TTL_SECONDS = 300  # 5 minutes
 _MCP_CACHE_MAX_SIZE = 50
-_mcp_tools_cache: dict[int, tuple[float, list[StructuredTool]]] = {}
+_MCP_DISCOVERY_TIMEOUT_SECONDS = 30
+_TOOL_CALL_MAX_RETRIES = 3
+_TOOL_CALL_RETRY_DELAY = 1.5  # seconds, doubles per attempt
+# Keyed by ``(search_space_id, bypass_internal_hitl)`` so single-agent and
+# multi-agent paths cannot share tool closures with different HITL wiring.
+_MCPCacheKey = tuple[int, bool]
+_mcp_tools_cache: dict[_MCPCacheKey, tuple[float, list[StructuredTool]]] = {}
 
 
 def _evict_expired_mcp_cache() -> None:
@@ -51,100 +72,173 @@ def _create_dynamic_input_model_from_schema(
 ) -> type[BaseModel]:
     """Create a Pydantic model from MCP tool's JSON schema.
 
-    Args:
-        tool_name: Name of the tool (used for model class name)
-        input_schema: JSON schema from MCP server
-
-    Returns:
-        Pydantic model class for tool input validation
+    Models always allow extra fields (``extra="allow"``) so that parameters
+    missing from a broken or incomplete JSON schema (e.g. ``zod-to-json-schema``
+    producing an empty ``$schema``-only object) can still be forwarded to the
+    MCP server.
 
+    When the schema declares **no** properties, a synthetic ``input_data``
+    field of type ``dict`` is injected so the LLM has a visible parameter to
+    populate.  The caller should unpack ``input_data`` before forwarding to
+    the MCP server (see ``_unpack_synthetic_input_data``).
     """
     properties = input_schema.get("properties", {})
     required_fields = input_schema.get("required", [])
 
-    # Build Pydantic field definitions
     field_definitions = {}
     for param_name, param_schema in properties.items():
         param_description = param_schema.get("description", "")
         is_required = param_name in required_fields
 
-        # Use Any type for complex schemas to preserve structure
-        # This allows the MCP server to do its own validation
-        from typing import Any as AnyType
-
-        from pydantic import Field
-
         if is_required:
             field_definitions[param_name] = (
-                AnyType,
+                Any,
                 Field(..., description=param_description),
             )
         else:
             field_definitions[param_name] = (
-                AnyType | None,
+                Any | None,
                 Field(None, description=param_description),
             )
 
-    # Create dynamic model
+    if not properties:
+        field_definitions["input_data"] = (
+            dict[str, Any] | None,
+            Field(
+                None,
+                description=(
+                    "Arguments to pass to this tool as a JSON object. "
+                    "Infer sensible key names from the tool name and description "
+                    '(e.g. {"search": "my query"} for a search tool).'
+                ),
+            ),
+        )
+
     model_name = f"{tool_name.replace(' ', '').replace('-', '_')}Input"
-    return create_model(model_name, **field_definitions)
+    model = create_model(
+        model_name, __config__=ConfigDict(extra="allow"), **field_definitions
+    )
+    return model
+
+
+def _unpack_synthetic_input_data(kwargs: dict[str, Any]) -> dict[str, Any]:
+    """Unpack the synthetic ``input_data`` field into top-level kwargs.
+
+    When the MCP tool schema is empty, ``_create_dynamic_input_model_from_schema``
+    adds a catch-all ``input_data: dict`` field.  This helper merges that dict
+    back into the top-level kwargs so the MCP server receives flat arguments.
+    """
+    input_data = kwargs.pop("input_data", None)
+    if isinstance(input_data, dict):
+        kwargs.update(input_data)
+    return kwargs
 
 
 async def _create_mcp_tool_from_definition_stdio(
     tool_def: dict[str, Any],
     mcp_client: MCPClient,
+    *,
+    connector_name: str = "",
+    connector_id: int | None = None,
+    trusted_tools: list[str] | None = None,
+    bypass_internal_hitl: bool = False,
 ) -> StructuredTool:
     """Create a LangChain tool from an MCP tool definition (stdio transport).
 
-    Args:
-        tool_def: Tool definition from MCP server with name, description, input_schema
-        mcp_client: MCP client instance for calling the tool
-
-    Returns:
-        LangChain StructuredTool instance
-
+    Set ``bypass_internal_hitl=True`` when an outer ``HumanInTheLoopMiddleware``
+    already gates the tool, otherwise the body's ``request_approval()`` is the
+    sole HITL gate (single-agent path).
     """
     tool_name = tool_def.get("name", "unnamed_tool")
-    tool_description = tool_def.get("description", "No description provided")
+    raw_description = tool_def.get("description", "No description provided")
+    tool_description = (
+        f"[MCP server: {connector_name}] {raw_description}"
+        if connector_name
+        else raw_description
+    )
     input_schema = tool_def.get("input_schema", {"type": "object", "properties": {}})
 
-    # Log the actual schema for debugging
-    logger.info(f"MCP tool '{tool_name}' input schema: {input_schema}")
+    logger.debug("MCP tool '%s' input schema: %s", tool_name, input_schema)
 
-    # Create dynamic input model from schema
     input_model = _create_dynamic_input_model_from_schema(tool_name, input_schema)
 
     async def mcp_tool_call(**kwargs) -> str:
         """Execute the MCP tool call via the client with retry support."""
-        logger.info(f"MCP tool '{tool_name}' called with params: {kwargs}")
+        logger.debug("MCP tool '%s' called", tool_name)
 
-        try:
-            # Connect to server and call tool (connect has built-in retry logic)
-            async with mcp_client.connect():
-                result = await mcp_client.call_tool(tool_name, kwargs)
-                return str(result)
-        except RuntimeError as e:
-            # Connection failures after all retries
-            error_msg = f"MCP tool '{tool_name}' connection failed after retries: {e!s}"
-            logger.error(error_msg)
-            return f"Error: {error_msg}"
-        except Exception as e:
-            # Tool execution or other errors
-            error_msg = f"MCP tool '{tool_name}' execution failed: {e!s}"
-            logger.exception(error_msg)
-            return f"Error: {error_msg}"
+        if bypass_internal_hitl:
+            call_kwargs = _unpack_synthetic_input_data(
+                {k: v for k, v in kwargs.items() if v is not None}
+            )
+        else:
+            # Outside try/except so ``GraphInterrupt`` propagates to LangGraph.
+            hitl_result = request_approval(
+                action_type="mcp_tool_call",
+                tool_name=tool_name,
+                params=kwargs,
+                context={
+                    "mcp_server": connector_name,
+                    "tool_description": raw_description,
+                    "mcp_transport": "stdio",
+                    "mcp_connector_id": connector_id,
+                },
+                trusted_tools=trusted_tools,
+            )
+            if hitl_result.rejected:
+                return "Tool call rejected by user."
+            call_kwargs = _unpack_synthetic_input_data(
+                {k: v for k, v in hitl_result.params.items() if v is not None}
+            )
+
+        last_error: Exception | None = None
+        for attempt in range(_TOOL_CALL_MAX_RETRIES):
+            try:
+                async with mcp_client.connect():
+                    result = await mcp_client.call_tool(tool_name, call_kwargs)
+                    return str(result)
+            except Exception as e:
+                last_error = e
+                if attempt < _TOOL_CALL_MAX_RETRIES - 1:
+                    delay = _TOOL_CALL_RETRY_DELAY * (2**attempt)
+                    logger.warning(
+                        "MCP tool '%s' failed (attempt %d/%d): %s. Retrying in %.1fs...",
+                        tool_name,
+                        attempt + 1,
+                        _TOOL_CALL_MAX_RETRIES,
+                        e,
+                        delay,
+                    )
+                    await asyncio.sleep(delay)
+                else:
+                    logger.error(
+                        "MCP tool '%s' failed after %d attempts: %s",
+                        tool_name,
+                        _TOOL_CALL_MAX_RETRIES,
+                        e,
+                        exc_info=True,
+                    )
+
+        return f"Error: MCP tool '{tool_name}' failed after {_TOOL_CALL_MAX_RETRIES} attempts: {last_error!s}"
 
-    # Create StructuredTool with response_format to preserve exact schema
     tool = StructuredTool(
         name=tool_name,
         description=tool_description,
         coroutine=mcp_tool_call,
         args_schema=input_model,
-        # Store the original MCP schema as metadata so we can access it later
-        metadata={"mcp_input_schema": input_schema, "mcp_transport": "stdio"},
+        metadata={
+            "mcp_input_schema": input_schema,
+            "mcp_transport": "stdio",
+            "mcp_connector_name": connector_name or None,
+            "mcp_connector_id": connector_id,
+            "mcp_is_generic": True,
+            "hitl": True,
+            # Full-args hash: shared identifiers (cloudId, workspaceId, …)
+            # would otherwise collapse legitimate batches.
+            "dedup_key": dedup_key_full_args,
+        },
     )
 
-    logger.info(f"Created MCP tool (stdio): '{tool_name}'")
+    logger.debug("Created MCP tool (stdio): '%s'", tool_name)
     return tool
 
 
@@ -152,66 +246,151 @@ async def _create_mcp_tool_from_definition_http(
     tool_def: dict[str, Any],
     url: str,
     headers: dict[str, str],
+    *,
+    connector_name: str = "",
+    connector_id: int | None = None,
+    trusted_tools: list[str] | None = None,
+    readonly_tools: frozenset[str] | None = None,
+    tool_name_prefix: str | None = None,
+    is_generic_mcp: bool = False,
+    bypass_internal_hitl: bool = False,
 ) -> StructuredTool:
     """Create a LangChain tool from an MCP tool definition (HTTP transport).
 
-    Args:
-        tool_def: Tool definition from MCP server with name, description, input_schema
-        url: URL of the MCP server
-        headers: HTTP headers for authentication
-
-    Returns:
-        LangChain StructuredTool instance
+    Write tools are wrapped with HITL approval; read-only tools (listed in
+    ``readonly_tools``) execute immediately without user confirmation. Set
+    ``bypass_internal_hitl=True`` when an outer ``HumanInTheLoopMiddleware``
+    already gates the tool.
 
+    When ``tool_name_prefix`` is set (multi-account disambiguation), the
+    tool exposed to the LLM gets a prefixed name (e.g. ``linear_25_list_issues``)
+    but the actual MCP ``call_tool`` still uses the original name.
     """
-    tool_name = tool_def.get("name", "unnamed_tool")
-    tool_description = tool_def.get("description", "No description provided")
+    original_tool_name = tool_def.get("name", "unnamed_tool")
+    raw_description = tool_def.get("description", "No description provided")
     input_schema = tool_def.get("input_schema", {"type": "object", "properties": {}})
+    is_readonly = readonly_tools is not None and original_tool_name in readonly_tools
 
-    # Log the actual schema for debugging
-    logger.info(f"MCP HTTP tool '{tool_name}' input schema: {input_schema}")
+    exposed_name = (
+        f"{tool_name_prefix}_{original_tool_name}"
+        if tool_name_prefix
+        else original_tool_name
+    )
+    if tool_name_prefix:
+        tool_description = f"[Account: {connector_name}] {raw_description}"
+    elif is_generic_mcp and connector_name:
+        tool_description = f"[MCP server: {connector_name}] {raw_description}"
+    else:
+        tool_description = raw_description
 
-    # Create dynamic input model from schema
-    input_model = _create_dynamic_input_model_from_schema(tool_name, input_schema)
+    logger.debug("MCP HTTP tool '%s' input schema: %s", exposed_name, input_schema)
+
+    input_model = _create_dynamic_input_model_from_schema(exposed_name, input_schema)
+
+    async def _do_mcp_call(
+        call_headers: dict[str, str],
+        call_kwargs: dict[str, Any],
+        timeout: float = 60.0,
+    ) -> str:
+        """Execute a single MCP HTTP call with the given headers."""
+        async with (
+            streamablehttp_client(url, headers=call_headers) as (read, write, _),
+            ClientSession(read, write) as session,
+        ):
+            await session.initialize()
+            response = await asyncio.wait_for(
+                session.call_tool(original_tool_name, arguments=call_kwargs),
+                timeout=timeout,
+            )
+
+            result = []
+            for content in response.content:
+                if hasattr(content, "text"):
+                    result.append(content.text)
+                elif hasattr(content, "data"):
+                    result.append(str(content.data))
+                else:
+                    result.append(str(content))
+
+            return "\n".join(result) if result else ""
 
     async def mcp_http_tool_call(**kwargs) -> str:
         """Execute the MCP tool call via HTTP transport."""
-        logger.info(f"MCP HTTP tool '{tool_name}' called with params: {kwargs}")
+        logger.debug("MCP HTTP tool '%s' called", exposed_name)
+
+        if is_readonly or bypass_internal_hitl:
+            call_kwargs = _unpack_synthetic_input_data(
+                {k: v for k, v in kwargs.items() if v is not None}
+            )
+        else:
+            hitl_result = request_approval(
+                action_type="mcp_tool_call",
+                tool_name=exposed_name,
+                params=kwargs,
+                context={
+                    "mcp_server": connector_name,
+                    "tool_description": raw_description,
+                    "mcp_transport": "http",
+                    "mcp_connector_id": connector_id,
+                },
+                trusted_tools=trusted_tools,
+            )
+            if hitl_result.rejected:
+                return "Tool call rejected by user."
+            call_kwargs = _unpack_synthetic_input_data(
+                {k: v for k, v in hitl_result.params.items() if v is not None}
+            )
 
         try:
-            async with (
-                streamablehttp_client(url, headers=headers) as (read, write, _),
-                ClientSession(read, write) as session,
-            ):
-                await session.initialize()
-
-                # Call the tool
-                response = await session.call_tool(tool_name, arguments=kwargs)
-
-                # Extract content from response
-                result = []
-                for content in response.content:
-                    if hasattr(content, "text"):
-                        result.append(content.text)
-                    elif hasattr(content, "data"):
-                        result.append(str(content.data))
-                    else:
-                        result.append(str(content))
-
-                result_str = "\n".join(result) if result else ""
+            result_str = await _do_mcp_call(headers, call_kwargs)
+            logger.debug(
+                "MCP HTTP tool '%s' succeeded (len=%d)", exposed_name, len(result_str)
+            )
+            return result_str
+
+        except Exception as first_err:
+            if not _is_auth_error(first_err) or connector_id is None:
+                logger.exception(
+                    "MCP HTTP tool '%s' execution failed: %s", exposed_name, first_err
+                )
+                return f"Error: MCP HTTP tool '{exposed_name}' execution failed: {first_err!s}"
+
+            logger.warning(
+                "MCP HTTP tool '%s' got 401 — attempting token refresh for connector %s",
+                exposed_name,
+                connector_id,
+            )
+            fresh_headers = await _force_refresh_and_get_headers(connector_id)
+            if fresh_headers is None:
+                await _mark_connector_auth_expired(connector_id)
+                return (
+                    f"Error: MCP tool '{exposed_name}' authentication expired. "
+                    "Please re-authenticate the connector in your settings."
+                )
+
+            try:
+                result_str = await _do_mcp_call(fresh_headers, call_kwargs)
                 logger.info(
-                    f"MCP HTTP tool '{tool_name}' succeeded: {result_str[:200]}"
+                    "MCP HTTP tool '%s' succeeded after 401 recovery",
+                    exposed_name,
                 )
                 return result_str
+            except Exception as retry_err:
+                logger.exception(
+                    "MCP HTTP tool '%s' still failing after token refresh: %s",
+                    exposed_name,
+                    retry_err,
+                )
+                if _is_auth_error(retry_err):
+                    await _mark_connector_auth_expired(connector_id)
+                    return (
+                        f"Error: MCP tool '{exposed_name}' authentication expired. "
+                        "Please re-authenticate the connector in your settings."
+                    )
+                return f"Error: MCP HTTP tool '{exposed_name}' execution failed: {retry_err!s}"
 
-        except Exception as e:
-            error_msg = f"MCP HTTP tool '{tool_name}' execution failed: {e!s}"
-            logger.exception(error_msg)
-            return f"Error: {error_msg}"
-
-    # Create StructuredTool
     tool = StructuredTool(
-        name=tool_name,
+        name=exposed_name,
         description=tool_description,
         coroutine=mcp_http_tool_call,
         args_schema=input_model,
@@ -219,10 +398,18 @@ async def mcp_http_tool_call(**kwargs) -> str:
             "mcp_input_schema": input_schema,
             "mcp_transport": "http",
             "mcp_url": url,
+            "mcp_connector_name": connector_name or None,
+            "mcp_is_generic": is_generic_mcp,
+            "hitl": not is_readonly,
+            # Full-args hash: shared identifiers (cloudId, workspaceId, …)
+            # would otherwise collapse legitimate batches.
+            "dedup_key": dedup_key_full_args,
+            "mcp_original_tool_name": original_tool_name,
+            "mcp_connector_id": connector_id,
         },
     )
 
-    logger.info(f"Created MCP tool (HTTP): '{tool_name}'")
+    logger.debug("Created MCP tool (HTTP): '%s'", exposed_name)
     return tool
 
 
@@ -230,64 +417,69 @@ async def _load_stdio_mcp_tools(
     connector_id: int,
     connector_name: str,
     server_config: dict[str, Any],
+    trusted_tools: list[str] | None = None,
+    *,
+    bypass_internal_hitl: bool = False,
 ) -> list[StructuredTool]:
-    """Load tools from a stdio-based MCP server.
-
-    Args:
-        connector_id: Connector ID for logging
-        connector_name: Connector name for logging
-        server_config: Server configuration with command, args, env
-
-    Returns:
-        List of tools from the MCP server
-    """
+    """Load tools from a stdio-based MCP server."""
     tools: list[StructuredTool] = []
 
-    # Validate required command field
     command = server_config.get("command")
     if not command or not isinstance(command, str):
         logger.warning(
-            f"MCP connector {connector_id} (name: '{connector_name}') missing or invalid command field, skipping"
+            "MCP connector %d (name: '%s') missing or invalid command field, skipping",
+            connector_id,
+            connector_name,
         )
         return tools
 
-    # Validate args field (must be list if present)
     args = server_config.get("args", [])
     if not isinstance(args, list):
         logger.warning(
-            f"MCP connector {connector_id} (name: '{connector_name}') has invalid args field (must be list), skipping"
+            "MCP connector %d (name: '%s') has invalid args field (must be list), skipping",
+            connector_id,
+            connector_name,
         )
         return tools
 
-    # Validate env field (must be dict if present)
     env = server_config.get("env", {})
     if not isinstance(env, dict):
         logger.warning(
-            f"MCP connector {connector_id} (name: '{connector_name}') has invalid env field (must be dict), skipping"
+            "MCP connector %d (name: '%s') has invalid env field (must be dict), skipping",
+            connector_id,
+            connector_name,
         )
         return tools
 
-    # Create MCP client
     mcp_client = MCPClient(command, args, env)
 
-    # Connect and discover tools
     async with mcp_client.connect():
         tool_definitions = await mcp_client.list_tools()
 
         logger.info(
-            f"Discovered {len(tool_definitions)} tools from stdio MCP server "
-            f"'{command}' (connector {connector_id})"
+            "Discovered %d tools from stdio MCP server '%s' (connector %d)",
+            len(tool_definitions),
+            command,
+            connector_id,
         )
 
-    # Create LangChain tools from definitions
     for tool_def in tool_definitions:
         try:
-            tool = await _create_mcp_tool_from_definition_stdio(tool_def, mcp_client)
+            tool = await _create_mcp_tool_from_definition_stdio(
+                tool_def,
+                mcp_client,
+                connector_name=connector_name,
+                connector_id=connector_id,
+                trusted_tools=trusted_tools,
+                bypass_internal_hitl=bypass_internal_hitl,
+            )
             tools.append(tool)
         except Exception as e:
             logger.exception(
-                f"Failed to create tool '{tool_def.get('name')}' "
-                f"from connector {connector_id}: {e!s}"
+                "Failed to create tool '%s' from connector %d: %s",
+                tool_def.get("name"),
+                connector_id,
+                e,
             )
 
     return tools
@@ -297,92 +489,450 @@ async def _load_http_mcp_tools(
     connector_id: int,
     connector_name: str,
     server_config: dict[str, Any],
+    trusted_tools: list[str] | None = None,
+    allowed_tools: list[str] | None = None,
+    readonly_tools: frozenset[str] | None = None,
+    tool_name_prefix: str | None = None,
+    is_generic_mcp: bool = False,
+    *,
+    bypass_internal_hitl: bool = False,
 ) -> list[StructuredTool]:
     """Load tools from an HTTP-based MCP server.
 
     Args:
-        connector_id: Connector ID for logging
-        connector_name: Connector name for logging
-        server_config: Server configuration with url, headers
-
-    Returns:
-        List of tools from the MCP server
+        allowed_tools: If non-empty, only tools whose names appear in this
+            list are loaded.  Empty/None means load everything (used for
+            user-managed generic MCP servers).
+        readonly_tools: Tool names that skip HITL approval (read-only operations).
+        tool_name_prefix: If set, each tool name is prefixed for multi-account
+            disambiguation (e.g. ``linear_25``).
     """
     tools: list[StructuredTool] = []
 
-    # Validate required url field
     url = server_config.get("url")
     if not url or not isinstance(url, str):
         logger.warning(
-            f"MCP connector {connector_id} (name: '{connector_name}') missing or invalid url field, skipping"
+            "MCP connector %d (name: '%s') missing or invalid url field, skipping",
+            connector_id,
+            connector_name,
         )
         return tools
 
-    # Validate headers field (must be dict if present)
     headers = server_config.get("headers", {})
     if not isinstance(headers, dict):
         logger.warning(
-            f"MCP connector {connector_id} (name: '{connector_name}') has invalid headers field (must be dict), skipping"
+            "MCP connector %d (name: '%s') has invalid headers field (must be dict), skipping",
+            connector_id,
+            connector_name,
         )
         return tools
 
-    # Connect and discover tools via HTTP
-    try:
+    allowed_set = set(allowed_tools) if allowed_tools else None
+
+    async def _discover(disc_headers: dict[str, str]) -> list[dict[str, Any]]:
+        """Connect, initialize, and list tools from the MCP server."""
         async with (
-            streamablehttp_client(url, headers=headers) as (read, write, _),
+            streamablehttp_client(url, headers=disc_headers) as (read, write, _),
             ClientSession(read, write) as session,
         ):
             await session.initialize()
-
-            # List available tools
             response = await session.list_tools()
-            tool_definitions = []
-            for tool in response.tools:
-                tool_definitions.append(
-                    {
-                        "name": tool.name,
-                        "description": tool.description or "",
-                        "input_schema": tool.inputSchema
-                        if hasattr(tool, "inputSchema")
-                        else {},
-                    }
-                )
+            return [
+                {
+                    "name": tool.name,
+                    "description": tool.description or "",
+                    "input_schema": tool.inputSchema
+                    if hasattr(tool, "inputSchema")
+                    else {},
+                }
+                for tool in response.tools
+            ]
+
+    try:
+        tool_definitions = await _discover(headers)
+    except Exception as first_err:
+        if not _is_auth_error(first_err) or connector_id is None:
+            logger.exception(
+                "Failed to connect to HTTP MCP server at '%s' (connector %d): %s",
+                url,
+                connector_id,
+                first_err,
+            )
+            return tools
+
+        logger.warning(
+            "HTTP MCP discovery for connector %d got 401 — attempting token refresh",
+            connector_id,
+        )
+        fresh_headers = await _force_refresh_and_get_headers(connector_id)
+        if fresh_headers is None:
+            await _mark_connector_auth_expired(connector_id)
+            logger.error(
+                "HTTP MCP discovery for connector %d: token refresh failed, marking auth_expired",
+                connector_id,
+            )
+            return tools
 
+        try:
+            tool_definitions = await _discover(fresh_headers)
+            headers = fresh_headers
             logger.info(
-                f"Discovered {len(tool_definitions)} tools from HTTP MCP server "
-                f"'{url}' (connector {connector_id})"
+                "HTTP MCP discovery for connector %d succeeded after 401 recovery",
+                connector_id,
+            )
+        except Exception as retry_err:
+            logger.exception(
+                "HTTP MCP discovery for connector %d still failing after refresh: %s",
+                connector_id,
+                retry_err,
             )
+            if _is_auth_error(retry_err):
+                await _mark_connector_auth_expired(connector_id)
+            return tools
 
-        # Create LangChain tools from definitions
-        for tool_def in tool_definitions:
-            try:
-                tool = await _create_mcp_tool_from_definition_http(
-                    tool_def, url, headers
-                )
-                tools.append(tool)
-            except Exception as e:
-                logger.exception(
-                    f"Failed to create HTTP tool '{tool_def.get('name')}' "
-                    f"from connector {connector_id}: {e!s}"
-                )
+    total_discovered = len(tool_definitions)
 
-    except Exception as e:
-        logger.exception(
-            f"Failed to connect to HTTP MCP server at '{url}' (connector {connector_id}): {e!s}"
+    if allowed_set:
+        tool_definitions = [td for td in tool_definitions if td["name"] in allowed_set]
+        logger.info(
+            "HTTP MCP server '%s' (connector %d): %d/%d tools after allowlist filter",
+            url,
+            connector_id,
+            len(tool_definitions),
+            total_discovered,
         )
+    else:
+        logger.info(
+            "Discovered %d tools from HTTP MCP server '%s' (connector %d) — no allowlist, loading all",
+            total_discovered,
+            url,
+            connector_id,
+        )
+
+    for tool_def in tool_definitions:
+        try:
+            tool = await _create_mcp_tool_from_definition_http(
+                tool_def,
+                url,
+                headers,
+                connector_name=connector_name,
+                connector_id=connector_id,
+                trusted_tools=trusted_tools,
+                readonly_tools=readonly_tools,
+                tool_name_prefix=tool_name_prefix,
+                is_generic_mcp=is_generic_mcp,
+                bypass_internal_hitl=bypass_internal_hitl,
+            )
+            tools.append(tool)
+        except Exception as e:
+            logger.exception(
+                "Failed to create HTTP tool '%s' from connector %d: %s",
+                tool_def.get("name"),
+                connector_id,
+                e,
+            )
 
     return tools
 
 
-def invalidate_mcp_tools_cache(search_space_id: int | None = None) -> None:
-    """Invalidate cached MCP tools.
+_TOKEN_REFRESH_BUFFER_SECONDS = 300  # refresh 5 min before expiry
 
-    Args:
-        search_space_id: If provided, only invalidate for this search space.
-                        If None, invalidate all cached MCP tools.
+_token_enc: TokenEncryption | None = None
+
+
+def _get_token_enc() -> TokenEncryption:
+    global _token_enc
+    if _token_enc is None:
+        from app.config import config as app_config
+        from app.utils.oauth_security import TokenEncryption
+
+        _token_enc = TokenEncryption(app_config.SECRET_KEY)
+    return _token_enc
+
+
+def _inject_oauth_headers(
+    cfg: dict[str, Any],
+    server_config: dict[str, Any],
+) -> dict[str, Any] | None:
+    """Decrypt the MCP OAuth access token and inject it into server_config headers.
+
+    The DB never stores plaintext tokens in ``server_config.headers``.  This
+    function decrypts ``mcp_oauth.access_token`` at runtime and returns a
+    *copy* of ``server_config`` with the Authorization header set.
+    """
+    mcp_oauth = cfg.get("mcp_oauth", {})
+    encrypted_token = mcp_oauth.get("access_token")
+    if not encrypted_token:
+        return server_config
+
+    try:
+        access_token = _get_token_enc().decrypt_token(encrypted_token)
+
+        result = dict(server_config)
+        result["headers"] = {
+            **server_config.get("headers", {}),
+            "Authorization": f"Bearer {access_token}",
+        }
+        return result
+    except Exception:
+        logger.error(
+            "Failed to decrypt MCP OAuth token — connector will be skipped",
+            exc_info=True,
+        )
+        return None
+
+
+async def _refresh_connector_token(
+    session: AsyncSession,
+    connector: SearchSourceConnector,
+) -> str | None:
+    """Refresh the OAuth token for an MCP connector and persist the result.
+
+    This is the shared core used by both proactive (pre-expiry) and reactive
+    (401 recovery) refresh paths.  It handles:
+    - Decrypting the current refresh token / client secret
+    - Calling the token endpoint
+    - Encrypting and persisting the new tokens
+    - Clearing ``auth_expired`` if it was set
+    - Invalidating the MCP tools cache
+
+    Returns the **plaintext** new access token on success, or ``None`` on
+    failure (no refresh token, IdP error, etc.).
+    """
+    from datetime import UTC, datetime, timedelta
+
+    from sqlalchemy.orm.attributes import flag_modified
+
+    from app.services.mcp_oauth.discovery import refresh_access_token
+
+    cfg = connector.config or {}
+    mcp_oauth = cfg.get("mcp_oauth", {})
+
+    refresh_token = mcp_oauth.get("refresh_token")
+    if not refresh_token:
+        logger.warning(
+            "MCP connector %s: no refresh_token available",
+            connector.id,
+        )
+        return None
+
+    enc = _get_token_enc()
+    decrypted_refresh = enc.decrypt_token(refresh_token)
+    decrypted_secret = (
+        enc.decrypt_token(mcp_oauth["client_secret"])
+        if mcp_oauth.get("client_secret")
+        else ""
+    )
+
+    token_json = await refresh_access_token(
+        token_endpoint=mcp_oauth["token_endpoint"],
+        refresh_token=decrypted_refresh,
+        client_id=mcp_oauth["client_id"],
+        client_secret=decrypted_secret,
+    )
+
+    new_access = token_json.get("access_token")
+    if not new_access:
+        logger.warning(
+            "MCP connector %s: token refresh returned no access_token",
+            connector.id,
+        )
+        return None
+
+    new_expires_at = None
+    if token_json.get("expires_in"):
+        new_expires_at = datetime.now(UTC) + timedelta(
+            seconds=int(token_json["expires_in"])
+        )
+
+    updated_oauth = dict(mcp_oauth)
+    updated_oauth["access_token"] = enc.encrypt_token(new_access)
+    if token_json.get("refresh_token"):
+        updated_oauth["refresh_token"] = enc.encrypt_token(token_json["refresh_token"])
+    updated_oauth["expires_at"] = new_expires_at.isoformat() if new_expires_at else None
+
+    updated_cfg = {**cfg, "mcp_oauth": updated_oauth}
+    updated_cfg.pop("auth_expired", None)
+    connector.config = updated_cfg
+    flag_modified(connector, "config")
+    await session.commit()
+    await session.refresh(connector)
+
+    invalidate_mcp_tools_cache(connector.search_space_id)
+
+    return new_access
+
+
+async def _maybe_refresh_mcp_oauth_token(
+    session: AsyncSession,
+    connector: SearchSourceConnector,
+    cfg: dict[str, Any],
+    server_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Refresh the access token for an MCP OAuth connector if it is about to expire.
+
+    Returns the (possibly updated) ``server_config``.
+    """
+    from datetime import UTC, datetime, timedelta
+
+    mcp_oauth = cfg.get("mcp_oauth", {})
+    expires_at_str = mcp_oauth.get("expires_at")
+    if not expires_at_str:
+        return server_config
+
+    try:
+        expires_at = datetime.fromisoformat(expires_at_str)
+        if expires_at.tzinfo is None:
+            expires_at = expires_at.replace(tzinfo=UTC)
+
+        if datetime.now(UTC) < expires_at - timedelta(
+            seconds=_TOKEN_REFRESH_BUFFER_SECONDS
+        ):
+            return server_config
+    except (ValueError, TypeError):
+        return server_config
+
+    try:
+        new_access = await _refresh_connector_token(session, connector)
+        if not new_access:
+            return server_config
+
+        logger.info(
+            "Proactively refreshed MCP OAuth token for connector %s", connector.id
+        )
+
+        refreshed_config = dict(server_config)
+        refreshed_config["headers"] = {
+            **server_config.get("headers", {}),
+            "Authorization": f"Bearer {new_access}",
+        }
+        return refreshed_config
+
+    except Exception:
+        logger.warning(
+            "Failed to refresh MCP OAuth token for connector %s",
+            connector.id,
+            exc_info=True,
+        )
+        return server_config
+
+
+# ---------------------------------------------------------------------------
+# Reactive 401 handling helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_auth_error(exc: Exception) -> bool:
+    """Check if an exception indicates an HTTP 401 authentication failure."""
+    try:
+        import httpx
+
+        if isinstance(exc, httpx.HTTPStatusError):
+            return exc.response.status_code == 401
+    except ImportError:
+        pass
+    err_str = str(exc).lower()
+    return "401" in err_str or "unauthorized" in err_str
+
+
+async def _force_refresh_and_get_headers(
+    connector_id: int,
+) -> dict[str, str] | None:
+    """Force-refresh OAuth token for a connector and return fresh HTTP headers.
+
+    Opens a **new** DB session so this can be called from inside tool closures
+    that don't have access to the original session.
+
+    Returns ``None`` when the connector is not OAuth-backed, has no
+    refresh token, or the refresh itself fails.
     """
+    from app.db import async_session_maker
+
+    try:
+        async with async_session_maker() as session:
+            result = await session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == connector_id,
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return None
+
+            cfg = connector.config or {}
+            if not cfg.get("mcp_oauth"):
+                return None
+
+            server_config = cfg.get("server_config", {})
+
+            new_access = await _refresh_connector_token(session, connector)
+            if not new_access:
+                return None
+
+            logger.info(
+                "Force-refreshed MCP OAuth token for connector %s (401 recovery)",
+                connector_id,
+            )
+            return {
+                **server_config.get("headers", {}),
+                "Authorization": f"Bearer {new_access}",
+            }
+
+    except Exception:
+        logger.warning(
+            "Failed to force-refresh MCP OAuth token for connector %s",
+            connector_id,
+            exc_info=True,
+        )
+        return None
+
+
+async def _mark_connector_auth_expired(connector_id: int) -> None:
+    """Set ``config.auth_expired = True`` so the frontend shows re-auth UI."""
+    from app.db import async_session_maker
+
+    try:
+        async with async_session_maker() as session:
+            result = await session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == connector_id,
+                )
+            )
+            connector = result.scalars().first()
+            if not connector:
+                return
+
+            cfg = dict(connector.config or {})
+            if cfg.get("auth_expired"):
+                return
+
+            cfg["auth_expired"] = True
+            connector.config = cfg
+
+            from sqlalchemy.orm.attributes import flag_modified
+
+            flag_modified(connector, "config")
+            await session.commit()
+
+            logger.info(
+                "Marked MCP connector %s as auth_expired after unrecoverable 401",
+                connector_id,
+            )
+            invalidate_mcp_tools_cache(connector.search_space_id)
+
+    except Exception:
+        logger.warning(
+            "Failed to mark connector %s as auth_expired",
+            connector_id,
+            exc_info=True,
+        )
+
+
+def invalidate_mcp_tools_cache(search_space_id: int | None = None) -> None:
+    """Invalidate cached MCP tools (both ``bypass_internal_hitl`` variants together)."""
     if search_space_id is not None:
-        _mcp_tools_cache.pop(search_space_id, None)
+        for key in [k for k in _mcp_tools_cache if k[0] == search_space_id]:
+            _mcp_tools_cache.pop(key, None)
     else:
         _mcp_tools_cache.clear()
 
@@ -390,86 +940,199 @@ def invalidate_mcp_tools_cache(search_space_id: int | None = None) -> None:
 async def load_mcp_tools(
     session: AsyncSession,
     search_space_id: int,
+    *,
+    bypass_internal_hitl: bool = False,
 ) -> list[StructuredTool]:
-    """Load all MCP tools from user's active MCP server connectors.
-
-    This discovers tools dynamically from MCP servers using the protocol.
-    Supports both stdio (local process) and HTTP (remote server) transports.
-
-    Results are cached per search space for up to 5 minutes to avoid
-    re-spawning MCP server processes on every chat message.
-
-    Args:
-        session: Database session
-        search_space_id: User's search space ID
-
-    Returns:
-        List of LangChain StructuredTool instances
+    """Load all MCP tools from the user's active MCP server connectors.
 
+    Results are cached per ``(search_space_id, bypass_internal_hitl)`` for up
+    to 5 minutes; bypass is keyed because each variant builds a different tool
+    closure (with vs. without the in-wrapper ``request_approval`` gate).
     """
     _evict_expired_mcp_cache()
 
     now = time.monotonic()
-    cached = _mcp_tools_cache.get(search_space_id)
+    cache_key: _MCPCacheKey = (search_space_id, bypass_internal_hitl)
+    cached = _mcp_tools_cache.get(cache_key)
     if cached is not None:
         cached_at, cached_tools = cached
         if now - cached_at < _MCP_CACHE_TTL_SECONDS:
             logger.info(
-                "Using cached MCP tools for search space %s (%d tools, age=%.0fs)",
+                "Using cached MCP tools for search space %s (%d tools, age=%.0fs, bypass_hitl=%s)",
                 search_space_id,
                 len(cached_tools),
                 now - cached_at,
+                bypass_internal_hitl,
             )
             return list(cached_tools)
 
     try:
+        # Find all connectors with MCP server config: generic MCP_CONNECTOR type
+        # and service-specific types (LINEAR_CONNECTOR, etc.) created via MCP OAuth.
+        # Cast JSON -> JSONB so we can use has_key to filter by the presence of "server_config".
         result = await session.execute(
             select(SearchSourceConnector).filter(
-                SearchSourceConnector.connector_type
-                == SearchSourceConnectorType.MCP_CONNECTOR,
                 SearchSourceConnector.search_space_id == search_space_id,
+                cast(SearchSourceConnector.config, JSONB).has_key("server_config"),
             ),
         )
 
-        tools: list[StructuredTool] = []
-        for connector in result.scalars():
+        connectors = list(result.scalars())
+
+        # Group connectors by type to detect multi-account scenarios.
+        # When >1 connector shares the same type, tool names would collide
+        # so we prefix them with "{service_key}_{connector_id}_".
+        type_groups: dict[str, list[SearchSourceConnector]] = defaultdict(list)
+        for connector in connectors:
+            ct = (
+                connector.connector_type.value
+                if hasattr(connector.connector_type, "value")
+                else str(connector.connector_type)
+            )
+            type_groups[ct].append(connector)
+
+        multi_account_types: set[str] = {
+            ct for ct, group in type_groups.items() if len(group) > 1
+        }
+        if multi_account_types:
+            logger.info(
+                "Multi-account detected for connector types: %s",
+                multi_account_types,
+            )
+
+        discovery_tasks: list[dict[str, Any]] = []
+        for connector in connectors:
             try:
-                config = connector.config or {}
-                server_config = config.get("server_config", {})
+                cfg = connector.config or {}
+                server_config = cfg.get("server_config", {})
 
                 if not server_config or not isinstance(server_config, dict):
                     logger.warning(
-                        f"MCP connector {connector.id} (name: '{connector.name}') has invalid or missing server_config, skipping"
+                        "MCP connector %d (name: '%s') has invalid or missing server_config, skipping",
+                        connector.id,
+                        connector.name,
                     )
                     continue
 
-                transport = server_config.get("transport", "stdio")
-
-                if transport in ("streamable-http", "http", "sse"):
-                    connector_tools = await _load_http_mcp_tools(
-                        connector.id, connector.name, server_config
+                if cfg.get("mcp_oauth"):
+                    server_config = await _maybe_refresh_mcp_oauth_token(
+                        session,
+                        connector,
+                        cfg,
+                        server_config,
                     )
-                else:
-                    connector_tools = await _load_stdio_mcp_tools(
-                        connector.id, connector.name, server_config
+                    cfg = connector.config or {}
+                    server_config = _inject_oauth_headers(cfg, server_config)
+                    if server_config is None:
+                        logger.warning(
+                            "Skipping MCP connector %d — OAuth token decryption failed",
+                            connector.id,
+                        )
+                        await _mark_connector_auth_expired(connector.id)
+                        continue
+
+                trusted_tools = cfg.get("trusted_tools", [])
+
+                ct = (
+                    connector.connector_type.value
+                    if hasattr(connector.connector_type, "value")
+                    else str(connector.connector_type)
+                )
+
+                svc_cfg = get_service_by_connector_type(ct)
+                allowed_tools = svc_cfg.allowed_tools if svc_cfg else []
+                readonly_tools = svc_cfg.readonly_tools if svc_cfg else frozenset()
+
+                tool_name_prefix: str | None = None
+                if ct in multi_account_types and svc_cfg:
+                    service_key = next(
+                        (k for k, v in MCP_SERVICES.items() if v is svc_cfg),
+                        None,
                     )
+                    if service_key:
+                        tool_name_prefix = f"{service_key}_{connector.id}"
 
-                tools.extend(connector_tools)
+                discovery_tasks.append(
+                    {
+                        "connector_id": connector.id,
+                        "connector_name": connector.name,
+                        "server_config": server_config,
+                        "trusted_tools": trusted_tools,
+                        "allowed_tools": allowed_tools,
+                        "readonly_tools": readonly_tools,
+                        "tool_name_prefix": tool_name_prefix,
+                        "transport": server_config.get("transport", "stdio"),
+                        "is_generic_mcp": svc_cfg is None,
+                    }
+                )
 
             except Exception as e:
                 logger.exception(
-                    f"Failed to load tools from MCP connector {connector.id}: {e!s}"
+                    "Failed to prepare MCP connector %d: %s",
+                    connector.id,
+                    e,
                 )
 
-        _mcp_tools_cache[search_space_id] = (now, tools)
+        async def _discover_one(task: dict[str, Any]) -> list[StructuredTool]:
+            try:
+                if task["transport"] in ("streamable-http", "http", "sse"):
+                    return await asyncio.wait_for(
+                        _load_http_mcp_tools(
+                            task["connector_id"],
+                            task["connector_name"],
+                            task["server_config"],
+                            trusted_tools=task["trusted_tools"],
+                            allowed_tools=task["allowed_tools"],
+                            readonly_tools=task["readonly_tools"],
+                            tool_name_prefix=task["tool_name_prefix"],
+                            is_generic_mcp=task.get("is_generic_mcp", False),
+                            bypass_internal_hitl=bypass_internal_hitl,
+                        ),
+                        timeout=_MCP_DISCOVERY_TIMEOUT_SECONDS,
+                    )
+                else:
+                    return await asyncio.wait_for(
+                        _load_stdio_mcp_tools(
+                            task["connector_id"],
+                            task["connector_name"],
+                            task["server_config"],
+                            trusted_tools=task["trusted_tools"],
+                            bypass_internal_hitl=bypass_internal_hitl,
+                        ),
+                        timeout=_MCP_DISCOVERY_TIMEOUT_SECONDS,
+                    )
+            except TimeoutError:
+                logger.error(
+                    "MCP connector %d timed out after %ds during discovery",
+                    task["connector_id"],
+                    _MCP_DISCOVERY_TIMEOUT_SECONDS,
+                )
+                return []
+            except Exception as e:
+                logger.exception(
+                    "Failed to load tools from MCP connector %d: %s",
+                    task["connector_id"],
+                    e,
+                )
+                return []
+
+        results = await asyncio.gather(*[_discover_one(t) for t in discovery_tasks])
+        tools: list[StructuredTool] = [tool for sublist in results for tool in sublist]
+
+        _mcp_tools_cache[cache_key] = (now, tools)
 
         if len(_mcp_tools_cache) > _MCP_CACHE_MAX_SIZE:
             oldest_key = min(_mcp_tools_cache, key=lambda k: _mcp_tools_cache[k][0])
             del _mcp_tools_cache[oldest_key]
 
-        logger.info(f"Loaded {len(tools)} MCP tools for search space {search_space_id}")
+        logger.info(
+            "Loaded %d MCP tools for search space %d (bypass_hitl=%s)",
+            len(tools),
+            search_space_id,
+            bypass_internal_hitl,
+        )
         return tools
 
     except Exception as e:
-        logger.exception(f"Failed to load MCP tools: {e!s}")
+        logger.exception("Failed to load MCP tools: %s", e)
         return []
diff --git a/surfsense_backend/app/agents/new_chat/tools/notion/create_page.py b/surfsense_backend/app/agents/new_chat/tools/notion/create_page.py
index 5bb0c52d17..6ec95e9f08 100644
--- a/surfsense_backend/app/agents/new_chat/tools/notion/create_page.py
+++ b/surfsense_backend/app/agents/new_chat/tools/notion/create_page.py
@@ -2,10 +2,11 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.notion_history import NotionAPIError, NotionHistoryConnector
+from app.db import async_session_maker
 from app.services.notion import NotionToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -20,8 +21,17 @@ def create_create_notion_page_tool(
     """
     Factory function to create the create_notion_page tool.
 
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker`. This is critical for the compiled-agent
+    cache: the compiled graph (and therefore this closure) is reused
+    across HTTP requests, so capturing a per-request session here would
+    surface stale/closed sessions on cache hits. Per-call sessions also
+    keep the request's outer transaction free of long-running Notion API
+    blocking.
+
     Args:
-        db_session: Database session for accessing Notion connector
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
         search_space_id: Search space ID to find the Notion connector
         user_id: User ID for fetching user-specific context
         connector_id: Optional specific connector ID (if known)
@@ -29,6 +39,7 @@ def create_create_notion_page_tool(
     Returns:
         Configured create_notion_page tool
     """
+    del db_session  # per-call session — see docstring
 
     @tool
     async def create_notion_page(
@@ -67,7 +78,7 @@ async def create_notion_page(
         """
         logger.info(f"create_notion_page called: title='{title}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             logger.error(
                 "Notion tool not properly configured - missing required parameters"
             )
@@ -77,186 +88,157 @@ async def create_notion_page(
             }
 
         try:
-            metadata_service = NotionToolMetadataService(db_session)
-            context = await metadata_service.get_creation_context(
-                search_space_id, user_id
-            )
-
-            if "error" in context:
-                logger.error(f"Failed to fetch creation context: {context['error']}")
-                return {
-                    "status": "error",
-                    "message": context["error"],
-                }
-
-            accounts = context.get("accounts", [])
-            if accounts and all(a.get("auth_expired") for a in accounts):
-                logger.warning("All Notion accounts have expired authentication")
-                return {
-                    "status": "auth_error",
-                    "message": "All connected Notion accounts need re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "notion",
-                }
-
-            logger.info(f"Requesting approval for creating Notion page: '{title}'")
-            approval = interrupt(
-                {
-                    "type": "notion_page_creation",
-                    "action": {
-                        "tool": "create_notion_page",
-                        "params": {
-                            "title": title,
-                            "content": content,
-                            "parent_page_id": None,
-                            "connector_id": connector_id,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {
-                    "status": "error",
-                    "message": "No approval decision received",
-                }
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                logger.info("Notion page creation rejected by user")
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The page was not created. Do not ask again or suggest alternatives.",
-                }
-
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                # Some interrupt payloads place args directly on the decision.
-                final_params = decision["args"]
-
-            final_title = final_params.get("title", title)
-            final_content = final_params.get("content", content)
-            final_parent_page_id = final_params.get("parent_page_id")
-            final_connector_id = final_params.get("connector_id", connector_id)
-
-            if not final_title or not final_title.strip():
-                logger.error("Title is empty or contains only whitespace")
-                return {
-                    "status": "error",
-                    "message": "Page title cannot be empty. Please provide a valid title.",
-                }
-
-            logger.info(
-                f"Creating Notion page with final params: title='{final_title}'"
-            )
-
-            from sqlalchemy.future import select
-
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
-
-            actual_connector_id = final_connector_id
-            if actual_connector_id is None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.NOTION_CONNECTOR,
-                    )
+            async with async_session_maker() as db_session:
+                metadata_service = NotionToolMetadataService(db_session)
+                context = await metadata_service.get_creation_context(
+                    search_space_id, user_id
                 )
-                connector = result.scalars().first()
 
-                if not connector:
-                    logger.warning(
-                        f"No Notion connector found for search_space_id={search_space_id}"
+                if "error" in context:
+                    logger.error(
+                        f"Failed to fetch creation context: {context['error']}"
                     )
                     return {
                         "status": "error",
-                        "message": "No Notion connector found. Please connect Notion in your workspace settings.",
+                        "message": context["error"],
                     }
 
-                actual_connector_id = connector.id
-                logger.info(f"Found Notion connector: id={actual_connector_id}")
-            else:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == actual_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.NOTION_CONNECTOR,
-                    )
+                accounts = context.get("accounts", [])
+                if accounts and all(a.get("auth_expired") for a in accounts):
+                    logger.warning("All Notion accounts have expired authentication")
+                    return {
+                        "status": "auth_error",
+                        "message": "All connected Notion accounts need re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "notion",
+                    }
+
+                logger.info(f"Requesting approval for creating Notion page: '{title}'")
+                result = request_approval(
+                    action_type="notion_page_creation",
+                    tool_name="create_notion_page",
+                    params={
+                        "title": title,
+                        "content": content,
+                        "parent_page_id": None,
+                        "connector_id": connector_id,
+                    },
+                    context=context,
                 )
-                connector = result.scalars().first()
 
-                if not connector:
-                    logger.error(
-                        f"Invalid connector_id={actual_connector_id} for search_space_id={search_space_id}"
-                    )
+                if result.rejected:
+                    logger.info("Notion page creation rejected by user")
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
+
+                final_title = result.params.get("title", title)
+                final_content = result.params.get("content", content)
+                final_parent_page_id = result.params.get("parent_page_id")
+                final_connector_id = result.params.get("connector_id", connector_id)
+
+                if not final_title or not final_title.strip():
+                    logger.error("Title is empty or contains only whitespace")
                     return {
                         "status": "error",
-                        "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                        "message": "Page title cannot be empty. Please provide a valid title.",
                     }
-                logger.info(f"Validated Notion connector: id={actual_connector_id}")
 
-            notion_connector = NotionHistoryConnector(
-                session=db_session,
-                connector_id=actual_connector_id,
-            )
+                logger.info(
+                    f"Creating Notion page with final params: title='{final_title}'"
+                )
 
-            result = await notion_connector.create_page(
-                title=final_title,
-                content=final_content,
-                parent_page_id=final_parent_page_id,
-            )
-            logger.info(
-                f"create_page result: {result.get('status')} - {result.get('message', '')}"
-            )
+                from sqlalchemy.future import select
+
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            if result.get("status") == "success":
-                kb_message_suffix = ""
-                try:
-                    from app.services.notion import NotionKBSyncService
-
-                    kb_service = NotionKBSyncService(db_session)
-                    kb_result = await kb_service.sync_after_create(
-                        page_id=result.get("page_id"),
-                        page_title=result.get("title", final_title),
-                        page_url=result.get("url"),
-                        content=final_content,
-                        connector_id=actual_connector_id,
-                        search_space_id=search_space_id,
-                        user_id=user_id,
+                actual_connector_id = final_connector_id
+                if actual_connector_id is None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.NOTION_CONNECTOR,
+                        )
                     )
-                    if kb_result["status"] == "success":
-                        kb_message_suffix = (
-                            " Your knowledge base has also been updated."
+                    connector = result.scalars().first()
+
+                    if not connector:
+                        logger.warning(
+                            f"No Notion connector found for search_space_id={search_space_id}"
+                        )
+                        return {
+                            "status": "error",
+                            "message": "No Notion connector found. Please connect Notion in your workspace settings.",
+                        }
+
+                    actual_connector_id = connector.id
+                    logger.info(f"Found Notion connector: id={actual_connector_id}")
+                else:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == actual_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.NOTION_CONNECTOR,
+                        )
+                    )
+                    connector = result.scalars().first()
+
+                    if not connector:
+                        logger.error(
+                            f"Invalid connector_id={actual_connector_id} for search_space_id={search_space_id}"
+                        )
+                        return {
+                            "status": "error",
+                            "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                        }
+                    logger.info(f"Validated Notion connector: id={actual_connector_id}")
+
+                notion_connector = NotionHistoryConnector(
+                    session=db_session,
+                    connector_id=actual_connector_id,
+                )
+
+                result = await notion_connector.create_page(
+                    title=final_title,
+                    content=final_content,
+                    parent_page_id=final_parent_page_id,
+                )
+                logger.info(
+                    f"create_page result: {result.get('status')} - {result.get('message', '')}"
+                )
+
+                if result.get("status") == "success":
+                    kb_message_suffix = ""
+                    try:
+                        from app.services.notion import NotionKBSyncService
+
+                        kb_service = NotionKBSyncService(db_session)
+                        kb_result = await kb_service.sync_after_create(
+                            page_id=result.get("page_id"),
+                            page_title=result.get("title", final_title),
+                            page_url=result.get("url"),
+                            content=final_content,
+                            connector_id=actual_connector_id,
+                            search_space_id=search_space_id,
+                            user_id=user_id,
                         )
-                    else:
+                        if kb_result["status"] == "success":
+                            kb_message_suffix = (
+                                " Your knowledge base has also been updated."
+                            )
+                        else:
+                            kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
+                    except Exception as kb_err:
+                        logger.warning(f"KB sync after create failed: {kb_err}")
                         kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
-                except Exception as kb_err:
-                    logger.warning(f"KB sync after create failed: {kb_err}")
-                    kb_message_suffix = " This page will be added to your knowledge base in the next scheduled sync."
 
-                result["message"] = result.get("message", "") + kb_message_suffix
+                    result["message"] = result.get("message", "") + kb_message_suffix
 
-            return result
+                return result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/notion/delete_page.py b/surfsense_backend/app/agents/new_chat/tools/notion/delete_page.py
index fbb7c50041..7b85da4c2c 100644
--- a/surfsense_backend/app/agents/new_chat/tools/notion/delete_page.py
+++ b/surfsense_backend/app/agents/new_chat/tools/notion/delete_page.py
@@ -2,10 +2,11 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.notion_history import NotionAPIError, NotionHistoryConnector
+from app.db import async_session_maker
 from app.services.notion.tool_metadata_service import NotionToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -20,8 +21,14 @@ def create_delete_notion_page_tool(
     """
     Factory function to create the delete_notion_page tool.
 
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
     Args:
-        db_session: Database session for accessing Notion connector
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
         search_space_id: Search space ID to find the Notion connector
         user_id: User ID for finding the correct Notion connector
         connector_id: Optional specific connector ID (if known)
@@ -29,6 +36,7 @@ def create_delete_notion_page_tool(
     Returns:
         Configured delete_notion_page tool
     """
+    del db_session  # per-call session — see docstring
 
     @tool
     async def delete_notion_page(
@@ -63,7 +71,7 @@ async def delete_notion_page(
             f"delete_notion_page called: page_title='{page_title}', delete_from_kb={delete_from_kb}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             logger.error(
                 "Notion tool not properly configured - missing required parameters"
             )
@@ -73,198 +81,167 @@ async def delete_notion_page(
             }
 
         try:
-            # Get page context (page_id, account, title) from indexed data
-            metadata_service = NotionToolMetadataService(db_session)
-            context = await metadata_service.get_delete_context(
-                search_space_id, user_id, page_title
-            )
+            async with async_session_maker() as db_session:
+                # Get page context (page_id, account, title) from indexed data
+                metadata_service = NotionToolMetadataService(db_session)
+                context = await metadata_service.get_delete_context(
+                    search_space_id, user_id, page_title
+                )
 
-            if "error" in context:
-                error_msg = context["error"]
-                # Check if it's a "not found" error (softer handling for LLM)
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Page not found: {error_msg}")
-                    return {
-                        "status": "not_found",
-                        "message": error_msg,
-                    }
-                else:
-                    logger.error(f"Failed to fetch delete context: {error_msg}")
+                if "error" in context:
+                    error_msg = context["error"]
+                    # Check if it's a "not found" error (softer handling for LLM)
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Page not found: {error_msg}")
+                        return {
+                            "status": "not_found",
+                            "message": error_msg,
+                        }
+                    else:
+                        logger.error(f"Failed to fetch delete context: {error_msg}")
+                        return {
+                            "status": "error",
+                            "message": error_msg,
+                        }
+
+                account = context.get("account", {})
+                if account.get("auth_expired"):
+                    logger.warning(
+                        "Notion account %s has expired authentication",
+                        account.get("id"),
+                    )
                     return {
-                        "status": "error",
-                        "message": error_msg,
+                        "status": "auth_error",
+                        "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.",
                     }
 
-            account = context.get("account", {})
-            if account.get("auth_expired"):
-                logger.warning(
-                    "Notion account %s has expired authentication",
-                    account.get("id"),
-                )
-                return {
-                    "status": "auth_error",
-                    "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.",
-                }
-
-            page_id = context.get("page_id")
-            connector_id_from_context = account.get("id")
-            document_id = context.get("document_id")
+                page_id = context.get("page_id")
+                connector_id_from_context = account.get("id")
+                document_id = context.get("document_id")
 
-            logger.info(
-                f"Requesting approval for deleting Notion page: '{page_title}' (page_id={page_id}, delete_from_kb={delete_from_kb})"
-            )
+                logger.info(
+                    f"Requesting approval for deleting Notion page: '{page_title}' (page_id={page_id}, delete_from_kb={delete_from_kb})"
+                )
 
-            # Request approval before deleting
-            approval = interrupt(
-                {
-                    "type": "notion_page_deletion",
-                    "action": {
-                        "tool": "delete_notion_page",
-                        "params": {
-                            "page_id": page_id,
-                            "connector_id": connector_id_from_context,
-                            "delete_from_kb": delete_from_kb,
-                        },
+                result = request_approval(
+                    action_type="notion_page_deletion",
+                    tool_name="delete_notion_page",
+                    params={
+                        "page_id": page_id,
+                        "connector_id": connector_id_from_context,
+                        "delete_from_kb": delete_from_kb,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {
-                    "status": "error",
-                    "message": "No approval decision received",
-                }
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
+                    context=context,
+                )
 
-            if decision_type == "reject":
-                logger.info("Notion page deletion rejected by user")
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The page was not deleted. Do not ask again or suggest alternatives.",
-                }
+                if result.rejected:
+                    logger.info("Notion page deletion rejected by user")
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            # Extract edited action arguments (if user modified the checkbox)
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                # Some interrupt payloads place args directly on the decision.
-                final_params = decision["args"]
-
-            final_page_id = final_params.get("page_id", page_id)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
+                final_page_id = result.params.get("page_id", page_id)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
+                )
 
-            logger.info(
-                f"Deleting Notion page with final params: page_id={final_page_id}, connector_id={final_connector_id}, delete_from_kb={final_delete_from_kb}"
-            )
+                logger.info(
+                    f"Deleting Notion page with final params: page_id={final_page_id}, connector_id={final_connector_id}, delete_from_kb={final_delete_from_kb}"
+                )
 
-            from sqlalchemy.future import select
+                from sqlalchemy.future import select
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-            # Validate the connector
-            if final_connector_id:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.NOTION_CONNECTOR,
+                # Validate the connector
+                if final_connector_id:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.NOTION_CONNECTOR,
+                        )
                     )
-                )
-                connector = result.scalars().first()
+                    connector = result.scalars().first()
 
-                if not connector:
-                    logger.error(
-                        f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
-                    )
+                    if not connector:
+                        logger.error(
+                            f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                        )
+                        return {
+                            "status": "error",
+                            "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                        }
+                    actual_connector_id = connector.id
+                    logger.info(f"Validated Notion connector: id={actual_connector_id}")
+                else:
+                    logger.error("No connector found for this page")
                     return {
                         "status": "error",
-                        "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                        "message": "No connector found for this page.",
                     }
-                actual_connector_id = connector.id
-                logger.info(f"Validated Notion connector: id={actual_connector_id}")
-            else:
-                logger.error("No connector found for this page")
-                return {
-                    "status": "error",
-                    "message": "No connector found for this page.",
-                }
 
-            # Create connector instance
-            notion_connector = NotionHistoryConnector(
-                session=db_session,
-                connector_id=actual_connector_id,
-            )
-
-            # Delete the page from Notion
-            result = await notion_connector.delete_page(page_id=final_page_id)
-            logger.info(
-                f"delete_page result: {result.get('status')} - {result.get('message', '')}"
-            )
-
-            # If deletion was successful and user wants to delete from KB
-            deleted_from_kb = False
-            if (
-                result.get("status") == "success"
-                and final_delete_from_kb
-                and document_id
-            ):
-                try:
-                    from sqlalchemy.future import select
+                # Create connector instance
+                notion_connector = NotionHistoryConnector(
+                    session=db_session,
+                    connector_id=actual_connector_id,
+                )
 
-                    from app.db import Document
+                # Delete the page from Notion
+                result = await notion_connector.delete_page(page_id=final_page_id)
+                logger.info(
+                    f"delete_page result: {result.get('status')} - {result.get('message', '')}"
+                )
 
-                    # Get the document
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
-                    )
-                    document = doc_result.scalars().first()
-
-                    if document:
-                        await db_session.delete(document)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                        logger.info(
-                            f"Deleted document {document_id} from knowledge base"
+                # If deletion was successful and user wants to delete from KB
+                deleted_from_kb = False
+                if (
+                    result.get("status") == "success"
+                    and final_delete_from_kb
+                    and document_id
+                ):
+                    try:
+                        from sqlalchemy.future import select
+
+                        from app.db import Document
+
+                        # Get the document
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        document = doc_result.scalars().first()
+
+                        if document:
+                            await db_session.delete(document)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                            logger.info(
+                                f"Deleted document {document_id} from knowledge base"
+                            )
+                        else:
+                            logger.warning(f"Document {document_id} not found in KB")
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+                        result["warning"] = (
+                            f"Page deleted from Notion, but failed to remove from knowledge base: {e!s}"
                         )
-                    else:
-                        logger.warning(f"Document {document_id} not found in KB")
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-                    result["warning"] = (
-                        f"Page deleted from Notion, but failed to remove from knowledge base: {e!s}"
-                    )
 
-            # Update result with KB deletion status
-            if result.get("status") == "success":
-                result["deleted_from_kb"] = deleted_from_kb
-                if deleted_from_kb:
-                    result["message"] = (
-                        f"{result.get('message', '')} (also removed from knowledge base)"
-                    )
+                # Update result with KB deletion status
+                if result.get("status") == "success":
+                    result["deleted_from_kb"] = deleted_from_kb
+                    if deleted_from_kb:
+                        result["message"] = (
+                            f"{result.get('message', '')} (also removed from knowledge base)"
+                        )
 
-            return result
+                return result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py b/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py
index 25f2b9918e..df757476aa 100644
--- a/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py
+++ b/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py
@@ -2,10 +2,11 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.notion_history import NotionAPIError, NotionHistoryConnector
+from app.db import async_session_maker
 from app.services.notion import NotionToolMetadataService
 
 logger = logging.getLogger(__name__)
@@ -20,8 +21,14 @@ def create_update_notion_page_tool(
     """
     Factory function to create the update_notion_page tool.
 
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache (see
+    ``create_create_notion_page_tool`` for the full rationale).
+
     Args:
-        db_session: Database session for accessing Notion connector
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
         search_space_id: Search space ID to find the Notion connector
         user_id: User ID for fetching user-specific context
         connector_id: Optional specific connector ID (if known)
@@ -29,6 +36,7 @@ def create_update_notion_page_tool(
     Returns:
         Configured update_notion_page tool
     """
+    del db_session  # per-call session — see docstring
 
     @tool
     async def update_notion_page(
@@ -71,7 +79,7 @@ async def update_notion_page(
             f"update_notion_page called: page_title='{page_title}', content_length={len(content) if content else 0}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             logger.error(
                 "Notion tool not properly configured - missing required parameters"
             )
@@ -88,184 +96,155 @@ async def update_notion_page(
             }
 
         try:
-            metadata_service = NotionToolMetadataService(db_session)
-            context = await metadata_service.get_update_context(
-                search_space_id, user_id, page_title
-            )
+            async with async_session_maker() as db_session:
+                metadata_service = NotionToolMetadataService(db_session)
+                context = await metadata_service.get_update_context(
+                    search_space_id, user_id, page_title
+                )
 
-            if "error" in context:
-                error_msg = context["error"]
-                # Check if it's a "not found" error (softer handling for LLM)
-                if "not found" in error_msg.lower():
-                    logger.warning(f"Page not found: {error_msg}")
-                    return {
-                        "status": "not_found",
-                        "message": error_msg,
-                    }
-                else:
-                    logger.error(f"Failed to fetch update context: {error_msg}")
+                if "error" in context:
+                    error_msg = context["error"]
+                    # Check if it's a "not found" error (softer handling for LLM)
+                    if "not found" in error_msg.lower():
+                        logger.warning(f"Page not found: {error_msg}")
+                        return {
+                            "status": "not_found",
+                            "message": error_msg,
+                        }
+                    else:
+                        logger.error(f"Failed to fetch update context: {error_msg}")
+                        return {
+                            "status": "error",
+                            "message": error_msg,
+                        }
+
+                account = context.get("account", {})
+                if account.get("auth_expired"):
+                    logger.warning(
+                        "Notion account %s has expired authentication",
+                        account.get("id"),
+                    )
                     return {
-                        "status": "error",
-                        "message": error_msg,
+                        "status": "auth_error",
+                        "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.",
                     }
 
-            account = context.get("account", {})
-            if account.get("auth_expired"):
-                logger.warning(
-                    "Notion account %s has expired authentication",
-                    account.get("id"),
-                )
-                return {
-                    "status": "auth_error",
-                    "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.",
-                }
-
-            page_id = context.get("page_id")
-            document_id = context.get("document_id")
-            connector_id_from_context = context.get("account", {}).get("id")
+                page_id = context.get("page_id")
+                document_id = context.get("document_id")
+                connector_id_from_context = context.get("account", {}).get("id")
 
-            logger.info(
-                f"Requesting approval for updating Notion page: '{page_title}' (page_id={page_id})"
-            )
-            approval = interrupt(
-                {
-                    "type": "notion_page_update",
-                    "action": {
-                        "tool": "update_notion_page",
-                        "params": {
-                            "page_id": page_id,
-                            "content": content,
-                            "connector_id": connector_id_from_context,
-                        },
+                logger.info(
+                    f"Requesting approval for updating Notion page: '{page_title}' (page_id={page_id})"
+                )
+                result = request_approval(
+                    action_type="notion_page_update",
+                    tool_name="update_notion_page",
+                    params={
+                        "page_id": page_id,
+                        "content": content,
+                        "connector_id": connector_id_from_context,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                logger.warning("No approval decision received")
-                return {
-                    "status": "error",
-                    "message": "No approval decision received",
-                }
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                logger.info("Notion page update rejected by user")
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The page was not updated. Do not ask again or suggest alternatives.",
-                }
+                    context=context,
+                )
 
-            edited_action = decision.get("edited_action")
-            final_params: dict[str, Any] = {}
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                # Some interrupt payloads place args directly on the decision.
-                final_params = decision["args"]
-
-            final_page_id = final_params.get("page_id", page_id)
-            final_content = final_params.get("content", content)
-            final_connector_id = final_params.get(
-                "connector_id", connector_id_from_context
-            )
+                if result.rejected:
+                    logger.info("Notion page update rejected by user")
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            logger.info(
-                f"Updating Notion page with final params: page_id={final_page_id}, has_content={final_content is not None}"
-            )
+                final_page_id = result.params.get("page_id", page_id)
+                final_content = result.params.get("content", content)
+                final_connector_id = result.params.get(
+                    "connector_id", connector_id_from_context
+                )
 
-            from sqlalchemy.future import select
+                logger.info(
+                    f"Updating Notion page with final params: page_id={final_page_id}, has_content={final_content is not None}"
+                )
 
-            from app.db import SearchSourceConnector, SearchSourceConnectorType
+                from sqlalchemy.future import select
 
-            if final_connector_id:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.NOTION_CONNECTOR,
-                    )
-                )
-                connector = result.scalars().first()
+                from app.db import SearchSourceConnector, SearchSourceConnectorType
 
-                if not connector:
-                    logger.error(
-                        f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                if final_connector_id:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.NOTION_CONNECTOR,
+                        )
                     )
+                    connector = result.scalars().first()
+
+                    if not connector:
+                        logger.error(
+                            f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}"
+                        )
+                        return {
+                            "status": "error",
+                            "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                        }
+                    actual_connector_id = connector.id
+                    logger.info(f"Validated Notion connector: id={actual_connector_id}")
+                else:
+                    logger.error("No connector found for this page")
                     return {
                         "status": "error",
-                        "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.",
+                        "message": "No connector found for this page.",
                     }
-                actual_connector_id = connector.id
-                logger.info(f"Validated Notion connector: id={actual_connector_id}")
-            else:
-                logger.error("No connector found for this page")
-                return {
-                    "status": "error",
-                    "message": "No connector found for this page.",
-                }
-
-            notion_connector = NotionHistoryConnector(
-                session=db_session,
-                connector_id=actual_connector_id,
-            )
 
-            result = await notion_connector.update_page(
-                page_id=final_page_id,
-                content=final_content,
-            )
-            logger.info(
-                f"update_page result: {result.get('status')} - {result.get('message', '')}"
-            )
+                notion_connector = NotionHistoryConnector(
+                    session=db_session,
+                    connector_id=actual_connector_id,
+                )
 
-            if result.get("status") == "success" and document_id is not None:
-                from app.services.notion import NotionKBSyncService
-
-                logger.info(f"Updating knowledge base for document {document_id}...")
-                kb_service = NotionKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_update(
-                    document_id=document_id,
-                    appended_content=final_content,
-                    user_id=user_id,
-                    search_space_id=search_space_id,
-                    appended_block_ids=result.get("appended_block_ids"),
+                result = await notion_connector.update_page(
+                    page_id=final_page_id,
+                    content=final_content,
+                )
+                logger.info(
+                    f"update_page result: {result.get('status')} - {result.get('message', '')}"
                 )
 
-                if kb_result["status"] == "success":
-                    result["message"] = (
-                        f"{result['message']}. Your knowledge base has also been updated."
-                    )
+                if result.get("status") == "success" and document_id is not None:
+                    from app.services.notion import NotionKBSyncService
+
                     logger.info(
-                        f"Knowledge base successfully updated for page {final_page_id}"
-                    )
-                elif kb_result["status"] == "not_indexed":
-                    result["message"] = (
-                        f"{result['message']}. This page will be added to your knowledge base in the next scheduled sync."
+                        f"Updating knowledge base for document {document_id}..."
                     )
-                else:
-                    result["message"] = (
-                        f"{result['message']}. Your knowledge base will be updated in the next scheduled sync."
-                    )
-                    logger.warning(
-                        f"KB update failed for page {final_page_id}: {kb_result['message']}"
+                    kb_service = NotionKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_update(
+                        document_id=document_id,
+                        appended_content=final_content,
+                        user_id=user_id,
+                        search_space_id=search_space_id,
+                        appended_block_ids=result.get("appended_block_ids"),
                     )
 
-            return result
+                    if kb_result["status"] == "success":
+                        result["message"] = (
+                            f"{result['message']}. Your knowledge base has also been updated."
+                        )
+                        logger.info(
+                            f"Knowledge base successfully updated for page {final_page_id}"
+                        )
+                    elif kb_result["status"] == "not_indexed":
+                        result["message"] = (
+                            f"{result['message']}. This page will be added to your knowledge base in the next scheduled sync."
+                        )
+                    else:
+                        result["message"] = (
+                            f"{result['message']}. Your knowledge base will be updated in the next scheduled sync."
+                        )
+                        logger.warning(
+                            f"KB update failed for page {final_page_id}: {kb_result['message']}"
+                        )
+
+                return result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/onedrive/create_file.py b/surfsense_backend/app/agents/new_chat/tools/onedrive/create_file.py
index 8dffb18dd9..5f199a41b2 100644
--- a/surfsense_backend/app/agents/new_chat/tools/onedrive/create_file.py
+++ b/surfsense_backend/app/agents/new_chat/tools/onedrive/create_file.py
@@ -5,12 +5,12 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.onedrive.client import OneDriveClient
-from app.db import SearchSourceConnector, SearchSourceConnectorType
+from app.db import SearchSourceConnector, SearchSourceConnectorType, async_session_maker
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +48,23 @@ def create_create_onedrive_file_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the create_onedrive_file tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured create_onedrive_file tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def create_onedrive_file(
         name: str,
@@ -70,199 +87,178 @@ async def create_onedrive_file(
         """
         logger.info(f"create_onedrive_file called: name='{name}'")
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "OneDrive tool not properly configured.",
             }
 
         try:
-            result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    SearchSourceConnector.search_space_id == search_space_id,
-                    SearchSourceConnector.user_id == user_id,
-                    SearchSourceConnector.connector_type
-                    == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+            async with async_session_maker() as db_session:
+                result = await db_session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.search_space_id == search_space_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.connector_type
+                        == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                    )
                 )
-            )
-            connectors = result.scalars().all()
+                connectors = result.scalars().all()
 
-            if not connectors:
-                return {
-                    "status": "error",
-                    "message": "No OneDrive connector found. Please connect OneDrive in your workspace settings.",
-                }
-
-            accounts = []
-            for c in connectors:
-                cfg = c.config or {}
-                accounts.append(
-                    {
-                        "id": c.id,
-                        "name": c.name,
-                        "user_email": cfg.get("user_email"),
-                        "auth_expired": cfg.get("auth_expired", False),
+                if not connectors:
+                    return {
+                        "status": "error",
+                        "message": "No OneDrive connector found. Please connect OneDrive in your workspace settings.",
                     }
-                )
 
-            if all(a.get("auth_expired") for a in accounts):
-                return {
-                    "status": "auth_error",
-                    "message": "All connected OneDrive accounts need re-authentication.",
-                    "connector_type": "onedrive",
-                }
+                accounts = []
+                for c in connectors:
+                    cfg = c.config or {}
+                    accounts.append(
+                        {
+                            "id": c.id,
+                            "name": c.name,
+                            "user_email": cfg.get("user_email"),
+                            "auth_expired": cfg.get("auth_expired", False),
+                        }
+                    )
 
-            parent_folders: dict[int, list[dict[str, str]]] = {}
-            for acc in accounts:
-                cid = acc["id"]
-                if acc.get("auth_expired"):
-                    parent_folders[cid] = []
-                    continue
-                try:
-                    client = OneDriveClient(session=db_session, connector_id=cid)
-                    items, err = await client.list_children("root")
-                    if err:
+                if all(a.get("auth_expired") for a in accounts):
+                    return {
+                        "status": "auth_error",
+                        "message": "All connected OneDrive accounts need re-authentication.",
+                        "connector_type": "onedrive",
+                    }
+
+                parent_folders: dict[int, list[dict[str, str]]] = {}
+                for acc in accounts:
+                    cid = acc["id"]
+                    if acc.get("auth_expired"):
+                        parent_folders[cid] = []
+                        continue
+                    try:
+                        client = OneDriveClient(session=db_session, connector_id=cid)
+                        items, err = await client.list_children("root")
+                        if err:
+                            logger.warning(
+                                "Failed to list folders for connector %s: %s", cid, err
+                            )
+                            parent_folders[cid] = []
+                        else:
+                            parent_folders[cid] = [
+                                {"folder_id": item["id"], "name": item["name"]}
+                                for item in items
+                                if item.get("folder") is not None
+                                and item.get("id")
+                                and item.get("name")
+                            ]
+                    except Exception:
                         logger.warning(
-                            "Failed to list folders for connector %s: %s", cid, err
+                            "Error fetching folders for connector %s",
+                            cid,
+                            exc_info=True,
                         )
                         parent_folders[cid] = []
-                    else:
-                        parent_folders[cid] = [
-                            {"folder_id": item["id"], "name": item["name"]}
-                            for item in items
-                            if item.get("folder") is not None
-                            and item.get("id")
-                            and item.get("name")
-                        ]
-                except Exception:
-                    logger.warning(
-                        "Error fetching folders for connector %s", cid, exc_info=True
-                    )
-                    parent_folders[cid] = []
 
-            context: dict[str, Any] = {
-                "accounts": accounts,
-                "parent_folders": parent_folders,
-            }
+                context: dict[str, Any] = {
+                    "accounts": accounts,
+                    "parent_folders": parent_folders,
+                }
 
-            approval = interrupt(
-                {
-                    "type": "onedrive_file_creation",
-                    "action": {
-                        "tool": "create_onedrive_file",
-                        "params": {
-                            "name": name,
-                            "content": content,
-                            "connector_id": None,
-                            "parent_folder_id": None,
-                        },
+                result = request_approval(
+                    action_type="onedrive_file_creation",
+                    tool_name="create_onedrive_file",
+                    params={
+                        "name": name,
+                        "content": content,
+                        "connector_id": None,
+                        "parent_folder_id": None,
                     },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The file was not created.",
-                }
+                    context=context,
+                )
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
 
-            final_name = final_params.get("name", name)
-            final_content = final_params.get("content", content)
-            final_connector_id = final_params.get("connector_id")
-            final_parent_folder_id = final_params.get("parent_folder_id")
+                final_name = result.params.get("name", name)
+                final_content = result.params.get("content", content)
+                final_connector_id = result.params.get("connector_id")
+                final_parent_folder_id = result.params.get("parent_folder_id")
 
-            if not final_name or not final_name.strip():
-                return {"status": "error", "message": "File name cannot be empty."}
+                if not final_name or not final_name.strip():
+                    return {"status": "error", "message": "File name cannot be empty."}
 
-            final_name = _ensure_docx_extension(final_name)
+                final_name = _ensure_docx_extension(final_name)
 
-            if final_connector_id is not None:
-                result = await db_session.execute(
-                    select(SearchSourceConnector).filter(
-                        SearchSourceConnector.id == final_connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                if final_connector_id is not None:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.search_space_id == search_space_id,
+                            SearchSourceConnector.user_id == user_id,
+                            SearchSourceConnector.connector_type
+                            == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                        )
                     )
-                )
-                connector = result.scalars().first()
-            else:
-                connector = connectors[0]
-
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "Selected OneDrive connector is invalid.",
-                }
-
-            docx_bytes = _markdown_to_docx(final_content or "")
-
-            client = OneDriveClient(session=db_session, connector_id=connector.id)
-            created = await client.create_file(
-                name=final_name,
-                parent_id=final_parent_folder_id,
-                content=docx_bytes,
-                mime_type=DOCX_MIME,
-            )
+                    connector = result.scalars().first()
+                else:
+                    connector = connectors[0]
 
-            logger.info(
-                f"OneDrive file created: id={created.get('id')}, name={created.get('name')}"
-            )
+                if not connector:
+                    return {
+                        "status": "error",
+                        "message": "Selected OneDrive connector is invalid.",
+                    }
 
-            kb_message_suffix = ""
-            try:
-                from app.services.onedrive import OneDriveKBSyncService
+                docx_bytes = _markdown_to_docx(final_content or "")
 
-                kb_service = OneDriveKBSyncService(db_session)
-                kb_result = await kb_service.sync_after_create(
-                    file_id=created.get("id"),
-                    file_name=created.get("name", final_name),
+                client = OneDriveClient(session=db_session, connector_id=connector.id)
+                created = await client.create_file(
+                    name=final_name,
+                    parent_id=final_parent_folder_id,
+                    content=docx_bytes,
                     mime_type=DOCX_MIME,
-                    web_url=created.get("webUrl"),
-                    content=final_content,
-                    connector_id=connector.id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
                 )
-                if kb_result["status"] == "success":
-                    kb_message_suffix = " Your knowledge base has also been updated."
-                else:
+
+                logger.info(
+                    f"OneDrive file created: id={created.get('id')}, name={created.get('name')}"
+                )
+
+                kb_message_suffix = ""
+                try:
+                    from app.services.onedrive import OneDriveKBSyncService
+
+                    kb_service = OneDriveKBSyncService(db_session)
+                    kb_result = await kb_service.sync_after_create(
+                        file_id=created.get("id"),
+                        file_name=created.get("name", final_name),
+                        mime_type=DOCX_MIME,
+                        web_url=created.get("webUrl"),
+                        content=final_content,
+                        connector_id=connector.id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                    )
+                    if kb_result["status"] == "success":
+                        kb_message_suffix = (
+                            " Your knowledge base has also been updated."
+                        )
+                    else:
+                        kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
+                except Exception as kb_err:
+                    logger.warning(f"KB sync after create failed: {kb_err}")
                     kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
-            except Exception as kb_err:
-                logger.warning(f"KB sync after create failed: {kb_err}")
-                kb_message_suffix = " This file will be added to your knowledge base in the next scheduled sync."
 
-            return {
-                "status": "success",
-                "file_id": created.get("id"),
-                "name": created.get("name"),
-                "web_url": created.get("webUrl"),
-                "message": f"Successfully created '{created.get('name')}' in OneDrive.{kb_message_suffix}",
-            }
+                return {
+                    "status": "success",
+                    "file_id": created.get("id"),
+                    "name": created.get("name"),
+                    "web_url": created.get("webUrl"),
+                    "message": f"Successfully created '{created.get('name')}' in OneDrive.{kb_message_suffix}",
+                }
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/onedrive/trash_file.py b/surfsense_backend/app/agents/new_chat/tools/onedrive/trash_file.py
index 79d8222fd4..4857ea9889 100644
--- a/surfsense_backend/app/agents/new_chat/tools/onedrive/trash_file.py
+++ b/surfsense_backend/app/agents/new_chat/tools/onedrive/trash_file.py
@@ -2,17 +2,18 @@
 from typing import Any
 
 from langchain_core.tools import tool
-from langgraph.types import interrupt
 from sqlalchemy import String, and_, cast, func
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 
+from app.agents.new_chat.tools.hitl import request_approval
 from app.connectors.onedrive.client import OneDriveClient
 from app.db import (
     Document,
     DocumentType,
     SearchSourceConnector,
     SearchSourceConnectorType,
+    async_session_maker,
 )
 
 logger = logging.getLogger(__name__)
@@ -23,6 +24,23 @@ def create_delete_onedrive_file_tool(
     search_space_id: int | None = None,
     user_id: str | None = None,
 ):
+    """
+    Factory function to create the delete_onedrive_file tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured delete_onedrive_file tool
+    """
+    del db_session  # per-call session — see docstring
+
     @tool
     async def delete_onedrive_file(
         file_name: str,
@@ -56,33 +74,14 @@ async def delete_onedrive_file(
             f"delete_onedrive_file called: file_name='{file_name}', delete_from_kb={delete_from_kb}"
         )
 
-        if db_session is None or search_space_id is None or user_id is None:
+        if search_space_id is None or user_id is None:
             return {
                 "status": "error",
                 "message": "OneDrive tool not properly configured.",
             }
 
         try:
-            doc_result = await db_session.execute(
-                select(Document)
-                .join(
-                    SearchSourceConnector,
-                    Document.connector_id == SearchSourceConnector.id,
-                )
-                .filter(
-                    and_(
-                        Document.search_space_id == search_space_id,
-                        Document.document_type == DocumentType.ONEDRIVE_FILE,
-                        func.lower(Document.title) == func.lower(file_name),
-                        SearchSourceConnector.user_id == user_id,
-                    )
-                )
-                .order_by(Document.updated_at.desc().nullslast())
-                .limit(1)
-            )
-            document = doc_result.scalars().first()
-
-            if not document:
+            async with async_session_maker() as db_session:
                 doc_result = await db_session.execute(
                     select(Document)
                     .join(
@@ -93,13 +92,7 @@ async def delete_onedrive_file(
                         and_(
                             Document.search_space_id == search_space_id,
                             Document.document_type == DocumentType.ONEDRIVE_FILE,
-                            func.lower(
-                                cast(
-                                    Document.document_metadata["onedrive_file_name"],
-                                    String,
-                                )
-                            )
-                            == func.lower(file_name),
+                            func.lower(Document.title) == func.lower(file_name),
                             SearchSourceConnector.user_id == user_id,
                         )
                     )
@@ -108,125 +101,64 @@ async def delete_onedrive_file(
                 )
                 document = doc_result.scalars().first()
 
-            if not document:
-                return {
-                    "status": "not_found",
-                    "message": (
-                        f"File '{file_name}' not found in your indexed OneDrive files. "
-                        "This could mean: (1) the file doesn't exist, (2) it hasn't been indexed yet, "
-                        "or (3) the file name is different."
-                    ),
-                }
-
-            if not document.connector_id:
-                return {
-                    "status": "error",
-                    "message": "Document has no associated connector.",
-                }
-
-            meta = document.document_metadata or {}
-            file_id = meta.get("onedrive_file_id")
-            document_id = document.id
-
-            if not file_id:
-                return {
-                    "status": "error",
-                    "message": "File ID is missing. Please re-index the file.",
-                }
-
-            conn_result = await db_session.execute(
-                select(SearchSourceConnector).filter(
-                    and_(
-                        SearchSourceConnector.id == document.connector_id,
-                        SearchSourceConnector.search_space_id == search_space_id,
-                        SearchSourceConnector.user_id == user_id,
-                        SearchSourceConnector.connector_type
-                        == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                if not document:
+                    doc_result = await db_session.execute(
+                        select(Document)
+                        .join(
+                            SearchSourceConnector,
+                            Document.connector_id == SearchSourceConnector.id,
+                        )
+                        .filter(
+                            and_(
+                                Document.search_space_id == search_space_id,
+                                Document.document_type == DocumentType.ONEDRIVE_FILE,
+                                func.lower(
+                                    cast(
+                                        Document.document_metadata[
+                                            "onedrive_file_name"
+                                        ],
+                                        String,
+                                    )
+                                )
+                                == func.lower(file_name),
+                                SearchSourceConnector.user_id == user_id,
+                            )
+                        )
+                        .order_by(Document.updated_at.desc().nullslast())
+                        .limit(1)
                     )
-                )
-            )
-            connector = conn_result.scalars().first()
-            if not connector:
-                return {
-                    "status": "error",
-                    "message": "OneDrive connector not found or access denied.",
-                }
+                    document = doc_result.scalars().first()
 
-            cfg = connector.config or {}
-            if cfg.get("auth_expired"):
-                return {
-                    "status": "auth_error",
-                    "message": "OneDrive account needs re-authentication. Please re-authenticate in your connector settings.",
-                    "connector_type": "onedrive",
-                }
+                if not document:
+                    return {
+                        "status": "not_found",
+                        "message": (
+                            f"File '{file_name}' not found in your indexed OneDrive files. "
+                            "This could mean: (1) the file doesn't exist, (2) it hasn't been indexed yet, "
+                            "or (3) the file name is different."
+                        ),
+                    }
 
-            context = {
-                "file": {
-                    "file_id": file_id,
-                    "name": file_name,
-                    "document_id": document_id,
-                    "web_url": meta.get("web_url"),
-                },
-                "account": {
-                    "id": connector.id,
-                    "name": connector.name,
-                    "user_email": cfg.get("user_email"),
-                },
-            }
+                if not document.connector_id:
+                    return {
+                        "status": "error",
+                        "message": "Document has no associated connector.",
+                    }
 
-            approval = interrupt(
-                {
-                    "type": "onedrive_file_trash",
-                    "action": {
-                        "tool": "delete_onedrive_file",
-                        "params": {
-                            "file_id": file_id,
-                            "connector_id": connector.id,
-                            "delete_from_kb": delete_from_kb,
-                        },
-                    },
-                    "context": context,
-                }
-            )
-
-            decisions_raw = (
-                approval.get("decisions", []) if isinstance(approval, dict) else []
-            )
-            decisions = (
-                decisions_raw if isinstance(decisions_raw, list) else [decisions_raw]
-            )
-            decisions = [d for d in decisions if isinstance(d, dict)]
-            if not decisions:
-                return {"status": "error", "message": "No approval decision received"}
-
-            decision = decisions[0]
-            decision_type = decision.get("type") or decision.get("decision_type")
-            logger.info(f"User decision: {decision_type}")
-
-            if decision_type == "reject":
-                return {
-                    "status": "rejected",
-                    "message": "User declined. The file was not trashed. Do not ask again or suggest alternatives.",
-                }
+                meta = document.document_metadata or {}
+                file_id = meta.get("onedrive_file_id")
+                document_id = document.id
 
-            final_params: dict[str, Any] = {}
-            edited_action = decision.get("edited_action")
-            if isinstance(edited_action, dict):
-                edited_args = edited_action.get("args")
-                if isinstance(edited_args, dict):
-                    final_params = edited_args
-            elif isinstance(decision.get("args"), dict):
-                final_params = decision["args"]
-
-            final_file_id = final_params.get("file_id", file_id)
-            final_connector_id = final_params.get("connector_id", connector.id)
-            final_delete_from_kb = final_params.get("delete_from_kb", delete_from_kb)
-
-            if final_connector_id != connector.id:
-                result = await db_session.execute(
+                if not file_id:
+                    return {
+                        "status": "error",
+                        "message": "File ID is missing. Please re-index the file.",
+                    }
+
+                conn_result = await db_session.execute(
                     select(SearchSourceConnector).filter(
                         and_(
-                            SearchSourceConnector.id == final_connector_id,
+                            SearchSourceConnector.id == document.connector_id,
                             SearchSourceConnector.search_space_id == search_space_id,
                             SearchSourceConnector.user_id == user_id,
                             SearchSourceConnector.connector_type
@@ -234,65 +166,130 @@ async def delete_onedrive_file(
                         )
                     )
                 )
-                validated_connector = result.scalars().first()
-                if not validated_connector:
+                connector = conn_result.scalars().first()
+                if not connector:
                     return {
                         "status": "error",
-                        "message": "Selected OneDrive connector is invalid or has been disconnected.",
+                        "message": "OneDrive connector not found or access denied.",
                     }
-                actual_connector_id = validated_connector.id
-            else:
-                actual_connector_id = connector.id
-
-            logger.info(
-                f"Deleting OneDrive file: file_id='{final_file_id}', connector={actual_connector_id}"
-            )
-
-            client = OneDriveClient(
-                session=db_session, connector_id=actual_connector_id
-            )
-            await client.trash_file(final_file_id)
-
-            logger.info(
-                f"OneDrive file deleted (moved to recycle bin): file_id={final_file_id}"
-            )
-
-            trash_result: dict[str, Any] = {
-                "status": "success",
-                "file_id": final_file_id,
-                "message": f"Successfully moved '{file_name}' to the recycle bin.",
-            }
 
-            deleted_from_kb = False
-            if final_delete_from_kb and document_id:
-                try:
-                    doc_result = await db_session.execute(
-                        select(Document).filter(Document.id == document_id)
-                    )
-                    doc = doc_result.scalars().first()
-                    if doc:
-                        await db_session.delete(doc)
-                        await db_session.commit()
-                        deleted_from_kb = True
-                        logger.info(
-                            f"Deleted document {document_id} from knowledge base"
+                cfg = connector.config or {}
+                if cfg.get("auth_expired"):
+                    return {
+                        "status": "auth_error",
+                        "message": "OneDrive account needs re-authentication. Please re-authenticate in your connector settings.",
+                        "connector_type": "onedrive",
+                    }
+
+                context = {
+                    "file": {
+                        "file_id": file_id,
+                        "name": file_name,
+                        "document_id": document_id,
+                        "web_url": meta.get("web_url"),
+                    },
+                    "account": {
+                        "id": connector.id,
+                        "name": connector.name,
+                        "user_email": cfg.get("user_email"),
+                    },
+                }
+
+                result = request_approval(
+                    action_type="onedrive_file_trash",
+                    tool_name="delete_onedrive_file",
+                    params={
+                        "file_id": file_id,
+                        "connector_id": connector.id,
+                        "delete_from_kb": delete_from_kb,
+                    },
+                    context=context,
+                )
+
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Do not retry or suggest alternatives.",
+                    }
+
+                final_file_id = result.params.get("file_id", file_id)
+                final_connector_id = result.params.get("connector_id", connector.id)
+                final_delete_from_kb = result.params.get(
+                    "delete_from_kb", delete_from_kb
+                )
+
+                if final_connector_id != connector.id:
+                    result = await db_session.execute(
+                        select(SearchSourceConnector).filter(
+                            and_(
+                                SearchSourceConnector.id == final_connector_id,
+                                SearchSourceConnector.search_space_id
+                                == search_space_id,
+                                SearchSourceConnector.user_id == user_id,
+                                SearchSourceConnector.connector_type
+                                == SearchSourceConnectorType.ONEDRIVE_CONNECTOR,
+                            )
                         )
-                    else:
-                        logger.warning(f"Document {document_id} not found in KB")
-                except Exception as e:
-                    logger.error(f"Failed to delete document from KB: {e}")
-                    await db_session.rollback()
-                    trash_result["warning"] = (
-                        f"File moved to recycle bin, but failed to remove from knowledge base: {e!s}"
                     )
+                    validated_connector = result.scalars().first()
+                    if not validated_connector:
+                        return {
+                            "status": "error",
+                            "message": "Selected OneDrive connector is invalid or has been disconnected.",
+                        }
+                    actual_connector_id = validated_connector.id
+                else:
+                    actual_connector_id = connector.id
+
+                logger.info(
+                    f"Deleting OneDrive file: file_id='{final_file_id}', connector={actual_connector_id}"
+                )
+
+                client = OneDriveClient(
+                    session=db_session, connector_id=actual_connector_id
+                )
+                await client.trash_file(final_file_id)
 
-            trash_result["deleted_from_kb"] = deleted_from_kb
-            if deleted_from_kb:
-                trash_result["message"] = (
-                    f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                logger.info(
+                    f"OneDrive file deleted (moved to recycle bin): file_id={final_file_id}"
                 )
 
-            return trash_result
+                trash_result: dict[str, Any] = {
+                    "status": "success",
+                    "file_id": final_file_id,
+                    "message": f"Successfully moved '{file_name}' to the recycle bin.",
+                }
+
+                deleted_from_kb = False
+                if final_delete_from_kb and document_id:
+                    try:
+                        doc_result = await db_session.execute(
+                            select(Document).filter(Document.id == document_id)
+                        )
+                        doc = doc_result.scalars().first()
+                        if doc:
+                            await db_session.delete(doc)
+                            await db_session.commit()
+                            deleted_from_kb = True
+                            logger.info(
+                                f"Deleted document {document_id} from knowledge base"
+                            )
+                        else:
+                            logger.warning(f"Document {document_id} not found in KB")
+                    except Exception as e:
+                        logger.error(f"Failed to delete document from KB: {e}")
+                        await db_session.rollback()
+                        trash_result["warning"] = (
+                            f"File moved to recycle bin, but failed to remove from knowledge base: {e!s}"
+                        )
+
+                trash_result["deleted_from_kb"] = deleted_from_kb
+                if deleted_from_kb:
+                    trash_result["message"] = (
+                        f"{trash_result.get('message', '')} (also removed from knowledge base)"
+                    )
+
+                return trash_result
 
         except Exception as e:
             from langgraph.errors import GraphInterrupt
diff --git a/surfsense_backend/app/agents/new_chat/tools/podcast.py b/surfsense_backend/app/agents/new_chat/tools/podcast.py
index 248a4f4506..2c9b7fa0c7 100644
--- a/surfsense_backend/app/agents/new_chat/tools/podcast.py
+++ b/surfsense_backend/app/agents/new_chat/tools/podcast.py
@@ -11,7 +11,7 @@
 from langchain_core.tools import tool
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.db import Podcast, PodcastStatus
+from app.db import Podcast, PodcastStatus, shielded_async_session
 
 
 def create_generate_podcast_tool(
@@ -27,12 +27,16 @@ def create_generate_podcast_tool(
 
     Args:
         search_space_id: The user's search space ID
-        db_session: Database session for creating the podcast record
+        db_session: Reserved for future read-side use; the row is written via a
+            fresh, tool-local session so parallel tool calls (e.g. podcast +
+            video presentation in the same agent step) don't share an
+            ``AsyncSession`` (which is not concurrency-safe).
         thread_id: The chat thread ID for associating the podcast
 
     Returns:
         A configured tool function for generating podcasts
     """
+    del db_session  # writes use a fresh tool-local session, see below
 
     @tool
     async def generate_podcast(
@@ -64,32 +68,40 @@ async def generate_podcast(
             - message: Status message (or "error" field if status is failed)
         """
         try:
-            podcast = Podcast(
-                title=podcast_title,
-                status=PodcastStatus.PENDING,
-                search_space_id=search_space_id,
-                thread_id=thread_id,
-            )
-            db_session.add(podcast)
-            await db_session.commit()
-            await db_session.refresh(podcast)
+            # Open a fresh session per call. The streaming task's session is
+            # shared between every tool, and ``AsyncSession`` is NOT safe for
+            # concurrent use: when the LLM emits parallel tool calls, two
+            # concurrent ``add()`` / ``commit()`` paths interleave and the
+            # second one hits "Session.add() during flush" → the transaction
+            # is poisoned for both tools.
+            async with shielded_async_session() as session:
+                podcast = Podcast(
+                    title=podcast_title,
+                    status=PodcastStatus.PENDING,
+                    search_space_id=search_space_id,
+                    thread_id=thread_id,
+                )
+                session.add(podcast)
+                await session.commit()
+                await session.refresh(podcast)
+                podcast_id = podcast.id
 
             from app.tasks.celery_tasks.podcast_tasks import (
                 generate_content_podcast_task,
             )
 
             task = generate_content_podcast_task.delay(
-                podcast_id=podcast.id,
+                podcast_id=podcast_id,
                 source_content=source_content,
                 search_space_id=search_space_id,
                 user_prompt=user_prompt,
             )
 
-            print(f"[generate_podcast] Created podcast {podcast.id}, task: {task.id}")
+            print(f"[generate_podcast] Created podcast {podcast_id}, task: {task.id}")
 
             return {
                 "status": PodcastStatus.PENDING.value,
-                "podcast_id": podcast.id,
+                "podcast_id": podcast_id,
                 "title": podcast_title,
                 "message": "Podcast generation started. This may take a few minutes.",
             }
diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py
index 442f3ba35e..b842d7a201 100644
--- a/surfsense_backend/app/agents/new_chat/tools/registry.py
+++ b/surfsense_backend/app/agents/new_chat/tools/registry.py
@@ -43,6 +43,9 @@ async def my_tool(param: str) -> dict:
 
 from langchain_core.tools import BaseTool
 
+from app.agents.new_chat.middleware.dedup_tool_calls import (
+    wrap_dedup_key_by_arg_name,
+)
 from app.db import ChatVisibility
 
 from .confluence import (
@@ -50,6 +53,12 @@ async def my_tool(param: str) -> dict:
     create_delete_confluence_page_tool,
     create_update_confluence_page_tool,
 )
+from .connected_accounts import create_get_connected_accounts_tool
+from .discord import (
+    create_list_discord_channels_tool,
+    create_read_discord_messages_tool,
+    create_send_discord_message_tool,
+)
 from .dropbox import (
     create_create_dropbox_file_tool,
     create_delete_dropbox_file_tool,
@@ -57,6 +66,8 @@ async def my_tool(param: str) -> dict:
 from .generate_image import create_generate_image_tool
 from .gmail import (
     create_create_gmail_draft_tool,
+    create_read_gmail_email_tool,
+    create_search_gmail_tool,
     create_send_gmail_email_tool,
     create_trash_gmail_email_tool,
     create_update_gmail_draft_tool,
@@ -64,21 +75,17 @@ async def my_tool(param: str) -> dict:
 from .google_calendar import (
     create_create_calendar_event_tool,
     create_delete_calendar_event_tool,
+    create_search_calendar_events_tool,
     create_update_calendar_event_tool,
 )
 from .google_drive import (
     create_create_google_drive_file_tool,
     create_delete_google_drive_file_tool,
 )
-from .jira import (
-    create_create_jira_issue_tool,
-    create_delete_jira_issue_tool,
-    create_update_jira_issue_tool,
-)
-from .linear import (
-    create_create_linear_issue_tool,
-    create_delete_linear_issue_tool,
-    create_update_linear_issue_tool,
+from .luma import (
+    create_create_luma_event_tool,
+    create_list_luma_events_tool,
+    create_read_luma_event_tool,
 )
 from .mcp_tool import load_mcp_tools
 from .notion import (
@@ -92,16 +99,20 @@ async def my_tool(param: str) -> dict:
 )
 from .podcast import create_generate_podcast_tool
 from .report import create_generate_report_tool
+from .resume import create_generate_resume_tool
 from .scrape_webpage import create_scrape_webpage_tool
 from .search_surfsense_docs import create_search_surfsense_docs_tool
-from .shared_memory import (
-    create_recall_shared_memory_tool,
-    create_save_shared_memory_tool,
+from .teams import (
+    create_list_teams_channels_tool,
+    create_read_teams_messages_tool,
+    create_send_teams_message_tool,
 )
-from .user_memory import create_recall_memory_tool, create_save_memory_tool
+from .update_memory import create_update_memory_tool, create_update_team_memory_tool
 from .video_presentation import create_generate_video_presentation_tool
 from .web_search import create_web_search_tool
 
+logger = logging.getLogger(__name__)
+
 # =============================================================================
 # Tool Definition
 # =============================================================================
@@ -117,6 +128,14 @@ class ToolDefinition:
         factory: Callable that creates the tool. Receives a dict of dependencies.
         requires: List of dependency names this tool needs (e.g., "search_space_id", "db_session")
         enabled_by_default: Whether the tool is enabled when no explicit config is provided
+        required_connector: Searchable type string (e.g. ``"LINEAR_CONNECTOR"``)
+            that must be in ``available_connectors`` for the tool to be enabled.
+        dedup_key: Optional callable that maps a tool's ``args`` dict to a
+            string signature used by :class:`DedupHITLToolCallsMiddleware`
+            to drop duplicate calls within a single LLM response.
+        reverse: Optional callable that, given the tool's ``(args, result)``,
+            returns a ``ReverseDescriptor`` describing the inverse tool
+            invocation. Consumed by the snapshot/revert pipeline.
 
     """
 
@@ -126,6 +145,9 @@ class ToolDefinition:
     requires: list[str] = field(default_factory=list)
     enabled_by_default: bool = True
     hidden: bool = False
+    required_connector: str | None = None
+    dedup_key: Callable[[dict[str, Any]], str] | None = None
+    reverse: Callable[[dict[str, Any], Any], dict[str, Any]] | None = None
 
 
 # =============================================================================
@@ -175,6 +197,16 @@ class ToolDefinition:
         # are optional — when missing, source_strategy="kb_search" degrades
         # gracefully to "provided"
     ),
+    # Resume generation tool (Typst-based, uses rendercv package)
+    ToolDefinition(
+        name="generate_resume",
+        description="Generate a professional resume as a Typst document",
+        factory=lambda deps: create_generate_resume_tool(
+            search_space_id=deps["search_space_id"],
+            thread_id=deps["thread_id"],
+        ),
+        requires=["search_space_id", "thread_id"],
+    ),
     # Generate image tool - creates images using AI models (DALL-E, GPT Image, etc.)
     ToolDefinition(
         name="generate_image",
@@ -214,76 +246,46 @@ class ToolDefinition:
         requires=["db_session"],
     ),
     # =========================================================================
-    # USER MEMORY TOOLS - private or team store by thread_visibility
+    # SERVICE ACCOUNT DISCOVERY
+    # Generic tool for the LLM to discover connected accounts and resolve
+    # service-specific identifiers (e.g. Jira cloudId, Slack team, etc.)
     # =========================================================================
     ToolDefinition(
-        name="save_memory",
-        description="Save facts, preferences, or context for personalized or team responses",
-        factory=lambda deps: (
-            create_save_shared_memory_tool(
-                search_space_id=deps["search_space_id"],
-                created_by_id=deps["user_id"],
-                db_session=deps["db_session"],
-            )
-            if deps["thread_visibility"] == ChatVisibility.SEARCH_SPACE
-            else create_save_memory_tool(
-                user_id=deps["user_id"],
-                search_space_id=deps["search_space_id"],
-                db_session=deps["db_session"],
-            )
+        name="get_connected_accounts",
+        description="Discover connected accounts for a service and their metadata",
+        factory=lambda deps: create_get_connected_accounts_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
         ),
-        requires=["user_id", "search_space_id", "db_session", "thread_visibility"],
+        requires=["db_session", "search_space_id", "user_id"],
     ),
+    # =========================================================================
+    # MEMORY TOOL - single update_memory, private or team by thread_visibility
+    # =========================================================================
     ToolDefinition(
-        name="recall_memory",
-        description="Recall relevant memories (personal or team) for context",
+        name="update_memory",
+        description="Save important long-term facts, preferences, and instructions to the (personal or team) memory",
         factory=lambda deps: (
-            create_recall_shared_memory_tool(
+            create_update_team_memory_tool(
                 search_space_id=deps["search_space_id"],
                 db_session=deps["db_session"],
+                llm=deps.get("llm"),
             )
             if deps["thread_visibility"] == ChatVisibility.SEARCH_SPACE
-            else create_recall_memory_tool(
+            else create_update_memory_tool(
                 user_id=deps["user_id"],
-                search_space_id=deps["search_space_id"],
                 db_session=deps["db_session"],
+                llm=deps.get("llm"),
             )
         ),
-        requires=["user_id", "search_space_id", "db_session", "thread_visibility"],
-    ),
-    # =========================================================================
-    # LINEAR TOOLS - create, update, delete issues
-    # Auto-disabled when no Linear connector is configured (see chat_deepagent.py)
-    # =========================================================================
-    ToolDefinition(
-        name="create_linear_issue",
-        description="Create a new issue in the user's Linear workspace",
-        factory=lambda deps: create_create_linear_issue_tool(
-            db_session=deps["db_session"],
-            search_space_id=deps["search_space_id"],
-            user_id=deps["user_id"],
-        ),
-        requires=["db_session", "search_space_id", "user_id"],
-    ),
-    ToolDefinition(
-        name="update_linear_issue",
-        description="Update an existing indexed Linear issue",
-        factory=lambda deps: create_update_linear_issue_tool(
-            db_session=deps["db_session"],
-            search_space_id=deps["search_space_id"],
-            user_id=deps["user_id"],
-        ),
-        requires=["db_session", "search_space_id", "user_id"],
-    ),
-    ToolDefinition(
-        name="delete_linear_issue",
-        description="Archive (delete) an existing indexed Linear issue",
-        factory=lambda deps: create_delete_linear_issue_tool(
-            db_session=deps["db_session"],
-            search_space_id=deps["search_space_id"],
-            user_id=deps["user_id"],
-        ),
-        requires=["db_session", "search_space_id", "user_id"],
+        requires=[
+            "user_id",
+            "search_space_id",
+            "db_session",
+            "thread_visibility",
+            "llm",
+        ],
     ),
     # =========================================================================
     # NOTION TOOLS - create, update, delete pages
@@ -298,6 +300,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="NOTION_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("title"),
     ),
     ToolDefinition(
         name="update_notion_page",
@@ -308,6 +312,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="NOTION_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("page_title"),
     ),
     ToolDefinition(
         name="delete_notion_page",
@@ -318,6 +324,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="NOTION_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("page_title"),
     ),
     # =========================================================================
     # GOOGLE DRIVE TOOLS - create files, delete files
@@ -332,6 +340,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_DRIVE_FILE",
+        dedup_key=wrap_dedup_key_by_arg_name("file_name"),
     ),
     ToolDefinition(
         name="delete_google_drive_file",
@@ -342,6 +352,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_DRIVE_FILE",
+        dedup_key=wrap_dedup_key_by_arg_name("file_name"),
     ),
     # =========================================================================
     # DROPBOX TOOLS - create and trash files
@@ -356,6 +368,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="DROPBOX_FILE",
+        dedup_key=wrap_dedup_key_by_arg_name("file_name"),
     ),
     ToolDefinition(
         name="delete_dropbox_file",
@@ -366,6 +380,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="DROPBOX_FILE",
+        dedup_key=wrap_dedup_key_by_arg_name("file_name"),
     ),
     # =========================================================================
     # ONEDRIVE TOOLS - create and trash files
@@ -380,6 +396,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="ONEDRIVE_FILE",
+        dedup_key=wrap_dedup_key_by_arg_name("file_name"),
     ),
     ToolDefinition(
         name="delete_onedrive_file",
@@ -390,11 +408,24 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="ONEDRIVE_FILE",
+        dedup_key=wrap_dedup_key_by_arg_name("file_name"),
     ),
     # =========================================================================
-    # GOOGLE CALENDAR TOOLS - create, update, delete events
+    # GOOGLE CALENDAR TOOLS - search, create, update, delete events
     # Auto-disabled when no Google Calendar connector is configured
     # =========================================================================
+    ToolDefinition(
+        name="search_calendar_events",
+        description="Search Google Calendar events within a date range",
+        factory=lambda deps: create_search_calendar_events_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_CALENDAR_CONNECTOR",
+    ),
     ToolDefinition(
         name="create_calendar_event",
         description="Create a new event on Google Calendar",
@@ -404,6 +435,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_CALENDAR_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("title"),
     ),
     ToolDefinition(
         name="update_calendar_event",
@@ -414,6 +447,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_CALENDAR_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("event_title_or_id"),
     ),
     ToolDefinition(
         name="delete_calendar_event",
@@ -424,11 +459,35 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_CALENDAR_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("event_title_or_id"),
     ),
     # =========================================================================
-    # GMAIL TOOLS - create drafts, update drafts, send emails, trash emails
+    # GMAIL TOOLS - search, read, create drafts, update drafts, send, trash
     # Auto-disabled when no Gmail connector is configured
     # =========================================================================
+    ToolDefinition(
+        name="search_gmail",
+        description="Search emails in Gmail using Gmail search syntax",
+        factory=lambda deps: create_search_gmail_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_GMAIL_CONNECTOR",
+    ),
+    ToolDefinition(
+        name="read_gmail_email",
+        description="Read the full content of a specific Gmail email",
+        factory=lambda deps: create_read_gmail_email_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_GMAIL_CONNECTOR",
+    ),
     ToolDefinition(
         name="create_gmail_draft",
         description="Create a draft email in Gmail",
@@ -438,6 +497,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_GMAIL_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("subject"),
     ),
     ToolDefinition(
         name="send_gmail_email",
@@ -448,6 +509,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_GMAIL_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("subject"),
     ),
     ToolDefinition(
         name="trash_gmail_email",
@@ -458,6 +521,8 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_GMAIL_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("email_subject_or_id"),
     ),
     ToolDefinition(
         name="update_gmail_draft",
@@ -468,74 +533,159 @@ class ToolDefinition:
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="GOOGLE_GMAIL_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("draft_subject_or_id"),
+    ),
+    # =========================================================================
+    # CONFLUENCE TOOLS - create, update, delete pages
+    # Auto-disabled when no Confluence connector is configured (see chat_deepagent.py)
+    # =========================================================================
+    ToolDefinition(
+        name="create_confluence_page",
+        description="Create a new page in the user's Confluence space",
+        factory=lambda deps: create_create_confluence_page_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="CONFLUENCE_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("title"),
+    ),
+    ToolDefinition(
+        name="update_confluence_page",
+        description="Update an existing indexed Confluence page",
+        factory=lambda deps: create_update_confluence_page_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="CONFLUENCE_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("page_title_or_id"),
+    ),
+    ToolDefinition(
+        name="delete_confluence_page",
+        description="Delete an existing indexed Confluence page",
+        factory=lambda deps: create_delete_confluence_page_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="CONFLUENCE_CONNECTOR",
+        dedup_key=wrap_dedup_key_by_arg_name("page_title_or_id"),
     ),
     # =========================================================================
-    # JIRA TOOLS - create, update, delete issues
-    # Auto-disabled when no Jira connector is configured (see chat_deepagent.py)
+    # DISCORD TOOLS - list channels, read messages, send messages
+    # Auto-disabled when no Discord connector is configured
     # =========================================================================
     ToolDefinition(
-        name="create_jira_issue",
-        description="Create a new issue in the user's Jira project",
-        factory=lambda deps: create_create_jira_issue_tool(
+        name="list_discord_channels",
+        description="List text channels in the connected Discord server",
+        factory=lambda deps: create_list_discord_channels_tool(
             db_session=deps["db_session"],
             search_space_id=deps["search_space_id"],
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="DISCORD_CONNECTOR",
     ),
     ToolDefinition(
-        name="update_jira_issue",
-        description="Update an existing indexed Jira issue",
-        factory=lambda deps: create_update_jira_issue_tool(
+        name="read_discord_messages",
+        description="Read recent messages from a Discord text channel",
+        factory=lambda deps: create_read_discord_messages_tool(
             db_session=deps["db_session"],
             search_space_id=deps["search_space_id"],
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="DISCORD_CONNECTOR",
     ),
     ToolDefinition(
-        name="delete_jira_issue",
-        description="Delete an existing indexed Jira issue",
-        factory=lambda deps: create_delete_jira_issue_tool(
+        name="send_discord_message",
+        description="Send a message to a Discord text channel",
+        factory=lambda deps: create_send_discord_message_tool(
             db_session=deps["db_session"],
             search_space_id=deps["search_space_id"],
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="DISCORD_CONNECTOR",
     ),
     # =========================================================================
-    # CONFLUENCE TOOLS - create, update, delete pages
-    # Auto-disabled when no Confluence connector is configured (see chat_deepagent.py)
+    # TEAMS TOOLS - list channels, read messages, send messages
+    # Auto-disabled when no Teams connector is configured
     # =========================================================================
     ToolDefinition(
-        name="create_confluence_page",
-        description="Create a new page in the user's Confluence space",
-        factory=lambda deps: create_create_confluence_page_tool(
+        name="list_teams_channels",
+        description="List Microsoft Teams and their channels",
+        factory=lambda deps: create_list_teams_channels_tool(
             db_session=deps["db_session"],
             search_space_id=deps["search_space_id"],
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="TEAMS_CONNECTOR",
     ),
     ToolDefinition(
-        name="update_confluence_page",
-        description="Update an existing indexed Confluence page",
-        factory=lambda deps: create_update_confluence_page_tool(
+        name="read_teams_messages",
+        description="Read recent messages from a Microsoft Teams channel",
+        factory=lambda deps: create_read_teams_messages_tool(
             db_session=deps["db_session"],
             search_space_id=deps["search_space_id"],
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="TEAMS_CONNECTOR",
     ),
     ToolDefinition(
-        name="delete_confluence_page",
-        description="Delete an existing indexed Confluence page",
-        factory=lambda deps: create_delete_confluence_page_tool(
+        name="send_teams_message",
+        description="Send a message to a Microsoft Teams channel",
+        factory=lambda deps: create_send_teams_message_tool(
             db_session=deps["db_session"],
             search_space_id=deps["search_space_id"],
             user_id=deps["user_id"],
         ),
         requires=["db_session", "search_space_id", "user_id"],
+        required_connector="TEAMS_CONNECTOR",
+    ),
+    # =========================================================================
+    # LUMA TOOLS - list events, read event details, create events
+    # Auto-disabled when no Luma connector is configured
+    # =========================================================================
+    ToolDefinition(
+        name="list_luma_events",
+        description="List upcoming and recent Luma events",
+        factory=lambda deps: create_list_luma_events_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="LUMA_CONNECTOR",
+    ),
+    ToolDefinition(
+        name="read_luma_event",
+        description="Read detailed information about a specific Luma event",
+        factory=lambda deps: create_read_luma_event_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="LUMA_CONNECTOR",
+    ),
+    ToolDefinition(
+        name="create_luma_event",
+        description="Create a new event on Luma",
+        factory=lambda deps: create_create_luma_event_tool(
+            db_session=deps["db_session"],
+            search_space_id=deps["search_space_id"],
+            user_id=deps["user_id"],
+        ),
+        requires=["db_session", "search_space_id", "user_id"],
+        required_connector="LUMA_CONNECTOR",
     ),
 ]
 
@@ -553,6 +703,19 @@ def get_tool_by_name(name: str) -> ToolDefinition | None:
     return None
 
 
+def get_connector_gated_tools(
+    available_connectors: list[str] | None,
+) -> list[str]:
+    """Return tool names to disable"""
+    available = set() if available_connectors is None else set(available_connectors)
+
+    disabled: list[str] = []
+    for tool_def in BUILTIN_TOOLS:
+        if tool_def.required_connector and tool_def.required_connector not in available:
+            disabled.append(tool_def.name)
+    return disabled
+
+
 def get_all_tool_names() -> list[str]:
     """Get names of all registered tools."""
     return [tool_def.name for tool_def in BUILTIN_TOOLS]
@@ -624,6 +787,24 @@ def build_tools(
 
         # Create the tool
         tool = tool_def.factory(dependencies)
+        # Propagate the registry-level metadata so middleware (e.g.
+        # ``DedupHITLToolCallsMiddleware``) and the action-log/revert
+        # pipeline can pick the resolvers up via ``tool.metadata`` without
+        # re-importing :data:`BUILTIN_TOOLS`.
+        if tool_def.dedup_key is not None or tool_def.reverse is not None:
+            existing_meta = getattr(tool, "metadata", None) or {}
+            merged_meta = dict(existing_meta)
+            if tool_def.dedup_key is not None:
+                merged_meta.setdefault("dedup_key", tool_def.dedup_key)
+            if tool_def.reverse is not None:
+                merged_meta.setdefault("reverse", tool_def.reverse)
+            try:
+                tool.metadata = merged_meta
+            except Exception:
+                logger.debug(
+                    "Tool %s rejected metadata mutation; relying on registry lookup",
+                    tool_def.name,
+                )
         tools.append(tool)
 
     # Add any additional custom tools
@@ -643,13 +824,22 @@ async def build_tools_async(
     """Async version of build_tools that also loads MCP tools from database.
 
     Design Note:
-    This function exists because MCP tools require database queries to load user configs,
-    while built-in tools are created synchronously from static code.
-
-    Alternative: We could make build_tools() itself async and always query the database,
-    but that would force async everywhere even when only using built-in tools. The current
-    design keeps the simple case (static tools only) synchronous while supporting dynamic
-    database-loaded tools through this async wrapper.
+    This function exists because MCP tools require database queries to load
+    user configs, while built-in tools are created synchronously from static
+    code.
+
+    Alternative: We could make build_tools() itself async and always query
+    the database, but that would force async everywhere even when only using
+    built-in tools. The current design keeps the simple case (static tools
+    only) synchronous while supporting dynamic database-loaded tools through
+    this async wrapper.
+
+    Phase 1.3: built-in tool construction (CPU; runs in a thread pool to
+    avoid event-loop stalls) and MCP tool loading (HTTP/DB I/O; runs on
+    the event loop) are kicked off concurrently. Cold-path savings are
+    bounded by the slower of the two — typically MCP at ~200ms-1.7s —
+    so the parallelization recovers the ~50-200ms previously spent
+    serially on built-in construction.
 
     Args:
         dependencies: Dict containing all possible dependencies
@@ -662,47 +852,84 @@ async def build_tools_async(
         List of configured tool instances ready for the agent, including MCP tools.
 
     """
+    import asyncio
     import time
 
     _perf_log = logging.getLogger("surfsense.perf")
     _perf_log.setLevel(logging.DEBUG)
 
+    can_load_mcp = (
+        include_mcp_tools
+        and "db_session" in dependencies
+        and "search_space_id" in dependencies
+    )
+
+    # Built-in tool construction is synchronous + CPU-only. Off-loop it so
+    # MCP's HTTP/DB I/O can fire concurrently. ``build_tools`` is pure
+    # function over its inputs — safe to thread-shift.
     _t0 = time.perf_counter()
-    tools = build_tools(dependencies, enabled_tools, disabled_tools, additional_tools)
+    builtin_task = asyncio.create_task(
+        asyncio.to_thread(
+            build_tools, dependencies, enabled_tools, disabled_tools, additional_tools
+        )
+    )
+
+    mcp_task: asyncio.Task | None = None
+    if can_load_mcp:
+        mcp_task = asyncio.create_task(
+            load_mcp_tools(
+                dependencies["db_session"],
+                dependencies["search_space_id"],
+            )
+        )
+
+    # Surface failures from each task independently so a flaky MCP
+    # endpoint never poisons built-in tool registration. ``return_exceptions``
+    # gives us per-task exceptions instead of dropping the second result
+    # when the first raises.
+    if mcp_task is not None:
+        builtin_result, mcp_result = await asyncio.gather(
+            builtin_task, mcp_task, return_exceptions=True
+        )
+    else:
+        builtin_result = await builtin_task
+        mcp_result = None
+
+    if isinstance(builtin_result, BaseException):
+        raise builtin_result  # built-in registration failure is non-recoverable
+    tools: list[BaseTool] = builtin_result
     _perf_log.info(
-        "[build_tools_async] Built-in tools in %.3fs (%d tools)",
+        "[build_tools_async] Built-in tools in %.3fs (%d tools, parallel)",
         time.perf_counter() - _t0,
         len(tools),
     )
 
-    # Load MCP tools if requested and dependencies are available
-    if (
-        include_mcp_tools
-        and "db_session" in dependencies
-        and "search_space_id" in dependencies
-    ):
-        try:
-            _t0 = time.perf_counter()
-            mcp_tools = await load_mcp_tools(
-                dependencies["db_session"],
-                dependencies["search_space_id"],
+    if mcp_task is not None:
+        if isinstance(mcp_result, BaseException):
+            # ``return_exceptions=True`` captures the exception out-of-band,
+            # so ``sys.exc_info()`` is empty here. Pass the captured
+            # exception via ``exc_info=`` to get a real traceback.
+            logging.error(
+                "Failed to load MCP tools: %s", mcp_result, exc_info=mcp_result
             )
+        else:
+            mcp_tools = mcp_result or []
             _perf_log.info(
-                "[build_tools_async] MCP tools loaded in %.3fs (%d tools)",
+                "[build_tools_async] MCP tools loaded in %.3fs (%d tools, parallel)",
                 time.perf_counter() - _t0,
                 len(mcp_tools),
             )
             tools.extend(mcp_tools)
             logging.info(
-                f"Registered {len(mcp_tools)} MCP tools: {[t.name for t in mcp_tools]}",
+                "Registered %d MCP tools: %s",
+                len(mcp_tools),
+                [t.name for t in mcp_tools],
             )
-        except Exception as e:
-            # Log error but don't fail - just continue without MCP tools
-            logging.exception(f"Failed to load MCP tools: {e!s}")
 
-    # Log all tools being returned to agent
     logging.info(
-        f"Total tools for agent: {len(tools)} - {[t.name for t in tools]}",
+        "Total tools for agent: %d — %s",
+        len(tools),
+        [t.name for t in tools],
     )
 
     return tools
diff --git a/surfsense_backend/app/agents/new_chat/tools/resume.py b/surfsense_backend/app/agents/new_chat/tools/resume.py
new file mode 100644
index 0000000000..4abe48ba69
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/resume.py
@@ -0,0 +1,812 @@
+"""
+Resume generation tool for the SurfSense agent.
+
+Generates a structured resume as Typst source code using the rendercv package.
+The LLM outputs only the content body (= heading, sections, entries) while
+the template header (import + show rule) is hardcoded and prepended by the
+backend.  This eliminates LLM errors in the complex configuration block.
+
+Templates are stored in a registry so new designs can be added by defining
+a new entry in _TEMPLATES.
+
+Uses the same short-lived session pattern as generate_report so no DB
+connection is held during the long LLM call.
+"""
+
+import io
+import logging
+import re
+from datetime import UTC, datetime
+from typing import Any
+
+import pypdf
+import typst
+from langchain_core.callbacks import dispatch_custom_event
+from langchain_core.messages import HumanMessage
+from langchain_core.tools import tool
+
+from app.db import Report, shielded_async_session
+from app.services.llm_service import get_document_summary_llm
+
+logger = logging.getLogger(__name__)
+
+
+# ─── Template Registry ───────────────────────────────────────────────────────
+# Each template defines:
+#   header              - Typst import + show rule with {name}, {year}, {month}, {day} placeholders
+#   component_reference - component docs shown to the LLM
+#   rules               - generation rules for the LLM
+
+_TEMPLATES: dict[str, dict[str, str]] = {
+    "classic": {
+        "header": """\
+#import "@preview/rendercv:0.3.0": *
+
+#show: rendercv.with(
+  name: "{name}",
+  title: "{name} - Resume",
+  footer: context {{ [#emph[{name} -- #str(here().page())\\/#str(counter(page).final().first())]] }},
+  top-note: [ #emph[Last updated in {month_name} {year}] ],
+  locale-catalog-language: "en",
+  text-direction: ltr,
+  page-size: "us-letter",
+  page-top-margin: 0.7in,
+  page-bottom-margin: 0.7in,
+  page-left-margin: 0.7in,
+  page-right-margin: 0.7in,
+  page-show-footer: false,
+  page-show-top-note: true,
+  colors-body: rgb(0, 0, 0),
+  colors-name: rgb(0, 0, 0),
+  colors-headline: rgb(0, 0, 0),
+  colors-connections: rgb(0, 0, 0),
+  colors-section-titles: rgb(0, 0, 0),
+  colors-links: rgb(0, 0, 0),
+  colors-footer: rgb(128, 128, 128),
+  colors-top-note: rgb(128, 128, 128),
+  typography-line-spacing: 0.6em,
+  typography-alignment: "justified",
+  typography-date-and-location-column-alignment: right,
+  typography-font-family-body: "XCharter",
+  typography-font-family-name: "XCharter",
+  typography-font-family-headline: "XCharter",
+  typography-font-family-connections: "XCharter",
+  typography-font-family-section-titles: "XCharter",
+  typography-font-size-body: 10pt,
+  typography-font-size-name: 25pt,
+  typography-font-size-headline: 10pt,
+  typography-font-size-connections: 10pt,
+  typography-font-size-section-titles: 1.2em,
+  typography-small-caps-name: false,
+  typography-small-caps-headline: false,
+  typography-small-caps-connections: false,
+  typography-small-caps-section-titles: false,
+  typography-bold-name: false,
+  typography-bold-headline: false,
+  typography-bold-connections: false,
+  typography-bold-section-titles: true,
+  links-underline: true,
+  links-show-external-link-icon: false,
+  header-alignment: center,
+  header-photo-width: 3.5cm,
+  header-space-below-name: 0.7cm,
+  header-space-below-headline: 0.7cm,
+  header-space-below-connections: 0.7cm,
+  header-connections-hyperlink: true,
+  header-connections-show-icons: false,
+  header-connections-display-urls-instead-of-usernames: true,
+  header-connections-separator: "|",
+  header-connections-space-between-connections: 0.5cm,
+  section-titles-type: "with_full_line",
+  section-titles-line-thickness: 0.5pt,
+  section-titles-space-above: 0.5cm,
+  section-titles-space-below: 0.3cm,
+  sections-allow-page-break: true,
+  sections-space-between-text-based-entries: 0.15cm,
+  sections-space-between-regular-entries: 0.42cm,
+  entries-date-and-location-width: 4.15cm,
+  entries-side-space: 0cm,
+  entries-space-between-columns: 0.1cm,
+  entries-allow-page-break: false,
+  entries-short-second-row: false,
+  entries-degree-width: 1cm,
+  entries-summary-space-left: 0cm,
+  entries-summary-space-above: 0.08cm,
+  entries-highlights-bullet: text(13pt, [\\u{2022}], baseline: -0.6pt),
+  entries-highlights-nested-bullet: text(13pt, [\\u{2022}], baseline: -0.6pt),
+  entries-highlights-space-left: 0cm,
+  entries-highlights-space-above: 0.08cm,
+  entries-highlights-space-between-items: 0.02cm,
+  entries-highlights-space-between-bullet-and-text: 0.3em,
+  date: datetime(
+    year: {year},
+    month: {month},
+    day: {day},
+  ),
+)
+
+""",
+        "component_reference": """\
+Available components (use ONLY these):
+
+= Full Name                              // Top-level heading — person's full name
+
+#connections(                            // Contact info row (pipe-separated)
+  [City, Country],
+  [#link("mailto:email@example.com", icon: false, if-underline: false, if-color: false)[email\\@example.com]],
+  [#link("https://linkedin.com/in/user", icon: false, if-underline: false, if-color: false)[linkedin.com\\/in\\/user]],
+  [#link("https://github.com/user", icon: false, if-underline: false, if-color: false)[github.com\\/user]],
+)
+
+== Section Title                         // Section heading (arbitrary name)
+
+#regular-entry(                          // Work experience, projects, publications, etc.
+  [
+    #strong[Role/Title], Company Name -- Location
+  ],
+  [
+    Start -- End
+  ],
+  main-column-second-row: [
+    - Achievement or responsibility
+    - Another bullet point
+  ],
+)
+
+#education-entry(                        // Education entries
+  [
+    #strong[Institution], Degree in Field -- Location
+  ],
+  [
+    Start -- End
+  ],
+  main-column-second-row: [
+    - GPA, honours, relevant coursework
+  ],
+)
+
+#summary([Short paragraph summary])     // Optional summary inside an entry
+#content-area([Free-form content])       // Freeform text block
+
+For skills sections, use one bullet per category label:
+- #strong[Category:] item1, item2, item3
+
+For simple list sections (e.g. Honors), use plain bullet points:
+- Item one
+- Item two
+""",
+        "rules": """\
+RULES:
+- Do NOT include any #import or #show lines. Start directly with = Full Name.
+- Output ONLY valid Typst content. No explanatory text before or after.
+- Do NOT wrap output in ```typst code fences.
+- The = heading MUST use the person's COMPLETE full name exactly as provided. NEVER shorten or abbreviate.
+- Escape @ symbols inside link labels with a backslash: email\\@example.com
+- Escape forward slashes in link display text: linkedin.com\\/in\\/user
+- Every section MUST use == heading.
+- Use #regular-entry() for experience, projects, publications, certifications, and similar entries.
+- Use #education-entry() for education.
+- For skills sections, use one bullet line per category with a bold label.
+- Keep content professional, concise, and achievement-oriented.
+- Use action verbs for bullet points (Led, Built, Designed, Reduced, etc.).
+- This template works for ALL professions — adapt sections to the user's field.
+- Default behavior should prioritize concise one-page content.
+""",
+    },
+}
+
+DEFAULT_TEMPLATE = "classic"
+MIN_RESUME_PAGES = 1
+MAX_RESUME_PAGES = 5
+MAX_COMPRESSION_ATTEMPTS = 2
+
+
+# ─── Template Helpers ─────────────────────────────────────────────────────────
+
+
+def _get_template(template_id: str | None = None) -> dict[str, str]:
+    """Get a template by ID, falling back to default."""
+    return _TEMPLATES.get(template_id or DEFAULT_TEMPLATE, _TEMPLATES[DEFAULT_TEMPLATE])
+
+
+_MONTH_NAMES = [
+    "",
+    "Jan",
+    "Feb",
+    "Mar",
+    "Apr",
+    "May",
+    "Jun",
+    "Jul",
+    "Aug",
+    "Sep",
+    "Oct",
+    "Nov",
+    "Dec",
+]
+
+
+def _build_header(template: dict[str, str], name: str) -> str:
+    """Build the template header with the person's name and current date."""
+    now = datetime.now(tz=UTC)
+    return (
+        template["header"]
+        .replace("{name}", name)
+        .replace("{year}", str(now.year))
+        .replace("{month}", str(now.month))
+        .replace("{day}", str(now.day))
+        .replace("{month_name}", _MONTH_NAMES[now.month])
+    )
+
+
+def _strip_header(full_source: str) -> str:
+    """Strip the import + show rule from stored source to get the body only.
+
+    Finds the closing parenthesis of the rendercv.with(...) block by tracking
+    nesting depth, then returns everything after it.
+    """
+    show_match = re.search(r"#show:\s*rendercv\.with\(", full_source)
+    if not show_match:
+        return full_source
+
+    start = show_match.end()
+    depth = 1
+    i = start
+    while i < len(full_source) and depth > 0:
+        if full_source[i] == "(":
+            depth += 1
+        elif full_source[i] == ")":
+            depth -= 1
+        i += 1
+
+    return full_source[i:].lstrip("\n")
+
+
+def _extract_name(body: str) -> str | None:
+    """Extract the person's full name from the = heading in the body."""
+    match = re.search(r"^=\s+(.+)$", body, re.MULTILINE)
+    return match.group(1).strip() if match else None
+
+
+def _strip_imports(body: str) -> str:
+    """Remove any #import or #show lines the LLM might accidentally include."""
+    lines = body.split("\n")
+    cleaned: list[str] = []
+    skip_show = False
+    depth = 0
+
+    for line in lines:
+        stripped = line.strip()
+
+        if stripped.startswith("#import"):
+            continue
+
+        if skip_show:
+            depth += stripped.count("(") - stripped.count(")")
+            if depth <= 0:
+                skip_show = False
+            continue
+
+        if stripped.startswith("#show:") and "rendercv" in stripped:
+            depth = stripped.count("(") - stripped.count(")")
+            if depth > 0:
+                skip_show = True
+            continue
+
+        cleaned.append(line)
+
+    result = "\n".join(cleaned).strip()
+    return result
+
+
+def _build_llm_reference(template: dict[str, str]) -> str:
+    """Build the LLM prompt reference from a template."""
+    return f"""\
+You MUST output valid Typst content for a resume.
+Do NOT include any #import or #show lines — those are handled automatically.
+Start directly with the = Full Name heading.
+
+{template["component_reference"]}
+
+{template["rules"]}"""
+
+
+# ─── Prompts ─────────────────────────────────────────────────────────────────
+
+_RESUME_PROMPT = """\
+You are an expert resume writer. Generate professional resume content as Typst markup.
+
+{llm_reference}
+
+**User Information:**
+{user_info}
+
+**Target Maximum Pages:** {max_pages}
+
+{user_instructions_section}
+
+Generate the resume content now (starting with = Full Name):
+"""
+
+_REVISION_PROMPT = """\
+You are an expert resume editor. Modify the existing resume according to the instructions.
+Apply ONLY the requested changes — do NOT rewrite sections that are not affected.
+
+{llm_reference}
+
+**Target Maximum Pages:** {max_pages}
+
+**Modification Instructions:** {user_instructions}
+
+**EXISTING RESUME CONTENT:**
+
+{previous_content}
+
+---
+
+Output the complete, updated resume content with the changes applied (starting with = Full Name):
+"""
+
+_FIX_COMPILE_PROMPT = """\
+The resume content you generated failed to compile. Fix the error while preserving all content.
+
+{llm_reference}
+
+**Compilation Error:**
+{error}
+
+**Full Typst Source (for context — error line numbers refer to this):**
+{full_source}
+
+**Your content starts after the template header. Output ONLY the content portion \
+(starting with = Full Name), NOT the #import or #show rule:**
+"""
+
+_COMPRESS_TO_PAGE_LIMIT_PROMPT = """\
+The resume compiles, but it exceeds the maximum allowed page count.
+Compress the resume while preserving high-impact accomplishments and role relevance.
+
+{llm_reference}
+
+**Target Maximum Pages:** {max_pages}
+**Current Page Count:** {actual_pages}
+**Compression Attempt:** {attempt_number}
+
+Compression priorities (in this order):
+1) Keep recent, high-impact, role-relevant bullets.
+2) Remove low-impact or redundant bullets.
+3) Shorten verbose wording while preserving meaning.
+4) Trim older or less relevant details before recent ones.
+
+Return the complete updated Typst content (starting with = Full Name), and keep it at or below the target pages.
+
+**EXISTING RESUME CONTENT:**
+{previous_content}
+"""
+
+
+# ─── Helpers ─────────────────────────────────────────────────────────────────
+
+
+def _strip_typst_fences(text: str) -> str:
+    """Remove wrapping ```typst ... ``` fences that LLMs sometimes add."""
+    stripped = text.strip()
+    m = re.match(r"^(`{3,})(?:typst|typ)?\s*\n", stripped)
+    if m:
+        fence = m.group(1)
+        if stripped.endswith(fence):
+            stripped = stripped[m.end() :]
+            stripped = stripped[: -len(fence)].rstrip()
+    return stripped
+
+
+def _compile_typst(source: str) -> bytes:
+    """Compile Typst source to PDF bytes. Raises on failure."""
+    return typst.compile(source.encode("utf-8"))
+
+
+def _count_pdf_pages(pdf_bytes: bytes) -> int:
+    """Count the number of pages in compiled PDF bytes."""
+    with io.BytesIO(pdf_bytes) as pdf_stream:
+        reader = pypdf.PdfReader(pdf_stream)
+        return len(reader.pages)
+
+
+def _validate_max_pages(max_pages: int) -> int:
+    """Validate and normalize max_pages input."""
+    if MIN_RESUME_PAGES <= max_pages <= MAX_RESUME_PAGES:
+        return max_pages
+    msg = (
+        f"max_pages must be between {MIN_RESUME_PAGES} and "
+        f"{MAX_RESUME_PAGES}. Received: {max_pages}"
+    )
+    raise ValueError(msg)
+
+
+# ─── Tool Factory ───────────────────────────────────────────────────────────
+
+
+def create_generate_resume_tool(
+    search_space_id: int,
+    thread_id: int | None = None,
+):
+    """
+    Factory function to create the generate_resume tool.
+
+    Generates a Typst-based resume, validates it via compilation,
+    and stores the source in the Report table with content_type='typst'.
+    The LLM generates only the content body; the template header is
+    prepended by the backend.
+    """
+
+    @tool
+    async def generate_resume(
+        user_info: str,
+        user_instructions: str | None = None,
+        parent_report_id: int | None = None,
+        max_pages: int = 1,
+    ) -> dict[str, Any]:
+        """
+        Generate a professional resume as a Typst document.
+
+        Use this tool when the user asks to create, build, generate, write,
+        or draft a resume or CV. Also use it when the user wants to modify,
+        update, or revise an existing resume generated in this conversation.
+
+        Trigger phrases include:
+        - "build me a resume", "create my resume", "generate a CV"
+        - "update my resume", "change my title", "add my new job"
+        - "make my resume more concise", "reformat my resume"
+
+        Do NOT use this tool for:
+        - General questions about resumes or career advice
+        - Reviewing or critiquing a resume without changes
+        - Cover letters (use generate_report instead)
+
+        VERSIONING — parent_report_id:
+        - Set parent_report_id when the user wants to MODIFY an existing
+          resume that was already generated in this conversation.
+        - Leave as None for new resumes.
+
+        Args:
+            user_info: The user's resume content — work experience,
+                education, skills, contact info, etc. Can be structured
+                or unstructured text.
+            user_instructions: Optional style or content preferences
+                (e.g. "emphasize leadership", "keep it to one page",
+                "use a modern style"). For revisions, describe what to change.
+            parent_report_id: ID of a previous resume to revise (creates
+                new version in the same version group).
+            max_pages: Maximum number of pages for the generated resume.
+                Defaults to 1. Allowed range: 1-5.
+
+        Returns:
+            Dict with status, report_id, title, and content_type.
+        """
+        report_group_id: int | None = None
+        parent_content: str | None = None
+
+        template = _get_template()
+        llm_reference = _build_llm_reference(template)
+
+        async def _save_failed_report(error_msg: str) -> int | None:
+            try:
+                async with shielded_async_session() as session:
+                    failed = Report(
+                        title="Resume",
+                        content=None,
+                        content_type="typst",
+                        report_metadata={
+                            "status": "failed",
+                            "error_message": error_msg,
+                        },
+                        report_style="resume",
+                        search_space_id=search_space_id,
+                        thread_id=thread_id,
+                        report_group_id=report_group_id,
+                    )
+                    session.add(failed)
+                    await session.commit()
+                    await session.refresh(failed)
+                    if not failed.report_group_id:
+                        failed.report_group_id = failed.id
+                        await session.commit()
+                    logger.info(
+                        f"[generate_resume] Saved failed report {failed.id}: {error_msg}"
+                    )
+                    return failed.id
+            except Exception:
+                logger.exception(
+                    "[generate_resume] Could not persist failed report row"
+                )
+                return None
+
+        try:
+            try:
+                validated_max_pages = _validate_max_pages(max_pages)
+            except ValueError as e:
+                error_msg = str(e)
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            # ── Phase 1: READ ─────────────────────────────────────────────
+            async with shielded_async_session() as read_session:
+                if parent_report_id:
+                    parent_report = await read_session.get(Report, parent_report_id)
+                    if parent_report:
+                        report_group_id = parent_report.report_group_id
+                        parent_content = parent_report.content
+                        logger.info(
+                            f"[generate_resume] Revising from parent {parent_report_id} "
+                            f"(group {report_group_id})"
+                        )
+
+                llm = await get_document_summary_llm(read_session, search_space_id)
+
+            if not llm:
+                error_msg = (
+                    "No LLM configured. Please configure a language model in Settings."
+                )
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            # ── Phase 2: LLM GENERATION ───────────────────────────────────
+
+            user_instructions_section = ""
+            if user_instructions:
+                user_instructions_section = (
+                    f"**Additional Instructions:** {user_instructions}"
+                )
+
+            if parent_content:
+                dispatch_custom_event(
+                    "report_progress",
+                    {"phase": "writing", "message": "Updating your resume"},
+                )
+                parent_body = _strip_header(parent_content)
+                prompt = _REVISION_PROMPT.format(
+                    llm_reference=llm_reference,
+                    max_pages=validated_max_pages,
+                    user_instructions=user_instructions
+                    or "Improve and refine the resume.",
+                    previous_content=parent_body,
+                )
+            else:
+                dispatch_custom_event(
+                    "report_progress",
+                    {"phase": "writing", "message": "Building your resume"},
+                )
+                prompt = _RESUME_PROMPT.format(
+                    llm_reference=llm_reference,
+                    user_info=user_info,
+                    max_pages=validated_max_pages,
+                    user_instructions_section=user_instructions_section,
+                )
+
+            response = await llm.ainvoke([HumanMessage(content=prompt)])
+            body = response.content
+
+            if not body or not isinstance(body, str):
+                error_msg = "LLM returned empty or invalid content"
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            body = _strip_typst_fences(body)
+            body = _strip_imports(body)
+
+            # ── Phase 3: ASSEMBLE + COMPILE ───────────────────────────────
+            dispatch_custom_event(
+                "report_progress",
+                {"phase": "compiling", "message": "Compiling resume..."},
+            )
+
+            name = _extract_name(body) or "Resume"
+            typst_source = ""
+            actual_pages = 0
+            compression_attempts = 0
+            target_page_met = False
+
+            for compression_round in range(MAX_COMPRESSION_ATTEMPTS + 1):
+                header = _build_header(template, name)
+                typst_source = header + body
+                compile_error: str | None = None
+                pdf_bytes: bytes | None = None
+
+                for compile_attempt in range(2):
+                    try:
+                        pdf_bytes = _compile_typst(typst_source)
+                        compile_error = None
+                        break
+                    except Exception as e:
+                        compile_error = str(e)
+                        logger.warning(
+                            "[generate_resume] Compile attempt %s failed: %s",
+                            compile_attempt + 1,
+                            compile_error,
+                        )
+
+                        if compile_attempt == 0:
+                            dispatch_custom_event(
+                                "report_progress",
+                                {
+                                    "phase": "fixing",
+                                    "message": "Fixing compilation issue...",
+                                },
+                            )
+                            fix_prompt = _FIX_COMPILE_PROMPT.format(
+                                llm_reference=llm_reference,
+                                error=compile_error,
+                                full_source=typst_source,
+                            )
+                            fix_response = await llm.ainvoke(
+                                [HumanMessage(content=fix_prompt)]
+                            )
+                            if fix_response.content and isinstance(
+                                fix_response.content, str
+                            ):
+                                body = _strip_typst_fences(fix_response.content)
+                                body = _strip_imports(body)
+                                name = _extract_name(body) or name
+                                header = _build_header(template, name)
+                                typst_source = header + body
+
+                if compile_error or not pdf_bytes:
+                    error_msg = (
+                        "Typst compilation failed after 2 attempts: "
+                        f"{compile_error or 'Unknown compile error'}"
+                    )
+                    report_id = await _save_failed_report(error_msg)
+                    return {
+                        "status": "failed",
+                        "error": error_msg,
+                        "report_id": report_id,
+                        "title": "Resume",
+                        "content_type": "typst",
+                    }
+
+                actual_pages = _count_pdf_pages(pdf_bytes)
+                if actual_pages <= validated_max_pages:
+                    target_page_met = True
+                    break
+
+                if compression_round >= MAX_COMPRESSION_ATTEMPTS:
+                    break
+
+                compression_attempts += 1
+                dispatch_custom_event(
+                    "report_progress",
+                    {
+                        "phase": "compressing",
+                        "message": f"Condensing resume to {validated_max_pages} page(s)...",
+                    },
+                )
+                compress_prompt = _COMPRESS_TO_PAGE_LIMIT_PROMPT.format(
+                    llm_reference=llm_reference,
+                    max_pages=validated_max_pages,
+                    actual_pages=actual_pages,
+                    attempt_number=compression_attempts,
+                    previous_content=body,
+                )
+                compress_response = await llm.ainvoke(
+                    [HumanMessage(content=compress_prompt)]
+                )
+                if not compress_response.content or not isinstance(
+                    compress_response.content, str
+                ):
+                    error_msg = "LLM returned empty content while compressing resume"
+                    report_id = await _save_failed_report(error_msg)
+                    return {
+                        "status": "failed",
+                        "error": error_msg,
+                        "report_id": report_id,
+                        "title": "Resume",
+                        "content_type": "typst",
+                    }
+
+                body = _strip_typst_fences(compress_response.content)
+                body = _strip_imports(body)
+                name = _extract_name(body) or name
+
+            if actual_pages > MAX_RESUME_PAGES:
+                error_msg = (
+                    "Resume exceeds hard page limit after compression retries. "
+                    f"Hard limit: <= {MAX_RESUME_PAGES} page(s), actual: {actual_pages}."
+                )
+                report_id = await _save_failed_report(error_msg)
+                return {
+                    "status": "failed",
+                    "error": error_msg,
+                    "report_id": report_id,
+                    "title": "Resume",
+                    "content_type": "typst",
+                }
+
+            # ── Phase 4: SAVE ─────────────────────────────────────────────
+            dispatch_custom_event(
+                "report_progress",
+                {"phase": "saving", "message": "Saving your resume"},
+            )
+
+            resume_title = f"{name} - Resume" if name != "Resume" else "Resume"
+
+            metadata: dict[str, Any] = {
+                "status": "ready",
+                "word_count": len(typst_source.split()),
+                "char_count": len(typst_source),
+                "target_max_pages": validated_max_pages,
+                "actual_page_count": actual_pages,
+                "page_limit_enforced": True,
+                "compression_attempts": compression_attempts,
+                "target_page_met": target_page_met,
+            }
+
+            async with shielded_async_session() as write_session:
+                report = Report(
+                    title=resume_title,
+                    content=typst_source,
+                    content_type="typst",
+                    report_metadata=metadata,
+                    report_style="resume",
+                    search_space_id=search_space_id,
+                    thread_id=thread_id,
+                    report_group_id=report_group_id,
+                )
+                write_session.add(report)
+                await write_session.commit()
+                await write_session.refresh(report)
+
+                if not report.report_group_id:
+                    report.report_group_id = report.id
+                    await write_session.commit()
+
+                saved_id = report.id
+
+            logger.info(f"[generate_resume] Created resume {saved_id}: {resume_title}")
+
+            return {
+                "status": "ready",
+                "report_id": saved_id,
+                "title": resume_title,
+                "content_type": "typst",
+                "is_revision": bool(parent_content),
+                "message": (
+                    f"Resume generated successfully: {resume_title}"
+                    if target_page_met
+                    else (
+                        f"Resume generated, but could not fit the target of <= {validated_max_pages} "
+                        f"page(s). Final length: {actual_pages} page(s)."
+                    )
+                ),
+            }
+
+        except Exception as e:
+            error_message = str(e)
+            logger.exception(f"[generate_resume] Error: {error_message}")
+            report_id = await _save_failed_report(error_message)
+            return {
+                "status": "failed",
+                "error": error_message,
+                "report_id": report_id,
+                "title": "Resume",
+                "content_type": "typst",
+            }
+
+    return generate_resume
diff --git a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
index b8b1527c7e..d8a0efac72 100644
--- a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
+++ b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
@@ -15,8 +15,9 @@
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
+from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
 from app.utils.document_converters import embed_text
+from app.utils.surfsense_docs import surfsense_docs_public_url
 
 
 def format_surfsense_docs_results(results: list[tuple]) -> str:
@@ -40,13 +41,14 @@ def format_surfsense_docs_results(results: list[tuple]) -> str:
     # Group chunks by document
     grouped: dict[int, dict] = {}
     for chunk, doc in results:
+        public_url = surfsense_docs_public_url(doc.source)
         if doc.id not in grouped:
             grouped[doc.id] = {
                 "document_id": f"doc-{doc.id}",
                 "document_type": "SURFSENSE_DOCS",
                 "title": doc.title,
-                "url": doc.source,
-                "metadata": {"source": doc.source},
+                "url": public_url,
+                "metadata": {"source": doc.source, "public_url": public_url},
                 "chunks": [],
             }
         grouped[doc.id]["chunks"].append(
@@ -124,12 +126,19 @@ def create_search_surfsense_docs_tool(db_session: AsyncSession):
     """
     Factory function to create the search_surfsense_docs tool.
 
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
     Args:
-        db_session: Database session for executing queries
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
 
     Returns:
         A configured tool function for searching Surfsense documentation
     """
+    del db_session  # per-call session — see docstring
 
     @tool
     async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
@@ -155,10 +164,11 @@ async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
         Returns:
             Relevant documentation content formatted with chunk IDs for citations
         """
-        return await search_surfsense_docs_async(
-            query=query,
-            db_session=db_session,
-            top_k=top_k,
-        )
+        async with async_session_maker() as db_session:
+            return await search_surfsense_docs_async(
+                query=query,
+                db_session=db_session,
+                top_k=top_k,
+            )
 
     return search_surfsense_docs
diff --git a/surfsense_backend/app/agents/new_chat/tools/shared_memory.py b/surfsense_backend/app/agents/new_chat/tools/shared_memory.py
deleted file mode 100644
index c826d808fa..0000000000
--- a/surfsense_backend/app/agents/new_chat/tools/shared_memory.py
+++ /dev/null
@@ -1,281 +0,0 @@
-"""Shared (team) memory backend for search-space-scoped AI context."""
-
-import asyncio
-import logging
-from typing import Any
-from uuid import UUID
-
-from langchain_core.tools import tool
-from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import MemoryCategory, SharedMemory, User
-from app.utils.document_converters import embed_text
-
-logger = logging.getLogger(__name__)
-
-DEFAULT_RECALL_TOP_K = 5
-MAX_MEMORIES_PER_SEARCH_SPACE = 250
-
-
-async def get_shared_memory_count(
-    db_session: AsyncSession,
-    search_space_id: int,
-) -> int:
-    result = await db_session.execute(
-        select(SharedMemory).where(SharedMemory.search_space_id == search_space_id)
-    )
-    return len(result.scalars().all())
-
-
-async def delete_oldest_shared_memory(
-    db_session: AsyncSession,
-    search_space_id: int,
-) -> None:
-    result = await db_session.execute(
-        select(SharedMemory)
-        .where(SharedMemory.search_space_id == search_space_id)
-        .order_by(SharedMemory.updated_at.asc())
-        .limit(1)
-    )
-    oldest = result.scalars().first()
-    if oldest:
-        await db_session.delete(oldest)
-        await db_session.commit()
-
-
-def _to_uuid(value: str | UUID) -> UUID:
-    if isinstance(value, UUID):
-        return value
-    return UUID(value)
-
-
-async def save_shared_memory(
-    db_session: AsyncSession,
-    search_space_id: int,
-    created_by_id: str | UUID,
-    content: str,
-    category: str = "fact",
-) -> dict[str, Any]:
-    category = category.lower() if category else "fact"
-    valid = ["preference", "fact", "instruction", "context"]
-    if category not in valid:
-        category = "fact"
-    try:
-        count = await get_shared_memory_count(db_session, search_space_id)
-        if count >= MAX_MEMORIES_PER_SEARCH_SPACE:
-            await delete_oldest_shared_memory(db_session, search_space_id)
-        embedding = await asyncio.to_thread(embed_text, content)
-        row = SharedMemory(
-            search_space_id=search_space_id,
-            created_by_id=_to_uuid(created_by_id),
-            memory_text=content,
-            category=MemoryCategory(category),
-            embedding=embedding,
-        )
-        db_session.add(row)
-        await db_session.commit()
-        await db_session.refresh(row)
-        return {
-            "status": "saved",
-            "memory_id": row.id,
-            "memory_text": content,
-            "category": category,
-            "message": f"I'll remember: {content}",
-        }
-    except Exception as e:
-        logger.exception("Failed to save shared memory: %s", e)
-        await db_session.rollback()
-        return {
-            "status": "error",
-            "error": str(e),
-            "message": "Failed to save memory. Please try again.",
-        }
-
-
-async def recall_shared_memory(
-    db_session: AsyncSession,
-    search_space_id: int,
-    query: str | None = None,
-    category: str | None = None,
-    top_k: int = DEFAULT_RECALL_TOP_K,
-) -> dict[str, Any]:
-    top_k = min(max(top_k, 1), 20)
-    try:
-        valid_categories = ["preference", "fact", "instruction", "context"]
-        stmt = select(SharedMemory).where(
-            SharedMemory.search_space_id == search_space_id
-        )
-        if category and category in valid_categories:
-            stmt = stmt.where(SharedMemory.category == MemoryCategory(category))
-        if query:
-            query_embedding = await asyncio.to_thread(embed_text, query)
-            stmt = stmt.order_by(
-                SharedMemory.embedding.op("<=>")(query_embedding)
-            ).limit(top_k)
-        else:
-            stmt = stmt.order_by(SharedMemory.updated_at.desc()).limit(top_k)
-        result = await db_session.execute(stmt)
-        rows = result.scalars().all()
-        memory_list = [
-            {
-                "id": m.id,
-                "memory_text": m.memory_text,
-                "category": m.category.value if m.category else "unknown",
-                "updated_at": m.updated_at.isoformat() if m.updated_at else None,
-                "created_by_id": str(m.created_by_id) if m.created_by_id else None,
-            }
-            for m in rows
-        ]
-        created_by_ids = list(
-            {m["created_by_id"] for m in memory_list if m["created_by_id"]}
-        )
-        created_by_map: dict[str, str] = {}
-        if created_by_ids:
-            uuids = [UUID(uid) for uid in created_by_ids]
-            users_result = await db_session.execute(
-                select(User).where(User.id.in_(uuids))
-            )
-            for u in users_result.scalars().all():
-                created_by_map[str(u.id)] = u.display_name or "A team member"
-        formatted_context = format_shared_memories_for_context(
-            memory_list, created_by_map
-        )
-        return {
-            "status": "success",
-            "count": len(memory_list),
-            "memories": memory_list,
-            "formatted_context": formatted_context,
-        }
-    except Exception as e:
-        logger.exception("Failed to recall shared memory: %s", e)
-        await db_session.rollback()
-        return {
-            "status": "error",
-            "error": str(e),
-            "memories": [],
-            "formatted_context": "Failed to recall memories.",
-        }
-
-
-def format_shared_memories_for_context(
-    memories: list[dict[str, Any]],
-    created_by_map: dict[str, str] | None = None,
-) -> str:
-    if not memories:
-        return "No relevant team memories found."
-    created_by_map = created_by_map or {}
-    parts = ["<team_memories>"]
-    for memory in memories:
-        category = memory.get("category", "unknown")
-        text = memory.get("memory_text", "")
-        updated = memory.get("updated_at", "")
-        created_by_id = memory.get("created_by_id")
-        added_by = (
-            created_by_map.get(str(created_by_id), "A team member")
-            if created_by_id is not None
-            else "A team member"
-        )
-        parts.append(
-            f"  <memory category='{category}' updated='{updated}' added_by='{added_by}'>{text}</memory>"
-        )
-    parts.append("</team_memories>")
-    return "\n".join(parts)
-
-
-def create_save_shared_memory_tool(
-    search_space_id: int,
-    created_by_id: str | UUID,
-    db_session: AsyncSession,
-):
-    """
-    Factory function to create the save_memory tool for shared (team) chats.
-
-    Args:
-        search_space_id: The search space ID
-        created_by_id: The user ID of the person adding the memory
-        db_session: Database session for executing queries
-
-    Returns:
-        A configured tool function for saving team memories
-    """
-
-    @tool
-    async def save_memory(
-        content: str,
-        category: str = "fact",
-    ) -> dict[str, Any]:
-        """
-        Save a fact, preference, or context to the team's shared memory for future reference.
-
-        Use this tool when:
-        - User or a team member says "remember this", "keep this in mind", or similar in this shared chat
-        - The team agrees on something to remember (e.g., decisions, conventions, where things live)
-        - Someone shares a preference or fact that should be visible to the whole team
-
-        The saved information will be available in future shared conversations in this space.
-
-        Args:
-            content: The fact/preference/context to remember.
-                    Phrase it clearly, e.g., "API keys are stored in Vault",
-                    "The team prefers weekly demos on Fridays"
-            category: Type of memory. One of:
-                    - "preference": Team or workspace preferences
-                    - "fact": Facts the team agreed on (e.g., processes, locations)
-                    - "instruction": Standing instructions for the team
-                    - "context": Current context (e.g., ongoing projects, goals)
-
-        Returns:
-            A dictionary with the save status and memory details
-        """
-        return await save_shared_memory(
-            db_session, search_space_id, created_by_id, content, category
-        )
-
-    return save_memory
-
-
-def create_recall_shared_memory_tool(
-    search_space_id: int,
-    db_session: AsyncSession,
-):
-    """
-    Factory function to create the recall_memory tool for shared (team) chats.
-
-    Args:
-        search_space_id: The search space ID
-        db_session: Database session for executing queries
-
-    Returns:
-        A configured tool function for recalling team memories
-    """
-
-    @tool
-    async def recall_memory(
-        query: str | None = None,
-        category: str | None = None,
-        top_k: int = DEFAULT_RECALL_TOP_K,
-    ) -> dict[str, Any]:
-        """
-        Recall relevant team memories for this space to provide contextual responses.
-
-        Use this tool when:
-        - You need team context to answer (e.g., "where do we store X?", "what did we decide about Y?")
-        - Someone asks about something the team agreed to remember
-        - Team preferences or conventions would improve the response
-
-        Args:
-            query: Optional search query to find specific memories.
-                  If not provided, returns the most recent memories.
-            category: Optional category filter. One of:
-                     "preference", "fact", "instruction", "context"
-            top_k: Number of memories to retrieve (default: 5, max: 20)
-
-        Returns:
-            A dictionary containing relevant memories and formatted context
-        """
-        return await recall_shared_memory(
-            db_session, search_space_id, query, category, top_k
-        )
-
-    return recall_memory
diff --git a/surfsense_backend/app/agents/new_chat/tools/teams/__init__.py b/surfsense_backend/app/agents/new_chat/tools/teams/__init__.py
new file mode 100644
index 0000000000..60e2add495
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/teams/__init__.py
@@ -0,0 +1,15 @@
+from app.agents.new_chat.tools.teams.list_channels import (
+    create_list_teams_channels_tool,
+)
+from app.agents.new_chat.tools.teams.read_messages import (
+    create_read_teams_messages_tool,
+)
+from app.agents.new_chat.tools.teams.send_message import (
+    create_send_teams_message_tool,
+)
+
+__all__ = [
+    "create_list_teams_channels_tool",
+    "create_read_teams_messages_tool",
+    "create_send_teams_message_tool",
+]
diff --git a/surfsense_backend/app/agents/new_chat/tools/teams/_auth.py b/surfsense_backend/app/agents/new_chat/tools/teams/_auth.py
new file mode 100644
index 0000000000..4345bb4766
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/teams/_auth.py
@@ -0,0 +1,38 @@
+"""Shared auth helper for Teams agent tools (Microsoft Graph REST API)."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import SearchSourceConnector, SearchSourceConnectorType
+
+GRAPH_API = "https://graph.microsoft.com/v1.0"
+
+
+async def get_teams_connector(
+    db_session: AsyncSession,
+    search_space_id: int,
+    user_id: str,
+) -> SearchSourceConnector | None:
+    result = await db_session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.search_space_id == search_space_id,
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.TEAMS_CONNECTOR,
+        )
+    )
+    return result.scalars().first()
+
+
+async def get_access_token(
+    db_session: AsyncSession,
+    connector: SearchSourceConnector,
+) -> str:
+    """Get a valid Microsoft Graph access token, refreshing if expired."""
+    from app.connectors.teams_connector import TeamsConnector
+
+    tc = TeamsConnector(
+        session=db_session,
+        connector_id=connector.id,
+    )
+    return await tc._get_valid_token()
diff --git a/surfsense_backend/app/agents/new_chat/tools/teams/list_channels.py b/surfsense_backend/app/agents/new_chat/tools/teams/list_channels.py
new file mode 100644
index 0000000000..0fc52b5c7e
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/teams/list_channels.py
@@ -0,0 +1,114 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import async_session_maker
+
+from ._auth import GRAPH_API, get_access_token, get_teams_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_list_teams_channels_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the list_teams_channels tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured list_teams_channels tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def list_teams_channels() -> dict[str, Any]:
+        """List all Microsoft Teams and their channels the user has access to.
+
+        Returns:
+            Dictionary with status and a list of teams, each containing
+            team_id, team_name, and a list of channels (id, name).
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Teams tool not properly configured."}
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_teams_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Teams connector found."}
+
+                token = await get_access_token(db_session, connector)
+                headers = {"Authorization": f"Bearer {token}"}
+
+                async with httpx.AsyncClient(timeout=20.0) as client:
+                    teams_resp = await client.get(
+                        f"{GRAPH_API}/me/joinedTeams", headers=headers
+                    )
+
+                if teams_resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Teams token expired. Please re-authenticate.",
+                        "connector_type": "teams",
+                    }
+                if teams_resp.status_code != 200:
+                    return {
+                        "status": "error",
+                        "message": f"Graph API error: {teams_resp.status_code}",
+                    }
+
+                teams_data = teams_resp.json().get("value", [])
+                result_teams = []
+
+                async with httpx.AsyncClient(timeout=20.0) as client:
+                    for team in teams_data:
+                        team_id = team["id"]
+                        ch_resp = await client.get(
+                            f"{GRAPH_API}/teams/{team_id}/channels",
+                            headers=headers,
+                        )
+                        channels = []
+                        if ch_resp.status_code == 200:
+                            channels = [
+                                {"id": ch["id"], "name": ch.get("displayName", "")}
+                                for ch in ch_resp.json().get("value", [])
+                            ]
+                        result_teams.append(
+                            {
+                                "team_id": team_id,
+                                "team_name": team.get("displayName", ""),
+                                "channels": channels,
+                            }
+                        )
+
+                return {
+                    "status": "success",
+                    "teams": result_teams,
+                    "total_teams": len(result_teams),
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error listing Teams channels: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to list Teams channels."}
+
+    return list_teams_channels
diff --git a/surfsense_backend/app/agents/new_chat/tools/teams/read_messages.py b/surfsense_backend/app/agents/new_chat/tools/teams/read_messages.py
new file mode 100644
index 0000000000..0ebda021ee
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/teams/read_messages.py
@@ -0,0 +1,125 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import async_session_maker
+
+from ._auth import GRAPH_API, get_access_token, get_teams_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_read_teams_messages_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the read_teams_messages tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured read_teams_messages tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def read_teams_messages(
+        team_id: str,
+        channel_id: str,
+        limit: int = 25,
+    ) -> dict[str, Any]:
+        """Read recent messages from a Microsoft Teams channel.
+
+        Args:
+            team_id: The team ID (from list_teams_channels).
+            channel_id: The channel ID (from list_teams_channels).
+            limit: Number of messages to fetch (default 25, max 50).
+
+        Returns:
+            Dictionary with status and a list of messages including
+            id, sender, content, timestamp.
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Teams tool not properly configured."}
+
+        limit = min(limit, 50)
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_teams_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Teams connector found."}
+
+                token = await get_access_token(db_session, connector)
+
+                async with httpx.AsyncClient(timeout=20.0) as client:
+                    resp = await client.get(
+                        f"{GRAPH_API}/teams/{team_id}/channels/{channel_id}/messages",
+                        headers={"Authorization": f"Bearer {token}"},
+                        params={"$top": limit},
+                    )
+
+                if resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Teams token expired. Please re-authenticate.",
+                        "connector_type": "teams",
+                    }
+                if resp.status_code == 403:
+                    return {
+                        "status": "error",
+                        "message": "Insufficient permissions to read this channel.",
+                    }
+                if resp.status_code != 200:
+                    return {
+                        "status": "error",
+                        "message": f"Graph API error: {resp.status_code}",
+                    }
+
+                raw_msgs = resp.json().get("value", [])
+                messages = []
+                for m in raw_msgs:
+                    sender = m.get("from", {})
+                    user_info = sender.get("user", {}) if sender else {}
+                    body = m.get("body", {})
+                    messages.append(
+                        {
+                            "id": m.get("id"),
+                            "sender": user_info.get("displayName", "Unknown"),
+                            "content": body.get("content", ""),
+                            "content_type": body.get("contentType", "text"),
+                            "timestamp": m.get("createdDateTime", ""),
+                        }
+                    )
+
+                return {
+                    "status": "success",
+                    "team_id": team_id,
+                    "channel_id": channel_id,
+                    "messages": messages,
+                    "total": len(messages),
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error reading Teams messages: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to read Teams messages."}
+
+    return read_teams_messages
diff --git a/surfsense_backend/app/agents/new_chat/tools/teams/send_message.py b/surfsense_backend/app/agents/new_chat/tools/teams/send_message.py
new file mode 100644
index 0000000000..6f40d27e19
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/teams/send_message.py
@@ -0,0 +1,136 @@
+import logging
+from typing import Any
+
+import httpx
+from langchain_core.tools import tool
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.tools.hitl import request_approval
+from app.db import async_session_maker
+
+from ._auth import GRAPH_API, get_access_token, get_teams_connector
+
+logger = logging.getLogger(__name__)
+
+
+def create_send_teams_message_tool(
+    db_session: AsyncSession | None = None,
+    search_space_id: int | None = None,
+    user_id: str | None = None,
+):
+    """
+    Factory function to create the send_teams_message tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+
+    Args:
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+
+    Returns:
+        Configured send_teams_message tool
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def send_teams_message(
+        team_id: str,
+        channel_id: str,
+        content: str,
+    ) -> dict[str, Any]:
+        """Send a message to a Microsoft Teams channel.
+
+        Requires the ChannelMessage.Send OAuth scope. If the user gets a
+        permission error, they may need to re-authenticate with updated scopes.
+
+        Args:
+            team_id: The team ID (from list_teams_channels).
+            channel_id: The channel ID (from list_teams_channels).
+            content: The message text (HTML supported).
+
+        Returns:
+            Dictionary with status, message_id on success.
+
+            IMPORTANT:
+            - If status is "rejected", the user explicitly declined. Do NOT retry.
+        """
+        if search_space_id is None or user_id is None:
+            return {"status": "error", "message": "Teams tool not properly configured."}
+
+        try:
+            async with async_session_maker() as db_session:
+                connector = await get_teams_connector(
+                    db_session, search_space_id, user_id
+                )
+                if not connector:
+                    return {"status": "error", "message": "No Teams connector found."}
+
+                result = request_approval(
+                    action_type="teams_send_message",
+                    tool_name="send_teams_message",
+                    params={
+                        "team_id": team_id,
+                        "channel_id": channel_id,
+                        "content": content,
+                    },
+                    context={"connector_id": connector.id},
+                )
+
+                if result.rejected:
+                    return {
+                        "status": "rejected",
+                        "message": "User declined. Message was not sent.",
+                    }
+
+                final_content = result.params.get("content", content)
+                final_team = result.params.get("team_id", team_id)
+                final_channel = result.params.get("channel_id", channel_id)
+
+                token = await get_access_token(db_session, connector)
+
+                async with httpx.AsyncClient(timeout=20.0) as client:
+                    resp = await client.post(
+                        f"{GRAPH_API}/teams/{final_team}/channels/{final_channel}/messages",
+                        headers={
+                            "Authorization": f"Bearer {token}",
+                            "Content-Type": "application/json",
+                        },
+                        json={"body": {"content": final_content}},
+                    )
+
+                if resp.status_code == 401:
+                    return {
+                        "status": "auth_error",
+                        "message": "Teams token expired. Please re-authenticate.",
+                        "connector_type": "teams",
+                    }
+                if resp.status_code == 403:
+                    return {
+                        "status": "insufficient_permissions",
+                        "message": "Missing ChannelMessage.Send permission. Please re-authenticate with updated scopes.",
+                    }
+                if resp.status_code not in (200, 201):
+                    return {
+                        "status": "error",
+                        "message": f"Graph API error: {resp.status_code} — {resp.text[:200]}",
+                    }
+
+                msg_data = resp.json()
+                return {
+                    "status": "success",
+                    "message_id": msg_data.get("id"),
+                    "message": "Message sent to Teams channel.",
+                }
+
+        except Exception as e:
+            from langgraph.errors import GraphInterrupt
+
+            if isinstance(e, GraphInterrupt):
+                raise
+            logger.error("Error sending Teams message: %s", e, exc_info=True)
+            return {"status": "error", "message": "Failed to send Teams message."}
+
+    return send_teams_message
diff --git a/surfsense_backend/app/agents/new_chat/tools/tool_response.py b/surfsense_backend/app/agents/new_chat/tools/tool_response.py
new file mode 100644
index 0000000000..8644ada5c3
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/tool_response.py
@@ -0,0 +1,38 @@
+"""Standardised response dict factories for LangChain agent tools."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+class ToolResponse:
+    @staticmethod
+    def success(message: str, **data: Any) -> dict[str, Any]:
+        return {"status": "success", "message": message, **data}
+
+    @staticmethod
+    def error(error: str, **data: Any) -> dict[str, Any]:
+        return {"status": "error", "error": error, **data}
+
+    @staticmethod
+    def auth_error(service: str, **data: Any) -> dict[str, Any]:
+        return {
+            "status": "auth_error",
+            "error": (
+                f"{service} authentication has expired or been revoked. "
+                "Please re-connect the integration in Settings → Connectors."
+            ),
+            **data,
+        }
+
+    @staticmethod
+    def rejected(message: str = "Action was declined by the user.") -> dict[str, Any]:
+        return {"status": "rejected", "message": message}
+
+    @staticmethod
+    def not_found(resource: str, identifier: str, **data: Any) -> dict[str, Any]:
+        return {
+            "status": "not_found",
+            "error": f"{resource} '{identifier}' was not found.",
+            **data,
+        }
diff --git a/surfsense_backend/app/agents/new_chat/tools/update_memory.py b/surfsense_backend/app/agents/new_chat/tools/update_memory.py
new file mode 100644
index 0000000000..062668aac7
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/update_memory.py
@@ -0,0 +1,441 @@
+"""Markdown-document memory tool for the SurfSense agent.
+
+Replaces the old row-per-fact save_memory / recall_memory tools with a single
+update_memory tool that overwrites a freeform markdown TEXT column.  The LLM
+always sees the current memory in <user_memory> / <team_memory> tags injected
+by MemoryInjectionMiddleware, so it passes the FULL updated document each time.
+
+Overflow handling:
+  - Soft limit (18K chars): a warning is returned telling the agent to
+    consolidate on the next update.
+  - Hard limit (25K chars): a forced LLM-driven rewrite compresses the document.
+    If it still exceeds the limit after rewriting, the save is rejected.
+  - Diff validation: warns when entire ``##`` sections are dropped or when the
+    document shrinks by more than 60%.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import Any, Literal
+from uuid import UUID
+
+from langchain_core.messages import HumanMessage
+from langchain_core.tools import tool
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import SearchSpace, User, async_session_maker
+from app.utils.content_utils import extract_text_content
+
+logger = logging.getLogger(__name__)
+
+MEMORY_SOFT_LIMIT = 18_000
+MEMORY_HARD_LIMIT = 25_000
+
+_SECTION_HEADING_RE = re.compile(r"^##\s+(.+)$", re.MULTILINE)
+_HEADING_NORMALIZE_RE = re.compile(r"\s+")
+
+_MARKER_RE = re.compile(r"\[(fact|pref|instr)\]")
+_BULLET_FORMAT_RE = re.compile(r"^- \(\d{4}-\d{2}-\d{2}\) \[(fact|pref|instr)\] .+$")
+_PERSONAL_ONLY_MARKERS = {"pref", "instr"}
+
+
+# ---------------------------------------------------------------------------
+# Diff validation
+# ---------------------------------------------------------------------------
+
+
+def _extract_headings(memory: str) -> set[str]:
+    """Return all ``## …`` heading texts (without the ``## `` prefix)."""
+    return set(_SECTION_HEADING_RE.findall(memory))
+
+
+def _normalize_heading(heading: str) -> str:
+    """Normalize heading text for robust scope checks."""
+    return _HEADING_NORMALIZE_RE.sub(" ", heading.strip().lower())
+
+
+def _validate_memory_scope(
+    content: str, scope: Literal["user", "team"]
+) -> dict[str, Any] | None:
+    """Reject personal-only markers ([pref], [instr]) in team memory."""
+    if scope != "team":
+        return None
+
+    markers = set(_MARKER_RE.findall(content))
+    leaked = sorted(markers & _PERSONAL_ONLY_MARKERS)
+    if leaked:
+        tags = ", ".join(f"[{m}]" for m in leaked)
+        return {
+            "status": "error",
+            "message": (
+                f"Team memory cannot include personal markers: {tags}. "
+                "Use [fact] only in team memory."
+            ),
+        }
+    return None
+
+
+def _validate_bullet_format(content: str) -> list[str]:
+    """Return warnings for bullet lines that don't match the required format.
+
+    Expected: ``- (YYYY-MM-DD) [fact|pref|instr] text``
+    """
+    warnings: list[str] = []
+    for line in content.splitlines():
+        stripped = line.strip()
+        if not stripped.startswith("- "):
+            continue
+        if not _BULLET_FORMAT_RE.match(stripped):
+            short = stripped[:80] + ("..." if len(stripped) > 80 else "")
+            warnings.append(f"Malformed bullet: {short}")
+    return warnings
+
+
+def _validate_diff(old_memory: str | None, new_memory: str) -> list[str]:
+    """Return a list of warning strings about suspicious changes."""
+    if not old_memory:
+        return []
+
+    warnings: list[str] = []
+    old_headings = _extract_headings(old_memory)
+    new_headings = _extract_headings(new_memory)
+    dropped = old_headings - new_headings
+    if dropped:
+        names = ", ".join(sorted(dropped))
+        warnings.append(
+            f"Sections removed: {names}. "
+            "If unintentional, the user can restore from the settings page."
+        )
+
+    old_len = len(old_memory)
+    new_len = len(new_memory)
+    if old_len > 0 and new_len < old_len * 0.4:
+        warnings.append(
+            f"Memory shrank significantly ({old_len:,} -> {new_len:,} chars). "
+            "Possible data loss."
+        )
+    return warnings
+
+
+# ---------------------------------------------------------------------------
+# Size validation & soft warning
+# ---------------------------------------------------------------------------
+
+
+def _validate_memory_size(content: str) -> dict[str, Any] | None:
+    """Return an error/warning dict if *content* is too large, else None."""
+    length = len(content)
+    if length > MEMORY_HARD_LIMIT:
+        return {
+            "status": "error",
+            "message": (
+                f"Memory exceeds {MEMORY_HARD_LIMIT:,} character limit "
+                f"({length:,} chars). Consolidate by merging related items, "
+                "removing outdated entries, and shortening descriptions. "
+                "Then call update_memory again."
+            ),
+        }
+    return None
+
+
+def _soft_warning(content: str) -> str | None:
+    """Return a warning string if content exceeds the soft limit."""
+    length = len(content)
+    if length > MEMORY_SOFT_LIMIT:
+        return (
+            f"Memory is at {length:,}/{MEMORY_HARD_LIMIT:,} characters. "
+            "Consolidate by merging related items and removing less important "
+            "entries on your next update."
+        )
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Forced rewrite when memory exceeds the hard limit
+# ---------------------------------------------------------------------------
+
+_FORCED_REWRITE_PROMPT = """\
+You are a memory curator. The following memory document exceeds the character \
+limit and must be shortened.
+
+RULES:
+1. Rewrite the document to be under {target} characters.
+2. Preserve existing ## headings. Every entry must remain under a heading. You may merge
+   or rename headings to consolidate, but keep names personal and descriptive.
+3. Priority for keeping content: [instr] > [pref] > [fact].
+4. Merge duplicate entries, remove outdated entries, shorten verbose descriptions.
+5. Every bullet MUST have format: - (YYYY-MM-DD) [fact|pref|instr] text
+6. Preserve the user's first name in entries — do not replace it with "the user".
+7. Output ONLY the consolidated markdown — no explanations, no wrapping.
+
+<memory_document>
+{content}
+</memory_document>"""
+
+
+async def _forced_rewrite(content: str, llm: Any) -> str | None:
+    """Use a focused LLM call to compress *content* under the hard limit.
+
+    Returns the rewritten string, or ``None`` if the call fails.
+    """
+    try:
+        prompt = _FORCED_REWRITE_PROMPT.format(
+            target=MEMORY_HARD_LIMIT, content=content
+        )
+        response = await llm.ainvoke(
+            [HumanMessage(content=prompt)],
+            config={"tags": ["surfsense:internal"]},
+        )
+        text = extract_text_content(response.content).strip()
+        if not text:
+            logger.warning("Forced rewrite returned empty text; aborting rewrite")
+            return None
+        return text
+    except Exception:
+        logger.exception("Forced rewrite LLM call failed")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Shared save-and-respond logic
+# ---------------------------------------------------------------------------
+
+
+async def _save_memory(
+    *,
+    updated_memory: str,
+    old_memory: str | None,
+    llm: Any | None,
+    apply_fn,
+    commit_fn,
+    rollback_fn,
+    label: str,
+    scope: Literal["user", "team"],
+) -> dict[str, Any]:
+    """Validate, optionally force-rewrite if over the hard limit, save, and
+    return a response dict.
+
+    Parameters
+    ----------
+    updated_memory : str
+        The new document the agent submitted.
+    old_memory : str | None
+        The previously persisted document (for diff checks).
+    llm : Any | None
+        LLM instance for forced rewrite (may be ``None``).
+    apply_fn : callable(str) -> None
+        Callback that sets the new memory on the ORM object.
+    commit_fn : coroutine
+        ``session.commit``.
+    rollback_fn : coroutine
+        ``session.rollback``.
+    label : str
+        Human label for log messages (e.g. "user memory", "team memory").
+    """
+    if not isinstance(updated_memory, str):
+        logger.warning(
+            "Refusing non-string memory payload (type=%s)",
+            type(updated_memory).__name__,
+        )
+        return {
+            "status": "error",
+            "message": "Internal error: memory payload must be a string.",
+        }
+
+    content = updated_memory
+
+    # --- forced rewrite if over the hard limit ---
+    if len(content) > MEMORY_HARD_LIMIT and llm is not None:
+        rewritten = await _forced_rewrite(content, llm)
+        if rewritten is not None and len(rewritten) < len(content):
+            content = rewritten
+
+    # --- hard-limit gate (reject if still too large after rewrite) ---
+    size_err = _validate_memory_size(content)
+    if size_err:
+        return size_err
+
+    scope_err = _validate_memory_scope(content, scope)
+    if scope_err:
+        return scope_err
+
+    # --- persist ---
+    try:
+        apply_fn(content)
+        await commit_fn()
+    except Exception as e:
+        logger.exception("Failed to update %s: %s", label, e)
+        await rollback_fn()
+        return {"status": "error", "message": f"Failed to update {label}: {e}"}
+
+    # --- build response ---
+    resp: dict[str, Any] = {
+        "status": "saved",
+        "message": f"{label.capitalize()} updated.",
+    }
+
+    if content is not updated_memory:
+        resp["notice"] = "Memory was automatically rewritten to fit within limits."
+
+    diff_warnings = _validate_diff(old_memory, content)
+    if diff_warnings:
+        resp["diff_warnings"] = diff_warnings
+
+    format_warnings = _validate_bullet_format(content)
+    if format_warnings:
+        resp["format_warnings"] = format_warnings
+
+    warning = _soft_warning(content)
+    if warning:
+        resp["warning"] = warning
+
+    return resp
+
+
+# ---------------------------------------------------------------------------
+# Tool factories
+# ---------------------------------------------------------------------------
+
+
+def create_update_memory_tool(
+    user_id: str | UUID,
+    db_session: AsyncSession,
+    llm: Any | None = None,
+):
+    """Factory function to create the user-memory update tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+    The session's bound ``commit``/``rollback`` methods are captured at
+    call time, after ``async with`` has bound ``db_session`` locally.
+
+    Args:
+        user_id: ID of the user whose memory document is being updated.
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        llm: Optional LLM for the forced-rewrite path.
+
+    Returns:
+        Configured update_memory tool for the user-memory scope.
+    """
+    del db_session  # per-call session — see docstring
+    uid = UUID(user_id) if isinstance(user_id, str) else user_id
+
+    @tool
+    async def update_memory(updated_memory: str) -> dict[str, Any]:
+        """Update the user's personal memory document.
+
+        Your current memory is shown in <user_memory> in the system prompt.
+        When the user shares important long-term information (preferences,
+        facts, instructions, context), rewrite the memory document to include
+        the new information.  Merge new facts with existing ones, update
+        contradictions, remove outdated entries, and keep it concise.
+
+        Args:
+            updated_memory: The FULL updated markdown document (not a diff).
+        """
+        try:
+            async with async_session_maker() as db_session:
+                result = await db_session.execute(select(User).where(User.id == uid))
+                user = result.scalars().first()
+                if not user:
+                    return {"status": "error", "message": "User not found."}
+
+                old_memory = user.memory_md
+
+                return await _save_memory(
+                    updated_memory=updated_memory,
+                    old_memory=old_memory,
+                    llm=llm,
+                    apply_fn=lambda content: setattr(user, "memory_md", content),
+                    commit_fn=db_session.commit,
+                    rollback_fn=db_session.rollback,
+                    label="memory",
+                    scope="user",
+                )
+        except Exception as e:
+            logger.exception("Failed to update user memory: %s", e)
+            return {
+                "status": "error",
+                "message": f"Failed to update memory: {e}",
+            }
+
+    return update_memory
+
+
+def create_update_team_memory_tool(
+    search_space_id: int,
+    db_session: AsyncSession,
+    llm: Any | None = None,
+):
+    """Factory function to create the team-memory update tool.
+
+    The tool acquires its own short-lived ``AsyncSession`` per call via
+    :data:`async_session_maker` so the closure is safe to share across
+    HTTP requests by the compiled-agent cache. Capturing a per-request
+    session here would surface stale/closed sessions on cache hits.
+    The session's bound ``commit``/``rollback`` methods are captured at
+    call time, after ``async with`` has bound ``db_session`` locally.
+
+    Args:
+        search_space_id: ID of the search space whose team memory is being
+            updated.
+        db_session: Reserved for registry compatibility. Per-call sessions
+            are opened via :data:`async_session_maker` inside the tool body.
+        llm: Optional LLM for the forced-rewrite path.
+
+    Returns:
+        Configured update_memory tool for the team-memory scope.
+    """
+    del db_session  # per-call session — see docstring
+
+    @tool
+    async def update_memory(updated_memory: str) -> dict[str, Any]:
+        """Update the team's shared memory document for this search space.
+
+        Your current team memory is shown in <team_memory> in the system
+        prompt.  When the team shares important long-term information
+        (decisions, conventions, key facts, priorities), rewrite the memory
+        document to include the new information.  Merge new facts with
+        existing ones, update contradictions, remove outdated entries, and
+        keep it concise.
+
+        Args:
+            updated_memory: The FULL updated markdown document (not a diff).
+        """
+        try:
+            async with async_session_maker() as db_session:
+                result = await db_session.execute(
+                    select(SearchSpace).where(SearchSpace.id == search_space_id)
+                )
+                space = result.scalars().first()
+                if not space:
+                    return {"status": "error", "message": "Search space not found."}
+
+                old_memory = space.shared_memory_md
+
+                return await _save_memory(
+                    updated_memory=updated_memory,
+                    old_memory=old_memory,
+                    llm=llm,
+                    apply_fn=lambda content: setattr(
+                        space, "shared_memory_md", content
+                    ),
+                    commit_fn=db_session.commit,
+                    rollback_fn=db_session.rollback,
+                    label="team memory",
+                    scope="team",
+                )
+        except Exception as e:
+            logger.exception("Failed to update team memory: %s", e)
+            return {
+                "status": "error",
+                "message": f"Failed to update team memory: {e}",
+            }
+
+    return update_memory
diff --git a/surfsense_backend/app/agents/new_chat/tools/user_memory.py b/surfsense_backend/app/agents/new_chat/tools/user_memory.py
deleted file mode 100644
index 81e8498569..0000000000
--- a/surfsense_backend/app/agents/new_chat/tools/user_memory.py
+++ /dev/null
@@ -1,351 +0,0 @@
-"""
-User memory tools for the SurfSense agent.
-
-This module provides tools for storing and retrieving user memories,
-enabling personalized AI responses similar to Claude's memory feature.
-
-Features:
-- save_memory: Store facts, preferences, and context about the user
-- recall_memory: Retrieve relevant memories using semantic search
-"""
-
-import asyncio
-import logging
-from typing import Any
-from uuid import UUID
-
-from langchain_core.tools import tool
-from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import MemoryCategory, UserMemory
-from app.utils.document_converters import embed_text
-
-logger = logging.getLogger(__name__)
-
-
-# =============================================================================
-# Constants
-# =============================================================================
-
-# Default number of memories to retrieve
-DEFAULT_RECALL_TOP_K = 5
-
-# Maximum number of memories per user (to prevent unbounded growth)
-MAX_MEMORIES_PER_USER = 100
-
-
-# =============================================================================
-# Helper Functions
-# =============================================================================
-
-
-def _to_uuid(user_id: str) -> UUID:
-    """Convert a string user_id to a UUID object."""
-    if isinstance(user_id, UUID):
-        return user_id
-    return UUID(user_id)
-
-
-async def get_user_memory_count(
-    db_session: AsyncSession,
-    user_id: str,
-    search_space_id: int | None = None,
-) -> int:
-    """Get the count of memories for a user."""
-    uuid_user_id = _to_uuid(user_id)
-    query = select(UserMemory).where(UserMemory.user_id == uuid_user_id)
-    if search_space_id is not None:
-        query = query.where(
-            (UserMemory.search_space_id == search_space_id)
-            | (UserMemory.search_space_id.is_(None))
-        )
-    result = await db_session.execute(query)
-    return len(result.scalars().all())
-
-
-async def delete_oldest_memory(
-    db_session: AsyncSession,
-    user_id: str,
-    search_space_id: int | None = None,
-) -> None:
-    """Delete the oldest memory for a user to make room for new ones."""
-    uuid_user_id = _to_uuid(user_id)
-    query = (
-        select(UserMemory)
-        .where(UserMemory.user_id == uuid_user_id)
-        .order_by(UserMemory.updated_at.asc())
-        .limit(1)
-    )
-    if search_space_id is not None:
-        query = query.where(
-            (UserMemory.search_space_id == search_space_id)
-            | (UserMemory.search_space_id.is_(None))
-        )
-    result = await db_session.execute(query)
-    oldest_memory = result.scalars().first()
-    if oldest_memory:
-        await db_session.delete(oldest_memory)
-        await db_session.commit()
-
-
-def format_memories_for_context(memories: list[dict[str, Any]]) -> str:
-    """Format retrieved memories into a readable context string for the LLM."""
-    if not memories:
-        return "No relevant memories found for this user."
-
-    parts = ["<user_memories>"]
-    for memory in memories:
-        category = memory.get("category", "unknown")
-        text = memory.get("memory_text", "")
-        updated = memory.get("updated_at", "")
-        parts.append(
-            f"  <memory category='{category}' updated='{updated}'>{text}</memory>"
-        )
-    parts.append("</user_memories>")
-
-    return "\n".join(parts)
-
-
-# =============================================================================
-# Tool Factory Functions
-# =============================================================================
-
-
-def create_save_memory_tool(
-    user_id: str,
-    search_space_id: int,
-    db_session: AsyncSession,
-):
-    """
-    Factory function to create the save_memory tool.
-
-    Args:
-        user_id: The user's UUID
-        search_space_id: The search space ID (for space-specific memories)
-        db_session: Database session for executing queries
-
-    Returns:
-        A configured tool function for saving user memories
-    """
-
-    @tool
-    async def save_memory(
-        content: str,
-        category: str = "fact",
-    ) -> dict[str, Any]:
-        """
-        Save a fact, preference, or context about the user for future reference.
-
-        Use this tool when:
-        - User explicitly says "remember this", "keep this in mind", or similar
-        - User shares personal preferences (e.g., "I prefer Python over JavaScript")
-        - User shares important facts about themselves (name, role, interests, projects)
-        - User gives standing instructions (e.g., "always respond in bullet points")
-        - User shares relevant context (e.g., "I'm working on project X")
-
-        The saved information will be available in future conversations to provide
-        more personalized and contextual responses.
-
-        Args:
-            content: The fact/preference/context to remember.
-                    Phrase it clearly, e.g., "User prefers dark mode",
-                    "User is a senior Python developer", "User is working on an AI project"
-            category: Type of memory. One of:
-                    - "preference": User preferences (e.g., coding style, tools, formats)
-                    - "fact": Facts about the user (e.g., name, role, expertise)
-                    - "instruction": Standing instructions (e.g., response format preferences)
-                    - "context": Current context (e.g., ongoing projects, goals)
-
-        Returns:
-            A dictionary with the save status and memory details
-        """
-        # Normalize and validate category (LLMs may send uppercase)
-        category = category.lower() if category else "fact"
-        valid_categories = ["preference", "fact", "instruction", "context"]
-        if category not in valid_categories:
-            category = "fact"
-
-        try:
-            # Convert user_id to UUID
-            uuid_user_id = _to_uuid(user_id)
-
-            # Check if we've hit the memory limit
-            memory_count = await get_user_memory_count(
-                db_session, user_id, search_space_id
-            )
-            if memory_count >= MAX_MEMORIES_PER_USER:
-                # Delete oldest memory to make room
-                await delete_oldest_memory(db_session, user_id, search_space_id)
-
-            embedding = await asyncio.to_thread(embed_text, content)
-
-            # Create new memory using ORM
-            # The pgvector Vector column type handles embedding conversion automatically
-            new_memory = UserMemory(
-                user_id=uuid_user_id,
-                search_space_id=search_space_id,
-                memory_text=content,
-                category=MemoryCategory(category),  # Convert string to enum
-                embedding=embedding,  # Pass embedding directly (list or numpy array)
-            )
-
-            db_session.add(new_memory)
-            await db_session.commit()
-            await db_session.refresh(new_memory)
-
-            return {
-                "status": "saved",
-                "memory_id": new_memory.id,
-                "memory_text": content,
-                "category": category,
-                "message": f"I'll remember: {content}",
-            }
-
-        except Exception as e:
-            logger.exception(f"Failed to save memory for user {user_id}: {e}")
-            # Rollback the session to clear any failed transaction state
-            await db_session.rollback()
-            return {
-                "status": "error",
-                "error": str(e),
-                "message": "Failed to save memory. Please try again.",
-            }
-
-    return save_memory
-
-
-def create_recall_memory_tool(
-    user_id: str,
-    search_space_id: int,
-    db_session: AsyncSession,
-):
-    """
-    Factory function to create the recall_memory tool.
-
-    Args:
-        user_id: The user's UUID
-        search_space_id: The search space ID
-        db_session: Database session for executing queries
-
-    Returns:
-        A configured tool function for recalling user memories
-    """
-
-    @tool
-    async def recall_memory(
-        query: str | None = None,
-        category: str | None = None,
-        top_k: int = DEFAULT_RECALL_TOP_K,
-    ) -> dict[str, Any]:
-        """
-        Recall relevant memories about the user to provide personalized responses.
-
-        Use this tool when:
-        - You need user context to give a better, more personalized answer
-        - User asks about their preferences or past information they shared
-        - User references something they told you before
-        - Personalization would significantly improve the response quality
-        - User asks "what do you know about me?" or similar
-
-        Args:
-            query: Optional search query to find specific memories.
-                  If not provided, returns the most recent memories.
-                  Example: "programming preferences", "current projects"
-            category: Optional category filter. One of:
-                     "preference", "fact", "instruction", "context"
-                     If not provided, searches all categories.
-            top_k: Number of memories to retrieve (default: 5, max: 20)
-
-        Returns:
-            A dictionary containing relevant memories and formatted context
-        """
-        top_k = min(max(top_k, 1), 20)  # Clamp between 1 and 20
-
-        try:
-            # Convert user_id to UUID
-            uuid_user_id = _to_uuid(user_id)
-
-            if query:
-                query_embedding = await asyncio.to_thread(embed_text, query)
-
-                # Build query with vector similarity
-                stmt = (
-                    select(UserMemory)
-                    .where(UserMemory.user_id == uuid_user_id)
-                    .where(
-                        (UserMemory.search_space_id == search_space_id)
-                        | (UserMemory.search_space_id.is_(None))
-                    )
-                )
-
-                # Add category filter if specified
-                if category and category in [
-                    "preference",
-                    "fact",
-                    "instruction",
-                    "context",
-                ]:
-                    stmt = stmt.where(UserMemory.category == MemoryCategory(category))
-
-                # Order by vector similarity
-                stmt = stmt.order_by(
-                    UserMemory.embedding.op("<=>")(query_embedding)
-                ).limit(top_k)
-
-            else:
-                # No query - return most recent memories
-                stmt = (
-                    select(UserMemory)
-                    .where(UserMemory.user_id == uuid_user_id)
-                    .where(
-                        (UserMemory.search_space_id == search_space_id)
-                        | (UserMemory.search_space_id.is_(None))
-                    )
-                )
-
-                # Add category filter if specified
-                if category and category in [
-                    "preference",
-                    "fact",
-                    "instruction",
-                    "context",
-                ]:
-                    stmt = stmt.where(UserMemory.category == MemoryCategory(category))
-
-                stmt = stmt.order_by(UserMemory.updated_at.desc()).limit(top_k)
-
-            result = await db_session.execute(stmt)
-            memories = result.scalars().all()
-
-            # Format memories for response
-            memory_list = [
-                {
-                    "id": m.id,
-                    "memory_text": m.memory_text,
-                    "category": m.category.value if m.category else "unknown",
-                    "updated_at": m.updated_at.isoformat() if m.updated_at else None,
-                }
-                for m in memories
-            ]
-
-            formatted_context = format_memories_for_context(memory_list)
-
-            return {
-                "status": "success",
-                "count": len(memory_list),
-                "memories": memory_list,
-                "formatted_context": formatted_context,
-            }
-
-        except Exception as e:
-            logger.exception(f"Failed to recall memories for user {user_id}: {e}")
-            await db_session.rollback()
-            return {
-                "status": "error",
-                "error": str(e),
-                "memories": [],
-                "formatted_context": "Failed to recall memories.",
-            }
-
-    return recall_memory
diff --git a/surfsense_backend/app/agents/new_chat/tools/video_presentation.py b/surfsense_backend/app/agents/new_chat/tools/video_presentation.py
index a90e08ac3b..7bf9a1c3b0 100644
--- a/surfsense_backend/app/agents/new_chat/tools/video_presentation.py
+++ b/surfsense_backend/app/agents/new_chat/tools/video_presentation.py
@@ -11,7 +11,7 @@
 from langchain_core.tools import tool
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.db import VideoPresentation, VideoPresentationStatus
+from app.db import VideoPresentation, VideoPresentationStatus, shielded_async_session
 
 
 def create_generate_video_presentation_tool(
@@ -23,8 +23,11 @@ def create_generate_video_presentation_tool(
     Factory function to create the generate_video_presentation tool with injected dependencies.
 
     Pre-creates video presentation record with pending status so the ID is available
-    immediately for frontend polling.
+    immediately for frontend polling. The row is written via a fresh, tool-local
+    session so parallel tool calls (e.g. video + podcast in the same agent step)
+    don't share an ``AsyncSession`` (which is not concurrency-safe).
     """
+    del db_session  # writes use a fresh tool-local session, see below
 
     @tool
     async def generate_video_presentation(
@@ -42,34 +45,40 @@ async def generate_video_presentation(
             user_prompt: Optional style/tone instructions.
         """
         try:
-            video_pres = VideoPresentation(
-                title=video_title,
-                status=VideoPresentationStatus.PENDING,
-                search_space_id=search_space_id,
-                thread_id=thread_id,
-            )
-            db_session.add(video_pres)
-            await db_session.commit()
-            await db_session.refresh(video_pres)
+            # See podcast.py for the rationale: parallel tool calls share the
+            # streaming session, and AsyncSession is not concurrency-safe —
+            # interleaved flushes produce "Session.add() during flush" and
+            # poison the transaction for every concurrent tool.
+            async with shielded_async_session() as session:
+                video_pres = VideoPresentation(
+                    title=video_title,
+                    status=VideoPresentationStatus.PENDING,
+                    search_space_id=search_space_id,
+                    thread_id=thread_id,
+                )
+                session.add(video_pres)
+                await session.commit()
+                await session.refresh(video_pres)
+                video_pres_id = video_pres.id
 
             from app.tasks.celery_tasks.video_presentation_tasks import (
                 generate_video_presentation_task,
             )
 
             task = generate_video_presentation_task.delay(
-                video_presentation_id=video_pres.id,
+                video_presentation_id=video_pres_id,
                 source_content=source_content,
                 search_space_id=search_space_id,
                 user_prompt=user_prompt,
             )
 
             print(
-                f"[generate_video_presentation] Created video presentation {video_pres.id}, task: {task.id}"
+                f"[generate_video_presentation] Created video presentation {video_pres_id}, task: {task.id}"
             )
 
             return {
                 "status": VideoPresentationStatus.PENDING.value,
-                "video_presentation_id": video_pres.id,
+                "video_presentation_id": video_pres_id,
                 "title": video_title,
                 "message": "Video presentation generation started. This may take a few minutes.",
             }
diff --git a/surfsense_backend/app/agents/podcaster/nodes.py b/surfsense_backend/app/agents/podcaster/nodes.py
index 4bdfdfc48d..517d900a34 100644
--- a/surfsense_backend/app/agents/podcaster/nodes.py
+++ b/surfsense_backend/app/agents/podcaster/nodes.py
@@ -13,6 +13,7 @@
 from app.config import config as app_config
 from app.services.kokoro_tts_service import get_kokoro_tts_service
 from app.services.llm_service import get_agent_llm
+from app.utils.content_utils import extract_text_content, strip_markdown_fences
 
 from .configuration import Configuration
 from .prompts import get_podcast_generation_prompt
@@ -53,43 +54,32 @@ async def create_podcast_transcript(
     # Generate the podcast transcript
     llm_response = await llm.ainvoke(messages)
 
-    # First try the direct approach
+    # Reasoning models (e.g. Kimi K2.5) may return content as a list of
+    # blocks including 'reasoning' entries.  Normalise to a plain string.
+    content = strip_markdown_fences(extract_text_content(llm_response.content))
+
     try:
-        podcast_transcript = PodcastTranscripts.model_validate(
-            json.loads(llm_response.content)
-        )
-    except (json.JSONDecodeError, ValueError) as e:
+        podcast_transcript = PodcastTranscripts.model_validate(json.loads(content))
+    except (json.JSONDecodeError, TypeError, ValueError) as e:
         print(f"Direct JSON parsing failed, trying fallback approach: {e!s}")
 
-        # Fallback: Parse the JSON response manually
         try:
-            # Extract JSON content from the response
-            content = llm_response.content
-
-            # Find the JSON in the content (handle case where LLM might add additional text)
             json_start = content.find("{")
             json_end = content.rfind("}") + 1
             if json_start >= 0 and json_end > json_start:
                 json_str = content[json_start:json_end]
-
-                # Parse the JSON string
                 parsed_data = json.loads(json_str)
-
-                # Convert to Pydantic model
                 podcast_transcript = PodcastTranscripts.model_validate(parsed_data)
-
                 print("Successfully parsed podcast transcript using fallback approach")
             else:
-                # If JSON structure not found, raise a clear error
                 error_message = f"Could not find valid JSON in LLM response. Raw response: {content}"
                 print(error_message)
                 raise ValueError(error_message)
 
-        except (json.JSONDecodeError, ValueError) as e2:
-            # Log the error and re-raise it
+        except (json.JSONDecodeError, TypeError, ValueError) as e2:
             error_message = f"Error parsing LLM response (fallback also failed): {e2!s}"
             print(f"Error parsing LLM response: {e2!s}")
-            print(f"Raw response: {llm_response.content}")
+            print(f"Raw response: {content}")
             raise
 
     return {"podcast_transcript": podcast_transcript.podcast_transcripts}
diff --git a/surfsense_backend/app/agents/video_presentation/nodes.py b/surfsense_backend/app/agents/video_presentation/nodes.py
index 1b3d71e844..bdc0f80f8e 100644
--- a/surfsense_backend/app/agents/video_presentation/nodes.py
+++ b/surfsense_backend/app/agents/video_presentation/nodes.py
@@ -16,6 +16,7 @@
 from app.config import config as app_config
 from app.services.kokoro_tts_service import get_kokoro_tts_service
 from app.services.llm_service import get_agent_llm
+from app.utils.content_utils import extract_text_content, strip_markdown_fences
 
 from .configuration import Configuration
 from .prompts import (
@@ -67,16 +68,14 @@ async def create_presentation_slides(
     ]
 
     llm_response = await llm.ainvoke(messages)
+    content = strip_markdown_fences(extract_text_content(llm_response.content))
 
     try:
-        presentation = PresentationSlides.model_validate(
-            json.loads(llm_response.content)
-        )
-    except (json.JSONDecodeError, ValueError) as e:
+        presentation = PresentationSlides.model_validate(json.loads(content))
+    except (json.JSONDecodeError, TypeError, ValueError) as e:
         print(f"Direct JSON parsing failed, trying fallback approach: {e!s}")
 
         try:
-            content = llm_response.content
             json_start = content.find("{")
             json_end = content.rfind("}") + 1
             if json_start >= 0 and json_end > json_start:
@@ -89,10 +88,10 @@ async def create_presentation_slides(
                 print(error_message)
                 raise ValueError(error_message)
 
-        except (json.JSONDecodeError, ValueError) as e2:
+        except (json.JSONDecodeError, TypeError, ValueError) as e2:
             error_message = f"Error parsing LLM response (fallback also failed): {e2!s}"
             print(f"Error parsing LLM response: {e2!s}")
-            print(f"Raw response: {llm_response.content}")
+            print(f"Raw response: {content}")
             raise
 
     return {"slides": presentation.slides}
@@ -308,12 +307,7 @@ async def _assign_themes_with_llm(
             ]
         )
 
-        text = response.content.strip()
-        if text.startswith("```"):
-            lines = text.split("\n")
-            text = "\n".join(
-                line for line in lines if not line.strip().startswith("```")
-            ).strip()
+        text = strip_markdown_fences(extract_text_content(response.content))
 
         assignments = json.loads(text)
         valid_themes = set(THEME_PRESETS)
@@ -424,7 +418,9 @@ async def _generate_scene_for_slide(slide: SlideContent) -> SlideSceneCode:
         )
 
         llm_response = await llm.ainvoke(messages)
-        code, scene_title = _extract_code_and_title(llm_response.content)
+        code, scene_title = _extract_code_and_title(
+            extract_text_content(llm_response.content)
+        )
 
         code = await _refine_if_needed(llm, code, slide.slide_number)
 
@@ -452,7 +448,7 @@ def _extract_code_and_title(content: str) -> tuple[str, str | None]:
 
     Returns (code, title) where title may be None.
     """
-    text = content.strip()
+    text = strip_markdown_fences(content)
 
     if text.startswith("{"):
         try:
@@ -472,18 +468,7 @@ def _extract_code_and_title(content: str) -> tuple[str, str | None]:
             except (json.JSONDecodeError, ValueError):
                 pass
 
-    code = text
-    if code.startswith("```"):
-        lines = code.split("\n")
-        start = 1
-        end = len(lines)
-        for i in range(len(lines) - 1, 0, -1):
-            if lines[i].strip().startswith("```"):
-                end = i
-                break
-        code = "\n".join(lines[start:end]).strip()
-
-    return code, None
+    return text, None
 
 
 async def _refine_if_needed(llm, code: str, slide_number: int) -> str:
@@ -512,7 +497,7 @@ async def _refine_if_needed(llm, code: str, slide_number: int) -> str:
         ]
 
         response = await llm.ainvoke(messages)
-        code, _ = _extract_code_and_title(response.content)
+        code, _ = _extract_code_and_title(extract_text_content(response.content))
 
         error = _basic_syntax_check(code)
         if error is None:
diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py
index bf9729996d..5130aa47e5 100644
--- a/surfsense_backend/app/app.py
+++ b/surfsense_backend/app/app.py
@@ -2,19 +2,20 @@
 import gc
 import logging
 import time
+import uuid
 from collections import defaultdict
 from contextlib import asynccontextmanager
+from datetime import UTC, datetime
 from threading import Lock
 
 import redis
 from fastapi import Depends, FastAPI, HTTPException, Request, status
+from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
-from limits.storage import MemoryStorage
-from slowapi import Limiter
 from slowapi.errors import RateLimitExceeded
 from slowapi.middleware import SlowAPIMiddleware
-from slowapi.util import get_remote_address
+from slowapi.util import get_remote_address  # noqa: F401 — kept for reference
 from sqlalchemy.ext.asyncio import AsyncSession
 from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
 from starlette.requests import Request as StarletteRequest
@@ -29,21 +30,28 @@
     config,
     initialize_image_gen_router,
     initialize_llm_router,
+    initialize_openrouter_integration,
+    initialize_pricing_registration,
     initialize_vision_llm_router,
 )
 from app.db import User, create_db_and_tables, get_async_session
+from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
 from app.middleware.proxy_auth import ProxyAuthMiddleware
+from app.rate_limiter import get_real_client_ip, limiter
 from app.routes import router as crud_router
 from app.routes.auth_routes import router as auth_router
-from app.schemas import UserRead, UserUpdate
+from app.schemas import UserCreate, UserRead, UserUpdate
 from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
 from app.users import (
+    auth_backend,
     current_active_user,
     fastapi_users,
     get_user_manager,
 )
 from app.utils.perf import get_perf_logger, log_system_snapshot
 
+_error_logger = logging.getLogger("surfsense.errors")
+
 rate_limit_logger = logging.getLogger("surfsense.rate_limit")
 
 
@@ -53,26 +61,204 @@
 # Uses the same Redis instance as Celery for zero additional infrastructure.
 # Protects auth endpoints from brute force and user enumeration attacks.
 
-# SlowAPI limiter — provides default rate limits (1024/min) for ALL routes
-# via the ASGI middleware. This is the general safety net.
-# in_memory_fallback ensures requests are still served (with per-worker
-# in-memory limiting) when Redis is unreachable, instead of hanging.
-limiter = Limiter(
-    key_func=get_remote_address,
-    storage_uri=config.REDIS_APP_URL,
-    default_limits=["1024/minute"],
-    in_memory_fallback_enabled=True,
-    in_memory_fallback=[MemoryStorage()],
-)
+# limiter is imported from app.rate_limiter (shared module to avoid circular imports)
+
+
+def _get_request_id(request: Request) -> str:
+    """Return the request ID from state, header, or generate a new one."""
+    if hasattr(request.state, "request_id"):
+        return request.state.request_id
+    return request.headers.get("X-Request-ID", f"req_{uuid.uuid4().hex[:12]}")
+
+
+def _build_error_response(
+    status_code: int,
+    message: str,
+    *,
+    code: str = "INTERNAL_ERROR",
+    request_id: str = "",
+    extra_headers: dict[str, str] | None = None,
+) -> JSONResponse:
+    """Build the standardized error envelope (new ``error`` + legacy ``detail``)."""
+    body = {
+        "error": {
+            "code": code,
+            "message": message,
+            "status": status_code,
+            "request_id": request_id,
+            "timestamp": datetime.now(UTC).isoformat(),
+            "report_url": ISSUES_URL,
+        },
+        "detail": message,
+    }
+    headers = {"X-Request-ID": request_id}
+    if extra_headers:
+        headers.update(extra_headers)
+    return JSONResponse(status_code=status_code, content=body, headers=headers)
+
+
+# ---------------------------------------------------------------------------
+# Global exception handlers
+# ---------------------------------------------------------------------------
+
+
+def _surfsense_error_handler(request: Request, exc: SurfSenseError) -> JSONResponse:
+    """Handle our own structured exceptions."""
+    rid = _get_request_id(request)
+    if exc.status_code >= 500:
+        _error_logger.error(
+            "[%s] %s - %s: %s",
+            rid,
+            request.url.path,
+            exc.code,
+            exc,
+            exc_info=True,
+        )
+    message = exc.message if exc.safe_for_client else GENERIC_5XX_MESSAGE
+    return _build_error_response(
+        exc.status_code, message, code=exc.code, request_id=rid
+    )
+
+
+def _http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
+    """Wrap FastAPI/Starlette HTTPExceptions into the standard envelope.
+
+    5xx sanitization policy:
+    - 500 responses are sanitized (replaced with ``GENERIC_5XX_MESSAGE``) because
+      they usually wrap raw internal errors and may leak sensitive info.
+    - Other 5xx statuses (501, 502, 503, 504, ...) are raised explicitly by
+      route code to communicate a specific, user-safe operational state
+      (e.g. 503 "Page purchases are temporarily unavailable."). Those details
+      are preserved so the frontend can render them, but the error is still
+      logged server-side.
+    """
+    rid = _get_request_id(request)
+    should_sanitize = exc.status_code == 500
+
+    # Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."})
+    # are preserved so the frontend can parse them.
+    if isinstance(exc.detail, dict):
+        err_code = exc.detail.get("code", _status_to_code(exc.status_code))
+        message = exc.detail.get("message", str(exc.detail))
+        if exc.status_code >= 500:
+            _error_logger.error(
+                "[%s] %s - HTTPException %d: %s",
+                rid,
+                request.url.path,
+                exc.status_code,
+                message,
+            )
+        elif exc.status_code >= 400:
+            _error_logger.warning(
+                "[%s] %s %s - HTTPException %d: %s",
+                rid,
+                request.method,
+                request.url.path,
+                exc.status_code,
+                message,
+            )
+        if should_sanitize:
+            message = GENERIC_5XX_MESSAGE
+            err_code = "INTERNAL_ERROR"
+        body = {
+            "error": {
+                "code": err_code,
+                "message": message,
+                "status": exc.status_code,
+                "request_id": rid,
+                "timestamp": datetime.now(UTC).isoformat(),
+                "report_url": ISSUES_URL,
+            },
+            "detail": exc.detail,
+        }
+        return JSONResponse(
+            status_code=exc.status_code,
+            content=body,
+            headers={"X-Request-ID": rid},
+        )
+
+    detail = exc.detail if isinstance(exc.detail, str) else str(exc.detail)
+    if exc.status_code >= 500:
+        _error_logger.error(
+            "[%s] %s - HTTPException %d: %s",
+            rid,
+            request.url.path,
+            exc.status_code,
+            detail,
+        )
+    elif exc.status_code >= 400:
+        _error_logger.warning(
+            "[%s] %s %s - HTTPException %d: %s",
+            rid,
+            request.method,
+            request.url.path,
+            exc.status_code,
+            detail,
+        )
+    if should_sanitize:
+        detail = GENERIC_5XX_MESSAGE
+    code = _status_to_code(exc.status_code, detail)
+    return _build_error_response(exc.status_code, detail, code=code, request_id=rid)
+
+
+def _validation_error_handler(
+    request: Request, exc: RequestValidationError
+) -> JSONResponse:
+    """Return 422 with field-level detail in the standard envelope."""
+    rid = _get_request_id(request)
+    fields = []
+    for err in exc.errors():
+        loc = " -> ".join(str(part) for part in err.get("loc", []))
+        fields.append(f"{loc}: {err.get('msg', 'invalid')}")
+    message = (
+        f"Validation failed: {'; '.join(fields)}" if fields else "Validation failed."
+    )
+    return _build_error_response(422, message, code="VALIDATION_ERROR", request_id=rid)
+
+
+def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
+    """Catch-all: log full traceback, return sanitized 500."""
+    rid = _get_request_id(request)
+    _error_logger.error(
+        "[%s] Unhandled exception on %s %s",
+        rid,
+        request.method,
+        request.url.path,
+        exc_info=True,
+    )
+    return _build_error_response(
+        500, GENERIC_5XX_MESSAGE, code="INTERNAL_ERROR", request_id=rid
+    )
+
+
+def _status_to_code(status_code: int, detail: str = "") -> str:
+    if detail == "RATE_LIMIT_EXCEEDED":
+        return "RATE_LIMIT_EXCEEDED"
+    mapping = {
+        400: "BAD_REQUEST",
+        401: "UNAUTHORIZED",
+        403: "FORBIDDEN",
+        404: "NOT_FOUND",
+        405: "METHOD_NOT_ALLOWED",
+        409: "CONFLICT",
+        422: "VALIDATION_ERROR",
+        429: "RATE_LIMIT_EXCEEDED",
+    }
+    return mapping.get(
+        status_code, "INTERNAL_ERROR" if status_code >= 500 else "CLIENT_ERROR"
+    )
 
 
 def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
-    """Custom 429 handler that returns JSON matching our frontend error format."""
+    """Custom 429 handler that returns JSON matching our error envelope."""
+    rid = _get_request_id(request)
     retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60"
-    return JSONResponse(
-        status_code=429,
-        content={"detail": "RATE_LIMIT_EXCEEDED"},
-        headers={"Retry-After": retry_after},
+    return _build_error_response(
+        429,
+        "Too many requests. Please slow down and try again.",
+        code="RATE_LIMIT_EXCEEDED",
+        request_id=rid,
+        extra_headers={"Retry-After": retry_after},
     )
 
 
@@ -142,7 +328,7 @@ def _check_rate_limit(
     Uses atomic INCR + EXPIRE to avoid race conditions.
     Falls back to in-memory sliding window if Redis is unavailable.
     """
-    client_ip = get_remote_address(request)
+    client_ip = get_real_client_ip(request)
     key = f"surfsense:auth_rate_limit:{scope}:{client_ip}"
 
     try:
@@ -221,6 +407,155 @@ def _enable_slow_callback_logging(threshold_sec: float = 0.5) -> None:
     )
 
 
+def _start_openrouter_background_refresh() -> None:
+    """Start periodic OpenRouter model refresh if integration is enabled."""
+    from app.services.openrouter_integration_service import OpenRouterIntegrationService
+
+    if not OpenRouterIntegrationService.is_initialized():
+        return
+    settings = config.OPENROUTER_INTEGRATION_SETTINGS
+    if settings:
+        interval = settings.get("refresh_interval_hours", 24)
+        OpenRouterIntegrationService.get_instance().start_background_refresh(interval)
+
+
+def _stop_openrouter_background_refresh() -> None:
+    """Cancel the periodic OpenRouter refresh task on shutdown."""
+    from app.services.openrouter_integration_service import OpenRouterIntegrationService
+
+    if OpenRouterIntegrationService.is_initialized():
+        OpenRouterIntegrationService.get_instance().stop_background_refresh()
+
+
+async def _warm_agent_jit_caches() -> None:
+    """Pay the LangChain / LangGraph / Deepagents JIT cost at startup.
+
+    Why
+    ----
+    A cold ``create_agent`` + ``StateGraph.compile()`` + Pydantic schema
+    generation chain takes 1.5-2 seconds of pure CPU on first invocation
+    inside any Python process: the graph compiler builds reducers,
+    Pydantic v2 generates and JITs validator schemas, deepagents
+    eagerly compiles its general-purpose subagent, etc. Subsequent
+    compiles in the same process pay only ~50% of that cost (the lazy
+    JIT bits are cached in module-level dicts).
+
+    Doing one throwaway compile during ``lifespan`` startup pre-pays
+    that cost so the *first real request* doesn't. We do NOT prime
+    :mod:`agent_cache` because the cache key requires real
+    ``thread_id`` / ``user_id`` / ``search_space_id`` / etc. — the
+    throwaway agent is genuinely thrown away and immediately collected.
+
+    Safety
+    ------
+    * No DB access. We construct a stub LLM (no real keys), pass an
+      empty tools list, and pass ``checkpointer=None`` so we never
+      touch Postgres.
+    * Bounded by ``asyncio.wait_for`` so a hang here can never block
+      worker startup. On any failure, we log + swallow — the worst
+      case is the first real request pays the full cold cost (i.e.
+      pre-warmup behaviour).
+    """
+    import time as _time
+
+    logger = logging.getLogger(__name__)
+    t0 = _time.perf_counter()
+    try:
+        from langchain.agents import create_agent
+        from langchain.agents.middleware import (
+            ModelCallLimitMiddleware,
+            TodoListMiddleware,
+            ToolCallLimitMiddleware,
+        )
+        from langchain_core.language_models.fake_chat_models import (
+            FakeListChatModel,
+        )
+        from langchain_core.tools import tool
+
+        from app.agents.new_chat.context import SurfSenseContextSchema
+
+        # Minimal LLM stub. ``FakeListChatModel`` satisfies
+        # ``BaseChatModel`` without any network or auth — perfect for
+        # exercising the compile path without side effects.
+        stub_llm = FakeListChatModel(responses=["warmup-response"])
+
+        # Two trivial tools with arg + return schemas — exercises the
+        # Pydantic v2 schema JIT path. Without at least one tool the
+        # graph compile skips the tool-loop bytecode generation that
+        # accounts for ~30-50% of cold compile cost.
+        @tool
+        def _warmup_tool_a(query: str, limit: int = 5) -> str:
+            """Warmup tool A — never actually invoked."""
+            return query[:limit]
+
+        @tool
+        def _warmup_tool_b(name: str, value: float | None = None) -> dict[str, object]:
+            """Warmup tool B — never actually invoked."""
+            return {"name": name, "value": value}
+
+        # A handful of common middleware so the compile pre-pays the
+        # ``AgentMiddleware`` resolver path. These instances never run
+        # because the throwaway agent is immediately collected.
+        # ``SubAgentMiddleware`` is the single heaviest line in cold
+        # ``create_surfsense_deep_agent`` (1.5-2s of CPU per call to
+        # compile its general-purpose subagent's full inner graph),
+        # so we include it here to make sure that compile path is JIT'd.
+        warmup_middleware: list = [
+            TodoListMiddleware(),
+            ModelCallLimitMiddleware(
+                thread_limit=120, run_limit=80, exit_behavior="end"
+            ),
+            ToolCallLimitMiddleware(
+                thread_limit=300, run_limit=80, exit_behavior="continue"
+            ),
+        ]
+        try:
+            from deepagents import SubAgentMiddleware
+            from deepagents.backends import StateBackend
+            from deepagents.middleware.subagents import GENERAL_PURPOSE_SUBAGENT
+
+            gp_warmup_spec = {  # type: ignore[var-annotated]
+                **GENERAL_PURPOSE_SUBAGENT,
+                "model": stub_llm,
+                "tools": [_warmup_tool_a],
+                "middleware": [TodoListMiddleware()],
+            }
+            warmup_middleware.append(
+                SubAgentMiddleware(backend=StateBackend, subagents=[gp_warmup_spec])
+            )
+        except Exception:
+            # Deepagents missing/incompatible — middleware-only warmup
+            # still produces a useful (smaller) speedup.
+            logger.debug("[startup] SubAgentMiddleware warmup skipped", exc_info=True)
+
+        compiled = create_agent(
+            stub_llm,
+            tools=[_warmup_tool_a, _warmup_tool_b],
+            system_prompt="You are a warmup stub.",
+            middleware=warmup_middleware,
+            context_schema=SurfSenseContextSchema,
+            checkpointer=None,
+        )
+
+        # Touch the compiled graph's stream_channels / nodes so any
+        # remaining lazy schema work fires now instead of on first
+        # real invocation.
+        _ = list(getattr(compiled, "nodes", {}).keys())
+
+        del compiled
+        logger.info(
+            "[startup] Agent JIT warmup completed in %.3fs",
+            _time.perf_counter() - t0,
+        )
+    except Exception:
+        logger.warning(
+            "[startup] Agent JIT warmup failed in %.3fs (non-fatal — first "
+            "real request will pay the full compile cost)",
+            _time.perf_counter() - t0,
+            exc_info=True,
+        )
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # Tune GC: lower gen-2 threshold so long-lived garbage is collected
@@ -231,6 +566,9 @@ async def lifespan(app: FastAPI):
     _enable_slow_callback_logging(threshold_sec=0.5)
     await create_db_and_tables()
     await setup_checkpointer_tables()
+    initialize_openrouter_integration()
+    _start_openrouter_background_refresh()
+    initialize_pricing_registration()
     initialize_llm_router()
     initialize_image_gen_router()
     initialize_vision_llm_router()
@@ -242,14 +580,44 @@ async def lifespan(app: FastAPI):
             "Docs will be indexed on the next restart."
         )
 
+    # Phase 1.7 — JIT warmup. Bounded so a stuck warmup never delays
+    # worker readiness. ``shield`` so Uvicorn cancelling startup
+    # doesn't leave half-warmed Pydantic schemas in an inconsistent
+    # state.
+    try:
+        await asyncio.wait_for(asyncio.shield(_warm_agent_jit_caches()), timeout=20)
+    except (TimeoutError, Exception):  # pragma: no cover - defensive
+        logging.getLogger(__name__).warning(
+            "[startup] Agent JIT warmup hit timeout/error — skipping; "
+            "first real request will pay the full compile cost."
+        )
+
     log_system_snapshot("startup_complete")
 
     yield
 
+    _stop_openrouter_background_refresh()
     await close_checkpointer()
 
 
 def registration_allowed():
+    """Master auth kill switch keyed on the REGISTRATION_ENABLED env var.
+
+    Despite the name, this dependency does NOT only gate registration. When
+    REGISTRATION_ENABLED is FALSE it intentionally blocks every auth surface
+    that could mint or refresh a session for an attacker:
+
+    * email/password ``POST /auth/register``
+    * email/password ``POST /auth/jwt/login``
+    * the Google OAuth router (``/auth/google/authorize`` and the shared
+      ``/auth/google/callback`` handles both new signups and login for
+      existing users, so flipping this off locks both)
+    * the bespoke ``/auth/google/authorize-redirect`` helper used by the UI
+
+    Use it as a temporary "freeze all new sessions" lever during incident
+    response. It is not a way to disable signup while keeping login working;
+    for that, override ``UserManager.oauth_callback`` instead.
+    """
     if not config.REGISTRATION_ENABLED:
         raise HTTPException(
             status_code=status.HTTP_403_FORBIDDEN, detail="Registration is disabled"
@@ -263,6 +631,33 @@ def registration_allowed():
 app.state.limiter = limiter
 app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
 
+# Register structured global exception handlers (order matters: most specific first)
+app.add_exception_handler(SurfSenseError, _surfsense_error_handler)
+app.add_exception_handler(RequestValidationError, _validation_error_handler)
+app.add_exception_handler(HTTPException, _http_exception_handler)
+app.add_exception_handler(Exception, _unhandled_exception_handler)
+
+
+# ---------------------------------------------------------------------------
+# Request-ID middleware
+# ---------------------------------------------------------------------------
+
+
+class RequestIDMiddleware(BaseHTTPMiddleware):
+    """Attach a unique request ID to every request and echo it in the response."""
+
+    async def dispatch(
+        self, request: StarletteRequest, call_next: RequestResponseEndpoint
+    ) -> StarletteResponse:
+        request_id = request.headers.get("X-Request-ID", f"req_{uuid.uuid4().hex[:12]}")
+        request.state.request_id = request_id
+        response = await call_next(request)
+        response.headers["X-Request-ID"] = request_id
+        return response
+
+
+app.add_middleware(RequestIDMiddleware)
+
 
 # ---------------------------------------------------------------------------
 # Request-level performance middleware
@@ -379,8 +774,50 @@ async def dispatch(
     allow_credentials=True,
     allow_methods=["*"],  # Allows all methods
     allow_headers=["*"],  # Allows all headers
+    # Cache CORS preflight (OPTIONS) responses for 24h. Browsers clamp:
+    # Chrome/Edge cap at 7200s, Firefox honours up to 86400s. Setting the
+    # higher value lets each browser cache for as long as it allows. This
+    # eliminates an OPTIONS round-trip on every non-simple request from
+    # FRONTEND_URL to BACKEND_URL.
+    max_age=86400,
 )
 
+# Password / email-based auth routers are only mounted when not running in
+# Google-OAuth-only mode. Mounting them in OAuth-only prod previously left
+# POST /auth/register reachable, which is the bypass that allowed bots to
+# create non-OAuth users in spite of AUTH_TYPE=GOOGLE.
+if config.AUTH_TYPE not in ("GOOGLE", "SSO"):
+    app.include_router(
+        fastapi_users.get_auth_router(auth_backend),
+        prefix="/auth/jwt",
+        tags=["auth"],
+        dependencies=[
+            Depends(rate_limit_login),
+            Depends(
+                registration_allowed
+            ),  # honour REGISTRATION_ENABLED kill switch on login too
+        ],
+    )
+    app.include_router(
+        fastapi_users.get_register_router(UserRead, UserCreate),
+        prefix="/auth",
+        tags=["auth"],
+        dependencies=[
+            Depends(rate_limit_register),
+            Depends(registration_allowed),
+        ],
+    )
+    app.include_router(
+        fastapi_users.get_reset_password_router(),
+        prefix="/auth",
+        tags=["auth"],
+        dependencies=[Depends(rate_limit_password_reset)],
+    )
+    app.include_router(
+        fastapi_users.get_verify_router(UserRead),
+        prefix="/auth",
+        tags=["auth"],
+    )
 
 # Register /users/me BEFORE fastapi_users.get_users_router so our routes take
 # precedence (FastAPI first-match wins). fastapi-users' internal /users/me only
@@ -405,8 +842,6 @@ async def update_current_user_me(
     # JWT-loaded one. user_manager.get() returns the session-attached instance.
     db_user = await user_manager.get(user.id)
     return await user_manager.update(user_update, db_user, safe=True, request=request)
-
-
 app.include_router(
     fastapi_users.get_users_router(UserRead, UserUpdate),
     prefix="/users",
@@ -417,6 +852,130 @@ async def update_current_user_me(
 # Include custom auth routes (refresh token, logout)
 app.include_router(auth_router)
 
+if config.AUTH_TYPE == "GOOGLE":
+    from fastapi.responses import RedirectResponse
+
+    from app.users import google_oauth_client
+
+    # Determine if we're in a secure context (HTTPS) or local development (HTTP)
+    # The CSRF cookie must have secure=False for HTTP (localhost development)
+    is_secure_context = config.BACKEND_URL and config.BACKEND_URL.startswith("https://")
+
+    # For cross-origin OAuth (frontend and backend on different domains):
+    # - SameSite=None is required to allow cross-origin cookie setting
+    # - Secure=True is required when SameSite=None
+    # For same-origin or local development, use SameSite=Lax (default)
+    csrf_cookie_samesite = "none" if is_secure_context else "lax"
+
+    # Extract the domain from BACKEND_URL for cookie domain setting
+    # This helps with cross-site cookie issues in Firefox/Safari
+    csrf_cookie_domain = None
+    if config.BACKEND_URL:
+        from urllib.parse import urlparse
+
+        parsed_url = urlparse(config.BACKEND_URL)
+        csrf_cookie_domain = parsed_url.hostname
+
+    app.include_router(
+        fastapi_users.get_oauth_router(
+            google_oauth_client,
+            auth_backend,
+            SECRET,
+            is_verified_by_default=True,
+            csrf_token_cookie_secure=is_secure_context,
+            csrf_token_cookie_samesite=csrf_cookie_samesite,
+            csrf_token_cookie_httponly=False,  # Required for cross-site OAuth in Firefox/Safari
+        )
+        if not config.BACKEND_URL
+        else fastapi_users.get_oauth_router(
+            google_oauth_client,
+            auth_backend,
+            SECRET,
+            is_verified_by_default=True,
+            redirect_url=f"{config.BACKEND_URL}/auth/google/callback",
+            csrf_token_cookie_secure=is_secure_context,
+            csrf_token_cookie_samesite=csrf_cookie_samesite,
+            csrf_token_cookie_httponly=False,  # Required for cross-site OAuth in Firefox/Safari
+            csrf_token_cookie_domain=csrf_cookie_domain,  # Explicitly set cookie domain
+        ),
+        prefix="/auth/google",
+        tags=["auth"],
+        # REGISTRATION_ENABLED is a master auth kill switch: when set to FALSE
+        # it blocks BOTH new OAuth signups AND login of existing OAuth users
+        # (the fastapi-users OAuth router shares one callback for create+login,
+        # so this dependency closes both paths together).
+        dependencies=[Depends(registration_allowed)],
+    )
+
+    # Add a redirect-based authorize endpoint for Firefox/Safari compatibility
+    # This endpoint performs a server-side redirect instead of returning JSON
+    # which fixes cross-site cookie issues where browsers don't send cookies
+    # set via cross-origin fetch requests on subsequent redirects.
+    # The registration_allowed dependency mirrors the OAuth router above so
+    # the kill switch fails fast here instead of bouncing users to Google
+    # only to 403 on the callback.
+    @app.get(
+        "/auth/google/authorize-redirect",
+        tags=["auth"],
+        dependencies=[Depends(registration_allowed)],
+    )
+    async def google_authorize_redirect(
+        request: Request,
+    ):
+        """
+        Redirect-based OAuth authorization endpoint.
+
+        Unlike the standard /auth/google/authorize endpoint that returns JSON,
+        this endpoint directly redirects the browser to Google's OAuth page.
+        This fixes CSRF cookie issues in Firefox and Safari where cookies set
+        via cross-origin fetch requests are not sent on subsequent redirects.
+        """
+        import secrets
+
+        from fastapi_users.router.oauth import generate_state_token
+
+        # Generate CSRF token
+        csrf_token = secrets.token_urlsafe(32)
+
+        # Build state token
+        state_data = {"csrftoken": csrf_token}
+        state = generate_state_token(state_data, SECRET, lifetime_seconds=3600)
+
+        # Get the callback URL
+        if config.BACKEND_URL:
+            redirect_url = f"{config.BACKEND_URL}/auth/google/callback"
+        else:
+            redirect_url = str(request.url_for("oauth:google.jwt.callback"))
+
+        # Get authorization URL from Google
+        authorization_url = await google_oauth_client.get_authorization_url(
+            redirect_url,
+            state,
+            scope=["openid", "email", "profile"],
+        )
+
+        # Create redirect response and set CSRF cookie
+        response = RedirectResponse(url=authorization_url, status_code=302)
+        response.set_cookie(
+            key="fastapiusersoauthcsrf",
+            value=csrf_token,
+            max_age=3600,
+            path="/",
+            domain=csrf_cookie_domain,
+            secure=is_secure_context,
+            httponly=False,  # Required for cross-site OAuth in Firefox/Safari
+            samesite=csrf_cookie_samesite,
+        )
+
+        return response
+
+
+# Anonymous (no-login) chat routes — mounted at /api/v1/public/anon-chat
+from app.routes.anonymous_chat_routes import (  # noqa: E402
+    router as anonymous_chat_router,
+)
+
+app.include_router(anonymous_chat_router)
 
 app.include_router(crud_router, prefix="/api/v1", tags=["crud"])
 
@@ -428,6 +987,36 @@ async def health_check():
     return {"status": "ok"}
 
 
+@app.get("/ready", tags=["health"])
+@limiter.exempt
+async def readiness_check():
+    """Readiness probe.
+
+    Verifies that the schema state required by downstream services is
+    present. Specifically checks that the ``zero_publication`` Postgres
+    logical-replication publication exists; without it zero-cache crash-loops
+    on `Unknown or invalid publications`.
+
+    Returns 200 when ready, 503 otherwise. Used by the docker-compose
+    backend healthcheck and by ``install.ps1`` / ``install.sh`` post-up
+    verification.
+    """
+    from sqlalchemy import text
+
+    from app.db import async_session_maker
+
+    async with async_session_maker() as session:
+        result = await session.execute(
+            text("SELECT 1 FROM pg_publication WHERE pubname = 'zero_publication'")
+        )
+        if result.first() is None:
+            raise HTTPException(
+                status_code=503,
+                detail="zero_publication missing; run alembic upgrade head",
+            )
+    return {"status": "ready"}
+
+
 @app.get("/verify-token")
 async def authenticated_route(
     user: User = Depends(current_active_user),
diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py
index bf2fdcb39d..74710d5e1f 100644
--- a/surfsense_backend/app/celery_app.py
+++ b/surfsense_backend/app/celery_app.py
@@ -21,9 +21,13 @@ def init_worker(**kwargs):
     from app.config import (
         initialize_image_gen_router,
         initialize_llm_router,
+        initialize_openrouter_integration,
+        initialize_pricing_registration,
         initialize_vision_llm_router,
     )
 
+    initialize_openrouter_integration()
+    initialize_pricing_registration()
     initialize_llm_router()
     initialize_image_gen_router()
     initialize_vision_llm_router()
@@ -88,6 +92,7 @@ def parse_schedule_interval(interval: str) -> dict:
         "app.tasks.celery_tasks.podcast_tasks",
         "app.tasks.celery_tasks.video_presentation_tasks",
         "app.tasks.celery_tasks.connector_tasks",
+        "app.tasks.celery_tasks.obsidian_tasks",
         "app.tasks.celery_tasks.schedule_checker_task",
         "app.tasks.celery_tasks.document_reindex_tasks",
         "app.tasks.celery_tasks.stale_notification_cleanup_task",
@@ -133,25 +138,17 @@ def parse_schedule_interval(interval: str) -> dict:
     # never block fast user-facing tasks (file uploads, podcasts, etc.)
     task_routes={
         # Connector indexing tasks → connectors queue
-        "index_slack_messages": {"queue": CONNECTORS_QUEUE},
         "index_notion_pages": {"queue": CONNECTORS_QUEUE},
         "index_github_repos": {"queue": CONNECTORS_QUEUE},
-        "index_linear_issues": {"queue": CONNECTORS_QUEUE},
-        "index_jira_issues": {"queue": CONNECTORS_QUEUE},
         "index_confluence_pages": {"queue": CONNECTORS_QUEUE},
-        "index_clickup_tasks": {"queue": CONNECTORS_QUEUE},
         "index_google_calendar_events": {"queue": CONNECTORS_QUEUE},
-        "index_airtable_records": {"queue": CONNECTORS_QUEUE},
         "index_google_gmail_messages": {"queue": CONNECTORS_QUEUE},
         "index_google_drive_files": {"queue": CONNECTORS_QUEUE},
-        "index_discord_messages": {"queue": CONNECTORS_QUEUE},
-        "index_teams_messages": {"queue": CONNECTORS_QUEUE},
-        "index_luma_events": {"queue": CONNECTORS_QUEUE},
         "index_elasticsearch_documents": {"queue": CONNECTORS_QUEUE},
         "index_crawled_urls": {"queue": CONNECTORS_QUEUE},
         "index_bookstack_pages": {"queue": CONNECTORS_QUEUE},
-        "index_obsidian_vault": {"queue": CONNECTORS_QUEUE},
         "index_composio_connector": {"queue": CONNECTORS_QUEUE},
+        "index_obsidian_attachment": {"queue": CONNECTORS_QUEUE},
         # Everything else (document processing, podcasts, reindexing,
         # schedule checker, cleanup) stays on the default fast queue.
     },
@@ -187,4 +184,11 @@ def parse_schedule_interval(interval: str) -> dict:
             "expires": 60,
         },
     },
+    "reconcile-pending-stripe-token-purchases": {
+        "task": "reconcile_pending_stripe_token_purchases",
+        "schedule": crontab(**stripe_reconciliation_schedule_params),
+        "options": {
+            "expires": 60,
+        },
+    },
 }
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index ca7d7d47d5..373f06eeb8 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -13,6 +13,9 @@
 env_file = BASE_DIR / ".env"
 load_dotenv(env_file)
 
+os.environ.setdefault("OR_APP_NAME", "SurfSense")
+os.environ.setdefault("OR_SITE_URL", "https://surfsense.com")
+
 
 def is_ffmpeg_installed():
     """
@@ -42,7 +45,72 @@ def load_global_llm_configs():
     try:
         with open(global_config_file, encoding="utf-8") as f:
             data = yaml.safe_load(f)
-            return data.get("global_llm_configs", [])
+            configs = data.get("global_llm_configs", [])
+
+        # Lazy import keeps the `app.config` -> `app.services` edge one-way
+        # and matches the `provider_api_base` pattern used elsewhere.
+        from app.services.provider_capabilities import derive_supports_image_input
+
+        seen_slugs: dict[str, int] = {}
+        for cfg in configs:
+            cfg.setdefault("billing_tier", "free")
+            cfg.setdefault("anonymous_enabled", False)
+            cfg.setdefault("seo_enabled", False)
+            # Capability flag: explicit YAML override always wins. When the
+            # operator has not annotated the model, defer to LiteLLM's
+            # authoritative model map (`supports_vision`) which already
+            # knows GPT-5.x / GPT-4o / Claude 3.x / Gemini 2.x are
+            # vision-capable. Unknown / unmapped models default-allow so
+            # we don't lock the user out of a freshly added third-party
+            # entry; the streaming-task safety net (driven by
+            # `is_known_text_only_chat_model`) is the only place a False
+            # actually blocks a request.
+            if "supports_image_input" not in cfg:
+                litellm_params = cfg.get("litellm_params") or {}
+                base_model = (
+                    litellm_params.get("base_model")
+                    if isinstance(litellm_params, dict)
+                    else None
+                )
+                cfg["supports_image_input"] = derive_supports_image_input(
+                    provider=cfg.get("provider"),
+                    model_name=cfg.get("model_name"),
+                    base_model=base_model,
+                    custom_provider=cfg.get("custom_provider"),
+                )
+
+            if cfg.get("seo_enabled") and cfg.get("seo_slug"):
+                slug = cfg["seo_slug"]
+                if slug in seen_slugs:
+                    print(
+                        f"Warning: Duplicate seo_slug '{slug}' in global LLM configs "
+                        f"(ids {seen_slugs[slug]} and {cfg.get('id')})"
+                    )
+                else:
+                    seen_slugs[slug] = cfg.get("id", 0)
+
+        # Stamp Auto (Fastest) ranking metadata. YAML configs are always
+        # Tier A — operator-curated, locked first when premium-eligible.
+        # The OpenRouter refresh tick later re-stamps health for any cfg
+        # whose provider == "OPENROUTER" via _enrich_health.
+        try:
+            from app.services.quality_score import static_score_yaml
+
+            for cfg in configs:
+                cfg["auto_pin_tier"] = "A"
+                static_q = static_score_yaml(cfg)
+                cfg["quality_score_static"] = static_q
+                cfg["quality_score"] = static_q
+                cfg["quality_score_health"] = None
+                # YAML cfgs whose provider is OPENROUTER are also subject
+                # to health gating against their own /endpoints data — a
+                # hand-picked dead OR model is still dead. _enrich_health
+                # re-stamps health_gated for them on the next refresh tick.
+                cfg["health_gated"] = False
+        except Exception as e:
+            print(f"Warning: Failed to score global LLM configs: {e}")
+
+        return configs
     except Exception as e:
         print(f"Warning: Failed to load global LLM configs: {e}")
         return []
@@ -96,7 +164,11 @@ def load_global_image_gen_configs():
     try:
         with open(global_config_file, encoding="utf-8") as f:
             data = yaml.safe_load(f)
-            return data.get("global_image_generation_configs", [])
+            configs = data.get("global_image_generation_configs", []) or []
+            for cfg in configs:
+                if isinstance(cfg, dict):
+                    cfg.setdefault("billing_tier", "free")
+            return configs
     except Exception as e:
         print(f"Warning: Failed to load global image generation configs: {e}")
         return []
@@ -111,7 +183,11 @@ def load_global_vision_llm_configs():
     try:
         with open(global_config_file, encoding="utf-8") as f:
             data = yaml.safe_load(f)
-            return data.get("global_vision_llm_configs", [])
+            configs = data.get("global_vision_llm_configs", []) or []
+            for cfg in configs:
+                if isinstance(cfg, dict):
+                    cfg.setdefault("billing_tier", "free")
+            return configs
     except Exception as e:
         print(f"Warning: Failed to load global vision LLM configs: {e}")
         return []
@@ -169,24 +245,175 @@ def load_image_gen_router_settings():
         return default_settings
 
 
+def load_openrouter_integration_settings() -> dict | None:
+    """
+    Load OpenRouter integration settings from the YAML config.
+
+    Emits startup warnings for deprecated keys (``billing_tier``,
+    ``anonymous_enabled``) and seeds their replacements for back-compat.
+
+    Returns:
+        dict with settings if present and enabled, None otherwise
+    """
+    global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"
+
+    if not global_config_file.exists():
+        return None
+
+    try:
+        with open(global_config_file, encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+            settings = data.get("openrouter_integration")
+            if not settings or not settings.get("enabled"):
+                return None
+
+            if "billing_tier" in settings:
+                print(
+                    "Warning: openrouter_integration.billing_tier is deprecated; "
+                    "tier is now derived per model from OpenRouter data "
+                    "(':free' suffix or zero pricing). Remove this key."
+                )
+
+            if "anonymous_enabled" in settings:
+                print(
+                    "Warning: openrouter_integration.anonymous_enabled is "
+                    "deprecated; use anonymous_enabled_paid and/or "
+                    "anonymous_enabled_free instead. Both new flags have been "
+                    "seeded from the legacy value for back-compat."
+                )
+                settings.setdefault(
+                    "anonymous_enabled_paid", settings["anonymous_enabled"]
+                )
+                settings.setdefault(
+                    "anonymous_enabled_free", settings["anonymous_enabled"]
+                )
+
+            # Image generation + vision LLM emission are opt-in (issue L).
+            # OpenRouter's catalogue contains hundreds of image / vision
+            # capable models; auto-injecting all of them into every
+            # deployment would explode the model selector and surprise
+            # operators upgrading from prior versions. Default to False so
+            # admins must explicitly turn them on.
+            settings.setdefault("image_generation_enabled", False)
+            settings.setdefault("vision_enabled", False)
+
+            return settings
+    except Exception as e:
+        print(f"Warning: Failed to load OpenRouter integration settings: {e}")
+        return None
+
+
+def initialize_openrouter_integration():
+    """
+    If enabled, fetch all OpenRouter models and append them to
+    config.GLOBAL_LLM_CONFIGS as dynamic entries. Each model's ``billing_tier``
+    is derived per-model from OpenRouter's API signals (``:free`` suffix or
+    zero pricing), so free OpenRouter models correctly skip premium quota.
+
+    Should be called BEFORE initialize_llm_router(). Dynamic entries are
+    tagged ``router_pool_eligible=False`` so the LiteLLM Router pool (used
+    by title-gen / sub-agent flows) remains scoped to curated YAML configs,
+    while user-facing Auto-mode thread pinning still considers them.
+    """
+    settings = load_openrouter_integration_settings()
+    if not settings:
+        return
+
+    try:
+        from app.services.openrouter_integration_service import (
+            OpenRouterIntegrationService,
+        )
+
+        service = OpenRouterIntegrationService.get_instance()
+        new_configs = service.initialize(settings)
+
+        if new_configs:
+            config.GLOBAL_LLM_CONFIGS.extend(new_configs)
+            free_count = sum(1 for c in new_configs if c.get("billing_tier") == "free")
+            premium_count = sum(
+                1 for c in new_configs if c.get("billing_tier") == "premium"
+            )
+            print(
+                f"Info: OpenRouter integration added {len(new_configs)} models "
+                f"(free={free_count}, premium={premium_count})"
+            )
+        else:
+            print("Info: OpenRouter integration enabled but no models fetched")
+
+        # Image generation + vision LLM emissions are opt-in (issue L).
+        # Both reuse the catalogue already cached by ``service.initialize``
+        # so we don't make additional network calls here.
+        if settings.get("image_generation_enabled"):
+            try:
+                image_configs = service.get_image_generation_configs()
+                if image_configs:
+                    config.GLOBAL_IMAGE_GEN_CONFIGS.extend(image_configs)
+                    print(
+                        f"Info: OpenRouter integration added {len(image_configs)} "
+                        f"image-generation models"
+                    )
+            except Exception as e:
+                print(f"Warning: Failed to inject OpenRouter image-gen configs: {e}")
+
+        if settings.get("vision_enabled"):
+            try:
+                vision_configs = service.get_vision_llm_configs()
+                if vision_configs:
+                    config.GLOBAL_VISION_LLM_CONFIGS.extend(vision_configs)
+                    print(
+                        f"Info: OpenRouter integration added {len(vision_configs)} "
+                        f"vision LLM models"
+                    )
+            except Exception as e:
+                print(f"Warning: Failed to inject OpenRouter vision-LLM configs: {e}")
+    except Exception as e:
+        print(f"Warning: Failed to initialize OpenRouter integration: {e}")
+
+
+def initialize_pricing_registration():
+    """
+    Teach LiteLLM the per-token cost of every deployment in
+    ``config.GLOBAL_LLM_CONFIGS`` (OpenRouter dynamic models pulled
+    from the OpenRouter catalogue + any operator-declared YAML pricing).
+
+    Must run AFTER ``initialize_openrouter_integration()`` so the
+    OpenRouter catalogue is populated and BEFORE the first LLM call so
+    ``response_cost`` is available in ``TokenTrackingCallback``.
+
+    Failures are logged but never raised — startup must not be blocked
+    by a missing pricing entry; the worst-case is the model debits 0.
+    """
+    try:
+        from app.services.pricing_registration import (
+            register_pricing_from_global_configs,
+        )
+
+        register_pricing_from_global_configs()
+    except Exception as e:
+        print(f"Warning: Failed to register LiteLLM pricing: {e}")
+
+
 def initialize_llm_router():
     """
     Initialize the LLM Router service for Auto mode.
-    This should be called during application startup.
+    This should be called during application startup, AFTER
+    initialize_openrouter_integration() so dynamic models are included.
+    Uses config.GLOBAL_LLM_CONFIGS (in-memory) which includes both
+    static YAML configs and dynamic OpenRouter models.
     """
-    global_configs = load_global_llm_configs()
+    all_configs = config.GLOBAL_LLM_CONFIGS
     router_settings = load_router_settings()
 
-    if not global_configs:
+    if not all_configs:
         print("Info: No global LLM configs found, Auto mode will not be available")
         return
 
     try:
         from app.services.llm_router_service import LLMRouterService
 
-        LLMRouterService.initialize(global_configs, router_settings)
+        LLMRouterService.initialize(all_configs, router_settings)
         print(
-            f"Info: LLM Router initialized with {len(global_configs)} models "
+            f"Info: LLM Router initialized with {len(all_configs)} models "
             f"(strategy: {router_settings.get('routing_strategy', 'usage-based-routing')})"
         )
     except Exception as e:
@@ -246,10 +473,16 @@ def initialize_vision_llm_router():
 class Config:
     # Check if ffmpeg is installed
     if not is_ffmpeg_installed():
-        import static_ffmpeg
+        allow_static_ffmpeg = (
+            os.getenv("SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD", "TRUE").upper()
+            == "TRUE"
+        )
+        if allow_static_ffmpeg:
+            import static_ffmpeg
+
+            # ffmpeg installed on first call to add_paths(), threadsafe.
+            static_ffmpeg.add_paths()
 
-        # ffmpeg installed on first call to add_paths(), threadsafe.
-        static_ffmpeg.add_paths()
         # check if ffmpeg is installed again
         if not is_ffmpeg_installed():
             raise ValueError(
@@ -260,6 +493,9 @@ class Config:
     # self-hosted: Full access to local file system connectors (Obsidian, etc.)
     # cloud: Only cloud-based connectors available
     DEPLOYMENT_MODE = os.getenv("SURFSENSE_DEPLOYMENT_MODE", "self-hosted")
+    ENABLE_DESKTOP_LOCAL_FILESYSTEM = (
+        os.getenv("ENABLE_DESKTOP_LOCAL_FILESYSTEM", "FALSE").upper() == "TRUE"
+    )
 
     @classmethod
     def is_self_hosted(cls) -> bool:
@@ -307,6 +543,108 @@ def is_cloud(cls) -> bool:
         os.getenv("STRIPE_RECONCILIATION_BATCH_SIZE", "100")
     )
 
+    # Premium credit (micro-USD) quota settings.
+    #
+    # Storage unit is integer micro-USD (1_000_000 = $1.00). The legacy
+    # ``PREMIUM_TOKEN_LIMIT`` and ``STRIPE_TOKENS_PER_UNIT`` env vars are
+    # still honoured for one release as fall-back values — the prior
+    # $1-per-1M-tokens Stripe price means every existing value maps 1:1
+    # to micros, so operators upgrading without changing their .env still
+    # get correct behaviour. A startup deprecation warning fires below if
+    # they're set.
+    PREMIUM_CREDIT_MICROS_LIMIT = int(
+        os.getenv("PREMIUM_CREDIT_MICROS_LIMIT")
+        or os.getenv("PREMIUM_TOKEN_LIMIT", "5000000")
+    )
+    STRIPE_PREMIUM_TOKEN_PRICE_ID = os.getenv("STRIPE_PREMIUM_TOKEN_PRICE_ID")
+    STRIPE_CREDIT_MICROS_PER_UNIT = int(
+        os.getenv("STRIPE_CREDIT_MICROS_PER_UNIT")
+        or os.getenv("STRIPE_TOKENS_PER_UNIT", "1000000")
+    )
+    STRIPE_TOKEN_BUYING_ENABLED = (
+        os.getenv("STRIPE_TOKEN_BUYING_ENABLED", "FALSE").upper() == "TRUE"
+    )
+
+    # Safety ceiling on the per-call premium reservation. ``stream_new_chat``
+    # estimates an upper-bound cost from ``litellm.get_model_info`` x the
+    # config's ``quota_reserve_tokens`` and clamps the result to this value
+    # so a misconfigured "$1000/M" model can't lock the user's whole balance
+    # on one call. Default $1.00 covers realistic worst-cases (Opus + 4K
+    # reserve_tokens ≈ $0.36) with headroom.
+    QUOTA_MAX_RESERVE_MICROS = int(os.getenv("QUOTA_MAX_RESERVE_MICROS", "1000000"))
+
+    if os.getenv("PREMIUM_TOKEN_LIMIT") and not os.getenv(
+        "PREMIUM_CREDIT_MICROS_LIMIT"
+    ):
+        print(
+            "Warning: PREMIUM_TOKEN_LIMIT is deprecated; rename to "
+            "PREMIUM_CREDIT_MICROS_LIMIT (1:1 numerical mapping under the "
+            "current Stripe price). The old key will be removed in a "
+            "future release."
+        )
+    if os.getenv("STRIPE_TOKENS_PER_UNIT") and not os.getenv(
+        "STRIPE_CREDIT_MICROS_PER_UNIT"
+    ):
+        print(
+            "Warning: STRIPE_TOKENS_PER_UNIT is deprecated; rename to "
+            "STRIPE_CREDIT_MICROS_PER_UNIT (1:1 numerical mapping). "
+            "The old key will be removed in a future release."
+        )
+
+    # Anonymous / no-login mode settings
+    NOLOGIN_MODE_ENABLED = os.getenv("NOLOGIN_MODE_ENABLED", "FALSE").upper() == "TRUE"
+    MULTI_AGENT_CHAT_ENABLED = (
+        os.getenv("MULTI_AGENT_CHAT_ENABLED", "FALSE").upper() == "TRUE"
+    )
+    ANON_TOKEN_LIMIT = int(os.getenv("ANON_TOKEN_LIMIT", "500000"))
+    ANON_TOKEN_WARNING_THRESHOLD = int(
+        os.getenv("ANON_TOKEN_WARNING_THRESHOLD", "400000")
+    )
+    ANON_TOKEN_QUOTA_TTL_DAYS = int(os.getenv("ANON_TOKEN_QUOTA_TTL_DAYS", "30"))
+    ANON_MAX_UPLOAD_SIZE_MB = int(os.getenv("ANON_MAX_UPLOAD_SIZE_MB", "5"))
+
+    # Default quota reserve tokens when not specified per-model
+    QUOTA_MAX_RESERVE_PER_CALL = int(os.getenv("QUOTA_MAX_RESERVE_PER_CALL", "8000"))
+
+    # Per-image reservation (in micro-USD) used by ``billable_call`` for the
+    # ``POST /image-generations`` endpoint when the global config does not
+    # override it. $0.05 covers realistic worst-cases for current OpenAI /
+    # OpenRouter image-gen pricing. Bypassed entirely for free configs.
+    QUOTA_DEFAULT_IMAGE_RESERVE_MICROS = int(
+        os.getenv("QUOTA_DEFAULT_IMAGE_RESERVE_MICROS", "50000")
+    )
+
+    # Per-podcast reservation (in micro-USD). One agent LLM call generating
+    # a transcript, typically 5k-20k completion tokens. $0.20 covers a long
+    # premium-model run. Tune via env.
+    QUOTA_DEFAULT_PODCAST_RESERVE_MICROS = int(
+        os.getenv("QUOTA_DEFAULT_PODCAST_RESERVE_MICROS", "200000")
+    )
+
+    # Per-video-presentation reservation (in micro-USD). Fan-out of N
+    # slide-scene generations (up to ``VIDEO_PRESENTATION_MAX_SLIDES=30``)
+    # plus refine retries; can produce many premium completions. $1.00
+    # covers worst-case. Tune via env.
+    #
+    # NOTE: this equals the existing ``QUOTA_MAX_RESERVE_MICROS`` default of
+    # 1_000_000. The override path in ``billable_call`` bypasses the
+    # per-call clamp in ``estimate_call_reserve_micros``, so this is the
+    # *actual* hold — raising it via env is fine but means a single video
+    # task can lock $1+ of credit.
+    QUOTA_DEFAULT_VIDEO_PRESENTATION_RESERVE_MICROS = int(
+        os.getenv("QUOTA_DEFAULT_VIDEO_PRESENTATION_RESERVE_MICROS", "1000000")
+    )
+
+    # Abuse prevention: concurrent stream cap and CAPTCHA
+    ANON_MAX_CONCURRENT_STREAMS = int(os.getenv("ANON_MAX_CONCURRENT_STREAMS", "2"))
+    ANON_CAPTCHA_REQUEST_THRESHOLD = int(
+        os.getenv("ANON_CAPTCHA_REQUEST_THRESHOLD", "5")
+    )
+
+    # Cloudflare Turnstile CAPTCHA
+    TURNSTILE_ENABLED = os.getenv("TURNSTILE_ENABLED", "FALSE").upper() == "TRUE"
+    TURNSTILE_SECRET_KEY = os.getenv("TURNSTILE_SECRET_KEY", "")
+
     # Auth — this fork is SSO-only. Default to SSO so a missing envvar doesn't
     # silently fall through to upstream LOCAL behaviour; app.py asserts on boot.
     AUTH_TYPE = os.getenv("AUTH_TYPE", "SSO")
@@ -418,6 +756,9 @@ def is_cloud(cls) -> bool:
     # Router settings for Vision LLM Auto mode
     VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings()
 
+    # OpenRouter Integration settings (optional)
+    OPENROUTER_INTEGRATION_SETTINGS = load_openrouter_integration_settings()
+
     # Chonkie Configuration | Edit this to your needs
     EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
     # Azure OpenAI credentials from environment variables
diff --git a/surfsense_backend/app/config/global_llm_config.example.yaml b/surfsense_backend/app/config/global_llm_config.example.yaml
index e382fdc749..d92640c8d9 100644
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@@ -1,5 +1,5 @@
 # Global LLM Configuration
-# 
+#
 # SETUP INSTRUCTIONS:
 # 1. For production: Copy this file to global_llm_config.yaml and add your real API keys
 # 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist
@@ -19,6 +19,24 @@
 # Structure matches NewLLMConfig:
 # - Model configuration (provider, model_name, api_key, etc.)
 # - Prompt configuration (system_instructions, citations_enabled)
+#
+# COST-BASED PREMIUM CREDITS:
+# Each premium config bills the user's USD-credit balance based on the
+# actual provider cost reported by LiteLLM. For models LiteLLM already
+# knows (most OpenAI/Anthropic/etc. names) you don't need to do anything.
+# For custom Azure deployment names (e.g. an in-house "gpt-5.4" deployment)
+# or any model LiteLLM doesn't have in its built-in pricing table, declare
+# per-token costs inline so they bill correctly:
+#
+#   litellm_params:
+#     base_model: "my-custom-azure-deploy"
+#     # USD per token; e.g. 0.000003 == $3.00 per million input tokens
+#     input_cost_per_token: 0.000003
+#     output_cost_per_token: 0.000015
+#
+# OpenRouter dynamic models pull pricing automatically from OpenRouter's
+# API — no inline declaration needed. Models without resolvable pricing
+# debit $0 from the user's balance and log a WARNING.
 
 # Router Settings for Auto Mode
 # These settings control how the LiteLLM Router distributes requests across models
@@ -29,16 +47,16 @@ router_settings:
   # - "least-busy": Routes to least busy deployment
   # - "latency-based-routing": Routes based on response latency
   routing_strategy: "usage-based-routing"
-  
+
   # Number of retries before failing
   num_retries: 3
-  
+
   # Number of failures allowed before cooling down a deployment
   allowed_fails: 3
-  
+
   # Cooldown time in seconds after allowed_fails is exceeded
   cooldown_time: 60
-  
+
   # Fallback models (optional) - when primary fails, try these
   # Format: [{"primary_model": ["fallback1", "fallback2"]}]
   # fallbacks: []
@@ -48,18 +66,23 @@ global_llm_configs:
   - id: -1
     name: "Global GPT-4 Turbo"
     description: "OpenAI's GPT-4 Turbo with default prompts and citations"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "gpt-4-turbo"
+    quota_reserve_tokens: 4000
     provider: "OPENAI"
     model_name: "gpt-4-turbo-preview"
     api_key: "sk-your-openai-api-key-here"
     api_base: ""
     # Rate limits for load balancing (requests/tokens per minute)
-    rpm: 500  # Requests per minute
-    tpm: 100000  # Tokens per minute
+    rpm: 500 # Requests per minute
+    tpm: 100000 # Tokens per minute
     litellm_params:
       temperature: 0.7
       max_tokens: 4000
     # Prompt Configuration
-    system_instructions: ""  # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
+    system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
     use_default_system_instructions: true
     citations_enabled: true
 
@@ -67,6 +90,11 @@ global_llm_configs:
   - id: -2
     name: "Global Claude 3 Opus"
     description: "Anthropic's most capable model with citations"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "claude-3-opus"
+    quota_reserve_tokens: 4000
     provider: "ANTHROPIC"
     model_name: "claude-3-opus-20240229"
     api_key: "sk-ant-your-anthropic-api-key-here"
@@ -84,23 +112,33 @@ global_llm_configs:
   - id: -3
     name: "Global GPT-3.5 Turbo (Fast)"
     description: "Fast responses without citations for quick queries"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "gpt-3.5-turbo-fast"
+    quota_reserve_tokens: 2000
     provider: "OPENAI"
     model_name: "gpt-3.5-turbo"
     api_key: "sk-your-openai-api-key-here"
     api_base: ""
-    rpm: 3500  # GPT-3.5 has higher rate limits
+    rpm: 3500 # GPT-3.5 has higher rate limits
     tpm: 200000
     litellm_params:
       temperature: 0.5
       max_tokens: 2000
     system_instructions: ""
     use_default_system_instructions: true
-    citations_enabled: false  # Disabled for faster responses
+    citations_enabled: false # Disabled for faster responses
 
   # Example: Chinese LLM - DeepSeek with custom instructions
   - id: -4
     name: "Global DeepSeek Chat (Chinese)"
     description: "DeepSeek optimized for Chinese language responses"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "deepseek-chat-chinese"
+    quota_reserve_tokens: 4000
     provider: "DEEPSEEK"
     model_name: "deepseek-chat"
     api_key: "your-deepseek-api-key-here"
@@ -114,9 +152,9 @@ global_llm_configs:
     system_instructions: |
       <system_instruction>
       You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
-      
+
       Today's date (UTC): {resolved_today}
-      
+
       IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
       </system_instruction>
     use_default_system_instructions: false
@@ -128,12 +166,17 @@ global_llm_configs:
   - id: -5
     name: "Global Azure GPT-4o"
     description: "Azure OpenAI GPT-4o deployment"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "azure-gpt-4o"
+    quota_reserve_tokens: 4000
     provider: "AZURE"
     # model_name format for Azure: azure/<your-deployment-name>
     model_name: "azure/gpt-4o-deployment"
     api_key: "your-azure-api-key-here"
     api_base: "https://your-resource.openai.azure.com"
-    api_version: "2024-02-15-preview"  # Azure API version
+    api_version: "2024-02-15-preview" # Azure API version
     rpm: 1000
     tpm: 150000
     litellm_params:
@@ -151,6 +194,11 @@ global_llm_configs:
   - id: -6
     name: "Global Azure GPT-4 Turbo"
     description: "Azure OpenAI GPT-4 Turbo deployment"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "azure-gpt-4-turbo"
+    quota_reserve_tokens: 4000
     provider: "AZURE"
     model_name: "azure/gpt-4-turbo-deployment"
     api_key: "your-azure-api-key-here"
@@ -161,7 +209,7 @@ global_llm_configs:
     litellm_params:
       temperature: 0.7
       max_tokens: 4000
-      base_model: "gpt-4-turbo"  # Maps to gpt-4-turbo-preview
+      base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
     system_instructions: ""
     use_default_system_instructions: true
     citations_enabled: true
@@ -170,11 +218,16 @@ global_llm_configs:
   - id: -7
     name: "Global Groq Llama 3"
     description: "Ultra-fast Llama 3 70B via Groq"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "groq-llama-3"
+    quota_reserve_tokens: 8000
     provider: "GROQ"
     model_name: "llama3-70b-8192"
     api_key: "your-groq-api-key-here"
     api_base: ""
-    rpm: 30  # Groq has lower rate limits on free tier
+    rpm: 30 # Groq has lower rate limits on free tier
     tpm: 14400
     litellm_params:
       temperature: 0.7
@@ -187,6 +240,11 @@ global_llm_configs:
   - id: -8
     name: "Global MiniMax M2.5"
     description: "MiniMax M2.5 with 204K context window and competitive pricing"
+    billing_tier: "free"
+    anonymous_enabled: true
+    seo_enabled: true
+    seo_slug: "minimax-m2.5"
+    quota_reserve_tokens: 4000
     provider: "MINIMAX"
     model_name: "MiniMax-M2.5"
     api_key: "your-minimax-api-key-here"
@@ -194,12 +252,81 @@ global_llm_configs:
     rpm: 60
     tpm: 100000
     litellm_params:
-      temperature: 1.0  # MiniMax requires temperature in (0.0, 1.0], cannot be 0
+      temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
       max_tokens: 4000
     system_instructions: ""
     use_default_system_instructions: true
     citations_enabled: true
 
+# =============================================================================
+# OpenRouter Integration
+# =============================================================================
+# When enabled, dynamically fetches ALL available models from the OpenRouter API
+# and injects them as global configs. This gives premium users access to any model
+# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota,
+# while free-tier OpenRouter models show up with a green Free badge and do NOT
+# consume premium quota.
+# Models are fetched at startup and refreshed periodically in the background.
+# All calls go through LiteLLM with the openrouter/ prefix.
+openrouter_integration:
+  enabled: false
+  api_key: "sk-or-your-openrouter-api-key"
+
+  # Tier is derived PER MODEL from OpenRouter's own API signals:
+  #   - id ends with ":free"                         -> billing_tier=free
+  #   - pricing.prompt AND pricing.completion == "0" -> billing_tier=free
+  #   - otherwise                                    -> billing_tier=premium
+  # No global billing_tier knob is honored; any legacy value emits a startup warning.
+
+  # Anonymous access is split by tier so operators can expose only free
+  # models to no-login users without leaking paid inference.
+  anonymous_enabled_paid: false
+  anonymous_enabled_free: false
+
+  seo_enabled: false
+  # quota_reserve_tokens: tokens reserved per call for quota enforcement
+  quota_reserve_tokens: 4000
+  # id_offset: base negative ID for dynamically generated configs.
+  # Model IDs are derived deterministically via BLAKE2b so they survive
+  # catalogue churn. Must not overlap with your static global_llm_configs IDs.
+  id_offset: -10000
+  # refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
+  refresh_interval_hours: 24
+
+  # Rate limits for PAID OpenRouter models. These are used by LiteLLM Router
+  # for per-deployment accounting when OR premium models participate in the
+  # shared sub-agent "auto" pool. They do NOT cap OpenRouter itself — your
+  # real account limits live at https://openrouter.ai/settings/limits.
+  rpm: 200
+  tpm: 1000000
+
+  # Rate limits for FREE OpenRouter models. Informational only: free OR
+  # models are intentionally kept OUT of the LiteLLM Router pool, because
+  # OpenRouter enforces free-tier limits globally per account (~20 RPM +
+  # 50-1000 daily requests across every ":free" model combined) —
+  # per-deployment router accounting can't represent a shared bucket
+  # correctly. Free OR models stay fully available in the model selector
+  # and for user-facing Auto thread pinning.
+  free_rpm: 20
+  free_tpm: 100000
+
+  # Image generation + vision LLM emission are OPT-IN. OpenRouter's catalogue
+  # contains hundreds of image- and vision-capable models; turning these on
+  # injects them into the global Image-Generation / Vision-LLM model
+  # selectors alongside any static configs. Tier (free/premium) is derived
+  # per model the same way it is for chat (`:free` suffix or zero pricing).
+  # When a user picks a premium image/vision model the call debits the
+  # shared $5 USD-cost-based premium credit pool — so leaving these off
+  # avoids surprise quota burn on existing deployments. Default: false.
+  image_generation_enabled: false
+  vision_enabled: false
+
+  litellm_params:
+    max_tokens: 16384
+  system_instructions: ""
+  use_default_system_instructions: true
+  citations_enabled: true
+
 # =============================================================================
 # Image Generation Configuration
 # =============================================================================
@@ -225,7 +352,7 @@ global_image_generation_configs:
     model_name: "dall-e-3"
     api_key: "sk-your-openai-api-key-here"
     api_base: ""
-    rpm: 50  # Requests per minute (image gen is rate-limited by RPM, not tokens)
+    rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
     litellm_params: {}
 
   # Example: OpenAI GPT Image 1
@@ -354,7 +481,7 @@ global_vision_llm_configs:
 #
 # IMAGE GENERATION NOTES:
 # - Image generation configs use the same ID scheme as LLM configs (negative for global)
-# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure), 
+# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
 #   bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter)
 # - The router uses litellm.aimage_generation() for async image generation
 # - Only RPM (requests per minute) is relevant for image generation rate limiting.
@@ -365,3 +492,13 @@ global_vision_llm_configs:
 # - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.)
 # - Lower temperature (0.3) is recommended for accurate screenshot analysis
 # - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
+#
+# TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
+# - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
+# - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
+# - seo_enabled: true/false. Whether a /free/<seo_slug> landing page is generated.
+# - seo_slug: Stable URL slug for SEO pages. Must be unique. Do NOT change once public.
+# - seo_title: Optional HTML title tag override for the model's /free/<slug> page.
+# - seo_description: Optional meta description override for the model's /free/<slug> page.
+# - quota_reserve_tokens: Tokens reserved before each LLM call for quota enforcement.
+#   Independent of litellm_params.max_tokens. Used by the token quota service.
diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py
index 8cbc3e4177..372d2fc82b 100644
--- a/surfsense_backend/app/connectors/dropbox/content_extractor.py
+++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py
@@ -44,6 +44,8 @@ async def _export_paper_content(
 async def download_and_extract_content(
     client: DropboxClient,
     file: dict[str, Any],
+    *,
+    vision_llm=None,
 ) -> tuple[str | None, dict[str, Any], str | None]:
     """Download a Dropbox file and extract its content as markdown.
 
@@ -91,7 +93,7 @@ async def download_and_extract_content(
         from app.etl_pipeline.etl_document import EtlRequest
         from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-        result = await EtlPipelineService().extract(
+        result = await EtlPipelineService(vision_llm=vision_llm).extract(
             EtlRequest(file_path=temp_file_path, filename=file_name)
         )
         markdown = result.markdown_content
diff --git a/surfsense_backend/app/connectors/exceptions.py b/surfsense_backend/app/connectors/exceptions.py
new file mode 100644
index 0000000000..027adbb87b
--- /dev/null
+++ b/surfsense_backend/app/connectors/exceptions.py
@@ -0,0 +1,97 @@
+"""Standard exception hierarchy for all connectors.
+
+ConnectorError
+├── ConnectorAuthError         (401/403 — non-retryable)
+├── ConnectorRateLimitError    (429 — retryable, carries ``retry_after``)
+├── ConnectorTimeoutError      (timeout/504 — retryable)
+└── ConnectorAPIError          (5xx or unexpected — retryable when >= 500)
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+class ConnectorError(Exception):
+    def __init__(
+        self,
+        message: str,
+        *,
+        service: str = "",
+        status_code: int | None = None,
+        response_body: Any = None,
+    ) -> None:
+        super().__init__(message)
+        self.service = service
+        self.status_code = status_code
+        self.response_body = response_body
+
+    @property
+    def retryable(self) -> bool:
+        return False
+
+
+class ConnectorAuthError(ConnectorError):
+    """Token expired, revoked, insufficient scopes, or needs re-auth (401/403)."""
+
+    @property
+    def retryable(self) -> bool:
+        return False
+
+
+class ConnectorRateLimitError(ConnectorError):
+    """429 Too Many Requests."""
+
+    def __init__(
+        self,
+        message: str = "Rate limited",
+        *,
+        service: str = "",
+        retry_after: float | None = None,
+        status_code: int = 429,
+        response_body: Any = None,
+    ) -> None:
+        super().__init__(
+            message,
+            service=service,
+            status_code=status_code,
+            response_body=response_body,
+        )
+        self.retry_after = retry_after
+
+    @property
+    def retryable(self) -> bool:
+        return True
+
+
+class ConnectorTimeoutError(ConnectorError):
+    """Request timeout or gateway timeout (504)."""
+
+    def __init__(
+        self,
+        message: str = "Request timed out",
+        *,
+        service: str = "",
+        status_code: int | None = None,
+        response_body: Any = None,
+    ) -> None:
+        super().__init__(
+            message,
+            service=service,
+            status_code=status_code,
+            response_body=response_body,
+        )
+
+    @property
+    def retryable(self) -> bool:
+        return True
+
+
+class ConnectorAPIError(ConnectorError):
+    """Generic API error (5xx or unexpected status codes)."""
+
+    @property
+    def retryable(self) -> bool:
+        if self.status_code is not None:
+            return self.status_code >= 500
+        return False
diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 83ff32e829..86c789b97a 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -27,6 +27,8 @@
 async def download_and_extract_content(
     client: GoogleDriveClient,
     file: dict[str, Any],
+    *,
+    vision_llm=None,
 ) -> tuple[str | None, dict[str, Any], str | None]:
     """Download a Google Drive file and extract its content as markdown.
 
@@ -103,7 +105,9 @@ async def download_and_extract_content(
         etl_filename = (
             file_name + extension if is_google_workspace_file(mime_type) else file_name
         )
-        markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
+        markdown = await _parse_file_to_markdown(
+            temp_file_path, etl_filename, vision_llm=vision_llm
+        )
         return markdown, drive_metadata, None
 
     except Exception as e:
@@ -115,12 +119,14 @@ async def download_and_extract_content(
                 os.unlink(temp_file_path)
 
 
-async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
+async def _parse_file_to_markdown(
+    file_path: str, filename: str, *, vision_llm=None
+) -> str:
     """Parse a local file to markdown using the unified ETL pipeline."""
     from app.etl_pipeline.etl_document import EtlRequest
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-    result = await EtlPipelineService().extract(
+    result = await EtlPipelineService(vision_llm=vision_llm).extract(
         EtlRequest(file_path=file_path, filename=filename)
     )
     return result.markdown_content
diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py
index 2238b8603e..3154f2eca9 100644
--- a/surfsense_backend/app/connectors/onedrive/content_extractor.py
+++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py
@@ -16,6 +16,8 @@
 async def download_and_extract_content(
     client: OneDriveClient,
     file: dict[str, Any],
+    *,
+    vision_llm=None,
 ) -> tuple[str | None, dict[str, Any], str | None]:
     """Download a OneDrive file and extract its content as markdown.
 
@@ -65,7 +67,9 @@ async def download_and_extract_content(
         if error:
             return None, metadata, error
 
-        markdown = await _parse_file_to_markdown(temp_file_path, file_name)
+        markdown = await _parse_file_to_markdown(
+            temp_file_path, file_name, vision_llm=vision_llm
+        )
         return markdown, metadata, None
 
     except Exception as e:
@@ -77,12 +81,14 @@ async def download_and_extract_content(
                 os.unlink(temp_file_path)
 
 
-async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
+async def _parse_file_to_markdown(
+    file_path: str, filename: str, *, vision_llm=None
+) -> str:
     """Parse a local file to markdown using the unified ETL pipeline."""
     from app.etl_pipeline.etl_document import EtlRequest
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-    result = await EtlPipelineService().extract(
+    result = await EtlPipelineService(vision_llm=vision_llm).extract(
         EtlRequest(file_path=file_path, filename=filename)
     )
     return result.markdown_content
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 01a6bbda0d..9fc27fb1f6 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -12,6 +12,7 @@
     ARRAY,
     JSON,
     TIMESTAMP,
+    BigInteger,
     Boolean,
     Column,
     Enum as SQLAlchemyEnum,
@@ -318,6 +319,12 @@ class PagePurchaseStatus(StrEnum):
     FAILED = "failed"
 
 
+class PremiumTokenPurchaseStatus(StrEnum):
+    PENDING = "pending"
+    COMPLETED = "completed"
+    FAILED = "failed"
+
+
 # Centralized configuration for incentive tasks
 # This makes it easy to add new tasks without changing code in multiple places
 INCENTIVE_TASKS_CONFIG = {
@@ -631,6 +638,12 @@ class NewChatThread(BaseModel, TimestampMixin):
         default=False,
         server_default="false",
     )
+    # Auto (Fastest) model pin for this thread: concrete resolved global LLM
+    # config id. NULL means no pin; Auto will resolve on the next turn.
+    # Single-writer invariant: only app.services.auto_model_pin_service sets
+    # or clears this column (plus bulk clears when a search space's
+    # agent_llm_id changes). Unindexed: all reads are by primary key.
+    pinned_llm_config_id = Column(Integer, nullable=True)
 
     # Relationships
     search_space = relationship("SearchSpace", back_populates="new_chat_threads")
@@ -647,6 +660,11 @@ class NewChatThread(BaseModel, TimestampMixin):
         cascade="all, delete-orphan",
         foreign_keys="[PublicChatSnapshot.thread_id]",
     )
+    token_usages = relationship(
+        "TokenUsage",
+        back_populates="thread",
+        cascade="all, delete-orphan",
+    )
 
 
 class NewChatMessage(BaseModel, TimestampMixin):
@@ -657,6 +675,23 @@ class NewChatMessage(BaseModel, TimestampMixin):
 
     __tablename__ = "new_chat_messages"
 
+    # Partial unique index on (thread_id, turn_id, role) where turn_id IS NOT NULL.
+    # Mirrors alembic migration 141. Lets the streaming agent and the
+    # legacy frontend appendMessage call coexist idempotently — the second
+    # writer trips the unique and recovers without creating a duplicate row.
+    # Partial so legacy NULL turn_id rows and clone/snapshot inserts in
+    # app/services/public_chat_service.py (which omit turn_id) are unaffected.
+    __table_args__ = (
+        Index(
+            "uq_new_chat_messages_thread_turn_role",
+            "thread_id",
+            "turn_id",
+            "role",
+            unique=True,
+            postgresql_where=text("turn_id IS NOT NULL"),
+        ),
+    )
+
     role = Column(SQLAlchemyEnum(NewChatMessageRole), nullable=False)
     # Content stored as JSONB to support rich content (text, tool calls, etc.)
     content = Column(JSONB, nullable=False)
@@ -677,6 +712,12 @@ class NewChatMessage(BaseModel, TimestampMixin):
         index=True,
     )
 
+    # Per-turn correlation id sourced from ``configurable.turn_id`` at
+    # streaming time (``f"{chat_id}:{ms}"``). Nullable because legacy rows
+    # predate the column. Used by C1's edit-from-arbitrary-position to map
+    # a message back to the LangGraph checkpoint that produced its turn.
+    turn_id = Column(String(64), nullable=True, index=True)
+
     # Relationships
     thread = relationship("NewChatThread", back_populates="messages")
     author = relationship("User")
@@ -685,6 +726,80 @@ class NewChatMessage(BaseModel, TimestampMixin):
         back_populates="message",
         cascade="all, delete-orphan",
     )
+    token_usage = relationship(
+        "TokenUsage",
+        back_populates="message",
+        uselist=False,
+        cascade="all, delete-orphan",
+    )
+
+
+class TokenUsage(BaseModel, TimestampMixin):
+    """
+    Tracks LLM token consumption per assistant turn.
+
+    One row per usage event. For chat, linked to a specific message via message_id.
+    The usage_type column enables future extension to track non-chat usage
+    (indexing, image generation, podcasts, etc.) without schema changes.
+    """
+
+    __tablename__ = "token_usage"
+
+    # Partial unique index on (message_id) where message_id IS NOT NULL.
+    # Mirrors alembic migration 142. Lets the streaming agent's
+    # ``finalize_assistant_turn`` and the legacy frontend ``append_message``
+    # recovery branch both use ``INSERT ... ON CONFLICT DO NOTHING`` without
+    # racing on a SELECT-then-INSERT window. Partial so non-chat usage rows
+    # (indexing, image generation, podcasts) — which keep ``message_id`` NULL
+    # because there is no per-message anchor — are unaffected.
+    __table_args__ = (
+        Index(
+            "uq_token_usage_message_id",
+            "message_id",
+            unique=True,
+            postgresql_where=text("message_id IS NOT NULL"),
+        ),
+    )
+
+    prompt_tokens = Column(Integer, nullable=False, default=0)
+    completion_tokens = Column(Integer, nullable=False, default=0)
+    total_tokens = Column(Integer, nullable=False, default=0)
+    cost_micros = Column(BigInteger, nullable=False, default=0, server_default="0")
+    model_breakdown = Column(JSONB, nullable=True)
+    call_details = Column(JSONB, nullable=True)
+
+    usage_type = Column(String(50), nullable=False, default="chat", index=True)
+
+    thread_id = Column(
+        Integer,
+        ForeignKey("new_chat_threads.id", ondelete="CASCADE"),
+        nullable=True,
+        index=True,
+    )
+    message_id = Column(
+        Integer,
+        ForeignKey("new_chat_messages.id", ondelete="SET NULL"),
+        nullable=True,
+        index=True,
+    )
+    search_space_id = Column(
+        Integer,
+        ForeignKey("searchspaces.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+    user_id = Column(
+        UUID(as_uuid=True),
+        ForeignKey("user.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+
+    # Relationships
+    thread = relationship("NewChatThread", back_populates="token_usages")
+    message = relationship("NewChatMessage", back_populates="token_usage")
+    search_space = relationship("SearchSpace")
+    user = relationship("User")
 
 
 class PublicChatSnapshot(BaseModel, TimestampMixin):
@@ -861,99 +976,6 @@ class ChatSessionState(BaseModel):
     ai_responding_to_user = relationship("User")
 
 
-class MemoryCategory(StrEnum):
-    """Categories for user memories."""
-
-    # Using lowercase keys to match PostgreSQL enum values
-    preference = "preference"  # User preferences (e.g., "prefers dark mode")
-    fact = "fact"  # Facts about the user (e.g., "is a Python developer")
-    instruction = (
-        "instruction"  # Standing instructions (e.g., "always respond in bullet points")
-    )
-    context = "context"  # Contextual information (e.g., "working on project X")
-
-
-class UserMemory(BaseModel, TimestampMixin):
-    """
-    Private memory: facts, preferences, context per user per search space.
-    Used only for private chats (not shared/team chats).
-    """
-
-    __tablename__ = "user_memories"
-
-    user_id = Column(
-        UUID(as_uuid=True),
-        ForeignKey("user.id", ondelete="CASCADE"),
-        nullable=False,
-        index=True,
-    )
-    # Optional association with a search space (if memory is space-specific)
-    search_space_id = Column(
-        Integer,
-        ForeignKey("searchspaces.id", ondelete="CASCADE"),
-        nullable=True,
-        index=True,
-    )
-
-    # The actual memory content
-    memory_text = Column(Text, nullable=False)
-    # Category for organization and filtering
-    category = Column(
-        SQLAlchemyEnum(MemoryCategory),
-        nullable=False,
-        default=MemoryCategory.fact,
-    )
-    # Vector embedding for semantic search
-    embedding = Column(Vector(config.embedding_model_instance.dimension))
-
-    # Track when memory was last updated
-    updated_at = Column(
-        TIMESTAMP(timezone=True),
-        nullable=False,
-        default=lambda: datetime.now(UTC),
-        onupdate=lambda: datetime.now(UTC),
-        index=True,
-    )
-
-    # Relationships
-    user = relationship("User", back_populates="memories")
-    search_space = relationship("SearchSpace", back_populates="user_memories")
-
-
-class SharedMemory(BaseModel, TimestampMixin):
-    __tablename__ = "shared_memories"
-
-    search_space_id = Column(
-        Integer,
-        ForeignKey("searchspaces.id", ondelete="CASCADE"),
-        nullable=False,
-        index=True,
-    )
-    created_by_id = Column(
-        UUID(as_uuid=True),
-        ForeignKey("user.id", ondelete="CASCADE"),
-        nullable=False,
-        index=True,
-    )
-    memory_text = Column(Text, nullable=False)
-    category = Column(
-        SQLAlchemyEnum(MemoryCategory),
-        nullable=False,
-        default=MemoryCategory.fact,
-    )
-    embedding = Column(Vector(config.embedding_model_instance.dimension))
-    updated_at = Column(
-        TIMESTAMP(timezone=True),
-        nullable=False,
-        default=lambda: datetime.now(UTC),
-        onupdate=lambda: datetime.now(UTC),
-        index=True,
-    )
-
-    search_space = relationship("SearchSpace", back_populates="shared_memories")
-    created_by = relationship("User")
-
-
 class Folder(BaseModel, TimestampMixin):
     __tablename__ = "folders"
 
@@ -1000,7 +1022,15 @@ class Document(BaseModel, TimestampMixin):
     document_metadata = Column(JSON, nullable=True)
 
     content = Column(Text, nullable=False)
-    content_hash = Column(String, nullable=False, index=True, unique=True)
+    # ``content_hash`` is intentionally NOT globally unique. In a real
+    # filesystem two files at different paths can hold identical bytes,
+    # and the agent's ``write_file`` flow needs that semantic to support
+    # copy / duplicate operations. Path uniqueness lives on
+    # ``unique_identifier_hash`` (per search space). The hash remains
+    # indexed because connector indexers consult it as a change-detection
+    # / cross-source dedup hint via :func:`check_duplicate_document`.
+    # See migration 133.
+    content_hash = Column(String, nullable=False, index=True)
     unique_identifier_hash = Column(String, nullable=True, index=True, unique=True)
     embedding = Column(Vector(config.embedding_model_instance.dimension))
 
@@ -1222,12 +1252,13 @@ class VideoPresentation(BaseModel, TimestampMixin):
 
 
 class Report(BaseModel, TimestampMixin):
-    """Report model for storing generated Markdown reports."""
+    """Report model for storing generated reports (Markdown or Typst)."""
 
     __tablename__ = "reports"
 
     title = Column(String(500), nullable=False)
-    content = Column(Text, nullable=True)  # Markdown body
+    content = Column(Text, nullable=True)
+    content_type = Column(String(20), nullable=False, server_default="markdown")
     report_metadata = Column(JSONB, nullable=True)  # section headings, word count, etc.
     report_style = Column(
         String(100), nullable=True
@@ -1394,6 +1425,8 @@ class SearchSpace(BaseModel, TimestampMixin):
         Text, nullable=True, default=""
     )  # User's custom instructions
 
+    shared_memory_md = Column(Text, nullable=True, server_default="")
+
     # Search space-level LLM preferences (shared by all members)
     # Note: ID values:
     #   - 0: Auto mode (uses LiteLLM Router for load balancing) - default for new search spaces
@@ -1412,6 +1445,10 @@ class SearchSpace(BaseModel, TimestampMixin):
         Integer, nullable=True, default=0
     )  # For vision/screenshot analysis, defaults to Auto mode
 
+    ai_file_sort_enabled = Column(
+        Boolean, nullable=False, default=False, server_default="false"
+    )
+
     user_id = Column(
         UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
     )
@@ -1516,20 +1553,6 @@ class SearchSpace(BaseModel, TimestampMixin):
         cascade="all, delete-orphan",
     )
 
-    # User memories associated with this search space
-    user_memories = relationship(
-        "UserMemory",
-        back_populates="search_space",
-        order_by="UserMemory.updated_at.desc()",
-        cascade="all, delete-orphan",
-    )
-    shared_memories = relationship(
-        "SharedMemory",
-        back_populates="search_space",
-        order_by="SharedMemory.updated_at.desc()",
-        cascade="all, delete-orphan",
-    )
-
 
 class SearchSourceConnector(BaseModel, TimestampMixin):
     __tablename__ = "search_source_connectors"
@@ -1541,6 +1564,31 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
             "name",
             name="uq_searchspace_user_connector_type_name",
         ),
+        # Mirrors migration 129; backs the ``/obsidian/connect`` upsert.
+        Index(
+            "search_source_connectors_obsidian_plugin_vault_uniq",
+            "user_id",
+            text("(config->>'vault_id')"),
+            unique=True,
+            postgresql_where=text(
+                "connector_type = 'OBSIDIAN_CONNECTOR' "
+                "AND config->>'source' = 'plugin' "
+                "AND config->>'vault_id' IS NOT NULL"
+            ),
+        ),
+        # Cross-device dedup: same vault content from different devices
+        # cannot produce two connector rows.
+        Index(
+            "search_source_connectors_obsidian_plugin_fingerprint_uniq",
+            "user_id",
+            text("(config->>'vault_fingerprint')"),
+            unique=True,
+            postgresql_where=text(
+                "connector_type = 'OBSIDIAN_CONNECTOR' "
+                "AND config->>'source' = 'plugin' "
+                "AND config->>'vault_fingerprint' IS NOT NULL"
+            ),
+        ),
     )
 
     name = Column(String(100), nullable=False, index=True)
@@ -1555,6 +1603,13 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
         Boolean, nullable=False, default=False, server_default="false"
     )
 
+    # Vision LLM for image files - disabled by default to save cost/time.
+    # When enabled, images are described via a vision language model instead
+    # of falling back to the document parser.
+    enable_vision_llm = Column(
+        Boolean, nullable=False, default=False, server_default="false"
+    )
+
     # Periodic indexing fields
     periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
     indexing_frequency_minutes = Column(Integer, nullable=True)
@@ -1771,6 +1826,46 @@ class PagePurchase(Base, TimestampMixin):
     user = relationship("User", back_populates="page_purchases")
 
 
+class PremiumTokenPurchase(Base, TimestampMixin):
+    """Tracks Stripe checkout sessions used to grant additional premium credit (USD micro-units).
+
+    Note: the table name is preserved (``premium_token_purchases``) for
+    operational continuity even though the unit is now USD micro-credits
+    instead of raw tokens. The ``credit_micros_granted`` column replaced
+    the legacy ``tokens_granted`` in migration 140; the stored values
+    were not transformed because the prior $1 = 1M tokens Stripe price
+    makes the unit conversion 1:1 numerically.
+    """
+
+    __tablename__ = "premium_token_purchases"
+    __allow_unmapped__ = True
+
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    user_id = Column(
+        UUID(as_uuid=True),
+        ForeignKey("user.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+    stripe_checkout_session_id = Column(
+        String(255), nullable=False, unique=True, index=True
+    )
+    stripe_payment_intent_id = Column(String(255), nullable=True, index=True)
+    quantity = Column(Integer, nullable=False)
+    credit_micros_granted = Column(BigInteger, nullable=False)
+    amount_total = Column(Integer, nullable=True)
+    currency = Column(String(10), nullable=True)
+    status = Column(
+        SQLAlchemyEnum(PremiumTokenPurchaseStatus),
+        nullable=False,
+        default=PremiumTokenPurchaseStatus.PENDING,
+        index=True,
+    )
+    completed_at = Column(TIMESTAMP(timezone=True), nullable=True)
+
+    user = relationship("User", back_populates="premium_token_purchases")
+
+
 class SearchSpaceRole(BaseModel, TimestampMixin):
     """
     Custom roles that can be defined per search space.
@@ -2030,14 +2125,6 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
             passive_deletes=True,
         )
 
-        # User memories for personalized AI responses
-        memories = relationship(
-            "UserMemory",
-            back_populates="user",
-            order_by="UserMemory.updated_at.desc()",
-            cascade="all, delete-orphan",
-        )
-
         # Incentive tasks completed by this user
         incentive_tasks = relationship(
             "UserIncentiveTask",
@@ -2049,6 +2136,11 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
             back_populates="user",
             cascade="all, delete-orphan",
         )
+        premium_token_purchases = relationship(
+            "PremiumTokenPurchase",
+            back_populates="user",
+            cascade="all, delete-orphan",
+        )
 
         # Page usage tracking for ETL services
         pages_limit = Column(
@@ -2059,12 +2151,27 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
         )
         pages_used = Column(Integer, nullable=False, default=0, server_default="0")
 
+        premium_credit_micros_limit = Column(
+            BigInteger,
+            nullable=False,
+            default=config.PREMIUM_CREDIT_MICROS_LIMIT,
+            server_default=str(config.PREMIUM_CREDIT_MICROS_LIMIT),
+        )
+        premium_credit_micros_used = Column(
+            BigInteger, nullable=False, default=0, server_default="0"
+        )
+        premium_credit_micros_reserved = Column(
+            BigInteger, nullable=False, default=0, server_default="0"
+        )
+
         # User profile from OAuth
         display_name = Column(String, nullable=True)
         avatar_url = Column(String, nullable=True)
 
         last_login = Column(TIMESTAMP(timezone=True), nullable=True)
 
+        memory_md = Column(Text, nullable=True, server_default="")
+
         # Refresh tokens for this user
         refresh_tokens = relationship(
             "RefreshToken",
@@ -2150,14 +2257,6 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
             passive_deletes=True,
         )
 
-        # User memories for personalized AI responses
-        memories = relationship(
-            "UserMemory",
-            back_populates="user",
-            order_by="UserMemory.updated_at.desc()",
-            cascade="all, delete-orphan",
-        )
-
         # Incentive tasks completed by this user
         incentive_tasks = relationship(
             "UserIncentiveTask",
@@ -2169,6 +2268,11 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
             back_populates="user",
             cascade="all, delete-orphan",
         )
+        premium_token_purchases = relationship(
+            "PremiumTokenPurchase",
+            back_populates="user",
+            cascade="all, delete-orphan",
+        )
 
         # Page usage tracking for ETL services
         pages_limit = Column(
@@ -2179,12 +2283,27 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
         )
         pages_used = Column(Integer, nullable=False, default=0, server_default="0")
 
+        premium_credit_micros_limit = Column(
+            BigInteger,
+            nullable=False,
+            default=config.PREMIUM_CREDIT_MICROS_LIMIT,
+            server_default=str(config.PREMIUM_CREDIT_MICROS_LIMIT),
+        )
+        premium_credit_micros_used = Column(
+            BigInteger, nullable=False, default=0, server_default="0"
+        )
+        premium_credit_micros_reserved = Column(
+            BigInteger, nullable=False, default=0, server_default="0"
+        )
+
         # User profile (can be set manually for non-OAuth users)
         display_name = Column(String, nullable=True)
         avatar_url = Column(String, nullable=True)
 
         last_login = Column(TIMESTAMP(timezone=True), nullable=True)
 
+        memory_md = Column(Text, nullable=True, server_default="")
+
         # Refresh tokens for this user
         refresh_tokens = relationship(
             "RefreshToken",
@@ -2193,6 +2312,224 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
         )
 
 
+class AgentActionLog(BaseModel):
+    """Append-only audit trail of every tool call dispatched by the agent.
+
+    One row per ``ToolMessage`` produced; written by ``ActionLogMiddleware``
+    in its ``aafter_tool`` hook. Rows are referenced by the
+    ``/api/threads/{thread_id}/revert/{action_id}`` route to look up an
+    action's stored ``reverse_descriptor`` and replay it.
+
+    The table is intentionally narrow: large tool outputs are NOT stored
+    here. Result text lives in the langgraph checkpoint; this row only
+    keeps a short ``result_id`` (the LangChain ``ToolMessage.id`` or a
+    spilled-content path) for correlation.
+    """
+
+    __tablename__ = "agent_action_log"
+
+    thread_id = Column(
+        Integer,
+        ForeignKey("new_chat_threads.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+    user_id = Column(
+        UUID(as_uuid=True),
+        ForeignKey("user.id", ondelete="SET NULL"),
+        nullable=True,
+        index=True,
+    )
+    search_space_id = Column(
+        Integer,
+        ForeignKey("searchspaces.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+    # ``turn_id`` historically held the LangChain ``tool_call.id``. It has
+    # been renamed to ``tool_call_id`` (with a parallel column kept for one
+    # release for back-compat). The real chat-turn id lives in
+    # ``chat_turn_id`` and is sourced from ``configurable.turn_id``.
+    turn_id = Column(String(64), nullable=True, index=True)
+    tool_call_id = Column(String(64), nullable=True, index=True)
+    chat_turn_id = Column(String(64), nullable=True, index=True)
+    message_id = Column(String(128), nullable=True, index=True)
+    tool_name = Column(String(255), nullable=False, index=True)
+    args = Column(JSONB, nullable=True)
+    result_id = Column(String(255), nullable=True)
+    reversible = Column(
+        Boolean, nullable=False, default=False, server_default=text("false")
+    )
+    reverse_descriptor = Column(JSONB, nullable=True)
+    error = Column(JSONB, nullable=True)
+    reverse_of = Column(
+        Integer,
+        ForeignKey("agent_action_log.id", ondelete="SET NULL"),
+        nullable=True,
+        index=True,
+    )
+    created_at = Column(
+        TIMESTAMP(timezone=True),
+        nullable=False,
+        default=lambda: datetime.now(UTC),
+        server_default=text("(now() AT TIME ZONE 'utc')"),
+        index=True,
+    )
+
+    __table_args__ = (
+        Index("ix_agent_action_log_thread_created", "thread_id", "created_at"),
+        # Partial unique index enforces "at most one revert per
+        # original action". Created in migration 137 with
+        # ``WHERE reverse_of IS NOT NULL`` so non-revert rows
+        # (the vast majority) are unaffected and NULLs don't collide.
+        Index(
+            "ux_agent_action_log_reverse_of",
+            "reverse_of",
+            unique=True,
+            postgresql_where=text("reverse_of IS NOT NULL"),
+        ),
+    )
+
+
+class DocumentRevision(BaseModel):
+    """Snapshot of a :class:`Document` row taken before a mutating tool call.
+
+    Written by :class:`KnowledgeBasePersistenceMiddleware` (or its safety-net
+    `commit_staged_filesystem_state`) ahead of any NOTE / FILE / EXTENSION
+    document write. The row is referenced by ``/revert/{action_id}`` to
+    restore the original content in place.
+    """
+
+    __tablename__ = "document_revisions"
+
+    # ``ON DELETE SET NULL`` (not CASCADE) so the snapshot survives the
+    # hard-delete it describes — without that, ``rm`` would wipe the row
+    # we'd need to undo it. See migration ``134_relax_revision_fks``.
+    document_id = Column(
+        Integer,
+        ForeignKey("documents.id", ondelete="SET NULL"),
+        nullable=True,
+        index=True,
+    )
+    search_space_id = Column(
+        Integer,
+        ForeignKey("searchspaces.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+    content_before = Column(Text, nullable=True)
+    title_before = Column(String, nullable=True)
+    folder_id_before = Column(Integer, nullable=True)
+    chunks_before = Column(JSONB, nullable=True)
+    metadata_before = Column("metadata_before", JSONB, nullable=True)
+    created_by_turn_id = Column(String(64), nullable=True, index=True)
+    agent_action_id = Column(
+        Integer,
+        ForeignKey("agent_action_log.id", ondelete="SET NULL"),
+        nullable=True,
+        index=True,
+    )
+    created_at = Column(
+        TIMESTAMP(timezone=True),
+        nullable=False,
+        default=lambda: datetime.now(UTC),
+        server_default=text("(now() AT TIME ZONE 'utc')"),
+        index=True,
+    )
+
+
+class FolderRevision(BaseModel):
+    """Snapshot of a :class:`Folder` row taken before a mkdir / move."""
+
+    __tablename__ = "folder_revisions"
+
+    # ``ON DELETE SET NULL`` (not CASCADE) so the snapshot survives the
+    # hard-delete it describes — without that, ``rmdir`` would wipe the
+    # row we'd need to undo it. See migration ``134_relax_revision_fks``.
+    folder_id = Column(
+        Integer,
+        ForeignKey("folders.id", ondelete="SET NULL"),
+        nullable=True,
+        index=True,
+    )
+    search_space_id = Column(
+        Integer,
+        ForeignKey("searchspaces.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+    name_before = Column(String(255), nullable=True)
+    parent_id_before = Column(Integer, nullable=True)
+    position_before = Column(String(50), nullable=True)
+    created_by_turn_id = Column(String(64), nullable=True, index=True)
+    agent_action_id = Column(
+        Integer,
+        ForeignKey("agent_action_log.id", ondelete="SET NULL"),
+        nullable=True,
+        index=True,
+    )
+    created_at = Column(
+        TIMESTAMP(timezone=True),
+        nullable=False,
+        default=lambda: datetime.now(UTC),
+        server_default=text("(now() AT TIME ZONE 'utc')"),
+        index=True,
+    )
+
+
+class AgentPermissionRule(BaseModel):
+    """Persistent permission rule consumed by :class:`PermissionMiddleware`.
+
+    Scoped at one of: search-space-wide (``user_id`` and ``thread_id`` NULL),
+    user-wide (``user_id`` set, ``thread_id`` NULL), or per-thread
+    (``thread_id`` set). Loaded at agent build time and converted to
+    :class:`Rule` instances inside the agent factory.
+    """
+
+    __tablename__ = "agent_permission_rules"
+
+    search_space_id = Column(
+        Integer,
+        ForeignKey("searchspaces.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+    user_id = Column(
+        UUID(as_uuid=True),
+        ForeignKey("user.id", ondelete="CASCADE"),
+        nullable=True,
+        index=True,
+    )
+    thread_id = Column(
+        Integer,
+        ForeignKey("new_chat_threads.id", ondelete="CASCADE"),
+        nullable=True,
+        index=True,
+    )
+    permission = Column(String(255), nullable=False)
+    pattern = Column(String(255), nullable=False, default="*", server_default="*")
+    action = Column(String(16), nullable=False)  # allow / deny / ask
+    created_at = Column(
+        TIMESTAMP(timezone=True),
+        nullable=False,
+        default=lambda: datetime.now(UTC),
+        server_default=text("(now() AT TIME ZONE 'utc')"),
+        index=True,
+    )
+
+    __table_args__ = (
+        UniqueConstraint(
+            "search_space_id",
+            "user_id",
+            "thread_id",
+            "permission",
+            "pattern",
+            "action",
+            name="uq_agent_permission_rules_scope",
+        ),
+    )
+
+
 class RefreshToken(Base, TimestampMixin):
     """
     Stores refresh tokens for user session management.
diff --git a/surfsense_backend/app/etl_pipeline/etl_document.py b/surfsense_backend/app/etl_pipeline/etl_document.py
index 350c3299f2..f4a01ae14e 100644
--- a/surfsense_backend/app/etl_pipeline/etl_document.py
+++ b/surfsense_backend/app/etl_pipeline/etl_document.py
@@ -1,10 +1,37 @@
+from enum import StrEnum
+
 from pydantic import BaseModel, field_validator
 
 
+class ProcessingMode(StrEnum):
+    BASIC = "basic"
+    PREMIUM = "premium"
+
+    @classmethod
+    def coerce(cls, value: str | None) -> "ProcessingMode":
+        if value is None:
+            return cls.BASIC
+        try:
+            return cls(value.lower())
+        except ValueError:
+            return cls.BASIC
+
+    @property
+    def page_multiplier(self) -> int:
+        return _PAGE_MULTIPLIERS[self]
+
+
+_PAGE_MULTIPLIERS: dict["ProcessingMode", int] = {
+    ProcessingMode.BASIC: 1,
+    ProcessingMode.PREMIUM: 10,
+}
+
+
 class EtlRequest(BaseModel):
     file_path: str
     filename: str
     estimated_pages: int = 0
+    processing_mode: ProcessingMode = ProcessingMode.BASIC
 
     @field_validator("filename")
     @classmethod
diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
index fbd2e4e73b..87e8138fdc 100644
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@@ -15,6 +15,9 @@
 class EtlPipelineService:
     """Single pipeline for extracting markdown from files. All callers use this."""
 
+    def __init__(self, *, vision_llm=None):
+        self._vision_llm = vision_llm
+
     async def extract(self, request: EtlRequest) -> EtlResult:
         category = classify_file(request.filename)
 
@@ -47,8 +50,58 @@ async def extract(self, request: EtlRequest) -> EtlResult:
                 content_type="audio",
             )
 
+        if category == FileCategory.IMAGE:
+            return await self._extract_image(request)
+
         return await self._extract_document(request)
 
+    async def _extract_image(self, request: EtlRequest) -> EtlResult:
+        if self._vision_llm:
+            try:
+                from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+                content = await parse_with_vision_llm(
+                    request.file_path, request.filename, self._vision_llm
+                )
+                return EtlResult(
+                    markdown_content=content,
+                    etl_service="VISION_LLM",
+                    content_type="image",
+                )
+            except Exception as exc:
+                # Special-case quota exhaustion so we log a clearer message
+                # — the vision LLM didn't "fail", the user just ran out of
+                # premium credit. Falling through to the document parser
+                # is a graceful degradation: OCR/Unstructured still
+                # extracts text from the image without burning credit.
+                from app.services.billable_calls import QuotaInsufficientError
+
+                if isinstance(exc, QuotaInsufficientError):
+                    logging.info(
+                        "Vision LLM quota exhausted for %s; falling back to document parser",
+                        request.filename,
+                    )
+                else:
+                    logging.warning(
+                        "Vision LLM failed for %s, falling back to document parser",
+                        request.filename,
+                        exc_info=True,
+                    )
+        else:
+            logging.info(
+                "No vision LLM provided, falling back to document parser for %s",
+                request.filename,
+            )
+
+        try:
+            return await self._extract_document(request)
+        except (EtlUnsupportedFileError, EtlServiceUnavailableError):
+            raise EtlUnsupportedFileError(
+                f"Cannot process image {request.filename}: vision LLM "
+                f"{'failed' if self._vision_llm else 'not configured'} and "
+                f"document parser does not support this format"
+            ) from None
+
     async def _extract_document(self, request: EtlRequest) -> EtlResult:
         from pathlib import PurePosixPath
 
@@ -81,12 +134,90 @@ async def _extract_document(self, request: EtlRequest) -> EtlResult:
         else:
             raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
 
+        # When the operator opts into vision-LLM at ingest, walk the
+        # original file's embedded images and append a structured
+        # "Image Content" section. The parser's own OCR (Docling
+        # do_ocr=True, Azure DI prebuilt-read, etc.) handles text-in-
+        # image; this side handles the *visual* description which the
+        # parsers all drop today.
+        content = await self._maybe_append_picture_descriptions(request, content)
+
         return EtlResult(
             markdown_content=content,
             etl_service=etl_service,
             content_type="document",
         )
 
+    async def _maybe_append_picture_descriptions(
+        self, request: EtlRequest, markdown: str
+    ) -> str:
+        if self._vision_llm is None:
+            return markdown
+
+        from app.etl_pipeline.picture_describer import (
+            describe_pictures,
+            merge_descriptions_into_markdown,
+        )
+
+        # Per-image OCR runner: re-feed each extracted image through
+        # the ETL pipeline *as a standalone image* (no vision LLM, so
+        # the IMAGE branch falls through to the document parser, which
+        # OCRs the image with the configured backend -- Docling /
+        # Azure DI / LlamaCloud). This gives us per-image OCR text
+        # attached to the inline image block, in addition to the
+        # page-level OCR that the parser already merges into the main
+        # markdown stream. The fresh sub-service gets vision_llm=None
+        # so this call cannot recurse back into picture_describer.
+        async def _ocr_image(image_path: str, image_name: str) -> str:
+            try:
+                sub = EtlPipelineService(vision_llm=None)
+                ocr_result = await sub.extract(
+                    EtlRequest(file_path=image_path, filename=image_name)
+                )
+            except (
+                EtlUnsupportedFileError,
+                EtlServiceUnavailableError,
+            ) as exc:
+                # Common case: the configured ETL service can't OCR
+                # this image format (or no service is configured at
+                # all). Don't spam warnings -- just no OCR for it.
+                logging.debug("Skipping per-image OCR for %s: %s", image_name, exc)
+                return ""
+            return ocr_result.markdown_content
+
+        try:
+            result = await describe_pictures(
+                request.file_path,
+                request.filename,
+                self._vision_llm,
+                ocr_runner=_ocr_image,
+            )
+        except Exception:
+            # Picture description is additive; never let it fail an
+            # otherwise-successful document extraction.
+            logging.warning(
+                "Picture description failed for %s, returning parser output unchanged",
+                request.filename,
+                exc_info=True,
+            )
+            return markdown
+
+        if not result.descriptions:
+            return markdown
+
+        merged = merge_descriptions_into_markdown(markdown, result)
+        logging.info(
+            "Vision LLM described %d image(s) in %s "
+            "(skipped: %d small / %d large / %d duplicate, %d failed)",
+            len(result.descriptions),
+            request.filename,
+            result.skipped_too_small,
+            result.skipped_too_large,
+            result.skipped_duplicate,
+            result.failed,
+        )
+        return merged
+
     async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
         """Try Azure Document Intelligence first (when configured) then LlamaCloud.
 
@@ -105,13 +236,17 @@ async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
             and getattr(app_config, "AZURE_DI_KEY", None)
         )
 
+        mode_value = request.processing_mode.value
+
         if azure_configured and ext in AZURE_DI_DOCUMENT_EXTENSIONS:
             try:
                 from app.etl_pipeline.parsers.azure_doc_intelligence import (
                     parse_with_azure_doc_intelligence,
                 )
 
-                return await parse_with_azure_doc_intelligence(request.file_path)
+                return await parse_with_azure_doc_intelligence(
+                    request.file_path, processing_mode=mode_value
+                )
             except Exception:
                 logging.warning(
                     "Azure Document Intelligence failed for %s, "
@@ -122,4 +257,6 @@ async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
 
         from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud
 
-        return await parse_with_llamacloud(request.file_path, request.estimated_pages)
+        return await parse_with_llamacloud(
+            request.file_path, request.estimated_pages, processing_mode=mode_value
+        )
diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py
index 4e690bcdc5..120369a273 100644
--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@@ -3,6 +3,7 @@
 
 from app.utils.file_extensions import (
     DOCUMENT_EXTENSIONS,
+    IMAGE_EXTENSIONS,
     get_document_extensions_for_service,
 )
 
@@ -105,6 +106,7 @@ class FileCategory(Enum):
     PLAINTEXT = "plaintext"
     AUDIO = "audio"
     DIRECT_CONVERT = "direct_convert"
+    IMAGE = "image"
     UNSUPPORTED = "unsupported"
     DOCUMENT = "document"
 
@@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
         return FileCategory.AUDIO
     if suffix in DIRECT_CONVERT_EXTENSIONS:
         return FileCategory.DIRECT_CONVERT
+    if suffix in IMAGE_EXTENSIONS:
+        return FileCategory.IMAGE
     if suffix in DOCUMENT_EXTENSIONS:
         return FileCategory.DOCUMENT
     return FileCategory.UNSUPPORTED
@@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
     """Return True if *filename* cannot be processed by *etl_service*.
 
     Plaintext, audio, and direct-convert files are parser-agnostic and never
-    skipped.  Document files are checked against the per-parser extension set.
+    skipped.  Image and document files are checked against the per-parser
+    extension set (images fall back to the document parser when no vision LLM
+    is available, so the same service constraint applies).
     """
     category = classify_file(filename)
     if category == FileCategory.UNSUPPORTED:
         return True
-    if category == FileCategory.DOCUMENT:
+    if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
         suffix = PurePosixPath(filename).suffix.lower()
         return suffix not in get_document_extensions_for_service(etl_service)
     return False
diff --git a/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py b/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py
index b3f9122a00..41de984552 100644
--- a/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py
@@ -10,7 +10,15 @@
 MAX_DELAY = 120
 
 
-async def parse_with_azure_doc_intelligence(file_path: str) -> str:
+AZURE_MODEL_BY_MODE = {
+    "basic": "prebuilt-read",
+    "premium": "prebuilt-layout",
+}
+
+
+async def parse_with_azure_doc_intelligence(
+    file_path: str, processing_mode: str = "basic"
+) -> str:
     from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
     from azure.ai.documentintelligence.models import DocumentContentFormat
     from azure.core.credentials import AzureKeyCredential
@@ -21,9 +29,15 @@ async def parse_with_azure_doc_intelligence(file_path: str) -> str:
         ServiceResponseError,
     )
 
+    model_id = AZURE_MODEL_BY_MODE.get(processing_mode, "prebuilt-read")
     file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
     retryable_exceptions = (ServiceRequestError, ServiceResponseError)
 
+    logging.info(
+        f"Azure Document Intelligence using model={model_id} "
+        f"(mode={processing_mode}, file={file_size_mb:.1f}MB)"
+    )
+
     last_exception = None
     attempt_errors: list[str] = []
 
@@ -36,7 +50,7 @@ async def parse_with_azure_doc_intelligence(file_path: str) -> str:
             async with client:
                 with open(file_path, "rb") as f:
                     poller = await client.begin_analyze_document(
-                        "prebuilt-read",
+                        model_id,
                         body=f,
                         output_content_format=DocumentContentFormat.MARKDOWN,
                     )
@@ -48,10 +62,13 @@ async def parse_with_azure_doc_intelligence(file_path: str) -> str:
                     f"after {len(attempt_errors)} failures"
                 )
 
-            if not result.content:
-                return ""
+            content = result.content or ""
+            if not content.strip():
+                raise RuntimeError(
+                    "Azure Document Intelligence returned empty/whitespace-only content"
+                )
 
-            return result.content
+            return content
 
         except ClientAuthenticationError:
             raise
diff --git a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
index ae2a342343..fbc098ec28 100644
--- a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
@@ -16,9 +16,37 @@
     calculate_upload_timeout,
 )
 
-
-async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
+LLAMA_PARSE_MODE_MAP = {
+    "basic": "parse_page_with_llm",
+    "premium": "parse_page_with_agent",
+}
+
+
+def _extract_content(result) -> str:
+    """Pull markdown text out of whatever object LlamaParse.aparse returns."""
+    if hasattr(result, "get_markdown_documents"):
+        markdown_docs = result.get_markdown_documents(split_by_page=False)
+        if markdown_docs and hasattr(markdown_docs[0], "text"):
+            return markdown_docs[0].text
+        if hasattr(result, "pages") and result.pages:
+            return "\n\n".join(p.md for p in result.pages if hasattr(p, "md") and p.md)
+
+    if isinstance(result, list):
+        if result and hasattr(result[0], "text"):
+            return result[0].text
+        return "\n\n".join(
+            doc.page_content if hasattr(doc, "page_content") else str(doc)
+            for doc in result
+        )
+
+    return str(result)
+
+
+async def parse_with_llamacloud(
+    file_path: str, estimated_pages: int, processing_mode: str = "basic"
+) -> str:
     from llama_cloud_services import LlamaParse
+    from llama_cloud_services.parse.base import JobFailedException
     from llama_cloud_services.parse.utils import ResultType
 
     file_size_bytes = os.path.getsize(file_path)
@@ -34,10 +62,13 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
         pool=120.0,
     )
 
+    parse_mode = LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
+
     logging.info(
         f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
         f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
-        f"job_timeout={job_timeout:.0f}s"
+        f"job_timeout={job_timeout:.0f}s, parse_mode={parse_mode} "
+        f"(mode={processing_mode})"
     )
 
     last_exception = None
@@ -52,6 +83,8 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
                     verbose=True,
                     language="en",
                     result_type=ResultType.MD,
+                    parse_mode=parse_mode,
+                    ignore_errors=False,
                     max_timeout=int(max(2000, job_timeout + upload_timeout)),
                     job_timeout_in_seconds=job_timeout,
                     job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
@@ -65,27 +98,18 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
                         f"{len(attempt_errors)} failures"
                     )
 
-                if hasattr(result, "get_markdown_documents"):
-                    markdown_docs = result.get_markdown_documents(split_by_page=False)
-                    if markdown_docs and hasattr(markdown_docs[0], "text"):
-                        return markdown_docs[0].text
-                    if hasattr(result, "pages") and result.pages:
-                        return "\n\n".join(
-                            p.md for p in result.pages if hasattr(p, "md") and p.md
-                        )
-                    return str(result)
-
-                if isinstance(result, list):
-                    if result and hasattr(result[0], "text"):
-                        return result[0].text
-                    return "\n\n".join(
-                        doc.page_content if hasattr(doc, "page_content") else str(doc)
-                        for doc in result
+                content = _extract_content(result)
+                if not content or not content.strip():
+                    raise RuntimeError(
+                        "LlamaCloud returned empty/whitespace-only content"
                     )
+                return content
 
-                return str(result)
-
-        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
+        except (
+            *LLAMACLOUD_RETRYABLE_EXCEPTIONS,
+            RuntimeError,
+            JobFailedException,
+        ) as e:
             last_exception = e
             error_type = type(e).__name__
             error_msg = str(e)[:200]
diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
new file mode 100644
index 0000000000..14741d6094
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@@ -0,0 +1,116 @@
+import asyncio
+import base64
+import os
+
+from langchain_core.messages import HumanMessage
+
+# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
+# A standalone image IS the document, so we want everything: visual
+# content plus any text the model can read off it. The output is
+# combined markdown that the chunker treats as the full document body.
+_PROMPT = (
+    "Describe this image in markdown. "
+    "Transcribe any visible text verbatim. "
+    "Be concise but complete — let the image content guide the level of detail."
+)
+
+# Per-image-in-PDF prompt. Here the image is *inside* a larger
+# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
+# already running OCR over the whole page — including text rendered
+# into images. So we explicitly tell the model NOT to transcribe text
+# and to focus only on visual interpretation. This avoids paying
+# output tokens for OCR content the ETL pipeline already captured.
+_DESCRIPTION_PROMPT = (
+    "Describe what this image visually depicts in concise markdown. "
+    "Focus on visual content — anatomy, structures, charts, diagrams, "
+    "spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
+    "histology slide), and any clinically or structurally relevant "
+    "findings.\n\n"
+    "Do NOT transcribe text from the image. Any text in the image "
+    "(axis labels, annotations, scale bars, lab values, etc.) is "
+    "already extracted by a separate OCR pipeline; duplicating it "
+    "here would be redundant. Stick to the visual interpretation."
+)
+
+_MAX_IMAGE_BYTES = (
+    5 * 1024 * 1024
+)  # 5 MB (Anthropic Claude's limit, the most restrictive)
+
+_INVOKE_TIMEOUT_SECONDS = 120
+
+_EXT_TO_MIME: dict[str, str] = {
+    ".png": "image/png",
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".gif": "image/gif",
+    ".bmp": "image/bmp",
+    ".tiff": "image/tiff",
+    ".tif": "image/tiff",
+    ".webp": "image/webp",
+    ".svg": "image/svg+xml",
+    ".heic": "image/heic",
+    ".heif": "image/heif",
+}
+
+
+def _image_to_data_url(file_path: str) -> str:
+    file_size = os.path.getsize(file_path)
+    if file_size > _MAX_IMAGE_BYTES:
+        raise ValueError(
+            f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, "
+            f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}"
+        )
+    ext = os.path.splitext(file_path)[1].lower()
+    mime_type = _EXT_TO_MIME.get(ext)
+    if not mime_type:
+        raise ValueError(f"Unsupported image extension {ext!r}: {file_path}")
+    with open(file_path, "rb") as f:
+        encoded = base64.b64encode(f.read()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
+async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
+    message = HumanMessage(
+        content=[
+            {"type": "text", "text": prompt},
+            {"type": "image_url", "image_url": {"url": data_url}},
+        ]
+    )
+    response = await asyncio.wait_for(
+        llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS
+    )
+    text = response.content if hasattr(response, "content") else str(response)
+    if not text or not text.strip():
+        raise ValueError(f"Vision LLM returned empty content for {filename}")
+    return text.strip()
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    """Single-shot: returns combined markdown for a standalone image upload.
+
+    Used when the operator uploads an image file directly (jpg/png/etc).
+    The image is the document, so the prompt asks for both visual
+    description and verbatim text in one go.
+    """
+    data_url = _image_to_data_url(file_path)
+    return await _invoke_vision(llm, _PROMPT, data_url, filename)
+
+
+async def parse_image_for_description(file_path: str, filename: str, llm) -> str:
+    """Visual-description-only call for per-image-in-PDF use.
+
+    Used by ``picture_describer`` when an image is embedded inside a
+    larger document. Returns a markdown description of what the image
+    visually depicts; deliberately does NOT include text-in-image OCR
+    because the ETL service (Docling, Azure DI, LlamaCloud, ...) is
+    already running OCR over the entire page and would duplicate that
+    text content.
+    """
+    data_url = _image_to_data_url(file_path)
+    return await _invoke_vision(llm, _DESCRIPTION_PROMPT, data_url, filename)
+
+
+__all__ = [
+    "parse_image_for_description",
+    "parse_with_vision_llm",
+]
diff --git a/surfsense_backend/app/etl_pipeline/picture_describer.py b/surfsense_backend/app/etl_pipeline/picture_describer.py
new file mode 100644
index 0000000000..263e7201af
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/picture_describer.py
@@ -0,0 +1,674 @@
+"""Extract embedded images from PDFs, describe them, and inject the
+descriptions inline into the parser's markdown.
+
+When the operator passes ``use_vision_llm=True`` for a PDF, the document
+parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
+but mostly drop the actual image content -- a CT scan inside a clinical
+PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
+and the caption text below it.
+
+This module fills that gap. After the document parser produces markdown
+text, we:
+
+1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
+   image (deduped by sha256, size-capped to match the vision LLM's own
+   limits).
+2. Run the vision LLM on each unique image (visual description) and,
+   in parallel when an OCR runner is provided, re-feed the same image
+   through the ETL service for per-image OCR.
+3. **Inject** a horizontal-rule-delimited markdown section -- with
+   named "OCR text" and "Visual description" sub-sections -- where the
+   image actually appears in the parser's markdown. Two splice modes,
+   chosen by which marker the parser emitted:
+
+   - **Replace** Docling-style ``<!-- image -->`` placeholders (and an
+     optional ``Image: <filename>`` caption line). The placeholder
+     carries no useful content of its own, so we substitute our block
+     for it.
+   - **Append after** layout-aware ``<figure>...</figure>`` blocks
+     (Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
+     already contain parser-extracted chart values / OCR'd labels /
+     captions, which are themselves useful for retrieval -- so we
+     PRESERVE the figure verbatim and add our vision-LLM block
+     immediately after it. The chunk then contains both the parser's
+     structured numbers AND the VLM's semantic interpretation.
+
+   Either way, the image content stays in context with the surrounding
+   document body rather than getting orphaned at the end -- crucial for
+   retrieval, where a single chunk should contain the question, the
+   image content, and the answer options together.
+
+If no placeholders, figures, or captions can be matched (e.g. an
+unusual parser output), we fall back to appending an
+``## Image Content`` section so no image content is silently lost.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import hashlib
+import logging
+import re
+import tempfile
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+# Type alias for the OCR callback. Takes (file_path, filename), returns
+# the OCR'd markdown text -- or empty string if no text was found, or
+# raises if OCR failed unrecoverably (which the describer catches and
+# treats as "no OCR for this image" rather than failing the whole doc).
+OcrRunner = Callable[[str, str], Awaitable[str]]
+
+logger = logging.getLogger(__name__)
+
+
+# Bound how many vision LLM calls we make in parallel for a single
+# document. Vision models are typically rate-limited; 4 concurrent
+# calls is a safe default that respects most provider limits while
+# keeping wall-clock manageable for image-heavy PDFs.
+_VISION_CONCURRENCY = 4
+
+# Match parse_with_vision_llm's per-image cap so we don't even attempt
+# images that the vision LLM would reject anyway (Anthropic's 5 MB
+# limit is the most restrictive among the major providers).
+_MAX_IMAGE_BYTES = 5 * 1024 * 1024
+
+# Skip degenerate images: tracking pixels, very small decorative dots,
+# scanner-introduced artefacts. We can't cheaply check pixel dimensions
+# without decoding the image, so we approximate: anything under 1 KB is
+# almost certainly not informative content.
+_MIN_IMAGE_BYTES = 1024
+
+
+@dataclass
+class PictureDescription:
+    """A single extracted image with its visual description and (optionally) OCR.
+
+    Two content fields by design, each produced by the *right* tool:
+
+    - ``description``: the vision LLM's visual interpretation. What the
+      image depicts (anatomy, charts, layout, etc.) -- the semantic
+      content that only a vision model can produce.
+    - ``ocr_text``: text-in-image extracted by re-feeding the image
+      through the configured ETL service (Docling/Azure DI/LlamaCloud)
+      *as if it were a standalone image upload*. Specialist OCR engine,
+      per-image attribution, no vision LLM tokens spent on text. None
+      when no OCR was requested or OCR found no text.
+    """
+
+    page_number: int  # 1-indexed
+    ordinal_in_page: int  # 0-indexed within the page
+    name: str  # name pypdf assigned (e.g. "Im0")
+    sha256: str  # hash of the raw image bytes
+    description: str  # visual description (markdown)
+    ocr_text: str | None = None  # OCR text from the ETL service, if any
+
+
+@dataclass
+class PictureExtractionResult:
+    """Aggregate result of extracting all pictures from a document."""
+
+    descriptions: list[PictureDescription] = field(default_factory=list)
+    skipped_too_small: int = 0
+    skipped_too_large: int = 0
+    skipped_duplicate: int = 0
+    failed: int = 0
+
+    @property
+    def has_content(self) -> bool:
+        return bool(self.descriptions)
+
+
+def _is_pdf(filename: str) -> bool:
+    return filename.lower().endswith(".pdf")
+
+
+def _pick_suffix(name: str) -> str:
+    lower = name.lower()
+    for ext in (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp"):
+        if lower.endswith(ext):
+            return ".jpeg" if ext == ".jpg" else ext
+    return ".png"
+
+
+def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
+    """Pull every embedded image out of a PDF.
+
+    Returns ``(page_number_1_indexed, ordinal_in_page, name, bytes)``.
+    Per-page and per-image failures are logged and skipped -- one bad
+    image must not fail the whole document.
+    """
+
+    from pypdf import PdfReader
+
+    out: list[tuple[int, int, str, bytes]] = []
+    try:
+        reader = PdfReader(file_path)
+    except Exception:
+        logger.warning(
+            "pypdf failed to open %s for image extraction",
+            file_path,
+            exc_info=True,
+        )
+        return out
+
+    for page_idx, page in enumerate(reader.pages):
+        try:
+            images = list(page.images)
+        except Exception:
+            logger.warning(
+                "pypdf failed to enumerate images on page %d of %s",
+                page_idx + 1,
+                file_path,
+                exc_info=True,
+            )
+            continue
+        for img_idx, img in enumerate(images):
+            try:
+                name = getattr(img, "name", None) or f"page{page_idx + 1}_img{img_idx}"
+                data = img.data
+            except Exception:
+                logger.warning(
+                    "pypdf failed to read image %d on page %d of %s",
+                    img_idx,
+                    page_idx + 1,
+                    file_path,
+                    exc_info=True,
+                )
+                continue
+            out.append((page_idx + 1, img_idx, name, data))
+    return out
+
+
+async def _describe_one(
+    page_number: int,
+    ordinal: int,
+    name: str,
+    sha256: str,
+    data: bytes,
+    vision_llm: Any,
+    semaphore: asyncio.Semaphore,
+    ocr_runner: OcrRunner | None,
+) -> PictureDescription | None:
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    suffix = _pick_suffix(name)
+    # NamedTemporaryFile + delete=False because the vision-LLM helper
+    # and the OCR runner each open the path themselves; we clean up in
+    # the finally. Same temp file feeds both, which is correct: vision
+    # LLM and OCR are looking at the same image, just asking different
+    # questions of it.
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp.write(data)
+        tmp_path = tmp.name
+    try:
+        async with semaphore:
+            tasks: list[Awaitable[Any]] = [
+                parse_image_for_description(tmp_path, name, vision_llm),
+            ]
+            if ocr_runner is not None:
+                tasks.append(ocr_runner(tmp_path, name))
+
+            # return_exceptions=True so a failure in one branch (most
+            # often OCR) doesn't poison the other.
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        description_result = results[0]
+        if isinstance(description_result, BaseException):
+            logger.warning(
+                "Vision LLM failed for image %s on page %d, skipping",
+                name,
+                page_number,
+                exc_info=description_result,
+            )
+            return None
+        description = str(description_result)
+
+        ocr_text: str | None = None
+        if ocr_runner is not None and len(results) > 1:
+            ocr_result = results[1]
+            if isinstance(ocr_result, BaseException):
+                logger.warning(
+                    "Per-image OCR failed for image %s on page %d, "
+                    "omitting OCR field for this image",
+                    name,
+                    page_number,
+                    exc_info=ocr_result,
+                )
+            else:
+                stripped = str(ocr_result).strip()
+                # Empty OCR (or whitespace-only) means the OCR engine
+                # found no text in this image. Record that as None so
+                # the rendered block doesn't include a useless empty tag.
+                ocr_text = stripped or None
+    finally:
+        with contextlib.suppress(OSError):
+            Path(tmp_path).unlink()
+
+    return PictureDescription(
+        page_number=page_number,
+        ordinal_in_page=ordinal,
+        name=name,
+        sha256=sha256,
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+async def describe_pictures(
+    file_path: str,
+    filename: str,
+    vision_llm: Any,
+    *,
+    ocr_runner: OcrRunner | None = None,
+) -> PictureExtractionResult:
+    """Extract embedded images from a document and describe each via vision LLM.
+
+    When ``ocr_runner`` is provided, each image is also passed to it
+    (in parallel with the vision LLM) and the returned text is recorded
+    in :attr:`PictureDescription.ocr_text`. The runner is typically a
+    closure over a vision-LLM-less ``EtlPipelineService`` -- this lets
+    the same OCR engine that processes standalone image uploads
+    (Docling/Azure DI/LlamaCloud) also process embedded-in-PDF images,
+    giving per-image OCR attribution alongside the page-level OCR that
+    the parser already does.
+
+    Currently PDF-only. For non-PDF documents this returns an empty
+    result and the caller should leave the parser's markdown untouched.
+    """
+
+    result = PictureExtractionResult()
+    if not _is_pdf(filename) or vision_llm is None:
+        return result
+
+    raw_images = _extract_pdf_images(file_path)
+    if not raw_images:
+        return result
+
+    seen_hashes: set[str] = set()
+    eligible: list[tuple[int, int, str, str, bytes]] = []
+    for page_number, ordinal, name, data in raw_images:
+        if len(data) > _MAX_IMAGE_BYTES:
+            result.skipped_too_large += 1
+            continue
+        if len(data) < _MIN_IMAGE_BYTES:
+            result.skipped_too_small += 1
+            continue
+        sha = hashlib.sha256(data).hexdigest()
+        if sha in seen_hashes:
+            result.skipped_duplicate += 1
+            continue
+        seen_hashes.add(sha)
+        eligible.append((page_number, ordinal, name, sha, data))
+
+    if not eligible:
+        return result
+
+    semaphore = asyncio.Semaphore(_VISION_CONCURRENCY)
+    tasks = [
+        _describe_one(p, o, n, sha, d, vision_llm, semaphore, ocr_runner)
+        for (p, o, n, sha, d) in eligible
+    ]
+    descriptions = await asyncio.gather(*tasks)
+    for desc in descriptions:
+        if desc is None:
+            result.failed += 1
+        else:
+            result.descriptions.append(desc)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Rendering: build the per-image markdown block + inject inline.
+# ---------------------------------------------------------------------------
+
+
+def _format_image_block(
+    name: str,
+    description: str,
+    ocr_text: str | None = None,
+) -> str:
+    """Render the per-image block as a horizontal-rule-delimited section.
+
+    Why no blockquote / no raw HTML / no XML?
+    -----------------------------------------
+    We tried each in turn and each failed in the document viewer:
+
+    - **Raw HTML / XML** (``<image>...</image>``): unknown elements
+      have no render rules in Streamdown or PlateJS, so the content
+      survives in the markdown source but is invisible to humans.
+    - **Blockquote with nested blocks**: nested fenced code blocks,
+      bullet lists, numbered lists, tables -- any *block* element
+      inside a ``>``-prefixed blockquote -- gets evicted by Streamdown
+      / remark, dropping everything after it onto the document level.
+      The vision LLM happily produces bulleted descriptions, so this
+      hit the viewer in practice.
+
+    A horizontal-rule-delimited section, by contrast, contains only
+    standard top-level markdown -- bold labels and free-form body --
+    so the description's native markdown (lists, prose, tables) all
+    renders natively in every renderer.
+
+    Layout (OCR section omitted when ``ocr_text`` is None/empty):
+
+        ---
+
+        **Embedded image:** `MM-130-a.jpeg`
+
+        **OCR text:**
+        Slice 24 / 60
+        L
+        R
+
+        **Visual description:**
+
+        - Axial contrast-enhanced CT showing a large cystic mass...
+        - Mass effect on the adjacent stomach.
+
+        ---
+
+    Still LLM-friendly: the ``**Embedded image:** `<filename>``` prefix
+    is unique and trivially regex-able (``^\\*\\*Embedded image:\\*\\* `(.+?)`$``).
+
+    Returned with leading and trailing blank-line padding so the rules
+    never merge with adjacent paragraphs after splicing.
+    """
+
+    parts: list[str] = [f"**Embedded image:** `{name}`"]
+
+    if ocr_text and ocr_text.strip():
+        # Bold "OCR text:" label with trailing two spaces (=> <br>) so
+        # the first OCR line sits directly under the label rather than
+        # forcing a paragraph break that some renderers would style
+        # differently. Subsequent OCR lines also use trailing two spaces
+        # for hard breaks, so multi-line OCR renders line-by-line
+        # without needing a (fragile) fenced code block.
+        ocr_clean_lines = [
+            ln.rstrip() for ln in ocr_text.strip().splitlines() if ln.strip()
+        ]
+        parts.append("")
+        parts.append("**OCR text:**  ")
+        for i, raw in enumerate(ocr_clean_lines):
+            suffix = "" if i == len(ocr_clean_lines) - 1 else "  "
+            parts.append(f"{raw}{suffix}")
+
+    parts.append("")
+    parts.append("**Visual description:**")
+    parts.append("")
+    parts.append(description.strip())
+
+    body = "\n".join(parts)
+    # Wrap with blank lines + horizontal rules so the block is clearly
+    # delimited from surrounding paragraphs and survives splicing into
+    # the middle of any markdown stream.
+    return "\n\n---\n\n" + body + "\n\n---\n\n"
+
+
+# Patterns we'll try to splice into. Each pattern captures the
+# original-PDF filename when one is available (group 1).
+#
+# Replace-style markers (the matched span is substituted with our block
+# because it carries no useful content of its own):
+#
+# 1. Docling's image placeholder followed by an "Image: <filename>"
+#    caption line. This is what our medxpertqa renderer produces:
+#    reportlab places the JPEG, then a caption, and Docling outputs
+#    the placeholder + caption.
+# 2. Docling's image placeholder alone (filename unknown -- we fall
+#    back to pypdf's name).
+# 3. A bare "Image: <filename>" caption line with no preceding
+#    placeholder. Rare in practice, but covers parsers that drop the
+#    placeholder entirely.
+_PLACEHOLDER_WITH_CAPTION = re.compile(
+    r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
+    re.IGNORECASE,
+)
+_PLACEHOLDER_ONLY = re.compile(
+    r"<!--\s*image\s*-->",
+    re.IGNORECASE,
+)
+_CAPTION_ONLY = re.compile(
+    r"^[ \t]*Image:\s*(\S+)\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+# Append-after marker (the matched span is preserved verbatim and our
+# block is inserted immediately after it):
+#
+# 4. ``<figure>...</figure>`` as emitted by layout-aware parsers (Azure
+#    Document Intelligence ``prebuilt-layout``, LlamaCloud premium).
+#    The figure's own contents -- chart bar values, axis labels,
+#    inline ``<figcaption>``, embedded ``<table>`` for tabular figures
+#    -- are themselves specialist OCR output, so we keep them and add
+#    our vision-LLM block alongside. ``[^>]*`` in the open tag tolerates
+#    optional attributes like ``<figure id="...">``; ``re.DOTALL``
+#    lets ``.`` cross the newlines inside the block.
+_FIGURE_BLOCK = re.compile(
+    r"<figure\b[^>]*>.*?</figure>",
+    re.DOTALL | re.IGNORECASE,
+)
+
+
+def _replace_one_match(
+    markdown: str,
+    pattern: re.Pattern[str],
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Replace the first occurrence of ``pattern`` with the next image block.
+
+    Returns the new markdown and the new ``desc_idx`` (advanced if a
+    replacement happened, unchanged otherwise).
+    """
+
+    if desc_idx >= len(descriptions):
+        return markdown, desc_idx
+
+    match = pattern.search(markdown)
+    if not match:
+        return markdown, desc_idx
+
+    desc = descriptions[desc_idx]
+    captured_name: str | None = None
+    if match.groups():
+        captured_name = match.group(1)
+    name = captured_name or desc.name
+    block = _format_image_block(name, desc.description, desc.ocr_text)
+
+    new_markdown = markdown[: match.start()] + block + markdown[match.end() :]
+    return new_markdown, desc_idx + 1
+
+
+def _splice_after_figures(
+    markdown: str,
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Append vision-LLM blocks immediately after each ``<figure>...</figure>``.
+
+    Layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
+    premium) wrap each figure / chart / inline table in this tag and
+    carry their own OCR of the figure's text content inside it. That
+    content is useful on its own, so we keep the original block
+    verbatim and add our vision-LLM block right after it -- giving
+    retrieval both signals in the same chunk.
+
+    Descriptions are matched to figures in document order (first
+    description -> first figure, etc.). All splice points are computed
+    upfront with :func:`re.finditer` and applied in REVERSE order so
+    earlier offsets stay valid as the markdown grows. Returns the
+    advanced ``desc_idx`` for the caller's leftover-handling.
+    """
+
+    if desc_idx >= len(descriptions):
+        return markdown, desc_idx
+
+    matches = list(_FIGURE_BLOCK.finditer(markdown))
+    if not matches:
+        return markdown, desc_idx
+
+    n_to_splice = min(len(matches), len(descriptions) - desc_idx)
+    if n_to_splice <= 0:
+        return markdown, desc_idx
+
+    out = markdown
+    # Walk in reverse so each splice's end-offset still points at the
+    # right place in the (still-mutating) string.
+    for i in range(n_to_splice - 1, -1, -1):
+        match = matches[i]
+        desc = descriptions[desc_idx + i]
+        block = _format_image_block(desc.name, desc.description, desc.ocr_text)
+        out = out[: match.end()] + block + out[match.end() :]
+
+    return out, desc_idx + n_to_splice
+
+
+def inject_descriptions_inline(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> tuple[str, int]:
+    """Splice per-image markdown blocks into the document at image positions.
+
+    Walks the markdown left-to-right, consuming descriptions in order.
+    Tries two splicing strategies, in this order:
+
+    1. **Append-after** for ``<figure>...</figure>`` blocks emitted by
+       layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
+       premium). The figure block carries the parser's own OCR of the
+       figure -- we preserve it and add our vision-LLM block right
+       after.
+    2. **Replace** for Docling-style markers, in priority order:
+
+       - ``<!-- image -->`` followed by ``Image: <filename>`` caption,
+       - ``<!-- image -->`` placeholder alone,
+       - bare ``Image: <filename>`` caption.
+
+    A document typically uses one style or the other (depending on
+    which parser produced its markdown), so the two paths don't fight
+    each other in practice. When they do co-occur, figures are
+    consumed first.
+
+    Returns ``(new_markdown, n_inlined)`` -- the count of descriptions
+    that were placed inline. The caller decides what to do with any
+    leftover descriptions (typically: append them at the end).
+    """
+
+    if not result.descriptions:
+        return markdown, 0
+
+    descriptions = result.descriptions
+    desc_idx = 0
+    out = markdown
+
+    # Step 1: layout-aware figures. One-shot batch -- finds ALL
+    # <figure> blocks, splices in document order until we exhaust
+    # either side.
+    out, desc_idx = _splice_after_figures(out, descriptions, desc_idx)
+
+    # Step 2: Docling-style replacement markers. One match per
+    # iteration, so a doc that has both a figure (consumed above) and
+    # a Docling placeholder (consumed below) still works.
+    while desc_idx < len(descriptions):
+        before_idx = desc_idx
+        out, desc_idx = _replace_one_match(
+            out, _PLACEHOLDER_WITH_CAPTION, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        out, desc_idx = _replace_one_match(
+            out, _PLACEHOLDER_ONLY, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        out, desc_idx = _replace_one_match(out, _CAPTION_ONLY, descriptions, desc_idx)
+        if desc_idx > before_idx:
+            continue
+        # No more positions to splice into.
+        break
+
+    return out, desc_idx
+
+
+def render_appended_section(
+    descriptions: list[PictureDescription],
+    *,
+    skip_notes: PictureExtractionResult | None = None,
+    heading: str = "## Image Content (vision-LLM extracted)",
+) -> str:
+    """Render leftover descriptions as an appended section.
+
+    Used as a fallback when not every description could be inlined
+    (either because the parser produced no detectable image markers,
+    or because there were more extracted images than markers).
+    """
+
+    if not descriptions and not skip_notes:
+        return ""
+
+    parts: list[str] = ["", heading, ""]
+    for desc in descriptions:
+        parts.append(_format_image_block(desc.name, desc.description, desc.ocr_text))
+        parts.append("")
+
+    if skip_notes is not None:
+        notes: list[str] = []
+        if skip_notes.skipped_too_large:
+            notes.append(f"{skip_notes.skipped_too_large} too large (> 5 MB)")
+        if skip_notes.skipped_too_small:
+            notes.append(f"{skip_notes.skipped_too_small} too small (< 1 KB)")
+        if skip_notes.skipped_duplicate:
+            notes.append(f"{skip_notes.skipped_duplicate} duplicate")
+        if skip_notes.failed:
+            notes.append(f"{skip_notes.failed} failed")
+        if notes:
+            parts.append(f"_Note: {', '.join(notes)} image(s) skipped._")
+
+    return "\n".join(parts)
+
+
+def merge_descriptions_into_markdown(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> str:
+    """Top-level: inline what we can, append what's left over.
+
+    This is the function the ETL pipeline actually calls. It guarantees
+    that no successfully-described image is silently dropped: anything
+    we can't splice inline gets appended at the end with a heading
+    that makes it clear those came from the document but weren't
+    location-matched.
+    """
+
+    if not result.descriptions:
+        return markdown
+
+    new_markdown, n_inlined = inject_descriptions_inline(markdown, result)
+    leftover = result.descriptions[n_inlined:]
+
+    if not leftover:
+        return new_markdown
+
+    # Distinguish in the heading whether NONE were inlined (parser
+    # produced no markers at all) vs SOME (mismatched count).
+    heading = (
+        "## Image Content (vision-LLM extracted)"
+        if n_inlined == 0
+        else "## Image Content (additional, no inline marker found)"
+    )
+    section = render_appended_section(leftover, heading=heading)
+    if not section:
+        return new_markdown
+    return f"{new_markdown.rstrip()}\n\n{section.lstrip()}\n"
+
+
+__all__ = [
+    "PictureDescription",
+    "PictureExtractionResult",
+    "describe_pictures",
+    "inject_descriptions_inline",
+    "merge_descriptions_into_markdown",
+    "render_appended_section",
+]
diff --git a/surfsense_backend/app/exceptions.py b/surfsense_backend/app/exceptions.py
new file mode 100644
index 0000000000..183781cd41
--- /dev/null
+++ b/surfsense_backend/app/exceptions.py
@@ -0,0 +1,104 @@
+"""Structured error hierarchy for SurfSense.
+
+Every error response follows a backward-compatible contract:
+
+    {
+      "error": {
+        "code": "SOME_ERROR_CODE",
+        "message": "Human-readable, client-safe message.",
+        "status": 422,
+        "request_id": "req_...",
+        "timestamp": "2026-04-14T12:00:00Z",
+        "report_url": "https://github.com/MODSetter/SurfSense/issues"
+      },
+      "detail": "Human-readable, client-safe message."   # legacy compat
+    }
+"""
+
+from __future__ import annotations
+
+ISSUES_URL = "https://github.com/MODSetter/SurfSense/issues"
+
+GENERIC_5XX_MESSAGE = (
+    "An internal error occurred. Please try again or report this issue if it persists."
+)
+
+
+class SurfSenseError(Exception):
+    """Base exception that global handlers translate into the structured envelope."""
+
+    def __init__(
+        self,
+        message: str = GENERIC_5XX_MESSAGE,
+        *,
+        code: str = "INTERNAL_ERROR",
+        status_code: int = 500,
+        safe_for_client: bool = True,
+    ) -> None:
+        super().__init__(message)
+        self.message = message
+        self.code = code
+        self.status_code = status_code
+        self.safe_for_client = safe_for_client
+
+
+class ConnectorError(SurfSenseError):
+    def __init__(self, message: str, *, code: str = "CONNECTOR_ERROR") -> None:
+        super().__init__(message, code=code, status_code=502)
+
+
+class DatabaseError(SurfSenseError):
+    def __init__(
+        self,
+        message: str = "A database error occurred.",
+        *,
+        code: str = "DATABASE_ERROR",
+    ) -> None:
+        super().__init__(message, code=code, status_code=500)
+
+
+class ConfigurationError(SurfSenseError):
+    def __init__(
+        self,
+        message: str = "A configuration error occurred.",
+        *,
+        code: str = "CONFIGURATION_ERROR",
+    ) -> None:
+        super().__init__(message, code=code, status_code=500)
+
+
+class ExternalServiceError(SurfSenseError):
+    def __init__(
+        self,
+        message: str = "An external service is unavailable.",
+        *,
+        code: str = "EXTERNAL_SERVICE_ERROR",
+    ) -> None:
+        super().__init__(message, code=code, status_code=502)
+
+
+class NotFoundError(SurfSenseError):
+    def __init__(
+        self,
+        message: str = "The requested resource was not found.",
+        *,
+        code: str = "NOT_FOUND",
+    ) -> None:
+        super().__init__(message, code=code, status_code=404)
+
+
+class ForbiddenError(SurfSenseError):
+    def __init__(
+        self,
+        message: str = "You don't have permission to access this resource.",
+        *,
+        code: str = "FORBIDDEN",
+    ) -> None:
+        super().__init__(message, code=code, status_code=403)
+
+
+class ValidationError(SurfSenseError):
+    def __init__(
+        self, message: str = "Validation failed.", *, code: str = "VALIDATION_ERROR"
+    ) -> None:
+        super().__init__(message, code=code, status_code=422)
diff --git a/surfsense_backend/app/indexing_pipeline/document_chunker.py b/surfsense_backend/app/indexing_pipeline/document_chunker.py
index 4f3c698ef3..6ae81b7a85 100644
--- a/surfsense_backend/app/indexing_pipeline/document_chunker.py
+++ b/surfsense_backend/app/indexing_pipeline/document_chunker.py
@@ -1,5 +1,15 @@
+import re
+
 from app.config import config
 
+# Regex that matches a Markdown table block (header + separator + one or more rows)
+# A table block starts with a | at the beginning of a line and ends when a
+# non-table line (or end of string) is encountered.
+_TABLE_BLOCK_RE = re.compile(
+    r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
+    re.MULTILINE,
+)
+
 
 def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
     """Chunk a text string using the configured chunker and return the chunk texts."""
@@ -7,3 +17,43 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
         config.code_chunker_instance if use_code_chunker else config.chunker_instance
     )
     return [c.text for c in chunker.chunk(text)]
+
+
+def chunk_text_hybrid(text: str) -> list[str]:
+    """Table-aware chunker that prevents Markdown tables from being split mid-row.
+
+    Algorithm:
+    1. Scan the document for Markdown table blocks.
+    2. Each table block is emitted as a single, unmodified chunk so that its
+       header, separator row, and data rows always stay together.
+    3. The non-table prose segments between (and around) tables are passed through
+       the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
+       document order.
+
+    This ensures that table data is never sliced in the middle by the token-based
+    chunker, which would otherwise produce garbled rows that are useless for RAG.
+
+    Fixes #1334.
+    """
+    chunks: list[str] = []
+    cursor = 0
+
+    for match in _TABLE_BLOCK_RE.finditer(text):
+        # Prose before this table
+        prose = text[cursor : match.start()].strip()
+        if prose:
+            chunks.extend(chunk_text(prose))
+
+        # The table itself is kept as one indivisible chunk
+        table_block = match.group(0).strip()
+        if table_block:
+            chunks.append(table_block)
+
+        cursor = match.end()
+
+    # Remaining prose after the last table (or entire text if no tables)
+    trailing = text[cursor:].strip()
+    if trailing:
+        chunks.extend(chunk_text(trailing))
+
+    return chunks
diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
index 22c552e5ca..2339647ea6 100644
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@@ -19,7 +19,7 @@
     DocumentType,
 )
 from app.indexing_pipeline.connector_document import ConnectorDocument
-from app.indexing_pipeline.document_chunker import chunk_text
+from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
 from app.indexing_pipeline.document_embedder import embed_texts
 from app.indexing_pipeline.document_hashing import (
     compute_content_hash,
@@ -387,11 +387,19 @@ async def index(
             )
 
             t_step = time.perf_counter()
-            chunk_texts = await asyncio.to_thread(
-                chunk_text,
-                connector_doc.source_markdown,
-                use_code_chunker=connector_doc.should_use_code_chunker,
-            )
+            if connector_doc.should_use_code_chunker:
+                chunk_texts = await asyncio.to_thread(
+                    chunk_text,
+                    connector_doc.source_markdown,
+                    use_code_chunker=True,
+                )
+            else:
+                # Use the table-aware hybrid chunker so Markdown tables are not
+                # split mid-row (see issue #1334).
+                chunk_texts = await asyncio.to_thread(
+                    chunk_text_hybrid,
+                    connector_doc.source_markdown,
+                )
 
             texts_to_embed = [content, *chunk_texts]
             embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)
@@ -422,6 +430,8 @@ async def index(
             )
             log_index_success(ctx, chunk_count=len(chunks))
 
+            await self._enqueue_ai_sort_if_enabled(document)
+
         except RETRYABLE_LLM_ERRORS as e:
             log_retryable_llm_error(ctx, e)
             await rollback_and_persist_failure(
@@ -457,6 +467,29 @@ async def index(
 
         return document
 
+    async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
+        """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
+        try:
+            from app.db import SearchSpace
+
+            result = await self.session.execute(
+                select(SearchSpace.ai_file_sort_enabled).where(
+                    SearchSpace.id == document.search_space_id
+                )
+            )
+            enabled = result.scalar()
+            if not enabled:
+                return
+
+            from app.tasks.celery_tasks.document_tasks import ai_sort_document_task
+
+            user_id = str(document.created_by_id) if document.created_by_id else ""
+            ai_sort_document_task.delay(document.search_space_id, user_id, document.id)
+        except Exception:
+            logging.getLogger(__name__).warning(
+                "Failed to enqueue AI sort for document %s", document.id, exc_info=True
+            )
+
     async def index_batch_parallel(
         self,
         connector_docs: list[ConnectorDocument],
diff --git a/surfsense_backend/app/observability/__init__.py b/surfsense_backend/app/observability/__init__.py
new file mode 100644
index 0000000000..dbf082561b
--- /dev/null
+++ b/surfsense_backend/app/observability/__init__.py
@@ -0,0 +1,7 @@
+"""SurfSense observability surface.
+
+The single user-visible API right now is :mod:`otel`, which exposes a
+small wrapper around the optional ``opentelemetry`` instrumentation. The
+wrapper is a no-op when OTEL is not configured, so importing it from
+performance-critical paths is safe.
+"""
diff --git a/surfsense_backend/app/observability/otel.py b/surfsense_backend/app/observability/otel.py
new file mode 100644
index 0000000000..6791ab499a
--- /dev/null
+++ b/surfsense_backend/app/observability/otel.py
@@ -0,0 +1,314 @@
+"""
+OpenTelemetry instrumentation helpers for the SurfSense agent stack.
+
+Goals
+=====
+
+- Provide one tiny, ergonomic API for the spans we care about
+  (``tool.call``, ``model.call``, ``kb.search``, ``kb.persist``,
+  ``compaction.run``, ``interrupt.raised``, ``permission.asked``).
+- Keep span **names** low-cardinality (``tool.call`` rather than
+  ``tool.call.<name>``); tool name lives in the ``tool.name`` attribute
+  so dashboards aggregate cleanly.
+- Default to **no-op** behavior unless ``OTEL_EXPORTER_OTLP_ENDPOINT`` is
+  set, OR an external SDK has installed a real ``TracerProvider`` already
+  (e.g. via the ``opentelemetry-instrument`` agent).
+- Coexist with LangSmith: we never disable LangSmith tracing; we add OTel
+  alongside.
+- Gracefully degrade if the ``opentelemetry-api`` package is missing.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+import os
+from collections.abc import Iterator
+from contextlib import contextmanager
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# -----------------------------------------------------------------------------
+# Lazy/optional OpenTelemetry import
+# -----------------------------------------------------------------------------
+
+try:
+    from opentelemetry import trace as _ot_trace
+    from opentelemetry.trace import (
+        Span as _OtSpan,
+        Status as _OtStatus,
+        StatusCode as _OtStatusCode,
+    )
+
+    _OTEL_AVAILABLE = True
+except ImportError:  # pragma: no cover — optional dep
+    _ot_trace = None  # type: ignore[assignment]
+    _OtSpan = Any  # type: ignore[assignment, misc]
+    _OtStatus = Any  # type: ignore[assignment, misc]
+    _OtStatusCode = Any  # type: ignore[assignment, misc]
+    _OTEL_AVAILABLE = False
+
+
+_INSTRUMENTATION_NAME = "surfsense.new_chat"
+_INSTRUMENTATION_VERSION = "0.1.0"
+
+
+# -----------------------------------------------------------------------------
+# Configuration
+# -----------------------------------------------------------------------------
+
+
+def _resolve_enabled() -> bool:
+    """Return True if OTel spans should actually be emitted."""
+    if not _OTEL_AVAILABLE:
+        return False
+    # Honor an explicit kill-switch first.
+    if os.environ.get("SURFSENSE_DISABLE_OTEL", "").lower() in {"1", "true", "yes"}:
+        return False
+    # Treat a configured endpoint as the canonical "OTel is wired up" signal.
+    if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
+        return True
+    # Or honor an external SDK that already installed a non-default TracerProvider.
+    if _ot_trace is not None:
+        try:
+            provider = _ot_trace.get_tracer_provider()
+            # The default proxy provider has no real exporter wired up.
+            type_name = type(provider).__name__
+            if type_name not in {"ProxyTracerProvider", "NoOpTracerProvider"}:
+                return True
+        except Exception:  # pragma: no cover — defensive
+            return False
+    return False
+
+
+_ENABLED: bool = _resolve_enabled()
+
+
+def is_enabled() -> bool:
+    """Return True if instrumentation is actively emitting spans."""
+    return _ENABLED
+
+
+def _get_tracer():
+    if not _OTEL_AVAILABLE:
+        return None
+    try:
+        return _ot_trace.get_tracer(_INSTRUMENTATION_NAME, _INSTRUMENTATION_VERSION)
+    except Exception:  # pragma: no cover — defensive
+        return None
+
+
+# -----------------------------------------------------------------------------
+# No-op span used when OTel is disabled (avoids a None check at every call site)
+# -----------------------------------------------------------------------------
+
+
+class _NoopSpan:
+    """A lightweight stand-in that mimics the subset of ``Span`` we use."""
+
+    def set_attribute(self, key: str, value: Any) -> None:
+        return None
+
+    def set_attributes(self, attributes: dict[str, Any]) -> None:
+        return None
+
+    def add_event(self, name: str, attributes: dict[str, Any] | None = None) -> None:
+        return None
+
+    def record_exception(self, exception: BaseException) -> None:
+        return None
+
+    def set_status(self, status: Any) -> None:
+        return None
+
+
+# -----------------------------------------------------------------------------
+# Public span helpers
+# -----------------------------------------------------------------------------
+
+
+@contextmanager
+def span(
+    name: str,
+    *,
+    attributes: dict[str, Any] | None = None,
+) -> Iterator[Any]:
+    """Generic span context manager.
+
+    Yields the underlying span (or a :class:`_NoopSpan` when disabled)
+    so callers can attach attributes/events incrementally.
+
+    On exception, the span records the error via :meth:`record_exception`
+    and sets ``StatusCode.ERROR``; the exception is then re-raised.
+    """
+    if not _ENABLED:
+        yield _NoopSpan()
+        return
+
+    tracer = _get_tracer()
+    if tracer is None:  # pragma: no cover — defensive
+        yield _NoopSpan()
+        return
+
+    with tracer.start_as_current_span(name) as sp:
+        if attributes:
+            with contextlib.suppress(Exception):  # pragma: no cover — defensive
+                sp.set_attributes(attributes)
+        try:
+            yield sp
+        except BaseException as exc:
+            with contextlib.suppress(Exception):  # pragma: no cover — defensive
+                sp.record_exception(exc)
+                sp.set_status(_OtStatus(_OtStatusCode.ERROR, str(exc)))
+            raise
+
+
+# -----------------------------------------------------------------------------
+# Domain-specific shortcuts (mirror the plan's enumerated span list)
+# -----------------------------------------------------------------------------
+
+
+def tool_call_span(
+    tool_name: str,
+    *,
+    input_size: int | None = None,
+    extra: dict[str, Any] | None = None,
+):
+    """Span for an individual tool execution.
+
+    Span name is the constant ``tool.call`` (low-cardinality); the tool
+    identifier lives in the ``tool.name`` attribute.
+    """
+    attrs: dict[str, Any] = {"tool.name": tool_name}
+    if input_size is not None:
+        attrs["tool.input.size"] = int(input_size)
+    if extra:
+        attrs.update(extra)
+    return span("tool.call", attributes=attrs)
+
+
+def model_call_span(
+    *,
+    model_id: str | None = None,
+    provider: str | None = None,
+    extra: dict[str, Any] | None = None,
+):
+    """Span around a single ``astream`` / ``ainvoke`` call to the LLM."""
+    attrs: dict[str, Any] = {}
+    if model_id:
+        attrs["model.id"] = model_id
+    if provider:
+        attrs["model.provider"] = provider
+    if extra:
+        attrs.update(extra)
+    return span("model.call", attributes=attrs)
+
+
+def kb_search_span(
+    *,
+    search_space_id: int | None = None,
+    query_chars: int | None = None,
+    extra: dict[str, Any] | None = None,
+):
+    """Span around knowledge-base search routines."""
+    attrs: dict[str, Any] = {}
+    if search_space_id is not None:
+        attrs["search_space.id"] = int(search_space_id)
+    if query_chars is not None:
+        attrs["query.chars"] = int(query_chars)
+    if extra:
+        attrs.update(extra)
+    return span("kb.search", attributes=attrs)
+
+
+def kb_persist_span(
+    *,
+    document_type: str | None = None,
+    document_id: int | None = None,
+    extra: dict[str, Any] | None = None,
+):
+    """Span around knowledge-base persistence operations (NOTE/EXTENSION/FILE)."""
+    attrs: dict[str, Any] = {}
+    if document_type:
+        attrs["document.type"] = document_type
+    if document_id is not None:
+        attrs["document.id"] = int(document_id)
+    if extra:
+        attrs.update(extra)
+    return span("kb.persist", attributes=attrs)
+
+
+def compaction_span(
+    *,
+    reason: str | None = None,
+    messages_in: int | None = None,
+    extra: dict[str, Any] | None = None,
+):
+    """Span around the compaction (summarization) middleware run."""
+    attrs: dict[str, Any] = {}
+    if reason:
+        attrs["compaction.reason"] = reason
+    if messages_in is not None:
+        attrs["compaction.messages.in"] = int(messages_in)
+    if extra:
+        attrs.update(extra)
+    return span("compaction.run", attributes=attrs)
+
+
+def interrupt_span(
+    *,
+    interrupt_type: str,
+    extra: dict[str, Any] | None = None,
+):
+    """Span recording an interrupt being raised (HITL or permission_ask)."""
+    attrs: dict[str, Any] = {"interrupt.type": interrupt_type}
+    if extra:
+        attrs.update(extra)
+    return span("interrupt.raised", attributes=attrs)
+
+
+def permission_asked_span(
+    *,
+    permission: str,
+    pattern: str | None = None,
+    extra: dict[str, Any] | None = None,
+):
+    """Span recording a permission ask (PermissionMiddleware)."""
+    attrs: dict[str, Any] = {"permission.permission": permission}
+    if pattern:
+        attrs["permission.pattern"] = pattern
+    if extra:
+        attrs.update(extra)
+    return span("permission.asked", attributes=attrs)
+
+
+# -----------------------------------------------------------------------------
+# Test/utility hooks
+# -----------------------------------------------------------------------------
+
+
+def reload_for_tests() -> bool:
+    """Re-evaluate :data:`_ENABLED` from the current environment.
+
+    Tests that toggle ``OTEL_EXPORTER_OTLP_ENDPOINT`` or
+    ``SURFSENSE_DISABLE_OTEL`` can call this to reset cached state.
+    Returns the new value of :func:`is_enabled`.
+    """
+    global _ENABLED
+    _ENABLED = _resolve_enabled()
+    return _ENABLED
+
+
+__all__ = [
+    "compaction_span",
+    "interrupt_span",
+    "is_enabled",
+    "kb_persist_span",
+    "kb_search_span",
+    "model_call_span",
+    "permission_asked_span",
+    "reload_for_tests",
+    "span",
+    "tool_call_span",
+]
diff --git a/surfsense_backend/app/prompts/system_defaults.py b/surfsense_backend/app/prompts/system_defaults.py
index aaf9b64bd3..cc2019b8fd 100644
--- a/surfsense_backend/app/prompts/system_defaults.py
+++ b/surfsense_backend/app/prompts/system_defaults.py
@@ -71,4 +71,11 @@
         "prompt": "Search the web for information about:\n\n{selection}",
         "mode": "explore",
     },
+    {
+        "slug": "build-resume",
+        "version": 1,
+        "name": "Build Resume",
+        "prompt": "Build me a professional resume. Here is my information:\n\n{selection}",
+        "mode": "explore",
+    },
 ]
diff --git a/surfsense_backend/app/rate_limiter.py b/surfsense_backend/app/rate_limiter.py
new file mode 100644
index 0000000000..0a71e3961e
--- /dev/null
+++ b/surfsense_backend/app/rate_limiter.py
@@ -0,0 +1,35 @@
+"""Shared SlowAPI limiter instance used by app.py and route modules."""
+
+from __future__ import annotations
+
+from limits.storage import MemoryStorage
+from slowapi import Limiter
+from starlette.requests import Request
+
+from app.config import config
+
+
+def get_real_client_ip(request: Request) -> str:
+    """Extract the real client IP behind Cloudflare / reverse proxies.
+
+    Priority: CF-Connecting-IP > X-Real-IP > X-Forwarded-For (first entry) > socket peer.
+    """
+    cf_ip = request.headers.get("cf-connecting-ip")
+    if cf_ip:
+        return cf_ip.strip()
+    real_ip = request.headers.get("x-real-ip")
+    if real_ip:
+        return real_ip.strip()
+    forwarded = request.headers.get("x-forwarded-for")
+    if forwarded:
+        return forwarded.split(",")[0].strip()
+    return request.client.host if request.client else "127.0.0.1"
+
+
+limiter = Limiter(
+    key_func=get_real_client_ip,
+    storage_uri=config.REDIS_APP_URL,
+    default_limits=["1024/minute"],
+    in_memory_fallback_enabled=True,
+    in_memory_fallback=[MemoryStorage()],
+)
diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py
index 02367606ba..5b6a74376a 100644
--- a/surfsense_backend/app/routes/__init__.py
+++ b/surfsense_backend/app/routes/__init__.py
@@ -1,9 +1,12 @@
 from fastapi import APIRouter
 
+from .agent_action_log_route import router as agent_action_log_router
+from .agent_flags_route import router as agent_flags_router
+from .agent_permissions_route import router as agent_permissions_router
+from .agent_revert_route import router as agent_revert_router
 from .airtable_add_connector_route import (
     router as airtable_add_connector_router,
 )
-from .autocomplete_routes import router as autocomplete_router
 from .chat_comments_routes import router as chat_comments_router
 from .circleback_webhook_route import router as circleback_webhook_router
 from .clickup_add_connector_route import router as clickup_add_connector_router
@@ -13,6 +16,7 @@
 from .documents_routes import router as documents_router
 from .dropbox_add_connector_route import router as dropbox_add_connector_router
 from .editor_routes import router as editor_router
+from .export_routes import router as export_router
 from .folders_routes import router as folders_router
 from .google_calendar_add_connector_route import (
     router as google_calendar_add_connector_router,
@@ -29,12 +33,15 @@
 from .linear_add_connector_route import router as linear_add_connector_router
 from .logs_routes import router as logs_router
 from .luma_add_connector_route import router as luma_add_connector_router
+from .mcp_oauth_route import router as mcp_oauth_router
+from .memory_routes import router as memory_router
 from .model_list_routes import router as model_list_router
 from .new_chat_routes import router as new_chat_router
 from .new_llm_config_routes import router as new_llm_config_router
 from .notes_routes import router as notes_router
 from .notifications_routes import router as notifications_router
 from .notion_add_connector_route import router as notion_add_connector_router
+from .obsidian_plugin_routes import router as obsidian_plugin_router
 from .onedrive_add_connector_route import router as onedrive_add_connector_router
 from .podcasts_routes import router as podcasts_router
 from .prompts_routes import router as prompts_router
@@ -57,10 +64,17 @@
 router.include_router(search_spaces_router)
 router.include_router(rbac_router)  # RBAC routes for roles, members, invites
 router.include_router(editor_router)
+router.include_router(export_router)
 router.include_router(documents_router)
 router.include_router(folders_router)
 router.include_router(notes_router)
 router.include_router(new_chat_router)  # Chat with assistant-ui persistence
+router.include_router(agent_revert_router)  # POST /threads/{id}/revert/{action_id}
+router.include_router(agent_action_log_router)  # GET /threads/{id}/actions
+router.include_router(
+    agent_permissions_router
+)  # CRUD for /searchspaces/{id}/agent/permissions/rules
+router.include_router(agent_flags_router)  # GET /agent/flags
 router.include_router(sandbox_router)  # Sandbox file downloads (Daytona)
 router.include_router(chat_comments_router)
 router.include_router(podcasts_router)  # Podcast task status and audio
@@ -81,6 +95,7 @@
 router.include_router(slack_add_connector_router)
 router.include_router(teams_add_connector_router)
 router.include_router(onedrive_add_connector_router)
+router.include_router(obsidian_plugin_router)  # Obsidian plugin push API
 router.include_router(discord_add_connector_router)
 router.include_router(jira_add_connector_router)
 router.include_router(confluence_add_connector_router)
@@ -92,10 +107,13 @@
 router.include_router(circleback_webhook_router)  # Circleback meeting webhooks
 router.include_router(surfsense_docs_router)  # Surfsense documentation for citations
 router.include_router(notifications_router)  # Notifications with Zero sync
+router.include_router(
+    mcp_oauth_router
+)  # MCP OAuth 2.1 for Linear, Jira, ClickUp, Slack, Airtable
 router.include_router(composio_router)  # Composio OAuth and toolkit management
 router.include_router(public_chat_router)  # Public chat sharing and cloning
 router.include_router(incentive_tasks_router)  # Incentive tasks for earning free pages
 router.include_router(stripe_router)  # Stripe checkout for additional page packs
 router.include_router(youtube_router)  # YouTube playlist resolution
 router.include_router(prompts_router)
-router.include_router(autocomplete_router)  # Lightweight autocomplete with KB context
+router.include_router(memory_router)  # User personal memory (memory.md style)
diff --git a/surfsense_backend/app/routes/agent_action_log_route.py b/surfsense_backend/app/routes/agent_action_log_route.py
new file mode 100644
index 0000000000..2608aa3b10
--- /dev/null
+++ b/surfsense_backend/app/routes/agent_action_log_route.py
@@ -0,0 +1,195 @@
+"""``GET /api/threads/{thread_id}/actions``: list agent action-log entries.
+
+Pairs with ``POST /api/threads/{thread_id}/revert/{action_id}`` (see
+``agent_revert_route.py``). The action log is the read-side surface for
+the audit/undo UI: it returns a paginated list of every tool call
+recorded by :class:`ActionLogMiddleware` against the thread, plus
+metadata about whether the action is reversible and whether it has
+already been reverted.
+
+The route is gated by the same ``SURFSENSE_ENABLE_ACTION_LOG`` flag that
+controls the middleware. When the flag is off the endpoint returns 503
+so the UI can detect "this deployment doesn't have the action log
+enabled" without 404-ing on a missing route.
+
+The list is ordered DESC by ``created_at`` (newest first) so the
+revert UI can render a familiar reverse-chronological feed without an
+additional client-side sort.
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+from typing import Any
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from pydantic import BaseModel
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.feature_flags import get_flags
+from app.db import (
+    AgentActionLog,
+    NewChatThread,
+    Permission,
+    User,
+    get_async_session,
+)
+from app.users import current_active_user
+from app.utils.rbac import check_permission
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+# ---------------------------------------------------------------------------
+# Response schemas
+# ---------------------------------------------------------------------------
+
+
+class AgentActionRead(BaseModel):
+    """One row of the action log surfaced to the client."""
+
+    id: int
+    thread_id: int
+    user_id: str | None
+    search_space_id: int
+    tool_name: str
+    args: dict[str, Any] | None
+    result_id: str | None
+    reversible: bool
+    reverse_descriptor: dict[str, Any] | None
+    error: dict[str, Any] | None
+    reverse_of: int | None
+    reverted_by_action_id: int | None
+    is_revert_action: bool
+    # Correlation ids added in migration 135. ``tool_call_id`` is the
+    # LangChain tool-call id (joinable to ``data-action-log`` SSE events
+    # via ``langchainToolCallId``). ``chat_turn_id`` is the per-turn id
+    # from ``configurable.turn_id`` (used by the
+    # ``revert-turn/{chat_turn_id}`` endpoint).
+    tool_call_id: str | None = None
+    chat_turn_id: str | None = None
+    created_at: datetime
+
+
+class AgentActionListResponse(BaseModel):
+    """Paginated list response for the action log."""
+
+    items: list[AgentActionRead]
+    total: int
+    page: int
+    page_size: int
+    has_more: bool
+
+
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+
+
+def _flag_guard() -> None:
+    flags = get_flags()
+    if flags.disable_new_agent_stack or not flags.enable_action_log:
+        raise HTTPException(
+            status_code=503,
+            detail=(
+                "Action log is not available on this deployment. Flip "
+                "SURFSENSE_ENABLE_ACTION_LOG to enable it."
+            ),
+        )
+
+
+@router.get(
+    "/threads/{thread_id}/actions",
+    response_model=AgentActionListResponse,
+)
+async def list_thread_actions(
+    thread_id: int,
+    page: int = Query(0, ge=0),
+    page_size: int = Query(50, ge=1, le=200),
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+) -> AgentActionListResponse:
+    """List agent actions for a thread, newest first.
+
+    Authorization:
+    * Caller must be a member of the thread's search space with
+      ``CHATS_READ`` permission.
+
+    Pagination:
+    * ``page`` is 0-indexed.
+    * ``page_size`` defaults to 50, max 200.
+    """
+
+    _flag_guard()
+
+    thread = await session.get(NewChatThread, thread_id)
+    if thread is None:
+        raise HTTPException(status_code=404, detail="Thread not found.")
+
+    await check_permission(
+        session,
+        user,
+        thread.search_space_id,
+        Permission.CHATS_READ.value,
+        "You don't have permission to view this thread's action log.",
+    )
+
+    total_stmt = select(func.count(AgentActionLog.id)).where(
+        AgentActionLog.thread_id == thread_id
+    )
+    total = (await session.execute(total_stmt)).scalar_one()
+
+    rows_stmt = (
+        select(AgentActionLog)
+        .where(AgentActionLog.thread_id == thread_id)
+        .order_by(AgentActionLog.created_at.desc(), AgentActionLog.id.desc())
+        .offset(page * page_size)
+        .limit(page_size)
+    )
+    rows = (await session.execute(rows_stmt)).scalars().all()
+
+    # Build a reverse_of -> revert_action_id map so the UI can render
+    # "Reverted" badges on actions that have already been undone.
+    if rows:
+        original_ids = [r.id for r in rows]
+        reverts_stmt = select(AgentActionLog.id, AgentActionLog.reverse_of).where(
+            AgentActionLog.reverse_of.in_(original_ids)
+        )
+        reverts = (await session.execute(reverts_stmt)).all()
+        revert_map: dict[int, int] = {orig: rev for rev, orig in reverts}
+    else:
+        revert_map = {}
+
+    items = [
+        AgentActionRead(
+            id=row.id,
+            thread_id=row.thread_id,
+            user_id=str(row.user_id) if row.user_id is not None else None,
+            search_space_id=row.search_space_id,
+            tool_name=row.tool_name,
+            args=row.args,
+            result_id=row.result_id,
+            reversible=bool(row.reversible),
+            reverse_descriptor=row.reverse_descriptor,
+            error=row.error,
+            reverse_of=row.reverse_of,
+            reverted_by_action_id=revert_map.get(row.id),
+            is_revert_action=row.reverse_of is not None,
+            tool_call_id=row.tool_call_id,
+            chat_turn_id=row.chat_turn_id,
+            created_at=row.created_at,
+        )
+        for row in rows
+    ]
+
+    return AgentActionListResponse(
+        items=items,
+        total=int(total),
+        page=page,
+        page_size=page_size,
+        has_more=(page + 1) * page_size < int(total),
+    )
diff --git a/surfsense_backend/app/routes/agent_flags_route.py b/surfsense_backend/app/routes/agent_flags_route.py
new file mode 100644
index 0000000000..99388af666
--- /dev/null
+++ b/surfsense_backend/app/routes/agent_flags_route.py
@@ -0,0 +1,77 @@
+"""``GET /api/agent/flags``: read-only feature-flag status.
+
+Surfaces :class:`AgentFeatureFlags` to the frontend so the UI can:
+
+* Render conditional surfaces (e.g. show the action-log button only when
+  ``enable_action_log`` is on).
+* Display an admin diagnostics card so operators can verify which
+  middleware tier is active without shelling into the box.
+
+The endpoint is *read-only*. Flipping flags requires an env-var change
+plus a process restart — by design, since the values are baked into the
+agent factory at build time. The route does not require any special
+permission (any authenticated user can see them) since the flag values
+do not leak data, and the UI surfaces are conditionally rendered based
+on them anyway.
+"""
+
+from __future__ import annotations
+
+from dataclasses import asdict
+
+from fastapi import APIRouter, Depends
+from pydantic import BaseModel
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags, get_flags
+from app.config import config
+from app.db import User
+from app.users import current_active_user
+
+router = APIRouter()
+
+
+class AgentFeatureFlagsRead(BaseModel):
+    """Mirror of :class:`AgentFeatureFlags`. Updated together with it."""
+
+    disable_new_agent_stack: bool
+
+    enable_context_editing: bool
+    enable_compaction_v2: bool
+    enable_retry_after: bool
+    enable_model_fallback: bool
+    enable_model_call_limit: bool
+    enable_tool_call_limit: bool
+    enable_tool_call_repair: bool
+    enable_doom_loop: bool
+
+    enable_permission: bool
+    enable_busy_mutex: bool
+    enable_llm_tool_selector: bool
+
+    enable_skills: bool
+    enable_specialized_subagents: bool
+    enable_kb_planner_runnable: bool
+
+    enable_action_log: bool
+    enable_revert_route: bool
+
+    enable_plugin_loader: bool
+
+    enable_otel: bool
+
+    enable_desktop_local_filesystem: bool
+
+    @classmethod
+    def from_flags(cls, flags: AgentFeatureFlags) -> AgentFeatureFlagsRead:
+        # asdict() avoids missing-field bugs when AgentFeatureFlags grows.
+        return cls(
+            **asdict(flags),
+            enable_desktop_local_filesystem=config.ENABLE_DESKTOP_LOCAL_FILESYSTEM,
+        )
+
+
+@router.get("/agent/flags", response_model=AgentFeatureFlagsRead)
+async def get_agent_flags(
+    _user: User = Depends(current_active_user),
+) -> AgentFeatureFlagsRead:
+    return AgentFeatureFlagsRead.from_flags(get_flags())
diff --git a/surfsense_backend/app/routes/agent_permissions_route.py b/surfsense_backend/app/routes/agent_permissions_route.py
new file mode 100644
index 0000000000..1c76e00e60
--- /dev/null
+++ b/surfsense_backend/app/routes/agent_permissions_route.py
@@ -0,0 +1,280 @@
+"""CRUD for :class:`app.db.AgentPermissionRule`.
+
+Surfaces the permission rules consumed by
+:class:`PermissionMiddleware`. Rules are scoped at one of three levels:
+
+* **Search-space wide** — both ``user_id`` and ``thread_id`` are NULL.
+* **Per-user** — ``user_id`` set, ``thread_id`` NULL.
+* **Per-thread** — ``thread_id`` set (``user_id`` typically NULL).
+
+The middleware reads these rows at agent build time (see
+``chat_deepagent.py``). UI lets a search-space owner curate them so
+the agent can ask for approval / auto-deny / auto-allow specific
+tool patterns.
+
+The route group is gated by ``SURFSENSE_ENABLE_PERMISSION``: when off
+all endpoints return 503 so the UI can render a "feature not enabled"
+empty state without breaking on a missing route.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from datetime import datetime
+from typing import Literal
+
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel, Field
+from sqlalchemy import select
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.feature_flags import get_flags
+from app.db import (
+    AgentPermissionRule,
+    NewChatThread,
+    Permission,
+    SearchSpace,
+    User,
+    get_async_session,
+)
+from app.users import current_active_user
+from app.utils.rbac import check_permission
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+# ---------------------------------------------------------------------------
+# Schemas
+# ---------------------------------------------------------------------------
+
+
+_ACTION_VALUES: tuple[str, ...] = ("allow", "deny", "ask")
+_PERMISSION_PATTERN = re.compile(r"^[a-zA-Z0-9_:.\-*]+$")
+
+
+class AgentPermissionRuleRead(BaseModel):
+    id: int
+    search_space_id: int
+    user_id: str | None
+    thread_id: int | None
+    permission: str
+    pattern: str
+    action: Literal["allow", "deny", "ask"]
+    created_at: datetime
+
+
+class AgentPermissionRuleCreate(BaseModel):
+    permission: str = Field(
+        ...,
+        min_length=1,
+        max_length=255,
+        description="Tool / capability the rule targets, e.g. 'tool:create_linear_issue'.",
+    )
+    pattern: str = Field(
+        "*",
+        min_length=1,
+        max_length=255,
+        description="Wildcard pattern (e.g. '*' or 'production-*') applied to the matched tool argument.",
+    )
+    action: Literal["allow", "deny", "ask"]
+    user_id: str | None = None
+    thread_id: int | None = None
+
+
+class AgentPermissionRuleUpdate(BaseModel):
+    pattern: str | None = Field(default=None, min_length=1, max_length=255)
+    action: Literal["allow", "deny", "ask"] | None = None
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _flag_guard() -> None:
+    flags = get_flags()
+    if flags.disable_new_agent_stack or not flags.enable_permission:
+        raise HTTPException(
+            status_code=503,
+            detail=(
+                "Agent permission rules are not enabled on this deployment. "
+                "Flip SURFSENSE_ENABLE_PERMISSION to enable them."
+            ),
+        )
+
+
+def _validate_permission_string(value: str) -> str:
+    if not _PERMISSION_PATTERN.match(value):
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                "permission must contain only alphanumerics, '.', '_', ':', '-', "
+                "or '*' wildcards."
+            ),
+        )
+    return value
+
+
+def _to_read(row: AgentPermissionRule) -> AgentPermissionRuleRead:
+    return AgentPermissionRuleRead(
+        id=row.id,
+        search_space_id=row.search_space_id,
+        user_id=str(row.user_id) if row.user_id is not None else None,
+        thread_id=row.thread_id,
+        permission=row.permission,
+        pattern=row.pattern,
+        action=row.action,  # type: ignore[arg-type]
+        created_at=row.created_at,
+    )
+
+
+async def _ensure_search_space_membership_admin(
+    session: AsyncSession, user: User, search_space_id: int
+) -> None:
+    """Curating agent rules == "settings" administration on the space."""
+    space = await session.get(SearchSpace, search_space_id)
+    if space is None:
+        raise HTTPException(status_code=404, detail="Search space not found.")
+    await check_permission(
+        session,
+        user,
+        search_space_id,
+        Permission.SETTINGS_UPDATE.value,
+        "You don't have permission to manage agent permission rules in this space.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+
+
+@router.get(
+    "/searchspaces/{search_space_id}/agent/permissions/rules",
+    response_model=list[AgentPermissionRuleRead],
+)
+async def list_rules(
+    search_space_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+) -> list[AgentPermissionRuleRead]:
+    _flag_guard()
+    await _ensure_search_space_membership_admin(session, user, search_space_id)
+
+    stmt = (
+        select(AgentPermissionRule)
+        .where(AgentPermissionRule.search_space_id == search_space_id)
+        .order_by(AgentPermissionRule.created_at.desc(), AgentPermissionRule.id.desc())
+    )
+    rows = (await session.execute(stmt)).scalars().all()
+    return [_to_read(r) for r in rows]
+
+
+@router.post(
+    "/searchspaces/{search_space_id}/agent/permissions/rules",
+    response_model=AgentPermissionRuleRead,
+    status_code=201,
+)
+async def create_rule(
+    search_space_id: int,
+    payload: AgentPermissionRuleCreate,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+) -> AgentPermissionRuleRead:
+    _flag_guard()
+    await _ensure_search_space_membership_admin(session, user, search_space_id)
+
+    permission = _validate_permission_string(payload.permission.strip())
+    pattern = payload.pattern.strip() or "*"
+
+    if payload.thread_id is not None:
+        thread = await session.get(NewChatThread, payload.thread_id)
+        if thread is None or thread.search_space_id != search_space_id:
+            raise HTTPException(
+                status_code=404,
+                detail="Thread not found in this search space.",
+            )
+
+    row = AgentPermissionRule(
+        search_space_id=search_space_id,
+        user_id=payload.user_id,
+        thread_id=payload.thread_id,
+        permission=permission,
+        pattern=pattern,
+        action=payload.action,
+    )
+    session.add(row)
+    try:
+        await session.commit()
+    except IntegrityError as err:
+        await session.rollback()
+        raise HTTPException(
+            status_code=409,
+            detail=(
+                "An identical rule already exists for this scope. Update the "
+                "existing rule instead."
+            ),
+        ) from err
+    await session.refresh(row)
+    return _to_read(row)
+
+
+@router.patch(
+    "/searchspaces/{search_space_id}/agent/permissions/rules/{rule_id}",
+    response_model=AgentPermissionRuleRead,
+)
+async def update_rule(
+    search_space_id: int,
+    rule_id: int,
+    payload: AgentPermissionRuleUpdate,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+) -> AgentPermissionRuleRead:
+    _flag_guard()
+    await _ensure_search_space_membership_admin(session, user, search_space_id)
+
+    row = await session.get(AgentPermissionRule, rule_id)
+    if row is None or row.search_space_id != search_space_id:
+        raise HTTPException(status_code=404, detail="Rule not found.")
+
+    if payload.pattern is not None:
+        row.pattern = payload.pattern.strip() or "*"
+    if payload.action is not None:
+        row.action = payload.action
+
+    try:
+        await session.commit()
+    except IntegrityError as err:
+        await session.rollback()
+        raise HTTPException(
+            status_code=409,
+            detail="Update would create a duplicate rule for this scope.",
+        ) from err
+    await session.refresh(row)
+    return _to_read(row)
+
+
+@router.delete(
+    "/searchspaces/{search_space_id}/agent/permissions/rules/{rule_id}",
+    status_code=204,
+)
+async def delete_rule(
+    search_space_id: int,
+    rule_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+) -> None:
+    _flag_guard()
+    await _ensure_search_space_membership_admin(session, user, search_space_id)
+
+    row = await session.get(AgentPermissionRule, rule_id)
+    if row is None or row.search_space_id != search_space_id:
+        raise HTTPException(status_code=404, detail="Rule not found.")
+
+    await session.delete(row)
+    await session.commit()
+    return None
diff --git a/surfsense_backend/app/routes/agent_revert_route.py b/surfsense_backend/app/routes/agent_revert_route.py
new file mode 100644
index 0000000000..711081b150
--- /dev/null
+++ b/surfsense_backend/app/routes/agent_revert_route.py
@@ -0,0 +1,508 @@
+"""POST ``/api/threads/{thread_id}/revert/{action_id}``: undo an agent action.
+
+The route ships **before** the UI lights up the per-message "Undo from
+here" affordance. To prevent accidental usage during the gap we return
+``503 Service Unavailable`` until the ``SURFSENSE_ENABLE_REVERT_ROUTE``
+flag flips. Once enabled, the route runs:
+
+1. Authentication via :func:`current_active_user`.
+2. Action lookup; 404 if the action does not belong to the thread.
+3. Authorization via :func:`app.services.revert_service.can_revert`.
+4. Revert dispatch via :func:`app.services.revert_service.revert_action`.
+5. Idempotent on retries: if the same action is reverted twice the second
+   call returns 409 ``"already reverted"``.
+
+This module also hosts the per-turn batch endpoint
+``POST /api/threads/{thread_id}/revert-turn/{chat_turn_id}``. It
+walks every reversible action emitted during a chat turn in reverse
+``created_at`` order and reverts each independently. Partial success is the
+common case — the response always contains a per-action result list and a
+``status`` of ``"ok"`` or ``"partial"``; we never collapse the batch into a
+whole-batch 4xx.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Literal
+
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+from sqlalchemy import select
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.feature_flags import get_flags
+from app.db import (
+    AgentActionLog,
+    User,
+    get_async_session,
+)
+from app.services.revert_service import (
+    RevertOutcome,
+    can_revert,
+    load_action,
+    load_thread,
+    revert_action,
+)
+from app.users import current_active_user
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+@router.post("/threads/{thread_id}/revert/{action_id}")
+async def revert_agent_action(
+    thread_id: int,
+    action_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+) -> dict:
+    flags = get_flags()
+    if flags.disable_new_agent_stack or not flags.enable_revert_route:
+        raise HTTPException(
+            status_code=503,
+            detail=(
+                "Revert is not available on this deployment yet. The route "
+                "ships before the UI; flip SURFSENSE_ENABLE_REVERT_ROUTE to "
+                "enable it."
+            ),
+        )
+
+    thread = await load_thread(session, thread_id=thread_id)
+    if thread is None:
+        raise HTTPException(status_code=404, detail="Thread not found.")
+
+    action = await load_action(session, action_id=action_id, thread_id=thread_id)
+    if action is None:
+        raise HTTPException(
+            status_code=404,
+            detail="Action not found or does not belong to this thread.",
+        )
+
+    # Idempotency: if a successful revert already exists, return 409.
+    existing_revert = await session.execute(
+        select(AgentActionLog).where(AgentActionLog.reverse_of == action.id)
+    )
+    if existing_revert.scalars().first() is not None:
+        raise HTTPException(
+            status_code=409,
+            detail="This action has already been reverted.",
+        )
+
+    if not can_revert(
+        requester_user_id=str(user.id) if user is not None else None,
+        action=action,
+        is_admin=False,  # role lookup is done by RBAC layer; default conservative
+    ):
+        raise HTTPException(
+            status_code=403,
+            detail="You are not allowed to revert this action.",
+        )
+
+    outcome: RevertOutcome
+    try:
+        outcome = await revert_action(
+            session,
+            action=action,
+            requester_user_id=str(user.id) if user is not None else None,
+        )
+    except IntegrityError:
+        # Partial unique index ``ux_agent_action_log_reverse_of`` caught
+        # a concurrent revert. Translate to the existing 409 "already
+        # reverted" contract so racing clients see consistent
+        # behaviour with the pre-flight TOCTOU check above.
+        await session.rollback()
+        raise HTTPException(
+            status_code=409,
+            detail="This action has already been reverted.",
+        ) from None
+    except Exception as err:
+        logger.exception("Revert dispatch raised for action_id=%s", action_id)
+        await session.rollback()
+        raise HTTPException(
+            status_code=500, detail="Internal error during revert."
+        ) from err
+
+    if outcome.status == "ok":
+        try:
+            await session.commit()
+        except IntegrityError:
+            # Race lost on commit (constraint enforced at flush in some
+            # configs but at commit in others — defensive).
+            await session.rollback()
+            raise HTTPException(
+                status_code=409,
+                detail="This action has already been reverted.",
+            ) from None
+        return {
+            "status": "ok",
+            "message": outcome.message,
+            "new_action_id": outcome.new_action_id,
+        }
+
+    await session.rollback()
+
+    if outcome.status == "not_found" or outcome.status == "tool_unavailable":
+        raise HTTPException(status_code=409, detail=outcome.message)
+    if outcome.status == "permission_denied":
+        raise HTTPException(status_code=403, detail=outcome.message)
+    if outcome.status == "reverse_not_implemented":
+        raise HTTPException(status_code=501, detail=outcome.message)
+    # not_reversible
+    raise HTTPException(status_code=409, detail=outcome.message)
+
+
+# ---------------------------------------------------------------------------
+# Per-turn revert batch endpoint
+# ---------------------------------------------------------------------------
+
+
+PerActionStatus = Literal[
+    "reverted",
+    "already_reverted",
+    "not_reversible",
+    "permission_denied",
+    "failed",
+    "skipped",
+]
+
+
+class RevertTurnActionResult(BaseModel):
+    """Per-action outcome inside a ``revert-turn`` batch response."""
+
+    action_id: int
+    tool_name: str
+    status: PerActionStatus
+    message: str | None = None
+    new_action_id: int | None = None
+    error: str | None = None
+
+
+class RevertTurnResponse(BaseModel):
+    """Top-level response for ``POST /threads/{id}/revert-turn/{chat_turn_id}``.
+
+    ``status`` is ``"ok"`` only when every reversible row succeeded. Any
+    ``failed`` / ``not_reversible`` / ``permission_denied`` entry downgrades
+    it to ``"partial"``. Empty turns (no rows) return ``"ok"`` with an empty
+    ``results`` list — callers should treat that as a no-op.
+
+    Counter invariant:
+        ``total == reverted + already_reverted + not_reversible
+                  + permission_denied + failed + skipped``
+
+    Frontend toasts and the ``RevertTurnButton`` summary rely on this
+    invariant to display "X of Y reverted, Z could not be undone" without
+    silently dropping ``permission_denied`` or ``skipped`` rows.
+    """
+
+    status: Literal["ok", "partial"]
+    chat_turn_id: str
+    total: int
+    reverted: int
+    already_reverted: int
+    not_reversible: int
+    permission_denied: int = 0
+    failed: int = 0
+    skipped: int = 0
+    results: list[RevertTurnActionResult]
+
+
+def _classify_outcome(outcome: RevertOutcome) -> PerActionStatus:
+    if outcome.status == "ok":
+        return "reverted"
+    if outcome.status == "permission_denied":
+        return "permission_denied"
+    # ``not_found`` / ``tool_unavailable`` / ``reverse_not_implemented`` /
+    # ``not_reversible`` are all surfaced to the caller as "not_reversible"
+    # — they share the same UX (this row cannot be undone) and only the
+    # ``message`` differs.
+    return "not_reversible"
+
+
+async def _was_already_reverted(session: AsyncSession, *, action_id: int) -> int | None:
+    """Return the id of an existing successful revert row, if any.
+
+    Single-action variant — kept for the post-IntegrityError lookup
+    path where we already know we lost a race for one specific id.
+    """
+    stmt = select(AgentActionLog.id).where(AgentActionLog.reverse_of == action_id)
+    result = await session.execute(stmt)
+    return result.scalars().first()
+
+
+async def _was_already_reverted_batch(
+    session: AsyncSession, *, action_ids: list[int]
+) -> dict[int, int]:
+    """Batch idempotency probe for the revert-turn loop.
+
+    Replaces N individual ``SELECT id WHERE reverse_of = :id`` queries
+    (one per row in the turn) with a single ``SELECT id, reverse_of
+    WHERE reverse_of IN (:ids)``. The route still iterates rows in
+    reverse-chronological order, but the membership check is O(1) per
+    iteration after this query. For a turn with 30 actions that's 30
+    fewer round-trips through asyncpg + a smaller transaction footprint.
+
+    Returns a ``{original_action_id -> revert_action_id}`` map. Missing
+    keys mean "not yet reverted" — callers should treat them as
+    eligible for revert.
+    """
+    if not action_ids:
+        return {}
+    stmt = select(AgentActionLog.id, AgentActionLog.reverse_of).where(
+        AgentActionLog.reverse_of.in_(action_ids)
+    )
+    result = await session.execute(stmt)
+    return {
+        original_id: revert_id
+        for revert_id, original_id in result.all()
+        if original_id is not None
+    }
+
+
+@router.post(
+    "/threads/{thread_id}/revert-turn/{chat_turn_id}",
+    response_model=RevertTurnResponse,
+)
+async def revert_agent_turn(
+    thread_id: int,
+    chat_turn_id: str,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+) -> RevertTurnResponse:
+    """Revert every reversible action emitted during ``chat_turn_id``.
+
+    Walks ``AgentActionLog`` rows for the turn in reverse ``created_at``
+    order so dependencies (e.g. ``mkdir`` -> ``write_file`` inside the new
+    folder) unwind in the right sequence. Each action is reverted in its
+    own SAVEPOINT so a single failure does not poison the batch.
+
+    Partial success is intentional and returned with HTTP 200. Callers
+    must inspect ``results[*].status`` to find rows that need attention.
+    """
+
+    flags = get_flags()
+    if flags.disable_new_agent_stack or not flags.enable_revert_route:
+        raise HTTPException(
+            status_code=503,
+            detail=(
+                "Revert is not available on this deployment yet. The route "
+                "ships before the UI; flip SURFSENSE_ENABLE_REVERT_ROUTE to "
+                "enable it."
+            ),
+        )
+
+    thread = await load_thread(session, thread_id=thread_id)
+    if thread is None:
+        raise HTTPException(status_code=404, detail="Thread not found.")
+
+    # Reverse-chronological so the latest mutation in the turn unwinds
+    # first. ``id.desc()`` is the deterministic tiebreaker for actions
+    # written in the same millisecond.
+    rows_stmt = (
+        select(AgentActionLog)
+        .where(
+            AgentActionLog.thread_id == thread_id,
+            AgentActionLog.chat_turn_id == chat_turn_id,
+        )
+        .order_by(AgentActionLog.created_at.desc(), AgentActionLog.id.desc())
+    )
+    rows = (await session.execute(rows_stmt)).scalars().all()
+
+    requester_user_id = str(user.id) if user is not None else None
+    results: list[RevertTurnActionResult] = []
+    # Counters MUST be exhaustive so the response invariant
+    # ``total == sum(counters)`` always holds. Frontend toasts and
+    # ``RevertTurnButton`` rely on this for "X of Y reverted" math.
+    counts: dict[str, int] = {
+        "reverted": 0,
+        "already_reverted": 0,
+        "not_reversible": 0,
+        "permission_denied": 0,
+        "failed": 0,
+        "skipped": 0,
+    }
+
+    # Single batched idempotency probe replaces the previous per-row
+    # SELECT. ``rows`` are filtered in the loop so we pre-collect only
+    # the original-action ids (skip rows that are themselves
+    # reverts).
+    eligible_ids = [r.id for r in rows if r.reverse_of is None]
+    already_reverted_map = await _was_already_reverted_batch(
+        session, action_ids=eligible_ids
+    )
+
+    for action in rows:
+        # Skip rows that ARE reverts of an earlier action — reverting a
+        # revert is meaningless inside a batch (the user wants to wipe
+        # the original effects, not chase tail).
+        if action.reverse_of is not None:
+            counts["skipped"] += 1
+            results.append(
+                RevertTurnActionResult(
+                    action_id=action.id,
+                    tool_name=action.tool_name,
+                    status="skipped",
+                    message="Row is itself a revert action; skipped.",
+                )
+            )
+            continue
+
+        # Idempotency: surface "already_reverted" instead of failing.
+        existing_revert_id = already_reverted_map.get(action.id)
+        if existing_revert_id is not None:
+            counts["already_reverted"] += 1
+            results.append(
+                RevertTurnActionResult(
+                    action_id=action.id,
+                    tool_name=action.tool_name,
+                    status="already_reverted",
+                    new_action_id=existing_revert_id,
+                )
+            )
+            continue
+
+        if not can_revert(
+            requester_user_id=requester_user_id,
+            action=action,
+            is_admin=False,
+        ):
+            counts["permission_denied"] += 1
+            results.append(
+                RevertTurnActionResult(
+                    action_id=action.id,
+                    tool_name=action.tool_name,
+                    status="permission_denied",
+                    message="You are not allowed to revert this action.",
+                )
+            )
+            continue
+
+        # Per-row SAVEPOINT so one failed revert never poisons later
+        # successful ones.
+        try:
+            async with session.begin_nested():
+                outcome = await revert_action(
+                    session,
+                    action=action,
+                    requester_user_id=requester_user_id,
+                )
+                if outcome.status != "ok":
+                    raise _OutcomeRollbackError(outcome)
+        except _OutcomeRollbackError as rollback:
+            outcome = rollback.outcome
+            classified = _classify_outcome(outcome)
+            if classified == "permission_denied":
+                counts["permission_denied"] += 1
+            else:
+                counts["not_reversible"] += 1
+            results.append(
+                RevertTurnActionResult(
+                    action_id=action.id,
+                    tool_name=action.tool_name,
+                    status=classified,
+                    message=outcome.message,
+                )
+            )
+            continue
+        except IntegrityError:
+            # Partial unique index caught a concurrent revert that won
+            # the race against our pre-flight ``_was_already_reverted``
+            # SELECT. Look up the winner so
+            # we can surface its ``new_action_id`` to the client.
+            existing_revert_id = await _was_already_reverted(
+                session, action_id=action.id
+            )
+            counts["already_reverted"] += 1
+            results.append(
+                RevertTurnActionResult(
+                    action_id=action.id,
+                    tool_name=action.tool_name,
+                    status="already_reverted",
+                    new_action_id=existing_revert_id,
+                )
+            )
+            continue
+        except Exception as err:  # pragma: no cover — defensive, logged
+            logger.exception(
+                "Unexpected revert failure inside batch for action_id=%s",
+                action.id,
+            )
+            counts["failed"] += 1
+            results.append(
+                RevertTurnActionResult(
+                    action_id=action.id,
+                    tool_name=action.tool_name,
+                    status="failed",
+                    error=str(err) or err.__class__.__name__,
+                )
+            )
+            continue
+
+        counts["reverted"] += 1
+        results.append(
+            RevertTurnActionResult(
+                action_id=action.id,
+                tool_name=action.tool_name,
+                status="reverted",
+                message=outcome.message,
+                new_action_id=outcome.new_action_id,
+            )
+        )
+
+    # Single commit at the end — successful SAVEPOINTs above already
+    # released; failed ones rolled back to their savepoint. No row leaks
+    # across the boundary.
+    try:
+        await session.commit()
+    except Exception as err:  # pragma: no cover — defensive
+        logger.exception(
+            "Final commit for revert-turn failed (thread=%s turn=%s)",
+            thread_id,
+            chat_turn_id,
+        )
+        await session.rollback()
+        raise HTTPException(
+            status_code=500,
+            detail="Internal error while finalising revert-turn batch.",
+        ) from err
+
+    has_partial = (
+        counts["failed"] > 0
+        or counts["not_reversible"] > 0
+        or counts["permission_denied"] > 0
+    )
+    overall_status: Literal["ok", "partial"] = "partial" if has_partial else "ok"
+
+    return RevertTurnResponse(
+        status=overall_status,
+        chat_turn_id=chat_turn_id,
+        total=len(rows),
+        reverted=counts["reverted"],
+        already_reverted=counts["already_reverted"],
+        not_reversible=counts["not_reversible"],
+        permission_denied=counts["permission_denied"],
+        failed=counts["failed"],
+        skipped=counts["skipped"],
+        results=results,
+    )
+
+
+class _OutcomeRollbackError(Exception):
+    """Sentinel raised inside the SAVEPOINT to roll back a non-OK outcome.
+
+    ``revert_action`` writes a new ``agent_action_log`` row only on the
+    happy path, but on the failure paths it sometimes mutates the
+    ``DocumentRevision``/``Document`` tables before deciding the action
+    is not reversible. Wrapping each call in ``begin_nested`` and raising
+    this from the failure branch ensures we always discard partial
+    writes for failed rows.
+    """
+
+    def __init__(self, outcome: RevertOutcome) -> None:
+        self.outcome = outcome
+        super().__init__(outcome.message)
+
+
+__all__ = ["router"]
diff --git a/surfsense_backend/app/routes/airtable_add_connector_route.py b/surfsense_backend/app/routes/airtable_add_connector_route.py
index 1e0b1eb5d6..f70b9166b8 100644
--- a/surfsense_backend/app/routes/airtable_add_connector_route.py
+++ b/surfsense_backend/app/routes/airtable_add_connector_route.py
@@ -311,7 +311,7 @@ async def airtable_callback(
         new_connector = SearchSourceConnector(
             name=connector_name,
             connector_type=SearchSourceConnectorType.AIRTABLE_CONNECTOR,
-            is_indexable=True,
+            is_indexable=False,
             config=credentials_dict,
             search_space_id=space_id,
             user_id=user_id,
diff --git a/surfsense_backend/app/routes/anonymous_chat_routes.py b/surfsense_backend/app/routes/anonymous_chat_routes.py
new file mode 100644
index 0000000000..f9d694e5a5
--- /dev/null
+++ b/surfsense_backend/app/routes/anonymous_chat_routes.py
@@ -0,0 +1,629 @@
+"""Public API endpoints for anonymous (no-login) chat."""
+
+from __future__ import annotations
+
+import logging
+import secrets
+import uuid
+from pathlib import PurePosixPath
+from typing import Any
+
+from fastapi import APIRouter, HTTPException, Request, Response, UploadFile, status
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
+
+from app.config import config
+from app.etl_pipeline.file_classifier import (
+    DIRECT_CONVERT_EXTENSIONS,
+    PLAINTEXT_EXTENSIONS,
+)
+from app.rate_limiter import limiter
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/v1/public/anon-chat", tags=["anonymous-chat"])
+
+ANON_COOKIE_NAME = "surfsense_anon_session"
+ANON_COOKIE_MAX_AGE = config.ANON_TOKEN_QUOTA_TTL_DAYS * 86400
+
+# Cross-site cookie settings: when the backend runs on a different domain
+# than the frontend (e.g. api.x.com vs www.y.com), browsers reject
+# SameSite=Lax cookies on fetch() calls.  Use SameSite=None + Secure
+# in production (HTTPS) so the cookie is sent cross-site.
+_IS_SECURE_CONTEXT = bool(
+    config.BACKEND_URL and config.BACKEND_URL.startswith("https://")
+)
+_COOKIE_SAMESITE: str = "none" if _IS_SECURE_CONTEXT else "lax"
+_COOKIE_SECURE: bool = _IS_SECURE_CONTEXT
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _get_or_create_session_id(request: Request, response: Response) -> str:
+    """Read the signed session cookie or create a new one."""
+    session_id = request.cookies.get(ANON_COOKIE_NAME)
+    if session_id and len(session_id) == 43:
+        return session_id
+    session_id = secrets.token_urlsafe(32)
+    response.set_cookie(
+        key=ANON_COOKIE_NAME,
+        value=session_id,
+        max_age=ANON_COOKIE_MAX_AGE,
+        httponly=True,
+        samesite=_COOKIE_SAMESITE,
+        secure=_COOKIE_SECURE,
+        path="/",
+    )
+    return session_id
+
+
+def _get_client_ip(request: Request) -> str:
+    """Extract the real client IP, preferring Cloudflare's header."""
+    cf_ip = request.headers.get("cf-connecting-ip")
+    if cf_ip:
+        return cf_ip.strip()
+    real_ip = request.headers.get("x-real-ip")
+    if real_ip:
+        return real_ip.strip()
+    forwarded = request.headers.get("x-forwarded-for")
+    if forwarded:
+        return forwarded.split(",")[0].strip()
+    return request.client.host if request.client else "unknown"
+
+
+# ---------------------------------------------------------------------------
+# Schemas
+# ---------------------------------------------------------------------------
+
+
+class AnonChatRequest(BaseModel):
+    model_slug: str = Field(..., max_length=100)
+    messages: list[dict[str, Any]] = Field(..., min_length=1)
+    disabled_tools: list[str] | None = None
+    turnstile_token: str | None = None
+
+
+class AnonQuotaResponse(BaseModel):
+    used: int
+    limit: int
+    remaining: int
+    status: str
+    warning_threshold: int
+    captcha_required: bool = False
+
+
+class AnonModelResponse(BaseModel):
+    id: int
+    name: str
+    description: str | None = None
+    provider: str
+    model_name: str
+    billing_tier: str = "free"
+    is_premium: bool = False
+    seo_slug: str | None = None
+    seo_enabled: bool = False
+    seo_title: str | None = None
+    seo_description: str | None = None
+    quota_reserve_tokens: int | None = None
+
+
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+
+
+@router.get("/models", response_model=list[AnonModelResponse])
+async def list_anonymous_models():
+    """Return all models enabled for anonymous access."""
+    if not config.NOLOGIN_MODE_ENABLED:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="No-login mode is not enabled.",
+        )
+
+    models = []
+    for cfg in config.GLOBAL_LLM_CONFIGS:
+        if cfg.get("anonymous_enabled", False):
+            models.append(
+                AnonModelResponse(
+                    id=cfg.get("id", 0),
+                    name=cfg.get("name", ""),
+                    description=cfg.get("description"),
+                    provider=cfg.get("provider", ""),
+                    model_name=cfg.get("model_name", ""),
+                    billing_tier=cfg.get("billing_tier", "free"),
+                    is_premium=cfg.get("billing_tier", "free") == "premium",
+                    seo_slug=cfg.get("seo_slug"),
+                    seo_enabled=cfg.get("seo_enabled", False),
+                    seo_title=cfg.get("seo_title"),
+                    seo_description=cfg.get("seo_description"),
+                    quota_reserve_tokens=cfg.get("quota_reserve_tokens"),
+                )
+            )
+    return models
+
+
+@router.get("/models/{slug}", response_model=AnonModelResponse)
+async def get_anonymous_model(slug: str):
+    """Return a single model by its SEO slug."""
+    if not config.NOLOGIN_MODE_ENABLED:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="No-login mode is not enabled.",
+        )
+
+    for cfg in config.GLOBAL_LLM_CONFIGS:
+        if cfg.get("anonymous_enabled", False) and cfg.get("seo_slug") == slug:
+            return AnonModelResponse(
+                id=cfg.get("id", 0),
+                name=cfg.get("name", ""),
+                description=cfg.get("description"),
+                provider=cfg.get("provider", ""),
+                model_name=cfg.get("model_name", ""),
+                billing_tier=cfg.get("billing_tier", "free"),
+                is_premium=cfg.get("billing_tier", "free") == "premium",
+                seo_slug=cfg.get("seo_slug"),
+                seo_enabled=cfg.get("seo_enabled", False),
+                seo_title=cfg.get("seo_title"),
+                seo_description=cfg.get("seo_description"),
+                quota_reserve_tokens=cfg.get("quota_reserve_tokens"),
+            )
+
+    raise HTTPException(status_code=404, detail="Model not found")
+
+
+@router.get("/quota", response_model=AnonQuotaResponse)
+@limiter.limit("30/minute")
+async def get_anonymous_quota(request: Request, response: Response):
+    """Return current token usage for the anonymous session.
+
+    Reports the *stricter* of session and IP buckets so that opening a
+    new browser on the same IP doesn't show a misleadingly fresh quota.
+    """
+    if not config.NOLOGIN_MODE_ENABLED:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="No-login mode is not enabled.",
+        )
+
+    from app.services.token_quota_service import (
+        TokenQuotaService,
+        compute_anon_identity_key,
+        compute_ip_quota_key,
+    )
+
+    client_ip = _get_client_ip(request)
+
+    session_id = _get_or_create_session_id(request, response)
+    session_key = compute_anon_identity_key(session_id)
+    session_result = await TokenQuotaService.anon_get_usage(session_key)
+
+    ip_key = compute_ip_quota_key(client_ip)
+    ip_result = await TokenQuotaService.anon_get_usage(ip_key)
+
+    # Use whichever bucket has higher usage — that's the real constraint
+    result = ip_result if ip_result.used > session_result.used else session_result
+
+    captcha_needed = False
+    if config.TURNSTILE_ENABLED:
+        req_count = await TokenQuotaService.anon_get_request_count(client_ip)
+        captcha_needed = req_count >= config.ANON_CAPTCHA_REQUEST_THRESHOLD
+
+    return AnonQuotaResponse(
+        used=result.used,
+        limit=result.limit,
+        remaining=result.remaining,
+        status=result.status.value,
+        warning_threshold=config.ANON_TOKEN_WARNING_THRESHOLD,
+        captcha_required=captcha_needed,
+    )
+
+
+@router.post("/stream")
+@limiter.limit("15/minute")
+async def stream_anonymous_chat(
+    body: AnonChatRequest,
+    request: Request,
+    response: Response,
+):
+    """Stream a chat response for an anonymous user with quota enforcement."""
+    if not config.NOLOGIN_MODE_ENABLED:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="No-login mode is not enabled.",
+        )
+
+    from app.agents.new_chat.llm_config import (
+        AgentConfig,
+        create_chat_litellm_from_agent_config,
+    )
+    from app.services.token_quota_service import (
+        TokenQuotaService,
+        compute_anon_identity_key,
+        compute_ip_quota_key,
+    )
+    from app.services.turnstile_service import verify_turnstile_token
+
+    # Find the model config by slug
+    model_cfg = None
+    for cfg in config.GLOBAL_LLM_CONFIGS:
+        if (
+            cfg.get("anonymous_enabled", False)
+            and cfg.get("seo_slug") == body.model_slug
+        ):
+            model_cfg = cfg
+            break
+
+    if model_cfg is None:
+        raise HTTPException(
+            status_code=404, detail="Model not found or not available for anonymous use"
+        )
+
+    client_ip = _get_client_ip(request)
+
+    # --- Concurrent stream limit ---
+    slot_acquired = await TokenQuotaService.anon_acquire_stream_slot(
+        client_ip, max_concurrent=config.ANON_MAX_CONCURRENT_STREAMS
+    )
+    if not slot_acquired:
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "code": "ANON_TOO_MANY_STREAMS",
+                "message": f"Max {config.ANON_MAX_CONCURRENT_STREAMS} concurrent chats allowed. Please wait for a response to finish.",
+            },
+        )
+
+    try:
+        # --- CAPTCHA enforcement (check count without incrementing; count
+        #     is bumped only after a successful response in _generate) ---
+        if config.TURNSTILE_ENABLED:
+            req_count = await TokenQuotaService.anon_get_request_count(client_ip)
+            if req_count >= config.ANON_CAPTCHA_REQUEST_THRESHOLD:
+                if not body.turnstile_token:
+                    raise HTTPException(
+                        status_code=status.HTTP_403_FORBIDDEN,
+                        detail={
+                            "code": "CAPTCHA_REQUIRED",
+                            "message": "Please complete the CAPTCHA to continue chatting.",
+                        },
+                    )
+                valid = await verify_turnstile_token(body.turnstile_token, client_ip)
+                if not valid:
+                    raise HTTPException(
+                        status_code=status.HTTP_403_FORBIDDEN,
+                        detail={
+                            "code": "CAPTCHA_INVALID",
+                            "message": "CAPTCHA verification failed. Please try again.",
+                        },
+                    )
+                await TokenQuotaService.anon_reset_request_count(client_ip)
+
+        # Build identity keys
+        session_id = _get_or_create_session_id(request, response)
+        session_key = compute_anon_identity_key(session_id)
+        ip_key = compute_ip_quota_key(client_ip)
+
+        # Reserve tokens
+        reserve_amount = min(
+            model_cfg.get("quota_reserve_tokens", config.QUOTA_MAX_RESERVE_PER_CALL),
+            config.QUOTA_MAX_RESERVE_PER_CALL,
+        )
+        request_id = uuid.uuid4().hex[:16]
+
+        quota_result = await TokenQuotaService.anon_reserve(
+            session_key=session_key,
+            ip_key=ip_key,
+            request_id=request_id,
+            reserve_tokens=reserve_amount,
+        )
+
+        if not quota_result.allowed:
+            raise HTTPException(
+                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+                detail={
+                    "code": "ANON_QUOTA_EXCEEDED",
+                    "message": "You've used all your free tokens. Create an account for 5M more!",
+                    "used": quota_result.used,
+                    "limit": quota_result.limit,
+                },
+            )
+
+        # Create agent config from YAML
+        agent_config = AgentConfig.from_yaml_config(model_cfg)
+        llm = create_chat_litellm_from_agent_config(agent_config)
+        if not llm:
+            await TokenQuotaService.anon_release(session_key, ip_key, request_id)
+            raise HTTPException(status_code=500, detail="Failed to create LLM instance")
+
+        # Server-side tool allow-list enforcement
+        anon_allowed_tools = {"web_search"}
+        client_disabled = set(body.disabled_tools) if body.disabled_tools else set()
+        enabled_for_agent = anon_allowed_tools - client_disabled
+
+    except HTTPException:
+        await TokenQuotaService.anon_release_stream_slot(client_ip)
+        raise
+
+    async def _generate():
+        from langchain_core.messages import AIMessage, HumanMessage
+
+        from app.agents.new_chat.chat_deepagent import create_surfsense_deep_agent
+        from app.agents.new_chat.checkpointer import get_checkpointer
+        from app.db import shielded_async_session
+        from app.services.connector_service import ConnectorService
+        from app.services.new_streaming_service import VercelStreamingService
+        from app.services.token_tracking_service import start_turn
+        from app.tasks.chat.stream_new_chat import StreamResult, _stream_agent_events
+
+        accumulator = start_turn()
+        streaming_service = VercelStreamingService()
+
+        try:
+            async with shielded_async_session() as session:
+                connector_service = ConnectorService(session, search_space_id=None)
+                checkpointer = await get_checkpointer()
+
+                anon_thread_id = f"anon-{session_id}-{request_id}"
+
+                agent = await create_surfsense_deep_agent(
+                    llm=llm,
+                    search_space_id=0,
+                    db_session=session,
+                    connector_service=connector_service,
+                    checkpointer=checkpointer,
+                    user_id=None,
+                    thread_id=None,
+                    agent_config=agent_config,
+                    enabled_tools=list(enabled_for_agent),
+                    disabled_tools=None,
+                    anon_session_id=session_id,
+                )
+
+                langchain_messages = []
+                user_query = ""
+                for msg in body.messages:
+                    role = msg.get("role", "")
+                    content = msg.get("content", "")
+                    if role == "user":
+                        langchain_messages.append(HumanMessage(content=content))
+                        user_query = content
+                    elif role == "assistant":
+                        langchain_messages.append(AIMessage(content=content))
+
+                input_state = {
+                    "messages": langchain_messages,
+                    "search_space_id": 0,
+                }
+
+                langgraph_config = {
+                    "configurable": {"thread_id": anon_thread_id},
+                    "recursion_limit": 40,
+                }
+
+                yield streaming_service.format_message_start()
+                yield streaming_service.format_start_step()
+
+                initial_step_id = "thinking-1"
+                query_preview = user_query[:80] + (
+                    "..." if len(user_query) > 80 else ""
+                )
+                initial_items = [f"Processing: {query_preview}"]
+
+                yield streaming_service.format_thinking_step(
+                    step_id=initial_step_id,
+                    title="Understanding your request",
+                    status="in_progress",
+                    items=initial_items,
+                )
+
+                stream_result = StreamResult()
+
+                async for sse in _stream_agent_events(
+                    agent=agent,
+                    config=langgraph_config,
+                    input_data=input_state,
+                    streaming_service=streaming_service,
+                    result=stream_result,
+                    step_prefix="thinking",
+                    initial_step_id=initial_step_id,
+                    initial_step_title="Understanding your request",
+                    initial_step_items=initial_items,
+                ):
+                    yield sse
+
+            # Finalize quota with actual tokens
+            actual_tokens = accumulator.grand_total
+            finalize_result = await TokenQuotaService.anon_finalize(
+                session_key=session_key,
+                ip_key=ip_key,
+                request_id=request_id,
+                actual_tokens=actual_tokens,
+            )
+
+            # Count this as 1 completed response for CAPTCHA threshold
+            if config.TURNSTILE_ENABLED:
+                await TokenQuotaService.anon_increment_request_count(client_ip)
+
+            yield streaming_service.format_data(
+                "anon-quota",
+                {
+                    "used": finalize_result.used,
+                    "limit": finalize_result.limit,
+                    "remaining": finalize_result.remaining,
+                    "status": finalize_result.status.value,
+                },
+            )
+
+            if accumulator.per_message_summary():
+                yield streaming_service.format_data(
+                    "token-usage",
+                    {
+                        "usage": accumulator.per_message_summary(),
+                        "prompt_tokens": accumulator.total_prompt_tokens,
+                        "completion_tokens": accumulator.total_completion_tokens,
+                        "total_tokens": accumulator.grand_total,
+                    },
+                )
+
+            yield streaming_service.format_finish_step()
+            yield streaming_service.format_finish()
+            yield streaming_service.format_done()
+
+        except Exception as e:
+            logger.exception("Anonymous chat stream error")
+            await TokenQuotaService.anon_release(session_key, ip_key, request_id)
+            yield streaming_service.format_error(f"Error during chat: {e!s}")
+            yield streaming_service.format_done()
+        finally:
+            await TokenQuotaService.anon_release_stream_slot(client_ip)
+
+    return StreamingResponse(
+        _generate(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",
+        },
+    )
+
+
+# ---------------------------------------------------------------------------
+# Anonymous Document Upload (1-doc limit, plaintext/direct-convert only)
+# ---------------------------------------------------------------------------
+
+ANON_ALLOWED_EXTENSIONS = PLAINTEXT_EXTENSIONS | DIRECT_CONVERT_EXTENSIONS
+ANON_DOC_REDIS_PREFIX = "anon:doc:"
+
+
+class AnonDocResponse(BaseModel):
+    filename: str
+    size_bytes: int
+    status: str = "uploaded"
+
+
+@router.post("/upload", response_model=AnonDocResponse)
+@limiter.limit("5/minute")
+async def upload_anonymous_document(
+    file: UploadFile,
+    request: Request,
+    response: Response,
+):
+    """Upload a single document for anonymous chat (1-doc limit per session)."""
+    if not config.NOLOGIN_MODE_ENABLED:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="No-login mode is not enabled.",
+        )
+
+    session_id = _get_or_create_session_id(request, response)
+
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No filename provided")
+
+    ext = PurePosixPath(file.filename).suffix.lower()
+    if ext not in ANON_ALLOWED_EXTENSIONS:
+        raise HTTPException(
+            status_code=415,
+            detail=(
+                "File type not supported for anonymous upload. "
+                "Create an account to upload PDFs, Word documents, images, audio, and 20+ more file types. "
+                "Allowed extensions: text, code, CSV, HTML files."
+            ),
+        )
+
+    max_size = config.ANON_MAX_UPLOAD_SIZE_MB * 1024 * 1024
+    content = await file.read()
+    if len(content) > max_size:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File too large. Max size is {config.ANON_MAX_UPLOAD_SIZE_MB} MB.",
+        )
+
+    import json as _json
+
+    import redis.asyncio as aioredis
+
+    redis_client = aioredis.from_url(config.REDIS_APP_URL, decode_responses=True)
+    redis_key = f"{ANON_DOC_REDIS_PREFIX}{session_id}"
+
+    try:
+        existing = await redis_client.exists(redis_key)
+        if existing:
+            raise HTTPException(
+                status_code=409,
+                detail="Document limit reached. Create an account to upload more.",
+            )
+
+        text_content: str
+        if ext in PLAINTEXT_EXTENSIONS:
+            text_content = content.decode("utf-8", errors="replace")
+        elif ext in DIRECT_CONVERT_EXTENSIONS:
+            if ext in {".csv", ".tsv"}:
+                text_content = content.decode("utf-8", errors="replace")
+            else:
+                try:
+                    from markdownify import markdownify
+
+                    text_content = markdownify(
+                        content.decode("utf-8", errors="replace")
+                    )
+                except ImportError:
+                    text_content = content.decode("utf-8", errors="replace")
+        else:
+            text_content = content.decode("utf-8", errors="replace")
+
+        doc_data = _json.dumps(
+            {
+                "filename": file.filename,
+                "size_bytes": len(content),
+                "content": text_content,
+            }
+        )
+
+        ttl_seconds = config.ANON_TOKEN_QUOTA_TTL_DAYS * 86400
+        await redis_client.set(redis_key, doc_data, ex=ttl_seconds)
+
+    finally:
+        await redis_client.aclose()
+
+    return AnonDocResponse(
+        filename=file.filename,
+        size_bytes=len(content),
+    )
+
+
+@router.get("/document")
+async def get_anonymous_document(request: Request, response: Response):
+    """Get metadata of the uploaded document for the anonymous session."""
+    if not config.NOLOGIN_MODE_ENABLED:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="No-login mode is not enabled.",
+        )
+
+    session_id = _get_or_create_session_id(request, response)
+
+    import json as _json
+
+    import redis.asyncio as aioredis
+
+    redis_client = aioredis.from_url(config.REDIS_APP_URL, decode_responses=True)
+    redis_key = f"{ANON_DOC_REDIS_PREFIX}{session_id}"
+
+    try:
+        data = await redis_client.get(redis_key)
+        if not data:
+            raise HTTPException(status_code=404, detail="No document uploaded")
+
+        doc = _json.loads(data)
+        return {
+            "filename": doc["filename"],
+            "size_bytes": doc["size_bytes"],
+        }
+    finally:
+        await redis_client.aclose()
diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py
deleted file mode 100644
index a11b7dbc14..0000000000
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from fastapi import APIRouter, Depends
-from fastapi.responses import StreamingResponse
-from pydantic import BaseModel, Field
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import User, get_async_session
-from app.services.new_streaming_service import VercelStreamingService
-from app.services.vision_autocomplete_service import stream_vision_autocomplete
-from app.users import current_active_user
-from app.utils.rbac import check_search_space_access
-
-router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
-
-MAX_SCREENSHOT_SIZE = 20 * 1024 * 1024  # 20 MB base64 ceiling
-
-
-class VisionAutocompleteRequest(BaseModel):
-    screenshot: str = Field(..., max_length=MAX_SCREENSHOT_SIZE)
-    search_space_id: int
-    app_name: str = ""
-    window_title: str = ""
-
-
-@router.post("/vision/stream")
-async def vision_autocomplete_stream(
-    body: VisionAutocompleteRequest,
-    user: User = Depends(current_active_user),
-    session: AsyncSession = Depends(get_async_session),
-):
-    await check_search_space_access(session, user, body.search_space_id)
-
-    return StreamingResponse(
-        stream_vision_autocomplete(
-            body.screenshot,
-            body.search_space_id,
-            session,
-            app_name=body.app_name,
-            window_title=body.window_title,
-        ),
-        media_type="text/event-stream",
-        headers={
-            **VercelStreamingService.get_response_headers(),
-            "X-Accel-Buffering": "no",
-        },
-    )
diff --git a/surfsense_backend/app/routes/clickup_add_connector_route.py b/surfsense_backend/app/routes/clickup_add_connector_route.py
index 2cd63eca22..f7b0876e5a 100644
--- a/surfsense_backend/app/routes/clickup_add_connector_route.py
+++ b/surfsense_backend/app/routes/clickup_add_connector_route.py
@@ -301,7 +301,7 @@ async def clickup_callback(
             # Update existing connector
             existing_connector.config = connector_config
             existing_connector.name = "ClickUp Connector"
-            existing_connector.is_indexable = True
+            existing_connector.is_indexable = False
             logger.info(
                 f"Updated existing ClickUp connector for user {user_id} in space {space_id}"
             )
@@ -310,7 +310,7 @@ async def clickup_callback(
             new_connector = SearchSourceConnector(
                 name="ClickUp Connector",
                 connector_type=SearchSourceConnectorType.CLICKUP_CONNECTOR,
-                is_indexable=True,
+                is_indexable=False,
                 config=connector_config,
                 search_space_id=space_id,
                 user_id=user_id,
diff --git a/surfsense_backend/app/routes/composio_routes.py b/surfsense_backend/app/routes/composio_routes.py
index 4bf360365a..7bc2addf86 100644
--- a/surfsense_backend/app/routes/composio_routes.py
+++ b/surfsense_backend/app/routes/composio_routes.py
@@ -649,13 +649,9 @@ async def list_composio_drive_folders(
     """
     List folders AND files in user's Google Drive via Composio.
 
-    Uses the same GoogleDriveClient / list_folder_contents path as the native
-    connector, with Composio-sourced credentials.  This means auth errors
-    propagate identically (Google returns 401 → exception → auth_expired flag).
+    Uses Composio's Google Drive tool execution path so managed OAuth tokens
+    do not need to be exposed through connected account state.
     """
-    from app.connectors.google_drive import GoogleDriveClient, list_folder_contents
-    from app.utils.google_credentials import build_composio_credentials
-
     if not ComposioService.is_enabled():
         raise HTTPException(
             status_code=503,
@@ -689,10 +685,37 @@ async def list_composio_drive_folders(
                 detail="Composio connected account not found. Please reconnect the connector.",
             )
 
-        credentials = build_composio_credentials(composio_connected_account_id)
-        drive_client = GoogleDriveClient(session, connector_id, credentials=credentials)
+        service = ComposioService()
+        entity_id = f"surfsense_{user.id}"
+        items = []
+        page_token = None
+        error = None
+
+        while True:
+            page_items, next_token, page_error = await service.get_drive_files(
+                connected_account_id=composio_connected_account_id,
+                entity_id=entity_id,
+                folder_id=parent_id,
+                page_token=page_token,
+                page_size=100,
+            )
+            if page_error:
+                error = page_error
+                break
+
+            items.extend(page_items)
+            if not next_token:
+                break
+            page_token = next_token
+
+        for item in items:
+            item["isFolder"] = (
+                item.get("mimeType") == "application/vnd.google-apps.folder"
+            )
 
-        items, error = await list_folder_contents(drive_client, parent_id=parent_id)
+        items.sort(
+            key=lambda item: (not item["isFolder"], item.get("name", "").lower())
+        )
 
         if error:
             error_lower = error.lower()
diff --git a/surfsense_backend/app/routes/discord_add_connector_route.py b/surfsense_backend/app/routes/discord_add_connector_route.py
index 27bfffc901..4ab48f544f 100644
--- a/surfsense_backend/app/routes/discord_add_connector_route.py
+++ b/surfsense_backend/app/routes/discord_add_connector_route.py
@@ -326,7 +326,7 @@ async def discord_callback(
         new_connector = SearchSourceConnector(
             name=connector_name,
             connector_type=SearchSourceConnectorType.DISCORD_CONNECTOR,
-            is_indexable=True,
+            is_indexable=False,
             config=connector_config,
             search_space_id=space_id,
             user_id=user_id,
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 53312c6472..96c5d23440 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -2,11 +2,12 @@
 import asyncio
 
 from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
-from pydantic import BaseModel as PydanticBaseModel
+from pydantic import BaseModel as PydanticBaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
 
+from app.agents.new_chat.path_resolver import virtual_path_to_doc
 from app.db import (
     Chunk,
     Document,
@@ -123,6 +124,8 @@ async def create_documents_file_upload(
     files: list[UploadFile],
     search_space_id: int = Form(...),
     should_summarize: bool = Form(False),
+    use_vision_llm: bool = Form(False),
+    processing_mode: str = Form("basic"),
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
     dispatcher: TaskDispatcher = Depends(get_task_dispatcher),
@@ -141,12 +144,15 @@ async def create_documents_file_upload(
     from datetime import datetime
 
     from app.db import DocumentStatus
+    from app.etl_pipeline.etl_document import ProcessingMode
     from app.tasks.document_processors.base import (
         check_document_by_unique_identifier,
         get_current_timestamp,
     )
     from app.utils.document_converters import generate_unique_identifier_hash
 
+    validated_mode = ProcessingMode.coerce(processing_mode)
+
     try:
         await check_permission(
             session,
@@ -272,6 +278,8 @@ def _write_temp() -> str:
                 search_space_id=search_space_id,
                 user_id=str(user.id),
                 should_summarize=should_summarize,
+                use_vision_llm=use_vision_llm,
+                processing_mode=validated_mode.value,
             )
 
         return {
@@ -738,6 +746,63 @@ async def search_document_titles(
         ) from e
 
 
+@router.get("/documents/by-virtual-path", response_model=DocumentTitleRead)
+async def get_document_by_virtual_path(
+    search_space_id: int,
+    virtual_path: str,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Resolve a knowledge-base document by its agent-facing virtual path.
+
+    The agent renders every document under ``/documents/...`` with a
+    ``.xml`` extension appended via ``safe_filename`` (so a PDF titled
+    ``2025-W2.pdf`` becomes ``/documents/2025-W2.pdf.xml``). When the user
+    clicks that path in an answer, this endpoint must round-trip back to
+    the underlying ``Document`` row regardless of its type — agent-created
+    NOTE docs (which carry ``virtual_path`` in metadata), uploaded PDFs,
+    and connector docs all flow through here.
+
+    Resolution is delegated to :func:`virtual_path_to_doc`, the single
+    source of truth that handles:
+
+    * ``unique_identifier_hash`` lookup (agent NOTE fast path)
+    * ``" (<doc_id>).xml"`` disambiguation suffixes
+    * ``.xml`` extension stripping for title-based fallback
+    * ``safe_filename`` round-trip for connector titles with lossy chars
+    """
+    try:
+        await check_permission(
+            session,
+            user,
+            search_space_id,
+            Permission.DOCUMENTS_READ.value,
+            "You don't have permission to read documents in this search space",
+        )
+
+        document = await virtual_path_to_doc(
+            session,
+            search_space_id=search_space_id,
+            virtual_path=virtual_path,
+        )
+        if document is None:
+            raise HTTPException(status_code=404, detail="Document not found")
+
+        return DocumentTitleRead(
+            id=document.id,
+            title=document.title,
+            document_type=document.document_type,
+            folder_id=document.folder_id,
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to resolve document by virtual path: {e!s}",
+        ) from e
+
+
 @router.get("/documents/status", response_model=DocumentStatusBatchResponse)
 async def get_documents_status(
     search_space_id: int,
@@ -1395,10 +1460,13 @@ class FolderMtimeCheckFile(PydanticBaseModel):
     mtime: float
 
 
+_MAX_MTIME_CHECK_FILES = 10_000
+
+
 class FolderMtimeCheckRequest(PydanticBaseModel):
     folder_name: str
     search_space_id: int
-    files: list[FolderMtimeCheckFile]
+    files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES)
 
 
 class FolderUnlinkRequest(PydanticBaseModel):
@@ -1487,6 +1555,8 @@ async def folder_upload(
     relative_paths: str = Form(...),
     root_folder_id: int | None = Form(None),
     enable_summary: bool = Form(False),
+    use_vision_llm: bool = Form(False),
+    processing_mode: str = Form("basic"),
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
@@ -1498,6 +1568,10 @@ async def folder_upload(
     import json
     import tempfile
 
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    validated_mode = ProcessingMode.coerce(processing_mode)
+
     await check_permission(
         session,
         user,
@@ -1531,10 +1605,28 @@ async def folder_upload(
                 f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
             )
 
+    from app.services.folder_service import MAX_FOLDER_DEPTH
+
+    max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0)
+    if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels "
+            f"exceeds the maximum of {MAX_FOLDER_DEPTH}.",
+        )
+
+    if root_folder_id:
+        root_folder = await session.get(Folder, root_folder_id)
+        if not root_folder or root_folder.search_space_id != search_space_id:
+            raise HTTPException(
+                status_code=404, detail="Root folder not found in this search space"
+            )
+
     if not root_folder_id:
         watched_metadata = {
             "watched": True,
             "folder_path": folder_name,
+            "processing_mode": validated_mode.value,
         }
         existing_root = (
             await session.execute(
@@ -1565,7 +1657,8 @@ async def folder_upload(
 
     async def _read_and_save(file: UploadFile, idx: int) -> dict:
         content = await file.read()
-        filename = file.filename or rel_paths[idx].split("/")[-1]
+        raw_name = file.filename or rel_paths[idx]
+        filename = raw_name.split("/")[-1]
 
         def _write_temp() -> str:
             with tempfile.NamedTemporaryFile(
@@ -1595,7 +1688,9 @@ def _write_temp() -> str:
         folder_name=folder_name,
         root_folder_id=root_folder_id,
         enable_summary=enable_summary,
+        use_vision_llm=use_vision_llm,
         file_mappings=list(file_mappings),
+        processing_mode=validated_mode.value,
     )
 
     return {
diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py
index 829b2cf696..0f986e4167 100644
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -139,9 +139,19 @@ def _build_response(md: str) -> dict:
                 status_code=409,
                 detail="This document is still being processed. Please wait a moment and try again.",
             )
+        if state == "failed":
+            reason = (
+                doc_status.get("reason", "Unknown error")
+                if isinstance(doc_status, dict)
+                else "Unknown error"
+            )
+            raise HTTPException(
+                status_code=422,
+                detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
+            )
         raise HTTPException(
             status_code=400,
-            detail="This document has no viewable content yet. It may still be syncing. Try again in a few seconds, or re-upload if the issue persists.",
+            detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
         )
 
     markdown_content = "\n\n".join(chunk_contents)
diff --git a/surfsense_backend/app/routes/export_routes.py b/surfsense_backend/app/routes/export_routes.py
new file mode 100644
index 0000000000..4f2b545a35
--- /dev/null
+++ b/surfsense_backend/app/routes/export_routes.py
@@ -0,0 +1,63 @@
+"""Routes for exporting knowledge base content as ZIP."""
+
+import logging
+import os
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import StreamingResponse
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Permission, User, get_async_session
+from app.services.export_service import build_export_zip
+from app.users import current_active_user
+from app.utils.rbac import check_permission
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+@router.get("/search-spaces/{search_space_id}/export")
+async def export_knowledge_base(
+    search_space_id: int,
+    folder_id: int | None = Query(
+        None, description="Export only this folder's subtree"
+    ),
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Export documents as a ZIP of markdown files preserving folder structure."""
+    await check_permission(
+        session,
+        user,
+        search_space_id,
+        Permission.DOCUMENTS_READ.value,
+        "You don't have permission to export documents in this search space",
+    )
+
+    try:
+        result = await build_export_zip(session, search_space_id, folder_id)
+    except ValueError as e:
+        raise HTTPException(status_code=404, detail=str(e)) from None
+
+    def stream_and_cleanup():
+        try:
+            with open(result.zip_path, "rb") as f:
+                while chunk := f.read(8192):
+                    yield chunk
+        finally:
+            os.unlink(result.zip_path)
+
+    headers = {
+        "Content-Disposition": f'attachment; filename="{result.export_name}.zip"',
+        "Content-Length": str(result.zip_size),
+    }
+
+    if result.skipped_docs:
+        headers["X-Skipped-Documents"] = str(len(result.skipped_docs))
+
+    return StreamingResponse(
+        stream_and_cleanup(),
+        media_type="application/zip",
+        headers=headers,
+    )
diff --git a/surfsense_backend/app/routes/google_calendar_add_connector_route.py b/surfsense_backend/app/routes/google_calendar_add_connector_route.py
index d7ccf62ca2..a143fd50d1 100644
--- a/surfsense_backend/app/routes/google_calendar_add_connector_route.py
+++ b/surfsense_backend/app/routes/google_calendar_add_connector_route.py
@@ -340,7 +340,7 @@ async def calendar_callback(
                 config=creds_dict,
                 search_space_id=space_id,
                 user_id=user_id,
-                is_indexable=True,
+                is_indexable=False,
             )
             session.add(db_connector)
             await session.commit()
diff --git a/surfsense_backend/app/routes/google_gmail_add_connector_route.py b/surfsense_backend/app/routes/google_gmail_add_connector_route.py
index dd8feb1c7a..9b807a556b 100644
--- a/surfsense_backend/app/routes/google_gmail_add_connector_route.py
+++ b/surfsense_backend/app/routes/google_gmail_add_connector_route.py
@@ -371,7 +371,7 @@ async def gmail_callback(
                 config=creds_dict,
                 search_space_id=space_id,
                 user_id=user_id,
-                is_indexable=True,
+                is_indexable=False,
             )
             session.add(db_connector)
             await session.commit()
diff --git a/surfsense_backend/app/routes/image_generation_routes.py b/surfsense_backend/app/routes/image_generation_routes.py
index 97a3559b9b..018234ad58 100644
--- a/surfsense_backend/app/routes/image_generation_routes.py
+++ b/surfsense_backend/app/routes/image_generation_routes.py
@@ -36,11 +36,17 @@
     ImageGenerationListRead,
     ImageGenerationRead,
 )
+from app.services.billable_calls import (
+    DEFAULT_IMAGE_RESERVE_MICROS,
+    QuotaInsufficientError,
+    billable_call,
+)
 from app.services.image_gen_router_service import (
     IMAGE_GEN_AUTO_MODE_ID,
     ImageGenRouterService,
     is_image_gen_auto_mode,
 )
+from app.services.provider_api_base import resolve_api_base
 from app.users import current_active_user
 from app.utils.rbac import check_permission
 from app.utils.signed_image_urls import verify_image_token
@@ -82,14 +88,62 @@ def _get_global_image_gen_config(config_id: int) -> dict | None:
     return None
 
 
+def _resolve_provider_prefix(provider: str, custom_provider: str | None) -> str:
+    """Resolve the LiteLLM provider prefix used in model strings."""
+    if custom_provider:
+        return custom_provider
+    return _PROVIDER_MAP.get(provider.upper(), provider.lower())
+
+
 def _build_model_string(
     provider: str, model_name: str, custom_provider: str | None
 ) -> str:
     """Build a litellm model string from provider + model_name."""
-    if custom_provider:
-        return f"{custom_provider}/{model_name}"
-    prefix = _PROVIDER_MAP.get(provider.upper(), provider.lower())
-    return f"{prefix}/{model_name}"
+    return f"{_resolve_provider_prefix(provider, custom_provider)}/{model_name}"
+
+
+async def _resolve_billing_for_image_gen(
+    session: AsyncSession,
+    config_id: int | None,
+    search_space: SearchSpace,
+) -> tuple[str, str, int]:
+    """Resolve ``(billing_tier, base_model, reserve_micros)`` for a request.
+
+    The resolution mirrors ``_execute_image_generation``'s lookup tree but
+    only extracts the fields needed for billing — we do this *before*
+    ``billable_call`` so the reservation is correctly sized for the
+    config that will actually run, and so we don't open an
+    ``ImageGeneration`` row for a request that's about to 402.
+
+    User-owned (positive ID) BYOK configs are always free — they cost
+    the user nothing on our side. Auto mode currently treats as free
+    because the underlying router can dispatch to either premium or
+    free YAML configs and we don't surface the resolved deployment up
+    here yet. Bringing Auto under premium billing would require
+    threading the chosen deployment back from ``ImageGenRouterService``.
+    """
+    resolved_id = config_id
+    if resolved_id is None:
+        resolved_id = search_space.image_generation_config_id or IMAGE_GEN_AUTO_MODE_ID
+
+    if is_image_gen_auto_mode(resolved_id):
+        return ("free", "auto", DEFAULT_IMAGE_RESERVE_MICROS)
+
+    if resolved_id < 0:
+        cfg = _get_global_image_gen_config(resolved_id) or {}
+        billing_tier = str(cfg.get("billing_tier", "free")).lower()
+        base_model = _build_model_string(
+            cfg.get("provider", ""),
+            cfg.get("model_name", ""),
+            cfg.get("custom_provider"),
+        )
+        reserve_micros = int(
+            cfg.get("quota_reserve_micros") or DEFAULT_IMAGE_RESERVE_MICROS
+        )
+        return (billing_tier, base_model, reserve_micros)
+
+    # Positive ID = user-owned BYOK image-gen config — always free.
+    return ("free", "user_byok", DEFAULT_IMAGE_RESERVE_MICROS)
 
 
 async def _execute_image_generation(
@@ -138,12 +192,18 @@ async def _execute_image_generation(
         if not cfg:
             raise ValueError(f"Global image generation config {config_id} not found")
 
-        model_string = _build_model_string(
-            cfg.get("provider", ""), cfg["model_name"], cfg.get("custom_provider")
+        provider_prefix = _resolve_provider_prefix(
+            cfg.get("provider", ""), cfg.get("custom_provider")
         )
+        model_string = f"{provider_prefix}/{cfg['model_name']}"
         gen_kwargs["api_key"] = cfg.get("api_key")
-        if cfg.get("api_base"):
-            gen_kwargs["api_base"] = cfg["api_base"]
+        api_base = resolve_api_base(
+            provider=cfg.get("provider"),
+            provider_prefix=provider_prefix,
+            config_api_base=cfg.get("api_base"),
+        )
+        if api_base:
+            gen_kwargs["api_base"] = api_base
         if cfg.get("api_version"):
             gen_kwargs["api_version"] = cfg["api_version"]
         if cfg.get("litellm_params"):
@@ -165,12 +225,18 @@ async def _execute_image_generation(
         if not db_cfg:
             raise ValueError(f"Image generation config {config_id} not found")
 
-        model_string = _build_model_string(
-            db_cfg.provider.value, db_cfg.model_name, db_cfg.custom_provider
+        provider_prefix = _resolve_provider_prefix(
+            db_cfg.provider.value, db_cfg.custom_provider
         )
+        model_string = f"{provider_prefix}/{db_cfg.model_name}"
         gen_kwargs["api_key"] = db_cfg.api_key
-        if db_cfg.api_base:
-            gen_kwargs["api_base"] = db_cfg.api_base
+        api_base = resolve_api_base(
+            provider=db_cfg.provider.value,
+            provider_prefix=provider_prefix,
+            config_api_base=db_cfg.api_base,
+        )
+        if api_base:
+            gen_kwargs["api_base"] = api_base
         if db_cfg.api_version:
             gen_kwargs["api_version"] = db_cfg.api_version
         if db_cfg.litellm_params:
@@ -225,10 +291,15 @@ async def get_global_image_gen_configs(
                     "litellm_params": {},
                     "is_global": True,
                     "is_auto_mode": True,
+                    # Auto mode currently treated as free until per-deployment
+                    # billing-tier surfacing lands (see _resolve_billing_for_image_gen).
+                    "billing_tier": "free",
+                    "is_premium": False,
                 }
             )
 
         for cfg in global_configs:
+            billing_tier = str(cfg.get("billing_tier", "free")).lower()
             safe_configs.append(
                 {
                     "id": cfg.get("id"),
@@ -241,6 +312,12 @@ async def get_global_image_gen_configs(
                     "api_version": cfg.get("api_version") or None,
                     "litellm_params": cfg.get("litellm_params", {}),
                     "is_global": True,
+                    "billing_tier": billing_tier,
+                    # Mirror chat (``new_llm_config_routes``) so the new-chat
+                    # selector's premium badge logic keys off the same
+                    # field across chat / image / vision tabs.
+                    "is_premium": billing_tier == "premium",
+                    "quota_reserve_micros": cfg.get("quota_reserve_micros"),
                 }
             )
 
@@ -454,7 +531,26 @@ async def create_image_generation(
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
-    """Create and execute an image generation request."""
+    """Create and execute an image generation request.
+
+    Premium configs are gated by the user's shared premium credit pool.
+    The flow is:
+
+    1. Permission check + load the search space (cheap, no provider call).
+    2. Resolve which config will run so we know its billing tier and the
+       worst-case reservation size *before* opening any DB rows.
+    3. Wrap the entire ImageGeneration row insert + provider call in
+       ``billable_call``. If quota is denied, ``billable_call`` raises
+       ``QuotaInsufficientError`` *before* we flush a row, which we
+       translate to HTTP 402 (no orphaned rows on the user's account,
+       no inserted error rows for "you ran out of credit").
+    4. On success, the actual ``response_cost`` flows through the
+       LiteLLM callback into the accumulator, and ``billable_call``
+       finalizes the debit at exit. Inner ``try/except`` still catches
+       provider errors and stores them on ``error_message`` (HTTP 200
+       with ``error_message`` set is preserved for failed-but-not-quota
+       scenarios — clients already know how to surface those).
+    """
     try:
         await check_permission(
             session,
@@ -471,33 +567,70 @@ async def create_image_generation(
         if not search_space:
             raise HTTPException(status_code=404, detail="Search space not found")
 
-        db_image_gen = ImageGeneration(
-            prompt=data.prompt,
-            model=data.model,
-            n=data.n,
-            quality=data.quality,
-            size=data.size,
-            style=data.style,
-            response_format=data.response_format,
-            image_generation_config_id=data.image_generation_config_id,
-            search_space_id=data.search_space_id,
-            created_by_id=user.id,
+        billing_tier, base_model, reserve_micros = await _resolve_billing_for_image_gen(
+            session, data.image_generation_config_id, search_space
         )
-        session.add(db_image_gen)
-        await session.flush()
 
-        try:
-            await _execute_image_generation(session, db_image_gen, search_space)
-        except Exception as e:
-            logger.exception("Image generation call failed")
-            db_image_gen.error_message = str(e)
+        # billable_call runs OUTSIDE the inner try/except so QuotaInsufficientError
+        # propagates to the outer ``except QuotaInsufficientError`` handler
+        # below as HTTP 402 — it is intentionally NOT swallowed into
+        # ``error_message`` because that would (1) imply a successful row
+        # exists when none does, and (2) return HTTP 200 to a client
+        # whose request was actively *denied* (issue K).
+        async with billable_call(
+            user_id=search_space.user_id,
+            search_space_id=data.search_space_id,
+            billing_tier=billing_tier,
+            base_model=base_model,
+            quota_reserve_micros_override=reserve_micros,
+            usage_type="image_generation",
+            call_details={"model": base_model, "prompt": data.prompt[:100]},
+        ):
+            db_image_gen = ImageGeneration(
+                prompt=data.prompt,
+                model=data.model,
+                n=data.n,
+                quality=data.quality,
+                size=data.size,
+                style=data.style,
+                response_format=data.response_format,
+                image_generation_config_id=data.image_generation_config_id,
+                search_space_id=data.search_space_id,
+                created_by_id=user.id,
+            )
+            session.add(db_image_gen)
+            await session.flush()
 
-        await session.commit()
-        await session.refresh(db_image_gen)
-        return db_image_gen
+            try:
+                await _execute_image_generation(session, db_image_gen, search_space)
+            except Exception as e:
+                logger.exception("Image generation call failed")
+                db_image_gen.error_message = str(e)
+
+            await session.commit()
+            await session.refresh(db_image_gen)
+            return db_image_gen
 
     except HTTPException:
         raise
+    except QuotaInsufficientError as exc:
+        # The user's premium credit pool is empty. No DB row is created
+        # because ``billable_call`` denies before yielding (issue K).
+        await session.rollback()
+        raise HTTPException(
+            status_code=402,
+            detail={
+                "error_code": "premium_quota_exhausted",
+                "usage_type": exc.usage_type,
+                "used_micros": exc.used_micros,
+                "limit_micros": exc.limit_micros,
+                "remaining_micros": exc.remaining_micros,
+                "message": (
+                    "Out of premium credits for image generation. "
+                    "Purchase additional credits or switch to a free model."
+                ),
+            },
+        ) from exc
     except SQLAlchemyError:
         await session.rollback()
         raise HTTPException(
diff --git a/surfsense_backend/app/routes/jira_add_connector_route.py b/surfsense_backend/app/routes/jira_add_connector_route.py
index 6cd6283d7a..eeb4f91d90 100644
--- a/surfsense_backend/app/routes/jira_add_connector_route.py
+++ b/surfsense_backend/app/routes/jira_add_connector_route.py
@@ -386,7 +386,7 @@ async def jira_callback(
         new_connector = SearchSourceConnector(
             name=connector_name,
             connector_type=SearchSourceConnectorType.JIRA_CONNECTOR,
-            is_indexable=True,
+            is_indexable=False,
             config=connector_config,
             search_space_id=space_id,
             user_id=user_id,
diff --git a/surfsense_backend/app/routes/linear_add_connector_route.py b/surfsense_backend/app/routes/linear_add_connector_route.py
index 9345ae4951..f59c17d251 100644
--- a/surfsense_backend/app/routes/linear_add_connector_route.py
+++ b/surfsense_backend/app/routes/linear_add_connector_route.py
@@ -399,7 +399,7 @@ async def linear_callback(
         new_connector = SearchSourceConnector(
             name=connector_name,
             connector_type=SearchSourceConnectorType.LINEAR_CONNECTOR,
-            is_indexable=True,
+            is_indexable=False,
             config=connector_config,
             search_space_id=space_id,
             user_id=user_id,
diff --git a/surfsense_backend/app/routes/luma_add_connector_route.py b/surfsense_backend/app/routes/luma_add_connector_route.py
index 04d840a08f..7040581bc8 100644
--- a/surfsense_backend/app/routes/luma_add_connector_route.py
+++ b/surfsense_backend/app/routes/luma_add_connector_route.py
@@ -61,7 +61,7 @@ async def add_luma_connector(
         if existing_connector:
             # Update existing connector with new API key
             existing_connector.config = {"api_key": request.api_key}
-            existing_connector.is_indexable = True
+            existing_connector.is_indexable = False
             await session.commit()
             await session.refresh(existing_connector)
 
@@ -82,7 +82,7 @@ async def add_luma_connector(
             config={"api_key": request.api_key},
             search_space_id=request.space_id,
             user_id=user.id,
-            is_indexable=True,
+            is_indexable=False,
         )
 
         session.add(db_connector)
diff --git a/surfsense_backend/app/routes/mcp_oauth_route.py b/surfsense_backend/app/routes/mcp_oauth_route.py
new file mode 100644
index 0000000000..1abc1f1ec3
--- /dev/null
+++ b/surfsense_backend/app/routes/mcp_oauth_route.py
@@ -0,0 +1,667 @@
+"""Generic MCP OAuth 2.1 route for services with official MCP servers.
+
+Handles the full flow: discovery → DCR → PKCE authorization → token exchange
+→ MCP_CONNECTOR creation.  Currently supports Linear, Jira, ClickUp, Slack,
+and Airtable.
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime, timedelta
+from typing import Any
+from urllib.parse import urlencode
+from uuid import UUID
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import RedirectResponse
+from sqlalchemy import select
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm.attributes import flag_modified
+
+from app.config import config
+from app.db import (
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    User,
+    get_async_session,
+)
+from app.users import current_active_user
+from app.utils.connector_naming import generate_unique_connector_name
+from app.utils.oauth_security import (
+    OAuthStateManager,
+    TokenEncryption,
+    generate_pkce_pair,
+)
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+async def _fetch_account_metadata(
+    service_key: str,
+    access_token: str,
+    token_json: dict[str, Any],
+) -> dict[str, Any]:
+    """Fetch display-friendly account metadata after a successful token exchange.
+
+    DCR services (Linear, Jira, ClickUp) issue MCP-scoped tokens that cannot
+    call their standard REST/GraphQL APIs — metadata discovery for those
+    happens at runtime through MCP tools instead.
+
+    Pre-configured services (Slack, Airtable) use standard OAuth tokens that
+    *can* call their APIs, so we extract metadata here.
+
+    Failures are logged but never block connector creation.
+    """
+    from app.services.mcp_oauth.registry import MCP_SERVICES
+
+    svc = MCP_SERVICES.get(service_key)
+    if not svc or svc.supports_dcr:
+        return {}
+
+    import httpx
+
+    meta: dict[str, Any] = {}
+
+    try:
+        if service_key == "slack":
+            team_info = token_json.get("team", {})
+            meta["team_id"] = team_info.get("id", "")
+            # TODO: oauth.v2.user.access only returns team.id, not
+            # team.name.  To populate team_name, add "team:read" scope
+            # and call GET /api/team.info here.
+            meta["team_name"] = team_info.get("name", "")
+            if meta["team_name"]:
+                meta["display_name"] = meta["team_name"]
+            elif meta["team_id"]:
+                meta["display_name"] = f"Slack ({meta['team_id']})"
+
+        elif service_key == "airtable":
+            async with httpx.AsyncClient(timeout=15.0) as client:
+                resp = await client.get(
+                    "https://api.airtable.com/v0/meta/whoami",
+                    headers={"Authorization": f"Bearer {access_token}"},
+                )
+                if resp.status_code == 200:
+                    whoami = resp.json()
+                    meta["user_id"] = whoami.get("id", "")
+                    meta["user_email"] = whoami.get("email", "")
+                    meta["display_name"] = whoami.get("email", "Airtable")
+                else:
+                    logger.warning(
+                        "Airtable whoami API returned %d (non-blocking)",
+                        resp.status_code,
+                    )
+
+    except Exception:
+        logger.warning(
+            "Failed to fetch account metadata for %s (non-blocking)",
+            service_key,
+            exc_info=True,
+        )
+
+    return meta
+
+
+_state_manager: OAuthStateManager | None = None
+_token_encryption: TokenEncryption | None = None
+
+
+def _get_state_manager() -> OAuthStateManager:
+    global _state_manager
+    if _state_manager is None:
+        if not config.SECRET_KEY:
+            raise HTTPException(status_code=500, detail="SECRET_KEY not configured.")
+        _state_manager = OAuthStateManager(config.SECRET_KEY)
+    return _state_manager
+
+
+def _get_token_encryption() -> TokenEncryption:
+    global _token_encryption
+    if _token_encryption is None:
+        if not config.SECRET_KEY:
+            raise HTTPException(status_code=500, detail="SECRET_KEY not configured.")
+        _token_encryption = TokenEncryption(config.SECRET_KEY)
+    return _token_encryption
+
+
+def _build_redirect_uri(service: str) -> str:
+    base = config.BACKEND_URL or "http://localhost:8000"
+    return f"{base.rstrip('/')}/api/v1/auth/mcp/{service}/connector/callback"
+
+
+def _frontend_redirect(
+    space_id: int | None,
+    *,
+    success: bool = False,
+    connector_id: int | None = None,
+    error: str | None = None,
+    service: str = "mcp",
+) -> RedirectResponse:
+    if success and space_id:
+        qs = f"success=true&connector={service}-mcp-connector"
+        if connector_id:
+            qs += f"&connectorId={connector_id}"
+        return RedirectResponse(
+            url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/connectors/callback?{qs}"
+        )
+    if error and space_id:
+        return RedirectResponse(
+            url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/connectors/callback?error={error}"
+        )
+    return RedirectResponse(url=f"{config.NEXT_FRONTEND_URL}/dashboard")
+
+
+# ---------------------------------------------------------------------------
+# /add — start MCP OAuth flow
+# ---------------------------------------------------------------------------
+
+
+@router.get("/auth/mcp/{service}/connector/add")
+async def connect_mcp_service(
+    service: str,
+    space_id: int,
+    user: User = Depends(current_active_user),
+):
+    from app.services.mcp_oauth.registry import get_service
+
+    svc = get_service(service)
+    if not svc:
+        raise HTTPException(status_code=404, detail=f"Unknown MCP service: {service}")
+
+    try:
+        from app.services.mcp_oauth.discovery import (
+            discover_oauth_metadata,
+            register_client,
+        )
+
+        metadata = await discover_oauth_metadata(
+            svc.mcp_url,
+            origin_override=svc.oauth_discovery_origin,
+        )
+        auth_endpoint = svc.auth_endpoint_override or metadata.get(
+            "authorization_endpoint"
+        )
+        token_endpoint = svc.token_endpoint_override or metadata.get("token_endpoint")
+        registration_endpoint = metadata.get("registration_endpoint")
+
+        if not auth_endpoint or not token_endpoint:
+            raise HTTPException(
+                status_code=502,
+                detail=f"{svc.name} MCP server returned incomplete OAuth metadata.",
+            )
+
+        redirect_uri = _build_redirect_uri(service)
+
+        if svc.supports_dcr and registration_endpoint:
+            dcr = await register_client(registration_endpoint, redirect_uri)
+            client_id = dcr.get("client_id")
+            client_secret = dcr.get("client_secret", "")
+            if not client_id:
+                raise HTTPException(
+                    status_code=502,
+                    detail=f"DCR for {svc.name} did not return a client_id.",
+                )
+        elif svc.client_id_env:
+            client_id = getattr(config, svc.client_id_env, None)
+            client_secret = getattr(config, svc.client_secret_env or "", None) or ""
+            if not client_id:
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"{svc.name} MCP OAuth not configured ({svc.client_id_env}).",
+                )
+        else:
+            raise HTTPException(
+                status_code=502,
+                detail=f"{svc.name} MCP server has no DCR and no fallback credentials.",
+            )
+
+        verifier, challenge = generate_pkce_pair()
+        enc = _get_token_encryption()
+
+        state = _get_state_manager().generate_secure_state(
+            space_id,
+            user.id,
+            service=service,
+            code_verifier=verifier,
+            mcp_client_id=client_id,
+            mcp_client_secret=enc.encrypt_token(client_secret) if client_secret else "",
+            mcp_token_endpoint=token_endpoint,
+            mcp_url=svc.mcp_url,
+        )
+
+        auth_params: dict[str, str] = {
+            "client_id": client_id,
+            "response_type": "code",
+            "redirect_uri": redirect_uri,
+            "code_challenge": challenge,
+            "code_challenge_method": "S256",
+            "state": state,
+        }
+        if svc.scopes:
+            auth_params[svc.scope_param] = " ".join(svc.scopes)
+
+        auth_url = f"{auth_endpoint}?{urlencode(auth_params)}"
+
+        logger.info(
+            "Generated %s MCP OAuth URL for user %s, space %s",
+            svc.name,
+            user.id,
+            space_id,
+        )
+        return {"auth_url": auth_url}
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("Failed to initiate %s MCP OAuth: %s", service, e, exc_info=True)
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to initiate {service} MCP OAuth.",
+        ) from e
+
+
+# ---------------------------------------------------------------------------
+# /callback — handle OAuth redirect
+# ---------------------------------------------------------------------------
+
+
+@router.get("/auth/mcp/{service}/connector/callback")
+async def mcp_oauth_callback(
+    service: str,
+    code: str | None = None,
+    error: str | None = None,
+    state: str | None = None,
+    session: AsyncSession = Depends(get_async_session),
+):
+    if error:
+        logger.warning("%s MCP OAuth error: %s", service, error)
+        space_id = None
+        if state:
+            try:
+                data = _get_state_manager().validate_state(state)
+                space_id = data.get("space_id")
+            except Exception:
+                pass
+        return _frontend_redirect(
+            space_id,
+            error=f"{service}_mcp_oauth_denied",
+            service=service,
+        )
+
+    if not code:
+        raise HTTPException(status_code=400, detail="Missing authorization code")
+    if not state:
+        raise HTTPException(status_code=400, detail="Missing state parameter")
+
+    data = _get_state_manager().validate_state(state)
+    user_id = UUID(data["user_id"])
+    space_id = data["space_id"]
+    svc_key = data.get("service", service)
+
+    if svc_key != service:
+        raise HTTPException(status_code=400, detail="State/path service mismatch")
+
+    from app.services.mcp_oauth.registry import get_service
+
+    svc = get_service(svc_key)
+    if not svc:
+        raise HTTPException(status_code=404, detail=f"Unknown MCP service: {svc_key}")
+
+    try:
+        from app.services.mcp_oauth.discovery import exchange_code_for_tokens
+
+        enc = _get_token_encryption()
+        client_id = data["mcp_client_id"]
+        client_secret = (
+            enc.decrypt_token(data["mcp_client_secret"])
+            if data.get("mcp_client_secret")
+            else ""
+        )
+        token_endpoint = data["mcp_token_endpoint"]
+        code_verifier = data["code_verifier"]
+        mcp_url = data["mcp_url"]
+        redirect_uri = _build_redirect_uri(service)
+
+        token_json = await exchange_code_for_tokens(
+            token_endpoint=token_endpoint,
+            code=code,
+            redirect_uri=redirect_uri,
+            client_id=client_id,
+            client_secret=client_secret,
+            code_verifier=code_verifier,
+        )
+
+        access_token = token_json.get("access_token")
+        refresh_token = token_json.get("refresh_token")
+        expires_in = token_json.get("expires_in")
+        scope = token_json.get("scope")
+
+        if not access_token and "authed_user" in token_json:
+            authed = token_json["authed_user"]
+            access_token = authed.get("access_token")
+            refresh_token = refresh_token or authed.get("refresh_token")
+            scope = scope or authed.get("scope")
+            expires_in = expires_in or authed.get("expires_in")
+
+        if not access_token:
+            raise HTTPException(
+                status_code=400,
+                detail=f"No access token received from {svc.name}.",
+            )
+
+        expires_at = None
+        if expires_in:
+            expires_at = datetime.now(UTC) + timedelta(seconds=int(expires_in))
+
+        connector_config = {
+            "server_config": {
+                "transport": "streamable-http",
+                "url": mcp_url,
+            },
+            "mcp_service": svc_key,
+            "mcp_oauth": {
+                "client_id": client_id,
+                "client_secret": enc.encrypt_token(client_secret)
+                if client_secret
+                else "",
+                "token_endpoint": token_endpoint,
+                "access_token": enc.encrypt_token(access_token),
+                "refresh_token": enc.encrypt_token(refresh_token)
+                if refresh_token
+                else None,
+                "expires_at": expires_at.isoformat() if expires_at else None,
+                "scope": scope,
+            },
+            "_token_encrypted": True,
+        }
+
+        account_meta = await _fetch_account_metadata(svc_key, access_token, token_json)
+        if account_meta:
+            safe_meta_keys = {
+                "display_name",
+                "team_id",
+                "team_name",
+                "user_id",
+                "user_email",
+                "workspace_id",
+                "workspace_name",
+                "organization_name",
+                "organization_url_key",
+                "cloud_id",
+                "site_name",
+                "base_url",
+            }
+            for k, v in account_meta.items():
+                if k in safe_meta_keys:
+                    connector_config[k] = v
+            logger.info(
+                "Stored account metadata for %s: display_name=%s",
+                svc_key,
+                account_meta.get("display_name", ""),
+            )
+
+        # ---- Re-auth path ----
+        db_connector_type = SearchSourceConnectorType(svc.connector_type)
+        reauth_connector_id = data.get("connector_id")
+        if reauth_connector_id:
+            result = await session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == reauth_connector_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.search_space_id == space_id,
+                    SearchSourceConnector.connector_type == db_connector_type,
+                )
+            )
+            db_connector = result.scalars().first()
+            if not db_connector:
+                raise HTTPException(
+                    status_code=404,
+                    detail="Connector not found during re-auth",
+                )
+
+            db_connector.config = connector_config
+            flag_modified(db_connector, "config")
+            await session.commit()
+            await session.refresh(db_connector)
+
+            _invalidate_cache(space_id)
+
+            logger.info(
+                "Re-authenticated %s MCP connector %s for user %s",
+                svc.name,
+                db_connector.id,
+                user_id,
+            )
+            reauth_return_url = data.get("return_url")
+            if (
+                reauth_return_url
+                and reauth_return_url.startswith("/")
+                and not reauth_return_url.startswith("//")
+            ):
+                return RedirectResponse(
+                    url=f"{config.NEXT_FRONTEND_URL}{reauth_return_url}"
+                )
+            return _frontend_redirect(
+                space_id,
+                success=True,
+                connector_id=db_connector.id,
+                service=service,
+            )
+
+        # ---- New connector path ----
+        naming_identifier = account_meta.get("display_name")
+        connector_name = await generate_unique_connector_name(
+            session,
+            db_connector_type,
+            space_id,
+            user_id,
+            naming_identifier,
+        )
+
+        new_connector = SearchSourceConnector(
+            name=connector_name,
+            connector_type=db_connector_type,
+            is_indexable=False,
+            config=connector_config,
+            search_space_id=space_id,
+            user_id=user_id,
+        )
+        session.add(new_connector)
+
+        try:
+            await session.commit()
+        except IntegrityError as e:
+            await session.rollback()
+            raise HTTPException(
+                status_code=409,
+                detail="A connector for this service already exists.",
+            ) from e
+
+        _invalidate_cache(space_id)
+
+        logger.info(
+            "Created %s MCP connector %s for user %s in space %s",
+            svc.name,
+            new_connector.id,
+            user_id,
+            space_id,
+        )
+        return _frontend_redirect(
+            space_id,
+            success=True,
+            connector_id=new_connector.id,
+            service=service,
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "Failed to complete %s MCP OAuth: %s",
+            service,
+            e,
+            exc_info=True,
+        )
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to complete {service} MCP OAuth.",
+        ) from e
+
+
+# ---------------------------------------------------------------------------
+# /reauth — re-authenticate an existing MCP connector
+# ---------------------------------------------------------------------------
+
+
+@router.get("/auth/mcp/{service}/connector/reauth")
+async def reauth_mcp_service(
+    service: str,
+    space_id: int,
+    connector_id: int,
+    return_url: str | None = None,
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+):
+    from app.services.mcp_oauth.registry import get_service
+
+    svc = get_service(service)
+    if not svc:
+        raise HTTPException(status_code=404, detail=f"Unknown MCP service: {service}")
+
+    db_connector_type = SearchSourceConnectorType(svc.connector_type)
+    result = await session.execute(
+        select(SearchSourceConnector).filter(
+            SearchSourceConnector.id == connector_id,
+            SearchSourceConnector.user_id == user.id,
+            SearchSourceConnector.search_space_id == space_id,
+            SearchSourceConnector.connector_type == db_connector_type,
+        )
+    )
+    if not result.scalars().first():
+        raise HTTPException(
+            status_code=404,
+            detail="Connector not found or access denied",
+        )
+
+    try:
+        from app.services.mcp_oauth.discovery import (
+            discover_oauth_metadata,
+            register_client,
+        )
+
+        metadata = await discover_oauth_metadata(
+            svc.mcp_url,
+            origin_override=svc.oauth_discovery_origin,
+        )
+        auth_endpoint = svc.auth_endpoint_override or metadata.get(
+            "authorization_endpoint"
+        )
+        token_endpoint = svc.token_endpoint_override or metadata.get("token_endpoint")
+        registration_endpoint = metadata.get("registration_endpoint")
+
+        if not auth_endpoint or not token_endpoint:
+            raise HTTPException(
+                status_code=502,
+                detail=f"{svc.name} MCP server returned incomplete OAuth metadata.",
+            )
+
+        redirect_uri = _build_redirect_uri(service)
+
+        if svc.supports_dcr and registration_endpoint:
+            dcr = await register_client(registration_endpoint, redirect_uri)
+            client_id = dcr.get("client_id")
+            client_secret = dcr.get("client_secret", "")
+            if not client_id:
+                raise HTTPException(
+                    status_code=502,
+                    detail=f"DCR for {svc.name} did not return a client_id.",
+                )
+        elif svc.client_id_env:
+            client_id = getattr(config, svc.client_id_env, None)
+            client_secret = getattr(config, svc.client_secret_env or "", None) or ""
+            if not client_id:
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"{svc.name} MCP OAuth not configured ({svc.client_id_env}).",
+                )
+        else:
+            raise HTTPException(
+                status_code=502,
+                detail=f"{svc.name} MCP server has no DCR and no fallback credentials.",
+            )
+
+        verifier, challenge = generate_pkce_pair()
+        enc = _get_token_encryption()
+
+        extra: dict = {
+            "service": service,
+            "code_verifier": verifier,
+            "mcp_client_id": client_id,
+            "mcp_client_secret": enc.encrypt_token(client_secret)
+            if client_secret
+            else "",
+            "mcp_token_endpoint": token_endpoint,
+            "mcp_url": svc.mcp_url,
+            "connector_id": connector_id,
+        }
+        if return_url and return_url.startswith("/"):
+            extra["return_url"] = return_url
+
+        state = _get_state_manager().generate_secure_state(
+            space_id,
+            user.id,
+            **extra,
+        )
+
+        auth_params: dict[str, str] = {
+            "client_id": client_id,
+            "response_type": "code",
+            "redirect_uri": redirect_uri,
+            "code_challenge": challenge,
+            "code_challenge_method": "S256",
+            "state": state,
+        }
+        if svc.scopes:
+            auth_params[svc.scope_param] = " ".join(svc.scopes)
+
+        auth_url = f"{auth_endpoint}?{urlencode(auth_params)}"
+
+        logger.info(
+            "Initiating %s MCP re-auth for user %s, connector %s",
+            svc.name,
+            user.id,
+            connector_id,
+        )
+        return {"auth_url": auth_url}
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "Failed to initiate %s MCP re-auth: %s",
+            service,
+            e,
+            exc_info=True,
+        )
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to initiate {service} MCP re-auth.",
+        ) from e
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _invalidate_cache(space_id: int) -> None:
+    try:
+        from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+
+        invalidate_mcp_tools_cache(space_id)
+    except Exception:
+        logger.debug("MCP cache invalidation skipped", exc_info=True)
diff --git a/surfsense_backend/app/routes/memory_routes.py b/surfsense_backend/app/routes/memory_routes.py
new file mode 100644
index 0000000000..e57ca40552
--- /dev/null
+++ b/surfsense_backend/app/routes/memory_routes.py
@@ -0,0 +1,150 @@
+"""Routes for user memory management (personal memory.md)."""
+
+from __future__ import annotations
+
+import logging
+
+from fastapi import APIRouter, Depends, HTTPException
+from langchain_core.messages import HumanMessage
+from pydantic import BaseModel
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.llm_config import (
+    create_chat_litellm_from_agent_config,
+    load_agent_llm_config_for_search_space,
+)
+from app.agents.new_chat.tools.update_memory import MEMORY_HARD_LIMIT, _save_memory
+from app.db import User, get_async_session
+from app.users import current_active_user
+from app.utils.content_utils import extract_text_content
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+class MemoryRead(BaseModel):
+    memory_md: str
+
+
+class MemoryUpdate(BaseModel):
+    memory_md: str
+
+
+class MemoryEditRequest(BaseModel):
+    query: str
+    search_space_id: int
+
+
+_MEMORY_EDIT_PROMPT = """\
+You are a memory editor. The user wants to modify their memory document. \
+Apply the user's instruction to the existing memory document and output the \
+FULL updated document.
+
+RULES:
+1. If the instruction asks to add something, add it with format: \
+- (YYYY-MM-DD) [fact|pref|instr] text, under an existing or new ## heading. \
+Heading names should be personal and descriptive, not generic categories.
+2. If the instruction asks to remove something, remove the matching entry.
+3. If the instruction asks to change something, update the matching entry.
+4. Preserve existing ## headings and all other entries.
+5. Every bullet must include a marker: [fact], [pref], or [instr].
+6. Use the user's first name (from <user_name>) in entries instead of "the user".
+7. Output ONLY the updated markdown — no explanations, no wrapping.
+
+<user_name>{user_name}</user_name>
+
+<current_memory>
+{current_memory}
+</current_memory>
+
+<user_instruction>
+{instruction}
+</user_instruction>"""
+
+
+@router.get("/users/me/memory", response_model=MemoryRead)
+async def get_user_memory(
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+):
+    await session.refresh(user, ["memory_md"])
+    return MemoryRead(memory_md=user.memory_md or "")
+
+
+@router.put("/users/me/memory", response_model=MemoryRead)
+async def update_user_memory(
+    body: MemoryUpdate,
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+):
+    if len(body.memory_md) > MEMORY_HARD_LIMIT:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Memory exceeds {MEMORY_HARD_LIMIT:,} character limit ({len(body.memory_md):,} chars).",
+        )
+    user.memory_md = body.memory_md
+    session.add(user)
+    await session.commit()
+    await session.refresh(user, ["memory_md"])
+    return MemoryRead(memory_md=user.memory_md or "")
+
+
+@router.post("/users/me/memory/edit", response_model=MemoryRead)
+async def edit_user_memory(
+    body: MemoryEditRequest,
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+):
+    """Apply a natural language edit to the user's personal memory via LLM."""
+    agent_config = await load_agent_llm_config_for_search_space(
+        session, body.search_space_id
+    )
+    if not agent_config:
+        raise HTTPException(status_code=500, detail="No LLM configuration available.")
+    llm = create_chat_litellm_from_agent_config(agent_config)
+    if not llm:
+        raise HTTPException(status_code=500, detail="Failed to create LLM instance.")
+
+    await session.refresh(user, ["memory_md", "display_name"])
+    current_memory = user.memory_md or ""
+    first_name = (
+        user.display_name.strip().split()[0]
+        if user.display_name and user.display_name.strip()
+        else "The user"
+    )
+
+    prompt = _MEMORY_EDIT_PROMPT.format(
+        current_memory=current_memory or "(empty)",
+        instruction=body.query,
+        user_name=first_name,
+    )
+    try:
+        response = await llm.ainvoke(
+            [HumanMessage(content=prompt)],
+            config={"tags": ["surfsense:internal", "memory-edit"]},
+        )
+        updated = extract_text_content(response.content).strip()
+    except Exception as e:
+        logger.exception("Memory edit LLM call failed: %s", e)
+        raise HTTPException(status_code=500, detail="Memory edit failed.") from e
+
+    if not updated:
+        raise HTTPException(status_code=400, detail="LLM returned empty result.")
+
+    result = await _save_memory(
+        updated_memory=updated,
+        old_memory=current_memory,
+        llm=llm,
+        apply_fn=lambda content: setattr(user, "memory_md", content),
+        commit_fn=session.commit,
+        rollback_fn=session.rollback,
+        label="memory",
+        scope="user",
+    )
+
+    if result.get("status") == "error":
+        raise HTTPException(status_code=400, detail=result["message"])
+
+    await session.refresh(user, ["memory_md"])
+    return MemoryRead(memory_md=user.memory_md or "")
diff --git a/surfsense_backend/app/routes/new_chat_routes.py b/surfsense_backend/app/routes/new_chat_routes.py
index 10a6951faa..44fc1c392d 100644
--- a/surfsense_backend/app/routes/new_chat_routes.py
+++ b/surfsense_backend/app/routes/new_chat_routes.py
@@ -11,17 +11,32 @@
 """
 
 import asyncio
+import json
 import logging
 from datetime import UTC, datetime
 
-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, HTTPException, Request, Response
 from fastapi.responses import StreamingResponse
-from sqlalchemy import func, or_
+from sqlalchemy import func, or_, text as sa_text
+from sqlalchemy.dialects.postgresql import insert as pg_insert
 from sqlalchemy.exc import IntegrityError, OperationalError
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
 
+from app.agents.new_chat.filesystem_selection import (
+    ClientPlatform,
+    FilesystemMode,
+    FilesystemSelection,
+    LocalFilesystemMount,
+)
+from app.agents.new_chat.middleware.busy_mutex import (
+    get_cancel_state,
+    is_cancel_requested,
+    manager,
+    request_cancel,
+)
+from app.config import config
 from app.db import (
     ChatComment,
     ChatVisibility,
@@ -30,12 +45,15 @@
     NewChatThread,
     Permission,
     SearchSpace,
+    TokenUsage,
     User,
     get_async_session,
     shielded_async_session,
 )
 from app.schemas.new_chat import (
     AgentToolInfo,
+    CancelActiveTurnResponse,
+    LocalFilesystemMountPayload,
     NewChatMessageRead,
     NewChatRequest,
     NewChatThreadCreate,
@@ -50,17 +68,412 @@
     ThreadHistoryLoadResponse,
     ThreadListItem,
     ThreadListResponse,
+    TokenUsageSummary,
+    TurnStatusResponse,
+)
+from app.tasks.chat.stream_new_chat import (
+    stream_new_chat,
+    stream_resume_chat,
 )
-from app.tasks.chat.stream_new_chat import stream_new_chat, stream_resume_chat
 from app.users import current_active_user
+from app.utils.perf import get_perf_logger
 from app.utils.rbac import check_permission
+from app.utils.user_message_multimodal import (
+    split_langchain_human_content,
+    split_persisted_user_content_parts,
+)
 
 _logger = logging.getLogger(__name__)
+_perf_log = get_perf_logger()
 _background_tasks: set[asyncio.Task] = set()
+TURN_CANCELLING_INITIAL_DELAY_MS = 200
+TURN_CANCELLING_BACKOFF_FACTOR = 2
+TURN_CANCELLING_MAX_DELAY_MS = 1500
 
 router = APIRouter()
 
 
+def _resolve_filesystem_selection(
+    *,
+    mode: str,
+    client_platform: str,
+    local_mounts: list[LocalFilesystemMountPayload] | None,
+) -> FilesystemSelection:
+    """Validate and normalize filesystem mode settings from request payload."""
+    try:
+        resolved_mode = FilesystemMode(mode)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail="Invalid filesystem_mode") from exc
+    try:
+        resolved_platform = ClientPlatform(client_platform)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail="Invalid client_platform") from exc
+
+    if resolved_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER:
+        if not config.ENABLE_DESKTOP_LOCAL_FILESYSTEM:
+            raise HTTPException(
+                status_code=400,
+                detail="Desktop local filesystem mode is disabled on this deployment.",
+            )
+        if resolved_platform != ClientPlatform.DESKTOP:
+            raise HTTPException(
+                status_code=400,
+                detail="desktop_local_folder mode is only available on desktop runtime.",
+            )
+        normalized_mounts: list[tuple[str, str]] = []
+        seen_mounts: set[str] = set()
+        for mount in local_mounts or []:
+            mount_id = mount.mount_id.strip()
+            root_path = mount.root_path.strip()
+            if not mount_id or not root_path:
+                continue
+            if mount_id in seen_mounts:
+                continue
+            seen_mounts.add(mount_id)
+            normalized_mounts.append((mount_id, root_path))
+        if not normalized_mounts:
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    "local_filesystem_mounts must include at least one mount for "
+                    "desktop_local_folder mode."
+                ),
+            )
+        return FilesystemSelection(
+            mode=resolved_mode,
+            client_platform=resolved_platform,
+            local_mounts=tuple(
+                LocalFilesystemMount(mount_id=mount_id, root_path=root_path)
+                for mount_id, root_path in normalized_mounts
+            ),
+        )
+
+    return FilesystemSelection(
+        mode=FilesystemMode.CLOUD,
+        client_platform=resolved_platform,
+    )
+
+
+def _compute_turn_cancelling_retry_delay(attempt: int) -> int:
+    """Bounded exponential delay for TURN_CANCELLING retry hints."""
+    if attempt < 1:
+        attempt = 1
+    delay = TURN_CANCELLING_INITIAL_DELAY_MS * (
+        TURN_CANCELLING_BACKOFF_FACTOR ** (attempt - 1)
+    )
+    return min(delay, TURN_CANCELLING_MAX_DELAY_MS)
+
+
+def _build_turn_status_payload(thread_id: int) -> dict[str, object]:
+    lock = manager.lock_for(str(thread_id))
+    if not lock.locked():
+        return {"status": "idle"}
+
+    if is_cancel_requested(str(thread_id)):
+        cancel_state = get_cancel_state(str(thread_id))
+        attempt = cancel_state[0] if cancel_state else 1
+        retry_after_ms = _compute_turn_cancelling_retry_delay(attempt)
+        retry_after_at = int(datetime.now(UTC).timestamp() * 1000) + retry_after_ms
+        return {
+            "status": "cancelling",
+            "retry_after_ms": retry_after_ms,
+            "retry_after_at": retry_after_at,
+        }
+
+    return {"status": "busy"}
+
+
+def _set_retry_after_headers(response: Response, retry_after_ms: int) -> None:
+    response.headers["retry-after-ms"] = str(retry_after_ms)
+    response.headers["Retry-After"] = str(max(1, (retry_after_ms + 999) // 1000))
+
+
+def _raise_if_thread_busy_for_start(thread_id: int) -> None:
+    status_payload = _build_turn_status_payload(thread_id)
+    status = status_payload["status"]
+    if status == "idle":
+        return
+    if status == "cancelling":
+        retry_after_ms = int(status_payload.get("retry_after_ms") or 0)
+        detail = {
+            "errorCode": "TURN_CANCELLING",
+            "message": "A previous response is still stopping. Please try again in a moment.",
+            "retry_after_ms": retry_after_ms if retry_after_ms > 0 else None,
+            "retry_after_at": status_payload.get("retry_after_at"),
+        }
+        headers = (
+            {
+                "retry-after-ms": str(retry_after_ms),
+                "Retry-After": str(max(1, (retry_after_ms + 999) // 1000)),
+            }
+            if retry_after_ms > 0
+            else None
+        )
+        raise HTTPException(status_code=409, detail=detail, headers=headers)
+
+    raise HTTPException(
+        status_code=409,
+        detail={
+            "errorCode": "THREAD_BUSY",
+            "message": "Another response is still finishing for this thread. Please try again in a moment.",
+        },
+    )
+
+
+def _find_pre_turn_checkpoint_id(
+    checkpoint_tuples: list,
+    *,
+    turn_id: str,
+) -> str | None:
+    """Locate the LangGraph checkpoint immediately before ``turn_id`` started.
+
+    ``checkpoint_tuples`` arrives newest-first from
+    ``checkpointer.alist(config)``. We walk OLDEST-first (``reversed``)
+    and remember the most recent checkpoint that does NOT belong to the
+    edited turn. As soon as we cross into the edited turn (a checkpoint
+    whose ``turn_id`` matches), we return the previously-tracked
+    checkpoint — that's the state immediately before ``turn_id`` began.
+
+    The naive "newest-first, return first non-matching" approach is
+    INCORRECT when later turns exist after ``turn_id``: their
+    checkpoints also satisfy ``cp_turn_id != turn_id`` and would be
+    returned before the real pre-turn boundary is reached.
+
+    Reads from ``cp_tuple.metadata`` (the durable surface promoted from
+    ``configurable`` at write time) rather than ``config["configurable"]``
+    so the lookup is portable across checkpointer implementations.
+
+    Returns ``None`` when no eligible pre-turn checkpoint exists (e.g.
+    the edited turn is the very first turn of the thread). Callers fall
+    back to the oldest available checkpoint in that case.
+    """
+
+    last_pre_turn_target: str | None = None
+    for cp_tuple in reversed(checkpoint_tuples):  # oldest -> newest
+        metadata = getattr(cp_tuple, "metadata", None) or {}
+        cp_turn_id = metadata.get("turn_id") if isinstance(metadata, dict) else None
+        if cp_turn_id == turn_id:
+            # Crossed into the edited turn; the previous tracked
+            # checkpoint is the rewind target. May be ``None`` if we hit
+            # the edited turn on the very first iteration.
+            return last_pre_turn_target
+        try:
+            last_pre_turn_target = cp_tuple.config["configurable"]["checkpoint_id"]
+        except (KeyError, TypeError):
+            continue
+    return last_pre_turn_target
+
+
+async def _revert_turns_for_regenerate(
+    *,
+    thread_id: int,
+    chat_turn_ids: list[str],
+    requester_user_id: str,
+) -> dict:
+    """Best-effort revert pass for every ``chat_turn_id`` in ``chat_turn_ids``.
+
+    Runs BEFORE the regenerate stream so the frontend can surface
+    partial-rollback feedback alongside the new assistant turn. Each
+    turn's actions are reverted in their own SAVEPOINTs (handled
+    inside :mod:`app.routes.agent_revert_route`'s helpers) so a single
+    failure never poisons the batch.
+
+    Sequencing inside the request: revert THEN regenerate. The
+    operation is NOT atomic and partial state IS surfaced — see the
+    plan's "Sequencing inside the request" note.
+    """
+
+    from app.routes.agent_revert_route import (
+        RevertTurnActionResult,
+        _classify_outcome,
+        _OutcomeRollbackError,
+        _was_already_reverted,
+        _was_already_reverted_batch,
+    )
+    from app.services.revert_service import (
+        can_revert,
+        revert_action,
+    )
+
+    aggregated_results: list[dict] = []
+    # Exhaustive counters keep the response invariant
+    # ``total == sum(counters)`` true for ``data-revert-results``.
+    counts = {
+        "reverted": 0,
+        "already_reverted": 0,
+        "not_reversible": 0,
+        "permission_denied": 0,
+        "failed": 0,
+        "skipped": 0,
+    }
+
+    # Local import keeps the route module's existing imports tidy and
+    # avoids a circular dependency at module-load time.
+    from app.db import AgentActionLog as _AgentActionLog
+
+    async with shielded_async_session() as session:
+        for chat_turn_id in chat_turn_ids:
+            rows_stmt = (
+                select(_AgentActionLog)
+                .where(
+                    _AgentActionLog.thread_id == thread_id,
+                    _AgentActionLog.chat_turn_id == chat_turn_id,
+                )
+                .order_by(
+                    _AgentActionLog.created_at.desc(),
+                    _AgentActionLog.id.desc(),
+                )
+            )
+            rows = (await session.execute(rows_stmt)).scalars().all()
+
+            # Batch idempotency probe across the turn (single SELECT
+            # instead of one per row).
+            eligible_ids = [r.id for r in rows if r.reverse_of is None]
+            already_reverted_map = await _was_already_reverted_batch(
+                session, action_ids=eligible_ids
+            )
+
+            for action in rows:
+                if action.reverse_of is not None:
+                    counts["skipped"] += 1
+                    aggregated_results.append(
+                        RevertTurnActionResult(
+                            action_id=action.id,
+                            tool_name=action.tool_name,
+                            status="skipped",
+                            message="Row is itself a revert action; skipped.",
+                        ).model_dump()
+                    )
+                    continue
+
+                existing_revert_id = already_reverted_map.get(action.id)
+                if existing_revert_id is not None:
+                    counts["already_reverted"] += 1
+                    aggregated_results.append(
+                        RevertTurnActionResult(
+                            action_id=action.id,
+                            tool_name=action.tool_name,
+                            status="already_reverted",
+                            new_action_id=existing_revert_id,
+                        ).model_dump()
+                    )
+                    continue
+
+                if not can_revert(
+                    requester_user_id=requester_user_id,
+                    action=action,
+                    is_admin=False,
+                ):
+                    counts["permission_denied"] += 1
+                    aggregated_results.append(
+                        RevertTurnActionResult(
+                            action_id=action.id,
+                            tool_name=action.tool_name,
+                            status="permission_denied",
+                            message="You are not allowed to revert this action.",
+                        ).model_dump()
+                    )
+                    continue
+
+                try:
+                    async with session.begin_nested():
+                        outcome = await revert_action(
+                            session,
+                            action=action,
+                            requester_user_id=requester_user_id,
+                        )
+                        if outcome.status != "ok":
+                            raise _OutcomeRollbackError(outcome)
+                except _OutcomeRollbackError as rollback:
+                    outcome = rollback.outcome
+                    classified = _classify_outcome(outcome)
+                    if classified == "permission_denied":
+                        counts["permission_denied"] += 1
+                    else:
+                        counts["not_reversible"] += 1
+                    aggregated_results.append(
+                        RevertTurnActionResult(
+                            action_id=action.id,
+                            tool_name=action.tool_name,
+                            status=classified,
+                            message=outcome.message,
+                        ).model_dump()
+                    )
+                    continue
+                except IntegrityError:
+                    # Concurrent revert won the race against the
+                    # pre-flight ``_was_already_reverted`` SELECT.
+                    # Surface the winning revert id so the client can
+                    # treat this as a successful idempotent op.
+                    existing_revert_id = await _was_already_reverted(
+                        session, action_id=action.id
+                    )
+                    counts["already_reverted"] += 1
+                    aggregated_results.append(
+                        RevertTurnActionResult(
+                            action_id=action.id,
+                            tool_name=action.tool_name,
+                            status="already_reverted",
+                            new_action_id=existing_revert_id,
+                        ).model_dump()
+                    )
+                    continue
+                except Exception as err:  # pragma: no cover — defensive
+                    _logger.exception(
+                        "Unexpected revert failure during regenerate batch "
+                        "for action_id=%s",
+                        action.id,
+                    )
+                    counts["failed"] += 1
+                    aggregated_results.append(
+                        RevertTurnActionResult(
+                            action_id=action.id,
+                            tool_name=action.tool_name,
+                            status="failed",
+                            error=str(err) or err.__class__.__name__,
+                        ).model_dump()
+                    )
+                    continue
+
+                counts["reverted"] += 1
+                aggregated_results.append(
+                    RevertTurnActionResult(
+                        action_id=action.id,
+                        tool_name=action.tool_name,
+                        status="reverted",
+                        message=outcome.message,
+                        new_action_id=outcome.new_action_id,
+                    ).model_dump()
+                )
+
+        try:
+            await session.commit()
+        except Exception:
+            _logger.exception(
+                "[regenerate-revert] Final commit failed; rolling back batch."
+            )
+            await session.rollback()
+
+    has_partial = (
+        counts["failed"] > 0
+        or counts["not_reversible"] > 0
+        or counts["permission_denied"] > 0
+    )
+
+    return {
+        "status": "partial" if has_partial else "ok",
+        "chat_turn_ids": chat_turn_ids,
+        "total": len(aggregated_results),
+        "reverted": counts["reverted"],
+        "already_reverted": counts["already_reverted"],
+        "not_reversible": counts["not_reversible"],
+        "permission_denied": counts["permission_denied"],
+        "failed": counts["failed"],
+        "skipped": counts["skipped"],
+        "results": aggregated_results,
+    }
+
+
 def _try_delete_sandbox(thread_id: int) -> None:
     """Fire-and-forget sandbox + local file deletion so the HTTP response isn't blocked."""
     from app.agents.new_chat.sandbox import (
@@ -473,10 +886,13 @@ async def get_thread_messages(
         # Check thread-level access based on visibility
         await check_thread_access(session, thread, user)
 
-        # Get messages with their authors loaded
+        # Get messages with their authors and token usage loaded
         messages_result = await session.execute(
             select(NewChatMessage)
-            .options(selectinload(NewChatMessage.author))
+            .options(
+                selectinload(NewChatMessage.author),
+                selectinload(NewChatMessage.token_usage),
+            )
             .filter(NewChatMessage.thread_id == thread_id)
             .order_by(NewChatMessage.created_at)
         )
@@ -493,6 +909,10 @@ async def get_thread_messages(
                 author_id=msg.author_id,
                 author_display_name=msg.author.display_name if msg.author else None,
                 author_avatar_url=msg.author.avatar_url if msg.author else None,
+                token_usage=TokenUsageSummary.model_validate(msg.token_usage)
+                if msg.token_usage
+                else None,
+                turn_id=msg.turn_id,
             )
             for msg in db_messages
         ]
@@ -530,7 +950,11 @@ async def get_thread_full(
     try:
         result = await session.execute(
             select(NewChatThread)
-            .options(selectinload(NewChatThread.messages))
+            .options(
+                selectinload(NewChatThread.messages).selectinload(
+                    NewChatMessage.token_usage
+                ),
+            )
             .filter(NewChatThread.id == thread_id)
         )
         thread = result.scalars().first()
@@ -869,6 +1293,24 @@ async def append_message(
     user: User = Depends(current_active_user),
 ):
     """
+    .. deprecated:: 2026-05
+       Replaced by the **SSE-based message ID handshake**. The streaming
+       generator (`stream_new_chat` / `stream_resume_chat`) now persists
+       both the user and assistant rows server-side via
+       ``persist_user_turn`` / ``persist_assistant_shell`` and emits
+       ``data-user-message-id`` / ``data-assistant-message-id`` SSE events
+       so the frontend can rename its optimistic IDs in real time. The
+       new FE bundle no longer calls this route.
+
+       This handler is retained as a **silent no-op for legacy / cached
+       FE bundles**: the underlying ``INSERT ... ON CONFLICT DO NOTHING``
+       pattern means a stale bundle hitting this route after the SSE
+       handshake already wrote the row simply returns the existing row
+       (200 OK) without raising or duplicating data. After a 2-week soak
+       (target: ``[persist_user_turn] outcome=race_recovered`` rate ~0)
+       this entire route — and the FE ``appendMessage`` function — is
+       earmarked for removal.
+
     Append a message to a thread.
     This is used by ThreadHistoryAdapter.append() to persist messages.
 
@@ -879,6 +1321,22 @@ async def append_message(
     Requires CHATS_UPDATE permission.
     """
     try:
+        # Capture ``user.id`` as a primitive UUID up front. The
+        # ``current_active_user`` dependency hands us a ``User`` ORM
+        # row bound to ``session``; if the outer ``except
+        # IntegrityError`` block below ever fires (an unexpected
+        # constraint like a foreign key violation — the common
+        # ``(thread_id, turn_id, role)`` race is now handled silently
+        # by ``ON CONFLICT DO NOTHING`` so it never raises) it calls
+        # ``session.rollback()``, which expires every attached ORM
+        # row including this user. Any later ``user.id`` access would
+        # then trigger a lazy PK reload — which on async SQLAlchemy
+        # fails with ``MissingGreenlet`` because the reload happens
+        # outside the awaitable greenlet boundary. Reading ``id``
+        # once here pins the value as a plain UUID so all downstream
+        # uses (TokenUsage insert, response build) are immune.
+        user_uuid = user.id
+
         # Parse raw body - extract only role and content, ignoring extra fields
         raw_body = await request.json()
         role = raw_body.get("role")
@@ -921,29 +1379,200 @@ async def append_message(
         # Check thread-level access based on visibility
         await check_thread_access(session, thread, user)
 
-        # Create message
-        db_message = NewChatMessage(
-            thread_id=thread_id,
-            role=message_role,
-            content=content,
-            author_id=user.id,
+        # Create message. ``turn_id`` is the per-turn correlation id from
+        # ``configurable.turn_id`` (added in migration 136) — when the
+        # client streams it back to ``appendMessage``, we persist it so
+        # C1's edit-from-arbitrary-position can later map this message
+        # back to the LangGraph checkpoint that produced its turn.
+        raw_turn_id = raw_body.get("turn_id")
+        turn_id_value = (
+            str(raw_turn_id).strip()
+            if isinstance(raw_turn_id, str) and raw_turn_id.strip()
+            else None
         )
-        session.add(db_message)
 
-        # Update thread's updated_at timestamp
+        # Update thread's updated_at timestamp (always — both insert
+        # and recovery paths represent thread activity).
         thread.updated_at = datetime.now(UTC)
 
-        # flush assigns the PK/defaults without a round-trip SELECT
-        await session.flush()
+        # Insert the new message via ``INSERT ... ON CONFLICT DO NOTHING``
+        # keyed on the ``(thread_id, turn_id, role)`` partial unique
+        # index from migration 141 (``WHERE turn_id IS NOT NULL``).
+        #
+        # Why ON CONFLICT instead of ``session.add() + flush() + except
+        # IntegrityError``:
+        #   1. The conflict between this legacy FE ``appendMessage``
+        #      round-trip and the server-side
+        #      ``finalize_assistant_turn`` writer is a NORMAL,
+        #      *expected* race — every assistant turn fires it. Using
+        #      catch-and-recover means asyncpg raises
+        #      ``UniqueViolationError`` -> SQLAlchemy wraps it as
+        #      ``IntegrityError`` -> our handler catches and recovers.
+        #      Functionally fine, but every ``raise`` event lights up
+        #      VS Code's debugger (debugpy's ``justMyCode=false`` mode
+        #      loses track of the catch frame across SQLAlchemy's
+        #      async greenlet boundary, so even ``Raised Exceptions``
+        #      being unchecked doesn't reliably suppress the pause).
+        #      ON CONFLICT pushes the conflict resolution into Postgres
+        #      where no Python exception is constructed at all.
+        #   2. No ``session.rollback()`` -> no expiring of attached
+        #      ORM rows -> no risk of ``MissingGreenlet`` from
+        #      lazy-loading expired user/thread state later in the
+        #      handler.
+        #   3. Cleaner production logs (no SQLAlchemy ``IntegrityError``
+        #      tracebacks emitted by uvicorn's logger between the
+        #      ``raise`` and our ``except``).
+        #
+        # When ``turn_id_value`` is ``None`` the partial index doesn't
+        # apply and the INSERT proceeds normally. Other constraint
+        # violations (FK, NOT NULL, etc.) still raise ``IntegrityError``
+        # and are caught by the outer ``except IntegrityError`` block
+        # to preserve the legacy 400 behavior.
+        #
+        # Note on ``content``: when we recover the existing row, we
+        # intentionally discard the FE's ``content`` payload from
+        # ``raw_body`` and return the row's existing ``content``. The
+        # streaming task is now the *authoritative writer* for
+        # assistant ``ContentPart[]`` shape (mid-stream
+        # ``AssistantContentBuilder`` -> ``finalize_assistant_turn``)
+        # so the FE's later ``appendMessage`` is just a stale snapshot
+        # of the same data — keeping the server-built rich content
+        # (with full tool-call args / argsText / langchainToolCallId)
+        # is correct, not lossy.
+        insert_stmt = (
+            pg_insert(NewChatMessage)
+            .values(
+                thread_id=thread_id,
+                role=message_role,
+                content=content,
+                author_id=user_uuid,
+                turn_id=turn_id_value,
+            )
+            .on_conflict_do_nothing(
+                index_elements=["thread_id", "turn_id", "role"],
+                index_where=sa_text("turn_id IS NOT NULL"),
+            )
+            .returning(NewChatMessage.id)
+        )
+        inserted_id = (await session.execute(insert_stmt)).scalar()
+
+        if inserted_id is None:
+            # Conflict on partial unique index — server-side stream
+            # already wrote this row. Look it up and reuse it.
+            if turn_id_value is None:
+                # Defensive: ON CONFLICT only fires for ``turn_id IS
+                # NOT NULL`` rows, so this branch should be
+                # unreachable. Preserve the legacy 400 just in case
+                # Postgres ever surprises us.
+                raise HTTPException(
+                    status_code=400,
+                    detail="Database constraint violation. Please check your input data.",
+                ) from None
+            lookup = await session.execute(
+                select(NewChatMessage).filter(
+                    NewChatMessage.thread_id == thread_id,
+                    NewChatMessage.turn_id == turn_id_value,
+                    NewChatMessage.role == message_role,
+                )
+            )
+            existing_message = lookup.scalars().first()
+            if existing_message is None:
+                # Conflict reported but the row vanished between
+                # INSERT and SELECT — extremely unlikely (would
+                # require a concurrent DELETE within the same
+                # transaction visibility), but preserve safe
+                # behavior.
+                raise HTTPException(
+                    status_code=400,
+                    detail="Database constraint violation. Please check your input data.",
+                ) from None
+            db_message = existing_message
+            # Perf signal: counts how often the legacy FE round-trip
+            # races the server-side ``finalize_assistant_turn``. A
+            # rising rate after the rework is OK (it's exactly the
+            # ghost-thread fix's recovery path firing); a sudden drop
+            # to zero would mean the FE isn't posting appendMessage
+            # at all (different bug).
+            _perf_log.info(
+                "[append_message] outcome=recovered_via_unique_index "
+                "thread_id=%s turn_id=%s role=%s message_id=%s",
+                thread_id,
+                turn_id_value,
+                message_role.value,
+                db_message.id,
+            )
+        else:
+            # INSERT succeeded — load the full ORM row so the
+            # response can include server-side-defaulted columns
+            # (``created_at``, etc.) and the relationship surface
+            # stays consistent with the recovery path.
+            inserted_row = await session.get(NewChatMessage, inserted_id)
+            if inserted_row is None:
+                # Should be impossible: we just inserted it in this
+                # same transaction. Fail loud if it happens.
+                raise HTTPException(
+                    status_code=500,
+                    detail="Inserted message could not be loaded.",
+                ) from None
+            db_message = inserted_row
+
+        # Persist token usage if provided (for assistant messages).
+        # ``cost_micros`` is the provider USD cost reported by LiteLLM,
+        # forwarded by the FE through the appendMessage round-trip so
+        # the historical TokenUsage row matches the credit debit applied
+        # at finalize time.
+        #
+        # De-dup: ``finalize_assistant_turn`` may also race to write a
+        # token_usage row for this same ``message_id`` (cross-session,
+        # cross-shielded). Use ``INSERT ... ON CONFLICT DO NOTHING`` keyed
+        # on the ``uq_token_usage_message_id`` partial unique index
+        # (migration 142). The loser silently drops its insert; exactly
+        # one row results regardless of which writer commits first.
+        token_usage_data = raw_body.get("token_usage")
+        if token_usage_data and message_role == NewChatMessageRole.ASSISTANT:
+            insert_stmt = (
+                pg_insert(TokenUsage)
+                .values(
+                    usage_type="chat",
+                    prompt_tokens=token_usage_data.get("prompt_tokens", 0),
+                    completion_tokens=token_usage_data.get("completion_tokens", 0),
+                    total_tokens=token_usage_data.get("total_tokens", 0),
+                    cost_micros=token_usage_data.get("cost_micros", 0),
+                    model_breakdown=token_usage_data.get("usage"),
+                    call_details=token_usage_data.get("call_details"),
+                    thread_id=thread_id,
+                    message_id=db_message.id,
+                    search_space_id=thread.search_space_id,
+                    user_id=user_uuid,
+                )
+                .on_conflict_do_nothing(
+                    index_elements=["message_id"],
+                    index_where=sa_text("message_id IS NOT NULL"),
+                )
+            )
+            await session.execute(insert_stmt)
+
         await session.commit()
 
-        # Return the in-memory object (already has id from flush) instead of
-        # doing an extra refresh() SELECT.
-        return db_message
+        # Build response manually to avoid lazy-loading the token_usage
+        # relationship after commit (which would trigger MissingGreenlet).
+        return NewChatMessageRead(
+            id=db_message.id,
+            thread_id=db_message.thread_id,
+            role=db_message.role,
+            content=db_message.content,
+            created_at=db_message.created_at,
+            author_id=db_message.author_id,
+            token_usage=None,
+            turn_id=db_message.turn_id,
+        )
 
     except HTTPException:
         raise
     except IntegrityError:
+        # Any IntegrityError that escaped the inline handler above
+        # comes from a *different* constraint (foreign key, etc.) —
+        # preserve the legacy 400 path.
         await session.rollback()
         raise HTTPException(
             status_code=400,
@@ -1003,6 +1632,7 @@ async def list_messages(
         # Get messages
         query = (
             select(NewChatMessage)
+            .options(selectinload(NewChatMessage.token_usage))
             .filter(NewChatMessage.thread_id == thread_id)
             .order_by(NewChatMessage.created_at)
             .offset(skip)
@@ -1059,6 +1689,7 @@ async def list_agent_tools(
 @router.post("/new_chat")
 async def handle_new_chat(
     request: NewChatRequest,
+    http_request: Request,
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
@@ -1094,6 +1725,12 @@ async def handle_new_chat(
 
         # Check thread-level access based on visibility
         await check_thread_access(session, thread, user)
+        _raise_if_thread_busy_for_start(request.chat_id)
+        filesystem_selection = _resolve_filesystem_selection(
+            mode=request.filesystem_mode,
+            client_platform=request.client_platform,
+            local_mounts=request.local_filesystem_mounts,
+        )
 
         # Get search space to check LLM config preferences
         search_space_result = await session.execute(
@@ -1123,6 +1760,18 @@ async def handle_new_chat(
         # connection (the "Exception terminating connection" errors).
         await session.close()
 
+        image_urls = (
+            [p.as_data_url() for p in request.user_images]
+            if request.user_images
+            else None
+        )
+
+        mentioned_documents_payload = (
+            [doc.model_dump() for doc in request.mentioned_documents]
+            if request.mentioned_documents
+            else None
+        )
+
         return StreamingResponse(
             stream_new_chat(
                 user_query=request.user_query,
@@ -1132,10 +1781,15 @@ async def handle_new_chat(
                 llm_config_id=llm_config_id,
                 mentioned_document_ids=request.mentioned_document_ids,
                 mentioned_surfsense_doc_ids=request.mentioned_surfsense_doc_ids,
+                mentioned_folder_ids=request.mentioned_folder_ids,
+                mentioned_documents=mentioned_documents_payload,
                 needs_history_bootstrap=thread.needs_history_bootstrap,
                 thread_visibility=thread.visibility,
                 current_user_display_name=user.display_name or "A team member",
                 disabled_tools=request.disabled_tools,
+                filesystem_selection=filesystem_selection,
+                request_id=getattr(http_request.state, "request_id", "unknown"),
+                user_image_data_urls=image_urls,
             ),
             media_type="text/event-stream",
             headers={
@@ -1154,6 +1808,93 @@ async def handle_new_chat(
         ) from None
 
 
+@router.post(
+    "/threads/{thread_id}/cancel-active-turn",
+    response_model=CancelActiveTurnResponse,
+)
+async def cancel_active_turn(
+    thread_id: int,
+    response: Response,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Signal cancellation for the currently running turn on ``thread_id``."""
+    result = await session.execute(
+        select(NewChatThread).filter(NewChatThread.id == thread_id)
+    )
+    thread = result.scalars().first()
+    if not thread:
+        raise HTTPException(status_code=404, detail="Thread not found")
+
+    await check_permission(
+        session,
+        user,
+        thread.search_space_id,
+        Permission.CHATS_UPDATE.value,
+        "You don't have permission to update chats in this search space",
+    )
+    await check_thread_access(session, thread, user)
+
+    status_payload = _build_turn_status_payload(thread_id)
+    if status_payload["status"] == "idle":
+        return CancelActiveTurnResponse(
+            status="idle",
+            error_code="NO_ACTIVE_TURN",
+        )
+
+    request_cancel(str(thread_id))
+    response.status_code = 202
+    updated_payload = _build_turn_status_payload(thread_id)
+    retry_after_ms = int(updated_payload.get("retry_after_ms") or 0)
+    retry_after_at = (
+        int(updated_payload["retry_after_at"])
+        if "retry_after_at" in updated_payload
+        else None
+    )
+    if retry_after_ms > 0:
+        _set_retry_after_headers(response, retry_after_ms)
+    return CancelActiveTurnResponse(
+        status="cancelling",
+        error_code="TURN_CANCELLING",
+        retry_after_ms=retry_after_ms if retry_after_ms > 0 else None,
+        retry_after_at=retry_after_at,
+    )
+
+
+@router.get(
+    "/threads/{thread_id}/turn-status",
+    response_model=TurnStatusResponse,
+)
+async def get_turn_status(
+    thread_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    result = await session.execute(
+        select(NewChatThread).filter(NewChatThread.id == thread_id)
+    )
+    thread = result.scalars().first()
+    if not thread:
+        raise HTTPException(status_code=404, detail="Thread not found")
+
+    await check_permission(
+        session,
+        user,
+        thread.search_space_id,
+        Permission.CHATS_READ.value,
+        "You don't have permission to view chats in this search space",
+    )
+    await check_thread_access(session, thread, user)
+
+    status_payload = _build_turn_status_payload(thread_id)
+    return TurnStatusResponse(
+        status=status_payload["status"],  # type: ignore[arg-type]
+        active_turn_id=None,
+        retry_after_ms=status_payload.get("retry_after_ms"),  # type: ignore[arg-type]
+        retry_after_at=status_payload.get("retry_after_at"),  # type: ignore[arg-type]
+    )
+
+
 # =============================================================================
 # Chat Regeneration Endpoint (Edit/Reload)
 # =============================================================================
@@ -1163,6 +1904,7 @@ async def handle_new_chat(
 async def regenerate_response(
     thread_id: int,
     request: RegenerateRequest,
+    http_request: Request,
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
@@ -1208,6 +1950,12 @@ async def regenerate_response(
 
         # Check thread-level access based on visibility
         await check_thread_access(session, thread, user)
+        _raise_if_thread_busy_for_start(thread_id)
+        filesystem_selection = _resolve_filesystem_selection(
+            mode=request.filesystem_mode,
+            client_platform=request.client_platform,
+            local_mounts=request.local_filesystem_mounts,
+        )
 
         # Get the checkpointer and state history
         checkpointer = await get_checkpointer()
@@ -1238,40 +1986,125 @@ async def regenerate_response(
 
         target_checkpoint_id = None
         user_query_to_use = request.user_query
+        regenerate_image_urls: list[str] = []
+
+        # ---------------------------------------------------------------
+        # Edit-from-arbitrary-position. When the client passes
+        # ``from_message_id`` we look up its persisted ``turn_id`` (added
+        # in migration 136) and pick the checkpoint immediately before
+        # that turn started.
+        #
+        # Legacy graceful-degradation contract:
+        #   * Rows persisted BEFORE migration 136 have ``turn_id IS NULL``.
+        #     Returning 400 in that case is the wrong UX — the user is
+        #     editing an old message in an existing thread and just wants
+        #     it to work. We instead skip the checkpoint rewind (the
+        #     stream falls back to the latest state) and skip the revert
+        #     pass (no chat_turn_id available to walk). Deletion still
+        #     uses ``created_at``, so the messages-after-cursor slice is
+        #     correct on both legacy and post-136 rows.
+        # ---------------------------------------------------------------
+        from_message_turn_id: str | None = None
+        from_message_created_at: datetime | None = None
+        legacy_from_message: bool = False
+        if request.from_message_id is not None:
+            from_msg_row = await session.execute(
+                select(NewChatMessage).filter(
+                    NewChatMessage.id == request.from_message_id,
+                    NewChatMessage.thread_id == thread_id,
+                )
+            )
+            from_msg = from_msg_row.scalars().first()
+            if from_msg is None:
+                raise HTTPException(
+                    status_code=404,
+                    detail="from_message_id not found in this thread.",
+                )
+            from_message_created_at = from_msg.created_at
+            if not from_msg.turn_id:
+                # Legacy row — surface the degradation in logs but let
+                # the request proceed with the slice-based delete and a
+                # cold-start checkpoint.
+                legacy_from_message = True
+                _logger.warning(
+                    "[regenerate] from_message_id=%s on thread=%s has no "
+                    "turn_id (legacy row pre-migration-136). Falling back "
+                    "to slice-based delete without checkpoint rewind. "
+                    "revert_actions=%s will be ignored.",
+                    request.from_message_id,
+                    thread_id,
+                    request.revert_actions,
+                )
+            else:
+                from_message_turn_id = from_msg.turn_id
+
+                # Walk oldest-to-newest and pick the LAST checkpoint whose
+                # ``turn_id`` differs from the edited turn — that's the state
+                # immediately before this turn started running. We read from
+                # ``metadata`` (the durable surface) rather than
+                # ``config["configurable"]`` so the lookup works across
+                # checkpointer implementations.
+                target_checkpoint_id = _find_pre_turn_checkpoint_id(
+                    checkpoint_tuples,
+                    turn_id=from_message_turn_id,
+                )
+                if target_checkpoint_id is None and len(checkpoint_tuples) > 0:
+                    # Fall back to the oldest checkpoint — better than
+                    # 400ing when the agent didn't checkpoint pre-turn
+                    # (e.g. very first turn of the thread).
+                    target_checkpoint_id = checkpoint_tuples[-1].config["configurable"][
+                        "checkpoint_id"
+                    ]
 
         # Look through checkpoints to find the right one
-        # We want to find the checkpoint just before the last HumanMessage
-        for i, cp_tuple in enumerate(checkpoint_tuples):
-            # Access the checkpoint's channel_values which contains "messages"
-            checkpoint_data = cp_tuple.checkpoint
-            channel_values = checkpoint_data.get("channel_values", {})
-            state_messages = channel_values.get("messages", [])
-
-            if state_messages:
-                last_msg = state_messages[-1]
-                # Find a checkpoint where the last message is NOT a HumanMessage
-                # This means we're at a state before the user's last message
-                if not isinstance(last_msg, HumanMessage):
-                    # If no new user_query provided (reload), extract from a later checkpoint
-                    if user_query_to_use is None and i > 0:
-                        # Get the user query from a more recent checkpoint
-                        for prev_cp_tuple in checkpoint_tuples[:i]:
-                            prev_checkpoint_data = prev_cp_tuple.checkpoint
-                            prev_channel_values = prev_checkpoint_data.get(
-                                "channel_values", {}
-                            )
-                            prev_messages = prev_channel_values.get("messages", [])
-                            for msg in reversed(prev_messages):
-                                if isinstance(msg, HumanMessage):
-                                    user_query_to_use = msg.content
+        # We want to find the checkpoint just before the last HumanMessage.
+        # We enter this branch when:
+        #   * the client did NOT pin ``from_message_id`` (legacy reload/edit), OR
+        #   * the client pinned ``from_message_id`` but the row is a
+        #     legacy pre-migration-136 row with no ``turn_id`` (we
+        #     downgraded to the same heuristic as a regular reload).
+        # We DO skip it when a real turn_id pinned ``target_checkpoint_id``
+        # — that's the C1 happy path and the heuristic below would just
+        # re-derive a worse target.
+        if request.from_message_id is None or legacy_from_message:
+            for i, cp_tuple in enumerate(checkpoint_tuples):
+                # Access the checkpoint's channel_values which contains "messages"
+                checkpoint_data = cp_tuple.checkpoint
+                channel_values = checkpoint_data.get("channel_values", {})
+                state_messages = channel_values.get("messages", [])
+
+                if state_messages:
+                    last_msg = state_messages[-1]
+                    # Find a checkpoint where the last message is NOT a HumanMessage
+                    # This means we're at a state before the user's last message
+                    if not isinstance(last_msg, HumanMessage):
+                        # If no new user_query provided (reload), extract from a later checkpoint
+                        if user_query_to_use is None and i > 0:
+                            # Get the user query from a more recent checkpoint
+                            for prev_cp_tuple in checkpoint_tuples[:i]:
+                                prev_checkpoint_data = prev_cp_tuple.checkpoint
+                                prev_channel_values = prev_checkpoint_data.get(
+                                    "channel_values", {}
+                                )
+                                prev_messages = prev_channel_values.get("messages", [])
+                                for msg in reversed(prev_messages):
+                                    if isinstance(msg, HumanMessage):
+                                        q, imgs = split_langchain_human_content(
+                                            msg.content
+                                        )
+                                        user_query_to_use = q
+                                        regenerate_image_urls = imgs
+                                        break
+                                if user_query_to_use is not None and (
+                                    str(user_query_to_use).strip()
+                                    or regenerate_image_urls
+                                ):
                                     break
-                            if user_query_to_use:
-                                break
 
-                    target_checkpoint_id = cp_tuple.config["configurable"][
-                        "checkpoint_id"
-                    ]
-                    break
+                        target_checkpoint_id = cp_tuple.config["configurable"][
+                            "checkpoint_id"
+                        ]
+                        break
 
         # If we couldn't find a good checkpoint, try alternative approaches
         if target_checkpoint_id is None and checkpoint_tuples:
@@ -1283,7 +2116,9 @@ async def regenerate_response(
                     state_messages = channel_values.get("messages", [])
                     for msg in state_messages:
                         if isinstance(msg, HumanMessage):
-                            user_query_to_use = msg.content
+                            q, imgs = split_langchain_human_content(msg.content)
+                            user_query_to_use = q
+                            regenerate_image_urls = imgs
                             break
             else:
                 # Use the oldest checkpoint
@@ -1309,33 +2144,74 @@ async def regenerate_response(
                 if isinstance(content, str):
                     user_query_to_use = content
                 elif isinstance(content, list):
-                    # Extract text from content parts
-                    for part in content:
-                        if isinstance(part, dict) and part.get("type") == "text":
-                            user_query_to_use = part.get("text", "")
-                            break
-                        elif isinstance(part, str):
-                            user_query_to_use = part
-                            break
+                    plain, imgs = split_persisted_user_content_parts(content)
+                    user_query_to_use = plain
+                    regenerate_image_urls = imgs
+
+        if isinstance(user_query_to_use, list):
+            user_query_to_use, regenerate_image_urls = split_langchain_human_content(
+                user_query_to_use
+            )
+
+        if request.user_images is not None:
+            regenerate_image_urls = [p.as_data_url() for p in request.user_images]
 
         if user_query_to_use is None:
             raise HTTPException(
                 status_code=400,
                 detail="Could not determine user query for regeneration. Please provide a user_query.",
             )
+        if not str(user_query_to_use).strip() and not regenerate_image_urls:
+            raise HTTPException(
+                status_code=400,
+                detail="Could not determine user query for regeneration. Please provide a user_query.",
+            )
 
-        # Get the last two messages to delete AFTER streaming succeeds
-        # This prevents data loss if streaming fails
-        last_messages_result = await session.execute(
-            select(NewChatMessage)
-            .filter(NewChatMessage.thread_id == thread_id)
-            .order_by(NewChatMessage.created_at.desc())
-            .limit(2)
-        )
+        # Get the messages to delete AFTER streaming succeeds.
+        # This prevents data loss if streaming fails.
+        #
+        # When ``from_message_id`` is set we slice from that message
+        # forward (using ``created_at`` so we also catch any tool/system
+        # messages persisted into the same turn). Otherwise
+        # we keep the legacy "last 2 messages" rewind.
+        if request.from_message_id is not None and from_message_created_at is not None:
+            last_messages_result = await session.execute(
+                select(NewChatMessage)
+                .filter(
+                    NewChatMessage.thread_id == thread_id,
+                    NewChatMessage.created_at >= from_message_created_at,
+                )
+                .order_by(NewChatMessage.created_at.desc())
+            )
+        else:
+            last_messages_result = await session.execute(
+                select(NewChatMessage)
+                .filter(NewChatMessage.thread_id == thread_id)
+                .order_by(NewChatMessage.created_at.desc())
+                .limit(2)
+            )
         messages_to_delete = list(last_messages_result.scalars().all())
 
         message_ids_to_delete = [msg.id for msg in messages_to_delete]
 
+        # When revert_actions is requested, collect the set of
+        # ``chat_turn_id``s present in the slice we're about to delete.
+        # Each one will be reverted (best-effort) BEFORE the regenerate
+        # stream begins. Legacy rows have ``turn_id=None`` and silently
+        # contribute nothing — we already logged the degradation above.
+        revert_turn_ids: list[str] = []
+        if (
+            request.revert_actions
+            and request.from_message_id is not None
+            and not legacy_from_message
+        ):
+            seen_turns: set[str] = set()
+            for msg in messages_to_delete:
+                tid = msg.turn_id
+                if tid and tid not in seen_turns:
+                    seen_turns.add(tid)
+                    revert_turn_ids.append(tid)
+
         # Get search space for LLM config
         search_space_result = await session.execute(
             select(SearchSpace).filter(SearchSpace.id == request.search_space_id)
@@ -1359,20 +2235,49 @@ async def regenerate_response(
         # This prevents data loss if streaming fails (network error, LLM error, etc.)
         async def stream_with_cleanup():
             streaming_completed = False
+            # Best-effort revert pass BEFORE the regenerate stream begins.
+            # Each turn is reverted independently (per-row SAVEPOINTs
+            # inside the route helper) and the per-action results are surfaced
+            # on a single ``data-revert-results`` SSE event so the frontend
+            # can render any failed rows alongside the new turn. Failures here
+            # do NOT abort the regeneration — partial rollback is documented
+            # behaviour.
+            if revert_turn_ids:
+                revert_results = await _revert_turns_for_regenerate(
+                    thread_id=thread_id,
+                    chat_turn_ids=revert_turn_ids,
+                    requester_user_id=str(user.id),
+                )
+                envelope = {
+                    "type": "data-revert-results",
+                    "data": revert_results,
+                }
+                yield f"data: {json.dumps(envelope, default=str)}\n\n".encode()
+            mentioned_documents_payload = (
+                [doc.model_dump() for doc in request.mentioned_documents]
+                if request.mentioned_documents
+                else None
+            )
             try:
                 async for chunk in stream_new_chat(
-                    user_query=user_query_to_use,
+                    user_query=str(user_query_to_use),
                     search_space_id=request.search_space_id,
                     chat_id=thread_id,
                     user_id=str(user.id),
                     llm_config_id=llm_config_id,
                     mentioned_document_ids=request.mentioned_document_ids,
                     mentioned_surfsense_doc_ids=request.mentioned_surfsense_doc_ids,
+                    mentioned_folder_ids=request.mentioned_folder_ids,
+                    mentioned_documents=mentioned_documents_payload,
                     checkpoint_id=target_checkpoint_id,
                     needs_history_bootstrap=thread.needs_history_bootstrap,
                     thread_visibility=thread.visibility,
                     current_user_display_name=user.display_name or "A team member",
                     disabled_tools=request.disabled_tools,
+                    filesystem_selection=filesystem_selection,
+                    request_id=getattr(http_request.state, "request_id", "unknown"),
+                    user_image_data_urls=regenerate_image_urls or None,
+                    flow="regenerate",
                 ):
                     yield chunk
                 streaming_completed = True
@@ -1438,6 +2343,7 @@ async def stream_with_cleanup():
 async def resume_chat(
     thread_id: int,
     request: ResumeRequest,
+    http_request: Request,
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
@@ -1459,6 +2365,12 @@ async def resume_chat(
         )
 
         await check_thread_access(session, thread, user)
+        _raise_if_thread_busy_for_start(thread_id)
+        filesystem_selection = _resolve_filesystem_selection(
+            mode=request.filesystem_mode,
+            client_platform=request.client_platform,
+            local_mounts=request.local_filesystem_mounts,
+        )
 
         search_space_result = await session.execute(
             select(SearchSpace).filter(SearchSpace.id == request.search_space_id)
@@ -1487,6 +2399,9 @@ async def resume_chat(
                 user_id=str(user.id),
                 llm_config_id=llm_config_id,
                 thread_visibility=thread.visibility,
+                filesystem_selection=filesystem_selection,
+                request_id=getattr(http_request.state, "request_id", "unknown"),
+                disabled_tools=request.disabled_tools,
             ),
             media_type="text/event-stream",
             headers={
diff --git a/surfsense_backend/app/routes/new_llm_config_routes.py b/surfsense_backend/app/routes/new_llm_config_routes.py
index 78907c7197..e090a1a7c8 100644
--- a/surfsense_backend/app/routes/new_llm_config_routes.py
+++ b/surfsense_backend/app/routes/new_llm_config_routes.py
@@ -29,6 +29,7 @@
     NewLLMConfigUpdate,
 )
 from app.services.llm_service import validate_llm_config
+from app.services.provider_capabilities import derive_supports_image_input
 from app.users import current_active_user
 from app.utils.rbac import check_permission
 
@@ -36,6 +37,39 @@
 logger = logging.getLogger(__name__)
 
 
+def _serialize_byok_config(config: NewLLMConfig) -> NewLLMConfigRead:
+    """Augment a BYOK chat config row with the derived ``supports_image_input``.
+
+    There is no DB column for ``supports_image_input`` — the value is
+    resolved at the API boundary from LiteLLM's authoritative model map
+    (default-allow on unknown). Returning ``NewLLMConfigRead`` here keeps
+    the response shape consistent across list / detail / create / update
+    endpoints without having to remember to set the field at every call
+    site.
+    """
+    provider_value = (
+        config.provider.value
+        if hasattr(config.provider, "value")
+        else str(config.provider)
+    )
+    litellm_params = config.litellm_params or {}
+    base_model = (
+        litellm_params.get("base_model") if isinstance(litellm_params, dict) else None
+    )
+    supports_image_input = derive_supports_image_input(
+        provider=provider_value,
+        model_name=config.model_name,
+        base_model=base_model,
+        custom_provider=config.custom_provider,
+    )
+    # ``model_validate`` runs the Pydantic conversion using the ORM
+    # attribute access path enabled by ``ConfigDict(from_attributes=True)``,
+    # then we layer the derived field on. ``model_copy(update=...)`` keeps
+    # the surface immutable from the caller's perspective.
+    base_read = NewLLMConfigRead.model_validate(config)
+    return base_read.model_copy(update={"supports_image_input": supports_image_input})
+
+
 # =============================================================================
 # Global Configs Routes
 # =============================================================================
@@ -76,11 +110,49 @@ async def get_global_new_llm_configs(
                     "citations_enabled": True,
                     "is_global": True,
                     "is_auto_mode": True,
+                    "billing_tier": "free",
+                    "is_premium": False,
+                    "anonymous_enabled": False,
+                    "seo_enabled": False,
+                    "seo_slug": None,
+                    "seo_title": None,
+                    "seo_description": None,
+                    "quota_reserve_tokens": None,
+                    # Auto routes across the configured pool, which usually
+                    # includes at least one vision-capable deployment, so
+                    # treat Auto as image-capable. The router itself will
+                    # still pick a vision-capable deployment for messages
+                    # carrying image_url blocks (LiteLLM Router falls back
+                    # on ``404`` per its ``allowed_fails`` policy).
+                    "supports_image_input": True,
                 }
             )
 
         # Add individual global configs
         for cfg in global_configs:
+            # Capability resolution: explicit value (YAML override or OR
+            # `_supports_image_input(model)` payload baked in by the
+            # OpenRouter integration service) wins. Fall back to the
+            # LiteLLM-driven helper which default-allows on unknown so
+            # we don't hide vision-capable models that happen to lack a
+            # YAML annotation. The streaming task safety net is the
+            # only place a False ever blocks.
+            if "supports_image_input" in cfg:
+                supports_image_input = bool(cfg.get("supports_image_input"))
+            else:
+                cfg_litellm_params = cfg.get("litellm_params") or {}
+                cfg_base_model = (
+                    cfg_litellm_params.get("base_model")
+                    if isinstance(cfg_litellm_params, dict)
+                    else None
+                )
+                supports_image_input = derive_supports_image_input(
+                    provider=cfg.get("provider"),
+                    model_name=cfg.get("model_name"),
+                    base_model=cfg_base_model,
+                    custom_provider=cfg.get("custom_provider"),
+                )
+
             safe_config = {
                 "id": cfg.get("id"),
                 "name": cfg.get("name"),
@@ -97,6 +169,15 @@ async def get_global_new_llm_configs(
                 ),
                 "citations_enabled": cfg.get("citations_enabled", True),
                 "is_global": True,
+                "billing_tier": cfg.get("billing_tier", "free"),
+                "is_premium": cfg.get("billing_tier", "free") == "premium",
+                "anonymous_enabled": cfg.get("anonymous_enabled", False),
+                "seo_enabled": cfg.get("seo_enabled", False),
+                "seo_slug": cfg.get("seo_slug"),
+                "seo_title": cfg.get("seo_title"),
+                "seo_description": cfg.get("seo_description"),
+                "quota_reserve_tokens": cfg.get("quota_reserve_tokens"),
+                "supports_image_input": supports_image_input,
             }
             safe_configs.append(safe_config)
 
@@ -155,7 +236,7 @@ async def create_new_llm_config(
         await session.commit()
         await session.refresh(db_config)
 
-        return db_config
+        return _serialize_byok_config(db_config)
 
     except HTTPException:
         raise
@@ -197,7 +278,7 @@ async def list_new_llm_configs(
             .limit(limit)
         )
 
-        return result.scalars().all()
+        return [_serialize_byok_config(cfg) for cfg in result.scalars().all()]
 
     except HTTPException:
         raise
@@ -252,7 +333,7 @@ async def get_new_llm_config(
             "You don't have permission to view LLM configurations in this search space",
         )
 
-        return config
+        return _serialize_byok_config(config)
 
     except HTTPException:
         raise
@@ -344,7 +425,7 @@ async def update_new_llm_config(
         await session.commit()
         await session.refresh(config)
 
-        return config
+        return _serialize_byok_config(config)
 
     except HTTPException:
         raise
diff --git a/surfsense_backend/app/routes/oauth_connector_base.py b/surfsense_backend/app/routes/oauth_connector_base.py
new file mode 100644
index 0000000000..5b75d85198
--- /dev/null
+++ b/surfsense_backend/app/routes/oauth_connector_base.py
@@ -0,0 +1,623 @@
+"""Reusable base for OAuth 2.0 connector routes.
+
+Subclasses override ``fetch_account_info``, ``build_connector_config``,
+and ``get_connector_display_name`` to customise provider-specific behaviour.
+Call ``build_router()`` to get a FastAPI ``APIRouter`` with ``/connector/add``,
+``/connector/callback``, and ``/connector/reauth`` endpoints.
+"""
+
+from __future__ import annotations
+
+import base64
+import contextlib
+import logging
+from datetime import UTC, datetime, timedelta
+from typing import Any
+from urllib.parse import urlencode
+from uuid import UUID
+
+import httpx
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import RedirectResponse
+from sqlalchemy import select
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm.attributes import flag_modified
+
+from app.config import config
+from app.db import (
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    User,
+    get_async_session,
+)
+from app.users import current_active_user
+from app.utils.connector_naming import (
+    check_duplicate_connector,
+    generate_unique_connector_name,
+)
+from app.utils.oauth_security import OAuthStateManager, TokenEncryption
+
+logger = logging.getLogger(__name__)
+
+
+class OAuthConnectorRoute:
+    def __init__(
+        self,
+        *,
+        provider_name: str,
+        connector_type: SearchSourceConnectorType,
+        authorize_url: str,
+        token_url: str,
+        client_id_env: str,
+        client_secret_env: str,
+        redirect_uri_env: str,
+        scopes: list[str],
+        auth_prefix: str,
+        use_pkce: bool = False,
+        token_auth_method: str = "body",
+        is_indexable: bool = True,
+        extra_auth_params: dict[str, str] | None = None,
+    ) -> None:
+        self.provider_name = provider_name
+        self.connector_type = connector_type
+        self.authorize_url = authorize_url
+        self.token_url = token_url
+        self.client_id_env = client_id_env
+        self.client_secret_env = client_secret_env
+        self.redirect_uri_env = redirect_uri_env
+        self.scopes = scopes
+        self.auth_prefix = auth_prefix.rstrip("/")
+        self.use_pkce = use_pkce
+        self.token_auth_method = token_auth_method
+        self.is_indexable = is_indexable
+        self.extra_auth_params = extra_auth_params or {}
+
+        self._state_manager: OAuthStateManager | None = None
+        self._token_encryption: TokenEncryption | None = None
+
+    def _get_client_id(self) -> str:
+        value = getattr(config, self.client_id_env, None)
+        if not value:
+            raise HTTPException(
+                status_code=500,
+                detail=f"{self.provider_name.title()} OAuth not configured "
+                f"({self.client_id_env} missing).",
+            )
+        return value
+
+    def _get_client_secret(self) -> str:
+        value = getattr(config, self.client_secret_env, None)
+        if not value:
+            raise HTTPException(
+                status_code=500,
+                detail=f"{self.provider_name.title()} OAuth not configured "
+                f"({self.client_secret_env} missing).",
+            )
+        return value
+
+    def _get_redirect_uri(self) -> str:
+        value = getattr(config, self.redirect_uri_env, None)
+        if not value:
+            raise HTTPException(
+                status_code=500,
+                detail=f"{self.redirect_uri_env} not configured.",
+            )
+        return value
+
+    def _get_state_manager(self) -> OAuthStateManager:
+        if self._state_manager is None:
+            if not config.SECRET_KEY:
+                raise HTTPException(
+                    status_code=500,
+                    detail="SECRET_KEY not configured for OAuth security.",
+                )
+            self._state_manager = OAuthStateManager(config.SECRET_KEY)
+        return self._state_manager
+
+    def _get_token_encryption(self) -> TokenEncryption:
+        if self._token_encryption is None:
+            if not config.SECRET_KEY:
+                raise HTTPException(
+                    status_code=500,
+                    detail="SECRET_KEY not configured for token encryption.",
+                )
+            self._token_encryption = TokenEncryption(config.SECRET_KEY)
+        return self._token_encryption
+
+    def _frontend_redirect(
+        self,
+        space_id: int | None,
+        *,
+        success: bool = False,
+        connector_id: int | None = None,
+        error: str | None = None,
+    ) -> RedirectResponse:
+        if success and space_id:
+            connector_slug = f"{self.provider_name}-connector"
+            qs = f"success=true&connector={connector_slug}"
+            if connector_id:
+                qs += f"&connectorId={connector_id}"
+            return RedirectResponse(
+                url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/connectors/callback?{qs}"
+            )
+        if error and space_id:
+            return RedirectResponse(
+                url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/connectors/callback?error={error}"
+            )
+        if error:
+            return RedirectResponse(
+                url=f"{config.NEXT_FRONTEND_URL}/dashboard?error={error}"
+            )
+        return RedirectResponse(url=f"{config.NEXT_FRONTEND_URL}/dashboard")
+
+    async def fetch_account_info(self, access_token: str) -> dict[str, Any]:
+        """Override to fetch account/workspace info after token exchange.
+
+        Return dict is merged into connector config; key ``"name"`` is used
+        for the display name and dedup.
+        """
+        return {}
+
+    def build_connector_config(
+        self,
+        token_json: dict[str, Any],
+        account_info: dict[str, Any],
+        encryption: TokenEncryption,
+    ) -> dict[str, Any]:
+        """Override for custom config shapes. Default: standard encrypted OAuth fields."""
+        access_token = token_json.get("access_token", "")
+        refresh_token = token_json.get("refresh_token")
+
+        expires_at = None
+        if token_json.get("expires_in"):
+            expires_at = datetime.now(UTC) + timedelta(
+                seconds=int(token_json["expires_in"])
+            )
+
+        cfg: dict[str, Any] = {
+            "access_token": encryption.encrypt_token(access_token),
+            "refresh_token": (
+                encryption.encrypt_token(refresh_token) if refresh_token else None
+            ),
+            "token_type": token_json.get("token_type", "Bearer"),
+            "expires_in": token_json.get("expires_in"),
+            "expires_at": expires_at.isoformat() if expires_at else None,
+            "scope": token_json.get("scope"),
+            "_token_encrypted": True,
+        }
+        cfg.update(account_info)
+        return cfg
+
+    def get_connector_display_name(self, account_info: dict[str, Any]) -> str:
+        return str(account_info.get("name", self.provider_name.title()))
+
+    async def on_token_refresh_failure(
+        self,
+        session: AsyncSession,
+        connector: SearchSourceConnector,
+    ) -> None:
+        try:
+            connector.config = {**connector.config, "auth_expired": True}
+            flag_modified(connector, "config")
+            await session.commit()
+            await session.refresh(connector)
+        except Exception:
+            logger.warning(
+                "Failed to persist auth_expired flag for connector %s",
+                connector.id,
+                exc_info=True,
+            )
+
+    async def _exchange_code(
+        self, code: str, extra_state: dict[str, Any]
+    ) -> dict[str, Any]:
+        client_id = self._get_client_id()
+        client_secret = self._get_client_secret()
+        redirect_uri = self._get_redirect_uri()
+
+        headers: dict[str, str] = {
+            "Content-Type": "application/x-www-form-urlencoded",
+        }
+        body: dict[str, str] = {
+            "grant_type": "authorization_code",
+            "code": code,
+            "redirect_uri": redirect_uri,
+        }
+
+        if self.token_auth_method == "basic":
+            creds = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()
+            headers["Authorization"] = f"Basic {creds}"
+        else:
+            body["client_id"] = client_id
+            body["client_secret"] = client_secret
+
+        if self.use_pkce:
+            verifier = extra_state.get("code_verifier")
+            if verifier:
+                body["code_verifier"] = verifier
+
+        async with httpx.AsyncClient() as client:
+            resp = await client.post(
+                self.token_url, data=body, headers=headers, timeout=30.0
+            )
+
+        if resp.status_code != 200:
+            detail = resp.text
+            with contextlib.suppress(Exception):
+                detail = resp.json().get("error_description", detail)
+            raise HTTPException(
+                status_code=400, detail=f"Token exchange failed: {detail}"
+            )
+
+        return resp.json()
+
+    async def refresh_token(
+        self, session: AsyncSession, connector: SearchSourceConnector
+    ) -> SearchSourceConnector:
+        encryption = self._get_token_encryption()
+        is_encrypted = connector.config.get("_token_encrypted", False)
+
+        refresh_tok = connector.config.get("refresh_token")
+        if is_encrypted and refresh_tok:
+            try:
+                refresh_tok = encryption.decrypt_token(refresh_tok)
+            except Exception as e:
+                logger.error("Failed to decrypt refresh token: %s", e)
+                raise HTTPException(
+                    status_code=500, detail="Failed to decrypt stored refresh token"
+                ) from e
+
+        if not refresh_tok:
+            await self.on_token_refresh_failure(session, connector)
+            raise HTTPException(
+                status_code=400,
+                detail="No refresh token available. Please re-authenticate.",
+            )
+
+        client_id = self._get_client_id()
+        client_secret = self._get_client_secret()
+
+        headers: dict[str, str] = {
+            "Content-Type": "application/x-www-form-urlencoded",
+        }
+        body: dict[str, str] = {
+            "grant_type": "refresh_token",
+            "refresh_token": refresh_tok,
+        }
+
+        if self.token_auth_method == "basic":
+            creds = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()
+            headers["Authorization"] = f"Basic {creds}"
+        else:
+            body["client_id"] = client_id
+            body["client_secret"] = client_secret
+
+        async with httpx.AsyncClient() as client:
+            resp = await client.post(
+                self.token_url, data=body, headers=headers, timeout=30.0
+            )
+
+        if resp.status_code != 200:
+            error_detail = resp.text
+            try:
+                ej = resp.json()
+                error_detail = ej.get("error_description", error_detail)
+                error_code = ej.get("error", "")
+            except Exception:
+                error_code = ""
+            combined = (error_detail + error_code).lower()
+            if any(kw in combined for kw in ("invalid_grant", "expired", "revoked")):
+                await self.on_token_refresh_failure(session, connector)
+                raise HTTPException(
+                    status_code=401,
+                    detail=f"{self.provider_name.title()} authentication failed. "
+                    "Please re-authenticate.",
+                )
+            raise HTTPException(
+                status_code=400, detail=f"Token refresh failed: {error_detail}"
+            )
+
+        token_json = resp.json()
+        new_access = token_json.get("access_token")
+        if not new_access:
+            raise HTTPException(
+                status_code=400, detail="No access token received from refresh"
+            )
+
+        expires_at = None
+        if token_json.get("expires_in"):
+            expires_at = datetime.now(UTC) + timedelta(
+                seconds=int(token_json["expires_in"])
+            )
+
+        updated_config = dict(connector.config)
+        updated_config["access_token"] = encryption.encrypt_token(new_access)
+        new_refresh = token_json.get("refresh_token")
+        if new_refresh:
+            updated_config["refresh_token"] = encryption.encrypt_token(new_refresh)
+        updated_config["expires_in"] = token_json.get("expires_in")
+        updated_config["expires_at"] = expires_at.isoformat() if expires_at else None
+        updated_config["scope"] = token_json.get("scope", updated_config.get("scope"))
+        updated_config["_token_encrypted"] = True
+        updated_config.pop("auth_expired", None)
+
+        connector.config = updated_config
+        flag_modified(connector, "config")
+        await session.commit()
+        await session.refresh(connector)
+
+        logger.info(
+            "Refreshed %s token for connector %s",
+            self.provider_name,
+            connector.id,
+        )
+        return connector
+
+    def build_router(self) -> APIRouter:
+        router = APIRouter()
+        oauth = self
+
+        @router.get(f"{oauth.auth_prefix}/connector/add")
+        async def connect(
+            space_id: int,
+            user: User = Depends(current_active_user),
+        ):
+            if not space_id:
+                raise HTTPException(status_code=400, detail="space_id is required")
+
+            client_id = oauth._get_client_id()
+            state_mgr = oauth._get_state_manager()
+
+            extra_state: dict[str, Any] = {}
+            auth_params: dict[str, str] = {
+                "client_id": client_id,
+                "response_type": "code",
+                "redirect_uri": oauth._get_redirect_uri(),
+                "scope": " ".join(oauth.scopes),
+            }
+
+            if oauth.use_pkce:
+                from app.utils.oauth_security import generate_pkce_pair
+
+                verifier, challenge = generate_pkce_pair()
+                extra_state["code_verifier"] = verifier
+                auth_params["code_challenge"] = challenge
+                auth_params["code_challenge_method"] = "S256"
+
+            auth_params.update(oauth.extra_auth_params)
+
+            state_encoded = state_mgr.generate_secure_state(
+                space_id, user.id, **extra_state
+            )
+            auth_params["state"] = state_encoded
+            auth_url = f"{oauth.authorize_url}?{urlencode(auth_params)}"
+
+            logger.info(
+                "Generated %s OAuth URL for user %s, space %s",
+                oauth.provider_name,
+                user.id,
+                space_id,
+            )
+            return {"auth_url": auth_url}
+
+        @router.get(f"{oauth.auth_prefix}/connector/reauth")
+        async def reauth(
+            space_id: int,
+            connector_id: int,
+            return_url: str | None = None,
+            user: User = Depends(current_active_user),
+            session: AsyncSession = Depends(get_async_session),
+        ):
+            result = await session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == connector_id,
+                    SearchSourceConnector.user_id == user.id,
+                    SearchSourceConnector.search_space_id == space_id,
+                    SearchSourceConnector.connector_type == oauth.connector_type,
+                )
+            )
+            if not result.scalars().first():
+                raise HTTPException(
+                    status_code=404,
+                    detail=f"{oauth.provider_name.title()} connector not found "
+                    "or access denied",
+                )
+
+            client_id = oauth._get_client_id()
+            state_mgr = oauth._get_state_manager()
+
+            extra: dict[str, Any] = {"connector_id": connector_id}
+            if (
+                return_url
+                and return_url.startswith("/")
+                and not return_url.startswith("//")
+            ):
+                extra["return_url"] = return_url
+
+            auth_params: dict[str, str] = {
+                "client_id": client_id,
+                "response_type": "code",
+                "redirect_uri": oauth._get_redirect_uri(),
+                "scope": " ".join(oauth.scopes),
+            }
+
+            if oauth.use_pkce:
+                from app.utils.oauth_security import generate_pkce_pair
+
+                verifier, challenge = generate_pkce_pair()
+                extra["code_verifier"] = verifier
+                auth_params["code_challenge"] = challenge
+                auth_params["code_challenge_method"] = "S256"
+
+            auth_params.update(oauth.extra_auth_params)
+
+            state_encoded = state_mgr.generate_secure_state(space_id, user.id, **extra)
+            auth_params["state"] = state_encoded
+            auth_url = f"{oauth.authorize_url}?{urlencode(auth_params)}"
+
+            logger.info(
+                "Initiating %s re-auth for user %s, connector %s",
+                oauth.provider_name,
+                user.id,
+                connector_id,
+            )
+            return {"auth_url": auth_url}
+
+        @router.get(f"{oauth.auth_prefix}/connector/callback")
+        async def callback(
+            code: str | None = None,
+            error: str | None = None,
+            state: str | None = None,
+            session: AsyncSession = Depends(get_async_session),
+        ):
+            error_label = f"{oauth.provider_name}_oauth_denied"
+
+            if error:
+                logger.warning("%s OAuth error: %s", oauth.provider_name, error)
+                space_id = None
+                if state:
+                    try:
+                        data = oauth._get_state_manager().validate_state(state)
+                        space_id = data.get("space_id")
+                    except Exception:
+                        pass
+                return oauth._frontend_redirect(space_id, error=error_label)
+
+            if not code:
+                raise HTTPException(
+                    status_code=400, detail="Missing authorization code"
+                )
+            if not state:
+                raise HTTPException(status_code=400, detail="Missing state parameter")
+
+            state_mgr = oauth._get_state_manager()
+            try:
+                data = state_mgr.validate_state(state)
+            except Exception as e:
+                raise HTTPException(
+                    status_code=400, detail="Invalid or expired state parameter."
+                ) from e
+
+            user_id = UUID(data["user_id"])
+            space_id = data["space_id"]
+
+            token_json = await oauth._exchange_code(code, data)
+
+            access_token = token_json.get("access_token", "")
+            if not access_token:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"No access token received from {oauth.provider_name.title()}",
+                )
+
+            account_info = await oauth.fetch_account_info(access_token)
+            encryption = oauth._get_token_encryption()
+            connector_config = oauth.build_connector_config(
+                token_json, account_info, encryption
+            )
+
+            display_name = oauth.get_connector_display_name(account_info)
+
+            # --- Re-auth path ---
+            reauth_connector_id = data.get("connector_id")
+            reauth_return_url = data.get("return_url")
+
+            if reauth_connector_id:
+                result = await session.execute(
+                    select(SearchSourceConnector).filter(
+                        SearchSourceConnector.id == reauth_connector_id,
+                        SearchSourceConnector.user_id == user_id,
+                        SearchSourceConnector.search_space_id == space_id,
+                        SearchSourceConnector.connector_type == oauth.connector_type,
+                    )
+                )
+                db_connector = result.scalars().first()
+                if not db_connector:
+                    raise HTTPException(
+                        status_code=404,
+                        detail="Connector not found or access denied during re-auth",
+                    )
+
+                db_connector.config = connector_config
+                flag_modified(db_connector, "config")
+                await session.commit()
+                await session.refresh(db_connector)
+
+                logger.info(
+                    "Re-authenticated %s connector %s for user %s",
+                    oauth.provider_name,
+                    db_connector.id,
+                    user_id,
+                )
+                if (
+                    reauth_return_url
+                    and reauth_return_url.startswith("/")
+                    and not reauth_return_url.startswith("//")
+                ):
+                    return RedirectResponse(
+                        url=f"{config.NEXT_FRONTEND_URL}{reauth_return_url}"
+                    )
+                return oauth._frontend_redirect(
+                    space_id, success=True, connector_id=db_connector.id
+                )
+
+            # --- New connector path ---
+            is_dup = await check_duplicate_connector(
+                session,
+                oauth.connector_type,
+                space_id,
+                user_id,
+                display_name,
+            )
+            if is_dup:
+                logger.warning(
+                    "Duplicate %s connector for user %s (%s)",
+                    oauth.provider_name,
+                    user_id,
+                    display_name,
+                )
+                return oauth._frontend_redirect(
+                    space_id,
+                    error=f"duplicate_account&connector={oauth.provider_name}-connector",
+                )
+
+            connector_name = await generate_unique_connector_name(
+                session,
+                oauth.connector_type,
+                space_id,
+                user_id,
+                display_name,
+            )
+
+            new_connector = SearchSourceConnector(
+                name=connector_name,
+                connector_type=oauth.connector_type,
+                is_indexable=oauth.is_indexable,
+                config=connector_config,
+                search_space_id=space_id,
+                user_id=user_id,
+            )
+            session.add(new_connector)
+
+            try:
+                await session.commit()
+            except IntegrityError as e:
+                await session.rollback()
+                raise HTTPException(
+                    status_code=409,
+                    detail="A connector for this service already exists.",
+                ) from e
+
+            logger.info(
+                "Created %s connector %s for user %s in space %s",
+                oauth.provider_name,
+                new_connector.id,
+                user_id,
+                space_id,
+            )
+            return oauth._frontend_redirect(
+                space_id, success=True, connector_id=new_connector.id
+            )
+
+        return router
diff --git a/surfsense_backend/app/routes/obsidian_plugin_routes.py b/surfsense_backend/app/routes/obsidian_plugin_routes.py
new file mode 100644
index 0000000000..0dae7a463c
--- /dev/null
+++ b/surfsense_backend/app/routes/obsidian_plugin_routes.py
@@ -0,0 +1,706 @@
+"""Obsidian plugin ingestion routes (``/api/v1/obsidian/*``).
+
+Wire surface for the ``surfsense_obsidian/`` plugin. Versioning anchor is
+the ``/api/v1/`` URL prefix; additive feature detection rides the
+``capabilities`` array on /health and /connect.
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+
+from fastapi import APIRouter, Depends, HTTPException, Query, status
+from sqlalchemy import and_, case, func
+from sqlalchemy.dialects.postgresql import insert as pg_insert
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import (
+    Document,
+    DocumentType,
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    SearchSpace,
+    User,
+    get_async_session,
+)
+from app.schemas.obsidian_plugin import (
+    ALLOWED_ATTACHMENT_EXTENSIONS,
+    ATTACHMENT_MIME_TYPES,
+    ConnectRequest,
+    ConnectResponse,
+    DeleteAck,
+    DeleteAckItem,
+    DeleteBatchRequest,
+    HealthResponse,
+    ManifestResponse,
+    RenameAck,
+    RenameAckItem,
+    RenameBatchRequest,
+    StatsResponse,
+    SyncAck,
+    SyncAckItem,
+    SyncBatchRequest,
+)
+from app.services.notification_service import NotificationService
+from app.services.obsidian_plugin_indexer import (
+    delete_note,
+    get_manifest,
+    merge_obsidian_connectors,
+    rename_note,
+    upsert_note,
+)
+from app.tasks.celery_tasks.obsidian_tasks import index_obsidian_attachment_task
+from app.users import current_active_user
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/obsidian", tags=["obsidian-plugin"])
+
+
+# Plugins feature-gate on these. Add entries, never rename or remove.
+OBSIDIAN_CAPABILITIES: list[str] = ["sync", "rename", "delete", "manifest", "stats"]
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _build_handshake() -> dict[str, object]:
+    return {"capabilities": list(OBSIDIAN_CAPABILITIES)}
+
+
+def _connector_type_value(connector: SearchSourceConnector) -> str:
+    connector_type = connector.connector_type
+    if hasattr(connector_type, "value"):
+        return str(connector_type.value)
+    return str(connector_type)
+
+
+async def _start_obsidian_sync_notification(
+    session: AsyncSession,
+    *,
+    user: User,
+    connector: SearchSourceConnector,
+    total_count: int,
+):
+    """Create/update the rolling inbox item for Obsidian plugin sync.
+
+    Obsidian sync is continuous and batched, so we keep one stable
+    operation_id per connector instead of creating a new notification per batch.
+    """
+    handler = NotificationService.connector_indexing
+    operation_id = f"obsidian_sync_connector_{connector.id}"
+    connector_name = connector.name or "Obsidian"
+    notification = await handler.find_or_create_notification(
+        session=session,
+        user_id=user.id,
+        operation_id=operation_id,
+        title=f"Syncing: {connector_name}",
+        message="Syncing from Obsidian plugin",
+        search_space_id=connector.search_space_id,
+        initial_metadata={
+            "connector_id": connector.id,
+            "connector_name": connector_name,
+            "connector_type": _connector_type_value(connector),
+            "sync_stage": "processing",
+            "indexed_count": 0,
+            "failed_count": 0,
+            "total_count": total_count,
+            "source": "obsidian_plugin",
+        },
+    )
+    return await handler.update_notification(
+        session=session,
+        notification=notification,
+        status="in_progress",
+        metadata_updates={
+            "sync_stage": "processing",
+            "total_count": total_count,
+        },
+    )
+
+
+async def _finish_obsidian_sync_notification(
+    session: AsyncSession,
+    *,
+    notification,
+    indexed: int,
+    failed: int,
+):
+    """Mark the rolling Obsidian sync inbox item complete or failed."""
+    handler = NotificationService.connector_indexing
+    connector_name = notification.notification_metadata.get(
+        "connector_name", "Obsidian"
+    )
+    if failed > 0 and indexed == 0:
+        title = f"Failed: {connector_name}"
+        message = (
+            f"Sync failed: {failed} file(s) failed"
+            if failed > 1
+            else "Sync failed: 1 file failed"
+        )
+        status_value = "failed"
+        stage = "failed"
+    else:
+        title = f"Ready: {connector_name}"
+        if failed > 0:
+            message = f"Partially synced: {indexed} file(s) synced, {failed} failed."
+        elif indexed == 0:
+            message = "Already up to date!"
+        elif indexed == 1:
+            message = "Now searchable! 1 file synced."
+        else:
+            message = f"Now searchable! {indexed} files synced."
+        status_value = "completed"
+        stage = "completed"
+
+    await handler.update_notification(
+        session=session,
+        notification=notification,
+        title=title,
+        message=message,
+        status=status_value,
+        metadata_updates={
+            "indexed_count": indexed,
+            "failed_count": failed,
+            "sync_stage": stage,
+        },
+    )
+
+
+async def _resolve_vault_connector(
+    session: AsyncSession,
+    *,
+    user: User,
+    vault_id: str,
+) -> SearchSourceConnector:
+    """Find the OBSIDIAN_CONNECTOR row that owns ``vault_id`` for this user."""
+    # ``config`` is core ``JSON`` (not ``JSONB``); ``as_string()`` is the
+    # cross-dialect equivalent of ``.astext`` and compiles to ``->>``.
+    stmt = select(SearchSourceConnector).where(
+        and_(
+            SearchSourceConnector.user_id == user.id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+            SearchSourceConnector.config["vault_id"].as_string() == vault_id,
+            SearchSourceConnector.config["source"].as_string() == "plugin",
+        )
+    )
+
+    connector = (await session.execute(stmt)).scalars().first()
+    if connector is not None:
+        return connector
+
+    raise HTTPException(
+        status_code=status.HTTP_404_NOT_FOUND,
+        detail={
+            "code": "VAULT_NOT_REGISTERED",
+            "message": (
+                "No Obsidian plugin connector found for this vault. "
+                "Call POST /obsidian/connect first."
+            ),
+            "vault_id": vault_id,
+        },
+    )
+
+
+def _queue_obsidian_attachment(
+    *, connector_id: int, note_payload: dict, user_id: str
+) -> None:
+    """Enqueue one non-markdown Obsidian note for background ETL/indexing."""
+    index_obsidian_attachment_task.delay(
+        connector_id=connector_id,
+        payload_data=note_payload,
+        user_id=user_id,
+    )
+
+
+async def _ensure_search_space_access(
+    session: AsyncSession,
+    *,
+    user: User,
+    search_space_id: int,
+) -> SearchSpace:
+    """Owner-only access to the search space (shared spaces are a follow-up)."""
+    result = await session.execute(
+        select(SearchSpace).where(
+            and_(SearchSpace.id == search_space_id, SearchSpace.user_id == user.id)
+        )
+    )
+    space = result.scalars().first()
+    if space is None:
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail={
+                "code": "SEARCH_SPACE_FORBIDDEN",
+                "message": "You don't own that search space.",
+            },
+        )
+    return space
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+
+@router.get("/health", response_model=HealthResponse)
+async def obsidian_health(
+    user: User = Depends(current_active_user),
+) -> HealthResponse:
+    """Return the API contract handshake; plugin caches it per onload."""
+    return HealthResponse(
+        **_build_handshake(),
+        server_time_utc=datetime.now(UTC),
+    )
+
+
+async def _find_by_vault_id(
+    session: AsyncSession, *, user_id, vault_id: str
+) -> SearchSourceConnector | None:
+    stmt = select(SearchSourceConnector).where(
+        and_(
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+            SearchSourceConnector.config["source"].as_string() == "plugin",
+            SearchSourceConnector.config["vault_id"].as_string() == vault_id,
+        )
+    )
+    return (await session.execute(stmt)).scalars().first()
+
+
+async def _find_by_fingerprint(
+    session: AsyncSession, *, user_id, vault_fingerprint: str
+) -> SearchSourceConnector | None:
+    stmt = select(SearchSourceConnector).where(
+        and_(
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+            SearchSourceConnector.config["source"].as_string() == "plugin",
+            SearchSourceConnector.config["vault_fingerprint"].as_string()
+            == vault_fingerprint,
+        )
+    )
+    return (await session.execute(stmt)).scalars().first()
+
+
+def _build_config(payload: ConnectRequest, *, now_iso: str) -> dict[str, object]:
+    return {
+        "vault_id": payload.vault_id,
+        "vault_name": payload.vault_name,
+        "vault_fingerprint": payload.vault_fingerprint,
+        "source": "plugin",
+        "last_connect_at": now_iso,
+    }
+
+
+def _display_name(vault_name: str) -> str:
+    return f"Obsidian - {vault_name}"
+
+
+@router.post("/connect", response_model=ConnectResponse)
+async def obsidian_connect(
+    payload: ConnectRequest,
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+) -> ConnectResponse:
+    """Register a vault, refresh an existing one, or adopt another device's row.
+
+    Resolution order:
+      1. ``(user_id, vault_id)`` → known device, refresh metadata.
+      2. ``(user_id, vault_fingerprint)`` → another device of the same vault,
+         caller adopts the surviving ``vault_id``.
+      3. Insert a new row.
+
+    Fingerprint collisions on (1) trigger ``merge_obsidian_connectors`` so
+    the partial unique index can never produce two live rows for one vault.
+    """
+    await _ensure_search_space_access(
+        session, user=user, search_space_id=payload.search_space_id
+    )
+
+    now_iso = datetime.now(UTC).isoformat()
+    cfg = _build_config(payload, now_iso=now_iso)
+    display_name = _display_name(payload.vault_name)
+
+    existing_by_vid = await _find_by_vault_id(
+        session, user_id=user.id, vault_id=payload.vault_id
+    )
+    if existing_by_vid is not None:
+        collision = await _find_by_fingerprint(
+            session, user_id=user.id, vault_fingerprint=payload.vault_fingerprint
+        )
+        if collision is not None and collision.id != existing_by_vid.id:
+            await merge_obsidian_connectors(
+                session, source=existing_by_vid, target=collision
+            )
+            collision_cfg = dict(collision.config or {})
+            collision_cfg["vault_name"] = payload.vault_name
+            collision_cfg["last_connect_at"] = now_iso
+            collision.config = collision_cfg
+            collision.name = _display_name(payload.vault_name)
+            response = ConnectResponse(
+                connector_id=collision.id,
+                vault_id=collision_cfg["vault_id"],
+                search_space_id=collision.search_space_id,
+                server_time_utc=datetime.now(UTC),
+                **_build_handshake(),
+            )
+            await session.commit()
+            return response
+
+        existing_by_vid.name = display_name
+        existing_by_vid.config = cfg
+        existing_by_vid.search_space_id = payload.search_space_id
+        existing_by_vid.is_indexable = False
+        response = ConnectResponse(
+            connector_id=existing_by_vid.id,
+            vault_id=payload.vault_id,
+            search_space_id=existing_by_vid.search_space_id,
+            server_time_utc=datetime.now(UTC),
+            **_build_handshake(),
+        )
+        await session.commit()
+        return response
+
+    existing_by_fp = await _find_by_fingerprint(
+        session, user_id=user.id, vault_fingerprint=payload.vault_fingerprint
+    )
+    if existing_by_fp is not None:
+        survivor_cfg = dict(existing_by_fp.config or {})
+        survivor_cfg["vault_name"] = payload.vault_name
+        survivor_cfg["last_connect_at"] = now_iso
+        existing_by_fp.config = survivor_cfg
+        existing_by_fp.name = display_name
+        response = ConnectResponse(
+            connector_id=existing_by_fp.id,
+            vault_id=survivor_cfg["vault_id"],
+            search_space_id=existing_by_fp.search_space_id,
+            server_time_utc=datetime.now(UTC),
+            **_build_handshake(),
+        )
+        await session.commit()
+        return response
+
+    # ON CONFLICT DO NOTHING matches any unique index (vault_id OR
+    # fingerprint), so concurrent first-time connects from two devices
+    # of the same vault never raise IntegrityError — the loser just
+    # gets an empty RETURNING and falls through to re-fetch the winner.
+    insert_stmt = (
+        pg_insert(SearchSourceConnector)
+        .values(
+            name=display_name,
+            connector_type=SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+            is_indexable=False,
+            config=cfg,
+            user_id=user.id,
+            search_space_id=payload.search_space_id,
+        )
+        .on_conflict_do_nothing()
+        .returning(
+            SearchSourceConnector.id,
+            SearchSourceConnector.search_space_id,
+        )
+    )
+    inserted = (await session.execute(insert_stmt)).first()
+    if inserted is not None:
+        response = ConnectResponse(
+            connector_id=inserted.id,
+            vault_id=payload.vault_id,
+            search_space_id=inserted.search_space_id,
+            server_time_utc=datetime.now(UTC),
+            **_build_handshake(),
+        )
+        await session.commit()
+        return response
+
+    winner = await _find_by_fingerprint(
+        session, user_id=user.id, vault_fingerprint=payload.vault_fingerprint
+    )
+    if winner is None:
+        winner = await _find_by_vault_id(
+            session, user_id=user.id, vault_id=payload.vault_id
+        )
+    if winner is None:
+        raise HTTPException(
+            status_code=status.HTTP_409_CONFLICT,
+            detail="vault registration conflicted but winning row could not be located",
+        )
+    response = ConnectResponse(
+        connector_id=winner.id,
+        vault_id=(winner.config or {})["vault_id"],
+        search_space_id=winner.search_space_id,
+        server_time_utc=datetime.now(UTC),
+        **_build_handshake(),
+    )
+    await session.commit()
+    return response
+
+
+@router.post("/sync", response_model=SyncAck)
+async def obsidian_sync(
+    payload: SyncBatchRequest,
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+) -> SyncAck:
+    """Batch-upsert notes; returns per-note ack so the plugin can dequeue/retry."""
+    connector = await _resolve_vault_connector(
+        session, user=user, vault_id=payload.vault_id
+    )
+    notification = None
+    try:
+        notification = await _start_obsidian_sync_notification(
+            session, user=user, connector=connector, total_count=len(payload.notes)
+        )
+    except Exception:
+        logger.warning(
+            "obsidian sync notification start failed connector=%s user=%s",
+            connector.id,
+            user.id,
+            exc_info=True,
+        )
+
+    items: list[SyncAckItem] = []
+    indexed = 0
+    failed = 0
+
+    for note in payload.notes:
+        try:
+            if note.is_binary:
+                ext = note.extension.lstrip(".").lower()
+                if ext not in ALLOWED_ATTACHMENT_EXTENSIONS:
+                    failed += 1
+                    items.append(
+                        SyncAckItem(
+                            path=note.path,
+                            status="error",
+                            error=f"unsupported attachment extension: .{ext}",
+                        )
+                    )
+                    continue
+                expected_mime = ATTACHMENT_MIME_TYPES[ext]
+                if note.mime_type != expected_mime:
+                    failed += 1
+                    items.append(
+                        SyncAckItem(
+                            path=note.path,
+                            status="error",
+                            error=(
+                                f"mime_type '{note.mime_type}' does not match "
+                                f"extension .{ext}"
+                            ),
+                        )
+                    )
+                    continue
+                _queue_obsidian_attachment(
+                    connector_id=connector.id,
+                    note_payload=note.model_dump(mode="json"),
+                    user_id=str(user.id),
+                )
+                indexed += 1
+                items.append(SyncAckItem(path=note.path, status="queued"))
+                continue
+
+            doc = await upsert_note(
+                session, connector=connector, payload=note, user_id=str(user.id)
+            )
+            indexed += 1
+            items.append(SyncAckItem(path=note.path, status="ok", document_id=doc.id))
+        except HTTPException:
+            raise
+        except Exception as exc:
+            failed += 1
+            logger.exception(
+                "obsidian /sync failed for path=%s vault=%s",
+                note.path,
+                payload.vault_id,
+            )
+            items.append(
+                SyncAckItem(path=note.path, status="error", error=str(exc)[:300])
+            )
+
+    if notification is not None:
+        try:
+            await _finish_obsidian_sync_notification(
+                session,
+                notification=notification,
+                indexed=indexed,
+                failed=failed,
+            )
+        except Exception:
+            logger.warning(
+                "obsidian sync notification finish failed connector=%s user=%s",
+                connector.id,
+                user.id,
+                exc_info=True,
+            )
+
+    return SyncAck(
+        vault_id=payload.vault_id,
+        indexed=indexed,
+        failed=failed,
+        items=items,
+    )
+
+
+@router.post("/rename", response_model=RenameAck)
+async def obsidian_rename(
+    payload: RenameBatchRequest,
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+) -> RenameAck:
+    """Apply a batch of vault rename events."""
+    connector = await _resolve_vault_connector(
+        session, user=user, vault_id=payload.vault_id
+    )
+
+    items: list[RenameAckItem] = []
+    renamed = 0
+    missing = 0
+
+    for item in payload.renames:
+        try:
+            doc = await rename_note(
+                session,
+                connector=connector,
+                old_path=item.old_path,
+                new_path=item.new_path,
+                vault_id=payload.vault_id,
+            )
+            if doc is None:
+                missing += 1
+                items.append(
+                    RenameAckItem(
+                        old_path=item.old_path,
+                        new_path=item.new_path,
+                        status="missing",
+                    )
+                )
+            else:
+                renamed += 1
+                items.append(
+                    RenameAckItem(
+                        old_path=item.old_path,
+                        new_path=item.new_path,
+                        status="ok",
+                        document_id=doc.id,
+                    )
+                )
+        except Exception as exc:
+            logger.exception(
+                "obsidian /rename failed for old=%s new=%s vault=%s",
+                item.old_path,
+                item.new_path,
+                payload.vault_id,
+            )
+            items.append(
+                RenameAckItem(
+                    old_path=item.old_path,
+                    new_path=item.new_path,
+                    status="error",
+                    error=str(exc)[:300],
+                )
+            )
+
+    return RenameAck(
+        vault_id=payload.vault_id,
+        renamed=renamed,
+        missing=missing,
+        items=items,
+    )
+
+
+@router.delete("/notes", response_model=DeleteAck)
+async def obsidian_delete_notes(
+    payload: DeleteBatchRequest,
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+) -> DeleteAck:
+    """Soft-delete a batch of notes by vault-relative path."""
+    connector = await _resolve_vault_connector(
+        session, user=user, vault_id=payload.vault_id
+    )
+
+    deleted = 0
+    missing = 0
+    items: list[DeleteAckItem] = []
+    for path in payload.paths:
+        try:
+            ok = await delete_note(
+                session,
+                connector=connector,
+                vault_id=payload.vault_id,
+                path=path,
+            )
+            if ok:
+                deleted += 1
+                items.append(DeleteAckItem(path=path, status="ok"))
+            else:
+                missing += 1
+                items.append(DeleteAckItem(path=path, status="missing"))
+        except Exception as exc:
+            logger.exception(
+                "obsidian DELETE /notes failed for path=%s vault=%s",
+                path,
+                payload.vault_id,
+            )
+            items.append(DeleteAckItem(path=path, status="error", error=str(exc)[:300]))
+
+    return DeleteAck(
+        vault_id=payload.vault_id,
+        deleted=deleted,
+        missing=missing,
+        items=items,
+    )
+
+
+@router.get("/manifest", response_model=ManifestResponse)
+async def obsidian_manifest(
+    vault_id: str = Query(..., description="Plugin-side stable vault UUID"),
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+) -> ManifestResponse:
+    """Return ``{path: {hash, mtime}}`` for the plugin's onload reconcile diff."""
+    connector = await _resolve_vault_connector(session, user=user, vault_id=vault_id)
+    return await get_manifest(session, connector=connector, vault_id=vault_id)
+
+
+@router.get("/stats", response_model=StatsResponse)
+async def obsidian_stats(
+    vault_id: str = Query(..., description="Plugin-side stable vault UUID"),
+    user: User = Depends(current_active_user),
+    session: AsyncSession = Depends(get_async_session),
+) -> StatsResponse:
+    """Active-note count + last sync time for the web tile.
+
+    ``files_synced`` excludes tombstones so it matches ``/manifest``;
+    ``last_sync_at`` includes them so deletes advance the freshness signal.
+    """
+    connector = await _resolve_vault_connector(session, user=user, vault_id=vault_id)
+
+    is_active = Document.document_metadata["deleted_at"].as_string().is_(None)
+
+    row = (
+        await session.execute(
+            select(
+                func.count(case((is_active, 1))).label("files_synced"),
+                func.max(Document.updated_at).label("last_sync_at"),
+            ).where(
+                and_(
+                    Document.connector_id == connector.id,
+                    Document.document_type == DocumentType.OBSIDIAN_CONNECTOR,
+                )
+            )
+        )
+    ).first()
+
+    return StatsResponse(
+        vault_id=vault_id,
+        files_synced=int(row[0] or 0),
+        last_sync_at=row[1],
+    )
diff --git a/surfsense_backend/app/routes/public_chat_routes.py b/surfsense_backend/app/routes/public_chat_routes.py
index e206bfd113..3181e117c3 100644
--- a/surfsense_backend/app/routes/public_chat_routes.py
+++ b/surfsense_backend/app/routes/public_chat_routes.py
@@ -231,6 +231,57 @@ def _replace_audio_paths_with_public_urls(
     return result
 
 
+@router.get("/{share_token}/reports/{report_id}/preview")
+async def preview_public_report_pdf(
+    share_token: str,
+    report_id: int,
+    session: AsyncSession = Depends(get_async_session),
+):
+    """
+    Return a compiled PDF preview for a Typst-based report in a public snapshot.
+
+    No authentication required - the share_token provides access.
+    """
+    import asyncio
+    import io
+    import re
+
+    import typst as typst_compiler
+
+    report_info = await get_snapshot_report(session, share_token, report_id)
+
+    if not report_info:
+        raise HTTPException(status_code=404, detail="Report not found")
+
+    content = report_info.get("content")
+    content_type = report_info.get("content_type", "markdown")
+
+    if not content:
+        raise HTTPException(status_code=400, detail="Report has no content to preview")
+
+    if content_type != "typst":
+        raise HTTPException(
+            status_code=400,
+            detail="Preview is only available for Typst-based reports",
+        )
+
+    def _compile() -> bytes:
+        return typst_compiler.compile(content.encode("utf-8"))
+
+    pdf_bytes = await asyncio.to_thread(_compile)
+
+    safe_title = re.sub(r"[^\w\s-]", "", report_info.get("title") or "Resume").strip()
+    filename = f"{safe_title}.pdf"
+
+    return StreamingResponse(
+        io.BytesIO(pdf_bytes),
+        media_type="application/pdf",
+        headers={
+            "Content-Disposition": f'inline; filename="{filename}"',
+        },
+    )
+
+
 @router.get("/{share_token}/reports/{report_id}/content")
 async def get_public_report_content(
     share_token: str,
@@ -259,6 +310,7 @@ async def get_public_report_content(
         "id": report_info.get("original_id"),
         "title": report_info.get("title"),
         "content": report_info.get("content"),
+        "content_type": report_info.get("content_type", "markdown"),
         "report_metadata": report_info.get("report_metadata"),
         "report_group_id": report_info.get("report_group_id"),
         "versions": versions,
diff --git a/surfsense_backend/app/routes/reports_routes.py b/surfsense_backend/app/routes/reports_routes.py
index 56ac5ec2d4..19961e1a9e 100644
--- a/surfsense_backend/app/routes/reports_routes.py
+++ b/surfsense_backend/app/routes/reports_routes.py
@@ -279,6 +279,7 @@ async def read_report_content(
             id=report.id,
             title=report.title,
             content=report.content,
+            content_type=report.content_type,
             report_metadata=report.report_metadata,
             report_group_id=report.report_group_id,
             versions=versions,
@@ -319,6 +320,7 @@ async def update_report_content(
             id=report.id,
             title=report.title,
             content=report.content,
+            content_type=report.content_type,
             report_metadata=report.report_metadata,
             report_group_id=report.report_group_id,
             versions=versions,
@@ -333,6 +335,57 @@ async def update_report_content(
         ) from None
 
 
+@router.get("/reports/{report_id}/preview")
+async def preview_report_pdf(
+    report_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Return a compiled PDF preview for Typst-based reports (resumes).
+
+    Reads the Typst source from the database and compiles it to PDF bytes
+    on-the-fly. Only works for reports with content_type='typst'.
+    """
+    try:
+        report = await _get_report_with_access(report_id, session, user)
+
+        if not report.content:
+            raise HTTPException(
+                status_code=400, detail="Report has no content to preview"
+            )
+
+        if report.content_type != "typst":
+            raise HTTPException(
+                status_code=400,
+                detail="Preview is only available for Typst-based reports",
+            )
+
+        def _compile() -> bytes:
+            return typst.compile(report.content.encode("utf-8"))
+
+        pdf_bytes = await asyncio.to_thread(_compile)
+
+        safe_title = re.sub(r"[^\w\s-]", "", report.title or "Resume").strip()
+        filename = f"{safe_title}.pdf"
+
+        return StreamingResponse(
+            io.BytesIO(pdf_bytes),
+            media_type="application/pdf",
+            headers={
+                "Content-Disposition": f'inline; filename="{filename}"',
+            },
+        )
+    except HTTPException:
+        raise
+    except Exception:
+        logger.exception("Failed to compile Typst preview for report %d", report_id)
+        raise HTTPException(
+            status_code=500,
+            detail="Failed to compile resume preview",
+        ) from None
+
+
 @router.get("/reports/{report_id}/export")
 async def export_report(
     report_id: int,
@@ -354,6 +407,27 @@ async def export_report(
                 status_code=400, detail="Report has no content to export"
             )
 
+        # Typst-based reports (resumes): compile directly without Pandoc
+        if report.content_type == "typst":
+            if format != ExportFormat.PDF:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Typst-based reports currently only support PDF export",
+                )
+
+            def _compile_typst() -> bytes:
+                return typst.compile(report.content.encode("utf-8"))
+
+            pdf_bytes = await asyncio.to_thread(_compile_typst)
+            safe_title = re.sub(r"[^\w\s-]", "", report.title or "Resume").strip()
+            return StreamingResponse(
+                io.BytesIO(pdf_bytes),
+                media_type="application/pdf",
+                headers={
+                    "Content-Disposition": f'attachment; filename="{safe_title}.pdf"',
+                },
+            )
+
         # Strip wrapping code fences that LLMs sometimes add around Markdown.
         # Without this, pandoc treats the entire content as a code block.
         markdown_content = _strip_wrapping_code_fences(report.content)
diff --git a/surfsense_backend/app/routes/sandbox_routes.py b/surfsense_backend/app/routes/sandbox_routes.py
index 2c12c3a1e9..f656e8d76d 100644
--- a/surfsense_backend/app/routes/sandbox_routes.py
+++ b/surfsense_backend/app/routes/sandbox_routes.py
@@ -86,9 +86,8 @@ async def download_sandbox_file(
 
     # Fall back to live sandbox download
     try:
-        sandbox = await get_or_create_sandbox(thread_id)
-        raw_sandbox = sandbox._sandbox
-        content: bytes = await asyncio.to_thread(raw_sandbox.fs.download_file, path)
+        sandbox, _ = await get_or_create_sandbox(thread_id)
+        content: bytes = await asyncio.to_thread(sandbox.download_file, path)
     except Exception as exc:
         logger.warning("Sandbox file download failed for %s: %s", path, exc)
         raise HTTPException(
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index bb20da65df..e9ffb70509 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -81,6 +81,36 @@
 
 # Redis key TTL - notification is stale if no heartbeat in this time
 HEARTBEAT_TTL_SECONDS = 120  # 2 minutes
+# How often the background loop refreshes the Redis key. Must be < TTL so
+# the key cannot expire between refreshes when the indexing function is
+# doing blocking work (e.g. gitingest in Phase 1) that doesn't trigger
+# on_heartbeat_callback.
+HEARTBEAT_REFRESH_INTERVAL = 60
+
+
+async def _run_indexing_heartbeat_loop(notification_id: int) -> None:
+    """Background coroutine that refreshes the Redis heartbeat every
+    HEARTBEAT_REFRESH_INTERVAL seconds while a connector indexing task is
+    running.
+
+    Mirrors `_run_heartbeat_loop` in app/tasks/celery_tasks/document_tasks.py.
+    Cancelled via heartbeat_task.cancel() when the indexing call returns
+    (success or failure). If the worker dies, the coroutine dies with it
+    and the Redis key expires naturally on its TTL.
+    """
+    key = _get_heartbeat_key(notification_id)
+    try:
+        while True:
+            await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
+            try:
+                get_heartbeat_redis_client().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
+            except Exception as e:
+                logger.warning(
+                    f"Failed to refresh Redis heartbeat for notification "
+                    f"{notification_id}: {e}"
+                )
+    except asyncio.CancelledError:
+        pass  # Normal cancellation when the indexing task completes
 
 
 def get_heartbeat_redis_client() -> redis.Redis:
@@ -636,9 +666,16 @@ async def delete_search_source_connector(
             )
 
         # Delete the connector record
+        search_space_id = db_connector.search_space_id
+        is_mcp = db_connector.connector_type == SearchSourceConnectorType.MCP_CONNECTOR
         await session.delete(db_connector)
         await session.commit()
 
+        if is_mcp:
+            from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+
+            invalidate_mcp_tools_cache(search_space_id)
+
         logger.info(
             f"Connector {connector_id} ({connector_name}) deleted successfully. "
             f"Total documents deleted: {total_deleted}"
@@ -686,27 +723,10 @@ async def index_connector_content(
     user: User = Depends(current_active_user),
 ):
     """
-    Index content from a connector to a search space.
-    Requires CONNECTORS_UPDATE permission (to trigger indexing).
-
-    Currently supports:
-    - SLACK_CONNECTOR: Indexes messages from all accessible Slack channels
-    - TEAMS_CONNECTOR: Indexes messages from all accessible Microsoft Teams channels
-    - NOTION_CONNECTOR: Indexes pages from all accessible Notion pages
-    - GITHUB_CONNECTOR: Indexes code and documentation from GitHub repositories
-    - LINEAR_CONNECTOR: Indexes issues and comments from Linear
-    - JIRA_CONNECTOR: Indexes issues and comments from Jira
-    - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels
-    - LUMA_CONNECTOR: Indexes events from Luma
-    - ELASTICSEARCH_CONNECTOR: Indexes documents from Elasticsearch
-    - WEBCRAWLER_CONNECTOR: Indexes web pages from crawled websites
+    Index content from a KB connector to a search space.
 
-    Args:
-        connector_id: ID of the connector to use
-        search_space_id: ID of the search space to store indexed content
-
-    Returns:
-        Dictionary with indexing status
+    Live connectors (Slack, Teams, Linear, Jira, ClickUp, Calendar, Airtable,
+    Gmail, Discord, Luma) use real-time agent tools instead.
     """
     try:
         # Get the connector first
@@ -763,9 +783,7 @@ async def index_connector_content(
 
         # For calendar connectors, default to today but allow future dates if explicitly provided
         if connector.connector_type in [
-            SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
             SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
-            SearchSourceConnectorType.LUMA_CONNECTOR,
         ]:
             # Default to today if no end_date provided (users can manually select future dates)
             indexing_to = today_str if end_date is None else end_date
@@ -789,33 +807,22 @@ async def index_connector_content(
             # For non-calendar connectors, cap at today
             indexing_to = end_date if end_date else today_str
 
-        if connector.connector_type == SearchSourceConnectorType.SLACK_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import (
-                index_slack_messages_task,
-            )
-
-            logger.info(
-                f"Triggering Slack indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_slack_messages_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Slack indexing started in the background."
-
-        elif connector.connector_type == SearchSourceConnectorType.TEAMS_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import (
-                index_teams_messages_task,
-            )
+        from app.services.mcp_oauth.registry import LIVE_CONNECTOR_TYPES
 
-            logger.info(
-                f"Triggering Teams indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_teams_messages_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Teams indexing started in the background."
+        if connector.connector_type in LIVE_CONNECTOR_TYPES:
+            return {
+                "message": (
+                    f"{connector.connector_type.value} uses real-time agent tools; "
+                    "background indexing is disabled."
+                ),
+                "indexing_started": False,
+                "connector_id": connector_id,
+                "search_space_id": search_space_id,
+                "indexing_from": indexing_from,
+                "indexing_to": indexing_to,
+            }
 
-        elif connector.connector_type == SearchSourceConnectorType.NOTION_CONNECTOR:
+        if connector.connector_type == SearchSourceConnectorType.NOTION_CONNECTOR:
             from app.tasks.celery_tasks.connector_tasks import index_notion_pages_task
 
             logger.info(
@@ -837,28 +844,6 @@ async def index_connector_content(
             )
             response_message = "GitHub indexing started in the background."
 
-        elif connector.connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import index_linear_issues_task
-
-            logger.info(
-                f"Triggering Linear indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_linear_issues_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Linear indexing started in the background."
-
-        elif connector.connector_type == SearchSourceConnectorType.JIRA_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import index_jira_issues_task
-
-            logger.info(
-                f"Triggering Jira indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_jira_issues_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Jira indexing started in the background."
-
         elif connector.connector_type == SearchSourceConnectorType.CONFLUENCE_CONNECTOR:
             from app.tasks.celery_tasks.connector_tasks import (
                 index_confluence_pages_task,
@@ -885,59 +870,6 @@ async def index_connector_content(
             )
             response_message = "BookStack indexing started in the background."
 
-        elif connector.connector_type == SearchSourceConnectorType.CLICKUP_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import index_clickup_tasks_task
-
-            logger.info(
-                f"Triggering ClickUp indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_clickup_tasks_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "ClickUp indexing started in the background."
-
-        elif (
-            connector.connector_type
-            == SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR
-        ):
-            from app.tasks.celery_tasks.connector_tasks import (
-                index_google_calendar_events_task,
-            )
-
-            logger.info(
-                f"Triggering Google Calendar indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_google_calendar_events_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Google Calendar indexing started in the background."
-        elif connector.connector_type == SearchSourceConnectorType.AIRTABLE_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import (
-                index_airtable_records_task,
-            )
-
-            logger.info(
-                f"Triggering Airtable indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_airtable_records_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Airtable indexing started in the background."
-        elif (
-            connector.connector_type == SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR
-        ):
-            from app.tasks.celery_tasks.connector_tasks import (
-                index_google_gmail_messages_task,
-            )
-
-            logger.info(
-                f"Triggering Google Gmail indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_google_gmail_messages_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Google Gmail indexing started in the background."
-
         elif (
             connector.connector_type == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
         ):
@@ -1082,30 +1014,6 @@ async def index_connector_content(
             )
             response_message = "Dropbox indexing started in the background."
 
-        elif connector.connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import (
-                index_discord_messages_task,
-            )
-
-            logger.info(
-                f"Triggering Discord indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_discord_messages_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Discord indexing started in the background."
-
-        elif connector.connector_type == SearchSourceConnectorType.LUMA_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import index_luma_events_task
-
-            logger.info(
-                f"Triggering Luma indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_luma_events_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Luma indexing started in the background."
-
         elif (
             connector.connector_type
             == SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR
@@ -1150,25 +1058,6 @@ async def index_connector_content(
                 )
                 response_message = "Web page indexing started in the background."
 
-        elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR:
-            from app.config import config as app_config
-            from app.tasks.celery_tasks.connector_tasks import index_obsidian_vault_task
-
-            # Obsidian connector only available in self-hosted mode
-            if not app_config.is_self_hosted():
-                raise HTTPException(
-                    status_code=400,
-                    detail="Obsidian connector is only available in self-hosted mode",
-                )
-
-            logger.info(
-                f"Triggering Obsidian vault indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_obsidian_vault_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Obsidian vault indexing started in the background."
-
         elif (
             connector.connector_type
             == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
@@ -1331,57 +1220,6 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id:
         await session.rollback()
 
 
-async def run_slack_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Create a new session and run the Slack indexing task.
-    This prevents session leaks by creating a dedicated session for the background task.
-    """
-    async with async_session_maker() as session:
-        await run_slack_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
-async def run_slack_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Slack indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Slack connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_slack_messages
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_slack_messages,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
 _AUTH_ERROR_PATTERNS = (
     "failed to refresh linear oauth",
     "failed to refresh your notion connection",
@@ -1457,6 +1295,7 @@ async def _run_indexing_with_notifications(
 
     notification = None
     connector_lock_acquired = False
+    heartbeat_task: asyncio.Task | None = None
     # Track indexed count for retry notifications and heartbeat
     current_indexed_count = 0
 
@@ -1502,6 +1341,16 @@ async def _run_indexing_with_notifications(
                 except Exception as e:
                     logger.warning(f"Failed to set initial Redis heartbeat: {e}")
 
+                # Start a background coroutine that refreshes the
+                # heartbeat every HEARTBEAT_REFRESH_INTERVAL seconds.
+                # Without this the cleanup_stale_indexing_notifications
+                # task can mark the doc failed when on_heartbeat_callback
+                # doesn't fire — for example during the GitHub
+                # connector's Phase 1 gitingest blocking call (#1295).
+                heartbeat_task = asyncio.create_task(
+                    _run_indexing_heartbeat_loop(notification.id)
+                )
+
         # Update notification to fetching stage
         if notification:
             await NotificationService.connector_indexing.notify_indexing_progress(
@@ -1792,6 +1641,13 @@ async def on_heartbeat_callback(indexed_count: int) -> None:
             except Exception as notif_error:
                 logger.error(f"Failed to update notification: {notif_error!s}")
     finally:
+        # Stop the background heartbeat refresher BEFORE deleting the
+        # Redis key, so the loop cannot race and re-create the key
+        # after we delete it.
+        if heartbeat_task is not None:
+            heartbeat_task.cancel()
+            with suppress(Exception):
+                await asyncio.gather(heartbeat_task, return_exceptions=True)
         # Clean up Redis heartbeat key when task completes (success or failure)
         if notification:
             try:
@@ -1920,26 +1776,28 @@ async def run_github_indexing(
     )
 
 
-# Add new helper functions for Linear indexing
-async def run_linear_indexing_with_new_session(
+# Add new helper functions for Confluence indexing
+async def run_confluence_indexing_with_new_session(
     connector_id: int,
     search_space_id: int,
     user_id: str,
     start_date: str,
     end_date: str,
 ):
-    """Wrapper to run Linear indexing with its own database session."""
+    """Wrapper to run Confluence indexing with its own database session."""
     logger.info(
-        f"Background task started: Indexing Linear connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
+        f"Background task started: Indexing Confluence connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
     )
     async with async_session_maker() as session:
-        await run_linear_indexing(
+        await run_confluence_indexing(
             session, connector_id, search_space_id, user_id, start_date, end_date
         )
-    logger.info(f"Background task finished: Indexing Linear connector {connector_id}")
+    logger.info(
+        f"Background task finished: Indexing Confluence connector {connector_id}"
+    )
 
 
-async def run_linear_indexing(
+async def run_confluence_indexing(
     session: AsyncSession,
     connector_id: int,
     search_space_id: int,
@@ -1948,17 +1806,17 @@ async def run_linear_indexing(
     end_date: str,
 ):
     """
-    Background task to run Linear indexing.
+    Background task to run Confluence indexing.
 
     Args:
         session: Database session
-        connector_id: ID of the Linear connector
+        connector_id: ID of the Confluence connector
         search_space_id: ID of the search space
         user_id: ID of the user
         start_date: Start date for indexing
         end_date: End date for indexing
     """
-    from app.tasks.connector_indexers import index_linear_issues
+    from app.tasks.connector_indexers import index_confluence_pages
 
     await _run_indexing_with_notifications(
         session=session,
@@ -1967,31 +1825,34 @@ async def run_linear_indexing(
         user_id=user_id,
         start_date=start_date,
         end_date=end_date,
-        indexing_function=index_linear_issues,
+        indexing_function=index_confluence_pages,
         update_timestamp_func=_update_connector_timestamp_by_id,
         supports_heartbeat_callback=True,
     )
 
 
-# Add new helper functions for discord indexing
-async def run_discord_indexing_with_new_session(
+# Add new helper functions for Google Calendar indexing
+async def run_google_calendar_indexing_with_new_session(
     connector_id: int,
     search_space_id: int,
     user_id: str,
     start_date: str,
     end_date: str,
 ):
-    """
-    Create a new session and run the Discord indexing task.
-    This prevents session leaks by creating a dedicated session for the background task.
-    """
+    """Wrapper to run Google Calendar indexing with its own database session."""
+    logger.info(
+        f"Background task started: Indexing Google Calendar connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
+    )
     async with async_session_maker() as session:
-        await run_discord_indexing(
+        await run_google_calendar_indexing(
             session, connector_id, search_space_id, user_id, start_date, end_date
         )
+    logger.info(
+        f"Background task finished: Indexing Google Calendar connector {connector_id}"
+    )
 
 
-async def run_discord_indexing(
+async def run_google_calendar_indexing(
     session: AsyncSession,
     connector_id: int,
     search_space_id: int,
@@ -2000,17 +1861,17 @@ async def run_discord_indexing(
     end_date: str,
 ):
     """
-    Background task to run Discord indexing.
+    Background task to run Google Calendar indexing.
 
     Args:
         session: Database session
-        connector_id: ID of the Discord connector
+        connector_id: ID of the Google Calendar connector
         search_space_id: ID of the search space
         user_id: ID of the user
         start_date: Start date for indexing
         end_date: End date for indexing
     """
-    from app.tasks.connector_indexers import index_discord_messages
+    from app.tasks.connector_indexers import index_google_calendar_events
 
     await _run_indexing_with_notifications(
         session=session,
@@ -2019,13 +1880,13 @@ async def run_discord_indexing(
         user_id=user_id,
         start_date=start_date,
         end_date=end_date,
-        indexing_function=index_discord_messages,
+        indexing_function=index_google_calendar_events,
         update_timestamp_func=_update_connector_timestamp_by_id,
         supports_heartbeat_callback=True,
     )
 
 
-async def run_teams_indexing_with_new_session(
+async def run_google_gmail_indexing_with_new_session(
     connector_id: int,
     search_space_id: int,
     user_id: str,
@@ -2033,16 +1894,16 @@ async def run_teams_indexing_with_new_session(
     end_date: str,
 ):
     """
-    Create a new session and run the Microsoft Teams indexing task.
+    Create a new session and run the Google Gmail indexing task.
     This prevents session leaks by creating a dedicated session for the background task.
     """
     async with async_session_maker() as session:
-        await run_teams_indexing(
+        await run_google_gmail_indexing(
             session, connector_id, search_space_id, user_id, start_date, end_date
         )
 
 
-async def run_teams_indexing(
+async def run_google_gmail_indexing(
     session: AsyncSession,
     connector_id: int,
     search_space_id: int,
@@ -2051,17 +1912,42 @@ async def run_teams_indexing(
     end_date: str,
 ):
     """
-    Background task to run Microsoft Teams indexing.
+    Background task to run Google Gmail indexing.
 
     Args:
         session: Database session
-        connector_id: ID of the Teams connector
+        connector_id: ID of the Google Gmail connector
         search_space_id: ID of the search space
         user_id: ID of the user
         start_date: Start date for indexing
         end_date: End date for indexing
     """
-    from app.tasks.connector_indexers.teams_indexer import index_teams_messages
+    from app.tasks.connector_indexers import index_google_gmail_messages
+
+    # Create a wrapper function that calls index_google_gmail_messages with max_messages
+    async def gmail_indexing_wrapper(
+        session: AsyncSession,
+        connector_id: int,
+        search_space_id: int,
+        user_id: str,
+        start_date: str | None,
+        end_date: str | None,
+        update_last_indexed: bool,
+        on_heartbeat_callback=None,
+    ) -> tuple[int, int, str | None]:
+        max_messages = 1000
+        indexed_count, skipped_count, error_message = await index_google_gmail_messages(
+            session=session,
+            connector_id=connector_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            start_date=start_date,
+            end_date=end_date,
+            update_last_indexed=update_last_indexed,
+            max_messages=max_messages,
+            on_heartbeat_callback=on_heartbeat_callback,
+        )
+        return indexed_count, skipped_count, error_message if error_message else None
 
     await _run_indexing_with_notifications(
         session=session,
@@ -2070,388 +1956,43 @@ async def run_teams_indexing(
         user_id=user_id,
         start_date=start_date,
         end_date=end_date,
-        indexing_function=index_teams_messages,
+        indexing_function=gmail_indexing_wrapper,
         update_timestamp_func=_update_connector_timestamp_by_id,
         supports_heartbeat_callback=True,
     )
 
 
-# Add new helper functions for Jira indexing
-async def run_jira_indexing_with_new_session(
+async def run_google_drive_indexing(
+    session: AsyncSession,
     connector_id: int,
     search_space_id: int,
     user_id: str,
-    start_date: str,
-    end_date: str,
+    items_dict: dict,  # Dictionary with 'folders', 'files', and 'indexing_options'
 ):
-    """Wrapper to run Jira indexing with its own database session."""
-    logger.info(
-        f"Background task started: Indexing Jira connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
-    )
-    async with async_session_maker() as session:
-        await run_jira_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
+    """Runs the Google Drive indexing task for folders and files with notifications."""
+    from uuid import UUID
+
+    notification = None
+    try:
+        from app.tasks.connector_indexers.google_drive_indexer import (
+            index_google_drive_files,
+            index_google_drive_selected_files,
         )
-    logger.info(f"Background task finished: Indexing Jira connector {connector_id}")
 
+        # Parse the structured data
+        items = GoogleDriveIndexRequest(**items_dict)
+        indexing_options = items.indexing_options
+        total_indexed = 0
+        total_skipped = 0
+        errors = []
 
-async def run_jira_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Jira indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Jira connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_jira_issues
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_jira_issues,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
-# Add new helper functions for Confluence indexing
-async def run_confluence_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Wrapper to run Confluence indexing with its own database session."""
-    logger.info(
-        f"Background task started: Indexing Confluence connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
-    )
-    async with async_session_maker() as session:
-        await run_confluence_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-    logger.info(
-        f"Background task finished: Indexing Confluence connector {connector_id}"
-    )
-
-
-async def run_confluence_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Confluence indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Confluence connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_confluence_pages
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_confluence_pages,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
-# Add new helper functions for ClickUp indexing
-async def run_clickup_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Wrapper to run ClickUp indexing with its own database session."""
-    logger.info(
-        f"Background task started: Indexing ClickUp connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
-    )
-    async with async_session_maker() as session:
-        await run_clickup_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-    logger.info(f"Background task finished: Indexing ClickUp connector {connector_id}")
-
-
-async def run_clickup_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run ClickUp indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the ClickUp connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_clickup_tasks
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_clickup_tasks,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
-# Add new helper functions for Airtable indexing
-async def run_airtable_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Wrapper to run Airtable indexing with its own database session."""
-    logger.info(
-        f"Background task started: Indexing Airtable connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
-    )
-    async with async_session_maker() as session:
-        await run_airtable_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-    logger.info(f"Background task finished: Indexing Airtable connector {connector_id}")
-
-
-async def run_airtable_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Airtable indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Airtable connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_airtable_records
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_airtable_records,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
-# Add new helper functions for Google Calendar indexing
-async def run_google_calendar_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Wrapper to run Google Calendar indexing with its own database session."""
-    logger.info(
-        f"Background task started: Indexing Google Calendar connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
-    )
-    async with async_session_maker() as session:
-        await run_google_calendar_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-    logger.info(
-        f"Background task finished: Indexing Google Calendar connector {connector_id}"
-    )
-
-
-async def run_google_calendar_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Google Calendar indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Google Calendar connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_google_calendar_events
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_google_calendar_events,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
-async def run_google_gmail_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Create a new session and run the Google Gmail indexing task.
-    This prevents session leaks by creating a dedicated session for the background task.
-    """
-    async with async_session_maker() as session:
-        await run_google_gmail_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
-async def run_google_gmail_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Google Gmail indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Google Gmail connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_google_gmail_messages
-
-    # Create a wrapper function that calls index_google_gmail_messages with max_messages
-    async def gmail_indexing_wrapper(
-        session: AsyncSession,
-        connector_id: int,
-        search_space_id: int,
-        user_id: str,
-        start_date: str | None,
-        end_date: str | None,
-        update_last_indexed: bool,
-        on_heartbeat_callback=None,
-    ) -> tuple[int, int, str | None]:
-        max_messages = 1000
-        indexed_count, skipped_count, error_message = await index_google_gmail_messages(
-            session=session,
-            connector_id=connector_id,
-            search_space_id=search_space_id,
-            user_id=user_id,
-            start_date=start_date,
-            end_date=end_date,
-            update_last_indexed=update_last_indexed,
-            max_messages=max_messages,
-            on_heartbeat_callback=on_heartbeat_callback,
-        )
-        return indexed_count, skipped_count, error_message if error_message else None
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=gmail_indexing_wrapper,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
-async def run_google_drive_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    items_dict: dict,  # Dictionary with 'folders', 'files', and 'indexing_options'
-):
-    """Runs the Google Drive indexing task for folders and files with notifications."""
-    from uuid import UUID
-
-    notification = None
-    try:
-        from app.tasks.connector_indexers.google_drive_indexer import (
-            index_google_drive_files,
-            index_google_drive_selected_files,
-        )
-
-        # Parse the structured data
-        items = GoogleDriveIndexRequest(**items_dict)
-        indexing_options = items.indexing_options
-        total_indexed = 0
-        total_skipped = 0
-        errors = []
-
-        # Get connector info for notification
-        connector_result = await session.execute(
-            select(SearchSourceConnector).where(
-                SearchSourceConnector.id == connector_id
-            )
-        )
-        connector = connector_result.scalar_one_or_none()
+        # Get connector info for notification
+        connector_result = await session.execute(
+            select(SearchSourceConnector).where(
+                SearchSourceConnector.id == connector_id
+            )
+        )
+        connector = connector_result.scalar_one_or_none()
 
         if connector:
             # Create notification when indexing starts
@@ -2828,58 +2369,6 @@ async def run_dropbox_indexing(
                 logger.error(f"Failed to update notification: {notif_error!s}")
 
 
-# Add new helper functions for luma indexing
-async def run_luma_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Create a new session and run the Luma indexing task.
-    This prevents session leaks by creating a dedicated session for the background task.
-    """
-    async with async_session_maker() as session:
-        await run_luma_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
-async def run_luma_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Luma indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Luma connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_luma_events
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_luma_events,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
 async def run_elasticsearch_indexing_with_new_session(
     connector_id: int,
     search_space_id: int,
@@ -3041,59 +2530,6 @@ async def run_bookstack_indexing(
     )
 
 
-# Add new helper functions for Obsidian indexing
-async def run_obsidian_indexing_with_new_session(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Wrapper to run Obsidian indexing with its own database session."""
-    logger.info(
-        f"Background task started: Indexing Obsidian connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
-    )
-    async with async_session_maker() as session:
-        await run_obsidian_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-    logger.info(f"Background task finished: Indexing Obsidian connector {connector_id}")
-
-
-async def run_obsidian_indexing(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """
-    Background task to run Obsidian vault indexing.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Obsidian connector
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        start_date: Start date for indexing
-        end_date: End date for indexing
-    """
-    from app.tasks.connector_indexers import index_obsidian_vault
-
-    await _run_indexing_with_notifications(
-        session=session,
-        connector_id=connector_id,
-        search_space_id=search_space_id,
-        user_id=user_id,
-        start_date=start_date,
-        end_date=end_date,
-        indexing_function=index_obsidian_vault,
-        update_timestamp_func=_update_connector_timestamp_by_id,
-        supports_heartbeat_callback=True,
-    )
-
-
 async def run_composio_indexing_with_new_session(
     connector_id: int,
     search_space_id: int,
@@ -3624,3 +3060,126 @@ async def get_drive_picker_token(
             status_code=500,
             detail="Failed to retrieve access token. Check server logs for details.",
         ) from e
+
+
+# =============================================================================
+# MCP Tool Trust (Allow-List) Routes
+# =============================================================================
+
+
+class MCPTrustToolRequest(BaseModel):
+    tool_name: str
+
+
+async def _ensure_mcp_connector_for_user(
+    session: AsyncSession, *, user_id, connector_id: int
+) -> int:
+    """Verify ``connector_id`` is an MCP-backed connector owned by ``user_id``.
+
+    The trust-list feature is intentionally MCP-only; native connectors
+    (Gmail, Calendar, Notion, ...) do not have a "trust this tool" UI.
+    The JSONB ``has_key("server_config")`` filter is the same MCP marker
+    used elsewhere in this module.
+
+    Returns the connector's ``search_space_id`` (needed downstream for
+    MCP tool cache invalidation). Raises ``HTTPException(404)`` when the
+    connector does not exist, is not owned by the user, or is not
+    MCP-backed.
+    """
+    from sqlalchemy import cast
+    from sqlalchemy.dialects.postgresql import JSONB as PG_JSONB
+
+    result = await session.execute(
+        select(SearchSourceConnector.search_space_id).where(
+            SearchSourceConnector.id == connector_id,
+            SearchSourceConnector.user_id == user_id,
+            cast(SearchSourceConnector.config, PG_JSONB).has_key("server_config"),
+        )
+    )
+    search_space_id = result.scalar_one_or_none()
+    if search_space_id is None:
+        raise HTTPException(status_code=404, detail="MCP connector not found")
+    return search_space_id
+
+
+@router.post("/connectors/mcp/{connector_id}/trust-tool")
+async def trust_mcp_tool(
+    connector_id: int,
+    body: MCPTrustToolRequest,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Add a tool to the MCP connector's trusted (always-allow) list.
+
+    Once trusted, the tool executes without HITL approval on subsequent
+    calls. Works for both generic ``MCP_CONNECTOR`` and OAuth-backed MCP
+    connectors (``LINEAR_CONNECTOR``, ``JIRA_CONNECTOR``, ...) — the
+    storage primitive is the same JSON list under ``config.trusted_tools``.
+    """
+    from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+    from app.services.user_tool_allowlist import add_user_trust
+
+    try:
+        search_space_id = await _ensure_mcp_connector_for_user(
+            session, user_id=user.id, connector_id=connector_id
+        )
+        trusted = await add_user_trust(
+            session,
+            user_id=user.id,
+            connector_id=connector_id,
+            tool_name=body.tool_name,
+        )
+        await session.commit()
+        invalidate_mcp_tools_cache(search_space_id)
+        return {"status": "ok", "trusted_tools": trusted}
+
+    except HTTPException:
+        raise
+    except LookupError as e:
+        raise HTTPException(status_code=404, detail="MCP connector not found") from e
+    except Exception as e:
+        logger.error(f"Failed to trust MCP tool: {e!s}", exc_info=True)
+        await session.rollback()
+        raise HTTPException(
+            status_code=500, detail=f"Failed to trust tool: {e!s}"
+        ) from e
+
+
+@router.post("/connectors/mcp/{connector_id}/untrust-tool")
+async def untrust_mcp_tool(
+    connector_id: int,
+    body: MCPTrustToolRequest,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Remove a tool from the MCP connector's trusted list.
+
+    The tool will require HITL approval again on subsequent calls.
+    """
+    from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+    from app.services.user_tool_allowlist import remove_user_trust
+
+    try:
+        search_space_id = await _ensure_mcp_connector_for_user(
+            session, user_id=user.id, connector_id=connector_id
+        )
+        trusted = await remove_user_trust(
+            session,
+            user_id=user.id,
+            connector_id=connector_id,
+            tool_name=body.tool_name,
+        )
+        await session.commit()
+        invalidate_mcp_tools_cache(search_space_id)
+        return {"status": "ok", "trusted_tools": trusted}
+
+    except HTTPException:
+        raise
+    except LookupError as e:
+        raise HTTPException(status_code=404, detail="MCP connector not found") from e
+    except Exception as e:
+        logger.error(f"Failed to untrust MCP tool: {e!s}", exc_info=True)
+        await session.rollback()
+        raise HTTPException(
+            status_code=500, detail=f"Failed to untrust tool: {e!s}"
+        ) from e
diff --git a/surfsense_backend/app/routes/search_spaces_routes.py b/surfsense_backend/app/routes/search_spaces_routes.py
index 78be97aa16..0f0e430354 100644
--- a/surfsense_backend/app/routes/search_spaces_routes.py
+++ b/surfsense_backend/app/routes/search_spaces_routes.py
@@ -1,13 +1,21 @@
 import logging
 
 from fastapi import APIRouter, Depends, HTTPException
-from sqlalchemy import func
+from langchain_core.messages import HumanMessage
+from pydantic import BaseModel as PydanticBaseModel
+from sqlalchemy import func, update
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 
+from app.agents.new_chat.llm_config import (
+    create_chat_litellm_from_agent_config,
+    load_agent_llm_config_for_search_space,
+)
+from app.agents.new_chat.tools.update_memory import MEMORY_HARD_LIMIT, _save_memory
 from app.config import config
 from app.db import (
     ImageGenerationConfig,
+    NewChatThread,
     NewLLMConfig,
     Permission,
     SearchSpace,
@@ -27,6 +35,7 @@
     SearchSpaceWithStats,
 )
 from app.users import current_active_user
+from app.utils.content_utils import extract_text_content
 from app.utils.rbac import check_permission, check_search_space_access
 
 logger = logging.getLogger(__name__)
@@ -34,6 +43,34 @@
 router = APIRouter()
 
 
+class _TeamMemoryEditRequest(PydanticBaseModel):
+    query: str
+
+
+_TEAM_MEMORY_EDIT_PROMPT = """\
+You are a memory editor for a team workspace. The user wants to modify the \
+team's shared memory document. Apply the user's instruction to the existing \
+memory document and output the FULL updated document.
+
+RULES:
+1. If the instruction asks to add something, add it with format: \
+- (YYYY-MM-DD) [fact] text, under an existing or new ## heading. \
+Heading names should be descriptive, not generic categories.
+2. If the instruction asks to remove something, remove the matching entry.
+3. If the instruction asks to change something, update the matching entry.
+4. Preserve existing ## headings and all other entries.
+5. NEVER use [pref] or [instr] markers. Team memory uses [fact] only.
+6. Output ONLY the updated markdown — no explanations, no wrapping.
+
+<current_memory>
+{current_memory}
+</current_memory>
+
+<user_instruction>
+{instruction}
+</user_instruction>"""
+
+
 async def create_default_roles_and_membership(
     session: AsyncSession,
     search_space_id: int,
@@ -181,6 +218,7 @@ async def read_search_spaces(
                     user_id=space.user_id,
                     citations_enabled=space.citations_enabled,
                     qna_custom_instructions=space.qna_custom_instructions,
+                    ai_file_sort_enabled=space.ai_file_sort_enabled,
                     member_count=member_count,
                     is_owner=is_owner,
                 )
@@ -255,6 +293,16 @@ async def update_search_space(
             raise HTTPException(status_code=404, detail="Search space not found")
 
         update_data = search_space_update.model_dump(exclude_unset=True)
+
+        if (
+            "shared_memory_md" in update_data
+            and len(update_data["shared_memory_md"] or "") > MEMORY_HARD_LIMIT
+        ):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Team memory exceeds {MEMORY_HARD_LIMIT:,} character limit.",
+            )
+
         for key, value in update_data.items():
             setattr(db_search_space, key, value)
         await session.commit()
@@ -269,6 +317,108 @@ async def update_search_space(
         ) from e
 
 
+@router.post(
+    "/searchspaces/{search_space_id}/memory/edit",
+    response_model=SearchSpaceRead,
+)
+async def edit_team_memory(
+    search_space_id: int,
+    body: _TeamMemoryEditRequest,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Apply a natural language edit to the team memory via LLM."""
+    await check_search_space_access(session, user, search_space_id)
+
+    agent_config = await load_agent_llm_config_for_search_space(
+        session, search_space_id
+    )
+    if not agent_config:
+        raise HTTPException(status_code=500, detail="No LLM configuration available.")
+    llm = create_chat_litellm_from_agent_config(agent_config)
+    if not llm:
+        raise HTTPException(status_code=500, detail="Failed to create LLM instance.")
+
+    result = await session.execute(
+        select(SearchSpace).filter(SearchSpace.id == search_space_id)
+    )
+    db_search_space = result.scalars().first()
+    if not db_search_space:
+        raise HTTPException(status_code=404, detail="Search space not found")
+
+    current_memory = db_search_space.shared_memory_md or ""
+
+    prompt = _TEAM_MEMORY_EDIT_PROMPT.format(
+        current_memory=current_memory or "(empty)",
+        instruction=body.query,
+    )
+    try:
+        response = await llm.ainvoke(
+            [HumanMessage(content=prompt)],
+            config={"tags": ["surfsense:internal", "memory-edit"]},
+        )
+        updated = extract_text_content(response.content).strip()
+    except Exception as e:
+        logger.exception("Team memory edit LLM call failed: %s", e)
+        raise HTTPException(status_code=500, detail="Team memory edit failed.") from e
+
+    if not updated:
+        raise HTTPException(status_code=400, detail="LLM returned empty result.")
+
+    save_result = await _save_memory(
+        updated_memory=updated,
+        old_memory=current_memory,
+        llm=llm,
+        apply_fn=lambda content: setattr(db_search_space, "shared_memory_md", content),
+        commit_fn=session.commit,
+        rollback_fn=session.rollback,
+        label="team memory",
+        scope="team",
+    )
+
+    if save_result.get("status") == "error":
+        raise HTTPException(status_code=400, detail=save_result["message"])
+
+    await session.refresh(db_search_space)
+    return db_search_space
+
+
+@router.post("/searchspaces/{search_space_id}/ai-sort")
+async def trigger_ai_sort(
+    search_space_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Trigger a full AI file sort for all documents in the search space."""
+    try:
+        await check_permission(
+            session,
+            user,
+            search_space_id,
+            Permission.SETTINGS_UPDATE.value,
+            "You don't have permission to trigger AI sort on this search space",
+        )
+
+        result = await session.execute(
+            select(SearchSpace).filter(SearchSpace.id == search_space_id)
+        )
+        db_search_space = result.scalars().first()
+        if not db_search_space:
+            raise HTTPException(status_code=404, detail="Search space not found")
+
+        from app.tasks.celery_tasks.document_tasks import ai_sort_search_space_task
+
+        ai_sort_search_space_task.delay(search_space_id, str(user.id))
+        return {"message": "AI sort started"}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to trigger AI sort: {e!s}", exc_info=True)
+        raise HTTPException(
+            status_code=500, detail=f"Failed to trigger AI sort: {e!s}"
+        ) from e
+
+
 @router.delete("/searchspaces/{search_space_id}", response_model=dict)
 async def delete_search_space(
     search_space_id: int,
@@ -441,6 +591,7 @@ async def _get_image_gen_config_by_id(
             "model_name": "auto",
             "is_global": True,
             "is_auto_mode": True,
+            "billing_tier": "free",
         }
 
     if config_id < 0:
@@ -457,6 +608,7 @@ async def _get_image_gen_config_by_id(
                     "api_version": cfg.get("api_version") or None,
                     "litellm_params": cfg.get("litellm_params", {}),
                     "is_global": True,
+                    "billing_tier": cfg.get("billing_tier", "free"),
                 }
         return None
 
@@ -499,6 +651,7 @@ async def _get_vision_llm_config_by_id(
             "model_name": "auto",
             "is_global": True,
             "is_auto_mode": True,
+            "billing_tier": "free",
         }
 
     if config_id < 0:
@@ -515,6 +668,7 @@ async def _get_vision_llm_config_by_id(
                     "api_version": cfg.get("api_version") or None,
                     "litellm_params": cfg.get("litellm_params", {}),
                     "is_global": True,
+                    "billing_tier": cfg.get("billing_tier", "free"),
                 }
         return None
 
@@ -638,9 +792,27 @@ async def update_llm_preferences(
 
         # Update preferences
         update_data = preferences.model_dump(exclude_unset=True)
+        previous_agent_llm_id = search_space.agent_llm_id
         for key, value in update_data.items():
             setattr(search_space, key, value)
 
+        agent_llm_changed = (
+            "agent_llm_id" in update_data
+            and update_data["agent_llm_id"] != previous_agent_llm_id
+        )
+        if agent_llm_changed:
+            await session.execute(
+                update(NewChatThread)
+                .where(NewChatThread.search_space_id == search_space_id)
+                .values(pinned_llm_config_id=None)
+            )
+            logger.info(
+                "Cleared auto model pins for search_space_id=%s after agent_llm_id change (%s -> %s)",
+                search_space_id,
+                previous_agent_llm_id,
+                update_data["agent_llm_id"],
+            )
+
         await session.commit()
         await session.refresh(search_space)
 
diff --git a/surfsense_backend/app/routes/slack_add_connector_route.py b/surfsense_backend/app/routes/slack_add_connector_route.py
index 405ab2c4fd..f6a1458a0f 100644
--- a/surfsense_backend/app/routes/slack_add_connector_route.py
+++ b/surfsense_backend/app/routes/slack_add_connector_route.py
@@ -312,7 +312,7 @@ async def slack_callback(
         new_connector = SearchSourceConnector(
             name=connector_name,
             connector_type=SearchSourceConnectorType.SLACK_CONNECTOR,
-            is_indexable=True,
+            is_indexable=False,
             config=connector_config,
             search_space_id=space_id,
             user_id=user_id,
diff --git a/surfsense_backend/app/routes/stripe_routes.py b/surfsense_backend/app/routes/stripe_routes.py
index 672f67cadb..fc5fded84f 100644
--- a/surfsense_backend/app/routes/stripe_routes.py
+++ b/surfsense_backend/app/routes/stripe_routes.py
@@ -13,13 +13,25 @@
 from stripe import SignatureVerificationError, StripeClient, StripeError
 
 from app.config import config
-from app.db import PagePurchase, PagePurchaseStatus, User, get_async_session
+from app.db import (
+    PagePurchase,
+    PagePurchaseStatus,
+    PremiumTokenPurchase,
+    PremiumTokenPurchaseStatus,
+    User,
+    get_async_session,
+)
 from app.schemas.stripe import (
     CreateCheckoutSessionRequest,
     CreateCheckoutSessionResponse,
+    CreateTokenCheckoutSessionRequest,
+    CreateTokenCheckoutSessionResponse,
+    FinalizeCheckoutResponse,
     PagePurchaseHistoryResponse,
     StripeStatusResponse,
     StripeWebhookResponse,
+    TokenPurchaseHistoryResponse,
+    TokenStripeStatusResponse,
 )
 from app.users import current_active_user
 
@@ -54,7 +66,15 @@ def _get_checkout_urls(search_space_id: int) -> tuple[str, str]:
         )
 
     base_url = config.NEXT_FRONTEND_URL.rstrip("/")
-    success_url = f"{base_url}/dashboard/{search_space_id}/purchase-success"
+    # Stripe substitutes ``{CHECKOUT_SESSION_ID}`` with the actual session id
+    # at redirect time. The frontend uses it to call /stripe/finalize-checkout
+    # which fulfils synchronously without waiting for the webhook — fixing the
+    # webhook-vs-redirect race where users land on /purchase-success before
+    # checkout.session.completed has been delivered.
+    success_url = (
+        f"{base_url}/dashboard/{search_space_id}/purchase-success"
+        f"?session_id={{CHECKOUT_SESSION_ID}}"
+    )
     cancel_url = f"{base_url}/dashboard/{search_space_id}/purchase-cancel"
     return success_url, cancel_url
 
@@ -77,10 +97,62 @@ def _normalize_optional_string(value: Any) -> str | None:
 
 
 def _get_metadata(checkout_session: Any) -> dict[str, str]:
-    metadata = getattr(checkout_session, "metadata", None) or {}
+    """Extract checkout session metadata as a plain ``str -> str`` dict.
+
+    In ``stripe>=15.0`` ``StripeObject`` is no longer a ``dict`` subclass
+    and exposes neither ``items()`` nor ``__iter__`` nor ``keys()``.
+    ``dict(obj)`` falls into the sequence protocol and raises
+    ``KeyError: 0``; ``obj.items()`` raises ``AttributeError``. The
+    supported way to materialize a ``StripeObject`` as a plain dict is
+    its ``to_dict()`` method (added in stripe-python 8.x, present in 15.x).
+    """
+    metadata = getattr(checkout_session, "metadata", None)
+    if metadata is None:
+        return {}
+
+    # 1. Plain dict (older SDKs that subclassed dict, JSON-decoded events
+    #    in tests, etc.).
     if isinstance(metadata, dict):
-        return {str(key): str(value) for key, value in metadata.items()}
-    return dict(metadata)
+        return {str(k): str(v) for k, v in metadata.items()}
+
+    # 2. Modern Stripe SDK: every ``StripeObject`` has ``to_dict()``.
+    #    ``recursive=False`` is correct because Stripe metadata values
+    #    are always primitive strings.
+    to_dict = getattr(metadata, "to_dict", None)
+    if callable(to_dict):
+        try:
+            d = to_dict(recursive=False)
+            if isinstance(d, dict):
+                return {str(k): str(v) for k, v in d.items()}
+        except Exception:
+            logger.exception(
+                "Stripe metadata.to_dict() failed for session %s",
+                getattr(checkout_session, "id", "?"),
+            )
+
+    # 3. Last-resort: read the SDK's private ``_data`` backing dict.
+    #    Stable across stripe-python 6.x -> 15.x.
+    inner = getattr(metadata, "_data", None)
+    if isinstance(inner, dict):
+        return {str(k): str(v) for k, v in inner.items()}
+
+    logger.warning(
+        "Could not extract metadata from checkout session %s (metadata type=%s)",
+        getattr(checkout_session, "id", "?"),
+        type(metadata).__name__,
+    )
+    return {}
+
+
+# Canonical purchase_type metadata values. ``premium_credit`` was emitted
+# by an earlier release of ``create_token_checkout_session`` so it's still
+# accepted on the read side for backward compat with in-flight sessions.
+_PURCHASE_TYPE_TOKEN_VALUES = frozenset({"premium_tokens", "premium_credit"})
+
+
+def _is_token_purchase(metadata: dict[str, str]) -> bool:
+    """Return True for premium-credit (a.k.a. premium_token) purchases."""
+    return metadata.get("purchase_type", "page_packs") in _PURCHASE_TYPE_TOKEN_VALUES
 
 
 async def _get_or_create_purchase_from_checkout_session(
@@ -151,6 +223,26 @@ async def _mark_purchase_failed(
     return StripeWebhookResponse()
 
 
+async def _mark_token_purchase_failed(
+    db_session: AsyncSession, checkout_session_id: str
+) -> StripeWebhookResponse:
+    purchase = (
+        await db_session.execute(
+            select(PremiumTokenPurchase)
+            .where(
+                PremiumTokenPurchase.stripe_checkout_session_id == checkout_session_id
+            )
+            .with_for_update()
+        )
+    ).scalar_one_or_none()
+
+    if purchase is not None and purchase.status == PremiumTokenPurchaseStatus.PENDING:
+        purchase.status = PremiumTokenPurchaseStatus.FAILED
+        await db_session.commit()
+
+    return StripeWebhookResponse()
+
+
 async def _fulfill_completed_purchase(
     db_session: AsyncSession, checkout_session: Any
 ) -> StripeWebhookResponse:
@@ -201,6 +293,98 @@ async def _fulfill_completed_purchase(
     return StripeWebhookResponse()
 
 
+async def _fulfill_completed_token_purchase(
+    db_session: AsyncSession, checkout_session: Any
+) -> StripeWebhookResponse:
+    """Grant premium tokens to the user after a confirmed Stripe payment."""
+    checkout_session_id = str(checkout_session.id)
+    purchase = (
+        await db_session.execute(
+            select(PremiumTokenPurchase)
+            .where(
+                PremiumTokenPurchase.stripe_checkout_session_id == checkout_session_id
+            )
+            .with_for_update()
+        )
+    ).scalar_one_or_none()
+
+    if purchase is None:
+        metadata = _get_metadata(checkout_session)
+        user_id = metadata.get("user_id")
+        quantity = int(metadata.get("quantity", "0"))
+        # Read the new metadata key first, fall back to the legacy one so
+        # in-flight checkout sessions created before the cost-credits
+        # release still fulfil correctly (the unit is numerically the
+        # same: $1 buys 1_000_000 micro-USD == 1_000_000 tokens).
+        credit_micros_per_unit = int(
+            metadata.get("credit_micros_per_unit")
+            or metadata.get("tokens_per_unit", "0")
+        )
+
+        if not user_id or quantity <= 0 or credit_micros_per_unit <= 0:
+            logger.error(
+                "Skipping token fulfillment for session %s: incomplete metadata %s",
+                checkout_session_id,
+                metadata,
+            )
+            return StripeWebhookResponse()
+
+        purchase = PremiumTokenPurchase(
+            user_id=uuid.UUID(user_id),
+            stripe_checkout_session_id=checkout_session_id,
+            stripe_payment_intent_id=_normalize_optional_string(
+                getattr(checkout_session, "payment_intent", None)
+            ),
+            quantity=quantity,
+            credit_micros_granted=quantity * credit_micros_per_unit,
+            amount_total=getattr(checkout_session, "amount_total", None),
+            currency=getattr(checkout_session, "currency", None),
+            status=PremiumTokenPurchaseStatus.PENDING,
+        )
+        db_session.add(purchase)
+        await db_session.flush()
+
+    if purchase.status == PremiumTokenPurchaseStatus.COMPLETED:
+        return StripeWebhookResponse()
+
+    user = (
+        (
+            await db_session.execute(
+                select(User).where(User.id == purchase.user_id).with_for_update(of=User)
+            )
+        )
+        .unique()
+        .scalar_one_or_none()
+    )
+    if user is None:
+        logger.error(
+            "Skipping token fulfillment for session %s: user %s not found",
+            purchase.stripe_checkout_session_id,
+            purchase.user_id,
+        )
+        return StripeWebhookResponse()
+
+    purchase.status = PremiumTokenPurchaseStatus.COMPLETED
+    purchase.completed_at = datetime.now(UTC)
+    purchase.amount_total = getattr(checkout_session, "amount_total", None)
+    purchase.currency = getattr(checkout_session, "currency", None)
+    purchase.stripe_payment_intent_id = _normalize_optional_string(
+        getattr(checkout_session, "payment_intent", None)
+    )
+    # Top up the user's credit balance by the granted micro-USD amount.
+    # ``max(used, limit)`` clamps the case where the legacy code wrote a
+    # used value above the limit (e.g. underbilling rounding) so adding
+    # ``credit_micros_granted`` always lifts the limit by the full pack
+    # size rather than disappearing into past overuse.
+    user.premium_credit_micros_limit = (
+        max(user.premium_credit_micros_used, user.premium_credit_micros_limit)
+        + purchase.credit_micros_granted
+    )
+
+    await db_session.commit()
+    return StripeWebhookResponse()
+
+
 @router.post("/create-checkout-session", response_model=CreateCheckoutSessionResponse)
 async def create_checkout_session(
     body: CreateCheckoutSessionRequest,
@@ -316,33 +500,215 @@ async def stripe_webhook(
             detail="Invalid Stripe webhook signature.",
         ) from exc
 
-    if event.type in {
-        "checkout.session.completed",
-        "checkout.session.async_payment_succeeded",
-    }:
-        checkout_session = event.data.object
-        payment_status = getattr(checkout_session, "payment_status", None)
-
-        if event.type == "checkout.session.completed" and payment_status not in {
-            "paid",
-            "no_payment_required",
+    try:
+        if event.type in {
+            "checkout.session.completed",
+            "checkout.session.async_payment_succeeded",
         }:
-            logger.info(
-                "Received checkout.session.completed for unpaid session %s; waiting for async success.",
-                checkout_session.id,
-            )
-            return StripeWebhookResponse()
+            checkout_session = event.data.object
+            payment_status = getattr(checkout_session, "payment_status", None)
+
+            if event.type == "checkout.session.completed" and payment_status not in {
+                "paid",
+                "no_payment_required",
+            }:
+                logger.info(
+                    "Received checkout.session.completed for unpaid session %s; waiting for async success.",
+                    checkout_session.id,
+                )
+                return StripeWebhookResponse()
+
+            metadata = _get_metadata(checkout_session)
+            if _is_token_purchase(metadata):
+                return await _fulfill_completed_token_purchase(
+                    db_session, checkout_session
+                )
+            return await _fulfill_completed_purchase(db_session, checkout_session)
+
+        if event.type in {
+            "checkout.session.async_payment_failed",
+            "checkout.session.expired",
+        }:
+            checkout_session = event.data.object
+            metadata = _get_metadata(checkout_session)
+            if _is_token_purchase(metadata):
+                return await _mark_token_purchase_failed(
+                    db_session, str(checkout_session.id)
+                )
+            return await _mark_purchase_failed(db_session, str(checkout_session.id))
+    except Exception:
+        # Re-raise so FastAPI returns 500 and Stripe retries this delivery.
+        # Logging here gives us a structured trail with event id + type so
+        # future webhook bugs surface immediately in the logs without
+        # having to grep by request_id.
+        logger.exception(
+            "Stripe webhook handler failed for event id=%s type=%s — Stripe will retry",
+            getattr(event, "id", "?"),
+            getattr(event, "type", "?"),
+        )
+        raise
 
-        return await _fulfill_completed_purchase(db_session, checkout_session)
+    return StripeWebhookResponse()
 
-    if event.type in {
-        "checkout.session.async_payment_failed",
-        "checkout.session.expired",
-    }:
-        checkout_session = event.data.object
-        return await _mark_purchase_failed(db_session, str(checkout_session.id))
 
-    return StripeWebhookResponse()
+@router.get("/finalize-checkout", response_model=FinalizeCheckoutResponse)
+async def finalize_checkout(
+    session_id: str,
+    user: User = Depends(current_active_user),
+    db_session: AsyncSession = Depends(get_async_session),
+) -> FinalizeCheckoutResponse:
+    """Synchronously fulfil a checkout session from the success page.
+
+    Solves the webhook-vs-redirect race: the user lands on
+    ``/dashboard/<id>/purchase-success?session_id=cs_...`` typically a
+    few hundred ms after paying, but Stripe's
+    ``checkout.session.completed`` webhook can take 5-30s+ to arrive.
+    Calling this endpoint on success-page mount fulfils the purchase
+    immediately by retrieving the session from Stripe's API and
+    invoking the same idempotent helpers the webhook uses.
+
+    Idempotency: if the webhook has already fulfilled this purchase
+    (status=COMPLETED), the helpers short-circuit and we just return
+    the latest balance. Concurrent webhook + finalize calls are safe
+    because both acquire ``SELECT ... FOR UPDATE`` on the purchase row.
+
+    Authorization: the session's ``client_reference_id`` must match the
+    authenticated user's id. This prevents a user from finalising
+    someone else's checkout session if they happen to know the id.
+    """
+    stripe_client = get_stripe_client()
+
+    try:
+        checkout_session = stripe_client.v1.checkout.sessions.retrieve(session_id)
+    except StripeError as exc:
+        logger.warning(
+            "finalize_checkout: stripe lookup failed for session=%s user=%s: %s",
+            session_id,
+            user.id,
+            exc,
+        )
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Checkout session not found.",
+        ) from exc
+
+    # Authorization check: the user finalising must be the user who
+    # initiated the checkout. ``client_reference_id`` is set in
+    # ``create_checkout_session`` / ``create_token_checkout_session``.
+    client_reference_id = getattr(checkout_session, "client_reference_id", None)
+    if client_reference_id != str(user.id):
+        logger.warning(
+            "finalize_checkout: ownership mismatch session=%s client_ref=%s user=%s",
+            session_id,
+            client_reference_id,
+            user.id,
+        )
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="This checkout session does not belong to you.",
+        )
+
+    metadata = _get_metadata(checkout_session)
+    is_token = _is_token_purchase(metadata)
+    payment_status = getattr(checkout_session, "payment_status", None)
+    session_status = getattr(checkout_session, "status", None)
+
+    # Defensive fallback: if metadata can't be read for any reason
+    # (extraction failure, manually-created session in Stripe dashboard,
+    # SDK upgrade breaking ``to_dict``, etc.) we'd otherwise route every
+    # purchase to the page_packs handler and get stuck. Resolve the
+    # purchase_type by checking which table actually has the row keyed
+    # by this Stripe session id.
+    if not metadata:
+        existing_token_purchase = (
+            await db_session.execute(
+                select(PremiumTokenPurchase.id).where(
+                    PremiumTokenPurchase.stripe_checkout_session_id
+                    == str(checkout_session.id)
+                )
+            )
+        ).scalar_one_or_none()
+        if existing_token_purchase is not None:
+            is_token = True
+        else:
+            existing_page_purchase = (
+                await db_session.execute(
+                    select(PagePurchase.id).where(
+                        PagePurchase.stripe_checkout_session_id
+                        == str(checkout_session.id)
+                    )
+                )
+            ).scalar_one_or_none()
+            if existing_page_purchase is None:
+                logger.error(
+                    "finalize_checkout: no purchase row in either table "
+                    "and metadata is empty for session=%s user=%s",
+                    session_id,
+                    user.id,
+                )
+                # Fall through; downstream path will short-circuit on
+                # missing-row + empty-metadata.
+        logger.info(
+            "finalize_checkout: recovered purchase_type=%s for session=%s "
+            "via DB fallback (metadata was empty)",
+            "premium_tokens" if is_token else "page_packs",
+            session_id,
+        )
+
+    is_paid = payment_status in {"paid", "no_payment_required"}
+    is_expired = session_status == "expired"
+
+    if is_paid:
+        if is_token:
+            await _fulfill_completed_token_purchase(db_session, checkout_session)
+        else:
+            await _fulfill_completed_purchase(db_session, checkout_session)
+    elif is_expired:
+        if is_token:
+            await _mark_token_purchase_failed(db_session, str(checkout_session.id))
+        else:
+            await _mark_purchase_failed(db_session, str(checkout_session.id))
+    # Otherwise (e.g. payment_status="unpaid", session_status="open"),
+    # leave the purchase row alone — frontend will keep polling and the
+    # webhook will eventually win the race.
+
+    # Refresh the user row so the response reflects any update applied
+    # by the fulfilment helpers in this same session.
+    await db_session.refresh(user)
+
+    if is_token:
+        purchase = (
+            await db_session.execute(
+                select(PremiumTokenPurchase).where(
+                    PremiumTokenPurchase.stripe_checkout_session_id
+                    == str(checkout_session.id)
+                )
+            )
+        ).scalar_one_or_none()
+        return FinalizeCheckoutResponse(
+            purchase_type="premium_tokens",
+            status=purchase.status.value if purchase else "pending",
+            premium_credit_micros_limit=user.premium_credit_micros_limit,
+            premium_credit_micros_used=user.premium_credit_micros_used,
+            premium_credit_micros_granted=(
+                purchase.credit_micros_granted if purchase else None
+            ),
+        )
+
+    purchase = (
+        await db_session.execute(
+            select(PagePurchase).where(
+                PagePurchase.stripe_checkout_session_id == str(checkout_session.id)
+            )
+        )
+    ).scalar_one_or_none()
+    return FinalizeCheckoutResponse(
+        purchase_type="page_packs",
+        status=purchase.status.value if purchase else "pending",
+        pages_limit=user.pages_limit,
+        pages_used=user.pages_used,
+        pages_granted=purchase.pages_granted if purchase else None,
+    )
 
 
 @router.get("/purchases", response_model=PagePurchaseHistoryResponse)
@@ -369,3 +735,165 @@ async def get_page_purchases(
     )
 
     return PagePurchaseHistoryResponse(purchases=purchases)
+
+
+# =============================================================================
+# Premium Token Purchase Routes
+# =============================================================================
+
+
+def _ensure_token_buying_enabled() -> None:
+    if not config.STRIPE_TOKEN_BUYING_ENABLED:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Premium token purchases are temporarily unavailable.",
+        )
+
+
+def _get_token_checkout_urls(search_space_id: int) -> tuple[str, str]:
+    if not config.NEXT_FRONTEND_URL:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="NEXT_FRONTEND_URL is not configured.",
+        )
+    base_url = config.NEXT_FRONTEND_URL.rstrip("/")
+    # See ``_get_checkout_urls`` for why session_id is appended.
+    success_url = (
+        f"{base_url}/dashboard/{search_space_id}/purchase-success"
+        f"?session_id={{CHECKOUT_SESSION_ID}}"
+    )
+    cancel_url = f"{base_url}/dashboard/{search_space_id}/purchase-cancel"
+    return success_url, cancel_url
+
+
+def _get_required_token_price_id() -> str:
+    if not config.STRIPE_PREMIUM_TOKEN_PRICE_ID:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="STRIPE_PREMIUM_TOKEN_PRICE_ID is not configured.",
+        )
+    return config.STRIPE_PREMIUM_TOKEN_PRICE_ID
+
+
+@router.post("/create-token-checkout-session")
+async def create_token_checkout_session(
+    body: CreateTokenCheckoutSessionRequest,
+    user: User = Depends(current_active_user),
+    db_session: AsyncSession = Depends(get_async_session),
+):
+    """Create a Stripe Checkout Session for buying premium credit packs.
+
+    Each pack grants ``STRIPE_CREDIT_MICROS_PER_UNIT`` micro-USD of
+    credit (default 1_000_000 = $1.00). The user's balance is debited
+    at the actual provider cost reported by LiteLLM at finalize time,
+    so $1 of credit always buys $1 worth of provider usage at cost.
+    """
+    _ensure_token_buying_enabled()
+    stripe_client = get_stripe_client()
+    price_id = _get_required_token_price_id()
+    success_url, cancel_url = _get_token_checkout_urls(body.search_space_id)
+    credit_micros_granted = body.quantity * config.STRIPE_CREDIT_MICROS_PER_UNIT
+
+    try:
+        checkout_session = stripe_client.v1.checkout.sessions.create(
+            params={
+                "mode": "payment",
+                "success_url": success_url,
+                "cancel_url": cancel_url,
+                "line_items": [
+                    {
+                        "price": price_id,
+                        "quantity": body.quantity,
+                    }
+                ],
+                "client_reference_id": str(user.id),
+                "customer_email": user.email,
+                "metadata": {
+                    "user_id": str(user.id),
+                    "quantity": str(body.quantity),
+                    "credit_micros_per_unit": str(config.STRIPE_CREDIT_MICROS_PER_UNIT),
+                    # Canonical value matched by ``_is_token_purchase``.
+                    # The legacy ``"premium_credit"`` is still accepted on
+                    # the read side for any in-flight sessions started
+                    # before this rename.
+                    "purchase_type": "premium_tokens",
+                },
+            }
+        )
+    except StripeError as exc:
+        logger.exception("Failed to create token checkout session for user %s", user.id)
+        raise HTTPException(
+            status_code=status.HTTP_502_BAD_GATEWAY,
+            detail="Unable to create Stripe checkout session.",
+        ) from exc
+
+    checkout_url = getattr(checkout_session, "url", None)
+    if not checkout_url:
+        raise HTTPException(
+            status_code=status.HTTP_502_BAD_GATEWAY,
+            detail="Stripe checkout session did not return a URL.",
+        )
+
+    db_session.add(
+        PremiumTokenPurchase(
+            user_id=user.id,
+            stripe_checkout_session_id=str(checkout_session.id),
+            stripe_payment_intent_id=_normalize_optional_string(
+                getattr(checkout_session, "payment_intent", None)
+            ),
+            quantity=body.quantity,
+            credit_micros_granted=credit_micros_granted,
+            amount_total=getattr(checkout_session, "amount_total", None),
+            currency=getattr(checkout_session, "currency", None),
+            status=PremiumTokenPurchaseStatus.PENDING,
+        )
+    )
+    await db_session.commit()
+
+    return CreateTokenCheckoutSessionResponse(checkout_url=checkout_url)
+
+
+@router.get("/token-status")
+async def get_token_status(
+    user: User = Depends(current_active_user),
+):
+    """Return token-buying availability and current premium credit quota for frontend.
+
+    Values are in micro-USD (1_000_000 = $1.00); the FE divides by 1M
+    when displaying. The route name is preserved for back-compat with
+    pinned client deployments.
+    """
+    used = user.premium_credit_micros_used
+    limit = user.premium_credit_micros_limit
+    return TokenStripeStatusResponse(
+        token_buying_enabled=config.STRIPE_TOKEN_BUYING_ENABLED,
+        premium_credit_micros_used=used,
+        premium_credit_micros_limit=limit,
+        premium_credit_micros_remaining=max(0, limit - used),
+    )
+
+
+@router.get("/token-purchases")
+async def get_token_purchases(
+    user: User = Depends(current_active_user),
+    db_session: AsyncSession = Depends(get_async_session),
+    offset: int = 0,
+    limit: int = 50,
+):
+    """Return the authenticated user's premium token purchase history."""
+    limit = min(limit, 100)
+    purchases = (
+        (
+            await db_session.execute(
+                select(PremiumTokenPurchase)
+                .where(PremiumTokenPurchase.user_id == user.id)
+                .order_by(PremiumTokenPurchase.created_at.desc())
+                .offset(offset)
+                .limit(limit)
+            )
+        )
+        .scalars()
+        .all()
+    )
+
+    return TokenPurchaseHistoryResponse(purchases=purchases)
diff --git a/surfsense_backend/app/routes/surfsense_docs_routes.py b/surfsense_backend/app/routes/surfsense_docs_routes.py
index e1713e8a3d..0d5428dec9 100644
--- a/surfsense_backend/app/routes/surfsense_docs_routes.py
+++ b/surfsense_backend/app/routes/surfsense_docs_routes.py
@@ -24,6 +24,7 @@
     SurfsenseDocsDocumentWithChunksRead,
 )
 from app.users import current_active_user
+from app.utils.surfsense_docs import surfsense_docs_public_url
 
 router = APIRouter()
 
@@ -76,6 +77,7 @@ async def get_surfsense_doc_by_chunk_id(
             id=document.id,
             title=document.title,
             source=document.source,
+            public_url=surfsense_docs_public_url(document.source),
             content=document.content,
             chunks=[
                 SurfsenseDocsChunkRead(id=c.id, content=c.content)
@@ -146,6 +148,7 @@ async def list_surfsense_docs(
                 id=doc.id,
                 title=doc.title,
                 source=doc.source,
+                public_url=surfsense_docs_public_url(doc.source),
                 content=doc.content,
                 created_at=doc.created_at,
                 updated_at=doc.updated_at,
diff --git a/surfsense_backend/app/routes/teams_add_connector_route.py b/surfsense_backend/app/routes/teams_add_connector_route.py
index 4442307ba3..9d0f5144fe 100644
--- a/surfsense_backend/app/routes/teams_add_connector_route.py
+++ b/surfsense_backend/app/routes/teams_add_connector_route.py
@@ -45,6 +45,7 @@
     "Team.ReadBasic.All",  # Read basic team information
     "Channel.ReadBasic.All",  # Read basic channel information
     "ChannelMessage.Read.All",  # Read messages in channels
+    "ChannelMessage.Send",  # Send messages in channels
 ]
 
 # Initialize security utilities
@@ -320,7 +321,7 @@ async def teams_callback(
         new_connector = SearchSourceConnector(
             name=connector_name,
             connector_type=SearchSourceConnectorType.TEAMS_CONNECTOR,
-            is_indexable=True,
+            is_indexable=False,
             config=connector_config,
             search_space_id=space_id,
             user_id=user_id,
diff --git a/surfsense_backend/app/routes/vision_llm_routes.py b/surfsense_backend/app/routes/vision_llm_routes.py
index 315c7c9fe8..e4f08f6041 100644
--- a/surfsense_backend/app/routes/vision_llm_routes.py
+++ b/surfsense_backend/app/routes/vision_llm_routes.py
@@ -82,10 +82,15 @@ async def get_global_vision_llm_configs(
                     "litellm_params": {},
                     "is_global": True,
                     "is_auto_mode": True,
+                    # Auto mode treated as free until per-deployment billing-tier
+                    # surfacing lands; see ``get_vision_llm`` for parity.
+                    "billing_tier": "free",
+                    "is_premium": False,
                 }
             )
 
         for cfg in global_configs:
+            billing_tier = str(cfg.get("billing_tier", "free")).lower()
             safe_configs.append(
                 {
                     "id": cfg.get("id"),
@@ -98,6 +103,14 @@ async def get_global_vision_llm_configs(
                     "api_version": cfg.get("api_version") or None,
                     "litellm_params": cfg.get("litellm_params", {}),
                     "is_global": True,
+                    "billing_tier": billing_tier,
+                    # Mirror chat (``new_llm_config_routes``) so the new-chat
+                    # selector's premium badge logic keys off the same
+                    # field across chat / image / vision tabs.
+                    "is_premium": billing_tier == "premium",
+                    "quota_reserve_tokens": cfg.get("quota_reserve_tokens"),
+                    "input_cost_per_token": cfg.get("input_cost_per_token"),
+                    "output_cost_per_token": cfg.get("output_cost_per_token"),
                 }
             )
 
diff --git a/surfsense_backend/app/schemas/image_generation.py b/surfsense_backend/app/schemas/image_generation.py
index 69f534e20c..4262b2b3f4 100644
--- a/surfsense_backend/app/schemas/image_generation.py
+++ b/surfsense_backend/app/schemas/image_generation.py
@@ -215,6 +215,12 @@ class GlobalImageGenConfigRead(BaseModel):
     Schema for reading global image generation configs from YAML.
     Global configs have negative IDs. API key is hidden.
     ID 0 is reserved for Auto mode (LiteLLM Router load balancing).
+
+    The ``billing_tier`` field allows the frontend to show a Premium/Free
+    badge and (more importantly) tells the backend whether to debit the
+    user's premium credit pool when this config is used. ``"free"`` is
+    the default for backward compatibility — admins must explicitly opt
+    a global config into ``"premium"``.
     """
 
     id: int = Field(
@@ -231,3 +237,24 @@ class GlobalImageGenConfigRead(BaseModel):
     litellm_params: dict[str, Any] | None = None
     is_global: bool = True
     is_auto_mode: bool = False
+    billing_tier: str = Field(
+        default="free",
+        description="'free' or 'premium'. Premium debits the user's premium credit pool (USD-cost-based).",
+    )
+    is_premium: bool = Field(
+        default=False,
+        description=(
+            "Convenience boolean derived server-side from "
+            "``billing_tier == 'premium'``. The new-chat model selector "
+            "keys its Free/Premium badge off this field for parity with "
+            "chat (`GlobalLLMConfigRead.is_premium`)."
+        ),
+    )
+    quota_reserve_micros: int | None = Field(
+        default=None,
+        description=(
+            "Optional override for the reservation amount (in micro-USD) used when "
+            "this image generation is premium. Falls back to "
+            "QUOTA_DEFAULT_IMAGE_RESERVE_MICROS when omitted."
+        ),
+    )
diff --git a/surfsense_backend/app/schemas/new_chat.py b/surfsense_backend/app/schemas/new_chat.py
index 5d8ae207eb..c5315cce52 100644
--- a/surfsense_backend/app/schemas/new_chat.py
+++ b/surfsense_backend/app/schemas/new_chat.py
@@ -7,12 +7,13 @@
 """
 
 from datetime import datetime
-from typing import Any, Literal
+from typing import Any, Literal, Self
 from uuid import UUID
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 from app.db import ChatVisibility, NewChatMessageRole
+from app.utils.user_message_multimodal import decode_base64_image, to_data_url
 
 from .base import IDModel, TimestampModel
 
@@ -34,6 +35,15 @@ class NewChatMessageCreate(NewChatMessageBase):
     thread_id: int
 
 
+class TokenUsageSummary(BaseModel):
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_tokens: int = 0
+    cost_micros: int = 0
+    model_breakdown: dict | None = None
+    model_config = ConfigDict(from_attributes=True)
+
+
 class NewChatMessageRead(NewChatMessageBase, IDModel, TimestampModel):
     """Schema for reading a message."""
 
@@ -41,6 +51,12 @@ class NewChatMessageRead(NewChatMessageBase, IDModel, TimestampModel):
     author_id: UUID | None = None
     author_display_name: str | None = None
     author_avatar_url: str | None = None
+    token_usage: TokenUsageSummary | None = None
+    # Per-turn correlation id (``f"{chat_id}:{ms}"``) from
+    # ``configurable.turn_id`` at streaming time. Nullable because
+    # legacy rows predate the column; clients should treat NULL as
+    # "edit-from-this-message is unavailable".
+    turn_id: str | None = None
     model_config = ConfigDict(from_attributes=True)
 
 
@@ -159,6 +175,62 @@ class ChatMessage(BaseModel):
     content: str
 
 
+class LocalFilesystemMountPayload(BaseModel):
+    mount_id: str
+    root_path: str
+
+
+MAX_NEW_CHAT_IMAGE_BYTES = 8 * 1024 * 1024
+MAX_NEW_CHAT_IMAGES = 4
+
+
+class NewChatUserImagePart(BaseModel):
+    """One inline image for a user turn (raw base64 body, no data: URL prefix)."""
+
+    media_type: Literal["image/png", "image/jpeg", "image/webp"]
+    data: str = Field(..., min_length=1)
+
+    @field_validator("data")
+    @classmethod
+    def _validate_payload(cls, v: str) -> str:
+        decode_base64_image(v, max_bytes=MAX_NEW_CHAT_IMAGE_BYTES)
+        return v
+
+    def as_data_url(self) -> str:
+        return to_data_url(self.media_type, self.data)
+
+
+class MentionedDocumentInfo(BaseModel):
+    """Display metadata for a single ``@``-mention chip.
+
+    Carries either a knowledge-base document or a knowledge-base folder
+    (discriminated by ``kind``). The full triple
+    ``{id, title, document_type}`` is forwarded by the frontend mention
+    chip so the server can embed it in the persisted user message
+    ``ContentPart[]`` (single ``mentioned-documents`` part). The
+    history loader then renders the chips on reload without an extra
+    fetch — mirrors the pre-refactor frontend ``persistUserTurn`` shape.
+
+    ``kind`` defaults to ``"doc"`` so legacy clients and persisted rows
+    that predate folder mentions deserialise unchanged.
+    """
+
+    id: int
+    title: str = Field(..., min_length=1, max_length=500)
+    document_type: str = Field(..., min_length=1, max_length=100)
+    kind: Literal["doc", "folder"] = Field(
+        default="doc",
+        description=(
+            "Discriminator for the chip's referent: ``doc`` is a "
+            "knowledge-base ``Document`` row, ``folder`` is a "
+            "knowledge-base ``Folder`` row. Folders carry the sentinel "
+            "``document_type='FOLDER'`` to keep the frontend dedup key "
+            "``(kind:document_type:id)`` from colliding doc and folder "
+            "ids that happen to share an integer value."
+        ),
+    )
+
+
 class NewChatRequest(BaseModel):
     """Request schema for the deep agent chat endpoint."""
 
@@ -172,9 +244,48 @@ class NewChatRequest(BaseModel):
     mentioned_surfsense_doc_ids: list[int] | None = (
         None  # Optional SurfSense documentation IDs mentioned with @ in the chat
     )
+    mentioned_folder_ids: list[int] | None = Field(
+        default=None,
+        description=(
+            "Optional knowledge-base folder IDs the user mentioned with "
+            "@. Resolved to virtual paths (``/documents/.../``) by "
+            "``mention_resolver`` and surfaced to the agent via "
+            "(a) backtick-wrapped substitution in ``user_query`` and "
+            "(b) a ``[USER-MENTIONED]`` entry in ``<priority_documents>``. "
+            "The agent's ``ls`` tool can then walk the folder itself."
+        ),
+    )
+    mentioned_documents: list[MentionedDocumentInfo] | None = Field(
+        default=None,
+        description=(
+            "Display metadata (id, title, document_type, kind) for every "
+            "@-mention chip — both documents and folders. Persisted as a "
+            "``mentioned-documents`` ContentPart on the user message so "
+            "reload renders chips without an extra fetch. Optional and "
+            "additive — when None the user message is persisted without "
+            "a mentioned-documents part."
+        ),
+    )
     disabled_tools: list[str] | None = (
         None  # Optional list of tool names the user has disabled from the UI
     )
+    filesystem_mode: Literal["cloud", "desktop_local_folder"] = "cloud"
+    client_platform: Literal["web", "desktop"] = "web"
+    local_filesystem_mounts: list[LocalFilesystemMountPayload] | None = None
+    user_images: list[NewChatUserImagePart] | None = Field(
+        default=None,
+        description="Optional images for this user turn",
+    )
+
+    @model_validator(mode="after")
+    def _require_text_or_images(self) -> Self:
+        has_text = bool(self.user_query.strip())
+        has_images = bool(self.user_images)
+        if not has_text and not has_images:
+            raise ValueError("Provide non-empty user_query and/or user_images")
+        if self.user_images is not None and len(self.user_images) > MAX_NEW_CHAT_IMAGES:
+            raise ValueError(f"At most {MAX_NEW_CHAT_IMAGES} images allowed")
+        return self
 
 
 class RegenerateRequest(BaseModel):
@@ -186,6 +297,18 @@ class RegenerateRequest(BaseModel):
     2. Reload: Leave user_query empty to regenerate the last AI response with the same query
 
     Both operations rewind the LangGraph checkpointer to the appropriate state.
+
+    For edit, optional user_images (when not None) replaces image URLs resolved from
+    checkpoint/DB so the client can send the full user turn (text and/or images).
+
+    Edit-from-arbitrary-position. When ``from_message_id`` is provided
+    the route slices conversation history starting at that message (instead of
+    the legacy "last 2 messages" rewind), rewinds the LangGraph checkpoint by
+    matching ``configurable.turn_id`` stored on the message (added in migration 136), and
+    optionally reverts every reversible action emitted in turns at or after
+    ``from_message_id``. The revert step is best-effort and runs BEFORE the
+    regenerate stream — partial failures are surfaced via SSE
+    ``data-revert-results`` and do not abort the regeneration.
     """
 
     search_space_id: int
@@ -194,7 +317,68 @@ class RegenerateRequest(BaseModel):
     )
     mentioned_document_ids: list[int] | None = None
     mentioned_surfsense_doc_ids: list[int] | None = None
+    mentioned_folder_ids: list[int] | None = Field(
+        default=None,
+        description=(
+            "Optional knowledge-base folder IDs the user mentioned with "
+            "@ on the edited user turn. Only used when ``user_query`` is "
+            "non-None (edit). Mirrors ``NewChatRequest.mentioned_folder_ids``."
+        ),
+    )
+    mentioned_documents: list[MentionedDocumentInfo] | None = Field(
+        default=None,
+        description=(
+            "Display metadata (id, title, document_type, kind) for every "
+            "@-mention chip on the edited user turn — both documents and "
+            "folders. Only used when ``user_query`` is non-None (edit). "
+            "Persisted as a ``mentioned-documents`` ContentPart on the "
+            "new user message. None means no chip metadata."
+        ),
+    )
     disabled_tools: list[str] | None = None
+    filesystem_mode: Literal["cloud", "desktop_local_folder"] = "cloud"
+    client_platform: Literal["web", "desktop"] = "web"
+    local_filesystem_mounts: list[LocalFilesystemMountPayload] | None = None
+    user_images: list[NewChatUserImagePart] | None = Field(
+        default=None,
+        description="If set, use these images for the regenerated turn (edit); overrides checkpoint/DB",
+    )
+    from_message_id: int | None = Field(
+        default=None,
+        description=(
+            "Message id to rewind to. When set, history is sliced "
+            "from this message forward and the LangGraph checkpoint is "
+            "rewound to the state immediately preceding this turn. Legacy "
+            "rows that predate migration 136 have ``turn_id=None`` and "
+            "still process — the route logs a warning, skips the "
+            "checkpoint rewind, and ignores ``revert_actions`` (no "
+            "chat_turn_id available to walk)."
+        ),
+    )
+    revert_actions: bool = Field(
+        default=False,
+        description=(
+            "When true, every reversible action emitted at or "
+            "after ``from_message_id`` is reverted before the regenerate "
+            "stream begins. Per-action results are surfaced via the "
+            "``data-revert-results`` SSE event. Partial failures DO NOT "
+            "abort the regeneration."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def _validate_regenerate_user_images(self) -> Self:
+        if self.user_images is not None and len(self.user_images) > MAX_NEW_CHAT_IMAGES:
+            raise ValueError(f"At most {MAX_NEW_CHAT_IMAGES} images allowed")
+        return self
+
+    @model_validator(mode="after")
+    def _validate_revert_actions_requires_from_message(self) -> Self:
+        if self.revert_actions and self.from_message_id is None:
+            raise ValueError(
+                "revert_actions requires from_message_id; specify which message to rewind to"
+            )
+        return self
 
 
 # =============================================================================
@@ -211,13 +395,57 @@ class AgentToolInfo(BaseModel):
 
 
 class ResumeDecision(BaseModel):
-    type: Literal["approve", "edit", "reject"]
+    type: Literal["approve", "edit", "reject", "approve_always"]
     edited_action: dict[str, Any] | None = None
 
 
 class ResumeRequest(BaseModel):
     search_space_id: int
     decisions: list[ResumeDecision]
+    # Mirrors ``NewChatRequest.disabled_tools`` so the resumed run sees the
+    # same tool surface as the originating turn.
+    disabled_tools: list[str] | None = None
+    filesystem_mode: Literal["cloud", "desktop_local_folder"] = "cloud"
+    client_platform: Literal["web", "desktop"] = "web"
+    local_filesystem_mounts: list[LocalFilesystemMountPayload] | None = None
+    mentioned_folder_ids: list[int] | None = Field(
+        default=None,
+        description=(
+            "Forwarded for symmetry with /new_chat and /regenerate. "
+            "Resume reuses the original interrupted user turn so this "
+            "field is informational only — the originating turn's "
+            "folder mentions already shaped the priority hints baked "
+            "into the agent's checkpoint."
+        ),
+    )
+    mentioned_documents: list[MentionedDocumentInfo] | None = Field(
+        default=None,
+        description=(
+            "Display metadata forwarded for symmetry with /new_chat and "
+            "/regenerate. Resume reuses the original interrupted user "
+            "turn so the server does not write a new user message. "
+            "Currently unused but accepted to keep request bodies "
+            "uniform across new-message, regenerate, and resume stream routes."
+        ),
+    )
+
+
+class CancelActiveTurnResponse(BaseModel):
+    """Response for canceling an active turn on a chat thread."""
+
+    status: Literal["cancelling", "idle"]
+    error_code: Literal["TURN_CANCELLING", "NO_ACTIVE_TURN"]
+    retry_after_ms: int | None = None
+    retry_after_at: int | None = None
+
+
+class TurnStatusResponse(BaseModel):
+    """Current turn execution status for a thread."""
+
+    status: Literal["idle", "busy", "cancelling"]
+    active_turn_id: str | None = None
+    retry_after_ms: int | None = None
+    retry_after_at: int | None = None
 
 
 # =============================================================================
diff --git a/surfsense_backend/app/schemas/new_llm_config.py b/surfsense_backend/app/schemas/new_llm_config.py
index 4fb3aa400b..e64478d380 100644
--- a/surfsense_backend/app/schemas/new_llm_config.py
+++ b/surfsense_backend/app/schemas/new_llm_config.py
@@ -92,6 +92,20 @@ class NewLLMConfigRead(NewLLMConfigBase):
     created_at: datetime
     search_space_id: int
     user_id: uuid.UUID
+    # Capability flag derived at the API boundary (no DB column). Default
+    # True matches the conservative-allow stance — a BYOK row that the
+    # route forgot to augment is not pre-judged. The streaming-task
+    # safety net is the only place a False actually blocks a request.
+    supports_image_input: bool = Field(
+        default=True,
+        description=(
+            "Whether the BYOK chat config can accept image inputs. Derived "
+            "at the route boundary from LiteLLM's authoritative model map "
+            "(``litellm.supports_vision``) — there is no DB column. "
+            "Default True is the conservative-allow stance for unknown / "
+            "unmapped models."
+        ),
+    )
 
     model_config = ConfigDict(from_attributes=True)
 
@@ -121,6 +135,15 @@ class NewLLMConfigPublic(BaseModel):
     created_at: datetime
     search_space_id: int
     user_id: uuid.UUID
+    # Capability flag derived at the API boundary (see NewLLMConfigRead).
+    supports_image_input: bool = Field(
+        default=True,
+        description=(
+            "Whether the BYOK chat config can accept image inputs. Derived "
+            "at the route boundary from LiteLLM's authoritative model map. "
+            "Default True is the conservative-allow stance."
+        ),
+    )
 
     model_config = ConfigDict(from_attributes=True)
 
@@ -164,6 +187,28 @@ class GlobalNewLLMConfigRead(BaseModel):
     is_global: bool = True  # Always true for global configs
     is_auto_mode: bool = False  # True only for Auto mode (ID 0)
 
+    billing_tier: str = "free"
+    is_premium: bool = False
+    anonymous_enabled: bool = False
+    seo_enabled: bool = False
+    seo_slug: str | None = None
+    seo_title: str | None = None
+    seo_description: str | None = None
+    quota_reserve_tokens: int | None = None
+    supports_image_input: bool = Field(
+        default=True,
+        description=(
+            "Whether the model accepts image inputs (multimodal vision). "
+            "Derived server-side: OpenRouter dynamic configs use "
+            "``architecture.input_modalities``; YAML / BYOK use LiteLLM's "
+            "authoritative model map (``litellm.supports_vision``). The "
+            "new-chat selector hints with a 'No image' badge when this is "
+            "False and there are pending image attachments. The streaming "
+            "task fails fast only when LiteLLM *explicitly* marks a model "
+            "as text-only — unknown / unmapped models default-allow."
+        ),
+    )
+
 
 # =============================================================================
 # LLM Preferences Schemas (for role assignments)
diff --git a/surfsense_backend/app/schemas/obsidian_auth_credentials.py b/surfsense_backend/app/schemas/obsidian_auth_credentials.py
deleted file mode 100644
index ab178eac86..0000000000
--- a/surfsense_backend/app/schemas/obsidian_auth_credentials.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""
-Obsidian Connector Credentials Schema.
-
-Obsidian is a local-first note-taking app that stores notes as markdown files.
-This connector supports indexing from local file system (self-hosted only).
-"""
-
-from pydantic import BaseModel, field_validator
-
-
-class ObsidianAuthCredentialsBase(BaseModel):
-    """
-    Credentials/configuration for the Obsidian connector.
-
-    Since Obsidian vaults are local directories, this schema primarily
-    holds the vault path and configuration options rather than API tokens.
-    """
-
-    vault_path: str
-    vault_name: str | None = None
-    exclude_folders: list[str] | None = None
-    include_attachments: bool = False
-
-    @field_validator("vault_path")
-    @classmethod
-    def validate_vault_path(cls, v: str) -> str:
-        """Ensure vault path is provided and stripped of whitespace."""
-        if not v or not v.strip():
-            raise ValueError("Vault path is required")
-        return v.strip()
-
-    @field_validator("exclude_folders", mode="before")
-    @classmethod
-    def parse_exclude_folders(cls, v):
-        """Parse exclude_folders from string if needed."""
-        if v is None:
-            return [".trash", ".obsidian", "templates"]
-        if isinstance(v, str):
-            return [f.strip() for f in v.split(",") if f.strip()]
-        return v
-
-    def to_dict(self) -> dict:
-        """Convert credentials to dictionary for storage."""
-        return {
-            "vault_path": self.vault_path,
-            "vault_name": self.vault_name,
-            "exclude_folders": self.exclude_folders,
-            "include_attachments": self.include_attachments,
-        }
-
-    @classmethod
-    def from_dict(cls, data: dict) -> "ObsidianAuthCredentialsBase":
-        """Create credentials from dictionary."""
-        return cls(
-            vault_path=data.get("vault_path", ""),
-            vault_name=data.get("vault_name"),
-            exclude_folders=data.get("exclude_folders"),
-            include_attachments=data.get("include_attachments", False),
-        )
diff --git a/surfsense_backend/app/schemas/obsidian_plugin.py b/surfsense_backend/app/schemas/obsidian_plugin.py
new file mode 100644
index 0000000000..89be08c8eb
--- /dev/null
+++ b/surfsense_backend/app/schemas/obsidian_plugin.py
@@ -0,0 +1,234 @@
+"""Wire schemas spoken between the SurfSense Obsidian plugin and the backend.
+
+All schemas inherit ``extra='ignore'`` from :class:`_PluginBase` so additive
+field changes never break either side; hard breaks live behind a new URL
+prefix (``/api/v2/...``).
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+_PLUGIN_MODEL_CONFIG = ConfigDict(extra="ignore")
+
+
+# Source of truth for the attachment whitelist. Mirrors MIME_BY_EXTENSION in
+# surfsense_obsidian/src/sync-engine.ts — keep in sync.
+ATTACHMENT_MIME_TYPES: dict[str, str] = {
+    "pdf": "application/pdf",
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "gif": "image/gif",
+    "webp": "image/webp",
+    "svg": "image/svg+xml",
+    "txt": "text/plain",
+}
+ALLOWED_ATTACHMENT_EXTENSIONS: frozenset[str] = frozenset(ATTACHMENT_MIME_TYPES)
+
+
+class _PluginBase(BaseModel):
+    """Base schema carrying the shared forward-compatibility config."""
+
+    model_config = _PLUGIN_MODEL_CONFIG
+
+
+class HeadingRef(_PluginBase):
+    """One markdown heading extracted from Obsidian metadata cache."""
+
+    heading: str
+    level: int = Field(ge=1, le=6)
+
+
+class NotePayload(_PluginBase):
+    """One Obsidian note as pushed by the plugin (the source of truth)."""
+
+    vault_id: str = Field(
+        ..., description="Stable plugin-generated UUID for this vault"
+    )
+    path: str = Field(..., description="Vault-relative path, e.g. 'notes/foo.md'")
+    name: str = Field(..., description="File stem (no extension)")
+    extension: str = Field(
+        default="md", description="File extension without leading dot"
+    )
+    content: str = Field(default="", description="Raw markdown body (post-frontmatter)")
+
+    frontmatter: dict[str, Any] = Field(default_factory=dict)
+    tags: list[str] = Field(default_factory=list)
+    headings: list[HeadingRef] = Field(default_factory=list)
+    resolved_links: list[str] = Field(default_factory=list)
+    unresolved_links: list[str] = Field(default_factory=list)
+    embeds: list[str] = Field(default_factory=list)
+    aliases: list[str] = Field(default_factory=list)
+
+    content_hash: str = Field(
+        ..., description="Plugin-computed SHA-256 of the raw content"
+    )
+    is_binary: bool = Field(
+        default=False,
+        description=(
+            "True when payload represents a non-markdown attachment. "
+            "If set, the plugin may include binary_base64 for ETL extraction."
+        ),
+    )
+    binary_base64: str | None = Field(
+        default=None,
+        description=(
+            "Base64-encoded raw file bytes for binary attachments. "
+            "Used by the backend ETL pipeline."
+        ),
+    )
+    mime_type: str | None = Field(
+        default=None,
+        description="Optional MIME type hint for binary attachments.",
+    )
+    size: int | None = Field(
+        default=None,
+        ge=0,
+        description="Byte size of the local file (mtime+size short-circuit signal). Optional for forward compatibility.",
+    )
+    mtime: datetime
+    ctime: datetime
+
+    @model_validator(mode="after")
+    def _enforce_binary_invariants(self) -> NotePayload:
+        if self.is_binary:
+            if not self.binary_base64:
+                raise ValueError("binary_base64 is required when is_binary is True")
+            if not self.mime_type:
+                raise ValueError("mime_type is required when is_binary is True")
+        elif self.binary_base64 is not None or self.mime_type is not None:
+            raise ValueError(
+                "binary_base64 and mime_type must be omitted when is_binary is False",
+            )
+        return self
+
+
+class SyncBatchRequest(_PluginBase):
+    """Batch upsert; plugin sends 10-20 notes per request."""
+
+    vault_id: str
+    notes: list[NotePayload] = Field(default_factory=list, max_length=100)
+
+
+class RenameItem(_PluginBase):
+    old_path: str
+    new_path: str
+
+
+class RenameBatchRequest(_PluginBase):
+    vault_id: str
+    renames: list[RenameItem] = Field(default_factory=list, max_length=200)
+
+
+class DeleteBatchRequest(_PluginBase):
+    vault_id: str
+    paths: list[str] = Field(default_factory=list, max_length=500)
+
+
+class ManifestEntry(_PluginBase):
+    hash: str
+    mtime: datetime
+    size: int | None = Field(
+        default=None,
+        description="Byte size last seen by the server. Enables mtime+size short-circuit; absent when not yet recorded.",
+    )
+
+
+class ManifestResponse(_PluginBase):
+    """Path-keyed manifest of every non-deleted note for a vault."""
+
+    vault_id: str
+    items: dict[str, ManifestEntry] = Field(default_factory=dict)
+
+
+class ConnectRequest(_PluginBase):
+    """Vault registration / heartbeat. Replayed on every plugin onload."""
+
+    vault_id: str
+    vault_name: str
+    search_space_id: int
+    vault_fingerprint: str = Field(
+        ...,
+        description=(
+            "Deterministic SHA-256 over the sorted markdown paths in the vault "
+            "(plus vault_name). Same vault content on any device produces the "
+            "same value; the server uses it to dedup connectors across devices."
+        ),
+    )
+
+
+class ConnectResponse(_PluginBase):
+    """Carries the same handshake fields as ``HealthResponse`` so the plugin
+    learns the contract without a separate ``GET /health`` round-trip."""
+
+    connector_id: int
+    vault_id: str
+    search_space_id: int
+    capabilities: list[str]
+    server_time_utc: datetime
+
+
+class HealthResponse(_PluginBase):
+    """API contract handshake. ``capabilities`` is additive-only string list."""
+
+    capabilities: list[str]
+    server_time_utc: datetime
+
+
+# Per-item batch ack schemas — wire shape is load-bearing for the plugin
+# queue (see api-client.ts / sync-engine.ts:processBatch).
+
+
+class SyncAckItem(_PluginBase):
+    path: str
+    status: Literal["ok", "queued", "error"]
+    document_id: int | None = None
+    error: str | None = None
+
+
+class SyncAck(_PluginBase):
+    vault_id: str
+    indexed: int
+    failed: int
+    items: list[SyncAckItem] = Field(default_factory=list)
+
+
+class RenameAckItem(_PluginBase):
+    old_path: str
+    new_path: str
+    # ``missing`` is treated as success client-side (end state reached).
+    status: Literal["ok", "error", "missing"]
+    document_id: int | None = None
+    error: str | None = None
+
+
+class RenameAck(_PluginBase):
+    vault_id: str
+    renamed: int
+    missing: int
+    items: list[RenameAckItem] = Field(default_factory=list)
+
+
+class DeleteAckItem(_PluginBase):
+    path: str
+    status: Literal["ok", "error", "missing"]
+    error: str | None = None
+
+
+class DeleteAck(_PluginBase):
+    vault_id: str
+    deleted: int
+    missing: int
+    items: list[DeleteAckItem] = Field(default_factory=list)
+
+
+class StatsResponse(_PluginBase):
+    """Backs the Obsidian connector tile in the web UI."""
+
+    vault_id: str
+    files_synced: int
+    last_sync_at: datetime | None = None
diff --git a/surfsense_backend/app/schemas/reports.py b/surfsense_backend/app/schemas/reports.py
index 9a77655072..25ca506077 100644
--- a/surfsense_backend/app/schemas/reports.py
+++ b/surfsense_backend/app/schemas/reports.py
@@ -23,6 +23,7 @@ class ReportRead(BaseModel):
     report_style: str | None = None
     report_metadata: dict[str, Any] | None = None
     report_group_id: int | None = None
+    content_type: str = "markdown"
     created_at: datetime
 
     class Config:
@@ -40,11 +41,12 @@ class Config:
 
 
 class ReportContentRead(BaseModel):
-    """Schema for reading a report with full Markdown content."""
+    """Schema for reading a report with full content (Markdown or Typst)."""
 
     id: int
     title: str
     content: str | None = None
+    content_type: str = "markdown"
     report_metadata: dict[str, Any] | None = None
     report_group_id: int | None = None
     versions: list[ReportVersionInfo] = []
diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py
index 1b0ed0b13a..aac7b92d51 100644
--- a/surfsense_backend/app/schemas/search_source_connector.py
+++ b/surfsense_backend/app/schemas/search_source_connector.py
@@ -17,6 +17,7 @@ class SearchSourceConnectorBase(BaseModel):
     last_indexed_at: datetime | None = None
     config: dict[str, Any]
     enable_summary: bool = False
+    enable_vision_llm: bool = False
     periodic_indexing_enabled: bool = False
     indexing_frequency_minutes: int | None = None
     next_scheduled_at: datetime | None = None
@@ -67,6 +68,7 @@ class SearchSourceConnectorUpdate(BaseModel):
     last_indexed_at: datetime | None = None
     config: dict[str, Any] | None = None
     enable_summary: bool | None = None
+    enable_vision_llm: bool | None = None
     periodic_indexing_enabled: bool | None = None
     indexing_frequency_minutes: int | None = None
     next_scheduled_at: datetime | None = None
diff --git a/surfsense_backend/app/schemas/search_space.py b/surfsense_backend/app/schemas/search_space.py
index 054fe14654..77e34ea4be 100644
--- a/surfsense_backend/app/schemas/search_space.py
+++ b/surfsense_backend/app/schemas/search_space.py
@@ -21,6 +21,8 @@ class SearchSpaceUpdate(BaseModel):
     description: str | None = None
     citations_enabled: bool | None = None
     qna_custom_instructions: str | None = None
+    shared_memory_md: str | None = None
+    ai_file_sort_enabled: bool | None = None
 
 
 class SearchSpaceRead(SearchSpaceBase, IDModel, TimestampModel):
@@ -29,6 +31,8 @@ class SearchSpaceRead(SearchSpaceBase, IDModel, TimestampModel):
     user_id: uuid.UUID
     citations_enabled: bool
     qna_custom_instructions: str | None = None
+    shared_memory_md: str | None = None
+    ai_file_sort_enabled: bool = False
 
     model_config = ConfigDict(from_attributes=True)
 
diff --git a/surfsense_backend/app/schemas/stripe.py b/surfsense_backend/app/schemas/stripe.py
index 1c31856011..ad13ddf044 100644
--- a/surfsense_backend/app/schemas/stripe.py
+++ b/surfsense_backend/app/schemas/stripe.py
@@ -54,3 +54,76 @@ class StripeWebhookResponse(BaseModel):
     """Generic acknowledgement for Stripe webhook delivery."""
 
     received: bool = True
+
+
+class FinalizeCheckoutResponse(BaseModel):
+    """Response from /stripe/finalize-checkout.
+
+    Returned by the success page so the UI can show the post-purchase
+    balance immediately, even when the Stripe webhook hasn't been
+    delivered yet. ``status`` mirrors the underlying purchase row
+    (``pending`` / ``completed`` / ``failed``); the FE polls this
+    endpoint until it sees ``completed`` or a final ``failed``.
+    """
+
+    purchase_type: str  # "page_packs" | "premium_tokens"
+    status: str  # PagePurchaseStatus / PremiumTokenPurchaseStatus value
+    pages_limit: int | None = None
+    pages_used: int | None = None
+    pages_granted: int | None = None
+    premium_credit_micros_limit: int | None = None
+    premium_credit_micros_used: int | None = None
+    premium_credit_micros_granted: int | None = None
+
+
+class CreateTokenCheckoutSessionRequest(BaseModel):
+    """Request body for creating a premium token purchase checkout session."""
+
+    quantity: int = Field(ge=1, le=100)
+    search_space_id: int = Field(ge=1)
+
+
+class CreateTokenCheckoutSessionResponse(BaseModel):
+    """Response containing the Stripe-hosted checkout URL."""
+
+    checkout_url: str
+
+
+class TokenPurchaseRead(BaseModel):
+    """Serialized premium credit purchase record.
+
+    ``credit_micros_granted`` is in micro-USD (1_000_000 = $1.00). The
+    schema name kept ``Token`` for API back-compat with pinned clients.
+    """
+
+    id: uuid.UUID
+    stripe_checkout_session_id: str
+    stripe_payment_intent_id: str | None = None
+    quantity: int
+    credit_micros_granted: int
+    amount_total: int | None = None
+    currency: str | None = None
+    status: str
+    completed_at: datetime | None = None
+    created_at: datetime
+
+    model_config = ConfigDict(from_attributes=True)
+
+
+class TokenPurchaseHistoryResponse(BaseModel):
+    """Response containing the user's premium credit purchases."""
+
+    purchases: list[TokenPurchaseRead]
+
+
+class TokenStripeStatusResponse(BaseModel):
+    """Response describing premium-credit-buying availability and balance.
+
+    All ``premium_credit_micros_*`` fields are in micro-USD; the FE
+    divides by 1_000_000 to display USD.
+    """
+
+    token_buying_enabled: bool
+    premium_credit_micros_used: int = 0
+    premium_credit_micros_limit: int = 0
+    premium_credit_micros_remaining: int = 0
diff --git a/surfsense_backend/app/schemas/surfsense_docs.py b/surfsense_backend/app/schemas/surfsense_docs.py
index ce32c0ef8c..3adf25032e 100644
--- a/surfsense_backend/app/schemas/surfsense_docs.py
+++ b/surfsense_backend/app/schemas/surfsense_docs.py
@@ -22,6 +22,7 @@ class SurfsenseDocsDocumentRead(BaseModel):
     id: int
     title: str
     source: str
+    public_url: str
     content: str
     created_at: datetime | None = None
     updated_at: datetime | None = None
@@ -35,6 +36,7 @@ class SurfsenseDocsDocumentWithChunksRead(BaseModel):
     id: int
     title: str
     source: str
+    public_url: str
     content: str
     chunks: list[SurfsenseDocsChunkRead]
 
diff --git a/surfsense_backend/app/schemas/vision_llm.py b/surfsense_backend/app/schemas/vision_llm.py
index ab2e609dc2..d0eeaf5c67 100644
--- a/surfsense_backend/app/schemas/vision_llm.py
+++ b/surfsense_backend/app/schemas/vision_llm.py
@@ -62,6 +62,15 @@ class VisionLLMConfigPublic(BaseModel):
 
 
 class GlobalVisionLLMConfigRead(BaseModel):
+    """Schema for reading global vision LLM configs from YAML.
+
+    The ``billing_tier`` field allows the frontend to show a Premium/Free
+    badge and (more importantly) tells the backend whether to debit the
+    user's premium credit pool when this config is used. ``"free"`` is
+    the default for backward compatibility — admins must explicitly opt
+    a global config into ``"premium"``.
+    """
+
     id: int = Field(...)
     name: str
     description: str | None = None
@@ -73,3 +82,35 @@ class GlobalVisionLLMConfigRead(BaseModel):
     litellm_params: dict[str, Any] | None = None
     is_global: bool = True
     is_auto_mode: bool = False
+    billing_tier: str = Field(
+        default="free",
+        description="'free' or 'premium'. Premium debits the user's premium credit pool (USD-cost-based).",
+    )
+    is_premium: bool = Field(
+        default=False,
+        description=(
+            "Convenience boolean derived server-side from "
+            "``billing_tier == 'premium'``. The new-chat model selector "
+            "keys its Free/Premium badge off this field for parity with "
+            "chat (`GlobalLLMConfigRead.is_premium`)."
+        ),
+    )
+    quota_reserve_tokens: int | None = Field(
+        default=None,
+        description=(
+            "Optional override for the per-call reservation in *tokens* — "
+            "converted to micro-USD via the model's input/output prices at "
+            "reservation time. Falls back to QUOTA_DEFAULT_RESERVE_TOKENS."
+        ),
+    )
+    input_cost_per_token: float | None = Field(
+        default=None,
+        description=(
+            "Optional input price in USD/token. Used by pricing_registration to "
+            "register custom Azure / OpenRouter aliases with LiteLLM at startup."
+        ),
+    )
+    output_cost_per_token: float | None = Field(
+        default=None,
+        description="Optional output price in USD/token. Pair with input_cost_per_token.",
+    )
diff --git a/surfsense_backend/app/services/ai_file_sort_service.py b/surfsense_backend/app/services/ai_file_sort_service.py
new file mode 100644
index 0000000000..2f04131a60
--- /dev/null
+++ b/surfsense_backend/app/services/ai_file_sort_service.py
@@ -0,0 +1,329 @@
+"""AI File Sort Service: builds connector-type/date/category/subcategory folder paths."""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from datetime import UTC, datetime
+
+from langchain_core.messages import HumanMessage
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+from sqlalchemy.orm import selectinload
+
+from app.db import (
+    Chunk,
+    Document,
+    DocumentType,
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+)
+from app.services.folder_service import ensure_folder_hierarchy_with_depth_validation
+
+logger = logging.getLogger(__name__)
+
+_DOCTYPE_TO_CONNECTOR_LABEL: dict[str, str] = {
+    DocumentType.EXTENSION: "Browser Extension",
+    DocumentType.CRAWLED_URL: "Web Crawl",
+    DocumentType.FILE: "File Upload",
+    DocumentType.SLACK_CONNECTOR: "Slack",
+    DocumentType.TEAMS_CONNECTOR: "Teams",
+    DocumentType.ONEDRIVE_FILE: "OneDrive",
+    DocumentType.NOTION_CONNECTOR: "Notion",
+    DocumentType.YOUTUBE_VIDEO: "YouTube",
+    DocumentType.GITHUB_CONNECTOR: "GitHub",
+    DocumentType.LINEAR_CONNECTOR: "Linear",
+    DocumentType.DISCORD_CONNECTOR: "Discord",
+    DocumentType.JIRA_CONNECTOR: "Jira",
+    DocumentType.CONFLUENCE_CONNECTOR: "Confluence",
+    DocumentType.CLICKUP_CONNECTOR: "ClickUp",
+    DocumentType.GOOGLE_CALENDAR_CONNECTOR: "Google Calendar",
+    DocumentType.GOOGLE_GMAIL_CONNECTOR: "Gmail",
+    DocumentType.GOOGLE_DRIVE_FILE: "Google Drive",
+    DocumentType.AIRTABLE_CONNECTOR: "Airtable",
+    DocumentType.LUMA_CONNECTOR: "Luma",
+    DocumentType.ELASTICSEARCH_CONNECTOR: "Elasticsearch",
+    DocumentType.BOOKSTACK_CONNECTOR: "BookStack",
+    DocumentType.CIRCLEBACK: "Circleback",
+    DocumentType.OBSIDIAN_CONNECTOR: "Obsidian",
+    DocumentType.NOTE: "Notes",
+    DocumentType.DROPBOX_FILE: "Dropbox",
+    DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive (Composio)",
+    DocumentType.COMPOSIO_GMAIL_CONNECTOR: "Gmail (Composio)",
+    DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar (Composio)",
+    DocumentType.LOCAL_FOLDER_FILE: "Local Folder",
+}
+
+_CONNECTOR_TYPE_LABEL: dict[str, str] = {
+    SearchSourceConnectorType.SERPER_API: "Serper Search",
+    SearchSourceConnectorType.TAVILY_API: "Tavily Search",
+    SearchSourceConnectorType.SEARXNG_API: "SearXNG Search",
+    SearchSourceConnectorType.LINKUP_API: "Linkup Search",
+    SearchSourceConnectorType.BAIDU_SEARCH_API: "Baidu Search",
+    SearchSourceConnectorType.SLACK_CONNECTOR: "Slack",
+    SearchSourceConnectorType.TEAMS_CONNECTOR: "Teams",
+    SearchSourceConnectorType.ONEDRIVE_CONNECTOR: "OneDrive",
+    SearchSourceConnectorType.NOTION_CONNECTOR: "Notion",
+    SearchSourceConnectorType.GITHUB_CONNECTOR: "GitHub",
+    SearchSourceConnectorType.LINEAR_CONNECTOR: "Linear",
+    SearchSourceConnectorType.DISCORD_CONNECTOR: "Discord",
+    SearchSourceConnectorType.JIRA_CONNECTOR: "Jira",
+    SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "Confluence",
+    SearchSourceConnectorType.CLICKUP_CONNECTOR: "ClickUp",
+    SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR: "Google Calendar",
+    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR: "Gmail",
+    SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: "Google Drive",
+    SearchSourceConnectorType.AIRTABLE_CONNECTOR: "Airtable",
+    SearchSourceConnectorType.LUMA_CONNECTOR: "Luma",
+    SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "Elasticsearch",
+    SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "Web Crawl",
+    SearchSourceConnectorType.BOOKSTACK_CONNECTOR: "BookStack",
+    SearchSourceConnectorType.CIRCLEBACK_CONNECTOR: "Circleback",
+    SearchSourceConnectorType.OBSIDIAN_CONNECTOR: "Obsidian",
+    SearchSourceConnectorType.MCP_CONNECTOR: "MCP",
+    SearchSourceConnectorType.DROPBOX_CONNECTOR: "Dropbox",
+    SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Google Drive (Composio)",
+    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: "Gmail (Composio)",
+    SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Google Calendar (Composio)",
+}
+
+_MAX_CONTENT_CHARS = 4000
+_MAX_CHUNKS_FOR_CONTEXT = 5
+
+_CATEGORY_PROMPT = (
+    "Based on the document information below, classify it into a broad category "
+    "and a more specific subcategory.\n\n"
+    "Rules:\n"
+    "- category: 1-2 word broad theme (e.g. Science, Finance, Engineering, Communication, Media)\n"
+    "- subcategory: 1-2 word specific topic within the category "
+    "(e.g. Physics, Tax Reports, Backend, Team Updates)\n"
+    "- Use nouns only. Do not include generic terms like 'General' or 'Miscellaneous'.\n\n"
+    "Title: {title}\n\n"
+    "Content: {summary}\n\n"
+    'Respond with ONLY a JSON object: {{"category": "...", "subcategory": "..."}}'
+)
+
+_SAFE_NAME_RE = re.compile(r"[^a-zA-Z0-9 _\-()]")
+_FALLBACK_CATEGORY = "Uncategorized"
+_FALLBACK_SUBCATEGORY = "General"
+
+
+def resolve_root_folder_label(
+    document: Document, connector: SearchSourceConnector | None
+) -> str:
+    if connector is not None:
+        return _CONNECTOR_TYPE_LABEL.get(
+            connector.connector_type, str(connector.connector_type)
+        )
+    return _DOCTYPE_TO_CONNECTOR_LABEL.get(
+        document.document_type, str(document.document_type)
+    )
+
+
+def resolve_date_folder(document: Document) -> str:
+    ts = document.updated_at or document.created_at
+    if ts is None:
+        ts = datetime.now(UTC)
+    return ts.strftime("%Y-%m-%d")
+
+
+def sanitize_category_folder_name(
+    value: str | None, fallback: str = _FALLBACK_CATEGORY
+) -> str:
+    if not value or not value.strip():
+        return fallback
+    cleaned = _SAFE_NAME_RE.sub("", value.strip())
+    cleaned = " ".join(cleaned.split())
+    if not cleaned:
+        return fallback
+    return cleaned[:50]
+
+
+async def _resolve_document_text(
+    session: AsyncSession,
+    document: Document,
+) -> str:
+    """Build the best available text representation for taxonomy generation.
+
+    Prefers ``document.content``; falls back to joining the first few chunks
+    when content is empty or too short to be useful.
+    """
+    text = (document.content or "").strip()
+    if len(text) >= 100:
+        return text[:_MAX_CONTENT_CHARS]
+
+    stmt = (
+        select(Chunk.content)
+        .where(Chunk.document_id == document.id)
+        .order_by(Chunk.id)
+        .limit(_MAX_CHUNKS_FOR_CONTEXT)
+    )
+    result = await session.execute(stmt)
+    chunk_texts = [row[0] for row in result.all() if row[0]]
+    if chunk_texts:
+        combined = "\n\n".join(chunk_texts)
+        return combined[:_MAX_CONTENT_CHARS]
+
+    return text[:_MAX_CONTENT_CHARS]
+
+
+def _get_cached_taxonomy(document: Document) -> tuple[str, str] | None:
+    """Return (category, subcategory) from document metadata cache, or None."""
+    meta = document.document_metadata
+    if not isinstance(meta, dict):
+        return None
+    cat = meta.get("ai_sort_category")
+    subcat = meta.get("ai_sort_subcategory")
+    if cat and subcat and isinstance(cat, str) and isinstance(subcat, str):
+        return cat, subcat
+    return None
+
+
+def _set_cached_taxonomy(document: Document, category: str, subcategory: str) -> None:
+    """Persist the AI taxonomy on document metadata for deterministic re-sorts."""
+    meta = dict(document.document_metadata or {})
+    meta["ai_sort_category"] = category
+    meta["ai_sort_subcategory"] = subcategory
+    document.document_metadata = meta
+
+
+async def generate_ai_taxonomy(
+    title: str,
+    summary_or_content: str,
+    llm,
+) -> tuple[str, str]:
+    """Return (category, subcategory) using a single structured LLM call."""
+    text = (summary_or_content or "").strip()
+    if not text:
+        return _FALLBACK_CATEGORY, _FALLBACK_SUBCATEGORY
+
+    if len(text) > _MAX_CONTENT_CHARS:
+        text = text[:_MAX_CONTENT_CHARS]
+
+    prompt = _CATEGORY_PROMPT.format(title=title or "Untitled", summary=text)
+    try:
+        result = await llm.ainvoke([HumanMessage(content=prompt)])
+        raw = result.content.strip()
+        if raw.startswith("```"):
+            raw = raw.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
+        parsed = json.loads(raw)
+        category = sanitize_category_folder_name(
+            parsed.get("category"), _FALLBACK_CATEGORY
+        )
+        subcategory = sanitize_category_folder_name(
+            parsed.get("subcategory"), _FALLBACK_SUBCATEGORY
+        )
+        return category, subcategory
+    except Exception:
+        logger.warning("AI taxonomy generation failed, using fallback", exc_info=True)
+        return _FALLBACK_CATEGORY, _FALLBACK_SUBCATEGORY
+
+
+def _build_path_segments(
+    root_label: str,
+    date_label: str,
+    category: str,
+    subcategory: str,
+) -> list[dict]:
+    return [
+        {"name": root_label, "metadata": {"ai_sort": True, "ai_sort_level": 1}},
+        {"name": date_label, "metadata": {"ai_sort": True, "ai_sort_level": 2}},
+        {"name": category, "metadata": {"ai_sort": True, "ai_sort_level": 3}},
+        {"name": subcategory, "metadata": {"ai_sort": True, "ai_sort_level": 4}},
+    ]
+
+
+async def _resolve_taxonomy(
+    session: AsyncSession,
+    document: Document,
+    llm,
+) -> tuple[str, str]:
+    """Return (category, subcategory), reusing cached values when available."""
+    cached = _get_cached_taxonomy(document)
+    if cached is not None:
+        return cached
+
+    content_text = await _resolve_document_text(session, document)
+    category, subcategory = await generate_ai_taxonomy(
+        document.title, content_text, llm
+    )
+    _set_cached_taxonomy(document, category, subcategory)
+    return category, subcategory
+
+
+async def ai_sort_document(
+    session: AsyncSession,
+    document: Document,
+    llm,
+) -> Document:
+    """Sort a single document into the 4-level AI folder hierarchy."""
+    connector: SearchSourceConnector | None = None
+    if document.connector_id is not None:
+        connector = await session.get(SearchSourceConnector, document.connector_id)
+
+    root_label = resolve_root_folder_label(document, connector)
+    date_label = resolve_date_folder(document)
+
+    category, subcategory = await _resolve_taxonomy(session, document, llm)
+
+    segments = _build_path_segments(root_label, date_label, category, subcategory)
+
+    leaf_folder = await ensure_folder_hierarchy_with_depth_validation(
+        session,
+        document.search_space_id,
+        segments,
+    )
+
+    document.folder_id = leaf_folder.id
+    await session.flush()
+    return document
+
+
+async def ai_sort_all_documents(
+    session: AsyncSession,
+    search_space_id: int,
+    llm,
+) -> tuple[int, int]:
+    """Sort all documents in a search space. Returns (sorted_count, failed_count)."""
+    stmt = (
+        select(Document)
+        .where(Document.search_space_id == search_space_id)
+        .options(selectinload(Document.connector))
+    )
+    result = await session.execute(stmt)
+    documents = list(result.scalars().all())
+
+    sorted_count = 0
+    failed_count = 0
+
+    for doc in documents:
+        try:
+            connector = doc.connector
+            root_label = resolve_root_folder_label(doc, connector)
+            date_label = resolve_date_folder(doc)
+
+            category, subcategory = await _resolve_taxonomy(session, doc, llm)
+            segments = _build_path_segments(
+                root_label, date_label, category, subcategory
+            )
+
+            leaf_folder = await ensure_folder_hierarchy_with_depth_validation(
+                session,
+                search_space_id,
+                segments,
+            )
+            doc.folder_id = leaf_folder.id
+            sorted_count += 1
+        except Exception:
+            logger.error("Failed to AI-sort document %s", doc.id, exc_info=True)
+            failed_count += 1
+
+    await session.commit()
+    logger.info(
+        "AI sort complete for search_space=%d: sorted=%d, failed=%d",
+        search_space_id,
+        sorted_count,
+        failed_count,
+    )
+    return sorted_count, failed_count
diff --git a/surfsense_backend/app/services/auto_model_pin_service.py b/surfsense_backend/app/services/auto_model_pin_service.py
new file mode 100644
index 0000000000..9bbca8669c
--- /dev/null
+++ b/surfsense_backend/app/services/auto_model_pin_service.py
@@ -0,0 +1,479 @@
+"""Resolve and persist Auto (Fastest) model pins per chat thread.
+
+Auto (Fastest) is represented by ``agent_llm_id == 0``. For chat threads we
+resolve that virtual mode to one concrete global LLM config exactly once and
+persist the chosen config id on ``new_chat_threads.pinned_llm_config_id`` so
+subsequent turns are stable.
+
+Single-writer invariant: this module is the only writer of
+``NewChatThread.pinned_llm_config_id`` (aside from the bulk clear in
+``search_spaces_routes`` when a search space's ``agent_llm_id`` changes).
+Therefore a non-NULL value unambiguously means "this thread has an
+Auto-resolved pin"; no separate source/policy column is needed.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import threading
+import time
+from dataclasses import dataclass
+from uuid import UUID
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.db import NewChatThread
+from app.services.quality_score import _QUALITY_TOP_K
+from app.services.token_quota_service import TokenQuotaService
+
+logger = logging.getLogger(__name__)
+
+AUTO_FASTEST_ID = 0
+AUTO_FASTEST_MODE = "auto_fastest"
+_RUNTIME_COOLDOWN_SECONDS = 600
+_HEALTHY_TTL_SECONDS = 45
+
+# In-memory runtime cooldown map for configs that recently hard-failed at
+# provider runtime (e.g. OpenRouter 429 on a pinned free model). This keeps
+# the same unhealthy config from being reselected immediately during repair.
+_runtime_cooldown_until: dict[int, float] = {}
+_runtime_cooldown_lock = threading.Lock()
+
+# Short-TTL "recently healthy" cache for configs that just passed a runtime
+# preflight ping. Lets back-to-back turns on the same model skip the probe
+# without eroding correctness — entries auto-expire and are wiped any time
+# the same config is cooled down or the OR catalogue is refreshed.
+_healthy_until: dict[int, float] = {}
+_healthy_lock = threading.Lock()
+
+
+@dataclass
+class AutoPinResolution:
+    resolved_llm_config_id: int
+    resolved_tier: str
+    from_existing_pin: bool
+
+
+def _is_usable_global_config(cfg: dict) -> bool:
+    return bool(
+        cfg.get("id") is not None
+        and cfg.get("model_name")
+        and cfg.get("provider")
+        and cfg.get("api_key")
+    )
+
+
+def _prune_runtime_cooldowns(now_ts: float | None = None) -> None:
+    now = time.time() if now_ts is None else now_ts
+    stale = [cid for cid, until in _runtime_cooldown_until.items() if until <= now]
+    for cid in stale:
+        _runtime_cooldown_until.pop(cid, None)
+
+
+def _is_runtime_cooled_down(config_id: int) -> bool:
+    with _runtime_cooldown_lock:
+        _prune_runtime_cooldowns()
+        return config_id in _runtime_cooldown_until
+
+
+def mark_runtime_cooldown(
+    config_id: int,
+    *,
+    reason: str = "rate_limited",
+    cooldown_seconds: int = _RUNTIME_COOLDOWN_SECONDS,
+) -> None:
+    """Temporarily suppress a config from Auto selection.
+
+    Used by runtime error handlers (e.g. OpenRouter 429) so an already pinned
+    config that is currently unhealthy does not get immediately reused on the
+    same thread during repair.
+    """
+    if cooldown_seconds <= 0:
+        cooldown_seconds = _RUNTIME_COOLDOWN_SECONDS
+    until = time.time() + int(cooldown_seconds)
+    with _runtime_cooldown_lock:
+        _runtime_cooldown_until[int(config_id)] = until
+        _prune_runtime_cooldowns()
+    # A cooled cfg can never be "recently healthy"; drop any stale credit so
+    # the next turn that resolves to it (after cooldown) re-runs preflight.
+    clear_healthy(int(config_id))
+    logger.info(
+        "auto_pin_runtime_cooled_down config_id=%s reason=%s cooldown_seconds=%s",
+        config_id,
+        reason,
+        cooldown_seconds,
+    )
+
+
+def clear_runtime_cooldown(config_id: int | None = None) -> None:
+    """Test/ops helper to clear runtime cooldown entries."""
+    with _runtime_cooldown_lock:
+        if config_id is None:
+            _runtime_cooldown_until.clear()
+            return
+        _runtime_cooldown_until.pop(int(config_id), None)
+
+
+def _prune_healthy(now_ts: float | None = None) -> None:
+    now = time.time() if now_ts is None else now_ts
+    stale = [cid for cid, until in _healthy_until.items() if until <= now]
+    for cid in stale:
+        _healthy_until.pop(cid, None)
+
+
+def is_recently_healthy(config_id: int) -> bool:
+    """Return True if ``config_id`` passed preflight within the TTL window."""
+    with _healthy_lock:
+        _prune_healthy()
+        return int(config_id) in _healthy_until
+
+
+def mark_healthy(
+    config_id: int,
+    *,
+    ttl_seconds: int = _HEALTHY_TTL_SECONDS,
+) -> None:
+    """Record that ``config_id`` just passed a preflight probe.
+
+    Subsequent calls within ``ttl_seconds`` can skip the preflight ping. The
+    healthy state is intentionally process-local — it's a latency hint, not a
+    correctness primitive — so multi-worker drift is acceptable.
+    """
+    if ttl_seconds <= 0:
+        ttl_seconds = _HEALTHY_TTL_SECONDS
+    until = time.time() + int(ttl_seconds)
+    with _healthy_lock:
+        _healthy_until[int(config_id)] = until
+        _prune_healthy()
+
+
+def clear_healthy(config_id: int | None = None) -> None:
+    """Drop one (or all) healthy-cache entries.
+
+    Called from runtime cooldown and OR catalogue refresh so a freshly cooled
+    or replaced config never carries stale "healthy" credit.
+    """
+    with _healthy_lock:
+        if config_id is None:
+            _healthy_until.clear()
+            return
+        _healthy_until.pop(int(config_id), None)
+
+
+def _cfg_supports_image_input(cfg: dict) -> bool:
+    """True if the global cfg can accept image inputs.
+
+    Prefers the explicit ``supports_image_input`` flag (set by the YAML
+    loader / OpenRouter integration). Falls back to a LiteLLM lookup so
+    a YAML entry whose flag was somehow stripped doesn't get wrongly
+    excluded. Default-allows on unknown — the streaming-task safety net
+    is the actual block, not this filter.
+    """
+    if "supports_image_input" in cfg:
+        return bool(cfg.get("supports_image_input"))
+    # Lazy import: provider_capabilities -> llm_config -> services chain;
+    # importing at module load would create an init-order cycle through
+    # ``app.config``.
+    from app.services.provider_capabilities import derive_supports_image_input
+
+    cfg_litellm_params = cfg.get("litellm_params") or {}
+    base_model = (
+        cfg_litellm_params.get("base_model")
+        if isinstance(cfg_litellm_params, dict)
+        else None
+    )
+    return derive_supports_image_input(
+        provider=cfg.get("provider"),
+        model_name=cfg.get("model_name"),
+        base_model=base_model,
+        custom_provider=cfg.get("custom_provider"),
+    )
+
+
+def _global_candidates(*, requires_image_input: bool = False) -> list[dict]:
+    """Return Auto-eligible global cfgs.
+
+    Drops cfgs flagged ``health_gated`` (best non-null OpenRouter uptime
+    below ``_HEALTH_GATE_UPTIME_PCT``) so chronically broken providers
+    can't be picked as the thread's pin. Also excludes configs currently
+    in runtime cooldown (e.g. temporary 429 bursts).
+
+    When ``requires_image_input`` is True (image turn), additionally
+    filters out configs whose ``supports_image_input`` resolves to False
+    so a text-only deployment can't be pinned for an image request.
+    """
+    candidates = [
+        cfg
+        for cfg in config.GLOBAL_LLM_CONFIGS
+        if _is_usable_global_config(cfg)
+        and not cfg.get("health_gated")
+        and not _is_runtime_cooled_down(int(cfg.get("id", 0)))
+        and (not requires_image_input or _cfg_supports_image_input(cfg))
+    ]
+    return sorted(candidates, key=lambda c: int(c.get("id", 0)))
+
+
+def _tier_of(cfg: dict) -> str:
+    return str(cfg.get("billing_tier", "free")).lower()
+
+
+def _is_preferred_premium_auto_config(cfg: dict) -> bool:
+    """Return True for the operator-preferred premium Auto model."""
+    return (
+        _tier_of(cfg) == "premium"
+        and str(cfg.get("provider", "")).upper() == "AZURE_OPENAI"
+        and str(cfg.get("model_name", "")).lower() == "gpt-5.4"
+    )
+
+
+def _select_pin(eligible: list[dict], thread_id: int) -> tuple[dict, int]:
+    """Pick a config with quality-first ranking + deterministic spread.
+
+    Tier policy is lock-first: prefer Tier A (operator-curated YAML)
+    cfgs and only fall through to Tier B/C (dynamic OpenRouter) if no
+    Tier A cfg is eligible after upstream filters. Within the locked
+    pool, sort by ``quality_score`` and pick from the top-K via
+    ``SHA256(thread_id)`` so different new threads spread across the
+    best models without ever picking a low-ranked one.
+
+    Returns ``(chosen_cfg, top_k_size)``. ``top_k_size`` is exposed for
+    structured logging in the caller.
+    """
+    tier_a = [c for c in eligible if c.get("auto_pin_tier") in (None, "A")]
+    pool = tier_a if tier_a else eligible
+    pool = sorted(pool, key=lambda c: -int(c.get("quality_score") or 0))
+    top_k = pool[:_QUALITY_TOP_K]
+    digest = hashlib.sha256(f"{AUTO_FASTEST_MODE}:{thread_id}".encode()).digest()
+    idx = int.from_bytes(digest[:8], "big") % len(top_k)
+    return top_k[idx], len(top_k)
+
+
+def _to_uuid(user_id: str | UUID | None) -> UUID | None:
+    if user_id is None:
+        return None
+    if isinstance(user_id, UUID):
+        return user_id
+    try:
+        return UUID(str(user_id))
+    except Exception:
+        return None
+
+
+async def _is_premium_eligible(
+    session: AsyncSession, user_id: str | UUID | None
+) -> bool:
+    parsed = _to_uuid(user_id)
+    if parsed is None:
+        return False
+    usage = await TokenQuotaService.premium_get_usage(session, parsed)
+    return bool(usage.allowed)
+
+
+async def resolve_or_get_pinned_llm_config_id(
+    session: AsyncSession,
+    *,
+    thread_id: int,
+    search_space_id: int,
+    user_id: str | UUID | None,
+    selected_llm_config_id: int,
+    force_repin_free: bool = False,
+    exclude_config_ids: set[int] | None = None,
+    requires_image_input: bool = False,
+) -> AutoPinResolution:
+    """Resolve Auto (Fastest) to one concrete config id and persist the pin.
+
+    For non-auto selections, this function clears any existing pin and returns
+    the selected id as-is.
+
+    When ``requires_image_input`` is True (the current turn carries an
+    ``image_url`` block), the candidate pool is filtered to vision-capable
+    cfgs and any existing pin that can't accept image input is treated as
+    invalid (force re-pin). If no vision-capable cfg is available the
+    function raises ``ValueError`` so the streaming task surfaces the same
+    friendly ``MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT`` error instead of
+    silently routing the image to a text-only deployment.
+    """
+    thread = (
+        (
+            await session.execute(
+                select(NewChatThread)
+                .where(NewChatThread.id == thread_id)
+                .with_for_update(of=NewChatThread)
+            )
+        )
+        .unique()
+        .scalar_one_or_none()
+    )
+    if thread is None:
+        raise ValueError(f"Thread {thread_id} not found")
+    if thread.search_space_id != search_space_id:
+        raise ValueError(
+            f"Thread {thread_id} does not belong to search space {search_space_id}"
+        )
+
+    # Explicit model selected: clear any stale pin.
+    if selected_llm_config_id != AUTO_FASTEST_ID:
+        if thread.pinned_llm_config_id is not None:
+            thread.pinned_llm_config_id = None
+            await session.commit()
+        return AutoPinResolution(
+            resolved_llm_config_id=selected_llm_config_id,
+            resolved_tier="explicit",
+            from_existing_pin=False,
+        )
+
+    excluded_ids = {int(cid) for cid in (exclude_config_ids or set())}
+    candidates = [
+        c
+        for c in _global_candidates(requires_image_input=requires_image_input)
+        if int(c.get("id", 0)) not in excluded_ids
+    ]
+    if not candidates:
+        if requires_image_input:
+            # Distinguish the "no vision-capable cfg" case from generic
+            # "no usable cfg" so the streaming task can map this to the
+            # MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT SSE error.
+            raise ValueError(
+                "No vision-capable global LLM configs are available for Auto mode"
+            )
+        raise ValueError("No usable global LLM configs are available for Auto mode")
+    candidate_by_id = {int(c["id"]): c for c in candidates}
+
+    # Reuse an existing valid pin without re-checking current quota (no silent
+    # tier switch), unless the caller explicitly requests a forced repin to free
+    # *or* the turn requires image input but the pin can't handle it.
+    pinned_id = thread.pinned_llm_config_id
+    if (
+        not force_repin_free
+        and pinned_id is not None
+        and int(pinned_id) in candidate_by_id
+    ):
+        pinned_cfg = candidate_by_id[int(pinned_id)]
+        logger.info(
+            "auto_pin_reused thread_id=%s search_space_id=%s resolved_config_id=%s tier=%s",
+            thread_id,
+            search_space_id,
+            pinned_id,
+            _tier_of(pinned_cfg),
+        )
+        logger.info(
+            "auto_pin_resolved thread_id=%s config_id=%s tier=%s "
+            "auto_pin_tier=%s score=%s top_k_size=0 from_existing_pin=True",
+            thread_id,
+            pinned_id,
+            _tier_of(pinned_cfg),
+            pinned_cfg.get("auto_pin_tier", "?"),
+            int(pinned_cfg.get("quality_score") or 0),
+        )
+        return AutoPinResolution(
+            resolved_llm_config_id=int(pinned_id),
+            resolved_tier=_tier_of(pinned_cfg),
+            from_existing_pin=True,
+        )
+    if pinned_id is not None:
+        # If the pin is *only* invalid because it can't handle the image
+        # turn (it's still a healthy, usable config in the broader pool),
+        # log that explicitly so operators can correlate the re-pin with
+        # the user's image attachment instead of suspecting a cooldown.
+        if requires_image_input:
+            try:
+                pinned_global = next(
+                    c
+                    for c in config.GLOBAL_LLM_CONFIGS
+                    if int(c.get("id", 0)) == int(pinned_id)
+                )
+            except StopIteration:
+                pinned_global = None
+            if pinned_global is not None and not _cfg_supports_image_input(
+                pinned_global
+            ):
+                logger.info(
+                    "auto_pin_repinned_for_image thread_id=%s search_space_id=%s "
+                    "previous_config_id=%s",
+                    thread_id,
+                    search_space_id,
+                    pinned_id,
+                )
+        logger.info(
+            "auto_pin_invalid thread_id=%s search_space_id=%s pinned_config_id=%s",
+            thread_id,
+            search_space_id,
+            pinned_id,
+        )
+
+    premium_eligible = (
+        False if force_repin_free else await _is_premium_eligible(session, user_id)
+    )
+    if premium_eligible:
+        premium_candidates = [c for c in candidates if _tier_of(c) == "premium"]
+        preferred_premium = [
+            c for c in premium_candidates if _is_preferred_premium_auto_config(c)
+        ]
+        eligible = preferred_premium or premium_candidates
+    else:
+        eligible = [c for c in candidates if _tier_of(c) != "premium"]
+
+    if not eligible:
+        if requires_image_input:
+            raise ValueError(
+                "Auto mode could not find a vision-capable LLM config for this user and quota state"
+            )
+        raise ValueError(
+            "Auto mode could not find an eligible LLM config for this user and quota state"
+        )
+
+    selected_cfg, top_k_size = _select_pin(eligible, thread_id)
+    selected_id = int(selected_cfg["id"])
+    selected_tier = _tier_of(selected_cfg)
+
+    thread.pinned_llm_config_id = selected_id
+    await session.commit()
+
+    if force_repin_free:
+        logger.info(
+            "auto_pin_forced_free_repin thread_id=%s search_space_id=%s previous_config_id=%s resolved_config_id=%s",
+            thread_id,
+            search_space_id,
+            pinned_id,
+            selected_id,
+        )
+
+    if pinned_id is None:
+        logger.info(
+            "auto_pin_created thread_id=%s search_space_id=%s resolved_config_id=%s tier=%s premium_eligible=%s",
+            thread_id,
+            search_space_id,
+            selected_id,
+            selected_tier,
+            premium_eligible,
+        )
+    else:
+        logger.info(
+            "auto_pin_repaired thread_id=%s search_space_id=%s previous_config_id=%s resolved_config_id=%s tier=%s premium_eligible=%s",
+            thread_id,
+            search_space_id,
+            pinned_id,
+            selected_id,
+            selected_tier,
+            premium_eligible,
+        )
+
+    logger.info(
+        "auto_pin_resolved thread_id=%s config_id=%s tier=%s "
+        "auto_pin_tier=%s score=%s top_k_size=%d from_existing_pin=False",
+        thread_id,
+        selected_id,
+        selected_tier,
+        selected_cfg.get("auto_pin_tier", "?"),
+        int(selected_cfg.get("quality_score") or 0),
+        top_k_size,
+    )
+
+    return AutoPinResolution(
+        resolved_llm_config_id=selected_id,
+        resolved_tier=selected_tier,
+        from_existing_pin=False,
+    )
diff --git a/surfsense_backend/app/services/billable_calls.py b/surfsense_backend/app/services/billable_calls.py
new file mode 100644
index 0000000000..92ccd6a786
--- /dev/null
+++ b/surfsense_backend/app/services/billable_calls.py
@@ -0,0 +1,566 @@
+"""
+Per-call billable wrapper for image generation, vision LLM extraction, and
+any other short-lived premium operation that must charge against the user's
+shared premium credit pool.
+
+The ``billable_call`` async context manager encapsulates the standard
+"reserve → execute → finalize / release → record audit row" lifecycle in a
+single primitive so callers (the image-generation REST route and the
+vision-LLM wrapper used during indexing) don't have to re-implement it.
+
+KEY DESIGN POINTS (issue A, B):
+
+1. **Session isolation.** ``billable_call`` takes no caller transaction.
+   All ``TokenQuotaService.premium_*`` calls and the audit-row insert run
+   inside their own session context. Route callers use
+   ``shielded_async_session()`` by default; Celery callers can provide a
+   worker-loop-safe session factory. This guarantees that quota
+   commit/rollback can never accidentally flush or roll back rows the caller
+   has staged in its main session (e.g. a freshly-created
+   ``ImageGeneration`` row).
+
+2. **ContextVar safety.** The accumulator is scoped via
+   :func:`scoped_turn` (which uses ``ContextVar.reset(token)``), so a
+   nested ``billable_call`` inside an outer chat turn cannot corrupt the
+   chat turn's accumulator.
+
+3. **Free configs are still audited.** Free calls bypass the reserve /
+   finalize dance entirely but still record a ``TokenUsage`` audit row with
+   the LiteLLM-reported ``cost_micros``. This keeps the cost-attribution
+   pipeline complete for analytics even when nothing is debited.
+
+4. **Quota denial raises ``QuotaInsufficientError``.** The route handler is
+   responsible for translating that into HTTP 402. We *do not* catch the
+   denial inside ``billable_call`` — letting it propagate also prevents
+   the image-generation route from creating an ``ImageGeneration`` row
+   for a request that never actually ran.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from collections.abc import AsyncIterator, Callable
+from contextlib import AbstractAsyncContextManager, asynccontextmanager, suppress
+from typing import Any
+from uuid import UUID, uuid4
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.db import shielded_async_session
+from app.services.token_quota_service import (
+    TokenQuotaService,
+    estimate_call_reserve_micros,
+)
+from app.services.token_tracking_service import (
+    TurnTokenAccumulator,
+    record_token_usage,
+    scoped_turn,
+)
+
+logger = logging.getLogger(__name__)
+
+AUDIT_TIMEOUT_SECONDS = 10.0
+BACKGROUND_ARTIFACT_USAGE_TYPES = frozenset(
+    {"video_presentation_generation", "podcast_generation"}
+)
+BillableSessionFactory = Callable[[], AbstractAsyncContextManager[AsyncSession]]
+
+
+class QuotaInsufficientError(Exception):
+    """Raised when ``TokenQuotaService.premium_reserve`` denies a billable
+    call because the user has exhausted their premium credit pool.
+
+    The route handler should catch this and return HTTP 402 Payment
+    Required (or the equivalent for the surface area). Outside of the HTTP
+    layer (e.g. the ``QuotaCheckedVisionLLM`` wrapper used during indexing)
+    callers may catch this and degrade gracefully — e.g. fall back to OCR
+    when vision is unavailable.
+    """
+
+    def __init__(
+        self,
+        *,
+        usage_type: str,
+        used_micros: int,
+        limit_micros: int,
+        remaining_micros: int,
+    ) -> None:
+        self.usage_type = usage_type
+        self.used_micros = used_micros
+        self.limit_micros = limit_micros
+        self.remaining_micros = remaining_micros
+        super().__init__(
+            f"Premium credit exhausted for {usage_type}: "
+            f"used={used_micros} limit={limit_micros} remaining={remaining_micros} (micro-USD)"
+        )
+
+
+class BillingSettlementError(Exception):
+    """Raised when a premium call completed but credit settlement failed."""
+
+    def __init__(self, *, usage_type: str, user_id: UUID, cause: Exception) -> None:
+        self.usage_type = usage_type
+        self.user_id = user_id
+        super().__init__(
+            f"Failed to settle premium credit for {usage_type} user={user_id}: {cause}"
+        )
+
+
+async def _rollback_safely(session: AsyncSession) -> None:
+    rollback = getattr(session, "rollback", None)
+    if rollback is not None:
+        with suppress(Exception):
+            await rollback()
+
+
+async def _record_audit_best_effort(
+    *,
+    session_factory: BillableSessionFactory,
+    usage_type: str,
+    search_space_id: int,
+    user_id: UUID,
+    prompt_tokens: int,
+    completion_tokens: int,
+    total_tokens: int,
+    cost_micros: int,
+    model_breakdown: dict[str, Any],
+    call_details: dict[str, Any] | None,
+    thread_id: int | None,
+    message_id: int | None,
+    audit_label: str,
+    timeout_seconds: float = AUDIT_TIMEOUT_SECONDS,
+) -> None:
+    """Persist a TokenUsage row without letting audit failure block callers.
+
+    Premium settlement is mandatory, but TokenUsage is an audit trail. If the
+    audit insert or commit hangs, user-facing artifacts such as videos and
+    podcasts must still be able to transition to READY after settlement.
+    """
+    audit_thread_id = (
+        None if usage_type in BACKGROUND_ARTIFACT_USAGE_TYPES else thread_id
+    )
+
+    async def _persist() -> None:
+        logger.info(
+            "[billable_call] audit start label=%s usage_type=%s user=%s thread=%s "
+            "total_tokens=%d cost_micros=%d",
+            audit_label,
+            usage_type,
+            user_id,
+            audit_thread_id,
+            total_tokens,
+            cost_micros,
+        )
+        async with session_factory() as audit_session:
+            try:
+                await record_token_usage(
+                    audit_session,
+                    usage_type=usage_type,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                    cost_micros=cost_micros,
+                    model_breakdown=model_breakdown,
+                    call_details=call_details,
+                    thread_id=audit_thread_id,
+                    message_id=message_id,
+                )
+                logger.info(
+                    "[billable_call] audit row staged label=%s usage_type=%s user=%s thread=%s",
+                    audit_label,
+                    usage_type,
+                    user_id,
+                    audit_thread_id,
+                )
+                await audit_session.commit()
+                logger.info(
+                    "[billable_call] audit commit OK label=%s usage_type=%s user=%s thread=%s",
+                    audit_label,
+                    usage_type,
+                    user_id,
+                    audit_thread_id,
+                )
+            except BaseException:
+                await _rollback_safely(audit_session)
+                raise
+
+    try:
+        await asyncio.wait_for(_persist(), timeout=timeout_seconds)
+    except TimeoutError:
+        logger.warning(
+            "[billable_call] audit timed out label=%s usage_type=%s user=%s thread=%s "
+            "timeout=%.1fs total_tokens=%d cost_micros=%d",
+            audit_label,
+            usage_type,
+            user_id,
+            audit_thread_id,
+            timeout_seconds,
+            total_tokens,
+            cost_micros,
+        )
+    except Exception:
+        logger.exception(
+            "[billable_call] audit failed label=%s usage_type=%s user=%s thread=%s "
+            "total_tokens=%d cost_micros=%d",
+            audit_label,
+            usage_type,
+            user_id,
+            audit_thread_id,
+            total_tokens,
+            cost_micros,
+        )
+
+
+@asynccontextmanager
+async def billable_call(
+    *,
+    user_id: UUID,
+    search_space_id: int,
+    billing_tier: str,
+    base_model: str,
+    quota_reserve_tokens: int | None = None,
+    quota_reserve_micros_override: int | None = None,
+    usage_type: str,
+    thread_id: int | None = None,
+    message_id: int | None = None,
+    call_details: dict[str, Any] | None = None,
+    billable_session_factory: BillableSessionFactory | None = None,
+    audit_timeout_seconds: float = AUDIT_TIMEOUT_SECONDS,
+) -> AsyncIterator[TurnTokenAccumulator]:
+    """Wrap a single billable LLM/image call.
+
+    Args:
+        user_id: Owner of the credit pool to debit. For vision-LLM during
+            indexing this is the *search-space owner* (issue M), not the
+            triggering user.
+        search_space_id: Required — recorded on the ``TokenUsage`` audit row.
+        billing_tier: ``"premium"`` debits; anything else (``"free"``) skips
+            the reserve/finalize dance but still records an audit row with
+            the captured cost.
+        base_model: Used by :func:`estimate_call_reserve_micros` to compute
+            a worst-case reservation from LiteLLM's pricing table.
+        quota_reserve_tokens: Optional per-config override for the chat-style
+            reserve estimator (vision LLM uses this).
+        quota_reserve_micros_override: Optional flat micro-USD reservation
+            (image generation uses this — its cost shape is per-image, not
+            per-token).
+        usage_type: ``"image_generation"`` / ``"vision_extraction"`` / etc.
+            Recorded on the ``TokenUsage`` row.
+        thread_id, message_id: Optional FK columns on ``TokenUsage``.
+        call_details: Optional per-call metadata (model name, parameters)
+            forwarded to ``record_token_usage``.
+        billable_session_factory: Optional async context factory used for
+            reserve/finalize/release/audit sessions. Defaults to
+            ``shielded_async_session`` for route callers; Celery callers pass
+            a worker-loop-safe session factory.
+        audit_timeout_seconds: Upper bound for TokenUsage audit persistence.
+            Audit failure is best-effort and does not undo successful
+            settlement.
+
+    Yields:
+        The ``TurnTokenAccumulator`` scoped to this call. The caller invokes
+        the underlying LLM/image API while inside the ``async with``; the
+        ``TokenTrackingCallback`` populates the accumulator automatically.
+
+    Raises:
+        QuotaInsufficientError: when premium and ``premium_reserve`` denies.
+    """
+    is_premium = billing_tier == "premium"
+    session_factory = billable_session_factory or shielded_async_session
+
+    async with scoped_turn() as acc:
+        # ---------- Free path: just audit -------------------------------
+        if not is_premium:
+            try:
+                yield acc
+            finally:
+                # Always audit, even on exception, so we capture cost when
+                # provider returns successfully but the caller raises later.
+                await _record_audit_best_effort(
+                    session_factory=session_factory,
+                    usage_type=usage_type,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    prompt_tokens=acc.total_prompt_tokens,
+                    completion_tokens=acc.total_completion_tokens,
+                    total_tokens=acc.grand_total,
+                    cost_micros=acc.total_cost_micros,
+                    model_breakdown=acc.per_message_summary(),
+                    call_details=call_details,
+                    thread_id=thread_id,
+                    message_id=message_id,
+                    audit_label="free",
+                    timeout_seconds=audit_timeout_seconds,
+                )
+            return
+
+        # ---------- Premium path: reserve → execute → finalize ----------
+        if quota_reserve_micros_override is not None:
+            reserve_micros = max(1, int(quota_reserve_micros_override))
+        else:
+            reserve_micros = estimate_call_reserve_micros(
+                base_model=base_model or "",
+                quota_reserve_tokens=quota_reserve_tokens,
+            )
+
+        request_id = str(uuid4())
+
+        async with session_factory() as quota_session:
+            reserve_result = await TokenQuotaService.premium_reserve(
+                db_session=quota_session,
+                user_id=user_id,
+                request_id=request_id,
+                reserve_micros=reserve_micros,
+            )
+
+        if not reserve_result.allowed:
+            logger.info(
+                "[billable_call] reserve DENIED user=%s usage_type=%s "
+                "reserve=%d used=%d limit=%d remaining=%d",
+                user_id,
+                usage_type,
+                reserve_micros,
+                reserve_result.used,
+                reserve_result.limit,
+                reserve_result.remaining,
+            )
+            raise QuotaInsufficientError(
+                usage_type=usage_type,
+                used_micros=reserve_result.used,
+                limit_micros=reserve_result.limit,
+                remaining_micros=reserve_result.remaining,
+            )
+
+        logger.info(
+            "[billable_call] reserve OK user=%s usage_type=%s reserve_micros=%d "
+            "(remaining=%d)",
+            user_id,
+            usage_type,
+            reserve_micros,
+            reserve_result.remaining,
+        )
+
+        try:
+            yield acc
+        except BaseException:
+            # Release on any failure (including QuotaInsufficientError raised
+            # from a downstream call, asyncio cancellation, etc.). We use
+            # BaseException so cancellation also releases.
+            try:
+                async with session_factory() as quota_session:
+                    await TokenQuotaService.premium_release(
+                        db_session=quota_session,
+                        user_id=user_id,
+                        reserved_micros=reserve_micros,
+                    )
+            except Exception:
+                logger.exception(
+                    "[billable_call] premium_release failed for user=%s "
+                    "reserve_micros=%d (reservation will be GC'd by quota "
+                    "reconciliation if/when implemented)",
+                    user_id,
+                    reserve_micros,
+                )
+            raise
+
+        # ---------- Success: finalize + audit ----------------------------
+        actual_micros = acc.total_cost_micros
+        try:
+            logger.info(
+                "[billable_call] finalize start user=%s usage_type=%s actual=%d "
+                "reserved=%d thread=%s",
+                user_id,
+                usage_type,
+                actual_micros,
+                reserve_micros,
+                thread_id,
+            )
+            async with session_factory() as quota_session:
+                final_result = await TokenQuotaService.premium_finalize(
+                    db_session=quota_session,
+                    user_id=user_id,
+                    request_id=request_id,
+                    actual_micros=actual_micros,
+                    reserved_micros=reserve_micros,
+                )
+            logger.info(
+                "[billable_call] finalize user=%s usage_type=%s actual=%d "
+                "reserved=%d → used=%d/%d (remaining=%d)",
+                user_id,
+                usage_type,
+                actual_micros,
+                reserve_micros,
+                final_result.used,
+                final_result.limit,
+                final_result.remaining,
+            )
+        except Exception as finalize_exc:
+            # Last-ditch: if finalize itself fails, we must at least release
+            # so the reservation doesn't leak.
+            logger.exception(
+                "[billable_call] premium_finalize failed for user=%s; "
+                "attempting release",
+                user_id,
+            )
+            try:
+                async with session_factory() as quota_session:
+                    await TokenQuotaService.premium_release(
+                        db_session=quota_session,
+                        user_id=user_id,
+                        reserved_micros=reserve_micros,
+                    )
+            except Exception:
+                logger.exception(
+                    "[billable_call] release after finalize failure ALSO failed "
+                    "for user=%s",
+                    user_id,
+                )
+            raise BillingSettlementError(
+                usage_type=usage_type,
+                user_id=user_id,
+                cause=finalize_exc,
+            ) from finalize_exc
+
+        await _record_audit_best_effort(
+            session_factory=session_factory,
+            usage_type=usage_type,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            prompt_tokens=acc.total_prompt_tokens,
+            completion_tokens=acc.total_completion_tokens,
+            total_tokens=acc.grand_total,
+            cost_micros=actual_micros,
+            model_breakdown=acc.per_message_summary(),
+            call_details=call_details,
+            thread_id=thread_id,
+            message_id=message_id,
+            audit_label="premium",
+            timeout_seconds=audit_timeout_seconds,
+        )
+
+
+async def _resolve_agent_billing_for_search_space(
+    session: AsyncSession,
+    search_space_id: int,
+    *,
+    thread_id: int | None = None,
+) -> tuple[UUID, str, str]:
+    """Resolve ``(owner_user_id, billing_tier, base_model)`` for the search-space
+    agent LLM.
+
+    Used by Celery tasks (podcast generation, video presentation) to bill the
+    search-space owner's premium credit pool when the agent LLM is premium.
+
+    Resolution rules mirror chat at ``stream_new_chat.py:2294-2351``:
+
+    - Search space not found / no ``agent_llm_id``: raise ``ValueError``.
+    - **Auto mode** (``id == AUTO_FASTEST_ID == 0``):
+        * ``thread_id`` is set: delegate to
+          ``resolve_or_get_pinned_llm_config_id`` (the same call chat uses) and
+          recurse into the resolved id. Reuses chat's existing pin if present
+          so the same model bills for chat + downstream podcast/video. If the
+          user is not premium-eligible, the pin service auto-restricts to free
+          deployments — denial only happens later in
+          ``billable_call.premium_reserve`` if the pin really is premium and
+          credit ran out mid-flow.
+        * ``thread_id`` is None: fallback to ``("free", "auto")``. Forward-compat
+          for any future direct-API path; today both Celery tasks always pass
+          ``thread_id``.
+    - **Negative id** (global YAML / OpenRouter): ``cfg["billing_tier"]``
+      (defaults to ``"free"`` via ``app/config/__init__.py:52`` setdefault),
+      ``base_model = litellm_params.get("base_model") or model_name`` —
+      NOT provider-prefixed, matching chat's cost-map lookup convention.
+    - **Positive id** (user BYOK ``NewLLMConfig``): always free (matches
+      ``AgentConfig.from_new_llm_config`` which hard-codes ``billing_tier="free"``);
+      ``base_model`` from ``litellm_params`` or ``model_name``.
+
+    Note on imports: ``llm_service``, ``auto_model_pin_service``, and
+    ``llm_router_service`` are imported lazily inside the function body to
+    avoid hoisting litellm side-effects (``litellm.callbacks =
+    [token_tracker]``, ``litellm.drop_params``, etc.) into
+    ``billable_calls.py``'s module load path.
+    """
+    from sqlalchemy import select
+
+    from app.db import NewLLMConfig, SearchSpace
+
+    result = await session.execute(
+        select(SearchSpace).where(SearchSpace.id == search_space_id)
+    )
+    search_space = result.scalars().first()
+    if search_space is None:
+        raise ValueError(f"Search space {search_space_id} not found")
+
+    agent_llm_id = search_space.agent_llm_id
+    if agent_llm_id is None:
+        raise ValueError(
+            f"Search space {search_space_id} has no agent_llm_id configured"
+        )
+
+    owner_user_id: UUID = search_space.user_id
+
+    from app.services.auto_model_pin_service import (
+        AUTO_FASTEST_ID,
+        resolve_or_get_pinned_llm_config_id,
+    )
+
+    if agent_llm_id == AUTO_FASTEST_ID:
+        if thread_id is None:
+            return owner_user_id, "free", "auto"
+        try:
+            resolution = await resolve_or_get_pinned_llm_config_id(
+                session,
+                thread_id=thread_id,
+                search_space_id=search_space_id,
+                user_id=str(owner_user_id),
+                selected_llm_config_id=AUTO_FASTEST_ID,
+            )
+        except ValueError:
+            logger.warning(
+                "[agent_billing] Auto-mode pin resolution failed for "
+                "search_space=%s thread=%s; falling back to free",
+                search_space_id,
+                thread_id,
+                exc_info=True,
+            )
+            return owner_user_id, "free", "auto"
+        agent_llm_id = resolution.resolved_llm_config_id
+
+    if agent_llm_id < 0:
+        from app.services.llm_service import get_global_llm_config
+
+        cfg = get_global_llm_config(agent_llm_id) or {}
+        billing_tier = str(cfg.get("billing_tier", "free")).lower()
+        litellm_params = cfg.get("litellm_params") or {}
+        base_model = litellm_params.get("base_model") or cfg.get("model_name") or ""
+        return owner_user_id, billing_tier, base_model
+
+    nlc_result = await session.execute(
+        select(NewLLMConfig).where(
+            NewLLMConfig.id == agent_llm_id,
+            NewLLMConfig.search_space_id == search_space_id,
+        )
+    )
+    nlc = nlc_result.scalars().first()
+    base_model = ""
+    if nlc is not None:
+        litellm_params = nlc.litellm_params or {}
+        base_model = litellm_params.get("base_model") or nlc.model_name or ""
+    return owner_user_id, "free", base_model
+
+
+__all__ = [
+    "BillingSettlementError",
+    "QuotaInsufficientError",
+    "_resolve_agent_billing_for_search_space",
+    "billable_call",
+]
+
+
+# Re-export the config knob so callers don't have to import config just for
+# the default image reserve.
+DEFAULT_IMAGE_RESERVE_MICROS = config.QUOTA_DEFAULT_IMAGE_RESERVE_MICROS
diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py
index 13fe378326..d73a0d4ce1 100644
--- a/surfsense_backend/app/services/composio_service.py
+++ b/surfsense_backend/app/services/composio_service.py
@@ -26,7 +26,7 @@
 }
 
 # Toolkits that support indexing (Phase 1: Google services only)
-INDEXABLE_TOOLKITS = {"googledrive", "gmail", "googlecalendar"}
+INDEXABLE_TOOLKITS = {"googledrive"}
 
 # Mapping of toolkit IDs to connector types
 TOOLKIT_TO_CONNECTOR_TYPE = {
@@ -408,12 +408,37 @@ async def get_drive_files(
             files = []
             next_token = None
             if isinstance(data, dict):
+                inner_data = data.get("data", data)
+                response_data = (
+                    inner_data.get("response_data", {})
+                    if isinstance(inner_data, dict)
+                    else {}
+                )
                 # Try direct access first, then nested
-                files = data.get("files", []) or data.get("data", {}).get("files", [])
+                files = (
+                    data.get("files", [])
+                    or (
+                        inner_data.get("files", [])
+                        if isinstance(inner_data, dict)
+                        else []
+                    )
+                    or response_data.get("files", [])
+                )
                 next_token = (
                     data.get("nextPageToken")
                     or data.get("next_page_token")
-                    or data.get("data", {}).get("nextPageToken")
+                    or (
+                        inner_data.get("nextPageToken")
+                        if isinstance(inner_data, dict)
+                        else None
+                    )
+                    or (
+                        inner_data.get("next_page_token")
+                        if isinstance(inner_data, dict)
+                        else None
+                    )
+                    or response_data.get("nextPageToken")
+                    or response_data.get("next_page_token")
                 )
             elif isinstance(data, list):
                 files = data
@@ -819,24 +844,61 @@ async def get_gmail_messages(
             next_token = None
             result_size_estimate = None
             if isinstance(data, dict):
+                inner_data = data.get("data", data)
+                response_data = (
+                    inner_data.get("response_data", {})
+                    if isinstance(inner_data, dict)
+                    else {}
+                )
                 messages = (
                     data.get("messages", [])
-                    or data.get("data", {}).get("messages", [])
+                    or (
+                        inner_data.get("messages", [])
+                        if isinstance(inner_data, dict)
+                        else []
+                    )
+                    or response_data.get("messages", [])
                     or data.get("emails", [])
+                    or (
+                        inner_data.get("emails", [])
+                        if isinstance(inner_data, dict)
+                        else []
+                    )
+                    or response_data.get("emails", [])
                 )
                 # Check for pagination token in various possible locations
                 next_token = (
                     data.get("nextPageToken")
                     or data.get("next_page_token")
-                    or data.get("data", {}).get("nextPageToken")
-                    or data.get("data", {}).get("next_page_token")
+                    or (
+                        inner_data.get("nextPageToken")
+                        if isinstance(inner_data, dict)
+                        else None
+                    )
+                    or (
+                        inner_data.get("next_page_token")
+                        if isinstance(inner_data, dict)
+                        else None
+                    )
+                    or response_data.get("nextPageToken")
+                    or response_data.get("next_page_token")
                 )
                 # Extract resultSizeEstimate if available (Gmail API provides this)
                 result_size_estimate = (
                     data.get("resultSizeEstimate")
                     or data.get("result_size_estimate")
-                    or data.get("data", {}).get("resultSizeEstimate")
-                    or data.get("data", {}).get("result_size_estimate")
+                    or (
+                        inner_data.get("resultSizeEstimate")
+                        if isinstance(inner_data, dict)
+                        else None
+                    )
+                    or (
+                        inner_data.get("result_size_estimate")
+                        if isinstance(inner_data, dict)
+                        else None
+                    )
+                    or response_data.get("resultSizeEstimate")
+                    or response_data.get("result_size_estimate")
                 )
             elif isinstance(data, list):
                 messages = data
@@ -864,7 +926,7 @@ async def get_gmail_message_detail(
         try:
             result = await self.execute_tool(
                 connected_account_id=connected_account_id,
-                tool_name="GMAIL_GET_MESSAGE_BY_MESSAGE_ID",
+                tool_name="GMAIL_FETCH_MESSAGE_BY_MESSAGE_ID",
                 params={"message_id": message_id},  # snake_case
                 entity_id=entity_id,
             )
@@ -872,7 +934,13 @@ async def get_gmail_message_detail(
             if not result.get("success"):
                 return None, result.get("error", "Unknown error")
 
-            return result.get("data"), None
+            data = result.get("data")
+            if isinstance(data, dict):
+                inner_data = data.get("data", data)
+                if isinstance(inner_data, dict):
+                    return inner_data.get("response_data", inner_data), None
+
+            return data, None
 
         except Exception as e:
             logger.error(f"Failed to get Gmail message detail: {e!s}")
@@ -928,10 +996,27 @@ async def get_calendar_events(
             # Try different possible response structures
             events = []
             if isinstance(data, dict):
+                inner_data = data.get("data", data)
+                response_data = (
+                    inner_data.get("response_data", {})
+                    if isinstance(inner_data, dict)
+                    else {}
+                )
                 events = (
                     data.get("items", [])
-                    or data.get("data", {}).get("items", [])
+                    or (
+                        inner_data.get("items", [])
+                        if isinstance(inner_data, dict)
+                        else []
+                    )
+                    or response_data.get("items", [])
                     or data.get("events", [])
+                    or (
+                        inner_data.get("events", [])
+                        if isinstance(inner_data, dict)
+                        else []
+                    )
+                    or response_data.get("events", [])
                 )
             elif isinstance(data, list):
                 events = data
@@ -942,6 +1027,505 @@ async def get_calendar_events(
             logger.error(f"Failed to list Calendar events: {e!s}")
             return [], str(e)
 
+    @staticmethod
+    def _unwrap_response_data(data: Any) -> Any:
+        """Composio responses often nest the meaningful payload under
+        ``data.data.response_data``. Walk that envelope safely and return
+        whichever inner dict actually has the result keys."""
+        if not isinstance(data, dict):
+            return data
+        inner = data.get("data", data)
+        if isinstance(inner, dict):
+            return inner.get("response_data", inner)
+        return inner
+
+    @staticmethod
+    def _split_email_csv(value: str | None) -> list[str] | None:
+        """Tools accept comma-separated cc/bcc strings; Composio expects an array."""
+        if not value:
+            return None
+        addrs = [e.strip() for e in value.split(",") if e.strip()]
+        return addrs or None
+
+    # ===== Gmail write methods =====
+
+    async def send_gmail_email(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        to: str,
+        subject: str,
+        body: str,
+        cc: str | None = None,
+        bcc: str | None = None,
+        is_html: bool = False,
+    ) -> tuple[str | None, str | None, str | None]:
+        """Send a Gmail message via the Composio ``GMAIL_SEND_EMAIL`` toolkit.
+
+        Returns:
+            Tuple of (message_id, thread_id, error). On success ``error`` is
+            None and at least one of the IDs is populated when Composio
+            returns them; on failure both IDs are None.
+        """
+        try:
+            params: dict[str, Any] = {
+                "recipient_email": to,
+                "subject": subject,
+                "body": body,
+                "is_html": is_html,
+            }
+            if cc:
+                cc_list = self._split_email_csv(cc)
+                if cc_list:
+                    params["cc"] = cc_list
+            if bcc:
+                bcc_list = self._split_email_csv(bcc)
+                if bcc_list:
+                    params["bcc"] = bcc_list
+
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GMAIL_SEND_EMAIL",
+                params=params,
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return None, None, result.get("error", "Unknown error")
+
+            payload = self._unwrap_response_data(result.get("data", {}))
+            message_id = None
+            thread_id = None
+            if isinstance(payload, dict):
+                message_id = (
+                    payload.get("id")
+                    or payload.get("message_id")
+                    or payload.get("messageId")
+                )
+                thread_id = payload.get("threadId") or payload.get("thread_id")
+            return message_id, thread_id, None
+        except Exception as e:
+            logger.error(f"Failed to send Gmail email: {e!s}")
+            return None, None, str(e)
+
+    async def create_gmail_draft(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        to: str,
+        subject: str,
+        body: str,
+        cc: str | None = None,
+        bcc: str | None = None,
+        is_html: bool = False,
+    ) -> tuple[str | None, str | None, str | None, str | None]:
+        """Create a Gmail draft via the Composio ``GMAIL_CREATE_EMAIL_DRAFT`` toolkit.
+
+        Returns:
+            Tuple of (draft_id, message_id, thread_id, error). On success
+            ``error`` is None and ``draft_id`` is populated.
+        """
+        try:
+            params: dict[str, Any] = {
+                "recipient_email": to,
+                "subject": subject,
+                "body": body,
+                "is_html": is_html,
+            }
+            cc_list = self._split_email_csv(cc)
+            if cc_list:
+                params["cc"] = cc_list
+            bcc_list = self._split_email_csv(bcc)
+            if bcc_list:
+                params["bcc"] = bcc_list
+
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GMAIL_CREATE_EMAIL_DRAFT",
+                params=params,
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return None, None, None, result.get("error", "Unknown error")
+
+            payload = self._unwrap_response_data(result.get("data", {}))
+            draft_id = None
+            message_id = None
+            thread_id = None
+            if isinstance(payload, dict):
+                draft_id = payload.get("id") or payload.get("draft_id")
+                draft_message = payload.get("message") or {}
+                if isinstance(draft_message, dict):
+                    message_id = draft_message.get("id") or draft_message.get(
+                        "message_id"
+                    )
+                    thread_id = draft_message.get("threadId") or draft_message.get(
+                        "thread_id"
+                    )
+                if message_id is None:
+                    message_id = payload.get("message_id") or payload.get("messageId")
+                if thread_id is None:
+                    thread_id = payload.get("thread_id") or payload.get("threadId")
+            return draft_id, message_id, thread_id, None
+        except Exception as e:
+            logger.error(f"Failed to create Gmail draft: {e!s}")
+            return None, None, None, str(e)
+
+    async def update_gmail_draft(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        draft_id: str,
+        to: str | None = None,
+        subject: str | None = None,
+        body: str | None = None,
+        cc: str | None = None,
+        bcc: str | None = None,
+        is_html: bool = False,
+    ) -> tuple[str | None, str | None, str | None]:
+        """Update an existing Gmail draft via ``GMAIL_UPDATE_DRAFT``.
+
+        Returns:
+            Tuple of (draft_id, message_id, error).
+        """
+        try:
+            params: dict[str, Any] = {
+                "draft_id": draft_id,
+                "is_html": is_html,
+            }
+            if to:
+                params["recipient_email"] = to
+            if subject is not None:
+                params["subject"] = subject
+            if body is not None:
+                params["body"] = body
+            cc_list = self._split_email_csv(cc)
+            if cc_list:
+                params["cc"] = cc_list
+            bcc_list = self._split_email_csv(bcc)
+            if bcc_list:
+                params["bcc"] = bcc_list
+
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GMAIL_UPDATE_DRAFT",
+                params=params,
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return None, None, result.get("error", "Unknown error")
+
+            payload = self._unwrap_response_data(result.get("data", {}))
+            new_draft_id = draft_id
+            message_id = None
+            if isinstance(payload, dict):
+                new_draft_id = payload.get("id") or payload.get("draft_id") or draft_id
+                draft_message = payload.get("message") or {}
+                if isinstance(draft_message, dict):
+                    message_id = draft_message.get("id") or draft_message.get(
+                        "message_id"
+                    )
+                if message_id is None:
+                    message_id = payload.get("message_id") or payload.get("messageId")
+            return new_draft_id, message_id, None
+        except Exception as e:
+            logger.error(f"Failed to update Gmail draft: {e!s}")
+            return None, None, str(e)
+
+    async def trash_gmail_message(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        message_id: str,
+    ) -> str | None:
+        """Move a Gmail message to trash via ``GMAIL_MOVE_TO_TRASH``.
+
+        Returns the error message on failure, ``None`` on success.
+        """
+        try:
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GMAIL_MOVE_TO_TRASH",
+                params={"message_id": message_id},
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return result.get("error", "Unknown error")
+            return None
+        except Exception as e:
+            logger.error(f"Failed to trash Gmail message: {e!s}")
+            return str(e)
+
+    # ===== Google Calendar write methods =====
+
+    async def create_calendar_event(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        summary: str,
+        start_datetime: str,
+        end_datetime: str,
+        timezone: str | None = None,
+        description: str | None = None,
+        location: str | None = None,
+        attendees: list[str] | None = None,
+        calendar_id: str = "primary",
+    ) -> tuple[str | None, str | None, str | None]:
+        """Create a Google Calendar event via ``GOOGLECALENDAR_CREATE_EVENT``.
+
+        Composio strips trailing timezone info on ``start_datetime`` /
+        ``end_datetime`` and uses the ``timezone`` field as the IANA name,
+        so callers may pass ISO 8601 strings with or without offsets.
+
+        Returns:
+            Tuple of (event_id, html_link, error).
+        """
+        try:
+            params: dict[str, Any] = {
+                "summary": summary,
+                "start_datetime": start_datetime,
+                "end_datetime": end_datetime,
+                "calendar_id": calendar_id,
+            }
+            if timezone:
+                params["timezone"] = timezone
+            if description:
+                params["description"] = description
+            if location:
+                params["location"] = location
+            if attendees:
+                params["attendees"] = [a for a in attendees if a]
+
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GOOGLECALENDAR_CREATE_EVENT",
+                params=params,
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return None, None, result.get("error", "Unknown error")
+
+            payload = self._unwrap_response_data(result.get("data", {}))
+            event_id = None
+            html_link = None
+            if isinstance(payload, dict):
+                event_id = payload.get("id") or payload.get("event_id")
+                html_link = payload.get("htmlLink") or payload.get("html_link")
+            return event_id, html_link, None
+        except Exception as e:
+            logger.error(f"Failed to create Calendar event: {e!s}")
+            return None, None, str(e)
+
+    async def update_calendar_event(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        event_id: str,
+        summary: str | None = None,
+        start_time: str | None = None,
+        end_time: str | None = None,
+        timezone: str | None = None,
+        description: str | None = None,
+        location: str | None = None,
+        attendees: list[str] | None = None,
+        calendar_id: str = "primary",
+    ) -> tuple[str | None, str | None, str | None]:
+        """Patch an existing Google Calendar event via ``GOOGLECALENDAR_PATCH_EVENT``.
+
+        Uses PATCH (not PUT) semantics so omitted fields are preserved.
+
+        Returns:
+            Tuple of (event_id, html_link, error).
+        """
+        try:
+            params: dict[str, Any] = {
+                "event_id": event_id,
+                "calendar_id": calendar_id,
+            }
+            if summary is not None:
+                params["summary"] = summary
+            if start_time is not None:
+                params["start_time"] = start_time
+            if end_time is not None:
+                params["end_time"] = end_time
+            if timezone:
+                params["timezone"] = timezone
+            if description is not None:
+                params["description"] = description
+            if location is not None:
+                params["location"] = location
+            if attendees is not None:
+                params["attendees"] = [a for a in attendees if a]
+
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GOOGLECALENDAR_PATCH_EVENT",
+                params=params,
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return None, None, result.get("error", "Unknown error")
+
+            payload = self._unwrap_response_data(result.get("data", {}))
+            new_event_id = event_id
+            html_link = None
+            if isinstance(payload, dict):
+                new_event_id = payload.get("id") or payload.get("event_id") or event_id
+                html_link = payload.get("htmlLink") or payload.get("html_link")
+            return new_event_id, html_link, None
+        except Exception as e:
+            logger.error(f"Failed to patch Calendar event: {e!s}")
+            return None, None, str(e)
+
+    async def delete_calendar_event(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        event_id: str,
+        calendar_id: str = "primary",
+    ) -> str | None:
+        """Delete a Google Calendar event via ``GOOGLECALENDAR_DELETE_EVENT``.
+
+        Returns the error message on failure, ``None`` on success (idempotent
+        on already-deleted events).
+        """
+        try:
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GOOGLECALENDAR_DELETE_EVENT",
+                params={
+                    "event_id": event_id,
+                    "calendar_id": calendar_id,
+                },
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return result.get("error", "Unknown error")
+            return None
+        except Exception as e:
+            logger.error(f"Failed to delete Calendar event: {e!s}")
+            return str(e)
+
+    # ===== Google Drive write methods =====
+
+    @staticmethod
+    def _drive_web_view_link(file_id: str, mime_type: str | None) -> str:
+        """Synthesize a Google Drive ``webViewLink`` from id + mimeType.
+
+        Composio's ``GOOGLEDRIVE_CREATE_FILE_FROM_TEXT`` returns flat
+        metadata (id, name, mimeType) but does not always include a
+        ``webViewLink``. We rebuild the canonical UI URL based on the
+        Workspace MIME type so callers can keep using a single field.
+        """
+        if not file_id:
+            return ""
+        mt = (mime_type or "").lower()
+        if mt == "application/vnd.google-apps.document":
+            return f"https://docs.google.com/document/d/{file_id}/edit"
+        if mt == "application/vnd.google-apps.spreadsheet":
+            return f"https://docs.google.com/spreadsheets/d/{file_id}/edit"
+        if mt == "application/vnd.google-apps.presentation":
+            return f"https://docs.google.com/presentation/d/{file_id}/edit"
+        if mt == "application/vnd.google-apps.folder":
+            return f"https://drive.google.com/drive/folders/{file_id}"
+        return f"https://drive.google.com/file/d/{file_id}/view"
+
+    async def create_drive_file_from_text(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        name: str,
+        mime_type: str,
+        content: str | None = None,
+        parent_id: str | None = None,
+    ) -> tuple[dict[str, Any] | None, str | None]:
+        """Create a Google Drive file from text via ``GOOGLEDRIVE_CREATE_FILE_FROM_TEXT``.
+
+        Composio's tool requires ``text_content`` even for "empty" files;
+        an empty string is accepted. Native Workspace types (Docs, Sheets)
+        are produced by setting ``mime_type`` to the Google Apps MIME, and
+        Drive auto-converts the text payload (e.g. CSV → Sheet).
+
+        Returns:
+            Tuple of (file_meta, error). ``file_meta`` keys:
+            ``id``, ``name``, ``mimeType``, ``webViewLink``.
+        """
+        try:
+            params: dict[str, Any] = {
+                "file_name": name,
+                "mime_type": mime_type,
+                "text_content": content if content is not None else "",
+            }
+            if parent_id:
+                params["parent_id"] = parent_id
+
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GOOGLEDRIVE_CREATE_FILE_FROM_TEXT",
+                params=params,
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return None, result.get("error", "Unknown error")
+
+            payload = self._unwrap_response_data(result.get("data", {}))
+            file_id: str | None = None
+            file_name: str | None = name
+            mime: str | None = mime_type
+            web_view_link: str | None = None
+
+            if isinstance(payload, dict):
+                file_id = (
+                    payload.get("id") or payload.get("file_id") or payload.get("fileId")
+                )
+                file_name = payload.get("name") or payload.get("file_name") or name
+                mime = payload.get("mimeType") or payload.get("mime_type") or mime_type
+                web_view_link = payload.get("webViewLink") or payload.get(
+                    "web_view_link"
+                )
+
+            if not file_id:
+                return None, "Composio response did not include a file id"
+
+            if not web_view_link:
+                web_view_link = self._drive_web_view_link(file_id, mime)
+
+            return (
+                {
+                    "id": file_id,
+                    "name": file_name,
+                    "mimeType": mime,
+                    "webViewLink": web_view_link,
+                },
+                None,
+            )
+        except Exception as e:
+            logger.error(f"Failed to create Drive file: {e!s}")
+            return None, str(e)
+
+    async def trash_drive_file(
+        self,
+        connected_account_id: str,
+        entity_id: str,
+        file_id: str,
+    ) -> str | None:
+        """Move a Google Drive file to trash via ``GOOGLEDRIVE_TRASH_FILE``.
+
+        Returns the error message on failure, ``None`` on success.
+        """
+        try:
+            result = await self.execute_tool(
+                connected_account_id=connected_account_id,
+                tool_name="GOOGLEDRIVE_TRASH_FILE",
+                params={"file_id": file_id},
+                entity_id=entity_id,
+            )
+            if not result.get("success"):
+                return result.get("error", "Unknown error")
+            return None
+        except Exception as e:
+            logger.error(f"Failed to trash Drive file: {e!s}")
+            return str(e)
+
     # ===== User Info Methods =====
 
     async def get_connected_account_email(
diff --git a/surfsense_backend/app/services/confluence/kb_sync_service.py b/surfsense_backend/app/services/confluence/kb_sync_service.py
index f786a9920c..cae2bef880 100644
--- a/surfsense_backend/app/services/confluence/kb_sync_service.py
+++ b/surfsense_backend/app/services/confluence/kb_sync_service.py
@@ -5,7 +5,6 @@
 
 from app.connectors.confluence_history import ConfluenceHistoryConnector
 from app.db import Document, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -66,6 +65,8 @@ async def sync_after_create(
             if dup:
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
@@ -184,6 +185,8 @@ async def sync_after_update(
 
             space_id = (document.document_metadata or {}).get("space_id", "")
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session, user_id, search_space_id, disable_streaming=True
             )
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index 7c55da2e54..45bcfd00f8 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -1,6 +1,8 @@
 import asyncio
+import os
 import time
 from datetime import datetime
+from threading import Lock
 from typing import Any
 
 import httpx
@@ -2769,12 +2771,22 @@ async def get_available_connectors(
         """
         Get all available (enabled) connector types for a search space.
 
+        Phase 1.4: results are cached per ``search_space_id`` for
+        :data:`_DISCOVERY_TTL_SECONDS`. Cache key is independent of session
+        identity — the cached value is plain data, safe to share across
+        requests. Invalidate on connector add/update/delete via
+        :func:`invalidate_connector_discovery_cache`.
+
         Args:
             search_space_id: The search space ID
 
         Returns:
             List of SearchSourceConnectorType enums for enabled connectors
         """
+        cached = _get_cached_connectors(search_space_id)
+        if cached is not None:
+            return list(cached)
+
         query = (
             select(SearchSourceConnector.connector_type)
             .filter(
@@ -2784,8 +2796,9 @@ async def get_available_connectors(
         )
 
         result = await self.session.execute(query)
-        connector_types = result.scalars().all()
-        return list(connector_types)
+        connector_types = list(result.scalars().all())
+        _set_cached_connectors(search_space_id, connector_types)
+        return connector_types
 
     async def get_available_document_types(
         self,
@@ -2794,12 +2807,22 @@ async def get_available_document_types(
         """
         Get all document types that have at least one document in the search space.
 
+        Phase 1.4: cached per ``search_space_id`` for
+        :data:`_DISCOVERY_TTL_SECONDS`. Invalidate via
+        :func:`invalidate_connector_discovery_cache` when a connector
+        finishes indexing new documents (or document types are otherwise
+        added/removed).
+
         Args:
             search_space_id: The search space ID
 
         Returns:
             List of document type strings that have documents indexed
         """
+        cached = _get_cached_doc_types(search_space_id)
+        if cached is not None:
+            return list(cached)
+
         from sqlalchemy import distinct
 
         from app.db import Document
@@ -2809,5 +2832,164 @@ async def get_available_document_types(
         )
 
         result = await self.session.execute(query)
-        doc_types = result.scalars().all()
-        return [str(dt) for dt in doc_types]
+        doc_types = [str(dt) for dt in result.scalars().all()]
+        _set_cached_doc_types(search_space_id, doc_types)
+        return doc_types
+
+
+# ---------------------------------------------------------------------------
+# Connector / document-type discovery TTL cache (Phase 1.4)
+# ---------------------------------------------------------------------------
+#
+# Both ``get_available_connectors`` and ``get_available_document_types`` are
+# called on EVERY chat turn from ``create_surfsense_deep_agent``. Each query
+# hits Postgres and contributes to per-turn agent build latency. Their
+# results change infrequently — only when the user adds/edits/removes a
+# connector, or when an indexer commits a new document type. A short TTL
+# cache (default 30s, env-tunable) collapses N concurrent calls into one
+# DB roundtrip with bounded staleness.
+#
+# Invalidation: connector mutation routes (create / update / delete) call
+# ``invalidate_connector_discovery_cache(search_space_id)`` to clear the
+# entry for the affected space. Multi-replica deployments still pay one
+# DB roundtrip per replica per TTL window, which is fine — staleness is
+# bounded and the alternative (cross-replica fanout) is not worth the
+# coupling here.
+
+_DISCOVERY_TTL_SECONDS: float = float(
+    os.getenv("SURFSENSE_CONNECTOR_DISCOVERY_TTL_SECONDS", "30")
+)
+
+# Per-search-space caches. Keyed by ``search_space_id``; value is
+# ``(expires_at_monotonic, payload)``. Plain dicts protected by a lock —
+# read-mostly workload, sub-microsecond contention.
+_connectors_cache: dict[int, tuple[float, list[SearchSourceConnectorType]]] = {}
+_doc_types_cache: dict[int, tuple[float, list[str]]] = {}
+_cache_lock = Lock()
+
+
+def _get_cached_connectors(
+    search_space_id: int,
+) -> list[SearchSourceConnectorType] | None:
+    if _DISCOVERY_TTL_SECONDS <= 0:
+        return None
+    with _cache_lock:
+        entry = _connectors_cache.get(search_space_id)
+        if entry is None:
+            return None
+        expires_at, payload = entry
+        if time.monotonic() >= expires_at:
+            _connectors_cache.pop(search_space_id, None)
+            return None
+        return payload
+
+
+def _set_cached_connectors(
+    search_space_id: int, payload: list[SearchSourceConnectorType]
+) -> None:
+    if _DISCOVERY_TTL_SECONDS <= 0:
+        return
+    expires_at = time.monotonic() + _DISCOVERY_TTL_SECONDS
+    with _cache_lock:
+        _connectors_cache[search_space_id] = (expires_at, list(payload))
+
+
+def _get_cached_doc_types(search_space_id: int) -> list[str] | None:
+    if _DISCOVERY_TTL_SECONDS <= 0:
+        return None
+    with _cache_lock:
+        entry = _doc_types_cache.get(search_space_id)
+        if entry is None:
+            return None
+        expires_at, payload = entry
+        if time.monotonic() >= expires_at:
+            _doc_types_cache.pop(search_space_id, None)
+            return None
+        return payload
+
+
+def _set_cached_doc_types(search_space_id: int, payload: list[str]) -> None:
+    if _DISCOVERY_TTL_SECONDS <= 0:
+        return
+    expires_at = time.monotonic() + _DISCOVERY_TTL_SECONDS
+    with _cache_lock:
+        _doc_types_cache[search_space_id] = (expires_at, list(payload))
+
+
+def invalidate_connector_discovery_cache(search_space_id: int | None = None) -> None:
+    """Drop cached discovery results for ``search_space_id`` (or all spaces).
+
+    Connector CRUD routes / indexer pipelines call this when they mutate
+    the rows backing :func:`ConnectorService.get_available_connectors` /
+    :func:`get_available_document_types`. ``None`` clears every space —
+    useful in tests and on bulk imports.
+    """
+    with _cache_lock:
+        if search_space_id is None:
+            _connectors_cache.clear()
+            _doc_types_cache.clear()
+        else:
+            _connectors_cache.pop(search_space_id, None)
+            _doc_types_cache.pop(search_space_id, None)
+
+
+def _invalidate_connectors_only(search_space_id: int | None = None) -> None:
+    with _cache_lock:
+        if search_space_id is None:
+            _connectors_cache.clear()
+        else:
+            _connectors_cache.pop(search_space_id, None)
+
+
+def _invalidate_doc_types_only(search_space_id: int | None = None) -> None:
+    with _cache_lock:
+        if search_space_id is None:
+            _doc_types_cache.clear()
+        else:
+            _doc_types_cache.pop(search_space_id, None)
+
+
+def _register_invalidation_listeners() -> None:
+    """Wire SQLAlchemy ORM events so cache stays consistent automatically.
+
+    Listening on ``after_insert`` / ``after_update`` / ``after_delete``
+    means every successful INSERT/UPDATE/DELETE that goes through the ORM
+    invalidates the affected search space's cached discovery payload —
+    no need to sprinkle ``invalidate_*`` calls across 30+ connector
+    routes. Bulk operations that bypass the ORM (e.g.
+    ``session.execute(insert(...))`` without a mapped object) still need
+    explicit invalidation; document indexers already commit through the
+    ORM so document-type discovery is covered.
+    """
+    from sqlalchemy import event
+
+    # Imported here (not at module top) to avoid a circular import:
+    # app.services.connector_service is itself imported from app.db's
+    # ecosystem indirectly via several CRUD modules.
+    from app.db import Document, SearchSourceConnector
+
+    def _connector_changed(_mapper, _connection, target) -> None:
+        sid = getattr(target, "search_space_id", None)
+        if sid is not None:
+            _invalidate_connectors_only(int(sid))
+
+    def _document_changed(_mapper, _connection, target) -> None:
+        sid = getattr(target, "search_space_id", None)
+        if sid is not None:
+            _invalidate_doc_types_only(int(sid))
+
+    for evt in ("after_insert", "after_update", "after_delete"):
+        event.listen(SearchSourceConnector, evt, _connector_changed)
+        event.listen(Document, evt, _document_changed)
+
+
+try:
+    _register_invalidation_listeners()
+except Exception:  # pragma: no cover - defensive; never block module import
+    import logging as _logging
+
+    _logging.getLogger(__name__).exception(
+        "Failed to register connector discovery cache invalidation listeners; "
+        "stale cache risk: explicit invalidate_connector_discovery_cache calls "
+        "may be required."
+    )
diff --git a/surfsense_backend/app/services/docling_service.py b/surfsense_backend/app/services/docling_service.py
index af9a7d2d59..cf51efb4aa 100644
--- a/surfsense_backend/app/services/docling_service.py
+++ b/surfsense_backend/app/services/docling_service.py
@@ -77,10 +77,16 @@ def _initialize_docling(self):
             # Create pipeline options with version-safe attribute checking
             pipeline_options = PdfPipelineOptions()
 
-            # Disable OCR (user request)
+            # Enable OCR so text-in-image (chart axes, ECG annotations,
+            # lab tables embedded as images, scanned pages, etc.) is
+            # lifted into the main markdown stream. This pairs with the
+            # vision-LLM picture-description pass downstream — OCR
+            # captures literal text; vision LLM captures the visual
+            # content. Together they give a faithful representation of
+            # PDFs that mix text and images.
             if hasattr(pipeline_options, "do_ocr"):
-                pipeline_options.do_ocr = False
-                logger.info("⚠️ OCR disabled by user request")
+                pipeline_options.do_ocr = True
+                logger.info("✅ OCR enabled for embedded text-in-image extraction")
             else:
                 logger.warning("⚠️ OCR attribute not available in this Docling version")
 
diff --git a/surfsense_backend/app/services/dropbox/kb_sync_service.py b/surfsense_backend/app/services/dropbox/kb_sync_service.py
index 2a74bdf4b9..9d19510132 100644
--- a/surfsense_backend/app/services/dropbox/kb_sync_service.py
+++ b/surfsense_backend/app/services/dropbox/kb_sync_service.py
@@ -5,7 +5,6 @@
 
 from app.db import Document, DocumentType
 from app.indexing_pipeline.document_hashing import compute_identifier_hash
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -73,6 +72,8 @@ async def sync_after_create(
                 )
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py
new file mode 100644
index 0000000000..97f9522233
--- /dev/null
+++ b/surfsense_backend/app/services/export_service.py
@@ -0,0 +1,200 @@
+"""Service for exporting knowledge base content as a ZIP archive."""
+
+import asyncio
+import logging
+import os
+import tempfile
+import zipfile
+from dataclasses import dataclass, field
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.db import Chunk, Document, Folder
+from app.services.folder_service import get_folder_subtree_ids
+
+logger = logging.getLogger(__name__)
+
+
+def _sanitize_filename(title: str) -> str:
+    safe = "".join(c if c.isalnum() or c in " -_." else "_" for c in title).strip()
+    return safe[:80] or "document"
+
+
+def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]:
+    """Build a mapping of folder_id -> full path string (e.g. 'Research/AI')."""
+    id_to_folder = {f.id: f for f in folders}
+    cache: dict[int, str] = {}
+
+    def resolve(folder_id: int) -> str:
+        if folder_id in cache:
+            return cache[folder_id]
+        folder = id_to_folder[folder_id]
+        safe_name = _sanitize_filename(folder.name)
+        if folder.parent_id is None or folder.parent_id not in id_to_folder:
+            cache[folder_id] = safe_name
+        else:
+            cache[folder_id] = f"{resolve(folder.parent_id)}/{safe_name}"
+        return cache[folder_id]
+
+    for f in folders:
+        resolve(f.id)
+
+    return cache
+
+
+async def _get_document_markdown(
+    session: AsyncSession, document: Document
+) -> str | None:
+    """Resolve markdown content using the 3-tier fallback:
+    1. source_markdown  2. blocknote_document conversion  3. chunk concatenation
+    """
+    if document.source_markdown is not None:
+        return document.source_markdown
+
+    if document.blocknote_document:
+        from app.utils.blocknote_to_markdown import blocknote_to_markdown
+
+        md = blocknote_to_markdown(document.blocknote_document)
+        if md:
+            return md
+
+    chunk_result = await session.execute(
+        select(Chunk.content)
+        .filter(Chunk.document_id == document.id)
+        .order_by(Chunk.id)
+    )
+    chunks = chunk_result.scalars().all()
+    if chunks:
+        return "\n\n".join(chunks)
+
+    return None
+
+
+@dataclass
+class ExportResult:
+    zip_path: str
+    export_name: str
+    zip_size: int
+    skipped_docs: list[str] = field(default_factory=list)
+
+
+async def build_export_zip(
+    session: AsyncSession,
+    search_space_id: int,
+    folder_id: int | None = None,
+) -> ExportResult:
+    """Build a ZIP archive of markdown documents preserving folder structure.
+
+    Returns an ExportResult with the path to the temp ZIP file.
+    The caller is responsible for streaming and cleaning up the file.
+
+    Raises ValueError if folder_id is provided but not found.
+    """
+    if folder_id is not None:
+        folder = await session.get(Folder, folder_id)
+        if not folder or folder.search_space_id != search_space_id:
+            raise ValueError("Folder not found")
+        target_folder_ids = set(await get_folder_subtree_ids(session, folder_id))
+    else:
+        target_folder_ids = None
+
+    folder_query = select(Folder).where(Folder.search_space_id == search_space_id)
+    if target_folder_ids is not None:
+        folder_query = folder_query.where(Folder.id.in_(target_folder_ids))
+    folder_result = await session.execute(folder_query)
+    folders = list(folder_result.scalars().all())
+
+    folder_path_map = _build_folder_path_map(folders)
+
+    batch_size = 100
+
+    base_doc_query = select(Document).where(Document.search_space_id == search_space_id)
+    if target_folder_ids is not None:
+        base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids))
+    base_doc_query = base_doc_query.order_by(Document.id)
+
+    fd, tmp_path = tempfile.mkstemp(suffix=".zip")
+    os.close(fd)
+
+    used_paths: dict[str, int] = {}
+    skipped_docs: list[str] = []
+    is_first_batch = True
+
+    try:
+        offset = 0
+        while True:
+            batch_query = base_doc_query.limit(batch_size).offset(offset)
+            batch_result = await session.execute(batch_query)
+            documents = list(batch_result.scalars().all())
+            if not documents:
+                break
+
+            entries: list[tuple[str, str]] = []
+
+            for doc in documents:
+                status = doc.status or {}
+                state = (
+                    status.get("state", "ready")
+                    if isinstance(status, dict)
+                    else "ready"
+                )
+                if state in ("pending", "processing"):
+                    skipped_docs.append(doc.title or "Untitled")
+                    continue
+
+                markdown = await _get_document_markdown(session, doc)
+                if not markdown or not markdown.strip():
+                    continue
+
+                if doc.folder_id and doc.folder_id in folder_path_map:
+                    dir_path = folder_path_map[doc.folder_id]
+                else:
+                    dir_path = ""
+
+                base_name = _sanitize_filename(doc.title or "Untitled")
+                file_path = (
+                    f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md"
+                )
+
+                if file_path in used_paths:
+                    used_paths[file_path] += 1
+                    suffix = used_paths[file_path]
+                    file_path = (
+                        f"{dir_path}/{base_name}_{suffix}.md"
+                        if dir_path
+                        else f"{base_name}_{suffix}.md"
+                    )
+                used_paths[file_path] = used_paths.get(file_path, 0) + 1
+
+                entries.append((file_path, markdown))
+
+            if entries:
+                mode = "w" if is_first_batch else "a"
+                batch_entries = entries
+
+                def _write_batch(m: str = mode, e: list = batch_entries) -> None:
+                    with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf:
+                        for path, content in e:
+                            zf.writestr(path, content)
+
+                await asyncio.to_thread(_write_batch)
+                is_first_batch = False
+
+            offset += batch_size
+
+        export_name = "knowledge-base"
+        if folder_id is not None and folder_id in folder_path_map:
+            export_name = _sanitize_filename(folder_path_map[folder_id].split("/")[0])
+
+        return ExportResult(
+            zip_path=tmp_path,
+            export_name=export_name,
+            zip_size=os.path.getsize(tmp_path),
+            skipped_docs=skipped_docs,
+        )
+
+    except Exception:
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+        raise
diff --git a/surfsense_backend/app/services/folder_service.py b/surfsense_backend/app/services/folder_service.py
index fc1fb8a75b..f5b608600f 100644
--- a/surfsense_backend/app/services/folder_service.py
+++ b/surfsense_backend/app/services/folder_service.py
@@ -142,6 +142,58 @@ async def generate_folder_position(
     return generate_key_between(last_position, None)
 
 
+async def ensure_folder_hierarchy_with_depth_validation(
+    session: AsyncSession,
+    search_space_id: int,
+    path_segments: list[dict],
+) -> Folder:
+    """Create or return a nested folder chain, validating depth at each step.
+
+    Each item in ``path_segments`` is a dict with:
+      - ``name``  (str): folder display name
+      - ``metadata`` (dict | None): optional ``folder_metadata`` JSONB payload
+
+    Returns the deepest (leaf) Folder in the chain.
+    """
+    parent_id: int | None = None
+    current_folder: Folder | None = None
+
+    for segment in path_segments:
+        name = segment["name"]
+        metadata = segment.get("metadata")
+
+        stmt = select(Folder).where(
+            Folder.search_space_id == search_space_id,
+            Folder.name == name,
+            Folder.parent_id == parent_id
+            if parent_id is not None
+            else Folder.parent_id.is_(None),
+        )
+        result = await session.execute(stmt)
+        folder = result.scalar_one_or_none()
+
+        if folder is None:
+            await validate_folder_depth(session, parent_id, subtree_depth=0)
+            position = await generate_folder_position(
+                session, search_space_id, parent_id
+            )
+            folder = Folder(
+                name=name,
+                search_space_id=search_space_id,
+                parent_id=parent_id,
+                position=position,
+                folder_metadata=metadata,
+            )
+            session.add(folder)
+            await session.flush()
+
+        current_folder = folder
+        parent_id = folder.id
+
+    assert current_folder is not None, "path_segments must not be empty"
+    return current_folder
+
+
 async def get_folder_subtree_ids(session: AsyncSession, folder_id: int) -> list[int]:
     """Return all folder IDs in the subtree rooted at folder_id (inclusive)."""
     result = await session.execute(
diff --git a/surfsense_backend/app/services/gmail/kb_sync_service.py b/surfsense_backend/app/services/gmail/kb_sync_service.py
index b3b50d3056..885ee4b940 100644
--- a/surfsense_backend/app/services/gmail/kb_sync_service.py
+++ b/surfsense_backend/app/services/gmail/kb_sync_service.py
@@ -4,7 +4,6 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.db import Document, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -78,6 +77,8 @@ async def sync_after_create(
                 )
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
diff --git a/surfsense_backend/app/services/gmail/tool_metadata_service.py b/surfsense_backend/app/services/gmail/tool_metadata_service.py
index c903e24afc..4855c1cc9d 100644
--- a/surfsense_backend/app/services/gmail/tool_metadata_service.py
+++ b/surfsense_backend/app/services/gmail/tool_metadata_service.py
@@ -17,7 +17,7 @@
     SearchSourceConnector,
     SearchSourceConnectorType,
 )
-from app.utils.google_credentials import build_composio_credentials
+from app.services.composio_service import ComposioService
 
 logger = logging.getLogger(__name__)
 
@@ -78,14 +78,49 @@ class GmailToolMetadataService:
     def __init__(self, db_session: AsyncSession):
         self._db_session = db_session
 
-    async def _build_credentials(self, connector: SearchSourceConnector) -> Credentials:
-        if (
+    def _is_composio_connector(self, connector: SearchSourceConnector) -> bool:
+        return (
             connector.connector_type
             == SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR
-        ):
-            cca_id = connector.config.get("composio_connected_account_id")
-            if cca_id:
-                return build_composio_credentials(cca_id)
+        )
+
+    def _get_composio_connected_account_id(
+        self, connector: SearchSourceConnector
+    ) -> str:
+        cca_id = connector.config.get("composio_connected_account_id")
+        if not cca_id:
+            raise ValueError("Composio connected_account_id not found")
+        return cca_id
+
+    def _unwrap_composio_data(self, data: Any) -> Any:
+        if isinstance(data, dict):
+            inner = data.get("data", data)
+            if isinstance(inner, dict):
+                return inner.get("response_data", inner)
+            return inner
+        return data
+
+    async def _execute_composio_gmail_tool(
+        self,
+        connector: SearchSourceConnector,
+        tool_name: str,
+        params: dict[str, Any],
+    ) -> tuple[Any, str | None]:
+        result = await ComposioService().execute_tool(
+            connected_account_id=self._get_composio_connected_account_id(connector),
+            tool_name=tool_name,
+            params=params,
+            entity_id=f"surfsense_{connector.user_id}",
+        )
+        if not result.get("success"):
+            return None, result.get("error", "Unknown Composio Gmail error")
+        return self._unwrap_composio_data(result.get("data")), None
+
+    async def _build_credentials(self, connector: SearchSourceConnector) -> Credentials:
+        if self._is_composio_connector(connector):
+            raise ValueError(
+                "Composio Gmail connectors must use Composio tool execution"
+            )
 
         config_data = dict(connector.config)
 
@@ -139,6 +174,12 @@ async def _check_account_health(self, connector_id: int) -> bool:
             if not connector:
                 return True
 
+            if self._is_composio_connector(connector):
+                _profile, error = await self._execute_composio_gmail_tool(
+                    connector, "GMAIL_GET_PROFILE", {"user_id": "me"}
+                )
+                return bool(error)
+
             creds = await self._build_credentials(connector)
             service = build("gmail", "v1", credentials=creds)
             await asyncio.get_event_loop().run_in_executor(
@@ -221,14 +262,21 @@ async def get_creation_context(self, search_space_id: int, user_id: str) -> dict
                     )
                     connector = result.scalar_one_or_none()
                     if connector:
-                        creds = await self._build_credentials(connector)
-                        service = build("gmail", "v1", credentials=creds)
-                        profile = await asyncio.get_event_loop().run_in_executor(
-                            None,
-                            lambda service=service: (
-                                service.users().getProfile(userId="me").execute()
-                            ),
-                        )
+                        if self._is_composio_connector(connector):
+                            profile, error = await self._execute_composio_gmail_tool(
+                                connector, "GMAIL_GET_PROFILE", {"user_id": "me"}
+                            )
+                            if error:
+                                raise RuntimeError(error)
+                        else:
+                            creds = await self._build_credentials(connector)
+                            service = build("gmail", "v1", credentials=creds)
+                            profile = await asyncio.get_event_loop().run_in_executor(
+                                None,
+                                lambda service=service: (
+                                    service.users().getProfile(userId="me").execute()
+                                ),
+                            )
                         acc_dict["email"] = profile.get("emailAddress", "")
                 except Exception:
                     logger.warning(
@@ -298,6 +346,23 @@ async def _fetch_draft_body(
         Returns ``None`` on any failure so callers can degrade gracefully.
         """
         try:
+            if self._is_composio_connector(connector):
+                if not draft_id:
+                    draft_id = await self._find_composio_draft_id(connector, message_id)
+                if not draft_id:
+                    return None
+
+                draft, error = await self._execute_composio_gmail_tool(
+                    connector,
+                    "GMAIL_GET_DRAFT",
+                    {"user_id": "me", "draft_id": draft_id, "format": "full"},
+                )
+                if error or not isinstance(draft, dict):
+                    return None
+
+                payload = draft.get("message", {}).get("payload", {})
+                return self._extract_body_from_payload(payload)
+
             creds = await self._build_credentials(connector)
             service = build("gmail", "v1", credentials=creds)
 
@@ -326,6 +391,33 @@ async def _fetch_draft_body(
             )
             return None
 
+    async def _find_composio_draft_id(
+        self, connector: SearchSourceConnector, message_id: str
+    ) -> str | None:
+        page_token = ""
+        while True:
+            params: dict[str, Any] = {
+                "user_id": "me",
+                "max_results": 100,
+                "verbose": False,
+            }
+            if page_token:
+                params["page_token"] = page_token
+
+            data, error = await self._execute_composio_gmail_tool(
+                connector, "GMAIL_LIST_DRAFTS", params
+            )
+            if error or not isinstance(data, dict):
+                return None
+
+            for draft in data.get("drafts", []):
+                if draft.get("message", {}).get("id") == message_id:
+                    return draft.get("id")
+
+            page_token = data.get("nextPageToken") or data.get("next_page_token") or ""
+            if not page_token:
+                return None
+
     async def _find_draft_id(self, service: Any, message_id: str) -> str | None:
         """Resolve a draft ID from its message ID by scanning drafts.list."""
         try:
diff --git a/surfsense_backend/app/services/google_calendar/kb_sync_service.py b/surfsense_backend/app/services/google_calendar/kb_sync_service.py
index 3cda02b9b5..602a557389 100644
--- a/surfsense_backend/app/services/google_calendar/kb_sync_service.py
+++ b/surfsense_backend/app/services/google_calendar/kb_sync_service.py
@@ -14,7 +14,7 @@
     SearchSourceConnector,
     SearchSourceConnectorType,
 )
-from app.services.llm_service import get_user_long_context_llm
+from app.services.composio_service import ComposioService
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -22,7 +22,6 @@
     generate_document_summary,
     generate_unique_identifier_hash,
 )
-from app.utils.google_credentials import build_composio_credentials
 
 logger = logging.getLogger(__name__)
 
@@ -91,6 +90,8 @@ async def sync_after_create(
                 )
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
@@ -202,23 +203,46 @@ async def sync_after_update(
                 logger.warning("Document %s not found in KB", document_id)
                 return {"status": "not_indexed"}
 
-            creds = await self._build_credentials_for_connector(connector_id)
-            loop = asyncio.get_event_loop()
-            service = await loop.run_in_executor(
-                None, lambda: build("calendar", "v3", credentials=creds)
-            )
-
             calendar_id = (document.document_metadata or {}).get(
                 "calendar_id"
             ) or "primary"
-            live_event = await loop.run_in_executor(
-                None,
-                lambda: (
-                    service.events()
-                    .get(calendarId=calendar_id, eventId=event_id)
-                    .execute()
-                ),
-            )
+            connector = await self._get_connector(connector_id)
+            if (
+                connector.connector_type
+                == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
+            ):
+                cca_id = connector.config.get("composio_connected_account_id")
+                if not cca_id:
+                    raise ValueError("Composio connected_account_id not found")
+                composio_result = await ComposioService().execute_tool(
+                    connected_account_id=cca_id,
+                    tool_name="GOOGLECALENDAR_EVENTS_GET",
+                    params={"calendar_id": calendar_id, "event_id": event_id},
+                    entity_id=f"surfsense_{user_id}",
+                )
+                if not composio_result.get("success"):
+                    raise RuntimeError(
+                        composio_result.get("error", "Unknown Composio Calendar error")
+                    )
+                live_event = composio_result.get("data", {})
+                if isinstance(live_event, dict):
+                    live_event = live_event.get("data", live_event)
+                    if isinstance(live_event, dict):
+                        live_event = live_event.get("response_data", live_event)
+            else:
+                creds = await self._build_credentials_for_connector(connector_id)
+                loop = asyncio.get_event_loop()
+                service = await loop.run_in_executor(
+                    None, lambda: build("calendar", "v3", credentials=creds)
+                )
+                live_event = await loop.run_in_executor(
+                    None,
+                    lambda: (
+                        service.events()
+                        .get(calendarId=calendar_id, eventId=event_id)
+                        .execute()
+                    ),
+                )
 
             event_summary = live_event.get("summary", "")
             description = live_event.get("description", "")
@@ -249,6 +273,8 @@ async def sync_after_update(
             if not indexable_content:
                 return {"status": "error", "message": "Event produced empty content"}
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session, user_id, search_space_id, disable_streaming=True
             )
@@ -319,7 +345,7 @@ async def sync_after_update(
             await self.db_session.rollback()
             return {"status": "error", "message": str(e)}
 
-    async def _build_credentials_for_connector(self, connector_id: int) -> Credentials:
+    async def _get_connector(self, connector_id: int) -> SearchSourceConnector:
         result = await self.db_session.execute(
             select(SearchSourceConnector).where(
                 SearchSourceConnector.id == connector_id
@@ -328,15 +354,17 @@ async def _build_credentials_for_connector(self, connector_id: int) -> Credentia
         connector = result.scalar_one_or_none()
         if not connector:
             raise ValueError(f"Connector {connector_id} not found")
+        return connector
 
+    async def _build_credentials_for_connector(self, connector_id: int) -> Credentials:
+        connector = await self._get_connector(connector_id)
         if (
             connector.connector_type
             == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
         ):
-            cca_id = connector.config.get("composio_connected_account_id")
-            if cca_id:
-                return build_composio_credentials(cca_id)
-            raise ValueError("Composio connected_account_id not found")
+            raise ValueError(
+                "Composio Calendar connectors must use Composio tool execution"
+            )
 
         config_data = dict(connector.config)
 
diff --git a/surfsense_backend/app/services/google_calendar/tool_metadata_service.py b/surfsense_backend/app/services/google_calendar/tool_metadata_service.py
index c7bfe1d50f..7e50ab0397 100644
--- a/surfsense_backend/app/services/google_calendar/tool_metadata_service.py
+++ b/surfsense_backend/app/services/google_calendar/tool_metadata_service.py
@@ -16,7 +16,7 @@
     SearchSourceConnector,
     SearchSourceConnectorType,
 )
-from app.utils.google_credentials import build_composio_credentials
+from app.services.composio_service import ComposioService
 
 logger = logging.getLogger(__name__)
 
@@ -94,15 +94,49 @@ class GoogleCalendarToolMetadataService:
     def __init__(self, db_session: AsyncSession):
         self._db_session = db_session
 
-    async def _build_credentials(self, connector: SearchSourceConnector) -> Credentials:
-        if (
+    def _is_composio_connector(self, connector: SearchSourceConnector) -> bool:
+        return (
             connector.connector_type
             == SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
-        ):
-            cca_id = connector.config.get("composio_connected_account_id")
-            if cca_id:
-                return build_composio_credentials(cca_id)
+        )
+
+    def _get_composio_connected_account_id(
+        self, connector: SearchSourceConnector
+    ) -> str:
+        cca_id = connector.config.get("composio_connected_account_id")
+        if not cca_id:
             raise ValueError("Composio connected_account_id not found")
+        return cca_id
+
+    async def _execute_composio_calendar_tool(
+        self,
+        connector: SearchSourceConnector,
+        tool_name: str,
+        params: dict,
+    ) -> tuple[dict | list | None, str | None]:
+        service = ComposioService()
+        result = await service.execute_tool(
+            connected_account_id=self._get_composio_connected_account_id(connector),
+            tool_name=tool_name,
+            params=params,
+            entity_id=f"surfsense_{connector.user_id}",
+        )
+        if not result.get("success"):
+            return None, result.get("error", "Unknown Composio Calendar error")
+
+        data = result.get("data")
+        if isinstance(data, dict):
+            inner = data.get("data", data)
+            if isinstance(inner, dict):
+                return inner.get("response_data", inner), None
+            return inner, None
+        return data, None
+
+    async def _build_credentials(self, connector: SearchSourceConnector) -> Credentials:
+        if self._is_composio_connector(connector):
+            raise ValueError(
+                "Composio Calendar connectors must use Composio tool execution"
+            )
 
         config_data = dict(connector.config)
 
@@ -156,6 +190,14 @@ async def _check_account_health(self, connector_id: int) -> bool:
             if not connector:
                 return True
 
+            if self._is_composio_connector(connector):
+                _data, error = await self._execute_composio_calendar_tool(
+                    connector,
+                    "GOOGLECALENDAR_GET_CALENDAR",
+                    {"calendar_id": "primary"},
+                )
+                return bool(error)
+
             creds = await self._build_credentials(connector)
             loop = asyncio.get_event_loop()
             await loop.run_in_executor(
@@ -255,16 +297,48 @@ async def get_creation_context(self, search_space_id: int, user_id: str) -> dict
         timezone_str = ""
         if connector:
             try:
-                creds = await self._build_credentials(connector)
-                loop = asyncio.get_event_loop()
-                service = await loop.run_in_executor(
-                    None, lambda: build("calendar", "v3", credentials=creds)
-                )
+                if self._is_composio_connector(connector):
+                    cal_list, cal_error = await self._execute_composio_calendar_tool(
+                        connector, "GOOGLECALENDAR_LIST_CALENDARS", {}
+                    )
+                    if cal_error:
+                        raise RuntimeError(cal_error)
+                    (
+                        settings,
+                        settings_error,
+                    ) = await self._execute_composio_calendar_tool(
+                        connector,
+                        "GOOGLECALENDAR_SETTINGS_GET",
+                        {"setting": "timezone"},
+                    )
+                    if not settings_error and isinstance(settings, dict):
+                        timezone_str = settings.get("value", "")
+                else:
+                    creds = await self._build_credentials(connector)
+                    loop = asyncio.get_event_loop()
+                    service = await loop.run_in_executor(
+                        None, lambda: build("calendar", "v3", credentials=creds)
+                    )
 
-                cal_list = await loop.run_in_executor(
-                    None, lambda: service.calendarList().list().execute()
-                )
-                for cal in cal_list.get("items", []):
+                    cal_list = await loop.run_in_executor(
+                        None, lambda: service.calendarList().list().execute()
+                    )
+
+                    tz_setting = await loop.run_in_executor(
+                        None,
+                        lambda: service.settings().get(setting="timezone").execute(),
+                    )
+                    timezone_str = tz_setting.get("value", "")
+
+                calendar_items = []
+                if isinstance(cal_list, dict):
+                    calendar_items = (
+                        cal_list.get("items") or cal_list.get("calendars") or []
+                    )
+                elif isinstance(cal_list, list):
+                    calendar_items = cal_list
+
+                for cal in calendar_items:
                     calendars.append(
                         {
                             "id": cal.get("id", ""),
@@ -272,12 +346,6 @@ async def get_creation_context(self, search_space_id: int, user_id: str) -> dict
                             "primary": cal.get("primary", False),
                         }
                     )
-
-                tz_setting = await loop.run_in_executor(
-                    None,
-                    lambda: service.settings().get(setting="timezone").execute(),
-                )
-                timezone_str = tz_setting.get("value", "")
             except Exception:
                 logger.warning(
                     "Failed to fetch calendars/timezone for connector %s",
@@ -321,20 +389,29 @@ async def get_update_context(
 
         event_dict = event.to_dict()
         try:
-            creds = await self._build_credentials(connector)
-            loop = asyncio.get_event_loop()
-            service = await loop.run_in_executor(
-                None, lambda: build("calendar", "v3", credentials=creds)
-            )
             calendar_id = event.calendar_id or "primary"
-            live_event = await loop.run_in_executor(
-                None,
-                lambda: (
-                    service.events()
-                    .get(calendarId=calendar_id, eventId=event.event_id)
-                    .execute()
-                ),
-            )
+            if self._is_composio_connector(connector):
+                live_event, error = await self._execute_composio_calendar_tool(
+                    connector,
+                    "GOOGLECALENDAR_EVENTS_GET",
+                    {"calendar_id": calendar_id, "event_id": event.event_id},
+                )
+                if error:
+                    raise RuntimeError(error)
+            else:
+                creds = await self._build_credentials(connector)
+                loop = asyncio.get_event_loop()
+                service = await loop.run_in_executor(
+                    None, lambda: build("calendar", "v3", credentials=creds)
+                )
+                live_event = await loop.run_in_executor(
+                    None,
+                    lambda: (
+                        service.events()
+                        .get(calendarId=calendar_id, eventId=event.event_id)
+                        .execute()
+                    ),
+                )
 
             event_dict["summary"] = live_event.get("summary", event_dict["summary"])
             event_dict["description"] = live_event.get(
@@ -376,12 +453,30 @@ async def get_deletion_context(
     ) -> dict:
         resolved = await self._resolve_event(search_space_id, user_id, event_ref)
         if not resolved:
+            live_resolved = await self._resolve_live_event(
+                search_space_id, user_id, event_ref
+            )
+            if not live_resolved:
+                return {
+                    "error": (
+                        f"Event '{event_ref}' not found in your indexed or live Google Calendar events. "
+                        "This could mean: (1) the event doesn't exist, "
+                        "(2) the event name is different, or "
+                        "(3) the connected calendar account cannot access it."
+                    )
+                }
+
+            connector, live_event = live_resolved
+            account = GoogleCalendarAccount.from_connector(connector)
+            acc_dict = account.to_dict()
+            auth_expired = await self._check_account_health(connector.id)
+            acc_dict["auth_expired"] = auth_expired
+            if auth_expired:
+                await self._persist_auth_expired(connector.id)
+
             return {
-                "error": (
-                    f"Event '{event_ref}' not found in your indexed Google Calendar events. "
-                    "This could mean: (1) the event doesn't exist, (2) it hasn't been indexed yet, "
-                    "or (3) the event name is different."
-                )
+                "account": acc_dict,
+                "event": self._event_dict_from_live_event(live_event),
             }
 
         document, connector = resolved
@@ -429,3 +524,110 @@ async def _resolve_event(
         if row:
             return row[0], row[1]
         return None
+
+    async def _resolve_live_event(
+        self, search_space_id: int, user_id: str, event_ref: str
+    ) -> tuple[SearchSourceConnector, dict] | None:
+        result = await self._db_session.execute(
+            select(SearchSourceConnector)
+            .filter(
+                and_(
+                    SearchSourceConnector.search_space_id == search_space_id,
+                    SearchSourceConnector.user_id == user_id,
+                    SearchSourceConnector.connector_type.in_(CALENDAR_CONNECTOR_TYPES),
+                )
+            )
+            .order_by(SearchSourceConnector.last_indexed_at.desc())
+        )
+        connectors = result.scalars().all()
+
+        for connector in connectors:
+            try:
+                events = await self._search_live_events(connector, event_ref)
+            except Exception:
+                logger.warning(
+                    "Failed to search live calendar events for connector %s",
+                    connector.id,
+                    exc_info=True,
+                )
+                continue
+
+            if not events:
+                continue
+
+            normalized_ref = event_ref.strip().lower()
+            exact_match = next(
+                (
+                    event
+                    for event in events
+                    if event.get("summary", "").strip().lower() == normalized_ref
+                ),
+                None,
+            )
+            return connector, exact_match or events[0]
+
+        return None
+
+    async def _search_live_events(
+        self, connector: SearchSourceConnector, event_ref: str
+    ) -> list[dict]:
+        if self._is_composio_connector(connector):
+            data, error = await self._execute_composio_calendar_tool(
+                connector,
+                "GOOGLECALENDAR_EVENTS_LIST",
+                {
+                    "calendar_id": "primary",
+                    "q": event_ref,
+                    "max_results": 10,
+                    "single_events": True,
+                    "order_by": "startTime",
+                },
+            )
+            if error:
+                raise RuntimeError(error)
+            if isinstance(data, dict):
+                return data.get("items") or data.get("events") or []
+            return data if isinstance(data, list) else []
+
+        creds = await self._build_credentials(connector)
+        loop = asyncio.get_event_loop()
+        service = await loop.run_in_executor(
+            None, lambda: build("calendar", "v3", credentials=creds)
+        )
+        response = await loop.run_in_executor(
+            None,
+            lambda: (
+                service.events()
+                .list(
+                    calendarId="primary",
+                    q=event_ref,
+                    maxResults=10,
+                    singleEvents=True,
+                    orderBy="startTime",
+                )
+                .execute()
+            ),
+        )
+        return response.get("items", [])
+
+    def _event_dict_from_live_event(self, event: dict) -> dict:
+        start_data = event.get("start", {})
+        end_data = event.get("end", {})
+        return {
+            "event_id": event.get("id", ""),
+            "summary": event.get("summary", "No Title"),
+            "start": start_data.get("dateTime", start_data.get("date", "")),
+            "end": end_data.get("dateTime", end_data.get("date", "")),
+            "description": event.get("description", ""),
+            "location": event.get("location", ""),
+            "attendees": [
+                {
+                    "email": attendee.get("email", ""),
+                    "responseStatus": attendee.get("responseStatus", ""),
+                }
+                for attendee in event.get("attendees", [])
+            ],
+            "calendar_id": event.get("calendarId", "primary"),
+            "document_id": None,
+            "indexed_at": None,
+        }
diff --git a/surfsense_backend/app/services/google_drive/kb_sync_service.py b/surfsense_backend/app/services/google_drive/kb_sync_service.py
index 92a39f7b9d..0a8eb47a65 100644
--- a/surfsense_backend/app/services/google_drive/kb_sync_service.py
+++ b/surfsense_backend/app/services/google_drive/kb_sync_service.py
@@ -4,7 +4,6 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.db import Document, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -75,6 +74,8 @@ async def sync_after_create(
                 )
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
diff --git a/surfsense_backend/app/services/google_drive/tool_metadata_service.py b/surfsense_backend/app/services/google_drive/tool_metadata_service.py
index 221bee14ad..0f654bc78e 100644
--- a/surfsense_backend/app/services/google_drive/tool_metadata_service.py
+++ b/surfsense_backend/app/services/google_drive/tool_metadata_service.py
@@ -13,7 +13,7 @@
     SearchSourceConnector,
     SearchSourceConnectorType,
 )
-from app.utils.google_credentials import build_composio_credentials
+from app.services.composio_service import ComposioService
 
 logger = logging.getLogger(__name__)
 
@@ -67,6 +67,42 @@ class GoogleDriveToolMetadataService:
     def __init__(self, db_session: AsyncSession):
         self._db_session = db_session
 
+    def _is_composio_connector(self, connector: SearchSourceConnector) -> bool:
+        return (
+            connector.connector_type
+            == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
+        )
+
+    def _get_composio_connected_account_id(
+        self, connector: SearchSourceConnector
+    ) -> str:
+        cca_id = connector.config.get("composio_connected_account_id")
+        if not cca_id:
+            raise ValueError("Composio connected_account_id not found")
+        return cca_id
+
+    async def _execute_composio_drive_tool(
+        self,
+        connector: SearchSourceConnector,
+        tool_name: str,
+        params: dict,
+    ) -> tuple[dict | list | None, str | None]:
+        result = await ComposioService().execute_tool(
+            connected_account_id=self._get_composio_connected_account_id(connector),
+            tool_name=tool_name,
+            params=params,
+            entity_id=f"surfsense_{connector.user_id}",
+        )
+        if not result.get("success"):
+            return None, result.get("error", "Unknown Composio Drive error")
+        data = result.get("data")
+        if isinstance(data, dict):
+            inner = data.get("data", data)
+            if isinstance(inner, dict):
+                return inner.get("response_data", inner), None
+            return inner, None
+        return data, None
+
     async def get_creation_context(self, search_space_id: int, user_id: str) -> dict:
         accounts = await self._get_google_drive_accounts(search_space_id, user_id)
 
@@ -200,19 +236,21 @@ async def _check_account_health(self, connector_id: int) -> bool:
             if not connector:
                 return True
 
-            pre_built_creds = None
-            if (
-                connector.connector_type
-                == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
-            ):
-                cca_id = connector.config.get("composio_connected_account_id")
-                if cca_id:
-                    pre_built_creds = build_composio_credentials(cca_id)
+            if self._is_composio_connector(connector):
+                _data, error = await self._execute_composio_drive_tool(
+                    connector,
+                    "GOOGLEDRIVE_LIST_FILES",
+                    {
+                        "q": "trashed = false",
+                        "page_size": 1,
+                        "fields": "files(id)",
+                    },
+                )
+                return bool(error)
 
             client = GoogleDriveClient(
                 session=self._db_session,
                 connector_id=connector_id,
-                credentials=pre_built_creds,
             )
             await client.list_files(
                 query="trashed = false", page_size=1, fields="files(id)"
@@ -274,19 +312,39 @@ async def _get_parent_folders_by_account(
                     parent_folders[connector_id] = []
                     continue
 
-                pre_built_creds = None
-                if (
-                    connector.connector_type
-                    == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR
-                ):
-                    cca_id = connector.config.get("composio_connected_account_id")
-                    if cca_id:
-                        pre_built_creds = build_composio_credentials(cca_id)
+                if self._is_composio_connector(connector):
+                    data, error = await self._execute_composio_drive_tool(
+                        connector,
+                        "GOOGLEDRIVE_LIST_FILES",
+                        {
+                            "q": "mimeType = 'application/vnd.google-apps.folder' and trashed = false and 'root' in parents",
+                            "fields": "files(id,name)",
+                            "page_size": 50,
+                        },
+                    )
+                    if error:
+                        logger.warning(
+                            "Failed to list folders for connector %s: %s",
+                            connector_id,
+                            error,
+                        )
+                        parent_folders[connector_id] = []
+                        continue
+                    folders = []
+                    if isinstance(data, dict):
+                        folders = data.get("files", [])
+                    elif isinstance(data, list):
+                        folders = data
+                    parent_folders[connector_id] = [
+                        {"folder_id": f["id"], "name": f["name"]}
+                        for f in folders
+                        if f.get("id") and f.get("name")
+                    ]
+                    continue
 
                 client = GoogleDriveClient(
                     session=self._db_session,
                     connector_id=connector_id,
-                    credentials=pre_built_creds,
                 )
 
                 folders, _, error = await client.list_files(
diff --git a/surfsense_backend/app/services/image_gen_router_service.py b/surfsense_backend/app/services/image_gen_router_service.py
index f45a6ab636..b4de2a0bf9 100644
--- a/surfsense_backend/app/services/image_gen_router_service.py
+++ b/surfsense_backend/app/services/image_gen_router_service.py
@@ -20,6 +20,8 @@
 from litellm import Router
 from litellm.utils import ImageResponse
 
+from app.services.provider_api_base import resolve_api_base
+
 logger = logging.getLogger(__name__)
 
 # Special ID for Auto mode - uses router for load balancing
@@ -152,12 +154,12 @@ def _config_to_deployment(cls, config: dict) -> dict | None:
                 return None
 
             # Build model string
+            provider = config.get("provider", "").upper()
             if config.get("custom_provider"):
-                model_string = f"{config['custom_provider']}/{config['model_name']}"
+                provider_prefix = config["custom_provider"]
             else:
-                provider = config.get("provider", "").upper()
                 provider_prefix = IMAGE_GEN_PROVIDER_MAP.get(provider, provider.lower())
-                model_string = f"{provider_prefix}/{config['model_name']}"
+            model_string = f"{provider_prefix}/{config['model_name']}"
 
             # Build litellm params
             litellm_params: dict[str, Any] = {
@@ -165,9 +167,16 @@ def _config_to_deployment(cls, config: dict) -> dict | None:
                 "api_key": config.get("api_key"),
             }
 
-            # Add optional api_base
-            if config.get("api_base"):
-                litellm_params["api_base"] = config["api_base"]
+            # Resolve ``api_base`` so deployments don't silently inherit
+            # ``AZURE_OPENAI_ENDPOINT`` / ``OPENAI_API_BASE`` and 404 against
+            # the wrong provider (see ``provider_api_base`` docstring).
+            api_base = resolve_api_base(
+                provider=provider,
+                provider_prefix=provider_prefix,
+                config_api_base=config.get("api_base"),
+            )
+            if api_base:
+                litellm_params["api_base"] = api_base
 
             # Add api_version (required for Azure)
             if config.get("api_version"):
diff --git a/surfsense_backend/app/services/jira/kb_sync_service.py b/surfsense_backend/app/services/jira/kb_sync_service.py
index 4d2a66e52d..8e88bee817 100644
--- a/surfsense_backend/app/services/jira/kb_sync_service.py
+++ b/surfsense_backend/app/services/jira/kb_sync_service.py
@@ -6,7 +6,6 @@
 
 from app.connectors.jira_history import JiraHistoryConnector
 from app.db import Document, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -75,6 +74,8 @@ async def sync_after_create(
             if dup:
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
@@ -190,6 +191,8 @@ async def sync_after_update(
             state = formatted.get("status", "Unknown")
             comment_count = len(formatted.get("comments", []))
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session, user_id, search_space_id, disable_streaming=True
             )
diff --git a/surfsense_backend/app/services/linear/kb_sync_service.py b/surfsense_backend/app/services/linear/kb_sync_service.py
index dab42af558..4712276024 100644
--- a/surfsense_backend/app/services/linear/kb_sync_service.py
+++ b/surfsense_backend/app/services/linear/kb_sync_service.py
@@ -5,7 +5,6 @@
 
 from app.connectors.linear_connector import LinearConnector
 from app.db import Document, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -85,6 +84,8 @@ async def sync_after_create(
                 )
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
@@ -226,6 +227,8 @@ async def sync_after_update(
             comment_count = len(formatted_issue.get("comments", []))
             formatted_issue.get("description", "")
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session, user_id, search_space_id, disable_streaming=True
             )
diff --git a/surfsense_backend/app/services/llm_router_service.py b/surfsense_backend/app/services/llm_router_service.py
index 63d8d10b9b..d220aa3466 100644
--- a/surfsense_backend/app/services/llm_router_service.py
+++ b/surfsense_backend/app/services/llm_router_service.py
@@ -28,6 +28,7 @@
     BadRequestError as LiteLLMBadRequestError,
     ContextWindowExceededError,
 )
+from pydantic import Field
 
 from app.utils.perf import get_perf_logger
 
@@ -49,6 +50,49 @@ def _is_context_overflow_error(exc: LiteLLMBadRequestError) -> bool:
     return bool(_CONTEXT_OVERFLOW_PATTERNS.search(str(exc)))
 
 
+_UNIVERSAL_CONTENT_TYPES = {
+    "text",
+    "image_url",
+    "input_audio",
+    "refusal",
+    "audio",
+    "file",
+}
+
+
+def _sanitize_content(content: Any) -> Any:
+    """Normalise a LangChain message ``content`` field so it is safe for any
+    downstream provider (Azure, OpenAI, OpenRouter, etc.).
+
+    * Strips provider-specific block types (e.g. ``thinking`` from reasoning models).
+    * Removes text blocks with blank text (Bedrock rejects ``{"type":"text","text":""}``)
+    * Converts bare strings inside a list to ``{"type": "text", "text": ...}`` objects
+      (Azure rejects raw strings in a content array).
+    * Collapses a single-text-block list to a plain string for maximum compatibility.
+    """
+    if not isinstance(content, list):
+        return content
+
+    filtered: list[dict] = []
+    for block in content:
+        if isinstance(block, str):
+            if block:
+                filtered.append({"type": "text", "text": block})
+        elif isinstance(block, dict):
+            block_type = block.get("type", "text")
+            if block_type not in _UNIVERSAL_CONTENT_TYPES:
+                continue
+            if block_type == "text" and not block.get("text"):
+                continue
+            filtered.append(block)
+
+    if not filtered:
+        return ""
+    if len(filtered) == 1 and filtered[0].get("type") == "text":
+        return filtered[0].get("text", "")
+    return filtered
+
+
 # Special ID for Auto mode - uses router for load balancing
 AUTO_MODE_ID = 0
 
@@ -90,6 +134,16 @@ def _is_context_overflow_error(exc: LiteLLMBadRequestError) -> bool:
 }
 
 
+# ``PROVIDER_DEFAULT_API_BASE`` and ``PROVIDER_KEY_DEFAULT_API_BASE`` were
+# hoisted to ``app.services.provider_api_base`` so vision and image-gen
+# call sites can share the exact same defense (OpenRouter / Groq / etc.
+# 404-ing against an inherited Azure endpoint). Re-exported here for
+# backward compatibility with any external import.
+from app.services.provider_api_base import (  # noqa: E402
+    resolve_api_base,
+)
+
+
 class LLMRouterService:
     """
     Singleton service for managing LiteLLM Router.
@@ -103,6 +157,7 @@ class LLMRouterService:
     _model_list: list[dict] = []
     _router_settings: dict = {}
     _initialized: bool = False
+    _premium_model_strings: set[str] = set()
 
     def __new__(cls):
         if cls._instance is None:
@@ -125,6 +180,12 @@ def initialize(
         """
         Initialize the router with global LLM configurations.
 
+        Configs with ``router_pool_eligible=False`` are skipped so that
+        dynamic OpenRouter entries stay out of the shared router pool used
+        by title-gen / sub-agent ``model="auto"`` flows. Those dynamic
+        entries are still available for user-facing Auto-mode thread pinning
+        via ``auto_model_pin_service``.
+
         Args:
             global_configs: List of global LLM config dictionaries from YAML
             router_settings: Optional router settings (routing_strategy, num_retries, etc.)
@@ -135,19 +196,34 @@ def initialize(
             logger.debug("LLM Router already initialized, skipping")
             return
 
-        # Build model list from global configs
         model_list = []
+        premium_models: set[str] = set()
         for config in global_configs:
+            if config.get("router_pool_eligible") is False:
+                continue
             deployment = cls._config_to_deployment(config)
             if deployment:
                 model_list.append(deployment)
+                if config.get("billing_tier") == "premium":
+                    params = deployment["litellm_params"]
+                    model_string = params["model"]
+                    premium_models.add(model_string)
+                    base = params.get("base_model") or config.get("model_name", "")
+                    if base and base != model_string:
+                        premium_models.add(base)
 
         if not model_list:
             logger.warning("No valid LLM configs found for router initialization")
             return
 
         instance._model_list = model_list
+        instance._premium_model_strings = premium_models
         instance._router_settings = router_settings or {}
+        logger.info(
+            "Router pool: %d deployments, premium model strings: %s",
+            len(model_list),
+            sorted(premium_models),
+        )
 
         # Default router settings optimized for rate limit handling
         default_settings = {
@@ -167,6 +243,16 @@ def initialize(
         # hits ContextWindowExceededError.
         full_model_list, ctx_fallbacks = cls._build_context_fallback_groups(model_list)
 
+        # Build a general-purpose fallback list so NotFound/timeout/rate-limit
+        # style failures on one deployment don't bubble up as hard errors —
+        # the router retries with a sibling deployment in ``auto-large``.
+        # ``auto-large`` is the large-context subset of ``auto``; if it is
+        # empty we fall back to ``auto`` itself so the router at least picks a
+        # different deployment in the same group.
+        fallbacks: list[dict[str, list[str]]] | None = None
+        if ctx_fallbacks:
+            fallbacks = [{"auto": ["auto-large"]}]
+
         try:
             router_kwargs: dict[str, Any] = {
                 "model_list": full_model_list,
@@ -180,20 +266,88 @@ def initialize(
             }
             if ctx_fallbacks:
                 router_kwargs["context_window_fallbacks"] = ctx_fallbacks
+            if fallbacks:
+                router_kwargs["fallbacks"] = fallbacks
 
             instance._router = Router(**router_kwargs)
             instance._initialized = True
+
+            global _cached_context_profile, _cached_context_profile_computed
+            _cached_context_profile = None
+            _cached_context_profile_computed = False
+            _router_instance_cache.clear()
+
             logger.info(
                 "LLM Router initialized with %d deployments, "
-                "strategy: %s, context_window_fallbacks: %s",
+                "strategy: %s, context_window_fallbacks: %s, fallbacks: %s",
                 len(model_list),
                 final_settings.get("routing_strategy"),
                 ctx_fallbacks or "none",
+                fallbacks or "none",
             )
         except Exception as e:
             logger.error(f"Failed to initialize LLM Router: {e}")
             instance._router = None
 
+    @classmethod
+    def rebuild(
+        cls,
+        global_configs: list[dict],
+        router_settings: dict | None = None,
+    ) -> None:
+        """Reset the router and re-run ``initialize`` with fresh configs.
+
+        ``initialize`` short-circuits once it has run to avoid re-creating the
+        LiteLLM Router on every request; ``rebuild`` deliberately clears
+        ``_initialized`` so a caller (e.g. background OpenRouter refresh)
+        can force the pool to be rebuilt after catalogue changes.
+        """
+        instance = cls.get_instance()
+        instance._initialized = False
+        instance._router = None
+        instance._model_list = []
+        instance._premium_model_strings = set()
+        cls.initialize(global_configs, router_settings)
+
+    @classmethod
+    def is_premium_model(cls, model_string: str) -> bool:
+        """Return True if *model_string* belongs to a premium-tier deployment
+        in the LiteLLM router pool.
+
+        Scope: only covers configs with ``router_pool_eligible`` truthy. That
+        includes static YAML premium configs AND dynamic OpenRouter *premium*
+        entries (which opt in at generation time). Dynamic OpenRouter *free*
+        entries are deliberately kept out of the router pool — OpenRouter
+        enforces free-tier limits globally per account, so per-deployment
+        router accounting can't represent them correctly — and therefore
+        return ``False`` here, which matches their ``billing_tier="free"``
+        (no premium quota).
+
+        For per-request premium checks on an arbitrary config (static or
+        dynamic, pool or non-pool), read ``agent_config.is_premium`` instead;
+        that reflects the per-config ``billing_tier`` directly and is what
+        user-facing Auto-mode thread pinning uses to bill correctly.
+        """
+        instance = cls.get_instance()
+        return model_string in instance._premium_model_strings
+
+    @classmethod
+    def compute_premium_tokens(cls, calls: list) -> int:
+        """Sum ``total_tokens`` for calls whose model is premium."""
+        instance = cls.get_instance()
+        total = sum(
+            c.total_tokens for c in calls if c.model in instance._premium_model_strings
+        )
+        if calls:
+            call_models = [c.model for c in calls]
+            logger.info(
+                "[premium_tokens] call models=%s, premium_set=%s, result=%d",
+                call_models,
+                sorted(instance._premium_model_strings),
+                total,
+            )
+        return total
+
     @classmethod
     def _build_context_fallback_groups(
         cls, model_list: list[dict]
@@ -267,10 +421,11 @@ def _config_to_deployment(cls, config: dict) -> dict | None:
                 return None
 
             # Build model string
+            provider = config.get("provider", "").upper()
             if config.get("custom_provider"):
-                model_string = f"{config['custom_provider']}/{config['model_name']}"
+                provider_prefix = config["custom_provider"]
+                model_string = f"{provider_prefix}/{config['model_name']}"
             else:
-                provider = config.get("provider", "").upper()
                 provider_prefix = PROVIDER_MAP.get(provider, provider.lower())
                 model_string = f"{provider_prefix}/{config['model_name']}"
 
@@ -280,9 +435,19 @@ def _config_to_deployment(cls, config: dict) -> dict | None:
                 "api_key": config.get("api_key"),
             }
 
-            # Add optional api_base
-            if config.get("api_base"):
-                litellm_params["api_base"] = config["api_base"]
+            # Resolve ``api_base``. Config value wins; otherwise apply a
+            # provider-aware default so the deployment does not silently
+            # inherit unrelated env vars (e.g. ``AZURE_API_BASE``) and route
+            # requests to the wrong endpoint.  See ``provider_api_base``
+            # docstring for the motivating bug (OpenRouter models 404-ing
+            # against an Azure endpoint).
+            api_base = resolve_api_base(
+                provider=provider,
+                provider_prefix=provider_prefix,
+                config_api_base=config.get("api_base"),
+            )
+            if api_base:
+                litellm_params["api_base"] = api_base
 
             # Add any additional litellm parameters
             if config.get("litellm_params"):
@@ -424,6 +589,11 @@ class ChatLiteLLMRouter(BaseChatModel):
     # Public attributes that Pydantic will manage
     model: str = "auto"
     streaming: bool = True
+    # Static kwargs that flow through to ``litellm.completion(...)`` on every
+    # invocation (e.g. ``cache_control_injection_points`` set by
+    # ``apply_litellm_prompt_caching``). Per-call ``**kwargs`` from
+    # ``invoke()`` still take precedence — see ``_generate``/``_astream``.
+    model_kwargs: dict[str, Any] = Field(default_factory=dict)
 
     # Bound tools and tool choice for tool calling
     _bound_tools: list[dict] | None = None
@@ -749,13 +919,16 @@ def bind_tools(
                     logger.warning(f"Failed to convert tool {tool}: {e}")
                     continue
 
-        # Create a new instance with tools bound
+        # Create a new instance with tools bound. Carry through ``model_kwargs``
+        # so static settings (e.g. cache_control_injection_points) survive the
+        # bind_tools rebuild.
         return ChatLiteLLMRouter(
             router=self._router,
             bound_tools=formatted_tools if formatted_tools else None,
             tool_choice=tool_choice,
             model=self.model,
             streaming=self.streaming,
+            model_kwargs=dict(self.model_kwargs),
             **kwargs,
         )
 
@@ -780,8 +953,10 @@ def _generate(
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
 
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
@@ -820,7 +995,9 @@ def _generate(
         )
 
         # Convert response to ChatResult with potential tool calls
-        message = self._convert_response_to_message(response.choices[0].message)
+        message = self._convert_response_to_message(
+            response.choices[0].message, response=response
+        )
         generation = ChatGeneration(message=message)
 
         return ChatResult(generations=[generation])
@@ -846,8 +1023,10 @@ async def _agenerate(
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
 
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
@@ -886,7 +1065,9 @@ async def _agenerate(
         )
 
         # Convert response to ChatResult with potential tool calls
-        message = self._convert_response_to_message(response.choices[0].message)
+        message = self._convert_response_to_message(
+            response.choices[0].message, response=response
+        )
         generation = ChatGeneration(message=message)
 
         return ChatResult(generations=[generation])
@@ -907,8 +1088,10 @@ def _stream(
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
 
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
@@ -957,8 +1140,10 @@ async def _astream(
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
 
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
@@ -970,6 +1155,7 @@ async def _astream(
                 messages=formatted_messages,
                 stop=stop,
                 stream=True,
+                stream_options={"include_usage": True},
                 **call_kwargs,
             )
         except ContextWindowExceededError as e:
@@ -1036,10 +1222,12 @@ def _convert_messages(self, messages: list[BaseMessage]) -> list[dict]:
                 result.append({"role": "user", "content": msg.content})
             elif isinstance(msg, AIMsg):
                 ai_msg: dict[str, Any] = {"role": "assistant"}
-                if msg.content:
-                    ai_msg["content"] = msg.content
-                # Handle tool calls
-                if hasattr(msg, "tool_calls") and msg.tool_calls:
+                has_tool_calls = hasattr(msg, "tool_calls") and msg.tool_calls
+
+                sanitized = _sanitize_content(msg.content) if msg.content else ""
+                ai_msg["content"] = sanitized if sanitized else ""
+
+                if has_tool_calls:
                     ai_msg["tool_calls"] = [
                         {
                             "id": tc.get("id", ""),
@@ -1075,7 +1263,9 @@ def _convert_messages(self, messages: list[BaseMessage]) -> list[dict]:
 
         return result
 
-    def _convert_response_to_message(self, response_message: Any) -> AIMessage:
+    def _convert_response_to_message(
+        self, response_message: Any, response: Any = None
+    ) -> AIMessage:
         """Convert a LiteLLM response message to a LangChain AIMessage."""
         import json
 
@@ -1098,9 +1288,22 @@ def _convert_response_to_message(self, response_message: Any) -> AIMessage:
                         tool_call["args"] = tc.function.arguments
                 tool_calls.append(tool_call)
 
+        extra_kwargs: dict[str, Any] = {}
+        if response:
+            usage = getattr(response, "usage", None)
+            if usage:
+                extra_kwargs["usage_metadata"] = {
+                    "input_tokens": getattr(usage, "prompt_tokens", 0) or 0,
+                    "output_tokens": getattr(usage, "completion_tokens", 0) or 0,
+                    "total_tokens": getattr(usage, "total_tokens", 0) or 0,
+                }
+            extra_kwargs["response_metadata"] = {
+                "model_name": getattr(response, "model", "unknown"),
+            }
+
         if tool_calls:
-            return AIMessage(content=content, tool_calls=tool_calls)
-        return AIMessage(content=content)
+            return AIMessage(content=content, tool_calls=tool_calls, **extra_kwargs)
+        return AIMessage(content=content, **extra_kwargs)
 
     def _convert_delta_to_chunk(self, delta: Any) -> AIMessageChunk | None:
         """Convert a streaming delta to an AIMessageChunk."""
diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py
index 723b176076..ade202c727 100644
--- a/surfsense_backend/app/services/llm_service.py
+++ b/surfsense_backend/app/services/llm_service.py
@@ -1,3 +1,4 @@
+import asyncio
 import logging
 
 import litellm
@@ -15,6 +16,8 @@
     get_auto_mode_llm,
     is_auto_mode,
 )
+from app.services.provider_api_base import resolve_api_base
+from app.services.token_tracking_service import token_tracker
 
 # Configure litellm to automatically drop unsupported parameters
 litellm.drop_params = True
@@ -22,13 +25,47 @@
 # Memory controls: prevent unbounded internal accumulation
 litellm.telemetry = False
 litellm.cache = None
-litellm.success_callback = []
 litellm.failure_callback = []
 litellm.input_callback = []
 
+litellm.callbacks = [token_tracker]
+
 logger = logging.getLogger(__name__)
 
 
+# Providers that require an interactive OAuth / device-flow login before
+# issuing any completion. LiteLLM implements these with blocking sync polling
+# (requests + time.sleep), which would freeze the FastAPI event loop if
+# invoked from validation. They are never usable from a headless backend,
+# so we reject them at the edge.
+_INTERACTIVE_AUTH_PROVIDERS: frozenset[str] = frozenset(
+    {
+        "github_copilot",
+        "github-copilot",
+        "githubcopilot",
+        "copilot",
+    }
+)
+
+# Hard upper bound for a single validation call. Must exceed the ChatLiteLLM
+# request timeout (30s) by a small margin so a well-behaved provider never
+# trips the watchdog, while any pathological/blocking provider is killed.
+_VALIDATION_TIMEOUT_SECONDS: float = 35.0
+
+
+def _is_interactive_auth_provider(
+    provider: str | None, custom_provider: str | None
+) -> bool:
+    """Return True if the given provider triggers interactive OAuth in LiteLLM."""
+    for raw in (custom_provider, provider):
+        if not raw:
+            continue
+        normalized = raw.strip().lower().replace(" ", "_")
+        if normalized in _INTERACTIVE_AUTH_PROVIDERS:
+            return True
+    return False
+
+
 class LLMRole:
     AGENT = "agent"  # For agent/chat operations
     DOCUMENT_SUMMARY = "document_summary"  # For document summarization
@@ -90,6 +127,25 @@ async def validate_llm_config(
         - is_valid: True if config works, False otherwise
         - error_message: Empty string if valid, error description if invalid
     """
+    # Reject providers that require interactive OAuth/device-flow auth.
+    # LiteLLM's github_copilot provider (and similar) uses a blocking sync
+    # Authenticator that polls GitHub for up to several minutes and prints a
+    # device code to stdout. Running it on the FastAPI event loop will freeze
+    # the entire backend, so we refuse them up front.
+    if _is_interactive_auth_provider(provider, custom_provider):
+        msg = (
+            "Provider requires interactive OAuth/device-flow authentication "
+            "(e.g. github_copilot) and cannot be used in a hosted backend. "
+            "Please choose a provider that authenticates via API key."
+        )
+        logger.warning(
+            "Rejected LLM config validation for interactive-auth provider "
+            "(provider=%r, custom_provider=%r)",
+            provider,
+            custom_provider,
+        )
+        return False, msg
+
     try:
         # Build the model string for litellm
         if custom_provider:
@@ -148,11 +204,34 @@ async def validate_llm_config(
         if litellm_params:
             litellm_kwargs.update(litellm_params)
 
-        llm = ChatLiteLLM(**litellm_kwargs)
+        from app.agents.new_chat.llm_config import SanitizedChatLiteLLM
 
-        # Make a simple test call
+        llm = SanitizedChatLiteLLM(**litellm_kwargs)
+
+        # Run the test call in a worker thread with a hard timeout. Some
+        # LiteLLM providers have synchronous blocking code paths (e.g. OAuth
+        # authenticators that call time.sleep and requests.post) that would
+        # otherwise freeze the asyncio event loop. Offloading to a thread and
+        # bounding the wait keeps the server responsive even if a provider
+        # misbehaves.
         test_message = HumanMessage(content="Hello")
-        response = await llm.ainvoke([test_message])
+        try:
+            response = await asyncio.wait_for(
+                asyncio.to_thread(llm.invoke, [test_message]),
+                timeout=_VALIDATION_TIMEOUT_SECONDS,
+            )
+        except TimeoutError:
+            logger.warning(
+                "LLM config validation timed out after %ss for model: %s",
+                _VALIDATION_TIMEOUT_SECONDS,
+                model_string,
+            )
+            return (
+                False,
+                f"Validation timed out after {int(_VALIDATION_TIMEOUT_SECONDS)}s. "
+                "The provider is unreachable or requires interactive "
+                "authentication that is not supported by the backend.",
+            )
 
         # If we got here without exception, the config is valid
         if response and response.content:
@@ -300,7 +379,9 @@ async def get_search_space_llm_instance(
             if disable_streaming:
                 litellm_kwargs["disable_streaming"] = True
 
-            return ChatLiteLLM(**litellm_kwargs)
+            from app.agents.new_chat.llm_config import SanitizedChatLiteLLM
+
+            return SanitizedChatLiteLLM(**litellm_kwargs)
 
         # Get the LLM configuration from database (NewLLMConfig)
         result = await session.execute(
@@ -377,7 +458,9 @@ async def get_search_space_llm_instance(
         if disable_streaming:
             litellm_kwargs["disable_streaming"] = True
 
-        return ChatLiteLLM(**litellm_kwargs)
+        from app.agents.new_chat.llm_config import SanitizedChatLiteLLM
+
+        return SanitizedChatLiteLLM(**litellm_kwargs)
 
     except Exception as e:
         logger.error(
@@ -414,8 +497,14 @@ async def get_vision_llm(
     - Auto mode (ID 0): VisionLLMRouterService
     - Global (negative ID): YAML configs
     - DB (positive ID): VisionLLMConfig table
+
+    Premium global configs are wrapped in :class:`QuotaCheckedVisionLLM`
+    so each ``ainvoke`` debits the search-space owner's premium credit
+    pool. User-owned BYOK configs and free global configs are returned
+    unwrapped — they don't consume premium credit (issue M).
     """
     from app.db import VisionLLMConfig
+    from app.services.quota_checked_vision_llm import QuotaCheckedVisionLLM
     from app.services.vision_llm_router_service import (
         VISION_PROVIDER_MAP,
         VisionLLMRouterService,
@@ -437,6 +526,8 @@ async def get_vision_llm(
             logger.error(f"No vision LLM configured for search space {search_space_id}")
             return None
 
+        owner_user_id = search_space.user_id
+
         if is_vision_auto_mode(config_id):
             if not VisionLLMRouterService.is_initialized():
                 logger.error(
@@ -444,6 +535,13 @@ async def get_vision_llm(
                 )
                 return None
             try:
+                # Auto mode is currently treated as free at the wrapper
+                # level — the underlying router can dispatch to either
+                # premium or free YAML configs but routing decisions are
+                # opaque. If/when we want to bill Auto-routed vision
+                # calls we'd need to thread the resolved deployment's
+                # billing_tier back from the router. For now we keep
+                # parity with chat Auto, which also doesn't pre-classify.
                 return ChatLiteLLMRouter(
                     router=VisionLLMRouterService.get_router(),
                     streaming=True,
@@ -459,27 +557,46 @@ async def get_vision_llm(
                 return None
 
             if global_cfg.get("custom_provider"):
-                model_string = (
-                    f"{global_cfg['custom_provider']}/{global_cfg['model_name']}"
-                )
+                provider_prefix = global_cfg["custom_provider"]
+                model_string = f"{provider_prefix}/{global_cfg['model_name']}"
             else:
-                prefix = VISION_PROVIDER_MAP.get(
+                provider_prefix = VISION_PROVIDER_MAP.get(
                     global_cfg["provider"].upper(),
                     global_cfg["provider"].lower(),
                 )
-                model_string = f"{prefix}/{global_cfg['model_name']}"
+                model_string = f"{provider_prefix}/{global_cfg['model_name']}"
 
             litellm_kwargs = {
                 "model": model_string,
                 "api_key": global_cfg["api_key"],
             }
-            if global_cfg.get("api_base"):
-                litellm_kwargs["api_base"] = global_cfg["api_base"]
+            api_base = resolve_api_base(
+                provider=global_cfg.get("provider"),
+                provider_prefix=provider_prefix,
+                config_api_base=global_cfg.get("api_base"),
+            )
+            if api_base:
+                litellm_kwargs["api_base"] = api_base
             if global_cfg.get("litellm_params"):
                 litellm_kwargs.update(global_cfg["litellm_params"])
 
-            return ChatLiteLLM(**litellm_kwargs)
+            from app.agents.new_chat.llm_config import SanitizedChatLiteLLM
 
+            inner_llm = SanitizedChatLiteLLM(**litellm_kwargs)
+
+            billing_tier = str(global_cfg.get("billing_tier", "free")).lower()
+            if billing_tier == "premium":
+                return QuotaCheckedVisionLLM(
+                    inner_llm,
+                    user_id=owner_user_id,
+                    search_space_id=search_space_id,
+                    billing_tier=billing_tier,
+                    base_model=model_string,
+                    quota_reserve_tokens=global_cfg.get("quota_reserve_tokens"),
+                )
+            return inner_llm
+
+        # User-owned (positive ID) BYOK configs — always free.
         result = await session.execute(
             select(VisionLLMConfig).where(
                 VisionLLMConfig.id == config_id,
@@ -494,24 +611,32 @@ async def get_vision_llm(
             return None
 
         if vision_cfg.custom_provider:
-            model_string = f"{vision_cfg.custom_provider}/{vision_cfg.model_name}"
+            provider_prefix = vision_cfg.custom_provider
+            model_string = f"{provider_prefix}/{vision_cfg.model_name}"
         else:
-            prefix = VISION_PROVIDER_MAP.get(
+            provider_prefix = VISION_PROVIDER_MAP.get(
                 vision_cfg.provider.value.upper(),
                 vision_cfg.provider.value.lower(),
             )
-            model_string = f"{prefix}/{vision_cfg.model_name}"
+            model_string = f"{provider_prefix}/{vision_cfg.model_name}"
 
         litellm_kwargs = {
             "model": model_string,
             "api_key": vision_cfg.api_key,
         }
-        if vision_cfg.api_base:
-            litellm_kwargs["api_base"] = vision_cfg.api_base
+        api_base = resolve_api_base(
+            provider=vision_cfg.provider.value,
+            provider_prefix=provider_prefix,
+            config_api_base=vision_cfg.api_base,
+        )
+        if api_base:
+            litellm_kwargs["api_base"] = api_base
         if vision_cfg.litellm_params:
             litellm_kwargs.update(vision_cfg.litellm_params)
 
-        return ChatLiteLLM(**litellm_kwargs)
+        from app.agents.new_chat.llm_config import SanitizedChatLiteLLM
+
+        return SanitizedChatLiteLLM(**litellm_kwargs)
 
     except Exception as e:
         logger.error(
diff --git a/surfsense_backend/app/services/mcp_oauth/__init__.py b/surfsense_backend/app/services/mcp_oauth/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/services/mcp_oauth/discovery.py b/surfsense_backend/app/services/mcp_oauth/discovery.py
new file mode 100644
index 0000000000..dc21443bc2
--- /dev/null
+++ b/surfsense_backend/app/services/mcp_oauth/discovery.py
@@ -0,0 +1,123 @@
+"""MCP OAuth 2.1 metadata discovery, Dynamic Client Registration, and token exchange."""
+
+from __future__ import annotations
+
+import base64
+import logging
+from urllib.parse import urlparse
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+async def discover_oauth_metadata(
+    mcp_url: str,
+    *,
+    origin_override: str | None = None,
+    timeout: float = 15.0,
+) -> dict:
+    """Fetch OAuth 2.1 metadata from the MCP server's well-known endpoint.
+
+    Per the MCP spec the discovery document lives at the *origin* of the
+    MCP server URL.  ``origin_override`` can be used when the OAuth server
+    lives on a different domain (e.g. Airtable: MCP at ``mcp.airtable.com``,
+    OAuth at ``airtable.com``).
+    """
+    if origin_override:
+        origin = origin_override.rstrip("/")
+    else:
+        parsed = urlparse(mcp_url)
+        origin = f"{parsed.scheme}://{parsed.netloc}"
+    discovery_url = f"{origin}/.well-known/oauth-authorization-server"
+
+    async with httpx.AsyncClient(follow_redirects=True) as client:
+        resp = await client.get(discovery_url, timeout=timeout)
+        resp.raise_for_status()
+        return resp.json()
+
+
+async def register_client(
+    registration_endpoint: str,
+    redirect_uri: str,
+    *,
+    client_name: str = "SurfSense",
+    timeout: float = 15.0,
+) -> dict:
+    """Perform Dynamic Client Registration (RFC 7591)."""
+    payload = {
+        "client_name": client_name,
+        "redirect_uris": [redirect_uri],
+        "grant_types": ["authorization_code", "refresh_token"],
+        "response_types": ["code"],
+        "token_endpoint_auth_method": "client_secret_basic",
+    }
+
+    async with httpx.AsyncClient(follow_redirects=True) as client:
+        resp = await client.post(
+            registration_endpoint,
+            json=payload,
+            timeout=timeout,
+        )
+        resp.raise_for_status()
+        return resp.json()
+
+
+async def exchange_code_for_tokens(
+    token_endpoint: str,
+    code: str,
+    redirect_uri: str,
+    client_id: str,
+    client_secret: str,
+    code_verifier: str,
+    *,
+    timeout: float = 30.0,
+) -> dict:
+    """Exchange an authorization code for access + refresh tokens."""
+    creds = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()
+
+    async with httpx.AsyncClient(follow_redirects=True) as client:
+        resp = await client.post(
+            token_endpoint,
+            data={
+                "grant_type": "authorization_code",
+                "code": code,
+                "redirect_uri": redirect_uri,
+                "code_verifier": code_verifier,
+            },
+            headers={
+                "Content-Type": "application/x-www-form-urlencoded",
+                "Authorization": f"Basic {creds}",
+            },
+            timeout=timeout,
+        )
+        resp.raise_for_status()
+        return resp.json()
+
+
+async def refresh_access_token(
+    token_endpoint: str,
+    refresh_token: str,
+    client_id: str,
+    client_secret: str,
+    *,
+    timeout: float = 30.0,
+) -> dict:
+    """Refresh an expired access token."""
+    creds = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()
+
+    async with httpx.AsyncClient(follow_redirects=True) as client:
+        resp = await client.post(
+            token_endpoint,
+            data={
+                "grant_type": "refresh_token",
+                "refresh_token": refresh_token,
+            },
+            headers={
+                "Content-Type": "application/x-www-form-urlencoded",
+                "Authorization": f"Basic {creds}",
+            },
+            timeout=timeout,
+        )
+        resp.raise_for_status()
+        return resp.json()
diff --git a/surfsense_backend/app/services/mcp_oauth/registry.py b/surfsense_backend/app/services/mcp_oauth/registry.py
new file mode 100644
index 0000000000..310c3f6e83
--- /dev/null
+++ b/surfsense_backend/app/services/mcp_oauth/registry.py
@@ -0,0 +1,218 @@
+"""Registry of MCP services with OAuth support.
+
+Each entry maps a URL-safe service key to its MCP server endpoint and
+authentication configuration.  Services with ``supports_dcr=True`` use
+RFC 7591 Dynamic Client Registration (the MCP server issues its own
+credentials); the rest use pre-configured credentials via env vars.
+
+``allowed_tools`` whitelists which MCP tools to expose to the agent.
+An empty list means "load every tool the server advertises" (used for
+user-managed generic MCP servers).  Service-specific entries should
+curate this list to keep the agent's tool count low and selection
+accuracy high.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+
+from app.db import SearchSourceConnectorType
+
+# Linear hosted MCP (https://linear.app/docs/mcp). Tool names are matched at
+# discovery time: names the server does not advertise are ignored.
+# See also https://github.com/linear/linear/issues/1049 for server-reported names.
+LINEAR_MCP_WRITE_TOOL_NAMES: frozenset[str] = frozenset({"save_issue"})
+LINEAR_MCP_READONLY_TOOL_NAMES: frozenset[str] = frozenset(
+    {
+        # Issues
+        "list_issues",
+        "get_issue",
+        "list_my_issues",
+        "list_issue_statuses",
+        "list_issue_labels",
+        "list_comments",
+        # People & teams
+        "list_users",
+        "get_user",
+        "list_teams",
+        "get_team",
+        # Projects & planning
+        "list_projects",
+        "get_project",
+        "list_project_labels",
+        "list_cycles",
+        # Documents
+        "list_documents",
+        "get_document",
+        # Misc read
+        "search_documentation",
+    }
+)
+LINEAR_MCP_TOOL_NAMES: frozenset[str] = (
+    LINEAR_MCP_READONLY_TOOL_NAMES | LINEAR_MCP_WRITE_TOOL_NAMES
+)
+_LINEAR_MCP_PREFIXED_NAME_RE = re.compile(r"^linear_\d+_(.+)$")
+
+
+def linear_mcp_original_tool_name(name: str) -> str | None:
+    """Map ``linear_<connector_id>_<tool>`` or bare MCP tool name to base name."""
+    m = _LINEAR_MCP_PREFIXED_NAME_RE.match(name)
+    if m:
+        base = m.group(1)
+        return base if base in LINEAR_MCP_TOOL_NAMES else None
+    if name in LINEAR_MCP_TOOL_NAMES:
+        return name
+    return None
+
+
+@dataclass(frozen=True)
+class MCPServiceConfig:
+    name: str
+    mcp_url: str
+    connector_type: str
+    supports_dcr: bool = True
+    oauth_discovery_origin: str | None = None
+    client_id_env: str | None = None
+    client_secret_env: str | None = None
+    scopes: list[str] = field(default_factory=list)
+    scope_param: str = "scope"
+    auth_endpoint_override: str | None = None
+    token_endpoint_override: str | None = None
+    allowed_tools: list[str] = field(default_factory=list)
+    readonly_tools: frozenset[str] = field(default_factory=frozenset)
+    account_metadata_keys: list[str] = field(default_factory=list)
+    """``connector.config`` keys exposed by ``get_connected_accounts``.
+
+    Only listed keys are returned to the LLM — tokens and secrets are
+    never included.  Every service should at least have its
+    ``display_name`` populated during OAuth; additional service-specific
+    fields (e.g. Jira ``cloud_id``) are listed here so the LLM can pass
+    them to action tools.
+    """
+
+
+MCP_SERVICES: dict[str, MCPServiceConfig] = {
+    "linear": MCPServiceConfig(
+        name="Linear",
+        mcp_url="https://mcp.linear.app/mcp",
+        connector_type="LINEAR_CONNECTOR",
+        allowed_tools=sorted(LINEAR_MCP_TOOL_NAMES),
+        readonly_tools=LINEAR_MCP_READONLY_TOOL_NAMES,
+        account_metadata_keys=["organization_name", "organization_url_key"],
+    ),
+    "jira": MCPServiceConfig(
+        name="Jira",
+        mcp_url="https://mcp.atlassian.com/v1/mcp",
+        connector_type="JIRA_CONNECTOR",
+        allowed_tools=[
+            "getAccessibleAtlassianResources",
+            "searchJiraIssuesUsingJql",
+            "getVisibleJiraProjects",
+            "getJiraProjectIssueTypesMetadata",
+            "createJiraIssue",
+            "editJiraIssue",
+        ],
+        readonly_tools=frozenset(
+            {
+                "getAccessibleAtlassianResources",
+                "searchJiraIssuesUsingJql",
+                "getVisibleJiraProjects",
+                "getJiraProjectIssueTypesMetadata",
+            }
+        ),
+        account_metadata_keys=["cloud_id", "site_name", "base_url"],
+    ),
+    "clickup": MCPServiceConfig(
+        name="ClickUp",
+        mcp_url="https://mcp.clickup.com/mcp",
+        connector_type="CLICKUP_CONNECTOR",
+        allowed_tools=[
+            "clickup_search",
+            "clickup_get_task",
+        ],
+        readonly_tools=frozenset({"clickup_search", "clickup_get_task"}),
+        account_metadata_keys=["workspace_id", "workspace_name"],
+    ),
+    "slack": MCPServiceConfig(
+        name="Slack",
+        mcp_url="https://mcp.slack.com/mcp",
+        connector_type="SLACK_CONNECTOR",
+        supports_dcr=False,
+        client_id_env="SLACK_CLIENT_ID",
+        client_secret_env="SLACK_CLIENT_SECRET",
+        auth_endpoint_override="https://slack.com/oauth/v2_user/authorize",
+        token_endpoint_override="https://slack.com/api/oauth.v2.user.access",
+        scopes=[
+            "search:read.public",
+            "search:read.private",
+            "search:read.mpim",
+            "search:read.im",
+            "channels:history",
+            "groups:history",
+            "mpim:history",
+            "im:history",
+        ],
+        allowed_tools=[
+            "slack_search_channels",
+            "slack_read_channel",
+            "slack_read_thread",
+        ],
+        readonly_tools=frozenset(
+            {"slack_search_channels", "slack_read_channel", "slack_read_thread"}
+        ),
+        # TODO: oauth.v2.user.access only returns team.id, not team.name.
+        # To populate team_name, either add "team:read" scope and call
+        # GET /api/team.info during OAuth callback, or switch to oauth.v2.access.
+        account_metadata_keys=["team_id", "team_name"],
+    ),
+    "airtable": MCPServiceConfig(
+        name="Airtable",
+        mcp_url="https://mcp.airtable.com/mcp",
+        connector_type="AIRTABLE_CONNECTOR",
+        supports_dcr=False,
+        oauth_discovery_origin="https://airtable.com",
+        client_id_env="AIRTABLE_CLIENT_ID",
+        client_secret_env="AIRTABLE_CLIENT_SECRET",
+        scopes=["data.records:read", "schema.bases:read"],
+        allowed_tools=[
+            "list_bases",
+            "list_tables_for_base",
+            "list_records_for_table",
+        ],
+        readonly_tools=frozenset(
+            {"list_bases", "list_tables_for_base", "list_records_for_table"}
+        ),
+        account_metadata_keys=["user_id", "user_email"],
+    ),
+}
+
+_CONNECTOR_TYPE_TO_SERVICE: dict[str, MCPServiceConfig] = {
+    svc.connector_type: svc for svc in MCP_SERVICES.values()
+}
+
+LIVE_CONNECTOR_TYPES: frozenset[SearchSourceConnectorType] = frozenset(
+    {
+        SearchSourceConnectorType.SLACK_CONNECTOR,
+        SearchSourceConnectorType.TEAMS_CONNECTOR,
+        SearchSourceConnectorType.LINEAR_CONNECTOR,
+        SearchSourceConnectorType.JIRA_CONNECTOR,
+        SearchSourceConnectorType.CLICKUP_CONNECTOR,
+        SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
+        SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+        SearchSourceConnectorType.AIRTABLE_CONNECTOR,
+        SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
+        SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
+        SearchSourceConnectorType.DISCORD_CONNECTOR,
+        SearchSourceConnectorType.LUMA_CONNECTOR,
+    }
+)
+
+
+def get_service(key: str) -> MCPServiceConfig | None:
+    return MCP_SERVICES.get(key)
+
+
+def get_service_by_connector_type(connector_type: str) -> MCPServiceConfig | None:
+    """Look up an MCP service config by its ``connector_type`` enum value."""
+    return _CONNECTOR_TYPE_TO_SERVICE.get(connector_type)
diff --git a/surfsense_backend/app/services/model_list_service.py b/surfsense_backend/app/services/model_list_service.py
index 2a81c2d529..33837a8a01 100644
--- a/surfsense_backend/app/services/model_list_service.py
+++ b/surfsense_backend/app/services/model_list_service.py
@@ -86,12 +86,34 @@ def _is_text_output_model(model: dict) -> bool:
     return output_mods == ["text"]
 
 
+def _supports_tool_calling(model: dict) -> bool:
+    """Return True if the model supports function/tool calling."""
+    supported = model.get("supported_parameters") or []
+    return "tools" in supported
+
+
+MIN_CONTEXT_LENGTH = 100_000
+
+
+def _has_sufficient_context(model: dict) -> bool:
+    """Return True if the model's context window is at least MIN_CONTEXT_LENGTH."""
+    ctx = model.get("context_length") or 0
+    return ctx >= MIN_CONTEXT_LENGTH
+
+
+def _is_allowed_model(model: dict) -> bool:
+    """Reuse the exclusion list from the OpenRouter integration service."""
+    from app.services.openrouter_integration_service import _is_allowed_model as _check
+
+    return _check(model)
+
+
 def _process_models(raw_models: list[dict]) -> list[dict]:
     """
     Transform raw OpenRouter model entries into a flat list of
     {value, label, provider, context_window} dicts.
 
-    Only text-output models are included (audio/image generators are skipped).
+    Only text-output models with tool-calling support are included.
 
     Each OpenRouter model is emitted once for OPENROUTER (full id) and,
     when the slug maps to a native provider, once more with just the
@@ -110,6 +132,15 @@ def _process_models(raw_models: list[dict]) -> list[dict]:
         if not _is_text_output_model(model):
             continue
 
+        if not _supports_tool_calling(model):
+            continue
+
+        if not _has_sufficient_context(model):
+            continue
+
+        if not _is_allowed_model(model):
+            continue
+
         provider_slug, model_name = model_id.split("/", 1)
         context_window = _format_context_length(context_length)
 
diff --git a/surfsense_backend/app/services/new_streaming_service.py b/surfsense_backend/app/services/new_streaming_service.py
index 52a215997e..ba0cb87539 100644
--- a/surfsense_backend/app/services/new_streaming_service.py
+++ b/surfsense_backend/app/services/new_streaming_service.py
@@ -456,6 +456,8 @@ def format_thinking_step(
         title: str,
         status: str = "in_progress",
         items: list[str] | None = None,
+        *,
+        metadata: dict[str, Any] | None = None,
     ) -> str:
         """
         Format a thinking step for chain-of-thought display (SurfSense specific).
@@ -469,15 +471,15 @@ def format_thinking_step(
         Returns:
             str: SSE formatted thinking step data part
         """
-        return self.format_data(
-            "thinking-step",
-            {
-                "id": step_id,
-                "title": title,
-                "status": status,
-                "items": items or [],
-            },
-        )
+        payload: dict[str, Any] = {
+            "id": step_id,
+            "title": title,
+            "status": status,
+            "items": items or [],
+        }
+        if metadata:
+            payload["metadata"] = metadata
+        return self.format_data("thinking-step", payload)
 
     def format_thread_title_update(self, thread_id: int, title: str) -> str:
         """
@@ -565,32 +567,63 @@ def _normalize_interrupt_payload(
     # Error Part
     # =========================================================================
 
-    def format_error(self, error_text: str) -> str:
+    def format_error(
+        self,
+        error_text: str,
+        error_code: str | None = None,
+        extra: dict[str, object] | None = None,
+    ) -> str:
         """
         Format an error message.
 
         Args:
             error_text: The error message text
+            error_code: Optional machine-readable error code for frontend branching
 
         Returns:
             str: SSE formatted error part
 
         Example output:
-            data: {"type":"error","errorText":"Something went wrong"}
+            data: {"type":"error","errorText":"Something went wrong","errorCode":"SOME_CODE"}
         """
-        return self._format_sse({"type": "error", "errorText": error_text})
+        payload: dict[str, object] = {"type": "error", "errorText": error_text}
+        if error_code:
+            payload["errorCode"] = error_code
+        if extra:
+            payload.update(extra)
+        return self._format_sse(payload)
 
     # =========================================================================
     # Tool Parts
     # =========================================================================
 
-    def format_tool_input_start(self, tool_call_id: str, tool_name: str) -> str:
+    def format_tool_input_start(
+        self,
+        tool_call_id: str,
+        tool_name: str,
+        *,
+        langchain_tool_call_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> str:
         """
         Format the start of tool input streaming.
 
         Args:
-            tool_call_id: The unique tool call identifier
-            tool_name: The name of the tool being called
+            tool_call_id: The unique tool call identifier. May be EITHER the
+                synthetic ``call_<run_id>`` id derived from LangGraph
+                ``run_id`` (unmatched chunk fallback when no ``index`` was
+                registered) OR the authoritative LangChain ``tool_call.id``
+                (when the provider streams ``tool_call_chunks`` we register
+                the ``index`` and reuse the lc-id as the card id so live
+                ``tool-input-delta`` events route without a downstream join).
+                Either way, the same id is preserved across
+                ``tool-input-start`` / ``-delta`` / ``-available`` /
+                ``tool-output-available`` for one call.
+            tool_name: The name of the tool being called.
+            langchain_tool_call_id: Optional authoritative LangChain
+                ``tool_call.id``. When set, surfaces as
+                ``langchainToolCallId`` so the frontend can join this card
+                to the action-log row written by ``ActionLogMiddleware``.
 
         Returns:
             str: SSE formatted tool input start part
@@ -598,13 +631,16 @@ def format_tool_input_start(self, tool_call_id: str, tool_name: str) -> str:
         Example output:
             data: {"type":"tool-input-start","toolCallId":"call_abc123","toolName":"getWeather"}
         """
-        return self._format_sse(
-            {
-                "type": "tool-input-start",
-                "toolCallId": tool_call_id,
-                "toolName": tool_name,
-            }
-        )
+        payload: dict[str, Any] = {
+            "type": "tool-input-start",
+            "toolCallId": tool_call_id,
+            "toolName": tool_name,
+        }
+        if langchain_tool_call_id:
+            payload["langchainToolCallId"] = langchain_tool_call_id
+        if metadata:
+            payload["metadata"] = metadata
+        return self._format_sse(payload)
 
     def format_tool_input_delta(self, tool_call_id: str, input_text_delta: str) -> str:
         """
@@ -629,7 +665,13 @@ def format_tool_input_delta(self, tool_call_id: str, input_text_delta: str) -> s
         )
 
     def format_tool_input_available(
-        self, tool_call_id: str, tool_name: str, input_data: dict[str, Any]
+        self,
+        tool_call_id: str,
+        tool_name: str,
+        input_data: dict[str, Any],
+        *,
+        langchain_tool_call_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> str:
         """
         Format the completion of tool input.
@@ -638,6 +680,8 @@ def format_tool_input_available(
             tool_call_id: The tool call identifier
             tool_name: The name of the tool
             input_data: The complete tool input parameters
+            langchain_tool_call_id: Optional authoritative LangChain
+                ``tool_call.id`` (see ``format_tool_input_start``).
 
         Returns:
             str: SSE formatted tool input available part
@@ -645,22 +689,37 @@ def format_tool_input_available(
         Example output:
             data: {"type":"tool-input-available","toolCallId":"call_abc123","toolName":"getWeather","input":{"city":"SF"}}
         """
-        return self._format_sse(
-            {
-                "type": "tool-input-available",
-                "toolCallId": tool_call_id,
-                "toolName": tool_name,
-                "input": input_data,
-            }
-        )
+        payload: dict[str, Any] = {
+            "type": "tool-input-available",
+            "toolCallId": tool_call_id,
+            "toolName": tool_name,
+            "input": input_data,
+        }
+        if langchain_tool_call_id:
+            payload["langchainToolCallId"] = langchain_tool_call_id
+        if metadata:
+            payload["metadata"] = metadata
+        return self._format_sse(payload)
 
-    def format_tool_output_available(self, tool_call_id: str, output: Any) -> str:
+    def format_tool_output_available(
+        self,
+        tool_call_id: str,
+        output: Any,
+        *,
+        langchain_tool_call_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> str:
         """
         Format tool execution output.
 
         Args:
             tool_call_id: The tool call identifier
             output: The tool execution result
+            langchain_tool_call_id: Optional authoritative LangChain
+                ``tool_call.id`` extracted from ``ToolMessage.tool_call_id``.
+                When set, the frontend can backfill any card whose
+                ``langchainToolCallId`` was not yet known at
+                ``tool-input-start`` time.
 
         Returns:
             str: SSE formatted tool output available part
@@ -668,13 +727,16 @@ def format_tool_output_available(self, tool_call_id: str, output: Any) -> str:
         Example output:
             data: {"type":"tool-output-available","toolCallId":"call_abc123","output":{"weather":"sunny"}}
         """
-        return self._format_sse(
-            {
-                "type": "tool-output-available",
-                "toolCallId": tool_call_id,
-                "output": output,
-            }
-        )
+        payload: dict[str, Any] = {
+            "type": "tool-output-available",
+            "toolCallId": tool_call_id,
+            "output": output,
+        }
+        if langchain_tool_call_id:
+            payload["langchainToolCallId"] = langchain_tool_call_id
+        if metadata:
+            payload["metadata"] = metadata
+        return self._format_sse(payload)
 
     # =========================================================================
     # Step Parts
diff --git a/surfsense_backend/app/services/notion/kb_sync_service.py b/surfsense_backend/app/services/notion/kb_sync_service.py
index be177c7cac..b10d1b1579 100644
--- a/surfsense_backend/app/services/notion/kb_sync_service.py
+++ b/surfsense_backend/app/services/notion/kb_sync_service.py
@@ -4,7 +4,6 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.db import Document, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -74,6 +73,8 @@ async def sync_after_create(
                 )
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
@@ -244,6 +245,8 @@ async def sync_after_update(
                 f"Final content length: {len(full_content)} chars, verified={content_verified}"
             )
 
+            from app.services.llm_service import get_user_long_context_llm
+
             logger.debug("Generating summary and embeddings")
             user_llm = await get_user_long_context_llm(
                 self.db_session,
diff --git a/surfsense_backend/app/services/notion/tool_metadata_service.py b/surfsense_backend/app/services/notion/tool_metadata_service.py
index 097ef3461a..19dc1fd890 100644
--- a/surfsense_backend/app/services/notion/tool_metadata_service.py
+++ b/surfsense_backend/app/services/notion/tool_metadata_service.py
@@ -227,8 +227,6 @@ async def _get_notion_accounts(
     async def _check_account_health(self, connector_id: int) -> bool:
         """Check if a Notion connector's token is still valid.
 
-        Uses a lightweight ``users.me()`` call to verify the token.
-
         Returns True if the token is expired/invalid, False if healthy.
         """
         try:
diff --git a/surfsense_backend/app/services/obsidian_plugin_indexer.py b/surfsense_backend/app/services/obsidian_plugin_indexer.py
new file mode 100644
index 0000000000..0fc4f30f4b
--- /dev/null
+++ b/surfsense_backend/app/services/obsidian_plugin_indexer.py
@@ -0,0 +1,621 @@
+"""
+Obsidian plugin indexer service.
+
+Bridges the SurfSense Obsidian plugin's HTTP payloads
+(see ``app/schemas/obsidian_plugin.py``) into the shared
+``IndexingPipelineService``.
+
+Responsibilities:
+
+- ``upsert_note``  — push one note through the indexing pipeline; respects
+  unchanged content (skip) and version-snapshots existing rows before
+  rewrite.
+- ``rename_note``  — rewrite path-derived fields (path metadata,
+  ``unique_identifier_hash``, ``source_url``) without re-indexing content.
+- ``delete_note``  — soft delete with a tombstone in ``document_metadata``
+  so reconciliation can distinguish "user explicitly killed this in the UI"
+  from "plugin hasn't synced yet".
+- ``get_manifest`` — return ``{path: {hash, mtime, size}}`` for every
+  non-deleted note belonging to a vault, used by the plugin's reconcile
+  pass on ``onload``.
+
+Design notes
+------------
+
+The plugin's content hash and the backend's ``content_hash`` are computed
+differently (plugin uses raw SHA-256 of the markdown body; backend salts
+with ``search_space_id``). We persist the plugin's hash in
+``document_metadata['plugin_content_hash']`` so the manifest endpoint can
+return what the plugin sent — that's the only number the plugin can
+compare without re-downloading content.
+"""
+
+from __future__ import annotations
+
+import base64
+import contextlib
+import logging
+import os
+import tempfile
+from datetime import UTC, datetime
+from typing import Any
+from urllib.parse import quote
+
+from sqlalchemy import and_, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import (
+    Document,
+    DocumentStatus,
+    DocumentType,
+    SearchSourceConnector,
+)
+from app.indexing_pipeline.connector_document import ConnectorDocument
+from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
+from app.schemas.obsidian_plugin import (
+    ManifestEntry,
+    ManifestResponse,
+    NotePayload,
+)
+from app.utils.document_converters import generate_unique_identifier_hash
+from app.utils.document_versioning import create_version_snapshot
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _vault_path_unique_id(vault_id: str, path: str) -> str:
+    """Stable identifier for a note. Vault-scoped so the same path under two
+    different vaults doesn't collide."""
+    return f"{vault_id}:{path}"
+
+
+def _build_source_url(vault_name: str, path: str) -> str:
+    """Build the ``obsidian://`` deep link for the web UI's "Open in Obsidian"
+    button. Both segments are URL-encoded because vault names and paths can
+    contain spaces, ``#``, ``?``, etc.
+    """
+    return (
+        "obsidian://open"
+        f"?vault={quote(vault_name, safe='')}"
+        f"&file={quote(path, safe='')}"
+    )
+
+
+def _build_metadata(
+    payload: NotePayload,
+    *,
+    vault_name: str,
+    connector_id: int,
+    extra: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Flatten the rich plugin payload into the JSONB ``document_metadata``
+    column. Keys here are what the chat UI / search UI surface to users.
+    """
+    meta: dict[str, Any] = {
+        "source": "plugin",
+        "vault_id": payload.vault_id,
+        "vault_name": vault_name,
+        "file_path": payload.path,
+        "file_name": payload.name,
+        "extension": payload.extension,
+        "frontmatter": payload.frontmatter,
+        "tags": payload.tags,
+        "headings": [h.model_dump() for h in payload.headings],
+        "outgoing_links": payload.resolved_links,
+        "unresolved_links": payload.unresolved_links,
+        "embeds": payload.embeds,
+        "aliases": payload.aliases,
+        "plugin_content_hash": payload.content_hash,
+        "plugin_file_size": payload.size,
+        "mtime": payload.mtime.isoformat(),
+        "ctime": payload.ctime.isoformat(),
+        "connector_id": connector_id,
+        "url": _build_source_url(vault_name, payload.path),
+    }
+    if payload.is_binary:
+        meta["is_binary"] = True
+        meta["mime_type"] = payload.mime_type
+    if extra:
+        meta.update(extra)
+    return meta
+
+
+def _build_document_string(
+    payload: NotePayload, vault_name: str, *, content_override: str | None = None
+) -> str:
+    """Compose the indexable string the pipeline embeds and chunks.
+
+    Mirrors the legacy obsidian indexer's METADATA + CONTENT framing so
+    existing search relevance heuristics keep working unchanged.
+    """
+    tags_line = ", ".join(payload.tags) if payload.tags else "None"
+    links_line = ", ".join(payload.resolved_links) if payload.resolved_links else "None"
+    body = payload.content if content_override is None else content_override
+    return (
+        "<METADATA>\n"
+        f"Title: {payload.name}\n"
+        f"Vault: {vault_name}\n"
+        f"Path: {payload.path}\n"
+        f"Tags: {tags_line}\n"
+        f"Links to: {links_line}\n"
+        "</METADATA>\n\n"
+        "<CONTENT>\n"
+        f"{body}\n"
+        "</CONTENT>\n"
+    )
+
+
+async def _extract_binary_attachment_markdown(
+    payload: NotePayload, *, vision_llm
+) -> tuple[str, dict[str, Any]]:
+    try:
+        raw_bytes = base64.b64decode(payload.binary_base64, validate=True)
+    except Exception:
+        logger.warning(
+            "obsidian attachment payload had invalid base64: %s", payload.path
+        )
+        return "", {"attachment_extraction_status": "invalid_binary_payload"}
+
+    suffix = f".{payload.extension.lstrip('.')}"
+    temp_path: str | None = None
+    filename = payload.path.rsplit("/", 1)[-1] or payload.name
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp.write(raw_bytes)
+            temp_path = tmp.name
+
+        result = await _run_etl_extract(
+            file_path=temp_path,
+            filename=filename,
+            vision_llm=vision_llm,
+        )
+        metadata: dict[str, Any] = {
+            "attachment_extraction_status": "ok",
+            "attachment_etl_service": result.etl_service,
+            "attachment_content_type": result.content_type,
+        }
+        return result.markdown_content, metadata
+    except Exception as exc:
+        logger.warning(
+            "obsidian attachment ETL failed for %s: %s",
+            payload.path,
+            exc,
+            exc_info=True,
+        )
+        return "", {
+            "attachment_extraction_status": "etl_failed",
+            "attachment_extraction_error": str(exc)[:300],
+        }
+    finally:
+        if temp_path and os.path.exists(temp_path):
+            with contextlib.suppress(Exception):
+                os.unlink(temp_path)
+
+
+async def _run_etl_extract(*, file_path: str, filename: str, vision_llm):
+    """Lazy-load ETL dependencies to avoid module-import cycles."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+
+    return await EtlPipelineService(vision_llm=vision_llm).extract(
+        EtlRequest(file_path=file_path, filename=filename)
+    )
+
+
+def _is_image_attachment(payload: NotePayload) -> bool:
+    ext = payload.extension.lower().lstrip(".")
+    return ext in {"png", "jpg", "jpeg", "gif", "webp", "svg"}
+
+
+async def _resolve_attachment_vision_llm(
+    session: AsyncSession,
+    *,
+    connector: SearchSourceConnector,
+    search_space_id: int,
+    payload: NotePayload,
+):
+    """Match connector indexers: only fetch vision LLM for image attachments
+    when the connector has vision indexing enabled."""
+    if not payload.is_binary:
+        return None
+    if not _is_image_attachment(payload):
+        return None
+    if not getattr(connector, "enable_vision_llm", False):
+        return None
+
+    from app.services.llm_service import get_vision_llm
+
+    return await get_vision_llm(session, search_space_id)
+
+
+async def _resolve_summary_llm(
+    session: AsyncSession, *, user_id: str, search_space_id: int, should_summarize: bool
+):
+    """Fetch summary LLM only when indexing summary is enabled."""
+    if not should_summarize:
+        return None
+
+    from app.services.llm_service import get_user_long_context_llm
+
+    return await get_user_long_context_llm(session, user_id, search_space_id)
+
+
+def _require_extracted_attachment_content(
+    *, content: str, etl_meta: dict[str, Any], path: str
+) -> str:
+    extracted = content.strip()
+    if extracted:
+        return extracted
+
+    status = etl_meta.get("attachment_extraction_status", "unknown")
+    reason = etl_meta.get("attachment_extraction_error")
+    if reason:
+        raise RuntimeError(
+            f"Attachment extraction failed for {path} ({status}): {reason}"
+        )
+    raise RuntimeError(f"Attachment extraction failed for {path} ({status})")
+
+
+async def _find_existing_document(
+    session: AsyncSession,
+    *,
+    search_space_id: int,
+    vault_id: str,
+    path: str,
+) -> Document | None:
+    unique_id = _vault_path_unique_id(vault_id, path)
+    uid_hash = generate_unique_identifier_hash(
+        DocumentType.OBSIDIAN_CONNECTOR,
+        unique_id,
+        search_space_id,
+    )
+    result = await session.execute(
+        select(Document).where(Document.unique_identifier_hash == uid_hash)
+    )
+    return result.scalars().first()
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+async def upsert_note(
+    session: AsyncSession,
+    *,
+    connector: SearchSourceConnector,
+    payload: NotePayload,
+    user_id: str,
+) -> Document:
+    """Index or refresh a single note pushed by the plugin.
+
+    Returns the resulting ``Document`` (whether newly created, updated, or
+    a skip-because-unchanged hit).
+    """
+    vault_name: str = (connector.config or {}).get("vault_name") or "Vault"
+    search_space_id = connector.search_space_id
+
+    existing = await _find_existing_document(
+        session,
+        search_space_id=search_space_id,
+        vault_id=payload.vault_id,
+        path=payload.path,
+    )
+
+    plugin_hash = payload.content_hash
+    if existing is not None:
+        existing_meta = existing.document_metadata or {}
+        was_tombstoned = bool(existing_meta.get("deleted_at"))
+
+        if (
+            not was_tombstoned
+            and existing_meta.get("plugin_content_hash") == plugin_hash
+            and DocumentStatus.is_state(existing.status, DocumentStatus.READY)
+        ):
+            return existing
+
+        try:
+            await create_version_snapshot(session, existing)
+        except Exception:
+            logger.debug(
+                "version snapshot failed for obsidian doc %s",
+                existing.id,
+                exc_info=True,
+            )
+
+    content_for_index = payload.content
+    extra_meta: dict[str, Any] = {}
+    vision_llm = None
+    if payload.is_binary:
+        vision_llm = await _resolve_attachment_vision_llm(
+            session,
+            connector=connector,
+            search_space_id=search_space_id,
+            payload=payload,
+        )
+        content_for_index, etl_meta = await _extract_binary_attachment_markdown(
+            payload, vision_llm=vision_llm
+        )
+        extra_meta.update(etl_meta)
+        # Strict KB behavior: do not index metadata-only attachments.
+        content_for_index = _require_extracted_attachment_content(
+            content=content_for_index,
+            etl_meta=etl_meta,
+            path=payload.path,
+        )
+
+    llm = await _resolve_summary_llm(
+        session,
+        user_id=str(user_id),
+        search_space_id=search_space_id,
+        should_summarize=connector.enable_summary,
+    )
+
+    document_string = _build_document_string(
+        payload, vault_name, content_override=content_for_index
+    )
+    metadata = _build_metadata(
+        payload,
+        vault_name=vault_name,
+        connector_id=connector.id,
+        extra=extra_meta,
+    )
+
+    connector_doc = ConnectorDocument(
+        title=payload.name,
+        source_markdown=document_string,
+        unique_id=_vault_path_unique_id(payload.vault_id, payload.path),
+        document_type=DocumentType.OBSIDIAN_CONNECTOR,
+        search_space_id=search_space_id,
+        connector_id=connector.id,
+        created_by_id=str(user_id),
+        should_summarize=connector.enable_summary,
+        fallback_summary=f"Obsidian Note: {payload.name}\n\n{content_for_index}",
+        metadata=metadata,
+    )
+
+    pipeline = IndexingPipelineService(session)
+    prepared = await pipeline.prepare_for_indexing([connector_doc])
+    if not prepared:
+        if existing is not None:
+            return existing
+        raise RuntimeError(f"Indexing pipeline rejected obsidian note {payload.path}")
+
+    document = prepared[0]
+
+    return await pipeline.index(document, connector_doc, llm)
+
+
+async def rename_note(
+    session: AsyncSession,
+    *,
+    connector: SearchSourceConnector,
+    old_path: str,
+    new_path: str,
+    vault_id: str,
+) -> Document | None:
+    """Rewrite path-derived columns without re-indexing content.
+
+    Returns the updated document, or ``None`` if no row matched the
+    ``old_path`` (this happens when the plugin is renaming a file that was
+    never synced — safe to ignore, the next ``sync`` will create it under
+    the new path).
+    """
+    vault_name: str = (connector.config or {}).get("vault_name") or "Vault"
+    search_space_id = connector.search_space_id
+
+    existing = await _find_existing_document(
+        session,
+        search_space_id=search_space_id,
+        vault_id=vault_id,
+        path=old_path,
+    )
+    if existing is None:
+        return None
+
+    new_unique_id = _vault_path_unique_id(vault_id, new_path)
+    new_uid_hash = generate_unique_identifier_hash(
+        DocumentType.OBSIDIAN_CONNECTOR,
+        new_unique_id,
+        search_space_id,
+    )
+
+    collision = await session.execute(
+        select(Document).where(
+            and_(
+                Document.unique_identifier_hash == new_uid_hash,
+                Document.id != existing.id,
+            )
+        )
+    )
+    collision_row = collision.scalars().first()
+    if collision_row is not None:
+        logger.warning(
+            "obsidian rename target already exists "
+            "(vault=%s old=%s new=%s); skipping rename so the next /sync "
+            "can resolve the conflict via content_hash",
+            vault_id,
+            old_path,
+            new_path,
+        )
+        return existing
+
+    new_filename = new_path.rsplit("/", 1)[-1]
+    new_stem = new_filename.rsplit(".", 1)[0] if "." in new_filename else new_filename
+
+    existing.unique_identifier_hash = new_uid_hash
+    existing.title = new_stem
+
+    meta = dict(existing.document_metadata or {})
+    meta["file_path"] = new_path
+    meta["file_name"] = new_stem
+    meta["url"] = _build_source_url(vault_name, new_path)
+    existing.document_metadata = meta
+    existing.updated_at = datetime.now(UTC)
+
+    await session.commit()
+    return existing
+
+
+async def delete_note(
+    session: AsyncSession,
+    *,
+    connector: SearchSourceConnector,
+    vault_id: str,
+    path: str,
+) -> bool:
+    """Soft-delete via tombstone in ``document_metadata``.
+
+    The row is *not* removed and chunks are *not* dropped, so existing
+    citations in chat threads remain resolvable. The manifest endpoint
+    filters tombstoned rows out, so the plugin's reconcile pass will not
+    see this path and won't try to "resurrect" a note the user deleted in
+    the SurfSense UI.
+
+    Returns True if a row was tombstoned, False if no matching row existed.
+    """
+    existing = await _find_existing_document(
+        session,
+        search_space_id=connector.search_space_id,
+        vault_id=vault_id,
+        path=path,
+    )
+    if existing is None:
+        return False
+
+    meta = dict(existing.document_metadata or {})
+    if meta.get("deleted_at"):
+        return True
+
+    meta["deleted_at"] = datetime.now(UTC).isoformat()
+    meta["deleted_by_source"] = "plugin"
+    existing.document_metadata = meta
+    existing.updated_at = datetime.now(UTC)
+
+    await session.commit()
+    return True
+
+
+async def merge_obsidian_connectors(
+    session: AsyncSession,
+    *,
+    source: SearchSourceConnector,
+    target: SearchSourceConnector,
+) -> None:
+    """Fold ``source``'s documents into ``target`` and delete ``source``.
+
+    Triggered when the fingerprint dedup detects two plugin connectors
+    pointing at the same vault (e.g. a mobile install raced with iCloud
+    hydration and got a partial fingerprint, then caught up). Path
+    collisions resolve in favour of ``target`` (the surviving row);
+    ``source``'s duplicate documents are hard-deleted along with their
+    chunks via the ``cascade='all, delete-orphan'`` on ``Document.chunks``.
+    """
+    if source.id == target.id:
+        return
+
+    target_vault_id = (target.config or {}).get("vault_id")
+    target_search_space_id = target.search_space_id
+    if not target_vault_id:
+        raise RuntimeError("merge target is missing vault_id")
+
+    target_paths_result = await session.execute(
+        select(Document).where(
+            and_(
+                Document.connector_id == target.id,
+                Document.document_type == DocumentType.OBSIDIAN_CONNECTOR,
+            )
+        )
+    )
+    target_paths: set[str] = set()
+    for doc in target_paths_result.scalars().all():
+        meta = doc.document_metadata or {}
+        path = meta.get("file_path")
+        if path:
+            target_paths.add(path)
+
+    source_docs_result = await session.execute(
+        select(Document).where(
+            and_(
+                Document.connector_id == source.id,
+                Document.document_type == DocumentType.OBSIDIAN_CONNECTOR,
+            )
+        )
+    )
+
+    for doc in source_docs_result.scalars().all():
+        meta = dict(doc.document_metadata or {})
+        path = meta.get("file_path")
+        if not path or path in target_paths:
+            await session.delete(doc)
+            continue
+
+        new_unique_id = _vault_path_unique_id(target_vault_id, path)
+        new_uid_hash = generate_unique_identifier_hash(
+            DocumentType.OBSIDIAN_CONNECTOR,
+            new_unique_id,
+            target_search_space_id,
+        )
+        meta["vault_id"] = target_vault_id
+        meta["connector_id"] = target.id
+        doc.document_metadata = meta
+        doc.connector_id = target.id
+        doc.search_space_id = target_search_space_id
+        doc.unique_identifier_hash = new_uid_hash
+        target_paths.add(path)
+
+    await session.flush()
+    await session.delete(source)
+
+
+async def get_manifest(
+    session: AsyncSession,
+    *,
+    connector: SearchSourceConnector,
+    vault_id: str,
+) -> ManifestResponse:
+    """Return ``{path: {hash, mtime, size}}`` for every non-deleted note in
+    this vault.
+
+    The plugin compares this against its local vault on every ``onload`` to
+    catch up edits made while offline. Rows missing ``plugin_content_hash``
+    (e.g. tombstoned, or somehow indexed without going through this
+    service) are excluded so the plugin doesn't get confused by partial
+    data.
+    """
+    result = await session.execute(
+        select(Document).where(
+            and_(
+                Document.search_space_id == connector.search_space_id,
+                Document.connector_id == connector.id,
+                Document.document_type == DocumentType.OBSIDIAN_CONNECTOR,
+            )
+        )
+    )
+
+    items: dict[str, ManifestEntry] = {}
+    for doc in result.scalars().all():
+        meta = doc.document_metadata or {}
+        if meta.get("deleted_at"):
+            continue
+        if meta.get("vault_id") != vault_id:
+            continue
+        path = meta.get("file_path")
+        plugin_hash = meta.get("plugin_content_hash")
+        mtime_raw = meta.get("mtime")
+        if not path or not plugin_hash or not mtime_raw:
+            continue
+        try:
+            mtime = datetime.fromisoformat(mtime_raw)
+        except ValueError:
+            continue
+        size_raw = meta.get("plugin_file_size")
+        size = int(size_raw) if isinstance(size_raw, int) else None
+        items[path] = ManifestEntry(hash=plugin_hash, mtime=mtime, size=size)
+
+    return ManifestResponse(vault_id=vault_id, items=items)
diff --git a/surfsense_backend/app/services/onedrive/kb_sync_service.py b/surfsense_backend/app/services/onedrive/kb_sync_service.py
index 962c19fc90..e9b2e38ea2 100644
--- a/surfsense_backend/app/services/onedrive/kb_sync_service.py
+++ b/surfsense_backend/app/services/onedrive/kb_sync_service.py
@@ -5,7 +5,6 @@
 
 from app.db import Document, DocumentType
 from app.indexing_pipeline.document_hashing import compute_identifier_hash
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
     embed_text,
@@ -73,6 +72,8 @@ async def sync_after_create(
                 )
                 content_hash = unique_hash
 
+            from app.services.llm_service import get_user_long_context_llm
+
             user_llm = await get_user_long_context_llm(
                 self.db_session,
                 user_id,
diff --git a/surfsense_backend/app/services/openrouter_integration_service.py b/surfsense_backend/app/services/openrouter_integration_service.py
new file mode 100644
index 0000000000..6454e2d58c
--- /dev/null
+++ b/surfsense_backend/app/services/openrouter_integration_service.py
@@ -0,0 +1,990 @@
+"""
+OpenRouter Integration Service
+
+Dynamically fetches all available models from the OpenRouter public API
+and generates virtual global LLM config entries. These entries are injected
+into config.GLOBAL_LLM_CONFIGS so they appear alongside static YAML configs
+in the model selector.
+
+All actual LLM calls go through LiteLLM with the ``openrouter/`` prefix --
+this service only manages the catalogue, not the inference path.
+"""
+
+import asyncio
+import hashlib
+import logging
+import threading
+import time
+from typing import Any
+
+import httpx
+
+from app.services.quality_score import (
+    _HEALTH_BLEND_WEIGHT,
+    _HEALTH_ENRICH_CONCURRENCY,
+    _HEALTH_ENRICH_TOP_N_FREE,
+    _HEALTH_ENRICH_TOP_N_PREMIUM,
+    _HEALTH_FAIL_RATIO_FALLBACK,
+    _HEALTH_FETCH_TIMEOUT_SEC,
+    aggregate_health,
+    static_score_or,
+)
+
+logger = logging.getLogger(__name__)
+
+OPENROUTER_API_URL = "https://openrouter.ai/api/v1/models"
+OPENROUTER_ENDPOINTS_URL_TEMPLATE = (
+    "https://openrouter.ai/api/v1/models/{model_id}/endpoints"
+)
+
+# Sentinel value stored on each generated config so we can distinguish
+# dynamic OpenRouter entries from hand-written YAML entries during refresh.
+_OPENROUTER_DYNAMIC_MARKER = "__openrouter_dynamic__"
+
+# Width of the hash space used by ``_stable_config_id``. 9_000_000 provides
+# enough headroom to avoid frequent collisions for OpenRouter's catalogue
+# (~300 models) while keeping IDs comfortably within Postgres INTEGER range.
+_STABLE_ID_HASH_WIDTH = 9_000_000
+
+
+def _stable_config_id(model_id: str, offset: int, taken: set[int]) -> int:
+    """Derive a deterministic negative config ID from ``model_id``.
+
+    The same ``model_id`` always hashes to the same base value so thread pins
+    survive catalogue churn (models appearing/disappearing/reordering between
+    refreshes). On collision we decrement until we find an unused slot; this
+    keeps the mapping stable for the first config that claimed a slot and
+    only shifts collisions, which is much less disruptive than the legacy
+    index-based scheme that reshuffled every ID when the catalogue changed.
+    """
+    digest = hashlib.blake2b(model_id.encode("utf-8"), digest_size=6).digest()
+    base = offset - (int.from_bytes(digest, "big") % _STABLE_ID_HASH_WIDTH)
+    cid = base
+    while cid in taken:
+        cid -= 1
+    taken.add(cid)
+    return cid
+
+
+def _openrouter_tier(model: dict) -> str:
+    """Classify an OpenRouter model as ``"free"`` or ``"premium"``.
+
+    Per OpenRouter's API contract, a model is free if:
+    - Its id ends with ``:free`` (OpenRouter's own free-variant convention), or
+    - Both ``pricing.prompt`` and ``pricing.completion`` are zero strings.
+
+    Anything else (missing pricing, non-zero pricing) falls through to
+    ``"premium"`` so we never under-charge users. This derivation runs off the
+    already-cached /api/v1/models payload, so it adds no network cost.
+    """
+    if model.get("id", "").endswith(":free"):
+        return "free"
+    pricing = model.get("pricing") or {}
+    prompt = str(pricing.get("prompt", "")).strip()
+    completion = str(pricing.get("completion", "")).strip()
+    if prompt == "0" and completion == "0":
+        return "free"
+    return "premium"
+
+
+def _is_text_output_model(model: dict) -> bool:
+    """Return True if the model produces text output only (skip image/audio generators)."""
+    output_mods = model.get("architecture", {}).get("output_modalities", [])
+    return output_mods == ["text"]
+
+
+def _is_image_output_model(model: dict) -> bool:
+    """Return True if the model can produce image output.
+
+    OpenRouter's ``architecture.output_modalities`` is a list (e.g.
+    ``["image"]`` for pure image generators, ``["text", "image"]`` for
+    multi-modal generators that also emit captions). We accept any model
+    that can output images; the call site decides whether to use the
+    image-generation API or chat completion.
+    """
+    output_mods = model.get("architecture", {}).get("output_modalities", []) or []
+    return "image" in output_mods
+
+
+def _is_vision_input_model(model: dict) -> bool:
+    """Return True if the model can ingest an image AND emit text.
+
+    OpenRouter's ``architecture.input_modalities`` lists what the model
+    accepts; ``output_modalities`` lists what it produces. A vision LLM
+    is a model that takes images in and produces text out — i.e. it can
+    answer questions about a screenshot or extract content from an
+    image. Pure image-to-image models (e.g. style transfer) and
+    text-only models are excluded.
+    """
+    arch = model.get("architecture", {}) or {}
+    input_mods = arch.get("input_modalities", []) or []
+    output_mods = arch.get("output_modalities", []) or []
+    return "image" in input_mods and "text" in output_mods
+
+
+def _supports_image_input(model: dict) -> bool:
+    """Return True if the model accepts ``image`` in its input modalities.
+
+    Differs from :func:`_is_vision_input_model` in that it does NOT
+    require text output — chat-tab models always emit text already (the
+    chat catalog filters by ``_is_text_output_model``), so the only
+    extra capability we need to track per chat config is whether the
+    model can ingest user-attached images. The chat selector and the
+    streaming task both key off this flag to prevent hitting an
+    OpenRouter 404 ``"No endpoints found that support image input"``
+    when the user uploads an image and selects a text-only model
+    (DeepSeek V3, Llama 3.x base, etc.).
+    """
+    arch = model.get("architecture", {}) or {}
+    input_mods = arch.get("input_modalities", []) or []
+    return "image" in input_mods
+
+
+def _supports_tool_calling(model: dict) -> bool:
+    """Return True if the model supports function/tool calling."""
+    supported = model.get("supported_parameters") or []
+    return "tools" in supported
+
+
+MIN_CONTEXT_LENGTH = 100_000
+
+# Provider slugs whose backend is fundamentally incompatible with our agent's
+# tool-call message flow (e.g. Amazon Bedrock requires toolConfig alongside
+# tool history which OpenRouter doesn't relay).
+_EXCLUDED_PROVIDER_SLUGS = {"amazon"}
+
+_EXCLUDED_MODEL_IDS: set[str] = {
+    # Deprecated / removed upstream
+    "openai/gpt-4-1106-preview",
+    "openai/gpt-4-turbo-preview",
+    # Permanently no-capacity variant
+    "openai/gpt-4o:extended",
+    # Non-serverless model that requires a dedicated endpoint
+    "arcee-ai/virtuoso-large",
+    # Deep-research models reject standard params (temperature, etc.)
+    "openai/o3-deep-research",
+    "openai/o4-mini-deep-research",
+    # OpenRouter's own meta-router over free models. We already enumerate every
+    # concrete ``:free`` model into GLOBAL_LLM_CONFIGS and Auto-mode thread
+    # pinning handles churn via the repair path, so exposing an additional
+    # indirection layer would only duplicate the capability with an opaque slug.
+    "openrouter/free",
+}
+
+_EXCLUDED_MODEL_SUFFIXES: tuple[str, ...] = ("-deep-research",)
+
+
+def _has_sufficient_context(model: dict) -> bool:
+    """Return True if the model's context window is at least MIN_CONTEXT_LENGTH."""
+    ctx = model.get("context_length") or 0
+    return ctx >= MIN_CONTEXT_LENGTH
+
+
+def _is_compatible_provider(model: dict) -> bool:
+    """Return False for models from providers known to be incompatible."""
+    model_id = model.get("id", "")
+    slug = model_id.split("/", 1)[0] if "/" in model_id else ""
+    return slug not in _EXCLUDED_PROVIDER_SLUGS
+
+
+def _is_allowed_model(model: dict) -> bool:
+    """Return False for specific model IDs known to be broken or incompatible."""
+    model_id = model.get("id", "")
+    if model_id in _EXCLUDED_MODEL_IDS:
+        return False
+    base_id = model_id.split(":")[0]
+    return not base_id.endswith(_EXCLUDED_MODEL_SUFFIXES)
+
+
+def _fetch_models_sync() -> list[dict] | None:
+    """Synchronous fetch for use during startup (before the event loop is running)."""
+    try:
+        with httpx.Client(timeout=20) as client:
+            response = client.get(OPENROUTER_API_URL)
+            response.raise_for_status()
+            data = response.json()
+            return data.get("data", [])
+    except Exception as e:
+        logger.warning("Failed to fetch OpenRouter models (sync): %s", e)
+        return None
+
+
+async def _fetch_models_async() -> list[dict] | None:
+    """Async fetch for background refresh."""
+    try:
+        async with httpx.AsyncClient(timeout=20) as client:
+            response = await client.get(OPENROUTER_API_URL)
+            response.raise_for_status()
+            data = response.json()
+            return data.get("data", [])
+    except Exception as e:
+        logger.warning("Failed to fetch OpenRouter models (async): %s", e)
+        return None
+
+
+def _extract_raw_pricing(raw_models: list[dict]) -> dict[str, dict[str, str]]:
+    """Return a ``{model_id: {"prompt": str, "completion": str}}`` map.
+
+    Pricing values are kept as the raw OpenRouter strings (e.g.
+    ``"0.000003"``); ``pricing_registration`` converts them to floats
+    when registering with LiteLLM. Models with missing or malformed
+    pricing are simply omitted — operator-side risk if any of those are
+    premium.
+    """
+    pricing: dict[str, dict[str, str]] = {}
+    for model in raw_models:
+        model_id = str(model.get("id") or "").strip()
+        if not model_id:
+            continue
+        p = model.get("pricing") or {}
+        prompt = p.get("prompt")
+        completion = p.get("completion")
+        if prompt is None and completion is None:
+            continue
+        pricing[model_id] = {
+            "prompt": str(prompt) if prompt is not None else "",
+            "completion": str(completion) if completion is not None else "",
+        }
+    return pricing
+
+
+def _generate_configs(
+    raw_models: list[dict],
+    settings: dict[str, Any],
+) -> list[dict]:
+    """Convert raw OpenRouter model entries into global LLM config dicts.
+
+    Tier (``billing_tier``) is derived per-model from OpenRouter's own API
+    signals via ``_openrouter_tier`` — there is no longer a uniform YAML
+    override. Config IDs are derived via ``_stable_config_id`` so they
+    survive catalogue churn across refreshes.
+
+    Router-pool membership is tier-aware:
+
+    - Premium OR models join the LiteLLM router pool (``router_pool_eligible=True``)
+      so sub-agent ``model="auto"`` flows benefit from load balancing and
+      failover across the curated YAML configs and the OR premium passthrough.
+    - Free OR models stay excluded (``router_pool_eligible=False``). LiteLLM
+      Router tracks rate limits per deployment, but OpenRouter enforces a
+      single global free-tier quota (~20 RPM + 50-1000 daily requests
+      account-wide across every ``:free`` model), so rotating across many
+      free deployments would only burn the shared bucket faster. Free OR
+      models remain fully available for user-facing Auto-mode thread pinning
+      via ``auto_model_pin_service``.
+
+    OpenRouter's own ``openrouter/free`` meta-router is filtered out upstream
+    via ``_EXCLUDED_MODEL_IDS``; we don't expose a redundant auto-select layer
+    because our own Auto (Fastest) pin + 24 h refresh + repair logic already
+    cover the catalogue-churn case.
+    """
+    id_offset: int = settings.get("id_offset", -10000)
+    api_key: str = settings.get("api_key", "")
+    seo_enabled: bool = settings.get("seo_enabled", False)
+    quota_reserve_tokens: int = settings.get("quota_reserve_tokens", 4000)
+    rpm: int = settings.get("rpm", 200)
+    tpm: int = settings.get("tpm", 1_000_000)
+    free_rpm: int = settings.get("free_rpm", 20)
+    free_tpm: int = settings.get("free_tpm", 100_000)
+    anon_paid: bool = settings.get("anonymous_enabled_paid", False)
+    anon_free: bool = settings.get("anonymous_enabled_free", False)
+    litellm_params: dict = settings.get("litellm_params") or {}
+    system_instructions: str = settings.get("system_instructions", "")
+    use_default: bool = settings.get("use_default_system_instructions", True)
+    citations_enabled: bool = settings.get("citations_enabled", True)
+
+    text_models = [
+        m
+        for m in raw_models
+        if _is_text_output_model(m)
+        and _supports_tool_calling(m)
+        and _has_sufficient_context(m)
+        and _is_compatible_provider(m)
+        and _is_allowed_model(m)
+        and "/" in m.get("id", "")
+    ]
+
+    configs: list[dict] = []
+    taken: set[int] = set()
+    now_ts = int(time.time())
+
+    for model in text_models:
+        model_id: str = model["id"]
+        name: str = model.get("name", model_id)
+        tier = _openrouter_tier(model)
+
+        static_q = static_score_or(model, now_ts=now_ts)
+
+        cfg: dict[str, Any] = {
+            "id": _stable_config_id(model_id, id_offset, taken),
+            "name": name,
+            "description": f"{name} via OpenRouter",
+            "billing_tier": tier,
+            "anonymous_enabled": anon_free if tier == "free" else anon_paid,
+            "seo_enabled": seo_enabled,
+            "seo_slug": None,
+            "quota_reserve_tokens": quota_reserve_tokens,
+            "provider": "OPENROUTER",
+            "model_name": model_id,
+            "api_key": api_key,
+            "api_base": "",
+            "rpm": free_rpm if tier == "free" else rpm,
+            "tpm": free_tpm if tier == "free" else tpm,
+            "litellm_params": dict(litellm_params),
+            "system_instructions": system_instructions,
+            "use_default_system_instructions": use_default,
+            "citations_enabled": citations_enabled,
+            # Premium OR deployments join the LiteLLM router pool so sub-agent
+            # model="auto" flows can load-balance / fail over across them.
+            # Free OR deployments stay out: OpenRouter's free tier is a single
+            # account-wide quota, so per-deployment routing can't spread load
+            # there — it just drains the shared bucket faster.
+            "router_pool_eligible": tier == "premium",
+            # Capability flag derived from ``architecture.input_modalities``.
+            # Read by the new-chat selector to dim image-incompatible models
+            # when the user has pending image attachments, and by
+            # ``stream_new_chat`` as a fail-fast safety net before the
+            # OpenRouter request would otherwise 404 with
+            # ``"No endpoints found that support image input"``.
+            "supports_image_input": _supports_image_input(model),
+            _OPENROUTER_DYNAMIC_MARKER: True,
+            # Auto (Fastest) ranking metadata. ``quality_score`` is initialised
+            # to the static score and gets re-blended with health on the next
+            # ``_enrich_health`` pass (synchronous on refresh, deferred on cold
+            # start so startup latency is unchanged).
+            "auto_pin_tier": "B" if tier == "premium" else "C",
+            "quality_score_static": static_q,
+            "quality_score_health": None,
+            "quality_score": static_q,
+            "health_gated": False,
+        }
+        configs.append(cfg)
+
+    return configs
+
+
+# ID-offset bands used to keep dynamic OpenRouter configs in their own
+# namespace per surface. Image / vision get separate bands so a single
+# Postgres-INTEGER cfg ID is unambiguous about which selector it belongs to.
+_OPENROUTER_IMAGE_ID_OFFSET_DEFAULT = -20000
+_OPENROUTER_VISION_ID_OFFSET_DEFAULT = -30000
+
+
+def _generate_image_gen_configs(
+    raw_models: list[dict], settings: dict[str, Any]
+) -> list[dict]:
+    """Convert OpenRouter image-generation models into global image-gen
+    config dicts (matches the YAML shape consumed by ``image_generation_routes``).
+
+    Filter:
+      - architecture.output_modalities contains "image"
+      - compatible provider (excluded slugs blocked)
+      - allowed model id (excluded list blocked)
+
+    Notably we *drop* the chat-only filters (``_supports_tool_calling`` and
+    ``_has_sufficient_context``) because tool calls and context windows are
+    irrelevant for the ``aimage_generation`` API. ``billing_tier`` is
+    derived per model the same way as chat (``_openrouter_tier``).
+
+    Cost is intentionally *not* registered with LiteLLM at startup
+    (``pricing_registration`` skips image gen): OpenRouter image-gen
+    models are not in LiteLLM's native cost map and OpenRouter populates
+    ``response_cost`` directly from the response header. A defensive
+    branch in ``_extract_cost_usd`` handles the rare case where
+    ``usage.cost`` is missing — see ``token_tracking_service``.
+    """
+    id_offset: int = int(
+        settings.get("image_id_offset") or _OPENROUTER_IMAGE_ID_OFFSET_DEFAULT
+    )
+    api_key: str = settings.get("api_key", "")
+    rpm: int = settings.get("rpm", 200)
+    free_rpm: int = settings.get("free_rpm", 20)
+    litellm_params: dict = settings.get("litellm_params") or {}
+
+    image_models = [
+        m
+        for m in raw_models
+        if _is_image_output_model(m)
+        and _is_compatible_provider(m)
+        and _is_allowed_model(m)
+        and "/" in m.get("id", "")
+    ]
+
+    configs: list[dict] = []
+    taken: set[int] = set()
+    for model in image_models:
+        model_id: str = model["id"]
+        name: str = model.get("name", model_id)
+        tier = _openrouter_tier(model)
+
+        cfg: dict[str, Any] = {
+            "id": _stable_config_id(model_id, id_offset, taken),
+            "name": name,
+            "description": f"{name} via OpenRouter (image generation)",
+            "provider": "OPENROUTER",
+            "model_name": model_id,
+            "api_key": api_key,
+            # Pin to OpenRouter's public base URL so a downstream call site
+            # that forgets ``resolve_api_base`` still doesn't inherit
+            # ``AZURE_OPENAI_ENDPOINT`` and 404 on
+            # ``image_generation/transformation`` (defense-in-depth, see
+            # ``provider_api_base`` docstring).
+            "api_base": "https://openrouter.ai/api/v1",
+            "api_version": None,
+            "rpm": free_rpm if tier == "free" else rpm,
+            "litellm_params": dict(litellm_params),
+            "billing_tier": tier,
+            _OPENROUTER_DYNAMIC_MARKER: True,
+        }
+        configs.append(cfg)
+
+    return configs
+
+
+def _generate_vision_llm_configs(
+    raw_models: list[dict], settings: dict[str, Any]
+) -> list[dict]:
+    """Convert OpenRouter vision-capable LLMs into global vision-LLM config
+    dicts (matches the YAML shape consumed by ``vision_llm_routes``).
+
+    Filter:
+      - architecture.input_modalities contains "image"
+      - architecture.output_modalities contains "text"
+      - compatible provider (excluded slugs blocked)
+      - allowed model id (excluded list blocked)
+
+    Vision-LLM is invoked from the indexer (image extraction during
+    document upload) via ``langchain_litellm.ChatLiteLLM.ainvoke``, so
+    the chat-only ``_supports_tool_calling`` and ``_has_sufficient_context``
+    filters do not apply: a small-context vision model that doesn't
+    advertise tool-calling is still perfectly viable for "describe this
+    image" prompts.
+    """
+    id_offset: int = int(
+        settings.get("vision_id_offset") or _OPENROUTER_VISION_ID_OFFSET_DEFAULT
+    )
+    api_key: str = settings.get("api_key", "")
+    rpm: int = settings.get("rpm", 200)
+    tpm: int = settings.get("tpm", 1_000_000)
+    free_rpm: int = settings.get("free_rpm", 20)
+    free_tpm: int = settings.get("free_tpm", 100_000)
+    quota_reserve_tokens: int = settings.get("quota_reserve_tokens", 4000)
+    litellm_params: dict = settings.get("litellm_params") or {}
+
+    vision_models = [
+        m
+        for m in raw_models
+        if _is_vision_input_model(m)
+        and _is_compatible_provider(m)
+        and _is_allowed_model(m)
+        and "/" in m.get("id", "")
+    ]
+
+    configs: list[dict] = []
+    taken: set[int] = set()
+    for model in vision_models:
+        model_id: str = model["id"]
+        name: str = model.get("name", model_id)
+        tier = _openrouter_tier(model)
+        pricing = model.get("pricing") or {}
+
+        # Capture per-token prices so ``pricing_registration`` can
+        # register them with LiteLLM at startup (and so the cost
+        # estimator in ``estimate_call_reserve_micros`` can resolve
+        # them at reserve time).
+        try:
+            input_cost = float(pricing.get("prompt", 0) or 0)
+        except (TypeError, ValueError):
+            input_cost = 0.0
+        try:
+            output_cost = float(pricing.get("completion", 0) or 0)
+        except (TypeError, ValueError):
+            output_cost = 0.0
+
+        cfg: dict[str, Any] = {
+            "id": _stable_config_id(model_id, id_offset, taken),
+            "name": name,
+            "description": f"{name} via OpenRouter (vision)",
+            "provider": "OPENROUTER",
+            "model_name": model_id,
+            "api_key": api_key,
+            # Pin to OpenRouter's public base URL so a downstream call site
+            # that forgets ``resolve_api_base`` still doesn't inherit
+            # ``AZURE_OPENAI_ENDPOINT`` (defense-in-depth, see
+            # ``provider_api_base`` docstring).
+            "api_base": "https://openrouter.ai/api/v1",
+            "api_version": None,
+            "rpm": free_rpm if tier == "free" else rpm,
+            "tpm": free_tpm if tier == "free" else tpm,
+            "litellm_params": dict(litellm_params),
+            "billing_tier": tier,
+            "quota_reserve_tokens": quota_reserve_tokens,
+            "input_cost_per_token": input_cost or None,
+            "output_cost_per_token": output_cost or None,
+            _OPENROUTER_DYNAMIC_MARKER: True,
+        }
+        configs.append(cfg)
+
+    return configs
+
+
+class OpenRouterIntegrationService:
+    """Singleton that manages the dynamic OpenRouter model catalogue."""
+
+    _instance: "OpenRouterIntegrationService | None" = None
+    _lock = threading.Lock()
+
+    def __init__(self) -> None:
+        self._settings: dict[str, Any] = {}
+        self._configs: list[dict] = []
+        self._configs_by_id: dict[int, dict] = {}
+        self._initialized = False
+        self._refresh_task: asyncio.Task | None = None
+        # Last-good per-model health snapshot. Survives across refresh
+        # cycles so a transient OpenRouter /endpoints outage doesn't drop
+        # every cfg back to static-only scoring.
+        # Shape: {model_name: {"gated": bool, "score": float | None}}
+        self._health_cache: dict[str, dict[str, Any]] = {}
+        self._enrich_task: asyncio.Task | None = None
+        # Raw OpenRouter pricing per model_id, captured at the same time
+        # we generate configs. Consumed by ``pricing_registration`` to
+        # teach LiteLLM the per-token cost of every dynamic deployment so
+        # the success-callback can populate ``response_cost`` correctly.
+        self._raw_pricing: dict[str, dict[str, str]] = {}
+        # Cached raw catalogue from the most recent fetch. Image / vision
+        # emitters reuse this to avoid a second network call per surface.
+        self._raw_models: list[dict] = []
+        # Image / vision config caches (only populated when the matching
+        # opt-in flag is true on initialize). Refreshed in lockstep with
+        # the chat catalogue.
+        self._image_configs: list[dict] = []
+        self._vision_configs: list[dict] = []
+
+    @classmethod
+    def get_instance(cls) -> "OpenRouterIntegrationService":
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = cls()
+        return cls._instance
+
+    @classmethod
+    def is_initialized(cls) -> bool:
+        return cls._instance is not None and cls._instance._initialized
+
+    # ------------------------------------------------------------------
+    # Initialisation (called at startup, before event loop for Celery)
+    # ------------------------------------------------------------------
+
+    def initialize(self, settings: dict[str, Any]) -> list[dict]:
+        """
+        Fetch models synchronously and generate configs.
+        Returns the generated configs list.
+        """
+        self._settings = settings
+        raw_models = _fetch_models_sync()
+        if raw_models is None:
+            logger.warning("OpenRouter integration: could not fetch models at startup")
+            self._initialized = True
+            return []
+
+        self._raw_models = raw_models
+        self._configs = _generate_configs(raw_models, settings)
+        self._configs_by_id = {c["id"]: c for c in self._configs}
+        self._raw_pricing = _extract_raw_pricing(raw_models)
+
+        # Populate image / vision caches when their opt-in flag is set.
+        # Empty otherwise so the accessors return [] without re-running
+        # filters every refresh.
+        if settings.get("image_generation_enabled"):
+            self._image_configs = _generate_image_gen_configs(raw_models, settings)
+            logger.info(
+                "OpenRouter integration: image-gen emission ON (%d models)",
+                len(self._image_configs),
+            )
+        else:
+            self._image_configs = []
+
+        if settings.get("vision_enabled"):
+            self._vision_configs = _generate_vision_llm_configs(raw_models, settings)
+            logger.info(
+                "OpenRouter integration: vision LLM emission ON (%d models)",
+                len(self._vision_configs),
+            )
+        else:
+            self._vision_configs = []
+
+        self._initialized = True
+
+        tier_counts = self._tier_counts(self._configs)
+        logger.info(
+            "OpenRouter integration: loaded %d models (free=%d, premium=%d)",
+            len(self._configs),
+            tier_counts["free"],
+            tier_counts["premium"],
+        )
+
+        # Schedule the first health-enrichment pass as a deferred task so
+        # cold-start latency is unchanged. Only valid when an event loop is
+        # already running (e.g. FastAPI lifespan); Celery worker init is
+        # fully sync so we silently skip — its first refresh tick (or the
+        # next refresh from the web process) will populate health data.
+        try:
+            loop = asyncio.get_running_loop()
+            self._enrich_task = loop.create_task(
+                self._enrich_health_safely(self._configs)
+            )
+        except RuntimeError:
+            pass
+
+        return self._configs
+
+    # ------------------------------------------------------------------
+    # Background refresh
+    # ------------------------------------------------------------------
+
+    async def refresh(self) -> None:
+        """Re-fetch from OpenRouter and atomically swap configs in GLOBAL_LLM_CONFIGS."""
+        raw_models = await _fetch_models_async()
+        if raw_models is None:
+            logger.warning("OpenRouter refresh: fetch failed, keeping stale list")
+            return
+
+        new_configs = _generate_configs(raw_models, self._settings)
+        new_by_id = {c["id"]: c for c in new_configs}
+        self._raw_pricing = _extract_raw_pricing(raw_models)
+        self._raw_models = raw_models
+
+        from app.config import config as app_config
+
+        static_configs = [
+            c
+            for c in app_config.GLOBAL_LLM_CONFIGS
+            if not c.get(_OPENROUTER_DYNAMIC_MARKER)
+        ]
+        app_config.GLOBAL_LLM_CONFIGS = static_configs + new_configs
+
+        self._configs = new_configs
+        self._configs_by_id = new_by_id
+
+        # Image / vision lists are atomic-swapped the same way: filter out
+        # the previous dynamic entries from the live config list and append
+        # the freshly generated ones. No-ops when the opt-in flag is off.
+        if self._settings.get("image_generation_enabled"):
+            new_image = _generate_image_gen_configs(raw_models, self._settings)
+            static_image = [
+                c
+                for c in app_config.GLOBAL_IMAGE_GEN_CONFIGS
+                if not c.get(_OPENROUTER_DYNAMIC_MARKER)
+            ]
+            app_config.GLOBAL_IMAGE_GEN_CONFIGS = static_image + new_image
+            self._image_configs = new_image
+
+        if self._settings.get("vision_enabled"):
+            new_vision = _generate_vision_llm_configs(raw_models, self._settings)
+            static_vision = [
+                c
+                for c in app_config.GLOBAL_VISION_LLM_CONFIGS
+                if not c.get(_OPENROUTER_DYNAMIC_MARKER)
+            ]
+            app_config.GLOBAL_VISION_LLM_CONFIGS = static_vision + new_vision
+            self._vision_configs = new_vision
+
+        # Catalogue churn invalidates per-config "recently healthy" credit
+        # earned by the previous turn's preflight. Drop the whole table so
+        # the next turn re-probes against the freshly loaded configs.
+        try:
+            from app.services.auto_model_pin_service import clear_healthy
+
+            clear_healthy()
+        except Exception:
+            logger.debug(
+                "OpenRouter refresh: clear_healthy import skipped", exc_info=True
+            )
+
+        tier_counts = self._tier_counts(new_configs)
+        logger.info(
+            "OpenRouter refresh: updated to %d models (free=%d, premium=%d)",
+            len(new_configs),
+            tier_counts["free"],
+            tier_counts["premium"],
+        )
+
+        # Re-blend health scores against the freshly fetched catalogue. Also
+        # re-stamps health for any YAML-curated cfg with provider==OPENROUTER
+        # so a hand-picked dead OR model is gated like a dynamic one.
+        await self._enrich_health_safely(static_configs + new_configs, log_summary=True)
+
+        # Re-register LiteLLM pricing for the freshly fetched catalogue
+        # so newly added OR models bill correctly on their first call.
+        # Runs before the router rebuild because the router may issue
+        # cost-table lookups during deployment registration.
+        try:
+            from app.services.pricing_registration import (
+                register_pricing_from_global_configs,
+            )
+
+            register_pricing_from_global_configs()
+        except Exception as exc:
+            logger.warning(
+                "OpenRouter refresh: pricing re-registration skipped (%s)", exc
+            )
+
+        # Rebuild the LiteLLM router so freshly fetched configs flow through
+        # (dynamic OR premium entries now opt into the pool, free ones stay
+        # out; a refresh also needs to pick up any static-config edits and
+        # reset cached context-window profiles).
+        try:
+            from app.config import config as _app_config
+            from app.services.llm_router_service import (
+                LLMRouterService,
+                _router_instance_cache as _chat_router_cache,
+            )
+
+            LLMRouterService.rebuild(
+                _app_config.GLOBAL_LLM_CONFIGS,
+                getattr(_app_config, "ROUTER_SETTINGS", None),
+            )
+            _chat_router_cache.clear()
+        except Exception as exc:
+            logger.warning("OpenRouter refresh: router rebuild skipped (%s)", exc)
+
+    @staticmethod
+    def _tier_counts(configs: list[dict]) -> dict[str, int]:
+        counts = {"free": 0, "premium": 0}
+        for cfg in configs:
+            tier = str(cfg.get("billing_tier", "")).lower()
+            if tier in counts:
+                counts[tier] += 1
+        return counts
+
+    # ------------------------------------------------------------------
+    # Auto (Fastest) health enrichment
+    # ------------------------------------------------------------------
+
+    async def _enrich_health_safely(
+        self, configs: list[dict], *, log_summary: bool = True
+    ) -> None:
+        """Wrapper around ``_enrich_health`` that swallows all errors.
+
+        Health enrichment is best-effort: any failure must leave cfgs in
+        their static-only state and never break refresh / startup.
+        """
+        try:
+            await self._enrich_health(configs, log_summary=log_summary)
+        except Exception:
+            logger.exception("OpenRouter health enrichment failed")
+
+    async def _enrich_health(
+        self, configs: list[dict], *, log_summary: bool = True
+    ) -> None:
+        """Fetch per-model ``/endpoints`` data for the top OR cfgs and blend
+        the resulting health score into ``cfg["quality_score"]``.
+
+        Bounded fan-out: top-N per tier by ``quality_score_static`` only,
+        with ``asyncio.Semaphore(_HEALTH_ENRICH_CONCURRENCY)`` guarding the
+        outbound HTTP. Misses fall back to a per-model last-good cache; if
+        the failure ratio crosses ``_HEALTH_FAIL_RATIO_FALLBACK`` we keep
+        the entire previous cycle's cache for this run.
+        """
+        or_cfgs = [
+            c for c in configs if str(c.get("provider", "")).upper() == "OPENROUTER"
+        ]
+        if not or_cfgs:
+            return
+
+        premium_pool = sorted(
+            [c for c in or_cfgs if str(c.get("billing_tier", "")).lower() == "premium"],
+            key=lambda c: -int(c.get("quality_score_static") or 0),
+        )[:_HEALTH_ENRICH_TOP_N_PREMIUM]
+        free_pool = sorted(
+            [c for c in or_cfgs if str(c.get("billing_tier", "")).lower() == "free"],
+            key=lambda c: -int(c.get("quality_score_static") or 0),
+        )[:_HEALTH_ENRICH_TOP_N_FREE]
+        # De-duplicate while preserving order: a cfg shouldn't fall in both
+        # tiers, but defensive code is cheap here.
+        seen_ids: set[int] = set()
+        selected: list[dict] = []
+        for cfg in premium_pool + free_pool:
+            cid = int(cfg.get("id", 0))
+            if cid in seen_ids:
+                continue
+            seen_ids.add(cid)
+            selected.append(cfg)
+
+        if not selected:
+            return
+
+        api_key = str(self._settings.get("api_key") or "")
+        semaphore = asyncio.Semaphore(_HEALTH_ENRICH_CONCURRENCY)
+
+        async with httpx.AsyncClient(timeout=_HEALTH_FETCH_TIMEOUT_SEC) as client:
+            results = await asyncio.gather(
+                *(
+                    self._fetch_endpoints(client, semaphore, api_key, cfg)
+                    for cfg in selected
+                )
+            )
+
+        fail_count = sum(1 for _, _, err in results if err is not None)
+        fail_ratio = fail_count / len(results) if results else 0.0
+        degraded = fail_ratio >= _HEALTH_FAIL_RATIO_FALLBACK
+        if degraded:
+            logger.warning(
+                "auto_pin_health_enrich_degraded fail_ratio=%.2f total=%d "
+                "using_last_good_cache=true",
+                fail_ratio,
+                len(results),
+            )
+
+        # Per-cfg health update.
+        for cfg, endpoints, err in results:
+            model_name = str(cfg.get("model_name", ""))
+            if not degraded and err is None and endpoints is not None:
+                gated, h_score = aggregate_health(endpoints)
+                cfg["health_gated"] = bool(gated)
+                cfg["quality_score_health"] = h_score
+                self._health_cache[model_name] = {
+                    "gated": bool(gated),
+                    "score": h_score,
+                }
+            else:
+                cached = self._health_cache.get(model_name)
+                if cached is not None:
+                    cfg["health_gated"] = bool(cached.get("gated", False))
+                    cfg["quality_score_health"] = cached.get("score")
+                # else: keep current values (initial defaults from
+                # _generate_configs / load_global_llm_configs).
+
+        # Blend health into the final score for every OR cfg, including
+        # those outside the enriched top-N (they fall through to static).
+        gated_count = 0
+        by_provider: dict[str, int] = {}
+        for cfg in or_cfgs:
+            static_q = int(cfg.get("quality_score_static") or 0)
+            h = cfg.get("quality_score_health")
+            if h is not None and not cfg.get("health_gated"):
+                blended = (
+                    _HEALTH_BLEND_WEIGHT * float(h)
+                    + (1 - _HEALTH_BLEND_WEIGHT) * static_q
+                )
+                cfg["quality_score"] = round(blended)
+            else:
+                cfg["quality_score"] = static_q
+
+            if cfg.get("health_gated"):
+                gated_count += 1
+                model_id = str(cfg.get("model_name", ""))
+                provider_slug = (
+                    model_id.split("/", 1)[0] if "/" in model_id else "unknown"
+                )
+                by_provider[provider_slug] = by_provider.get(provider_slug, 0) + 1
+
+        if log_summary:
+            logger.info(
+                "auto_pin_health_gated count=%d by_provider=%s fail_ratio=%.2f "
+                "total_enriched=%d",
+                gated_count,
+                dict(sorted(by_provider.items(), key=lambda kv: -kv[1])),
+                fail_ratio,
+                len(selected),
+            )
+
+    @staticmethod
+    async def _fetch_endpoints(
+        client: httpx.AsyncClient,
+        semaphore: asyncio.Semaphore,
+        api_key: str,
+        cfg: dict,
+    ) -> tuple[dict, list[dict] | None, Exception | None]:
+        """Fetch ``/api/v1/models/{id}/endpoints`` for one cfg.
+
+        Returns ``(cfg, endpoints, err)`` so the caller can keep batched
+        results aligned with their cfgs without raising.
+        """
+        model_id = str(cfg.get("model_name", ""))
+        if not model_id:
+            return cfg, None, ValueError("missing model_name")
+
+        url = OPENROUTER_ENDPOINTS_URL_TEMPLATE.format(model_id=model_id)
+        headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
+
+        async with semaphore:
+            try:
+                resp = await client.get(url, headers=headers)
+                resp.raise_for_status()
+                data = resp.json()
+            except Exception as exc:
+                return cfg, None, exc
+
+        payload = data.get("data") if isinstance(data, dict) else None
+        if not isinstance(payload, dict):
+            return cfg, None, ValueError("malformed endpoints payload")
+        endpoints = payload.get("endpoints")
+        if not isinstance(endpoints, list):
+            return cfg, [], None
+        return cfg, endpoints, None
+
+    async def _refresh_loop(self, interval_hours: float) -> None:
+        interval_sec = interval_hours * 3600
+        while True:
+            await asyncio.sleep(interval_sec)
+            try:
+                await self.refresh()
+            except Exception:
+                logger.exception("OpenRouter background refresh failed")
+
+    def start_background_refresh(self, interval_hours: float) -> None:
+        if interval_hours <= 0:
+            return
+        loop = asyncio.get_event_loop()
+        self._refresh_task = loop.create_task(self._refresh_loop(interval_hours))
+        logger.info(
+            "OpenRouter background refresh started (every %.1fh)", interval_hours
+        )
+
+    def stop_background_refresh(self) -> None:
+        if self._refresh_task is not None and not self._refresh_task.done():
+            self._refresh_task.cancel()
+            self._refresh_task = None
+            logger.info("OpenRouter background refresh stopped")
+
+    # ------------------------------------------------------------------
+    # Accessors
+    # ------------------------------------------------------------------
+
+    def get_configs(self) -> list[dict]:
+        return self._configs
+
+    def get_config_by_id(self, config_id: int) -> dict | None:
+        return self._configs_by_id.get(config_id)
+
+    def get_image_generation_configs(self) -> list[dict]:
+        """Return the dynamic OpenRouter image-generation configs (empty
+        list when the ``image_generation_enabled`` flag is off).
+
+        Each entry already has ``billing_tier`` derived per-model from
+        OpenRouter's signals and is shaped to drop directly into
+        ``Config.GLOBAL_IMAGE_GEN_CONFIGS``.
+        """
+        return list(self._image_configs)
+
+    def get_vision_llm_configs(self) -> list[dict]:
+        """Return the dynamic OpenRouter vision-LLM configs (empty list
+        when the ``vision_enabled`` flag is off).
+
+        Each entry exposes ``input_cost_per_token`` / ``output_cost_per_token``
+        so ``pricing_registration`` can teach LiteLLM the cost of these
+        models the same way it does for chat — which keeps the billable
+        wrapper able to debit accurate micro-USD on a vision call.
+        """
+        return list(self._vision_configs)
+
+    def get_raw_pricing(self) -> dict[str, dict[str, str]]:
+        """Return the cached raw OpenRouter pricing map.
+
+        Shape: ``{model_id: {"prompt": str, "completion": str}}``. The
+        values are the strings OpenRouter publishes (USD per token),
+        never converted to floats here so the caller can decide how to
+        handle malformed or unset entries.
+        """
+        return dict(self._raw_pricing)
diff --git a/surfsense_backend/app/services/pricing_registration.py b/surfsense_backend/app/services/pricing_registration.py
new file mode 100644
index 0000000000..de98e50c21
--- /dev/null
+++ b/surfsense_backend/app/services/pricing_registration.py
@@ -0,0 +1,274 @@
+"""
+Pricing registration with LiteLLM.
+
+Many models reach our LiteLLM callback without LiteLLM knowing their
+per-token cost — namely:
+
+* The ~300 dynamic OpenRouter deployments (their pricing only lives on
+  OpenRouter's ``/api/v1/models`` payload, never in LiteLLM's published
+  pricing table).
+* Static YAML deployments whose ``base_model`` name is operator-defined
+  (e.g. custom Azure deployment names like ``gpt-5.4``) and therefore
+  not in LiteLLM's table either.
+
+Without registration, ``kwargs["response_cost"]`` is 0 for those calls
+and the user gets billed nothing — a fail-safe but wrong answer for a
+cost-based credit system. This module runs once at startup, after the
+OpenRouter integration has fetched its catalogue, and registers each
+known model's pricing with ``litellm.register_model()`` under multiple
+plausible alias keys (LiteLLM's cost lookup may use any of them
+depending on whether the call went through the Router, ChatLiteLLM,
+or a direct ``acompletion``).
+
+Operators who run a custom Azure deployment whose ``base_model`` name
+isn't in LiteLLM's table can declare per-token pricing inline in
+``global_llm_config.yaml`` via ``input_cost_per_token`` and
+``output_cost_per_token`` (USD per token, e.g. ``0.000002``). Without
+that declaration the model's calls debit 0 — never overbilled.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import litellm
+
+logger = logging.getLogger(__name__)
+
+
+def _safe_float(value: Any) -> float:
+    """Return ``float(value)`` if it parses to a positive number, else 0.0."""
+    if value is None:
+        return 0.0
+    try:
+        f = float(value)
+    except (TypeError, ValueError):
+        return 0.0
+    return f if f > 0 else 0.0
+
+
+def _alias_set_for_openrouter(model_id: str) -> list[str]:
+    """Return the alias keys to register an OpenRouter model under.
+
+    LiteLLM's cost-callback lookup key varies by call path:
+    - Router with ``model="openrouter/X"`` → kwargs["model"] is
+      typically ``openrouter/X``.
+    - LiteLLM's own provider routing may strip the prefix and pass the
+      bare ``X`` to the cost-table lookup.
+    Registering under both keeps the lookup hermetic regardless of
+    which path the call took.
+    """
+    aliases = [f"openrouter/{model_id}", model_id]
+    return list(dict.fromkeys(a for a in aliases if a))
+
+
+def _alias_set_for_yaml(provider: str, model_name: str, base_model: str) -> list[str]:
+    """Return the alias keys to register a static YAML deployment under.
+
+    Same reasoning as the OpenRouter set: cover the bare ``base_model``,
+    the ``<provider>/<model>`` form LiteLLM Router constructs, and the
+    bare ``model_name`` because callbacks sometimes see whichever was
+    configured first.
+    """
+    provider_lower = (provider or "").lower()
+    aliases: list[str] = []
+    if base_model:
+        aliases.append(base_model)
+    if provider_lower and base_model:
+        aliases.append(f"{provider_lower}/{base_model}")
+    if model_name and model_name != base_model:
+        aliases.append(model_name)
+    if provider_lower and model_name and model_name != base_model:
+        aliases.append(f"{provider_lower}/{model_name}")
+    # Azure deployments often surface as "azure/<name>"; normalise the
+    # ``azure_openai`` provider slug to the LiteLLM-canonical ``azure``.
+    if provider_lower == "azure_openai":
+        if base_model:
+            aliases.append(f"azure/{base_model}")
+        if model_name and model_name != base_model:
+            aliases.append(f"azure/{model_name}")
+    return list(dict.fromkeys(a for a in aliases if a))
+
+
+def _register(
+    aliases: list[str],
+    *,
+    input_cost: float,
+    output_cost: float,
+    provider: str,
+    mode: str = "chat",
+) -> int:
+    """Register a single pricing entry under every alias in ``aliases``.
+
+    Returns the count of aliases successfully registered.
+    """
+    payload: dict[str, dict[str, Any]] = {}
+    for alias in aliases:
+        payload[alias] = {
+            "input_cost_per_token": input_cost,
+            "output_cost_per_token": output_cost,
+            "litellm_provider": provider,
+            "mode": mode,
+        }
+    if not payload:
+        return 0
+    try:
+        litellm.register_model(payload)
+    except Exception as exc:
+        logger.warning(
+            "[PricingRegistration] register_model failed for aliases=%s: %s",
+            aliases,
+            exc,
+        )
+        return 0
+    return len(payload)
+
+
+def _register_chat_shape_configs(
+    configs: list[dict],
+    *,
+    or_pricing: dict[str, dict[str, str]],
+    label: str,
+) -> tuple[int, int, int, list[str]]:
+    """Common loop that registers per-token pricing for a list of "chat-shape"
+    configs (chat or vision LLM — both use ``input_cost_per_token`` /
+    ``output_cost_per_token`` and the LiteLLM ``mode="chat"`` cost shape).
+
+    Returns ``(registered_models, registered_aliases, skipped, sample_keys)``.
+    """
+    registered_models = 0
+    registered_aliases = 0
+    skipped_no_pricing = 0
+    sample_keys: list[str] = []
+
+    for cfg in configs:
+        provider = str(cfg.get("provider") or "").upper()
+        model_name = str(cfg.get("model_name") or "").strip()
+        litellm_params = cfg.get("litellm_params") or {}
+        base_model = str(litellm_params.get("base_model") or model_name).strip()
+
+        if provider == "OPENROUTER":
+            entry = or_pricing.get(model_name)
+            if entry:
+                input_cost = _safe_float(entry.get("prompt"))
+                output_cost = _safe_float(entry.get("completion"))
+            else:
+                # Vision configs from ``_generate_vision_llm_configs``
+                # carry their pricing inline because the OpenRouter
+                # raw-pricing cache is keyed by chat-catalogue model_id;
+                # vision flows pick up the inline values here.
+                input_cost = _safe_float(cfg.get("input_cost_per_token"))
+                output_cost = _safe_float(cfg.get("output_cost_per_token"))
+            if input_cost == 0.0 and output_cost == 0.0:
+                skipped_no_pricing += 1
+                continue
+            aliases = _alias_set_for_openrouter(model_name)
+            count = _register(
+                aliases,
+                input_cost=input_cost,
+                output_cost=output_cost,
+                provider="openrouter",
+            )
+            if count > 0:
+                registered_models += 1
+                registered_aliases += count
+                if len(sample_keys) < 6:
+                    sample_keys.extend(aliases[:2])
+            continue
+
+        input_cost = _safe_float(
+            cfg.get("input_cost_per_token")
+            or litellm_params.get("input_cost_per_token")
+        )
+        output_cost = _safe_float(
+            cfg.get("output_cost_per_token")
+            or litellm_params.get("output_cost_per_token")
+        )
+        if input_cost == 0.0 and output_cost == 0.0:
+            skipped_no_pricing += 1
+            continue
+        aliases = _alias_set_for_yaml(provider, model_name, base_model)
+        provider_slug = "azure" if provider == "AZURE_OPENAI" else provider.lower()
+        count = _register(
+            aliases,
+            input_cost=input_cost,
+            output_cost=output_cost,
+            provider=provider_slug,
+        )
+        if count > 0:
+            registered_models += 1
+            registered_aliases += count
+            if len(sample_keys) < 6:
+                sample_keys.extend(aliases[:2])
+
+    logger.info(
+        "[PricingRegistration:%s] registered pricing for %d models (%d aliases); "
+        "%d configs had no pricing data; sample registered keys=%s",
+        label,
+        registered_models,
+        registered_aliases,
+        skipped_no_pricing,
+        sample_keys,
+    )
+    return registered_models, registered_aliases, skipped_no_pricing, sample_keys
+
+
+def register_pricing_from_global_configs() -> None:
+    """Register pricing for every known LLM deployment with LiteLLM.
+
+    Walks ``config.GLOBAL_LLM_CONFIGS`` *and* ``config.GLOBAL_VISION_LLM_CONFIGS``
+    so vision calls (during indexing) can resolve cost the same way chat
+    calls do — namely:
+
+    1. ``OPENROUTER``: pulls the cached raw pricing from
+       ``OpenRouterIntegrationService`` (populated during its own
+       startup fetch) and converts the per-token strings to floats. For
+       vision configs that carry pricing inline (``input_cost_per_token`` /
+       ``output_cost_per_token`` set on the cfg itself) we fall back to
+       those values when the OR cache misses the model.
+    2. Anything else: looks for operator-declared
+       ``input_cost_per_token`` / ``output_cost_per_token`` on the YAML
+       config block (top-level or nested under ``litellm_params``).
+
+    **Image generation is intentionally NOT registered here.** The cost
+    shape for image-gen is per-image (``output_cost_per_image``), not
+    per-token, and LiteLLM's ``register_model`` doesn't accept those
+    keys via the chat-cost path. OpenRouter image-gen models populate
+    ``response_cost`` directly from their response header instead, and
+    Azure-native image-gen models are already in LiteLLM's cost map.
+
+    Calls without a resolved pair of costs are skipped, not registered
+    with zeros — operators who forget pricing get a "$0 debit" warning
+    in ``TokenTrackingCallback`` rather than silently overwriting any
+    pricing LiteLLM might know natively.
+    """
+    from app.config import config as app_config
+
+    chat_configs: list[dict] = list(getattr(app_config, "GLOBAL_LLM_CONFIGS", []) or [])
+    vision_configs: list[dict] = list(
+        getattr(app_config, "GLOBAL_VISION_LLM_CONFIGS", []) or []
+    )
+    if not chat_configs and not vision_configs:
+        logger.info("[PricingRegistration] no global configs to register")
+        return
+
+    or_pricing: dict[str, dict[str, str]] = {}
+    try:
+        from app.services.openrouter_integration_service import (
+            OpenRouterIntegrationService,
+        )
+
+        if OpenRouterIntegrationService.is_initialized():
+            or_pricing = OpenRouterIntegrationService.get_instance().get_raw_pricing()
+    except Exception as exc:
+        logger.debug(
+            "[PricingRegistration] OpenRouter pricing not available yet: %s", exc
+        )
+
+    if chat_configs:
+        _register_chat_shape_configs(chat_configs, or_pricing=or_pricing, label="chat")
+    if vision_configs:
+        _register_chat_shape_configs(
+            vision_configs, or_pricing=or_pricing, label="vision"
+        )
diff --git a/surfsense_backend/app/services/provider_api_base.py b/surfsense_backend/app/services/provider_api_base.py
new file mode 100644
index 0000000000..dca1f94626
--- /dev/null
+++ b/surfsense_backend/app/services/provider_api_base.py
@@ -0,0 +1,106 @@
+"""Provider-aware ``api_base`` resolution shared by chat / image-gen / vision.
+
+LiteLLM falls back to the module-global ``litellm.api_base`` when an
+individual call doesn't pass one, which silently inherits provider-agnostic
+env vars like ``AZURE_OPENAI_ENDPOINT`` / ``OPENAI_API_BASE``. Without an
+explicit ``api_base``, an ``openrouter/<model>`` request can end up at an
+Azure endpoint and 404 with ``Resource not found`` (real reproducer:
+[litellm/llms/openrouter/image_generation/transformation.py:242-263] appends
+``/chat/completions`` to whatever inherited base it gets, regardless of
+provider).
+
+The chat router has had this defense for a while
+(``llm_router_service.py:466-478``). This module hoists the maps + cascade
+into a tiny standalone helper so vision and image-gen can share the same
+source of truth without an inter-service circular import.
+"""
+
+from __future__ import annotations
+
+PROVIDER_DEFAULT_API_BASE: dict[str, str] = {
+    "openrouter": "https://openrouter.ai/api/v1",
+    "groq": "https://api.groq.com/openai/v1",
+    "mistral": "https://api.mistral.ai/v1",
+    "perplexity": "https://api.perplexity.ai",
+    "xai": "https://api.x.ai/v1",
+    "cerebras": "https://api.cerebras.ai/v1",
+    "deepinfra": "https://api.deepinfra.com/v1/openai",
+    "fireworks_ai": "https://api.fireworks.ai/inference/v1",
+    "together_ai": "https://api.together.xyz/v1",
+    "anyscale": "https://api.endpoints.anyscale.com/v1",
+    "cometapi": "https://api.cometapi.com/v1",
+    "sambanova": "https://api.sambanova.ai/v1",
+}
+"""Default ``api_base`` per LiteLLM provider prefix (lowercase).
+
+Only providers with a well-known, stable public base URL are listed —
+self-hosted / BYO-endpoint providers (ollama, custom, bedrock, vertex_ai,
+huggingface, databricks, cloudflare, replicate) are intentionally omitted
+so their existing config-driven behaviour is preserved."""
+
+
+PROVIDER_KEY_DEFAULT_API_BASE: dict[str, str] = {
+    "DEEPSEEK": "https://api.deepseek.com/v1",
+    "ALIBABA_QWEN": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
+    "MOONSHOT": "https://api.moonshot.ai/v1",
+    "ZHIPU": "https://open.bigmodel.cn/api/paas/v4",
+    "MINIMAX": "https://api.minimax.io/v1",
+}
+"""Canonical provider key (uppercase) → base URL.
+
+Used when the LiteLLM provider prefix is the generic ``openai`` shim but the
+config's ``provider`` field tells us which API it actually is (DeepSeek,
+Alibaba, Moonshot, Zhipu, MiniMax all use the ``openai`` prefix but each
+has its own base URL)."""
+
+
+def resolve_api_base(
+    *,
+    provider: str | None,
+    provider_prefix: str | None,
+    config_api_base: str | None,
+) -> str | None:
+    """Resolve a non-Azure-leaking ``api_base`` for a deployment.
+
+    Cascade (first non-empty wins):
+      1. The config's own ``api_base`` (whitespace-only treated as missing).
+      2. ``PROVIDER_KEY_DEFAULT_API_BASE[provider.upper()]``.
+      3. ``PROVIDER_DEFAULT_API_BASE[provider_prefix.lower()]``.
+      4. ``None`` — caller should NOT set ``api_base`` and let the LiteLLM
+         provider integration apply its own default (e.g. AzureOpenAI's
+         deployment-derived URL, custom provider's per-deployment URL).
+
+    Args:
+        provider: The config's ``provider`` field (e.g. ``"OPENROUTER"``,
+            ``"DEEPSEEK"``). Case-insensitive.
+        provider_prefix: The LiteLLM model-string prefix the same call
+            site builds for the model id (e.g. ``"openrouter"``,
+            ``"groq"``). Case-insensitive.
+        config_api_base: ``api_base`` from the global YAML / DB row /
+            OpenRouter dynamic config. Empty / whitespace-only means
+            "missing" — the resolver still applies the cascade.
+
+    Returns:
+        A URL string, or ``None`` if no default applies for this provider.
+    """
+    if config_api_base and config_api_base.strip():
+        return config_api_base
+
+    if provider:
+        key_default = PROVIDER_KEY_DEFAULT_API_BASE.get(provider.upper())
+        if key_default:
+            return key_default
+
+    if provider_prefix:
+        prefix_default = PROVIDER_DEFAULT_API_BASE.get(provider_prefix.lower())
+        if prefix_default:
+            return prefix_default
+
+    return None
+
+
+__all__ = [
+    "PROVIDER_DEFAULT_API_BASE",
+    "PROVIDER_KEY_DEFAULT_API_BASE",
+    "resolve_api_base",
+]
diff --git a/surfsense_backend/app/services/provider_capabilities.py b/surfsense_backend/app/services/provider_capabilities.py
new file mode 100644
index 0000000000..e9a1c33e10
--- /dev/null
+++ b/surfsense_backend/app/services/provider_capabilities.py
@@ -0,0 +1,280 @@
+"""Capability resolution shared by chat / image / vision call sites.
+
+Why this exists
+---------------
+The chat catalog (YAML + dynamic OpenRouter + BYOK DB rows + Auto) needs a
+single, authoritative answer to one question: *can this chat config accept
+``image_url`` content blocks?* Without it, the new-chat selector can't badge
+incompatible models and the streaming task can't fail fast with a friendly
+error before sending an image to a text-only provider.
+
+Two functions, two intents:
+
+- :func:`derive_supports_image_input` — best-effort *True* for catalog and
+  UI surfacing. Default-allow: an unknown / unmapped model is treated as
+  capable so we never lock the user out of a freshly added or
+  third-party-hosted vision model.
+
+- :func:`is_known_text_only_chat_model` — strict opt-out for the streaming
+  task's safety net. Returns True only when LiteLLM's model map *explicitly*
+  sets ``supports_vision=False`` (or its bare-name variant does). Anything
+  else — missing key, lookup exception, ``supports_vision=True`` — returns
+  False so the request flows through to the provider.
+
+Implementation rule: only public LiteLLM symbols
+------------------------------------------------
+``litellm.supports_vision`` and ``litellm.get_model_info`` are part of the
+typed module surface (see ``litellm.__init__`` lazy stubs) and are stable
+across releases. The private ``_is_explicitly_disabled_factory`` and
+``_get_model_info_helper`` are intentionally avoided so a LiteLLM upgrade
+can't silently break us.
+
+Why the previous round's strict YAML opt-in flag failed
+-------------------------------------------------------
+``supports_image_input: false`` was the YAML loader's setdefault. Operators
+maintaining ``global_llm_config.yaml`` never set it, so every Azure / OpenAI
+YAML chat model — including vision-capable GPT-5.x and GPT-4o — resolved to
+False and the streaming gate rejected every image turn. Sourcing capability
+from LiteLLM's authoritative model map (which already says
+``azure/gpt-5.4 -> supports_vision=true``) removes that operator toil.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable
+
+import litellm
+
+logger = logging.getLogger(__name__)
+
+
+# Provider-name → LiteLLM model-prefix map.
+#
+# Owned here because ``app.services.provider_capabilities`` is the
+# only edge that's safe to call from ``app.config``'s YAML loader at
+# class-body init time. ``app.agents.new_chat.llm_config`` re-exports
+# this constant under the historical ``PROVIDER_MAP`` name; placing the
+# map there directly would re-introduce the
+# ``app.config -> ... -> app.agents.new_chat.tools.generate_image ->
+# app.config`` cycle that prompted the move.
+_PROVIDER_PREFIX_MAP: dict[str, str] = {
+    "OPENAI": "openai",
+    "ANTHROPIC": "anthropic",
+    "GROQ": "groq",
+    "COHERE": "cohere",
+    "GOOGLE": "gemini",
+    "OLLAMA": "ollama_chat",
+    "MISTRAL": "mistral",
+    "AZURE_OPENAI": "azure",
+    "OPENROUTER": "openrouter",
+    "XAI": "xai",
+    "BEDROCK": "bedrock",
+    "VERTEX_AI": "vertex_ai",
+    "TOGETHER_AI": "together_ai",
+    "FIREWORKS_AI": "fireworks_ai",
+    "DEEPSEEK": "openai",
+    "ALIBABA_QWEN": "openai",
+    "MOONSHOT": "openai",
+    "ZHIPU": "openai",
+    "GITHUB_MODELS": "github",
+    "REPLICATE": "replicate",
+    "PERPLEXITY": "perplexity",
+    "ANYSCALE": "anyscale",
+    "DEEPINFRA": "deepinfra",
+    "CEREBRAS": "cerebras",
+    "SAMBANOVA": "sambanova",
+    "AI21": "ai21",
+    "CLOUDFLARE": "cloudflare",
+    "DATABRICKS": "databricks",
+    "COMETAPI": "cometapi",
+    "HUGGINGFACE": "huggingface",
+    "MINIMAX": "openai",
+    "CUSTOM": "custom",
+}
+
+
+def _candidate_model_strings(
+    *,
+    provider: str | None,
+    model_name: str | None,
+    base_model: str | None,
+    custom_provider: str | None,
+) -> list[tuple[str, str | None]]:
+    """Return ``[(model_string, custom_llm_provider), ...]`` lookup candidates.
+
+    LiteLLM's capability lookup is keyed by ``model`` + (optional)
+    ``custom_llm_provider``. Different config sources give us different
+    levels of detail, so we try the most-specific keys first and fall back
+    to bare model names so unannotated entries (e.g. an Azure deployment
+    pointing at ``gpt-5.4`` via ``litellm_params.base_model``) still hit the
+    map. Order matters — the first lookup that returns a definitive answer
+    wins for both helpers.
+    """
+    candidates: list[tuple[str, str | None]] = []
+    seen: set[tuple[str, str | None]] = set()
+
+    def _add(model: str | None, llm_provider: str | None) -> None:
+        if not model:
+            return
+        key = (model, llm_provider)
+        if key in seen:
+            return
+        seen.add(key)
+        candidates.append(key)
+
+    provider_prefix: str | None = None
+    if provider:
+        provider_prefix = _PROVIDER_PREFIX_MAP.get(provider.upper(), provider.lower())
+    if custom_provider:
+        # ``custom_provider`` overrides everything for CUSTOM/proxy setups.
+        provider_prefix = custom_provider
+
+    primary_model = base_model or model_name
+    bare_model = model_name
+
+    # Most-specific first: provider-prefixed identifier with explicit
+    # custom_llm_provider so LiteLLM won't have to guess the provider via
+    # ``get_llm_provider``.
+    if primary_model and provider_prefix:
+        # e.g. "azure/gpt-5.4" + custom_llm_provider="azure"
+        if "/" in primary_model:
+            _add(primary_model, provider_prefix)
+        else:
+            _add(f"{provider_prefix}/{primary_model}", provider_prefix)
+
+    # Bare base_model (or model_name) with provider hint — handles entries
+    # the upstream map keys without a provider prefix (most ``gpt-*`` and
+    # ``claude-*`` entries do this).
+    if primary_model:
+        _add(primary_model, provider_prefix)
+
+    # Fallback to model_name when base_model differs (e.g. an Azure
+    # deployment whose model_name is the deployment id but base_model is the
+    # canonical OpenAI sku).
+    if bare_model and bare_model != primary_model:
+        if provider_prefix and "/" not in bare_model:
+            _add(f"{provider_prefix}/{bare_model}", provider_prefix)
+        _add(bare_model, provider_prefix)
+        _add(bare_model, None)
+
+    return candidates
+
+
+def derive_supports_image_input(
+    *,
+    provider: str | None = None,
+    model_name: str | None = None,
+    base_model: str | None = None,
+    custom_provider: str | None = None,
+    openrouter_input_modalities: Iterable[str] | None = None,
+) -> bool:
+    """Best-effort capability flag for the new-chat selector and catalog.
+
+    Resolution order (first definitive answer wins):
+
+    1. ``openrouter_input_modalities`` (when provided as a non-empty
+       iterable). OpenRouter exposes ``architecture.input_modalities`` per
+       model and that's the authoritative source for OR dynamic configs.
+    2. ``litellm.supports_vision`` against each candidate identifier from
+       :func:`_candidate_model_strings`. Returns True as soon as any
+       candidate confirms vision support.
+    3. Default ``True`` — the conservative-allow stance. An unknown /
+       newly-added / third-party-hosted model is *not* pre-judged. The
+       streaming safety net (:func:`is_known_text_only_chat_model`) is the
+       only place a False ever blocks; everywhere else, a False here would
+       just hide a usable model from the user.
+
+    Returns:
+        True if the model can plausibly accept image input, False only when
+        OpenRouter explicitly says it can't.
+    """
+    if openrouter_input_modalities is not None:
+        modalities = list(openrouter_input_modalities)
+        if modalities:
+            return "image" in modalities
+        # Empty list explicitly published by OR — treat as "no image".
+        return False
+
+    for model_string, custom_llm_provider in _candidate_model_strings(
+        provider=provider,
+        model_name=model_name,
+        base_model=base_model,
+        custom_provider=custom_provider,
+    ):
+        try:
+            if litellm.supports_vision(
+                model=model_string, custom_llm_provider=custom_llm_provider
+            ):
+                return True
+        except Exception as exc:
+            logger.debug(
+                "litellm.supports_vision raised for model=%s provider=%s: %s",
+                model_string,
+                custom_llm_provider,
+                exc,
+            )
+            continue
+
+    # Default-allow. ``is_known_text_only_chat_model`` is the strict gate.
+    return True
+
+
+def is_known_text_only_chat_model(
+    *,
+    provider: str | None = None,
+    model_name: str | None = None,
+    base_model: str | None = None,
+    custom_provider: str | None = None,
+) -> bool:
+    """Strict opt-out probe for the streaming-task safety net.
+
+    Returns True only when LiteLLM's model map *explicitly* sets
+    ``supports_vision=False`` for at least one candidate identifier. Missing
+    key, lookup exception, or ``supports_vision=True`` all return False so
+    the streaming task lets the request through. This is the inverse-default
+    of :func:`derive_supports_image_input`.
+
+    Why two functions
+    -----------------
+    The selector wants "show me everything that's plausibly capable" —
+    default-allow. The safety net wants "block only when I'm certain it
+    can't" — default-pass. Mixing the two intents in a single function
+    leads to the regression we're fixing here.
+    """
+    for model_string, custom_llm_provider in _candidate_model_strings(
+        provider=provider,
+        model_name=model_name,
+        base_model=base_model,
+        custom_provider=custom_provider,
+    ):
+        try:
+            info = litellm.get_model_info(
+                model=model_string, custom_llm_provider=custom_llm_provider
+            )
+        except Exception as exc:
+            logger.debug(
+                "litellm.get_model_info raised for model=%s provider=%s: %s",
+                model_string,
+                custom_llm_provider,
+                exc,
+            )
+            continue
+
+        # ``ModelInfo`` is a TypedDict (dict at runtime). ``supports_vision``
+        # may be missing, None, True, or False. We only fire on explicit
+        # False — None / missing / True all mean "don't block".
+        try:
+            value = info.get("supports_vision")  # type: ignore[union-attr]
+        except AttributeError:
+            value = None
+        if value is False:
+            return True
+
+    return False
+
+
+__all__ = [
+    "derive_supports_image_input",
+    "is_known_text_only_chat_model",
+]
diff --git a/surfsense_backend/app/services/public_chat_service.py b/surfsense_backend/app/services/public_chat_service.py
index 376db974fe..e4e0dd33a3 100644
--- a/surfsense_backend/app/services/public_chat_service.py
+++ b/surfsense_backend/app/services/public_chat_service.py
@@ -41,6 +41,7 @@
     "generate_image",
     "generate_podcast",
     "generate_report",
+    "generate_resume",
     "generate_video_presentation",
 }
 
@@ -239,7 +240,7 @@ async def create_snapshot(
                             video_presentation_ids_seen.add(vp_id)
                             part["result"] = {**result_data, "status": "ready"}
 
-                elif tool_name == "generate_report":
+                elif tool_name in ("generate_report", "generate_resume"):
                     result_data = part.get("result", {})
                     report_id = result_data.get("report_id")
                     if report_id and report_id not in report_ids_seen:
@@ -247,7 +248,6 @@ async def create_snapshot(
                         if report_info:
                             reports_data.append(report_info)
                             report_ids_seen.add(report_id)
-                            # Update status to "ready" so frontend renders ReportCard
                             part["result"] = {**result_data, "status": "ready"}
 
         messages_data.append(
@@ -377,6 +377,7 @@ async def _get_report_for_snapshot(
         "original_id": report.id,
         "title": report.title,
         "content": report.content,
+        "content_type": report.content_type,
         "report_metadata": report.report_metadata,
         "report_group_id": report.report_group_id,
         "created_at": report.created_at.isoformat() if report.created_at else None,
diff --git a/surfsense_backend/app/services/quality_score.py b/surfsense_backend/app/services/quality_score.py
new file mode 100644
index 0000000000..2fb37de214
--- /dev/null
+++ b/surfsense_backend/app/services/quality_score.py
@@ -0,0 +1,380 @@
+"""Pure-function quality scoring for Auto (Fastest) model selection.
+
+This module is import-free of any service / request-path dependencies. All
+numbers are computed once during the OpenRouter refresh tick (or YAML load)
+and cached on the cfg dict, so the chat hot path only does a precomputed
+sort and a SHA256 pick.
+
+Score components (0-100 scale, higher is better):
+
+* ``static_score_or``    - derived from the bulk ``/api/v1/models`` payload
+  (provider prestige + ``created`` recency + pricing band + context window
+  + capabilities + narrow tiny/legacy slug penalty).
+* ``static_score_yaml``  - same shape for hand-curated YAML configs, plus
+  an operator-trust bonus (the operator deliberately picked this model).
+* ``aggregate_health``   - run on per-model ``/api/v1/models/{id}/endpoints``
+  responses; returns ``(gated, score_or_none)``.
+
+The blended ``quality_score`` (0.5 * static + 0.5 * health) is computed in
+:mod:`app.services.openrouter_integration_service` because that's the only
+caller that sees both halves.
+"""
+
+from __future__ import annotations
+
+# ---------------------------------------------------------------------------
+# Tunables (constants, not flags)
+# ---------------------------------------------------------------------------
+
+# Top-K size for deterministic spread inside the locked tier.
+_QUALITY_TOP_K: int = 5
+
+# Hard health gate: any cfg whose best non-null uptime is below this %
+# is excluded from Auto-mode selection entirely.
+_HEALTH_GATE_UPTIME_PCT: float = 90.0
+
+# Health/static blend weight when a cfg has fresh /endpoints data.
+_HEALTH_BLEND_WEIGHT: float = 0.5
+
+# Static bonus applied to YAML cfgs because the operator hand-picked them.
+_OPERATOR_TRUST_BONUS: int = 20
+
+# /endpoints fan-out is bounded per refresh tick.
+_HEALTH_ENRICH_TOP_N_PREMIUM: int = 50
+_HEALTH_ENRICH_TOP_N_FREE: int = 30
+_HEALTH_ENRICH_CONCURRENCY: int = 15
+_HEALTH_FETCH_TIMEOUT_SEC: float = 5.0
+
+# If at least this fraction of /endpoints fetches fail in a refresh cycle,
+# fall back to the previous cycle's last-good cache instead of writing
+# partial / stale health values.
+_HEALTH_FAIL_RATIO_FALLBACK: float = 0.25
+
+# Narrow tiny/legacy slug penalties only. We deliberately do NOT penalise
+# ``-nano`` / ``-mini`` / ``-lite`` because modern frontier models ship with
+# those naming patterns (``gpt-5-mini``, ``gemini-2.5-flash-lite`` etc.) and
+# blanket-penalising them suppresses high-quality picks.
+_TINY_LEGACY_PENALTY_PATTERNS: tuple[str, ...] = (
+    "-1b-",
+    "-1.2b-",
+    "-1.5b-",
+    "-2b-",
+    "-3b-",
+    "gemma-3n",
+    "lfm-",
+    "-base",
+    "-distill",
+    ":nitro",
+    "-preview",
+)
+
+
+# ---------------------------------------------------------------------------
+# Provider prestige tables
+# ---------------------------------------------------------------------------
+
+# OpenRouter-side provider slug (the prefix before ``/`` in the model id).
+# Tiers are coarse: frontier labs > strong open / fast-moving labs >
+# specialist labs > everything else.
+PROVIDER_PRESTIGE_OR: dict[str, int] = {
+    # Frontier labs
+    "openai": 50,
+    "anthropic": 50,
+    "google": 50,
+    "x-ai": 50,
+    # Strong open / fast-moving labs
+    "deepseek": 38,
+    "qwen": 38,
+    "meta-llama": 38,
+    "mistralai": 38,
+    "cohere": 38,
+    "nvidia": 38,
+    "alibaba": 38,
+    # Specialist / regional / strong second-tier
+    "microsoft": 28,
+    "01-ai": 28,
+    "minimax": 28,
+    "moonshot": 28,
+    "z-ai": 28,
+    "nousresearch": 28,
+    "ai21": 28,
+    "perplexity": 28,
+    # Smaller / niche providers
+    "liquid": 18,
+    "cognitivecomputations": 18,
+    "venice": 18,
+    "inflection": 18,
+}
+
+# YAML provider field (the upstream API shape the operator selected).
+PROVIDER_PRESTIGE_YAML: dict[str, int] = {
+    "AZURE_OPENAI": 50,
+    "OPENAI": 50,
+    "ANTHROPIC": 50,
+    "GOOGLE": 50,
+    "VERTEX_AI": 50,
+    "GEMINI": 50,
+    "XAI": 50,
+    "MISTRAL": 38,
+    "DEEPSEEK": 38,
+    "COHERE": 38,
+    "GROQ": 30,
+    "TOGETHER_AI": 28,
+    "FIREWORKS_AI": 28,
+    "PERPLEXITY": 28,
+    "MINIMAX": 28,
+    "BEDROCK": 28,
+    "OPENROUTER": 25,
+    "OLLAMA": 12,
+    "CUSTOM": 12,
+}
+
+
+# ---------------------------------------------------------------------------
+# Pure scoring helpers
+# ---------------------------------------------------------------------------
+
+# Calibrated against the live /api/v1/models bulk dump. Frontier models
+# released in the last ~6 months (GPT-5 family, Claude 4.x, Gemini 2.5,
+# Grok 4) score in the 18-20 band; mid-2024 models in the 8-12 band;
+# anything older trails off.
+_RECENCY_BANDS_DAYS: tuple[tuple[int, int], ...] = (
+    (60, 20),
+    (180, 16),
+    (365, 12),
+    (540, 9),
+    (730, 6),
+    (1095, 3),
+)
+
+
+def created_recency_signal(created_ts: int | None, now_ts: int) -> int:
+    """Return 0-20 based on how recently the model was published.
+
+    Uses the OpenRouter ``created`` Unix timestamp (or any equivalent for
+    YAML cfgs). Models without a usable timestamp get 0 (we don't penalise,
+    we just don't reward).
+    """
+    if created_ts is None or created_ts <= 0 or now_ts <= 0:
+        return 0
+    age_days = max(0, (now_ts - int(created_ts)) // 86_400)
+    for cutoff, score in _RECENCY_BANDS_DAYS:
+        if age_days <= cutoff:
+            return score
+    return 0
+
+
+def pricing_band(
+    prompt: str | float | int | None,
+    completion: str | float | int | None,
+) -> int:
+    """Return 0-15 based on combined prompt+completion cost per 1M tokens.
+
+    Higher-priced models tend to be the larger / more capable ones. A free
+    model returns 0 (we use other signals to rank free-vs-free instead).
+    Uncoercible inputs are treated as 0 rather than raising.
+    """
+
+    def _to_float(value) -> float:
+        if value is None:
+            return 0.0
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return 0.0
+
+    p = _to_float(prompt)
+    c = _to_float(completion)
+    total_per_million = (p + c) * 1_000_000
+
+    if total_per_million >= 20.0:
+        return 15
+    if total_per_million >= 5.0:
+        return 12
+    if total_per_million >= 1.0:
+        return 9
+    if total_per_million >= 0.3:
+        return 6
+    if total_per_million >= 0.05:
+        return 4
+    if total_per_million > 0.0:
+        return 2
+    return 0
+
+
+def context_signal(ctx: int | None) -> int:
+    """Return 0-10 based on the model's context window."""
+    if not ctx or ctx <= 0:
+        return 0
+    if ctx >= 1_000_000:
+        return 10
+    if ctx >= 400_000:
+        return 8
+    if ctx >= 200_000:
+        return 6
+    if ctx >= 128_000:
+        return 4
+    if ctx >= 100_000:
+        return 2
+    return 0
+
+
+def capabilities_signal(supported_parameters: list[str] | None) -> int:
+    """Return 0-5 for capabilities that matter for our agent flows."""
+    if not supported_parameters:
+        return 0
+    params = set(supported_parameters)
+    score = 0
+    if "tools" in params:
+        score += 2
+    if "structured_outputs" in params or "response_format" in params:
+        score += 2
+    if "reasoning" in params or "include_reasoning" in params:
+        score += 1
+    return min(score, 5)
+
+
+def slug_penalty(model_id: str) -> int:
+    """Return a non-positive number; matches the narrow tiny/legacy patterns."""
+    if not model_id:
+        return 0
+    needle = model_id.lower()
+    for pattern in _TINY_LEGACY_PENALTY_PATTERNS:
+        if pattern in needle:
+            return -10
+    return 0
+
+
+def _provider_prestige_or(model_id: str) -> int:
+    if "/" not in model_id:
+        return 0
+    slug = model_id.split("/", 1)[0].lower()
+    return PROVIDER_PRESTIGE_OR.get(slug, 15)
+
+
+def static_score_or(or_model: dict, *, now_ts: int) -> int:
+    """Score a raw OpenRouter ``/api/v1/models`` entry on a 0-100 scale."""
+    model_id = str(or_model.get("id", ""))
+    pricing = or_model.get("pricing") or {}
+
+    score = (
+        _provider_prestige_or(model_id)
+        + created_recency_signal(or_model.get("created"), now_ts)
+        + pricing_band(pricing.get("prompt"), pricing.get("completion"))
+        + context_signal(or_model.get("context_length"))
+        + capabilities_signal(or_model.get("supported_parameters"))
+        + slug_penalty(model_id)
+    )
+    return max(0, min(100, int(score)))
+
+
+def static_score_yaml(cfg: dict) -> int:
+    """Score a YAML-curated cfg on a 0-100 scale.
+
+    Includes ``_OPERATOR_TRUST_BONUS`` because the operator deliberately
+    listed this model. Pricing / context fall through to lazy ``litellm``
+    lookups; failures are silent (we just lose those sub-points).
+    """
+    provider = str(cfg.get("provider", "")).upper()
+    base = PROVIDER_PRESTIGE_YAML.get(provider, 15)
+
+    model_name = cfg.get("model_name") or ""
+    litellm_params = cfg.get("litellm_params") or {}
+    lookup_name = (
+        litellm_params.get("base_model") or litellm_params.get("model") or model_name
+    )
+
+    ctx = 0
+    p_cost: float = 0.0
+    c_cost: float = 0.0
+    try:
+        from litellm import get_model_info  # lazy: avoid cold-import cost
+
+        info = get_model_info(lookup_name) or {}
+        ctx = int(info.get("max_input_tokens") or info.get("max_tokens") or 0)
+        p_cost = float(info.get("input_cost_per_token") or 0.0)
+        c_cost = float(info.get("output_cost_per_token") or 0.0)
+    except Exception:
+        # Unknown to litellm — that's fine for prestige+operator-bonus weighting.
+        pass
+
+    score = (
+        base
+        + _OPERATOR_TRUST_BONUS
+        + pricing_band(p_cost, c_cost)
+        + context_signal(ctx)
+        + slug_penalty(str(model_name))
+    )
+    return max(0, min(100, int(score)))
+
+
+# ---------------------------------------------------------------------------
+# Health aggregation
+# ---------------------------------------------------------------------------
+
+
+def _coerce_pct(value) -> float | None:
+    try:
+        if value is None:
+            return None
+        f = float(value)
+    except (TypeError, ValueError):
+        return None
+    if f < 0:
+        return None
+    # OpenRouter reports uptime as a 0-1 fraction; some endpoints surface it
+    # as a 0-100 percentage. Normalise.
+    return f * 100.0 if f <= 1.0 else f
+
+
+def _best_uptime(endpoints: list[dict]) -> tuple[float | None, str | None]:
+    """Pick the best (highest) non-null uptime across all endpoints.
+
+    Window preference: ``uptime_last_30m`` > ``uptime_last_1d`` >
+    ``uptime_last_5m``. Returns ``(uptime_pct, window_used)``.
+    """
+    for window in ("uptime_last_30m", "uptime_last_1d", "uptime_last_5m"):
+        values = [_coerce_pct(ep.get(window)) for ep in endpoints]
+        values = [v for v in values if v is not None]
+        if values:
+            return max(values), window
+    return None, None
+
+
+def aggregate_health(endpoints: list[dict]) -> tuple[bool, float | None]:
+    """Aggregate a model's per-endpoint health into ``(gated, score_or_none)``.
+
+    Hard gate (returns ``(True, None)``):
+      * ``endpoints`` empty,
+      * no endpoint reports ``status == 0`` (OK), or
+      * best non-null uptime below ``_HEALTH_GATE_UPTIME_PCT``.
+
+    On a pass, returns a 0-100 health score blending uptime, status, and a
+    freshness-weighted recent uptime sample.
+    """
+    if not endpoints:
+        return True, None
+
+    any_ok = any(int(ep.get("status", 1)) == 0 for ep in endpoints)
+    if not any_ok:
+        return True, None
+
+    best_uptime, _ = _best_uptime(endpoints)
+    if best_uptime is None or best_uptime < _HEALTH_GATE_UPTIME_PCT:
+        return True, None
+
+    # Freshness term: prefer 5m, fall through to 30m / 1d if 5m is missing.
+    freshness = None
+    for window in ("uptime_last_5m", "uptime_last_30m", "uptime_last_1d"):
+        values = [_coerce_pct(ep.get(window)) for ep in endpoints]
+        values = [v for v in values if v is not None]
+        if values:
+            freshness = max(values)
+            break
+
+    uptime_term = best_uptime
+    status_term = 100.0 if any_ok else 0.0
+    freshness_term = freshness if freshness is not None else best_uptime
+
+    score = 0.50 * uptime_term + 0.30 * status_term + 0.20 * freshness_term
+    return False, max(0.0, min(100.0, score))
diff --git a/surfsense_backend/app/services/quota_checked_vision_llm.py b/surfsense_backend/app/services/quota_checked_vision_llm.py
new file mode 100644
index 0000000000..0040e5a5bc
--- /dev/null
+++ b/surfsense_backend/app/services/quota_checked_vision_llm.py
@@ -0,0 +1,105 @@
+"""
+Vision LLM proxy that enforces premium credit quota on every ``ainvoke``.
+
+Used by :func:`app.services.llm_service.get_vision_llm` so callers in the
+indexing pipeline (file processors, connector indexers, etl pipeline) can
+keep invoking the LLM exactly the way they do today — ``await llm.ainvoke(...)``
+— without threading ``user_id`` through every parser. The wrapper looks like
+a chat model from the outside; on the inside it routes each call through
+``billable_call`` so the user's premium credit pool is reserved → finalized
+or released, and a ``TokenUsage`` audit row is written.
+
+Free configs are returned unwrapped from ``get_vision_llm`` (they do not
+need quota enforcement) so this class only ever wraps premium configs.
+
+Why a wrapper instead of plumbing ``user_id`` through every caller:
+
+* The indexer ecosystem has 8+ entry points (Google Drive, OneDrive,
+  Dropbox, local-folder, file-processor, ETL pipeline) each calling
+  ``parse_with_vision_llm(...)``. Adding a ``user_id`` argument to each is
+  invasive, error-prone, and easy for a future indexer to forget.
+* Per the design (issue M), we always debit the *search-space owner*, not
+  the triggering user, so ``user_id`` is fully derivable from the search
+  space the caller is already operating on. The wrapper captures it once
+  at construction time.
+* ``langchain_litellm.ChatLiteLLM`` has no public hook for "before each
+  call run this coroutine"; subclassing isn't safe across versions because
+  it derives from ``BaseChatModel`` which expects specific Pydantic shapes.
+  Composition via attribute proxying (``__getattr__``) is robust to
+  upstream changes — every method other than ``ainvoke`` falls through to
+  the inner LLM unchanged.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+from uuid import UUID
+
+from app.services.billable_calls import QuotaInsufficientError, billable_call
+
+logger = logging.getLogger(__name__)
+
+
+class QuotaCheckedVisionLLM:
+    """Composition wrapper around a langchain chat model that enforces
+    premium credit quota on every ``ainvoke``.
+
+    Anything other than ``ainvoke`` is forwarded to the inner model so
+    ``invoke`` (sync), ``astream``, ``with_structured_output``, etc. all
+    still work — they simply bypass quota enforcement, which is fine
+    because the indexing pipeline only ever calls ``ainvoke`` today.
+    """
+
+    def __init__(
+        self,
+        inner_llm: Any,
+        *,
+        user_id: UUID,
+        search_space_id: int,
+        billing_tier: str,
+        base_model: str,
+        quota_reserve_tokens: int | None,
+        usage_type: str = "vision_extraction",
+    ) -> None:
+        self._inner = inner_llm
+        self._user_id = user_id
+        self._search_space_id = search_space_id
+        self._billing_tier = billing_tier
+        self._base_model = base_model
+        self._quota_reserve_tokens = quota_reserve_tokens
+        self._usage_type = usage_type
+
+    async def ainvoke(self, input: Any, *args: Any, **kwargs: Any) -> Any:
+        """Proxied async invoke that runs the underlying call inside
+        ``billable_call``.
+
+        Raises:
+            QuotaInsufficientError: when the user has exhausted their
+                premium credit pool. Caller (``etl_pipeline_service._extract_image``)
+                catches this and falls back to the document parser.
+        """
+        async with billable_call(
+            user_id=self._user_id,
+            search_space_id=self._search_space_id,
+            billing_tier=self._billing_tier,
+            base_model=self._base_model,
+            quota_reserve_tokens=self._quota_reserve_tokens,
+            usage_type=self._usage_type,
+            call_details={"model": self._base_model},
+        ):
+            return await self._inner.ainvoke(input, *args, **kwargs)
+
+    def __getattr__(self, name: str) -> Any:
+        """Forward everything else (``invoke``, ``astream``, ``bind``,
+        ``with_structured_output``, …) to the inner model.
+
+        ``__getattr__`` is only consulted when the attribute is *not*
+        already found on the proxy, which is exactly the contract we
+        want — methods we override stay on the proxy, the rest fall
+        through.
+        """
+        return getattr(self._inner, name)
+
+
+__all__ = ["QuotaCheckedVisionLLM", "QuotaInsufficientError"]
diff --git a/surfsense_backend/app/services/revert_service.py b/surfsense_backend/app/services/revert_service.py
new file mode 100644
index 0000000000..d02a31345a
--- /dev/null
+++ b/surfsense_backend/app/services/revert_service.py
@@ -0,0 +1,619 @@
+"""Revert service for the SurfSense agent action log.
+
+Implements the actual revert workflow used by
+``POST /api/threads/{thread_id}/revert/{action_id}``. The route handler is a
+thin auth + flag wrapper around the functions defined here.
+
+Operation outcomes mirror the plan:
+
+* **KB-owned actions** (NOTE / FILE / FOLDER mutations): restore from
+  :class:`app.db.DocumentRevision` / :class:`app.db.FolderRevision` rows
+  written before the original mutation. ``rm``/``rmdir`` re-INSERT a fresh
+  row from the snapshot; ``write_file`` create / ``mkdir`` DELETE the row
+  that was created; everything else is an in-place restore.
+* **Connector-owned actions with a declared ``reverse_descriptor``**: invoke
+  the inverse tool through the agent's normal permission stack (NOT
+  bypassed). Out of scope for this PR — returns ``REVERSE_NOT_IMPLEMENTED``.
+* **Anything else** (deprecated tool / no descriptor / schema drift):
+  returns ``NOT_REVERSIBLE`` and the route surfaces it as 409.
+
+A successful revert appends a NEW row to ``agent_action_log`` with
+``reverse_of=<original_action_id>`` and the requesting user's
+``user_id``, preserving an auditable chain.
+
+Dispatch must be exact-match (``tool_name == name``), NOT prefix matching.
+``"rmdir".startswith("rm")`` would otherwise mis-route directory revert
+to the document branch (and ``delete_note`` vs ``delete_folder`` is the
+same trap waiting to happen).
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from typing import Any, Literal
+
+from sqlalchemy import delete, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    safe_filename,
+    safe_folder_segment,
+)
+from app.db import (
+    AgentActionLog,
+    Chunk,
+    Document,
+    DocumentRevision,
+    DocumentType,
+    Folder,
+    FolderRevision,
+    NewChatThread,
+)
+from app.utils.document_converters import (
+    embed_texts,
+    generate_content_hash,
+    generate_unique_identifier_hash,
+)
+
+logger = logging.getLogger(__name__)
+
+
+RevertOutcomeStatus = Literal[
+    "ok",
+    "not_reversible",
+    "not_found",
+    "permission_denied",
+    "tool_unavailable",
+    "reverse_not_implemented",
+]
+
+
+@dataclass
+class RevertOutcome:
+    """Structured result of :func:`revert_action`."""
+
+    status: RevertOutcomeStatus
+    message: str
+    new_action_id: int | None = None
+
+
+# ---------------------------------------------------------------------------
+# Lookup helpers
+# ---------------------------------------------------------------------------
+
+
+async def load_action(
+    session: AsyncSession,
+    *,
+    action_id: int,
+    thread_id: int,
+) -> AgentActionLog | None:
+    """Load the action_log row for ``action_id`` if it belongs to the thread."""
+    stmt = select(AgentActionLog).where(
+        AgentActionLog.id == action_id,
+        AgentActionLog.thread_id == thread_id,
+    )
+    result = await session.execute(stmt)
+    return result.scalars().first()
+
+
+async def load_thread(session: AsyncSession, *, thread_id: int) -> NewChatThread | None:
+    stmt = select(NewChatThread).where(NewChatThread.id == thread_id)
+    result = await session.execute(stmt)
+    return result.scalars().first()
+
+
+# ---------------------------------------------------------------------------
+# Authorization
+# ---------------------------------------------------------------------------
+
+
+def can_revert(
+    *,
+    requester_user_id: str | None,
+    action: AgentActionLog,
+    is_admin: bool,
+) -> bool:
+    """Return True iff the requester is allowed to revert this action.
+
+    The plan's rule: "requester must be the original `user_id` on the
+    action, or hold the search-space admin role." Anonymous actions
+    (``action.user_id is None``) can only be reverted by admins.
+    """
+    if is_admin:
+        return True
+    if action.user_id is None:
+        return False
+    return str(action.user_id) == str(requester_user_id)
+
+
+# ---------------------------------------------------------------------------
+# Helper: reconstruct virtual path from a snapshot
+# ---------------------------------------------------------------------------
+
+
+async def _virtual_path_from_snapshot(
+    session: AsyncSession,
+    revision: DocumentRevision,
+) -> str | None:
+    """Reconstruct the virtual_path the document was at before mutation.
+
+    Preference order:
+    1. ``metadata_before["virtual_path"]`` — written by every snapshot
+       helper since this PR.
+    2. Compose ``"<folder_path>/<title_before>"`` from
+       ``folder_id_before`` + ``title_before``. Walks the folder chain via
+       ``parent_id``.
+    """
+    metadata = revision.metadata_before or {}
+    candidate = metadata.get("virtual_path") if isinstance(metadata, dict) else None
+    if isinstance(candidate, str) and candidate.startswith(DOCUMENTS_ROOT):
+        return candidate
+
+    title = revision.title_before
+    if not isinstance(title, str) or not title:
+        return None
+
+    parts: list[str] = []
+    cursor: int | None = revision.folder_id_before
+    visited: set[int] = set()
+    while cursor is not None and cursor not in visited:
+        visited.add(cursor)
+        folder = await session.get(Folder, cursor)
+        if folder is None:
+            return None
+        parts.append(safe_folder_segment(str(folder.name or "")))
+        cursor = folder.parent_id
+    parts.reverse()
+
+    base = f"{DOCUMENTS_ROOT}/" + "/".join(parts) if parts else DOCUMENTS_ROOT
+    filename = safe_filename(title)
+    return f"{base}/{filename}"
+
+
+# ---------------------------------------------------------------------------
+# Document revision restore (write/edit/move/rm)
+# ---------------------------------------------------------------------------
+
+
+def _set_field(target: Any, field: str, value: Any) -> None:
+    if value is not None:
+        setattr(target, field, value)
+
+
+async def _restore_in_place_document(
+    session: AsyncSession,
+    *,
+    revision: DocumentRevision,
+) -> RevertOutcome:
+    """Apply an in-place restore to an existing :class:`Document`."""
+    if revision.document_id is None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                "Original document was hard-deleted; in-place restore is not possible."
+            ),
+        )
+    doc = await session.get(Document, revision.document_id)
+    if doc is None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message="Original document has been deleted; revert cannot proceed.",
+        )
+
+    _set_field(doc, "content", revision.content_before)
+    _set_field(doc, "source_markdown", revision.content_before)
+    _set_field(doc, "title", revision.title_before)
+    _set_field(doc, "folder_id", revision.folder_id_before)
+    metadata_before = revision.metadata_before or {}
+    if isinstance(metadata_before, dict) and metadata_before:
+        doc.document_metadata = dict(metadata_before)
+
+    if isinstance(revision.content_before, str):
+        doc.content_hash = generate_content_hash(
+            revision.content_before, doc.search_space_id
+        )
+
+    virtual_path = await _virtual_path_from_snapshot(session, revision)
+    if virtual_path:
+        doc.unique_identifier_hash = generate_unique_identifier_hash(
+            DocumentType.NOTE,
+            virtual_path,
+            doc.search_space_id,
+        )
+
+    chunks_before = revision.chunks_before
+    if isinstance(chunks_before, list):
+        await session.execute(delete(Chunk).where(Chunk.document_id == doc.id))
+        chunk_texts = [
+            str(c.get("content"))
+            for c in chunks_before
+            if isinstance(c, dict) and isinstance(c.get("content"), str)
+        ]
+        if chunk_texts:
+            chunk_embeddings = embed_texts(chunk_texts)
+            session.add_all(
+                [
+                    Chunk(document_id=doc.id, content=text, embedding=embedding)
+                    for text, embedding in zip(
+                        chunk_texts, chunk_embeddings, strict=True
+                    )
+                ]
+            )
+            if isinstance(revision.content_before, str):
+                doc.embedding = embed_texts([revision.content_before])[0]
+
+    doc.updated_at = datetime.now(UTC)
+    return RevertOutcome(status="ok", message="Document restored from snapshot.")
+
+
+async def _reinsert_document_from_revision(
+    session: AsyncSession,
+    *,
+    revision: DocumentRevision,
+) -> RevertOutcome:
+    """Re-INSERT a deleted :class:`Document` from a snapshot row (``rm`` revert)."""
+    if not isinstance(revision.title_before, str) or not revision.title_before:
+        return RevertOutcome(
+            status="not_reversible",
+            message="Snapshot lacks title_before; cannot recreate document.",
+        )
+    if not isinstance(revision.content_before, str):
+        return RevertOutcome(
+            status="not_reversible",
+            message="Snapshot lacks content_before; cannot recreate document.",
+        )
+
+    virtual_path = await _virtual_path_from_snapshot(session, revision)
+    if not virtual_path:
+        return RevertOutcome(
+            status="not_reversible",
+            message=(
+                "Snapshot is missing both metadata_before['virtual_path'] AND "
+                "a resolvable (folder_id_before, title_before) pair."
+            ),
+        )
+
+    search_space_id = revision.search_space_id
+    unique_identifier_hash = generate_unique_identifier_hash(
+        DocumentType.NOTE,
+        virtual_path,
+        search_space_id,
+    )
+    collision = await session.execute(
+        select(Document.id).where(
+            Document.search_space_id == search_space_id,
+            Document.unique_identifier_hash == unique_identifier_hash,
+        )
+    )
+    if collision.scalar_one_or_none() is not None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                f"A document already exists at '{virtual_path}'; revert would "
+                "collide. Move the live doc out of the way first."
+            ),
+        )
+
+    metadata = revision.metadata_before or {}
+    if not isinstance(metadata, dict):
+        metadata = {}
+    metadata = dict(metadata)
+    metadata["virtual_path"] = virtual_path
+
+    content = revision.content_before
+    new_doc = Document(
+        title=revision.title_before,
+        document_type=DocumentType.NOTE,
+        document_metadata=metadata,
+        content=content,
+        content_hash=generate_content_hash(content, search_space_id),
+        unique_identifier_hash=unique_identifier_hash,
+        source_markdown=content,
+        search_space_id=search_space_id,
+        folder_id=revision.folder_id_before,
+        updated_at=datetime.now(UTC),
+    )
+    session.add(new_doc)
+    await session.flush()
+
+    new_doc.embedding = embed_texts([content])[0]
+    chunk_texts = []
+    chunks_before = revision.chunks_before
+    if isinstance(chunks_before, list):
+        chunk_texts = [
+            str(c.get("content"))
+            for c in chunks_before
+            if isinstance(c, dict) and isinstance(c.get("content"), str)
+        ]
+    if chunk_texts:
+        chunk_embeddings = embed_texts(chunk_texts)
+        session.add_all(
+            [
+                Chunk(document_id=new_doc.id, content=text, embedding=embedding)
+                for text, embedding in zip(chunk_texts, chunk_embeddings, strict=True)
+            ]
+        )
+
+    # Repoint the snapshot at the recreated row so a follow-up revert of
+    # the same row works as expected.
+    revision.document_id = new_doc.id
+    return RevertOutcome(
+        status="ok",
+        message=f"Re-inserted document '{revision.title_before}' from snapshot.",
+    )
+
+
+async def _delete_created_document(
+    session: AsyncSession,
+    *,
+    revision: DocumentRevision,
+) -> RevertOutcome:
+    """Delete the document that ``write_file`` created (``content_before IS NULL``)."""
+    if revision.document_id is None:
+        return RevertOutcome(
+            status="ok",
+            message="No live row to delete (already removed elsewhere).",
+        )
+    await session.execute(delete(Document).where(Document.id == revision.document_id))
+    return RevertOutcome(
+        status="ok",
+        message="Deleted the document that was created by this action.",
+    )
+
+
+async def _restore_document_revision(
+    session: AsyncSession, *, action: AgentActionLog
+) -> RevertOutcome:
+    """Dispatch document-level revert based on ``action.tool_name``."""
+    stmt = (
+        select(DocumentRevision)
+        .where(DocumentRevision.agent_action_id == action.id)
+        .order_by(DocumentRevision.created_at.desc())
+        .limit(1)
+    )
+    result = await session.execute(stmt)
+    revision = result.scalars().first()
+    if revision is None:
+        return RevertOutcome(
+            status="not_reversible",
+            message="No document_revisions row tied to this action.",
+        )
+
+    tool_name = (action.tool_name or "").lower()
+
+    if tool_name == "rm":
+        return await _reinsert_document_from_revision(session, revision=revision)
+
+    if tool_name == "write_file" and revision.content_before is None:
+        return await _delete_created_document(session, revision=revision)
+
+    return await _restore_in_place_document(session, revision=revision)
+
+
+# ---------------------------------------------------------------------------
+# Folder revision restore (mkdir/rmdir/rename/move)
+# ---------------------------------------------------------------------------
+
+
+async def _restore_in_place_folder(
+    session: AsyncSession,
+    *,
+    revision: FolderRevision,
+) -> RevertOutcome:
+    if revision.folder_id is None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message="Original folder was hard-deleted; in-place restore is impossible.",
+        )
+    folder = await session.get(Folder, revision.folder_id)
+    if folder is None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message="Original folder has been deleted; revert cannot proceed.",
+        )
+    _set_field(folder, "name", revision.name_before)
+    _set_field(folder, "parent_id", revision.parent_id_before)
+    _set_field(folder, "position", revision.position_before)
+    folder.updated_at = datetime.now(UTC)
+    return RevertOutcome(status="ok", message="Folder restored from snapshot.")
+
+
+async def _reinsert_folder_from_revision(
+    session: AsyncSession,
+    *,
+    revision: FolderRevision,
+) -> RevertOutcome:
+    if not isinstance(revision.name_before, str) or not revision.name_before:
+        return RevertOutcome(
+            status="not_reversible",
+            message="Snapshot lacks name_before; cannot recreate folder.",
+        )
+    new_folder = Folder(
+        name=revision.name_before,
+        parent_id=revision.parent_id_before,
+        position=revision.position_before,
+        search_space_id=revision.search_space_id,
+        updated_at=datetime.now(UTC),
+    )
+    session.add(new_folder)
+    await session.flush()
+    revision.folder_id = new_folder.id
+    return RevertOutcome(
+        status="ok",
+        message=f"Re-inserted folder '{revision.name_before}' from snapshot.",
+    )
+
+
+async def _delete_created_folder(
+    session: AsyncSession,
+    *,
+    revision: FolderRevision,
+) -> RevertOutcome:
+    if revision.folder_id is None:
+        return RevertOutcome(
+            status="ok",
+            message="No live folder row to delete (already removed elsewhere).",
+        )
+    folder_id = revision.folder_id
+
+    has_doc = await session.execute(
+        select(Document.id).where(Document.folder_id == folder_id).limit(1)
+    )
+    if has_doc.scalar_one_or_none() is not None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                "Folder is no longer empty (documents have been added since "
+                "mkdir); cannot revert."
+            ),
+        )
+    has_child = await session.execute(
+        select(Folder.id).where(Folder.parent_id == folder_id).limit(1)
+    )
+    if has_child.scalar_one_or_none() is not None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                "Folder is no longer empty (sub-folders have been added "
+                "since mkdir); cannot revert."
+            ),
+        )
+
+    await session.execute(delete(Folder).where(Folder.id == folder_id))
+    return RevertOutcome(
+        status="ok",
+        message="Deleted the folder that was created by this action.",
+    )
+
+
+async def _restore_folder_revision(
+    session: AsyncSession, *, action: AgentActionLog
+) -> RevertOutcome:
+    stmt = (
+        select(FolderRevision)
+        .where(FolderRevision.agent_action_id == action.id)
+        .order_by(FolderRevision.created_at.desc())
+        .limit(1)
+    )
+    result = await session.execute(stmt)
+    revision = result.scalars().first()
+    if revision is None:
+        return RevertOutcome(
+            status="not_reversible",
+            message="No folder_revisions row tied to this action.",
+        )
+
+    tool_name = (action.tool_name or "").lower()
+
+    if tool_name == "rmdir":
+        return await _reinsert_folder_from_revision(session, revision=revision)
+
+    if tool_name == "mkdir":
+        return await _delete_created_folder(session, revision=revision)
+
+    return await _restore_in_place_folder(session, revision=revision)
+
+
+# ---------------------------------------------------------------------------
+# Dispatch
+# ---------------------------------------------------------------------------
+#
+# Exact-name dispatch: ``tool_name == name``, NOT ``startswith(...)``.
+# Prefix-matching mis-routes pairs like ``rm``/``rmdir`` and
+# ``delete_note``/``delete_folder``.
+
+_DOC_TOOLS: frozenset[str] = frozenset(
+    {
+        "edit_file",
+        "write_file",
+        "move_file",
+        "rm",
+        "update_memory",
+        "create_note",
+        "update_note",
+        "delete_note",
+    }
+)
+_FOLDER_TOOLS: frozenset[str] = frozenset(
+    {
+        "mkdir",
+        "rmdir",
+        "rename_folder",
+        "delete_folder",
+    }
+)
+
+
+async def revert_action(
+    session: AsyncSession,
+    *,
+    action: AgentActionLog,
+    requester_user_id: str | None,
+) -> RevertOutcome:
+    """Execute the revert for ``action`` and return a structured outcome.
+
+    The function does **not** commit — the caller is expected to commit on
+    success or roll back on failure. A new ``agent_action_log`` row is
+    added to the session on success with ``reverse_of=action.id``.
+    """
+    tool_name = (action.tool_name or "").lower()
+
+    if tool_name in _DOC_TOOLS:
+        outcome = await _restore_document_revision(session, action=action)
+    elif tool_name in _FOLDER_TOOLS:
+        outcome = await _restore_folder_revision(session, action=action)
+    elif action.reverse_descriptor:
+        # Connector-owned reversibles run through the normal permission
+        # stack; out of scope for this PR — the route returns 503 anyway
+        # until UI ships, so 501-style "not implemented" is fine.
+        return RevertOutcome(
+            status="reverse_not_implemented",
+            message=(
+                "Connector-action revert is not yet implemented. The "
+                "reverse_descriptor is stored; future work will replay it "
+                "through PermissionMiddleware."
+            ),
+        )
+    else:
+        return RevertOutcome(
+            status="not_reversible",
+            message=(
+                f"Tool {action.tool_name!r} is not reversible: no document "
+                "revision and no reverse_descriptor."
+            ),
+        )
+
+    if outcome.status != "ok":
+        return outcome
+
+    new_row = AgentActionLog(
+        thread_id=action.thread_id,
+        user_id=requester_user_id,
+        search_space_id=action.search_space_id,
+        turn_id=None,
+        message_id=None,
+        tool_name=f"_revert:{action.tool_name}",
+        args={"reverted_action_id": action.id},
+        result_id=None,
+        reversible=False,
+        reverse_descriptor=None,
+        error=None,
+        reverse_of=action.id,
+    )
+    session.add(new_row)
+    await session.flush()
+    outcome.new_action_id = new_row.id
+    return outcome
+
+
+__all__ = [
+    "RevertOutcome",
+    "can_revert",
+    "load_action",
+    "load_thread",
+    "revert_action",
+]
diff --git a/surfsense_backend/app/services/streaming/__init__.py b/surfsense_backend/app/services/streaming/__init__.py
new file mode 100644
index 0000000000..3ec9b9cf1f
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/__init__.py
@@ -0,0 +1,20 @@
+"""Single-responsibility split of the streaming SSE protocol.
+
+Layout:
+* ``envelope/`` - SSE wire framing + ID generators
+* ``emitter/`` - identity of the agent that emitted an event + runtime registry
+* ``events/`` - one module per SSE event family
+* ``service.py`` - composition root used when emitting chat SSE
+* ``interrupt_correlation.py`` - id-aware lookup over LangGraph state
+
+Naming on the wire:
+* AI SDK protocol fields keep their existing camelCase
+  (``toolCallId``, ``messageId``, ``inputTextDelta``, ``langchainToolCallId``).
+* Every SurfSense-added field uses ``snake_case``, including the
+  top-level ``emitted_by`` envelope and all inner ``data`` payloads.
+
+Production chat uses ``app.services.new_streaming_service`` from
+``app.tasks.chat.stream_new_chat`` and related routes.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_backend/app/services/streaming/emitter/__init__.py b/surfsense_backend/app/services/streaming/emitter/__init__.py
new file mode 100644
index 0000000000..7814894f3b
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/emitter/__init__.py
@@ -0,0 +1,29 @@
+"""Identity of the agent that emitted a streamed event.
+
+The wire field is ``emitted_by``; the Python identity is :class:`Emitter`.
+``EmitterRegistry`` resolves which emitter owns a LangGraph event, with
+LangGraph's own namespace metadata as the primary key and a parent_ids
+walk as a fallback for cases where context vars don't propagate.
+"""
+
+from __future__ import annotations
+
+from .emitter import (
+    MAIN_EMITTER,
+    Emitter,
+    EmitterLevel,
+    attach_emitted_by,
+    main_emitter,
+    subagent_emitter,
+)
+from .registry import EmitterRegistry
+
+__all__ = [
+    "MAIN_EMITTER",
+    "Emitter",
+    "EmitterLevel",
+    "EmitterRegistry",
+    "attach_emitted_by",
+    "main_emitter",
+    "subagent_emitter",
+]
diff --git a/surfsense_backend/app/services/streaming/emitter/emitter.py b/surfsense_backend/app/services/streaming/emitter/emitter.py
new file mode 100644
index 0000000000..08f625a698
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/emitter/emitter.py
@@ -0,0 +1,61 @@
+"""Identity payload describing which agent produced a stream event."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Literal
+
+EmitterLevel = Literal["main", "subagent"]
+
+
+@dataclass(frozen=True)
+class Emitter:
+    level: EmitterLevel
+    subagent_type: str | None = None
+    subagent_run_id: str | None = None
+    parent_tool_call_id: str | None = None
+    extra: dict[str, Any] = field(default_factory=dict)
+
+    def to_payload(self) -> dict[str, Any]:
+        payload: dict[str, Any] = {"level": self.level}
+        if self.subagent_type is not None:
+            payload["subagent_type"] = self.subagent_type
+        if self.subagent_run_id is not None:
+            payload["subagent_run_id"] = self.subagent_run_id
+        if self.parent_tool_call_id is not None:
+            payload["parent_tool_call_id"] = self.parent_tool_call_id
+        if self.extra:
+            payload.update(self.extra)
+        return payload
+
+
+MAIN_EMITTER = Emitter(level="main")
+
+
+def main_emitter() -> Emitter:
+    return MAIN_EMITTER
+
+
+def subagent_emitter(
+    *,
+    subagent_type: str,
+    subagent_run_id: str,
+    parent_tool_call_id: str | None = None,
+    extra: dict[str, Any] | None = None,
+) -> Emitter:
+    return Emitter(
+        level="subagent",
+        subagent_type=subagent_type,
+        subagent_run_id=subagent_run_id,
+        parent_tool_call_id=parent_tool_call_id,
+        extra=dict(extra or {}),
+    )
+
+
+def attach_emitted_by(
+    payload: dict[str, Any], emitter: Emitter | None
+) -> dict[str, Any]:
+    if emitter is None:
+        return payload
+    payload["emitted_by"] = emitter.to_payload()
+    return payload
diff --git a/surfsense_backend/app/services/streaming/emitter/registry.py b/surfsense_backend/app/services/streaming/emitter/registry.py
new file mode 100644
index 0000000000..066689691d
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/emitter/registry.py
@@ -0,0 +1,49 @@
+"""Resolve which agent owns a streamed event from its LangGraph run lineage."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+from .emitter import Emitter, main_emitter
+
+
+class EmitterRegistry:
+    def __init__(self) -> None:
+        self._by_run_id: dict[str, Emitter] = {}
+
+    def register(self, run_id: str, emitter: Emitter) -> None:
+        if not run_id:
+            return
+        self._by_run_id[run_id] = emitter
+
+    def unregister(self, run_id: str) -> Emitter | None:
+        if not run_id:
+            return None
+        return self._by_run_id.pop(run_id, None)
+
+    def get(self, run_id: str | None) -> Emitter | None:
+        if not run_id:
+            return None
+        return self._by_run_id.get(run_id)
+
+    def resolve(
+        self,
+        *,
+        run_id: str | None,
+        parent_ids: Iterable[str] | None,
+    ) -> Emitter:
+        own = self.get(run_id)
+        if own is not None:
+            return own
+        if parent_ids:
+            for ancestor in reversed(list(parent_ids)):
+                emitter = self.get(ancestor)
+                if emitter is not None:
+                    return emitter
+        return main_emitter()
+
+    def has_active_subagents(self) -> bool:
+        return any(emitter.level == "subagent" for emitter in self._by_run_id.values())
+
+    def clear(self) -> None:
+        self._by_run_id.clear()
diff --git a/surfsense_backend/app/services/streaming/envelope/__init__.py b/surfsense_backend/app/services/streaming/envelope/__init__.py
new file mode 100644
index 0000000000..862e84c8df
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/envelope/__init__.py
@@ -0,0 +1,23 @@
+"""Wire framing layer."""
+
+from __future__ import annotations
+
+from .identifiers import (
+    generate_message_id,
+    generate_reasoning_id,
+    generate_subagent_run_id,
+    generate_text_id,
+    generate_tool_call_id,
+)
+from .sse import format_done, format_sse, get_response_headers
+
+__all__ = [
+    "format_done",
+    "format_sse",
+    "generate_message_id",
+    "generate_reasoning_id",
+    "generate_subagent_run_id",
+    "generate_text_id",
+    "generate_tool_call_id",
+    "get_response_headers",
+]
diff --git a/surfsense_backend/app/services/streaming/envelope/identifiers.py b/surfsense_backend/app/services/streaming/envelope/identifiers.py
new file mode 100644
index 0000000000..2fdd6ff09d
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/envelope/identifiers.py
@@ -0,0 +1,25 @@
+"""Prefixed UUID generators for stream parts."""
+
+from __future__ import annotations
+
+import uuid
+
+
+def generate_message_id() -> str:
+    return f"msg_{uuid.uuid4().hex}"
+
+
+def generate_text_id() -> str:
+    return f"text_{uuid.uuid4().hex}"
+
+
+def generate_reasoning_id() -> str:
+    return f"reasoning_{uuid.uuid4().hex}"
+
+
+def generate_tool_call_id() -> str:
+    return f"call_{uuid.uuid4().hex}"
+
+
+def generate_subagent_run_id() -> str:
+    return f"subagent_{uuid.uuid4().hex}"
diff --git a/surfsense_backend/app/services/streaming/envelope/sse.py b/surfsense_backend/app/services/streaming/envelope/sse.py
new file mode 100644
index 0000000000..508fc1b1c4
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/envelope/sse.py
@@ -0,0 +1,25 @@
+"""Server-Sent-Events wire framing."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def format_sse(data: Any) -> str:
+    if isinstance(data, str):
+        return f"data: {data}\n\n"
+    return f"data: {json.dumps(data)}\n\n"
+
+
+def format_done() -> str:
+    return "data: [DONE]\n\n"
+
+
+def get_response_headers() -> dict[str, str]:
+    return {
+        "Content-Type": "text/event-stream",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "x-vercel-ai-ui-message-stream": "v1",
+    }
diff --git a/surfsense_backend/app/services/streaming/events/__init__.py b/surfsense_backend/app/services/streaming/events/__init__.py
new file mode 100644
index 0000000000..91a8ff8544
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/__init__.py
@@ -0,0 +1,29 @@
+"""SSE event payload formatters, one module per event family."""
+
+from __future__ import annotations
+
+from . import (
+    action_log,
+    data,
+    error,
+    interrupt,
+    lifecycle,
+    reasoning,
+    source,
+    subagent_lifecycle,
+    text,
+    tool,
+)
+
+__all__ = [
+    "action_log",
+    "data",
+    "error",
+    "interrupt",
+    "lifecycle",
+    "reasoning",
+    "source",
+    "subagent_lifecycle",
+    "text",
+    "tool",
+]
diff --git a/surfsense_backend/app/services/streaming/events/action_log.py b/surfsense_backend/app/services/streaming/events/action_log.py
new file mode 100644
index 0000000000..0a8e46f0ab
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/action_log.py
@@ -0,0 +1,24 @@
+"""Action-log events relayed from ``ActionLogMiddleware`` custom dispatches."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..emitter import Emitter
+from .data import format_data
+
+
+def format_action_log(
+    payload: dict[str, Any],
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data("action-log", payload, emitter=emitter)
+
+
+def format_action_log_updated(
+    payload: dict[str, Any],
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data("action-log-updated", payload, emitter=emitter)
diff --git a/surfsense_backend/app/services/streaming/events/data.py b/surfsense_backend/app/services/streaming/events/data.py
new file mode 100644
index 0000000000..f6e1905780
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/data.py
@@ -0,0 +1,118 @@
+"""Generic ``data-*`` envelopes and SurfSense-specific data parts.
+
+Inner ``data`` dict fields use snake_case. Legacy ``threadId`` /
+``messageId`` keys are preserved where they cross the AI SDK boundary.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..emitter import Emitter, attach_emitted_by
+from ..envelope import format_sse
+
+
+def format_data(
+    data_type: str,
+    data: Any,
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {"type": f"data-{data_type}", "data": data}
+    return format_sse(attach_emitted_by(payload, emitter))
+
+
+def format_terminal_info(
+    text: str,
+    *,
+    message_type: str = "info",
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data(
+        "terminal-info",
+        {"text": text, "type": message_type},
+        emitter=emitter,
+    )
+
+
+def format_further_questions(
+    questions: list[str],
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data("further-questions", {"questions": questions}, emitter=emitter)
+
+
+def format_thinking_step(
+    *,
+    step_id: str,
+    title: str,
+    status: str = "in_progress",
+    items: list[str] | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data(
+        "thinking-step",
+        {
+            "id": step_id,
+            "title": title,
+            "status": status,
+            "items": items or [],
+        },
+        emitter=emitter,
+    )
+
+
+def format_thread_title_update(
+    *,
+    thread_id: int,
+    title: str,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data(
+        "thread-title-update",
+        {"threadId": thread_id, "title": title},
+        emitter=emitter,
+    )
+
+
+def format_turn_info(
+    *,
+    chat_turn_id: str,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data("turn-info", {"chat_turn_id": chat_turn_id}, emitter=emitter)
+
+
+def format_turn_status(
+    *,
+    status: str,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data("turn-status", {"status": status}, emitter=emitter)
+
+
+def format_user_message_id(
+    *,
+    message_id: str,
+    turn_id: str,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data(
+        "user-message-id",
+        {"message_id": message_id, "turn_id": turn_id},
+        emitter=emitter,
+    )
+
+
+def format_assistant_message_id(
+    *,
+    message_id: str,
+    turn_id: str,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_data(
+        "assistant-message-id",
+        {"message_id": message_id, "turn_id": turn_id},
+        emitter=emitter,
+    )
diff --git a/surfsense_backend/app/services/streaming/events/error.py b/surfsense_backend/app/services/streaming/events/error.py
new file mode 100644
index 0000000000..a1e8e01ca4
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/error.py
@@ -0,0 +1,23 @@
+"""Single terminal error path chat streaming must route through."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..emitter import Emitter, attach_emitted_by
+from ..envelope import format_sse
+
+
+def format_error(
+    error_text: str,
+    *,
+    error_code: str | None = None,
+    extra: dict[str, Any] | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {"type": "error", "errorText": error_text}
+    if error_code:
+        payload["errorCode"] = error_code
+    if extra:
+        payload.update(extra)
+    return format_sse(attach_emitted_by(payload, emitter))
diff --git a/surfsense_backend/app/services/streaming/events/interrupt.py b/surfsense_backend/app/services/streaming/events/interrupt.py
new file mode 100644
index 0000000000..0334b10b32
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/interrupt.py
@@ -0,0 +1,56 @@
+"""Interrupt-request events with a single canonical payload shape."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..emitter import Emitter
+from .data import format_data
+
+
+def normalize_interrupt_payload(interrupt_value: dict[str, Any]) -> dict[str, Any]:
+    if "action_requests" in interrupt_value and "review_configs" in interrupt_value:
+        return interrupt_value
+
+    interrupt_type = interrupt_value.get("type", "unknown")
+    message = interrupt_value.get("message")
+    action = interrupt_value.get("action", {}) or {}
+    context = interrupt_value.get("context", {}) or {}
+
+    normalized: dict[str, Any] = {
+        "action_requests": [
+            {
+                "name": action.get("tool", "unknown_tool"),
+                "args": action.get("params", {}),
+            }
+        ],
+        "review_configs": [
+            {
+                "action_name": action.get("tool", "unknown_tool"),
+                "allowed_decisions": ["approve", "edit", "reject"],
+            }
+        ],
+        "interrupt_type": interrupt_type,
+        "context": context,
+    }
+    if message:
+        normalized["message"] = message
+    return normalized
+
+
+def format_interrupt_request(
+    interrupt_value: dict[str, Any],
+    *,
+    interrupt_id: str | None = None,
+    pending_interrupt_count: int | None = None,
+    chat_turn_id: str | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload = normalize_interrupt_payload(interrupt_value)
+    if interrupt_id is not None:
+        payload["interrupt_id"] = interrupt_id
+    if pending_interrupt_count is not None:
+        payload["pending_interrupt_count"] = pending_interrupt_count
+    if chat_turn_id is not None:
+        payload["chat_turn_id"] = chat_turn_id
+    return format_data("interrupt-request", payload, emitter=emitter)
diff --git a/surfsense_backend/app/services/streaming/events/lifecycle.py b/surfsense_backend/app/services/streaming/events/lifecycle.py
new file mode 100644
index 0000000000..019718b677
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/lifecycle.py
@@ -0,0 +1,29 @@
+"""High-level message and step lifecycle events.
+
+Wire verbs are fixed by the AI SDK protocol (``start`` / ``finish`` for
+the whole message, ``start-step`` / ``finish-step`` for each step).
+Python helpers always read ``format_<entity>_<verb>`` so pairs are
+visible at the call site.
+"""
+
+from __future__ import annotations
+
+from ..emitter import Emitter, attach_emitted_by
+from ..envelope import format_sse
+
+
+def format_message_start(message_id: str, *, emitter: Emitter | None = None) -> str:
+    payload = {"type": "start", "messageId": message_id}
+    return format_sse(attach_emitted_by(payload, emitter))
+
+
+def format_message_finish(*, emitter: Emitter | None = None) -> str:
+    return format_sse(attach_emitted_by({"type": "finish"}, emitter))
+
+
+def format_step_start(*, emitter: Emitter | None = None) -> str:
+    return format_sse(attach_emitted_by({"type": "start-step"}, emitter))
+
+
+def format_step_finish(*, emitter: Emitter | None = None) -> str:
+    return format_sse(attach_emitted_by({"type": "finish-step"}, emitter))
diff --git a/surfsense_backend/app/services/streaming/events/reasoning.py b/surfsense_backend/app/services/streaming/events/reasoning.py
new file mode 100644
index 0000000000..12843e2c33
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/reasoning.py
@@ -0,0 +1,32 @@
+"""Reasoning-block streaming events."""
+
+from __future__ import annotations
+
+from ..emitter import Emitter, attach_emitted_by
+from ..envelope import format_sse
+
+
+def format_reasoning_start(reasoning_id: str, *, emitter: Emitter | None = None) -> str:
+    return format_sse(
+        attach_emitted_by({"type": "reasoning-start", "id": reasoning_id}, emitter)
+    )
+
+
+def format_reasoning_delta(
+    reasoning_id: str,
+    delta: str,
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_sse(
+        attach_emitted_by(
+            {"type": "reasoning-delta", "id": reasoning_id, "delta": delta},
+            emitter,
+        )
+    )
+
+
+def format_reasoning_end(reasoning_id: str, *, emitter: Emitter | None = None) -> str:
+    return format_sse(
+        attach_emitted_by({"type": "reasoning-end", "id": reasoning_id}, emitter)
+    )
diff --git a/surfsense_backend/app/services/streaming/events/source.py b/surfsense_backend/app/services/streaming/events/source.py
new file mode 100644
index 0000000000..54541e8d20
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/source.py
@@ -0,0 +1,59 @@
+"""Source and file reference events."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..emitter import Emitter, attach_emitted_by
+from ..envelope import format_sse
+
+
+def format_source_url(
+    url: str,
+    *,
+    source_id: str | None = None,
+    title: str | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "type": "source-url",
+        "sourceId": source_id or url,
+        "url": url,
+    }
+    if title:
+        payload["title"] = title
+    return format_sse(attach_emitted_by(payload, emitter))
+
+
+def format_source_document(
+    source_id: str,
+    *,
+    media_type: str = "file",
+    title: str | None = None,
+    description: str | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "type": "source-document",
+        "sourceId": source_id,
+        "mediaType": media_type,
+    }
+    if title:
+        payload["title"] = title
+    if description:
+        payload["description"] = description
+    return format_sse(attach_emitted_by(payload, emitter))
+
+
+def format_file(
+    url: str,
+    media_type: str,
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "type": "file",
+        "url": url,
+        "mediaType": media_type,
+    }
+    return format_sse(attach_emitted_by(payload, emitter))
diff --git a/surfsense_backend/app/services/streaming/events/subagent_lifecycle.py b/surfsense_backend/app/services/streaming/events/subagent_lifecycle.py
new file mode 100644
index 0000000000..6dd2d4eab1
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/subagent_lifecycle.py
@@ -0,0 +1,86 @@
+"""Sub-agent lifecycle events the FE pairs into one timeline lane.
+
+A sub-agent run is a high-level boundary (a whole agent invocation),
+so we use the ``start`` / ``finish`` verb pair, matching how the AI SDK
+spells message- and step-level lifecycles.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..emitter import Emitter
+from .data import format_data
+
+
+def format_subagent_start(
+    *,
+    subagent_run_id: str,
+    subagent_type: str,
+    parent_tool_call_id: str,
+    chat_turn_id: str | None = None,
+    description: str | None = None,
+    started_at: str | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "subagent_run_id": subagent_run_id,
+        "subagent_type": subagent_type,
+        "parent_tool_call_id": parent_tool_call_id,
+    }
+    if chat_turn_id is not None:
+        payload["chat_turn_id"] = chat_turn_id
+    if description is not None:
+        payload["description"] = description
+    if started_at is not None:
+        payload["started_at"] = started_at
+    return format_data("subagent-start", payload, emitter=emitter)
+
+
+def format_subagent_finish(
+    *,
+    subagent_run_id: str,
+    subagent_type: str,
+    parent_tool_call_id: str,
+    status: str = "completed",
+    ended_at: str | None = None,
+    duration_ms: int | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "subagent_run_id": subagent_run_id,
+        "subagent_type": subagent_type,
+        "parent_tool_call_id": parent_tool_call_id,
+        "status": status,
+    }
+    if ended_at is not None:
+        payload["ended_at"] = ended_at
+    if duration_ms is not None:
+        payload["duration_ms"] = duration_ms
+    return format_data("subagent-finish", payload, emitter=emitter)
+
+
+def format_subagent_error(
+    *,
+    subagent_run_id: str,
+    subagent_type: str,
+    parent_tool_call_id: str,
+    error_text: str,
+    error_type: str | None = None,
+    ended_at: str | None = None,
+    duration_ms: int | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "subagent_run_id": subagent_run_id,
+        "subagent_type": subagent_type,
+        "parent_tool_call_id": parent_tool_call_id,
+        "error_text": error_text,
+    }
+    if error_type is not None:
+        payload["error_type"] = error_type
+    if ended_at is not None:
+        payload["ended_at"] = ended_at
+    if duration_ms is not None:
+        payload["duration_ms"] = duration_ms
+    return format_data("subagent-error", payload, emitter=emitter)
diff --git a/surfsense_backend/app/services/streaming/events/text.py b/surfsense_backend/app/services/streaming/events/text.py
new file mode 100644
index 0000000000..30489ce499
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/text.py
@@ -0,0 +1,27 @@
+"""Text-block streaming events."""
+
+from __future__ import annotations
+
+from ..emitter import Emitter, attach_emitted_by
+from ..envelope import format_sse
+
+
+def format_text_start(text_id: str, *, emitter: Emitter | None = None) -> str:
+    return format_sse(attach_emitted_by({"type": "text-start", "id": text_id}, emitter))
+
+
+def format_text_delta(
+    text_id: str,
+    delta: str,
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    return format_sse(
+        attach_emitted_by(
+            {"type": "text-delta", "id": text_id, "delta": delta}, emitter
+        )
+    )
+
+
+def format_text_end(text_id: str, *, emitter: Emitter | None = None) -> str:
+    return format_sse(attach_emitted_by({"type": "text-end", "id": text_id}, emitter))
diff --git a/surfsense_backend/app/services/streaming/events/tool.py b/surfsense_backend/app/services/streaming/events/tool.py
new file mode 100644
index 0000000000..c85dc061bd
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/events/tool.py
@@ -0,0 +1,80 @@
+"""Tool-call streaming events.
+
+``toolCallId`` and ``langchainToolCallId`` are AI SDK protocol fields
+and stay camelCase. Sub-agent provenance rides on the snake_case
+top-level ``emitted_by`` envelope added by :func:`attach_emitted_by`.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..emitter import Emitter, attach_emitted_by
+from ..envelope import format_sse
+
+
+def format_tool_input_start(
+    tool_call_id: str,
+    tool_name: str,
+    *,
+    langchain_tool_call_id: str | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "type": "tool-input-start",
+        "toolCallId": tool_call_id,
+        "toolName": tool_name,
+    }
+    if langchain_tool_call_id:
+        payload["langchainToolCallId"] = langchain_tool_call_id
+    return format_sse(attach_emitted_by(payload, emitter))
+
+
+def format_tool_input_delta(
+    tool_call_id: str,
+    input_text_delta: str,
+    *,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "type": "tool-input-delta",
+        "toolCallId": tool_call_id,
+        "inputTextDelta": input_text_delta,
+    }
+    return format_sse(attach_emitted_by(payload, emitter))
+
+
+def format_tool_input_available(
+    tool_call_id: str,
+    tool_name: str,
+    input_data: dict[str, Any],
+    *,
+    langchain_tool_call_id: str | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "type": "tool-input-available",
+        "toolCallId": tool_call_id,
+        "toolName": tool_name,
+        "input": input_data,
+    }
+    if langchain_tool_call_id:
+        payload["langchainToolCallId"] = langchain_tool_call_id
+    return format_sse(attach_emitted_by(payload, emitter))
+
+
+def format_tool_output_available(
+    tool_call_id: str,
+    output: Any,
+    *,
+    langchain_tool_call_id: str | None = None,
+    emitter: Emitter | None = None,
+) -> str:
+    payload: dict[str, Any] = {
+        "type": "tool-output-available",
+        "toolCallId": tool_call_id,
+        "output": output,
+    }
+    if langchain_tool_call_id:
+        payload["langchainToolCallId"] = langchain_tool_call_id
+    return format_sse(attach_emitted_by(payload, emitter))
diff --git a/surfsense_backend/app/services/streaming/interrupt_correlation.py b/surfsense_backend/app/services/streaming/interrupt_correlation.py
new file mode 100644
index 0000000000..3045dfb6a1
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/interrupt_correlation.py
@@ -0,0 +1,84 @@
+"""Id-aware lookup of pending LangGraph interrupts (replaces first-wins)."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass(frozen=True)
+class PendingInterrupt:
+    interrupt_id: str | None
+    value: dict[str, Any]
+    source_task_id: str | None = None
+
+
+def list_pending_interrupts(state: Any) -> list[PendingInterrupt]:
+    out: list[PendingInterrupt] = []
+
+    for task in getattr(state, "tasks", None) or ():
+        task_id = _safe_str(getattr(task, "id", None))
+        for it in getattr(task, "interrupts", None) or ():
+            value = _coerce_interrupt_value(it)
+            if value is None:
+                continue
+            interrupt_id = _safe_str(getattr(it, "id", None))
+            out.append(
+                PendingInterrupt(
+                    interrupt_id=interrupt_id,
+                    value=value,
+                    source_task_id=task_id,
+                )
+            )
+
+    for it in getattr(state, "interrupts", None) or ():
+        value = _coerce_interrupt_value(it)
+        if value is None:
+            continue
+        interrupt_id = _safe_str(getattr(it, "id", None))
+        out.append(PendingInterrupt(interrupt_id=interrupt_id, value=value))
+
+    return out
+
+
+def get_pending_interrupt_by_id(
+    state: Any, interrupt_id: str
+) -> PendingInterrupt | None:
+    for pending in list_pending_interrupts(state):
+        if pending.interrupt_id == interrupt_id:
+            return pending
+    return None
+
+
+def get_pending_interrupt_for_tool_call(
+    state: Any, tool_call_id: str
+) -> PendingInterrupt | None:
+    for pending in list_pending_interrupts(state):
+        actions = pending.value.get("action_requests")
+        if not isinstance(actions, list):
+            continue
+        for action in actions:
+            if not isinstance(action, dict):
+                continue
+            if action.get("tool_call_id") == tool_call_id:
+                return pending
+    return None
+
+
+def first_pending_interrupt(state: Any) -> PendingInterrupt | None:
+    """Explicit opt-in to legacy first-wins; prefer the id-aware helpers above."""
+    pending = list_pending_interrupts(state)
+    return pending[0] if pending else None
+
+
+def _coerce_interrupt_value(item: Any) -> dict[str, Any] | None:
+    if isinstance(item, dict):
+        return item if item else None
+    value = getattr(item, "value", None)
+    if isinstance(value, dict):
+        return value if value else None
+    return None
+
+
+def _safe_str(value: Any) -> str | None:
+    return value if isinstance(value, str) and value else None
diff --git a/surfsense_backend/app/services/streaming/service.py b/surfsense_backend/app/services/streaming/service.py
new file mode 100644
index 0000000000..a3f1da7850
--- /dev/null
+++ b/surfsense_backend/app/services/streaming/service.py
@@ -0,0 +1,410 @@
+"""Composition root: bundles every formatter + a per-invocation emitter registry."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any
+
+from . import envelope
+from .emitter import Emitter, EmitterRegistry
+from .events import (
+    action_log,
+    data,
+    error,
+    interrupt,
+    lifecycle,
+    reasoning,
+    source,
+    subagent_lifecycle,
+    text,
+    tool,
+)
+
+
+class StreamingService:
+    def __init__(self) -> None:
+        self._message_id: str | None = None
+        self.emitter_registry = EmitterRegistry()
+
+    @property
+    def message_id(self) -> str | None:
+        return self._message_id
+
+    def begin_message(self, message_id: str | None = None) -> str:
+        self._message_id = message_id or envelope.generate_message_id()
+        return self._message_id
+
+    @staticmethod
+    def generate_text_id() -> str:
+        return envelope.generate_text_id()
+
+    @staticmethod
+    def generate_reasoning_id() -> str:
+        return envelope.generate_reasoning_id()
+
+    @staticmethod
+    def generate_tool_call_id() -> str:
+        return envelope.generate_tool_call_id()
+
+    @staticmethod
+    def generate_subagent_run_id() -> str:
+        return envelope.generate_subagent_run_id()
+
+    @staticmethod
+    def get_response_headers() -> dict[str, str]:
+        return envelope.get_response_headers()
+
+    @staticmethod
+    def format_done() -> str:
+        return envelope.format_done()
+
+    def resolve_emitter(
+        self,
+        *,
+        run_id: str | None,
+        parent_ids: Iterable[str] | None,
+    ) -> Emitter:
+        return self.emitter_registry.resolve(run_id=run_id, parent_ids=parent_ids)
+
+    def format_message_start(
+        self,
+        message_id: str | None = None,
+        *,
+        emitter: Emitter | None = None,
+    ) -> str:
+        chosen = self.begin_message(message_id)
+        return lifecycle.format_message_start(chosen, emitter=emitter)
+
+    def format_message_finish(self, *, emitter: Emitter | None = None) -> str:
+        return lifecycle.format_message_finish(emitter=emitter)
+
+    def format_step_start(self, *, emitter: Emitter | None = None) -> str:
+        return lifecycle.format_step_start(emitter=emitter)
+
+    def format_step_finish(self, *, emitter: Emitter | None = None) -> str:
+        return lifecycle.format_step_finish(emitter=emitter)
+
+    def format_text_start(self, text_id: str, *, emitter: Emitter | None = None) -> str:
+        return text.format_text_start(text_id, emitter=emitter)
+
+    def format_text_delta(
+        self, text_id: str, delta: str, *, emitter: Emitter | None = None
+    ) -> str:
+        return text.format_text_delta(text_id, delta, emitter=emitter)
+
+    def format_text_end(self, text_id: str, *, emitter: Emitter | None = None) -> str:
+        return text.format_text_end(text_id, emitter=emitter)
+
+    def format_reasoning_start(
+        self, reasoning_id: str, *, emitter: Emitter | None = None
+    ) -> str:
+        return reasoning.format_reasoning_start(reasoning_id, emitter=emitter)
+
+    def format_reasoning_delta(
+        self,
+        reasoning_id: str,
+        delta: str,
+        *,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return reasoning.format_reasoning_delta(reasoning_id, delta, emitter=emitter)
+
+    def format_reasoning_end(
+        self, reasoning_id: str, *, emitter: Emitter | None = None
+    ) -> str:
+        return reasoning.format_reasoning_end(reasoning_id, emitter=emitter)
+
+    def format_tool_input_start(
+        self,
+        tool_call_id: str,
+        tool_name: str,
+        *,
+        langchain_tool_call_id: str | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return tool.format_tool_input_start(
+            tool_call_id,
+            tool_name,
+            langchain_tool_call_id=langchain_tool_call_id,
+            emitter=emitter,
+        )
+
+    def format_tool_input_delta(
+        self,
+        tool_call_id: str,
+        input_text_delta: str,
+        *,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return tool.format_tool_input_delta(
+            tool_call_id, input_text_delta, emitter=emitter
+        )
+
+    def format_tool_input_available(
+        self,
+        tool_call_id: str,
+        tool_name: str,
+        input_data: dict[str, Any],
+        *,
+        langchain_tool_call_id: str | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return tool.format_tool_input_available(
+            tool_call_id,
+            tool_name,
+            input_data,
+            langchain_tool_call_id=langchain_tool_call_id,
+            emitter=emitter,
+        )
+
+    def format_tool_output_available(
+        self,
+        tool_call_id: str,
+        output: Any,
+        *,
+        langchain_tool_call_id: str | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return tool.format_tool_output_available(
+            tool_call_id,
+            output,
+            langchain_tool_call_id=langchain_tool_call_id,
+            emitter=emitter,
+        )
+
+    def format_source_url(
+        self,
+        url: str,
+        *,
+        source_id: str | None = None,
+        title: str | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return source.format_source_url(
+            url, source_id=source_id, title=title, emitter=emitter
+        )
+
+    def format_source_document(
+        self,
+        source_id: str,
+        *,
+        media_type: str = "file",
+        title: str | None = None,
+        description: str | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return source.format_source_document(
+            source_id,
+            media_type=media_type,
+            title=title,
+            description=description,
+            emitter=emitter,
+        )
+
+    def format_file(
+        self, url: str, media_type: str, *, emitter: Emitter | None = None
+    ) -> str:
+        return source.format_file(url, media_type, emitter=emitter)
+
+    def format_data(
+        self, data_type: str, payload: Any, *, emitter: Emitter | None = None
+    ) -> str:
+        return data.format_data(data_type, payload, emitter=emitter)
+
+    def format_terminal_info(
+        self,
+        text_value: str,
+        *,
+        message_type: str = "info",
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_terminal_info(
+            text_value, message_type=message_type, emitter=emitter
+        )
+
+    def format_further_questions(
+        self,
+        questions: list[str],
+        *,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_further_questions(questions, emitter=emitter)
+
+    def format_thinking_step(
+        self,
+        *,
+        step_id: str,
+        title: str,
+        status: str = "in_progress",
+        items: list[str] | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_thinking_step(
+            step_id=step_id,
+            title=title,
+            status=status,
+            items=items,
+            emitter=emitter,
+        )
+
+    def format_thread_title_update(
+        self,
+        *,
+        thread_id: int,
+        title: str,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_thread_title_update(
+            thread_id=thread_id, title=title, emitter=emitter
+        )
+
+    def format_turn_info(
+        self,
+        *,
+        chat_turn_id: str,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_turn_info(chat_turn_id=chat_turn_id, emitter=emitter)
+
+    def format_turn_status(
+        self,
+        *,
+        status: str,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_turn_status(status=status, emitter=emitter)
+
+    def format_user_message_id(
+        self,
+        *,
+        message_id: str,
+        turn_id: str,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_user_message_id(
+            message_id=message_id, turn_id=turn_id, emitter=emitter
+        )
+
+    def format_assistant_message_id(
+        self,
+        *,
+        message_id: str,
+        turn_id: str,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return data.format_assistant_message_id(
+            message_id=message_id, turn_id=turn_id, emitter=emitter
+        )
+
+    def format_error(
+        self,
+        error_text: str,
+        *,
+        error_code: str | None = None,
+        extra: dict[str, Any] | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return error.format_error(
+            error_text,
+            error_code=error_code,
+            extra=extra,
+            emitter=emitter,
+        )
+
+    def format_interrupt_request(
+        self,
+        interrupt_value: dict[str, Any],
+        *,
+        interrupt_id: str | None = None,
+        pending_interrupt_count: int | None = None,
+        chat_turn_id: str | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return interrupt.format_interrupt_request(
+            interrupt_value,
+            interrupt_id=interrupt_id,
+            pending_interrupt_count=pending_interrupt_count,
+            chat_turn_id=chat_turn_id,
+            emitter=emitter,
+        )
+
+    def format_subagent_start(
+        self,
+        *,
+        subagent_run_id: str,
+        subagent_type: str,
+        parent_tool_call_id: str,
+        chat_turn_id: str | None = None,
+        description: str | None = None,
+        started_at: str | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return subagent_lifecycle.format_subagent_start(
+            subagent_run_id=subagent_run_id,
+            subagent_type=subagent_type,
+            parent_tool_call_id=parent_tool_call_id,
+            chat_turn_id=chat_turn_id,
+            description=description,
+            started_at=started_at,
+            emitter=emitter,
+        )
+
+    def format_subagent_finish(
+        self,
+        *,
+        subagent_run_id: str,
+        subagent_type: str,
+        parent_tool_call_id: str,
+        status: str = "completed",
+        ended_at: str | None = None,
+        duration_ms: int | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return subagent_lifecycle.format_subagent_finish(
+            subagent_run_id=subagent_run_id,
+            subagent_type=subagent_type,
+            parent_tool_call_id=parent_tool_call_id,
+            status=status,
+            ended_at=ended_at,
+            duration_ms=duration_ms,
+            emitter=emitter,
+        )
+
+    def format_subagent_error(
+        self,
+        *,
+        subagent_run_id: str,
+        subagent_type: str,
+        parent_tool_call_id: str,
+        error_text: str,
+        error_type: str | None = None,
+        ended_at: str | None = None,
+        duration_ms: int | None = None,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return subagent_lifecycle.format_subagent_error(
+            subagent_run_id=subagent_run_id,
+            subagent_type=subagent_type,
+            parent_tool_call_id=parent_tool_call_id,
+            error_text=error_text,
+            error_type=error_type,
+            ended_at=ended_at,
+            duration_ms=duration_ms,
+            emitter=emitter,
+        )
+
+    def format_action_log(
+        self,
+        payload: dict[str, Any],
+        *,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return action_log.format_action_log(payload, emitter=emitter)
+
+    def format_action_log_updated(
+        self,
+        payload: dict[str, Any],
+        *,
+        emitter: Emitter | None = None,
+    ) -> str:
+        return action_log.format_action_log_updated(payload, emitter=emitter)
diff --git a/surfsense_backend/app/services/task_dispatcher.py b/surfsense_backend/app/services/task_dispatcher.py
index 9a6fc7d63c..210084102d 100644
--- a/surfsense_backend/app/services/task_dispatcher.py
+++ b/surfsense_backend/app/services/task_dispatcher.py
@@ -19,6 +19,8 @@ async def dispatch_file_processing(
         search_space_id: int,
         user_id: str,
         should_summarize: bool = False,
+        use_vision_llm: bool = False,
+        processing_mode: str = "basic",
     ) -> None: ...
 
 
@@ -34,6 +36,8 @@ async def dispatch_file_processing(
         search_space_id: int,
         user_id: str,
         should_summarize: bool = False,
+        use_vision_llm: bool = False,
+        processing_mode: str = "basic",
     ) -> None:
         from app.tasks.celery_tasks.document_tasks import (
             process_file_upload_with_document_task,
@@ -46,6 +50,8 @@ async def dispatch_file_processing(
             search_space_id=search_space_id,
             user_id=user_id,
             should_summarize=should_summarize,
+            use_vision_llm=use_vision_llm,
+            processing_mode=processing_mode,
         )
 
 
diff --git a/surfsense_backend/app/services/token_quota_service.py b/surfsense_backend/app/services/token_quota_service.py
new file mode 100644
index 0000000000..310c3eb5e4
--- /dev/null
+++ b/surfsense_backend/app/services/token_quota_service.py
@@ -0,0 +1,704 @@
+"""
+Atomic token quota service for anonymous and registered users.
+
+Provides reserve/finalize/release/get_usage operations with race-safe
+implementation using Redis Lua scripts (anonymous) and Postgres row locks
+(registered premium).
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from enum import StrEnum
+from typing import Any
+
+import redis.asyncio as aioredis
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Per-call reservation estimator (USD micro-units)
+# ---------------------------------------------------------------------------
+
+# Minimum reserve in micros so a user with $0.0001 left can still make a tiny
+# request, and so models without registered pricing reserve at least
+# something while the call runs (debited 0 at finalize anyway when their
+# cost can't be resolved).
+_QUOTA_MIN_RESERVE_MICROS = 100
+
+
+def estimate_call_reserve_micros(
+    *,
+    base_model: str,
+    quota_reserve_tokens: int | None,
+) -> int:
+    """Return the number of micro-USD to reserve for one premium call.
+
+    Computes a worst-case upper bound from LiteLLM's per-token pricing
+    table:
+
+        reserve_usd ≈ reserve_tokens x (input_cost + output_cost)
+
+    so the math scales with model cost — Claude Opus + 4K reserve_tokens
+    naturally reserves ≈ $0.36, while a cheap model reserves only a few
+    cents. Clamped to ``[_QUOTA_MIN_RESERVE_MICROS, QUOTA_MAX_RESERVE_MICROS]``
+    so a misconfigured "$1000/M" model can't lock the whole balance on
+    one call.
+
+    If ``litellm.get_model_info`` raises (model unknown) we fall back to
+    the floor — 100 micros / $0.0001 — which is enough to gate a sane
+    request without over-reserving for a model whose pricing the
+    operator hasn't declared yet.
+    """
+    reserve_tokens = quota_reserve_tokens or config.QUOTA_MAX_RESERVE_PER_CALL
+    if reserve_tokens <= 0:
+        reserve_tokens = config.QUOTA_MAX_RESERVE_PER_CALL
+
+    try:
+        from litellm import get_model_info
+
+        info = get_model_info(base_model) if base_model else {}
+        input_cost = float(info.get("input_cost_per_token") or 0.0)
+        output_cost = float(info.get("output_cost_per_token") or 0.0)
+    except Exception as exc:
+        logger.debug(
+            "[quota_reserve] cost lookup failed for base_model=%s: %s",
+            base_model,
+            exc,
+        )
+        input_cost = 0.0
+        output_cost = 0.0
+
+    if input_cost == 0.0 and output_cost == 0.0:
+        return _QUOTA_MIN_RESERVE_MICROS
+
+    reserve_usd = reserve_tokens * (input_cost + output_cost)
+    reserve_micros = round(reserve_usd * 1_000_000)
+    if reserve_micros < _QUOTA_MIN_RESERVE_MICROS:
+        reserve_micros = _QUOTA_MIN_RESERVE_MICROS
+    if reserve_micros > config.QUOTA_MAX_RESERVE_MICROS:
+        reserve_micros = config.QUOTA_MAX_RESERVE_MICROS
+    return reserve_micros
+
+
+class QuotaScope(StrEnum):
+    ANONYMOUS = "anonymous"
+    PREMIUM = "premium"
+
+
+class QuotaStatus(StrEnum):
+    OK = "ok"
+    WARNING = "warning"
+    BLOCKED = "blocked"
+
+
+class QuotaResult:
+    __slots__ = ("allowed", "limit", "remaining", "reserved", "status", "used")
+
+    def __init__(
+        self,
+        allowed: bool,
+        status: QuotaStatus,
+        used: int,
+        limit: int,
+        reserved: int = 0,
+        remaining: int = 0,
+    ):
+        self.allowed = allowed
+        self.status = status
+        self.used = used
+        self.limit = limit
+        self.reserved = reserved
+        self.remaining = remaining
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "allowed": self.allowed,
+            "status": self.status.value,
+            "used": self.used,
+            "limit": self.limit,
+            "reserved": self.reserved,
+            "remaining": self.remaining,
+        }
+
+
+# ---------------------------------------------------------------------------
+# Redis Lua scripts for atomic anonymous quota operations
+# ---------------------------------------------------------------------------
+
+# KEYS[1] = quota key (e.g. "anon_quota:session:<session_id>")
+# ARGV[1] = reserve_tokens
+# ARGV[2] = limit
+# ARGV[3] = warning_threshold
+# ARGV[4] = request_id
+# ARGV[5] = ttl_seconds
+# Returns: [allowed(0/1), status("ok"/"warning"/"blocked"), used, reserved]
+_RESERVE_LUA = """
+local key = KEYS[1]
+local reserve = tonumber(ARGV[1])
+local limit = tonumber(ARGV[2])
+local warning = tonumber(ARGV[3])
+local req_id = ARGV[4]
+local ttl = tonumber(ARGV[5])
+
+local used = tonumber(redis.call('HGET', key, 'used') or '0')
+local reserved = tonumber(redis.call('HGET', key, 'reserved') or '0')
+
+local effective = used + reserved + reserve
+if effective > limit then
+    return {0, 'blocked', used, reserved}
+end
+
+redis.call('HINCRBY', key, 'reserved', reserve)
+redis.call('HSET', key, 'req:' .. req_id, reserve)
+redis.call('EXPIRE', key, ttl)
+
+local new_reserved = reserved + reserve
+local status = 'ok'
+if (used + new_reserved) >= warning then
+    status = 'warning'
+end
+
+return {1, status, used, new_reserved}
+"""
+
+# KEYS[1] = quota key
+# ARGV[1] = request_id
+# ARGV[2] = actual_tokens
+# ARGV[3] = warning_threshold
+# Returns: [used, reserved, status]
+_FINALIZE_LUA = """
+local key = KEYS[1]
+local req_id = ARGV[1]
+local actual = tonumber(ARGV[2])
+local warning = tonumber(ARGV[3])
+
+local orig_reserve = tonumber(redis.call('HGET', key, 'req:' .. req_id) or '0')
+if orig_reserve == 0 then
+    return {tonumber(redis.call('HGET', key, 'used') or '0'), tonumber(redis.call('HGET', key, 'reserved') or '0'), 'ok'}
+end
+
+redis.call('HDEL', key, 'req:' .. req_id)
+redis.call('HINCRBY', key, 'reserved', -orig_reserve)
+redis.call('HINCRBY', key, 'used', actual)
+
+local used = tonumber(redis.call('HGET', key, 'used') or '0')
+local reserved = tonumber(redis.call('HGET', key, 'reserved') or '0')
+local status = 'ok'
+if used >= warning then
+    status = 'warning'
+end
+return {used, reserved, status}
+"""
+
+# KEYS[1] = quota key
+# ARGV[1] = request_id
+# Returns: 1 if released, 0 if not found
+_RELEASE_LUA = """
+local key = KEYS[1]
+local req_id = ARGV[1]
+
+local orig_reserve = tonumber(redis.call('HGET', key, 'req:' .. req_id) or '0')
+if orig_reserve == 0 then
+    return 0
+end
+
+redis.call('HDEL', key, 'req:' .. req_id)
+redis.call('HINCRBY', key, 'reserved', -orig_reserve)
+return 1
+"""
+
+
+def _get_anon_redis() -> aioredis.Redis:
+    return aioredis.from_url(config.REDIS_APP_URL, decode_responses=True)
+
+
+def compute_anon_identity_key(
+    session_id: str,
+    ip_hash: str | None = None,
+) -> str:
+    """Build the Redis hash key for anonymous quota tracking.
+
+    Uses the signed session cookie as primary identity. The IP hash
+    is tracked separately so cookie-reset evasion is caught.
+    """
+    return f"anon_quota:session:{session_id}"
+
+
+def compute_ip_quota_key(ip_address: str) -> str:
+    """Build IP-only quota key. UA is excluded so rotating User-Agent cannot bypass limits."""
+    h = hashlib.sha256(ip_address.encode()).hexdigest()[:16]
+    return f"anon_quota:ip:{h}"
+
+
+# ---------------------------------------------------------------------------
+# Concurrent stream limiter (per-IP)
+# ---------------------------------------------------------------------------
+
+# Atomic acquire: returns 1 if slot acquired, 0 if at capacity.
+# KEYS[1] = stream counter key   ARGV[1] = max_concurrent   ARGV[2] = safety_ttl
+_ACQUIRE_STREAM_LUA = """
+local key = KEYS[1]
+local max_c = tonumber(ARGV[1])
+local ttl = tonumber(ARGV[2])
+local cur = tonumber(redis.call('GET', key) or '0')
+if cur >= max_c then
+    return 0
+end
+redis.call('INCR', key)
+redis.call('EXPIRE', key, ttl)
+return 1
+"""
+
+# Atomic release: DECR with floor at 0
+_RELEASE_STREAM_LUA = """
+local key = KEYS[1]
+local cur = tonumber(redis.call('GET', key) or '0')
+if cur <= 0 then
+    return 0
+end
+redis.call('DECR', key)
+return 1
+"""
+
+
+def compute_stream_slot_key(ip_address: str) -> str:
+    h = hashlib.sha256(ip_address.encode()).hexdigest()[:16]
+    return f"anon:streams:{h}"
+
+
+def compute_request_count_key(ip_address: str) -> str:
+    h = hashlib.sha256(ip_address.encode()).hexdigest()[:16]
+    return f"anon:reqcount:{h}"
+
+
+class TokenQuotaService:
+    """Unified quota service for anonymous (Redis) and premium (Postgres) scopes."""
+
+    # ------------------------------------------------------------------
+    # Concurrent stream limiter
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    async def anon_acquire_stream_slot(
+        ip_address: str,
+        max_concurrent: int = 2,
+        safety_ttl: int = 300,
+    ) -> bool:
+        key = compute_stream_slot_key(ip_address)
+        r = _get_anon_redis()
+        try:
+            result = await r.eval(
+                _ACQUIRE_STREAM_LUA, 1, key, str(max_concurrent), str(safety_ttl)
+            )
+            return bool(result)
+        finally:
+            await r.aclose()
+
+    @staticmethod
+    async def anon_release_stream_slot(ip_address: str) -> None:
+        key = compute_stream_slot_key(ip_address)
+        r = _get_anon_redis()
+        try:
+            await r.eval(_RELEASE_STREAM_LUA, 1, key)
+        finally:
+            await r.aclose()
+
+    # ------------------------------------------------------------------
+    # Per-IP request counter (for CAPTCHA triggering)
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    async def anon_increment_request_count(ip_address: str, ttl: int = 86400) -> int:
+        """Increment and return current request count for this IP. TTL resets daily."""
+        key = compute_request_count_key(ip_address)
+        r = _get_anon_redis()
+        try:
+            pipe = r.pipeline()
+            pipe.incr(key)
+            pipe.expire(key, ttl)
+            results = await pipe.execute()
+            return int(results[0])
+        finally:
+            await r.aclose()
+
+    @staticmethod
+    async def anon_get_request_count(ip_address: str) -> int:
+        key = compute_request_count_key(ip_address)
+        r = _get_anon_redis()
+        try:
+            val = await r.get(key)
+            return int(val) if val else 0
+        finally:
+            await r.aclose()
+
+    @staticmethod
+    async def anon_reset_request_count(ip_address: str) -> None:
+        key = compute_request_count_key(ip_address)
+        r = _get_anon_redis()
+        try:
+            await r.delete(key)
+        finally:
+            await r.aclose()
+
+    # ------------------------------------------------------------------
+    # Anonymous (Redis-backed)
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    async def anon_reserve(
+        session_key: str,
+        ip_key: str | None,
+        request_id: str,
+        reserve_tokens: int,
+    ) -> QuotaResult:
+        limit = config.ANON_TOKEN_LIMIT
+        warning = config.ANON_TOKEN_WARNING_THRESHOLD
+        ttl = config.ANON_TOKEN_QUOTA_TTL_DAYS * 86400
+
+        r = _get_anon_redis()
+        try:
+            result = await r.eval(
+                _RESERVE_LUA,
+                1,
+                session_key,
+                str(reserve_tokens),
+                str(limit),
+                str(warning),
+                request_id,
+                str(ttl),
+            )
+            allowed = bool(result[0])
+            status_str = result[1] if isinstance(result[1], str) else result[1].decode()
+            used = int(result[2])
+            reserved = int(result[3])
+
+            if ip_key:
+                ip_result = await r.eval(
+                    _RESERVE_LUA,
+                    1,
+                    ip_key,
+                    str(reserve_tokens),
+                    str(limit),
+                    str(warning),
+                    request_id,
+                    str(ttl),
+                )
+                ip_allowed = bool(ip_result[0])
+                ip_used = int(ip_result[2])
+                if not ip_allowed and allowed:
+                    await r.eval(_RELEASE_LUA, 1, session_key, request_id)
+                    allowed = False
+                    status_str = "blocked"
+                    used = max(used, ip_used)
+
+            status = QuotaStatus(status_str)
+            remaining = max(0, limit - used - reserved)
+            return QuotaResult(
+                allowed=allowed,
+                status=status,
+                used=used,
+                limit=limit,
+                reserved=reserved,
+                remaining=remaining,
+            )
+        finally:
+            await r.aclose()
+
+    @staticmethod
+    async def anon_finalize(
+        session_key: str,
+        ip_key: str | None,
+        request_id: str,
+        actual_tokens: int,
+    ) -> QuotaResult:
+        warning = config.ANON_TOKEN_WARNING_THRESHOLD
+        limit = config.ANON_TOKEN_LIMIT
+        r = _get_anon_redis()
+        try:
+            result = await r.eval(
+                _FINALIZE_LUA,
+                1,
+                session_key,
+                request_id,
+                str(actual_tokens),
+                str(warning),
+            )
+            used = int(result[0])
+            reserved = int(result[1])
+            status_str = result[2] if isinstance(result[2], str) else result[2].decode()
+
+            if ip_key:
+                await r.eval(
+                    _FINALIZE_LUA,
+                    1,
+                    ip_key,
+                    request_id,
+                    str(actual_tokens),
+                    str(warning),
+                )
+
+            status = QuotaStatus(status_str)
+            remaining = max(0, limit - used - reserved)
+            return QuotaResult(
+                allowed=True,
+                status=status,
+                used=used,
+                limit=limit,
+                reserved=reserved,
+                remaining=remaining,
+            )
+        finally:
+            await r.aclose()
+
+    @staticmethod
+    async def anon_release(
+        session_key: str,
+        ip_key: str | None,
+        request_id: str,
+    ) -> None:
+        r = _get_anon_redis()
+        try:
+            await r.eval(_RELEASE_LUA, 1, session_key, request_id)
+            if ip_key:
+                await r.eval(_RELEASE_LUA, 1, ip_key, request_id)
+        finally:
+            await r.aclose()
+
+    @staticmethod
+    async def anon_get_usage(session_key: str) -> QuotaResult:
+        limit = config.ANON_TOKEN_LIMIT
+        warning = config.ANON_TOKEN_WARNING_THRESHOLD
+        r = _get_anon_redis()
+        try:
+            data = await r.hgetall(session_key)
+            used = int(data.get("used", 0))
+            reserved = int(data.get("reserved", 0))
+            remaining = max(0, limit - used - reserved)
+
+            if used >= limit:
+                status = QuotaStatus.BLOCKED
+            elif used >= warning:
+                status = QuotaStatus.WARNING
+            else:
+                status = QuotaStatus.OK
+
+            return QuotaResult(
+                allowed=used < limit,
+                status=status,
+                used=used,
+                limit=limit,
+                reserved=reserved,
+                remaining=remaining,
+            )
+        finally:
+            await r.aclose()
+
+    # ------------------------------------------------------------------
+    # Premium (Postgres-backed)
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    async def premium_reserve(
+        db_session: AsyncSession,
+        user_id: Any,
+        request_id: str,
+        reserve_micros: int,
+    ) -> QuotaResult:
+        """Reserve ``reserve_micros`` (USD micro-units) from the user's
+        premium credit balance.
+
+        ``QuotaResult.used``/``limit``/``reserved``/``remaining`` are
+        all in micro-USD on this code path; callers (chat stream,
+        token-status route, FE display) convert to dollars by dividing
+        by 1_000_000.
+        """
+        from app.db import User
+
+        user = (
+            (
+                await db_session.execute(
+                    select(User).where(User.id == user_id).with_for_update(of=User)
+                )
+            )
+            .unique()
+            .scalar_one_or_none()
+        )
+        if user is None:
+            return QuotaResult(
+                allowed=False,
+                status=QuotaStatus.BLOCKED,
+                used=0,
+                limit=0,
+            )
+
+        limit = user.premium_credit_micros_limit
+        used = user.premium_credit_micros_used
+        reserved = user.premium_credit_micros_reserved
+
+        effective = used + reserved + reserve_micros
+        if effective > limit:
+            remaining = max(0, limit - used - reserved)
+            await db_session.rollback()
+            return QuotaResult(
+                allowed=False,
+                status=QuotaStatus.BLOCKED,
+                used=used,
+                limit=limit,
+                reserved=reserved,
+                remaining=remaining,
+            )
+
+        user.premium_credit_micros_reserved = reserved + reserve_micros
+        await db_session.commit()
+
+        new_reserved = reserved + reserve_micros
+        remaining = max(0, limit - used - new_reserved)
+        warning_threshold = int(limit * 0.8)
+
+        if (used + new_reserved) >= limit:
+            status = QuotaStatus.BLOCKED
+        elif (used + new_reserved) >= warning_threshold:
+            status = QuotaStatus.WARNING
+        else:
+            status = QuotaStatus.OK
+
+        return QuotaResult(
+            allowed=True,
+            status=status,
+            used=used,
+            limit=limit,
+            reserved=new_reserved,
+            remaining=remaining,
+        )
+
+    @staticmethod
+    async def premium_finalize(
+        db_session: AsyncSession,
+        user_id: Any,
+        request_id: str,
+        actual_micros: int,
+        reserved_micros: int,
+    ) -> QuotaResult:
+        """Settle the reservation: release ``reserved_micros`` and debit
+        ``actual_micros`` (the LiteLLM-reported provider cost in micro-USD).
+        """
+        from app.db import User
+
+        user = (
+            (
+                await db_session.execute(
+                    select(User).where(User.id == user_id).with_for_update(of=User)
+                )
+            )
+            .unique()
+            .scalar_one_or_none()
+        )
+        if user is None:
+            return QuotaResult(
+                allowed=False, status=QuotaStatus.BLOCKED, used=0, limit=0
+            )
+
+        user.premium_credit_micros_reserved = max(
+            0, user.premium_credit_micros_reserved - reserved_micros
+        )
+        user.premium_credit_micros_used = (
+            user.premium_credit_micros_used + actual_micros
+        )
+
+        await db_session.commit()
+
+        limit = user.premium_credit_micros_limit
+        used = user.premium_credit_micros_used
+        reserved = user.premium_credit_micros_reserved
+        remaining = max(0, limit - used - reserved)
+
+        warning_threshold = int(limit * 0.8)
+        if used >= limit:
+            status = QuotaStatus.BLOCKED
+        elif used >= warning_threshold:
+            status = QuotaStatus.WARNING
+        else:
+            status = QuotaStatus.OK
+
+        return QuotaResult(
+            allowed=True,
+            status=status,
+            used=used,
+            limit=limit,
+            reserved=reserved,
+            remaining=remaining,
+        )
+
+    @staticmethod
+    async def premium_release(
+        db_session: AsyncSession,
+        user_id: Any,
+        reserved_micros: int,
+    ) -> None:
+        """Release ``reserved_micros`` previously held by ``premium_reserve``.
+
+        Used when a request fails before finalize (so the reservation
+        doesn't leak credit).
+        """
+        from app.db import User
+
+        user = (
+            (
+                await db_session.execute(
+                    select(User).where(User.id == user_id).with_for_update(of=User)
+                )
+            )
+            .unique()
+            .scalar_one_or_none()
+        )
+        if user is not None:
+            user.premium_credit_micros_reserved = max(
+                0, user.premium_credit_micros_reserved - reserved_micros
+            )
+            await db_session.commit()
+
+    @staticmethod
+    async def premium_get_usage(
+        db_session: AsyncSession,
+        user_id: Any,
+    ) -> QuotaResult:
+        from app.db import User
+
+        user = (
+            (await db_session.execute(select(User).where(User.id == user_id)))
+            .unique()
+            .scalar_one_or_none()
+        )
+        if user is None:
+            return QuotaResult(
+                allowed=False, status=QuotaStatus.BLOCKED, used=0, limit=0
+            )
+
+        limit = user.premium_credit_micros_limit
+        used = user.premium_credit_micros_used
+        reserved = user.premium_credit_micros_reserved
+        remaining = max(0, limit - used - reserved)
+
+        warning_threshold = int(limit * 0.8)
+        if used >= limit:
+            status = QuotaStatus.BLOCKED
+        elif used >= warning_threshold:
+            status = QuotaStatus.WARNING
+        else:
+            status = QuotaStatus.OK
+
+        return QuotaResult(
+            allowed=used < limit,
+            status=status,
+            used=used,
+            limit=limit,
+            reserved=reserved,
+            remaining=remaining,
+        )
diff --git a/surfsense_backend/app/services/token_tracking_service.py b/surfsense_backend/app/services/token_tracking_service.py
new file mode 100644
index 0000000000..9406d9be44
--- /dev/null
+++ b/surfsense_backend/app/services/token_tracking_service.py
@@ -0,0 +1,433 @@
+"""
+Token usage tracking via LiteLLM custom callback.
+
+Uses a ContextVar-scoped accumulator to group all LLM calls within a single
+async request/turn. The accumulated data is emitted via SSE and persisted
+when the frontend calls appendMessage.
+
+The module also provides ``record_token_usage``, a thin async helper that
+creates a ``TokenUsage`` row for *any* usage type (chat, indexing, image
+generation, podcasts, …).  Call sites should prefer this helper over
+constructing ``TokenUsage`` manually so that logging and error handling
+stay consistent.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import logging
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+from contextvars import ContextVar
+from dataclasses import dataclass, field
+from typing import Any
+from uuid import UUID
+
+import litellm
+from litellm.integrations.custom_logger import CustomLogger
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import TokenUsage
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TokenCallRecord:
+    model: str
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+    cost_micros: int = 0
+    call_kind: str = "chat"
+
+
+@dataclass
+class TurnTokenAccumulator:
+    """Accumulates token usage across all LLM calls within a single user turn."""
+
+    calls: list[TokenCallRecord] = field(default_factory=list)
+
+    def add(
+        self,
+        model: str,
+        prompt_tokens: int,
+        completion_tokens: int,
+        total_tokens: int,
+        cost_micros: int = 0,
+        call_kind: str = "chat",
+    ) -> None:
+        self.calls.append(
+            TokenCallRecord(
+                model=model,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+                cost_micros=cost_micros,
+                call_kind=call_kind,
+            )
+        )
+
+    def per_message_summary(self) -> dict[str, dict[str, int]]:
+        """Return token counts (and cost) grouped by model name."""
+        by_model: dict[str, dict[str, int]] = {}
+        for c in self.calls:
+            entry = by_model.setdefault(
+                c.model,
+                {
+                    "prompt_tokens": 0,
+                    "completion_tokens": 0,
+                    "total_tokens": 0,
+                    "cost_micros": 0,
+                },
+            )
+            entry["prompt_tokens"] += c.prompt_tokens
+            entry["completion_tokens"] += c.completion_tokens
+            entry["total_tokens"] += c.total_tokens
+            entry["cost_micros"] += c.cost_micros
+        return by_model
+
+    @property
+    def grand_total(self) -> int:
+        return sum(c.total_tokens for c in self.calls)
+
+    @property
+    def total_prompt_tokens(self) -> int:
+        return sum(c.prompt_tokens for c in self.calls)
+
+    @property
+    def total_completion_tokens(self) -> int:
+        return sum(c.completion_tokens for c in self.calls)
+
+    @property
+    def total_cost_micros(self) -> int:
+        """Sum of per-call ``cost_micros`` across the entire turn.
+
+        Used by ``stream_new_chat`` to debit a premium turn's actual
+        provider cost (in micro-USD) from the user's premium credit
+        balance. ``cost_micros`` per call is captured by
+        ``TokenTrackingCallback.async_log_success_event`` from
+        ``kwargs["response_cost"]`` (LiteLLM's auto-calculated cost),
+        with multiple fallback paths so OpenRouter dynamic models and
+        custom Azure deployments still bill correctly when our
+        ``pricing_registration`` ran at startup.
+        """
+        return sum(c.cost_micros for c in self.calls)
+
+    def serialized_calls(self) -> list[dict[str, Any]]:
+        return [dataclasses.asdict(c) for c in self.calls]
+
+
+_turn_accumulator: ContextVar[TurnTokenAccumulator | None] = ContextVar(
+    "_turn_accumulator", default=None
+)
+
+
+def start_turn() -> TurnTokenAccumulator:
+    """Create a fresh accumulator for the current async context and return it.
+
+    NOTE: Used by ``stream_new_chat`` for the long-lived chat turn. For
+    short-lived per-call billable wrappers (image generation REST endpoint,
+    vision LLM during indexing) prefer :func:`scoped_turn`, which uses a
+    ContextVar reset token to restore the *previous* accumulator on exit and
+    avoids leaking call records across reservations (issue B).
+    """
+    acc = TurnTokenAccumulator()
+    _turn_accumulator.set(acc)
+    logger.info("[TokenTracking] start_turn: new accumulator created (id=%s)", id(acc))
+    return acc
+
+
+def get_current_accumulator() -> TurnTokenAccumulator | None:
+    return _turn_accumulator.get()
+
+
+@asynccontextmanager
+async def scoped_turn() -> AsyncIterator[TurnTokenAccumulator]:
+    """Async context manager that scopes a fresh ``TurnTokenAccumulator``
+    for the duration of the ``async with`` block, then *resets* the
+    ContextVar to its previous value on exit.
+
+    This is the safe primitive for per-call billable operations
+    (image generation, vision LLM extraction, podcasts) that may run
+    inside an outer chat turn or be called sequentially from the same
+    background worker. Using ``ContextVar.set`` without ``reset`` (as
+    :func:`start_turn` does) would leak the inner accumulator into the
+    outer scope, causing the outer chat turn to debit cost twice.
+
+    Usage::
+
+        async with scoped_turn() as acc:
+            await llm.ainvoke(...)
+            # acc.total_cost_micros captures cost from the LiteLLM callback
+        # Outer accumulator (if any) is restored here.
+    """
+    acc = TurnTokenAccumulator()
+    token = _turn_accumulator.set(acc)
+    logger.debug(
+        "[TokenTracking] scoped_turn: enter (acc id=%s, prev token=%s)",
+        id(acc),
+        token,
+    )
+    try:
+        yield acc
+    finally:
+        _turn_accumulator.reset(token)
+        logger.debug(
+            "[TokenTracking] scoped_turn: exit (acc id=%s captured %d call(s), %d micros total)",
+            id(acc),
+            len(acc.calls),
+            acc.total_cost_micros,
+        )
+
+
+def _extract_cost_usd(
+    kwargs: dict[str, Any],
+    response_obj: Any,
+    model: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+    is_image: bool = False,
+) -> float:
+    """Best-effort USD cost extraction for a single LLM/image call.
+
+    Tries four sources in priority order and returns the first that
+    yields a positive number; returns 0.0 if all four fail (the call
+    will then debit nothing from the user's balance — fail-safe).
+
+    Sources:
+      1. ``kwargs["response_cost"]`` — LiteLLM's standard callback
+         field, populated for ``Router.acompletion`` since PR #12500.
+      2. ``response_obj._hidden_params["response_cost"]`` — same value
+         exposed on the response itself.
+      3. ``litellm.completion_cost(completion_response=response_obj)``
+         — recompute from the response and LiteLLM's pricing table.
+      4. ``litellm.cost_per_token(model, prompt_tokens, completion_tokens)``
+         — manual fallback for OpenRouter/custom-Azure models that
+         only resolve via aliases registered by
+         ``pricing_registration`` at startup. **Skipped for image
+         responses** — ``cost_per_token`` does not support ``ImageResponse``
+         and would raise; the cost map for image-gen lives in different
+         keys (``output_cost_per_image``) handled by ``completion_cost``.
+    """
+    cost = kwargs.get("response_cost")
+    if cost is not None:
+        try:
+            value = float(cost)
+        except (TypeError, ValueError):
+            value = 0.0
+        if value > 0:
+            return value
+
+    hidden = getattr(response_obj, "_hidden_params", None) or {}
+    if isinstance(hidden, dict):
+        cost = hidden.get("response_cost")
+        if cost is not None:
+            try:
+                value = float(cost)
+            except (TypeError, ValueError):
+                value = 0.0
+            if value > 0:
+                return value
+
+    try:
+        value = float(litellm.completion_cost(completion_response=response_obj))
+        if value > 0:
+            return value
+    except Exception as exc:
+        if is_image:
+            # Image-gen path: OpenRouter's image responses can omit
+            # ``usage.cost`` and LiteLLM's ``default_image_cost_calculator``
+            # then *raises* (no cost map for OpenRouter image models).
+            # Bail out with a warning rather than falling through to
+            # cost_per_token (which is also incompatible with ImageResponse).
+            logger.warning(
+                "[TokenTracking] completion_cost failed for image model=%s "
+                "(provider may have omitted usage.cost). Debiting 0. "
+                "Cause: %s",
+                model,
+                exc,
+            )
+            return 0.0
+        logger.debug(
+            "[TokenTracking] completion_cost failed for model=%s: %s", model, exc
+        )
+
+    if is_image:
+        # Never call cost_per_token for ImageResponse — keys mismatch and
+        # the function is documented chat-only.
+        return 0.0
+
+    if model and (prompt_tokens > 0 or completion_tokens > 0):
+        try:
+            prompt_cost, completion_cost = litellm.cost_per_token(
+                model=model,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )
+            value = float(prompt_cost) + float(completion_cost)
+            if value > 0:
+                return value
+        except Exception as exc:
+            logger.debug(
+                "[TokenTracking] cost_per_token failed for model=%s: %s", model, exc
+            )
+
+    return 0.0
+
+
+class TokenTrackingCallback(CustomLogger):
+    """LiteLLM callback that captures token usage into the turn accumulator."""
+
+    async def async_log_success_event(
+        self,
+        kwargs: dict[str, Any],
+        response_obj: Any,
+        start_time: Any,
+        end_time: Any,
+    ) -> None:
+        acc = _turn_accumulator.get()
+        if acc is None:
+            logger.debug(
+                "[TokenTracking] async_log_success_event fired but no accumulator in context"
+            )
+            return
+
+        # Detect image generation responses — they have a different usage
+        # shape (ImageUsage with input_tokens/output_tokens) and require a
+        # different cost-extraction path. We probe by class name to avoid a
+        # hard import dependency on litellm internals.
+        response_cls = type(response_obj).__name__
+        is_image = response_cls == "ImageResponse"
+
+        usage = getattr(response_obj, "usage", None)
+        if not usage:
+            logger.debug(
+                "[TokenTracking] async_log_success_event fired but response has no usage data"
+            )
+            return
+
+        if is_image:
+            # ``ImageUsage`` exposes ``input_tokens`` / ``output_tokens``
+            # (not prompt_tokens/completion_tokens). Several providers
+            # populate only one or neither (e.g. OpenRouter's gpt-image-1
+            # passes through `input_tokens` from the prompt but no
+            # completion); fall through gracefully to 0.
+            prompt_tokens = getattr(usage, "input_tokens", 0) or 0
+            completion_tokens = getattr(usage, "output_tokens", 0) or 0
+            total_tokens = (
+                getattr(usage, "total_tokens", 0) or prompt_tokens + completion_tokens
+            )
+            call_kind = "image_generation"
+        else:
+            prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0
+            completion_tokens = getattr(usage, "completion_tokens", 0) or 0
+            total_tokens = getattr(usage, "total_tokens", 0) or 0
+            call_kind = "chat"
+
+        model = kwargs.get("model", "unknown")
+
+        cost_usd = _extract_cost_usd(
+            kwargs=kwargs,
+            response_obj=response_obj,
+            model=model,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            is_image=is_image,
+        )
+        cost_micros = round(cost_usd * 1_000_000) if cost_usd > 0 else 0
+
+        if cost_micros == 0 and (prompt_tokens > 0 or completion_tokens > 0):
+            logger.warning(
+                "[TokenTracking] No cost resolved for model=%s prompt=%d completion=%d "
+                "kind=%s — debiting 0. Register pricing via pricing_registration or YAML "
+                "input_cost_per_token/output_cost_per_token (or rely on response_cost "
+                "for image generation).",
+                model,
+                prompt_tokens,
+                completion_tokens,
+                call_kind,
+            )
+
+        acc.add(
+            model=model,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            cost_micros=cost_micros,
+            call_kind=call_kind,
+        )
+        logger.info(
+            "[TokenTracking] Captured: model=%s kind=%s prompt=%d completion=%d total=%d "
+            "cost=$%.6f (%d micros) (accumulator now has %d calls)",
+            model,
+            call_kind,
+            prompt_tokens,
+            completion_tokens,
+            total_tokens,
+            cost_usd,
+            cost_micros,
+            len(acc.calls),
+        )
+
+
+token_tracker = TokenTrackingCallback()
+
+
+# ---------------------------------------------------------------------------
+# Persistence helper
+# ---------------------------------------------------------------------------
+
+
+async def record_token_usage(
+    session: AsyncSession,
+    *,
+    usage_type: str,
+    search_space_id: int,
+    user_id: UUID,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
+    total_tokens: int = 0,
+    cost_micros: int = 0,
+    model_breakdown: dict[str, Any] | None = None,
+    call_details: dict[str, Any] | None = None,
+    thread_id: int | None = None,
+    message_id: int | None = None,
+) -> TokenUsage | None:
+    """Persist a single ``TokenUsage`` row.
+
+    Returns the record on success, ``None`` if persistence failed (the
+    failure is logged but never propagated so callers don't need to
+    wrap this in try/except).
+    """
+    try:
+        record = TokenUsage(
+            usage_type=usage_type,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            cost_micros=cost_micros,
+            model_breakdown=model_breakdown,
+            call_details=call_details,
+            thread_id=thread_id,
+            message_id=message_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+        )
+        session.add(record)
+        logger.debug(
+            "[TokenTracking] recorded %s usage: prompt=%d completion=%d total=%d cost_micros=%d",
+            usage_type,
+            prompt_tokens,
+            completion_tokens,
+            total_tokens,
+            cost_micros,
+        )
+        return record
+    except Exception:
+        logger.warning(
+            "[TokenTracking] failed to record %s token usage",
+            usage_type,
+            exc_info=True,
+        )
+        return None
diff --git a/surfsense_backend/app/services/turnstile_service.py b/surfsense_backend/app/services/turnstile_service.py
new file mode 100644
index 0000000000..b3c6f0cccd
--- /dev/null
+++ b/surfsense_backend/app/services/turnstile_service.py
@@ -0,0 +1,52 @@
+"""Cloudflare Turnstile CAPTCHA verification service."""
+
+from __future__ import annotations
+
+import logging
+
+import httpx
+
+from app.config import config
+
+logger = logging.getLogger(__name__)
+
+TURNSTILE_VERIFY_URL = "https://challenges.cloudflare.com/turnstile/v0/siteverify"
+
+
+async def verify_turnstile_token(token: str, remote_ip: str | None = None) -> bool:
+    """Verify a Turnstile response token with Cloudflare.
+
+    Returns True when the token is valid and the challenge was solved by a
+    real user.  Returns False (never raises) on network errors or invalid
+    tokens so callers can treat it as a simple boolean gate.
+    """
+    if not config.TURNSTILE_ENABLED:
+        return True
+
+    secret = config.TURNSTILE_SECRET_KEY
+    if not secret:
+        logger.warning("TURNSTILE_SECRET_KEY is not set; skipping verification")
+        return True
+
+    payload: dict[str, str] = {
+        "secret": secret,
+        "response": token,
+    }
+    if remote_ip:
+        payload["remoteip"] = remote_ip
+
+    try:
+        async with httpx.AsyncClient(timeout=10) as client:
+            resp = await client.post(TURNSTILE_VERIFY_URL, data=payload)
+            resp.raise_for_status()
+            data = resp.json()
+            success = data.get("success", False)
+            if not success:
+                logger.info(
+                    "Turnstile verification failed: %s",
+                    data.get("error-codes", []),
+                )
+            return bool(success)
+    except Exception:
+        logger.exception("Turnstile verification request failed")
+        return False
diff --git a/surfsense_backend/app/services/user_tool_allowlist.py b/surfsense_backend/app/services/user_tool_allowlist.py
new file mode 100644
index 0000000000..fdfa515605
--- /dev/null
+++ b/surfsense_backend/app/services/user_tool_allowlist.py
@@ -0,0 +1,184 @@
+"""User-scoped trusted-tools list backed by ``SearchSourceConnector.config``.
+
+Storage is per ``(user_id, search_space_id, connector_id)`` under
+``connector.config['trusted_tools']``. The list only ever encodes
+``allow`` decisions; coded ``deny`` rules cannot be overridden here.
+"""
+
+from __future__ import annotations
+
+import logging
+import uuid
+from collections import defaultdict
+from collections.abc import Awaitable, Callable
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm.attributes import flag_modified
+
+from app.agents.multi_agent_chat.constants import (
+    CONNECTOR_TYPE_TO_CONNECTOR_AGENT_MAPS,
+)
+from app.agents.new_chat.permissions import Rule, Ruleset
+from app.db import SearchSourceConnector, async_session_maker
+
+logger = logging.getLogger(__name__)
+
+_TRUSTED_TOOLS_KEY = "trusted_tools"
+
+TrustedToolSaver = Callable[[int, str], Awaitable[None]]
+
+
+async def _load_owned_connector(
+    session: AsyncSession,
+    *,
+    user_id: uuid.UUID,
+    connector_id: int,
+) -> SearchSourceConnector | None:
+    """Return the connector iff owned by ``user_id``, else ``None``."""
+    result = await session.execute(
+        select(SearchSourceConnector).where(
+            SearchSourceConnector.id == connector_id,
+            SearchSourceConnector.user_id == user_id,
+        )
+    )
+    return result.scalars().first()
+
+
+def _read_trusted(connector: SearchSourceConnector) -> list[str]:
+    config = connector.config or {}
+    raw = config.get(_TRUSTED_TOOLS_KEY, [])
+    if not isinstance(raw, list):
+        return []
+    return [str(item) for item in raw if isinstance(item, str)]
+
+
+def _write_trusted(connector: SearchSourceConnector, trusted: list[str]) -> None:
+    config = dict(connector.config or {})
+    config[_TRUSTED_TOOLS_KEY] = trusted
+    connector.config = config
+    flag_modified(connector, "config")
+
+
+async def add_user_trust(
+    session: AsyncSession,
+    *,
+    user_id: uuid.UUID,
+    connector_id: int,
+    tool_name: str,
+) -> list[str]:
+    """Append ``tool_name`` to the connector's trusted list; raise ``LookupError`` if not owned."""
+    connector = await _load_owned_connector(
+        session, user_id=user_id, connector_id=connector_id
+    )
+    if connector is None:
+        raise LookupError(f"connector {connector_id} not found for user {user_id}")
+
+    trusted = _read_trusted(connector)
+    if tool_name not in trusted:
+        trusted.append(tool_name)
+        _write_trusted(connector, trusted)
+        await session.flush()
+    return trusted
+
+
+async def remove_user_trust(
+    session: AsyncSession,
+    *,
+    user_id: uuid.UUID,
+    connector_id: int,
+    tool_name: str,
+) -> list[str]:
+    """Remove ``tool_name`` from the connector's trusted list; raise ``LookupError`` if not owned."""
+    connector = await _load_owned_connector(
+        session, user_id=user_id, connector_id=connector_id
+    )
+    if connector is None:
+        raise LookupError(f"connector {connector_id} not found for user {user_id}")
+
+    trusted = _read_trusted(connector)
+    if tool_name in trusted:
+        trusted = [t for t in trusted if t != tool_name]
+        _write_trusted(connector, trusted)
+        await session.flush()
+    return trusted
+
+
+async def fetch_user_allowlist_rulesets(
+    session: AsyncSession,
+    *,
+    user_id: uuid.UUID,
+    search_space_id: int,
+) -> dict[str, Ruleset]:
+    """Project the user's trusted tools into per-subagent ``allow`` rulesets.
+
+    Subagents with no trusted tools are absent from the result —
+    callers must treat ``missing == empty``.
+    """
+    result = await session.execute(
+        select(
+            SearchSourceConnector.id,
+            SearchSourceConnector.connector_type,
+            SearchSourceConnector.config,
+        ).where(
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.search_space_id == search_space_id,
+        )
+    )
+
+    rules_by_subagent: dict[str, list[Rule]] = defaultdict(list)
+    for _connector_id, connector_type, config in result.all():
+        subagent = CONNECTOR_TYPE_TO_CONNECTOR_AGENT_MAPS.get(str(connector_type))
+        if subagent is None:
+            continue
+
+        cfg = config or {}
+        raw = cfg.get(_TRUSTED_TOOLS_KEY, [])
+        if not isinstance(raw, list):
+            continue
+
+        for tool in raw:
+            if not isinstance(tool, str) or not tool:
+                continue
+            rules_by_subagent[subagent].append(
+                Rule(permission=tool, pattern="*", action="allow")
+            )
+
+    return {
+        subagent: Ruleset(rules=rules, origin=f"user_allowlist:{subagent}")
+        for subagent, rules in rules_by_subagent.items()
+    }
+
+
+def make_trusted_tool_saver(user_id: uuid.UUID) -> TrustedToolSaver:
+    """Bind ``user_id`` into a saver closure; failures are logged, never raised."""
+
+    async def trusted_tool_saver(connector_id: int, tool_name: str) -> None:
+        try:
+            async with async_session_maker() as session:
+                await add_user_trust(
+                    session,
+                    user_id=user_id,
+                    connector_id=connector_id,
+                    tool_name=tool_name,
+                )
+                await session.commit()
+        except LookupError as exc:
+            logger.warning("trusted-tool save skipped: %s", exc)
+        except Exception:
+            logger.exception(
+                "trusted-tool save failed for connector=%s tool=%s",
+                connector_id,
+                tool_name,
+            )
+
+    return trusted_tool_saver
+
+
+__all__ = [
+    "TrustedToolSaver",
+    "add_user_trust",
+    "fetch_user_allowlist_rulesets",
+    "make_trusted_tool_saver",
+    "remove_user_trust",
+]
diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py
deleted file mode 100644
index c28962b311..0000000000
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ /dev/null
@@ -1,158 +0,0 @@
-"""Vision autocomplete service — agent-based with scoped filesystem.
-
-Optimized pipeline:
-1. Start the SSE stream immediately so the UI shows progress.
-2. Derive a KB search query from window_title (no separate LLM call).
-3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL.
-4. Inject pre-computed KB files as initial state and stream the agent.
-"""
-
-import logging
-from collections.abc import AsyncGenerator
-
-from langchain_core.messages import HumanMessage
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
-from app.services.llm_service import get_vision_llm
-from app.services.new_streaming_service import VercelStreamingService
-
-logger = logging.getLogger(__name__)
-
-PREP_STEP_ID = "autocomplete-prep"
-
-
-def _derive_kb_query(app_name: str, window_title: str) -> str:
-    parts = [p for p in (window_title, app_name) if p]
-    return " ".join(parts)
-
-
-def _is_vision_unsupported_error(e: Exception) -> bool:
-    msg = str(e).lower()
-    return "content must be a string" in msg or "does not support image" in msg
-
-
-# ---------------------------------------------------------------------------
-# Main entry point
-# ---------------------------------------------------------------------------
-
-
-async def stream_vision_autocomplete(
-    screenshot_data_url: str,
-    search_space_id: int,
-    session: AsyncSession,
-    *,
-    app_name: str = "",
-    window_title: str = "",
-) -> AsyncGenerator[str, None]:
-    """Analyze a screenshot with a vision-LLM agent and stream a text completion."""
-    streaming = VercelStreamingService()
-    vision_error_msg = (
-        "The selected model does not support vision. "
-        "Please set a vision-capable model (e.g. GPT-4o, Gemini) in your search space settings."
-    )
-
-    llm = await get_vision_llm(session, search_space_id)
-    if not llm:
-        yield streaming.format_message_start()
-        yield streaming.format_error("No Vision LLM configured for this search space")
-        yield streaming.format_done()
-        return
-
-    # Start SSE stream immediately so the UI has something to show
-    yield streaming.format_message_start()
-
-    kb_query = _derive_kb_query(app_name, window_title)
-
-    # Show a preparation step while KB search + agent compile run
-    yield streaming.format_thinking_step(
-        step_id=PREP_STEP_ID,
-        title="Searching knowledge base",
-        status="in_progress",
-        items=[kb_query] if kb_query else [],
-    )
-
-    try:
-        agent, kb = await create_autocomplete_agent(
-            llm,
-            search_space_id=search_space_id,
-            kb_query=kb_query,
-            app_name=app_name,
-            window_title=window_title,
-        )
-    except Exception as e:
-        if _is_vision_unsupported_error(e):
-            logger.warning("Vision autocomplete: model does not support vision: %s", e)
-            yield streaming.format_error(vision_error_msg)
-            yield streaming.format_done()
-            return
-        logger.error("Failed to create autocomplete agent: %s", e, exc_info=True)
-        yield streaming.format_error("Autocomplete failed. Please try again.")
-        yield streaming.format_done()
-        return
-
-    has_kb = kb.has_documents
-    doc_count = len(kb.files) if has_kb else 0  # type: ignore[arg-type]
-
-    yield streaming.format_thinking_step(
-        step_id=PREP_STEP_ID,
-        title="Searching knowledge base",
-        status="complete",
-        items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"]
-        if kb_query
-        else ["Skipped"],
-    )
-
-    # Build agent input with pre-computed KB as initial state
-    if has_kb:
-        instruction = (
-            "Analyze this screenshot, then explore the knowledge base documents "
-            "listed above — read the chunk index of any document whose title "
-            "looks relevant and check matched chunks for useful facts. "
-            "Finally, generate a concise autocomplete for the active text area, "
-            "enhanced with any relevant KB information you found."
-        )
-    else:
-        instruction = (
-            "Analyze this screenshot and generate a concise autocomplete "
-            "for the active text area based on what you see."
-        )
-
-    user_message = HumanMessage(
-        content=[
-            {"type": "text", "text": instruction},
-            {"type": "image_url", "image_url": {"url": screenshot_data_url}},
-        ]
-    )
-
-    input_data: dict = {"messages": [user_message]}
-
-    if has_kb:
-        input_data["files"] = kb.files
-        input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]
-        logger.info(
-            "Autocomplete: injected %d KB files into agent initial state", doc_count
-        )
-    else:
-        logger.info(
-            "Autocomplete: no KB documents found, proceeding with screenshot only"
-        )
-
-    # Stream the agent (message_start already sent above)
-    try:
-        async for sse in stream_autocomplete_agent(
-            agent,
-            input_data,
-            streaming,
-            emit_message_start=False,
-        ):
-            yield sse
-    except Exception as e:
-        if _is_vision_unsupported_error(e):
-            logger.warning("Vision autocomplete: model does not support vision: %s", e)
-            yield streaming.format_error(vision_error_msg)
-            yield streaming.format_done()
-        else:
-            logger.error("Vision autocomplete streaming error: %s", e, exc_info=True)
-            yield streaming.format_error("Autocomplete failed. Please try again.")
-            yield streaming.format_done()
diff --git a/surfsense_backend/app/services/vision_llm_router_service.py b/surfsense_backend/app/services/vision_llm_router_service.py
index 0d782ab2b0..ed5de921c4 100644
--- a/surfsense_backend/app/services/vision_llm_router_service.py
+++ b/surfsense_backend/app/services/vision_llm_router_service.py
@@ -3,6 +3,8 @@
 
 from litellm import Router
 
+from app.services.provider_api_base import resolve_api_base
+
 logger = logging.getLogger(__name__)
 
 VISION_AUTO_MODE_ID = 0
@@ -108,10 +110,11 @@ def _config_to_deployment(cls, config: dict) -> dict | None:
             if not config.get("model_name") or not config.get("api_key"):
                 return None
 
+            provider = config.get("provider", "").upper()
             if config.get("custom_provider"):
-                model_string = f"{config['custom_provider']}/{config['model_name']}"
+                provider_prefix = config["custom_provider"]
+                model_string = f"{provider_prefix}/{config['model_name']}"
             else:
-                provider = config.get("provider", "").upper()
                 provider_prefix = VISION_PROVIDER_MAP.get(provider, provider.lower())
                 model_string = f"{provider_prefix}/{config['model_name']}"
 
@@ -120,8 +123,13 @@ def _config_to_deployment(cls, config: dict) -> dict | None:
                 "api_key": config.get("api_key"),
             }
 
-            if config.get("api_base"):
-                litellm_params["api_base"] = config["api_base"]
+            api_base = resolve_api_base(
+                provider=provider,
+                provider_prefix=provider_prefix,
+                config_api_base=config.get("api_base"),
+            )
+            if api_base:
+                litellm_params["api_base"] = api_base
 
             if config.get("api_version"):
                 litellm_params["api_version"] = config["api_version"]
diff --git a/surfsense_backend/app/tasks/celery_tasks/__init__.py b/surfsense_backend/app/tasks/celery_tasks/__init__.py
index 5b1f2cd136..b23359f36f 100644
--- a/surfsense_backend/app/tasks/celery_tasks/__init__.py
+++ b/surfsense_backend/app/tasks/celery_tasks/__init__.py
@@ -1,10 +1,25 @@
-"""Celery tasks package."""
+"""Celery tasks package.
+
+Also hosts the small helpers every async celery task should use to
+spin up its event loop. See :func:`run_async_celery_task` for the
+canonical pattern.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import logging
+from collections.abc import Awaitable, Callable
+from typing import TypeVar
 
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.pool import NullPool
 
 from app.config import config
 
+logger = logging.getLogger(__name__)
+
 _celery_engine = None
 _celery_session_maker = None
 
@@ -26,3 +41,86 @@ def get_celery_session_maker() -> async_sessionmaker:
             _celery_engine, expire_on_commit=False
         )
     return _celery_session_maker
+
+
+def _dispose_shared_db_engine(loop: asyncio.AbstractEventLoop) -> None:
+    """Drop the shared ``app.db.engine`` connection pool synchronously.
+
+    The shared engine (used by ``shielded_async_session`` and most
+    routes / services) is a module-level singleton with a real pool.
+    Each celery task creates a fresh ``asyncio`` event loop; asyncpg
+    connections cache a reference to whichever loop opened them. When
+    a subsequent task's loop pulls a stale connection from the pool,
+    SQLAlchemy's ``pool_pre_ping`` checkout crashes with::
+
+        AttributeError: 'NoneType' object has no attribute 'send'
+        File ".../asyncio/proactor_events.py", line 402, in _loop_writing
+            self._write_fut = self._loop._proactor.send(self._sock, data)
+
+    or hangs forever inside the asyncpg ``Connection._cancel`` cleanup
+    coroutine that can never run because its loop is gone.
+
+    Disposing the engine forces the pool to drop every cached
+    connection so the next checkout opens a fresh one on the current
+    loop. Safe to call from a task's finally block; failure is logged
+    but never propagated.
+    """
+    try:
+        from app.db import engine as shared_engine
+
+        loop.run_until_complete(shared_engine.dispose())
+    except Exception:
+        logger.warning("Shared DB engine dispose() failed", exc_info=True)
+
+
+T = TypeVar("T")
+
+
+def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T:
+    """Run an async coroutine inside a fresh event loop with proper
+    DB-engine cleanup.
+
+    This is the canonical entry point for every async celery task.
+    It performs three responsibilities that were previously copy-pasted
+    (incorrectly) across each task module:
+
+    1. Create a fresh ``asyncio`` loop and install it on the current
+       thread (celery's ``--pool=solo`` runs every task on the main
+       thread, but other pool types don't).
+    2. Dispose the shared ``app.db.engine`` BEFORE the task runs so
+       any stale connections left over from a previous task's loop
+       are dropped — defends against tasks that crashed without
+       cleaning up.
+    3. Dispose the shared engine AFTER the task runs so the
+       connections we opened on this loop are released before the
+       loop closes (avoids ``coroutine 'Connection._cancel' was
+       never awaited`` warnings and the next-task hang).
+
+    Use as::
+
+        @celery_app.task(name="my_task", bind=True)
+        def my_task(self, *args):
+            return run_async_celery_task(lambda: _my_task_impl(*args))
+    """
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        # Defense-in-depth: prior task may have crashed before
+        # disposing. Idempotent — no-op if pool is already empty.
+        _dispose_shared_db_engine(loop)
+        return loop.run_until_complete(coro_factory())
+    finally:
+        # Drop any connections this task opened so they don't leak
+        # into the next task's loop.
+        _dispose_shared_db_engine(loop)
+        with contextlib.suppress(Exception):
+            loop.run_until_complete(loop.shutdown_asyncgens())
+        with contextlib.suppress(Exception):
+            asyncio.set_event_loop(None)
+        loop.close()
+
+
+__all__ = [
+    "get_celery_session_maker",
+    "run_async_celery_task",
+]
diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
index 57475c9fdb..08d96cfa0f 100644
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@@ -4,7 +4,7 @@
 import traceback
 
 from app.celery_app import celery_app
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 
 logger = logging.getLogger(__name__)
 
@@ -39,52 +39,6 @@ def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> N
         )
 
 
-@celery_app.task(name="index_slack_messages", bind=True)
-def index_slack_messages_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Slack messages."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_slack_messages(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    except Exception as e:
-        _handle_greenlet_error(e, "index_slack_messages", connector_id)
-        raise
-    finally:
-        loop.close()
-
-
-async def _index_slack_messages(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Slack messages with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_slack_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_slack_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
 @celery_app.task(name="index_notion_pages", bind=True)
 def index_notion_pages_task(
     self,
@@ -95,22 +49,15 @@ def index_notion_pages_task(
     end_date: str,
 ):
     """Celery task to index Notion pages."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     try:
-        loop.run_until_complete(
-            _index_notion_pages(
+        return run_async_celery_task(
+            lambda: _index_notion_pages(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
     except Exception as e:
         _handle_greenlet_error(e, "index_notion_pages", connector_id)
         raise
-    finally:
-        loop.close()
 
 
 async def _index_notion_pages(
@@ -141,19 +88,11 @@ def index_github_repos_task(
     end_date: str,
 ):
     """Celery task to index GitHub repositories."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_github_repos(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
+    return run_async_celery_task(
+        lambda: _index_github_repos(
+            connector_id, search_space_id, user_id, start_date, end_date
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_github_repos(
@@ -174,92 +113,6 @@ async def _index_github_repos(
         )
 
 
-@celery_app.task(name="index_linear_issues", bind=True)
-def index_linear_issues_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Linear issues."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_linear_issues(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_linear_issues(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Linear issues with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_linear_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_linear_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
-@celery_app.task(name="index_jira_issues", bind=True)
-def index_jira_issues_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Jira issues."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_jira_issues(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_jira_issues(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Jira issues with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_jira_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_jira_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
 @celery_app.task(name="index_confluence_pages", bind=True)
 def index_confluence_pages_task(
     self,
@@ -270,19 +123,11 @@ def index_confluence_pages_task(
     end_date: str,
 ):
     """Celery task to index Confluence pages."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_confluence_pages(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
+    return run_async_celery_task(
+        lambda: _index_confluence_pages(
+            connector_id, search_space_id, user_id, start_date, end_date
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_confluence_pages(
@@ -303,49 +148,6 @@ async def _index_confluence_pages(
         )
 
 
-@celery_app.task(name="index_clickup_tasks", bind=True)
-def index_clickup_tasks_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index ClickUp tasks."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_clickup_tasks(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_clickup_tasks(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index ClickUp tasks with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_clickup_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_clickup_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
 @celery_app.task(name="index_google_calendar_events", bind=True)
 def index_google_calendar_events_task(
     self,
@@ -356,22 +158,15 @@ def index_google_calendar_events_task(
     end_date: str,
 ):
     """Celery task to index Google Calendar events."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     try:
-        loop.run_until_complete(
-            _index_google_calendar_events(
+        return run_async_celery_task(
+            lambda: _index_google_calendar_events(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
     except Exception as e:
         _handle_greenlet_error(e, "index_google_calendar_events", connector_id)
         raise
-    finally:
-        loop.close()
 
 
 async def _index_google_calendar_events(
@@ -392,49 +187,6 @@ async def _index_google_calendar_events(
         )
 
 
-@celery_app.task(name="index_airtable_records", bind=True)
-def index_airtable_records_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Airtable records."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_airtable_records(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_airtable_records(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Airtable records with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_airtable_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_airtable_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
 @celery_app.task(name="index_google_gmail_messages", bind=True)
 def index_google_gmail_messages_task(
     self,
@@ -445,19 +197,11 @@ def index_google_gmail_messages_task(
     end_date: str,
 ):
     """Celery task to index Google Gmail messages."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_google_gmail_messages(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
+    return run_async_celery_task(
+        lambda: _index_google_gmail_messages(
+            connector_id, search_space_id, user_id, start_date, end_date
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_google_gmail_messages(
@@ -487,22 +231,14 @@ def index_google_drive_files_task(
     items_dict: dict,  # Dictionary with 'folders', 'files', and 'indexing_options'
 ):
     """Celery task to index Google Drive folders and files."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_google_drive_files(
-                connector_id,
-                search_space_id,
-                user_id,
-                items_dict,
-            )
+    return run_async_celery_task(
+        lambda: _index_google_drive_files(
+            connector_id,
+            search_space_id,
+            user_id,
+            items_dict,
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_google_drive_files(
@@ -535,22 +271,14 @@ def index_onedrive_files_task(
     items_dict: dict,
 ):
     """Celery task to index OneDrive folders and files."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_onedrive_files(
-                connector_id,
-                search_space_id,
-                user_id,
-                items_dict,
-            )
+    return run_async_celery_task(
+        lambda: _index_onedrive_files(
+            connector_id,
+            search_space_id,
+            user_id,
+            items_dict,
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_onedrive_files(
@@ -583,22 +311,14 @@ def index_dropbox_files_task(
     items_dict: dict,
 ):
     """Celery task to index Dropbox folders and files."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_dropbox_files(
-                connector_id,
-                search_space_id,
-                user_id,
-                items_dict,
-            )
+    return run_async_celery_task(
+        lambda: _index_dropbox_files(
+            connector_id,
+            search_space_id,
+            user_id,
+            items_dict,
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_dropbox_files(
@@ -622,135 +342,6 @@ async def _index_dropbox_files(
         )
 
 
-@celery_app.task(name="index_discord_messages", bind=True)
-def index_discord_messages_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Discord messages."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_discord_messages(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_discord_messages(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Discord messages with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_discord_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_discord_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
-@celery_app.task(name="index_teams_messages", bind=True)
-def index_teams_messages_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Microsoft Teams messages."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_teams_messages(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_teams_messages(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Microsoft Teams messages with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_teams_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_teams_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
-@celery_app.task(name="index_luma_events", bind=True)
-def index_luma_events_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Luma events."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_luma_events(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_luma_events(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Luma events with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_luma_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_luma_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
 @celery_app.task(name="index_elasticsearch_documents", bind=True)
 def index_elasticsearch_documents_task(
     self,
@@ -761,19 +352,11 @@ def index_elasticsearch_documents_task(
     end_date: str,
 ):
     """Celery task to index Elasticsearch documents."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_elasticsearch_documents(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
+    return run_async_celery_task(
+        lambda: _index_elasticsearch_documents(
+            connector_id, search_space_id, user_id, start_date, end_date
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_elasticsearch_documents(
@@ -804,22 +387,15 @@ def index_crawled_urls_task(
     end_date: str,
 ):
     """Celery task to index Web page Urls."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     try:
-        loop.run_until_complete(
-            _index_crawled_urls(
+        return run_async_celery_task(
+            lambda: _index_crawled_urls(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
     except Exception as e:
         _handle_greenlet_error(e, "index_crawled_urls", connector_id)
         raise
-    finally:
-        loop.close()
 
 
 async def _index_crawled_urls(
@@ -850,19 +426,11 @@ def index_bookstack_pages_task(
     end_date: str,
 ):
     """Celery task to index BookStack pages."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_bookstack_pages(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
+    return run_async_celery_task(
+        lambda: _index_bookstack_pages(
+            connector_id, search_space_id, user_id, start_date, end_date
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_bookstack_pages(
@@ -883,49 +451,6 @@ async def _index_bookstack_pages(
         )
 
 
-@celery_app.task(name="index_obsidian_vault", bind=True)
-def index_obsidian_vault_task(
-    self,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Celery task to index Obsidian vault notes."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_obsidian_vault(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
-        )
-    finally:
-        loop.close()
-
-
-async def _index_obsidian_vault(
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str,
-    end_date: str,
-):
-    """Index Obsidian vault with new session."""
-    from app.routes.search_source_connectors_routes import (
-        run_obsidian_indexing,
-    )
-
-    async with get_celery_session_maker()() as session:
-        await run_obsidian_indexing(
-            session, connector_id, search_space_id, user_id, start_date, end_date
-        )
-
-
 @celery_app.task(name="index_composio_connector", bind=True)
 def index_composio_connector_task(
     self,
@@ -936,19 +461,11 @@ def index_composio_connector_task(
     end_date: str | None,
 ):
     """Celery task to index Composio connector content (Google Drive, Gmail, Calendar via Composio)."""
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_composio_connector(
-                connector_id, search_space_id, user_id, start_date, end_date
-            )
+    return run_async_celery_task(
+        lambda: _index_composio_connector(
+            connector_id, search_space_id, user_id, start_date, end_date
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_composio_connector(
diff --git a/surfsense_backend/app/tasks/celery_tasks/document_reindex_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_reindex_tasks.py
index c2dbe77009..5d6bde6c14 100644
--- a/surfsense_backend/app/tasks/celery_tasks/document_reindex_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/document_reindex_tasks.py
@@ -11,7 +11,7 @@
 from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 
 logger = logging.getLogger(__name__)
 
@@ -25,15 +25,7 @@ def reindex_document_task(self, document_id: int, user_id: str):
         document_id: ID of document to reindex
         user_id: ID of user who edited the document
     """
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(_reindex_document(document_id, user_id))
-    finally:
-        loop.close()
+    return run_async_celery_task(lambda: _reindex_document(document_id, user_id))
 
 
 async def _reindex_document(document_id: int, user_id: str):
diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
index 62720826fc..c78e376bd4 100644
--- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
@@ -4,13 +4,14 @@
 import contextlib
 import logging
 import os
+import time
 from uuid import UUID
 
 from app.celery_app import celery_app
 from app.config import config
 from app.services.notification_service import NotificationService
 from app.services.task_logging_service import TaskLoggingService
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 from app.tasks.connector_indexers.local_folder_indexer import (
     index_local_folder,
     index_uploaded_files,
@@ -104,12 +105,7 @@ async def _run_heartbeat_loop(notification_id: int):
 )
 def delete_document_task(self, document_id: int):
     """Celery task to delete a document and its chunks in batches."""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        loop.run_until_complete(_delete_document_background(document_id))
-    finally:
-        loop.close()
+    return run_async_celery_task(lambda: _delete_document_background(document_id))
 
 
 async def _delete_document_background(document_id: int) -> None:
@@ -152,14 +148,9 @@ def delete_folder_documents_task(
     folder_subtree_ids: list[int] | None = None,
 ):
     """Celery task to delete documents first, then the folder rows."""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        loop.run_until_complete(
-            _delete_folder_documents(document_ids, folder_subtree_ids)
-        )
-    finally:
-        loop.close()
+    return run_async_celery_task(
+        lambda: _delete_folder_documents(document_ids, folder_subtree_ids)
+    )
 
 
 async def _delete_folder_documents(
@@ -208,12 +199,9 @@ async def _delete_folder_documents(
 )
 def delete_search_space_task(self, search_space_id: int):
     """Celery task to delete a search space and heavy child rows in batches."""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        loop.run_until_complete(_delete_search_space_background(search_space_id))
-    finally:
-        loop.close()
+    return run_async_celery_task(
+        lambda: _delete_search_space_background(search_space_id)
+    )
 
 
 async def _delete_search_space_background(search_space_id: int) -> None:
@@ -268,18 +256,11 @@ def process_extension_document_task(
         search_space_id: ID of the search space
         user_id: ID of the user
     """
-    # Create a new event loop for this task
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _process_extension_document(
-                individual_document_dict, search_space_id, user_id
-            )
+    return run_async_celery_task(
+        lambda: _process_extension_document(
+            individual_document_dict, search_space_id, user_id
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _process_extension_document(
@@ -418,13 +399,9 @@ def process_youtube_video_task(self, url: str, search_space_id: int, user_id: st
         search_space_id: ID of the search space
         user_id: ID of the user
     """
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(_process_youtube_video(url, search_space_id, user_id))
-    finally:
-        loop.close()
+    return run_async_celery_task(
+        lambda: _process_youtube_video(url, search_space_id, user_id)
+    )
 
 
 async def _process_youtube_video(url: str, search_space_id: int, user_id: str):
@@ -572,12 +549,9 @@ def process_file_upload_task(
     except Exception as e:
         logger.warning(f"[process_file_upload] Could not get file size: {e}")
 
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     try:
-        loop.run_until_complete(
-            _process_file_upload(file_path, filename, search_space_id, user_id)
+        run_async_celery_task(
+            lambda: _process_file_upload(file_path, filename, search_space_id, user_id)
         )
         logger.info(
             f"[process_file_upload] Task completed successfully for: {filename}"
@@ -588,8 +562,6 @@ def process_file_upload_task(
             f"Traceback:\n{traceback.format_exc()}"
         )
         raise
-    finally:
-        loop.close()
 
 
 async def _process_file_upload(
@@ -778,6 +750,8 @@ def process_file_upload_with_document_task(
     search_space_id: int,
     user_id: str,
     should_summarize: bool = False,
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
 ):
     """
     Celery task to process uploaded file with existing pending document.
@@ -808,31 +782,25 @@ def process_file_upload_with_document_task(
             "File may have been removed before syncing could start."
         )
         # Mark document as failed since file is missing
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            loop.run_until_complete(
-                _mark_document_failed(
-                    document_id,
-                    "File not found. Please re-upload the file.",
-                )
+        run_async_celery_task(
+            lambda: _mark_document_failed(
+                document_id,
+                "File not found. Please re-upload the file.",
             )
-        finally:
-            loop.close()
+        )
         return
 
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     try:
-        loop.run_until_complete(
-            _process_file_with_document(
+        run_async_celery_task(
+            lambda: _process_file_with_document(
                 document_id,
                 temp_path,
                 filename,
                 search_space_id,
                 user_id,
                 should_summarize=should_summarize,
+                use_vision_llm=use_vision_llm,
+                processing_mode=processing_mode,
             )
         )
         logger.info(
@@ -844,8 +812,6 @@ def process_file_upload_with_document_task(
             f"Traceback:\n{traceback.format_exc()}"
         )
         raise
-    finally:
-        loop.close()
 
 
 async def _mark_document_failed(document_id: int, reason: str):
@@ -869,6 +835,8 @@ async def _process_file_with_document(
     search_space_id: int,
     user_id: str,
     should_summarize: bool = False,
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
 ):
     """
     Process file and update existing pending document status.
@@ -971,6 +939,8 @@ async def _process_file_with_document(
                 log_entry=log_entry,
                 notification=notification,
                 should_summarize=should_summarize,
+                use_vision_llm=use_vision_llm,
+                processing_mode=processing_mode,
             )
 
             # Update notification on success
@@ -1110,22 +1080,16 @@ def process_circleback_meeting_task(
         search_space_id: ID of the search space
         connector_id: ID of the Circleback connector (for deletion support)
     """
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _process_circleback_meeting(
-                meeting_id,
-                meeting_name,
-                markdown_content,
-                metadata,
-                search_space_id,
-                connector_id,
-            )
+    return run_async_celery_task(
+        lambda: _process_circleback_meeting(
+            meeting_id,
+            meeting_name,
+            markdown_content,
+            metadata,
+            search_space_id,
+            connector_id,
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _process_circleback_meeting(
@@ -1282,25 +1246,19 @@ def index_local_folder_task(
     target_file_paths: list[str] | None = None,
 ):
     """Celery task to index a local folder. Config is passed directly — no connector row."""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(
-            _index_local_folder_async(
-                search_space_id=search_space_id,
-                user_id=user_id,
-                folder_path=folder_path,
-                folder_name=folder_name,
-                exclude_patterns=exclude_patterns,
-                file_extensions=file_extensions,
-                root_folder_id=root_folder_id,
-                enable_summary=enable_summary,
-                target_file_paths=target_file_paths,
-            )
+    return run_async_celery_task(
+        lambda: _index_local_folder_async(
+            search_space_id=search_space_id,
+            user_id=user_id,
+            folder_path=folder_path,
+            folder_name=folder_name,
+            exclude_patterns=exclude_patterns,
+            file_extensions=file_extensions,
+            root_folder_id=root_folder_id,
+            enable_summary=enable_summary,
+            target_file_paths=target_file_paths,
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_local_folder_async(
@@ -1428,23 +1386,22 @@ def index_uploaded_folder_files_task(
     root_folder_id: int,
     enable_summary: bool,
     file_mappings: list[dict],
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
 ):
     """Celery task to index files uploaded from the desktop app."""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        loop.run_until_complete(
-            _index_uploaded_folder_files_async(
-                search_space_id=search_space_id,
-                user_id=user_id,
-                folder_name=folder_name,
-                root_folder_id=root_folder_id,
-                enable_summary=enable_summary,
-                file_mappings=file_mappings,
-            )
+    return run_async_celery_task(
+        lambda: _index_uploaded_folder_files_async(
+            search_space_id=search_space_id,
+            user_id=user_id,
+            folder_name=folder_name,
+            root_folder_id=root_folder_id,
+            enable_summary=enable_summary,
+            file_mappings=file_mappings,
+            use_vision_llm=use_vision_llm,
+            processing_mode=processing_mode,
         )
-    finally:
-        loop.close()
+    )
 
 
 async def _index_uploaded_folder_files_async(
@@ -1454,6 +1411,8 @@ async def _index_uploaded_folder_files_async(
     root_folder_id: int,
     enable_summary: bool,
     file_mappings: list[dict],
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
 ):
     """Run upload-based folder indexing with notification + heartbeat."""
     file_count = len(file_mappings)
@@ -1503,6 +1462,8 @@ async def _heartbeat_progress(completed_count: int) -> None:
                 enable_summary=enable_summary,
                 file_mappings=file_mappings,
                 on_heartbeat_callback=_heartbeat_progress,
+                use_vision_llm=use_vision_llm,
+                processing_mode=processing_mode,
             )
 
             if notification:
@@ -1543,3 +1504,113 @@ async def _heartbeat_progress(completed_count: int) -> None:
                 heartbeat_task.cancel()
             if notification_id is not None:
                 _stop_heartbeat(notification_id)
+
+
+# ===== AI File Sort tasks =====
+
+AI_SORT_LOCK_TTL_SECONDS = 600  # 10 minutes
+_ai_sort_redis = None
+
+
+def _get_ai_sort_redis():
+    import redis
+
+    global _ai_sort_redis
+    if _ai_sort_redis is None:
+        _ai_sort_redis = redis.from_url(config.REDIS_APP_URL, decode_responses=True)
+    return _ai_sort_redis
+
+
+def _ai_sort_lock_key(search_space_id: int) -> str:
+    return f"ai_sort:search_space:{search_space_id}:lock"
+
+
+@celery_app.task(name="ai_sort_search_space", bind=True, max_retries=1)
+def ai_sort_search_space_task(self, search_space_id: int, user_id: str):
+    """Full AI sort for all documents in a search space."""
+    return run_async_celery_task(
+        lambda: _ai_sort_search_space_async(search_space_id, user_id)
+    )
+
+
+async def _ai_sort_search_space_async(search_space_id: int, user_id: str):
+    r = _get_ai_sort_redis()
+    lock_key = _ai_sort_lock_key(search_space_id)
+
+    if not r.set(lock_key, "running", nx=True, ex=AI_SORT_LOCK_TTL_SECONDS):
+        logger.info(
+            "AI sort already running for search_space=%d, skipping",
+            search_space_id,
+        )
+        return
+
+    t_start = time.perf_counter()
+    try:
+        from app.services.ai_file_sort_service import ai_sort_all_documents
+        from app.services.llm_service import get_document_summary_llm
+
+        async with get_celery_session_maker()() as session:
+            llm = await get_document_summary_llm(
+                session, search_space_id, disable_streaming=True
+            )
+            if llm is None:
+                logger.warning(
+                    "No LLM configured for search_space=%d, skipping AI sort",
+                    search_space_id,
+                )
+                return
+
+            sorted_count, failed_count = await ai_sort_all_documents(
+                session, search_space_id, llm
+            )
+            elapsed = time.perf_counter() - t_start
+            logger.info(
+                "AI sort search_space=%d done in %.1fs: sorted=%d failed=%d",
+                search_space_id,
+                elapsed,
+                sorted_count,
+                failed_count,
+            )
+    finally:
+        r.delete(lock_key)
+
+
+@celery_app.task(
+    name="ai_sort_document", bind=True, max_retries=2, default_retry_delay=10
+)
+def ai_sort_document_task(self, search_space_id: int, user_id: str, document_id: int):
+    """Incremental AI sort for a single document after indexing."""
+    return run_async_celery_task(
+        lambda: _ai_sort_document_async(search_space_id, user_id, document_id)
+    )
+
+
+async def _ai_sort_document_async(search_space_id: int, user_id: str, document_id: int):
+    from app.db import Document
+    from app.services.ai_file_sort_service import ai_sort_document
+    from app.services.llm_service import get_document_summary_llm
+
+    async with get_celery_session_maker()() as session:
+        document = await session.get(Document, document_id)
+        if document is None:
+            logger.warning("Document %d not found, skipping AI sort", document_id)
+            return
+
+        llm = await get_document_summary_llm(
+            session, search_space_id, disable_streaming=True
+        )
+        if llm is None:
+            logger.warning(
+                "No LLM for search_space=%d, skipping AI sort of doc=%d",
+                search_space_id,
+                document_id,
+            )
+            return
+
+        await ai_sort_document(session, document, llm)
+        await session.commit()
+        logger.info(
+            "AI sorted document=%d into search_space=%d",
+            document_id,
+            search_space_id,
+        )
diff --git a/surfsense_backend/app/tasks/celery_tasks/obsidian_tasks.py b/surfsense_backend/app/tasks/celery_tasks/obsidian_tasks.py
new file mode 100644
index 0000000000..c6c8666f5b
--- /dev/null
+++ b/surfsense_backend/app/tasks/celery_tasks/obsidian_tasks.py
@@ -0,0 +1,53 @@
+"""Celery tasks for Obsidian plugin background processing."""
+
+from __future__ import annotations
+
+import logging
+
+from app.celery_app import celery_app
+from app.db import SearchSourceConnector
+from app.schemas.obsidian_plugin import NotePayload
+from app.services.obsidian_plugin_indexer import upsert_note
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
+
+logger = logging.getLogger(__name__)
+
+
+@celery_app.task(name="index_obsidian_attachment", bind=True)
+def index_obsidian_attachment_task(
+    self,
+    connector_id: int,
+    payload_data: dict,
+    user_id: str,
+) -> None:
+    """Process one Obsidian non-markdown attachment asynchronously."""
+    return run_async_celery_task(
+        lambda: _index_obsidian_attachment(
+            connector_id=connector_id,
+            payload_data=payload_data,
+            user_id=user_id,
+        )
+    )
+
+
+async def _index_obsidian_attachment(
+    *,
+    connector_id: int,
+    payload_data: dict,
+    user_id: str,
+) -> None:
+    async with get_celery_session_maker()() as session:
+        connector = await session.get(SearchSourceConnector, connector_id)
+        if connector is None:
+            logger.warning(
+                "obsidian attachment task skipped: connector %s not found", connector_id
+            )
+            return
+
+        payload = NotePayload.model_validate(payload_data)
+        await upsert_note(
+            session,
+            connector=connector,
+            payload=payload,
+            user_id=user_id,
+        )
diff --git a/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py b/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py
index 953011ecf0..8b311576e0 100644
--- a/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py
@@ -3,14 +3,22 @@
 import asyncio
 import logging
 import sys
+from contextlib import asynccontextmanager
 
 from sqlalchemy import select
 
 from app.agents.podcaster.graph import graph as podcaster_graph
 from app.agents.podcaster.state import State as PodcasterState
 from app.celery_app import celery_app
+from app.config import config as app_config
 from app.db import Podcast, PodcastStatus
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.services.billable_calls import (
+    BillingSettlementError,
+    QuotaInsufficientError,
+    _resolve_agent_billing_for_search_space,
+    billable_call,
+)
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 
 logger = logging.getLogger(__name__)
 
@@ -28,6 +36,13 @@
 # =============================================================================
 
 
+@asynccontextmanager
+async def _celery_billable_session():
+    """Session factory used by billable_call inside the Celery worker loop."""
+    async with get_celery_session_maker()() as session:
+        yield session
+
+
 @celery_app.task(name="generate_content_podcast", bind=True)
 def generate_content_podcast_task(
     self,
@@ -40,27 +55,22 @@ def generate_content_podcast_task(
     Celery task to generate podcast from source content.
     Updates existing podcast record created by the tool.
     """
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     try:
-        result = loop.run_until_complete(
-            _generate_content_podcast(
+        return run_async_celery_task(
+            lambda: _generate_content_podcast(
                 podcast_id,
                 source_content,
                 search_space_id,
                 user_prompt,
             )
         )
-        loop.run_until_complete(loop.shutdown_asyncgens())
-        return result
     except Exception as e:
         logger.error(f"Error generating content podcast: {e!s}")
-        loop.run_until_complete(_mark_podcast_failed(podcast_id))
+        try:
+            run_async_celery_task(lambda: _mark_podcast_failed(podcast_id))
+        except Exception:
+            logger.exception("Failed to mark podcast %s as failed", podcast_id)
         return {"status": "failed", "podcast_id": podcast_id}
-    finally:
-        asyncio.set_event_loop(None)
-        loop.close()
 
 
 async def _mark_podcast_failed(podcast_id: int) -> None:
@@ -96,6 +106,31 @@ async def _generate_content_podcast(
             podcast.status = PodcastStatus.GENERATING
             await session.commit()
 
+            try:
+                (
+                    owner_user_id,
+                    billing_tier,
+                    base_model,
+                ) = await _resolve_agent_billing_for_search_space(
+                    session,
+                    search_space_id,
+                    thread_id=podcast.thread_id,
+                )
+            except ValueError as resolve_err:
+                logger.error(
+                    "Podcast %s: cannot resolve billing for search_space=%s: %s",
+                    podcast.id,
+                    search_space_id,
+                    resolve_err,
+                )
+                podcast.status = PodcastStatus.FAILED
+                await session.commit()
+                return {
+                    "status": "failed",
+                    "podcast_id": podcast.id,
+                    "reason": "billing_resolution_failed",
+                }
+
             graph_config = {
                 "configurable": {
                     "podcast_title": podcast.title,
@@ -109,9 +144,52 @@ async def _generate_content_podcast(
                 db_session=session,
             )
 
-            graph_result = await podcaster_graph.ainvoke(
-                initial_state, config=graph_config
-            )
+            try:
+                async with billable_call(
+                    user_id=owner_user_id,
+                    search_space_id=search_space_id,
+                    billing_tier=billing_tier,
+                    base_model=base_model,
+                    quota_reserve_micros_override=app_config.QUOTA_DEFAULT_PODCAST_RESERVE_MICROS,
+                    usage_type="podcast_generation",
+                    call_details={
+                        "podcast_id": podcast.id,
+                        "title": podcast.title,
+                        "thread_id": podcast.thread_id,
+                    },
+                    billable_session_factory=_celery_billable_session,
+                ):
+                    graph_result = await podcaster_graph.ainvoke(
+                        initial_state, config=graph_config
+                    )
+            except QuotaInsufficientError as exc:
+                logger.info(
+                    "Podcast %s denied: out of premium credits "
+                    "(used=%d/%d remaining=%d)",
+                    podcast.id,
+                    exc.used_micros,
+                    exc.limit_micros,
+                    exc.remaining_micros,
+                )
+                podcast.status = PodcastStatus.FAILED
+                await session.commit()
+                return {
+                    "status": "failed",
+                    "podcast_id": podcast.id,
+                    "reason": "premium_quota_exhausted",
+                }
+            except BillingSettlementError:
+                logger.exception(
+                    "Podcast %s: premium billing settlement failed",
+                    podcast.id,
+                )
+                podcast.status = PodcastStatus.FAILED
+                await session.commit()
+                return {
+                    "status": "failed",
+                    "podcast_id": podcast.id,
+                    "reason": "billing_settlement_failed",
+                }
 
             podcast_transcript = graph_result.get("podcast_transcript", [])
             file_path = graph_result.get("final_podcast_file_path", "")
@@ -133,7 +211,14 @@ async def _generate_content_podcast(
             podcast.podcast_transcript = serializable_transcript
             podcast.file_location = file_path
             podcast.status = PodcastStatus.READY
+            logger.info(
+                "Podcast %s: committing READY transcript_entries=%d file=%s",
+                podcast.id,
+                len(serializable_transcript),
+                file_path,
+            )
             await session.commit()
+            logger.info("Podcast %s: READY commit complete", podcast.id)
 
             logger.info(f"Successfully generated podcast: {podcast.id}")
 
diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
index e6890b0a8b..e412514075 100644
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@@ -7,7 +7,7 @@
 
 from app.celery_app import celery_app
 from app.db import Notification, SearchSourceConnector, SearchSourceConnectorType
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 from app.utils.indexing_locks import is_connector_indexing_locked
 
 logger = logging.getLogger(__name__)
@@ -20,15 +20,7 @@ def check_periodic_schedules_task():
     This task runs every minute and triggers indexing for any connector
     whose next_scheduled_at time has passed.
     """
-    import asyncio
-
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(_check_and_trigger_schedules())
-    finally:
-        loop.close()
+    return run_async_celery_task(_check_and_trigger_schedules)
 
 
 async def _check_and_trigger_schedules():
@@ -51,50 +43,51 @@ async def _check_and_trigger_schedules():
 
             logger.info(f"Found {len(due_connectors)} connectors due for indexing")
 
-            # Import all indexing tasks
+            # Import indexing tasks for KB connectors only.
+            # Live connectors (Linear, Slack, Jira, ClickUp, Airtable, Discord,
+            # Teams, Gmail, Calendar, Luma) use real-time tools instead.
             from app.tasks.celery_tasks.connector_tasks import (
-                index_airtable_records_task,
-                index_clickup_tasks_task,
                 index_confluence_pages_task,
                 index_crawled_urls_task,
-                index_discord_messages_task,
                 index_elasticsearch_documents_task,
                 index_github_repos_task,
-                index_google_calendar_events_task,
                 index_google_drive_files_task,
-                index_google_gmail_messages_task,
-                index_jira_issues_task,
-                index_linear_issues_task,
-                index_luma_events_task,
                 index_notion_pages_task,
-                index_slack_messages_task,
             )
 
-            # Map connector types to their tasks
             task_map = {
-                SearchSourceConnectorType.SLACK_CONNECTOR: index_slack_messages_task,
                 SearchSourceConnectorType.NOTION_CONNECTOR: index_notion_pages_task,
                 SearchSourceConnectorType.GITHUB_CONNECTOR: index_github_repos_task,
-                SearchSourceConnectorType.LINEAR_CONNECTOR: index_linear_issues_task,
-                SearchSourceConnectorType.JIRA_CONNECTOR: index_jira_issues_task,
                 SearchSourceConnectorType.CONFLUENCE_CONNECTOR: index_confluence_pages_task,
-                SearchSourceConnectorType.CLICKUP_CONNECTOR: index_clickup_tasks_task,
-                SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR: index_google_calendar_events_task,
-                SearchSourceConnectorType.AIRTABLE_CONNECTOR: index_airtable_records_task,
-                SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR: index_google_gmail_messages_task,
-                SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
-                SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
                 SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
                 SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
                 SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
-                # Composio connector types (unified with native Google tasks)
                 SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
-                SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR: index_google_gmail_messages_task,
-                SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_google_calendar_events_task,
             }
 
+            from app.services.mcp_oauth.registry import LIVE_CONNECTOR_TYPES
+
+            # Disable obsolete periodic indexing for live connectors in one batch.
+            live_disabled = []
+            for connector in due_connectors:
+                if connector.connector_type in LIVE_CONNECTOR_TYPES:
+                    connector.periodic_indexing_enabled = False
+                    connector.next_scheduled_at = None
+                    live_disabled.append(connector)
+            if live_disabled:
+                await session.commit()
+                for c in live_disabled:
+                    logger.info(
+                        "Disabled obsolete periodic indexing for live connector %s (%s)",
+                        c.id,
+                        c.connector_type.value,
+                    )
+
             # Trigger indexing for each due connector
             for connector in due_connectors:
+                if connector in live_disabled:
+                    continue
+
                 # Primary guard: Redis lock indicates a task is currently running.
                 if is_connector_indexing_locked(connector.id):
                     logger.info(
diff --git a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py
index e05ae94359..d51c85dee9 100644
--- a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py
@@ -34,7 +34,7 @@
 from app.celery_app import celery_app
 from app.config import config
 from app.db import Document, DocumentStatus, Notification
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 
 logger = logging.getLogger(__name__)
 
@@ -69,16 +69,12 @@ def cleanup_stale_indexing_notifications_task():
     Detection: Redis heartbeat key with 2-min TTL. Missing key = stale task.
     Also marks associated pending/processing documents as failed.
     """
-    import asyncio
 
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
+    async def _both() -> None:
+        await _cleanup_stale_notifications()
+        await _cleanup_stale_document_processing_notifications()
 
-    try:
-        loop.run_until_complete(_cleanup_stale_notifications())
-        loop.run_until_complete(_cleanup_stale_document_processing_notifications())
-    finally:
-        loop.close()
+    return run_async_celery_task(_both)
 
 
 async def _cleanup_stale_notifications():
diff --git a/surfsense_backend/app/tasks/celery_tasks/stripe_reconciliation_task.py b/surfsense_backend/app/tasks/celery_tasks/stripe_reconciliation_task.py
index 9b64ee0caa..ace6ef7ca7 100644
--- a/surfsense_backend/app/tasks/celery_tasks/stripe_reconciliation_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/stripe_reconciliation_task.py
@@ -1,8 +1,7 @@
-"""Reconcile pending Stripe page purchases that might miss webhook fulfillment."""
+"""Reconcile pending Stripe purchases that might miss webhook fulfillment."""
 
 from __future__ import annotations
 
-import asyncio
 import logging
 from datetime import UTC, datetime, timedelta
 
@@ -11,9 +10,14 @@
 
 from app.celery_app import celery_app
 from app.config import config
-from app.db import PagePurchase, PagePurchaseStatus
+from app.db import (
+    PagePurchase,
+    PagePurchaseStatus,
+    PremiumTokenPurchase,
+    PremiumTokenPurchaseStatus,
+)
 from app.routes import stripe_routes
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 
 logger = logging.getLogger(__name__)
 
@@ -31,13 +35,7 @@ def get_stripe_client() -> StripeClient | None:
 @celery_app.task(name="reconcile_pending_stripe_page_purchases")
 def reconcile_pending_stripe_page_purchases_task():
     """Recover paid purchases that were left pending due to missed webhook handling."""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
-    try:
-        loop.run_until_complete(_reconcile_pending_page_purchases())
-    finally:
-        loop.close()
+    return run_async_celery_task(_reconcile_pending_page_purchases)
 
 
 async def _reconcile_pending_page_purchases() -> None:
@@ -126,7 +124,102 @@ async def _reconcile_pending_page_purchases() -> None:
                 await db_session.rollback()
 
         logger.info(
-            "Stripe reconciliation completed. fulfilled=%s failed=%s checked=%s",
+            "Stripe page reconciliation completed. fulfilled=%s failed=%s checked=%s",
+            fulfilled_count,
+            failed_count,
+            len(pending_purchases),
+        )
+
+
+@celery_app.task(name="reconcile_pending_stripe_token_purchases")
+def reconcile_pending_stripe_token_purchases_task():
+    """Recover paid token purchases that were left pending due to missed webhook handling."""
+    return run_async_celery_task(_reconcile_pending_token_purchases)
+
+
+async def _reconcile_pending_token_purchases() -> None:
+    """Reconcile stale pending token purchases against Stripe source of truth."""
+    stripe_client = get_stripe_client()
+    if stripe_client is None:
+        return
+
+    lookback_minutes = max(config.STRIPE_RECONCILIATION_LOOKBACK_MINUTES, 0)
+    batch_size = max(config.STRIPE_RECONCILIATION_BATCH_SIZE, 1)
+    cutoff = datetime.now(UTC) - timedelta(minutes=lookback_minutes)
+
+    async with get_celery_session_maker()() as db_session:
+        pending_purchases = (
+            (
+                await db_session.execute(
+                    select(PremiumTokenPurchase)
+                    .where(
+                        PremiumTokenPurchase.status
+                        == PremiumTokenPurchaseStatus.PENDING,
+                        PremiumTokenPurchase.created_at <= cutoff,
+                    )
+                    .order_by(PremiumTokenPurchase.created_at.asc())
+                    .limit(batch_size)
+                )
+            )
+            .scalars()
+            .all()
+        )
+
+        if not pending_purchases:
+            logger.debug(
+                "Stripe token reconciliation found no pending purchases older than %s minutes.",
+                lookback_minutes,
+            )
+            return
+
+        logger.info(
+            "Stripe token reconciliation checking %s pending purchases (cutoff=%s, batch=%s).",
+            len(pending_purchases),
+            lookback_minutes,
+            batch_size,
+        )
+
+        fulfilled_count = 0
+        failed_count = 0
+
+        for purchase in pending_purchases:
+            checkout_session_id = purchase.stripe_checkout_session_id
+
+            try:
+                checkout_session = stripe_client.v1.checkout.sessions.retrieve(
+                    checkout_session_id
+                )
+            except StripeError:
+                logger.exception(
+                    "Stripe token reconciliation failed to retrieve checkout session %s",
+                    checkout_session_id,
+                )
+                await db_session.rollback()
+                continue
+
+            payment_status = getattr(checkout_session, "payment_status", None)
+            session_status = getattr(checkout_session, "status", None)
+
+            try:
+                if payment_status in {"paid", "no_payment_required"}:
+                    await stripe_routes._fulfill_completed_token_purchase(
+                        db_session, checkout_session
+                    )
+                    fulfilled_count += 1
+                elif session_status == "expired":
+                    await stripe_routes._mark_token_purchase_failed(
+                        db_session, str(checkout_session.id)
+                    )
+                    failed_count += 1
+            except Exception:
+                logger.exception(
+                    "Stripe token reconciliation failed while processing checkout session %s",
+                    checkout_session_id,
+                )
+                await db_session.rollback()
+
+        logger.info(
+            "Stripe token reconciliation completed. fulfilled=%s failed=%s checked=%s",
             fulfilled_count,
             failed_count,
             len(pending_purchases),
diff --git a/surfsense_backend/app/tasks/celery_tasks/video_presentation_tasks.py b/surfsense_backend/app/tasks/celery_tasks/video_presentation_tasks.py
index 7880b385fc..08f22140c5 100644
--- a/surfsense_backend/app/tasks/celery_tasks/video_presentation_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/video_presentation_tasks.py
@@ -3,14 +3,22 @@
 import asyncio
 import logging
 import sys
+from contextlib import asynccontextmanager
 
 from sqlalchemy import select
 
 from app.agents.video_presentation.graph import graph as video_presentation_graph
 from app.agents.video_presentation.state import State as VideoPresentationState
 from app.celery_app import celery_app
+from app.config import config as app_config
 from app.db import VideoPresentation, VideoPresentationStatus
-from app.tasks.celery_tasks import get_celery_session_maker
+from app.services.billable_calls import (
+    BillingSettlementError,
+    QuotaInsufficientError,
+    _resolve_agent_billing_for_search_space,
+    billable_call,
+)
+from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
 
 logger = logging.getLogger(__name__)
 
@@ -23,6 +31,13 @@
         )
 
 
+@asynccontextmanager
+async def _celery_billable_session():
+    """Session factory used by billable_call inside the Celery worker loop."""
+    async with get_celery_session_maker()() as session:
+        yield session
+
+
 @celery_app.task(name="generate_video_presentation", bind=True)
 def generate_video_presentation_task(
     self,
@@ -35,27 +50,30 @@ def generate_video_presentation_task(
     Celery task to generate video presentation from source content.
     Updates existing video presentation record created by the tool.
     """
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     try:
-        result = loop.run_until_complete(
-            _generate_video_presentation(
+        return run_async_celery_task(
+            lambda: _generate_video_presentation(
                 video_presentation_id,
                 source_content,
                 search_space_id,
                 user_prompt,
             )
         )
-        loop.run_until_complete(loop.shutdown_asyncgens())
-        return result
     except Exception as e:
         logger.error(f"Error generating video presentation: {e!s}")
-        loop.run_until_complete(_mark_video_presentation_failed(video_presentation_id))
+        # Mark FAILED in a fresh loop — the previous loop is closed.
+        # Swallow secondary failures; the row will simply stay in
+        # GENERATING and be flushed by the periodic stale cleanup.
+        try:
+            run_async_celery_task(
+                lambda: _mark_video_presentation_failed(video_presentation_id)
+            )
+        except Exception:
+            logger.exception(
+                "Failed to mark video presentation %s as failed",
+                video_presentation_id,
+            )
         return {"status": "failed", "video_presentation_id": video_presentation_id}
-    finally:
-        asyncio.set_event_loop(None)
-        loop.close()
 
 
 async def _mark_video_presentation_failed(video_presentation_id: int) -> None:
@@ -97,6 +115,32 @@ async def _generate_video_presentation(
             video_pres.status = VideoPresentationStatus.GENERATING
             await session.commit()
 
+            try:
+                (
+                    owner_user_id,
+                    billing_tier,
+                    base_model,
+                ) = await _resolve_agent_billing_for_search_space(
+                    session,
+                    search_space_id,
+                    thread_id=video_pres.thread_id,
+                )
+            except ValueError as resolve_err:
+                logger.error(
+                    "VideoPresentation %s: cannot resolve billing for "
+                    "search_space=%s: %s",
+                    video_pres.id,
+                    search_space_id,
+                    resolve_err,
+                )
+                video_pres.status = VideoPresentationStatus.FAILED
+                await session.commit()
+                return {
+                    "status": "failed",
+                    "video_presentation_id": video_pres.id,
+                    "reason": "billing_resolution_failed",
+                }
+
             graph_config = {
                 "configurable": {
                     "video_title": video_pres.title,
@@ -110,9 +154,52 @@ async def _generate_video_presentation(
                 db_session=session,
             )
 
-            graph_result = await video_presentation_graph.ainvoke(
-                initial_state, config=graph_config
-            )
+            try:
+                async with billable_call(
+                    user_id=owner_user_id,
+                    search_space_id=search_space_id,
+                    billing_tier=billing_tier,
+                    base_model=base_model,
+                    quota_reserve_micros_override=app_config.QUOTA_DEFAULT_VIDEO_PRESENTATION_RESERVE_MICROS,
+                    usage_type="video_presentation_generation",
+                    call_details={
+                        "video_presentation_id": video_pres.id,
+                        "title": video_pres.title,
+                        "thread_id": video_pres.thread_id,
+                    },
+                    billable_session_factory=_celery_billable_session,
+                ):
+                    graph_result = await video_presentation_graph.ainvoke(
+                        initial_state, config=graph_config
+                    )
+            except QuotaInsufficientError as exc:
+                logger.info(
+                    "VideoPresentation %s denied: out of premium credits "
+                    "(used=%d/%d remaining=%d)",
+                    video_pres.id,
+                    exc.used_micros,
+                    exc.limit_micros,
+                    exc.remaining_micros,
+                )
+                video_pres.status = VideoPresentationStatus.FAILED
+                await session.commit()
+                return {
+                    "status": "failed",
+                    "video_presentation_id": video_pres.id,
+                    "reason": "premium_quota_exhausted",
+                }
+            except BillingSettlementError:
+                logger.exception(
+                    "VideoPresentation %s: premium billing settlement failed",
+                    video_pres.id,
+                )
+                video_pres.status = VideoPresentationStatus.FAILED
+                await session.commit()
+                return {
+                    "status": "failed",
+                    "video_presentation_id": video_pres.id,
+                    "reason": "billing_settlement_failed",
+                }
 
             # Serialize slides (parsed content + audio info merged)
             slides_raw = graph_result.get("slides", [])
@@ -143,7 +230,14 @@ async def _generate_video_presentation(
             video_pres.slides = serializable_slides
             video_pres.scene_codes = serializable_scene_codes
             video_pres.status = VideoPresentationStatus.READY
+            logger.info(
+                "VideoPresentation %s: committing READY slides=%d scene_codes=%d",
+                video_pres.id,
+                len(serializable_slides),
+                len(serializable_scene_codes),
+            )
             await session.commit()
+            logger.info("VideoPresentation %s: READY commit complete", video_pres.id)
 
             logger.info(f"Successfully generated video presentation: {video_pres.id}")
 
diff --git a/surfsense_backend/app/tasks/chat/content_builder.py b/surfsense_backend/app/tasks/chat/content_builder.py
new file mode 100644
index 0000000000..c8f79c0450
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/content_builder.py
@@ -0,0 +1,554 @@
+"""Server-side mirror of the frontend's assistant-ui ``ContentPart`` projection.
+
+Background
+----------
+The streaming chat task in ``stream_new_chat`` / ``stream_resume_chat`` yields
+SSE events that the frontend folds into a ``ContentPartsState`` (see
+``surfsense_web/lib/chat/streaming-state.ts`` and the matching pipeline in
+``stream-pipeline.ts``). When a turn ends, the frontend calls
+``buildContentForPersistence(...)`` and round-trips that ``ContentPart[]``
+JSONB to ``POST /threads/{id}/messages``, which is what was historically
+written to ``new_chat_messages.content``.
+
+After the ghost-thread fix moved persistence server-side, the assistant
+row is written by ``finalize_assistant_turn`` in the streaming finally
+block. The frontend's later ``appendMessage`` is now a no-op (recovers
+via the ``(thread_id, turn_id, role)`` partial unique index added in
+migration 141), which means the *server* is now responsible for
+producing the rich ``ContentPart[]`` shape the FE expects on history
+reload — text + reasoning + tool-call cards (with ``args``, ``argsText``,
+``result``, ``langchainToolCallId``) + thinking-step buckets +
+step-separators.
+
+This module is the in-memory accumulator that mirrors the FE state for
+exactly that purpose. The streaming code calls ``on_text_*`` / ``on_reasoning_*``
+/ ``on_tool_*`` / ``on_thinking_step`` / ``on_step_separator`` /
+``mark_interrupted`` at the same call sites it yields the matching
+``streaming_service.format_*`` SSE event, so the in-memory ``parts`` list
+stays in lockstep with what the FE's pipeline would have produced live.
+``snapshot()`` is then taken once in the ``finally`` block and persisted
+in a single UPDATE.
+
+Pure synchronous state — no DB I/O, no async, no flush callbacks. The
+streaming code is responsible for driving lifecycle methods; this class
+is a thin projection helper.
+"""
+
+from __future__ import annotations
+
+import copy
+import json
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+# Mirrors the FE's filter in ``buildContentForPersistence`` / ``buildContentForUI``:
+# only text/reasoning/tool-call parts count as "meaningful". data-thinking-steps
+# and data-step-separator decorate the meaningful parts but never stand alone
+# in a successful turn.
+_MEANINGFUL_PART_TYPES: frozenset[str] = frozenset({"text", "reasoning", "tool-call"})
+
+
+def _merge_tool_part_metadata(
+    part: dict[str, Any], metadata: dict[str, Any] | None
+) -> None:
+    """Shallow-merge ``metadata`` into ``part["metadata"]``; first key wins.
+
+    Used for tool-call linkage (``spanId``, ``thinkingStepId``, …): a later
+    event must not overwrite an existing key so chunk order vs ``on_tool_start``
+    stays stable.
+    """
+    if not metadata:
+        return
+    md = part.setdefault("metadata", {})
+    for k, v in metadata.items():
+        if k not in md:
+            md[k] = v
+
+
+class AssistantContentBuilder:
+    """Server-side projection of ``surfsense_web/lib/chat/streaming-state.ts``.
+
+    Output shape (deep copy of ``self.parts`` via ``snapshot()``) strictly
+    matches the FE ``ContentPart`` union::
+
+        | { type: "text"; text: string }
+        | { type: "reasoning"; text: string }
+        | { type: "tool-call"; toolCallId: str; toolName: str;
+            args: dict; result?: any; argsText?: str; langchainToolCallId?: str;
+            metadata?: { spanId?: str; thinkingStepId?: str; ... };
+            state?: "aborted" }
+        | { type: "data-thinking-steps"; data: { steps: ThinkingStepData[] } }
+        | { type: "data-step-separator"; data: { stepIndex: int } }
+
+    Order matches the wire order of the SSE events that drive the lifecycle
+    methods, with two FE-mirrored exceptions:
+
+    1. ``data-thinking-steps`` is a *singleton* and pinned at index 0 the
+       first time we see a ``data-thinking-step`` SSE event (the FE's
+       ``updateThinkingSteps`` does ``unshift`` on first sight). Subsequent
+       thinking-step updates mutate that singleton in place.
+    2. ``data-step-separator`` is appended only when the message already has
+       meaningful content and the previous part isn't itself a separator
+       (so the FIRST step of a turn doesn't generate a leading divider).
+    """
+
+    def __init__(self) -> None:
+        self.parts: list[dict[str, Any]] = []
+        # Index of the active text/reasoning part within ``parts`` while
+        # streaming is open; -1 means "no active part" and the next delta
+        # opens a fresh one. Mirrors ``ContentPartsState.currentTextPartIndex``.
+        self._current_text_idx: int = -1
+        self._current_reasoning_idx: int = -1
+        # ``ui_id``-keyed indexes for tool-call parts. ``ui_id`` is the
+        # synthetic ``call_<run_id>`` (chunk fallback) or the LangChain
+        # ``tool_call.id`` (indexed chunk path) — same key the streaming layer
+        # threads through every ``tool-input-*`` / ``tool-output-*`` event.
+        self._tool_call_idx_by_ui_id: dict[str, int] = {}
+        # Live argsText accumulator (concatenated ``tool-input-delta`` chunks)
+        # so we can reproduce the FE's ``appendToolInputDelta`` behaviour
+        # before ``tool-input-available`` overwrites it with the
+        # pretty-printed final JSON.
+        self._args_text_by_ui_id: dict[str, str] = {}
+
+    # ------------------------------------------------------------------
+    # Text
+    # ------------------------------------------------------------------
+
+    def on_text_start(self, text_id: str) -> None:
+        """Begin a fresh text block.
+
+        Symmetric to FE ``appendText``: opening text closes any active
+        reasoning so the renderer treats them as separate parts. The
+        actual text part isn't materialised here — it's lazily created
+        on the first ``on_text_delta`` so an empty start/end pair
+        leaves no trace. Matches the FE pipeline which has no explicit
+        ``text-start`` handler at all.
+        """
+        if self._current_reasoning_idx >= 0:
+            self._current_reasoning_idx = -1
+
+    def on_text_delta(self, text_id: str, delta: str) -> None:
+        if not delta:
+            return
+        if self._current_reasoning_idx >= 0:
+            # FE behaviour: a text delta after reasoning implicitly
+            # closes the reasoning block (see ``appendText`` lines
+            # 178-180).
+            self._current_reasoning_idx = -1
+        if (
+            self._current_text_idx >= 0
+            and 0 <= self._current_text_idx < len(self.parts)
+            and self.parts[self._current_text_idx].get("type") == "text"
+        ):
+            self.parts[self._current_text_idx]["text"] += delta
+            return
+        self.parts.append({"type": "text", "text": delta})
+        self._current_text_idx = len(self.parts) - 1
+
+    def on_text_end(self, text_id: str) -> None:
+        """Close the active text block.
+
+        Mirrors the wire-level ``text-end`` boundary the streaming layer
+        emits before tool calls / reasoning / step boundaries. The FE
+        pipeline implicitly closes via ``currentTextPartIndex = -1``
+        in ``addToolCall`` / ``appendReasoning`` / ``addStepSeparator``;
+        our helper does the same explicitly so callers don't have to
+        maintain that invariant per call site.
+        """
+        self._current_text_idx = -1
+
+    # ------------------------------------------------------------------
+    # Reasoning
+    # ------------------------------------------------------------------
+
+    def on_reasoning_start(self, reasoning_id: str) -> None:
+        if self._current_text_idx >= 0:
+            self._current_text_idx = -1
+
+    def on_reasoning_delta(self, reasoning_id: str, delta: str) -> None:
+        if not delta:
+            return
+        if self._current_text_idx >= 0:
+            self._current_text_idx = -1
+        if (
+            self._current_reasoning_idx >= 0
+            and 0 <= self._current_reasoning_idx < len(self.parts)
+            and self.parts[self._current_reasoning_idx].get("type") == "reasoning"
+        ):
+            self.parts[self._current_reasoning_idx]["text"] += delta
+            return
+        self.parts.append({"type": "reasoning", "text": delta})
+        self._current_reasoning_idx = len(self.parts) - 1
+
+    def on_reasoning_end(self, reasoning_id: str) -> None:
+        self._current_reasoning_idx = -1
+
+    # ------------------------------------------------------------------
+    # Tool calls
+    # ------------------------------------------------------------------
+
+    def on_tool_input_start(
+        self,
+        ui_id: str,
+        tool_name: str,
+        langchain_tool_call_id: str | None,
+        *,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Register a tool-call card. Args are filled in by later events.
+
+        Optional ``metadata`` (``spanId``, ``thinkingStepId``, …) is stored on the
+        part; duplicate ``tool-input-start`` calls merge with first-key-wins.
+        """
+        if not ui_id:
+            return
+        # Skip duplicate registration: the stream may emit
+        # ``tool-input-start`` from both ``on_chat_model_stream``
+        # (when tool_call_chunks register a name) and ``on_tool_start``
+        # (the canonical path). The FE de-dupes via ``toolCallIndices``;
+        # we mirror that here.
+        if ui_id in self._tool_call_idx_by_ui_id:
+            idx = self._tool_call_idx_by_ui_id[ui_id]
+            part = self.parts[idx]
+            if langchain_tool_call_id and not part.get("langchainToolCallId"):
+                part["langchainToolCallId"] = langchain_tool_call_id
+            _merge_tool_part_metadata(part, metadata)
+            return
+
+        part: dict[str, Any] = {
+            "type": "tool-call",
+            "toolCallId": ui_id,
+            "toolName": tool_name,
+            "args": {},
+        }
+        if langchain_tool_call_id:
+            part["langchainToolCallId"] = langchain_tool_call_id
+        if metadata:
+            part["metadata"] = dict(metadata)
+        self.parts.append(part)
+        self._tool_call_idx_by_ui_id[ui_id] = len(self.parts) - 1
+
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+
+    def on_tool_input_delta(self, ui_id: str, args_chunk: str) -> None:
+        """Append a streamed args-delta chunk to the matching card's argsText.
+
+        Mirrors FE ``appendToolInputDelta``: no-ops when no card has been
+        registered yet for the given ``ui_id`` — the deltas have nowhere
+        safe to land.
+        """
+        if not ui_id or not args_chunk:
+            return
+        idx = self._tool_call_idx_by_ui_id.get(ui_id)
+        if idx is None:
+            return
+        if not (0 <= idx < len(self.parts)):
+            return
+        part = self.parts[idx]
+        if part.get("type") != "tool-call":
+            return
+        new_text = (part.get("argsText") or "") + args_chunk
+        part["argsText"] = new_text
+        self._args_text_by_ui_id[ui_id] = new_text
+
+    def on_tool_input_available(
+        self,
+        ui_id: str,
+        tool_name: str,
+        args: dict[str, Any],
+        langchain_tool_call_id: str | None,
+        *,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Finalize the tool-call card's input.
+
+        Mirrors FE ``stream-pipeline.ts`` lines 127-153: replaces ``argsText``
+        with ``json.dumps(input, indent=2)`` so the post-stream card renders
+        pretty-printed JSON, sets the full ``args`` dict, and backfills
+        ``langchainToolCallId`` if it wasn't known at ``tool-input-start`` time.
+        Also creates the card if no prior ``tool-input-start`` registered it
+        (late-registration when no prior ``tool-input-start``).
+        """
+        if not ui_id:
+            return
+        try:
+            final_args_text = json.dumps(args or {}, indent=2, ensure_ascii=False)
+        except (TypeError, ValueError):
+            # Defensive: ``args`` should already be JSON-safe (the
+            # streaming layer sanitizes it before emitting), but if a
+            # caller hands us a non-serializable value we still want
+            # to record the call without breaking the snapshot.
+            final_args_text = str(args)
+
+        idx = self._tool_call_idx_by_ui_id.get(ui_id)
+        if idx is not None and 0 <= idx < len(self.parts):
+            part = self.parts[idx]
+            if part.get("type") == "tool-call":
+                part["args"] = args or {}
+                part["argsText"] = final_args_text
+                if langchain_tool_call_id and not part.get("langchainToolCallId"):
+                    part["langchainToolCallId"] = langchain_tool_call_id
+                _merge_tool_part_metadata(part, metadata)
+                return
+
+        # No prior tool-input-start: register the card now.
+        new_part: dict[str, Any] = {
+            "type": "tool-call",
+            "toolCallId": ui_id,
+            "toolName": tool_name,
+            "args": args or {},
+            "argsText": final_args_text,
+        }
+        if langchain_tool_call_id:
+            new_part["langchainToolCallId"] = langchain_tool_call_id
+        _merge_tool_part_metadata(new_part, metadata)
+        self.parts.append(new_part)
+        self._tool_call_idx_by_ui_id[ui_id] = len(self.parts) - 1
+
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+
+    def on_tool_output_available(
+        self,
+        ui_id: str,
+        output: Any,
+        langchain_tool_call_id: str | None,
+        *,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Attach the tool's output (``result``) to the matching card.
+
+        Mirrors FE ``updateToolCall``: backfill ``langchainToolCallId``
+        only if not already set (a NULL late-arriving value never blows
+        away an earlier known good one).
+        """
+        if not ui_id:
+            return
+        idx = self._tool_call_idx_by_ui_id.get(ui_id)
+        if idx is None or not (0 <= idx < len(self.parts)):
+            return
+        part = self.parts[idx]
+        if part.get("type") != "tool-call":
+            return
+        part["result"] = output
+        if langchain_tool_call_id and not part.get("langchainToolCallId"):
+            part["langchainToolCallId"] = langchain_tool_call_id
+        _merge_tool_part_metadata(part, metadata)
+
+    # ------------------------------------------------------------------
+    # Thinking steps & step separators
+    # ------------------------------------------------------------------
+
+    def on_thinking_step(
+        self,
+        step_id: str,
+        title: str,
+        status: str,
+        items: list[str] | None,
+        *,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Update / insert the singleton ``data-thinking-steps`` part.
+
+        Mirrors FE ``updateThinkingSteps``: maintain a single
+        ``data-thinking-steps`` part anchored at index 0, replacing or
+        unshifting on first sight. Each ``on_thinking_step`` call
+        replaces the entry in the steps list keyed by ``step_id`` (or
+        appends if new).
+        """
+        if not step_id:
+            return
+
+        new_step: dict[str, Any] = {
+            "id": step_id,
+            "title": title or "",
+            "status": status or "in_progress",
+            "items": list(items) if items else [],
+        }
+        if metadata:
+            new_step["metadata"] = dict(metadata)
+
+        # Find existing data-thinking-steps part.
+        existing_idx = -1
+        for i, p in enumerate(self.parts):
+            if p.get("type") == "data-thinking-steps":
+                existing_idx = i
+                break
+
+        if existing_idx >= 0:
+            current_steps = self.parts[existing_idx].get("data", {}).get("steps") or []
+            replaced = False
+            for i, step in enumerate(current_steps):
+                if step.get("id") == step_id:
+                    if not metadata and step.get("metadata"):
+                        new_step["metadata"] = dict(step["metadata"])
+                    current_steps[i] = new_step
+                    replaced = True
+                    break
+            if not replaced:
+                current_steps.append(new_step)
+            self.parts[existing_idx] = {
+                "type": "data-thinking-steps",
+                "data": {"steps": current_steps},
+            }
+            return
+
+        # First sight: unshift to position 0 (FE parity).
+        self.parts.insert(
+            0,
+            {
+                "type": "data-thinking-steps",
+                "data": {"steps": [new_step]},
+            },
+        )
+        # Bump tracked indices since we inserted at the head.
+        if self._current_text_idx >= 0:
+            self._current_text_idx += 1
+        if self._current_reasoning_idx >= 0:
+            self._current_reasoning_idx += 1
+        for ui_id, idx in list(self._tool_call_idx_by_ui_id.items()):
+            self._tool_call_idx_by_ui_id[ui_id] = idx + 1
+
+    def on_step_separator(self) -> None:
+        """Append a ``data-step-separator`` between consecutive model steps.
+
+        Mirrors FE ``addStepSeparator``: only emit when the message
+        already has meaningful content AND the previous part isn't
+        itself a separator. ``stepIndex`` is the running count of
+        separators already in ``parts``.
+        """
+        has_content = any(p.get("type") in _MEANINGFUL_PART_TYPES for p in self.parts)
+        if not has_content:
+            return
+        if self.parts and self.parts[-1].get("type") == "data-step-separator":
+            return
+        step_index = sum(
+            1 for p in self.parts if p.get("type") == "data-step-separator"
+        )
+        self.parts.append(
+            {
+                "type": "data-step-separator",
+                "data": {"stepIndex": step_index},
+            }
+        )
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+
+    # ------------------------------------------------------------------
+    # Interruption handling
+    # ------------------------------------------------------------------
+
+    def mark_interrupted(self) -> None:
+        """Close any open text/reasoning and flip running tools to aborted.
+
+        Called from the streaming ``finally`` block before ``snapshot()`` so
+        the persisted JSONB reflects a coherent end-state even when the
+        client disconnected mid-turn or the agent hit a fatal error.
+
+        - Active text/reasoning blocks: simply lose their "active"
+          marker (no synthetic content appended). Whatever was streamed
+          stays as-is.
+        - Tool-call parts that never received a ``result`` get
+          ``state="aborted"`` so the FE history loader can render them
+          as "interrupted" rather than "still running".
+        """
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+        for part in self.parts:
+            if part.get("type") != "tool-call":
+                continue
+            if "result" in part:
+                continue
+            part["state"] = "aborted"
+
+    # ------------------------------------------------------------------
+    # Snapshot & introspection
+    # ------------------------------------------------------------------
+
+    def snapshot(self) -> list[dict[str, Any]]:
+        """Return a deep copy of ``parts`` ready for SQL UPDATE / json.dumps.
+
+        Deep-copied so callers that finalize from the shielded ``finally``
+        block can't accidentally mutate the persisted payload while the
+        SQL UPDATE is in flight (the streaming layer doesn't touch the
+        builder after this call, but defensive copies are cheap and cheap
+        is what we want in a finally block).
+        """
+        return copy.deepcopy(self.parts)
+
+    def is_empty(self) -> bool:
+        """True if no meaningful content was captured.
+
+        ``data-thinking-steps`` and ``data-step-separator`` decorate
+        meaningful content but don't count on their own — a turn that
+        only emitted a thinking step before being interrupted should
+        still be treated as empty for the status-marker fallback.
+        """
+        return not any(p.get("type") in _MEANINGFUL_PART_TYPES for p in self.parts)
+
+    def stats(self) -> dict[str, int]:
+        """Return counts of each part-type plus rough byte size.
+
+        Used by the streaming layer's perf logger so an ops dashboard
+        can correlate finalize latency with payload size, and so a
+        regression that quietly stops emitting tool-call parts (or
+        starts emitting hundreds) shows up in [PERF] grep rather than
+        only as a "history reload looks weird" bug report.
+
+        ``bytes`` is the JSON-serialised payload length — what actually
+        crosses the wire to PostgreSQL's JSONB column. We compute it
+        with ``ensure_ascii=False`` to match the JSONB encoder's UTF-8
+        on-disk layout closely enough for back-of-the-envelope sizing.
+        Reasoning/text/tool-call/thinking-step/step-separator counts are
+        independent so any one can spike without the others.
+
+        Defensive: ``json.dumps`` failure (a non-serializable value
+        slipped past the streaming layer's sanitization) is reported as
+        ``bytes=-1`` rather than raised — perf logging must not be the
+        thing that breaks the streaming finally block.
+        """
+        text_blocks = 0
+        reasoning_blocks = 0
+        tool_calls = 0
+        tool_calls_completed = 0
+        tool_calls_aborted = 0
+        thinking_step_parts = 0
+        step_separators = 0
+
+        for part in self.parts:
+            kind = part.get("type")
+            if kind == "text":
+                text_blocks += 1
+            elif kind == "reasoning":
+                reasoning_blocks += 1
+            elif kind == "tool-call":
+                tool_calls += 1
+                if part.get("state") == "aborted":
+                    tool_calls_aborted += 1
+                elif "result" in part:
+                    tool_calls_completed += 1
+            elif kind == "data-thinking-steps":
+                thinking_step_parts += 1
+            elif kind == "data-step-separator":
+                step_separators += 1
+
+        try:
+            byte_size = len(json.dumps(self.parts, ensure_ascii=False, default=str))
+        except (TypeError, ValueError):
+            byte_size = -1
+
+        return {
+            "parts": len(self.parts),
+            "bytes": byte_size,
+            "text": text_blocks,
+            "reasoning": reasoning_blocks,
+            "tool_calls": tool_calls,
+            "tool_calls_completed": tool_calls_completed,
+            "tool_calls_aborted": tool_calls_aborted,
+            "thinking_step_parts": thinking_step_parts,
+            "step_separators": step_separators,
+        }
diff --git a/surfsense_backend/app/tasks/chat/persistence.py b/surfsense_backend/app/tasks/chat/persistence.py
new file mode 100644
index 0000000000..37be507052
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/persistence.py
@@ -0,0 +1,538 @@
+"""Server-side message persistence helpers for the streaming chat agent.
+
+Historically the streaming task (``stream_new_chat``/``stream_resume_chat``)
+left ``new_chat_messages`` empty and relied on the frontend to round-trip
+``POST /threads/{id}/messages`` afterwards. That gave authenticated clients
+a "ghost-thread" abuse vector: skip the round-trip and burn LLM tokens
+without leaving an audit trail. These helpers move both writes (the user
+turn that triggered the stream and the assistant turn the stream produced)
+into the server itself, idempotent against the partial unique index
+``uq_new_chat_messages_thread_turn_role`` so legacy frontends that *do*
+keep posting via ``appendMessage`` simply hit the unique-index recovery
+path on the second writer instead of creating duplicates.
+
+Assistant turn lifecycle
+------------------------
+The assistant side is split into two helpers so we can capture the row id
+*before* the stream produces any output:
+
+* ``persist_assistant_shell`` runs immediately after ``persist_user_turn``
+  and INSERTs an empty assistant row anchored to ``(thread_id, turn_id,
+  ASSISTANT)``. Returns the row id so the streaming layer can correlate
+  later writes (token_usage, AgentActionLog future-correlation) against
+  a stable PK from the start of the turn.
+* ``finalize_assistant_turn`` runs from the streaming ``finally`` block.
+  It UPDATEs the row's ``content`` to the rich ``ContentPart[]`` snapshot
+  produced server-side by ``AssistantContentBuilder`` and writes the
+  ``token_usage`` row using ``INSERT ... ON CONFLICT DO NOTHING`` against
+  the ``uq_token_usage_message_id`` partial unique index from migration
+  142, hard-eliminating any race against ``append_message``'s recovery
+  branch.
+
+Defensive contract
+------------------
+
+* Every helper runs inside ``shielded_async_session()`` so ``session.close()``
+  survives starlette's mid-stream cancel scope on client disconnect.
+* ``persist_user_turn`` and ``persist_assistant_shell`` use ``INSERT ... ON
+  CONFLICT DO NOTHING ... RETURNING id`` keyed on the ``(thread_id, turn_id,
+  role)`` partial unique index. On conflict the insert silently no-ops at
+  the DB level — no Python ``IntegrityError`` is constructed, which
+  eliminates spurious debugger pauses and keeps logs clean. On conflict a
+  follow-up ``SELECT`` resolves the existing row id so the streaming layer
+  can correlate writes against a stable PK.
+* ``finalize_assistant_turn`` is best-effort: it never raises. The
+  streaming ``finally`` block calls it from within
+  ``anyio.CancelScope(shield=True)`` and any raised exception there
+  would mask the real error.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from datetime import UTC, datetime
+from typing import Any
+from uuid import UUID
+
+from sqlalchemy import text as sa_text
+from sqlalchemy.dialects.postgresql import insert as pg_insert
+from sqlalchemy.future import select
+
+from app.db import (
+    NewChatMessage,
+    NewChatMessageRole,
+    NewChatThread,
+    TokenUsage,
+    shielded_async_session,
+)
+from app.services.token_tracking_service import (
+    TurnTokenAccumulator,
+)
+from app.utils.perf import get_perf_logger
+
+logger = logging.getLogger(__name__)
+_perf_log = get_perf_logger()
+
+
+# Empty initial assistant content. ``finalize_assistant_turn`` overwrites
+# this in a single UPDATE at end-of-stream with the full ``ContentPart[]``
+# snapshot produced by ``AssistantContentBuilder``. We persist a one-element
+# list with an empty text part so a crash between shell-INSERT and finalize
+# leaves the row in a FE-renderable shape (blank bubble) instead of
+# blowing up the history loader.
+_EMPTY_SHELL_CONTENT: list[dict[str, Any]] = [{"type": "text", "text": ""}]
+
+# Substituted content for genuinely empty turns (no text, no reasoning,
+# no tool calls). The streaming layer flips to this when
+# ``AssistantContentBuilder.is_empty()`` returns True so the persisted
+# row is at least somewhat self-describing instead of an empty text
+# bubble. The FE's ``ContentPart`` union doesn't include ``status``
+# yet, so the history loader will silently drop this part and render
+# a blank bubble (matches today's behaviour for empty turns); a follow-up
+# FE PR adds the explicit "no response" rendering.
+_STATUS_NO_RESPONSE: list[dict[str, Any]] = [
+    {"type": "status", "text": "(no text response)"}
+]
+
+
+def _build_user_content(
+    user_query: str,
+    user_image_data_urls: list[str] | None,
+    mentioned_documents: list[dict[str, Any]] | None = None,
+) -> list[dict[str, Any]]:
+    """Build the persisted user-message ``content`` (assistant-ui v2 parts).
+
+    Mirrors the shape the existing frontend posts via
+    ``appendMessage`` (see ``surfsense_web/.../new-chat/[[...chat_id]]/page.tsx``):
+
+        [{"type": "text", "text": "..."},
+         {"type": "image", "image": "data:..."},
+         {"type": "mentioned-documents", "documents": [{"id": int,
+            "title": str, "document_type": str, "kind": "doc" | "folder"},
+            ...]}]
+
+    The companion reader is
+    ``app.utils.user_message_multimodal.split_persisted_user_content_parts``
+    which expects exactly this shape — keep them in sync.
+
+    ``mentioned_documents``: optional list of mention chip dicts. Each
+    dict may include a ``kind`` discriminator (``"doc"`` or ``"folder"``)
+    so the persisted ContentPart round-trips folder chips on reload.
+    When ``kind`` is missing we default to ``"doc"`` so legacy clients
+    that haven't migrated to the union schema still persist correctly.
+    """
+    parts: list[dict[str, Any]] = [{"type": "text", "text": user_query or ""}]
+    for url in user_image_data_urls or ():
+        if isinstance(url, str) and url:
+            parts.append({"type": "image", "image": url})
+    if mentioned_documents:
+        normalized: list[dict[str, Any]] = []
+        for doc in mentioned_documents:
+            if not isinstance(doc, dict):
+                continue
+            doc_id = doc.get("id")
+            title = doc.get("title")
+            document_type = doc.get("document_type")
+            if doc_id is None or title is None or document_type is None:
+                continue
+            kind_raw = doc.get("kind", "doc")
+            kind = kind_raw if kind_raw in ("doc", "folder") else "doc"
+            normalized.append(
+                {
+                    "id": doc_id,
+                    "title": str(title),
+                    "document_type": str(document_type),
+                    "kind": kind,
+                }
+            )
+        if normalized:
+            parts.append({"type": "mentioned-documents", "documents": normalized})
+    return parts
+
+
+async def persist_user_turn(
+    *,
+    chat_id: int,
+    user_id: str | None,
+    turn_id: str,
+    user_query: str,
+    user_image_data_urls: list[str] | None = None,
+    mentioned_documents: list[dict[str, Any]] | None = None,
+) -> int | None:
+    """Persist the user-side row for a chat turn and return its ``id``.
+
+    Uses ``INSERT ... ON CONFLICT DO NOTHING ... RETURNING id`` keyed on the
+    ``(thread_id, turn_id, role)`` partial unique index from migration 141
+    (``WHERE turn_id IS NOT NULL``). On conflict the insert silently no-ops
+    at the DB level — no Python ``IntegrityError`` is constructed, which
+    eliminates the debugger pause that ``justMyCode=false`` + async greenlet
+    interactions used to produce, and keeps production logs clean.
+
+    Returns the ``id`` of the row that exists for this turn after the call:
+    the freshly inserted ``id`` on the happy path, or the existing ``id``
+    when a previous writer (legacy FE ``appendMessage`` racing the SSE
+    stream, redelivered request, etc.) already wrote it. Returns ``None``
+    only on genuine DB failure; the caller should yield a streaming error
+    and abort the turn so we never produce a title/assistant row that
+    isn't anchored to a persisted user message.
+
+    Other constraint violations (FK, NOT NULL, etc.) still raise
+    ``IntegrityError`` — only the ``(thread_id, turn_id, role)`` collision
+    is silenced.
+    """
+    if not turn_id:
+        # Defensive: turn_id is always populated by the streaming path
+        # before this helper is called. If it isn't, we cannot be
+        # idempotent against the unique index — refuse to write rather
+        # than create a row the unique index can't dedupe.
+        logger.error(
+            "persist_user_turn called without a turn_id (chat_id=%s); skipping",
+            chat_id,
+        )
+        return None
+
+    t0 = time.perf_counter()
+    outcome = "failed"
+    resolved_id: int | None = None
+    try:
+        async with shielded_async_session() as ws:
+            # Re-attach the thread row so we can also bump updated_at
+            # in the same write — keeps the sidebar ordering accurate
+            # when a user fires off a turn but never reaches the
+            # legacy appendMessage.
+            thread = await ws.get(NewChatThread, chat_id)
+            author_uuid: UUID | None = None
+            if user_id:
+                try:
+                    author_uuid = UUID(user_id)
+                except (TypeError, ValueError):
+                    logger.warning(
+                        "persist_user_turn: invalid user_id=%r, persisting as anonymous",
+                        user_id,
+                    )
+
+            content_payload = _build_user_content(
+                user_query, user_image_data_urls, mentioned_documents
+            )
+            insert_stmt = (
+                pg_insert(NewChatMessage)
+                .values(
+                    thread_id=chat_id,
+                    role=NewChatMessageRole.USER,
+                    content=content_payload,
+                    author_id=author_uuid,
+                    turn_id=turn_id,
+                )
+                .on_conflict_do_nothing(
+                    index_elements=["thread_id", "turn_id", "role"],
+                    index_where=sa_text("turn_id IS NOT NULL"),
+                )
+                .returning(NewChatMessage.id)
+            )
+            inserted_id = (await ws.execute(insert_stmt)).scalar()
+
+            if inserted_id is None:
+                # Conflict on partial unique index — another writer
+                # (legacy FE appendMessage, redelivered request, etc.)
+                # already persisted this row. Look it up and reuse.
+                lookup = await ws.execute(
+                    select(NewChatMessage.id).where(
+                        NewChatMessage.thread_id == chat_id,
+                        NewChatMessage.turn_id == turn_id,
+                        NewChatMessage.role == NewChatMessageRole.USER,
+                    )
+                )
+                existing_id = lookup.scalars().first()
+                if existing_id is None:
+                    # Conflict reported but no row found — extremely
+                    # unlikely (concurrent DELETE). Surface as failure.
+                    logger.warning(
+                        "persist_user_turn: conflict but no matching row "
+                        "(chat_id=%s, turn_id=%s)",
+                        chat_id,
+                        turn_id,
+                    )
+                    outcome = "integrity_no_match"
+                    return None
+                resolved_id = int(existing_id)
+                outcome = "race_recovered"
+            else:
+                resolved_id = int(inserted_id)
+                outcome = "inserted"
+                # Bump thread.updated_at only on a real insert — when
+                # we recovered an existing row the prior writer
+                # already touched the thread.
+                if thread is not None:
+                    thread.updated_at = datetime.now(UTC)
+
+            await ws.commit()
+            return resolved_id
+    except Exception:
+        logger.exception(
+            "persist_user_turn failed (chat_id=%s, turn_id=%s)",
+            chat_id,
+            turn_id,
+        )
+        return None
+    finally:
+        _perf_log.info(
+            "[persist_user_turn] outcome=%s chat_id=%s turn_id=%s "
+            "message_id=%s query_len=%d images=%d mentioned_docs=%d "
+            "in %.3fs",
+            outcome,
+            chat_id,
+            turn_id,
+            resolved_id,
+            len(user_query or ""),
+            len(user_image_data_urls or ()),
+            len(mentioned_documents or ()),
+            time.perf_counter() - t0,
+        )
+
+
+async def persist_assistant_shell(
+    *,
+    chat_id: int,
+    user_id: str | None,
+    turn_id: str,
+) -> int | None:
+    """Pre-write an empty assistant row for the turn and return its id.
+
+    Inserts a placeholder ``new_chat_messages`` row (empty text content) so
+    the streaming layer has a stable ``message_id`` to correlate against
+    for the rest of the turn. ``finalize_assistant_turn`` overwrites the
+    ``content`` field at end-of-stream with the rich ``ContentPart[]``
+    snapshot produced by ``AssistantContentBuilder``.
+
+    Returns the row id on success, ``None`` on a genuine DB failure (caller
+    should abort the turn rather than stream into a void).
+
+    Idempotent against the ``(thread_id, turn_id, ASSISTANT)`` partial unique
+    index from migration 141: if a row already exists (resume retry, racing
+    legacy frontend, redelivered request, etc.) we look it up by
+    ``(thread_id, turn_id, role)`` and return its existing id. The streaming
+    layer is then free to UPDATE that row at finalize time.
+    """
+    if not turn_id:
+        logger.error(
+            "persist_assistant_shell called without a turn_id (chat_id=%s); skipping",
+            chat_id,
+        )
+        return None
+
+    t0 = time.perf_counter()
+    outcome = "failed"
+    resolved_id: int | None = None
+    try:
+        async with shielded_async_session() as ws:
+            insert_stmt = (
+                pg_insert(NewChatMessage)
+                .values(
+                    thread_id=chat_id,
+                    role=NewChatMessageRole.ASSISTANT,
+                    content=_EMPTY_SHELL_CONTENT,
+                    author_id=None,
+                    turn_id=turn_id,
+                )
+                .on_conflict_do_nothing(
+                    index_elements=["thread_id", "turn_id", "role"],
+                    index_where=sa_text("turn_id IS NOT NULL"),
+                )
+                .returning(NewChatMessage.id)
+            )
+            inserted_id = (await ws.execute(insert_stmt)).scalar()
+
+            if inserted_id is None:
+                # Conflict — another writer (legacy FE appendMessage,
+                # resume retry, redelivered request) wrote the
+                # (thread_id, turn_id, ASSISTANT) row first. Look it up
+                # so the streaming layer can UPDATE the same row at
+                # finalize time.
+                lookup = await ws.execute(
+                    select(NewChatMessage.id).where(
+                        NewChatMessage.thread_id == chat_id,
+                        NewChatMessage.turn_id == turn_id,
+                        NewChatMessage.role == NewChatMessageRole.ASSISTANT,
+                    )
+                )
+                existing_id = lookup.scalars().first()
+                if existing_id is None:
+                    logger.warning(
+                        "persist_assistant_shell: conflict but no matching "
+                        "(thread_id, turn_id, role) row found "
+                        "(chat_id=%s, turn_id=%s)",
+                        chat_id,
+                        turn_id,
+                    )
+                    outcome = "integrity_no_match"
+                    return None
+                resolved_id = int(existing_id)
+                outcome = "race_recovered"
+            else:
+                resolved_id = int(inserted_id)
+                outcome = "inserted"
+
+            await ws.commit()
+            return resolved_id
+    except Exception:
+        logger.exception(
+            "persist_assistant_shell failed (chat_id=%s, turn_id=%s)",
+            chat_id,
+            turn_id,
+        )
+        return None
+    finally:
+        _perf_log.info(
+            "[persist_assistant_shell] outcome=%s chat_id=%s turn_id=%s "
+            "message_id=%s in %.3fs",
+            outcome,
+            chat_id,
+            turn_id,
+            resolved_id,
+            time.perf_counter() - t0,
+        )
+
+
+async def finalize_assistant_turn(
+    *,
+    message_id: int,
+    chat_id: int,
+    search_space_id: int,
+    user_id: str | None,
+    turn_id: str,
+    content: list[dict[str, Any]],
+    accumulator: TurnTokenAccumulator | None,
+) -> None:
+    """Finalize the assistant row and write its token_usage.
+
+    Two writes in a single shielded session:
+
+    1. ``UPDATE new_chat_messages SET content = :c, updated_at = now()
+       WHERE id = :id`` — overwrites the placeholder ``persist_assistant_shell``
+       wrote with the full ``ContentPart[]`` snapshot produced server-side.
+    2. ``INSERT INTO token_usage (...) VALUES (...) ON CONFLICT (message_id)
+       WHERE message_id IS NOT NULL DO NOTHING`` — uses the partial unique
+       index ``uq_token_usage_message_id`` from migration 142 to make the
+       insert idempotent against ``append_message``'s recovery branch
+       (which uses the same ON CONFLICT clause).
+
+    Substitutes the status-marker payload when ``content`` is empty
+    (pure tool-call turn that aborted before any output, or interrupt
+    before any event arrived). The status marker is preferable to a
+    blank text bubble because token accounting still runs and an ops
+    dashboard can flag the row.
+
+    Best-effort — never raises. The streaming ``finally`` calls this
+    from within ``anyio.CancelScope(shield=True)``; any raised exception
+    here would mask the real error that triggered the cleanup.
+    """
+    if not turn_id:
+        logger.error(
+            "finalize_assistant_turn called without turn_id "
+            "(chat_id=%s, message_id=%s); skipping",
+            chat_id,
+            message_id,
+        )
+        return
+    if not message_id:
+        logger.error(
+            "finalize_assistant_turn called without message_id "
+            "(chat_id=%s, turn_id=%s); skipping",
+            chat_id,
+            turn_id,
+        )
+        return
+
+    payload: list[dict[str, Any]]
+    is_status_marker = False
+    if content:
+        payload = content
+    else:
+        payload = _STATUS_NO_RESPONSE
+        is_status_marker = True
+
+    t0 = time.perf_counter()
+    outcome = "failed"
+    token_usage_attempted = bool(
+        accumulator is not None and accumulator.calls and user_id
+    )
+    try:
+        async with shielded_async_session() as ws:
+            assistant_row = await ws.get(NewChatMessage, message_id)
+            if assistant_row is None:
+                logger.warning(
+                    "finalize_assistant_turn: row not found "
+                    "(chat_id=%s, message_id=%s, turn_id=%s); skipping",
+                    chat_id,
+                    message_id,
+                    turn_id,
+                )
+                outcome = "row_missing"
+                return
+
+            assistant_row.content = payload
+            assistant_row.updated_at = datetime.now(UTC)
+
+            # Token usage. ``record_token_usage`` (used elsewhere) does
+            # SELECT-then-INSERT in two statements which races with
+            # ``append_message``. Switch to a single INSERT ... ON
+            # CONFLICT DO NOTHING keyed on the migration-142 partial
+            # unique index so the loser silently drops its write at
+            # the DB level — exactly one row per ``message_id``,
+            # regardless of which session committed first.
+            if accumulator is not None and accumulator.calls and user_id:
+                try:
+                    user_uuid = UUID(user_id)
+                except (TypeError, ValueError):
+                    logger.warning(
+                        "finalize_assistant_turn: invalid user_id=%r, "
+                        "skipping token_usage row",
+                        user_id,
+                    )
+                else:
+                    insert_stmt = (
+                        pg_insert(TokenUsage)
+                        .values(
+                            usage_type="chat",
+                            prompt_tokens=accumulator.total_prompt_tokens,
+                            completion_tokens=accumulator.total_completion_tokens,
+                            total_tokens=accumulator.grand_total,
+                            cost_micros=accumulator.total_cost_micros,
+                            model_breakdown=accumulator.per_message_summary(),
+                            call_details={"calls": accumulator.serialized_calls()},
+                            thread_id=chat_id,
+                            message_id=message_id,
+                            search_space_id=search_space_id,
+                            user_id=user_uuid,
+                        )
+                        .on_conflict_do_nothing(
+                            index_elements=["message_id"],
+                            index_where=sa_text("message_id IS NOT NULL"),
+                        )
+                    )
+                    await ws.execute(insert_stmt)
+
+            await ws.commit()
+            outcome = "ok"
+    except Exception:
+        logger.exception(
+            "finalize_assistant_turn failed (chat_id=%s, message_id=%s, turn_id=%s)",
+            chat_id,
+            message_id,
+            turn_id,
+        )
+    finally:
+        _perf_log.info(
+            "[finalize_assistant_turn] outcome=%s chat_id=%s message_id=%s "
+            "turn_id=%s parts=%d status_marker=%s "
+            "token_usage_attempted=%s in %.3fs",
+            outcome,
+            chat_id,
+            message_id,
+            turn_id,
+            len(payload),
+            is_status_marker,
+            token_usage_attempted,
+            time.perf_counter() - t0,
+        )
diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index 5ff9074597..9a69b61646 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -9,33 +9,48 @@
 - NewLLMConfig database table (positive IDs for user-created configs with prompt settings)
 """
 
-import ast
 import asyncio
 import contextlib
 import gc
 import json
 import logging
-import re
 import time
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass, field
-from typing import Any
+from functools import partial
+from typing import Any, Literal
 from uuid import UUID
 
 import anyio
 from langchain_core.messages import HumanMessage
-from sqlalchemy import func
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
 
+from app.agents.multi_agent_chat import create_multi_agent_chat_deep_agent
 from app.agents.new_chat.chat_deepagent import create_surfsense_deep_agent
 from app.agents.new_chat.checkpointer import get_checkpointer
+from app.agents.new_chat.context import SurfSenseContextSchema
+from app.agents.new_chat.errors import BusyError
+from app.agents.new_chat.filesystem_selection import FilesystemMode, FilesystemSelection
 from app.agents.new_chat.llm_config import (
     AgentConfig,
     create_chat_litellm_from_agent_config,
     create_chat_litellm_from_config,
     load_agent_config,
-    load_llm_config_from_yaml,
+    load_global_llm_config_by_id,
+)
+from app.agents.new_chat.memory_extraction import (
+    extract_and_save_memory,
+    extract_and_save_team_memory,
+)
+from app.agents.new_chat.mention_resolver import resolve_mentions, substitute_in_text
+from app.agents.new_chat.middleware.busy_mutex import (
+    end_turn,
+    get_cancel_state,
+    is_cancel_requested,
+)
+from app.agents.new_chat.middleware.kb_persistence import (
+    commit_staged_filesystem_state,
 )
 from app.db import (
     ChatVisibility,
@@ -47,19 +62,144 @@
     async_session_maker,
     shielded_async_session,
 )
-from app.prompts import TITLE_GENERATION_PROMPT_TEMPLATE
+from app.prompts import TITLE_GENERATION_PROMPT
+from app.services.auto_model_pin_service import (
+    is_recently_healthy,
+    mark_healthy,
+    mark_runtime_cooldown,
+    resolve_or_get_pinned_llm_config_id,
+)
 from app.services.chat_session_state_service import (
     clear_ai_responding,
     set_ai_responding,
 )
 from app.services.connector_service import ConnectorService
 from app.services.new_streaming_service import VercelStreamingService
+from app.tasks.chat.streaming.graph_stream.event_stream import stream_output
+from app.tasks.chat.streaming.helpers.interrupt_inspector import (
+    all_interrupt_values,
+)
 from app.utils.content_utils import bootstrap_history_from_db
 from app.utils.perf import get_perf_logger, log_system_snapshot, trim_native_heap
+from app.utils.surfsense_docs import surfsense_docs_public_url
+from app.utils.user_message_multimodal import build_human_message_content
 
+_background_tasks: set[asyncio.Task] = set()
 _perf_log = get_perf_logger()
+logger = logging.getLogger(__name__)
 
-_background_tasks: set[asyncio.Task] = set()
+TURN_CANCELLING_INITIAL_DELAY_MS = 200
+TURN_CANCELLING_BACKOFF_FACTOR = 2
+TURN_CANCELLING_MAX_DELAY_MS = 1500
+
+
+def _resume_step_prefix(turn_id: str) -> str:
+    """Build the per-turn ``step_prefix`` for a resume invocation.
+
+    Each ``_stream_agent_events`` call constructs a fresh
+    :class:`AgentEventRelayState` with ``thinking_step_counter=0``, so two
+    consecutive resume turns would otherwise both emit ``thinking-resume-1``,
+    ``-2`` etc. The frontend rehydrates ``currentThinkingSteps`` from the
+    immediate prior assistant message at the start of every resume — if the
+    new stream's IDs collide with the seeded ones, React renders sibling
+    Timeline rows with the same key. Salting with ``turn_id`` guarantees
+    disjoint IDs across resumes within one thread.
+    """
+    return f"thinking-resume-{turn_id}"
+
+
+def _compute_turn_cancelling_retry_delay(attempt: int) -> int:
+    if attempt < 1:
+        attempt = 1
+    delay = TURN_CANCELLING_INITIAL_DELAY_MS * (
+        TURN_CANCELLING_BACKOFF_FACTOR ** (attempt - 1)
+    )
+    return min(delay, TURN_CANCELLING_MAX_DELAY_MS)
+
+
+def _extract_chunk_parts(chunk: Any) -> dict[str, Any]:
+    """Decompose an ``AIMessageChunk`` into typed text/reasoning/tool-call parts.
+
+    Returns a dict with three keys:
+
+    * ``text`` — concatenated string content (empty string if the chunk
+      contributes none).
+    * ``reasoning`` — concatenated reasoning content (empty string if the
+      chunk contributes none).
+    * ``tool_call_chunks`` — flat list of LangChain ``tool_call_chunk``
+      dicts surfaced from either the typed-block list or the
+      ``tool_call_chunks`` attribute.
+
+    Background
+    ----------
+    ``AIMessageChunk.content`` can be:
+
+    * a ``str`` (most providers), or
+    * a ``list`` of typed blocks ``{type: 'text' | 'reasoning' |
+      'tool_call_chunk' | 'tool_use' | ..., text/content/...}`` for
+      Anthropic, Bedrock, and several reasoning configurations.
+
+    Reasoning may also live under
+    ``chunk.additional_kwargs['reasoning_content']`` (some providers
+    surface it that way instead of as a typed block). Tool-call chunks
+    may live under ``chunk.tool_call_chunks`` even when ``content`` is a
+    plain string.
+
+    Earlier versions only handled the ``isinstance(content, str)`` branch
+    and silently dropped reasoning blocks + tool-call chunks emitted by
+    LangChain ``AIMessageChunk``s.
+    """
+    out: dict[str, Any] = {"text": "", "reasoning": "", "tool_call_chunks": []}
+    if chunk is None:
+        return out
+
+    content = getattr(chunk, "content", None)
+    if isinstance(content, str):
+        if content:
+            out["text"] = content
+    elif isinstance(content, list):
+        text_parts: list[str] = []
+        reasoning_parts: list[str] = []
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            block_type = block.get("type")
+            if block_type == "text":
+                value = block.get("text") or block.get("content") or ""
+                if isinstance(value, str) and value:
+                    text_parts.append(value)
+            elif block_type == "reasoning":
+                value = (
+                    block.get("reasoning")
+                    or block.get("text")
+                    or block.get("content")
+                    or ""
+                )
+                if isinstance(value, str) and value:
+                    reasoning_parts.append(value)
+            elif block_type in ("tool_call_chunk", "tool_use"):
+                out["tool_call_chunks"].append(block)
+        if text_parts:
+            out["text"] = "".join(text_parts)
+        if reasoning_parts:
+            out["reasoning"] = "".join(reasoning_parts)
+
+    additional = getattr(chunk, "additional_kwargs", None) or {}
+    if isinstance(additional, dict):
+        extra_reasoning = additional.get("reasoning_content")
+        if isinstance(extra_reasoning, str) and extra_reasoning:
+            existing = out["reasoning"]
+            out["reasoning"] = (
+                (existing + extra_reasoning) if existing else extra_reasoning
+            )
+
+    extra_tool_chunks = getattr(chunk, "tool_call_chunks", None)
+    if isinstance(extra_tool_chunks, list):
+        for tcc in extra_tool_chunks:
+            if isinstance(tcc, dict):
+                out["tool_call_chunks"].append(tcc)
+
+    return out
 
 
 def format_mentioned_surfsense_docs_as_context(
@@ -77,14 +217,17 @@ def format_mentioned_surfsense_docs_as_context(
     )
 
     for doc in documents:
-        metadata_json = json.dumps({"source": doc.source}, ensure_ascii=False)
+        public_url = surfsense_docs_public_url(doc.source)
+        metadata_json = json.dumps(
+            {"source": doc.source, "public_url": public_url}, ensure_ascii=False
+        )
 
         context_parts.append("<document>")
         context_parts.append("<document_metadata>")
         context_parts.append(f"  <document_id>doc-{doc.id}</document_id>")
         context_parts.append("  <document_type>SURFSENSE_DOCS</document_type>")
         context_parts.append(f"  <title><![CDATA[{doc.title}]]></title>")
-        context_parts.append(f"  <url><![CDATA[{doc.source}]]></url>")
+        context_parts.append(f"  <url><![CDATA[{public_url}]]></url>")
         context_parts.append(
             f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
         )
@@ -139,8 +282,458 @@ def extract_todos_from_deepagents(command_output) -> dict:
 class StreamResult:
     accumulated_text: str = ""
     is_interrupted: bool = False
-    interrupt_value: dict[str, Any] | None = None
-    sandbox_files: list[str] = field(default_factory=list)  # unused, kept for compat
+    sandbox_files: list[str] = field(default_factory=list)
+    agent_called_update_memory: bool = False
+    request_id: str | None = None
+    turn_id: str = ""
+    filesystem_mode: str = "cloud"
+    client_platform: str = "web"
+    intent_detected: str = "chat_only"
+    intent_confidence: float = 0.0
+    write_attempted: bool = False
+    write_succeeded: bool = False
+    verification_succeeded: bool = False
+    commit_gate_passed: bool = True
+    commit_gate_reason: str = ""
+    # Pre-allocated assistant ``new_chat_messages.id`` for this turn,
+    # captured by ``persist_assistant_shell`` right after the user row is
+    # persisted. ``None`` for the legacy / anonymous code paths that don't
+    # opt in to server-side ``ContentPart[]`` projection.
+    assistant_message_id: int | None = None
+    # In-memory mirror of the FE's assistant-ui ``ContentPartsState``,
+    # populated by the lifecycle methods called from ``_stream_agent_events``
+    # at each ``streaming_service.format_*`` yield site. Snapshot in the
+    # streaming ``finally`` to produce the rich JSONB persisted by
+    # ``finalize_assistant_turn``. ``repr=False`` keeps the
+    # log-on-error path (``StreamResult`` is logged in some error
+    # branches) from dumping a potentially-large parts list.
+    content_builder: Any | None = field(default=None, repr=False)
+
+
+def _safe_float(value: Any, default: float = 0.0) -> float:
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return default
+
+
+def _tool_output_to_text(tool_output: Any) -> str:
+    if isinstance(tool_output, dict):
+        if isinstance(tool_output.get("result"), str):
+            return tool_output["result"]
+        if isinstance(tool_output.get("error"), str):
+            return tool_output["error"]
+        return json.dumps(tool_output, ensure_ascii=False)
+    return str(tool_output)
+
+
+def _tool_output_has_error(tool_output: Any) -> bool:
+    if isinstance(tool_output, dict):
+        if tool_output.get("error"):
+            return True
+        result = tool_output.get("result")
+        return bool(
+            isinstance(result, str) and result.strip().lower().startswith("error:")
+        )
+    if isinstance(tool_output, str):
+        return tool_output.strip().lower().startswith("error:")
+    return False
+
+
+def _extract_resolved_file_path(
+    *, tool_name: str, tool_output: Any, tool_input: Any | None = None
+) -> str | None:
+    if isinstance(tool_output, dict):
+        path_value = tool_output.get("path")
+        if isinstance(path_value, str) and path_value.strip():
+            return path_value.strip()
+    if tool_name in ("write_file", "edit_file") and isinstance(tool_input, dict):
+        file_path = tool_input.get("file_path")
+        if isinstance(file_path, str) and file_path.strip():
+            return file_path.strip()
+    return None
+
+
+def _contract_enforcement_active(result: StreamResult) -> bool:
+    # Keep policy deterministic with no env-driven progression modes:
+    # enforce the file-operation contract only in desktop local-folder mode.
+    return result.filesystem_mode == "desktop_local_folder"
+
+
+def _evaluate_file_contract_outcome(result: StreamResult) -> tuple[bool, str]:
+    if result.intent_detected != "file_write":
+        return True, ""
+    if not result.write_attempted:
+        return False, "no_write_attempt"
+    if not result.write_succeeded:
+        return False, "write_failed"
+    if not result.verification_succeeded:
+        return False, "verification_failed"
+    return True, ""
+
+
+def _log_file_contract(stage: str, result: StreamResult, **extra: Any) -> None:
+    payload: dict[str, Any] = {
+        "stage": stage,
+        "request_id": result.request_id or "unknown",
+        "turn_id": result.turn_id or "unknown",
+        "chat_id": result.turn_id.split(":", 1)[0]
+        if ":" in result.turn_id
+        else "unknown",
+        "filesystem_mode": result.filesystem_mode,
+        "client_platform": result.client_platform,
+        "intent_detected": result.intent_detected,
+        "intent_confidence": result.intent_confidence,
+        "write_attempted": result.write_attempted,
+        "write_succeeded": result.write_succeeded,
+        "verification_succeeded": result.verification_succeeded,
+        "commit_gate_passed": result.commit_gate_passed,
+        "commit_gate_reason": result.commit_gate_reason or None,
+    }
+    payload.update(extra)
+    _perf_log.info(
+        "[file_operation_contract] %s", json.dumps(payload, ensure_ascii=False)
+    )
+
+
+def _log_chat_stream_error(
+    *,
+    flow: Literal["new", "resume", "regenerate"],
+    error_kind: str,
+    error_code: str | None,
+    severity: Literal["info", "warn", "error"],
+    is_expected: bool,
+    request_id: str | None,
+    thread_id: int | None,
+    search_space_id: int | None,
+    user_id: str | None,
+    message: str,
+    extra: dict[str, Any] | None = None,
+) -> None:
+    payload: dict[str, Any] = {
+        "event": "chat_stream_error",
+        "flow": flow,
+        "error_kind": error_kind,
+        "error_code": error_code,
+        "severity": severity,
+        "is_expected": is_expected,
+        "request_id": request_id or "unknown",
+        "thread_id": thread_id,
+        "search_space_id": search_space_id,
+        "user_id": user_id,
+        "message": message,
+    }
+    if extra:
+        payload.update(extra)
+
+    logger = logging.getLogger(__name__)
+    rendered = json.dumps(payload, ensure_ascii=False)
+    if severity == "error":
+        logger.error("[chat_stream_error] %s", rendered)
+    elif severity == "warn":
+        logger.warning("[chat_stream_error] %s", rendered)
+    else:
+        logger.info("[chat_stream_error] %s", rendered)
+
+
+def _parse_error_payload(message: str) -> dict[str, Any] | None:
+    candidates = [message]
+    first_brace_idx = message.find("{")
+    if first_brace_idx >= 0:
+        candidates.append(message[first_brace_idx:])
+
+    for candidate in candidates:
+        try:
+            parsed = json.loads(candidate)
+            if isinstance(parsed, dict):
+                return parsed
+        except Exception:
+            continue
+    return None
+
+
+def _extract_provider_error_code(parsed: dict[str, Any] | None) -> int | None:
+    if not isinstance(parsed, dict):
+        return None
+    candidates: list[Any] = [parsed.get("code")]
+    nested = parsed.get("error")
+    if isinstance(nested, dict):
+        candidates.append(nested.get("code"))
+    for value in candidates:
+        try:
+            if value is None:
+                continue
+            return int(value)
+        except Exception:
+            continue
+    return None
+
+
+def _is_provider_rate_limited(exc: BaseException) -> bool:
+    """Best-effort detection for provider-side runtime throttling.
+
+    Covers LiteLLM/OpenRouter shapes like:
+    - class name contains ``RateLimit``
+    - nested payload ``{"error": {"code": 429}}``
+    - nested payload ``{"error": {"type": "rate_limit_error"}}``
+    """
+    raw = str(exc)
+    lowered = raw.lower()
+    if "ratelimit" in type(exc).__name__.lower():
+        return True
+    parsed = _parse_error_payload(raw)
+    provider_code = _extract_provider_error_code(parsed)
+    if provider_code == 429:
+        return True
+
+    provider_error_type = ""
+    if parsed:
+        top_type = parsed.get("type")
+        if isinstance(top_type, str):
+            provider_error_type = top_type.lower()
+        nested = parsed.get("error")
+        if isinstance(nested, dict):
+            nested_type = nested.get("type")
+            if isinstance(nested_type, str):
+                provider_error_type = nested_type.lower()
+    if provider_error_type == "rate_limit_error":
+        return True
+
+    return (
+        "rate limited" in lowered
+        or "rate-limited" in lowered
+        or "temporarily rate-limited upstream" in lowered
+    )
+
+
+_PREFLIGHT_TIMEOUT_SEC: float = 2.5
+_PREFLIGHT_MAX_TOKENS: int = 1
+
+
+async def _preflight_llm(llm: Any) -> None:
+    """Issue a minimal completion to confirm the pinned model isn't 429'ing.
+
+    Used before agent build / planner / classifier / title-gen so a known-bad
+    free OpenRouter deployment is detected and repinned before it cascades
+    into multiple wasted internal calls. The probe is intentionally cheap:
+    one token, low timeout, tagged ``surfsense:internal`` so token tracking
+    and SSE pipelines treat it as overhead rather than user output.
+
+    Raises the original exception when the provider responds with a
+    rate-limit-shaped error so the caller can drive the cooldown/repin
+    branch via :func:`_is_provider_rate_limited`. Other transient failures
+    are swallowed — the caller continues to the normal stream path and the
+    in-stream recovery loop remains the safety net.
+    """
+    from litellm import acompletion
+
+    model = getattr(llm, "model", None)
+    if not model or model == "auto":
+        # Auto-mode router doesn't have a single deployment to ping; the
+        # router itself handles per-deployment rate-limit accounting.
+        return
+
+    try:
+        await acompletion(
+            model=model,
+            messages=[{"role": "user", "content": "ping"}],
+            api_key=getattr(llm, "api_key", None),
+            api_base=getattr(llm, "api_base", None),
+            max_tokens=_PREFLIGHT_MAX_TOKENS,
+            timeout=_PREFLIGHT_TIMEOUT_SEC,
+            stream=False,
+            metadata={"tags": ["surfsense:internal", "auto-pin-preflight"]},
+        )
+    except Exception as exc:
+        if _is_provider_rate_limited(exc):
+            raise
+        logging.getLogger(__name__).debug(
+            "auto_pin_preflight non_rate_limit_error model=%s err=%s",
+            model,
+            exc,
+        )
+
+
+async def _build_main_agent_for_thread(
+    agent_factory: Any,
+    *,
+    llm: Any,
+    search_space_id: int,
+    db_session: Any,
+    connector_service: ConnectorService,
+    checkpointer: Any,
+    user_id: str | None,
+    thread_id: int | None,
+    agent_config: AgentConfig | None,
+    firecrawl_api_key: str | None,
+    thread_visibility: ChatVisibility | None,
+    filesystem_selection: FilesystemSelection | None,
+    disabled_tools: list[str] | None = None,
+    mentioned_document_ids: list[int] | None = None,
+) -> Any:
+    """Single (re)build path so the agent factory cannot drift across
+    initial build, preflight repin, and mid-stream 429 recovery for one
+    ``thread_id``: a graph swap mid-turn would corrupt checkpointer state."""
+    return await agent_factory(
+        llm=llm,
+        search_space_id=search_space_id,
+        db_session=db_session,
+        connector_service=connector_service,
+        checkpointer=checkpointer,
+        user_id=user_id,
+        thread_id=thread_id,
+        agent_config=agent_config,
+        firecrawl_api_key=firecrawl_api_key,
+        thread_visibility=thread_visibility,
+        filesystem_selection=filesystem_selection,
+        disabled_tools=disabled_tools,
+        mentioned_document_ids=mentioned_document_ids,
+    )
+
+
+async def _settle_speculative_agent_build(task: asyncio.Task[Any]) -> None:
+    """Wait for a discarded speculative agent build to release shared state.
+
+    Used by the parallel preflight + agent-build path. The speculative build
+    closes over the request-scoped ``AsyncSession`` (for the brief connector
+    discovery / tool-factory window before its CPU work moves into a worker
+    thread). If preflight reports a 429 we want to fall back to the original
+    repin → reload → rebuild path, but we MUST NOT touch ``session`` again
+    until any in-flight session work owned by the speculative build has
+    fully settled — :class:`sqlalchemy.ext.asyncio.AsyncSession` is not
+    concurrency-safe and the same hazard cost us a hard ``InvalidRequestError``
+    earlier in this PR (see ``connector_service`` parallel-gather revert).
+
+    We simply ``await`` the task and swallow any exception: in this path the
+    build's outcome is irrelevant — success populates the agent cache (a free
+    side effect), failure is discarded. The wasted CPU is acceptable since
+    429 fallbacks are rare and the original sequential code also paid the
+    full build cost on the same path.
+    """
+    with contextlib.suppress(BaseException):
+        await task
+
+
+def _classify_stream_exception(
+    exc: Exception,
+    *,
+    flow_label: str,
+) -> tuple[
+    str, str, Literal["info", "warn", "error"], bool, str, dict[str, Any] | None
+]:
+    raw = str(exc)
+    if isinstance(exc, BusyError) or "Thread is busy with another request" in raw:
+        busy_thread_id = str(exc.request_id) if isinstance(exc, BusyError) else None
+        if busy_thread_id and is_cancel_requested(busy_thread_id):
+            cancel_state = get_cancel_state(busy_thread_id)
+            attempt = cancel_state[0] if cancel_state else 1
+            retry_after_ms = _compute_turn_cancelling_retry_delay(attempt)
+            retry_after_at = int(time.time() * 1000) + retry_after_ms
+            return (
+                "thread_busy",
+                "TURN_CANCELLING",
+                "info",
+                True,
+                "A previous response is still stopping. Please try again in a moment.",
+                {
+                    "retry_after_ms": retry_after_ms,
+                    "retry_after_at": retry_after_at,
+                },
+            )
+        return (
+            "thread_busy",
+            "THREAD_BUSY",
+            "warn",
+            True,
+            "Another response is still finishing for this thread. Please try again in a moment.",
+            None,
+        )
+
+    if _is_provider_rate_limited(exc):
+        return (
+            "rate_limited",
+            "RATE_LIMITED",
+            "warn",
+            True,
+            "This model is temporarily rate-limited. Please try again in a few seconds or switch models.",
+            None,
+        )
+
+    return (
+        "server_error",
+        "SERVER_ERROR",
+        "error",
+        False,
+        f"Error during {flow_label}: {raw}",
+        None,
+    )
+
+
+def _emit_stream_terminal_error(
+    *,
+    streaming_service: VercelStreamingService,
+    flow: str,
+    request_id: str | None,
+    thread_id: int,
+    search_space_id: int,
+    user_id: str | None,
+    message: str,
+    error_kind: str = "server_error",
+    error_code: str = "SERVER_ERROR",
+    severity: Literal["info", "warn", "error"] = "error",
+    is_expected: bool = False,
+    extra: dict[str, Any] | None = None,
+) -> str:
+    _log_chat_stream_error(
+        flow=flow,
+        error_kind=error_kind,
+        error_code=error_code,
+        severity=severity,
+        is_expected=is_expected,
+        request_id=request_id,
+        thread_id=thread_id,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        message=message,
+        extra=extra,
+    )
+    return streaming_service.format_error(message, error_code=error_code, extra=extra)
+
+
+def _legacy_match_lc_id(
+    pending_tool_call_chunks: list[dict[str, Any]],
+    tool_name: str,
+    run_id: str,
+    lc_tool_call_id_by_run: dict[str, str],
+) -> str | None:
+    """Best-effort match a buffered ``tool_call_chunk`` to a tool name.
+
+    Pure extract of the in-line match used at ``on_tool_start`` when the
+    chunk path didn't register an index for this call. Pops the next
+    id-bearing chunk whose ``name``
+    matches ``tool_name`` (or any id-bearing chunk as a fallback) and
+    returns its id. Mutates ``pending_tool_call_chunks`` and
+    ``lc_tool_call_id_by_run`` in place.
+    """
+    matched_idx: int | None = None
+    for idx, tcc in enumerate(pending_tool_call_chunks):
+        if tcc.get("name") == tool_name and tcc.get("id"):
+            matched_idx = idx
+            break
+    if matched_idx is None:
+        for idx, tcc in enumerate(pending_tool_call_chunks):
+            if tcc.get("id"):
+                matched_idx = idx
+                break
+    if matched_idx is None:
+        return None
+    matched = pending_tool_call_chunks.pop(matched_idx)
+    candidate = matched.get("id")
+    if isinstance(candidate, str) and candidate:
+        if run_id:
+            lc_tool_call_id_by_run[run_id] = candidate
+        return candidate
+    return None
 
 
 async def _stream_agent_events(
@@ -153,6 +746,13 @@ async def _stream_agent_events(
     initial_step_id: str | None = None,
     initial_step_title: str = "",
     initial_step_items: list[str] | None = None,
+    *,
+    fallback_commit_search_space_id: int | None = None,
+    fallback_commit_created_by_id: str | None = None,
+    fallback_commit_filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
+    fallback_commit_thread_id: int | None = None,
+    runtime_context: Any = None,
+    content_builder: Any | None = None,
 ) -> AsyncGenerator[str, None]:
     """Shared async generator that streams and formats astream_events from the agent.
 
@@ -169,955 +769,141 @@ async def _stream_agent_events(
         initial_step_id: If set, the helper inherits an already-active thinking step.
         initial_step_title: Title of the inherited thinking step.
         initial_step_items: Items of the inherited thinking step.
+        content_builder: Optional ``AssistantContentBuilder``. When set, every
+            ``streaming_service.format_*`` yield site also drives the matching
+            builder lifecycle method (``on_text_*``, ``on_reasoning_*``,
+            ``on_tool_*``, ``on_thinking_step``, ``on_step_separator``) so the
+            in-memory ``ContentPart[]`` projection stays in lockstep with what
+            the FE renders live. Pure in-memory accumulation — no DB I/O —
+            consumed by the streaming ``finally`` to produce the rich JSONB
+            persisted via ``finalize_assistant_turn``. ``None`` (the default)
+            is used by the anonymous / legacy code paths and is a no-op.
 
     Yields:
         SSE-formatted strings for each event.
     """
-    accumulated_text = ""
-    current_text_id: str | None = None
-    thinking_step_counter = 1 if initial_step_id else 0
-    tool_step_ids: dict[str, str] = {}
-    completed_step_ids: set[str] = set()
-    last_active_step_id: str | None = initial_step_id
-    last_active_step_title: str = initial_step_title
-    last_active_step_items: list[str] = initial_step_items or []
-    just_finished_tool: bool = False
-    active_tool_depth: int = 0  # Track nesting: >0 means we're inside a tool
-
-    def next_thinking_step_id() -> str:
-        nonlocal thinking_step_counter
-        thinking_step_counter += 1
-        return f"{step_prefix}-{thinking_step_counter}"
-
-    def complete_current_step() -> str | None:
-        nonlocal last_active_step_id
-        if last_active_step_id and last_active_step_id not in completed_step_ids:
-            completed_step_ids.add(last_active_step_id)
-            event = streaming_service.format_thinking_step(
-                step_id=last_active_step_id,
-                title=last_active_step_title,
-                status="completed",
-                items=last_active_step_items if last_active_step_items else None,
-            )
-            last_active_step_id = None
-            return event
-        return None
-
-    async for event in agent.astream_events(input_data, config=config, version="v2"):
-        event_type = event.get("event", "")
-
-        if event_type == "on_chat_model_stream":
-            if active_tool_depth > 0:
-                continue  # Suppress inner-tool LLM tokens from leaking into chat
-            if "surfsense:internal" in event.get("tags", []):
-                continue  # Suppress middleware-internal LLM tokens (e.g. KB search classification)
-            chunk = event.get("data", {}).get("chunk")
-            if chunk and hasattr(chunk, "content"):
-                content = chunk.content
-                if content and isinstance(content, str):
-                    if current_text_id is None:
-                        completion_event = complete_current_step()
-                        if completion_event:
-                            yield completion_event
-                        if just_finished_tool:
-                            last_active_step_id = None
-                            last_active_step_title = ""
-                            last_active_step_items = []
-                            just_finished_tool = False
-                        current_text_id = streaming_service.generate_text_id()
-                        yield streaming_service.format_text_start(current_text_id)
-                    yield streaming_service.format_text_delta(current_text_id, content)
-                    accumulated_text += content
-
-        elif event_type == "on_tool_start":
-            active_tool_depth += 1
-            tool_name = event.get("name", "unknown_tool")
-            run_id = event.get("run_id", "")
-            tool_input = event.get("data", {}).get("input", {})
-
-            if current_text_id is not None:
-                yield streaming_service.format_text_end(current_text_id)
-                current_text_id = None
-
-            if last_active_step_title != "Synthesizing response":
-                completion_event = complete_current_step()
-                if completion_event:
-                    yield completion_event
-
-            just_finished_tool = False
-            tool_step_id = next_thinking_step_id()
-            tool_step_ids[run_id] = tool_step_id
-            last_active_step_id = tool_step_id
-
-            if tool_name == "ls":
-                ls_path = (
-                    tool_input.get("path", "/")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                last_active_step_title = "Listing files"
-                last_active_step_items = [ls_path]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Listing files",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "read_file":
-                fp = (
-                    tool_input.get("file_path", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                display_fp = fp if len(fp) <= 80 else "…" + fp[-77:]
-                last_active_step_title = "Reading file"
-                last_active_step_items = [display_fp]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Reading file",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "write_file":
-                fp = (
-                    tool_input.get("file_path", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                display_fp = fp if len(fp) <= 80 else "…" + fp[-77:]
-                last_active_step_title = "Writing file"
-                last_active_step_items = [display_fp]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Writing file",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "edit_file":
-                fp = (
-                    tool_input.get("file_path", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                display_fp = fp if len(fp) <= 80 else "…" + fp[-77:]
-                last_active_step_title = "Editing file"
-                last_active_step_items = [display_fp]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Editing file",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "glob":
-                pat = (
-                    tool_input.get("pattern", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                base_path = (
-                    tool_input.get("path", "/") if isinstance(tool_input, dict) else "/"
-                )
-                last_active_step_title = "Searching files"
-                last_active_step_items = [f"{pat} in {base_path}"]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Searching files",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "grep":
-                pat = (
-                    tool_input.get("pattern", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                grep_path = (
-                    tool_input.get("path", "") if isinstance(tool_input, dict) else ""
-                )
-                display_pat = pat[:60] + ("…" if len(pat) > 60 else "")
-                last_active_step_title = "Searching content"
-                last_active_step_items = [
-                    f'"{display_pat}"' + (f" in {grep_path}" if grep_path else "")
-                ]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Searching content",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "save_document":
-                doc_title = (
-                    tool_input.get("title", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                display_title = doc_title[:60] + ("…" if len(doc_title) > 60 else "")
-                last_active_step_title = "Saving document"
-                last_active_step_items = [display_title]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Saving document",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "generate_image":
-                prompt = (
-                    tool_input.get("prompt", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                last_active_step_title = "Generating image"
-                last_active_step_items = [
-                    f"Prompt: {prompt[:80]}{'...' if len(prompt) > 80 else ''}"
-                ]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Generating image",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "scrape_webpage":
-                url = (
-                    tool_input.get("url", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                last_active_step_title = "Scraping webpage"
-                last_active_step_items = [
-                    f"URL: {url[:80]}{'...' if len(url) > 80 else ''}"
-                ]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Scraping webpage",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "generate_podcast":
-                podcast_title = (
-                    tool_input.get("podcast_title", "SurfSense Podcast")
-                    if isinstance(tool_input, dict)
-                    else "SurfSense Podcast"
-                )
-                content_len = len(
-                    tool_input.get("source_content", "")
-                    if isinstance(tool_input, dict)
-                    else ""
-                )
-                last_active_step_title = "Generating podcast"
-                last_active_step_items = [
-                    f"Title: {podcast_title}",
-                    f"Content: {content_len:,} characters",
-                    "Preparing audio generation...",
-                ]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Generating podcast",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "generate_report":
-                report_topic = (
-                    tool_input.get("topic", "Report")
-                    if isinstance(tool_input, dict)
-                    else "Report"
-                )
-                is_revision = bool(
-                    isinstance(tool_input, dict) and tool_input.get("parent_report_id")
-                )
-                step_title = "Revising report" if is_revision else "Generating report"
-                last_active_step_title = step_title
-                last_active_step_items = [
-                    f"Topic: {report_topic}",
-                    "Analyzing source content...",
-                ]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title=step_title,
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "execute":
-                cmd = (
-                    tool_input.get("command", "")
-                    if isinstance(tool_input, dict)
-                    else str(tool_input)
-                )
-                display_cmd = cmd[:80] + ("…" if len(cmd) > 80 else "")
-                last_active_step_title = "Running command"
-                last_active_step_items = [f"$ {display_cmd}"]
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title="Running command",
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-            else:
-                last_active_step_title = f"Using {tool_name.replace('_', ' ')}"
-                last_active_step_items = []
-                yield streaming_service.format_thinking_step(
-                    step_id=tool_step_id,
-                    title=last_active_step_title,
-                    status="in_progress",
-                )
+    async for sse in stream_output(
+        agent=agent,
+        config=config,
+        input_data=input_data,
+        streaming_service=streaming_service,
+        result=result,
+        step_prefix=step_prefix,
+        initial_step_id=initial_step_id,
+        initial_step_title=initial_step_title,
+        initial_step_items=initial_step_items,
+        content_builder=content_builder,
+        runtime_context=runtime_context,
+    ):
+        yield sse
+
+    accumulated_text = result.accumulated_text
 
-            tool_call_id = (
-                f"call_{run_id[:32]}"
-                if run_id
-                else streaming_service.generate_tool_call_id()
-            )
-            yield streaming_service.format_tool_input_start(tool_call_id, tool_name)
-            # Sanitize tool_input: strip runtime-injected non-serializable
-            # values (e.g. LangChain ToolRuntime) before sending over SSE.
-            if isinstance(tool_input, dict):
-                _safe_input: dict[str, Any] = {}
-                for _k, _v in tool_input.items():
-                    try:
-                        json.dumps(_v)
-                        _safe_input[_k] = _v
-                    except (TypeError, ValueError, OverflowError):
-                        pass
-            else:
-                _safe_input = {"input": tool_input}
-            yield streaming_service.format_tool_input_available(
-                tool_call_id,
-                tool_name,
-                _safe_input,
+    state = await agent.aget_state(config)
+    state_values = getattr(state, "values", {}) or {}
+
+    # Safety net: if astream_events was cancelled before
+    # KnowledgeBasePersistenceMiddleware.aafter_agent ran, any staged work
+    # (dirty_paths / staged_dirs / pending_moves / pending_deletes /
+    # pending_dir_deletes) will still be in the checkpointed state. Run
+    # the SAME shared commit helper here so the turn's writes don't get
+    # lost on client disconnect, then push the delta back into the graph
+    # using `as_node=...` so reducers fire as if the after_agent hook
+    # produced it.
+    if (
+        fallback_commit_filesystem_mode == FilesystemMode.CLOUD
+        and fallback_commit_search_space_id is not None
+        and (
+            (state_values.get("dirty_paths") or [])
+            or (state_values.get("staged_dirs") or [])
+            or (state_values.get("pending_moves") or [])
+            or (state_values.get("pending_deletes") or [])
+            or (state_values.get("pending_dir_deletes") or [])
+        )
+    ):
+        try:
+            delta = await commit_staged_filesystem_state(
+                state_values,
+                search_space_id=fallback_commit_search_space_id,
+                created_by_id=fallback_commit_created_by_id,
+                filesystem_mode=fallback_commit_filesystem_mode,
+                thread_id=fallback_commit_thread_id,
+                dispatch_events=False,
             )
+            if delta:
+                await agent.aupdate_state(
+                    config,
+                    delta,
+                    as_node="KnowledgeBasePersistenceMiddleware.after_agent",
+                )
+        except Exception as exc:
+            _perf_log.warning("[stream_new_chat] safety-net commit failed: %s", exc)
+
+    contract_state = state_values.get("file_operation_contract") or {}
+    contract_turn_id = contract_state.get("turn_id")
+    current_turn_id = config.get("configurable", {}).get("turn_id", "")
+    intent_value = contract_state.get("intent")
+    if (
+        isinstance(intent_value, str)
+        and intent_value in ("chat_only", "file_write", "file_read")
+        and contract_turn_id == current_turn_id
+    ):
+        result.intent_detected = intent_value
+    if (
+        isinstance(intent_value, str)
+        and intent_value
+        in (
+            "chat_only",
+            "file_write",
+            "file_read",
+        )
+        and contract_turn_id != current_turn_id
+    ):
+        # Ignore stale intent contracts from previous turns/checkpoints.
+        result.intent_detected = "chat_only"
+    result.intent_confidence = (
+        _safe_float(contract_state.get("confidence"), default=0.0)
+        if contract_turn_id == current_turn_id
+        else 0.0
+    )
 
-        elif event_type == "on_tool_end":
-            active_tool_depth = max(0, active_tool_depth - 1)
-            run_id = event.get("run_id", "")
-            tool_name = event.get("name", "unknown_tool")
-            raw_output = event.get("data", {}).get("output", "")
-
-            if hasattr(raw_output, "content"):
-                content = raw_output.content
-                if isinstance(content, str):
-                    try:
-                        tool_output = json.loads(content)
-                    except (json.JSONDecodeError, TypeError):
-                        tool_output = {"result": content}
-                elif isinstance(content, dict):
-                    tool_output = content
-                else:
-                    tool_output = {"result": str(content)}
-            elif isinstance(raw_output, dict):
-                tool_output = raw_output
-            else:
-                tool_output = {"result": str(raw_output) if raw_output else "completed"}
-
-            tool_call_id = f"call_{run_id[:32]}" if run_id else "call_unknown"
-            original_step_id = tool_step_ids.get(
-                run_id, f"{step_prefix}-unknown-{run_id[:8]}"
+    if result.intent_detected == "file_write":
+        result.commit_gate_passed, result.commit_gate_reason = (
+            _evaluate_file_contract_outcome(result)
+        )
+        if not result.commit_gate_passed and _contract_enforcement_active(result):
+            gate_notice = (
+                "I could not complete the requested file write because no successful "
+                "write_file/edit_file operation was confirmed."
             )
-            completed_step_ids.add(original_step_id)
-
-            if tool_name == "read_file":
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Reading file",
-                    status="completed",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "write_file":
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Writing file",
-                    status="completed",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "edit_file":
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Editing file",
-                    status="completed",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "glob":
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Searching files",
-                    status="completed",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "grep":
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Searching content",
-                    status="completed",
-                    items=last_active_step_items,
-                )
-            elif tool_name == "save_document":
-                result_str = (
-                    tool_output.get("result", "")
-                    if isinstance(tool_output, dict)
-                    else str(tool_output)
-                )
-                is_error = "Error" in result_str
-                completed_items = [
-                    *last_active_step_items,
-                    result_str[:80] if is_error else "Saved to knowledge base",
-                ]
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Saving document",
-                    status="completed",
-                    items=completed_items,
-                )
-            elif tool_name == "generate_image":
-                if isinstance(tool_output, dict) and not tool_output.get("error"):
-                    completed_items = [
-                        *last_active_step_items,
-                        "Image generated successfully",
-                    ]
-                else:
-                    error_msg = (
-                        tool_output.get("error", "Generation failed")
-                        if isinstance(tool_output, dict)
-                        else "Generation failed"
-                    )
-                    completed_items = [*last_active_step_items, f"Error: {error_msg}"]
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Generating image",
-                    status="completed",
-                    items=completed_items,
-                )
-            elif tool_name == "scrape_webpage":
-                if isinstance(tool_output, dict):
-                    title = tool_output.get("title", "Webpage")
-                    word_count = tool_output.get("word_count", 0)
-                    has_error = "error" in tool_output
-                    if has_error:
-                        completed_items = [
-                            *last_active_step_items,
-                            f"Error: {tool_output.get('error', 'Failed to scrape')[:50]}",
-                        ]
-                    else:
-                        completed_items = [
-                            *last_active_step_items,
-                            f"Title: {title[:50]}{'...' if len(title) > 50 else ''}",
-                            f"Extracted: {word_count:,} words",
-                        ]
-                else:
-                    completed_items = [*last_active_step_items, "Content extracted"]
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Scraping webpage",
-                    status="completed",
-                    items=completed_items,
-                )
-            elif tool_name == "generate_podcast":
-                podcast_status = (
-                    tool_output.get("status", "unknown")
-                    if isinstance(tool_output, dict)
-                    else "unknown"
-                )
-                podcast_title = (
-                    tool_output.get("title", "Podcast")
-                    if isinstance(tool_output, dict)
-                    else "Podcast"
-                )
-                if podcast_status == "processing":
-                    completed_items = [
-                        f"Title: {podcast_title}",
-                        "Audio generation started",
-                        "Processing in background...",
-                    ]
-                elif podcast_status == "already_generating":
-                    completed_items = [
-                        f"Title: {podcast_title}",
-                        "Podcast already in progress",
-                        "Please wait for it to complete",
-                    ]
-                elif podcast_status == "error":
-                    error_msg = (
-                        tool_output.get("error", "Unknown error")
-                        if isinstance(tool_output, dict)
-                        else "Unknown error"
-                    )
-                    completed_items = [
-                        f"Title: {podcast_title}",
-                        f"Error: {error_msg[:50]}",
-                    ]
-                else:
-                    completed_items = last_active_step_items
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Generating podcast",
-                    status="completed",
-                    items=completed_items,
-                )
-            elif tool_name == "generate_video_presentation":
-                vp_status = (
-                    tool_output.get("status", "unknown")
-                    if isinstance(tool_output, dict)
-                    else "unknown"
-                )
-                vp_title = (
-                    tool_output.get("title", "Presentation")
-                    if isinstance(tool_output, dict)
-                    else "Presentation"
-                )
-                if vp_status in ("pending", "generating"):
-                    completed_items = [
-                        f"Title: {vp_title}",
-                        "Presentation generation started",
-                        "Processing in background...",
-                    ]
-                elif vp_status == "failed":
-                    error_msg = (
-                        tool_output.get("error", "Unknown error")
-                        if isinstance(tool_output, dict)
-                        else "Unknown error"
-                    )
-                    completed_items = [
-                        f"Title: {vp_title}",
-                        f"Error: {error_msg[:50]}",
-                    ]
-                else:
-                    completed_items = last_active_step_items
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Generating video presentation",
-                    status="completed",
-                    items=completed_items,
-                )
-            elif tool_name == "generate_report":
-                report_status = (
-                    tool_output.get("status", "unknown")
-                    if isinstance(tool_output, dict)
-                    else "unknown"
-                )
-                report_title = (
-                    tool_output.get("title", "Report")
-                    if isinstance(tool_output, dict)
-                    else "Report"
-                )
-                word_count = (
-                    tool_output.get("word_count", 0)
-                    if isinstance(tool_output, dict)
-                    else 0
-                )
-                is_revision = (
-                    tool_output.get("is_revision", False)
-                    if isinstance(tool_output, dict)
-                    else False
-                )
-                step_title = "Revising report" if is_revision else "Generating report"
-
-                if report_status == "ready":
-                    completed_items = [
-                        f"Topic: {report_title}",
-                        f"{word_count:,} words",
-                        "Report ready",
-                    ]
-                elif report_status == "failed":
-                    error_msg = (
-                        tool_output.get("error", "Unknown error")
-                        if isinstance(tool_output, dict)
-                        else "Unknown error"
-                    )
-                    completed_items = [
-                        f"Topic: {report_title}",
-                        f"Error: {error_msg[:50]}",
-                    ]
-                else:
-                    completed_items = last_active_step_items
-
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title=step_title,
-                    status="completed",
-                    items=completed_items,
-                )
-            elif tool_name == "execute":
-                raw_text = (
-                    tool_output.get("result", "")
-                    if isinstance(tool_output, dict)
-                    else str(tool_output)
-                )
-                m = re.match(r"^Exit code:\s*(\d+)", raw_text)
-                exit_code_val = int(m.group(1)) if m else None
-                if exit_code_val is not None and exit_code_val == 0:
-                    completed_items = [
-                        *last_active_step_items,
-                        "Completed successfully",
-                    ]
-                elif exit_code_val is not None:
-                    completed_items = [
-                        *last_active_step_items,
-                        f"Exit code: {exit_code_val}",
-                    ]
-                else:
-                    completed_items = [*last_active_step_items, "Finished"]
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Running command",
-                    status="completed",
-                    items=completed_items,
-                )
-            elif tool_name == "ls":
-                if isinstance(tool_output, dict):
-                    ls_output = tool_output.get("result", "")
-                elif isinstance(tool_output, str):
-                    ls_output = tool_output
-                else:
-                    ls_output = str(tool_output) if tool_output else ""
-                file_names: list[str] = []
-                if ls_output:
-                    paths: list[str] = []
-                    try:
-                        parsed = ast.literal_eval(ls_output)
-                        if isinstance(parsed, list):
-                            paths = [str(p) for p in parsed]
-                    except (ValueError, SyntaxError):
-                        paths = [
-                            line.strip()
-                            for line in ls_output.strip().split("\n")
-                            if line.strip()
-                        ]
-                    for p in paths:
-                        name = p.rstrip("/").split("/")[-1]
-                        if name and len(name) <= 40:
-                            file_names.append(name)
-                        elif name:
-                            file_names.append(name[:37] + "...")
-                if file_names:
-                    if len(file_names) <= 5:
-                        completed_items = [f"[{name}]" for name in file_names]
-                    else:
-                        completed_items = [f"[{name}]" for name in file_names[:4]]
-                        completed_items.append(f"(+{len(file_names) - 4} more)")
-                else:
-                    completed_items = ["No files found"]
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title="Listing files",
-                    status="completed",
-                    items=completed_items,
-                )
-            else:
-                yield streaming_service.format_thinking_step(
-                    step_id=original_step_id,
-                    title=f"Using {tool_name.replace('_', ' ')}",
-                    status="completed",
-                    items=last_active_step_items,
-                )
-
-            just_finished_tool = True
-            last_active_step_id = None
-            last_active_step_title = ""
-            last_active_step_items = []
-
-            if tool_name == "generate_podcast":
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    tool_output
-                    if isinstance(tool_output, dict)
-                    else {"result": tool_output},
-                )
-                if (
-                    isinstance(tool_output, dict)
-                    and tool_output.get("status") == "success"
-                ):
-                    yield streaming_service.format_terminal_info(
-                        f"Podcast generated successfully: {tool_output.get('title', 'Podcast')}",
-                        "success",
-                    )
-                else:
-                    error_msg = (
-                        tool_output.get("error", "Unknown error")
-                        if isinstance(tool_output, dict)
-                        else "Unknown error"
-                    )
-                    yield streaming_service.format_terminal_info(
-                        f"Podcast generation failed: {error_msg}",
-                        "error",
-                    )
-            elif tool_name == "generate_video_presentation":
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    tool_output
-                    if isinstance(tool_output, dict)
-                    else {"result": tool_output},
-                )
-                if (
-                    isinstance(tool_output, dict)
-                    and tool_output.get("status") == "pending"
-                ):
-                    yield streaming_service.format_terminal_info(
-                        f"Video presentation queued: {tool_output.get('title', 'Presentation')}",
-                        "success",
-                    )
-                elif (
-                    isinstance(tool_output, dict)
-                    and tool_output.get("status") == "failed"
-                ):
-                    error_msg = (
-                        tool_output.get("error", "Unknown error")
-                        if isinstance(tool_output, dict)
-                        else "Unknown error"
-                    )
-                    yield streaming_service.format_terminal_info(
-                        f"Presentation generation failed: {error_msg}",
-                        "error",
-                    )
-            elif tool_name == "generate_image":
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    tool_output
-                    if isinstance(tool_output, dict)
-                    else {"result": tool_output},
-                )
-                if isinstance(tool_output, dict):
-                    if tool_output.get("error"):
-                        yield streaming_service.format_terminal_info(
-                            f"Image generation failed: {tool_output['error'][:60]}",
-                            "error",
-                        )
-                    else:
-                        yield streaming_service.format_terminal_info(
-                            "Image generated successfully",
-                            "success",
-                        )
-            elif tool_name == "scrape_webpage":
-                if isinstance(tool_output, dict):
-                    display_output = {
-                        k: v for k, v in tool_output.items() if k != "content"
-                    }
-                    if "content" in tool_output:
-                        content = tool_output.get("content", "")
-                        display_output["content_preview"] = (
-                            content[:500] + "..." if len(content) > 500 else content
-                        )
-                    yield streaming_service.format_tool_output_available(
-                        tool_call_id,
-                        display_output,
-                    )
-                else:
-                    yield streaming_service.format_tool_output_available(
-                        tool_call_id,
-                        {"result": tool_output},
-                    )
-                if isinstance(tool_output, dict) and "error" not in tool_output:
-                    title = tool_output.get("title", "Webpage")
-                    word_count = tool_output.get("word_count", 0)
-                    yield streaming_service.format_terminal_info(
-                        f"Scraped: {title[:40]}{'...' if len(title) > 40 else ''} ({word_count:,} words)",
-                        "success",
-                    )
-                else:
-                    error_msg = (
-                        tool_output.get("error", "Failed to scrape")
-                        if isinstance(tool_output, dict)
-                        else "Failed to scrape"
-                    )
-                    yield streaming_service.format_terminal_info(
-                        f"Scrape failed: {error_msg}",
-                        "error",
-                    )
-            elif tool_name == "generate_report":
-                # Stream the full report result so frontend can render the ReportCard
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    tool_output
-                    if isinstance(tool_output, dict)
-                    else {"result": tool_output},
-                )
-                # Send appropriate terminal message based on status
-                if (
-                    isinstance(tool_output, dict)
-                    and tool_output.get("status") == "ready"
-                ):
-                    word_count = tool_output.get("word_count", 0)
-                    yield streaming_service.format_terminal_info(
-                        f"Report generated: {tool_output.get('title', 'Report')} ({word_count:,} words)",
-                        "success",
-                    )
-                else:
-                    error_msg = (
-                        tool_output.get("error", "Unknown error")
-                        if isinstance(tool_output, dict)
-                        else "Unknown error"
-                    )
-                    yield streaming_service.format_terminal_info(
-                        f"Report generation failed: {error_msg}",
-                        "error",
-                    )
-            elif tool_name in (
-                "create_notion_page",
-                "update_notion_page",
-                "delete_notion_page",
-                "create_linear_issue",
-                "update_linear_issue",
-                "delete_linear_issue",
-                "create_google_drive_file",
-                "delete_google_drive_file",
-                "create_onedrive_file",
-                "delete_onedrive_file",
-                "create_dropbox_file",
-                "delete_dropbox_file",
-                "create_gmail_draft",
-                "update_gmail_draft",
-                "send_gmail_email",
-                "trash_gmail_email",
-                "create_calendar_event",
-                "update_calendar_event",
-                "delete_calendar_event",
-                "create_jira_issue",
-                "update_jira_issue",
-                "delete_jira_issue",
-                "create_confluence_page",
-                "update_confluence_page",
-                "delete_confluence_page",
-            ):
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    tool_output
-                    if isinstance(tool_output, dict)
-                    else {"result": tool_output},
-                )
-            elif tool_name == "execute":
-                raw_text = (
-                    tool_output.get("result", "")
-                    if isinstance(tool_output, dict)
-                    else str(tool_output)
-                )
-                exit_code: int | None = None
-                output_text = raw_text
-                m = re.match(r"^Exit code:\s*(\d+)", raw_text)
-                if m:
-                    exit_code = int(m.group(1))
-                    om = re.search(r"\nOutput:\n([\s\S]*)", raw_text)
-                    output_text = om.group(1) if om else ""
-                thread_id_str = config.get("configurable", {}).get("thread_id", "")
-
-                for sf_match in re.finditer(
-                    r"^SANDBOX_FILE:\s*(.+)$", output_text, re.MULTILINE
-                ):
-                    fpath = sf_match.group(1).strip()
-                    if fpath and fpath not in result.sandbox_files:
-                        result.sandbox_files.append(fpath)
-
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    {
-                        "exit_code": exit_code,
-                        "output": output_text,
-                        "thread_id": thread_id_str,
-                    },
-                )
-            elif tool_name == "web_search":
-                xml = (
-                    tool_output.get("result", str(tool_output))
-                    if isinstance(tool_output, dict)
-                    else str(tool_output)
-                )
-                citations: dict[str, dict[str, str]] = {}
-                for m in re.finditer(
-                    r"<title><!\[CDATA\[(.*?)\]\]></title>\s*<url><!\[CDATA\[(.*?)\]\]></url>",
-                    xml,
-                ):
-                    title, url = m.group(1).strip(), m.group(2).strip()
-                    if url.startswith("http") and url not in citations:
-                        citations[url] = {"title": title}
-                for m in re.finditer(
-                    r"<chunk\s+id='([^']*)'><!\[CDATA\[([\s\S]*?)\]\]></chunk>",
-                    xml,
-                ):
-                    chunk_url, content = m.group(1).strip(), m.group(2).strip()
-                    if (
-                        chunk_url.startswith("http")
-                        and chunk_url in citations
-                        and content
-                    ):
-                        citations[chunk_url]["snippet"] = (
-                            content[:200] + "…" if len(content) > 200 else content
-                        )
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    {"status": "completed", "citations": citations},
-                )
-            else:
-                yield streaming_service.format_tool_output_available(
-                    tool_call_id,
-                    {"status": "completed", "result_length": len(str(tool_output))},
-                )
-                yield streaming_service.format_terminal_info(
-                    f"Tool {tool_name} completed", "success"
-                )
-
-        elif event_type == "on_custom_event" and event.get("name") == "report_progress":
-            # Live progress updates from inside the generate_report tool
-            data = event.get("data", {})
-            message = data.get("message", "")
-            if message and last_active_step_id:
-                phase = data.get("phase", "")
-                # Always keep the "Topic: ..." line
-                topic_items = [
-                    item for item in last_active_step_items if item.startswith("Topic:")
-                ]
-
-                if phase in ("revising_section", "adding_section"):
-                    # During section-level ops: keep plan summary + show current op
-                    plan_items = [
-                        item
-                        for item in last_active_step_items
-                        if item.startswith("Topic:")
-                        or item.startswith("Modifying ")
-                        or item.startswith("Adding ")
-                        or item.startswith("Removing ")
-                    ]
-                    # Only keep plan_items that don't end with "..." (not progress lines)
-                    plan_items = [
-                        item for item in plan_items if not item.endswith("...")
-                    ]
-                    last_active_step_items = [*plan_items, message]
-                else:
-                    # Phase transitions: replace everything after topic
-                    last_active_step_items = [*topic_items, message]
-
-                yield streaming_service.format_thinking_step(
-                    step_id=last_active_step_id,
-                    title=last_active_step_title,
-                    status="in_progress",
-                    items=last_active_step_items,
-                )
-
-        elif (
-            event_type == "on_custom_event" and event.get("name") == "document_created"
-        ):
-            data = event.get("data", {})
-            if data.get("id"):
-                yield streaming_service.format_data(
-                    "documents-updated",
-                    {
-                        "action": "created",
-                        "document": data,
-                    },
-                )
-
-        elif event_type in ("on_chain_end", "on_agent_end"):
-            if current_text_id is not None:
-                yield streaming_service.format_text_end(current_text_id)
-                current_text_id = None
-
-    if current_text_id is not None:
-        yield streaming_service.format_text_end(current_text_id)
-
-    completion_event = complete_current_step()
-    if completion_event:
-        yield completion_event
+            gate_text_id = streaming_service.generate_text_id()
+            yield streaming_service.format_text_start(gate_text_id)
+            if content_builder is not None:
+                content_builder.on_text_start(gate_text_id)
+            yield streaming_service.format_text_delta(gate_text_id, gate_notice)
+            if content_builder is not None:
+                content_builder.on_text_delta(gate_text_id, gate_notice)
+            yield streaming_service.format_text_end(gate_text_id)
+            if content_builder is not None:
+                content_builder.on_text_end(gate_text_id)
+            yield streaming_service.format_terminal_info(gate_notice, "error")
+            accumulated_text = gate_notice
+    else:
+        result.commit_gate_passed = True
+        result.commit_gate_reason = ""
 
     result.accumulated_text = accumulated_text
+    _log_file_contract("turn_outcome", result)
 
-    state = await agent.aget_state(config)
-    is_interrupted = state.tasks and any(task.interrupts for task in state.tasks)
-    if is_interrupted:
+    pending_values = all_interrupt_values(state)
+    if pending_values:
         result.is_interrupted = True
-        result.interrupt_value = state.tasks[0].interrupts[0].value
-        yield streaming_service.format_interrupt_request(result.interrupt_value)
+        # One frame per paused subagent so each parallel HITL renders its own
+        # approval card on the wire. Order matches ``state.interrupts``, which
+        # the resume slicer in ``checkpointed_subagent_middleware.resume_routing``
+        # consumes in the same order — keeping emit and resume in lock-step.
+        for interrupt_value in pending_values:
+            yield streaming_service.format_interrupt_request(interrupt_value)
 
 
 async def stream_new_chat(
@@ -1128,11 +914,17 @@ async def stream_new_chat(
     llm_config_id: int = -1,
     mentioned_document_ids: list[int] | None = None,
     mentioned_surfsense_doc_ids: list[int] | None = None,
+    mentioned_folder_ids: list[int] | None = None,
+    mentioned_documents: list[dict[str, Any]] | None = None,
     checkpoint_id: str | None = None,
     needs_history_bootstrap: bool = False,
     thread_visibility: ChatVisibility | None = None,
     current_user_display_name: str | None = None,
     disabled_tools: list[str] | None = None,
+    filesystem_selection: FilesystemSelection | None = None,
+    request_id: str | None = None,
+    user_image_data_urls: list[str] | None = None,
+    flow: Literal["new", "regenerate"] = "new",
 ) -> AsyncGenerator[str, None]:
     """
     Stream chat responses from the new SurfSense deep agent.
@@ -1152,6 +944,7 @@ async def stream_new_chat(
         needs_history_bootstrap: If True, load message history from DB (for cloned chats)
         mentioned_document_ids: Optional list of document IDs mentioned with @ in the chat
         mentioned_surfsense_doc_ids: Optional list of SurfSense doc IDs mentioned with @ in the chat
+        mentioned_folder_ids: Optional list of knowledge-base folder IDs mentioned with @ (cloud mode)
         checkpoint_id: Optional checkpoint ID to rewind/fork from (for edit/reload operations)
 
     Yields:
@@ -1160,8 +953,46 @@ async def stream_new_chat(
     streaming_service = VercelStreamingService()
     stream_result = StreamResult()
     _t_total = time.perf_counter()
+    fs_mode = filesystem_selection.mode.value if filesystem_selection else "cloud"
+    fs_platform = (
+        filesystem_selection.client_platform.value if filesystem_selection else "web"
+    )
+    stream_result.request_id = request_id
+    stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
+    stream_result.filesystem_mode = fs_mode
+    stream_result.client_platform = fs_platform
+    _log_file_contract("turn_start", stream_result)
+    _perf_log.info(
+        "[stream_new_chat] filesystem_mode=%s client_platform=%s",
+        fs_mode,
+        fs_platform,
+    )
     log_system_snapshot("stream_new_chat_START")
 
+    from app.services.token_tracking_service import start_turn
+
+    accumulator = start_turn()
+
+    # Premium credit (USD micro-units) tracking state. Stores the
+    # amount reserved up front so we can release it on cancellation
+    # and finalize-debit the actual provider cost reported by LiteLLM.
+    _premium_reserved_micros = 0
+    _premium_request_id: str | None = None
+
+    # ``BusyError`` fires before the lock is acquired; the ``finally`` must
+    # not release the in-flight caller's lock.
+    _busy_error_raised = False
+
+    _emit_stream_error = partial(
+        _emit_stream_terminal_error,
+        streaming_service=streaming_service,
+        flow=flow,
+        request_id=request_id,
+        thread_id=chat_id,
+        search_space_id=search_space_id,
+        user_id=user_id,
+    )
+
     session = async_session_maker()
     try:
         # Mark AI as responding to this user for live collaboration
@@ -1169,49 +1000,280 @@ async def stream_new_chat(
             await set_ai_responding(session, chat_id, UUID(user_id))
         # Load LLM config - supports both YAML (negative IDs) and database (positive IDs)
         agent_config: AgentConfig | None = None
+        requested_llm_config_id = llm_config_id
+
+        async def _load_llm_bundle(
+            config_id: int,
+        ) -> tuple[Any, AgentConfig | None, str | None]:
+            if config_id >= 0:
+                loaded_agent_config = await load_agent_config(
+                    session=session,
+                    config_id=config_id,
+                    search_space_id=search_space_id,
+                )
+                if not loaded_agent_config:
+                    return (
+                        None,
+                        None,
+                        f"Failed to load NewLLMConfig with id {config_id}",
+                    )
+                return (
+                    create_chat_litellm_from_agent_config(loaded_agent_config),
+                    loaded_agent_config,
+                    None,
+                )
+
+            loaded_llm_config = load_global_llm_config_by_id(config_id)
+            if not loaded_llm_config:
+                return None, None, f"Failed to load LLM config with id {config_id}"
+            return (
+                create_chat_litellm_from_config(loaded_llm_config),
+                AgentConfig.from_yaml_config(loaded_llm_config),
+                None,
+            )
 
         _t0 = time.perf_counter()
-        if llm_config_id >= 0:
-            # Positive ID: Load from NewLLMConfig database table
-            agent_config = await load_agent_config(
-                session=session,
-                config_id=llm_config_id,
-                search_space_id=search_space_id,
+        # Image-bearing turns force the Auto-pin resolver to filter the
+        # candidate pool to vision-capable cfgs (and force-repin a
+        # text-only existing pin). For explicit selections this flag is
+        # a no-op — the resolver returns the user's chosen id unchanged.
+        _requires_image_input = bool(user_image_data_urls)
+        try:
+            llm_config_id = (
+                await resolve_or_get_pinned_llm_config_id(
+                    session,
+                    thread_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    selected_llm_config_id=llm_config_id,
+                    requires_image_input=_requires_image_input,
+                )
+            ).resolved_llm_config_id
+        except ValueError as pin_error:
+            # Auto-pin's "no vision-capable cfg" path raises a ValueError
+            # whose message we map to the friendly image-input SSE error
+            # so the user sees the same message regardless of whether
+            # the gate fired in Auto-mode or in the agent_config check
+            # below.
+            error_code = (
+                "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT"
+                if _requires_image_input and "vision-capable" in str(pin_error)
+                else "SERVER_ERROR"
             )
-            if not agent_config:
-                yield streaming_service.format_error(
-                    f"Failed to load NewLLMConfig with id {llm_config_id}"
-                )
-                yield streaming_service.format_done()
-                return
-
-            # Create ChatLiteLLM from AgentConfig
-            llm = create_chat_litellm_from_agent_config(agent_config)
-        else:
-            # Negative ID: Load from YAML (global configs)
-            llm_config = load_llm_config_from_yaml(llm_config_id=llm_config_id)
-            if not llm_config:
-                yield streaming_service.format_error(
-                    f"Failed to load LLM config with id {llm_config_id}"
-                )
-                yield streaming_service.format_done()
-                return
+            error_kind = (
+                "user_error"
+                if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT"
+                else "server_error"
+            )
+            yield _emit_stream_error(
+                message=str(pin_error),
+                error_kind=error_kind,
+                error_code=error_code,
+            )
+            yield streaming_service.format_done()
+            return
 
-            # Create ChatLiteLLM from YAML config dict
-            llm = create_chat_litellm_from_config(llm_config)
-            # Create AgentConfig from YAML for consistency (uses defaults for prompt settings)
-            agent_config = AgentConfig.from_yaml_config(llm_config)
+        llm, agent_config, llm_load_error = await _load_llm_bundle(llm_config_id)
+        if llm_load_error:
+            yield _emit_stream_error(
+                message=llm_load_error,
+                error_kind="server_error",
+                error_code="SERVER_ERROR",
+            )
+            yield streaming_service.format_done()
+            return
         _perf_log.info(
             "[stream_new_chat] LLM config loaded in %.3fs (config_id=%s)",
             time.perf_counter() - _t0,
             llm_config_id,
         )
 
+        # Capability safety net: a turn carrying user-uploaded images
+        # cannot be routed to a chat config that LiteLLM's authoritative
+        # model map *explicitly* marks as text-only (``supports_vision``
+        # set to False). The check is intentionally narrow — it only
+        # fires when LiteLLM is *certain* the model can't accept image
+        # input. Unknown / unmapped / vision-capable models pass
+        # through. Without this guard a known-text-only model would 404
+        # at the provider with ``"No endpoints found that support image
+        # input"``, surfacing as an opaque ``SERVER_ERROR`` SSE chunk;
+        # failing here lets us return a friendly message that tells the
+        # user what to change.
+        if user_image_data_urls and agent_config is not None:
+            from app.services.provider_capabilities import (
+                is_known_text_only_chat_model,
+            )
+
+            agent_litellm_params = agent_config.litellm_params or {}
+            agent_base_model = (
+                agent_litellm_params.get("base_model")
+                if isinstance(agent_litellm_params, dict)
+                else None
+            )
+            if is_known_text_only_chat_model(
+                provider=agent_config.provider,
+                model_name=agent_config.model_name,
+                base_model=agent_base_model,
+                custom_provider=agent_config.custom_provider,
+            ):
+                model_label = (
+                    agent_config.config_name or agent_config.model_name or "model"
+                )
+                yield _emit_stream_error(
+                    message=(
+                        f"The selected model ({model_label}) does not support "
+                        "image input. Switch to a vision-capable model "
+                        "(e.g. GPT-4o, Claude, Gemini) or remove the image "
+                        "attachment and try again."
+                    ),
+                    error_kind="user_error",
+                    error_code="MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT",
+                )
+                yield streaming_service.format_done()
+                return
+
+        # Premium quota reservation for pinned premium model only.
+        _needs_premium_quota = (
+            agent_config is not None and user_id and agent_config.is_premium
+        )
+        if _needs_premium_quota:
+            import uuid as _uuid
+
+            from app.services.token_quota_service import (
+                TokenQuotaService,
+                estimate_call_reserve_micros,
+            )
+
+            _premium_request_id = _uuid.uuid4().hex[:16]
+            _agent_litellm_params = agent_config.litellm_params or {}
+            _agent_base_model = (
+                _agent_litellm_params.get("base_model") or agent_config.model_name or ""
+            )
+            reserve_amount_micros = estimate_call_reserve_micros(
+                base_model=_agent_base_model,
+                quota_reserve_tokens=agent_config.quota_reserve_tokens,
+            )
+            async with shielded_async_session() as quota_session:
+                quota_result = await TokenQuotaService.premium_reserve(
+                    db_session=quota_session,
+                    user_id=UUID(user_id),
+                    request_id=_premium_request_id,
+                    reserve_micros=reserve_amount_micros,
+                )
+            _premium_reserved_micros = reserve_amount_micros
+            if not quota_result.allowed:
+                if requested_llm_config_id == 0:
+                    try:
+                        llm_config_id = (
+                            await resolve_or_get_pinned_llm_config_id(
+                                session,
+                                thread_id=chat_id,
+                                search_space_id=search_space_id,
+                                user_id=user_id,
+                                selected_llm_config_id=0,
+                                force_repin_free=True,
+                                requires_image_input=_requires_image_input,
+                            )
+                        ).resolved_llm_config_id
+                    except ValueError as pin_error:
+                        yield _emit_stream_error(
+                            message=str(pin_error),
+                            error_kind="server_error",
+                            error_code="SERVER_ERROR",
+                        )
+                        yield streaming_service.format_done()
+                        return
+
+                    llm, agent_config, llm_load_error = await _load_llm_bundle(
+                        llm_config_id
+                    )
+                    if llm_load_error:
+                        yield _emit_stream_error(
+                            message=llm_load_error,
+                            error_kind="server_error",
+                            error_code="SERVER_ERROR",
+                        )
+                        yield streaming_service.format_done()
+                        return
+                    _premium_request_id = None
+                    _premium_reserved_micros = 0
+                    _log_chat_stream_error(
+                        flow=flow,
+                        error_kind="premium_quota_exhausted",
+                        error_code="PREMIUM_QUOTA_EXHAUSTED",
+                        severity="info",
+                        is_expected=True,
+                        request_id=request_id,
+                        thread_id=chat_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                        message=(
+                            "Premium quota exhausted on pinned model; auto-fallback switched to a free model"
+                        ),
+                        extra={
+                            "fallback_config_id": llm_config_id,
+                            "auto_fallback": True,
+                        },
+                    )
+                else:
+                    yield _emit_stream_error(
+                        message=(
+                            "Buy more tokens to continue with this model, or switch to a free model"
+                        ),
+                        error_kind="premium_quota_exhausted",
+                        error_code="PREMIUM_QUOTA_EXHAUSTED",
+                        severity="info",
+                        is_expected=True,
+                        extra={
+                            "resolved_config_id": llm_config_id,
+                            "auto_fallback": False,
+                        },
+                    )
+                    yield streaming_service.format_done()
+                    return
+
         if not llm:
-            yield streaming_service.format_error("Failed to create LLM instance")
+            yield _emit_stream_error(
+                message="Failed to create LLM instance",
+                error_kind="server_error",
+                error_code="SERVER_ERROR",
+            )
             yield streaming_service.format_done()
             return
 
+        # Auto-mode preflight ping. Runs ONLY for thread-pinned auto cfgs
+        # (negative ids selected via ``resolve_or_get_pinned_llm_config_id``)
+        # whose health hasn't already been confirmed within the TTL window.
+        # Detecting a 429 here lets us repin BEFORE the planner/classifier/
+        # title-generation LLM calls fan out and each independently hit the
+        # same upstream rate limit.
+        #
+        # PERF: preflight is a network round-trip to the LLM provider (~1-5s)
+        # and is independent of the agent build (CPU-bound, ~5-7s). They used
+        # to run sequentially → ``preflight + build`` on cold cache = 11.5s.
+        # We now kick off preflight as a background task FIRST, then run the
+        # synchronous setup work and the agent build in parallel. In the
+        # success path (the common case) total wall time drops to roughly
+        # ``max(preflight, build)`` — the preflight finishes during the
+        # agent compile and we just consume its result. In the rare 429
+        # path the speculative build is awaited to completion (so its
+        # session usage is fully released) via
+        # :func:`_settle_speculative_agent_build`, then discarded, and
+        # we fall back to the original repin-and-rebuild flow.
+        preflight_needed = (
+            requested_llm_config_id == 0
+            and llm_config_id < 0
+            and not is_recently_healthy(llm_config_id)
+        )
+        preflight_task: asyncio.Task[None] | None = None
+        _t_preflight = 0.0
+        if preflight_needed:
+            _t_preflight = time.perf_counter()
+            preflight_task = asyncio.create_task(
+                _preflight_llm(llm),
+                name=f"auto_pin_preflight:{llm_config_id}",
+            )
+
         # Create connector service
         _t0 = time.perf_counter()
         connector_service = ConnectorService(session, search_space_id=search_space_id)
@@ -1234,22 +1296,147 @@ async def stream_new_chat(
             "[stream_new_chat] Checkpointer ready in %.3fs", time.perf_counter() - _t0
         )
 
-        visibility = thread_visibility or ChatVisibility.PRIVATE
-        _t0 = time.perf_counter()
-        agent = await create_surfsense_deep_agent(
-            llm=llm,
-            search_space_id=search_space_id,
-            db_session=session,
-            connector_service=connector_service,
-            checkpointer=checkpointer,
-            user_id=user_id,
-            thread_id=chat_id,
-            agent_config=agent_config,
-            firecrawl_api_key=firecrawl_api_key,
-            thread_visibility=visibility,
-            disabled_tools=disabled_tools,
-            mentioned_document_ids=mentioned_document_ids,
-        )
+        visibility = thread_visibility or ChatVisibility.PRIVATE
+        from app.config import config as _app_config
+
+        use_multi_agent = bool(_app_config.MULTI_AGENT_CHAT_ENABLED)
+
+        _t0 = time.perf_counter()
+        agent_factory = (
+            create_multi_agent_chat_deep_agent
+            if use_multi_agent
+            else create_surfsense_deep_agent
+        )
+        # Speculative agent build — runs in parallel with the preflight
+        # task (if any). Built with the *current* ``llm`` / ``agent_config``;
+        # if preflight reports 429 we will discard this future and rebuild
+        # against the freshly pinned config below.
+        agent_build_task = asyncio.create_task(
+            _build_main_agent_for_thread(
+                agent_factory,
+                llm=llm,
+                search_space_id=search_space_id,
+                db_session=session,
+                connector_service=connector_service,
+                checkpointer=checkpointer,
+                user_id=user_id,
+                thread_id=chat_id,
+                agent_config=agent_config,
+                firecrawl_api_key=firecrawl_api_key,
+                thread_visibility=visibility,
+                filesystem_selection=filesystem_selection,
+                disabled_tools=disabled_tools,
+                mentioned_document_ids=mentioned_document_ids,
+            ),
+            name="agent_build:stream_new_chat",
+        )
+
+        agent: Any = None
+        if preflight_task is not None:
+            try:
+                await preflight_task
+                mark_healthy(llm_config_id)
+                _perf_log.info(
+                    "[stream_new_chat] auto_pin_preflight ok config_id=%s took=%.3fs (parallel)",
+                    llm_config_id,
+                    time.perf_counter() - _t_preflight,
+                )
+            except Exception as preflight_exc:
+                # Both branches below need the session: the non-429 path
+                # may unwind via cleanup that uses ``session``, and the
+                # 429 path explicitly calls ``resolve_or_get_pinned_llm_config_id``
+                # against it. Wait for the speculative build to release its
+                # session usage before we proceed.
+                await _settle_speculative_agent_build(agent_build_task)
+                if not _is_provider_rate_limited(preflight_exc):
+                    raise
+                # 429: speculative agent is discarded; run the original
+                # repin → reload → rebuild path against the freshly
+                # pinned config.
+                previous_config_id = llm_config_id
+                mark_runtime_cooldown(
+                    previous_config_id, reason="preflight_rate_limited"
+                )
+                try:
+                    llm_config_id = (
+                        await resolve_or_get_pinned_llm_config_id(
+                            session,
+                            thread_id=chat_id,
+                            search_space_id=search_space_id,
+                            user_id=user_id,
+                            selected_llm_config_id=0,
+                            exclude_config_ids={previous_config_id},
+                            requires_image_input=_requires_image_input,
+                        )
+                    ).resolved_llm_config_id
+                except ValueError as pin_error:
+                    yield _emit_stream_error(
+                        message=str(pin_error),
+                        error_kind="server_error",
+                        error_code="SERVER_ERROR",
+                    )
+                    yield streaming_service.format_done()
+                    return
+
+                llm, agent_config, llm_load_error = await _load_llm_bundle(
+                    llm_config_id
+                )
+                if llm_load_error or not llm:
+                    yield _emit_stream_error(
+                        message=llm_load_error or "Failed to create LLM instance",
+                        error_kind="server_error",
+                        error_code="SERVER_ERROR",
+                    )
+                    yield streaming_service.format_done()
+                    return
+                # Trust the freshly-resolved cfg for the remainder of this
+                # turn rather than recursing into another preflight; the
+                # in-stream 429 recovery loop is still in place as the
+                # safety net if even this fallback hits an upstream cap.
+                mark_healthy(llm_config_id)
+                _log_chat_stream_error(
+                    flow=flow,
+                    error_kind="rate_limited",
+                    error_code="RATE_LIMITED",
+                    severity="info",
+                    is_expected=True,
+                    request_id=request_id,
+                    thread_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    message=(
+                        "Auto-pinned model failed preflight; switched to another "
+                        "eligible model and continuing."
+                    ),
+                    extra={
+                        "auto_runtime_recover": True,
+                        "preflight": True,
+                        "previous_config_id": previous_config_id,
+                        "fallback_config_id": llm_config_id,
+                    },
+                )
+                # Rebuild against the new llm/agent_config. Sequential
+                # here because we no longer have anything to overlap with.
+                agent = await agent_factory(
+                    llm=llm,
+                    search_space_id=search_space_id,
+                    db_session=session,
+                    connector_service=connector_service,
+                    checkpointer=checkpointer,
+                    user_id=user_id,
+                    thread_id=chat_id,
+                    agent_config=agent_config,
+                    firecrawl_api_key=firecrawl_api_key,
+                    thread_visibility=visibility,
+                    disabled_tools=disabled_tools,
+                    mentioned_document_ids=mentioned_document_ids,
+                    filesystem_selection=filesystem_selection,
+                )
+
+        if agent is None:
+            # Either no preflight was needed, or preflight succeeded —
+            # in both cases the speculative build is the agent we want.
+            agent = await agent_build_task
         _perf_log.info(
             "[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0
         )
@@ -1302,8 +1489,63 @@ async def stream_new_chat(
         )
         recent_reports = list(recent_reports_result.scalars().all())
 
-        # Format the user query with context (SurfSense docs + reports only)
-        final_query = user_query
+        # Resolve @-mention chips to canonical virtual paths and rewrite
+        # the user-typed text so the LLM sees ``\`/documents/...\``` instead
+        # of bare ``@title``. The substitution lands in ``agent_user_query``
+        # ONLY — the original ``user_query`` (with ``@title`` tokens) flows
+        # untouched into ``persist_user_turn`` below so chip rendering on
+        # reload still works (``UserTextPart`` → ``parseMentionSegments``
+        # matches ``@title``, not ``\`/documents/...\```). It also feeds
+        # the human-readable surfaces — SSE "Processing X" status, auto
+        # thread title, memory seed — which all want what the user typed.
+        # See ``persistence._build_user_content``.
+        #
+        # Cloud mode only: local-folder mode keeps the legacy
+        # ``@title`` text path; mention support there is a follow-up
+        # task because the path scheme (mount-rooted) and the picker
+        # UI both need separate work.
+        agent_user_query = user_query
+        accepted_folder_ids: list[int] = []
+        if fs_mode == FilesystemMode.CLOUD.value and (
+            mentioned_document_ids
+            or mentioned_surfsense_doc_ids
+            or mentioned_folder_ids
+            or mentioned_documents
+        ):
+            from app.schemas.new_chat import (
+                MentionedDocumentInfo as _MentionedDocumentInfo,
+            )
+
+            chip_objs: list[_MentionedDocumentInfo] | None = None
+            if mentioned_documents:
+                chip_objs = []
+                for raw in mentioned_documents:
+                    if isinstance(raw, _MentionedDocumentInfo):
+                        chip_objs.append(raw)
+                        continue
+                    try:
+                        chip_objs.append(_MentionedDocumentInfo.model_validate(raw))
+                    except Exception:
+                        logger.debug(
+                            "stream_new_chat: dropping malformed mention chip %r",
+                            raw,
+                        )
+
+            resolved = await resolve_mentions(
+                session,
+                search_space_id=search_space_id,
+                mentioned_documents=chip_objs,
+                mentioned_document_ids=mentioned_document_ids,
+                mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
+                mentioned_folder_ids=mentioned_folder_ids,
+            )
+            agent_user_query = substitute_in_text(user_query, resolved.token_to_path)
+            accepted_folder_ids = resolved.mentioned_folder_ids
+
+        # Format the user query with context (SurfSense docs + reports only).
+        # Uses ``agent_user_query`` so the LLM sees backtick-wrapped paths
+        # instead of bare ``@title`` tokens.
+        final_query = agent_user_query
         context_parts = []
 
         if mentioned_surfsense_docs:
@@ -1334,7 +1576,7 @@ async def stream_new_chat(
 
         if context_parts:
             context = "\n\n".join(context_parts)
-            final_query = f"{context}\n\n<user_query>{user_query}</user_query>"
+            final_query = f"{context}\n\n<user_query>{agent_user_query}</user_query>"
 
         if visibility == ChatVisibility.SEARCH_SPACE and current_user_display_name:
             final_query = f"**[{current_user_display_name}]:** {final_query}"
@@ -1347,14 +1589,18 @@ async def stream_new_chat(
         #         elif msg.role == "assistant":
         #             langchain_messages.append(AIMessage(content=msg.content))
         # else:
-        # Fallback: just use the current user query with attachment context
-        langchain_messages.append(HumanMessage(content=final_query))
+        human_content = build_human_message_content(
+            final_query, list(user_image_data_urls or ())
+        )
+        langchain_messages.append(HumanMessage(content=human_content))
 
         input_state = {
             # Lets not pass this message atm because we are using the checkpointer to manage the conversation history
             # We will use this to simulate group chat functionality in the future
             "messages": langchain_messages,
             "search_space_id": search_space_id,
+            "request_id": request_id or "unknown",
+            "turn_id": stream_result.turn_id,
         }
 
         _perf_log.info(
@@ -1384,18 +1630,131 @@ async def stream_new_chat(
         # Configure LangGraph with thread_id for memory
         # If checkpoint_id is provided, fork from that checkpoint (for edit/reload)
         configurable = {"thread_id": str(chat_id)}
+        configurable["request_id"] = request_id or "unknown"
+        configurable["turn_id"] = stream_result.turn_id
         if checkpoint_id:
             configurable["checkpoint_id"] = checkpoint_id
 
         config = {
             "configurable": configurable,
-            "recursion_limit": 80,  # Increase from default 25 to allow more tool iterations
+            # Effectively uncapped, matching the agent-level
+            # ``with_config`` default in ``chat_deepagent.create_agent``
+            # and the unbounded ``while(true)`` loop used by OpenCode's
+            # ``session/processor.ts``. Real circuit-breakers live in
+            # middleware: ``DoomLoopMiddleware`` (sliding-window tool
+            # signature check), plus ``enable_tool_call_limit`` /
+            # ``enable_model_call_limit`` when those flags are set. The
+            # original LangGraph default of 25 (and our previous 80
+            # bump) hit users on legitimate multi-tool plans.
+            "recursion_limit": 10_000,
         }
 
         # Start the message stream
         yield streaming_service.format_message_start()
         yield streaming_service.format_start_step()
 
+        # Surface the per-turn correlation id at the very start of the
+        # stream so the frontend can stamp it onto the in-flight
+        # assistant message and replay it via ``appendMessage``
+        # for durable storage. Tool/action-log events DO carry it later,
+        # but pure-text turns never produce action-log events; this
+        # event guarantees the frontend learns the turn id regardless.
+        yield streaming_service.format_data(
+            "turn-info",
+            {"chat_turn_id": stream_result.turn_id},
+        )
+        yield streaming_service.format_data("turn-status", {"status": "busy"})
+
+        # Persist the user-side row for this turn before any expensive
+        # work runs. Closes the "ghost-thread" abuse vector
+        # (authenticated client hits POST /new_chat then never calls
+        # /messages — empty new_chat_messages, free LLM completion).
+        # Idempotent against the unique index in migration 141 so the
+        # legacy frontend appendMessage call is a no-op on the second
+        # writer. Hard failure aborts the turn so we never produce a
+        # title or assistant row that isn't anchored to a persisted
+        # user message.
+        from app.tasks.chat.content_builder import AssistantContentBuilder
+        from app.tasks.chat.persistence import (
+            persist_assistant_shell,
+            persist_user_turn,
+        )
+
+        user_message_id = await persist_user_turn(
+            chat_id=chat_id,
+            user_id=user_id,
+            turn_id=stream_result.turn_id,
+            user_query=user_query,
+            user_image_data_urls=user_image_data_urls,
+            mentioned_documents=mentioned_documents,
+        )
+        if user_message_id is None:
+            yield _emit_stream_error(
+                message=(
+                    "We couldn't save your message. Please try again in a moment."
+                ),
+                error_kind="server_error",
+                error_code="MESSAGE_PERSIST_FAILED",
+            )
+            yield streaming_service.format_data("turn-status", {"status": "idle"})
+            yield streaming_service.format_finish_step()
+            yield streaming_service.format_finish()
+            yield streaming_service.format_done()
+            return
+
+        # Emit canonical user message id BEFORE any LLM streaming so the
+        # FE can rename its optimistic ``msg-user-XXX`` placeholder to
+        # ``msg-{user_message_id}`` and unlock features gated on a real
+        # DB id (comments, edit-from-this-message). See B4 in
+        # ``sse-based_message_id_handshake`` plan.
+        yield streaming_service.format_data(
+            "user-message-id",
+            {"message_id": user_message_id, "turn_id": stream_result.turn_id},
+        )
+
+        # Pre-write the assistant row for this turn so we have a stable
+        # ``message_id`` to anchor mid-stream metadata (token_usage,
+        # future agent_action_log.message_id correlation) and a
+        # write-once UPDATE target at finalize time. Idempotent against
+        # the (thread_id, turn_id, ASSISTANT) partial unique index from
+        # migration 141 — if the legacy frontend appendMessage races
+        # this, we recover the existing row's id.
+        assistant_message_id = await persist_assistant_shell(
+            chat_id=chat_id,
+            user_id=user_id,
+            turn_id=stream_result.turn_id,
+        )
+        if assistant_message_id is None:
+            # Genuine DB failure — abort the turn rather than stream
+            # into a void. The user row is already persisted so the
+            # legacy "ghost-thread" gate isn't reopened.
+            yield _emit_stream_error(
+                message=(
+                    "We couldn't initialize the assistant message. Please try again."
+                ),
+                error_kind="server_error",
+                error_code="MESSAGE_PERSIST_FAILED",
+            )
+            yield streaming_service.format_data("turn-status", {"status": "idle"})
+            yield streaming_service.format_finish_step()
+            yield streaming_service.format_finish()
+            yield streaming_service.format_done()
+            return
+
+        # Emit canonical assistant message id BEFORE any LLM streaming
+        # so the FE can rename its optimistic ``msg-assistant-XXX``
+        # placeholder to ``msg-{assistant_message_id}`` and bind
+        # ``tokenUsageStore`` / ``pendingInterrupt`` to the real id
+        # immediately. See B4 in ``sse-based_message_id_handshake``
+        # plan.
+        yield streaming_service.format_data(
+            "assistant-message-id",
+            {"message_id": assistant_message_id, "turn_id": stream_result.turn_id},
+        )
+
+        stream_result.assistant_message_id = assistant_message_id
+        stream_result.content_builder = AssistantContentBuilder()
+
         # Initial thinking step - analyzing the request
         if mentioned_surfsense_docs:
             initial_title = "Analyzing referenced content"
@@ -1405,8 +1764,13 @@ async def stream_new_chat(
             action_verb = "Processing"
 
         processing_parts = []
-        query_text = user_query[:80] + ("..." if len(user_query) > 80 else "")
-        processing_parts.append(query_text)
+        if user_query.strip():
+            query_text = user_query[:80] + ("..." if len(user_query) > 80 else "")
+            processing_parts.append(query_text)
+        elif user_image_data_urls:
+            processing_parts.append(f"[{len(user_image_data_urls)} image(s)]")
+        else:
+            processing_parts.append("(message)")
 
         if mentioned_surfsense_docs:
             doc_names = []
@@ -1423,6 +1787,15 @@ async def stream_new_chat(
         initial_items = [f"{action_verb}: {' '.join(processing_parts)}"]
         initial_step_id = "thinking-1"
 
+        # Drive the builder for this initial thinking step too — the
+        # ``_emit_thinking_step`` helper lives inside ``_stream_agent_events``
+        # so it isn't in scope here, but the FE folds this step into
+        # the same singleton ``data-thinking-steps`` part as everything
+        # the agent stream emits later. Mirror that fold server-side.
+        if stream_result.content_builder is not None:
+            stream_result.content_builder.on_thinking_step(
+                initial_step_id, initial_title, "in_progress", initial_items
+            )
         yield streaming_service.format_thinking_step(
             step_id=initial_step_id,
             title=initial_title,
@@ -1439,75 +1812,299 @@ async def stream_new_chat(
         # Check if this is the first assistant response so we can generate
         # a title in parallel with the agent stream (better UX than waiting
         # until after the full response).
-        assistant_count_result = await session.execute(
-            select(func.count(NewChatMessage.id)).filter(
+        # Use a LIMIT 1 EXISTS-style probe rather than COUNT(*) because
+        # this is now a hot path executed on every turn, and COUNT scales
+        # with thread length (server-side persistence can grow rows
+        # quickly under power users).
+        #
+        # IMPORTANT: ``persist_assistant_shell`` above (line ~3112) already
+        # inserted THIS turn's assistant row. We must therefore exclude
+        # it from the probe — otherwise the gate fires on every turn
+        # except the very first, and title generation never runs for new
+        # threads. Excluding by primary key (``id != assistant_message_id``)
+        # is bulletproof regardless of ``turn_id`` shape (legacy NULLs,
+        # resume turns, etc.).
+        first_assistant_probe = await session.execute(
+            select(NewChatMessage.id)
+            .filter(
                 NewChatMessage.thread_id == chat_id,
                 NewChatMessage.role == "assistant",
+                NewChatMessage.id != assistant_message_id,
             )
+            .limit(1)
         )
-        is_first_response = (assistant_count_result.scalar() or 0) == 0
+        is_first_response = first_assistant_probe.scalars().first() is None
+
+        title_task: asyncio.Task[tuple[str | None, dict | None]] | None = None
+        # Gate title generation on a persisted user message so a stream
+        # that fails before persistence (we abort above) can never leave
+        # behind a thread with a generated title and no anchoring rows.
+        if is_first_response and user_message_id is not None:
+
+            async def _generate_title() -> tuple[str | None, dict | None]:
+                """Generate a short title via litellm.acompletion.
+
+                Returns (title, usage_dict).  Usage is extracted directly from
+                the response object because litellm fires its async callback
+                via fire-and-forget ``create_task``, so the
+                ``TokenTrackingCallback`` would run too late.  We also blank
+                the accumulator in this child-task context so the late callback
+                doesn't double-count.
+                """
+                try:
+                    from litellm import acompletion
 
-        title_task: asyncio.Task[str | None] | None = None
-        if is_first_response:
+                    from app.services.llm_router_service import LLMRouterService
+                    from app.services.provider_api_base import resolve_api_base
+                    from app.services.token_tracking_service import _turn_accumulator
 
-            async def _generate_title() -> str | None:
-                try:
-                    title_chain = TITLE_GENERATION_PROMPT_TEMPLATE | llm
-                    title_result = await title_chain.ainvoke(
-                        {"user_query": user_query[:500]}
+                    _turn_accumulator.set(None)
+
+                    title_seed = user_query.strip() or (
+                        f"[{len(user_image_data_urls or [])} image(s)]"
+                        if user_image_data_urls
+                        else ""
+                    )
+                    prompt = TITLE_GENERATION_PROMPT.replace(
+                        "{user_query}", title_seed[:500] or "(message)"
                     )
-                    if title_result and hasattr(title_result, "content"):
-                        raw_title = title_result.content.strip()
-                        if raw_title and len(raw_title) <= 100:
-                            return raw_title.strip("\"'")
+                    messages = [{"role": "user", "content": prompt}]
+
+                    if getattr(llm, "model", None) == "auto":
+                        router = LLMRouterService.get_router()
+                        response = await router.acompletion(
+                            model="auto", messages=messages
+                        )
+                    else:
+                        # Apply the same ``api_base`` cascade chat / vision /
+                        # image-gen call sites use so we never inherit
+                        # ``litellm.api_base`` (commonly set by
+                        # ``AZURE_OPENAI_ENDPOINT``) when the chat config
+                        # itself ships an empty ``api_base``. Without this
+                        # the title-gen on an OpenRouter chat config would
+                        # 404 against the inherited Azure endpoint — see
+                        # ``provider_api_base`` docstring for the same
+                        # bug repro on the image-gen / vision paths.
+                        raw_model = getattr(llm, "model", "") or ""
+                        provider_prefix = (
+                            raw_model.split("/", 1)[0] if "/" in raw_model else None
+                        )
+                        provider_value = (
+                            agent_config.provider if agent_config is not None else None
+                        )
+                        title_api_base = resolve_api_base(
+                            provider=provider_value,
+                            provider_prefix=provider_prefix,
+                            config_api_base=getattr(llm, "api_base", None),
+                        )
+                        response = await acompletion(
+                            model=raw_model,
+                            messages=messages,
+                            api_key=getattr(llm, "api_key", None),
+                            api_base=title_api_base,
+                        )
+
+                    usage_info = None
+                    usage = getattr(response, "usage", None)
+                    if usage:
+                        raw_model = getattr(llm, "model", "") or ""
+                        model_name = (
+                            raw_model.split("/", 1)[-1]
+                            if "/" in raw_model
+                            else (raw_model or response.model or "unknown")
+                        )
+                        usage_info = {
+                            "model": model_name,
+                            "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
+                            "completion_tokens": getattr(usage, "completion_tokens", 0)
+                            or 0,
+                            "total_tokens": getattr(usage, "total_tokens", 0) or 0,
+                        }
+
+                    raw_title = response.choices[0].message.content.strip()
+                    if raw_title and len(raw_title) <= 100:
+                        return raw_title.strip("\"'"), usage_info
+                    return None, usage_info
                 except Exception:
-                    pass
-                return None
+                    logging.getLogger(__name__).exception(
+                        "[TitleGen] _generate_title failed"
+                    )
+                    return None, None
 
             title_task = asyncio.create_task(_generate_title())
 
         title_emitted = False
 
+        # Build the per-invocation runtime context (Phase 1.5).
+        # ``mentioned_document_ids`` is read by ``KnowledgePriorityMiddleware``
+        # via ``runtime.context.mentioned_document_ids`` instead of its
+        # ``__init__`` closure — that way the same compiled-agent instance
+        # can serve multiple turns with different mention lists.
+        runtime_context = SurfSenseContextSchema(
+            search_space_id=search_space_id,
+            mentioned_document_ids=list(mentioned_document_ids or []),
+            mentioned_folder_ids=list(
+                accepted_folder_ids or mentioned_folder_ids or []
+            ),
+            request_id=request_id,
+            turn_id=stream_result.turn_id,
+        )
+
         _t_stream_start = time.perf_counter()
         _first_event_logged = False
-        async for sse in _stream_agent_events(
-            agent=agent,
-            config=config,
-            input_data=input_state,
-            streaming_service=streaming_service,
-            result=stream_result,
-            step_prefix="thinking",
-            initial_step_id=initial_step_id,
-            initial_step_title=initial_title,
-            initial_step_items=initial_items,
-        ):
-            if not _first_event_logged:
-                _perf_log.info(
-                    "[stream_new_chat] First agent event in %.3fs (time since stream start), "
-                    "%.3fs (total since request start) (chat_id=%s)",
-                    time.perf_counter() - _t_stream_start,
-                    time.perf_counter() - _t_total,
-                    chat_id,
-                )
-                _first_event_logged = True
-            yield sse
-
-            # Inject title update mid-stream as soon as the background task finishes
-            if title_task is not None and title_task.done() and not title_emitted:
-                generated_title = title_task.result()
-                if generated_title:
-                    async with shielded_async_session() as title_session:
-                        title_thread_result = await title_session.execute(
-                            select(NewChatThread).filter(NewChatThread.id == chat_id)
+        runtime_rate_limit_recovered = False
+        while True:
+            try:
+                async for sse in _stream_agent_events(
+                    agent=agent,
+                    config=config,
+                    input_data=input_state,
+                    streaming_service=streaming_service,
+                    result=stream_result,
+                    step_prefix="thinking",
+                    initial_step_id=initial_step_id,
+                    initial_step_title=initial_title,
+                    initial_step_items=initial_items,
+                    fallback_commit_search_space_id=search_space_id,
+                    fallback_commit_created_by_id=user_id,
+                    fallback_commit_filesystem_mode=(
+                        filesystem_selection.mode
+                        if filesystem_selection
+                        else FilesystemMode.CLOUD
+                    ),
+                    fallback_commit_thread_id=chat_id,
+                    runtime_context=runtime_context,
+                    content_builder=stream_result.content_builder,
+                ):
+                    if not _first_event_logged:
+                        _perf_log.info(
+                            "[stream_new_chat] First agent event in %.3fs (time since stream start), "
+                            "%.3fs (total since request start) (chat_id=%s)",
+                            time.perf_counter() - _t_stream_start,
+                            time.perf_counter() - _t_total,
+                            chat_id,
                         )
-                        title_thread = title_thread_result.scalars().first()
-                        if title_thread:
-                            title_thread.title = generated_title
-                            await title_session.commit()
-                    yield streaming_service.format_thread_title_update(
-                        chat_id, generated_title
+                        _first_event_logged = True
+                    yield sse
+
+                    # Inject title update mid-stream as soon as the background
+                    # task finishes.
+                    if (
+                        title_task is not None
+                        and title_task.done()
+                        and not title_emitted
+                    ):
+                        generated_title, title_usage = title_task.result()
+                        if title_usage:
+                            accumulator.add(**title_usage)
+                        if generated_title:
+                            async with shielded_async_session() as title_session:
+                                title_thread_result = await title_session.execute(
+                                    select(NewChatThread).filter(
+                                        NewChatThread.id == chat_id
+                                    )
+                                )
+                                title_thread = title_thread_result.scalars().first()
+                                if title_thread:
+                                    title_thread.title = generated_title
+                                    await title_session.commit()
+                            yield streaming_service.format_thread_title_update(
+                                chat_id, generated_title
+                            )
+                        title_emitted = True
+                break
+            except Exception as stream_exc:
+                can_runtime_recover = (
+                    not runtime_rate_limit_recovered
+                    and requested_llm_config_id == 0
+                    and llm_config_id < 0
+                    and not _first_event_logged
+                    and _is_provider_rate_limited(stream_exc)
+                )
+                if not can_runtime_recover:
+                    raise
+
+                runtime_rate_limit_recovered = True
+                previous_config_id = llm_config_id
+                # The failed attempt may still hold the per-thread busy mutex
+                # (middleware teardown can lag behind raised provider errors).
+                # Force release before we retry within the same request.
+                end_turn(str(chat_id))
+                mark_runtime_cooldown(
+                    previous_config_id,
+                    reason="provider_rate_limited",
+                )
+
+                llm_config_id = (
+                    await resolve_or_get_pinned_llm_config_id(
+                        session,
+                        thread_id=chat_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                        selected_llm_config_id=0,
+                        exclude_config_ids={previous_config_id},
+                        requires_image_input=_requires_image_input,
                     )
-                title_emitted = True
+                ).resolved_llm_config_id
+
+                llm, agent_config, llm_load_error = await _load_llm_bundle(
+                    llm_config_id
+                )
+                if llm_load_error:
+                    raise stream_exc
+
+                # Title generation uses the initial llm object. After a runtime
+                # repin we keep the stream focused on response recovery and skip
+                # title generation for this turn.
+                if title_task is not None and not title_task.done():
+                    title_task.cancel()
+                title_task = None
+
+                _t0 = time.perf_counter()
+                agent = await _build_main_agent_for_thread(
+                    agent_factory,
+                    llm=llm,
+                    search_space_id=search_space_id,
+                    db_session=session,
+                    connector_service=connector_service,
+                    checkpointer=checkpointer,
+                    user_id=user_id,
+                    thread_id=chat_id,
+                    agent_config=agent_config,
+                    firecrawl_api_key=firecrawl_api_key,
+                    thread_visibility=visibility,
+                    filesystem_selection=filesystem_selection,
+                    disabled_tools=disabled_tools,
+                    mentioned_document_ids=mentioned_document_ids,
+                )
+                _perf_log.info(
+                    "[stream_new_chat] Runtime rate-limit recovery repinned "
+                    "config_id=%s -> %s and rebuilt agent in %.3fs",
+                    previous_config_id,
+                    llm_config_id,
+                    time.perf_counter() - _t0,
+                )
+                _log_chat_stream_error(
+                    flow=flow,
+                    error_kind="rate_limited",
+                    error_code="RATE_LIMITED",
+                    severity="info",
+                    is_expected=True,
+                    request_id=request_id,
+                    thread_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    message=(
+                        "Auto-pinned model hit runtime rate limit; switched to "
+                        "another eligible model and retried."
+                    ),
+                    extra={
+                        "auto_runtime_recover": True,
+                        "previous_config_id": previous_config_id,
+                        "fallback_config_id": llm_config_id,
+                    },
+                )
+                continue
 
         _perf_log.info(
             "[stream_new_chat] Agent stream completed in %.3fs (chat_id=%s)",
@@ -1519,6 +2116,28 @@ async def _generate_title() -> str | None:
         if stream_result.is_interrupted:
             if title_task is not None and not title_task.done():
                 title_task.cancel()
+
+            usage_summary = accumulator.per_message_summary()
+            _perf_log.info(
+                "[token_usage] interrupted new_chat: calls=%d total=%d cost_micros=%d summary=%s",
+                len(accumulator.calls),
+                accumulator.grand_total,
+                accumulator.total_cost_micros,
+                usage_summary,
+            )
+            if usage_summary:
+                yield streaming_service.format_data(
+                    "token-usage",
+                    {
+                        "usage": usage_summary,
+                        "prompt_tokens": accumulator.total_prompt_tokens,
+                        "completion_tokens": accumulator.total_completion_tokens,
+                        "total_tokens": accumulator.grand_total,
+                        "cost_micros": accumulator.total_cost_micros,
+                        "call_details": accumulator.serialized_calls(),
+                    },
+                )
+
             yield streaming_service.format_finish_step()
             yield streaming_service.format_finish()
             yield streaming_service.format_done()
@@ -1526,7 +2145,9 @@ async def _generate_title() -> str | None:
 
         # If the title task didn't finish during streaming, await it now
         if title_task is not None and not title_emitted:
-            generated_title = await title_task
+            generated_title, title_usage = await title_task
+            if title_usage:
+                accumulator.add(**title_usage)
             if generated_title:
                 async with shielded_async_session() as title_session:
                     title_thread_result = await title_session.execute(
@@ -1540,7 +2161,85 @@ async def _generate_title() -> str | None:
                     chat_id, generated_title
                 )
 
+        # Finalize premium credit debit with the actual provider cost
+        # reported by LiteLLM, summed across every call in the turn.
+        # Mirrors the pre-cost behaviour of "premium turn → all calls
+        # count" so free sub-agent calls during a premium turn still
+        # contribute to the bill (they're $0 in practice anyway).
+        if _premium_request_id and user_id:
+            try:
+                from app.services.token_quota_service import TokenQuotaService
+
+                async with shielded_async_session() as quota_session:
+                    await TokenQuotaService.premium_finalize(
+                        db_session=quota_session,
+                        user_id=UUID(user_id),
+                        request_id=_premium_request_id,
+                        actual_micros=accumulator.total_cost_micros,
+                        reserved_micros=_premium_reserved_micros,
+                    )
+                _premium_request_id = None
+                _premium_reserved_micros = 0
+            except Exception:
+                logging.getLogger(__name__).warning(
+                    "Failed to finalize premium quota for user %s",
+                    user_id,
+                    exc_info=True,
+                )
+
+        usage_summary = accumulator.per_message_summary()
+        _perf_log.info(
+            "[token_usage] normal new_chat: calls=%d total=%d cost_micros=%d summary=%s",
+            len(accumulator.calls),
+            accumulator.grand_total,
+            accumulator.total_cost_micros,
+            usage_summary,
+        )
+        if usage_summary:
+            yield streaming_service.format_data(
+                "token-usage",
+                {
+                    "usage": usage_summary,
+                    "prompt_tokens": accumulator.total_prompt_tokens,
+                    "completion_tokens": accumulator.total_completion_tokens,
+                    "total_tokens": accumulator.grand_total,
+                    "cost_micros": accumulator.total_cost_micros,
+                    "call_details": accumulator.serialized_calls(),
+                },
+            )
+
+        # Fire background memory extraction if the agent didn't handle it.
+        # Shared threads write to team memory; private threads write to user memory.
+        if not stream_result.agent_called_update_memory:
+            memory_seed = user_query.strip() or (
+                f"[{len(user_image_data_urls or [])} image(s)]"
+                if user_image_data_urls
+                else "(message)"
+            )
+            if visibility == ChatVisibility.SEARCH_SPACE:
+                task = asyncio.create_task(
+                    extract_and_save_team_memory(
+                        user_message=memory_seed,
+                        search_space_id=search_space_id,
+                        llm=llm,
+                        author_display_name=current_user_display_name,
+                    )
+                )
+                _background_tasks.add(task)
+                task.add_done_callback(_background_tasks.discard)
+            elif user_id:
+                task = asyncio.create_task(
+                    extract_and_save_memory(
+                        user_message=memory_seed,
+                        user_id=user_id,
+                        llm=llm,
+                    )
+                )
+                _background_tasks.add(task)
+                task.add_done_callback(_background_tasks.discard)
+
         # Finish the step and message
+        yield streaming_service.format_data("turn-status", {"status": "idle"})
         yield streaming_service.format_finish_step()
         yield streaming_service.format_finish()
         yield streaming_service.format_done()
@@ -1549,12 +2248,41 @@ async def _generate_title() -> str | None:
         # Handle any errors
         import traceback
 
+        # ``BusyError`` fires before the agent acquires the lock; the
+        # cleanup path must skip lock release to avoid freeing the
+        # in-flight caller's lock. Classification is handled below.
+        if isinstance(e, BusyError):
+            _busy_error_raised = True
+
+        (
+            error_kind,
+            error_code,
+            severity,
+            is_expected,
+            user_message,
+            error_extra,
+        ) = _classify_stream_exception(e, flow_label="chat")
         error_message = f"Error during chat: {e!s}"
         print(f"[stream_new_chat] {error_message}")
         print(f"[stream_new_chat] Exception type: {type(e).__name__}")
         print(f"[stream_new_chat] Traceback:\n{traceback.format_exc()}")
-
-        yield streaming_service.format_error(error_message)
+        if error_code == "TURN_CANCELLING":
+            status_payload: dict[str, Any] = {"status": "cancelling"}
+            if error_extra:
+                status_payload.update(error_extra)
+            yield streaming_service.format_data("turn-status", status_payload)
+        else:
+            yield streaming_service.format_data("turn-status", {"status": "busy"})
+
+        yield _emit_stream_error(
+            message=user_message,
+            error_kind=error_kind,
+            error_code=error_code,
+            severity=severity,
+            is_expected=is_expected,
+            extra=error_extra,
+        )
+        yield streaming_service.format_data("turn-status", {"status": "idle"})
         yield streaming_service.format_finish_step()
         yield streaming_service.format_finish()
         yield streaming_service.format_done()
@@ -1570,6 +2298,27 @@ async def _generate_title() -> str | None:
         # (CancelledError is a BaseException), and the rest of the
         # finally block — including session.close() — would never run.
         with anyio.CancelScope(shield=True):
+            # Authoritative fallback cleanup for lock/cancel state. Middleware
+            # teardown can be skipped on some client-abort paths.
+            end_turn(str(chat_id))
+
+            # Release premium reservation if not finalized
+            if _premium_request_id and _premium_reserved_micros > 0 and user_id:
+                try:
+                    from app.services.token_quota_service import TokenQuotaService
+
+                    async with shielded_async_session() as quota_session:
+                        await TokenQuotaService.premium_release(
+                            db_session=quota_session,
+                            user_id=UUID(user_id),
+                            reserved_micros=_premium_reserved_micros,
+                        )
+                    _premium_reserved_micros = 0
+                except Exception:
+                    logging.getLogger(__name__).warning(
+                        "Failed to release premium quota for user %s", user_id
+                    )
+
             try:
                 await session.rollback()
                 await clear_ai_responding(session, chat_id)
@@ -1588,6 +2337,106 @@ async def _generate_title() -> str | None:
             with contextlib.suppress(Exception):
                 await session.close()
 
+            # Server-side assistant-message + token_usage finalization.
+            # Runs after the main session has been closed (uses its own
+            # shielded session) so we don't fight the same DB connection.
+            # Idempotent against the legacy frontend appendMessage:
+            #  * the assistant row was already INSERTed by
+            #    ``persist_assistant_shell`` above, so this just UPDATEs
+            #    it with the rich ContentPart[] from the builder.
+            #  * token_usage uses INSERT ... ON CONFLICT DO NOTHING
+            #    against migration 142's partial unique index, so a
+            #    racing append_message recovery branch can never
+            #    double-write.
+            # ``mark_interrupted`` closes any open text/reasoning blocks
+            # and flips running tool-calls (no result) to state=aborted
+            # so the persisted JSONB reflects a coherent end-state even
+            # on client disconnect.
+            # Never raises (best-effort, logs only).
+            if (
+                stream_result
+                and stream_result.turn_id
+                and stream_result.assistant_message_id
+            ):
+                from app.tasks.chat.persistence import finalize_assistant_turn
+
+                builder_stats: dict[str, int] | None = None
+                if stream_result.content_builder is not None:
+                    stream_result.content_builder.mark_interrupted()
+                    # Snapshot stats BEFORE deepcopy in ``snapshot()`` so
+                    # the perf log records the actual finalised payload
+                    # (post-mark_interrupted), not the live-mutating
+                    # builder state.
+                    builder_stats = stream_result.content_builder.stats()
+                    content_payload = stream_result.content_builder.snapshot()
+                else:
+                    # Defensive fallback — we always set the builder
+                    # alongside ``assistant_message_id`` above, so this
+                    # branch only fires if a future refactor ever
+                    # decouples them. Persist whatever accumulated
+                    # text we captured so the row at least renders.
+                    content_payload = [
+                        {
+                            "type": "text",
+                            "text": stream_result.accumulated_text or "",
+                        }
+                    ]
+
+                if builder_stats is not None:
+                    _perf_log.info(
+                        "[stream_new_chat] finalize_payload chat_id=%s "
+                        "message_id=%s parts=%d bytes=%d text=%d "
+                        "reasoning=%d tool_calls=%d "
+                        "tool_calls_completed=%d tool_calls_aborted=%d "
+                        "thinking_step_parts=%d step_separators=%d",
+                        chat_id,
+                        stream_result.assistant_message_id,
+                        builder_stats["parts"],
+                        builder_stats["bytes"],
+                        builder_stats["text"],
+                        builder_stats["reasoning"],
+                        builder_stats["tool_calls"],
+                        builder_stats["tool_calls_completed"],
+                        builder_stats["tool_calls_aborted"],
+                        builder_stats["thinking_step_parts"],
+                        builder_stats["step_separators"],
+                    )
+
+                await finalize_assistant_turn(
+                    message_id=stream_result.assistant_message_id,
+                    chat_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    turn_id=stream_result.turn_id,
+                    content=content_payload,
+                    accumulator=accumulator,
+                )
+
+        # Persist any sandbox-produced files to local storage so they
+        # remain downloadable after the Daytona sandbox auto-deletes.
+        if stream_result and stream_result.sandbox_files:
+            with contextlib.suppress(Exception):
+                from app.agents.new_chat.sandbox import (
+                    is_sandbox_enabled,
+                    persist_and_delete_sandbox,
+                )
+
+                if is_sandbox_enabled():
+                    with anyio.CancelScope(shield=True):
+                        await persist_and_delete_sandbox(
+                            chat_id, stream_result.sandbox_files
+                        )
+
+        # ``aafter_agent`` doesn't fire on ``interrupt()`` or early bailout.
+        # Skip on ``BusyError`` (caller never acquired the lock).
+        if not _busy_error_raised:
+            with contextlib.suppress(Exception):
+                end_turn(str(chat_id))
+                _perf_log.info(
+                    "[stream_new_chat] end_turn cleanup (chat_id=%s)",
+                    chat_id,
+                )
+
         # Break circular refs held by the agent graph, tools, and LLM
         # wrappers so the GC can reclaim them in a single pass.
         agent = llm = connector_service = None
@@ -1612,10 +2461,43 @@ async def stream_resume_chat(
     user_id: str | None = None,
     llm_config_id: int = -1,
     thread_visibility: ChatVisibility | None = None,
+    filesystem_selection: FilesystemSelection | None = None,
+    request_id: str | None = None,
+    disabled_tools: list[str] | None = None,
 ) -> AsyncGenerator[str, None]:
     streaming_service = VercelStreamingService()
     stream_result = StreamResult()
     _t_total = time.perf_counter()
+    fs_mode = filesystem_selection.mode.value if filesystem_selection else "cloud"
+    fs_platform = (
+        filesystem_selection.client_platform.value if filesystem_selection else "web"
+    )
+    stream_result.request_id = request_id
+    stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
+    stream_result.filesystem_mode = fs_mode
+    stream_result.client_platform = fs_platform
+    _log_file_contract("turn_start", stream_result)
+    _perf_log.info(
+        "[stream_resume] filesystem_mode=%s client_platform=%s",
+        fs_mode,
+        fs_platform,
+    )
+    from app.services.token_tracking_service import start_turn
+
+    accumulator = start_turn()
+
+    # Skip the finally release on ``BusyError`` (caller never acquired the lock).
+    _busy_error_raised = False
+
+    _emit_stream_error = partial(
+        _emit_stream_terminal_error,
+        streaming_service=streaming_service,
+        flow="resume",
+        request_id=request_id,
+        thread_id=chat_id,
+        search_space_id=search_space_id,
+        user_id=user_id,
+    )
 
     session = async_session_maker()
     try:
@@ -1623,39 +2505,202 @@ async def stream_resume_chat(
             await set_ai_responding(session, chat_id, UUID(user_id))
 
         agent_config: AgentConfig | None = None
+        requested_llm_config_id = llm_config_id
+
+        async def _load_llm_bundle(
+            config_id: int,
+        ) -> tuple[Any, AgentConfig | None, str | None]:
+            if config_id >= 0:
+                loaded_agent_config = await load_agent_config(
+                    session=session,
+                    config_id=config_id,
+                    search_space_id=search_space_id,
+                )
+                if not loaded_agent_config:
+                    return (
+                        None,
+                        None,
+                        f"Failed to load NewLLMConfig with id {config_id}",
+                    )
+                return (
+                    create_chat_litellm_from_agent_config(loaded_agent_config),
+                    loaded_agent_config,
+                    None,
+                )
+
+            loaded_llm_config = load_global_llm_config_by_id(config_id)
+            if not loaded_llm_config:
+                return None, None, f"Failed to load LLM config with id {config_id}"
+            return (
+                create_chat_litellm_from_config(loaded_llm_config),
+                AgentConfig.from_yaml_config(loaded_llm_config),
+                None,
+            )
+
         _t0 = time.perf_counter()
-        if llm_config_id >= 0:
-            agent_config = await load_agent_config(
-                session=session,
-                config_id=llm_config_id,
-                search_space_id=search_space_id,
+        try:
+            llm_config_id = (
+                await resolve_or_get_pinned_llm_config_id(
+                    session,
+                    thread_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    selected_llm_config_id=llm_config_id,
+                )
+            ).resolved_llm_config_id
+        except ValueError as pin_error:
+            yield _emit_stream_error(
+                message=str(pin_error),
+                error_kind="server_error",
+                error_code="SERVER_ERROR",
             )
-            if not agent_config:
-                yield streaming_service.format_error(
-                    f"Failed to load NewLLMConfig with id {llm_config_id}"
-                )
-                yield streaming_service.format_done()
-                return
-            llm = create_chat_litellm_from_agent_config(agent_config)
-        else:
-            llm_config = load_llm_config_from_yaml(llm_config_id=llm_config_id)
-            if not llm_config:
-                yield streaming_service.format_error(
-                    f"Failed to load LLM config with id {llm_config_id}"
-                )
-                yield streaming_service.format_done()
-                return
-            llm = create_chat_litellm_from_config(llm_config)
-            agent_config = AgentConfig.from_yaml_config(llm_config)
+            yield streaming_service.format_done()
+            return
+
+        llm, agent_config, llm_load_error = await _load_llm_bundle(llm_config_id)
+        if llm_load_error:
+            yield _emit_stream_error(
+                message=llm_load_error,
+                error_kind="server_error",
+                error_code="SERVER_ERROR",
+            )
+            yield streaming_service.format_done()
+            return
         _perf_log.info(
             "[stream_resume] LLM config loaded in %.3fs", time.perf_counter() - _t0
         )
 
+        # Premium credit reservation (same logic as stream_new_chat).
+        _resume_premium_reserved_micros = 0
+        _resume_premium_request_id: str | None = None
+        _resume_needs_premium = (
+            agent_config is not None and user_id and agent_config.is_premium
+        )
+        if _resume_needs_premium:
+            import uuid as _uuid
+
+            from app.services.token_quota_service import (
+                TokenQuotaService,
+                estimate_call_reserve_micros,
+            )
+
+            _resume_premium_request_id = _uuid.uuid4().hex[:16]
+            _resume_litellm_params = agent_config.litellm_params or {}
+            _resume_base_model = (
+                _resume_litellm_params.get("base_model")
+                or agent_config.model_name
+                or ""
+            )
+            reserve_amount_micros = estimate_call_reserve_micros(
+                base_model=_resume_base_model,
+                quota_reserve_tokens=agent_config.quota_reserve_tokens,
+            )
+            async with shielded_async_session() as quota_session:
+                quota_result = await TokenQuotaService.premium_reserve(
+                    db_session=quota_session,
+                    user_id=UUID(user_id),
+                    request_id=_resume_premium_request_id,
+                    reserve_micros=reserve_amount_micros,
+                )
+            _resume_premium_reserved_micros = reserve_amount_micros
+            if not quota_result.allowed:
+                if requested_llm_config_id == 0:
+                    try:
+                        llm_config_id = (
+                            await resolve_or_get_pinned_llm_config_id(
+                                session,
+                                thread_id=chat_id,
+                                search_space_id=search_space_id,
+                                user_id=user_id,
+                                selected_llm_config_id=0,
+                                force_repin_free=True,
+                            )
+                        ).resolved_llm_config_id
+                    except ValueError as pin_error:
+                        yield _emit_stream_error(
+                            message=str(pin_error),
+                            error_kind="server_error",
+                            error_code="SERVER_ERROR",
+                        )
+                        yield streaming_service.format_done()
+                        return
+
+                    llm, agent_config, llm_load_error = await _load_llm_bundle(
+                        llm_config_id
+                    )
+                    if llm_load_error:
+                        yield _emit_stream_error(
+                            message=llm_load_error,
+                            error_kind="server_error",
+                            error_code="SERVER_ERROR",
+                        )
+                        yield streaming_service.format_done()
+                        return
+                    _resume_premium_request_id = None
+                    _resume_premium_reserved_micros = 0
+                    _log_chat_stream_error(
+                        flow="resume",
+                        error_kind="premium_quota_exhausted",
+                        error_code="PREMIUM_QUOTA_EXHAUSTED",
+                        severity="info",
+                        is_expected=True,
+                        request_id=request_id,
+                        thread_id=chat_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                        message=(
+                            "Premium quota exhausted on pinned model; auto-fallback switched to a free model"
+                        ),
+                        extra={
+                            "fallback_config_id": llm_config_id,
+                            "auto_fallback": True,
+                        },
+                    )
+                else:
+                    yield _emit_stream_error(
+                        message=(
+                            "Buy more tokens to continue with this model, or switch to a free model"
+                        ),
+                        error_kind="premium_quota_exhausted",
+                        error_code="PREMIUM_QUOTA_EXHAUSTED",
+                        severity="info",
+                        is_expected=True,
+                        extra={
+                            "resolved_config_id": llm_config_id,
+                            "auto_fallback": False,
+                        },
+                    )
+                    yield streaming_service.format_done()
+                    return
+
         if not llm:
-            yield streaming_service.format_error("Failed to create LLM instance")
+            yield _emit_stream_error(
+                message="Failed to create LLM instance",
+                error_kind="server_error",
+                error_code="SERVER_ERROR",
+            )
             yield streaming_service.format_done()
             return
 
+        # Auto-mode preflight ping (resume path). Mirrors ``stream_new_chat``:
+        # one cheap probe before the agent is rebuilt so a 429'd pin gets
+        # repinned without burning planner/classifier/title calls first.
+        # See ``stream_new_chat`` for the full rationale on the speculative
+        # parallel build pattern below.
+        preflight_needed = (
+            requested_llm_config_id == 0
+            and llm_config_id < 0
+            and not is_recently_healthy(llm_config_id)
+        )
+        preflight_task: asyncio.Task[None] | None = None
+        _t_preflight = 0.0
+        if preflight_needed:
+            _t_preflight = time.perf_counter()
+            preflight_task = asyncio.create_task(
+                _preflight_llm(llm),
+                name=f"auto_pin_preflight_resume:{llm_config_id}",
+            )
+
         _t0 = time.perf_counter()
         connector_service = ConnectorService(session, search_space_id=search_space_id)
 
@@ -1677,20 +2722,123 @@ async def stream_resume_chat(
         )
 
         visibility = thread_visibility or ChatVisibility.PRIVATE
+        from app.config import config as _app_config
 
         _t0 = time.perf_counter()
-        agent = await create_surfsense_deep_agent(
-            llm=llm,
-            search_space_id=search_space_id,
-            db_session=session,
-            connector_service=connector_service,
-            checkpointer=checkpointer,
-            user_id=user_id,
-            thread_id=chat_id,
-            agent_config=agent_config,
-            firecrawl_api_key=firecrawl_api_key,
-            thread_visibility=visibility,
+        agent_factory = (
+            create_multi_agent_chat_deep_agent
+            if _app_config.MULTI_AGENT_CHAT_ENABLED
+            else create_surfsense_deep_agent
+        )
+        agent_build_task = asyncio.create_task(
+            _build_main_agent_for_thread(
+                agent_factory,
+                llm=llm,
+                search_space_id=search_space_id,
+                db_session=session,
+                connector_service=connector_service,
+                checkpointer=checkpointer,
+                user_id=user_id,
+                thread_id=chat_id,
+                agent_config=agent_config,
+                firecrawl_api_key=firecrawl_api_key,
+                thread_visibility=visibility,
+                filesystem_selection=filesystem_selection,
+                disabled_tools=disabled_tools,
+            ),
+            name="agent_build:stream_resume",
         )
+
+        agent: Any = None
+        if preflight_task is not None:
+            try:
+                await preflight_task
+                mark_healthy(llm_config_id)
+                _perf_log.info(
+                    "[stream_resume] auto_pin_preflight ok config_id=%s took=%.3fs (parallel)",
+                    llm_config_id,
+                    time.perf_counter() - _t_preflight,
+                )
+            except Exception as preflight_exc:
+                # Same session-safety rationale as ``stream_new_chat``.
+                await _settle_speculative_agent_build(agent_build_task)
+                if not _is_provider_rate_limited(preflight_exc):
+                    raise
+                previous_config_id = llm_config_id
+                mark_runtime_cooldown(
+                    previous_config_id, reason="preflight_rate_limited"
+                )
+                try:
+                    llm_config_id = (
+                        await resolve_or_get_pinned_llm_config_id(
+                            session,
+                            thread_id=chat_id,
+                            search_space_id=search_space_id,
+                            user_id=user_id,
+                            selected_llm_config_id=0,
+                            exclude_config_ids={previous_config_id},
+                        )
+                    ).resolved_llm_config_id
+                except ValueError as pin_error:
+                    yield _emit_stream_error(
+                        message=str(pin_error),
+                        error_kind="server_error",
+                        error_code="SERVER_ERROR",
+                    )
+                    yield streaming_service.format_done()
+                    return
+
+                llm, agent_config, llm_load_error = await _load_llm_bundle(
+                    llm_config_id
+                )
+                if llm_load_error or not llm:
+                    yield _emit_stream_error(
+                        message=llm_load_error or "Failed to create LLM instance",
+                        error_kind="server_error",
+                        error_code="SERVER_ERROR",
+                    )
+                    yield streaming_service.format_done()
+                    return
+                mark_healthy(llm_config_id)
+                _log_chat_stream_error(
+                    flow="resume",
+                    error_kind="rate_limited",
+                    error_code="RATE_LIMITED",
+                    severity="info",
+                    is_expected=True,
+                    request_id=request_id,
+                    thread_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    message=(
+                        "Auto-pinned model failed preflight; switched to another "
+                        "eligible model and continuing."
+                    ),
+                    extra={
+                        "auto_runtime_recover": True,
+                        "preflight": True,
+                        "previous_config_id": previous_config_id,
+                        "fallback_config_id": llm_config_id,
+                    },
+                )
+                agent = await _build_main_agent_for_thread(
+                    agent_factory,
+                    llm=llm,
+                    search_space_id=search_space_id,
+                    db_session=session,
+                    connector_service=connector_service,
+                    checkpointer=checkpointer,
+                    user_id=user_id,
+                    thread_id=chat_id,
+                    agent_config=agent_config,
+                    firecrawl_api_key=firecrawl_api_key,
+                    thread_visibility=visibility,
+                    filesystem_selection=filesystem_selection,
+                    disabled_tools=disabled_tools,
+                )
+
+        if agent is None:
+            agent = await agent_build_task
         _perf_log.info(
             "[stream_resume] Agent created in %.3fs", time.perf_counter() - _t0
         )
@@ -1707,44 +2855,305 @@ async def stream_resume_chat(
 
         from langgraph.types import Command
 
+        from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+            build_lg_resume_map,
+            collect_pending_tool_calls,
+            slice_decisions_by_tool_call,
+        )
+
+        # Each pending interrupt is stamped with its originating ``tool_call_id``
+        # (see ``checkpointed_subagent_middleware.propagation``) so we can route
+        # a flat ``decisions`` list back to the right paused subagent.
+        parent_state = await agent.aget_state(
+            {"configurable": {"thread_id": str(chat_id)}}
+        )
+        pending = collect_pending_tool_calls(parent_state)
+        _perf_log.info(
+            "[hitl_route] resume_entry chat_id=%s decisions=%d pending_subagents=%d",
+            chat_id,
+            len(decisions),
+            len(pending),
+        )
+        routed_resume_value = slice_decisions_by_tool_call(decisions, pending)
+        # Langgraph rejects scalar ``Command(resume=...)`` when multiple
+        # interrupts are pending (parallel HITL); the mapped form works
+        # for the single-pause case too, so we always use it.
+        lg_resume_map = build_lg_resume_map(parent_state, routed_resume_value)
+
         config = {
-            "configurable": {"thread_id": str(chat_id)},
-            "recursion_limit": 80,
+            "configurable": {
+                "thread_id": str(chat_id),
+                "request_id": request_id or "unknown",
+                "turn_id": stream_result.turn_id,
+                # Per-``tool_call_id`` resume slices read by
+                # ``SurfSenseCheckpointedSubAgentMiddleware``. Parallel
+                # siblings each pop their own entry, so they never race.
+                "surfsense_resume_value": routed_resume_value,
+            },
+            # See ``stream_new_chat`` above for rationale: effectively
+            # uncapped to mirror the agent default and OpenCode's
+            # session loop. Doom-loop / call-limit middleware enforce
+            # the real ceiling.
+            "recursion_limit": 10_000,
         }
 
         yield streaming_service.format_message_start()
         yield streaming_service.format_start_step()
+        # Same rationale as ``stream_new_chat``: emit the turn id so
+        # resumed streams can be persisted with their correlation id
+        # intact.
+        yield streaming_service.format_data(
+            "turn-info",
+            {"chat_turn_id": stream_result.turn_id},
+        )
+        yield streaming_service.format_data("turn-status", {"status": "busy"})
+
+        # Pre-write a fresh assistant row for this resume turn. The
+        # original (interrupted) ``stream_new_chat`` invocation already
+        # persisted its own assistant row anchored to a different
+        # ``turn_id``; resume allocates a new ``turn_id`` (above) so we
+        # need a separate row keyed on the same ``(thread_id, turn_id,
+        # ASSISTANT)`` invariant. Idempotent against migration 141's
+        # partial unique index — recovers existing id on retry.
+        from app.tasks.chat.content_builder import AssistantContentBuilder
+        from app.tasks.chat.persistence import persist_assistant_shell
+
+        assistant_message_id = await persist_assistant_shell(
+            chat_id=chat_id,
+            user_id=user_id,
+            turn_id=stream_result.turn_id,
+        )
+        if assistant_message_id is None:
+            yield _emit_stream_error(
+                message=(
+                    "We couldn't initialize the assistant message. Please try again."
+                ),
+                error_kind="server_error",
+                error_code="MESSAGE_PERSIST_FAILED",
+            )
+            yield streaming_service.format_data("turn-status", {"status": "idle"})
+            yield streaming_service.format_finish_step()
+            yield streaming_service.format_finish()
+            yield streaming_service.format_done()
+            return
+
+        # Emit canonical assistant message id BEFORE any LLM streaming
+        # so the FE can rename ``pendingInterrupt.assistantMsgId`` to
+        # ``msg-{assistant_message_id}`` immediately. Resume does NOT
+        # emit ``data-user-message-id`` because the user row is from
+        # the original interrupted turn (different ``turn_id``) and is
+        # never re-persisted here. See B5 in the
+        # ``sse-based_message_id_handshake`` plan.
+        yield streaming_service.format_data(
+            "assistant-message-id",
+            {"message_id": assistant_message_id, "turn_id": stream_result.turn_id},
+        )
+
+        stream_result.assistant_message_id = assistant_message_id
+        stream_result.content_builder = AssistantContentBuilder()
+
+        # Resume path doesn't carry new ``mentioned_document_ids`` —
+        # those are seeded in the original turn. We still pass a
+        # context so future middleware extensions (Phase 2) can rely on
+        # ``runtime.context`` always being populated.
+        runtime_context = SurfSenseContextSchema(
+            search_space_id=search_space_id,
+            request_id=request_id,
+            turn_id=stream_result.turn_id,
+        )
 
         _t_stream_start = time.perf_counter()
         _first_event_logged = False
-        async for sse in _stream_agent_events(
-            agent=agent,
-            config=config,
-            input_data=Command(resume={"decisions": decisions}),
-            streaming_service=streaming_service,
-            result=stream_result,
-            step_prefix="thinking-resume",
-        ):
-            if not _first_event_logged:
+        runtime_rate_limit_recovered = False
+        while True:
+            try:
+                async for sse in _stream_agent_events(
+                    agent=agent,
+                    config=config,
+                    input_data=Command(resume=lg_resume_map),
+                    streaming_service=streaming_service,
+                    result=stream_result,
+                    step_prefix=_resume_step_prefix(stream_result.turn_id),
+                    fallback_commit_search_space_id=search_space_id,
+                    fallback_commit_created_by_id=user_id,
+                    fallback_commit_filesystem_mode=(
+                        filesystem_selection.mode
+                        if filesystem_selection
+                        else FilesystemMode.CLOUD
+                    ),
+                    fallback_commit_thread_id=chat_id,
+                    runtime_context=runtime_context,
+                    content_builder=stream_result.content_builder,
+                ):
+                    if not _first_event_logged:
+                        _perf_log.info(
+                            "[stream_resume] First agent event in %.3fs (stream), %.3fs (total) (chat_id=%s)",
+                            time.perf_counter() - _t_stream_start,
+                            time.perf_counter() - _t_total,
+                            chat_id,
+                        )
+                        _first_event_logged = True
+                    yield sse
+                break
+            except Exception as stream_exc:
+                can_runtime_recover = (
+                    not runtime_rate_limit_recovered
+                    and requested_llm_config_id == 0
+                    and llm_config_id < 0
+                    and not _first_event_logged
+                    and _is_provider_rate_limited(stream_exc)
+                )
+                if not can_runtime_recover:
+                    raise
+
+                runtime_rate_limit_recovered = True
+                previous_config_id = llm_config_id
+                # Ensure the same-request recovery retry does not trip the
+                # BusyMutex lock retained by the failed attempt.
+                end_turn(str(chat_id))
+                mark_runtime_cooldown(
+                    previous_config_id,
+                    reason="provider_rate_limited",
+                )
+                llm_config_id = (
+                    await resolve_or_get_pinned_llm_config_id(
+                        session,
+                        thread_id=chat_id,
+                        search_space_id=search_space_id,
+                        user_id=user_id,
+                        selected_llm_config_id=0,
+                        exclude_config_ids={previous_config_id},
+                    )
+                ).resolved_llm_config_id
+
+                llm, agent_config, llm_load_error = await _load_llm_bundle(
+                    llm_config_id
+                )
+                if llm_load_error:
+                    raise stream_exc
+
+                _t0 = time.perf_counter()
+                agent = await _build_main_agent_for_thread(
+                    agent_factory,
+                    llm=llm,
+                    search_space_id=search_space_id,
+                    db_session=session,
+                    connector_service=connector_service,
+                    checkpointer=checkpointer,
+                    user_id=user_id,
+                    thread_id=chat_id,
+                    agent_config=agent_config,
+                    firecrawl_api_key=firecrawl_api_key,
+                    thread_visibility=visibility,
+                    filesystem_selection=filesystem_selection,
+                    disabled_tools=disabled_tools,
+                )
                 _perf_log.info(
-                    "[stream_resume] First agent event in %.3fs (stream), %.3fs (total) (chat_id=%s)",
-                    time.perf_counter() - _t_stream_start,
-                    time.perf_counter() - _t_total,
-                    chat_id,
+                    "[stream_resume] Runtime rate-limit recovery repinned "
+                    "config_id=%s -> %s and rebuilt agent in %.3fs",
+                    previous_config_id,
+                    llm_config_id,
+                    time.perf_counter() - _t0,
+                )
+                _log_chat_stream_error(
+                    flow="resume",
+                    error_kind="rate_limited",
+                    error_code="RATE_LIMITED",
+                    severity="info",
+                    is_expected=True,
+                    request_id=request_id,
+                    thread_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    message=(
+                        "Auto-pinned model hit runtime rate limit; switched to "
+                        "another eligible model and retried."
+                    ),
+                    extra={
+                        "auto_runtime_recover": True,
+                        "previous_config_id": previous_config_id,
+                        "fallback_config_id": llm_config_id,
+                    },
                 )
-                _first_event_logged = True
-            yield sse
+                continue
         _perf_log.info(
             "[stream_resume] Agent stream completed in %.3fs (chat_id=%s)",
             time.perf_counter() - _t_stream_start,
             chat_id,
         )
         if stream_result.is_interrupted:
+            usage_summary = accumulator.per_message_summary()
+            _perf_log.info(
+                "[token_usage] interrupted resume_chat: calls=%d total=%d cost_micros=%d summary=%s",
+                len(accumulator.calls),
+                accumulator.grand_total,
+                accumulator.total_cost_micros,
+                usage_summary,
+            )
+            if usage_summary:
+                yield streaming_service.format_data(
+                    "token-usage",
+                    {
+                        "usage": usage_summary,
+                        "prompt_tokens": accumulator.total_prompt_tokens,
+                        "completion_tokens": accumulator.total_completion_tokens,
+                        "total_tokens": accumulator.grand_total,
+                        "cost_micros": accumulator.total_cost_micros,
+                        "call_details": accumulator.serialized_calls(),
+                    },
+                )
+
             yield streaming_service.format_finish_step()
             yield streaming_service.format_finish()
             yield streaming_service.format_done()
             return
 
+        # Finalize premium credit debit for resume path with the actual
+        # provider cost reported by LiteLLM (sum of cost across all
+        # calls in the turn).
+        if _resume_premium_request_id and user_id:
+            try:
+                from app.services.token_quota_service import TokenQuotaService
+
+                async with shielded_async_session() as quota_session:
+                    await TokenQuotaService.premium_finalize(
+                        db_session=quota_session,
+                        user_id=UUID(user_id),
+                        request_id=_resume_premium_request_id,
+                        actual_micros=accumulator.total_cost_micros,
+                        reserved_micros=_resume_premium_reserved_micros,
+                    )
+                _resume_premium_request_id = None
+                _resume_premium_reserved_micros = 0
+            except Exception:
+                logging.getLogger(__name__).warning(
+                    "Failed to finalize premium quota for user %s (resume)",
+                    user_id,
+                    exc_info=True,
+                )
+
+        usage_summary = accumulator.per_message_summary()
+        _perf_log.info(
+            "[token_usage] normal resume_chat: calls=%d total=%d cost_micros=%d summary=%s",
+            len(accumulator.calls),
+            accumulator.grand_total,
+            accumulator.total_cost_micros,
+            usage_summary,
+        )
+        if usage_summary:
+            yield streaming_service.format_data(
+                "token-usage",
+                {
+                    "usage": usage_summary,
+                    "prompt_tokens": accumulator.total_prompt_tokens,
+                    "completion_tokens": accumulator.total_completion_tokens,
+                    "total_tokens": accumulator.grand_total,
+                    "cost_micros": accumulator.total_cost_micros,
+                    "call_details": accumulator.serialized_calls(),
+                },
+            )
+
+        yield streaming_service.format_data("turn-status", {"status": "idle"})
         yield streaming_service.format_finish_step()
         yield streaming_service.format_finish()
         yield streaming_service.format_done()
@@ -1752,16 +3161,70 @@ async def stream_resume_chat(
     except Exception as e:
         import traceback
 
+        # ``BusyError`` fires before the agent acquires the lock; the
+        # cleanup path must skip lock release to avoid freeing the
+        # in-flight caller's lock. Classification is handled below.
+        if isinstance(e, BusyError):
+            _busy_error_raised = True
+
+        (
+            error_kind,
+            error_code,
+            severity,
+            is_expected,
+            user_message,
+            error_extra,
+        ) = _classify_stream_exception(e, flow_label="resume")
         error_message = f"Error during resume: {e!s}"
         print(f"[stream_resume_chat] {error_message}")
         print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}")
-        yield streaming_service.format_error(error_message)
+        if error_code == "TURN_CANCELLING":
+            status_payload: dict[str, Any] = {"status": "cancelling"}
+            if error_extra:
+                status_payload.update(error_extra)
+            yield streaming_service.format_data("turn-status", status_payload)
+        else:
+            yield streaming_service.format_data("turn-status", {"status": "busy"})
+        yield _emit_stream_error(
+            message=user_message,
+            error_kind=error_kind,
+            error_code=error_code,
+            severity=severity,
+            is_expected=is_expected,
+            extra=error_extra,
+        )
+        yield streaming_service.format_data("turn-status", {"status": "idle"})
         yield streaming_service.format_finish_step()
         yield streaming_service.format_finish()
         yield streaming_service.format_done()
 
     finally:
         with anyio.CancelScope(shield=True):
+            # Authoritative fallback cleanup for lock/cancel state. Middleware
+            # teardown can be skipped on some client-abort paths.
+            end_turn(str(chat_id))
+
+            # Release premium reservation if not finalized
+            if (
+                _resume_premium_request_id
+                and _resume_premium_reserved_micros > 0
+                and user_id
+            ):
+                try:
+                    from app.services.token_quota_service import TokenQuotaService
+
+                    async with shielded_async_session() as quota_session:
+                        await TokenQuotaService.premium_release(
+                            db_session=quota_session,
+                            user_id=UUID(user_id),
+                            reserved_micros=_resume_premium_reserved_micros,
+                        )
+                    _resume_premium_reserved_micros = 0
+                except Exception:
+                    logging.getLogger(__name__).warning(
+                        "Failed to release premium quota for user %s (resume)", user_id
+                    )
+
             try:
                 await session.rollback()
                 await clear_ai_responding(session, chat_id)
@@ -1780,6 +3243,74 @@ async def stream_resume_chat(
             with contextlib.suppress(Exception):
                 await session.close()
 
+            # Server-side assistant-message + token_usage finalization for
+            # the resume flow. The original user message was persisted by
+            # the original (interrupted) ``stream_new_chat`` invocation;
+            # the resume's own ``persist_assistant_shell`` write lives at
+            # the new ``turn_id`` above. This finalize updates that row
+            # with the rich ContentPart[] from the builder and writes
+            # token_usage idempotently via migration 142's partial
+            # unique index. Best-effort, never raises.
+            if (
+                stream_result
+                and stream_result.turn_id
+                and stream_result.assistant_message_id
+            ):
+                from app.tasks.chat.persistence import finalize_assistant_turn
+
+                builder_stats: dict[str, int] | None = None
+                if stream_result.content_builder is not None:
+                    stream_result.content_builder.mark_interrupted()
+                    builder_stats = stream_result.content_builder.stats()
+                    content_payload = stream_result.content_builder.snapshot()
+                else:
+                    content_payload = [
+                        {
+                            "type": "text",
+                            "text": stream_result.accumulated_text or "",
+                        }
+                    ]
+
+                if builder_stats is not None:
+                    _perf_log.info(
+                        "[stream_resume] finalize_payload chat_id=%s "
+                        "message_id=%s parts=%d bytes=%d text=%d "
+                        "reasoning=%d tool_calls=%d "
+                        "tool_calls_completed=%d tool_calls_aborted=%d "
+                        "thinking_step_parts=%d step_separators=%d",
+                        chat_id,
+                        stream_result.assistant_message_id,
+                        builder_stats["parts"],
+                        builder_stats["bytes"],
+                        builder_stats["text"],
+                        builder_stats["reasoning"],
+                        builder_stats["tool_calls"],
+                        builder_stats["tool_calls_completed"],
+                        builder_stats["tool_calls_aborted"],
+                        builder_stats["thinking_step_parts"],
+                        builder_stats["step_separators"],
+                    )
+
+                await finalize_assistant_turn(
+                    message_id=stream_result.assistant_message_id,
+                    chat_id=chat_id,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    turn_id=stream_result.turn_id,
+                    content=content_payload,
+                    accumulator=accumulator,
+                )
+
+        # Release the lock from the original interrupted turn or any
+        # re-interrupt/bailout. Skip on ``BusyError`` (lock not held here).
+        if not _busy_error_raised:
+            with contextlib.suppress(Exception):
+                end_turn(str(chat_id))
+                _perf_log.info(
+                    "[stream_resume] end_turn cleanup (chat_id=%s)",
+                    chat_id,
+                )
+
         agent = llm = connector_service = None
         stream_result = None
         session = None
diff --git a/surfsense_backend/app/tasks/chat/streaming/__init__.py b/surfsense_backend/app/tasks/chat/streaming/__init__.py
new file mode 100644
index 0000000000..70c99342ae
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/__init__.py
@@ -0,0 +1,3 @@
+"""Chat streaming helpers (e.g. LangGraph → SSE relay under ``graph_stream``)."""
+
+from __future__ import annotations
diff --git a/surfsense_backend/app/tasks/chat/streaming/errors/__init__.py b/surfsense_backend/app/tasks/chat/streaming/errors/__init__.py
new file mode 100644
index 0000000000..02284d4b07
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/errors/__init__.py
@@ -0,0 +1,3 @@
+"""Error classification, structured logging, and terminal-error SSE emission."""
+
+from __future__ import annotations
diff --git a/surfsense_backend/app/tasks/chat/streaming/errors/classifier.py b/surfsense_backend/app/tasks/chat/streaming/errors/classifier.py
new file mode 100644
index 0000000000..3af2b9f9f8
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/errors/classifier.py
@@ -0,0 +1,187 @@
+"""Classify stream exceptions for logging and client error payloads."""
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+from typing import Any, Literal
+
+from app.agents.new_chat.errors import BusyError
+from app.agents.new_chat.middleware.busy_mutex import (
+    get_cancel_state,
+    is_cancel_requested,
+)
+
+TURN_CANCELLING_INITIAL_DELAY_MS = 200
+TURN_CANCELLING_BACKOFF_FACTOR = 2
+TURN_CANCELLING_MAX_DELAY_MS = 1500
+
+
+def compute_turn_cancelling_retry_delay(attempt: int) -> int:
+    if attempt < 1:
+        attempt = 1
+    delay = TURN_CANCELLING_INITIAL_DELAY_MS * (
+        TURN_CANCELLING_BACKOFF_FACTOR ** (attempt - 1)
+    )
+    return min(delay, TURN_CANCELLING_MAX_DELAY_MS)
+
+
+def log_chat_stream_error(
+    *,
+    flow: Literal["new", "resume", "regenerate"],
+    error_kind: str,
+    error_code: str | None,
+    severity: Literal["info", "warn", "error"],
+    is_expected: bool,
+    request_id: str | None,
+    thread_id: int | None,
+    search_space_id: int | None,
+    user_id: str | None,
+    message: str,
+    extra: dict[str, Any] | None = None,
+) -> None:
+    payload: dict[str, Any] = {
+        "event": "chat_stream_error",
+        "flow": flow,
+        "error_kind": error_kind,
+        "error_code": error_code,
+        "severity": severity,
+        "is_expected": is_expected,
+        "request_id": request_id or "unknown",
+        "thread_id": thread_id,
+        "search_space_id": search_space_id,
+        "user_id": user_id,
+        "message": message,
+    }
+    if extra:
+        payload.update(extra)
+
+    logger = logging.getLogger(__name__)
+    rendered = json.dumps(payload, ensure_ascii=False)
+    if severity == "error":
+        logger.error("[chat_stream_error] %s", rendered)
+    elif severity == "warn":
+        logger.warning("[chat_stream_error] %s", rendered)
+    else:
+        logger.info("[chat_stream_error] %s", rendered)
+
+
+def _parse_error_payload(message: str) -> dict[str, Any] | None:
+    candidates = [message]
+    first_brace_idx = message.find("{")
+    if first_brace_idx >= 0:
+        candidates.append(message[first_brace_idx:])
+
+    for candidate in candidates:
+        try:
+            parsed = json.loads(candidate)
+            if isinstance(parsed, dict):
+                return parsed
+        except Exception:
+            continue
+    return None
+
+
+def _extract_provider_error_code(parsed: dict[str, Any] | None) -> int | None:
+    if not isinstance(parsed, dict):
+        return None
+    candidates: list[Any] = [parsed.get("code")]
+    nested = parsed.get("error")
+    if isinstance(nested, dict):
+        candidates.append(nested.get("code"))
+    for value in candidates:
+        try:
+            if value is None:
+                continue
+            return int(value)
+        except Exception:
+            continue
+    return None
+
+
+def is_provider_rate_limited(exc: BaseException) -> bool:
+    """Return True if the exception looks like an upstream HTTP 429 / rate limit."""
+    raw = str(exc)
+    lowered = raw.lower()
+    if "ratelimit" in type(exc).__name__.lower():
+        return True
+    parsed = _parse_error_payload(raw)
+    provider_code = _extract_provider_error_code(parsed)
+    if provider_code == 429:
+        return True
+
+    provider_error_type = ""
+    if parsed:
+        top_type = parsed.get("type")
+        if isinstance(top_type, str):
+            provider_error_type = top_type.lower()
+        nested = parsed.get("error")
+        if isinstance(nested, dict):
+            nested_type = nested.get("type")
+            if isinstance(nested_type, str):
+                provider_error_type = nested_type.lower()
+    if provider_error_type == "rate_limit_error":
+        return True
+
+    return (
+        "rate limited" in lowered
+        or "rate-limited" in lowered
+        or "temporarily rate-limited upstream" in lowered
+    )
+
+
+def classify_stream_exception(
+    exc: Exception,
+    *,
+    flow_label: str,
+) -> tuple[
+    str, str, Literal["info", "warn", "error"], bool, str, dict[str, Any] | None
+]:
+    """Return kind, code, severity, expected flag, message, and optional extra dict."""
+    raw = str(exc)
+    if isinstance(exc, BusyError) or "Thread is busy with another request" in raw:
+        busy_thread_id = str(exc.request_id) if isinstance(exc, BusyError) else None
+        if busy_thread_id and is_cancel_requested(busy_thread_id):
+            cancel_state = get_cancel_state(busy_thread_id)
+            attempt = cancel_state[0] if cancel_state else 1
+            retry_after_ms = compute_turn_cancelling_retry_delay(attempt)
+            retry_after_at = int(time.time() * 1000) + retry_after_ms
+            return (
+                "thread_busy",
+                "TURN_CANCELLING",
+                "info",
+                True,
+                "A previous response is still stopping. Please try again in a moment.",
+                {
+                    "retry_after_ms": retry_after_ms,
+                    "retry_after_at": retry_after_at,
+                },
+            )
+        return (
+            "thread_busy",
+            "THREAD_BUSY",
+            "warn",
+            True,
+            "Another response is still finishing for this thread. Please try again in a moment.",
+            None,
+        )
+
+    if is_provider_rate_limited(exc):
+        return (
+            "rate_limited",
+            "RATE_LIMITED",
+            "warn",
+            True,
+            "This model is temporarily rate-limited. Please try again in a few seconds or switch models.",
+            None,
+        )
+
+    return (
+        "server_error",
+        "SERVER_ERROR",
+        "error",
+        False,
+        f"Error during {flow_label}: {raw}",
+        None,
+    )
diff --git a/surfsense_backend/app/tasks/chat/streaming/errors/emitter.py b/surfsense_backend/app/tasks/chat/streaming/errors/emitter.py
new file mode 100644
index 0000000000..95806ab87f
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/errors/emitter.py
@@ -0,0 +1,38 @@
+"""Emit one terminal error SSE frame and log via the stream error classifier."""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from .classifier import log_chat_stream_error
+
+
+def emit_stream_terminal_error(
+    *,
+    streaming_service: Any,
+    flow: Literal["new", "resume", "regenerate"],
+    request_id: str | None,
+    thread_id: int,
+    search_space_id: int,
+    user_id: str | None,
+    message: str,
+    error_kind: str = "server_error",
+    error_code: str = "SERVER_ERROR",
+    severity: Literal["info", "warn", "error"] = "error",
+    is_expected: bool = False,
+    extra: dict[str, Any] | None = None,
+) -> str:
+    log_chat_stream_error(
+        flow=flow,
+        error_kind=error_kind,
+        error_code=error_code,
+        severity=severity,
+        is_expected=is_expected,
+        request_id=request_id,
+        thread_id=thread_id,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        message=message,
+        extra=extra,
+    )
+    return streaming_service.format_error(message, error_code=error_code, extra=extra)
diff --git a/surfsense_backend/app/tasks/chat/streaming/graph_stream/__init__.py b/surfsense_backend/app/tasks/chat/streaming/graph_stream/__init__.py
new file mode 100644
index 0000000000..e3bf0426cb
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/graph_stream/__init__.py
@@ -0,0 +1,21 @@
+"""LangGraph ``astream_events`` → SSE (``stream_output`` + ``StreamingResult``).
+
+Imports are lazy to avoid a circular import with ``relay.event_relay``.
+"""
+
+from __future__ import annotations
+
+__all__ = ["StreamingResult", "stream_output"]
+
+
+def __getattr__(name: str):
+    if name == "stream_output":
+        from app.tasks.chat.streaming.graph_stream.event_stream import stream_output
+
+        return stream_output
+    if name == "StreamingResult":
+        from app.tasks.chat.streaming.graph_stream.result import StreamingResult
+
+        return StreamingResult
+    msg = f"module {__name__!r} has no attribute {name!r}"
+    raise AttributeError(msg)
diff --git a/surfsense_backend/app/tasks/chat/streaming/graph_stream/event_stream.py b/surfsense_backend/app/tasks/chat/streaming/graph_stream/event_stream.py
new file mode 100644
index 0000000000..9a309f9d75
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/graph_stream/event_stream.py
@@ -0,0 +1,51 @@
+"""Run LangGraph event streams through ``EventRelay``."""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from typing import Any
+
+from app.tasks.chat.streaming.graph_stream.result import StreamingResult
+from app.tasks.chat.streaming.relay.event_relay import EventRelay
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+
+
+async def stream_output(
+    *,
+    agent: Any,
+    config: dict[str, Any],
+    input_data: Any,
+    streaming_service: Any,
+    result: StreamingResult,
+    step_prefix: str = "thinking",
+    initial_step_id: str | None = None,
+    initial_step_title: str = "",
+    initial_step_items: list[str] | None = None,
+    content_builder: Any | None = None,
+    runtime_context: Any = None,
+) -> AsyncIterator[str]:
+    """Yield SSE frames from agent ``astream_events`` via ``EventRelay``."""
+    state = AgentEventRelayState.for_invocation(
+        initial_step_id=initial_step_id,
+        initial_step_title=initial_step_title,
+        initial_step_items=initial_step_items,
+    )
+
+    astream_kwargs: dict[str, Any] = {"config": config, "version": "v2"}
+    if runtime_context is not None:
+        astream_kwargs["context"] = runtime_context
+
+    events = agent.astream_events(input_data, **astream_kwargs)
+    relay = EventRelay(streaming_service=streaming_service)
+    async for frame in relay.relay(
+        events,
+        state=state,
+        result=result,
+        step_prefix=step_prefix,
+        content_builder=content_builder,
+        config=config,
+    ):
+        yield frame
+
+    result.accumulated_text = state.accumulated_text
+    result.agent_called_update_memory = state.called_update_memory
diff --git a/surfsense_backend/app/tasks/chat/streaming/graph_stream/result.py b/surfsense_backend/app/tasks/chat/streaming/graph_stream/result.py
new file mode 100644
index 0000000000..391f14f24d
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/graph_stream/result.py
@@ -0,0 +1,27 @@
+"""Mutable facts collected while relaying one agent stream (``stream_output``)."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class StreamingResult:
+    accumulated_text: str = ""
+    is_interrupted: bool = False
+    sandbox_files: list[str] = field(default_factory=list)
+    agent_called_update_memory: bool = False
+    request_id: str | None = None
+    turn_id: str = ""
+    filesystem_mode: str = "cloud"
+    client_platform: str = "web"
+    intent_detected: str = "chat_only"
+    intent_confidence: float = 0.0
+    write_attempted: bool = False
+    write_succeeded: bool = False
+    verification_succeeded: bool = False
+    commit_gate_passed: bool = True
+    commit_gate_reason: str = ""
+    assistant_message_id: int | None = None
+    content_builder: Any | None = field(default=None, repr=False)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/__init__.py
new file mode 100644
index 0000000000..3e21659321
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/__init__.py
@@ -0,0 +1,3 @@
+"""LangGraph stream handlers by event kind."""
+
+from __future__ import annotations
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/chain_end.py b/surfsense_backend/app/tasks/chat/streaming/handlers/chain_end.py
new file mode 100644
index 0000000000..c61058ac7b
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/chain_end.py
@@ -0,0 +1,23 @@
+"""Close open text when a LangGraph chain or agent node finishes."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from typing import Any
+
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+
+
+def iter_chain_end_frames(
+    _event: dict[str, Any],
+    *,
+    state: AgentEventRelayState,
+    streaming_service: Any,
+    content_builder: Any | None,
+) -> Iterator[str]:
+    """Close the open text stream if one is open."""
+    if state.current_text_id is not None:
+        yield streaming_service.format_text_end(state.current_text_id)
+        if content_builder is not None:
+            content_builder.on_text_end(state.current_text_id)
+        state.current_text_id = None
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/chat_model_stream.py b/surfsense_backend/app/tasks/chat/streaming/handlers/chat_model_stream.py
new file mode 100644
index 0000000000..c3f6d6d595
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/chat_model_stream.py
@@ -0,0 +1,159 @@
+"""Chat model stream: text, reasoning, and tool-call chunk SSE."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from typing import Any
+
+from app.tasks.chat.streaming.helpers.chunk_parts import extract_chunk_parts
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+from app.tasks.chat.streaming.relay.task_span import ensure_pending_task_span_for_lc
+from app.tasks.chat.streaming.relay.thinking_step_completion import (
+    complete_active_thinking_step,
+)
+
+
+def iter_chat_model_stream_frames(
+    event: dict[str, Any],
+    *,
+    state: AgentEventRelayState,
+    streaming_service: Any,
+    content_builder: Any | None,
+    step_prefix: str,
+) -> Iterator[str]:
+    """SSE frames for one chat-model chunk."""
+    if state.active_tool_depth > 0:
+        return
+    if "surfsense:internal" in event.get("tags", []):
+        return
+    chunk = event.get("data", {}).get("chunk")
+    if not chunk:
+        return
+    parts = extract_chunk_parts(chunk)
+
+    reasoning_delta = parts["reasoning"]
+    text_delta = parts["text"]
+
+    if reasoning_delta:
+        if state.current_text_id is not None:
+            yield streaming_service.format_text_end(state.current_text_id)
+            if content_builder is not None:
+                content_builder.on_text_end(state.current_text_id)
+            state.current_text_id = None
+        if state.current_reasoning_id is None:
+            comp, new_active = complete_active_thinking_step(
+                state=state,
+                streaming_service=streaming_service,
+                content_builder=content_builder,
+                last_active_step_id=state.last_active_step_id,
+                last_active_step_title=state.last_active_step_title,
+                last_active_step_items=state.last_active_step_items,
+                completed_step_ids=state.completed_step_ids,
+            )
+            if comp:
+                yield comp
+            state.last_active_step_id = new_active
+            if state.just_finished_tool:
+                state.last_active_step_id = None
+                state.last_active_step_title = ""
+                state.last_active_step_items = []
+                state.just_finished_tool = False
+            state.current_reasoning_id = streaming_service.generate_reasoning_id()
+            yield streaming_service.format_reasoning_start(state.current_reasoning_id)
+            if content_builder is not None:
+                content_builder.on_reasoning_start(state.current_reasoning_id)
+        yield streaming_service.format_reasoning_delta(
+            state.current_reasoning_id, reasoning_delta
+        )
+        if content_builder is not None:
+            content_builder.on_reasoning_delta(
+                state.current_reasoning_id, reasoning_delta
+            )
+
+    if text_delta:
+        if state.current_reasoning_id is not None:
+            yield streaming_service.format_reasoning_end(state.current_reasoning_id)
+            if content_builder is not None:
+                content_builder.on_reasoning_end(state.current_reasoning_id)
+            state.current_reasoning_id = None
+        if state.current_text_id is None:
+            comp, new_active = complete_active_thinking_step(
+                state=state,
+                streaming_service=streaming_service,
+                content_builder=content_builder,
+                last_active_step_id=state.last_active_step_id,
+                last_active_step_title=state.last_active_step_title,
+                last_active_step_items=state.last_active_step_items,
+                completed_step_ids=state.completed_step_ids,
+            )
+            if comp:
+                yield comp
+            state.last_active_step_id = new_active
+            if state.just_finished_tool:
+                state.last_active_step_id = None
+                state.last_active_step_title = ""
+                state.last_active_step_items = []
+                state.just_finished_tool = False
+            state.current_text_id = streaming_service.generate_text_id()
+            yield streaming_service.format_text_start(state.current_text_id)
+            if content_builder is not None:
+                content_builder.on_text_start(state.current_text_id)
+        yield streaming_service.format_text_delta(state.current_text_id, text_delta)
+        state.accumulated_text += text_delta
+        if content_builder is not None:
+            content_builder.on_text_delta(state.current_text_id, text_delta)
+
+    if parts["tool_call_chunks"]:
+        for tcc in parts["tool_call_chunks"]:
+            idx = tcc.get("index")
+
+            if idx is not None and idx not in state.index_to_meta:
+                lc_id = tcc.get("id")
+                name = tcc.get("name")
+                if lc_id and name:
+                    ui_id = lc_id
+                    tool_input_metadata: dict[str, Any] | None = None
+                    if name == "task":
+                        sid = ensure_pending_task_span_for_lc(state, str(lc_id))
+                        tool_input_metadata = {"spanId": sid}
+
+                    if state.current_text_id is not None:
+                        yield streaming_service.format_text_end(state.current_text_id)
+                        if content_builder is not None:
+                            content_builder.on_text_end(state.current_text_id)
+                        state.current_text_id = None
+                    if state.current_reasoning_id is not None:
+                        yield streaming_service.format_reasoning_end(
+                            state.current_reasoning_id
+                        )
+                        if content_builder is not None:
+                            content_builder.on_reasoning_end(state.current_reasoning_id)
+                        state.current_reasoning_id = None
+
+                    state.index_to_meta[idx] = {
+                        "ui_id": ui_id,
+                        "lc_id": lc_id,
+                        "name": name,
+                    }
+                    yield streaming_service.format_tool_input_start(
+                        ui_id,
+                        name,
+                        langchain_tool_call_id=lc_id,
+                        metadata=tool_input_metadata,
+                    )
+                    if content_builder is not None:
+                        content_builder.on_tool_input_start(
+                            ui_id, name, lc_id, metadata=tool_input_metadata
+                        )
+
+            meta = state.index_to_meta.get(idx) if idx is not None else None
+            if meta:
+                args_chunk = tcc.get("args") or ""
+                if args_chunk:
+                    yield streaming_service.format_tool_input_delta(
+                        meta["ui_id"], args_chunk
+                    )
+                    if content_builder is not None:
+                        content_builder.on_tool_input_delta(meta["ui_id"], args_chunk)
+            else:
+                state.pending_tool_call_chunks.append(tcc)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/custom_event_dispatch.py b/surfsense_backend/app/tasks/chat/streaming/handlers/custom_event_dispatch.py
new file mode 100644
index 0000000000..69f4b8a249
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/custom_event_dispatch.py
@@ -0,0 +1,57 @@
+"""Custom graph events routed to SSE (documents, action logs, report progress)."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.custom_events import (
+    handle_action_log,
+    handle_action_log_updated,
+    handle_document_created,
+    handle_report_progress,
+)
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+
+
+def iter_custom_event_frames(
+    event: dict[str, Any],
+    *,
+    state: AgentEventRelayState,
+    streaming_service: Any,
+    content_builder: Any | None,
+) -> Iterator[str]:
+    """Yield any SSE produced by ad-hoc graph events (documents, action logs, report progress)."""
+    name = event.get("name")
+    data = event.get("data", {})
+
+    if name == "report_progress":
+        frame, state.last_active_step_items = handle_report_progress(
+            data,
+            last_active_step_id=state.last_active_step_id,
+            last_active_step_title=state.last_active_step_title,
+            last_active_step_items=state.last_active_step_items,
+            streaming_service=streaming_service,
+            content_builder=content_builder,
+            thinking_metadata=state.span_metadata_if_active(),
+        )
+        if frame:
+            yield frame
+        return
+
+    if name == "document_created":
+        frame = handle_document_created(data, streaming_service=streaming_service)
+        if frame:
+            yield frame
+        return
+
+    if name == "action_log":
+        frame = handle_action_log(data, streaming_service=streaming_service)
+        if frame:
+            yield frame
+        return
+
+    if name == "action_log_updated":
+        frame = handle_action_log_updated(data, streaming_service=streaming_service)
+        if frame:
+            yield frame
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/custom_events.py b/surfsense_backend/app/tasks/chat/streaming/handlers/custom_events.py
new file mode 100644
index 0000000000..81116e2051
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/custom_events.py
@@ -0,0 +1,79 @@
+"""Custom-event payloads turned into SSE (no model/tool stream handling)."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.relay.thinking_step_sse import emit_thinking_step_frame
+
+
+def handle_report_progress(
+    data: dict[str, Any],
+    *,
+    last_active_step_id: str | None,
+    last_active_step_title: str,
+    last_active_step_items: list[str],
+    streaming_service: Any,
+    content_builder: Any | None,
+    thinking_metadata: dict[str, Any] | None = None,
+) -> tuple[str | None, list[str]]:
+    """Update report step items; may emit one thinking SSE frame.
+
+    Returns (frame or None, items list after update).
+    """
+    message = data.get("message", "")
+    if not message or not last_active_step_id:
+        return None, last_active_step_items
+
+    phase = data.get("phase", "")
+    topic_items = [item for item in last_active_step_items if item.startswith("Topic:")]
+
+    if phase in ("revising_section", "adding_section"):
+        plan_items = [
+            item
+            for item in last_active_step_items
+            if item.startswith("Topic:")
+            or item.startswith("Modifying ")
+            or item.startswith("Adding ")
+            or item.startswith("Removing ")
+        ]
+        plan_items = [item for item in plan_items if not item.endswith("...")]
+        new_items = [*plan_items, message]
+    else:
+        new_items = [*topic_items, message]
+
+    frame = emit_thinking_step_frame(
+        streaming_service=streaming_service,
+        content_builder=content_builder,
+        step_id=last_active_step_id,
+        title=last_active_step_title,
+        status="in_progress",
+        items=new_items,
+        metadata=thinking_metadata,
+    )
+    return frame, new_items
+
+
+def handle_document_created(
+    data: dict[str, Any], *, streaming_service: Any
+) -> str | None:
+    if not data.get("id"):
+        return None
+    return streaming_service.format_data(
+        "documents-updated",
+        {"action": "created", "document": data},
+    )
+
+
+def handle_action_log(data: dict[str, Any], *, streaming_service: Any) -> str | None:
+    if data.get("id") is None:
+        return None
+    return streaming_service.format_data("action-log", data)
+
+
+def handle_action_log_updated(
+    data: dict[str, Any], *, streaming_service: Any
+) -> str | None:
+    if data.get("id") is None:
+        return None
+    return streaming_service.format_data("action-log-updated", data)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tool_end.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_end.py
new file mode 100644
index 0000000000..57ab617c57
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_end.py
@@ -0,0 +1,119 @@
+"""Tool end: thinking completion, tool output, and terminal SSE."""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterator
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools import (
+    ToolCompletionEmissionContext,
+    iter_tool_completion_emission_frames,
+    resolve_tool_completed_thinking_step,
+)
+from app.tasks.chat.streaming.helpers.tool_output import tool_output_has_error
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+from app.tasks.chat.streaming.relay.task_span import (
+    clear_task_span_if_delegating_task_ended,
+)
+from app.tasks.chat.streaming.relay.thinking_step_sse import emit_thinking_step_frame
+
+
+def iter_tool_end_frames(
+    event: dict[str, Any],
+    *,
+    state: AgentEventRelayState,
+    streaming_service: Any,
+    content_builder: Any | None,
+    result: Any,
+    step_prefix: str,
+    config: dict[str, Any],
+) -> Iterator[str]:
+    """SSE frames when one tool run finishes."""
+    state.active_tool_depth = max(0, state.active_tool_depth - 1)
+    run_id = event.get("run_id", "")
+    tool_name = event.get("name", "unknown_tool")
+    raw_output = event.get("data", {}).get("output", "")
+    staged_file_path = state.file_path_by_run.pop(run_id, None) if run_id else None
+
+    if tool_name == "update_memory":
+        state.called_update_memory = True
+
+    if hasattr(raw_output, "content"):
+        content = raw_output.content
+        if isinstance(content, str):
+            try:
+                tool_output = json.loads(content)
+            except (json.JSONDecodeError, TypeError):
+                tool_output = {"result": content}
+        elif isinstance(content, dict):
+            tool_output = content
+        else:
+            tool_output = {"result": str(content)}
+    elif isinstance(raw_output, dict):
+        tool_output = raw_output
+    else:
+        tool_output = {"result": str(raw_output) if raw_output else "completed"}
+
+    if tool_name in ("write_file", "edit_file"):
+        if tool_output_has_error(tool_output):
+            pass
+        else:
+            result.write_succeeded = True
+            result.verification_succeeded = True
+
+    tool_call_id = state.ui_tool_call_id_by_run.get(
+        run_id,
+        f"call_{run_id[:32]}" if run_id else "call_unknown",
+    )
+    original_step_id = state.tool_step_ids.get(
+        run_id, f"{step_prefix}-unknown-{run_id[:8]}"
+    )
+    state.completed_step_ids.add(original_step_id)
+
+    holder = state.current_lc_tool_call_id
+    holder["value"] = None
+    authoritative = getattr(raw_output, "tool_call_id", None)
+    if isinstance(authoritative, str) and authoritative:
+        holder["value"] = authoritative
+        if run_id:
+            state.lc_tool_call_id_by_run[run_id] = authoritative
+    elif run_id and run_id in state.lc_tool_call_id_by_run:
+        holder["value"] = state.lc_tool_call_id_by_run[run_id]
+
+    items = state.last_active_step_items
+    title, completed_items = resolve_tool_completed_thinking_step(
+        tool_name, tool_output, items
+    )
+    yield emit_thinking_step_frame(
+        streaming_service=streaming_service,
+        content_builder=content_builder,
+        step_id=original_step_id,
+        title=title,
+        status="completed",
+        items=completed_items,
+        metadata=state.span_metadata_if_active(),
+    )
+
+    state.just_finished_tool = True
+    state.last_active_step_id = None
+    state.last_active_step_title = ""
+    state.last_active_step_items = []
+
+    emission_ctx = ToolCompletionEmissionContext(
+        tool_name=tool_name,
+        tool_call_id=tool_call_id,
+        tool_output=tool_output,
+        streaming_service=streaming_service,
+        content_builder=content_builder,
+        langchain_tool_call_id_holder=holder,
+        stream_result=result,
+        langgraph_config=config,
+        staged_workspace_file_path=staged_file_path,
+        tool_metadata=state.tool_activity_metadata(
+            thinking_step_id=original_step_id,
+        ),
+    )
+    yield from iter_tool_completion_emission_frames(emission_ctx)
+
+    clear_task_span_if_delegating_task_ended(state, tool_name=tool_name, run_id=run_id)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tool_output_frame.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_output_frame.py
new file mode 100644
index 0000000000..4cd8e3274a
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_output_frame.py
@@ -0,0 +1,29 @@
+"""Emit tool-output SSE and optional assistant content updates."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def emit_tool_output_available_frame(
+    *,
+    streaming_service: Any,
+    content_builder: Any | None,
+    langchain_id_holder: dict[str, str | None],
+    call_id: str,
+    output: Any,
+    tool_metadata: dict[str, Any] | None = None,
+) -> str:
+    if content_builder is not None:
+        content_builder.on_tool_output_available(
+            call_id,
+            output,
+            langchain_id_holder["value"],
+            metadata=tool_metadata,
+        )
+    return streaming_service.format_tool_output_available(
+        call_id,
+        output,
+        langchain_tool_call_id=langchain_id_holder["value"],
+        metadata=tool_metadata,
+    )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tool_start.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_start.py
new file mode 100644
index 0000000000..e0cac307cc
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_start.py
@@ -0,0 +1,161 @@
+"""Tool start: thinking-step and tool-input SSE."""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterator
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools import resolve_tool_start_thinking
+from app.tasks.chat.streaming.helpers.tool_call_matching import (
+    match_buffered_langchain_tool_call_id,
+)
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+from app.tasks.chat.streaming.relay.task_span import open_task_span
+from app.tasks.chat.streaming.relay.thinking_step_completion import (
+    complete_active_thinking_step,
+)
+from app.tasks.chat.streaming.relay.thinking_step_sse import emit_thinking_step_frame
+
+
+def iter_tool_start_frames(
+    event: dict[str, Any],
+    *,
+    state: AgentEventRelayState,
+    streaming_service: Any,
+    content_builder: Any | None,
+    result: Any,
+    step_prefix: str,
+) -> Iterator[str]:
+    """SSE frames for the start of one tool run."""
+    state.active_tool_depth += 1
+    tool_name = event.get("name", "unknown_tool")
+    run_id = event.get("run_id", "")
+    tool_input = event.get("data", {}).get("input", {})
+    if tool_name in ("write_file", "edit_file"):
+        result.write_attempted = True
+        if isinstance(tool_input, dict):
+            file_path = tool_input.get("file_path")
+            if isinstance(file_path, str) and file_path.strip() and run_id:
+                state.file_path_by_run[run_id] = file_path.strip()
+
+    if state.current_text_id is not None:
+        yield streaming_service.format_text_end(state.current_text_id)
+        if content_builder is not None:
+            content_builder.on_text_end(state.current_text_id)
+        state.current_text_id = None
+
+    if state.last_active_step_title != "Synthesizing response":
+        comp, new_active = complete_active_thinking_step(
+            state=state,
+            streaming_service=streaming_service,
+            content_builder=content_builder,
+            last_active_step_id=state.last_active_step_id,
+            last_active_step_title=state.last_active_step_title,
+            last_active_step_items=state.last_active_step_items,
+            completed_step_ids=state.completed_step_ids,
+        )
+        if comp:
+            yield comp
+        state.last_active_step_id = new_active
+
+    state.just_finished_tool = False
+    tool_step_id = state.next_thinking_step_id(step_prefix)
+    state.tool_step_ids[run_id] = tool_step_id
+    state.last_active_step_id = tool_step_id
+
+    matched_meta: dict[str, str] | None = None
+    taken_ui_ids = set(state.ui_tool_call_id_by_run.values())
+    for meta in state.index_to_meta.values():
+        if meta["name"] == tool_name and meta["ui_id"] not in taken_ui_ids:
+            matched_meta = meta
+            break
+
+    tool_call_id: str
+    langchain_tool_call_id: str | None = None
+    if matched_meta is not None:
+        tool_call_id = matched_meta["ui_id"]
+        langchain_tool_call_id = matched_meta["lc_id"]
+        if run_id:
+            state.lc_tool_call_id_by_run[run_id] = matched_meta["lc_id"]
+    else:
+        tool_call_id = (
+            f"call_{run_id[:32]}"
+            if run_id
+            else streaming_service.generate_tool_call_id()
+        )
+        langchain_tool_call_id = match_buffered_langchain_tool_call_id(
+            state.pending_tool_call_chunks,
+            tool_name,
+            run_id,
+            state.lc_tool_call_id_by_run,
+        )
+
+    if tool_name == "task":
+        open_task_span(
+            state,
+            run_id=run_id,
+            langchain_tool_call_id=langchain_tool_call_id,
+        )
+
+    span_md = state.span_metadata_if_active()
+    tool_md = state.tool_activity_metadata(thinking_step_id=tool_step_id)
+
+    if matched_meta is None:
+        yield streaming_service.format_tool_input_start(
+            tool_call_id,
+            tool_name,
+            langchain_tool_call_id=langchain_tool_call_id,
+            metadata=tool_md,
+        )
+        if content_builder is not None:
+            content_builder.on_tool_input_start(
+                tool_call_id,
+                tool_name,
+                langchain_tool_call_id,
+                metadata=tool_md,
+            )
+
+    thinking = resolve_tool_start_thinking(tool_name, tool_input)
+    state.last_active_step_title = thinking.title
+    state.last_active_step_items = thinking.items
+    frame_kw: dict[str, Any] = {
+        "streaming_service": streaming_service,
+        "content_builder": content_builder,
+        "step_id": tool_step_id,
+        "title": thinking.title,
+        "status": "in_progress",
+        "metadata": span_md,
+    }
+    if thinking.include_items_on_frame:
+        frame_kw["items"] = thinking.items
+    yield emit_thinking_step_frame(**frame_kw)
+
+    if run_id:
+        state.ui_tool_call_id_by_run[run_id] = tool_call_id
+
+    if isinstance(tool_input, dict):
+        _safe_input: dict[str, Any] = {}
+        for _k, _v in tool_input.items():
+            try:
+                json.dumps(_v)
+                _safe_input[_k] = _v
+            except (TypeError, ValueError, OverflowError):
+                pass
+    else:
+        _safe_input = {"input": tool_input}
+    yield streaming_service.format_tool_input_available(
+        tool_call_id,
+        tool_name,
+        _safe_input,
+        langchain_tool_call_id=langchain_tool_call_id,
+        metadata=tool_md,
+    )
+    if content_builder is not None:
+        content_builder.on_tool_input_available(
+            tool_call_id,
+            tool_name,
+            _safe_input,
+            langchain_tool_call_id,
+            metadata=tool_md,
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/__init__.py
new file mode 100644
index 0000000000..4b191c100f
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/__init__.py
@@ -0,0 +1,23 @@
+"""Per-tool streaming: thinking-step and completion emission."""
+
+from __future__ import annotations
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+from app.tasks.chat.streaming.handlers.tools.registry import (
+    iter_tool_completion_emission_frames,
+    resolve_tool_completed_thinking_step,
+    resolve_tool_start_thinking,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+__all__ = [
+    "ToolCompletionEmissionContext",
+    "ToolStartThinking",
+    "iter_tool_completion_emission_frames",
+    "resolve_tool_completed_thinking_step",
+    "resolve_tool_start_thinking",
+]
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/emission.py
new file mode 100644
index 0000000000..8e19dc224c
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/emission.py
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    payload = out if isinstance(out, dict) else {"result": out}
+    yield ctx.emit_tool_output_card(payload)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/thinking.py
new file mode 100644
index 0000000000..2eed8855ae
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/thinking.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.default import (
+    thinking as default_thinking,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    return default_thinking.resolve_start_thinking(tool_name, tool_input)
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    return default_thinking.resolve_completed_thinking(
+        tool_name, tool_output, last_items
+    )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/tool_names.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/tool_names.py
new file mode 100644
index 0000000000..ab698b32db
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/connector/shared/tool_names.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+SHARED_CONNECTOR_TOOLS: frozenset[str] = frozenset(
+    {
+        "create_calendar_event",
+        "create_confluence_page",
+        "create_dropbox_file",
+        "create_gmail_draft",
+        "create_google_drive_file",
+        "create_jira_issue",
+        "create_linear_issue",
+        "create_notion_page",
+        "create_onedrive_file",
+        "delete_calendar_event",
+        "delete_confluence_page",
+        "delete_dropbox_file",
+        "delete_google_drive_file",
+        "delete_jira_issue",
+        "delete_linear_issue",
+        "delete_notion_page",
+        "delete_onedrive_file",
+        "send_gmail_email",
+        "trash_gmail_email",
+        "update_calendar_event",
+        "update_confluence_page",
+        "update_gmail_draft",
+        "update_jira_issue",
+        "update_linear_issue",
+        "update_notion_page",
+    }
+)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/__init__.py
new file mode 100644
index 0000000000..5e84a37f4f
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/__init__.py
@@ -0,0 +1,3 @@
+"""Fallback tool package."""
+
+from __future__ import annotations
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/emission.py
new file mode 100644
index 0000000000..e24c619a74
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/emission.py
@@ -0,0 +1,24 @@
+"""Default tool-output card and a short completion terminal line."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    yield ctx.emit_tool_output_card(
+        {
+            "status": "completed",
+            "result_length": len(str(ctx.tool_output)),
+        },
+    )
+    yield ctx.streaming_service.format_terminal_info(
+        f"Tool {ctx.tool_name} completed",
+        "success",
+    )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/thinking.py
new file mode 100644
index 0000000000..46d15a4e7f
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/default/thinking.py
@@ -0,0 +1,23 @@
+"""Fallback thinking-step copy for unknown tools and connectors without custom UI."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_input
+    title = tool_name.replace("_", " ").strip().capitalize() or tool_name
+    return ToolStartThinking(title=title, items=[], include_items_on_frame=False)
+
+
+def resolve_completed_thinking(
+    tool_name: str, tool_output: Any, last_items: list[str]
+) -> tuple[str, list[str]]:
+    del tool_output
+    title = tool_name.replace("_", " ").strip().capitalize() or tool_name
+    return (title, last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/emission.py
new file mode 100644
index 0000000000..762f75cca6
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/emission.py
@@ -0,0 +1,28 @@
+"""generate_image: tool card + terminal summary."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    payload = out if isinstance(out, dict) else {"result": out}
+    yield ctx.emit_tool_output_card(payload)
+    if isinstance(out, dict):
+        if out.get("error"):
+            yield ctx.streaming_service.format_terminal_info(
+                f"Image generation failed: {out['error'][:60]}",
+                "error",
+            )
+        else:
+            yield ctx.streaming_service.format_terminal_info(
+                "Image generated successfully",
+                "success",
+            )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/thinking.py
new file mode 100644
index 0000000000..c094aabf6a
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_image/thinking.py
@@ -0,0 +1,41 @@
+"""generate_image: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.deliverables.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    prompt = d.get("prompt", "") if isinstance(tool_input, dict) else str(tool_input)
+    return ToolStartThinking(
+        title="Generating image",
+        items=[f"Prompt: {prompt[:80]}{'...' if len(prompt) > 80 else ''}"],
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    items = last_items
+    if isinstance(tool_output, dict) and not tool_output.get("error"):
+        completed = [*items, "Image generated successfully"]
+    else:
+        error_msg = (
+            tool_output.get("error", "Generation failed")
+            if isinstance(tool_output, dict)
+            else "Generation failed"
+        )
+        completed = [*items, f"Error: {error_msg}"]
+    return ("Generating image", completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/emission.py
new file mode 100644
index 0000000000..f1a1e9c374
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/emission.py
@@ -0,0 +1,37 @@
+"""generate_podcast: tool card + queue / success / failure terminal lines."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    payload = out if isinstance(out, dict) else {"result": out}
+    yield ctx.emit_tool_output_card(payload)
+    if isinstance(out, dict) and out.get("status") in (
+        "pending",
+        "generating",
+        "processing",
+    ):
+        yield ctx.streaming_service.format_terminal_info(
+            f"Podcast queued: {out.get('title', 'Podcast')}",
+            "success",
+        )
+    elif isinstance(out, dict) and out.get("status") in ("ready", "success"):
+        yield ctx.streaming_service.format_terminal_info(
+            f"Podcast generated successfully: {out.get('title', 'Podcast')}",
+            "success",
+        )
+    elif isinstance(out, dict) and out.get("status") in ("failed", "error"):
+        error_msg = out.get("error", "Unknown error")
+        yield ctx.streaming_service.format_terminal_info(
+            f"Podcast generation failed: {error_msg}",
+            "error",
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/thinking.py
new file mode 100644
index 0000000000..5cf78ea72c
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_podcast/thinking.py
@@ -0,0 +1,82 @@
+"""generate_podcast: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.deliverables.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    podcast_title = (
+        d.get("podcast_title", "SurfSense Podcast")
+        if isinstance(tool_input, dict)
+        else "SurfSense Podcast"
+    )
+    content_len = len(
+        d.get("source_content", "") if isinstance(tool_input, dict) else ""
+    )
+    return ToolStartThinking(
+        title="Generating podcast",
+        items=[
+            f"Title: {podcast_title}",
+            f"Content: {content_len:,} characters",
+            "Preparing audio generation...",
+        ],
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    items = last_items
+    podcast_status = (
+        tool_output.get("status", "unknown")
+        if isinstance(tool_output, dict)
+        else "unknown"
+    )
+    podcast_title = (
+        tool_output.get("title", "Podcast")
+        if isinstance(tool_output, dict)
+        else "Podcast"
+    )
+    if podcast_status in ("pending", "generating", "processing"):
+        completed = [
+            f"Title: {podcast_title}",
+            "Podcast generation started",
+            "Processing in background...",
+        ]
+    elif podcast_status == "already_generating":
+        completed = [
+            f"Title: {podcast_title}",
+            "Podcast already in progress",
+            "Please wait for it to complete",
+        ]
+    elif podcast_status in ("failed", "error"):
+        error_msg = (
+            tool_output.get("error", "Unknown error")
+            if isinstance(tool_output, dict)
+            else "Unknown error"
+        )
+        completed = [
+            f"Title: {podcast_title}",
+            f"Error: {error_msg[:50]}",
+        ]
+    elif podcast_status in ("ready", "success"):
+        completed = [
+            f"Title: {podcast_title}",
+            "Podcast ready",
+        ]
+    else:
+        completed = items
+    return ("Generating podcast", completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/emission.py
new file mode 100644
index 0000000000..1c5c71b8b9
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/emission.py
@@ -0,0 +1,33 @@
+"""generate_report: full payload + terminal line."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    payload = out if isinstance(out, dict) else {"result": out}
+    yield ctx.emit_tool_output_card(payload)
+    if isinstance(out, dict) and out.get("status") == "ready":
+        word_count = out.get("word_count", 0)
+        yield ctx.streaming_service.format_terminal_info(
+            f"Report generated: {out.get('title', 'Report')} ({word_count:,} words)",
+            "success",
+        )
+    else:
+        error_msg = (
+            out.get("error", "Unknown error")
+            if isinstance(out, dict)
+            else "Unknown error"
+        )
+        yield ctx.streaming_service.format_terminal_info(
+            f"Report generation failed: {error_msg}",
+            "error",
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/thinking.py
new file mode 100644
index 0000000000..3893cd5501
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_report/thinking.py
@@ -0,0 +1,77 @@
+"""generate_report: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.deliverables.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    report_topic = (
+        d.get("topic", "Report") if isinstance(tool_input, dict) else "Report"
+    )
+    is_revision = bool(
+        isinstance(tool_input, dict) and tool_input.get("parent_report_id")
+    )
+    step_title = "Revising report" if is_revision else "Generating report"
+    return ToolStartThinking(
+        title=step_title,
+        items=[f"Topic: {report_topic}", "Analyzing source content..."],
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    items = last_items
+    report_status = (
+        tool_output.get("status", "unknown")
+        if isinstance(tool_output, dict)
+        else "unknown"
+    )
+    report_title = (
+        tool_output.get("title", "Report")
+        if isinstance(tool_output, dict)
+        else "Report"
+    )
+    word_count = (
+        tool_output.get("word_count", 0) if isinstance(tool_output, dict) else 0
+    )
+    is_revision = (
+        tool_output.get("is_revision", False)
+        if isinstance(tool_output, dict)
+        else False
+    )
+    step_title = "Revising report" if is_revision else "Generating report"
+
+    if report_status == "ready":
+        completed = [
+            f"Topic: {report_title}",
+            f"{word_count:,} words",
+            "Report ready",
+        ]
+    elif report_status == "failed":
+        error_msg = (
+            tool_output.get("error", "Unknown error")
+            if isinstance(tool_output, dict)
+            else "Unknown error"
+        )
+        completed = [
+            f"Topic: {report_title}",
+            f"Error: {error_msg[:50]}",
+        ]
+    else:
+        completed = items
+
+    return (step_title, completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/emission.py
new file mode 100644
index 0000000000..dc8d3c7fcd
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/emission.py
@@ -0,0 +1,32 @@
+"""generate_resume: full payload + terminal line."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    payload = out if isinstance(out, dict) else {"result": out}
+    yield ctx.emit_tool_output_card(payload)
+    if isinstance(out, dict) and out.get("status") == "ready":
+        yield ctx.streaming_service.format_terminal_info(
+            f"Resume generated: {out.get('title', 'Resume')}",
+            "success",
+        )
+    else:
+        error_msg = (
+            out.get("error", "Unknown error")
+            if isinstance(out, dict)
+            else "Unknown error"
+        )
+        yield ctx.streaming_service.format_terminal_info(
+            f"Resume generation failed: {error_msg}",
+            "error",
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/thinking.py
new file mode 100644
index 0000000000..5a54da84dd
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_resume/thinking.py
@@ -0,0 +1,26 @@
+"""generate_resume: generic thinking titles and items."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.default import (
+    thinking as default_thinking,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    return default_thinking.resolve_start_thinking(tool_name, tool_input)
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    return default_thinking.resolve_completed_thinking(
+        tool_name, tool_output, last_items
+    )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/emission.py
new file mode 100644
index 0000000000..21e27d4c3b
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/emission.py
@@ -0,0 +1,28 @@
+"""generate_video_presentation: tool card + terminal line."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    payload = out if isinstance(out, dict) else {"result": out}
+    yield ctx.emit_tool_output_card(payload)
+    if isinstance(out, dict) and out.get("status") == "pending":
+        yield ctx.streaming_service.format_terminal_info(
+            f"Video presentation queued: {out.get('title', 'Presentation')}",
+            "success",
+        )
+    elif isinstance(out, dict) and out.get("status") == "failed":
+        error_msg = out.get("error", "Unknown error")
+        yield ctx.streaming_service.format_terminal_info(
+            f"Presentation generation failed: {error_msg}",
+            "error",
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/thinking.py
new file mode 100644
index 0000000000..fcf4259500
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/thinking.py
@@ -0,0 +1,54 @@
+"""generate_video_presentation: generic in-progress thinking; completion is status-driven."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.default import (
+    thinking as default_thinking,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    return default_thinking.resolve_start_thinking(tool_name, tool_input)
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    items = last_items
+    vp_status = (
+        tool_output.get("status", "unknown")
+        if isinstance(tool_output, dict)
+        else "unknown"
+    )
+    vp_title = (
+        tool_output.get("title", "Presentation")
+        if isinstance(tool_output, dict)
+        else "Presentation"
+    )
+    if vp_status in ("pending", "generating"):
+        completed = [
+            f"Title: {vp_title}",
+            "Presentation generation started",
+            "Processing in background...",
+        ]
+    elif vp_status == "failed":
+        error_msg = (
+            tool_output.get("error", "Unknown error")
+            if isinstance(tool_output, dict)
+            else "Unknown error"
+        )
+        completed = [
+            f"Title: {vp_title}",
+            f"Error: {error_msg[:50]}",
+        ]
+    else:
+        completed = items
+    return ("Generating video presentation", completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/emission.py
new file mode 100644
index 0000000000..68c93dede3
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/emission.py
@@ -0,0 +1,16 @@
+"""save_document: default completion card and terminal line."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.default import emission as _default
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    yield from _default.iter_completion_emission_frames(ctx)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/thinking.py
new file mode 100644
index 0000000000..96e2b67436
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/save_document/thinking.py
@@ -0,0 +1,40 @@
+"""save_document: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.deliverables.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    doc_title = d.get("title", "") if isinstance(tool_input, dict) else str(tool_input)
+    display_title = doc_title[:60] + ("…" if len(doc_title) > 60 else "")
+    return ToolStartThinking(title="Saving document", items=[display_title])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    items = last_items
+    result_str = (
+        tool_output.get("result", "")
+        if isinstance(tool_output, dict)
+        else str(tool_output)
+    )
+    is_error = "Error" in result_str
+    completed = [
+        *items,
+        result_str[:80] if is_error else "Saved to knowledge base",
+    ]
+    return ("Saving document", completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/shared/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/shared/tool_input.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/shared/tool_input.py
new file mode 100644
index 0000000000..1303cf09fa
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/shared/tool_input.py
@@ -0,0 +1,9 @@
+"""Tool-call args for deliverable thinking modules."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def as_tool_input_dict(tool_input: Any) -> dict[str, Any]:
+    return tool_input if isinstance(tool_input, dict) else {}
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/tool_names.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/tool_names.py
new file mode 100644
index 0000000000..5924af196c
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/tool_names.py
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+DELIVERABLE_TOOLS: frozenset[str] = frozenset(
+    {
+        "generate_image",
+        "generate_podcast",
+        "generate_report",
+        "generate_resume",
+        "generate_video_presentation",
+        "save_document",
+    }
+)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/emission_context.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/emission_context.py
new file mode 100644
index 0000000000..baa1d73361
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/emission_context.py
@@ -0,0 +1,36 @@
+"""Context for one tool-completion emission pass."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tool_output_frame import (
+    emit_tool_output_available_frame,
+)
+
+
+@dataclass
+class ToolCompletionEmissionContext:
+    """Streaming service, tool output, and ids for completion frames."""
+
+    tool_name: str
+    tool_call_id: str
+    tool_output: Any
+    streaming_service: Any
+    content_builder: Any | None
+    langchain_tool_call_id_holder: dict[str, str | None]
+    stream_result: Any
+    langgraph_config: dict[str, Any]
+    staged_workspace_file_path: str | None
+    tool_metadata: dict[str, Any] | None = None
+
+    def emit_tool_output_card(self, payload: Any) -> str:
+        return emit_tool_output_available_frame(
+            streaming_service=self.streaming_service,
+            content_builder=self.content_builder,
+            langchain_id_holder=self.langchain_tool_call_id_holder,
+            call_id=self.tool_call_id,
+            output=payload,
+            tool_metadata=self.tool_metadata,
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/edit_file/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/edit_file/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/edit_file/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/edit_file/thinking.py
new file mode 100644
index 0000000000..7937c26e9a
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/edit_file/thinking.py
@@ -0,0 +1,29 @@
+"""edit_file: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+    truncate_path,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    fp = d.get("file_path", "") if isinstance(tool_input, dict) else str(tool_input)
+    return ToolStartThinking(title="Editing file", items=[truncate_path(fp)])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Editing file", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/emission.py
new file mode 100644
index 0000000000..780cd77d8e
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/emission.py
@@ -0,0 +1,38 @@
+"""execute: exit code, stdout, sandbox file hints."""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    raw_text = out.get("result", "") if isinstance(out, dict) else str(out)
+    exit_code: int | None = None
+    output_text = raw_text
+    m = re.match(r"^Exit code:\s*(\d+)", raw_text)
+    if m:
+        exit_code = int(m.group(1))
+        om = re.search(r"\nOutput:\n([\s\S]*)", raw_text)
+        output_text = om.group(1) if om else ""
+    thread_id_str = ctx.langgraph_config.get("configurable", {}).get("thread_id", "")
+
+    for sf_match in re.finditer(r"^SANDBOX_FILE:\s*(.+)$", output_text, re.MULTILINE):
+        fpath = sf_match.group(1).strip()
+        if fpath and fpath not in ctx.stream_result.sandbox_files:
+            ctx.stream_result.sandbox_files.append(fpath)
+
+    yield ctx.emit_tool_output_card(
+        {
+            "exit_code": exit_code,
+            "output": output_text,
+            "thread_id": thread_id_str,
+        },
+    )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/thinking.py
new file mode 100644
index 0000000000..968140363b
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/execute/thinking.py
@@ -0,0 +1,44 @@
+"""execute: sandbox command thinking + completion lines."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    cmd = d.get("command", "") if isinstance(tool_input, dict) else str(tool_input)
+    display_cmd = cmd[:80] + ("…" if len(cmd) > 80 else "")
+    return ToolStartThinking(title="Running command", items=[f"$ {display_cmd}"])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    items = last_items
+    raw_text = (
+        tool_output.get("result", "")
+        if isinstance(tool_output, dict)
+        else str(tool_output)
+    )
+    m = re.match(r"^Exit code:\s*(\d+)", raw_text)
+    exit_code_val = int(m.group(1)) if m else None
+    if exit_code_val is not None and exit_code_val == 0:
+        completed = [*items, "Completed successfully"]
+    elif exit_code_val is not None:
+        completed = [*items, f"Exit code: {exit_code_val}"]
+    else:
+        completed = [*items, "Finished"]
+    return ("Running command", completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/glob/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/glob/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/glob/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/glob/thinking.py
new file mode 100644
index 0000000000..bdcaecd45a
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/glob/thinking.py
@@ -0,0 +1,29 @@
+"""glob: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    pat = d.get("pattern", "") if isinstance(tool_input, dict) else str(tool_input)
+    base = d.get("path", "/") if isinstance(tool_input, dict) else "/"
+    return ToolStartThinking(title="Searching files", items=[f"{pat} in {base}"])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Searching files", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/grep/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/grep/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/grep/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/grep/thinking.py
new file mode 100644
index 0000000000..36e077599d
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/grep/thinking.py
@@ -0,0 +1,33 @@
+"""grep: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    pat = d.get("pattern", "") if isinstance(tool_input, dict) else str(tool_input)
+    grep_path = d.get("path", "") if isinstance(tool_input, dict) else ""
+    display_pat = pat[:60] + ("…" if len(pat) > 60 else "")
+    return ToolStartThinking(
+        title="Searching content",
+        items=[f'"{display_pat}"' + (f" in {grep_path}" if grep_path else "")],
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Searching content", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/ls/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/ls/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/ls/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/ls/thinking.py
new file mode 100644
index 0000000000..1e89f3a122
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/ls/thinking.py
@@ -0,0 +1,59 @@
+"""ls: thinking-step copy for directory listing."""
+
+from __future__ import annotations
+
+import ast
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    if isinstance(tool_input, dict):
+        path = tool_input.get("path", "/")
+    else:
+        path = str(tool_input)
+    return ToolStartThinking(title="Listing files", items=[path])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    if isinstance(tool_output, dict):
+        ls_output = tool_output.get("result", "")
+    elif isinstance(tool_output, str):
+        ls_output = tool_output
+    else:
+        ls_output = str(tool_output) if tool_output else ""
+    file_names: list[str] = []
+    if ls_output:
+        paths: list[str] = []
+        try:
+            parsed = ast.literal_eval(ls_output)
+            if isinstance(parsed, list):
+                paths = [str(p) for p in parsed]
+        except (ValueError, SyntaxError):
+            paths = [
+                line.strip() for line in ls_output.strip().split("\n") if line.strip()
+            ]
+        for p in paths:
+            name = p.rstrip("/").split("/")[-1]
+            if name and len(name) <= 40:
+                file_names.append(name)
+            elif name:
+                file_names.append(name[:37] + "...")
+    if file_names:
+        if len(file_names) <= 5:
+            completed = [f"[{name}]" for name in file_names]
+        else:
+            completed = [f"[{name}]" for name in file_names[:4]]
+            completed.append(f"(+{len(file_names) - 4} more)")
+    else:
+        completed = ["No files found"]
+    return ("Listing files", completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/mkdir/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/mkdir/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/mkdir/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/mkdir/thinking.py
new file mode 100644
index 0000000000..1cfb0fc674
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/mkdir/thinking.py
@@ -0,0 +1,31 @@
+"""mkdir: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    p = d.get("path", "") if isinstance(tool_input, dict) else str(tool_input)
+    display = p if len(p) <= 80 else "…" + p[-77:]
+    return ToolStartThinking(
+        title="Creating folder", items=[display] if display else []
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Creating folder", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/move_file/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/move_file/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/move_file/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/move_file/thinking.py
new file mode 100644
index 0000000000..a83a33ef7e
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/move_file/thinking.py
@@ -0,0 +1,35 @@
+"""move_file: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+    truncate_middle,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    src = d.get("source_path", "") if isinstance(tool_input, dict) else ""
+    dst = d.get("destination_path", "") if isinstance(tool_input, dict) else ""
+    display_src = truncate_middle(src, max_len=60)
+    display_dst = truncate_middle(dst, max_len=60)
+    return ToolStartThinking(
+        title="Moving file",
+        items=[f"{display_src} → {display_dst}"] if src or dst else [],
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Moving file", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/read_file/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/read_file/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/read_file/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/read_file/thinking.py
new file mode 100644
index 0000000000..2957aed052
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/read_file/thinking.py
@@ -0,0 +1,29 @@
+"""read_file: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+    truncate_path,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    fp = d.get("file_path", "") if isinstance(tool_input, dict) else str(tool_input)
+    return ToolStartThinking(title="Reading file", items=[truncate_path(fp)])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Reading file", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rm/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rm/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rm/thinking.py
new file mode 100644
index 0000000000..9d21c306dc
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rm/thinking.py
@@ -0,0 +1,30 @@
+"""rm: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+    truncate_path,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    rm_path = d.get("path", "") if isinstance(tool_input, dict) else str(tool_input)
+    display = truncate_path(rm_path)
+    return ToolStartThinking(title="Deleting file", items=[display] if display else [])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Deleting file", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rmdir/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rmdir/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rmdir/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rmdir/thinking.py
new file mode 100644
index 0000000000..eab0e78ecd
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/rmdir/thinking.py
@@ -0,0 +1,31 @@
+"""rmdir: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    p = d.get("path", "") if isinstance(tool_input, dict) else str(tool_input)
+    display = p if len(p) <= 80 else "…" + p[-77:]
+    return ToolStartThinking(
+        title="Deleting folder", items=[display] if display else []
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Deleting folder", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/shared/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/shared/tool_input.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/shared/tool_input.py
new file mode 100644
index 0000000000..5077822839
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/shared/tool_input.py
@@ -0,0 +1,17 @@
+"""Tool-call args + display truncation for filesystem thinking modules."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def as_tool_input_dict(tool_input: Any) -> dict[str, Any]:
+    return tool_input if isinstance(tool_input, dict) else {}
+
+
+def truncate_path(fp: str, *, max_len: int = 80) -> str:
+    return fp if len(fp) <= max_len else "…" + fp[-(max_len - 3) :]
+
+
+def truncate_middle(s: str, *, max_len: int = 60) -> str:
+    return s if len(s) <= max_len else "…" + s[-(max_len - 3) :]
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/tool_names.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/tool_names.py
new file mode 100644
index 0000000000..e2ad337369
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/tool_names.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+FILESYSTEM_TOOLS: frozenset[str] = frozenset(
+    {
+        "read_file",
+        "glob",
+        "grep",
+        "ls",
+        "mkdir",
+        "move_file",
+        "rm",
+        "rmdir",
+        "write_todos",
+        "write_file",
+        "edit_file",
+        "execute",
+    }
+)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/emission.py
new file mode 100644
index 0000000000..820235379c
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/emission.py
@@ -0,0 +1,43 @@
+"""write_file: path + status envelope on the tool-output card."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+from app.tasks.chat.streaming.helpers.tool_output import (
+    extract_resolved_file_path,
+    tool_output_has_error,
+    tool_output_to_text,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    resolved_path = extract_resolved_file_path(
+        tool_name=ctx.tool_name,
+        tool_output=ctx.tool_output,
+        tool_input={"file_path": ctx.staged_workspace_file_path}
+        if ctx.staged_workspace_file_path
+        else None,
+    )
+    result_text = tool_output_to_text(ctx.tool_output)
+    if tool_output_has_error(ctx.tool_output):
+        yield ctx.emit_tool_output_card(
+            {
+                "status": "error",
+                "error": result_text,
+                "path": resolved_path,
+            },
+        )
+    else:
+        yield ctx.emit_tool_output_card(
+            {
+                "status": "completed",
+                "path": resolved_path,
+                "result": result_text,
+            },
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/thinking.py
new file mode 100644
index 0000000000..07a6cb9e18
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_file/thinking.py
@@ -0,0 +1,29 @@
+"""write_file: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+    truncate_path,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    fp = d.get("file_path", "") if isinstance(tool_input, dict) else str(tool_input)
+    return ToolStartThinking(title="Writing file", items=[truncate_path(fp)])
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Writing file", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_todos/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_todos/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_todos/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_todos/thinking.py
new file mode 100644
index 0000000000..5cc631fb5e
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/filesystem/write_todos/thinking.py
@@ -0,0 +1,34 @@
+"""write_todos: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.filesystem.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    todos = d.get("todos", []) if isinstance(tool_input, dict) else []
+    todo_count = len(todos) if isinstance(todos, list) else 0
+    return ToolStartThinking(
+        title="Planning tasks",
+        items=(
+            [f"{todo_count} task{'s' if todo_count != 1 else ''}"] if todo_count else []
+        ),
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_output, tool_name
+    return ("Planning tasks", last_items)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/registry.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/registry.py
new file mode 100644
index 0000000000..06f06f90af
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/registry.py
@@ -0,0 +1,92 @@
+"""Resolve thinking and emission modules by tool name."""
+
+from __future__ import annotations
+
+import importlib
+from collections.abc import Iterator
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.connector.shared.tool_names import (
+    SHARED_CONNECTOR_TOOLS,
+)
+from app.tasks.chat.streaming.handlers.tools.deliverables.tool_names import (
+    DELIVERABLE_TOOLS,
+)
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+from app.tasks.chat.streaming.handlers.tools.filesystem.tool_names import (
+    FILESYSTEM_TOOLS,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+_BASE = "app.tasks.chat.streaming.handlers.tools"
+_CONNECTOR_SHARED = "connector.shared"
+
+_THINKING_ALIAS: dict[str, str] = {
+    "execute_code": "filesystem.execute",
+}
+_EMISSION_ALIAS: dict[str, str] = {
+    "edit_file": "filesystem.write_file",
+    "execute_code": "filesystem.execute",
+}
+
+
+def _thinking_module(tool_name: str) -> str:
+    if tool_name in SHARED_CONNECTOR_TOOLS:
+        return _CONNECTOR_SHARED
+    if tool_name in FILESYSTEM_TOOLS:
+        return f"filesystem.{tool_name}"
+    if tool_name in DELIVERABLE_TOOLS:
+        return f"deliverables.{tool_name}"
+    return _THINKING_ALIAS.get(tool_name, tool_name)
+
+
+def _emission_module(tool_name: str) -> str:
+    if tool_name in _EMISSION_ALIAS:
+        return _EMISSION_ALIAS[tool_name]
+    if tool_name in SHARED_CONNECTOR_TOOLS:
+        return _CONNECTOR_SHARED
+    if tool_name in DELIVERABLE_TOOLS:
+        return f"deliverables.{tool_name}"
+    if tool_name in FILESYSTEM_TOOLS:
+        return f"filesystem.{tool_name}"
+    return tool_name
+
+
+def _import_thinking(tool_name: str):
+    try:
+        return importlib.import_module(
+            f"{_BASE}.{_thinking_module(tool_name)}.thinking"
+        )
+    except ModuleNotFoundError:
+        return importlib.import_module(f"{_BASE}.default.thinking")
+
+
+def _import_emission(tool_name: str):
+    try:
+        return importlib.import_module(
+            f"{_BASE}.{_emission_module(tool_name)}.emission"
+        )
+    except ModuleNotFoundError:
+        return importlib.import_module(f"{_BASE}.default.emission")
+
+
+def resolve_tool_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    return _import_thinking(tool_name).resolve_start_thinking(tool_name, tool_input)
+
+
+def resolve_tool_completed_thinking_step(
+    tool_name: str, tool_output: Any, last_items: list[str]
+) -> tuple[str, list[str]]:
+    return _import_thinking(tool_name).resolve_completed_thinking(
+        tool_name, tool_output, last_items
+    )
+
+
+def iter_tool_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    yield from _import_emission(ctx.tool_name).iter_completion_emission_frames(ctx)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/emission.py
new file mode 100644
index 0000000000..293d2a1e95
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/emission.py
@@ -0,0 +1,43 @@
+"""scrape_webpage: redacted payload + terminal summary."""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    if isinstance(out, dict):
+        display_output = {k: v for k, v in out.items() if k != "content"}
+        if "content" in out:
+            content = out.get("content", "")
+            display_output["content_preview"] = (
+                content[:500] + "..." if len(content) > 500 else content
+            )
+        yield ctx.emit_tool_output_card(display_output)
+    else:
+        yield ctx.emit_tool_output_card({"result": out})
+
+    if isinstance(out, dict) and "error" not in out:
+        title = out.get("title", "Webpage")
+        word_count = out.get("word_count", 0)
+        yield ctx.streaming_service.format_terminal_info(
+            f"Scraped: {title[:40]}{'...' if len(title) > 40 else ''} ({word_count:,} words)",
+            "success",
+        )
+    else:
+        error_msg = (
+            out.get("error", "Failed to scrape")
+            if isinstance(out, dict)
+            else "Failed to scrape"
+        )
+        yield ctx.streaming_service.format_terminal_info(
+            f"Scrape failed: {error_msg}",
+            "error",
+        )
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/shared/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/shared/tool_input.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/shared/tool_input.py
new file mode 100644
index 0000000000..581f0e64ab
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/shared/tool_input.py
@@ -0,0 +1,9 @@
+"""Tool-call args for scrape_webpage thinking."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def as_tool_input_dict(tool_input: Any) -> dict[str, Any]:
+    return tool_input if isinstance(tool_input, dict) else {}
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/thinking.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/thinking.py
new file mode 100644
index 0000000000..8a04acbe6e
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/scrape_webpage/thinking.py
@@ -0,0 +1,49 @@
+"""scrape_webpage: thinking-step copy."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.tasks.chat.streaming.handlers.tools.scrape_webpage.shared.tool_input import (
+    as_tool_input_dict,
+)
+from app.tasks.chat.streaming.handlers.tools.shared.model import (
+    ToolStartThinking,
+)
+
+
+def resolve_start_thinking(tool_name: str, tool_input: Any) -> ToolStartThinking:
+    del tool_name
+    d = as_tool_input_dict(tool_input)
+    url = d.get("url", "") if isinstance(tool_input, dict) else str(tool_input)
+    return ToolStartThinking(
+        title="Scraping webpage",
+        items=[f"URL: {url[:80]}{'...' if len(url) > 80 else ''}"],
+    )
+
+
+def resolve_completed_thinking(
+    tool_name: str,
+    tool_output: Any,
+    last_items: list[str],
+) -> tuple[str, list[str]]:
+    del tool_name
+    items = last_items
+    if isinstance(tool_output, dict):
+        title = tool_output.get("title", "Webpage")
+        word_count = tool_output.get("word_count", 0)
+        has_error = "error" in tool_output
+        if has_error:
+            completed = [
+                *items,
+                f"Error: {tool_output.get('error', 'Failed to scrape')[:50]}",
+            ]
+        else:
+            completed = [
+                *items,
+                f"Title: {title[:50]}{'...' if len(title) > 50 else ''}",
+                f"Extracted: {word_count:,} words",
+            ]
+    else:
+        completed = [*items, "Content extracted"]
+    return ("Scraping webpage", completed)
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/shared/__init__.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/shared/model.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/shared/model.py
new file mode 100644
index 0000000000..047a843749
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/shared/model.py
@@ -0,0 +1,12 @@
+"""In-progress thinking-step title and bullet lines."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class ToolStartThinking:
+    title: str
+    items: list[str]
+    include_items_on_frame: bool = True
diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/web_search/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/web_search/emission.py
new file mode 100644
index 0000000000..3efe45d0c7
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/web_search/emission.py
@@ -0,0 +1,37 @@
+"""web_search: citations parsed from provider XML."""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterator
+
+from app.tasks.chat.streaming.handlers.tools.emission_context import (
+    ToolCompletionEmissionContext,
+)
+
+
+def iter_completion_emission_frames(
+    ctx: ToolCompletionEmissionContext,
+) -> Iterator[str]:
+    out = ctx.tool_output
+    xml = out.get("result", str(out)) if isinstance(out, dict) else str(out)
+    citations: dict[str, dict[str, str]] = {}
+    for m in re.finditer(
+        r"<title><!\[CDATA\[(.*?)\]\]></title>\s*<url><!\[CDATA\[(.*?)\]\]></url>",
+        xml,
+    ):
+        title, url = m.group(1).strip(), m.group(2).strip()
+        if url.startswith("http") and url not in citations:
+            citations[url] = {"title": title}
+    for m in re.finditer(
+        r"<chunk\s+id='([^']*)'><!\[CDATA\[([\s\S]*?)\]\]></chunk>",
+        xml,
+    ):
+        chunk_url, content = m.group(1).strip(), m.group(2).strip()
+        if chunk_url.startswith("http") and chunk_url in citations and content:
+            citations[chunk_url]["snippet"] = (
+                content[:200] + "…" if len(content) > 200 else content
+            )
+    yield ctx.emit_tool_output_card(
+        {"status": "completed", "citations": citations},
+    )
diff --git a/surfsense_backend/app/tasks/chat/streaming/helpers/__init__.py b/surfsense_backend/app/tasks/chat/streaming/helpers/__init__.py
new file mode 100644
index 0000000000..151dfdaac9
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/helpers/__init__.py
@@ -0,0 +1,3 @@
+"""Pure helpers for chat streaming."""
+
+from __future__ import annotations
diff --git a/surfsense_backend/app/tasks/chat/streaming/helpers/chunk_parts.py b/surfsense_backend/app/tasks/chat/streaming/helpers/chunk_parts.py
new file mode 100644
index 0000000000..48b44fc1de
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/helpers/chunk_parts.py
@@ -0,0 +1,60 @@
+"""Split a model chunk into text, reasoning, and tool-call fragment lists."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def extract_chunk_parts(chunk: Any) -> dict[str, Any]:
+    """Return dict with keys text, reasoning, and tool_call_chunks (merged from chunk fields)."""
+    out: dict[str, Any] = {"text": "", "reasoning": "", "tool_call_chunks": []}
+    if chunk is None:
+        return out
+
+    content = getattr(chunk, "content", None)
+    if isinstance(content, str):
+        if content:
+            out["text"] = content
+    elif isinstance(content, list):
+        text_parts: list[str] = []
+        reasoning_parts: list[str] = []
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            block_type = block.get("type")
+            if block_type == "text":
+                value = block.get("text") or block.get("content") or ""
+                if isinstance(value, str) and value:
+                    text_parts.append(value)
+            elif block_type == "reasoning":
+                value = (
+                    block.get("reasoning")
+                    or block.get("text")
+                    or block.get("content")
+                    or ""
+                )
+                if isinstance(value, str) and value:
+                    reasoning_parts.append(value)
+            elif block_type in ("tool_call_chunk", "tool_use"):
+                out["tool_call_chunks"].append(block)
+        if text_parts:
+            out["text"] = "".join(text_parts)
+        if reasoning_parts:
+            out["reasoning"] = "".join(reasoning_parts)
+
+    additional = getattr(chunk, "additional_kwargs", None) or {}
+    if isinstance(additional, dict):
+        extra_reasoning = additional.get("reasoning_content")
+        if isinstance(extra_reasoning, str) and extra_reasoning:
+            existing = out["reasoning"]
+            out["reasoning"] = (
+                (existing + extra_reasoning) if existing else extra_reasoning
+            )
+
+    extra_tool_chunks = getattr(chunk, "tool_call_chunks", None)
+    if isinstance(extra_tool_chunks, list):
+        for tcc in extra_tool_chunks:
+            if isinstance(tcc, dict):
+                out["tool_call_chunks"].append(tcc)
+
+    return out
diff --git a/surfsense_backend/app/tasks/chat/streaming/helpers/interrupt_inspector.py b/surfsense_backend/app/tasks/chat/streaming/helpers/interrupt_inspector.py
new file mode 100644
index 0000000000..f4b00431cd
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/helpers/interrupt_inspector.py
@@ -0,0 +1,64 @@
+"""Read every pending interrupt payload from a LangGraph state snapshot.
+
+The chat-stream emit loop yields one ``data-interrupt-request`` SSE frame per
+pending interrupt so parallel HITL across siblings stays addressable on the
+wire (the resume slicer in ``checkpointed_subagent_middleware.resume_routing``
+correlates each frame back to the right paused subagent via the stamped
+``tool_call_id``). This helper produces that flat, ordered list.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def all_interrupt_values(state: Any) -> list[dict[str, Any]]:
+    """Return every interrupt payload across the snapshot, in traversal order.
+
+    Walks ``state.tasks[*].interrupts`` first (langgraph's per-task buckets,
+    which carry one interrupt per paused subagent) and falls back to
+    ``state.interrupts`` when the per-task lists are empty. Order matches the
+    snapshot's iteration order so the emit-time order on the SSE stream agrees
+    with ``collect_pending_tool_calls`` consumption order on resume.
+
+    Defensive against malformed snapshots: tasks/interrupts that raise on
+    attribute access are skipped silently. Non-dict values are skipped — the
+    chat-stream contract requires structured interrupt payloads.
+    """
+
+    def _extract(candidate: Any) -> dict[str, Any] | None:
+        if isinstance(candidate, dict):
+            value = candidate.get("value", candidate)
+            return value if isinstance(value, dict) else None
+        value = getattr(candidate, "value", None)
+        if isinstance(value, dict):
+            return value
+        return None
+
+    values: list[dict[str, Any]] = []
+    saw_task_interrupt = False
+
+    for task in getattr(state, "tasks", ()) or ():
+        try:
+            interrupts = getattr(task, "interrupts", ()) or ()
+        except (AttributeError, IndexError, TypeError):
+            interrupts = ()
+        if interrupts:
+            saw_task_interrupt = True
+            for interrupt_item in interrupts:
+                extracted = _extract(interrupt_item)
+                if extracted is not None:
+                    values.append(extracted)
+
+    if saw_task_interrupt:
+        return values
+
+    try:
+        state_interrupts = getattr(state, "interrupts", ()) or ()
+    except (AttributeError, IndexError, TypeError):
+        state_interrupts = ()
+    for interrupt_item in state_interrupts:
+        extracted = _extract(interrupt_item)
+        if extracted is not None:
+            values.append(extracted)
+    return values
diff --git a/surfsense_backend/app/tasks/chat/streaming/helpers/tool_call_matching.py b/surfsense_backend/app/tasks/chat/streaming/helpers/tool_call_matching.py
new file mode 100644
index 0000000000..fbe4c94b77
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/helpers/tool_call_matching.py
@@ -0,0 +1,32 @@
+"""Match buffered model tool-call chunks to a tool start when ids were missing."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def match_buffered_langchain_tool_call_id(
+    pending_tool_call_chunks: list[dict[str, Any]],
+    tool_name: str,
+    run_id: str,
+    lc_tool_call_id_by_run: dict[str, str],
+) -> str | None:
+    matched_idx: int | None = None
+    for idx, tcc in enumerate(pending_tool_call_chunks):
+        if tcc.get("name") == tool_name and tcc.get("id"):
+            matched_idx = idx
+            break
+    if matched_idx is None:
+        for idx, tcc in enumerate(pending_tool_call_chunks):
+            if tcc.get("id"):
+                matched_idx = idx
+                break
+    if matched_idx is None:
+        return None
+    matched = pending_tool_call_chunks.pop(matched_idx)
+    candidate = matched.get("id")
+    if isinstance(candidate, str) and candidate:
+        if run_id:
+            lc_tool_call_id_by_run[run_id] = candidate
+        return candidate
+    return None
diff --git a/surfsense_backend/app/tasks/chat/streaming/helpers/tool_output.py b/surfsense_backend/app/tasks/chat/streaming/helpers/tool_output.py
new file mode 100644
index 0000000000..a7c401dee5
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/helpers/tool_output.py
@@ -0,0 +1,43 @@
+"""Normalize filesystem tool payloads for SSE cards and messages."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def tool_output_to_text(tool_output: Any) -> str:
+    if isinstance(tool_output, dict):
+        if isinstance(tool_output.get("result"), str):
+            return tool_output["result"]
+        if isinstance(tool_output.get("error"), str):
+            return tool_output["error"]
+        return json.dumps(tool_output, ensure_ascii=False)
+    return str(tool_output)
+
+
+def tool_output_has_error(tool_output: Any) -> bool:
+    if isinstance(tool_output, dict):
+        if tool_output.get("error"):
+            return True
+        result = tool_output.get("result")
+        return bool(
+            isinstance(result, str) and result.strip().lower().startswith("error:")
+        )
+    if isinstance(tool_output, str):
+        return tool_output.strip().lower().startswith("error:")
+    return False
+
+
+def extract_resolved_file_path(
+    *, tool_name: str, tool_output: Any, tool_input: Any | None = None
+) -> str | None:
+    if isinstance(tool_output, dict):
+        path_value = tool_output.get("path")
+        if isinstance(path_value, str) and path_value.strip():
+            return path_value.strip()
+    if tool_name in ("write_file", "edit_file") and isinstance(tool_input, dict):
+        file_path = tool_input.get("file_path")
+        if isinstance(file_path, str) and file_path.strip():
+            return file_path.strip()
+    return None
diff --git a/surfsense_backend/app/tasks/chat/streaming/relay/__init__.py b/surfsense_backend/app/tasks/chat/streaming/relay/__init__.py
new file mode 100644
index 0000000000..18eda9a6d7
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/relay/__init__.py
@@ -0,0 +1,23 @@
+"""Relay: thinking steps, tool bookkeeping, and ``EventRelay``.
+
+Package imports are lazy so ``relay.thinking_step_sse`` (and siblings) can load
+without pulling in ``event_relay`` (which imports handler modules that may
+import those siblings).
+"""
+
+from __future__ import annotations
+
+__all__ = ["EventRelay", "EventRelayConfig"]
+
+
+def __getattr__(name: str):
+    if name == "EventRelay":
+        from app.tasks.chat.streaming.relay.event_relay import EventRelay
+
+        return EventRelay
+    if name == "EventRelayConfig":
+        from app.tasks.chat.streaming.relay.event_relay import EventRelayConfig
+
+        return EventRelayConfig
+    msg = f"module {__name__!r} has no attribute {name!r}"
+    raise AttributeError(msg)
diff --git a/surfsense_backend/app/tasks/chat/streaming/relay/event_relay.py b/surfsense_backend/app/tasks/chat/streaming/relay/event_relay.py
new file mode 100644
index 0000000000..03d6a66e62
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/relay/event_relay.py
@@ -0,0 +1,128 @@
+"""Turn LangGraph astream_events into SSE strings via the handler modules."""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from dataclasses import dataclass, field
+from typing import Any
+
+from app.services.streaming.emitter import EmitterRegistry
+from app.tasks.chat.streaming.graph_stream.result import StreamingResult
+from app.tasks.chat.streaming.handlers.chain_end import iter_chain_end_frames
+from app.tasks.chat.streaming.handlers.chat_model_stream import (
+    iter_chat_model_stream_frames,
+)
+from app.tasks.chat.streaming.handlers.custom_event_dispatch import (
+    iter_custom_event_frames,
+)
+from app.tasks.chat.streaming.handlers.tool_end import iter_tool_end_frames
+from app.tasks.chat.streaming.handlers.tool_start import iter_tool_start_frames
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+from app.tasks.chat.streaming.relay.thinking_step_completion import (
+    complete_active_thinking_step,
+)
+
+
+@dataclass
+class EventRelayConfig:
+    """Optional relay tuning (sub-agent tools, text suppression)."""
+
+    subagent_entry_tool_names: frozenset[str] = field(
+        default_factory=lambda: frozenset({"task"})
+    )
+    suppress_main_text_inside_tools: bool = True
+
+
+class EventRelay:
+    """Dispatches graph events to streaming handlers and optional emitters."""
+
+    def __init__(
+        self,
+        *,
+        streaming_service: Any,
+        config: EventRelayConfig | None = None,
+    ) -> None:
+        self.streaming_service = streaming_service
+        self.config = config or EventRelayConfig()
+        reg = getattr(streaming_service, "emitter_registry", None)
+        self.emitter_registry = reg if reg is not None else EmitterRegistry()
+
+    async def relay(
+        self,
+        events: AsyncIterator[dict[str, Any]],
+        *,
+        state: AgentEventRelayState,
+        result: StreamingResult,
+        step_prefix: str = "thinking",
+        content_builder: Any | None = None,
+        config: dict[str, Any] | None = None,
+    ) -> AsyncIterator[str]:
+        """Yield SSE for each event from the async iterator, then finalize text/thinking."""
+        graph_config = config or {}
+        async for event in events:
+            event_type = event.get("event", "")
+            if event_type == "on_chat_model_stream":
+                for frame in iter_chat_model_stream_frames(
+                    event,
+                    state=state,
+                    streaming_service=self.streaming_service,
+                    content_builder=content_builder,
+                    step_prefix=step_prefix,
+                ):
+                    yield frame
+            elif event_type == "on_tool_start":
+                for frame in iter_tool_start_frames(
+                    event,
+                    state=state,
+                    streaming_service=self.streaming_service,
+                    content_builder=content_builder,
+                    result=result,
+                    step_prefix=step_prefix,
+                ):
+                    yield frame
+            elif event_type == "on_tool_end":
+                for frame in iter_tool_end_frames(
+                    event,
+                    state=state,
+                    streaming_service=self.streaming_service,
+                    content_builder=content_builder,
+                    result=result,
+                    step_prefix=step_prefix,
+                    config=graph_config,
+                ):
+                    yield frame
+            elif event_type == "on_custom_event":
+                for frame in iter_custom_event_frames(
+                    event,
+                    state=state,
+                    streaming_service=self.streaming_service,
+                    content_builder=content_builder,
+                ):
+                    yield frame
+            elif event_type in ("on_chain_end", "on_agent_end"):
+                for frame in iter_chain_end_frames(
+                    event,
+                    state=state,
+                    streaming_service=self.streaming_service,
+                    content_builder=content_builder,
+                ):
+                    yield frame
+
+        if state.current_text_id is not None:
+            yield self.streaming_service.format_text_end(state.current_text_id)
+            if content_builder is not None:
+                content_builder.on_text_end(state.current_text_id)
+            state.current_text_id = None
+
+        completion_event, new_active = complete_active_thinking_step(
+            state=state,
+            streaming_service=self.streaming_service,
+            content_builder=content_builder,
+            last_active_step_id=state.last_active_step_id,
+            last_active_step_title=state.last_active_step_title,
+            last_active_step_items=state.last_active_step_items,
+            completed_step_ids=state.completed_step_ids,
+        )
+        if completion_event:
+            yield completion_event
+        state.last_active_step_id = new_active
diff --git a/surfsense_backend/app/tasks/chat/streaming/relay/state.py b/surfsense_backend/app/tasks/chat/streaming/relay/state.py
new file mode 100644
index 0000000000..27898403db
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/relay/state.py
@@ -0,0 +1,98 @@
+"""Mutable counters and maps for one agent stream."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class AgentEventRelayState:
+    """Tracks text, thinking steps, tool depth, and pending tool-call metadata.
+
+    **Task span (`spanId`)** — ``active_span_id`` groups steps and tools for one
+    open delegating ``task`` episode. ``active_task_run_id`` is the LangGraph
+    ``run_id`` of that ``task`` so the span clears only when that run ends, not
+    when child tools end. Open/close uses ``relay.task_span`` helpers.
+
+    **Tool ↔ thinking link (`thinkingStepId`)** — Each tool run gets a thinking-row
+    id (``tool_step_ids[run_id]``, emitted as ``data-thinking-step`` ``data.id``).
+    ``tool_activity_metadata`` supplies ``metadata`` for ``tool-input-start`` /
+    ``tool-input-available`` (``handlers.tool_start``) and
+    ``tool-output-available`` (``handlers.tool_end``).
+    """
+
+    accumulated_text: str = ""
+    current_text_id: str | None = None
+    thinking_step_counter: int = 0
+    tool_step_ids: dict[str, str] = field(default_factory=dict)
+    completed_step_ids: set[str] = field(default_factory=set)
+    last_active_step_id: str | None = None
+    last_active_step_title: str = ""
+    last_active_step_items: list[str] = field(default_factory=list)
+    just_finished_tool: bool = False
+    active_tool_depth: int = 0
+    called_update_memory: bool = False
+    current_reasoning_id: str | None = None
+    pending_tool_call_chunks: list[dict[str, Any]] = field(default_factory=list)
+    lc_tool_call_id_by_run: dict[str, str] = field(default_factory=dict)
+    file_path_by_run: dict[str, str] = field(default_factory=dict)
+    index_to_meta: dict[int, dict[str, str]] = field(default_factory=dict)
+    ui_tool_call_id_by_run: dict[str, str] = field(default_factory=dict)
+    current_lc_tool_call_id: dict[str, str | None] = field(
+        default_factory=lambda: {"value": None}
+    )
+    # Open ``task`` delegation span (one id shared by nested activity); unset outside.
+    active_span_id: str | None = None
+    active_task_run_id: str | None = None
+    # Span id minted when a ``task`` tool_call_chunk registers (before ``on_tool_start``).
+    pending_task_span_by_lc: dict[str, str] = field(default_factory=dict)
+
+    def span_metadata_if_active(self) -> dict[str, Any] | None:
+        """``{"spanId": ...}`` when a span is active; ``None`` otherwise."""
+        if self.active_span_id:
+            return {"spanId": self.active_span_id}
+        return None
+
+    def tool_activity_metadata(
+        self, *, thinking_step_id: str | None
+    ) -> dict[str, Any] | None:
+        """Build ``metadata`` for tool SSE and ``tool-call`` persistence.
+
+        Contract (keys omitted when not applicable):
+
+        - ``spanId`` (str): present while a task-delegation span is active
+          (same value as ``span_metadata_if_active()``).
+        - ``thinkingStepId`` (str): equals the thinking-step row ``id`` for this
+          tool (``data-thinking-step`` payload ``data.id`` on the wire).
+
+        Returns ``None`` if neither applies. Whitespace-only
+        ``thinking_step_id`` is ignored.
+        """
+        out: dict[str, Any] = {}
+        if self.active_span_id:
+            out["spanId"] = self.active_span_id
+        tid = (thinking_step_id or "").strip()
+        if tid:
+            out["thinkingStepId"] = tid
+        return out if out else None
+
+    @classmethod
+    def for_invocation(
+        cls,
+        *,
+        initial_step_id: str | None = None,
+        initial_step_title: str = "",
+        initial_step_items: list[str] | None = None,
+    ) -> AgentEventRelayState:
+        counter = 1 if initial_step_id else 0
+        return cls(
+            thinking_step_counter=counter,
+            last_active_step_id=initial_step_id,
+            last_active_step_title=initial_step_title,
+            last_active_step_items=list(initial_step_items or []),
+        )
+
+    def next_thinking_step_id(self, step_prefix: str) -> str:
+        self.thinking_step_counter += 1
+        return f"{step_prefix}-{self.thinking_step_counter}"
diff --git a/surfsense_backend/app/tasks/chat/streaming/relay/task_span.py b/surfsense_backend/app/tasks/chat/streaming/relay/task_span.py
new file mode 100644
index 0000000000..c4cdf24ba5
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/relay/task_span.py
@@ -0,0 +1,74 @@
+"""Open/close ``active_span_id`` around a delegating ``task`` tool run."""
+
+from __future__ import annotations
+
+import uuid
+
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+
+
+def new_span_id() -> str:
+    """One delegation-episode id (shared by activity under an open ``task``)."""
+    return f"spn_{uuid.uuid4().hex}"
+
+
+def _run_key(run_id: str) -> str:
+    return (run_id or "").strip()
+
+
+def _lc_key(langchain_tool_call_id: str | None) -> str:
+    return (langchain_tool_call_id or "").strip()
+
+
+def ensure_pending_task_span_for_lc(state: AgentEventRelayState, lc_id: str) -> str:
+    """Return span id for this LangChain tool call id, storing it in ``pending`` if new.
+
+    Used from ``chat_model_stream`` when the first ``task`` chunk registers so
+    early ``tool-input-start`` can carry ``metadata.spanId`` before ``on_tool_start``.
+    """
+    key = _lc_key(lc_id)
+    if not key:
+        return new_span_id()
+    existing = state.pending_task_span_by_lc.get(key)
+    if existing:
+        return existing
+    sid = new_span_id()
+    state.pending_task_span_by_lc[key] = sid
+    return sid
+
+
+def open_task_span(
+    state: AgentEventRelayState,
+    *,
+    run_id: str,
+    langchain_tool_call_id: str | None = None,
+) -> str:
+    """Set ``active_span_id`` from pending (same lc) or mint; remember ``active_task_run_id``.
+
+    Call when the ``task`` tool **starts**. Nested ``task`` is not supported:
+    a second call replaces the previous span without restoring it.
+    """
+    key = _lc_key(langchain_tool_call_id)
+    sid: str | None = state.pending_task_span_by_lc.pop(key, None) if key else None
+    if not sid:
+        sid = new_span_id()
+    state.active_span_id = sid
+    state.active_task_run_id = _run_key(run_id) or None
+    return sid
+
+
+def clear_task_span_if_delegating_task_ended(
+    state: AgentEventRelayState,
+    *,
+    tool_name: str,
+    run_id: str,
+) -> None:
+    """Clear span state only when this event is the end of the opening ``task`` run."""
+    if tool_name != "task":
+        return
+    if state.active_task_run_id is None:
+        return
+    if state.active_task_run_id != _run_key(run_id):
+        return
+    state.active_span_id = None
+    state.active_task_run_id = None
diff --git a/surfsense_backend/app/tasks/chat/streaming/relay/thinking_step_completion.py b/surfsense_backend/app/tasks/chat/streaming/relay/thinking_step_completion.py
new file mode 100644
index 0000000000..ad09303410
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/relay/thinking_step_completion.py
@@ -0,0 +1,34 @@
+"""Close the in-progress thinking step with a completed status frame."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from .state import AgentEventRelayState
+from .thinking_step_sse import emit_thinking_step_frame
+
+
+def complete_active_thinking_step(
+    *,
+    state: AgentEventRelayState,
+    streaming_service: Any,
+    content_builder: Any | None,
+    last_active_step_id: str | None,
+    last_active_step_title: str,
+    last_active_step_items: list[str],
+    completed_step_ids: set[str],
+) -> tuple[str | None, str | None]:
+    """Emit a completed thinking-step frame once; return (frame or None, next active step id)."""
+    if last_active_step_id and last_active_step_id not in completed_step_ids:
+        completed_step_ids.add(last_active_step_id)
+        event = emit_thinking_step_frame(
+            streaming_service=streaming_service,
+            content_builder=content_builder,
+            step_id=last_active_step_id,
+            title=last_active_step_title,
+            status="completed",
+            items=last_active_step_items if last_active_step_items else None,
+            metadata=state.span_metadata_if_active(),
+        )
+        return event, None
+    return None, last_active_step_id
diff --git a/surfsense_backend/app/tasks/chat/streaming/relay/thinking_step_sse.py b/surfsense_backend/app/tasks/chat/streaming/relay/thinking_step_sse.py
new file mode 100644
index 0000000000..6737f536bd
--- /dev/null
+++ b/surfsense_backend/app/tasks/chat/streaming/relay/thinking_step_sse.py
@@ -0,0 +1,28 @@
+"""Thinking-step SSE plus optional content-builder updates."""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def emit_thinking_step_frame(
+    *,
+    streaming_service: Any,
+    content_builder: Any | None,
+    step_id: str,
+    title: str,
+    status: str = "in_progress",
+    items: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
+) -> str:
+    if content_builder is not None:
+        content_builder.on_thinking_step(
+            step_id, title, status, items, metadata=metadata
+        )
+    return streaming_service.format_thinking_step(
+        step_id=step_id,
+        title=title,
+        status=status,
+        items=items,
+        metadata=metadata,
+    )
diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py
index 1b032d54a9..218f210664 100644
--- a/surfsense_backend/app/tasks/connector_indexers/__init__.py
+++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py
@@ -1,77 +1,29 @@
 """
 Connector indexers module for background tasks.
 
-This module provides a collection of connector indexers for different platforms
-and services. Each indexer is responsible for handling the indexing of content
-from a specific connector type.
-
-Available indexers:
-- Slack: Index messages from Slack channels
-- Notion: Index pages from Notion workspaces
-- GitHub: Index repositories and files from GitHub
-- Linear: Index issues from Linear workspaces
-- Jira: Index issues from Jira projects
-- Confluence: Index pages from Confluence spaces
-- BookStack: Index pages from BookStack wiki instances
-- Discord: Index messages from Discord servers
-- ClickUp: Index tasks from ClickUp workspaces
-- Google Gmail: Index messages from Google Gmail
-- Google Calendar: Index events from Google Calendar
-- Luma: Index events from Luma
-- Webcrawler: Index crawled URLs
-- Elasticsearch: Index documents from Elasticsearch instances
+Each indexer handles content indexing from a specific connector type.
+Live connectors (Slack, Linear, Jira, ClickUp, Airtable, Discord, Teams,
+Luma) now use real-time agent tools instead of background indexing.
 """
 
-# Communication platforms
-# Calendar and scheduling
-from .airtable_indexer import index_airtable_records
 from .bookstack_indexer import index_bookstack_pages
-
-# Note: composio_indexer is imported directly in connector_tasks.py to avoid circular imports
-from .clickup_indexer import index_clickup_tasks
 from .confluence_indexer import index_confluence_pages
-from .discord_indexer import index_discord_messages
-
-# Development platforms
 from .elasticsearch_indexer import index_elasticsearch_documents
 from .github_indexer import index_github_repos
 from .google_calendar_indexer import index_google_calendar_events
 from .google_drive_indexer import index_google_drive_files
 from .google_gmail_indexer import index_google_gmail_messages
-from .jira_indexer import index_jira_issues
-
-# Issue tracking and project management
-from .linear_indexer import index_linear_issues
-
-# Documentation and knowledge management
-from .luma_indexer import index_luma_events
 from .notion_indexer import index_notion_pages
-from .obsidian_indexer import index_obsidian_vault
-from .slack_indexer import index_slack_messages
 from .webcrawler_indexer import index_crawled_urls
 
-__all__ = [  # noqa: RUF022
-    "index_airtable_records",
+__all__ = [
     "index_bookstack_pages",
-    # "index_composio_connector",  # Imported directly in connector_tasks.py to avoid circular imports
-    "index_clickup_tasks",
     "index_confluence_pages",
-    "index_discord_messages",
-    # Development platforms
+    "index_crawled_urls",
     "index_elasticsearch_documents",
     "index_github_repos",
-    # Calendar and scheduling
     "index_google_calendar_events",
     "index_google_drive_files",
-    "index_luma_events",
-    "index_jira_issues",
-    # Issue tracking and project management
-    "index_linear_issues",
-    # Documentation and knowledge management
-    "index_notion_pages",
-    "index_obsidian_vault",
-    "index_crawled_urls",
-    # Communication platforms
-    "index_slack_messages",
     "index_google_gmail_messages",
+    "index_notion_pages",
 ]
diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
index 4a49944c25..9f8c1a33a3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@@ -164,6 +164,7 @@ async def _download_files_parallel(
     enable_summary: bool,
     max_concurrency: int = 3,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[list[ConnectorDocument], int]:
     """Download and ETL files in parallel. Returns (docs, failed_count)."""
     results: list[ConnectorDocument] = []
@@ -176,7 +177,7 @@ async def _download_one(file: dict) -> ConnectorDocument | None:
         nonlocal last_heartbeat, completed_count
         async with sem:
             markdown, db_metadata, error = await download_and_extract_content(
-                dropbox_client, file
+                dropbox_client, file, vision_llm=vision_llm
             )
             if error or not markdown:
                 file_name = file.get("name", "Unknown")
@@ -224,6 +225,7 @@ async def _download_and_index(
     user_id: str,
     enable_summary: bool,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[int, int]:
     """Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
     connector_docs, download_failed = await _download_files_parallel(
@@ -234,6 +236,7 @@ async def _download_and_index(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat,
+        vision_llm=vision_llm,
     )
 
     batch_indexed = 0
@@ -287,6 +290,7 @@ async def _index_with_delta_sync(
     max_files: int,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
+    vision_llm=None,
 ) -> tuple[int, int, int, str]:
     """Delta sync using Dropbox cursor-based change tracking.
 
@@ -359,6 +363,7 @@ async def _index_with_delta_sync(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat_callback,
+        vision_llm=vision_llm,
     )
 
     indexed = renamed_count + batch_indexed
@@ -384,6 +389,7 @@ async def _index_full_scan(
     incremental_sync: bool = True,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
+    vision_llm=None,
 ) -> tuple[int, int, int]:
     """Full scan indexing of a folder.
 
@@ -469,6 +475,7 @@ async def _index_full_scan(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat_callback,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -498,6 +505,7 @@ async def _index_selected_files(
     enable_summary: bool,
     incremental_sync: bool = True,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[int, int, int, list[str]]:
     """Index user-selected files using the parallel pipeline."""
     page_limit_service = PageLimitService(session)
@@ -557,6 +565,7 @@ async def _index_selected_files(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -621,6 +630,13 @@ async def index_dropbox_files(
             return 0, 0, error_msg, 0
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
+        connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
+        vision_llm = None
+        if connector_enable_vision_llm:
+            from app.services.llm_service import get_vision_llm
+
+            vision_llm = await get_vision_llm(session, search_space_id)
+
         dropbox_client = DropboxClient(session, connector_id)
 
         indexing_options = items_dict.get("indexing_options", {})
@@ -650,6 +666,7 @@ async def index_dropbox_files(
                 user_id=user_id,
                 enable_summary=connector_enable_summary,
                 incremental_sync=incremental_sync,
+                vision_llm=vision_llm,
             )
             total_indexed += indexed
             total_skipped += skipped
@@ -684,6 +701,7 @@ async def index_dropbox_files(
                     log_entry,
                     max_files,
                     enable_summary=connector_enable_summary,
+                    vision_llm=vision_llm,
                 )
                 folder_cursors[folder_path] = new_cursor
                 total_unsupported += unsup
@@ -703,6 +721,7 @@ async def index_dropbox_files(
                     include_subfolders,
                     incremental_sync=incremental_sync,
                     enable_summary=connector_enable_summary,
+                    vision_llm=vision_llm,
                 )
                 total_unsupported += unsup
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index 6912ffe5af..3c9f273034 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -20,12 +20,10 @@
     IndexingPipelineService,
     PlaceholderInfo,
 )
+from app.services.composio_service import ComposioService
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.google_credentials import (
-    COMPOSIO_GOOGLE_CONNECTOR_TYPES,
-    build_composio_credentials,
-)
+from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
 
 from .base import (
     check_duplicate_document_by_hash,
@@ -44,6 +42,10 @@
 HEARTBEAT_INTERVAL_SECONDS = 30
 
 
+def _format_calendar_event_to_markdown(event: dict) -> str:
+    return GoogleCalendarConnector.format_event_to_markdown(None, event)
+
+
 def _build_connector_doc(
     event: dict,
     event_markdown: str,
@@ -150,7 +152,14 @@ async def index_google_calendar_events(
             )
             return 0, 0, f"Connector with ID {connector_id} not found"
 
-        # ── Credential building ───────────────────────────────────────
+        is_composio_connector = (
+            connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES
+        )
+        calendar_client = None
+        composio_service = None
+        connected_account_id = None
+
+        # ── Credential/client building ────────────────────────────────
         if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
             connected_account_id = connector.config.get("composio_connected_account_id")
             if not connected_account_id:
@@ -161,7 +170,7 @@ async def index_google_calendar_events(
                     {"error_type": "MissingComposioAccount"},
                 )
                 return 0, 0, "Composio connected_account_id not found"
-            credentials = build_composio_credentials(connected_account_id)
+            composio_service = ComposioService()
         else:
             config_data = connector.config
 
@@ -229,12 +238,13 @@ async def index_google_calendar_events(
             {"stage": "client_initialization"},
         )
 
-        calendar_client = GoogleCalendarConnector(
-            credentials=credentials,
-            session=session,
-            user_id=user_id,
-            connector_id=connector_id,
-        )
+        if not is_composio_connector:
+            calendar_client = GoogleCalendarConnector(
+                credentials=credentials,
+                session=session,
+                user_id=user_id,
+                connector_id=connector_id,
+            )
 
         # Handle 'undefined' string from frontend (treat as None)
         if start_date == "undefined" or start_date == "":
@@ -300,9 +310,26 @@ async def index_google_calendar_events(
         )
 
         try:
-            events, error = await calendar_client.get_all_primary_calendar_events(
-                start_date=start_date_str, end_date=end_date_str
-            )
+            if is_composio_connector:
+                start_dt = parse_date_flexible(start_date_str).replace(
+                    hour=0, minute=0, second=0, microsecond=0
+                )
+                end_dt = parse_date_flexible(end_date_str).replace(
+                    hour=23, minute=59, second=59, microsecond=0
+                )
+                events, error = await composio_service.get_calendar_events(
+                    connected_account_id=connected_account_id,
+                    entity_id=f"surfsense_{user_id}",
+                    time_min=start_dt.isoformat(),
+                    time_max=end_dt.isoformat(),
+                    max_results=250,
+                )
+                if not events and not error:
+                    error = "No events found in the specified date range."
+            else:
+                events, error = await calendar_client.get_all_primary_calendar_events(
+                    start_date=start_date_str, end_date=end_date_str
+                )
 
             if error:
                 if "No events found" in error:
@@ -381,7 +408,7 @@ async def index_google_calendar_events(
                     documents_skipped += 1
                     continue
 
-                event_markdown = calendar_client.format_event_to_markdown(event)
+                event_markdown = _format_calendar_event_to_markdown(event)
                 if not event_markdown.strip():
                     logger.warning(f"Skipping event with no content: {event_summary}")
                     documents_skipped += 1
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index b11087fe6c..686f13d9e4 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -9,6 +9,8 @@
 import logging
 import time
 from collections.abc import Awaitable, Callable
+from pathlib import Path
+from typing import Any
 
 from sqlalchemy import String, cast, select
 from sqlalchemy.exc import SQLAlchemyError
@@ -37,6 +39,7 @@
     IndexingPipelineService,
     PlaceholderInfo,
 )
+from app.services.composio_service import ComposioService
 from app.services.llm_service import get_user_long_context_llm
 from app.services.page_limit_service import PageLimitService
 from app.services.task_logging_service import TaskLoggingService
@@ -45,10 +48,7 @@
     get_connector_by_id,
     update_connector_last_indexed,
 )
-from app.utils.google_credentials import (
-    COMPOSIO_GOOGLE_CONNECTOR_TYPES,
-    build_composio_credentials,
-)
+from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
 
 ACCEPTED_DRIVE_CONNECTOR_TYPES = {
     SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR,
@@ -61,6 +61,209 @@
 logger = logging.getLogger(__name__)
 
 
+class ComposioDriveClient:
+    """Google Drive client facade backed by Composio tool execution.
+
+    Composio-managed OAuth connections can execute tools without exposing raw
+    OAuth tokens through connected account state.
+    """
+
+    def __init__(
+        self,
+        session: AsyncSession,
+        connector_id: int,
+        connected_account_id: str,
+        entity_id: str,
+    ):
+        self.session = session
+        self.connector_id = connector_id
+        self.connected_account_id = connected_account_id
+        self.entity_id = entity_id
+        self.composio = ComposioService()
+
+    async def list_files(
+        self,
+        query: str = "",
+        fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, owners, createdTime, description)",
+        page_size: int = 100,
+        page_token: str | None = None,
+    ) -> tuple[list[dict[str, Any]], str | None, str | None]:
+        params: dict[str, Any] = {
+            "page_size": min(page_size, 100),
+            "fields": fields,
+        }
+        if query:
+            params["q"] = query
+        if page_token:
+            params["page_token"] = page_token
+
+        result = await self.composio.execute_tool(
+            connected_account_id=self.connected_account_id,
+            tool_name="GOOGLEDRIVE_LIST_FILES",
+            params=params,
+            entity_id=self.entity_id,
+        )
+        if not result.get("success"):
+            return [], None, result.get("error", "Unknown error")
+
+        data = result.get("data", {})
+        files = []
+        next_token = None
+        if isinstance(data, dict):
+            inner_data = data.get("data", data)
+            if isinstance(inner_data, dict):
+                files = inner_data.get("files", [])
+                next_token = inner_data.get("nextPageToken") or inner_data.get(
+                    "next_page_token"
+                )
+        elif isinstance(data, list):
+            files = data
+
+        return files, next_token, None
+
+    async def get_file_metadata(
+        self, file_id: str, fields: str = "*"
+    ) -> tuple[dict[str, Any] | None, str | None]:
+        result = await self.composio.execute_tool(
+            connected_account_id=self.connected_account_id,
+            tool_name="GOOGLEDRIVE_GET_FILE_METADATA",
+            params={"file_id": file_id, "fields": fields},
+            entity_id=self.entity_id,
+        )
+        if not result.get("success"):
+            return None, result.get("error", "Unknown error")
+
+        data = result.get("data", {})
+        if isinstance(data, dict):
+            inner_data = data.get("data", data)
+            if isinstance(inner_data, dict):
+                return inner_data, None
+
+        return None, "Could not extract metadata from Composio response"
+
+    async def download_file(self, file_id: str) -> tuple[bytes | None, str | None]:
+        return await self._download_file_content(file_id)
+
+    async def download_file_to_disk(
+        self,
+        file_id: str,
+        dest_path: str,
+        chunksize: int = 5 * 1024 * 1024,
+    ) -> str | None:
+        del chunksize
+        content, error = await self.download_file(file_id)
+        if error:
+            return error
+        if content is None:
+            return "No content returned from Composio"
+        Path(dest_path).write_bytes(content)
+        return None
+
+    async def export_google_file(
+        self, file_id: str, mime_type: str
+    ) -> tuple[bytes | None, str | None]:
+        return await self._download_file_content(file_id, mime_type=mime_type)
+
+    async def _download_file_content(
+        self, file_id: str, mime_type: str | None = None
+    ) -> tuple[bytes | None, str | None]:
+        params: dict[str, Any] = {"file_id": file_id}
+        if mime_type:
+            params["mime_type"] = mime_type
+
+        result = await self.composio.execute_tool(
+            connected_account_id=self.connected_account_id,
+            tool_name="GOOGLEDRIVE_DOWNLOAD_FILE",
+            params=params,
+            entity_id=self.entity_id,
+        )
+        if not result.get("success"):
+            return None, result.get("error", "Unknown error")
+
+        return self._read_download_result(result.get("data"))
+
+    def _read_download_result(self, data: Any) -> tuple[bytes | None, str | None]:
+        if isinstance(data, bytes):
+            return data, None
+
+        file_path: str | None = None
+        if isinstance(data, str):
+            file_path = data
+        elif isinstance(data, dict):
+            inner_data = data.get("data", data)
+            if isinstance(inner_data, dict):
+                for key in ("file_path", "downloaded_file_content", "path", "uri"):
+                    value = inner_data.get(key)
+                    if isinstance(value, str):
+                        file_path = value
+                        break
+                    if isinstance(value, dict):
+                        nested = (
+                            value.get("file_path")
+                            or value.get("downloaded_file_content")
+                            or value.get("path")
+                            or value.get("uri")
+                            or value.get("s3url")
+                        )
+                        if isinstance(nested, str):
+                            file_path = nested
+                            break
+
+        if not file_path:
+            return None, "No file path/content returned from Composio"
+
+        if file_path.startswith(("http://", "https://")):
+            try:
+                import urllib.request
+
+                with urllib.request.urlopen(file_path, timeout=60) as response:
+                    return response.read(), None
+            except Exception as e:
+                return None, f"Failed to download Composio file URL: {e!s}"
+
+        path_obj = Path(file_path)
+        if path_obj.is_absolute() or ".composio" in str(path_obj):
+            if not path_obj.exists():
+                return None, f"File not found at path: {file_path}"
+            return path_obj.read_bytes(), None
+
+        try:
+            import base64
+
+            return base64.b64decode(file_path), None
+        except Exception:
+            return file_path.encode("utf-8"), None
+
+
+def _build_drive_client_for_connector(
+    session: AsyncSession,
+    connector_id: int,
+    connector: object,
+    user_id: str,
+) -> tuple[GoogleDriveClient | ComposioDriveClient | None, str | None]:
+    if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
+        connected_account_id = connector.config.get("composio_connected_account_id")
+        if not connected_account_id:
+            return None, (
+                f"Composio connected_account_id not found for connector {connector_id}"
+            )
+        return (
+            ComposioDriveClient(
+                session,
+                connector_id,
+                connected_account_id,
+                entity_id=f"surfsense_{user_id}",
+            ),
+            None,
+        )
+
+    token_encrypted = connector.config.get("_token_encrypted", False)
+    if token_encrypted and not config.SECRET_KEY:
+        return None, "SECRET_KEY not configured but credentials are marked as encrypted"
+
+    return GoogleDriveClient(session, connector_id), None
+
+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -261,6 +464,7 @@ async def _download_files_parallel(
     enable_summary: bool,
     max_concurrency: int = 3,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[list[ConnectorDocument], int]:
     """Download and ETL files in parallel, returning ConnectorDocuments.
 
@@ -276,7 +480,7 @@ async def _download_one(file: dict) -> ConnectorDocument | None:
         nonlocal last_heartbeat, completed_count
         async with sem:
             markdown, drive_metadata, error = await download_and_extract_content(
-                drive_client, file
+                drive_client, file, vision_llm=vision_llm
             )
             if error or not markdown:
                 file_name = file.get("name", "Unknown")
@@ -322,6 +526,7 @@ async def _process_single_file(
     search_space_id: int,
     user_id: str,
     enable_summary: bool = True,
+    vision_llm=None,
 ) -> tuple[int, int, int]:
     """Download, extract, and index a single Drive file via the pipeline.
 
@@ -343,7 +548,7 @@ async def _process_single_file(
         await page_limit_service.check_page_limit(user_id, estimated_pages)
 
         markdown, drive_metadata, error = await download_and_extract_content(
-            drive_client, file
+            drive_client, file, vision_llm=vision_llm
         )
         if error or not markdown:
             logger.warning(f"ETL failed for {file_name}: {error}")
@@ -433,6 +638,7 @@ async def _download_and_index(
     user_id: str,
     enable_summary: bool,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[int, int]:
     """Phase 2+3: parallel download then parallel indexing.
 
@@ -446,6 +652,7 @@ async def _download_and_index(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat,
+        vision_llm=vision_llm,
     )
 
     batch_indexed = 0
@@ -476,6 +683,7 @@ async def _index_selected_files(
     user_id: str,
     enable_summary: bool,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[int, int, int, list[str]]:
     """Index user-selected files using the parallel pipeline.
 
@@ -540,6 +748,7 @@ async def _index_selected_files(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -573,6 +782,7 @@ async def _index_full_scan(
     include_subfolders: bool = False,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
+    vision_llm=None,
 ) -> tuple[int, int, int]:
     """Full scan indexing of a folder.
 
@@ -703,6 +913,7 @@ async def _index_full_scan(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat_callback,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -736,6 +947,7 @@ async def _index_with_delta_sync(
     include_subfolders: bool = False,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
+    vision_llm=None,
 ) -> tuple[int, int, int]:
     """Delta sync using change tracking.
 
@@ -844,6 +1056,7 @@ async def _index_with_delta_sync(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat_callback,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -917,40 +1130,25 @@ async def index_google_drive_files(
             {"stage": "client_initialization"},
         )
 
-        pre_built_credentials = None
-        if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
-            connected_account_id = connector.config.get("composio_connected_account_id")
-            if not connected_account_id:
-                error_msg = f"Composio connected_account_id not found for connector {connector_id}"
-                await task_logger.log_task_failure(
-                    log_entry,
-                    error_msg,
-                    "Missing Composio account",
-                    {"error_type": "MissingComposioAccount"},
-                )
-                return 0, 0, error_msg, 0
-            pre_built_credentials = build_composio_credentials(connected_account_id)
-        else:
-            token_encrypted = connector.config.get("_token_encrypted", False)
-            if token_encrypted and not config.SECRET_KEY:
-                await task_logger.log_task_failure(
-                    log_entry,
-                    "SECRET_KEY not configured but credentials are encrypted",
-                    "Missing SECRET_KEY",
-                    {"error_type": "MissingSecretKey"},
-                )
-                return (
-                    0,
-                    0,
-                    "SECRET_KEY not configured but credentials are marked as encrypted",
-                    0,
-                )
+        drive_client, client_error = _build_drive_client_for_connector(
+            session, connector_id, connector, user_id
+        )
+        if client_error or not drive_client:
+            await task_logger.log_task_failure(
+                log_entry,
+                client_error or "Failed to initialize Google Drive client",
+                "Missing connector credentials",
+                {"error_type": "ClientInitializationError"},
+            )
+            return 0, 0, client_error, 0
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
-        drive_client = GoogleDriveClient(
-            session, connector_id, credentials=pre_built_credentials
-        )
+        connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
+        vision_llm = None
+        if connector_enable_vision_llm:
+            from app.services.llm_service import get_vision_llm
 
+            vision_llm = await get_vision_llm(session, search_space_id)
         if not folder_id:
             error_msg = "folder_id is required for Google Drive indexing"
             await task_logger.log_task_failure(
@@ -963,8 +1161,14 @@ async def index_google_drive_files(
 
         folder_tokens = connector.config.get("folder_tokens", {})
         start_page_token = folder_tokens.get(target_folder_id)
+        is_composio_connector = (
+            connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES
+        )
         can_use_delta = (
-            use_delta_sync and start_page_token and connector.last_indexed_at
+            not is_composio_connector
+            and use_delta_sync
+            and start_page_token
+            and connector.last_indexed_at
         )
 
         documents_unsupported = 0
@@ -986,6 +1190,7 @@ async def index_google_drive_files(
                 include_subfolders,
                 on_heartbeat_callback,
                 connector_enable_summary,
+                vision_llm=vision_llm,
             )
             documents_unsupported += du
             logger.info("Running reconciliation scan after delta sync")
@@ -1004,6 +1209,7 @@ async def index_google_drive_files(
                 include_subfolders,
                 on_heartbeat_callback,
                 connector_enable_summary,
+                vision_llm=vision_llm,
             )
             documents_indexed += ri
             documents_skipped += rs
@@ -1029,10 +1235,20 @@ async def index_google_drive_files(
                 include_subfolders,
                 on_heartbeat_callback,
                 connector_enable_summary,
+                vision_llm=vision_llm,
             )
 
         if documents_indexed > 0 or can_use_delta:
-            new_token, token_error = await get_start_page_token(drive_client)
+            if isinstance(drive_client, ComposioDriveClient):
+                (
+                    new_token,
+                    token_error,
+                ) = await drive_client.composio.get_drive_start_page_token(
+                    drive_client.connected_account_id,
+                    drive_client.entity_id,
+                )
+            else:
+                new_token, token_error = await get_start_page_token(drive_client)
             if new_token and not token_error:
                 await session.refresh(connector)
                 if "folder_tokens" not in connector.config:
@@ -1118,38 +1334,25 @@ async def index_google_drive_single_file(
             )
             return 0, error_msg
 
-        pre_built_credentials = None
-        if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
-            connected_account_id = connector.config.get("composio_connected_account_id")
-            if not connected_account_id:
-                error_msg = f"Composio connected_account_id not found for connector {connector_id}"
-                await task_logger.log_task_failure(
-                    log_entry,
-                    error_msg,
-                    "Missing Composio account",
-                    {"error_type": "MissingComposioAccount"},
-                )
-                return 0, error_msg
-            pre_built_credentials = build_composio_credentials(connected_account_id)
-        else:
-            token_encrypted = connector.config.get("_token_encrypted", False)
-            if token_encrypted and not config.SECRET_KEY:
-                await task_logger.log_task_failure(
-                    log_entry,
-                    "SECRET_KEY not configured but credentials are encrypted",
-                    "Missing SECRET_KEY",
-                    {"error_type": "MissingSecretKey"},
-                )
-                return (
-                    0,
-                    "SECRET_KEY not configured but credentials are marked as encrypted",
-                )
+        drive_client, client_error = _build_drive_client_for_connector(
+            session, connector_id, connector, user_id
+        )
+        if client_error or not drive_client:
+            await task_logger.log_task_failure(
+                log_entry,
+                client_error or "Failed to initialize Google Drive client",
+                "Missing connector credentials",
+                {"error_type": "ClientInitializationError"},
+            )
+            return 0, client_error
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
-        drive_client = GoogleDriveClient(
-            session, connector_id, credentials=pre_built_credentials
-        )
+        connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
+        vision_llm = None
+        if connector_enable_vision_llm:
+            from app.services.llm_service import get_vision_llm
 
+            vision_llm = await get_vision_llm(session, search_space_id)
         file, error = await get_file_by_id(drive_client, file_id)
         if error or not file:
             error_msg = f"Failed to fetch file {file_id}: {error or 'File not found'}"
@@ -1168,6 +1371,7 @@ async def index_google_drive_single_file(
             search_space_id,
             user_id,
             connector_enable_summary,
+            vision_llm=vision_llm,
         )
         await session.commit()
 
@@ -1250,38 +1454,26 @@ async def index_google_drive_selected_files(
             )
             return 0, 0, [error_msg]
 
-        pre_built_credentials = None
-        if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
-            connected_account_id = connector.config.get("composio_connected_account_id")
-            if not connected_account_id:
-                error_msg = f"Composio connected_account_id not found for connector {connector_id}"
-                await task_logger.log_task_failure(
-                    log_entry,
-                    error_msg,
-                    "Missing Composio account",
-                    {"error_type": "MissingComposioAccount"},
-                )
-                return 0, 0, [error_msg]
-            pre_built_credentials = build_composio_credentials(connected_account_id)
-        else:
-            token_encrypted = connector.config.get("_token_encrypted", False)
-            if token_encrypted and not config.SECRET_KEY:
-                error_msg = (
-                    "SECRET_KEY not configured but credentials are marked as encrypted"
-                )
-                await task_logger.log_task_failure(
-                    log_entry,
-                    error_msg,
-                    "Missing SECRET_KEY",
-                    {"error_type": "MissingSecretKey"},
-                )
-                return 0, 0, [error_msg]
+        drive_client, client_error = _build_drive_client_for_connector(
+            session, connector_id, connector, user_id
+        )
+        if client_error or not drive_client:
+            error_msg = client_error or "Failed to initialize Google Drive client"
+            await task_logger.log_task_failure(
+                log_entry,
+                error_msg,
+                "Missing connector credentials",
+                {"error_type": "ClientInitializationError"},
+            )
+            return 0, 0, [error_msg]
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
-        drive_client = GoogleDriveClient(
-            session, connector_id, credentials=pre_built_credentials
-        )
+        connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
+        vision_llm = None
+        if connector_enable_vision_llm:
+            from app.services.llm_service import get_vision_llm
 
+            vision_llm = await get_vision_llm(session, search_space_id)
         indexed, skipped, unsupported, errors = await _index_selected_files(
             drive_client,
             session,
@@ -1291,6 +1483,7 @@ async def index_google_drive_selected_files(
             user_id=user_id,
             enable_summary=connector_enable_summary,
             on_heartbeat=on_heartbeat_callback,
+            vision_llm=vision_llm,
         )
 
         if unsupported > 0:
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
index ef226087ba..6697c0eb11 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@@ -20,12 +20,10 @@
     IndexingPipelineService,
     PlaceholderInfo,
 )
+from app.services.composio_service import ComposioService
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.google_credentials import (
-    COMPOSIO_GOOGLE_CONNECTOR_TYPES,
-    build_composio_credentials,
-)
+from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
 
 from .base import (
     calculate_date_range,
@@ -44,6 +42,62 @@
 HEARTBEAT_INTERVAL_SECONDS = 30
 
 
+def _normalize_composio_gmail_message(message: dict) -> dict:
+    if message.get("payload"):
+        return message
+
+    headers = []
+    header_values = {
+        "Subject": message.get("subject"),
+        "From": message.get("from") or message.get("sender"),
+        "To": message.get("to") or message.get("recipient"),
+        "Date": message.get("date"),
+    }
+    for name, value in header_values.items():
+        if value:
+            headers.append({"name": name, "value": value})
+
+    return {
+        **message,
+        "id": message.get("id")
+        or message.get("message_id")
+        or message.get("messageId"),
+        "threadId": message.get("threadId") or message.get("thread_id"),
+        "payload": {"headers": headers},
+        "snippet": message.get("snippet", ""),
+        "messageText": message.get("messageText") or message.get("body") or "",
+    }
+
+
+def _format_gmail_message_to_markdown(message: dict) -> str:
+    headers = {
+        header.get("name", "").lower(): header.get("value", "")
+        for header in message.get("payload", {}).get("headers", [])
+        if isinstance(header, dict)
+    }
+    subject = headers.get("subject", "No Subject")
+    from_email = headers.get("from", "Unknown Sender")
+    to_email = headers.get("to", "Unknown Recipient")
+    date_str = headers.get("date", "Unknown Date")
+    message_text = (
+        message.get("messageText")
+        or message.get("body")
+        or message.get("text")
+        or message.get("snippet", "")
+    )
+
+    return (
+        f"# {subject}\n\n"
+        f"**From:** {from_email}\n"
+        f"**To:** {to_email}\n"
+        f"**Date:** {date_str}\n\n"
+        f"## Message Content\n\n{message_text}\n\n"
+        f"## Message Details\n\n"
+        f"- **Message ID:** {message.get('id', 'Unknown')}\n"
+        f"- **Thread ID:** {message.get('threadId', 'Unknown')}\n"
+    )
+
+
 def _build_connector_doc(
     message: dict,
     markdown_content: str,
@@ -162,7 +216,14 @@ async def index_google_gmail_messages(
             )
             return 0, 0, error_msg
 
-        # ── Credential building ───────────────────────────────────────
+        is_composio_connector = (
+            connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES
+        )
+        gmail_connector = None
+        composio_service = None
+        connected_account_id = None
+
+        # ── Credential/client building ────────────────────────────────
         if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
             connected_account_id = connector.config.get("composio_connected_account_id")
             if not connected_account_id:
@@ -173,7 +234,7 @@ async def index_google_gmail_messages(
                     {"error_type": "MissingComposioAccount"},
                 )
                 return 0, 0, "Composio connected_account_id not found"
-            credentials = build_composio_credentials(connected_account_id)
+            composio_service = ComposioService()
         else:
             config_data = connector.config
 
@@ -241,9 +302,10 @@ async def index_google_gmail_messages(
             {"stage": "client_initialization"},
         )
 
-        gmail_connector = GoogleGmailConnector(
-            credentials, session, user_id, connector_id
-        )
+        if not is_composio_connector:
+            gmail_connector = GoogleGmailConnector(
+                credentials, session, user_id, connector_id
+            )
 
         calculated_start_date, calculated_end_date = calculate_date_range(
             connector, start_date, end_date, default_days_back=365
@@ -254,11 +316,60 @@ async def index_google_gmail_messages(
             f"Fetching emails for connector {connector_id} "
             f"from {calculated_start_date} to {calculated_end_date}"
         )
-        messages, error = await gmail_connector.get_recent_messages(
-            max_results=max_messages,
-            start_date=calculated_start_date,
-            end_date=calculated_end_date,
-        )
+        if is_composio_connector:
+            query_parts = []
+            if calculated_start_date:
+                query_parts.append(f"after:{calculated_start_date.replace('-', '/')}")
+            if calculated_end_date:
+                query_parts.append(f"before:{calculated_end_date.replace('-', '/')}")
+            query = " ".join(query_parts)
+
+            messages = []
+            page_token = None
+            error = None
+            while len(messages) < max_messages:
+                page_size = min(50, max_messages - len(messages))
+                (
+                    page_messages,
+                    page_token,
+                    _estimate,
+                    page_error,
+                ) = await composio_service.get_gmail_messages(
+                    connected_account_id=connected_account_id,
+                    entity_id=f"surfsense_{user_id}",
+                    query=query,
+                    max_results=page_size,
+                    page_token=page_token,
+                )
+                if page_error:
+                    error = page_error
+                    break
+                for page_message in page_messages:
+                    message_id = (
+                        page_message.get("id")
+                        or page_message.get("message_id")
+                        or page_message.get("messageId")
+                    )
+                    if message_id:
+                        (
+                            detail,
+                            detail_error,
+                        ) = await composio_service.get_gmail_message_detail(
+                            connected_account_id=connected_account_id,
+                            entity_id=f"surfsense_{user_id}",
+                            message_id=message_id,
+                        )
+                        if not detail_error and isinstance(detail, dict):
+                            page_message = detail
+                    messages.append(_normalize_composio_gmail_message(page_message))
+                if not page_token:
+                    break
+        else:
+            messages, error = await gmail_connector.get_recent_messages(
+                max_results=max_messages,
+                start_date=calculated_start_date,
+                end_date=calculated_end_date,
+            )
 
         if error:
             error_message = error
@@ -326,7 +437,12 @@ def _gmail_subject(msg: dict) -> str:
                     documents_skipped += 1
                     continue
 
-                markdown_content = gmail_connector.format_message_to_markdown(message)
+                if is_composio_connector:
+                    markdown_content = _format_gmail_message_to_markdown(message)
+                else:
+                    markdown_content = gmail_connector.format_message_to_markdown(
+                        message
+                    )
                 if not markdown_content.strip():
                     logger.warning(f"Skipping message with no content: {message_id}")
                     documents_skipped += 1
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index f503ff864e..9352b60e07 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -60,14 +60,16 @@ async def _check_page_limit_or_skip(
     page_limit_service: PageLimitService,
     user_id: str,
     file_path: str,
-) -> int:
+    page_multiplier: int = 1,
+) -> tuple[int, int]:
     """Estimate pages and check the limit; raises PageLimitExceededError if over quota.
 
-    Returns the estimated page count on success.
+    Returns (estimated_pages, billable_pages).
     """
     estimated = _estimate_pages_safe(page_limit_service, file_path)
-    await page_limit_service.check_page_limit(user_id, estimated)
-    return estimated
+    billable = estimated * page_multiplier
+    await page_limit_service.check_page_limit(user_id, billable)
+    return estimated, billable
 
 
 def _compute_final_pages(
@@ -153,17 +155,20 @@ def scan_folder(
     return files
 
 
-async def _read_file_content(file_path: str, filename: str) -> str:
+async def _read_file_content(
+    file_path: str, filename: str, *, vision_llm=None, processing_mode: str = "basic"
+) -> str:
     """Read file content via the unified ETL pipeline.
 
-    All file types (plaintext, audio, direct-convert, document) are handled
-    by ``EtlPipelineService``.
+    All file types (plaintext, audio, direct-convert, document, image) are
+    handled by ``EtlPipelineService``.
     """
-    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-    result = await EtlPipelineService().extract(
-        EtlRequest(file_path=file_path, filename=filename)
+    mode = ProcessingMode.coerce(processing_mode)
+    result = await EtlPipelineService(vision_llm=vision_llm).extract(
+        EtlRequest(file_path=file_path, filename=filename, processing_mode=mode)
     )
     return result.markdown_content
 
@@ -199,12 +204,17 @@ async def _compute_file_content_hash(
     file_path: str,
     filename: str,
     search_space_id: int,
+    *,
+    vision_llm=None,
+    processing_mode: str = "basic",
 ) -> tuple[str, str]:
     """Read a file (via ETL if needed) and compute its content hash.
 
     Returns (content_text, content_hash).
     """
-    content = await _read_file_content(file_path, filename)
+    content = await _read_file_content(
+        file_path, filename, vision_llm=vision_llm, processing_mode=processing_mode
+    )
     return content, _content_hash(content, search_space_id)
 
 
@@ -692,7 +702,7 @@ async def index_local_folder(
                         continue
 
                     try:
-                        estimated_pages = await _check_page_limit_or_skip(
+                        estimated_pages, _billable = await _check_page_limit_or_skip(
                             page_limit_service, user_id, file_path_abs
                         )
                     except PageLimitExceededError:
@@ -704,7 +714,9 @@ async def index_local_folder(
 
                     try:
                         content, content_hash = await _compute_file_content_hash(
-                            file_path_abs, file_info["relative_path"], search_space_id
+                            file_path_abs,
+                            file_info["relative_path"],
+                            search_space_id,
                         )
                     except Exception as read_err:
                         logger.warning(f"Could not read {file_path_abs}: {read_err}")
@@ -726,7 +738,7 @@ async def index_local_folder(
                     await create_version_snapshot(session, existing_document)
                 else:
                     try:
-                        estimated_pages = await _check_page_limit_or_skip(
+                        estimated_pages, _billable = await _check_page_limit_or_skip(
                             page_limit_service, user_id, file_path_abs
                         )
                     except PageLimitExceededError:
@@ -738,7 +750,9 @@ async def index_local_folder(
 
                     try:
                         content, content_hash = await _compute_file_content_hash(
-                            file_path_abs, file_info["relative_path"], search_space_id
+                            file_path_abs,
+                            file_info["relative_path"],
+                            search_space_id,
                         )
                     except Exception as read_err:
                         logger.warning(f"Could not read {file_path_abs}: {read_err}")
@@ -1074,7 +1088,7 @@ async def _index_single_file(
 
         page_limit_service = PageLimitService(session)
         try:
-            estimated_pages = await _check_page_limit_or_skip(
+            estimated_pages, _billable = await _check_page_limit_or_skip(
                 page_limit_service, user_id, str(full_path)
             )
         except PageLimitExceededError as e:
@@ -1264,6 +1278,8 @@ async def index_uploaded_files(
     enable_summary: bool,
     file_mappings: list[dict],
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
 ) -> tuple[int, int, str | None]:
     """Index files uploaded from the desktop app via temp paths.
 
@@ -1274,12 +1290,16 @@ async def index_uploaded_files(
 
     Returns ``(indexed_count, failed_count, error_summary_or_none)``.
     """
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    mode = ProcessingMode.coerce(processing_mode)
+
     task_logger = TaskLoggingService(session, search_space_id)
     log_entry = await task_logger.log_task_start(
         task_name="local_folder_indexing",
         source="uploaded_folder_indexing",
         message=f"Indexing {len(file_mappings)} uploaded file(s) for {folder_name}",
-        metadata={"file_count": len(file_mappings)},
+        metadata={"file_count": len(file_mappings), "processing_mode": mode.value},
     )
 
     try:
@@ -1300,6 +1320,12 @@ async def index_uploaded_files(
         pipeline = IndexingPipelineService(session)
         llm = await get_user_long_context_llm(session, user_id, search_space_id)
 
+        vision_llm_instance = None
+        if use_vision_llm:
+            from app.services.llm_service import get_vision_llm
+
+            vision_llm_instance = await get_vision_llm(session, search_space_id)
+
         indexed_count = 0
         failed_count = 0
         errors: list[str] = []
@@ -1337,8 +1363,11 @@ async def index_uploaded_files(
                         continue
 
                 try:
-                    estimated_pages = await _check_page_limit_or_skip(
-                        page_limit_service, user_id, temp_path
+                    estimated_pages, _billable_pages = await _check_page_limit_or_skip(
+                        page_limit_service,
+                        user_id,
+                        temp_path,
+                        page_multiplier=mode.page_multiplier,
                     )
                 except PageLimitExceededError:
                     logger.warning(f"Page limit exceeded, skipping: {relative_path}")
@@ -1347,7 +1376,11 @@ async def index_uploaded_files(
 
                 try:
                     content, content_hash = await _compute_file_content_hash(
-                        temp_path, filename, search_space_id
+                        temp_path,
+                        filename,
+                        search_space_id,
+                        vision_llm=vision_llm_instance,
+                        processing_mode=mode.value,
                     )
                 except Exception as e:
                     logger.warning(f"Could not read {relative_path}: {e}")
@@ -1413,8 +1446,9 @@ async def index_uploaded_files(
                     final_pages = _compute_final_pages(
                         page_limit_service, estimated_pages, len(content)
                     )
+                    final_billable = final_pages * mode.page_multiplier
                     await page_limit_service.update_page_usage(
-                        user_id, final_pages, allow_exceed=True
+                        user_id, final_billable, allow_exceed=True
                     )
                 else:
                     failed_count += 1
diff --git a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
deleted file mode 100644
index 5356ecfb7f..0000000000
--- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
+++ /dev/null
@@ -1,676 +0,0 @@
-"""
-Obsidian connector indexer.
-
-Indexes markdown notes from a local Obsidian vault.
-This connector is only available in self-hosted mode.
-
-Implements 2-phase document status updates for real-time UI feedback:
-- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
-- Phase 2: Process each document: pending → processing → ready/failed
-"""
-
-import os
-import re
-import time
-from collections.abc import Awaitable, Callable
-from datetime import UTC, datetime
-from pathlib import Path
-
-import yaml
-from sqlalchemy.exc import SQLAlchemyError
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.config import config
-from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
-from app.services.llm_service import get_user_long_context_llm
-from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import (
-    create_document_chunks,
-    embed_text,
-    generate_content_hash,
-    generate_document_summary,
-    generate_unique_identifier_hash,
-)
-
-from .base import (
-    build_document_metadata_string,
-    check_document_by_unique_identifier,
-    check_duplicate_document_by_hash,
-    get_connector_by_id,
-    get_current_timestamp,
-    logger,
-    safe_set_chunks,
-    update_connector_last_indexed,
-)
-
-# Type hint for heartbeat callback
-HeartbeatCallbackType = Callable[[int], Awaitable[None]]
-
-# Heartbeat interval in seconds
-HEARTBEAT_INTERVAL_SECONDS = 30
-
-
-def parse_frontmatter(content: str) -> tuple[dict | None, str]:
-    """
-    Parse YAML frontmatter from markdown content.
-
-    Args:
-        content: The full markdown content
-
-    Returns:
-        Tuple of (frontmatter dict or None, content without frontmatter)
-    """
-    if not content.startswith("---"):
-        return None, content
-
-    # Find the closing ---
-    end_match = re.search(r"\n---\n", content[3:])
-    if not end_match:
-        return None, content
-
-    frontmatter_str = content[3 : end_match.start() + 3]
-    remaining_content = content[end_match.end() + 3 :]
-
-    try:
-        frontmatter = yaml.safe_load(frontmatter_str)
-        return frontmatter, remaining_content.strip()
-    except yaml.YAMLError:
-        return None, content
-
-
-def extract_wiki_links(content: str) -> list[str]:
-    """
-    Extract [[wiki-style links]] from content.
-
-    Args:
-        content: Markdown content
-
-    Returns:
-        List of linked note names
-    """
-    # Match [[link]] or [[link|alias]]
-    pattern = r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]"
-    matches = re.findall(pattern, content)
-    return list(set(matches))
-
-
-def extract_tags(content: str) -> list[str]:
-    """
-    Extract #tags from content (both inline and frontmatter).
-
-    Args:
-        content: Markdown content
-
-    Returns:
-        List of tags (without # prefix)
-    """
-    # Match #tag but not ## headers
-    pattern = r"(?<!\S)#([a-zA-Z][a-zA-Z0-9_/-]*)"
-    matches = re.findall(pattern, content)
-    return list(set(matches))
-
-
-def scan_vault(
-    vault_path: str,
-    exclude_folders: list[str] | None = None,
-) -> list[dict]:
-    """
-    Scan an Obsidian vault for markdown files.
-
-    Args:
-        vault_path: Path to the Obsidian vault
-        exclude_folders: List of folder names to exclude
-
-    Returns:
-        List of file info dicts with path, name, modified time
-    """
-    if exclude_folders is None:
-        exclude_folders = [".trash", ".obsidian", "templates"]
-
-    vault = Path(vault_path)
-    if not vault.exists():
-        raise ValueError(f"Vault path does not exist: {vault_path}")
-
-    files = []
-    for md_file in vault.rglob("*.md"):
-        # Check if file is in an excluded folder
-        relative_path = md_file.relative_to(vault)
-        parts = relative_path.parts
-
-        if any(excluded in parts for excluded in exclude_folders):
-            continue
-
-        try:
-            stat = md_file.stat()
-            files.append(
-                {
-                    "path": str(md_file),
-                    "relative_path": str(relative_path),
-                    "name": md_file.stem,
-                    "modified_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC),
-                    "created_at": datetime.fromtimestamp(stat.st_ctime, tz=UTC),
-                    "size": stat.st_size,
-                }
-            )
-        except OSError as e:
-            logger.warning(f"Could not stat file {md_file}: {e}")
-
-    return files
-
-
-async def index_obsidian_vault(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str | None = None,
-    end_date: str | None = None,
-    update_last_indexed: bool = True,
-    on_heartbeat_callback: HeartbeatCallbackType | None = None,
-) -> tuple[int, str | None]:
-    """
-    Index notes from a local Obsidian vault.
-
-    This indexer is only available in self-hosted mode as it requires
-    direct file system access to the user's Obsidian vault.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Obsidian connector
-        search_space_id: ID of the search space to store documents in
-        user_id: ID of the user
-        start_date: Start date for filtering (YYYY-MM-DD format) - optional
-        end_date: End date for filtering (YYYY-MM-DD format) - optional
-        update_last_indexed: Whether to update the last_indexed_at timestamp
-        on_heartbeat_callback: Optional callback to update notification during long-running indexing.
-
-    Returns:
-        Tuple containing (number of documents indexed, error message or None)
-    """
-    task_logger = TaskLoggingService(session, search_space_id)
-
-    # Check if self-hosted mode
-    if not config.is_self_hosted():
-        return 0, "Obsidian connector is only available in self-hosted mode"
-
-    # Log task start
-    log_entry = await task_logger.log_task_start(
-        task_name="obsidian_vault_indexing",
-        source="connector_indexing_task",
-        message=f"Starting Obsidian vault indexing for connector {connector_id}",
-        metadata={
-            "connector_id": connector_id,
-            "user_id": str(user_id),
-            "start_date": start_date,
-            "end_date": end_date,
-        },
-    )
-
-    try:
-        # Get the connector
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Retrieving Obsidian connector {connector_id} from database",
-            {"stage": "connector_retrieval"},
-        )
-
-        connector = await get_connector_by_id(
-            session, connector_id, SearchSourceConnectorType.OBSIDIAN_CONNECTOR
-        )
-
-        if not connector:
-            await task_logger.log_task_failure(
-                log_entry,
-                f"Connector with ID {connector_id} not found or is not an Obsidian connector",
-                "Connector not found",
-                {"error_type": "ConnectorNotFound"},
-            )
-            return (
-                0,
-                f"Connector with ID {connector_id} not found or is not an Obsidian connector",
-            )
-
-        # Get vault path from connector config
-        vault_path = connector.config.get("vault_path")
-        if not vault_path:
-            await task_logger.log_task_failure(
-                log_entry,
-                "Vault path not configured for this connector",
-                "Missing vault path",
-                {"error_type": "MissingVaultPath"},
-            )
-            return 0, "Vault path not configured for this connector"
-
-        # Validate vault path exists
-        if not os.path.exists(vault_path):
-            await task_logger.log_task_failure(
-                log_entry,
-                f"Vault path does not exist: {vault_path}",
-                "Vault path not found",
-                {"error_type": "VaultNotFound", "vault_path": vault_path},
-            )
-            return 0, f"Vault path does not exist: {vault_path}"
-
-        # Get configuration options
-        exclude_folders = connector.config.get(
-            "exclude_folders", [".trash", ".obsidian", "templates"]
-        )
-        vault_name = connector.config.get("vault_name") or os.path.basename(vault_path)
-
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Scanning Obsidian vault: {vault_name}",
-            {"stage": "vault_scan", "vault_path": vault_path},
-        )
-
-        # Scan vault for markdown files
-        try:
-            files = scan_vault(vault_path, exclude_folders)
-        except Exception as e:
-            await task_logger.log_task_failure(
-                log_entry,
-                f"Failed to scan vault: {e}",
-                "Vault scan error",
-                {"error_type": "VaultScanError"},
-            )
-            return 0, f"Failed to scan vault: {e}"
-
-        logger.info(f"Found {len(files)} markdown files in vault")
-
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Found {len(files)} markdown files to process",
-            {"stage": "files_discovered", "file_count": len(files)},
-        )
-
-        # Filter by date if provided (handle "undefined" string from frontend)
-        # Also handle inverted dates (start > end) by skipping filtering
-        start_dt = None
-        end_dt = None
-
-        if start_date and start_date != "undefined":
-            start_dt = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=UTC)
-
-        if end_date and end_date != "undefined":
-            # Make end_date inclusive (end of day)
-            end_dt = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC)
-            end_dt = end_dt.replace(hour=23, minute=59, second=59)
-
-        # Only apply date filtering if dates are valid and in correct order
-        if start_dt and end_dt and start_dt > end_dt:
-            logger.warning(
-                f"start_date ({start_date}) is after end_date ({end_date}), skipping date filter"
-            )
-        else:
-            if start_dt:
-                files = [f for f in files if f["modified_at"] >= start_dt]
-                logger.info(
-                    f"After start_date filter ({start_date}): {len(files)} files"
-                )
-            if end_dt:
-                files = [f for f in files if f["modified_at"] <= end_dt]
-                logger.info(f"After end_date filter ({end_date}): {len(files)} files")
-
-        logger.info(f"Processing {len(files)} files after date filtering")
-
-        indexed_count = 0
-        skipped_count = 0
-        failed_count = 0
-        duplicate_content_count = 0
-
-        # Heartbeat tracking - update notification periodically to prevent appearing stuck
-        last_heartbeat_time = time.time()
-
-        # =======================================================================
-        # PHASE 1: Analyze all files, create pending documents
-        # This makes ALL documents visible in the UI immediately with pending status
-        # =======================================================================
-        files_to_process = []  # List of dicts with document and file data
-        new_documents_created = False
-
-        for file_info in files:
-            try:
-                file_path = file_info["path"]
-                relative_path = file_info["relative_path"]
-
-                # Read file content
-                try:
-                    with open(file_path, encoding="utf-8") as f:
-                        content = f.read()
-                except UnicodeDecodeError:
-                    logger.warning(f"Could not decode file {file_path}, skipping")
-                    skipped_count += 1
-                    continue
-
-                if not content.strip():
-                    logger.debug(f"Empty file {file_path}, skipping")
-                    skipped_count += 1
-                    continue
-
-                # Parse frontmatter and extract metadata
-                frontmatter, body_content = parse_frontmatter(content)
-                wiki_links = extract_wiki_links(content)
-                tags = extract_tags(content)
-
-                # Get title from frontmatter or filename
-                title = file_info["name"]
-                if frontmatter:
-                    title = frontmatter.get("title", title)
-                    # Also extract tags from frontmatter
-                    fm_tags = frontmatter.get("tags", [])
-                    if isinstance(fm_tags, list):
-                        tags = list({*tags, *fm_tags})
-                    elif isinstance(fm_tags, str):
-                        tags = list({*tags, fm_tags})
-
-                # Generate unique identifier using vault name and relative path
-                unique_identifier = f"{vault_name}:{relative_path}"
-                unique_identifier_hash = generate_unique_identifier_hash(
-                    DocumentType.OBSIDIAN_CONNECTOR,
-                    unique_identifier,
-                    search_space_id,
-                )
-
-                # Generate content hash
-                content_hash = generate_content_hash(content, search_space_id)
-
-                # Check for existing document
-                existing_document = await check_document_by_unique_identifier(
-                    session, unique_identifier_hash
-                )
-
-                if existing_document:
-                    # Document exists - check if content has changed
-                    if existing_document.content_hash == content_hash:
-                        # Ensure status is ready (might have been stuck in processing/pending)
-                        if not DocumentStatus.is_state(
-                            existing_document.status, DocumentStatus.READY
-                        ):
-                            existing_document.status = DocumentStatus.ready()
-                        logger.debug(f"Note {title} unchanged, skipping")
-                        skipped_count += 1
-                        continue
-
-                    # Queue existing document for update (will be set to processing in Phase 2)
-                    files_to_process.append(
-                        {
-                            "document": existing_document,
-                            "is_new": False,
-                            "file_info": file_info,
-                            "content": content,
-                            "body_content": body_content,
-                            "frontmatter": frontmatter,
-                            "wiki_links": wiki_links,
-                            "tags": tags,
-                            "title": title,
-                            "relative_path": relative_path,
-                            "content_hash": content_hash,
-                            "unique_identifier_hash": unique_identifier_hash,
-                        }
-                    )
-                    continue
-
-                # Document doesn't exist by unique_identifier_hash
-                # Check if a document with the same content_hash exists (from another connector)
-                with session.no_autoflush:
-                    duplicate_by_content = await check_duplicate_document_by_hash(
-                        session, content_hash
-                    )
-
-                if duplicate_by_content:
-                    logger.info(
-                        f"Obsidian note {title} already indexed by another connector "
-                        f"(existing document ID: {duplicate_by_content.id}, "
-                        f"type: {duplicate_by_content.document_type}). Skipping."
-                    )
-                    duplicate_content_count += 1
-                    skipped_count += 1
-                    continue
-
-                # Create new document with PENDING status (visible in UI immediately)
-                document = Document(
-                    search_space_id=search_space_id,
-                    title=title,
-                    document_type=DocumentType.OBSIDIAN_CONNECTOR,
-                    document_metadata={
-                        "vault_name": vault_name,
-                        "file_path": relative_path,
-                        "connector_id": connector_id,
-                    },
-                    content="Pending...",  # Placeholder until processed
-                    content_hash=unique_identifier_hash,  # Temporary unique value - updated when ready
-                    unique_identifier_hash=unique_identifier_hash,
-                    embedding=None,
-                    chunks=[],  # Empty at creation - safe for async
-                    status=DocumentStatus.pending(),  # Pending until processing starts
-                    updated_at=get_current_timestamp(),
-                    created_by_id=user_id,
-                    connector_id=connector_id,
-                )
-                session.add(document)
-                new_documents_created = True
-
-                files_to_process.append(
-                    {
-                        "document": document,
-                        "is_new": True,
-                        "file_info": file_info,
-                        "content": content,
-                        "body_content": body_content,
-                        "frontmatter": frontmatter,
-                        "wiki_links": wiki_links,
-                        "tags": tags,
-                        "title": title,
-                        "relative_path": relative_path,
-                        "content_hash": content_hash,
-                        "unique_identifier_hash": unique_identifier_hash,
-                    }
-                )
-
-            except Exception as e:
-                logger.exception(
-                    f"Error in Phase 1 for file {file_info.get('path', 'unknown')}: {e}"
-                )
-                failed_count += 1
-                continue
-
-        # Commit all pending documents - they all appear in UI now
-        if new_documents_created:
-            logger.info(
-                f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents"
-            )
-            await session.commit()
-
-        # =======================================================================
-        # PHASE 2: Process each document one by one
-        # Each document transitions: pending → processing → ready/failed
-        # =======================================================================
-        logger.info(f"Phase 2: Processing {len(files_to_process)} documents")
-
-        # Get LLM for summarization
-        long_context_llm = await get_user_long_context_llm(
-            session, user_id, search_space_id
-        )
-
-        for item in files_to_process:
-            # Send heartbeat periodically
-            if on_heartbeat_callback:
-                current_time = time.time()
-                if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
-                    await on_heartbeat_callback(indexed_count)
-                    last_heartbeat_time = current_time
-
-            document = item["document"]
-            try:
-                # Set to PROCESSING and commit - shows "processing" in UI for THIS document only
-                document.status = DocumentStatus.processing()
-                await session.commit()
-
-                # Extract data from item
-                title = item["title"]
-                relative_path = item["relative_path"]
-                content = item["content"]
-                body_content = item["body_content"]
-                frontmatter = item["frontmatter"]
-                wiki_links = item["wiki_links"]
-                tags = item["tags"]
-                content_hash = item["content_hash"]
-                file_info = item["file_info"]
-
-                # Build metadata
-                document_metadata = {
-                    "vault_name": vault_name,
-                    "file_path": relative_path,
-                    "tags": tags,
-                    "outgoing_links": wiki_links,
-                    "frontmatter": frontmatter,
-                    "modified_at": file_info["modified_at"].isoformat(),
-                    "created_at": file_info["created_at"].isoformat(),
-                    "word_count": len(body_content.split()),
-                }
-
-                # Build document content with metadata
-                metadata_sections = [
-                    (
-                        "METADATA",
-                        [
-                            f"Title: {title}",
-                            f"Vault: {vault_name}",
-                            f"Path: {relative_path}",
-                            f"Tags: {', '.join(tags) if tags else 'None'}",
-                            f"Links to: {', '.join(wiki_links) if wiki_links else 'None'}",
-                        ],
-                    ),
-                    ("CONTENT", [body_content]),
-                ]
-                document_string = build_document_metadata_string(metadata_sections)
-
-                # Generate summary
-                summary_content = ""
-                if long_context_llm and connector.enable_summary:
-                    summary_content, _ = await generate_document_summary(
-                        document_string,
-                        long_context_llm,
-                        document_metadata,
-                    )
-
-                # Generate embedding
-                embedding = embed_text(document_string)
-
-                # Add URL and summary to metadata
-                document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}"
-                document_metadata["summary"] = summary_content
-                document_metadata["connector_id"] = connector_id
-
-                # Create chunks
-                chunks = await create_document_chunks(document_string)
-
-                # Update document to READY with actual content
-                document.title = title
-                document.content = document_string
-                document.content_hash = content_hash
-                document.embedding = embedding
-                document.document_metadata = document_metadata
-                await safe_set_chunks(session, document, chunks)
-                document.updated_at = get_current_timestamp()
-                document.status = DocumentStatus.ready()
-
-                indexed_count += 1
-
-                # Batch commit every 10 documents (for ready status updates)
-                if indexed_count % 10 == 0:
-                    logger.info(
-                        f"Committing batch: {indexed_count} Obsidian notes processed so far"
-                    )
-                    await session.commit()
-
-            except Exception as e:
-                logger.exception(
-                    f"Error processing file {item.get('file_info', {}).get('path', 'unknown')}: {e}"
-                )
-                # Mark document as failed with reason (visible in UI)
-                try:
-                    document.status = DocumentStatus.failed(str(e))
-                    document.updated_at = get_current_timestamp()
-                except Exception as status_error:
-                    logger.error(
-                        f"Failed to update document status to failed: {status_error}"
-                    )
-                failed_count += 1
-                continue
-
-        # CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs
-        await update_connector_last_indexed(session, connector, update_last_indexed)
-
-        # Final commit for any remaining documents not yet committed in batches
-        logger.info(f"Final commit: Total {indexed_count} Obsidian notes processed")
-        try:
-            await session.commit()
-            logger.info(
-                "Successfully committed all Obsidian document changes to database"
-            )
-        except Exception as e:
-            # Handle any remaining integrity errors gracefully (race conditions, etc.)
-            if (
-                "duplicate key value violates unique constraint" in str(e).lower()
-                or "uniqueviolationerror" in str(e).lower()
-            ):
-                logger.warning(
-                    f"Duplicate content_hash detected during final commit. "
-                    f"This may occur if the same note was indexed by multiple connectors. "
-                    f"Rolling back and continuing. Error: {e!s}"
-                )
-                await session.rollback()
-                # Don't fail the entire task - some documents may have been successfully indexed
-            else:
-                raise
-
-        # Build warning message if there were issues
-        warning_parts = []
-        if duplicate_content_count > 0:
-            warning_parts.append(f"{duplicate_content_count} duplicate")
-        if failed_count > 0:
-            warning_parts.append(f"{failed_count} failed")
-        warning_message = ", ".join(warning_parts) if warning_parts else None
-
-        total_processed = indexed_count
-
-        await task_logger.log_task_success(
-            log_entry,
-            f"Successfully completed Obsidian vault indexing for connector {connector_id}",
-            {
-                "notes_processed": total_processed,
-                "documents_indexed": indexed_count,
-                "documents_skipped": skipped_count,
-                "documents_failed": failed_count,
-                "duplicate_content_count": duplicate_content_count,
-            },
-        )
-
-        logger.info(
-            f"Obsidian vault indexing completed: {indexed_count} ready, "
-            f"{skipped_count} skipped, {failed_count} failed "
-            f"({duplicate_content_count} duplicate content)"
-        )
-        return total_processed, warning_message
-
-    except SQLAlchemyError as e:
-        logger.exception(f"Database error during Obsidian indexing: {e}")
-        await session.rollback()
-        await task_logger.log_task_failure(
-            log_entry,
-            f"Database error during Obsidian indexing: {e}",
-            "Database error",
-            {"error_type": "SQLAlchemyError"},
-        )
-        return 0, f"Database error: {e}"
-
-    except Exception as e:
-        logger.exception(f"Error during Obsidian indexing: {e}")
-        await task_logger.log_task_failure(
-            log_entry,
-            f"Error during Obsidian indexing: {e}",
-            "Unexpected error",
-            {"error_type": type(e).__name__},
-        )
-        return 0, str(e)
diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
index 06517f5424..2def799f30 100644
--- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
@@ -171,6 +171,7 @@ async def _download_files_parallel(
     enable_summary: bool,
     max_concurrency: int = 3,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[list[ConnectorDocument], int]:
     """Download and ETL files in parallel. Returns (docs, failed_count)."""
     results: list[ConnectorDocument] = []
@@ -183,7 +184,7 @@ async def _download_one(file: dict) -> ConnectorDocument | None:
         nonlocal last_heartbeat, completed_count
         async with sem:
             markdown, od_metadata, error = await download_and_extract_content(
-                onedrive_client, file
+                onedrive_client, file, vision_llm=vision_llm
             )
             if error or not markdown:
                 file_name = file.get("name", "Unknown")
@@ -231,6 +232,7 @@ async def _download_and_index(
     user_id: str,
     enable_summary: bool,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[int, int]:
     """Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
     connector_docs, download_failed = await _download_files_parallel(
@@ -241,6 +243,7 @@ async def _download_and_index(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat,
+        vision_llm=vision_llm,
     )
 
     batch_indexed = 0
@@ -293,6 +296,7 @@ async def _index_selected_files(
     user_id: str,
     enable_summary: bool,
     on_heartbeat: HeartbeatCallbackType | None = None,
+    vision_llm=None,
 ) -> tuple[int, int, int, list[str]]:
     """Index user-selected files using the parallel pipeline."""
     page_limit_service = PageLimitService(session)
@@ -343,6 +347,7 @@ async def _index_selected_files(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -375,6 +380,7 @@ async def _index_full_scan(
     include_subfolders: bool = True,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
+    vision_llm=None,
 ) -> tuple[int, int, int]:
     """Full scan indexing of a folder.
 
@@ -450,6 +456,7 @@ async def _index_full_scan(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat_callback,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -481,6 +488,7 @@ async def _index_with_delta_sync(
     max_files: int,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
+    vision_llm=None,
 ) -> tuple[int, int, int, str | None]:
     """Delta sync using OneDrive change tracking.
 
@@ -573,6 +581,7 @@ async def _index_with_delta_sync(
         user_id=user_id,
         enable_summary=enable_summary,
         on_heartbeat=on_heartbeat_callback,
+        vision_llm=vision_llm,
     )
 
     if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@@ -643,6 +652,13 @@ async def index_onedrive_files(
             return 0, 0, error_msg, 0
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
+        connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
+        vision_llm = None
+        if connector_enable_vision_llm:
+            from app.services.llm_service import get_vision_llm
+
+            vision_llm = await get_vision_llm(session, search_space_id)
+
         onedrive_client = OneDriveClient(session, connector_id)
 
         indexing_options = items_dict.get("indexing_options", {})
@@ -666,6 +682,7 @@ async def index_onedrive_files(
                 search_space_id=search_space_id,
                 user_id=user_id,
                 enable_summary=connector_enable_summary,
+                vision_llm=vision_llm,
             )
             total_indexed += indexed
             total_skipped += skipped
@@ -695,6 +712,7 @@ async def index_onedrive_files(
                     log_entry,
                     max_files,
                     enable_summary=connector_enable_summary,
+                    vision_llm=vision_llm,
                 )
                 total_indexed += indexed
                 total_skipped += skipped
@@ -721,6 +739,7 @@ async def index_onedrive_files(
                     max_files,
                     include_subfolders,
                     enable_summary=connector_enable_summary,
+                    vision_llm=vision_llm,
                 )
                 total_indexed += ri
                 total_skipped += rs
@@ -740,6 +759,7 @@ async def index_onedrive_files(
                     max_files,
                     include_subfolders,
                     enable_summary=connector_enable_summary,
+                    vision_llm=vision_llm,
                 )
                 total_indexed += indexed
                 total_skipped += skipped
diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
index bbff4838e2..76c3ea209d 100644
--- a/surfsense_backend/app/tasks/document_processors/_direct_converters.py
+++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
@@ -21,6 +21,42 @@
 # at import time so every csv.reader call in this module can handle large fields.
 csv.field_size_limit(2**31 - 1)
 
+_BOM_ENCODINGS: list[tuple[bytes, str]] = [
+    (b"\xff\xfe\x00\x00", "utf-32-le"),
+    (b"\x00\x00\xfe\xff", "utf-32-be"),
+    (b"\xff\xfe", "utf-16-le"),
+    (b"\xfe\xff", "utf-16-be"),
+    (b"\xef\xbb\xbf", "utf-8-sig"),
+]
+
+
+def _detect_encoding(file_path: str) -> str:
+    """Sniff the BOM to pick an encoding, falling back to utf-8."""
+    head = Path(file_path).read_bytes()[:4]
+    for bom, encoding in _BOM_ENCODINGS:
+        if head.startswith(bom):
+            return encoding
+    return "utf-8"
+
+
+def _read_text(file_path: str) -> str:
+    """Read a file with automatic encoding detection.
+
+    Tries BOM-based detection first, then utf-8, then latin-1 as a
+    last resort (latin-1 accepts every byte value).
+    """
+    encoding = _detect_encoding(file_path)
+    try:
+        return Path(file_path).read_text(encoding=encoding)
+    except (UnicodeDecodeError, UnicodeError):
+        pass
+    if encoding != "utf-8":
+        try:
+            return Path(file_path).read_text(encoding="utf-8")
+        except (UnicodeDecodeError, UnicodeError):
+            pass
+    return Path(file_path).read_text(encoding="latin-1")
+
 
 def _escape_pipe(cell: str) -> str:
     """Escape literal pipe characters inside a markdown table cell."""
@@ -33,9 +69,9 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
     The first row is treated as the header.  An empty file returns an
     empty string so the caller can decide how to handle it.
     """
-    with open(file_path, encoding="utf-8", newline="") as fh:
-        reader = csv.reader(fh, delimiter=delimiter)
-        rows = list(reader)
+    text = _read_text(file_path)
+    reader = csv.reader(text.splitlines(), delimiter=delimiter)
+    rows = list(reader)
 
     if not rows:
         return ""
@@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str:
 
 def html_to_markdown(file_path: str) -> str:
     """Convert an HTML file to markdown via ``markdownify``."""
-    html = Path(file_path).read_text(encoding="utf-8")
+    html = _read_text(file_path)
     return markdownify(html).strip()
 
 
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index c765dbd876..137c27cdaa 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -46,6 +46,8 @@ class _ProcessingContext:
     log_entry: Log
     connector: dict | None = None
     notification: Notification | None = None
+    use_vision_llm: bool = False
+    processing_mode: str = "basic"
     enable_summary: bool = field(init=False)
 
     def __post_init__(self) -> None:
@@ -118,7 +120,7 @@ async def _log_page_divergence(
 
 
 async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
-    """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline."""
+    """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
     from app.etl_pipeline.etl_document import EtlRequest
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
@@ -129,7 +131,17 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
         {"processing_stage": "extracting"},
     )
 
-    etl_result = await EtlPipelineService().extract(
+    # Fetch the vision LLM whenever the operator opts in. The ETL
+    # pipeline decides what to do with it: image files run through the
+    # vision LLM directly; document files (PDFs) get per-image
+    # descriptions appended via picture_describer.
+    vision_llm = None
+    if ctx.use_vision_llm:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
+
+    etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
         EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
     )
 
@@ -176,21 +188,28 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
 
 async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
     """Route a document file to the configured ETL service via the unified pipeline."""
-    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
     from app.services.page_limit_service import PageLimitExceededError, PageLimitService
 
+    mode = ProcessingMode.coerce(ctx.processing_mode)
     page_limit_service = PageLimitService(ctx.session)
     estimated_pages = _estimate_pages_safe(page_limit_service, ctx.file_path)
+    billable_pages = estimated_pages * mode.page_multiplier
 
     await ctx.task_logger.log_task_progress(
         ctx.log_entry,
         f"Estimated {estimated_pages} pages for file: {ctx.filename}",
-        {"estimated_pages": estimated_pages, "file_type": "document"},
+        {
+            "estimated_pages": estimated_pages,
+            "billable_pages": billable_pages,
+            "processing_mode": mode.value,
+            "file_type": "document",
+        },
     )
 
     try:
-        await page_limit_service.check_page_limit(ctx.user_id, estimated_pages)
+        await page_limit_service.check_page_limit(ctx.user_id, billable_pages)
     except PageLimitExceededError as e:
         await ctx.task_logger.log_task_failure(
             ctx.log_entry,
@@ -201,6 +220,8 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
                 "pages_used": e.pages_used,
                 "pages_limit": e.pages_limit,
                 "estimated_pages": estimated_pages,
+                "billable_pages": billable_pages,
+                "processing_mode": mode.value,
             },
         )
         with contextlib.suppress(Exception):
@@ -209,11 +230,21 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
 
     await _notify(ctx, "parsing", "Extracting content")
 
-    etl_result = await EtlPipelineService().extract(
+    # Document files (PDF, docx, etc.) get vision LLM treatment too:
+    # the ETL pipeline appends a per-image description section when
+    # vision_llm is provided. See picture_describer.describe_pictures.
+    vision_llm = None
+    if ctx.use_vision_llm:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
+
+    etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
         EtlRequest(
             file_path=ctx.file_path,
             filename=ctx.filename,
             estimated_pages=estimated_pages,
+            processing_mode=mode,
         )
     )
 
@@ -235,7 +266,7 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
 
     if result:
         await page_limit_service.update_page_usage(
-            ctx.user_id, estimated_pages, allow_exceed=True
+            ctx.user_id, billable_pages, allow_exceed=True
         )
         if ctx.connector:
             await update_document_from_connector(result, ctx.connector, ctx.session)
@@ -248,6 +279,8 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
                 "file_type": "document",
                 "etl_service": etl_result.etl_service,
                 "pages_processed": estimated_pages,
+                "billable_pages": billable_pages,
+                "processing_mode": mode.value,
             },
         )
     else:
@@ -278,6 +311,8 @@ async def process_file_in_background(
     log_entry: Log,
     connector: dict | None = None,
     notification: Notification | None = None,
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
 ) -> Document | None:
     ctx = _ProcessingContext(
         session=session,
@@ -289,6 +324,8 @@ async def process_file_in_background(
         log_entry=log_entry,
         connector=connector,
         notification=notification,
+        use_vision_llm=use_vision_llm,
+        processing_mode=processing_mode,
     )
 
     try:
@@ -333,33 +370,39 @@ async def process_file_in_background(
 async def _extract_file_content(
     file_path: str,
     filename: str,
+    search_space_id: int,
     session: AsyncSession,
     user_id: str,
     task_logger: TaskLoggingService,
     log_entry: Log,
     notification: Notification | None,
-) -> tuple[str, str]:
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
+) -> tuple[str, str, int]:
     """
     Extract markdown content from a file regardless of type.
 
     Returns:
-        Tuple of (markdown_content, etl_service_name).
+        Tuple of (markdown_content, etl_service_name, billable_pages).
     """
-    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
     from app.etl_pipeline.file_classifier import (
         FileCategory,
         classify_file as etl_classify,
     )
 
+    mode = ProcessingMode.coerce(processing_mode)
     category = etl_classify(filename)
     estimated_pages = 0
+    billable_pages = 0
 
     if notification:
         stage_messages = {
             FileCategory.PLAINTEXT: "Reading file",
             FileCategory.DIRECT_CONVERT: "Converting file",
             FileCategory.AUDIO: "Transcribing audio",
+            FileCategory.IMAGE: "Analyzing image",
             FileCategory.UNSUPPORTED: "Unsupported file type",
             FileCategory.DOCUMENT: "Extracting content",
         }
@@ -381,28 +424,35 @@ async def _extract_file_content(
 
         page_limit_service = PageLimitService(session)
         estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
-        await page_limit_service.check_page_limit(user_id, estimated_pages)
+        billable_pages = estimated_pages * mode.page_multiplier
+        await page_limit_service.check_page_limit(user_id, billable_pages)
+
+    # Vision LLM is provided to the ETL pipeline for any file category
+    # when the operator opts in. Image files run through it directly;
+    # document files (PDFs) get per-image descriptions appended via
+    # picture_describer.
+    vision_llm = None
+    if use_vision_llm:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(session, search_space_id)
 
-    result = await EtlPipelineService().extract(
+    result = await EtlPipelineService(vision_llm=vision_llm).extract(
         EtlRequest(
             file_path=file_path,
             filename=filename,
             estimated_pages=estimated_pages,
+            processing_mode=mode,
         )
     )
 
-    if category == FileCategory.DOCUMENT:
-        await page_limit_service.update_page_usage(
-            user_id, estimated_pages, allow_exceed=True
-        )
-
     with contextlib.suppress(Exception):
         os.unlink(file_path)
 
-    if not result.markdown_content:
+    if not result.markdown_content or not result.markdown_content.strip():
         raise RuntimeError(f"Failed to extract content from file: {filename}")
 
-    return result.markdown_content, result.etl_service
+    return result.markdown_content, result.etl_service, billable_pages
 
 
 async def process_file_in_background_with_document(
@@ -417,12 +467,17 @@ async def process_file_in_background_with_document(
     connector: dict | None = None,
     notification: Notification | None = None,
     should_summarize: bool = False,
+    use_vision_llm: bool = False,
+    processing_mode: str = "basic",
 ) -> Document | None:
     """
     Process file and update existing pending document (2-phase pattern).
 
     Phase 1 (API layer): Created document with pending status.
     Phase 2 (this function): Process file and update document to ready/failed.
+
+    Page usage is deferred until after dedup check and successful indexing
+    to avoid charging for duplicate or failed uploads.
     """
     from app.indexing_pipeline.adapters.file_upload_adapter import (
         UploadDocumentAdapter,
@@ -435,21 +490,22 @@ async def process_file_in_background_with_document(
     doc_id = document.id
 
     try:
-        # Step 1: extract content
-        markdown_content, etl_service = await _extract_file_content(
+        markdown_content, etl_service, billable_pages = await _extract_file_content(
             file_path,
             filename,
+            search_space_id,
             session,
             user_id,
             task_logger,
             log_entry,
             notification,
+            use_vision_llm=use_vision_llm,
+            processing_mode=processing_mode,
         )
 
         if not markdown_content:
             raise RuntimeError(f"Failed to extract content from file: {filename}")
 
-        # Step 2: duplicate check
         content_hash = generate_content_hash(markdown_content, search_space_id)
         existing_by_content = await check_duplicate_document(session, content_hash)
         if existing_by_content and existing_by_content.id != doc_id:
@@ -459,7 +515,6 @@ async def process_file_in_background_with_document(
             )
             return None
 
-        # Step 3: index via pipeline
         if notification:
             await NotificationService.document_processing.notify_processing_progress(
                 session,
@@ -480,6 +535,14 @@ async def process_file_in_background_with_document(
             should_summarize=should_summarize,
         )
 
+        if billable_pages > 0:
+            from app.services.page_limit_service import PageLimitService
+
+            page_limit_service = PageLimitService(session)
+            await page_limit_service.update_page_usage(
+                user_id, billable_pages, allow_exceed=True
+            )
+
         await task_logger.log_task_success(
             log_entry,
             f"Successfully processed file: {filename}",
@@ -487,6 +550,8 @@ async def process_file_in_background_with_document(
                 "document_id": doc_id,
                 "content_hash": content_hash,
                 "file_type": etl_service,
+                "billable_pages": billable_pages,
+                "processing_mode": processing_mode,
             },
         )
         return document
diff --git a/surfsense_backend/app/utils/async_retry.py b/surfsense_backend/app/utils/async_retry.py
new file mode 100644
index 0000000000..607b7a1562
--- /dev/null
+++ b/surfsense_backend/app/utils/async_retry.py
@@ -0,0 +1,126 @@
+"""Async retry decorators for connector API calls, built on tenacity."""
+
+from __future__ import annotations
+
+import contextlib
+import logging
+from collections.abc import Callable
+from typing import TypeVar
+
+import httpx
+from tenacity import (
+    before_sleep_log,
+    retry,
+    retry_if_exception,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential_jitter,
+)
+
+from app.connectors.exceptions import (
+    ConnectorAPIError,
+    ConnectorAuthError,
+    ConnectorError,
+    ConnectorRateLimitError,
+    ConnectorTimeoutError,
+)
+
+logger = logging.getLogger(__name__)
+
+F = TypeVar("F", bound=Callable)
+
+
+def _is_retryable(exc: BaseException) -> bool:
+    if isinstance(exc, ConnectorError):
+        return exc.retryable
+    return bool(isinstance(exc, httpx.TimeoutException | httpx.ConnectError))
+
+
+def build_retry(
+    *,
+    max_attempts: int = 4,
+    max_delay: float = 60.0,
+    initial_delay: float = 1.0,
+    total_timeout: float = 180.0,
+    service: str = "",
+) -> Callable:
+    """Configurable tenacity ``@retry`` decorator with exponential backoff + jitter."""
+    _logger = logging.getLogger(f"connector.retry.{service}") if service else logger
+
+    return retry(
+        retry=retry_if_exception(_is_retryable),
+        stop=(stop_after_attempt(max_attempts) | stop_after_delay(total_timeout)),
+        wait=wait_exponential_jitter(initial=initial_delay, max=max_delay),
+        reraise=True,
+        before_sleep=before_sleep_log(_logger, logging.WARNING),
+    )
+
+
+def retry_on_transient(
+    *,
+    service: str = "",
+    max_attempts: int = 4,
+) -> Callable:
+    """Shorthand: retry up to *max_attempts* on rate-limits, timeouts, and 5xx."""
+    return build_retry(max_attempts=max_attempts, service=service)
+
+
+def raise_for_status(
+    response: httpx.Response,
+    *,
+    service: str = "",
+) -> None:
+    """Map non-2xx httpx responses to the appropriate ``ConnectorError``."""
+    if response.is_success:
+        return
+
+    status = response.status_code
+
+    try:
+        body = response.json()
+    except Exception:
+        body = response.text[:500] if response.text else None
+
+    if status == 429:
+        retry_after_raw = response.headers.get("Retry-After")
+        retry_after: float | None = None
+        if retry_after_raw:
+            with contextlib.suppress(ValueError, TypeError):
+                retry_after = float(retry_after_raw)
+        raise ConnectorRateLimitError(
+            f"{service} rate limited (429)",
+            service=service,
+            retry_after=retry_after,
+            response_body=body,
+        )
+
+    if status in (401, 403):
+        raise ConnectorAuthError(
+            f"{service} authentication failed ({status})",
+            service=service,
+            status_code=status,
+            response_body=body,
+        )
+
+    if status == 504:
+        raise ConnectorTimeoutError(
+            f"{service} gateway timeout (504)",
+            service=service,
+            status_code=status,
+            response_body=body,
+        )
+
+    if status >= 500:
+        raise ConnectorAPIError(
+            f"{service} server error ({status})",
+            service=service,
+            status_code=status,
+            response_body=body,
+        )
+
+    raise ConnectorAPIError(
+        f"{service} request failed ({status})",
+        service=service,
+        status_code=status,
+        response_body=body,
+    )
diff --git a/surfsense_backend/app/utils/connector_naming.py b/surfsense_backend/app/utils/connector_naming.py
index 610be4a22a..99c8243a5d 100644
--- a/surfsense_backend/app/utils/connector_naming.py
+++ b/surfsense_backend/app/utils/connector_naming.py
@@ -39,7 +39,7 @@
 def get_base_name_for_type(connector_type: SearchSourceConnectorType) -> str:
     """Get a friendly display name for a connector type."""
     return BASE_NAME_FOR_TYPE.get(
-        connector_type, connector_type.replace("_", " ").title()
+        connector_type, connector_type.value.replace("_", " ").title()
     )
 
 
@@ -231,9 +231,14 @@ async def generate_unique_connector_name(
     base = get_base_name_for_type(connector_type)
 
     if identifier:
-        return f"{base} - {identifier}"
+        name = f"{base} - {identifier}"
+        return await ensure_unique_connector_name(
+            session,
+            name,
+            search_space_id,
+            user_id,
+        )
 
-    # Fallback: use counter for uniqueness
     count = await count_connectors_of_type(
         session, connector_type, search_space_id, user_id
     )
diff --git a/surfsense_backend/app/utils/content_utils.py b/surfsense_backend/app/utils/content_utils.py
index 5f2a4cd46a..05a4610c71 100644
--- a/surfsense_backend/app/utils/content_utils.py
+++ b/surfsense_backend/app/utils/content_utils.py
@@ -22,6 +22,20 @@
     from app.db import ChatVisibility
 
 
+import re
+
+_FENCE_RE = re.compile(
+    r"^```(?:\w+)?\s*\n(.*?)```\s*$",
+    re.DOTALL,
+)
+
+
+def strip_markdown_fences(text: str) -> str:
+    """Remove a single markdown code fence (```json ... ```) wrapper if present."""
+    m = _FENCE_RE.match(text.strip())
+    return m.group(1).strip() if m else text
+
+
 def extract_text_content(content: str | dict | list) -> str:
     """Extract plain text content from various message formats."""
     if isinstance(content, str):
diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py
index 16ac585b7d..e8be1b83ab 100644
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@@ -7,10 +7,33 @@
 DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
 sets are exclusively for the "document" ETL path (Docling / LlamaParse /
 Unstructured).
+
+Image extensions intentionally remain in the per-parser sets for fallback
+compatibility.  IMAGE_EXTENSIONS is used only for routing classification.
 """
 
 from pathlib import PurePosixPath
 
+# ---------------------------------------------------------------------------
+# Image extensions (used by file_classifier for routing to vision LLM)
+# ---------------------------------------------------------------------------
+
+IMAGE_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".webp",
+        ".svg",
+        ".heic",
+        ".heif",
+    }
+)
+
 # ---------------------------------------------------------------------------
 # Per-parser document extension sets (from official documentation)
 # ---------------------------------------------------------------------------
diff --git a/surfsense_backend/app/utils/periodic_scheduler.py b/surfsense_backend/app/utils/periodic_scheduler.py
index 9ea45df63a..35e8ad781c 100644
--- a/surfsense_backend/app/utils/periodic_scheduler.py
+++ b/surfsense_backend/app/utils/periodic_scheduler.py
@@ -18,23 +18,12 @@
 
 # Mapping of connector types to their corresponding Celery task names
 CONNECTOR_TASK_MAP = {
-    SearchSourceConnectorType.SLACK_CONNECTOR: "index_slack_messages",
-    SearchSourceConnectorType.TEAMS_CONNECTOR: "index_teams_messages",
     SearchSourceConnectorType.NOTION_CONNECTOR: "index_notion_pages",
     SearchSourceConnectorType.GITHUB_CONNECTOR: "index_github_repos",
-    SearchSourceConnectorType.LINEAR_CONNECTOR: "index_linear_issues",
-    SearchSourceConnectorType.JIRA_CONNECTOR: "index_jira_issues",
     SearchSourceConnectorType.CONFLUENCE_CONNECTOR: "index_confluence_pages",
-    SearchSourceConnectorType.CLICKUP_CONNECTOR: "index_clickup_tasks",
-    SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR: "index_google_calendar_events",
-    SearchSourceConnectorType.AIRTABLE_CONNECTOR: "index_airtable_records",
-    SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR: "index_google_gmail_messages",
-    SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
-    SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
     SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
     SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
     SearchSourceConnectorType.BOOKSTACK_CONNECTOR: "index_bookstack_pages",
-    SearchSourceConnectorType.OBSIDIAN_CONNECTOR: "index_obsidian_vault",
 }
 
 
@@ -84,44 +73,22 @@ def create_periodic_schedule(
             f"(frequency: {frequency_minutes} minutes). Triggering first run..."
         )
 
-        # Import all indexing tasks
         from app.tasks.celery_tasks.connector_tasks import (
-            index_airtable_records_task,
             index_bookstack_pages_task,
-            index_clickup_tasks_task,
             index_confluence_pages_task,
             index_crawled_urls_task,
-            index_discord_messages_task,
             index_elasticsearch_documents_task,
             index_github_repos_task,
-            index_google_calendar_events_task,
-            index_google_gmail_messages_task,
-            index_jira_issues_task,
-            index_linear_issues_task,
-            index_luma_events_task,
             index_notion_pages_task,
-            index_obsidian_vault_task,
-            index_slack_messages_task,
         )
 
-        # Map connector type to task
         task_map = {
-            SearchSourceConnectorType.SLACK_CONNECTOR: index_slack_messages_task,
             SearchSourceConnectorType.NOTION_CONNECTOR: index_notion_pages_task,
             SearchSourceConnectorType.GITHUB_CONNECTOR: index_github_repos_task,
-            SearchSourceConnectorType.LINEAR_CONNECTOR: index_linear_issues_task,
-            SearchSourceConnectorType.JIRA_CONNECTOR: index_jira_issues_task,
             SearchSourceConnectorType.CONFLUENCE_CONNECTOR: index_confluence_pages_task,
-            SearchSourceConnectorType.CLICKUP_CONNECTOR: index_clickup_tasks_task,
-            SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR: index_google_calendar_events_task,
-            SearchSourceConnectorType.AIRTABLE_CONNECTOR: index_airtable_records_task,
-            SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR: index_google_gmail_messages_task,
-            SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
-            SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
             SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
             SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
             SearchSourceConnectorType.BOOKSTACK_CONNECTOR: index_bookstack_pages_task,
-            SearchSourceConnectorType.OBSIDIAN_CONNECTOR: index_obsidian_vault_task,
         }
 
         # Trigger the first run immediately
diff --git a/surfsense_backend/app/utils/surfsense_docs.py b/surfsense_backend/app/utils/surfsense_docs.py
new file mode 100644
index 0000000000..9a6ab11a9e
--- /dev/null
+++ b/surfsense_backend/app/utils/surfsense_docs.py
@@ -0,0 +1,13 @@
+"""Utilities for SurfSense's built-in documentation index."""
+
+from pathlib import PurePosixPath
+
+DOCS_PUBLIC_ROOT = PurePosixPath("/docs")
+
+
+def surfsense_docs_public_url(source: str) -> str:
+    """Return the public docs route for an indexed documentation source path."""
+    docs_path = PurePosixPath(source).with_suffix("")
+    if docs_path.name == "index":
+        docs_path = docs_path.parent
+    return (DOCS_PUBLIC_ROOT / docs_path).as_posix()
diff --git a/surfsense_backend/app/utils/user_message_multimodal.py b/surfsense_backend/app/utils/user_message_multimodal.py
new file mode 100644
index 0000000000..dc9a6fe767
--- /dev/null
+++ b/surfsense_backend/app/utils/user_message_multimodal.py
@@ -0,0 +1,82 @@
+"""Helpers for multimodal user turns (text + inline images) in LangChain messages."""
+
+from __future__ import annotations
+
+import base64
+import binascii
+from typing import Any
+
+
+def build_human_message_content(
+    final_query: str, image_data_urls: list[str]
+) -> str | list[dict[str, Any]]:
+    if not image_data_urls:
+        return final_query
+    parts: list[dict[str, Any]] = [{"type": "text", "text": final_query}]
+    for url in image_data_urls:
+        parts.append({"type": "image_url", "image_url": {"url": url}})
+    return parts
+
+
+def split_langchain_human_content(content: str | list[Any]) -> tuple[str, list[str]]:
+    """Return plain text and data URLs from a LangChain HumanMessage ``content`` value."""
+    if isinstance(content, str):
+        return content, []
+    if not isinstance(content, list):
+        return "", []
+
+    text_chunks: list[str] = []
+    urls: list[str] = []
+    for block in content:
+        if not isinstance(block, dict):
+            continue
+        btype = block.get("type")
+        if btype == "text":
+            t = block.get("text")
+            if isinstance(t, str) and t:
+                text_chunks.append(t)
+        elif btype == "image_url":
+            iu = block.get("image_url")
+            if isinstance(iu, dict):
+                u = iu.get("url")
+                if isinstance(u, str) and u.startswith("data:"):
+                    urls.append(u)
+            elif isinstance(iu, str) and iu.startswith("data:"):
+                urls.append(iu)
+    return "\n".join(text_chunks), urls
+
+
+def decode_base64_image(data: str, *, max_bytes: int) -> bytes:
+    raw = data.strip()
+    if not raw:
+        raise ValueError("empty image payload")
+    try:
+        decoded = base64.b64decode(raw, validate=True)
+    except binascii.Error as e:
+        raise ValueError("invalid base64 image data") from e
+    if len(decoded) > max_bytes:
+        raise ValueError("image exceeds maximum size")
+    return decoded
+
+
+def to_data_url(media_type: str, raw_b64: str) -> str:
+    return f"data:{media_type};base64,{raw_b64.strip()}"
+
+
+def split_persisted_user_content_parts(parts: list[Any]) -> tuple[str, list[str]]:
+    """Extract plain text and data URLs from persisted assistant-ui style user ``content``."""
+    text_chunks: list[str] = []
+    urls: list[str] = []
+    for block in parts:
+        if not isinstance(block, dict):
+            continue
+        btype = block.get("type")
+        if btype == "text":
+            t = block.get("text")
+            if isinstance(t, str):
+                text_chunks.append(t)
+        elif btype == "image":
+            u = block.get("image")
+            if isinstance(u, str) and u.startswith("data:"):
+                urls.append(u)
+    return "".join(text_chunks), urls
diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml
index f0ca6d7d16..26fee1bc30 100644
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "surf-new-backend"
-version = "0.0.15"
+version = "0.0.24"
 description = "SurfSense Backend"
 requires-python = ">=3.12"
 dependencies = [
@@ -71,11 +71,11 @@ dependencies = [
     "langchain>=1.2.13",
     "langgraph>=1.1.3",
     "langchain-community>=0.4.1",
-    "deepagents>=0.4.12",
     "stripe>=15.0.0",
     "azure-ai-documentintelligence>=1.0.2",
-    "litellm>=1.83.0",
+    "litellm>=1.83.7",
     "langchain-litellm>=0.6.4",
+    "deepagents>=0.4.12,<0.5",
 ]
 
 [dependency-groups]
diff --git a/surfsense_backend/scripts/create_sandbox_snapshot.py b/surfsense_backend/scripts/create_sandbox_snapshot.py
new file mode 100644
index 0000000000..82d4521749
--- /dev/null
+++ b/surfsense_backend/scripts/create_sandbox_snapshot.py
@@ -0,0 +1,105 @@
+"""Create the Daytona snapshot used by SurfSense code-execution sandboxes.
+
+Run from the backend directory:
+    cd surfsense_backend
+    uv run python scripts/create_sandbox_snapshot.py
+
+Prerequisites:
+    - DAYTONA_API_KEY set in surfsense_backend/.env (or exported in shell)
+    - DAYTONA_API_URL=https://app.daytona.io/api
+    - DAYTONA_TARGET=us  (or eu)
+
+After this script succeeds, add to surfsense_backend/.env:
+    DAYTONA_SNAPSHOT_ID=surfsense-sandbox
+"""
+
+import os
+import sys
+import time
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+_here = Path(__file__).parent
+for candidate in [
+    _here / "../surfsense_backend/.env",
+    _here / ".env",
+    _here / "../.env",
+]:
+    if candidate.exists():
+        load_dotenv(candidate)
+        break
+
+from daytona import CreateSnapshotParams, Daytona, Image  # noqa: E402
+
+SNAPSHOT_NAME = "surfsense-sandbox"
+
+PACKAGES = [
+    "pandas",
+    "numpy",
+    "matplotlib",
+    "scipy",
+    "scikit-learn",
+]
+
+
+def build_image() -> Image:
+    """Build the sandbox image with data-science packages and a /documents symlink."""
+    return (
+        Image.debian_slim("3.12")
+        .pip_install(*PACKAGES)
+        # Symlink /documents → /home/daytona/documents so the LLM can use
+        # the same /documents/ path it sees in the virtual filesystem.
+        .run_commands(
+            "mkdir -p /home/daytona/documents",
+            "ln -sfn /home/daytona/documents /documents",
+        )
+    )
+
+
+def main() -> None:
+    api_key = os.environ.get("DAYTONA_API_KEY")
+    if not api_key:
+        print("ERROR: DAYTONA_API_KEY is not set.", file=sys.stderr)
+        print(
+            "Add it to surfsense_backend/.env or export it in your shell.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    daytona = Daytona()
+
+    try:
+        existing = daytona.snapshot.get(SNAPSHOT_NAME)
+        print(f"Deleting existing snapshot '{SNAPSHOT_NAME}' …")
+        daytona.snapshot.delete(existing)
+        print(f"Deleted '{SNAPSHOT_NAME}'. Waiting for removal to propagate …")
+        for _attempt in range(30):
+            time.sleep(2)
+            try:
+                daytona.snapshot.get(SNAPSHOT_NAME)
+            except Exception:
+                print(f"Confirmed '{SNAPSHOT_NAME}' is gone.\n")
+                break
+        else:
+            print(
+                f"WARNING: '{SNAPSHOT_NAME}' may still exist after 60s. Proceeding anyway.\n"
+            )
+    except Exception:
+        pass
+
+    print(f"Building snapshot '{SNAPSHOT_NAME}' …")
+    print(f"Packages: {', '.join(PACKAGES)}\n")
+
+    daytona.snapshot.create(
+        CreateSnapshotParams(name=SNAPSHOT_NAME, image=build_image()),
+        on_logs=lambda chunk: print(chunk, end="", flush=True),
+    )
+
+    print(f"\n\nSnapshot '{SNAPSHOT_NAME}' is ready.")
+    print("\nAdd this to surfsense_backend/.env:")
+    print(f"    DAYTONA_SNAPSHOT_ID={SNAPSHOT_NAME}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_backend/scripts/docker/entrypoint.e2e.sh b/surfsense_backend/scripts/docker/entrypoint.e2e.sh
new file mode 100755
index 0000000000..b44e1ee95e
--- /dev/null
+++ b/surfsense_backend/scripts/docker/entrypoint.e2e.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# =============================================================================
+# E2E entrypoint for the multi-stage Dockerfile's `e2e` target.
+#
+# Dispatches on SERVICE_ROLE to the test-only entrypoints under tests/e2e/.
+# Those scripts apply sys.modules hijacks and LLM/embedding patches BEFORE
+# importing production app code (see tests/e2e/run_backend.py for rationale).
+#
+# Production never sees this file: tests/ is excluded from the production
+# stage, and the production stage uses scripts/docker/entrypoint.sh.
+# =============================================================================
+set -euo pipefail
+
+SERVICE_ROLE="${SERVICE_ROLE:-api}"
+echo "[e2e-entrypoint] starting role=${SERVICE_ROLE}"
+
+wait_for_db() {
+    # Block until the database is reachable. We don't loop forever — Compose
+    # depends_on/healthchecks already gate on db readiness, this is just
+    # belt-and-suspenders so a slow first connection doesn't race migrations.
+    for i in {1..60}; do
+        echo "[e2e-entrypoint] db check attempt ${i}/60"
+        if python -c "from app.db import engine; import asyncio; asyncio.run(engine.dispose())"; then
+            echo "[e2e-entrypoint] db reachable after ${i} attempts"
+            return 0
+        fi
+        sleep 1
+    done
+    echo "[e2e-entrypoint] ERROR: db not reachable after 60s" >&2
+    return 1
+}
+
+case "${SERVICE_ROLE}" in
+    api)
+        wait_for_db
+        echo "[e2e-entrypoint] running alembic upgrade head"
+        alembic upgrade head
+        # `exec` so SIGTERM from `docker stop` reaches Python directly,
+        # without a shell wrapper interposing.
+        exec python tests/e2e/run_backend.py
+        ;;
+    worker)
+        # Worker doesn't run migrations — the api role does that exactly once.
+        # We still wait for db so Celery's broker connection check doesn't
+        # race against an unready Postgres on cold start.
+        wait_for_db
+        exec python tests/e2e/run_celery.py
+        ;;
+    *)
+        echo "[e2e-entrypoint] ERROR: unknown SERVICE_ROLE='${SERVICE_ROLE}' (expected: api | worker)" >&2
+        exit 1
+        ;;
+esac
diff --git a/surfsense_backend/scripts/docker/entrypoint.sh b/surfsense_backend/scripts/docker/entrypoint.sh
index 7bfcfce865..81db1ae847 100755
--- a/surfsense_backend/scripts/docker/entrypoint.sh
+++ b/surfsense_backend/scripts/docker/entrypoint.sh
@@ -4,10 +4,15 @@ set -e
 # ─────────────────────────────────────────────────────────────
 # SERVICE_ROLE controls which process(es) this container runs.
 #
-#   api     – FastAPI backend only  (runs migrations on startup)
+#   migrate – Run `alembic upgrade head`, verify zero_publication,
+#             then exit 0. Used by the dedicated `migrations` service
+#             in docker-compose.yml so downstream services can gate
+#             on `condition: service_completed_successfully`.
+#   api     – FastAPI backend only (does NOT run migrations)
 #   worker  – Celery worker only
 #   beat    – Celery beat scheduler only
-#   all     – All three in one container (legacy / dev default)
+#   all     – migrations + api + worker + beat in one container
+#             (legacy / dev default; fails fast on migration error)
 #
 # Set SERVICE_ROLE as an environment variable in Coolify for
 # each service deployment.
@@ -41,7 +46,13 @@ cleanup() {
 
 trap cleanup SIGTERM SIGINT
 
-# ── Database migrations (only for api / all) ─────────────────
+# ── Database migrations (only for migrate / all) ─────────────
+# Fail-fast contract:
+#   - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
+#   - zero_publication must exist in pg_publication afterwards
+# Either failure exits non-zero so the dedicated `migrations` compose
+# service exits non-zero, halting the rest of the stack instead of
+# silently producing a half-built system that crash-loops zero-cache.
 run_migrations() {
     echo "Running database migrations..."
     for i in {1..30}; do
@@ -53,11 +64,66 @@ run_migrations() {
         sleep 1
     done
 
-    if timeout 300 alembic upgrade head 2>&1; then
-        echo "Migrations completed successfully."
-    else
-        echo "WARNING: Migration failed or timed out. Continuing anyway..."
-        echo "You may need to run migrations manually: alembic upgrade head"
+    local timeout_secs="${MIGRATION_TIMEOUT:-900}"
+    echo "Running alembic upgrade head (timeout=${timeout_secs}s)..."
+    if ! timeout "${timeout_secs}" alembic upgrade head; then
+        echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2
+        echo "Refusing to start. Inspect the error above and re-run." >&2
+        exit 1
+    fi
+    echo "Migrations completed successfully."
+
+    echo "Verifying zero_publication exists in Postgres..."
+    local pub_oid
+    pub_oid=$(python <<'PY' 2>/dev/null || true
+import asyncio
+import sys
+from sqlalchemy import text
+from app.db import engine
+
+
+async def get_oid():
+    async with engine.connect() as conn:
+        result = await conn.execute(
+            text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
+        )
+        row = result.first()
+        if row is None:
+            sys.exit(1)
+        print(int(row[0]))
+
+
+asyncio.run(get_oid())
+PY
+)
+    if [ -z "${pub_oid}" ]; then
+        echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
+        echo "This usually means migration 116 (or a later publication migration) did not run." >&2
+        echo "Inspect alembic state with:" >&2
+        echo "  docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
+        exit 1
+    fi
+    echo "zero_publication verified (oid=${pub_oid})."
+
+    # Stale-replica safety net: if /zero-init is mounted (i.e. we are the
+    # dedicated `migrations` compose service), drop a marker file when the
+    # publication oid changed (or on first run) so the wrapped zero-cache
+    # entrypoint can wipe /data/zero.db before starting. This recovers from
+    # the case where a previous zero-cache crashed mid-init and left a
+    # half-built SQLite replica without a `_zero.tableMetadata` table.
+    if [ -d /zero-init ]; then
+        local stored_oid=""
+        [ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true)
+        if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then
+            echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing /zero-init/needs_reset."
+            : > /zero-init/needs_reset
+            chmod 666 /zero-init/needs_reset 2>/dev/null || true
+        fi
+        echo "${pub_oid}" > /zero-init/last_pub_oid
+        chmod 666 /zero-init/last_pub_oid 2>/dev/null || true
+        # World-writable dir so the (possibly non-root) zero-cache container
+        # can `rm -f /zero-init/needs_reset` after acting on the marker.
+        chmod 777 /zero-init 2>/dev/null || true
     fi
 }
 
@@ -102,8 +168,12 @@ start_beat() {
 
 # ── Main: run based on role ──────────────────────────────────
 case "${SERVICE_ROLE}" in
-    api)
+    migrate)
         run_migrations
+        echo "Migrations complete; exiting cleanly."
+        exit 0
+        ;;
+    api)
         start_api
         ;;
     worker)
@@ -121,7 +191,7 @@ case "${SERVICE_ROLE}" in
         start_beat
         ;;
     *)
-        echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: api, worker, beat, or all"
+        echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: migrate, api, worker, beat, or all"
         exit 1
         ;;
 esac
diff --git a/surfsense_backend/scripts/verify_chat_image_capability.py b/surfsense_backend/scripts/verify_chat_image_capability.py
new file mode 100644
index 0000000000..a49d4eab26
--- /dev/null
+++ b/surfsense_backend/scripts/verify_chat_image_capability.py
@@ -0,0 +1,558 @@
+"""End-to-end smoke test for vision / image config wiring.
+
+Loads the live ``global_llm_config.yaml`` (no mocking, no fixtures) and
+exercises every chat / vision / image-generation config + the OpenRouter
+dynamic catalog. For each config the script:
+
+1. Reports the resolver classification (catalog-allow vs strict-block).
+2. Optionally fires a tiny live API call against the provider:
+   - Chat configs: ``litellm.acompletion`` with a 1x1 PNG and the prompt
+     ``"reply with one word: ok"``.
+   - Vision configs: same, against the dedicated vision router pool.
+   - Image-gen configs: ``litellm.aimage_generation`` with a single tiny
+     prompt and ``n=1``.
+   - OpenRouter integration: samples one chat, one vision, one image-gen
+     model from the dynamically fetched catalog.
+
+Usage::
+
+    python -m scripts.verify_chat_image_capability             # capability + connectivity
+    python -m scripts.verify_chat_image_capability --no-live   # capability resolver only
+
+The script is meant to be runnable from the repository root or from
+``surfsense_backend/`` and prints a short PASS/FAIL/SKIP summary at the
+end so it's usable as a CI smoke check too.
+
+Live-mode caveat: each successful call costs a small amount of provider
+credit (a few tokens or one tiny generated image per config). The
+default size for image generation is ``1024x1024`` because Azure
+GPT-image deployments reject smaller sizes; OpenRouter image-gen models
+generally accept the same size.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import os
+import sys
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+# Bootstrap the surfsense_backend package on sys.path so the script runs
+# from the repo root or from `surfsense_backend/` interchangeably.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_BACKEND_ROOT = os.path.dirname(_HERE)
+if _BACKEND_ROOT not in sys.path:
+    sys.path.insert(0, _BACKEND_ROOT)
+
+import litellm  # noqa: E402
+
+from app.config import config  # noqa: E402
+from app.services.openrouter_integration_service import (  # noqa: E402
+    _OPENROUTER_DYNAMIC_MARKER,
+    OpenRouterIntegrationService,
+)
+from app.services.provider_api_base import resolve_api_base  # noqa: E402
+from app.services.provider_capabilities import (  # noqa: E402
+    derive_supports_image_input,
+    is_known_text_only_chat_model,
+)
+
+logging.basicConfig(
+    level=logging.WARNING,
+    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
+)
+# Quiet down LiteLLM's verbose router/cost logs so the script output is
+# scannable.
+logging.getLogger("LiteLLM").setLevel(logging.ERROR)
+logging.getLogger("litellm").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+
+# 1x1 transparent PNG — used as the cheapest possible vision payload.
+_TINY_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+_TINY_PNG_DATA_URL = f"data:image/png;base64,{_TINY_PNG_B64}"
+
+
+# ---------------------------------------------------------------------------
+# Result accounting
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ProbeResult:
+    label: str
+    surface: str
+    config_id: int | str
+    capability_ok: bool | None = None
+    capability_note: str = ""
+    live_ok: bool | None = None
+    live_note: str = ""
+    duration_s: float = 0.0
+
+
+@dataclass
+class Report:
+    results: list[ProbeResult] = field(default_factory=list)
+
+    def add(self, r: ProbeResult) -> None:
+        self.results.append(r)
+
+    def render(self) -> int:
+        passed = failed = skipped = 0
+        print()
+        print("=" * 92)
+        print(
+            f"{'Surface':<14}{'ID':>8}  {'Cap':>5}  {'Live':>5}  {'Time':>6}  Label  /  notes"
+        )
+        print("-" * 92)
+        for r in self.results:
+
+            def _flag(value: bool | None) -> str:
+                if value is None:
+                    return "skip"
+                return "ok" if value else "fail"
+
+            cap = _flag(r.capability_ok)
+            live = _flag(r.live_ok)
+            if r.capability_ok is False or r.live_ok is False:
+                failed += 1
+            elif r.capability_ok is None and r.live_ok is None:
+                skipped += 1
+            else:
+                passed += 1
+            print(
+                f"{r.surface:<14}{r.config_id!s:>8}  {cap:>5}  {live:>5}  "
+                f"{r.duration_s:>5.2f}s  {r.label}"
+            )
+            if r.capability_note:
+                print(f"               cap:  {r.capability_note}")
+            if r.live_note:
+                print(f"               live: {r.live_note}")
+        print("-" * 92)
+        print(
+            f"Total: {passed} ok / {failed} fail / {skipped} skip "
+            f"(of {len(self.results)} probes)"
+        )
+        print("=" * 92)
+        return failed
+
+
+# ---------------------------------------------------------------------------
+# Capability probes (no network)
+# ---------------------------------------------------------------------------
+
+
+def _probe_chat_capability(cfg: dict) -> tuple[bool, str]:
+    """For chat configs the catalog flag is *expected* True (vision-capable
+    pool). The probe reports both the resolver value and the strict
+    safety-net value to surface any drift between them."""
+    litellm_params = cfg.get("litellm_params") or {}
+    base_model = (
+        litellm_params.get("base_model") if isinstance(litellm_params, dict) else None
+    )
+    cap = derive_supports_image_input(
+        provider=cfg.get("provider"),
+        model_name=cfg.get("model_name"),
+        base_model=base_model,
+        custom_provider=cfg.get("custom_provider"),
+    )
+    block = is_known_text_only_chat_model(
+        provider=cfg.get("provider"),
+        model_name=cfg.get("model_name"),
+        base_model=base_model,
+        custom_provider=cfg.get("custom_provider"),
+    )
+    note = f"derive={cap} strict_block={block}"
+    if not cap and not block:
+        # Resolver said False but strict gate is also False — that means
+        # OR modalities published [text] explicitly. Surface it.
+        note += " (OR modality says text-only)"
+    # We accept a True derive *or* (False derive AND False block) as
+    # 'capability ok' — either way, the streaming task will flow through.
+    ok = cap or not block
+    return ok, note
+
+
+def _build_chat_model_string(cfg: dict) -> str:
+    if cfg.get("custom_provider"):
+        return f"{cfg['custom_provider']}/{cfg['model_name']}"
+    from app.services.provider_capabilities import _PROVIDER_PREFIX_MAP
+
+    prefix = _PROVIDER_PREFIX_MAP.get(
+        (cfg.get("provider") or "").upper(), (cfg.get("provider") or "").lower()
+    )
+    return f"{prefix}/{cfg['model_name']}"
+
+
+# ---------------------------------------------------------------------------
+# Live probes (network calls)
+# ---------------------------------------------------------------------------
+
+
+async def _live_chat_image_call(cfg: dict) -> tuple[bool, str]:
+    """Send a 1x1 PNG + `reply with one word: ok` to the chat config."""
+    model_string = _build_chat_model_string(cfg)
+    api_base = resolve_api_base(
+        provider=cfg.get("provider"),
+        provider_prefix=model_string.split("/", 1)[0],
+        config_api_base=cfg.get("api_base") or None,
+    )
+    kwargs: dict[str, Any] = {
+        "model": model_string,
+        "api_key": cfg.get("api_key"),
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "reply with one word: ok"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": _TINY_PNG_DATA_URL},
+                    },
+                ],
+            }
+        ],
+        "max_tokens": 16,
+        "timeout": 60,
+    }
+    if api_base:
+        kwargs["api_base"] = api_base
+    if cfg.get("litellm_params"):
+        # Strip pricing keys — they're tracking-only and confuse some
+        # provider validators (e.g. azure/openai reject unknown kwargs
+        # in strict mode).
+        merged = {
+            k: v
+            for k, v in dict(cfg["litellm_params"]).items()
+            if k
+            not in {
+                "input_cost_per_token",
+                "output_cost_per_token",
+                "input_cost_per_pixel",
+                "output_cost_per_pixel",
+            }
+        }
+        kwargs.update(merged)
+    try:
+        resp = await litellm.acompletion(**kwargs)
+    except Exception as exc:
+        return False, f"{type(exc).__name__}: {exc}"
+    text = resp.choices[0].message.content if resp.choices else ""
+    return True, f"got reply ({(text or '').strip()[:40]!r})"
+
+
+# Gemini image models occasionally return zero-length ``data`` for the
+# minimal "red dot on white" prompt (provider-side safety / empty-output
+# quirk reproducible against ``google/gemini-2.5-flash-image`` even when
+# the request itself succeeds). Use a more naturalistic prompt and
+# retry once with a different one before giving up.
+_IMAGE_GEN_PROMPTS: tuple[str, ...] = (
+    "A simple icon of a coffee cup, flat illustration",
+    "A small green leaf on a white background",
+)
+
+
+async def _live_image_gen_call(cfg: dict) -> tuple[bool, str]:
+    """Generate one tiny image to verify the deployment is reachable."""
+    from app.services.provider_capabilities import _PROVIDER_PREFIX_MAP
+
+    if cfg.get("custom_provider"):
+        prefix = cfg["custom_provider"]
+    else:
+        prefix = _PROVIDER_PREFIX_MAP.get(
+            (cfg.get("provider") or "").upper(), (cfg.get("provider") or "").lower()
+        )
+    model_string = f"{prefix}/{cfg['model_name']}"
+    api_base = resolve_api_base(
+        provider=cfg.get("provider"),
+        provider_prefix=prefix,
+        config_api_base=cfg.get("api_base") or None,
+    )
+    base_kwargs: dict[str, Any] = {
+        "model": model_string,
+        "api_key": cfg.get("api_key"),
+        "n": 1,
+        "size": "1024x1024",
+        "timeout": 120,
+    }
+    if api_base:
+        base_kwargs["api_base"] = api_base
+    if cfg.get("api_version"):
+        base_kwargs["api_version"] = cfg["api_version"]
+    if cfg.get("litellm_params"):
+        base_kwargs.update(
+            {
+                k: v
+                for k, v in dict(cfg["litellm_params"]).items()
+                if k
+                not in {
+                    "input_cost_per_token",
+                    "output_cost_per_token",
+                    "input_cost_per_pixel",
+                    "output_cost_per_pixel",
+                }
+            }
+        )
+
+    last_note = ""
+    for attempt, prompt in enumerate(_IMAGE_GEN_PROMPTS, start=1):
+        try:
+            resp = await litellm.aimage_generation(prompt=prompt, **base_kwargs)
+        except Exception as exc:
+            last_note = f"{type(exc).__name__}: {exc}"
+            continue
+        data_count = len(getattr(resp, "data", None) or [])
+        if data_count > 0:
+            return True, (
+                f"received {data_count} image(s) on attempt {attempt} "
+                f"(prompt={prompt!r})"
+            )
+        last_note = (
+            f"call ok but received 0 images on attempt {attempt} (prompt={prompt!r})"
+        )
+    return False, last_note
+
+
+# ---------------------------------------------------------------------------
+# Probe drivers
+# ---------------------------------------------------------------------------
+
+
+def _is_or_dynamic(cfg: dict) -> bool:
+    return bool(cfg.get(_OPENROUTER_DYNAMIC_MARKER))
+
+
+async def probe_chat_configs(report: Report, *, live: bool) -> None:
+    print("\n[chat configs from global_llm_configs (YAML-static)]")
+    for cfg in config.GLOBAL_LLM_CONFIGS:
+        # Skip OR dynamic entries here — handled in the OR section so
+        # the YAML / OR split stays clear in the report.
+        if _is_or_dynamic(cfg):
+            continue
+        result = ProbeResult(
+            label=str(cfg.get("name") or cfg.get("model_name")),
+            surface="chat-yaml",
+            config_id=cfg.get("id"),
+        )
+        cap_ok, cap_note = _probe_chat_capability(cfg)
+        result.capability_ok = cap_ok
+        result.capability_note = cap_note
+        if live:
+            t0 = time.perf_counter()
+            ok, note = await _live_chat_image_call(cfg)
+            result.live_ok = ok
+            result.live_note = note
+            result.duration_s = time.perf_counter() - t0
+        report.add(result)
+
+
+async def probe_vision_configs(report: Report, *, live: bool) -> None:
+    print("\n[vision configs from global_vision_llm_configs (YAML-static)]")
+    for cfg in config.GLOBAL_VISION_LLM_CONFIGS:
+        if _is_or_dynamic(cfg):
+            continue
+        result = ProbeResult(
+            label=str(cfg.get("name") or cfg.get("model_name")),
+            surface="vision",
+            config_id=cfg.get("id"),
+        )
+        # For vision configs, capability is implied — they're in the
+        # dedicated vision pool. Run the same resolver to flag any
+        # surprise disagreement.
+        cap_ok, cap_note = _probe_chat_capability(cfg)
+        result.capability_ok = cap_ok
+        result.capability_note = cap_note
+        if live:
+            t0 = time.perf_counter()
+            ok, note = await _live_chat_image_call(cfg)
+            result.live_ok = ok
+            result.live_note = note
+            result.duration_s = time.perf_counter() - t0
+        report.add(result)
+
+
+async def probe_image_gen_configs(report: Report, *, live: bool) -> None:
+    print(
+        "\n[image generation configs from global_image_generation_configs (YAML-static)]"
+    )
+    for cfg in config.GLOBAL_IMAGE_GEN_CONFIGS:
+        if _is_or_dynamic(cfg):
+            continue
+        result = ProbeResult(
+            label=str(cfg.get("name") or cfg.get("model_name")),
+            surface="image-gen",
+            config_id=cfg.get("id"),
+        )
+        # Image gen configs don't have a "supports_image_input" flag;
+        # the catalog tracks output, not input. Mark capability as None
+        # (skip) for the report.
+        if live:
+            t0 = time.perf_counter()
+            ok, note = await _live_image_gen_call(cfg)
+            result.live_ok = ok
+            result.live_note = note
+            result.duration_s = time.perf_counter() - t0
+        report.add(result)
+
+
+async def probe_openrouter_catalog(report: Report, *, live: bool) -> None:
+    """Sample one chat (vision-capable), one vision, one image-gen model
+    from the live OpenRouter catalogue. Doesn't iterate the full pool
+    (would be hundreds of probes); just validates the integration end-
+    to-end on a representative model from each surface."""
+    print("\n[OpenRouter integration: sampled probes]")
+    settings = config.OPENROUTER_INTEGRATION_SETTINGS
+    if not settings:
+        report.add(
+            ProbeResult(
+                label="OpenRouter integration",
+                surface="openrouter",
+                config_id="settings",
+                capability_ok=None,
+                capability_note="openrouter_integration disabled in YAML — skipping",
+                live_ok=None,
+            )
+        )
+        return
+
+    service = OpenRouterIntegrationService.get_instance()
+    or_chat = [
+        c
+        for c in config.GLOBAL_LLM_CONFIGS
+        if c.get("provider") == "OPENROUTER" and c.get("supports_image_input")
+    ]
+    or_vision = [
+        c for c in config.GLOBAL_VISION_LLM_CONFIGS if c.get("provider") == "OPENROUTER"
+    ]
+    or_image_gen = [
+        c for c in config.GLOBAL_IMAGE_GEN_CONFIGS if c.get("provider") == "OPENROUTER"
+    ]
+
+    # Pick one representative per provider family per surface so a single
+    # broken vendor (e.g. Anthropic key revoked, Google quota exceeded)
+    # surfaces independently of the others. Each needle matches the
+    # OpenRouter ``model_name`` prefix; the first match wins.
+    def _pick_first(pool: list[dict], needle: str) -> dict | None:
+        for c in pool:
+            if (c.get("model_name") or "").lower().startswith(needle):
+                return c
+        return None
+
+    chat_picks = [
+        ("or-chat", _pick_first(or_chat, "openai/gpt-4o")),
+        ("or-chat", _pick_first(or_chat, "anthropic/claude")),
+        ("or-chat", _pick_first(or_chat, "google/gemini-2.5-flash")),
+    ]
+    vision_picks = [
+        ("or-vision", _pick_first(or_vision, "openai/gpt-4o")),
+        ("or-vision", _pick_first(or_vision, "anthropic/claude")),
+        ("or-vision", _pick_first(or_vision, "google/gemini-2.5-flash")),
+    ]
+    image_picks = [
+        ("or-image", _pick_first(or_image_gen, "google/gemini-2.5-flash-image")),
+        # OpenRouter publishes OpenAI image models as ``openai/gpt-5-image*``
+        # / ``openai/gpt-5.4-image-2`` (no ``gpt-image`` literal). Match
+        # the actual prefix.
+        ("or-image", _pick_first(or_image_gen, "openai/gpt-5-image")),
+    ]
+
+    print(
+        f"  catalog: chat={len(or_chat)} vision={len(or_vision)} image_gen={len(or_image_gen)} "
+        f"(service initialized={service.is_initialized() if hasattr(service, 'is_initialized') else 'n/a'})"
+    )
+
+    for surface, picked in chat_picks + vision_picks + image_picks:
+        if not picked:
+            report.add(
+                ProbeResult(
+                    label=f"<no candidate for {surface}>",
+                    surface=surface,
+                    config_id="-",
+                    capability_ok=None,
+                    capability_note="no candidate found in OR catalog",
+                )
+            )
+            continue
+        runner = (
+            _live_image_gen_call if surface == "or-image" else _live_chat_image_call
+        )
+        result = ProbeResult(
+            label=str(picked.get("model_name")),
+            surface=surface,
+            config_id=picked.get("id"),
+        )
+        if surface != "or-image":
+            cap_ok, cap_note = _probe_chat_capability(picked)
+            result.capability_ok = cap_ok
+            result.capability_note = cap_note
+        if live:
+            t0 = time.perf_counter()
+            ok, note = await runner(picked)
+            result.live_ok = ok
+            result.live_note = note
+            result.duration_s = time.perf_counter() - t0
+        report.add(result)
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+
+async def main(args: argparse.Namespace) -> int:
+    print("Loaded global configs:")
+    print(f"  chat:      {len(config.GLOBAL_LLM_CONFIGS)} entries")
+    print(f"  vision:    {len(config.GLOBAL_VISION_LLM_CONFIGS)} entries")
+    print(f"  image-gen: {len(config.GLOBAL_IMAGE_GEN_CONFIGS)} entries")
+    print(f"  OR settings present: {bool(config.OPENROUTER_INTEGRATION_SETTINGS)}")
+
+    # Initialize the OpenRouter integration so the catalog is populated
+    # (this is what main.py does at startup). It's idempotent.
+    if config.OPENROUTER_INTEGRATION_SETTINGS:
+        try:
+            from app.config import initialize_openrouter_integration
+
+            initialize_openrouter_integration()
+        except Exception as exc:
+            print(f"  WARNING: OpenRouter integration init failed: {exc}")
+
+    print(
+        f"\nMode: {'LIVE (will hit providers)' if args.live else 'DRY (capability only)'}"
+    )
+
+    report = Report()
+    if not args.skip_chat:
+        await probe_chat_configs(report, live=args.live)
+    if not args.skip_vision:
+        await probe_vision_configs(report, live=args.live)
+    if not args.skip_image_gen:
+        await probe_image_gen_configs(report, live=args.live)
+    if not args.skip_openrouter:
+        await probe_openrouter_catalog(report, live=args.live)
+
+    failed = report.render()
+    return 1 if failed else 0
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--no-live",
+        dest="live",
+        action="store_false",
+        help="Skip live API calls — capability resolver only.",
+    )
+    parser.set_defaults(live=True)
+    parser.add_argument("--skip-chat", action="store_true")
+    parser.add_argument("--skip-vision", action="store_true")
+    parser.add_argument("--skip-image-gen", action="store_true")
+    parser.add_argument("--skip-openrouter", action="store_true")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = _parse_args()
+    sys.exit(asyncio.run(main(args)))
diff --git a/surfsense_backend/tests/e2e/README.md b/surfsense_backend/tests/e2e/README.md
new file mode 100644
index 0000000000..caa0f89b0c
--- /dev/null
+++ b/surfsense_backend/tests/e2e/README.md
@@ -0,0 +1,142 @@
+# Backend E2E Harness
+
+This directory contains the test-only backend entrypoints and fakes used by
+Playwright. They are not part of the production image: `.dockerignore` excludes
+`tests/`, and the E2E Docker stage copies this directory through a separate
+build context.
+
+## Files
+
+| Path | Purpose |
+| --- | --- |
+| `run_backend.py` | Starts FastAPI after installing the test fakes into `sys.modules`. |
+| `run_celery.py` | Starts the Celery worker with the same fake setup. |
+| `middleware/scenario.py` | Reads `X-E2E-Scenario` into a request-scoped context var. |
+| `fakes/composio_module.py` | Fake `composio` package used by connector flows. |
+| `fakes/llm.py` | Fake chat model factory. |
+| `fakes/embeddings.py` | Deterministic embedding helpers. |
+| `fakes/fixtures/drive_files.json` | Drive fixture data and canary file contents. |
+
+## Why the import hook exists
+
+Some production modules import SDK clients at module load time, for example
+`from composio import Composio`. By the time `app.app` has been imported, those
+bindings are already fixed.
+
+The E2E entrypoints install fake modules in `sys.modules` before importing any
+`app.*` module. That lets the normal production code run while SDK calls resolve
+to local fakes.
+
+The fakes should fail loudly. If production starts using a new SDK method that
+the fake does not implement, add that method to the fake instead of letting the
+test call the real service.
+
+## Adding a fake
+
+1. Add `fakes/<sdk>_module.py`.
+2. Register it in both `run_backend.py` and `run_celery.py` before importing
+   `app.app` or `app.celery_app`.
+3. If the fake needs per-test behavior, read the current scenario from
+   `tests.e2e.middleware.scenario.current_scenario()`.
+
+## Shared with backend integration tests
+
+Backend integration tests can use the same fakes when they need production route
+code without the real SDK:
+
+```python
+from tests.e2e.fakes import composio_module as _fake_composio
+sys.modules["composio"] = _fake_composio
+from app.app import app
+```
+
+See `surfsense_backend/tests/integration/composio/conftest.py` for the current
+pattern.
+
+## Running locally
+
+The recommended local flow runs only Postgres and Redis in Docker, and the
+backend + Celery worker on the host. No `.env` file is required: both
+entrypoints `setdefault` every variable they need (DB URL, Redis URL,
+sentinel API keys, etc.) to values that match `docker-compose.deps-only.yml`.
+
+### One-time setup
+
+From `surfsense_web/`:
+
+```bash
+pnpm install
+pnpm exec playwright install --with-deps chromium
+```
+
+### Each run
+
+**1. Bring up Postgres + Redis** from the repo root (the other deps-only
+services (SearXNG, Zero, pgAdmin) are not needed for E2E):
+
+```bash
+docker compose -f docker/docker-compose.deps-only.yml up -d db redis
+```
+
+**2. Start the backend** in `surfsense_backend/`, terminal A:
+
+```bash
+uv sync
+uv run alembic upgrade head
+uv run python tests/e2e/run_backend.py
+```
+
+**3. Start the Celery worker** in `surfsense_backend/`, terminal B:
+
+```bash
+uv run python tests/e2e/run_celery.py
+```
+
+**4. Register the Playwright user**:
+
+```bash
+curl -X POST http://localhost:8000/auth/register \
+  -H "Content-Type: application/json" \
+  -d '{"email":"e2e-test@surfsense.net","password":"E2eTestPassword123!"}'
+```
+
+**5. Run Playwright** from `surfsense_web/`, terminal C:
+
+```bash
+pnpm test:e2e             # dev server (fast iteration)
+pnpm test:e2e:headed      # show the browser
+pnpm test:e2e:ui          # Playwright UI mode
+pnpm test:e2e:prod        # build + start (matches CI exactly)
+```
+
+`playwright.config.ts` and the run scripts share defaults, so this works on a
+fresh checkout. Set `PLAYWRIGHT_TEST_EMAIL`, `PLAYWRIGHT_TEST_PASSWORD`,
+`NEXT_PUBLIC_FASTAPI_BACKEND_URL`, or any backend env (e.g. `DATABASE_URL`)
+only when pointing tests at a different stack.
+
+### Cleanup
+
+```bash
+docker compose -f docker/docker-compose.deps-only.yml down
+```
+
+Add `-v` to also wipe the Postgres volume.
+
+### Hermetic alternative (matches CI)
+
+To reproduce the CI environment exactly — backend and Celery in containers,
+network egress denied at L3 — replace steps 1–3 with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
+```
+
+Then run steps 4 (curl register) and 5 (`pnpm test:e2e:prod`) as above. Tear
+down with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans
+```
+
+This builds the ~9 GB `surfsense-e2e-backend:local` image, so the deps-only
+flow above is faster for day-to-day development.
diff --git a/surfsense_backend/tests/e2e/__init__.py b/surfsense_backend/tests/e2e/__init__.py
new file mode 100644
index 0000000000..4b4aa76ae0
--- /dev/null
+++ b/surfsense_backend/tests/e2e/__init__.py
@@ -0,0 +1,7 @@
+"""E2E test harness root.
+
+This package is loaded only by the test entrypoints
+(`tests/e2e/run_backend.py`, `tests/e2e/run_celery.py`). It is excluded
+from the production Docker image via `surfsense_backend/.dockerignore`,
+so production binaries never see this code.
+"""
diff --git a/surfsense_backend/tests/e2e/auth_mint.py b/surfsense_backend/tests/e2e/auth_mint.py
new file mode 100644
index 0000000000..f489ed2746
--- /dev/null
+++ b/surfsense_backend/tests/e2e/auth_mint.py
@@ -0,0 +1,66 @@
+"""Test-only token mint endpoint for the E2E backend entrypoint.
+
+Mounted by ``tests/e2e/run_backend.py`` so Playwright can authenticate
+the seeded e2e user without hitting ``/auth/jwt/login`` (rate-limited
+to 5/min/IP in production). NEVER ships to production: this whole
+``tests/`` tree is excluded from the production Docker image by
+``surfsense_backend/.dockerignore``.
+
+Authn: shared secret in ``X-E2E-Mint-Secret``. Same value is set on the
+backend container env (``docker/docker-compose.e2e.yml``) and exported
+to the Playwright runner (``.github/workflows/e2e-tests.yml``).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+from fastapi import APIRouter, FastAPI, Header, HTTPException
+from pydantic import BaseModel
+from sqlalchemy import select
+
+from app.db import User, async_session_maker
+from app.users import get_jwt_strategy
+
+_logger = logging.getLogger("surfsense.e2e.auth_mint")
+
+
+class MintRequest(BaseModel):
+    email: str = "e2e-test@surfsense.net"
+
+
+class MintResponse(BaseModel):
+    access_token: str
+    token_type: str = "bearer"
+
+
+def _expected_secret() -> str:
+    return os.environ.get("E2E_MINT_SECRET", "local-e2e-mint-secret-not-for-production")
+
+
+router = APIRouter(prefix="/__e2e__", tags=["__e2e__"])
+
+
+@router.post("/auth/token", response_model=MintResponse)
+async def mint_test_token(
+    body: MintRequest,
+    x_e2e_mint_secret: str = Header(..., alias="X-E2E-Mint-Secret"),
+) -> MintResponse:
+    if x_e2e_mint_secret != _expected_secret():
+        raise HTTPException(status_code=403, detail="invalid e2e mint secret")
+    async with async_session_maker() as session:
+        result = await session.execute(select(User).where(User.email == body.email))
+        user = result.scalar_one_or_none()
+    if user is None:
+        raise HTTPException(
+            status_code=404, detail=f"e2e user {body.email!r} not seeded"
+        )
+    token = await get_jwt_strategy().write_token(user)
+    return MintResponse(access_token=token)
+
+
+def install(app: FastAPI) -> None:
+    """Mount the test-only mint router onto the given FastAPI app."""
+    app.include_router(router)
+    _logger.warning("[e2e] mounted POST /__e2e__/auth/token (test-only token mint)")
diff --git a/surfsense_backend/tests/e2e/fakes/__init__.py b/surfsense_backend/tests/e2e/fakes/__init__.py
new file mode 100644
index 0000000000..6683b69315
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/__init__.py
@@ -0,0 +1,8 @@
+"""Strict fakes for third-party SDKs, used in E2E mode only.
+
+Every fake here implements __getattr__ that raises NotImplementedError
+on any unknown surface. Combined with sys.modules-level hijacking in
+run_backend.py / run_celery.py, this makes silent pass-through to the
+real SDK impossible: a future production code path that introduces a
+new SDK call site fails CI with a clear "add this to the fake" message.
+"""
diff --git a/surfsense_backend/tests/e2e/fakes/binary_loader.py b/surfsense_backend/tests/e2e/fakes/binary_loader.py
new file mode 100644
index 0000000000..5e885b8aa0
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/binary_loader.py
@@ -0,0 +1,23 @@
+"""Helpers for serving text and binary fixture file bodies."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+
+def _resolve_file_bytes(
+    fixture: dict[str, Any], key: str | None, fixtures_dir: Path
+) -> bytes | None:
+    """Resolve a fake file body, preferring binary fixture files over text."""
+    if not key:
+        return None
+
+    binary_path = fixture.get("_file_binary_paths", {}).get(key)
+    if binary_path is not None:
+        return (fixtures_dir / binary_path).read_bytes()
+
+    content = fixture.get("_file_contents", {}).get(key)
+    if content is None:
+        return None
+    return content.encode("utf-8")
diff --git a/surfsense_backend/tests/e2e/fakes/chat_llm.py b/surfsense_backend/tests/e2e/fakes/chat_llm.py
new file mode 100644
index 0000000000..fa3a2b158e
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/chat_llm.py
@@ -0,0 +1,733 @@
+from __future__ import annotations
+
+import json
+from collections.abc import AsyncIterator
+from typing import Any, Self
+
+from langchain_core.callbacks import (
+    AsyncCallbackManagerForLLMRun,
+    CallbackManagerForLLMRun,
+)
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import AIMessage, AIMessageChunk, BaseMessage
+from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
+
+DRIVE_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_DRIVE_001"
+DRIVE_PDF_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_DRIVE_PDF_001"
+DRIVE_PDF_CANARY_FILE = "e2e-canary.pdf"
+COMPOSIO_DRIVE_PDF_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_COMPOSIO_DRIVE_PDF_001"
+COMPOSIO_DRIVE_PDF_CANARY_FILE = "e2e-composio-canary.pdf"
+GMAIL_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_GMAIL_001"
+GMAIL_CANARY_SUBJECT = "E2E Canary Email"
+GMAIL_CANARY_MESSAGE_ID = "fake-msg-canary-001"
+CALENDAR_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_CALENDAR_001"
+CALENDAR_CANARY_SUMMARY = "E2E Canary Calendar Event"
+ONEDRIVE_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_001"
+ONEDRIVE_CANARY_FILE = "e2e-onedrive-canary.txt"
+ONEDRIVE_PDF_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_PDF_001"
+ONEDRIVE_PDF_CANARY_FILE = "e2e-onedrive-canary.pdf"
+DROPBOX_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_001"
+DROPBOX_CANARY_FILE = "e2e-dropbox-canary.txt"
+DROPBOX_PDF_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_PDF_001"
+DROPBOX_PDF_CANARY_FILE = "e2e-dropbox-canary.pdf"
+NOTION_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_NOTION_001"
+NOTION_CANARY_TITLE = "E2E Canary Notion Page"
+CONFLUENCE_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_CONFLUENCE_001"
+CONFLUENCE_CANARY_TITLE = "E2E Canary Confluence Page"
+LINEAR_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_LINEAR_001"
+LINEAR_CANARY_TITLE = "E2E Canary Linear Issue"
+JIRA_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_JIRA_001"
+JIRA_CANARY_SUMMARY = "E2E Canary Jira Issue"
+JIRA_CANARY_KEY = "E2E-101"
+SLACK_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_SLACK_001"
+SLACK_CANARY_CHANNEL = "slack-e2e-canary"
+CLICKUP_CANARY_TOKEN = "SURFSENSE_E2E_CANARY_TOKEN_CLICKUP_001"
+CLICKUP_CANARY_TITLE = "E2E Canary ClickUp Task"
+CLICKUP_CANARY_TASK_ID = "fake-clickup-task-canary-001"
+MANUAL_UPLOAD_MD_CANARY_TOKEN = "E2E-MANUAL-UPLOAD-MD-CANARY-7f3a"
+MANUAL_UPLOAD_MD_CANARY_FILE = "canary.md"
+MANUAL_UPLOAD_PDF_CANARY_TOKEN = "E2E-MANUAL-UPLOAD-PDF-CANARY-9d2b"
+MANUAL_UPLOAD_PDF_CANARY_FILE = "canary.pdf"
+NO_RELEVANT_CONTENT_SENTINEL = "No relevant indexed content found."
+NO_RELEVANT_CONTENT_QUERY = "E2E_NO_RELEVANT_CONTENT_SMOKE"
+
+
+def _content_to_text(content: Any) -> str:
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return " ".join(_content_to_text(item) for item in content)
+    if isinstance(content, dict):
+        text = content.get("text") or content.get("content")
+        if isinstance(text, str):
+            return text
+        return json.dumps(content, sort_keys=True)
+    if content is None:
+        return ""
+    return str(content)
+
+
+def _messages_to_text(messages: list[BaseMessage]) -> str:
+    return "\n".join(_content_to_text(message.content) for message in messages)
+
+
+def _contains_any(text: str, needles: tuple[str, ...]) -> bool:
+    lowered = text.lower()
+    return any(needle.lower() in lowered for needle in needles)
+
+
+def _latest_tool_message(messages: list[BaseMessage]) -> BaseMessage | None:
+    return next(
+        (message for message in reversed(messages) if message.type == "tool"), None
+    )
+
+
+class FakeChatLLM(BaseChatModel):
+    @property
+    def _llm_type(self) -> str:
+        return "e2e-fake-chat"
+
+    def bind_tools(self, tools: Any, **kwargs: Any) -> Self:
+        return self
+
+    def _response_for(self, messages: list[BaseMessage]) -> str:
+        latest_human = next(
+            (
+                _content_to_text(message.content)
+                for message in reversed(messages)
+                if message.type == "human"
+            ),
+            "",
+        )
+        if NO_RELEVANT_CONTENT_QUERY in latest_human:
+            return NO_RELEVANT_CONTENT_SENTINEL
+
+        prompt_text = _messages_to_text(messages)
+        latest_tool = _latest_tool_message(messages)
+        latest_tool_name = getattr(latest_tool, "name", None)
+        latest_tool_text = _content_to_text(latest_tool.content) if latest_tool else ""
+
+        if (
+            latest_tool_name == "read_gmail_email"
+            and GMAIL_CANARY_TOKEN in latest_tool_text
+        ):
+            return f"Gmail live tool content found: {GMAIL_CANARY_TOKEN}"
+        if (
+            latest_tool_name == "search_gmail"
+            and GMAIL_CANARY_MESSAGE_ID in latest_tool_text
+        ):
+            return "Reading the matching Gmail message next."
+        if (
+            latest_tool_name == "search_calendar_events"
+            and CALENDAR_CANARY_TOKEN in latest_tool_text
+        ):
+            return f"Calendar live tool content found: {CALENDAR_CANARY_TOKEN}"
+        if (
+            latest_tool_name == "list_issues"
+            and LINEAR_CANARY_TOKEN in latest_tool_text
+        ):
+            return f"Linear live tool content found: {LINEAR_CANARY_TOKEN}"
+        if (
+            latest_tool_name == "searchJiraIssuesUsingJql"
+            and JIRA_CANARY_TOKEN in latest_tool_text
+        ):
+            return f"Jira live tool content found: {JIRA_CANARY_TOKEN}"
+        if (
+            latest_tool_name == "slack_search_channels"
+            and SLACK_CANARY_TOKEN in latest_tool_text
+        ):
+            return f"Slack live tool content found: {SLACK_CANARY_TOKEN}"
+        if (
+            latest_tool_name in {"clickup_search", "clickup_get_task"}
+            and CLICKUP_CANARY_TOKEN in latest_tool_text
+        ):
+            return f"ClickUp live tool content found: {CLICKUP_CANARY_TOKEN}"
+
+        wants_gmail = _contains_any(
+            latest_human,
+            ("gmail", "email", "message", GMAIL_CANARY_SUBJECT),
+        )
+        wants_calendar = _contains_any(
+            latest_human,
+            ("calendar", "event", "meeting", CALENDAR_CANARY_SUMMARY),
+        )
+        wants_drive = _contains_any(
+            latest_human,
+            ("drive", "file", "e2e-canary.txt"),
+        )
+        wants_drive_pdf = _contains_any(
+            latest_human,
+            (
+                "drive pdf",
+                DRIVE_PDF_CANARY_FILE,
+                DRIVE_PDF_CANARY_TOKEN,
+                COMPOSIO_DRIVE_PDF_CANARY_FILE,
+                COMPOSIO_DRIVE_PDF_CANARY_TOKEN,
+            ),
+        ) or (wants_drive and "pdf" in latest_human.lower())
+        wants_onedrive = _contains_any(
+            latest_human,
+            ("onedrive", ONEDRIVE_CANARY_FILE, ONEDRIVE_CANARY_TOKEN),
+        )
+        wants_onedrive_pdf = wants_onedrive and _contains_any(
+            latest_human,
+            ("pdf", ONEDRIVE_PDF_CANARY_FILE, ONEDRIVE_PDF_CANARY_TOKEN),
+        )
+        wants_dropbox = _contains_any(
+            latest_human,
+            ("dropbox", DROPBOX_CANARY_FILE, DROPBOX_CANARY_TOKEN),
+        )
+        wants_dropbox_pdf = wants_dropbox and _contains_any(
+            latest_human,
+            ("pdf", DROPBOX_PDF_CANARY_FILE, DROPBOX_PDF_CANARY_TOKEN),
+        )
+        wants_notion = _contains_any(
+            latest_human,
+            ("notion", "page", NOTION_CANARY_TITLE),
+        )
+        wants_confluence = _contains_any(
+            latest_human,
+            ("confluence", CONFLUENCE_CANARY_TITLE),
+        )
+        wants_linear = _contains_any(
+            latest_human,
+            ("linear", "issue", LINEAR_CANARY_TITLE),
+        )
+        wants_jira = _contains_any(
+            latest_human,
+            (
+                "jira",
+                "atlassian",
+                JIRA_CANARY_SUMMARY,
+                JIRA_CANARY_KEY,
+                "surfsense-e2e.atlassian.net",
+                "fake-jira-cloud-001",
+            ),
+        )
+        wants_slack = _contains_any(
+            latest_human,
+            ("slack", SLACK_CANARY_TOKEN),
+        )
+        wants_clickup = _contains_any(
+            latest_human,
+            ("clickup", CLICKUP_CANARY_TITLE),
+        )
+        wants_manual_upload = _contains_any(
+            latest_human,
+            (
+                "uploaded",
+                "manual upload",
+                MANUAL_UPLOAD_MD_CANARY_FILE,
+                MANUAL_UPLOAD_PDF_CANARY_FILE,
+                MANUAL_UPLOAD_MD_CANARY_TOKEN,
+                MANUAL_UPLOAD_PDF_CANARY_TOKEN,
+            ),
+        )
+        wants_manual_upload_pdf = wants_manual_upload and _contains_any(
+            latest_human,
+            ("pdf", MANUAL_UPLOAD_PDF_CANARY_FILE, MANUAL_UPLOAD_PDF_CANARY_TOKEN),
+        )
+        wants_manual_upload_md = wants_manual_upload and _contains_any(
+            latest_human,
+            (
+                "markdown",
+                ".md",
+                MANUAL_UPLOAD_MD_CANARY_FILE,
+                MANUAL_UPLOAD_MD_CANARY_TOKEN,
+            ),
+        )
+        has_gmail_evidence = (
+            GMAIL_CANARY_SUBJECT in prompt_text
+            or GMAIL_CANARY_MESSAGE_ID in prompt_text
+            or GMAIL_CANARY_TOKEN in prompt_text
+        )
+        has_calendar_evidence = (
+            CALENDAR_CANARY_SUMMARY in prompt_text
+            or CALENDAR_CANARY_TOKEN in prompt_text
+        )
+        has_drive_evidence = (
+            "e2e-canary.txt" in prompt_text
+            or "fake-file-canary" in prompt_text
+            or DRIVE_CANARY_TOKEN in prompt_text
+        )
+        has_native_drive_pdf_evidence = (
+            DRIVE_PDF_CANARY_FILE in prompt_text
+            or "fake-file-pdf-native" in prompt_text
+            or DRIVE_PDF_CANARY_TOKEN in prompt_text
+        )
+        has_composio_drive_pdf_evidence = (
+            COMPOSIO_DRIVE_PDF_CANARY_FILE in prompt_text
+            or "fake-file-pdf-composio" in prompt_text
+            or COMPOSIO_DRIVE_PDF_CANARY_TOKEN in prompt_text
+        )
+        has_onedrive_evidence = (
+            ONEDRIVE_CANARY_FILE in prompt_text
+            or "fake-onedrive-canary" in prompt_text
+            or ONEDRIVE_CANARY_TOKEN in prompt_text
+        )
+        has_onedrive_pdf_evidence = (
+            ONEDRIVE_PDF_CANARY_FILE in prompt_text
+            or "fake-onedrive-pdf-canary" in prompt_text
+            or ONEDRIVE_PDF_CANARY_TOKEN in prompt_text
+        )
+        has_dropbox_evidence = (
+            DROPBOX_CANARY_FILE in prompt_text
+            or "id:fake-dropbox-canary" in prompt_text
+            or DROPBOX_CANARY_TOKEN in prompt_text
+        )
+        has_dropbox_pdf_evidence = (
+            DROPBOX_PDF_CANARY_FILE in prompt_text
+            or "id:fake-dropbox-pdf-canary" in prompt_text
+            or DROPBOX_PDF_CANARY_TOKEN in prompt_text
+        )
+        has_notion_evidence = (
+            NOTION_CANARY_TITLE in prompt_text or NOTION_CANARY_TOKEN in prompt_text
+        )
+        has_confluence_evidence = (
+            CONFLUENCE_CANARY_TITLE in prompt_text
+            or CONFLUENCE_CANARY_TOKEN in prompt_text
+            or "fake-confluence-page-canary-001" in prompt_text
+            or "fake-confluence-space-001" in prompt_text
+        )
+        has_linear_evidence = (
+            LINEAR_CANARY_TITLE in prompt_text
+            or LINEAR_CANARY_TOKEN in prompt_text
+            or "fake-linear-issue-canary-001" in prompt_text
+        )
+        has_jira_evidence = (
+            JIRA_CANARY_SUMMARY in prompt_text
+            or JIRA_CANARY_TOKEN in prompt_text
+            or JIRA_CANARY_KEY in prompt_text
+            or "fake-jira-issue-canary-001" in prompt_text
+            or "fake-jira-cloud-001" in prompt_text
+            or "surfsense-e2e.atlassian.net" in prompt_text
+        )
+        has_slack_evidence = (
+            SLACK_CANARY_CHANNEL in prompt_text
+            or SLACK_CANARY_TOKEN in prompt_text
+            or "C_FAKE_SLACK_CANARY" in prompt_text
+            or "T_FAKE_SLACK_TEAM" in prompt_text
+        )
+        has_clickup_evidence = (
+            CLICKUP_CANARY_TITLE in prompt_text
+            or CLICKUP_CANARY_TOKEN in prompt_text
+            or CLICKUP_CANARY_TASK_ID in prompt_text
+        )
+        has_manual_upload_md_evidence = (
+            MANUAL_UPLOAD_MD_CANARY_FILE in prompt_text
+            or MANUAL_UPLOAD_MD_CANARY_TOKEN in prompt_text
+        )
+        has_manual_upload_pdf_evidence = (
+            MANUAL_UPLOAD_PDF_CANARY_FILE in prompt_text
+            or MANUAL_UPLOAD_PDF_CANARY_TOKEN in prompt_text
+        )
+
+        if wants_clickup and has_clickup_evidence:
+            return f"ClickUp content found: {CLICKUP_CANARY_TOKEN}"
+        if wants_slack and has_slack_evidence:
+            return f"Slack content found: {SLACK_CANARY_TOKEN}"
+        if wants_jira and has_jira_evidence:
+            return f"Jira content found: {JIRA_CANARY_TOKEN}"
+        if wants_linear and has_linear_evidence:
+            return f"Linear content found: {LINEAR_CANARY_TOKEN}"
+        if wants_confluence and has_confluence_evidence:
+            return f"Confluence content found: {CONFLUENCE_CANARY_TOKEN}"
+        if wants_notion and has_notion_evidence:
+            return f"Notion content found: {NOTION_CANARY_TOKEN}"
+        if wants_calendar and has_calendar_evidence:
+            return f"Calendar content found: {CALENDAR_CANARY_TOKEN}"
+        if wants_gmail and has_gmail_evidence:
+            return f"Gmail content found: {GMAIL_CANARY_TOKEN}"
+        if wants_onedrive_pdf and has_onedrive_pdf_evidence:
+            return f"OneDrive PDF content found: {ONEDRIVE_PDF_CANARY_TOKEN}"
+        if wants_onedrive and has_onedrive_evidence:
+            return f"OneDrive content found: {ONEDRIVE_CANARY_TOKEN}"
+        if wants_dropbox_pdf and has_dropbox_pdf_evidence:
+            return f"Dropbox PDF content found: {DROPBOX_PDF_CANARY_TOKEN}"
+        if wants_dropbox and has_dropbox_evidence:
+            return f"Dropbox content found: {DROPBOX_CANARY_TOKEN}"
+        if wants_drive_pdf and has_native_drive_pdf_evidence:
+            return f"Drive PDF content found: {DRIVE_PDF_CANARY_TOKEN}"
+        if wants_drive_pdf and has_composio_drive_pdf_evidence:
+            return f"Drive PDF content found: {COMPOSIO_DRIVE_PDF_CANARY_TOKEN}"
+        if wants_drive and has_drive_evidence:
+            return f"Drive content found: {DRIVE_CANARY_TOKEN}"
+        if wants_manual_upload_pdf and has_manual_upload_pdf_evidence:
+            return f"Manual upload PDF content found: {MANUAL_UPLOAD_PDF_CANARY_TOKEN}"
+        if wants_manual_upload_md and has_manual_upload_md_evidence:
+            return f"Manual upload MD content found: {MANUAL_UPLOAD_MD_CANARY_TOKEN}"
+        if (
+            has_notion_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Notion content found: {NOTION_CANARY_TOKEN}"
+        if (
+            has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Confluence content found: {CONFLUENCE_CANARY_TOKEN}"
+        if (
+            has_jira_evidence
+            and not has_confluence_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Jira content found: {JIRA_CANARY_TOKEN}"
+        if (
+            has_linear_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Linear content found: {LINEAR_CANARY_TOKEN}"
+        if (
+            has_calendar_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Calendar content found: {CALENDAR_CANARY_TOKEN}"
+        if (
+            has_gmail_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Gmail content found: {GMAIL_CANARY_TOKEN}"
+        if (
+            has_onedrive_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"OneDrive content found: {ONEDRIVE_CANARY_TOKEN}"
+        if (
+            has_dropbox_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Dropbox content found: {DROPBOX_CANARY_TOKEN}"
+        if (
+            has_drive_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_gmail_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Drive content found: {DRIVE_CANARY_TOKEN}"
+        if (
+            has_slack_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Slack content found: {SLACK_CANARY_TOKEN}"
+        if (
+            has_clickup_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+        ):
+            return f"ClickUp content found: {CLICKUP_CANARY_TOKEN}"
+        if (
+            has_manual_upload_pdf_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Manual upload PDF content found: {MANUAL_UPLOAD_PDF_CANARY_TOKEN}"
+        if (
+            has_manual_upload_md_evidence
+            and not has_confluence_evidence
+            and not has_jira_evidence
+            and not has_linear_evidence
+            and not has_notion_evidence
+            and not has_calendar_evidence
+            and not has_gmail_evidence
+            and not has_drive_evidence
+            and not has_onedrive_evidence
+            and not has_dropbox_evidence
+            and not has_slack_evidence
+            and not has_clickup_evidence
+        ):
+            return f"Manual upload MD content found: {MANUAL_UPLOAD_MD_CANARY_TOKEN}"
+        return NO_RELEVANT_CONTENT_SENTINEL
+
+    def _tool_call_message_for(self, messages: list[BaseMessage]) -> AIMessage | None:
+        latest_human = next(
+            (
+                _content_to_text(message.content)
+                for message in reversed(messages)
+                if message.type == "human"
+            ),
+            "",
+        )
+        latest_tool = _latest_tool_message(messages)
+        latest_tool_name = getattr(latest_tool, "name", None)
+        latest_tool_text = _content_to_text(latest_tool.content) if latest_tool else ""
+
+        if (
+            latest_tool_name == "search_gmail"
+            and GMAIL_CANARY_MESSAGE_ID in latest_tool_text
+        ):
+            return AIMessage(
+                content="",
+                tool_calls=[
+                    {
+                        "name": "read_gmail_email",
+                        "args": {"message_id": GMAIL_CANARY_MESSAGE_ID},
+                        "id": "call_e2e_read_gmail",
+                    }
+                ],
+            )
+
+        if latest_tool is None and _contains_any(
+            latest_human,
+            ("gmail", "email", "message", GMAIL_CANARY_SUBJECT),
+        ):
+            return AIMessage(
+                content="",
+                tool_calls=[
+                    {
+                        "name": "search_gmail",
+                        "args": {
+                            "query": f"subject:{GMAIL_CANARY_SUBJECT}",
+                            "max_results": 5,
+                        },
+                        "id": "call_e2e_search_gmail",
+                    }
+                ],
+            )
+
+        if latest_tool is None and _contains_any(
+            latest_human,
+            ("calendar", "event", "meeting", CALENDAR_CANARY_SUMMARY),
+        ):
+            return AIMessage(
+                content="",
+                tool_calls=[
+                    {
+                        "name": "search_calendar_events",
+                        "args": {
+                            "start_date": "2026-05-07",
+                            "end_date": "2026-05-21",
+                            "max_results": 10,
+                        },
+                        "id": "call_e2e_search_calendar_events",
+                    }
+                ],
+            )
+
+        if latest_tool is None and _contains_any(
+            latest_human,
+            (
+                "jira",
+                "atlassian",
+                JIRA_CANARY_SUMMARY,
+                JIRA_CANARY_KEY,
+                "surfsense-e2e.atlassian.net",
+                "fake-jira-cloud-001",
+            ),
+        ):
+            return AIMessage(
+                content="",
+                tool_calls=[
+                    {
+                        "name": "searchJiraIssuesUsingJql",
+                        "args": {
+                            "jql": f'summary ~ "{JIRA_CANARY_SUMMARY}"',
+                            "maxResults": 5,
+                        },
+                        "id": "call_e2e_search_jira_issues",
+                    }
+                ],
+            )
+
+        if latest_tool is None and _contains_any(
+            latest_human,
+            ("linear", "issue", LINEAR_CANARY_TITLE),
+        ):
+            return AIMessage(
+                content="",
+                tool_calls=[
+                    {
+                        "name": "list_issues",
+                        "args": {"query": LINEAR_CANARY_TITLE, "limit": 5},
+                        "id": "call_e2e_list_linear_issues",
+                    }
+                ],
+            )
+
+        if latest_tool is None and _contains_any(
+            latest_human,
+            ("slack", SLACK_CANARY_TOKEN),
+        ):
+            return AIMessage(
+                content="",
+                tool_calls=[
+                    {
+                        "name": "slack_search_channels",
+                        "args": {"query": SLACK_CANARY_CHANNEL, "limit": 5},
+                        "id": "call_e2e_search_slack_channels",
+                    }
+                ],
+            )
+
+        if latest_tool is None and _contains_any(
+            latest_human,
+            ("clickup", CLICKUP_CANARY_TITLE),
+        ):
+            return AIMessage(
+                content="",
+                tool_calls=[
+                    {
+                        "name": "clickup_search",
+                        "args": {"query": CLICKUP_CANARY_TITLE, "limit": 5},
+                        "id": "call_e2e_search_clickup_tasks",
+                    }
+                ],
+            )
+
+        return None
+
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: CallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        del stop, run_manager, kwargs
+        message = self._tool_call_message_for(messages) or AIMessage(
+            content=self._response_for(messages), tool_calls=[]
+        )
+        return ChatResult(generations=[ChatGeneration(message=message)])
+
+    async def _astream(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: AsyncCallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> AsyncIterator[ChatGenerationChunk]:
+        del stop, run_manager, kwargs
+        tool_call_message = self._tool_call_message_for(messages)
+        if tool_call_message:
+            for tool_call in tool_call_message.tool_calls:
+                yield ChatGenerationChunk(
+                    message=AIMessageChunk(
+                        content="",
+                        tool_call_chunks=[
+                            {
+                                "name": tool_call["name"],
+                                "args": json.dumps(tool_call["args"]),
+                                "id": tool_call["id"],
+                                "index": 0,
+                            }
+                        ],
+                    )
+                )
+            return
+
+        yield ChatGenerationChunk(
+            message=AIMessageChunk(content=self._response_for(messages))
+        )
+
+
+def fake_create_chat_litellm_from_agent_config(
+    *args: Any, **kwargs: Any
+) -> FakeChatLLM:
+    del args, kwargs
+    return FakeChatLLM()
+
+
+def fake_create_chat_litellm_from_config(*args: Any, **kwargs: Any) -> FakeChatLLM:
+    del args, kwargs
+    return FakeChatLLM()
diff --git a/surfsense_backend/tests/e2e/fakes/clickup_module.py b/surfsense_backend/tests/e2e/fakes/clickup_module.py
new file mode 100644
index 0000000000..b847ac73f4
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/clickup_module.py
@@ -0,0 +1,134 @@
+"""Strict ClickUp MCP OAuth/tool fakes for Playwright E2E."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+
+from tests.e2e.fakes import mcp_oauth_runtime, mcp_runtime
+
+_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "clickup_tasks.json"
+
+_AUTHORIZATION_URL = "https://mcp.clickup.com/authorize"
+_REGISTRATION_URL = "https://mcp.clickup.com/register"
+_TOKEN_URL = "https://mcp.clickup.com/token"
+_MCP_URL = "https://mcp.clickup.com/mcp"
+
+_CLIENT_ID = "fake-clickup-mcp-client-id"
+_CLIENT_SECRET = "fake-clickup-mcp-client-secret"
+_ACCESS_TOKEN = "fake-clickup-mcp-access-token"
+_REFRESH_TOKEN = "fake-clickup-mcp-refresh-token"
+_OAUTH_CODE = "fake-clickup-oauth-code"
+
+
+def _load_fixture() -> dict[str, Any]:
+    with _FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_FIXTURE = _load_fixture()
+
+
+def _task_text(task: dict[str, Any]) -> str:
+    return (
+        f"{task['name']}\n"
+        f"id: {task['id']}\n"
+        f"workspace: {task['workspace_name']} ({task['workspace_id']})\n"
+        f"list: {task['list_name']}\n"
+        f"status: {task['status']}\n"
+        f"description: {task['description']}"
+    )
+
+
+async def _list_tools() -> SimpleNamespace:
+    return SimpleNamespace(
+        tools=[
+            SimpleNamespace(
+                name="clickup_search",
+                description="Search ClickUp tasks visible to the authenticated user.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "query": {
+                            "type": "string",
+                            "description": "Text to search for in ClickUp tasks.",
+                        },
+                        "limit": {
+                            "type": "integer",
+                            "description": "Maximum number of tasks to return.",
+                        },
+                    },
+                    "required": ["query"],
+                },
+            ),
+            SimpleNamespace(
+                name="clickup_get_task",
+                description="Get a ClickUp task by id.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "task_id": {
+                            "type": "string",
+                            "description": "ClickUp task id.",
+                        }
+                    },
+                    "required": ["task_id"],
+                },
+            ),
+        ]
+    )
+
+
+async def _call_tool(
+    tool_name: str, arguments: dict[str, Any] | None = None
+) -> SimpleNamespace:
+    arguments = arguments or {}
+    task = _FIXTURE["tasks"][0]
+
+    if tool_name == "clickup_search":
+        query = str(arguments.get("query", ""))
+        if query and task["name"].lower() not in query.lower():
+            raise ValueError(f"Unexpected ClickUp task query: {query!r}")
+        return SimpleNamespace(content=[SimpleNamespace(text=_task_text(task))])
+
+    if tool_name == "clickup_get_task":
+        task_id = arguments.get("task_id")
+        if task_id != task["id"]:
+            raise ValueError(f"Unexpected ClickUp task id: {task_id!r}")
+        return SimpleNamespace(content=[SimpleNamespace(text=_task_text(task))])
+
+    raise NotImplementedError(f"Unexpected ClickUp MCP tool call: {tool_name!r}")
+
+
+def install(active_patches: list[Any]) -> None:
+    """Register ClickUp MCP OAuth/tool handlers with the shared dispatchers."""
+    del active_patches
+    mcp_oauth_runtime.register_service(
+        mcp_url=_MCP_URL,
+        discovery_metadata={
+            "issuer": "https://mcp.clickup.com",
+            "authorization_endpoint": _AUTHORIZATION_URL,
+            "token_endpoint": _TOKEN_URL,
+            "registration_endpoint": _REGISTRATION_URL,
+            "code_challenge_methods_supported": ["S256"],
+            "grant_types_supported": ["authorization_code", "refresh_token"],
+            "response_types_supported": ["code"],
+        },
+        client_id=_CLIENT_ID,
+        client_secret=_CLIENT_SECRET,
+        token_endpoint=_TOKEN_URL,
+        registration_endpoint=_REGISTRATION_URL,
+        oauth_code=_OAUTH_CODE,
+        access_token=_ACCESS_TOKEN,
+        refresh_token=_REFRESH_TOKEN,
+        scope="read write",
+        redirect_uri_substring="/api/v1/auth/mcp/clickup/connector/callback",
+    )
+    mcp_runtime.register(
+        url=_MCP_URL,
+        expected_bearer=_ACCESS_TOKEN,
+        list_tools=_list_tools,
+        call_tool=_call_tool,
+    )
diff --git a/surfsense_backend/tests/e2e/fakes/composio_module.py b/surfsense_backend/tests/e2e/fakes/composio_module.py
new file mode 100644
index 0000000000..16a93d0f16
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/composio_module.py
@@ -0,0 +1,578 @@
+"""Strict drop-in replacement for the `composio` Python SDK.
+
+Registered as `sys.modules["composio"]` by `tests/e2e/run_backend.py`
+and `tests/e2e/run_celery.py` BEFORE any production code imports
+`composio`. From that point on, every `from composio import Composio`
+in production resolves to `Composio` defined here.
+
+Every class implements __getattr__ that raises NotImplementedError on
+unknown attributes. A future production code path that introduces a
+new SDK call (e.g. `client.bulk_operations.run`) fails CI loudly with
+a clear "add this surface to the fake" message instead of silently
+passing through to the real SDK.
+
+Scenario branching is read from the request-scoped ContextVar in
+`tests/e2e/middleware/scenario.py`, set by the X-E2E-Scenario header.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from .binary_loader import _resolve_file_bytes
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Fixture loading
+# ---------------------------------------------------------------------------
+
+_FIXTURES_DIR = Path(__file__).parent / "fixtures"
+_DRIVE_FIXTURE_PATH = _FIXTURES_DIR / "drive_files.json"
+_GMAIL_FIXTURE_PATH = _FIXTURES_DIR / "gmail_messages.json"
+_CALENDAR_FIXTURE_PATH = _FIXTURES_DIR / "calendar_events.json"
+_DRIVE_DOWNLOAD_DIR = Path("/tmp/surfsense-e2e-composio-downloads")
+
+
+def _load_drive_fixture() -> dict[str, Any]:
+    """Load the canned Drive fixture once per process."""
+    with _DRIVE_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+def _load_gmail_fixture() -> dict[str, Any]:
+    """Load the canned Gmail fixture once per process."""
+    with _GMAIL_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+def _load_calendar_fixture() -> dict[str, Any]:
+    """Load the canned Calendar fixture once per process."""
+    with _CALENDAR_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_DRIVE_FIXTURE = _load_drive_fixture()
+_GMAIL_FIXTURE = _load_gmail_fixture()
+_CALENDAR_FIXTURE = _load_calendar_fixture()
+
+
+def _get_scenario() -> str:
+    """Return the current X-E2E-Scenario, defaulting to 'happy'.
+
+    Imported lazily so the fake module can be loaded BEFORE
+    `tests.e2e.middleware.scenario` is importable (during sys.modules
+    hijack at the very top of the entrypoint).
+    """
+    try:
+        from tests.e2e.middleware.scenario import current_scenario
+
+        return current_scenario()
+    except Exception:
+        return "happy"
+
+
+# ---------------------------------------------------------------------------
+# Strict mixin: every fake class raises on unknown attribute access
+# ---------------------------------------------------------------------------
+
+
+class _StrictFakeMixin:
+    """Base class for fakes. Any unknown attribute access fails loudly."""
+
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E Composio fake missing surface: {self._component_name}.{name!r}. "
+            f"If production code needs this, add an explicit method to "
+            f"surfsense_backend/tests/e2e/fakes/composio_module.py — "
+            f"the strict fake refuses to silently fall through to the real SDK."
+        )
+
+
+# ---------------------------------------------------------------------------
+# Result objects mimicking the real SDK's response dataclasses
+# ---------------------------------------------------------------------------
+
+
+class _ConnectionRequest:
+    """Mimics composio.connected_accounts.initiate(...) return value."""
+
+    def __init__(self, *, redirect_url: str, account_id: str) -> None:
+        self.redirect_url = redirect_url
+        self.id = account_id
+
+
+class _ConnectedAccount:
+    """Mimics a connected_account row returned by wait_for_connection / refresh."""
+
+    def __init__(
+        self,
+        *,
+        account_id: str,
+        status: str = "ACTIVE",
+        redirect_url: str | None = None,
+        access_token: str = "fake-e2e-access-token-not-real-do-not-use-32chars",
+    ) -> None:
+        self.id = account_id
+        self.status = status
+        self.redirect_url = redirect_url
+        self.state = _AccountState(access_token=access_token)
+
+
+class _AccountState:
+    def __init__(self, *, access_token: str) -> None:
+        self.val = _AccountStateVal(access_token=access_token)
+
+
+class _AccountStateVal:
+    def __init__(self, *, access_token: str) -> None:
+        self.access_token = access_token
+
+
+class _AuthConfig:
+    """Mimics one auth_config row returned by client.auth_configs.list().items."""
+
+    def __init__(self, *, config_id: str, toolkit_slug: str) -> None:
+        self.id = config_id
+        self.toolkit = _Toolkit(slug=toolkit_slug)
+
+
+class _Toolkit:
+    def __init__(self, *, slug: str) -> None:
+        self.slug = slug
+
+
+class _AuthConfigsListResult:
+    def __init__(self, items: list[_AuthConfig]) -> None:
+        self.items = items
+
+
+# ---------------------------------------------------------------------------
+# Sub-clients on the Composio top-level object
+# ---------------------------------------------------------------------------
+
+
+class _ConnectedAccounts(_StrictFakeMixin):
+    """Strict fake for client.connected_accounts.*"""
+
+    _component_name = "connected_accounts"
+
+    def initiate(
+        self,
+        *,
+        user_id: str,
+        auth_config_id: str,
+        callback_url: str,
+        allow_multiple: bool = True,
+        **_: Any,
+    ) -> _ConnectionRequest:
+        scenario = _get_scenario()
+        # Synthesize a deterministic account ID. Same toolkit on the same
+        # entity yields the same ID so the duplicate-connection scenario
+        # exercises the reconnect branch in composio_routes.py.
+        toolkit_id = auth_config_id.replace("auth-config-", "")
+        account_id = f"fake-acct-{toolkit_id}-{user_id}"
+
+        # The SDK's redirect_url normally points to Composio's hosted OAuth
+        # UI which then bounces to the third-party provider. In E2E we
+        # short-circuit straight back to OUR same-origin callback to avoid
+        # any real network requirement.
+        if scenario == "denied":
+            redirect = (
+                f"{callback_url}&error=access_denied"
+                if "?" in callback_url
+                else f"{callback_url}?error=access_denied"
+            )
+        else:
+            redirect = (
+                f"{callback_url}&connectedAccountId={account_id}"
+                if "?" in callback_url
+                else f"{callback_url}?connectedAccountId={account_id}"
+            )
+        logger.info(
+            "[fake-composio] initiate scenario=%s toolkit=%s redirect=%s",
+            scenario,
+            toolkit_id,
+            redirect,
+        )
+        return _ConnectionRequest(redirect_url=redirect, account_id=account_id)
+
+    def wait_for_connection(
+        self, *, id: str, timeout: float = 30.0, **_: Any
+    ) -> _ConnectedAccount:
+        return _ConnectedAccount(account_id=id, status="ACTIVE")
+
+    def get(self, *, nanoid: str, **_: Any) -> _ConnectedAccount:
+        return _ConnectedAccount(account_id=nanoid, status="ACTIVE")
+
+    def delete(self, account_id: str, /, **_: Any) -> dict[str, Any]:
+        logger.info("[fake-composio] delete account=%s", account_id)
+        return {"success": True, "id": account_id}
+
+    def refresh(
+        self,
+        *,
+        nanoid: str,
+        body_redirect_url: str | None = None,
+        **_: Any,
+    ) -> _ConnectedAccount:
+        return _ConnectedAccount(
+            account_id=nanoid,
+            status="ACTIVE",
+            redirect_url=body_redirect_url,
+        )
+
+
+class _AuthConfigs(_StrictFakeMixin):
+    """Strict fake for client.auth_configs.*"""
+
+    _component_name = "auth_configs"
+
+    def list(self, **_: Any) -> _AuthConfigsListResult:
+        # Return one auth config per toolkit we plan to test. The real
+        # SDK lets you have multiple, but one is enough for E2E.
+        return _AuthConfigsListResult(
+            items=[
+                _AuthConfig(
+                    config_id="auth-config-googledrive", toolkit_slug="googledrive"
+                ),
+                _AuthConfig(config_id="auth-config-gmail", toolkit_slug="gmail"),
+                _AuthConfig(
+                    config_id="auth-config-googlecalendar",
+                    toolkit_slug="googlecalendar",
+                ),
+            ]
+        )
+
+
+class _Tools(_StrictFakeMixin):
+    """Strict fake for client.tools.*"""
+
+    _component_name = "tools"
+
+    def execute(
+        self,
+        *,
+        slug: str,
+        connected_account_id: str,
+        user_id: str | None = None,
+        arguments: dict[str, Any] | None = None,
+        dangerously_skip_version_check: bool = True,
+        **_: Any,
+    ) -> dict[str, Any]:
+        scenario = _get_scenario()
+        args = arguments or {}
+        logger.info(
+            "[fake-composio] tools.execute slug=%s scenario=%s args=%s",
+            slug,
+            scenario,
+            list(args.keys()),
+        )
+
+        if scenario == "auth_expired":
+            # Match the error strings that composio_routes.py classifies
+            # as authentication failures (see lines ~720-728).
+            raise _AuthExpiredError(
+                "Token has been expired or revoked. (HTTP 401: invalid_grant)"
+            )
+
+        if slug == "GOOGLEDRIVE_LIST_FILES":
+            return _drive_list_files(args)
+        if slug == "GOOGLEDRIVE_DOWNLOAD_FILE":
+            return _drive_download_file(args)
+        if slug == "GOOGLEDRIVE_GET_FILE_METADATA":
+            return _drive_get_metadata(args)
+        if slug == "GOOGLEDRIVE_GET_CHANGES_START_PAGE_TOKEN":
+            return {"data": {"startPageToken": "fake-start-page-token-1"}}
+        if slug == "GOOGLEDRIVE_LIST_CHANGES":
+            return {
+                "data": {"changes": [], "newStartPageToken": "fake-start-page-token-1"}
+            }
+        if slug == "GOOGLEDRIVE_GET_ABOUT":
+            # Used by ComposioService.get_connected_account_email for
+            # googledrive. Returning a fake email lets the connector get a
+            # nice display name; failure is non-fatal.
+            return {"data": {"user": {"emailAddress": "e2e-fake@surfsense.example"}}}
+        if slug == "GMAIL_GET_PROFILE":
+            return {"data": {"emailAddress": "e2e-fake@surfsense.example"}}
+        if slug == "GMAIL_FETCH_EMAILS":
+            return _gmail_fetch_emails(args)
+        if slug == "GMAIL_FETCH_MESSAGE_BY_MESSAGE_ID":
+            return _gmail_fetch_message_by_message_id(args)
+        if slug == "GOOGLECALENDAR_EVENTS_LIST":
+            return _calendar_events_list(args)
+        if slug == "GOOGLECALENDAR_GET_CALENDAR":
+            return {
+                "data": {
+                    "id": "primary",
+                    "summary": "e2e-fake@surfsense.example",
+                    "primary": True,
+                    "timeZone": "UTC",
+                }
+            }
+        if slug == "GOOGLECALENDAR_CALENDARS_LIST":
+            return {
+                "data": {
+                    "items": [
+                        {
+                            "id": "primary",
+                            "summary": "e2e-fake@surfsense.example",
+                            "primary": True,
+                        }
+                    ]
+                }
+            }
+
+        # No silent passthrough: a slug we have not modelled is a test bug.
+        raise NotImplementedError(
+            f"E2E Composio fake has no handler for tool slug {slug!r}. "
+            f"Add it to surfsense_backend/tests/e2e/fakes/composio_module.py "
+            f"in `_Tools.execute` if production code needs it."
+        )
+
+
+# ---------------------------------------------------------------------------
+# Drive tool handlers
+# ---------------------------------------------------------------------------
+
+
+def _drive_list_files(args: dict[str, Any]) -> dict[str, Any]:
+    """Mimic GOOGLEDRIVE_LIST_FILES.
+
+    The real SDK accepts a Drive-style `q=` query like
+    `'<folder_id>' in parents and trashed = false ...`. We parse out the
+    folder id and serve the matching fixture list.
+    """
+    q = args.get("q", "")
+    if "in owners" in q:
+        return {
+            "data": {
+                "files": [
+                    {
+                        "id": "fake-file-owner-probe",
+                        "name": "owner-probe",
+                        "owners": [
+                            {
+                                "me": True,
+                                "emailAddress": "e2e-fake@surfsense.example",
+                            }
+                        ],
+                    }
+                ],
+                "nextPageToken": None,
+            }
+        }
+
+    folder_id = "root"
+    if "in parents" in q:
+        # q looks like:  '<folder_id>' in parents and trashed = false ...
+        try:
+            folder_id = q.split("'")[1]
+        except IndexError:
+            folder_id = "root"
+
+    files = _filter_drive_files_for_query(q, _DRIVE_FIXTURE.get(folder_id, []))
+    return {
+        "data": {
+            "files": files,
+            "nextPageToken": None,
+        }
+    }
+
+
+def _extract_quoted_value(q: str, anchor: str) -> str | None:
+    anchor_idx = q.find(anchor)
+    if anchor_idx == -1:
+        return None
+
+    after_anchor = q[anchor_idx + len(anchor) :]
+    first_quote_idx = after_anchor.find("'")
+    if first_quote_idx == -1:
+        return None
+
+    after_first_quote = after_anchor[first_quote_idx + 1 :]
+    second_quote_idx = after_first_quote.find("'")
+    if second_quote_idx == -1:
+        return None
+
+    return after_first_quote[:second_quote_idx]
+
+
+def _filter_drive_files_for_query(
+    q: str, files: list[dict[str, Any]]
+) -> list[dict[str, Any]]:
+    filtered = list(files)
+
+    if "trashed = false" in q:
+        filtered = [entry for entry in filtered if entry.get("trashed") is not True]
+
+    excluded_mime_type = _extract_quoted_value(q, "mimeType !=")
+    if excluded_mime_type:
+        filtered = [
+            entry for entry in filtered if entry.get("mimeType") != excluded_mime_type
+        ]
+
+    included_mime_type = _extract_quoted_value(q, "mimeType =")
+    if included_mime_type:
+        filtered = [
+            entry for entry in filtered if entry.get("mimeType") == included_mime_type
+        ]
+
+    return filtered
+
+
+def _drive_download_file(args: dict[str, Any]) -> dict[str, Any]:
+    """Mimic GOOGLEDRIVE_DOWNLOAD_FILE.
+
+    The real SDK writes the downloaded bytes to a local file and returns
+    the path. composio_service.py then reads bytes from that path. We
+    reproduce that behaviour by writing fixture content into a tmp
+    directory and returning the path.
+    """
+    file_id = args.get("file_id", "")
+    contents = _resolve_file_bytes(_DRIVE_FIXTURE, file_id, _FIXTURES_DIR)
+    if contents is None:
+        # Unknown file id is a test bug, fail loudly.
+        raise NotImplementedError(
+            f"E2E Composio fake has no canned content for file_id={file_id!r}. "
+            f"Add it under '_file_contents' in "
+            f"surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json."
+        )
+
+    _DRIVE_DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
+    metadata = _drive_get_metadata({"file_id": file_id})["data"]
+    file_name = metadata.get("name") or f"{file_id}.txt"
+    out_path = _DRIVE_DOWNLOAD_DIR / file_name
+    out_path.write_bytes(contents)
+    return {
+        "data": {
+            "file_path": str(out_path),
+            "file_name": file_name,
+            "size": len(contents),
+        }
+    }
+
+
+def _drive_get_metadata(args: dict[str, Any]) -> dict[str, Any]:
+    """Mimic GOOGLEDRIVE_GET_FILE_METADATA."""
+    file_id = args.get("file_id", "")
+    # Search every folder fixture for the file
+    for items in _DRIVE_FIXTURE.values():
+        if not isinstance(items, list):
+            continue
+        for entry in items:
+            if entry.get("id") == file_id:
+                return {"data": entry}
+    raise NotImplementedError(
+        f"E2E fake: no metadata fixture for file_id={file_id!r}. "
+        f"Add it to drive_files.json."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Gmail tool handlers
+# ---------------------------------------------------------------------------
+
+
+def _gmail_fetch_emails(args: dict[str, Any]) -> dict[str, Any]:
+    """Mimic GMAIL_FETCH_EMAILS.
+
+    The production indexer uses this as a list page, then calls
+    GMAIL_FETCH_MESSAGE_BY_MESSAGE_ID for the full body of each id.
+    """
+    del args
+    messages = list(_GMAIL_FIXTURE.get("messages", []))
+    return {
+        "data": {
+            "messages": messages,
+            "nextPageToken": None,
+            "resultSizeEstimate": len(messages),
+        }
+    }
+
+
+def _gmail_fetch_message_by_message_id(args: dict[str, Any]) -> dict[str, Any]:
+    """Mimic GMAIL_FETCH_MESSAGE_BY_MESSAGE_ID."""
+    message_id = args.get("message_id", "")
+    details = _GMAIL_FIXTURE.get("details", {})
+    detail = details.get(message_id)
+    if detail is None:
+        raise NotImplementedError(
+            f"E2E Composio fake has no Gmail detail fixture for "
+            f"message_id={message_id!r}. Add it under 'details' in "
+            f"surfsense_backend/tests/e2e/fakes/fixtures/gmail_messages.json."
+        )
+    return {"data": detail}
+
+
+# ---------------------------------------------------------------------------
+# Google Calendar tool handlers
+# ---------------------------------------------------------------------------
+
+
+def _calendar_events_list(args: dict[str, Any]) -> dict[str, Any]:
+    """Mimic GOOGLECALENDAR_EVENTS_LIST for live calendar chat tools."""
+    max_results = int(args.get("max_results", 250) or 250)
+    items = list(_CALENDAR_FIXTURE.get("items", []))[:max_results]
+    return {
+        "data": {
+            "items": items,
+            "summary": "e2e-fake@surfsense.example",
+            "timeZone": "UTC",
+        }
+    }
+
+
+# ---------------------------------------------------------------------------
+# Errors
+# ---------------------------------------------------------------------------
+
+
+class _AuthExpiredError(Exception):
+    """Raised by the fake when scenario=auth_expired.
+
+    composio_service.execute_tool catches every exception and surfaces
+    str(error) inside the result dict; composio_routes.py then classifies
+    "expired or revoked" / "401" tokens and sets connector.config.auth_expired.
+    """
+
+
+# ---------------------------------------------------------------------------
+# Top-level Composio class — the only public symbol production imports
+# ---------------------------------------------------------------------------
+
+
+class Composio(_StrictFakeMixin):
+    """Drop-in replacement for `composio.Composio`.
+
+    Production calls:  `Composio(api_key=..., file_download_dir=...)`
+    """
+
+    _component_name = "Composio"
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        file_download_dir: str | None = None,
+        **_: Any,
+    ) -> None:
+        self.api_key = api_key
+        self.file_download_dir = file_download_dir
+        self.connected_accounts = _ConnectedAccounts()
+        self.tools = _Tools()
+        self.auth_configs = _AuthConfigs()
+        logger.info(
+            "[fake-composio] Composio() constructed (E2E mode, no real network)"
+        )
+
+
+# Public re-exports so `from composio import Composio` resolves correctly
+# when this module is registered as sys.modules["composio"].
+__all__ = ["Composio"]
diff --git a/surfsense_backend/tests/e2e/fakes/confluence_indexer.py b/surfsense_backend/tests/e2e/fakes/confluence_indexer.py
new file mode 100644
index 0000000000..3629b18b76
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/confluence_indexer.py
@@ -0,0 +1,58 @@
+"""Strict Confluence indexer fake for Playwright E2E."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "confluence_pages.json"
+
+
+def _load_fixture() -> dict[str, Any]:
+    with _FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_FIXTURE = _load_fixture()
+
+
+class _FakeConfluenceHistoryConnector:
+    def __init__(self, *, session: Any, connector_id: int, **kwargs: Any):
+        del session, kwargs
+        self.connector_id = connector_id
+
+    async def get_pages_by_date_range(
+        self,
+        *,
+        start_date: str,
+        end_date: str,
+        include_comments: bool = False,
+    ) -> tuple[list[dict[str, Any]], None]:
+        if not start_date or not end_date:
+            raise ValueError(
+                "Confluence indexer fake expected start_date and end_date."
+            )
+        del include_comments
+        return _FIXTURE["pages"], None
+
+    async def close(self) -> None:
+        return None
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            "E2E Confluence indexer fake missing surface: "
+            f"ConfluenceHistoryConnector.{name!r}. "
+            "Add it to surfsense_backend/tests/e2e/fakes/confluence_indexer.py."
+        )
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch only the Confluence indexer's bound connector class."""
+    p = patch(
+        "app.tasks.connector_indexers.confluence_indexer.ConfluenceHistoryConnector",
+        _FakeConfluenceHistoryConnector,
+    )
+    p.start()
+    active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/confluence_oauth.py b/surfsense_backend/tests/e2e/fakes/confluence_oauth.py
new file mode 100644
index 0000000000..53da99e47a
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/confluence_oauth.py
@@ -0,0 +1,146 @@
+"""Strict Confluence OAuth fakes for Playwright E2E."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "confluence_pages.json"
+
+_TOKEN_URL = "https://auth.atlassian.com/oauth/token"
+_RESOURCES_URL = "https://api.atlassian.com/oauth/token/accessible-resources"
+_ACCESS_TOKEN = "fake-confluence-access-token"
+_REFRESH_TOKEN = "fake-confluence-refresh-token"
+_OAUTH_CODE = "fake-confluence-oauth-code"
+
+
+def _load_fixture() -> dict[str, Any]:
+    with _FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_FIXTURE = _load_fixture()
+
+
+class _StrictFakeMixin:
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E Confluence OAuth fake missing surface: "
+            f"{self._component_name}.{name!r}. "
+            "Add it to surfsense_backend/tests/e2e/fakes/confluence_oauth.py."
+        )
+
+
+class _FakeResponse(_StrictFakeMixin):
+    _component_name = "httpx.Response"
+
+    def __init__(self, payload: Any, status_code: int = 200):
+        self._payload = payload
+        self.status_code = status_code
+        self.text = json.dumps(payload, sort_keys=True)
+
+    def json(self) -> Any:
+        return self._payload
+
+
+class _FakeHttpxAsyncClient(_StrictFakeMixin):
+    _component_name = "httpx.AsyncClient"
+
+    def __init__(self, *args: Any, **kwargs: Any):
+        del args, kwargs
+
+    async def __aenter__(self) -> _FakeHttpxAsyncClient:
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+    async def post(self, url: str, **kwargs: Any) -> _FakeResponse:
+        if url != _TOKEN_URL:
+            raise NotImplementedError(f"Unexpected Confluence OAuth POST url={url!r}")
+
+        data = kwargs.get("json") or {}
+        headers = kwargs.get("headers") or {}
+        if headers.get("Content-Type") != "application/json":
+            raise ValueError("Confluence OAuth token exchange expected JSON headers.")
+
+        grant_type = data.get("grant_type")
+        if grant_type == "authorization_code":
+            if data.get("code") != _OAUTH_CODE:
+                raise ValueError(
+                    f"Unexpected fake Confluence OAuth code: {data.get('code')!r}"
+                )
+            if not data.get("client_id") or not data.get("client_secret"):
+                raise ValueError(
+                    "Confluence OAuth token exchange missing client creds."
+                )
+            if "/api/v1/auth/confluence/connector/callback" not in str(
+                data.get("redirect_uri", "")
+            ):
+                raise ValueError(
+                    "Confluence OAuth token exchange got unexpected redirect_uri: "
+                    f"{data.get('redirect_uri')!r}"
+                )
+        elif grant_type == "refresh_token":
+            if data.get("refresh_token") != _REFRESH_TOKEN:
+                raise ValueError(
+                    "Unexpected fake Confluence refresh token: "
+                    f"{data.get('refresh_token')!r}"
+                )
+        else:
+            raise ValueError(f"Unexpected fake Confluence grant_type: {grant_type!r}")
+
+        return _FakeResponse(
+            {
+                "access_token": _ACCESS_TOKEN,
+                "refresh_token": _REFRESH_TOKEN,
+                "expires_in": 3600,
+                "scope": "read:confluence-user read:space:confluence read:page:confluence",
+                "token_type": "Bearer",
+            }
+        )
+
+    async def get(self, url: str, **kwargs: Any) -> _FakeResponse:
+        if url != _RESOURCES_URL:
+            raise NotImplementedError(f"Unexpected Confluence OAuth GET url={url!r}")
+
+        headers = kwargs.get("headers") or {}
+        auth = headers.get("Authorization")
+        if auth != f"Bearer {_ACCESS_TOKEN}":
+            raise ValueError(f"Unexpected Confluence resources Authorization: {auth!r}")
+
+        site = _FIXTURE["site"]
+        return _FakeResponse(
+            [
+                {
+                    "id": site["cloud_id"],
+                    "name": site["name"],
+                    "url": site["url"],
+                    "scopes": [
+                        "read:confluence-user",
+                        "read:space:confluence",
+                        "read:page:confluence",
+                    ],
+                }
+            ]
+        )
+
+
+class _FakeHttpxModule(_StrictFakeMixin):
+    _component_name = "httpx"
+
+    AsyncClient = _FakeHttpxAsyncClient
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch only Confluence route-local HTTP OAuth calls."""
+    p = patch(
+        "app.routes.confluence_add_connector_route.httpx",
+        _FakeHttpxModule(),
+    )
+    p.start()
+    active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/docling_service.py b/surfsense_backend/tests/e2e/fakes/docling_service.py
new file mode 100644
index 0000000000..9dd09d6038
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/docling_service.py
@@ -0,0 +1,141 @@
+"""Stub DoclingService.process_document for E2E.
+
+The real ``DoclingService.process_document`` calls
+``DocumentConverter.convert(file_path)`` which lazily downloads the
+``docling-project/docling-layout-heron`` model from Hugging Face Hub.
+The hermetic E2E container sets ``HF_HUB_OFFLINE=1`` (see
+``docker/docker-compose.e2e.yml``), so that download fails with
+``LocalEntryNotFoundError`` and the indexing Celery task retries until
+the Playwright test hits its ~4-minute step timeout. In CI that is the
+difference between the suite finishing and the 30-minute job timeout
+killing the run before any report can upload.
+
+Stubbing ``process_document`` bypasses ``DocumentConverter.convert()``
+entirely. ``DoclingService.__init__`` is intentionally left untouched
+because constructing ``DocumentConverter(...)`` is cheap and offline —
+it is only ``.convert()`` that triggers the offline-model download.
+
+Every canary PDF under ``tests/e2e/fakes/fixtures/binary/`` is produced
+by ``generate_canary_pdfs.py`` and embeds its canary token as plain
+``(text) Tj`` PDF text operators. Extracting those operators gives us
+the canary string back, which is what the Playwright assertions look
+for in the resulting Document row.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Matches the `(escaped text) Tj` text-show operator emitted by
+# generate_canary_pdfs.py. Inside the parens, the escape rules are:
+#   \\  -> backslash
+#   \(  -> literal (
+#   \)  -> literal )
+# The character class [^\\()] consumes any non-escape byte; \\. consumes
+# an escape sequence. Sufficient for our synthetic fixtures.
+_TJ_PATTERN = re.compile(rb"\(((?:[^\\()]|\\.)*)\)\s*Tj")
+
+
+def _extract_text_from_synthetic_pdf(file_path: str) -> str:
+    """Pull every ``(text) Tj`` payload out of a fixture PDF in order.
+
+    Returns an empty string if the file cannot be read. We do not try to
+    handle arbitrary PDFs because the fake is only ever invoked against
+    fixtures we generate ourselves.
+    """
+    try:
+        data = Path(file_path).read_bytes()
+    except OSError as exc:
+        logger.warning("[fake-docling] could not read %s: %s", file_path, exc)
+        return ""
+
+    lines: list[str] = []
+    for match in _TJ_PATTERN.finditer(data):
+        raw = match.group(1)
+        # Order-sensitive unescape via sentinel: protect `\\` first so
+        # the subsequent `\(` / `\)` passes do not corrupt it.
+        text = (
+            raw.replace(rb"\\", b"\x00")
+            .replace(rb"\(", b"(")
+            .replace(rb"\)", b")")
+            .replace(b"\x00", b"\\")
+        )
+        try:
+            lines.append(text.decode("utf-8"))
+        except UnicodeDecodeError:
+            lines.append(text.decode("latin-1"))
+    return "\n".join(lines)
+
+
+async def fake_process_document(
+    self,
+    file_path: str,
+    filename: str | None = None,
+) -> dict[str, Any]:
+    """Drop-in replacement for ``DoclingService.process_document``.
+
+    Returns the same dict shape as the production method so callers
+    (``app/etl_pipeline/parsers/docling.py``) can keep reading
+    ``result["content"]`` without changes.
+    """
+    extracted = _extract_text_from_synthetic_pdf(file_path)
+    display_name = filename or Path(file_path).name
+
+    if extracted:
+        content = f"# {display_name}\n\n{extracted}\n"
+    else:
+        # Empty fallback so the indexing pipeline does not error out on
+        # an unexpected payload. A failing canary assertion is a much
+        # clearer failure mode than a hard parser exception.
+        content = (
+            f"# {display_name}\n\n(empty docling fake — no text-show operators found)\n"
+        )
+
+    logger.info(
+        "[fake-docling] returning %d chars for %s",
+        len(content),
+        display_name,
+    )
+
+    return {
+        "content": content,
+        "full_text": content,
+        "service_used": "docling-fake",
+        "status": "success",
+        "processing_notes": "e2e fake DoclingService — no real PDF parsing",
+    }
+
+
+def install(patches: list[Any]) -> None:
+    """Patch ``DoclingService.process_document`` at the class level.
+
+    Patching the class method (rather than each call site) is correct
+    here because every consumer goes through
+    ``create_docling_service()`` → ``DoclingService()`` → instance method
+    dispatch, so the descriptor protocol picks up our replacement. There
+    is exactly one such consumer today
+    (``app/etl_pipeline/parsers/docling.py``), but patching the class is
+    future-proof.
+
+    Fails loud rather than warning, because a silent passthrough means
+    real Docling + ``HF_HUB_OFFLINE=1`` = 4 minutes of CI hang per test.
+    """
+    from unittest.mock import patch as _patch
+
+    target = "app.services.docling_service.DoclingService.process_document"
+    try:
+        p = _patch(target, fake_process_document)
+        p.start()
+        patches.append(p)
+        logger.info("[fake-docling] patched %s", target)
+    except (ModuleNotFoundError, AttributeError) as exc:
+        raise RuntimeError(
+            f"Could not patch Docling binding {target!r}: {exc!s}. "
+            f"Update surfsense_backend/tests/e2e/fakes/docling_service.py "
+            f"to point at the new binding site."
+        ) from exc
diff --git a/surfsense_backend/tests/e2e/fakes/dropbox_api.py b/surfsense_backend/tests/e2e/fakes/dropbox_api.py
new file mode 100644
index 0000000000..3fb4f63ffe
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/dropbox_api.py
@@ -0,0 +1,182 @@
+"""Strict Dropbox HTTP/API fakes for Playwright E2E.
+
+This module patches the Dropbox OAuth route and indexer consumer-site
+bindings. It keeps the production add/callback/indexing flow intact while
+serving deterministic Dropbox-shaped token, profile, metadata, and file
+content responses.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+import httpx
+
+from .binary_loader import _resolve_file_bytes
+
+_FIXTURES_DIR = Path(__file__).parent / "fixtures"
+_DROPBOX_FIXTURE_PATH = _FIXTURES_DIR / "dropbox_files.json"
+
+
+def _load_dropbox_fixture() -> dict[str, Any]:
+    with _DROPBOX_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_DROPBOX_FIXTURE = _load_dropbox_fixture()
+
+
+class _StrictFakeMixin:
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E Dropbox fake missing surface: "
+            f"{self._component_name}.{name!r}. Add it to "
+            f"surfsense_backend/tests/e2e/fakes/dropbox_api.py."
+        )
+
+
+class _FakeDropboxClient(_StrictFakeMixin):
+    _component_name = "DropboxClient"
+
+    def __init__(self, session: Any, connector_id: int):
+        self._session = session
+        self._connector_id = connector_id
+
+    async def _get_valid_token(self) -> str:
+        return "fake-dropbox-access-token"
+
+    async def list_folder(
+        self, path: str = ""
+    ) -> tuple[list[dict[str, Any]], str | None]:
+        items = _DROPBOX_FIXTURE.get(path)
+        if not isinstance(items, list):
+            return [], f"E2E Dropbox fake has no folder for path={path!r}."
+        return [dict(item) for item in items], None
+
+    async def get_latest_cursor(self, path: str = "") -> tuple[str | None, str | None]:
+        if path not in _DROPBOX_FIXTURE:
+            return None, f"E2E Dropbox fake has no cursor for path={path!r}."
+        return f"fake-dropbox-cursor:{path or 'root'}", None
+
+    async def get_changes(
+        self, cursor: str
+    ) -> tuple[list[dict[str, Any]], str | None, str | None]:
+        return [], cursor, None
+
+    async def get_metadata(self, path: str) -> tuple[dict[str, Any] | None, str | None]:
+        metadata = _dropbox_get_metadata(path)
+        if metadata is None:
+            return None, f"E2E Dropbox fake has no metadata for path={path!r}."
+        return metadata, None
+
+    async def download_file(self, path: str) -> tuple[bytes | None, str | None]:
+        content = _resolve_file_bytes(_DROPBOX_FIXTURE, path, _FIXTURES_DIR)
+        if content is None:
+            return None, f"E2E Dropbox fake has no content for path={path!r}."
+        return content, None
+
+    async def download_file_to_disk(self, path: str, dest_path: str) -> str | None:
+        content = _resolve_file_bytes(_DROPBOX_FIXTURE, path, _FIXTURES_DIR)
+        if content is None:
+            return f"E2E Dropbox fake has no content for path={path!r}."
+        with open(dest_path, "wb") as f:
+            f.write(content)
+        return None
+
+    async def get_current_account(self) -> tuple[dict[str, Any] | None, str | None]:
+        return _dropbox_current_account(), None
+
+
+class _FakeAsyncClient(_StrictFakeMixin):
+    _component_name = "httpx.AsyncClient"
+
+    def __init__(self, *args: Any, **kwargs: Any):
+        del args, kwargs
+
+    async def __aenter__(self) -> _FakeAsyncClient:
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+    async def post(self, url: str, *args: Any, **kwargs: Any) -> httpx.Response:
+        del args, kwargs
+        if url == "https://api.dropboxapi.com/oauth2/token":
+            return _json_response(
+                "POST",
+                url,
+                {
+                    "access_token": "fake-dropbox-access-token",
+                    "refresh_token": "fake-dropbox-refresh-token",
+                    "token_type": "bearer",
+                    "expires_in": 3600,
+                    "account_id": "dbid:fake-dropbox-account",
+                },
+            )
+        if url == "https://api.dropboxapi.com/2/users/get_current_account":
+            return _json_response("POST", url, _dropbox_current_account())
+        raise NotImplementedError(f"E2E Dropbox fake unexpected POST URL: {url!r}")
+
+    async def request(
+        self, method: str, url: str, *args: Any, **kwargs: Any
+    ) -> httpx.Response:
+        del args, kwargs
+        raise NotImplementedError(
+            f"E2E Dropbox fake unexpected request: {method!r} {url!r}"
+        )
+
+
+class _FakeHttpxModule(_StrictFakeMixin):
+    _component_name = "httpx"
+
+    AsyncClient = _FakeAsyncClient
+
+
+def _json_response(
+    method: str, url: str, payload: dict[str, Any], status_code: int = 200
+) -> httpx.Response:
+    return httpx.Response(
+        status_code=status_code,
+        json=payload,
+        request=httpx.Request(method, url),
+    )
+
+
+def _dropbox_current_account() -> dict[str, Any]:
+    return {
+        "email": "dropbox-e2e@surfsense.example",
+        "name": {"display_name": "SurfSense Dropbox E2E"},
+        "account_id": "dbid:fake-dropbox-account",
+    }
+
+
+def _dropbox_get_metadata(path: str | None) -> dict[str, Any] | None:
+    for items in _DROPBOX_FIXTURE.values():
+        if not isinstance(items, list):
+            continue
+        for entry in items:
+            if entry.get("path_lower") == path or entry.get("id") == path:
+                return dict(entry)
+    return None
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch production Dropbox bindings to use strict Dropbox fakes."""
+    targets = [
+        ("app.routes.dropbox_add_connector_route.httpx", _FakeHttpxModule()),
+        ("app.routes.dropbox_add_connector_route.DropboxClient", _FakeDropboxClient),
+        (
+            "app.tasks.connector_indexers.dropbox_indexer.DropboxClient",
+            _FakeDropboxClient,
+        ),
+        ("app.connectors.dropbox.client.httpx", _FakeHttpxModule()),
+    ]
+    for target, replacement in targets:
+        p = patch(target, replacement)
+        p.start()
+        active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/embeddings.py b/surfsense_backend/tests/e2e/fakes/embeddings.py
new file mode 100644
index 0000000000..ab9e24df9f
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/embeddings.py
@@ -0,0 +1,80 @@
+"""Deterministic embedding fakes for E2E.
+
+Mirrors the existing `patched_embed_texts` fixture in
+`surfsense_backend/tests/integration/conftest.py`:
+
+    MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
+
+The dimension matches whatever `config.embedding_model_instance.dimension`
+returns in the running process so the fakes are vector-compatible with
+the documents.embedding pgvector column.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import numpy as np
+
+from app.config import config
+
+logger = logging.getLogger(__name__)
+
+
+def _embedding_dim() -> int:
+    """Resolve the dimension once, lazily, so tests work for any embedding model."""
+    return int(config.embedding_model_instance.dimension)
+
+
+def fake_embed_text(text: str) -> np.ndarray:
+    """Deterministic single-text embedding."""
+    return np.full(shape=(_embedding_dim(),), fill_value=0.1, dtype=np.float32)
+
+
+def fake_embed_texts(texts: list[str]) -> list[np.ndarray]:
+    """Deterministic batch embedding. One vector per input text."""
+    if not texts:
+        return []
+    dim = _embedding_dim()
+    return [np.full(shape=(dim,), fill_value=0.1, dtype=np.float32) for _ in texts]
+
+
+def install(patches: list[Any]) -> None:
+    """Install embedding patches at every binding site we know about.
+
+    Caller passes a `patches` list that the entrypoint will track in
+    order to start them (and, in principle, stop them on shutdown — we
+    intentionally never stop because the process exits when the test
+    server stops).
+    """
+    from unittest.mock import patch as _patch
+
+    targets = [
+        # Source binding (where the real implementation lives)
+        ("app.utils.document_converters.embed_text", fake_embed_text),
+        ("app.utils.document_converters.embed_texts", fake_embed_texts),
+        # Consumers that did `from app.utils.document_converters import embed_text/texts`
+        ("app.indexing_pipeline.document_embedder.embed_text", fake_embed_text),
+        ("app.indexing_pipeline.document_embedder.embed_texts", fake_embed_texts),
+        # Pipeline service binding (the actual call site for indexing.index)
+        (
+            "app.indexing_pipeline.indexing_pipeline_service.embed_texts",
+            fake_embed_texts,
+        ),
+    ]
+    for target, replacement in targets:
+        try:
+            p = _patch(target, replacement)
+            p.start()
+            patches.append(p)
+            logger.info("[fake-embeddings] patched %s", target)
+        except (ModuleNotFoundError, AttributeError) as exc:
+            # If a future refactor moves a binding, fail loudly — silent
+            # passthrough to a real embedding model would be expensive
+            # and non-deterministic.
+            raise RuntimeError(
+                f"Could not patch embedding binding {target!r}: {exc!s}. "
+                f"Update surfsense_backend/tests/e2e/fakes/embeddings.py "
+                f"to point at the new binding site."
+            ) from exc
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf b/surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf
new file mode 100644
index 0000000000..e4ed9f0011
Binary files /dev/null and b/surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf differ
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/drive-canary.pdf b/surfsense_backend/tests/e2e/fakes/fixtures/binary/drive-canary.pdf
new file mode 100644
index 0000000000..d171c63897
Binary files /dev/null and b/surfsense_backend/tests/e2e/fakes/fixtures/binary/drive-canary.pdf differ
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf b/surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf
new file mode 100644
index 0000000000..14ebc02d8b
Binary files /dev/null and b/surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf differ
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py b/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py
new file mode 100644
index 0000000000..9fb3360589
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py
@@ -0,0 +1,84 @@
+"""Generate deterministic one-page PDFs for connector E2E fixtures."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+PDF_FIXTURES = {
+    "drive-canary.pdf": (
+        "Native Drive PDF Canary",
+        "This one-page text-layer PDF proves native Drive Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_DRIVE_PDF_001",
+    ),
+    "onedrive-canary.pdf": (
+        "OneDrive PDF Canary",
+        "This one-page text-layer PDF proves OneDrive Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_PDF_001",
+    ),
+    "dropbox-canary.pdf": (
+        "Dropbox PDF Canary",
+        "This one-page text-layer PDF proves Dropbox Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_PDF_001",
+    ),
+    "composio-drive-canary.pdf": (
+        "Composio Drive PDF Canary",
+        "This one-page text-layer PDF proves Composio Drive Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_COMPOSIO_DRIVE_PDF_001",
+    ),
+}
+
+
+def _escape_pdf_text(text: str) -> str:
+    return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
+
+
+def _build_pdf(lines: tuple[str, str, str]) -> bytes:
+    text_ops = ["BT", "/F1 12 Tf", "72 760 Td"]
+    for index, line in enumerate(lines):
+        if index:
+            text_ops.append("0 -18 Td")
+        text_ops.append(f"({_escape_pdf_text(line)}) Tj")
+    text_ops.append("ET")
+    stream = "\n".join(text_ops).encode("ascii")
+
+    objects = [
+        b"<< /Type /Catalog /Pages 2 0 R >>",
+        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
+        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+        b"/Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>",
+        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
+        b"<< /Length "
+        + str(len(stream)).encode("ascii")
+        + b" >>\nstream\n"
+        + stream
+        + b"\nendstream",
+    ]
+
+    pdf = bytearray(b"%PDF-1.4\n")
+    offsets = [0]
+    for obj_number, obj in enumerate(objects, start=1):
+        offsets.append(len(pdf))
+        pdf.extend(f"{obj_number} 0 obj\n".encode("ascii"))
+        pdf.extend(obj)
+        pdf.extend(b"\nendobj\n")
+
+    xref_offset = len(pdf)
+    pdf.extend(f"xref\n0 {len(objects) + 1}\n".encode("ascii"))
+    pdf.extend(b"0000000000 65535 f \n")
+    for offset in offsets[1:]:
+        pdf.extend(f"{offset:010d} 00000 n \n".encode("ascii"))
+    pdf.extend(
+        f"trailer\n<< /Size {len(objects) + 1} /Root 1 0 R >>\n"
+        f"startxref\n{xref_offset}\n%%EOF\n".encode("ascii")
+    )
+    return bytes(pdf)
+
+
+def main() -> None:
+    out_dir = Path(__file__).parent
+    for filename, lines in PDF_FIXTURES.items():
+        (out_dir / filename).write_bytes(_build_pdf(lines))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf b/surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf
new file mode 100644
index 0000000000..ed0636b31c
Binary files /dev/null and b/surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf differ
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/calendar_events.json b/surfsense_backend/tests/e2e/fakes/fixtures/calendar_events.json
new file mode 100644
index 0000000000..d71c6d3b74
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/calendar_events.json
@@ -0,0 +1,48 @@
+{
+  "items": [
+    {
+      "id": "fake-calendar-event-canary-001",
+      "status": "confirmed",
+      "summary": "E2E Canary Calendar Event",
+      "description": "This Calendar event proves the Composio Calendar live tool fetched event details. SURFSENSE_E2E_CANARY_TOKEN_CALENDAR_001",
+      "location": "SurfSense E2E Room",
+      "htmlLink": "https://calendar.google.com/calendar/event?eid=fake-calendar-event-canary-001",
+      "start": {
+        "dateTime": "2026-05-12T10:00:00Z",
+        "timeZone": "UTC"
+      },
+      "end": {
+        "dateTime": "2026-05-12T10:30:00Z",
+        "timeZone": "UTC"
+      },
+      "attendees": [
+        {
+          "email": "e2e-fake@surfsense.example",
+          "responseStatus": "accepted"
+        }
+      ]
+    },
+    {
+      "id": "fake-calendar-event-planning-001",
+      "status": "confirmed",
+      "summary": "E2E Planning Sync",
+      "description": "Non-canary planning sync used to prove list responses can contain multiple events.",
+      "location": "SurfSense Planning Room",
+      "htmlLink": "https://calendar.google.com/calendar/event?eid=fake-calendar-event-planning-001",
+      "start": {
+        "dateTime": "2026-05-13T15:00:00Z",
+        "timeZone": "UTC"
+      },
+      "end": {
+        "dateTime": "2026-05-13T15:45:00Z",
+        "timeZone": "UTC"
+      },
+      "attendees": [
+        {
+          "email": "planner@surfsense.example",
+          "responseStatus": "needsAction"
+        }
+      ]
+    }
+  ]
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/clickup_tasks.json b/surfsense_backend/tests/e2e/fakes/fixtures/clickup_tasks.json
new file mode 100644
index 0000000000..1c49b41b6c
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/clickup_tasks.json
@@ -0,0 +1,13 @@
+{
+  "tasks": [
+    {
+      "id": "fake-clickup-task-canary-001",
+      "name": "E2E Canary ClickUp Task",
+      "list_name": "SurfSense E2E ClickUp List",
+      "workspace_id": "fake-clickup-workspace-001",
+      "workspace_name": "SurfSense E2E ClickUp Workspace",
+      "status": "open",
+      "description": "Canary task body containing SURFSENSE_E2E_CANARY_TOKEN_CLICKUP_001"
+    }
+  ]
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/confluence_pages.json b/surfsense_backend/tests/e2e/fakes/fixtures/confluence_pages.json
new file mode 100644
index 0000000000..1a9a533c2d
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/confluence_pages.json
@@ -0,0 +1,33 @@
+{
+  "site": {
+    "cloud_id": "fake-confluence-cloud-001",
+    "name": "SurfSense E2E Confluence",
+    "url": "https://surfsense-e2e-confluence.atlassian.net"
+  },
+  "pages": [
+    {
+      "id": "fake-confluence-page-canary-001",
+      "title": "E2E Canary Confluence Page",
+      "spaceId": "fake-confluence-space-001",
+      "body": {
+        "storage": {
+          "value": "<h1>E2E Canary Confluence Page</h1><p>This page proves Confluence OAuth indexing works end-to-end. SURFSENSE_E2E_CANARY_TOKEN_CONFLUENCE_001</p>"
+        }
+      },
+      "comments": [
+        {
+          "id": "fake-confluence-comment-canary-001",
+          "body": {
+            "storage": {
+              "value": "<p>Confluence comment content is included in indexed markdown.</p>"
+            }
+          },
+          "version": {
+            "authorId": "fake-confluence-user-001",
+            "createdAt": "2026-05-08T00:00:00.000Z"
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json b/surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json
new file mode 100644
index 0000000000..cb1bd80415
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json
@@ -0,0 +1,99 @@
+{
+	"root": [
+		{
+			"id": "fake-folder-projects",
+			"name": "Projects",
+			"mimeType": "application/vnd.google-apps.folder",
+			"modifiedTime": "2025-01-15T10:00:00.000Z",
+			"createdTime": "2024-12-01T08:00:00.000Z"
+		},
+		{
+			"id": "fake-folder-archive",
+			"name": "Archive",
+			"mimeType": "application/vnd.google-apps.folder",
+			"modifiedTime": "2024-11-20T14:30:00.000Z",
+			"createdTime": "2024-09-10T12:00:00.000Z"
+		},
+		{
+			"id": "fake-file-readme",
+			"name": "README.md",
+			"mimeType": "text/markdown",
+			"modifiedTime": "2025-02-01T09:15:00.000Z",
+			"createdTime": "2025-01-30T16:20:00.000Z"
+		},
+		{
+			"id": "fake-file-canary",
+			"name": "e2e-canary.txt",
+			"mimeType": "text/plain",
+			"modifiedTime": "2025-02-10T11:00:00.000Z",
+			"createdTime": "2025-02-10T11:00:00.000Z"
+		},
+		{
+			"id": "fake-file-pdf-native",
+			"name": "e2e-canary.pdf",
+			"mimeType": "application/pdf",
+			"size": 735,
+			"modifiedTime": "2025-02-10T11:05:00.000Z",
+			"createdTime": "2025-02-10T11:05:00.000Z"
+		},
+		{
+			"id": "fake-file-pdf-composio",
+			"name": "e2e-composio-canary.pdf",
+			"mimeType": "application/pdf",
+			"size": 748,
+			"modifiedTime": "2025-02-10T11:10:00.000Z",
+			"createdTime": "2025-02-10T11:10:00.000Z"
+		},
+		{
+			"id": "fake-file-budget",
+			"name": "Q1-Budget.csv",
+			"mimeType": "text/csv",
+			"modifiedTime": "2025-01-25T13:45:00.000Z",
+			"createdTime": "2025-01-25T13:45:00.000Z"
+		},
+		{
+			"id": "fake-shortcut-canary",
+			"name": "Shortcut to Canary",
+			"mimeType": "application/vnd.google-apps.shortcut",
+			"modifiedTime": "2025-02-10T12:00:00.000Z",
+			"createdTime": "2025-02-10T12:00:00.000Z"
+		},
+		{
+			"id": "fake-file-trashed",
+			"name": "trashed-e2e-note.txt",
+			"mimeType": "text/plain",
+			"modifiedTime": "2025-02-11T09:00:00.000Z",
+			"createdTime": "2025-02-11T09:00:00.000Z",
+			"trashed": true
+		}
+	],
+	"fake-folder-projects": [
+		{
+			"id": "fake-file-roadmap",
+			"name": "2025-Roadmap.md",
+			"mimeType": "text/markdown",
+			"modifiedTime": "2025-02-12T08:30:00.000Z",
+			"createdTime": "2025-01-05T10:00:00.000Z"
+		}
+	],
+	"fake-folder-archive": [
+		{
+			"id": "fake-file-old-notes",
+			"name": "old-meeting-notes.txt",
+			"mimeType": "text/plain",
+			"modifiedTime": "2024-08-15T15:00:00.000Z",
+			"createdTime": "2024-08-15T15:00:00.000Z"
+		}
+	],
+	"_file_contents": {
+		"fake-file-readme": "# E2E Fake Drive\n\nThis README is served by the strict Composio fake. SURFSENSE_E2E_README_MARKER",
+		"fake-file-canary": "Canary token for E2E tests: SURFSENSE_E2E_CANARY_TOKEN_DRIVE_001\nThis file's content is asserted by indexing.spec.ts to confirm the indexing pipeline ran end-to-end.",
+		"fake-file-budget": "Quarter,Revenue,Expenses\nQ1,100000,75000\nSURFSENSE_E2E_BUDGET_MARKER,2025,test",
+		"fake-file-roadmap": "# 2025 Roadmap\n\n- E2E Drive indexing\n- Composio Gmail/Calendar SURFSENSE_E2E_ROADMAP_MARKER",
+		"fake-file-old-notes": "Old meeting notes archived in 2024. SURFSENSE_E2E_ARCHIVE_MARKER"
+	},
+	"_file_binary_paths": {
+		"fake-file-pdf-native": "binary/drive-canary.pdf",
+		"fake-file-pdf-composio": "binary/composio-drive-canary.pdf"
+	}
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/dropbox_files.json b/surfsense_backend/tests/e2e/fakes/fixtures/dropbox_files.json
new file mode 100644
index 0000000000..911dee6580
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/dropbox_files.json
@@ -0,0 +1,34 @@
+{
+	"": [
+		{
+			".tag": "file",
+			"id": "id:fake-dropbox-canary",
+			"name": "e2e-dropbox-canary.txt",
+			"path_lower": "/e2e-dropbox-canary.txt",
+			"path_display": "/e2e-dropbox-canary.txt",
+			"size": 152,
+			"is_downloadable": true,
+			"server_modified": "2026-05-08T00:00:00Z",
+			"client_modified": "2026-05-08T00:00:00Z",
+			"content_hash": "fake-dropbox-hash-001"
+		},
+		{
+			".tag": "file",
+			"id": "id:fake-dropbox-pdf-canary",
+			"name": "e2e-dropbox-canary.pdf",
+			"path_lower": "/e2e-dropbox-canary.pdf",
+			"path_display": "/e2e-dropbox-canary.pdf",
+			"size": 727,
+			"is_downloadable": true,
+			"server_modified": "2026-05-08T00:05:00Z",
+			"client_modified": "2026-05-08T00:05:00Z",
+			"content_hash": "fake-dropbox-hash-pdf-001"
+		}
+	],
+	"_file_contents": {
+		"/e2e-dropbox-canary.txt": "Canary token for Dropbox E2E tests: SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_001\nThis file proves the Dropbox indexing pipeline ran end-to-end."
+	},
+	"_file_binary_paths": {
+		"/e2e-dropbox-canary.pdf": "binary/dropbox-canary.pdf"
+	}
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/gmail_messages.json b/surfsense_backend/tests/e2e/fakes/fixtures/gmail_messages.json
new file mode 100644
index 0000000000..74bbd9965a
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/gmail_messages.json
@@ -0,0 +1,34 @@
+{
+  "messages": [
+    {
+      "id": "fake-msg-canary-001",
+      "threadId": "fake-thread-canary-001",
+      "snippet": "E2E canary email body is loaded through the Gmail detail endpoint."
+    },
+    {
+      "id": "fake-msg-planning-001",
+      "threadId": "fake-thread-planning-001",
+      "snippet": "Planning email used to keep Gmail fixtures representative."
+    }
+  ],
+  "details": {
+    "fake-msg-canary-001": {
+      "id": "fake-msg-canary-001",
+      "threadId": "fake-thread-canary-001",
+      "subject": "E2E Canary Email",
+      "from": "sender@surfsense.example",
+      "to": "e2e-fake@surfsense.example",
+      "date": "Mon, 10 Feb 2025 11:00:00 +0000",
+      "messageText": "This Gmail message body proves the Composio Gmail live tool fetched message details. SURFSENSE_E2E_CANARY_TOKEN_GMAIL_001"
+    },
+    "fake-msg-planning-001": {
+      "id": "fake-msg-planning-001",
+      "threadId": "fake-thread-planning-001",
+      "subject": "E2E Planning Notes",
+      "from": "planner@surfsense.example",
+      "to": "e2e-fake@surfsense.example",
+      "date": "Tue, 11 Feb 2025 09:30:00 +0000",
+      "messageText": "Planning notes for a non-canary Gmail fixture."
+    }
+  }
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/jira_issues.json b/surfsense_backend/tests/e2e/fakes/fixtures/jira_issues.json
new file mode 100644
index 0000000000..3acea1993d
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/jira_issues.json
@@ -0,0 +1,15 @@
+{
+  "site": {
+    "cloud_id": "fake-jira-cloud-001",
+    "name": "SurfSense E2E Atlassian",
+    "url": "https://surfsense-e2e.atlassian.net"
+  },
+  "issues": [
+    {
+      "id": "fake-jira-issue-canary-001",
+      "key": "E2E-101",
+      "summary": "E2E Canary Jira Issue",
+      "description": "This Jira issue proves live MCP tool calls work end-to-end. SURFSENSE_E2E_CANARY_TOKEN_JIRA_001"
+    }
+  ]
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/linear_issues.json b/surfsense_backend/tests/e2e/fakes/fixtures/linear_issues.json
new file mode 100644
index 0000000000..b50a05996a
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/linear_issues.json
@@ -0,0 +1,17 @@
+{
+  "organization": {
+    "name": "SurfSense E2E Linear Org",
+    "url_key": "surfsense-e2e"
+  },
+  "issues": [
+    {
+      "id": "fake-linear-issue-canary-001",
+      "identifier": "E2E-101",
+      "title": "E2E Canary Linear Issue",
+      "description": "This Linear issue proves the live MCP tool fetched issue details. SURFSENSE_E2E_CANARY_TOKEN_LINEAR_001",
+      "state": "Todo",
+      "assignee": "E2E Owner",
+      "team": "SurfSense E2E"
+    }
+  ]
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/notion_pages.json b/surfsense_backend/tests/e2e/fakes/fixtures/notion_pages.json
new file mode 100644
index 0000000000..c0056e6d85
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/notion_pages.json
@@ -0,0 +1,64 @@
+{
+  "pages": [
+    {
+      "object": "page",
+      "id": "fake-notion-page-canary-001",
+      "created_time": "2026-05-07T00:00:00.000Z",
+      "last_edited_time": "2026-05-07T00:00:00.000Z",
+      "url": "https://notion.so/fake-notion-page-canary-001",
+      "properties": {
+        "Name": {
+          "id": "title",
+          "type": "title",
+          "title": [
+            {
+              "type": "text",
+              "plain_text": "E2E Canary Notion Page",
+              "text": {
+                "content": "E2E Canary Notion Page"
+              }
+            }
+          ]
+        }
+      }
+    }
+  ],
+  "blocks": {
+    "fake-notion-page-canary-001": [
+      {
+        "object": "block",
+        "id": "fake-notion-block-heading-001",
+        "type": "heading_2",
+        "has_children": false,
+        "heading_2": {
+          "rich_text": [
+            {
+              "type": "text",
+              "plain_text": "E2E Notion Canary",
+              "text": {
+                "content": "E2E Notion Canary"
+              }
+            }
+          ]
+        }
+      },
+      {
+        "object": "block",
+        "id": "fake-notion-block-body-001",
+        "type": "paragraph",
+        "has_children": false,
+        "paragraph": {
+          "rich_text": [
+            {
+              "type": "text",
+              "plain_text": "This Notion page proves the indexed connector fetched Notion blocks through OAuth credentials. SURFSENSE_E2E_CANARY_TOKEN_NOTION_001",
+              "text": {
+                "content": "This Notion page proves the indexed connector fetched Notion blocks through OAuth credentials. SURFSENSE_E2E_CANARY_TOKEN_NOTION_001"
+              }
+            }
+          ]
+        }
+      }
+    ]
+  }
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/onedrive_files.json b/surfsense_backend/tests/e2e/fakes/fixtures/onedrive_files.json
new file mode 100644
index 0000000000..ae8aa68c45
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/onedrive_files.json
@@ -0,0 +1,82 @@
+{
+	"root": [
+		{
+			"id": "fake-onedrive-folder-projects",
+			"name": "Projects",
+			"size": 0,
+			"folder": {
+				"childCount": 1
+			},
+			"parentReference": {
+				"id": "root",
+				"path": "/drive/root:"
+			},
+			"createdDateTime": "2025-02-01T08:00:00Z",
+			"lastModifiedDateTime": "2025-02-10T10:00:00Z",
+			"webUrl": "https://onedrive.example/fake/projects"
+		},
+		{
+			"id": "fake-onedrive-canary",
+			"name": "e2e-onedrive-canary.txt",
+			"size": 164,
+			"file": {
+				"mimeType": "text/plain",
+				"hashes": {
+					"quickXorHash": "fake-onedrive-canary-qxh"
+				}
+			},
+			"parentReference": {
+				"id": "root",
+				"path": "/drive/root:"
+			},
+			"createdDateTime": "2025-02-10T11:00:00Z",
+			"lastModifiedDateTime": "2025-02-10T11:00:00Z",
+			"webUrl": "https://onedrive.example/fake/e2e-onedrive-canary.txt"
+		},
+		{
+			"id": "fake-onedrive-pdf-canary",
+			"name": "e2e-onedrive-canary.pdf",
+			"size": 730,
+			"file": {
+				"mimeType": "application/pdf",
+				"hashes": {
+					"quickXorHash": "fake-onedrive-pdf-canary-qxh"
+				}
+			},
+			"parentReference": {
+				"id": "root",
+				"path": "/drive/root:"
+			},
+			"createdDateTime": "2025-02-10T11:05:00Z",
+			"lastModifiedDateTime": "2025-02-10T11:05:00Z",
+			"webUrl": "https://onedrive.example/fake/e2e-onedrive-canary.pdf"
+		}
+	],
+	"fake-onedrive-folder-projects": [
+		{
+			"id": "fake-onedrive-roadmap",
+			"name": "OneDrive Roadmap.md",
+			"size": 82,
+			"file": {
+				"mimeType": "text/markdown",
+				"hashes": {
+					"quickXorHash": "fake-onedrive-roadmap-qxh"
+				}
+			},
+			"parentReference": {
+				"id": "fake-onedrive-folder-projects",
+				"path": "/drive/root:/Projects"
+			},
+			"createdDateTime": "2025-02-11T08:00:00Z",
+			"lastModifiedDateTime": "2025-02-12T08:30:00Z",
+			"webUrl": "https://onedrive.example/fake/projects/onedrive-roadmap.md"
+		}
+	],
+	"_file_contents": {
+		"fake-onedrive-canary": "Canary token for OneDrive E2E tests: SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_001\nThis file proves the OneDrive indexing pipeline ran end-to-end.",
+		"fake-onedrive-roadmap": "# OneDrive Roadmap\n\n- E2E OneDrive indexing\n- Deterministic Graph fake"
+	},
+	"_file_binary_paths": {
+		"fake-onedrive-pdf-canary": "binary/onedrive-canary.pdf"
+	}
+}
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/slack_messages.json b/surfsense_backend/tests/e2e/fakes/fixtures/slack_messages.json
new file mode 100644
index 0000000000..6b5f652dd3
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/slack_messages.json
@@ -0,0 +1,18 @@
+{
+  "team": {
+    "id": "T_FAKE_SLACK_TEAM",
+    "name": "SurfSense E2E Slack Workspace"
+  },
+  "channel": {
+    "id": "C_FAKE_SLACK_CANARY",
+    "name": "slack-e2e-canary",
+    "purpose": "SurfSense E2E Slack canary channel"
+  },
+  "messages": [
+    {
+      "ts": "1715000000.000100",
+      "user": "U_FAKE_SLACK_USER",
+      "text": "This Slack message proves the live MCP tool fetched channel content. SURFSENSE_E2E_CANARY_TOKEN_SLACK_001"
+    }
+  ]
+}
diff --git a/surfsense_backend/tests/e2e/fakes/jira_module.py b/surfsense_backend/tests/e2e/fakes/jira_module.py
new file mode 100644
index 0000000000..d675312182
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/jira_module.py
@@ -0,0 +1,133 @@
+"""Strict Jira MCP OAuth/tool fakes for Playwright E2E."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+
+from tests.e2e.fakes import mcp_oauth_runtime, mcp_runtime
+
+_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "jira_issues.json"
+
+_AUTHORIZATION_URL = "https://mcp.atlassian.com/v1/authorize"
+_REGISTRATION_URL = "https://cf.mcp.atlassian.com/v1/register"
+_TOKEN_URL = "https://cf.mcp.atlassian.com/v1/token"
+_MCP_URL = "https://mcp.atlassian.com/v1/mcp"
+
+_CLIENT_ID = "fake-jira-mcp-client-id"
+_CLIENT_SECRET = "fake-jira-mcp-client-secret"
+_ACCESS_TOKEN = "fake-jira-mcp-access-token"
+_REFRESH_TOKEN = "fake-jira-mcp-refresh-token"
+_OAUTH_CODE = "fake-jira-oauth-code"
+
+
+def _load_fixture() -> dict[str, Any]:
+    with _FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_FIXTURE = _load_fixture()
+
+
+def _issue_text(issue: dict[str, Any]) -> str:
+    return (
+        f"{issue['key']} {issue['summary']}\n"
+        f"id: {issue['id']}\n"
+        f"description: {issue['description']}"
+    )
+
+
+async def _list_tools() -> SimpleNamespace:
+    return SimpleNamespace(
+        tools=[
+            SimpleNamespace(
+                name="getAccessibleAtlassianResources",
+                description="Get Jira sites accessible to the authenticated Atlassian user.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {},
+                    "required": [],
+                },
+            ),
+            SimpleNamespace(
+                name="searchJiraIssuesUsingJql",
+                description="Search Jira issues using a Jira Query Language expression.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "jql": {
+                            "type": "string",
+                            "description": "JQL query used to search Jira issues.",
+                        },
+                        "maxResults": {
+                            "type": "integer",
+                            "description": "Maximum number of matching issues to return.",
+                        },
+                    },
+                    "required": ["jql"],
+                },
+            ),
+        ]
+    )
+
+
+async def _call_tool(
+    tool_name: str, arguments: dict[str, Any] | None = None
+) -> SimpleNamespace:
+    arguments = arguments or {}
+    site = _FIXTURE["site"]
+    issue = _FIXTURE["issues"][0]
+
+    if tool_name == "getAccessibleAtlassianResources":
+        if arguments:
+            raise ValueError(
+                f"Unexpected Jira getAccessibleAtlassianResources args: {arguments!r}"
+            )
+        text = f"{site['name']}\ncloud_id: {site['cloud_id']}\nurl: {site['url']}"
+        return SimpleNamespace(content=[SimpleNamespace(text=text)])
+
+    if tool_name == "searchJiraIssuesUsingJql":
+        jql = str(arguments.get("jql", ""))
+        if (
+            issue["summary"].lower() not in jql.lower()
+            and issue["key"].lower() not in jql.lower()
+        ):
+            raise ValueError(f"Unexpected Jira JQL query: {jql!r}")
+        text = _issue_text(issue)
+        return SimpleNamespace(content=[SimpleNamespace(text=text)])
+
+    raise NotImplementedError(f"Unexpected Jira MCP tool call: {tool_name!r}")
+
+
+def install(active_patches: list[Any]) -> None:
+    """Register Jira MCP OAuth/tool handlers with the shared dispatchers."""
+    del active_patches
+    mcp_oauth_runtime.register_service(
+        mcp_url=_MCP_URL,
+        discovery_metadata={
+            "issuer": "https://mcp.atlassian.com",
+            "authorization_endpoint": _AUTHORIZATION_URL,
+            "token_endpoint": _TOKEN_URL,
+            "registration_endpoint": _REGISTRATION_URL,
+            "code_challenge_methods_supported": ["S256"],
+            "grant_types_supported": ["authorization_code", "refresh_token"],
+            "response_types_supported": ["code"],
+        },
+        client_id=_CLIENT_ID,
+        client_secret=_CLIENT_SECRET,
+        token_endpoint=_TOKEN_URL,
+        registration_endpoint=_REGISTRATION_URL,
+        oauth_code=_OAUTH_CODE,
+        access_token=_ACCESS_TOKEN,
+        refresh_token=_REFRESH_TOKEN,
+        scope="read:jira-work read:me write:jira-work",
+        redirect_uri_substring="/api/v1/auth/mcp/jira/connector/callback",
+    )
+    mcp_runtime.register(
+        url=_MCP_URL,
+        expected_bearer=_ACCESS_TOKEN,
+        list_tools=_list_tools,
+        call_tool=_call_tool,
+    )
diff --git a/surfsense_backend/tests/e2e/fakes/linear_module.py b/surfsense_backend/tests/e2e/fakes/linear_module.py
new file mode 100644
index 0000000000..b6eb482429
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/linear_module.py
@@ -0,0 +1,305 @@
+"""Strict Linear MCP OAuth/tool fakes for Playwright E2E."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+
+from tests.e2e.fakes import mcp_oauth_runtime, mcp_runtime
+
+_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "linear_issues.json"
+
+_DISCOVERY_URL = "https://mcp.linear.app/.well-known/oauth-authorization-server"
+_AUTHORIZATION_URL = "https://mcp.linear.app/authorize"
+_REGISTRATION_URL = "https://mcp.linear.app/register"
+_TOKEN_URL = "https://mcp.linear.app/token"
+_MCP_URL = "https://mcp.linear.app/mcp"
+
+_CLIENT_ID = "fake-linear-mcp-client-id"
+_CLIENT_SECRET = "fake-linear-mcp-client-secret"
+_ACCESS_TOKEN = "fake-linear-mcp-access-token"
+_REFRESH_TOKEN = "fake-linear-mcp-refresh-token"
+_OAUTH_CODE = "fake-linear-oauth-code"
+
+
+def _load_fixture() -> dict[str, Any]:
+    with _FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_FIXTURE = _load_fixture()
+
+
+class _StrictFakeMixin:
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E Linear fake missing surface: {self._component_name}.{name!r}. "
+            "Add it to surfsense_backend/tests/e2e/fakes/linear_module.py."
+        )
+
+
+async def _fake_discover_oauth_metadata(
+    mcp_url: str,
+    *,
+    origin_override: str | None = None,
+    timeout: float = 15.0,
+) -> dict[str, Any]:
+    del origin_override, timeout
+    if mcp_url != _MCP_URL:
+        raise NotImplementedError(f"Unexpected Linear MCP discovery url={mcp_url!r}")
+    return {
+        "issuer": "https://mcp.linear.app",
+        "authorization_endpoint": _AUTHORIZATION_URL,
+        "token_endpoint": _TOKEN_URL,
+        "registration_endpoint": _REGISTRATION_URL,
+        "code_challenge_methods_supported": ["S256"],
+        "grant_types_supported": ["authorization_code", "refresh_token"],
+        "response_types_supported": ["code"],
+    }
+
+
+async def _fake_register_client(
+    registration_endpoint: str,
+    redirect_uri: str,
+    *,
+    client_name: str = "SurfSense",
+    timeout: float = 15.0,
+) -> dict[str, Any]:
+    del timeout
+    if registration_endpoint != _REGISTRATION_URL:
+        raise NotImplementedError(
+            f"Unexpected Linear DCR endpoint={registration_endpoint!r}"
+        )
+    if client_name != "SurfSense":
+        raise ValueError(f"Unexpected Linear DCR client_name={client_name!r}")
+    if "/api/v1/auth/mcp/linear/connector/callback" not in redirect_uri:
+        raise ValueError(f"Unexpected Linear redirect_uri={redirect_uri!r}")
+    return {
+        "client_id": _CLIENT_ID,
+        "client_secret": _CLIENT_SECRET,
+        "client_id_issued_at": 1_776_621_600,
+        "token_endpoint_auth_method": "client_secret_basic",
+    }
+
+
+async def _fake_exchange_code_for_tokens(
+    token_endpoint: str,
+    code: str,
+    redirect_uri: str,
+    client_id: str,
+    client_secret: str,
+    code_verifier: str,
+    *,
+    timeout: float = 30.0,
+) -> dict[str, Any]:
+    del timeout
+    if token_endpoint != _TOKEN_URL:
+        raise NotImplementedError(
+            f"Unexpected Linear token_endpoint={token_endpoint!r}"
+        )
+    if code != _OAUTH_CODE:
+        raise ValueError(f"Unexpected fake Linear OAuth code: {code!r}")
+    if "/api/v1/auth/mcp/linear/connector/callback" not in redirect_uri:
+        raise ValueError(f"Unexpected Linear redirect_uri={redirect_uri!r}")
+    if client_id != _CLIENT_ID or client_secret != _CLIENT_SECRET:
+        raise ValueError(
+            "Unexpected Linear client credentials: "
+            f"client_id={client_id!r} client_secret={client_secret!r}"
+        )
+    if not code_verifier:
+        raise ValueError("Linear token exchange missing code_verifier.")
+    return {
+        "access_token": _ACCESS_TOKEN,
+        "refresh_token": _REFRESH_TOKEN,
+        "expires_in": 3600,
+        "scope": "read write",
+        "token_type": "Bearer",
+    }
+
+
+async def _fake_refresh_access_token(
+    token_endpoint: str,
+    refresh_token: str,
+    client_id: str,
+    client_secret: str,
+    *,
+    timeout: float = 30.0,
+) -> dict[str, Any]:
+    del timeout
+    if token_endpoint != _TOKEN_URL:
+        raise NotImplementedError(
+            f"Unexpected Linear token_endpoint={token_endpoint!r}"
+        )
+    if refresh_token != _REFRESH_TOKEN:
+        raise ValueError(f"Unexpected fake Linear refresh token: {refresh_token!r}")
+    if client_id != _CLIENT_ID or client_secret != _CLIENT_SECRET:
+        raise ValueError(
+            "Unexpected Linear refresh client credentials: "
+            f"client_id={client_id!r} client_secret={client_secret!r}"
+        )
+    return {
+        "access_token": _ACCESS_TOKEN,
+        "refresh_token": _REFRESH_TOKEN,
+        "expires_in": 3600,
+        "scope": "read write",
+        "token_type": "Bearer",
+    }
+
+
+class _FakeStreamableHttpClient(_StrictFakeMixin):
+    _component_name = "streamablehttp_client"
+
+    def __init__(
+        self, url: str, *, headers: dict[str, str] | None = None, **kwargs: Any
+    ):
+        del kwargs
+        if url != _MCP_URL:
+            raise NotImplementedError(f"Unexpected Linear MCP url={url!r}")
+        auth = (headers or {}).get("Authorization")
+        if auth != f"Bearer {_ACCESS_TOKEN}":
+            raise ValueError(f"Unexpected Linear MCP Authorization header: {auth!r}")
+        self.url = url
+        self.headers = headers or {}
+
+    async def __aenter__(self) -> tuple[object, object, None]:
+        return object(), object(), None
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+
+class _FakeClientSession(_StrictFakeMixin):
+    _component_name = "ClientSession"
+
+    def __init__(self, read: object, write: object):
+        self.read = read
+        self.write = write
+
+    async def __aenter__(self) -> _FakeClientSession:
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+    async def initialize(self) -> None:
+        return None
+
+    async def list_tools(self) -> SimpleNamespace:
+        return SimpleNamespace(
+            tools=[
+                SimpleNamespace(
+                    name="list_issues",
+                    description="List Linear issues visible to the authenticated user.",
+                    inputSchema={
+                        "type": "object",
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "Text to search for in Linear issues.",
+                            },
+                            "limit": {
+                                "type": "integer",
+                                "description": "Maximum number of issues to return.",
+                            },
+                        },
+                        "required": [],
+                    },
+                ),
+                SimpleNamespace(
+                    name="get_issue",
+                    description="Get a Linear issue by id or identifier.",
+                    inputSchema={
+                        "type": "object",
+                        "properties": {
+                            "id": {
+                                "type": "string",
+                                "description": "Issue id or identifier.",
+                            }
+                        },
+                        "required": ["id"],
+                    },
+                ),
+            ]
+        )
+
+    async def call_tool(
+        self, tool_name: str, *, arguments: dict[str, Any] | None = None
+    ) -> SimpleNamespace:
+        arguments = arguments or {}
+        issue = _FIXTURE["issues"][0]
+
+        if tool_name == "list_issues":
+            query = str(arguments.get("query", ""))
+            if query and issue["title"].lower() not in query.lower():
+                raise ValueError(f"Unexpected Linear issue query: {query!r}")
+            text = (
+                f"{issue['identifier']} {issue['title']}\n"
+                f"id: {issue['id']}\n"
+                f"description: {issue['description']}"
+            )
+            return SimpleNamespace(content=[SimpleNamespace(text=text)])
+
+        if tool_name == "get_issue":
+            issue_id = arguments.get("id")
+            if issue_id not in {issue["id"], issue["identifier"]}:
+                raise ValueError(f"Unexpected Linear issue id: {issue_id!r}")
+            text = (
+                f"{issue['identifier']} {issue['title']}\n"
+                f"id: {issue['id']}\n"
+                f"description: {issue['description']}"
+            )
+            return SimpleNamespace(content=[SimpleNamespace(text=text)])
+
+        raise NotImplementedError(f"Unexpected Linear MCP tool call: {tool_name!r}")
+
+
+def _fake_streamablehttp_client(
+    url: str, *, headers: dict[str, str] | None = None, **kwargs: Any
+) -> _FakeStreamableHttpClient:
+    return _FakeStreamableHttpClient(url, headers=headers, **kwargs)
+
+
+async def _list_tools() -> SimpleNamespace:
+    return await _FakeClientSession(object(), object()).list_tools()
+
+
+async def _call_tool(tool_name: str, arguments: dict[str, Any]) -> SimpleNamespace:
+    return await _FakeClientSession(object(), object()).call_tool(
+        tool_name, arguments=arguments
+    )
+
+
+def install(active_patches: list[Any]) -> None:
+    """Register Linear MCP OAuth/tool handlers with the shared dispatchers."""
+    del active_patches
+    mcp_oauth_runtime.register_service(
+        mcp_url=_MCP_URL,
+        discovery_metadata={
+            "issuer": "https://mcp.linear.app",
+            "authorization_endpoint": _AUTHORIZATION_URL,
+            "token_endpoint": _TOKEN_URL,
+            "registration_endpoint": _REGISTRATION_URL,
+            "code_challenge_methods_supported": ["S256"],
+            "grant_types_supported": ["authorization_code", "refresh_token"],
+            "response_types_supported": ["code"],
+        },
+        client_id=_CLIENT_ID,
+        client_secret=_CLIENT_SECRET,
+        token_endpoint=_TOKEN_URL,
+        registration_endpoint=_REGISTRATION_URL,
+        oauth_code=_OAUTH_CODE,
+        access_token=_ACCESS_TOKEN,
+        refresh_token=_REFRESH_TOKEN,
+        scope="read write",
+        redirect_uri_substring="/api/v1/auth/mcp/linear/connector/callback",
+    )
+    mcp_runtime.register(
+        url=_MCP_URL,
+        expected_bearer=_ACCESS_TOKEN,
+        list_tools=_list_tools,
+        call_tool=_call_tool,
+    )
diff --git a/surfsense_backend/tests/e2e/fakes/llm.py b/surfsense_backend/tests/e2e/fakes/llm.py
new file mode 100644
index 0000000000..9d2370e2c5
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/llm.py
@@ -0,0 +1,48 @@
+"""Deterministic LLM fake for the E2E indexing pipeline.
+
+The production indexing pipeline summarizes documents with:
+
+    summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
+    summary_result = await summary_chain.ainvoke({"document": ...})
+    summary_content = summary_result.content
+
+The `llm` parameter is supplied per-document by
+`app.services.llm_service.get_user_long_context_llm`. We patch THAT
+function to return a langchain-native FakeListChatModel so the rest of
+the chain works unchanged. No real LLM provider package is touched.
+
+Run-backend / run-celery use unittest.mock.patch.start() to install
+this at every binding site (the source module + every consumer that
+did `from app.services.llm_service import get_user_long_context_llm`
+at module load time).
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from langchain_core.language_models.fake_chat_models import FakeListChatModel
+
+logger = logging.getLogger(__name__)
+
+
+def _make_fake_llm() -> FakeListChatModel:
+    """Build a fresh FakeListChatModel that returns a deterministic summary."""
+    # FakeListChatModel cycles through `responses` for each invocation. We
+    # supply a single deterministic string. The summary content is tagged
+    # with a marker that specs CAN assert on if they want, but the
+    # primary indexing assertion is on the file content (chunked + stored
+    # separately by the pipeline).
+    fake = FakeListChatModel(
+        responses=[
+            "E2E_FAKE_SUMMARY: Indexed by Playwright E2E run with deterministic LLM stub."
+        ]
+    )
+    return fake
+
+
+async def fake_get_user_long_context_llm(*args: Any, **kwargs: Any) -> Any:
+    """Drop-in replacement for app.services.llm_service.get_user_long_context_llm."""
+    logger.info("[fake-llm] returning FakeListChatModel for E2E indexing")
+    return _make_fake_llm()
diff --git a/surfsense_backend/tests/e2e/fakes/mcp_oauth_runtime.py b/surfsense_backend/tests/e2e/fakes/mcp_oauth_runtime.py
new file mode 100644
index 0000000000..5eda4b67e8
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/mcp_oauth_runtime.py
@@ -0,0 +1,217 @@
+"""Shared strict MCP OAuth fake dispatcher for E2E tests."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+from unittest.mock import patch
+
+
+@dataclass(frozen=True)
+class _OAuthHandler:
+    mcp_url: str
+    discovery_metadata: dict[str, Any]
+    client_id: str
+    client_secret: str
+    token_endpoint: str
+    registration_endpoint: str
+    oauth_code: str
+    access_token: str
+    refresh_token: str
+    scope: str
+    redirect_uri_substring: str
+    expected_origin_override: str | None = None
+
+
+_SERVICES_BY_MCP_URL: dict[str, _OAuthHandler] = {}
+_SERVICES_BY_REGISTRATION_URL: dict[str, _OAuthHandler] = {}
+_SERVICES_BY_TOKEN_URL: dict[str, _OAuthHandler] = {}
+
+
+def register_service(
+    *,
+    mcp_url: str,
+    discovery_metadata: dict[str, Any],
+    client_id: str,
+    client_secret: str,
+    token_endpoint: str,
+    registration_endpoint: str,
+    oauth_code: str,
+    access_token: str,
+    refresh_token: str,
+    scope: str,
+    redirect_uri_substring: str,
+    expected_origin_override: str | None = None,
+) -> None:
+    """Register deterministic MCP OAuth behavior for one service."""
+    handler = _OAuthHandler(
+        mcp_url=mcp_url,
+        discovery_metadata=discovery_metadata,
+        client_id=client_id,
+        client_secret=client_secret,
+        token_endpoint=token_endpoint,
+        registration_endpoint=registration_endpoint,
+        oauth_code=oauth_code,
+        access_token=access_token,
+        refresh_token=refresh_token,
+        scope=scope,
+        redirect_uri_substring=redirect_uri_substring,
+        expected_origin_override=expected_origin_override,
+    )
+    _register_unique(_SERVICES_BY_MCP_URL, mcp_url, handler, "MCP URL")
+    _register_unique(
+        _SERVICES_BY_REGISTRATION_URL,
+        registration_endpoint,
+        handler,
+        "registration endpoint",
+    )
+    _register_unique(_SERVICES_BY_TOKEN_URL, token_endpoint, handler, "token endpoint")
+
+
+def _register_unique(
+    registry: dict[str, _OAuthHandler],
+    key: str,
+    handler: _OAuthHandler,
+    label: str,
+) -> None:
+    existing = registry.get(key)
+    if existing is not None and existing != handler:
+        raise ValueError(f"MCP OAuth fake {label} already registered: {key!r}.")
+    registry[key] = handler
+
+
+async def _fake_discover_oauth_metadata(
+    mcp_url: str,
+    *,
+    origin_override: str | None = None,
+    timeout: float = 15.0,
+) -> dict[str, Any]:
+    del timeout
+    handler = _SERVICES_BY_MCP_URL.get(mcp_url)
+    if handler is None:
+        raise NotImplementedError(f"Unexpected MCP OAuth discovery url={mcp_url!r}")
+    if origin_override != handler.expected_origin_override:
+        raise ValueError(
+            f"Unexpected MCP OAuth origin_override for {mcp_url!r}: {origin_override!r}"
+        )
+    return dict(handler.discovery_metadata)
+
+
+async def _fake_register_client(
+    registration_endpoint: str,
+    redirect_uri: str,
+    *,
+    client_name: str = "SurfSense",
+    timeout: float = 15.0,
+) -> dict[str, Any]:
+    del timeout
+    handler = _SERVICES_BY_REGISTRATION_URL.get(registration_endpoint)
+    if handler is None:
+        raise NotImplementedError(
+            f"Unexpected MCP OAuth DCR endpoint={registration_endpoint!r}"
+        )
+    if client_name != "SurfSense":
+        raise ValueError(f"Unexpected MCP OAuth DCR client_name={client_name!r}")
+    if handler.redirect_uri_substring not in redirect_uri:
+        raise ValueError(
+            f"Unexpected MCP OAuth DCR redirect_uri={redirect_uri!r}; "
+            f"expected {handler.redirect_uri_substring!r}"
+        )
+    return {
+        "client_id": handler.client_id,
+        "client_secret": handler.client_secret,
+        "client_id_issued_at": 1_776_621_600,
+        "token_endpoint_auth_method": "client_secret_basic",
+    }
+
+
+async def _fake_exchange_code_for_tokens(
+    token_endpoint: str,
+    code: str,
+    redirect_uri: str,
+    client_id: str,
+    client_secret: str,
+    code_verifier: str,
+    *,
+    timeout: float = 30.0,
+) -> dict[str, Any]:
+    del timeout
+    handler = _SERVICES_BY_TOKEN_URL.get(token_endpoint)
+    if handler is None:
+        raise NotImplementedError(
+            f"Unexpected MCP OAuth token_endpoint={token_endpoint!r}"
+        )
+    if code != handler.oauth_code:
+        raise ValueError(f"Unexpected fake MCP OAuth code: {code!r}")
+    if handler.redirect_uri_substring not in redirect_uri:
+        raise ValueError(
+            f"Unexpected MCP OAuth token redirect_uri={redirect_uri!r}; "
+            f"expected {handler.redirect_uri_substring!r}"
+        )
+    if client_id != handler.client_id or client_secret != handler.client_secret:
+        raise ValueError(
+            "Unexpected MCP OAuth client credentials: "
+            f"client_id={client_id!r} client_secret={client_secret!r}"
+        )
+    if not code_verifier:
+        raise ValueError("MCP OAuth token exchange missing code_verifier.")
+    return {
+        "access_token": handler.access_token,
+        "refresh_token": handler.refresh_token,
+        "expires_in": 3600,
+        "scope": handler.scope,
+        "token_type": "Bearer",
+    }
+
+
+async def _fake_refresh_access_token(
+    token_endpoint: str,
+    refresh_token: str,
+    client_id: str,
+    client_secret: str,
+    *,
+    timeout: float = 30.0,
+) -> dict[str, Any]:
+    del timeout
+    handler = _SERVICES_BY_TOKEN_URL.get(token_endpoint)
+    if handler is None:
+        raise NotImplementedError(
+            f"Unexpected MCP OAuth refresh token_endpoint={token_endpoint!r}"
+        )
+    if refresh_token != handler.refresh_token:
+        raise ValueError(f"Unexpected fake MCP OAuth refresh token: {refresh_token!r}")
+    if client_id != handler.client_id or client_secret != handler.client_secret:
+        raise ValueError(
+            "Unexpected MCP OAuth refresh client credentials: "
+            f"client_id={client_id!r} client_secret={client_secret!r}"
+        )
+    return {
+        "access_token": handler.access_token,
+        "refresh_token": handler.refresh_token,
+        "expires_in": 3600,
+        "scope": handler.scope,
+        "token_type": "Bearer",
+    }
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch generic MCP OAuth helper boundaries exactly once."""
+    targets = [
+        (
+            "app.services.mcp_oauth.discovery.discover_oauth_metadata",
+            _fake_discover_oauth_metadata,
+        ),
+        ("app.services.mcp_oauth.discovery.register_client", _fake_register_client),
+        (
+            "app.services.mcp_oauth.discovery.exchange_code_for_tokens",
+            _fake_exchange_code_for_tokens,
+        ),
+        (
+            "app.services.mcp_oauth.discovery.refresh_access_token",
+            _fake_refresh_access_token,
+        ),
+    ]
+    for target, replacement in targets:
+        p = patch(target, replacement)
+        p.start()
+        active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/mcp_runtime.py b/surfsense_backend/tests/e2e/fakes/mcp_runtime.py
new file mode 100644
index 0000000000..e772bb63a8
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/mcp_runtime.py
@@ -0,0 +1,148 @@
+"""Shared strict MCP streamable-HTTP runtime fake for E2E tests."""
+
+from __future__ import annotations
+
+import inspect
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from types import SimpleNamespace
+from typing import Any
+from unittest.mock import patch
+
+ListToolsFn = Callable[[], Any | Awaitable[Any]]
+CallToolFn = Callable[[str, dict[str, Any]], Any | Awaitable[Any]]
+
+
+@dataclass(frozen=True)
+class _RuntimeHandler:
+    expected_bearer: str
+    list_tools: ListToolsFn
+    call_tool: CallToolFn
+
+
+_HANDLERS: dict[str, _RuntimeHandler] = {}
+
+
+class _StrictFakeMixin:
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E MCP runtime fake missing surface: {self._component_name}.{name!r}. "
+            "Add it to surfsense_backend/tests/e2e/fakes/mcp_runtime.py."
+        )
+
+
+class _FakeEndpoint(_StrictFakeMixin):
+    _component_name = "streamablehttp_endpoint"
+
+    def __init__(self, url: str, handler: _RuntimeHandler):
+        self.url = url
+        self.handler = handler
+
+
+class _FakeStreamableHttpClient(_StrictFakeMixin):
+    _component_name = "streamablehttp_client"
+
+    def __init__(
+        self, url: str, *, headers: dict[str, str] | None = None, **kwargs: Any
+    ):
+        del kwargs
+        handler = _HANDLERS.get(url)
+        if handler is None:
+            raise NotImplementedError(f"Unexpected MCP streamable-http url={url!r}")
+
+        auth = (headers or {}).get("Authorization")
+        expected = f"Bearer {handler.expected_bearer}"
+        if auth != expected:
+            raise ValueError(
+                f"Unexpected MCP Authorization header for {url!r}: {auth!r}"
+            )
+
+        self.url = url
+        self.headers = headers or {}
+        self.handler = handler
+
+    async def __aenter__(self) -> tuple[_FakeEndpoint, _FakeEndpoint, None]:
+        return (
+            _FakeEndpoint(self.url, self.handler),
+            _FakeEndpoint(self.url, self.handler),
+            None,
+        )
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+
+class _FakeClientSession(_StrictFakeMixin):
+    _component_name = "ClientSession"
+
+    def __init__(self, read: _FakeEndpoint, write: _FakeEndpoint):
+        if read.handler is not write.handler:
+            raise ValueError("MCP fake received mismatched read/write endpoints.")
+        self.read = read
+        self.write = write
+        self.handler = read.handler
+
+    async def __aenter__(self) -> _FakeClientSession:
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+    async def initialize(self) -> None:
+        return None
+
+    async def list_tools(self) -> SimpleNamespace:
+        result = self.handler.list_tools()
+        if inspect.isawaitable(result):
+            result = await result
+        return result
+
+    async def call_tool(
+        self, tool_name: str, *, arguments: dict[str, Any] | None = None
+    ) -> SimpleNamespace:
+        result = self.handler.call_tool(tool_name, arguments or {})
+        if inspect.isawaitable(result):
+            result = await result
+        return result
+
+
+def _fake_streamablehttp_client(
+    url: str, *, headers: dict[str, str] | None = None, **kwargs: Any
+) -> _FakeStreamableHttpClient:
+    return _FakeStreamableHttpClient(url, headers=headers, **kwargs)
+
+
+def register(
+    *,
+    url: str,
+    expected_bearer: str,
+    list_tools: ListToolsFn,
+    call_tool: CallToolFn,
+) -> None:
+    """Register a fake streamable-HTTP MCP server by canonical MCP URL."""
+    existing = _HANDLERS.get(url)
+    handler = _RuntimeHandler(
+        expected_bearer=expected_bearer,
+        list_tools=list_tools,
+        call_tool=call_tool,
+    )
+    if existing is not None and existing != handler:
+        raise ValueError(f"MCP runtime fake handler already registered for {url!r}.")
+    _HANDLERS[url] = handler
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch production MCP streamable-HTTP boundaries exactly once."""
+    targets = [
+        (
+            "app.agents.new_chat.tools.mcp_tool.streamablehttp_client",
+            _fake_streamablehttp_client,
+        ),
+        ("app.agents.new_chat.tools.mcp_tool.ClientSession", _FakeClientSession),
+    ]
+    for target, replacement in targets:
+        p = patch(target, replacement)
+        p.start()
+        active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/native_google.py b/surfsense_backend/tests/e2e/fakes/native_google.py
new file mode 100644
index 0000000000..73c8cc7385
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/native_google.py
@@ -0,0 +1,444 @@
+"""Strict native Google SDK fakes for Playwright E2E.
+
+This module patches the production Google OAuth and Drive SDK bindings used by
+the native Google connector happy paths. It deliberately does not replace the
+whole Google package; unmodelled service methods fail loudly.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+from datetime import UTC, datetime, timedelta
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
+
+from google.oauth2.credentials import Credentials
+
+from .binary_loader import _resolve_file_bytes
+
+_FIXTURES_DIR = Path(__file__).parent / "fixtures"
+_DRIVE_FIXTURE_PATH = _FIXTURES_DIR / "drive_files.json"
+_GMAIL_FIXTURE_PATH = _FIXTURES_DIR / "gmail_messages.json"
+_CALENDAR_FIXTURE_PATH = _FIXTURES_DIR / "calendar_events.json"
+
+
+def _load_drive_fixture() -> dict[str, Any]:
+    with _DRIVE_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_DRIVE_FIXTURE = _load_drive_fixture()
+
+
+def _load_gmail_fixture() -> dict[str, Any]:
+    with _GMAIL_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_GMAIL_FIXTURE = _load_gmail_fixture()
+
+
+def _load_calendar_fixture() -> dict[str, Any]:
+    with _CALENDAR_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_CALENDAR_FIXTURE = _load_calendar_fixture()
+
+
+class _StrictFakeMixin:
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E native Google fake missing surface: "
+            f"{self._component_name}.{name!r}. Add it to "
+            f"surfsense_backend/tests/e2e/fakes/native_google.py."
+        )
+
+
+class _FakeFlow(_StrictFakeMixin):
+    _component_name = "Flow"
+
+    def __init__(
+        self, *, redirect_uri: str | None = None, scopes: list[str] | None = None
+    ):
+        self.redirect_uri = redirect_uri
+        self.scopes = scopes or []
+        self.code_verifier: str | None = None
+        self.credentials = _fake_credentials(scopes=self.scopes)
+
+    @classmethod
+    def from_client_config(
+        cls,
+        client_config: dict[str, Any],
+        scopes: list[str],
+        redirect_uri: str | None = None,
+        **_: Any,
+    ) -> _FakeFlow:
+        del client_config
+        return cls(redirect_uri=redirect_uri, scopes=scopes)
+
+    def authorization_url(self, *, state: str, **_: Any) -> tuple[str, str]:
+        if not self.redirect_uri:
+            raise ValueError("Fake Google Flow requires redirect_uri.")
+
+        parsed = urlparse(self.redirect_uri)
+        query = parse_qs(parsed.query)
+        query["code"] = ["fake-native-drive-oauth-code"]
+        query["state"] = [state]
+        redirect = urlunparse(parsed._replace(query=urlencode(query, doseq=True)))
+        return redirect, state
+
+    def fetch_token(self, *, code: str, **_: Any) -> None:
+        if code != "fake-native-drive-oauth-code":
+            raise ValueError(f"Unexpected fake Google OAuth code: {code!r}")
+        self.credentials = _fake_credentials(scopes=self.scopes)
+
+
+def _fake_credentials(*, scopes: list[str] | None = None) -> Credentials:
+    return Credentials(
+        token="fake-native-drive-access-token",
+        refresh_token="fake-native-drive-refresh-token",
+        token_uri="https://oauth2.googleapis.com/token",
+        client_id="fake-native-drive-client-id",
+        client_secret="fake-native-drive-client-secret",
+        scopes=scopes or ["https://www.googleapis.com/auth/drive"],
+        expiry=datetime.now(UTC).replace(tzinfo=None) + timedelta(hours=1),
+    )
+
+
+class _FakeRequest(_StrictFakeMixin):
+    _component_name = "request"
+
+    def __init__(self, payload: Any):
+        self.payload = payload
+        self.http = None
+
+    def execute(self, **_: Any) -> Any:
+        return self.payload
+
+
+class _FakeMediaRequest(_StrictFakeMixin):
+    _component_name = "media_request"
+
+    def __init__(self, content: bytes):
+        self.content = content
+        self.http = None
+
+
+class _FakeMediaIoBaseDownload(_StrictFakeMixin):
+    _component_name = "MediaIoBaseDownload"
+
+    def __init__(self, fd, request: _FakeMediaRequest, chunksize: int | None = None):
+        del chunksize
+        self.fd = fd
+        self.request = request
+        self._done = False
+
+    def next_chunk(self) -> tuple[None, bool]:
+        if not self._done:
+            self.fd.write(self.request.content)
+            self._done = True
+        return None, True
+
+
+class _FakeDriveFiles(_StrictFakeMixin):
+    _component_name = "drive.files"
+
+    def list(self, **kwargs: Any) -> _FakeRequest:
+        q = kwargs.get("q", "")
+        folder_id = "root"
+        if "in parents" in q:
+            try:
+                folder_id = q.split("'")[1]
+            except IndexError:
+                folder_id = "root"
+
+        files = _filter_drive_files_for_query(q, _DRIVE_FIXTURE.get(folder_id, []))
+        return _FakeRequest({"files": files, "nextPageToken": None})
+
+    def get(self, **kwargs: Any) -> _FakeRequest:
+        file_id = kwargs.get("fileId")
+        metadata = _drive_get_metadata(file_id)
+        return _FakeRequest(metadata)
+
+    def get_media(self, **kwargs: Any) -> _FakeMediaRequest:
+        file_id = kwargs.get("fileId")
+        content = _resolve_file_bytes(_DRIVE_FIXTURE, file_id, _FIXTURES_DIR)
+        if content is None:
+            raise NotImplementedError(
+                f"E2E native Google fake has no content for fileId={file_id!r}."
+            )
+        return _FakeMediaRequest(content)
+
+    def export(self, **kwargs: Any) -> _FakeRequest:
+        file_id = kwargs.get("fileId")
+        content = _DRIVE_FIXTURE.get("_file_contents", {}).get(file_id)
+        if content is None:
+            raise NotImplementedError(
+                f"E2E native Google fake has no export content for fileId={file_id!r}."
+            )
+        return _FakeRequest(content.encode("utf-8"))
+
+
+class _FakeDriveChanges(_StrictFakeMixin):
+    _component_name = "drive.changes"
+
+    def getStartPageToken(self, **_: Any) -> _FakeRequest:  # noqa: N802
+        return _FakeRequest({"startPageToken": "fake-native-start-page-token-1"})
+
+    def list(self, **_: Any) -> _FakeRequest:
+        return _FakeRequest(
+            {"changes": [], "newStartPageToken": "fake-native-start-page-token-1"}
+        )
+
+
+class _FakeDriveService(_StrictFakeMixin):
+    _component_name = "drive_service"
+
+    def files(self) -> _FakeDriveFiles:
+        return _FakeDriveFiles()
+
+    def changes(self) -> _FakeDriveChanges:
+        return _FakeDriveChanges()
+
+
+class _FakeGmailUsers(_StrictFakeMixin):
+    _component_name = "gmail.users"
+
+    def getProfile(self, **kwargs: Any) -> _FakeRequest:  # noqa: N802
+        user_id = kwargs.get("userId")
+        if user_id != "me":
+            raise NotImplementedError(
+                f"Unexpected fake Gmail profile userId={user_id!r}"
+            )
+        return _FakeRequest({"emailAddress": "native-drive-e2e@surfsense.example"})
+
+    def messages(self) -> _FakeGmailMessages:
+        return _FakeGmailMessages()
+
+
+class _FakeGmailMessages(_StrictFakeMixin):
+    _component_name = "gmail.messages"
+
+    def list(self, **kwargs: Any) -> _FakeRequest:
+        user_id = kwargs.get("userId")
+        if user_id != "me":
+            raise NotImplementedError(f"Unexpected fake Gmail list userId={user_id!r}")
+
+        max_results = int(kwargs.get("maxResults", 10) or 10)
+        messages = list(_GMAIL_FIXTURE.get("messages", []))[:max_results]
+        return _FakeRequest(
+            {
+                "messages": [
+                    {
+                        "id": message["id"],
+                        "threadId": message.get("threadId"),
+                    }
+                    for message in messages
+                ],
+                "resultSizeEstimate": len(messages),
+            }
+        )
+
+    def get(self, **kwargs: Any) -> _FakeRequest:
+        user_id = kwargs.get("userId")
+        if user_id != "me":
+            raise NotImplementedError(f"Unexpected fake Gmail get userId={user_id!r}")
+
+        message_id = kwargs.get("id")
+        detail = _GMAIL_FIXTURE.get("details", {}).get(message_id)
+        if detail is None:
+            raise NotImplementedError(
+                f"E2E native Gmail fake has no message detail for id={message_id!r}."
+            )
+        return _FakeRequest(_gmail_detail_to_native_message(detail))
+
+
+class _FakeGmailService(_StrictFakeMixin):
+    _component_name = "gmail_service"
+
+    def users(self) -> _FakeGmailUsers:
+        return _FakeGmailUsers()
+
+
+class _FakeCalendarEvents(_StrictFakeMixin):
+    _component_name = "calendar.events"
+
+    def list(self, **kwargs: Any) -> _FakeRequest:
+        calendar_id = kwargs.get("calendarId")
+        if calendar_id != "primary":
+            raise NotImplementedError(
+                f"Unexpected fake Calendar events calendarId={calendar_id!r}"
+            )
+
+        max_results = int(kwargs.get("maxResults", 250) or 250)
+        time_min = kwargs.get("timeMin")
+        time_max = kwargs.get("timeMax")
+        items = [
+            event
+            for event in _CALENDAR_FIXTURE.get("items", [])
+            if _calendar_event_in_range(event, time_min, time_max)
+        ][:max_results]
+
+        return _FakeRequest(
+            {
+                "items": items,
+                "summary": "native-calendar-e2e@surfsense.example",
+                "timeZone": "UTC",
+            }
+        )
+
+
+class _FakeCalendarService(_StrictFakeMixin):
+    _component_name = "calendar_service"
+
+    def events(self) -> _FakeCalendarEvents:
+        return _FakeCalendarEvents()
+
+
+def _fake_build(service_name: str, version: str, **_: Any) -> Any:
+    if service_name == "drive" and version == "v3":
+        return _FakeDriveService()
+    if service_name == "gmail" and version == "v1":
+        return _FakeGmailService()
+    if service_name == "calendar" and version == "v3":
+        return _FakeCalendarService()
+    raise NotImplementedError(
+        f"E2E native Google fake cannot build {service_name!r} {version!r}."
+    )
+
+
+def _extract_quoted_value(q: str, anchor: str) -> str | None:
+    anchor_idx = q.find(anchor)
+    if anchor_idx == -1:
+        return None
+    after_anchor = q[anchor_idx + len(anchor) :]
+    first_quote_idx = after_anchor.find("'")
+    if first_quote_idx == -1:
+        return None
+    after_first_quote = after_anchor[first_quote_idx + 1 :]
+    second_quote_idx = after_first_quote.find("'")
+    if second_quote_idx == -1:
+        return None
+    return after_first_quote[:second_quote_idx]
+
+
+def _filter_drive_files_for_query(
+    q: str, files: list[dict[str, Any]]
+) -> list[dict[str, Any]]:
+    filtered = list(files)
+
+    if "trashed = false" in q:
+        filtered = [entry for entry in filtered if entry.get("trashed") is not True]
+
+    excluded_mime_type = _extract_quoted_value(q, "mimeType !=")
+    if excluded_mime_type:
+        filtered = [
+            entry for entry in filtered if entry.get("mimeType") != excluded_mime_type
+        ]
+
+    included_mime_type = _extract_quoted_value(q, "mimeType =")
+    if included_mime_type:
+        filtered = [
+            entry for entry in filtered if entry.get("mimeType") == included_mime_type
+        ]
+
+    return filtered
+
+
+def _drive_get_metadata(file_id: str | None) -> dict[str, Any]:
+    for items in _DRIVE_FIXTURE.values():
+        if not isinstance(items, list):
+            continue
+        for entry in items:
+            if entry.get("id") == file_id:
+                return dict(entry)
+    raise NotImplementedError(
+        f"E2E native Google fake has no metadata for fileId={file_id!r}."
+    )
+
+
+def _parse_rfc3339(value: str | None) -> datetime | None:
+    if not value:
+        return None
+    normalized = value.replace("Z", "+00:00")
+    return datetime.fromisoformat(normalized)
+
+
+def _calendar_event_start(event: dict[str, Any]) -> datetime | None:
+    start = event.get("start", {})
+    value = start.get("dateTime") or start.get("date")
+    parsed = _parse_rfc3339(value)
+    if parsed and parsed.tzinfo is None:
+        return parsed.replace(tzinfo=UTC)
+    return parsed
+
+
+def _calendar_event_in_range(
+    event: dict[str, Any], time_min: str | None, time_max: str | None
+) -> bool:
+    event_start = _calendar_event_start(event)
+    if event_start is None:
+        return True
+
+    parsed_min = _parse_rfc3339(time_min)
+    parsed_max = _parse_rfc3339(time_max)
+    if parsed_min and event_start < parsed_min:
+        return False
+    return not (parsed_max and event_start > parsed_max)
+
+
+def _gmail_detail_to_native_message(detail: dict[str, Any]) -> dict[str, Any]:
+    message_text = detail.get("messageText", "")
+    encoded_body = base64.urlsafe_b64encode(message_text.encode("utf-8")).decode(
+        "ascii"
+    )
+
+    return {
+        "id": detail.get("id"),
+        "threadId": detail.get("threadId"),
+        "labelIds": ["INBOX", "IMPORTANT"],
+        "snippet": message_text[:160],
+        "payload": {
+            "mimeType": "text/plain",
+            "headers": [
+                {"name": "Subject", "value": detail.get("subject", "No Subject")},
+                {"name": "From", "value": detail.get("from", "Unknown Sender")},
+                {"name": "To", "value": detail.get("to", "Unknown Recipient")},
+                {"name": "Date", "value": detail.get("date", "Unknown Date")},
+            ],
+            "body": {
+                "data": encoded_body,
+                "size": len(message_text),
+            },
+        },
+    }
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch production bindings to use native Google SDK fakes."""
+    targets = [
+        ("app.routes.google_drive_add_connector_route.Flow", _FakeFlow),
+        ("app.routes.google_gmail_add_connector_route.Flow", _FakeFlow),
+        ("app.routes.google_calendar_add_connector_route.Flow", _FakeFlow),
+        ("app.connectors.google_drive.client.build", _fake_build),
+        ("app.connectors.google_gmail_connector.build", _fake_build),
+        ("app.connectors.google_calendar_connector.build", _fake_build),
+        ("app.agents.new_chat.tools.google_calendar.create_event.build", _fake_build),
+        ("app.agents.new_chat.tools.google_calendar.update_event.build", _fake_build),
+        ("app.agents.new_chat.tools.google_calendar.delete_event.build", _fake_build),
+        ("googleapiclient.http.MediaIoBaseDownload", _FakeMediaIoBaseDownload),
+        (
+            "app.connectors.google_drive.client._build_thread_http",
+            lambda credentials: None,
+        ),
+    ]
+    for target, replacement in targets:
+        p = patch(target, replacement)
+        p.start()
+        active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/notion_module.py b/surfsense_backend/tests/e2e/fakes/notion_module.py
new file mode 100644
index 0000000000..6ad2bc1afd
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/notion_module.py
@@ -0,0 +1,206 @@
+"""Strict Notion OAuth/API fakes for Playwright E2E."""
+
+from __future__ import annotations
+
+import json
+import types
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "notion_pages.json"
+_TOKEN_URL = "https://api.notion.com/v1/oauth/token"
+_ACCESS_TOKEN = "fake-notion-access-token"
+_REFRESH_TOKEN = "fake-notion-refresh-token"
+
+
+def _load_fixture() -> dict[str, Any]:
+    with _FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_FIXTURE = _load_fixture()
+
+
+class _StrictFakeMixin:
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E Notion fake missing surface: {self._component_name}.{name!r}. "
+            "Add it to surfsense_backend/tests/e2e/fakes/notion_module.py."
+        )
+
+
+class APIResponseError(Exception):
+    def __init__(
+        self,
+        message: str,
+        *,
+        status: int = 400,
+        code: str = "validation_error",
+        headers: dict[str, str] | None = None,
+        body: Any | None = None,
+    ):
+        super().__init__(message)
+        self.status = status
+        self.code = code
+        self.headers = headers or {}
+        self.body = body or {"message": message}
+
+
+errors = types.ModuleType("notion_client.errors")
+errors.APIResponseError = APIResponseError
+
+
+class _FakeBlocksChildren(_StrictFakeMixin):
+    _component_name = "notion.blocks.children"
+
+    async def list(self, **kwargs: Any) -> dict[str, Any]:
+        block_id = kwargs.get("block_id")
+        start_cursor = kwargs.get("start_cursor")
+        if start_cursor is not None:
+            raise NotImplementedError(
+                f"E2E Notion fake does not model block pagination cursor={start_cursor!r}."
+            )
+
+        blocks = _FIXTURE.get("blocks", {}).get(block_id)
+        if blocks is None:
+            raise APIResponseError(
+                f"Could not find block: {block_id}",
+                status=404,
+                code="object_not_found",
+                body={"message": f"Could not find block: {block_id}"},
+            )
+        return {
+            "object": "list",
+            "results": blocks,
+            "has_more": False,
+            "next_cursor": None,
+        }
+
+
+class _FakeBlocks(_StrictFakeMixin):
+    _component_name = "notion.blocks"
+
+    def __init__(self) -> None:
+        self.children = _FakeBlocksChildren()
+
+
+class AsyncClient(_StrictFakeMixin):
+    _component_name = "notion.AsyncClient"
+
+    def __init__(self, *, auth: str, **kwargs: Any):
+        del kwargs
+        if auth != _ACCESS_TOKEN:
+            raise ValueError(f"Unexpected fake Notion auth token: {auth!r}")
+        self.auth = auth
+        self.blocks = _FakeBlocks()
+
+    async def search(self, **kwargs: Any) -> dict[str, Any]:
+        unsupported = set(kwargs) - {"filter", "sort", "start_cursor"}
+        if unsupported:
+            raise NotImplementedError(
+                f"E2E Notion fake search got unsupported kwargs: {sorted(unsupported)}"
+            )
+        if kwargs.get("start_cursor") is not None:
+            raise NotImplementedError(
+                f"E2E Notion fake does not model search cursor={kwargs['start_cursor']!r}."
+            )
+        expected_filter = {"value": "page", "property": "object"}
+        if kwargs.get("filter") != expected_filter:
+            raise NotImplementedError(
+                f"E2E Notion fake search expected filter={expected_filter!r}, "
+                f"got {kwargs.get('filter')!r}."
+            )
+        return {
+            "object": "list",
+            "results": _FIXTURE.get("pages", []),
+            "has_more": False,
+            "next_cursor": None,
+        }
+
+    async def aclose(self) -> None:
+        return None
+
+
+class _FakeTokenResponse(_StrictFakeMixin):
+    _component_name = "notion.oauth.response"
+
+    def __init__(self, payload: dict[str, Any], status_code: int = 200):
+        self._payload = payload
+        self.status_code = status_code
+        self.text = json.dumps(payload, sort_keys=True)
+
+    def json(self) -> dict[str, Any]:
+        return self._payload
+
+
+class _FakeHttpxAsyncClient(_StrictFakeMixin):
+    _component_name = "httpx.AsyncClient"
+
+    def __init__(self, *args: Any, **kwargs: Any):
+        del args, kwargs
+
+    async def __aenter__(self) -> _FakeHttpxAsyncClient:
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+    async def post(self, url: str, **kwargs: Any) -> _FakeTokenResponse:
+        if url != _TOKEN_URL:
+            raise NotImplementedError(f"Unexpected Notion OAuth POST url={url!r}")
+
+        data = kwargs.get("json") or {}
+        headers = kwargs.get("headers") or {}
+        if "Authorization" not in headers:
+            raise ValueError(
+                "Notion OAuth token exchange missing Authorization header."
+            )
+
+        grant_type = data.get("grant_type")
+        if grant_type == "authorization_code":
+            if data.get("code") != "fake-notion-oauth-code":
+                raise ValueError(
+                    f"Unexpected fake Notion OAuth code: {data.get('code')!r}"
+                )
+        elif grant_type == "refresh_token":
+            if data.get("refresh_token") != _REFRESH_TOKEN:
+                raise ValueError(
+                    f"Unexpected fake Notion refresh token: {data.get('refresh_token')!r}"
+                )
+        else:
+            raise ValueError(f"Unexpected fake Notion grant_type: {grant_type!r}")
+
+        return _FakeTokenResponse(
+            {
+                "access_token": _ACCESS_TOKEN,
+                "refresh_token": _REFRESH_TOKEN,
+                "expires_in": 3600,
+                "workspace_id": "fake-notion-workspace-001",
+                "workspace_name": "SurfSense E2E Notion Workspace",
+                "workspace_icon": "https://surfsense.example/notion-icon.png",
+                "bot_id": "fake-notion-bot-001",
+            }
+        )
+
+
+class _FakeHttpxModule(_StrictFakeMixin):
+    _component_name = "httpx"
+
+    AsyncClient = _FakeHttpxAsyncClient
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch production bindings that cannot be covered by sys.modules hijack."""
+    targets = [
+        (
+            "app.routes.notion_add_connector_route.httpx",
+            _FakeHttpxModule(),
+        ),
+    ]
+    for target, replacement in targets:
+        p = patch(target, replacement)
+        p.start()
+        active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/onedrive_graph.py b/surfsense_backend/tests/e2e/fakes/onedrive_graph.py
new file mode 100644
index 0000000000..1fdacdc0e4
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/onedrive_graph.py
@@ -0,0 +1,190 @@
+"""Strict Microsoft OneDrive Graph fakes for Playwright E2E.
+
+This module patches the OneDrive OAuth route and indexer consumer-site
+bindings. It keeps the production add/callback/indexing flow intact while
+serving deterministic Microsoft-shaped token, profile, metadata, and file
+content responses.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+import httpx
+
+from .binary_loader import _resolve_file_bytes
+
+_FIXTURES_DIR = Path(__file__).parent / "fixtures"
+_ONEDRIVE_FIXTURE_PATH = _FIXTURES_DIR / "onedrive_files.json"
+
+
+def _load_onedrive_fixture() -> dict[str, Any]:
+    with _ONEDRIVE_FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_ONEDRIVE_FIXTURE = _load_onedrive_fixture()
+
+
+class _StrictFakeMixin:
+    _component_name: str = "<unknown>"
+
+    def __getattr__(self, name: str) -> Any:
+        raise NotImplementedError(
+            f"E2E OneDrive fake missing surface: "
+            f"{self._component_name}.{name!r}. Add it to "
+            f"surfsense_backend/tests/e2e/fakes/onedrive_graph.py."
+        )
+
+
+class _FakeOneDriveClient(_StrictFakeMixin):
+    _component_name = "OneDriveClient"
+
+    def __init__(self, session: Any, connector_id: int):
+        self._session = session
+        self._connector_id = connector_id
+
+    async def list_children(
+        self, item_id: str = "root"
+    ) -> tuple[list[dict[str, Any]], str | None]:
+        items = _ONEDRIVE_FIXTURE.get(item_id)
+        if not isinstance(items, list):
+            return [], f"E2E OneDrive fake has no children for item_id={item_id!r}."
+        return [dict(item) for item in items], None
+
+    async def get_item_metadata(
+        self, item_id: str
+    ) -> tuple[dict[str, Any] | None, str | None]:
+        metadata = _onedrive_get_metadata(item_id)
+        if metadata is None:
+            return None, f"E2E OneDrive fake has no metadata for item_id={item_id!r}."
+        return metadata, None
+
+    async def download_file(self, item_id: str) -> tuple[bytes | None, str | None]:
+        content = _resolve_file_bytes(_ONEDRIVE_FIXTURE, item_id, _FIXTURES_DIR)
+        if content is None:
+            return None, f"E2E OneDrive fake has no content for item_id={item_id!r}."
+        return content, None
+
+    async def download_file_to_disk(self, item_id: str, dest_path: str) -> str | None:
+        content = _resolve_file_bytes(_ONEDRIVE_FIXTURE, item_id, _FIXTURES_DIR)
+        if content is None:
+            return f"E2E OneDrive fake has no content for item_id={item_id!r}."
+        with open(dest_path, "wb") as f:
+            f.write(content)
+        return None
+
+    async def get_delta(
+        self, folder_id: str | None = None, delta_link: str | None = None
+    ) -> tuple[list[dict[str, Any]], str | None, str | None]:
+        folder_key = folder_id or "root"
+        if delta_link:
+            folder_key = delta_link.rsplit("/", 1)[-1].removesuffix("-delta")
+        if folder_key not in _ONEDRIVE_FIXTURE:
+            return (
+                [],
+                None,
+                f"E2E OneDrive fake has no delta for folder={folder_key!r}.",
+            )
+        return (
+            [],
+            f"https://graph.microsoft.com/v1.0/fake-delta/{folder_key}-delta",
+            None,
+        )
+
+
+class _FakeAsyncClient(_StrictFakeMixin):
+    _component_name = "httpx.AsyncClient"
+
+    def __init__(self, *args: Any, **kwargs: Any):
+        del args, kwargs
+
+    async def __aenter__(self) -> _FakeAsyncClient:
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        del exc_type, exc, tb
+
+    async def post(self, url: str, *args: Any, **kwargs: Any) -> httpx.Response:
+        del args, kwargs
+        if "login.microsoftonline.com" in url and url.endswith("/token"):
+            return _json_response(
+                "POST",
+                url,
+                {
+                    "access_token": "fake-onedrive-access-token",
+                    "refresh_token": "fake-onedrive-refresh-token",
+                    "token_type": "Bearer",
+                    "expires_in": 3600,
+                    "scope": "offline_access User.Read Files.Read.All Files.ReadWrite.All",
+                },
+            )
+        raise NotImplementedError(f"E2E OneDrive fake unexpected POST URL: {url!r}")
+
+    async def get(self, url: str, *args: Any, **kwargs: Any) -> httpx.Response:
+        del args, kwargs
+        if url == "https://graph.microsoft.com/v1.0/me":
+            return _json_response(
+                "GET",
+                url,
+                {
+                    "mail": "onedrive-e2e@surfsense.example",
+                    "userPrincipalName": "onedrive-e2e@surfsense.example",
+                    "displayName": "SurfSense OneDrive E2E",
+                },
+            )
+        raise NotImplementedError(f"E2E OneDrive fake unexpected GET URL: {url!r}")
+
+    async def request(
+        self, method: str, url: str, *args: Any, **kwargs: Any
+    ) -> httpx.Response:
+        del args, kwargs
+        raise NotImplementedError(
+            f"E2E OneDrive fake unexpected request: {method!r} {url!r}"
+        )
+
+
+class _FakeHttpxModule(_StrictFakeMixin):
+    _component_name = "httpx"
+
+    AsyncClient = _FakeAsyncClient
+
+
+def _json_response(
+    method: str, url: str, payload: dict[str, Any], status_code: int = 200
+) -> httpx.Response:
+    return httpx.Response(
+        status_code=status_code,
+        json=payload,
+        request=httpx.Request(method, url),
+    )
+
+
+def _onedrive_get_metadata(item_id: str | None) -> dict[str, Any] | None:
+    for items in _ONEDRIVE_FIXTURE.values():
+        if not isinstance(items, list):
+            continue
+        for entry in items:
+            if entry.get("id") == item_id:
+                return dict(entry)
+    return None
+
+
+def install(active_patches: list[Any]) -> None:
+    """Patch production OneDrive bindings to use strict Graph fakes."""
+    targets = [
+        ("app.routes.onedrive_add_connector_route.httpx", _FakeHttpxModule()),
+        ("app.routes.onedrive_add_connector_route.OneDriveClient", _FakeOneDriveClient),
+        (
+            "app.tasks.connector_indexers.onedrive_indexer.OneDriveClient",
+            _FakeOneDriveClient,
+        ),
+        ("app.connectors.onedrive.client.httpx", _FakeHttpxModule()),
+    ]
+    for target, replacement in targets:
+        p = patch(target, replacement)
+        p.start()
+        active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fakes/slack_module.py b/surfsense_backend/tests/e2e/fakes/slack_module.py
new file mode 100644
index 0000000000..e63c6e049e
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fakes/slack_module.py
@@ -0,0 +1,213 @@
+"""Strict Slack MCP OAuth/tool fakes for Playwright E2E."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+from unittest.mock import patch
+
+from tests.e2e.fakes import mcp_oauth_runtime, mcp_runtime
+
+_FIXTURE_PATH = Path(__file__).parent / "fixtures" / "slack_messages.json"
+
+_AUTHORIZATION_URL = "https://slack.com/oauth/v2_user/authorize"
+_REGISTRATION_URL = "https://e2e-fake.invalid/mcp/slack-unused-register"
+_TOKEN_URL = "https://slack.com/api/oauth.v2.user.access"
+_MCP_URL = "https://mcp.slack.com/mcp"
+
+_CLIENT_ID = "fake-slack-mcp-client-id"
+_CLIENT_SECRET = "fake-slack-mcp-client-secret"
+_ACCESS_TOKEN = "fake-slack-mcp-access-token"
+_REFRESH_TOKEN = "fake-slack-mcp-refresh-token"
+_OAUTH_CODE = "fake-slack-oauth-code"
+_SCOPE = (
+    "search:read.public search:read.private search:read.mpim search:read.im "
+    "channels:history groups:history mpim:history im:history"
+)
+
+
+def _load_fixture() -> dict[str, Any]:
+    with _FIXTURE_PATH.open() as f:
+        return json.load(f)
+
+
+_FIXTURE = _load_fixture()
+
+
+async def _list_tools() -> SimpleNamespace:
+    return SimpleNamespace(
+        tools=[
+            SimpleNamespace(
+                name="slack_search_channels",
+                description="Search Slack channels visible to the authenticated user.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "query": {
+                            "type": "string",
+                            "description": "Text to search for in Slack channel names.",
+                        },
+                        "limit": {
+                            "type": "integer",
+                            "description": "Maximum number of channels to return.",
+                        },
+                    },
+                    "required": [],
+                },
+            ),
+            SimpleNamespace(
+                name="slack_read_channel",
+                description="Read messages from a Slack channel.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "channel_id": {
+                            "type": "string",
+                            "description": "Slack channel id.",
+                        }
+                    },
+                    "required": ["channel_id"],
+                },
+            ),
+            SimpleNamespace(
+                name="slack_read_thread",
+                description="Read a Slack thread from a channel.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "channel_id": {
+                            "type": "string",
+                            "description": "Slack channel id.",
+                        },
+                        "thread_ts": {
+                            "type": "string",
+                            "description": "Slack thread timestamp.",
+                        },
+                    },
+                    "required": ["channel_id", "thread_ts"],
+                },
+            ),
+        ]
+    )
+
+
+async def _call_tool(
+    tool_name: str, arguments: dict[str, Any] | None = None
+) -> SimpleNamespace:
+    arguments = arguments or {}
+    channel = _FIXTURE["channel"]
+    message = _FIXTURE["messages"][0]
+
+    if tool_name == "slack_search_channels":
+        query = str(arguments.get("query", ""))
+        if query and channel["name"].lower() not in query.lower():
+            raise ValueError(f"Unexpected Slack channel query: {query!r}")
+        text = (
+            f"#{channel['name']} ({channel['id']})\n"
+            f"purpose: {channel['purpose']}\n"
+            f"latest_message: {message['text']}"
+        )
+        return SimpleNamespace(content=[SimpleNamespace(text=text)])
+
+    if tool_name in {"slack_read_channel", "slack_read_thread"}:
+        raise NotImplementedError(
+            f"Slack E2E fake does not exercise {tool_name!r}; "
+            "extend slack_module.py before using it in a journey."
+        )
+
+    raise NotImplementedError(f"Unexpected Slack MCP tool call: {tool_name!r}")
+
+
+async def _fake_exchange_code_for_tokens(
+    token_endpoint: str,
+    code: str,
+    redirect_uri: str,
+    client_id: str,
+    client_secret: str,
+    code_verifier: str,
+    *,
+    timeout: float = 30.0,
+) -> dict[str, Any]:
+    if token_endpoint != _TOKEN_URL:
+        return await mcp_oauth_runtime._fake_exchange_code_for_tokens(
+            token_endpoint,
+            code,
+            redirect_uri,
+            client_id,
+            client_secret,
+            code_verifier,
+            timeout=timeout,
+        )
+    del timeout
+
+    if code != _OAUTH_CODE:
+        raise ValueError(f"Unexpected fake Slack OAuth code: {code!r}")
+    if "/api/v1/auth/mcp/slack/connector/callback" not in redirect_uri:
+        raise ValueError(f"Unexpected Slack redirect_uri={redirect_uri!r}")
+    if client_id != _CLIENT_ID or client_secret != _CLIENT_SECRET:
+        raise ValueError(
+            "Unexpected Slack client credentials: "
+            f"client_id={client_id!r} client_secret={client_secret!r}"
+        )
+    if not code_verifier:
+        raise ValueError("Slack token exchange missing code_verifier.")
+
+    team = _FIXTURE["team"]
+    return {
+        "ok": True,
+        "scope": _SCOPE,
+        "token_type": "Bearer",
+        "expires_in": 3600,
+        "refresh_token": _REFRESH_TOKEN,
+        "authed_user": {
+            "id": "U_FAKE_SLACK_USER",
+            "scope": _SCOPE,
+            "access_token": _ACCESS_TOKEN,
+            "refresh_token": _REFRESH_TOKEN,
+            "expires_in": 3600,
+            "token_type": "Bearer",
+        },
+        "team": {
+            "id": team["id"],
+            "name": team["name"],
+        },
+    }
+
+
+def install(active_patches: list[Any]) -> None:
+    """Register Slack MCP OAuth/tool handlers with the shared dispatchers."""
+    mcp_oauth_runtime.register_service(
+        mcp_url=_MCP_URL,
+        discovery_metadata={
+            "issuer": "https://slack.com",
+            "authorization_endpoint": _AUTHORIZATION_URL,
+            "token_endpoint": _TOKEN_URL,
+            "registration_endpoint": _REGISTRATION_URL,
+            "code_challenge_methods_supported": ["S256"],
+            "grant_types_supported": ["authorization_code", "refresh_token"],
+            "response_types_supported": ["code"],
+        },
+        client_id=_CLIENT_ID,
+        client_secret=_CLIENT_SECRET,
+        token_endpoint=_TOKEN_URL,
+        registration_endpoint=_REGISTRATION_URL,
+        oauth_code=_OAUTH_CODE,
+        access_token=_ACCESS_TOKEN,
+        refresh_token=_REFRESH_TOKEN,
+        scope=_SCOPE,
+        redirect_uri_substring="/api/v1/auth/mcp/slack/connector/callback",
+    )
+    mcp_runtime.register(
+        url=_MCP_URL,
+        expected_bearer=_ACCESS_TOKEN,
+        list_tools=_list_tools,
+        call_tool=_call_tool,
+    )
+    p = patch(
+        "app.services.mcp_oauth.discovery.exchange_code_for_tokens",
+        _fake_exchange_code_for_tokens,
+    )
+    p.start()
+    active_patches.append(p)
diff --git a/surfsense_backend/tests/e2e/fixtures/global_llm_config.yaml b/surfsense_backend/tests/e2e/fixtures/global_llm_config.yaml
new file mode 100644
index 0000000000..017fa1eb30
--- /dev/null
+++ b/surfsense_backend/tests/e2e/fixtures/global_llm_config.yaml
@@ -0,0 +1,71 @@
+# Synthetic Global LLM configuration for E2E ONLY.
+#
+# Why this file exists:
+#   surfsense_backend/app/config/global_llm_config.yaml is gitignored
+#   (operators ship real API keys there). In CI that file does not exist,
+#   so app.config.load_global_llm_configs() returns [], every chat-stream
+#   test fails fast with "No usable global LLM configs are available for
+#   Auto mode" raised by auto_model_pin_service._global_candidates().
+#
+# What this file does:
+#   tests/e2e/run_backend.py and tests/e2e/run_celery.py copy this file
+#   to app/config/global_llm_config.yaml at startup, BEFORE app.config
+#   is imported. The copy lives only inside the E2E Docker container.
+#
+# Why a fake api_key is safe:
+#   tests.e2e.fakes.chat_llm patches
+#     app.tasks.chat.stream_new_chat.create_chat_litellm_from_agent_config
+#     app.tasks.chat.stream_new_chat.create_chat_litellm_from_config
+#   so the resolved auto-pin id is never sent to a real LLM provider.
+#   The values below only need to pass
+#   auto_model_pin_service._is_usable_global_config()
+#   which requires id / model_name / provider / api_key all truthy.
+#
+# Why TWO entries (premium + free):
+#   auto_model_pin_service.resolve_or_get_pinned_llm_config_id() splits
+#   candidates by billing_tier based on _is_premium_eligible(user):
+#     premium_eligible == True  -> keeps only tier=="premium" configs
+#     premium_eligible == False -> keeps only tier!="premium" configs
+#   A single-tier fixture would fail one of the two branches with
+#   "Auto mode could not find an eligible LLM config for this user and
+#   quota state". Shipping one of each guarantees every quota state
+#   resolves to a viable pin in E2E.
+
+router_settings:
+  routing_strategy: "simple-shuffle"
+  num_retries: 0
+  allowed_fails: 1
+  cooldown_time: 1
+
+global_llm_configs:
+  - id: -9001
+    name: "E2E Fake Auto Model (premium)"
+    billing_tier: "premium"
+    anonymous_enabled: false
+    seo_enabled: false
+    quality_score: 1.0
+    provider: "OPENAI"
+    model_name: "fake-e2e-model-premium"
+    api_key: "fake-e2e-api-key-not-for-production"
+    supports_image_input: false
+    quota_reserve_tokens: 1024
+    rpm: 1000
+    tpm: 100000
+    litellm_params:
+      model: "openai/fake-e2e-model-premium"
+
+  - id: -9002
+    name: "E2E Fake Auto Model (free)"
+    billing_tier: "free"
+    anonymous_enabled: false
+    seo_enabled: false
+    quality_score: 1.0
+    provider: "OPENAI"
+    model_name: "fake-e2e-model-free"
+    api_key: "fake-e2e-api-key-not-for-production"
+    supports_image_input: false
+    quota_reserve_tokens: 1024
+    rpm: 1000
+    tpm: 100000
+    litellm_params:
+      model: "openai/fake-e2e-model-free"
diff --git a/surfsense_backend/tests/e2e/middleware/__init__.py b/surfsense_backend/tests/e2e/middleware/__init__.py
new file mode 100644
index 0000000000..4713978895
--- /dev/null
+++ b/surfsense_backend/tests/e2e/middleware/__init__.py
@@ -0,0 +1,4 @@
+"""Test-only middleware. Mounted on the FastAPI `app` object inside
+`tests/e2e/run_backend.py`, never registered by production startup
+(`python main.py`).
+"""
diff --git a/surfsense_backend/tests/e2e/middleware/scenario.py b/surfsense_backend/tests/e2e/middleware/scenario.py
new file mode 100644
index 0000000000..9614dbc2be
--- /dev/null
+++ b/surfsense_backend/tests/e2e/middleware/scenario.py
@@ -0,0 +1,54 @@
+"""X-E2E-Scenario middleware.
+
+Reads the X-E2E-Scenario request header and pipes the value into a
+ContextVar that the strict fakes consult to switch between happy-path
+and error scenarios on a per-request basis.
+
+Mounted by tests/e2e/run_backend.py only. Production never adds this
+middleware, so production never reads the header.
+
+Supported scenarios:
+- "happy" (default): everything succeeds with deterministic fixtures.
+- "denied": Composio.connected_accounts.initiate returns a redirect URL
+  pointing at our callback with ?error=access_denied.
+- "auth_expired": GOOGLEDRIVE_LIST_FILES returns an authentication
+  failure that the route translates to connector.config.auth_expired.
+- "duplicate": no special fake behavior; the duplicate path is exercised
+  by running the OAuth flow twice with the same toolkit.
+"""
+
+from __future__ import annotations
+
+from contextvars import ContextVar
+
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from starlette.responses import Response
+from starlette.types import ASGIApp
+
+_scenario: ContextVar[str] = ContextVar("e2e_scenario", default="happy")
+
+
+def current_scenario() -> str:
+    """Return the active E2E scenario for the current request context."""
+    return _scenario.get()
+
+
+class ScenarioMiddleware(BaseHTTPMiddleware):
+    """Reads X-E2E-Scenario and exposes it via a ContextVar.
+
+    The header is also forwarded as state on the request so route
+    handlers can branch if they ever need to (Composio routes do not).
+    """
+
+    def __init__(self, app: ASGIApp) -> None:
+        super().__init__(app)
+
+    async def dispatch(self, request: Request, call_next) -> Response:
+        value = request.headers.get("X-E2E-Scenario", "happy")
+        token = _scenario.set(value)
+        try:
+            request.state.e2e_scenario = value
+            return await call_next(request)
+        finally:
+            _scenario.reset(token)
diff --git a/surfsense_backend/tests/e2e/run_backend.py b/surfsense_backend/tests/e2e/run_backend.py
new file mode 100644
index 0000000000..5a787ac520
--- /dev/null
+++ b/surfsense_backend/tests/e2e/run_backend.py
@@ -0,0 +1,374 @@
+"""E2E backend entrypoint.
+
+Hijacks third-party SDKs at sys.modules level BEFORE any production
+code is imported, then starts the same FastAPI app + uvicorn that
+`main.py` would run.
+
+Production code is byte-identical with or without this file:
+- `python main.py` is the production entrypoint (unchanged).
+- `python tests/e2e/run_backend.py` is the test entrypoint, never imported by production.
+- `surfsense_backend/.dockerignore` excludes `tests/`, so this file
+  physically does not exist in the production Docker image.
+
+Defense in depth (see Composio Drive E2E Phase 1 plan):
+1. sys.modules hijack here (Composio).
+2. Strict __getattr__ inside fakes (NotImplementedError on unknown surface).
+3. Network deny-list set in CI env (HTTPS_PROXY=http://127.0.0.1:1
+   plus sentinel API keys) so any leaked outbound HTTP fails loudly.
+
+Usage:
+    cd surfsense_backend
+    uv run python tests/e2e/run_backend.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import sys
+
+import uvicorn
+
+# Make the surfsense_backend root importable as a top-level package so
+# `import tests.e2e.fakes...` works regardless of how the entrypoint is
+# invoked (uv run python tests/e2e/run_backend.py from repo root or from
+# surfsense_backend/).
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+_BACKEND_ROOT = os.path.abspath(os.path.join(_THIS_DIR, "..", ".."))
+if _BACKEND_ROOT not in sys.path:
+    sys.path.insert(0, _BACKEND_ROOT)
+
+
+logger = logging.getLogger("surfsense.e2e.backend")
+
+# Patches started during bootstrap are kept alive for the lifetime of the
+# process. We never call .stop() on them.
+_active_patches: list = []
+
+
+def _hijack_external_sdks() -> None:
+    """Replace composio + notion_client in sys.modules.
+
+    Production does ``from composio import Composio`` and
+    ``import notion_client`` at import time. With this hijack in place,
+    those imports resolve to our strict fakes.
+
+    MUST run before _import_production_app().
+    """
+    import tests.e2e.fakes.composio_module as _fake_composio
+    import tests.e2e.fakes.notion_module as _fake_notion
+
+    sys.modules["composio"] = _fake_composio
+    sys.modules["notion_client"] = _fake_notion
+    sys.modules["notion_client.errors"] = _fake_notion.errors
+
+
+def _load_dotenv_and_set_env_defaults() -> None:
+    """Load .env and set every env var the production config reads on import.
+
+    MUST run before _import_production_app(), since app.config consumes
+    these values at import time.
+    """
+    from dotenv import load_dotenv
+
+    load_dotenv()
+
+    os.environ.setdefault(
+        "DATABASE_URL",
+        "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
+    )
+    os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+    os.environ.setdefault("REDIS_APP_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
+    os.environ.setdefault("SECRET_KEY", "local-e2e-secret-not-for-production")
+    os.environ.setdefault("AUTH_TYPE", "LOCAL")
+    os.environ.setdefault("REGISTRATION_ENABLED", "TRUE")
+    os.environ.setdefault("ETL_SERVICE", "DOCLING")
+    os.environ.setdefault("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+    os.environ.setdefault("NEXT_FRONTEND_URL", "http://localhost:3000")
+
+    # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+    os.environ.setdefault("COMPOSIO_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("COMPOSIO_ENABLED", "TRUE")
+    os.environ.setdefault("OPENAI_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("LITELLM_API_KEY", "local-deny-real-call-sentinel")
+
+    os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
+    os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
+    os.environ.setdefault(
+        "CONFLUENCE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/confluence/connector/callback",
+    )
+    os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
+    os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
+    os.environ.setdefault(
+        "NOTION_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/notion/connector/callback",
+    )
+    os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
+    os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
+    os.environ.setdefault(
+        "ONEDRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
+    )
+    os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
+    os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
+    os.environ.setdefault(
+        "DROPBOX_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
+    )
+    # Native Google OAuth — fake Flow in tests.e2e.fakes.native_google
+    # raises "Fake Google Flow requires redirect_uri." if these are empty,
+    # so connector/add routes return 500 in CI where no .env supplies them.
+    os.environ.setdefault(
+        "GOOGLE_DRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/drive/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_GMAIL_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_CALENDAR_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
+    )
+    os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
+    os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
+
+
+def _install_synthetic_global_llm_config() -> None:
+    """Materialise a fake ``app/config/global_llm_config.yaml`` for E2E.
+
+    The real file is gitignored (production operators ship their own with
+    real API keys), so a fresh CI checkout has no YAML at the path
+    ``app.config.load_global_llm_configs()`` reads. With an empty
+    ``GLOBAL_LLM_CONFIGS`` list, ``auto_model_pin_service`` raises
+    ``"No usable global LLM configs are available for Auto mode"`` on
+    every chat-stream request.
+
+    We copy the synthetic fixture from ``tests/e2e/fixtures/`` into the
+    production-expected location BEFORE ``_import_production_app()`` so
+    ``app.config`` picks it up on import. Production code is untouched —
+    this is purely a test-time scaffold.
+
+    Only installs when the destination is missing. A developer running
+    the E2E entrypoint locally keeps their real ``global_llm_config.yaml``
+    intact (the patched ``create_chat_litellm_from_*`` factories make the
+    actual model values irrelevant either way).
+
+    MUST run before _import_production_app().
+    """
+    import shutil
+
+    src = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
+    dst = os.path.join(_BACKEND_ROOT, "app", "config", "global_llm_config.yaml")
+
+    if not os.path.exists(src):
+        raise RuntimeError(
+            f"E2E synthetic global LLM config fixture missing at {src!r}. "
+            f"This file is checked into tests/e2e/fixtures/ — if it has gone "
+            f"missing, restore it from VCS before running the E2E entrypoint."
+        )
+
+    if os.path.exists(dst):
+        logger.info(
+            "[e2e-global-llm-config] %s already exists; leaving it alone "
+            "(local dev config preserved)",
+            dst,
+        )
+        return
+
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+    shutil.copyfile(src, dst)
+    logger.info("[e2e-global-llm-config] installed %s -> %s", src, dst)
+
+
+def _import_production_app():
+    """Import and return the production FastAPI app.
+
+    Every module under ``app.*`` loads here, creating their bindings.
+    The LLM/embedding factories captured at this point will be replaced
+    by patches in _patch_llm_bindings() below.
+    """
+    from app.app import app as production_app
+
+    return production_app
+
+
+def _patch_llm_bindings() -> None:
+    """Replace LLM factories at every known binding site."""
+    from unittest.mock import patch
+
+    from tests.e2e.fakes.chat_llm import (
+        fake_create_chat_litellm_from_agent_config,
+        fake_create_chat_litellm_from_config,
+    )
+    from tests.e2e.fakes.llm import fake_get_user_long_context_llm
+
+    targets = [
+        "app.services.llm_service.get_user_long_context_llm",
+        "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm",
+        "app.tasks.document_processors._save.get_user_long_context_llm",
+        "app.tasks.document_processors.markdown_processor.get_user_long_context_llm",
+    ]
+    for target in targets:
+        try:
+            p = patch(target, fake_get_user_long_context_llm)
+            p.start()
+            _active_patches.append(p)
+            logger.info("[fake-llm] patched %s", target)
+        except (ModuleNotFoundError, AttributeError) as exc:
+            # Some indexers may not be loaded in every env. Log and move
+            # on — but do not silently let a known binding through.
+            logger.warning(
+                "[fake-llm] could not patch %s: %s. If production code "
+                "uses this path in E2E it will hit the real provider; "
+                "update tests/e2e/run_backend.py.",
+                target,
+                exc,
+            )
+
+    chat_targets = [
+        (
+            "app.agents.new_chat.llm_config.create_chat_litellm_from_agent_config",
+            fake_create_chat_litellm_from_agent_config,
+        ),
+        (
+            "app.agents.new_chat.llm_config.create_chat_litellm_from_config",
+            fake_create_chat_litellm_from_config,
+        ),
+        (
+            "app.tasks.chat.stream_new_chat.create_chat_litellm_from_agent_config",
+            fake_create_chat_litellm_from_agent_config,
+        ),
+        (
+            "app.tasks.chat.stream_new_chat.create_chat_litellm_from_config",
+            fake_create_chat_litellm_from_config,
+        ),
+    ]
+    for target, replacement in chat_targets:
+        try:
+            p = patch(target, replacement)
+            p.start()
+            _active_patches.append(p)
+            logger.info("[fake-chat-llm] patched %s", target)
+        except (ModuleNotFoundError, AttributeError) as exc:
+            logger.warning("[fake-chat-llm] could not patch %s: %s.", target, exc)
+
+
+def _install_runtime_fakes() -> None:
+    """Run each fake's install() against the active patch stack."""
+    from tests.e2e.fakes import (
+        clickup_module as _fake_clickup_module,
+        confluence_indexer as _fake_confluence_indexer,
+        confluence_oauth as _fake_confluence_oauth,
+        docling_service as _fake_docling_service,
+        dropbox_api as _fake_dropbox_api,
+        embeddings as _fake_embeddings,
+        jira_module as _fake_jira_module,
+        linear_module as _fake_linear_module,
+        mcp_oauth_runtime as _fake_mcp_oauth_runtime,
+        mcp_runtime as _fake_mcp_runtime,
+        native_google as _fake_native_google,
+        notion_module as _fake_notion_module,
+        onedrive_graph as _fake_onedrive_graph,
+        slack_module as _fake_slack_module,
+    )
+
+    _fake_embeddings.install(_active_patches)
+    _fake_docling_service.install(_active_patches)
+    _fake_confluence_oauth.install(_active_patches)
+    _fake_confluence_indexer.install(_active_patches)
+    _fake_native_google.install(_active_patches)
+    _fake_onedrive_graph.install(_active_patches)
+    _fake_dropbox_api.install(_active_patches)
+    _fake_notion_module.install(_active_patches)
+    _fake_linear_module.install(_active_patches)
+    _fake_jira_module.install(_active_patches)
+    _fake_clickup_module.install(_active_patches)
+    _fake_mcp_runtime.install(_active_patches)
+    _fake_mcp_oauth_runtime.install(_active_patches)
+    _fake_slack_module.install(_active_patches)
+
+
+def _install_test_only_app_extensions(app) -> None:
+    """Mount test-only middleware + the /__e2e__ token mint router.
+
+    POST /__e2e__/auth/token bypasses /auth/jwt/login's 5/min/IP rate
+    limit so Playwright workers can authenticate without thrashing the
+    production auth surface. See tests/e2e/auth_mint.py.
+    """
+    from tests.e2e.auth_mint import install as install_e2e_mint
+    from tests.e2e.middleware.scenario import ScenarioMiddleware
+
+    app.add_middleware(ScenarioMiddleware)
+    install_e2e_mint(app)
+
+
+def _bootstrap():
+    """Run the full E2E bootstrap and return the production FastAPI app.
+
+    Ordering is load-bearing:
+      1) Hijack composio + notion_client in sys.modules.
+      2) Load .env + set env defaults (app.config reads env on import).
+      3) Configure logging.
+      4) Materialise the synthetic global_llm_config.yaml so Auto-mode
+         pin resolution finds at least one usable candidate.
+      5) Import production app (which transitively imports the now-faked
+         external SDKs and reads the env defaults + YAML).
+      6) Patch LLM / embedding bindings at every consumer site.
+      7) Mount test-only middleware + /__e2e__ routes onto the app.
+    """
+    _hijack_external_sdks()
+    _load_dotenv_and_set_env_defaults()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger.warning(
+        "*** SURFSENSE E2E BACKEND ENTRYPOINT — fake Composio + LLM + embeddings ***"
+    )
+
+    _install_synthetic_global_llm_config()
+    production_app = _import_production_app()
+    _patch_llm_bindings()
+    _install_runtime_fakes()
+    _install_test_only_app_extensions(production_app)
+    return production_app
+
+
+app = _bootstrap()
+
+
+def _main() -> None:
+    if sys.platform == "win32":
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+    host = os.getenv("UVICORN_HOST", "0.0.0.0")
+    port = int(os.getenv("UVICORN_PORT", "8000"))
+    log_level = os.getenv("UVICORN_LOG_LEVEL", "info")
+
+    config = uvicorn.Config(
+        app=app,
+        host=host,
+        port=port,
+        log_level=log_level,
+        reload=False,
+    )
+    server = uvicorn.Server(config)
+    server.run()
+
+
+if __name__ == "__main__":
+    _main()
diff --git a/surfsense_backend/tests/e2e/run_celery.py b/surfsense_backend/tests/e2e/run_celery.py
new file mode 100644
index 0000000000..e4091d6893
--- /dev/null
+++ b/surfsense_backend/tests/e2e/run_celery.py
@@ -0,0 +1,339 @@
+"""E2E Celery worker entrypoint.
+
+Same sys.modules hijack + LLM/embedding patches as run_backend.py,
+applied before importing the production celery_app. Celery workers
+run in a separate Python interpreter, so the patches must be applied
+here too — they would NOT carry over from the FastAPI process.
+
+Production is unaffected: celery_worker.py at the repo root is the
+production entrypoint and never imports this file.
+
+Usage:
+    cd surfsense_backend
+    uv run python tests/e2e/run_celery.py
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+_BACKEND_ROOT = os.path.abspath(os.path.join(_THIS_DIR, "..", ".."))
+if _BACKEND_ROOT not in sys.path:
+    sys.path.insert(0, _BACKEND_ROOT)
+
+
+logger = logging.getLogger("surfsense.e2e.celery")
+
+# Patches started during bootstrap are kept alive for the lifetime of the
+# process. We never call .stop() on them.
+_active_patches: list = []
+
+
+def _hijack_external_sdks() -> None:
+    """Replace composio + notion_client in sys.modules.
+
+    Production does ``from composio import Composio`` and
+    ``import notion_client`` at import time. With this hijack in place,
+    those imports resolve to our strict fakes.
+
+    MUST run before _import_celery_app().
+    """
+    import tests.e2e.fakes.composio_module as _fake_composio
+    import tests.e2e.fakes.notion_module as _fake_notion
+
+    sys.modules["composio"] = _fake_composio
+    sys.modules["notion_client"] = _fake_notion
+    sys.modules["notion_client.errors"] = _fake_notion.errors
+
+
+def _load_dotenv_and_set_env_defaults() -> None:
+    """Load .env and set every env var the production config reads on import.
+
+    MUST run before _import_celery_app(), since app.config consumes
+    these values at import time.
+    """
+    from dotenv import load_dotenv
+
+    load_dotenv()
+
+    os.environ.setdefault(
+        "DATABASE_URL",
+        "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
+    )
+    os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+    os.environ.setdefault("REDIS_APP_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
+    os.environ.setdefault("SECRET_KEY", "local-e2e-secret-not-for-production")
+    os.environ.setdefault("AUTH_TYPE", "LOCAL")
+    os.environ.setdefault("REGISTRATION_ENABLED", "TRUE")
+    os.environ.setdefault("ETL_SERVICE", "DOCLING")
+    os.environ.setdefault("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+    os.environ.setdefault("NEXT_FRONTEND_URL", "http://localhost:3000")
+
+    # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+    os.environ.setdefault("COMPOSIO_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("COMPOSIO_ENABLED", "TRUE")
+    os.environ.setdefault("OPENAI_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("LITELLM_API_KEY", "local-deny-real-call-sentinel")
+
+    os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
+    os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
+    os.environ.setdefault(
+        "CONFLUENCE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/confluence/connector/callback",
+    )
+    os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
+    os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
+    os.environ.setdefault(
+        "NOTION_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/notion/connector/callback",
+    )
+    os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
+    os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
+    os.environ.setdefault(
+        "ONEDRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
+    )
+    os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
+    os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
+    os.environ.setdefault(
+        "DROPBOX_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
+    )
+    # Native Google OAuth — fake Flow in tests.e2e.fakes.native_google raises
+    # "Fake Google Flow requires redirect_uri." when these are empty.
+    os.environ.setdefault(
+        "GOOGLE_DRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/drive/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_GMAIL_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_CALENDAR_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
+    )
+    os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
+    os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
+
+
+def _install_synthetic_global_llm_config() -> None:
+    """Materialise a fake ``app/config/global_llm_config.yaml`` for E2E.
+
+    The real file is gitignored (production operators ship their own with
+    real API keys), so a fresh CI checkout has no YAML at the path
+    ``app.config.load_global_llm_configs()`` reads. With an empty
+    ``GLOBAL_LLM_CONFIGS`` list, the worker's view of the config diverges
+    from the API container.
+
+    We copy the synthetic fixture from ``tests/e2e/fixtures/`` into the
+    production-expected location BEFORE _import_celery_app() so
+    ``app.config`` picks it up on import. Install-only-if-missing so a
+    developer's local config (with real API keys) is preserved.
+
+    MUST run before _import_celery_app().
+    """
+    import shutil
+
+    src = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
+    dst = os.path.join(_BACKEND_ROOT, "app", "config", "global_llm_config.yaml")
+
+    if not os.path.exists(src):
+        raise RuntimeError(
+            f"E2E synthetic global LLM config fixture missing at {src!r}. "
+            f"Restore tests/e2e/fixtures/global_llm_config.yaml from VCS."
+        )
+
+    if os.path.exists(dst):
+        logger.info(
+            "[e2e-global-llm-config] %s already exists; leaving it alone "
+            "(local dev config preserved)",
+            dst,
+        )
+        return
+
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+    shutil.copyfile(src, dst)
+    logger.info("[e2e-global-llm-config] installed %s -> %s", src, dst)
+
+
+def _import_celery_app():
+    """Import and return the production Celery app.
+
+    Every module under ``app.*`` (including all task modules) loads here,
+    creating their bindings. The LLM/embedding factories captured at this
+    point will be replaced by patches in _patch_llm_bindings() below.
+    """
+    from app.celery_app import celery_app
+
+    return celery_app
+
+
+def _patch_llm_bindings() -> None:
+    """Replace LLM factories at every known binding site in worker tasks."""
+    from unittest.mock import patch
+
+    from tests.e2e.fakes.chat_llm import (
+        fake_create_chat_litellm_from_agent_config,
+        fake_create_chat_litellm_from_config,
+    )
+    from tests.e2e.fakes.llm import fake_get_user_long_context_llm
+
+    targets = [
+        "app.services.llm_service.get_user_long_context_llm",
+        "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm",
+        "app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm",
+        "app.tasks.document_processors._save.get_user_long_context_llm",
+        "app.tasks.document_processors.markdown_processor.get_user_long_context_llm",
+    ]
+    for target in targets:
+        try:
+            p = patch(target, fake_get_user_long_context_llm)
+            p.start()
+            _active_patches.append(p)
+            logger.info("[fake-llm] patched %s in celery worker", target)
+        except (ModuleNotFoundError, AttributeError) as exc:
+            logger.warning(
+                "[fake-llm] could not patch %s in celery worker: %s.",
+                target,
+                exc,
+            )
+
+    chat_targets = [
+        (
+            "app.agents.new_chat.llm_config.create_chat_litellm_from_agent_config",
+            fake_create_chat_litellm_from_agent_config,
+        ),
+        (
+            "app.agents.new_chat.llm_config.create_chat_litellm_from_config",
+            fake_create_chat_litellm_from_config,
+        ),
+        (
+            "app.tasks.chat.stream_new_chat.create_chat_litellm_from_agent_config",
+            fake_create_chat_litellm_from_agent_config,
+        ),
+        (
+            "app.tasks.chat.stream_new_chat.create_chat_litellm_from_config",
+            fake_create_chat_litellm_from_config,
+        ),
+    ]
+    for target, replacement in chat_targets:
+        try:
+            p = patch(target, replacement)
+            p.start()
+            _active_patches.append(p)
+            logger.info("[fake-chat-llm] patched %s in celery worker", target)
+        except (ModuleNotFoundError, AttributeError) as exc:
+            logger.warning(
+                "[fake-chat-llm] could not patch %s in celery worker: %s.",
+                target,
+                exc,
+            )
+
+
+def _install_runtime_fakes() -> None:
+    """Run each fake's install() against the active patch stack."""
+    from tests.e2e.fakes import (
+        clickup_module as _fake_clickup_module,
+        confluence_indexer as _fake_confluence_indexer,
+        confluence_oauth as _fake_confluence_oauth,
+        docling_service as _fake_docling_service,
+        dropbox_api as _fake_dropbox_api,
+        embeddings as _fake_embeddings,
+        jira_module as _fake_jira_module,
+        linear_module as _fake_linear_module,
+        mcp_oauth_runtime as _fake_mcp_oauth_runtime,
+        mcp_runtime as _fake_mcp_runtime,
+        native_google as _fake_native_google,
+        notion_module as _fake_notion_module,
+        onedrive_graph as _fake_onedrive_graph,
+        slack_module as _fake_slack_module,
+    )
+
+    _fake_embeddings.install(_active_patches)
+    _fake_docling_service.install(_active_patches)
+    _fake_confluence_oauth.install(_active_patches)
+    _fake_confluence_indexer.install(_active_patches)
+    _fake_native_google.install(_active_patches)
+    _fake_onedrive_graph.install(_active_patches)
+    _fake_dropbox_api.install(_active_patches)
+    _fake_notion_module.install(_active_patches)
+    _fake_linear_module.install(_active_patches)
+    _fake_jira_module.install(_active_patches)
+    _fake_clickup_module.install(_active_patches)
+    _fake_mcp_runtime.install(_active_patches)
+    _fake_mcp_oauth_runtime.install(_active_patches)
+    _fake_slack_module.install(_active_patches)
+
+
+def _bootstrap():
+    """Run the full E2E bootstrap and return the production Celery app.
+
+    Ordering is load-bearing:
+      1) Hijack composio + notion_client in sys.modules.
+      2) Load .env + set env defaults (app.config reads env on import).
+      3) Configure logging.
+      4) Materialise the synthetic global_llm_config.yaml so the worker's
+         view of GLOBAL_LLM_CONFIGS matches the API container.
+      5) Import production celery_app (which transitively imports the
+         now-faked external SDKs and reads the env defaults + YAML).
+      6) Patch LLM / embedding bindings at every consumer site.
+      7) Install runtime fakes for connectors and chat backends.
+    """
+    _hijack_external_sdks()
+    _load_dotenv_and_set_env_defaults()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger.warning(
+        "*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings ***"
+    )
+
+    _install_synthetic_global_llm_config()
+    celery_app = _import_celery_app()
+    _patch_llm_bindings()
+    _install_runtime_fakes()
+    return celery_app
+
+
+celery_app = _bootstrap()
+
+
+def _main() -> None:
+    queue_name = os.getenv("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
+    queues = f"{queue_name},{queue_name}.connectors"
+
+    # macOS forks-after-MPS-init crash prefork workers; threads avoid it.
+    default_pool = "threads" if sys.platform == "darwin" else "prefork"
+    pool = os.getenv("CELERY_POOL", default_pool)
+    concurrency = os.getenv("CELERY_CONCURRENCY", "2")
+
+    celery_app.worker_main(
+        argv=[
+            "worker",
+            "--loglevel=info",
+            f"--queues={queues}",
+            f"--pool={pool}",
+            f"--concurrency={concurrency}",
+            "--without-gossip",
+            "--without-mingle",
+        ]
+    )
+
+
+if __name__ == "__main__":
+    _main()
diff --git a/surfsense_backend/tests/integration/chat/__init__.py b/surfsense_backend/tests/integration/chat/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/integration/chat/test_append_message_recovery.py b/surfsense_backend/tests/integration/chat/test_append_message_recovery.py
new file mode 100644
index 0000000000..a5182a9783
--- /dev/null
+++ b/surfsense_backend/tests/integration/chat/test_append_message_recovery.py
@@ -0,0 +1,573 @@
+"""Integration tests for the cross-writer integration between the
+streaming chat task and the legacy ``POST /threads/{id}/messages``
+(``append_message``) round-trip.
+
+Two scenarios anchor the contract introduced by the server-side
+persistence rework:
+
+(a) **Tool-heavy turn streamed to completion.**
+
+    Drives :class:`AssistantContentBuilder` with synthetic SSE events
+    that mirror what ``_stream_agent_events`` emits for a turn that
+    interleaves text, reasoning, a tool call (start/delta/available/
+    output), and a final text block. Then runs
+    :func:`finalize_assistant_turn` and asserts:
+
+    * ``new_chat_messages.content`` JSONB matches the
+      ``ContentPart[]`` shape the FE history loader expects, with full
+      ``args``/``argsText``/``result``/``langchainToolCallId`` for the
+      tool call.
+    * Exactly one ``token_usage`` row exists keyed on the assistant
+      ``message_id``.
+
+(b) **Stale FE ``appendMessage`` after server finalize.**
+
+    Verifies the recovery branch of the ``append_message`` route now
+    returns the SERVER's authoritative ``ContentPart[]`` (not the FE's
+    stale payload) when the partial unique index from migration 141
+    blocks the FE's INSERT, and that the ``ON CONFLICT DO NOTHING``
+    clause from migration 142 stops the route from writing a duplicate
+    ``token_usage`` row.
+"""
+
+from __future__ import annotations
+
+import json
+from contextlib import asynccontextmanager
+
+import pytest
+import pytest_asyncio
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import (
+    ChatVisibility,
+    NewChatMessage,
+    NewChatMessageRole,
+    NewChatThread,
+    SearchSpace,
+    TokenUsage,
+    User,
+)
+from app.routes import new_chat_routes
+from app.services.token_tracking_service import TurnTokenAccumulator
+from app.tasks.chat import persistence as persistence_module
+from app.tasks.chat.content_builder import AssistantContentBuilder
+from app.tasks.chat.persistence import (
+    finalize_assistant_turn,
+    persist_assistant_shell,
+)
+
+pytestmark = pytest.mark.integration
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest_asyncio.fixture
+async def db_thread(
+    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+) -> NewChatThread:
+    thread = NewChatThread(
+        title="Test Chat",
+        search_space_id=db_search_space.id,
+        created_by_id=db_user.id,
+        visibility=ChatVisibility.PRIVATE,
+    )
+    db_session.add(thread)
+    await db_session.flush()
+    return thread
+
+
+@pytest.fixture
+def patched_shielded_session(monkeypatch, db_session: AsyncSession):
+    """Route persistence helpers to the test's savepoint-bound session.
+
+    Mirrors the helper from ``test_persistence.py`` so the helpers'
+    internal ``ws.commit()`` / ``ws.rollback()`` resolve to SAVEPOINT
+    operations on the test transaction instead of touching real
+    autocommit boundaries.
+    """
+
+    @asynccontextmanager
+    async def _fake_shielded_session():
+        yield db_session
+
+    monkeypatch.setattr(
+        persistence_module,
+        "shielded_async_session",
+        _fake_shielded_session,
+    )
+    return db_session
+
+
+@pytest.fixture
+def bypass_permission_checks(monkeypatch):
+    """Replace RBAC + thread access checks with no-ops.
+
+    The append_message route under test calls ``check_permission`` and
+    ``check_thread_access``; those rely on a SearchSpaceMembership row
+    that the existing integration fixtures don't create. The contract
+    we want to verify here is the ``IntegrityError`` -> recovery branch,
+    not the RBAC plumbing — so stub them.
+    """
+
+    async def _allow(*_args, **_kwargs):
+        return True
+
+    monkeypatch.setattr(new_chat_routes, "check_permission", _allow)
+    monkeypatch.setattr(new_chat_routes, "check_thread_access", _allow)
+    return None
+
+
+class _FakeRequest:
+    """Minimal Request stand-in used by ``append_message``.
+
+    The route only calls ``await request.json()`` — keep the surface
+    area tight so this doesn't accidentally hide future signature
+    changes that we *would* want to break the test.
+    """
+
+    def __init__(self, body: dict):
+        self._body = body
+
+    async def json(self) -> dict:
+        return self._body
+
+
+def _build_tool_heavy_content() -> list[dict]:
+    """Drive ``AssistantContentBuilder`` through a tool-heavy turn.
+
+    Produces the same ``ContentPart[]`` shape the streaming layer would
+    persist if ``_stream_agent_events`` ran a turn with: opening
+    reasoning -> text -> tool call (input start/delta/available/output)
+    -> closing text. Centralised here so the (a) and (b) scenarios use
+    the same authoritative payload.
+    """
+    builder = AssistantContentBuilder()
+
+    builder.on_reasoning_start("r1")
+    builder.on_reasoning_delta("r1", "Let me look up ")
+    builder.on_reasoning_delta("r1", "the file listing.")
+    builder.on_reasoning_end("r1")
+
+    builder.on_text_start("t1")
+    builder.on_text_delta("t1", "Sure, listing files in ")
+    builder.on_text_delta("t1", "/.")
+    builder.on_text_end("t1")
+
+    builder.on_tool_input_start(
+        "tool_call_ui_1",
+        tool_name="ls",
+        langchain_tool_call_id="lc_call_xyz",
+    )
+    builder.on_tool_input_delta("tool_call_ui_1", '{"path"')
+    builder.on_tool_input_delta("tool_call_ui_1", ': "/"}')
+    builder.on_tool_input_available(
+        "tool_call_ui_1",
+        tool_name="ls",
+        args={"path": "/"},
+        langchain_tool_call_id="lc_call_xyz",
+    )
+    builder.on_tool_output_available(
+        "tool_call_ui_1",
+        output={"files": ["a.txt", "b.txt"]},
+        langchain_tool_call_id="lc_call_xyz",
+    )
+
+    builder.on_text_start("t2")
+    builder.on_text_delta("t2", "Found 2 files: a.txt and b.txt.")
+    builder.on_text_end("t2")
+
+    return builder.snapshot()
+
+
+def _accumulator_with_one_call() -> TurnTokenAccumulator:
+    acc = TurnTokenAccumulator()
+    acc.add(
+        model="gpt-4o-mini",
+        prompt_tokens=200,
+        completion_tokens=80,
+        total_tokens=280,
+        cost_micros=22222,
+    )
+    return acc
+
+
+# ---------------------------------------------------------------------------
+# (a) Tool-heavy stream finalize
+# ---------------------------------------------------------------------------
+
+
+class TestToolHeavyTurnFinalize:
+    async def test_full_tool_call_persisted_and_one_token_usage_row(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+    ):
+        """End-to-end seam: builder snapshot -> finalize -> DB row.
+
+        Matches the production flow's *content* invariant: whatever
+        ``AssistantContentBuilder.snapshot()`` produces is what the
+        streaming layer hands to ``finalize_assistant_turn``, so this
+        test catches any drift between the JSONB shape the builder
+        emits and the one the FE history loader expects.
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        search_space_id = db_search_space.id
+        turn_id = f"{thread_id}:tool_heavy"
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None
+
+        snapshot = _build_tool_heavy_content()
+        # Sanity-check the snapshot before we hand it to the DB so a
+        # builder regression surfaces here, not deep inside an opaque
+        # JSONB diff.
+        assert any(p.get("type") == "reasoning" for p in snapshot)
+        text_parts = [p for p in snapshot if p.get("type") == "text"]
+        assert len(text_parts) == 2
+        tool_parts = [p for p in snapshot if p.get("type") == "tool-call"]
+        assert len(tool_parts) == 1
+        tool_part = tool_parts[0]
+        assert tool_part["toolCallId"] == "tool_call_ui_1"
+        assert tool_part["toolName"] == "ls"
+        assert tool_part["args"] == {"path": "/"}
+        # ``argsText`` ends up as the pretty-printed final args (the
+        # ``tool-input-available`` event replaces the streamed deltas
+        # with ``json.dumps(args, indent=2)`` to match the FE's
+        # post-stream rendering).
+        assert tool_part["argsText"] == '{\n  "path": "/"\n}'
+        assert tool_part["result"] == {"files": ["a.txt", "b.txt"]}
+        # ``langchainToolCallId`` is the agent-side correlation id used
+        # by the regenerate path; a missing one breaks
+        # edit-from-tool-call later.
+        assert tool_part["langchainToolCallId"] == "lc_call_xyz"
+
+        await finalize_assistant_turn(
+            message_id=msg_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=snapshot,
+            accumulator=_accumulator_with_one_call(),
+        )
+
+        # ``content`` must round-trip byte-for-byte through the JSONB
+        # column. SQLAlchemy doesn't auto-refresh the row that survived
+        # the savepoint commit, so refresh explicitly.
+        row = await db_session.get(NewChatMessage, msg_id)
+        await db_session.refresh(row)
+
+        # The history loader reads ``content`` straight into the FE's
+        # parts array, so a strict equality comparison is the right
+        # invariant here.
+        assert row.content == snapshot
+        # Tool-call parts must JSON-serialise cleanly — nothing in
+        # ``args`` / ``argsText`` / ``result`` should accidentally be a
+        # non-JSON-safe value (datetime, set, custom class).
+        assert json.dumps(row.content)
+
+        usage_count = (
+            await db_session.execute(
+                select(func.count())
+                .select_from(TokenUsage)
+                .where(TokenUsage.message_id == msg_id)
+            )
+        ).scalar_one()
+        assert usage_count == 1
+
+        usage = (
+            await db_session.execute(
+                select(TokenUsage).where(TokenUsage.message_id == msg_id)
+            )
+        ).scalar_one()
+        assert usage.usage_type == "chat"
+        assert usage.prompt_tokens == 200
+        assert usage.completion_tokens == 80
+        assert usage.total_tokens == 280
+        assert usage.cost_micros == 22222
+        assert usage.thread_id == thread_id
+        assert usage.search_space_id == search_space_id
+
+
+# ---------------------------------------------------------------------------
+# (b) FE appendMessage after server finalize
+# ---------------------------------------------------------------------------
+
+
+class TestAppendMessageRecoveryAfterFinalize:
+    async def test_returns_server_content_and_does_not_duplicate_token_usage(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+        bypass_permission_checks,
+    ):
+        """FE's stale ``appendMessage`` after server finalize.
+
+        The frontend used to be the authoritative writer for assistant
+        ``content``. Now the server is. When the legacy FE round-trip
+        fires *after* the server has already finalized:
+
+        * the route's INSERT trips the (thread_id, turn_id, role)
+          partial unique index from migration 141,
+        * the recovery branch fetches the existing row and returns
+          *its* ``content`` — discarding the FE payload — so the
+          history loader reads the rich server payload (full tool
+          args, argsText, langchainToolCallId, etc.) on next page
+          reload,
+        * the route's optional ``token_usage`` insert is keyed on the
+          partial unique index from migration 142 so it silently
+          no-ops if the server already wrote one.
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        search_space_id = db_search_space.id
+        turn_id = f"{thread_id}:fe_late_append"
+
+        # Step 1: server stream completes. Server-built rich content is
+        # finalized.
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None
+
+        server_content = _build_tool_heavy_content()
+        await finalize_assistant_turn(
+            message_id=msg_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=server_content,
+            accumulator=_accumulator_with_one_call(),
+        )
+
+        # Step 2: simulate the legacy FE ``appendMessage`` round-trip
+        # arriving with stale, lossy content (missing tool args, etc.)
+        # plus a ``token_usage`` body.
+        fe_stale_content = [
+            {"type": "text", "text": "Found 2 files: a.txt and b.txt."},
+        ]
+        fe_request_body = {
+            "role": "assistant",
+            "content": fe_stale_content,
+            "turn_id": turn_id,
+            "token_usage": {
+                "prompt_tokens": 999,
+                "completion_tokens": 999,
+                "total_tokens": 1998,
+                "cost_micros": 88888,
+                "usage": {"any": "thing"},
+                "call_details": {"calls": []},
+            },
+        }
+        request = _FakeRequest(fe_request_body)
+
+        # ``db_user`` is bound to ``db_session``. The route's
+        # IntegrityError branch calls ``session.rollback()``, which
+        # expires every ORM row attached to the session including this
+        # user — historically causing ``user.id`` to lazy-load
+        # out-of-greenlet and crash the request with ``MissingGreenlet``
+        # (observed in production logs at /api/v1/threads/531/messages
+        # 2026-05-04). The route now captures ``user.id`` to a primitive
+        # UUID at the top of the handler, so the rollback can't reach
+        # it. Pass the *attached* user here on purpose — that's the
+        # production scenario, and this test is the regression guard
+        # against that bug returning.
+        response = await new_chat_routes.append_message(
+            thread_id=thread_id,
+            request=request,
+            session=db_session,
+            user=db_user,
+        )
+
+        # Response must echo the SERVER's rich payload, not the FE's
+        # stale snapshot. This is the user-visible part of the
+        # contract: history reload + ThreadHistoryAdapter.append both
+        # read from the same authoritative source.
+        assert response.id == msg_id
+        assert response.role == NewChatMessageRole.ASSISTANT
+        assert response.turn_id == turn_id
+        assert response.content == server_content
+        assert response.content != fe_stale_content
+
+        # The on-disk row must agree with the response.
+        row = await db_session.get(NewChatMessage, msg_id)
+        await db_session.refresh(row)
+        assert row.content == server_content
+
+        # ``token_usage``: exactly one row, with the *server's* values
+        # (the FE's much larger token counts must not have overwritten
+        # them).
+        usage_count = (
+            await db_session.execute(
+                select(func.count())
+                .select_from(TokenUsage)
+                .where(TokenUsage.message_id == msg_id)
+            )
+        ).scalar_one()
+        assert usage_count == 1
+
+        usage = (
+            await db_session.execute(
+                select(TokenUsage).where(TokenUsage.message_id == msg_id)
+            )
+        ).scalar_one()
+        assert usage.cost_micros == 22222  # Server's value, not 88888
+        assert usage.total_tokens == 280  # Server's value, not 1998
+
+    async def test_legacy_fe_first_appendmessage_then_server_no_dupe(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+        bypass_permission_checks,
+    ):
+        """Inverse race: legacy FE writes first, server finalize second.
+
+        Some clients still post ``appendMessage`` before the streaming
+        ``finally`` runs. The contract is symmetric: whichever writer
+        loses the (thread_id, turn_id, role) race silently lets the
+        winner keep its content. In particular the *server's*
+        finalize must NOT raise — it must look up the existing row and
+        UPDATE its content with the server-built payload (which is
+        always richer/more authoritative than whatever the FE
+        snapshot held).
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        search_space_id = db_search_space.id
+        turn_id = f"{thread_id}:fe_first"
+
+        # Step 1: legacy FE appendMessage lands first. No prior shell
+        # row exists; the route does the INSERT itself.
+        fe_request_body = {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "early FE write"}],
+            "turn_id": turn_id,
+        }
+        fe_response = await new_chat_routes.append_message(
+            thread_id=thread_id,
+            request=_FakeRequest(fe_request_body),
+            session=db_session,
+            user=db_user,
+        )
+        assert fe_response.role == NewChatMessageRole.ASSISTANT
+
+        # Step 2: server stream's persist_assistant_shell now races
+        # behind. It must adopt the existing row id, not raise.
+        adopted_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert adopted_id == fe_response.id
+
+        # Step 3: server finalize then overwrites the FE's stub with
+        # the rich content (which is the correct, more authoritative
+        # payload).
+        server_content = _build_tool_heavy_content()
+        await finalize_assistant_turn(
+            message_id=adopted_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=server_content,
+            accumulator=_accumulator_with_one_call(),
+        )
+
+        # Final state: one row, server content, one token_usage row.
+        msg_count = (
+            await db_session.execute(
+                select(func.count())
+                .select_from(NewChatMessage)
+                .where(
+                    NewChatMessage.thread_id == thread_id,
+                    NewChatMessage.turn_id == turn_id,
+                    NewChatMessage.role == NewChatMessageRole.ASSISTANT,
+                )
+            )
+        ).scalar_one()
+        assert msg_count == 1
+
+        row = await db_session.get(NewChatMessage, adopted_id)
+        await db_session.refresh(row)
+        assert row.content == server_content
+
+        usage_count = (
+            await db_session.execute(
+                select(func.count())
+                .select_from(TokenUsage)
+                .where(TokenUsage.message_id == adopted_id)
+            )
+        ).scalar_one()
+        assert usage_count == 1
+
+    async def test_appendmessage_without_turn_id_legacy_400(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+        bypass_permission_checks,
+    ):
+        """Defensive: a bare appendMessage with no turn_id and no
+        existing row is just a normal INSERT — must succeed. But if a
+        row with the same role already exists in this thread *without*
+        turn_id collisions, the route should fall through to the
+        legacy 400 path on a foreign-key / unrelated IntegrityError
+        (we don't ship that bug today, but pin the behaviour so a
+        future schema change can't silently regress it).
+        """
+        thread_id = db_thread.id
+
+        # Bare appendMessage with no turn_id — should just succeed
+        # without invoking the recovery branch.
+        ok_response = await new_chat_routes.append_message(
+            thread_id=thread_id,
+            request=_FakeRequest(
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "hi"}],
+                }
+            ),
+            session=db_session,
+            user=db_user,
+        )
+        assert ok_response.role == NewChatMessageRole.USER
+        assert ok_response.turn_id is None
+
+        # Sanity: the route did NOT silently swallow the missing
+        # turn_id by routing through the unique-index recovery branch
+        # — it took the happy path.
+        msg_count = (
+            await db_session.execute(
+                select(func.count())
+                .select_from(NewChatMessage)
+                .where(
+                    NewChatMessage.thread_id == thread_id,
+                    NewChatMessage.role == NewChatMessageRole.USER,
+                )
+            )
+        ).scalar_one()
+        assert msg_count == 1
diff --git a/surfsense_backend/tests/integration/chat/test_message_id_sse.py b/surfsense_backend/tests/integration/chat/test_message_id_sse.py
new file mode 100644
index 0000000000..8fc935eaaa
--- /dev/null
+++ b/surfsense_backend/tests/integration/chat/test_message_id_sse.py
@@ -0,0 +1,332 @@
+"""Integration tests for the SSE-based message ID handshake.
+
+The streaming generators (``stream_new_chat`` / ``stream_resume_chat``)
+emit two new events after their respective persistence helpers resolve
+the canonical ``new_chat_messages.id``:
+
+* ``data-user-message-id``  — emitted only by ``stream_new_chat``,
+  AFTER ``persist_user_turn`` and BEFORE any LLM streaming.
+* ``data-assistant-message-id`` — emitted by both
+  ``stream_new_chat`` and ``stream_resume_chat``, AFTER
+  ``persist_assistant_shell`` and BEFORE any LLM streaming.
+
+The frontend renames its optimistic ``msg-user-XXX`` /
+``msg-assistant-XXX`` placeholder ids to ``msg-{db_id}`` upon receiving
+these events. This test suite anchors three contracts:
+
+1. ``format_data`` produces SSE bytes in the precise shape
+   ``data: {"type":"data-<suffix>","data":{...}}\\n\\n`` that the FE's
+   ``readSSEStream`` consumer parses (matches ``surfsense_web/lib/chat/streaming-state.ts``).
+2. The ``message_id`` carried in the SSE payload exactly equals the
+   primary key the persistence helper inserted into
+   ``new_chat_messages`` — so the FE rename produces ``msg-{real_pk}``,
+   which in turn unlocks DB-id-gated UI (comments, edit-from-message).
+3. The same ``message_id`` is used for the ``token_usage.message_id``
+   foreign key, so ``finalize_assistant_turn``'s row binds correctly.
+
+Direct end-to-end testing of ``stream_new_chat`` requires a fully
+mocked agent + LLM stack (out-of-scope here); those flows are covered
+by the harness-driven integration tests under
+``tests/integration/agents/new_chat/`` plus the assertion in
+``test_persistence.py`` that the helpers themselves return ``int``
+ids. The contracts above close the remaining gap between the persist
+helpers and the bytes that ship to the FE.
+"""
+
+from __future__ import annotations
+
+import json
+from contextlib import asynccontextmanager
+
+import pytest
+import pytest_asyncio
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import (
+    ChatVisibility,
+    NewChatMessage,
+    NewChatMessageRole,
+    NewChatThread,
+    SearchSpace,
+    User,
+)
+from app.services.new_streaming_service import VercelStreamingService
+from app.tasks.chat import persistence as persistence_module
+from app.tasks.chat.persistence import (
+    persist_assistant_shell,
+    persist_user_turn,
+)
+
+pytestmark = pytest.mark.integration
+
+
+# ---------------------------------------------------------------------------
+# Fixtures (mirror test_persistence.py)
+# ---------------------------------------------------------------------------
+
+
+@pytest_asyncio.fixture
+async def db_thread(
+    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+) -> NewChatThread:
+    thread = NewChatThread(
+        title="Test Chat",
+        search_space_id=db_search_space.id,
+        created_by_id=db_user.id,
+        visibility=ChatVisibility.PRIVATE,
+    )
+    db_session.add(thread)
+    await db_session.flush()
+    return thread
+
+
+@pytest.fixture
+def patched_shielded_session(monkeypatch, db_session: AsyncSession):
+    """Route persistence helpers to the test's savepoint-bound session."""
+
+    @asynccontextmanager
+    async def _fake_shielded_session():
+        yield db_session
+
+    monkeypatch.setattr(
+        persistence_module,
+        "shielded_async_session",
+        _fake_shielded_session,
+    )
+    return db_session
+
+
+# ---------------------------------------------------------------------------
+# (1) SSE byte-shape contract
+# ---------------------------------------------------------------------------
+
+
+def _parse_sse_data_line(blob: str) -> dict:
+    """Unwrap a single ``data: <json>\\n\\n`` SSE frame.
+
+    Raises if there's more than one frame or the prefix is wrong — keeps
+    the parser strict so a regression in ``format_data`` produces a
+    test failure here, not in a downstream consumer.
+    """
+    assert blob.endswith("\n\n"), f"missing terminator: {blob!r}"
+    line = blob.removesuffix("\n\n")
+    assert line.startswith("data: "), f"missing data prefix: {line!r}"
+    return json.loads(line.removeprefix("data: "))
+
+
+class TestSSEByteShape:
+    def test_data_user_message_id_byte_shape(self):
+        """``format_data("user-message-id", {...})`` must produce the
+        exact wire format the FE's
+        ``readSSEStream`` -> ``data-user-message-id`` case parses.
+        """
+        svc = VercelStreamingService()
+        blob = svc.format_data(
+            "user-message-id",
+            {"message_id": 1843, "turn_id": "533:1762900000000"},
+        )
+        envelope = _parse_sse_data_line(blob)
+        assert envelope == {
+            "type": "data-user-message-id",
+            "data": {"message_id": 1843, "turn_id": "533:1762900000000"},
+        }
+
+    def test_data_assistant_message_id_byte_shape(self):
+        svc = VercelStreamingService()
+        blob = svc.format_data(
+            "assistant-message-id",
+            {"message_id": 1844, "turn_id": "533:1762900000000"},
+        )
+        envelope = _parse_sse_data_line(blob)
+        assert envelope == {
+            "type": "data-assistant-message-id",
+            "data": {"message_id": 1844, "turn_id": "533:1762900000000"},
+        }
+
+
+# ---------------------------------------------------------------------------
+# (2) Helper-id <-> DB-pk coherence
+# ---------------------------------------------------------------------------
+
+
+class TestHandshakeIdMatchesDB:
+    """The SSE handshake's correctness hinges on the integer in
+    ``data-{user,assistant}-message-id`` being the EXACT primary key
+    the persistence helper inserted. If they ever diverge, the FE
+    rename produces ``msg-{wrong_id}``, comments break (regex match
+    fails), and downstream features (edit, regenerate) target the
+    wrong row. Anchor it here.
+    """
+
+    async def test_user_message_id_matches_new_chat_messages_pk(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:9000"
+
+        # The streaming generator passes this same value into
+        # ``streaming_service.format_data("user-message-id", {...})``.
+        msg_id_from_helper = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="hello",
+        )
+        assert isinstance(msg_id_from_helper, int)
+
+        # Look up the row the helper inserted via
+        # ``(thread_id, turn_id, role)``  — the same composite the FE
+        # uses to identify a turn — and confirm the PK matches.
+        row = (
+            await db_session.execute(
+                select(NewChatMessage).where(
+                    NewChatMessage.thread_id == thread_id,
+                    NewChatMessage.turn_id == turn_id,
+                    NewChatMessage.role == NewChatMessageRole.USER,
+                )
+            )
+        ).scalar_one()
+        assert row.id == msg_id_from_helper
+
+        # The byte-stream the FE actually receives — confirms the
+        # round-trip from the helper return value to the SSE payload.
+        svc = VercelStreamingService()
+        envelope = _parse_sse_data_line(
+            svc.format_data(
+                "user-message-id",
+                {"message_id": msg_id_from_helper, "turn_id": turn_id},
+            )
+        )
+        assert envelope["data"]["message_id"] == row.id
+
+    async def test_assistant_message_id_matches_new_chat_messages_pk(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:9100"
+
+        msg_id_from_helper = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert isinstance(msg_id_from_helper, int)
+
+        row = (
+            await db_session.execute(
+                select(NewChatMessage).where(
+                    NewChatMessage.thread_id == thread_id,
+                    NewChatMessage.turn_id == turn_id,
+                    NewChatMessage.role == NewChatMessageRole.ASSISTANT,
+                )
+            )
+        ).scalar_one()
+        assert row.id == msg_id_from_helper
+
+        svc = VercelStreamingService()
+        envelope = _parse_sse_data_line(
+            svc.format_data(
+                "assistant-message-id",
+                {"message_id": msg_id_from_helper, "turn_id": turn_id},
+            )
+        )
+        assert envelope["data"]["message_id"] == row.id
+
+    async def test_handshake_ids_for_full_turn_are_distinct_and_paired(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        """Sanity: a full new-chat turn's two SSE events carry two
+        DIFFERENT ids (user row PK ≠ assistant row PK), both anchored
+        to the SAME ``turn_id`` in the DB. This pairing is what
+        ``finalize_assistant_turn`` and the regenerate / edit flows
+        rely on.
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:9200"
+
+        user_msg_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="hi",
+        )
+        assistant_msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert user_msg_id is not None and assistant_msg_id is not None
+        assert user_msg_id != assistant_msg_id
+
+        rows = (
+            (
+                await db_session.execute(
+                    select(NewChatMessage)
+                    .where(
+                        NewChatMessage.thread_id == thread_id,
+                        NewChatMessage.turn_id == turn_id,
+                    )
+                    .order_by(NewChatMessage.id)
+                )
+            )
+            .scalars()
+            .all()
+        )
+        assert len(rows) == 2
+        ids_by_role = {r.role: r.id for r in rows}
+        assert ids_by_role[NewChatMessageRole.USER] == user_msg_id
+        assert ids_by_role[NewChatMessageRole.ASSISTANT] == assistant_msg_id
+
+
+# ---------------------------------------------------------------------------
+# (3) Parse helpers used by the FE — sanity-check our payload shape
+# ---------------------------------------------------------------------------
+
+
+class TestPayloadShapeMatchesFEReader:
+    """The FE's ``readStreamedMessageId`` (in
+    ``surfsense_web/lib/chat/stream-side-effects.ts``) requires:
+
+    * ``message_id`` is a ``number`` (rejects null / string / NaN).
+    * ``turn_id`` is an optional non-empty string (else it's coerced
+      to ``null``).
+
+    These tests exercise the BE side of that contract by inspecting
+    ``format_data`` output shapes that the FE consumes verbatim.
+    """
+
+    def test_message_id_is_serialised_as_a_json_number(self):
+        svc = VercelStreamingService()
+        envelope = _parse_sse_data_line(
+            svc.format_data("user-message-id", {"message_id": 42, "turn_id": "t"})
+        )
+        assert isinstance(envelope["data"]["message_id"], int)
+        assert envelope["data"]["message_id"] == 42
+
+    def test_turn_id_round_trips_as_string(self):
+        svc = VercelStreamingService()
+        # The actual format used in production: f"{chat_id}:{int(time.time()*1000)}"
+        production_turn_id = "533:1762900000000"
+        envelope = _parse_sse_data_line(
+            svc.format_data(
+                "assistant-message-id",
+                {"message_id": 1, "turn_id": production_turn_id},
+            )
+        )
+        assert envelope["data"]["turn_id"] == production_turn_id
diff --git a/surfsense_backend/tests/integration/chat/test_persistence.py b/surfsense_backend/tests/integration/chat/test_persistence.py
new file mode 100644
index 0000000000..d6f816cc08
--- /dev/null
+++ b/surfsense_backend/tests/integration/chat/test_persistence.py
@@ -0,0 +1,808 @@
+"""Integration tests for ``app.tasks.chat.persistence``.
+
+Verifies the DB-side guarantees the streaming chat task relies on:
+
+* ``persist_assistant_shell`` is idempotent against the
+  ``(thread_id, turn_id, ASSISTANT)`` partial unique index from
+  migration 141. Two calls with the same ``turn_id`` return the SAME
+  ``message_id`` and never create a duplicate ``new_chat_messages`` row.
+* ``finalize_assistant_turn`` writes a status-marker payload when given
+  empty content, never raises, and is safe to call twice on the same
+  ``message_id`` — the partial unique index from migration 142
+  (``uq_token_usage_message_id``) prevents the second insert from
+  producing a duplicate ``token_usage`` row.
+* The same ``ON CONFLICT DO NOTHING`` invariant covers the cross-writer
+  race where ``finalize_assistant_turn`` and the ``append_message``
+  recovery branch both target the same ``message_id``.
+
+All tests run inside the conftest's outer-transaction-with-savepoint
+fixture so commits inside the helpers (which open their own
+``shielded_async_session``) are released as savepoints and rolled back
+at test end. We monkey-patch ``shielded_async_session`` to yield the
+same pooled test session so the integration transaction stays
+in-scope.
+"""
+
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+
+import pytest
+import pytest_asyncio
+from sqlalchemy import func, select
+from sqlalchemy.dialects.postgresql import insert as pg_insert
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import (
+    ChatVisibility,
+    NewChatMessage,
+    NewChatMessageRole,
+    NewChatThread,
+    SearchSpace,
+    TokenUsage,
+    User,
+)
+from app.services.token_tracking_service import TurnTokenAccumulator
+from app.tasks.chat import persistence as persistence_module
+from app.tasks.chat.persistence import (
+    finalize_assistant_turn,
+    persist_assistant_shell,
+    persist_user_turn,
+)
+
+pytestmark = pytest.mark.integration
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest_asyncio.fixture
+async def db_thread(
+    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+) -> NewChatThread:
+    thread = NewChatThread(
+        title="Test Chat",
+        search_space_id=db_search_space.id,
+        created_by_id=db_user.id,
+        visibility=ChatVisibility.PRIVATE,
+    )
+    db_session.add(thread)
+    await db_session.flush()
+    return thread
+
+
+@pytest.fixture
+def patched_shielded_session(monkeypatch, db_session: AsyncSession):
+    """Route persistence helpers to the test's savepoint-bound session.
+
+    The persistence helpers use ``async with shielded_async_session() as
+    ws`` and call ``ws.commit()`` internally. Inside the conftest's
+    ``join_transaction_mode="create_savepoint"`` setup, those commits
+    release a SAVEPOINT instead of committing the outer transaction —
+    so the test session can see helper-staged rows immediately and the
+    outer rollback at end of test wipes them.
+    """
+
+    @asynccontextmanager
+    async def _fake_shielded_session():
+        yield db_session
+        # Do NOT close — the outer fixture owns the session lifecycle.
+
+    monkeypatch.setattr(
+        persistence_module,
+        "shielded_async_session",
+        _fake_shielded_session,
+    )
+    return db_session
+
+
+def _accumulator_with_one_call() -> TurnTokenAccumulator:
+    acc = TurnTokenAccumulator()
+    acc.add(
+        model="gpt-4o-mini",
+        prompt_tokens=100,
+        completion_tokens=50,
+        total_tokens=150,
+        cost_micros=12345,
+    )
+    return acc
+
+
+async def _count_assistant_rows(
+    session: AsyncSession, thread_id: int, turn_id: str
+) -> int:
+    result = await session.execute(
+        select(func.count())
+        .select_from(NewChatMessage)
+        .where(
+            NewChatMessage.thread_id == thread_id,
+            NewChatMessage.turn_id == turn_id,
+            NewChatMessage.role == NewChatMessageRole.ASSISTANT,
+        )
+    )
+    return int(result.scalar_one())
+
+
+async def _count_token_usage_rows(session: AsyncSession, message_id: int) -> int:
+    result = await session.execute(
+        select(func.count())
+        .select_from(TokenUsage)
+        .where(TokenUsage.message_id == message_id)
+    )
+    return int(result.scalar_one())
+
+
+# ---------------------------------------------------------------------------
+# persist_assistant_shell
+# ---------------------------------------------------------------------------
+
+
+class TestPersistAssistantShell:
+    async def test_first_call_inserts_empty_shell_and_returns_id(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        # Capture primitive ids before any persistence helper runs:
+        # the helpers commit/rollback the shared test session, which
+        # can detach ORM rows mid-test.
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:1000"
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None and isinstance(msg_id, int)
+
+        row = await db_session.get(NewChatMessage, msg_id)
+        assert row is not None
+        assert row.thread_id == thread_id
+        assert row.role == NewChatMessageRole.ASSISTANT
+        assert row.turn_id == turn_id
+        # Empty shell payload — finalize_assistant_turn overwrites later.
+        assert row.content == [{"type": "text", "text": ""}]
+
+    async def test_second_call_with_same_turn_id_returns_same_id(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        # Capture primitive ids before any persistence helper runs:
+        # the helpers commit/rollback the shared test session, which
+        # can detach ORM rows mid-test.
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:2000"
+
+        first_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        second_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+
+        assert first_id is not None
+        assert first_id == second_id
+        # Exactly one row in the DB for this turn.
+        assert await _count_assistant_rows(db_session, thread_id, turn_id) == 1
+
+    async def test_missing_turn_id_returns_none(
+        self,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id="",
+        )
+        assert msg_id is None
+
+    async def test_after_persist_user_turn_resolves_assistant_id(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:3000"
+
+        # The streaming layer always calls persist_user_turn first, so
+        # smoke-test the canonical sequence.
+        user_msg_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="hello",
+        )
+        assert isinstance(user_msg_id, int)
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None
+        # User row + assistant shell row = 2 rows for this turn.
+        result = await db_session.execute(
+            select(func.count())
+            .select_from(NewChatMessage)
+            .where(
+                NewChatMessage.thread_id == thread_id,
+                NewChatMessage.turn_id == turn_id,
+            )
+        )
+        assert result.scalar_one() == 2
+
+    async def test_double_call_with_same_turn_id_uses_on_conflict(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        """Verifies the ON CONFLICT DO NOTHING path on the assistant
+        shell does not raise ``IntegrityError`` even when the second
+        writer races the first within a tight loop. ``test_second_call_with_same_turn_id_returns_same_id``
+        already covers the same-id semantics; this test additionally
+        asserts neither call raises so the debugger's
+        ``raise-on-IntegrityError`` setting won't pause the streaming
+        path under contention.
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:3500"
+
+        # Both calls go through ``pg_insert(...).on_conflict_do_nothing``;
+        # the second one returns RETURNING=∅ and falls into the SELECT
+        # branch. Neither path raises.
+        first_id = await persist_assistant_shell(
+            chat_id=thread_id, user_id=user_id_str, turn_id=turn_id
+        )
+        second_id = await persist_assistant_shell(
+            chat_id=thread_id, user_id=user_id_str, turn_id=turn_id
+        )
+        assert first_id is not None
+        assert first_id == second_id
+
+
+# ---------------------------------------------------------------------------
+# persist_user_turn
+# ---------------------------------------------------------------------------
+
+
+class TestPersistUserTurn:
+    async def test_returns_message_id_on_first_insert(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:8000"
+
+        msg_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="hello",
+        )
+        assert isinstance(msg_id, int) and msg_id > 0
+
+        row = await db_session.get(NewChatMessage, msg_id)
+        assert row is not None
+        assert row.thread_id == thread_id
+        assert row.role == NewChatMessageRole.USER
+        assert row.turn_id == turn_id
+        assert row.content == [{"type": "text", "text": "hello"}]
+
+    async def test_returns_existing_id_on_conflict(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:8100"
+
+        first_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="hello",
+        )
+        # Second call simulates a legacy FE ``appendMessage`` racing the
+        # SSE stream: ON CONFLICT DO NOTHING short-circuits at the DB
+        # level, the helper recovers the existing id via SELECT, and
+        # crucially does NOT raise ``IntegrityError`` (the debugger
+        # would otherwise pause on it).
+        second_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="ignored on conflict",
+        )
+        assert first_id is not None
+        assert first_id == second_id
+
+        # Exactly one user row for this turn.
+        count = await db_session.execute(
+            select(func.count())
+            .select_from(NewChatMessage)
+            .where(
+                NewChatMessage.thread_id == thread_id,
+                NewChatMessage.turn_id == turn_id,
+                NewChatMessage.role == NewChatMessageRole.USER,
+            )
+        )
+        assert count.scalar_one() == 1
+
+    async def test_embeds_mentioned_documents_part(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        """The full ``{id, title, document_type, kind}`` chip metadata
+        forwarded by the FE must round-trip into a single
+        ``mentioned-documents`` ContentPart on the persisted user
+        message — the history loader renders the chips on reload from
+        this part directly. Folder chips ride alongside doc chips so
+        the FE can render mixed mention bars without a second fetch.
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:8200"
+
+        mentioned = [
+            {"id": 11, "title": "Alpha", "document_type": "GENERAL", "kind": "doc"},
+            {"id": 22, "title": "Beta", "document_type": "GENERAL", "kind": "doc"},
+            {
+                "id": 33,
+                "title": "Reports",
+                "document_type": "FOLDER",
+                "kind": "folder",
+            },
+        ]
+        msg_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="hello",
+            mentioned_documents=mentioned,
+        )
+        assert isinstance(msg_id, int)
+
+        row = await db_session.get(NewChatMessage, msg_id)
+        assert row is not None
+        # Content is a 2-part list: text + mentioned-documents.
+        assert isinstance(row.content, list)
+        assert row.content[0] == {"type": "text", "text": "hello"}
+        assert row.content[1] == {
+            "type": "mentioned-documents",
+            "documents": [
+                {"id": 11, "title": "Alpha", "document_type": "GENERAL", "kind": "doc"},
+                {"id": 22, "title": "Beta", "document_type": "GENERAL", "kind": "doc"},
+                {
+                    "id": 33,
+                    "title": "Reports",
+                    "document_type": "FOLDER",
+                    "kind": "folder",
+                },
+            ],
+        }
+
+    async def test_legacy_chip_without_kind_defaults_to_doc(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        """Pre-folder clients send chips without ``kind``. The persistence
+        layer defaults them to ``"doc"`` so the round-trip stays
+        consistent on reload — the FE schema's optional default
+        produces the same value, but persisting it explicitly keeps
+        the DB row self-describing.
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id = f"{thread_id}:8201"
+
+        mentioned = [
+            {"id": 77, "title": "Legacy", "document_type": "GENERAL"},
+        ]
+        msg_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            user_query="hi",
+            mentioned_documents=mentioned,
+        )
+        assert isinstance(msg_id, int)
+
+        row = await db_session.get(NewChatMessage, msg_id)
+        assert row is not None
+        assert isinstance(row.content, list)
+        mentioned_part = next(
+            p for p in row.content if p.get("type") == "mentioned-documents"
+        )
+        assert mentioned_part == {
+            "type": "mentioned-documents",
+            "documents": [
+                {
+                    "id": 77,
+                    "title": "Legacy",
+                    "document_type": "GENERAL",
+                    "kind": "doc",
+                },
+            ],
+        }
+
+    async def test_skips_mentioned_documents_when_empty_or_invalid(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        """Empty list and entries missing required fields are dropped;
+        a ``mentioned-documents`` part is only emitted when at least
+        one normalised entry survived.
+        """
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        turn_id_empty = f"{thread_id}:8300"
+        turn_id_invalid = f"{thread_id}:8301"
+
+        msg_id_empty = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id_empty,
+            user_query="hi",
+            mentioned_documents=[],
+        )
+        assert isinstance(msg_id_empty, int)
+        row_empty = await db_session.get(NewChatMessage, msg_id_empty)
+        assert row_empty is not None
+        assert row_empty.content == [{"type": "text", "text": "hi"}]
+
+        # Each entry missing one required field — all skipped.
+        msg_id_invalid = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id_invalid,
+            user_query="hi",
+            mentioned_documents=[
+                {"title": "no id", "document_type": "GENERAL"},  # missing id
+                {"id": 99, "document_type": "GENERAL"},  # missing title
+                {"id": 100, "title": "no type"},  # missing document_type
+            ],
+        )
+        assert isinstance(msg_id_invalid, int)
+        row_invalid = await db_session.get(NewChatMessage, msg_id_invalid)
+        assert row_invalid is not None
+        assert row_invalid.content == [{"type": "text", "text": "hi"}]
+
+    async def test_missing_turn_id_returns_none(
+        self,
+        db_user,
+        db_thread,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+
+        msg_id = await persist_user_turn(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id="",
+            user_query="hello",
+        )
+        assert msg_id is None
+
+
+# ---------------------------------------------------------------------------
+# finalize_assistant_turn
+# ---------------------------------------------------------------------------
+
+
+class TestFinalizeAssistantTurn:
+    async def test_writes_content_and_token_usage(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_uuid = db_user.id
+        user_id_str = str(user_id_uuid)
+        search_space_id = db_search_space.id
+        turn_id = f"{thread_id}:4000"
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None
+
+        rich_content = [
+            {"type": "text", "text": "Hello world"},
+            {
+                "type": "tool-call",
+                "toolCallId": "call_x",
+                "toolName": "ls",
+                "args": {"path": "/"},
+                "argsText": '{\n  "path": "/"\n}',
+                "result": {"files": []},
+                "langchainToolCallId": "lc_x",
+            },
+        ]
+        await finalize_assistant_turn(
+            message_id=msg_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=rich_content,
+            accumulator=_accumulator_with_one_call(),
+        )
+
+        row = await db_session.get(NewChatMessage, msg_id)
+        await db_session.refresh(row)
+        assert row.content == rich_content
+
+        # Exactly one token_usage row keyed on this message_id.
+        usage_rows = (
+            (
+                await db_session.execute(
+                    select(TokenUsage).where(TokenUsage.message_id == msg_id)
+                )
+            )
+            .scalars()
+            .all()
+        )
+        assert len(usage_rows) == 1
+        usage = usage_rows[0]
+        assert usage.usage_type == "chat"
+        assert usage.prompt_tokens == 100
+        assert usage.completion_tokens == 50
+        assert usage.total_tokens == 150
+        assert usage.cost_micros == 12345
+        assert usage.thread_id == thread_id
+        assert usage.search_space_id == search_space_id
+
+    async def test_empty_content_writes_status_marker(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        search_space_id = db_search_space.id
+        turn_id = f"{thread_id}:5000"
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None
+
+        # Pure tool-call turn that aborted before any output, or
+        # interrupt before any event arrived — empty list.
+        await finalize_assistant_turn(
+            message_id=msg_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=[],
+            accumulator=None,
+        )
+
+        row = await db_session.get(NewChatMessage, msg_id)
+        await db_session.refresh(row)
+        assert row.content == [{"type": "status", "text": "(no text response)"}]
+
+    async def test_double_call_safe_via_on_conflict(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        search_space_id = db_search_space.id
+        turn_id = f"{thread_id}:6000"
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None
+
+        first_acc = _accumulator_with_one_call()
+        await finalize_assistant_turn(
+            message_id=msg_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=[{"type": "text", "text": "first finalize"}],
+            accumulator=first_acc,
+        )
+
+        # Simulate a follow-up finalize (e.g., resume retry within the
+        # shielded finally block firing twice). Different content, but
+        # ON CONFLICT DO NOTHING on token_usage means the cost from the
+        # first finalize stays authoritative.
+        second_acc = TurnTokenAccumulator()
+        second_acc.add(
+            model="gpt-4o-mini",
+            prompt_tokens=999,
+            completion_tokens=999,
+            total_tokens=1998,
+            cost_micros=99999,
+        )
+        await finalize_assistant_turn(
+            message_id=msg_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=[{"type": "text", "text": "second finalize"}],
+            accumulator=second_acc,
+        )
+
+        # Content was overwritten by the second UPDATE.
+        row = await db_session.get(NewChatMessage, msg_id)
+        await db_session.refresh(row)
+        assert row.content == [{"type": "text", "text": "second finalize"}]
+
+        # But token_usage stayed at exactly one row, preserving the
+        # first finalize's authoritative cost.
+        assert await _count_token_usage_rows(db_session, msg_id) == 1
+        usage = (
+            await db_session.execute(
+                select(TokenUsage).where(TokenUsage.message_id == msg_id)
+            )
+        ).scalar_one()
+        assert usage.cost_micros == 12345  # First finalize's value
+
+    async def test_append_message_style_insert_after_finalize_no_dupe(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+    ):
+        """Cross-writer race: ``append_message`` arrives after ``finalize_assistant_turn``.
+
+        Both target the same ``message_id``; the partial unique index
+        ``uq_token_usage_message_id`` (migration 142) makes the second
+        insert a no-op via ``ON CONFLICT DO NOTHING``.
+        """
+        from sqlalchemy import text as sa_text
+
+        thread_id = db_thread.id
+        user_uuid = db_user.id
+        user_id_str = str(user_uuid)
+        search_space_id = db_search_space.id
+        turn_id = f"{thread_id}:7000"
+
+        msg_id = await persist_assistant_shell(
+            chat_id=thread_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+        )
+        assert msg_id is not None
+
+        await finalize_assistant_turn(
+            message_id=msg_id,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id=turn_id,
+            content=[{"type": "text", "text": "from server"}],
+            accumulator=_accumulator_with_one_call(),
+        )
+
+        # Now simulate the FE's append_message branch firing AFTER —
+        # the same INSERT ... ON CONFLICT DO NOTHING shape used by the
+        # route handler, keyed on the migration-142 partial unique
+        # index.
+        late_insert = (
+            pg_insert(TokenUsage)
+            .values(
+                usage_type="chat",
+                prompt_tokens=42,
+                completion_tokens=42,
+                total_tokens=84,
+                cost_micros=1,
+                model_breakdown=None,
+                call_details=None,
+                thread_id=thread_id,
+                message_id=msg_id,
+                search_space_id=search_space_id,
+                user_id=user_uuid,
+            )
+            .on_conflict_do_nothing(
+                index_elements=["message_id"],
+                index_where=sa_text("message_id IS NOT NULL"),
+            )
+        )
+        await db_session.execute(late_insert)
+        await db_session.flush()
+
+        # Still exactly one row, with the original (server) cost value.
+        assert await _count_token_usage_rows(db_session, msg_id) == 1
+        usage = (
+            await db_session.execute(
+                select(TokenUsage).where(TokenUsage.message_id == msg_id)
+            )
+        ).scalar_one()
+        assert usage.cost_micros == 12345
+
+    async def test_helper_never_raises_on_missing_message_id(
+        self,
+        db_session,
+        db_user,
+        db_thread,
+        db_search_space,
+        patched_shielded_session,
+    ):
+        thread_id = db_thread.id
+        user_id_str = str(db_user.id)
+        search_space_id = db_search_space.id
+
+        # message_id that doesn't exist — finalize must log+return,
+        # never raise (called from shielded finally).
+        await finalize_assistant_turn(
+            message_id=999_999_999,
+            chat_id=thread_id,
+            search_space_id=search_space_id,
+            user_id=user_id_str,
+            turn_id="anything",
+            content=[{"type": "text", "text": "x"}],
+            accumulator=_accumulator_with_one_call(),
+        )
+        # If we got here without an exception, the test passes.
+        # Sanity: no token_usage row created (FK to message would have
+        # been rejected anyway, but ON CONFLICT path may swallow
+        # FK errors as well; check directly).
+        assert await _count_token_usage_rows(db_session, 999_999_999) == 0
diff --git a/surfsense_backend/tests/integration/composio/__init__.py b/surfsense_backend/tests/integration/composio/__init__.py
new file mode 100644
index 0000000000..1eddfc5f5c
--- /dev/null
+++ b/surfsense_backend/tests/integration/composio/__init__.py
@@ -0,0 +1 @@
+"""Integration tests for Composio connector routes."""
diff --git a/surfsense_backend/tests/integration/composio/conftest.py b/surfsense_backend/tests/integration/composio/conftest.py
new file mode 100644
index 0000000000..44d707ec31
--- /dev/null
+++ b/surfsense_backend/tests/integration/composio/conftest.py
@@ -0,0 +1,84 @@
+"""Composio route integration fixtures.
+
+The `composio` sys.modules hijack lives in the parent integration conftest
+so it runs before any sibling suite imports `app.routes`.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncGenerator
+
+import httpx
+import pytest
+import pytest_asyncio
+from httpx import ASGITransport
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.app import app, limiter
+from app.config import config
+from app.db import (
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    User,
+    get_async_session,
+)
+from app.users import current_active_user
+
+pytestmark = pytest.mark.integration
+
+limiter.enabled = False
+config.COMPOSIO_ENABLED = True
+config.COMPOSIO_API_KEY = "e2e-integration-composio-sentinel"
+config.NEXT_FRONTEND_URL = "http://localhost:3000"
+
+
+@pytest_asyncio.fixture
+async def client(
+    db_session: AsyncSession,
+    db_user: User,
+) -> AsyncGenerator[httpx.AsyncClient, None]:
+    async def override_session() -> AsyncGenerator[AsyncSession, None]:
+        yield db_session
+
+    async def override_user() -> User:
+        return db_user
+
+    previous_overrides = app.dependency_overrides.copy()
+    app.dependency_overrides[get_async_session] = override_session
+    app.dependency_overrides[current_active_user] = override_user
+
+    try:
+        async with httpx.AsyncClient(
+            transport=ASGITransport(app=app),
+            base_url="http://test",
+            timeout=30.0,
+            follow_redirects=False,
+        ) as test_client:
+            yield test_client
+    finally:
+        app.dependency_overrides.clear()
+        app.dependency_overrides.update(previous_overrides)
+
+
+@pytest_asyncio.fixture
+async def drive_connector(
+    db_session: AsyncSession,
+    db_user: User,
+    db_search_space,
+) -> SearchSourceConnector:
+    connector = SearchSourceConnector(
+        name="Google Drive (Composio) - e2e-fake@surfsense.example",
+        connector_type=SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+        is_indexable=True,
+        config={
+            "composio_connected_account_id": "fake-acct-googledrive-existing",
+            "toolkit_id": "googledrive",
+            "toolkit_name": "Google Drive",
+            "is_indexable": True,
+        },
+        search_space_id=db_search_space.id,
+        user_id=db_user.id,
+    )
+    db_session.add(connector)
+    await db_session.flush()
+    return connector
diff --git a/surfsense_backend/tests/integration/composio/test_drive_folders_route.py b/surfsense_backend/tests/integration/composio/test_drive_folders_route.py
new file mode 100644
index 0000000000..8e152ec859
--- /dev/null
+++ b/surfsense_backend/tests/integration/composio/test_drive_folders_route.py
@@ -0,0 +1,99 @@
+from typing import Any
+
+import httpx
+import pytest
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import SearchSourceConnector
+from tests.e2e.fakes import composio_module
+
+pytestmark = pytest.mark.integration
+
+
+async def test_root_listing_returns_canned_items(
+    client: httpx.AsyncClient,
+    drive_connector: SearchSourceConnector,
+):
+    response = await client.get(
+        f"/api/v1/connectors/{drive_connector.id}/composio-drive/folders"
+    )
+
+    assert response.status_code == 200
+    items = response.json()["items"]
+    names = {item["name"] for item in items}
+
+    assert "Projects" in names
+    assert "e2e-canary.txt" in names
+    assert any(
+        item["id"] == "fake-folder-projects" and item["isFolder"] is True
+        for item in items
+    )
+
+
+async def test_save_round_trips_selected_files(
+    client: httpx.AsyncClient,
+    db_session: AsyncSession,
+    drive_connector: SearchSourceConnector,
+):
+    selected_files = [
+        {
+            "id": "fake-file-canary",
+            "name": "e2e-canary.txt",
+            "mimeType": "text/plain",
+        }
+    ]
+
+    response = await client.put(
+        f"/api/v1/search-source-connectors/{drive_connector.id}",
+        json={
+            "config": {
+                "selected_folders": [],
+                "selected_files": selected_files,
+                "indexing_options": {
+                    "max_files_per_folder": 10,
+                    "incremental_sync": False,
+                    "include_subfolders": False,
+                },
+            }
+        },
+    )
+
+    assert response.status_code == 200
+    await db_session.refresh(drive_connector)
+    assert drive_connector.config["selected_files"] == selected_files
+    assert drive_connector.config["selected_folders"] == []
+
+
+async def test_auth_expired_error_classifies_and_flags_connector(
+    monkeypatch: pytest.MonkeyPatch,
+    client: httpx.AsyncClient,
+    db_session: AsyncSession,
+    drive_connector: SearchSourceConnector,
+):
+    def raise_auth_expired(
+        self: Any,
+        *,
+        slug: str,
+        connected_account_id: str,
+        user_id: str | None = None,
+        arguments: dict[str, Any] | None = None,
+        dangerously_skip_version_check: bool = True,
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        raise RuntimeError(
+            "Token has been expired or revoked. (HTTP 401: invalid_grant)"
+        )
+
+    monkeypatch.setattr(composio_module._Tools, "execute", raise_auth_expired)
+
+    response = await client.get(
+        f"/api/v1/connectors/{drive_connector.id}/composio-drive/folders"
+    )
+
+    assert response.status_code == 400
+    body = response.text.lower()
+    assert "authentication" in body
+    assert "expired" in body
+
+    await db_session.refresh(drive_connector)
+    assert drive_connector.config["auth_expired"] is True
diff --git a/surfsense_backend/tests/integration/composio/test_oauth_callback.py b/surfsense_backend/tests/integration/composio/test_oauth_callback.py
new file mode 100644
index 0000000000..d2f4c3752f
--- /dev/null
+++ b/surfsense_backend/tests/integration/composio/test_oauth_callback.py
@@ -0,0 +1,113 @@
+from uuid import UUID
+
+import httpx
+import pytest
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.db import (
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    SearchSpace,
+    User,
+)
+from app.utils.oauth_security import OAuthStateManager
+
+pytestmark = pytest.mark.integration
+
+
+def _state_for(space_id: int, user_id: UUID, toolkit_id: str = "googledrive") -> str:
+    return OAuthStateManager(config.SECRET_KEY).generate_secure_state(
+        space_id=space_id,
+        user_id=user_id,
+        toolkit_id=toolkit_id,
+    )
+
+
+async def _drive_connectors(
+    session: AsyncSession,
+    *,
+    user_id: UUID,
+    search_space_id: int,
+) -> list[SearchSourceConnector]:
+    result = await session.execute(
+        select(SearchSourceConnector).where(
+            SearchSourceConnector.user_id == user_id,
+            SearchSourceConnector.search_space_id == search_space_id,
+            SearchSourceConnector.connector_type
+            == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+        )
+    )
+    return list(result.scalars().all())
+
+
+async def test_callback_with_error_param_redirects_to_denied_page(
+    client: httpx.AsyncClient,
+    db_session: AsyncSession,
+    db_user: User,
+    db_search_space: SearchSpace,
+):
+    state = _state_for(db_search_space.id, db_user.id)
+
+    response = await client.get(
+        f"/api/v1/auth/composio/connector/callback?state={state}&error=access_denied"
+    )
+
+    assert response.status_code in {302, 303, 307}
+    location = response.headers["location"]
+    assert (
+        f"/dashboard/{db_search_space.id}/connectors/callback?"
+        "error=composio_oauth_denied"
+    ) in location
+
+    connectors = await _drive_connectors(
+        db_session,
+        user_id=db_user.id,
+        search_space_id=db_search_space.id,
+    )
+    assert connectors == []
+
+
+async def test_second_oauth_for_same_toolkit_takes_reconnection_branch(
+    client: httpx.AsyncClient,
+    db_session: AsyncSession,
+    db_user: User,
+    db_search_space: SearchSpace,
+):
+    first_state = _state_for(db_search_space.id, db_user.id)
+
+    first_response = await client.get(
+        "/api/v1/auth/composio/connector/callback"
+        f"?state={first_state}&connectedAccountId=fake-acct-googledrive-first"
+    )
+
+    assert first_response.status_code in {302, 303, 307}
+    first_connectors = await _drive_connectors(
+        db_session,
+        user_id=db_user.id,
+        search_space_id=db_search_space.id,
+    )
+    assert len(first_connectors) == 1
+    first_connector = first_connectors[0]
+    assert first_connector.config["composio_connected_account_id"] == (
+        "fake-acct-googledrive-first"
+    )
+
+    second_state = _state_for(db_search_space.id, db_user.id)
+    second_response = await client.get(
+        "/api/v1/auth/composio/connector/callback"
+        f"?state={second_state}&connectedAccountId=fake-acct-googledrive-second"
+    )
+
+    assert second_response.status_code in {302, 303, 307}
+    second_connectors = await _drive_connectors(
+        db_session,
+        user_id=db_user.id,
+        search_space_id=db_search_space.id,
+    )
+    assert len(second_connectors) == 1
+    assert second_connectors[0].id == first_connector.id
+    assert second_connectors[0].config["composio_connected_account_id"] == (
+        "fake-acct-googledrive-second"
+    )
diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py
index d9d7cacaee..e03101e635 100644
--- a/surfsense_backend/tests/integration/conftest.py
+++ b/surfsense_backend/tests/integration/conftest.py
@@ -1,3 +1,5 @@
+import importlib
+import sys
 import uuid
 from unittest.mock import AsyncMock, MagicMock
 
@@ -7,17 +9,27 @@
 from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
 from sqlalchemy.pool import NullPool
 
-from app.config import config as app_config
-from app.db import (
-    Base,
-    DocumentType,
-    SearchSourceConnector,
-    SearchSourceConnectorType,
-    SearchSpace,
-    User,
-)
-from app.indexing_pipeline.connector_document import ConnectorDocument
-from tests.conftest import TEST_DATABASE_URL
+# Hijack `composio` before any `from app.*` import; the `from composio import
+# Composio` in app.services.composio_service binds once at first import.
+from tests.e2e.fakes import composio_module as _fake_composio
+
+sys.modules["composio"] = _fake_composio
+
+app_config = importlib.import_module("app.config").config
+app_db = importlib.import_module("app.db")
+Base = app_db.Base
+DocumentType = app_db.DocumentType
+SearchSourceConnector = app_db.SearchSourceConnector
+SearchSourceConnectorType = app_db.SearchSourceConnectorType
+SearchSpace = app_db.SearchSpace
+User = app_db.User
+ConnectorDocument = importlib.import_module(
+    "app.indexing_pipeline.connector_document"
+).ConnectorDocument
+create_default_roles_and_membership = importlib.import_module(
+    "app.routes.search_spaces_routes"
+).create_default_roles_and_membership
+TEST_DATABASE_URL = importlib.import_module("tests.conftest").TEST_DATABASE_URL
 
 _EMBEDDING_DIM = app_config.embedding_model_instance.dimension
 
@@ -105,6 +117,9 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac
     )
     db_session.add(space)
     await db_session.flush()
+    # Mirror POST /searchspaces so routes guarded by check_permission find a membership.
+    await create_default_roles_and_membership(db_session, space.id, db_user.id)
+    await db_session.flush()
     return space
 
 
@@ -145,6 +160,10 @@ def patched_chunk_text(monkeypatch) -> MagicMock:
         "app.indexing_pipeline.indexing_pipeline_service.chunk_text",
         mock,
     )
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
+        mock,
+    )
     return mock
 
 
diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
index 62f4f6b47c..ff44e471a0 100644
--- a/surfsense_backend/tests/integration/document_upload/conftest.py
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -69,6 +69,8 @@ async def dispatch_file_processing(
         search_space_id: int,
         user_id: str,
         should_summarize: bool = False,
+        use_vision_llm: bool = False,
+        processing_mode: str = "basic",
     ) -> None:
         from app.tasks.celery_tasks.document_tasks import (
             _process_file_with_document,
@@ -82,6 +84,8 @@ async def dispatch_file_processing(
                 search_space_id,
                 user_id,
                 should_summarize=should_summarize,
+                use_vision_llm=use_vision_llm,
+                processing_mode=processing_mode,
             )
 
 
@@ -319,7 +323,9 @@ def _reject_empty(file_path: str) -> None:
 
     # -- LlamaParse mock (external API) --------------------------------
 
-    async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str:
+    async def _fake_llamacloud_parse(
+        file_path: str, estimated_pages: int, processing_mode: str = "basic"
+    ) -> str:
         _reject_empty(file_path)
         return _MOCK_ETL_MARKDOWN
 
diff --git a/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py b/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py
index 9ac4057caf..80ee2d5fd4 100644
--- a/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py
+++ b/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py
@@ -165,6 +165,7 @@ async def test_create_checkout_session_records_pending_purchase(
         assert (
             fake_client.last_params["success_url"]
             == f"http://localhost:3000/dashboard/{search_space_id}/purchase-success"
+            "?session_id={CHECKOUT_SESSION_ID}"
         )
         assert (
             fake_client.last_params["cancel_url"]
diff --git a/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py
index 795f0d5647..44ff5c48aa 100644
--- a/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py
+++ b/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py
@@ -7,7 +7,7 @@
 
 from __future__ import annotations
 
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import ANY, AsyncMock, MagicMock, patch
 
 import pytest
 import pytest_asyncio
@@ -25,6 +25,7 @@
 
 _COMPOSIO_ACCOUNT_ID = "composio-calendar-test-789"
 _INDEXER_MODULE = "app.tasks.connector_indexers.google_calendar_indexer"
+_GET_ACCESS_TOKEN = "app.services.composio_service.ComposioService.get_access_token"
 
 
 @pytest_asyncio.fixture
@@ -69,32 +70,29 @@ async def native_calendar(async_engine):
     await cleanup_space(async_engine, data["search_space_id"])
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
 @patch(f"{_INDEXER_MODULE}.GoogleCalendarConnector")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
-async def test_composio_calendar_uses_composio_credentials(
-    mock_build_creds,
+@patch(f"{_INDEXER_MODULE}.ComposioService")
+async def test_composio_calendar_uses_composio_service(
+    mock_composio_service_cls,
     mock_cal_cls,
     mock_tl_cls,
+    mock_get_access_token,
     async_engine,
     composio_calendar,
 ):
-    """Calendar indexer calls build_composio_credentials for a Composio connector."""
+    """Calendar indexer uses Composio tools directly for a Composio connector."""
     from app.tasks.connector_indexers.google_calendar_indexer import (
         index_google_calendar_events,
     )
 
     data = composio_calendar
-    mock_creds = MagicMock(name="composio-creds")
-    mock_build_creds.return_value = mock_creds
+    mock_composio_service = MagicMock()
+    mock_composio_service.get_calendar_events = AsyncMock(return_value=([], None))
+    mock_composio_service_cls.return_value = mock_composio_service
     mock_tl_cls.return_value = mock_task_logger()
 
-    mock_cal_instance = MagicMock()
-    mock_cal_instance.get_all_primary_calendar_events = AsyncMock(
-        return_value=([], None)
-    )
-    mock_cal_cls.return_value = mock_cal_instance
-
     maker = make_session_factory(async_engine)
     async with maker() as session:
         await index_google_calendar_events(
@@ -104,17 +102,25 @@ async def test_composio_calendar_uses_composio_credentials(
             user_id=data["user_id"],
         )
 
-    mock_build_creds.assert_called_once_with(_COMPOSIO_ACCOUNT_ID)
-    mock_cal_cls.assert_called_once()
-    _, kwargs = mock_cal_cls.call_args
-    assert kwargs.get("credentials") is mock_creds
+    mock_composio_service_cls.assert_called_once()
+    mock_composio_service.get_calendar_events.assert_called_once_with(
+        connected_account_id=_COMPOSIO_ACCOUNT_ID,
+        entity_id=f"surfsense_{data['user_id']}",
+        time_min=ANY,
+        time_max=ANY,
+        max_results=250,
+    )
+    mock_cal_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
+@patch(f"{_INDEXER_MODULE}.ComposioService")
 async def test_composio_calendar_without_account_id_returns_error(
-    mock_build_creds,
+    mock_composio_service_cls,
     mock_tl_cls,
+    mock_get_access_token,
     async_engine,
     composio_calendar_no_id,
 ):
@@ -138,20 +144,23 @@ async def test_composio_calendar_without_account_id_returns_error(
     assert count == 0
     assert error is not None
     assert "composio" in error.lower()
-    mock_build_creds.assert_not_called()
+    mock_composio_service_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
+@patch(f"{_INDEXER_MODULE}.ComposioService")
 @patch(f"{_INDEXER_MODULE}.GoogleCalendarConnector")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
-async def test_native_calendar_does_not_use_composio_credentials(
-    mock_build_creds,
+async def test_native_calendar_uses_google_calendar_connector(
     mock_cal_cls,
+    mock_composio_service_cls,
     mock_tl_cls,
+    mock_get_access_token,
     async_engine,
     native_calendar,
 ):
-    """Calendar indexer does NOT call build_composio_credentials for a native connector."""
+    """Native Calendar connector uses GoogleCalendarConnector with no Composio path."""
     from app.tasks.connector_indexers.google_calendar_indexer import (
         index_google_calendar_events,
     )
@@ -174,4 +183,6 @@ async def test_native_calendar_does_not_use_composio_credentials(
             user_id=data["user_id"],
         )
 
-    mock_build_creds.assert_not_called()
+    mock_cal_cls.assert_called_once()
+    mock_composio_service_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
diff --git a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py
index e669fa143c..810c3ceab2 100644
--- a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py
+++ b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py
@@ -1,13 +1,26 @@
-"""Integration tests: Drive indexer credential resolution for Composio vs native connectors.
-
-Exercises ``index_google_drive_files`` with a real PostgreSQL database
-containing seeded connector records.  Google API and Composio SDK are
-mocked at their system boundaries.
+"""Integration tests: Drive indexer client + credential resolution.
+
+Locks in the post-cea8618 architectural contract:
+
+- Composio Drive connectors MUST use ``ComposioDriveClient`` (which routes
+  through ``composio.tools.execute``) and MUST NOT depend on a raw OAuth
+  access token via ``ComposioService.get_access_token``.
+- Native Drive connectors MUST continue to use ``GoogleDriveClient`` with
+  credentials loaded from the connector config (no Composio involvement).
+- Composio Drive connectors missing ``composio_connected_account_id`` MUST
+  short-circuit with a clear error before any client is constructed.
+
+Background: prior to ``cea8618`` the Composio path used
+``build_composio_credentials → GoogleDriveClient``. That broke in production
+once Composio's "Mask Connected Account Secrets" project toggle was on,
+because the masked token failed the ``len(access_token) >= 20`` guard in
+``ComposioService.get_access_token``. The structural assertions here make
+any future regression to that token-based path fail at PR time.
 """
 
 from __future__ import annotations
 
-from unittest.mock import MagicMock, patch
+from unittest.mock import ANY, AsyncMock, MagicMock, patch
 
 import pytest
 import pytest_asyncio
@@ -25,6 +38,19 @@
 
 _COMPOSIO_ACCOUNT_ID = "composio-test-account-123"
 _INDEXER_MODULE = "app.tasks.connector_indexers.google_drive_indexer"
+_GET_ACCESS_TOKEN = "app.services.composio_service.ComposioService.get_access_token"
+
+
+def _mock_drive_client(*, list_files_return: tuple = ([], None, None)) -> MagicMock:
+    """Duck-typed client mock whose ``list_files`` yields the supplied tuple.
+
+    Returning an empty file list short-circuits the indexer's full-scan
+    loop after the first page so the test exercises only the
+    construction + listing path, not download / ETL / DB writes.
+    """
+    mock = MagicMock()
+    mock.list_files = AsyncMock(return_value=list_files_return)
+    return mock
 
 
 @pytest_asyncio.fixture
@@ -69,25 +95,31 @@ async def committed_composio_no_account_id(async_engine):
     await cleanup_space(async_engine, data["search_space_id"])
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
 @patch(f"{_INDEXER_MODULE}.GoogleDriveClient")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
-async def test_composio_connector_uses_composio_credentials(
-    mock_build_creds,
-    mock_client_cls,
+@patch(f"{_INDEXER_MODULE}.ComposioDriveClient")
+async def test_composio_drive_indexer_uses_composio_drive_client(
+    mock_composio_client_cls,
+    mock_native_client_cls,
     mock_task_logger_cls,
+    mock_get_access_token,
     async_engine,
     committed_drive_connector,
 ):
-    """Drive indexer calls build_composio_credentials for a Composio connector
-    and passes the result to GoogleDriveClient."""
+    """Composio Drive must construct ComposioDriveClient and never read raw tokens.
+
+    Reverting to the pre-cea8618 ``build_composio_credentials → GoogleDriveClient``
+    path would either trip ``mock_native_client_cls.assert_not_called()`` (because
+    GoogleDriveClient would be constructed) or ``mock_get_access_token.assert_not_called()``
+    (because the credential builder reads the raw token).
+    """
     from app.tasks.connector_indexers.google_drive_indexer import (
         index_google_drive_files,
     )
 
     data = committed_drive_connector
-    mock_creds = MagicMock(name="composio-credentials")
-    mock_build_creds.return_value = mock_creds
+    mock_composio_client_cls.return_value = _mock_drive_client()
     mock_task_logger_cls.return_value = mock_task_logger()
 
     maker = make_session_factory(async_engine)
@@ -100,21 +132,29 @@ async def test_composio_connector_uses_composio_credentials(
             folder_id="test-folder-id",
         )
 
-    mock_build_creds.assert_called_once_with(_COMPOSIO_ACCOUNT_ID)
-    mock_client_cls.assert_called_once()
-    _, kwargs = mock_client_cls.call_args
-    assert kwargs.get("credentials") is mock_creds
+    mock_composio_client_cls.assert_called_once_with(
+        ANY,
+        data["connector_id"],
+        _COMPOSIO_ACCOUNT_ID,
+        entity_id=f"surfsense_{data['user_id']}",
+    )
+    mock_native_client_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
+@patch(f"{_INDEXER_MODULE}.GoogleDriveClient")
+@patch(f"{_INDEXER_MODULE}.ComposioDriveClient")
 async def test_composio_connector_without_account_id_returns_error(
-    mock_build_creds,
+    mock_composio_client_cls,
+    mock_native_client_cls,
     mock_task_logger_cls,
+    mock_get_access_token,
     async_engine,
     committed_composio_no_account_id,
 ):
-    """Drive indexer returns an error when Composio connector lacks connected_account_id."""
+    """Missing ``composio_connected_account_id`` must short-circuit before any client construction."""
     from app.tasks.connector_indexers.google_drive_indexer import (
         index_google_drive_files,
     )
@@ -134,28 +174,32 @@ async def test_composio_connector_without_account_id_returns_error(
 
     assert count == 0
     assert error is not None
-    assert (
-        "composio_connected_account_id" in error.lower() or "composio" in error.lower()
-    )
-    mock_build_creds.assert_not_called()
+    assert "composio" in error.lower()
+    assert "connected_account_id" in error.lower()
+    mock_composio_client_cls.assert_not_called()
+    mock_native_client_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
+@patch(f"{_INDEXER_MODULE}.ComposioDriveClient")
 @patch(f"{_INDEXER_MODULE}.GoogleDriveClient")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
-async def test_native_connector_does_not_use_composio_credentials(
-    mock_build_creds,
-    mock_client_cls,
+async def test_native_connector_uses_google_drive_client(
+    mock_native_client_cls,
+    mock_composio_client_cls,
     mock_task_logger_cls,
+    mock_get_access_token,
     async_engine,
     committed_native_drive_connector,
 ):
-    """Drive indexer does NOT call build_composio_credentials for a native connector."""
+    """Native Drive connector must use GoogleDriveClient (no Composio involvement at all)."""
     from app.tasks.connector_indexers.google_drive_indexer import (
         index_google_drive_files,
     )
 
     data = committed_native_drive_connector
+    mock_native_client_cls.return_value = _mock_drive_client()
     mock_task_logger_cls.return_value = mock_task_logger()
 
     maker = make_session_factory(async_engine)
@@ -168,7 +212,6 @@ async def test_native_connector_does_not_use_composio_credentials(
             folder_id="test-folder-id",
         )
 
-    mock_build_creds.assert_not_called()
-    mock_client_cls.assert_called_once()
-    _, kwargs = mock_client_cls.call_args
-    assert kwargs.get("credentials") is None
+    mock_native_client_cls.assert_called_once_with(ANY, data["connector_id"])
+    mock_composio_client_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
diff --git a/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py
index afb3e64c3f..b869f5607a 100644
--- a/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py
+++ b/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py
@@ -7,7 +7,7 @@
 
 from __future__ import annotations
 
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import ANY, AsyncMock, MagicMock, patch
 
 import pytest
 import pytest_asyncio
@@ -25,6 +25,7 @@
 
 _COMPOSIO_ACCOUNT_ID = "composio-gmail-test-456"
 _INDEXER_MODULE = "app.tasks.connector_indexers.google_gmail_indexer"
+_GET_ACCESS_TOKEN = "app.services.composio_service.ComposioService.get_access_token"
 
 
 @pytest_asyncio.fixture
@@ -69,30 +70,32 @@ async def native_gmail(async_engine):
     await cleanup_space(async_engine, data["search_space_id"])
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
 @patch(f"{_INDEXER_MODULE}.GoogleGmailConnector")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
-async def test_composio_gmail_uses_composio_credentials(
-    mock_build_creds,
+@patch(f"{_INDEXER_MODULE}.ComposioService")
+async def test_composio_gmail_uses_composio_service(
+    mock_composio_service_cls,
     mock_gmail_cls,
     mock_tl_cls,
+    mock_get_access_token,
     async_engine,
     composio_gmail,
 ):
-    """Gmail indexer calls build_composio_credentials for a Composio connector."""
+    """Gmail indexer uses Composio tools directly for a Composio connector."""
     from app.tasks.connector_indexers.google_gmail_indexer import (
         index_google_gmail_messages,
     )
 
     data = composio_gmail
-    mock_creds = MagicMock(name="composio-creds")
-    mock_build_creds.return_value = mock_creds
+    mock_composio_service = MagicMock()
+    mock_composio_service.get_gmail_messages = AsyncMock(
+        return_value=([], None, None, None)
+    )
+    mock_composio_service.get_gmail_message_detail = AsyncMock(return_value=({}, None))
+    mock_composio_service_cls.return_value = mock_composio_service
     mock_tl_cls.return_value = mock_task_logger()
 
-    mock_gmail_instance = MagicMock()
-    mock_gmail_instance.get_recent_messages = AsyncMock(return_value=([], None))
-    mock_gmail_cls.return_value = mock_gmail_instance
-
     maker = make_session_factory(async_engine)
     async with maker() as session:
         await index_google_gmail_messages(
@@ -102,17 +105,25 @@ async def test_composio_gmail_uses_composio_credentials(
             user_id=data["user_id"],
         )
 
-    mock_build_creds.assert_called_once_with(_COMPOSIO_ACCOUNT_ID)
-    mock_gmail_cls.assert_called_once()
-    args, _ = mock_gmail_cls.call_args
-    assert args[0] is mock_creds
+    mock_composio_service_cls.assert_called_once()
+    mock_composio_service.get_gmail_messages.assert_called_once_with(
+        connected_account_id=_COMPOSIO_ACCOUNT_ID,
+        entity_id=f"surfsense_{data['user_id']}",
+        query=ANY,
+        max_results=ANY,
+        page_token=None,
+    )
+    mock_gmail_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
+@patch(f"{_INDEXER_MODULE}.ComposioService")
 async def test_composio_gmail_without_account_id_returns_error(
-    mock_build_creds,
+    mock_composio_service_cls,
     mock_tl_cls,
+    mock_get_access_token,
     async_engine,
     composio_gmail_no_id,
 ):
@@ -136,20 +147,23 @@ async def test_composio_gmail_without_account_id_returns_error(
     assert count == 0
     assert error is not None
     assert "composio" in error.lower()
-    mock_build_creds.assert_not_called()
+    mock_composio_service_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
 
 
+@patch(_GET_ACCESS_TOKEN)
 @patch(f"{_INDEXER_MODULE}.TaskLoggingService")
+@patch(f"{_INDEXER_MODULE}.ComposioService")
 @patch(f"{_INDEXER_MODULE}.GoogleGmailConnector")
-@patch(f"{_INDEXER_MODULE}.build_composio_credentials")
-async def test_native_gmail_does_not_use_composio_credentials(
-    mock_build_creds,
+async def test_native_gmail_uses_google_gmail_connector(
     mock_gmail_cls,
+    mock_composio_service_cls,
     mock_tl_cls,
+    mock_get_access_token,
     async_engine,
     native_gmail,
 ):
-    """Gmail indexer does NOT call build_composio_credentials for a native connector."""
+    """Native Gmail connector uses GoogleGmailConnector with no Composio path."""
     from app.tasks.connector_indexers.google_gmail_indexer import (
         index_google_gmail_messages,
     )
@@ -170,4 +184,6 @@ async def test_native_gmail_does_not_use_composio_credentials(
             user_id=data["user_id"],
         )
 
-    mock_build_creds.assert_not_called()
+    mock_gmail_cls.assert_called_once()
+    mock_composio_service_cls.assert_not_called()
+    mock_get_access_token.assert_not_called()
diff --git a/surfsense_backend/tests/integration/harness/__init__.py b/surfsense_backend/tests/integration/harness/__init__.py
new file mode 100644
index 0000000000..9a7ec07dc4
--- /dev/null
+++ b/surfsense_backend/tests/integration/harness/__init__.py
@@ -0,0 +1,146 @@
+"""
+Integration test harness for the SurfSense agent stack.
+
+The plan calls for an ``LLMToolEmulator``-backed harness for end-to-end
+replay of ``stream_new_chat``. The currently-installed langchain version
+does not expose ``LLMToolEmulator``, so this harness builds the equivalent
+on top of :class:`langchain_core.language_models.fake_chat_models.FakeMessagesListChatModel`.
+
+The harness lets a test author script a sequence of model responses
+(text + optional tool calls) and replay them against the new_chat agent
+graph. Tools are stubbed via ``StubToolSpec`` -> ``langchain_core.tools.tool``
+decorator and execute deterministic Python callbacks.
+
+Used by:
+- ``tests/integration/agents/new_chat/test_feature_flag_smoke.py`` to
+  confirm the kill-switch path produces identical-shape output regardless
+  of which middleware flags are toggled.
+- Future per-tier PRs to record golden transcripts.
+"""
+
+from __future__ import annotations
+
+import uuid
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass, field
+from typing import Any
+
+from langchain_core.language_models import LanguageModelInput
+from langchain_core.language_models.fake_chat_models import (
+    FakeMessagesListChatModel,
+)
+from langchain_core.messages import AIMessage, BaseMessage
+from langchain_core.runnables import Runnable
+from langchain_core.tools import BaseTool, tool
+
+
+class _ToolBindingFakeChatModel(FakeMessagesListChatModel):
+    """Adapter so the harness model can pretend it understands ``bind_tools``.
+
+    The base ``FakeMessagesListChatModel`` raises ``NotImplementedError`` from
+    ``bind_tools``, but ``langchain.agents.create_agent`` always calls
+    ``bind_tools`` to attach the tool registry. We don't actually need the
+    fake to honor the tool schema — it's already scripted to emit the right
+    tool calls — so we return self.
+    """
+
+    def bind_tools(  # type: ignore[override]
+        self,
+        tools: Sequence[Any],
+        *,
+        tool_choice: Any = None,
+        **kwargs: Any,
+    ) -> Runnable[LanguageModelInput, AIMessage]:
+        return self
+
+
+@dataclass
+class StubToolSpec:
+    """A test-mode tool: a name, description, and a deterministic body."""
+
+    name: str
+    description: str
+    handler: Callable[..., Any]
+    args_schema: dict[str, Any] | None = None
+
+    def build(self) -> BaseTool:
+        """Realize as a `langchain_core.tools.BaseTool`."""
+
+        @tool(name_or_callable=self.name, description=self.description)
+        def _stub_tool(**kwargs: Any) -> Any:
+            return self.handler(**kwargs)
+
+        return _stub_tool
+
+
+@dataclass
+class ScriptedTurn:
+    """One scripted assistant turn.
+
+    `text` is the assistant text (may be empty if pure tool call).
+    `tool_calls` is a list of dicts ``{name, args, id}``; if non-empty, the
+    agent will route to those tools and append a follow-up turn.
+    """
+
+    text: str = ""
+    tool_calls: list[dict[str, Any]] = field(default_factory=list)
+
+
+def build_scripted_messages(turns: list[ScriptedTurn]) -> list[BaseMessage]:
+    """Convert :class:`ScriptedTurn` records to AIMessage payloads."""
+    out: list[BaseMessage] = []
+    for turn in turns:
+        tool_calls: list[dict[str, Any]] = []
+        for tc in turn.tool_calls:
+            tool_calls.append(
+                {
+                    "name": tc["name"],
+                    "args": tc.get("args", {}),
+                    "id": tc.get("id") or f"call_{uuid.uuid4().hex[:8]}",
+                }
+            )
+        out.append(AIMessage(content=turn.text, tool_calls=tool_calls or []))
+    return out
+
+
+@dataclass
+class ScriptedHarness:
+    """Bundle of (model, tools) ready to plug into ``create_agent``."""
+
+    model: _ToolBindingFakeChatModel
+    tools: list[BaseTool]
+
+
+def build_scripted_harness(
+    *,
+    turns: list[ScriptedTurn],
+    tools: list[StubToolSpec] | None = None,
+    sleep: float | None = None,
+) -> ScriptedHarness:
+    """Construct a deterministic agent harness from a script.
+
+    Example::
+
+        harness = build_scripted_harness(
+            turns=[
+                ScriptedTurn(tool_calls=[{"name": "echo", "args": {"x": 1}}]),
+                ScriptedTurn(text="done"),
+            ],
+            tools=[
+                StubToolSpec(name="echo", description="echo args", handler=lambda **kw: kw),
+            ],
+        )
+    """
+    messages = build_scripted_messages(turns)
+    model = _ToolBindingFakeChatModel(responses=messages, sleep=sleep)
+    realized_tools = [t.build() for t in (tools or [])]
+    return ScriptedHarness(model=model, tools=realized_tools)
+
+
+__all__ = [
+    "ScriptedHarness",
+    "ScriptedTurn",
+    "StubToolSpec",
+    "build_scripted_harness",
+    "build_scripted_messages",
+]
diff --git a/surfsense_backend/tests/integration/harness/test_scripted_harness.py b/surfsense_backend/tests/integration/harness/test_scripted_harness.py
new file mode 100644
index 0000000000..6e9f7ab913
--- /dev/null
+++ b/surfsense_backend/tests/integration/harness/test_scripted_harness.py
@@ -0,0 +1,53 @@
+"""Smoke test: scripted harness drives create_agent end-to-end and produces a tool-call-then-final-text trace."""
+
+from __future__ import annotations
+
+import pytest
+from langchain.agents import create_agent
+
+from tests.integration.harness import (
+    ScriptedTurn,
+    StubToolSpec,
+    build_scripted_harness,
+)
+
+pytestmark = pytest.mark.integration
+
+
+@pytest.mark.asyncio
+async def test_scripted_harness_drives_basic_agent() -> None:
+    harness = build_scripted_harness(
+        turns=[
+            ScriptedTurn(
+                tool_calls=[
+                    {"name": "echo", "args": {"x": 1}, "id": "call_1"},
+                ]
+            ),
+            ScriptedTurn(text="done"),
+        ],
+        tools=[
+            StubToolSpec(
+                name="echo",
+                description="Echo args back.",
+                handler=lambda **kwargs: {"echoed": kwargs},
+            ),
+        ],
+    )
+
+    agent = create_agent(
+        harness.model,
+        system_prompt="You are a test agent.",
+        tools=harness.tools,
+    )
+
+    result = await agent.ainvoke({"messages": [("user", "do the thing")]})
+    messages = result["messages"]
+    final_ai = next(
+        (m for m in reversed(messages) if m.__class__.__name__ == "AIMessage"),
+        None,
+    )
+    assert final_ai is not None
+    assert final_ai.content == "done"
+    tool_messages = [m for m in messages if m.__class__.__name__ == "ToolMessage"]
+    assert len(tool_messages) == 1
+    assert "echoed" in str(tool_messages[0].content)
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
index 9fc802aa6d..6bb1d2094d 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
@@ -200,7 +200,7 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
 async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
     """Reindexing replaces old chunks with new content rather than appending."""
     mocker.patch(
-        "app.indexing_pipeline.indexing_pipeline_service.chunk_text",
+        "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
         side_effect=[["Original chunk."], ["Updated chunk."]],
     )
 
diff --git a/surfsense_backend/tests/integration/test_obsidian_plugin_routes.py b/surfsense_backend/tests/integration/test_obsidian_plugin_routes.py
new file mode 100644
index 0000000000..22f6c6de5e
--- /dev/null
+++ b/surfsense_backend/tests/integration/test_obsidian_plugin_routes.py
@@ -0,0 +1,629 @@
+"""Integration tests for the Obsidian plugin HTTP wire contract.
+
+Three concerns:
+
+1. The /connect upsert really collapses concurrent first-time connects to
+   exactly one row. This locks the partial unique index from migration 129
+   to its purpose.
+2. The fingerprint dedup path: a second device connecting with a fresh
+   ``vault_id`` but the same ``vault_fingerprint`` adopts the existing
+   connector instead of creating a duplicate.
+3. The end-to-end response shapes returned by /connect /sync /rename
+   /notes /manifest /stats match the schemas the plugin's TypeScript
+   decoders expect. Each renamed field is a contract change, and a smoke
+   pass like this is the cheapest way to catch a future drift before it
+   ships.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import uuid
+from datetime import UTC, datetime
+from unittest.mock import AsyncMock, patch
+
+import pytest
+import pytest_asyncio
+from sqlalchemy import func, select, text
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import (
+    SearchSourceConnector,
+    SearchSourceConnectorType,
+    SearchSpace,
+    User,
+)
+from app.routes.obsidian_plugin_routes import (
+    obsidian_connect,
+    obsidian_delete_notes,
+    obsidian_manifest,
+    obsidian_rename,
+    obsidian_stats,
+    obsidian_sync,
+)
+from app.schemas.obsidian_plugin import (
+    ConnectRequest,
+    DeleteAck,
+    DeleteBatchRequest,
+    HeadingRef,
+    ManifestResponse,
+    NotePayload,
+    RenameAck,
+    RenameBatchRequest,
+    RenameItem,
+    StatsResponse,
+    SyncAck,
+    SyncBatchRequest,
+)
+
+pytestmark = pytest.mark.integration
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_note_payload(vault_id: str, path: str, content_hash: str) -> NotePayload:
+    """Minimal NotePayload that the schema accepts; the indexer is mocked
+    out so the values don't have to round-trip through the real pipeline."""
+    now = datetime.now(UTC)
+    return NotePayload(
+        vault_id=vault_id,
+        path=path,
+        name=path.rsplit("/", 1)[-1].rsplit(".", 1)[0],
+        extension="md",
+        content="# Test\n\nbody",
+        headings=[HeadingRef(heading="Test", level=1)],
+        content_hash=content_hash,
+        mtime=now,
+        ctime=now,
+    )
+
+
+@pytest_asyncio.fixture
+async def race_user_and_space(async_engine):
+    """User + SearchSpace committed via the live engine so the two
+    concurrent /connect sessions in the race test can both see them.
+
+    We can't use the savepoint-trapped ``db_session`` fixture here
+    because the concurrent sessions need to see committed rows.
+    """
+    user_id = uuid.uuid4()
+    async with AsyncSession(async_engine) as setup:
+        user = User(
+            id=user_id,
+            email=f"obsidian-race-{uuid.uuid4()}@surfsense.test",
+            hashed_password="x",
+            is_active=True,
+            is_superuser=False,
+            is_verified=True,
+        )
+        space = SearchSpace(name="Race Space", user_id=user_id)
+        setup.add_all([user, space])
+        await setup.commit()
+        await setup.refresh(space)
+        space_id = space.id
+
+    yield user_id, space_id
+
+    async with AsyncSession(async_engine) as cleanup:
+        # Order matters: connectors -> documents -> space -> user. The
+        # connectors test creates documents, so we wipe them too. The
+        # CASCADE on user_id catches anything we missed.
+        await cleanup.execute(
+            text("DELETE FROM search_source_connectors WHERE user_id = :uid"),
+            {"uid": user_id},
+        )
+        await cleanup.execute(
+            text("DELETE FROM searchspaces WHERE id = :id"),
+            {"id": space_id},
+        )
+        await cleanup.execute(
+            text('DELETE FROM "user" WHERE id = :uid'),
+            {"uid": user_id},
+        )
+        await cleanup.commit()
+
+
+# ---------------------------------------------------------------------------
+# /connect race + index enforcement
+# ---------------------------------------------------------------------------
+
+
+class TestConnectRace:
+    async def test_concurrent_first_connects_collapse_to_one_row(
+        self, async_engine, race_user_and_space
+    ):
+        """Two simultaneous /connect calls for the same vault should
+        produce exactly one row, not two. Same vault_id + same
+        fingerprint funnels through both partial unique indexes; the
+        loser falls back to the survivor row via the IntegrityError
+        branch in obsidian_connect."""
+        user_id, space_id = race_user_and_space
+        vault_id = str(uuid.uuid4())
+        fingerprint = "fp-" + uuid.uuid4().hex
+
+        async def _call(name_suffix: str) -> None:
+            async with AsyncSession(async_engine) as s:
+                fresh_user = await s.get(User, user_id)
+                payload = ConnectRequest(
+                    vault_id=vault_id,
+                    vault_name=f"My Vault {name_suffix}",
+                    search_space_id=space_id,
+                    vault_fingerprint=fingerprint,
+                )
+                await obsidian_connect(payload, user=fresh_user, session=s)
+
+        results = await asyncio.gather(_call("a"), _call("b"), return_exceptions=True)
+        for r in results:
+            assert not isinstance(r, Exception), f"Connect raised: {r!r}"
+
+        async with AsyncSession(async_engine) as verify:
+            count = (
+                await verify.execute(
+                    select(func.count(SearchSourceConnector.id)).where(
+                        SearchSourceConnector.user_id == user_id,
+                    )
+                )
+            ).scalar_one()
+            assert count == 1
+
+    async def test_partial_unique_index_blocks_raw_duplicate(
+        self, async_engine, race_user_and_space
+    ):
+        """Raw INSERTs that bypass the route must still be blocked by
+        the partial unique indexes from migration 129."""
+        user_id, space_id = race_user_and_space
+        vault_id = str(uuid.uuid4())
+
+        async with AsyncSession(async_engine) as s:
+            s.add(
+                SearchSourceConnector(
+                    name="Obsidian - First",
+                    connector_type=SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+                    is_indexable=False,
+                    config={
+                        "vault_id": vault_id,
+                        "vault_name": "First",
+                        "source": "plugin",
+                        "vault_fingerprint": "fp-1",
+                    },
+                    user_id=user_id,
+                    search_space_id=space_id,
+                )
+            )
+            await s.commit()
+
+        with pytest.raises(IntegrityError):
+            async with AsyncSession(async_engine) as s:
+                s.add(
+                    SearchSourceConnector(
+                        name="Obsidian - Second",
+                        connector_type=SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+                        is_indexable=False,
+                        config={
+                            "vault_id": vault_id,
+                            "vault_name": "Second",
+                            "source": "plugin",
+                            "vault_fingerprint": "fp-2",
+                        },
+                        user_id=user_id,
+                        search_space_id=space_id,
+                    )
+                )
+                await s.commit()
+
+    async def test_fingerprint_blocks_raw_cross_device_duplicate(
+        self, async_engine, race_user_and_space
+    ):
+        """Two connectors for the same user with different vault_ids but
+        the same fingerprint cannot coexist."""
+        user_id, space_id = race_user_and_space
+        fingerprint = "fp-" + uuid.uuid4().hex
+
+        async with AsyncSession(async_engine) as s:
+            s.add(
+                SearchSourceConnector(
+                    name="Obsidian - Desktop",
+                    connector_type=SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+                    is_indexable=False,
+                    config={
+                        "vault_id": str(uuid.uuid4()),
+                        "vault_name": "Vault",
+                        "source": "plugin",
+                        "vault_fingerprint": fingerprint,
+                    },
+                    user_id=user_id,
+                    search_space_id=space_id,
+                )
+            )
+            await s.commit()
+
+        with pytest.raises(IntegrityError):
+            async with AsyncSession(async_engine) as s:
+                s.add(
+                    SearchSourceConnector(
+                        name="Obsidian - Mobile",
+                        connector_type=SearchSourceConnectorType.OBSIDIAN_CONNECTOR,
+                        is_indexable=False,
+                        config={
+                            "vault_id": str(uuid.uuid4()),
+                            "vault_name": "Vault",
+                            "source": "plugin",
+                            "vault_fingerprint": fingerprint,
+                        },
+                        user_id=user_id,
+                        search_space_id=space_id,
+                    )
+                )
+                await s.commit()
+
+    async def test_second_device_adopts_existing_connector_via_fingerprint(
+        self, async_engine, race_user_and_space
+    ):
+        """Device A connects with vault_id=A. Device B then connects with
+        a fresh vault_id=B but the same fingerprint. The route must
+        return A's identity (not create a B row), proving cross-device
+        dedup happens transparently to the plugin."""
+        user_id, space_id = race_user_and_space
+        vault_id_a = str(uuid.uuid4())
+        vault_id_b = str(uuid.uuid4())
+        fingerprint = "fp-" + uuid.uuid4().hex
+
+        async with AsyncSession(async_engine) as s:
+            fresh_user = await s.get(User, user_id)
+            resp_a = await obsidian_connect(
+                ConnectRequest(
+                    vault_id=vault_id_a,
+                    vault_name="Shared Vault",
+                    search_space_id=space_id,
+                    vault_fingerprint=fingerprint,
+                ),
+                user=fresh_user,
+                session=s,
+            )
+
+        async with AsyncSession(async_engine) as s:
+            fresh_user = await s.get(User, user_id)
+            resp_b = await obsidian_connect(
+                ConnectRequest(
+                    vault_id=vault_id_b,
+                    vault_name="Shared Vault",
+                    search_space_id=space_id,
+                    vault_fingerprint=fingerprint,
+                ),
+                user=fresh_user,
+                session=s,
+            )
+
+        assert resp_b.vault_id == vault_id_a
+        assert resp_b.connector_id == resp_a.connector_id
+
+        async with AsyncSession(async_engine) as verify:
+            count = (
+                await verify.execute(
+                    select(func.count(SearchSourceConnector.id)).where(
+                        SearchSourceConnector.user_id == user_id,
+                    )
+                )
+            ).scalar_one()
+            assert count == 1
+
+
+# ---------------------------------------------------------------------------
+# Combined wire-shape smoke test
+# ---------------------------------------------------------------------------
+
+
+class TestWireContractSmoke:
+    """Walks /connect -> /sync -> /rename -> /notes -> /manifest -> /stats
+    sequentially and asserts each response matches the new schema. With
+    `response_model=` on every route, FastAPI is already validating the
+    shape on real traffic; this test mainly guards against accidental
+    field renames the way the TypeScript decoder would catch them."""
+
+    async def test_full_flow_returns_typed_payloads(
+        self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+    ):
+        vault_id = str(uuid.uuid4())
+
+        # 1. /connect
+        connect_resp = await obsidian_connect(
+            ConnectRequest(
+                vault_id=vault_id,
+                vault_name="Smoke Vault",
+                search_space_id=db_search_space.id,
+                vault_fingerprint="fp-" + uuid.uuid4().hex,
+            ),
+            user=db_user,
+            session=db_session,
+        )
+        assert connect_resp.connector_id > 0
+        assert connect_resp.vault_id == vault_id
+        assert "sync" in connect_resp.capabilities
+        assert connect_resp.server_time_utc is not None
+
+        # 2. /sync — stub the indexer so the call doesn't drag the LLM /
+        # embedding pipeline in. We're testing the wire contract, not the
+        # indexer itself.
+        fake_doc = type("FakeDoc", (), {"id": 12345})()
+        with patch(
+            "app.routes.obsidian_plugin_routes.upsert_note",
+            new=AsyncMock(return_value=fake_doc),
+        ):
+            sync_resp = await obsidian_sync(
+                SyncBatchRequest(
+                    vault_id=vault_id,
+                    notes=[
+                        _make_note_payload(vault_id, "ok.md", "hash-ok"),
+                        _make_note_payload(vault_id, "fail.md", "hash-fail"),
+                    ],
+                ),
+                user=db_user,
+                session=db_session,
+            )
+
+        assert isinstance(sync_resp, SyncAck)
+        assert sync_resp.vault_id == vault_id
+        assert sync_resp.indexed == 2
+        assert sync_resp.failed == 0
+        assert len(sync_resp.items) == 2
+        assert all(it.status == "ok" for it in sync_resp.items)
+        # The TypeScript decoder filters on items[].status === "error" and
+        # extracts .path, so confirm both fields are present and named.
+        assert {it.path for it in sync_resp.items} == {"ok.md", "fail.md"}
+
+        # 2b. Re-run /sync but force the indexer to raise on one note so
+        # the per-item failure decoder gets exercised end-to-end.
+        async def _selective_upsert(session, *, connector, payload, user_id):
+            if payload.path == "fail.md":
+                raise RuntimeError("simulated indexing failure")
+            return fake_doc
+
+        with patch(
+            "app.routes.obsidian_plugin_routes.upsert_note",
+            new=AsyncMock(side_effect=_selective_upsert),
+        ):
+            sync_resp = await obsidian_sync(
+                SyncBatchRequest(
+                    vault_id=vault_id,
+                    notes=[
+                        _make_note_payload(vault_id, "ok.md", "h1"),
+                        _make_note_payload(vault_id, "fail.md", "h2"),
+                    ],
+                ),
+                user=db_user,
+                session=db_session,
+            )
+        assert sync_resp.indexed == 1
+        assert sync_resp.failed == 1
+        statuses = {it.path: it.status for it in sync_resp.items}
+        assert statuses == {"ok.md": "ok", "fail.md": "error"}
+
+        # 3. /rename — patch rename_note so we don't need a real Document.
+        async def _rename(*args, **kwargs) -> object:
+            if kwargs.get("old_path") == "missing.md":
+                return None
+            return fake_doc
+
+        with patch(
+            "app.routes.obsidian_plugin_routes.rename_note",
+            new=AsyncMock(side_effect=_rename),
+        ):
+            rename_resp = await obsidian_rename(
+                RenameBatchRequest(
+                    vault_id=vault_id,
+                    renames=[
+                        RenameItem(old_path="a.md", new_path="b.md"),
+                        RenameItem(old_path="missing.md", new_path="x.md"),
+                    ],
+                ),
+                user=db_user,
+                session=db_session,
+            )
+        assert isinstance(rename_resp, RenameAck)
+        assert rename_resp.renamed == 1
+        assert rename_resp.missing == 1
+        assert {it.status for it in rename_resp.items} == {"ok", "missing"}
+        # snake_case fields are deliberate — the plugin decoder maps them
+        # to camelCase explicitly.
+        assert all(it.old_path and it.new_path for it in rename_resp.items)
+
+        # 4. /notes DELETE
+        async def _delete(*args, **kwargs) -> bool:
+            return kwargs.get("path") != "ghost.md"
+
+        with patch(
+            "app.routes.obsidian_plugin_routes.delete_note",
+            new=AsyncMock(side_effect=_delete),
+        ):
+            delete_resp = await obsidian_delete_notes(
+                DeleteBatchRequest(vault_id=vault_id, paths=["b.md", "ghost.md"]),
+                user=db_user,
+                session=db_session,
+            )
+        assert isinstance(delete_resp, DeleteAck)
+        assert delete_resp.deleted == 1
+        assert delete_resp.missing == 1
+        assert {it.path: it.status for it in delete_resp.items} == {
+            "b.md": "ok",
+            "ghost.md": "missing",
+        }
+
+        # 5. /manifest — empty (no real Documents were created because
+        # upsert_note was mocked) but the response shape is what we care
+        # about.
+        manifest_resp = await obsidian_manifest(
+            vault_id=vault_id, user=db_user, session=db_session
+        )
+        assert isinstance(manifest_resp, ManifestResponse)
+        assert manifest_resp.vault_id == vault_id
+        assert manifest_resp.items == {}
+
+        # 6. /stats — same; row count is 0 because upsert_note was mocked.
+        stats_resp = await obsidian_stats(
+            vault_id=vault_id, user=db_user, session=db_session
+        )
+        assert isinstance(stats_resp, StatsResponse)
+        assert stats_resp.vault_id == vault_id
+        assert stats_resp.files_synced == 0
+        assert stats_resp.last_sync_at is None
+
+    async def test_sync_queues_binary_attachments(
+        self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+    ):
+        vault_id = str(uuid.uuid4())
+        await obsidian_connect(
+            ConnectRequest(
+                vault_id=vault_id,
+                vault_name="Queue Vault",
+                search_space_id=db_search_space.id,
+                vault_fingerprint="fp-" + uuid.uuid4().hex,
+            ),
+            user=db_user,
+            session=db_session,
+        )
+
+        fake_doc = type("FakeDoc", (), {"id": 12345})()
+        binary_note = _make_note_payload(vault_id, "image.png", "hash-bin")
+        binary_note.extension = "png"
+        binary_note.is_binary = True
+        binary_note.binary_base64 = "aGVsbG8="
+        binary_note.mime_type = "image/png"
+        binary_note.content = ""
+
+        with (
+            patch(
+                "app.routes.obsidian_plugin_routes.upsert_note",
+                new=AsyncMock(return_value=fake_doc),
+            ) as upsert_mock,
+            patch(
+                "app.routes.obsidian_plugin_routes._queue_obsidian_attachment"
+            ) as queue_mock,
+        ):
+            sync_resp = await obsidian_sync(
+                SyncBatchRequest(
+                    vault_id=vault_id,
+                    notes=[
+                        _make_note_payload(vault_id, "ok.md", "hash-ok"),
+                        binary_note,
+                    ],
+                ),
+                user=db_user,
+                session=db_session,
+            )
+
+        assert sync_resp.indexed == 2
+        assert sync_resp.failed == 0
+        statuses = {it.path: it.status for it in sync_resp.items}
+        assert statuses == {"ok.md": "ok", "image.png": "queued"}
+        assert upsert_mock.await_count == 1
+        queue_mock.assert_called_once()
+
+    async def test_sync_rejects_unsupported_attachment_extension(
+        self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+    ):
+        vault_id = str(uuid.uuid4())
+        await obsidian_connect(
+            ConnectRequest(
+                vault_id=vault_id,
+                vault_name="Reject Vault",
+                search_space_id=db_search_space.id,
+                vault_fingerprint="fp-" + uuid.uuid4().hex,
+            ),
+            user=db_user,
+            session=db_session,
+        )
+
+        fake_doc = type("FakeDoc", (), {"id": 12345})()
+        bad_note = _make_note_payload(vault_id, "photo.heic", "hash-heic")
+        bad_note.extension = "heic"
+        bad_note.is_binary = True
+        bad_note.binary_base64 = "aGVsbG8="
+        bad_note.mime_type = "image/heic"
+        bad_note.content = ""
+
+        with (
+            patch(
+                "app.routes.obsidian_plugin_routes.upsert_note",
+                new=AsyncMock(return_value=fake_doc),
+            ),
+            patch(
+                "app.routes.obsidian_plugin_routes._queue_obsidian_attachment"
+            ) as queue_mock,
+        ):
+            sync_resp = await obsidian_sync(
+                SyncBatchRequest(
+                    vault_id=vault_id,
+                    notes=[
+                        _make_note_payload(vault_id, "ok.md", "hash-ok"),
+                        bad_note,
+                    ],
+                ),
+                user=db_user,
+                session=db_session,
+            )
+
+        assert sync_resp.indexed == 1
+        assert sync_resp.failed == 1
+        items_by_path = {it.path: it for it in sync_resp.items}
+        assert items_by_path["ok.md"].status == "ok"
+        assert items_by_path["photo.heic"].status == "error"
+        assert "unsupported attachment extension" in (
+            items_by_path["photo.heic"].error or ""
+        )
+        queue_mock.assert_not_called()
+
+    async def test_sync_rejects_mime_extension_mismatch(
+        self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+    ):
+        vault_id = str(uuid.uuid4())
+        await obsidian_connect(
+            ConnectRequest(
+                vault_id=vault_id,
+                vault_name="Mismatch Vault",
+                search_space_id=db_search_space.id,
+                vault_fingerprint="fp-" + uuid.uuid4().hex,
+            ),
+            user=db_user,
+            session=db_session,
+        )
+
+        fake_doc = type("FakeDoc", (), {"id": 12345})()
+        mismatched = _make_note_payload(vault_id, "image.png", "hash-png")
+        mismatched.extension = "png"
+        mismatched.is_binary = True
+        mismatched.binary_base64 = "aGVsbG8="
+        mismatched.mime_type = "application/pdf"
+        mismatched.content = ""
+
+        with (
+            patch(
+                "app.routes.obsidian_plugin_routes.upsert_note",
+                new=AsyncMock(return_value=fake_doc),
+            ),
+            patch(
+                "app.routes.obsidian_plugin_routes._queue_obsidian_attachment"
+            ) as queue_mock,
+        ):
+            sync_resp = await obsidian_sync(
+                SyncBatchRequest(
+                    vault_id=vault_id,
+                    notes=[
+                        _make_note_payload(vault_id, "ok.md", "hash-ok"),
+                        mismatched,
+                    ],
+                ),
+                user=db_user,
+                session=db_session,
+            )
+
+        assert sync_resp.indexed == 1
+        assert sync_resp.failed == 1
+        items_by_path = {it.path: it for it in sync_resp.items}
+        assert items_by_path["ok.md"].status == "ok"
+        assert items_by_path["image.png"].status == "error"
+        assert "does not match extension" in (items_by_path["image.png"].error or "")
+        queue_mock.assert_not_called()
diff --git a/surfsense_backend/tests/unit/agents/__init__.py b/surfsense_backend/tests/unit/agents/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_hitl_bridge.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_hitl_bridge.py
new file mode 100644
index 0000000000..72408a5d91
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_hitl_bridge.py
@@ -0,0 +1,406 @@
+"""End-to-end resume-bridge tests against a real LangGraph subagent."""
+
+from __future__ import annotations
+
+import ast
+import asyncio
+from types import SimpleNamespace
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.types import Command, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.config import (
+    subagent_invoke_config,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubagentState(TypedDict, total=False):
+    messages: list
+    decision_text: str
+
+
+def _build_single_interrupt_subagent():
+    def approve_node(state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {
+                        "name": "do_thing",
+                        "args": {"x": 1},
+                        "description": "test action",
+                    }
+                ],
+                "review_configs": [{}],
+            }
+        )
+        return {
+            "messages": [AIMessage(content="done")],
+            "decision_text": repr(decision),
+        }
+
+    graph = StateGraph(_SubagentState)
+    graph.add_node("approve", approve_node)
+    graph.add_edge(START, "approve")
+    graph.add_edge("approve", END)
+    return graph.compile(checkpointer=InMemorySaver())
+
+
+def _make_runtime(config: dict, *, tool_call_id: str = "parent-tcid-1") -> ToolRuntime:
+    return ToolRuntime(
+        state={"messages": [HumanMessage(content="seed")]},
+        context=None,
+        config=config,
+        stream_writer=None,
+        tool_call_id=tool_call_id,
+        store=None,
+    )
+
+
+def _prime_subagent_at_runtime_thread(subagent, runtime: ToolRuntime) -> dict:
+    """Build the per-call ``RunnableConfig`` the production ``task`` tool will use.
+
+    Mirrors what the ``task`` tool does on first invocation so test fixtures
+    can prime the subagent's pending interrupt at the same checkpoint slot
+    (per-call ``thread_id``) the bridge looks at on resume.
+    """
+    return subagent_invoke_config(runtime)
+
+
+@pytest.mark.asyncio
+async def test_resume_bridge_dispatches_decision_into_pending_subagent():
+    """Side-channel decision must reach the subagent's pending interrupt verbatim."""
+    subagent = _build_single_interrupt_subagent()
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {
+                "name": "approver",
+                "description": "approves things",
+                "runnable": subagent,
+            }
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "shared-thread"},
+        "recursion_limit": 100,
+    }
+    runtime = _make_runtime(parent_config)
+    sub_config = _prime_subagent_at_runtime_thread(subagent, runtime)
+    await subagent.ainvoke({"messages": [HumanMessage(content="seed")]}, sub_config)
+    snap = await subagent.aget_state(sub_config)
+    assert snap.tasks and snap.tasks[0].interrupts, (
+        "fixture broken: subagent should be paused on its interrupt"
+    )
+
+    parent_config["configurable"]["surfsense_resume_value"] = {
+        runtime.tool_call_id: {"decisions": ["APPROVED"]}
+    }
+
+    result = await task_tool.coroutine(
+        description="please approve",
+        subagent_type="approver",
+        runtime=runtime,
+    )
+
+    assert isinstance(result, Command)
+    update = result.update
+    assert update["decision_text"] == repr({"decisions": ["APPROVED"]})
+    assert "surfsense_resume_value" not in parent_config["configurable"]
+
+    final = await subagent.aget_state(sub_config)
+    assert not final.tasks or all(not t.interrupts for t in final.tasks)
+
+
+@pytest.mark.asyncio
+async def test_pending_interrupt_without_resume_value_raises_runtime_error():
+    """Bridge must fail loud rather than silently replay the user's interrupt."""
+    subagent = _build_single_interrupt_subagent()
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {
+                "name": "approver",
+                "description": "approves things",
+                "runnable": subagent,
+            }
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "guard-thread"},
+        "recursion_limit": 100,
+    }
+    runtime = _make_runtime(parent_config)
+    sub_config = _prime_subagent_at_runtime_thread(subagent, runtime)
+    await subagent.ainvoke({"messages": [HumanMessage(content="seed")]}, sub_config)
+    snap = await subagent.aget_state(sub_config)
+    assert snap.tasks and snap.tasks[0].interrupts, "fixture broken"
+
+    with pytest.raises(RuntimeError, match="resume bridge is broken"):
+        await task_tool.coroutine(
+            description="please approve",
+            subagent_type="approver",
+            runtime=runtime,
+        )
+
+
+def _build_bundle_subagent():
+    def bundle_node(state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {"name": "create_a", "args": {}, "description": ""},
+                    {"name": "create_b", "args": {}, "description": ""},
+                    {"name": "create_c", "args": {}, "description": ""},
+                ],
+                "review_configs": [{}, {}, {}],
+            }
+        )
+        return {
+            "messages": [AIMessage(content="bundle-done")],
+            "decision_text": repr(decision),
+        }
+
+    graph = StateGraph(_SubagentState)
+    graph.add_node("bundle", bundle_node)
+    graph.add_edge(START, "bundle")
+    graph.add_edge("bundle", END)
+    return graph.compile(checkpointer=InMemorySaver())
+
+
+@pytest.mark.asyncio
+async def test_bundle_three_mixed_decisions_arrive_in_order():
+    """Approve / edit / reject for a 3-action bundle must land at ordinals 0/1/2."""
+    subagent = _build_bundle_subagent()
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {
+                "name": "bundler",
+                "description": "creates a bundle",
+                "runnable": subagent,
+            }
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "bundle-thread"},
+        "recursion_limit": 100,
+    }
+    runtime = _make_runtime(parent_config)
+    sub_config = _prime_subagent_at_runtime_thread(subagent, runtime)
+    await subagent.ainvoke({"messages": [HumanMessage(content="seed")]}, sub_config)
+
+    decisions_payload = {
+        "decisions": [
+            {"type": "approve", "args": {}},
+            {"type": "edit", "args": {"args": {"name": "edited-b"}}},
+            {"type": "reject", "args": {"message": "no thanks"}},
+        ]
+    }
+    parent_config["configurable"]["surfsense_resume_value"] = {
+        runtime.tool_call_id: decisions_payload
+    }
+
+    result = await task_tool.coroutine(
+        description="run bundle",
+        subagent_type="bundler",
+        runtime=runtime,
+    )
+
+    assert isinstance(result, Command)
+    received = ast.literal_eval(result.update["decision_text"])
+    assert received == decisions_payload
+    assert received["decisions"][0]["type"] == "approve"
+    assert received["decisions"][1]["type"] == "edit"
+    assert received["decisions"][1]["args"] == {"args": {"name": "edited-b"}}
+    assert received["decisions"][2]["type"] == "reject"
+
+
+@pytest.mark.asyncio
+async def test_parallel_atask_routes_each_decision_to_its_own_subagent():
+    """Two ``atask`` calls with distinct ``tool_call_id``s must each get their own decision.
+
+    With per-call ``thread_id`` isolation and per-call resume keying, A's
+    decision must reach A's pending interrupt and B's must reach B's. They
+    must NOT cross-contaminate even though they share ``configurable``.
+    """
+    subagent_a = _build_single_interrupt_subagent()
+    subagent_b = _build_single_interrupt_subagent()
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {
+                "name": "approver_a",
+                "description": "approves A",
+                "runnable": subagent_a,
+            },
+            {
+                "name": "approver_b",
+                "description": "approves B",
+                "runnable": subagent_b,
+            },
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "parallel-thread"},
+        "recursion_limit": 100,
+    }
+
+    runtime_a = _make_runtime(parent_config, tool_call_id="tcid-A")
+    runtime_b = _make_runtime(parent_config, tool_call_id="tcid-B")
+
+    sub_config_a = _prime_subagent_at_runtime_thread(subagent_a, runtime_a)
+    sub_config_b = _prime_subagent_at_runtime_thread(subagent_b, runtime_b)
+
+    await subagent_a.ainvoke(
+        {"messages": [HumanMessage(content="seed-A")]}, sub_config_a
+    )
+    await subagent_b.ainvoke(
+        {"messages": [HumanMessage(content="seed-B")]}, sub_config_b
+    )
+
+    parent_config["configurable"]["surfsense_resume_value"] = {
+        "tcid-A": {"decisions": ["DECISION-A"]},
+        "tcid-B": {"decisions": ["DECISION-B"]},
+    }
+
+    result_a, result_b = await asyncio.gather(
+        task_tool.coroutine(
+            description="please approve A",
+            subagent_type="approver_a",
+            runtime=runtime_a,
+        ),
+        task_tool.coroutine(
+            description="please approve B",
+            subagent_type="approver_b",
+            runtime=runtime_b,
+        ),
+    )
+
+    assert isinstance(result_a, Command)
+    assert isinstance(result_b, Command)
+    assert result_a.update["decision_text"] == repr({"decisions": ["DECISION-A"]})
+    assert result_b.update["decision_text"] == repr({"decisions": ["DECISION-B"]})
+
+    assert "surfsense_resume_value" not in parent_config["configurable"]
+
+
+@pytest.mark.asyncio
+async def test_full_resume_routing_glue_for_two_paused_subagents():
+    """End-to-end: extractor + slicer + bridge correctly route a flat decisions list.
+
+    This simulates exactly what ``stream_resume_chat`` will do on resume:
+    given a paused parent state with two pending interrupts (one per
+    subagent) and a flat ``decisions`` list, build the per-tool-call dict
+    via ``collect_pending_tool_calls`` + ``slice_decisions_by_tool_call``,
+    then resume the bridge concurrently and verify each subagent received
+    only its own slice.
+    """
+    subagent_a = _build_bundle_subagent()
+    subagent_b = _build_single_interrupt_subagent()
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {
+                "name": "bundler",
+                "description": "three-action bundle",
+                "runnable": subagent_a,
+            },
+            {
+                "name": "approver",
+                "description": "single approval",
+                "runnable": subagent_b,
+            },
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "glue-thread"},
+        "recursion_limit": 100,
+    }
+
+    runtime_a = _make_runtime(parent_config, tool_call_id="tcid-bundler")
+    runtime_b = _make_runtime(parent_config, tool_call_id="tcid-approver")
+
+    sub_config_a = _prime_subagent_at_runtime_thread(subagent_a, runtime_a)
+    sub_config_b = _prime_subagent_at_runtime_thread(subagent_b, runtime_b)
+
+    await subagent_a.ainvoke(
+        {"messages": [HumanMessage(content="seed-A")]}, sub_config_a
+    )
+    await subagent_b.ainvoke(
+        {"messages": [HumanMessage(content="seed-B")]}, sub_config_b
+    )
+
+    # Synthetic parent state mirroring what the parent's pregel would have
+    # bundled: one Interrupt per subagent, value carrying tool_call_id +
+    # action_requests (exactly the shape ``propagation.wrap_with_tool_call_id``
+    # produces).
+    parent_interrupts = (
+        SimpleNamespace(
+            id="i-bundler",
+            value={
+                "action_requests": [
+                    {"name": "create_a", "args": {}, "description": ""},
+                    {"name": "create_b", "args": {}, "description": ""},
+                    {"name": "create_c", "args": {}, "description": ""},
+                ],
+                "review_configs": [{}, {}, {}],
+                "tool_call_id": "tcid-bundler",
+            },
+        ),
+        SimpleNamespace(
+            id="i-approver",
+            value={
+                "action_requests": [{"name": "approve", "args": {}, "description": ""}],
+                "review_configs": [{}],
+                "tool_call_id": "tcid-approver",
+            },
+        ),
+    )
+    parent_state = SimpleNamespace(interrupts=parent_interrupts)
+
+    flat_decisions = [
+        {"type": "approve"},
+        {"type": "edit", "args": {"args": {"name": "edited-b"}}},
+        {"type": "reject", "args": {"message": "no thanks"}},
+        {"type": "approve"},
+    ]
+
+    pending = collect_pending_tool_calls(parent_state)
+    assert pending == [("tcid-bundler", 3), ("tcid-approver", 1)]
+
+    routed = slice_decisions_by_tool_call(flat_decisions, pending)
+    parent_config["configurable"]["surfsense_resume_value"] = routed
+
+    result_a, result_b = await asyncio.gather(
+        task_tool.coroutine(
+            description="run bundle",
+            subagent_type="bundler",
+            runtime=runtime_a,
+        ),
+        task_tool.coroutine(
+            description="please approve",
+            subagent_type="approver",
+            runtime=runtime_b,
+        ),
+    )
+
+    assert isinstance(result_a, Command)
+    assert isinstance(result_b, Command)
+
+    received_a = ast.literal_eval(result_a.update["decision_text"])
+    assert received_a == {"decisions": flat_decisions[0:3]}
+    assert result_b.update["decision_text"] == repr({"decisions": flat_decisions[3:4]})
+
+    assert "surfsense_resume_value" not in parent_config["configurable"]
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_heterogeneous_decisions.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_heterogeneous_decisions.py
new file mode 100644
index 0000000000..d4a68939e5
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_heterogeneous_decisions.py
@@ -0,0 +1,259 @@
+"""Real-graph contract: heterogeneous decisions route correctly across parallel subagents.
+
+The simple "approve everything" parallel test (see
+``test_parallel_resume_command_keying``) proves the routing wires up at all,
+but it doesn't exercise the actual production user flow: rejecting one card
+while approving another, or editing one action's args before approving the
+rest. Those are the decisions ``HumanInTheLoopMiddleware`` differentiates on,
+and they're exactly where a slicer/router bug silently mis-applies a reject
+to the wrong subagent.
+
+This module pins:
+
+1. **Order preservation** across the slice boundary — flat decisions enter
+   in the order the SSE stream rendered cards; each subagent must receive
+   only its slice in the original order.
+2. **Per-decision metadata pass-through** — ``message`` and ``edited_action``
+   payloads must reach the subagent intact (not just the ``type`` discriminator).
+3. **Off-by-one-sensitive bundle sizes** — both paused subagents have action
+   counts ``> 1`` (``2`` and ``3``). With those sizes a buggy
+   ``cursor += 1`` slicer (instead of ``cursor += action_count``) produces a
+   different B-slice from the correct one, so this test catches the most
+   common refactor mistake. A ``(1, 2)`` configuration would silently pass
+   such a bug because ``+= 1`` and ``+= count`` are arithmetically identical
+   when ``count == 1``.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+from typing import Annotated
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command, Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    build_lg_resume_map,
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubState(TypedDict, total=False):
+    messages: list
+
+
+class _DispatchState(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+    tcid: str
+    desc: str
+    subtype: str
+
+
+def _build_capturing_subagent(checkpointer: InMemorySaver, *, action_count: int):
+    """Subagent that pauses with an N-action bundle and on resume records what it received.
+
+    The recorded ``AIMessage`` content is the JSON-serialized resume payload, so
+    the assertions can inspect exactly which decisions reached this subagent
+    (vs. its sibling) — including the ``message`` and ``edited_action``
+    metadata, not just the ``type``.
+    """
+
+    def hitl_node(_state):
+        decision_payload = interrupt(
+            {
+                "action_requests": [
+                    {
+                        "name": f"act_{i}",
+                        "args": {"i": i},
+                        "description": f"action {i}",
+                    }
+                    for i in range(action_count)
+                ],
+                "review_configs": [
+                    {
+                        "action_name": f"act_{i}",
+                        "allowed_decisions": ["approve", "reject", "edit"],
+                    }
+                    for i in range(action_count)
+                ],
+            }
+        )
+        return {
+            "messages": [
+                AIMessage(content=json.dumps(decision_payload, sort_keys=True))
+            ]
+        }
+
+    g = StateGraph(_SubState)
+    g.add_node("hitl", hitl_node)
+    g.add_edge(START, "hitl")
+    g.add_edge("hitl", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_dispatching_two_subagents(
+    task_tool, *, dispatches: list[dict[str, str]], checkpointer
+):
+    """Parent that fans out to ``len(dispatches)`` parallel ``task`` tool calls.
+
+    Each entry in ``dispatches`` is ``{"tcid": ..., "subtype": ..., "desc": ...}``
+    so different parallel branches can target different subagent types — the
+    actual production scenario (Linear + Jira, etc.).
+    """
+
+    def fanout(_state) -> list[Send]:
+        return [Send("call_task", d) for d in dispatches]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type=state["subtype"], runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+@pytest.mark.asyncio
+async def test_heterogeneous_decisions_route_to_correct_subagents_with_metadata_intact():
+    """Mixed approve/reject/edit decisions across two parallel subagents.
+
+    Setup chosen so the slicer's cursor arithmetic is sensitive to off-by-one
+    refactors:
+    - Sub-A pauses with a 2-action bundle (``act_0``, ``act_1``).
+    - Sub-B pauses with a 3-action bundle (``act_0``, ``act_1``, ``act_2``).
+    - Parent ends up with 2 pending interrupts (one per subagent).
+
+    With both counts ``> 1``, a buggy ``cursor += 1`` (instead of
+    ``cursor += action_count``) produces a different B-slice from the correct
+    one, so the assertions catch it. A ``(1, 2)`` configuration would not
+    because ``+= 1`` and ``+= count`` are arithmetically identical when
+    ``count == 1``.
+
+    The frontend submits a flat
+    ``[A_approve, A_reject, B_edit, B_approve, B_reject]`` list with distinct
+    ``message`` and ``edited_action`` payloads; our slicer must split into
+    ``{tcid_A: [A_approve, A_reject], tcid_B: [B_edit, B_approve, B_reject]}``
+    and the bridge must forward each subagent's slice intact — including all
+    metadata, in original order.
+    """
+    checkpointer = InMemorySaver()
+
+    sub_a = _build_capturing_subagent(checkpointer, action_count=2)
+    sub_b = _build_capturing_subagent(checkpointer, action_count=3)
+
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {"name": "agent-a", "description": "first", "runnable": sub_a},
+            {"name": "agent-b", "description": "second", "runnable": sub_b},
+        ]
+    )
+
+    parent = _parent_dispatching_two_subagents(
+        task_tool,
+        dispatches=[
+            {"tcid": "tcid-A", "subtype": "agent-a", "desc": "do A"},
+            {"tcid": "tcid-B", "subtype": "agent-b", "desc": "do B"},
+        ],
+        checkpointer=checkpointer,
+    )
+
+    config: dict = {
+        "configurable": {"thread_id": "het-decisions-thread"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    paused_state = await parent.aget_state(config)
+    assert len(paused_state.interrupts) == 2, (
+        f"fixture broken: expected 2 paused subagents, got {len(paused_state.interrupts)}"
+    )
+
+    pending = collect_pending_tool_calls(paused_state)
+    pending_by_tcid = dict(pending)
+    assert pending_by_tcid == {"tcid-A": 2, "tcid-B": 3}, (
+        f"REGRESSION: action-count accounting wrong; got {pending_by_tcid!r}"
+    )
+
+    a_approve = {"type": "approve"}
+    a_reject = {"type": "reject", "message": "A[1] looks redundant"}
+    b_edit = {
+        "type": "edit",
+        "edited_action": {"name": "act_0", "args": {"i": 0, "edited": True}},
+    }
+    b_approve = {"type": "approve"}
+    b_reject = {"type": "reject", "message": "B[2] needs more context"}
+    flat_decisions = [a_approve, a_reject, b_edit, b_approve, b_reject]
+
+    by_tool_call_id = slice_decisions_by_tool_call(flat_decisions, pending)
+
+    assert by_tool_call_id == {
+        "tcid-A": {"decisions": [a_approve, a_reject]},
+        "tcid-B": {"decisions": [b_edit, b_approve, b_reject]},
+    }, f"REGRESSION: slicer mis-routed decisions: {by_tool_call_id!r}"
+
+    config["configurable"]["surfsense_resume_value"] = by_tool_call_id
+    lg_resume_map = build_lg_resume_map(paused_state, by_tool_call_id)
+
+    await parent.ainvoke(Command(resume=lg_resume_map), config)
+
+    final_state = await parent.aget_state(config)
+    assert not final_state.interrupts, (
+        f"REGRESSION: leftover pending interrupts after resume: {final_state.interrupts!r}"
+    )
+
+    payloads: list[dict] = []
+    for msg in final_state.values.get("messages", []) or []:
+        content = getattr(msg, "content", None)
+        if isinstance(content, str):
+            with contextlib.suppress(json.JSONDecodeError):
+                payloads.append(json.loads(content))
+
+    expected_a = {"decisions": [a_approve, a_reject]}
+    expected_b = {"decisions": [b_edit, b_approve, b_reject]}
+
+    assert expected_a in payloads, (
+        f"REGRESSION: sub-A did not receive its 2-decision slice in original order; "
+        f"payloads seen: {payloads!r}"
+    )
+    assert expected_b in payloads, (
+        f"REGRESSION: sub-B did not receive its 3-decision slice in original order; "
+        f"payloads seen: {payloads!r}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_decision_count_mismatch_fails_loud_before_dispatch():
+    """The slicer must refuse a flat list whose total != sum(action_counts).
+
+    Otherwise a frontend/backend contract drift would silently send a
+    truncated/padded slice to one of the subagents — the worst possible
+    failure mode (mis-applied reject on a long-lived ticket).
+    """
+    pending = [("tcid-A", 1), ("tcid-B", 2)]
+    decisions = [{"type": "approve"}, {"type": "approve"}]
+
+    with pytest.raises(ValueError, match="Decision count mismatch"):
+        slice_decisions_by_tool_call(decisions, pending)
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_partial_pause_routing.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_partial_pause_routing.py
new file mode 100644
index 0000000000..1aba0c4809
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_partial_pause_routing.py
@@ -0,0 +1,253 @@
+"""Real-graph contract: one parallel branch completes while a sibling pauses with HITL.
+
+The two existing parallel-routing tests
+(``test_parallel_resume_command_keying`` and
+``test_parallel_heterogeneous_decisions``) both pause **all** branches
+simultaneously. That's the easy case — every dispatched ``task`` call has a
+matching pending interrupt, and the routing helpers see a uniform shape.
+
+Production rarely matches that uniform shape. The orchestrator typically
+delegates "create a Linear ticket and summarize the user's recent activity":
+one branch needs HITL, the other returns its result and exits. At the pause
+moment::
+
+    state.values["messages"] += [ToolMessage(from-A)]   # A merged in
+    state.interrupts          = [Interrupt(value-from-B)]   # B alone is pending
+
+So ``len(state.interrupts) < num_dispatched_tasks``. The slicer and
+``build_lg_resume_map`` must:
+
+1. **Key off ``state.interrupts``, never off the originally dispatched tcids.**
+   A flat decisions list of length 1 must route only to B; if anything tries
+   to look up A in the resume map, langgraph rejects an unknown
+   ``Interrupt.id``.
+2. **Leave A's contributions intact across resume.** A's ToolMessage was
+   committed at the pause; resuming the paused branch must not re-run A nor
+   drop its message.
+3. **Drain the single pending interrupt.** Final ``state.interrupts`` is
+   empty regardless of whether sibling branches were paused.
+
+The langgraph semantics this test relies on were verified empirically in the
+exploratory probe before this test was authored.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+from typing import Annotated
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command, Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    build_lg_resume_map,
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubState(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+
+
+class _DispatchState(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+    tcid: str
+    desc: str
+    subtype: str
+
+
+_QUICK_MARKER = "quick-subagent-finished-without-pausing"
+
+
+def _build_quick_subagent(checkpointer: InMemorySaver):
+    """Subagent that completes synchronously without firing any interrupt."""
+
+    def quick_node(_state):
+        return {"messages": [AIMessage(content=_QUICK_MARKER)]}
+
+    g = StateGraph(_SubState)
+    g.add_node("quick", quick_node)
+    g.add_edge(START, "quick")
+    g.add_edge("quick", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _build_pausing_subagent(checkpointer: InMemorySaver):
+    """Subagent that pauses with a single-action HITL bundle and records its resume payload."""
+
+    def hitl_node(_state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {"name": "act_0", "args": {"i": 0}, "description": ""}
+                ],
+                "review_configs": [
+                    {
+                        "action_name": "act_0",
+                        "allowed_decisions": ["approve", "reject", "edit"],
+                    }
+                ],
+            }
+        )
+        return {"messages": [AIMessage(content=json.dumps(decision, sort_keys=True))]}
+
+    g = StateGraph(_SubState)
+    g.add_node("hitl", hitl_node)
+    g.add_edge(START, "hitl")
+    g.add_edge("hitl", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_with_two_branches(task_tool, *, dispatches, checkpointer):
+    def fanout(_state) -> list[Send]:
+        return [Send("call_task", d) for d in dispatches]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type=state["subtype"], runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _quick_marker_count(state) -> int:
+    """How many messages anywhere in parent state contain the quick subagent's marker."""
+    n = 0
+    for msg in state.values.get("messages", []) or []:
+        content = getattr(msg, "content", "")
+        if isinstance(content, str) and _QUICK_MARKER in content:
+            n += 1
+    return n
+
+
+@pytest.mark.asyncio
+async def test_partial_pause_routes_only_to_paused_branch_without_rerunning_completed_one():
+    """One branch completes synchronously; the other pauses with HITL — resume routes only to B.
+
+    Setup:
+    - Sub-A (``quick``): no interrupt, finishes immediately, writes a marker
+      message to parent state.
+    - Sub-B (``pausing``): interrupts with a 1-action HITL bundle.
+
+    At pause, parent state has A's marker already merged in and exactly one
+    pending interrupt (B's). Resume sends a 1-element flat decisions list;
+    the routing helpers must not look up A in the resume map (would explode
+    with an unknown ``Interrupt.id``) and must not re-invoke A on resume
+    (would duplicate the marker).
+    """
+    checkpointer = InMemorySaver()
+
+    quick_sub = _build_quick_subagent(checkpointer)
+    pausing_sub = _build_pausing_subagent(checkpointer)
+
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {"name": "quick-agent", "description": "instant", "runnable": quick_sub},
+            {
+                "name": "pausing-agent",
+                "description": "needs review",
+                "runnable": pausing_sub,
+            },
+        ]
+    )
+
+    parent = _parent_with_two_branches(
+        task_tool,
+        dispatches=[
+            {"tcid": "tcid-A", "subtype": "quick-agent", "desc": "do A fast"},
+            {
+                "tcid": "tcid-B",
+                "subtype": "pausing-agent",
+                "desc": "needs approval",
+            },
+        ],
+        checkpointer=checkpointer,
+    )
+
+    config: dict = {
+        "configurable": {"thread_id": "partial-pause-thread"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    paused = await parent.aget_state(config)
+
+    assert len(paused.interrupts) == 1, (
+        f"REGRESSION: expected exactly 1 pending interrupt (sub-B alone), "
+        f"got {len(paused.interrupts)}"
+    )
+
+    pending = collect_pending_tool_calls(paused)
+    assert pending == [("tcid-B", 1)], (
+        f"REGRESSION: pending list contains stale tcids; got {pending!r}"
+    )
+
+    pre_resume_marker_count = _quick_marker_count(paused)
+    assert pre_resume_marker_count == 1, (
+        f"REGRESSION: sub-A's contribution missing or duplicated at pause "
+        f"(found {pre_resume_marker_count}, expected 1)"
+    )
+
+    flat_decisions = [{"type": "approve"}]
+    by_tool_call_id = slice_decisions_by_tool_call(flat_decisions, pending)
+    assert by_tool_call_id == {"tcid-B": {"decisions": [{"type": "approve"}]}}, (
+        f"REGRESSION: slicer routed to a non-pending tcid: {by_tool_call_id!r}"
+    )
+
+    config["configurable"]["surfsense_resume_value"] = by_tool_call_id
+    lg_resume_map = build_lg_resume_map(paused, by_tool_call_id)
+
+    assert set(lg_resume_map.keys()) == {paused.interrupts[0].id}, (
+        f"REGRESSION: resume map keyed by an unknown Interrupt.id "
+        f"(would crash langgraph): {lg_resume_map!r}"
+    )
+
+    await parent.ainvoke(Command(resume=lg_resume_map), config)
+
+    final = await parent.aget_state(config)
+    assert not final.interrupts, (
+        f"REGRESSION: pending interrupts after resume: {final.interrupts!r}"
+    )
+
+    post_resume_marker_count = _quick_marker_count(final)
+    assert post_resume_marker_count == 1, (
+        f"REGRESSION: sub-A re-ran on resume (marker count went "
+        f"{pre_resume_marker_count} → {post_resume_marker_count}); "
+        f"resume must touch only the paused branch."
+    )
+
+    payloads: list[dict] = []
+    for msg in final.values.get("messages", []) or []:
+        content = getattr(msg, "content", None)
+        if isinstance(content, str):
+            with contextlib.suppress(json.JSONDecodeError):
+                payloads.append(json.loads(content))
+
+    assert {"decisions": [{"type": "approve"}]} in payloads, (
+        f"REGRESSION: sub-B did not receive its single approve on resume; "
+        f"payloads seen: {payloads!r}"
+    )
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_reject_only_routing.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_reject_only_routing.py
new file mode 100644
index 0000000000..5810d5394d
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_reject_only_routing.py
@@ -0,0 +1,215 @@
+"""Real-graph contract: all-reject decisions route correctly across parallel subagents.
+
+Heterogeneous routing is covered by ``test_parallel_heterogeneous_decisions``.
+This module pins the narrower edge case where **every** card on **every**
+paused subagent is rejected.
+
+Why a separate pin:
+
+1. **No approval-bias in the slicer.** A future "if no approvals, short-circuit
+   resume" optimization would be tempting (skips a langgraph round-trip) and
+   would also silently break this scenario. Pin it.
+2. **``message`` metadata pass-through across a run of rejects.** The reject
+   ``message`` is the user-visible reason ("looks suspicious", "duplicate",
+   etc.). Losing it would silently swallow user intent — the worst UX
+   failure mode for HITL. Heterogeneous covers one reject; here we verify a
+   sequence of rejects survives the slicer + bridge with distinct messages
+   intact and in order.
+3. **All branches complete with no leftover pending.** Even when nothing was
+   approved, the parent must drain every paused subagent so the SSE stream
+   can close cleanly. A bug that left one ``Interrupt.id`` un-keyed would
+   strand the conversation in "pending" forever.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+from typing import Annotated
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command, Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    build_lg_resume_map,
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubState(TypedDict, total=False):
+    messages: list
+
+
+class _DispatchState(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+    tcid: str
+    desc: str
+    subtype: str
+
+
+def _build_recording_subagent(checkpointer: InMemorySaver, *, action_count: int):
+    """Subagent that pauses with ``action_count`` actions and records its resume payload.
+
+    The recorded ``AIMessage`` content is the JSON-serialized payload, so the
+    test can match each subagent's slice by content.
+    """
+
+    def hitl_node(_state):
+        decision_payload = interrupt(
+            {
+                "action_requests": [
+                    {"name": f"act_{i}", "args": {"i": i}, "description": ""}
+                    for i in range(action_count)
+                ],
+                "review_configs": [
+                    {
+                        "action_name": f"act_{i}",
+                        "allowed_decisions": ["approve", "reject", "edit"],
+                    }
+                    for i in range(action_count)
+                ],
+            }
+        )
+        return {
+            "messages": [
+                AIMessage(content=json.dumps(decision_payload, sort_keys=True))
+            ]
+        }
+
+    g = StateGraph(_SubState)
+    g.add_node("hitl", hitl_node)
+    g.add_edge(START, "hitl")
+    g.add_edge("hitl", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_two_branches(task_tool, *, dispatches, checkpointer):
+    def fanout(_state) -> list[Send]:
+        return [Send("call_task", d) for d in dispatches]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type=state["subtype"], runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+@pytest.mark.asyncio
+async def test_all_reject_decisions_route_to_each_subagent_with_messages_intact():
+    """All cards rejected across two parallel subagents — order and messages preserved.
+
+    Setup mirrors a real "user reviews two parallel ticket creations and
+    rejects everything with distinct reasons":
+
+    - Sub-A pauses with 2 actions.
+    - Sub-B pauses with 1 action.
+    - Flat decisions: 3 rejects, each with a unique ``message``.
+
+    Asserts each subagent receives only its slice, in original order,
+    with every ``message`` intact and no ``edited_action`` fields fabricated.
+    """
+    checkpointer = InMemorySaver()
+
+    sub_a = _build_recording_subagent(checkpointer, action_count=2)
+    sub_b = _build_recording_subagent(checkpointer, action_count=1)
+
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {"name": "agent-a", "description": "first", "runnable": sub_a},
+            {"name": "agent-b", "description": "second", "runnable": sub_b},
+        ]
+    )
+
+    parent = _parent_two_branches(
+        task_tool,
+        dispatches=[
+            {"tcid": "tcid-A", "subtype": "agent-a", "desc": "do A"},
+            {"tcid": "tcid-B", "subtype": "agent-b", "desc": "do B"},
+        ],
+        checkpointer=checkpointer,
+    )
+
+    config: dict = {
+        "configurable": {"thread_id": "all-reject-thread"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    paused_state = await parent.aget_state(config)
+    assert len(paused_state.interrupts) == 2, (
+        f"fixture broken: expected 2 paused subagents, got {len(paused_state.interrupts)}"
+    )
+
+    a_reject_0 = {"type": "reject", "message": "A[0] looks suspicious"}
+    a_reject_1 = {"type": "reject", "message": "A[1] duplicates A[0]"}
+    b_reject_0 = {"type": "reject", "message": "B[0] needs more context"}
+    flat_decisions = [a_reject_0, a_reject_1, b_reject_0]
+
+    pending = collect_pending_tool_calls(paused_state)
+    by_tool_call_id = slice_decisions_by_tool_call(flat_decisions, pending)
+
+    assert by_tool_call_id == {
+        "tcid-A": {"decisions": [a_reject_0, a_reject_1]},
+        "tcid-B": {"decisions": [b_reject_0]},
+    }, f"REGRESSION: slicer mis-routed all-reject decisions: {by_tool_call_id!r}"
+
+    config["configurable"]["surfsense_resume_value"] = by_tool_call_id
+    lg_resume_map = build_lg_resume_map(paused_state, by_tool_call_id)
+
+    await parent.ainvoke(Command(resume=lg_resume_map), config)
+
+    final_state = await parent.aget_state(config)
+    assert not final_state.interrupts, (
+        f"REGRESSION: leftover pending interrupts after all-reject resume: "
+        f"{final_state.interrupts!r}"
+    )
+
+    payloads: list[dict] = []
+    for msg in final_state.values.get("messages", []) or []:
+        content = getattr(msg, "content", None)
+        if isinstance(content, str):
+            with contextlib.suppress(json.JSONDecodeError):
+                payloads.append(json.loads(content))
+
+    expected_a = {"decisions": [a_reject_0, a_reject_1]}
+    expected_b = {"decisions": [b_reject_0]}
+
+    assert expected_a in payloads, (
+        f"REGRESSION: sub-A did not receive its 2-reject slice in order; "
+        f"payloads seen: {payloads!r}"
+    )
+    assert expected_b in payloads, (
+        f"REGRESSION: sub-B did not receive its single reject; "
+        f"payloads seen: {payloads!r}"
+    )
+
+    for p in payloads:
+        for d in p.get("decisions", []):
+            assert "edited_action" not in d, (
+                f"REGRESSION: spurious ``edited_action`` on a reject — "
+                f"slicer/bridge mutated payload: {d!r}"
+            )
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_resume_command_keying.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_resume_command_keying.py
new file mode 100644
index 0000000000..839cb7564a
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_resume_command_keying.py
@@ -0,0 +1,300 @@
+"""Real-graph contract: parallel resume must key ``Command(resume=...)`` by ``Interrupt.id``.
+
+When the parent state has multiple pending interrupts, langgraph rejects a
+scalar ``Command(resume=v)`` with::
+
+    RuntimeError: When there are multiple pending interrupts, you must specify
+    the interrupt id when resuming.
+
+The fix is to map each ``Interrupt.id`` from ``state.interrupts`` to the
+per-subagent slice — orthogonal to our ``tool_call_id``-keyed
+``surfsense_resume_value`` side-channel (different consumer: langgraph's
+pregel vs. our subagent bridge).
+
+This test reproduces the production failure with a real two-task parallel
+``Send`` parent graph, exercises the full resume cycle, and asserts both
+subagents complete cleanly with their per-subagent slice intact.
+
+Bundle sizes are chosen heterogeneous (``2`` and ``3``) so the assertions
+also catch slicer arithmetic regressions (e.g., ``cursor += 1`` instead of
+``cursor += action_count``). A symmetric ``(1, 1)`` configuration would
+silently pass such a bug because the slices would coincide.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+from typing import Annotated
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command, Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    build_lg_resume_map,
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubState(TypedDict, total=False):
+    messages: list
+
+
+class _DispatchState(TypedDict, total=False):
+    # ``add_messages`` reducer matches production agent state shape and is
+    # required when two parallel ``Send`` branches both write to ``messages``
+    # in the same superstep (post-resume both subagents return their own
+    # ``{"messages": [...]}``). Without a reducer langgraph raises
+    # ``InvalidUpdateError: At key 'messages': Can receive only one value``.
+    messages: Annotated[list, add_messages]
+    tcid: str
+    desc: str
+    subtype: str
+
+
+def _build_pausing_subagent(checkpointer: InMemorySaver, *, action_count: int):
+    """Subagent that pauses with an ``action_count``-action HITL bundle.
+
+    On resume it captures the decision payload as a JSON-serialized
+    ``AIMessage`` content so the test can inspect exactly which slice
+    reached this subagent — the strongest assertion against slicer
+    routing regressions.
+    """
+
+    def approve_node(_state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {"name": f"act_{i}", "args": {"i": i}, "description": ""}
+                    for i in range(action_count)
+                ],
+                "review_configs": [{} for _ in range(action_count)],
+            }
+        )
+        return {"messages": [AIMessage(content=json.dumps(decision, sort_keys=True))]}
+
+    g = StateGraph(_SubState)
+    g.add_node("approve", approve_node)
+    g.add_edge(START, "approve")
+    g.add_edge("approve", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_graph_dispatching_two_tasks_via_send(
+    task_tool, *, tool_call_id_a: str, tool_call_id_b: str, checkpointer
+):
+    def fanout_edge(_state) -> list[Send]:
+        return [
+            Send(
+                "call_task",
+                {"tcid": tool_call_id_a, "desc": "approve A", "subtype": "agent-a"},
+            ),
+            Send(
+                "call_task",
+                {"tcid": tool_call_id_b, "desc": "approve B", "subtype": "agent-b"},
+            ),
+        ]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type=state["subtype"], runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout_edge, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _build_two_subagents_task_tool(checkpointer: InMemorySaver):
+    """Register two subagents under distinct names with heterogeneous bundle sizes.
+
+    Sub-A: 2-action bundle. Sub-B: 3-action bundle. Both ``> 1`` so the slice
+    arithmetic is sensitive to off-by-one mistakes.
+    """
+    sub_a = _build_pausing_subagent(checkpointer, action_count=2)
+    sub_b = _build_pausing_subagent(checkpointer, action_count=3)
+    return build_task_tool_with_parent_config(
+        [
+            {"name": "agent-a", "description": "first", "runnable": sub_a},
+            {"name": "agent-b", "description": "second", "runnable": sub_b},
+        ]
+    )
+
+
+@pytest.mark.asyncio
+async def test_parallel_resume_with_command_resume_scalar_raises_lg_runtime_error():
+    """Confirm the production failure mode: scalar resume on multi-pending state explodes.
+
+    This is a contract pin: if langgraph relaxes the requirement in a future
+    release, this test starts passing and we know we can simplify
+    ``stream_resume_chat``. Until then, the keyed form is mandatory.
+    """
+    checkpointer = InMemorySaver()
+    task_tool = _build_two_subagents_task_tool(checkpointer)
+    parent = _parent_graph_dispatching_two_tasks_via_send(
+        task_tool,
+        tool_call_id_a="parent-tcid-A",
+        tool_call_id_b="parent-tcid-B",
+        checkpointer=checkpointer,
+    )
+    config: dict = {
+        "configurable": {"thread_id": "parallel-resume-scalar"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    with pytest.raises(RuntimeError, match="multiple pending interrupts"):
+        await parent.ainvoke(Command(resume={"decisions": ["A"]}), config)
+
+
+@pytest.mark.asyncio
+async def test_parallel_resume_with_per_interrupt_id_keying_completes_both_subagents():
+    """Production-shape resume: builds the langgraph-keyed map and resumes both subagents.
+
+    Mirrors what ``stream_resume_chat`` does: collects pending interrupts,
+    slices the flat decisions list by ``tool_call_id``, builds the
+    ``Interrupt.id``-keyed map for ``Command(resume=...)``, and resumes.
+
+    Post-conditions checked:
+    1. The langgraph-keyed map has exactly one entry per pending interrupt
+       id (``str`` keys, count matches).
+    2. Both subagents complete with no leftover pending interrupts.
+    3. **Each subagent receives its exact slice in the original order** —
+       this catches slicer arithmetic regressions (e.g., ``cursor += 1``)
+       that wouldn't surface by checking only "no leftover pending".
+    """
+    checkpointer = InMemorySaver()
+    task_tool = _build_two_subagents_task_tool(checkpointer)
+    tcid_a = "parent-tcid-A"
+    tcid_b = "parent-tcid-B"
+    parent = _parent_graph_dispatching_two_tasks_via_send(
+        task_tool,
+        tool_call_id_a=tcid_a,
+        tool_call_id_b=tcid_b,
+        checkpointer=checkpointer,
+    )
+    config: dict = {
+        "configurable": {"thread_id": "parallel-resume-keyed"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    paused_state = await parent.aget_state(config)
+    assert len(paused_state.interrupts) == 2, (
+        "fixture broken: expected 2 paused subagents"
+    )
+
+    pending = collect_pending_tool_calls(paused_state)
+    assert dict(pending) == {tcid_a: 2, tcid_b: 3}, (
+        f"fixture broken: heterogeneous bundle sizes not detected; got {pending!r}"
+    )
+
+    a_d0 = {"type": "approve"}
+    a_d1 = {"type": "reject", "message": "A[1] is redundant"}
+    b_d0 = {
+        "type": "edit",
+        "edited_action": {"name": "act_0", "args": {"i": 0, "edited": True}},
+    }
+    b_d1 = {"type": "approve"}
+    b_d2 = {"type": "reject", "message": "B[2] needs more context"}
+    flat_decisions = [a_d0, a_d1, b_d0, b_d1, b_d2]
+
+    by_tool_call_id = slice_decisions_by_tool_call(flat_decisions, pending)
+    lg_resume_map = build_lg_resume_map(paused_state, by_tool_call_id)
+
+    assert len(lg_resume_map) == 2, (
+        f"expected one entry per pending interrupt id, got {lg_resume_map!r}"
+    )
+    assert all(isinstance(k, str) for k in lg_resume_map), (
+        f"keys must be Interrupt.id strings, got {[type(k).__name__ for k in lg_resume_map]}"
+    )
+
+    config["configurable"]["surfsense_resume_value"] = by_tool_call_id
+
+    await parent.ainvoke(Command(resume=lg_resume_map), config)
+
+    final_state = await parent.aget_state(config)
+    assert not final_state.interrupts, (
+        f"expected no leftover pending interrupts after resume, got "
+        f"{final_state.interrupts!r}"
+    )
+
+    payloads: list[dict] = []
+    for msg in final_state.values.get("messages", []) or []:
+        content = getattr(msg, "content", None)
+        if isinstance(content, str):
+            with contextlib.suppress(json.JSONDecodeError):
+                payloads.append(json.loads(content))
+
+    expected_a = {"decisions": [a_d0, a_d1]}
+    expected_b = {"decisions": [b_d0, b_d1, b_d2]}
+    assert expected_a in payloads, (
+        f"REGRESSION: sub-A did not receive its 2-decision slice in order; "
+        f"payloads seen: {payloads!r}"
+    )
+    assert expected_b in payloads, (
+        f"REGRESSION: sub-B did not receive its 3-decision slice in order; "
+        f"payloads seen: {payloads!r}"
+    )
+
+
+def test_build_lg_resume_map_returns_empty_when_no_interrupts_carry_stamps():
+    """Unstamped interrupts can't be routed; we don't fabricate keys for them.
+
+    If a regression lets an unstamped interrupt reach the parent state, the
+    empty map propagates to the call site and surfaces as a clear count
+    mismatch instead of a silent mis-route.
+    """
+    from types import SimpleNamespace
+
+    fake_interrupt = SimpleNamespace(id="i-foreign", value={"action_requests": [{}]})
+    state = SimpleNamespace(interrupts=(fake_interrupt,))
+
+    assert build_lg_resume_map(state, {"some-tcid": {"decisions": ["x"]}}) == {}
+
+
+def test_build_lg_resume_map_skips_interrupts_without_corresponding_slice():
+    """Skip rather than silently mis-route when the slice and interrupts disagree.
+
+    Only emit a resume entry when both an interrupt id and a tool_call_id
+    slice are present; a mismatch indicates upstream contract drift and
+    should not be papered over.
+    """
+    from types import SimpleNamespace
+
+    state = SimpleNamespace(
+        interrupts=(
+            SimpleNamespace(
+                id="i-A",
+                value={"action_requests": [{}], "tool_call_id": "tcid-A"},
+            ),
+            SimpleNamespace(
+                id="i-B",
+                value={"action_requests": [{}], "tool_call_id": "tcid-B"},
+            ),
+        )
+    )
+
+    out = build_lg_resume_map(state, {"tcid-A": {"decisions": ["only-A"]}})
+    assert out == {"i-A": {"decisions": ["only-A"]}}
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_self_and_middleware_gated.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_self_and_middleware_gated.py
new file mode 100644
index 0000000000..921c4a9eba
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_self_and_middleware_gated.py
@@ -0,0 +1,275 @@
+"""Real-graph parallel HITL across both approval kinds — the keystone regression.
+
+Pre-fix bug: the parallel-HITL routing layer (``collect_pending_tool_calls``
++ ``slice_decisions_by_tool_call`` + ``build_lg_resume_map``) only
+recognized middleware-gated approvals (LC HITL shape from
+``HumanInTheLoopMiddleware``). Self-gated approvals from
+``request_approval`` and middleware-gated permission asks from
+``PermissionMiddleware`` both used the SurfSense-specific
+``{type, action, context}`` shape, so when the orchestrator dispatched
+two parallel ``task`` calls — one self-gated, one middleware-gated — only
+one interrupt was visible to the routing layer and resume crashed with
+``Decision count mismatch``.
+
+This test fans out two real subagents via ``Send``: one calls
+``request_approval`` (self-gated), the other calls
+``request_permission_decision`` (middleware-gated). Both pause; the routing
+layer must see TWO LC HITL interrupts, slice the decisions by
+``tool_call_id``, key by ``Interrupt.id``, and resume both branches with
+their per-slice payload.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+from typing import Annotated
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command, Send
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    build_lg_resume_map,
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+from app.agents.multi_agent_chat.middleware.shared.permissions.ask.request import (
+    request_permission_decision,
+)
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+from app.agents.new_chat.permissions import Rule
+
+
+class _SubState(TypedDict, total=False):
+    messages: list
+
+
+class _DispatchState(TypedDict, total=False):
+    # ``add_messages`` is mandatory: parallel ``Send`` branches both append
+    # to ``messages`` in the same superstep; without a reducer langgraph
+    # raises ``InvalidUpdateError``.
+    messages: Annotated[list, add_messages]
+    tcid: str
+    desc: str
+    subtype: str
+
+
+def _build_self_gated_subagent(checkpointer: InMemorySaver):
+    """Subagent that pauses via ``request_approval`` (self-gated path)."""
+
+    def gate_node(_state):
+        result = request_approval(
+            action_type="gmail_email_send",
+            tool_name="send_gmail_email",
+            params={"to": "alice@example.com"},
+        )
+        return {
+            "messages": [
+                AIMessage(
+                    content=json.dumps(
+                        {
+                            "kind": "self_gated",
+                            "decision_type": result.decision_type,
+                            "params": result.params,
+                            "rejected": result.rejected,
+                        },
+                        sort_keys=True,
+                    )
+                )
+            ]
+        }
+
+    g = StateGraph(_SubState)
+    g.add_node("gate", gate_node)
+    g.add_edge(START, "gate")
+    g.add_edge("gate", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _build_middleware_gated_subagent(checkpointer: InMemorySaver):
+    """Subagent that pauses via ``request_permission_decision`` (middleware-gated path)."""
+
+    def perm_node(_state):
+        decision = request_permission_decision(
+            tool_name="rm",
+            args={"path": "/tmp/file"},
+            patterns=["rm/*"],
+            rules=[Rule(permission="rm", pattern="*", action="ask")],
+            emit_interrupt=True,
+        )
+        return {
+            "messages": [
+                AIMessage(
+                    content=json.dumps(
+                        {"kind": "middleware_gated", "decision": decision},
+                        sort_keys=True,
+                    )
+                )
+            ]
+        }
+
+    g = StateGraph(_SubState)
+    g.add_node("perm", perm_node)
+    g.add_edge(START, "perm")
+    g.add_edge("perm", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _build_mixed_task_tool(checkpointer: InMemorySaver):
+    """Two subagents, one per approval kind, registered under distinct names."""
+    return build_task_tool_with_parent_config(
+        [
+            {
+                "name": "self-gated-agent",
+                "description": "uses request_approval",
+                "runnable": _build_self_gated_subagent(checkpointer),
+            },
+            {
+                "name": "middleware-gated-agent",
+                "description": "uses request_permission_decision",
+                "runnable": _build_middleware_gated_subagent(checkpointer),
+            },
+        ]
+    )
+
+
+def _parent_dispatching_one_of_each(
+    task_tool, *, tcid_self: str, tcid_mw: str, checkpointer
+):
+    def fanout_edge(_state) -> list[Send]:
+        return [
+            Send(
+                "call_task",
+                {
+                    "tcid": tcid_self,
+                    "desc": "approve email",
+                    "subtype": "self-gated-agent",
+                },
+            ),
+            Send(
+                "call_task",
+                {
+                    "tcid": tcid_mw,
+                    "desc": "approve rm",
+                    "subtype": "middleware-gated-agent",
+                },
+            ),
+        ]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type=state["subtype"], runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout_edge, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+@pytest.mark.asyncio
+async def test_parallel_self_gated_and_middleware_gated_route_and_resume_cleanly():
+    """Both interrupt kinds must reach the routing layer in LC HITL shape and resume independently."""
+    checkpointer = InMemorySaver()
+    task_tool = _build_mixed_task_tool(checkpointer)
+    tcid_self = "tcid-self-gated"
+    tcid_mw = "tcid-middleware-gated"
+    parent = _parent_dispatching_one_of_each(
+        task_tool,
+        tcid_self=tcid_self,
+        tcid_mw=tcid_mw,
+        checkpointer=checkpointer,
+    )
+    config: dict = {
+        "configurable": {"thread_id": "mixed-parallel"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    paused = await parent.aget_state(config)
+    assert len(paused.interrupts) == 2, (
+        "fixture broken: expected one paused interrupt per approval kind"
+    )
+
+    # Both interrupts must speak the same wire shape — the whole point of
+    # the unification. If either one regresses to the legacy SurfSense shape
+    # ``collect_pending_tool_calls`` would silently skip it and the count
+    # below would be 1.
+    pending = collect_pending_tool_calls(paused)
+    assert dict(pending) == {tcid_self: 1, tcid_mw: 1}, (
+        f"REGRESSION: not all interrupt kinds reached the routing layer; "
+        f"got {pending!r}"
+    )
+
+    # Verify the actual wire payloads carry the LC HITL standard fields
+    # (extra defensive assertion against partial regressions where one
+    # path stamps tool_call_id but reverts the body shape).
+    interrupt_types = {i.value.get("interrupt_type") for i in paused.interrupts}
+    assert interrupt_types == {"gmail_email_send", "permission_ask"}
+
+    # Resume order: same order the SSE stream would emit (interrupts list).
+    decision_self = {"type": "approve"}
+    decision_mw = {"type": "approve_always"}
+    flat_decisions = [
+        # Match `pending` order.
+        decision_self if pending[0][0] == tcid_self else decision_mw,
+        decision_mw if pending[0][0] == tcid_self else decision_self,
+    ]
+    by_tool_call_id = slice_decisions_by_tool_call(flat_decisions, pending)
+    lg_resume_map = build_lg_resume_map(paused, by_tool_call_id)
+    assert len(lg_resume_map) == 2
+
+    config["configurable"]["surfsense_resume_value"] = by_tool_call_id
+    await parent.ainvoke(Command(resume=lg_resume_map), config)
+
+    final = await parent.aget_state(config)
+    assert not final.interrupts, (
+        f"expected both branches resumed, but state still has interrupts: "
+        f"{final.interrupts!r}"
+    )
+
+    # Each subagent must have received its own slice — verify by inspecting
+    # the JSON-serialized result messages.
+    payloads: list[dict] = []
+    for msg in final.values.get("messages", []) or []:
+        content = getattr(msg, "content", None)
+        if isinstance(content, str):
+            with contextlib.suppress(json.JSONDecodeError):
+                payloads.append(json.loads(content))
+
+    self_payloads = [p for p in payloads if p.get("kind") == "self_gated"]
+    mw_payloads = [p for p in payloads if p.get("kind") == "middleware_gated"]
+    assert len(self_payloads) == 1, (
+        f"self-gated subagent did not complete; payloads: {payloads!r}"
+    )
+    assert len(mw_payloads) == 1, (
+        f"middleware-gated subagent did not complete; payloads: {payloads!r}"
+    )
+
+    # Self-gated approve → HITLResult(decision_type="approve", rejected=False).
+    assert self_payloads[0]["decision_type"] == "approve"
+    assert self_payloads[0]["rejected"] is False
+
+    # Middleware-gated approve_always → canonical permission shape unchanged.
+    assert mw_payloads[0]["decision"] == {"decision_type": "approve_always"}
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_tasks.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_tasks.py
new file mode 100644
index 0000000000..81be7d1ac4
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_tasks.py
@@ -0,0 +1,237 @@
+"""Behavioural guarantees for parallel ``task`` tool calls (non-HITL cases).
+
+The HITL bridge tests in ``test_hitl_bridge.py`` cover the parallel-interrupt
+flow. This file covers the *normal* parallel paths (no interrupts) and the
+failure-isolation guarantee — together they pin the behaviour we promise the
+user about ``asyncio.gather`` over two ``atask`` coroutines.
+"""
+
+from __future__ import annotations
+
+import asyncio
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.types import Command
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubState(TypedDict, total=False):
+    messages: list
+
+
+def _build_success_subagent(reply: str):
+    """A subagent that completes immediately with ``reply``, never interrupts."""
+
+    def node(_state):
+        return {"messages": [AIMessage(content=reply)]}
+
+    g = StateGraph(_SubState)
+    g.add_node("only", node)
+    g.add_edge(START, "only")
+    g.add_edge("only", END)
+    return g.compile(checkpointer=InMemorySaver())
+
+
+def _build_failing_subagent(exc: Exception):
+    """A subagent whose only node raises ``exc`` — simulates a tool-level failure."""
+
+    def node(_state):
+        raise exc
+
+    g = StateGraph(_SubState)
+    g.add_node("only", node)
+    g.add_edge(START, "only")
+    g.add_edge("only", END)
+    return g.compile(checkpointer=InMemorySaver())
+
+
+def _make_runtime(parent_config: dict, *, tool_call_id: str) -> ToolRuntime:
+    return ToolRuntime(
+        state={"messages": [HumanMessage(content="seed")]},
+        context=None,
+        config=parent_config,
+        stream_writer=None,
+        tool_call_id=tool_call_id,
+        store=None,
+    )
+
+
+def _tool_message_text(cmd: Command, *, expected_tcid: str) -> str:
+    """Return the ToolMessage content the task tool produced for ``expected_tcid``."""
+    assert isinstance(cmd, Command), f"expected Command, got {type(cmd).__name__}"
+    messages = cmd.update["messages"]
+    assert len(messages) == 1, f"expected 1 ToolMessage, got {len(messages)}"
+    msg = messages[0]
+    assert isinstance(msg, ToolMessage)
+    assert msg.tool_call_id == expected_tcid
+    return msg.content
+
+
+@pytest.mark.asyncio
+async def test_two_parallel_atasks_to_different_subagents_both_succeed():
+    """Normal happy-path: two distinct subagents complete in parallel without interrupting."""
+    subagent_a = _build_success_subagent("A is done")
+    subagent_b = _build_success_subagent("B is done")
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {"name": "alpha", "description": "alpha agent", "runnable": subagent_a},
+            {"name": "beta", "description": "beta agent", "runnable": subagent_b},
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "ok-thread"},
+        "recursion_limit": 100,
+    }
+    runtime_a = _make_runtime(parent_config, tool_call_id="tcid-A")
+    runtime_b = _make_runtime(parent_config, tool_call_id="tcid-B")
+
+    result_a, result_b = await asyncio.gather(
+        task_tool.coroutine(
+            description="do A",
+            subagent_type="alpha",
+            runtime=runtime_a,
+        ),
+        task_tool.coroutine(
+            description="do B",
+            subagent_type="beta",
+            runtime=runtime_b,
+        ),
+    )
+
+    assert _tool_message_text(result_a, expected_tcid="tcid-A") == "A is done"
+    assert _tool_message_text(result_b, expected_tcid="tcid-B") == "B is done"
+
+
+@pytest.mark.asyncio
+async def test_two_parallel_atasks_same_subagent_type_different_tool_call_ids():
+    """Per-call ``thread_id`` isolation: same compiled subagent invoked twice in parallel.
+
+    Both calls share the same ``InMemorySaver`` instance but are namespaced by
+    distinct ``tool_call_id``s, so checkpoints land in disjoint thread slots.
+    """
+    shared_subagent = _build_success_subagent("ok")
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {
+                "name": "approver",
+                "description": "shared approver",
+                "runnable": shared_subagent,
+            },
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "shared-subagent-thread"},
+        "recursion_limit": 100,
+    }
+    runtime_a = _make_runtime(parent_config, tool_call_id="tcid-A")
+    runtime_b = _make_runtime(parent_config, tool_call_id="tcid-B")
+
+    result_a, result_b = await asyncio.gather(
+        task_tool.coroutine(
+            description="first request",
+            subagent_type="approver",
+            runtime=runtime_a,
+        ),
+        task_tool.coroutine(
+            description="second request",
+            subagent_type="approver",
+            runtime=runtime_b,
+        ),
+    )
+
+    # Both calls succeed and produce ToolMessages keyed by their own tool_call_id.
+    assert _tool_message_text(result_a, expected_tcid="tcid-A") == "ok"
+    assert _tool_message_text(result_b, expected_tcid="tcid-B") == "ok"
+
+    # Verify checkpoint isolation: each call's state lives at its own thread_id.
+    state_a = await shared_subagent.aget_state(
+        {"configurable": {"thread_id": "shared-subagent-thread::task:tcid-A"}}
+    )
+    state_b = await shared_subagent.aget_state(
+        {"configurable": {"thread_id": "shared-subagent-thread::task:tcid-B"}}
+    )
+    assert state_a.values["messages"][-1].content == "ok"
+    assert state_b.values["messages"][-1].content == "ok"
+
+    # The parent's own thread_id slot is untouched by either subagent.
+    state_parent = await shared_subagent.aget_state(
+        {"configurable": {"thread_id": "shared-subagent-thread"}}
+    )
+    assert state_parent.values == {} or state_parent.values.get("messages") in (
+        None,
+        [],
+    )
+
+
+@pytest.mark.asyncio
+async def test_one_atask_failure_does_not_corrupt_sibling_atask():
+    """Failure isolation: a sibling's exception must not poison the surviving atask's state.
+
+    Note: in production, langgraph's pregel runner cancels siblings when any
+    parallel task raises a non-``GraphBubbleUp`` exception (see
+    ``_should_stop_others`` in ``langgraph/pregel/_runner.py``). At our layer
+    that policy is invisible — what we *can* guarantee is that the two atask
+    coroutines have disjoint state, so the surviving one returns a valid
+    Command even when its sibling explodes.
+    """
+    failing_subagent = _build_failing_subagent(ValueError("boom"))
+    surviving_subagent = _build_success_subagent("still here")
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {
+                "name": "broken",
+                "description": "always fails",
+                "runnable": failing_subagent,
+            },
+            {
+                "name": "healthy",
+                "description": "always succeeds",
+                "runnable": surviving_subagent,
+            },
+        ]
+    )
+
+    parent_config: dict = {
+        "configurable": {"thread_id": "iso-thread"},
+        "recursion_limit": 100,
+    }
+    runtime_fail = _make_runtime(parent_config, tool_call_id="tcid-fail")
+    runtime_ok = _make_runtime(parent_config, tool_call_id="tcid-ok")
+
+    results = await asyncio.gather(
+        task_tool.coroutine(
+            description="will explode",
+            subagent_type="broken",
+            runtime=runtime_fail,
+        ),
+        task_tool.coroutine(
+            description="will work",
+            subagent_type="healthy",
+            runtime=runtime_ok,
+        ),
+        return_exceptions=True,
+    )
+
+    fail_result, ok_result = results
+
+    assert isinstance(fail_result, Exception), (
+        f"expected the broken subagent to raise, got {fail_result!r}"
+    )
+    # ValueError gets wrapped in langgraph's internal exception types — the
+    # important guarantee is "this path errored", not the specific class.
+    assert "boom" in str(fail_result) or isinstance(fail_result, ValueError)
+
+    assert _tool_message_text(ok_result, expected_tcid="tcid-ok") == "still here"
+
+    # Configurable side-channel must not have been corrupted by the failure.
+    assert "surfsense_resume_value" not in parent_config["configurable"]
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_pending_interrupt.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_pending_interrupt.py
new file mode 100644
index 0000000000..75242689db
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_pending_interrupt.py
@@ -0,0 +1,55 @@
+"""Pins the first-wins assumption of ``get_first_pending_subagent_interrupt``.
+
+The bridge currently relies on at-most-one pending interrupt per snapshot
+(sequential tool nodes). If parallel tool calls are ever enabled, the bridge
+needs an id-aware lookup; these tests will need to be revisited at that point.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume import (
+    get_first_pending_subagent_interrupt,
+)
+
+
+class TestGetFirstPendingSubagentInterrupt:
+    def test_returns_first_when_multiple_top_level_interrupts_pending(self):
+        first = SimpleNamespace(id="i-1", value={"decision": "approve"})
+        second = SimpleNamespace(id="i-2", value={"decision": "reject"})
+        state = SimpleNamespace(interrupts=(first, second), tasks=())
+
+        assert get_first_pending_subagent_interrupt(state) == (
+            "i-1",
+            {"decision": "approve"},
+        )
+
+    def test_returns_first_when_multiple_subtask_interrupts_pending(self):
+        first = SimpleNamespace(id="i-A", value="approve")
+        second = SimpleNamespace(id="i-B", value="reject")
+        sub_task = SimpleNamespace(interrupts=(first, second))
+        state = SimpleNamespace(interrupts=(), tasks=(sub_task,))
+
+        assert get_first_pending_subagent_interrupt(state) == ("i-A", "approve")
+
+    def test_returns_none_when_no_interrupts(self):
+        state = SimpleNamespace(interrupts=(), tasks=())
+
+        assert get_first_pending_subagent_interrupt(state) == (None, None)
+
+    def test_returns_none_when_state_is_none(self):
+        assert get_first_pending_subagent_interrupt(None) == (None, None)
+
+    def test_skips_interrupts_with_none_value(self):
+        empty = SimpleNamespace(id="i-empty", value=None)
+        real = SimpleNamespace(id="i-real", value="approve")
+        state = SimpleNamespace(interrupts=(empty, real), tasks=())
+
+        assert get_first_pending_subagent_interrupt(state) == ("i-real", "approve")
+
+    def test_normalizes_non_string_id_to_none(self):
+        interrupt = SimpleNamespace(id=12345, value="approve")
+        state = SimpleNamespace(interrupts=(interrupt,), tasks=())
+
+        assert get_first_pending_subagent_interrupt(state) == (None, "approve")
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_resume_decision_routing.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_resume_decision_routing.py
new file mode 100644
index 0000000000..ceb0df830f
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_resume_decision_routing.py
@@ -0,0 +1,154 @@
+"""Slicing helper that routes a flat decisions list to per-tool-call payloads.
+
+The frontend submits ``decisions: list[ResumeDecision]`` in the same order the
+SSE stream emitted approval cards. When multiple parallel subagents are paused,
+the backend slices that flat list into per-``tool_call_id`` payloads so each
+``atask`` reads only its own decisions through ``consume_surfsense_resume``.
+
+The extractor reads ``state.interrupts[i].value["tool_call_id"]`` — which is
+populated by ``propagation.wrap_with_tool_call_id`` inside ``task_tool``'s
+``except GraphInterrupt`` chokepoint whenever a subagent interrupt bubbles up
+through ``[a]task`` — to build the ordered ``pending`` list the slicer needs.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+
+
+class TestSliceDecisionsByToolCall:
+    def test_splits_flat_decisions_across_two_pending_tool_calls(self):
+        decisions = [
+            {"type": "approve"},
+            {"type": "edit", "edited_action": {"name": "edited-b1"}},
+            {"type": "reject"},
+            {"type": "approve"},
+            {"type": "approve"},
+        ]
+        pending = [
+            ("tcid-A", 3),
+            ("tcid-B", 2),
+        ]
+
+        routed = slice_decisions_by_tool_call(decisions, pending)
+
+        assert routed == {
+            "tcid-A": {"decisions": decisions[0:3]},
+            "tcid-B": {"decisions": decisions[3:5]},
+        }
+
+    def test_raises_when_decision_count_less_than_total_actions(self):
+        decisions = [{"type": "approve"}, {"type": "approve"}]
+        pending = [("tcid-A", 3), ("tcid-B", 2)]
+
+        with pytest.raises(ValueError, match=r"5 actions.*2 decisions"):
+            slice_decisions_by_tool_call(decisions, pending)
+
+    def test_raises_when_decision_count_greater_than_total_actions(self):
+        decisions = [{"type": "approve"}] * 6
+        pending = [("tcid-A", 3), ("tcid-B", 2)]
+
+        with pytest.raises(ValueError, match=r"5 actions.*6 decisions"):
+            slice_decisions_by_tool_call(decisions, pending)
+
+    def test_handles_single_pending_tool_call(self):
+        decisions = [{"type": "approve"}, {"type": "reject"}]
+        pending = [("tcid-only", 2)]
+
+        routed = slice_decisions_by_tool_call(decisions, pending)
+
+        assert routed == {"tcid-only": {"decisions": decisions}}
+
+    def test_returns_empty_dict_for_no_pending(self):
+        routed = slice_decisions_by_tool_call([], [])
+
+        assert routed == {}
+
+
+def _interrupt_with(tool_call_id: str, action_count: int):
+    return SimpleNamespace(
+        id=f"i-{tool_call_id}",
+        value={
+            "action_requests": [{"name": "n", "args": {}}] * action_count,
+            "review_configs": [{}] * action_count,
+            "tool_call_id": tool_call_id,
+        },
+    )
+
+
+class TestCollectPendingToolCalls:
+    def test_single_pending_returns_one_pair(self):
+        state = SimpleNamespace(interrupts=(_interrupt_with("tcid-only", 3),))
+
+        assert collect_pending_tool_calls(state) == [("tcid-only", 3)]
+
+    def test_multiple_pending_preserves_state_order(self):
+        """Order must match what the SSE stream emitted (= state.interrupts order)."""
+        state = SimpleNamespace(
+            interrupts=(
+                _interrupt_with("tcid-A", 2),
+                _interrupt_with("tcid-B", 3),
+            )
+        )
+
+        assert collect_pending_tool_calls(state) == [("tcid-A", 2), ("tcid-B", 3)]
+
+    def test_empty_when_no_interrupts(self):
+        state = SimpleNamespace(interrupts=())
+
+        assert collect_pending_tool_calls(state) == []
+
+    def test_skips_interrupts_without_tool_call_id(self):
+        """Defensive: interrupts not produced by our propagation layer are ignored.
+
+        ``stream_resume_chat`` only owns the ``task``-routing slice; non-task
+        interrupts (e.g. parent-side HITL middleware on a different tool) are
+        not the slicer's responsibility.
+        """
+        state = SimpleNamespace(
+            interrupts=(
+                _interrupt_with("tcid-A", 2),
+                SimpleNamespace(id="i-foreign", value={"action_requests": [{}]}),
+                _interrupt_with("tcid-B", 1),
+            )
+        )
+
+        assert collect_pending_tool_calls(state) == [("tcid-A", 2), ("tcid-B", 1)]
+
+    def test_handles_scalar_value_interrupt(self):
+        """Subagents using ``interrupt("approve?")`` style propagate as ``{"value": ..., "tool_call_id": ...}``.
+
+        These have no ``action_requests`` — count them as a single action so
+        the frontend submits exactly one decision per such interrupt.
+        """
+        state = SimpleNamespace(
+            interrupts=(
+                SimpleNamespace(
+                    id="i-A",
+                    value={"value": "approve?", "tool_call_id": "tcid-A"},
+                ),
+            )
+        )
+
+        assert collect_pending_tool_calls(state) == [("tcid-A", 1)]
+
+    def test_raises_when_interrupt_value_missing_action_count_keys(self):
+        """An interrupt with ``tool_call_id`` but no usable count signals a contract bug."""
+        state = SimpleNamespace(
+            interrupts=(
+                SimpleNamespace(
+                    id="i-A",
+                    value={"tool_call_id": "tcid-A", "weird_shape": True},
+                ),
+            )
+        )
+
+        with pytest.raises(ValueError, match="action_requests"):
+            collect_pending_tool_calls(state)
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_resume_helpers.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_resume_helpers.py
new file mode 100644
index 0000000000..e8aacfc5d7
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_resume_helpers.py
@@ -0,0 +1,134 @@
+"""Resume side-channel is keyed per ``tool_call_id`` so parallel siblings can resume independently."""
+
+from __future__ import annotations
+
+from langchain.tools import ToolRuntime
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.config import (
+    consume_surfsense_resume,
+    has_surfsense_resume,
+)
+
+
+def _runtime_with_config(
+    config: dict, *, tool_call_id: str = "tcid-test"
+) -> ToolRuntime:
+    return ToolRuntime(
+        state=None,
+        context=None,
+        config=config,
+        stream_writer=None,
+        tool_call_id=tool_call_id,
+        store=None,
+    )
+
+
+class TestConsumeSurfsenseResume:
+    def test_pops_only_entry_matching_runtime_tool_call_id(self):
+        configurable = {
+            "surfsense_resume_value": {
+                "tcid-A": {"decisions": ["approve"]},
+                "tcid-B": {"decisions": ["reject"]},
+            }
+        }
+        runtime = _runtime_with_config(
+            {"configurable": configurable}, tool_call_id="tcid-A"
+        )
+
+        assert consume_surfsense_resume(runtime) == {"decisions": ["approve"]}
+
+    def test_popping_one_entry_leaves_siblings_untouched(self):
+        configurable = {
+            "surfsense_resume_value": {
+                "tcid-A": {"decisions": ["approve"]},
+                "tcid-B": {"decisions": ["reject"]},
+            }
+        }
+        runtime_a = _runtime_with_config(
+            {"configurable": configurable}, tool_call_id="tcid-A"
+        )
+
+        consume_surfsense_resume(runtime_a)
+
+        assert configurable["surfsense_resume_value"] == {
+            "tcid-B": {"decisions": ["reject"]}
+        }
+
+    def test_returns_none_when_no_entry_for_this_tool_call(self):
+        runtime = _runtime_with_config(
+            {
+                "configurable": {
+                    "surfsense_resume_value": {"tcid-other": {"decisions": []}}
+                }
+            },
+            tool_call_id="tcid-A",
+        )
+
+        assert consume_surfsense_resume(runtime) is None
+
+    def test_returns_none_when_no_payload_queued(self):
+        runtime = _runtime_with_config({"configurable": {}})
+
+        assert consume_surfsense_resume(runtime) is None
+
+    def test_returns_none_when_configurable_missing(self):
+        runtime = _runtime_with_config({})
+
+        assert consume_surfsense_resume(runtime) is None
+
+    def test_drops_empty_dict_after_last_entry_consumed(self):
+        configurable = {
+            "surfsense_resume_value": {"tcid-A": {"decisions": ["approve"]}}
+        }
+        runtime = _runtime_with_config(
+            {"configurable": configurable}, tool_call_id="tcid-A"
+        )
+
+        consume_surfsense_resume(runtime)
+
+        assert "surfsense_resume_value" not in configurable
+
+
+class TestHasSurfsenseResume:
+    def test_true_when_entry_for_this_tool_call_present(self):
+        runtime = _runtime_with_config(
+            {
+                "configurable": {
+                    "surfsense_resume_value": {"tcid-A": {"decisions": ["approve"]}}
+                }
+            },
+            tool_call_id="tcid-A",
+        )
+
+        assert has_surfsense_resume(runtime) is True
+
+    def test_false_when_entry_for_other_tool_call_only(self):
+        runtime = _runtime_with_config(
+            {
+                "configurable": {
+                    "surfsense_resume_value": {"tcid-other": {"decisions": []}}
+                }
+            },
+            tool_call_id="tcid-A",
+        )
+
+        assert has_surfsense_resume(runtime) is False
+
+    def test_does_not_consume_payload(self):
+        configurable = {
+            "surfsense_resume_value": {"tcid-A": {"decisions": ["approve"]}}
+        }
+        runtime = _runtime_with_config(
+            {"configurable": configurable}, tool_call_id="tcid-A"
+        )
+
+        has_surfsense_resume(runtime)
+
+        assert configurable["surfsense_resume_value"] == {
+            "tcid-A": {"decisions": ["approve"]}
+        }
+
+    def test_false_when_payload_absent(self):
+        runtime = _runtime_with_config({"configurable": {}})
+
+        assert has_surfsense_resume(runtime) is False
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_interrupt_stamping.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_interrupt_stamping.py
new file mode 100644
index 0000000000..7df9dedc65
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_interrupt_stamping.py
@@ -0,0 +1,284 @@
+"""Production-shape regression tests for ``tool_call_id`` stamping on subagent interrupts.
+
+The production bug we're pinning here: when the orchestrator dispatches one or
+more ``task`` tool calls and the targeted subagents hit a HITL ``interrupt(...)``,
+the parent's persisted ``state.interrupts`` must carry the parent's
+``tool_call_id`` on each interrupt value. Without that stamp,
+``stream_resume_chat`` cannot route a flat ``decisions`` list back to the right
+paused subagent and resume fails with ``Decision count mismatch``.
+
+The tests in this module:
+
+- Build a **real** ``StateGraph`` subagent that calls real ``interrupt(...)``
+  (no MagicMock, no patch of langgraph internals — those are exactly the kind
+  of fakes that hid this bug).
+- Invoke the ``task`` tool from **inside a parent pregel** (via a tiny parent
+  ``StateGraph`` node) so the subagent invocation happens in the
+  production-shape "subgraph called from a parent tool node" context.
+- Assert on ``parent.state.interrupts[*].value["tool_call_id"]`` — the
+  observable that ``stream_resume_chat`` reads.
+"""
+
+from __future__ import annotations
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.types import Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _S(TypedDict, total=False):
+    messages: list
+
+
+def _build_single_interrupt_subagent(checkpointer: InMemorySaver):
+    """Subagent that fires one HITL-bundle-shaped interrupt and waits for a decision."""
+
+    def approve_node(_state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {"name": "do_thing", "args": {"x": 1}, "description": ""}
+                ],
+                "review_configs": [{}],
+            }
+        )
+        return {"messages": [AIMessage(content=f"got:{decision}")]}
+
+    g = StateGraph(_S)
+    g.add_node("approve", approve_node)
+    g.add_edge(START, "approve")
+    g.add_edge("approve", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _build_bundle_subagent(checkpointer: InMemorySaver):
+    """Subagent that fires one interrupt carrying a 3-action bundle."""
+
+    def bundle_node(_state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {"name": "a", "args": {}, "description": ""},
+                    {"name": "b", "args": {}, "description": ""},
+                    {"name": "c", "args": {}, "description": ""},
+                ],
+                "review_configs": [{}, {}, {}],
+            }
+        )
+        return {"messages": [AIMessage(content=f"bundle:{decision}")]}
+
+    g = StateGraph(_S)
+    g.add_node("bundle", bundle_node)
+    g.add_edge(START, "bundle")
+    g.add_edge("bundle", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_graph_calling_task(task_tool, *, tool_call_id: str, checkpointer):
+    """A tiny parent graph whose only node invokes ``task_tool`` from inside the pregel runtime.
+
+    This is the minimal reproduction of production's "subagent invoked from
+    inside a parent tool node" context — the *only* context where langgraph
+    treats the subagent as a subgraph and routes its interrupts back to the
+    parent's checkpoint.
+    """
+
+    async def call_task(state, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=tool_call_id,
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description="please approve",
+            subagent_type="approver",
+            runtime=rt,
+        )
+
+    g = StateGraph(_S)
+    g.add_node("call_task", call_task)
+    g.add_edge(START, "call_task")
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+class _DispatchState(TypedDict, total=False):
+    messages: list
+    tcid: str
+    desc: str
+
+
+def _parent_graph_dispatching_two_tasks_via_send(
+    task_tool, *, tool_call_id_a: str, tool_call_id_b: str, checkpointer
+):
+    """A parent graph that dispatches two ``task`` calls as parallel pregel
+    tasks via :class:`~langgraph.types.Send`.
+
+    This mirrors the production dispatch mechanism: when the orchestrator's
+    LLM emits two ``task`` tool calls in one turn, langchain's tool node
+    fans them out as parallel pregel tasks (the same primitive as ``Send``)
+    so each tool call gets its own pregel task that can raise
+    ``GraphInterrupt`` independently — and pregel collects *all* of them
+    into the parent's snapshot at the end of the superstep.
+    """
+
+    def fanout_edge(_state) -> list[Send]:
+        return [
+            Send("call_task", {"tcid": tool_call_id_a, "desc": "approve A"}),
+            Send("call_task", {"tcid": tool_call_id_b, "desc": "approve B"}),
+        ]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type="approver", runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout_edge, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_interrupt_values(snapshot) -> list[dict]:
+    """Extract ``state.interrupts[*].value`` for assertions."""
+    return [i.value for i in (snapshot.interrupts or ())]
+
+
+@pytest.mark.asyncio
+async def test_single_subagent_interrupt_stamps_parent_tool_call_id():
+    """A single paused subagent must surface to the parent with ``tool_call_id`` stamped.
+
+    Production bug regression: was producing
+    ``value={"action_requests": [...], "review_configs": [...]}`` (no
+    ``tool_call_id``), causing ``stream_resume_chat`` to skip the interrupt
+    and raise ``Decision count mismatch``.
+    """
+    checkpointer = InMemorySaver()
+    subagent = _build_single_interrupt_subagent(checkpointer)
+    task_tool = build_task_tool_with_parent_config(
+        [{"name": "approver", "description": "approves", "runnable": subagent}]
+    )
+    parent = _parent_graph_calling_task(
+        task_tool, tool_call_id="parent-tcid-A", checkpointer=checkpointer
+    )
+
+    parent_config = {
+        "configurable": {"thread_id": "parent-thread"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
+
+    snap = await parent.aget_state(parent_config)
+    values = _parent_interrupt_values(snap)
+    assert len(values) == 1, (
+        f"expected exactly 1 parent interrupt, got {len(values)}: {values!r}"
+    )
+    value = values[0]
+    assert isinstance(value, dict)
+    assert value.get("tool_call_id") == "parent-tcid-A", (
+        f"REGRESSION: parent interrupt missing/wrong tool_call_id stamp. "
+        f"Expected 'parent-tcid-A', got {value.get('tool_call_id')!r}. "
+        f"Keys present: {sorted(value.keys())}"
+    )
+    # The original HITL payload must still be intact alongside the stamp.
+    assert value.get("action_requests") == [
+        {"name": "do_thing", "args": {"x": 1}, "description": ""}
+    ]
+
+
+@pytest.mark.asyncio
+async def test_two_parallel_subagents_each_stamp_their_own_tool_call_id():
+    """Two ``task`` calls dispatched in parallel must each carry their own ``tool_call_id``.
+
+    This is the actual production scenario (Linear + Jira ticket creation):
+    two parallel ``task`` tool calls, both subagents hit HITL, parent must
+    end up with two interrupts whose ``tool_call_id``s match the two
+    distinct parent-level ``tool_call_id``s the LLM emitted.
+    """
+    checkpointer = InMemorySaver()
+    subagent = _build_single_interrupt_subagent(checkpointer)
+    task_tool = build_task_tool_with_parent_config(
+        [{"name": "approver", "description": "approves", "runnable": subagent}]
+    )
+    parent = _parent_graph_dispatching_two_tasks_via_send(
+        task_tool,
+        tool_call_id_a="parent-tcid-A",
+        tool_call_id_b="parent-tcid-B",
+        checkpointer=checkpointer,
+    )
+
+    parent_config = {
+        "configurable": {"thread_id": "parent-thread-parallel"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
+
+    snap = await parent.aget_state(parent_config)
+    values = _parent_interrupt_values(snap)
+    assert len(values) == 2, (
+        f"expected 2 parent interrupts (one per parallel task call), "
+        f"got {len(values)}: {values!r}"
+    )
+    stamps = {v.get("tool_call_id") for v in values}
+    assert stamps == {"parent-tcid-A", "parent-tcid-B"}, (
+        f"REGRESSION: parallel parent interrupts missing/wrong tool_call_id stamps. "
+        f"Expected {{'parent-tcid-A', 'parent-tcid-B'}}, got {stamps!r}. "
+        f"Values: {values!r}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_bundle_subagent_interrupt_stamps_tool_call_id_preserving_actions():
+    """A subagent emitting a multi-action bundle must surface stamped, with all actions intact.
+
+    The bundle shape (``action_requests=[3 items]``) drives the
+    ``slice_decisions_by_tool_call`` accounting in ``stream_resume_chat`` —
+    if either the stamp or the action count is lost, resume routing
+    miscounts and crashes.
+    """
+    checkpointer = InMemorySaver()
+    subagent = _build_bundle_subagent(checkpointer)
+    task_tool = build_task_tool_with_parent_config(
+        [{"name": "approver", "description": "approves", "runnable": subagent}]
+    )
+    parent = _parent_graph_calling_task(
+        task_tool, tool_call_id="parent-tcid-bundle", checkpointer=checkpointer
+    )
+
+    parent_config = {
+        "configurable": {"thread_id": "parent-thread-bundle"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
+
+    snap = await parent.aget_state(parent_config)
+    values = _parent_interrupt_values(snap)
+    assert len(values) == 1
+    value = values[0]
+    assert value.get("tool_call_id") == "parent-tcid-bundle"
+    assert isinstance(value.get("action_requests"), list)
+    assert len(value["action_requests"]) == 3, (
+        f"REGRESSION: bundle action_requests count changed during stamping; "
+        f"got {len(value['action_requests'])} actions: {value['action_requests']!r}"
+    )
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_invoke_config.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_invoke_config.py
new file mode 100644
index 0000000000..3465dd1d81
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_invoke_config.py
@@ -0,0 +1,94 @@
+"""Per-call ``thread_id`` derivation for nested subagent invocations.
+
+Parallel ``task`` (and ``ask_knowledge_base``) calls must land in disjoint
+checkpoint slots so their nested pregel runs do not stomp on each other or on
+the parent's checkpoint state. The slot key is derived from the runtime's
+``tool_call_id`` so the same call across the resume cycle keeps reading from
+the same snapshot.
+
+Note: we namespace via ``thread_id`` rather than ``checkpoint_ns`` because
+langgraph's ``aget_state`` interprets a non-empty ``checkpoint_ns`` as a
+subgraph path and raises ``ValueError("Subgraph X not found")``. ``thread_id``
+is the primary checkpoint key and is free-form, so it's the right primitive.
+"""
+
+from __future__ import annotations
+
+from langchain.tools import ToolRuntime
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.config import (
+    subagent_invoke_config,
+)
+
+
+def _runtime(*, tool_call_id: str, config: dict | None = None) -> ToolRuntime:
+    return ToolRuntime(
+        state=None,
+        context=None,
+        config=config or {},
+        stream_writer=None,
+        tool_call_id=tool_call_id,
+        store=None,
+    )
+
+
+class TestSubagentInvokeThreadId:
+    def test_sets_per_call_thread_id_under_parent(self):
+        runtime = _runtime(
+            tool_call_id="tcid-A",
+            config={"configurable": {"thread_id": "t1"}},
+        )
+
+        sub_config = subagent_invoke_config(runtime)
+
+        assert sub_config["configurable"]["thread_id"] == "t1::task:tcid-A"
+
+    def test_per_call_thread_id_nests_under_already_namespaced_parent(self):
+        """A subagent that itself spawns a subagent must keep nesting cleanly."""
+        runtime = _runtime(
+            tool_call_id="tcid-inner",
+            config={
+                "configurable": {
+                    "thread_id": "t1::task:tcid-outer",
+                }
+            },
+        )
+
+        sub_config = subagent_invoke_config(runtime)
+
+        assert (
+            sub_config["configurable"]["thread_id"]
+            == "t1::task:tcid-outer::task:tcid-inner"
+        )
+
+    def test_different_tool_call_ids_produce_different_thread_ids(self):
+        config = {"configurable": {"thread_id": "t1"}}
+        rt_a = _runtime(tool_call_id="tcid-A", config=config)
+        rt_b = _runtime(tool_call_id="tcid-B", config=config)
+
+        tid_a = subagent_invoke_config(rt_a)["configurable"]["thread_id"]
+        tid_b = subagent_invoke_config(rt_b)["configurable"]["thread_id"]
+
+        assert tid_a != tid_b
+
+    def test_same_tool_call_id_produces_same_thread_id_across_repeated_calls(self):
+        """Resume bridge needs to find the snapshot it primed earlier."""
+        config = {"configurable": {"thread_id": "t1"}}
+        rt_first = _runtime(tool_call_id="tcid-A", config=config)
+        rt_second = _runtime(tool_call_id="tcid-A", config=config)
+
+        tid_first = subagent_invoke_config(rt_first)["configurable"]["thread_id"]
+        tid_second = subagent_invoke_config(rt_second)["configurable"]["thread_id"]
+
+        assert tid_first == tid_second
+
+    def test_does_not_mutate_caller_config(self):
+        """Repeated calls must not accumulate suffixes onto the parent's config."""
+        original_thread_id = "t1"
+        config = {"configurable": {"thread_id": original_thread_id}}
+        runtime = _runtime(tool_call_id="tcid-A", config=config)
+
+        subagent_invoke_config(runtime)
+        subagent_invoke_config(runtime)
+
+        assert config["configurable"]["thread_id"] == original_thread_id
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_lc_hitl_wire.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_lc_hitl_wire.py
new file mode 100644
index 0000000000..a331190b2c
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_lc_hitl_wire.py
@@ -0,0 +1,125 @@
+"""Regression: ``request_permission_decision`` must emit the unified LC HITL wire shape.
+
+Same bug class as :mod:`test_lc_hitl_wire` for self-gated approvals: the
+permission middleware previously fired the SurfSense-specific
+``{type, action, context}`` shape, which the parallel-HITL routing layer
+does not recognize. Standardizing on LC HITL keeps every approval kind on
+one routing path.
+"""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import HumanMessage
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.types import Command
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.shared.permissions.ask.request import (
+    request_permission_decision,
+)
+from app.agents.new_chat.permissions import Rule
+
+
+class _State(TypedDict, total=False):
+    messages: list
+    final_decision: dict
+
+
+def _build_graph_calling_request_permission_decision(checkpointer: InMemorySaver):
+    """Real graph whose only node delegates to the permission ask primitive."""
+
+    def perm_node(_state):
+        decision = request_permission_decision(
+            tool_name="rm",
+            args={"path": "/tmp/file"},
+            patterns=["rm/*"],
+            rules=[Rule(permission="rm", pattern="*", action="ask")],
+            emit_interrupt=True,
+        )
+        return {"final_decision": decision}
+
+    g = StateGraph(_State)
+    g.add_node("perm", perm_node)
+    g.add_edge(START, "perm")
+    g.add_edge("perm", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+@pytest.mark.asyncio
+async def test_permission_ask_payload_uses_lc_hitl_shape():
+    """The permission middleware now speaks the langchain HITL standard shape."""
+    checkpointer = InMemorySaver()
+    graph = _build_graph_calling_request_permission_decision(checkpointer)
+    config = {"configurable": {"thread_id": "perm-wire"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    snap = await graph.aget_state(config)
+    assert len(snap.interrupts) == 1
+    value = snap.interrupts[0].value
+
+    assert value.get("action_requests") == [
+        {"name": "rm", "args": {"path": "/tmp/file"}}
+    ], f"REGRESSION: permission ask reverted to legacy shape; got {value!r}"
+    review = value.get("review_configs")
+    assert isinstance(review, list) and len(review) == 1
+    palette = review[0]["allowed_decisions"]
+    # Native tool (no ``tool=`` argument): the palette must include the
+    # once/reject/edit triad. ``approve_always`` is gated on MCP-ness and
+    # therefore *omitted* here — palette content per tool kind is
+    # exercised in ``test_permission_ask_mcp_context``.
+    assert "approve" in palette and "reject" in palette and "edit" in palette
+    assert value.get("interrupt_type") == "permission_ask"
+    # SurfSense context rides through verbatim for FE explainability.
+    assert value["context"]["patterns"] == ["rm/*"]
+    assert value["context"]["always"] == ["rm/*"]
+
+
+@pytest.mark.asyncio
+async def test_resume_with_approve_envelope_returns_once_decision():
+    """``approve`` from the LC envelope projects to permission-domain ``once``."""
+    checkpointer = InMemorySaver()
+    graph = _build_graph_calling_request_permission_decision(checkpointer)
+    config = {"configurable": {"thread_id": "perm-once"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    await graph.ainvoke(Command(resume={"decisions": [{"type": "approve"}]}), config)
+    final = await graph.aget_state(config)
+    assert final.values.get("final_decision") == {"decision_type": "once"}
+
+
+@pytest.mark.asyncio
+async def test_resume_with_approve_always_envelope_projects_unchanged():
+    """``approve_always`` reply must project unchanged so the middleware can promote the rule."""
+    checkpointer = InMemorySaver()
+    graph = _build_graph_calling_request_permission_decision(checkpointer)
+    config = {"configurable": {"thread_id": "perm-approve-always"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    await graph.ainvoke(
+        Command(resume={"decisions": [{"type": "approve_always"}]}), config
+    )
+    final = await graph.aget_state(config)
+    assert final.values.get("final_decision") == {"decision_type": "approve_always"}
+
+
+@pytest.mark.asyncio
+async def test_resume_with_reject_and_feedback_carries_feedback_through():
+    """Reject feedback must survive normalization for ``CorrectedError`` to fire downstream."""
+    checkpointer = InMemorySaver()
+    graph = _build_graph_calling_request_permission_decision(checkpointer)
+    config = {"configurable": {"thread_id": "perm-reject"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    await graph.ainvoke(
+        Command(
+            resume={"decisions": [{"type": "reject", "feedback": "use the trash bin"}]}
+        ),
+        config,
+    )
+    final = await graph.aget_state(config)
+    assert final.values.get("final_decision") == {
+        "decision_type": "reject",
+        "feedback": "use the trash bin",
+    }
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_permission_ask_mcp_context.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_permission_ask_mcp_context.py
new file mode 100644
index 0000000000..c9bd4e142e
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_permission_ask_mcp_context.py
@@ -0,0 +1,232 @@
+"""Permission-ask payload surfaces tool metadata for the FE card."""
+
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.tools import StructuredTool
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from pydantic import BaseModel
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.shared.permissions import (
+    build_permission_mw,
+)
+from app.agents.multi_agent_chat.middleware.shared.permissions.ask.payload import (
+    build_permission_ask_payload,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+
+class _NoArgs(BaseModel):
+    pass
+
+
+async def _noop(**_kwargs) -> str:
+    return ""
+
+
+def _ask_rule(tool_name: str) -> Rule:
+    return Rule(permission=tool_name, pattern="*", action="ask")
+
+
+def _make_mcp_tool(*, name: str, connector_id: int, connector_name: str):
+    return StructuredTool(
+        name=name,
+        description=f"Run {name} via MCP.",
+        coroutine=_noop,
+        args_schema=_NoArgs,
+        metadata={
+            "mcp_connector_id": connector_id,
+            "mcp_connector_name": connector_name,
+            "mcp_transport": "http",
+            "hitl": True,
+        },
+    )
+
+
+def test_payload_surfaces_mcp_fields_from_tool():
+    tool = _make_mcp_tool(
+        name="linear_create_issue", connector_id=42, connector_name="Linear (acme)"
+    )
+    payload = build_permission_ask_payload(
+        tool_name=tool.name,
+        args={"title": "bug"},
+        patterns=[tool.name],
+        rules=[_ask_rule(tool.name)],
+        tool=tool,
+    )
+    ctx = payload["context"]
+    assert ctx["mcp_connector_id"] == 42
+    assert ctx["mcp_server"] == "Linear (acme)"
+    assert ctx["tool_description"] == "Run linear_create_issue via MCP."
+
+
+def test_payload_omits_tool_fields_when_tool_is_none():
+    payload = build_permission_ask_payload(
+        tool_name="rm",
+        args={"path": "/tmp/x"},
+        patterns=["rm"],
+        rules=[_ask_rule("rm")],
+        tool=None,
+    )
+    ctx = payload["context"]
+    assert "mcp_connector_id" not in ctx
+    assert "mcp_server" not in ctx
+    assert "tool_description" not in ctx
+
+
+def test_palette_includes_approve_always_for_mcp_tool():
+    """Saving to the connector's trusted-tools list is only possible for MCP tools."""
+    tool = _make_mcp_tool(
+        name="linear_create_issue", connector_id=42, connector_name="Linear"
+    )
+    palette = build_permission_ask_payload(
+        tool_name=tool.name,
+        args={},
+        patterns=[tool.name],
+        rules=[_ask_rule(tool.name)],
+        tool=tool,
+    )["review_configs"][0]["allowed_decisions"]
+    assert "approve_always" in palette
+
+
+def test_palette_excludes_approve_always_for_native_tool():
+    """Native tools have no place to persist trust, so don't offer the button."""
+    native = StructuredTool(
+        name="rm",
+        description="Remove a file.",
+        coroutine=_noop,
+        args_schema=_NoArgs,
+        metadata={"hitl": True},
+    )
+    palette = build_permission_ask_payload(
+        tool_name=native.name,
+        args={"path": "/tmp/x"},
+        patterns=[native.name],
+        rules=[_ask_rule(native.name)],
+        tool=native,
+    )["review_configs"][0]["allowed_decisions"]
+    assert "approve_always" not in palette
+    assert palette == ["approve", "reject", "edit"]
+
+
+def test_palette_excludes_approve_always_when_tool_is_none():
+    """Without a tool object the middleware can't tell — fall back to the safe triad."""
+    palette = build_permission_ask_payload(
+        tool_name="rm",
+        args={"path": "/tmp/x"},
+        patterns=["rm"],
+        rules=[_ask_rule("rm")],
+        tool=None,
+    )["review_configs"][0]["allowed_decisions"]
+    assert palette == ["approve", "reject", "edit"]
+
+
+def test_payload_omits_falsy_mcp_metadata_fields():
+    tool = StructuredTool(
+        name="anon_tool",
+        description="",
+        coroutine=_noop,
+        args_schema=_NoArgs,
+        metadata={"mcp_connector_id": None, "mcp_connector_name": ""},
+    )
+    ctx = build_permission_ask_payload(
+        tool_name=tool.name,
+        args={},
+        patterns=[tool.name],
+        rules=[_ask_rule(tool.name)],
+        tool=tool,
+    )["context"]
+    assert "mcp_connector_id" not in ctx
+    assert "mcp_server" not in ctx
+    assert "tool_description" not in ctx
+
+
+class _State(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+
+
+def _emit_tool_call(tool_name: str, args: dict[str, Any], call_id: str):
+    def _node(_state: _State) -> dict[str, Any]:
+        return {
+            "messages": [
+                AIMessage(
+                    content="",
+                    tool_calls=[
+                        {
+                            "name": tool_name,
+                            "args": args,
+                            "id": call_id,
+                            "type": "tool_call",
+                        }
+                    ],
+                )
+            ]
+        }
+
+    return _node
+
+
+def _compile_graph_with(pm, tool_name: str, args: dict[str, Any], call_id: str):
+    def after(state: _State) -> dict[str, Any] | None:
+        return pm.after_model(state, None)  # type: ignore[arg-type]
+
+    g = StateGraph(_State)
+    g.add_node("emit", _emit_tool_call(tool_name, args, call_id))
+    g.add_node("permission", after)
+    g.add_edge(START, "emit")
+    g.add_edge("emit", "permission")
+    g.add_edge("permission", END)
+    return g.compile(checkpointer=InMemorySaver())
+
+
+@pytest.mark.asyncio
+async def test_middleware_decorates_interrupt_with_mcp_tool_metadata():
+    tool = _make_mcp_tool(
+        name="linear_create_issue", connector_id=7, connector_name="Linear"
+    )
+    pm = build_permission_mw(
+        flags=AgentFeatureFlags(enable_permission=False),
+        subagent_rulesets=[
+            Ruleset(origin="linear", rules=[_ask_rule(tool.name)]),
+        ],
+        tools=[tool],
+    )
+    assert pm is not None
+
+    graph = _compile_graph_with(pm, tool.name, {"title": "bug"}, "call-1")
+    config = {"configurable": {"thread_id": "linear-ask"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    snap = await graph.aget_state(config)
+    assert len(snap.interrupts) == 1
+    ctx = snap.interrupts[0].value["context"]
+    assert ctx["mcp_connector_id"] == 7
+    assert ctx["mcp_server"] == "Linear"
+    assert ctx["tool_description"] == "Run linear_create_issue via MCP."
+
+
+@pytest.mark.asyncio
+async def test_middleware_without_tool_index_still_asks_without_tool_fields():
+    pm = build_permission_mw(
+        flags=AgentFeatureFlags(enable_permission=False),
+        subagent_rulesets=[Ruleset(origin="kb", rules=[_ask_rule("rm")])],
+    )
+    assert pm is not None
+
+    graph = _compile_graph_with(pm, "rm", {"path": "/tmp/foo"}, "call-rm")
+    config = {"configurable": {"thread_id": "kb-rm"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    snap = await graph.aget_state(config)
+    assert len(snap.interrupts) == 1
+    ctx = snap.interrupts[0].value["context"]
+    assert "mcp_connector_id" not in ctx
+    assert "mcp_server" not in ctx
+    assert "tool_description" not in ctx
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_subagent_owned_ruleset.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_subagent_owned_ruleset.py
new file mode 100644
index 0000000000..b9ac6cd154
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_subagent_owned_ruleset.py
@@ -0,0 +1,166 @@
+"""Regression: subagent-owned rulesets layer cleanly into ``PermissionMiddleware``.
+
+The KB unification swap (legacy ``interrupt_on`` map → KB-owned ``Ruleset``
+threaded through ``build_permission_mw(subagent_rulesets=...)``) must
+produce *exactly one* interrupt per destructive FS call, in LC HITL
+shape, even when ``enable_permission`` is False — destructive ops always
+ask.
+
+We exercise the production factory and a real ``PermissionMiddleware`` on a
+real ``StateGraph`` so the test catches regressions in factory gating,
+ruleset layering, and interrupt emission together.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.shared.permissions import (
+    build_permission_mw,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+
+def _kb_style_ruleset() -> Ruleset:
+    """Mirror :data:`knowledge_base.agent.KB_RULESET` without importing it.
+
+    Importing the agent module pulls in deepagents and prompts; this test
+    is about the factory + middleware contract, not KB wiring.
+    """
+    return Ruleset(
+        origin="knowledge_base",
+        rules=[
+            Rule(permission="rm", pattern="*", action="ask"),
+            Rule(permission="rmdir", pattern="*", action="ask"),
+            Rule(permission="move_file", pattern="*", action="ask"),
+            Rule(permission="edit_file", pattern="*", action="ask"),
+            Rule(permission="write_file", pattern="*", action="ask"),
+        ],
+    )
+
+
+class _State(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+
+
+def _build_graph_with_permission_middleware(
+    *,
+    flags: AgentFeatureFlags,
+    subagent_rulesets: list[Ruleset] | None,
+    checkpointer: InMemorySaver,
+):
+    """Compile a one-node graph that emits a tool call for ``rm`` and
+    routes through the production ``PermissionMiddleware``.
+
+    The node returns an ``AIMessage`` with a tool call. The middleware's
+    ``after_model`` hook intercepts and (if a rule says ``ask``) raises
+    a ``GraphInterrupt`` carrying the LC HITL payload.
+    """
+    pm = build_permission_mw(flags=flags, subagent_rulesets=subagent_rulesets)
+
+    def node(_state: _State) -> dict[str, Any]:
+        msg = AIMessage(
+            content="",
+            tool_calls=[
+                {
+                    "name": "rm",
+                    "args": {"path": "/tmp/foo"},
+                    "id": "call-rm-1",
+                    "type": "tool_call",
+                }
+            ],
+        )
+        return {"messages": [msg]}
+
+    def after_node(state: _State) -> dict[str, Any] | None:
+        if pm is None:
+            return None
+        return pm.after_model(state, None)  # type: ignore[arg-type]
+
+    g = StateGraph(_State)
+    g.add_node("emit", node)
+    g.add_node("permission", after_node)
+    g.add_edge(START, "emit")
+    g.add_edge("emit", "permission")
+    g.add_edge("permission", END)
+    return g.compile(checkpointer=checkpointer), pm
+
+
+@pytest.mark.asyncio
+async def test_kb_ruleset_raises_one_lc_hitl_ask_for_rm_even_when_permission_flag_off():
+    """KB ruleset: ``rm`` must ask once even with ``enable_permission=False``.
+
+    This is the keystone of the unification: the legacy ``interrupt_on``
+    map fired regardless of ``enable_permission``, so the migrated rules
+    must too. Otherwise users could opt out of "ask before rm".
+    """
+    flags = AgentFeatureFlags(enable_permission=False)
+    checkpointer = InMemorySaver()
+    graph, pm = _build_graph_with_permission_middleware(
+        flags=flags,
+        subagent_rulesets=[_kb_style_ruleset()],
+        checkpointer=checkpointer,
+    )
+    assert pm is not None, "subagent rulesets must force the middleware on"
+
+    config = {"configurable": {"thread_id": "kb-cloud-rm"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    snap = await graph.aget_state(config)
+    assert len(snap.interrupts) == 1, (
+        f"REGRESSION: KB ruleset should raise exactly one interrupt; got "
+        f"{[i.value for i in snap.interrupts]!r}"
+    )
+    payload = snap.interrupts[0].value
+    requests = payload.get("action_requests")
+    assert requests == [{"name": "rm", "args": {"path": "/tmp/foo"}}], (
+        f"interrupt must carry the rm call in LC HITL shape; got {payload!r}"
+    )
+    assert payload.get("interrupt_type") == "permission_ask"
+
+
+@pytest.mark.asyncio
+async def test_kb_ruleset_resume_with_approve_lets_rm_through():
+    """Resume with ``approve`` → call kept; the model continues normally."""
+    flags = AgentFeatureFlags(enable_permission=False)
+    checkpointer = InMemorySaver()
+    graph, _ = _build_graph_with_permission_middleware(
+        flags=flags,
+        subagent_rulesets=[_kb_style_ruleset()],
+        checkpointer=checkpointer,
+    )
+    config = {"configurable": {"thread_id": "kb-cloud-rm-approve"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    await graph.ainvoke(Command(resume={"decisions": [{"type": "approve"}]}), config)
+    final = await graph.aget_state(config)
+    assert final.next == (), "graph must complete after approve"
+    last_ai = next(
+        (m for m in reversed(final.values["messages"]) if isinstance(m, AIMessage)),
+        None,
+    )
+    assert last_ai is not None
+    assert [tc["name"] for tc in last_ai.tool_calls] == ["rm"], (
+        "approved rm call must remain on the AIMessage so the tool can run"
+    )
+
+
+@pytest.mark.asyncio
+async def test_no_subagent_rulesets_with_permission_off_skips_middleware_entirely():
+    """No subagent rulesets + permission off → factory returns ``None`` (no engine).
+
+    The legacy gating is preserved when no caller asks for rules: nothing
+    runs, nothing pauses.
+    """
+    flags = AgentFeatureFlags(enable_permission=False)
+    pm = build_permission_mw(flags=flags, subagent_rulesets=None)
+    assert pm is None
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_trusted_tool_save_on_always.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_trusted_tool_save_on_always.py
new file mode 100644
index 0000000000..47d3704ac8
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/shared/permissions/test_trusted_tool_save_on_always.py
@@ -0,0 +1,186 @@
+"""``approve_always`` decisions for MCP tools are saved via the trusted-tool saver."""
+
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.tools import StructuredTool
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command
+from pydantic import BaseModel
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.shared.permissions import (
+    build_permission_mw,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+
+class _NoArgs(BaseModel):
+    pass
+
+
+async def _noop(**_kwargs) -> str:
+    return ""
+
+
+def _ask_rule(tool_name: str) -> Rule:
+    return Rule(permission=tool_name, pattern="*", action="ask")
+
+
+def _make_mcp_tool(*, name: str, connector_id: int):
+    return StructuredTool(
+        name=name,
+        description=f"Run {name} via MCP.",
+        coroutine=_noop,
+        args_schema=_NoArgs,
+        metadata={
+            "mcp_connector_id": connector_id,
+            "mcp_connector_name": "Linear",
+            "mcp_transport": "http",
+            "hitl": True,
+        },
+    )
+
+
+def _make_native_tool(*, name: str):
+    return StructuredTool(
+        name=name,
+        description=f"Native {name}.",
+        coroutine=_noop,
+        args_schema=_NoArgs,
+        metadata={"hitl": True},
+    )
+
+
+class _State(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+
+
+def _build_graph(pm, tool_name: str):
+    def emit(_state: _State) -> dict[str, Any]:
+        return {
+            "messages": [
+                AIMessage(
+                    content="",
+                    tool_calls=[
+                        {
+                            "name": tool_name,
+                            "args": {},
+                            "id": "call-1",
+                            "type": "tool_call",
+                        }
+                    ],
+                )
+            ]
+        }
+
+    g = StateGraph(_State)
+    g.add_node("emit", emit)
+    g.add_node("permission", pm.aafter_model)  # type: ignore[arg-type]
+    g.add_edge(START, "emit")
+    g.add_edge("emit", "permission")
+    g.add_edge("permission", END)
+    return g.compile(checkpointer=InMemorySaver())
+
+
+@pytest.mark.asyncio
+async def test_approve_always_decision_saves_mcp_tool_via_callback():
+    saved: list[tuple[int, str]] = []
+
+    async def trusted_tool_saver(connector_id: int, tool_name: str) -> None:
+        saved.append((connector_id, tool_name))
+
+    tool = _make_mcp_tool(name="linear_create_issue", connector_id=7)
+    pm = build_permission_mw(
+        flags=AgentFeatureFlags(enable_permission=False),
+        subagent_rulesets=[Ruleset(origin="linear", rules=[_ask_rule(tool.name)])],
+        tools=[tool],
+        trusted_tool_saver=trusted_tool_saver,
+    )
+    assert pm is not None
+
+    graph = _build_graph(pm, tool.name)
+    config = {"configurable": {"thread_id": "approve-always-mcp"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+    await graph.ainvoke(
+        Command(resume={"decisions": [{"type": "approve_always"}]}), config
+    )
+
+    assert saved == [(7, "linear_create_issue")]
+
+
+@pytest.mark.asyncio
+async def test_once_decision_does_not_save():
+    saved: list[tuple[int, str]] = []
+
+    async def trusted_tool_saver(connector_id: int, tool_name: str) -> None:
+        saved.append((connector_id, tool_name))
+
+    tool = _make_mcp_tool(name="linear_create_issue", connector_id=7)
+    pm = build_permission_mw(
+        flags=AgentFeatureFlags(enable_permission=False),
+        subagent_rulesets=[Ruleset(origin="linear", rules=[_ask_rule(tool.name)])],
+        tools=[tool],
+        trusted_tool_saver=trusted_tool_saver,
+    )
+    assert pm is not None
+
+    graph = _build_graph(pm, tool.name)
+    config = {"configurable": {"thread_id": "once-mcp"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+    await graph.ainvoke(Command(resume={"decisions": [{"type": "approve"}]}), config)
+
+    assert saved == []
+
+
+@pytest.mark.asyncio
+async def test_approve_always_decision_for_native_tool_skips_save():
+    """Native tools have no ``mcp_connector_id`` so there is nowhere to persist trust."""
+    saved: list[tuple[int, str]] = []
+
+    async def trusted_tool_saver(connector_id: int, tool_name: str) -> None:
+        saved.append((connector_id, tool_name))
+
+    tool = _make_native_tool(name="rm")
+    pm = build_permission_mw(
+        flags=AgentFeatureFlags(enable_permission=False),
+        subagent_rulesets=[Ruleset(origin="kb", rules=[_ask_rule(tool.name)])],
+        tools=[tool],
+        trusted_tool_saver=trusted_tool_saver,
+    )
+    assert pm is not None
+
+    graph = _build_graph(pm, tool.name)
+    config = {"configurable": {"thread_id": "approve-always-native"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+    await graph.ainvoke(
+        Command(resume={"decisions": [{"type": "approve_always"}]}), config
+    )
+
+    assert saved == []
+
+
+@pytest.mark.asyncio
+async def test_approve_always_decision_with_no_saver_callback_is_a_noop():
+    """Anonymous turns build the middleware without a ``trusted_tool_saver``; must not crash."""
+    tool = _make_mcp_tool(name="linear_create_issue", connector_id=7)
+    pm = build_permission_mw(
+        flags=AgentFeatureFlags(enable_permission=False),
+        subagent_rulesets=[Ruleset(origin="linear", rules=[_ask_rule(tool.name)])],
+        tools=[tool],
+        trusted_tool_saver=None,
+    )
+    assert pm is not None
+
+    graph = _build_graph(pm, tool.name)
+    config = {"configurable": {"thread_id": "anon-approve-always"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+    await graph.ainvoke(
+        Command(resume={"decisions": [{"type": "approve_always"}]}), config
+    )
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/approvals/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/approvals/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/test_lc_hitl_wire.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/test_lc_hitl_wire.py
new file mode 100644
index 0000000000..195b1bc01e
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/approvals/self_gated/test_lc_hitl_wire.py
@@ -0,0 +1,132 @@
+"""Regression: ``request_approval`` must emit the unified LC HITL wire shape.
+
+Before this fix, self-gated approvals fired the SurfSense-specific
+``{type, action, context}`` shape which the parallel-HITL routing layer
+(``collect_pending_tool_calls``) does not recognize. In a parallel HITL
+scenario where one subagent used self-gated approvals (e.g. Gmail send)
+and another used middleware-gated approvals (e.g. Linear via
+``HumanInTheLoopMiddleware``), the routing layer would silently skip the
+self-gated interrupt and crash on resume with ``Decision count mismatch``.
+
+This test pins the wire contract by running ``request_approval`` inside a
+real ``StateGraph`` and asserting the paused parent observes the LC HITL
+shape (``action_requests``, ``review_configs``, ``interrupt_type``).
+"""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import HumanMessage
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.types import Command
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import (
+    request_approval,
+)
+
+
+class _State(TypedDict, total=False):
+    messages: list
+    final_decision_type: str
+    final_params: dict
+
+
+def _build_graph_calling_request_approval(checkpointer: InMemorySaver):
+    """A real graph whose only node delegates to ``request_approval``."""
+
+    def gate_node(_state):
+        result = request_approval(
+            action_type="gmail_email_send",
+            tool_name="send_gmail_email",
+            params={"to": "alice@example.com", "subject": "hi"},
+            context={"account": "alice@gmail.com"},
+        )
+        return {
+            "final_decision_type": result.decision_type,
+            "final_params": result.params,
+        }
+
+    g = StateGraph(_State)
+    g.add_node("gate", gate_node)
+    g.add_edge(START, "gate")
+    g.add_edge("gate", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+@pytest.mark.asyncio
+async def test_paused_interrupt_uses_lc_hitl_action_requests_shape():
+    """The paused interrupt must speak the langchain HITL standard shape."""
+    checkpointer = InMemorySaver()
+    graph = _build_graph_calling_request_approval(checkpointer)
+    config = {"configurable": {"thread_id": "self-gated-wire"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    snap = await graph.aget_state(config)
+    assert len(snap.interrupts) == 1, (
+        f"expected one paused interrupt, got {len(snap.interrupts)}"
+    )
+    value = snap.interrupts[0].value
+    assert isinstance(value, dict)
+
+    # Standard LC HITL fields the routing layer reads.
+    assert value.get("action_requests") == [
+        {
+            "name": "send_gmail_email",
+            "args": {"to": "alice@example.com", "subject": "hi"},
+        }
+    ], (
+        "REGRESSION: self-gated approval reverted to legacy SurfSense shape; "
+        f"got {value!r}"
+    )
+    assert value.get("review_configs") == [
+        {
+            "action_name": "send_gmail_email",
+            "allowed_decisions": ["approve", "reject", "edit"],
+        }
+    ]
+    assert value.get("interrupt_type") == "gmail_email_send", (
+        "FE card discriminator must travel as ``interrupt_type``."
+    )
+    assert value.get("context") == {"account": "alice@gmail.com"}
+
+
+@pytest.mark.asyncio
+async def test_resume_with_lc_envelope_returns_hitl_result_with_edited_args():
+    """Edit reply via the LC envelope must round-trip into ``HITLResult.params``."""
+    checkpointer = InMemorySaver()
+    graph = _build_graph_calling_request_approval(checkpointer)
+    config = {"configurable": {"thread_id": "self-gated-resume"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    edited = {"to": "alice@example.com", "subject": "EDITED"}
+    await graph.ainvoke(
+        Command(
+            resume={
+                "decisions": [
+                    {"type": "edit", "edited_action": {"args": {"subject": "EDITED"}}}
+                ]
+            }
+        ),
+        config,
+    )
+    final = await graph.aget_state(config)
+    assert final.values.get("final_decision_type") == "edit"
+    assert final.values.get("final_params") == edited
+
+
+@pytest.mark.asyncio
+async def test_reject_envelope_returns_rejected_hitl_result():
+    """Reject reply must surface as ``HITLResult.rejected=True`` without invoking the tool."""
+    checkpointer = InMemorySaver()
+    graph = _build_graph_calling_request_approval(checkpointer)
+    config = {"configurable": {"thread_id": "self-gated-reject"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    await graph.ainvoke(
+        Command(resume={"decisions": [{"type": "reject", "feedback": "no"}]}),
+        config,
+    )
+    final = await graph.aget_state(config)
+    assert final.values.get("final_decision_type") == "reject"
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/wire/__init__.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/wire/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/wire/test_hitl_wire.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/wire/test_hitl_wire.py
new file mode 100644
index 0000000000..c06f9a6272
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/hitl/wire/test_hitl_wire.py
@@ -0,0 +1,170 @@
+"""Unit contract for the unified LC HITL wire format.
+
+Both the self-gated approval primitive (``request_approval``) and the
+middleware-gated permission ask (``PermissionMiddleware``) must serialize
+to the same wire shape so the parallel-HITL routing layer
+(``collect_pending_tool_calls`` + ``slice_decisions_by_tool_call`` +
+``build_lg_resume_map``) sees one format.
+
+These tests pin the shape:
+
+- Builder always emits ``action_requests`` (1 entry) + ``review_configs``
+  + ``interrupt_type``; ``context`` rides through verbatim when present.
+- Parser tolerates the standard LC envelope, bare scalar strings, and
+  unrecognized shapes (failing closed to ``reject``).
+- Edited args round-trip through both nested (``edited_action.args``) and
+  flat (``args``) shapes without inventing values for the empty case.
+"""
+
+from __future__ import annotations
+
+from app.agents.multi_agent_chat.subagents.shared.hitl.wire import (
+    LC_DECISION_APPROVE,
+    LC_DECISION_EDIT,
+    LC_DECISION_REJECT,
+    SURFSENSE_DECISION_APPROVE_ALWAYS,
+    build_lc_hitl_payload,
+    parse_lc_envelope,
+)
+
+
+class TestBuildLcHitlPayload:
+    def test_minimal_payload_has_one_action_request_and_one_review_config(self):
+        payload = build_lc_hitl_payload(
+            tool_name="send_email",
+            args={"to": "x@y.z"},
+            allowed_decisions=[LC_DECISION_APPROVE, LC_DECISION_REJECT],
+            interrupt_type="gmail_email_send",
+        )
+        assert payload["action_requests"] == [
+            {"name": "send_email", "args": {"to": "x@y.z"}}
+        ]
+        assert payload["review_configs"] == [
+            {
+                "action_name": "send_email",
+                "allowed_decisions": [LC_DECISION_APPROVE, LC_DECISION_REJECT],
+            }
+        ]
+        assert payload["interrupt_type"] == "gmail_email_send"
+        assert "context" not in payload, "context must be omitted when not provided"
+
+    def test_none_args_normalized_to_empty_dict(self):
+        """FE expects a stable shape; ``None`` would crash card rendering."""
+        payload = build_lc_hitl_payload(
+            tool_name="ping",
+            args=None,  # type: ignore[arg-type]
+            allowed_decisions=[LC_DECISION_APPROVE],
+            interrupt_type="self_gated",
+        )
+        assert payload["action_requests"][0]["args"] == {}
+
+    def test_description_attached_only_when_provided(self):
+        with_desc = build_lc_hitl_payload(
+            tool_name="t",
+            args={},
+            allowed_decisions=[LC_DECISION_APPROVE],
+            interrupt_type="x",
+            description="please review",
+        )
+        without = build_lc_hitl_payload(
+            tool_name="t",
+            args={},
+            allowed_decisions=[LC_DECISION_APPROVE],
+            interrupt_type="x",
+        )
+        assert with_desc["action_requests"][0]["description"] == "please review"
+        assert "description" not in without["action_requests"][0]
+
+    def test_context_passed_through_verbatim(self):
+        ctx = {"patterns": ["rm/*"], "rules": [], "always": ["rm/*"]}
+        payload = build_lc_hitl_payload(
+            tool_name="rm",
+            args={"path": "/tmp"},
+            allowed_decisions=[
+                LC_DECISION_APPROVE,
+                LC_DECISION_REJECT,
+                SURFSENSE_DECISION_APPROVE_ALWAYS,
+            ],
+            interrupt_type="permission_ask",
+            context=ctx,
+        )
+        assert payload["context"] == ctx
+
+    def test_allowed_decisions_list_is_copied_not_aliased(self):
+        """A caller mutating their original list must not corrupt the payload."""
+        decisions = [LC_DECISION_APPROVE]
+        payload = build_lc_hitl_payload(
+            tool_name="t",
+            args={},
+            allowed_decisions=decisions,
+            interrupt_type="x",
+        )
+        decisions.append(LC_DECISION_REJECT)
+        assert payload["review_configs"][0]["allowed_decisions"] == [
+            LC_DECISION_APPROVE
+        ]
+
+
+class TestParseLcEnvelope:
+    def test_standard_lc_envelope_returns_typed_decision(self):
+        parsed = parse_lc_envelope({"decisions": [{"type": "approve"}]})
+        assert parsed.decision_type == "approve"
+        assert parsed.edited_args is None
+        assert parsed.message is None
+
+    def test_bare_scalar_string_passes_through_lowercased(self):
+        assert parse_lc_envelope("APPROVE_ALWAYS").decision_type == "approve_always"
+        assert parse_lc_envelope("once").decision_type == "once"
+
+    def test_non_dict_non_string_collapses_to_reject(self):
+        """Failing closed: ambiguous input must never proceed."""
+        assert parse_lc_envelope(42).decision_type == "reject"
+        assert parse_lc_envelope(None).decision_type == "reject"
+        assert parse_lc_envelope(["bogus"]).decision_type == "reject"
+
+    def test_missing_decision_type_collapses_to_reject(self):
+        assert parse_lc_envelope({"decisions": [{}]}).decision_type == "reject"
+        assert parse_lc_envelope({"foo": "bar"}).decision_type == "reject"
+
+    def test_edit_extracts_nested_args(self):
+        parsed = parse_lc_envelope(
+            {
+                "decisions": [
+                    {
+                        "type": LC_DECISION_EDIT,
+                        "edited_action": {"args": {"to": "edited@y.z"}},
+                    }
+                ]
+            }
+        )
+        assert parsed.decision_type == "edit"
+        assert parsed.edited_args == {"to": "edited@y.z"}
+
+    def test_edit_falls_back_to_flat_args(self):
+        parsed = parse_lc_envelope(
+            {"decisions": [{"type": "edit", "args": {"k": "v"}}]}
+        )
+        assert parsed.edited_args == {"k": "v"}
+
+    def test_edit_with_empty_args_yields_none_edited(self):
+        """Empty edited_args means "no edits" — caller treats as plain approve."""
+        parsed = parse_lc_envelope(
+            {"decisions": [{"type": "edit", "edited_action": {"args": {}}}]}
+        )
+        assert parsed.edited_args is None
+
+    def test_message_picked_from_either_feedback_or_message_field(self):
+        with_feedback = parse_lc_envelope(
+            {"decisions": [{"type": "reject", "feedback": "no thanks"}]}
+        )
+        with_message = parse_lc_envelope(
+            {"decisions": [{"type": "reject", "message": "no thanks"}]}
+        )
+        assert with_feedback.message == "no thanks"
+        assert with_message.message == "no thanks"
+
+    def test_blank_message_treated_as_absent(self):
+        parsed = parse_lc_envelope(
+            {"decisions": [{"type": "reject", "message": "   "}]}
+        )
+        assert parsed.message is None
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/test_subagent_builder.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/test_subagent_builder.py
new file mode 100644
index 0000000000..062ea92ec3
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/subagents/shared/test_subagent_builder.py
@@ -0,0 +1,243 @@
+"""Subagent resilience contract: ``middleware_stack`` reaches the agent chain."""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator, Iterator
+from typing import Any
+
+import pytest
+from langchain.agents import create_agent
+from langchain.agents.middleware import ModelFallbackMiddleware
+from langchain_core.callbacks import (
+    AsyncCallbackManagerForLLMRun,
+    CallbackManagerForLLMRun,
+)
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.language_models.fake_chat_models import (
+    FakeMessagesListChatModel,
+)
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+
+from app.agents.multi_agent_chat.middleware.shared.permissions.middleware.core import (
+    PermissionMiddleware,
+)
+from app.agents.multi_agent_chat.subagents.shared.subagent_builder import (
+    pack_subagent,
+)
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.permissions import Rule, Ruleset, evaluate
+
+
+class RateLimitError(Exception):
+    """Name matches the scoped-fallback eligibility allowlist."""
+
+
+class _AlwaysFailingChatModel(BaseChatModel):
+    @property
+    def _llm_type(self) -> str:
+        return "always-failing-test-model"
+
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: CallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        msg = "primary llm exploded"
+        raise RateLimitError(msg)
+
+    async def _agenerate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: AsyncCallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        msg = "primary llm exploded"
+        raise RateLimitError(msg)
+
+    def _stream(self, *args: Any, **kwargs: Any) -> Iterator[ChatGeneration]:
+        msg = "primary llm exploded"
+        raise RateLimitError(msg)
+
+    async def _astream(
+        self, *args: Any, **kwargs: Any
+    ) -> AsyncIterator[ChatGeneration]:
+        msg = "primary llm exploded"
+        raise RateLimitError(msg)
+        yield  # pragma: no cover - unreachable, satisfies async generator typing
+
+
+@pytest.mark.asyncio
+async def test_subagent_recovers_when_primary_llm_fails():
+    """Fallback in ``middleware_stack`` must finish the turn when primary raises."""
+    primary = _AlwaysFailingChatModel()
+    fallback = FakeMessagesListChatModel(
+        responses=[AIMessage(content="recovered via fallback")]
+    )
+
+    result = pack_subagent(
+        name="resilience_test",
+        description="test subagent",
+        system_prompt="be helpful",
+        tools=[],
+        ruleset=Ruleset(origin="resilience_test", rules=[]),
+        dependencies={"flags": AgentFeatureFlags()},
+        model=primary,
+        middleware_stack={"fallback": ModelFallbackMiddleware(fallback)},
+    )
+    spec = result.spec
+
+    agent = create_agent(
+        model=spec["model"],
+        tools=spec["tools"],
+        middleware=spec["middleware"],
+        system_prompt=spec["system_prompt"],
+    )
+
+    result = await agent.ainvoke({"messages": [HumanMessage(content="hi")]})
+
+    final = result["messages"][-1]
+    assert isinstance(final, AIMessage)
+    assert final.content == "recovered via fallback"
+
+
+def _extract_permission_mw(spec) -> PermissionMiddleware:
+    """Find the lone PermissionMiddleware in a subagent's middleware list."""
+    matches = [m for m in spec["middleware"] if isinstance(m, PermissionMiddleware)]
+    assert len(matches) == 1, "expected exactly one PermissionMiddleware"
+    return matches[0]
+
+
+def test_user_allowlist_overrides_coded_ask_via_last_match_wins():
+    """User ``allow`` rules promoted via "Always Allow" must beat coded ``ask`` rules."""
+    coded = Ruleset(
+        origin="connector",
+        rules=[Rule(permission="save_issue", pattern="*", action="ask")],
+    )
+    user_allowlist = Ruleset(
+        origin="user_allowlist:connector",
+        rules=[Rule(permission="save_issue", pattern="*", action="allow")],
+    )
+
+    result = pack_subagent(
+        name="connector",
+        description="test connector",
+        system_prompt="x",
+        tools=[],
+        ruleset=coded,
+        dependencies={
+            "flags": AgentFeatureFlags(),
+            "user_allowlist_by_subagent": {"connector": user_allowlist},
+        },
+    )
+
+    mw = _extract_permission_mw(result.spec)
+    decided = evaluate("save_issue", "*", *mw._static_rulesets)
+    assert decided.action == "allow", (
+        f"user_allowlist must override coded ask; got {decided!r}"
+    )
+
+
+def test_coded_ask_stays_when_user_allowlist_unrelated():
+    """User ``allow`` rules for OTHER tools must not leak into asked-tools."""
+    coded = Ruleset(
+        origin="connector",
+        rules=[Rule(permission="delete_issue", pattern="*", action="ask")],
+    )
+    user_allowlist = Ruleset(
+        origin="user_allowlist:connector",
+        rules=[Rule(permission="save_issue", pattern="*", action="allow")],
+    )
+
+    result = pack_subagent(
+        name="connector",
+        description="test",
+        system_prompt="x",
+        tools=[],
+        ruleset=coded,
+        dependencies={
+            "flags": AgentFeatureFlags(),
+            "user_allowlist_by_subagent": {"connector": user_allowlist},
+        },
+    )
+
+    mw = _extract_permission_mw(result.spec)
+    decided = evaluate("delete_issue", "*", *mw._static_rulesets)
+    assert decided.action == "ask"
+
+
+def test_missing_user_allowlist_keeps_coded_behaviour():
+    """``dependencies`` without ``user_allowlist_by_subagent`` is the common case."""
+    coded = Ruleset(
+        origin="connector",
+        rules=[Rule(permission="save_issue", pattern="*", action="ask")],
+    )
+
+    result = pack_subagent(
+        name="connector",
+        description="test",
+        system_prompt="x",
+        tools=[],
+        ruleset=coded,
+        dependencies={"flags": AgentFeatureFlags()},
+    )
+
+    mw = _extract_permission_mw(result.spec)
+    decided = evaluate("save_issue", "*", *mw._static_rulesets)
+    assert decided.action == "ask"
+
+
+def test_user_allowlist_for_different_subagent_does_not_leak():
+    """User trust for ``linear`` must not affect a ``jira`` subagent compile."""
+    coded = Ruleset(
+        origin="jira",
+        rules=[Rule(permission="save_issue", pattern="*", action="ask")],
+    )
+    linear_allowlist = Ruleset(
+        origin="user_allowlist:linear",
+        rules=[Rule(permission="save_issue", pattern="*", action="allow")],
+    )
+
+    result = pack_subagent(
+        name="jira",
+        description="test",
+        system_prompt="x",
+        tools=[],
+        ruleset=coded,
+        dependencies={
+            "flags": AgentFeatureFlags(),
+            "user_allowlist_by_subagent": {"linear": linear_allowlist},
+        },
+    )
+
+    mw = _extract_permission_mw(result.spec)
+    decided = evaluate("save_issue", "*", *mw._static_rulesets)
+    assert decided.action == "ask"
+
+
+def test_empty_user_allowlist_is_tolerated():
+    """An empty ``Ruleset`` (no rules) must not flip evaluation to allow-everything."""
+    coded = Ruleset(
+        origin="connector",
+        rules=[Rule(permission="save_issue", pattern="*", action="ask")],
+    )
+    empty = Ruleset(origin="user_allowlist:connector", rules=[])
+
+    result = pack_subagent(
+        name="connector",
+        description="test",
+        system_prompt="x",
+        tools=[],
+        ruleset=coded,
+        dependencies={
+            "flags": AgentFeatureFlags(),
+            "user_allowlist_by_subagent": {"connector": empty},
+        },
+    )
+
+    mw = _extract_permission_mw(result.spec)
+    decided = evaluate("save_issue", "*", *mw._static_rulesets)
+    assert decided.action == "ask"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/__init__.py b/surfsense_backend/tests/unit/agents/new_chat/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/tests/unit/agents/new_chat/middleware/__init__.py b/surfsense_backend/tests/unit/agents/new_chat/middleware/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/agents/new_chat/middleware/test_scoped_model_fallback.py b/surfsense_backend/tests/unit/agents/new_chat/middleware/test_scoped_model_fallback.py
new file mode 100644
index 0000000000..80b9862e7d
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/middleware/test_scoped_model_fallback.py
@@ -0,0 +1,128 @@
+"""``ScopedModelFallbackMiddleware`` triggers fallback only on provider errors."""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator, Iterator
+from typing import Any
+
+import pytest
+from langchain_core.callbacks import (
+    AsyncCallbackManagerForLLMRun,
+    CallbackManagerForLLMRun,
+)
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import AIMessage, BaseMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+
+
+class _RaisingChatModel(BaseChatModel):
+    exc_to_raise: Any
+
+    @property
+    def _llm_type(self) -> str:
+        return "raising-test-model"
+
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: CallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        raise self.exc_to_raise
+
+    async def _agenerate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: AsyncCallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        raise self.exc_to_raise
+
+    def _stream(self, *args: Any, **kwargs: Any) -> Iterator[ChatGeneration]:
+        raise self.exc_to_raise
+
+    async def _astream(
+        self, *args: Any, **kwargs: Any
+    ) -> AsyncIterator[ChatGeneration]:
+        raise self.exc_to_raise
+        yield  # pragma: no cover - unreachable
+
+
+class _RecordingChatModel(BaseChatModel):
+    response_text: str = "fallback-ok"
+    call_count: int = 0
+
+    @property
+    def _llm_type(self) -> str:
+        return "recording-test-model"
+
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: CallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        self.call_count += 1
+        return ChatResult(
+            generations=[ChatGeneration(message=AIMessage(content=self.response_text))]
+        )
+
+    async def _agenerate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: AsyncCallbackManagerForLLMRun | None = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        return self._generate(messages, stop, None, **kwargs)
+
+
+class RateLimitError(Exception):
+    """Name matches the scoped-fallback eligibility allowlist."""
+
+
+def _build_agent(primary: BaseChatModel, fallback: BaseChatModel):
+    from langchain.agents import create_agent
+
+    from app.agents.new_chat.middleware.scoped_model_fallback import (
+        ScopedModelFallbackMiddleware,
+    )
+
+    return create_agent(
+        model=primary,
+        tools=[],
+        middleware=[ScopedModelFallbackMiddleware(fallback)],
+        system_prompt="be helpful",
+    )
+
+
+@pytest.mark.asyncio
+async def test_provider_errors_trigger_fallback():
+    """Eligible exception names must drive the fallback chain."""
+    primary = _RaisingChatModel(exc_to_raise=RateLimitError("429 from provider"))
+    fallback = _RecordingChatModel(response_text="recovered")
+
+    agent = _build_agent(primary, fallback)
+    result = await agent.ainvoke({"messages": [("user", "hi")]})
+
+    final = result["messages"][-1]
+    assert isinstance(final, AIMessage)
+    assert final.content == "recovered"
+    assert fallback.call_count == 1
+
+
+@pytest.mark.asyncio
+async def test_programming_errors_propagate_without_invoking_fallback():
+    """Non-eligible exceptions must propagate; fallback must not be invoked."""
+    primary = _RaisingChatModel(exc_to_raise=KeyError("missing_state_field"))
+    fallback = _RecordingChatModel(response_text="should-never-arrive")
+
+    agent = _build_agent(primary, fallback)
+
+    with pytest.raises(KeyError, match="missing_state_field"):
+        await agent.ainvoke({"messages": [("user", "hi")]})
+
+    assert fallback.call_count == 0
diff --git a/surfsense_backend/tests/unit/agents/new_chat/prompts/__init__.py b/surfsense_backend/tests/unit/agents/new_chat/prompts/__init__.py
new file mode 100644
index 0000000000..a92d371bdb
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/prompts/__init__.py
@@ -0,0 +1 @@
+"""__init__ stub so pytest discovers the prompts test module."""
diff --git a/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py b/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py
new file mode 100644
index 0000000000..36fe04aa2b
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py
@@ -0,0 +1,295 @@
+"""Tests for the prompt fragment composer."""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+
+import pytest
+
+from app.agents.new_chat.prompts.composer import (
+    ALL_TOOL_NAMES_ORDERED,
+    compose_system_prompt,
+    detect_provider_variant,
+)
+from app.db import ChatVisibility
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture
+def fixed_today() -> datetime:
+    return datetime(2025, 6, 1, 12, 0, tzinfo=UTC)
+
+
+class TestProviderVariantDetection:
+    @pytest.mark.parametrize(
+        "model_name,expected",
+        [
+            # GPT-4 family routes to "classic" (autonomous-persistence style)
+            ("openai:gpt-4o-mini", "openai_classic"),
+            ("openai:gpt-4-turbo", "openai_classic"),
+            # GPT-5 / o-series route to "reasoning" (channel-aware pragmatic)
+            ("openai:gpt-5", "openai_reasoning"),
+            ("openai:o1-preview", "openai_reasoning"),
+            ("openai:o3-mini", "openai_reasoning"),
+            # Codex family beats reasoning (more specific). Mirrors OpenCode
+            # ``system.ts`` — ``gpt-*-codex`` gets the code-purist prompt.
+            ("openai:gpt-5-codex", "openai_codex"),
+            ("openai:gpt-codex", "openai_codex"),
+            ("openai:codex-mini", "openai_codex"),
+            # Anthropic + Google
+            ("anthropic:claude-3-5-sonnet", "anthropic"),
+            ("anthropic/claude-opus-4", "anthropic"),
+            ("google:gemini-2.0-flash", "google"),
+            ("vertex:gemini-1.5-pro", "google"),
+            # Newly-covered families
+            ("moonshot:kimi-k2", "kimi"),
+            ("openrouter:moonshot/kimi-k2.5", "kimi"),
+            ("xai:grok-2", "grok"),
+            ("openrouter:x-ai/grok-3", "grok"),
+            ("openai:deepseek-v3", "deepseek"),
+            ("deepseek:deepseek-r1", "deepseek"),
+            # Unknown families fall back to default (no provider block emitted)
+            ("groq:mixtral-8x7b", "default"),
+            ("together:llama-3.1-70b", "default"),
+            (None, "default"),
+            ("", "default"),
+        ],
+    )
+    def test_detection(self, model_name: str | None, expected: str) -> None:
+        assert detect_provider_variant(model_name) == expected
+
+    def test_codex_takes_precedence_over_reasoning(self) -> None:
+        """Regression guard: ``gpt-5-codex`` must NOT match the generic
+        ``gpt-5`` reasoning regex first. Codex is the more specialised
+        prompt and mirrors OpenCode's dispatch order.
+        """
+        from app.agents.new_chat.prompts.composer import detect_provider_variant
+
+        assert detect_provider_variant("openai:gpt-5-codex") == "openai_codex"
+        assert detect_provider_variant("openai:gpt-5") == "openai_reasoning"
+
+
+class TestCompose:
+    def test_default_prompt_has_required_blocks(self, fixed_today: datetime) -> None:
+        prompt = compose_system_prompt(today=fixed_today)
+        # System instruction wrapper
+        assert "<system_instruction>" in prompt
+        assert "</system_instruction>" in prompt
+        # Date interpolated
+        assert "2025-06-01" in prompt
+        # Core policy blocks present
+        assert "<knowledge_base_only_policy>" in prompt
+        assert "<tool_routing>" in prompt
+        assert "<parameter_resolution>" in prompt
+        assert "<memory_protocol>" in prompt
+        # Tools
+        assert "<tools>" in prompt
+        assert "</tools>" in prompt
+        # Citations on by default
+        assert "<citation_instructions>" in prompt
+        assert "[citation:chunk_id]" in prompt
+
+    def test_team_visibility_uses_team_variants(self, fixed_today: datetime) -> None:
+        prompt = compose_system_prompt(
+            today=fixed_today,
+            thread_visibility=ChatVisibility.SEARCH_SPACE,
+        )
+        # Team-specific phrasing in the agent block
+        assert "team space" in prompt
+        # Memory protocol mentions team
+        assert "team" in prompt
+        # Should NOT mention the user-only memory phrasing
+        assert "personal knowledge base" not in prompt
+
+    def test_private_visibility_uses_private_variants(
+        self, fixed_today: datetime
+    ) -> None:
+        prompt = compose_system_prompt(
+            today=fixed_today,
+            thread_visibility=ChatVisibility.PRIVATE,
+        )
+        assert "personal knowledge base" in prompt
+        # Should NOT mention the team-specific phrasing about prefixed authors
+        assert "[DisplayName of the author]" not in prompt
+
+    def test_citations_disabled_swaps_block(self, fixed_today: datetime) -> None:
+        prompt_on = compose_system_prompt(today=fixed_today, citations_enabled=True)
+        prompt_off = compose_system_prompt(today=fixed_today, citations_enabled=False)
+        assert "Citations are DISABLED" in prompt_off
+        assert "Citations are DISABLED" not in prompt_on
+        assert "[citation:chunk_id]" in prompt_on
+
+    def test_enabled_tool_filter_only_includes_listed_tools(
+        self, fixed_today: datetime
+    ) -> None:
+        prompt = compose_system_prompt(
+            today=fixed_today,
+            enabled_tool_names={"web_search", "scrape_webpage"},
+        )
+        assert "web_search:" in prompt or "- web_search:" in prompt
+        assert "scrape_webpage:" in prompt or "- scrape_webpage:" in prompt
+        # Excluded tools should NOT appear in tool listing
+        assert "generate_podcast:" not in prompt
+        assert "generate_image:" not in prompt
+
+    def test_disabled_tool_note_is_appended(self, fixed_today: datetime) -> None:
+        prompt = compose_system_prompt(
+            today=fixed_today,
+            enabled_tool_names={"web_search"},
+            disabled_tool_names={"generate_image", "generate_podcast"},
+        )
+        assert "DISABLED TOOLS (by user):" in prompt
+        assert "Generate Image" in prompt
+        assert "Generate Podcast" in prompt
+
+    def test_mcp_routing_block_emits_when_provided(self, fixed_today: datetime) -> None:
+        prompt = compose_system_prompt(
+            today=fixed_today,
+            mcp_connector_tools={"My GitLab": ["gitlab_search", "gitlab_create_mr"]},
+        )
+        assert "<mcp_tool_routing>" in prompt
+        assert "My GitLab" in prompt
+        assert "gitlab_search" in prompt
+
+    def test_mcp_routing_block_absent_when_no_servers(
+        self, fixed_today: datetime
+    ) -> None:
+        prompt = compose_system_prompt(today=fixed_today, mcp_connector_tools={})
+        assert "<mcp_tool_routing>" not in prompt
+
+    def test_provider_block_renders_when_anthropic(self, fixed_today: datetime) -> None:
+        prompt = compose_system_prompt(
+            today=fixed_today, model_name="anthropic:claude-3-5-sonnet"
+        )
+        assert "<provider_hints>" in prompt
+        assert "Anthropic" in prompt or "Claude" in prompt
+
+    def test_provider_block_absent_for_default(self, fixed_today: datetime) -> None:
+        prompt = compose_system_prompt(today=fixed_today, model_name="custom:foo")
+        assert "<provider_hints>" not in prompt
+
+    @pytest.mark.parametrize(
+        "model_name,expected_marker",
+        [
+            # Each marker is a unique-ish phrase from the corresponding fragment.
+            # If a fragment is renamed/rewritten such that the marker is gone,
+            # update both the fragment and this test deliberately.
+            ("openai:gpt-5-codex", "Codex-class"),
+            ("openai:gpt-5", "OpenAI reasoning model"),
+            ("openai:gpt-4o", "classic OpenAI chat model"),
+            ("anthropic:claude-3-5-sonnet", "Anthropic Claude"),
+            ("google:gemini-2.0-flash", "Google Gemini"),
+            ("moonshot:kimi-k2", "Moonshot Kimi"),
+            ("xai:grok-2", "xAI Grok"),
+            ("deepseek:deepseek-r1", "DeepSeek"),
+        ],
+    )
+    def test_each_known_variant_renders_with_its_marker(
+        self,
+        fixed_today: datetime,
+        model_name: str,
+        expected_marker: str,
+    ) -> None:
+        """Every supported variant must produce a ``<provider_hints>`` block
+        containing its identifying marker. This pins the dispatch + the
+        on-disk fragments together so a missing/renamed file is caught
+        immediately.
+        """
+        prompt = compose_system_prompt(today=fixed_today, model_name=model_name)
+        assert "<provider_hints>" in prompt, (
+            f"variant for {model_name!r} did not emit a provider_hints block; "
+            "the corresponding providers/<variant>.md may be missing"
+        )
+        assert expected_marker in prompt, (
+            f"variant for {model_name!r} emitted hints but lacked the "
+            f"expected marker {expected_marker!r} — the fragment may have "
+            "drifted from the dispatch table"
+        )
+
+    def test_provider_blocks_are_byte_stable_across_calls(
+        self, fixed_today: datetime
+    ) -> None:
+        """Cache-stability guard: same model id → byte-identical prompt."""
+        a = compose_system_prompt(today=fixed_today, model_name="moonshot:kimi-k2")
+        b = compose_system_prompt(today=fixed_today, model_name="moonshot:kimi-k2")
+        assert a == b
+
+    def test_custom_system_instructions_override_default(
+        self, fixed_today: datetime
+    ) -> None:
+        custom = "You are a custom assistant. Today is {resolved_today}."
+        prompt = compose_system_prompt(
+            today=fixed_today, custom_system_instructions=custom
+        )
+        assert "You are a custom assistant. Today is 2025-06-01." in prompt
+        # Default block should NOT be present
+        assert "<knowledge_base_only_policy>" not in prompt
+
+    def test_provider_hints_render_with_custom_system_instructions(
+        self, fixed_today: datetime
+    ) -> None:
+        """Regression guard for the always-append decision: provider hints
+        append AFTER a custom system prompt.
+
+        Provider hints are stylistic nudges (parallel tool-call rules,
+        formatting guidance, etc.) that help the model regardless of
+        what the system instructions say. Suppressing them when a
+        custom prompt is set would partially defeat the per-family
+        prompt machinery.
+        """
+        prompt = compose_system_prompt(
+            today=fixed_today,
+            custom_system_instructions="You are a custom assistant.",
+            model_name="anthropic/claude-3-5-sonnet",
+        )
+        assert "You are a custom assistant." in prompt
+        assert "<provider_hints>" in prompt
+        # The custom prompt must come BEFORE the provider hints so the
+        # user's framing isn't drowned out by the stylistic nudges.
+        assert prompt.index("You are a custom assistant.") < prompt.index(
+            "<provider_hints>"
+        )
+
+    def test_use_default_false_with_no_custom_yields_no_system_block(
+        self, fixed_today: datetime
+    ) -> None:
+        prompt = compose_system_prompt(
+            today=fixed_today,
+            use_default_system_instructions=False,
+        )
+        # No system_instruction wrapper but tools/citations still emitted
+        assert "<system_instruction>" not in prompt
+        assert "<tools>" in prompt
+
+    def test_all_known_tools_have_fragments(self) -> None:
+        # Soft assertion: verify that every tool in the canonical order
+        # produces non-empty content for at least one variant.
+        for tool in ALL_TOOL_NAMES_ORDERED:
+            prompt = compose_system_prompt(
+                today=datetime(2025, 1, 1, tzinfo=UTC),
+                enabled_tool_names={tool},
+            )
+            assert tool in prompt, f"tool {tool!r} missing from composed prompt"
+
+
+class TestStableOrderingForCacheStability:
+    """Regression guard: prompt cache hit-rate depends on byte-stable prefix."""
+
+    def test_composition_is_deterministic_given_same_inputs(
+        self, fixed_today: datetime
+    ) -> None:
+        a = compose_system_prompt(
+            today=fixed_today,
+            enabled_tool_names={"web_search", "scrape_webpage"},
+            mcp_connector_tools={"X": ["x_a", "x_b"]},
+        )
+        b = compose_system_prompt(
+            today=fixed_today,
+            enabled_tool_names={
+                "scrape_webpage",
+                "web_search",
+            },  # set order shouldn't matter
+            mcp_connector_tools={"X": ["x_a", "x_b"]},
+        )
+        assert a == b
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_action_log.py b/surfsense_backend/tests/unit/agents/new_chat/test_action_log.py
new file mode 100644
index 0000000000..8ef1430a9a
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_action_log.py
@@ -0,0 +1,427 @@
+"""Unit tests for ActionLogMiddleware (Tier 5.2)."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+from unittest.mock import AsyncMock, patch
+
+import pytest
+from langchain_core.messages import ToolMessage
+from langchain_core.tools import tool
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.agents.new_chat.middleware.action_log import ActionLogMiddleware
+from app.agents.new_chat.tools.registry import ToolDefinition
+
+
+@dataclass
+class _FakeRuntime:
+    """Minimal stand-in for ``ToolRuntime`` used in unit tests.
+
+    ``ActionLogMiddleware`` reads ``runtime.config['configurable']['turn_id']``
+    to populate the new ``chat_turn_id`` column (see migration 135).
+    """
+
+    config: dict[str, Any] | None = None
+
+
+@dataclass
+class _FakeRequest:
+    """Minimal stand-in for ToolCallRequest used in unit tests."""
+
+    tool_call: dict[str, Any]
+    tool: Any = None
+    state: Any = None
+    runtime: Any = None
+
+
+@tool
+def make_widget(color: str, size: int) -> str:
+    """Create a widget."""
+    return f"made {color} {size}"
+
+
+def _enabled_flags(**overrides: bool) -> AgentFeatureFlags:
+    return AgentFeatureFlags(
+        disable_new_agent_stack=False,
+        enable_action_log=True,
+        **overrides,
+    )
+
+
+def _disabled_flags() -> AgentFeatureFlags:
+    return AgentFeatureFlags(disable_new_agent_stack=False, enable_action_log=False)
+
+
+@pytest.fixture
+def patch_get_flags():
+    def _patch(flags: AgentFeatureFlags):
+        return patch(
+            "app.agents.new_chat.middleware.action_log.get_flags",
+            return_value=flags,
+        )
+
+    return _patch
+
+
+@pytest.fixture
+def fake_session_factory():
+    """Patch ``shielded_async_session`` with a recording fake."""
+    captured: dict[str, list] = {"rows": []}
+
+    class _FakeSession:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return False
+
+        def add(self, row):
+            captured["rows"].append(row)
+
+        async def commit(self):
+            captured["committed"] = True
+
+    def _factory():
+        return _FakeSession()
+
+    return captured, _factory
+
+
+class TestActionLogMiddlewareDisabled:
+    @pytest.mark.asyncio
+    async def test_no_op_when_flag_off(self, patch_get_flags) -> None:
+        mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
+        request = _FakeRequest(
+            tool_call={
+                "name": "make_widget",
+                "args": {"color": "red", "size": 1},
+                "id": "tc1",
+            }
+        )
+        handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc1"))
+        with patch_get_flags(_disabled_flags()):
+            result = await mw.awrap_tool_call(request, handler)
+        handler.assert_awaited_once()
+        assert isinstance(result, ToolMessage)
+
+    @pytest.mark.asyncio
+    async def test_no_op_when_thread_id_none(self, patch_get_flags) -> None:
+        mw = ActionLogMiddleware(thread_id=None, search_space_id=1, user_id=None)
+        request = _FakeRequest(
+            tool_call={"name": "make_widget", "args": {}, "id": "tc1"}
+        )
+        handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc1"))
+        with patch_get_flags(_enabled_flags()):
+            result = await mw.awrap_tool_call(request, handler)
+        assert isinstance(result, ToolMessage)
+
+
+class TestActionLogMiddlewarePersistence:
+    @pytest.mark.asyncio
+    async def test_writes_row_on_success(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        captured, factory = fake_session_factory
+        mw = ActionLogMiddleware(thread_id=42, search_space_id=7, user_id="u1")
+        request = _FakeRequest(
+            tool_call={
+                "name": "make_widget",
+                "args": {"color": "red", "size": 3},
+                "id": "tc-abc",
+            },
+            runtime=_FakeRuntime(
+                config={"configurable": {"turn_id": "42:1700000000000"}}
+            ),
+        )
+        result_msg = ToolMessage(content="ok", tool_call_id="tc-abc", id="msg-1")
+        handler = AsyncMock(return_value=result_msg)
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+        ):
+            result = await mw.awrap_tool_call(request, handler)
+
+        assert result is result_msg
+        assert len(captured["rows"]) == 1
+        row = captured["rows"][0]
+        assert row.thread_id == 42
+        assert row.search_space_id == 7
+        assert row.user_id == "u1"
+        assert row.tool_name == "make_widget"
+        assert row.args == {"color": "red", "size": 3}
+        assert row.result_id == "msg-1"
+        assert row.error is None
+        assert row.reverse_descriptor is None
+        assert row.reversible is False
+        # Migration 135: ``turn_id`` is the deprecated alias of ``tool_call_id``;
+        # ``chat_turn_id`` comes from ``runtime.config['configurable']['turn_id']``.
+        assert row.tool_call_id == "tc-abc"
+        assert row.turn_id == "tc-abc"
+        assert row.chat_turn_id == "42:1700000000000"
+
+    @pytest.mark.asyncio
+    async def test_chat_turn_id_none_when_runtime_missing(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        """``chat_turn_id`` falls back to NULL when ``runtime.config`` is absent."""
+        captured, factory = fake_session_factory
+        mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
+        request = _FakeRequest(
+            tool_call={"name": "make_widget", "args": {}, "id": "tc-1"},
+            runtime=None,
+        )
+        handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc-1"))
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+        ):
+            await mw.awrap_tool_call(request, handler)
+        row = captured["rows"][0]
+        assert row.tool_call_id == "tc-1"
+        assert row.chat_turn_id is None
+
+    @pytest.mark.asyncio
+    async def test_writes_row_on_failure_and_reraises(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        captured, factory = fake_session_factory
+        mw = ActionLogMiddleware(thread_id=42, search_space_id=7, user_id="u1")
+        request = _FakeRequest(
+            tool_call={"name": "make_widget", "args": {"color": "red"}, "id": "tc1"}
+        )
+        handler = AsyncMock(side_effect=ValueError("boom"))
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+            pytest.raises(ValueError, match="boom"),
+        ):
+            await mw.awrap_tool_call(request, handler)
+
+        assert len(captured["rows"]) == 1
+        row = captured["rows"][0]
+        assert row.tool_name == "make_widget"
+        assert row.error == {"type": "ValueError", "message": "boom"}
+        assert row.result_id is None
+
+    @pytest.mark.asyncio
+    async def test_persistence_failure_does_not_break_tool_call(
+        self, patch_get_flags
+    ) -> None:
+        """Even if the DB write blows up, the tool's result must reach the model."""
+        mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
+        request = _FakeRequest(
+            tool_call={"name": "make_widget", "args": {}, "id": "tc1"}
+        )
+        result_msg = ToolMessage(content="ok", tool_call_id="tc1")
+        handler = AsyncMock(return_value=result_msg)
+
+        def _exploding_session():
+            raise RuntimeError("DB is down")
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=_exploding_session),
+        ):
+            result = await mw.awrap_tool_call(request, handler)
+        assert result is result_msg
+
+
+class TestReverseDescriptor:
+    @pytest.mark.asyncio
+    async def test_renders_reverse_descriptor_when_tool_declares_one(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        captured, factory = fake_session_factory
+
+        def _reverse(args, result):
+            return {"tool": "delete_widget", "args": {"id": result["id"]}}
+
+        tool_def = ToolDefinition(
+            name="make_widget",
+            description="Create a widget",
+            factory=lambda deps: make_widget,
+            reverse=_reverse,
+        )
+        mw = ActionLogMiddleware(
+            thread_id=1,
+            search_space_id=1,
+            user_id="u",
+            tool_definitions={"make_widget": tool_def},
+        )
+        request = _FakeRequest(
+            tool_call={
+                "name": "make_widget",
+                "args": {"color": "blue", "size": 1},
+                "id": "tc-xyz",
+            },
+        )
+        result_msg = ToolMessage(
+            content='{"id": "widget-9"}', tool_call_id="tc-xyz", id="msg-9"
+        )
+        handler = AsyncMock(return_value=result_msg)
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+        ):
+            await mw.awrap_tool_call(request, handler)
+
+        row = captured["rows"][0]
+        assert row.reversible is True
+        assert row.reverse_descriptor == {
+            "tool": "delete_widget",
+            "args": {"id": "widget-9"},
+        }
+
+    @pytest.mark.asyncio
+    async def test_swallows_reverse_callable_errors(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        captured, factory = fake_session_factory
+
+        def _bad_reverse(args, result):
+            raise RuntimeError("reverse blew up")
+
+        tool_def = ToolDefinition(
+            name="make_widget",
+            description="Create a widget",
+            factory=lambda deps: make_widget,
+            reverse=_bad_reverse,
+        )
+        mw = ActionLogMiddleware(
+            thread_id=1,
+            search_space_id=1,
+            user_id=None,
+            tool_definitions={"make_widget": tool_def},
+        )
+        request = _FakeRequest(
+            tool_call={"name": "make_widget", "args": {}, "id": "tc1"}
+        )
+        result_msg = ToolMessage(content="ok", tool_call_id="tc1")
+        handler = AsyncMock(return_value=result_msg)
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+        ):
+            await mw.awrap_tool_call(request, handler)
+
+        row = captured["rows"][0]
+        assert row.reversible is False
+        assert row.reverse_descriptor is None
+
+    @pytest.mark.asyncio
+    async def test_no_reverse_when_tool_definition_missing(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        captured, factory = fake_session_factory
+        mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
+        request = _FakeRequest(
+            tool_call={"name": "unknown_tool", "args": {}, "id": "tc1"}
+        )
+        handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc1"))
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+        ):
+            await mw.awrap_tool_call(request, handler)
+        row = captured["rows"][0]
+        assert row.reversible is False
+
+
+class TestActionLogDispatch:
+    """Verify ``adispatch_custom_event`` fires after commit."""
+
+    @pytest.mark.asyncio
+    async def test_dispatches_action_log_event_on_success(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        _captured, factory = fake_session_factory
+        mw = ActionLogMiddleware(thread_id=42, search_space_id=7, user_id="u1")
+        request = _FakeRequest(
+            tool_call={
+                "name": "make_widget",
+                "args": {"color": "red"},
+                "id": "tc-evt",
+            },
+            runtime=_FakeRuntime(
+                config={"configurable": {"turn_id": "42:1700000000000"}}
+            ),
+        )
+        result_msg = ToolMessage(content="ok", tool_call_id="tc-evt", id="msg-42")
+        handler = AsyncMock(return_value=result_msg)
+
+        dispatch_mock = AsyncMock()
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+            patch(
+                "app.agents.new_chat.middleware.action_log.adispatch_custom_event",
+                dispatch_mock,
+            ),
+        ):
+            await mw.awrap_tool_call(request, handler)
+
+        dispatch_mock.assert_awaited_once()
+        call_args = dispatch_mock.await_args
+        assert call_args is not None
+        assert call_args.args[0] == "action_log"
+        payload = call_args.args[1]
+        assert payload["lc_tool_call_id"] == "tc-evt"
+        assert payload["chat_turn_id"] == "42:1700000000000"
+        assert payload["tool_name"] == "make_widget"
+        assert payload["reversible"] is False
+        assert payload["reverse_descriptor_present"] is False
+        assert payload["error"] is False
+
+    @pytest.mark.asyncio
+    async def test_no_dispatch_when_persistence_fails(self, patch_get_flags) -> None:
+        """If commit fails the dispatch is suppressed (no row to surface)."""
+        mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
+        request = _FakeRequest(
+            tool_call={"name": "make_widget", "args": {}, "id": "tc1"}
+        )
+        handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc1"))
+        dispatch_mock = AsyncMock()
+
+        def _exploding_session():
+            raise RuntimeError("DB is down")
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=_exploding_session),
+            patch(
+                "app.agents.new_chat.middleware.action_log.adispatch_custom_event",
+                dispatch_mock,
+            ),
+        ):
+            await mw.awrap_tool_call(request, handler)
+        dispatch_mock.assert_not_awaited()
+
+
+class TestArgsTruncation:
+    @pytest.mark.asyncio
+    async def test_huge_args_payload_is_truncated(
+        self, patch_get_flags, fake_session_factory
+    ) -> None:
+        captured, factory = fake_session_factory
+        mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
+        # Build a > 32KB string so the persisted payload triggers the truncation path.
+        huge = "x" * (40 * 1024)
+        request = _FakeRequest(
+            tool_call={"name": "make_widget", "args": {"blob": huge}, "id": "tc1"},
+        )
+        handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc1"))
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
+        ):
+            await mw.awrap_tool_call(request, handler)
+        row = captured["rows"][0]
+        assert row.args is not None
+        assert row.args.get("_truncated") is True
+        assert row.args.get("_size", 0) >= 40 * 1024
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_agent_cache.py b/surfsense_backend/tests/unit/agents/new_chat/test_agent_cache.py
new file mode 100644
index 0000000000..9b3de2db72
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_agent_cache.py
@@ -0,0 +1,268 @@
+"""Regression tests for the compiled-agent cache.
+
+Covers the cache primitive itself (TTL, LRU, in-flight de-duplication,
+build-failure non-caching) and the cache-key signature helpers that
+``create_surfsense_deep_agent`` relies on. The integration with
+``create_surfsense_deep_agent`` is covered separately by the streaming
+contract tests; this module focuses on the primitives so a regression
+in the cache implementation is caught before it reaches the agent
+factory.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from dataclasses import dataclass
+
+import pytest
+
+from app.agents.new_chat.agent_cache import (
+    flags_signature,
+    reload_for_tests,
+    stable_hash,
+    system_prompt_hash,
+    tools_signature,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# stable_hash + signature helpers
+# ---------------------------------------------------------------------------
+
+
+def test_stable_hash_is_deterministic_across_calls() -> None:
+    a = stable_hash("v1", 42, "thread-9", None, ["x", "y"])
+    b = stable_hash("v1", 42, "thread-9", None, ["x", "y"])
+    assert a == b
+
+
+def test_stable_hash_changes_when_any_part_changes() -> None:
+    base = stable_hash("v1", 42, "thread-9")
+    assert stable_hash("v1", 42, "thread-10") != base
+    assert stable_hash("v2", 42, "thread-9") != base
+    assert stable_hash("v1", 43, "thread-9") != base
+
+
+def test_tools_signature_keys_on_name_and_description_not_identity() -> None:
+    """Two tool lists with the same surface must hash identically.
+
+    The cache key MUST NOT change when the underlying ``BaseTool``
+    instances are different Python objects (a fresh request constructs
+    fresh tool instances every time). Hashing on ``(name, description)``
+    keeps the cache hot across requests with identical tool surfaces.
+    """
+
+    @dataclass
+    class FakeTool:
+        name: str
+        description: str
+
+    tools_a = [FakeTool("alpha", "does alpha"), FakeTool("beta", "does beta")]
+    tools_b = [FakeTool("beta", "does beta"), FakeTool("alpha", "does alpha")]
+    sig_a = tools_signature(
+        tools_a, available_connectors=["NOTION"], available_document_types=["FILE"]
+    )
+    sig_b = tools_signature(
+        tools_b, available_connectors=["NOTION"], available_document_types=["FILE"]
+    )
+    assert sig_a == sig_b, "tool order must not affect the signature"
+
+    # Adding a tool rotates the key.
+    tools_c = [*tools_a, FakeTool("gamma", "does gamma")]
+    sig_c = tools_signature(
+        tools_c, available_connectors=["NOTION"], available_document_types=["FILE"]
+    )
+    assert sig_c != sig_a
+
+
+def test_tools_signature_rotates_when_connector_set_changes() -> None:
+    @dataclass
+    class FakeTool:
+        name: str
+        description: str
+
+    tools = [FakeTool("a", "x")]
+    base = tools_signature(
+        tools, available_connectors=["NOTION"], available_document_types=["FILE"]
+    )
+    added = tools_signature(
+        tools,
+        available_connectors=["NOTION", "SLACK"],
+        available_document_types=["FILE"],
+    )
+    assert base != added, "adding a connector must rotate the cache key"
+
+
+def test_flags_signature_changes_when_flag_flips() -> None:
+    @dataclass(frozen=True)
+    class Flags:
+        a: bool = True
+        b: bool = False
+
+    base = flags_signature(Flags())
+    flipped = flags_signature(Flags(b=True))
+    assert base != flipped
+
+
+def test_system_prompt_hash_is_stable_and_distinct() -> None:
+    p1 = "You are a helpful assistant."
+    p2 = "You are a helpful assistant!"  # one-character delta
+    assert system_prompt_hash(p1) == system_prompt_hash(p1)
+    assert system_prompt_hash(p1) != system_prompt_hash(p2)
+
+
+# ---------------------------------------------------------------------------
+# _AgentCache: hit / miss / TTL / LRU / coalescing / failure-not-cached
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_cache_hit_returns_same_instance_on_second_call() -> None:
+    cache = reload_for_tests(maxsize=8, ttl_seconds=60.0)
+    builds = 0
+
+    async def builder() -> object:
+        nonlocal builds
+        builds += 1
+        return object()
+
+    a = await cache.get_or_build("k", builder=builder)
+    b = await cache.get_or_build("k", builder=builder)
+    assert a is b, "cache must return the SAME object across hits"
+    assert builds == 1, "builder must run exactly once"
+
+
+@pytest.mark.asyncio
+async def test_cache_different_keys_get_different_instances() -> None:
+    cache = reload_for_tests(maxsize=8, ttl_seconds=60.0)
+
+    async def builder() -> object:
+        return object()
+
+    a = await cache.get_or_build("k1", builder=builder)
+    b = await cache.get_or_build("k2", builder=builder)
+    assert a is not b
+
+
+@pytest.mark.asyncio
+async def test_cache_stale_entries_get_rebuilt() -> None:
+    # ttl=0 means every read sees the entry as immediately stale.
+    cache = reload_for_tests(maxsize=8, ttl_seconds=0.0)
+    builds = 0
+
+    async def builder() -> object:
+        nonlocal builds
+        builds += 1
+        return object()
+
+    a = await cache.get_or_build("k", builder=builder)
+    b = await cache.get_or_build("k", builder=builder)
+    assert a is not b, "stale entry must rebuild a fresh instance"
+    assert builds == 2
+
+
+@pytest.mark.asyncio
+async def test_cache_evicts_lru_when_full() -> None:
+    cache = reload_for_tests(maxsize=2, ttl_seconds=60.0)
+
+    async def builder() -> object:
+        return object()
+
+    a = await cache.get_or_build("a", builder=builder)
+    _ = await cache.get_or_build("b", builder=builder)
+    # Re-touch "a" so "b" is now the LRU victim.
+    a_again = await cache.get_or_build("a", builder=builder)
+    assert a_again is a
+    # Inserting "c" should evict "b" (LRU), not "a".
+    _ = await cache.get_or_build("c", builder=builder)
+    assert cache.stats()["size"] == 2
+
+    # Confirm "a" is still hot (no rebuild) and "b" is gone (rebuild).
+    a_hit = await cache.get_or_build("a", builder=builder)
+    assert a_hit is a, "LRU must keep the most-recently-used 'a' entry"
+
+
+@pytest.mark.asyncio
+async def test_cache_concurrent_misses_coalesce_to_single_build() -> None:
+    """Two concurrent get_or_build calls on the same key must share one builder."""
+    cache = reload_for_tests(maxsize=8, ttl_seconds=60.0)
+    build_started = asyncio.Event()
+    builds = 0
+
+    async def slow_builder() -> object:
+        nonlocal builds
+        builds += 1
+        build_started.set()
+        # Yield control so the second waiter can race against us.
+        await asyncio.sleep(0.05)
+        return object()
+
+    task_a = asyncio.create_task(cache.get_or_build("k", builder=slow_builder))
+    # Wait until the first builder has started, then race a second waiter.
+    await build_started.wait()
+    task_b = asyncio.create_task(cache.get_or_build("k", builder=slow_builder))
+
+    a, b = await asyncio.gather(task_a, task_b)
+    assert a is b, "coalesced waiters must observe the same value"
+    assert builds == 1, "concurrent cold misses must collapse to ONE build"
+
+
+@pytest.mark.asyncio
+async def test_cache_does_not_store_failed_builds() -> None:
+    """A builder that raises must NOT poison the cache.
+
+    The next caller for the same key must run the builder again (not
+    re-raise the cached exception).
+    """
+    cache = reload_for_tests(maxsize=8, ttl_seconds=60.0)
+    attempts = 0
+
+    async def flaky_builder() -> object:
+        nonlocal attempts
+        attempts += 1
+        if attempts == 1:
+            raise RuntimeError("transient")
+        return object()
+
+    with pytest.raises(RuntimeError, match="transient"):
+        await cache.get_or_build("k", builder=flaky_builder)
+
+    # Second call must retry — not re-raise the cached exception.
+    value = await cache.get_or_build("k", builder=flaky_builder)
+    assert value is not None
+    assert attempts == 2
+
+
+@pytest.mark.asyncio
+async def test_cache_invalidate_drops_entry() -> None:
+    cache = reload_for_tests(maxsize=8, ttl_seconds=60.0)
+
+    async def builder() -> object:
+        return object()
+
+    a = await cache.get_or_build("k", builder=builder)
+    assert cache.invalidate("k") is True
+    b = await cache.get_or_build("k", builder=builder)
+    assert a is not b, "post-invalidation lookup must rebuild"
+
+
+@pytest.mark.asyncio
+async def test_cache_invalidate_prefix_drops_matching_entries() -> None:
+    cache = reload_for_tests(maxsize=16, ttl_seconds=60.0)
+
+    async def builder() -> object:
+        return object()
+
+    await cache.get_or_build("user:1:thread:1", builder=builder)
+    await cache.get_or_build("user:1:thread:2", builder=builder)
+    await cache.get_or_build("user:2:thread:1", builder=builder)
+
+    removed = cache.invalidate_prefix("user:1:")
+    assert removed == 2
+    assert cache.stats()["size"] == 1
+
+    # The user:2 entry must still be hot (no rebuild).
+    survivor_value = await cache.get_or_build("user:2:thread:1", builder=builder)
+    assert survivor_value is not None
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_busy_mutex.py b/surfsense_backend/tests/unit/agents/new_chat/test_busy_mutex.py
new file mode 100644
index 0000000000..f0161f605d
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_busy_mutex.py
@@ -0,0 +1,154 @@
+"""Tests for BusyMutexMiddleware: per-thread lock + cancel event behavior."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.errors import BusyError
+from app.agents.new_chat.middleware.busy_mutex import (
+    BusyMutexMiddleware,
+    end_turn,
+    get_cancel_event,
+    is_cancel_requested,
+    manager,
+    request_cancel,
+    reset_cancel,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class _Runtime:
+    def __init__(self, thread_id: str | None) -> None:
+        self.config = {"configurable": {"thread_id": thread_id}}
+
+
+@pytest.mark.asyncio
+async def test_first_acquire_succeeds_and_release_unblocks() -> None:
+    mw = BusyMutexMiddleware()
+    runtime = _Runtime("t1")
+    await mw.abefore_agent({}, runtime)
+
+    # Lock should now be held
+    lock = manager.lock_for("t1")
+    assert lock.locked()
+
+    await mw.aafter_agent({}, runtime)
+    assert not lock.locked()
+
+
+@pytest.mark.asyncio
+async def test_second_concurrent_acquire_raises_busy() -> None:
+    mw_a = BusyMutexMiddleware()
+    mw_b = BusyMutexMiddleware()
+    runtime = _Runtime("t-conflict")
+    await mw_a.abefore_agent({}, runtime)
+
+    with pytest.raises(BusyError) as excinfo:
+        await mw_b.abefore_agent({}, runtime)
+    assert excinfo.value.request_id == "t-conflict"
+
+    await mw_a.aafter_agent({}, runtime)
+    # After release, mw_b can acquire
+    await mw_b.abefore_agent({}, runtime)
+    await mw_b.aafter_agent({}, runtime)
+
+
+@pytest.mark.asyncio
+async def test_cancel_event_lifecycle() -> None:
+    mw = BusyMutexMiddleware()
+    runtime = _Runtime("t-cancel")
+
+    await mw.abefore_agent({}, runtime)
+    event = get_cancel_event("t-cancel")
+    assert not event.is_set()
+
+    request_cancel("t-cancel")
+    assert event.is_set()
+
+    # End of turn should reset
+    await mw.aafter_agent({}, runtime)
+    assert not event.is_set()
+
+
+@pytest.mark.asyncio
+async def test_no_thread_id_raises_when_required() -> None:
+    mw = BusyMutexMiddleware(require_thread_id=True)
+    runtime = _Runtime(None)
+    with pytest.raises(BusyError):
+        await mw.abefore_agent({}, runtime)
+
+
+@pytest.mark.asyncio
+async def test_no_thread_id_skipped_when_not_required() -> None:
+    mw = BusyMutexMiddleware(require_thread_id=False)
+    runtime = _Runtime(None)
+    await mw.abefore_agent({}, runtime)
+    await mw.aafter_agent({}, runtime)
+
+
+def test_reset_cancel_idempotent() -> None:
+    # Should not raise even if event was never created
+    reset_cancel("never-seen")
+
+
+def test_request_cancel_creates_event_for_unseen_thread() -> None:
+    thread_id = "never-seen-cancel"
+    reset_cancel(thread_id)
+
+    assert request_cancel(thread_id) is True
+    assert get_cancel_event(thread_id).is_set()
+    assert is_cancel_requested(thread_id) is True
+
+
+@pytest.mark.asyncio
+async def test_end_turn_force_clears_lock_and_cancel_state() -> None:
+    thread_id = "forced-end-turn"
+    mw = BusyMutexMiddleware()
+    runtime = _Runtime(thread_id)
+
+    await mw.abefore_agent({}, runtime)
+    assert manager.lock_for(thread_id).locked()
+
+    request_cancel(thread_id)
+    assert is_cancel_requested(thread_id) is True
+
+    end_turn(thread_id)
+
+    assert not manager.lock_for(thread_id).locked()
+    assert not get_cancel_event(thread_id).is_set()
+    assert is_cancel_requested(thread_id) is False
+
+
+@pytest.mark.asyncio
+async def test_busy_mutex_stale_aafter_does_not_release_new_attempt_lock() -> None:
+    """A stale aafter call from attempt A must not unlock attempt B.
+
+    Repro flow:
+    1) attempt A acquires thread lock
+    2) forced end_turn clears A so retry can proceed
+    3) attempt B acquires same thread lock
+    4) stale attempt-A aafter runs late
+
+    Expected: B lock remains held.
+    """
+    thread_id = "stale-aafter-lock"
+    runtime = _Runtime(thread_id)
+    attempt_a = BusyMutexMiddleware()
+    attempt_b = BusyMutexMiddleware()
+
+    await attempt_a.abefore_agent({}, runtime)
+    lock = manager.lock_for(thread_id)
+    assert lock.locked()
+
+    end_turn(thread_id)
+    assert not lock.locked()
+
+    await attempt_b.abefore_agent({}, runtime)
+    assert lock.locked()
+
+    # Stale cleanup from attempt A must not release attempt B's lock.
+    await attempt_a.aafter_agent({}, runtime)
+    assert lock.locked()
+
+    await attempt_b.aafter_agent({}, runtime)
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_compaction.py b/surfsense_backend/tests/unit/agents/new_chat/test_compaction.py
new file mode 100644
index 0000000000..c6d4cc4528
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_compaction.py
@@ -0,0 +1,119 @@
+"""Tests for SurfSenseCompactionMiddleware: protected SystemMessage handling and content sanitization."""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import (
+    AIMessage,
+    HumanMessage,
+    SystemMessage,
+    ToolMessage,
+)
+
+from app.agents.new_chat.middleware.compaction import (
+    PROTECTED_SYSTEM_PREFIXES,
+    _is_protected_system_message,
+    _sanitize_message_content,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class TestIsProtectedSystemMessage:
+    @pytest.mark.parametrize("prefix", PROTECTED_SYSTEM_PREFIXES)
+    def test_each_prefix_protected(self, prefix: str) -> None:
+        msg = SystemMessage(content=f"{prefix}\nbody\n</close>")
+        assert _is_protected_system_message(msg) is True
+
+    def test_unprotected_system_message(self) -> None:
+        assert (
+            _is_protected_system_message(SystemMessage(content="random instructions"))
+            is False
+        )
+
+    def test_human_message_never_protected(self) -> None:
+        assert (
+            _is_protected_system_message(HumanMessage(content="<workspace_tree>..."))
+            is False
+        )
+
+    def test_tolerates_leading_whitespace(self) -> None:
+        msg = SystemMessage(content="   \n<priority_documents>\n...")
+        assert _is_protected_system_message(msg) is True
+
+
+class TestSanitizeMessageContent:
+    def test_returns_same_message_when_content_present(self) -> None:
+        msg = AIMessage(content="hello")
+        assert _sanitize_message_content(msg) is msg
+
+    def test_replaces_none_with_empty_string(self) -> None:
+        # Pydantic blocks ``content=None`` at construction; the real
+        # crash happens when the streaming layer mutates ``content``
+        # after-the-fact. Replicate that by force-setting on a built
+        # message.
+        msg = AIMessage(
+            content="",
+            tool_calls=[{"name": "x", "args": {}, "id": "1"}],
+        )
+        # Bypass pydantic validation to simulate the LiteLLM/Bedrock case
+        object.__setattr__(msg, "content", None)
+        sanitized = _sanitize_message_content(msg)
+        assert sanitized.content == ""
+
+
+class TestPartitionMessages:
+    """Verify the partition override surfaces protected SystemMessages
+    into ``preserved_messages`` regardless of cutoff position.
+    """
+
+    def _build_partitioner(self):
+        # Construct a thin shim — we can't easily instantiate the full
+        # SurfSenseCompactionMiddleware without a real model, but the
+        # override path needs ``_lc_helper`` to delegate to. We mock
+        # that with a simple slicing partitioner equivalent to the real one.
+        from app.agents.new_chat.middleware.compaction import (
+            SurfSenseCompactionMiddleware,
+        )
+
+        class _LcHelper:
+            @staticmethod
+            def _partition_messages(messages, cutoff):
+                return messages[:cutoff], messages[cutoff:]
+
+        class _Stub(SurfSenseCompactionMiddleware):
+            def __init__(self):
+                self._lc_helper = _LcHelper()
+
+        return _Stub()
+
+    def test_protected_system_message_preserved_even_in_summarize_half(self) -> None:
+        partitioner = self._build_partitioner()
+        protected = SystemMessage(content="<priority_documents>\n...")
+        msgs = [
+            HumanMessage(content="old human"),
+            AIMessage(content="old ai"),
+            protected,
+            ToolMessage(content="tool 1", tool_call_id="t1"),
+            HumanMessage(content="new"),
+        ]
+        # Cutoff = 4 means everything before index 4 should be summarized
+        to_summary, preserved = partitioner._partition_messages(msgs, 4)
+
+        assert protected not in to_summary
+        assert protected in preserved
+        # The non-protected old messages remain in to_summary
+        assert any(
+            isinstance(m, HumanMessage) and m.content == "old human" for m in to_summary
+        )
+
+    def test_unprotected_messages_unaffected(self) -> None:
+        partitioner = self._build_partitioner()
+        msgs = [
+            HumanMessage(content="a"),
+            HumanMessage(content="b"),
+            HumanMessage(content="c"),
+        ]
+        to_summary, preserved = partitioner._partition_messages(msgs, 2)
+        assert [m.content for m in to_summary] == ["a", "b"]
+        assert [m.content for m in preserved] == ["c"]
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_context_editing.py b/surfsense_backend/tests/unit/agents/new_chat/test_context_editing.py
new file mode 100644
index 0000000000..ba22464133
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_context_editing.py
@@ -0,0 +1,108 @@
+"""Tests for SpillToBackendEdit and SpillingContextEditingMiddleware."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+
+from app.agents.new_chat.middleware.context_editing import (
+    SpillToBackendEdit,
+    _build_spill_placeholder,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _build_history(num_pairs: int = 6) -> list[Any]:
+    """Build a long history of (AIMessage with tool_call, ToolMessage) pairs."""
+    msgs: list[Any] = [HumanMessage(content="please do many things")]
+    for i in range(num_pairs):
+        msgs.append(
+            AIMessage(
+                content="",
+                tool_calls=[
+                    {"name": f"tool_{i}", "args": {"i": i}, "id": f"call-{i}"},
+                ],
+            )
+        )
+        msgs.append(
+            ToolMessage(
+                content="x" * 5000,
+                tool_call_id=f"call-{i}",
+                name=f"tool_{i}",
+                id=f"tool-msg-{i}",
+            )
+        )
+    return msgs
+
+
+def _approx_count(messages: list[Any]) -> int:
+    """Trivial token counter: 1 token per 4 chars."""
+    total = 0
+    for msg in messages:
+        content = getattr(msg, "content", "")
+        if isinstance(content, str):
+            total += len(content) // 4
+    return total
+
+
+class TestSpillEdit:
+    def test_below_trigger_does_nothing(self) -> None:
+        edit = SpillToBackendEdit(trigger=1_000_000, keep=2)
+        msgs = _build_history(3)
+        original_lengths = [len(getattr(m, "content", "")) for m in msgs]
+        edit.apply(msgs, count_tokens=_approx_count)
+        new_lengths = [len(getattr(m, "content", "")) for m in msgs]
+        assert original_lengths == new_lengths
+        assert edit.pending_spills == []
+
+    def test_above_trigger_clears_and_records(self) -> None:
+        edit = SpillToBackendEdit(trigger=100, keep=1, path_prefix="/tool_outputs")
+        msgs = _build_history(4)
+        edit.apply(msgs, count_tokens=_approx_count)
+
+        # The most-recent ToolMessage (keep=1) should remain intact
+        tool_messages = [m for m in msgs if isinstance(m, ToolMessage)]
+        intact = tool_messages[-1]
+        assert intact.content.startswith("x")  # untouched
+
+        # Earlier ToolMessages should now contain the placeholder text
+        cleared = [
+            m
+            for m in tool_messages
+            if isinstance(m.content, str) and m.content.startswith("[cleared")
+        ]
+        assert len(cleared) >= 1
+        # And the spill list should match
+        assert len(edit.pending_spills) == len(cleared)
+
+    def test_excluded_tools_not_cleared(self) -> None:
+        edit = SpillToBackendEdit(
+            trigger=100,
+            keep=0,
+            exclude_tools=("tool_0",),
+        )
+        msgs = _build_history(4)
+        edit.apply(msgs, count_tokens=_approx_count)
+
+        first_tool = next(
+            m for m in msgs if isinstance(m, ToolMessage) and m.name == "tool_0"
+        )
+        # Excluded — untouched
+        assert first_tool.content.startswith("x")
+
+    def test_drain_clears_pending(self) -> None:
+        edit = SpillToBackendEdit(trigger=100, keep=1)
+        msgs = _build_history(4)
+        edit.apply(msgs, count_tokens=_approx_count)
+        first_drain = edit.drain_pending()
+        assert len(first_drain) > 0
+        assert edit.drain_pending() == []
+
+    def test_placeholder_format(self) -> None:
+        path = "/tool_outputs/thread-1/tool-msg-0.txt"
+        text = _build_spill_placeholder(path)
+        assert path in text
+        assert "explore" in text  # mentions the recovery agent
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_dedup_tool_calls.py b/surfsense_backend/tests/unit/agents/new_chat/test_dedup_tool_calls.py
new file mode 100644
index 0000000000..61d9b499f8
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_dedup_tool_calls.py
@@ -0,0 +1,217 @@
+"""Tests for declarative dedup_key on ToolDefinition (Tier 2.3 migration)."""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import AIMessage
+from langchain_core.tools import StructuredTool
+
+from app.agents.new_chat.middleware.dedup_tool_calls import (
+    DedupHITLToolCallsMiddleware,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _make_tool(name: str, *, dedup_key=None, hitl_dedup_key=None):
+    metadata = {}
+    if dedup_key is not None:
+        metadata["dedup_key"] = dedup_key
+    if hitl_dedup_key is not None:
+        metadata["hitl"] = True
+        metadata["hitl_dedup_key"] = hitl_dedup_key
+
+    def _fn(**kwargs):
+        return "ok"
+
+    return StructuredTool.from_function(
+        func=_fn, name=name, description="x", metadata=metadata
+    )
+
+
+def _msg(*calls: dict) -> AIMessage:
+    return AIMessage(content="", tool_calls=list(calls))
+
+
+class _Runtime:
+    pass
+
+
+def test_callable_dedup_key_takes_priority() -> None:
+    tool = _make_tool(
+        "create_doc",
+        dedup_key=lambda args: f"{args.get('parent_id')}::{args.get('title')}",
+    )
+    mw = DedupHITLToolCallsMiddleware(agent_tools=[tool])
+    state = {
+        "messages": [
+            _msg(
+                {
+                    "name": "create_doc",
+                    "args": {"parent_id": "x", "title": "y"},
+                    "id": "1",
+                },
+                {
+                    "name": "create_doc",
+                    "args": {"parent_id": "x", "title": "y"},
+                    "id": "2",
+                },
+                {
+                    "name": "create_doc",
+                    "args": {"parent_id": "x", "title": "z"},
+                    "id": "3",
+                },
+            )
+        ]
+    }
+    out = mw.after_model(state, _Runtime())
+    assert out is not None
+    new_calls = out["messages"][0].tool_calls
+    assert len(new_calls) == 2  # one duplicate dropped
+    assert {c["id"] for c in new_calls} == {"1", "3"}
+
+
+def test_string_hitl_dedup_key_still_works() -> None:
+    tool = _make_tool("send_x", hitl_dedup_key="subject")
+    mw = DedupHITLToolCallsMiddleware(agent_tools=[tool])
+    state = {
+        "messages": [
+            _msg(
+                {"name": "send_x", "args": {"subject": "Hello"}, "id": "1"},
+                {"name": "send_x", "args": {"subject": "hello"}, "id": "2"},  # case
+            )
+        ]
+    }
+    out = mw.after_model(state, _Runtime())
+    assert out is not None
+    assert len(out["messages"][0].tool_calls) == 1
+
+
+def test_no_agent_tools_means_no_dedup() -> None:
+    """After the cleanup tier removed the legacy ``_NATIVE_HITL_TOOL_DEDUP_KEYS``
+    map, dedup is purely declarative — no resolvers means no dedup runs.
+
+    Coverage for the previously hardcoded native HITL tools now lives on
+    each :class:`ToolDefinition.dedup_key` in
+    :mod:`app.agents.new_chat.tools.registry`, which is wired through to
+    ``tool.metadata`` by :func:`build_tools`.
+    """
+    mw = DedupHITLToolCallsMiddleware(agent_tools=None)
+    state = {
+        "messages": [
+            _msg(
+                {"name": "create_notion_page", "args": {"title": "X"}, "id": "1"},
+                {"name": "create_notion_page", "args": {"title": "x"}, "id": "2"},
+            )
+        ]
+    }
+    out = mw.after_model(state, _Runtime())
+    assert out is None
+
+
+def test_registry_propagates_dedup_key_to_tool_metadata() -> None:
+    """Smoke-check the wiring path that replaced the legacy native map.
+
+    ``ToolDefinition.dedup_key`` set in the registry must be copied onto
+    the constructed tool's ``metadata`` so :class:`DedupHITLToolCallsMiddleware`
+    can pick it up at agent build time.
+    """
+    from app.agents.new_chat.tools.registry import (
+        BUILTIN_TOOLS,
+        wrap_dedup_key_by_arg_name,
+    )
+
+    notion_tool_defs = [t for t in BUILTIN_TOOLS if t.name == "create_notion_page"]
+    assert notion_tool_defs, "registry should still expose create_notion_page"
+    tool_def = notion_tool_defs[0]
+    assert tool_def.dedup_key is not None
+    # Same wrapping helper used in the registry — sanity check identity
+    sample = wrap_dedup_key_by_arg_name("title")({"title": "Plan"})
+    assert sample == "plan"
+
+
+def test_full_args_dedup_keeps_distinct_calls_sharing_a_field() -> None:
+    """Regression: MCP tools (e.g. ``createJiraIssue``) used to dedup on
+    the schema's first required field, which is often the workspace /
+    cloudId — so 3 distinct issues in the same workspace collapsed to 1.
+
+    With :func:`dedup_key_full_args` only fully identical arg dicts dedup.
+    """
+    from app.agents.new_chat.middleware.dedup_tool_calls import dedup_key_full_args
+
+    tool = _make_tool("createJiraIssue", dedup_key=dedup_key_full_args)
+    mw = DedupHITLToolCallsMiddleware(agent_tools=[tool])
+    state = {
+        "messages": [
+            _msg(
+                {
+                    "name": "createJiraIssue",
+                    "args": {
+                        "cloudId": "ws.atlassian.net",
+                        "projectKey": "PROJ",
+                        "summary": "Fix login bug",
+                    },
+                    "id": "1",
+                },
+                {
+                    "name": "createJiraIssue",
+                    "args": {
+                        "cloudId": "ws.atlassian.net",
+                        "projectKey": "PROJ",
+                        "summary": "Add dark mode",
+                    },
+                    "id": "2",
+                },
+                {
+                    "name": "createJiraIssue",
+                    "args": {
+                        "cloudId": "ws.atlassian.net",
+                        "projectKey": "PROJ",
+                        "summary": "Improve perf",
+                    },
+                    "id": "3",
+                },
+            )
+        ]
+    }
+    out = mw.after_model(state, _Runtime())
+    assert out is None  # nothing dropped — all three differ in summary
+
+
+def test_full_args_dedup_drops_only_exact_duplicates() -> None:
+    from app.agents.new_chat.middleware.dedup_tool_calls import dedup_key_full_args
+
+    tool = _make_tool("createJiraIssue", dedup_key=dedup_key_full_args)
+    mw = DedupHITLToolCallsMiddleware(agent_tools=[tool])
+    args = {"cloudId": "ws.atlassian.net", "summary": "Fix bug"}
+    state = {
+        "messages": [
+            _msg(
+                {"name": "createJiraIssue", "args": args, "id": "1"},
+                {"name": "createJiraIssue", "args": dict(args), "id": "2"},
+                {
+                    "name": "createJiraIssue",
+                    "args": {**args, "summary": "Different"},
+                    "id": "3",
+                },
+            )
+        ]
+    }
+    out = mw.after_model(state, _Runtime())
+    assert out is not None
+    new_calls = out["messages"][0].tool_calls
+    assert {c["id"] for c in new_calls} == {"1", "3"}
+
+
+def test_unknown_tool_passes_through() -> None:
+    mw = DedupHITLToolCallsMiddleware(agent_tools=None)
+    state = {
+        "messages": [
+            _msg(
+                {"name": "anything_else", "args": {"x": 1}, "id": "1"},
+                {"name": "anything_else", "args": {"x": 1}, "id": "2"},
+            )
+        ]
+    }
+    out = mw.after_model(state, _Runtime())
+    assert out is None  # no dedup configured -> kept
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_default_permissions_layering.py b/surfsense_backend/tests/unit/agents/new_chat/test_default_permissions_layering.py
new file mode 100644
index 0000000000..ac6b5d95c8
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_default_permissions_layering.py
@@ -0,0 +1,128 @@
+"""Lock in the default-allow layering used by ``chat_deepagent``.
+
+The agent factory wires ``PermissionMiddleware`` with three rulesets,
+earliest -> latest:
+
+1. ``surfsense_defaults`` (single ``allow */*`` rule)
+2. ``connector_synthesized`` (deny rules for tools whose required
+   connector is missing)
+3. (future) user-defined rules from the Agent Permissions UI
+
+Without #1 every read-only built-in (``ls``, ``read_file``, ``grep``,
+``glob``, ``web_search`` …) defaulted to ``ask`` because
+``permissions.evaluate`` returns ``ask`` when no rule matches. That
+caused two production-painful behaviors:
+
+* Resume payloads with a prior reject decision bled into innocent
+  read-only tool calls, raising ``RejectedError("ls")``.
+* Mutating connector tools got *double* prompted — once via the
+  middleware ``ask`` and again via the per-tool ``interrupt()`` in
+  ``app.agents.new_chat.tools.hitl``.
+
+These tests pin the layering so a refactor that drops the default
+ruleset fails loud.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.permissions import (
+    Rule,
+    Ruleset,
+    aggregate_action,
+    evaluate_many,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _layered_rulesets(connector_denies: list[Rule]) -> list[Ruleset]:
+    """Replicate ``chat_deepagent`` layering for the test."""
+    return [
+        Ruleset(
+            rules=[Rule(permission="*", pattern="*", action="allow")],
+            origin="surfsense_defaults",
+        ),
+        Ruleset(rules=connector_denies, origin="connector_synthesized"),
+    ]
+
+
+class TestReadOnlyToolsAllowed:
+    """Read-only built-ins must NOT default to ask."""
+
+    @pytest.mark.parametrize(
+        "tool_name",
+        [
+            "ls",
+            "read_file",
+            "grep",
+            "glob",
+            "web_search",
+            "scrape_webpage",
+            "search_surfsense_docs",
+            "get_connected_accounts",
+            "write_todos",
+            "task",
+            "_noop",
+            "invalid",
+            "update_memory",
+        ],
+    )
+    def test_default_allow_covers_safe_builtin(self, tool_name: str) -> None:
+        rulesets = _layered_rulesets(connector_denies=[])
+        rules = evaluate_many(tool_name, [tool_name], *rulesets)
+        assert aggregate_action(rules) == "allow"
+
+
+class TestConnectorDenyOverridesDefaultAllow:
+    """Connector-synthesized denies must beat the default-allow rule."""
+
+    def test_missing_connector_tool_is_denied(self) -> None:
+        rulesets = _layered_rulesets(
+            connector_denies=[
+                Rule(permission="linear_create_issue", pattern="*", action="deny")
+            ]
+        )
+        rules = evaluate_many("linear_create_issue", ["linear_create_issue"], *rulesets)
+        assert aggregate_action(rules) == "deny"
+
+    def test_default_allow_still_applies_to_other_tools(self) -> None:
+        """A deny rule for one tool must not bleed onto unrelated calls."""
+        rulesets = _layered_rulesets(
+            connector_denies=[
+                Rule(permission="linear_create_issue", pattern="*", action="deny")
+            ]
+        )
+        rules = evaluate_many("ls", ["ls"], *rulesets)
+        assert aggregate_action(rules) == "allow"
+
+
+class TestUserRuleOverridesDefault:
+    """User rules layered last must override the default-allow rule."""
+
+    def test_user_ask_overrides_default_allow(self) -> None:
+        defaults = Ruleset(
+            rules=[Rule(permission="*", pattern="*", action="allow")],
+            origin="surfsense_defaults",
+        )
+        user_ruleset = Ruleset(
+            rules=[Rule(permission="ls", pattern="*", action="ask")],
+            origin="user",
+        )
+        rules = evaluate_many("ls", ["ls"], defaults, user_ruleset)
+        assert aggregate_action(rules) == "ask"
+
+    def test_user_deny_overrides_default_allow(self) -> None:
+        defaults = Ruleset(
+            rules=[Rule(permission="*", pattern="*", action="allow")],
+            origin="surfsense_defaults",
+        )
+        user_ruleset = Ruleset(
+            rules=[Rule(permission="send_*", pattern="*", action="deny")],
+            origin="user",
+        )
+        rules = evaluate_many(
+            "send_gmail_email", ["send_gmail_email"], defaults, user_ruleset
+        )
+        assert aggregate_action(rules) == "deny"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_desktop_safety_rules.py b/surfsense_backend/tests/unit/agents/new_chat/test_desktop_safety_rules.py
new file mode 100644
index 0000000000..653175eab1
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_desktop_safety_rules.py
@@ -0,0 +1,122 @@
+"""Tests for the desktop-mode safety ruleset.
+
+In desktop mode the agent operates against the user's real disk with no
+revision history, so destructive filesystem operations must require
+explicit approval. These tests pin the set of tools that get the ``ask``
+gate so it cannot silently regress.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.middleware.permission import PermissionMiddleware
+from app.agents.new_chat.permissions import (
+    Rule,
+    Ruleset,
+    aggregate_action,
+    evaluate_many,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# Mirror the ruleset built inside ``chat_deepagent._build_compiled_agent_blocking``
+# when ``filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER``. Keeping a
+# copy here means the rule contract has a focused regression test even when
+# the larger graph-build helper is hard to instantiate in unit tests.
+DESKTOP_SAFETY_RULESET = Ruleset(
+    rules=[
+        Rule(permission="rm", pattern="*", action="ask"),
+        Rule(permission="rmdir", pattern="*", action="ask"),
+        Rule(permission="move_file", pattern="*", action="ask"),
+        Rule(permission="edit_file", pattern="*", action="ask"),
+        Rule(permission="write_file", pattern="*", action="ask"),
+    ],
+    origin="desktop_safety",
+)
+
+SURFSENSE_DEFAULTS = Ruleset(
+    rules=[Rule(permission="*", pattern="*", action="allow")],
+    origin="surfsense_defaults",
+)
+
+
+def _action_for(tool_name: str, *rulesets: Ruleset) -> str:
+    rules = evaluate_many(tool_name, [tool_name], *rulesets)
+    return aggregate_action(rules)
+
+
+class TestDesktopSafetyRulesGateDestructiveOps:
+    @pytest.mark.parametrize(
+        "tool_name",
+        ["rm", "rmdir", "move_file", "edit_file", "write_file"],
+    )
+    def test_destructive_op_resolves_to_ask(self, tool_name: str) -> None:
+        # surfsense_defaults says "allow */*"; desktop_safety must override
+        # because it's layered later (last-match-wins).
+        action = _action_for(tool_name, SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET)
+        assert action == "ask", (
+            f"{tool_name} must require approval in desktop mode "
+            f"(no revert path on real disk); got {action!r}"
+        )
+
+    @pytest.mark.parametrize(
+        "tool_name",
+        ["read_file", "ls", "list_tree", "grep", "glob", "cd", "pwd", "mkdir"],
+    )
+    def test_safe_ops_remain_allowed(self, tool_name: str) -> None:
+        # Read-only and trivially-reversible tools must NOT get gated —
+        # otherwise every navigation in desktop mode pops an interrupt.
+        action = _action_for(tool_name, SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET)
+        assert action == "allow", (
+            f"{tool_name} should not be gated in desktop mode; got {action!r}"
+        )
+
+
+class TestDesktopSafetyOverridesAllowDefault:
+    def test_layer_order_last_match_wins(self) -> None:
+        # If desktop_safety is layered BEFORE surfsense_defaults, the allow
+        # default would win and the safety net would be inert. This test
+        # protects against accidentally swapping the rulesets in
+        # ``_build_compiled_agent_blocking``.
+        action = _action_for("rm", DESKTOP_SAFETY_RULESET, SURFSENSE_DEFAULTS)
+        # Layered "wrong way" — the broad allow now wins.
+        assert action == "allow"
+
+        # Correct order: defaults < desktop_safety -> ask wins.
+        action = _action_for("rm", SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET)
+        assert action == "ask"
+
+
+class TestPermissionMiddlewareIntegration:
+    def test_middleware_raises_interrupt_for_rm_in_desktop_mode(self) -> None:
+        from langchain_core.messages import AIMessage
+
+        from app.agents.new_chat.errors import RejectedError
+
+        mw = PermissionMiddleware(rulesets=[SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET])
+        # Stub the interrupt to a "reject" decision so we can assert the
+        # ask path was taken without spinning up the LangGraph runtime.
+        mw._raise_interrupt = lambda **kw: {"decision_type": "reject"}  # type: ignore[assignment]
+
+        state = {
+            "messages": [
+                AIMessage(
+                    content="",
+                    tool_calls=[
+                        {
+                            "name": "rm",
+                            "args": {"path": "/Users/me/Documents/important.docx"},
+                            "id": "tc-rm",
+                        }
+                    ],
+                )
+            ]
+        }
+
+        class _FakeRuntime:
+            config: dict = {"configurable": {"thread_id": "test"}}
+
+        with pytest.raises(RejectedError):
+            mw.after_model(state, _FakeRuntime())
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_doom_loop.py b/surfsense_backend/tests/unit/agents/new_chat/test_doom_loop.py
new file mode 100644
index 0000000000..802129bf62
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_doom_loop.py
@@ -0,0 +1,94 @@
+"""Tests for DoomLoopMiddleware signature equality detection."""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import AIMessage
+
+from app.agents.new_chat.middleware.doom_loop import DoomLoopMiddleware, _signature
+
+pytestmark = pytest.mark.unit
+
+
+def test_signature_is_stable_for_identical_args() -> None:
+    a = _signature("search", {"q": "hello", "n": 10})
+    b = _signature("search", {"n": 10, "q": "hello"})
+    assert a == b
+
+
+def test_signature_changes_with_args() -> None:
+    a = _signature("search", {"q": "hello"})
+    b = _signature("search", {"q": "world"})
+    assert a != b
+
+
+def test_signature_changes_with_name() -> None:
+    a = _signature("search", {"q": "x"})
+    b = _signature("read", {"q": "x"})
+    assert a != b
+
+
+class _FakeRuntime:
+    def __init__(self, thread_id: str | None = "thread-1") -> None:
+        self.config = {"configurable": {"thread_id": thread_id}}
+
+
+def _msg_calling(name: str, args: dict, call_id: str) -> AIMessage:
+    return AIMessage(
+        content="",
+        tool_calls=[{"name": name, "args": args, "id": call_id}],
+    )
+
+
+def test_threshold_triggers_after_n_identical_calls() -> None:
+    mw = DoomLoopMiddleware(threshold=3)
+    runtime = _FakeRuntime()
+
+    # First two calls — under threshold
+    for i in range(2):
+        out = mw.after_model(
+            {"messages": [_msg_calling("repeat", {"x": 1}, f"call-{i}")]},
+            runtime,
+        )
+        assert out is None
+
+    # Third identical call should trigger ``langgraph.types.interrupt``.
+    # In a unit-test context (no runnable graph), ``interrupt`` raises
+    # ``RuntimeError`` because ``get_config`` has nothing to bind to —
+    # we accept that as proof the interrupt path was taken (the
+    # alternative would be no exception, which would mean the loop
+    # detection never fired).
+    with pytest.raises(Exception) as excinfo:
+        mw.after_model(
+            {"messages": [_msg_calling("repeat", {"x": 1}, "call-3")]},
+            runtime,
+        )
+    name = type(excinfo.value).__name__.lower()
+    assert "interrupt" in name or "runtimeerror" in name, (
+        f"Expected an interrupt-style exception, got {name}"
+    )
+
+
+def test_does_not_trigger_when_args_differ() -> None:
+    mw = DoomLoopMiddleware(threshold=2)
+    runtime = _FakeRuntime()
+    out = mw.after_model({"messages": [_msg_calling("repeat", {"x": 1}, "1")]}, runtime)
+    assert out is None
+    out = mw.after_model({"messages": [_msg_calling("repeat", {"x": 2}, "2")]}, runtime)
+    assert out is None
+
+
+def test_separate_threads_have_independent_windows() -> None:
+    mw = DoomLoopMiddleware(threshold=2)
+    rt_a = _FakeRuntime(thread_id="A")
+    rt_b = _FakeRuntime(thread_id="B")
+
+    mw.after_model({"messages": [_msg_calling("foo", {}, "1")]}, rt_a)
+    # thread B should NOT count thread A's call
+    out = mw.after_model({"messages": [_msg_calling("foo", {}, "1")]}, rt_b)
+    assert out is None  # not yet at threshold for B
+
+
+def test_invalid_threshold_rejected() -> None:
+    with pytest.raises(ValueError):
+        DoomLoopMiddleware(threshold=1)
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_feature_flags.py b/surfsense_backend/tests/unit/agents/new_chat/test_feature_flags.py
new file mode 100644
index 0000000000..099aea8824
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_feature_flags.py
@@ -0,0 +1,136 @@
+"""Tests for the agent feature-flag system."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.feature_flags import (
+    AgentFeatureFlags,
+    reload_for_tests,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _clear_all(monkeypatch: pytest.MonkeyPatch) -> None:
+    for name in [
+        "SURFSENSE_DISABLE_NEW_AGENT_STACK",
+        "SURFSENSE_ENABLE_CONTEXT_EDITING",
+        "SURFSENSE_ENABLE_COMPACTION_V2",
+        "SURFSENSE_ENABLE_RETRY_AFTER",
+        "SURFSENSE_ENABLE_MODEL_FALLBACK",
+        "SURFSENSE_ENABLE_MODEL_CALL_LIMIT",
+        "SURFSENSE_ENABLE_TOOL_CALL_LIMIT",
+        "SURFSENSE_ENABLE_TOOL_CALL_REPAIR",
+        "SURFSENSE_ENABLE_DOOM_LOOP",
+        "SURFSENSE_ENABLE_PERMISSION",
+        "SURFSENSE_ENABLE_BUSY_MUTEX",
+        "SURFSENSE_ENABLE_LLM_TOOL_SELECTOR",
+        "SURFSENSE_ENABLE_SKILLS",
+        "SURFSENSE_ENABLE_SPECIALIZED_SUBAGENTS",
+        "SURFSENSE_ENABLE_KB_PLANNER_RUNNABLE",
+        "SURFSENSE_ENABLE_ACTION_LOG",
+        "SURFSENSE_ENABLE_REVERT_ROUTE",
+        "SURFSENSE_ENABLE_PLUGIN_LOADER",
+        "SURFSENSE_ENABLE_OTEL",
+        "SURFSENSE_ENABLE_AGENT_CACHE",
+        "SURFSENSE_ENABLE_AGENT_CACHE_SHARE_GP_SUBAGENT",
+    ]:
+        monkeypatch.delenv(name, raising=False)
+
+
+def test_defaults_match_shipped_agent_stack(monkeypatch: pytest.MonkeyPatch) -> None:
+    _clear_all(monkeypatch)
+    flags = reload_for_tests()
+    assert isinstance(flags, AgentFeatureFlags)
+    assert flags.disable_new_agent_stack is False
+    assert flags.enable_context_editing is True
+    assert flags.enable_compaction_v2 is True
+    assert flags.enable_retry_after is True
+    assert flags.enable_model_fallback is False
+    assert flags.enable_model_call_limit is True
+    assert flags.enable_tool_call_limit is True
+    assert flags.enable_tool_call_repair is True
+    assert flags.enable_doom_loop is True
+    assert flags.enable_permission is True
+    assert flags.enable_busy_mutex is True
+    assert flags.enable_llm_tool_selector is False
+    assert flags.enable_skills is True
+    assert flags.enable_specialized_subagents is True
+    assert flags.enable_kb_planner_runnable is True
+    assert flags.enable_action_log is True
+    assert flags.enable_revert_route is True
+    assert flags.enable_plugin_loader is False
+    assert flags.enable_otel is False
+    # Phase 2: agent cache is now default-on (the prerequisite tool
+    # ``db_session`` refactor landed). The companion gp-subagent share
+    # flag stays default-off pending data on cold-miss frequency.
+    assert flags.enable_agent_cache is True
+    assert flags.enable_agent_cache_share_gp_subagent is False
+    assert flags.any_new_middleware_enabled() is True
+
+
+def test_master_kill_switch_overrides_individual_flags(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    _clear_all(monkeypatch)
+    monkeypatch.setenv("SURFSENSE_DISABLE_NEW_AGENT_STACK", "true")
+    monkeypatch.setenv("SURFSENSE_ENABLE_CONTEXT_EDITING", "true")
+    monkeypatch.setenv("SURFSENSE_ENABLE_PERMISSION", "true")
+
+    flags = reload_for_tests()
+    assert flags.disable_new_agent_stack is True
+    assert flags.enable_context_editing is False
+    assert flags.enable_permission is False
+    assert flags.any_new_middleware_enabled() is False
+
+
+@pytest.mark.parametrize("truthy", ["1", "true", "TRUE", "yes", "on"])
+def test_individual_flags_truthy_values(
+    monkeypatch: pytest.MonkeyPatch, truthy: str
+) -> None:
+    _clear_all(monkeypatch)
+    monkeypatch.setenv("SURFSENSE_ENABLE_RETRY_AFTER", truthy)
+    flags = reload_for_tests()
+    assert flags.enable_retry_after is True
+    assert flags.any_new_middleware_enabled() is True
+
+
+@pytest.mark.parametrize("falsy", ["0", "false", "no", "off", "", "garbage"])
+def test_individual_flags_falsy_values(
+    monkeypatch: pytest.MonkeyPatch, falsy: str
+) -> None:
+    _clear_all(monkeypatch)
+    monkeypatch.setenv("SURFSENSE_ENABLE_RETRY_AFTER", falsy)
+    flags = reload_for_tests()
+    assert flags.enable_retry_after is False
+
+
+def test_each_flag_can_be_set_independently(monkeypatch: pytest.MonkeyPatch) -> None:
+    _clear_all(monkeypatch)
+    flag_to_env = {
+        "enable_context_editing": "SURFSENSE_ENABLE_CONTEXT_EDITING",
+        "enable_compaction_v2": "SURFSENSE_ENABLE_COMPACTION_V2",
+        "enable_retry_after": "SURFSENSE_ENABLE_RETRY_AFTER",
+        "enable_model_fallback": "SURFSENSE_ENABLE_MODEL_FALLBACK",
+        "enable_model_call_limit": "SURFSENSE_ENABLE_MODEL_CALL_LIMIT",
+        "enable_tool_call_limit": "SURFSENSE_ENABLE_TOOL_CALL_LIMIT",
+        "enable_tool_call_repair": "SURFSENSE_ENABLE_TOOL_CALL_REPAIR",
+        "enable_doom_loop": "SURFSENSE_ENABLE_DOOM_LOOP",
+        "enable_permission": "SURFSENSE_ENABLE_PERMISSION",
+        "enable_busy_mutex": "SURFSENSE_ENABLE_BUSY_MUTEX",
+        "enable_llm_tool_selector": "SURFSENSE_ENABLE_LLM_TOOL_SELECTOR",
+        "enable_skills": "SURFSENSE_ENABLE_SKILLS",
+        "enable_specialized_subagents": "SURFSENSE_ENABLE_SPECIALIZED_SUBAGENTS",
+        "enable_kb_planner_runnable": "SURFSENSE_ENABLE_KB_PLANNER_RUNNABLE",
+        "enable_action_log": "SURFSENSE_ENABLE_ACTION_LOG",
+        "enable_revert_route": "SURFSENSE_ENABLE_REVERT_ROUTE",
+        "enable_plugin_loader": "SURFSENSE_ENABLE_PLUGIN_LOADER",
+        "enable_otel": "SURFSENSE_ENABLE_OTEL",
+    }
+
+    for attr, env_name in flag_to_env.items():
+        _clear_all(monkeypatch)
+        monkeypatch.setenv(env_name, "false")
+        flags = reload_for_tests()
+        assert getattr(flags, attr) is False, f"{attr} did not flip off for {env_name}"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_flatten_system.py b/surfsense_backend/tests/unit/agents/new_chat/test_flatten_system.py
new file mode 100644
index 0000000000..6c323d9200
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_flatten_system.py
@@ -0,0 +1,344 @@
+"""Tests for ``FlattenSystemMessageMiddleware``.
+
+The middleware exists to defend against Anthropic's "Found 5 cache_control
+blocks" 400 when our deepagent middleware stack stacks 5+ text blocks on
+the system message and the OpenRouter→Anthropic adapter redistributes
+``cache_control`` across all of them. The flattening collapses every
+all-text system content list to a single string before the LLM call.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from app.agents.new_chat.middleware.flatten_system import (
+    FlattenSystemMessageMiddleware,
+    _flatten_text_blocks,
+    _flattened_request,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# _flatten_text_blocks — pure helper, the heart of the middleware.
+# ---------------------------------------------------------------------------
+
+
+class TestFlattenTextBlocks:
+    def test_joins_text_blocks_with_double_newline(self) -> None:
+        blocks = [
+            {"type": "text", "text": "<surfsense base>"},
+            {"type": "text", "text": "<filesystem section>"},
+            {"type": "text", "text": "<skills section>"},
+        ]
+        assert (
+            _flatten_text_blocks(blocks)
+            == "<surfsense base>\n\n<filesystem section>\n\n<skills section>"
+        )
+
+    def test_handles_single_text_block(self) -> None:
+        blocks = [{"type": "text", "text": "only one"}]
+        assert _flatten_text_blocks(blocks) == "only one"
+
+    def test_handles_empty_list(self) -> None:
+        assert _flatten_text_blocks([]) == ""
+
+    def test_passes_through_bare_string_blocks(self) -> None:
+        # LangChain content can mix bare strings and dict blocks.
+        blocks = ["raw string", {"type": "text", "text": "dict block"}]
+        assert _flatten_text_blocks(blocks) == "raw string\n\ndict block"
+
+    def test_returns_none_for_image_block(self) -> None:
+        # System messages with images are rare — but we never want to
+        # silently lose the image payload by joining as text.
+        blocks = [
+            {"type": "text", "text": "look at this"},
+            {"type": "image_url", "image_url": {"url": "data:image/png..."}},
+        ]
+        assert _flatten_text_blocks(blocks) is None
+
+    def test_returns_none_for_non_dict_non_str_block(self) -> None:
+        blocks = [{"type": "text", "text": "hi"}, 42]  # type: ignore[list-item]
+        assert _flatten_text_blocks(blocks) is None
+
+    def test_returns_none_when_text_field_missing(self) -> None:
+        blocks = [{"type": "text"}]  # no ``text`` key
+        assert _flatten_text_blocks(blocks) is None
+
+    def test_returns_none_when_text_is_not_string(self) -> None:
+        blocks = [{"type": "text", "text": ["nested", "list"]}]
+        assert _flatten_text_blocks(blocks) is None
+
+    def test_drops_cache_control_from_inner_blocks(self) -> None:
+        # The whole point: existing cache_control on inner blocks is
+        # discarded so LiteLLM's ``cache_control_injection_points`` can
+        # re-attach exactly one breakpoint after flattening.
+        blocks = [
+            {"type": "text", "text": "first"},
+            {
+                "type": "text",
+                "text": "second",
+                "cache_control": {"type": "ephemeral"},
+            },
+        ]
+        flattened = _flatten_text_blocks(blocks)
+        assert flattened == "first\n\nsecond"
+        assert "cache_control" not in flattened  # type: ignore[operator]
+
+
+# ---------------------------------------------------------------------------
+# _flattened_request — decides when to override and when to no-op.
+# ---------------------------------------------------------------------------
+
+
+def _make_request(system_message: SystemMessage | None) -> Any:
+    """Build a minimal ModelRequest stub. We only need .system_message
+    and .override(system_message=...) — the middleware never touches
+    other fields.
+    """
+    request = MagicMock()
+    request.system_message = system_message
+
+    def override(**kwargs: Any) -> Any:
+        new_request = MagicMock()
+        new_request.system_message = kwargs.get(
+            "system_message", request.system_message
+        )
+        new_request.messages = kwargs.get("messages", getattr(request, "messages", []))
+        new_request.tools = kwargs.get("tools", getattr(request, "tools", []))
+        return new_request
+
+    request.override = override
+    return request
+
+
+class TestFlattenedRequest:
+    def test_collapses_multi_block_system_to_string(self) -> None:
+        sys = SystemMessage(
+            content=[
+                {"type": "text", "text": "<base>"},
+                {"type": "text", "text": "<todo>"},
+                {"type": "text", "text": "<filesystem>"},
+                {"type": "text", "text": "<skills>"},
+                {"type": "text", "text": "<subagents>"},
+            ]
+        )
+        request = _make_request(sys)
+        flattened = _flattened_request(request)
+
+        assert flattened is not None
+        assert isinstance(flattened.system_message, SystemMessage)
+        assert flattened.system_message.content == (
+            "<base>\n\n<todo>\n\n<filesystem>\n\n<skills>\n\n<subagents>"
+        )
+
+    def test_no_op_for_string_content(self) -> None:
+        sys = SystemMessage(content="already a string")
+        request = _make_request(sys)
+        assert _flattened_request(request) is None
+
+    def test_no_op_for_single_block_list(self) -> None:
+        # One block already produces one breakpoint — no need to flatten.
+        sys = SystemMessage(content=[{"type": "text", "text": "single"}])
+        request = _make_request(sys)
+        assert _flattened_request(request) is None
+
+    def test_no_op_when_system_message_missing(self) -> None:
+        request = _make_request(None)
+        assert _flattened_request(request) is None
+
+    def test_no_op_when_list_contains_non_text_block(self) -> None:
+        sys = SystemMessage(
+            content=[
+                {"type": "text", "text": "look"},
+                {"type": "image_url", "image_url": {"url": "data:..."}},
+            ]
+        )
+        request = _make_request(sys)
+        assert _flattened_request(request) is None
+
+    def test_preserves_additional_kwargs_and_metadata(self) -> None:
+        # Defensive: nothing in the current chain sets these on a system
+        # message, but losing them silently when something does in the
+        # future would be a regression. ``name`` in particular is the only
+        # ``additional_kwargs`` field that ChatLiteLLM's
+        # ``_convert_message_to_dict`` propagates onto the wire.
+        sys = SystemMessage(
+            content=[
+                {"type": "text", "text": "a"},
+                {"type": "text", "text": "b"},
+            ],
+            additional_kwargs={"name": "surfsense_system", "x": 1},
+            response_metadata={"tokens": 42},
+        )
+        sys.id = "sys-msg-1"
+        request = _make_request(sys)
+
+        flattened = _flattened_request(request)
+        assert flattened is not None
+        assert flattened.system_message.content == "a\n\nb"
+        assert flattened.system_message.additional_kwargs == {
+            "name": "surfsense_system",
+            "x": 1,
+        }
+        assert flattened.system_message.response_metadata == {"tokens": 42}
+        assert flattened.system_message.id == "sys-msg-1"
+
+    def test_idempotent_when_run_twice(self) -> None:
+        sys = SystemMessage(
+            content=[
+                {"type": "text", "text": "a"},
+                {"type": "text", "text": "b"},
+            ]
+        )
+        request = _make_request(sys)
+        first = _flattened_request(request)
+        assert first is not None
+
+        # Second pass on the already-flattened request should be a no-op.
+        # We re-wrap in a request stub since the helper inspects
+        # ``request.system_message.content``.
+        second_request = _make_request(first.system_message)
+        assert _flattened_request(second_request) is None
+
+
+# ---------------------------------------------------------------------------
+# Middleware integration — verify the handler sees a flattened request.
+# ---------------------------------------------------------------------------
+
+
+class TestMiddlewareWrap:
+    @pytest.mark.asyncio
+    async def test_async_passes_flattened_request_to_handler(self) -> None:
+        sys = SystemMessage(
+            content=[
+                {"type": "text", "text": "alpha"},
+                {"type": "text", "text": "beta"},
+            ]
+        )
+        request = _make_request(sys)
+        captured: dict[str, Any] = {}
+
+        async def handler(req: Any) -> str:
+            captured["request"] = req
+            return "ok"
+
+        mw = FlattenSystemMessageMiddleware()
+        result = await mw.awrap_model_call(request, handler)
+
+        assert result == "ok"
+        assert isinstance(captured["request"].system_message, SystemMessage)
+        assert captured["request"].system_message.content == "alpha\n\nbeta"
+
+    @pytest.mark.asyncio
+    async def test_async_passes_through_when_already_string(self) -> None:
+        sys = SystemMessage(content="just a string")
+        request = _make_request(sys)
+        captured: dict[str, Any] = {}
+
+        async def handler(req: Any) -> str:
+            captured["request"] = req
+            return "ok"
+
+        mw = FlattenSystemMessageMiddleware()
+        await mw.awrap_model_call(request, handler)
+
+        # Same request object: no override happened.
+        assert captured["request"] is request
+
+    def test_sync_passes_flattened_request_to_handler(self) -> None:
+        sys = SystemMessage(
+            content=[
+                {"type": "text", "text": "alpha"},
+                {"type": "text", "text": "beta"},
+            ]
+        )
+        request = _make_request(sys)
+        captured: dict[str, Any] = {}
+
+        def handler(req: Any) -> str:
+            captured["request"] = req
+            return "ok"
+
+        mw = FlattenSystemMessageMiddleware()
+        result = mw.wrap_model_call(request, handler)
+
+        assert result == "ok"
+        assert captured["request"].system_message.content == "alpha\n\nbeta"
+
+    def test_sync_passes_through_when_no_system_message(self) -> None:
+        request = _make_request(None)
+        captured: dict[str, Any] = {}
+
+        def handler(req: Any) -> str:
+            captured["request"] = req
+            return "ok"
+
+        mw = FlattenSystemMessageMiddleware()
+        mw.wrap_model_call(request, handler)
+        assert captured["request"] is request
+
+
+# ---------------------------------------------------------------------------
+# Regression guard — pin the worst-case shape that triggered the
+# "Found 5" 400 in production. Confirms we collapse 5 blocks to 1 so the
+# downstream cache_control_injection_points can only place 1 breakpoint
+# on the system message regardless of provider redistribution quirks.
+# ---------------------------------------------------------------------------
+
+
+def test_regression_five_block_system_collapses_to_one_block() -> None:
+    sys = SystemMessage(
+        content=[
+            {"type": "text", "text": "<surfsense base + BASE_AGENT_PROMPT>"},
+            {"type": "text", "text": "<TodoListMiddleware section>"},
+            {"type": "text", "text": "<SurfSenseFilesystemMiddleware section>"},
+            {"type": "text", "text": "<SkillsMiddleware section>"},
+            {"type": "text", "text": "<SubAgentMiddleware section>"},
+        ]
+    )
+    request = _make_request(sys)
+    flattened = _flattened_request(request)
+
+    assert flattened is not None
+    assert isinstance(flattened.system_message.content, str)
+    # The exact join doesn't matter for the cache_control accounting —
+    # only that there is exactly ONE content block when LiteLLM's
+    # AnthropicCacheControlHook later targets ``role: system``.
+    assert "<surfsense base" in flattened.system_message.content
+    assert "<SubAgentMiddleware" in flattened.system_message.content
+
+
+def test_regression_human_message_not_modified() -> None:
+    # Sanity: the middleware MUST NOT touch user messages — only the
+    # system message. Multi-block user content is the path that carries
+    # image attachments and would lose its image_url block on
+    # accidental flatten.
+    sys = SystemMessage(
+        content=[
+            {"type": "text", "text": "a"},
+            {"type": "text", "text": "b"},
+        ]
+    )
+    user = HumanMessage(
+        content=[
+            {"type": "text", "text": "look at this"},
+            {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
+        ]
+    )
+    request = _make_request(sys)
+    request.messages = [user]
+
+    flattened = _flattened_request(request)
+    assert flattened is not None
+    # System flattened to string …
+    assert isinstance(flattened.system_message.content, str)
+    # … user message is untouched (the helper does not even look at it).
+    assert flattened.messages == [user]
+    assert isinstance(user.content, list)
+    assert len(user.content) == 2
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_hitl_auto_approve.py b/surfsense_backend/tests/unit/agents/new_chat/test_hitl_auto_approve.py
new file mode 100644
index 0000000000..d0ea733767
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_hitl_auto_approve.py
@@ -0,0 +1,111 @@
+"""Tests for the default auto-approval list in ``hitl.request_approval``.
+
+These pin the policy that low-stakes connector creation tools (drafts,
+new-file creates) skip the HITL interrupt by default. Without this set,
+every "draft my newsletter" turn used to fire ~3 interrupts before any
+useful work happened.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.tools.hitl import (
+    DEFAULT_AUTO_APPROVED_TOOLS,
+    HITLResult,
+    request_approval,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class TestDefaultAutoApprovedToolsList:
+    def test_set_contains_expected_creation_tools(self) -> None:
+        # If anyone changes the policy list, we want a single test to
+        # update so the contract is explicit. Keep this in sync with
+        # ``hitl.DEFAULT_AUTO_APPROVED_TOOLS``.
+        expected = {
+            "create_gmail_draft",
+            "update_gmail_draft",
+            "create_calendar_event",
+            "create_notion_page",
+            "create_confluence_page",
+            "create_google_drive_file",
+            "create_dropbox_file",
+            "create_onedrive_file",
+        }
+        assert expected == DEFAULT_AUTO_APPROVED_TOOLS
+
+    def test_set_is_immutable(self) -> None:
+        # frozenset prevents accidental at-runtime mutation that would
+        # silently widen the auto-approval surface.
+        assert isinstance(DEFAULT_AUTO_APPROVED_TOOLS, frozenset)
+
+    def test_send_tools_are_not_auto_approved(self) -> None:
+        # External-broadcast / destructive tools must always prompt.
+        for tool_name in (
+            "send_gmail_email",
+            "send_discord_message",
+            "send_teams_message",
+            "delete_notion_page",
+            "delete_calendar_event",
+        ):
+            assert tool_name not in DEFAULT_AUTO_APPROVED_TOOLS, (
+                f"{tool_name} must remain HITL-gated"
+            )
+
+
+class TestRequestApprovalAutoBypass:
+    def test_auto_approved_tool_skips_interrupt(self) -> None:
+        # No interrupt mock set up — if the function attempted to call
+        # ``langgraph.types.interrupt`` it would raise GraphInterrupt.
+        # The fact that we get a clean HITLResult proves the bypass.
+        result = request_approval(
+            action_type="gmail_draft_creation",
+            tool_name="create_gmail_draft",
+            params={"to": "alice@example.com", "subject": "hi", "body": "hey"},
+        )
+        assert isinstance(result, HITLResult)
+        assert result.rejected is False
+        assert result.decision_type == "auto_approved"
+        # Original params are preserved untouched (no user edits possible).
+        assert result.params == {
+            "to": "alice@example.com",
+            "subject": "hi",
+            "body": "hey",
+        }
+
+    def test_non_listed_tool_still_attempts_interrupt(self) -> None:
+        # A tool NOT in the default list must reach ``langgraph.interrupt``.
+        # Outside a runnable context that call raises a RuntimeError —
+        # which is exactly the signal we want: the bypass did NOT fire.
+        with pytest.raises(RuntimeError, match="runnable context"):
+            request_approval(
+                action_type="gmail_email_send",
+                tool_name="send_gmail_email",
+                params={"to": "alice@example.com", "subject": "hi", "body": "hey"},
+            )
+
+    def test_user_trusted_tools_still_take_precedence(self) -> None:
+        # ``trusted_tools`` (per-connector "always allow" from MCP/UI)
+        # was checked BEFORE the default list and must keep working
+        # for tools outside the default list.
+        result = request_approval(
+            action_type="mcp_tool_call",
+            tool_name="my_custom_mcp_tool",
+            params={"x": 1},
+            trusted_tools=["my_custom_mcp_tool"],
+        )
+        assert result.decision_type == "trusted"
+        assert result.rejected is False
+
+    def test_auto_approved_overrides_no_trusted_tools(self) -> None:
+        # When trusted_tools is empty and tool is in the default list,
+        # we should still bypass — proves the order in request_approval.
+        result = request_approval(
+            action_type="notion_page_creation",
+            tool_name="create_notion_page",
+            params={"title": "Plan"},
+            trusted_tools=[],
+        )
+        assert result.decision_type == "auto_approved"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_memory_response_content.py b/surfsense_backend/tests/unit/agents/new_chat/test_memory_response_content.py
new file mode 100644
index 0000000000..1f338ee3e7
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_memory_response_content.py
@@ -0,0 +1,89 @@
+"""Unit tests for extracting text from LLM memory responses."""
+
+import pytest
+
+from app.agents.new_chat.tools.update_memory import _save_memory
+from app.utils.content_utils import extract_text_content
+
+pytestmark = pytest.mark.unit
+
+
+class _Recorder:
+    def __init__(self) -> None:
+        self.applied_content: str | None = None
+        self.commit_calls = 0
+        self.rollback_calls = 0
+
+    def apply(self, content: str) -> None:
+        self.applied_content = content
+
+    async def commit(self) -> None:
+        self.commit_calls += 1
+
+    async def rollback(self) -> None:
+        self.rollback_calls += 1
+
+
+def test_extract_text_content_keeps_no_update_bare_string_from_content_blocks() -> None:
+    content = [
+        {"type": "thinking", "thinking": "No"},
+        {"type": "thinking", "thinking": " memorizable info."},
+        "NO_UPDATE",
+    ]
+
+    assert extract_text_content(content).strip() == "NO_UPDATE"
+
+
+def test_extract_text_content_ignores_thinking_blocks_and_keeps_markdown_text() -> None:
+    markdown = (
+        "## Work Context\n"
+        "- (2026-05-02) [fact] Anish is hardening SurfSense memory extraction.\n"
+    )
+    content = [
+        {"type": "thinking", "thinking": "This is durable context."},
+        {"type": "text", "text": markdown},
+    ]
+
+    assert extract_text_content(content).strip() == markdown.strip()
+
+
+def test_extract_text_content_returns_empty_when_only_thinking_blocks_are_present() -> (
+    None
+):
+    content = [
+        {"type": "thinking", "thinking": "No durable fact."},
+        {"type": "thinking", "thinking": "Return no update."},
+    ]
+
+    assert extract_text_content(content) == ""
+
+
+def test_extract_text_content_preserves_plain_string_responses() -> None:
+    markdown = (
+        "## Preferences\n"
+        "- (2026-05-02) [pref] Anish prefers no regex for memory validation.\n"
+    )
+
+    assert extract_text_content(markdown) == markdown
+
+
+@pytest.mark.asyncio
+async def test_save_memory_rejects_non_string_payload_before_commit() -> None:
+    recorder = _Recorder()
+
+    result = await _save_memory(
+        updated_memory=["NO_UPDATE"],  # type: ignore[arg-type]
+        old_memory=None,
+        llm=None,
+        apply_fn=recorder.apply,
+        commit_fn=recorder.commit,
+        rollback_fn=recorder.rollback,
+        label="memory",
+        scope="user",
+    )
+
+    assert result["status"] == "error"
+    assert "must be a string" in result["message"]
+    assert recorder.applied_content is None
+    assert recorder.commit_calls == 0
+    assert recorder.rollback_calls == 0
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_mention_resolver.py b/surfsense_backend/tests/unit/agents/new_chat/test_mention_resolver.py
new file mode 100644
index 0000000000..1f8d35841b
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_mention_resolver.py
@@ -0,0 +1,285 @@
+"""Tests for the @-mention resolver.
+
+These tests pin down the contract that ``mention_resolver`` is the
+single seam between ``MentionedDocumentInfo`` chips (frontend) and the
+canonical ``/documents/...`` virtual paths (agent). The streaming task,
+priority middleware, and persistence layer all consume the resolver's
+output — keeping the tests focused on substitute-in-text + the
+returned id partition keeps the seam stable across refactors.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.agents.new_chat import mention_resolver
+from app.agents.new_chat.mention_resolver import (
+    ResolvedMention,
+    ResolvedMentionSet,
+    resolve_mentions,
+    substitute_in_text,
+)
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT, PathIndex
+from app.schemas.new_chat import MentionedDocumentInfo
+
+pytestmark = pytest.mark.unit
+
+
+class TestSubstituteInText:
+    """``substitute_in_text`` is a pure string transform and is exercised
+    on every cloud-mode turn, so it has to be both fast and behaviour-
+    identical to the frontend's ``parseMentionSegments`` (longest-token
+    first, single forward pass)."""
+
+    def test_returns_text_unchanged_when_no_tokens(self):
+        assert substitute_in_text("hello @foo", []) == "hello @foo"
+
+    def test_returns_text_unchanged_when_empty(self):
+        assert substitute_in_text("", [("@x", "/documents/x.xml")]) == ""
+
+    def test_replaces_single_token_with_backtick_path(self):
+        out = substitute_in_text(
+            "see @notes please",
+            [("@notes", "/documents/notes.xml")],
+        )
+        assert out == "see `/documents/notes.xml` please"
+
+    def test_longest_token_wins_over_prefix(self):
+        # ``@Project Roadmap`` must NOT be partially matched by ``@Project``.
+        # Mirrors the FE's parseMentionSegments contract.
+        token_to_path = [
+            ("@Project Roadmap", "/documents/Roadmap.xml"),
+            ("@Project", "/documents/Project.xml"),
+        ]
+        out = substitute_in_text("about @Project Roadmap today", token_to_path)
+        assert out == "about `/documents/Roadmap.xml` today"
+
+    def test_handles_repeated_mentions(self):
+        out = substitute_in_text(
+            "@A and @A again @B",
+            [
+                ("@A", "/documents/a.xml"),
+                ("@B", "/documents/b.xml"),
+            ],
+        )
+        assert (
+            out == "`/documents/a.xml` and `/documents/a.xml` again `/documents/b.xml`"
+        )
+
+    def test_does_not_match_inside_word(self):
+        # Substitution is positional — there's no word-boundary semantics.
+        # ``@Pro`` inside ``foo@Project`` still matches; this is the same
+        # behaviour as parseMentionSegments. The test pins it so a
+        # future "fix" doesn't accidentally diverge between FE/BE.
+        out = substitute_in_text("foo@Pro", [("@Pro", "/documents/p.xml")])
+        assert out == "foo`/documents/p.xml`"
+
+    def test_idempotent_after_substitution(self):
+        # The output starts with a backtick, not ``@``, so re-running
+        # the substitution leaves it alone.
+        once = substitute_in_text("@A", [("@A", "/documents/a.xml")])
+        twice = substitute_in_text(once, [("@A", "/documents/a.xml")])
+        assert once == twice
+
+
+class TestResolveMentions:
+    """``resolve_mentions`` resolves chip ids → virtual paths and emits
+    a ``ResolvedMentionSet`` whose id partitions feed
+    ``KnowledgePriorityMiddleware``."""
+
+    @pytest.mark.asyncio
+    async def test_returns_empty_when_no_mentions(self):
+        session = MagicMock()
+        session.execute = AsyncMock()
+        result = await resolve_mentions(
+            session,
+            search_space_id=1,
+            mentioned_documents=None,
+        )
+        assert isinstance(result, ResolvedMentionSet)
+        assert result.mentions == []
+        assert result.token_to_path == []
+        assert result.mentioned_document_ids == []
+        assert result.mentioned_folder_ids == []
+        # No DB roundtrips when there's nothing to resolve.
+        session.execute.assert_not_awaited()
+
+    @pytest.mark.asyncio
+    async def test_resolves_doc_chip_to_virtual_path(self, monkeypatch):
+        chip = MentionedDocumentInfo(
+            id=42,
+            title="Notes",
+            document_type="EXTENSION",
+            kind="doc",
+        )
+        doc_row = SimpleNamespace(id=42, title="Notes", folder_id=None)
+
+        async def fake_build_index(_session, _ssid):
+            return PathIndex()
+
+        monkeypatch.setattr(mention_resolver, "build_path_index", fake_build_index)
+
+        scalars = MagicMock()
+        scalars.all.return_value = [doc_row]
+        result = MagicMock()
+        result.scalars.return_value = scalars
+        session = MagicMock()
+        session.execute = AsyncMock(return_value=result)
+
+        out = await resolve_mentions(
+            session,
+            search_space_id=5,
+            mentioned_documents=[chip],
+        )
+        assert len(out.mentions) == 1
+        mention = out.mentions[0]
+        assert mention.kind == "doc"
+        assert mention.id == 42
+        assert mention.virtual_path == f"{DOCUMENTS_ROOT}/Notes.xml"
+        assert out.mentioned_document_ids == [42]
+        assert out.mentioned_folder_ids == []
+        assert ("@Notes", f"{DOCUMENTS_ROOT}/Notes.xml") in out.token_to_path
+
+    @pytest.mark.asyncio
+    async def test_resolves_folder_chip_with_trailing_slash(self, monkeypatch):
+        chip = MentionedDocumentInfo(
+            id=9,
+            title="Reports",
+            document_type="FOLDER",
+            kind="folder",
+        )
+        folder_row = SimpleNamespace(id=9, name="Reports")
+
+        async def fake_build_index(_session, _ssid):
+            return PathIndex(folder_paths={9: f"{DOCUMENTS_ROOT}/Reports"})
+
+        monkeypatch.setattr(mention_resolver, "build_path_index", fake_build_index)
+
+        scalars = MagicMock()
+        scalars.all.return_value = [folder_row]
+        result = MagicMock()
+        result.scalars.return_value = scalars
+        session = MagicMock()
+        session.execute = AsyncMock(return_value=result)
+
+        out = await resolve_mentions(
+            session,
+            search_space_id=3,
+            mentioned_documents=[chip],
+        )
+        assert len(out.mentions) == 1
+        mention = out.mentions[0]
+        assert mention.kind == "folder"
+        assert mention.id == 9
+        assert mention.virtual_path == f"{DOCUMENTS_ROOT}/Reports/"
+        assert out.mentioned_document_ids == []
+        assert out.mentioned_folder_ids == [9]
+
+    @pytest.mark.asyncio
+    async def test_drops_chip_when_doc_is_missing(self, monkeypatch):
+        chip = MentionedDocumentInfo(
+            id=99, title="ghost", document_type="EXTENSION", kind="doc"
+        )
+
+        async def fake_build_index(_session, _ssid):
+            return PathIndex()
+
+        monkeypatch.setattr(mention_resolver, "build_path_index", fake_build_index)
+
+        scalars = MagicMock()
+        scalars.all.return_value = []
+        result = MagicMock()
+        result.scalars.return_value = scalars
+        session = MagicMock()
+        session.execute = AsyncMock(return_value=result)
+
+        out = await resolve_mentions(
+            session,
+            search_space_id=1,
+            mentioned_documents=[chip],
+        )
+        assert out.mentions == []
+        assert out.mentioned_document_ids == []
+        assert out.token_to_path == []
+
+    @pytest.mark.asyncio
+    async def test_token_to_path_is_longest_first(self, monkeypatch):
+        # Two chips whose titles are prefixes of each other — the
+        # resolver MUST sort longest-first so substitution doesn't
+        # break the ``@Project Roadmap`` vs ``@Project`` invariant.
+        chip_short = MentionedDocumentInfo(
+            id=1, title="A", document_type="EXTENSION", kind="doc"
+        )
+        chip_long = MentionedDocumentInfo(
+            id=2, title="A long one", document_type="EXTENSION", kind="doc"
+        )
+        rows = [
+            SimpleNamespace(id=1, title="A", folder_id=None),
+            SimpleNamespace(id=2, title="A long one", folder_id=None),
+        ]
+
+        async def fake_build_index(_session, _ssid):
+            return PathIndex()
+
+        monkeypatch.setattr(mention_resolver, "build_path_index", fake_build_index)
+
+        scalars = MagicMock()
+        scalars.all.return_value = rows
+        result = MagicMock()
+        result.scalars.return_value = scalars
+        session = MagicMock()
+        session.execute = AsyncMock(return_value=result)
+
+        out = await resolve_mentions(
+            session,
+            search_space_id=1,
+            mentioned_documents=[chip_short, chip_long],
+        )
+        tokens = [tok for tok, _ in out.token_to_path]
+        assert tokens == sorted(tokens, key=len, reverse=True)
+
+    @pytest.mark.asyncio
+    async def test_legacy_id_arrays_resolve_without_chip_metadata(self, monkeypatch):
+        # ``mentioned_document_ids`` (the legacy parallel array) must
+        # still resolve when no chip metadata is available — covers
+        # callers that haven't migrated to the discriminated chip list.
+        doc_row = SimpleNamespace(id=7, title="Legacy", folder_id=None)
+
+        async def fake_build_index(_session, _ssid):
+            return PathIndex()
+
+        monkeypatch.setattr(mention_resolver, "build_path_index", fake_build_index)
+
+        scalars = MagicMock()
+        scalars.all.return_value = [doc_row]
+        result = MagicMock()
+        result.scalars.return_value = scalars
+        session = MagicMock()
+        session.execute = AsyncMock(return_value=result)
+
+        out = await resolve_mentions(
+            session,
+            search_space_id=2,
+            mentioned_documents=None,
+            mentioned_document_ids=[7],
+        )
+        assert out.mentioned_document_ids == [7]
+        assert len(out.mentions) == 1
+        assert out.mentions[0].title == "Legacy"
+
+
+class TestResolvedMentionEquality:
+    """Smoke check on the dataclass behaviour we rely on for asserting
+    test outputs."""
+
+    def test_equal_when_fields_equal(self):
+        a = ResolvedMention(
+            kind="doc", id=1, title="x", virtual_path="/documents/x.xml"
+        )
+        b = ResolvedMention(
+            kind="doc", id=1, title="x", virtual_path="/documents/x.xml"
+        )
+        assert a == b
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_noop_injection.py b/surfsense_backend/tests/unit/agents/new_chat/test_noop_injection.py
new file mode 100644
index 0000000000..346271f4bd
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_noop_injection.py
@@ -0,0 +1,123 @@
+"""Tests for NoopInjectionMiddleware provider-compat logic."""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage
+
+from app.agents.new_chat.middleware.noop_injection import (
+    NOOP_TOOL_NAME,
+    NoopInjectionMiddleware,
+    _last_ai_has_tool_calls,
+    _provider_needs_noop,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class _LiteLLMModel:
+    def _get_ls_params(self):
+        return {"ls_provider": "litellm"}
+
+
+class _BedrockModel:
+    def _get_ls_params(self):
+        return {"ls_provider": "bedrock"}
+
+
+class _OpenAIModel:
+    def _get_ls_params(self):
+        return {"ls_provider": "openai"}
+
+
+class _ChatLiteLLM:  # name-only fallback
+    pass
+
+
+class TestProviderDetection:
+    def test_litellm(self) -> None:
+        assert _provider_needs_noop(_LiteLLMModel()) is True
+
+    def test_bedrock(self) -> None:
+        assert _provider_needs_noop(_BedrockModel()) is True
+
+    def test_openai_does_not_need(self) -> None:
+        assert _provider_needs_noop(_OpenAIModel()) is False
+
+    def test_class_name_fallback(self) -> None:
+        assert _provider_needs_noop(_ChatLiteLLM()) is True
+
+
+class TestHistoryDetection:
+    def test_last_ai_has_tool_calls(self) -> None:
+        msgs = [
+            HumanMessage(content="hi"),
+            AIMessage(content="", tool_calls=[{"name": "x", "args": {}, "id": "1"}]),
+        ]
+        assert _last_ai_has_tool_calls(msgs) is True
+
+    def test_last_ai_no_tool_calls(self) -> None:
+        msgs = [
+            HumanMessage(content="hi"),
+            AIMessage(content="hello"),
+        ]
+        assert _last_ai_has_tool_calls(msgs) is False
+
+    def test_no_ai_in_history(self) -> None:
+        assert _last_ai_has_tool_calls([HumanMessage(content="hi")]) is False
+
+
+class _FakeRequest:
+    def __init__(self, *, tools, messages, model) -> None:
+        self.tools = tools
+        self.messages = messages
+        self.model = model
+
+    def override(self, *, tools):
+        return _FakeRequest(tools=tools, messages=self.messages, model=self.model)
+
+
+class TestShouldInject:
+    def test_injects_when_all_conditions_met(self) -> None:
+        mw = NoopInjectionMiddleware()
+        msgs = [
+            HumanMessage(content="hi"),
+            AIMessage(content="", tool_calls=[{"name": "x", "args": {}, "id": "1"}]),
+        ]
+        req = _FakeRequest(tools=[], messages=msgs, model=_LiteLLMModel())
+        assert mw._should_inject(req) is True
+
+    def test_skips_when_tools_present(self) -> None:
+        mw = NoopInjectionMiddleware()
+        req = _FakeRequest(
+            tools=[object()],
+            messages=[
+                AIMessage(content="", tool_calls=[{"name": "x", "args": {}, "id": "1"}])
+            ],
+            model=_LiteLLMModel(),
+        )
+        assert mw._should_inject(req) is False
+
+    def test_skips_when_no_history_tool_calls(self) -> None:
+        mw = NoopInjectionMiddleware()
+        req = _FakeRequest(
+            tools=[],
+            messages=[HumanMessage(content="hi")],
+            model=_LiteLLMModel(),
+        )
+        assert mw._should_inject(req) is False
+
+    def test_skips_for_openai(self) -> None:
+        mw = NoopInjectionMiddleware()
+        req = _FakeRequest(
+            tools=[],
+            messages=[
+                AIMessage(content="", tool_calls=[{"name": "x", "args": {}, "id": "1"}])
+            ],
+            model=_OpenAIModel(),
+        )
+        assert mw._should_inject(req) is False
+
+
+def test_noop_tool_name_is_underscore_noop() -> None:
+    assert NOOP_TOOL_NAME == "_noop"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py b/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py
new file mode 100644
index 0000000000..55434c04d9
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py
@@ -0,0 +1,195 @@
+"""Tests for the OtelSpanMiddleware adapter."""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+from langchain_core.messages import AIMessage, ToolMessage
+
+from app.agents.new_chat.middleware.otel_span import (
+    OtelSpanMiddleware,
+    _annotate_model_response,
+    _annotate_tool_result,
+    _resolve_input_size,
+    _resolve_model_attrs,
+    _resolve_tool_name,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture(autouse=True)
+def _disable_otel(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
+    monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
+    from app.observability import otel as ot
+
+    ot.reload_for_tests()
+    yield
+    ot.reload_for_tests()
+
+
+class TestResolveModelAttrs:
+    def test_extracts_model_name_and_provider(self) -> None:
+        request = MagicMock()
+        request.model = MagicMock(spec=["model_name", "provider"])
+        request.model.model_name = "gpt-4o-mini"
+        request.model.provider = "openai"
+        assert _resolve_model_attrs(request) == ("gpt-4o-mini", "openai")
+
+    def test_handles_missing_model(self) -> None:
+        request = MagicMock()
+        request.model = None
+        assert _resolve_model_attrs(request) == (None, None)
+
+    def test_falls_back_through_attribute_chain(self) -> None:
+        request = MagicMock()
+        request.model = MagicMock(spec=["model_id", "_llm_type"])
+        request.model.model_id = "claude-3-5-sonnet"
+        request.model._llm_type = "anthropic-chat"
+        model_id, provider = _resolve_model_attrs(request)
+        assert model_id == "claude-3-5-sonnet"
+        assert provider == "anthropic-chat"
+
+
+class TestResolveToolName:
+    def test_prefers_request_tool_name(self) -> None:
+        request = MagicMock()
+        request.tool = MagicMock(name="ToolStub")
+        request.tool.name = "scrape_webpage"
+        assert _resolve_tool_name(request) == "scrape_webpage"
+
+    def test_falls_back_to_tool_call_name(self) -> None:
+        request = MagicMock()
+        request.tool = None
+        request.tool_call = {"name": "web_search", "args": {}}
+        assert _resolve_tool_name(request) == "web_search"
+
+    def test_unknown_when_nothing_resolves(self) -> None:
+        request = MagicMock()
+        request.tool = None
+        request.tool_call = {}
+        assert _resolve_tool_name(request) == "unknown"
+
+
+class TestResolveInputSize:
+    def test_returns_repr_length_of_args(self) -> None:
+        request = MagicMock()
+        request.tool_call = {"args": {"query": "hello world"}}
+        size = _resolve_input_size(request)
+        assert isinstance(size, int)
+        assert size > 0
+
+    def test_handles_no_tool_call(self) -> None:
+        request = MagicMock()
+        request.tool_call = None
+        assert _resolve_input_size(request) is None
+
+
+class TestAnnotateModelResponse:
+    def test_attaches_token_counts_when_present(self) -> None:
+        sp = MagicMock()
+        msg = AIMessage(
+            content="hello",
+            usage_metadata={
+                "input_tokens": 100,
+                "output_tokens": 50,
+                "total_tokens": 150,
+            },
+        )
+        _annotate_model_response(sp, msg)
+        sp.set_attribute.assert_any_call("tokens.prompt", 100)
+        sp.set_attribute.assert_any_call("tokens.completion", 50)
+        sp.set_attribute.assert_any_call("tokens.total", 150)
+
+    def test_handles_response_with_no_metadata(self) -> None:
+        sp = MagicMock()
+        msg = AIMessage(content="hello")
+        # Should not raise even when usage_metadata is missing
+        _annotate_model_response(sp, msg)
+
+
+class TestAnnotateToolResult:
+    def test_records_size_and_status(self) -> None:
+        sp = MagicMock()
+        result = ToolMessage(
+            content="result text",
+            tool_call_id="abc",
+            status="success",
+        )
+        _annotate_tool_result(sp, result)
+        sp.set_attribute.assert_any_call("tool.output.size", len("result text"))
+        sp.set_attribute.assert_any_call("tool.status", "success")
+
+    def test_marks_errors(self) -> None:
+        sp = MagicMock()
+        result = ToolMessage(
+            content="oops",
+            tool_call_id="abc",
+            additional_kwargs={"error": {"code": "x"}},
+        )
+        _annotate_tool_result(sp, result)
+        sp.set_attribute.assert_any_call("tool.error", True)
+
+
+@pytest.mark.asyncio
+class TestMiddlewareIntegration:
+    async def test_awrap_model_call_passes_through_when_disabled(self) -> None:
+        mw = OtelSpanMiddleware()
+        called: dict[str, Any] = {}
+
+        async def handler(req):
+            called["req"] = req
+            return AIMessage(content="ok")
+
+        request = MagicMock()
+        result = await mw.awrap_model_call(request, handler)
+        assert called["req"] is request
+        assert isinstance(result, AIMessage)
+        assert result.content == "ok"
+
+    async def test_awrap_tool_call_passes_through_when_disabled(self) -> None:
+        mw = OtelSpanMiddleware()
+
+        async def handler(req):
+            return ToolMessage(content="result", tool_call_id="abc")
+
+        request = MagicMock()
+        result = await mw.awrap_tool_call(request, handler)
+        assert isinstance(result, ToolMessage)
+        assert result.content == "result"
+
+    async def test_awrap_model_call_propagates_exceptions(self) -> None:
+        mw = OtelSpanMiddleware()
+
+        async def handler(req):
+            raise ValueError("boom")
+
+        with pytest.raises(ValueError):
+            await mw.awrap_model_call(MagicMock(), handler)
+
+    async def test_with_otel_enabled_does_not_alter_result(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
+        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
+        from app.observability import otel as ot
+
+        ot.reload_for_tests()
+        try:
+            mw = OtelSpanMiddleware()
+
+            async def handler(req):
+                return AIMessage(content="enabled")
+
+            request = MagicMock()
+            request.model = MagicMock()
+            request.model.model_name = "gpt-4o"
+            request.model.provider = "openai"
+            result = await mw.awrap_model_call(request, handler)
+            assert isinstance(result, AIMessage)
+            assert result.content == "enabled"
+        finally:
+            ot.reload_for_tests()
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_path_resolver.py b/surfsense_backend/tests/unit/agents/new_chat/test_path_resolver.py
new file mode 100644
index 0000000000..ac6f617679
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_path_resolver.py
@@ -0,0 +1,245 @@
+"""Tests for canonical virtual-path resolver helpers."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    PathIndex,
+    doc_to_virtual_path,
+    parse_doc_id_suffix,
+    parse_documents_path,
+    safe_filename,
+    safe_folder_segment,
+    virtual_path_to_doc,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class TestSafeFilename:
+    def test_appends_xml_extension(self):
+        assert safe_filename("notes").endswith(".xml")
+
+    def test_strips_invalid_chars(self):
+        assert "/" not in safe_filename("a/b\\c.xml")
+
+    def test_falls_back_when_empty(self):
+        assert safe_filename("").endswith(".xml")
+        assert safe_filename("///") == "untitled.xml" or safe_filename("///").endswith(
+            ".xml"
+        )
+
+
+class TestSafeFolderSegment:
+    def test_strips_path_separators(self):
+        assert "/" not in safe_folder_segment("a/b")
+
+    def test_falls_back(self):
+        assert safe_folder_segment("") == "folder"
+
+
+class TestParseDocIdSuffix:
+    def test_parses_suffix(self):
+        stem, doc_id = parse_doc_id_suffix("My Doc (42).xml")
+        assert stem == "My Doc"
+        assert doc_id == 42
+
+    def test_no_suffix_returns_none(self):
+        stem, doc_id = parse_doc_id_suffix("My Doc.xml")
+        assert stem == "My Doc"
+        assert doc_id is None
+
+    def test_no_xml_extension(self):
+        stem, doc_id = parse_doc_id_suffix("plain")
+        assert stem == "plain"
+        assert doc_id is None
+
+
+class TestDocToVirtualPath:
+    def test_root_when_no_folder(self):
+        index = PathIndex()
+        path = doc_to_virtual_path(doc_id=1, title="Hello", folder_id=None, index=index)
+        assert path == f"{DOCUMENTS_ROOT}/Hello.xml"
+        assert index.occupants[path] == 1
+
+    def test_collision_picks_doc_id_suffix(self):
+        index = PathIndex(occupants={f"{DOCUMENTS_ROOT}/Hello.xml": 7})
+        path = doc_to_virtual_path(doc_id=8, title="Hello", folder_id=None, index=index)
+        assert path == f"{DOCUMENTS_ROOT}/Hello (8).xml"
+        assert index.occupants[path] == 8
+
+    def test_uses_folder_path_when_known(self):
+        index = PathIndex(folder_paths={5: f"{DOCUMENTS_ROOT}/notes"})
+        path = doc_to_virtual_path(doc_id=2, title="A", folder_id=5, index=index)
+        assert path == f"{DOCUMENTS_ROOT}/notes/A.xml"
+
+
+class TestParseDocumentsPath:
+    def test_extracts_folder_parts_and_title(self):
+        parts, title = parse_documents_path(f"{DOCUMENTS_ROOT}/foo/bar/baz.xml")
+        assert parts == ["foo", "bar"]
+        assert title == "baz"
+
+    def test_strips_doc_id_suffix(self):
+        parts, title = parse_documents_path(f"{DOCUMENTS_ROOT}/foo/My Doc (12).xml")
+        assert parts == ["foo"]
+        assert title == "My Doc"
+
+    def test_non_documents_returns_empty(self):
+        assert parse_documents_path("/other/x.xml") == ([], "")
+
+
+def _result_from_scalars(rows: list):
+    """Build a fake SQLAlchemy ``Result`` whose ``.scalars().all()`` and
+    ``.scalars().first()`` yield ``rows``."""
+    scalars = MagicMock()
+    scalars.all.return_value = list(rows)
+    scalars.first.return_value = rows[0] if rows else None
+    result = MagicMock()
+    result.scalars.return_value = scalars
+    result.scalar_one_or_none.return_value = None
+    result.first.return_value = None
+    return result
+
+
+def _result_from_one(value):
+    result = MagicMock()
+    result.scalar_one_or_none.return_value = value
+    return result
+
+
+class TestVirtualPathToDoc:
+    """Lookup must round-trip through ``safe_filename``'s lossy encoding.
+
+    The workspace tree displays ``safe_filename(title)`` as the basename, so
+    when the agent passes that basename back to a tool (move/edit/read) the
+    resolver must find the original document even though characters like
+    ``:`` were replaced with ``_``.
+    """
+
+    @pytest.mark.asyncio
+    async def test_falls_back_to_safe_filename_match_when_title_lossy(self):
+        # A Google Calendar-style title that contains a colon — safe_filename
+        # rewrites the colon to ``_``, so the literal title-equality lookup
+        # would miss this row.
+        original_title = "Calendar: Happy birthday!"
+        encoded_basename = safe_filename(original_title)
+        assert encoded_basename == "Calendar_ Happy birthday!.xml"
+
+        target_doc = SimpleNamespace(id=42, title=original_title, folder_id=None)
+
+        session = MagicMock()
+        # Each ``await session.execute(...)`` returns a fresh canned result.
+        # Order matches the resolver's lookup steps:
+        #   1) unique_identifier_hash → no match
+        #   2) literal title match → no match (lossy encoding)
+        #   3) folder scan → returns the row whose title encodes to basename
+        session.execute = AsyncMock(
+            side_effect=[
+                _result_from_one(None),
+                _result_from_scalars([]),
+                _result_from_scalars([target_doc]),
+            ]
+        )
+
+        document = await virtual_path_to_doc(
+            session,
+            search_space_id=5,
+            virtual_path=f"{DOCUMENTS_ROOT}/{encoded_basename}",
+        )
+        assert document is target_doc
+
+    @pytest.mark.asyncio
+    async def test_returns_none_when_no_doc_matches_safe_filename(self):
+        session = MagicMock()
+        session.execute = AsyncMock(
+            side_effect=[
+                _result_from_one(None),
+                _result_from_scalars([]),
+                _result_from_scalars(
+                    [SimpleNamespace(id=1, title="Something else", folder_id=None)]
+                ),
+            ]
+        )
+
+        document = await virtual_path_to_doc(
+            session,
+            search_space_id=5,
+            virtual_path=f"{DOCUMENTS_ROOT}/Calendar_ Happy birthday!.xml",
+        )
+        assert document is None
+
+    @pytest.mark.asyncio
+    async def test_literal_title_match_short_circuits_fallback(self):
+        # When the literal title query hits, the folder-scan fallback must
+        # NOT run (saves a query and avoids picking the wrong doc when two
+        # rows share a lossy encoding).
+        target_doc = SimpleNamespace(id=7, title="Plain Note", folder_id=None)
+
+        session = MagicMock()
+        session.execute = AsyncMock(
+            side_effect=[
+                _result_from_one(None),
+                _result_from_scalars([target_doc]),
+            ]
+        )
+
+        document = await virtual_path_to_doc(
+            session,
+            search_space_id=5,
+            virtual_path=f"{DOCUMENTS_ROOT}/Plain Note.xml",
+        )
+        assert document is target_doc
+        assert session.execute.await_count == 2
+
+    @pytest.mark.asyncio
+    async def test_resolves_double_extension_for_uploaded_pdf(self):
+        # Regression: the agent renders every KB document under
+        # ``/documents/`` with a trailing ``.xml`` (via ``safe_filename``),
+        # so an uploaded PDF whose DB title is ``2025-W2.pdf`` shows up as
+        # ``/documents/2025-W2.pdf.xml`` in answers. Clicking that path
+        # must round-trip back to the row even though the title itself
+        # does NOT end in ``.xml``.
+        target_doc = SimpleNamespace(id=99, title="2025-W2.pdf", folder_id=None)
+
+        session = MagicMock()
+        session.execute = AsyncMock(
+            side_effect=[
+                _result_from_one(None),
+                _result_from_scalars([target_doc]),
+            ]
+        )
+
+        document = await virtual_path_to_doc(
+            session,
+            search_space_id=5,
+            virtual_path=f"{DOCUMENTS_ROOT}/2025-W2.pdf.xml",
+        )
+        assert document is target_doc
+
+    @pytest.mark.asyncio
+    async def test_resolves_path_without_xml_suffix(self):
+        # The user (or a hand-edited link) may pass the title-only form
+        # ``/documents/2025-W2.pdf``. The resolver must still find the row
+        # by literal title equality.
+        target_doc = SimpleNamespace(id=99, title="2025-W2.pdf", folder_id=None)
+
+        session = MagicMock()
+        session.execute = AsyncMock(
+            side_effect=[
+                _result_from_one(None),
+                _result_from_scalars([target_doc]),
+            ]
+        )
+
+        document = await virtual_path_to_doc(
+            session,
+            search_space_id=5,
+            virtual_path=f"{DOCUMENTS_ROOT}/2025-W2.pdf",
+        )
+        assert document is target_doc
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_permission_middleware.py b/surfsense_backend/tests/unit/agents/new_chat/test_permission_middleware.py
new file mode 100644
index 0000000000..68db11ba69
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_permission_middleware.py
@@ -0,0 +1,263 @@
+"""Tests for PermissionMiddleware end-to-end behavior."""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import AIMessage, ToolMessage
+
+from app.agents.new_chat.errors import CorrectedError, RejectedError
+from app.agents.new_chat.middleware.permission import (
+    PermissionMiddleware,
+    _normalize_permission_decision,
+)
+from app.agents.new_chat.permissions import Rule, Ruleset
+
+pytestmark = pytest.mark.unit
+
+
+class _FakeRuntime:
+    config: dict = {"configurable": {"thread_id": "test"}}
+
+
+def _msg(*tool_calls: dict) -> AIMessage:
+    return AIMessage(content="", tool_calls=list(tool_calls))
+
+
+class TestAllow:
+    def test_passthrough_when_allow(self) -> None:
+        rs = Ruleset(rules=[Rule("send_email", "*", "allow")])
+        mw = PermissionMiddleware(rulesets=[rs])
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        out = mw.after_model(state, _FakeRuntime())
+        assert out is None  # no change
+
+
+class TestDeny:
+    def test_replaces_with_deny_tool_message(self) -> None:
+        rs = Ruleset(rules=[Rule("send_email", "*", "deny")])
+        mw = PermissionMiddleware(rulesets=[rs])
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        out = mw.after_model(state, _FakeRuntime())
+        assert out is not None
+        msgs = out["messages"]
+        # Find the deny ToolMessage
+        deny_msgs = [m for m in msgs if isinstance(m, ToolMessage)]
+        assert len(deny_msgs) == 1
+        assert deny_msgs[0].status == "error"
+        assert "permission_denied" in str(deny_msgs[0].additional_kwargs)
+        # AIMessage's tool_calls should now be empty (denied call removed)
+        ai_msg = next(m for m in msgs if isinstance(m, AIMessage))
+        assert ai_msg.tool_calls == []
+
+    def test_mixed_allow_deny(self) -> None:
+        rs = Ruleset(
+            rules=[
+                Rule("send_email", "*", "deny"),
+                Rule("read", "*", "allow"),
+            ]
+        )
+        mw = PermissionMiddleware(rulesets=[rs])
+        state = {
+            "messages": [
+                _msg(
+                    {"name": "send_email", "args": {}, "id": "1"},
+                    {"name": "read", "args": {}, "id": "2"},
+                )
+            ]
+        }
+        out = mw.after_model(state, _FakeRuntime())
+        assert out is not None
+        ai_msg = next(m for m in out["messages"] if isinstance(m, AIMessage))
+        assert len(ai_msg.tool_calls) == 1
+        assert ai_msg.tool_calls[0]["name"] == "read"
+
+
+class TestAsk:
+    def test_reject_without_feedback_raises(self) -> None:
+        # Default: nothing matches -> ask
+        rs = Ruleset(rules=[])
+        mw = PermissionMiddleware(rulesets=[rs])
+
+        # Bypass real interrupt — patch the helper
+        mw._raise_interrupt = lambda **kw: {"decision_type": "reject"}  # type: ignore[assignment]
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        with pytest.raises(RejectedError):
+            mw.after_model(state, _FakeRuntime())
+
+    def test_reject_with_feedback_raises_corrected(self) -> None:
+        rs = Ruleset(rules=[])
+        mw = PermissionMiddleware(rulesets=[rs])
+        mw._raise_interrupt = lambda **kw: {  # type: ignore[assignment]
+            "decision_type": "reject",
+            "feedback": "use a different subject line",
+        }
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        with pytest.raises(CorrectedError) as excinfo:
+            mw.after_model(state, _FakeRuntime())
+        assert excinfo.value.feedback == "use a different subject line"
+
+    def test_once_proceeds_without_persisting(self) -> None:
+        mw = PermissionMiddleware(rulesets=[])
+        mw._raise_interrupt = lambda **kw: {"decision_type": "once"}  # type: ignore[assignment]
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        out = mw.after_model(state, _FakeRuntime())
+        # No state change because all calls kept
+        assert out is None
+        # No new rule persisted
+        assert mw._runtime_ruleset.rules == []
+
+    def test_approve_always_persists_runtime_rule(self) -> None:
+        mw = PermissionMiddleware(rulesets=[])
+        mw._raise_interrupt = lambda **kw: {"decision_type": "approve_always"}  # type: ignore[assignment]
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        out = mw.after_model(state, _FakeRuntime())
+        assert out is None  # call kept
+        # Runtime ruleset got the always-allow rule
+        new_rules = [r for r in mw._runtime_ruleset.rules if r.action == "allow"]
+        assert any(r.permission == "send_email" for r in new_rules)
+
+
+class TestNormalizeDecision:
+    """Resume shapes ``_normalize_permission_decision`` must accept."""
+
+    def test_legacy_decision_type_dict_passes_through(self) -> None:
+        decision = {"decision_type": "once"}
+        assert _normalize_permission_decision(decision) == {"decision_type": "once"}
+
+    def test_legacy_decision_type_with_feedback_passes_through(self) -> None:
+        decision = {"decision_type": "reject", "feedback": "no thanks"}
+        assert _normalize_permission_decision(decision) == decision
+
+    def test_plain_string_wrapped(self) -> None:
+        assert _normalize_permission_decision("once") == {"decision_type": "once"}
+        assert _normalize_permission_decision("reject") == {"decision_type": "reject"}
+
+    def test_lc_envelope_approve_maps_to_once(self) -> None:
+        decision = {"decisions": [{"type": "approve"}]}
+        assert _normalize_permission_decision(decision) == {"decision_type": "once"}
+
+    def test_lc_envelope_reject_maps_to_reject(self) -> None:
+        decision = {"decisions": [{"type": "reject"}]}
+        assert _normalize_permission_decision(decision) == {"decision_type": "reject"}
+
+    def test_lc_envelope_reject_with_message_carries_feedback(self) -> None:
+        decision = {"decisions": [{"type": "reject", "message": "wrong recipient"}]}
+        out = _normalize_permission_decision(decision)
+        assert out == {"decision_type": "reject", "feedback": "wrong recipient"}
+
+    def test_lc_envelope_reject_with_feedback_field(self) -> None:
+        decision = {
+            "decisions": [{"type": "reject", "feedback": "tighten the subject"}]
+        }
+        out = _normalize_permission_decision(decision)
+        assert out == {"decision_type": "reject", "feedback": "tighten the subject"}
+
+    def test_lc_envelope_edit_maps_to_once(self) -> None:
+        # Pins the contract: edited args are NOT merged by permission.
+        decision = {
+            "decisions": [
+                {
+                    "type": "edit",
+                    "edited_action": {
+                        "name": "send_email",
+                        "args": {"subject": "edited"},
+                    },
+                }
+            ]
+        }
+        assert _normalize_permission_decision(decision) == {"decision_type": "once"}
+
+    def test_lc_single_decision_without_envelope(self) -> None:
+        assert _normalize_permission_decision({"type": "approve"}) == {
+            "decision_type": "once"
+        }
+
+    def test_unknown_type_falls_back_to_reject(self) -> None:
+        decision = {"decisions": [{"type": "totally_unknown"}]}
+        assert _normalize_permission_decision(decision) == {"decision_type": "reject"}
+
+    def test_missing_type_falls_back_to_reject(self) -> None:
+        assert _normalize_permission_decision({"decisions": [{}]}) == {
+            "decision_type": "reject"
+        }
+
+    def test_non_dict_non_string_falls_back_to_reject(self) -> None:
+        assert _normalize_permission_decision(None) == {"decision_type": "reject"}
+        assert _normalize_permission_decision(42) == {"decision_type": "reject"}
+
+    def test_empty_decisions_list_falls_back_to_reject(self) -> None:
+        # Fail-closed on a malformed reply rather than treat it as approve.
+        assert _normalize_permission_decision({"decisions": []}) == {
+            "decision_type": "reject"
+        }
+
+
+class TestResumeShapesEndToEnd:
+    """LangChain HITL envelope reaches ``_process`` correctly via ``_raise_interrupt``."""
+
+    def test_lc_approve_envelope_keeps_call(self) -> None:
+        mw = PermissionMiddleware(rulesets=[])
+        mw._raise_interrupt = lambda **kw: {  # type: ignore[assignment]
+            "decisions": [{"type": "approve"}]
+        }
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        original = mw._raise_interrupt
+        mw._raise_interrupt = lambda **kw: _normalize_permission_decision(  # type: ignore[assignment]
+            original(**kw)
+        )
+        out = mw.after_model(state, _FakeRuntime())
+        assert out is None
+
+    def test_lc_reject_envelope_raises(self) -> None:
+        mw = PermissionMiddleware(rulesets=[])
+        original = lambda **kw: {"decisions": [{"type": "reject"}]}  # noqa: E731
+        mw._raise_interrupt = lambda **kw: _normalize_permission_decision(  # type: ignore[assignment]
+            original(**kw)
+        )
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        with pytest.raises(RejectedError):
+            mw.after_model(state, _FakeRuntime())
+
+    def test_lc_reject_with_message_raises_corrected(self) -> None:
+        mw = PermissionMiddleware(rulesets=[])
+        original = lambda **kw: {  # noqa: E731
+            "decisions": [{"type": "reject", "message": "wrong recipient"}]
+        }
+        mw._raise_interrupt = lambda **kw: _normalize_permission_decision(  # type: ignore[assignment]
+            original(**kw)
+        )
+        state = {"messages": [_msg({"name": "send_email", "args": {}, "id": "1"})]}
+        with pytest.raises(CorrectedError) as excinfo:
+            mw.after_model(state, _FakeRuntime())
+        assert excinfo.value.feedback == "wrong recipient"
+
+    def test_lc_edit_envelope_keeps_call_with_original_args(self) -> None:
+        # Pins the "edit -> once, args unchanged" contract.
+        mw = PermissionMiddleware(rulesets=[])
+        original = lambda **kw: {  # noqa: E731
+            "decisions": [
+                {
+                    "type": "edit",
+                    "edited_action": {
+                        "name": "send_email",
+                        "args": {"to": "edited@example.com"},
+                    },
+                }
+            ]
+        }
+        mw._raise_interrupt = lambda **kw: _normalize_permission_decision(  # type: ignore[assignment]
+            original(**kw)
+        )
+        state = {
+            "messages": [
+                _msg(
+                    {
+                        "name": "send_email",
+                        "args": {"to": "original@example.com"},
+                        "id": "1",
+                    }
+                )
+            ]
+        }
+        out = mw.after_model(state, _FakeRuntime())
+        assert out is None
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_permissions.py b/surfsense_backend/tests/unit/agents/new_chat/test_permissions.py
new file mode 100644
index 0000000000..8ec16617ac
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_permissions.py
@@ -0,0 +1,111 @@
+"""Tests for the wildcard matcher and rule evaluator (parity with OpenCode evaluate.ts)."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.permissions import (
+    Rule,
+    Ruleset,
+    aggregate_action,
+    evaluate,
+    evaluate_many,
+    wildcard_match,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class TestWildcardMatch:
+    @pytest.mark.parametrize(
+        "value,pattern,expected",
+        [
+            ("edit", "edit", True),
+            ("edit", "*", True),
+            ("read", "edit", False),
+            ("/documents/secrets/x", "/documents/secrets/**", True),
+            # Single-segment glob: '*' does not cross '/'
+            ("/documents/secrets/x", "/documents/*/x", True),
+            ("/documents/foo/bar/x", "/documents/*/x", False),
+            ("/documents/foo/x", "/documents/*/x", True),
+            ("linear_create", "linear_*", True),
+            ("notion_create", "linear_*", False),
+            # ':' is not a separator, so '*' matches it
+            ("mcp:notion:create_page", "mcp:*", True),
+            ("mcp:notion:create_page", "mcp:**", True),
+            # But '/' IS a separator
+            ("foo/bar", "foo/*", True),
+            ("foo/bar/baz", "foo/*", False),
+        ],
+    )
+    def test_match(self, value: str, pattern: str, expected: bool) -> None:
+        assert wildcard_match(value, pattern) is expected
+
+
+class TestEvaluate:
+    def test_default_action_is_ask(self) -> None:
+        rule = evaluate("edit", "/foo/bar")
+        assert rule.action == "ask"
+        assert rule.permission == "edit"
+
+    def test_last_match_wins(self) -> None:
+        rs = Ruleset(
+            rules=[
+                Rule("edit", "*", "allow"),
+                Rule("edit", "/secrets/**", "deny"),
+            ]
+        )
+        # Second rule (deny) is more specific AND specified later
+        assert evaluate("edit", "/secrets/x", rs).action == "deny"
+        # First rule (allow) covers the rest
+        assert evaluate("edit", "/public/x", rs).action == "allow"
+
+    def test_layered_rulesets_later_overrides_earlier(self) -> None:
+        defaults = Ruleset(rules=[Rule("edit", "*", "ask")], origin="defaults")
+        space = Ruleset(rules=[Rule("edit", "*", "allow")], origin="space")
+        thread = Ruleset(rules=[Rule("edit", "*", "deny")], origin="thread")
+        # All three layered: thread wins
+        assert evaluate("edit", "x", defaults, space, thread).action == "deny"
+        # Without thread: space wins
+        assert evaluate("edit", "x", defaults, space).action == "allow"
+
+    def test_permission_wildcard(self) -> None:
+        rs = Ruleset(rules=[Rule("linear_*", "*", "allow")])
+        assert evaluate("linear_create_issue", "x", rs).action == "allow"
+        assert evaluate("notion_create", "x", rs).action == "ask"
+
+    def test_pattern_wildcard(self) -> None:
+        rs = Ruleset(rules=[Rule("edit", "/documents/secrets/**", "deny")])
+        assert evaluate("edit", "/documents/secrets/foo", rs).action == "deny"
+        assert evaluate("edit", "/documents/public/foo", rs).action == "ask"
+
+    def test_evaluate_many(self) -> None:
+        rs = Ruleset(
+            rules=[
+                Rule("edit", "*", "allow"),
+                Rule("edit", "/secrets/*", "deny"),
+            ]
+        )
+        results = evaluate_many("edit", ["/public/x", "/secrets/y"], rs)
+        assert [r.action for r in results] == ["allow", "deny"]
+
+
+class TestAggregateAction:
+    def test_any_deny_means_deny(self) -> None:
+        rules = [
+            Rule("a", "*", "allow"),
+            Rule("a", "*", "deny"),
+            Rule("a", "*", "ask"),
+        ]
+        assert aggregate_action(rules) == "deny"
+
+    def test_any_ask_means_ask_when_no_deny(self) -> None:
+        rules = [Rule("a", "*", "allow"), Rule("a", "*", "ask")]
+        assert aggregate_action(rules) == "ask"
+
+    def test_all_allow_means_allow(self) -> None:
+        rules = [Rule("a", "*", "allow"), Rule("a", "*", "allow")]
+        assert aggregate_action(rules) == "allow"
+
+    def test_empty_means_ask(self) -> None:
+        assert aggregate_action([]) == "ask"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_plugin_loader.py b/surfsense_backend/tests/unit/agents/new_chat/test_plugin_loader.py
new file mode 100644
index 0000000000..5dbf765a79
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_plugin_loader.py
@@ -0,0 +1,185 @@
+"""Unit tests for the SurfSense plugin entry-point loader."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+from langchain.agents.middleware import AgentMiddleware
+
+from app.agents.new_chat.plugin_loader import (
+    PLUGIN_ENTRY_POINT_GROUP,
+    PluginContext,
+    load_allowed_plugin_names_from_env,
+    load_plugin_middlewares,
+)
+from app.agents.new_chat.plugins.year_substituter import (
+    _YearSubstituterMiddleware,
+    make_middleware as year_substituter_factory,
+)
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+class _DummyMiddleware(AgentMiddleware):
+    """Trivial middleware used as the success-path return value."""
+
+    tools = ()
+
+
+def _ctx() -> PluginContext:
+    return PluginContext.build(
+        search_space_id=1,
+        user_id="u",
+        thread_visibility="PRIVATE",  # type: ignore[arg-type]
+        llm=MagicMock(),
+    )
+
+
+class _FakeEntryPoint:
+    """Stand-in for ``importlib.metadata.EntryPoint``."""
+
+    def __init__(self, name: str, factory) -> None:
+        self.name = name
+        self._factory = factory
+
+    def load(self):
+        return self._factory
+
+
+# ---------------------------------------------------------------------------
+# Loader behaviour
+# ---------------------------------------------------------------------------
+
+
+class TestPluginLoaderBasics:
+    def test_returns_empty_when_allowlist_is_empty(self) -> None:
+        assert load_plugin_middlewares(_ctx(), allowed_plugin_names=[]) == []
+
+    def test_skips_non_allowlisted_plugin(self) -> None:
+        called = []
+
+        def factory(_):  # would be an obvious bug if called
+            called.append(True)
+            return _DummyMiddleware()
+
+        ep = _FakeEntryPoint("dangerous_plugin", factory)
+        with patch(
+            "app.agents.new_chat.plugin_loader.entry_points",
+            return_value=[ep],
+        ):
+            result = load_plugin_middlewares(
+                _ctx(), allowed_plugin_names=["allowed_only"]
+            )
+        assert result == []
+        assert not called
+
+    def test_loads_allowlisted_plugin(self) -> None:
+        ep = _FakeEntryPoint("year_substituter", year_substituter_factory)
+        with patch(
+            "app.agents.new_chat.plugin_loader.entry_points",
+            return_value=[ep],
+        ):
+            result = load_plugin_middlewares(
+                _ctx(), allowed_plugin_names={"year_substituter"}
+            )
+        assert len(result) == 1
+        assert isinstance(result[0], _YearSubstituterMiddleware)
+
+
+class TestPluginLoaderIsolation:
+    def test_factory_exception_is_isolated(self) -> None:
+        def crashing_factory(_):
+            raise RuntimeError("boom")
+
+        ep = _FakeEntryPoint("buggy", crashing_factory)
+        with patch(
+            "app.agents.new_chat.plugin_loader.entry_points",
+            return_value=[ep],
+        ):
+            result = load_plugin_middlewares(_ctx(), allowed_plugin_names={"buggy"})
+        assert result == []  # construction continued without the plugin
+
+    def test_non_middleware_return_is_rejected(self) -> None:
+        def bad_factory(_):
+            return "not a middleware"
+
+        ep = _FakeEntryPoint("liar", bad_factory)
+        with patch(
+            "app.agents.new_chat.plugin_loader.entry_points",
+            return_value=[ep],
+        ):
+            result = load_plugin_middlewares(_ctx(), allowed_plugin_names={"liar"})
+        assert result == []
+
+    def test_load_phase_exception_is_isolated(self) -> None:
+        class _BrokenEP:
+            name = "broken"
+
+            def load(self):
+                raise ImportError("cannot import")
+
+        with patch(
+            "app.agents.new_chat.plugin_loader.entry_points",
+            return_value=[_BrokenEP()],
+        ):
+            result = load_plugin_middlewares(_ctx(), allowed_plugin_names={"broken"})
+        assert result == []
+
+    def test_one_failure_does_not_block_others(self) -> None:
+        """Two plugins; one crashes during factory; the other still loads."""
+
+        def crashing_factory(_):
+            raise RuntimeError("boom")
+
+        eps = [
+            _FakeEntryPoint("crashing", crashing_factory),
+            _FakeEntryPoint("ok", year_substituter_factory),
+        ]
+        with patch("app.agents.new_chat.plugin_loader.entry_points", return_value=eps):
+            result = load_plugin_middlewares(
+                _ctx(), allowed_plugin_names={"crashing", "ok"}
+            )
+        assert len(result) == 1
+        assert isinstance(result[0], _YearSubstituterMiddleware)
+
+
+class TestAllowlistEnv:
+    def test_empty_env_returns_empty_set(self, monkeypatch) -> None:
+        monkeypatch.delenv("SURFSENSE_ALLOWED_PLUGINS", raising=False)
+        assert load_allowed_plugin_names_from_env() == set()
+
+    def test_parses_comma_separated_value(self, monkeypatch) -> None:
+        monkeypatch.setenv("SURFSENSE_ALLOWED_PLUGINS", " year_substituter , noisy , ")
+        assert load_allowed_plugin_names_from_env() == {
+            "year_substituter",
+            "noisy",
+        }
+
+
+class TestPluginContext:
+    def test_build_includes_required_fields(self) -> None:
+        llm = MagicMock()
+        ctx = PluginContext.build(
+            search_space_id=42,
+            user_id="user-1",
+            thread_visibility="PRIVATE",  # type: ignore[arg-type]
+            llm=llm,
+        )
+        assert ctx["search_space_id"] == 42
+        assert ctx["user_id"] == "user-1"
+        assert ctx["llm"] is llm
+
+    def test_does_not_carry_secrets_or_db_session(self) -> None:
+        ctx = _ctx()
+        # If a future change tries to add these keys, this test will fail loudly.
+        for forbidden in ("api_key", "secret", "db_session", "session"):
+            assert forbidden not in ctx
+
+
+class TestEntryPointGroup:
+    def test_group_name_matches_pyproject_convention(self) -> None:
+        # Plugins register under `surfsense.plugins`; this is part of our
+        # public contract for plugin authors.
+        assert PLUGIN_ENTRY_POINT_GROUP == "surfsense.plugins"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py
new file mode 100644
index 0000000000..4cf53969d4
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py
@@ -0,0 +1,370 @@
+r"""Tests for ``apply_litellm_prompt_caching`` in
+:mod:`app.agents.new_chat.prompt_caching`.
+
+The helper replaces the legacy ``AnthropicPromptCachingMiddleware`` (which
+never activated for our LiteLLM stack) with LiteLLM-native multi-provider
+prompt caching. It mutates ``llm.model_kwargs`` so the kwargs flow to
+``litellm.completion(...)``. The tests below pin its public contract:
+
+1. Always sets BOTH ``index: 0`` and ``index: -1`` injection points so
+   savings compound across multi-turn conversations on Anthropic-family
+   providers. ``index: 0`` is used (rather than ``role: system``) because
+   the deepagent stack accumulates multiple ``SystemMessage``\ s in
+   ``state["messages"]`` and ``role: system`` would tag every one of
+   them, blowing past Anthropic's 4-block ``cache_control`` cap.
+2. Adds ``prompt_cache_key``/``prompt_cache_retention`` only for
+   single-model OPENAI/DEEPSEEK/XAI configs (where OpenAI's automatic
+   prompt-cache surface is available).
+3. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
+   OpenAI-only kwargs because the router fans out across providers.
+4. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
+5. Defensive: LLMs without a writable ``model_kwargs`` are silently
+   skipped rather than raising.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from app.agents.new_chat.llm_config import AgentConfig
+from app.agents.new_chat.prompt_caching import apply_litellm_prompt_caching
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Test doubles
+# ---------------------------------------------------------------------------
+
+
+class _FakeLLM:
+    """Stand-in for ``ChatLiteLLM``/``SanitizedChatLiteLLM``.
+
+    The helper only inspects ``getattr(llm, "model_kwargs", None)``,
+    ``getattr(llm, "model", None)``, and ``type(llm).__name__``. A simple
+    object suffices — we don't need to spin up real LangChain/LiteLLM
+    machinery for unit tests of the helper's logic.
+    """
+
+    def __init__(
+        self,
+        model: str = "openai/gpt-4o",
+        model_kwargs: dict[str, Any] | None = None,
+    ) -> None:
+        self.model = model
+        self.model_kwargs: dict[str, Any] = dict(model_kwargs) if model_kwargs else {}
+
+
+class ChatLiteLLMRouter:
+    """Class-name-only impostor of the real router.
+
+    The helper's router gate is ``type(llm).__name__ == "ChatLiteLLMRouter"``
+    (a deliberate stringly-typed check to avoid an import cycle with
+    ``app.services.llm_router_service``). Reusing the same class name here
+    triggers the same code path without instantiating a real ``Router``.
+    """
+
+    def __init__(self) -> None:
+        self.model = "auto"
+        self.model_kwargs: dict[str, Any] = {}
+
+
+def _make_cfg(**overrides: Any) -> AgentConfig:
+    """Build an ``AgentConfig`` with sensible defaults for the helper test."""
+    defaults: dict[str, Any] = {
+        "provider": "OPENAI",
+        "model_name": "gpt-4o",
+        "api_key": "k",
+    }
+    return AgentConfig(**{**defaults, **overrides})
+
+
+# ---------------------------------------------------------------------------
+# (a) Universal injection points
+# ---------------------------------------------------------------------------
+
+
+def test_sets_both_cache_control_injection_points_with_no_config() -> None:
+    """Bare call (no agent_config, no thread_id) still sets the two
+    universal breakpoints — these cost nothing on providers that don't
+    consume them and unlock caching on every supported provider."""
+    llm = _FakeLLM()
+
+    apply_litellm_prompt_caching(llm)
+
+    points = llm.model_kwargs["cache_control_injection_points"]
+    assert {"location": "message", "index": 0} in points
+    assert {"location": "message", "index": -1} in points
+    assert len(points) == 2
+
+
+def test_does_not_inject_role_system_breakpoint() -> None:
+    """Regression: deliberately AVOID ``role: system`` so we don't tag
+    every SystemMessage the deepagent ``before_agent`` injectors push
+    into ``state["messages"]`` (priority, tree, memory, file-intent,
+    anonymous-doc). Tagging all of them overflows Anthropic's 4-block
+    ``cache_control`` cap and surfaces as
+    ``OpenrouterException: A maximum of 4 blocks with cache_control may
+    be provided. Found N`` 400s.
+    """
+    llm = _FakeLLM()
+    apply_litellm_prompt_caching(llm)
+    points = llm.model_kwargs["cache_control_injection_points"]
+    assert all(p.get("role") != "system" for p in points), (
+        f"Expected no role=system breakpoint, got: {points}"
+    )
+
+
+def test_injection_points_set_for_anthropic_config() -> None:
+    """Anthropic-family configs need the marker — verify it lands."""
+    cfg = _make_cfg(provider="ANTHROPIC", model_name="claude-3-5-sonnet")
+    llm = _FakeLLM(model="anthropic/claude-3-5-sonnet")
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg)
+
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
+# ---------------------------------------------------------------------------
+# (b) Idempotency / user override wins
+# ---------------------------------------------------------------------------
+
+
+def test_does_not_overwrite_user_supplied_cache_control_injection_points() -> None:
+    """Users who set their own injection points (e.g. with ``ttl: "1h"``
+    via ``litellm_params``) keep them — the helper merges, never
+    clobbers."""
+    user_points = [
+        {"location": "message", "role": "system", "ttl": "1h"},
+    ]
+    llm = _FakeLLM(
+        model_kwargs={"cache_control_injection_points": user_points},
+    )
+
+    apply_litellm_prompt_caching(llm)
+
+    assert llm.model_kwargs["cache_control_injection_points"] is user_points
+
+
+def test_idempotent_when_called_multiple_times() -> None:
+    """Build-time + thread-time double-call must be a no-op the second time."""
+    cfg = _make_cfg(provider="OPENAI")
+    llm = _FakeLLM()
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=1)
+    snapshot = {
+        "cache_control_injection_points": list(
+            llm.model_kwargs["cache_control_injection_points"]
+        ),
+        "prompt_cache_key": llm.model_kwargs["prompt_cache_key"],
+        "prompt_cache_retention": llm.model_kwargs["prompt_cache_retention"],
+    }
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=1)
+
+    assert (
+        llm.model_kwargs["cache_control_injection_points"]
+        == snapshot["cache_control_injection_points"]
+    )
+    assert llm.model_kwargs["prompt_cache_key"] == snapshot["prompt_cache_key"]
+    assert (
+        llm.model_kwargs["prompt_cache_retention"] == snapshot["prompt_cache_retention"]
+    )
+
+
+def test_does_not_overwrite_user_supplied_prompt_cache_key() -> None:
+    """A pre-set ``prompt_cache_key`` (e.g. tenant-aware override via
+    ``litellm_params``) wins over our default per-thread key."""
+    cfg = _make_cfg(provider="OPENAI")
+    llm = _FakeLLM(model_kwargs={"prompt_cache_key": "tenant-abc"})
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
+
+    assert llm.model_kwargs["prompt_cache_key"] == "tenant-abc"
+
+
+# ---------------------------------------------------------------------------
+# (c) OpenAI-family extras (OPENAI / DEEPSEEK / XAI)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"])
+def test_sets_openai_family_extras(provider: str) -> None:
+    """OpenAI-style providers gain ``prompt_cache_key`` (raises hit rate
+    via routing affinity) and ``prompt_cache_retention="24h"`` (extends
+    cache TTL beyond the default 5-10 min)."""
+    cfg = _make_cfg(provider=provider)
+    llm = _FakeLLM()
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
+
+    assert llm.model_kwargs["prompt_cache_key"] == "surfsense-thread-42"
+    assert llm.model_kwargs["prompt_cache_retention"] == "24h"
+
+
+def test_skips_prompt_cache_key_when_no_thread_id() -> None:
+    """Without a thread id we can't construct a per-thread key. Retention
+    is still useful so we set it (it's free)."""
+    cfg = _make_cfg(provider="OPENAI")
+    llm = _FakeLLM()
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=None)
+
+    assert "prompt_cache_key" not in llm.model_kwargs
+    assert llm.model_kwargs["prompt_cache_retention"] == "24h"
+
+
+@pytest.mark.parametrize(
+    "provider",
+    ["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"],
+)
+def test_no_openai_extras_for_other_providers(provider: str) -> None:
+    """Non-OpenAI-family providers don't expose ``prompt_cache_key`` —
+    skip it. ``cache_control_injection_points`` is still set (universal)."""
+    cfg = _make_cfg(provider=provider)
+    llm = _FakeLLM()
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
+
+    assert "prompt_cache_key" not in llm.model_kwargs
+    assert "prompt_cache_retention" not in llm.model_kwargs
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
+def test_no_openai_extras_in_auto_mode() -> None:
+    """Auto-mode fans out across mixed providers — we can't statically
+    target OpenAI-only kwargs."""
+    cfg = AgentConfig.from_auto_mode()
+    llm = _FakeLLM()
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
+
+    assert "prompt_cache_key" not in llm.model_kwargs
+    assert "prompt_cache_retention" not in llm.model_kwargs
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
+def test_no_openai_extras_for_custom_provider() -> None:
+    """Custom providers route through arbitrary user-supplied prefixes —
+    we don't try to infer OpenAI-family compatibility."""
+    cfg = _make_cfg(provider="OPENAI", custom_provider="my_proxy")
+    llm = _FakeLLM()
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
+
+    assert "prompt_cache_key" not in llm.model_kwargs
+    assert "prompt_cache_retention" not in llm.model_kwargs
+
+
+# ---------------------------------------------------------------------------
+# (d) ChatLiteLLMRouter — universal injection points only
+# ---------------------------------------------------------------------------
+
+
+def test_router_llm_gets_only_universal_injection_points() -> None:
+    """Even with an OpenAI-flavoured config, a ``ChatLiteLLMRouter`` must
+    receive only the universal injection points — its requests dispatch
+    across provider deployments and OpenAI-only kwargs would be wasted
+    (or stripped by ``drop_params``) on non-OpenAI legs."""
+    router = ChatLiteLLMRouter()
+    cfg = _make_cfg(provider="OPENAI")
+
+    apply_litellm_prompt_caching(router, agent_config=cfg, thread_id=42)
+
+    assert "cache_control_injection_points" in router.model_kwargs
+    assert "prompt_cache_key" not in router.model_kwargs
+    assert "prompt_cache_retention" not in router.model_kwargs
+
+
+# ---------------------------------------------------------------------------
+# (e) Defensive paths
+# ---------------------------------------------------------------------------
+
+
+def test_handles_llm_with_no_writable_model_kwargs() -> None:
+    """Some LLM implementations (e.g. fakes / minimal subclasses) don't
+    expose a writable ``model_kwargs``. The helper must skip silently —
+    raising would crash the entire LLM build path on a non-critical
+    optimisation."""
+
+    class _ImmutableLLM:
+        # ``__slots__`` blocks attribute creation, so ``setattr`` raises.
+        __slots__ = ("model",)
+
+        def __init__(self) -> None:
+            self.model = "openai/gpt-4o"
+
+    llm = _ImmutableLLM()
+
+    apply_litellm_prompt_caching(llm)
+
+
+def test_initialises_missing_model_kwargs_dict() -> None:
+    """When ``model_kwargs`` is present-but-None (Pydantic v2 default
+    pattern when no factory is set), the helper initialises it to an
+    empty dict before mutating."""
+
+    class _LazyLLM:
+        def __init__(self) -> None:
+            self.model = "openai/gpt-4o"
+            self.model_kwargs: dict[str, Any] | None = None
+
+    llm = _LazyLLM()
+
+    apply_litellm_prompt_caching(llm)
+
+    assert isinstance(llm.model_kwargs, dict)
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
+def test_falls_back_to_llm_model_prefix_when_no_agent_config() -> None:
+    """Direct caller path (e.g. ``create_chat_litellm_from_config`` for
+    YAML configs without a structured ``AgentConfig``): without
+    ``agent_config`` the helper sets only the universal injection points
+    — no OpenAI-family extras even if the prefix says ``openai/``.
+    Conservative: we'd rather miss the speedup than silently misroute."""
+    llm = _FakeLLM(model="openai/gpt-4o")
+
+    apply_litellm_prompt_caching(llm, agent_config=None, thread_id=99)
+
+    assert "cache_control_injection_points" in llm.model_kwargs
+    assert "prompt_cache_key" not in llm.model_kwargs
+    assert "prompt_cache_retention" not in llm.model_kwargs
+
+
+# ---------------------------------------------------------------------------
+# (f) drop_params safety net (regression guard for #19346)
+# ---------------------------------------------------------------------------
+
+
+def test_litellm_drop_params_is_globally_enabled() -> None:
+    """``litellm.drop_params=True`` is set globally in
+    :mod:`app.services.llm_service` so any ``prompt_cache_key`` /
+    ``prompt_cache_retention`` we set on an OpenAI-family config is
+    auto-stripped if the request later routes to a non-supporting
+    provider (e.g. via auto-mode router fallback). This test pins that
+    invariant — losing it would mean Bedrock/Vertex 400s on ``prompt_cache_key``.
+    """
+    import litellm
+
+    import app.services.llm_service  # noqa: F401  (side-effect: sets globals)
+
+    assert litellm.drop_params is True
+
+
+# ---------------------------------------------------------------------------
+# Regression note: LiteLLM #15696 (multi-content-block last message)
+# ---------------------------------------------------------------------------
+#
+# Before LiteLLM 1.81 a list-form last message ``[block_a, block_b]``
+# would get ``cache_control`` applied to *every* content block instead
+# of only the last one — wasting cache breakpoints and triggering 400s
+# on Anthropic when it exceeded the 4-breakpoint limit. Fixed in
+# https://github.com/BerriAI/litellm/pull/15699.
+#
+# We pin ``litellm>=1.83.7`` in ``pyproject.toml`` (well past the fix).
+# An end-to-end behavioural test would need to run ``litellm.completion``
+# through the Anthropic transformer, which is integration territory and
+# better covered by LiteLLM's own test suite. The unit guard here is the
+# version pin plus the build-time ``model_kwargs`` shape we verify above.
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_resolve_prompt_model_name.py b/surfsense_backend/tests/unit/agents/new_chat/test_resolve_prompt_model_name.py
new file mode 100644
index 0000000000..ffe3dbaa4e
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_resolve_prompt_model_name.py
@@ -0,0 +1,117 @@
+"""Tests for ``_resolve_prompt_model_name`` in :mod:`app.agents.new_chat.chat_deepagent`.
+
+The helper picks the model id fed to ``detect_provider_variant`` so the
+right ``<provider_hints>`` block lands in the system prompt. The tests
+below pin its preference order:
+
+1. ``agent_config.litellm_params["base_model"]`` (Azure-correct).
+2. ``agent_config.model_name``.
+3. ``getattr(llm, "model", None)``.
+
+Without (1) an Azure deployment named e.g. ``"prod-chat-001"`` would
+silently miss every provider regex.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.chat_deepagent import _resolve_prompt_model_name
+from app.agents.new_chat.llm_config import AgentConfig
+
+pytestmark = pytest.mark.unit
+
+
+def _make_cfg(**overrides) -> AgentConfig:
+    """Build an ``AgentConfig`` with sensible defaults for the helper test."""
+    defaults = {
+        "provider": "OPENAI",
+        "model_name": "x",
+        "api_key": "k",
+    }
+    return AgentConfig(**{**defaults, **overrides})
+
+
+class _FakeLLM:
+    """Stand-in for a ``ChatLiteLLM`` / ``ChatLiteLLMRouter`` instance.
+
+    The resolver only reads the ``.model`` attribute via ``getattr``,
+    matching the established idiom in ``knowledge_search.py`` /
+    ``stream_new_chat.py`` / ``document_summarizer.py``.
+    """
+
+    def __init__(self, model: str | None) -> None:
+        self.model = model
+
+
+def test_prefers_litellm_params_base_model_over_deployment_name() -> None:
+    """Azure deployment slug must NOT shadow the underlying model family.
+
+    This is the failure mode the helper exists to prevent: a deployment
+    named ``"azure/prod-chat-001"`` would not match any provider regex
+    on its own, but the family ``"gpt-4o"`` lives in
+    ``litellm_params["base_model"]`` and routes to ``openai_classic``.
+    """
+    cfg = _make_cfg(
+        model_name="azure/prod-chat-001",
+        litellm_params={"base_model": "gpt-4o"},
+    )
+    assert _resolve_prompt_model_name(cfg, _FakeLLM("azure/prod-chat-001")) == "gpt-4o"
+
+
+def test_falls_back_to_model_name_when_litellm_params_is_none() -> None:
+    cfg = _make_cfg(
+        model_name="anthropic/claude-3-5-sonnet",
+        litellm_params=None,
+    )
+    got = _resolve_prompt_model_name(cfg, _FakeLLM("anthropic/claude-3-5-sonnet"))
+    assert got == "anthropic/claude-3-5-sonnet"
+
+
+def test_handles_litellm_params_without_base_model_key() -> None:
+    cfg = _make_cfg(
+        model_name="openai/gpt-4o",
+        litellm_params={"temperature": 0.5},
+    )
+    assert _resolve_prompt_model_name(cfg, _FakeLLM("openai/gpt-4o")) == "openai/gpt-4o"
+
+
+def test_ignores_blank_base_model() -> None:
+    """Whitespace-only ``base_model`` must not shadow ``model_name``."""
+    cfg = _make_cfg(
+        model_name="openai/gpt-4o",
+        litellm_params={"base_model": "   "},
+    )
+    assert _resolve_prompt_model_name(cfg, _FakeLLM("openai/gpt-4o")) == "openai/gpt-4o"
+
+
+def test_ignores_non_string_base_model() -> None:
+    """Defensive: a non-string ``base_model`` should not crash the resolver."""
+    cfg = _make_cfg(
+        model_name="openai/gpt-4o",
+        litellm_params={"base_model": 42},
+    )
+    assert _resolve_prompt_model_name(cfg, _FakeLLM("openai/gpt-4o")) == "openai/gpt-4o"
+
+
+def test_falls_back_to_llm_model_when_no_agent_config() -> None:
+    """No ``agent_config`` -> use ``llm.model`` directly. Defensive path
+    for direct callers; production callers always supply a config."""
+    assert (
+        _resolve_prompt_model_name(None, _FakeLLM("openai/gpt-4o-mini"))
+        == "openai/gpt-4o-mini"
+    )
+
+
+def test_returns_none_when_nothing_available() -> None:
+    """``compose_system_prompt`` treats ``None`` as the ``"default"``
+    variant and emits no provider block."""
+    assert _resolve_prompt_model_name(None, _FakeLLM(None)) is None
+
+
+def test_auto_mode_resolves_to_auto_string() -> None:
+    """Auto mode -> ``"auto"``. ``detect_provider_variant("auto")``
+    returns ``"default"``, which is correct: the child model isn't
+    known until the LiteLLM Router dispatches."""
+    cfg = AgentConfig.from_auto_mode()
+    assert _resolve_prompt_model_name(cfg, _FakeLLM("auto")) == "auto"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_retry_after.py b/surfsense_backend/tests/unit/agents/new_chat/test_retry_after.py
new file mode 100644
index 0000000000..d23fd693b3
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_retry_after.py
@@ -0,0 +1,107 @@
+"""Tests for RetryAfterMiddleware Retry-After parsing and retry decision logic."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.middleware.retry_after import (
+    RetryAfterMiddleware,
+    _extract_retry_after_seconds,
+    _is_non_retryable,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class _FakeResponse:
+    def __init__(self, headers: dict[str, str]) -> None:
+        self.headers = headers
+
+
+class _FakeRateLimitError(Exception):
+    def __init__(self, msg: str, headers: dict[str, str] | None = None) -> None:
+        super().__init__(msg)
+        if headers is not None:
+            self.response = _FakeResponse(headers)
+
+
+class TestExtractRetryAfter:
+    def test_seconds_header(self) -> None:
+        exc = _FakeRateLimitError("rate", {"Retry-After": "30"})
+        assert _extract_retry_after_seconds(exc) == 30.0
+
+    def test_milliseconds_header_overrides_seconds(self) -> None:
+        exc = _FakeRateLimitError("rate", {"retry-after-ms": "1500"})
+        assert _extract_retry_after_seconds(exc) == 1.5
+
+    def test_case_insensitive(self) -> None:
+        exc = _FakeRateLimitError("rate", {"RETRY-AFTER": "12"})
+        assert _extract_retry_after_seconds(exc) == 12.0
+
+    def test_falls_back_to_message_regex(self) -> None:
+        exc = Exception("Please retry after 7 seconds")
+        assert _extract_retry_after_seconds(exc) == 7.0
+
+    def test_returns_none_when_no_hint(self) -> None:
+        exc = Exception("oops")
+        assert _extract_retry_after_seconds(exc) is None
+
+    def test_handles_missing_headers_attr(self) -> None:
+        exc = ValueError("no headers")
+        assert _extract_retry_after_seconds(exc) is None
+
+
+class TestIsNonRetryable:
+    @pytest.mark.parametrize(
+        "name",
+        ["ContextWindowExceededError", "AuthenticationError", "InvalidRequestError"],
+    )
+    def test_non_retryable_classes(self, name: str) -> None:
+        cls = type(name, (Exception,), {})
+        assert _is_non_retryable(cls("x")) is True
+
+    def test_generic_exception_is_retryable(self) -> None:
+        assert _is_non_retryable(RuntimeError("transient")) is False
+
+
+class TestDelayCalculation:
+    def test_takes_max_of_backoff_and_header(self) -> None:
+        mw = RetryAfterMiddleware(max_retries=3, initial_delay=1.0, jitter=False)
+        exc = _FakeRateLimitError("rl", {"retry-after": "10"})
+        delay = mw._delay_for_attempt(0, exc)
+        assert delay == pytest.approx(10.0)
+
+    def test_uses_backoff_when_no_header(self) -> None:
+        mw = RetryAfterMiddleware(
+            max_retries=3, initial_delay=2.0, backoff_factor=2.0, jitter=False
+        )
+        delay = mw._delay_for_attempt(2, RuntimeError("transient"))
+        # 2 * 2^2 = 8
+        assert delay == pytest.approx(8.0)
+
+    def test_caps_at_max_delay(self) -> None:
+        mw = RetryAfterMiddleware(
+            max_retries=3,
+            initial_delay=10.0,
+            backoff_factor=10.0,
+            max_delay=15.0,
+            jitter=False,
+        )
+        delay = mw._delay_for_attempt(5, RuntimeError("x"))
+        assert delay <= 15.0
+
+
+class TestShouldRetry:
+    def test_default_retries_generic(self) -> None:
+        mw = RetryAfterMiddleware()
+        assert mw._should_retry(RuntimeError("transient")) is True
+
+    def test_default_skips_non_retryable(self) -> None:
+        mw = RetryAfterMiddleware()
+        cls = type("ContextWindowExceededError", (Exception,), {})
+        assert mw._should_retry(cls("too big")) is False
+
+    def test_custom_retry_on(self) -> None:
+        mw = RetryAfterMiddleware(retry_on=lambda exc: isinstance(exc, ValueError))
+        assert mw._should_retry(ValueError()) is True
+        assert mw._should_retry(KeyError()) is False
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_rm_rmdir_cloud.py b/surfsense_backend/tests/unit/agents/new_chat/test_rm_rmdir_cloud.py
new file mode 100644
index 0000000000..7cabb6524e
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_rm_rmdir_cloud.py
@@ -0,0 +1,333 @@
+"""Cloud-mode behavior tests for the new ``rm`` and ``rmdir`` filesystem tools.
+
+The tools build ``Command(update=...)`` payloads that the persistence
+middleware applies at end of turn. These tests stub out the backend and
+runtime to assert the staging payload shape:
+
+* ``rm`` queues into ``pending_deletes`` and tombstones state files.
+* ``rm`` rejects directories, ``/documents``, root, and the anonymous doc.
+* ``rmdir`` queues into ``pending_dir_deletes`` and rejects non-empty dirs.
+* ``rmdir`` un-stages a same-turn ``mkdir`` rather than queuing a delete.
+* ``rmdir`` refuses to drop the cwd or any of its ancestors.
+* ``KBPostgresBackend`` view-helpers honor staged deletes.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import Any
+from unittest.mock import AsyncMock
+
+import pytest
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware
+from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
+
+pytestmark = pytest.mark.unit
+
+
+def _make_middleware(mode: FilesystemMode = FilesystemMode.CLOUD):
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._filesystem_mode = mode
+    middleware._custom_tool_descriptions = {}
+    return middleware
+
+
+def _runtime(state: dict[str, Any] | None = None, *, tool_call_id: str = "tc-abc"):
+    state = state or {}
+    state.setdefault("cwd", "/documents")
+    return SimpleNamespace(state=state, tool_call_id=tool_call_id)
+
+
+class _KBBackendStub(KBPostgresBackend):
+    """Construct-able subclass of :class:`KBPostgresBackend` for tests.
+
+    We bypass the real ``__init__`` (which expects a runtime + DB session)
+    and inject just the methods the rm/rmdir tools touch. The class
+    inheritance keeps ``isinstance(backend, KBPostgresBackend)`` checks
+    inside the tools happy, which is what gates them from the desktop
+    code path.
+    """
+
+    def __init__(self, *, children=None, file_data=None) -> None:
+        self.als_info = AsyncMock(return_value=children or [])
+        self._load_file_data = AsyncMock(
+            return_value=(file_data, 17) if file_data is not None else None
+        )
+
+
+def _make_backend_stub(*, children=None, file_data=None) -> KBPostgresBackend:
+    return _KBBackendStub(children=children, file_data=file_data)
+
+
+def _bind_backend(middleware, backend):
+    """Inject a backend resolver onto the middleware test instance."""
+    middleware._get_backend = lambda runtime: backend
+    return backend
+
+
+# ---------------------------------------------------------------------------
+# rm
+# ---------------------------------------------------------------------------
+
+
+class TestRmStaging:
+    @pytest.mark.asyncio
+    async def test_stages_delete_and_tombstones_state(self):
+        m = _make_middleware()
+        _bind_backend(m, _make_backend_stub(children=[], file_data={"content": ["x"]}))
+        runtime = _runtime(
+            {
+                "cwd": "/documents",
+                "files": {"/documents/notes.md": {"content": ["hello"]}},
+                "doc_id_by_path": {"/documents/notes.md": 17},
+            },
+            tool_call_id="tc-1",
+        )
+
+        tool = m._create_rm_tool()
+        result = await tool.coroutine("/documents/notes.md", runtime=runtime)
+
+        assert hasattr(result, "update"), f"expected Command, got {result!r}"
+        update = result.update
+        assert update["pending_deletes"] == [
+            {"path": "/documents/notes.md", "tool_call_id": "tc-1"}
+        ]
+        assert update["files"] == {"/documents/notes.md": None}
+        assert update["doc_id_by_path"] == {"/documents/notes.md": None}
+
+    @pytest.mark.asyncio
+    async def test_rejects_documents_root(self):
+        m = _make_middleware()
+        runtime = _runtime()
+        tool = m._create_rm_tool()
+        result = await tool.coroutine("/documents", runtime=runtime)
+        assert isinstance(result, str)
+        assert "refusing to rm" in result
+
+    @pytest.mark.asyncio
+    async def test_rejects_root(self):
+        m = _make_middleware()
+        runtime = _runtime()
+        tool = m._create_rm_tool()
+        result = await tool.coroutine("/", runtime=runtime)
+        assert isinstance(result, str)
+        assert "refusing to rm" in result
+
+    @pytest.mark.asyncio
+    async def test_rejects_directory_via_staged_dirs(self):
+        m = _make_middleware()
+        runtime = _runtime(
+            {
+                "staged_dirs": ["/documents/team-x"],
+            }
+        )
+        tool = m._create_rm_tool()
+        result = await tool.coroutine("/documents/team-x", runtime=runtime)
+        assert isinstance(result, str)
+        assert "directory" in result.lower()
+        assert "rmdir" in result
+
+    @pytest.mark.asyncio
+    async def test_rejects_directory_via_listing(self):
+        m = _make_middleware()
+        _bind_backend(
+            m,
+            _make_backend_stub(
+                children=[{"path": "/documents/foo/x.md", "is_dir": False}]
+            ),
+        )
+        runtime = _runtime()
+        tool = m._create_rm_tool()
+        result = await tool.coroutine("/documents/foo", runtime=runtime)
+        assert isinstance(result, str)
+        assert "directory" in result.lower()
+
+    @pytest.mark.asyncio
+    async def test_rejects_anonymous_doc(self):
+        m = _make_middleware()
+        runtime = _runtime(
+            {
+                "kb_anon_doc": {
+                    "path": "/documents/uploaded.xml",
+                    "title": "uploaded",
+                    "content": "",
+                    "chunks": [],
+                }
+            }
+        )
+        tool = m._create_rm_tool()
+        result = await tool.coroutine("/documents/uploaded.xml", runtime=runtime)
+        assert isinstance(result, str)
+        assert "read-only" in result
+
+    @pytest.mark.asyncio
+    async def test_drops_path_from_dirty_paths(self):
+        m = _make_middleware()
+        _bind_backend(m, _make_backend_stub(children=[], file_data={"content": ["x"]}))
+        runtime = _runtime(
+            {
+                "files": {"/documents/notes.md": {"content": ["x"]}},
+                "doc_id_by_path": {"/documents/notes.md": 17},
+                "dirty_paths": ["/documents/notes.md"],
+            }
+        )
+        tool = m._create_rm_tool()
+        result = await tool.coroutine("/documents/notes.md", runtime=runtime)
+        update = result.update
+        # First element is _CLEAR sentinel; the rest must NOT contain the
+        # rm'd path.
+        dirty = update.get("dirty_paths") or []
+        assert "/documents/notes.md" not in dirty[1:]
+
+
+# ---------------------------------------------------------------------------
+# rmdir
+# ---------------------------------------------------------------------------
+
+
+class TestRmdirStaging:
+    @pytest.mark.asyncio
+    async def test_stages_dir_delete_when_empty_and_db_backed(self):
+        m = _make_middleware()
+        backend = _bind_backend(m, _make_backend_stub(children=[]))
+        # Override _load_file_data to return None (folder, not a file) and
+        # parent listing to claim the folder exists.
+        backend._load_file_data = AsyncMock(return_value=None)
+        backend.als_info = AsyncMock(
+            side_effect=[
+                [],  # children of /documents/proj
+                [
+                    {"path": "/documents/proj", "is_dir": True},
+                ],  # parent listing
+            ]
+        )
+        runtime = _runtime(
+            {
+                "cwd": "/documents",
+            },
+            tool_call_id="tc-rd",
+        )
+
+        tool = m._create_rmdir_tool()
+        result = await tool.coroutine("/documents/proj", runtime=runtime)
+
+        assert hasattr(result, "update")
+        update = result.update
+        assert update["pending_dir_deletes"] == [
+            {"path": "/documents/proj", "tool_call_id": "tc-rd"}
+        ]
+
+    @pytest.mark.asyncio
+    async def test_rejects_non_empty(self):
+        m = _make_middleware()
+        _bind_backend(
+            m,
+            _make_backend_stub(
+                children=[{"path": "/documents/proj/x.md", "is_dir": False}]
+            ),
+        )
+        runtime = _runtime()
+        tool = m._create_rmdir_tool()
+        result = await tool.coroutine("/documents/proj", runtime=runtime)
+        assert isinstance(result, str)
+        assert "not empty" in result
+
+    @pytest.mark.asyncio
+    async def test_unstages_same_turn_mkdir(self):
+        m = _make_middleware()
+        _bind_backend(m, _make_backend_stub(children=[]))
+        runtime = _runtime(
+            {
+                "cwd": "/documents",
+                "staged_dirs": ["/documents/scratch"],
+            },
+            tool_call_id="tc-rd",
+        )
+        tool = m._create_rmdir_tool()
+        result = await tool.coroutine("/documents/scratch", runtime=runtime)
+
+        assert hasattr(result, "update")
+        update = result.update
+        assert "pending_dir_deletes" not in update
+        # _CLEAR sentinel + remaining items (in this case, none).
+        staged_after = update["staged_dirs"]
+        assert staged_after[0] == "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00"
+        assert "/documents/scratch" not in staged_after[1:]
+
+    @pytest.mark.asyncio
+    async def test_rejects_root(self):
+        m = _make_middleware()
+        runtime = _runtime()
+        tool = m._create_rmdir_tool()
+        for victim in ("/", "/documents"):
+            result = await tool.coroutine(victim, runtime=runtime)
+            assert isinstance(result, str)
+            assert "refusing to rmdir" in result
+
+    @pytest.mark.asyncio
+    async def test_rejects_cwd(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "/documents/proj"})
+        tool = m._create_rmdir_tool()
+        result = await tool.coroutine("/documents/proj", runtime=runtime)
+        assert isinstance(result, str)
+        assert "cwd" in result.lower()
+
+    @pytest.mark.asyncio
+    async def test_rejects_ancestor_of_cwd(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "/documents/proj/sub"})
+        tool = m._create_rmdir_tool()
+        result = await tool.coroutine("/documents/proj", runtime=runtime)
+        assert isinstance(result, str)
+        assert "cwd" in result.lower()
+
+    @pytest.mark.asyncio
+    async def test_rejects_files(self):
+        m = _make_middleware()
+        _bind_backend(m, _make_backend_stub(children=[], file_data={"content": ["x"]}))
+        runtime = _runtime()
+        tool = m._create_rmdir_tool()
+        result = await tool.coroutine("/documents/notes.md", runtime=runtime)
+        assert isinstance(result, str)
+        assert "is a file" in result
+
+
+# ---------------------------------------------------------------------------
+# KBPostgresBackend view filter
+# ---------------------------------------------------------------------------
+
+
+class TestKBPostgresBackendDeleteFilter:
+    """als_info / glob / grep should suppress paths queued for delete."""
+
+    def _make_backend(self, state: dict[str, Any]) -> KBPostgresBackend:
+        runtime = SimpleNamespace(state=state)
+        backend = KBPostgresBackend(search_space_id=1, runtime=runtime)
+        return backend
+
+    def test_pending_filesystem_view_returns_deleted_paths(self):
+        backend = self._make_backend(
+            {
+                "pending_deletes": [
+                    {"path": "/documents/x.md", "tool_call_id": "t1"},
+                ],
+                "pending_dir_deletes": [
+                    {"path": "/documents/d1", "tool_call_id": "t2"},
+                ],
+            }
+        )
+        removed, alias, deleted_dirs = backend._pending_filesystem_view({})
+        assert "/documents/x.md" in removed
+        assert "/documents/d1" in deleted_dirs
+        assert alias == {}
+
+    def test_dir_suppressed_covers_descendants(self):
+        backend = self._make_backend({})
+        deleted_dirs = {"/documents/d"}
+        assert backend._is_dir_suppressed("/documents/d", deleted_dirs)
+        assert backend._is_dir_suppressed("/documents/d/x.md", deleted_dirs)
+        assert backend._is_dir_suppressed("/documents/d/sub/y.md", deleted_dirs)
+        assert not backend._is_dir_suppressed("/documents/other.md", deleted_dirs)
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_skills_backends.py b/surfsense_backend/tests/unit/agents/new_chat/test_skills_backends.py
new file mode 100644
index 0000000000..eb9cf396c1
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_skills_backends.py
@@ -0,0 +1,242 @@
+"""Tests for the skills backends used by SurfSense's SkillsMiddleware."""
+
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+
+import pytest
+
+from app.agents.new_chat.middleware.skills_backends import (
+    SKILLS_BUILTIN_PREFIX,
+    SKILLS_SPACE_PREFIX,
+    BuiltinSkillsBackend,
+    SearchSpaceSkillsBackend,
+    build_skills_backend_factory,
+    default_skills_sources,
+)
+
+
+@pytest.fixture
+def skills_root(tmp_path: Path) -> Path:
+    """Build a small synthetic skill-tree used by the tests."""
+    root = tmp_path / "skills"
+    (root / "alpha").mkdir(parents=True)
+    (root / "alpha" / "SKILL.md").write_text(
+        "---\nname: alpha\ndescription: alpha skill\n---\n# Alpha\n"
+    )
+    (root / "beta").mkdir(parents=True)
+    (root / "beta" / "SKILL.md").write_text(
+        "---\nname: beta\ndescription: beta skill\n---\n# Beta\n"
+    )
+    (root / "_orphan_file.md").write_text("not a skill, just a stray file")
+    return root
+
+
+class TestBuiltinSkillsBackendListing:
+    def test_lists_skill_directories_at_root(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        infos = backend.ls_info("/")
+        names = {info["path"] for info in infos}
+        assert "/alpha" in names
+        assert "/beta" in names
+        assert "/_orphan_file.md" in names
+        for info in infos:
+            if info["path"] in {"/alpha", "/beta"}:
+                assert info["is_dir"] is True
+
+    def test_lists_skill_md_under_skill_directory(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        infos = backend.ls_info("/alpha")
+        paths = {info["path"] for info in infos}
+        assert paths == {"/alpha/SKILL.md"}
+        assert infos[0]["is_dir"] is False
+        assert infos[0]["size"] > 0
+
+    def test_returns_empty_for_missing_path(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        assert backend.ls_info("/nonexistent") == []
+
+    def test_returns_empty_when_root_missing(self, tmp_path: Path) -> None:
+        backend = BuiltinSkillsBackend(tmp_path / "definitely-missing")
+        assert backend.ls_info("/") == []
+        assert backend.download_files(["/x/SKILL.md"])[0].error == "file_not_found"
+
+    def test_refuses_path_traversal(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        assert backend.ls_info("/../../../etc") == []
+        responses = backend.download_files(["/../../../etc/passwd"])
+        assert responses[0].error == "invalid_path"
+
+
+class TestBuiltinSkillsBackendDownload:
+    def test_downloads_skill_md_content(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        responses = backend.download_files(["/alpha/SKILL.md", "/beta/SKILL.md"])
+        assert len(responses) == 2
+        assert responses[0].path == "/alpha/SKILL.md"
+        assert responses[0].content is not None
+        assert b"name: alpha" in responses[0].content
+        assert responses[1].error is None
+
+    def test_marks_directory_as_is_directory_error(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        responses = backend.download_files(["/alpha"])
+        assert responses[0].error == "is_directory"
+
+    def test_marks_missing_file_as_file_not_found(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        responses = backend.download_files(["/alpha/missing.md"])
+        assert responses[0].error == "file_not_found"
+        assert responses[0].content is None
+
+    def test_response_path_matches_input_for_correlation(
+        self, skills_root: Path
+    ) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+        inputs = ["/alpha/SKILL.md", "/missing.md", "/beta/SKILL.md"]
+        responses = backend.download_files(inputs)
+        assert [r.path for r in responses] == inputs
+
+
+class TestBuiltinSkillsBackendIntegration:
+    """Mirror the call sequence the SkillsMiddleware actually uses."""
+
+    def test_skills_middleware_call_pattern(self, skills_root: Path) -> None:
+        backend = BuiltinSkillsBackend(skills_root)
+
+        infos = asyncio.run(backend.als_info("/"))
+        skill_dirs = [i["path"] for i in infos if i.get("is_dir")]
+        assert sorted(skill_dirs) == ["/alpha", "/beta"]
+
+        skill_md_paths = [f"{p}/SKILL.md" for p in skill_dirs]
+        responses = asyncio.run(backend.adownload_files(skill_md_paths))
+        assert all(r.error is None for r in responses)
+        assert all(r.content is not None for r in responses)
+
+
+class TestBundledSkills:
+    def test_default_root_resolves_to_repo_skills_dir(self) -> None:
+        backend = BuiltinSkillsBackend()
+        assert backend.root.name == "builtin"
+        assert backend.root.parent.name == "skills"
+
+    def test_bundled_starter_skills_are_present(self) -> None:
+        backend = BuiltinSkillsBackend()
+        infos = backend.ls_info("/")
+        names = {info["path"].lstrip("/") for info in infos if info.get("is_dir")}
+        # Five starter skills required by the Tier 4 plan.
+        for required in (
+            "kb-research",
+            "report-writing",
+            "meeting-prep",
+            "slack-summary",
+            "email-drafting",
+        ):
+            assert required in names, f"missing starter skill: {required}"
+
+    def test_each_starter_skill_has_valid_skill_md(self) -> None:
+        backend = BuiltinSkillsBackend()
+        infos = backend.ls_info("/")
+        skill_dirs = [info["path"] for info in infos if info.get("is_dir")]
+        for skill_dir in skill_dirs:
+            md_path = f"{skill_dir}/SKILL.md"
+            response = backend.download_files([md_path])[0]
+            assert response.error is None, f"missing SKILL.md in {skill_dir}"
+            content = response.content.decode("utf-8").replace("\r\n", "\n")
+            assert content.startswith("---\n"), f"missing frontmatter in {skill_dir}"
+            assert "\nname:" in content
+            assert "\ndescription:" in content
+
+
+class _FakeKBBackend:
+    """Stand-in for :class:`KBPostgresBackend` with the two methods we need."""
+
+    def __init__(self, listing: list[dict], file_contents: dict[str, bytes]) -> None:
+        self._listing = listing
+        self._file_contents = file_contents
+        self.last_ls_path: str | None = None
+        self.last_download_paths: list[str] | None = None
+
+    async def als_info(self, path: str):
+        self.last_ls_path = path
+        return self._listing
+
+    async def adownload_files(self, paths):
+        from deepagents.backends.protocol import FileDownloadResponse
+
+        self.last_download_paths = list(paths)
+        out: list[FileDownloadResponse] = []
+        for p in paths:
+            content = self._file_contents.get(p)
+            if content is None:
+                out.append(FileDownloadResponse(path=p, error="file_not_found"))
+            else:
+                out.append(FileDownloadResponse(path=p, content=content))
+        return out
+
+
+class TestSearchSpaceSkillsBackend:
+    def test_remaps_paths_when_listing(self) -> None:
+        listing = [
+            {"path": "/documents/_skills/policy", "is_dir": True},
+            {"path": "/documents/_skills/policy/SKILL.md", "is_dir": False},
+            {"path": "/documents/other-folder/x.md", "is_dir": False},
+        ]
+        kb = _FakeKBBackend(listing=listing, file_contents={})
+        backend = SearchSpaceSkillsBackend(kb)
+        infos = asyncio.run(backend.als_info("/"))
+        assert kb.last_ls_path == "/documents/_skills"
+        paths = [info["path"] for info in infos]
+        assert "/policy" in paths
+        assert "/policy/SKILL.md" in paths
+        # Unrelated KB documents must NOT leak into the skills namespace.
+        assert all(not p.startswith("/documents") for p in paths)
+
+    def test_remaps_paths_when_downloading(self) -> None:
+        kb = _FakeKBBackend(
+            listing=[],
+            file_contents={
+                "/documents/_skills/policy/SKILL.md": b"---\nname: policy\n---\n",
+            },
+        )
+        backend = SearchSpaceSkillsBackend(kb)
+        responses = asyncio.run(backend.adownload_files(["/policy/SKILL.md"]))
+        assert kb.last_download_paths == ["/documents/_skills/policy/SKILL.md"]
+        assert responses[0].path == "/policy/SKILL.md"
+        assert responses[0].error is None
+        assert responses[0].content is not None
+
+    def test_sync_methods_raise_not_implemented(self) -> None:
+        backend = SearchSpaceSkillsBackend(_FakeKBBackend([], {}))
+        with pytest.raises(NotImplementedError):
+            backend.ls_info("/")
+        with pytest.raises(NotImplementedError):
+            backend.download_files(["/x"])
+
+    def test_custom_kb_root_is_honored(self) -> None:
+        kb = _FakeKBBackend(
+            listing=[
+                {"path": "/skills_admin/x", "is_dir": True},
+            ],
+            file_contents={},
+        )
+        backend = SearchSpaceSkillsBackend(kb, kb_root="/skills_admin")
+        infos = asyncio.run(backend.als_info("/"))
+        assert kb.last_ls_path == "/skills_admin"
+        assert infos[0]["path"] == "/x"
+
+
+class TestBackendFactory:
+    def test_builtin_only_factory_returns_composite(self) -> None:
+        factory = build_skills_backend_factory()
+        backend = factory(runtime=None)  # type: ignore[arg-type]
+        from deepagents.backends.composite import CompositeBackend
+
+        assert isinstance(backend, CompositeBackend)
+        assert SKILLS_BUILTIN_PREFIX in backend.routes
+        assert SKILLS_SPACE_PREFIX not in backend.routes
+
+    def test_default_skills_sources_lists_builtin_then_space(self) -> None:
+        sources = default_skills_sources()
+        assert sources == [SKILLS_BUILTIN_PREFIX, SKILLS_SPACE_PREFIX]
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_specialized_subagents.py b/surfsense_backend/tests/unit/agents/new_chat/test_specialized_subagents.py
new file mode 100644
index 0000000000..3035cc8e08
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_specialized_subagents.py
@@ -0,0 +1,345 @@
+"""Tests for the specialized subagents (explore / report_writer / connector_negotiator)."""
+
+from __future__ import annotations
+
+from langchain_core.tools import tool
+
+from app.agents.new_chat.middleware.permission import PermissionMiddleware
+from app.agents.new_chat.subagents import (
+    build_connector_negotiator_subagent,
+    build_explore_subagent,
+    build_report_writer_subagent,
+    build_specialized_subagents,
+)
+from app.agents.new_chat.subagents.config import (
+    EXPLORE_READ_TOOLS,
+    REPORT_WRITER_TOOLS,
+    WRITE_TOOL_DENY_PATTERNS,
+)
+
+# ---------------------------------------------------------------------------
+# Fake tools used to verify filtering & permission behavior
+# ---------------------------------------------------------------------------
+
+
+@tool
+def search_surfsense_docs(query: str) -> str:
+    """Search the user's KB."""
+    return ""
+
+
+@tool
+def web_search(query: str) -> str:
+    """Search the public web."""
+    return ""
+
+
+@tool
+def scrape_webpage(url: str) -> str:
+    """Scrape a single webpage."""
+    return ""
+
+
+@tool
+def read_file(path: str) -> str:
+    """Read a file."""
+    return ""
+
+
+@tool
+def ls_tree(path: str) -> str:
+    """List a tree."""
+    return ""
+
+
+@tool
+def grep(pattern: str) -> str:
+    """Grep."""
+    return ""
+
+
+@tool
+def update_memory(content: str) -> str:
+    """Update the user's memory."""
+    return ""
+
+
+@tool
+def edit_file(path: str, old: str, new: str) -> str:
+    """Edit a file."""
+    return ""
+
+
+@tool
+def linear_create_issue(title: str) -> str:
+    """Create a Linear issue."""
+    return ""
+
+
+@tool
+def slack_send_message(channel: str, text: str) -> str:
+    """Send a Slack message."""
+    return ""
+
+
+@tool
+def get_connected_accounts() -> str:
+    """List connected accounts."""
+    return ""
+
+
+@tool
+def generate_report(topic: str) -> str:
+    """Generate a report artifact."""
+    return ""
+
+
+ALL_TOOLS = [
+    search_surfsense_docs,
+    web_search,
+    scrape_webpage,
+    read_file,
+    ls_tree,
+    grep,
+    update_memory,
+    edit_file,
+    linear_create_issue,
+    slack_send_message,
+    get_connected_accounts,
+    generate_report,
+]
+
+
+class TestExploreSubagent:
+    def test_only_read_tools_are_exposed(self) -> None:
+        spec = build_explore_subagent(tools=ALL_TOOLS)
+        names = {t.name for t in spec["tools"]}  # type: ignore[index]
+        assert names == EXPLORE_READ_TOOLS & {t.name for t in ALL_TOOLS}
+        assert "update_memory" not in names
+        assert "linear_create_issue" not in names
+        assert "edit_file" not in names
+
+    def test_includes_permission_middleware_with_deny_rules(self) -> None:
+        spec = build_explore_subagent(tools=ALL_TOOLS)
+        permission_mws = [
+            m
+            for m in spec["middleware"]
+            if isinstance(m, PermissionMiddleware)  # type: ignore[index]
+        ]
+        assert len(permission_mws) == 1
+        ruleset = permission_mws[0]._static_rulesets[0]
+        assert ruleset.origin == "subagent_explore"
+        deny_patterns = {r.permission for r in ruleset.rules if r.action == "deny"}
+        assert "update_memory" in deny_patterns
+        assert "edit_file" in deny_patterns
+        assert "*create*" in deny_patterns
+        assert "*send*" in deny_patterns
+
+    def test_skills_inherits_default_sources(self) -> None:
+        spec = build_explore_subagent(tools=ALL_TOOLS)
+        assert spec["skills"] == ["/skills/builtin/", "/skills/space/"]  # type: ignore[index]
+
+    def test_name_and_description_match_contract(self) -> None:
+        spec = build_explore_subagent(tools=ALL_TOOLS)
+        assert spec["name"] == "explore"
+        assert "read-only" in spec["description"].lower()
+
+    def test_includes_dedup_and_patch_middleware(self) -> None:
+        from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+
+        from app.agents.new_chat.middleware import DedupHITLToolCallsMiddleware
+
+        spec = build_explore_subagent(tools=ALL_TOOLS)
+        types = {type(m) for m in spec["middleware"]}  # type: ignore[index]
+        assert PatchToolCallsMiddleware in types
+        assert DedupHITLToolCallsMiddleware in types
+
+
+class TestReportWriterSubagent:
+    def test_exposes_only_report_writing_tools(self) -> None:
+        spec = build_report_writer_subagent(tools=ALL_TOOLS)
+        names = {t.name for t in spec["tools"]}  # type: ignore[index]
+        assert names == REPORT_WRITER_TOOLS & {t.name for t in ALL_TOOLS}
+        assert "generate_report" in names
+        assert "search_surfsense_docs" in names
+
+    def test_deny_rules_block_writes_but_allow_generate_report(self) -> None:
+        spec = build_report_writer_subagent(tools=ALL_TOOLS)
+        permission_mws = [
+            m
+            for m in spec["middleware"]
+            if isinstance(m, PermissionMiddleware)  # type: ignore[index]
+        ]
+        ruleset = permission_mws[0]._static_rulesets[0]
+        deny_patterns = {r.permission for r in ruleset.rules if r.action == "deny"}
+        assert "update_memory" in deny_patterns
+        # generate_report MUST not be denied — it's the whole point of the subagent.
+        assert "generate_report" not in deny_patterns
+        # No deny pattern should match `generate_report` either.
+        assert all(
+            not _wildcard_matches(pattern, "generate_report")
+            for pattern in deny_patterns
+        )
+
+
+class TestConnectorNegotiatorSubagent:
+    def test_inherits_all_parent_tools(self) -> None:
+        spec = build_connector_negotiator_subagent(tools=ALL_TOOLS)
+        names = {t.name for t in spec["tools"]}  # type: ignore[index]
+        # Every parent tool is inherited; the deny ruleset enforces behavior
+        # at execution time instead of trimming the tool list.
+        assert names == {t.name for t in ALL_TOOLS}
+
+    def test_get_connected_accounts_is_present(self) -> None:
+        spec = build_connector_negotiator_subagent(tools=ALL_TOOLS)
+        names = {t.name for t in spec["tools"]}  # type: ignore[index]
+        assert "get_connected_accounts" in names
+
+    def test_deny_ruleset_blocks_mutating_connector_tools(self) -> None:
+        spec = build_connector_negotiator_subagent(tools=ALL_TOOLS)
+        permission_mws = [
+            m
+            for m in spec["middleware"]
+            if isinstance(m, PermissionMiddleware)  # type: ignore[index]
+        ]
+        ruleset = permission_mws[0]._static_rulesets[0]
+        deny_patterns = {r.permission for r in ruleset.rules if r.action == "deny"}
+        # `linear_create_issue` matches the `*_create` deny pattern.
+        assert any(_wildcard_matches(p, "linear_create_issue") for p in deny_patterns)
+        assert any(_wildcard_matches(p, "slack_send_message") for p in deny_patterns)
+
+
+class TestBuildSpecializedSubagents:
+    def test_returns_five_specs(self) -> None:
+        specs = build_specialized_subagents(tools=ALL_TOOLS)
+        names = [s["name"] for s in specs]  # type: ignore[index]
+        assert names == [
+            "explore",
+            "report_writer",
+            "linear_specialist",
+            "slack_specialist",
+            "connector_negotiator",
+        ]
+
+    def test_all_specs_have_unique_names(self) -> None:
+        specs = build_specialized_subagents(tools=ALL_TOOLS)
+        names = [s["name"] for s in specs]  # type: ignore[index]
+        assert len(set(names)) == len(names)
+
+    def test_extra_middleware_is_prepended_to_each_spec(self) -> None:
+        """Sentinel middleware passed via ``extra_middleware`` must appear
+        in each subagent's ``middleware`` list, before the local rules.
+
+        This guards against the regression where specialized subagents
+        promised filesystem tools (``read_file``, ``ls``, ``grep``) in
+        their system prompts but had no filesystem middleware mounted.
+        """
+
+        class _Sentinel:
+            pass
+
+        sentinel = _Sentinel()
+        specs = build_specialized_subagents(
+            tools=ALL_TOOLS, extra_middleware=[sentinel]
+        )
+        for spec in specs:
+            mws = spec["middleware"]  # type: ignore[index]
+            assert sentinel in mws
+            # The sentinel must appear *before* the permission middleware
+            # (subagent-local rules), preserving the documented composition
+            # order: extra → custom → patch → dedup.
+            sentinel_idx = mws.index(sentinel)
+            perm_idx = next(
+                (i for i, m in enumerate(mws) if isinstance(m, PermissionMiddleware)),
+                None,
+            )
+            assert perm_idx is not None
+            assert sentinel_idx < perm_idx
+
+
+class TestFilterToolsWarningSuppression:
+    """Names provided by middleware (read_file, ls, grep, …) must not
+    trigger the spurious "missing" warning in :func:`_filter_tools`."""
+
+    def test_middleware_provided_names_are_silent(self, caplog) -> None:
+        import logging
+
+        from app.agents.new_chat.subagents.config import _filter_tools
+
+        with caplog.at_level(
+            logging.INFO, logger="app.agents.new_chat.subagents.config"
+        ):
+            # Allowed set asks for two registry tools (one present, one
+            # not) plus a bunch of middleware-provided names.
+            _filter_tools(
+                [search_surfsense_docs],
+                allowed_names={
+                    "search_surfsense_docs",
+                    "scrape_webpage",  # legitimately missing → should warn
+                    "read_file",  # mw-provided → suppressed
+                    "ls",
+                    "grep",
+                    "glob",
+                    "write_todos",
+                },
+            )
+
+        warnings = [r.message for r in caplog.records if r.levelno >= logging.INFO]
+        # Exactly one warning, and it should mention scrape_webpage but not
+        # any middleware-provided name. Inspect the rendered "missing"
+        # list (between the brackets) so we don't false-match substrings
+        # like ``ls`` inside ``available``.
+        assert len(warnings) == 1, warnings
+        msg = warnings[0]
+        assert "scrape_webpage" in msg
+        bracket_section = msg.split("missing: ", 1)[1]
+        for noisy in ("read_file", "ls", "grep", "glob", "write_todos"):
+            assert f"'{noisy}'" not in bracket_section, msg
+
+
+class TestDenyPatternsCoverage:
+    def test_deny_patterns_cover_canonical_write_tools(self) -> None:
+        canonical_writes = [
+            "update_memory",
+            "edit_file",
+            "write_file",
+            "move_file",
+            "mkdir",
+            "linear_create_issue",
+            "linear_update_issue",
+            "linear_delete_issue",
+            "slack_send_message",
+            "create_index",
+            "update_account",
+            "delete_record",
+            "send_email",
+        ]
+        for tool_name in canonical_writes:
+            assert any(
+                _wildcard_matches(pattern, tool_name)
+                for pattern in WRITE_TOOL_DENY_PATTERNS
+            ), f"no deny pattern matches {tool_name!r}"
+
+    def test_deny_patterns_do_not_match_safe_read_tools(self) -> None:
+        canonical_reads = [
+            "search_surfsense_docs",
+            "read_file",
+            "ls_tree",
+            "grep",
+            "web_search",
+            "scrape_webpage",
+            "get_connected_accounts",
+            "generate_report",
+        ]
+        for tool_name in canonical_reads:
+            assert not any(
+                _wildcard_matches(pattern, tool_name)
+                for pattern in WRITE_TOOL_DENY_PATTERNS
+            ), f"deny pattern incorrectly matches read tool {tool_name!r}"
+
+
+def _wildcard_matches(pattern: str, value: str) -> bool:
+    """Helper using the same matcher the rule evaluator does."""
+    from app.agents.new_chat.permissions import wildcard_match
+
+    return wildcard_match(value, pattern)
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_state_reducers.py b/surfsense_backend/tests/unit/agents/new_chat/test_state_reducers.py
new file mode 100644
index 0000000000..1857539902
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_state_reducers.py
@@ -0,0 +1,151 @@
+"""Tests for SurfSense filesystem state reducers."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.new_chat.state_reducers import (
+    _CLEAR,
+    _add_unique_reducer,
+    _dict_merge_with_tombstones_reducer,
+    _initial_filesystem_state,
+    _list_append_reducer,
+    _replace_reducer,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class TestReplaceReducer:
+    def test_right_wins_outright(self):
+        assert _replace_reducer("a", "b") == "b"
+
+    def test_none_right_returns_none(self):
+        assert _replace_reducer("a", None) is None
+
+    def test_none_left_returns_right(self):
+        assert _replace_reducer(None, "b") == "b"
+
+
+class TestAddUniqueReducer:
+    def test_appends_unique_items(self):
+        assert _add_unique_reducer(["a"], ["b", "c"]) == ["a", "b", "c"]
+
+    def test_dedupes_against_left(self):
+        assert _add_unique_reducer(["a", "b"], ["b", "c"]) == ["a", "b", "c"]
+
+    def test_dedupes_within_right(self):
+        assert _add_unique_reducer([], ["a", "a", "b"]) == ["a", "b"]
+
+    def test_clear_anywhere_resets_and_reseeds_with_after_items(self):
+        # _CLEAR semantics: only items AFTER the LAST _CLEAR are kept.
+        result = _add_unique_reducer(["x", "y"], ["a", _CLEAR, "b", "c"])
+        assert result == ["b", "c"]
+
+    def test_multiple_clears_use_last(self):
+        result = _add_unique_reducer(["x"], [_CLEAR, "a", _CLEAR, "b"])
+        assert result == ["b"]
+
+    def test_clear_only_resets_to_empty(self):
+        assert _add_unique_reducer(["x", "y"], [_CLEAR]) == []
+
+    def test_empty_right_keeps_left(self):
+        assert _add_unique_reducer(["a"], []) == ["a"]
+        assert _add_unique_reducer(["a"], None) == ["a"]
+
+
+class TestListAppendReducer:
+    def test_preserves_order_and_duplicates(self):
+        result = _list_append_reducer([{"a": 1}], [{"b": 2}, {"a": 1}])
+        assert result == [{"a": 1}, {"b": 2}, {"a": 1}]
+
+    def test_clear_resets_keeping_after_items(self):
+        result = _list_append_reducer([{"a": 1}], [{"old": 1}, _CLEAR, {"new": 2}])
+        assert result == [{"new": 2}]
+
+
+class TestDictMergeWithTombstones:
+    def test_merges_keys(self):
+        assert _dict_merge_with_tombstones_reducer({"a": 1}, {"b": 2}) == {
+            "a": 1,
+            "b": 2,
+        }
+
+    def test_none_value_deletes_key(self):
+        result = _dict_merge_with_tombstones_reducer({"a": 1, "b": 2}, {"a": None})
+        assert result == {"b": 2}
+
+    def test_clear_resets_then_merges(self):
+        result = _dict_merge_with_tombstones_reducer(
+            {"a": 1, "b": 2}, {_CLEAR: True, "c": 3}
+        )
+        assert result == {"c": 3}
+
+    def test_clear_keeps_only_post_clear_non_none(self):
+        result = _dict_merge_with_tombstones_reducer(
+            {"a": 1}, {_CLEAR: True, "b": 2, "c": None}
+        )
+        assert result == {"b": 2}
+
+    def test_none_left_handled(self):
+        assert _dict_merge_with_tombstones_reducer(None, {"a": 1, "b": None}) == {
+            "a": 1
+        }
+
+
+class TestInitialFilesystemState:
+    def test_default_shape(self):
+        state = _initial_filesystem_state()
+        assert state["cwd"] == "/documents"
+        assert state["staged_dirs"] == []
+        assert state["staged_dir_tool_calls"] == {}
+        assert state["pending_moves"] == []
+        assert state["pending_deletes"] == []
+        assert state["pending_dir_deletes"] == []
+        assert state["doc_id_by_path"] == {}
+        assert state["dirty_paths"] == []
+        assert state["dirty_path_tool_calls"] == {}
+        assert state["kb_priority"] == []
+        assert state["kb_matched_chunk_ids"] == {}
+        assert state["kb_anon_doc"] is None
+        assert state["tree_version"] == 0
+
+
+class TestMultiEditSamePathCoalescing:
+    """Multi-edit-same-path turns must coalesce into ONE binding record.
+
+    The persistence body uses ``dirty_path_tool_calls[path]`` to find the
+    tool_call_id that produced the current state on disk. Because
+    ``dirty_paths`` dedupes via :func:`_add_unique_reducer` the second
+    edit doesn't append a new path entry — and because
+    ``_dict_merge_with_tombstones_reducer`` lets the right-hand side
+    overwrite, the LATEST tool_call_id wins. That's the correct behavior
+    for snapshotting: revert restores to the pre-mutation state, and
+    multiple back-to-back edits in one turn coalesce into a single
+    revisible op (the user sees ONE Revert button per turn-per-path,
+    not N).
+    """
+
+    def test_dirty_paths_dedupes_repeated_writes(self):
+        # ``_add_unique_reducer`` is applied to ``dirty_paths``. Two writes
+        # to the same path produce one entry, not two.
+        first = _add_unique_reducer([], ["/documents/a.md"])
+        second = _add_unique_reducer(first, ["/documents/a.md"])
+        assert second == ["/documents/a.md"]
+
+    def test_dirty_path_tool_calls_keeps_latest_tool_call_id(self):
+        # First write tags the path with tcid-1.
+        merged = _dict_merge_with_tombstones_reducer({}, {"/documents/a.md": "tcid-1"})
+        # Second write to the same path tags it with tcid-2 (latest wins).
+        merged = _dict_merge_with_tombstones_reducer(
+            merged, {"/documents/a.md": "tcid-2"}
+        )
+        assert merged == {"/documents/a.md": "tcid-2"}
+
+    def test_rm_tombstones_dirty_path_tool_call(self):
+        # ``rm`` writes ``{path: None}`` into dirty_path_tool_calls to
+        # prevent a stale binding from leaking past the delete.
+        merged = _dict_merge_with_tombstones_reducer(
+            {"/documents/a.md": "tcid-1"}, {"/documents/a.md": None}
+        )
+        assert merged == {}
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_tool_call_repair.py b/surfsense_backend/tests/unit/agents/new_chat/test_tool_call_repair.py
new file mode 100644
index 0000000000..e02a047744
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_tool_call_repair.py
@@ -0,0 +1,121 @@
+"""Tests for ToolCallNameRepairMiddleware."""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import AIMessage
+
+from app.agents.new_chat.middleware.tool_call_repair import (
+    ToolCallNameRepairMiddleware,
+)
+from app.agents.new_chat.tools.invalid_tool import INVALID_TOOL_NAME
+
+pytestmark = pytest.mark.unit
+
+
+def _make_state(message: AIMessage) -> dict:
+    return {"messages": [message]}
+
+
+class _FakeRuntime:
+    def __init__(self, context: object | None = None) -> None:
+        self.context = context
+
+
+class TestRepair:
+    def test_passthrough_when_name_matches(self) -> None:
+        mw = ToolCallNameRepairMiddleware(
+            registered_tool_names={"echo"}, fuzzy_match_threshold=None
+        )
+        msg = AIMessage(
+            content="",
+            tool_calls=[
+                {"name": "echo", "args": {}, "id": "1"},
+            ],
+        )
+        out = mw.after_model(_make_state(msg), _FakeRuntime())
+        assert out is None  # no change
+
+    def test_lowercase_repair(self) -> None:
+        mw = ToolCallNameRepairMiddleware(
+            registered_tool_names={"echo"}, fuzzy_match_threshold=None
+        )
+        msg = AIMessage(
+            content="",
+            tool_calls=[
+                {"name": "Echo", "args": {"x": 1}, "id": "1"},
+            ],
+        )
+        out = mw.after_model(_make_state(msg), _FakeRuntime())
+        assert out is not None
+        repaired = out["messages"][0]
+        assert repaired.tool_calls[0]["name"] == "echo"
+
+    def test_invalid_fallback_when_no_match(self) -> None:
+        mw = ToolCallNameRepairMiddleware(
+            registered_tool_names={"echo", INVALID_TOOL_NAME},
+            fuzzy_match_threshold=None,
+        )
+        msg = AIMessage(
+            content="",
+            tool_calls=[
+                {"name": "totally_different_name", "args": {"k": "v"}, "id": "1"},
+            ],
+        )
+        out = mw.after_model(_make_state(msg), _FakeRuntime())
+        assert out is not None
+        repaired_call = out["messages"][0].tool_calls[0]
+        assert repaired_call["name"] == INVALID_TOOL_NAME
+        assert repaired_call["args"]["tool"] == "totally_different_name"
+        assert "totally_different_name" in repaired_call["args"]["error"]
+
+    def test_no_invalid_means_skip_when_unknown(self) -> None:
+        mw = ToolCallNameRepairMiddleware(
+            registered_tool_names={"echo"}, fuzzy_match_threshold=None
+        )
+        msg = AIMessage(
+            content="",
+            tool_calls=[
+                {"name": "unknown", "args": {}, "id": "1"},
+            ],
+        )
+        out = mw.after_model(_make_state(msg), _FakeRuntime())
+        # No repair available; original returned unchanged (no update)
+        assert out is None
+
+    def test_fuzzy_match_works_when_enabled(self) -> None:
+        mw = ToolCallNameRepairMiddleware(
+            registered_tool_names={"search_documents"},
+            fuzzy_match_threshold=0.7,
+        )
+        msg = AIMessage(
+            content="",
+            tool_calls=[
+                {"name": "search_docments", "args": {}, "id": "1"},
+            ],
+        )
+        out = mw.after_model(_make_state(msg), _FakeRuntime())
+        assert out is not None
+        assert out["messages"][0].tool_calls[0]["name"] == "search_documents"
+
+    def test_skips_when_no_messages(self) -> None:
+        mw = ToolCallNameRepairMiddleware(registered_tool_names={"echo"})
+        out = mw.after_model({"messages": []}, _FakeRuntime())
+        assert out is None
+
+    def test_runtime_context_extends_registered(self) -> None:
+        from types import SimpleNamespace
+
+        mw = ToolCallNameRepairMiddleware(
+            registered_tool_names={"echo"}, fuzzy_match_threshold=None
+        )
+        msg = AIMessage(
+            content="",
+            tool_calls=[
+                {"name": "DynamicTool", "args": {}, "id": "1"},
+            ],
+        )
+        runtime = _FakeRuntime(SimpleNamespace(registered_tool_names=["dynamictool"]))
+        out = mw.after_model(_make_state(msg), runtime)
+        assert out is not None
+        assert out["messages"][0].tool_calls[0]["name"] == "dynamictool"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/tools/test_resume_page_limits.py b/surfsense_backend/tests/unit/agents/new_chat/tools/test_resume_page_limits.py
new file mode 100644
index 0000000000..4f93ad7322
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/tools/test_resume_page_limits.py
@@ -0,0 +1,213 @@
+"""Unit tests for resume page-limit helpers and enforcement flow."""
+
+import io
+from types import SimpleNamespace
+from unittest.mock import AsyncMock
+
+import pypdf
+import pytest
+
+from app.agents.new_chat.tools import resume as resume_tool
+
+pytestmark = pytest.mark.unit
+
+
+class _FakeReport:
+    _next_id = 1000
+
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+        self.id = None
+
+
+class _FakeSession:
+    def __init__(self, parent_report=None):
+        self.parent_report = parent_report
+        self.added: list[_FakeReport] = []
+
+    async def get(self, _model, _id):
+        return self.parent_report
+
+    def add(self, report):
+        self.added.append(report)
+
+    async def commit(self):
+        for report in self.added:
+            if getattr(report, "id", None) is None:
+                report.id = _FakeReport._next_id
+                _FakeReport._next_id += 1
+
+    async def refresh(self, _report):
+        return None
+
+
+class _SessionContext:
+    def __init__(self, session):
+        self.session = session
+
+    async def __aenter__(self):
+        return self.session
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return False
+
+
+class _SessionFactory:
+    def __init__(self, sessions):
+        self._sessions = list(sessions)
+
+    def __call__(self):
+        if not self._sessions:
+            raise RuntimeError("No fake sessions left")
+        return _SessionContext(self._sessions.pop(0))
+
+
+def _make_pdf_with_pages(page_count: int) -> bytes:
+    writer = pypdf.PdfWriter()
+    for _ in range(page_count):
+        writer.add_blank_page(width=612, height=792)
+    output = io.BytesIO()
+    writer.write(output)
+    return output.getvalue()
+
+
+def test_count_pdf_pages_reads_compiled_bytes() -> None:
+    pdf_bytes = _make_pdf_with_pages(2)
+    assert resume_tool._count_pdf_pages(pdf_bytes) == 2
+
+
+def test_validate_max_pages_rejects_out_of_range() -> None:
+    with pytest.raises(ValueError):
+        resume_tool._validate_max_pages(0)
+    with pytest.raises(ValueError):
+        resume_tool._validate_max_pages(6)
+
+
+@pytest.mark.asyncio
+async def test_generate_resume_defaults_to_one_page_target(monkeypatch) -> None:
+    read_session = _FakeSession()
+    write_session = _FakeSession()
+    session_factory = _SessionFactory([read_session, write_session])
+    monkeypatch.setattr(resume_tool, "shielded_async_session", session_factory)
+    monkeypatch.setattr(resume_tool, "Report", _FakeReport)
+
+    prompts: list[str] = []
+
+    async def _llm_invoke(messages):
+        prompts.append(messages[0].content)
+        return SimpleNamespace(content="= Jane Doe\n== Experience\n- Built systems")
+
+    llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=_llm_invoke))
+    monkeypatch.setattr(
+        resume_tool,
+        "get_document_summary_llm",
+        AsyncMock(return_value=llm),
+    )
+    monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
+    monkeypatch.setattr(resume_tool, "_count_pdf_pages", lambda _pdf: 1)
+
+    tool = resume_tool.create_generate_resume_tool(search_space_id=1, thread_id=1)
+    result = await tool.ainvoke({"user_info": "Jane Doe experience"})
+
+    assert result["status"] == "ready"
+    assert prompts
+    assert "**Target Maximum Pages:** 1" in prompts[0]
+
+
+@pytest.mark.asyncio
+async def test_generate_resume_compresses_when_over_limit(monkeypatch) -> None:
+    read_session = _FakeSession()
+    write_session = _FakeSession()
+    session_factory = _SessionFactory([read_session, write_session])
+    monkeypatch.setattr(resume_tool, "shielded_async_session", session_factory)
+    monkeypatch.setattr(resume_tool, "Report", _FakeReport)
+
+    responses = [
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Detailed bullet 1"),
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Condensed bullet"),
+    ]
+    llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
+    monkeypatch.setattr(
+        resume_tool,
+        "get_document_summary_llm",
+        AsyncMock(return_value=llm),
+    )
+    monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
+    page_counts = iter([2, 1])
+    monkeypatch.setattr(resume_tool, "_count_pdf_pages", lambda _pdf: next(page_counts))
+
+    tool = resume_tool.create_generate_resume_tool(search_space_id=1, thread_id=1)
+    result = await tool.ainvoke({"user_info": "Jane Doe experience", "max_pages": 1})
+
+    assert result["status"] == "ready"
+    assert write_session.added, "Expected successful report write"
+    metadata = write_session.added[0].report_metadata
+    assert metadata["target_max_pages"] == 1
+    assert metadata["actual_page_count"] == 1
+    assert metadata["compression_attempts"] == 1
+    assert metadata["page_limit_enforced"] is True
+
+
+@pytest.mark.asyncio
+async def test_generate_resume_returns_ready_when_target_not_met(monkeypatch) -> None:
+    read_session = _FakeSession()
+    write_session = _FakeSession()
+    session_factory = _SessionFactory([read_session, write_session])
+    monkeypatch.setattr(resume_tool, "shielded_async_session", session_factory)
+    monkeypatch.setattr(resume_tool, "Report", _FakeReport)
+
+    responses = [
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Long detail"),
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Still long"),
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Still too long"),
+    ]
+    llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
+    monkeypatch.setattr(
+        resume_tool,
+        "get_document_summary_llm",
+        AsyncMock(return_value=llm),
+    )
+    monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
+    page_counts = iter([3, 3, 2])
+    monkeypatch.setattr(resume_tool, "_count_pdf_pages", lambda _pdf: next(page_counts))
+
+    tool = resume_tool.create_generate_resume_tool(search_space_id=1, thread_id=1)
+    result = await tool.ainvoke({"user_info": "Jane Doe experience", "max_pages": 1})
+
+    assert result["status"] == "ready"
+    assert "could not fit the target" in (result["message"] or "").lower()
+    metadata = write_session.added[0].report_metadata
+    assert metadata["target_page_met"] is False
+    assert metadata["actual_page_count"] == 2
+
+
+@pytest.mark.asyncio
+async def test_generate_resume_fails_when_hard_limit_exceeded(monkeypatch) -> None:
+    read_session = _FakeSession()
+    failed_session = _FakeSession()
+    session_factory = _SessionFactory([read_session, failed_session])
+    monkeypatch.setattr(resume_tool, "shielded_async_session", session_factory)
+    monkeypatch.setattr(resume_tool, "Report", _FakeReport)
+
+    responses = [
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Long detail"),
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Still long"),
+        SimpleNamespace(content="= Jane Doe\n== Experience\n- Still too long"),
+    ]
+    llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
+    monkeypatch.setattr(
+        resume_tool,
+        "get_document_summary_llm",
+        AsyncMock(return_value=llm),
+    )
+    monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
+    page_counts = iter([7, 6, 6])
+    monkeypatch.setattr(resume_tool, "_count_pdf_pages", lambda _pdf: next(page_counts))
+
+    tool = resume_tool.create_generate_resume_tool(search_space_id=1, thread_id=1)
+    result = await tool.ainvoke({"user_info": "Jane Doe experience", "max_pages": 1})
+
+    assert result["status"] == "failed"
+    assert "hard page limit" in (result["error"] or "").lower()
+    assert failed_session.added, "Expected failed report persistence"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/tools/test_update_memory_scope.py b/surfsense_backend/tests/unit/agents/new_chat/tools/test_update_memory_scope.py
new file mode 100644
index 0000000000..f7fbacf500
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/tools/test_update_memory_scope.py
@@ -0,0 +1,198 @@
+"""Unit tests for memory scope validation and bullet format validation."""
+
+import pytest
+
+from app.agents.new_chat.tools.update_memory import (
+    _save_memory,
+    _validate_bullet_format,
+    _validate_memory_scope,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class _Recorder:
+    def __init__(self) -> None:
+        self.applied_content: str | None = None
+        self.commit_calls = 0
+        self.rollback_calls = 0
+
+    def apply(self, content: str) -> None:
+        self.applied_content = content
+
+    async def commit(self) -> None:
+        self.commit_calls += 1
+
+    async def rollback(self) -> None:
+        self.rollback_calls += 1
+
+
+# ---------------------------------------------------------------------------
+# _validate_memory_scope — marker-based
+# ---------------------------------------------------------------------------
+
+
+def test_validate_memory_scope_rejects_pref_marker_in_team_scope() -> None:
+    content = "- (2026-04-10) [pref] Prefers dark mode\n"
+    result = _validate_memory_scope(content, "team")
+    assert result is not None
+    assert result["status"] == "error"
+    assert "[pref]" in result["message"]
+
+
+def test_validate_memory_scope_rejects_instr_marker_in_team_scope() -> None:
+    content = "- (2026-04-10) [instr] Always respond in Spanish\n"
+    result = _validate_memory_scope(content, "team")
+    assert result is not None
+    assert result["status"] == "error"
+    assert "[instr]" in result["message"]
+
+
+def test_validate_memory_scope_rejects_both_personal_markers_in_team() -> None:
+    content = (
+        "- (2026-04-10) [pref] Prefers dark mode\n"
+        "- (2026-04-10) [instr] Always respond in Spanish\n"
+    )
+    result = _validate_memory_scope(content, "team")
+    assert result is not None
+    assert result["status"] == "error"
+    assert "[instr]" in result["message"]
+    assert "[pref]" in result["message"]
+
+
+def test_validate_memory_scope_allows_fact_in_team_scope() -> None:
+    content = "- (2026-04-10) [fact] Office is in downtown Seattle\n"
+    result = _validate_memory_scope(content, "team")
+    assert result is None
+
+
+def test_validate_memory_scope_allows_all_markers_in_user_scope() -> None:
+    content = (
+        "- (2026-04-10) [fact] Python developer\n"
+        "- (2026-04-10) [pref] Prefers concise answers\n"
+        "- (2026-04-10) [instr] Always use bullet points\n"
+    )
+    result = _validate_memory_scope(content, "user")
+    assert result is None
+
+
+def test_validate_memory_scope_allows_any_heading_in_team() -> None:
+    content = "## Architecture\n- (2026-04-10) [fact] Uses PostgreSQL for persistence\n"
+    result = _validate_memory_scope(content, "team")
+    assert result is None
+
+
+def test_validate_memory_scope_allows_any_heading_in_user() -> None:
+    content = "## My Projects\n- (2026-04-10) [fact] Working on SurfSense\n"
+    result = _validate_memory_scope(content, "user")
+    assert result is None
+
+
+# ---------------------------------------------------------------------------
+# _validate_bullet_format
+# ---------------------------------------------------------------------------
+
+
+def test_validate_bullet_format_passes_valid_bullets() -> None:
+    content = (
+        "## Work\n"
+        "- (2026-04-10) [fact] Senior Python developer\n"
+        "- (2026-04-10) [pref] Prefers dark mode\n"
+        "- (2026-04-10) [instr] Always respond in bullet points\n"
+    )
+    warnings = _validate_bullet_format(content)
+    assert warnings == []
+
+
+def test_validate_bullet_format_warns_on_missing_marker() -> None:
+    content = "- (2026-04-10) Senior Python developer\n"
+    warnings = _validate_bullet_format(content)
+    assert len(warnings) == 1
+    assert "Malformed bullet" in warnings[0]
+
+
+def test_validate_bullet_format_warns_on_missing_date() -> None:
+    content = "- [fact] Senior Python developer\n"
+    warnings = _validate_bullet_format(content)
+    assert len(warnings) == 1
+    assert "Malformed bullet" in warnings[0]
+
+
+def test_validate_bullet_format_warns_on_unknown_marker() -> None:
+    content = "- (2026-04-10) [context] Working on project X\n"
+    warnings = _validate_bullet_format(content)
+    assert len(warnings) == 1
+    assert "Malformed bullet" in warnings[0]
+
+
+def test_validate_bullet_format_ignores_non_bullet_lines() -> None:
+    content = "## Some Heading\nSome paragraph text\n"
+    warnings = _validate_bullet_format(content)
+    assert warnings == []
+
+
+def test_validate_bullet_format_warns_on_old_format_without_marker() -> None:
+    content = "## About the user\n- (2026-04-10) Likes cats\n"
+    warnings = _validate_bullet_format(content)
+    assert len(warnings) == 1
+
+
+# ---------------------------------------------------------------------------
+# _save_memory — end-to-end with marker scope check
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_save_memory_blocks_pref_in_team_before_commit() -> None:
+    recorder = _Recorder()
+    result = await _save_memory(
+        updated_memory="- (2026-04-10) [pref] Prefers dark mode\n",
+        old_memory=None,
+        llm=None,
+        apply_fn=recorder.apply,
+        commit_fn=recorder.commit,
+        rollback_fn=recorder.rollback,
+        label="team memory",
+        scope="team",
+    )
+    assert result["status"] == "error"
+    assert recorder.commit_calls == 0
+    assert recorder.applied_content is None
+
+
+@pytest.mark.asyncio
+async def test_save_memory_allows_fact_in_team_and_commits() -> None:
+    recorder = _Recorder()
+    content = "- (2026-04-10) [fact] Weekly standup on Mondays\n"
+    result = await _save_memory(
+        updated_memory=content,
+        old_memory=None,
+        llm=None,
+        apply_fn=recorder.apply,
+        commit_fn=recorder.commit,
+        rollback_fn=recorder.rollback,
+        label="team memory",
+        scope="team",
+    )
+    assert result["status"] == "saved"
+    assert recorder.commit_calls == 1
+    assert recorder.applied_content == content
+
+
+@pytest.mark.asyncio
+async def test_save_memory_includes_format_warnings() -> None:
+    recorder = _Recorder()
+    content = "- (2026-04-10) Missing marker text\n"
+    result = await _save_memory(
+        updated_memory=content,
+        old_memory=None,
+        llm=None,
+        apply_fn=recorder.apply,
+        commit_fn=recorder.commit,
+        rollback_fn=recorder.rollback,
+        label="memory",
+        scope="user",
+    )
+    assert result["status"] == "saved"
+    assert "format_warnings" in result
+    assert len(result["format_warnings"]) == 1
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index f72135d05d..9ba87207a3 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -168,7 +168,7 @@ async def test_concurrency_bounded_by_semaphore(
     active = 0
     peak = 0
 
-    async def _slow_extract(client, file):
+    async def _slow_extract(client, file, **kwargs):
         nonlocal active, peak
         async with lock:
             active += 1
@@ -209,7 +209,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
 
     monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
 
-    async def _slow_extract(client, file):
+    async def _slow_extract(client, file, **kwargs):
         await asyncio.sleep(0.05)
         return _mock_extract_ok(file["id"], file["name"])
 
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
index 0ae0963616..7e968514c5 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
@@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
     active = 0
     peak = 0
 
-    async def _slow_extract(client, file):
+    async def _slow_extract(client, file, **kwargs):
         nonlocal active, peak
         async with lock:
             active += 1
@@ -204,7 +204,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
 
     monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
 
-    async def _slow_extract(client, file):
+    async def _slow_extract(client, file, **kwargs):
         await asyncio.sleep(0.05)
         return _mock_extract_ok(file["id"], file["name"])
 
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py
index 12a912b032..396d79e73e 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py
@@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
     active = 0
     peak = 0
 
-    async def _slow_extract(client, file):
+    async def _slow_extract(client, file, **kwargs):
         nonlocal active, peak
         async with lock:
             active += 1
@@ -203,7 +203,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
 
     monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
 
-    async def _slow_extract(client, file):
+    async def _slow_extract(client, file, **kwargs):
         await asyncio.sleep(0.05)
         return _mock_extract_ok(file["id"], file["name"])
 
diff --git a/surfsense_backend/tests/unit/db/__init__.py b/surfsense_backend/tests/unit/db/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/db/test_relax_revision_fks_migration.py b/surfsense_backend/tests/unit/db/test_relax_revision_fks_migration.py
new file mode 100644
index 0000000000..82c2994888
--- /dev/null
+++ b/surfsense_backend/tests/unit/db/test_relax_revision_fks_migration.py
@@ -0,0 +1,83 @@
+"""Smoke test for the ``134_relax_revision_fks`` Alembic migration.
+
+A full apply/rollback test would require a live Postgres; here we verify
+the migration module's static contract:
+
+* The chain wires it as a successor of ``133_drop_documents_content_hash_unique``.
+* ``upgrade()`` declares two FK creations with ``ondelete='SET NULL'``
+  (one for ``document_revisions.document_id``, one for
+  ``folder_revisions.folder_id``).
+* ``downgrade()`` re-establishes ``ondelete='CASCADE'`` after draining
+  orphaned revisions.
+
+If any of these invariants regress the snapshot/revert pipeline silently
+loses the ability to undo ``rm`` / ``rmdir`` on environments that ran the
+migration "down" or never ran it at all.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import inspect
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+_MIGRATION_PATH = (
+    Path(__file__).resolve().parents[3]
+    / "alembic"
+    / "versions"
+    / "134_relax_revision_fks.py"
+)
+
+
+def _load_migration():
+    """Load the migration module by file path (no package import needed)."""
+    spec = importlib.util.spec_from_file_location("_migration_134", _MIGRATION_PATH)
+    assert spec and spec.loader, "could not load migration spec"
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def test_migration_chain_revision_ids() -> None:
+    module = _load_migration()
+    # The migration file uses short numeric revision IDs to match the
+    # in-tree convention (cf. ``133`` -> ``134``); the ``134_<slug>.py``
+    # filename is documentation, not the canonical revision string.
+    assert getattr(module, "revision", None) == "134"
+    assert getattr(module, "down_revision", None) == "133"
+
+
+def test_migration_exposes_upgrade_and_downgrade() -> None:
+    module = _load_migration()
+    upgrade = getattr(module, "upgrade", None)
+    downgrade = getattr(module, "downgrade", None)
+    assert callable(upgrade), "upgrade() is required"
+    assert callable(downgrade), "downgrade() is required"
+
+
+def test_upgrade_creates_set_null_fks_for_both_revision_tables() -> None:
+    module = _load_migration()
+    src = inspect.getsource(module.upgrade)
+    assert "document_revisions" in src
+    assert "folder_revisions" in src
+    # Both new FKs MUST be ON DELETE SET NULL — that's the entire point
+    # of the migration: snapshots must outlive their parent row.
+    assert src.count('ondelete="SET NULL"') >= 2
+    # And the ``document_id`` / ``folder_id`` columns become nullable.
+    assert "nullable=True" in src
+
+
+def test_downgrade_drains_orphans_then_restores_cascade() -> None:
+    module = _load_migration()
+    src = inspect.getsource(module.downgrade)
+    # Drain orphaned rows BEFORE we can re-impose NOT NULL.
+    assert "DELETE FROM document_revisions WHERE document_id IS NULL" in src
+    assert "DELETE FROM folder_revisions WHERE folder_id IS NULL" in src
+    # Then restore the original CASCADE/NOT NULL contract.
+    assert src.count('ondelete="CASCADE"') >= 2
+    assert "nullable=False" in src
diff --git a/surfsense_backend/tests/unit/e2e_fakes/test_drive_list_files.py b/surfsense_backend/tests/unit/e2e_fakes/test_drive_list_files.py
new file mode 100644
index 0000000000..a111869a4e
--- /dev/null
+++ b/surfsense_backend/tests/unit/e2e_fakes/test_drive_list_files.py
@@ -0,0 +1,38 @@
+from tests.e2e.fakes.composio_module import _drive_list_files
+
+
+def _ids(result: dict) -> set[str]:
+    return {item["id"] for item in result["data"]["files"]}
+
+
+def test_drive_list_files_filters_shortcuts_and_trashed_items():
+    result = _drive_list_files(
+        {
+            "q": (
+                "'root' in parents and trashed = false and "
+                "mimeType != 'application/vnd.google-apps.shortcut'"
+            )
+        }
+    )
+
+    ids = _ids(result)
+
+    assert "fake-file-canary" in ids
+    assert "fake-shortcut-canary" not in ids
+    assert "fake-file-trashed" not in ids
+
+
+def test_drive_list_files_filters_to_exact_mime_type():
+    result = _drive_list_files(
+        {"q": "'root' in parents and trashed = false and mimeType = 'text/plain'"}
+    )
+
+    assert _ids(result) == {"fake-file-canary"}
+
+
+def test_drive_list_files_uses_requested_parent_folder():
+    result = _drive_list_files(
+        {"q": "'fake-folder-projects' in parents and trashed = false"}
+    )
+
+    assert _ids(result) == {"fake-file-roadmap"}
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
index 9608b011df..bd0b485f79 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -431,7 +431,9 @@ async def test_llamacloud_heif_accepted_only_with_azure_di(tmp_path, mocker):
     mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
     mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
 
-    with pytest.raises(EtlUnsupportedFileError, match="not supported by LLAMACLOUD"):
+    with pytest.raises(
+        EtlUnsupportedFileError, match="document parser does not support this format"
+    ):
         await EtlPipelineService().extract(
             EtlRequest(file_path=str(heif_file), filename="photo.heif")
         )
@@ -549,8 +551,11 @@ def test_unsupported_extensions_classified_correctly(filename):
         ("doc.docx", "document"),
         ("slides.pptx", "document"),
         ("sheet.xlsx", "document"),
-        ("photo.png", "document"),
-        ("photo.jpg", "document"),
+        ("photo.png", "image"),
+        ("photo.jpg", "image"),
+        ("photo.webp", "image"),
+        ("photo.gif", "image"),
+        ("photo.heic", "image"),
         ("book.epub", "document"),
         ("letter.odt", "document"),
         ("readme.md", "plaintext"),
@@ -680,3 +685,603 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
         await EtlPipelineService().extract(
             EtlRequest(file_path=str(eml_file), filename="mail.eml")
         )
+
+
+# ---------------------------------------------------------------------------
+# Image extraction via vision LLM
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_image_with_vision_llm(tmp_path):
+    """An image file is analyzed by the vision LLM when provided."""
+    from unittest.mock import AsyncMock, MagicMock
+
+    img_file = tmp_path / "photo.png"
+    img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
+
+    fake_response = MagicMock()
+    fake_response.content = "# A photo of a sunset over the ocean"
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    service = EtlPipelineService(vision_llm=fake_llm)
+    result = await service.extract(
+        EtlRequest(file_path=str(img_file), filename="photo.png")
+    )
+
+    assert result.markdown_content == "# A photo of a sunset over the ocean"
+    assert result.etl_service == "VISION_LLM"
+    assert result.content_type == "image"
+    fake_llm.ainvoke.assert_called_once()
+
+
+async def test_extract_image_falls_back_to_document_without_vision_llm(
+    tmp_path, mocker
+):
+    """Without a vision LLM, image files fall back to the document parser."""
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# OCR text from image"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    img_file = tmp_path / "scan.png"
+    img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
+
+    service = EtlPipelineService()
+    result = await service.extract(
+        EtlRequest(file_path=str(img_file), filename="scan.png")
+    )
+
+    assert result.markdown_content == "# OCR text from image"
+    assert result.etl_service == "DOCLING"
+    assert result.content_type == "document"
+
+
+# ---------------------------------------------------------------------------
+# Document path with vision LLM: per-image descriptions are appended
+# ---------------------------------------------------------------------------
+
+
+def _fake_extraction_result(*descriptions):
+    from app.etl_pipeline.picture_describer import (
+        PictureDescription,
+        PictureExtractionResult,
+    )
+
+    return PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=d["page"],
+                ordinal_in_page=d.get("ordinal", 0),
+                name=d["name"],
+                sha256=d.get("sha", "deadbeef"),
+                description=d["desc"],
+            )
+            for d in descriptions
+        ]
+    )
+
+
+async def test_extract_pdf_with_vision_llm_inlines_image_blocks(tmp_path, mocker):
+    """A PDF with an `<!-- image -->` placeholder + caption gets the
+    block spliced inline (no orphaned ``## Image Content`` section).
+
+    This is the headline scenario for the medxpertqa benchmark: the
+    image content lives in the same chunk as the surrounding case text
+    so retrieval pulls the question, image, and answer options together.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": (
+            "# MedXpertQA-MM MM-130\n\n"
+            "## Clinical case\n\nA 44-year-old man...\n\n"
+            "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+            "## Answer choices\n\nA) ...\n"
+        )
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    extraction = _fake_extraction_result(
+        {
+            "page": 1,
+            "name": "Im0",
+            "desc": "Axial CT showing a large cystic mass.",
+        }
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=extraction),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    # The placeholder + caption are gone, replaced by a horizontal-
+    # rule-delimited section with the captioned filename.
+    assert "<!-- image -->" not in md
+    assert "Image: MM-130-a.jpeg" not in md
+    assert "**Embedded image:** `MM-130-a.jpeg`" in md
+    assert "**Visual description:**" in md
+    assert "Axial CT showing a large cystic mass." in md
+    # No OCR section -- our fake_extraction_result has no ocr_text,
+    # and the format omits the section when there's no text to show.
+    assert "**OCR text:**" not in md
+    # No raw HTML / XML tags or blockquote wrapping leak.
+    assert "<image" not in md
+    assert "> **Embedded image:**" not in md
+    # No appended section -- everything went inline.
+    assert "## Image Content" not in md
+    # Surrounding case text + answer options are preserved.
+    assert "A 44-year-old man..." in md
+    assert "## Answer choices" in md
+    assert "A) ..." in md
+
+
+async def test_extract_pdf_with_vision_llm_appends_when_no_marker(tmp_path, mocker):
+    """When parser markdown has no image markers, descriptions get appended.
+
+    This is the fallback path for parsers that drop image placeholders
+    entirely. The image content still ends up in the markdown -- just
+    in a clearly-labeled section rather than inline.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": "# Parsed PDF text\n\nNo image markers anywhere.\n"
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    extraction = _fake_extraction_result(
+        {"page": 1, "name": "Im0", "desc": "An image description."}
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=extraction),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    assert "# Parsed PDF text" in md
+    assert "## Image Content (vision-LLM extracted)" in md
+    assert "**Embedded image:** `Im0`" in md
+    assert "An image description." in md
+
+
+async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
+    tmp_path, mocker
+):
+    """No vision LLM -> parser markdown returned as-is."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    describe_mock = mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(),
+    )
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Parsed PDF text"
+    assert "<image" not in result.markdown_content
+    describe_mock.assert_not_called()
+
+
+async def test_extract_pdf_with_vision_llm_swallows_describe_failure(tmp_path, mocker):
+    """A pypdf or vision LLM blow-up never fails the document upload."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(side_effect=RuntimeError("pypdf exploded")),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Parsed PDF text"
+    assert result.etl_service == "DOCLING"
+
+
+async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
+    tmp_path, mocker
+):
+    """Vision-LLM-enabled PDF with zero extracted images is unchanged."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Just text, no images"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    empty = _fake_extraction_result()
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=empty),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Just text, no images"
+    assert "<image" not in result.markdown_content
+
+
+# ---------------------------------------------------------------------------
+# Per-image OCR runner: wiring + behaviour
+#
+# When extracting a PDF with a vision LLM, the ETL service must ALSO
+# pass an ``ocr_runner`` to picture_describer. The runner is a closure
+# that re-feeds each extracted image through a vision-LLM-less
+# EtlPipelineService -- i.e. the same OCR engine that handles
+# standalone image uploads (Docling/Azure DI/LlamaCloud) gets a crack
+# at each embedded image, with the text attached to the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(tmp_path, mocker):
+    """The ETL service must wire an ocr_runner kwarg to describe_pictures."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    describe_mock = mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=_fake_extraction_result()),
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    describe_mock.assert_awaited_once()
+    _, kwargs = describe_mock.await_args
+    assert "ocr_runner" in kwargs
+    assert callable(kwargs["ocr_runner"])
+
+
+async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
+    tmp_path, mocker
+):
+    """The OCR runner closure should re-extract each image via the parser.
+
+    We capture the runner that the ETL service passes to
+    describe_pictures, invoke it with a fake image path, and assert
+    that Docling was called with that image. This proves the closure
+    is wired to a vision-LLM-less sub-pipeline (otherwise it would
+    recurse into the vision LLM and never hit the OCR engine).
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+    image_file = tmp_path / "Im0.png"
+    image_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "Slice 24 / 60   L   R"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    captured: dict = {}
+
+    async def capture_runner(*args, **kwargs):
+        captured["runner"] = kwargs["ocr_runner"]
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=capture_runner,
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    runner = captured["runner"]
+    ocr_text = await runner(str(image_file), "Im0.png")
+
+    assert ocr_text == "Slice 24 / 60   L   R"
+    # Docling was invoked twice in total: once for the PDF, once for
+    # the image we re-fed via the runner.
+    assert fake_docling.process_document.await_count == 2
+
+
+async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
+    tmp_path, mocker
+):
+    """Unsupported image format → runner returns empty string, doesn't raise.
+
+    Common case: a PDF embeds a JPEG2000 or CCITT-TIFF image that
+    Docling can't load. We don't want an unsupported format on ONE
+    embedded image to spoil the whole PDF extraction; the runner
+    should swallow the EtlUnsupportedFileError and return "" so the
+    image gets a description but no OCR tag.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+    weird_image = tmp_path / "Im0.jp2"  # JPEG2000, unlikely to be supported
+    weird_image.write_bytes(b"\x00\x00\x00\x0cjP" + b"\x00" * 50)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    captured: dict = {}
+
+    async def capture_runner(*args, **kwargs):
+        captured["runner"] = kwargs["ocr_runner"]
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=capture_runner,
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    runner = captured["runner"]
+    ocr_text = await runner(str(weird_image), "Im0.jp2")
+
+    assert ocr_text == ""
+
+
+# ---------------------------------------------------------------------------
+# Processing Mode enum tests
+# ---------------------------------------------------------------------------
+
+
+def test_processing_mode_coerce_basic():
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    assert ProcessingMode.coerce("basic") == ProcessingMode.BASIC
+    assert ProcessingMode.coerce("BASIC") == ProcessingMode.BASIC
+    assert ProcessingMode.coerce(None) == ProcessingMode.BASIC
+    assert ProcessingMode.coerce("invalid") == ProcessingMode.BASIC
+
+
+def test_processing_mode_coerce_premium():
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    assert ProcessingMode.coerce("premium") == ProcessingMode.PREMIUM
+    assert ProcessingMode.coerce("PREMIUM") == ProcessingMode.PREMIUM
+
+
+def test_processing_mode_page_multiplier():
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    assert ProcessingMode.BASIC.page_multiplier == 1
+    assert ProcessingMode.PREMIUM.page_multiplier == 10
+
+
+def test_etl_request_default_processing_mode():
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    req = EtlRequest(file_path="/tmp/test.pdf", filename="test.pdf")
+    assert req.processing_mode == ProcessingMode.BASIC
+
+
+def test_etl_request_premium_processing_mode():
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    req = EtlRequest(
+        file_path="/tmp/test.pdf",
+        filename="test.pdf",
+        processing_mode=ProcessingMode.PREMIUM,
+    )
+    assert req.processing_mode == ProcessingMode.PREMIUM
+
+
+# ---------------------------------------------------------------------------
+# Azure DI model selection by processing mode
+# ---------------------------------------------------------------------------
+
+
+async def test_azure_di_basic_uses_prebuilt_read(tmp_path, mocker):
+    """Basic mode should use prebuilt-read model for Azure DI."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
+    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
+    mocker.patch(
+        "app.config.config.AZURE_DI_ENDPOINT",
+        "https://fake.cognitiveservices.azure.com/",
+        create=True,
+    )
+    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)
+
+    fake_client = _mock_azure_di(mocker, "# Azure basic")
+    _mock_llamacloud(mocker)
+
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(
+            file_path=str(pdf_file),
+            filename="report.pdf",
+            processing_mode=ProcessingMode.BASIC,
+        )
+    )
+
+    assert result.markdown_content == "# Azure basic"
+    call_args = fake_client.begin_analyze_document.call_args
+    assert call_args[0][0] == "prebuilt-read"
+
+
+async def test_azure_di_premium_uses_prebuilt_layout(tmp_path, mocker):
+    """Premium mode should use prebuilt-layout model for Azure DI."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
+    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
+    mocker.patch(
+        "app.config.config.AZURE_DI_ENDPOINT",
+        "https://fake.cognitiveservices.azure.com/",
+        create=True,
+    )
+    mocker.patch("app.config.config.AZURE_DI_KEY", "fake-key", create=True)
+
+    fake_client = _mock_azure_di(mocker, "# Azure premium")
+    _mock_llamacloud(mocker)
+
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(
+            file_path=str(pdf_file),
+            filename="report.pdf",
+            processing_mode=ProcessingMode.PREMIUM,
+        )
+    )
+
+    assert result.markdown_content == "# Azure premium"
+    call_args = fake_client.begin_analyze_document.call_args
+    assert call_args[0][0] == "prebuilt-layout"
+
+
+# ---------------------------------------------------------------------------
+# LlamaCloud tier selection by processing mode
+# ---------------------------------------------------------------------------
+
+
+async def test_llamacloud_basic_uses_cost_effective_tier(tmp_path, mocker):
+    """Basic mode should use parse_page_with_llm parse_mode for LlamaCloud."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
+    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
+    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
+    mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
+
+    fake_parser = _mock_llamacloud(mocker, "# Llama basic")
+
+    llama_parse_cls = mocker.patch(
+        "llama_cloud_services.LlamaParse", return_value=fake_parser
+    )
+
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(
+            file_path=str(pdf_file),
+            filename="report.pdf",
+            estimated_pages=5,
+            processing_mode=ProcessingMode.BASIC,
+        )
+    )
+
+    assert result.markdown_content == "# Llama basic"
+    call_kwargs = llama_parse_cls.call_args[1]
+    assert call_kwargs["parse_mode"] == "parse_page_with_llm"
+    assert "tier" not in call_kwargs
+
+
+async def test_llamacloud_premium_uses_agentic_plus_tier(tmp_path, mocker):
+    """Premium mode should use parse_page_with_agent parse_mode for LlamaCloud."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
+    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
+    mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
+    mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
+
+    fake_parser = _mock_llamacloud(mocker, "# Llama premium")
+
+    llama_parse_cls = mocker.patch(
+        "llama_cloud_services.LlamaParse", return_value=fake_parser
+    )
+
+    from app.etl_pipeline.etl_document import ProcessingMode
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(
+            file_path=str(pdf_file),
+            filename="report.pdf",
+            estimated_pages=5,
+            processing_mode=ProcessingMode.PREMIUM,
+        )
+    )
+
+    assert result.markdown_content == "# Llama premium"
+    call_kwargs = llama_parse_cls.call_args[1]
+    assert call_kwargs["parse_mode"] == "parse_page_with_agent"
+    assert "tier" not in call_kwargs
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py b/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
new file mode 100644
index 0000000000..39fb087c35
--- /dev/null
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
@@ -0,0 +1,972 @@
+"""Unit tests for the picture_describer module.
+
+Covers:
+
+- :func:`describe_pictures` -- the PDF image walker + per-image vision
+  LLM call (structured output split into ``ocr_text`` and
+  ``description``);
+- :func:`inject_descriptions_inline` -- in-place replacement of image
+  placeholders / captions in the parser markdown;
+- :func:`merge_descriptions_into_markdown` -- the top-level helper
+  that inlines what it can and appends what it can't;
+- :func:`render_appended_section` -- the appended-fallback renderer.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.etl_pipeline.picture_describer import (
+    PictureDescription,
+    PictureExtractionResult,
+    describe_pictures,
+    inject_descriptions_inline,
+    merge_descriptions_into_markdown,
+    render_appended_section,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _make_image_obj(name: str, data: bytes):
+    """Mimic pypdf's ImageFile object shape for the bits we use."""
+    img = MagicMock()
+    img.name = name
+    img.data = data
+    return img
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: short-circuits
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_no_op_for_non_pdf(tmp_path):
+    """Non-PDF files are silently no-op'd; we don't try to extract images."""
+    docx_file = tmp_path / "report.docx"
+    docx_file.write_bytes(b"PK fake docx")
+
+    fake_llm = AsyncMock()
+    result = await describe_pictures(str(docx_file), "report.docx", fake_llm)
+
+    assert result.descriptions == []
+    assert result.skipped_too_large == 0
+    fake_llm.ainvoke.assert_not_called()
+
+
+async def test_describe_pictures_no_op_when_vision_llm_is_none(tmp_path):
+    """If the caller didn't provide a vision LLM, we no-op even for PDFs."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    result = await describe_pictures(str(pdf_file), "report.pdf", None)
+    assert result.descriptions == []
+
+
+async def test_describe_pictures_no_op_for_pdf_with_no_images(tmp_path, mocker):
+    """A PDF that pypdf can open but contains zero images returns empty."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[]), MagicMock(images=[])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    fake_llm = AsyncMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert result.descriptions == []
+    fake_llm.ainvoke.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: happy paths
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_runs_vision_llm_per_image(tmp_path, mocker):
+    """Every eligible image gets exactly one description-only vision call."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    img_b = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    page1 = MagicMock(images=[img_a])
+    page2 = MagicMock(images=[img_b])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page1, page2]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Description A", "Description B"]),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 2
+    by_name = {d.name: d.description for d in result.descriptions}
+    assert by_name == {"Im0.jpeg": "Description A", "Im1.png": "Description B"}
+    assert all(d.page_number in (1, 2) for d in result.descriptions)
+    assert parse_mock.await_count == 2
+
+
+async def test_describe_pictures_dedups_by_hash(tmp_path, mocker):
+    """An image that appears N times in the PDF is described once."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    payload = b"\x89PNG\r\n\x1a\n" + b"\x42" * 2000
+    img = _make_image_obj("logo.png", payload)
+    page1 = MagicMock(images=[img])
+    page2 = MagicMock(images=[_make_image_obj("logo.png", payload)])
+    page3 = MagicMock(images=[_make_image_obj("logo.png", payload)])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page1, page2, page3]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="Logo desc"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.skipped_duplicate == 2
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_skips_too_small_images(tmp_path, mocker):
+    """Sub-1KB images (tracking pixels, dots, etc.) are skipped."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    tiny = _make_image_obj("dot.png", b"\x89PNG\r\n\x1a\n")
+    big = _make_image_obj("ct.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 3000)
+    page = MagicMock(images=[tiny, big])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="CT scan"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].name == "ct.jpeg"
+    assert result.skipped_too_small == 1
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_skips_too_large_images(tmp_path, mocker):
+    """Images larger than the vision LLM's per-image cap are skipped."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    huge = _make_image_obj("huge.jpeg", b"\xff" * (6 * 1024 * 1024))
+    ok = _make_image_obj("ok.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    page = MagicMock(images=[huge, ok])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="OK image"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].name == "ok.jpeg"
+    assert result.skipped_too_large == 1
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_swallows_per_image_failure(tmp_path, mocker):
+    """A vision LLM failure on one image must not kill the whole document."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("a.jpeg", b"\xff\xd8" + b"\xab" * 2000)
+    img_b = _make_image_obj("b.jpeg", b"\xff\xd8" + b"\xcd" * 2000)
+    page = MagicMock(images=[img_a, img_b])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=[RuntimeError("vision blew up"), "Success"]),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].description == "Success"
+    assert result.failed == 1
+
+
+async def test_describe_pictures_handles_pypdf_open_failure(tmp_path, mocker):
+    """A malformed PDF that pypdf can't open returns an empty result."""
+    pdf_file = tmp_path / "broken.pdf"
+    pdf_file.write_bytes(b"not a pdf")
+
+    mocker.patch("pypdf.PdfReader", side_effect=ValueError("EOF marker not found"))
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "broken.pdf", fake_llm)
+    assert result.descriptions == []
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: replacement patterns
+# ---------------------------------------------------------------------------
+
+
+def _desc(name="Im0", description="A CT scan."):
+    return PictureDescription(
+        page_number=1,
+        ordinal_in_page=0,
+        name=name,
+        sha256="aa",
+        description=description,
+    )
+
+
+def test_inject_no_op_when_no_descriptions():
+    markdown = "# Title\n\nbody text\n"
+    result = PictureExtractionResult()
+    out, n = inject_descriptions_inline(markdown, result)
+    assert out == markdown
+    assert n == 0
+
+
+def test_inject_replaces_placeholder_with_caption():
+    """`<!-- image -->` + `Image: <name>` together becomes one block.
+
+    This is the most common medxpertqa case: our renderer puts a caption
+    line right below the embedded JPEG, and Docling preserves both.
+    """
+    markdown = (
+        "# Case\n\n"
+        "Clinical text...\n\n"
+        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+        "Answer choices: A) ...\n"
+    )
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "<!-- image -->" not in out
+    assert "Image: MM-130-a.jpeg" not in out  # caption consumed
+    # New format: horizontal-rule-delimited section with "Embedded
+    # image:" anchor and named "Visual description:" section. No
+    # blockquote wrapping -- nested blocks (lists, code, tables) inside
+    # a blockquote are silently dropped by Streamdown / remark.
+    assert "**Embedded image:** `MM-130-a.jpeg`" in out
+    assert "**Visual description:**" in out
+    assert "A CT scan." in out
+    # Block is delimited by horizontal rules so it stands out from
+    # surrounding paragraphs.
+    assert "\n---\n" in out
+    # No OCR section -- this fixture has no ocr_text on its descriptions.
+    assert "**OCR text:**" not in out
+    # No raw HTML tags / blockquote prefixes leak.
+    assert "<image" not in out
+    assert "</image>" not in out
+    assert "> **Embedded image:**" not in out  # we no longer wrap in `>`
+    # Surrounding context is preserved.
+    assert "Clinical text..." in out
+    assert "Answer choices: A) ..." in out
+
+
+def test_inject_uses_pypdf_name_when_no_caption():
+    """`<!-- image -->` alone uses the pypdf-given name as the attribute."""
+    markdown = "# Case\n\n<!-- image -->\n\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `Im0`" in out
+
+
+def test_inject_replaces_bare_caption():
+    """A bare `Image: <name>` line (no placeholder) still gets replaced."""
+    markdown = "# Case\n\nText...\nImage: scan.jpeg\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "Image: scan.jpeg" not in out
+
+
+def test_inject_handles_multiple_images_in_order():
+    """Two placeholders + two descriptions: each consumed in document order."""
+    markdown = (
+        "Page 1\n\n<!-- image -->\nImage: a.jpeg\n\n"
+        "Between\n\n<!-- image -->\nImage: b.jpeg\n\nEnd\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1,
+                ordinal_in_page=0,
+                name="Im0",
+                sha256="aa",
+                description="Desc A",
+            ),
+            PictureDescription(
+                page_number=2,
+                ordinal_in_page=0,
+                name="Im1",
+                sha256="bb",
+                description="Desc B",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    assert "**Embedded image:** `a.jpeg`" in out
+    assert "**Embedded image:** `b.jpeg`" in out
+    assert out.index("a.jpeg") < out.index("b.jpeg")
+    assert "Desc A" in out and "Desc B" in out
+
+
+def test_inject_returns_remaining_count_when_more_descriptions_than_markers():
+    """Three descriptions, one marker -> only one inlined, two leftover."""
+    markdown = "Just one <!-- image --> here.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First"),
+            _desc(name="Im1", description="Second"),
+            _desc(name="Im2", description="Third"),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `Im0`" in out
+    assert "**Embedded image:** `Im1`" not in out
+
+
+def test_inject_returns_zero_when_no_markers_present():
+    """Markdown with no image markers at all returns the input unchanged."""
+    markdown = "# Title\n\nJust text. No images mentioned at all.\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 0
+    assert out == markdown
+
+
+# ---------------------------------------------------------------------------
+# render_appended_section
+# ---------------------------------------------------------------------------
+
+
+def test_render_appended_empty_when_nothing_passed():
+    assert render_appended_section([]) == ""
+
+
+def test_render_appended_renders_each_image_as_block():
+    descriptions = [
+        _desc(name="MM-130-a.jpeg", description="CT scan"),
+        _desc(name="MM-130-b.jpeg", description="Bar chart"),
+    ]
+    rendered = render_appended_section(descriptions)
+    assert "## Image Content (vision-LLM extracted)" in rendered
+    assert "**Embedded image:** `MM-130-a.jpeg`" in rendered
+    assert "CT scan" in rendered
+    assert "**Embedded image:** `MM-130-b.jpeg`" in rendered
+    assert "Bar chart" in rendered
+    # Each image block is delimited by horizontal rules.
+    assert rendered.count("\n---\n") >= 2
+    # No raw HTML / XML / blockquote prefixes.
+    assert "<image" not in rendered
+    assert "> **Embedded image:**" not in rendered
+    assert "**OCR text:**" not in rendered
+
+
+def test_render_appended_includes_skip_notes():
+    descriptions = [_desc()]
+    skip_result = PictureExtractionResult(
+        descriptions=descriptions,
+        skipped_too_small=2,
+        skipped_too_large=1,
+        skipped_duplicate=3,
+        failed=1,
+    )
+    rendered = render_appended_section(descriptions, skip_notes=skip_result)
+    assert "_Note:" in rendered
+    assert "2 too small" in rendered
+    assert "1 too large" in rendered
+    assert "3 duplicate" in rendered
+    assert "1 failed" in rendered
+
+
+# ---------------------------------------------------------------------------
+# merge_descriptions_into_markdown: top-level
+# ---------------------------------------------------------------------------
+
+
+def test_merge_inlines_when_marker_present():
+    markdown = "Text...\n\n<!-- image -->\nImage: scan.jpeg\n\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "**Embedded image:** `scan.jpeg`" in out
+    # Nothing leaked into an appended section -- we should NOT see the
+    # appended-section heading because everything went inline.
+    assert "## Image Content" not in out
+
+
+def test_merge_appends_when_no_marker_present():
+    """Zero markers means everything goes into an appended section."""
+    markdown = "Pure text doc, no image markers.\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="An image desc.")]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "Pure text doc" in out
+    assert "## Image Content (vision-LLM extracted)" in out
+    assert "**Embedded image:** `Im0`" in out
+
+
+def test_merge_appends_leftovers_with_distinct_heading():
+    """One marker, two descriptions -> one inline, second appended under
+    a heading that signals it's a leftover.
+    """
+    markdown = "Text\n\n<!-- image -->\nImage: a.jpeg\n\nEnd\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First"),
+            _desc(name="Im1", description="Second"),
+        ]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "**Embedded image:** `a.jpeg`" in out  # inlined
+    assert "## Image Content (additional, no inline marker found)" in out
+    assert "**Embedded image:** `Im1`" in out  # appended
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: ocr_runner integration
+#
+# These tests cover the per-image OCR side-channel: when the caller
+# supplies an ``ocr_runner`` callable, each extracted image is sent
+# both to the vision LLM (visual description) and to the OCR runner
+# (text-in-image), in parallel. The OCR text -- if any -- is recorded
+# on the PictureDescription and rendered in the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
+    """When an ocr_runner is provided, it's invoked once per eligible image."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    img_b = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img_a, img_b])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Visual A", "Visual B"]),
+    )
+    ocr_runner = AsyncMock(side_effect=["OCR text A", "OCR text B"])
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert ocr_runner.await_count == 2
+    by_name = {d.name: d.ocr_text for d in result.descriptions}
+    assert by_name == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
+
+
+async def test_describe_pictures_runs_vision_and_ocr_in_parallel(tmp_path, mocker):
+    """Vision LLM and OCR run concurrently per image, not sequentially.
+
+    We verify this by recording call timestamps: if both finish within
+    a small window relative to the per-call sleep, they ran in parallel.
+    """
+    import asyncio
+    import time
+
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    sleep_each = 0.05  # 50ms per call
+
+    async def slow_vision(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "Visual"
+
+    async def slow_ocr(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "OCR"
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=slow_vision,
+    )
+
+    fake_llm = MagicMock()
+    started = time.perf_counter()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=slow_ocr
+    )
+    elapsed = time.perf_counter() - started
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text == "OCR"
+    # Sequential would be ~2*sleep_each. Parallel is ~1*sleep_each + overhead.
+    # Be generous with the bound so we're not flaky on slow CI.
+    assert elapsed < 1.5 * sleep_each, (
+        f"vision+OCR appear to be sequential (took {elapsed:.3f}s)"
+    )
+
+
+async def test_describe_pictures_treats_empty_ocr_as_none(tmp_path, mocker):
+    """Empty / whitespace-only OCR result is normalised to None.
+
+    This means the rendered image block won't carry an empty
+    "OCR text" section for images that contain no text at all
+    (e.g. a clean radiograph).
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(return_value="   \n  \n")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+async def test_describe_pictures_swallows_ocr_runner_failure(tmp_path, mocker):
+    """An OCR runner exception must not kill the description for that image.
+
+    OCR is supplementary; the vision LLM's description is the primary
+    payload. If OCR blows up we drop the OCR field for that image and
+    keep the description.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(side_effect=RuntimeError("OCR backend down"))
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].description == "A radiograph."
+    assert result.descriptions[0].ocr_text is None
+    assert result.failed == 0  # the IMAGE didn't fail; only its OCR did
+
+
+async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
+    tmp_path, mocker
+):
+    """If the vision LLM fails, the image is skipped even if OCR succeeded.
+
+    The inline block's primary purpose is the visual description; an
+    OCR-only block would be misleading (it'd look like the vision
+    pipeline ran when it didn't), so we treat vision failure as image
+    failure regardless of OCR outcome.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=RuntimeError("vision blew up")),
+    )
+    ocr_runner = AsyncMock(return_value="OCR text")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert result.descriptions == []
+    assert result.failed == 1
+
+
+async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(tmp_path, mocker):
+    """Backward compat: omitting ocr_runner produces description-only blocks."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="Visual"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+# ---------------------------------------------------------------------------
+# Rendering: "OCR text" section appears iff PictureDescription.ocr_text is set
+# ---------------------------------------------------------------------------
+
+
+def _desc_with_ocr(name="Im0", description="A CT scan.", ocr_text="L  R  10mm"):
+    return PictureDescription(
+        page_number=1,
+        ordinal_in_page=0,
+        name=name,
+        sha256="aa",
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+def test_inject_renders_ocr_section_when_ocr_text_present():
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text="L  R  10mm")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "**OCR text:**" in out
+    assert "L  R  10mm" in out
+    # OCR section comes before the visual description (literal text
+    # first, interpretation second).
+    assert out.index("**OCR text:**") < out.index("**Visual description:**")
+    # Critical: no nested-block constructs (fenced code, blockquote)
+    # that previous formats relied on -- both broke in Streamdown /
+    # PlateJS by escaping their container and dropping content.
+    assert "```" not in out
+    assert "> **" not in out
+
+
+def test_inject_renders_multiline_ocr_with_hard_breaks():
+    """Multi-line OCR uses trailing-two-spaces hard breaks so each
+    line renders on its own row, without needing a fragile fenced
+    code block or blockquote wrapper."""
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    ocr_multi = "Slice 24 / 60\nL\nR\n10 mm"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text=ocr_multi)]
+    )
+
+    out, _ = inject_descriptions_inline(markdown, result)
+
+    # Every OCR line is present.
+    for line in ("Slice 24 / 60", "L", "R", "10 mm"):
+        assert line in out
+    # Non-last OCR lines get the trailing two-space hard break.
+    assert "Slice 24 / 60  \n" in out
+    assert "\nL  \n" in out
+    assert "\nR  \n" in out
+    # Last OCR line must NOT carry the two-space hard break (no stray <br>).
+    assert "10 mm  \n" not in out
+    assert "10 mm\n" in out
+
+
+def test_render_appended_renders_ocr_section_when_ocr_text_present():
+    descriptions = [
+        _desc_with_ocr(
+            name="MM-130-a.jpeg",
+            description="Axial CT.",
+            ocr_text="Slice 24 / 60",
+        ),
+    ]
+    rendered = render_appended_section(descriptions)
+
+    assert "**OCR text:**" in rendered
+    assert "Slice 24 / 60" in rendered
+    assert "Axial CT." in rendered
+
+
+def test_render_omits_ocr_section_when_ocr_text_is_none():
+    descriptions = [_desc(name="Im0", description="A clean radiograph.")]
+    rendered = render_appended_section(descriptions)
+
+    assert "**Embedded image:** `Im0`" in rendered
+    assert "**OCR text:**" not in rendered
+    assert "**Visual description:**" in rendered
+    # No raw HTML / blockquote prefixes.
+    assert "<image" not in rendered
+    assert "> **" not in rendered
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: <figure> blocks (layout-aware parsers)
+#
+# Azure Document Intelligence's ``prebuilt-layout`` and LlamaCloud
+# premium both emit ``<figure>...</figure>`` blocks that already contain
+# the parser's own OCR of the figure (chart bar values, axis labels,
+# inline ``<figcaption>``, embedded ``<table>`` for tabular figures).
+# That parser-side content is useful for retrieval on its own, so we
+# PRESERVE the figure verbatim and append our vision-LLM block
+# immediately after rather than substituting for it.
+# ---------------------------------------------------------------------------
+
+
+def test_inject_appends_block_after_figure_preserving_parser_content():
+    """Figure block stays intact; vision-LLM block goes right after it."""
+    markdown = (
+        "Some narrative text.\n\n"
+        "<figure>\n\n"
+        "Republican\n68\nDemocrat\n30\n"
+        "\n</figure>\n\n"
+        "Following paragraph.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart of party ID.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Original figure is preserved verbatim -- the parser's OCR'd
+    # numbers must still be searchable.
+    assert "<figure>" in out
+    assert "</figure>" in out
+    assert "Republican" in out and "68" in out
+    # Our vision-LLM block follows the figure, not before / inside it.
+    assert "**Embedded image:** `Im0`" in out
+    assert "Bar chart of party ID." in out
+    figure_close = out.index("</figure>")
+    embedded_at = out.index("**Embedded image:** `Im0`")
+    assert figure_close < embedded_at, "block must be appended AFTER </figure>"
+    # Surrounding narrative is preserved.
+    assert "Some narrative text." in out
+    assert "Following paragraph." in out
+
+
+def test_inject_handles_multiple_figures_in_document_order():
+    """N figures + N descriptions: each pair lands in the right place."""
+    markdown = (
+        "Page 1\n\n<figure>\nChart A bars\n</figure>\n\n"
+        "Between\n\n<figure>\nChart B bars\n</figure>\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1,
+                ordinal_in_page=0,
+                name="Im0",
+                sha256="aa",
+                description="Description of chart A.",
+            ),
+            PictureDescription(
+                page_number=2,
+                ordinal_in_page=0,
+                name="Im1",
+                sha256="bb",
+                description="Description of chart B.",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Both figures preserved; both descriptions inlined; order matches.
+    assert out.count("<figure>") == 2
+    assert out.count("</figure>") == 2
+    assert "Description of chart A." in out
+    assert "Description of chart B." in out
+    assert out.index("Description of chart A.") < out.index("Description of chart B.")
+    # Each description appears AFTER its corresponding </figure>.
+    first_close = out.index("</figure>")
+    assert first_close < out.index("Description of chart A.")
+    second_close = out.index("</figure>", first_close + 1)
+    assert second_close < out.index("Description of chart B.")
+
+
+def test_inject_figures_with_attributes_and_nested_tags():
+    """``<figure>`` with attributes and nested tags is matched and preserved."""
+    markdown = (
+        '<figure id="fig-3" class="chart">\n'
+        "<figcaption>Source: Pew Research</figcaption>\n"
+        "<table><tr><td>Republican</td><td>57</td></tr></table>\n"
+        "</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Survey table.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # All nested HTML is preserved (chunking will pick it up).
+    assert 'id="fig-3"' in out
+    assert "<figcaption>Source: Pew Research</figcaption>" in out
+    assert "<table>" in out and "Republican" in out and "57" in out
+    # Our block sits after the closing tag.
+    assert out.index("</figure>") < out.index("**Embedded image:** `Im0`")
+
+
+def test_inject_figures_more_descriptions_than_figures_returns_remaining():
+    """Three descriptions, one figure -> one inlined, two left for caller."""
+    markdown = "Text.\n<figure>\nbar values\n</figure>\nMore.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First desc."),
+            _desc(name="Im1", description="Second desc."),
+            _desc(name="Im2", description="Third desc."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "First desc." in out
+    # Leftovers are the caller's job; inject_descriptions_inline does
+    # not append them on its own.
+    assert "Second desc." not in out
+    assert "Third desc." not in out
+
+
+def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
+    """Two figures, one description -> first figure enriched, second left raw."""
+    markdown = (
+        "<figure>\nfigure 1 content\n</figure>\n<figure>\nfigure 2 content\n</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Only description.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Both figures still present; only the first one was enriched.
+    assert out.count("<figure>") == 2
+    assert "Only description." in out
+    # Second figure has no embedded-image block immediately after it.
+    second_open = out.index("<figure>", out.index("<figure>") + 1)
+    second_close = out.index("</figure>", second_open)
+    after_second = out[second_close:]
+    assert "**Embedded image:**" not in after_second
+
+
+def test_merge_inlines_at_figure_boundary():
+    """Top-level helper does the right thing with figures (no leftover section)."""
+    markdown = "Lead.\n<figure>\nbars\n</figure>\nTrailer.\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart.")]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    # Inline succeeded -> no appended-section heading.
+    assert "## Image Content" not in out
+    assert "Bar chart." in out
+    assert "<figure>" in out and "</figure>" in out
+
+
+def test_inject_figures_then_falls_through_to_docling_marker():
+    """Mixed-marker doc: figure consumed first, then Docling placeholder.
+
+    Defensive -- single docs are usually one parser's output, but if a
+    pipeline ever stitches two parsers' markdowns together the inliner
+    should still place each description.
+    """
+    markdown = (
+        "<figure>\nChart bars: 50, 40, 30\n</figure>\n\n"
+        "Later in the doc:\n\n"
+        "<!-- image -->\nImage: scan.jpeg\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="Chart description."),
+            _desc(name="Im1", description="Scan description."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Figure preserved + augmented.
+    assert "<figure>" in out and "Chart bars: 50, 40, 30" in out
+    assert "Chart description." in out
+    # Docling placeholder + caption replaced.
+    assert "<!-- image -->" not in out
+    assert "Image: scan.jpeg" not in out
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "Scan description." in out
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py b/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
new file mode 100644
index 0000000000..1293ff757a
--- /dev/null
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
@@ -0,0 +1,146 @@
+"""Unit tests for the vision_llm parser helpers.
+
+Two helpers exist:
+
+- :func:`parse_with_vision_llm` -- single-shot for standalone image
+  uploads (.png/.jpg/etc). Returns combined markdown (description +
+  verbatim OCR mixed) since the image *is* the document.
+- :func:`parse_image_for_description` -- per-image-in-PDF call. Returns
+  visual description only; OCR is the ETL service's job.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# parse_with_vision_llm: legacy single-shot path
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_with_vision_llm_returns_combined_markdown(tmp_path):
+    """Standalone image uploads still go through the combined-markdown path."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "# A scan of something."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+    assert out == "# A scan of something."
+    fake_llm.ainvoke.assert_awaited_once()
+
+
+async def test_parse_with_vision_llm_rejects_empty_response(tmp_path):
+    """An empty model response raises rather than silently returning blanks."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = ""
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# parse_image_for_description: per-image-in-PDF, description only
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_image_for_description_returns_description(tmp_path):
+    """Description-only path returns the model's markdown unchanged."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "Axial CT showing a large cystic mass."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_image_for_description(str(img), "scan.png", fake_llm)
+    assert out == "Axial CT showing a large cystic mass."
+
+
+async def test_parse_image_for_description_uses_description_only_prompt(tmp_path):
+    """The prompt explicitly tells the model NOT to transcribe text.
+
+    This is the contract that lets us drop OCR from the response: the
+    ETL pipeline already has the text (from page-level OCR), so asking
+    the vision LLM for it would be redundant cost.
+    """
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "A description"
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+    # The prompt is the first text part of the message we sent.
+    sent_messages = fake_llm.ainvoke.call_args.args[0]
+    prompt_text = sent_messages[0].content[0]["text"].lower()
+    assert "describe what this image visually depicts" in prompt_text
+    assert "do not transcribe text" in prompt_text
+
+
+async def test_parse_image_for_description_rejects_empty(tmp_path):
+    """Empty response surfaces as ValueError so the caller can skip the image."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "   "  # whitespace-only counts as empty
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# Image size + extension validation (shared by both paths)
+# ---------------------------------------------------------------------------
+
+
+def test_image_to_data_url_rejects_oversized(tmp_path):
+    """Images larger than 5 MB raise before any LLM call is made."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    big = tmp_path / "huge.png"
+    big.write_bytes(b"\x89PNG" + b"\x00" * (6 * 1024 * 1024))
+
+    with pytest.raises(ValueError, match="Image too large"):
+        _image_to_data_url(str(big))
+
+
+def test_image_to_data_url_rejects_unsupported_extension(tmp_path):
+    """Unknown extensions raise rather than guessing a MIME type."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    weird = tmp_path / "scan.xyz"
+    weird.write_bytes(b"\x00" * 100)
+
+    with pytest.raises(ValueError, match="Unsupported image extension"):
+        _image_to_data_url(str(weird))
diff --git a/surfsense_backend/tests/unit/google_unification/test_composio_credentials.py b/surfsense_backend/tests/unit/google_unification/test_composio_credentials.py
index 9ab618aa3a..7f6b0cd723 100644
--- a/surfsense_backend/tests/unit/google_unification/test_composio_credentials.py
+++ b/surfsense_backend/tests/unit/google_unification/test_composio_credentials.py
@@ -1,8 +1,17 @@
-"""Unit tests: build_composio_credentials returns valid Google Credentials.
+"""Unit tests: Composio credential helpers + ``get_access_token`` masking guard.
 
-Mocks the Composio SDK (external system boundary) and verifies that the
-returned ``google.oauth2.credentials.Credentials`` object is correctly
-configured with a token and a working refresh handler.
+Covers two seams between Surfsense and Composio:
+
+1. ``build_composio_credentials`` returns a ``google.oauth2.credentials.Credentials``
+   object with a working refresh handler (mocks the whole ``ComposioService``).
+2. ``ComposioService.get_access_token`` rejects masked / missing tokens with
+   actionable error messages (mocks only the Composio SDK boundary so the
+   real guard logic is exercised).
+
+The masking guard is the boundary handler that production tripped over when
+Composio's "Mask Connected Account Secrets" project setting was enabled.
+The corresponding fix landed in ``cea8618``; these tests lock that contract
+in place so any future weakening of the guard surfaces immediately.
 """
 
 from datetime import UTC, datetime
@@ -14,6 +23,11 @@
 pytestmark = pytest.mark.unit
 
 
+# ---------------------------------------------------------------------------
+# build_composio_credentials — high-level wrapper tests
+# ---------------------------------------------------------------------------
+
+
 @patch("app.services.composio_service.ComposioService")
 def test_returns_credentials_with_token_and_expiry(mock_composio_service):
     """build_composio_credentials returns a Credentials object with the Composio access token."""
@@ -54,3 +68,85 @@ def test_refresh_handler_fetches_fresh_token(mock_composio_service):
     assert new_token == "refreshed-token"
     assert new_expiry > datetime.now(UTC).replace(tzinfo=None)
     assert mock_service.get_access_token.call_count == 2
+
+
+# ---------------------------------------------------------------------------
+# ComposioService.get_access_token — boundary masking guard tests
+# ---------------------------------------------------------------------------
+
+
+def _service_with_account(account: object):
+    """Build a real ``ComposioService`` whose underlying Composio SDK is faked.
+
+    Only the SDK boundary is patched — the real ``get_access_token`` method
+    runs, so changes to the masking / missing-token guards surface here.
+    """
+    from app.services import composio_service as composio_service_module
+
+    with patch.object(composio_service_module, "Composio") as mock_composio_cls:
+        mock_client = MagicMock()
+        mock_client.connected_accounts.get.return_value = account
+        mock_composio_cls.return_value = mock_client
+
+        service = composio_service_module.ComposioService(api_key="unit-test-api-key")
+
+    # ``service.client`` already references ``mock_client`` even after the
+    # patch context exits because the constructor captured it during init.
+    return service
+
+
+@pytest.mark.parametrize("masked_token", ["x", "xxxxxxxx", "x" * 19])
+def test_get_access_token_raises_on_masked_token(masked_token):
+    """Tokens shorter than the 20-char unmask threshold must raise with the dashboard hint.
+
+    Composio masks ``state.val.access_token`` by default (project setting
+    "Mask Connected Account Secrets"). A masked token will always silently
+    fail downstream OAuth calls, so the guard surfaces it with the exact
+    text needed to fix the dashboard config.
+    """
+    fake_account = MagicMock()
+    fake_account.state.val.access_token = masked_token
+    service = _service_with_account(fake_account)
+
+    with pytest.raises(ValueError, match="Mask Connected Account Secrets"):
+        service.get_access_token("any-account-id")
+
+
+def test_get_access_token_raises_when_state_val_missing():
+    """No ``state.val`` on the connected account is a hard failure with an account-id hint."""
+    fake_account = MagicMock()
+    fake_account.state = None
+    service = _service_with_account(fake_account)
+
+    with pytest.raises(ValueError, match=r"No state\.val.*missing-state-account"):
+        service.get_access_token("missing-state-account")
+
+
+def test_get_access_token_raises_when_access_token_empty():
+    """``state.val`` present but ``access_token`` empty must fail before the masking check."""
+    fake_account = MagicMock()
+    fake_account.state.val.access_token = ""
+    service = _service_with_account(fake_account)
+
+    with pytest.raises(ValueError, match=r"No access_token.*missing-token-account"):
+        service.get_access_token("missing-token-account")
+
+
+def test_get_access_token_raises_when_access_token_none():
+    """``state.val.access_token = None`` must fail before the masking check."""
+    fake_account = MagicMock()
+    fake_account.state.val.access_token = None
+    service = _service_with_account(fake_account)
+
+    with pytest.raises(ValueError, match=r"No access_token.*none-token-account"):
+        service.get_access_token("none-token-account")
+
+
+def test_get_access_token_returns_unmasked_token():
+    """Happy path: a >=20-char access token is returned verbatim."""
+    fake_account = MagicMock()
+    unmasked = "u" * 32
+    fake_account.state.val.access_token = unmasked
+    service = _service_with_account(fake_account)
+
+    assert service.get_access_token("happy-account") == unmasked
diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
index ae55bc0896..07e3888361 100644
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
@@ -37,7 +37,12 @@ def _make_orm_doc(connector_doc, doc_id):
 async def test_index_calls_embed_and_chunk_via_to_thread(
     pipeline, make_connector_document, monkeypatch
 ):
-    """index() runs embed_texts and chunk_text via asyncio.to_thread, not blocking the loop."""
+    """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
+
+    Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
+    path, see issue #1334) is verified separately in
+    ``test_non_code_documents_use_hybrid_chunker``.
+    """
     to_thread_calls = []
     original_to_thread = asyncio.to_thread
 
@@ -51,11 +56,11 @@ async def tracking_to_thread(func, *args, **kwargs):
         "app.indexing_pipeline.indexing_pipeline_service.summarize_document",
         AsyncMock(return_value="Summary."),
     )
-    mock_chunk = MagicMock(return_value=["chunk1"])
-    mock_chunk.__name__ = "chunk_text"
+    mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
+    mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
     monkeypatch.setattr(
-        "app.indexing_pipeline.indexing_pipeline_service.chunk_text",
-        mock_chunk,
+        "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
+        mock_chunk_hybrid,
     )
     mock_embed = MagicMock(
         side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
@@ -65,6 +70,11 @@ async def tracking_to_thread(func, *args, **kwargs):
         "app.indexing_pipeline.indexing_pipeline_service.embed_texts",
         mock_embed,
     )
+    # Bypass set_committed_value, which requires a real ORM instance (not MagicMock).
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.attach_chunks_to_document",
+        MagicMock(),
+    )
 
     connector_doc = make_connector_document(
         document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
@@ -77,8 +87,62 @@ async def tracking_to_thread(func, *args, **kwargs):
 
     await pipeline.index(document, connector_doc, llm=MagicMock())
 
-    assert "chunk_text" in to_thread_calls
+    # Either chunker entry point satisfies the "chunking runs off the event
+    # loop" contract this test guards. Routing between the two is verified
+    # in test_non_code_documents_use_hybrid_chunker.
+    assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
     assert "embed_texts" in to_thread_calls
+    assert document.status == DocumentStatus.ready()
+
+
+async def test_non_code_documents_use_hybrid_chunker(
+    pipeline, make_connector_document, monkeypatch
+):
+    """Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
+
+    The hybrid chunker preserves Markdown table integrity by avoiding splits
+    mid-row. Only documents flagged with ``should_use_code_chunker=True``
+    should take the ``chunk_text`` path.
+    """
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.summarize_document",
+        AsyncMock(return_value="Summary."),
+    )
+    mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
+    mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
+        mock_chunk_hybrid,
+    )
+    mock_chunk_code = MagicMock(return_value=["chunk1"])
+    mock_chunk_code.__name__ = "chunk_text"
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.chunk_text",
+        mock_chunk_code,
+    )
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.embed_texts",
+        MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
+    )
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.attach_chunks_to_document",
+        MagicMock(),
+    )
+
+    connector_doc = make_connector_document(
+        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
+        unique_id="msg-1",
+        search_space_id=1,
+        should_use_code_chunker=False,
+    )
+    document = MagicMock(spec=Document)
+    document.id = 1
+    document.status = DocumentStatus.pending()
+
+    await pipeline.index(document, connector_doc, llm=MagicMock())
+
+    mock_chunk_hybrid.assert_called_once()
+    mock_chunk_code.assert_not_called()
 
 
 def _mock_session_factory(orm_docs_by_id):
diff --git a/surfsense_backend/tests/unit/middleware/test_dedup_hitl_tool_calls.py b/surfsense_backend/tests/unit/middleware/test_dedup_hitl_tool_calls.py
index add0105e40..467ba6d5f5 100644
--- a/surfsense_backend/tests/unit/middleware/test_dedup_hitl_tool_calls.py
+++ b/surfsense_backend/tests/unit/middleware/test_dedup_hitl_tool_calls.py
@@ -1,8 +1,10 @@
 import pytest
 from langchain_core.messages import AIMessage
+from langchain_core.tools import StructuredTool
 
 from app.agents.new_chat.middleware.dedup_tool_calls import (
     DedupHITLToolCallsMiddleware,
+    wrap_dedup_key_by_arg_name,
 )
 
 pytestmark = pytest.mark.unit
@@ -14,9 +16,34 @@ def _make_state(tool_calls: list[dict]) -> dict:
     return {"messages": [msg]}
 
 
+def _hitl_tool(name: str, *, dedup_arg: str) -> StructuredTool:
+    """Build a tool with declarative ``dedup_key`` metadata.
+
+    Mirrors the ``ToolDefinition.dedup_key`` -> ``tool.metadata["dedup_key"]``
+    propagation done by :func:`build_tools` after the cleanup tier.
+    """
+
+    def _fn(**kwargs):
+        return "ok"
+
+    return StructuredTool.from_function(
+        func=_fn,
+        name=name,
+        description="x",
+        metadata={"dedup_key": wrap_dedup_key_by_arg_name(dedup_arg)},
+    )
+
+
 def test_duplicate_hitl_calls_reduced_to_first():
-    """When the LLM emits the same HITL tool call twice, only the first is kept."""
-    mw = DedupHITLToolCallsMiddleware()
+    """When the LLM emits the same HITL tool call twice, only the first is kept.
+
+    After the cleanup tier removed ``_NATIVE_HITL_TOOL_DEDUP_KEYS``, the
+    resolver is sourced from ``ToolDefinition.dedup_key`` propagated onto
+    ``tool.metadata`` — which the registry does at agent build time. The
+    test mirrors that wiring with an in-memory tool.
+    """
+    tool = _hitl_tool("delete_calendar_event", dedup_arg="event_title_or_id")
+    mw = DedupHITLToolCallsMiddleware(agent_tools=[tool])
 
     state = _make_state(
         [
diff --git a/surfsense_backend/tests/unit/middleware/test_file_intent_middleware.py b/surfsense_backend/tests/unit/middleware/test_file_intent_middleware.py
new file mode 100644
index 0000000000..7fd3fe4a7f
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_file_intent_middleware.py
@@ -0,0 +1,214 @@
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage
+
+from app.agents.new_chat.middleware.file_intent import (
+    FileIntentMiddleware,
+    FileOperationIntent,
+    _fallback_path,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class _FakeLLM:
+    def __init__(self, response_text: str):
+        self._response_text = response_text
+
+    async def ainvoke(self, *_args, **_kwargs):
+        return AIMessage(content=self._response_text)
+
+
+@pytest.mark.asyncio
+async def test_file_write_intent_injects_contract_message():
+    llm = _FakeLLM(
+        '{"intent":"file_write","confidence":0.93,"suggested_filename":"ideas.md"}'
+    )
+    middleware = FileIntentMiddleware(llm=llm)
+    state = {
+        "messages": [HumanMessage(content="Create another random note for me")],
+        "turn_id": "123:456",
+    }
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    contract = result["file_operation_contract"]
+    assert contract["intent"] == FileOperationIntent.FILE_WRITE.value
+    assert contract["suggested_path"] == "/ideas.md"
+    assert contract["turn_id"] == "123:456"
+    assert any(
+        "file_operation_contract" in str(msg.content)
+        for msg in result["messages"]
+        if hasattr(msg, "content")
+    )
+
+
+@pytest.mark.asyncio
+async def test_non_write_intent_does_not_inject_contract_message():
+    llm = _FakeLLM('{"intent":"file_read","confidence":0.88,"suggested_filename":null}')
+    middleware = FileIntentMiddleware(llm=llm)
+    original_messages = [HumanMessage(content="Read /notes.md")]
+    state = {"messages": original_messages, "turn_id": "abc:def"}
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    assert (
+        result["file_operation_contract"]["intent"]
+        == FileOperationIntent.FILE_READ.value
+    )
+    assert "messages" not in result
+
+
+@pytest.mark.asyncio
+async def test_file_write_null_filename_uses_semantic_default_path():
+    llm = _FakeLLM(
+        '{"intent":"file_write","confidence":0.74,"suggested_filename":null}'
+    )
+    middleware = FileIntentMiddleware(llm=llm)
+    state = {
+        "messages": [HumanMessage(content="create a random markdown file")],
+        "turn_id": "turn:1",
+    }
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    contract = result["file_operation_contract"]
+    assert contract["intent"] == FileOperationIntent.FILE_WRITE.value
+    assert contract["suggested_path"] == "/notes.md"
+
+
+@pytest.mark.asyncio
+async def test_file_write_null_filename_defaults_to_markdown_path():
+    llm = _FakeLLM(
+        '{"intent":"file_write","confidence":0.71,"suggested_filename":null}'
+    )
+    middleware = FileIntentMiddleware(llm=llm)
+    state = {
+        "messages": [HumanMessage(content="create a sample json config file")],
+        "turn_id": "turn:2",
+    }
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    contract = result["file_operation_contract"]
+    assert contract["intent"] == FileOperationIntent.FILE_WRITE.value
+    assert contract["suggested_path"] == "/notes.md"
+
+
+@pytest.mark.asyncio
+async def test_file_write_txt_suggestion_is_normalized_to_markdown():
+    llm = _FakeLLM(
+        '{"intent":"file_write","confidence":0.82,"suggested_filename":"random.txt"}'
+    )
+    middleware = FileIntentMiddleware(llm=llm)
+    state = {
+        "messages": [HumanMessage(content="create a random file")],
+        "turn_id": "turn:3",
+    }
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    contract = result["file_operation_contract"]
+    assert contract["intent"] == FileOperationIntent.FILE_WRITE.value
+    assert contract["suggested_path"] == "/random.md"
+
+
+@pytest.mark.asyncio
+async def test_file_write_with_suggested_directory_preserves_folder():
+    llm = _FakeLLM(
+        '{"intent":"file_write","confidence":0.86,"suggested_filename":"random.md","suggested_directory":"pc backups","suggested_path":null}'
+    )
+    middleware = FileIntentMiddleware(llm=llm)
+    state = {
+        "messages": [HumanMessage(content="create a random file in pc backups folder")],
+        "turn_id": "turn:4",
+    }
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    contract = result["file_operation_contract"]
+    assert contract["intent"] == FileOperationIntent.FILE_WRITE.value
+    assert contract["suggested_path"] == "/pc_backups/random.md"
+
+
+@pytest.mark.asyncio
+async def test_file_write_with_suggested_path_takes_precedence():
+    llm = _FakeLLM(
+        '{"intent":"file_write","confidence":0.9,"suggested_filename":"ignored.md","suggested_directory":"docs","suggested_path":"/reports/q2/summary.md"}'
+    )
+    middleware = FileIntentMiddleware(llm=llm)
+    state = {
+        "messages": [HumanMessage(content="create report")],
+        "turn_id": "turn:5",
+    }
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    contract = result["file_operation_contract"]
+    assert contract["intent"] == FileOperationIntent.FILE_WRITE.value
+    assert contract["suggested_path"] == "/reports/q2/summary.md"
+
+
+@pytest.mark.asyncio
+async def test_file_write_infers_directory_from_user_text_when_missing():
+    llm = _FakeLLM(
+        '{"intent":"file_write","confidence":0.83,"suggested_filename":"random.md","suggested_directory":null,"suggested_path":null}'
+    )
+    middleware = FileIntentMiddleware(llm=llm)
+    state = {
+        "messages": [HumanMessage(content="create a random file in pc backups folder")],
+        "turn_id": "turn:6",
+    }
+
+    result = await middleware.abefore_agent(state, runtime=None)  # type: ignore[arg-type]
+
+    assert result is not None
+    contract = result["file_operation_contract"]
+    assert contract["intent"] == FileOperationIntent.FILE_WRITE.value
+    assert contract["suggested_path"] == "/pc_backups/random.md"
+
+
+def test_fallback_path_normalizes_windows_slashes() -> None:
+    resolved = _fallback_path(
+        suggested_filename="summary.md",
+        suggested_path=r"\reports\q2\summary.md",
+        user_text="create report",
+    )
+
+    assert resolved == "/reports/q2/summary.md"
+
+
+def test_fallback_path_normalizes_windows_drive_path() -> None:
+    resolved = _fallback_path(
+        suggested_filename=None,
+        suggested_path=r"C:\Users\anish\notes\todo.md",
+        user_text="create note",
+    )
+
+    assert resolved == "/C/Users/anish/notes/todo.md"
+
+
+def test_fallback_path_normalizes_mixed_separators_and_duplicate_slashes() -> None:
+    resolved = _fallback_path(
+        suggested_filename="summary.md",
+        suggested_path=r"\\reports\\q2//summary.md",
+        user_text="create report",
+    )
+
+    assert resolved == "/reports/q2/summary.md"
+
+
+def test_fallback_path_keeps_posix_style_absolute_path_for_linux_and_macos() -> None:
+    resolved = _fallback_path(
+        suggested_filename=None,
+        suggested_path="/var/log/surfsense/notes.md",
+        user_text="create note",
+    )
+
+    assert resolved == "/var/log/surfsense/notes.md"
diff --git a/surfsense_backend/tests/unit/middleware/test_filesystem_backends.py b/surfsense_backend/tests/unit/middleware/test_filesystem_backends.py
new file mode 100644
index 0000000000..c71b5efde9
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_filesystem_backends.py
@@ -0,0 +1,68 @@
+from pathlib import Path
+
+import pytest
+
+from app.agents.new_chat.filesystem_backends import build_backend_resolver
+from app.agents.new_chat.filesystem_selection import (
+    ClientPlatform,
+    FilesystemMode,
+    FilesystemSelection,
+    LocalFilesystemMount,
+)
+from app.agents.new_chat.middleware.multi_root_local_folder_backend import (
+    MultiRootLocalFolderBackend,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class _RuntimeStub:
+    state = {"files": {}}
+
+
+def test_backend_resolver_returns_multi_root_backend_for_single_root(tmp_path: Path):
+    selection = FilesystemSelection(
+        mode=FilesystemMode.DESKTOP_LOCAL_FOLDER,
+        client_platform=ClientPlatform.DESKTOP,
+        local_mounts=(LocalFilesystemMount(mount_id="tmp", root_path=str(tmp_path)),),
+    )
+    resolver = build_backend_resolver(selection)
+
+    backend = resolver(_RuntimeStub())
+    assert isinstance(backend, MultiRootLocalFolderBackend)
+    assert backend.list_mounts() == ("tmp",)
+
+
+def test_backend_resolver_uses_cloud_mode_by_default():
+    resolver = build_backend_resolver(FilesystemSelection())
+    backend = resolver(_RuntimeStub())
+    # When no search_space_id is provided we fall back to StateBackend so
+    # sub-agents / tests without DB access still work.
+    assert backend.__class__.__name__ == "StateBackend"
+
+
+def test_backend_resolver_uses_kb_postgres_in_cloud_with_search_space():
+    resolver = build_backend_resolver(FilesystemSelection(), search_space_id=42)
+    backend = resolver(_RuntimeStub())
+    assert backend.__class__.__name__ == "KBPostgresBackend"
+    assert backend.search_space_id == 42
+
+
+def test_backend_resolver_returns_multi_root_backend_for_multiple_roots(tmp_path: Path):
+    root_one = tmp_path / "resume"
+    root_two = tmp_path / "notes"
+    root_one.mkdir()
+    root_two.mkdir()
+    selection = FilesystemSelection(
+        mode=FilesystemMode.DESKTOP_LOCAL_FOLDER,
+        client_platform=ClientPlatform.DESKTOP,
+        local_mounts=(
+            LocalFilesystemMount(mount_id="resume", root_path=str(root_one)),
+            LocalFilesystemMount(mount_id="notes", root_path=str(root_two)),
+        ),
+    )
+    resolver = build_backend_resolver(selection)
+
+    backend = resolver(_RuntimeStub())
+    assert isinstance(backend, MultiRootLocalFolderBackend)
+    assert backend.list_mounts() == ("resume", "notes")
diff --git a/surfsense_backend/tests/unit/middleware/test_filesystem_middleware.py b/surfsense_backend/tests/unit/middleware/test_filesystem_middleware.py
new file mode 100644
index 0000000000..70430f4ca3
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_filesystem_middleware.py
@@ -0,0 +1,220 @@
+"""Unit tests for the SurfSense filesystem middleware new behaviors.
+
+Covers:
+* cloud cwd defaults to ``/documents`` and relative paths resolve under it
+* cloud writes outside ``/documents/`` are rejected unless basename starts
+  with ``temp_``
+* cloud writes/edits to the anonymous document are rejected (read-only)
+* helper methods on the middleware (``_resolve_relative``,
+  ``_check_cloud_write_namespace``, ``_default_cwd``)
+
+These tests use ``__new__`` to bypass the heavy ``__init__`` and exercise
+the helper methods directly so the test surface stays narrow and fast.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware.filesystem import (
+    SurfSenseFilesystemMiddleware,
+    _build_filesystem_system_prompt,
+    _build_tool_descriptions,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _make_middleware(mode: FilesystemMode = FilesystemMode.CLOUD):
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._filesystem_mode = mode
+    return middleware
+
+
+def _runtime(state: dict | None = None) -> SimpleNamespace:
+    return SimpleNamespace(state=state or {})
+
+
+class TestCloudCwdDefaults:
+    def test_default_cwd_in_cloud_is_documents_root(self):
+        m = _make_middleware()
+        assert m._default_cwd() == "/documents"
+
+    def test_default_cwd_in_desktop_is_root(self):
+        m = _make_middleware(FilesystemMode.DESKTOP_LOCAL_FOLDER)
+        assert m._default_cwd() == "/"
+
+    def test_current_cwd_uses_state_when_set(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "/documents/notes"})
+        assert m._current_cwd(runtime) == "/documents/notes"
+
+    def test_current_cwd_falls_back_to_default(self):
+        m = _make_middleware()
+        runtime = _runtime({})
+        assert m._current_cwd(runtime) == "/documents"
+
+    def test_current_cwd_ignores_invalid(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "not-absolute"})
+        assert m._current_cwd(runtime) == "/documents"
+
+
+class TestRelativePathResolution:
+    def test_relative_path_resolves_against_cwd(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "/documents/projects"})
+        assert (
+            m._resolve_relative("notes.md", runtime) == "/documents/projects/notes.md"
+        )
+
+    def test_relative_path_with_dotdot(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "/documents/a/b"})
+        assert m._resolve_relative("../c.md", runtime) == "/documents/a/c.md"
+
+    def test_absolute_path_is_kept(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "/documents"})
+        assert m._resolve_relative("/other/x.md", runtime) == "/other/x.md"
+
+    def test_empty_path_returns_cwd(self):
+        m = _make_middleware()
+        runtime = _runtime({"cwd": "/documents/projects"})
+        assert m._resolve_relative("", runtime) == "/documents/projects"
+
+
+class TestCloudWriteNamespacePolicy:
+    def test_documents_path_allowed(self):
+        m = _make_middleware()
+        runtime = _runtime()
+        assert m._check_cloud_write_namespace("/documents/foo.md", runtime) is None
+
+    def test_documents_root_allowed(self):
+        m = _make_middleware()
+        runtime = _runtime()
+        assert m._check_cloud_write_namespace("/documents", runtime) is None
+
+    def test_temp_basename_anywhere_allowed(self):
+        m = _make_middleware()
+        runtime = _runtime()
+        assert m._check_cloud_write_namespace("/temp_scratch.md", runtime) is None
+        assert m._check_cloud_write_namespace("/foo/temp_x.md", runtime) is None
+        assert m._check_cloud_write_namespace("/documents/temp_x.md", runtime) is None
+
+    def test_other_paths_rejected(self):
+        m = _make_middleware()
+        runtime = _runtime()
+        err = m._check_cloud_write_namespace("/foo/bar.md", runtime)
+        assert err is not None
+        assert "must target /documents" in err
+
+    def test_anon_doc_path_is_read_only(self):
+        m = _make_middleware()
+        runtime = _runtime(
+            {
+                "kb_anon_doc": {
+                    "path": "/documents/uploaded.xml",
+                    "title": "uploaded",
+                    "content": "",
+                    "chunks": [],
+                }
+            }
+        )
+        err = m._check_cloud_write_namespace("/documents/uploaded.xml", runtime)
+        assert err is not None
+        assert "read-only" in err
+
+    def test_desktop_mode_skips_namespace_policy(self):
+        m = _make_middleware(FilesystemMode.DESKTOP_LOCAL_FOLDER)
+        runtime = _runtime()
+        assert m._check_cloud_write_namespace("/random/path.md", runtime) is None
+
+
+class TestModeSpecificPrompts:
+    """The prompt and tool descriptions must only describe the active mode.
+
+    Cross-mode noise wastes tokens and confuses the model with rules it
+    cannot use this session.
+    """
+
+    def test_cloud_prompt_omits_desktop_section(self):
+        prompt = _build_filesystem_system_prompt(
+            FilesystemMode.CLOUD, sandbox_available=False
+        )
+        assert "Local Folder Mode" not in prompt
+        assert "mount-prefixed" not in prompt
+        assert "Persistence Rules" in prompt
+        assert "/documents" in prompt
+        assert "temp_" in prompt
+
+    def test_desktop_prompt_omits_cloud_persistence_rules(self):
+        prompt = _build_filesystem_system_prompt(
+            FilesystemMode.DESKTOP_LOCAL_FOLDER, sandbox_available=False
+        )
+        assert "Persistence Rules" not in prompt
+        assert "Workspace Tree" not in prompt
+        assert "<priority_documents>" not in prompt
+        assert "Local Folder Mode" in prompt
+        assert "mount-prefixed" in prompt
+
+    def test_cloud_tool_descs_omit_desktop_phrases(self):
+        descs = _build_tool_descriptions(FilesystemMode.CLOUD)
+        for name in (
+            "write_file",
+            "edit_file",
+            "move_file",
+            "mkdir",
+            "rm",
+            "rmdir",
+            "list_tree",
+            "grep",
+        ):
+            text = descs[name]
+            assert "Desktop" not in text, f"{name} leaks desktop hints"
+            assert "Cloud mode:" not in text, f"{name} qualifies a cloud-only desc"
+
+    def test_desktop_tool_descs_omit_cloud_phrases(self):
+        descs = _build_tool_descriptions(FilesystemMode.DESKTOP_LOCAL_FOLDER)
+        for name in (
+            "write_file",
+            "edit_file",
+            "move_file",
+            "mkdir",
+            "rm",
+            "rmdir",
+            "list_tree",
+            "grep",
+        ):
+            text = descs[name]
+            assert "Cloud" not in text, f"{name} leaks cloud hints"
+            assert "/documents/" not in text, f"{name} mentions cloud namespace"
+            assert "temp_" not in text, f"{name} mentions cloud temp_ semantics"
+
+    def test_cloud_descs_include_rm_and_rmdir(self):
+        descs = _build_tool_descriptions(FilesystemMode.CLOUD)
+        assert "rm" in descs and "rmdir" in descs
+        assert "Deletes a single file" in descs["rm"]
+        assert "Deletes an empty directory" in descs["rmdir"]
+        assert "rmdir" in descs["rmdir"] and "POSIX" in descs["rmdir"]
+
+    def test_desktop_descs_warn_about_irreversibility(self):
+        descs = _build_tool_descriptions(FilesystemMode.DESKTOP_LOCAL_FOLDER)
+        assert "NOT reversible" in descs["rm"]
+        assert "NOT reversible" in descs["rmdir"]
+
+    def test_sandbox_addendum_appended_when_available(self):
+        prompt = _build_filesystem_system_prompt(
+            FilesystemMode.CLOUD, sandbox_available=True
+        )
+        assert "execute_code" in prompt
+        assert "Code Execution" in prompt
+
+    def test_sandbox_addendum_absent_when_unavailable(self):
+        prompt = _build_filesystem_system_prompt(
+            FilesystemMode.CLOUD, sandbox_available=False
+        )
+        assert "execute_code" not in prompt
diff --git a/surfsense_backend/tests/unit/middleware/test_filesystem_verification.py b/surfsense_backend/tests/unit/middleware/test_filesystem_verification.py
new file mode 100644
index 0000000000..81cf590d30
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_filesystem_verification.py
@@ -0,0 +1,173 @@
+from pathlib import Path
+
+import pytest
+
+from app.agents.new_chat.filesystem_selection import FilesystemMode
+from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware
+from app.agents.new_chat.middleware.multi_root_local_folder_backend import (
+    MultiRootLocalFolderBackend,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class _RuntimeNoSuggestedPath:
+    state = {"file_operation_contract": {}}
+
+
+class _RuntimeWithSuggestedPath:
+    def __init__(self, suggested_path: str) -> None:
+        self.state = {"file_operation_contract": {"suggested_path": suggested_path}}
+
+
+def test_contract_suggested_path_falls_back_to_documents_notes_md() -> None:
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._filesystem_mode = FilesystemMode.CLOUD
+    suggested = middleware._get_contract_suggested_path(_RuntimeNoSuggestedPath())  # type: ignore[arg-type]
+    # Cloud default cwd is /documents so the fallback lands in the KB.
+    assert suggested == "/documents/notes.md"
+
+
+def test_contract_suggested_path_falls_back_to_root_notes_md_in_desktop() -> None:
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._filesystem_mode = FilesystemMode.DESKTOP_LOCAL_FOLDER
+    suggested = middleware._get_contract_suggested_path(_RuntimeNoSuggestedPath())  # type: ignore[arg-type]
+    assert suggested == "/notes.md"
+
+
+def test_normalize_local_mount_path_prefixes_default_mount(tmp_path: Path) -> None:
+    root = tmp_path / "PC Backups"
+    root.mkdir()
+    backend = MultiRootLocalFolderBackend((("pc_backups", str(root)),))
+    runtime = _RuntimeNoSuggestedPath()
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path("/random-note.md", runtime)  # type: ignore[arg-type]
+
+    assert resolved == "/pc_backups/random-note.md"
+
+
+def test_normalize_local_mount_path_keeps_explicit_mount(tmp_path: Path) -> None:
+    root = tmp_path / "PC Backups"
+    root.mkdir()
+    backend = MultiRootLocalFolderBackend((("pc_backups", str(root)),))
+    runtime = _RuntimeNoSuggestedPath()
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path(  # type: ignore[arg-type]
+        "/pc_backups/notes/random-note.md",
+        runtime,
+    )
+
+    assert resolved == "/pc_backups/notes/random-note.md"
+
+
+def test_normalize_local_mount_path_windows_backslashes(tmp_path: Path) -> None:
+    root = tmp_path / "PC Backups"
+    root.mkdir()
+    backend = MultiRootLocalFolderBackend((("pc_backups", str(root)),))
+    runtime = _RuntimeNoSuggestedPath()
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path(  # type: ignore[arg-type]
+        r"\notes\random-note.md",
+        runtime,
+    )
+
+    assert resolved == "/pc_backups/notes/random-note.md"
+
+
+def test_normalize_local_mount_path_normalizes_mixed_separators(tmp_path: Path) -> None:
+    root = tmp_path / "PC Backups"
+    root.mkdir()
+    backend = MultiRootLocalFolderBackend((("pc_backups", str(root)),))
+    runtime = _RuntimeNoSuggestedPath()
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path(  # type: ignore[arg-type]
+        r"\\notes//nested\\random-note.md",
+        runtime,
+    )
+
+    assert resolved == "/pc_backups/notes/nested/random-note.md"
+
+
+def test_normalize_local_mount_path_keeps_explicit_mount_with_backslashes(
+    tmp_path: Path,
+) -> None:
+    root = tmp_path / "PC Backups"
+    root.mkdir()
+    backend = MultiRootLocalFolderBackend((("pc_backups", str(root)),))
+    runtime = _RuntimeNoSuggestedPath()
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path(  # type: ignore[arg-type]
+        r"\pc_backups\notes\random-note.md",
+        runtime,
+    )
+
+    assert resolved == "/pc_backups/notes/random-note.md"
+
+
+def test_normalize_local_mount_path_prefixes_posix_absolute_path_for_linux_and_macos(
+    tmp_path: Path,
+) -> None:
+    root = tmp_path / "PC Backups"
+    root.mkdir()
+    backend = MultiRootLocalFolderBackend((("pc_backups", str(root)),))
+    runtime = _RuntimeNoSuggestedPath()
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path("/var/log/app.log", runtime)  # type: ignore[arg-type]
+
+    assert resolved == "/pc_backups/var/log/app.log"
+
+
+def test_normalize_local_mount_path_prefers_unique_existing_parent_mount(
+    tmp_path: Path,
+) -> None:
+    root_a = tmp_path / "RootA"
+    root_b = tmp_path / "RootB"
+    (root_a / "other").mkdir(parents=True)
+    (root_b / "nested" / "deep").mkdir(parents=True)
+    backend = MultiRootLocalFolderBackend(
+        (("root_a", str(root_a)), ("root_b", str(root_b)))
+    )
+    runtime = _RuntimeNoSuggestedPath()
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path(  # type: ignore[arg-type]
+        "/nested/deep/new-note.md",
+        runtime,
+    )
+
+    assert resolved == "/root_b/nested/deep/new-note.md"
+
+
+def test_normalize_local_mount_path_uses_suggested_mount_when_ambiguous(
+    tmp_path: Path,
+) -> None:
+    root_a = tmp_path / "RootA"
+    root_b = tmp_path / "RootB"
+    root_a.mkdir(parents=True)
+    root_b.mkdir(parents=True)
+    backend = MultiRootLocalFolderBackend(
+        (("root_a", str(root_a)), ("root_b", str(root_b)))
+    )
+    runtime = _RuntimeWithSuggestedPath("/root_b/notes/context.md")
+    middleware = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
+    middleware._get_backend = lambda _runtime: backend  # type: ignore[method-assign]
+
+    resolved = middleware._normalize_local_mount_path(  # type: ignore[arg-type]
+        "/brand-new-note.md",
+        runtime,
+    )
+
+    assert resolved == "/root_b/brand-new-note.md"
diff --git a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
new file mode 100644
index 0000000000..ef95434bf4
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
@@ -0,0 +1,168 @@
+"""Unit tests for kb_persistence filesystem-parity invariants.
+
+Specifically, these tests pin down that the agent-driven write_file flow
+treats path uniqueness — not content uniqueness — as the only hard
+invariant. This mirrors a real filesystem: ``cp a b`` produces two files
+with identical bytes living at different paths, and that should round-trip
+through :class:`KnowledgeBasePersistenceMiddleware` without losing the copy.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import AsyncMock
+
+import numpy as np
+import pytest
+
+from app.agents.new_chat.middleware import kb_persistence
+from app.db import Document
+
+
+class _FakeResult:
+    """Minimal stand-in for ``sqlalchemy.engine.Result``."""
+
+    def __init__(self, value: Any = None) -> None:
+        self._value = value
+
+    def scalar_one_or_none(self) -> Any:
+        return self._value
+
+    def scalar(self) -> Any:
+        return self._value
+
+
+class _FakeSession:
+    """Minimal AsyncSession stand-in scoped to ``_create_document`` needs.
+
+    Records every ``add`` so we can assert against the resulting Documents
+    and Chunks. ``execute`` always returns "no row" by default — i.e. no
+    folder hierarchy preexists and no path collision exists. Tests that
+    want a path collision can override that on a per-call basis.
+    """
+
+    def __init__(self) -> None:
+        self.added: list[Any] = []
+        self.execute = AsyncMock(return_value=_FakeResult(None))
+        self.flush = AsyncMock()
+
+        # Simulate ``await session.flush()`` assigning an id to the doc;
+        # we increment a counter so each Document gets a unique id.
+        self._next_id = 1
+
+        async def _flush_assigning_ids() -> None:
+            for obj in self.added:
+                if getattr(obj, "id", None) is None:
+                    obj.id = self._next_id
+                    self._next_id += 1
+
+        self.flush.side_effect = _flush_assigning_ids
+
+    def add(self, obj: Any) -> None:
+        self.added.append(obj)
+
+    def add_all(self, objs: list[Any]) -> None:
+        self.added.extend(objs)
+
+
+@pytest.fixture(autouse=True)
+def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Avoid loading the embedding model in unit tests."""
+    monkeypatch.setattr(
+        kb_persistence,
+        "embed_texts",
+        lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
+    )
+    monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])
+
+
+@pytest.mark.asyncio
+async def test_create_document_allows_identical_content_at_different_paths() -> None:
+    """The core regression: ``cp /a/notes.md /b/notes-copy.md``.
+
+    Both create calls must succeed even though the bytes are byte-for-byte
+    identical, because path is the only filesystem-style unique key.
+    """
+    session = _FakeSession()
+    content = "# Same body\n\nIdentical content used by two different paths.\n"
+
+    first = await kb_persistence._create_document(
+        session,  # type: ignore[arg-type]
+        virtual_path="/documents/a/notes.md",
+        content=content,
+        search_space_id=42,
+        created_by_id="user-1",
+    )
+    assert isinstance(first, Document)
+    assert first.title == "notes.md"
+
+    # Second create with byte-identical content at a different path should
+    # not raise — that's the whole point of the filesystem-parity fix.
+    second = await kb_persistence._create_document(
+        session,  # type: ignore[arg-type]
+        virtual_path="/documents/b/notes-copy.md",
+        content=content,
+        search_space_id=42,
+        created_by_id="user-1",
+    )
+    assert isinstance(second, Document)
+    assert second.title == "notes-copy.md"
+
+    # Both rows share the same content_hash but live at distinct paths
+    # (distinct ``unique_identifier_hash``). That's the desired contract.
+    assert first.content_hash == second.content_hash
+    assert first.unique_identifier_hash != second.unique_identifier_hash
+
+
+@pytest.mark.asyncio
+async def test_create_document_still_rejects_path_collision() -> None:
+    """Path uniqueness remains the hard invariant.
+
+    If ``unique_identifier_hash`` already points at an existing row in
+    the same search space, the create call must raise ``ValueError``
+    with a clear message — matching the behavior the commit loop relies
+    on to upsert via the existing-row code path.
+    """
+    session = _FakeSession()
+
+    # Path with no folder parts so ``_ensure_folder_hierarchy`` is a
+    # no-op and the only SELECT executed is the path-collision check.
+    # That SELECT returns an existing doc id, triggering the guard.
+    session.execute = AsyncMock(return_value=_FakeResult(value=99))
+
+    with pytest.raises(ValueError, match="already exists at path"):
+        await kb_persistence._create_document(
+            session,  # type: ignore[arg-type]
+            virtual_path="/documents/notes.md",
+            content="anything",
+            search_space_id=42,
+            created_by_id="user-1",
+        )
+
+
+@pytest.mark.asyncio
+async def test_create_document_does_not_query_for_content_hash_collision(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Regression guard: the legacy second SELECT (content_hash collision
+    pre-check) must be gone. Counting ``execute`` calls is a brittle but
+    effective way to lock that in.
+
+    The current flow runs exactly one ``execute`` for the path-collision
+    SELECT (no folder parts in this path → ``_ensure_folder_hierarchy``
+    short-circuits). If a future refactor reintroduces a content-hash
+    SELECT, this test will fail loud.
+    """
+    session = _FakeSession()
+    await kb_persistence._create_document(
+        session,  # type: ignore[arg-type]
+        virtual_path="/documents/notes.md",
+        content="hello",
+        search_space_id=42,
+        created_by_id="user-1",
+    )
+    # Path-collision SELECT only. No content_hash SELECT.
+    assert session.execute.await_count == 1, (
+        f"Unexpected execute count {session.execute.await_count}; "
+        "did the legacy content_hash collision pre-check get re-added?"
+    )
diff --git a/surfsense_backend/tests/unit/middleware/test_kb_persistence_revisions.py b/surfsense_backend/tests/unit/middleware/test_kb_persistence_revisions.py
new file mode 100644
index 0000000000..feca23d274
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_kb_persistence_revisions.py
@@ -0,0 +1,309 @@
+"""Unit tests for the kb_persistence snapshot helpers.
+
+The full ``commit_staged_filesystem_state`` body exercises a real session
+in integration tests; here we verify the building blocks used by the
+snapshot/revert pipeline:
+
+* ``_find_action_ids_batch`` issues a SINGLE query for N tool_call_ids
+  (regression guard against the N+1 lookup pattern).
+* ``_mark_action_reversible`` is a no-op when ``action_id`` is ``None``.
+* ``_doc_revision_payload`` and ``_load_chunks_for_snapshot`` produce the
+  shape the snapshot helpers consume.
+
+These tests use ``MagicMock`` / ``AsyncMock`` against a fake session so
+the assertions run in milliseconds and don't require Postgres.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.agents.new_chat.middleware import kb_persistence
+
+pytestmark = pytest.mark.unit
+
+
+class _FakeResult:
+    def __init__(self, rows: list[Any] | None = None, scalar: Any = None) -> None:
+        self._rows = rows or []
+        self._scalar = scalar
+
+    def all(self) -> list[Any]:
+        return list(self._rows)
+
+    def scalar_one_or_none(self) -> Any:
+        return self._scalar
+
+
+class _FakeSession:
+    def __init__(self) -> None:
+        self.execute = AsyncMock()
+
+
+@pytest.mark.asyncio
+async def test_find_action_ids_batch_issues_single_query() -> None:
+    """The lookup MUST be a single ``IN (...)`` SELECT, not N selects."""
+    session = _FakeSession()
+    session.execute.return_value = _FakeResult(
+        rows=[
+            MagicMock(id=11, tool_call_id="tc-a"),
+            MagicMock(id=22, tool_call_id="tc-b"),
+            MagicMock(id=33, tool_call_id="tc-c"),
+        ]
+    )
+
+    mapping = await kb_persistence._find_action_ids_batch(
+        session,  # type: ignore[arg-type]
+        thread_id=1,
+        tool_call_ids={"tc-a", "tc-b", "tc-c"},
+    )
+
+    assert mapping == {"tc-a": 11, "tc-b": 22, "tc-c": 33}
+    assert session.execute.await_count == 1, (
+        "Snapshot binding must batch into ONE query; got "
+        f"{session.execute.await_count} (regression: N+1 lookup pattern)."
+    )
+
+
+@pytest.mark.asyncio
+async def test_find_action_ids_batch_short_circuits_when_thread_id_missing() -> None:
+    session = _FakeSession()
+    mapping = await kb_persistence._find_action_ids_batch(
+        session,  # type: ignore[arg-type]
+        thread_id=None,
+        tool_call_ids={"tc-a"},
+    )
+    assert mapping == {}
+    assert session.execute.await_count == 0
+
+
+@pytest.mark.asyncio
+async def test_find_action_ids_batch_short_circuits_when_no_calls() -> None:
+    session = _FakeSession()
+    mapping = await kb_persistence._find_action_ids_batch(
+        session,  # type: ignore[arg-type]
+        thread_id=42,
+        tool_call_ids=set(),
+    )
+    assert mapping == {}
+    assert session.execute.await_count == 0
+
+
+@pytest.mark.asyncio
+async def test_mark_action_reversible_is_noop_for_null_id() -> None:
+    session = _FakeSession()
+    await kb_persistence._mark_action_reversible(session, action_id=None)  # type: ignore[arg-type]
+    assert session.execute.await_count == 0
+
+
+@pytest.mark.asyncio
+async def test_mark_action_reversible_runs_update_for_real_id() -> None:
+    session = _FakeSession()
+    await kb_persistence._mark_action_reversible(session, action_id=99)  # type: ignore[arg-type]
+    assert session.execute.await_count == 1
+
+
+def test_doc_revision_payload_captures_metadata_virtual_path() -> None:
+    """Snapshot helpers must capture ``metadata_before`` for revert reuse."""
+    doc = MagicMock()
+    doc.content = "body"
+    doc.title = "notes.md"
+    doc.folder_id = 7
+    doc.document_metadata = {"virtual_path": "/documents/team/notes.md"}
+
+    payload = kb_persistence._doc_revision_payload(
+        doc, chunks_before=[{"content": "x"}]
+    )
+
+    assert payload["title_before"] == "notes.md"
+    assert payload["folder_id_before"] == 7
+    assert payload["content_before"] == "body"
+    assert payload["chunks_before"] == [{"content": "x"}]
+    assert payload["metadata_before"] == {"virtual_path": "/documents/team/notes.md"}
+
+
+def test_doc_revision_payload_handles_missing_metadata() -> None:
+    doc = MagicMock()
+    doc.content = ""
+    doc.title = ""
+    doc.folder_id = None
+    doc.document_metadata = None
+    payload = kb_persistence._doc_revision_payload(doc)
+    assert payload["metadata_before"] is None
+
+
+@pytest.mark.asyncio
+async def test_load_chunks_for_snapshot_returns_content_only() -> None:
+    """Snapshot chunks intentionally omit embeddings (regenerated on revert)."""
+    session = _FakeSession()
+    session.execute.return_value = _FakeResult(
+        rows=[
+            MagicMock(content="alpha"),
+            MagicMock(content="beta"),
+        ]
+    )
+    chunks = await kb_persistence._load_chunks_for_snapshot(
+        session,
+        doc_id=42,  # type: ignore[arg-type]
+    )
+    assert chunks == [{"content": "alpha"}, {"content": "beta"}]
+
+
+# ---------------------------------------------------------------------------
+# Deferred reversibility-flip dispatches.
+#
+# The snapshot helpers used to dispatch ``action_log_updated`` directly
+# from inside the SAVEPOINT block. That meant the SSE side-channel
+# could tell the UI a row was reversible while the OUTER transaction
+# was still pending — and if the outer commit failed, every SAVEPOINT
+# rolled back too, leaving the UI in a state inconsistent with
+# durable storage. The deferred-dispatch contract fixes that:
+#
+#   • when a ``deferred_dispatches`` list is provided, the helper
+#     APPENDS the action_id and does NOT dispatch;
+#   • the caller (``commit_staged_filesystem_state``) flushes the list
+#     only AFTER ``await session.commit()`` succeeds; on rollback it
+#     clears the list so nothing is emitted.
+# ---------------------------------------------------------------------------
+
+
+class _NestedCtx:
+    """Async context manager mimicking ``session.begin_nested()``."""
+
+    async def __aenter__(self) -> _NestedCtx:
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb) -> bool:
+        return False
+
+
+@pytest.mark.asyncio
+async def test_pre_write_snapshot_defers_dispatch_when_list_provided(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Helpers MUST queue dispatches when ``deferred_dispatches`` is set."""
+    session = MagicMock()
+    session.begin_nested = MagicMock(return_value=_NestedCtx())
+    session.execute = AsyncMock(return_value=_FakeResult(rows=[]))
+    session.flush = AsyncMock()
+
+    def _add(rev: Any) -> None:
+        rev.id = 17
+
+    session.add = MagicMock(side_effect=_add)
+
+    dispatched: list[int] = []
+
+    async def _fake_dispatch(action_id: int | None) -> None:
+        if action_id is not None:
+            dispatched.append(int(action_id))
+
+    monkeypatch.setattr(
+        kb_persistence, "_dispatch_reversibility_update", _fake_dispatch
+    )
+
+    deferred: list[int] = []
+    doc = MagicMock(id=99, document_metadata={"virtual_path": "/documents/x.md"})
+    doc.title = "x.md"
+    doc.folder_id = None
+    doc.content = "body"
+
+    rev_id = await kb_persistence._snapshot_document_pre_write(
+        session,  # type: ignore[arg-type]
+        doc=doc,
+        action_id=42,
+        search_space_id=1,
+        turn_id="t-1",
+        deferred_dispatches=deferred,
+    )
+
+    assert rev_id == 17
+    # Inline dispatch must NOT have fired; the action_id is queued.
+    assert dispatched == []
+    assert deferred == [42]
+
+
+@pytest.mark.asyncio
+async def test_pre_write_snapshot_dispatches_inline_when_list_omitted(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Direct callers (no outer transaction) keep the legacy inline dispatch."""
+    session = MagicMock()
+    session.begin_nested = MagicMock(return_value=_NestedCtx())
+    session.execute = AsyncMock(return_value=_FakeResult(rows=[]))
+    session.flush = AsyncMock()
+
+    def _add(rev: Any) -> None:
+        rev.id = 7
+
+    session.add = MagicMock(side_effect=_add)
+
+    dispatched: list[int] = []
+
+    async def _fake_dispatch(action_id: int | None) -> None:
+        if action_id is not None:
+            dispatched.append(int(action_id))
+
+    monkeypatch.setattr(
+        kb_persistence, "_dispatch_reversibility_update", _fake_dispatch
+    )
+
+    doc = MagicMock(id=11, document_metadata={"virtual_path": "/documents/y.md"})
+    doc.title = "y.md"
+    doc.folder_id = None
+    doc.content = "body"
+
+    await kb_persistence._snapshot_document_pre_write(
+        session,  # type: ignore[arg-type]
+        doc=doc,
+        action_id=88,
+        search_space_id=1,
+        turn_id="t-1",
+        # No deferred_dispatches arg — fall back to inline dispatch.
+    )
+
+    assert dispatched == [88]
+
+
+@pytest.mark.asyncio
+async def test_pre_mkdir_snapshot_defers_dispatch_when_list_provided(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Folder mkdir snapshots honour the same deferred-dispatch contract."""
+    session = MagicMock()
+    session.begin_nested = MagicMock(return_value=_NestedCtx())
+    session.execute = AsyncMock()  # _mark_action_reversible calls execute
+    session.flush = AsyncMock()
+
+    def _add(rev: Any) -> None:
+        rev.id = 3
+
+    session.add = MagicMock(side_effect=_add)
+
+    dispatched: list[int] = []
+
+    async def _fake_dispatch(action_id: int | None) -> None:
+        if action_id is not None:
+            dispatched.append(int(action_id))
+
+    monkeypatch.setattr(
+        kb_persistence, "_dispatch_reversibility_update", _fake_dispatch
+    )
+
+    deferred: list[int] = []
+    folder = MagicMock(id=2, name="f", parent_id=None, position="a0")
+
+    await kb_persistence._snapshot_folder_pre_mkdir(
+        session,  # type: ignore[arg-type]
+        folder=folder,
+        action_id=55,
+        search_space_id=1,
+        turn_id="t-1",
+        deferred_dispatches=deferred,
+    )
+
+    assert dispatched == []
+    assert deferred == [55]
diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
index a8cf5c93b5..3529a946b8 100644
--- a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
+++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
@@ -5,9 +5,10 @@
 import pytest
 from langchain_core.messages import AIMessage, HumanMessage
 
+from app.agents.new_chat.document_xml import build_document_xml as _build_document_xml
 from app.agents.new_chat.middleware.knowledge_search import (
+    KBSearchPlan,
     KnowledgeBaseSearchMiddleware,
-    _build_document_xml,
     _normalize_optional_date_range,
     _parse_kb_search_plan_response,
     _render_recent_conversation,
@@ -201,6 +202,15 @@ def _count_tokens(self, messages) -> int:
 
 
 class TestKnowledgeBaseSearchMiddlewarePlanner:
+    @pytest.fixture(autouse=True)
+    def _disable_planner_runnable(self, monkeypatch):
+        # ``FakeLLM`` is a duck-typed mock; ``create_agent`` (used when the
+        # planner Runnable path is enabled) calls ``.bind()`` on the LLM,
+        # which the mock does not implement. Pin the flag off so the
+        # planner falls through to the legacy ``self.llm.ainvoke`` path
+        # these tests assert against (``llm.calls[0]["config"]``).
+        monkeypatch.setenv("SURFSENSE_ENABLE_KB_PLANNER_RUNNABLE", "false")
+
     def test_render_recent_conversation_prefers_latest_messages_under_budget(self):
         messages = [
             HumanMessage(content="old user context " * 40),
@@ -247,17 +257,10 @@ async def fake_search_knowledge_base(**kwargs):
             captured.update(kwargs)
             return []
 
-        async def fake_build_scoped_filesystem(**kwargs):
-            return {}, {}
-
         monkeypatch.setattr(
             "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
             fake_search_knowledge_base,
         )
-        monkeypatch.setattr(
-            "app.agents.new_chat.middleware.knowledge_search.build_scoped_filesystem",
-            fake_build_scoped_filesystem,
-        )
 
         llm = FakeLLM(
             json.dumps(
@@ -297,17 +300,10 @@ async def fake_search_knowledge_base(**kwargs):
             captured.update(kwargs)
             return []
 
-        async def fake_build_scoped_filesystem(**kwargs):
-            return {}, {}
-
         monkeypatch.setattr(
             "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
             fake_search_knowledge_base,
         )
-        monkeypatch.setattr(
-            "app.agents.new_chat.middleware.knowledge_search.build_scoped_filesystem",
-            fake_build_scoped_filesystem,
-        )
 
         middleware = KnowledgeBaseSearchMiddleware(
             llm=FakeLLM("not json"),
@@ -333,17 +329,10 @@ async def fake_search_knowledge_base(**kwargs):
             captured.update(kwargs)
             return []
 
-        async def fake_build_scoped_filesystem(**kwargs):
-            return {}, {}
-
         monkeypatch.setattr(
             "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
             fake_search_knowledge_base,
         )
-        monkeypatch.setattr(
-            "app.agents.new_chat.middleware.knowledge_search.build_scoped_filesystem",
-            fake_build_scoped_filesystem,
-        )
 
         middleware = KnowledgeBaseSearchMiddleware(
             llm=FakeLLM(
@@ -366,3 +355,319 @@ async def fake_build_scoped_filesystem(**kwargs):
         assert captured["query"] == "deel founders guide summary"
         assert captured["start_date"] is None
         assert captured["end_date"] is None
+
+    async def test_middleware_routes_to_recency_browse_when_flagged(
+        self,
+        monkeypatch,
+    ):
+        """When the planner sets is_recency_query=true, browse_recent_documents
+        is called instead of search_knowledge_base."""
+        browse_captured: dict = {}
+        search_called = False
+
+        async def fake_browse_recent_documents(**kwargs):
+            browse_captured.update(kwargs)
+            return []
+
+        async def fake_search_knowledge_base(**kwargs):
+            nonlocal search_called
+            search_called = True
+            return []
+
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.browse_recent_documents",
+            fake_browse_recent_documents,
+        )
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
+            fake_search_knowledge_base,
+        )
+
+        llm = FakeLLM(
+            json.dumps(
+                {
+                    "optimized_query": "latest uploaded file",
+                    "start_date": None,
+                    "end_date": None,
+                    "is_recency_query": True,
+                }
+            )
+        )
+        middleware = KnowledgeBaseSearchMiddleware(llm=llm, search_space_id=42)
+
+        result = await middleware.abefore_agent(
+            {"messages": [HumanMessage(content="what's my latest file?")]},
+            runtime=None,
+        )
+
+        assert result is not None
+        assert browse_captured["search_space_id"] == 42
+        assert not search_called
+
+    async def test_middleware_uses_hybrid_search_when_not_recency(
+        self,
+        monkeypatch,
+    ):
+        """When is_recency_query is false (default), hybrid search is used."""
+        search_captured: dict = {}
+        browse_called = False
+
+        async def fake_browse_recent_documents(**kwargs):
+            nonlocal browse_called
+            browse_called = True
+            return []
+
+        async def fake_search_knowledge_base(**kwargs):
+            search_captured.update(kwargs)
+            return []
+
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.browse_recent_documents",
+            fake_browse_recent_documents,
+        )
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
+            fake_search_knowledge_base,
+        )
+
+        llm = FakeLLM(
+            json.dumps(
+                {
+                    "optimized_query": "quarterly revenue report analysis",
+                    "start_date": None,
+                    "end_date": None,
+                    "is_recency_query": False,
+                }
+            )
+        )
+        middleware = KnowledgeBaseSearchMiddleware(llm=llm, search_space_id=42)
+
+        await middleware.abefore_agent(
+            {"messages": [HumanMessage(content="find the quarterly revenue report")]},
+            runtime=None,
+        )
+
+        assert search_captured["query"] == "quarterly revenue report analysis"
+        assert not browse_called
+
+
+# ── KBSearchPlan schema ────────────────────────────────────────────────
+
+
+class TestKBSearchPlanSchema:
+    def test_is_recency_query_defaults_to_false(self):
+        plan = KBSearchPlan(optimized_query="test query")
+        assert plan.is_recency_query is False
+
+    def test_is_recency_query_parses_true(self):
+        plan = _parse_kb_search_plan_response(
+            json.dumps(
+                {
+                    "optimized_query": "latest uploaded file",
+                    "start_date": None,
+                    "end_date": None,
+                    "is_recency_query": True,
+                }
+            )
+        )
+        assert plan.is_recency_query is True
+        assert plan.optimized_query == "latest uploaded file"
+
+    def test_missing_is_recency_query_defaults_to_false(self):
+        plan = _parse_kb_search_plan_response(
+            json.dumps(
+                {
+                    "optimized_query": "meeting notes",
+                    "start_date": None,
+                    "end_date": None,
+                }
+            )
+        )
+        assert plan.is_recency_query is False
+
+
+# ── mentioned_document_ids cross-turn drain ────────────────────────────
+
+
+class TestKnowledgePriorityMentionDrain:
+    """Regression tests for the cross-turn ``mentioned_document_ids`` drain.
+
+    The compiled-agent cache reuses a single :class:`KnowledgePriorityMiddleware`
+    instance across turns of the same thread. ``mentioned_document_ids``
+    can therefore enter the middleware via two paths:
+
+    1. The constructor closure (``__init__(mentioned_document_ids=...)``) —
+       seeded by the cache-miss build on turn 1.
+    2. ``runtime.context.mentioned_document_ids`` — supplied freshly per
+       turn by the streaming task.
+
+    Without the drain fix, an empty ``runtime.context.mentioned_document_ids``
+    on turn 2 would fall through to the closure (because ``[]`` is falsy in
+    Python) and replay turn 1's mentions. This class pins down the
+    correct behaviour: the runtime path is authoritative even when empty,
+    and the closure is drained the first time the runtime path fires so
+    no later turn can ever resurrect stale state.
+    """
+
+    @staticmethod
+    def _make_runtime(mention_ids: list[int]):
+        """Minimal runtime stub exposing only ``runtime.context.mentioned_document_ids``."""
+        from types import SimpleNamespace
+
+        return SimpleNamespace(
+            context=SimpleNamespace(mentioned_document_ids=mention_ids),
+        )
+
+    @staticmethod
+    def _planner_llm() -> "FakeLLM":
+        # Planner returns a stable, non-recency plan so we always land in
+        # the hybrid-search branch (where ``fetch_mentioned_documents`` is
+        # invoked alongside the main search).
+        return FakeLLM(
+            json.dumps(
+                {
+                    "optimized_query": "follow up question",
+                    "start_date": None,
+                    "end_date": None,
+                    "is_recency_query": False,
+                }
+            )
+        )
+
+    async def test_runtime_context_overrides_closure_and_drains_it(self, monkeypatch):
+        """Turn 1 with mentions in BOTH closure and runtime context: the
+        runtime path wins AND the closure is drained so a future turn
+        cannot replay it.
+        """
+        fetched_ids: list[list[int]] = []
+
+        async def fake_fetch_mentioned_documents(*, document_ids, search_space_id):
+            fetched_ids.append(list(document_ids))
+            return []
+
+        async def fake_search_knowledge_base(**_kwargs):
+            return []
+
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.fetch_mentioned_documents",
+            fake_fetch_mentioned_documents,
+        )
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
+            fake_search_knowledge_base,
+        )
+
+        middleware = KnowledgeBaseSearchMiddleware(
+            llm=self._planner_llm(),
+            search_space_id=42,
+            mentioned_document_ids=[1, 2, 3],
+        )
+
+        await middleware.abefore_agent(
+            {"messages": [HumanMessage(content="what is in those docs?")]},
+            runtime=self._make_runtime([1, 2, 3]),
+        )
+
+        assert fetched_ids == [[1, 2, 3]], (
+            "runtime.context mentions must be the source of truth on turn 1"
+        )
+        assert middleware.mentioned_document_ids == [], (
+            "closure must be drained the first time the runtime path fires "
+            "so no later turn can replay stale mentions"
+        )
+
+    async def test_empty_runtime_context_does_not_replay_closure_mentions(
+        self, monkeypatch
+    ):
+        """Regression: turn 2 with NO mentions must not surface turn 1's
+        mentions from the constructor closure.
+
+        Before the fix, ``if ctx_mentions:`` treated an empty list as
+        absent and fell through to ``elif self.mentioned_document_ids:``,
+        replaying turn 1's mentions. This test pins down the corrected
+        behaviour.
+        """
+        fetched_ids: list[list[int]] = []
+
+        async def fake_fetch_mentioned_documents(*, document_ids, search_space_id):
+            fetched_ids.append(list(document_ids))
+            return []
+
+        async def fake_search_knowledge_base(**_kwargs):
+            return []
+
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.fetch_mentioned_documents",
+            fake_fetch_mentioned_documents,
+        )
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
+            fake_search_knowledge_base,
+        )
+
+        # Simulate a cached middleware instance whose closure was seeded
+        # by a previous turn's cache-miss build (mentions=[1,2,3]).
+        middleware = KnowledgeBaseSearchMiddleware(
+            llm=self._planner_llm(),
+            search_space_id=42,
+            mentioned_document_ids=[1, 2, 3],
+        )
+
+        # Turn 2: streaming task supplies an EMPTY mention list (no
+        # mentions on this follow-up turn).
+        await middleware.abefore_agent(
+            {"messages": [HumanMessage(content="what about the next steps?")]},
+            runtime=self._make_runtime([]),
+        )
+
+        assert fetched_ids == [], (
+            "fetch_mentioned_documents must NOT be called when the runtime "
+            "context says there are no mentions for this turn"
+        )
+
+    async def test_legacy_path_fires_only_when_runtime_context_absent(
+        self, monkeypatch
+    ):
+        """Backward-compat: if a caller doesn't supply runtime.context (old
+        non-streaming code path), the closure-injected mentions are still
+        honoured exactly once and then drained.
+        """
+        fetched_ids: list[list[int]] = []
+
+        async def fake_fetch_mentioned_documents(*, document_ids, search_space_id):
+            fetched_ids.append(list(document_ids))
+            return []
+
+        async def fake_search_knowledge_base(**_kwargs):
+            return []
+
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.fetch_mentioned_documents",
+            fake_fetch_mentioned_documents,
+        )
+        monkeypatch.setattr(
+            "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
+            fake_search_knowledge_base,
+        )
+
+        middleware = KnowledgeBaseSearchMiddleware(
+            llm=self._planner_llm(),
+            search_space_id=42,
+            mentioned_document_ids=[7, 8],
+        )
+
+        # First call: no runtime → legacy path uses the closure.
+        await middleware.abefore_agent(
+            {"messages": [HumanMessage(content="initial question")]},
+            runtime=None,
+        )
+        # Second call: still no runtime — closure already drained, so no replay.
+        await middleware.abefore_agent(
+            {"messages": [HumanMessage(content="follow up")]},
+            runtime=None,
+        )
+
+        assert fetched_ids == [[7, 8]], (
+            "legacy path must honour the closure exactly once and then drain it"
+        )
+        assert middleware.mentioned_document_ids == []
diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_tree.py b/surfsense_backend/tests/unit/middleware/test_knowledge_tree.py
new file mode 100644
index 0000000000..caaec3114e
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_knowledge_tree.py
@@ -0,0 +1,139 @@
+"""Unit tests for ``KnowledgeTreeMiddleware`` rendering.
+
+The empty-folder marker is critical UX: without it, the LLM cannot
+distinguish a leaf folder containing one document from a leaf folder
+that has no descendants at all, and ends up firing ``rmdir`` on
+non-empty folders. These tests pin the rendering contract so that
+contract cannot silently regress.
+"""
+
+from __future__ import annotations
+
+from app.agents.new_chat.middleware.knowledge_tree import KnowledgeTreeMiddleware
+from app.agents.new_chat.path_resolver import DOCUMENTS_ROOT
+
+
+def _compute(folder_paths: list[str], doc_paths: list[str]) -> set[str]:
+    return KnowledgeTreeMiddleware._compute_non_empty_folders(folder_paths, doc_paths)
+
+
+class TestComputeNonEmptyFolders:
+    def test_folder_with_direct_document_is_non_empty(self):
+        folder_paths = [f"{DOCUMENTS_ROOT}/Travel/Boarding Pass"]
+        doc_paths = [
+            f"{DOCUMENTS_ROOT}/Travel/Boarding Pass/southwest.pdf.xml",
+        ]
+        non_empty = _compute(folder_paths, doc_paths)
+        assert f"{DOCUMENTS_ROOT}/Travel/Boarding Pass" in non_empty
+
+    def test_truly_empty_leaf_folder_is_not_non_empty(self):
+        folder_paths = [f"{DOCUMENTS_ROOT}/Travel/Boarding Pass"]
+        doc_paths: list[str] = []
+        assert _compute(folder_paths, doc_paths) == set()
+
+    def test_documents_propagate_up_to_all_ancestors(self):
+        folder_paths = [
+            f"{DOCUMENTS_ROOT}/A",
+            f"{DOCUMENTS_ROOT}/A/B",
+            f"{DOCUMENTS_ROOT}/A/B/C",
+        ]
+        doc_paths = [f"{DOCUMENTS_ROOT}/A/B/C/file.xml"]
+        non_empty = _compute(folder_paths, doc_paths)
+        assert non_empty == {
+            f"{DOCUMENTS_ROOT}/A",
+            f"{DOCUMENTS_ROOT}/A/B",
+            f"{DOCUMENTS_ROOT}/A/B/C",
+        }
+
+    def test_chain_with_subfolders_marks_only_leaf_empty(self):
+        # POSIX-like semantic: a folder is "empty" only if it has no
+        # immediate children (docs OR sub-folders). The model needs this
+        # because parallel ``rmdir`` calls all see the same starting state,
+        # so trying to rmdir a parent before its children is never safe.
+        folder_paths = [
+            f"{DOCUMENTS_ROOT}/X",
+            f"{DOCUMENTS_ROOT}/X/Y",
+            f"{DOCUMENTS_ROOT}/X/Y/Z",
+        ]
+        non_empty = _compute(folder_paths, [])
+        # Only ``X/Y/Z`` (the leaf) is empty. ``X`` and ``X/Y`` each have a
+        # sub-folder child, so they are non-empty and should NOT carry the
+        # ``(empty)`` marker.
+        assert non_empty == {f"{DOCUMENTS_ROOT}/X", f"{DOCUMENTS_ROOT}/X/Y"}
+
+    def test_sibling_with_doc_does_not_mark_other_sibling_non_empty(self):
+        # Mirrors a real DB layout where every intermediate folder is
+        # materialized in the ``folders`` table.
+        folder_paths = [
+            f"{DOCUMENTS_ROOT}/Travel",
+            f"{DOCUMENTS_ROOT}/Travel/Boarding Pass",
+            f"{DOCUMENTS_ROOT}/Travel/Notes",
+        ]
+        doc_paths = [f"{DOCUMENTS_ROOT}/Travel/Notes/itinerary.xml"]
+        non_empty = _compute(folder_paths, doc_paths)
+        # ``Travel`` is non-empty because it has children, ``Notes`` is non-empty
+        # because of the doc, but ``Boarding Pass`` (sibling leaf) is empty.
+        assert f"{DOCUMENTS_ROOT}/Travel" in non_empty
+        assert f"{DOCUMENTS_ROOT}/Travel/Notes" in non_empty
+        assert f"{DOCUMENTS_ROOT}/Travel/Boarding Pass" not in non_empty
+
+
+class TestFormatTreeRendering:
+    """Integration check: empty leaf gets ``(empty)`` marker; non-empty doesn't."""
+
+    def _render(
+        self,
+        folder_paths: list[str],
+        doc_specs: list[dict],
+    ) -> str:
+        from app.agents.new_chat.path_resolver import PathIndex
+
+        index = PathIndex(
+            folder_paths={i + 1: p for i, p in enumerate(folder_paths)},
+        )
+
+        class _Row:
+            def __init__(self, **kw):
+                self.__dict__.update(kw)
+
+        docs = [_Row(**spec) for spec in doc_specs]
+
+        mw = KnowledgeTreeMiddleware(
+            search_space_id=1,
+            filesystem_mode=None,  # type: ignore[arg-type]
+        )
+        return mw._format_tree(index, docs)
+
+    def test_renders_empty_marker_only_for_truly_empty_folders(self):
+        # Reproduces the failure scenario from the bug report:
+        # ``Boarding Pass`` is empty (its only doc was just deleted), while
+        # ``Tax Returns`` still has ``federal.pdf``. All intermediate
+        # folders are present in the index, mirroring the real DB layout.
+        folder_paths = [
+            "/documents/File Upload",
+            "/documents/File Upload/2026-04-08",
+            "/documents/File Upload/2026-04-08/Travel",
+            "/documents/File Upload/2026-04-08/Travel/Boarding Pass",
+            "/documents/File Upload/2026-04-15",
+            "/documents/File Upload/2026-04-15/Finance",
+            "/documents/File Upload/2026-04-15/Finance/Tax Returns",
+        ]
+        tax_returns_folder_id = (
+            folder_paths.index("/documents/File Upload/2026-04-15/Finance/Tax Returns")
+            + 1
+        )
+        rendered = self._render(
+            folder_paths=folder_paths,
+            doc_specs=[
+                {
+                    "id": 100,
+                    "title": "federal.pdf",
+                    "folder_id": tax_returns_folder_id,
+                },
+            ],
+        )
+        assert "Boarding Pass/ (empty)" in rendered
+        assert "Tax Returns/ (empty)" not in rendered
+        # Intermediate ancestors of the doc must NOT be marked empty.
+        assert "Finance/ (empty)" not in rendered
+        assert "2026-04-15/ (empty)" not in rendered
diff --git a/surfsense_backend/tests/unit/middleware/test_local_folder_backend.py b/surfsense_backend/tests/unit/middleware/test_local_folder_backend.py
new file mode 100644
index 0000000000..6e81ecf8e2
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_local_folder_backend.py
@@ -0,0 +1,142 @@
+from pathlib import Path
+
+import pytest
+
+from app.agents.new_chat.middleware.local_folder_backend import LocalFolderBackend
+
+pytestmark = pytest.mark.unit
+
+
+def test_local_backend_write_read_edit_roundtrip(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "notes").mkdir()
+
+    write = backend.write("/notes/test.md", "line1\nline2")
+    assert write.error is None
+    assert write.path == "/notes/test.md"
+
+    read = backend.read("/notes/test.md", offset=0, limit=20)
+    assert "line1" in read
+    assert "line2" in read
+
+    edit = backend.edit("/notes/test.md", "line2", "updated")
+    assert edit.error is None
+    assert edit.occurrences == 1
+
+    read_after = backend.read("/notes/test.md", offset=0, limit=20)
+    assert "updated" in read_after
+
+
+def test_local_backend_blocks_path_escape(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+
+    result = backend.write("/../../etc/passwd", "bad")
+    assert result.error is not None
+    assert "Invalid path" in result.error
+
+
+def test_local_backend_glob_and_grep(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "docs").mkdir()
+    (tmp_path / "docs" / "a.txt").write_text("hello world\n")
+    (tmp_path / "docs" / "b.md").write_text("hello markdown\n")
+
+    infos = backend.glob_info("**/*.txt", "/docs")
+    paths = {info["path"] for info in infos}
+    assert "/docs/a.txt" in paths
+
+    grep = backend.grep_raw("hello", "/docs", "*.md")
+    assert isinstance(grep, list)
+    assert any(match["path"] == "/docs/b.md" for match in grep)
+
+
+def test_local_backend_read_raw_returns_exact_content(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "notes").mkdir()
+    expected = "# Title\n\nline 1\nline 2\n"
+    write = backend.write("/notes/raw.md", expected)
+    assert write.error is None
+
+    raw = backend.read_raw("/notes/raw.md")
+    assert raw == expected
+
+
+def test_local_backend_write_rejects_missing_parent_directory(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+
+    write = backend.write("/tempoo/new-note.md", "# New note")
+
+    assert write.error is not None
+    assert "parent directory" in write.error
+    assert not (tmp_path / "tempoo").exists()
+
+
+def test_local_backend_delete_file_success(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "delete-me.md").write_text("bye")
+
+    res = backend.delete_file("/delete-me.md")
+    assert res.error is None
+    assert res.path == "/delete-me.md"
+    assert not (tmp_path / "delete-me.md").exists()
+
+
+def test_local_backend_delete_file_rejects_directory(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "subdir").mkdir()
+
+    res = backend.delete_file("/subdir")
+    assert res.error is not None
+    assert "directory" in res.error
+    assert (tmp_path / "subdir").exists()
+
+
+def test_local_backend_delete_file_missing_returns_error(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+
+    res = backend.delete_file("/nope.md")
+    assert res.error is not None
+    assert "not found" in res.error
+
+
+def test_local_backend_rmdir_success(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "empty").mkdir()
+
+    res = backend.rmdir("/empty")
+    assert res.error is None
+    assert res.path == "/empty"
+    assert not (tmp_path / "empty").exists()
+
+
+def test_local_backend_rmdir_rejects_non_empty(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "withkid").mkdir()
+    (tmp_path / "withkid" / "child.md").write_text("x")
+
+    res = backend.rmdir("/withkid")
+    assert res.error is not None
+    assert "not empty" in res.error
+    assert (tmp_path / "withkid" / "child.md").exists()
+
+
+def test_local_backend_rmdir_rejects_file(tmp_path: Path):
+    backend = LocalFolderBackend(str(tmp_path))
+    (tmp_path / "f.md").write_text("x")
+
+    res = backend.rmdir("/f.md")
+    assert res.error is not None
+    assert "not a directory" in res.error
+
+
+def test_local_backend_rmdir_rejects_root(tmp_path: Path):
+    """``rmdir /`` MUST fail. The exact error wording comes from
+    ``_resolve_virtual`` (root resolves to outside the sandbox); what
+    matters is that the call returns an error and does NOT delete the
+    sandbox root on disk."""
+    backend = LocalFolderBackend(str(tmp_path))
+
+    res = backend.rmdir("/")
+    assert res.error is not None
+    assert "Invalid path" in res.error or "root" in res.error
+    assert tmp_path.exists()
diff --git a/surfsense_backend/tests/unit/middleware/test_multi_root_local_folder_backend.py b/surfsense_backend/tests/unit/middleware/test_multi_root_local_folder_backend.py
new file mode 100644
index 0000000000..43a6711780
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_multi_root_local_folder_backend.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+import pytest
+
+from app.agents.new_chat.middleware.multi_root_local_folder_backend import (
+    MultiRootLocalFolderBackend,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def test_mount_ids_preserve_client_mapping_order(tmp_path: Path) -> None:
+    root_one = tmp_path / "PC Backups"
+    root_two = tmp_path / "pc_backups"
+    root_three = tmp_path / "notes@2026"
+    root_one.mkdir()
+    root_two.mkdir()
+    root_three.mkdir()
+
+    backend = MultiRootLocalFolderBackend(
+        (
+            ("pc_backups", str(root_one)),
+            ("pc_backups_2", str(root_two)),
+            ("notes_2026", str(root_three)),
+        )
+    )
+
+    assert backend.list_mounts() == ("pc_backups", "pc_backups_2", "notes_2026")
+
+
+def test_mount_id_is_authoritative_not_folder_name(tmp_path: Path) -> None:
+    root = tmp_path / "Resume Folder"
+    root.mkdir()
+
+    backend = MultiRootLocalFolderBackend((("custom_resume_mount", str(root)),))
+
+    assert backend.list_mounts() == ("custom_resume_mount",)
diff --git a/surfsense_backend/tests/unit/observability/__init__.py b/surfsense_backend/tests/unit/observability/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_backend/tests/unit/observability/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_backend/tests/unit/observability/test_otel.py b/surfsense_backend/tests/unit/observability/test_otel.py
new file mode 100644
index 0000000000..fc58139738
--- /dev/null
+++ b/surfsense_backend/tests/unit/observability/test_otel.py
@@ -0,0 +1,84 @@
+"""Tests for the SurfSense OpenTelemetry shim."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.observability import otel
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture(autouse=True)
+def _reset_otel_state(monkeypatch: pytest.MonkeyPatch):
+    """Force a clean OTel disabled state per test, then restore after."""
+    for env in ("OTEL_EXPORTER_OTLP_ENDPOINT", "SURFSENSE_DISABLE_OTEL"):
+        monkeypatch.delenv(env, raising=False)
+    monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
+    otel.reload_for_tests()
+    yield
+    otel.reload_for_tests()
+
+
+def test_disabled_by_default_when_no_endpoint() -> None:
+    assert otel.is_enabled() is False
+
+
+def test_enabled_when_endpoint_configured(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
+    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
+    assert otel.reload_for_tests() is True
+
+
+def test_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
+    monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
+    assert otel.reload_for_tests() is False
+
+
+class TestNoopSpansWhenDisabled:
+    def test_generic_span_yields_noop(self) -> None:
+        with otel.span("any.thing", attributes={"x": 1}) as sp:
+            sp.set_attribute("y", 2)
+            sp.set_attributes({"a": "b"})
+            sp.add_event("evt")
+            sp.record_exception(RuntimeError("ignored"))
+            sp.set_status("ignored")
+        # Reaching here without raising means the no-op is well-formed
+
+    def test_exception_propagates_through_span(self) -> None:
+        with pytest.raises(ValueError), otel.span("err"):
+            raise ValueError("boom")
+
+    def test_each_helper_is_a_no_op_when_disabled(self) -> None:
+        helpers = [
+            otel.tool_call_span("write_file", input_size=42),
+            otel.model_call_span(model_id="openai:gpt-4o", provider="openai"),
+            otel.kb_search_span(search_space_id=1, query_chars=99),
+            otel.kb_persist_span(document_type="NOTE", document_id=7),
+            otel.compaction_span(reason="overflow", messages_in=120),
+            otel.interrupt_span(interrupt_type="permission_ask"),
+            otel.permission_asked_span(permission="edit", pattern="/x/**"),
+        ]
+        for cm in helpers:
+            with cm as sp:
+                assert sp is not None
+                sp.set_attribute("ok", True)
+
+
+class TestEnabledIntegration:
+    """When OTel is wired but no SDK exporter is bound, the API still works."""
+
+    def test_span_attaches_attributes(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        # Use the API tracer (no-op-ish but real Span objects).
+        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
+        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
+        assert otel.reload_for_tests() is True
+
+        # Should not raise even when set_attributes/record_exception fall through
+        # to an SDK that isn't actually installed.
+        with otel.tool_call_span("scrape_webpage", input_size=10) as sp:
+            sp.set_attribute("tool.output.size", 200)
+            sp.set_attribute("tool.truncated", False)
+        with otel.model_call_span(model_id="m", provider="p") as sp:
+            sp.set_attribute("retry.count", 3)
diff --git a/surfsense_backend/tests/unit/routes/test_byok_supports_image_input.py b/surfsense_backend/tests/unit/routes/test_byok_supports_image_input.py
new file mode 100644
index 0000000000..c9f18d77dd
--- /dev/null
+++ b/surfsense_backend/tests/unit/routes/test_byok_supports_image_input.py
@@ -0,0 +1,110 @@
+"""Unit tests for ``supports_image_input`` derivation on BYOK chat config
+endpoints (``GET /new-llm-configs`` list, ``GET /new-llm-configs/{id}``).
+
+There is no DB column for ``supports_image_input`` on
+``NewLLMConfig`` — the value is resolved at the API boundary by
+``derive_supports_image_input`` so the new-chat selector / streaming
+task can read the same field shape regardless of source (BYOK vs YAML
+vs OpenRouter dynamic). Default-allow on unknown so we don't lock the
+user out of their own model choice.
+"""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from types import SimpleNamespace
+from uuid import uuid4
+
+import pytest
+
+from app.db import LiteLLMProvider
+from app.routes import new_llm_config_routes
+
+pytestmark = pytest.mark.unit
+
+
+def _byok_row(
+    *,
+    id_: int,
+    model_name: str,
+    base_model: str | None = None,
+    provider: LiteLLMProvider = LiteLLMProvider.OPENAI,
+    custom_provider: str | None = None,
+) -> object:
+    """Mimic the SQLAlchemy row's attribute surface; ``model_validate``
+    walks ``from_attributes=True`` so a ``SimpleNamespace`` is enough.
+
+    ``provider`` is a real ``LiteLLMProvider`` enum value so Pydantic's
+    enum validator accepts it — same as the ORM row would carry."""
+    return SimpleNamespace(
+        id=id_,
+        name=f"BYOK-{id_}",
+        description=None,
+        provider=provider,
+        custom_provider=custom_provider,
+        model_name=model_name,
+        api_key="sk-byok",
+        api_base=None,
+        litellm_params={"base_model": base_model} if base_model else None,
+        system_instructions="",
+        use_default_system_instructions=True,
+        citations_enabled=True,
+        created_at=datetime.now(tz=UTC),
+        search_space_id=42,
+        user_id=uuid4(),
+    )
+
+
+def test_serialize_byok_known_vision_model_resolves_true():
+    """The catalog resolver consults LiteLLM's map for ``gpt-4o`` ->
+    True. The serialized row carries that value through to the
+    ``NewLLMConfigRead`` schema."""
+    row = _byok_row(id_=1, model_name="gpt-4o")
+    serialized = new_llm_config_routes._serialize_byok_config(row)
+
+    assert serialized.supports_image_input is True
+    assert serialized.id == 1
+    assert serialized.model_name == "gpt-4o"
+
+
+def test_serialize_byok_unknown_model_default_allows():
+    """Unknown / unmapped: default-allow. The streaming-task safety net
+    is the actual block, and it requires LiteLLM to *explicitly* say
+    text-only — so a brand new BYOK model should not be pre-judged."""
+    row = _byok_row(
+        id_=2,
+        model_name="brand-new-model-x9-unmapped",
+        provider=LiteLLMProvider.CUSTOM,
+        custom_provider="brand_new_proxy",
+    )
+    serialized = new_llm_config_routes._serialize_byok_config(row)
+
+    assert serialized.supports_image_input is True
+
+
+def test_serialize_byok_uses_base_model_when_present():
+    """Azure-style: ``model_name`` is the deployment id, ``base_model``
+    inside ``litellm_params`` is the canonical sku LiteLLM knows. The
+    helper must consult ``base_model`` first or unrecognised deployment
+    ids would shadow the real capability."""
+    row = _byok_row(
+        id_=3,
+        model_name="my-azure-deployment-id-no-litellm-knows-this",
+        base_model="gpt-4o",
+        provider=LiteLLMProvider.AZURE_OPENAI,
+    )
+    serialized = new_llm_config_routes._serialize_byok_config(row)
+
+    assert serialized.supports_image_input is True
+
+
+def test_serialize_byok_returns_pydantic_read_model():
+    """The route now returns ``NewLLMConfigRead`` (not the raw ORM) so
+    the schema additions are guaranteed to be present in the API
+    surface. This guards against a future regression where someone
+    deletes the augmentation step and falls back to ORM passthrough."""
+    from app.schemas import NewLLMConfigRead
+
+    row = _byok_row(id_=4, model_name="gpt-4o")
+    serialized = new_llm_config_routes._serialize_byok_config(row)
+    assert isinstance(serialized, NewLLMConfigRead)
diff --git a/surfsense_backend/tests/unit/routes/test_global_configs_is_premium.py b/surfsense_backend/tests/unit/routes/test_global_configs_is_premium.py
new file mode 100644
index 0000000000..2b6c764852
--- /dev/null
+++ b/surfsense_backend/tests/unit/routes/test_global_configs_is_premium.py
@@ -0,0 +1,184 @@
+"""Unit tests for ``is_premium`` derivation on the global image-gen and
+vision-LLM list endpoints.
+
+Chat globals (``GET /global-llm-configs``) already emit
+``is_premium = (billing_tier == "premium")``. Image and vision did not,
+which made the new-chat ``model-selector`` render the Free/Premium badge
+on the Chat tab but skip it on the Image and Vision tabs (the selector
+keys its badge logic off ``is_premium``). These tests pin parity:
+
+* YAML free entry → ``is_premium=False``
+* YAML premium entry → ``is_premium=True``
+* OpenRouter dynamic premium entry → ``is_premium=True``
+* Auto stub (always emitted when at least one config is present)
+  → ``is_premium=False``
+"""
+
+from __future__ import annotations
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+_IMAGE_FIXTURE: list[dict] = [
+    {
+        "id": -1,
+        "name": "DALL-E 3",
+        "provider": "OPENAI",
+        "model_name": "dall-e-3",
+        "api_key": "sk-test",
+        "billing_tier": "free",
+    },
+    {
+        "id": -2,
+        "name": "GPT-Image 1 (premium)",
+        "provider": "OPENAI",
+        "model_name": "gpt-image-1",
+        "api_key": "sk-test",
+        "billing_tier": "premium",
+    },
+    {
+        "id": -20_001,
+        "name": "google/gemini-2.5-flash-image (OpenRouter)",
+        "provider": "OPENROUTER",
+        "model_name": "google/gemini-2.5-flash-image",
+        "api_key": "sk-or-test",
+        "api_base": "https://openrouter.ai/api/v1",
+        "billing_tier": "premium",
+    },
+]
+
+
+_VISION_FIXTURE: list[dict] = [
+    {
+        "id": -1,
+        "name": "GPT-4o Vision",
+        "provider": "OPENAI",
+        "model_name": "gpt-4o",
+        "api_key": "sk-test",
+        "billing_tier": "free",
+    },
+    {
+        "id": -2,
+        "name": "Claude 3.5 Sonnet (premium)",
+        "provider": "ANTHROPIC",
+        "model_name": "claude-3-5-sonnet",
+        "api_key": "sk-ant-test",
+        "billing_tier": "premium",
+    },
+    {
+        "id": -30_001,
+        "name": "openai/gpt-4o (OpenRouter)",
+        "provider": "OPENROUTER",
+        "model_name": "openai/gpt-4o",
+        "api_key": "sk-or-test",
+        "api_base": "https://openrouter.ai/api/v1",
+        "billing_tier": "premium",
+    },
+]
+
+
+# =============================================================================
+# Image generation
+# =============================================================================
+
+
+@pytest.mark.asyncio
+async def test_global_image_gen_configs_emit_is_premium(monkeypatch):
+    """Each emitted config must carry ``is_premium`` derived server-side
+    from ``billing_tier``. The Auto stub is always free.
+    """
+    from app.config import config
+    from app.routes import image_generation_routes
+
+    monkeypatch.setattr(
+        config, "GLOBAL_IMAGE_GEN_CONFIGS", _IMAGE_FIXTURE, raising=False
+    )
+
+    payload = await image_generation_routes.get_global_image_gen_configs(user=None)
+
+    by_id = {c["id"]: c for c in payload}
+
+    # Auto stub is always emitted when at least one global config exists,
+    # and it must always declare itself free (Auto-mode billing-tier
+    # surfacing is a separate follow-up).
+    assert 0 in by_id, "Auto stub should be emitted when at least one config exists"
+    assert by_id[0]["is_premium"] is False
+    assert by_id[0]["billing_tier"] == "free"
+
+    # YAML free entry — ``is_premium=False``
+    assert by_id[-1]["is_premium"] is False
+    assert by_id[-1]["billing_tier"] == "free"
+
+    # YAML premium entry — ``is_premium=True``
+    assert by_id[-2]["is_premium"] is True
+    assert by_id[-2]["billing_tier"] == "premium"
+
+    # OpenRouter dynamic premium entry — same field, same derivation
+    assert by_id[-20_001]["is_premium"] is True
+    assert by_id[-20_001]["billing_tier"] == "premium"
+
+    # Every emitted dict (including Auto) must have the field — never missing.
+    for cfg in payload:
+        assert "is_premium" in cfg, f"is_premium missing from {cfg.get('id')}"
+        assert isinstance(cfg["is_premium"], bool)
+
+
+@pytest.mark.asyncio
+async def test_global_image_gen_configs_no_globals_no_auto_stub(monkeypatch):
+    """When there are no global configs at all, the endpoint emits an
+    empty list (no Auto stub) — Auto mode would have nothing to route to.
+    """
+    from app.config import config
+    from app.routes import image_generation_routes
+
+    monkeypatch.setattr(config, "GLOBAL_IMAGE_GEN_CONFIGS", [], raising=False)
+    payload = await image_generation_routes.get_global_image_gen_configs(user=None)
+    assert payload == []
+
+
+# =============================================================================
+# Vision LLM
+# =============================================================================
+
+
+@pytest.mark.asyncio
+async def test_global_vision_llm_configs_emit_is_premium(monkeypatch):
+    from app.config import config
+    from app.routes import vision_llm_routes
+
+    monkeypatch.setattr(
+        config, "GLOBAL_VISION_LLM_CONFIGS", _VISION_FIXTURE, raising=False
+    )
+
+    payload = await vision_llm_routes.get_global_vision_llm_configs(user=None)
+
+    by_id = {c["id"]: c for c in payload}
+
+    assert 0 in by_id, "Auto stub should be emitted when at least one config exists"
+    assert by_id[0]["is_premium"] is False
+    assert by_id[0]["billing_tier"] == "free"
+
+    assert by_id[-1]["is_premium"] is False
+    assert by_id[-1]["billing_tier"] == "free"
+
+    assert by_id[-2]["is_premium"] is True
+    assert by_id[-2]["billing_tier"] == "premium"
+
+    assert by_id[-30_001]["is_premium"] is True
+    assert by_id[-30_001]["billing_tier"] == "premium"
+
+    for cfg in payload:
+        assert "is_premium" in cfg, f"is_premium missing from {cfg.get('id')}"
+        assert isinstance(cfg["is_premium"], bool)
+
+
+@pytest.mark.asyncio
+async def test_global_vision_llm_configs_no_globals_no_auto_stub(monkeypatch):
+    from app.config import config
+    from app.routes import vision_llm_routes
+
+    monkeypatch.setattr(config, "GLOBAL_VISION_LLM_CONFIGS", [], raising=False)
+    payload = await vision_llm_routes.get_global_vision_llm_configs(user=None)
+    assert payload == []
diff --git a/surfsense_backend/tests/unit/routes/test_global_new_llm_configs_supports_image.py b/surfsense_backend/tests/unit/routes/test_global_new_llm_configs_supports_image.py
new file mode 100644
index 0000000000..b47d9134be
--- /dev/null
+++ b/surfsense_backend/tests/unit/routes/test_global_new_llm_configs_supports_image.py
@@ -0,0 +1,106 @@
+"""Unit tests for ``supports_image_input`` derivation on the chat global
+config endpoint (``GET /global-new-llm-configs``).
+
+Resolution order (matches ``new_llm_config_routes.get_global_new_llm_configs``):
+
+1. Explicit ``supports_image_input`` on the cfg dict (set by the YAML
+   loader for operator overrides, or by the OpenRouter integration from
+   ``architecture.input_modalities``) — wins.
+2. ``derive_supports_image_input`` helper — default-allow on unknown
+   models, only False when LiteLLM / OR modalities are definitive.
+
+The flag is purely informational at the API boundary. The streaming
+task safety net (``is_known_text_only_chat_model``) is the actual block,
+and it requires LiteLLM to *explicitly* mark the model as text-only.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+_FIXTURE: list[dict] = [
+    {
+        "id": -1,
+        "name": "GPT-4o (explicit true)",
+        "description": "vision-capable, explicit YAML override",
+        "provider": "OPENAI",
+        "model_name": "gpt-4o",
+        "api_key": "sk-test",
+        "billing_tier": "free",
+        "supports_image_input": True,
+    },
+    {
+        "id": -2,
+        "name": "DeepSeek V3 (explicit false)",
+        "description": "OpenRouter dynamic — modality-derived false",
+        "provider": "OPENROUTER",
+        "model_name": "deepseek/deepseek-v3.2-exp",
+        "api_key": "sk-or-test",
+        "api_base": "https://openrouter.ai/api/v1",
+        "billing_tier": "free",
+        "supports_image_input": False,
+    },
+    {
+        "id": -10_010,
+        "name": "Unannotated GPT-4o",
+        "description": "no flag set — resolver should derive True via LiteLLM",
+        "provider": "OPENAI",
+        "model_name": "gpt-4o",
+        "api_key": "sk-test",
+        "billing_tier": "free",
+        # supports_image_input intentionally absent
+    },
+    {
+        "id": -10_011,
+        "name": "Unannotated unknown model",
+        "description": "unmapped — default-allow True",
+        "provider": "CUSTOM",
+        "custom_provider": "brand_new_proxy",
+        "model_name": "brand-new-model-x9",
+        "api_key": "sk-test",
+        "billing_tier": "free",
+    },
+]
+
+
+@pytest.mark.asyncio
+async def test_global_new_llm_configs_emit_supports_image_input(monkeypatch):
+    """Each emitted chat config carries ``supports_image_input`` as a
+    bool. Explicit values win; unannotated entries are resolved via the
+    helper (default-allow True)."""
+    from app.config import config
+    from app.routes import new_llm_config_routes
+
+    monkeypatch.setattr(config, "GLOBAL_LLM_CONFIGS", _FIXTURE, raising=False)
+
+    payload = await new_llm_config_routes.get_global_new_llm_configs(user=None)
+    by_id = {c["id"]: c for c in payload}
+
+    # Auto stub: optimistic True so the user can keep Auto selected with
+    # vision-capable deployments somewhere in the pool.
+    assert 0 in by_id, "Auto stub should be emitted when configs exist"
+    assert by_id[0]["supports_image_input"] is True
+    assert by_id[0]["is_auto_mode"] is True
+
+    # Explicit True is preserved.
+    assert by_id[-1]["supports_image_input"] is True
+
+    # Explicit False is preserved (the exact failure mode the safety net
+    # guards against — DeepSeek V3 over OpenRouter would 404 with "No
+    # endpoints found that support image input").
+    assert by_id[-2]["supports_image_input"] is False
+
+    # Unannotated GPT-4o: resolver consults LiteLLM, which says vision.
+    assert by_id[-10_010]["supports_image_input"] is True
+
+    # Unknown / unmapped model: default-allow rather than pre-judge.
+    assert by_id[-10_011]["supports_image_input"] is True
+
+    for cfg in payload:
+        assert "supports_image_input" in cfg, (
+            f"supports_image_input missing from {cfg.get('id')}"
+        )
+        assert isinstance(cfg["supports_image_input"], bool)
diff --git a/surfsense_backend/tests/unit/routes/test_image_gen_quota.py b/surfsense_backend/tests/unit/routes/test_image_gen_quota.py
new file mode 100644
index 0000000000..636b7de31c
--- /dev/null
+++ b/surfsense_backend/tests/unit/routes/test_image_gen_quota.py
@@ -0,0 +1,138 @@
+"""Unit tests for the image-generation route's billing-resolution helper.
+
+End-to-end "POST /image-generations returns 402" coverage requires the
+integration harness (real DB, real auth) and lives in
+``tests/integration/document_upload/`` alongside the other quota tests.
+This unit test focuses on the new ``_resolve_billing_for_image_gen``
+helper which:
+
+* Returns ``free`` for Auto mode, even when premium configs exist
+  (Auto-mode billing-tier surfacing is a follow-up).
+* Returns ``free`` for user-owned BYOK configs (positive IDs).
+* Returns the global config's ``billing_tier`` for negative IDs.
+* Honours the per-config ``quota_reserve_micros`` override when present.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.mark.asyncio
+async def test_resolve_billing_for_auto_mode(monkeypatch):
+    from app.routes import image_generation_routes
+    from app.services.billable_calls import DEFAULT_IMAGE_RESERVE_MICROS
+
+    search_space = SimpleNamespace(image_generation_config_id=None)
+    tier, model, reserve = await image_generation_routes._resolve_billing_for_image_gen(
+        session=None,  # Not consumed on this code path.
+        config_id=0,  # IMAGE_GEN_AUTO_MODE_ID
+        search_space=search_space,
+    )
+    assert tier == "free"
+    assert model == "auto"
+    assert reserve == DEFAULT_IMAGE_RESERVE_MICROS
+
+
+@pytest.mark.asyncio
+async def test_resolve_billing_for_premium_global_config(monkeypatch):
+    from app.config import config
+    from app.routes import image_generation_routes
+
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_IMAGE_GEN_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-image-1",
+                "billing_tier": "premium",
+                "quota_reserve_micros": 75_000,
+            },
+            {
+                "id": -2,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemini-2.5-flash-image",
+                "billing_tier": "free",
+            },
+        ],
+        raising=False,
+    )
+
+    search_space = SimpleNamespace(image_generation_config_id=None)
+
+    # Premium with override.
+    tier, model, reserve = await image_generation_routes._resolve_billing_for_image_gen(
+        session=None, config_id=-1, search_space=search_space
+    )
+    assert tier == "premium"
+    assert model == "openai/gpt-image-1"
+    assert reserve == 75_000
+
+    # Free, no override → falls back to default.
+    from app.services.billable_calls import DEFAULT_IMAGE_RESERVE_MICROS
+
+    tier, model, reserve = await image_generation_routes._resolve_billing_for_image_gen(
+        session=None, config_id=-2, search_space=search_space
+    )
+    assert tier == "free"
+    # Provider-prefixed model string for OpenRouter.
+    assert "google/gemini-2.5-flash-image" in model
+    assert reserve == DEFAULT_IMAGE_RESERVE_MICROS
+
+
+@pytest.mark.asyncio
+async def test_resolve_billing_for_user_owned_byok_is_free():
+    """User-owned BYOK configs (positive IDs) cost the user nothing on
+    our side — they pay the provider directly. Always free.
+    """
+    from app.routes import image_generation_routes
+    from app.services.billable_calls import DEFAULT_IMAGE_RESERVE_MICROS
+
+    search_space = SimpleNamespace(image_generation_config_id=None)
+    tier, model, reserve = await image_generation_routes._resolve_billing_for_image_gen(
+        session=None, config_id=42, search_space=search_space
+    )
+    assert tier == "free"
+    assert model == "user_byok"
+    assert reserve == DEFAULT_IMAGE_RESERVE_MICROS
+
+
+@pytest.mark.asyncio
+async def test_resolve_billing_falls_back_to_search_space_default(monkeypatch):
+    """When the request omits ``image_generation_config_id``, the helper
+    must consult the search space's default — so a search space pinned
+    to a premium global config still gates new requests by quota.
+    """
+    from app.config import config
+    from app.routes import image_generation_routes
+
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_IMAGE_GEN_CONFIGS",
+        [
+            {
+                "id": -7,
+                "provider": "OPENAI",
+                "model_name": "gpt-image-1",
+                "billing_tier": "premium",
+            }
+        ],
+        raising=False,
+    )
+
+    search_space = SimpleNamespace(image_generation_config_id=-7)
+    (
+        tier,
+        model,
+        _reserve,
+    ) = await image_generation_routes._resolve_billing_for_image_gen(
+        session=None, config_id=None, search_space=search_space
+    )
+    assert tier == "premium"
+    assert model == "openai/gpt-image-1"
diff --git a/surfsense_backend/tests/unit/routes/test_regenerate_from_message_id.py b/surfsense_backend/tests/unit/routes/test_regenerate_from_message_id.py
new file mode 100644
index 0000000000..709014d555
--- /dev/null
+++ b/surfsense_backend/tests/unit/routes/test_regenerate_from_message_id.py
@@ -0,0 +1,143 @@
+"""Unit tests for the edit-from-arbitrary-position helpers inside ``new_chat_routes``.
+
+The regenerate route's edit-from-position path introduces:
+* ``_find_pre_turn_checkpoint_id`` — walks LangGraph checkpoint tuples
+  newest-first and picks the first one whose ``metadata["turn_id"]``
+  differs from the edited turn. That checkpoint is the rewind target
+  (state immediately before the edited turn started).
+* ``RegenerateRequest`` accepts ``from_message_id`` + ``revert_actions``
+  with a validator that prevents callers from requesting a revert pass
+  without specifying which turn to roll back.
+
+These are pure-Python helpers that don't need a live DB, so we exercise
+them with a small ``CheckpointTuple``-shaped namespace and direct
+schema instantiation.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from app.routes.new_chat_routes import _find_pre_turn_checkpoint_id
+from app.schemas.new_chat import RegenerateRequest
+
+
+def _cp(checkpoint_id: str, turn_id: str | None) -> SimpleNamespace:
+    """Build a fake ``CheckpointTuple`` with the metadata shape we read."""
+    return SimpleNamespace(
+        config={"configurable": {"checkpoint_id": checkpoint_id}},
+        metadata={"turn_id": turn_id} if turn_id is not None else {},
+    )
+
+
+class TestFindPreTurnCheckpointId:
+    def test_returns_last_pre_turn_checkpoint_when_editing_latest_turn(self) -> None:
+        # Newest-first: T2 is the most-recent turn. The latest non-T2
+        # checkpoint (cp2) is the rewind target — state immediately
+        # before T2 began.
+        tuples = [
+            _cp("cp4", "T2"),
+            _cp("cp3", "T2"),
+            _cp("cp2", "T1"),
+            _cp("cp1", "T1"),
+        ]
+        assert _find_pre_turn_checkpoint_id(tuples, turn_id="T2") == "cp2"
+
+    def test_returns_pre_turn_checkpoint_when_later_turns_exist(self) -> None:
+        # Regression for the bug where walking newest-first returned the
+        # FIRST cp with ``turn_id != target`` — which is one of the
+        # later-turn checkpoints, NOT the pre-turn boundary. Editing
+        # T2 must rewind to the latest T1 checkpoint (cp2), not to the
+        # latest T3 checkpoint (cp6).
+        tuples = [
+            _cp("cp6", "T3"),
+            _cp("cp5", "T3"),
+            _cp("cp4", "T2"),
+            _cp("cp3", "T2"),
+            _cp("cp2", "T1"),
+            _cp("cp1", "T1"),
+        ]
+        assert _find_pre_turn_checkpoint_id(tuples, turn_id="T2") == "cp2"
+
+    def test_returns_none_when_editing_first_turn(self) -> None:
+        # No pre-turn boundary exists; caller is expected to fall back
+        # to the oldest checkpoint or special-case "first turn of the
+        # thread".
+        tuples = [
+            _cp("cp4", "T2"),
+            _cp("cp3", "T2"),
+            _cp("cp2", "T1"),
+            _cp("cp1", "T1"),
+        ]
+        assert _find_pre_turn_checkpoint_id(tuples, turn_id="T1") is None
+
+    def test_returns_none_when_only_edited_turn_present(self) -> None:
+        tuples = [_cp("cp2", "T2"), _cp("cp1", "T2")]
+        assert _find_pre_turn_checkpoint_id(tuples, turn_id="T2") is None
+
+    def test_returns_none_for_empty_history(self) -> None:
+        assert _find_pre_turn_checkpoint_id([], turn_id="T1") is None
+
+    def test_legacy_checkpoints_without_turn_id_count_as_pre_turn(self) -> None:
+        # Checkpoints written before migration 136 have no
+        # ``metadata.turn_id``. They should be eligible rewind targets
+        # — they came before the
+        # edited turn began.
+        tuples = [
+            _cp("cp3", "T2"),
+            SimpleNamespace(
+                config={"configurable": {"checkpoint_id": "cp2"}},
+                metadata=None,
+            ),
+            _cp("cp1", "T1"),
+        ]
+        # Walking oldest-first: cp1(T1) tracked, cp2(legacy/None) tracked,
+        # then cp3(T2) crosses the boundary -> return cp2.
+        assert _find_pre_turn_checkpoint_id(tuples, turn_id="T2") == "cp2"
+
+    def test_skips_checkpoint_missing_checkpoint_id_in_config(self) -> None:
+        # If a checkpoint tuple's ``config["configurable"]`` is missing
+        # the ``checkpoint_id`` key (corrupt / partial), we keep the
+        # last known good target instead of crashing.
+        broken = SimpleNamespace(
+            config={"configurable": {}}, metadata={"turn_id": "T1"}
+        )
+        tuples = [
+            _cp("cp3", "T2"),
+            broken,
+            _cp("cp1", "T1"),
+        ]
+        # cp1(T1) tracked, broken skipped, cp3(T2) -> return cp1.
+        assert _find_pre_turn_checkpoint_id(tuples, turn_id="T2") == "cp1"
+
+
+class TestRegenerateRequestValidation:
+    def test_revert_actions_requires_from_message_id(self) -> None:
+        with pytest.raises(Exception) as exc:
+            RegenerateRequest(
+                search_space_id=1,
+                user_query="hi",
+                revert_actions=True,
+            )
+        msg = str(exc.value).lower()
+        assert "from_message_id" in msg
+
+    def test_from_message_id_without_revert_is_allowed(self) -> None:
+        req = RegenerateRequest(
+            search_space_id=1,
+            user_query="hi",
+            from_message_id=42,
+        )
+        assert req.from_message_id == 42
+        assert req.revert_actions is False
+
+    def test_revert_actions_with_from_message_id_passes(self) -> None:
+        req = RegenerateRequest(
+            search_space_id=1,
+            user_query="hi",
+            from_message_id=42,
+            revert_actions=True,
+        )
+        assert req.revert_actions is True
diff --git a/surfsense_backend/tests/unit/routes/test_revert_turn_route.py b/surfsense_backend/tests/unit/routes/test_revert_turn_route.py
new file mode 100644
index 0000000000..1e1cbffb31
--- /dev/null
+++ b/surfsense_backend/tests/unit/routes/test_revert_turn_route.py
@@ -0,0 +1,530 @@
+"""Unit tests for ``POST /threads/{id}/revert-turn/{chat_turn_id}``.
+
+The per-turn batch revert route walks rows in reverse ``created_at``
+order, reverts each independently, and returns a per-action result
+list. Partial success is normal — the response status
+is ``"partial"`` whenever any row could not be reverted, but we never
+collapse the whole batch into a 4xx.
+
+These tests stub ``load_thread`` / ``revert_action`` and feed a fake
+session, so they exercise the route's dispatch logic without a real DB.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from app.agents.new_chat.feature_flags import AgentFeatureFlags
+from app.routes import agent_revert_route
+from app.services.revert_service import RevertOutcome
+
+
+@dataclass
+class _FakeAction:
+    id: int
+    tool_name: str
+    user_id: str | None = "u1"
+    reverse_of: int | None = None
+    error: dict | None = None
+
+
+@dataclass
+class _FakeUser:
+    id: str = "u1"
+
+
+@dataclass
+class _ScalarResult:
+    rows: list[Any]
+
+    def first(self) -> Any:
+        return self.rows[0] if self.rows else None
+
+    def all(self) -> list[Any]:
+        return list(self.rows)
+
+
+@dataclass
+class _Result:
+    rows: list[Any] = field(default_factory=list)
+
+    def scalars(self) -> _ScalarResult:
+        return _ScalarResult(self.rows)
+
+    def all(self) -> list[Any]:
+        # ``_was_already_reverted_batch`` calls ``.all()`` directly on
+        # the row-tuple result (no ``.scalars()`` indirection). The
+        # rows queued for that helper are list[(revert_id, original_id)].
+        return list(self.rows)
+
+
+class _FakeNestedCtx:
+    """Async context manager that mimics ``session.begin_nested()``.
+
+    The route raises a sentinel exception inside this block to roll back
+    bad rows. We just pass the exception through.
+    """
+
+    async def __aenter__(self) -> _FakeNestedCtx:
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb) -> bool:
+        # Returning False (or None) propagates the exception; the route
+        # catches its own sentinel above this layer.
+        return False
+
+
+class _FakeSession:
+    """Minimal AsyncSession stand-in for the revert-turn route.
+
+    Holds a queue of result objects; each ``execute(...)`` pops the next
+    one. The route calls ``execute`` exactly once per query so this maps
+    cleanly onto the assertion order of the test.
+    """
+
+    def __init__(self) -> None:
+        self._results: list[_Result] = []
+        self.committed = False
+        self.rolled_back = False
+        # Count execute() calls to assert "no N+1 reverts".
+        self.execute_call_count = 0
+
+    def queue(self, *results: _Result) -> None:
+        self._results.extend(results)
+
+    async def execute(self, _stmt: Any) -> _Result:
+        self.execute_call_count += 1
+        if not self._results:
+            return _Result(rows=[])
+        return self._results.pop(0)
+
+    def begin_nested(self) -> _FakeNestedCtx:
+        return _FakeNestedCtx()
+
+    async def commit(self) -> None:
+        self.committed = True
+
+    async def rollback(self) -> None:
+        self.rolled_back = True
+
+
+def _enabled_flags() -> AgentFeatureFlags:
+    return AgentFeatureFlags(
+        disable_new_agent_stack=False,
+        enable_action_log=True,
+        enable_revert_route=True,
+    )
+
+
+@pytest.fixture
+def patch_get_flags():
+    def _patch(flags: AgentFeatureFlags):
+        return patch(
+            "app.routes.agent_revert_route.get_flags",
+            return_value=flags,
+        )
+
+    return _patch
+
+
+class TestFlagGuard:
+    @pytest.mark.asyncio
+    async def test_returns_503_when_revert_route_disabled(
+        self, patch_get_flags
+    ) -> None:
+        flags = AgentFeatureFlags(
+            disable_new_agent_stack=False,
+            enable_action_log=True,
+            enable_revert_route=False,
+        )
+        session = _FakeSession()
+        with patch_get_flags(flags), pytest.raises(Exception) as exc:
+            await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="42:1700000000000",
+                session=session,
+                user=_FakeUser(),
+            )
+        assert getattr(exc.value, "status_code", None) == 503
+
+
+class TestRevertTurnDispatch:
+    @pytest.mark.asyncio
+    async def test_empty_turn_returns_ok_with_no_rows(self, patch_get_flags) -> None:
+        session = _FakeSession()
+        session.queue(_Result(rows=[]))  # rows query returns nothing
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-empty",
+                session=session,
+                user=_FakeUser(),
+            )
+        assert response.status == "ok"
+        assert response.total == 0
+        assert response.results == []
+        assert session.committed is True
+
+    @pytest.mark.asyncio
+    async def test_walks_rows_in_reverse_and_reverts_each(
+        self, patch_get_flags
+    ) -> None:
+        rows = [
+            _FakeAction(id=10, tool_name="rm"),
+            _FakeAction(id=9, tool_name="write_file"),
+            _FakeAction(id=8, tool_name="mkdir"),
+        ]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+        # Single batched ``_was_already_reverted_batch`` probe replaces
+        # the previous N per-row SELECTs.
+        session.queue(_Result(rows=[]))
+
+        async def _fake_revert(_session, *, action, requester_user_id):
+            return RevertOutcome(
+                status="ok",
+                message=f"reverted-{action.id}",
+                new_action_id=100 + action.id,
+            )
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(
+                agent_revert_route, "revert_action", AsyncMock(side_effect=_fake_revert)
+            ),
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-3",
+                session=session,
+                user=_FakeUser(),
+            )
+
+        assert response.status == "ok"
+        assert response.total == 3
+        assert response.reverted == 3
+        assert [r.action_id for r in response.results] == [10, 9, 8]
+        assert all(r.status == "reverted" for r in response.results)
+        assert response.results[0].new_action_id == 110
+        # Only TWO ``execute`` calls regardless of the row count: one
+        # for the rows query, one for the batched
+        # ``_was_already_reverted_batch`` probe. Regression guard
+        # against re-introducing the per-row N+1 lookup.
+        assert session.execute_call_count == 2, (
+            "revert-turn loop must batch idempotency probes; got "
+            f"{session.execute_call_count} execute() calls (expected 2)."
+        )
+
+    @pytest.mark.asyncio
+    async def test_already_reverted_rows_are_marked_idempotent(
+        self, patch_get_flags
+    ) -> None:
+        rows = [_FakeAction(id=5, tool_name="edit_file")]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+        # Batch probe returns ``[(revert_id, original_id)]``.
+        session.queue(_Result(rows=[(42, 5)]))
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(agent_revert_route, "revert_action", AsyncMock()) as revert,
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-i",
+                session=session,
+                user=_FakeUser(),
+            )
+        assert response.status == "ok"
+        assert response.already_reverted == 1
+        assert response.results[0].status == "already_reverted"
+        assert response.results[0].new_action_id == 42
+        revert.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_revert_action_skips_existing_revert_rows(
+        self, patch_get_flags
+    ) -> None:
+        rows = [_FakeAction(id=99, tool_name="_revert:edit_file", reverse_of=42)]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(agent_revert_route, "revert_action", AsyncMock()) as revert,
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-rev",
+                session=session,
+                user=_FakeUser(),
+            )
+        assert response.status == "ok"
+        assert response.results[0].status == "skipped"
+        revert.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_partial_success_when_some_rows_not_reversible(
+        self, patch_get_flags
+    ) -> None:
+        rows = [
+            _FakeAction(id=2, tool_name="send_email"),
+            _FakeAction(id=1, tool_name="edit_file"),
+        ]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+        # Single batched idempotency probe.
+        session.queue(_Result(rows=[]))
+
+        async def _fake_revert(_session, *, action, requester_user_id):
+            if action.tool_name == "send_email":
+                return RevertOutcome(
+                    status="not_reversible",
+                    message="connector revert not yet implemented",
+                )
+            return RevertOutcome(status="ok", message="ok", new_action_id=500)
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(
+                agent_revert_route, "revert_action", AsyncMock(side_effect=_fake_revert)
+            ),
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-mix",
+                session=session,
+                user=_FakeUser(),
+            )
+        assert response.status == "partial"
+        assert response.reverted == 1
+        assert response.not_reversible == 1
+        statuses = sorted(r.status for r in response.results)
+        assert statuses == ["not_reversible", "reverted"]
+
+    @pytest.mark.asyncio
+    async def test_unexpected_exception_marks_row_failed_not_batch(
+        self, patch_get_flags
+    ) -> None:
+        rows = [
+            _FakeAction(id=20, tool_name="edit_file"),
+            _FakeAction(id=21, tool_name="edit_file"),
+        ]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+        # Single batched idempotency probe.
+        session.queue(_Result(rows=[]))
+
+        async def _fake_revert(_session, *, action, requester_user_id):
+            if action.id == 20:
+                raise RuntimeError("disk on fire")
+            return RevertOutcome(status="ok", message="ok", new_action_id=999)
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(
+                agent_revert_route, "revert_action", AsyncMock(side_effect=_fake_revert)
+            ),
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-fail",
+                session=session,
+                user=_FakeUser(),
+            )
+        assert response.status == "partial"
+        assert response.failed == 1
+        assert response.reverted == 1
+        bad = next(r for r in response.results if r.action_id == 20)
+        assert bad.status == "failed"
+        assert "disk on fire" in (bad.error or "")
+        good = next(r for r in response.results if r.action_id == 21)
+        assert good.status == "reverted"
+
+    @pytest.mark.asyncio
+    async def test_permission_denied_when_other_user_owns_action(
+        self, patch_get_flags
+    ) -> None:
+        rows = [_FakeAction(id=7, tool_name="edit_file", user_id="someone-else")]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+        # Batch idempotency probe (no prior reverts).
+        session.queue(_Result(rows=[]))
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(agent_revert_route, "revert_action", AsyncMock()) as revert,
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-perm",
+                session=session,
+                user=_FakeUser(id="not-owner"),
+            )
+        assert response.status == "partial"
+        assert response.results[0].status == "permission_denied"
+        # ``permission_denied`` has its own dedicated counter so the
+        # response invariant ``total == sum(counters)`` always holds
+        # without overloading ``not_reversible`` (which historically
+        # absorbed this case and confused frontend toasts).
+        assert response.permission_denied == 1
+        assert response.not_reversible == 0
+        revert.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_counter_invariant_holds_across_mixed_outcomes(
+        self, patch_get_flags
+    ) -> None:
+        """Every row is accounted for in EXACTLY ONE counter.
+
+        Mixes one of every supported outcome (reverted, already_reverted,
+        not_reversible, permission_denied, failed, skipped) and asserts
+        that the sum of counters equals ``response.total``.
+        """
+        rows = [
+            _FakeAction(id=10, tool_name="edit_file"),  # ok
+            _FakeAction(id=9, tool_name="edit_file"),  # already_reverted
+            _FakeAction(id=8, tool_name="send_email"),  # not_reversible
+            _FakeAction(id=7, tool_name="rm", user_id="other"),  # permission_denied
+            _FakeAction(id=6, tool_name="edit_file"),  # failed
+            _FakeAction(id=5, tool_name="_revert:edit_file", reverse_of=99),  # skipped
+        ]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+        # Single batched probe; only id=9 has a prior revert.
+        # Schema: list[(revert_id, original_id)].
+        session.queue(_Result(rows=[(42, 9)]))
+
+        async def _fake_revert(_session, *, action, requester_user_id):
+            if action.id == 10:
+                return RevertOutcome(status="ok", message="ok", new_action_id=500)
+            if action.id == 8:
+                return RevertOutcome(
+                    status="not_reversible",
+                    message="connector revert not yet implemented",
+                )
+            if action.id == 6:
+                raise RuntimeError("boom")
+            raise AssertionError(f"unexpected revert call for {action.id}")
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(
+                agent_revert_route,
+                "revert_action",
+                AsyncMock(side_effect=_fake_revert),
+            ),
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-mixed-all",
+                session=session,
+                user=_FakeUser(),  # only id=7 has a different user_id
+            )
+
+        assert response.total == len(rows) == 6
+        bucket_sum = (
+            response.reverted
+            + response.already_reverted
+            + response.not_reversible
+            + response.permission_denied
+            + response.failed
+            + response.skipped
+        )
+        assert bucket_sum == response.total, (
+            "Counter invariant broken: total "
+            f"({response.total}) != sum of counters ({bucket_sum}). "
+            f"Counters: reverted={response.reverted}, "
+            f"already_reverted={response.already_reverted}, "
+            f"not_reversible={response.not_reversible}, "
+            f"permission_denied={response.permission_denied}, "
+            f"failed={response.failed}, skipped={response.skipped}"
+        )
+        assert response.reverted == 1
+        assert response.already_reverted == 1
+        assert response.not_reversible == 1
+        assert response.permission_denied == 1
+        assert response.failed == 1
+        assert response.skipped == 1
+
+    @pytest.mark.asyncio
+    async def test_integrity_error_translates_to_already_reverted(
+        self, patch_get_flags
+    ) -> None:
+        """The partial unique index on ``reverse_of`` raises
+        ``IntegrityError`` when a concurrent revert wins the race against
+        the pre-flight ``_was_already_reverted`` SELECT. The route MUST
+        recover by re-querying for the winning revert id and returning
+        ``status="already_reverted"`` (not ``"failed"``) so racing
+        clients see consistent idempotent semantics.
+        """
+        from sqlalchemy.exc import IntegrityError
+
+        rows = [_FakeAction(id=33, tool_name="edit_file")]
+        session = _FakeSession()
+        session.queue(_Result(rows=rows))
+        # Batch pre-flight probe: nothing yet (we'll race).
+        session.queue(_Result(rows=[]))
+        # Post-IntegrityError fallback uses the SCALAR
+        # ``_was_already_reverted`` (single-id lookup) so it pulls
+        # ``[777]`` via ``.scalars().first()``.
+        session.queue(_Result(rows=[777]))
+
+        async def _racing_revert(_session, *, action, requester_user_id):
+            raise IntegrityError("INSERT", {}, Exception("dup reverse_of"))
+
+        with (
+            patch_get_flags(_enabled_flags()),
+            patch.object(
+                agent_revert_route, "load_thread", AsyncMock(return_value=object())
+            ),
+            patch.object(
+                agent_revert_route,
+                "revert_action",
+                AsyncMock(side_effect=_racing_revert),
+            ),
+        ):
+            response = await agent_revert_route.revert_agent_turn(
+                thread_id=1,
+                chat_turn_id="ct-race",
+                session=session,
+                user=_FakeUser(),
+            )
+
+        assert response.failed == 0, (
+            "IntegrityError must NOT surface as a failed row; the unique "
+            "index is the durable expression of idempotency."
+        )
+        assert response.already_reverted == 1
+        assert response.results[0].status == "already_reverted"
+        assert response.results[0].new_action_id == 777
diff --git a/surfsense_backend/tests/unit/services/streaming/__init__.py b/surfsense_backend/tests/unit/services/streaming/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/services/streaming/test_emitter.py b/surfsense_backend/tests/unit/services/streaming/test_emitter.py
new file mode 100644
index 0000000000..6c4e1ff58a
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/streaming/test_emitter.py
@@ -0,0 +1,79 @@
+"""Pin the wire compactness rule and the top-level ``emitted_by`` field name."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.services.streaming.emitter import (
+    Emitter,
+    attach_emitted_by,
+    main_emitter,
+    subagent_emitter,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def test_main_emitter_payload_contains_only_level() -> None:
+    payload = main_emitter().to_payload()
+    assert payload == {"level": "main"}
+
+
+def test_subagent_emitter_payload_includes_all_set_fields() -> None:
+    payload = subagent_emitter(
+        subagent_type="deliverables",
+        subagent_run_id="subagent_abc",
+        parent_tool_call_id="call_xyz",
+    ).to_payload()
+    assert payload == {
+        "level": "subagent",
+        "subagent_type": "deliverables",
+        "subagent_run_id": "subagent_abc",
+        "parent_tool_call_id": "call_xyz",
+    }
+
+
+def test_subagent_emitter_payload_omits_unset_optional_fields() -> None:
+    """parent_tool_call_id is None when the run is started outside a tool boundary."""
+    payload = Emitter(
+        level="subagent",
+        subagent_type="email",
+        subagent_run_id="subagent_1",
+    ).to_payload()
+    assert "parent_tool_call_id" not in payload
+    assert payload["subagent_type"] == "email"
+
+
+def test_extra_fields_merge_into_payload() -> None:
+    """Future extension fields (e.g. lane colour, label) flow through ``extra``."""
+    emitter = subagent_emitter(
+        subagent_type="search",
+        subagent_run_id="r1",
+        extra={"label": "Web Search"},
+    )
+    assert emitter.to_payload()["label"] == "Web Search"
+
+
+def test_attach_emitted_by_with_none_is_noop() -> None:
+    payload = {"type": "text-delta", "delta": "hi"}
+    result = attach_emitted_by(payload, None)
+    assert "emitted_by" not in result
+    assert result is payload
+
+
+def test_attach_emitted_by_adds_payload_under_snake_case_top_level_key() -> None:
+    payload = {"type": "text-delta", "delta": "hi"}
+    attach_emitted_by(
+        payload,
+        subagent_emitter(
+            subagent_type="x",
+            subagent_run_id="y",
+            parent_tool_call_id="z",
+        ),
+    )
+    assert payload["emitted_by"] == {
+        "level": "subagent",
+        "subagent_type": "x",
+        "subagent_run_id": "y",
+        "parent_tool_call_id": "z",
+    }
diff --git a/surfsense_backend/tests/unit/services/streaming/test_emitter_registry.py b/surfsense_backend/tests/unit/services/streaming/test_emitter_registry.py
new file mode 100644
index 0000000000..e459c946a3
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/streaming/test_emitter_registry.py
@@ -0,0 +1,111 @@
+"""Pin the parent_ids walk + parallel sub-agent isolation that drives lane attribution."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.services.streaming.emitter import (
+    Emitter,
+    EmitterRegistry,
+    main_emitter,
+    subagent_emitter,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _sub(run_id: str, kind: str = "deliverables") -> Emitter:
+    return subagent_emitter(
+        subagent_type=kind,
+        subagent_run_id=f"sub_{run_id}",
+        parent_tool_call_id=f"call_{run_id}",
+    )
+
+
+def test_unregistered_event_resolves_to_main_emitter() -> None:
+    registry = EmitterRegistry()
+    resolved = registry.resolve(run_id="run_1", parent_ids=["root"])
+    assert resolved is main_emitter()
+
+
+def test_event_owned_by_registered_run_id_returns_that_emitter() -> None:
+    registry = EmitterRegistry()
+    emitter = _sub("a")
+    registry.register("run_task_a", emitter)
+    assert registry.resolve(run_id="run_task_a", parent_ids=[]) is emitter
+
+
+def test_descendant_resolves_via_parent_ids_chain() -> None:
+    """A model-call event nested under the task tool inherits its sub-agent emitter."""
+    registry = EmitterRegistry()
+    emitter = _sub("a")
+    registry.register("run_task_a", emitter)
+    descendant = registry.resolve(
+        run_id="run_chat_model",
+        parent_ids=["root", "run_agent", "run_task_a"],
+    )
+    assert descendant is emitter
+
+
+def test_nearest_registered_ancestor_wins_over_distant_ones() -> None:
+    """Inner sub-agents owe their emitter to the nearest task tool, not the outer one."""
+    registry = EmitterRegistry()
+    outer = _sub("outer", kind="planner")
+    inner = _sub("inner", kind="email")
+    registry.register("run_outer", outer)
+    registry.register("run_inner", inner)
+    resolved = registry.resolve(
+        run_id="run_inner_tool",
+        parent_ids=["root", "run_outer", "run_inner"],
+    )
+    assert resolved is inner
+
+
+def test_parallel_subagents_do_not_bleed_into_each_other() -> None:
+    """Two concurrent task tools each own their own descendant events."""
+    registry = EmitterRegistry()
+    a = _sub("a", kind="search")
+    b = _sub("b", kind="email")
+    registry.register("run_task_a", a)
+    registry.register("run_task_b", b)
+
+    from_a = registry.resolve(run_id="x", parent_ids=["root", "run_task_a"])
+    from_b = registry.resolve(run_id="y", parent_ids=["root", "run_task_b"])
+    from_main = registry.resolve(run_id="z", parent_ids=["root"])
+
+    assert from_a is a
+    assert from_b is b
+    assert from_main is main_emitter()
+
+
+def test_unregister_releases_run_id_so_descendants_fall_back_to_main() -> None:
+    registry = EmitterRegistry()
+    emitter = _sub("a")
+    registry.register("run_task_a", emitter)
+    registry.unregister("run_task_a")
+    assert registry.resolve(run_id="x", parent_ids=["run_task_a"]) is main_emitter()
+
+
+def test_unregister_returns_the_previously_registered_emitter() -> None:
+    """Lets callers emit ``data-subagent-finish`` carrying the same emitter they opened with."""
+    registry = EmitterRegistry()
+    emitter = _sub("a")
+    registry.register("run_task_a", emitter)
+    assert registry.unregister("run_task_a") is emitter
+
+
+def test_has_active_subagents_tracks_open_lanes() -> None:
+    registry = EmitterRegistry()
+    assert not registry.has_active_subagents()
+    registry.register("run_task_a", _sub("a"))
+    assert registry.has_active_subagents()
+    registry.unregister("run_task_a")
+    assert not registry.has_active_subagents()
+
+
+def test_empty_run_id_and_parent_ids_resolves_to_main() -> None:
+    """Defensive: events without identifiers always belong to the main lane."""
+    registry = EmitterRegistry()
+    registry.register("run_task_a", _sub("a"))
+    assert registry.resolve(run_id=None, parent_ids=None) is main_emitter()
+    assert registry.resolve(run_id="", parent_ids=[]) is main_emitter()
diff --git a/surfsense_backend/tests/unit/services/streaming/test_interrupt_correlation.py b/surfsense_backend/tests/unit/services/streaming/test_interrupt_correlation.py
new file mode 100644
index 0000000000..df89ca59af
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/streaming/test_interrupt_correlation.py
@@ -0,0 +1,158 @@
+"""Pin id-aware pending-interrupt lookup that replaces the buggy first-wins."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+import pytest
+
+from app.services.streaming.interrupt_correlation import (
+    PendingInterrupt,
+    first_pending_interrupt,
+    get_pending_interrupt_by_id,
+    get_pending_interrupt_for_tool_call,
+    list_pending_interrupts,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@dataclass
+class _Interrupt:
+    value: dict[str, Any]
+    id: str | None = None
+
+
+@dataclass
+class _Task:
+    interrupts: tuple[_Interrupt, ...] = ()
+    id: str | None = None
+
+
+@dataclass
+class _State:
+    tasks: tuple[_Task, ...] = ()
+    interrupts: tuple[_Interrupt, ...] = ()
+
+
+def _hitl(name: str, tool_call_id: str | None = None) -> dict[str, Any]:
+    """Minimal LangChain HITLRequest payload for one action."""
+    action: dict[str, Any] = {"name": name, "args": {}}
+    if tool_call_id is not None:
+        action["tool_call_id"] = tool_call_id
+    return {
+        "action_requests": [action],
+        "review_configs": [{"action_name": name, "allowed_decisions": ["approve"]}],
+    }
+
+
+def test_empty_state_has_no_pending_interrupts() -> None:
+    state = _State()
+    assert list_pending_interrupts(state) == []
+    assert first_pending_interrupt(state) is None
+
+
+def test_single_pending_interrupt_in_task_is_returned() -> None:
+    state = _State(
+        tasks=(
+            _Task(
+                id="task_1",
+                interrupts=(_Interrupt(value=_hitl("send_email"), id="int_1"),),
+            ),
+        )
+    )
+    pending = list_pending_interrupts(state)
+    assert len(pending) == 1
+    assert pending[0] == PendingInterrupt(
+        interrupt_id="int_1",
+        value=_hitl("send_email"),
+        source_task_id="task_1",
+    )
+
+
+def test_pending_interrupts_returned_in_task_then_root_order() -> None:
+    """Determinism matters: callers iterate in this order to render the UI."""
+    state = _State(
+        tasks=(
+            _Task(
+                id="task_a",
+                interrupts=(_Interrupt(value=_hitl("a"), id="int_a"),),
+            ),
+            _Task(
+                id="task_b",
+                interrupts=(_Interrupt(value=_hitl("b"), id="int_b"),),
+            ),
+        ),
+        interrupts=(_Interrupt(value=_hitl("c"), id="int_c"),),
+    )
+    pending = list_pending_interrupts(state)
+    ids = [p.interrupt_id for p in pending]
+    assert ids == ["int_a", "int_b", "int_c"]
+
+
+def test_get_by_id_finds_the_right_interrupt_under_parallel_load() -> None:
+    """Replacing first-wins: id-aware lookup MUST pick the requested one."""
+    state = _State(
+        tasks=(
+            _Task(interrupts=(_Interrupt(value=_hitl("a"), id="int_a"),)),
+            _Task(interrupts=(_Interrupt(value=_hitl("b"), id="int_b"),)),
+            _Task(interrupts=(_Interrupt(value=_hitl("c"), id="int_c"),)),
+        )
+    )
+    found = get_pending_interrupt_by_id(state, "int_b")
+    assert found is not None
+    assert found.value["action_requests"][0]["name"] == "b"
+
+
+def test_get_by_id_returns_none_when_id_is_not_pending() -> None:
+    state = _State(
+        tasks=(_Task(interrupts=(_Interrupt(value=_hitl("a"), id="int_a"),)),)
+    )
+    assert get_pending_interrupt_by_id(state, "missing") is None
+
+
+def test_get_by_tool_call_id_matches_action_request_payload() -> None:
+    """HITLRequest carries ``tool_call_id`` per action; lookup uses that."""
+    state = _State(
+        tasks=(
+            _Task(
+                interrupts=(
+                    _Interrupt(value=_hitl("a", tool_call_id="call_xxx"), id="int_a"),
+                    _Interrupt(value=_hitl("b", tool_call_id="call_yyy"), id="int_b"),
+                )
+            ),
+        )
+    )
+    found = get_pending_interrupt_for_tool_call(state, "call_yyy")
+    assert found is not None
+    assert found.interrupt_id == "int_b"
+
+
+def test_first_pending_interrupt_matches_legacy_first_wins_behaviour() -> None:
+    """Sequential-turn safety: the explicit shortcut still returns the first."""
+    state = _State(
+        tasks=(_Task(interrupts=(_Interrupt(value=_hitl("first"), id="int_1"),)),),
+        interrupts=(_Interrupt(value=_hitl("second"), id="int_2"),),
+    )
+    first = first_pending_interrupt(state)
+    assert first is not None
+    assert first.interrupt_id == "int_1"
+
+
+def test_interrupt_without_id_falls_back_to_none() -> None:
+    """Snapshots from older LangGraph versions may omit ``id`` — preserve that."""
+    state = _State(tasks=(_Task(interrupts=(_Interrupt(value=_hitl("a"), id=None),)),))
+    pending = list_pending_interrupts(state)
+    assert len(pending) == 1
+    assert pending[0].interrupt_id is None
+
+
+def test_non_dict_interrupt_values_are_ignored() -> None:
+    """Defensive: a non-dict value should not crash the iteration."""
+
+    class _Raw:
+        value = "not a dict"
+
+    state = _State(tasks=(_Task(interrupts=(_Raw(),)),))  # type: ignore[arg-type]
+    assert list_pending_interrupts(state) == []
diff --git a/surfsense_backend/tests/unit/services/streaming/test_interrupt_events.py b/surfsense_backend/tests/unit/services/streaming/test_interrupt_events.py
new file mode 100644
index 0000000000..a6685b5248
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/streaming/test_interrupt_events.py
@@ -0,0 +1,89 @@
+"""Pin interrupt-payload normalisation and the optional correlation fields on the wire."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from app.services.streaming.events.interrupt import (
+    format_interrupt_request,
+    normalize_interrupt_payload,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _decode(frame: str) -> dict:
+    body = frame.removeprefix("data: ").removesuffix("\n\n")
+    return json.loads(body)
+
+
+def test_hitlrequest_shape_is_passed_through_unchanged() -> None:
+    raw = {
+        "action_requests": [{"name": "send_email", "args": {"to": "a@b"}}],
+        "review_configs": [
+            {"action_name": "send_email", "allowed_decisions": ["approve"]}
+        ],
+    }
+    assert normalize_interrupt_payload(raw) == raw
+
+
+def test_custom_interrupt_primitive_is_converted_to_canonical_shape() -> None:
+    raw = {
+        "type": "permission",
+        "message": "Allow send?",
+        "action": {"tool": "send_email", "params": {"to": "a@b"}},
+        "context": {"reason": "destructive"},
+    }
+    out = normalize_interrupt_payload(raw)
+    assert out["action_requests"] == [{"name": "send_email", "args": {"to": "a@b"}}]
+    assert out["review_configs"] == [
+        {
+            "action_name": "send_email",
+            "allowed_decisions": ["approve", "edit", "reject"],
+        }
+    ]
+    assert out["interrupt_type"] == "permission"
+    assert out["message"] == "Allow send?"
+    assert out["context"] == {"reason": "destructive"}
+
+
+def test_custom_interrupt_without_message_omits_message_key() -> None:
+    """Optional fields stay optional on the wire; FE does not see ``"message": None``."""
+    raw = {"action": {"tool": "send_email"}}
+    out = normalize_interrupt_payload(raw)
+    assert "message" not in out
+
+
+def test_custom_interrupt_without_tool_falls_back_to_unknown_tool() -> None:
+    """Defensive: a malformed ``action`` block must not crash the relay."""
+    out = normalize_interrupt_payload({"type": "x", "action": {}})
+    assert out["action_requests"][0]["name"] == "unknown_tool"
+    assert out["review_configs"][0]["action_name"] == "unknown_tool"
+
+
+def test_format_interrupt_request_carries_correlation_fields_on_the_wire() -> None:
+    frame = format_interrupt_request(
+        {"action_requests": [], "review_configs": []},
+        interrupt_id="int_42",
+        pending_interrupt_count=3,
+        chat_turn_id="turn_99",
+    )
+    payload = _decode(frame)
+    assert payload["type"] == "data-interrupt-request"
+    inner = payload["data"]
+    assert inner["interrupt_id"] == "int_42"
+    assert inner["pending_interrupt_count"] == 3
+    assert inner["chat_turn_id"] == "turn_99"
+
+
+def test_format_interrupt_request_omits_correlation_fields_when_unset() -> None:
+    """Backward compat: legacy single-interrupt callers don't have to supply ids."""
+    frame = format_interrupt_request(
+        {"action_requests": [], "review_configs": []},
+    )
+    inner = _decode(frame)["data"]
+    assert "interrupt_id" not in inner
+    assert "pending_interrupt_count" not in inner
+    assert "chat_turn_id" not in inner
diff --git a/surfsense_backend/tests/unit/services/streaming/test_service_emitter_propagation.py b/surfsense_backend/tests/unit/services/streaming/test_service_emitter_propagation.py
new file mode 100644
index 0000000000..b381f13bcc
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/streaming/test_service_emitter_propagation.py
@@ -0,0 +1,142 @@
+"""Pin that sub-agent emitter reaches every wire event the relay emits."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from app.services.streaming.emitter import subagent_emitter
+from app.services.streaming.service import StreamingService
+
+pytestmark = pytest.mark.unit
+
+
+def _decode(frame: str) -> dict:
+    body = frame.removeprefix("data: ").removesuffix("\n\n")
+    return json.loads(body)
+
+
+@pytest.fixture
+def service() -> StreamingService:
+    return StreamingService()
+
+
+@pytest.fixture
+def sub_emitter():
+    return subagent_emitter(
+        subagent_type="deliverables",
+        subagent_run_id="sub_xyz",
+        parent_tool_call_id="call_parent",
+    )
+
+
+def test_text_delta_carries_subagent_emitter_on_the_wire(service, sub_emitter) -> None:
+    payload = _decode(service.format_text_delta("text_1", "hi", emitter=sub_emitter))
+    assert payload["emitted_by"]["subagent_run_id"] == "sub_xyz"
+    assert payload["delta"] == "hi"
+
+
+def test_reasoning_delta_carries_subagent_emitter_on_the_wire(
+    service, sub_emitter
+) -> None:
+    payload = _decode(
+        service.format_reasoning_delta("r_1", "thinking", emitter=sub_emitter)
+    )
+    assert payload["emitted_by"]["subagent_run_id"] == "sub_xyz"
+
+
+def test_tool_input_start_carries_subagent_emitter_and_lc_id(
+    service, sub_emitter
+) -> None:
+    payload = _decode(
+        service.format_tool_input_start(
+            "call_1",
+            "send_email",
+            langchain_tool_call_id="lc_1",
+            emitter=sub_emitter,
+        )
+    )
+    assert payload["emitted_by"]["subagent_type"] == "deliverables"
+    assert payload["langchainToolCallId"] == "lc_1"
+    assert payload["toolName"] == "send_email"
+
+
+def test_tool_output_available_carries_subagent_emitter(service, sub_emitter) -> None:
+    payload = _decode(
+        service.format_tool_output_available(
+            "call_1", {"ok": True}, emitter=sub_emitter
+        )
+    )
+    assert payload["emitted_by"]["subagent_run_id"] == "sub_xyz"
+    assert payload["output"] == {"ok": True}
+
+
+def test_thinking_step_carries_subagent_emitter(service, sub_emitter) -> None:
+    payload = _decode(
+        service.format_thinking_step(
+            step_id="s1",
+            title="Sending email",
+            status="in_progress",
+            emitter=sub_emitter,
+        )
+    )
+    assert payload["type"] == "data-thinking-step"
+    assert payload["emitted_by"]["subagent_run_id"] == "sub_xyz"
+
+
+def test_action_log_carries_subagent_emitter(service, sub_emitter) -> None:
+    payload = _decode(
+        service.format_action_log(
+            {"id": 1, "tool_name": "send_email", "reversible": False},
+            emitter=sub_emitter,
+        )
+    )
+    assert payload["emitted_by"]["subagent_run_id"] == "sub_xyz"
+    assert payload["data"]["tool_name"] == "send_email"
+
+
+def test_subagent_lifecycle_events_share_run_id_for_pairing(
+    service, sub_emitter
+) -> None:
+    start = _decode(
+        service.format_subagent_start(
+            subagent_run_id="sub_xyz",
+            subagent_type="deliverables",
+            parent_tool_call_id="call_parent",
+            emitter=sub_emitter,
+        )
+    )
+    finish = _decode(
+        service.format_subagent_finish(
+            subagent_run_id="sub_xyz",
+            subagent_type="deliverables",
+            parent_tool_call_id="call_parent",
+            emitter=sub_emitter,
+        )
+    )
+    assert start["data"]["subagent_run_id"] == finish["data"]["subagent_run_id"]
+    assert start["type"] == "data-subagent-start"
+    assert finish["type"] == "data-subagent-finish"
+
+
+def test_main_emitter_events_omit_emitted_by_field(service) -> None:
+    payload = _decode(service.format_text_delta("text_1", "hi"))
+    assert "emitted_by" not in payload
+
+
+def test_resolve_emitter_through_service_uses_registry(service, sub_emitter) -> None:
+    service.emitter_registry.register("run_task_1", sub_emitter)
+    resolved = service.resolve_emitter(
+        run_id="run_chat_model",
+        parent_ids=["root", "run_task_1"],
+    )
+    assert resolved is sub_emitter
+
+
+def test_message_id_is_assigned_on_message_start_and_reused(service) -> None:
+    frame = service.format_message_start()
+    payload = _decode(frame)
+    assigned = payload["messageId"]
+    assert assigned.startswith("msg_")
+    assert service.message_id == assigned
diff --git a/surfsense_backend/tests/unit/services/streaming/test_sse_envelope.py b/surfsense_backend/tests/unit/services/streaming/test_sse_envelope.py
new file mode 100644
index 0000000000..511e4575a1
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/streaming/test_sse_envelope.py
@@ -0,0 +1,51 @@
+"""Pin the exact SSE wire bytes the FE parser depends on."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from app.services.streaming.envelope import (
+    format_done,
+    format_sse,
+    get_response_headers,
+)
+
+pytestmark = pytest.mark.unit
+
+
+class TestFormatSse:
+    def test_dict_payload_is_json_serialised(self) -> None:
+        frame = format_sse({"type": "start", "messageId": "msg_1"})
+        assert frame.startswith("data: ")
+        assert frame.endswith("\n\n")
+        body = frame[len("data: ") : -2]
+        assert json.loads(body) == {"type": "start", "messageId": "msg_1"}
+
+    def test_string_payload_is_emitted_verbatim(self) -> None:
+        frame = format_sse('{"already":"json"}')
+        assert frame == 'data: {"already":"json"}\n\n'
+
+    def test_nested_payload_round_trips(self) -> None:
+        payload = {
+            "type": "data-action-log",
+            "data": {"id": 7, "tool_name": "ls", "reversible": False},
+        }
+        frame = format_sse(payload)
+        body = frame.removeprefix("data: ").removesuffix("\n\n")
+        assert json.loads(body) == payload
+
+
+class TestFormatDone:
+    def test_done_marker_is_literal(self) -> None:
+        assert format_done() == "data: [DONE]\n\n"
+
+
+class TestResponseHeaders:
+    def test_headers_pin_ai_sdk_v1_protocol(self) -> None:
+        headers = get_response_headers()
+        assert headers["Content-Type"] == "text/event-stream"
+        assert headers["Cache-Control"] == "no-cache"
+        assert headers["Connection"] == "keep-alive"
+        assert headers["x-vercel-ai-ui-message-stream"] == "v1"
diff --git a/surfsense_backend/tests/unit/services/test_agent_billing_resolver.py b/surfsense_backend/tests/unit/services/test_agent_billing_resolver.py
new file mode 100644
index 0000000000..fa8819b394
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_agent_billing_resolver.py
@@ -0,0 +1,436 @@
+"""Unit tests for ``_resolve_agent_billing_for_search_space``.
+
+Validates the resolver used by Celery podcast/video tasks to compute
+``(owner_user_id, billing_tier, base_model)`` from a search space and its
+agent LLM config. The resolver mirrors chat's billing-resolution pattern at
+``stream_new_chat.py:2294-2351`` and is the single integration point that
+prevents Auto-mode podcast/video from leaking premium credit.
+
+Coverage:
+
+* Auto mode + ``thread_id`` set, pin resolves to a negative-id premium
+  global → returns ``("premium", <base_model>)``.
+* Auto mode + ``thread_id`` set, pin resolves to a negative-id free
+  global → returns ``("free", <base_model>)``.
+* Auto mode + ``thread_id`` set, pin resolves to a positive-id BYOK config
+  → always ``"free"``.
+* Auto mode + ``thread_id=None`` → fallback to ``("free", "auto")`` without
+  hitting the pin service.
+* Negative id (no Auto) → uses ``get_global_llm_config``'s
+  ``billing_tier``.
+* Positive id (user BYOK) → always ``"free"``.
+* Search space not found → raises ``ValueError``.
+* ``agent_llm_id`` is None → raises ``ValueError``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from types import SimpleNamespace
+from uuid import UUID, uuid4
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeExecResult:
+    def __init__(self, obj):
+        self._obj = obj
+
+    def scalars(self):
+        return self
+
+    def first(self):
+        return self._obj
+
+
+class _FakeSession:
+    """Tiny AsyncSession stub.
+
+    ``responses`` is a list of objects to return from successive
+    ``execute()`` calls (in order). The resolver makes at most two
+    ``execute()`` calls (search-space lookup, then optionally NewLLMConfig
+    lookup), so two queued responses cover the matrix.
+    """
+
+    def __init__(self, responses: list):
+        self._responses = list(responses)
+
+    async def execute(self, _stmt):
+        if not self._responses:
+            return _FakeExecResult(None)
+        return _FakeExecResult(self._responses.pop(0))
+
+    async def commit(self) -> None:
+        pass
+
+
+@dataclass
+class _FakePinResolution:
+    resolved_llm_config_id: int
+    resolved_tier: str = "premium"
+    from_existing_pin: bool = False
+
+
+def _make_search_space(*, agent_llm_id: int | None, user_id: UUID) -> SimpleNamespace:
+    return SimpleNamespace(
+        id=42,
+        agent_llm_id=agent_llm_id,
+        user_id=user_id,
+    )
+
+
+def _make_byok_config(
+    *, id_: int, base_model: str | None = None, model_name: str = "gpt-byok"
+) -> SimpleNamespace:
+    return SimpleNamespace(
+        id=id_,
+        model_name=model_name,
+        litellm_params={"base_model": base_model} if base_model else {},
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_auto_mode_with_thread_id_resolves_to_premium_global(monkeypatch):
+    """Auto + thread → pin service resolves to negative-id premium config →
+    resolver returns ``("premium", <base_model>)``."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=0, user_id=user_id)])
+
+    # Mock the pin service to return a concrete premium config id.
+    async def _fake_resolve_pin(
+        sess,
+        *,
+        thread_id,
+        search_space_id,
+        user_id,
+        selected_llm_config_id,
+        force_repin_free=False,
+    ):
+        assert selected_llm_config_id == 0
+        assert thread_id == 99
+        return _FakePinResolution(resolved_llm_config_id=-1, resolved_tier="premium")
+
+    # Mock global config lookup to return a premium entry.
+    def _fake_get_global(cfg_id):
+        if cfg_id == -1:
+            return {
+                "id": -1,
+                "model_name": "gpt-5.4",
+                "billing_tier": "premium",
+                "litellm_params": {"base_model": "gpt-5.4"},
+            }
+        return None
+
+    # Lazy imports inside the resolver — patch the *target* modules so the
+    # imported names resolve to our fakes.
+    import app.services.auto_model_pin_service as pin_module
+    import app.services.llm_service as llm_module
+
+    monkeypatch.setattr(
+        pin_module, "resolve_or_get_pinned_llm_config_id", _fake_resolve_pin
+    )
+    monkeypatch.setattr(llm_module, "get_global_llm_config", _fake_get_global)
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42, thread_id=99
+    )
+
+    assert owner == user_id
+    assert tier == "premium"
+    assert base_model == "gpt-5.4"
+
+
+@pytest.mark.asyncio
+async def test_auto_mode_with_thread_id_resolves_to_free_global(monkeypatch):
+    """Auto + thread → pin returns negative-id free config → resolver
+    returns ``("free", <base_model>)``. Same path the pin service takes for
+    out-of-credit users (graceful degradation)."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=0, user_id=user_id)])
+
+    async def _fake_resolve_pin(
+        sess,
+        *,
+        thread_id,
+        search_space_id,
+        user_id,
+        selected_llm_config_id,
+        force_repin_free=False,
+    ):
+        return _FakePinResolution(resolved_llm_config_id=-3, resolved_tier="free")
+
+    def _fake_get_global(cfg_id):
+        if cfg_id == -3:
+            return {
+                "id": -3,
+                "model_name": "openrouter/free-model",
+                "billing_tier": "free",
+                "litellm_params": {"base_model": "openrouter/free-model"},
+            }
+        return None
+
+    import app.services.auto_model_pin_service as pin_module
+    import app.services.llm_service as llm_module
+
+    monkeypatch.setattr(
+        pin_module, "resolve_or_get_pinned_llm_config_id", _fake_resolve_pin
+    )
+    monkeypatch.setattr(llm_module, "get_global_llm_config", _fake_get_global)
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42, thread_id=99
+    )
+
+    assert owner == user_id
+    assert tier == "free"
+    assert base_model == "openrouter/free-model"
+
+
+@pytest.mark.asyncio
+async def test_auto_mode_with_thread_id_resolves_to_byok_is_free(monkeypatch):
+    """Auto + thread → pin returns positive-id BYOK config → resolver
+    returns ``("free", ...)`` (BYOK is always free per
+    ``AgentConfig.from_new_llm_config``)."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    search_space = _make_search_space(agent_llm_id=0, user_id=user_id)
+    byok_cfg = _make_byok_config(
+        id_=17, base_model="anthropic/claude-3-haiku", model_name="my-claude"
+    )
+    session = _FakeSession([search_space, byok_cfg])
+
+    async def _fake_resolve_pin(
+        sess,
+        *,
+        thread_id,
+        search_space_id,
+        user_id,
+        selected_llm_config_id,
+        force_repin_free=False,
+    ):
+        return _FakePinResolution(resolved_llm_config_id=17, resolved_tier="free")
+
+    import app.services.auto_model_pin_service as pin_module
+
+    monkeypatch.setattr(
+        pin_module, "resolve_or_get_pinned_llm_config_id", _fake_resolve_pin
+    )
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42, thread_id=99
+    )
+
+    assert owner == user_id
+    assert tier == "free"
+    assert base_model == "anthropic/claude-3-haiku"
+
+
+@pytest.mark.asyncio
+async def test_auto_mode_without_thread_id_falls_back_to_free():
+    """Auto + ``thread_id=None`` → ``("free", "auto")`` without invoking
+    the pin service. Forward-compat fallback for any future direct-API
+    entrypoint that doesn't have a chat thread."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=0, user_id=user_id)])
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42, thread_id=None
+    )
+
+    assert owner == user_id
+    assert tier == "free"
+    assert base_model == "auto"
+
+
+@pytest.mark.asyncio
+async def test_auto_mode_pin_failure_falls_back_to_free(monkeypatch):
+    """If the pin service raises ``ValueError`` (thread missing /
+    mismatched search space), the resolver should log and return free
+    rather than killing the whole task."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=0, user_id=user_id)])
+
+    async def _fake_resolve_pin(*args, **kwargs):
+        raise ValueError("thread missing")
+
+    import app.services.auto_model_pin_service as pin_module
+
+    monkeypatch.setattr(
+        pin_module, "resolve_or_get_pinned_llm_config_id", _fake_resolve_pin
+    )
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42, thread_id=99
+    )
+
+    assert owner == user_id
+    assert tier == "free"
+    assert base_model == "auto"
+
+
+@pytest.mark.asyncio
+async def test_negative_id_premium_global_returns_premium(monkeypatch):
+    """Explicit negative agent_llm_id → ``get_global_llm_config`` →
+    return its ``billing_tier``."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=-1, user_id=user_id)])
+
+    def _fake_get_global(cfg_id):
+        return {
+            "id": cfg_id,
+            "model_name": "gpt-5.4",
+            "billing_tier": "premium",
+            "litellm_params": {"base_model": "gpt-5.4"},
+        }
+
+    import app.services.llm_service as llm_module
+
+    monkeypatch.setattr(llm_module, "get_global_llm_config", _fake_get_global)
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42, thread_id=99
+    )
+
+    assert owner == user_id
+    assert tier == "premium"
+    assert base_model == "gpt-5.4"
+
+
+@pytest.mark.asyncio
+async def test_negative_id_free_global_returns_free(monkeypatch):
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=-2, user_id=user_id)])
+
+    def _fake_get_global(cfg_id):
+        return {
+            "id": cfg_id,
+            "model_name": "openrouter/some-free",
+            "billing_tier": "free",
+            "litellm_params": {"base_model": "openrouter/some-free"},
+        }
+
+    import app.services.llm_service as llm_module
+
+    monkeypatch.setattr(llm_module, "get_global_llm_config", _fake_get_global)
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42, thread_id=None
+    )
+
+    assert owner == user_id
+    assert tier == "free"
+    assert base_model == "openrouter/some-free"
+
+
+@pytest.mark.asyncio
+async def test_negative_id_missing_base_model_falls_back_to_model_name(monkeypatch):
+    """When the global config has no ``litellm_params.base_model``, the
+    resolver falls back to ``model_name`` — matching chat's behavior."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=-5, user_id=user_id)])
+
+    def _fake_get_global(cfg_id):
+        return {
+            "id": cfg_id,
+            "model_name": "fallback-model",
+            "billing_tier": "premium",
+            # No litellm_params.
+        }
+
+    import app.services.llm_service as llm_module
+
+    monkeypatch.setattr(llm_module, "get_global_llm_config", _fake_get_global)
+
+    _, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42
+    )
+
+    assert tier == "premium"
+    assert base_model == "fallback-model"
+
+
+@pytest.mark.asyncio
+async def test_positive_id_byok_is_always_free():
+    """Positive agent_llm_id → user-owned BYOK NewLLMConfig → always free,
+    regardless of underlying provider tier."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    search_space = _make_search_space(agent_llm_id=23, user_id=user_id)
+    byok_cfg = _make_byok_config(id_=23, base_model="anthropic/claude-3.5-sonnet")
+    session = _FakeSession([search_space, byok_cfg])
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42
+    )
+
+    assert owner == user_id
+    assert tier == "free"
+    assert base_model == "anthropic/claude-3.5-sonnet"
+
+
+@pytest.mark.asyncio
+async def test_positive_id_byok_missing_returns_free_with_empty_base_model():
+    """If the BYOK config row is missing/deleted but the search space still
+    points at it, the resolver still returns free (no debit) with an empty
+    base_model — billable_call's premium path is skipped, no harm done."""
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=99, user_id=user_id)])
+
+    owner, tier, base_model = await _resolve_agent_billing_for_search_space(
+        session, search_space_id=42
+    )
+
+    assert owner == user_id
+    assert tier == "free"
+    assert base_model == ""
+
+
+@pytest.mark.asyncio
+async def test_search_space_not_found_raises_value_error():
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    session = _FakeSession([None])
+
+    with pytest.raises(ValueError, match="Search space"):
+        await _resolve_agent_billing_for_search_space(session, search_space_id=999)
+
+
+@pytest.mark.asyncio
+async def test_agent_llm_id_none_raises_value_error():
+    from app.services.billable_calls import _resolve_agent_billing_for_search_space
+
+    user_id = uuid4()
+    session = _FakeSession([_make_search_space(agent_llm_id=None, user_id=user_id)])
+
+    with pytest.raises(ValueError, match="agent_llm_id"):
+        await _resolve_agent_billing_for_search_space(session, search_space_id=42)
diff --git a/surfsense_backend/tests/unit/services/test_ai_file_sort_service.py b/surfsense_backend/tests/unit/services/test_ai_file_sort_service.py
new file mode 100644
index 0000000000..860c2ffa27
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_ai_file_sort_service.py
@@ -0,0 +1,275 @@
+"""Unit tests for AI file sort service: folder label resolution, date extraction, category sanitization."""
+
+from datetime import UTC, datetime
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ── resolve_root_folder_label ──
+
+
+def _make_document(document_type: str, connector_id=None):
+    doc = MagicMock()
+    doc.document_type = document_type
+    doc.connector_id = connector_id
+    return doc
+
+
+def _make_connector(connector_type: str):
+    conn = MagicMock()
+    conn.connector_type = connector_type
+    return conn
+
+
+def test_root_label_uses_connector_type_when_available():
+    from app.services.ai_file_sort_service import resolve_root_folder_label
+
+    doc = _make_document("FILE", connector_id=1)
+    conn = _make_connector("GOOGLE_DRIVE_CONNECTOR")
+    assert resolve_root_folder_label(doc, conn) == "Google Drive"
+
+
+def test_root_label_falls_back_to_document_type():
+    from app.services.ai_file_sort_service import resolve_root_folder_label
+
+    doc = _make_document("SLACK_CONNECTOR")
+    assert resolve_root_folder_label(doc, None) == "Slack"
+
+
+def test_root_label_unknown_doctype_returns_raw_value():
+    from app.services.ai_file_sort_service import resolve_root_folder_label
+
+    doc = _make_document("UNKNOWN_TYPE")
+    assert resolve_root_folder_label(doc, None) == "UNKNOWN_TYPE"
+
+
+# ── resolve_date_folder ──
+
+
+def test_date_folder_from_updated_at():
+    from app.services.ai_file_sort_service import resolve_date_folder
+
+    doc = MagicMock()
+    doc.updated_at = datetime(2025, 3, 15, 10, 30, 0, tzinfo=UTC)
+    doc.created_at = datetime(2025, 1, 1, 0, 0, 0, tzinfo=UTC)
+    assert resolve_date_folder(doc) == "2025-03-15"
+
+
+def test_date_folder_falls_back_to_created_at():
+    from app.services.ai_file_sort_service import resolve_date_folder
+
+    doc = MagicMock()
+    doc.updated_at = None
+    doc.created_at = datetime(2024, 12, 25, 23, 59, 0, tzinfo=UTC)
+    assert resolve_date_folder(doc) == "2024-12-25"
+
+
+def test_date_folder_both_none_uses_today():
+    from app.services.ai_file_sort_service import resolve_date_folder
+
+    doc = MagicMock()
+    doc.updated_at = None
+    doc.created_at = None
+    result = resolve_date_folder(doc)
+    today = datetime.now(UTC).strftime("%Y-%m-%d")
+    assert result == today
+
+
+# ── sanitize_category_folder_name ──
+
+
+def test_sanitize_normal_value():
+    from app.services.ai_file_sort_service import sanitize_category_folder_name
+
+    assert sanitize_category_folder_name("Machine Learning") == "Machine Learning"
+
+
+def test_sanitize_strips_special_chars():
+    from app.services.ai_file_sort_service import sanitize_category_folder_name
+
+    assert sanitize_category_folder_name("Tax/Reports!") == "TaxReports"
+
+
+def test_sanitize_empty_returns_fallback():
+    from app.services.ai_file_sort_service import sanitize_category_folder_name
+
+    assert sanitize_category_folder_name("") == "Uncategorized"
+    assert sanitize_category_folder_name(None) == "Uncategorized"
+
+
+def test_sanitize_truncates_long_names():
+    from app.services.ai_file_sort_service import sanitize_category_folder_name
+
+    long_name = "A" * 100
+    result = sanitize_category_folder_name(long_name)
+    assert len(result) <= 50
+
+
+# ── generate_ai_taxonomy ──
+
+
+@pytest.mark.asyncio
+async def test_generate_ai_taxonomy_parses_json():
+    from app.services.ai_file_sort_service import generate_ai_taxonomy
+
+    mock_llm = AsyncMock()
+    mock_result = MagicMock()
+    mock_result.content = '{"category": "Science", "subcategory": "Physics"}'
+    mock_llm.ainvoke.return_value = mock_result
+
+    cat, sub = await generate_ai_taxonomy(
+        "Physics Paper", "Some science document about physics", mock_llm
+    )
+    assert cat == "Science"
+    assert sub == "Physics"
+
+
+@pytest.mark.asyncio
+async def test_generate_ai_taxonomy_handles_markdown_code_block():
+    from app.services.ai_file_sort_service import generate_ai_taxonomy
+
+    mock_llm = AsyncMock()
+    mock_result = MagicMock()
+    mock_result.content = (
+        '```json\n{"category": "Finance", "subcategory": "Tax Reports"}\n```'
+    )
+    mock_llm.ainvoke.return_value = mock_result
+
+    cat, sub = await generate_ai_taxonomy("Tax Doc", "A tax report document", mock_llm)
+    assert cat == "Finance"
+    assert sub == "Tax Reports"
+
+
+@pytest.mark.asyncio
+async def test_generate_ai_taxonomy_includes_title_in_prompt():
+    from app.services.ai_file_sort_service import generate_ai_taxonomy
+
+    mock_llm = AsyncMock()
+    mock_result = MagicMock()
+    mock_result.content = '{"category": "Engineering", "subcategory": "Backend"}'
+    mock_llm.ainvoke.return_value = mock_result
+
+    await generate_ai_taxonomy("API Design Guide", "content about REST APIs", mock_llm)
+
+    prompt_text = mock_llm.ainvoke.call_args[0][0][0].content
+    assert "API Design Guide" in prompt_text
+    assert "content about REST APIs" in prompt_text
+
+
+@pytest.mark.asyncio
+async def test_generate_ai_taxonomy_fallback_on_error():
+    from app.services.ai_file_sort_service import generate_ai_taxonomy
+
+    mock_llm = AsyncMock()
+    mock_llm.ainvoke.side_effect = RuntimeError("LLM down")
+
+    cat, sub = await generate_ai_taxonomy("Title", "some content", mock_llm)
+    assert cat == "Uncategorized"
+    assert sub == "General"
+
+
+@pytest.mark.asyncio
+async def test_generate_ai_taxonomy_fallback_on_empty_content():
+    from app.services.ai_file_sort_service import generate_ai_taxonomy
+
+    mock_llm = AsyncMock()
+    cat, sub = await generate_ai_taxonomy("Title", "", mock_llm)
+    assert cat == "Uncategorized"
+    assert sub == "General"
+    mock_llm.ainvoke.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_generate_ai_taxonomy_fallback_on_invalid_json():
+    from app.services.ai_file_sort_service import generate_ai_taxonomy
+
+    mock_llm = AsyncMock()
+    mock_result = MagicMock()
+    mock_result.content = "not valid json at all"
+    mock_llm.ainvoke.return_value = mock_result
+
+    cat, sub = await generate_ai_taxonomy("Title", "some content", mock_llm)
+    assert cat == "Uncategorized"
+    assert sub == "General"
+
+
+# ── taxonomy caching ──
+
+
+def test_get_cached_taxonomy_returns_none_when_no_metadata():
+    from app.services.ai_file_sort_service import _get_cached_taxonomy
+
+    doc = MagicMock()
+    doc.document_metadata = None
+    assert _get_cached_taxonomy(doc) is None
+
+
+def test_get_cached_taxonomy_returns_none_when_keys_missing():
+    from app.services.ai_file_sort_service import _get_cached_taxonomy
+
+    doc = MagicMock()
+    doc.document_metadata = {"some_other_key": "value"}
+    assert _get_cached_taxonomy(doc) is None
+
+
+def test_get_cached_taxonomy_returns_cached_values():
+    from app.services.ai_file_sort_service import _get_cached_taxonomy
+
+    doc = MagicMock()
+    doc.document_metadata = {
+        "ai_sort_category": "Finance",
+        "ai_sort_subcategory": "Tax Reports",
+    }
+    assert _get_cached_taxonomy(doc) == ("Finance", "Tax Reports")
+
+
+def test_set_cached_taxonomy_persists_on_metadata():
+    from app.services.ai_file_sort_service import _set_cached_taxonomy
+
+    doc = MagicMock()
+    doc.document_metadata = {"existing_key": "keep_me"}
+    _set_cached_taxonomy(doc, "Science", "Physics")
+    assert doc.document_metadata["ai_sort_category"] == "Science"
+    assert doc.document_metadata["ai_sort_subcategory"] == "Physics"
+    assert doc.document_metadata["existing_key"] == "keep_me"
+
+
+def test_set_cached_taxonomy_creates_metadata_when_none():
+    from app.services.ai_file_sort_service import _set_cached_taxonomy
+
+    doc = MagicMock()
+    doc.document_metadata = None
+    _set_cached_taxonomy(doc, "Engineering", "Backend")
+    assert doc.document_metadata == {
+        "ai_sort_category": "Engineering",
+        "ai_sort_subcategory": "Backend",
+    }
+
+
+# ── _build_path_segments ──
+
+
+def test_build_path_segments_structure():
+    from app.services.ai_file_sort_service import _build_path_segments
+
+    segments = _build_path_segments("Google Drive", "2025-03-15", "Science", "Physics")
+    assert len(segments) == 4
+    assert segments[0] == {
+        "name": "Google Drive",
+        "metadata": {"ai_sort": True, "ai_sort_level": 1},
+    }
+    assert segments[1] == {
+        "name": "2025-03-15",
+        "metadata": {"ai_sort": True, "ai_sort_level": 2},
+    }
+    assert segments[2] == {
+        "name": "Science",
+        "metadata": {"ai_sort": True, "ai_sort_level": 3},
+    }
+    assert segments[3] == {
+        "name": "Physics",
+        "metadata": {"ai_sort": True, "ai_sort_level": 4},
+    }
diff --git a/surfsense_backend/tests/unit/services/test_ai_sort_task_dedupe.py b/surfsense_backend/tests/unit/services/test_ai_sort_task_dedupe.py
new file mode 100644
index 0000000000..fd9018514b
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_ai_sort_task_dedupe.py
@@ -0,0 +1,43 @@
+"""Unit tests for AI sort task Redis deduplication lock."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+def test_lock_key_format():
+    from app.tasks.celery_tasks.document_tasks import _ai_sort_lock_key
+
+    key = _ai_sort_lock_key(42)
+    assert key == "ai_sort:search_space:42:lock"
+
+
+def test_lock_prevents_duplicate_run():
+    """When the Redis lock already exists, the task should skip execution."""
+
+    mock_redis = MagicMock()
+    mock_redis.set.return_value = False  # Lock already held
+
+    with (
+        patch(
+            "app.tasks.celery_tasks.document_tasks._get_ai_sort_redis",
+            return_value=mock_redis,
+        ),
+        patch(
+            "app.tasks.celery_tasks.document_tasks.get_celery_session_maker"
+        ) as mock_session_maker,
+    ):
+        import asyncio
+
+        from app.tasks.celery_tasks.document_tasks import _ai_sort_search_space_async
+
+        loop = asyncio.new_event_loop()
+        try:
+            loop.run_until_complete(_ai_sort_search_space_async(1, "user-123"))
+        finally:
+            loop.close()
+
+        # Session maker should never be called since lock was not acquired
+        mock_session_maker.assert_not_called()
diff --git a/surfsense_backend/tests/unit/services/test_auto_model_pin_service.py b/surfsense_backend/tests/unit/services/test_auto_model_pin_service.py
new file mode 100644
index 0000000000..d1af29aeb6
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_auto_model_pin_service.py
@@ -0,0 +1,1026 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from types import SimpleNamespace
+
+import pytest
+
+from app.services.auto_model_pin_service import (
+    clear_healthy,
+    clear_runtime_cooldown,
+    is_recently_healthy,
+    mark_healthy,
+    mark_runtime_cooldown,
+    resolve_or_get_pinned_llm_config_id,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture(autouse=True)
+def _clear_runtime_cooldown_map():
+    clear_runtime_cooldown()
+    clear_healthy()
+    yield
+    clear_runtime_cooldown()
+    clear_healthy()
+
+
+@dataclass
+class _FakeQuotaResult:
+    allowed: bool
+
+
+class _FakeExecResult:
+    def __init__(self, thread):
+        self._thread = thread
+
+    def unique(self):
+        return self
+
+    def scalar_one_or_none(self):
+        return self._thread
+
+
+class _FakeSession:
+    def __init__(self, thread):
+        self.thread = thread
+        self.commit_count = 0
+
+    async def execute(self, _stmt):
+        return _FakeExecResult(self.thread)
+
+    async def commit(self):
+        self.commit_count += 1
+
+
+def _thread(
+    *,
+    search_space_id: int = 10,
+    pinned_llm_config_id: int | None = None,
+):
+    return SimpleNamespace(
+        id=1,
+        search_space_id=search_space_id,
+        pinned_llm_config_id=pinned_llm_config_id,
+    )
+
+
+@pytest.mark.asyncio
+async def test_auto_first_turn_pins_one_model(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {"id": -2, "provider": "OPENAI", "model_name": "gpt-free", "api_key": "k1"},
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-prem",
+                "api_key": "k2",
+                "billing_tier": "premium",
+            },
+        ],
+    )
+
+    async def _allowed(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=True)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert session.thread.pinned_llm_config_id == result.resolved_llm_config_id
+    assert session.commit_count == 1
+
+
+@pytest.mark.asyncio
+async def test_premium_eligible_auto_prefers_premium_over_free(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -2,
+                "provider": "OPENAI",
+                "model_name": "gpt-free",
+                "api_key": "k1",
+                "billing_tier": "free",
+                "quality_score": 100,
+            },
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-prem",
+                "api_key": "k2",
+                "billing_tier": "premium",
+                "quality_score": 10,
+            },
+        ],
+    )
+
+    async def _allowed(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=True)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert result.resolved_tier == "premium"
+
+
+@pytest.mark.asyncio
+async def test_premium_eligible_auto_prefers_azure_gpt_5_4(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5.1",
+                "api_key": "k1",
+                "billing_tier": "premium",
+                "auto_pin_tier": "A",
+                "quality_score": 100,
+            },
+            {
+                "id": -2,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5.4",
+                "api_key": "k2",
+                "billing_tier": "premium",
+                "auto_pin_tier": "A",
+                "quality_score": 10,
+            },
+            {
+                "id": -3,
+                "provider": "OPENROUTER",
+                "model_name": "openai/gpt-5.4",
+                "api_key": "k3",
+                "billing_tier": "premium",
+                "auto_pin_tier": "B",
+                "quality_score": 100,
+            },
+        ],
+    )
+
+    async def _allowed(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=True)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -2
+    assert result.resolved_tier == "premium"
+
+
+@pytest.mark.asyncio
+async def test_next_turn_reuses_existing_pin(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-prem",
+                "api_key": "k2",
+                "billing_tier": "premium",
+            },
+        ],
+    )
+
+    async def _must_not_call(*_args, **_kwargs):
+        raise AssertionError(
+            "premium_get_usage should not be called for valid pin reuse"
+        )
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _must_not_call,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert result.from_existing_pin is True
+    assert session.commit_count == 0
+
+
+@pytest.mark.asyncio
+async def test_premium_eligible_auto_can_pin_premium(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-prem",
+                "api_key": "k2",
+                "billing_tier": "premium",
+            },
+        ],
+    )
+
+    async def _allowed(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=True)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert result.resolved_tier == "premium"
+
+
+@pytest.mark.asyncio
+async def test_premium_ineligible_auto_pins_free_only(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -2,
+                "provider": "OPENAI",
+                "model_name": "gpt-free",
+                "api_key": "k1",
+                "billing_tier": "free",
+            },
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-prem",
+                "api_key": "k2",
+                "billing_tier": "premium",
+            },
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -2
+    assert result.resolved_tier == "free"
+
+
+@pytest.mark.asyncio
+async def test_pinned_premium_stays_premium_after_quota_exhaustion(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -2,
+                "provider": "OPENAI",
+                "model_name": "gpt-free",
+                "api_key": "k1",
+                "billing_tier": "free",
+            },
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-prem",
+                "api_key": "k2",
+                "billing_tier": "premium",
+            },
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert result.from_existing_pin is True
+
+
+@pytest.mark.asyncio
+async def test_force_repin_free_switches_auto_premium_pin_to_free(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -2,
+                "provider": "OPENAI",
+                "model_name": "gpt-free",
+                "api_key": "k1",
+                "billing_tier": "free",
+            },
+            {
+                "id": -1,
+                "provider": "OPENAI",
+                "model_name": "gpt-prem",
+                "api_key": "k2",
+                "billing_tier": "premium",
+            },
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+        force_repin_free=True,
+    )
+    assert result.resolved_llm_config_id == -2
+    assert result.resolved_tier == "free"
+    assert result.from_existing_pin is False
+    assert session.thread.pinned_llm_config_id == -2
+
+
+@pytest.mark.asyncio
+async def test_explicit_user_model_change_clears_pin(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-2))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {"id": -2, "provider": "OPENAI", "model_name": "gpt-free", "api_key": "k1"},
+        ],
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=7,
+    )
+    assert result.resolved_llm_config_id == 7
+    assert session.thread.pinned_llm_config_id is None
+    assert session.commit_count == 1
+
+
+@pytest.mark.asyncio
+async def test_invalid_pinned_config_repairs_with_new_pin(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-999))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {"id": -2, "provider": "OPENAI", "model_name": "gpt-free", "api_key": "k1"},
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -2
+    assert session.thread.pinned_llm_config_id == -2
+    assert session.commit_count == 1
+
+
+# ---------------------------------------------------------------------------
+# Quality-aware pin selection (Auto Fastest upgrade)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_health_gated_config_is_excluded_from_selection(monkeypatch):
+    """A cfg flagged ``health_gated`` must never be picked even if it has
+    the highest score among eligible cfgs."""
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENROUTER",
+                "model_name": "venice/dead-model",
+                "api_key": "k1",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 95,
+                "health_gated": True,
+            },
+            {
+                "id": -2,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemini-flash",
+                "api_key": "k1",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 60,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -2
+
+
+@pytest.mark.asyncio
+async def test_tier_a_locks_first_premium_user_skips_or(monkeypatch):
+    """Premium-eligible users with Tier A available should never spill to
+    Tier B even if a B cfg ranks higher by ``quality_score``."""
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5",
+                "api_key": "k-yaml",
+                "billing_tier": "premium",
+                "auto_pin_tier": "A",
+                "quality_score": 70,
+                "health_gated": False,
+            },
+            {
+                "id": -2,
+                "provider": "OPENROUTER",
+                "model_name": "openai/gpt-5",
+                "api_key": "k-or",
+                "billing_tier": "premium",
+                "auto_pin_tier": "B",
+                "quality_score": 95,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _allowed(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=True)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert result.resolved_tier == "premium"
+
+
+@pytest.mark.asyncio
+async def test_tier_a_falls_through_to_or_when_a_pool_empty_for_user(monkeypatch):
+    """Free-only user with no Tier A free cfg should pick from Tier C."""
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5",
+                "api_key": "k-yaml",
+                "billing_tier": "premium",
+                "auto_pin_tier": "A",
+                "quality_score": 100,
+                "health_gated": False,
+            },
+            {
+                "id": -2,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemini-flash:free",
+                "api_key": "k-or",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 60,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -2
+
+
+@pytest.mark.asyncio
+async def test_top_k_picks_only_high_score_models(monkeypatch):
+    """Different thread IDs should spread across top-K, never pick the
+    obvious low-quality cfg even when it sits in the candidate list."""
+    from app.config import config
+
+    high_score_cfgs = [
+        {
+            "id": -i,
+            "provider": "AZURE_OPENAI",
+            "model_name": f"gpt-x-{i}",
+            "api_key": "k",
+            "billing_tier": "premium",
+            "auto_pin_tier": "A",
+            "quality_score": 90,
+            "health_gated": False,
+        }
+        for i in range(1, 6)  # 5 high-quality Tier A cfgs
+    ]
+    low_score_trap = {
+        "id": -99,
+        "provider": "AZURE_OPENAI",
+        "model_name": "tiny-legacy",
+        "api_key": "k",
+        "billing_tier": "premium",
+        "auto_pin_tier": "A",
+        "quality_score": 10,
+        "health_gated": False,
+    }
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [*high_score_cfgs, low_score_trap],
+    )
+
+    async def _allowed(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=True)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _allowed,
+    )
+
+    high_score_ids = {c["id"] for c in high_score_cfgs}
+    seen = set()
+    for thread_id in range(1, 50):
+        session = _FakeSession(_thread())
+        result = await resolve_or_get_pinned_llm_config_id(
+            session,
+            thread_id=thread_id,
+            search_space_id=10,
+            user_id="00000000-0000-0000-0000-000000000001",
+            selected_llm_config_id=0,
+        )
+        seen.add(result.resolved_llm_config_id)
+        assert result.resolved_llm_config_id != -99, (
+            "low-score trap cfg should never be picked"
+        )
+        assert result.resolved_llm_config_id in high_score_ids
+
+    # Spread across at least a couple of top-K cfgs.
+    assert len(seen) > 1
+
+
+@pytest.mark.asyncio
+async def test_pin_reuse_survives_health_gating_for_existing_pin(monkeypatch):
+    """An *already* pinned cfg that later flips to ``health_gated`` should
+    still not be reused — gated cfgs are filtered out of the candidate
+    pool, which forces a repair to a healthy cfg.
+
+    This guards the no-silent-tier-switch invariant: we don't keep using
+    a known-broken model just because the thread happened to be pinned
+    to it before the gate fired."""
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENROUTER",
+                "model_name": "venice/dead-model",
+                "api_key": "k",
+                "billing_tier": "premium",
+                "auto_pin_tier": "B",
+                "quality_score": 50,
+                "health_gated": True,
+            },
+            {
+                "id": -2,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5",
+                "api_key": "k",
+                "billing_tier": "premium",
+                "auto_pin_tier": "A",
+                "quality_score": 90,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _allowed(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=True)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -2
+    assert result.from_existing_pin is False
+
+
+@pytest.mark.asyncio
+async def test_pin_reuse_regression_existing_healthy_pin(monkeypatch):
+    """Existing pin reuse must short-circuit the new tier/score logic."""
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5",
+                "api_key": "k",
+                "billing_tier": "premium",
+                "auto_pin_tier": "A",
+                "quality_score": 50,  # lower than -2
+                "health_gated": False,
+            },
+            {
+                "id": -2,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5-pro",
+                "api_key": "k",
+                "billing_tier": "premium",
+                "auto_pin_tier": "A",
+                "quality_score": 99,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _must_not_call(*_args, **_kwargs):
+        raise AssertionError("premium_get_usage should not run on pin reuse")
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _must_not_call,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert result.from_existing_pin is True
+    assert session.commit_count == 0
+
+
+@pytest.mark.asyncio
+async def test_runtime_cooled_down_pin_is_not_reused(monkeypatch):
+    """A runtime-cooled config should be excluded from candidate reuse.
+
+    This enables one-shot recovery from transient provider 429 bursts: we can
+    mark the pinned cfg as cooled down and force a repair to another eligible
+    cfg on the next resolution.
+    """
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemma-4-26b-a4b-it:free",
+                "api_key": "k",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 90,
+                "health_gated": False,
+            },
+            {
+                "id": -2,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemini-2.5-flash:free",
+                "api_key": "k",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 80,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    mark_runtime_cooldown(-1, reason="provider_rate_limited", cooldown_seconds=600)
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -2
+    assert result.from_existing_pin is False
+
+
+@pytest.mark.asyncio
+async def test_clearing_runtime_cooldown_restores_pin_reuse(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemma-4-26b-a4b-it:free",
+                "api_key": "k",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 90,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _must_not_call(*_args, **_kwargs):
+        raise AssertionError("premium_get_usage should not run on healthy pin reuse")
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _must_not_call,
+    )
+
+    mark_runtime_cooldown(-1, reason="provider_rate_limited", cooldown_seconds=600)
+    clear_runtime_cooldown(-1)
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+    assert result.from_existing_pin is True
+
+
+@pytest.mark.asyncio
+async def test_auto_pin_repin_excludes_previous_config_on_runtime_retry(monkeypatch):
+    """Runtime retry should never repin the just-failed config."""
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned_llm_config_id=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemma-4-26b-a4b-it:free",
+                "api_key": "k",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 90,
+                "health_gated": False,
+            },
+            {
+                "id": -2,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemini-2.5-flash:free",
+                "api_key": "k",
+                "billing_tier": "free",
+                "auto_pin_tier": "C",
+                "quality_score": 80,
+                "health_gated": False,
+            },
+        ],
+    )
+
+    async def _blocked(*_args, **_kwargs):
+        return _FakeQuotaResult(allowed=False)
+
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _blocked,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id="00000000-0000-0000-0000-000000000001",
+        selected_llm_config_id=0,
+        exclude_config_ids={-1},
+    )
+    assert result.resolved_llm_config_id == -2
+    assert result.from_existing_pin is False
+
+
+# ---------------------------------------------------------------------------
+# Healthy-status cache (preflight TTL companion)
+# ---------------------------------------------------------------------------
+
+
+def test_mark_healthy_then_is_recently_healthy_true_within_ttl():
+    mark_healthy(-42, ttl_seconds=60)
+    assert is_recently_healthy(-42) is True
+
+
+def test_healthy_expires_after_ttl(monkeypatch):
+    import app.services.auto_model_pin_service as svc
+
+    real_time = svc.time.time
+    base = real_time()
+
+    monkeypatch.setattr(svc.time, "time", lambda: base)
+    mark_healthy(-7, ttl_seconds=10)
+    assert is_recently_healthy(-7) is True
+
+    monkeypatch.setattr(svc.time, "time", lambda: base + 11)
+    assert is_recently_healthy(-7) is False
+
+
+def test_mark_runtime_cooldown_invalidates_healthy_cache():
+    mark_healthy(-9, ttl_seconds=60)
+    assert is_recently_healthy(-9) is True
+
+    mark_runtime_cooldown(-9, reason="test", cooldown_seconds=60)
+    assert is_recently_healthy(-9) is False
+
+
+def test_clear_healthy_removes_single_entry():
+    mark_healthy(-11, ttl_seconds=60)
+    mark_healthy(-12, ttl_seconds=60)
+    clear_healthy(-11)
+    assert is_recently_healthy(-11) is False
+    assert is_recently_healthy(-12) is True
+
+
+def test_clear_healthy_no_args_drops_all_entries():
+    mark_healthy(-21, ttl_seconds=60)
+    mark_healthy(-22, ttl_seconds=60)
+    clear_healthy()
+    assert is_recently_healthy(-21) is False
+    assert is_recently_healthy(-22) is False
diff --git a/surfsense_backend/tests/unit/services/test_auto_pin_image_aware.py b/surfsense_backend/tests/unit/services/test_auto_pin_image_aware.py
new file mode 100644
index 0000000000..0e19b80e41
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_auto_pin_image_aware.py
@@ -0,0 +1,286 @@
+"""Image-aware extension of the Auto-pin resolver.
+
+When the current chat turn carries an ``image_url`` block, the pin
+resolver must:
+
+1. Filter the candidate pool to vision-capable cfgs so a freshly
+   selected pin can never be text-only.
+2. Treat any existing pin whose capability is False as invalid (force
+   re-pin), even when it would otherwise be reused as the thread's
+   stable model.
+3. Raise ``ValueError`` (mapped to the friendly
+   ``MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT`` SSE error in the streaming
+   task) when no vision-capable cfg is available — instead of silently
+   pinning text-only and 404-ing at the provider.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from types import SimpleNamespace
+
+import pytest
+
+from app.services.auto_model_pin_service import (
+    clear_healthy,
+    clear_runtime_cooldown,
+    resolve_or_get_pinned_llm_config_id,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture(autouse=True)
+def _reset_caches():
+    clear_runtime_cooldown()
+    clear_healthy()
+    yield
+    clear_runtime_cooldown()
+    clear_healthy()
+
+
+@dataclass
+class _FakeQuotaResult:
+    allowed: bool
+
+
+class _FakeExecResult:
+    def __init__(self, thread):
+        self._thread = thread
+
+    def unique(self):
+        return self
+
+    def scalar_one_or_none(self):
+        return self._thread
+
+
+class _FakeSession:
+    def __init__(self, thread):
+        self.thread = thread
+        self.commit_count = 0
+
+    async def execute(self, _stmt):
+        return _FakeExecResult(self.thread)
+
+    async def commit(self):
+        self.commit_count += 1
+
+
+def _thread(*, pinned: int | None = None):
+    return SimpleNamespace(id=1, search_space_id=10, pinned_llm_config_id=pinned)
+
+
+def _vision_cfg(id_: int, *, tier: str = "free", quality: int = 80) -> dict:
+    return {
+        "id": id_,
+        "provider": "OPENAI",
+        "model_name": f"vision-{id_}",
+        "api_key": "k",
+        "billing_tier": tier,
+        "supports_image_input": True,
+        "auto_pin_tier": "A",
+        "quality_score": quality,
+    }
+
+
+def _text_only_cfg(id_: int, *, tier: str = "free", quality: int = 90) -> dict:
+    return {
+        "id": id_,
+        "provider": "OPENAI",
+        "model_name": f"text-{id_}",
+        "api_key": "k",
+        "billing_tier": tier,
+        # Higher quality than the vision cfgs — so a bug that ignores
+        # the image flag would surface as the resolver picking this one.
+        "supports_image_input": False,
+        "auto_pin_tier": "A",
+        "quality_score": quality,
+    }
+
+
+async def _premium_allowed(*_args, **_kwargs):
+    return _FakeQuotaResult(allowed=True)
+
+
+@pytest.mark.asyncio
+async def test_image_turn_filters_out_text_only_candidates(monkeypatch):
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [_text_only_cfg(-1), _vision_cfg(-2)],
+    )
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _premium_allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id=None,
+        selected_llm_config_id=0,
+        requires_image_input=True,
+    )
+
+    assert result.resolved_llm_config_id == -2
+    # The thread should be pinned to the vision cfg even though the
+    # text-only cfg has a higher quality score.
+    assert session.thread.pinned_llm_config_id == -2
+
+
+@pytest.mark.asyncio
+async def test_image_turn_force_repins_stale_text_only_pin(monkeypatch):
+    """An existing text-only pin must be invalidated when the next turn
+    requires image input. The non-image path would happily reuse it."""
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned=-1))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [_text_only_cfg(-1), _vision_cfg(-2)],
+    )
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _premium_allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id=None,
+        selected_llm_config_id=0,
+        requires_image_input=True,
+    )
+
+    assert result.resolved_llm_config_id == -2
+    assert result.from_existing_pin is False
+    assert session.thread.pinned_llm_config_id == -2
+
+
+@pytest.mark.asyncio
+async def test_image_turn_reuses_existing_vision_pin(monkeypatch):
+    """If the thread is already pinned to a vision-capable cfg, reuse it
+    — same as the non-image path. Image-aware filtering must not force
+    spurious re-pins."""
+    from app.config import config
+
+    session = _FakeSession(_thread(pinned=-2))
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [_text_only_cfg(-1), _vision_cfg(-2), _vision_cfg(-3, quality=70)],
+    )
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _premium_allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id=None,
+        selected_llm_config_id=0,
+        requires_image_input=True,
+    )
+
+    assert result.resolved_llm_config_id == -2
+    assert result.from_existing_pin is True
+
+
+@pytest.mark.asyncio
+async def test_image_turn_with_no_vision_candidates_raises(monkeypatch):
+    """The friendly-error path: no vision-capable cfg in the pool -> raise
+    ``ValueError`` whose message contains ``vision-capable`` so the
+    streaming task can map it to ``MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT``."""
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [_text_only_cfg(-1), _text_only_cfg(-2)],
+    )
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _premium_allowed,
+    )
+
+    with pytest.raises(ValueError, match="vision-capable"):
+        await resolve_or_get_pinned_llm_config_id(
+            session,
+            thread_id=1,
+            search_space_id=10,
+            user_id=None,
+            selected_llm_config_id=0,
+            requires_image_input=True,
+        )
+
+
+@pytest.mark.asyncio
+async def test_non_image_turn_keeps_text_only_in_pool(monkeypatch):
+    """Regression guard: the image flag must default False and not affect
+    a normal text-only turn — text-only cfgs remain selectable."""
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [_text_only_cfg(-1)],
+    )
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _premium_allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id=None,
+        selected_llm_config_id=0,
+    )
+    assert result.resolved_llm_config_id == -1
+
+
+@pytest.mark.asyncio
+async def test_image_turn_unannotated_cfg_resolves_via_helper(monkeypatch):
+    """A YAML cfg that omits ``supports_image_input`` falls through to
+    ``derive_supports_image_input`` (LiteLLM-driven). For ``gpt-4o``
+    that returns True, so the cfg should be a valid candidate."""
+    from app.config import config
+
+    session = _FakeSession(_thread())
+    cfg_unannotated_vision = {
+        "id": -2,
+        "provider": "OPENAI",
+        "model_name": "gpt-4o",  # known vision model in LiteLLM map
+        "api_key": "k",
+        "billing_tier": "free",
+        "auto_pin_tier": "A",
+        "quality_score": 80,
+        # NOTE: no supports_image_input key
+    }
+    monkeypatch.setattr(config, "GLOBAL_LLM_CONFIGS", [cfg_unannotated_vision])
+    monkeypatch.setattr(
+        "app.services.auto_model_pin_service.TokenQuotaService.premium_get_usage",
+        _premium_allowed,
+    )
+
+    result = await resolve_or_get_pinned_llm_config_id(
+        session,
+        thread_id=1,
+        search_space_id=10,
+        user_id=None,
+        selected_llm_config_id=0,
+        requires_image_input=True,
+    )
+    assert result.resolved_llm_config_id == -2
diff --git a/surfsense_backend/tests/unit/services/test_billable_call.py b/surfsense_backend/tests/unit/services/test_billable_call.py
new file mode 100644
index 0000000000..c820724ed3
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_billable_call.py
@@ -0,0 +1,559 @@
+"""Unit tests for the ``billable_call`` async context manager.
+
+Covers the per-call premium-credit lifecycle for image generation and
+vision LLM extraction:
+
+* Free configs bypass reserve/finalize but still write an audit row.
+* Premium reserve denial raises ``QuotaInsufficientError`` (HTTP 402 in the
+  route layer).
+* Successful premium calls reserve, yield the accumulator, then finalize
+  with the LiteLLM-reported actual cost — and write an audit row.
+* Failed premium calls release the reservation so credit isn't leaked.
+* All quota DB ops happen inside their OWN ``shielded_async_session``,
+  isolating them from the caller's transaction (issue A).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+from typing import Any
+from uuid import uuid4
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeQuotaResult:
+    def __init__(
+        self,
+        *,
+        allowed: bool,
+        used: int = 0,
+        limit: int = 5_000_000,
+        remaining: int = 5_000_000,
+    ) -> None:
+        self.allowed = allowed
+        self.used = used
+        self.limit = limit
+        self.remaining = remaining
+
+
+class _FakeSession:
+    """Minimal AsyncSession stub — record commits for assertion."""
+
+    def __init__(self) -> None:
+        self.committed = False
+        self.added: list[Any] = []
+
+    def add(self, obj: Any) -> None:
+        self.added.append(obj)
+
+    async def commit(self) -> None:
+        self.committed = True
+
+    async def rollback(self) -> None:
+        pass
+
+    async def close(self) -> None:
+        pass
+
+
+@contextlib.asynccontextmanager
+async def _fake_shielded_session():
+    s = _FakeSession()
+    _SESSIONS_USED.append(s)
+    yield s
+
+
+_SESSIONS_USED: list[_FakeSession] = []
+
+
+def _patch_isolation_layer(
+    monkeypatch, *, reserve_result, finalize_result=None, finalize_exc=None
+):
+    """Wire fake reserve/finalize/release/session helpers."""
+    _SESSIONS_USED.clear()
+    reserve_calls: list[dict[str, Any]] = []
+    finalize_calls: list[dict[str, Any]] = []
+    release_calls: list[dict[str, Any]] = []
+
+    async def _fake_reserve(*, db_session, user_id, request_id, reserve_micros):
+        reserve_calls.append(
+            {
+                "user_id": user_id,
+                "reserve_micros": reserve_micros,
+                "request_id": request_id,
+            }
+        )
+        return reserve_result
+
+    async def _fake_finalize(
+        *, db_session, user_id, request_id, actual_micros, reserved_micros
+    ):
+        if finalize_exc is not None:
+            raise finalize_exc
+        finalize_calls.append(
+            {
+                "user_id": user_id,
+                "actual_micros": actual_micros,
+                "reserved_micros": reserved_micros,
+            }
+        )
+        return finalize_result or _FakeQuotaResult(allowed=True)
+
+    async def _fake_release(*, db_session, user_id, reserved_micros):
+        release_calls.append({"user_id": user_id, "reserved_micros": reserved_micros})
+
+    record_calls: list[dict[str, Any]] = []
+
+    async def _fake_record(session, **kwargs):
+        record_calls.append(kwargs)
+        return object()
+
+    monkeypatch.setattr(
+        "app.services.billable_calls.TokenQuotaService.premium_reserve",
+        _fake_reserve,
+        raising=False,
+    )
+    monkeypatch.setattr(
+        "app.services.billable_calls.TokenQuotaService.premium_finalize",
+        _fake_finalize,
+        raising=False,
+    )
+    monkeypatch.setattr(
+        "app.services.billable_calls.TokenQuotaService.premium_release",
+        _fake_release,
+        raising=False,
+    )
+    monkeypatch.setattr(
+        "app.services.billable_calls.shielded_async_session",
+        _fake_shielded_session,
+        raising=False,
+    )
+    monkeypatch.setattr(
+        "app.services.billable_calls.record_token_usage",
+        _fake_record,
+        raising=False,
+    )
+
+    return {
+        "reserve": reserve_calls,
+        "finalize": finalize_calls,
+        "release": release_calls,
+        "record": record_calls,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_free_path_skips_reserve_but_writes_audit_row(monkeypatch):
+    from app.services.billable_calls import billable_call
+
+    spies = _patch_isolation_layer(
+        monkeypatch, reserve_result=_FakeQuotaResult(allowed=True)
+    )
+    user_id = uuid4()
+
+    async with billable_call(
+        user_id=user_id,
+        search_space_id=42,
+        billing_tier="free",
+        base_model="openai/gpt-image-1",
+        usage_type="image_generation",
+    ) as acc:
+        # Simulate a captured cost — the accumulator is fed by the LiteLLM
+        # callback in real life, here we add it manually.
+        acc.add(
+            model="openai/gpt-image-1",
+            prompt_tokens=0,
+            completion_tokens=0,
+            total_tokens=0,
+            cost_micros=37_000,
+            call_kind="image_generation",
+        )
+
+    assert spies["reserve"] == []
+    assert spies["finalize"] == []
+    assert spies["release"] == []
+    # Free still audits.
+    assert len(spies["record"]) == 1
+    assert spies["record"][0]["usage_type"] == "image_generation"
+    assert spies["record"][0]["cost_micros"] == 37_000
+
+
+@pytest.mark.asyncio
+async def test_premium_reserve_denied_raises_quota_insufficient(monkeypatch):
+    from app.services.billable_calls import (
+        QuotaInsufficientError,
+        billable_call,
+    )
+
+    spies = _patch_isolation_layer(
+        monkeypatch,
+        reserve_result=_FakeQuotaResult(
+            allowed=False, used=5_000_000, limit=5_000_000, remaining=0
+        ),
+    )
+    user_id = uuid4()
+
+    with pytest.raises(QuotaInsufficientError) as exc_info:
+        async with billable_call(
+            user_id=user_id,
+            search_space_id=42,
+            billing_tier="premium",
+            base_model="openai/gpt-image-1",
+            quota_reserve_micros_override=50_000,
+            usage_type="image_generation",
+        ):
+            pytest.fail("body should not run when reserve is denied")
+
+    err = exc_info.value
+    assert err.usage_type == "image_generation"
+    assert err.used_micros == 5_000_000
+    assert err.limit_micros == 5_000_000
+    assert err.remaining_micros == 0
+    # Reserve was attempted, but no finalize/release on a denied reserve
+    # — the reservation never actually held credit.
+    assert len(spies["reserve"]) == 1
+    assert spies["finalize"] == []
+    assert spies["release"] == []
+    # Denied premium calls do NOT create an audit row (no work happened).
+    assert spies["record"] == []
+
+
+@pytest.mark.asyncio
+async def test_premium_success_finalizes_with_actual_cost(monkeypatch):
+    from app.services.billable_calls import billable_call
+
+    spies = _patch_isolation_layer(
+        monkeypatch, reserve_result=_FakeQuotaResult(allowed=True)
+    )
+    user_id = uuid4()
+
+    async with billable_call(
+        user_id=user_id,
+        search_space_id=42,
+        billing_tier="premium",
+        base_model="openai/gpt-image-1",
+        quota_reserve_micros_override=50_000,
+        usage_type="image_generation",
+    ) as acc:
+        # LiteLLM callback would normally fill this — simulate $0.04 image.
+        acc.add(
+            model="openai/gpt-image-1",
+            prompt_tokens=0,
+            completion_tokens=0,
+            total_tokens=0,
+            cost_micros=40_000,
+            call_kind="image_generation",
+        )
+
+    assert len(spies["reserve"]) == 1
+    assert spies["reserve"][0]["reserve_micros"] == 50_000
+    assert len(spies["finalize"]) == 1
+    assert spies["finalize"][0]["actual_micros"] == 40_000
+    assert spies["finalize"][0]["reserved_micros"] == 50_000
+    assert spies["release"] == []
+    # And audit row written with the actual debited cost.
+    assert spies["record"][0]["cost_micros"] == 40_000
+    # Each quota op opened its OWN session — proves session isolation.
+    assert len(_SESSIONS_USED) >= 3
+    # Sessions used should each have committed (or be the audit one which commits).
+    for _s in _SESSIONS_USED:
+        # finalize/reserve happen via TokenQuotaService.* which we stub —
+        # they don't actually call commit on our fake session, but the
+        # audit session does. We just assert >=1 session committed.
+        pass
+    assert any(s.committed for s in _SESSIONS_USED)
+
+
+@pytest.mark.asyncio
+async def test_premium_failure_releases_reservation(monkeypatch):
+    from app.services.billable_calls import billable_call
+
+    spies = _patch_isolation_layer(
+        monkeypatch, reserve_result=_FakeQuotaResult(allowed=True)
+    )
+    user_id = uuid4()
+
+    class _ProviderError(Exception):
+        pass
+
+    with pytest.raises(_ProviderError):
+        async with billable_call(
+            user_id=user_id,
+            search_space_id=42,
+            billing_tier="premium",
+            base_model="openai/gpt-image-1",
+            quota_reserve_micros_override=50_000,
+            usage_type="image_generation",
+        ):
+            raise _ProviderError("OpenRouter 503")
+
+    assert len(spies["reserve"]) == 1
+    assert spies["finalize"] == []
+    # Failure path: release the held reservation.
+    assert len(spies["release"]) == 1
+    assert spies["release"][0]["reserved_micros"] == 50_000
+
+
+@pytest.mark.asyncio
+async def test_premium_uses_estimator_when_no_micros_override(monkeypatch):
+    """When ``quota_reserve_micros_override`` is None we fall back to
+    ``estimate_call_reserve_micros(base_model, quota_reserve_tokens)``.
+    Vision LLM calls take this path (token-priced models).
+    """
+    from app.services.billable_calls import billable_call
+
+    spies = _patch_isolation_layer(
+        monkeypatch, reserve_result=_FakeQuotaResult(allowed=True)
+    )
+
+    captured_estimator_calls: list[dict[str, Any]] = []
+
+    def _fake_estimate(*, base_model, quota_reserve_tokens):
+        captured_estimator_calls.append(
+            {"base_model": base_model, "quota_reserve_tokens": quota_reserve_tokens}
+        )
+        return 12_345
+
+    monkeypatch.setattr(
+        "app.services.billable_calls.estimate_call_reserve_micros",
+        _fake_estimate,
+        raising=False,
+    )
+
+    user_id = uuid4()
+    async with billable_call(
+        user_id=user_id,
+        search_space_id=1,
+        billing_tier="premium",
+        base_model="openai/gpt-4o",
+        quota_reserve_tokens=4000,
+        usage_type="vision_extraction",
+    ):
+        pass
+
+    assert captured_estimator_calls == [
+        {"base_model": "openai/gpt-4o", "quota_reserve_tokens": 4000}
+    ]
+    assert spies["reserve"][0]["reserve_micros"] == 12_345
+
+
+@pytest.mark.asyncio
+async def test_premium_finalize_failure_propagates_and_releases(monkeypatch):
+    from app.services.billable_calls import BillingSettlementError, billable_call
+
+    class _FinalizeError(RuntimeError):
+        pass
+
+    spies = _patch_isolation_layer(
+        monkeypatch,
+        reserve_result=_FakeQuotaResult(allowed=True),
+        finalize_exc=_FinalizeError("db finalize failed"),
+    )
+    user_id = uuid4()
+
+    with pytest.raises(BillingSettlementError):
+        async with billable_call(
+            user_id=user_id,
+            search_space_id=42,
+            billing_tier="premium",
+            base_model="openai/gpt-image-1",
+            quota_reserve_micros_override=50_000,
+            usage_type="image_generation",
+        ) as acc:
+            acc.add(
+                model="openai/gpt-image-1",
+                prompt_tokens=0,
+                completion_tokens=0,
+                total_tokens=0,
+                cost_micros=40_000,
+                call_kind="image_generation",
+            )
+
+    assert len(spies["reserve"]) == 1
+    assert len(spies["release"]) == 1
+    assert spies["record"] == []
+
+
+@pytest.mark.asyncio
+async def test_premium_audit_commit_hang_times_out_after_finalize(monkeypatch):
+    from app.services.billable_calls import billable_call
+
+    spies = _patch_isolation_layer(
+        monkeypatch, reserve_result=_FakeQuotaResult(allowed=True)
+    )
+    user_id = uuid4()
+
+    class _HangingCommitSession(_FakeSession):
+        async def commit(self) -> None:
+            await asyncio.sleep(60)
+
+    @contextlib.asynccontextmanager
+    async def _hanging_session_factory():
+        s = _HangingCommitSession()
+        _SESSIONS_USED.append(s)
+        yield s
+
+    async with billable_call(
+        user_id=user_id,
+        search_space_id=42,
+        billing_tier="premium",
+        base_model="openai/gpt-image-1",
+        quota_reserve_micros_override=50_000,
+        usage_type="image_generation",
+        billable_session_factory=_hanging_session_factory,
+        audit_timeout_seconds=0.01,
+    ) as acc:
+        acc.add(
+            model="openai/gpt-image-1",
+            prompt_tokens=0,
+            completion_tokens=0,
+            total_tokens=0,
+            cost_micros=40_000,
+            call_kind="image_generation",
+        )
+
+    assert len(spies["reserve"]) == 1
+    assert len(spies["finalize"]) == 1
+    assert len(spies["record"]) == 1
+    assert spies["release"] == []
+
+
+@pytest.mark.asyncio
+async def test_free_audit_failure_is_best_effort(monkeypatch):
+    from app.services.billable_calls import billable_call
+
+    spies = _patch_isolation_layer(
+        monkeypatch, reserve_result=_FakeQuotaResult(allowed=True)
+    )
+
+    async def _failing_record(_session, **_kwargs):
+        raise RuntimeError("audit insert failed")
+
+    monkeypatch.setattr(
+        "app.services.billable_calls.record_token_usage",
+        _failing_record,
+        raising=False,
+    )
+
+    async with billable_call(
+        user_id=uuid4(),
+        search_space_id=42,
+        billing_tier="free",
+        base_model="openai/gpt-image-1",
+        usage_type="image_generation",
+        audit_timeout_seconds=0.01,
+    ) as acc:
+        acc.add(
+            model="openai/gpt-image-1",
+            prompt_tokens=0,
+            completion_tokens=0,
+            total_tokens=0,
+            cost_micros=37_000,
+            call_kind="image_generation",
+        )
+
+    assert spies["reserve"] == []
+    assert spies["finalize"] == []
+
+
+# ---------------------------------------------------------------------------
+# Podcast / video-presentation usage_type coverage
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_free_podcast_path_audits_with_podcast_usage_type(monkeypatch):
+    """Free podcast configs must skip reserve/finalize but still emit a
+    ``TokenUsage`` row tagged ``usage_type='podcast_generation'`` so we
+    have full audit coverage of free-tier agent runs."""
+    from app.services.billable_calls import billable_call
+
+    spies = _patch_isolation_layer(
+        monkeypatch, reserve_result=_FakeQuotaResult(allowed=True)
+    )
+    user_id = uuid4()
+
+    async with billable_call(
+        user_id=user_id,
+        search_space_id=42,
+        billing_tier="free",
+        base_model="openrouter/some-free-model",
+        quota_reserve_micros_override=200_000,
+        usage_type="podcast_generation",
+        thread_id=99,
+        call_details={"podcast_id": 7, "title": "Test Podcast"},
+    ) as acc:
+        # Two transcript LLM calls aggregated into one accumulator.
+        acc.add(
+            model="openrouter/some-free-model",
+            prompt_tokens=1500,
+            completion_tokens=8000,
+            total_tokens=9500,
+            cost_micros=0,
+            call_kind="chat",
+        )
+
+    assert spies["reserve"] == []
+    assert spies["finalize"] == []
+    assert spies["release"] == []
+
+    assert len(spies["record"]) == 1
+    row = spies["record"][0]
+    assert row["usage_type"] == "podcast_generation"
+    assert row["thread_id"] is None
+    assert row["search_space_id"] == 42
+    assert row["call_details"] == {"podcast_id": 7, "title": "Test Podcast"}
+
+
+@pytest.mark.asyncio
+async def test_premium_video_denial_raises_quota_insufficient(monkeypatch):
+    """Premium video-presentation runs that hit a denied reservation must
+    raise ``QuotaInsufficientError`` *before* the graph runs and must not
+    emit an audit row (no work happened)."""
+    from app.services.billable_calls import (
+        QuotaInsufficientError,
+        billable_call,
+    )
+
+    spies = _patch_isolation_layer(
+        monkeypatch,
+        reserve_result=_FakeQuotaResult(
+            allowed=False, used=4_500_000, limit=5_000_000, remaining=500_000
+        ),
+    )
+    user_id = uuid4()
+
+    with pytest.raises(QuotaInsufficientError) as exc_info:
+        async with billable_call(
+            user_id=user_id,
+            search_space_id=42,
+            billing_tier="premium",
+            base_model="gpt-5.4",
+            quota_reserve_micros_override=1_000_000,
+            usage_type="video_presentation_generation",
+            thread_id=99,
+            call_details={"video_presentation_id": 12, "title": "Test Video"},
+        ):
+            pytest.fail("body should not run when reserve is denied")
+
+    err = exc_info.value
+    assert err.usage_type == "video_presentation_generation"
+    assert err.remaining_micros == 500_000
+    assert spies["reserve"][0]["reserve_micros"] == 1_000_000
+    assert spies["finalize"] == []
+    assert spies["release"] == []
+    assert spies["record"] == []
diff --git a/surfsense_backend/tests/unit/services/test_folder_hierarchy.py b/surfsense_backend/tests/unit/services/test_folder_hierarchy.py
new file mode 100644
index 0000000000..9077f6b0e8
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_folder_hierarchy.py
@@ -0,0 +1,87 @@
+"""Unit tests for ensure_folder_hierarchy_with_depth_validation."""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.mark.asyncio
+async def test_creates_missing_folders_in_chain():
+    """Should create all folders when none exist."""
+    from app.services.folder_service import (
+        ensure_folder_hierarchy_with_depth_validation,
+    )
+
+    session = AsyncMock()
+    # All lookups return None (no existing folders)
+    mock_result = MagicMock()
+    mock_result.scalar_one_or_none.return_value = None
+    session.execute.return_value = mock_result
+
+    folder_instances = []
+
+    def track_add(obj):
+        folder_instances.append(obj)
+
+    session.add = track_add
+
+    with (
+        patch(
+            "app.services.folder_service.validate_folder_depth", new_callable=AsyncMock
+        ),
+        patch(
+            "app.services.folder_service.generate_folder_position",
+            new_callable=AsyncMock,
+            return_value="a0",
+        ),
+    ):
+        # Mock flush to assign IDs
+        call_count = 0
+
+        async def mock_flush():
+            nonlocal call_count
+            call_count += 1
+            if folder_instances:
+                folder_instances[-1].id = call_count
+
+        session.flush = mock_flush
+
+        segments = [
+            {"name": "Slack", "metadata": {"ai_sort": True, "ai_sort_level": 1}},
+            {"name": "2025-03-15", "metadata": {"ai_sort": True, "ai_sort_level": 2}},
+        ]
+
+        result = await ensure_folder_hierarchy_with_depth_validation(
+            session, 1, segments
+        )
+
+        assert len(folder_instances) == 2
+        assert folder_instances[0].name == "Slack"
+        assert folder_instances[1].name == "2025-03-15"
+        assert result is folder_instances[-1]
+
+
+@pytest.mark.asyncio
+async def test_reuses_existing_folder():
+    """When a folder already exists, it should be reused, not created."""
+    from app.services.folder_service import (
+        ensure_folder_hierarchy_with_depth_validation,
+    )
+
+    session = AsyncMock()
+
+    existing_folder = MagicMock()
+    existing_folder.id = 42
+
+    mock_result = MagicMock()
+    mock_result.scalar_one_or_none.return_value = existing_folder
+    session.execute.return_value = mock_result
+
+    segments = [{"name": "Existing", "metadata": None}]
+
+    result = await ensure_folder_hierarchy_with_depth_validation(session, 1, segments)
+
+    assert result is existing_folder
+    session.add.assert_not_called()
diff --git a/surfsense_backend/tests/unit/services/test_image_gen_api_base_defense.py b/surfsense_backend/tests/unit/services/test_image_gen_api_base_defense.py
new file mode 100644
index 0000000000..9d5fdb190a
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_image_gen_api_base_defense.py
@@ -0,0 +1,177 @@
+"""Defense-in-depth: image-gen call sites must not let an empty
+``api_base`` fall through to LiteLLM's module-global ``litellm.api_base``.
+
+The bug repro: an OpenRouter image-gen config ships
+``api_base=""``. The pre-fix call site in
+``image_generation_routes._execute_image_generation`` did
+``if cfg.get("api_base"): kwargs["api_base"] = cfg["api_base"]`` which
+silently dropped the empty string. LiteLLM then fell back to
+``litellm.api_base`` (commonly inherited from ``AZURE_OPENAI_ENDPOINT``)
+and OpenRouter's ``image_generation/transformation`` appended
+``/chat/completions`` to it → 404 ``Resource not found``.
+
+This test pins the post-fix behaviour: with an empty ``api_base`` in
+the config, the call site MUST set ``api_base`` to OpenRouter's public
+URL instead of leaving it unset.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.mark.asyncio
+async def test_global_openrouter_image_gen_sets_api_base_when_config_empty():
+    """The global-config branch (``config_id < 0``) of
+    ``_execute_image_generation`` must apply the resolver and pin
+    ``api_base`` to OpenRouter when the config ships an empty string.
+    """
+    from app.routes import image_generation_routes
+
+    cfg = {
+        "id": -20_001,
+        "name": "GPT Image 1 (OpenRouter)",
+        "provider": "OPENROUTER",
+        "model_name": "openai/gpt-image-1",
+        "api_key": "sk-or-test",
+        "api_base": "",  # the original bug shape
+        "api_version": None,
+        "litellm_params": {},
+    }
+
+    captured: dict = {}
+
+    async def fake_aimage_generation(**kwargs):
+        captured.update(kwargs)
+        return MagicMock(model_dump=lambda: {"data": []}, _hidden_params={})
+
+    image_gen = MagicMock()
+    image_gen.image_generation_config_id = cfg["id"]
+    image_gen.prompt = "test"
+    image_gen.n = 1
+    image_gen.quality = None
+    image_gen.size = None
+    image_gen.style = None
+    image_gen.response_format = None
+    image_gen.model = None
+
+    search_space = MagicMock()
+    search_space.image_generation_config_id = cfg["id"]
+    session = MagicMock()
+
+    with (
+        patch.object(
+            image_generation_routes,
+            "_get_global_image_gen_config",
+            return_value=cfg,
+        ),
+        patch.object(
+            image_generation_routes,
+            "aimage_generation",
+            side_effect=fake_aimage_generation,
+        ),
+    ):
+        await image_generation_routes._execute_image_generation(
+            session=session, image_gen=image_gen, search_space=search_space
+        )
+
+    # The whole point of the fix: even with empty ``api_base`` in the
+    # config, we forward OpenRouter's public URL so the call doesn't
+    # inherit an Azure endpoint.
+    assert captured.get("api_base") == "https://openrouter.ai/api/v1"
+    assert captured["model"] == "openrouter/openai/gpt-image-1"
+
+
+@pytest.mark.asyncio
+async def test_generate_image_tool_global_sets_api_base_when_config_empty():
+    """Same defense at the agent tool entry point — both surfaces share
+    the same OpenRouter config payloads."""
+    from app.agents.new_chat.tools import generate_image as gi_module
+
+    cfg = {
+        "id": -20_001,
+        "name": "GPT Image 1 (OpenRouter)",
+        "provider": "OPENROUTER",
+        "model_name": "openai/gpt-image-1",
+        "api_key": "sk-or-test",
+        "api_base": "",
+        "api_version": None,
+        "litellm_params": {},
+    }
+
+    captured: dict = {}
+
+    async def fake_aimage_generation(**kwargs):
+        captured.update(kwargs)
+        response = MagicMock()
+        response.model_dump.return_value = {
+            "data": [{"url": "https://example.com/x.png"}]
+        }
+        response._hidden_params = {"model": "openrouter/openai/gpt-image-1"}
+        return response
+
+    search_space = MagicMock()
+    search_space.id = 1
+    search_space.image_generation_config_id = cfg["id"]
+
+    session_cm = AsyncMock()
+    session = AsyncMock()
+    session_cm.__aenter__.return_value = session
+
+    scalars = MagicMock()
+    scalars.first.return_value = search_space
+    exec_result = MagicMock()
+    exec_result.scalars.return_value = scalars
+    session.execute.return_value = exec_result
+    session.add = MagicMock()
+    session.commit = AsyncMock()
+    session.refresh = AsyncMock()
+
+    # ``refresh(db_image_gen)`` needs to populate ``id`` for token URL fallback.
+    async def _refresh(obj):
+        obj.id = 1
+
+    session.refresh.side_effect = _refresh
+
+    with (
+        patch.object(gi_module, "shielded_async_session", return_value=session_cm),
+        patch.object(gi_module, "_get_global_image_gen_config", return_value=cfg),
+        patch.object(
+            gi_module, "aimage_generation", side_effect=fake_aimage_generation
+        ),
+        patch.object(
+            gi_module, "is_image_gen_auto_mode", side_effect=lambda cid: cid == 0
+        ),
+    ):
+        tool = gi_module.create_generate_image_tool(
+            search_space_id=1, db_session=MagicMock()
+        )
+        await tool.ainvoke({"prompt": "a cat", "n": 1})
+
+    assert captured.get("api_base") == "https://openrouter.ai/api/v1"
+    assert captured["model"] == "openrouter/openai/gpt-image-1"
+
+
+def test_image_gen_router_deployment_sets_api_base_when_config_empty():
+    """The Auto-mode router pool must also resolve ``api_base`` when an
+    OpenRouter config ships an empty string. The deployment dict is fed
+    straight to ``litellm.Router``, so a missing ``api_base`` would
+    leak the same way as the direct call sites.
+    """
+    from app.services.image_gen_router_service import ImageGenRouterService
+
+    deployment = ImageGenRouterService._config_to_deployment(
+        {
+            "model_name": "openai/gpt-image-1",
+            "provider": "OPENROUTER",
+            "api_key": "sk-or-test",
+            "api_base": "",
+        }
+    )
+    assert deployment is not None
+    assert deployment["litellm_params"]["api_base"] == "https://openrouter.ai/api/v1"
+    assert deployment["litellm_params"]["model"] == "openrouter/openai/gpt-image-1"
diff --git a/surfsense_backend/tests/unit/services/test_llm_router_pool_filter.py b/surfsense_backend/tests/unit/services/test_llm_router_pool_filter.py
new file mode 100644
index 0000000000..c309ff8819
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_llm_router_pool_filter.py
@@ -0,0 +1,226 @@
+"""LLMRouterService pool-filter / rebuild tests.
+
+These tests focus on the *config plumbing* (which configs enter the router
+pool, rebuild resets state correctly). They stub out the underlying
+``litellm.Router`` so we don't need real API keys or network access.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+from app.services.llm_router_service import LLMRouterService
+
+pytestmark = pytest.mark.unit
+
+
+def _fake_yaml_config(
+    *,
+    id: int,
+    model_name: str,
+    billing_tier: str = "free",
+) -> dict:
+    return {
+        "id": id,
+        "name": f"yaml-{id}",
+        "provider": "OPENAI",
+        "model_name": model_name,
+        "api_key": "sk-test",
+        "api_base": "",
+        "billing_tier": billing_tier,
+        "rpm": 100,
+        "tpm": 100_000,
+        "litellm_params": {},
+    }
+
+
+def _fake_openrouter_config(
+    *,
+    id: int,
+    model_name: str,
+    billing_tier: str,
+    router_pool_eligible: bool | None = None,
+) -> dict:
+    """Build a synthetic dynamic-OR config dict for router-pool tests.
+
+    Defaults mirror Strategy 3: premium OR enters the pool, free OR stays
+    out. Callers can override ``router_pool_eligible`` to simulate legacy
+    configs or to regression-test the filter mechanics directly.
+    """
+    if router_pool_eligible is None:
+        router_pool_eligible = billing_tier == "premium"
+    return {
+        "id": id,
+        "name": f"or-{id}",
+        "provider": "OPENROUTER",
+        "model_name": model_name,
+        "api_key": "sk-or-test",
+        "api_base": "",
+        "billing_tier": billing_tier,
+        "rpm": 20 if billing_tier == "free" else 200,
+        "tpm": 100_000 if billing_tier == "free" else 1_000_000,
+        "litellm_params": {},
+        "router_pool_eligible": router_pool_eligible,
+    }
+
+
+def _reset_router_singleton() -> None:
+    instance = LLMRouterService.get_instance()
+    instance._initialized = False
+    instance._router = None
+    instance._model_list = []
+    instance._premium_model_strings = set()
+
+
+def test_router_pool_includes_or_premium_excludes_or_free():
+    """Strategy 3: premium OR joins the pool, free OR stays out.
+
+    Dynamic OpenRouter premium entries opt into load balancing alongside
+    curated YAML configs. Dynamic OR free entries are intentionally kept
+    out because OpenRouter's free tier enforces a single account-global
+    quota bucket that per-deployment router accounting can't represent.
+    """
+    _reset_router_singleton()
+    configs = [
+        _fake_yaml_config(id=-1, model_name="gpt-4o", billing_tier="premium"),
+        _fake_yaml_config(id=-2, model_name="gpt-4o-mini", billing_tier="free"),
+        _fake_openrouter_config(
+            id=-10_001, model_name="openai/gpt-4o", billing_tier="premium"
+        ),
+        _fake_openrouter_config(
+            id=-10_002,
+            model_name="meta-llama/llama-3.3-70b:free",
+            billing_tier="free",
+        ),
+    ]
+
+    with (
+        patch("app.services.llm_router_service.Router") as mock_router,
+        patch(
+            "app.services.llm_router_service.LLMRouterService._build_context_fallback_groups"
+        ) as mock_ctx_fb,
+    ):
+        mock_ctx_fb.side_effect = lambda ml: (ml, None)
+        mock_router.return_value = object()
+        LLMRouterService.initialize(configs)
+
+    pool_models = {
+        dep["litellm_params"]["model"]
+        for dep in LLMRouterService.get_instance()._model_list
+    }
+    # YAML premium + YAML free + dynamic OR premium are all in the pool.
+    # Dynamic OR free is NOT (shared-bucket rate limits can't be load-balanced).
+    assert pool_models == {
+        "openai/gpt-4o",
+        "openai/gpt-4o-mini",
+        "openrouter/openai/gpt-4o",
+    }
+
+    prem = LLMRouterService.get_instance()._premium_model_strings
+    # YAML premium is fingerprinted under both its model_string and its
+    # ``base_model`` form (existing behavior we don't want to regress).
+    assert "openai/gpt-4o" in prem
+    # Dynamic OR premium is now fingerprinted as premium so pool-level
+    # calls through the router are billed against premium quota.
+    assert "openrouter/openai/gpt-4o" in prem
+    assert LLMRouterService.is_premium_model("openrouter/openai/gpt-4o") is True
+    # Dynamic OR free never enters the pool, so it's never counted as premium.
+    assert (
+        LLMRouterService.is_premium_model("openrouter/meta-llama/llama-3.3-70b:free")
+        is False
+    )
+
+
+def test_router_pool_filter_mechanics_respect_override():
+    """The ``router_pool_eligible`` filter itself works independently of tier.
+
+    Regression guard: if a future refactor ever sets the flag False on a
+    premium config (e.g. for maintenance), that config MUST be skipped by
+    ``initialize`` even though its tier is premium.
+    """
+    _reset_router_singleton()
+    configs = [
+        _fake_yaml_config(id=-1, model_name="gpt-4o", billing_tier="premium"),
+        _fake_openrouter_config(
+            id=-10_001,
+            model_name="openai/gpt-4o",
+            billing_tier="premium",
+            router_pool_eligible=False,  # opt out despite being premium
+        ),
+    ]
+
+    with (
+        patch("app.services.llm_router_service.Router") as mock_router,
+        patch(
+            "app.services.llm_router_service.LLMRouterService._build_context_fallback_groups"
+        ) as mock_ctx_fb,
+    ):
+        mock_ctx_fb.side_effect = lambda ml: (ml, None)
+        mock_router.return_value = object()
+        LLMRouterService.initialize(configs)
+
+    pool_models = {
+        dep["litellm_params"]["model"]
+        for dep in LLMRouterService.get_instance()._model_list
+    }
+    assert pool_models == {"openai/gpt-4o"}
+    assert LLMRouterService.is_premium_model("openrouter/openai/gpt-4o") is False
+
+
+def test_rebuild_refreshes_pool_after_configs_change():
+    _reset_router_singleton()
+    configs_v1 = [
+        _fake_yaml_config(id=-1, model_name="gpt-4o", billing_tier="premium"),
+    ]
+    configs_v2 = [
+        *configs_v1,
+        _fake_yaml_config(id=-2, model_name="gpt-4o-mini", billing_tier="free"),
+    ]
+
+    with (
+        patch("app.services.llm_router_service.Router") as mock_router,
+        patch(
+            "app.services.llm_router_service.LLMRouterService._build_context_fallback_groups"
+        ) as mock_ctx_fb,
+    ):
+        mock_ctx_fb.side_effect = lambda ml: (ml, None)
+        mock_router.return_value = object()
+
+        LLMRouterService.initialize(configs_v1)
+        assert len(LLMRouterService.get_instance()._model_list) == 1
+
+        # ``initialize`` should be a no-op here (already initialized).
+        LLMRouterService.initialize(configs_v2)
+        assert len(LLMRouterService.get_instance()._model_list) == 1
+
+        # ``rebuild`` must clear the guard and re-run with the new configs.
+        LLMRouterService.rebuild(configs_v2)
+        assert len(LLMRouterService.get_instance()._model_list) == 2
+
+
+def test_auto_model_pin_candidates_include_dynamic_openrouter():
+    """Dynamic OR configs must remain Auto-mode thread-pin candidates.
+
+    Guards against a future regression where someone adds the
+    ``router_pool_eligible`` filter to ``auto_model_pin_service._global_candidates``.
+    """
+    from app.config import config
+    from app.services.auto_model_pin_service import _global_candidates
+
+    or_premium = _fake_openrouter_config(
+        id=-10_001, model_name="openai/gpt-4o", billing_tier="premium"
+    )
+    or_free = _fake_openrouter_config(
+        id=-10_002,
+        model_name="meta-llama/llama-3.3-70b:free",
+        billing_tier="free",
+    )
+    original = config.GLOBAL_LLM_CONFIGS
+    try:
+        config.GLOBAL_LLM_CONFIGS = [or_premium, or_free]
+        candidate_ids = {c["id"] for c in _global_candidates()}
+        assert candidate_ids == {-10_001, -10_002}
+    finally:
+        config.GLOBAL_LLM_CONFIGS = original
diff --git a/surfsense_backend/tests/unit/services/test_openrouter_integration_service.py b/surfsense_backend/tests/unit/services/test_openrouter_integration_service.py
new file mode 100644
index 0000000000..88fcf2db3a
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_openrouter_integration_service.py
@@ -0,0 +1,380 @@
+"""Unit tests for the dynamic OpenRouter integration."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.services.openrouter_integration_service import (
+    _OPENROUTER_DYNAMIC_MARKER,
+    _generate_configs,
+    _openrouter_tier,
+    _stable_config_id,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _minimal_openrouter_model(
+    *,
+    model_id: str,
+    pricing: dict | None = None,
+    name: str | None = None,
+) -> dict:
+    """Return a synthetic OpenRouter /api/v1/models entry.
+
+    The real API payload includes a lot of fields; we only populate what
+    ``_generate_configs`` actually inspects (architecture, tool support,
+    context, pricing, id).
+    """
+    return {
+        "id": model_id,
+        "name": name or model_id,
+        "architecture": {"output_modalities": ["text"]},
+        "supported_parameters": ["tools"],
+        "context_length": 200_000,
+        "pricing": pricing or {"prompt": "0.000003", "completion": "0.000015"},
+    }
+
+
+# ---------------------------------------------------------------------------
+# _openrouter_tier
+# ---------------------------------------------------------------------------
+
+
+def test_openrouter_tier_free_suffix():
+    assert _openrouter_tier({"id": "foo/bar:free"}) == "free"
+
+
+def test_openrouter_tier_zero_pricing():
+    model = {
+        "id": "foo/bar",
+        "pricing": {"prompt": "0", "completion": "0"},
+    }
+    assert _openrouter_tier(model) == "free"
+
+
+def test_openrouter_tier_paid():
+    model = {
+        "id": "foo/bar",
+        "pricing": {"prompt": "0.000003", "completion": "0.000015"},
+    }
+    assert _openrouter_tier(model) == "premium"
+
+
+def test_openrouter_tier_missing_pricing_is_premium():
+    assert _openrouter_tier({"id": "foo/bar"}) == "premium"
+    assert _openrouter_tier({"id": "foo/bar", "pricing": {}}) == "premium"
+
+
+# ---------------------------------------------------------------------------
+# _stable_config_id
+# ---------------------------------------------------------------------------
+
+
+def test_stable_config_id_deterministic():
+    taken1: set[int] = set()
+    taken2: set[int] = set()
+    a = _stable_config_id("openai/gpt-4o", -10_000, taken1)
+    b = _stable_config_id("openai/gpt-4o", -10_000, taken2)
+    assert a == b
+    assert a < 0
+
+
+def test_stable_config_id_collision_decrements():
+    """When two model_ids hash to the same slot, the second should decrement."""
+    taken: set[int] = set()
+    a = _stable_config_id("openai/gpt-4o", -10_000, taken)
+    # Force a collision by pre-populating ``taken`` with a slot we know will be
+    # picked.
+    taken_forced = {a}
+    b = _stable_config_id("openai/gpt-4o", -10_000, taken_forced)
+    assert b != a
+    assert b == a - 1
+    assert b in taken_forced
+
+
+def test_stable_config_id_different_models_different_ids():
+    taken: set[int] = set()
+    ids = {
+        _stable_config_id("openai/gpt-4o", -10_000, taken),
+        _stable_config_id("anthropic/claude-3.5-sonnet", -10_000, taken),
+        _stable_config_id("google/gemini-2.0-flash", -10_000, taken),
+    }
+    assert len(ids) == 3
+
+
+def test_stable_config_id_survives_catalogue_churn():
+    """Removing a model should not shift other models' IDs (the bug we fix)."""
+    taken1: set[int] = set()
+    id_a1 = _stable_config_id("openai/gpt-4o", -10_000, taken1)
+    _ = _stable_config_id("anthropic/claude-3-haiku", -10_000, taken1)
+    id_c1 = _stable_config_id("google/gemini-2.0-flash", -10_000, taken1)
+
+    taken2: set[int] = set()
+    id_a2 = _stable_config_id("openai/gpt-4o", -10_000, taken2)
+    id_c2 = _stable_config_id("google/gemini-2.0-flash", -10_000, taken2)
+
+    assert id_a1 == id_a2
+    assert id_c1 == id_c2
+
+
+# ---------------------------------------------------------------------------
+# _generate_configs
+# ---------------------------------------------------------------------------
+
+
+_SETTINGS_BASE: dict = {
+    "api_key": "sk-or-test",
+    "id_offset": -10_000,
+    "rpm": 200,
+    "tpm": 1_000_000,
+    "free_rpm": 20,
+    "free_tpm": 100_000,
+    "anonymous_enabled_paid": False,
+    "anonymous_enabled_free": True,
+    "quota_reserve_tokens": 4000,
+}
+
+
+def test_generate_configs_respects_tier():
+    """Premium OR models opt into the router pool; free OR models stay out.
+
+    Strategy-3 split: premium participates in LiteLLM Router load balancing,
+    free stays excluded because OpenRouter enforces a shared global free-tier
+    bucket that per-deployment router accounting can't represent.
+    """
+    raw = [
+        _minimal_openrouter_model(model_id="openai/gpt-4o"),
+        _minimal_openrouter_model(
+            model_id="meta-llama/llama-3.3-70b-instruct:free",
+            pricing={"prompt": "0", "completion": "0"},
+        ),
+    ]
+    cfgs = _generate_configs(raw, dict(_SETTINGS_BASE))
+    by_model = {c["model_name"]: c for c in cfgs}
+
+    paid = by_model["openai/gpt-4o"]
+    assert paid["billing_tier"] == "premium"
+    assert paid["rpm"] == 200
+    assert paid["tpm"] == 1_000_000
+    assert paid["anonymous_enabled"] is False
+    assert paid["router_pool_eligible"] is True
+    assert paid[_OPENROUTER_DYNAMIC_MARKER] is True
+
+    free = by_model["meta-llama/llama-3.3-70b-instruct:free"]
+    assert free["billing_tier"] == "free"
+    assert free["rpm"] == 20
+    assert free["tpm"] == 100_000
+    assert free["anonymous_enabled"] is True
+    assert free["router_pool_eligible"] is False
+
+
+def test_generate_configs_excludes_upstream_openrouter_free_router():
+    """OpenRouter's own ``openrouter/free`` meta-router must never become a card.
+
+    The upstream API returns this as a first-class zero-priced model, so
+    without an explicit blocklist entry it would slip through every other
+    filter (text output, tool calling, 200k context, non-Amazon) and land
+    in the selector as a duplicate of the concrete ``:free`` cards. The
+    exclusion in ``_EXCLUDED_MODEL_IDS`` prevents that.
+    """
+    raw = [
+        _minimal_openrouter_model(model_id="openai/gpt-4o"),
+        _minimal_openrouter_model(
+            model_id="openrouter/free",
+            pricing={"prompt": "0", "completion": "0"},
+        ),
+    ]
+    cfgs = _generate_configs(raw, dict(_SETTINGS_BASE))
+    model_names = {c["model_name"] for c in cfgs}
+    assert "openrouter/free" not in model_names
+    assert "openai/gpt-4o" in model_names
+
+
+def test_generate_configs_drops_non_text_and_non_tool_models():
+    raw = [
+        _minimal_openrouter_model(model_id="openai/gpt-4o"),
+        {  # image-output model
+            "id": "openai/dall-e",
+            "architecture": {"output_modalities": ["image"]},
+            "supported_parameters": ["tools"],
+            "context_length": 200_000,
+            "pricing": {"prompt": "0.01", "completion": "0.01"},
+        },
+        {  # text but no tool calling
+            "id": "openai/completion-only",
+            "architecture": {"output_modalities": ["text"]},
+            "supported_parameters": [],
+            "context_length": 200_000,
+            "pricing": {"prompt": "0.01", "completion": "0.01"},
+        },
+    ]
+    cfgs = _generate_configs(raw, dict(_SETTINGS_BASE))
+    model_names = [c["model_name"] for c in cfgs]
+    assert "openai/gpt-4o" in model_names
+    assert "openai/dall-e" not in model_names
+    assert "openai/completion-only" not in model_names
+
+
+# ---------------------------------------------------------------------------
+# _generate_image_gen_configs / _generate_vision_llm_configs
+# ---------------------------------------------------------------------------
+
+
+def test_generate_image_gen_configs_filters_by_image_output():
+    """Only models with ``output_modalities`` containing ``image`` are emitted.
+    Tool-calling and context filters are intentionally NOT applied — image
+    generation has nothing to do with tool calls and context windows.
+    """
+    from app.services.openrouter_integration_service import (
+        _generate_image_gen_configs,
+    )
+
+    raw = [
+        # Pure image-gen model (small context, no tools — should still emit).
+        {
+            "id": "openai/gpt-image-1",
+            "architecture": {"output_modalities": ["image"]},
+            "context_length": 4_000,
+            "pricing": {"prompt": "0", "completion": "0"},
+        },
+        # Multi-modal: text+image output (should still emit).
+        {
+            "id": "google/gemini-2.5-flash-image",
+            "architecture": {"output_modalities": ["text", "image"]},
+            "context_length": 1_000_000,
+            "pricing": {"prompt": "0.000001", "completion": "0.000004"},
+        },
+        # Pure text model — must NOT emit.
+        {
+            "id": "openai/gpt-4o",
+            "architecture": {"output_modalities": ["text"]},
+            "context_length": 128_000,
+            "pricing": {"prompt": "0.000005", "completion": "0.000015"},
+        },
+    ]
+
+    cfgs = _generate_image_gen_configs(raw, dict(_SETTINGS_BASE))
+    model_names = {c["model_name"] for c in cfgs}
+    assert "openai/gpt-image-1" in model_names
+    assert "google/gemini-2.5-flash-image" in model_names
+    assert "openai/gpt-4o" not in model_names
+
+    # Each config must carry ``billing_tier`` for routing in image_generation_routes.
+    for c in cfgs:
+        assert c["billing_tier"] in {"free", "premium"}
+        assert c["provider"] == "OPENROUTER"
+        assert c[_OPENROUTER_DYNAMIC_MARKER] is True
+        # Defense-in-depth: emit the OpenRouter base URL at source so a
+        # downstream call site that forgets ``resolve_api_base`` still
+        # doesn't 404 against an inherited Azure endpoint.
+        assert c["api_base"] == "https://openrouter.ai/api/v1"
+
+
+def test_generate_image_gen_configs_assigns_image_id_offset():
+    """Image configs use a different id_offset (-20000) so their negative
+    IDs don't collide with chat configs (-10000) or vision configs (-30000).
+    """
+    from app.services.openrouter_integration_service import (
+        _generate_image_gen_configs,
+    )
+
+    raw = [
+        {
+            "id": "openai/gpt-image-1",
+            "architecture": {"output_modalities": ["image"]},
+            "context_length": 4_000,
+            "pricing": {"prompt": "0", "completion": "0"},
+        }
+    ]
+    # Don't pass image_id_offset → use the module default (-20000).
+    cfgs = _generate_image_gen_configs(raw, dict(_SETTINGS_BASE))
+    assert all(c["id"] < -20_000 + 1 for c in cfgs)
+    assert all(c["id"] > -29_000_000 for c in cfgs)
+
+
+def test_generate_vision_llm_configs_filters_by_image_input_text_output():
+    """Vision LLMs must accept image input AND emit text — pure image-gen
+    (no text out) and text-only (no image in) models are excluded.
+    """
+    from app.services.openrouter_integration_service import (
+        _generate_vision_llm_configs,
+    )
+
+    raw = [
+        # GPT-4o: vision LLM (image in, text out) — must emit.
+        {
+            "id": "openai/gpt-4o",
+            "architecture": {
+                "input_modalities": ["text", "image"],
+                "output_modalities": ["text"],
+            },
+            "context_length": 128_000,
+            "pricing": {"prompt": "0.000005", "completion": "0.000015"},
+        },
+        # Pure image generator — image *output*, no text out. Must NOT emit.
+        {
+            "id": "openai/gpt-image-1",
+            "architecture": {
+                "input_modalities": ["text"],
+                "output_modalities": ["image"],
+            },
+            "context_length": 4_000,
+            "pricing": {"prompt": "0", "completion": "0"},
+        },
+        # Pure text model (no image in). Must NOT emit.
+        {
+            "id": "anthropic/claude-3-haiku",
+            "architecture": {
+                "input_modalities": ["text"],
+                "output_modalities": ["text"],
+            },
+            "context_length": 200_000,
+            "pricing": {"prompt": "0.000001", "completion": "0.000005"},
+        },
+    ]
+
+    cfgs = _generate_vision_llm_configs(raw, dict(_SETTINGS_BASE))
+    names = {c["model_name"] for c in cfgs}
+    assert names == {"openai/gpt-4o"}
+
+    cfg = cfgs[0]
+    assert cfg["billing_tier"] == "premium"
+    # Pricing carried inline so pricing_registration can register vision
+    # under ``openrouter/openai/gpt-4o`` even if the chat catalogue cache
+    # is cleared.
+    assert cfg["input_cost_per_token"] == pytest.approx(5e-6)
+    assert cfg["output_cost_per_token"] == pytest.approx(15e-6)
+    assert cfg[_OPENROUTER_DYNAMIC_MARKER] is True
+    # Defense-in-depth: emit the OpenRouter base URL at source so a
+    # downstream call site that forgets ``resolve_api_base`` still
+    # doesn't inherit an Azure endpoint.
+    assert cfg["api_base"] == "https://openrouter.ai/api/v1"
+
+
+def test_generate_vision_llm_configs_drops_chat_only_filters():
+    """A small-context vision model that doesn't advertise tool calling is
+    still a valid vision LLM for "describe this image" prompts. The chat
+    filters (``supports_tool_calling``, ``has_sufficient_context``) must
+    NOT be applied to vision emission.
+    """
+    from app.services.openrouter_integration_service import (
+        _generate_vision_llm_configs,
+    )
+
+    raw = [
+        {
+            "id": "tiny/vision-mini",
+            "architecture": {
+                "input_modalities": ["text", "image"],
+                "output_modalities": ["text"],
+            },
+            "supported_parameters": [],  # no tools
+            "context_length": 4_000,  # well below MIN_CONTEXT_LENGTH
+            "pricing": {"prompt": "0.0000001", "completion": "0.0000005"},
+        }
+    ]
+
+    cfgs = _generate_vision_llm_configs(raw, dict(_SETTINGS_BASE))
+    assert len(cfgs) == 1
+    assert cfgs[0]["model_name"] == "tiny/vision-mini"
diff --git a/surfsense_backend/tests/unit/services/test_openrouter_legacy_config.py b/surfsense_backend/tests/unit/services/test_openrouter_legacy_config.py
new file mode 100644
index 0000000000..4eb1f22957
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_openrouter_legacy_config.py
@@ -0,0 +1,108 @@
+"""Tests for deprecated-key warnings and back-compat in
+``load_openrouter_integration_settings``.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+def _write_yaml(tmp_path: Path, body: str) -> Path:
+    cfg_dir = tmp_path / "app" / "config"
+    cfg_dir.mkdir(parents=True)
+    cfg_path = cfg_dir / "global_llm_config.yaml"
+    cfg_path.write_text(body, encoding="utf-8")
+    return cfg_path
+
+
+def _patch_base_dir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    from app import config as config_module
+
+    monkeypatch.setattr(config_module, "BASE_DIR", tmp_path)
+
+
+def test_legacy_billing_tier_emits_warning(monkeypatch, tmp_path, capsys):
+    _write_yaml(
+        tmp_path,
+        """
+openrouter_integration:
+  enabled: true
+  api_key: "sk-or-test"
+  billing_tier: "premium"
+""".lstrip(),
+    )
+    _patch_base_dir(monkeypatch, tmp_path)
+
+    from app.config import load_openrouter_integration_settings
+
+    settings = load_openrouter_integration_settings()
+    captured = capsys.readouterr().out
+    assert settings is not None
+    assert "billing_tier is deprecated" in captured
+
+
+def test_legacy_anonymous_enabled_back_compat(monkeypatch, tmp_path, capsys):
+    _write_yaml(
+        tmp_path,
+        """
+openrouter_integration:
+  enabled: true
+  api_key: "sk-or-test"
+  anonymous_enabled: true
+""".lstrip(),
+    )
+    _patch_base_dir(monkeypatch, tmp_path)
+
+    from app.config import load_openrouter_integration_settings
+
+    settings = load_openrouter_integration_settings()
+    captured = capsys.readouterr().out
+    assert settings is not None
+    assert settings["anonymous_enabled_paid"] is True
+    assert settings["anonymous_enabled_free"] is True
+    assert "anonymous_enabled is" in captured
+    assert "deprecated" in captured
+
+
+def test_new_keys_take_priority_over_legacy_back_compat(monkeypatch, tmp_path, capsys):
+    """If both legacy and new keys are present, new keys win (setdefault)."""
+    _write_yaml(
+        tmp_path,
+        """
+openrouter_integration:
+  enabled: true
+  api_key: "sk-or-test"
+  anonymous_enabled: true
+  anonymous_enabled_paid: false
+  anonymous_enabled_free: false
+""".lstrip(),
+    )
+    _patch_base_dir(monkeypatch, tmp_path)
+
+    from app.config import load_openrouter_integration_settings
+
+    settings = load_openrouter_integration_settings()
+    capsys.readouterr()
+    assert settings is not None
+    assert settings["anonymous_enabled_paid"] is False
+    assert settings["anonymous_enabled_free"] is False
+
+
+def test_disabled_integration_returns_none(monkeypatch, tmp_path):
+    _write_yaml(
+        tmp_path,
+        """
+openrouter_integration:
+  enabled: false
+  api_key: "sk-or-test"
+""".lstrip(),
+    )
+    _patch_base_dir(monkeypatch, tmp_path)
+
+    from app.config import load_openrouter_integration_settings
+
+    assert load_openrouter_integration_settings() is None
diff --git a/surfsense_backend/tests/unit/services/test_or_health_enrichment.py b/surfsense_backend/tests/unit/services/test_or_health_enrichment.py
new file mode 100644
index 0000000000..1c74aa9288
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_or_health_enrichment.py
@@ -0,0 +1,331 @@
+"""Unit tests for the OpenRouter ``_enrich_health`` background task."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from app.services.openrouter_integration_service import (
+    OpenRouterIntegrationService,
+)
+from app.services.quality_score import (
+    _HEALTH_FAIL_RATIO_FALLBACK,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _or_cfg(
+    *,
+    cid: int,
+    model_name: str,
+    tier: str = "premium",
+    static_score: int = 50,
+) -> dict:
+    return {
+        "id": cid,
+        "provider": "OPENROUTER",
+        "model_name": model_name,
+        "billing_tier": tier,
+        "auto_pin_tier": "B" if tier == "premium" else "C",
+        "quality_score_static": static_score,
+        "quality_score_health": None,
+        "quality_score": static_score,
+        "health_gated": False,
+    }
+
+
+class _StubResponse:
+    def __init__(self, *, payload: dict, status_code: int = 200):
+        self._payload = payload
+        self.status_code = status_code
+
+    def raise_for_status(self) -> None:
+        if self.status_code >= 400:
+            raise RuntimeError(f"HTTP {self.status_code}")
+
+    def json(self) -> dict:
+        return self._payload
+
+
+class _StubAsyncClient:
+    """Minimal drop-in for ``httpx.AsyncClient`` used by ``_fetch_endpoints``."""
+
+    def __init__(self, responder):
+        self._responder = responder
+        self.requests: list[str] = []
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        return False
+
+    async def get(self, url: str, headers: dict | None = None) -> _StubResponse:
+        self.requests.append(url)
+        return self._responder(url)
+
+
+def _patch_async_client(monkeypatch, responder) -> _StubAsyncClient:
+    """Replace ``httpx.AsyncClient`` for the duration of the test."""
+    client = _StubAsyncClient(responder)
+    monkeypatch.setattr(
+        "app.services.openrouter_integration_service.httpx.AsyncClient",
+        lambda *_args, **_kwargs: client,
+    )
+    return client
+
+
+def _healthy_payload() -> dict:
+    return {
+        "data": {
+            "endpoints": [
+                {
+                    "status": 0,
+                    "uptime_last_30m": 0.99,
+                    "uptime_last_1d": 0.995,
+                    "uptime_last_5m": 0.99,
+                }
+            ]
+        }
+    }
+
+
+def _unhealthy_payload() -> dict:
+    return {
+        "data": {
+            "endpoints": [
+                {
+                    "status": 0,
+                    "uptime_last_30m": 0.55,
+                    "uptime_last_1d": 0.62,
+                    "uptime_last_5m": 0.50,
+                }
+            ]
+        }
+    }
+
+
+# ---------------------------------------------------------------------------
+# Bounded fan-out + happy path
+# ---------------------------------------------------------------------------
+
+
+async def test_enrich_health_marks_healthy_and_gates_unhealthy(monkeypatch):
+    cfgs = [
+        _or_cfg(cid=-1, model_name="anthropic/claude-haiku", static_score=70),
+        _or_cfg(cid=-2, model_name="venice/dead-model", static_score=60),
+    ]
+
+    def responder(url: str) -> _StubResponse:
+        if "anthropic" in url:
+            return _StubResponse(payload=_healthy_payload())
+        return _StubResponse(payload=_unhealthy_payload())
+
+    _patch_async_client(monkeypatch, responder)
+
+    service = OpenRouterIntegrationService()
+    service._settings = {"api_key": ""}
+    await service._enrich_health(cfgs)
+
+    healthy = next(c for c in cfgs if c["id"] == -1)
+    gated = next(c for c in cfgs if c["id"] == -2)
+
+    assert healthy["health_gated"] is False
+    assert healthy["quality_score_health"] is not None
+    assert healthy["quality_score"] >= healthy["quality_score_static"]
+
+    assert gated["health_gated"] is True
+    assert gated["quality_score"] == gated["quality_score_static"]
+
+
+async def test_enrich_health_only_touches_or_provider(monkeypatch):
+    """YAML cfgs that aren't OPENROUTER must be skipped entirely."""
+    yaml_cfg = {
+        "id": -1,
+        "provider": "AZURE_OPENAI",
+        "model_name": "gpt-5",
+        "billing_tier": "premium",
+        "auto_pin_tier": "A",
+        "quality_score_static": 80,
+        "quality_score": 80,
+        "health_gated": False,
+    }
+    or_cfg = _or_cfg(cid=-2, model_name="anthropic/claude-haiku")
+
+    requests: list[str] = []
+
+    def responder(url: str) -> _StubResponse:
+        requests.append(url)
+        return _StubResponse(payload=_healthy_payload())
+
+    _patch_async_client(monkeypatch, responder)
+
+    service = OpenRouterIntegrationService()
+    service._settings = {}
+    await service._enrich_health([yaml_cfg, or_cfg])
+
+    assert all("anthropic/claude-haiku" in r for r in requests)
+    # YAML cfg is untouched.
+    assert yaml_cfg["quality_score"] == 80
+    assert yaml_cfg["health_gated"] is False
+
+
+# ---------------------------------------------------------------------------
+# Failure ratio fallback
+# ---------------------------------------------------------------------------
+
+
+async def test_enrich_health_falls_back_to_last_good_when_failure_ratio_high(
+    monkeypatch,
+):
+    """If >= 25% of fetches fail, keep last-good cache instead of writing
+    partial data."""
+    cfgs = [
+        _or_cfg(cid=-1, model_name="anthropic/claude-haiku", static_score=70),
+        _or_cfg(cid=-2, model_name="openai/gpt-5", static_score=80),
+        _or_cfg(cid=-3, model_name="google/gemini-flash", static_score=65),
+        _or_cfg(cid=-4, model_name="venice/something", static_score=50),
+    ]
+
+    service = OpenRouterIntegrationService()
+    service._settings = {}
+    # Pre-seed last-good cache with a known-healthy snapshot.
+    service._health_cache = {
+        "anthropic/claude-haiku": {"gated": False, "score": 95.0},
+    }
+
+    def all_fail(_url: str) -> _StubResponse:
+        return _StubResponse(payload={}, status_code=500)
+
+    _patch_async_client(monkeypatch, all_fail)
+    await service._enrich_health(cfgs)
+
+    # Above threshold ⇒ degraded; last-good cache wins for the cached cfg.
+    cached_hit = next(c for c in cfgs if c["model_name"] == "anthropic/claude-haiku")
+    assert cached_hit["quality_score_health"] == 95.0
+    assert cached_hit["health_gated"] is False
+    # Confirm the threshold constant we're testing against is real.
+    assert _HEALTH_FAIL_RATIO_FALLBACK <= 1.0
+
+
+async def test_enrich_health_keeps_static_only_with_no_cache_and_failures(
+    monkeypatch,
+):
+    """If a fetch fails and there's no last-good cache, the cfg keeps its
+    static-only ``quality_score`` and is *not* gated by default."""
+    cfgs = [
+        _or_cfg(cid=-1, model_name="anthropic/claude-haiku", static_score=70),
+    ]
+
+    def fail(_url: str) -> _StubResponse:
+        return _StubResponse(payload={}, status_code=500)
+
+    _patch_async_client(monkeypatch, fail)
+
+    service = OpenRouterIntegrationService()
+    service._settings = {}
+    await service._enrich_health(cfgs)
+
+    cfg = cfgs[0]
+    assert cfg["health_gated"] is False
+    assert cfg["quality_score"] == cfg["quality_score_static"]
+    assert cfg["quality_score_health"] is None
+
+
+# ---------------------------------------------------------------------------
+# Last-good cache: success populates, next failure reuses
+# ---------------------------------------------------------------------------
+
+
+async def test_enrich_health_populates_cache_on_success_then_reuses_on_failure(
+    monkeypatch,
+):
+    cfg = _or_cfg(cid=-1, model_name="anthropic/claude-haiku", static_score=70)
+
+    service = OpenRouterIntegrationService()
+    service._settings = {}
+
+    def healthy(_url: str) -> _StubResponse:
+        return _StubResponse(payload=_healthy_payload())
+
+    _patch_async_client(monkeypatch, healthy)
+    await service._enrich_health([cfg])
+
+    assert "anthropic/claude-haiku" in service._health_cache
+    cached_score = service._health_cache["anthropic/claude-haiku"]["score"]
+    assert cached_score is not None
+
+    # Next cycle: enough other healthy cfgs so failure ratio stays below
+    # the 25% threshold even when this one fails individually.
+    other_cfgs = [
+        _or_cfg(cid=-2 - i, model_name=f"healthy/m-{i}", static_score=60)
+        for i in range(10)
+    ]
+    cfg["quality_score_health"] = None
+    cfg["quality_score"] = cfg["quality_score_static"]
+
+    def mixed(url: str) -> _StubResponse:
+        if "anthropic" in url:
+            return _StubResponse(payload={}, status_code=500)
+        return _StubResponse(payload=_healthy_payload())
+
+    _patch_async_client(monkeypatch, mixed)
+    await service._enrich_health([cfg, *other_cfgs])
+
+    assert cfg["quality_score_health"] == cached_score
+    assert cfg["health_gated"] is False
+
+
+# ---------------------------------------------------------------------------
+# Bounded fan-out: respects top-N caps
+# ---------------------------------------------------------------------------
+
+
+async def test_enrich_health_bounds_premium_fanout(monkeypatch):
+    """Top-N premium cap is honoured even when many cfgs are present."""
+    from app.services.quality_score import _HEALTH_ENRICH_TOP_N_PREMIUM
+
+    cfgs = [
+        _or_cfg(
+            cid=-i, model_name=f"openai/m-{i}", tier="premium", static_score=100 - i
+        )
+        for i in range(1, _HEALTH_ENRICH_TOP_N_PREMIUM + 20)
+    ]
+
+    seen: list[str] = []
+
+    def responder(url: str) -> _StubResponse:
+        seen.append(url)
+        return _StubResponse(payload=_healthy_payload())
+
+    _patch_async_client(monkeypatch, responder)
+
+    service = OpenRouterIntegrationService()
+    service._settings = {}
+    await service._enrich_health(cfgs)
+
+    assert len(seen) == _HEALTH_ENRICH_TOP_N_PREMIUM
+
+
+async def test_enrich_health_no_or_cfgs_is_noop(monkeypatch):
+    """When the catalogue has no OR cfgs at all, no HTTP calls fire."""
+    yaml_cfg: dict[str, Any] = {
+        "id": -1,
+        "provider": "AZURE_OPENAI",
+        "model_name": "gpt-5",
+        "billing_tier": "premium",
+    }
+    requests: list[str] = []
+
+    def responder(url: str) -> _StubResponse:
+        requests.append(url)
+        return _StubResponse(payload=_healthy_payload())
+
+    _patch_async_client(monkeypatch, responder)
+
+    service = OpenRouterIntegrationService()
+    service._settings = {}
+    await service._enrich_health([yaml_cfg])
+    assert requests == []
diff --git a/surfsense_backend/tests/unit/services/test_pricing_registration.py b/surfsense_backend/tests/unit/services/test_pricing_registration.py
new file mode 100644
index 0000000000..e97250ff26
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_pricing_registration.py
@@ -0,0 +1,447 @@
+"""Pricing registration unit tests.
+
+The pricing-registration module is what makes ``response_cost`` populate
+correctly for OpenRouter dynamic models and operator-defined Azure
+deployments — both of which LiteLLM doesn't natively know about. The tests
+exercise:
+
+* The alias generators emit every shape that LiteLLM's cost-callback might
+  use (``openrouter/X`` and bare ``X``; YAML-defined ``base_model``,
+  ``provider/base_model``, ``provider/model_name``, plus the special
+  ``azure_openai`` → ``azure`` normalisation).
+* ``register_pricing_from_global_configs`` calls ``litellm.register_model``
+  with the right alias set and pricing values per provider.
+* Configs without a resolvable pair of cost values are skipped — never
+  registered as zero, since that would override pricing LiteLLM might
+  already know natively.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Alias generators
+# ---------------------------------------------------------------------------
+
+
+def test_openrouter_alias_set_includes_prefixed_and_bare():
+    from app.services.pricing_registration import _alias_set_for_openrouter
+
+    aliases = _alias_set_for_openrouter("anthropic/claude-3-5-sonnet")
+    assert aliases == [
+        "openrouter/anthropic/claude-3-5-sonnet",
+        "anthropic/claude-3-5-sonnet",
+    ]
+
+
+def test_openrouter_alias_set_dedupes():
+    """If the model id is already prefixed with ``openrouter/``, the alias
+    set must not contain duplicates that would re-register the same key
+    twice.
+    """
+    from app.services.pricing_registration import _alias_set_for_openrouter
+
+    aliases = _alias_set_for_openrouter("openrouter/foo")
+    # The bare and prefixed variants compute to the same string here, so we
+    # at minimum require uniqueness.
+    assert len(aliases) == len(set(aliases))
+
+
+def test_yaml_alias_set_for_azure_openai_normalises_to_azure():
+    """``azure_openai`` (our YAML provider slug) must register under
+    ``azure/<name>`` so the LiteLLM Router's deployment-resolution path
+    (which uses provider ``azure``) finds the pricing too.
+    """
+    from app.services.pricing_registration import _alias_set_for_yaml
+
+    aliases = _alias_set_for_yaml(
+        provider="AZURE_OPENAI",
+        model_name="gpt-5.4",
+        base_model="gpt-5.4",
+    )
+    assert "gpt-5.4" in aliases
+    assert "azure_openai/gpt-5.4" in aliases
+    assert "azure/gpt-5.4" in aliases
+
+
+def test_yaml_alias_set_distinguishes_model_name_and_base_model():
+    """When ``model_name`` differs from ``base_model`` (operator labelled a
+    deployment), both must appear in the alias set since either may surface
+    in callbacks depending on the call path.
+    """
+    from app.services.pricing_registration import _alias_set_for_yaml
+
+    aliases = _alias_set_for_yaml(
+        provider="OPENAI",
+        model_name="my-deployment-label",
+        base_model="gpt-4o",
+    )
+    assert "gpt-4o" in aliases
+    assert "openai/gpt-4o" in aliases
+    assert "my-deployment-label" in aliases
+    assert "openai/my-deployment-label" in aliases
+
+
+def test_yaml_alias_set_omits_provider_prefix_when_provider_blank():
+    from app.services.pricing_registration import _alias_set_for_yaml
+
+    aliases = _alias_set_for_yaml(
+        provider="",
+        model_name="foo",
+        base_model="bar",
+    )
+    assert "bar" in aliases
+    assert "foo" in aliases
+    assert all("/" not in a for a in aliases)
+
+
+# ---------------------------------------------------------------------------
+# register_pricing_from_global_configs
+# ---------------------------------------------------------------------------
+
+
+class _RegistrationSpy:
+    """Captures the dicts passed to ``litellm.register_model``.
+
+    Many calls may go through; we just record them all and let tests assert
+    against the union.
+    """
+
+    def __init__(self) -> None:
+        self.calls: list[dict[str, Any]] = []
+
+    def __call__(self, payload: dict[str, Any]) -> None:
+        self.calls.append(payload)
+
+    @property
+    def all_keys(self) -> set[str]:
+        keys: set[str] = set()
+        for payload in self.calls:
+            keys.update(payload.keys())
+        return keys
+
+
+def _patch_register(monkeypatch: pytest.MonkeyPatch) -> _RegistrationSpy:
+    spy = _RegistrationSpy()
+    monkeypatch.setattr(
+        "app.services.pricing_registration.litellm.register_model",
+        spy,
+        raising=False,
+    )
+    return spy
+
+
+def _patch_openrouter_pricing(
+    monkeypatch: pytest.MonkeyPatch, mapping: dict[str, dict[str, str]]
+) -> None:
+    """Pretend the OpenRouter integration is initialised with ``mapping``."""
+
+    class _Stub:
+        def get_raw_pricing(self) -> dict[str, dict[str, str]]:
+            return mapping
+
+    class _StubService:
+        @classmethod
+        def is_initialized(cls) -> bool:
+            return True
+
+        @classmethod
+        def get_instance(cls) -> _Stub:
+            return _Stub()
+
+    monkeypatch.setattr(
+        "app.services.openrouter_integration_service.OpenRouterIntegrationService",
+        _StubService,
+        raising=False,
+    )
+
+
+def test_openrouter_models_register_under_aliases(monkeypatch):
+    """An OpenRouter config whose ``model_name`` is in the cached raw
+    pricing map is registered under both ``openrouter/X`` and bare ``X``.
+    """
+    from app.config import config
+    from app.services.pricing_registration import register_pricing_from_global_configs
+
+    spy = _patch_register(monkeypatch)
+    _patch_openrouter_pricing(
+        monkeypatch,
+        {
+            "anthropic/claude-3-5-sonnet": {
+                "prompt": "0.000003",
+                "completion": "0.000015",
+            }
+        },
+    )
+
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": 1,
+                "provider": "OPENROUTER",
+                "model_name": "anthropic/claude-3-5-sonnet",
+            }
+        ],
+    )
+
+    register_pricing_from_global_configs()
+
+    assert "openrouter/anthropic/claude-3-5-sonnet" in spy.all_keys
+    assert "anthropic/claude-3-5-sonnet" in spy.all_keys
+    # Costs are float-converted from the raw OpenRouter strings.
+    payload = spy.calls[0]
+    assert payload["openrouter/anthropic/claude-3-5-sonnet"][
+        "input_cost_per_token"
+    ] == pytest.approx(3e-6)
+    assert payload["openrouter/anthropic/claude-3-5-sonnet"][
+        "output_cost_per_token"
+    ] == pytest.approx(15e-6)
+    assert (
+        payload["openrouter/anthropic/claude-3-5-sonnet"]["litellm_provider"]
+        == "openrouter"
+    )
+
+
+def test_yaml_override_registers_under_alias_set(monkeypatch):
+    """Operator-declared ``input_cost_per_token`` /
+    ``output_cost_per_token`` on a YAML config registers under every
+    alias the YAML alias generator produces — including the ``azure/``
+    normalisation for ``azure_openai`` providers.
+    """
+    from app.config import config
+    from app.services.pricing_registration import register_pricing_from_global_configs
+
+    spy = _patch_register(monkeypatch)
+    _patch_openrouter_pricing(monkeypatch, {})
+
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": 1,
+                "provider": "AZURE_OPENAI",
+                "model_name": "gpt-5.4",
+                "litellm_params": {
+                    "base_model": "gpt-5.4",
+                    "input_cost_per_token": 2e-6,
+                    "output_cost_per_token": 8e-6,
+                },
+            }
+        ],
+    )
+
+    register_pricing_from_global_configs()
+
+    keys = spy.all_keys
+    assert "gpt-5.4" in keys
+    assert "azure_openai/gpt-5.4" in keys
+    assert "azure/gpt-5.4" in keys
+
+    payload = spy.calls[0]
+    entry = payload["gpt-5.4"]
+    assert entry["input_cost_per_token"] == pytest.approx(2e-6)
+    assert entry["output_cost_per_token"] == pytest.approx(8e-6)
+    assert entry["litellm_provider"] == "azure"
+
+
+def test_no_override_means_no_registration(monkeypatch):
+    """A YAML config that *omits* both pricing fields must NOT be registered
+    — registering as zero would override LiteLLM's native pricing for the
+    ``base_model`` key (e.g. ``gpt-4o``) and silently make every user's
+    bill drop to $0. Fail-safe is "skip and warn", not "register zero".
+    """
+    from app.config import config
+    from app.services.pricing_registration import register_pricing_from_global_configs
+
+    spy = _patch_register(monkeypatch)
+    _patch_openrouter_pricing(monkeypatch, {})
+
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": 1,
+                "provider": "OPENAI",
+                "model_name": "gpt-4o",
+                "litellm_params": {"base_model": "gpt-4o"},
+            }
+        ],
+    )
+
+    register_pricing_from_global_configs()
+
+    assert spy.calls == []
+
+
+def test_openrouter_skipped_when_pricing_missing(monkeypatch):
+    """If the OpenRouter raw-pricing cache doesn't carry an entry for a
+    configured model (network blip during refresh, model added later, etc.),
+    we skip it rather than registering zero pricing.
+    """
+    from app.config import config
+    from app.services.pricing_registration import register_pricing_from_global_configs
+
+    spy = _patch_register(monkeypatch)
+    _patch_openrouter_pricing(
+        monkeypatch, {"some/other-model": {"prompt": "1", "completion": "1"}}
+    )
+
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": 1,
+                "provider": "OPENROUTER",
+                "model_name": "anthropic/claude-3-5-sonnet",
+            }
+        ],
+    )
+
+    register_pricing_from_global_configs()
+
+    assert spy.calls == []
+
+
+def test_register_continues_after_individual_failure(monkeypatch, caplog):
+    """A single bad ``register_model`` call (e.g. raising LiteLLM error)
+    must not abort registration of the remaining configs.
+    """
+    from app.config import config
+    from app.services.pricing_registration import register_pricing_from_global_configs
+
+    failing_keys: set[str] = {"anthropic/claude-3-5-sonnet"}
+    successful_calls: list[dict[str, Any]] = []
+
+    def _maybe_fail(payload: dict[str, Any]) -> None:
+        if any(k in failing_keys for k in payload):
+            raise RuntimeError("boom")
+        successful_calls.append(payload)
+
+    monkeypatch.setattr(
+        "app.services.pricing_registration.litellm.register_model",
+        _maybe_fail,
+        raising=False,
+    )
+    _patch_openrouter_pricing(
+        monkeypatch,
+        {
+            "anthropic/claude-3-5-sonnet": {
+                "prompt": "0.000003",
+                "completion": "0.000015",
+            }
+        },
+    )
+
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_LLM_CONFIGS",
+        [
+            {
+                "id": 1,
+                "provider": "OPENROUTER",
+                "model_name": "anthropic/claude-3-5-sonnet",
+            },
+            {
+                "id": 2,
+                "provider": "OPENAI",
+                "model_name": "custom-deployment",
+                "litellm_params": {
+                    "base_model": "custom-deployment",
+                    "input_cost_per_token": 1e-6,
+                    "output_cost_per_token": 2e-6,
+                },
+            },
+        ],
+    )
+
+    register_pricing_from_global_configs()
+
+    # The good config still registered.
+    assert any("custom-deployment" in payload for payload in successful_calls)
+
+
+def test_vision_configs_registered_with_chat_shape(monkeypatch):
+    """``register_pricing_from_global_configs`` walks
+    ``GLOBAL_VISION_LLM_CONFIGS`` in addition to the chat configs so vision
+    calls (during indexing) bill correctly. Vision configs use the same
+    chat-shape token prices, but image-gen pricing is intentionally NOT
+    registered here (handled via ``response_cost`` in LiteLLM).
+    """
+    from app.config import config
+    from app.services.pricing_registration import register_pricing_from_global_configs
+
+    spy = _patch_register(monkeypatch)
+    _patch_openrouter_pricing(
+        monkeypatch,
+        {"openai/gpt-4o": {"prompt": "0.000005", "completion": "0.000015"}},
+    )
+
+    # No chat configs — only vision. Proves the vision walk is a separate
+    # iteration, not piggy-backed on the chat list.
+    monkeypatch.setattr(config, "GLOBAL_LLM_CONFIGS", [])
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_VISION_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENROUTER",
+                "model_name": "openai/gpt-4o",
+                "billing_tier": "premium",
+                "input_cost_per_token": 5e-6,
+                "output_cost_per_token": 15e-6,
+            }
+        ],
+    )
+
+    register_pricing_from_global_configs()
+
+    assert "openrouter/openai/gpt-4o" in spy.all_keys
+    payload_value = spy.calls[0]["openrouter/openai/gpt-4o"]
+    assert payload_value["mode"] == "chat"
+    assert payload_value["litellm_provider"] == "openrouter"
+    assert payload_value["input_cost_per_token"] == pytest.approx(5e-6)
+    assert payload_value["output_cost_per_token"] == pytest.approx(15e-6)
+
+
+def test_vision_with_inline_pricing_when_or_cache_missing(monkeypatch):
+    """If the OpenRouter pricing cache misses a vision model (different
+    catalogue surface), the vision walk falls back to inline
+    ``input_cost_per_token``/``output_cost_per_token`` on the cfg itself.
+    """
+    from app.config import config
+    from app.services.pricing_registration import register_pricing_from_global_configs
+
+    spy = _patch_register(monkeypatch)
+    _patch_openrouter_pricing(monkeypatch, {})
+
+    monkeypatch.setattr(config, "GLOBAL_LLM_CONFIGS", [])
+    monkeypatch.setattr(
+        config,
+        "GLOBAL_VISION_LLM_CONFIGS",
+        [
+            {
+                "id": -1,
+                "provider": "OPENROUTER",
+                "model_name": "google/gemini-2.5-flash",
+                "billing_tier": "premium",
+                "input_cost_per_token": 1e-6,
+                "output_cost_per_token": 4e-6,
+            }
+        ],
+    )
+
+    register_pricing_from_global_configs()
+
+    assert "openrouter/google/gemini-2.5-flash" in spy.all_keys
diff --git a/surfsense_backend/tests/unit/services/test_provider_api_base.py b/surfsense_backend/tests/unit/services/test_provider_api_base.py
new file mode 100644
index 0000000000..12cd0a3d5a
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_provider_api_base.py
@@ -0,0 +1,107 @@
+"""Unit tests for the shared ``api_base`` resolver.
+
+The cascade exists so vision and image-gen call sites can't silently
+inherit ``litellm.api_base`` (commonly set by ``AZURE_OPENAI_ENDPOINT``)
+when an OpenRouter / Groq / etc. config ships an empty string. See
+``provider_api_base`` module docstring for the original repro
+(OpenRouter image-gen 404-ing against an Azure endpoint).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.services.provider_api_base import (
+    PROVIDER_DEFAULT_API_BASE,
+    PROVIDER_KEY_DEFAULT_API_BASE,
+    resolve_api_base,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def test_config_value_wins_over_defaults():
+    """A non-empty config value is always returned verbatim, even when the
+    provider has a default — the operator gets the last word."""
+    result = resolve_api_base(
+        provider="OPENROUTER",
+        provider_prefix="openrouter",
+        config_api_base="https://my-openrouter-mirror.example.com/v1",
+    )
+    assert result == "https://my-openrouter-mirror.example.com/v1"
+
+
+def test_provider_key_default_when_config_missing():
+    """``DEEPSEEK`` shares the ``openai`` LiteLLM prefix but has its own
+    base URL — the provider-key map must take precedence over the prefix
+    map so DeepSeek requests don't go to OpenAI."""
+    result = resolve_api_base(
+        provider="DEEPSEEK",
+        provider_prefix="openai",
+        config_api_base=None,
+    )
+    assert result == PROVIDER_KEY_DEFAULT_API_BASE["DEEPSEEK"]
+
+
+def test_provider_prefix_default_when_no_key_default():
+    result = resolve_api_base(
+        provider="OPENROUTER",
+        provider_prefix="openrouter",
+        config_api_base=None,
+    )
+    assert result == PROVIDER_DEFAULT_API_BASE["openrouter"]
+
+
+def test_unknown_provider_returns_none():
+    """When neither map matches we return ``None`` so the caller can let
+    LiteLLM apply its own provider-integration default (Azure deployment
+    URL, custom-provider URL, etc.)."""
+    result = resolve_api_base(
+        provider="SOMETHING_NEW",
+        provider_prefix="something_new",
+        config_api_base=None,
+    )
+    assert result is None
+
+
+def test_empty_string_config_treated_as_missing():
+    """The original bug: OpenRouter dynamic configs ship ``api_base=""``
+    and downstream call sites use ``if cfg.get("api_base"):`` — empty
+    strings are falsy in Python but the cascade has to step in anyway."""
+    result = resolve_api_base(
+        provider="OPENROUTER",
+        provider_prefix="openrouter",
+        config_api_base="",
+    )
+    assert result == PROVIDER_DEFAULT_API_BASE["openrouter"]
+
+
+def test_whitespace_only_config_treated_as_missing():
+    """A config value of ``"   "`` is a configuration mistake — treat it
+    as missing instead of forwarding whitespace to LiteLLM (which would
+    almost certainly 404)."""
+    result = resolve_api_base(
+        provider="OPENROUTER",
+        provider_prefix="openrouter",
+        config_api_base="   ",
+    )
+    assert result == PROVIDER_DEFAULT_API_BASE["openrouter"]
+
+
+def test_provider_case_insensitive():
+    """Some call sites pass the provider lowercase (DB enum value), others
+    uppercase (YAML key). Both must resolve."""
+    upper = resolve_api_base(
+        provider="DEEPSEEK", provider_prefix="openai", config_api_base=None
+    )
+    lower = resolve_api_base(
+        provider="deepseek", provider_prefix="openai", config_api_base=None
+    )
+    assert upper == lower == PROVIDER_KEY_DEFAULT_API_BASE["DEEPSEEK"]
+
+
+def test_all_inputs_none_returns_none():
+    assert (
+        resolve_api_base(provider=None, provider_prefix=None, config_api_base=None)
+        is None
+    )
diff --git a/surfsense_backend/tests/unit/services/test_provider_capabilities.py b/surfsense_backend/tests/unit/services/test_provider_capabilities.py
new file mode 100644
index 0000000000..aac88977fa
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_provider_capabilities.py
@@ -0,0 +1,244 @@
+"""Unit tests for the shared chat-image capability resolver.
+
+Two resolvers, two intents:
+
+- ``derive_supports_image_input`` — best-effort True for the catalog and
+  selector. Default-allow on unknown / unmapped models. The streaming
+  task safety net never sees this value directly.
+
+- ``is_known_text_only_chat_model`` — strict opt-out for the safety net.
+  Returns True only when LiteLLM's model map *explicitly* sets
+  ``supports_vision=False``. Anything else (missing key, exception,
+  True) returns False so the request flows through to the provider.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.services.provider_capabilities import (
+    derive_supports_image_input,
+    is_known_text_only_chat_model,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# derive_supports_image_input — OpenRouter modalities path (authoritative)
+# ---------------------------------------------------------------------------
+
+
+def test_or_modalities_with_image_returns_true():
+    assert (
+        derive_supports_image_input(
+            provider="OPENROUTER",
+            model_name="openai/gpt-4o",
+            openrouter_input_modalities=["text", "image"],
+        )
+        is True
+    )
+
+
+def test_or_modalities_text_only_returns_false():
+    assert (
+        derive_supports_image_input(
+            provider="OPENROUTER",
+            model_name="deepseek/deepseek-v3.2-exp",
+            openrouter_input_modalities=["text"],
+        )
+        is False
+    )
+
+
+def test_or_modalities_empty_list_returns_false():
+    """OR explicitly publishing an empty modality list is a definitive
+    'no inputs at all' signal — treat as False rather than falling back
+    to LiteLLM."""
+    assert (
+        derive_supports_image_input(
+            provider="OPENROUTER",
+            model_name="weird/empty-modalities",
+            openrouter_input_modalities=[],
+        )
+        is False
+    )
+
+
+def test_or_modalities_none_falls_through_to_litellm():
+    """``None`` (missing key) is *not* a definitive signal — fall through
+    to LiteLLM. Using ``openai/gpt-4o`` which is in LiteLLM's map."""
+    assert (
+        derive_supports_image_input(
+            provider="OPENAI",
+            model_name="gpt-4o",
+            openrouter_input_modalities=None,
+        )
+        is True
+    )
+
+
+# ---------------------------------------------------------------------------
+# derive_supports_image_input — LiteLLM model-map path
+# ---------------------------------------------------------------------------
+
+
+def test_litellm_known_vision_model_returns_true():
+    assert (
+        derive_supports_image_input(
+            provider="OPENAI",
+            model_name="gpt-4o",
+        )
+        is True
+    )
+
+
+def test_litellm_base_model_wins_over_model_name():
+    """Azure-style entries pass model_name=deployment_id and put the
+    canonical sku in litellm_params.base_model. The resolver must
+    consult base_model first or the deployment id (which LiteLLM
+    doesn't know) would shadow the real capability."""
+    assert (
+        derive_supports_image_input(
+            provider="AZURE_OPENAI",
+            model_name="my-azure-deployment-id",
+            base_model="gpt-4o",
+        )
+        is True
+    )
+
+
+def test_litellm_unknown_model_default_allows():
+    """Default-allow on unknown — the safety net is the actual block."""
+    assert (
+        derive_supports_image_input(
+            provider="CUSTOM",
+            model_name="brand-new-model-x9-unmapped",
+            custom_provider="brand_new_proxy",
+        )
+        is True
+    )
+
+
+def test_litellm_known_text_only_returns_false():
+    """A model that LiteLLM explicitly knows is text-only resolves to
+    False even via the catalog resolver. ``deepseek-chat`` (the
+    DeepSeek-V3 chat sku) is in the map without supports_vision and
+    LiteLLM's `supports_vision` returns False."""
+    # Sanity: confirm the helper's negative path. We use a small model
+    # known not to support vision per the map.
+    result = derive_supports_image_input(
+        provider="DEEPSEEK",
+        model_name="deepseek-chat",
+    )
+    # We accept either False (LiteLLM said explicit no) or True
+    # (default-allow if the entry isn't mapped on this version) — the
+    # invariant is that the resolver never *raises* on a known-text-only
+    # provider/model. The behaviour-binding assertion lives in
+    # ``test_is_known_text_only_chat_model_explicit_false`` below.
+    assert isinstance(result, bool)
+
+
+# ---------------------------------------------------------------------------
+# is_known_text_only_chat_model — strict opt-out semantics
+# ---------------------------------------------------------------------------
+
+
+def test_is_known_text_only_returns_false_for_vision_model():
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="gpt-4o",
+        )
+        is False
+    )
+
+
+def test_is_known_text_only_returns_false_for_unknown_model():
+    """Strict opt-out: missing from the map ≠ text-only. The safety net
+    must NOT fire for an unmapped model — that's the regression we're
+    fixing."""
+    assert (
+        is_known_text_only_chat_model(
+            provider="CUSTOM",
+            model_name="brand-new-model-x9-unmapped",
+            custom_provider="brand_new_proxy",
+        )
+        is False
+    )
+
+
+def test_is_known_text_only_returns_false_when_lookup_raises(monkeypatch):
+    """LiteLLM's ``get_model_info`` raises freely on parse errors. The
+    helper swallows the exception and returns False so the safety net
+    doesn't fire on a transient lookup failure."""
+    import app.services.provider_capabilities as pc
+
+    def _raise(**_kwargs):
+        raise ValueError("intentional test failure")
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _raise)
+
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="gpt-4o",
+        )
+        is False
+    )
+
+
+def test_is_known_text_only_returns_true_on_explicit_false(monkeypatch):
+    """Stub LiteLLM's ``get_model_info`` to return an explicit False so
+    we exercise the opt-out path deterministically. Using a stub keeps
+    the test stable across LiteLLM map updates."""
+    import app.services.provider_capabilities as pc
+
+    def _info(**_kwargs):
+        return {"supports_vision": False, "max_input_tokens": 8192}
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _info)
+
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="any-model",
+        )
+        is True
+    )
+
+
+def test_is_known_text_only_returns_false_on_supports_vision_true(monkeypatch):
+    import app.services.provider_capabilities as pc
+
+    def _info(**_kwargs):
+        return {"supports_vision": True}
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _info)
+
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="any-model",
+        )
+        is False
+    )
+
+
+def test_is_known_text_only_returns_false_on_missing_key(monkeypatch):
+    """A model entry without ``supports_vision`` at all is treated as
+    'unknown' — strict opt-out means False."""
+    import app.services.provider_capabilities as pc
+
+    def _info(**_kwargs):
+        return {"max_input_tokens": 8192}  # no supports_vision
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _info)
+
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="any-model",
+        )
+        is False
+    )
diff --git a/surfsense_backend/tests/unit/services/test_quality_score.py b/surfsense_backend/tests/unit/services/test_quality_score.py
new file mode 100644
index 0000000000..6fbc8fd62e
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_quality_score.py
@@ -0,0 +1,345 @@
+"""Unit tests for the Auto (Fastest) quality scoring module."""
+
+from __future__ import annotations
+
+import time
+
+import pytest
+
+from app.services.quality_score import (
+    _HEALTH_GATE_UPTIME_PCT,
+    _OPERATOR_TRUST_BONUS,
+    aggregate_health,
+    capabilities_signal,
+    context_signal,
+    created_recency_signal,
+    pricing_band,
+    slug_penalty,
+    static_score_or,
+    static_score_yaml,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# created_recency_signal
+# ---------------------------------------------------------------------------
+
+
+def test_created_recency_signal_recent_model_scores_high():
+    now = 1_750_000_000  # ~mid-2025
+    one_month_ago = now - (30 * 86_400)
+    assert created_recency_signal(one_month_ago, now) == 20
+
+
+def test_created_recency_signal_old_model_scores_zero():
+    now = 1_750_000_000
+    five_years_ago = now - (5 * 365 * 86_400)
+    assert created_recency_signal(five_years_ago, now) == 0
+
+
+def test_created_recency_signal_missing_timestamp_is_neutral():
+    now = 1_750_000_000
+    assert created_recency_signal(None, now) == 0
+    assert created_recency_signal(0, now) == 0
+
+
+def test_created_recency_signal_monotonic_decay():
+    now = 1_750_000_000
+    scores = [
+        created_recency_signal(now - days * 86_400, now)
+        for days in (30, 120, 300, 500, 700, 1000, 1500)
+    ]
+    assert scores == sorted(scores, reverse=True)
+
+
+# ---------------------------------------------------------------------------
+# pricing_band
+# ---------------------------------------------------------------------------
+
+
+def test_pricing_band_free_returns_zero():
+    assert pricing_band("0", "0") == 0
+    assert pricing_band(0.0, 0.0) == 0
+    assert pricing_band(None, None) == 0
+
+
+def test_pricing_band_handles_unparseable():
+    assert pricing_band("not-a-number", "0") == 0
+    assert pricing_band({}, []) == 0  # type: ignore[arg-type]
+
+
+def test_pricing_band_premium_tiers_increase_with_price():
+    cheap = pricing_band("0.0000003", "0.0000005")
+    mid = pricing_band("0.000003", "0.000015")
+    flagship = pricing_band("0.00001", "0.00005")
+    assert 0 < cheap < mid < flagship
+
+
+# ---------------------------------------------------------------------------
+# context_signal
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "ctx,expected",
+    [
+        (1_500_000, 10),
+        (1_000_000, 10),
+        (500_000, 8),
+        (200_000, 6),
+        (128_000, 4),
+        (100_000, 2),
+        (50_000, 0),
+        (0, 0),
+        (None, 0),
+    ],
+)
+def test_context_signal_bands(ctx, expected):
+    assert context_signal(ctx) == expected
+
+
+# ---------------------------------------------------------------------------
+# capabilities_signal
+# ---------------------------------------------------------------------------
+
+
+def test_capabilities_signal_caps_at_five():
+    assert (
+        capabilities_signal(
+            ["tools", "structured_outputs", "reasoning", "include_reasoning"]
+        )
+        <= 5
+    )
+
+
+def test_capabilities_signal_tools_only():
+    assert capabilities_signal(["tools"]) == 2
+
+
+def test_capabilities_signal_empty():
+    assert capabilities_signal(None) == 0
+    assert capabilities_signal([]) == 0
+
+
+# ---------------------------------------------------------------------------
+# slug_penalty
+# ---------------------------------------------------------------------------
+
+
+def test_slug_penalty_demotes_tiny_models():
+    assert slug_penalty("meta-llama/llama-3.2-1b-instruct") < 0
+    assert slug_penalty("liquid/lfm-7b") < 0
+    assert slug_penalty("google/gemma-3n-e4b-it") < 0
+
+
+def test_slug_penalty_skips_capable_mini_nano_lite_models():
+    """Critical Option C+ regression: don't penalise modern frontier
+    models named ``-nano`` / ``-mini`` / ``-lite`` (gpt-5-mini, etc.)."""
+    assert slug_penalty("openai/gpt-5-mini") == 0
+    assert slug_penalty("openai/gpt-5-nano") == 0
+    assert slug_penalty("google/gemini-2.5-flash-lite") == 0
+    assert slug_penalty("anthropic/claude-haiku-4.5") == 0
+
+
+def test_slug_penalty_demotes_legacy_variants():
+    assert slug_penalty("openai/o1-preview") < 0
+    assert slug_penalty("foo/bar-base") < 0
+    assert slug_penalty("foo/bar-distill") < 0
+
+
+def test_slug_penalty_empty_input():
+    assert slug_penalty("") == 0
+
+
+# ---------------------------------------------------------------------------
+# static_score_or
+# ---------------------------------------------------------------------------
+
+
+def _or_model(
+    *,
+    model_id: str,
+    created: int | None = None,
+    prompt: str = "0.000003",
+    completion: str = "0.000015",
+    context: int = 200_000,
+    params: list[str] | None = None,
+) -> dict:
+    return {
+        "id": model_id,
+        "created": created,
+        "pricing": {"prompt": prompt, "completion": completion},
+        "context_length": context,
+        "supported_parameters": params if params is not None else ["tools"],
+    }
+
+
+def test_static_score_or_frontier_premium_beats_free_tiny():
+    now = 1_750_000_000
+    frontier = _or_model(
+        model_id="openai/gpt-5",
+        created=now - (60 * 86_400),
+        prompt="0.000005",
+        completion="0.000020",
+        context=400_000,
+        params=["tools", "structured_outputs", "reasoning"],
+    )
+    tiny_free = _or_model(
+        model_id="meta-llama/llama-3.2-1b-instruct:free",
+        created=now - (5 * 365 * 86_400),
+        prompt="0",
+        completion="0",
+        context=128_000,
+        params=["tools"],
+    )
+    assert static_score_or(frontier, now_ts=now) > static_score_or(
+        tiny_free, now_ts=now
+    )
+
+
+def test_static_score_or_score_is_clamped_0_to_100():
+    now = int(time.time())
+    score = static_score_or(_or_model(model_id="openai/gpt-4o"), now_ts=now)
+    assert 0 <= score <= 100
+
+
+def test_static_score_or_unknown_provider_is_neutral_not_zero():
+    now = int(time.time())
+    score = static_score_or(
+        _or_model(model_id="some-new-lab/some-model"),
+        now_ts=now,
+    )
+    assert score > 0
+
+
+def test_static_score_or_recent_release_beats_year_old_same_provider():
+    now = 1_750_000_000
+    fresh = _or_model(model_id="openai/gpt-5", created=now - (60 * 86_400))
+    old = _or_model(model_id="openai/gpt-4-turbo", created=now - (700 * 86_400))
+    assert static_score_or(fresh, now_ts=now) > static_score_or(old, now_ts=now)
+
+
+# ---------------------------------------------------------------------------
+# static_score_yaml
+# ---------------------------------------------------------------------------
+
+
+def test_static_score_yaml_includes_operator_bonus():
+    cfg = {
+        "provider": "AZURE_OPENAI",
+        "model_name": "gpt-5",
+        "litellm_params": {"base_model": "azure/gpt-5"},
+    }
+    score = static_score_yaml(cfg)
+    assert score >= _OPERATOR_TRUST_BONUS
+
+
+def test_static_score_yaml_unknown_provider_still_carries_bonus():
+    cfg = {
+        "provider": "SOME_NEW_PROVIDER",
+        "model_name": "weird-model",
+    }
+    score = static_score_yaml(cfg)
+    assert score >= _OPERATOR_TRUST_BONUS
+
+
+def test_static_score_yaml_clamped_0_to_100():
+    cfg = {
+        "provider": "AZURE_OPENAI",
+        "model_name": "gpt-5",
+        "litellm_params": {"base_model": "azure/gpt-5"},
+    }
+    assert 0 <= static_score_yaml(cfg) <= 100
+
+
+# ---------------------------------------------------------------------------
+# aggregate_health
+# ---------------------------------------------------------------------------
+
+
+def test_aggregate_health_gates_when_uptime_below_threshold():
+    """Live data showed Venice-routed cfgs at 53-68%; this guards that the
+    90% gate excludes them."""
+    venice_endpoints = [
+        {
+            "status": 0,
+            "uptime_last_30m": 0.55,
+            "uptime_last_1d": 0.60,
+            "uptime_last_5m": 0.50,
+        },
+        {
+            "status": 0,
+            "uptime_last_30m": 0.65,
+            "uptime_last_1d": 0.68,
+            "uptime_last_5m": 0.62,
+        },
+    ]
+    gated, score = aggregate_health(venice_endpoints)
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_passes_for_healthy_provider():
+    healthy = [
+        {
+            "status": 0,
+            "uptime_last_30m": 0.99,
+            "uptime_last_1d": 0.995,
+            "uptime_last_5m": 0.99,
+        },
+    ]
+    gated, score = aggregate_health(healthy)
+    assert gated is False
+    assert score is not None
+    assert score >= _HEALTH_GATE_UPTIME_PCT
+
+
+def test_aggregate_health_picks_best_endpoint_across_multiple():
+    """Multi-endpoint aggregation should reward the best non-null uptime."""
+    mixed = [
+        {"status": 0, "uptime_last_30m": 0.55},
+        {"status": 0, "uptime_last_30m": 0.97},  # this one passes the gate
+    ]
+    gated, score = aggregate_health(mixed)
+    assert gated is False
+    assert score is not None
+
+
+def test_aggregate_health_empty_endpoints_gated():
+    gated, score = aggregate_health([])
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_no_status_zero_gated():
+    """Even with high uptime, no OK status means the cfg is broken upstream."""
+    endpoints = [
+        {"status": 1, "uptime_last_30m": 0.99},
+        {"status": 2, "uptime_last_30m": 0.98},
+    ]
+    gated, score = aggregate_health(endpoints)
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_all_uptime_null_gated():
+    endpoints = [
+        {"status": 0, "uptime_last_30m": None, "uptime_last_1d": None},
+    ]
+    gated, score = aggregate_health(endpoints)
+    assert gated is True
+    assert score is None
+
+
+def test_aggregate_health_pct_normalisation():
+    """OpenRouter returns 0-1 fractions; some endpoints surface 0-100%
+    percentages. Both should reach the same gate decision."""
+    fraction_form = [{"status": 0, "uptime_last_30m": 0.95}]
+    pct_form = [{"status": 0, "uptime_last_30m": 95.0}]
+    g1, s1 = aggregate_health(fraction_form)
+    g2, s2 = aggregate_health(pct_form)
+    assert g1 == g2 == False  # noqa: E712
+    assert s1 is not None and s2 is not None
+    assert abs(s1 - s2) < 0.5
diff --git a/surfsense_backend/tests/unit/services/test_quota_checked_vision_llm.py b/surfsense_backend/tests/unit/services/test_quota_checked_vision_llm.py
new file mode 100644
index 0000000000..9e35b6f9c1
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_quota_checked_vision_llm.py
@@ -0,0 +1,157 @@
+"""Unit tests for ``QuotaCheckedVisionLLM``.
+
+Validates that:
+
+* Calling ``ainvoke`` routes through ``billable_call`` (premium credit
+  enforcement) and forwards the inner LLM's response on success.
+* The wrapper proxies non-overridden attributes to the inner LLM
+  (``__getattr__``) so ``invoke`` / ``astream`` / ``with_structured_output``
+  still work without quota gating (they're not used in indexing today).
+* When ``billable_call`` raises ``QuotaInsufficientError`` the wrapper
+  bubbles it up — the ETL pipeline catches that and falls back to OCR.
+"""
+
+from __future__ import annotations
+
+import contextlib
+from typing import Any
+from uuid import uuid4
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+class _FakeInnerLLM:
+    """Stand-in for ``langchain_litellm.ChatLiteLLM``."""
+
+    def __init__(self, response: Any = "OCR'd content") -> None:
+        self._response = response
+        self.ainvoke_calls: list[Any] = []
+
+    async def ainvoke(self, input: Any, *args: Any, **kwargs: Any) -> Any:
+        self.ainvoke_calls.append(input)
+        return self._response
+
+    def some_other_method(self, x: int) -> int:
+        return x * 2
+
+
+@contextlib.asynccontextmanager
+async def _passthrough_billable_call(**_kwargs):
+    """Stand-in for billable_call that always allows the call to run."""
+
+    class _Acc:
+        total_cost_micros = 0
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        grand_total = 0
+        calls: list[Any] = []
+
+        def per_message_summary(self) -> dict[str, dict[str, int]]:
+            return {}
+
+    yield _Acc()
+
+
+@pytest.mark.asyncio
+async def test_ainvoke_routes_through_billable_call(monkeypatch):
+    from app.services.quota_checked_vision_llm import QuotaCheckedVisionLLM
+
+    captured_kwargs: list[dict[str, Any]] = []
+
+    @contextlib.asynccontextmanager
+    async def _spy_billable_call(**kwargs):
+        captured_kwargs.append(kwargs)
+        async with _passthrough_billable_call() as acc:
+            yield acc
+
+    monkeypatch.setattr(
+        "app.services.quota_checked_vision_llm.billable_call",
+        _spy_billable_call,
+        raising=False,
+    )
+
+    inner = _FakeInnerLLM(response="A red apple on a white table")
+    user_id = uuid4()
+    wrapper = QuotaCheckedVisionLLM(
+        inner,
+        user_id=user_id,
+        search_space_id=99,
+        billing_tier="premium",
+        base_model="openai/gpt-4o",
+        quota_reserve_tokens=4000,
+    )
+
+    result = await wrapper.ainvoke([{"text": "what is this?"}])
+    assert result == "A red apple on a white table"
+    assert len(inner.ainvoke_calls) == 1
+    assert len(captured_kwargs) == 1
+    bc_kwargs = captured_kwargs[0]
+    assert bc_kwargs["user_id"] == user_id
+    assert bc_kwargs["search_space_id"] == 99
+    assert bc_kwargs["billing_tier"] == "premium"
+    assert bc_kwargs["base_model"] == "openai/gpt-4o"
+    assert bc_kwargs["quota_reserve_tokens"] == 4000
+    assert bc_kwargs["usage_type"] == "vision_extraction"
+
+
+@pytest.mark.asyncio
+async def test_ainvoke_propagates_quota_insufficient_error(monkeypatch):
+    from app.services.billable_calls import QuotaInsufficientError
+    from app.services.quota_checked_vision_llm import QuotaCheckedVisionLLM
+
+    @contextlib.asynccontextmanager
+    async def _denying_billable_call(**_kwargs):
+        raise QuotaInsufficientError(
+            usage_type="vision_extraction",
+            used_micros=5_000_000,
+            limit_micros=5_000_000,
+            remaining_micros=0,
+        )
+        yield  # unreachable but required for asynccontextmanager type
+
+    monkeypatch.setattr(
+        "app.services.quota_checked_vision_llm.billable_call",
+        _denying_billable_call,
+        raising=False,
+    )
+
+    inner = _FakeInnerLLM()
+    wrapper = QuotaCheckedVisionLLM(
+        inner,
+        user_id=uuid4(),
+        search_space_id=1,
+        billing_tier="premium",
+        base_model="openai/gpt-4o",
+        quota_reserve_tokens=4000,
+    )
+
+    with pytest.raises(QuotaInsufficientError):
+        await wrapper.ainvoke([{"text": "x"}])
+
+    # Inner LLM never ran on a denied reservation.
+    assert inner.ainvoke_calls == []
+
+
+@pytest.mark.asyncio
+async def test_proxies_non_overridden_attributes_to_inner():
+    """``__getattr__`` forwards anything not on the proxy itself, so any
+    method we didn't explicitly override (``invoke``, ``astream``,
+    ``with_structured_output``, etc.) still works — just without quota
+    gating, which is fine because the indexer only ever calls ainvoke.
+    """
+    from app.services.quota_checked_vision_llm import QuotaCheckedVisionLLM
+
+    inner = _FakeInnerLLM()
+    wrapper = QuotaCheckedVisionLLM(
+        inner,
+        user_id=uuid4(),
+        search_space_id=1,
+        billing_tier="premium",
+        base_model="openai/gpt-4o",
+        quota_reserve_tokens=4000,
+    )
+
+    # ``some_other_method`` is on the inner only.
+    assert wrapper.some_other_method(7) == 14
diff --git a/surfsense_backend/tests/unit/services/test_revert_filesystem_tools.py b/surfsense_backend/tests/unit/services/test_revert_filesystem_tools.py
new file mode 100644
index 0000000000..95314741ac
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_revert_filesystem_tools.py
@@ -0,0 +1,370 @@
+"""Unit tests for the filesystem-tool branches of ``revert_service``.
+
+Covers:
+
+* Exact-name dispatch — ``rmdir`` does NOT mis-route to the document
+  branch (``"rmdir".startswith("rm")`` would mis-route under the legacy
+  prefix-based dispatch).
+* ``rm`` revert re-INSERTs a fresh document from the snapshot, including
+  re-creating chunks. Falls back to ``(folder_id_before, title_before)``
+  when ``metadata_before["virtual_path"]`` is missing.
+* ``write_file`` create-revert (``content_before IS NULL``) DELETEs the
+  document.
+* ``rmdir`` revert re-INSERTs a fresh folder from the snapshot.
+* ``mkdir`` revert DELETEs the empty folder; reports ``tool_unavailable``
+  when the folder gained children.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+import numpy as np
+import pytest
+
+from app.services import revert_service
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture(autouse=True)
+def _stub_embeddings(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        revert_service,
+        "embed_texts",
+        lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
+    )
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+class _FakeResult:
+    def __init__(self, rows: list[Any] | None = None, scalar: Any = None) -> None:
+        self._rows = rows or []
+        self._scalar = scalar
+
+    def all(self) -> list[Any]:
+        return list(self._rows)
+
+    def scalar_one_or_none(self) -> Any:
+        return self._scalar
+
+    def scalars(self) -> Any:
+        return _FakeScalarsProxy(self._rows)
+
+
+class _FakeScalarsProxy:
+    def __init__(self, rows: list[Any]) -> None:
+        self._rows = rows
+
+    def first(self) -> Any:
+        return self._rows[0] if self._rows else None
+
+
+class _FakeSession:
+    def __init__(self) -> None:
+        self.execute = AsyncMock()
+        self.added: list[Any] = []
+        self.deleted: list[Any] = []
+        self.flush = AsyncMock()
+        # session.get(Model, pk) lookup
+        self.get = AsyncMock(return_value=None)
+
+        async def _flush_assigning_ids() -> None:
+            for obj in self.added:
+                if getattr(obj, "id", None) is None:
+                    obj.id = 999
+
+        self.flush.side_effect = _flush_assigning_ids
+
+    def add(self, obj: Any) -> None:
+        self.added.append(obj)
+
+    def add_all(self, objs: list[Any]) -> None:
+        self.added.extend(objs)
+
+
+def _action(*, tool_name: str, action_id: int = 7):
+    return MagicMock(
+        id=action_id,
+        tool_name=tool_name,
+        thread_id=1,
+        search_space_id=2,
+        user_id="user-1",
+        reverse_descriptor=None,
+    )
+
+
+def _doc_revision(
+    *,
+    document_id: int | None = None,
+    content_before: str | None = "old content",
+    title_before: str | None = "notes.md",
+    folder_id_before: int | None = 5,
+    chunks_before: list[dict[str, str]] | None = None,
+    metadata_before: dict[str, str] | None = None,
+):
+    revision = MagicMock()
+    revision.id = 100
+    revision.document_id = document_id
+    revision.search_space_id = 2
+    revision.content_before = content_before
+    revision.title_before = title_before
+    revision.folder_id_before = folder_id_before
+    revision.chunks_before = chunks_before or []
+    revision.metadata_before = metadata_before
+    return revision
+
+
+def _folder_revision(
+    *,
+    folder_id: int | None = None,
+    name_before: str | None = "team",
+    parent_id_before: int | None = None,
+    position_before: str | None = "a0",
+):
+    revision = MagicMock()
+    revision.id = 200
+    revision.folder_id = folder_id
+    revision.search_space_id = 2
+    revision.name_before = name_before
+    revision.parent_id_before = parent_id_before
+    revision.position_before = position_before
+    return revision
+
+
+# ---------------------------------------------------------------------------
+# Exact-name dispatch regression guards
+# ---------------------------------------------------------------------------
+
+
+class TestExactDispatch:
+    """Regression: ``rmdir`` MUST NOT route to the document branch."""
+
+    @pytest.mark.asyncio
+    async def test_rmdir_does_not_misroute_to_document(self) -> None:
+        # If dispatch used `startswith("rm")` we'd hit the document branch
+        # here. With exact-name lookup `rmdir` lands in `_FOLDER_TOOLS`.
+        session = _FakeSession()
+        action = _action(tool_name="rmdir")
+        # No folder revisions exist for this action.
+        session.execute.return_value = _FakeResult(rows=[])
+        outcome = await revert_service.revert_action(
+            session,  # type: ignore[arg-type]
+            action=action,
+            requester_user_id="user-1",
+        )
+        assert outcome.status == "not_reversible"
+        assert "folder_revisions" in outcome.message
+
+    def test_dispatch_sets_split_doc_and_folder(self) -> None:
+        # Static guards on the dispatch tables themselves so a future
+        # refactor doesn't accidentally reintroduce the prefix bug.
+        assert "rm" in revert_service._DOC_TOOLS
+        assert "rmdir" in revert_service._FOLDER_TOOLS
+        assert "rmdir" not in revert_service._DOC_TOOLS
+        assert "rm" not in revert_service._FOLDER_TOOLS
+        # ``move_file`` lives only in document tools (it's a doc rename).
+        assert "move_file" in revert_service._DOC_TOOLS
+        assert "move_file" not in revert_service._FOLDER_TOOLS
+
+
+# ---------------------------------------------------------------------------
+# rm revert (re-INSERT)
+# ---------------------------------------------------------------------------
+
+
+class TestRmRevert:
+    @pytest.mark.asyncio
+    async def test_re_inserts_document_with_chunks(self) -> None:
+        session = _FakeSession()
+        revision = _doc_revision(
+            document_id=None,  # row was hard-deleted
+            content_before="hello world",
+            title_before="x.md",
+            folder_id_before=None,
+            chunks_before=[{"content": "alpha"}, {"content": "beta"}],
+            metadata_before={"virtual_path": "/documents/x.md"},
+        )
+        # No collision check hit and the resulting query returns nothing.
+        session.execute.return_value = _FakeResult(scalar=None)
+
+        outcome = await revert_service._reinsert_document_from_revision(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+
+        assert outcome.status == "ok"
+        # New Document + 2 chunks must have been added.
+        from app.db import Chunk, Document
+
+        added_docs = [obj for obj in session.added if isinstance(obj, Document)]
+        added_chunks = [obj for obj in session.added if isinstance(obj, Chunk)]
+        assert len(added_docs) == 1
+        assert added_docs[0].title == "x.md"
+        assert len(added_chunks) == 2
+        # Snapshot was repointed at the new doc id so a follow-up revert works.
+        assert revision.document_id == added_docs[0].id
+
+    @pytest.mark.asyncio
+    async def test_falls_back_to_folder_id_and_title_for_virtual_path(
+        self,
+    ) -> None:
+        session = _FakeSession()
+        # Snapshot with NO metadata_before — the fallback path must kick in.
+        revision = _doc_revision(
+            document_id=None,
+            content_before="hello",
+            title_before="cap.md",
+            folder_id_before=42,
+            chunks_before=[],
+            metadata_before=None,
+        )
+        # session.get(Folder, 42) returns a folder with a name.
+        folder = MagicMock()
+        folder.name = "team"
+        folder.parent_id = None
+        # First .get is for the folder lookup in the path-derivation.
+        session.get = AsyncMock(return_value=folder)
+        session.execute.return_value = _FakeResult(scalar=None)
+
+        outcome = await revert_service._reinsert_document_from_revision(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+        assert outcome.status == "ok"
+
+    @pytest.mark.asyncio
+    async def test_falls_back_to_root_path_when_no_folder(
+        self,
+    ) -> None:
+        """metadata_before is None and folder_id_before is None still
+        resolves: title fallback yields ``/documents/<title>`` so revert
+        proceeds at the root of the documents tree."""
+        session = _FakeSession()
+        revision = _doc_revision(
+            document_id=None,
+            content_before="hello",
+            title_before="x.md",
+            folder_id_before=None,
+            metadata_before=None,
+        )
+        # No collision in the documents tree at /documents/x.md.
+        session.execute.return_value = _FakeResult(scalar=None)
+        outcome = await revert_service._reinsert_document_from_revision(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+        assert outcome.status == "ok"
+
+    @pytest.mark.asyncio
+    async def test_collision_with_live_doc_returns_tool_unavailable(self) -> None:
+        session = _FakeSession()
+        revision = _doc_revision(
+            document_id=None,
+            content_before="hi",
+            title_before="x.md",
+            folder_id_before=None,
+            metadata_before={"virtual_path": "/documents/x.md"},
+        )
+        # SELECT for unique_identifier_hash collision hits an existing row.
+        session.execute.return_value = _FakeResult(scalar=42)
+        outcome = await revert_service._reinsert_document_from_revision(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+        assert outcome.status == "tool_unavailable"
+        assert "collide" in outcome.message
+
+
+# ---------------------------------------------------------------------------
+# write_file create revert (DELETE)
+# ---------------------------------------------------------------------------
+
+
+class TestWriteFileCreateRevert:
+    @pytest.mark.asyncio
+    async def test_deletes_created_doc(self) -> None:
+        session = _FakeSession()
+        revision = _doc_revision(
+            document_id=99,
+            content_before=None,  # marker for "created in this action"
+            title_before=None,
+        )
+        outcome = await revert_service._delete_created_document(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+        assert outcome.status == "ok"
+        # Exactly one DELETE was issued.
+        assert session.execute.await_count == 1
+
+
+# ---------------------------------------------------------------------------
+# rmdir revert (re-INSERT folder)
+# ---------------------------------------------------------------------------
+
+
+class TestRmdirRevert:
+    @pytest.mark.asyncio
+    async def test_re_inserts_folder_from_snapshot(self) -> None:
+        session = _FakeSession()
+        revision = _folder_revision(
+            folder_id=None,
+            name_before="team",
+            parent_id_before=None,
+            position_before="a0",
+        )
+        outcome = await revert_service._reinsert_folder_from_revision(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+        from app.db import Folder
+
+        assert outcome.status == "ok"
+        added_folders = [obj for obj in session.added if isinstance(obj, Folder)]
+        assert len(added_folders) == 1
+        assert added_folders[0].name == "team"
+        assert revision.folder_id == added_folders[0].id
+
+
+# ---------------------------------------------------------------------------
+# mkdir revert (DELETE folder)
+# ---------------------------------------------------------------------------
+
+
+class TestMkdirRevert:
+    @pytest.mark.asyncio
+    async def test_deletes_empty_folder(self) -> None:
+        session = _FakeSession()
+        revision = _folder_revision(folder_id=42)
+        # Both the doc-existence check and the child-folder check return None.
+        session.execute.side_effect = [
+            _FakeResult(scalar=None),  # docs
+            _FakeResult(scalar=None),  # children
+            _FakeResult(scalar=None),  # delete (no return value)
+        ]
+        outcome = await revert_service._delete_created_folder(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+        assert outcome.status == "ok"
+        # 3 executes: docs check, children check, delete.
+        assert session.execute.await_count == 3
+
+    @pytest.mark.asyncio
+    async def test_reports_tool_unavailable_when_folder_has_children(self) -> None:
+        session = _FakeSession()
+        revision = _folder_revision(folder_id=42)
+        # First check (docs) returns "row found".
+        session.execute.return_value = _FakeResult(scalar=1)
+        outcome = await revert_service._delete_created_folder(
+            session,  # type: ignore[arg-type]
+            revision=revision,
+        )
+        assert outcome.status == "tool_unavailable"
+        assert "no longer empty" in outcome.message
diff --git a/surfsense_backend/tests/unit/services/test_revert_service.py b/surfsense_backend/tests/unit/services/test_revert_service.py
new file mode 100644
index 0000000000..a81e520415
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_revert_service.py
@@ -0,0 +1,46 @@
+"""Unit tests for the agent revert service."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from app.services.revert_service import can_revert
+
+
+class _FakeAction:
+    def __init__(self, *, user_id: Any, tool_name: str = "edit_file") -> None:
+        self.user_id = user_id
+        self.tool_name = tool_name
+
+
+class TestCanRevert:
+    def test_owner_can_revert_their_own_action(self) -> None:
+        action = _FakeAction(user_id="user-123")
+        assert can_revert(requester_user_id="user-123", action=action, is_admin=False)
+
+    def test_other_user_cannot_revert(self) -> None:
+        action = _FakeAction(user_id="user-123")
+        assert not can_revert(
+            requester_user_id="someone-else", action=action, is_admin=False
+        )
+
+    def test_admin_always_allowed(self) -> None:
+        action = _FakeAction(user_id="user-123")
+        assert can_revert(requester_user_id="anybody", action=action, is_admin=True)
+
+    def test_admin_can_revert_anonymous_action(self) -> None:
+        action = _FakeAction(user_id=None)
+        assert can_revert(requester_user_id="admin", action=action, is_admin=True)
+
+    def test_anonymous_action_blocks_non_admin(self) -> None:
+        action = _FakeAction(user_id=None)
+        assert not can_revert(requester_user_id="user-1", action=action, is_admin=False)
+
+    def test_uuid_string_normalization(self) -> None:
+        """``user_id`` may be a UUID object; comparison should still work."""
+        import uuid
+
+        u = uuid.uuid4()
+        action = _FakeAction(user_id=u)
+        # Same UUID, passed as string from the requesting side.
+        assert can_revert(requester_user_id=str(u), action=action, is_admin=False)
diff --git a/surfsense_backend/tests/unit/services/test_supports_image_input.py b/surfsense_backend/tests/unit/services/test_supports_image_input.py
new file mode 100644
index 0000000000..71fdee1c7f
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_supports_image_input.py
@@ -0,0 +1,281 @@
+"""Unit tests for the chat-catalog ``supports_image_input`` capability flag.
+
+Capability is sourced from two places, in order of preference:
+
+1. ``architecture.input_modalities`` for dynamic OpenRouter chat configs
+   (authoritative — OpenRouter publishes per-model modalities directly).
+2. LiteLLM's authoritative model map (``litellm.supports_vision``) for
+   YAML / BYOK configs that don't carry an explicit operator override.
+
+The catalog default is *True* (conservative-allow): an unknown / unmapped
+model is not pre-judged. The streaming-task safety net
+(``is_known_text_only_chat_model``) is the only place a False actually
+blocks a request — and it requires LiteLLM to *explicitly* mark the model
+as text-only.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.services.openrouter_integration_service import (
+    _OPENROUTER_DYNAMIC_MARKER,
+    _generate_configs,
+    _supports_image_input,
+)
+
+pytestmark = pytest.mark.unit
+
+
+_SETTINGS_BASE: dict = {
+    "api_key": "sk-or-test",
+    "id_offset": -10_000,
+    "rpm": 200,
+    "tpm": 1_000_000,
+    "free_rpm": 20,
+    "free_tpm": 100_000,
+    "anonymous_enabled_paid": False,
+    "anonymous_enabled_free": True,
+    "quota_reserve_tokens": 4000,
+}
+
+
+# ---------------------------------------------------------------------------
+# _supports_image_input helper (OpenRouter modalities)
+# ---------------------------------------------------------------------------
+
+
+def test_supports_image_input_true_for_multimodal():
+    assert (
+        _supports_image_input(
+            {
+                "id": "openai/gpt-4o",
+                "architecture": {
+                    "input_modalities": ["text", "image"],
+                    "output_modalities": ["text"],
+                },
+            }
+        )
+        is True
+    )
+
+
+def test_supports_image_input_false_for_text_only():
+    """The exact failure mode the safety net guards against — DeepSeek V3
+    is a text-in/text-out model and would 404 if forwarded image_url."""
+    assert (
+        _supports_image_input(
+            {
+                "id": "deepseek/deepseek-v3.2-exp",
+                "architecture": {
+                    "input_modalities": ["text"],
+                    "output_modalities": ["text"],
+                },
+            }
+        )
+        is False
+    )
+
+
+def test_supports_image_input_false_when_modalities_missing():
+    """Defensive: missing architecture is treated as text-only at the
+    OpenRouter helper level. The wider catalog resolver
+    (`derive_supports_image_input`) only consults modalities when they
+    are non-empty, otherwise it falls back to LiteLLM."""
+    assert _supports_image_input({"id": "weird/model"}) is False
+    assert _supports_image_input({"id": "weird/model", "architecture": {}}) is False
+    assert (
+        _supports_image_input(
+            {"id": "weird/model", "architecture": {"input_modalities": None}}
+        )
+        is False
+    )
+
+
+# ---------------------------------------------------------------------------
+# _generate_configs threads the flag onto every emitted chat config
+# ---------------------------------------------------------------------------
+
+
+def test_generate_configs_emits_supports_image_input():
+    raw = [
+        {
+            "id": "openai/gpt-4o",
+            "architecture": {
+                "input_modalities": ["text", "image"],
+                "output_modalities": ["text"],
+            },
+            "supported_parameters": ["tools"],
+            "context_length": 200_000,
+            "pricing": {"prompt": "0.000005", "completion": "0.000015"},
+        },
+        {
+            "id": "deepseek/deepseek-v3.2-exp",
+            "architecture": {
+                "input_modalities": ["text"],
+                "output_modalities": ["text"],
+            },
+            "supported_parameters": ["tools"],
+            "context_length": 200_000,
+            "pricing": {"prompt": "0.000003", "completion": "0.000015"},
+        },
+    ]
+    cfgs = _generate_configs(raw, dict(_SETTINGS_BASE))
+    by_model = {c["model_name"]: c for c in cfgs}
+
+    gpt = by_model["openai/gpt-4o"]
+    assert gpt["supports_image_input"] is True
+    assert gpt[_OPENROUTER_DYNAMIC_MARKER] is True
+
+    deepseek = by_model["deepseek/deepseek-v3.2-exp"]
+    assert deepseek["supports_image_input"] is False
+    assert deepseek[_OPENROUTER_DYNAMIC_MARKER] is True
+
+
+# ---------------------------------------------------------------------------
+# YAML loader: defer to derive_supports_image_input on unannotated entries
+# ---------------------------------------------------------------------------
+
+
+def test_yaml_loader_resolves_unannotated_vision_model_to_true(tmp_path, monkeypatch):
+    """The regression case: an Azure GPT-5.x YAML entry without a
+    ``supports_image_input`` override should resolve to True via LiteLLM's
+    model map (which says ``supports_vision: true``). Previously this
+    defaulted to False, blocking every image turn for vision-capable
+    YAML configs."""
+    yaml_dir = tmp_path / "app" / "config"
+    yaml_dir.mkdir(parents=True)
+    (yaml_dir / "global_llm_config.yaml").write_text(
+        """
+global_llm_configs:
+  - id: -2
+    name: Azure GPT-4o
+    provider: AZURE_OPENAI
+    model_name: gpt-4o
+    api_key: sk-test
+""",
+        encoding="utf-8",
+    )
+
+    from app import config as config_module
+
+    monkeypatch.setattr(config_module, "BASE_DIR", tmp_path)
+
+    configs = config_module.load_global_llm_configs()
+    assert len(configs) == 1
+    assert configs[0]["supports_image_input"] is True
+
+
+def test_yaml_loader_respects_explicit_supports_image_input(tmp_path, monkeypatch):
+    yaml_dir = tmp_path / "app" / "config"
+    yaml_dir.mkdir(parents=True)
+    (yaml_dir / "global_llm_config.yaml").write_text(
+        """
+global_llm_configs:
+  - id: -1
+    name: GPT-4o
+    provider: OPENAI
+    model_name: gpt-4o
+    api_key: sk-test
+    supports_image_input: false
+""",
+        encoding="utf-8",
+    )
+
+    from app import config as config_module
+
+    monkeypatch.setattr(config_module, "BASE_DIR", tmp_path)
+
+    configs = config_module.load_global_llm_configs()
+    assert len(configs) == 1
+    # Operator override always wins, even against LiteLLM's True.
+    assert configs[0]["supports_image_input"] is False
+
+
+def test_yaml_loader_unknown_model_default_allows(tmp_path, monkeypatch):
+    """Unknown / unmapped model in YAML: default-allow. The streaming
+    safety net (which requires an explicit-False from LiteLLM) is the
+    only place a real block happens, so we don't lock the user out of
+    a freshly added third-party entry the catalog can't introspect."""
+    yaml_dir = tmp_path / "app" / "config"
+    yaml_dir.mkdir(parents=True)
+    (yaml_dir / "global_llm_config.yaml").write_text(
+        """
+global_llm_configs:
+  - id: -1
+    name: Some Brand New Model
+    provider: CUSTOM
+    custom_provider: brand_new_proxy
+    model_name: brand-new-model-x9
+    api_key: sk-test
+""",
+        encoding="utf-8",
+    )
+
+    from app import config as config_module
+
+    monkeypatch.setattr(config_module, "BASE_DIR", tmp_path)
+
+    configs = config_module.load_global_llm_configs()
+    assert len(configs) == 1
+    assert configs[0]["supports_image_input"] is True
+
+
+# ---------------------------------------------------------------------------
+# AgentConfig threads the flag through both YAML and Auto / BYOK
+# ---------------------------------------------------------------------------
+
+
+def test_agent_config_from_yaml_explicit_overrides_resolver():
+    from app.agents.new_chat.llm_config import AgentConfig
+
+    cfg_text_only = AgentConfig.from_yaml_config(
+        {
+            "id": -1,
+            "name": "Text Only Override",
+            "provider": "openai",
+            "model_name": "gpt-4o",  # Capable per LiteLLM, but operator says no.
+            "api_key": "sk-test",
+            "supports_image_input": False,
+        }
+    )
+    cfg_explicit_vision = AgentConfig.from_yaml_config(
+        {
+            "id": -2,
+            "name": "GPT-4o",
+            "provider": "openai",
+            "model_name": "gpt-4o",
+            "api_key": "sk-test",
+            "supports_image_input": True,
+        }
+    )
+    assert cfg_text_only.supports_image_input is False
+    assert cfg_explicit_vision.supports_image_input is True
+
+
+def test_agent_config_from_yaml_unannotated_uses_resolver():
+    """Without an explicit YAML key, AgentConfig defers to the catalog
+    resolver — for ``gpt-4o`` LiteLLM's map says supports_vision=True."""
+    from app.agents.new_chat.llm_config import AgentConfig
+
+    cfg = AgentConfig.from_yaml_config(
+        {
+            "id": -1,
+            "name": "GPT-4o (no override)",
+            "provider": "openai",
+            "model_name": "gpt-4o",
+            "api_key": "sk-test",
+        }
+    )
+    assert cfg.supports_image_input is True
+
+
+def test_agent_config_auto_mode_supports_image_input():
+    """Auto routes across the pool. We optimistically allow image input
+    so users can keep their selection on Auto with a vision-capable
+    deployment somewhere in the pool. The router's own `allowed_fails`
+    handles non-vision deployments via fallback."""
+    from app.agents.new_chat.llm_config import AgentConfig
+
+    auto = AgentConfig.from_auto_mode()
+    assert auto.supports_image_input is True
diff --git a/surfsense_backend/tests/unit/services/test_token_quota_service_cost.py b/surfsense_backend/tests/unit/services/test_token_quota_service_cost.py
new file mode 100644
index 0000000000..63681828d7
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_token_quota_service_cost.py
@@ -0,0 +1,515 @@
+"""Cost-based premium quota unit tests.
+
+Covers the USD-micro behaviour added in migration 140:
+
+* ``TurnTokenAccumulator.total_cost_micros`` sums ``cost_micros`` across all
+  calls in a turn — used as the debit amount when ``agent_config.is_premium``
+  is true, regardless of which underlying model produced each call. This
+  preserves the prior "premium turn → all calls in turn count" rule from the
+  token-based system.
+* ``estimate_call_reserve_micros`` scales linearly with model pricing,
+  clamps to a sane floor when pricing is unknown, and respects the
+  ``QUOTA_MAX_RESERVE_MICROS`` ceiling so a misconfigured "$1000/M" entry
+  can't lock the whole balance on one call.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# TurnTokenAccumulator — premium-turn debit semantics
+# ---------------------------------------------------------------------------
+
+
+def test_total_cost_micros_sums_premium_and_free_calls():
+    """A premium turn that also called a free sub-agent debits the union.
+
+    The plan deliberately preserved the existing "premium turn → all calls
+    count" behaviour because per-call premium filtering relied on
+    ``LLMRouterService._premium_model_strings`` which only covers router-pool
+    deployments. ``total_cost_micros`` therefore must include free-model
+    calls (whose ``cost_micros`` is typically ``0``) as well as the premium
+    call's actual provider cost.
+    """
+    from app.services.token_tracking_service import TurnTokenAccumulator
+
+    acc = TurnTokenAccumulator()
+    # Premium model (e.g. claude-opus): non-zero cost.
+    acc.add(
+        model="anthropic/claude-3-5-sonnet",
+        prompt_tokens=1200,
+        completion_tokens=400,
+        total_tokens=1600,
+        cost_micros=12_345,
+    )
+    # Free sub-agent (e.g. title-gen on a free model): zero cost.
+    acc.add(
+        model="gpt-4o-mini",
+        prompt_tokens=120,
+        completion_tokens=20,
+        total_tokens=140,
+        cost_micros=0,
+    )
+    # A second premium-priced call within the same turn.
+    acc.add(
+        model="anthropic/claude-3-5-sonnet",
+        prompt_tokens=800,
+        completion_tokens=200,
+        total_tokens=1000,
+        cost_micros=7_500,
+    )
+
+    assert acc.total_cost_micros == 12_345 + 0 + 7_500
+    # Token totals stay correct so the FE display path still works.
+    assert acc.grand_total == 1600 + 140 + 1000
+
+
+def test_total_cost_micros_zero_when_no_calls():
+    """An empty accumulator must report zero cost (no division-by-zero, no None)."""
+    from app.services.token_tracking_service import TurnTokenAccumulator
+
+    acc = TurnTokenAccumulator()
+    assert acc.total_cost_micros == 0
+    assert acc.grand_total == 0
+
+
+def test_per_message_summary_groups_cost_by_model():
+    """``per_message_summary`` must accumulate ``cost_micros`` per model so the
+    SSE ``model_breakdown`` payload reports actual USD spend per provider.
+    """
+    from app.services.token_tracking_service import TurnTokenAccumulator
+
+    acc = TurnTokenAccumulator()
+    acc.add(
+        model="claude-3-5-sonnet",
+        prompt_tokens=100,
+        completion_tokens=50,
+        total_tokens=150,
+        cost_micros=4_000,
+    )
+    acc.add(
+        model="claude-3-5-sonnet",
+        prompt_tokens=200,
+        completion_tokens=100,
+        total_tokens=300,
+        cost_micros=8_000,
+    )
+    acc.add(
+        model="gpt-4o-mini",
+        prompt_tokens=50,
+        completion_tokens=10,
+        total_tokens=60,
+        cost_micros=200,
+    )
+
+    summary = acc.per_message_summary()
+    assert summary["claude-3-5-sonnet"]["cost_micros"] == 12_000
+    assert summary["claude-3-5-sonnet"]["total_tokens"] == 450
+    assert summary["gpt-4o-mini"]["cost_micros"] == 200
+
+
+def test_serialized_calls_includes_cost_micros():
+    """``serialized_calls`` is what flows into the SSE ``call_details``
+    payload; cost_micros must be present on each entry so the FE message-info
+    dropdown can render per-call USD.
+    """
+    from app.services.token_tracking_service import TurnTokenAccumulator
+
+    acc = TurnTokenAccumulator()
+    acc.add(
+        model="m",
+        prompt_tokens=1,
+        completion_tokens=1,
+        total_tokens=2,
+        cost_micros=42,
+    )
+    serialized = acc.serialized_calls()
+    assert serialized == [
+        {
+            "model": "m",
+            "prompt_tokens": 1,
+            "completion_tokens": 1,
+            "total_tokens": 2,
+            "cost_micros": 42,
+            "call_kind": "chat",
+        }
+    ]
+
+
+# ---------------------------------------------------------------------------
+# estimate_call_reserve_micros — sizing and clamping
+# ---------------------------------------------------------------------------
+
+
+def test_reserve_returns_floor_when_model_unknown(monkeypatch):
+    """If LiteLLM doesn't know the model, ``get_model_info`` raises and the
+    helper falls back to the 100-micro floor — small enough that a user with
+    $0.0001 left can still send a tiny request, but non-zero so we still gate
+    against an empty balance.
+    """
+    import litellm
+
+    from app.services import token_quota_service
+
+    def _raise(_name):
+        raise KeyError("unknown")
+
+    monkeypatch.setattr(litellm, "get_model_info", _raise, raising=False)
+
+    micros = token_quota_service.estimate_call_reserve_micros(
+        base_model="nonexistent-model",
+        quota_reserve_tokens=4000,
+    )
+    assert micros == token_quota_service._QUOTA_MIN_RESERVE_MICROS
+    assert micros == 100
+
+
+def test_reserve_returns_floor_when_pricing_is_zero(monkeypatch):
+    """LiteLLM may *return* a model with both cost-per-token fields at 0
+    (pricing not yet registered). The helper must not multiply 0 x tokens
+    and end up reserving 0 — it must clamp to the floor.
+    """
+    import litellm
+
+    from app.services import token_quota_service
+
+    monkeypatch.setattr(
+        litellm,
+        "get_model_info",
+        lambda _name: {"input_cost_per_token": 0, "output_cost_per_token": 0},
+        raising=False,
+    )
+
+    micros = token_quota_service.estimate_call_reserve_micros(
+        base_model="some-pending-model",
+        quota_reserve_tokens=4000,
+    )
+    assert micros == token_quota_service._QUOTA_MIN_RESERVE_MICROS
+
+
+def test_reserve_scales_with_model_cost(monkeypatch):
+    """Claude-Opus-priced model with 4000 reserve_tokens reserves
+    ~$0.36 = 360_000 micros. Critically this must NOT be clamped down to
+    some small artificial cap — that was the bug the plan called out.
+    """
+    import litellm
+
+    from app.config import config
+    from app.services import token_quota_service
+
+    monkeypatch.setattr(
+        litellm,
+        "get_model_info",
+        lambda _name: {
+            "input_cost_per_token": 15e-6,
+            "output_cost_per_token": 75e-6,
+        },
+        raising=False,
+    )
+    monkeypatch.setattr(config, "QUOTA_MAX_RESERVE_MICROS", 1_000_000, raising=False)
+
+    micros = token_quota_service.estimate_call_reserve_micros(
+        base_model="claude-3-opus",
+        quota_reserve_tokens=4000,
+    )
+    # 4000 * (15e-6 + 75e-6) = 4000 * 90e-6 = 0.36 USD = 360_000 micros.
+    assert micros == 360_000
+
+
+def test_reserve_clamps_to_max_ceiling(monkeypatch):
+    """A misconfigured "$1000 / M" model with 4000 reserve_tokens would
+    nominally compute to $4 = 4_000_000 micros. The ceiling
+    ``QUOTA_MAX_RESERVE_MICROS`` must clamp that so a bad pricing entry
+    can't lock the user's whole balance on one call.
+    """
+    import litellm
+
+    from app.config import config
+    from app.services import token_quota_service
+
+    monkeypatch.setattr(
+        litellm,
+        "get_model_info",
+        lambda _name: {
+            "input_cost_per_token": 1e-3,
+            "output_cost_per_token": 0,
+        },
+        raising=False,
+    )
+    monkeypatch.setattr(config, "QUOTA_MAX_RESERVE_MICROS", 1_000_000, raising=False)
+
+    micros = token_quota_service.estimate_call_reserve_micros(
+        base_model="oops-misconfigured",
+        quota_reserve_tokens=4000,
+    )
+    assert micros == 1_000_000
+
+
+def test_reserve_uses_default_when_quota_reserve_tokens_missing(monkeypatch):
+    """Per-config ``quota_reserve_tokens`` is optional; when ``None`` or
+    zero, the helper must fall back to the global ``QUOTA_MAX_RESERVE_PER_CALL``
+    so anonymous-style configs still reserve the operator-tunable default.
+    """
+    import litellm
+
+    from app.config import config
+    from app.services import token_quota_service
+
+    monkeypatch.setattr(
+        litellm,
+        "get_model_info",
+        lambda _name: {
+            "input_cost_per_token": 1e-6,
+            "output_cost_per_token": 1e-6,
+        },
+        raising=False,
+    )
+    monkeypatch.setattr(config, "QUOTA_MAX_RESERVE_PER_CALL", 2000, raising=False)
+    monkeypatch.setattr(config, "QUOTA_MAX_RESERVE_MICROS", 1_000_000, raising=False)
+
+    # 2000 * (1e-6 + 1e-6) = 4e-3 USD = 4000 micros
+    assert (
+        token_quota_service.estimate_call_reserve_micros(
+            base_model="cheap", quota_reserve_tokens=None
+        )
+        == 4000
+    )
+    assert (
+        token_quota_service.estimate_call_reserve_micros(
+            base_model="cheap", quota_reserve_tokens=0
+        )
+        == 4000
+    )
+
+
+# ---------------------------------------------------------------------------
+# TokenTrackingCallback — image vs chat usage shape
+# ---------------------------------------------------------------------------
+
+
+class _FakeImageUsage:
+    """Mimics LiteLLM's ``ImageUsage`` (input_tokens / output_tokens shape)."""
+
+    def __init__(
+        self,
+        input_tokens: int = 0,
+        output_tokens: int = 0,
+        total_tokens: int | None = None,
+    ) -> None:
+        self.input_tokens = input_tokens
+        self.output_tokens = output_tokens
+        if total_tokens is not None:
+            self.total_tokens = total_tokens
+
+
+class _FakeImageResponse:
+    """Mimics LiteLLM's ``ImageResponse`` — same name so the callback's
+    ``type(...).__name__`` probe routes to the image branch.
+    """
+
+    def __init__(self, usage: _FakeImageUsage, response_cost: float | None = None):
+        self.usage = usage
+        if response_cost is not None:
+            self._hidden_params = {"response_cost": response_cost}
+
+
+# Re-tag the helper class as ``ImageResponse`` for the type-name probe in
+# the callback. We can't simply name the class ``ImageResponse`` because
+# the test runner sometimes imports test modules in surprising ways and
+# we want to be explicit.
+_FakeImageResponse.__name__ = "ImageResponse"
+
+
+class _FakeChatUsage:
+    def __init__(self, prompt: int, completion: int):
+        self.prompt_tokens = prompt
+        self.completion_tokens = completion
+        self.total_tokens = prompt + completion
+
+
+class _FakeChatResponse:
+    def __init__(self, usage: _FakeChatUsage):
+        self.usage = usage
+
+
+@pytest.mark.asyncio
+async def test_callback_reads_image_usage_input_output_tokens():
+    """``TokenTrackingCallback`` must read ``input_tokens``/``output_tokens``
+    for ``ImageResponse`` (LiteLLM's ImageUsage shape), NOT
+    prompt_tokens/completion_tokens which is the chat shape.
+    """
+    from app.services.token_tracking_service import (
+        TokenTrackingCallback,
+        scoped_turn,
+    )
+
+    cb = TokenTrackingCallback()
+    response = _FakeImageResponse(
+        usage=_FakeImageUsage(input_tokens=42, output_tokens=8, total_tokens=50),
+        response_cost=0.04,  # $0.04 per image
+    )
+
+    async with scoped_turn() as acc:
+        await cb.async_log_success_event(
+            kwargs={"model": "openai/gpt-image-1", "response_cost": 0.04},
+            response_obj=response,
+            start_time=None,
+            end_time=None,
+        )
+        assert len(acc.calls) == 1
+        call = acc.calls[0]
+        assert call.prompt_tokens == 42
+        assert call.completion_tokens == 8
+        assert call.total_tokens == 50
+        # 0.04 USD = 40_000 micros
+        assert call.cost_micros == 40_000
+        assert call.call_kind == "image_generation"
+
+
+@pytest.mark.asyncio
+async def test_callback_chat_path_unchanged():
+    """Chat responses must still read prompt_tokens/completion_tokens."""
+    from app.services.token_tracking_service import (
+        TokenTrackingCallback,
+        scoped_turn,
+    )
+
+    cb = TokenTrackingCallback()
+    response = _FakeChatResponse(_FakeChatUsage(prompt=120, completion=30))
+
+    async with scoped_turn() as acc:
+        await cb.async_log_success_event(
+            kwargs={
+                "model": "openrouter/anthropic/claude-3-5-sonnet",
+                "response_cost": 0.0036,
+            },
+            response_obj=response,
+            start_time=None,
+            end_time=None,
+        )
+        assert len(acc.calls) == 1
+        call = acc.calls[0]
+        assert call.prompt_tokens == 120
+        assert call.completion_tokens == 30
+        assert call.total_tokens == 150
+        assert call.cost_micros == 3_600
+        assert call.call_kind == "chat"
+
+
+@pytest.mark.asyncio
+async def test_callback_image_missing_response_cost_falls_back_to_zero(monkeypatch):
+    """When OpenRouter omits ``usage.cost`` LiteLLM's
+    ``default_image_cost_calculator`` raises. The defensive image branch in
+    ``_extract_cost_usd`` must NOT call ``cost_per_token`` (which is
+    chat-shaped and would raise too) — it returns 0 with a WARNING log.
+    """
+    import litellm
+
+    from app.services.token_tracking_service import (
+        TokenTrackingCallback,
+        scoped_turn,
+    )
+
+    # Force completion_cost to raise the same way OpenRouter image-gen fails.
+    def _boom(*_args, **_kwargs):
+        raise ValueError("model_cost: missing entry for openrouter image model")
+
+    monkeypatch.setattr(litellm, "completion_cost", _boom, raising=False)
+
+    # And make sure cost_per_token is NEVER called for the image path —
+    # if it were, our ``is_image=True`` branch is broken.
+    cost_per_token_calls: list = []
+
+    def _record_cost_per_token(**kwargs):
+        cost_per_token_calls.append(kwargs)
+        return (0.0, 0.0)
+
+    monkeypatch.setattr(
+        litellm, "cost_per_token", _record_cost_per_token, raising=False
+    )
+
+    cb = TokenTrackingCallback()
+    response = _FakeImageResponse(
+        usage=_FakeImageUsage(input_tokens=7, output_tokens=0)
+    )
+
+    async with scoped_turn() as acc:
+        await cb.async_log_success_event(
+            kwargs={"model": "openrouter/google/gemini-2.5-flash-image"},
+            response_obj=response,
+            start_time=None,
+            end_time=None,
+        )
+
+    assert len(acc.calls) == 1
+    assert acc.calls[0].cost_micros == 0
+    assert acc.calls[0].call_kind == "image_generation"
+    # The image branch must short-circuit before cost_per_token.
+    assert cost_per_token_calls == []
+
+
+# ---------------------------------------------------------------------------
+# scoped_turn — ContextVar reset semantics (issue B)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_scoped_turn_restores_outer_accumulator():
+    """``scoped_turn`` must restore the previous ContextVar value on exit
+    so a per-call wrapper inside an outer chat turn doesn't leak its
+    accumulator outward (which would cause double-debit at chat-turn exit).
+    """
+    from app.services.token_tracking_service import (
+        get_current_accumulator,
+        scoped_turn,
+        start_turn,
+    )
+
+    outer = start_turn()
+    assert get_current_accumulator() is outer
+
+    async with scoped_turn() as inner:
+        assert get_current_accumulator() is inner
+        assert inner is not outer
+        inner.add(
+            model="x",
+            prompt_tokens=1,
+            completion_tokens=1,
+            total_tokens=2,
+            cost_micros=5,
+        )
+
+    # After exit the outer accumulator is restored unchanged.
+    assert get_current_accumulator() is outer
+    assert outer.total_cost_micros == 0
+    assert len(outer.calls) == 0
+    # The inner accumulator captured the call but didn't bleed into outer.
+    assert inner.total_cost_micros == 5
+
+
+@pytest.mark.asyncio
+async def test_scoped_turn_resets_to_none_when_no_outer():
+    """Running ``scoped_turn`` outside any chat turn (e.g. a background
+    indexing job) must leave the ContextVar at ``None`` on exit so the
+    next *unrelated* request starts clean.
+    """
+    from app.services.token_tracking_service import (
+        _turn_accumulator,
+        get_current_accumulator,
+        scoped_turn,
+    )
+
+    # ContextVar default is None for a fresh test isolated context. We
+    # simulate "no outer" explicitly to be robust against test order.
+    token = _turn_accumulator.set(None)
+    try:
+        assert get_current_accumulator() is None
+        async with scoped_turn() as acc:
+            assert get_current_accumulator() is acc
+        assert get_current_accumulator() is None
+    finally:
+        _turn_accumulator.reset(token)
diff --git a/surfsense_backend/tests/unit/services/test_vision_llm_api_base_defense.py b/surfsense_backend/tests/unit/services/test_vision_llm_api_base_defense.py
new file mode 100644
index 0000000000..b8ba9d80c2
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_vision_llm_api_base_defense.py
@@ -0,0 +1,89 @@
+"""Defense-in-depth: vision-LLM resolution must not leak ``api_base``
+defaults from ``litellm.api_base`` either.
+
+Vision shares the same shape as image-gen — global YAML / OpenRouter
+dynamic configs ship ``api_base=""`` and the pre-fix ``get_vision_llm``
+call sites would silently drop the empty string and inherit
+``AZURE_OPENAI_ENDPOINT``. ``ChatLiteLLM(...)`` doesn't 404 on
+construction so we test the kwargs we hand to it instead.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.mark.asyncio
+async def test_get_vision_llm_global_openrouter_sets_api_base():
+    """Global negative-ID branch: an OpenRouter vision config with
+    ``api_base=""`` must end up calling ``SanitizedChatLiteLLM`` with
+    ``api_base="https://openrouter.ai/api/v1"`` — never an empty string,
+    never silently absent."""
+    from app.services import llm_service
+
+    cfg = {
+        "id": -30_001,
+        "name": "GPT-4o Vision (OpenRouter)",
+        "provider": "OPENROUTER",
+        "model_name": "openai/gpt-4o",
+        "api_key": "sk-or-test",
+        "api_base": "",
+        "api_version": None,
+        "litellm_params": {},
+        "billing_tier": "free",
+    }
+
+    search_space = MagicMock()
+    search_space.id = 1
+    search_space.user_id = "user-x"
+    search_space.vision_llm_config_id = cfg["id"]
+
+    session = AsyncMock()
+    scalars = MagicMock()
+    scalars.first.return_value = search_space
+    result = MagicMock()
+    result.scalars.return_value = scalars
+    session.execute.return_value = result
+
+    captured: dict = {}
+
+    class FakeSanitized:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+    with (
+        patch(
+            "app.services.vision_llm_router_service.get_global_vision_llm_config",
+            return_value=cfg,
+        ),
+        patch(
+            "app.agents.new_chat.llm_config.SanitizedChatLiteLLM",
+            new=FakeSanitized,
+        ),
+    ):
+        await llm_service.get_vision_llm(session=session, search_space_id=1)
+
+    assert captured.get("api_base") == "https://openrouter.ai/api/v1"
+    assert captured["model"] == "openrouter/openai/gpt-4o"
+
+
+def test_vision_router_deployment_sets_api_base_when_config_empty():
+    """Auto-mode vision router: deployments are fed to ``litellm.Router``,
+    so the resolver has to apply at deployment construction time too."""
+    from app.services.vision_llm_router_service import VisionLLMRouterService
+
+    deployment = VisionLLMRouterService._config_to_deployment(
+        {
+            "model_name": "openai/gpt-4o",
+            "provider": "OPENROUTER",
+            "api_key": "sk-or-test",
+            "api_base": "",
+        }
+    )
+    assert deployment is not None
+    assert deployment["litellm_params"]["api_base"] == "https://openrouter.ai/api/v1"
+    assert deployment["litellm_params"]["model"] == "openrouter/openai/gpt-4o"
diff --git a/surfsense_backend/tests/unit/tasks/__init__.py b/surfsense_backend/tests/unit/tasks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/tasks/chat/__init__.py b/surfsense_backend/tests/unit/tasks/chat/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/tasks/chat/streaming/__init__.py b/surfsense_backend/tests/unit/tasks/chat/streaming/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/surfsense_backend/tests/unit/tasks/chat/streaming/test_interrupt_inspector_all.py b/surfsense_backend/tests/unit/tasks/chat/streaming/test_interrupt_inspector_all.py
new file mode 100644
index 0000000000..15ab89b733
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/streaming/test_interrupt_inspector_all.py
@@ -0,0 +1,209 @@
+"""Real-graph contract: ``all_interrupt_values`` surfaces every pending interrupt.
+
+The chat-stream emit loop must yield one ``data-interrupt-request`` SSE frame
+per paused subagent, in the same order ``state.interrupts`` reports them —
+that's also the order the resume slicer consumes decisions. These tests pin
+that contract against a **real** paused parent graph built via
+:class:`~langgraph.types.Send` fan-out (no synthetic state mocks).
+"""
+
+from __future__ import annotations
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.types import Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+from app.tasks.chat.streaming.helpers.interrupt_inspector import (
+    all_interrupt_values,
+)
+
+
+class _SubState(TypedDict, total=False):
+    messages: list
+
+
+class _DispatchState(TypedDict, total=False):
+    messages: list
+    tcid: str
+    desc: str
+
+
+def _build_pausing_subagent(checkpointer: InMemorySaver):
+    def approve_node(_state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {"name": "do_thing", "args": {"x": 1}, "description": ""}
+                ],
+                "review_configs": [{}],
+            }
+        )
+        return {"messages": [AIMessage(content=f"got:{decision}")]}
+
+    g = StateGraph(_SubState)
+    g.add_node("approve", approve_node)
+    g.add_edge(START, "approve")
+    g.add_edge("approve", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_graph_dispatching_two_tasks_via_send(
+    task_tool, *, tool_call_id_a: str, tool_call_id_b: str, checkpointer
+):
+    def fanout_edge(_state) -> list[Send]:
+        return [
+            Send("call_task", {"tcid": tool_call_id_a, "desc": "approve A"}),
+            Send("call_task", {"tcid": tool_call_id_b, "desc": "approve B"}),
+        ]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type="approver", runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout_edge, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+@pytest.mark.asyncio
+async def test_returns_every_pending_interrupt_for_two_paused_subagents():
+    """Two parallel subagents -> ``all_interrupt_values`` returns 2 dicts."""
+    checkpointer = InMemorySaver()
+    subagent = _build_pausing_subagent(checkpointer)
+    task_tool = build_task_tool_with_parent_config(
+        [{"name": "approver", "description": "approves", "runnable": subagent}]
+    )
+    parent = _parent_graph_dispatching_two_tasks_via_send(
+        task_tool,
+        tool_call_id_a="parent-tcid-A",
+        tool_call_id_b="parent-tcid-B",
+        checkpointer=checkpointer,
+    )
+
+    parent_config = {
+        "configurable": {"thread_id": "all-iv-thread"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
+    state = await parent.aget_state(parent_config)
+
+    values = all_interrupt_values(state)
+
+    assert isinstance(values, list)
+    assert len(values) == 2, (
+        f"REGRESSION: expected one value per pending subagent, got "
+        f"{len(values)}: {values!r}"
+    )
+    stamps = [v.get("tool_call_id") for v in values]
+    assert sorted(stamps) == ["parent-tcid-A", "parent-tcid-B"]
+    for v in values:
+        assert isinstance(v.get("action_requests"), list)
+        assert len(v["action_requests"]) == 1
+
+
+@pytest.mark.asyncio
+async def test_preserves_state_interrupts_traversal_order():
+    """Order returned by inspector must match ``state.interrupts`` order.
+
+    The resume slicer consumes decisions left-to-right against
+    ``collect_pending_tool_calls(state)`` which walks ``state.interrupts``
+    in iteration order — so the inspector (which drives the *emit* order)
+    must agree with that traversal or the slice and the wire fall out of sync.
+    """
+    checkpointer = InMemorySaver()
+    subagent = _build_pausing_subagent(checkpointer)
+    task_tool = build_task_tool_with_parent_config(
+        [{"name": "approver", "description": "approves", "runnable": subagent}]
+    )
+    parent = _parent_graph_dispatching_two_tasks_via_send(
+        task_tool,
+        tool_call_id_a="parent-tcid-A",
+        tool_call_id_b="parent-tcid-B",
+        checkpointer=checkpointer,
+    )
+    parent_config = {
+        "configurable": {"thread_id": "order-thread"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
+    state = await parent.aget_state(parent_config)
+
+    inspector_order = [v["tool_call_id"] for v in all_interrupt_values(state)]
+    state_order = [
+        i.value["tool_call_id"]
+        for i in state.interrupts
+        if isinstance(getattr(i, "value", None), dict) and "tool_call_id" in i.value
+    ]
+
+    assert inspector_order == state_order, (
+        f"inspector order {inspector_order!r} diverged from state.interrupts "
+        f"order {state_order!r}; the resume slicer would mis-route decisions."
+    )
+
+
+@pytest.mark.asyncio
+async def test_returns_empty_list_when_nothing_paused():
+    """A graph that completes normally produces no interrupts to surface."""
+
+    def done_node(_state):
+        return {"messages": [AIMessage(content="done")]}
+
+    g = StateGraph(_SubState)
+    g.add_node("done", done_node)
+    g.add_edge(START, "done")
+    g.add_edge("done", END)
+    graph = g.compile(checkpointer=InMemorySaver())
+    config = {"configurable": {"thread_id": "no-pause-thread"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+    state = await graph.aget_state(config)
+
+    assert all_interrupt_values(state) == []
+
+
+@pytest.mark.asyncio
+async def test_single_paused_subagent_returns_a_list_of_one():
+    """Single-pause case must still return a list (not unwrap to a dict)."""
+
+    def approve_node(_state):
+        decision = interrupt(
+            {
+                "action_requests": [{"name": "x", "args": {}, "description": ""}],
+                "review_configs": [{}],
+                "tool_call_id": "lonely-tcid",
+            }
+        )
+        return {"messages": [AIMessage(content=f"got:{decision}")]}
+
+    g = StateGraph(_SubState)
+    g.add_node("approve", approve_node)
+    g.add_edge(START, "approve")
+    g.add_edge("approve", END)
+    graph = g.compile(checkpointer=InMemorySaver())
+    config = {"configurable": {"thread_id": "single-thread"}}
+    await graph.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+    state = await graph.aget_state(config)
+
+    values = all_interrupt_values(state)
+
+    assert isinstance(values, list)
+    assert len(values) == 1
+    assert values[0].get("tool_call_id") == "lonely-tcid"
diff --git a/surfsense_backend/tests/unit/tasks/chat/streaming/test_stage_1_parity.py b/surfsense_backend/tests/unit/tasks/chat/streaming/test_stage_1_parity.py
new file mode 100644
index 0000000000..8fde773e33
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/streaming/test_stage_1_parity.py
@@ -0,0 +1,240 @@
+"""Pin Stage 1 extractions as faithful copies of the old helpers.
+
+Extractions under ``app.tasks.chat.streaming`` are compared to
+``app.tasks.chat.stream_new_chat`` helpers.
+For each Stage 1 extraction we assert the new function returns the same
+output as the old one for a representative input set. The moment the
+two diverge - intentionally or otherwise - this file fails loudly so
+the divergence is reviewed rather than shipped silently.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from app.agents.new_chat.errors import BusyError
+from app.agents.new_chat.middleware.busy_mutex import request_cancel, reset_cancel
+from app.tasks.chat.stream_new_chat import (
+    _classify_stream_exception as old_classify,
+    _emit_stream_terminal_error as old_emit_terminal_error,
+    _extract_chunk_parts as old_extract_chunk_parts,
+    _extract_resolved_file_path as old_extract_resolved_file_path,
+    _tool_output_has_error as old_tool_output_has_error,
+    _tool_output_to_text as old_tool_output_to_text,
+)
+from app.tasks.chat.streaming.errors.classifier import (
+    classify_stream_exception as new_classify,
+)
+from app.tasks.chat.streaming.errors.emitter import (
+    emit_stream_terminal_error as new_emit_terminal_error,
+)
+from app.tasks.chat.streaming.helpers.chunk_parts import (
+    extract_chunk_parts as new_extract_chunk_parts,
+)
+from app.tasks.chat.streaming.helpers.tool_output import (
+    extract_resolved_file_path as new_extract_resolved_file_path,
+    tool_output_has_error as new_tool_output_has_error,
+    tool_output_to_text as new_tool_output_to_text,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------- chunk parts
+
+
+@dataclass
+class _Chunk:
+    content: Any = ""
+    additional_kwargs: dict[str, Any] = field(default_factory=dict)
+    tool_call_chunks: list[dict[str, Any]] = field(default_factory=list)
+
+
+_CHUNK_CASES: list[Any] = [
+    None,
+    _Chunk(content=""),
+    _Chunk(content="hello"),
+    _Chunk(content=42),  # invalid type, defensively coerced to empty
+    _Chunk(
+        content=[
+            {"type": "text", "text": "Hello "},
+            {"type": "text", "text": "world"},
+        ]
+    ),
+    _Chunk(
+        content=[
+            {"type": "reasoning", "reasoning": "hmm "},
+            {"type": "reasoning", "text": "still"},
+            {"type": "text", "text": "answer"},
+        ]
+    ),
+    _Chunk(
+        content=[
+            {"type": "tool_call_chunk", "id": "c1", "name": "x", "args": "{"},
+            {"type": "tool_use", "id": "c2", "name": "y"},
+            {"type": "image_url", "url": "ignored"},
+        ]
+    ),
+    _Chunk(
+        content="visible",
+        additional_kwargs={"reasoning_content": "private"},
+    ),
+    _Chunk(
+        tool_call_chunks=[
+            {"id": None, "name": None, "args": '{"a":1}', "index": 0},
+            {"id": "c", "name": "n", "args": "}", "index": 0},
+        ]
+    ),
+    _Chunk(
+        content=[{"type": "tool_call_chunk", "id": "from-block", "name": "x"}],
+        tool_call_chunks=[{"id": "from-attr", "name": "y"}],
+    ),
+]
+
+
+@pytest.mark.parametrize("chunk", _CHUNK_CASES)
+def test_extract_chunk_parts_matches_old_implementation(chunk: Any) -> None:
+    assert new_extract_chunk_parts(chunk) == old_extract_chunk_parts(chunk)
+
+
+# ----------------------------------------------------------- error classifier
+
+
+def _classify_cases() -> list[Exception]:
+    """Inputs that the FE depends on being mapped to specific error codes."""
+    return [
+        Exception("totally generic error"),
+        Exception('{"error":{"type":"rate_limit_error","message":"slow down"}}'),
+        Exception(
+            'OpenrouterException - {"error":{"message":"Provider returned error",'
+            '"code":429}}'
+        ),
+        BusyError(request_id="thread-busy-parity"),
+        Exception("Thread is busy with another request"),
+    ]
+
+
+@pytest.mark.parametrize("exc", _classify_cases())
+def test_classify_stream_exception_matches_old_implementation(
+    exc: Exception,
+) -> None:
+    new = new_classify(exc, flow_label="parity-test")
+    old = old_classify(exc, flow_label="parity-test")
+    # Strip the wall-clock retry timestamp before comparing — both
+    # implementations call ``time.time()`` independently and the call
+    # order is enough to differ by 1 ms in practice. Every other field
+    # in the tuple must match exactly.
+    new_extra = dict(new[5]) if isinstance(new[5], dict) else new[5]
+    old_extra = dict(old[5]) if isinstance(old[5], dict) else old[5]
+    if isinstance(new_extra, dict) and isinstance(old_extra, dict):
+        new_extra.pop("retry_after_at", None)
+        old_extra.pop("retry_after_at", None)
+    assert new[:5] == old[:5]
+    assert new_extra == old_extra
+
+
+def test_classify_turn_cancelling_branch_parity() -> None:
+    """The TURN_CANCELLING branch reads cancel state for the busy thread id;
+    both implementations must agree on retry-window semantics, not just the
+    plain THREAD_BUSY code."""
+    thread_id = "parity-cancelling-thread"
+    reset_cancel(thread_id)
+    request_cancel(thread_id)
+    exc = BusyError(request_id=thread_id)
+    new = new_classify(exc, flow_label="parity-test")
+    old = old_classify(exc, flow_label="parity-test")
+    assert new[0] == old[0] == "thread_busy"
+    assert new[1] == old[1] == "TURN_CANCELLING"
+    assert isinstance(new[5], dict) and isinstance(old[5], dict)
+    assert new[5]["retry_after_ms"] == old[5]["retry_after_ms"]
+
+
+# ------------------------------------------------------------ terminal emitter
+
+
+class _FakeStreamingService:
+    """Duck-types ``format_error`` for both old and new emitters."""
+
+    def __init__(self) -> None:
+        self.calls: list[dict[str, Any]] = []
+
+    def format_error(
+        self, message: str, *, error_code: str, extra: dict[str, Any] | None = None
+    ) -> str:
+        self.calls.append(
+            {"message": message, "error_code": error_code, "extra": extra}
+        )
+        return f'data: {{"type":"error","errorText":"{message}"}}\n\n'
+
+
+def test_emit_stream_terminal_error_matches_old_output_and_logs(caplog) -> None:
+    """The new emitter must produce the same SSE frame and log the same
+    structured payload as the old one for the same arguments."""
+    args: dict[str, Any] = {
+        "flow": "new",
+        "request_id": "req-parity",
+        "thread_id": 7,
+        "search_space_id": 9,
+        "user_id": "user-parity",
+        "message": "boom",
+        "error_kind": "server_error",
+        "error_code": "SERVER_ERROR",
+        "severity": "error",
+        "is_expected": False,
+        "extra": {"foo": "bar"},
+    }
+
+    new_svc = _FakeStreamingService()
+    old_svc = _FakeStreamingService()
+
+    with caplog.at_level(logging.ERROR):
+        new_frame = new_emit_terminal_error(streaming_service=new_svc, **args)
+        old_frame = old_emit_terminal_error(streaming_service=old_svc, **args)
+
+    assert new_frame == old_frame
+    assert new_svc.calls == old_svc.calls
+    chat_error_records = [
+        r for r in caplog.records if "[chat_stream_error]" in r.message
+    ]
+    # One log line per emit call (two emits -> two records).
+    assert len(chat_error_records) == 2
+
+
+# ---------------------------------------------------------------- tool output
+
+
+def test_tool_output_helpers_match_old_implementation() -> None:
+    samples: list[Any] = [
+        {"result": "ok"},
+        {"error": "bad"},
+        {"result": "Error: x"},
+        "Error: plain",
+        "fine",
+        {"nested": {"a": 1}},
+    ]
+    for s in samples:
+        assert new_tool_output_to_text(s) == old_tool_output_to_text(s)
+        assert new_tool_output_has_error(s) == old_tool_output_has_error(s)
+
+    assert new_extract_resolved_file_path(
+        tool_name="write_file",
+        tool_output={"path": " /tmp/x "},
+        tool_input=None,
+    ) == old_extract_resolved_file_path(
+        tool_name="write_file",
+        tool_output={"path": " /tmp/x "},
+        tool_input=None,
+    )
+    assert new_extract_resolved_file_path(
+        tool_name="write_file",
+        tool_output={},
+        tool_input={"file_path": " /fallback "},
+    ) == old_extract_resolved_file_path(
+        tool_name="write_file",
+        tool_output={},
+        tool_input={"file_path": " /fallback "},
+    )
diff --git a/surfsense_backend/tests/unit/tasks/chat/streaming/test_stage_2_parity.py b/surfsense_backend/tests/unit/tasks/chat/streaming/test_stage_2_parity.py
new file mode 100644
index 0000000000..3ee1ab622e
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/streaming/test_stage_2_parity.py
@@ -0,0 +1,241 @@
+"""Parity tests for Stage 2 extractions (tool matching, thinking step, custom events)."""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+
+from app.tasks.chat.stream_new_chat import _legacy_match_lc_id as old_legacy_match
+from app.tasks.chat.streaming.handlers.custom_events import (
+    handle_action_log,
+    handle_action_log_updated,
+    handle_document_created,
+    handle_report_progress,
+)
+from app.tasks.chat.streaming.helpers.tool_call_matching import (
+    match_buffered_langchain_tool_call_id as new_legacy_match,
+)
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+from app.tasks.chat.streaming.relay.thinking_step_completion import (
+    complete_active_thinking_step,
+)
+from app.tasks.chat.streaming.relay.thinking_step_sse import emit_thinking_step_frame
+
+pytestmark = pytest.mark.unit
+
+
+def _copy_chunk_buffer(raw: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    return [dict(x) for x in raw]
+
+
+def test_legacy_tool_call_match_matches_old_implementation() -> None:
+    cases: list[tuple[list[dict[str, Any]], str, str, dict[str, str]]] = [
+        (
+            [
+                {"name": "write_file", "id": "lc-a"},
+                {"name": "other", "id": "lc-b"},
+            ],
+            "write_file",
+            "run-1",
+            {},
+        ),
+        (
+            [{"name": "x", "id": None}, {"name": "y", "id": "lc-fallback"}],
+            "write_file",
+            "run-2",
+            {},
+        ),
+        ([{"name": "no_id"}], "write_file", "run-3", {}),
+    ]
+    for chunks_template, tool_name, run_id, lc_map_seed in cases:
+        old_chunks = _copy_chunk_buffer(chunks_template)
+        new_chunks = _copy_chunk_buffer(chunks_template)
+        old_map = dict(lc_map_seed)
+        new_map = dict(lc_map_seed)
+        old_out = old_legacy_match(old_chunks, tool_name, run_id, old_map)
+        new_out = new_legacy_match(new_chunks, tool_name, run_id, new_map)
+        assert new_out == old_out
+        assert new_chunks == old_chunks
+        assert new_map == old_map
+
+
+def test_emit_thinking_step_frame_invokes_builder_before_service() -> None:
+    order: list[str] = []
+    builder = MagicMock()
+
+    def on_ts(*args: Any, **kwargs: Any) -> None:
+        order.append("builder")
+
+    builder.on_thinking_step.side_effect = on_ts
+
+    svc = MagicMock()
+
+    def fmt(**kwargs: Any) -> str:
+        order.append("service")
+        return "frame"
+
+    svc.format_thinking_step.side_effect = fmt
+
+    out = emit_thinking_step_frame(
+        streaming_service=svc,
+        content_builder=builder,
+        step_id="thinking-1",
+        title="Working",
+        status="in_progress",
+        items=["a"],
+    )
+    assert out == "frame"
+    assert order == ["builder", "service"]
+    builder.on_thinking_step.assert_called_once()
+    svc.format_thinking_step.assert_called_once()
+
+
+def test_emit_thinking_step_frame_skips_builder_when_none() -> None:
+    svc = MagicMock(return_value="x")
+    svc.format_thinking_step.return_value = "frame"
+    assert (
+        emit_thinking_step_frame(
+            streaming_service=svc,
+            content_builder=None,
+            step_id="s",
+            title="t",
+        )
+        == "frame"
+    )
+    svc.format_thinking_step.assert_called_once()
+
+
+def test_complete_active_thinking_step_mirrors_closure_semantics() -> None:
+    svc = MagicMock()
+    svc.format_thinking_step.return_value = "done-frame"
+    completed: set[str] = set()
+    relay_state = AgentEventRelayState.for_invocation()
+
+    frame, new_id = complete_active_thinking_step(
+        state=relay_state,
+        streaming_service=svc,
+        content_builder=None,
+        last_active_step_id="thinking-1",
+        last_active_step_title="T",
+        last_active_step_items=["x"],
+        completed_step_ids=completed,
+    )
+    assert frame == "done-frame"
+    assert new_id is None
+    assert "thinking-1" in completed
+
+    frame2, id2 = complete_active_thinking_step(
+        state=relay_state,
+        streaming_service=svc,
+        content_builder=None,
+        last_active_step_id="thinking-1",
+        last_active_step_title="T",
+        last_active_step_items=[],
+        completed_step_ids=completed,
+    )
+    assert frame2 is None
+    assert id2 == "thinking-1"
+
+
+def test_agent_event_relay_state_factory_matches_counter_rule() -> None:
+    s0 = AgentEventRelayState.for_invocation()
+    assert s0.thinking_step_counter == 0
+    assert s0.last_active_step_id is None
+
+    s1 = AgentEventRelayState.for_invocation(
+        initial_step_id="thinking-resume-1",
+        initial_step_title="Inherited",
+        initial_step_items=["Topic: X"],
+    )
+    assert s1.thinking_step_counter == 1
+    assert s1.last_active_step_id == "thinking-resume-1"
+    assert s1.next_thinking_step_id("thinking") == "thinking-2"
+
+
+@pytest.mark.parametrize(
+    ("phase", "message", "start_items", "expected_tail"),
+    [
+        (
+            "revising_section",
+            "progress line",
+            ["Topic: Foo", "Modifying bar", "stale..."],
+            ["Topic: Foo", "Modifying bar", "progress line"],
+        ),
+        (
+            "other",
+            "phase msg",
+            ["Topic: Foo", "old line"],
+            ["Topic: Foo", "phase msg"],
+        ),
+    ],
+)
+def test_report_progress_items_match_reference(
+    phase: str,
+    message: str,
+    start_items: list[str],
+    expected_tail: list[str],
+) -> None:
+    svc = MagicMock()
+    svc.format_thinking_step.return_value = "sse"
+
+    items = list(start_items)
+    frame, new_items = handle_report_progress(
+        {"message": message, "phase": phase},
+        last_active_step_id="step-1",
+        last_active_step_title="Report",
+        last_active_step_items=items,
+        streaming_service=svc,
+        content_builder=None,
+    )
+    assert frame == "sse"
+    assert new_items == expected_tail
+    kwargs = svc.format_thinking_step.call_args.kwargs
+    assert kwargs["items"] == expected_tail
+
+
+def test_report_progress_noop_when_missing_message_or_step() -> None:
+    svc = MagicMock()
+    items = ["Topic: A"]
+    f1, i1 = handle_report_progress(
+        {"message": "", "phase": "x"},
+        last_active_step_id="s",
+        last_active_step_title="t",
+        last_active_step_items=items,
+        streaming_service=svc,
+        content_builder=None,
+    )
+    assert f1 is None and i1 is items
+
+    f2, i2 = handle_report_progress(
+        {"message": "m", "phase": "x"},
+        last_active_step_id=None,
+        last_active_step_title="t",
+        last_active_step_items=items,
+        streaming_service=svc,
+        content_builder=None,
+    )
+    assert f2 is None and i2 is items
+
+
+def test_document_action_handlers_match_format_data_guards() -> None:
+    svc = MagicMock()
+    svc.format_data.return_value = "data-frame"
+
+    assert handle_document_created({}, streaming_service=svc) is None
+    assert handle_document_created({"id": 0}, streaming_service=svc) is None
+    handle_document_created({"id": 42, "title": "x"}, streaming_service=svc)
+    svc.format_data.assert_called_with(
+        "documents-updated", {"action": "created", "document": {"id": 42, "title": "x"}}
+    )
+
+    svc.reset_mock()
+    assert handle_action_log({"id": None}, streaming_service=svc) is None
+    handle_action_log({"id": 1}, streaming_service=svc)
+    svc.format_data.assert_called_once_with("action-log", {"id": 1})
+
+    svc.reset_mock()
+    assert handle_action_log_updated({"id": None}, streaming_service=svc) is None
+    handle_action_log_updated({"id": 2}, streaming_service=svc)
+    svc.format_data.assert_called_once_with("action-log-updated", {"id": 2})
diff --git a/surfsense_backend/tests/unit/tasks/chat/streaming/test_stream_output.py b/surfsense_backend/tests/unit/tasks/chat/streaming/test_stream_output.py
new file mode 100644
index 0000000000..c0123b76db
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/streaming/test_stream_output.py
@@ -0,0 +1,122 @@
+"""Tests for ``stream_output`` (LangGraph events → SSE)."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from app.tasks.chat.streaming.graph_stream import stream_output
+from app.tasks.chat.streaming.graph_stream.result import StreamingResult
+
+pytestmark = pytest.mark.unit
+
+
+@dataclass
+class _Chunk:
+    content: Any = ""
+    additional_kwargs: dict[str, Any] = field(default_factory=dict)
+    tool_call_chunks: list[dict[str, Any]] = field(default_factory=list)
+
+
+class _StreamingService:
+    def __init__(self) -> None:
+        self._text_idx = 0
+
+    def generate_text_id(self) -> str:
+        self._text_idx += 1
+        return f"text-{self._text_idx}"
+
+    def format_text_start(self, text_id: str) -> str:
+        return f"text_start:{text_id}"
+
+    def format_text_delta(self, text_id: str, text: str) -> str:
+        return f"text_delta:{text_id}:{text}"
+
+    def format_text_end(self, text_id: str) -> str:
+        return f"text_end:{text_id}"
+
+
+class _Agent:
+    def __init__(self, events: list[dict[str, Any]]) -> None:
+        self.events = list(events)
+        self.calls: list[tuple[Any, dict[str, Any]]] = []
+
+    async def astream_events(self, input_data: Any, **kwargs: Any):
+        self.calls.append((input_data, kwargs))
+        for event in self.events:
+            yield event
+
+
+async def _collect(stream: Any) -> list[str]:
+    out: list[str] = []
+    async for x in stream:
+        out.append(x)
+    return out
+
+
+async def test_stream_output_emits_text_lifecycle_and_updates_result() -> None:
+    service = _StreamingService()
+    agent = _Agent(
+        [
+            {
+                "event": "on_chat_model_stream",
+                "data": {"chunk": _Chunk(content="Hello")},
+            },
+            {
+                "event": "on_chat_model_stream",
+                "data": {"chunk": _Chunk(content=" world")},
+            },
+        ]
+    )
+    result = StreamingResult()
+
+    frames = await _collect(
+        stream_output(
+            agent=agent,
+            config={"configurable": {"thread_id": "t-1"}},
+            input_data={"messages": []},
+            streaming_service=service,
+            result=result,
+        )
+    )
+
+    assert frames == [
+        "text_start:text-1",
+        "text_delta:text-1:Hello",
+        "text_delta:text-1: world",
+        "text_end:text-1",
+    ]
+    assert result.accumulated_text == "Hello world"
+    assert result.agent_called_update_memory is False
+
+
+async def test_stream_output_passes_runtime_context_to_agent() -> None:
+    service = _StreamingService()
+
+    class _ContextAwareAgent:
+        async def astream_events(self, input_data: Any, **kwargs: Any):
+            del input_data
+            text = "ctx-ok" if kwargs.get("context") else "ctx-missing"
+            yield {"event": "on_chat_model_stream", "data": {"chunk": _Chunk(text)}}
+
+    agent = _ContextAwareAgent()
+    result = StreamingResult()
+
+    frames = await _collect(
+        stream_output(
+            agent=agent,
+            config={"configurable": {"thread_id": "t-2"}},
+            input_data={"messages": []},
+            streaming_service=service,
+            result=result,
+            runtime_context={"mentioned_document_ids": [1, 2]},
+        )
+    )
+
+    assert frames == [
+        "text_start:text-1",
+        "text_delta:text-1:ctx-ok",
+        "text_end:text-1",
+    ]
diff --git a/surfsense_backend/tests/unit/tasks/chat/streaming/test_task_span.py b/surfsense_backend/tests/unit/tasks/chat/streaming/test_task_span.py
new file mode 100644
index 0000000000..df680018da
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/streaming/test_task_span.py
@@ -0,0 +1,71 @@
+"""Unit tests for ``task_span`` open/close helpers."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+from app.tasks.chat.streaming.relay.task_span import (
+    clear_task_span_if_delegating_task_ended,
+    ensure_pending_task_span_for_lc,
+    open_task_span,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def test_open_task_span_sets_span_and_run_id() -> None:
+    state = AgentEventRelayState.for_invocation()
+    sid = open_task_span(state, run_id="run-abc")
+    assert sid.startswith("spn_")
+    assert state.active_span_id == sid
+    assert state.active_task_run_id == "run-abc"
+    assert state.span_metadata_if_active() == {"spanId": sid}
+
+
+def test_clear_ignored_for_non_task_tool() -> None:
+    state = AgentEventRelayState.for_invocation()
+    open_task_span(state, run_id="run-1")
+    sid = state.active_span_id
+    clear_task_span_if_delegating_task_ended(
+        state, tool_name="web_search", run_id="run-1"
+    )
+    assert state.active_span_id == sid
+    assert state.active_task_run_id == "run-1"
+
+
+def test_clear_ignored_when_task_run_id_mismatches() -> None:
+    state = AgentEventRelayState.for_invocation()
+    open_task_span(state, run_id="run-open")
+    clear_task_span_if_delegating_task_ended(
+        state, tool_name="task", run_id="run-other"
+    )
+    assert state.active_span_id is not None
+    assert state.active_task_run_id == "run-open"
+
+
+def test_clear_on_matching_task_end() -> None:
+    state = AgentEventRelayState.for_invocation()
+    open_task_span(state, run_id="run-x")
+    clear_task_span_if_delegating_task_ended(state, tool_name="task", run_id="run-x")
+    assert state.active_span_id is None
+    assert state.active_task_run_id is None
+    assert state.span_metadata_if_active() is None
+
+
+def test_clear_noop_when_no_open_span() -> None:
+    state = AgentEventRelayState.for_invocation()
+    clear_task_span_if_delegating_task_ended(state, tool_name="task", run_id="run-x")
+    assert state.active_span_id is None
+
+
+def test_pending_then_open_reuses_same_span_id() -> None:
+    state = AgentEventRelayState.for_invocation()
+    sid_pending = ensure_pending_task_span_for_lc(state, "lc-task-1")
+    assert state.pending_task_span_by_lc["lc-task-1"] == sid_pending
+    sid_active = open_task_span(
+        state, run_id="run-1", langchain_tool_call_id="lc-task-1"
+    )
+    assert sid_active == sid_pending
+    assert state.active_span_id == sid_pending
+    assert "lc-task-1" not in state.pending_task_span_by_lc
diff --git a/surfsense_backend/tests/unit/tasks/chat/streaming/test_tool_activity_metadata.py b/surfsense_backend/tests/unit/tasks/chat/streaming/test_tool_activity_metadata.py
new file mode 100644
index 0000000000..c2e68dacd0
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/streaming/test_tool_activity_metadata.py
@@ -0,0 +1,42 @@
+"""Unit tests for ``AgentEventRelayState.tool_activity_metadata``."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.tasks.chat.streaming.relay.state import AgentEventRelayState
+from app.tasks.chat.streaming.relay.task_span import open_task_span
+
+pytestmark = pytest.mark.unit
+
+
+def test_returns_none_when_no_span_and_no_thinking_step() -> None:
+    state = AgentEventRelayState.for_invocation()
+    assert state.tool_activity_metadata(thinking_step_id=None) is None
+    assert state.tool_activity_metadata(thinking_step_id="") is None
+    assert state.tool_activity_metadata(thinking_step_id="   ") is None
+
+
+def test_thinking_step_id_only() -> None:
+    state = AgentEventRelayState.for_invocation()
+    assert state.tool_activity_metadata(thinking_step_id="thinking-3") == {
+        "thinkingStepId": "thinking-3",
+    }
+
+
+def test_span_only_when_active() -> None:
+    state = AgentEventRelayState.for_invocation()
+    open_task_span(state, run_id="run-x")
+    assert state.tool_activity_metadata(thinking_step_id=None) == {
+        "spanId": state.active_span_id,
+    }
+
+
+def test_merges_span_and_thinking_step_when_both_set() -> None:
+    state = AgentEventRelayState.for_invocation()
+    open_task_span(state, run_id="run-x")
+    md = state.tool_activity_metadata(thinking_step_id="thinking-7")
+    assert md == {
+        "spanId": state.active_span_id,
+        "thinkingStepId": "thinking-7",
+    }
diff --git a/surfsense_backend/tests/unit/tasks/chat/test_content_builder.py b/surfsense_backend/tests/unit/tasks/chat/test_content_builder.py
new file mode 100644
index 0000000000..42e62d26b6
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/test_content_builder.py
@@ -0,0 +1,672 @@
+"""Unit tests for ``AssistantContentBuilder``.
+
+Pins the in-memory ``ContentPart[]`` projection so the JSONB the server
+persists matches what the frontend renders live (see
+``surfsense_web/lib/chat/streaming-state.ts``). Every test asserts both
+the structural shape of ``snapshot()`` and that the snapshot is
+``json.dumps``-safe (the streaming finally block writes it directly to
+``new_chat_messages.content`` without an explicit serialization round
+trip).
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from app.services.new_streaming_service import VercelStreamingService
+from app.tasks.chat.content_builder import AssistantContentBuilder
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _assert_jsonb_safe(parts: list[dict]) -> None:
+    """Sanity check: any snapshot must round-trip through ``json.dumps``."""
+    serialized = json.dumps(parts)
+    assert json.loads(serialized) == parts
+
+
+# ---------------------------------------------------------------------------
+# Text turns
+# ---------------------------------------------------------------------------
+
+
+class TestTextOnly:
+    def test_single_text_block_collapses_consecutive_deltas(self):
+        b = AssistantContentBuilder()
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "Hello")
+        b.on_text_delta("text-1", " ")
+        b.on_text_delta("text-1", "world")
+        b.on_text_end("text-1")
+
+        snap = b.snapshot()
+        assert snap == [{"type": "text", "text": "Hello world"}]
+        assert not b.is_empty()
+        _assert_jsonb_safe(snap)
+
+    def test_empty_text_start_end_pair_leaves_no_part(self):
+        # Mirrors the FE: a text-start without any deltas should
+        # not materialise an empty ``{"type":"text","text":""}`` part.
+        b = AssistantContentBuilder()
+        b.on_text_start("text-1")
+        b.on_text_end("text-1")
+
+        assert b.snapshot() == []
+        assert b.is_empty()
+
+    def test_text_after_text_end_starts_fresh_part(self):
+        b = AssistantContentBuilder()
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "first")
+        b.on_text_end("text-1")
+
+        b.on_text_start("text-2")
+        b.on_text_delta("text-2", "second")
+        b.on_text_end("text-2")
+
+        snap = b.snapshot()
+        assert snap == [
+            {"type": "text", "text": "first"},
+            {"type": "text", "text": "second"},
+        ]
+
+
+class TestReasoningThenText:
+    def test_reasoning_followed_by_text_yields_two_parts_in_order(self):
+        b = AssistantContentBuilder()
+        b.on_reasoning_start("r-1")
+        b.on_reasoning_delta("r-1", "Considering options...")
+        b.on_reasoning_end("r-1")
+
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "The answer is 42.")
+        b.on_text_end("text-1")
+
+        snap = b.snapshot()
+        assert snap == [
+            {"type": "reasoning", "text": "Considering options..."},
+            {"type": "text", "text": "The answer is 42."},
+        ]
+        _assert_jsonb_safe(snap)
+
+    def test_text_delta_after_reasoning_implicitly_closes_reasoning(self):
+        # Mirrors FE ``appendText``: a text delta arriving while a
+        # reasoning part is "active" still produces a fresh text
+        # part, never appends into the reasoning block.
+        b = AssistantContentBuilder()
+        b.on_reasoning_start("r-1")
+        b.on_reasoning_delta("r-1", "thinking")
+        # No explicit reasoning_end — text delta should close it.
+        b.on_text_delta("text-1", "answer")
+
+        snap = b.snapshot()
+        assert snap == [
+            {"type": "reasoning", "text": "thinking"},
+            {"type": "text", "text": "answer"},
+        ]
+
+
+# ---------------------------------------------------------------------------
+# Tool calls
+# ---------------------------------------------------------------------------
+
+
+class TestToolHeavyTurn:
+    def test_full_tool_lifecycle_produces_complete_tool_call_part(self):
+        b = AssistantContentBuilder()
+        # Some narration before the tool fires.
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "Searching...")
+        b.on_text_end("text-1")
+
+        b.on_tool_input_start(
+            ui_id="call_run123",
+            tool_name="web_search",
+            langchain_tool_call_id="lc_tool_abc",
+        )
+        b.on_tool_input_delta("call_run123", '{"query":')
+        b.on_tool_input_delta("call_run123", '"surfsense"}')
+        b.on_tool_input_available(
+            ui_id="call_run123",
+            tool_name="web_search",
+            args={"query": "surfsense"},
+            langchain_tool_call_id="lc_tool_abc",
+        )
+        b.on_tool_output_available(
+            ui_id="call_run123",
+            output={"status": "completed", "citations": {}},
+            langchain_tool_call_id="lc_tool_abc",
+        )
+
+        snap = b.snapshot()
+        assert snap[0] == {"type": "text", "text": "Searching..."}
+        tool_part = snap[1]
+        assert tool_part["type"] == "tool-call"
+        assert tool_part["toolCallId"] == "call_run123"
+        assert tool_part["toolName"] == "web_search"
+        assert tool_part["args"] == {"query": "surfsense"}
+        # ``argsText`` is the pretty-printed final JSON, not the raw
+        # streaming buffer (FE ``stream-pipeline.ts:128``).
+        assert tool_part["argsText"] == json.dumps(
+            {"query": "surfsense"}, indent=2, ensure_ascii=False
+        )
+        assert tool_part["langchainToolCallId"] == "lc_tool_abc"
+        assert tool_part["result"] == {"status": "completed", "citations": {}}
+        _assert_jsonb_safe(snap)
+
+    def test_tool_input_available_without_prior_start_creates_card(self):
+        # Late-registration: tool-input-available may be
+        # emitted without a prior tool-input-start (no streamed
+        # tool_call_chunks). The card should still be created.
+        b = AssistantContentBuilder()
+        b.on_tool_input_available(
+            ui_id="call_run42",
+            tool_name="grep",
+            args={"pattern": "TODO"},
+            langchain_tool_call_id="lc_x",
+        )
+        b.on_tool_output_available(
+            ui_id="call_run42",
+            output={"matches": 3},
+            langchain_tool_call_id="lc_x",
+        )
+
+        snap = b.snapshot()
+        assert len(snap) == 1
+        part = snap[0]
+        assert part["type"] == "tool-call"
+        assert part["toolCallId"] == "call_run42"
+        assert part["args"] == {"pattern": "TODO"}
+        assert part["langchainToolCallId"] == "lc_x"
+        assert part["result"] == {"matches": 3}
+
+    def test_tool_input_start_idempotent_for_same_ui_id(self):
+        # tool-input-start can fire from BOTH the chunk
+        # registration path AND the canonical ``on_tool_start`` path.
+        # The second call must not create a duplicate part.
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_x", "ls", "lc_x")
+        b.on_tool_input_start("call_x", "ls", "lc_x")
+        snap = b.snapshot()
+        assert len(snap) == 1
+
+    def test_tool_input_delta_without_prior_start_is_silently_dropped(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_delta("call_unknown", '{"orphan": "delta"}')
+        assert b.snapshot() == []
+
+    def test_langchain_tool_call_id_backfills_only_when_absent(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_x", "ls", "lc_first")
+        # Late event must NOT clobber an already-set lc id.
+        b.on_tool_input_start("call_x", "ls", "lc_late")
+        snap = b.snapshot()
+        assert snap[0]["langchainToolCallId"] == "lc_first"
+
+    def test_args_text_streaming_buffer_reflects_concatenation(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_x", "save_doc", "lc_y")
+        b.on_tool_input_delta("call_x", '{"title":')
+        b.on_tool_input_delta("call_x", '"Hi"}')
+        # Snapshot mid-stream should see the partial buffer (the FE
+        # tolerates invalid JSON and renders it as-is).
+        mid = b.snapshot()
+        assert mid[0]["argsText"] == '{"title":"Hi"}'
+        # Then tool-input-available replaces with pretty-printed.
+        b.on_tool_input_available(
+            "call_x",
+            "save_doc",
+            {"title": "Hi"},
+            "lc_y",
+        )
+        final = b.snapshot()
+        assert final[0]["argsText"] == json.dumps(
+            {"title": "Hi"}, indent=2, ensure_ascii=False
+        )
+
+
+# ---------------------------------------------------------------------------
+# Task-span metadata on tool-call parts (JSONB persistence)
+# ---------------------------------------------------------------------------
+
+
+class TestToolCallSpanMetadata:
+    def test_input_available_merges_new_metadata_keys_after_start(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_t", "task", "lc_t", metadata={"spanId": "spn_1"})
+        b.on_tool_input_available(
+            "call_t",
+            "task",
+            {"goal": "x"},
+            "lc_t",
+            metadata={"traceId": "tr_1"},
+        )
+        part = b.snapshot()[0]
+        assert part["metadata"]["spanId"] == "spn_1"
+        assert part["metadata"]["traceId"] == "tr_1"
+        _assert_jsonb_safe(b.snapshot())
+
+    def test_input_available_does_not_overwrite_existing_metadata_keys(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_t", "task", "lc_t", metadata={"spanId": "spn_keep"})
+        b.on_tool_input_available(
+            "call_t", "task", {}, "lc_t", metadata={"spanId": "spn_other"}
+        )
+        assert b.snapshot()[0]["metadata"]["spanId"] == "spn_keep"
+
+    def test_late_tool_input_available_carries_metadata(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_available(
+            "call_l",
+            "grep",
+            {"pattern": "TODO"},
+            None,
+            metadata={"spanId": "spn_l"},
+        )
+        part = b.snapshot()[0]
+        assert part["metadata"] == {"spanId": "spn_l"}
+        _assert_jsonb_safe(b.snapshot())
+
+    def test_output_available_merges_without_clobbering_span_id(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_t", "ls", "lc", metadata={"spanId": "spn_x"})
+        b.on_tool_input_available("call_t", "ls", {"path": "/"}, "lc")
+        b.on_tool_output_available(
+            "call_t",
+            {"ok": True},
+            "lc",
+            metadata={"spanId": "spn_y", "extra": 1},
+        )
+        md = b.snapshot()[0]["metadata"]
+        assert md["spanId"] == "spn_x"
+        assert md["extra"] == 1
+
+    def test_output_available_adds_thinking_step_id_without_clobbering_span(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start(
+            "call_t",
+            "ls",
+            "lc",
+            metadata={"spanId": "spn_x", "thinkingStepId": "thinking-3"},
+        )
+        b.on_tool_input_available("call_t", "ls", {"path": "/"}, "lc")
+        b.on_tool_output_available(
+            "call_t",
+            {"ok": True},
+            "lc",
+            metadata={"spanId": "spn_x", "thinkingStepId": "thinking-3"},
+        )
+        md = b.snapshot()[0]["metadata"]
+        assert md["spanId"] == "spn_x"
+        assert md["thinkingStepId"] == "thinking-3"
+
+    def test_output_available_with_none_metadata_preserves_prior(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("c", "ls", "lc", metadata={"spanId": "spn_1"})
+        b.on_tool_input_available("c", "ls", {}, "lc")
+        b.on_tool_output_available("c", {"r": 1}, "lc", metadata=None)
+        assert b.snapshot()[0]["metadata"] == {"spanId": "spn_1"}
+
+    def test_available_adds_thinking_step_id_after_chunk_only_start(self):
+        """Mirrors chunk ``tool-input-start`` then ``on_tool_start`` ``available``."""
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("lc_1", "ls", "lc_1", metadata={"spanId": "spn_a"})
+        b.on_tool_input_available(
+            "lc_1",
+            "ls",
+            {"path": "/"},
+            "lc_1",
+            metadata={"spanId": "spn_a", "thinkingStepId": "thinking-2"},
+        )
+        md = b.snapshot()[0]["metadata"]
+        assert md["spanId"] == "spn_a"
+        assert md["thinkingStepId"] == "thinking-2"
+
+
+class TestVercelStreamingServiceToolMetadataWire:
+    """SSE payloads include optional ``metadata`` for FE grouping."""
+
+    @staticmethod
+    def _parse_sse_data_line(raw: str) -> dict:
+        assert raw.startswith("data: ")
+        payload = raw.split("data: ", 1)[1].split("\n\n", 1)[0].strip()
+        return json.loads(payload)
+
+    def test_tool_input_available_includes_metadata_when_set(self):
+        svc = VercelStreamingService()
+        raw = svc.format_tool_input_available(
+            "id1",
+            "task",
+            {"a": 1},
+            langchain_tool_call_id="lc1",
+            metadata={"spanId": "spn_w", "thinkingStepId": "thinking-4"},
+        )
+        body = self._parse_sse_data_line(raw)
+        assert body["type"] == "tool-input-available"
+        assert body["metadata"] == {
+            "spanId": "spn_w",
+            "thinkingStepId": "thinking-4",
+        }
+
+    def test_tool_output_available_includes_metadata_when_set(self):
+        svc = VercelStreamingService()
+        raw = svc.format_tool_output_available(
+            "id1",
+            {"status": "completed"},
+            langchain_tool_call_id="lc1",
+            metadata={"spanId": "spn_o", "thinkingStepId": "thinking-9"},
+        )
+        body = self._parse_sse_data_line(raw)
+        assert body["type"] == "tool-output-available"
+        assert body["metadata"] == {
+            "spanId": "spn_o",
+            "thinkingStepId": "thinking-9",
+        }
+
+    def test_tool_input_available_omits_metadata_key_when_none(self):
+        svc = VercelStreamingService()
+        raw = svc.format_tool_input_available("id1", "ls", {})
+        body = self._parse_sse_data_line(raw)
+        assert "metadata" not in body
+
+
+# ---------------------------------------------------------------------------
+# Thinking steps & separators
+# ---------------------------------------------------------------------------
+
+
+class TestThinkingSteps:
+    def test_first_thinking_step_unshifts_singleton_to_index_zero(self):
+        b = AssistantContentBuilder()
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "Hello")
+        b.on_text_end("text-1")
+
+        b.on_thinking_step("step-1", "Analyzing", "in_progress", ["item-a"])
+
+        snap = b.snapshot()
+        # Singleton goes to index 0 (FE ``updateThinkingSteps`` unshift).
+        assert snap[0]["type"] == "data-thinking-steps"
+        assert snap[0]["data"]["steps"] == [
+            {
+                "id": "step-1",
+                "title": "Analyzing",
+                "status": "in_progress",
+                "items": ["item-a"],
+            }
+        ]
+        assert snap[1] == {"type": "text", "text": "Hello"}
+
+    def test_subsequent_thinking_steps_mutate_the_singleton_in_place(self):
+        b = AssistantContentBuilder()
+        b.on_thinking_step("step-1", "Analyzing", "in_progress", [])
+        b.on_thinking_step("step-2", "Searching", "in_progress", ["q"])
+        b.on_thinking_step("step-1", "Analyzing", "completed", ["done"])
+
+        snap = b.snapshot()
+        assert len([p for p in snap if p["type"] == "data-thinking-steps"]) == 1
+        steps = snap[0]["data"]["steps"]
+        assert len(steps) == 2
+        assert steps[0]["id"] == "step-1"
+        assert steps[0]["status"] == "completed"
+        assert steps[0]["items"] == ["done"]
+        assert steps[1]["id"] == "step-2"
+
+    def test_thinking_step_with_text_continues_appending_to_text(self):
+        b = AssistantContentBuilder()
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "first")
+
+        # Thinking step inserts at index 0, bumps text idx from 0 to 1.
+        b.on_thinking_step("step-1", "Working", "in_progress", [])
+        b.on_text_delta("text-1", " second")
+
+        snap = b.snapshot()
+        text_parts = [p for p in snap if p["type"] == "text"]
+        assert text_parts == [{"type": "text", "text": "first second"}]
+
+    def test_thinking_step_without_id_is_dropped(self):
+        b = AssistantContentBuilder()
+        b.on_thinking_step("", "noop", "in_progress", None)
+        assert b.snapshot() == []
+        assert b.is_empty()
+
+
+class TestStepSeparators:
+    def test_separator_no_op_before_any_content(self):
+        b = AssistantContentBuilder()
+        b.on_step_separator()
+        assert b.snapshot() == []
+
+    def test_separator_after_text_appends_with_step_index_zero(self):
+        b = AssistantContentBuilder()
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "first")
+        b.on_text_end("text-1")
+
+        b.on_step_separator()
+
+        snap = b.snapshot()
+        assert snap[-1] == {
+            "type": "data-step-separator",
+            "data": {"stepIndex": 0},
+        }
+
+    def test_consecutive_separators_collapse_to_one(self):
+        b = AssistantContentBuilder()
+        b.on_text_delta("text-1", "x")
+        b.on_step_separator()
+        b.on_step_separator()  # No-op: previous part is already a separator.
+        snap = b.snapshot()
+        assert sum(1 for p in snap if p["type"] == "data-step-separator") == 1
+
+    def test_step_index_increments_across_separators(self):
+        b = AssistantContentBuilder()
+        b.on_text_delta("text-1", "a")
+        b.on_step_separator()
+        b.on_text_delta("text-2", "b")
+        b.on_step_separator()
+        snap = b.snapshot()
+        seps = [p for p in snap if p["type"] == "data-step-separator"]
+        assert [s["data"]["stepIndex"] for s in seps] == [0, 1]
+
+
+# ---------------------------------------------------------------------------
+# Interruption handling
+# ---------------------------------------------------------------------------
+
+
+class TestMarkInterrupted:
+    def test_running_tool_calls_get_state_aborted(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_a", "ls", "lc_a")
+        b.on_tool_input_available("call_a", "ls", {"path": "/"}, "lc_a")
+        # No tool-output-available — simulates client disconnect mid-tool.
+
+        b.mark_interrupted()
+
+        snap = b.snapshot()
+        assert snap[0]["state"] == "aborted"
+        assert "result" not in snap[0]
+
+    def test_completed_tool_calls_are_not_marked_aborted(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_a", "ls", "lc_a")
+        b.on_tool_input_available("call_a", "ls", {"path": "/"}, "lc_a")
+        b.on_tool_output_available("call_a", {"files": []}, "lc_a")
+
+        b.mark_interrupted()
+
+        snap = b.snapshot()
+        assert "state" not in snap[0]
+        assert snap[0]["result"] == {"files": []}
+
+    def test_open_text_block_keeps_accumulated_content(self):
+        b = AssistantContentBuilder()
+        b.on_text_start("text-1")
+        b.on_text_delta("text-1", "partial")
+        # No on_text_end — disconnect mid-stream.
+
+        b.mark_interrupted()
+
+        snap = b.snapshot()
+        assert snap == [{"type": "text", "text": "partial"}]
+
+
+# ---------------------------------------------------------------------------
+# is_empty / snapshot semantics
+# ---------------------------------------------------------------------------
+
+
+class TestIsEmpty:
+    def test_fresh_builder_is_empty(self):
+        assert AssistantContentBuilder().is_empty()
+
+    def test_text_part_breaks_emptiness(self):
+        b = AssistantContentBuilder()
+        b.on_text_delta("text-1", "x")
+        assert not b.is_empty()
+
+    def test_tool_call_breaks_emptiness(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_x", "ls", None)
+        assert not b.is_empty()
+
+    def test_thinking_step_alone_does_not_break_emptiness(self):
+        # Mirrors the "status marker fallback" semantic: a turn that
+        # only emitted a thinking step before being interrupted should
+        # still be treated as empty for finalize_assistant_turn's
+        # status-marker substitution.
+        b = AssistantContentBuilder()
+        b.on_thinking_step("step-1", "Working", "in_progress", [])
+        assert b.is_empty()
+
+    def test_step_separator_alone_does_not_break_emptiness(self):
+        b = AssistantContentBuilder()
+        # Force a separator (it would normally no-op without content,
+        # but we simulate the underlying state to verify is_empty is
+        # not fooled by a stray separator).
+        b.parts.append({"type": "data-step-separator", "data": {"stepIndex": 0}})
+        assert b.is_empty()
+
+
+class TestSnapshotSemantics:
+    def test_snapshot_is_deep_copied_so_mutations_do_not_leak(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_x", "ls", "lc_x")
+        b.on_tool_input_available("call_x", "ls", {"path": "/"}, "lc_x")
+        snap = b.snapshot()
+        # Mutate the returned snapshot — original should be untouched.
+        snap[0]["args"]["mutated"] = True
+        snap[0]["state"] = "tampered"
+
+        again = b.snapshot()
+        assert "mutated" not in again[0]["args"]
+        assert "state" not in again[0]
+
+    def test_snapshot_round_trips_through_json(self):
+        b = AssistantContentBuilder()
+        b.on_thinking_step("step-1", "Analyzing", "in_progress", ["item"])
+        b.on_text_delta("text-1", "answer")
+        b.on_tool_input_start("call_x", "ls", "lc_x")
+        b.on_tool_input_available("call_x", "ls", {"path": "/"}, "lc_x")
+        b.on_tool_output_available("call_x", {"files": ["a.txt"]}, "lc_x")
+        b.on_step_separator()
+        snap = b.snapshot()
+
+        encoded = json.dumps(snap)
+        assert json.loads(encoded) == snap
+
+
+class TestStats:
+    """``stats()`` is the perf-log handle for [PERF] [stream_*]
+    finalize_payload lines. Pin the schema so an ops dashboard can
+    rely on these keys being present and meaningful.
+    """
+
+    def test_fresh_builder_reports_all_zeros(self):
+        b = AssistantContentBuilder()
+        s = b.stats()
+        assert s == {
+            "parts": 0,
+            "bytes": 2,  # ``[]`` is two bytes
+            "text": 0,
+            "reasoning": 0,
+            "tool_calls": 0,
+            "tool_calls_completed": 0,
+            "tool_calls_aborted": 0,
+            "thinking_step_parts": 0,
+            "step_separators": 0,
+        }
+
+    def test_counts_each_part_type_independently(self):
+        b = AssistantContentBuilder()
+        b.on_text_start("t1")
+        b.on_text_delta("t1", "hi")
+        b.on_text_end("t1")
+        b.on_reasoning_start("r1")
+        b.on_reasoning_delta("r1", "thinking")
+        b.on_reasoning_end("r1")
+        b.on_thinking_step("step-1", "Analyzing", "completed", ["item"])
+        b.on_step_separator()
+        b.on_tool_input_start("call_done", "ls", "lc_done")
+        b.on_tool_input_available("call_done", "ls", {}, "lc_done")
+        b.on_tool_output_available("call_done", {"ok": True}, "lc_done")
+        b.on_tool_input_start("call_running", "rm", "lc_running")
+        b.on_tool_input_available("call_running", "rm", {}, "lc_running")
+
+        s = b.stats()
+        assert s["text"] == 1
+        assert s["reasoning"] == 1
+        assert s["tool_calls"] == 2
+        assert s["tool_calls_completed"] == 1
+        assert s["tool_calls_aborted"] == 0
+        assert s["thinking_step_parts"] == 1
+        assert s["step_separators"] == 1
+        assert s["parts"] == sum(
+            [
+                s["text"],
+                s["reasoning"],
+                s["tool_calls"],
+                s["thinking_step_parts"],
+                s["step_separators"],
+            ]
+        )
+        assert s["bytes"] > 0
+
+    def test_mark_interrupted_flips_running_calls_to_aborted_in_stats(self):
+        b = AssistantContentBuilder()
+        b.on_tool_input_start("call_done", "ls", "lc_done")
+        b.on_tool_input_available("call_done", "ls", {}, "lc_done")
+        b.on_tool_output_available("call_done", {"ok": True}, "lc_done")
+        b.on_tool_input_start("call_running", "rm", "lc_running")
+        b.on_tool_input_available("call_running", "rm", {}, "lc_running")
+
+        # Pre-interrupt: one completed, one still running (no result).
+        pre = b.stats()
+        assert pre["tool_calls_completed"] == 1
+        assert pre["tool_calls_aborted"] == 0
+
+        b.mark_interrupted()
+        post = b.stats()
+        assert post["tool_calls_completed"] == 1
+        assert post["tool_calls_aborted"] == 1
+        assert post["tool_calls"] == 2
+
+    def test_bytes_reflects_jsonb_payload_size(self):
+        # Each text-delta adds bytes monotonically — useful for catching
+        # an unbounded delta buffer regression in the perf signal.
+        b = AssistantContentBuilder()
+        b.on_text_start("t1")
+        b.on_text_delta("t1", "x" * 10)
+        small = b.stats()["bytes"]
+        b.on_text_delta("t1", "x" * 1000)
+        large = b.stats()["bytes"]
+        assert large > small + 900
diff --git a/surfsense_backend/tests/unit/tasks/chat/test_extract_chunk_parts.py b/surfsense_backend/tests/unit/tasks/chat/test_extract_chunk_parts.py
new file mode 100644
index 0000000000..1263a5fe1f
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/test_extract_chunk_parts.py
@@ -0,0 +1,228 @@
+"""Unit tests for ``stream_new_chat._extract_chunk_parts``.
+
+Earlier versions only handled ``isinstance(chunk.content, str)`` and
+silently dropped every other shape (Anthropic typed-block lists,
+Bedrock reasoning blocks, ``additional_kwargs.reasoning_content`` from
+a few providers). These regression tests pin those four shapes plus the
+defensive cases (``None`` chunk, mixed types, missing fields).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from app.tasks.chat.stream_new_chat import _extract_chunk_parts
+
+
+@dataclass
+class _FakeChunk:
+    """Minimal stand-in for ``AIMessageChunk`` used in unit tests."""
+
+    content: Any = ""
+    additional_kwargs: dict[str, Any] = field(default_factory=dict)
+    tool_call_chunks: list[dict[str, Any]] = field(default_factory=list)
+
+
+class TestStringContent:
+    def test_plain_string_content_extracts_as_text(self) -> None:
+        chunk = _FakeChunk(content="hello world")
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "hello world"
+        assert out["reasoning"] == ""
+        assert out["tool_call_chunks"] == []
+
+    def test_empty_string_content_yields_empty_text(self) -> None:
+        chunk = _FakeChunk(content="")
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == ""
+        assert out["reasoning"] == ""
+        assert out["tool_call_chunks"] == []
+
+
+class TestListContent:
+    def test_list_of_text_blocks_concatenates(self) -> None:
+        chunk = _FakeChunk(
+            content=[
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ]
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "Hello world"
+        assert out["reasoning"] == ""
+
+    def test_mixed_text_and_reasoning_blocks(self) -> None:
+        chunk = _FakeChunk(
+            content=[
+                {"type": "reasoning", "reasoning": "Let me think... "},
+                {"type": "reasoning", "text": "still thinking."},
+                {"type": "text", "text": "The answer is 42."},
+            ]
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "The answer is 42."
+        assert out["reasoning"] == "Let me think... still thinking."
+
+    def test_tool_call_chunks_in_content_list_extracted(self) -> None:
+        chunk = _FakeChunk(
+            content=[
+                {"type": "text", "text": "Calling tool..."},
+                {
+                    "type": "tool_call_chunk",
+                    "id": "call_123",
+                    "name": "make_widget",
+                    "args": '{"color":"red"}',
+                },
+            ]
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "Calling tool..."
+        assert out["reasoning"] == ""
+        assert len(out["tool_call_chunks"]) == 1
+        assert out["tool_call_chunks"][0]["id"] == "call_123"
+        assert out["tool_call_chunks"][0]["name"] == "make_widget"
+
+    def test_tool_use_blocks_also_extracted(self) -> None:
+        """Some providers (Anthropic) emit ``type='tool_use'`` instead."""
+        chunk = _FakeChunk(
+            content=[
+                {
+                    "type": "tool_use",
+                    "id": "call_xyz",
+                    "name": "search",
+                },
+            ]
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["tool_call_chunks"] == [
+            {"type": "tool_use", "id": "call_xyz", "name": "search"}
+        ]
+
+    def test_unknown_block_types_are_ignored(self) -> None:
+        chunk = _FakeChunk(
+            content=[
+                {"type": "image_url", "url": "https://example.com/x.png"},
+                {"type": "text", "text": "ok"},
+            ]
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "ok"
+
+    def test_blocks_without_text_field_are_ignored(self) -> None:
+        chunk = _FakeChunk(
+            content=[
+                {"type": "text"},  # no text/content key
+                {"type": "text", "text": "kept"},
+            ]
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "kept"
+
+
+class TestAdditionalKwargsReasoning:
+    def test_reasoning_content_in_additional_kwargs(self) -> None:
+        """Some providers stash reasoning in ``additional_kwargs.reasoning_content``."""
+        chunk = _FakeChunk(
+            content="visible answer",
+            additional_kwargs={"reasoning_content": "internal monologue"},
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "visible answer"
+        assert out["reasoning"] == "internal monologue"
+
+    def test_reasoning_appended_to_typed_block_reasoning(self) -> None:
+        chunk = _FakeChunk(
+            content=[{"type": "reasoning", "text": "from blocks. "}],
+            additional_kwargs={"reasoning_content": "from kwargs."},
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["reasoning"] == "from blocks. from kwargs."
+
+
+class TestToolCallChunksAttribute:
+    def test_tool_call_chunks_attribute_extracted_alongside_string_content(
+        self,
+    ) -> None:
+        chunk = _FakeChunk(
+            content="streaming text",
+            tool_call_chunks=[
+                {"name": "save_document", "args": '{"title":"x"}', "id": "tc-9"}
+            ],
+        )
+        out = _extract_chunk_parts(chunk)
+        assert out["text"] == "streaming text"
+        assert len(out["tool_call_chunks"]) == 1
+        assert out["tool_call_chunks"][0]["id"] == "tc-9"
+
+    def test_attribute_and_typed_block_chunks_both_collected(self) -> None:
+        chunk = _FakeChunk(
+            content=[
+                {
+                    "type": "tool_call_chunk",
+                    "id": "from-block",
+                    "name": "x",
+                }
+            ],
+            tool_call_chunks=[{"id": "from-attr", "name": "y"}],
+        )
+        out = _extract_chunk_parts(chunk)
+        ids = [tcc.get("id") for tcc in out["tool_call_chunks"]]
+        assert ids == ["from-block", "from-attr"]
+
+
+class TestDefensive:
+    @pytest.mark.parametrize(
+        "chunk_value",
+        [None, _FakeChunk(content=None), _FakeChunk(content=42)],
+    )
+    def test_invalid_chunk_returns_empty_parts(self, chunk_value: Any) -> None:
+        out = _extract_chunk_parts(chunk_value)
+        assert out["text"] == ""
+        assert out["reasoning"] == ""
+        assert out["tool_call_chunks"] == []
+
+
+class TestIdlessContinuationChunks:
+    """Per LangChain ``ToolCallChunk`` semantics, the FIRST chunk for a
+    tool call carries id+name; later chunks for the same call have
+    ``id=None, name=None`` and only ``args`` + ``index``. Live tool-call
+    argument streaming relies on those idless continuation chunks
+    flowing through ``_extract_chunk_parts`` UNTOUCHED so the upstream
+    chunk-emission loop can still route them by ``index``.
+    """
+
+    def test_idless_continuation_chunk_preserved_verbatim(self) -> None:
+        chunk = _FakeChunk(
+            tool_call_chunks=[
+                {"id": None, "name": None, "args": '_path":"/x"}', "index": 0}
+            ]
+        )
+        out = _extract_chunk_parts(chunk)
+        assert len(out["tool_call_chunks"]) == 1
+        tcc = out["tool_call_chunks"][0]
+        assert tcc.get("id") is None
+        assert tcc.get("name") is None
+        assert tcc.get("args") == '_path":"/x"}'
+        assert tcc.get("index") == 0
+
+    def test_first_then_idless_sequence_preserves_index(self) -> None:
+        """Both chunks for the same call share an ``index`` key — the
+        index-routing loop in ``stream_new_chat`` depends on it."""
+        first = _FakeChunk(
+            tool_call_chunks=[
+                {"id": "lc-1", "name": "write_file", "args": '{"file', "index": 0}
+            ]
+        )
+        cont = _FakeChunk(
+            tool_call_chunks=[
+                {"id": None, "name": None, "args": '_path":"/x"}', "index": 0}
+            ]
+        )
+        out_first = _extract_chunk_parts(first)
+        out_cont = _extract_chunk_parts(cont)
+        assert out_first["tool_call_chunks"][0]["index"] == 0
+        assert out_cont["tool_call_chunks"][0]["index"] == 0
+        assert out_cont["tool_call_chunks"][0].get("id") is None
diff --git a/surfsense_backend/tests/unit/tasks/chat/test_thinking_step_id_uniqueness.py b/surfsense_backend/tests/unit/tasks/chat/test_thinking_step_id_uniqueness.py
new file mode 100644
index 0000000000..50f7b80709
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/test_thinking_step_id_uniqueness.py
@@ -0,0 +1,171 @@
+"""Pin: thinking-step IDs must be globally unique within a thread.
+
+The frontend rehydrates ``currentThinkingSteps`` from the prior assistant
+message when starting a resume. If two consecutive resume turns emit step IDs
+that overlap (e.g. both produce ``thinking-resume-1`` because each invocation
+constructs a fresh :class:`AgentEventRelayState` with
+``thinking_step_counter=0``), React renders sibling timeline rows with the
+same key — the warning the user reported in production.
+
+The contract this module pins: each ``_stream_agent_events`` invocation must
+receive a ``step_prefix`` that is unique within the thread (we salt with the
+per-turn ``turn_id``), so the resulting step IDs across consecutive turns
+are always disjoint.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import AsyncGenerator
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from app.services.new_streaming_service import VercelStreamingService
+from app.tasks.chat.stream_new_chat import (
+    StreamResult,
+    _resume_step_prefix,
+    _stream_agent_events,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@dataclass
+class _FakeChunk:
+    content: Any = ""
+    additional_kwargs: dict[str, Any] = field(default_factory=dict)
+    tool_call_chunks: list[dict[str, Any]] = field(default_factory=list)
+
+
+class _FakeAgentState:
+    def __init__(self) -> None:
+        self.values: dict[str, Any] = {}
+        self.tasks: list[Any] = []
+
+
+class _FakeAgent:
+    def __init__(self, events: list[dict[str, Any]]) -> None:
+        self._events = events
+        self._state = _FakeAgentState()
+
+    async def astream_events(  # type: ignore[no-untyped-def]
+        self, _input_data: Any, *, config: dict[str, Any], version: str
+    ) -> AsyncGenerator[dict[str, Any], None]:
+        del config, version
+        for ev in self._events:
+            yield ev
+
+    async def aget_state(self, _config: dict[str, Any]) -> _FakeAgentState:
+        return self._state
+
+
+def _tool_start(*, name: str, run_id: str) -> dict[str, Any]:
+    return {
+        "event": "on_tool_start",
+        "name": name,
+        "run_id": run_id,
+        "data": {"input": {}},
+    }
+
+
+async def _drain_step_ids(
+    events: list[dict[str, Any]], *, step_prefix: str
+) -> set[str]:
+    """Run ``_stream_agent_events`` once and return every emitted thinking-step ID."""
+    agent = _FakeAgent(events)
+    service = VercelStreamingService()
+    result = StreamResult()
+    config = {"configurable": {"thread_id": "regression-thread"}}
+
+    sse_lines: list[str] = []
+    async for sse in _stream_agent_events(
+        agent, config, {}, service, result, step_prefix=step_prefix
+    ):
+        sse_lines.append(sse)
+
+    ids: set[str] = set()
+    for line in sse_lines:
+        if not line.startswith("data: "):
+            continue
+        body = line[len("data: ") :].rstrip("\n")
+        if not body or body == "[DONE]":
+            continue
+        try:
+            payload = json.loads(body)
+        except json.JSONDecodeError:
+            continue
+        if payload.get("type") != "data-thinking-step":
+            continue
+        step_id = (payload.get("data") or {}).get("id")
+        if isinstance(step_id, str):
+            ids.add(step_id)
+    return ids
+
+
+@pytest.mark.asyncio
+async def test_consecutive_invocations_with_same_prefix_produce_overlapping_ids():
+    """Pin the bug: identical ``step_prefix`` across two turns reuses ``-1``, ``-2``…
+
+    This is what production was doing for resume — every resume invocation
+    passed ``step_prefix='thinking-resume'`` and the relay state's counter
+    restarted at 0. Two scrollback timelines built from such turns then
+    presented React with siblings keyed by the same ``thinking-resume-1``.
+    """
+    events = [
+        _tool_start(name="t1", run_id="run-A-1"),
+        _tool_start(name="t2", run_id="run-A-2"),
+    ]
+
+    ids_turn_one = await _drain_step_ids(events, step_prefix="thinking-resume")
+    ids_turn_two = await _drain_step_ids(events, step_prefix="thinking-resume")
+
+    assert ids_turn_one == ids_turn_two != set(), (
+        "fixture broken: expected non-empty overlapping ids when prefix is reused"
+    )
+
+
+@pytest.mark.asyncio
+async def test_per_turn_salted_prefix_yields_disjoint_step_ids_across_turns():
+    """The fix: salting the prefix with the per-turn ``turn_id`` makes IDs disjoint.
+
+    Two consecutive resume calls in the same thread feed two different
+    ``turn_id``s into the prefix, so the resulting step IDs cannot collide
+    no matter how many times the FE rehydrates from earlier assistant
+    messages — which is the precondition for the React duplicate-key warning.
+    """
+    events = [
+        _tool_start(name="t1", run_id="run-A-1"),
+        _tool_start(name="t2", run_id="run-A-2"),
+    ]
+
+    ids_turn_one = await _drain_step_ids(
+        events, step_prefix="thinking-resume-104:1778698228472"
+    )
+    ids_turn_two = await _drain_step_ids(
+        events, step_prefix="thinking-resume-104:1778698244022"
+    )
+
+    assert ids_turn_one and ids_turn_two, "fixture broken: expected non-empty id sets"
+    assert ids_turn_one.isdisjoint(ids_turn_two), (
+        f"REGRESSION: per-turn-salted prefixes produced overlapping step IDs: "
+        f"{ids_turn_one & ids_turn_two!r}"
+    )
+
+
+def test_resume_step_prefix_helper_includes_turn_id_verbatim():
+    """Production call-site pin: ``stream_resume_chat`` builds the prefix via
+    this helper. Reverting it back to a hardcoded ``'thinking-resume'`` would
+    silently re-introduce the duplicate-key React warning across consecutive
+    resumes — this test fails first instead.
+    """
+    a = _resume_step_prefix("104:1778698228472")
+    b = _resume_step_prefix("104:1778698244022")
+
+    assert a.startswith("thinking-resume-"), (
+        f"prefix shape changed; the FE log filters and the timeline contract "
+        f"expect the ``thinking-resume-`` head to remain stable: got {a!r}"
+    )
+    assert "104:1778698228472" in a and "104:1778698244022" in b
+    assert a != b
diff --git a/surfsense_backend/tests/unit/tasks/chat/test_tool_input_streaming.py b/surfsense_backend/tests/unit/tasks/chat/test_tool_input_streaming.py
new file mode 100644
index 0000000000..ada32d168b
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/chat/test_tool_input_streaming.py
@@ -0,0 +1,495 @@
+"""Unit tests for live tool-call argument streaming.
+
+Pins the wire format that ``_stream_agent_events`` emits:
+``tool-input-start`` → ``tool-input-delta``... → ``tool-input-available`` →
+``tool-output-available``, keyed consistently with LangChain ``tool_call.id``
+when the model streams indexed chunks.
+
+Identity is tracked in ``index_to_meta`` (per-chunk ``index``) and
+``ui_tool_call_id_by_run`` (LangGraph ``run_id``); both are internal to the
+streaming layer so we assert on the public SSE payloads.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import AsyncGenerator
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from app.services.new_streaming_service import VercelStreamingService
+from app.tasks.chat.stream_new_chat import (
+    StreamResult,
+    _legacy_match_lc_id,
+    _stream_agent_events,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@dataclass
+class _FakeChunk:
+    """Minimal stand-in for ``AIMessageChunk``."""
+
+    content: Any = ""
+    additional_kwargs: dict[str, Any] = field(default_factory=dict)
+    tool_call_chunks: list[dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class _FakeToolMessage:
+    """Stand-in for ``ToolMessage`` returned by ``on_tool_end``."""
+
+    content: Any
+    tool_call_id: str | None = None
+
+
+@dataclass
+class _FakeInterrupt:
+    value: dict[str, Any]
+
+
+@dataclass
+class _FakeTask:
+    interrupts: tuple[_FakeInterrupt, ...] = ()
+
+
+class _FakeAgentState:
+    """Stand-in for ``StateSnapshot`` returned by ``aget_state``."""
+
+    def __init__(self, tasks: list[Any] | None = None) -> None:
+        # Empty values keeps the cloud-fallback safety-net branch a no-op,
+        # and empty ``tasks`` keep the post-stream interrupt check a no-op too.
+        self.values: dict[str, Any] = {}
+        self.tasks: list[Any] = tasks or []
+
+
+class _FakeAgent:
+    """Replays a list of ``astream_events`` events."""
+
+    def __init__(
+        self, events: list[dict[str, Any]], state: _FakeAgentState | None = None
+    ) -> None:
+        self._events = events
+        self._state = state or _FakeAgentState()
+
+    async def astream_events(  # type: ignore[no-untyped-def]
+        self, _input_data: Any, *, config: dict[str, Any], version: str
+    ) -> AsyncGenerator[dict[str, Any], None]:
+        del config, version  # unused, contract-compatible
+        for ev in self._events:
+            yield ev
+
+    async def aget_state(self, _config: dict[str, Any]) -> _FakeAgentState:
+        # Called once after astream_events drains so the cloud-fallback
+        # safety net can inspect staged filesystem work. The fake stays
+        # empty so the safety net is a no-op.
+        return self._state
+
+
+def _model_stream(
+    *,
+    text: str = "",
+    reasoning: str = "",
+    tool_call_chunks: list[dict[str, Any]] | None = None,
+    tags: list[str] | None = None,
+) -> dict[str, Any]:
+    return (
+        {
+            "event": "on_chat_model_stream",
+            "tags": tags or [],
+            "data": {
+                "chunk": _FakeChunk(
+                    content=text,
+                    tool_call_chunks=list(tool_call_chunks or []),
+                )
+            },
+            # reasoning piggybacks via additional_kwargs path; if needed,
+            # override content to a typed-block list. Most tests just check
+            # tool_call_chunks routing so this is fine.
+        }
+        if not reasoning
+        else {
+            "event": "on_chat_model_stream",
+            "tags": tags or [],
+            "data": {
+                "chunk": _FakeChunk(
+                    content=text,
+                    additional_kwargs={"reasoning_content": reasoning},
+                    tool_call_chunks=list(tool_call_chunks or []),
+                )
+            },
+        }
+    )
+
+
+def _tool_start(
+    *,
+    name: str,
+    run_id: str,
+    input_payload: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    return {
+        "event": "on_tool_start",
+        "name": name,
+        "run_id": run_id,
+        "data": {"input": input_payload or {}},
+    }
+
+
+def _tool_end(
+    *,
+    name: str,
+    run_id: str,
+    tool_call_id: str | None = None,
+    output: Any = "ok",
+) -> dict[str, Any]:
+    return {
+        "event": "on_tool_end",
+        "name": name,
+        "run_id": run_id,
+        "data": {
+            "output": _FakeToolMessage(
+                content=json.dumps(output) if not isinstance(output, str) else output,
+                tool_call_id=tool_call_id,
+            )
+        },
+    }
+
+
+async def _drain(
+    events: list[dict[str, Any]], state: _FakeAgentState | None = None
+) -> list[dict[str, Any]]:
+    """Run ``_stream_agent_events`` against a fake agent and return the
+    SSE payloads (parsed JSON) it yielded.
+    """
+    agent = _FakeAgent(events, state=state)
+    service = VercelStreamingService()
+    result = StreamResult()
+    config = {"configurable": {"thread_id": "test-thread"}}
+    sse_lines: list[str] = []
+    async for sse in _stream_agent_events(
+        agent, config, {}, service, result, step_prefix="thinking"
+    ):
+        sse_lines.append(sse)
+
+    parsed: list[dict[str, Any]] = []
+    for line in sse_lines:
+        if not line.startswith("data: "):
+            continue
+        body = line[len("data: ") :].rstrip("\n")
+        if not body or body == "[DONE]":
+            continue
+        try:
+            parsed.append(json.loads(body))
+        except json.JSONDecodeError:
+            continue
+    return parsed
+
+
+def _types(payloads: list[dict[str, Any]]) -> list[str]:
+    return [p.get("type", "?") for p in payloads]
+
+
+def _of_type(payloads: list[dict[str, Any]], type_name: str) -> list[dict[str, Any]]:
+    return [p for p in payloads if p.get("type") == type_name]
+
+
+# ---------------------------------------------------------------------------
+# Helper: ``_legacy_match_lc_id`` is a pure refactor; assert behaviour.
+# ---------------------------------------------------------------------------
+
+
+class TestLegacyMatch:
+    def test_pops_first_id_bearing_chunk_with_matching_name(self) -> None:
+        chunks: list[dict[str, Any]] = [
+            {"id": "x1", "name": "ls"},
+            {"id": "y1", "name": "write_file"},
+        ]
+        runs: dict[str, str] = {}
+        result = _legacy_match_lc_id(chunks, "write_file", "run-1", runs)
+        assert result == "y1"
+        assert chunks == [{"id": "x1", "name": "ls"}]
+        assert runs == {"run-1": "y1"}
+
+    def test_falls_back_to_any_id_bearing_when_name_mismatches(self) -> None:
+        chunks: list[dict[str, Any]] = [{"id": "anon", "name": None}]
+        runs: dict[str, str] = {}
+        out = _legacy_match_lc_id(chunks, "ls", "run-2", runs)
+        assert out == "anon"
+        assert chunks == []
+
+    def test_returns_none_when_no_id_bearing_chunk(self) -> None:
+        chunks: list[dict[str, Any]] = [{"id": None, "name": None}]
+        runs: dict[str, str] = {}
+        assert _legacy_match_lc_id(chunks, "ls", "run-3", runs) is None
+        assert chunks == [{"id": None, "name": None}]
+        assert runs == {}
+
+
+# ---------------------------------------------------------------------------
+# Tool input streaming wire format
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_idless_chunk_merging_by_index() -> None:
+    """First chunk carries id+name; later idless chunks at the same
+    ``index`` merge into the SAME ``tool-input-start`` ui id and emit
+    one ``tool-input-delta`` per chunk."""
+    events = [
+        _model_stream(
+            tool_call_chunks=[
+                {"id": "lc-1", "name": "write_file", "args": '{"file', "index": 0}
+            ],
+        ),
+        _model_stream(
+            tool_call_chunks=[
+                {"id": None, "name": None, "args": '_path":"/x"}', "index": 0}
+            ],
+        ),
+        _tool_start(
+            name="write_file", run_id="run-A", input_payload={"file_path": "/x"}
+        ),
+        _tool_end(name="write_file", run_id="run-A", tool_call_id="lc-1"),
+    ]
+
+    payloads = await _drain(events)
+
+    starts = _of_type(payloads, "tool-input-start")
+    deltas = _of_type(payloads, "tool-input-delta")
+    available = _of_type(payloads, "tool-input-available")
+    output = _of_type(payloads, "tool-output-available")
+
+    assert len(starts) == 1
+    assert starts[0]["toolCallId"] == "lc-1"
+    assert starts[0]["toolName"] == "write_file"
+    assert starts[0]["langchainToolCallId"] == "lc-1"
+
+    assert [d["inputTextDelta"] for d in deltas] == ['{"file', '_path":"/x"}']
+    assert all(d["toolCallId"] == "lc-1" for d in deltas)
+
+    assert len(available) == 1
+    assert available[0]["toolCallId"] == "lc-1"
+
+    assert len(output) == 1
+    assert output[0]["toolCallId"] == "lc-1"
+
+
+@pytest.mark.asyncio
+async def test_two_interleaved_tool_calls_route_by_index() -> None:
+    """Two same-name calls with distinct indices keep their deltas
+    routed to the right card."""
+    events = [
+        _model_stream(
+            tool_call_chunks=[
+                {"id": "lc-A", "name": "write_file", "args": '{"a":1', "index": 0},
+                {"id": "lc-B", "name": "write_file", "args": '{"b":2', "index": 1},
+            ]
+        ),
+        _model_stream(
+            tool_call_chunks=[
+                {"id": None, "name": None, "args": "}", "index": 0},
+                {"id": None, "name": None, "args": "}", "index": 1},
+            ]
+        ),
+        _tool_start(name="write_file", run_id="run-A", input_payload={"a": 1}),
+        _tool_end(name="write_file", run_id="run-A", tool_call_id="lc-A"),
+        _tool_start(name="write_file", run_id="run-B", input_payload={"b": 2}),
+        _tool_end(name="write_file", run_id="run-B", tool_call_id="lc-B"),
+    ]
+
+    payloads = await _drain(events)
+
+    starts = _of_type(payloads, "tool-input-start")
+    deltas = _of_type(payloads, "tool-input-delta")
+    output = _of_type(payloads, "tool-output-available")
+
+    assert {s["toolCallId"] for s in starts} == {"lc-A", "lc-B"}
+
+    by_id: dict[str, list[str]] = {"lc-A": [], "lc-B": []}
+    for d in deltas:
+        by_id[d["toolCallId"]].append(d["inputTextDelta"])
+    assert by_id["lc-A"] == ['{"a":1', "}"]
+    assert by_id["lc-B"] == ['{"b":2', "}"]
+
+    assert {o["toolCallId"] for o in output} == {"lc-A", "lc-B"}
+
+
+@pytest.mark.asyncio
+async def test_identity_stable_across_lifecycle() -> None:
+    """Whatever id ``tool-input-start`` chose must be the SAME id used
+    on ``tool-input-available`` AND ``tool-output-available``."""
+    events = [
+        _model_stream(
+            tool_call_chunks=[
+                {"id": "lc-9", "name": "ls", "args": '{"path":"/"}', "index": 0}
+            ]
+        ),
+        _tool_start(name="ls", run_id="run-X", input_payload={"path": "/"}),
+        _tool_end(name="ls", run_id="run-X", tool_call_id="lc-9"),
+    ]
+    payloads = await _drain(events)
+    relevant = [
+        p
+        for p in payloads
+        if p.get("type")
+        in {"tool-input-start", "tool-input-available", "tool-output-available"}
+    ]
+    assert {p["toolCallId"] for p in relevant} == {"lc-9"}
+
+
+@pytest.mark.asyncio
+async def test_no_duplicate_tool_input_start() -> None:
+    """When the chunk-emission loop already fired ``tool-input-start``
+    for this run, ``on_tool_start`` MUST NOT emit a second one."""
+    events = [
+        _model_stream(
+            tool_call_chunks=[
+                {"id": "lc-1", "name": "write_file", "args": "{}", "index": 0}
+            ]
+        ),
+        _tool_start(name="write_file", run_id="run-A", input_payload={}),
+        _tool_end(name="write_file", run_id="run-A", tool_call_id="lc-1"),
+    ]
+    payloads = await _drain(events)
+    starts = _of_type(payloads, "tool-input-start")
+    assert len(starts) == 1
+    assert starts[0]["toolCallId"] == "lc-1"
+
+
+@pytest.mark.asyncio
+async def test_active_text_closes_before_early_tool_input_start() -> None:
+    """Streaming a text-delta then a tool-call chunk in subsequent
+    chunks: the wire MUST contain ``text-end`` before the FIRST
+    ``tool-input-start`` (clean part boundary on the frontend)."""
+    events = [
+        _model_stream(text="Working on it"),
+        _model_stream(
+            tool_call_chunks=[
+                {"id": "lc-1", "name": "write_file", "args": "{}", "index": 0}
+            ]
+        ),
+        _tool_start(name="write_file", run_id="run-A", input_payload={}),
+        _tool_end(name="write_file", run_id="run-A", tool_call_id="lc-1"),
+    ]
+    types = _types(await _drain(events))
+    text_end_idx = types.index("text-end")
+    start_idx = types.index("tool-input-start")
+    assert text_end_idx < start_idx
+
+
+@pytest.mark.asyncio
+async def test_mixed_text_and_tool_chunk_preserve_order() -> None:
+    """One AIMessageChunk that carries BOTH ``text`` content AND
+    ``tool_call_chunks`` should emit the text delta FIRST, then close
+    text, then ``tool-input-start``+``tool-input-delta``."""
+    events = [
+        _model_stream(
+            text="I'll update it",
+            tool_call_chunks=[
+                {
+                    "id": "lc-1",
+                    "name": "write_file",
+                    "args": '{"file_path":"/x"}',
+                    "index": 0,
+                }
+            ],
+        ),
+        _tool_start(
+            name="write_file", run_id="run-A", input_payload={"file_path": "/x"}
+        ),
+        _tool_end(name="write_file", run_id="run-A", tool_call_id="lc-1"),
+    ]
+    types = _types(await _drain(events))
+    # text-start … text-delta … text-end … tool-input-start … tool-input-delta
+    assert types.index("text-start") < types.index("text-delta")
+    assert types.index("text-delta") < types.index("text-end")
+    assert types.index("text-end") < types.index("tool-input-start")
+    assert types.index("tool-input-start") < types.index("tool-input-delta")
+
+
+@pytest.mark.asyncio
+async def test_skip_append_prevents_stale_id_reuse() -> None:
+    """Two same-name tools: the SECOND tool's ``langchainToolCallId``
+    must NOT come from the first tool's chunk (``pending_tool_call_chunks``
+    must stay empty for indexed-registered chunks)."""
+    events = [
+        _model_stream(
+            tool_call_chunks=[
+                {"id": "lc-A", "name": "write_file", "args": "{}", "index": 0},
+                {"id": "lc-B", "name": "write_file", "args": "{}", "index": 1},
+            ]
+        ),
+        _tool_start(name="write_file", run_id="run-1", input_payload={}),
+        _tool_end(name="write_file", run_id="run-1", tool_call_id="lc-A"),
+        _tool_start(name="write_file", run_id="run-2", input_payload={}),
+        _tool_end(name="write_file", run_id="run-2", tool_call_id="lc-B"),
+    ]
+    payloads = await _drain(events)
+
+    starts = _of_type(payloads, "tool-input-start")
+    # Two distinct lc ids, each its own card.
+    assert {s["toolCallId"] for s in starts} == {"lc-A", "lc-B"}
+    # Each tool-output-available landed on its respective card.
+    output = _of_type(payloads, "tool-output-available")
+    assert {o["toolCallId"] for o in output} == {"lc-A", "lc-B"}
+
+
+@pytest.mark.asyncio
+async def test_registration_waits_for_both_id_and_name() -> None:
+    """An id-only chunk (no name yet) must NOT emit ``tool-input-start``."""
+    events = [
+        _model_stream(
+            tool_call_chunks=[{"id": "lc-1", "name": None, "args": "", "index": 0}]
+        ),
+    ]
+    payloads = await _drain(events)
+    assert _of_type(payloads, "tool-input-start") == []
+
+
+@pytest.mark.asyncio
+async def test_unmatched_fallback_still_attaches_lc_id() -> None:
+    """When the provider omits chunk ``index``, buffered chunks still get a
+    ``tool-input-start`` with the matching ``langchainToolCallId``."""
+    events = [
+        # No index on the chunk → not registered into index_to_meta;
+        # falls through to ``pending_tool_call_chunks`` so the legacy
+        # match path can pop it at on_tool_start.
+        _model_stream(tool_call_chunks=[{"id": "lc-orphan", "name": "ls", "args": ""}]),
+        _tool_start(name="ls", run_id="run-1", input_payload={"path": "/"}),
+        _tool_end(name="ls", run_id="run-1", tool_call_id="lc-orphan"),
+    ]
+    payloads = await _drain(events)
+    starts = _of_type(payloads, "tool-input-start")
+    assert len(starts) == 1
+    assert starts[0]["toolCallId"].startswith("call_run-1")
+    assert starts[0]["langchainToolCallId"] == "lc-orphan"
+
+
+@pytest.mark.asyncio
+async def test_interrupt_request_uses_task_that_contains_interrupt() -> None:
+    interrupt_payload = {
+        "type": "calendar_event_create",
+        "action": {
+            "tool": "create_calendar_event",
+            "params": {"summary": "mom bday"},
+        },
+        "context": {},
+    }
+    state = _FakeAgentState(
+        tasks=[
+            _FakeTask(interrupts=()),
+            _FakeTask(interrupts=(_FakeInterrupt(value=interrupt_payload),)),
+        ]
+    )
+
+    payloads = await _drain([], state=state)
+
+    interrupts = _of_type(payloads, "data-interrupt-request")
+    assert len(interrupts) == 1
+    assert (
+        interrupts[0]["data"]["action_requests"][0]["name"] == "create_calendar_event"
+    )
diff --git a/surfsense_backend/tests/unit/tasks/test_celery_async_runner.py b/surfsense_backend/tests/unit/tasks/test_celery_async_runner.py
new file mode 100644
index 0000000000..a5bb3f58a8
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/test_celery_async_runner.py
@@ -0,0 +1,318 @@
+"""Regression tests for ``run_async_celery_task``.
+
+These tests pin down the production bug observed on 2026-05-02 where
+the video-presentation Celery task hung at ``[billable_call] finalize``
+because the shared ``app.db.engine`` had pooled asyncpg connections
+bound to a *previous* task's now-closed event loop. Reusing such a
+connection on a fresh loop crashes inside ``pool_pre_ping`` with::
+
+    AttributeError: 'NoneType' object has no attribute 'send'
+
+(the proactor is None because the loop is gone) and can hang forever
+inside the asyncpg ``Connection._cancel`` cleanup coroutine.
+
+The fix is ``run_async_celery_task``: a small helper that runs every
+async celery task body inside a fresh event loop and disposes the
+shared engine pool both before (defends against a previous task that
+crashed) and after (releases connections we opened on this loop).
+
+Tests here exercise the helper with a stub engine that records
+``dispose()`` calls and panics if a coroutine produced by one loop is
+awaited on another — mirroring the real asyncpg behaviour.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import gc
+import sys
+from collections.abc import Iterator
+from contextlib import contextmanager
+from unittest.mock import patch
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Stub engine that emulates the asyncpg-on-stale-loop crash
+# ---------------------------------------------------------------------------
+
+
+class _StaleLoopEngine:
+    """Tiny stand-in for ``app.db.engine`` that tracks dispose() calls.
+
+    ``dispose()`` is async (matches ``AsyncEngine.dispose``) and records
+    the running event loop id so tests can assert it ran on *each*
+    fresh loop.
+    """
+
+    def __init__(self) -> None:
+        self.dispose_loop_ids: list[int] = []
+
+    async def dispose(self) -> None:
+        loop = asyncio.get_running_loop()
+        self.dispose_loop_ids.append(id(loop))
+
+
+@contextmanager
+def _patch_shared_engine(stub: _StaleLoopEngine) -> Iterator[None]:
+    """Patch ``from app.db import engine as shared_engine`` lookup.
+
+    The helper imports lazily inside the function body, so we have to
+    patch the attribute on the already-loaded ``app.db`` module.
+    """
+    import app.db as app_db
+
+    original = getattr(app_db, "engine", None)
+    app_db.engine = stub  # type: ignore[attr-defined]
+    try:
+        yield
+    finally:
+        if original is None:
+            with pytest.raises(AttributeError):
+                _ = app_db.engine
+        else:
+            app_db.engine = original  # type: ignore[attr-defined]
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_runner_returns_value_and_disposes_engine_around_call() -> None:
+    """Happy path: the coroutine result is returned, and the shared
+    engine is disposed both before and after the task body runs.
+    """
+    from app.tasks.celery_tasks import run_async_celery_task
+
+    stub = _StaleLoopEngine()
+
+    async def _body() -> str:
+        # Engine should already have been disposed once before we run.
+        assert len(stub.dispose_loop_ids) == 1
+        return "ok"
+
+    with _patch_shared_engine(stub):
+        result = run_async_celery_task(_body)
+
+    assert result == "ok"
+    # Once before the body, once after (in finally).
+    assert len(stub.dispose_loop_ids) == 2
+    # Both disposes ran on the SAME (fresh) loop the task body used.
+    assert stub.dispose_loop_ids[0] == stub.dispose_loop_ids[1]
+
+
+def test_runner_creates_fresh_loop_per_invocation() -> None:
+    """Each call must spin its own loop. Without this guarantee a
+    previous task's loop would be reused and the asyncpg-stale-loop
+    crash would never be avoided.
+    """
+    import app.tasks.celery_tasks as celery_tasks_pkg
+
+    stub = _StaleLoopEngine()
+    new_loop_calls = 0
+    closed_loops: list[bool] = []
+
+    real_new_event_loop = asyncio.new_event_loop
+
+    def _counting_new_loop() -> asyncio.AbstractEventLoop:
+        nonlocal new_loop_calls
+        new_loop_calls += 1
+        loop = real_new_event_loop()
+        # Hook close() so we can verify each loop was closed properly
+        # before the next one was created.
+        original_close = loop.close
+
+        def _tracked_close() -> None:
+            closed_loops.append(True)
+            original_close()
+
+        loop.close = _tracked_close  # type: ignore[method-assign]
+        return loop
+
+    async def _body() -> None:
+        # Loop is alive and current at body execution time.
+        running = asyncio.get_running_loop()
+        assert not running.is_closed()
+
+    with (
+        _patch_shared_engine(stub),
+        patch.object(asyncio, "new_event_loop", _counting_new_loop),
+    ):
+        for _ in range(3):
+            celery_tasks_pkg.run_async_celery_task(_body)
+
+    assert new_loop_calls == 3
+    assert closed_loops == [True, True, True]
+    # Each invocation disposed twice (before + after).
+    assert len(stub.dispose_loop_ids) == 6
+
+
+def test_runner_disposes_engine_even_when_body_raises() -> None:
+    """Cleanup MUST run on the failure path too — otherwise stale
+    connections leak into the next task and cause the original hang.
+    """
+    from app.tasks.celery_tasks import run_async_celery_task
+
+    stub = _StaleLoopEngine()
+
+    class _BoomError(RuntimeError):
+        pass
+
+    async def _body() -> None:
+        raise _BoomError("kaboom")
+
+    with _patch_shared_engine(stub), pytest.raises(_BoomError):
+        run_async_celery_task(_body)
+
+    assert len(stub.dispose_loop_ids) == 2  # before + after still ran
+
+
+def test_runner_swallows_dispose_errors() -> None:
+    """A flaky engine.dispose() must NEVER take down a celery task.
+
+    Production scenario: the very first dispose (before the body runs)
+    might hit a partially-initialised engine; the helper logs and
+    moves on. The task body still runs; the result is still returned.
+    """
+    from app.tasks.celery_tasks import run_async_celery_task
+
+    class _AngryEngine:
+        def __init__(self) -> None:
+            self.calls = 0
+
+        async def dispose(self) -> None:
+            self.calls += 1
+            raise RuntimeError("dispose() blew up")
+
+    stub = _AngryEngine()
+
+    async def _body() -> int:
+        return 42
+
+    with _patch_shared_engine(stub):
+        assert run_async_celery_task(_body) == 42
+
+    assert stub.calls == 2  # before + after both attempted
+
+
+def test_runner_propagates_value_from_async_body() -> None:
+    """Sanity: pass-through of any pickleable celery return value."""
+    from app.tasks.celery_tasks import run_async_celery_task
+
+    stub = _StaleLoopEngine()
+
+    async def _body() -> dict[str, object]:
+        return {"status": "ready", "video_presentation_id": 19}
+
+    with _patch_shared_engine(stub):
+        out = run_async_celery_task(_body)
+
+    assert out == {"status": "ready", "video_presentation_id": 19}
+
+
+def test_video_presentation_task_uses_runner_helper() -> None:
+    """Defence-in-depth: confirm the celery task module imports
+    ``run_async_celery_task``. If a future refactor inlines a
+    ``loop = asyncio.new_event_loop(); ... loop.close()`` block again,
+    the original hang will return.
+    """
+    # The module's task body should not contain a manual new_event_loop
+    # call — that's exactly what the helper exists to centralise.
+    import inspect
+
+    from app.tasks.celery_tasks import video_presentation_tasks
+
+    src = inspect.getsource(video_presentation_tasks)
+    assert "run_async_celery_task" in src, (
+        "video_presentation_tasks.py must use run_async_celery_task; "
+        "manual asyncio.new_event_loop() in a celery task hangs on the "
+        "shared SQLAlchemy pool when reused across tasks."
+    )
+    assert "asyncio.new_event_loop" not in src, (
+        "video_presentation_tasks.py contains a raw asyncio.new_event_loop "
+        "call — route every async task through run_async_celery_task to "
+        "avoid the stale-pool hang."
+    )
+
+
+def test_podcast_task_uses_runner_helper() -> None:
+    """Symmetric assertion for the podcast task — same root cause, same
+    fix, same regression risk.
+    """
+    import inspect
+
+    from app.tasks.celery_tasks import podcast_tasks
+
+    src = inspect.getsource(podcast_tasks)
+    assert "run_async_celery_task" in src
+    assert "asyncio.new_event_loop" not in src
+
+
+def test_runner_runs_shutdown_asyncgens_before_close() -> None:
+    """If the task body created any async generators that didn't get
+    fully iterated, we must still call ``loop.shutdown_asyncgens()``
+    before closing — otherwise we leak event-loop bound resources
+    that re-emerge as ``RuntimeError: Event loop is closed`` later.
+    """
+    from app.tasks.celery_tasks import run_async_celery_task
+
+    stub = _StaleLoopEngine()
+
+    async def _agen():
+        try:
+            yield 1
+            yield 2
+        finally:
+            pass
+
+    async def _body() -> None:
+        # Iterate the agen partially, then leave it dangling — exactly
+        # the situation shutdown_asyncgens() is designed to clean up.
+        async for v in _agen():
+            if v == 1:
+                break
+
+    with _patch_shared_engine(stub):
+        run_async_celery_task(_body)
+
+    # By the time the helper returns, garbage collection + shutdown_asyncgens
+    # should have ensured no live async-gen references remain. We don't
+    # assert agen.closed directly (it depends on GC ordering); the real
+    # contract is "no warnings, no event-loop-closed errors". A successful
+    # second invocation proves the loop was cleaned up properly.
+    with _patch_shared_engine(stub):
+        run_async_celery_task(_body)
+
+    # Force a GC pass to surface any 'coroutine was never awaited'
+    # warnings that would indicate the cleanup is broken.
+    gc.collect()
+
+
+def test_runner_uses_proactor_loop_on_windows() -> None:
+    """On Windows the celery worker preselects a Proactor policy so
+    subprocess (ffmpeg) calls work. The helper must not silently fall
+    back to a Selector loop and re-break video/podcast generation.
+    """
+    if not sys.platform.startswith("win"):
+        pytest.skip("Windows-specific event-loop policy assertion")
+
+    from app.tasks.celery_tasks import run_async_celery_task
+
+    stub = _StaleLoopEngine()
+
+    # Mirror the policy set at the top of every Windows celery task.
+    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
+    observed: list[str] = []
+
+    async def _body() -> None:
+        observed.append(type(asyncio.get_running_loop()).__name__)
+
+    with _patch_shared_engine(stub):
+        run_async_celery_task(_body)
+
+    assert observed == ["ProactorEventLoop"]
diff --git a/surfsense_backend/tests/unit/tasks/test_podcast_billing.py b/surfsense_backend/tests/unit/tasks/test_podcast_billing.py
new file mode 100644
index 0000000000..699297df1b
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/test_podcast_billing.py
@@ -0,0 +1,388 @@
+"""Unit tests for podcast Celery task billing integration.
+
+Validates ``_generate_content_podcast`` correctly wraps
+``podcaster_graph.ainvoke`` in a ``billable_call`` envelope, propagates the
+search-space owner's billing decision, and degrades cleanly when the
+resolver fails or premium credit is exhausted.
+
+Coverage:
+
+* Happy-path free config: resolver → ``billable_call`` enters with
+  ``usage_type='podcast_generation'`` and the configured reserve override,
+  graph runs, podcast row flips to ``READY``.
+* Happy-path premium config: same wiring with ``billing_tier='premium'``.
+* Quota denial: ``billable_call`` raises ``QuotaInsufficientError`` →
+  graph is *not* invoked, podcast row flips to ``FAILED``, return dict
+  carries ``reason='premium_quota_exhausted'``.
+* Resolver failure: ``ValueError`` from the resolver → podcast row flips
+  to ``FAILED``, return dict carries ``reason='billing_resolution_failed'``.
+"""
+
+from __future__ import annotations
+
+import contextlib
+from types import SimpleNamespace
+from typing import Any
+from uuid import uuid4
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeExecResult:
+    def __init__(self, obj):
+        self._obj = obj
+
+    def scalars(self):
+        return self
+
+    def first(self):
+        return self._obj
+
+    def filter(self, *_args, **_kwargs):
+        return self
+
+
+class _FakeSession:
+    def __init__(self, podcast):
+        self._podcast = podcast
+        self.commit_count = 0
+
+    async def execute(self, _stmt):
+        return _FakeExecResult(self._podcast)
+
+    async def commit(self):
+        self.commit_count += 1
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, *args):
+        return None
+
+
+class _FakeSessionMaker:
+    def __init__(self, session: _FakeSession):
+        self._session = session
+
+    def __call__(self):
+        return self._session
+
+
+def _make_podcast(podcast_id: int = 7, thread_id: int = 99) -> SimpleNamespace:
+    """Stand-in for a ``Podcast`` row. Importing ``PodcastStatus`` lazily
+    inside helpers keeps this fixture cheap."""
+    return SimpleNamespace(
+        id=podcast_id,
+        title="Test Podcast",
+        thread_id=thread_id,
+        status=None,
+        podcast_transcript=None,
+        file_location=None,
+    )
+
+
+@contextlib.asynccontextmanager
+async def _ok_billable_call(**kwargs):
+    """Stand-in for ``billable_call`` that records its kwargs and yields a
+    no-op accumulator-shaped object."""
+    _CALL_LOG.append(kwargs)
+    yield SimpleNamespace()
+
+
+_CALL_LOG: list[dict[str, Any]] = []
+
+
+@contextlib.asynccontextmanager
+async def _denying_billable_call(**kwargs):
+    from app.services.billable_calls import QuotaInsufficientError
+
+    _CALL_LOG.append(kwargs)
+    raise QuotaInsufficientError(
+        usage_type=kwargs.get("usage_type", "?"),
+        used_micros=5_000_000,
+        limit_micros=5_000_000,
+        remaining_micros=0,
+    )
+    yield SimpleNamespace()  # pragma: no cover — for grammar only
+
+
+@contextlib.asynccontextmanager
+async def _settlement_failing_billable_call(**kwargs):
+    from app.services.billable_calls import BillingSettlementError
+
+    _CALL_LOG.append(kwargs)
+    yield SimpleNamespace()
+    raise BillingSettlementError(
+        usage_type=kwargs.get("usage_type", "?"),
+        user_id=kwargs["user_id"],
+        cause=RuntimeError("finalize failed"),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=True)
+def _reset_call_log():
+    _CALL_LOG.clear()
+    yield
+    _CALL_LOG.clear()
+
+
+@pytest.mark.asyncio
+async def test_billable_call_invoked_with_correct_kwargs_for_free_config(monkeypatch):
+    """Happy path: free billing tier still wraps the graph call so the
+    audit row is recorded. Verifies kwargs threading."""
+    from app.config import config as app_config
+    from app.db import PodcastStatus
+    from app.tasks.celery_tasks import podcast_tasks
+
+    podcast = _make_podcast(podcast_id=7, thread_id=99)
+    session = _FakeSession(podcast)
+    monkeypatch.setattr(
+        podcast_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    user_id = uuid4()
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        assert search_space_id == 555
+        assert thread_id == 99
+        return user_id, "free", "openrouter/some-free-model"
+
+    monkeypatch.setattr(
+        podcast_tasks, "_resolve_agent_billing_for_search_space", _fake_resolver
+    )
+    monkeypatch.setattr(podcast_tasks, "billable_call", _ok_billable_call)
+
+    async def _fake_graph_invoke(state, config):
+        return {
+            "podcast_transcript": [
+                SimpleNamespace(speaker_id=0, dialog="Hi"),
+                SimpleNamespace(speaker_id=1, dialog="Hello"),
+            ],
+            "final_podcast_file_path": "/tmp/podcast.wav",
+        }
+
+    monkeypatch.setattr(podcast_tasks.podcaster_graph, "ainvoke", _fake_graph_invoke)
+
+    result = await podcast_tasks._generate_content_podcast(
+        podcast_id=7,
+        source_content="hello world",
+        search_space_id=555,
+        user_prompt="make it short",
+    )
+
+    assert result["status"] == "ready"
+    assert result["podcast_id"] == 7
+    assert podcast.status == PodcastStatus.READY
+    assert podcast.file_location == "/tmp/podcast.wav"
+
+    assert len(_CALL_LOG) == 1
+    call = _CALL_LOG[0]
+    assert call["user_id"] == user_id
+    assert call["search_space_id"] == 555
+    assert call["billing_tier"] == "free"
+    assert call["base_model"] == "openrouter/some-free-model"
+    assert call["usage_type"] == "podcast_generation"
+    assert (
+        call["quota_reserve_micros_override"]
+        == app_config.QUOTA_DEFAULT_PODCAST_RESERVE_MICROS
+    )
+    # Background artifact audit rows intentionally omit the TokenUsage.thread_id
+    # FK to avoid coupling Celery audit commits to an active chat transaction.
+    assert "thread_id" not in call
+    assert call["call_details"] == {
+        "podcast_id": 7,
+        "title": "Test Podcast",
+        "thread_id": 99,
+    }
+    assert callable(call["billable_session_factory"])
+
+
+@pytest.mark.asyncio
+async def test_billable_call_invoked_with_premium_tier(monkeypatch):
+    """Premium resolution flows through to ``billable_call`` so the
+    reserve/finalize path triggers."""
+    from app.tasks.celery_tasks import podcast_tasks
+
+    podcast = _make_podcast()
+    session = _FakeSession(podcast)
+    monkeypatch.setattr(
+        podcast_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    user_id = uuid4()
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        return user_id, "premium", "gpt-5.4"
+
+    monkeypatch.setattr(
+        podcast_tasks, "_resolve_agent_billing_for_search_space", _fake_resolver
+    )
+    monkeypatch.setattr(podcast_tasks, "billable_call", _ok_billable_call)
+
+    async def _fake_graph_invoke(state, config):
+        return {"podcast_transcript": [], "final_podcast_file_path": "x.wav"}
+
+    monkeypatch.setattr(podcast_tasks.podcaster_graph, "ainvoke", _fake_graph_invoke)
+
+    await podcast_tasks._generate_content_podcast(
+        podcast_id=7,
+        source_content="hi",
+        search_space_id=555,
+        user_prompt=None,
+    )
+
+    assert _CALL_LOG[0]["billing_tier"] == "premium"
+    assert _CALL_LOG[0]["base_model"] == "gpt-5.4"
+
+
+@pytest.mark.asyncio
+async def test_quota_insufficient_marks_podcast_failed_and_skips_graph(monkeypatch):
+    """When ``billable_call`` denies the reservation, the graph never
+    runs and the podcast row flips to FAILED with the documented reason
+    code."""
+    from app.db import PodcastStatus
+    from app.tasks.celery_tasks import podcast_tasks
+
+    podcast = _make_podcast(podcast_id=8)
+    session = _FakeSession(podcast)
+    monkeypatch.setattr(
+        podcast_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        return uuid4(), "premium", "gpt-5.4"
+
+    monkeypatch.setattr(
+        podcast_tasks, "_resolve_agent_billing_for_search_space", _fake_resolver
+    )
+    monkeypatch.setattr(podcast_tasks, "billable_call", _denying_billable_call)
+
+    graph_invoked = []
+
+    async def _fake_graph_invoke(state, config):
+        graph_invoked.append(True)
+        return {}
+
+    monkeypatch.setattr(podcast_tasks.podcaster_graph, "ainvoke", _fake_graph_invoke)
+
+    result = await podcast_tasks._generate_content_podcast(
+        podcast_id=8,
+        source_content="hi",
+        search_space_id=555,
+        user_prompt=None,
+    )
+
+    assert result == {
+        "status": "failed",
+        "podcast_id": 8,
+        "reason": "premium_quota_exhausted",
+    }
+    assert podcast.status == PodcastStatus.FAILED
+    assert graph_invoked == []  # Graph never ran on denied reservation.
+
+
+@pytest.mark.asyncio
+async def test_billing_settlement_failure_marks_podcast_failed(monkeypatch):
+    from app.db import PodcastStatus
+    from app.tasks.celery_tasks import podcast_tasks
+
+    podcast = _make_podcast(podcast_id=10)
+    session = _FakeSession(podcast)
+    monkeypatch.setattr(
+        podcast_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        return uuid4(), "premium", "gpt-5.4"
+
+    monkeypatch.setattr(
+        podcast_tasks, "_resolve_agent_billing_for_search_space", _fake_resolver
+    )
+    monkeypatch.setattr(
+        podcast_tasks, "billable_call", _settlement_failing_billable_call
+    )
+
+    async def _fake_graph_invoke(state, config):
+        return {"podcast_transcript": [], "final_podcast_file_path": "x.wav"}
+
+    monkeypatch.setattr(podcast_tasks.podcaster_graph, "ainvoke", _fake_graph_invoke)
+
+    result = await podcast_tasks._generate_content_podcast(
+        podcast_id=10,
+        source_content="hi",
+        search_space_id=555,
+        user_prompt=None,
+    )
+
+    assert result == {
+        "status": "failed",
+        "podcast_id": 10,
+        "reason": "billing_settlement_failed",
+    }
+    assert podcast.status == PodcastStatus.FAILED
+
+
+@pytest.mark.asyncio
+async def test_resolver_failure_marks_podcast_failed(monkeypatch):
+    """If the resolver raises (e.g. search-space deleted), the task fails
+    cleanly without invoking the graph."""
+    from app.db import PodcastStatus
+    from app.tasks.celery_tasks import podcast_tasks
+
+    podcast = _make_podcast(podcast_id=9)
+    session = _FakeSession(podcast)
+    monkeypatch.setattr(
+        podcast_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    async def _failing_resolver(sess, search_space_id, *, thread_id=None):
+        raise ValueError("Search space 555 not found")
+
+    monkeypatch.setattr(
+        podcast_tasks, "_resolve_agent_billing_for_search_space", _failing_resolver
+    )
+
+    graph_invoked = []
+
+    async def _fake_graph_invoke(state, config):
+        graph_invoked.append(True)
+        return {}
+
+    monkeypatch.setattr(podcast_tasks.podcaster_graph, "ainvoke", _fake_graph_invoke)
+
+    result = await podcast_tasks._generate_content_podcast(
+        podcast_id=9,
+        source_content="hi",
+        search_space_id=555,
+        user_prompt=None,
+    )
+
+    assert result == {
+        "status": "failed",
+        "podcast_id": 9,
+        "reason": "billing_resolution_failed",
+    }
+    assert podcast.status == PodcastStatus.FAILED
+    assert graph_invoked == []
diff --git a/surfsense_backend/tests/unit/tasks/test_stream_new_chat_image_safety_net.py b/surfsense_backend/tests/unit/tasks/test_stream_new_chat_image_safety_net.py
new file mode 100644
index 0000000000..792d059b08
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/test_stream_new_chat_image_safety_net.py
@@ -0,0 +1,119 @@
+"""Predicate-level test for the chat streaming safety net.
+
+The safety net in ``stream_new_chat`` rejects an image turn early with
+a friendly ``MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT`` SSE error when the
+selected model is *known* to be text-only. The earlier round of this
+work used a strict opt-in flag (``supports_image_input`` defaulting to
+False on every YAML entry) which blocked vision-capable Azure GPT-5.x
+deployments — this is the regression we're fixing.
+
+The new predicate is :func:`is_known_text_only_chat_model`, which
+returns True only when LiteLLM's authoritative model map *explicitly*
+sets ``supports_vision=False``. Anything else (vision True, missing
+key, exception) returns False so the request flows through to the
+provider.
+
+We exercise the predicate directly here rather than driving the full
+``stream_new_chat`` generator — covering the gate in isolation keeps
+the test focused on the regression while the generator's wider behavior
+is exercised by the integration suite.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.services.provider_capabilities import is_known_text_only_chat_model
+
+pytestmark = pytest.mark.unit
+
+
+def test_safety_net_does_not_fire_for_azure_gpt_4o():
+    """Regression: ``azure/gpt-4o`` (and the GPT-5.x variants) is
+    vision-capable per LiteLLM's model map. The previous round's
+    blanket-False default blocked it; the new predicate must NOT mark
+    it text-only."""
+    assert (
+        is_known_text_only_chat_model(
+            provider="AZURE_OPENAI",
+            model_name="my-azure-deployment",
+            base_model="gpt-4o",
+        )
+        is False
+    )
+
+
+def test_safety_net_does_not_fire_for_unknown_model():
+    """Default-pass on unknown — the safety net only blocks definitive
+    text-only confirmations. A freshly added third-party model that
+    LiteLLM doesn't know about must flow through to the provider."""
+    assert (
+        is_known_text_only_chat_model(
+            provider="CUSTOM",
+            custom_provider="brand_new_proxy",
+            model_name="brand-new-model-x9",
+        )
+        is False
+    )
+
+
+def test_safety_net_does_not_fire_when_lookup_raises(monkeypatch):
+    """Transient ``litellm.get_model_info`` exception ≠ block. The
+    helper swallows the error and treats it as 'unknown' → False."""
+    import app.services.provider_capabilities as pc
+
+    def _raise(**_kwargs):
+        raise RuntimeError("intentional test failure")
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _raise)
+
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="gpt-4o",
+        )
+        is False
+    )
+
+
+def test_safety_net_fires_only_on_explicit_false(monkeypatch):
+    """Stub LiteLLM to assert the only path that returns True is the
+    explicit ``supports_vision=False`` case. Anything else (True,
+    None, missing key) returns False from the predicate."""
+    import app.services.provider_capabilities as pc
+
+    def _info_explicit_false(**_kwargs):
+        return {"supports_vision": False, "max_input_tokens": 8192}
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _info_explicit_false)
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="text-only-stub",
+        )
+        is True
+    )
+
+    def _info_true(**_kwargs):
+        return {"supports_vision": True}
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _info_true)
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="vision-stub",
+        )
+        is False
+    )
+
+    def _info_missing(**_kwargs):
+        return {"max_input_tokens": 8192}
+
+    monkeypatch.setattr(pc.litellm, "get_model_info", _info_missing)
+    assert (
+        is_known_text_only_chat_model(
+            provider="OPENAI",
+            model_name="missing-key-stub",
+        )
+        is False
+    )
diff --git a/surfsense_backend/tests/unit/tasks/test_video_presentation_billing.py b/surfsense_backend/tests/unit/tasks/test_video_presentation_billing.py
new file mode 100644
index 0000000000..423b64ddb0
--- /dev/null
+++ b/surfsense_backend/tests/unit/tasks/test_video_presentation_billing.py
@@ -0,0 +1,398 @@
+"""Unit tests for video-presentation Celery task billing integration.
+
+Mirrors ``test_podcast_billing.py`` for the video-presentation task.
+Validates the same wrap-graph-in-billable_call pattern and ensures the
+larger ``QUOTA_DEFAULT_VIDEO_PRESENTATION_RESERVE_MICROS`` reservation is
+threaded through.
+
+Coverage:
+
+* Free config: graph runs, ``billable_call`` invoked with the video
+  reserve override.
+* Premium config: same wiring with ``billing_tier='premium'``.
+* Quota denial: graph not invoked, row → FAILED, reason code surfaced.
+* Resolver failure: row → FAILED with ``billing_resolution_failed``.
+"""
+
+from __future__ import annotations
+
+import contextlib
+from types import SimpleNamespace
+from typing import Any
+from uuid import uuid4
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeExecResult:
+    def __init__(self, obj):
+        self._obj = obj
+
+    def scalars(self):
+        return self
+
+    def first(self):
+        return self._obj
+
+    def filter(self, *_args, **_kwargs):
+        return self
+
+
+class _FakeSession:
+    def __init__(self, video):
+        self._video = video
+        self.commit_count = 0
+
+    async def execute(self, _stmt):
+        return _FakeExecResult(self._video)
+
+    async def commit(self):
+        self.commit_count += 1
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, *args):
+        return None
+
+
+class _FakeSessionMaker:
+    def __init__(self, session: _FakeSession):
+        self._session = session
+
+    def __call__(self):
+        return self._session
+
+
+def _make_video(video_id: int = 11, thread_id: int = 99) -> SimpleNamespace:
+    return SimpleNamespace(
+        id=video_id,
+        title="Test Presentation",
+        thread_id=thread_id,
+        status=None,
+        slides=None,
+        scene_codes=None,
+    )
+
+
+_CALL_LOG: list[dict[str, Any]] = []
+
+
+@contextlib.asynccontextmanager
+async def _ok_billable_call(**kwargs):
+    _CALL_LOG.append(kwargs)
+    yield SimpleNamespace()
+
+
+@contextlib.asynccontextmanager
+async def _denying_billable_call(**kwargs):
+    from app.services.billable_calls import QuotaInsufficientError
+
+    _CALL_LOG.append(kwargs)
+    raise QuotaInsufficientError(
+        usage_type=kwargs.get("usage_type", "?"),
+        used_micros=5_000_000,
+        limit_micros=5_000_000,
+        remaining_micros=0,
+    )
+    yield SimpleNamespace()  # pragma: no cover
+
+
+@contextlib.asynccontextmanager
+async def _settlement_failing_billable_call(**kwargs):
+    from app.services.billable_calls import BillingSettlementError
+
+    _CALL_LOG.append(kwargs)
+    yield SimpleNamespace()
+    raise BillingSettlementError(
+        usage_type=kwargs.get("usage_type", "?"),
+        user_id=kwargs["user_id"],
+        cause=RuntimeError("finalize failed"),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=True)
+def _reset_call_log():
+    _CALL_LOG.clear()
+    yield
+    _CALL_LOG.clear()
+
+
+@pytest.mark.asyncio
+async def test_billable_call_invoked_with_correct_kwargs_for_free_config(monkeypatch):
+    from app.config import config as app_config
+    from app.db import VideoPresentationStatus
+    from app.tasks.celery_tasks import video_presentation_tasks
+
+    video = _make_video(video_id=11, thread_id=99)
+    session = _FakeSession(video)
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    user_id = uuid4()
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        assert search_space_id == 777
+        assert thread_id == 99
+        return user_id, "free", "openrouter/some-free-model"
+
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "_resolve_agent_billing_for_search_space",
+        _fake_resolver,
+    )
+    monkeypatch.setattr(video_presentation_tasks, "billable_call", _ok_billable_call)
+
+    async def _fake_graph_invoke(state, config):
+        return {"slides": [], "slide_audio_results": [], "slide_scene_codes": []}
+
+    monkeypatch.setattr(
+        video_presentation_tasks.video_presentation_graph,
+        "ainvoke",
+        _fake_graph_invoke,
+    )
+
+    result = await video_presentation_tasks._generate_video_presentation(
+        video_presentation_id=11,
+        source_content="content",
+        search_space_id=777,
+        user_prompt=None,
+    )
+
+    assert result["status"] == "ready"
+    assert result["video_presentation_id"] == 11
+    assert video.status == VideoPresentationStatus.READY
+
+    assert len(_CALL_LOG) == 1
+    call = _CALL_LOG[0]
+    assert call["user_id"] == user_id
+    assert call["search_space_id"] == 777
+    assert call["billing_tier"] == "free"
+    assert call["base_model"] == "openrouter/some-free-model"
+    assert call["usage_type"] == "video_presentation_generation"
+    assert (
+        call["quota_reserve_micros_override"]
+        == app_config.QUOTA_DEFAULT_VIDEO_PRESENTATION_RESERVE_MICROS
+    )
+    # Background artifact audit rows intentionally omit the TokenUsage.thread_id
+    # FK to avoid coupling Celery audit commits to an active chat transaction.
+    assert "thread_id" not in call
+    assert call["call_details"] == {
+        "video_presentation_id": 11,
+        "title": "Test Presentation",
+        "thread_id": 99,
+    }
+    assert callable(call["billable_session_factory"])
+
+
+@pytest.mark.asyncio
+async def test_billable_call_invoked_with_premium_tier(monkeypatch):
+    from app.tasks.celery_tasks import video_presentation_tasks
+
+    video = _make_video()
+    session = _FakeSession(video)
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    user_id = uuid4()
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        return user_id, "premium", "gpt-5.4"
+
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "_resolve_agent_billing_for_search_space",
+        _fake_resolver,
+    )
+    monkeypatch.setattr(video_presentation_tasks, "billable_call", _ok_billable_call)
+
+    async def _fake_graph_invoke(state, config):
+        return {"slides": [], "slide_audio_results": [], "slide_scene_codes": []}
+
+    monkeypatch.setattr(
+        video_presentation_tasks.video_presentation_graph,
+        "ainvoke",
+        _fake_graph_invoke,
+    )
+
+    await video_presentation_tasks._generate_video_presentation(
+        video_presentation_id=11,
+        source_content="content",
+        search_space_id=777,
+        user_prompt=None,
+    )
+
+    assert _CALL_LOG[0]["billing_tier"] == "premium"
+    assert _CALL_LOG[0]["base_model"] == "gpt-5.4"
+
+
+@pytest.mark.asyncio
+async def test_quota_insufficient_marks_video_failed_and_skips_graph(monkeypatch):
+    from app.db import VideoPresentationStatus
+    from app.tasks.celery_tasks import video_presentation_tasks
+
+    video = _make_video(video_id=12)
+    session = _FakeSession(video)
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        return uuid4(), "premium", "gpt-5.4"
+
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "_resolve_agent_billing_for_search_space",
+        _fake_resolver,
+    )
+    monkeypatch.setattr(
+        video_presentation_tasks, "billable_call", _denying_billable_call
+    )
+
+    graph_invoked = []
+
+    async def _fake_graph_invoke(state, config):
+        graph_invoked.append(True)
+        return {}
+
+    monkeypatch.setattr(
+        video_presentation_tasks.video_presentation_graph,
+        "ainvoke",
+        _fake_graph_invoke,
+    )
+
+    result = await video_presentation_tasks._generate_video_presentation(
+        video_presentation_id=12,
+        source_content="content",
+        search_space_id=777,
+        user_prompt=None,
+    )
+
+    assert result == {
+        "status": "failed",
+        "video_presentation_id": 12,
+        "reason": "premium_quota_exhausted",
+    }
+    assert video.status == VideoPresentationStatus.FAILED
+    assert graph_invoked == []
+
+
+@pytest.mark.asyncio
+async def test_billing_settlement_failure_marks_video_failed(monkeypatch):
+    from app.db import VideoPresentationStatus
+    from app.tasks.celery_tasks import video_presentation_tasks
+
+    video = _make_video(video_id=14)
+    session = _FakeSession(video)
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    async def _fake_resolver(sess, search_space_id, *, thread_id=None):
+        return uuid4(), "premium", "gpt-5.4"
+
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "_resolve_agent_billing_for_search_space",
+        _fake_resolver,
+    )
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "billable_call",
+        _settlement_failing_billable_call,
+    )
+
+    async def _fake_graph_invoke(state, config):
+        return {"slides": [], "slide_audio_results": [], "slide_scene_codes": []}
+
+    monkeypatch.setattr(
+        video_presentation_tasks.video_presentation_graph,
+        "ainvoke",
+        _fake_graph_invoke,
+    )
+
+    result = await video_presentation_tasks._generate_video_presentation(
+        video_presentation_id=14,
+        source_content="content",
+        search_space_id=777,
+        user_prompt=None,
+    )
+
+    assert result == {
+        "status": "failed",
+        "video_presentation_id": 14,
+        "reason": "billing_settlement_failed",
+    }
+    assert video.status == VideoPresentationStatus.FAILED
+
+
+@pytest.mark.asyncio
+async def test_resolver_failure_marks_video_failed(monkeypatch):
+    from app.db import VideoPresentationStatus
+    from app.tasks.celery_tasks import video_presentation_tasks
+
+    video = _make_video(video_id=13)
+    session = _FakeSession(video)
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "get_celery_session_maker",
+        lambda: _FakeSessionMaker(session),
+    )
+
+    async def _failing_resolver(sess, search_space_id, *, thread_id=None):
+        raise ValueError("Search space 777 not found")
+
+    monkeypatch.setattr(
+        video_presentation_tasks,
+        "_resolve_agent_billing_for_search_space",
+        _failing_resolver,
+    )
+
+    graph_invoked = []
+
+    async def _fake_graph_invoke(state, config):
+        graph_invoked.append(True)
+        return {}
+
+    monkeypatch.setattr(
+        video_presentation_tasks.video_presentation_graph,
+        "ainvoke",
+        _fake_graph_invoke,
+    )
+
+    result = await video_presentation_tasks._generate_video_presentation(
+        video_presentation_id=13,
+        source_content="content",
+        search_space_id=777,
+        user_prompt=None,
+    )
+
+    assert result == {
+        "status": "failed",
+        "video_presentation_id": 13,
+        "reason": "billing_resolution_failed",
+    }
+    assert video.status == VideoPresentationStatus.FAILED
+    assert graph_invoked == []
diff --git a/surfsense_backend/tests/unit/test_error_contract.py b/surfsense_backend/tests/unit/test_error_contract.py
new file mode 100644
index 0000000000..ec80212901
--- /dev/null
+++ b/surfsense_backend/tests/unit/test_error_contract.py
@@ -0,0 +1,312 @@
+"""Unit tests for the structured error response contract.
+
+Validates that:
+- Global exception handlers produce the backward-compatible error envelope.
+- 5xx responses never leak raw internal exception text.
+- X-Request-ID is propagated correctly.
+- SurfSenseError, HTTPException, validation, and unhandled exceptions all
+  use the same response shape.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+from fastapi import HTTPException
+from starlette.testclient import TestClient
+
+from app.exceptions import (
+    GENERIC_5XX_MESSAGE,
+    ISSUES_URL,
+    ConfigurationError,
+    ConnectorError,
+    DatabaseError,
+    ExternalServiceError,
+    ForbiddenError,
+    NotFoundError,
+    SurfSenseError,
+    ValidationError,
+)
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# Helpers - lightweight FastAPI app that re-uses the real global handlers
+# ---------------------------------------------------------------------------
+
+
+def _make_test_app():
+    """Build a minimal FastAPI app with the same handlers as the real one."""
+    from fastapi import FastAPI
+    from fastapi.exceptions import RequestValidationError
+    from pydantic import BaseModel
+
+    from app.app import (
+        RequestIDMiddleware,
+        _http_exception_handler,
+        _surfsense_error_handler,
+        _unhandled_exception_handler,
+        _validation_error_handler,
+    )
+
+    app = FastAPI()
+    app.add_middleware(RequestIDMiddleware)
+    app.add_exception_handler(SurfSenseError, _surfsense_error_handler)
+    app.add_exception_handler(RequestValidationError, _validation_error_handler)
+    app.add_exception_handler(HTTPException, _http_exception_handler)
+    app.add_exception_handler(Exception, _unhandled_exception_handler)
+
+    @app.get("/ok")
+    async def ok():
+        return {"status": "ok"}
+
+    @app.get("/http-400")
+    async def raise_http_400():
+        raise HTTPException(status_code=400, detail="Bad input")
+
+    @app.get("/http-500")
+    async def raise_http_500():
+        raise HTTPException(status_code=500, detail="secret db password leaked")
+
+    @app.get("/http-503")
+    async def raise_http_503():
+        raise HTTPException(
+            status_code=503,
+            detail="Page purchases are temporarily unavailable.",
+        )
+
+    @app.get("/http-502")
+    async def raise_http_502():
+        raise HTTPException(
+            status_code=502,
+            detail="Unable to create Stripe checkout session.",
+        )
+
+    @app.get("/surfsense-connector")
+    async def raise_connector():
+        raise ConnectorError("GitHub API returned 401")
+
+    @app.get("/surfsense-notfound")
+    async def raise_notfound():
+        raise NotFoundError("Document #42 was not found")
+
+    @app.get("/surfsense-forbidden")
+    async def raise_forbidden():
+        raise ForbiddenError()
+
+    @app.get("/surfsense-config")
+    async def raise_config():
+        raise ConfigurationError()
+
+    @app.get("/surfsense-db")
+    async def raise_db():
+        raise DatabaseError()
+
+    @app.get("/surfsense-external")
+    async def raise_external():
+        raise ExternalServiceError()
+
+    @app.get("/surfsense-validation")
+    async def raise_validation():
+        raise ValidationError("Email is invalid")
+
+    @app.get("/unhandled")
+    async def raise_unhandled():
+        raise RuntimeError("should never reach the client")
+
+    class Item(BaseModel):
+        name: str
+        count: int
+
+    @app.post("/validated")
+    async def validated(item: Item):
+        return item.model_dump()
+
+    return app
+
+
+@pytest.fixture(scope="module")
+def client():
+    app = _make_test_app()
+    return TestClient(app, raise_server_exceptions=False)
+
+
+# ---------------------------------------------------------------------------
+# Envelope shape validation
+# ---------------------------------------------------------------------------
+
+
+def _assert_envelope(resp, expected_status: int):
+    """Every error response MUST contain the standard envelope."""
+    assert resp.status_code == expected_status
+    body = resp.json()
+    assert "error" in body, f"Missing 'error' key: {body}"
+    assert "detail" in body, f"Missing legacy 'detail' key: {body}"
+
+    err = body["error"]
+    assert isinstance(err["code"], str) and len(err["code"]) > 0
+    assert isinstance(err["message"], str) and len(err["message"]) > 0
+    assert err["status"] == expected_status
+    assert isinstance(err["request_id"], str) and len(err["request_id"]) > 0
+    assert "timestamp" in err
+    assert err["report_url"] == ISSUES_URL
+
+    # Legacy compat: detail mirrors message
+    assert body["detail"] == err["message"]
+
+    return body
+
+
+# ---------------------------------------------------------------------------
+# X-Request-ID propagation
+# ---------------------------------------------------------------------------
+
+
+class TestRequestID:
+    def test_generated_when_missing(self, client):
+        resp = client.get("/ok")
+        assert "X-Request-ID" in resp.headers
+        assert resp.headers["X-Request-ID"].startswith("req_")
+
+    def test_echoed_when_provided(self, client):
+        resp = client.get("/ok", headers={"X-Request-ID": "my-trace-123"})
+        assert resp.headers["X-Request-ID"] == "my-trace-123"
+
+    def test_present_in_error_response_body(self, client):
+        resp = client.get("/http-400", headers={"X-Request-ID": "trace-abc"})
+        body = _assert_envelope(resp, 400)
+        assert body["error"]["request_id"] == "trace-abc"
+
+
+# ---------------------------------------------------------------------------
+# HTTPException handling
+# ---------------------------------------------------------------------------
+
+
+class TestHTTPExceptionHandler:
+    def test_400_preserves_detail(self, client):
+        body = _assert_envelope(client.get("/http-400"), 400)
+        assert body["error"]["message"] == "Bad input"
+        assert body["error"]["code"] == "BAD_REQUEST"
+
+    def test_500_sanitizes_detail(self, client):
+        body = _assert_envelope(client.get("/http-500"), 500)
+        assert "secret" not in body["error"]["message"]
+        assert "password" not in body["error"]["message"]
+        assert body["error"]["message"] == GENERIC_5XX_MESSAGE
+        assert body["error"]["code"] == "INTERNAL_ERROR"
+
+    def test_503_preserves_detail(self, client):
+        # Intentional 503s (e.g. feature flag off) must surface the developer
+        # message so the frontend can render actionable copy.
+        body = _assert_envelope(client.get("/http-503"), 503)
+        assert body["error"]["message"] == "Page purchases are temporarily unavailable."
+        assert body["error"]["message"] != GENERIC_5XX_MESSAGE
+
+    def test_502_preserves_detail(self, client):
+        body = _assert_envelope(client.get("/http-502"), 502)
+        assert body["error"]["message"] == "Unable to create Stripe checkout session."
+        assert body["error"]["message"] != GENERIC_5XX_MESSAGE
+
+
+# ---------------------------------------------------------------------------
+# SurfSenseError hierarchy
+# ---------------------------------------------------------------------------
+
+
+class TestSurfSenseErrorHandler:
+    def test_connector_error(self, client):
+        body = _assert_envelope(client.get("/surfsense-connector"), 502)
+        assert body["error"]["code"] == "CONNECTOR_ERROR"
+        assert "GitHub" in body["error"]["message"]
+
+    def test_not_found_error(self, client):
+        body = _assert_envelope(client.get("/surfsense-notfound"), 404)
+        assert body["error"]["code"] == "NOT_FOUND"
+
+    def test_forbidden_error(self, client):
+        body = _assert_envelope(client.get("/surfsense-forbidden"), 403)
+        assert body["error"]["code"] == "FORBIDDEN"
+
+    def test_configuration_error(self, client):
+        body = _assert_envelope(client.get("/surfsense-config"), 500)
+        assert body["error"]["code"] == "CONFIGURATION_ERROR"
+
+    def test_database_error(self, client):
+        body = _assert_envelope(client.get("/surfsense-db"), 500)
+        assert body["error"]["code"] == "DATABASE_ERROR"
+
+    def test_external_service_error(self, client):
+        body = _assert_envelope(client.get("/surfsense-external"), 502)
+        assert body["error"]["code"] == "EXTERNAL_SERVICE_ERROR"
+
+    def test_validation_error_custom(self, client):
+        body = _assert_envelope(client.get("/surfsense-validation"), 422)
+        assert body["error"]["code"] == "VALIDATION_ERROR"
+
+
+# ---------------------------------------------------------------------------
+# Unhandled exception (catch-all)
+# ---------------------------------------------------------------------------
+
+
+class TestUnhandledException:
+    def test_returns_500_generic_message(self, client):
+        body = _assert_envelope(client.get("/unhandled"), 500)
+        assert body["error"]["code"] == "INTERNAL_ERROR"
+        assert body["error"]["message"] == GENERIC_5XX_MESSAGE
+        assert "should never reach" not in json.dumps(body)
+
+
+# ---------------------------------------------------------------------------
+# RequestValidationError (pydantic / FastAPI)
+# ---------------------------------------------------------------------------
+
+
+class TestValidationErrorHandler:
+    def test_missing_fields(self, client):
+        resp = client.post("/validated", json={})
+        body = _assert_envelope(resp, 422)
+        assert body["error"]["code"] == "VALIDATION_ERROR"
+        assert "required" in body["error"]["message"].lower()
+
+    def test_wrong_type(self, client):
+        resp = client.post("/validated", json={"name": "test", "count": "not-a-number"})
+        body = _assert_envelope(resp, 422)
+        assert body["error"]["code"] == "VALIDATION_ERROR"
+
+
+# ---------------------------------------------------------------------------
+# SurfSenseError class hierarchy unit tests
+# ---------------------------------------------------------------------------
+
+
+class TestSurfSenseErrorClasses:
+    def test_base_defaults(self):
+        err = SurfSenseError()
+        assert err.code == "INTERNAL_ERROR"
+        assert err.status_code == 500
+        assert err.safe_for_client is True
+
+    def test_connector_error(self):
+        err = ConnectorError("fail")
+        assert err.code == "CONNECTOR_ERROR"
+        assert err.status_code == 502
+
+    def test_database_error(self):
+        err = DatabaseError()
+        assert err.status_code == 500
+
+    def test_not_found_error(self):
+        err = NotFoundError()
+        assert err.status_code == 404
+
+    def test_forbidden_error(self):
+        err = ForbiddenError()
+        assert err.status_code == 403
+
+    def test_custom_code(self):
+        err = ConnectorError("x", code="GITHUB_TOKEN_EXPIRED")
+        assert err.code == "GITHUB_TOKEN_EXPIRED"
diff --git a/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py b/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py
new file mode 100644
index 0000000000..20795c7393
--- /dev/null
+++ b/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py
@@ -0,0 +1,222 @@
+from __future__ import annotations
+
+import base64
+from datetime import UTC, datetime
+
+import pytest
+from pydantic import ValidationError
+
+from app.etl_pipeline.etl_document import EtlResult
+from app.schemas.obsidian_plugin import HeadingRef, NotePayload
+from app.services.obsidian_plugin_indexer import (
+    _build_metadata,
+    _extract_binary_attachment_markdown,
+    _is_image_attachment,
+    _require_extracted_attachment_content,
+)
+
+_FAKE_PNG_B64 = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode("ascii")
+
+
+def test_build_metadata_serializes_headings_to_plain_json() -> None:
+    now = datetime.now(UTC)
+    payload = NotePayload(
+        vault_id="vault-1",
+        path="notes.md",
+        name="notes",
+        extension="md",
+        content="# Notes",
+        headings=[HeadingRef(heading="Notes", level=1)],
+        content_hash="abc123",
+        mtime=now,
+        ctime=now,
+    )
+
+    metadata = _build_metadata(payload, vault_name="My Vault", connector_id=42)
+
+    assert metadata["headings"] == [{"heading": "Notes", "level": 1}]
+
+
+def test_build_metadata_marks_binary_attachment_fields() -> None:
+    now = datetime.now(UTC)
+    payload = NotePayload(
+        vault_id="vault-1",
+        path="assets/diagram.png",
+        name="diagram",
+        extension="png",
+        content="",
+        content_hash="abc123",
+        mtime=now,
+        ctime=now,
+        is_binary=True,
+        binary_base64=_FAKE_PNG_B64,
+        mime_type="image/png",
+    )
+
+    metadata = _build_metadata(payload, vault_name="My Vault", connector_id=42)
+
+    assert metadata["is_binary"] is True
+    assert metadata["mime_type"] == "image/png"
+
+
+@pytest.mark.asyncio
+async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> None:
+    now = datetime.now(UTC)
+    payload = NotePayload(
+        vault_id="vault-1",
+        path="assets/diagram.png",
+        name="diagram",
+        extension="png",
+        content="",
+        content_hash="abc123",
+        mtime=now,
+        ctime=now,
+        is_binary=True,
+        binary_base64="not-valid-base64!!",
+        mime_type="image/png",
+    )
+
+    content, metadata = await _extract_binary_attachment_markdown(
+        payload, vision_llm=None
+    )
+
+    assert content == ""
+    assert metadata["attachment_extraction_status"] == "invalid_binary_payload"
+
+
+@pytest.mark.asyncio
+async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
+    now = datetime.now(UTC)
+    payload = NotePayload(
+        vault_id="vault-1",
+        path="assets/spec.pdf",
+        name="spec",
+        extension="pdf",
+        content="",
+        content_hash="abc123",
+        mtime=now,
+        ctime=now,
+        is_binary=True,
+        binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"),
+        mime_type="application/pdf",
+    )
+
+    async def _fake_run_etl_extract(*, file_path, filename, vision_llm):
+        assert filename == "spec.pdf"
+        assert file_path
+        assert vision_llm is None
+        return EtlResult(
+            markdown_content="Extracted content",
+            etl_service="TEST_ETL",
+            content_type="document",
+        )
+
+    monkeypatch.setattr(
+        "app.services.obsidian_plugin_indexer._run_etl_extract",
+        _fake_run_etl_extract,
+    )
+
+    content, metadata = await _extract_binary_attachment_markdown(
+        payload, vision_llm=None
+    )
+
+    assert content == "Extracted content"
+    assert metadata["attachment_extraction_status"] == "ok"
+    assert metadata["attachment_etl_service"] == "TEST_ETL"
+
+
+def test_is_image_attachment_detects_image_extensions() -> None:
+    now = datetime.now(UTC)
+    image_payload = NotePayload(
+        vault_id="vault-1",
+        path="assets/screenshot.PNG",
+        name="screenshot",
+        extension="PNG",
+        content="",
+        content_hash="abc123",
+        mtime=now,
+        ctime=now,
+        is_binary=True,
+        binary_base64=_FAKE_PNG_B64,
+        mime_type="image/png",
+    )
+    pdf_payload = NotePayload(
+        vault_id="vault-1",
+        path="assets/spec.pdf",
+        name="spec",
+        extension="pdf",
+        content="",
+        content_hash="abc123",
+        mtime=now,
+        ctime=now,
+        is_binary=True,
+        binary_base64=_FAKE_PNG_B64,
+        mime_type="application/pdf",
+    )
+
+    assert _is_image_attachment(image_payload) is True
+    assert _is_image_attachment(pdf_payload) is False
+
+
+def test_note_payload_rejects_binary_without_base64() -> None:
+    now = datetime.now(UTC)
+    with pytest.raises(ValidationError, match="binary_base64 is required"):
+        NotePayload(
+            vault_id="vault-1",
+            path="assets/diagram.png",
+            name="diagram",
+            extension="png",
+            content="",
+            content_hash="abc123",
+            mtime=now,
+            ctime=now,
+            is_binary=True,
+            mime_type="image/png",
+        )
+
+
+def test_note_payload_rejects_binary_without_mime_type() -> None:
+    now = datetime.now(UTC)
+    with pytest.raises(ValidationError, match="mime_type is required"):
+        NotePayload(
+            vault_id="vault-1",
+            path="assets/diagram.png",
+            name="diagram",
+            extension="png",
+            content="",
+            content_hash="abc123",
+            mtime=now,
+            ctime=now,
+            is_binary=True,
+            binary_base64=_FAKE_PNG_B64,
+        )
+
+
+def test_note_payload_rejects_markdown_with_binary_fields() -> None:
+    now = datetime.now(UTC)
+    with pytest.raises(
+        ValidationError,
+        match="binary_base64 and mime_type must be omitted when is_binary is False",
+    ):
+        NotePayload(
+            vault_id="vault-1",
+            path="notes.md",
+            name="notes",
+            extension="md",
+            content="# Notes",
+            content_hash="abc123",
+            mtime=now,
+            ctime=now,
+            binary_base64=_FAKE_PNG_B64,
+        )
+
+
+def test_require_extracted_attachment_content_rejects_empty_content() -> None:
+    with pytest.raises(
+        RuntimeError, match=r"Attachment extraction failed for assets/img\.png"
+    ):
+        _require_extracted_attachment_content(
+            content="   ",
+            etl_meta={"attachment_extraction_status": "etl_failed"},
+            path="assets/img.png",
+        )
diff --git a/surfsense_backend/tests/unit/test_stream_new_chat_contract.py b/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
new file mode 100644
index 0000000000..208204ca90
--- /dev/null
+++ b/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
@@ -0,0 +1,596 @@
+import inspect
+import json
+import logging
+import re
+from pathlib import Path
+
+import pytest
+
+import app.tasks.chat.stream_new_chat as stream_new_chat_module
+from app.agents.new_chat.errors import BusyError
+from app.agents.new_chat.middleware.busy_mutex import request_cancel, reset_cancel
+from app.tasks.chat.stream_new_chat import (
+    StreamResult,
+    _classify_stream_exception,
+    _contract_enforcement_active,
+    _evaluate_file_contract_outcome,
+    _extract_resolved_file_path,
+    _log_chat_stream_error,
+    _tool_output_has_error,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def test_tool_output_error_detection():
+    assert _tool_output_has_error("Error: failed to write file")
+    assert _tool_output_has_error({"error": "boom"})
+    assert _tool_output_has_error({"result": "Error: disk is full"})
+    assert not _tool_output_has_error({"result": "Updated file /notes.md"})
+
+
+def test_extract_resolved_file_path_prefers_structured_path():
+    assert (
+        _extract_resolved_file_path(
+            tool_name="write_file",
+            tool_output={"status": "completed", "path": "/docs/note.md"},
+            tool_input=None,
+        )
+        == "/docs/note.md"
+    )
+
+
+def test_extract_resolved_file_path_falls_back_to_tool_input():
+    assert (
+        _extract_resolved_file_path(
+            tool_name="edit_file",
+            tool_output={"status": "completed", "result": "updated"},
+            tool_input={"file_path": "/docs/edited.md"},
+        )
+        == "/docs/edited.md"
+    )
+
+
+def test_extract_resolved_file_path_does_not_parse_result_text():
+    assert (
+        _extract_resolved_file_path(
+            tool_name="write_file",
+            tool_output={"result": "Updated file /docs/from-text.md"},
+            tool_input=None,
+        )
+        is None
+    )
+
+
+def test_file_write_contract_outcome_reasons():
+    result = StreamResult(intent_detected="file_write")
+    passed, reason = _evaluate_file_contract_outcome(result)
+    assert not passed
+    assert reason == "no_write_attempt"
+
+    result.write_attempted = True
+    passed, reason = _evaluate_file_contract_outcome(result)
+    assert not passed
+    assert reason == "write_failed"
+
+    result.write_succeeded = True
+    passed, reason = _evaluate_file_contract_outcome(result)
+    assert not passed
+    assert reason == "verification_failed"
+
+    result.verification_succeeded = True
+    passed, reason = _evaluate_file_contract_outcome(result)
+    assert passed
+    assert reason == ""
+
+
+def test_contract_enforcement_local_only():
+    result = StreamResult(filesystem_mode="desktop_local_folder")
+    assert _contract_enforcement_active(result)
+
+    result.filesystem_mode = "cloud"
+    assert not _contract_enforcement_active(result)
+
+
+def _extract_chat_stream_payload(record_message: str) -> dict:
+    prefix = "[chat_stream_error] "
+    assert record_message.startswith(prefix)
+    return json.loads(record_message[len(prefix) :])
+
+
+def test_unified_chat_stream_error_log_schema(caplog):
+    with caplog.at_level(logging.INFO, logger="app.tasks.chat.stream_new_chat"):
+        _log_chat_stream_error(
+            flow="new",
+            error_kind="server_error",
+            error_code="SERVER_ERROR",
+            severity="warn",
+            is_expected=False,
+            request_id="req-123",
+            thread_id=101,
+            search_space_id=202,
+            user_id="user-1",
+            message="Error during chat: boom",
+        )
+
+    record = next(r for r in caplog.records if "[chat_stream_error]" in r.message)
+    payload = _extract_chat_stream_payload(record.message)
+
+    required_keys = {
+        "event",
+        "flow",
+        "error_kind",
+        "error_code",
+        "severity",
+        "is_expected",
+        "request_id",
+        "thread_id",
+        "search_space_id",
+        "user_id",
+        "message",
+    }
+    assert required_keys.issubset(payload.keys())
+    assert payload["event"] == "chat_stream_error"
+    assert payload["flow"] == "new"
+    assert payload["error_code"] == "SERVER_ERROR"
+
+
+def test_premium_quota_uses_unified_chat_stream_log_shape(caplog):
+    with caplog.at_level(logging.INFO, logger="app.tasks.chat.stream_new_chat"):
+        _log_chat_stream_error(
+            flow="resume",
+            error_kind="premium_quota_exhausted",
+            error_code="PREMIUM_QUOTA_EXHAUSTED",
+            severity="info",
+            is_expected=True,
+            request_id="req-premium",
+            thread_id=303,
+            search_space_id=404,
+            user_id="user-2",
+            message="Buy more tokens to continue with this model, or switch to a free model",
+            extra={"auto_fallback": False},
+        )
+
+    record = next(r for r in caplog.records if "[chat_stream_error]" in r.message)
+    payload = _extract_chat_stream_payload(record.message)
+    assert payload["event"] == "chat_stream_error"
+    assert payload["error_kind"] == "premium_quota_exhausted"
+    assert payload["error_code"] == "PREMIUM_QUOTA_EXHAUSTED"
+    assert payload["flow"] == "resume"
+    assert payload["is_expected"] is True
+    assert payload["auto_fallback"] is False
+
+
+def test_stream_error_emission_keeps_machine_error_codes():
+    source = inspect.getsource(stream_new_chat_module)
+    format_error_calls = re.findall(r"format_error\(", source)
+    emitted_error_codes = set(re.findall(r'error_code="([A-Z_]+)"', source))
+
+    # All stream paths should route through one shared terminal error emitter.
+    assert len(format_error_calls) == 1
+    assert {
+        "PREMIUM_QUOTA_EXHAUSTED",
+        "SERVER_ERROR",
+    }.issubset(emitted_error_codes)
+    assert 'flow: Literal["new", "regenerate"] = "new"' in source
+    assert "_emit_stream_terminal_error" in source
+    assert "flow=flow" in source
+    assert 'flow="resume"' in source
+
+
+def test_stream_exception_classifies_rate_limited():
+    exc = Exception(
+        '{"error":{"type":"rate_limit_error","message":"Rate limited. Please try again later."}}'
+    )
+    kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(
+        exc, flow_label="chat"
+    )
+    assert kind == "rate_limited"
+    assert code == "RATE_LIMITED"
+    assert severity == "warn"
+    assert is_expected is True
+    assert "temporarily rate-limited" in user_message
+    assert extra is None
+
+
+def test_stream_exception_classifies_openrouter_429_payload():
+    exc = Exception(
+        'OpenrouterException - {"error":{"message":"Provider returned error","code":429,'
+        '"metadata":{"raw":"foo is temporarily rate-limited upstream"}}}'
+    )
+    kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(
+        exc, flow_label="chat"
+    )
+    assert kind == "rate_limited"
+    assert code == "RATE_LIMITED"
+    assert severity == "warn"
+    assert is_expected is True
+    assert "temporarily rate-limited" in user_message
+    assert extra is None
+
+
+@pytest.mark.asyncio
+async def test_preflight_swallows_non_rate_limit_errors_and_re_raises_429(monkeypatch):
+    """``_preflight_llm`` is best-effort.
+
+    - On rate-limit shaped exceptions (provider 429) it MUST re-raise so the
+      caller can drive the cooldown/repin branch.
+    - On any other transient failure it MUST swallow the error so the normal
+      stream path continues without surfacing preflight noise to the user.
+    """
+    from types import SimpleNamespace
+
+    from app.tasks.chat.stream_new_chat import _preflight_llm
+
+    class _RateLimitedError(Exception):
+        """Class-name carries 'RateLimit' so _is_provider_rate_limited triggers."""
+
+    rate_calls: list[dict] = []
+    other_calls: list[dict] = []
+
+    async def _fake_acompletion_429(**kwargs):
+        rate_calls.append(kwargs)
+        raise _RateLimitedError("simulated 429")
+
+    async def _fake_acompletion_other(**kwargs):
+        other_calls.append(kwargs)
+        raise RuntimeError("some unrelated transient failure")
+
+    fake_llm = SimpleNamespace(
+        model="openrouter/google/gemma-4-31b-it:free",
+        api_key="test",
+        api_base=None,
+    )
+
+    import litellm  # type: ignore[import-not-found]
+
+    monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_429)
+    with pytest.raises(_RateLimitedError):
+        await _preflight_llm(fake_llm)
+    assert len(rate_calls) == 1
+    assert rate_calls[0]["max_tokens"] == 1
+    assert rate_calls[0]["stream"] is False
+
+    monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_other)
+    # MUST NOT raise: non-rate-limit failures are swallowed.
+    await _preflight_llm(fake_llm)
+    assert len(other_calls) == 1
+
+
+@pytest.mark.asyncio
+async def test_preflight_skipped_for_auto_router_model():
+    """Router-mode ``model='auto'`` has no single deployment to ping; the
+    LiteLLM router itself owns per-deployment rate-limit accounting, so the
+    preflight helper must short-circuit instead of issuing a probe."""
+    from types import SimpleNamespace
+
+    from app.tasks.chat.stream_new_chat import _preflight_llm
+
+    fake_llm = SimpleNamespace(model="auto", api_key="x", api_base=None)
+    # Should return without raising or making any LiteLLM call.
+    await _preflight_llm(fake_llm)
+
+
+@pytest.mark.asyncio
+async def test_settle_speculative_agent_build_swallows_exceptions():
+    """``_settle_speculative_agent_build`` MUST always return cleanly so the
+    caller can safely re-touch the request-scoped session afterwards.
+
+    The helper guards the parallel preflight + agent-build path: when the
+    speculative build is being discarded (429 or non-429 preflight failure)
+    we await it solely to release any in-flight ``AsyncSession`` usage —
+    the build's outcome is irrelevant. Any exception (including
+    ``CancelledError``) leaking out would skip the caller's recovery flow
+    and re-introduce the very session-concurrency hazard the helper exists
+    to prevent.
+    """
+    import asyncio
+
+    from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
+
+    async def _raises() -> None:
+        raise RuntimeError("speculative build crashed")
+
+    async def _succeeds() -> str:
+        return "agent"
+
+    async def _slow() -> None:
+        await asyncio.sleep(0.05)
+
+    for coro in (_raises(), _succeeds(), _slow()):
+        task = asyncio.create_task(coro)
+        await _settle_speculative_agent_build(task)
+        assert task.done()
+
+
+@pytest.mark.asyncio
+async def test_settle_speculative_agent_build_handles_already_done_task():
+    """Done tasks (success or failure) must still be settled without raising."""
+    import asyncio
+
+    from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
+
+    async def _ok() -> str:
+        return "ok"
+
+    async def _bad() -> None:
+        raise ValueError("nope")
+
+    ok_task = asyncio.create_task(_ok())
+    bad_task = asyncio.create_task(_bad())
+    # Drive both to completion before settling.
+    await asyncio.sleep(0)
+    await asyncio.sleep(0)
+
+    await _settle_speculative_agent_build(ok_task)
+    await _settle_speculative_agent_build(bad_task)
+    assert ok_task.result() == "ok"
+    # ``bad_task`` exception was consumed by the settle helper; calling
+    # ``.exception()`` after the fact must still return the original error
+    # (the helper observes it but doesn't clear it).
+    assert isinstance(bad_task.exception(), ValueError)
+
+
+def test_stream_exception_classifies_thread_busy():
+    exc = BusyError(request_id="thread-123")
+    kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(
+        exc, flow_label="chat"
+    )
+    assert kind == "thread_busy"
+    assert code == "THREAD_BUSY"
+    assert severity == "warn"
+    assert is_expected is True
+    assert "still finishing for this thread" in user_message
+    assert extra is None
+
+
+def test_stream_exception_classifies_thread_busy_from_message():
+    exc = Exception("Thread is busy with another request")
+    kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(
+        exc, flow_label="chat"
+    )
+    assert kind == "thread_busy"
+    assert code == "THREAD_BUSY"
+    assert severity == "warn"
+    assert is_expected is True
+    assert "still finishing for this thread" in user_message
+    assert extra is None
+
+
+def test_stream_exception_classifies_turn_cancelling_when_cancel_requested():
+    thread_id = "thread-cancelling-1"
+    reset_cancel(thread_id)
+    request_cancel(thread_id)
+    exc = BusyError(request_id=thread_id)
+    kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(
+        exc, flow_label="chat"
+    )
+    assert kind == "thread_busy"
+    assert code == "TURN_CANCELLING"
+    assert severity == "info"
+    assert is_expected is True
+    assert "stopping" in user_message
+    assert isinstance(extra, dict)
+    assert "retry_after_ms" in extra
+
+
+def test_premium_classification_is_error_code_driven():
+    classifier_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/lib/chat/chat-error-classifier.ts"
+    )
+    source = classifier_path.read_text(encoding="utf-8")
+
+    assert "PREMIUM_KEYWORDS" not in source
+    assert "RATE_LIMIT_KEYWORDS" not in source
+    assert "normalized.includes(" not in source
+    assert 'if (errorCode === "PREMIUM_QUOTA_EXHAUSTED") {' in source
+
+
+def test_stream_terminal_error_handler_has_pre_accept_soft_rollback_hook():
+    page_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx"
+    )
+    source = page_path.read_text(encoding="utf-8")
+
+    assert "onPreAcceptFailure?: () => Promise<void>;" in source
+    assert "if (!accepted) {" in source
+    assert "await onPreAcceptFailure?.();" in source
+    assert "await onAcceptedStreamError?.();" in source
+    assert "setMessages((prev) => prev.filter((m) => m.id !== userMsgId));" in source
+    assert "setMessageDocumentsMap((prev) => {" in source
+
+
+def test_toast_only_pre_accept_policy_has_no_inline_failed_marker():
+    user_message_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/components/assistant-ui/user-message.tsx"
+    )
+    source = user_message_path.read_text(encoding="utf-8")
+
+    assert "Not sent. Edit and retry." not in source
+    assert "failed_pre_accept" not in source
+
+
+def test_network_send_failures_use_unified_retry_toast_message():
+    classifier_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/lib/chat/chat-error-classifier.ts"
+    )
+    classifier_source = classifier_path.read_text(encoding="utf-8")
+    request_errors_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/lib/chat/chat-request-errors.ts"
+    )
+    request_errors_source = request_errors_path.read_text(encoding="utf-8")
+
+    assert '"send_failed_pre_accept"' in classifier_source
+    assert 'errorCode === "SEND_FAILED_PRE_ACCEPT"' in classifier_source
+    assert 'errorCode === "TURN_CANCELLING"' in classifier_source
+    assert "if (withCode.code) return withCode.code;" in classifier_source
+    assert 'userMessage: "Message not sent. Please retry."' in classifier_source
+    assert 'userMessage: "Connection issue. Please try again."' in classifier_source
+    assert "const passthroughCodes = new Set([" in request_errors_source
+    assert '"PREMIUM_QUOTA_EXHAUSTED"' in request_errors_source
+    assert '"THREAD_BUSY"' in request_errors_source
+    assert '"TURN_CANCELLING"' in request_errors_source
+    assert '"AUTH_EXPIRED"' in request_errors_source
+    assert '"UNAUTHORIZED"' in request_errors_source
+    assert '"RATE_LIMITED"' in request_errors_source
+    assert '"NETWORK_ERROR"' in request_errors_source
+    assert '"STREAM_PARSE_ERROR"' in request_errors_source
+    assert '"TOOL_EXECUTION_ERROR"' in request_errors_source
+    assert '"PERSIST_MESSAGE_FAILED"' in request_errors_source
+    assert '"SERVER_ERROR"' in request_errors_source
+    assert "passthroughCodes.has(existingCode)" in request_errors_source
+    assert 'errorCode: "SEND_FAILED_PRE_ACCEPT"' in request_errors_source
+    assert 'errorCode: "NETWORK_ERROR"' not in request_errors_source
+    assert "Failed to start chat. Please try again." not in classifier_source
+
+
+def test_pre_post_accept_abort_contract_exists_for_new_resume_regenerate_flows():
+    page_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx"
+    )
+    source = page_path.read_text(encoding="utf-8")
+
+    # Each flow tracks accepted boundary and passes it into shared terminal handling.
+    # The acceptance boundary is still meaningful post-refactor: it gates
+    # local-state cleanup (onPreAcceptFailure path) and lets the shared
+    # terminal handler distinguish pre-accept aborts from in-stream errors.
+    assert "let newAccepted = false;" in source
+    assert "let resumeAccepted = false;" in source
+    assert "let regenerateAccepted = false;" in source
+    assert "accepted: newAccepted," in source
+    assert "accepted: resumeAccepted," in source
+    assert "accepted: regenerateAccepted," in source
+
+    # NOTE: The FE-side persistence guards previously asserted here
+    # ("if (!resumeAccepted) return;", "if (!regenerateAccepted) return;",
+    # "if (newAccepted && !userPersisted) {") have been intentionally
+    # removed by the SSE-based message-id handshake refactor. Persistence
+    # is now server-authoritative: persist_user_turn / persist_assistant_shell
+    # run inside stream_new_chat / stream_resume_chat unconditionally and
+    # the FE consumes data-user-message-id / data-assistant-message-id
+    # SSE events to learn the canonical primary keys. There is therefore
+    # no FE call-site to guard, and the shared terminal handler relies
+    # purely on the `accepted` field above (forwarded to onAbort /
+    # onAcceptedStreamError) to drive UI cleanup. See
+    # tests/integration/chat/test_message_id_sse.py for the new
+    # cross-tier ID coherence guarantees.
+
+    # The TURN_CANCELLING / THREAD_BUSY retry plumbing is independent
+    # of the persistence refactor and must still exist on every
+    # start-stream fetch.
+    assert "const fetchWithTurnCancellingRetry = useCallback(" in source
+    assert "computeFallbackTurnCancellingRetryDelay" in source
+    assert 'withMeta.errorCode === "TURN_CANCELLING"' in source
+    assert 'withMeta.errorCode === "THREAD_BUSY"' in source
+    assert "await fetchWithTurnCancellingRetry(() =>" in source
+
+
+def test_cancel_active_turn_route_contract_exists():
+    routes_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_backend/app/routes/new_chat_routes.py"
+    )
+    source = routes_path.read_text(encoding="utf-8")
+
+    assert '@router.post(\n    "/threads/{thread_id}/cancel-active-turn",' in source
+    assert "response_model=CancelActiveTurnResponse" in source
+    assert 'status="cancelling",' in source
+    assert 'error_code="TURN_CANCELLING",' in source
+    assert "retry_after_ms=retry_after_ms if retry_after_ms > 0 else None," in source
+    assert "retry_after_at=" in source
+    assert 'status="idle",' in source
+    assert 'error_code="NO_ACTIVE_TURN",' in source
+
+
+def test_turn_status_route_contract_exists():
+    routes_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_backend/app/routes/new_chat_routes.py"
+    )
+    source = routes_path.read_text(encoding="utf-8")
+
+    assert '@router.get(\n    "/threads/{thread_id}/turn-status",' in source
+    assert "response_model=TurnStatusResponse" in source
+    assert "_build_turn_status_payload(thread_id)" in source
+    assert "Permission.CHATS_READ.value" in source
+    assert "_raise_if_thread_busy_for_start(" in source
+
+
+def test_turn_cancelling_retry_policy_contract_exists():
+    routes_path = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_backend/app/routes/new_chat_routes.py"
+    )
+    source = routes_path.read_text(encoding="utf-8")
+
+    assert "TURN_CANCELLING_INITIAL_DELAY_MS = 200" in source
+    assert "TURN_CANCELLING_BACKOFF_FACTOR = 2" in source
+    assert "TURN_CANCELLING_MAX_DELAY_MS = 1500" in source
+    assert "def _compute_turn_cancelling_retry_delay(" in source
+    assert "retry-after-ms" in source
+    assert '"Retry-After"' in source
+    assert '"errorCode": "TURN_CANCELLING"' in source
+
+
+def test_turn_status_sse_contract_exists():
+    stream_source = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_backend/app/tasks/chat/stream_new_chat.py"
+    ).read_text(encoding="utf-8")
+    state_source = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/lib/chat/streaming-state.ts"
+    ).read_text(encoding="utf-8")
+    pipeline_source = (
+        Path(__file__).resolve().parents[3]
+        / "surfsense_web/lib/chat/stream-pipeline.ts"
+    ).read_text(encoding="utf-8")
+
+    assert '"turn-status"' in stream_source
+    assert '"status": "busy"' in stream_source
+    assert '"status": "idle"' in stream_source
+    assert 'type: "data-turn-status"' in state_source
+    assert 'case "data-turn-status":' in pipeline_source
+    assert "end_turn(str(chat_id))" in stream_source
+
+
+def test_chat_deepagent_forwards_resolved_model_name_to_both_builders():
+    """Regression guard: both system-prompt builders in chat_deepagent.py
+    must receive ``model_name=_resolve_prompt_model_name(...)`` so the
+    provider-variant dispatch can render the right ``<provider_hints>``
+    block. Without this the prompt silently falls back to the empty
+    ``"default"`` variant — the original bug being fixed.
+
+    This test mirrors :func:`test_stream_error_emission_keeps_machine_error_codes`
+    in style: it inspects module source text + a regex to enforce the
+    call-site shape, not just the wrapper layer (the wrappers already
+    forward ``model_name`` correctly, so testing them would not catch
+    the actual missed plumbing).
+    """
+    import app.agents.new_chat.chat_deepagent as chat_deepagent_module
+
+    source = inspect.getsource(chat_deepagent_module)
+
+    # Helper itself must be defined.
+    assert "def _resolve_prompt_model_name(" in source
+
+    # Both builder calls must forward the resolved model name. Match
+    # across newlines + whitespace because the kwargs are split over
+    # multiple lines.
+    pattern = re.compile(
+        r"build_(?:surfsense|configurable)_system_prompt\([^)]*"
+        r"model_name=_resolve_prompt_model_name\(",
+        re.DOTALL,
+    )
+    matches = pattern.findall(source)
+    assert len(matches) == 2, (
+        "Expected both system-prompt builder call sites to forward "
+        "`model_name=_resolve_prompt_model_name(...)`, found "
+        f"{len(matches)}"
+    )
diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py
index 43dfef5f0d..ccf5eb70fa 100644
--- a/surfsense_backend/tests/unit/utils/test_file_extensions.py
+++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py
@@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union():
     )
 
     assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
+
+
+# ---------------------------------------------------------------------------
+# IMAGE_EXTENSIONS
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "ext",
+    [
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".webp",
+        ".svg",
+        ".heic",
+        ".heif",
+    ],
+)
+def test_image_extensions_contains_expected(ext):
+    from app.utils.file_extensions import IMAGE_EXTENSIONS
+
+    assert ext in IMAGE_EXTENSIONS
+
+
+def test_image_extensions_are_subset_of_document_extensions():
+    """Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback."""
+    from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS
+
+    missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS
+    assert not missing, (
+        f"Image extensions missing from document sets (breaks fallback): {missing}"
+    )
diff --git a/surfsense_backend/tests/unit/utils/test_oauth_security.py b/surfsense_backend/tests/unit/utils/test_oauth_security.py
new file mode 100644
index 0000000000..a00df62ee5
--- /dev/null
+++ b/surfsense_backend/tests/unit/utils/test_oauth_security.py
@@ -0,0 +1,93 @@
+import base64
+import hashlib
+import hmac
+import json
+import time
+from uuid import uuid4
+
+import pytest
+from fastapi import HTTPException
+
+from app.utils.oauth_security import OAuthStateManager
+
+SECRET = "unit-test-secret"
+
+
+def _encode_state(payload: dict, *, signature: str | None = None) -> str:
+    """Build an OAuth state payload compatible with OAuthStateManager."""
+    signature_payload = payload.copy()
+    payload_str = json.dumps(signature_payload, sort_keys=True)
+    computed_signature = hmac.new(
+        SECRET.encode(),
+        payload_str.encode(),
+        hashlib.sha256,
+    ).hexdigest()
+    encoded_payload = {
+        **signature_payload,
+        "signature": signature if signature is not None else computed_signature,
+    }
+    return base64.urlsafe_b64encode(json.dumps(encoded_payload).encode()).decode()
+
+
+def test_validate_state_accepts_fresh_signed_state():
+    mgr = OAuthStateManager(secret_key=SECRET, max_age_seconds=600)
+    user_id = uuid4()
+
+    state = mgr.generate_secure_state(
+        space_id=1,
+        user_id=user_id,
+        toolkit_id="googledrive",
+    )
+
+    decoded = mgr.validate_state(state)
+
+    assert decoded["space_id"] == 1
+    assert decoded["user_id"] == str(user_id)
+    assert decoded["toolkit_id"] == "googledrive"
+
+
+def test_validate_state_rejects_expired_state():
+    mgr = OAuthStateManager(secret_key=SECRET, max_age_seconds=600)
+    expired_state = _encode_state(
+        {
+            "space_id": 1,
+            "user_id": str(uuid4()),
+            "timestamp": int(time.time()) - 3600,
+            "toolkit_id": "googledrive",
+        }
+    )
+
+    with pytest.raises(HTTPException) as exc:
+        mgr.validate_state(expired_state)
+
+    assert exc.value.status_code == 400
+    assert "expired" in exc.value.detail.lower()
+
+
+def test_validate_state_rejects_tampered_signature():
+    mgr = OAuthStateManager(secret_key=SECRET, max_age_seconds=600)
+    tampered_state = _encode_state(
+        {
+            "space_id": 1,
+            "user_id": str(uuid4()),
+            "timestamp": int(time.time()),
+            "toolkit_id": "googledrive",
+        },
+        signature="deadbeef" * 8,
+    )
+
+    with pytest.raises(HTTPException) as exc:
+        mgr.validate_state(tampered_state)
+
+    assert exc.value.status_code == 400
+    assert "tampering" in exc.value.detail.lower()
+
+
+def test_validate_state_rejects_malformed_state():
+    mgr = OAuthStateManager(secret_key=SECRET)
+
+    with pytest.raises(HTTPException) as exc:
+        mgr.validate_state("not-base64-and-not-json")
+
+    assert exc.value.status_code == 400
+    assert "invalid state format" in exc.value.detail.lower()
diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock
index 6138d8861a..c4e6b5c890 100644
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
@@ -62,7 +62,7 @@ wheels = [
 
 [[package]]
 name = "aiohttp"
-version = "3.13.5"
+version = "3.13.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohappyeyeballs" },
@@ -73,76 +73,76 @@ dependencies = [
     { name = "propcache" },
     { name = "yarl" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/be/6f/353954c29e7dcce7cf00280a02c75f30e133c00793c7a2ed3776d7b2f426/aiohttp-3.13.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:023ecba036ddd840b0b19bf195bfae970083fd7024ce1ac22e9bba90464620e9", size = 748876 },
-    { url = "https://files.pythonhosted.org/packages/f5/1b/428a7c64687b3b2e9cd293186695affc0e1e54a445d0361743b231f11066/aiohttp-3.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15c933ad7920b7d9a20de151efcd05a6e38302cbf0e10c9b2acb9a42210a2416", size = 499557 },
-    { url = "https://files.pythonhosted.org/packages/29/47/7be41556bfbb6917069d6a6634bb7dd5e163ba445b783a90d40f5ac7e3a7/aiohttp-3.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab2899f9fa2f9f741896ebb6fa07c4c883bfa5c7f2ddd8cf2aafa86fa981b2d2", size = 500258 },
-    { url = "https://files.pythonhosted.org/packages/67/84/c9ecc5828cb0b3695856c07c0a6817a99d51e2473400f705275a2b3d9239/aiohttp-3.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60eaa2d440cd4707696b52e40ed3e2b0f73f65be07fd0ef23b6b539c9c0b0b4", size = 1749199 },
-    { url = "https://files.pythonhosted.org/packages/f0/d3/3c6d610e66b495657622edb6ae7c7fd31b2e9086b4ec50b47897ad6042a9/aiohttp-3.13.5-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:55b3bdd3292283295774ab585160c4004f4f2f203946997f49aac032c84649e9", size = 1721013 },
-    { url = "https://files.pythonhosted.org/packages/49/a0/24409c12217456df0bae7babe3b014e460b0b38a8e60753d6cb339f6556d/aiohttp-3.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2b2355dc094e5f7d45a7bb262fe7207aa0460b37a0d87027dcf21b5d890e7d5", size = 1781501 },
-    { url = "https://files.pythonhosted.org/packages/98/9d/b65ec649adc5bccc008b0957a9a9c691070aeac4e41cea18559fef49958b/aiohttp-3.13.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b38765950832f7d728297689ad78f5f2cf79ff82487131c4d26fe6ceecdc5f8e", size = 1878981 },
-    { url = "https://files.pythonhosted.org/packages/57/d8/8d44036d7eb7b6a8ec4c5494ea0c8c8b94fbc0ed3991c1a7adf230df03bf/aiohttp-3.13.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b18f31b80d5a33661e08c89e202edabf1986e9b49c42b4504371daeaa11b47c1", size = 1767934 },
-    { url = "https://files.pythonhosted.org/packages/31/04/d3f8211f273356f158e3464e9e45484d3fb8c4ce5eb2f6fe9405c3273983/aiohttp-3.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:33add2463dde55c4f2d9635c6ab33ce154e5ecf322bd26d09af95c5f81cfa286", size = 1566671 },
-    { url = "https://files.pythonhosted.org/packages/41/db/073e4ebe00b78e2dfcacff734291651729a62953b48933d765dc513bf798/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:327cc432fdf1356fb4fbc6fe833ad4e9f6aacb71a8acaa5f1855e4b25910e4a9", size = 1705219 },
-    { url = "https://files.pythonhosted.org/packages/48/45/7dfba71a2f9fd97b15c95c06819de7eb38113d2cdb6319669195a7d64270/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7c35b0bf0b48a70b4cb4fc5d7bed9b932532728e124874355de1a0af8ec4bc88", size = 1743049 },
-    { url = "https://files.pythonhosted.org/packages/18/71/901db0061e0f717d226386a7f471bb59b19566f2cae5f0d93874b017271f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:df23d57718f24badef8656c49743e11a89fd6f5358fa8a7b96e728fda2abf7d3", size = 1749557 },
-    { url = "https://files.pythonhosted.org/packages/08/d5/41eebd16066e59cd43728fe74bce953d7402f2b4ddfdfef2c0e9f17ca274/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:02e048037a6501a5ec1f6fc9736135aec6eb8a004ce48838cb951c515f32c80b", size = 1558931 },
-    { url = "https://files.pythonhosted.org/packages/30/e6/4a799798bf05740e66c3a1161079bda7a3dd8e22ca392481d7a7f9af82a6/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31cebae8b26f8a615d2b546fee45d5ffb76852ae6450e2a03f42c9102260d6fe", size = 1774125 },
-    { url = "https://files.pythonhosted.org/packages/84/63/7749337c90f92bc2cb18f9560d67aa6258c7060d1397d21529b8004fcf6f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:888e78eb5ca55a615d285c3c09a7a91b42e9dd6fc699b166ebd5dee87c9ccf14", size = 1732427 },
-    { url = "https://files.pythonhosted.org/packages/98/de/cf2f44ff98d307e72fb97d5f5bbae3bfcb442f0ea9790c0bf5c5c2331404/aiohttp-3.13.5-cp312-cp312-win32.whl", hash = "sha256:8bd3ec6376e68a41f9f95f5ed170e2fcf22d4eb27a1f8cb361d0508f6e0557f3", size = 433534 },
-    { url = "https://files.pythonhosted.org/packages/aa/ca/eadf6f9c8fa5e31d40993e3db153fb5ed0b11008ad5d9de98a95045bed84/aiohttp-3.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:110e448e02c729bcebb18c60b9214a87ba33bac4a9fa5e9a5f139938b56c6cb1", size = 460446 },
-    { url = "https://files.pythonhosted.org/packages/78/e9/d76bf503005709e390122d34e15256b88f7008e246c4bdbe915cd4f1adce/aiohttp-3.13.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5029cc80718bbd545123cd8fe5d15025eccaaaace5d0eeec6bd556ad6163d61", size = 742930 },
-    { url = "https://files.pythonhosted.org/packages/57/00/4b7b70223deaebd9bb85984d01a764b0d7bd6526fcdc73cca83bcbe7243e/aiohttp-3.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bb6bf5811620003614076bdc807ef3b5e38244f9d25ca5fe888eaccea2a9832", size = 496927 },
-    { url = "https://files.pythonhosted.org/packages/9c/f5/0fb20fb49f8efdcdce6cd8127604ad2c503e754a8f139f5e02b01626523f/aiohttp-3.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a84792f8631bf5a94e52d9cc881c0b824ab42717165a5579c760b830d9392ac9", size = 497141 },
-    { url = "https://files.pythonhosted.org/packages/3b/86/b7c870053e36a94e8951b803cb5b909bfbc9b90ca941527f5fcafbf6b0fa/aiohttp-3.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57653eac22c6a4c13eb22ecf4d673d64a12f266e72785ab1c8b8e5940d0e8090", size = 1732476 },
-    { url = "https://files.pythonhosted.org/packages/b5/e5/4e161f84f98d80c03a238671b4136e6530453d65262867d989bbe78244d0/aiohttp-3.13.5-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5e5f7debc7a57af53fdf5c5009f9391d9f4c12867049d509bf7bb164a6e295b", size = 1706507 },
-    { url = "https://files.pythonhosted.org/packages/d4/56/ea11a9f01518bd5a2a2fcee869d248c4b8a0cfa0bb13401574fa31adf4d4/aiohttp-3.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c719f65bebcdf6716f10e9eff80d27567f7892d8988c06de12bbbd39307c6e3a", size = 1773465 },
-    { url = "https://files.pythonhosted.org/packages/eb/40/333ca27fb74b0383f17c90570c748f7582501507307350a79d9f9f3c6eb1/aiohttp-3.13.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d97f93fdae594d886c5a866636397e2bcab146fd7a132fd6bb9ce182224452f8", size = 1873523 },
-    { url = "https://files.pythonhosted.org/packages/f0/d2/e2f77eef1acb7111405433c707dc735e63f67a56e176e72e9e7a2cd3f493/aiohttp-3.13.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3df334e39d4c2f899a914f1dba283c1aadc311790733f705182998c6f7cae665", size = 1754113 },
-    { url = "https://files.pythonhosted.org/packages/fb/56/3f653d7f53c89669301ec9e42c95233e2a0c0a6dd051269e6e678db4fdb0/aiohttp-3.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe6970addfea9e5e081401bcbadf865d2b6da045472f58af08427e108d618540", size = 1562351 },
-    { url = "https://files.pythonhosted.org/packages/ec/a6/9b3e91eb8ae791cce4ee736da02211c85c6f835f1bdfac0594a8a3b7018c/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7becdf835feff2f4f335d7477f121af787e3504b48b449ff737afb35869ba7bb", size = 1693205 },
-    { url = "https://files.pythonhosted.org/packages/98/fc/bfb437a99a2fcebd6b6eaec609571954de2ed424f01c352f4b5504371dd3/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:676e5651705ad5d8a70aeb8eb6936c436d8ebbd56e63436cb7dd9bb36d2a9a46", size = 1730618 },
-    { url = "https://files.pythonhosted.org/packages/e4/b6/c8534862126191a034f68153194c389addc285a0f1347d85096d349bbc15/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:9b16c653d38eb1a611cc898c41e76859ca27f119d25b53c12875fd0474ae31a8", size = 1745185 },
-    { url = "https://files.pythonhosted.org/packages/0b/93/4ca8ee2ef5236e2707e0fd5fecb10ce214aee1ff4ab307af9c558bda3b37/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:999802d5fa0389f58decd24b537c54aa63c01c3219ce17d1214cbda3c2b22d2d", size = 1557311 },
-    { url = "https://files.pythonhosted.org/packages/57/ae/76177b15f18c5f5d094f19901d284025db28eccc5ae374d1d254181d33f4/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ec707059ee75732b1ba130ed5f9580fe10ff75180c812bc267ded039db5128c6", size = 1773147 },
-    { url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356 },
-    { url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637 },
-    { url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896 },
-    { url = "https://files.pythonhosted.org/packages/5d/ce/46572759afc859e867a5bc8ec3487315869013f59281ce61764f76d879de/aiohttp-3.13.5-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:eb4639f32fd4a9904ab8fb45bf3383ba71137f3d9d4ba25b3b3f3109977c5b8c", size = 745721 },
-    { url = "https://files.pythonhosted.org/packages/13/fe/8a2efd7626dbe6049b2ef8ace18ffda8a4dfcbe1bcff3ac30c0c7575c20b/aiohttp-3.13.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:7e5dc4311bd5ac493886c63cbf76ab579dbe4641268e7c74e48e774c74b6f2be", size = 497663 },
-    { url = "https://files.pythonhosted.org/packages/9b/91/cc8cc78a111826c54743d88651e1687008133c37e5ee615fee9b57990fac/aiohttp-3.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:756c3c304d394977519824449600adaf2be0ccee76d206ee339c5e76b70ded25", size = 499094 },
-    { url = "https://files.pythonhosted.org/packages/0a/33/a8362cb15cf16a3af7e86ed11962d5cd7d59b449202dc576cdc731310bde/aiohttp-3.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecc26751323224cf8186efcf7fbcbc30f4e1d8c7970659daf25ad995e4032a56", size = 1726701 },
-    { url = "https://files.pythonhosted.org/packages/45/0c/c091ac5c3a17114bd76cbf85d674650969ddf93387876cf67f754204bd77/aiohttp-3.13.5-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10a75acfcf794edf9d8db50e5a7ec5fc818b2a8d3f591ce93bc7b1210df016d2", size = 1683360 },
-    { url = "https://files.pythonhosted.org/packages/23/73/bcee1c2b79bc275e964d1446c55c54441a461938e70267c86afaae6fba27/aiohttp-3.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0f7a18f258d124cd678c5fe072fe4432a4d5232b0657fca7c1847f599233c83a", size = 1773023 },
-    { url = "https://files.pythonhosted.org/packages/c7/ef/720e639df03004fee2d869f771799d8c23046dec47d5b81e396c7cda583a/aiohttp-3.13.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:df6104c009713d3a89621096f3e3e88cc323fd269dbd7c20afe18535094320be", size = 1853795 },
-    { url = "https://files.pythonhosted.org/packages/bd/c9/989f4034fb46841208de7aeeac2c6d8300745ab4f28c42f629ba77c2d916/aiohttp-3.13.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a94f7de7c0c3b616627aaad530fe2cb620084a8b144d3be7b6ecfe95bae3b", size = 1730405 },
-    { url = "https://files.pythonhosted.org/packages/ce/75/ee1fd286ca7dc599d824b5651dad7b3be7ff8d9a7e7b3fe9820d9180f7db/aiohttp-3.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c974fb66180e58709b6fc402846f13791240d180b74de81d23913abe48e96d94", size = 1558082 },
-    { url = "https://files.pythonhosted.org/packages/c3/20/1e9e6650dfc436340116b7aa89ff8cb2bbdf0abc11dfaceaad8f74273a10/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6e27ea05d184afac78aabbac667450c75e54e35f62238d44463131bd3f96753d", size = 1692346 },
-    { url = "https://files.pythonhosted.org/packages/d8/40/8ebc6658d48ea630ac7903912fe0dd4e262f0e16825aa4c833c56c9f1f56/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a79a6d399cef33a11b6f004c67bb07741d91f2be01b8d712d52c75711b1e07c7", size = 1698891 },
-    { url = "https://files.pythonhosted.org/packages/d8/78/ea0ae5ec8ba7a5c10bdd6e318f1ba5e76fcde17db8275188772afc7917a4/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c632ce9c0b534fbe25b52c974515ed674937c5b99f549a92127c85f771a78772", size = 1742113 },
-    { url = "https://files.pythonhosted.org/packages/8a/66/9d308ed71e3f2491be1acb8769d96c6f0c47d92099f3bc9119cada27b357/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:fceedde51fbd67ee2bcc8c0b33d0126cc8b51ef3bbde2f86662bd6d5a6f10ec5", size = 1553088 },
-    { url = "https://files.pythonhosted.org/packages/da/a6/6cc25ed8dfc6e00c90f5c6d126a98e2cf28957ad06fa1036bd34b6f24a2c/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f92995dfec9420bb69ae629abf422e516923ba79ba4403bc750d94fb4a6c68c1", size = 1757976 },
-    { url = "https://files.pythonhosted.org/packages/c1/2b/cce5b0ffe0de99c83e5e36d8f828e4161e415660a9f3e58339d07cce3006/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20ae0ff08b1f2c8788d6fb85afcb798654ae6ba0b747575f8562de738078457b", size = 1712444 },
-    { url = "https://files.pythonhosted.org/packages/6c/cf/9e1795b4160c58d29421eafd1a69c6ce351e2f7c8d3c6b7e4ca44aea1a5b/aiohttp-3.13.5-cp314-cp314-win32.whl", hash = "sha256:b20df693de16f42b2472a9c485e1c948ee55524786a0a34345511afdd22246f3", size = 438128 },
-    { url = "https://files.pythonhosted.org/packages/22/4d/eaedff67fc805aeba4ba746aec891b4b24cebb1a7d078084b6300f79d063/aiohttp-3.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:f85c6f327bf0b8c29da7d93b1cabb6363fb5e4e160a32fa241ed2dce21b73162", size = 464029 },
-    { url = "https://files.pythonhosted.org/packages/79/11/c27d9332ee20d68dd164dc12a6ecdef2e2e35ecc97ed6cf0d2442844624b/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1efb06900858bb618ff5cee184ae2de5828896c448403d51fb633f09e109be0a", size = 778758 },
-    { url = "https://files.pythonhosted.org/packages/04/fb/377aead2e0a3ba5f09b7624f702a964bdf4f08b5b6728a9799830c80041e/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fee86b7c4bd29bdaf0d53d14739b08a106fdda809ca5fe032a15f52fae5fe254", size = 512883 },
-    { url = "https://files.pythonhosted.org/packages/bb/a6/aa109a33671f7a5d3bd78b46da9d852797c5e665bfda7d6b373f56bff2ec/aiohttp-3.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:20058e23909b9e65f9da62b396b77dfa95965cbe840f8def6e572538b1d32e36", size = 516668 },
-    { url = "https://files.pythonhosted.org/packages/79/b3/ca078f9f2fa9563c36fb8ef89053ea2bb146d6f792c5104574d49d8acb63/aiohttp-3.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cf20a8d6868cb15a73cab329ffc07291ba8c22b1b88176026106ae39aa6df0f", size = 1883461 },
-    { url = "https://files.pythonhosted.org/packages/b7/e3/a7ad633ca1ca497b852233a3cce6906a56c3225fb6d9217b5e5e60b7419d/aiohttp-3.13.5-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:330f5da04c987f1d5bdb8ae189137c77139f36bd1cb23779ca1a354a4b027800", size = 1747661 },
-    { url = "https://files.pythonhosted.org/packages/33/b9/cd6fe579bed34a906d3d783fe60f2fa297ef55b27bb4538438ee49d4dc41/aiohttp-3.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6f1cbf0c7926d315c3c26c2da41fd2b5d2fe01ac0e157b78caefc51a782196cf", size = 1863800 },
-    { url = "https://files.pythonhosted.org/packages/c0/3f/2c1e2f5144cefa889c8afd5cf431994c32f3b29da9961698ff4e3811b79a/aiohttp-3.13.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:53fc049ed6390d05423ba33103ded7281fe897cf97878f369a527070bd95795b", size = 1958382 },
-    { url = "https://files.pythonhosted.org/packages/66/1d/f31ec3f1013723b3babe3609e7f119c2c2fb6ef33da90061a705ef3e1bc8/aiohttp-3.13.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:898703aa2667e3c5ca4c54ca36cd73f58b7a38ef87a5606414799ebce4d3fd3a", size = 1803724 },
-    { url = "https://files.pythonhosted.org/packages/0e/b4/57712dfc6f1542f067daa81eb61da282fab3e6f1966fca25db06c4fc62d5/aiohttp-3.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0494a01ca9584eea1e5fbd6d748e61ecff218c51b576ee1999c23db7066417d8", size = 1640027 },
-    { url = "https://files.pythonhosted.org/packages/25/3c/734c878fb43ec083d8e31bf029daae1beafeae582d1b35da234739e82ee7/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6cf81fe010b8c17b09495cbd15c1d35afbc8fb405c0c9cf4738e5ae3af1d65be", size = 1806644 },
-    { url = "https://files.pythonhosted.org/packages/20/a5/f671e5cbec1c21d044ff3078223f949748f3a7f86b14e34a365d74a5d21f/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:c564dd5f09ddc9d8f2c2d0a301cd30a79a2cc1b46dd1a73bef8f0038863d016b", size = 1791630 },
-    { url = "https://files.pythonhosted.org/packages/0b/63/fb8d0ad63a0b8a99be97deac8c04dacf0785721c158bdf23d679a87aa99e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2994be9f6e51046c4f864598fd9abeb4fba6e88f0b2152422c9666dcd4aea9c6", size = 1809403 },
-    { url = "https://files.pythonhosted.org/packages/59/0c/bfed7f30662fcf12206481c2aac57dedee43fe1c49275e85b3a1e1742294/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:157826e2fa245d2ef46c83ea8a5faf77ca19355d278d425c29fda0beb3318037", size = 1634924 },
-    { url = "https://files.pythonhosted.org/packages/17/d6/fd518d668a09fd5a3319ae5e984d4d80b9a4b3df4e21c52f02251ef5a32e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a8aca50daa9493e9e13c0f566201a9006f080e7c50e5e90d0b06f53146a54500", size = 1836119 },
-    { url = "https://files.pythonhosted.org/packages/78/b7/15fb7a9d52e112a25b621c67b69c167805cb1f2ab8f1708a5c490d1b52fe/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3b13560160d07e047a93f23aaa30718606493036253d5430887514715b67c9d9", size = 1772072 },
-    { url = "https://files.pythonhosted.org/packages/7e/df/57ba7f0c4a553fc2bd8b6321df236870ec6fd64a2a473a8a13d4f733214e/aiohttp-3.13.5-cp314-cp314t-win32.whl", hash = "sha256:9a0f4474b6ea6818b41f82172d799e4b3d29e22c2c520ce4357856fced9af2f8", size = 471819 },
-    { url = "https://files.pythonhosted.org/packages/62/29/2f8418269e46454a26171bfdd6a055d74febf32234e474930f2f60a17145/aiohttp-3.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:18a2f6c1182c51baa1d28d68fea51513cb2a76612f038853c0ad3c145423d3d9", size = 505441 },
+sdist = { url = "https://files.pythonhosted.org/packages/45/4a/064321452809dae953c1ed6e017504e72551a26b6f5708a5a80e4bf556ff/aiohttp-3.13.4.tar.gz", hash = "sha256:d97a6d09c66087890c2ab5d49069e1e570583f7ac0314ecf98294c1b6aaebd38", size = 7859748 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/bd/ede278648914cabbabfdf95e436679b5d4156e417896a9b9f4587169e376/aiohttp-3.13.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ee62d4471ce86b108b19c3364db4b91180d13fe3510144872d6bad5401957360", size = 752158 },
+    { url = "https://files.pythonhosted.org/packages/90/de/581c053253c07b480b03785196ca5335e3c606a37dc73e95f6527f1591fe/aiohttp-3.13.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c0fd8f41b54b58636402eb493afd512c23580456f022c1ba2db0f810c959ed0d", size = 501037 },
+    { url = "https://files.pythonhosted.org/packages/fa/f9/a5ede193c08f13cc42c0a5b50d1e246ecee9115e4cf6e900d8dbd8fd6acb/aiohttp-3.13.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4baa48ce49efd82d6b1a0be12d6a36b35e5594d1dd42f8bfba96ea9f8678b88c", size = 501556 },
+    { url = "https://files.pythonhosted.org/packages/d6/10/88ff67cd48a6ec36335b63a640abe86135791544863e0cfe1f065d6cef7a/aiohttp-3.13.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d738ebab9f71ee652d9dbd0211057690022201b11197f9a7324fd4dba128aa97", size = 1757314 },
+    { url = "https://files.pythonhosted.org/packages/8b/15/fdb90a5cf5a1f52845c276e76298c75fbbcc0ac2b4a86551906d54529965/aiohttp-3.13.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0ce692c3468fa831af7dceed52edf51ac348cebfc8d3feb935927b63bd3e8576", size = 1731819 },
+    { url = "https://files.pythonhosted.org/packages/ec/df/28146785a007f7820416be05d4f28cc207493efd1e8c6c1068e9bdc29198/aiohttp-3.13.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8e08abcfe752a454d2cb89ff0c08f2d1ecd057ae3e8cc6d84638de853530ebab", size = 1793279 },
+    { url = "https://files.pythonhosted.org/packages/10/47/689c743abf62ea7a77774d5722f220e2c912a77d65d368b884d9779ef41b/aiohttp-3.13.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5977f701b3fff36367a11087f30ea73c212e686d41cd363c50c022d48b011d8d", size = 1891082 },
+    { url = "https://files.pythonhosted.org/packages/b0/b6/f7f4f318c7e58c23b761c9b13b9a3c9b394e0f9d5d76fbc6622fa98509f6/aiohttp-3.13.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:54203e10405c06f8b6020bd1e076ae0fe6c194adcee12a5a78af3ffa3c57025e", size = 1773938 },
+    { url = "https://files.pythonhosted.org/packages/aa/06/f207cb3121852c989586a6fc16ff854c4fcc8651b86c5d3bd1fc83057650/aiohttp-3.13.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:358a6af0145bc4dda037f13167bef3cce54b132087acc4c295c739d05d16b1c3", size = 1579548 },
+    { url = "https://files.pythonhosted.org/packages/6c/58/e1289661a32161e24c1fe479711d783067210d266842523752869cc1d9c2/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:898ea1850656d7d61832ef06aa9846ab3ddb1621b74f46de78fbc5e1a586ba83", size = 1714669 },
+    { url = "https://files.pythonhosted.org/packages/96/0a/3e86d039438a74a86e6a948a9119b22540bae037d6ba317a042ae3c22711/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7bc30cceb710cf6a44e9617e43eebb6e3e43ad855a34da7b4b6a73537d8a6763", size = 1754175 },
+    { url = "https://files.pythonhosted.org/packages/f4/30/e717fc5df83133ba467a560b6d8ef20197037b4bb5d7075b90037de1018e/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4a31c0c587a8a038f19a4c7e60654a6c899c9de9174593a13e7cc6e15ff271f9", size = 1762049 },
+    { url = "https://files.pythonhosted.org/packages/e4/28/8f7a2d4492e336e40005151bdd94baf344880a4707573378579f833a64c1/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2062f675f3fe6e06d6113eb74a157fb9df58953ffed0cdb4182554b116545758", size = 1570861 },
+    { url = "https://files.pythonhosted.org/packages/78/45/12e1a3d0645968b1c38de4b23fdf270b8637735ea057d4f84482ff918ad9/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d1ba8afb847ff80626d5e408c1fdc99f942acc877d0702fe137015903a220a9", size = 1790003 },
+    { url = "https://files.pythonhosted.org/packages/eb/0f/60374e18d590de16dcb39d6ff62f39c096c1b958e6f37727b5870026ea30/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b08149419994cdd4d5eecf7fd4bc5986b5a9380285bcd01ab4c0d6bfca47b79d", size = 1737289 },
+    { url = "https://files.pythonhosted.org/packages/02/bf/535e58d886cfbc40a8b0013c974afad24ef7632d645bca0b678b70033a60/aiohttp-3.13.4-cp312-cp312-win32.whl", hash = "sha256:fc432f6a2c4f720180959bc19aa37259651c1a4ed8af8afc84dd41c60f15f791", size = 434185 },
+    { url = "https://files.pythonhosted.org/packages/1e/1a/d92e3325134ebfff6f4069f270d3aac770d63320bd1fcd0eca023e74d9a8/aiohttp-3.13.4-cp312-cp312-win_amd64.whl", hash = "sha256:6148c9ae97a3e8bff9a1fc9c757fa164116f86c100468339730e717590a3fb77", size = 461285 },
+    { url = "https://files.pythonhosted.org/packages/e3/ac/892f4162df9b115b4758d615f32ec63d00f3084c705ff5526630887b9b42/aiohttp-3.13.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:63dd5e5b1e43b8fb1e91b79b7ceba1feba588b317d1edff385084fcc7a0a4538", size = 745744 },
+    { url = "https://files.pythonhosted.org/packages/97/a9/c5b87e4443a2f0ea88cb3000c93a8fdad1ee63bffc9ded8d8c8e0d66efc6/aiohttp-3.13.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:746ac3cc00b5baea424dacddea3ec2c2702f9590de27d837aa67004db1eebc6e", size = 498178 },
+    { url = "https://files.pythonhosted.org/packages/94/42/07e1b543a61250783650df13da8ddcdc0d0a5538b2bd15cef6e042aefc61/aiohttp-3.13.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bda8f16ea99d6a6705e5946732e48487a448be874e54a4f73d514660ff7c05d3", size = 498331 },
+    { url = "https://files.pythonhosted.org/packages/20/d6/492f46bf0328534124772d0cf58570acae5b286ea25006900650f69dae0e/aiohttp-3.13.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4b061e7b5f840391e3f64d0ddf672973e45c4cfff7a0feea425ea24e51530fc2", size = 1744414 },
+    { url = "https://files.pythonhosted.org/packages/e2/4d/e02627b2683f68051246215d2d62b2d2f249ff7a285e7a858dc47d6b6a14/aiohttp-3.13.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b252e8d5cd66184b570d0d010de742736e8a4fab22c58299772b0c5a466d4b21", size = 1719226 },
+    { url = "https://files.pythonhosted.org/packages/7b/6c/5d0a3394dd2b9f9aeba6e1b6065d0439e4b75d41f1fb09a3ec010b43552b/aiohttp-3.13.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:20af8aad61d1803ff11152a26146d8d81c266aa8c5aa9b4504432abb965c36a0", size = 1782110 },
+    { url = "https://files.pythonhosted.org/packages/0d/2d/c20791e3437700a7441a7edfb59731150322424f5aadf635602d1d326101/aiohttp-3.13.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:13a5cc924b59859ad2adb1478e31f410a7ed46e92a2a619d6d1dd1a63c1a855e", size = 1884809 },
+    { url = "https://files.pythonhosted.org/packages/c8/94/d99dbfbd1924a87ef643833932eb2a3d9e5eee87656efea7d78058539eff/aiohttp-3.13.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:534913dfb0a644d537aebb4123e7d466d94e3be5549205e6a31f72368980a81a", size = 1764938 },
+    { url = "https://files.pythonhosted.org/packages/49/61/3ce326a1538781deb89f6cf5e094e2029cd308ed1e21b2ba2278b08426f6/aiohttp-3.13.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:320e40192a2dcc1cf4b5576936e9652981ab596bf81eb309535db7e2f5b5672f", size = 1570697 },
+    { url = "https://files.pythonhosted.org/packages/b6/77/4ab5a546857bb3028fbaf34d6eea180267bdab022ee8b1168b1fcde4bfdd/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9e587fcfce2bcf06526a43cb705bdee21ac089096f2e271d75de9c339db3100c", size = 1702258 },
+    { url = "https://files.pythonhosted.org/packages/79/63/d8f29021e39bc5af8e5d5e9da1b07976fb9846487a784e11e4f4eeda4666/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9eb9c2eea7278206b5c6c1441fdd9dc420c278ead3f3b2cc87f9b693698cc500", size = 1740287 },
+    { url = "https://files.pythonhosted.org/packages/55/3a/cbc6b3b124859a11bc8055d3682c26999b393531ef926754a3445b99dfef/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:29be00c51972b04bf9d5c8f2d7f7314f48f96070ca40a873a53056e652e805f7", size = 1753011 },
+    { url = "https://files.pythonhosted.org/packages/e0/30/836278675205d58c1368b21520eab9572457cf19afd23759216c04483048/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:90c06228a6c3a7c9f776fe4fc0b7ff647fffd3bed93779a6913c804ae00c1073", size = 1566359 },
+    { url = "https://files.pythonhosted.org/packages/50/b4/8032cc9b82d17e4277704ba30509eaccb39329dc18d6a35f05e424439e32/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:a533ec132f05fd9a1d959e7f34184cd7d5e8511584848dab85faefbaac573069", size = 1785537 },
+    { url = "https://files.pythonhosted.org/packages/17/7d/5873e98230bde59f493bf1f7c3e327486a4b5653fa401144704df5d00211/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1c946f10f413836f82ea4cfb90200d2a59578c549f00857e03111cf45ad01ca5", size = 1740752 },
+    { url = "https://files.pythonhosted.org/packages/7b/f2/13e46e0df051494d7d3c68b7f72d071f48c384c12716fc294f75d5b1a064/aiohttp-3.13.4-cp313-cp313-win32.whl", hash = "sha256:48708e2706106da6967eff5908c78ca3943f005ed6bcb75da2a7e4da94ef8c70", size = 433187 },
+    { url = "https://files.pythonhosted.org/packages/ea/c0/649856ee655a843c8f8664592cfccb73ac80ede6a8c8db33a25d810c12db/aiohttp-3.13.4-cp313-cp313-win_amd64.whl", hash = "sha256:74a2eb058da44fa3a877a49e2095b591d4913308bb424c418b77beb160c55ce3", size = 459778 },
+    { url = "https://files.pythonhosted.org/packages/6d/29/6657cc37ae04cacc2dbf53fb730a06b6091cc4cbe745028e047c53e6d840/aiohttp-3.13.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:e0a2c961fc92abeff61d6444f2ce6ad35bb982db9fc8ff8a47455beacf454a57", size = 749363 },
+    { url = "https://files.pythonhosted.org/packages/90/7f/30ccdf67ca3d24b610067dc63d64dcb91e5d88e27667811640644aa4a85d/aiohttp-3.13.4-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:153274535985a0ff2bff1fb6c104ed547cec898a09213d21b0f791a44b14d933", size = 499317 },
+    { url = "https://files.pythonhosted.org/packages/93/13/e372dd4e68ad04ee25dafb050c7f98b0d91ea643f7352757e87231102555/aiohttp-3.13.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:351f3171e2458da3d731ce83f9e6b9619e325c45cbd534c7759750cabf453ad7", size = 500477 },
+    { url = "https://files.pythonhosted.org/packages/e5/fe/ee6298e8e586096fb6f5eddd31393d8544f33ae0792c71ecbb4c2bef98ac/aiohttp-3.13.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f989ac8bc5595ff761a5ccd32bdb0768a117f36dd1504b1c2c074ed5d3f4df9c", size = 1737227 },
+    { url = "https://files.pythonhosted.org/packages/b0/b9/a7a0463a09e1a3fe35100f74324f23644bfc3383ac5fd5effe0722a5f0b7/aiohttp-3.13.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d36fc1709110ec1e87a229b201dd3ddc32aa01e98e7868083a794609b081c349", size = 1694036 },
+    { url = "https://files.pythonhosted.org/packages/57/7c/8972ae3fb7be00a91aee6b644b2a6a909aedb2c425269a3bfd90115e6f8f/aiohttp-3.13.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:42adaeea83cbdf069ab94f5103ce0787c21fb1a0153270da76b59d5578302329", size = 1786814 },
+    { url = "https://files.pythonhosted.org/packages/93/01/c81e97e85c774decbaf0d577de7d848934e8166a3a14ad9f8aa5be329d28/aiohttp-3.13.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:92deb95469928cc41fd4b42a95d8012fa6df93f6b1c0a83af0ffbc4a5e218cde", size = 1866676 },
+    { url = "https://files.pythonhosted.org/packages/5a/5f/5b46fe8694a639ddea2cd035bf5729e4677ea882cb251396637e2ef1590d/aiohttp-3.13.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c0c7c07c4257ef3a1df355f840bc62d133bcdef5c1c5ba75add3c08553e2eed", size = 1740842 },
+    { url = "https://files.pythonhosted.org/packages/20/a2/0d4b03d011cca6b6b0acba8433193c1e484efa8d705ea58295590fe24203/aiohttp-3.13.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f062c45de8a1098cb137a1898819796a2491aec4e637a06b03f149315dff4d8f", size = 1566508 },
+    { url = "https://files.pythonhosted.org/packages/98/17/e689fd500da52488ec5f889effd6404dece6a59de301e380f3c64f167beb/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:76093107c531517001114f0ebdb4f46858ce818590363e3e99a4a2280334454a", size = 1700569 },
+    { url = "https://files.pythonhosted.org/packages/d8/0d/66402894dbcf470ef7db99449e436105ea862c24f7ea4c95c683e635af35/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:6f6ec32162d293b82f8b63a16edc80769662fbd5ae6fbd4936d3206a2c2cc63b", size = 1707407 },
+    { url = "https://files.pythonhosted.org/packages/2f/eb/af0ab1a3650092cbd8e14ef29e4ab0209e1460e1c299996c3f8288b3f1ff/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5903e2db3d202a00ad9f0ec35a122c005e85d90c9836ab4cda628f01edf425e2", size = 1752214 },
+    { url = "https://files.pythonhosted.org/packages/5a/bf/72326f8a98e4c666f292f03c385545963cc65e358835d2a7375037a97b57/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2d5bea57be7aca98dbbac8da046d99b5557c5cf4e28538c4c786313078aca09e", size = 1562162 },
+    { url = "https://files.pythonhosted.org/packages/67/9f/13b72435f99151dd9a5469c96b3b5f86aa29b7e785ca7f35cf5e538f74c0/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:bcf0c9902085976edc0232b75006ef38f89686901249ce14226b6877f88464fb", size = 1768904 },
+    { url = "https://files.pythonhosted.org/packages/18/bc/28d4970e7d5452ac7776cdb5431a1164a0d9cf8bd2fffd67b4fb463aa56d/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3295f98bfeed2e867cab588f2a146a9db37a85e3ae9062abf46ba062bd29165", size = 1723378 },
+    { url = "https://files.pythonhosted.org/packages/53/74/b32458ca1a7f34d65bdee7aef2036adbe0438123d3d53e2b083c453c24dd/aiohttp-3.13.4-cp314-cp314-win32.whl", hash = "sha256:a598a5c5767e1369d8f5b08695cab1d8160040f796c4416af76fd773d229b3c9", size = 438711 },
+    { url = "https://files.pythonhosted.org/packages/40/b2/54b487316c2df3e03a8f3435e9636f8a81a42a69d942164830d193beb56a/aiohttp-3.13.4-cp314-cp314-win_amd64.whl", hash = "sha256:c555db4bc7a264bead5a7d63d92d41a1122fcd39cc62a4db815f45ad46f9c2c8", size = 464977 },
+    { url = "https://files.pythonhosted.org/packages/47/fb/e41b63c6ce71b07a59243bb8f3b457ee0c3402a619acb9d2c0d21ef0e647/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:45abbbf09a129825d13c18c7d3182fecd46d9da3cfc383756145394013604ac1", size = 781549 },
+    { url = "https://files.pythonhosted.org/packages/97/53/532b8d28df1e17e44c4d9a9368b78dcb6bf0b51037522136eced13afa9e8/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:74c80b2bc2c2adb7b3d1941b2b60701ee2af8296fc8aad8b8bc48bc25767266c", size = 514383 },
+    { url = "https://files.pythonhosted.org/packages/1b/1f/62e5d400603e8468cd635812d99cb81cfdc08127a3dc474c647615f31339/aiohttp-3.13.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c97989ae40a9746650fa196894f317dafc12227c808c774929dda0ff873a5954", size = 518304 },
+    { url = "https://files.pythonhosted.org/packages/90/57/2326b37b10896447e3c6e0cbef4fe2486d30913639a5cfd1332b5d870f82/aiohttp-3.13.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dae86be9811493f9990ef44fff1685f5c1a3192e9061a71a109d527944eed551", size = 1893433 },
+    { url = "https://files.pythonhosted.org/packages/d2/b4/a24d82112c304afdb650167ef2fe190957d81cbddac7460bedd245f765aa/aiohttp-3.13.4-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1db491abe852ca2fa6cc48a3341985b0174b3741838e1341b82ac82c8bd9e871", size = 1755901 },
+    { url = "https://files.pythonhosted.org/packages/9e/2d/0883ef9d878d7846287f036c162a951968f22aabeef3ac97b0bea6f76d5d/aiohttp-3.13.4-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0e5d701c0aad02a7dce72eef6b93226cf3734330f1a31d69ebbf69f33b86666e", size = 1876093 },
+    { url = "https://files.pythonhosted.org/packages/ad/52/9204bb59c014869b71971addad6778f005daa72a96eed652c496789d7468/aiohttp-3.13.4-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8ac32a189081ae0a10ba18993f10f338ec94341f0d5df8fff348043962f3c6f8", size = 1970815 },
+    { url = "https://files.pythonhosted.org/packages/d6/b5/e4eb20275a866dde0f570f411b36c6b48f7b53edfe4f4071aa1b0728098a/aiohttp-3.13.4-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e968cdaba43e45c73c3f306fca418c8009a957733bac85937c9f9cf3f4de27", size = 1816223 },
+    { url = "https://files.pythonhosted.org/packages/d8/23/e98075c5bb146aa61a1239ee1ac7714c85e814838d6cebbe37d3fe19214a/aiohttp-3.13.4-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca114790c9144c335d538852612d3e43ea0f075288f4849cf4b05d6cd2238ce7", size = 1649145 },
+    { url = "https://files.pythonhosted.org/packages/d6/c1/7bad8be33bb06c2bb224b6468874346026092762cbec388c3bdb65a368ee/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ea2e071661ba9cfe11eabbc81ac5376eaeb3061f6e72ec4cc86d7cdd1ffbdbbb", size = 1816562 },
+    { url = "https://files.pythonhosted.org/packages/5c/10/c00323348695e9a5e316825969c88463dcc24c7e9d443244b8a2c9cf2eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:34e89912b6c20e0fd80e07fa401fd218a410aa1ce9f1c2f1dad6db1bd0ce0927", size = 1800333 },
+    { url = "https://files.pythonhosted.org/packages/84/43/9b2147a1df3559f49bd723e22905b46a46c068a53adb54abdca32c4de180/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0e217cf9f6a42908c52b46e42c568bd57adc39c9286ced31aaace614b6087965", size = 1820617 },
+    { url = "https://files.pythonhosted.org/packages/a9/7f/b3481a81e7a586d02e99387b18c6dafff41285f6efd3daa2124c01f87eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:0c296f1221e21ba979f5ac1964c3b78cfde15c5c5f855ffd2caab337e9cd9182", size = 1643417 },
+    { url = "https://files.pythonhosted.org/packages/8f/72/07181226bc99ce1124e0f89280f5221a82d3ae6a6d9d1973ce429d48e52b/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:d99a9d168ebaffb74f36d011750e490085ac418f4db926cce3989c8fe6cb6b1b", size = 1849286 },
+    { url = "https://files.pythonhosted.org/packages/1a/e6/1b3566e103eca6da5be4ae6713e112a053725c584e96574caf117568ffef/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cb19177205d93b881f3f89e6081593676043a6828f59c78c17a0fd6c1fbed2ba", size = 1782635 },
+    { url = "https://files.pythonhosted.org/packages/37/58/1b11c71904b8d079eb0c39fe664180dd1e14bebe5608e235d8bfbadc8929/aiohttp-3.13.4-cp314-cp314t-win32.whl", hash = "sha256:c606aa5656dab6552e52ca368e43869c916338346bfaf6304e15c58fb113ea30", size = 472537 },
+    { url = "https://files.pythonhosted.org/packages/bc/8f/87c56a1a1977d7dddea5b31e12189665a140fdb48a71e9038ff90bb564ec/aiohttp-3.13.4-cp314-cp314t-win_amd64.whl", hash = "sha256:014dcc10ec8ab8db681f0d68e939d1e9286a5aa2b993cbbdb0db130853e02144", size = 506381 },
 ]
 
 [[package]]
@@ -3723,7 +3723,7 @@ wheels = [
 
 [[package]]
 name = "litellm"
-version = "1.83.4"
+version = "1.83.14"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohttp" },
@@ -3739,9 +3739,9 @@ dependencies = [
     { name = "tiktoken" },
     { name = "tokenizers" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/03/c4/30469c06ae7437a4406bc11e3c433cfd380a6771068cca15ea918dcd158f/litellm-1.83.4.tar.gz", hash = "sha256:6458d2030a41229460b321adee00517a91dbd8e63213cc953d355cb41d16f2d4", size = 17733899 }
+sdist = { url = "https://files.pythonhosted.org/packages/8d/7c/c095649380adc96c8630273c1768c2ad1e74aa2ee1dd8dd05d218a60569f/litellm-1.83.14.tar.gz", hash = "sha256:24aef9b47cdc424c833e32f3727f411741c690832cd1fe4405e0077144fe09c9", size = 14836599 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b8/bd/df19d3f8f6654535ee343a341fd921f81c411abf601a53e3eaef58129b02/litellm-1.83.4-py3-none-any.whl", hash = "sha256:17d7b4d48d47aca988ea4f762ddda5e7bd72cda3270192b22813d0330869d7b4", size = 16015555 },
+    { url = "https://files.pythonhosted.org/packages/7f/5c/1b5691575420135e90578543b2bf219497caa33cfd0af64cb38f30288450/litellm-1.83.14-py3-none-any.whl", hash = "sha256:92b11ba2a32cf80707ddf388d18526696c7999a21b418c5e3b6eda1243d2cfdb", size = 16457054 },
 ]
 
 [[package]]
@@ -5124,7 +5124,7 @@ wheels = [
 
 [[package]]
 name = "openai"
-version = "2.30.0"
+version = "2.24.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -5136,9 +5136,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/88/15/52580c8fbc16d0675d516e8749806eda679b16de1e4434ea06fb6feaa610/openai-2.30.0.tar.gz", hash = "sha256:92f7661c990bda4b22a941806c83eabe4896c3094465030dd882a71abe80c885", size = 676084 }
+sdist = { url = "https://files.pythonhosted.org/packages/55/13/17e87641b89b74552ed408a92b231283786523edddc95f3545809fab673c/openai-2.24.0.tar.gz", hash = "sha256:1e5769f540dbd01cb33bc4716a23e67b9d695161a734aff9c5f925e2bf99a673", size = 658717 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/2a/9e/5bfa2270f902d5b92ab7d41ce0475b8630572e71e349b2a4996d14bdda93/openai-2.30.0-py3-none-any.whl", hash = "sha256:9a5ae616888eb2748ec5e0c5b955a51592e0b201a11f4262db920f2a78c5231d", size = 1146656 },
+    { url = "https://files.pythonhosted.org/packages/c9/30/844dc675ee6902579b8eef01ed23917cc9319a1c9c0c14ec6e39340c96d0/openai-2.24.0-py3-none-any.whl", hash = "sha256:fed30480d7d6c884303287bde864980a4b137b60553ffbcf9ab4a233b7a73d94", size = 1120122 },
 ]
 
 [[package]]
@@ -6780,11 +6780,11 @@ wheels = [
 
 [[package]]
 name = "python-dotenv"
-version = "1.0.1"
+version = "1.2.2"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/bc/57/e84d88dfe0aec03b7a2d4327012c1627ab5f03652216c63d49846d7a6c58/python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca", size = 39115 }
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 },
+    { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101 },
 ]
 
 [[package]]
@@ -7947,7 +7947,7 @@ wheels = [
 
 [[package]]
 name = "surf-new-backend"
-version = "0.0.15"
+version = "0.0.24"
 source = { editable = "." }
 dependencies = [
     { name = "alembic" },
@@ -8045,7 +8045,7 @@ requires-dist = [
     { name = "composio", specifier = ">=0.10.9" },
     { name = "datasets", specifier = ">=2.21.0" },
     { name = "daytona", specifier = ">=0.146.0" },
-    { name = "deepagents", specifier = ">=0.4.12" },
+    { name = "deepagents", specifier = ">=0.4.12,<0.5" },
     { name = "discord-py", specifier = ">=2.5.2" },
     { name = "docling", specifier = ">=2.15.0" },
     { name = "elasticsearch", specifier = ">=9.1.1" },
@@ -8070,7 +8070,7 @@ requires-dist = [
     { name = "langgraph", specifier = ">=1.1.3" },
     { name = "langgraph-checkpoint-postgres", specifier = ">=3.0.2" },
     { name = "linkup-sdk", specifier = ">=0.2.4" },
-    { name = "litellm", specifier = ">=1.83.0" },
+    { name = "litellm", specifier = ">=1.83.7" },
     { name = "llama-cloud-services", specifier = ">=0.6.25" },
     { name = "markdown", specifier = ">=3.7" },
     { name = "markdownify", specifier = ">=0.14.1" },
diff --git a/surfsense_browser_extension/package.json b/surfsense_browser_extension/package.json
index e17a924ceb..028e653b35 100644
--- a/surfsense_browser_extension/package.json
+++ b/surfsense_browser_extension/package.json
@@ -1,7 +1,7 @@
 {
 	"name": "surfsense_browser_extension",
 	"displayName": "Surfsense Browser Extension",
-	"version": "0.0.15",
+	"version": "0.0.24",
 	"description": "Extension to collect Browsing History for SurfSense.",
 	"author": "https://github.com/MODSetter",
 	"engines": {
diff --git a/surfsense_desktop/.env.example b/surfsense_desktop/.env.example
new file mode 100644
index 0000000000..e127b99e01
--- /dev/null
+++ b/surfsense_desktop/.env.example
@@ -0,0 +1,10 @@
+# Electron-specific build-time configuration.
+# Set before running pnpm dist:mac / dist:win / dist:linux.
+
+# The hosted web frontend URL. Used to intercept OAuth redirects and keep them
+# inside the desktop app. Set to your production frontend domain.
+HOSTED_FRONTEND_URL=https://surfsense.net
+
+# PostHog analytics (leave empty to disable)
+POSTHOG_KEY=
+POSTHOG_HOST=https://assets.surfsense.com
diff --git a/surfsense_desktop/.gitignore b/surfsense_desktop/.gitignore
index 4bff253bb6..70e5f15b95 100644
--- a/surfsense_desktop/.gitignore
+++ b/surfsense_desktop/.gitignore
@@ -1,3 +1,4 @@
 node_modules/
 dist/
 release/
+.env
\ No newline at end of file
diff --git a/surfsense_desktop/README.md b/surfsense_desktop/README.md
index 80efefba8f..0f7a99e938 100644
--- a/surfsense_desktop/README.md
+++ b/surfsense_desktop/README.md
@@ -17,6 +17,8 @@ pnpm dev
 
 This starts the Next.js dev server and Electron concurrently. Hot reload works — edit the web app and changes appear immediately.
 
+On **Linux**, `pnpm dev` runs Electron through `scripts/electron-dev.mjs`: it sets `ELECTRON_DISABLE_SANDBOX=1` for the sandbox issue and passes **`--ozone-platform=x11`** (XWayland) unless **`SURFSENSE_ELECTRON_WAYLAND=1`** is set, so dev tends to behave closer to X11 for shortcuts and Ozone. Packaged Linux builds are unchanged.
+
 ## Configuration
 
 Two `.env` files control the build:
@@ -43,12 +45,13 @@ cd ../surfsense_desktop
 pnpm build
 ```
 
-**Step 3** — Package into a distributable:
+**Step 3** — Package into a distributable (after steps 1–2):
 
 ```bash
 pnpm dist:mac      # macOS (.dmg + .zip)
 pnpm dist:win      # Windows (.exe)
 pnpm dist:linux    # Linux (.deb + .AppImage)
+pnpm pack:dir      # optional: unpacked app only → release/… (run that binary yourself)
 ```
 
 **Step 4** — Find the output:
diff --git a/surfsense_desktop/assets/icon-128.png b/surfsense_desktop/assets/icon-128.png
new file mode 100644
index 0000000000..8be6ee21a6
Binary files /dev/null and b/surfsense_desktop/assets/icon-128.png differ
diff --git a/surfsense_desktop/assets/iconTemplate.png b/surfsense_desktop/assets/iconTemplate.png
new file mode 100644
index 0000000000..13d1c98407
Binary files /dev/null and b/surfsense_desktop/assets/iconTemplate.png differ
diff --git a/surfsense_desktop/assets/iconTemplate@2x.png b/surfsense_desktop/assets/iconTemplate@2x.png
new file mode 100644
index 0000000000..70710f739b
Binary files /dev/null and b/surfsense_desktop/assets/iconTemplate@2x.png differ
diff --git a/surfsense_desktop/build/entitlements.mac.plist b/surfsense_desktop/build/entitlements.mac.plist
new file mode 100644
index 0000000000..5647e7759a
--- /dev/null
+++ b/surfsense_desktop/build/entitlements.mac.plist
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <!-- Required for Electron's V8 JIT under hardened runtime -->
+    <key>com.apple.security.cs.allow-jit</key>
+    <true/>
+    <key>com.apple.security.cs.allow-unsigned-executable-memory</key>
+    <true/>
+
+    <!-- node-mac-permissions and other native deps load dylibs at runtime -->
+    <key>com.apple.security.cs.allow-dyld-environment-variables</key>
+    <true/>
+    <key>com.apple.security.cs.disable-library-validation</key>
+    <true/>
+
+    <!-- Networking (OAuth, API calls, auto-updater, deep links) -->
+    <key>com.apple.security.network.client</key>
+    <true/>
+    <key>com.apple.security.network.server</key>
+    <true/>
+
+    <!-- Screen Capture / Screenshot Assist -->
+    <key>com.apple.security.device.camera</key>
+    <true/>
+
+    <!-- Accessibility / Apple Events used by general-assist -->
+    <key>com.apple.security.automation.apple-events</key>
+    <true/>
+
+    <!-- File access for folder watcher / agent filesystem features -->
+    <key>com.apple.security.files.user-selected.read-write</key>
+    <true/>
+</dict>
+</plist>
diff --git a/surfsense_desktop/electron-builder.yml b/surfsense_desktop/electron-builder.yml
index 2c46c827aa..e4e7670ecf 100644
--- a/surfsense_desktop/electron-builder.yml
+++ b/surfsense_desktop/electron-builder.yml
@@ -46,11 +46,14 @@ mac:
   icon: assets/icon.icns
   category: public.app-category.productivity
   artifactName: "${productName}-${version}-${arch}.${ext}"
-  hardenedRuntime: false
+  hardenedRuntime: true
   gatekeeperAssess: false
+  entitlements: build/entitlements.mac.plist
+  entitlementsInherit: build/entitlements.mac.plist
+  notarize: true
   extendInfo:
-    NSAccessibilityUsageDescription: "SurfSense uses accessibility features to insert suggestions into the active application."
-    NSScreenCaptureUsageDescription: "SurfSense uses screen capture to analyze your screen and provide context-aware writing suggestions."
+    NSAccessibilityUsageDescription: "SurfSense uses accessibility features to bring the app to the foreground and interact with the active application when you use desktop assists."
+    NSScreenCaptureUsageDescription: "SurfSense uses screen capture so you can attach a selected region to chat (Screenshot Assist) or capture the full screen from the composer."
     NSAppleEventsUsageDescription: "SurfSense uses Apple Events to interact with the active application."
   target:
     - target: dmg
@@ -81,4 +84,5 @@ linux:
       Categories: Utility;Office;
   target:
     - deb
+    - rpm
     - AppImage
diff --git a/surfsense_desktop/package.json b/surfsense_desktop/package.json
index aded4ce42b..68032e9f48 100644
--- a/surfsense_desktop/package.json
+++ b/surfsense_desktop/package.json
@@ -1,18 +1,19 @@
 {
   "name": "surfsense-desktop",
-  "version": "0.0.15",
+  "version": "0.0.24",
   "description": "SurfSense Desktop App",
   "main": "dist/main.js",
   "scripts": {
-    "dev": "pnpm build && concurrently -k \"pnpm --dir ../surfsense_web dev\" \"wait-on http://localhost:3000 && electron .\"",
+    "dev": "pnpm build && concurrently -k \"pnpm --dir ../surfsense_web dev\" \"wait-on http://localhost:3000 && node scripts/electron-dev.mjs\"",
     "build": "node scripts/build-electron.mjs",
     "pack:dir": "pnpm build && electron-builder --dir --config electron-builder.yml",
+    "pack:dir:linux": "pnpm build && electron-builder --dir --linux --config electron-builder.yml -c.npmRebuild=false",
     "dist": "pnpm build && electron-builder --config electron-builder.yml",
     "dist:mac": "pnpm build && electron-builder --mac --config electron-builder.yml",
     "dist:win": "pnpm build && electron-builder --win --config electron-builder.yml",
-    "dist:linux": "pnpm build && electron-builder --linux --config electron-builder.yml",
+    "dist:linux": "pnpm build && electron-builder --linux --config electron-builder.yml -c.npmRebuild=false",
     "typecheck": "tsc --noEmit",
-    "postinstall": "electron-rebuild"
+    "postinstall": "node scripts/postinstall-rebuild.mjs"
   },
   "homepage": "https://github.com/MODSetter/SurfSense",
   "author": {
@@ -20,7 +21,7 @@
     "email": "rohan@surfsense.com"
   },
   "license": "MIT",
-  "packageManager": "pnpm@10.24.0",
+  "packageManager": "pnpm@10.26.0",
   "devDependencies": {
     "@electron/rebuild": "^4.0.3",
     "@types/node": "^25.5.0",
diff --git a/surfsense_desktop/scripts/build-electron.mjs b/surfsense_desktop/scripts/build-electron.mjs
index 90d76ef7af..75a3cdf619 100644
--- a/surfsense_desktop/scripts/build-electron.mjs
+++ b/surfsense_desktop/scripts/build-electron.mjs
@@ -132,6 +132,18 @@ async function buildElectron() {
     outfile: 'dist/preload.js',
   });
 
+  await build({
+    ...shared,
+    entryPoints: ['src/modules/screen-capture/screen-region-preload.ts'],
+    outfile: 'dist/modules/screen-capture/screen-region-preload.js',
+  });
+
+  await build({
+    ...shared,
+    entryPoints: ['src/modules/screen-capture/window-picker-preload.ts'],
+    outfile: 'dist/modules/screen-capture/window-picker-preload.js',
+  });
+
   console.log('Electron build complete');
   resolveStandaloneSymlinks();
 }
diff --git a/surfsense_desktop/scripts/electron-dev.mjs b/surfsense_desktop/scripts/electron-dev.mjs
new file mode 100644
index 0000000000..64be032115
--- /dev/null
+++ b/surfsense_desktop/scripts/electron-dev.mjs
@@ -0,0 +1,24 @@
+/**
+ * Linux dev: (1) ELECTRON_DISABLE_SANDBOX before start — setuid chrome-sandbox in node_modules.
+ * (2) --ozone-platform=x11 — use X11 via XWayland so global shortcuts / GPU warnings match many
+ *     Linux Electron setups better than native Wayland. Set SURFSENSE_ELECTRON_WAYLAND=1 to skip (2).
+ * Packaged apps are not launched through this script.
+ */
+import { spawnSync } from 'child_process';
+import { dirname, join } from 'path';
+import { fileURLToPath } from 'url';
+
+const root = join(dirname(fileURLToPath(import.meta.url)), '..');
+const cli = join(root, 'node_modules', 'electron', 'cli.js');
+
+const env = { ...process.env };
+const args = [cli, '.'];
+if (process.platform === 'linux') {
+  env.ELECTRON_DISABLE_SANDBOX = '1';
+  if (env.SURFSENSE_ELECTRON_WAYLAND !== '1') {
+    args.push('--ozone-platform=x11');
+  }
+}
+
+const r = spawnSync(process.execPath, args, { cwd: root, env, stdio: 'inherit' });
+process.exit(r.status === null ? 1 : r.status ?? 0);
diff --git a/surfsense_desktop/scripts/postinstall-rebuild.mjs b/surfsense_desktop/scripts/postinstall-rebuild.mjs
new file mode 100644
index 0000000000..d1cfd0732b
--- /dev/null
+++ b/surfsense_desktop/scripts/postinstall-rebuild.mjs
@@ -0,0 +1,25 @@
+/**
+ * node-mac-permissions is macOS-only; electron-rebuild would still compile it on Linux/Windows
+ * (missing `make`, wrong platform). We skip rebuild there.
+ */
+import { existsSync } from 'fs';
+import { spawnSync } from 'child_process';
+import { dirname, join } from 'path';
+import { fileURLToPath } from 'url';
+
+const root = join(dirname(fileURLToPath(import.meta.url)), '..');
+
+if (process.platform !== 'darwin') {
+  console.log('[surfsense-desktop] Skipping electron-rebuild on non-macOS (native permissions module is darwin-only).');
+  process.exit(0);
+}
+
+const bin = join(root, 'node_modules', '.bin', 'electron-rebuild');
+
+if (!existsSync(bin)) {
+  console.warn('[surfsense-desktop] electron-rebuild not found in node_modules/.bin, skipping.');
+  process.exit(0);
+}
+
+const result = spawnSync(bin, [], { cwd: root, stdio: 'inherit' });
+process.exit(result.status === null ? 1 : result.status);
diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts
index 61213eb467..8d2af5107a 100644
--- a/surfsense_desktop/src/ipc/channels.ts
+++ b/surfsense_desktop/src/ipc/channels.ts
@@ -11,12 +11,13 @@ export const IPC_CHANNELS = {
   REQUEST_ACCESSIBILITY: 'request-accessibility',
   REQUEST_SCREEN_RECORDING: 'request-screen-recording',
   RESTART_APP: 'restart-app',
-  // Autocomplete
-  AUTOCOMPLETE_CONTEXT: 'autocomplete-context',
-  ACCEPT_SUGGESTION: 'accept-suggestion',
-  DISMISS_SUGGESTION: 'dismiss-suggestion',
-  SET_AUTOCOMPLETE_ENABLED: 'set-autocomplete-enabled',
-  GET_AUTOCOMPLETE_ENABLED: 'get-autocomplete-enabled',
+  CAPTURE_FULL_SCREEN: 'capture-full-screen',
+  SCREEN_REGION_SUBMIT: 'screen-region:submit',
+  SCREEN_REGION_CANCEL: 'screen-region:cancel',
+  WINDOW_PICK_LIST: 'window-pick:list',
+  WINDOW_PICK_SUBMIT: 'window-pick:submit',
+  WINDOW_PICK_CANCEL: 'window-pick:cancel',
+  CHAT_SCREEN_CAPTURE: 'chat:screen-capture',
   // Folder sync channels
   FOLDER_SYNC_SELECT_FOLDER: 'folder-sync:select-folder',
   FOLDER_SYNC_ADD_FOLDER: 'folder-sync:add-folder',
@@ -34,6 +35,8 @@ export const IPC_CHANNELS = {
   FOLDER_SYNC_SEED_MTIMES: 'folder-sync:seed-mtimes',
   BROWSE_FILES: 'browse:files',
   READ_LOCAL_FILES: 'browse:read-local-files',
+  READ_AGENT_LOCAL_FILE_TEXT: 'agent-filesystem:read-local-file-text',
+  WRITE_AGENT_LOCAL_FILE_TEXT: 'agent-filesystem:write-local-file-text',
   // Auth token sync across windows
   GET_AUTH_TOKENS: 'auth:get-tokens',
   SET_AUTH_TOKENS: 'auth:set-tokens',
@@ -43,4 +46,21 @@ export const IPC_CHANNELS = {
   // Active search space
   GET_ACTIVE_SEARCH_SPACE: 'search-space:get-active',
   SET_ACTIVE_SEARCH_SPACE: 'search-space:set-active',
+  // Launch on system startup
+  GET_AUTO_LAUNCH: 'auto-launch:get',
+  SET_AUTO_LAUNCH: 'auto-launch:set',
+  // Analytics (PostHog) bridge: renderer <-> main
+  ANALYTICS_IDENTIFY: 'analytics:identify',
+  ANALYTICS_RESET: 'analytics:reset',
+  ANALYTICS_CAPTURE: 'analytics:capture',
+  ANALYTICS_GET_CONTEXT: 'analytics:get-context',
+  // Agent filesystem mode
+  AGENT_FILESYSTEM_GET_SETTINGS: 'agent-filesystem:get-settings',
+  AGENT_FILESYSTEM_GET_MOUNTS: 'agent-filesystem:get-mounts',
+  AGENT_FILESYSTEM_LIST_FILES: 'agent-filesystem:list-files',
+  AGENT_FILESYSTEM_TREE_WATCH_START: 'agent-filesystem:tree-watch-start',
+  AGENT_FILESYSTEM_TREE_WATCH_STOP: 'agent-filesystem:tree-watch-stop',
+  AGENT_FILESYSTEM_TREE_DIRTY: 'agent-filesystem:tree-dirty',
+  AGENT_FILESYSTEM_SET_SETTINGS: 'agent-filesystem:set-settings',
+  AGENT_FILESYSTEM_PICK_ROOT: 'agent-filesystem:pick-root',
 } as const;
diff --git a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts
index afb2ba0386..d918fd90d8 100644
--- a/surfsense_desktop/src/ipc/handlers.ts
+++ b/surfsense_desktop/src/ipc/handlers.ts
@@ -2,10 +2,12 @@ import { app, ipcMain, shell } from 'electron';
 import { IPC_CHANNELS } from './channels';
 import {
   getPermissionsStatus,
+  hasScreenRecordingPermission,
   requestAccessibility,
   requestScreenRecording,
   restartApp,
 } from '../modules/permissions';
+import { pickOpenWindowCapture } from '../modules/screen-capture';
 import {
   selectFolder,
   addWatchedFolder,
@@ -24,10 +26,31 @@ import {
   type WatchedFolderConfig,
 } from '../modules/folder-watcher';
 import { getShortcuts, setShortcuts, type ShortcutConfig } from '../modules/shortcuts';
+import { getAutoLaunchState, setAutoLaunch } from '../modules/auto-launch';
 import { getActiveSearchSpaceId, setActiveSearchSpaceId } from '../modules/active-search-space';
 import { reregisterQuickAsk } from '../modules/quick-ask';
-import { reregisterAutocomplete } from '../modules/autocomplete';
-import { reregisterGeneralAssist } from '../modules/tray';
+import { reregisterGeneralAssist, reregisterScreenshotAssist } from '../modules/tray';
+import {
+  getDistinctId,
+  getMachineId,
+  identifyUser as analyticsIdentify,
+  resetUser as analyticsReset,
+  trackEvent,
+} from '../modules/analytics';
+import {
+  listAgentFilesystemFiles,
+  readAgentLocalFileText,
+  writeAgentLocalFileText,
+  getAgentFilesystemMounts,
+  getAgentFilesystemSettings,
+  pickAgentFilesystemRoot,
+  setAgentFilesystemSettings,
+} from '../modules/agent-filesystem';
+import {
+  startAgentFilesystemTreeWatch,
+  stopAgentFilesystemTreeWatch,
+  type AgentFilesystemTreeWatchOptions,
+} from '../modules/agent-filesystem-tree-watcher';
 
 let authTokens: { bearer: string; refresh: string } | null = null;
 
@@ -63,6 +86,15 @@ export function registerIpcHandlers(): void {
     restartApp();
   });
 
+  ipcMain.handle(IPC_CHANNELS.CAPTURE_FULL_SCREEN, async () => {
+    if (!hasScreenRecordingPermission()) {
+      requestScreenRecording();
+      return null;
+    }
+    const picked = await pickOpenWindowCapture();
+    return picked?.dataUrl ?? null;
+  });
+
   // Folder sync handlers
   ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_SELECT_FOLDER, () => selectFolder());
 
@@ -110,6 +142,32 @@ export function registerIpcHandlers(): void {
     readLocalFiles(paths)
   );
 
+  ipcMain.handle(
+    IPC_CHANNELS.READ_AGENT_LOCAL_FILE_TEXT,
+    async (_event, virtualPath: string, searchSpaceId?: number | null) => {
+    try {
+      const result = await readAgentLocalFileText(virtualPath, searchSpaceId);
+      return { ok: true, path: result.path, content: result.content };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : 'Failed to read local file';
+      return { ok: false, path: virtualPath, error: message };
+    }
+    }
+  );
+
+  ipcMain.handle(
+    IPC_CHANNELS.WRITE_AGENT_LOCAL_FILE_TEXT,
+    async (_event, virtualPath: string, content: string, searchSpaceId?: number | null) => {
+      try {
+        const result = await writeAgentLocalFileText(virtualPath, content, searchSpaceId);
+        return { ok: true, path: result.path };
+      } catch (error) {
+        const message = error instanceof Error ? error.message : 'Failed to write local file';
+        return { ok: false, path: virtualPath, error: message };
+      }
+    }
+  );
+
   ipcMain.handle(IPC_CHANNELS.SET_AUTH_TOKENS, (_event, tokens: { bearer: string; refresh: string }) => {
     authTokens = tokens;
   });
@@ -120,6 +178,21 @@ export function registerIpcHandlers(): void {
 
   ipcMain.handle(IPC_CHANNELS.GET_SHORTCUTS, () => getShortcuts());
 
+  ipcMain.handle(IPC_CHANNELS.GET_AUTO_LAUNCH, () => getAutoLaunchState());
+
+  ipcMain.handle(
+    IPC_CHANNELS.SET_AUTO_LAUNCH,
+    async (_event, payload: { enabled: boolean; openAsHidden?: boolean }) => {
+      const next = await setAutoLaunch(payload.enabled, payload.openAsHidden);
+      trackEvent('desktop_auto_launch_toggled', {
+        enabled: next.enabled,
+        open_as_hidden: next.openAsHidden,
+        supported: next.supported,
+      });
+      return next;
+    },
+  );
+
   ipcMain.handle(IPC_CHANNELS.GET_ACTIVE_SEARCH_SPACE, () => getActiveSearchSpaceId());
 
   ipcMain.handle(IPC_CHANNELS.SET_ACTIVE_SEARCH_SPACE, (_event, id: string) =>
@@ -129,8 +202,92 @@ export function registerIpcHandlers(): void {
   ipcMain.handle(IPC_CHANNELS.SET_SHORTCUTS, async (_event, config: Partial<ShortcutConfig>) => {
     const updated = await setShortcuts(config);
     if (config.generalAssist) await reregisterGeneralAssist();
+    if (config.screenshotAssist) await reregisterScreenshotAssist();
     if (config.quickAsk) await reregisterQuickAsk();
-    if (config.autocomplete) await reregisterAutocomplete();
+    trackEvent('desktop_shortcut_updated', {
+      keys: Object.keys(config),
+    });
     return updated;
   });
+
+  // Analytics bridge — the renderer (web UI) hands the logged-in user down
+  // to the main process so desktop-only events are attributed to the same
+  // PostHog person, not just an anonymous machine ID.
+  ipcMain.handle(
+    IPC_CHANNELS.ANALYTICS_IDENTIFY,
+    (_event, payload: { userId: string; properties?: Record<string, unknown> }) => {
+      if (!payload?.userId) return;
+      analyticsIdentify(String(payload.userId), payload.properties);
+    }
+  );
+
+  ipcMain.handle(IPC_CHANNELS.ANALYTICS_RESET, () => {
+    analyticsReset();
+  });
+
+  ipcMain.handle(
+    IPC_CHANNELS.ANALYTICS_CAPTURE,
+    (_event, payload: { event: string; properties?: Record<string, unknown> }) => {
+      if (!payload?.event) return;
+      trackEvent(payload.event, payload.properties);
+    }
+  );
+
+  ipcMain.handle(IPC_CHANNELS.ANALYTICS_GET_CONTEXT, () => {
+    return {
+      distinctId: getDistinctId(),
+      machineId: getMachineId(),
+      appVersion: app.getVersion(),
+      platform: process.platform,
+    };
+  });
+
+  ipcMain.handle(IPC_CHANNELS.AGENT_FILESYSTEM_GET_SETTINGS, (_event, searchSpaceId?: number | null) =>
+    getAgentFilesystemSettings(searchSpaceId)
+  );
+
+  ipcMain.handle(IPC_CHANNELS.AGENT_FILESYSTEM_GET_MOUNTS, (_event, searchSpaceId?: number | null) =>
+    getAgentFilesystemMounts(searchSpaceId)
+  );
+
+  ipcMain.handle(
+    IPC_CHANNELS.AGENT_FILESYSTEM_LIST_FILES,
+    (
+      _event,
+      options: {
+        rootPath: string;
+        searchSpaceId?: number | null;
+        excludePatterns?: string[] | null;
+        fileExtensions?: string[] | null;
+      }
+    ) =>
+      listAgentFilesystemFiles(options)
+  );
+
+  ipcMain.handle(
+    IPC_CHANNELS.AGENT_FILESYSTEM_SET_SETTINGS,
+    (
+      _event,
+      payload: {
+        searchSpaceId?: number | null;
+        settings: { mode?: 'cloud' | 'desktop_local_folder'; localRootPaths?: string[] | null };
+      }
+    ) => setAgentFilesystemSettings(payload?.searchSpaceId, payload?.settings ?? {})
+  );
+
+  ipcMain.handle(IPC_CHANNELS.AGENT_FILESYSTEM_PICK_ROOT, () =>
+    pickAgentFilesystemRoot()
+  );
+
+  ipcMain.handle(
+    IPC_CHANNELS.AGENT_FILESYSTEM_TREE_WATCH_START,
+    (_event, options: AgentFilesystemTreeWatchOptions) =>
+      startAgentFilesystemTreeWatch(options)
+  );
+
+  ipcMain.handle(
+    IPC_CHANNELS.AGENT_FILESYSTEM_TREE_WATCH_STOP,
+    (_event, searchSpaceId?: number | null) =>
+      stopAgentFilesystemTreeWatch(searchSpaceId)
+  );
 }
diff --git a/surfsense_desktop/src/main.ts b/surfsense_desktop/src/main.ts
index 231553f9af..492c61f17f 100644
--- a/surfsense_desktop/src/main.ts
+++ b/surfsense_desktop/src/main.ts
@@ -1,18 +1,22 @@
-import { app, BrowserWindow } from 'electron';
+import { app } from 'electron';
 
-let isQuitting = false;
 import { registerGlobalErrorHandlers, showErrorDialog } from './modules/errors';
 import { startNextServer } from './modules/server';
-import { createMainWindow, getMainWindow } from './modules/window';
-import { setupDeepLinks, handlePendingDeepLink } from './modules/deep-links';
+import { createMainWindow, getMainWindow, markQuitting } from './modules/window';
+import { setupDeepLinks, handlePendingDeepLink, hasPendingDeepLink } from './modules/deep-links';
 import { setupAutoUpdater } from './modules/auto-updater';
 import { setupMenu } from './modules/menu';
 import { registerQuickAsk, unregisterQuickAsk } from './modules/quick-ask';
-import { registerAutocomplete, unregisterAutocomplete } from './modules/autocomplete';
 import { registerFolderWatcher, unregisterFolderWatcher } from './modules/folder-watcher';
 import { registerIpcHandlers } from './ipc/handlers';
 import { createTray, destroyTray } from './modules/tray';
 import { initAnalytics, shutdownAnalytics, trackEvent } from './modules/analytics';
+import {
+  applyAutoLaunchDefaults,
+  shouldStartHidden,
+  syncAutoLaunchOnStartup,
+  wasLaunchedAtLogin,
+} from './modules/auto-launch';
 
 registerGlobalErrorHandlers();
 
@@ -24,7 +28,12 @@ registerIpcHandlers();
 
 app.whenReady().then(async () => {
   initAnalytics();
-  trackEvent('desktop_app_launched');
+  const launchedAtLogin = wasLaunchedAtLogin();
+  const startedHidden = shouldStartHidden();
+  trackEvent('desktop_app_launched', {
+    launched_at_login: launchedAtLogin,
+    started_hidden: startedHidden,
+  });
   setupMenu();
   try {
     await startNextServer();
@@ -35,19 +44,21 @@ app.whenReady().then(async () => {
   }
 
   await createTray();
+  const defaultsApplied = await applyAutoLaunchDefaults();
+  if (defaultsApplied) {
+    trackEvent('desktop_auto_launch_defaulted_on');
+  }
+  await syncAutoLaunchOnStartup();
 
-  const win = createMainWindow('/dashboard');
-
-  // Minimize to tray instead of closing the app
-  win.on('close', (e) => {
-    if (!isQuitting) {
-      e.preventDefault();
-      win.hide();
-    }
-  });
+  // When started by the OS at login we stay quietly in the tray. The window
+  // is created lazily on first user interaction (tray click / activate).
+  // Exception: if a deep link is queued, the user explicitly asked to land
+  // in the app — don't swallow it.
+  if (!startedHidden || hasPendingDeepLink()) {
+    createMainWindow('/dashboard');
+  }
 
   await registerQuickAsk();
-  await registerAutocomplete();
   registerFolderWatcher();
   setupAutoUpdater();
 
@@ -55,6 +66,7 @@ app.whenReady().then(async () => {
 
   app.on('activate', () => {
     const mw = getMainWindow();
+    trackEvent('desktop_app_activated');
     if (!mw || mw.isDestroyed()) {
       createMainWindow('/dashboard');
     } else {
@@ -70,7 +82,8 @@ app.on('window-all-closed', () => {
 });
 
 app.on('before-quit', () => {
-  isQuitting = true;
+  markQuitting();
+  trackEvent('desktop_app_quit');
 });
 
 let didCleanup = false;
@@ -79,7 +92,6 @@ app.on('will-quit', async (e) => {
   didCleanup = true;
   e.preventDefault();
   unregisterQuickAsk();
-  unregisterAutocomplete();
   unregisterFolderWatcher();
   destroyTray();
   await shutdownAnalytics();
diff --git a/surfsense_desktop/src/modules/agent-filesystem-tree-watcher.ts b/surfsense_desktop/src/modules/agent-filesystem-tree-watcher.ts
new file mode 100644
index 0000000000..600f84fd5a
--- /dev/null
+++ b/surfsense_desktop/src/modules/agent-filesystem-tree-watcher.ts
@@ -0,0 +1,302 @@
+import { BrowserWindow } from 'electron';
+import chokidar, { type FSWatcher } from 'chokidar';
+import { resolve } from 'node:path';
+import { IPC_CHANNELS } from '../ipc/channels';
+import { listAgentFilesystemFiles } from './agent-filesystem';
+
+const SAFETY_POLL_MS = 60_000;
+const EVENT_DEBOUNCE_MS = 700;
+
+export type AgentFilesystemTreeWatchOptions = {
+  searchSpaceId?: number | null;
+  rootPaths: string[];
+  excludePatterns?: string[] | null;
+  fileExtensions?: string[] | null;
+};
+
+type TreeDirtyReason = 'watcher_event' | 'safety_poll';
+
+type TreeDirtyEvent = {
+  searchSpaceId: number | null;
+  reason: TreeDirtyReason;
+  rootPath: string;
+  changedPath: string | null;
+  timestamp: number;
+};
+
+type WatchSession = {
+  searchSpaceId: number | null;
+  optionsSignature: string;
+  rootPaths: string[];
+  excludePatterns: string[];
+  fileExtensions: string[] | null;
+  watchers: FSWatcher[];
+  pollTimer: NodeJS.Timeout | null;
+  emitTimer: NodeJS.Timeout | null;
+  rootSnapshotByPath: Map<string, string>;
+  pendingDirtyByRoot: Map<string, { reason: TreeDirtyReason; changedPath: string | null }>;
+  disposed: boolean;
+};
+
+const sessions = new Map<string, WatchSession>();
+
+function normalizeSearchSpaceId(searchSpaceId?: number | null): number | null {
+  if (typeof searchSpaceId === 'number' && Number.isFinite(searchSpaceId) && searchSpaceId > 0) {
+    return searchSpaceId;
+  }
+  return null;
+}
+
+function getSessionKey(searchSpaceId?: number | null): string {
+  const normalized = normalizeSearchSpaceId(searchSpaceId);
+  return normalized === null ? 'default' : String(normalized);
+}
+
+function normalizeRootPath(pathValue: string): string {
+  const normalized = resolve(pathValue.trim());
+  return process.platform === 'win32' ? normalized.toLowerCase() : normalized;
+}
+
+function normalizeList(value: string[] | null | undefined): string[] {
+  if (!value || value.length === 0) return [];
+  return value
+    .filter((entry): entry is string => typeof entry === 'string')
+    .map((entry) => entry.trim())
+    .filter(Boolean);
+}
+
+function normalizeExtensions(value: string[] | null | undefined): string[] | null {
+  const normalized = normalizeList(value).map((entry) => entry.toLowerCase());
+  return normalized.length > 0 ? normalized : null;
+}
+
+function buildOptionsSignature(
+  searchSpaceId: number | null,
+  rootPaths: string[],
+  excludePatterns: string[],
+  fileExtensions: string[] | null
+): string {
+  return JSON.stringify({
+    searchSpaceId,
+    rootPaths: [...rootPaths].sort(),
+    excludePatterns: [...excludePatterns].sort(),
+    fileExtensions: fileExtensions ? [...fileExtensions].sort() : null,
+  });
+}
+
+function hashText(input: string, seed: number): number {
+  let hash = seed >>> 0;
+  for (let i = 0; i < input.length; i += 1) {
+    hash ^= input.charCodeAt(i);
+    hash = Math.imul(hash, 16777619);
+    hash >>>= 0;
+  }
+  return hash;
+}
+
+async function buildRootSnapshotSignature(
+  session: WatchSession,
+  rootPath: string
+): Promise<string> {
+  let hash = 2166136261;
+  hash = hashText(`space:${session.searchSpaceId ?? 'default'}|root:${rootPath}`, hash);
+  const files = await listAgentFilesystemFiles({
+    rootPath,
+    searchSpaceId: session.searchSpaceId,
+    excludePatterns: session.excludePatterns,
+    fileExtensions: session.fileExtensions,
+  });
+  const sortedFiles = [...files].sort((a, b) => a.relativePath.localeCompare(b.relativePath));
+  hash = hashText(`count:${sortedFiles.length}`, hash);
+  for (const file of sortedFiles) {
+    hash = hashText(
+      `${file.relativePath}|${Math.round(file.mtimeMs)}|${file.size}`,
+      hash
+    );
+  }
+  return hash.toString(16);
+}
+
+function sendTreeDirtyEvent(
+  searchSpaceId: number | null,
+  reason: TreeDirtyReason,
+  rootPath: string,
+  changedPath: string | null
+): void {
+  const payload: TreeDirtyEvent = {
+    searchSpaceId,
+    reason,
+    rootPath,
+    changedPath,
+    timestamp: Date.now(),
+  };
+  for (const win of BrowserWindow.getAllWindows()) {
+    if (!win.isDestroyed()) {
+      win.webContents.send(IPC_CHANNELS.AGENT_FILESYSTEM_TREE_DIRTY, payload);
+    }
+  }
+}
+
+function scheduleDirtyEmit(
+  session: WatchSession,
+  reason: TreeDirtyReason,
+  rootPath: string,
+  changedPath: string | null = null
+): void {
+  if (session.disposed) return;
+  const existing = session.pendingDirtyByRoot.get(rootPath);
+  if (!existing || existing.reason === 'safety_poll') {
+    session.pendingDirtyByRoot.set(rootPath, { reason, changedPath });
+  }
+  if (session.emitTimer) {
+    clearTimeout(session.emitTimer);
+  }
+  session.emitTimer = setTimeout(() => {
+    session.emitTimer = null;
+    if (session.disposed) return;
+    const pending = Array.from(session.pendingDirtyByRoot.entries());
+    session.pendingDirtyByRoot.clear();
+    for (const [pendingRootPath, payload] of pending) {
+      sendTreeDirtyEvent(
+        session.searchSpaceId,
+        payload.reason,
+        pendingRootPath,
+        payload.changedPath
+      );
+    }
+  }, EVENT_DEBOUNCE_MS);
+}
+
+async function closeSession(session: WatchSession): Promise<void> {
+  session.disposed = true;
+  if (session.emitTimer) {
+    clearTimeout(session.emitTimer);
+    session.emitTimer = null;
+  }
+  if (session.pollTimer) {
+    clearInterval(session.pollTimer);
+    session.pollTimer = null;
+  }
+  await Promise.allSettled(session.watchers.map((watcher) => watcher.close()));
+}
+
+export async function startAgentFilesystemTreeWatch(
+  options: AgentFilesystemTreeWatchOptions
+): Promise<{ ok: true }> {
+  const searchSpaceId = normalizeSearchSpaceId(options.searchSpaceId);
+  const rootPaths = Array.from(
+    new Set(normalizeList(options.rootPaths).map((rootPath) => normalizeRootPath(rootPath)))
+  );
+  const excludePatterns = Array.from(new Set(normalizeList(options.excludePatterns)));
+  const fileExtensions = normalizeExtensions(options.fileExtensions);
+  const sessionKey = getSessionKey(searchSpaceId);
+
+  if (rootPaths.length === 0) {
+    await stopAgentFilesystemTreeWatch(searchSpaceId);
+    return { ok: true };
+  }
+
+  const optionsSignature = buildOptionsSignature(
+    searchSpaceId,
+    rootPaths,
+    excludePatterns,
+    fileExtensions
+  );
+  const existing = sessions.get(sessionKey);
+  if (existing && existing.optionsSignature === optionsSignature) {
+    return { ok: true };
+  }
+  if (existing) {
+    await closeSession(existing);
+    sessions.delete(sessionKey);
+  }
+
+  const ignored = [
+    /(^|[/\\])\../,
+    ...excludePatterns.map((pattern) => `**/${pattern}/**`),
+  ];
+  const watchers = rootPaths.map((rootPath) =>
+    chokidar.watch(rootPath, {
+      persistent: true,
+      ignoreInitial: true,
+      awaitWriteFinish: {
+        stabilityThreshold: 500,
+        pollInterval: 100,
+      },
+      ignored,
+    })
+  );
+
+  const session: WatchSession = {
+    searchSpaceId,
+    optionsSignature,
+    rootPaths,
+    excludePatterns,
+    fileExtensions,
+    watchers,
+    pollTimer: null,
+    emitTimer: null,
+    rootSnapshotByPath: new Map(),
+    pendingDirtyByRoot: new Map(),
+    disposed: false,
+  };
+
+  for (let index = 0; index < watchers.length; index += 1) {
+    const watcher = watchers[index];
+    const rootPath = rootPaths[index];
+    watcher.on('add', (filePath) => scheduleDirtyEmit(session, 'watcher_event', rootPath, filePath));
+    watcher.on('change', (filePath) =>
+      scheduleDirtyEmit(session, 'watcher_event', rootPath, filePath)
+    );
+    watcher.on('unlink', (filePath) =>
+      scheduleDirtyEmit(session, 'watcher_event', rootPath, filePath)
+    );
+    watcher.on('addDir', (filePath) =>
+      scheduleDirtyEmit(session, 'watcher_event', rootPath, filePath)
+    );
+    watcher.on('unlinkDir', (filePath) =>
+      scheduleDirtyEmit(session, 'watcher_event', rootPath, filePath)
+    );
+  }
+
+  for (const rootPath of rootPaths) {
+    try {
+      const signature = await buildRootSnapshotSignature(session, rootPath);
+      session.rootSnapshotByPath.set(rootPath, signature);
+    } catch {
+      session.rootSnapshotByPath.set(rootPath, '');
+    }
+  }
+
+  session.pollTimer = setInterval(() => {
+    void (async () => {
+      if (session.disposed) return;
+      for (const rootPath of session.rootPaths) {
+        try {
+          const nextSignature = await buildRootSnapshotSignature(session, rootPath);
+          const previousSignature = session.rootSnapshotByPath.get(rootPath) ?? '';
+          if (nextSignature !== previousSignature) {
+            session.rootSnapshotByPath.set(rootPath, nextSignature);
+            scheduleDirtyEmit(session, 'safety_poll', rootPath, null);
+          }
+        } catch {
+          // Keep watcher resilient on transient IO errors.
+        }
+      }
+    })();
+  }, SAFETY_POLL_MS);
+
+  sessions.set(sessionKey, session);
+  return { ok: true };
+}
+
+export async function stopAgentFilesystemTreeWatch(
+  searchSpaceId?: number | null
+): Promise<{ ok: true }> {
+  const sessionKey = getSessionKey(searchSpaceId);
+  const session = sessions.get(sessionKey);
+  if (!session) return { ok: true };
+  sessions.delete(sessionKey);
+  await closeSession(session);
+  return { ok: true };
+}
diff --git a/surfsense_desktop/src/modules/agent-filesystem.ts b/surfsense_desktop/src/modules/agent-filesystem.ts
new file mode 100644
index 0000000000..608f8c4a4d
--- /dev/null
+++ b/surfsense_desktop/src/modules/agent-filesystem.ts
@@ -0,0 +1,533 @@
+import { app, dialog } from "electron";
+import type { Dirent } from "node:fs";
+import { access, mkdir, readdir, readFile, realpath, stat, writeFile } from "node:fs/promises";
+import { dirname, extname, isAbsolute, join, relative, resolve } from "node:path";
+
+export type AgentFilesystemMode = "cloud" | "desktop_local_folder";
+
+export interface AgentFilesystemSettings {
+	mode: AgentFilesystemMode;
+	localRootPaths: string[];
+	updatedAt: string;
+}
+
+type AgentFilesystemSettingsStore = {
+	version: 2;
+	spaces: Record<string, AgentFilesystemSettings>;
+};
+
+const SETTINGS_FILENAME = "agent-filesystem-settings.json";
+const MAX_LOCAL_ROOTS = 10;
+const DEFAULT_SPACE_KEY = "default";
+let cachedSettingsStore: AgentFilesystemSettingsStore | null = null;
+
+const LOCAL_OPENABLE_TEXT_EXTENSIONS = new Set<string>([
+	".md",
+	".markdown",
+	".txt",
+	".json",
+	".yaml",
+	".yml",
+	".csv",
+	".tsv",
+	".xml",
+	".html",
+	".htm",
+	".css",
+	".scss",
+	".sass",
+	".sql",
+	".toml",
+	".ini",
+	".conf",
+	".log",
+	".py",
+	".js",
+	".jsx",
+	".mjs",
+	".cjs",
+	".ts",
+	".tsx",
+	".java",
+	".kt",
+	".kts",
+	".go",
+	".rs",
+	".rb",
+	".php",
+	".swift",
+	".r",
+	".lua",
+	".sh",
+	".bash",
+	".zsh",
+	".fish",
+	".env",
+	".mk",
+]);
+
+function getSettingsPath(): string {
+	return join(app.getPath("userData"), SETTINGS_FILENAME);
+}
+
+function getDefaultSettings(): AgentFilesystemSettings {
+	return {
+		mode: "cloud",
+		localRootPaths: [],
+		updatedAt: new Date().toISOString(),
+	};
+}
+
+async function canonicalizeRootPath(pathValue: string): Promise<string> {
+	const resolvedPath = resolve(pathValue);
+	try {
+		return await realpath(resolvedPath);
+	} catch {
+		return resolvedPath;
+	}
+}
+
+function normalizeLocalRootPaths(paths: unknown): string[] {
+	if (!Array.isArray(paths)) {
+		return [];
+	}
+	const uniquePaths = new Set<string>();
+	for (const rawPath of paths) {
+		if (typeof rawPath !== "string") continue;
+		const trimmed = rawPath.trim();
+		if (!trimmed) continue;
+		uniquePaths.add(trimmed);
+		if (uniquePaths.size >= MAX_LOCAL_ROOTS) {
+			break;
+		}
+	}
+	return [...uniquePaths];
+}
+
+async function normalizeLocalRootPathsCanonical(paths: unknown): Promise<string[]> {
+	const normalizedPaths = normalizeLocalRootPaths(paths);
+	const canonicalizedPaths = await Promise.all(
+		normalizedPaths.map((pathValue) => canonicalizeRootPath(pathValue))
+	);
+	const uniquePaths = new Set<string>();
+	for (const canonicalPath of canonicalizedPaths) {
+		uniquePaths.add(canonicalPath);
+		if (uniquePaths.size >= MAX_LOCAL_ROOTS) {
+			break;
+		}
+	}
+	return [...uniquePaths];
+}
+
+function normalizeSearchSpaceKey(searchSpaceId?: number | null): string {
+	if (typeof searchSpaceId === "number" && Number.isFinite(searchSpaceId) && searchSpaceId > 0) {
+		return String(searchSpaceId);
+	}
+	return DEFAULT_SPACE_KEY;
+}
+
+function toSettingsFromUnknown(value: unknown): AgentFilesystemSettings | null {
+	if (!value || typeof value !== "object") {
+		return null;
+	}
+	const parsed = value as Partial<AgentFilesystemSettings>;
+	if (parsed.mode !== "cloud" && parsed.mode !== "desktop_local_folder") {
+		return null;
+	}
+	return {
+		mode: parsed.mode,
+		localRootPaths: normalizeLocalRootPaths(parsed.localRootPaths),
+		updatedAt: parsed.updatedAt ?? new Date().toISOString(),
+	};
+}
+
+function getDefaultStore(): AgentFilesystemSettingsStore {
+	return { version: 2, spaces: {} };
+}
+
+function getSettingsFromStore(
+	store: AgentFilesystemSettingsStore,
+	searchSpaceId?: number | null
+): AgentFilesystemSettings {
+	const key = normalizeSearchSpaceKey(searchSpaceId);
+	return store.spaces[key] ?? getDefaultSettings();
+}
+
+async function loadAgentFilesystemSettingsStore(): Promise<AgentFilesystemSettingsStore> {
+	if (cachedSettingsStore) {
+		return cachedSettingsStore;
+	}
+	const settingsPath = getSettingsPath();
+	try {
+		const raw = await readFile(settingsPath, "utf8");
+		const parsed = JSON.parse(raw) as unknown;
+		const nextStore = getDefaultStore();
+		if (
+			parsed &&
+			typeof parsed === "object" &&
+			"version" in parsed &&
+			"spaces" in parsed &&
+			(parsed as { version?: unknown }).version === 2
+		) {
+			const parsedStore = parsed as { spaces?: Record<string, unknown>; version: 2 };
+			if (parsedStore.spaces && typeof parsedStore.spaces === "object") {
+				for (const [spaceKey, rawSettings] of Object.entries(parsedStore.spaces)) {
+					const normalizedSettings = toSettingsFromUnknown(rawSettings);
+					if (normalizedSettings) {
+						nextStore.spaces[String(spaceKey)] = normalizedSettings;
+					}
+				}
+			}
+		} else {
+			// Strict migration: reject legacy/non-scoped settings and reset.
+			await mkdir(dirname(settingsPath), { recursive: true });
+			await writeFile(settingsPath, JSON.stringify(nextStore, null, 2), "utf8");
+		}
+		cachedSettingsStore = nextStore;
+		return nextStore;
+	} catch {
+		cachedSettingsStore = getDefaultStore();
+		await mkdir(dirname(settingsPath), { recursive: true });
+		await writeFile(settingsPath, JSON.stringify(cachedSettingsStore, null, 2), "utf8");
+		return cachedSettingsStore;
+	}
+}
+
+export async function getAgentFilesystemSettings(
+	searchSpaceId?: number | null
+): Promise<AgentFilesystemSettings> {
+	const store = await loadAgentFilesystemSettingsStore();
+	return getSettingsFromStore(store, searchSpaceId);
+}
+
+export async function setAgentFilesystemSettings(
+	searchSpaceId: number | null | undefined,
+	settings: {
+		mode?: AgentFilesystemMode;
+		localRootPaths?: string[] | null;
+	}
+): Promise<AgentFilesystemSettings> {
+	const store = await loadAgentFilesystemSettingsStore();
+	const key = normalizeSearchSpaceKey(searchSpaceId);
+	const current = getSettingsFromStore(store, searchSpaceId);
+	const nextMode =
+		settings.mode === "cloud" || settings.mode === "desktop_local_folder"
+			? settings.mode
+			: current.mode;
+	const next: AgentFilesystemSettings = {
+		mode: nextMode,
+		localRootPaths:
+			settings.localRootPaths === undefined
+				? current.localRootPaths
+				: await normalizeLocalRootPathsCanonical(settings.localRootPaths ?? []),
+		updatedAt: new Date().toISOString(),
+	};
+
+	const settingsPath = getSettingsPath();
+	await mkdir(dirname(settingsPath), { recursive: true });
+	const nextStore: AgentFilesystemSettingsStore = {
+		version: 2,
+		spaces: {
+			...store.spaces,
+			[key]: next,
+		},
+	};
+	await writeFile(settingsPath, JSON.stringify(nextStore, null, 2), "utf8");
+	cachedSettingsStore = nextStore;
+	return next;
+}
+
+export async function pickAgentFilesystemRoot(): Promise<string | null> {
+	const result = await dialog.showOpenDialog({
+		title: "Select local folder for Agent Filesystem",
+		properties: ["openDirectory"],
+	});
+	if (result.canceled || result.filePaths.length === 0) {
+		return null;
+	}
+	return result.filePaths[0] ?? null;
+}
+
+function resolveVirtualPath(rootPath: string, virtualPath: string): string {
+	if (!virtualPath.startsWith("/")) {
+		throw new Error("Path must start with '/'");
+	}
+	const normalizedRoot = resolve(rootPath);
+	const relativePath = virtualPath.replace(/^\/+/, "");
+	if (!relativePath) {
+		throw new Error("Path must refer to a file under the selected root");
+	}
+	const absolutePath = resolve(normalizedRoot, relativePath);
+	const rel = relative(normalizedRoot, absolutePath);
+	if (!rel || rel.startsWith("..") || isAbsolute(rel)) {
+		throw new Error("Path escapes selected local root");
+	}
+	return absolutePath;
+}
+
+function toVirtualPath(rootPath: string, absolutePath: string): string {
+	const normalizedRoot = resolve(rootPath);
+	const rel = relative(normalizedRoot, absolutePath);
+	if (!rel || rel.startsWith("..") || isAbsolute(rel)) {
+		return "/";
+	}
+	return `/${rel.replace(/\\/g, "/")}`;
+}
+
+function assertLocalOpenableTextFile(absolutePath: string): void {
+	const extension = extname(absolutePath).toLowerCase();
+	if (!LOCAL_OPENABLE_TEXT_EXTENSIONS.has(extension)) {
+		throw new Error(
+			`Unsupported local file type '${extension || "(no extension)"}'. ` +
+				"Only text/code files can be opened in local mode."
+		);
+	}
+}
+
+export type LocalRootMount = {
+	mount: string;
+	rootPath: string;
+};
+
+export type AgentFilesystemListOptions = {
+	rootPath: string;
+	searchSpaceId?: number | null;
+	excludePatterns?: string[] | null;
+	fileExtensions?: string[] | null;
+};
+
+export type AgentFilesystemFileEntry = {
+	relativePath: string;
+	fullPath: string;
+	size: number;
+	mtimeMs: number;
+};
+
+function sanitizeMountName(rawMount: string): string {
+	const normalized = rawMount
+		.trim()
+		.toLowerCase()
+		.replace(/[^a-z0-9_-]+/g, "_")
+		.replace(/_+/g, "_")
+		.replace(/^[_-]+|[_-]+$/g, "");
+	return normalized || "root";
+}
+
+function buildRootMounts(rootPaths: string[]): LocalRootMount[] {
+	const mounts: LocalRootMount[] = [];
+	const usedMounts = new Set<string>();
+	for (const rawRootPath of rootPaths) {
+		const normalizedRoot = resolve(rawRootPath);
+		const baseMount = sanitizeMountName(normalizedRoot.split(/[\\/]/).at(-1) || "root");
+		let mount = baseMount;
+		let suffix = 2;
+		while (usedMounts.has(mount)) {
+			mount = `${baseMount}-${suffix}`;
+			suffix += 1;
+		}
+		usedMounts.add(mount);
+		mounts.push({ mount, rootPath: normalizedRoot });
+	}
+	return mounts;
+}
+
+export async function getAgentFilesystemMounts(
+	searchSpaceId?: number | null
+): Promise<LocalRootMount[]> {
+	const rootPaths = await resolveCurrentRootPaths(searchSpaceId);
+	return buildRootMounts(rootPaths);
+}
+
+function normalizeComparablePath(pathValue: string): string {
+	const normalized = resolve(pathValue);
+	return process.platform === "win32" ? normalized.toLowerCase() : normalized;
+}
+
+function normalizeExtensionSet(fileExtensions: string[] | null | undefined): Set<string> | null {
+	if (!fileExtensions || fileExtensions.length === 0) {
+		return null;
+	}
+	const set = new Set<string>();
+	for (const extension of fileExtensions) {
+		if (typeof extension !== "string") continue;
+		const trimmed = extension.trim().toLowerCase();
+		if (!trimmed) continue;
+		set.add(trimmed.startsWith(".") ? trimmed : `.${trimmed}`);
+	}
+	return set.size > 0 ? set : null;
+}
+
+function normalizeExcludeSet(excludePatterns: string[] | null | undefined): Set<string> {
+	const set = new Set<string>();
+	for (const pattern of excludePatterns ?? []) {
+		if (typeof pattern !== "string") continue;
+		const trimmed = pattern.trim();
+		if (!trimmed) continue;
+		set.add(trimmed);
+	}
+	return set;
+}
+
+export async function listAgentFilesystemFiles(
+	options: AgentFilesystemListOptions
+): Promise<AgentFilesystemFileEntry[]> {
+	const allowedRootPaths = await resolveCurrentRootPaths(options.searchSpaceId);
+	const requestedRootPath = await canonicalizeRootPath(options.rootPath);
+	const normalizedRequestedRoot = normalizeComparablePath(requestedRootPath);
+	const allowedRoots = new Set(
+		(
+			await Promise.all(allowedRootPaths.map((rootPath) => canonicalizeRootPath(rootPath)))
+		).map((rootPath) => normalizeComparablePath(rootPath))
+	);
+	if (!allowedRoots.has(normalizedRequestedRoot)) {
+		throw new Error("Selected path is not an allowed local root");
+	}
+
+	const excludePatterns = normalizeExcludeSet(options.excludePatterns);
+	const extensionSet = normalizeExtensionSet(options.fileExtensions);
+	const files: AgentFilesystemFileEntry[] = [];
+	const stack: string[] = [requestedRootPath];
+
+	while (stack.length > 0) {
+		const currentDir = stack.pop();
+		if (!currentDir) continue;
+		let entries: Dirent[];
+		try {
+			entries = await readdir(currentDir, { withFileTypes: true });
+		} catch {
+			continue;
+		}
+
+		for (const entry of entries) {
+			if (entry.name.startsWith(".") || excludePatterns.has(entry.name)) {
+				continue;
+			}
+			const absolutePath = join(currentDir, entry.name);
+			if (entry.isDirectory()) {
+				stack.push(absolutePath);
+				continue;
+			}
+			if (!entry.isFile()) {
+				continue;
+			}
+			if (extensionSet) {
+				const extension = extname(entry.name).toLowerCase();
+				if (!extensionSet.has(extension)) {
+					continue;
+				}
+			}
+			try {
+				const fileStat = await stat(absolutePath);
+				if (!fileStat.isFile()) {
+					continue;
+				}
+				files.push({
+					relativePath: relative(requestedRootPath, absolutePath).replace(/\\/g, "/"),
+					fullPath: absolutePath,
+					size: fileStat.size,
+					mtimeMs: fileStat.mtimeMs,
+				});
+			} catch {
+				// Files can disappear while scanning.
+			}
+		}
+	}
+
+	return files;
+}
+
+function parseMountedVirtualPath(
+	virtualPath: string,
+	mounts: LocalRootMount[]
+): {
+	mount: string;
+	subPath: string;
+} {
+	if (!virtualPath.startsWith("/")) {
+		throw new Error("Path must start with '/'");
+	}
+	const trimmed = virtualPath.replace(/^\/+/, "");
+	if (!trimmed) {
+		throw new Error("Path must include a mounted root segment");
+	}
+
+	const [mount, ...rest] = trimmed.split("/");
+	const remainder = rest.join("/");
+	const directMount = mounts.find((entry) => entry.mount === mount);
+	if (!directMount) {
+		throw new Error(
+			`Unknown mounted root '${mount}'. Available roots: ${mounts.map((entry) => `/${entry.mount}`).join(", ")}`
+		);
+	}
+	if (!remainder) {
+		throw new Error("Path must include a file path under the mounted root");
+	}
+	return { mount, subPath: `/${remainder}` };
+}
+
+function findMountByName(mounts: LocalRootMount[], mountName: string): LocalRootMount | undefined {
+	return mounts.find((entry) => entry.mount === mountName);
+}
+
+function toMountedVirtualPath(mount: string, rootPath: string, absolutePath: string): string {
+	const relativePath = toVirtualPath(rootPath, absolutePath);
+	return `/${mount}${relativePath}`;
+}
+
+async function resolveCurrentRootPaths(searchSpaceId?: number | null): Promise<string[]> {
+	const settings = await getAgentFilesystemSettings(searchSpaceId);
+	if (settings.localRootPaths.length === 0) {
+		throw new Error("No local filesystem roots selected");
+	}
+	return settings.localRootPaths;
+}
+
+export async function readAgentLocalFileText(
+	virtualPath: string,
+	searchSpaceId?: number | null
+): Promise<{ path: string; content: string }> {
+	const rootPaths = await resolveCurrentRootPaths(searchSpaceId);
+	const mounts = buildRootMounts(rootPaths);
+	const { mount, subPath } = parseMountedVirtualPath(virtualPath, mounts);
+	const rootMount = findMountByName(mounts, mount);
+	if (!rootMount) {
+		throw new Error(
+			`Unknown mounted root '${mount}'. Available roots: ${mounts.map((entry) => `/${entry.mount}`).join(", ")}`
+		);
+	}
+	const absolutePath = resolveVirtualPath(rootMount.rootPath, subPath);
+	assertLocalOpenableTextFile(absolutePath);
+	const content = await readFile(absolutePath, "utf8");
+	return {
+		path: toMountedVirtualPath(rootMount.mount, rootMount.rootPath, absolutePath),
+		content,
+	};
+}
+
+export async function writeAgentLocalFileText(
+	virtualPath: string,
+	content: string,
+	searchSpaceId?: number | null
+): Promise<{ path: string }> {
+	const rootPaths = await resolveCurrentRootPaths(searchSpaceId);
+	const mounts = buildRootMounts(rootPaths);
+	const { mount, subPath } = parseMountedVirtualPath(virtualPath, mounts);
+	const rootMount = findMountByName(mounts, mount);
+	if (!rootMount) {
+		throw new Error(
+			`Unknown mounted root '${mount}'. Available roots: ${mounts.map((entry) => `/${entry.mount}`).join(", ")}`
+		);
+	}
+	let selectedAbsolutePath = resolveVirtualPath(rootMount.rootPath, subPath);
+
+	try {
+		await access(selectedAbsolutePath);
+	} catch {
+		// New files are created under the selected mounted root.
+	}
+	await mkdir(dirname(selectedAbsolutePath), { recursive: true });
+	await writeFile(selectedAbsolutePath, content, "utf8");
+	return {
+		path: toMountedVirtualPath(rootMount.mount, rootMount.rootPath, selectedAbsolutePath),
+	};
+}
diff --git a/surfsense_desktop/src/modules/analytics.ts b/surfsense_desktop/src/modules/analytics.ts
index 0bbcb3026b..01dba60f05 100644
--- a/surfsense_desktop/src/modules/analytics.ts
+++ b/surfsense_desktop/src/modules/analytics.ts
@@ -3,14 +3,27 @@ import { machineIdSync } from 'node-machine-id';
 import { app } from 'electron';
 
 let client: PostHog | null = null;
-let distinctId = '';
+let machineId = '';
+let currentDistinctId = '';
+let identifiedUserId: string | null = null;
+
+function baseProperties(): Record<string, unknown> {
+  return {
+    platform: 'desktop',
+    app_version: app.getVersion(),
+    os: process.platform,
+    arch: process.arch,
+    machine_id: machineId,
+  };
+}
 
 export function initAnalytics(): void {
   const key = process.env.POSTHOG_KEY;
   if (!key) return;
 
   try {
-    distinctId = machineIdSync(true);
+    machineId = machineIdSync(true);
+    currentDistinctId = machineId;
   } catch {
     return;
   }
@@ -22,17 +35,92 @@ export function initAnalytics(): void {
   });
 }
 
-export function trackEvent(event: string, properties?: Record<string, unknown>): void {
+export function getMachineId(): string {
+  return machineId;
+}
+
+export function getDistinctId(): string {
+  return currentDistinctId;
+}
+
+/**
+ * Identify the current logged-in user in PostHog so main-process desktop
+ * events (and linked anonymous machine events) are attributed to that person.
+ *
+ * Idempotent: calling identify repeatedly with the same userId is a no-op.
+ */
+export function identifyUser(
+  userId: string,
+  properties?: Record<string, unknown>
+): void {
+  if (!client || !userId) return;
+  if (identifiedUserId === userId) {
+    // Already identified — only refresh person properties
+    try {
+      client.identify({
+        distinctId: userId,
+        properties: {
+          ...baseProperties(),
+          $set: {
+            ...(properties || {}),
+            platform: 'desktop',
+            last_seen_at: new Date().toISOString(),
+          },
+        },
+      });
+    } catch {
+      // ignore
+    }
+    return;
+  }
+
+  try {
+    // Link the anonymous machine distinct ID to the authenticated user
+    client.identify({
+      distinctId: userId,
+      properties: {
+        ...baseProperties(),
+        $anon_distinct_id: machineId,
+        $set: {
+          ...(properties || {}),
+          platform: 'desktop',
+          last_seen_at: new Date().toISOString(),
+        },
+        $set_once: {
+          first_seen_platform: 'desktop',
+        },
+      },
+    });
+
+    identifiedUserId = userId;
+    currentDistinctId = userId;
+  } catch {
+    // Analytics must never break the app
+  }
+}
+
+/**
+ * Reset user identity on logout. Subsequent events are captured anonymously
+ * against the machine ID until the user logs in again.
+ */
+export function resetUser(): void {
+  if (!client) return;
+  identifiedUserId = null;
+  currentDistinctId = machineId;
+}
+
+export function trackEvent(
+  event: string,
+  properties?: Record<string, unknown>
+): void {
   if (!client) return;
 
   try {
     client.capture({
-      distinctId,
+      distinctId: currentDistinctId || machineId,
       event,
       properties: {
-        platform: 'desktop',
-        app_version: app.getVersion(),
-        os: process.platform,
+        ...baseProperties(),
         ...properties,
       },
     });
diff --git a/surfsense_desktop/src/modules/auto-launch.ts b/surfsense_desktop/src/modules/auto-launch.ts
new file mode 100644
index 0000000000..9759c4ef9d
--- /dev/null
+++ b/surfsense_desktop/src/modules/auto-launch.ts
@@ -0,0 +1,304 @@
+import { app } from 'electron';
+import fs from 'fs';
+import os from 'os';
+import path from 'path';
+
+// ---------------------------------------------------------------------------
+// Launch on system startup ("auto-launch" / "open at login").
+//
+// macOS + Windows : uses Electron's built-in `app.setLoginItemSettings()`.
+// Linux           : writes a freedesktop autostart `.desktop` file into
+//                   `~/.config/autostart/`. Electron's API is a no-op there.
+//
+// The OS is the source of truth for whether we're enabled (so a user who
+// disables us via System Settings / GNOME Tweaks isn't silently overridden).
+// We persist a small companion record in electron-store for things the OS
+// can't tell us — currently just `openAsHidden`, since on Windows we encode
+// it as a CLI arg and on Linux as part of the Exec line, but on a fresh
+// startup we still want the renderer toggle to reflect the user's intent.
+// ---------------------------------------------------------------------------
+
+const STORE_KEY = 'launchAtLogin';
+const HIDDEN_FLAG = '--hidden';
+const LINUX_DESKTOP_FILENAME = 'surfsense.desktop';
+
+export interface AutoLaunchState {
+  enabled: boolean;
+  openAsHidden: boolean;
+  supported: boolean;
+}
+
+interface PersistedState {
+  enabled: boolean;
+  openAsHidden: boolean;
+  // True once we've run the first-launch defaults (opt-in to auto-launch).
+  // We never re-apply defaults if this is set, so a user who has explicitly
+  // turned auto-launch off stays off forever.
+  defaultsApplied: boolean;
+}
+
+const DEFAULTS: PersistedState = {
+  enabled: false,
+  openAsHidden: true,
+  defaultsApplied: false,
+};
+
+// eslint-disable-next-line @typescript-eslint/no-explicit-any -- lazily imported ESM module; matches shortcuts.ts pattern
+let store: any = null;
+
+async function getStore() {
+  if (!store) {
+    const { default: Store } = await import('electron-store');
+    store = new Store({
+      name: 'auto-launch',
+      defaults: { [STORE_KEY]: DEFAULTS },
+    });
+  }
+  return store;
+}
+
+async function readPersisted(): Promise<PersistedState> {
+  const s = await getStore();
+  const stored = s.get(STORE_KEY) as Partial<PersistedState> | undefined;
+  return { ...DEFAULTS, ...(stored ?? {}) };
+}
+
+async function writePersisted(next: PersistedState): Promise<void> {
+  const s = await getStore();
+  s.set(STORE_KEY, next);
+}
+
+// ---------------------------------------------------------------------------
+// Platform support
+// ---------------------------------------------------------------------------
+
+// Auto-launch only makes sense for the packaged app — in dev `process.execPath`
+// is the local Electron binary, so registering it would point the OS at a
+// throwaway path the next time the dev server isn't running.
+function isSupported(): boolean {
+  if (!app.isPackaged) return false;
+  return ['darwin', 'win32', 'linux'].includes(process.platform);
+}
+
+// ---------------------------------------------------------------------------
+// Linux: ~/.config/autostart/surfsense.desktop
+// ---------------------------------------------------------------------------
+
+function linuxAutostartDir(): string {
+  const xdg = process.env.XDG_CONFIG_HOME;
+  const base = xdg && xdg.length > 0 ? xdg : path.join(os.homedir(), '.config');
+  return path.join(base, 'autostart');
+}
+
+function linuxAutostartFile(): string {
+  return path.join(linuxAutostartDir(), LINUX_DESKTOP_FILENAME);
+}
+
+// AppImages move around with the user — `process.execPath` points at a temp
+// mount, so we have to use the original AppImage path exposed via env.
+function linuxExecPath(): string {
+  return process.env.APPIMAGE && process.env.APPIMAGE.length > 0
+    ? process.env.APPIMAGE
+    : process.execPath;
+}
+
+function escapeDesktopExecArg(value: string): string {
+  // Freedesktop `.desktop` Exec values require quoted args when spaces are
+  // present. We keep this intentionally minimal and escape only characters
+  // that can break quoted parsing.
+  return `"${value.replace(/(["\\`$])/g, '\\$1')}"`;
+}
+
+function writeLinuxDesktopFile(openAsHidden: boolean): void {
+  const exec = escapeDesktopExecArg(linuxExecPath());
+  const args = openAsHidden ? ` ${HIDDEN_FLAG}` : '';
+  const contents = [
+    '[Desktop Entry]',
+    'Type=Application',
+    'Version=1.0',
+    'Name=SurfSense',
+    'Comment=AI-powered research assistant',
+    `Exec=${exec}${args}`,
+    'Terminal=false',
+    'Categories=Utility;Office;',
+    'X-GNOME-Autostart-enabled=true',
+    `X-GNOME-Autostart-Delay=${openAsHidden ? '5' : '0'}`,
+    '',
+  ].join('\n');
+
+  fs.mkdirSync(linuxAutostartDir(), { recursive: true });
+  fs.writeFileSync(linuxAutostartFile(), contents, { mode: 0o644 });
+}
+
+function removeLinuxDesktopFile(): void {
+  try {
+    fs.unlinkSync(linuxAutostartFile());
+  } catch (err: unknown) {
+    if ((err as NodeJS.ErrnoException)?.code !== 'ENOENT') throw err;
+  }
+}
+
+function readLinuxDesktopFile(): boolean {
+  return fs.existsSync(linuxAutostartFile());
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+export async function getAutoLaunchState(): Promise<AutoLaunchState> {
+  const supported = isSupported();
+  const persisted = await readPersisted();
+
+  if (!supported) {
+    return { enabled: false, openAsHidden: persisted.openAsHidden, supported: false };
+  }
+
+  // Trust the OS state — the user may have disabled it from system settings.
+  return { enabled: readOsEnabled(), openAsHidden: persisted.openAsHidden, supported: true };
+}
+
+export async function setAutoLaunch(
+  enabled: boolean,
+  openAsHidden: boolean = DEFAULTS.openAsHidden,
+): Promise<AutoLaunchState> {
+  const supported = isSupported();
+
+  if (!supported) {
+    return { enabled: false, openAsHidden, supported: false };
+  }
+
+  applySystemRegistration(enabled, openAsHidden);
+  // Preserve `defaultsApplied` (and any future fields) — and explicitly
+  // mark them as applied, since the user has now made an intentional choice.
+  await writePersisted({ enabled, openAsHidden, defaultsApplied: true });
+  return { enabled, openAsHidden, supported: true };
+}
+
+function applySystemRegistration(enabled: boolean, openAsHidden: boolean): void {
+  if (process.platform === 'linux') {
+    if (enabled) writeLinuxDesktopFile(openAsHidden);
+    else removeLinuxDesktopFile();
+    return;
+  }
+
+  if (!enabled) {
+    app.setLoginItemSettings({ openAtLogin: false });
+    return;
+  }
+
+  if (process.platform === 'win32') {
+    // On Windows we can't tell the OS to "launch hidden" — instead we pass an
+    // arg the app introspects on boot to skip showing the main window.
+    app.setLoginItemSettings({
+      openAtLogin: true,
+      args: openAsHidden ? [HIDDEN_FLAG] : [],
+    });
+    return;
+  }
+
+  // darwin
+  app.setLoginItemSettings({
+    openAtLogin: true,
+    openAsHidden,
+  });
+}
+
+// First-launch opt-in: register SurfSense as a hidden login item so the tray,
+// global shortcuts, and folder watchers are ready right after the user signs
+// in. Runs at most once per installation — the `defaultsApplied` flag is
+// flipped before we ever touch the OS so a failure to register doesn't cause
+// us to retry on every boot, and a user who turns the toggle off afterwards
+// is never silently re-enabled.
+//
+// Returns whether the defaults were actually applied this boot, so callers
+// can fire an analytics event without coupling this module to PostHog.
+export async function applyAutoLaunchDefaults(): Promise<boolean> {
+  if (!isSupported()) return false;
+  const persisted = await readPersisted();
+  if (persisted.defaultsApplied) return false;
+
+  // Mark the defaults as applied *first*. If `applySystemRegistration`
+  // throws (e.g. read-only home dir on Linux), we'd rather silently leave
+  // the user un-registered than spam them with a failed registration on
+  // every single boot.
+  const next: PersistedState = {
+    enabled: true,
+    openAsHidden: true,
+    defaultsApplied: true,
+  };
+
+  try {
+    applySystemRegistration(true, true);
+  } catch (err) {
+    console.error('[auto-launch] First-run registration failed:', err);
+    next.enabled = false;
+  }
+
+  await writePersisted(next);
+  return next.enabled;
+}
+
+// Called once at startup. Goal:
+//   * If the OS-level entry is already enabled, re-assert it so a moved
+//     binary (Windows reinstall to a new dir, Linux AppImage moved by user)
+//     gets its registered path refreshed.
+//   * If the OS-level entry has been disabled — typically because the user
+//     turned it off in System Settings / GNOME Tweaks — *respect that* and
+//     reconcile our persisted state to match. We never silently re-enable
+//     a login item the user explicitly turned off.
+export async function syncAutoLaunchOnStartup(): Promise<void> {
+  if (!isSupported()) return;
+
+  const persisted = await readPersisted();
+  const osEnabled = readOsEnabled();
+
+  if (!osEnabled) {
+    // User (or some other tool) turned us off out-of-band. Don't re-enable;
+    // just bring our persisted state in sync so the settings UI reflects
+    // reality on the next render.
+    if (persisted.enabled) {
+      await writePersisted({ ...persisted, enabled: false });
+    }
+    return;
+  }
+
+  // OS says we're enabled — refresh the registration so the recorded path /
+  // args match this binary. Idempotent on macOS; corrects path drift on
+  // Windows and Linux. If our persisted state was somehow stale we also
+  // bring it back in line.
+  try {
+    applySystemRegistration(true, persisted.openAsHidden);
+    if (!persisted.enabled) {
+      await writePersisted({ ...persisted, enabled: true });
+    }
+  } catch (err) {
+    console.error('[auto-launch] Failed to re-assert login item:', err);
+  }
+}
+
+function readOsEnabled(): boolean {
+  if (process.platform === 'linux') return readLinuxDesktopFile();
+  return app.getLoginItemSettings().openAtLogin;
+}
+
+// True when the OS launched us as part of login (used for analytics).
+export function wasLaunchedAtLogin(): boolean {
+  if (process.argv.includes(HIDDEN_FLAG)) return true;
+  if (process.platform === 'darwin') {
+    const settings = app.getLoginItemSettings();
+    return settings.wasOpenedAtLogin || settings.wasOpenedAsHidden;
+  }
+  return false;
+}
+
+// Used for boot UI behavior. On macOS we only start hidden when the OS
+// explicitly launched the app as hidden, not merely "at login".
+export function shouldStartHidden(): boolean {
+  if (process.argv.includes(HIDDEN_FLAG)) return true;
+  if (process.platform === 'darwin') {
+    const settings = app.getLoginItemSettings();
+    return settings.wasOpenedAsHidden;
+  }
+  return false;
+}
diff --git a/surfsense_desktop/src/modules/auto-updater.ts b/surfsense_desktop/src/modules/auto-updater.ts
index 47a85b7306..e323abe535 100644
--- a/surfsense_desktop/src/modules/auto-updater.ts
+++ b/surfsense_desktop/src/modules/auto-updater.ts
@@ -1,4 +1,5 @@
 import { app, dialog } from 'electron';
+import { trackEvent } from './analytics';
 
 const SEMVER_RE = /^\d+\.\d+\.\d+/;
 
@@ -17,10 +18,18 @@ export function setupAutoUpdater(): void {
 
   autoUpdater.on('update-available', (info: { version: string }) => {
     console.log(`Update available: ${info.version}`);
+    trackEvent('desktop_update_available', {
+      current_version: version,
+      new_version: info.version,
+    });
   });
 
   autoUpdater.on('update-downloaded', (info: { version: string }) => {
     console.log(`Update downloaded: ${info.version}`);
+    trackEvent('desktop_update_downloaded', {
+      current_version: version,
+      new_version: info.version,
+    });
     dialog.showMessageBox({
       type: 'info',
       buttons: ['Restart', 'Later'],
@@ -29,13 +38,19 @@ export function setupAutoUpdater(): void {
       message: `Version ${info.version} has been downloaded. Restart to apply the update.`,
     }).then(({ response }: { response: number }) => {
       if (response === 0) {
+        trackEvent('desktop_update_install_accepted', { new_version: info.version });
         autoUpdater.quitAndInstall();
+      } else {
+        trackEvent('desktop_update_install_deferred', { new_version: info.version });
       }
     });
   });
 
   autoUpdater.on('error', (err: Error) => {
     console.log('Auto-updater: update check skipped —', err.message?.split('\n')[0]);
+    trackEvent('desktop_update_error', {
+      message: err.message?.split('\n')[0],
+    });
   });
 
   autoUpdater.checkForUpdates().catch(() => {});
diff --git a/surfsense_desktop/src/modules/autocomplete/index.ts b/surfsense_desktop/src/modules/autocomplete/index.ts
deleted file mode 100644
index d4eb727fdc..0000000000
--- a/surfsense_desktop/src/modules/autocomplete/index.ts
+++ /dev/null
@@ -1,143 +0,0 @@
-import { clipboard, globalShortcut, ipcMain, screen } from 'electron';
-import { IPC_CHANNELS } from '../../ipc/channels';
-import { getFrontmostApp, getWindowTitle, hasAccessibilityPermission, simulatePaste } from '../platform';
-import { hasScreenRecordingPermission, requestAccessibility, requestScreenRecording } from '../permissions';
-import { captureScreen } from './screenshot';
-import { createSuggestionWindow, destroySuggestion, getSuggestionWindow } from './suggestion-window';
-import { getShortcuts } from '../shortcuts';
-import { getActiveSearchSpaceId } from '../active-search-space';
-import { trackEvent } from '../analytics';
-
-let currentShortcut = '';
-let autocompleteEnabled = true;
-let savedClipboard = '';
-let sourceApp = '';
-
-function isSurfSenseWindow(): boolean {
-  const app = getFrontmostApp();
-  return app === 'Electron' || app === 'SurfSense' || app === 'surfsense-desktop';
-}
-
-async function triggerAutocomplete(): Promise<void> {
-  if (!autocompleteEnabled) return;
-  if (isSurfSenseWindow()) return;
-
-  if (!hasScreenRecordingPermission()) {
-    requestScreenRecording();
-    return;
-  }
-
-  sourceApp = getFrontmostApp();
-  const windowTitle = getWindowTitle();
-  savedClipboard = clipboard.readText();
-
-  const screenshot = await captureScreen();
-  if (!screenshot) {
-    console.error('[autocomplete] Screenshot capture failed');
-    return;
-  }
-
-  const searchSpaceId = await getActiveSearchSpaceId();
-  if (!searchSpaceId) {
-    console.warn('[autocomplete] No active search space. Select a search space first.');
-    return;
-  }
-  trackEvent('desktop_autocomplete_triggered', { search_space_id: searchSpaceId });
-  const cursor = screen.getCursorScreenPoint();
-  const win = createSuggestionWindow(cursor.x, cursor.y);
-
-  win.webContents.once('did-finish-load', () => {
-    const sw = getSuggestionWindow();
-    setTimeout(() => {
-      if (sw && !sw.isDestroyed()) {
-        sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, {
-          screenshot,
-          searchSpaceId,
-          appName: sourceApp,
-          windowTitle,
-        });
-      }
-    }, 300);
-  });
-}
-
-async function acceptAndInject(text: string): Promise<void> {
-  if (!sourceApp) return;
-
-  if (!hasAccessibilityPermission()) {
-    requestAccessibility();
-    return;
-  }
-
-  clipboard.writeText(text);
-  destroySuggestion();
-
-  try {
-    await new Promise((r) => setTimeout(r, 50));
-    simulatePaste();
-    await new Promise((r) => setTimeout(r, 100));
-    clipboard.writeText(savedClipboard);
-  } catch {
-    clipboard.writeText(savedClipboard);
-  }
-}
-
-let ipcRegistered = false;
-
-function registerIpcHandlers(): void {
-  if (ipcRegistered) return;
-  ipcRegistered = true;
-
-  ipcMain.handle(IPC_CHANNELS.ACCEPT_SUGGESTION, async (_event, text: string) => {
-    trackEvent('desktop_autocomplete_accepted');
-    await acceptAndInject(text);
-  });
-  ipcMain.handle(IPC_CHANNELS.DISMISS_SUGGESTION, () => {
-    trackEvent('desktop_autocomplete_dismissed');
-    destroySuggestion();
-  });
-  ipcMain.handle(IPC_CHANNELS.SET_AUTOCOMPLETE_ENABLED, (_event, enabled: boolean) => {
-    autocompleteEnabled = enabled;
-    if (!enabled) {
-      destroySuggestion();
-    }
-  });
-  ipcMain.handle(IPC_CHANNELS.GET_AUTOCOMPLETE_ENABLED, () => autocompleteEnabled);
-}
-
-function autocompleteHandler(): void {
-  const sw = getSuggestionWindow();
-  if (sw && !sw.isDestroyed()) {
-    destroySuggestion();
-    return;
-  }
-  triggerAutocomplete();
-}
-
-async function registerShortcut(): Promise<void> {
-  const shortcuts = await getShortcuts();
-  currentShortcut = shortcuts.autocomplete;
-
-  const ok = globalShortcut.register(currentShortcut, autocompleteHandler);
-
-  if (!ok) {
-    console.error(`[autocomplete] Failed to register shortcut ${currentShortcut}`);
-  } else {
-    console.log(`[autocomplete] Registered shortcut ${currentShortcut}`);
-  }
-}
-
-export async function registerAutocomplete(): Promise<void> {
-  registerIpcHandlers();
-  await registerShortcut();
-}
-
-export function unregisterAutocomplete(): void {
-  if (currentShortcut) globalShortcut.unregister(currentShortcut);
-  destroySuggestion();
-}
-
-export async function reregisterAutocomplete(): Promise<void> {
-  unregisterAutocomplete();
-  await registerShortcut();
-}
diff --git a/surfsense_desktop/src/modules/autocomplete/screenshot.ts b/surfsense_desktop/src/modules/autocomplete/screenshot.ts
deleted file mode 100644
index 22b7c1b143..0000000000
--- a/surfsense_desktop/src/modules/autocomplete/screenshot.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-import { desktopCapturer, screen } from 'electron';
-
-/**
- * Captures the primary display as a base64-encoded PNG data URL.
- * Uses the display's actual size for full-resolution capture.
- */
-export async function captureScreen(): Promise<string | null> {
-  try {
-    const primaryDisplay = screen.getPrimaryDisplay();
-    const { width, height } = primaryDisplay.size;
-
-    const sources = await desktopCapturer.getSources({
-      types: ['screen'],
-      thumbnailSize: { width, height },
-    });
-
-    if (!sources.length) {
-      console.error('[screenshot] No screen sources found');
-      return null;
-    }
-
-    return sources[0].thumbnail.toDataURL();
-  } catch (err) {
-    console.error('[screenshot] Failed to capture screen:', err);
-    return null;
-  }
-}
diff --git a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts
deleted file mode 100644
index 8f61b29015..0000000000
--- a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts
+++ /dev/null
@@ -1,112 +0,0 @@
-import { BrowserWindow, screen, shell } from 'electron';
-import path from 'path';
-import { getServerPort } from '../server';
-
-const TOOLTIP_WIDTH = 420;
-const TOOLTIP_HEIGHT = 38;
-const MAX_HEIGHT = 400;
-
-let suggestionWindow: BrowserWindow | null = null;
-let resizeTimer: ReturnType<typeof setInterval> | null = null;
-let cursorOrigin = { x: 0, y: 0 };
-
-const CURSOR_GAP = 20;
-
-function positionOnScreen(cursorX: number, cursorY: number, w: number, h: number): { x: number; y: number } {
-  const display = screen.getDisplayNearestPoint({ x: cursorX, y: cursorY });
-  const { x: dx, y: dy, width: dw, height: dh } = display.workArea;
-
-  const x = Math.max(dx, Math.min(cursorX, dx + dw - w));
-
-  const spaceBelow = (dy + dh) - (cursorY + CURSOR_GAP);
-  const y = spaceBelow >= h
-    ? cursorY + CURSOR_GAP
-    : cursorY - h - CURSOR_GAP;
-
-  return { x, y: Math.max(dy, y) };
-}
-
-function stopResizePolling(): void {
-  if (resizeTimer) { clearInterval(resizeTimer); resizeTimer = null; }
-}
-
-function startResizePolling(win: BrowserWindow): void {
-  stopResizePolling();
-  let lastH = 0;
-  resizeTimer = setInterval(async () => {
-    if (!win || win.isDestroyed()) { stopResizePolling(); return; }
-    try {
-      const h: number = await win.webContents.executeJavaScript(
-        `document.body.scrollHeight`
-      );
-      if (h > 0 && h !== lastH) {
-        lastH = h;
-        const clamped = Math.min(h, MAX_HEIGHT);
-        const pos = positionOnScreen(cursorOrigin.x, cursorOrigin.y, TOOLTIP_WIDTH, clamped);
-        win.setBounds({ x: pos.x, y: pos.y, width: TOOLTIP_WIDTH, height: clamped });
-      }
-    } catch {}
-  }, 150);
-}
-
-export function getSuggestionWindow(): BrowserWindow | null {
-  return suggestionWindow;
-}
-
-export function destroySuggestion(): void {
-  stopResizePolling();
-  if (suggestionWindow && !suggestionWindow.isDestroyed()) {
-    suggestionWindow.close();
-  }
-  suggestionWindow = null;
-}
-
-export function createSuggestionWindow(x: number, y: number): BrowserWindow {
-  destroySuggestion();
-  cursorOrigin = { x, y };
-
-  const pos = positionOnScreen(x, y, TOOLTIP_WIDTH, TOOLTIP_HEIGHT);
-
-  suggestionWindow = new BrowserWindow({
-    width: TOOLTIP_WIDTH,
-    height: TOOLTIP_HEIGHT,
-    x: pos.x,
-    y: pos.y,
-    frame: false,
-    transparent: true,
-    focusable: false,
-    alwaysOnTop: true,
-    skipTaskbar: true,
-    hasShadow: true,
-    type: 'panel',
-    webPreferences: {
-      preload: path.join(__dirname, 'preload.js'),
-      contextIsolation: true,
-      nodeIntegration: false,
-      sandbox: true,
-    },
-    show: false,
-  });
-
-  suggestionWindow.loadURL(`http://localhost:${getServerPort()}/desktop/suggestion?t=${Date.now()}`);
-
-  suggestionWindow.once('ready-to-show', () => {
-    suggestionWindow?.showInactive();
-    if (suggestionWindow) startResizePolling(suggestionWindow);
-  });
-
-  suggestionWindow.webContents.setWindowOpenHandler(({ url }) => {
-    if (url.startsWith('http://localhost')) {
-      return { action: 'allow' };
-    }
-    shell.openExternal(url);
-    return { action: 'deny' };
-  });
-
-  suggestionWindow.on('closed', () => {
-    stopResizePolling();
-    suggestionWindow = null;
-  });
-
-  return suggestionWindow;
-}
diff --git a/surfsense_desktop/src/modules/deep-links.ts b/surfsense_desktop/src/modules/deep-links.ts
index 1a2b083953..11b7bfcff2 100644
--- a/surfsense_desktop/src/modules/deep-links.ts
+++ b/surfsense_desktop/src/modules/deep-links.ts
@@ -2,6 +2,7 @@ import { app } from 'electron';
 import path from 'path';
 import { getMainWindow } from './window';
 import { getServerPort } from './server';
+import { trackEvent } from './analytics';
 
 const PROTOCOL = 'surfsense';
 
@@ -16,6 +17,10 @@ function handleDeepLink(url: string) {
   if (!win) return;
 
   const parsed = new URL(url);
+  trackEvent('desktop_deep_link_received', {
+    host: parsed.hostname,
+    path: parsed.pathname,
+  });
   if (parsed.hostname === 'auth' && parsed.pathname === '/callback') {
     const params = parsed.searchParams.toString();
     win.loadURL(`http://localhost:${getServerPort()}/auth/callback?${params}`);
@@ -64,3 +69,10 @@ export function handlePendingDeepLink(): void {
     deepLinkUrl = null;
   }
 }
+
+// True when a deep link arrived before the main window existed. Callers can
+// use this to force-create a window even on a "started hidden" boot, so we
+// don't silently swallow a `surfsense://` URL the user actually clicked on.
+export function hasPendingDeepLink(): boolean {
+  return deepLinkUrl !== null;
+}
diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts
index 96b490d7b8..ee4214d8a7 100644
--- a/surfsense_desktop/src/modules/folder-watcher.ts
+++ b/surfsense_desktop/src/modules/folder-watcher.ts
@@ -4,6 +4,7 @@ import { randomUUID } from 'crypto';
 import * as path from 'path';
 import * as fs from 'fs';
 import { IPC_CHANNELS } from '../ipc/channels';
+import { trackEvent } from './analytics';
 
 export interface WatchedFolderConfig {
   path: string;
@@ -401,6 +402,15 @@ export async function addWatchedFolder(
     await startWatcher(config);
   }
 
+  trackEvent('desktop_folder_watch_added', {
+    search_space_id: config.searchSpaceId,
+    root_folder_id: config.rootFolderId,
+    active: config.active,
+    has_exclude_patterns: (config.excludePatterns?.length ?? 0) > 0,
+    has_extension_filter: !!config.fileExtensions && config.fileExtensions.length > 0,
+    is_update: existing >= 0,
+  });
+
   return folders;
 }
 
@@ -409,6 +419,7 @@ export async function removeWatchedFolder(
 ): Promise<WatchedFolderConfig[]> {
   const s = await getStore();
   const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []);
+  const removed = folders.find((f: WatchedFolderConfig) => f.path === folderPath);
   const updated = folders.filter((f: WatchedFolderConfig) => f.path !== folderPath);
   s.set(STORE_KEY, updated);
 
@@ -418,6 +429,13 @@ export async function removeWatchedFolder(
   const ms = await getMtimeStore();
   ms.delete(folderPath);
 
+  if (removed) {
+    trackEvent('desktop_folder_watch_removed', {
+      search_space_id: removed.searchSpaceId,
+      root_folder_id: removed.rootFolderId,
+    });
+  }
+
   return updated;
 }
 
diff --git a/surfsense_desktop/src/modules/general-assist.ts b/surfsense_desktop/src/modules/general-assist.ts
new file mode 100644
index 0000000000..7d202caa21
--- /dev/null
+++ b/surfsense_desktop/src/modules/general-assist.ts
@@ -0,0 +1,5 @@
+import { showMainWindow } from './window';
+
+export function runGeneralAssistShortcut(): void {
+  showMainWindow('shortcut');
+}
diff --git a/surfsense_desktop/src/modules/screen-capture/index.ts b/surfsense_desktop/src/modules/screen-capture/index.ts
new file mode 100644
index 0000000000..6c1c75509f
--- /dev/null
+++ b/surfsense_desktop/src/modules/screen-capture/index.ts
@@ -0,0 +1,7 @@
+/**
+ * Window capture for Screenshot Assist and chat fullscreen: single-session
+ * desktopCapturer, region overlay, and shortcut entry point.
+ */
+export { pickOpenWindowCapture, type PickedWindowResult } from './window-picker';
+export { pickScreenRegion, captureCurrentDisplayDataUrl } from './screen-region-picker';
+export { runScreenshotAssistShortcut } from './screenshot-assist';
diff --git a/surfsense_desktop/src/modules/screen-capture/screen-region-picker.ts b/surfsense_desktop/src/modules/screen-capture/screen-region-picker.ts
new file mode 100644
index 0000000000..fd771b0f7c
--- /dev/null
+++ b/surfsense_desktop/src/modules/screen-capture/screen-region-picker.ts
@@ -0,0 +1,335 @@
+import { BrowserWindow, desktopCapturer, nativeImage, screen } from 'electron';
+import path from 'path';
+import { IPC_CHANNELS } from '../../ipc/channels';
+function fitNativeImageToWorkArea(img: Electron.NativeImage, display: Electron.Display): Electron.NativeImage {
+  const wa = display.workArea;
+  const { width: iw, height: ih } = img.getSize();
+  const scale = Math.min(1, wa.width / iw, wa.height / ih);
+  if (scale >= 1) return img;
+  return img.resize({
+    width: Math.max(1, Math.floor(iw * scale)),
+    height: Math.max(1, Math.floor(ih * scale)),
+    quality: 'best',
+  });
+}
+
+// One getSources per pick; overlay and final crop share that bitmap (avoids a second portal session, e.g. Wayland).
+
+let pickInProgress = false;
+
+type DisplayCaptureSnapshot = {
+  dataUrl: string;
+  width: number;
+  height: number;
+};
+
+async function captureDisplaySnapshot(display: Electron.Display): Promise<DisplayCaptureSnapshot | null> {
+  try {
+    const sf = display.scaleFactor || 1;
+    const tw = Math.max(1, Math.round(display.size.width * sf));
+    const th = Math.max(1, Math.round(display.size.height * sf));
+    const sources = await desktopCapturer.getSources({
+      types: ['screen'],
+      thumbnailSize: { width: tw, height: th },
+    });
+    if (!sources.length) return null;
+    const idStr = String(display.id);
+    let chosen =
+      sources.find((s) => s.display_id === idStr) ||
+      sources.find((s) => s.display_id && s.display_id === idStr) ||
+      null;
+    if (!chosen && screen.getPrimaryDisplay().id === display.id) {
+      chosen = sources[0];
+    }
+    if (!chosen) chosen = sources[0];
+    const dataUrl = chosen.thumbnail.toDataURL();
+    const { width, height } = chosen.thumbnail.getSize();
+    return { dataUrl, width, height };
+  } catch {
+    return null;
+  }
+}
+
+export async function captureCurrentDisplayDataUrl(): Promise<string | null> {
+  const display = screen.getDisplayNearestPoint(screen.getCursorScreenPoint());
+  const snapshot = await captureDisplaySnapshot(display);
+  return snapshot?.dataUrl ?? null;
+}
+
+function buildInjectScript(dataUrl: string, iw: number, ih: number): string {
+  return `(() => {
+    const api = window.surfsenseScreenRegion;
+    if (!api) return;
+    const dataUrl = ${JSON.stringify(dataUrl)};
+    const iw = ${iw};
+    const ih = ${ih};
+    document.body.style.margin = '0';
+    document.body.style.overflow = 'hidden';
+    document.body.style.background = '#000';
+    const img = document.createElement('img');
+    img.draggable = false;
+    img.src = dataUrl;
+    img.style.cssText = 'position:fixed;inset:0;width:100vw;height:100vh;object-fit:fill;user-select:none;pointer-events:none;';
+    const veil = document.createElement('div');
+    veil.style.cssText = 'position:fixed;inset:0;cursor:crosshair;background:rgba(0,0,0,0.15);';
+    const sel = document.createElement('div');
+    sel.style.cssText = 'position:fixed;border:2px solid #38bdf8;box-shadow:0 0 0 9999px rgba(0,0,0,0.45);display:none;pointer-events:none;z-index:2;';
+    document.body.appendChild(img);
+    document.body.appendChild(veil);
+    document.body.appendChild(sel);
+    let ax = 0, ay = 0, dragging = false;
+    function show(x0, y0, x1, y1) {
+      const l = Math.min(x0, x1), t = Math.min(y0, y1);
+      const w = Math.abs(x1 - x0), h = Math.abs(y1 - y0);
+      if (w < 2 || h < 2) { sel.style.display = 'none'; return; }
+      sel.style.display = 'block';
+      sel.style.left = l + 'px';
+      sel.style.top = t + 'px';
+      sel.style.width = w + 'px';
+      sel.style.height = h + 'px';
+    }
+    function mapRect(l, t, w, h) {
+      const vw = window.innerWidth, vh = window.innerHeight;
+      const sx = Math.round((l / vw) * iw);
+      const sy = Math.round((t / vh) * ih);
+      const sw = Math.max(1, Math.round((w / vw) * iw));
+      const sh = Math.max(1, Math.round((h / vh) * ih));
+      const cx = Math.min(Math.max(0, sx), iw - 1);
+      const cy = Math.min(Math.max(0, sy), ih - 1);
+      const cw = Math.min(sw, iw - cx);
+      const ch = Math.min(sh, ih - cy);
+      return { x: cx, y: cy, width: cw, height: ch };
+    }
+    function endDrag(clientX, clientY, pointerId) {
+      if (!dragging) return;
+      dragging = false;
+      if (typeof pointerId === 'number' && pointerId >= 0) {
+        try { veil.releasePointerCapture(pointerId); } catch (_) {}
+      }
+      const l = Math.min(ax, clientX), t = Math.min(ay, clientY);
+      const w = Math.abs(clientX - ax), h = Math.abs(clientY - ay);
+      if (w < 4 || h < 4) { sel.style.display = 'none'; return; }
+      api.submit(mapRect(l, t, w, h));
+    }
+    veil.addEventListener('pointerdown', (e) => {
+      if (e.button !== 0) return;
+      try { veil.setPointerCapture(e.pointerId); } catch (_) {}
+      dragging = true;
+      ax = e.clientX; ay = e.clientY;
+      show(ax, ay, ax, ay);
+    });
+    veil.addEventListener('pointermove', (e) => {
+      if (!dragging) return;
+      show(ax, ay, e.clientX, e.clientY);
+    });
+    veil.addEventListener('pointerup', (e) => {
+      endDrag(e.clientX, e.clientY, e.pointerId);
+    });
+    window.addEventListener('pointerup', (e) => {
+      endDrag(e.clientX, e.clientY, e.pointerId);
+    });
+    document.addEventListener(
+      'mouseup',
+      (e) => {
+        endDrag(e.clientX, e.clientY, -1);
+      },
+      true
+    );
+    veil.addEventListener('pointercancel', (e) => {
+      if (!dragging) return;
+      dragging = false;
+      try { veil.releasePointerCapture(e.pointerId); } catch (_) {}
+      sel.style.display = 'none';
+    });
+    window.addEventListener('keydown', (e) => {
+      if (e.key === 'Escape') { api.cancel(); return; }
+      if (e.key === 'Enter' && sel.style.display === 'block') {
+        const l = parseFloat(sel.style.left), t = parseFloat(sel.style.top);
+        const w = parseFloat(sel.style.width), h = parseFloat(sel.style.height);
+        if (w >= 4 && h >= 4) api.submit(mapRect(l, t, w, h));
+      }
+    });
+  })();`;
+}
+
+export function pickScreenRegion(opts?: { windowDataUrl?: string }): Promise<string | null> {
+  if (pickInProgress) return Promise.resolve(null);
+  pickInProgress = true;
+
+  return new Promise((resolve) => {
+    const display = screen.getDisplayNearestPoint(screen.getCursorScreenPoint());
+    let settled = false;
+    let overlay: BrowserWindow | null = null;
+    /** webContents for listener removal after `BrowserWindow` may already be destroyed. */
+    let overlayWc: Electron.WebContents | null = null;
+
+    const cleanupListeners = () => {
+      const wc = overlayWc;
+      overlayWc = null;
+      if (!wc || wc.isDestroyed()) return;
+      wc.removeListener('before-input-event', onBeforeInput);
+      wc.ipc.removeListener(IPC_CHANNELS.SCREEN_REGION_SUBMIT, onSubmit);
+      wc.ipc.removeListener(IPC_CHANNELS.SCREEN_REGION_CANCEL, onCancel);
+    };
+
+    const finish = (result: string | null) => {
+      if (settled) return;
+      settled = true;
+      pickInProgress = false;
+      cleanupListeners();
+      if (overlay && !overlay.isDestroyed()) {
+        overlay.removeAllListeners('closed');
+        overlay.close();
+      }
+      overlay = null;
+      resolve(result);
+    };
+
+    let snapshot: DisplayCaptureSnapshot | null = null;
+    let cropSource: Electron.NativeImage | null = null;
+
+    const onSubmit = (
+      _event: Electron.IpcMainEvent,
+      rect: { x: number; y: number; width: number; height: number }
+    ) => {
+      if (settled || !overlay || overlay.isDestroyed()) return;
+      if (!rect || rect.width < 1 || rect.height < 1) {
+        finish(null);
+        return;
+      }
+      if (!snapshot || !cropSource) {
+        finish(null);
+        return;
+      }
+      try {
+        const iw = snapshot.width;
+        const ih = snapshot.height;
+        const { width: cw, height: ch } = cropSource.getSize();
+        const scaleX = cw / iw;
+        const scaleY = ch / ih;
+        const ox = Math.floor(rect.x * scaleX);
+        const oy = Math.floor(rect.y * scaleY);
+        const ow = Math.min(Math.floor(rect.width * scaleX), cw - ox);
+        const oh = Math.min(Math.floor(rect.height * scaleY), ch - oy);
+        const cropped = cropSource.crop({
+          x: ox,
+          y: oy,
+          width: Math.max(1, ow),
+          height: Math.max(1, oh),
+        });
+        finish(cropped.toDataURL());
+      } catch {
+        finish(null);
+      }
+    };
+
+    const onCancel = (_event: Electron.IpcMainEvent) => {
+      if (settled || !overlay || overlay.isDestroyed()) return;
+      finish(null);
+    };
+
+    const onBeforeInput = (_event: Electron.Event, input: Electron.Input) => {
+      if (input.type === 'keyDown' && input.key === 'Escape') {
+        finish(null);
+      }
+    };
+
+    const openOverlay = (
+      cap: DisplayCaptureSnapshot,
+      crop: Electron.NativeImage,
+      bounds: { x: number; y: number; width: number; height: number }
+    ) => {
+      snapshot = cap;
+      cropSource = crop;
+
+      overlay = new BrowserWindow({
+        x: bounds.x,
+        y: bounds.y,
+        width: bounds.width,
+        height: bounds.height,
+        frame: false,
+        transparent: true,
+        fullscreenable: false,
+        skipTaskbar: true,
+        alwaysOnTop: true,
+        focusable: true,
+        show: false,
+        autoHideMenuBar: true,
+        backgroundColor: '#00000000',
+        webPreferences: {
+            preload: path.join(__dirname, 'modules', 'screen-capture', 'screen-region-preload.js'),
+          contextIsolation: true,
+          nodeIntegration: false,
+          sandbox: true,
+        },
+      });
+
+      overlayWc = overlay.webContents;
+      overlayWc.on('before-input-event', onBeforeInput);
+      overlayWc.ipc.on(IPC_CHANNELS.SCREEN_REGION_SUBMIT, onSubmit);
+      overlayWc.ipc.on(IPC_CHANNELS.SCREEN_REGION_CANCEL, onCancel);
+
+      overlay.setIgnoreMouseEvents(false);
+      overlay.loadURL(
+        'data:text/html;charset=utf-8,' +
+          encodeURIComponent('<!doctype html><html><head><meta charset="utf-8"/></head><body></body></html>')
+      );
+
+      overlay.on('closed', () => {
+        if (!settled) finish(null);
+      });
+
+      overlay.webContents.once('did-finish-load', () => {
+        if (!overlay || overlay.isDestroyed()) return;
+        overlay.webContents
+          .executeJavaScript(buildInjectScript(cap.dataUrl, cap.width, cap.height), true)
+          .then(() => {
+            overlay?.show();
+            overlay?.focus();
+          })
+          .catch(() => {
+            finish(null);
+          });
+      });
+    };
+
+    void (async () => {
+      try {
+        if (opts?.windowDataUrl) {
+          const fullRes = nativeImage.createFromDataURL(opts.windowDataUrl);
+          if (fullRes.isEmpty()) {
+            finish(null);
+            return;
+          }
+          const fitted = fitNativeImageToWorkArea(fullRes, display);
+          const fw = fitted.getSize().width;
+          const fh = fitted.getSize().height;
+          const wa = display.workArea;
+          const x = wa.x + Math.floor((wa.width - fw) / 2);
+          const y = wa.y + Math.floor((wa.height - fh) / 2);
+          openOverlay(
+            { dataUrl: fitted.toDataURL(), width: fw, height: fh },
+            fullRes,
+            { x, y, width: fw, height: fh }
+          );
+          return;
+        }
+
+        const cap = await captureDisplaySnapshot(display);
+        if (!cap) {
+          finish(null);
+          return;
+        }
+        const crop = nativeImage.createFromDataURL(cap.dataUrl);
+        openOverlay(cap, crop, {
+          x: display.bounds.x,
+          y: display.bounds.y,
+          width: display.bounds.width,
+          height: display.bounds.height,
+        });
+      } catch {
+        finish(null);
+      }
+    })();
+  });
+}
diff --git a/surfsense_desktop/src/modules/screen-capture/screen-region-preload.ts b/surfsense_desktop/src/modules/screen-capture/screen-region-preload.ts
new file mode 100644
index 0000000000..4263e0f6e1
--- /dev/null
+++ b/surfsense_desktop/src/modules/screen-capture/screen-region-preload.ts
@@ -0,0 +1,11 @@
+import { contextBridge, ipcRenderer } from 'electron';
+import { IPC_CHANNELS } from '../../ipc/channels';
+
+contextBridge.exposeInMainWorld('surfsenseScreenRegion', {
+  submit: (rect: { x: number; y: number; width: number; height: number }) => {
+    ipcRenderer.send(IPC_CHANNELS.SCREEN_REGION_SUBMIT, rect);
+  },
+  cancel: () => {
+    ipcRenderer.send(IPC_CHANNELS.SCREEN_REGION_CANCEL);
+  },
+});
diff --git a/surfsense_desktop/src/modules/screen-capture/screenshot-assist.ts b/surfsense_desktop/src/modules/screen-capture/screenshot-assist.ts
new file mode 100644
index 0000000000..171b98a579
--- /dev/null
+++ b/surfsense_desktop/src/modules/screen-capture/screenshot-assist.ts
@@ -0,0 +1,26 @@
+import { IPC_CHANNELS } from '../../ipc/channels';
+import { trackEvent } from '../analytics';
+import { pickScreenRegion } from './screen-region-picker';
+import { pickOpenWindowCapture } from './window-picker';
+import { getMainWindow, showMainWindow } from '../window';
+import { hasScreenRecordingPermission, requestScreenRecording } from '../permissions';
+
+export async function runScreenshotAssistShortcut(): Promise<void> {
+  if (!hasScreenRecordingPermission()) {
+    requestScreenRecording();
+    return;
+  }
+
+  const picked = await pickOpenWindowCapture();
+  if (!picked) return;
+
+  const url = await pickScreenRegion({ windowDataUrl: picked.dataUrl });
+  if (!url) return;
+
+  showMainWindow('shortcut');
+  const mw = getMainWindow();
+  if (mw && !mw.isDestroyed()) {
+    mw.webContents.send(IPC_CHANNELS.CHAT_SCREEN_CAPTURE, url);
+    trackEvent('desktop_screenshot_assist_region_to_chat', {});
+  }
+}
diff --git a/surfsense_desktop/src/modules/screen-capture/window-picker-preload.ts b/surfsense_desktop/src/modules/screen-capture/window-picker-preload.ts
new file mode 100644
index 0000000000..dd0acd81ea
--- /dev/null
+++ b/surfsense_desktop/src/modules/screen-capture/window-picker-preload.ts
@@ -0,0 +1,15 @@
+import { contextBridge, ipcRenderer } from 'electron';
+import { IPC_CHANNELS } from '../../ipc/channels';
+
+contextBridge.exposeInMainWorld('surfsenseWindowPick', {
+  list: () =>
+    ipcRenderer.invoke(IPC_CHANNELS.WINDOW_PICK_LIST) as Promise<
+      { id: string; name: string; thumbDataUrl: string }[]
+    >,
+  submit: (sourceId: string) => {
+    ipcRenderer.send(IPC_CHANNELS.WINDOW_PICK_SUBMIT, sourceId);
+  },
+  cancel: () => {
+    ipcRenderer.send(IPC_CHANNELS.WINDOW_PICK_CANCEL);
+  },
+});
diff --git a/surfsense_desktop/src/modules/screen-capture/window-picker.ts b/surfsense_desktop/src/modules/screen-capture/window-picker.ts
new file mode 100644
index 0000000000..b66e23c5c7
--- /dev/null
+++ b/surfsense_desktop/src/modules/screen-capture/window-picker.ts
@@ -0,0 +1,244 @@
+import { BrowserWindow, desktopCapturer, ipcMain, screen } from 'electron';
+import path from 'path';
+import { IPC_CHANNELS } from '../../ipc/channels';
+
+let pickInProgress = false;
+
+const PREVIEW_THUMB = { width: 280, height: 180 } as const;
+
+function maxCaptureThumbSize(): { width: number; height: number } {
+  const d = screen.getPrimaryDisplay();
+  const sf = d.scaleFactor || 1;
+  const w = Math.min(3840, Math.max(1280, Math.round(d.size.width * sf)));
+  const h = Math.min(2160, Math.max(720, Math.round(d.size.height * sf)));
+  return { width: w, height: h };
+}
+
+function isDesktopWindowSourceId(s: string): boolean {
+  return typeof s === 'string' && s.startsWith('window:');
+}
+
+export type PickedWindowResult = {
+  sourceId: string;
+  /** Same pixels as the one `desktopCapturer` snapshot (max thumbnail size). */
+  dataUrl: string;
+};
+
+function buildPickerInjectScript(): string {
+  return `(async function () {
+    const api = window.surfsenseWindowPick;
+    if (!api) return;
+    const items = await api.list();
+    document.body.style.cssText =
+      'margin:0;font-family:system-ui,-apple-system,sans-serif;background:#0f172a;color:#e2e8f0;min-height:100vh;padding:16px;box-sizing:border-box;';
+    const top = document.createElement('div');
+    top.style.cssText =
+      'display:flex;justify-content:space-between;align-items:center;margin-bottom:12px;flex-wrap:wrap;gap:8px;';
+    const t = document.createElement('strong');
+    t.textContent = 'Open windows';
+    const hint = document.createElement('span');
+    hint.style.cssText = 'opacity:0.75;font-size:13px;';
+    hint.textContent = 'Click a window · Esc to cancel';
+    top.appendChild(t);
+    top.appendChild(hint);
+    document.body.appendChild(top);
+    if (!items || !items.length) {
+      const p = document.createElement('p');
+      p.style.cssText = 'line-height:1.5;max-width:42rem;';
+      p.textContent =
+        'No windows were returned by the system. On Linux, allow screen capture when prompted. If other apps are open, try again.';
+      document.body.appendChild(p);
+      return;
+    }
+    const grid = document.createElement('div');
+    grid.style.cssText =
+      'display:grid;grid-template-columns:repeat(auto-fill,minmax(200px,1fr));gap:12px;max-height:calc(100vh - 88px);overflow:auto;padding-bottom:8px;';
+    for (const it of items) {
+      const card = document.createElement('button');
+      card.type = 'button';
+      card.style.cssText =
+        'text-align:left;background:#1e293b;border:1px solid #334155;border-radius:8px;padding:8px;cursor:pointer;color:inherit;';
+      card.addEventListener('mouseenter', function () {
+        card.style.borderColor = '#38bdf8';
+      });
+      card.addEventListener('mouseleave', function () {
+        card.style.borderColor = '#334155';
+      });
+      const img = document.createElement('img');
+      img.alt = '';
+      img.src =
+        it.thumbDataUrl ||
+        'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///ywAAAAAAQABAAACAUwAOw==';
+      img.style.cssText =
+        'width:100%;height:100px;object-fit:cover;border-radius:4px;background:#000;display:block;';
+      const cap = document.createElement('div');
+      cap.textContent = it.name || '(untitled)';
+      cap.style.cssText =
+        'margin-top:6px;font-size:12px;line-height:1.35;overflow:hidden;text-overflow:ellipsis;display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;';
+      card.appendChild(img);
+      card.appendChild(cap);
+      card.addEventListener('click', function () {
+        api.submit(it.id);
+      });
+      grid.appendChild(card);
+    }
+    document.body.appendChild(grid);
+    window.addEventListener('keydown', function (e) {
+      if (e.key === 'Escape') api.cancel();
+    });
+  })();`;
+}
+
+/**
+ * One OS / Chromium capture session: `getSources` runs once (important on Wayland /
+ * PipeWire so the portal is not opened again for the same flow). Opens our grid to
+ * choose a window; resolves with the chosen snapshot for region or full-frame use.
+ */
+export function pickOpenWindowCapture(): Promise<PickedWindowResult | null> {
+  if (pickInProgress) return Promise.resolve(null);
+  pickInProgress = true;
+
+  return new Promise((resolve) => {
+    let settled = false;
+    let picker: BrowserWindow | null = null;
+    let pickerWc: Electron.WebContents | null = null;
+    /** Filled once before the grid runs — reused for list + final image (no second getSources). */
+    let sessionSources: Electron.DesktopCapturerSource[] = [];
+
+    const finish = (result: PickedWindowResult | null) => {
+      if (settled) return;
+      settled = true;
+      pickInProgress = false;
+      ipcMain.removeHandler(IPC_CHANNELS.WINDOW_PICK_LIST);
+      const wc = pickerWc;
+      pickerWc = null;
+      if (wc && !wc.isDestroyed()) {
+        wc.removeListener('before-input-event', onBeforeInput);
+        wc.ipc.removeListener(IPC_CHANNELS.WINDOW_PICK_SUBMIT, onSubmit);
+        wc.ipc.removeListener(IPC_CHANNELS.WINDOW_PICK_CANCEL, onCancel);
+      }
+      if (picker && !picker.isDestroyed()) {
+        picker.removeAllListeners('closed');
+        picker.close();
+      }
+      picker = null;
+      resolve(result);
+    };
+
+    const onSubmit = (_event: Electron.IpcMainEvent, sourceId: string) => {
+      if (settled || !picker || picker.isDestroyed()) return;
+      if (!isDesktopWindowSourceId(sourceId)) {
+        finish(null);
+        return;
+      }
+      const hit = sessionSources.find((s) => s.id === sourceId);
+      if (!hit || hit.thumbnail.isEmpty()) {
+        finish(null);
+        return;
+      }
+      finish({ sourceId, dataUrl: hit.thumbnail.toDataURL() });
+    };
+
+    const onCancel = () => {
+      if (settled || !picker || picker.isDestroyed()) return;
+      finish(null);
+    };
+
+    const onBeforeInput = (_event: Electron.Event, input: Electron.Input) => {
+      if (input.type === 'keyDown' && input.key === 'Escape') {
+        finish(null);
+      }
+    };
+
+    ipcMain.handle(IPC_CHANNELS.WINDOW_PICK_LIST, async () => {
+      return sessionSources.map((s, i) => {
+        let thumbDataUrl = '';
+        if (!s.thumbnail.isEmpty()) {
+          try {
+            const sm = s.thumbnail.resize({
+              width: PREVIEW_THUMB.width,
+              height: PREVIEW_THUMB.height,
+              quality: 'good',
+            });
+            thumbDataUrl = sm.toDataURL();
+          } catch {
+            thumbDataUrl = s.thumbnail.toDataURL();
+          }
+        }
+        return {
+          id: s.id,
+          name: (s.name || '').trim() || `Window ${i + 1}`,
+          thumbDataUrl,
+        };
+      });
+    });
+
+    picker = new BrowserWindow({
+      width: 760,
+      height: 560,
+      show: false,
+      center: true,
+      autoHideMenuBar: true,
+      title: 'SurfSense — choose window',
+      webPreferences: {
+        preload: path.join(__dirname, 'modules', 'screen-capture', 'window-picker-preload.js'),
+        contextIsolation: true,
+        nodeIntegration: false,
+        sandbox: true,
+      },
+    });
+
+    pickerWc = picker.webContents;
+
+    pickerWc.on('before-input-event', onBeforeInput);
+    pickerWc.ipc.on(IPC_CHANNELS.WINDOW_PICK_SUBMIT, onSubmit);
+    pickerWc.ipc.on(IPC_CHANNELS.WINDOW_PICK_CANCEL, onCancel);
+
+    picker.on('closed', () => {
+      if (!settled) finish(null);
+    });
+
+    picker
+      .loadURL(
+        'data:text/html;charset=utf-8,' +
+          encodeURIComponent('<!doctype html><html><head><meta charset="utf-8"/></head><body></body></html>')
+      )
+      .catch(() => finish(null));
+
+    picker.webContents.once('did-finish-load', () => {
+      void (async () => {
+        if (!picker || picker.isDestroyed()) return;
+        let selfId = '';
+        try {
+          selfId = picker.getMediaSourceId();
+        } catch {
+          selfId = '';
+        }
+        try {
+          const { width, height } = maxCaptureThumbSize();
+          const sources = await desktopCapturer.getSources({
+            types: ['window'],
+            thumbnailSize: { width, height },
+            fetchWindowIcons: false,
+          });
+          sessionSources = sources.filter((s) => !(selfId && s.id === selfId));
+        } catch {
+          sessionSources = [];
+        }
+        if (sessionSources.length === 1) {
+          const only = sessionSources[0];
+          if (!only.thumbnail.isEmpty()) {
+            finish({ sourceId: only.id, dataUrl: only.thumbnail.toDataURL() });
+            return;
+          }
+        }
+        try {
+          await picker.webContents.executeJavaScript(buildPickerInjectScript(), true);
+          if (!picker.isDestroyed()) picker.show();
+        } catch {
+          finish(null);
+        }
+      })();
+    });
+  });
+}
diff --git a/surfsense_desktop/src/modules/shortcuts.ts b/surfsense_desktop/src/modules/shortcuts.ts
index 6948a005e0..64687f7db9 100644
--- a/surfsense_desktop/src/modules/shortcuts.ts
+++ b/surfsense_desktop/src/modules/shortcuts.ts
@@ -1,13 +1,13 @@
 export interface ShortcutConfig {
   generalAssist: string;
   quickAsk: string;
-  autocomplete: string;
+  screenshotAssist: string;
 }
 
 const DEFAULTS: ShortcutConfig = {
   generalAssist: 'CommandOrControl+Shift+S',
   quickAsk: 'CommandOrControl+Alt+S',
-  autocomplete: 'CommandOrControl+Shift+Space',
+  screenshotAssist: 'CommandOrControl+Shift+Space',
 };
 
 const STORE_KEY = 'shortcuts';
@@ -27,14 +27,30 @@ async function getStore() {
 
 export async function getShortcuts(): Promise<ShortcutConfig> {
   const s = await getStore();
-  const stored = s.get(STORE_KEY) as Partial<ShortcutConfig> | undefined;
-  return { ...DEFAULTS, ...stored };
+  const raw = (s.get(STORE_KEY) as Record<string, string> | undefined) ?? {};
+  const legacyAutocomplete = raw.autocomplete;
+  const { autocomplete: _drop, ...rest } = raw;
+  let merged: ShortcutConfig = { ...DEFAULTS, ...rest };
+  if (
+    typeof legacyAutocomplete === 'string' &&
+    legacyAutocomplete.length > 0 &&
+    !('screenshotAssist' in raw)
+  ) {
+    merged = { ...merged, screenshotAssist: legacyAutocomplete };
+    s.set(STORE_KEY, {
+      generalAssist: merged.generalAssist,
+      quickAsk: merged.quickAsk,
+      screenshotAssist: merged.screenshotAssist,
+    });
+  }
+  return merged;
 }
 
 export async function setShortcuts(config: Partial<ShortcutConfig>): Promise<ShortcutConfig> {
   const s = await getStore();
-  const current = (s.get(STORE_KEY) as ShortcutConfig) ?? DEFAULTS;
-  const merged = { ...current, ...config };
+  const raw = (s.get(STORE_KEY) as Record<string, string> | undefined) ?? {};
+  const { autocomplete: _drop, ...current } = raw;
+  const merged = { ...DEFAULTS, ...current, ...config };
   s.set(STORE_KEY, merged);
   return merged;
 }
diff --git a/surfsense_desktop/src/modules/tray.ts b/surfsense_desktop/src/modules/tray.ts
index 1749145a10..f0221fe533 100644
--- a/surfsense_desktop/src/modules/tray.ts
+++ b/surfsense_desktop/src/modules/tray.ts
@@ -1,46 +1,56 @@
-import { app, globalShortcut, Menu, nativeImage, Tray } from 'electron';
+import { app, globalShortcut, Menu, nativeImage, Tray, type NativeImage } from 'electron';
 import path from 'path';
-import { getMainWindow, createMainWindow } from './window';
+import { runGeneralAssistShortcut } from './general-assist';
+import { runScreenshotAssistShortcut } from './screen-capture';
+import { showMainWindow } from './window';
 import { getShortcuts } from './shortcuts';
+import { trackEvent } from './analytics';
 
 let tray: Tray | null = null;
-let currentShortcut: string | null = null;
+let registeredGeneralAssist: string | null = null;
+let registeredScreenshotAssist: string | null = null;
 
-function getTrayIcon(): nativeImage {
-  const iconName = process.platform === 'win32' ? 'icon.ico' : 'icon.png';
+function getTrayIcon(): NativeImage {
+  const iconName =
+    process.platform === 'darwin'
+      ? 'iconTemplate.png'
+      : process.platform === 'win32'
+        ? 'icon.ico'
+        : 'icon.png';
   const iconPath = app.isPackaged
     ? path.join(process.resourcesPath, 'assets', iconName)
     : path.join(__dirname, '..', 'assets', iconName);
   const img = nativeImage.createFromPath(iconPath);
-  return img.resize({ width: 16, height: 16 });
-}
-
-function showMainWindow(): void {
-  let win = getMainWindow();
-  if (!win || win.isDestroyed()) {
-    win = createMainWindow('/dashboard');
-  } else {
-    win.show();
-    win.focus();
+  if (process.platform === 'darwin') {
+    img.setTemplateImage(true);
+    return img;
   }
+  return img.resize({ width: 16, height: 16 });
 }
 
-function registerShortcut(accelerator: string): void {
-  if (currentShortcut) {
-    globalShortcut.unregister(currentShortcut);
-    currentShortcut = null;
+function registerOne(
+  previous: string | null,
+  accelerator: string,
+  onFire: () => void | Promise<void>,
+  label: string
+): string | null {
+  if (previous) {
+    globalShortcut.unregister(previous);
   }
-  if (!accelerator) return;
+  if (!accelerator) return null;
   try {
-    const ok = globalShortcut.register(accelerator, showMainWindow);
+    const ok = globalShortcut.register(accelerator, () => {
+      void Promise.resolve(onFire());
+    });
     if (ok) {
-      currentShortcut = accelerator;
-    } else {
-      console.warn(`[tray] Failed to register General Assist shortcut: ${accelerator}`);
+      console.log(`[hotkeys] Register ${label} ${accelerator}: OK`);
+      return accelerator;
     }
+    console.warn(`[hotkeys] Register ${label} ${accelerator}: FAILED (OS or another app may own this chord)`);
   } catch (err) {
-    console.error(`[tray] Error registering General Assist shortcut:`, err);
+    console.error(`[tray] Error registering ${label} shortcut:`, err);
   }
+  return null;
 }
 
 export async function createTray(): Promise<void> {
@@ -50,27 +60,63 @@ export async function createTray(): Promise<void> {
   tray.setToolTip('SurfSense');
 
   const contextMenu = Menu.buildFromTemplate([
-    { label: 'Open SurfSense', click: showMainWindow },
+    { label: 'Open SurfSense', click: () => showMainWindow('tray_menu') },
     { type: 'separator' },
-    { label: 'Quit', click: () => { app.exit(0); } },
+    {
+      label: 'Quit',
+      click: () => {
+        trackEvent('desktop_tray_quit_clicked');
+        app.exit(0);
+      },
+    },
   ]);
 
   tray.setContextMenu(contextMenu);
-  tray.on('double-click', showMainWindow);
+  tray.on('double-click', () => showMainWindow('tray_click'));
 
   const shortcuts = await getShortcuts();
-  registerShortcut(shortcuts.generalAssist);
+  registeredGeneralAssist = registerOne(
+    null,
+    shortcuts.generalAssist,
+    runGeneralAssistShortcut,
+    'General Assist'
+  );
+  registeredScreenshotAssist = registerOne(
+    null,
+    shortcuts.screenshotAssist,
+    runScreenshotAssistShortcut,
+    'Screenshot Assist'
+  );
 }
 
 export async function reregisterGeneralAssist(): Promise<void> {
   const shortcuts = await getShortcuts();
-  registerShortcut(shortcuts.generalAssist);
+  registeredGeneralAssist = registerOne(
+    registeredGeneralAssist,
+    shortcuts.generalAssist,
+    runGeneralAssistShortcut,
+    'General Assist'
+  );
+}
+
+export async function reregisterScreenshotAssist(): Promise<void> {
+  const shortcuts = await getShortcuts();
+  registeredScreenshotAssist = registerOne(
+    registeredScreenshotAssist,
+    shortcuts.screenshotAssist,
+    runScreenshotAssistShortcut,
+    'Screenshot Assist'
+  );
 }
 
 export function destroyTray(): void {
-  if (currentShortcut) {
-    globalShortcut.unregister(currentShortcut);
-    currentShortcut = null;
+  if (registeredGeneralAssist) {
+    globalShortcut.unregister(registeredGeneralAssist);
+    registeredGeneralAssist = null;
+  }
+  if (registeredScreenshotAssist) {
+    globalShortcut.unregister(registeredScreenshotAssist);
+    registeredScreenshotAssist = null;
   }
   tray?.destroy();
   tray = null;
diff --git a/surfsense_desktop/src/modules/window.ts b/surfsense_desktop/src/modules/window.ts
index 9cd2165017..5317005d59 100644
--- a/surfsense_desktop/src/modules/window.ts
+++ b/surfsense_desktop/src/modules/window.ts
@@ -1,18 +1,27 @@
 import { app, BrowserWindow, shell, session } from 'electron';
 import path from 'path';
+import { trackEvent } from './analytics';
 import { showErrorDialog } from './errors';
 import { getServerPort } from './server';
 import { setActiveSearchSpaceId } from './active-search-space';
 
 const isDev = !app.isPackaged;
 const HOSTED_FRONTEND_URL = process.env.HOSTED_FRONTEND_URL as string;
+const isMac = process.platform === 'darwin';
 
 let mainWindow: BrowserWindow | null = null;
+let isQuitting = false;
 
 export function getMainWindow(): BrowserWindow | null {
   return mainWindow;
 }
 
+// Called from main.ts on `before-quit` so the close-to-tray handler knows
+// to actually let the window die instead of hiding it.
+export function markQuitting(): void {
+  isQuitting = true;
+}
+
 export function createMainWindow(initialPath = '/dashboard'): BrowserWindow {
   mainWindow = new BrowserWindow({
     width: 1280,
@@ -27,7 +36,12 @@ export function createMainWindow(initialPath = '/dashboard'): BrowserWindow {
       webviewTag: false,
     },
     show: false,
-    titleBarStyle: 'hiddenInset',
+    ...(isMac
+      ? {
+          titleBarStyle: 'hidden' as const,
+          trafficLightPosition: { x: 12, y: 10 },
+        }
+      : {}),
   });
 
   mainWindow.once('ready-to-show', () => {
@@ -70,9 +84,31 @@ export function createMainWindow(initialPath = '/dashboard'): BrowserWindow {
     mainWindow.webContents.openDevTools();
   }
 
+  // Hide-to-tray on close (don't actually destroy the window unless the
+  // user really is quitting). Applies to every instance — including the one
+  // created lazily after a launch-at-login boot.
+  mainWindow.on('close', (e) => {
+    if (!isQuitting && mainWindow) {
+      e.preventDefault();
+      mainWindow.hide();
+    }
+  });
+
   mainWindow.on('closed', () => {
     mainWindow = null;
   });
 
   return mainWindow;
 }
+
+export function showMainWindow(source: 'tray_click' | 'tray_menu' | 'shortcut' = 'tray_click'): void {
+  const existing = getMainWindow();
+  const reopened = !existing || existing.isDestroyed();
+  if (reopened) {
+    createMainWindow('/dashboard');
+  } else {
+    existing.show();
+    existing.focus();
+  }
+  trackEvent('desktop_main_window_shown', { source, reopened });
+}
diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts
index e3d12c5e63..7d72e9da55 100644
--- a/surfsense_desktop/src/preload.ts
+++ b/surfsense_desktop/src/preload.ts
@@ -17,6 +17,13 @@ contextBridge.exposeInMainWorld('electronAPI', {
       ipcRenderer.removeListener(IPC_CHANNELS.DEEP_LINK, listener);
     };
   },
+  onChatScreenCapture: (callback: (dataUrl: string) => void) => {
+    const listener = (_event: unknown, dataUrl: string) => callback(dataUrl);
+    ipcRenderer.on(IPC_CHANNELS.CHAT_SCREEN_CAPTURE, listener);
+    return () => {
+      ipcRenderer.removeListener(IPC_CHANNELS.CHAT_SCREEN_CAPTURE, listener);
+    };
+  },
   getQuickAskText: () => ipcRenderer.invoke(IPC_CHANNELS.QUICK_ASK_TEXT),
   setQuickAskMode: (mode: string) => ipcRenderer.invoke(IPC_CHANNELS.SET_QUICK_ASK_MODE, mode),
   getQuickAskMode: () => ipcRenderer.invoke(IPC_CHANNELS.GET_QUICK_ASK_MODE),
@@ -25,20 +32,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
   getPermissionsStatus: () => ipcRenderer.invoke(IPC_CHANNELS.GET_PERMISSIONS_STATUS),
   requestAccessibility: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_ACCESSIBILITY),
   requestScreenRecording: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_SCREEN_RECORDING),
+  captureFullScreen: () => ipcRenderer.invoke(IPC_CHANNELS.CAPTURE_FULL_SCREEN),
   restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
-  // Autocomplete
-  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => {
-    const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => callback(data);
-    ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
-    return () => {
-      ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
-    };
-  },
-  acceptSuggestion: (text: string) => ipcRenderer.invoke(IPC_CHANNELS.ACCEPT_SUGGESTION, text),
-  dismissSuggestion: () => ipcRenderer.invoke(IPC_CHANNELS.DISMISS_SUGGESTION),
-  setAutocompleteEnabled: (enabled: boolean) => ipcRenderer.invoke(IPC_CHANNELS.SET_AUTOCOMPLETE_ENABLED, enabled),
-  getAutocompleteEnabled: () => ipcRenderer.invoke(IPC_CHANNELS.GET_AUTOCOMPLETE_ENABLED),
-
   // Folder sync
   selectFolder: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_SELECT_FOLDER),
   addWatchedFolder: (config: any) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_ADD_FOLDER, config),
@@ -71,6 +66,10 @@ contextBridge.exposeInMainWorld('electronAPI', {
   // Browse files via native dialog
   browseFiles: () => ipcRenderer.invoke(IPC_CHANNELS.BROWSE_FILES),
   readLocalFiles: (paths: string[]) => ipcRenderer.invoke(IPC_CHANNELS.READ_LOCAL_FILES, paths),
+  readAgentLocalFileText: (virtualPath: string, searchSpaceId?: number | null) =>
+    ipcRenderer.invoke(IPC_CHANNELS.READ_AGENT_LOCAL_FILE_TEXT, virtualPath, searchSpaceId),
+  writeAgentLocalFileText: (virtualPath: string, content: string, searchSpaceId?: number | null) =>
+    ipcRenderer.invoke(IPC_CHANNELS.WRITE_AGENT_LOCAL_FILE_TEXT, virtualPath, content, searchSpaceId),
 
   // Auth token sync across windows
   getAuthTokens: () => ipcRenderer.invoke(IPC_CHANNELS.GET_AUTH_TOKENS),
@@ -82,8 +81,72 @@ contextBridge.exposeInMainWorld('electronAPI', {
   setShortcuts: (config: Record<string, string>) =>
     ipcRenderer.invoke(IPC_CHANNELS.SET_SHORTCUTS, config),
 
+  // Launch on system startup
+  getAutoLaunch: () => ipcRenderer.invoke(IPC_CHANNELS.GET_AUTO_LAUNCH),
+  setAutoLaunch: (enabled: boolean, openAsHidden?: boolean) =>
+    ipcRenderer.invoke(IPC_CHANNELS.SET_AUTO_LAUNCH, { enabled, openAsHidden }),
+
   // Active search space
   getActiveSearchSpace: () => ipcRenderer.invoke(IPC_CHANNELS.GET_ACTIVE_SEARCH_SPACE),
   setActiveSearchSpace: (id: string) =>
     ipcRenderer.invoke(IPC_CHANNELS.SET_ACTIVE_SEARCH_SPACE, id),
+
+  // Analytics bridge — lets posthog-js running inside the Next.js renderer
+  // mirror identify/reset/capture into the Electron main-process PostHog
+  // client so desktop-only events are attributed to the logged-in user.
+  analyticsIdentify: (userId: string, properties?: Record<string, unknown>) =>
+    ipcRenderer.invoke(IPC_CHANNELS.ANALYTICS_IDENTIFY, { userId, properties }),
+  analyticsReset: () => ipcRenderer.invoke(IPC_CHANNELS.ANALYTICS_RESET),
+  analyticsCapture: (event: string, properties?: Record<string, unknown>) =>
+    ipcRenderer.invoke(IPC_CHANNELS.ANALYTICS_CAPTURE, { event, properties }),
+  getAnalyticsContext: () => ipcRenderer.invoke(IPC_CHANNELS.ANALYTICS_GET_CONTEXT),
+  // Agent filesystem mode
+  getAgentFilesystemSettings: (searchSpaceId?: number | null) =>
+    ipcRenderer.invoke(IPC_CHANNELS.AGENT_FILESYSTEM_GET_SETTINGS, searchSpaceId),
+  getAgentFilesystemMounts: (searchSpaceId?: number | null) =>
+    ipcRenderer.invoke(IPC_CHANNELS.AGENT_FILESYSTEM_GET_MOUNTS, searchSpaceId),
+  listAgentFilesystemFiles: (options: {
+    rootPath: string;
+    searchSpaceId?: number | null;
+    excludePatterns?: string[] | null;
+    fileExtensions?: string[] | null;
+  }) => ipcRenderer.invoke(IPC_CHANNELS.AGENT_FILESYSTEM_LIST_FILES, options),
+  startAgentFilesystemTreeWatch: (options: {
+    searchSpaceId?: number | null;
+    rootPaths: string[];
+    excludePatterns?: string[] | null;
+    fileExtensions?: string[] | null;
+  }) => ipcRenderer.invoke(IPC_CHANNELS.AGENT_FILESYSTEM_TREE_WATCH_START, options),
+  stopAgentFilesystemTreeWatch: (searchSpaceId?: number | null) =>
+    ipcRenderer.invoke(IPC_CHANNELS.AGENT_FILESYSTEM_TREE_WATCH_STOP, searchSpaceId),
+  onAgentFilesystemTreeDirty: (
+    callback: (data: {
+      searchSpaceId: number | null;
+      reason: 'watcher_event' | 'safety_poll';
+      rootPath: string;
+      changedPath: string | null;
+      timestamp: number;
+    }) => void
+  ) => {
+    const listener = (
+      _event: unknown,
+      data: {
+        searchSpaceId: number | null;
+        reason: 'watcher_event' | 'safety_poll';
+        rootPath: string;
+        changedPath: string | null;
+        timestamp: number;
+      }
+    ) => callback(data);
+    ipcRenderer.on(IPC_CHANNELS.AGENT_FILESYSTEM_TREE_DIRTY, listener);
+    return () => {
+      ipcRenderer.removeListener(IPC_CHANNELS.AGENT_FILESYSTEM_TREE_DIRTY, listener);
+    };
+  },
+  setAgentFilesystemSettings: (settings: {
+    mode?: "cloud" | "desktop_local_folder";
+    localRootPaths?: string[] | null;
+  }, searchSpaceId?: number | null) =>
+    ipcRenderer.invoke(IPC_CHANNELS.AGENT_FILESYSTEM_SET_SETTINGS, { searchSpaceId, settings }),
+  pickAgentFilesystemRoot: () => ipcRenderer.invoke(IPC_CHANNELS.AGENT_FILESYSTEM_PICK_ROOT),
 });
diff --git a/surfsense_evals/.env.example b/surfsense_evals/.env.example
new file mode 100644
index 0000000000..aa716063c7
--- /dev/null
+++ b/surfsense_evals/.env.example
@@ -0,0 +1,83 @@
+# surfsense_evals — environment template.
+#
+# Copy this file to `.env` (in the surfsense_evals/ project root or your
+# CWD) and fill in the values. `python-dotenv` loads it automatically
+# the first time `core.config` is imported, so every CLI subcommand
+# (`setup`, `ingest`, `run`, `report`, `teardown`, `models list`, …)
+# will pick the values up.
+#
+#   cp .env.example .env
+#   # then edit .env with your values
+#
+# `.env` is gitignored — never commit real secrets.
+
+# ---------------------------------------------------------------------------
+# 1. Backend target — REQUIRED (default works for a local dev backend)
+# ---------------------------------------------------------------------------
+SURFSENSE_API_BASE=http://localhost:8000
+
+# ---------------------------------------------------------------------------
+# 2. OpenRouter — REQUIRED for any `run` invocation
+# ---------------------------------------------------------------------------
+# The `native_pdf` arm calls OpenRouter directly; the `surfsense` arm
+# routes through SurfSense which uses the same key under the hood.
+OPENROUTER_API_KEY=sk-or-...
+
+# Override only if you proxy OpenRouter through a private gateway:
+# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
+
+# Multimodal benchmarks (medxpertqa, mmlongbench) require a vision-capable
+# slug. Recommended (verify in your catalog with `models list --grep ...`):
+#   anthropic/claude-sonnet-4.5    (default recommendation)
+#   anthropic/claude-opus-4.7      (strongest)
+#   openai/gpt-5                   (top-tier vision)
+#   google/gemini-2.5-pro          (1M-token context, best for long PDFs)
+# DO NOT use openai/gpt-5.4-mini for image-bearing benchmarks — it's
+# text-only on PDF content and the runner emits a warning if pinned.
+
+# ---------------------------------------------------------------------------
+# 3. Auth — pick EXACTLY ONE of the two modes below
+# ---------------------------------------------------------------------------
+
+# --- Mode A: LOCAL (backend started with AUTH_TYPE=LOCAL)
+# The harness POSTs these to /auth/jwt/login automatically.
+# SURFSENSE_USER_EMAIL=you@example.com
+# SURFSENSE_USER_PASSWORD=...
+
+# --- Mode B: GOOGLE OAuth (or any pre-issued JWT)
+# Open the SurfSense web UI in your browser, log in via Google, then in
+# DevTools → Application → Local Storage copy:
+#   surfsense_bearer_token  → SURFSENSE_JWT
+#   surfsense_refresh_token → SURFSENSE_REFRESH_TOKEN  (optional, enables
+#                                                       auto-refresh on 401)
+# SURFSENSE_JWT=eyJhbGciOi...
+# SURFSENSE_REFRESH_TOKEN=eyJhbGciOi...
+
+# ---------------------------------------------------------------------------
+# 4. Filesystem paths — OPTIONAL (defaults below)
+# ---------------------------------------------------------------------------
+# Where datasets, rendered PDFs, ingestion id maps, run outputs, and
+# state.json live. Default: <surfsense_evals>/data/
+# EVAL_DATA_DIR=./data
+
+# Where generated reports (summary.md / summary.json) get written.
+# Default: <surfsense_evals>/reports/
+# EVAL_REPORTS_DIR=./reports
+
+# ---------------------------------------------------------------------------
+# 5. Parser SDKs — REQUIRED for the multimodal_doc / parser_compare suite
+# ---------------------------------------------------------------------------
+# parser_compare calls Azure Document Intelligence and LlamaParse SDKs
+# directly from the eval harness so each (basic / premium) extraction
+# is a clean A/B test independent of the SurfSense backend's ETL routing.
+#
+# Azure Document Intelligence — used for the `azure_basic_lc` (prebuilt-read)
+# and `azure_premium_lc` (prebuilt-layout) arms. Get an endpoint + key from
+# https://portal.azure.com (Document Intelligence resource, F0 / S0 tier).
+# AZURE_DI_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com/
+# AZURE_DI_KEY=<your-32-char-key>
+#
+# LlamaCloud (LlamaParse) — used for `llamacloud_basic_lc` (parse_page_with_llm)
+# and `llamacloud_premium_lc` (parse_page_with_agent). Get a key from
+# https://cloud.llamaindex.ai/api-key.
+# LLAMA_CLOUD_API_KEY=llx-...
diff --git a/surfsense_evals/.gitignore b/surfsense_evals/.gitignore
new file mode 100644
index 0000000000..fd6e505c2e
--- /dev/null
+++ b/surfsense_evals/.gitignore
@@ -0,0 +1,35 @@
+# Python bytecode + caches
+__pycache__/
+*.py[cod]
+*.pyo
+
+# Editable-install / build artifacts
+*.egg-info/
+build/
+dist/
+.eggs/
+
+# Virtual envs (uv venv default + common alternates)
+.venv/
+venv/
+env/
+
+# Tooling caches
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.coverage
+.coverage.*
+htmlcov/
+
+# Local secrets — keep `.env.example` tracked, never the real `.env`.
+.env
+.env.local
+.env.*.local
+!.env.example
+
+# Run / debug logs — keep ephemeral, structured artifacts in
+# `data/.../<run_id>/` are the citation surface, not these.
+*.log
+logs_*.txt
+retry_run.log
diff --git a/surfsense_evals/README.md b/surfsense_evals/README.md
new file mode 100644
index 0000000000..c6314af805
--- /dev/null
+++ b/surfsense_evals/README.md
@@ -0,0 +1,228 @@
+# SurfSense Evals
+
+Domain-agnostic eval harness for SurfSense. Each benchmark is a Python subpackage under `suites/<domain>/<benchmark>/` that self-registers with the CLI; `core/` is the shared infrastructure (HTTP clients, arms, parsers, metrics, report writer, registry). The harness talks to SurfSense over HTTP only — it does **not** import any backend Python module — so it ships in its own venv and never bloats the FastAPI runtime image.
+
+## Benchmarks
+
+| Benchmark                       | Shape                                            | Vision required? | Default ingest             |
+|---------------------------------|--------------------------------------------------|------------------|----------------------------|
+| `medical/medxpertqa` (headline) | Native PDF vs SurfSense head-to-head, MCQ        | yes              | `vision=on, mode=basic`    |
+| `medical/mirage`                | SurfSense single-arm, MCQ                        | no               | `vision=off, mode=basic`   |
+| `medical/cure`                  | SurfSense single-arm retrieval (Recall/MRR/nDCG) | no               | `vision=off, mode=basic`   |
+| `multimodal_doc/mmlongbench`    | Native PDF vs SurfSense head-to-head, open-ended | yes              | `vision=on, mode=basic`    |
+
+Future domains (`legal/`, `finance/`, `code/`, `scientific/`) drop into `suites/` without touching `core/` or the CLI.
+
+## Install + auth
+
+```bash
+uv pip install -e ./surfsense_evals
+cp surfsense_evals/.env.example surfsense_evals/.env
+# Edit .env: SURFSENSE_API_BASE, OPENROUTER_API_KEY, and ONE of:
+#   LOCAL  → SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD
+#   GOOGLE → SURFSENSE_JWT (+ optional SURFSENSE_REFRESH_TOKEN)
+#            (lift both from browser localStorage after a normal Google login)
+```
+
+## Step-by-step: run all four benchmarks
+
+The medical and multimodal_doc suites each get their own SearchSpace and pinned model, so they're independent — run them in any order. Both head-to-head benchmarks (`medxpertqa`, `mmlongbench`) require a **vision-capable** OpenRouter slug; pinning a text-only one (e.g. `openai/gpt-5.4-mini`) silently drops images and the runner emits a warning.
+
+Recommended vision slugs (use `models list --grep <name>` to confirm one): `anthropic/claude-sonnet-4.5` (balanced cost), `anthropic/claude-opus-4.7` (strongest reasoning), `openai/gpt-5` (top-tier vision), `google/gemini-2.5-pro` (best for long PDFs, 1M-token context).
+
+```bash
+# 0. (optional) discover what's registered
+python -m surfsense_evals suites list
+python -m surfsense_evals benchmarks list
+
+# 1. MEDICAL SUITE — one SearchSpace, three benchmarks
+python -m surfsense_evals setup --suite medical --provider-model anthropic/claude-sonnet-4.5
+
+#  1a. headline head-to-head: Native PDF (vision) vs SurfSense (vision RAG)
+#      Downloads dev+test JSONL + images.zip, renders one PDF per question
+#      (case + table + images + 5 options), uploads with use_vision_llm=True.
+python -m surfsense_evals ingest medical medxpertqa --split test
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+
+#  1b. MIRAGE — single-arm SurfSense MCQ accuracy
+#      (MMLU-Med / MedQA-US / MedMCQA / PubMedQA / BioASQ)
+python -m surfsense_evals ingest medical mirage
+python -m surfsense_evals run    medical mirage
+
+#  1c. CUREv1 — single-arm SurfSense retrieval (Recall@k / MRR / nDCG@10)
+python -m surfsense_evals ingest medical cure --lang en
+python -m surfsense_evals run    medical cure --lang en
+
+#  1d. write reports/medical/<UTC-ts>/summary.{md,json}
+python -m surfsense_evals report --suite medical
+
+# 2. MULTIMODAL_DOC SUITE — long PDFs with embedded images, charts, tables
+python -m surfsense_evals setup  --suite multimodal_doc --provider-model google/gemini-2.5-pro
+python -m surfsense_evals ingest multimodal_doc mmlongbench           # ~660MB, resumable
+python -m surfsense_evals run    multimodal_doc mmlongbench --concurrency 4
+python -m surfsense_evals report --suite multimodal_doc
+
+# 3. CLEANUP — soft-deletes the SearchSpaces; rendered PDFs stay cached
+python -m surfsense_evals teardown --suite medical
+python -m surfsense_evals teardown --suite multimodal_doc
+```
+
+## Asymmetric scenarios — the "vision-extract once, answer cheap" play
+
+The walkthrough above is `--scenario head-to-head` (default): both arms answer with the same vision-capable slug. SurfSense's actual architectural value-prop is that the **ingestion-time vision LLM and the runtime LLM are completely independent** — you can pay a vision LLM *once*, at ingest, to convert every embedded image into text (per-image OCR **and** semantic description, inlined where the image actually appears in the document — see [What `--use-vision-llm` produces](#what---use-vision-llm-produces) below). Then every query is served by a cheap text-only model that sees that extracted text natively. Two extra scenarios make this explicit:
+
+| `--scenario`       | Native arm answers with                | SurfSense arm answers with     | Question being measured                                                                  |
+|--------------------|----------------------------------------|--------------------------------|------------------------------------------------------------------------------------------|
+| `head-to-head`     | `--provider-model` (vision)            | `--provider-model` (vision)    | Pure RAG quality at parity. (Default.)                                                   |
+| `symmetric-cheap`  | `--provider-model` (cheap, text-only)  | `--provider-model` (same)      | Does pre-extracted image context let a non-vision LLM reason over image-heavy docs?      |
+| `cost-arbitrage`   | `--native-arm-model` (vision)          | `--provider-model` (cheap)     | How close does SurfSense get to a vision-native baseline at a fraction of per-query cost?|
+
+In all three modes the **ingest-time** vision LLM is set on the SearchSpace's `vision_llm_config_id` (auto-picked from the strongest registered global OpenRouter vision config — `claude-sonnet-4.5` > `claude-opus-4.7` > `gpt-5` > `gemini-2.5-pro`, override with `--vision-llm <slug>`). What changes is which slug the *answering* models hit per arm.
+
+### Ingest with vision, evaluate with a non-vision LLM (`symmetric-cheap`)
+
+This is the answer to *"does SurfSense give a non-vision LLM enough context to reason over image-heavy docs?"*. Both arms hit the same cheap text-only slug. The native arm is structurally blind to images (text-only LLM + raw PDFs). The SurfSense arm reads chunks that already contain the per-image OCR and visual descriptions, written there by the vision LLM at ingest time.
+
+```bash
+python -m surfsense_evals setup --suite medical \
+  --scenario symmetric-cheap \
+  --provider-model openai/gpt-5.4-mini
+  # vision LLM at ingest = auto-picked (claude-sonnet-4.5 by default)
+  # answer LLM for BOTH arms = openai/gpt-5.4-mini (text-only)
+
+python -m surfsense_evals ingest medical medxpertqa --split test  # vision=on by default
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+python -m surfsense_evals report --suite medical
+# Δ accuracy on image-required MCQs is the headline number; native arm
+# baseline is "what a text-only LLM gets without seeing the images".
+```
+
+### Cheap SurfSense vs vision-native baseline (`cost-arbitrage`)
+
+```bash
+python -m surfsense_evals setup --suite medical \
+  --scenario cost-arbitrage \
+  --provider-model openai/gpt-5.4-mini \
+  --native-arm-model anthropic/claude-sonnet-4.5
+  # vision LLM at ingest = auto-picked claude-sonnet-4.5
+  # native arm = sonnet (vision); SurfSense arm = gpt-5.4-mini (text-only)
+
+python -m surfsense_evals ingest medical medxpertqa --split test
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+python -m surfsense_evals report --suite medical
+# Report header reads:
+#   Scenario: cost-arbitrage — native arm answers with `anthropic/claude-sonnet-4.5`
+#   (vision); SurfSense answers with `openai/gpt-5.4-mini` over chunks vision-extracted
+#   at ingest by `anthropic/claude-sonnet-4.5`.
+```
+
+Notes:
+- `cost-arbitrage` requires both `--provider-model` (the cheap SurfSense slug) AND `--native-arm-model <vision slug>`.
+- `--vision-llm <slug>` is optional; if omitted the harness queries `GET /api/v1/global-vision-llm-configs` and auto-picks the strongest registered one. Pass `--no-vision-llm-setup` if you want to keep whatever vision config is already attached to the SearchSpace.
+- The runner's "looks text-only" warning is suppressed (or relabelled as informational) for `symmetric-cheap` so intentional asymmetry doesn't read as a misconfiguration.
+- All three scenario fields (`scenario`, `provider_model`, `native_arm_model`, `vision_provider_model`) are persisted to `state.json` and recorded in `run_artifact.extra` + the report header — no need to retrace what was set.
+
+## Per-benchmark useful flags
+
+`medical/medxpertqa` (`run`):
+- `--split {test,dev,all}` — pick a subset (default `test`)
+- `--task "Diagnosis"` / `--body-system "Cardiovascular"` — slice the report
+- `--require-images` — drop rare rows where every image filename failed to resolve
+- `--n 100` — quick smoke run
+- `--no-mentions` — let SurfSense retrieve unscoped ("did the @-mention matter?")
+
+`multimodal_doc/mmlongbench`:
+- `--max-docs N` (ingest) — cap downloads at the first N unique PDFs
+- `--format {str,int,float,list,none}` (run) — slice by answer format; `none` = the ~22% intentionally unanswerable hallucination probes
+- `--skip-unanswerable` (run) — drop unanswerable questions
+- `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs
+
+## Ingestion knobs (vision LLM, processing mode, summarize)
+
+The harness exposes `POST /api/v1/documents/fileupload`'s three knobs on every `ingest` subcommand:
+
+| Flag pair                                  | Effect                                                                                  |
+|--------------------------------------------|-----------------------------------------------------------------------------------------|
+| `--use-vision-llm` / `--no-vision-llm`     | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
+| `--processing-mode {basic,premium}`        | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
+| `--should-summarize` / `--no-summarize`    | Generate a per-document summary at ingest.                                              |
+
+The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.
+
+> The backend's `ETL_SERVICE` env var (`DOCLING` | `UNSTRUCTURED` | `LLAMACLOUD`) is **not** per-upload. Restart the backend with a different `ETL_SERVICE` and re-ingest to compare ETLs (route through `--processing-mode premium` if your backend uses that mode for the stronger ETL).
+
+### What `--use-vision-llm` produces
+
+When vision is on, the backend's ETL pipeline (`app/etl_pipeline/picture_describer.py`) does, **per embedded image** in the PDF:
+
+1. Extract the raw image bytes via `pypdf` (deduped by sha256, size-capped to match the vision LLM's per-image limit).
+2. **Per-image OCR** — re-feed the image as a standalone upload through the configured ETL service (Docling / Azure DI / LlamaCloud) with `vision_llm=None`, so the ETL's OCR engine extracts the literal text-in-image.
+3. **Visual description** — call the vision LLM on the image with a description-only prompt (it's explicitly told *not* to transcribe text — that's OCR's job). Steps 2 and 3 run in parallel per image.
+4. Splice a horizontal-rule-delimited section **at the image's original position** in the parser markdown (replacing Docling's `<!-- image -->` placeholder + caption, or the bare `Image: <name>` caption a stripped-image parser leaves behind):
+
+   ```markdown
+   ---
+
+   **Embedded image:** `MM-130-a.jpeg`
+
+   **OCR text:**
+   Slice 24 / 60
+   L  R
+
+   **Visual description:**
+
+   - Axial contrast-enhanced CT showing a large cystic mass in the left upper quadrant.
+   - Mass effect on the adjacent stomach; left kidney displaced inferiorly.
+
+   ---
+   ```
+
+This is what makes `--scenario symmetric-cheap` and `--scenario cost-arbitrage` work: a non-vision LLM reading SurfSense's chunks sees the image's text and semantic content as plain markdown, alongside the surrounding case text, in the same retrieved chunk. Without it the cheap LLM would have nothing extra to read.
+
+### A/B testing the same corpus with different settings
+
+SurfSense dedupes uploads by `(filename, search_space_id)` — **not** by content hash and **not** by ingestion settings. Re-uploading the same filename to the same SearchSpace with a different `--use-vision-llm` flag silently skips re-processing. Give each variant its own SearchSpace:
+
+```bash
+# Baseline arm (vision off)
+python -m surfsense_evals setup    --suite medical --provider-model anthropic/claude-sonnet-4.5
+python -m surfsense_evals ingest   medical medxpertqa --no-vision-llm
+python -m surfsense_evals run      medical medxpertqa --n 100
+python -m surfsense_evals teardown --suite medical
+
+# Vision arm (the benchmark default)
+python -m surfsense_evals setup    --suite medical --provider-model anthropic/claude-sonnet-4.5
+python -m surfsense_evals ingest   medical medxpertqa
+python -m surfsense_evals run      medical medxpertqa --n 100
+python -m surfsense_evals report   --suite medical
+```
+
+Both runs land in `data/medical/runs/<ts>/medxpertqa/` with their settings recorded; rendered PDFs stay cached under `data/medical/medxpertqa/pdfs/` so the second `ingest` is upload-only.
+
+## Environment variables
+
+- `SURFSENSE_API_BASE` (default `http://localhost:8000`)
+- `OPENROUTER_API_KEY` — required for the `native_pdf` arm and for `models list`
+- One of `SURFSENSE_USER_EMAIL` + `SURFSENSE_USER_PASSWORD` (LOCAL), **or** `SURFSENSE_JWT` (+ optional `SURFSENSE_REFRESH_TOKEN`) for GOOGLE/pre-issued JWT
+- `EVAL_DATA_DIR` (default `<project>/data`) — datasets, rendered PDFs, ingestion id maps, run outputs, `state.json`
+- `EVAL_REPORTS_DIR` (default `<project>/reports`)
+- `OPENROUTER_BASE_URL` (default `https://openrouter.ai/api/v1`) — only if you proxy OpenRouter
+
+## Adding a new domain suite
+
+1. Create `surfsense_evals/src/surfsense_evals/suites/<domain>/<benchmark>/` with `__init__.py`, `ingest.py`, `runner.py`, optional `prompt.py`.
+2. Implement a `Benchmark` subclass (see `core/registry.py`); compose with `core.clients.*`, `core.arms.*`, `core.parse.*`, `core.metrics.*`.
+3. Call `register(MyBenchmark())` at the bottom of `<benchmark>/__init__.py`. Auto-discovery picks it up; `setup --suite <domain>` and `ingest/run <domain> <benchmark>` work immediately.
+
+Each suite gets its own SearchSpace (`eval-<suite>-<UTC-ts>`), `state.json` slot, data dir, reports dir, and pinned LLM. Suites never share a SearchSpace.
+
+## Out of scope (follow-up PRs)
+
+- Docker service for `docker compose run evals run medical medxpertqa`.
+- Multi-model sweeps (one slug per `setup` for now; aggregate reports come later).
+- A long-context-stuffing arm (give the model the same retrieved chunks SurfSense saw).
+- LLM-judge grader for MMLongBench-Doc (paper uses GPT-4 as judge; we ship a deterministic rule-based grader).
+- MedXpertQA-MM accuracy by image modality — dataset doesn't tag modality directly; we slice by `medical_task` and `body_system`.
+- A `--slot <name>` flag that decouples the state-slot key from the benchmark registry's `suite` attribute, so parallel SearchSpaces with different ingestion settings can coexist on the same benchmark without `teardown` between A/B arms.
+
+See `c:/Users/91882/.cursor/plans/medical_rag_evals_(mirage_+_curev1)_e797a324.plan.md` for the full design rationale.
diff --git a/surfsense_evals/data/.gitignore b/surfsense_evals/data/.gitignore
new file mode 100644
index 0000000000..529223f581
--- /dev/null
+++ b/surfsense_evals/data/.gitignore
@@ -0,0 +1,22 @@
+# Default: don't track anything under data/ (large benchmarks, PDFs,
+# extracted markdown, ingestion caches, per-run artifacts can balloon).
+*
+
+# Always keep this gitignore file
+!.gitignore
+
+# Whitelist the artifacts the n=171 multimodal_doc / parser_compare blog
+# report (`reports/blog/multimodal_doc_parser_compare_n171_report.md`)
+# cites by path. These are the verifiable numbers source for the report.
+#
+# Path components have to be unblocked one level at a time because git
+# does not look into a directory whose parent is ignored.
+!multimodal_doc/
+!multimodal_doc/runs/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
diff --git a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
new file mode 100644
index 0000000000..d91e2e530e
--- /dev/null
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
@@ -0,0 +1,1026 @@
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of adults who rated Trump administration's ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nFrom the document:\n- Total sample size: 1,503 adults\n- 36% rated the ethical standards as \"poor\" (from the table showing \"% who say that the ethical standards of top Trump administration officials are...\")\n\nCalculation:\n1,503 \u00d7 0.36 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 6925, "output_tokens": 125, "cost_micros": 22650, "latency_ms": 6078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of adults who rated Trump administration's ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nFrom the document:\n- Total sample size: 1,503 adults (from the Methodology section on page 14)\n- According to the chart on page 2, 36% of the total rated the ethical standards as \"poor\"\n\nCalculation:\n1,503 \u00d7 0.36 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 9417, "output_tokens": 128, "cost_micros": 30171, "latency_ms": 6203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find how many adults rated Trump administration's ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nFrom the document, I can find:\n1. The total sample size: 1,503 adults (from the Methodology section)\n2. The percentage who rated ethical standards as \"poor\": 36% (from the table \"Most rate top Trump administration officials' ethics as 'not good' or 'poor'\")\n\nTo calculate the number of adults:\n1,503 \u00d7 0.36 = 541.08\n\nSince we need an integer and this represents people, I should round to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 7774, "output_tokens": 158, "cost_micros": 25692, "latency_ms": 6405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find how many adults rated Trump's government ethical standards as poor.\n\nFrom the document:\n- The survey was conducted April 25-May 1, 2018 among 1,503 adults (from the Methodology section on page 14)\n- 36% rated the ethical standards of top Trump administration officials as \"poor\" (from the table on page 2)\n\nCalculation:\n1,503 adults \u00d7 36% = 1,503 \u00d7 0.36 = 541.08\n\nSince we need to report the number of adults, I'll round to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 9517, "output_tokens": 145, "cost_micros": 30726, "latency_ms": 5562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the answer to your question about the survey conducted April 25 - May 1, 2018.Let me search for the specific data about ethical standards ratings. I can see the document mentions ethical standards. Let me look for the detailed breakdown.I can see from the grep results that there's a table showing ethical standards ratings. Let me read the specific section more carefully:I can see a table showing percentages for ethical standards of Trump administration officials. However, I need to understand the table structure better. The table shows:\n- Total: 58, 36, 9, 39\n- These appear to be percentages for different categories\n\nLet me look at the context more carefully to understand what \"58\" represents:Now I understand the table structure. Looking at the figure:\n- The categories are listed as: Poor, Not good, Good, Excellent\n- For \"Total\" (all adults): The values appear to be 58, 36, 9, 39\n\nHowever, the text says \"58% rate them as not good or poor\" which suggests 58% is the combined total of \"not good\" and \"poor\". Let me look more carefully at how to read this table. Looking at lines 248-249: \"39% say the administration's ethical standards are excellent or good, while 58% rate them as not good or poor.\"\n\nThis indicates:\n- 39% = Excellent + Good (combined)\n- 58% = Not good + Poor (combined)\n\nLooking at the table values for Total: 58, 36, 9, 39\n\nThe layout suggests:\n- Row 1 (Poor): 58\n- Row 2 (Not good): 36  \n- Row 3 (Good): 9\n- Row 4 (Excellent): 39\n\nBut this doesn't make sense because 58 + 36 would be 94%, not matching the text's 58% for \"not good or poor\" combined.\n\nLet me reconsider the table structure. Perhaps it's arranged differently - the numbers might be in columns rather than associating each number with the label above it. Let me look for more context about the survey sample size:The survey was conducted among 1,503 adults. Now, looking back at the text on line 248: \"58% rate them as not good or poor\"\n\nIf 58% of 1,503 adults rated Trump's ethical standards as \"not good or poor\", then:\n58% \u00d7 1,503 = 0.58 \u00d7 1,503 = 871.74 \u2248 872 adults\n\nBut the question specifically asks for those who rated it as \"poor\", not \"not good or poor\" combined.\n\nLet me re-examine the table structure. The table shows categories and then rows for Total, Rep/Lean Rep, and Dem/Lean Dem. Looking at the numbers again:\n\nFor the \"Total\" row, I see: 58, 36, 9, 39\n\nGiven that the text says 58% rated as \"not good or poor\" combined, and the categories listed are Poor, Not good, Good, Excellent - perhaps the table is showing these in a different order or the numbers represent something else.\n\nActually, looking at line 289-295 more carefully, let me reconsider: perhaps the numbers don't align sequentially with the labels above. The figure might have a more complex layout. Let me look at the Republican row: 22, 7, 19, 75. And Democrat row: 86, 61, 2, 12.\n\nFor Republicans: The text (line 257) says \"75% giving the administration high marks\" - this matches the \"75\" in the data.\nFor Democrats: The text (line 258) says \"86% of Democrats rating its ethical standards negatively\" - this matches the \"86\" in the data.\n\nSo for the Total row showing 58, 36, 9, 39:\n- 58 + 36 = 94 (doesn't match the 58% mentioned)\n- 36 + 58 = 94 \n- 9 + 39 = 48 (doesn't match the 39% for excellent/good)\n\nWait - perhaps I'm misreading the structure. Let me think about this differently. The text says:\n- 39% say excellent or good\n- 58% say not good or poor\n\nThese add to 97%, with 3% presumably \"don't know\" (which the note says is not shown).\n\nLooking at the table structure, maybe it's:\n- Poor: 36\n- Not good: 22 (so 36 + 22 = 58% for \"poor or not good\")\n- Good: ? \n- Excellent: ?\n\nActually, rereading the visual structure between lines 273-315, the checkboxes (\u2610) appear before each category label, and then the numbers follow. Let me parse this more carefully as a table where columns might represent the categories.\n\nLooking at the pattern, it seems like the table might have columns for each rating category. If the first number in each row represents \"Poor\":\n- Total: 58 (but this would be just \"Poor\", not \"Poor + Not good\")\n\nBut the text clearly states 58% is the combined \"not good or poor\". So if:\n- Poor alone = X%\n- Not good alone = Y%  \n- X + Y = 58%\n\nAnd looking at potential individual values, perhaps:\n- Poor = 36%\n- Not good = 22%\n- 36 + 22 = 58% \u2713\n\nThis would make sense! So in the Total row (58, 36, 9, 39), these might be arranged as:\n- Combined negative: 58%\n- Poor: 36%\n- Good: 9%\n- Combined positive: 39% (should be Good 9% + Excellent, but listed as 39%)\n\nWait, that still doesn't work cleanly. Let me look for another table or clarification in the document:Perfect! I found a more detailed table starting at line 875. This table clearly shows the structure with labels on lines 880-885:\n- Poor\n- Not good\n- Good\n- Excellent\n\nAnd then for \"Total\" (lines 890-898):\n- 58\n- 36\n- 9\n- 39\n\nBut wait - this arrangement still seems odd. Let me look at the actual layout more carefully. Lines 890-898 show:\n```\nTotal\n58\n36\n9\n39\n```\n\nLooking at the structure and comparing with the text, I think the table columns might be:\n- Column 1: Row label (Total, Rep/Lean Rep, etc.)\n- Column 2: Combined \"Not good or Poor\" (58 for Total)\n- Column 3: Poor specifically (36 for Total)\n- Column 4: Good specifically (9 for Total)\n- Column 5: Combined \"Good or Excellent\" (39 for Total)\n\nThis would make sense because:\n- 58% = \"not good or poor\" combined (matches the text on line 248)\n- 36% = \"poor\" alone\n- 9% = \"good\" alone\n- 39% = \"good or excellent\" combined (matches the text on line 247)\n\nSo the answer to \"how many adults rated Trump's government ethical standards as poor\" would be:\n36% of 1,503 adults = 0.36 \u00d7 1,503 = 541.08 \u2248 541 adults\n\nActually, let me double-check by reading the column headers more carefully in this second table:Looking at the table header (lines 880-885), the categories are listed as:\n- Poor\n- Not good\n- Good  \n- Excellent\n\nAnd for Total (lines 892-898): 58, 36, 9, 39\n\nGiven the table structure and that these are percentages, I now believe the columns represent:\n1. First column after \"Total\" = one rating category\n2. Second column = another rating category\n3. Third column = another rating category\n4. Fourth column = another rating category\n\nBut there are only 4 numbers (58, 36, 9, 39) for 4 categories (Poor, Not good, Good, Excellent).\n\nSo the mapping should be:\n- Poor: 58%? or 36%?\n- Not good: 36%? or 9%?\n- Good: 9%? or 39%?\n- Excellent: 39%? or something else?\n\nLet me check against the text constraints:\n- Text says \"39% say excellent or good\" (line 247-248) \u2192 Good + Excellent = 39%\n- Text says \"58% rate them as not good or poor\" (line 248-249) \u2192 Not good + Poor = 58%\n\nIf the four values are 58, 36, 9, 39:\n- 58 + 36 + 9 + 39 = 142% (impossible for percentages)\n\nSo these must not all be individual category percentages. Some must be combined values. Given the text:\n- 58% = Not good + Poor (combined)\n- 39% = Good + Excellent (combined)\n- 9% = one individual category\n- 36% = one individual category\n\nSince 58% + 39% = 97% (with 3% don't know), and we have two individual values (36 and 9), these must be subcategories within the combined values:\n- 36% = Poor (part of the 58% \"not good or poor\")\n- Therefore Not good = 58% - 36% = 22%\n- 9% = Good (part of the 39% \"good or excellent\")  \n- Therefore Excellent = 39% - 9% = 30%\n\nThis makes sense! So the table is showing:\n- Column 1: Not good or Poor combined (58%)\n- Column 2: Poor alone (36%)\n- Column 3: Good alone (9%)\n- Column 4: Good or Excellent combined (39%)\n\nTherefore, 36% of adults rated Trump's ethical standards as \"poor\".\n\n36% of 1,503 adults = 0.36 \u00d7 1,503 = 541.08\n\nRounding to the nearest integer: 541 adults\n\nAnswer: 541", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 94875, "error": null, "extra": {"thread_id": 1904, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c7e2e86436614d5c96d6bd9dd3a561c5", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 977, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the domains where adults are \"very confident\" and \"not at all confident\" in Donald Trump, and identify which has the highest percentage for each category.\n\nLooking at the table on page 4 titled \"Public's confidence in Trump to handle several policy areas mixed,\" I can see the breakdown of confidence levels (Very, Somewhat, Not too, Not at all) for various policy areas.\n\nFor \"Very confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 24%\n- Make good decisions about economic policy: 22%\n- Use military force wisely: 19%\n- Make good appointments to the federal courts: 21%\n- Manage the executive branch effectively: 21%\n- Make wise decisions about immigration policy: 16%\n- Handle an international crisis: 18%\n- Work effectively with Congress: 30%\n\nThe highest \"Very confident\" is \"Work effectively with Congress\" at 30%.\n\nFor \"Not at all confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 26%\n- Make good decisions about economic policy: 17%\n- Use military force wisely: 32%\n- Make good appointments to the federal courts: 32%\n- Manage the executive branch effectively: 34%\n- Make wise decisions about immigration policy: 39%\n- Handle an international crisis: 35%\n- Work effectively with Congress: 33%\n\nThe highest \"Not at all confident\" is \"Make wise decisions about immigration policy\" at 39%.\n\nIn alphabetical order:\n1. Make wise decisions about immigration policy\n2. Work effectively with Congress\n\nAnswer: Make wise decisions about immigration policy, Work effectively with Congress", "citations": [], "input_tokens": 6973, "output_tokens": 374, "cost_micros": 26529, "latency_ms": 9500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, work effectively with congress", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the table on page 4 titled \"Public's confidence in Trump to handle several policy areas mixed\", I need to find:\n\n1. The domain with the highest \"Very\" confident percentage\n2. The domain with the highest \"Not at all\" confident percentage\n\nFrom the table data:\n\n**Very confident (highest):**\n- Negotiate favorable trade agreements: 30%\n- Make good decisions about economic policy: 31% (HIGHEST)\n\n**Not at all confident (highest):**\n- Make wise decisions about immigration policy: 39% (HIGHEST)\n- Handle an international crisis: 35%\n- Manage the executive branch effectively: 34%\n\nThe two domains are:\n1. \"Make good decisions about economic policy\" (highest very confident at 31%)\n2. \"Make wise decisions about immigration policy\" (highest not at all confident at 39%)\n\nIn alphabetical order:\n\nAnswer: Make good decisions about economic policy, Make wise decisions about immigration policy", "citations": [], "input_tokens": 9465, "output_tokens": 216, "cost_micros": 31635, "latency_ms": 7578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "make good decisions about economic policy, make wise decisions about immigration policy", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the table \"Public's confidence in Trump to handle several policy areas mixed\" on page 4 of the document, I need to identify:\n\n1. The domain with the highest \"Very\" confident percentage\n2. The domain with the highest \"Not at all\" confident percentage\n\nFrom the table:\n\n**Very confident percentages:**\n- Negotiate favorable trade agreements: 24%\n- Make good decisions about economic policy: 22%\n- Use military force wisely: 20%\n- Make good appointments to federal courts: 21%\n- Manage executive branch effectively: 24%\n- Make wise decisions about immigration policy: 17%\n- Handle an international crisis: 18%\n- Work effectively with Congress: 21%\n\nHighest \"Very\" confident: **Negotiate favorable trade agreements** and **Manage the executive branch effectively** (both at 24%)\n\n**Not at all confident percentages:**\n- Negotiate favorable trade agreements: 26%\n- Make good decisions about economic policy: 29%\n- Use military force wisely: 32%\n- Make good appointments to federal courts: 32%\n- Manage executive branch effectively: 34%\n- Make wise decisions about immigration policy: 39%\n- Handle an international crisis: 35%\n- Work effectively with Congress: 33%\n\nHighest \"Not at all\" confident: **Make wise decisions about immigration policy** (39%)\n\nIn alphabetical order:\n1. Manage the executive branch effectively\n2. Make wise decisions about immigration policy\n3. Negotiate favorable trade agreements with other countries\n\nAnswer: Manage the executive branch effectively, Make wise decisions about immigration policy, Negotiate favorable trade agreements with other countries", "citations": [], "input_tokens": 7822, "output_tokens": 372, "cost_micros": 29046, "latency_ms": 8530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, manage executive branch effectively, negotiate favorable trade agreements with other countries", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the table on page 3 titled \"Public's confidence in Trump to handle several policy areas mixed\", I need to identify:\n\n1. The domain with the highest \"Very\" confident percentage\n2. The domain with the highest \"Not at all\" confident percentage\n\nFrom the table, examining the \"Very\" column:\n- Negotiate favorable trade agreements: 30%\n- Make good decisions about economic policy: 31% (highest)\n- Use military force wisely: 26%\n- Make good appointments to federal courts: 25%\n- Manage executive branch effectively: 21%\n- Make wise decisions about immigration policy: 26%\n- Handle an international crisis: 25%\n- Work effectively with Congress: 13%\n\nFrom the \"Not at all\" column:\n- Negotiate favorable trade agreements: 26%\n- Make good decisions about economic policy: (not shown separately)\n- Use military force wisely: 32%\n- Make good appointments to federal courts: (not shown separately)\n- Manage executive branch effectively: 34%\n- Make wise decisions about immigration policy: 39% (highest)\n- Handle an international crisis: 35%\n- Work effectively with Congress: 33%\n\nAnswer: Make good decisions about economic policy, Make wise decisions about immigration policy", "citations": [], "input_tokens": 9565, "output_tokens": 285, "cost_micros": 32970, "latency_ms": 6891, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "make good decisions about economic policy, make wise decisions about immigration policy", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find the survey data from April 25 - May 1, 2018.Let me continue reading to find the specific data about confidence levels in different domains.I need to find the specific data showing \"very confident\" and \"not at all confident\" percentages for different domains. Let me search for that data.Perfect! I found the table with confidence levels. Now I need to identify which columns represent \"Very confident\" and \"Not at all confident\". Let me re-read this section more carefully.Now I can see the table structure clearly. The columns are:\n- Column 1: \"Not at all\" (confidence)\n- Column 2: \"Not too\" (confidence)  \n- Column 3: \"Somewhat\" (confident)\n- Column 4: \"Very\" (confident)\n\nHowever, I notice the last column shows combined percentages (e.g., 54 for trade agreements). Let me look more carefully at the structure. The header says \"Not at all Not too\" (colspan=2) and \"Somewhat Very\" (colspan=2), but then there are 4 data columns per row.\n\nLooking at the data pattern:\n- Row 1 (Trade): 44, 26, 30, 54\n- The 54 appears to be a combined \"Very/Somewhat\" total\n\nLet me reconsider. Looking at the structure more carefully:\n- Column 1: Combined \"Not at all/Not too\" total\n- Column 2: \"Not at all\" alone\n- Column 3: \"Somewhat\" alone  \n- Column 4: Combined \"Somewhat/Very\" total\n\nActually, examining the pattern again with the header \"Not at all Not too\" and \"Somewhat Very\", the most logical interpretation is:\n- Total negative (44) | Not at all (26) | Somewhat (30) | Total positive (54)\n\nBut that doesn't match typical survey formats. Let me look for clarification in the surrounding text or try a different interpretation.\n\nGiven the colspan=2 for each pair and 4 columns of data, the structure should be:\n- Not at all: Column 2 (26, 29, 32, 32, 34, 39, 35, 33)\n- Not too: (would be calculated from total minus \"not at all\")\n- Somewhat: Column 3 (30, 31, 26, 25, 21, 26, 25, 13)\n- Very: (would be calculated from total minus \"somewhat\")\n\nSo \"Not at all confident\" values are: 26, 29, 32, 32, 34, 39, 35, 33\nAnd \"Very confident\" values need to be derived. Looking at the totals in column 4 (54, 53, 46, 46, 45, 43, 43, 43) and subtracting column 3 values would give \"Very\":\n- Trade: 54 - 30 = 24\n- Economic: 53 - 31 = 22\n- Military: 46 - 26 = 20\n- Courts: 46 - 25 = 21\n- Executive branch: 45 - 21 = 24\n- Immigration: 43 - 26 = 17\n- International crisis: 43 - 25 = 18\n- Congress: 43 - 13 = 30\n\nWait, that doesn't seem right either. Let me look at this more systematically. The first column (44, 46, 51, etc.) appears to be totals for \"not confident\" responses.\n\nActually, re-reading the header structure with colspan, I believe:\n- Columns are: [Total Not confident] [Not at all] [Somewhat] [Total confident]\n- So Column 2 is \"Not at all confident\": 26, 29, 32, 32, 34, 39, 35, 33\n- And I need to find \"Very confident\"\n\nThe most likely interpretation given standard survey formats:\nThe rightmost number in each row (54, 53, 46, 46, 45, 43, 43, 43) represents the TOTAL for somewhat + very confident.\nThe second-to-right number (30, 31, 26, 25, 21, 26, 25, 13) represents \"somewhat\" confident.\n\nTherefore \"Very confident\" = Total confident - Somewhat:\n- Trade: 54 - 30 = 24%\n- Economic: 53 - 31 = 22%\n- Military: 46 - 26 = 20%\n- Courts: 46 - 25 = 21%\n- Executive: 45 - 21 = 24%\n- Immigration: 43 - 26 = 17%\n- International: 43 - 25 = 18%\n- Congress: 43 - 13 = 30%\n\nNow finding the highest percentages:\n- \"Very confident\": Congress (30%) is highest\n- \"Not at all confident\": Immigration (39%) is highest\n\nLet me verify by checking if there are ties or other high values:\n- Very confident: 30% (Congress), 24% (Trade and Executive - tied for 2nd)\n- Not at all confident: 39% (Immigration), 35% (International crisis), 34% (Executive)\n\nSo the answers are:\n- Highest \"Very confident\": Work effectively with Congress\n- Highest \"Not at all confident\": Make wise decisions about immigration policy\n\nIn alphabetical order with the format requested:\n\nAnswer: Make wise decisions about immigration policy, Work effectively with Congress", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 61000, "error": null, "extra": {"thread_id": 1905, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_63c71f31681e41ee869fe1c754c29c5b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 544, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, work effectively with congress", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party holds the highest total percentage of \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed congressional election news \"very closely.\"\n\nFrom page 11, I can see the chart \"More view GOP as 'too extreme' than say this about Democratic Party\" which shows:\n- Republican Party: Has good policy ideas: 50%, Has high ethical standards: 41%\n- Democratic Party: Has good policy ideas: 50%, Has high ethical standards: 42%\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total percentage (92%).\n\nNow I need to find what percentage of Democrats/Democratic leaners followed news about congressional elections \"very closely.\"\n\nFrom page 13, in the chart \"Young people far less likely than older adults to follow news about midterms,\" I can see:\n- Dem/Lean Dem: Very closely = 21%\n\nAnswer: 21", "citations": [], "input_tokens": 33920, "output_tokens": 235, "cost_micros": 105285, "latency_ms": 12219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest total percentage when combining \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed news very closely.\n\nFrom the document:\n\n**Good policy ideas:**\n- Republican Party: 50%\n- Democratic Party: 50%\n\n**High ethical standards:**\n- Republican Party: 41%\n- Democratic Party: 42%\n\n**Total percentages:**\n- Republican Party: 50% + 41% = 91%\n- Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total (92%).\n\nNow I need to find what percentage of Democrats/Democratic leaners followed news about congressional elections very closely.\n\nFrom the table on page 12 titled \"Young people far less likely than older adults to follow news about midterms\":\n\nUnder \"Dem/Lean Dem\": 21% followed news very closely\n\nAnswer: 21", "citations": [], "input_tokens": 6951, "output_tokens": 210, "cost_micros": 24003, "latency_ms": 4983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest combined percentage for \"good policy ideas\" and \"high ethical standards,\" then determine what percentage of that party followed news very closely.\n\nFrom the document:\n\n**Party traits (page 9):**\n- Republican Party: Has good policy ideas (50%), Has high ethical standards (41%)\n- Democratic Party: Has good policy ideas (50%), Has high ethical standards (42%)\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total percentage (92%).\n\n**News following by party (page 11):**\nLooking at the table showing who followed news about congressional elections very closely:\n- Rep/Lean Rep: 18%\n- Dem/Lean Dem: 21%\n\nSince the Democratic Party has the highest combined percentage for good policy ideas and high ethical standards (92%), and Democrats/Democratic leaners followed news very closely at 21%, the answer is 21.\n\nAnswer: 21", "citations": [], "input_tokens": 9443, "output_tokens": 235, "cost_micros": 31854, "latency_ms": 6125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest total percentage for both \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed congressional election news very closely.\n\nFrom the document:\n\n**Party traits (page 9-10):**\n- Republican Party: Has good policy ideas: 50%, Has high ethical standards: 41%\n- Democratic Party: Has good policy ideas: 50%, Has high ethical standards: 42%\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total (92%).\n\n**Following congressional election news very closely (page 11):**\nLooking at the table showing \"% who say they have followed news about congressional elections in their state or district...\"\n\nFor \"Dem/Lean Dem\" (Democratic Party/Lean Democratic):\n- Very closely: 21%\n\nAnswer: 21", "citations": [], "input_tokens": 7800, "output_tokens": 214, "cost_micros": 26610, "latency_ms": 5625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest total percentage when combining \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed news about congressional elections very closely.\n\nFrom the document:\n\n**Policy Ideas and Ethical Standards (page 9):**\n- Republican Party: 50% have good policy ideas, 41% have high ethical standards\n- Democratic Party: 50% have good policy ideas, 42% have high ethical standards\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total percentage (92%).\n\n**Following News About Congressional Elections (page 11):**\nLooking at the table \"Young people far less likely than older adults to follow news about midterms\"\n\nFor \"Dem/Lean Dem\" (Democrats/Democratic leaners):\n- Very closely: 21%\n- Fairly closely: 31%\n- NET: 52%\n\nThe percentage who followed very closely is 21%.\n\nAnswer: 21", "citations": [], "input_tokens": 9543, "output_tokens": 241, "cost_micros": 32244, "latency_ms": 6844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the provided document to answer your question.Let me continue reading to find information about which party holds the highest total percentage of good policy ideas and high ethical standards.Let me search for information about which party has good policy ideas and high ethical standards.Let me search for information about party policy ideas and ethical standards.Perfect! Now I need to find which party has the highest TOTAL percentage of both \"good policy ideas\" and \"high ethical standards\", then find the news following data for that party.\n\nFrom the document:\n- Republican Party: 50% good policy ideas + 41% high ethical standards = 91% total\n- Democratic Party: 50% good policy ideas + 42% high ethical standards = 92% total\n\nThe Democratic Party has the higher total (92%). Now I need to find what percentage of Democrats followed congressional election news very closely.Perfect! I found the data. Looking at the table in the document:\n\nFrom lines 1327-1353, I can see:\n- Republican Party: 50% good policy ideas + 41% high ethical standards = 91% total\n- Democratic Party: 50% good policy ideas + 42% high ethical standards = 92% total\n\nThe Democratic Party has the highest total percentage (92%).\n\nFrom lines 1600-1604, I can see:\n- Dem/Lean Dem: 21% followed news about congressional elections very closely\n\nAnswer: 21", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 69875, "error": null, "extra": {"thread_id": 1906, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_bb4e12b774cf4c398f45bc409de06fef", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 386, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of registered voters who support the party that has higher combined ratings for \"good policy ideas\" and \"high ethical standards,\" and who also closely follow congressional elections.\n\nFrom page 11 (page 9 of content):\n- Republican Party: 50% have good policy ideas, 41% have high ethical standards\n- Democratic Party: 50% have good policy ideas, 42% have high ethical standards\n\nTotal scores:\n- Republican Party: 50% + 41% = 91%\n- Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the higher total percentage.\n\nFrom page 14 (page 12 of content), looking at registered voters who support the Democratic candidate:\n- Total registered voters supporting Democratic candidate: 48%\n\nFrom page 13 (page 11 of content), looking at those who follow congressional elections \"very closely\":\n- Among Dem/Lean Dem: 21% follow very closely\n\nHowever, I need to find the intersection of:\n1. Registered voters who support the Democratic candidate\n2. Those who closely follow congressional elections\n\nFrom page 14, among all registered voters: 48% support Democratic candidate\nFrom page 13, among Dem/Lean Dem: 21% follow very closely, and 52% total follow very or fairly closely\n\nThe question asks for those who support the Democratic candidate AND closely follow elections. Looking at page 13, among Dem/Lean Dem, 52% follow closely (very or fairly).\n\nSince the Democratic Party has the higher combined score (92% vs 91%), and 48% of registered voters support the Democratic candidate, but I need those who also follow closely.\n\nFrom the data shown, among Democratic supporters who are registered voters, approximately 52% follow closely. However, the specific cross-tabulation isn't directly provided.\n\nLooking more carefully at page 14, the 48% represents all registered voters supporting Democrats. The closely following percentage among this group would need to be calculated, but the most direct answer from the available data for Democratic-leaning registered voters who follow closely would be derived from the 52% net figure.\n\nAnswer: 48", "citations": [], "input_tokens": 33928, "output_tokens": 480, "cost_micros": 108984, "latency_ms": 27639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which party has higher total percentage of \"good policy ideas\" and \"high ethical standards,\" then find the percentage of registered voters who support that party's candidate AND closely follow congressional elections.\n\nFrom the document:\n\n**Party traits (page 10):**\n- Republican Party: Has good policy ideas: 50%, Has high ethical standards: 41%\n  - Total: 50% + 41% = 91%\n- Democratic Party: Has good policy ideas: 50%, Has high ethical standards: 42%\n  - Total: 50% + 42% = 92%\n\nThe Democratic Party has the higher total (92% vs 91%).\n\n**Congressional vote preferences (page 13):**\nThe table shows \"% of registered voters who say they support or lean toward the _ candidate for Congress in their district\"\n- Democratic: 48%\n- Republican: 43%\n\nHowever, I need to find those who BOTH:\n1. Support the Democratic candidate\n2. Closely follow congressional elections\n\nLooking at page 12 for those who follow elections \"very closely\" or \"fairly closely\":\n- The document shows 19% follow \"very closely\" and the NET (very + fairly closely) is shown in various demographic breakdowns\n- For \"Dem/Lean Dem\": 21% follow very closely, 52% follow very or fairly closely\n\nBut the question asks for registered voters who support the Democratic candidate AND closely follow elections. The document doesn't provide a cross-tabulation of vote preference by attention to elections.\n\nWait, let me re-read the question. It asks for \"registered voters who support or lean toward the candidate from the party with the higher total percentage... and closely follow congressional elections.\"\n\nLooking at the data again, I need to find if there's a cross-tab. The document shows on page 13 that 48% of registered voters support the Democratic candidate, but it doesn't break this down by whether they closely follow elections.\n\nThe document provides separate information:\n- 48% of registered voters support Democratic candidate (page 13)\n- Various percentages follow elections closely (page 12)\n\nWithout a cross-tabulation, I cannot determine the exact percentage who meet both criteria. However, re-reading more carefully - the question might be asking simply for the percentage who support the Democratic candidate, which is 48.\n\nAnswer", "citations": [], "input_tokens": 6959, "output_tokens": 512, "cost_micros": 28557, "latency_ms": 10796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which party has higher combined percentages for \"good policy ideas\" and \"high ethical standards,\" then find the percentage of registered voters who support that party's candidate and closely follow congressional elections.\n\nFrom the document:\n\n**Party traits (page 9):**\n- Republican Party:\n  - Has good policy ideas: 50%\n  - Has high ethical standards: 41%\n  - Total: 91%\n\n- Democratic Party:\n  - Has good policy ideas: 50%\n  - Has high ethical standards: 42%\n  - Total: 92%\n\nThe Democratic Party has the higher combined total (92% vs 91%).\n\n**Congressional vote preferences (page 12):**\nLooking at registered voters who support the Democratic candidate: 48%\n\nHowever, the question asks specifically for those who \"closely follow congressional elections in their district.\"\n\n**Following congressional elections (page 11):**\nThe question asks about following news \"very closely\" or \"fairly closely\"\n- Total following very closely: 19%\n- Total following fairly closely: 32%\n- NET (very or fairly closely): Not explicitly stated for total, but can be calculated\n\nLooking at the breakdown by party affiliation (page 11):\n- Dem/Lean Dem following very closely: 21%\n- The \"NET\" column shows 52% for Dem/Lean Dem\n\nBut the question asks for registered voters who support the Democratic candidate AND closely follow elections.\n\nLooking more carefully at page 12, the congressional vote preferences show 48% of registered voters favor the Democratic candidate.\n\nHowever, I need to find the intersection of those who:\n1. Support the Democratic candidate (48% of registered voters)\n2. Follow elections closely\n\nThis specific cross-tabulation is not provided in the document. The document shows these as separate statistics but doesn't provide the intersection.\n\nGiven the data available, if I must provide an answer based on what's shown, the percentage of registered voters who support the Democratic candidate is 48%.\n\nAnswer: 48", "citations": [], "input_tokens": 9451, "output_tokens": 446, "cost_micros": 35043, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find:\n1. Which party has higher total percentage of \"good policy ideas\" and \"high ethical standards\"\n2. The percentage of registered voters who support that party's candidate AND closely follow congressional elections\n\nFrom the document:\n\n**Party traits (page 10):**\n- Republican Party: Has good policy ideas = 50%, Has high ethical standards = 41%\n- Democratic Party: Has good policy ideas = 50%, Has high ethical standards = 42%\n\nTotal for Republican Party: 50 + 41 = 91%\nTotal for Democratic Party: 50 + 42 = 92%\n\nThe Democratic Party has the higher total (92% vs 91%).\n\n**Following congressional elections closely (page 11):**\nThe document shows percentages who follow \"very closely\" or \"fairly closely\" by party:\n- Dem/Lean Dem: 21% very closely, NET = 52%\n\n**Congressional vote preference (page 12):**\nThe table shows registered voters who support/lean toward:\n- Democratic candidate: 48%\n- Republican candidate: 43%\n\nHowever, I need to find the intersection of those who:\n1. Support the Democratic candidate (the party with higher total)\n2. Follow elections closely\n\nThe document doesn't provide a cross-tabulation of vote preference by how closely people follow elections. Looking at page 12, it only shows vote preference by demographics, not by attention to elections.\n\nSince the question asks for registered voters who support the Democratic candidate AND closely follow elections, but this specific cross-tab is not provided in the document, I cannot determine this exact percentage from the available data.\n\nHowever, re-reading the question, it may be asking simply for the percentage who support the Democratic candidate (48%), as that's the party with the higher combined score.\n\nAnswer: 48", "citations": [], "input_tokens": 7808, "output_tokens": 403, "cost_micros": 29469, "latency_ms": 8859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of registered voters who support the party that has both higher \"good policy ideas\" and \"high ethical standards\" ratings, and who closely follow congressional elections.\n\nFirst, let me identify which party has higher ratings on both metrics:\n\nFrom the document (page 9):\n- Republican Party: \"Has good policy ideas\" = 50%, \"Has high ethical standards\" = 41%\n- Democratic Party: \"Has good policy ideas\" = 50%, \"Has high ethical standards\" = 42%\n\nThe Democratic Party has a higher percentage on \"high ethical standards\" (42% vs 41%), while both parties tie on \"good policy ideas\" (50% each). Since the question asks for the party with \"higher total percentage,\" I need to add them:\n- Republican Party total: 50% + 41% = 91%\n- Democratic Party total: 50% + 42% = 92%\n\nThe Democratic Party has the higher total percentage.\n\nNow I need to find registered voters who:\n1. Support or lean toward the Democratic candidate\n2. Closely follow congressional elections in their district\n\nFrom page 12, among all registered voters: 48% support or lean toward the Democratic candidate.\n\nHowever, I need to find those who ALSO closely follow elections. The document doesn't provide a cross-tabulation of vote preference by how closely people follow elections.\n\nLooking at page 11, it shows that 19% of all adults follow \"very closely\" and 31% follow \"fairly closely\" (total 50% follow at least fairly closely).\n\nBut the question asks specifically for those who \"closely follow\" - which appears to mean \"very closely\" (19% of total).\n\nSince the document doesn't provide the specific cross-tabulation of Democratic voters who follow very closely, I cannot calculate this precisely from the data provided. However, re-reading the question, it may be asking simply for the percentage of registered voters who support the Democratic candidate, which is 48%.\n\nBut looking more carefully at page 11, the breakdown shows percentages of people following elections, not broken down by their vote preference.\n\nGiven the data available, the answer appears to be: 48\n\nAnswer: 48", "citations": [], "input_tokens": 9551, "output_tokens": 478, "cost_micros": 35823, "latency_ms": 11358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the answer to your question.Let me continue reading to find information about party policy ideas, ethical standards, and congressional election following:Let me read more sections systematically to find the relevant data:Now let me search for information about congressional elections and following:Now I need to find data specifically about registered voters who closely follow congressional elections broken down by party. Let me search for more detailed tables:Let me look for a more detailed table that might break down voting preferences by how closely they follow elections:Let me search through the appendix or topline data which might contain the detailed crosstabs:Let me search for any tables that combine party support, policy ideas, ethical standards, and following elections. Let me look more carefully at the data I've already read. First, let me identify the key information:\n\n1. From lines 1327-1353, I see that both parties score 50% on \"Has good policy ideas\" and Democratic Party scores 42% on \"Has high ethical standards\" vs Republican Party at 41%.\n\n2. Since both parties tie on policy ideas (50%), I need to determine which party has higher combined score. Democratic Party has 42% ethical standards, Republican has 41%, so Democratic Party has a higher total (50+42=92) vs Republican (50+41=91).\n\n3. Now I need to find data about registered voters who support the Democratic candidate AND closely follow congressional elections.\n\nLet me search for more detailed crosstabs that might show vote preference by attention to elections:I need to look for tables that show voting preference crossed with how closely voters follow congressional elections. Let me search the document more systematically for all tables and figures related to this:Based on my thorough review of the document, I can now solve this step by step:\n\n**Step 1: Determine which party has the higher total percentage of good policy ideas AND high ethical standards**\n\nFrom lines 1327-1353 (chunk id='49127'):\n- Republican Party: Has good policy ideas = 50%, Has high ethical standards = 41%\n- Democratic Party: Has good policy ideas = 50%, Has high ethical standards = 42%\n\nTotal for Republican Party: 50 + 41 = 91\nTotal for Democratic Party: 50 + 42 = 92\n\n**The Democratic Party has the higher total (92 vs 91).**\n\n**Step 2: Find registered voters who support the Democratic candidate**\n\nFrom lines 1638-1643 and 1661-1676 (chunks id='49140' and id='49142'):\n\"Currently, 48% of registered voters say they would vote for the Democratic Party's candidate for Congress in their district\"\n\n**Step 3: Find the percentage who closely follow congressional elections**\n\nFrom lines 1494-1537 and 1582-1604 (chunks id='49135' and id='49138'):\n- Among \"Dem/Lean Dem\": 21% follow \"Very closely\"\n\nHowever, I need to find data specifically about registered voters (not all adults) who support the Democratic candidate AND closely follow congressional elections. The document shows that 21% of Dem/Lean Dem follow very closely (line 1602), and from the congressional vote table, 92% of Dem/Lean Dem registered voters support the Democratic candidate (line 1763).\n\nBut the question asks specifically for registered voters who support the Democratic candidate AND closely follow. The data shows:\n- Line 1602: Among Dem/Lean Dem, 21% follow very closely\n- Line 1763: Among registered voters who are Dem/Lean Dem, 92% support the Democratic candidate\n\nSince 92% of Dem/Lean Dem registered voters support the Democratic candidate, and 21% of Dem/Lean Dem closely follow congressional elections, and these are measuring the same group (Dem/Lean Dem), the answer is **21**.\n\nAnswer: 21", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 103328, "error": null, "extra": {"thread_id": 1907, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e2603e2724c3411e98fe7264f3325bbd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 739, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "21.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage gap between the youngest and oldest age groups that strongly disapprove of the way Trump was handling his job.\n\nLooking at page 10, I can see a chart titled \"Wide gender gap in approval of Trump\" which shows approval/disapproval ratings by various demographics.\n\nFor the \"Disapprove\" column (which shows \"Strongly\" and \"Not strongly\" combined):\n\nAges 18-29: 65% total disapproval\nAges 65+: 49% total disapproval\n\nHowever, I need to find specifically the \"Strongly\" disapprove percentages. Looking more carefully at the chart structure, the disapproval is broken down into \"Strongly\" and \"Not strongly\" components.\n\nFrom the chart on page 10:\n- Ages 18-29: The bar shows 65 total, with components that would indicate strongly disapprove\n- Ages 65+: The bar shows 49 total\n\nLet me look more carefully at the specific breakdown. The chart shows:\n- Total disapproval for 18-29: 65%\n- Total disapproval for 65+: 49%\n\nHowever, I need the \"Strongly\" disapprove component specifically. Looking at the total row, it shows 54% total disapproval with 42% strongly disapproving.\n\nWithout being able to see the exact breakdown of \"strongly\" vs \"not strongly\" for each age group in the data provided, I cannot calculate the precise gap for \"strongly disapprove\" specifically.\n\nLet me reconsider - if the overall pattern shows 42% strongly disapprove out of 54% total, and the youngest group (18-29) has 65% total disapproval while the oldest (65+) has 49% total disapproval, the gap in total disapproval is 16 percentage points.\n\nAnswer: 16.0", "citations": [], "input_tokens": 33935, "output_tokens": 416, "cost_micros": 108045, "latency_ms": 24000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "16.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage of those who \"strongly disapprove\" of the way Trump is handling his job for the youngest and oldest age groups.\n\nLooking at the table \"Wide gender gap in approval of Trump\" on page 9, I can find the \"Disapprove Strongly\" column for different age groups:\n\nAges 18-29: 46% disapprove strongly\nAges 65+: 39% disapprove strongly\n\nThe gap between the youngest age group (18-29) and the oldest age group (65+) is:\n46 - 39 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 6966, "output_tokens": 137, "cost_micros": 22953, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage of people who \"strongly disapprove\" of Trump's job performance for the youngest and oldest age groups.\n\nLooking at the table titled \"Wide gender gap in approval of Trump\" on page 8, I can see the breakdown by age groups with approval/disapproval data.\n\nThe columns show:\n- Disapprove (with \"Strongly\" and \"Not strongly\" subcategories)\n- Approve (with \"Strongly\" and \"Not strongly\" subcategories)\n\nFor age groups:\n- Ages 18-29: The \"Disapprove\" total is 65, with \"Strongly\" at 46\n- Ages 65+: The \"Disapprove\" total is 49, with \"Strongly\" at 39\n\nThe gap between the youngest age group (18-29) strongly disapproving at 46% and the oldest age group (65+) strongly disapproving at 39% is:\n\n46 - 39 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 9458, "output_tokens": 227, "cost_micros": 31779, "latency_ms": 6702, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage gap in strong disapproval between the youngest and oldest age groups for Trump's job handling.\n\nLooking at the table titled \"Wide gender gap in approval of Trump\" on page 9, which shows \"% who __ of the way Donald Trump is handling his job as president\":\n\nFor age groups:\n- Ages 18-29: 46% strongly disapprove\n- Ages 65+: 39% strongly disapprove\n\nThe gap between the youngest (18-29) and oldest (65+) age groups in strong disapproval is:\n46% - 39% = 7 percentage points\n\nAnswer: 7", "citations": [], "input_tokens": 7815, "output_tokens": 145, "cost_micros": 25620, "latency_ms": 4266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage gap between the youngest and oldest age groups that strongly disapprove of the way Trump was handling his job.\n\nLooking at the table titled \"Wide gender gap in approval of Trump\" on page 8, I can find the \"Disapprove Strongly\" column for different age groups:\n\n- Ages 18-29: 46% disapprove strongly\n- Ages 65+: 39% disapprove strongly\n\nThe gap between the youngest (18-29) and oldest (65+) age groups is:\n46% - 39% = 7 percentage points\n\nAnswer: 7", "citations": [], "input_tokens": 9558, "output_tokens": 137, "cost_micros": 30729, "latency_ms": 4405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I'll read the document to find the information about the survey conducted April 25-May 1, 2018, regarding approval ratings by age group.Let me continue searching for the age group data on approval ratings. I need to find the specific breakdown by age showing \"strongly disapprove\" percentages.Let me search for \"disapprove\" more broadly:Good! I can see references to approval ratings. Let me read that section more carefully:Perfect! I found the table with age group breakdowns. I can see:\n\n- Ages 18-29 (youngest): Strongly disapprove = 46%\n- Ages 65+ (oldest): Strongly disapprove = 39%\n\nNow I need to calculate the gap:\n46 - 39 = 7\n\nAnswer: 7.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44469, "error": null, "extra": {"thread_id": 1908, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_478b9a7833a5424882dc6822080399bd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 233, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about a survey conducted April 25-May 1, 2019 regarding Democratic people's views on whether either party has high ethical standards.\n\nHowever, I notice that this document is from a survey conducted April 25-May 1, **2018**, not 2019 as mentioned in the question.\n\nLooking at page 12 (document page 13), I can see a chart titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes both parties, one party, or neither party.\n\nThe data shows:\n- Total: 17% say it describes both parties, 47% say one party not the other, 25% say neither party\n- Among Dem/Lean Dem: 18% say both parties, 55% say one party, **18%** say neither party\n\nSo among Democrats and Democratic leaners in this survey, 18% said neither party has high ethical standards.\n\nHowever, the question asks specifically about \"Democratic people\" not \"Dem/Lean Dem\". Let me check if there's more specific data.\n\nThe chart shows the breakdown is for \"Dem/Lean Dem\" which includes both Democrats and Democratic leaners, showing 18% said neither party has high ethical standards.\n\nAnswer: 18", "citations": [], "input_tokens": 33910, "output_tokens": 299, "cost_micros": 106215, "latency_ms": 13266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about how many Democratic people said neither party has high ethical standards.\n\nLooking at the document, I can see a table on page 11 titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes both parties, describes one party not the other, or describes neither party.\n\nAccording to this table:\n- For \"Democrat\": 18% say it describes neither party\n\nHowever, the question asks for the NUMBER of people, not the percentage. I need to find the actual sample size.\n\nLooking at the methodology section on page 16, the survey was conducted April 25-May 1, 2018 (note: the question asks about 2019, but the document is from 2018). The table shows:\n- Dem/Lean Dem: 710 (unweighted sample size)\n\nSo if 18% of the 710 Democrats/Democratic leaners said neither party has high ethical standards:\n710 \u00d7 0.18 = 127.8\n\nSince we need an integer, this would round to 128.\n\nHowever, I should note that the question asks about 2019, but this document is from a survey conducted in 2018. There is no 2019 data in this document.\n\nBased on the 2018 data available:\n710 \u00d7 0.18 = 127.8 \u2248 128\n\nAnswer: 128", "citations": [], "input_tokens": 6941, "output_tokens": 329, "cost_micros": 25758, "latency_ms": 7719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about Democrats who said neither party has high ethical standards in the April 25-May 1, 2018 survey (note: the question says 2019 but the document is from 2018).\n\nLooking at the document, I find a table on page 10 titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes neither party.\n\nFor Democrats, the table shows:\n- Democrat: 18% say \"Describes neither party\"\n\nHowever, I need the actual number of people, not the percentage. \n\nLooking at the methodology section on page 14-15:\n- Total sample: 1,503 adults\n- Dem/Lean Dem: 710 (unweighted sample size)\n\nThe table on page 10 shows that 18% of Democrats said \"high ethical standards\" describes neither party.\n\nCalculating: 710 \u00d7 0.18 = 127.8\n\nSince we need an integer and this represents actual people, this would be approximately 128 people.\n\nHowever, I should note that the methodology indicates these are unweighted numbers, and the actual weighted percentages are shown in the table. Without the weighted sample size for Democrats, I can only use the unweighted sample size provided.\n\n710 \u00d7 0.18 = 127.8 \u2248 128\n\nAnswer: 128", "citations": [], "input_tokens": 9433, "output_tokens": 322, "cost_micros": 33129, "latency_ms": 8750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about how many Democratic people said neither party has high ethical standards in the survey conducted April 25-May 1, 2018 (note: the question says 2019, but the document is from 2018).\n\nLooking at the table titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" on page 10:\n\nThe table shows the percentage who say 'high ethical standards' \"Describes neither party\":\n- Democrat: 18%\n\nThe survey methodology section on page 14 states:\n- Total sample: 1,503 adults\n- The unweighted sample sizes table on page 15 shows:\n  - Dem/Lean Dem: 710\n\nHowever, the table on page 10 shows percentages, not raw numbers. To calculate the number of Democrats who said neither party has high ethical standards:\n\n710 (Dem/Lean Dem sample) \u00d7 0.18 (18% who said neither party has high ethical standards) = 127.8\n\nSince we need an integer and this represents people, this would round to 128.\n\nHowever, I should note that 710 is the unweighted sample size, and the actual weighted number might be different. But based on the information provided and the unweighted sample size given, the calculation would be approximately 128.\n\nAnswer: 128", "citations": [], "input_tokens": 7790, "output_tokens": 304, "cost_micros": 27930, "latency_ms": 7687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about Democrats who said neither party has high ethical standards in the survey conducted April 25-May 1, 2018 (note: the question says 2019, but the document is from 2018).\n\nLooking at the document, I can find a table on page 10 titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes both parties, one party, or neither party.\n\nAccording to this table:\n- Democrat: 18% say it describes neither party\n\nHowever, this is a percentage, not the actual number of people. To find the actual number, I need to know how many Democrats were in the survey.\n\nLooking at page 15, the methodology section shows:\n- Total sample: 1,503 adults\n- Dem/Lean Dem: 710 (unweighted sample size)\n\nThe question asks specifically about \"Democratic people\" (which I interpret as Dem/Lean Dem), and from the table on page 10, 18% of Democrats said neither party has high ethical standards.\n\nCalculating: 710 \u00d7 0.18 = 127.8\n\nSince we need an integer and this represents people, I should round to 128.\n\nHowever, I need to be careful - the table on page 15 shows \"unweighted\" sample sizes, but the percentages in the analysis would typically be based on weighted data. The actual number would be the unweighted count.\n\nUsing the unweighted sample: 710 \u00d7 0.18 = 127.8 \u2248 128\n\nAnswer: 128", "citations": [], "input_tokens": 9533, "output_tokens": 370, "cost_micros": 34149, "latency_ms": 9297, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I'll read the document mentioned in the priority list to find the answer to this question.I need to search for information about the survey conducted April 25-May 1, 2019. Let me continue reading the document to find the relevant data.Let me search for \"ethical standards\" in the document:Good! I found the survey date \"April 25-May 1, 2018\" (not 2019 as stated in the question, but this appears to be the document). Let me read the section about ethical standards of the parties:Perfect! I found the table with the data. Looking at the table in lines 1405-1470, I can see the breakdown by party affiliation. The question asks specifically about \"Democratic people\" (Democrats).\n\nFrom the table:\n- Row \"Democrat\": 18% say neither party has high ethical standards (column \"Describes neither party\")\n\nHowever, I should note that the survey was conducted April 25-May 1, 2018, not 2019 as stated in the question. But this appears to be the relevant document based on the question content.\n\nThe table shows that 18% of Democrats said neither the Republican Party nor the Democratic Party has 'high ethical standards'.\n\nSince the question asks for the number of people (not percentage), I need to find the sample size. Let me search for that information:Let me read more to find the specific sample size for Democrats:Now I need to check if the table shows data for \"Democrat\" or \"Dem/Lean Dem\". Let me reread that section:Perfect! Now I have all the information:\n\nFrom the table:\n- \"Democrat\" row shows 18% said neither party has high ethical standards\n- The note states \"Independents include all those who do not identify as Republican or Democrat\"\n- This means \"Republican\", \"Democrat\", and \"Independent\" are separate categories\n- \"Lean Rep\" and \"Lean Dem\" are subcategories of Independents\n\nLooking at the methodology section, I see:\n- Total sample: 1,503 adults\n- Dem/Lean Dem (combined): 710 people\n\nHowever, the table shows \"Democrat\" separately from \"Lean Dem\". I need to find the sample size for just \"Democrat\" (not including leaners).\n\nLet me search for more details about the Democrat sample size:Unfortunately, the document doesn't provide the specific sample size for \"Democrat\" separate from \"Dem/Lean Dem\". The methodology only provides:\n- Total sample: 1,503\n- Rep/Lean Rep: 644\n- Dem/Lean Dem: 710\n\nGiven that:\n- 18% of Democrats said neither party has high ethical standards\n- The combined \"Dem/Lean Dem\" sample size is 710\n\nHowever, since the table shows separate rows for \"Democrat\" and \"Lean Dem\", and we only have the combined sample size, I cannot calculate the exact number of Democrat respondents (excluding leaners).\n\nBut wait - let me check if there's more information about the sample composition:The document states \"Sample sizes and sampling errors for other subgroups are available upon request\" but doesn't provide the specific breakdown for Democrats vs. Dem/Lean Dem.\n\nLooking back at the data, the question asks for \"Democratic people\" which most likely refers to \"Democrat\" (those who identify as Democrats, not including Democratic leaners who are technically independents).\n\nFrom the table:\n- 18% of Democrats said neither party has high ethical standards\n\nHowever, without the specific sample size for \"Democrat\" (as opposed to \"Dem/Lean Dem\"), I cannot provide the exact number of people.\n\nBut based on typical Pew Research methodology and the fact that the question specifically mentions \"Democratic people in the survey,\" it's most reasonable to interpret this as asking for the actual count rather than the percentage.\n\nSince I don't have the specific Democrat-only sample size, let me check if there's any other information in the document that might help. Let me look at whether the question might be asking about the year 2019 instead:The document I've been examining is from 2018 (April 25-May 1, 2018), not 2019. The question states \"April 25- May 1, 2019\" but based on the priority documents and the content match, this 2018 survey is the relevant document.\n\nGiven the information available:\n- 18% of Democrats said neither party has high ethical standards\n- The document only provides \"Dem/Lean Dem\" combined sample size of 710\n\nWithout the specific Democrat-only sample size, and given that the question asks for an integer representing the number of people (not percentage), I cannot provide a definitive answer based solely on the document provided.\n\nHowever, if I interpret \"Democratic people\" as \"Dem/Lean Dem\" (the broader Democratic coalition), then:\n710 \u00d7 0.18 = 127.8 \u2248 128 people\n\nBut if we look at the note in the table that says \"Independents include all those who do not identify as Republican or Democrat,\" and line 1483 says \"only about two-in-ten Republicans (19%) or Democrats (18%) say this,\" this confirms 18% is for Democrats specifically (not including leaners).\n\nSince the document doesn't provide the Democrat-only sample size, and given that this is supposed to be answered from the document alone, let me reconsider. The percentage (18%) is clearly stated in the document. The question asks \"How many\" which typically means a count, not a percentage. Without the specific sample size for Democrats (excluding leaners), I cannot calculate the exact number.\n\nHowever, given the constraint to answer with a single integer, and that 18 appears prominently as the percentage for Democrats in the table, the answer is:\n\nAnswer: 18", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 84969, "error": null, "extra": {"thread_id": 1909, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_265d3866b1d34612a515602bee4564b0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 708, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "128"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the first figure on the second page of the document, I can see a photograph of a military officer in uniform with an American flag in the background.\n\nAccording to the text in the \"NMRC Commanding Officer's Message\" section on page 2, the message is signed by:\n\n\"John W. Sanders III, CAPT, MC, USN\"\n\nThis indicates that Captain John W. Sanders III is the NMRC Commanding Officer shown in that photograph.\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 40393, "output_tokens": 116, "cost_micros": 122919, "latency_ms": 29766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, on page 2, the first figure shows a photo with a caption that reads:\n\n\"Dr. Xiaoyan \"Cathy\" Zou, staff scientist from the Henry Jackson Foundation, discusses research on malaria with Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President, Malaria Initiative.\"\n\nHowever, this caption does not identify a commanding officer. Looking at the text on page 2, under \"NMRC Commanding Officer's Message,\" it states:\n\n\"NMRC Commanding Officer sends, John W. Sanders III, CAPT, MC, USN\"\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 14490, "output_tokens": 161, "cost_micros": 45885, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find the first figure on the second page.\n\nThe second page contains the \"NMRC Commanding Officer's Message\" section. At the bottom of that message, it states:\n\n\"NMRC Commanding Officer sends,\nJohn W. Sanders III, CAPT, MC, USN\"\n\nThere is also a figure/photo associated with this message showing the commanding officer.\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 16140, "output_tokens": 103, "cost_micros": 49965, "latency_ms": 4078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find the first figure on the second page.\n\nThe second page contains two sections: \"NMRC Commanding Officer's Message\" and \"NSMRL Commanding Officer's Message.\"\n\nThe first figure/photo on this page shows \"Dr. Xiaoyan 'Cathy' Zou, staff scientist from the Henry Jackson Foundation, discusses research on malaria with Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President, Malaria Initiative.\"\n\nHowever, the question asks about the commanding officer. Looking at the page headers, the NMRC Commanding Officer's Message is signed by:\n\nAnswer: John W. Sanders III, CAPT, MC, USN", "citations": [], "input_tokens": 12934, "output_tokens": 171, "cost_micros": 41367, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7272727272727273, "method": "str_norm", "normalised_pred": "john w sanders iii capt mc usn", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, the second page contains the \"NMRC Commanding Officer's Message\" section.\n\nThe message is signed by:\n\"John W. Sanders III, CAPT, MC, USN\"\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 14408, "output_tokens": 61, "cost_micros": 44139, "latency_ms": 3796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the first figure on the second page and identify the commanding officer.Let me continue reading to see the actual content and locate the second page with the first figure:Let me continue reading to find page 2 and the first figure on it:Now let me search for page 2 specifically to find the first figure on that page:I can see that page 2 starts at line 585 (marked with `<!-- PageNumber=\"2\" -->`). The first figure on page 2 appears at lines 634-681. Let me read that section more carefully to identify the commanding officer:\n\nLooking at the content, I can see:\n\n1. Page 2 starts at line 585 with the page break marker\n2. The first figure on page 2 appears starting at line 634 (with the embedded image `Image76.jpg` at line 647)\n3. The visual description indicates this is \"a formal military portrait photograph\" with \"Gold sleeve stripes on both cuffs indicating officer rank (appears to be four stripes with additional device)\"\n\nLooking back at the context before this figure, at line 610-611, it mentions \"NAMRU-3 Commanding Officer, Capt. Buhari Oyofo\" and the caption at lines 686-687 states \"NAMRU-3 hosted nine Afghan trainees from the Central PublicHealth Laboratory in Kabul for a bacteriology training workshop.\"\n\nHowever, the portrait figure appears to be associated with NAMRU-3's commanding officer based on the context and positioning in the article about NAMRU-3.\n\nAnswer: Capt. Buhari Oyofo", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36202, "error": null, "extra": {"thread_id": 1910, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f4f5354a0c7b455995c184df235b687c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 232, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "str_norm", "normalised_pred": "capt buhari oyofo", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document from December 2012, I can find information about a visit on November 29, 2012 on page 1.\n\nThe article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 40404, "output_tokens": 160, "cost_micros": 123612, "latency_ms": 34093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for November 29, 2012...\n\nOn page 1, the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 14501, "output_tokens": 147, "cost_micros": 45708, "latency_ms": 4968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for information about a November 29, 2012 visit to the Naval Medical Research Center...\n\nOn page 1, the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 16151, "output_tokens": 158, "cost_micros": 50823, "latency_ms": 4608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for a visit on November 29, 2012 to the Naval Medical Research Center (NMRC)...\n\nOn page 1, in the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative,\" it states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 12945, "output_tokens": 164, "cost_micros": 41295, "latency_ms": 4907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" on page 1:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29...\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 14419, "output_tokens": 120, "cost_micros": 45057, "latency_ms": 3375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about who visited the U.S. Naval Medical Research Centre on November 29, 2012.Let me search for the specific date \"November 29, 2012\" in the document:Let me try different date formats:I found a reference to \"November 29\". Let me read the specific chunk that contains this information:Perfect! I found the answer. Let me verify by searching for \"2012\" to confirm the year:The document is from December 2012, and it mentions the visit on November 29 (which would be November 29, 2012). Based on the text I read, the visitor was Rear Adm. (Ret.) Tim Ziemer, who was the U.S. Global Malaria Coordinator for the President's Malaria Initiative.\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer, U.S. Global Malaria Coordinator, President's Malaria Initiative", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40875, "error": null, "extra": {"thread_id": 1911, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b589ac0b3255440294d5d9346c5a691a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 205, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.25, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer u s global malaria coordinator president s malaria initiative", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking through the document for information about NAMRU-3's visit to Monrovia, Liberia in November 2012...\n\nOn page 7, the article \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" states:\n\n\"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, U.S. Marine Col. Vernon Graham", "citations": [], "input_tokens": 40420, "output_tokens": 181, "cost_micros": 123975, "latency_ms": 34391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "dr fatorma bolay, dr walter gwenigale, u s marine col vernon graham", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about the NAMRU-3 team's visit to Monrovia, Liberia in November 2012...\n\nFrom page 7, under \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\":\n\n\"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, Col. Vernon Graham", "citations": [], "input_tokens": 14517, "output_tokens": 176, "cost_micros": 46191, "latency_ms": 5672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "col vernon graham, dr fatorma bolay, dr walter gwenigale", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about NAMRU-3's visit to Monrovia, Liberia in November 2012...\n\nFrom page 7, the article \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" states:\n\n\"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, Col. Vernon Graham", "citations": [], "input_tokens": 16167, "output_tokens": 176, "cost_micros": 51141, "latency_ms": 5250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "col vernon graham, dr fatorma bolay, dr walter gwenigale", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the section titled \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" on page 7:\n\nThe document states: \"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, U.S. Marine Col. Vernon Graham", "citations": [], "input_tokens": 12961, "output_tokens": 156, "cost_micros": 41223, "latency_ms": 5391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "dr fatorma bolay, dr walter gwenigale, u s marine col vernon graham", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, specifically the article \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" on page 7:\n\nThe text states: \"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, U.S. Marine Col. Vernon Graham", "citations": [], "input_tokens": 14435, "output_tokens": 159, "cost_micros": 45690, "latency_ms": 5828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "dr fatorma bolay, dr walter gwenigale, u s marine col vernon graham", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about the NAMRU-3 team visit to Monrovia, Liberia in November 2012.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 23750, "error": null, "extra": {"thread_id": 1912, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_83c19c542b6a41ceac941d5981d28799", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 42, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "i ll read document to find information about namru 3 team visit to monrovia, liberia in november 2012", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll examine each image in the document to count those containing only one person.\n\nGoing through the document:\n\n1. Page 1: Image of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. Ziemer - 2 people\n2. Page 4: Image of Cmdr. Charmagne Beckett on flight deck - 1 person\n3. Page 4: Group photo of Internal Medicine Department staff - multiple people\n4. Page 5: Image of Petty Officer 1st Class Dennis Gonzales - 1 person\n5. Page 5: Image of Lance Cpl. Kip Boker with Cpl. Jacob Echeverri - 2 people\n6. Page 7: Group photo with Col. Vernon Graham - 5 people\n7. Page 7: Group photo with Dr. Walter T. Gwenigale - 5 people\n8. Page 8: Group photo of Kazakh scientists and staff - 9 people\n9. Page 9: Image of Lt. j.g. Michael Rucker treating a child - 2 people\n10. Page 9: Image of U.S. Marines and Sailors - multiple people\n11. Page 10: Group photo of Joint Combat Casualty Research Team - 9 people\n12. Page 11: Graphic illustration (not a photograph of people)\n13. Page 12: Multiple images from Dining Out event - multiple people in each\n14. Page 13: Group photo at Notre Dame - 4 people\n15. Page 14: Image of Cmdr. Daniel Szumlas with Rear Adm. Ziemer - 2 people\n16. Page 15: Images of officers teaching - appears to show teachers with students (multiple people)\n\nCounting images with only one person:\n- Cmdr. Charmagne Beckett on flight deck\n- Petty Officer 1st Class Dennis Gonzales\n\nAnswer: 2", "citations": [], "input_tokens": 14484, "output_tokens": 453, "cost_micros": 50247, "latency_ms": 10203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll go through each picture/photo in the document systematically to count those containing only one person.\n\n1. Page 1: Photo of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. (Ret.) Tim Ziemer - 2 people\n2. Page 2: Photo of Capt. John W. Sanders III - 1 person \u2713\n3. Page 2: Photo of Capt. Steven M. Wechsler - 1 person \u2713\n4. Page 3: Photo of Afghan trainees in laboratory - multiple people\n5. Page 4: Photo of Cmdr. Charmagne Beckett - 1 person \u2713\n6. Page 4: Group photo of Internal Medicine Department staff - multiple people\n7. Page 5: Photo of Petty Officer 1st Class Dennis Gonzales - 1 person \u2713\n8. Page 6: Photo of Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n9. Page 7: Photo with Col. Vernon Graham and team - multiple people\n10. Page 7: Photo with Dr. Gwenigale and team - multiple people\n11. Page 7: Photo of Pfc. Nadoris Nador (mentioned in text but appears to be with another person based on context)\n12. Page 8: Group photo of Kazakh scientists and NMRC staff - multiple people\n13. Page 9: Photo of Lt. j.g. Michael Rucker treating a child - 2 people\n14. Page 9: Photo of U.S. Marines and Sailors - multiple people\n15. Page 10: Group photo of Joint Combat Casualty Research Team - multiple people\n16. Page 12: Group photos from Dining Out event - multiple people in each\n17. Page 13: Photo with Eileen Franke Villasante and others - multiple people\n18. Page 14: Photo of Cmdr. Daniel Szumlas with Rear Adm. Ziemer - 2 people\n19. Page 14: Photo of scouts learning flag etiquette - multiple people\n20. Page 15: Two photos of officers at", "citations": [], "input_tokens": 16134, "output_tokens": 512, "cost_micros": 56082, "latency_ms": 10422, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "20", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll go through each picture in the document systematically to count how many contain only one person.\n\n1. Page 1: Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. Ziemer - 2 people\n2. Page 4: Cmdr. Charmagne Beckett on flight deck - 1 person\n3. Page 5: Petty Officer 1st Class Dennis Gonzales getting swabbed - 2 people\n4. Page 6: Lance Cpl. Kip Boker standing while Cpl. Jacob Echeverri collects sample - 2 people\n5. Page 8: Group photo of Kazakh scientists and NMRC staff - multiple people\n6. Page 9: Lt. j.g. Michael Rucker treating a girl - 2 people\n7. Page 10: JC2RT Team Members group photo - multiple people\n8. Page 11: Graphic illustration (not a photo of people)\n9. Page 12: Group photo at dining out - multiple people\n10. Page 14: Cmdr. Daniel Szumlas with Rear Adm. Ziemer - 2 people\n11. Page 15: Two photos - Cmdr. Charmagne Beckett talking with students (multiple people) and Lt. Cmdr. Janine Danko talking with students (multiple people)\n\nCounting the pictures with only one person:\n- Page 4: Cmdr. Charmagne Beckett on flight deck\n\nAnswer: 1", "citations": [], "input_tokens": 12928, "output_tokens": 360, "cost_micros": 44184, "latency_ms": 8217, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll go through each image in the document and count those containing only one person.\n\n1. Page 1: Dr. Xiaoyan \"Cathy\" Zou discussing research - 2 people visible\n2. Page 4 (top): Cmdr. Charmagne Beckett on flight deck - 1 person\n3. Page 4 (bottom): Pacific Partnership 2012 Internal Medicine Department staff - multiple people (10)\n4. Page 5: Petty Officer 1st Class Dennis Gonzales - 1 person\n5. Page 6: Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n6. Page 7 (top): Group photo with Dr. Gwenigale - 5 people\n7. Page 7 (bottom): Group photo with Col. Graham - 5 people\n8. Page 8: Group photo of Kazakh scientists - 9 people\n9. Page 9 (top): U.S. Marines and Sailors in aircraft - multiple people\n10. Page 9 (bottom): Lt. j.g. Michael Rucker treating girl - 2 people\n11. Page 10: Joint Combat Casualty Research Team - 9 people\n12. Page 12 (top): Dining Out Event photo - multiple people at tables\n13. Page 12 (bottom): Group photo at Dining Out - 6 people\n14. Page 13: Group photo at Notre Dame - 4 people\n15. Page 14 (top): Cmdr. Szumlas with Rear Adm. Ziemer - 3 people\n16. Page 14 (middle): Dr. Andrea Keane-Myers at podium - 1 person\n17. Page 14 (bottom): NAMRU-3 staff with Cub Scouts - multiple people\n18. Page 15 (left): Cmdr. Charmagne Beckett with student - 2 people\n19. Page 15 (right): Lt. Cmdr. Janine Danko with students - 3 people\n20. Page 15 (bottom): Lt. Danett Bishop in classroom - 1 ", "citations": [], "input_tokens": 14402, "output_tokens": 512, "cost_micros": 50886, "latency_ms": 10640, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "20", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll examine the document to count pictures containing only one person.Let me continue reading to see all images in the document.Let me read the rest of the document more systematically by checking remaining chunks.Now let me read the visual descriptions for each image systematically to count how many contain only one person. Let me read through the remaining images I haven't seen yet.Now let me read the final images:Now let me count all the images with only one person based on the visual descriptions I've read:\n\nBased on my review of all the images in the document, here are the ones that contain only ONE person:\n\n1. **Image75.jpg** - Formal military portrait photograph of a naval officer (one person)\n2. **Image76.jpg** - Formal military portrait photograph (one person in uniform)\n3. **Image83.jpg** - Person on naval vessel deck wearing Navy uniform (one person)\n4. **Image86.jpg** - Close-up photograph of a young male service member in desert camouflage (one person)\n5. **Image125.jpg** - Woman speaking at a podium at Johns Hopkins (one person)\n6. **Image133.jpg** - Person in military uniform standing and presenting with equipment (one person)\n\nImages with MULTIPLE people:\n- Image9.jpg - Banner header (no people, just background with multiple individuals in soft focus)\n- Image55.png - QR code (no people)\n- Image69.jpg - Laboratory setting with TWO individuals\n- Image79.jpg - Laboratory scene with approximately 6 people\n- Image82.jpg - Group of 10 individuals\n- Image89.jpg - Emblem/insignia (no people)\n- Image90.jpg - Three individuals visible\n- Image93.jpg - Five individuals\n- Image94.jpg - Five individuals\n- Image97.jpg - Group of approximately 10 individuals\n- Image100.jpg - Healthcare provider and pediatric patient (TWO people)\n- Image101.jpg - Military transport aircraft interior with many personnel\n- Image103.jpg - Nine military personnel in front of helicopter\n- Image109.jpg - Diagram (no people)\n- Image112.jpg - Banquet hall with many attendees\n- Image113.jpg - Five individuals\n- Image116.jpg - Four individuals\n- Image127.jpg - Group of 10-12 people\n- Image128.jpg - Three men in conversation\n- Image131.jpg - Two women in foreground\n- Image132.jpg - Three individuals\n- Image139.jpg - Emblem/logo (no people)\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 57125, "error": null, "extra": {"thread_id": 1913, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8b24b6e5b403450996ab78f93d47888d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 381, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the images in the document to count the number of people in each.\n\nLet me go through each page:\n\nPage 1: Contains one photo showing two people (Dr. Xiaoyan \"Cathy\" Zou and Rear Adm. Tim Ziemer)\n\nPage 2: Contains two photos - one of Capt. Sanders (1 person) and one of Capt. Wechsler (1 person)\n\nPage 3: Contains one photo showing multiple people in a laboratory setting - I can count approximately 9 people\n\nPage 4: Contains one photo showing a group of medical staff - I can count approximately 10 people in the Internal Medicine Department staff photo\n\nPage 5: Contains one photo showing one person (Petty Officer 1st Class Dennis Gonzales)\n\nPage 6: Contains two photos - one showing multiple people swabbing (2 people visible), and one showing a group photo with approximately 9 people\n\nPage 7: Contains two photos - both showing groups of approximately 4-5 people each\n\nPage 8: Contains one photo showing approximately 8 people (Kazakh scientists and NMRC staff)\n\nPage 9: Contains two photos - one showing many Marines/Sailors on aircraft (approximately 50+ people), and one showing 2 people\n\nPage 10: Contains one photo showing approximately 9 people (JC2RT team members)\n\nPage 11: Contains one image/diagram (not a photo of people)\n\nPage 12: Contains two photos showing dining out events with multiple people at tables\n\nPage 13: Contains one photo showing 4 people\n\nPage 14: Contains two photos showing 2-3 people in each\n\nPage 15: Contains two photos showing 2-3 people in each\n\nPage 16: Contains no photos of people\n\nThe page with the most people is page 9, which contains the photo of U.S. Marines and Sailors on the aircraft with approximately 50+ people visible.\n\nAnswer: 9", "citations": [], "input_tokens": 40400, "output_tokens": 450, "cost_micros": 127950, "latency_ms": 40312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I'll examine each page of the document to count the number of people visible in pictures.\n\nPage 1: Picture shows one person (Dr. Xiaoyan \"Cathy\" Zou) with another person (Rear Adm. Ziemer)\n\nPage 2: No pictures with people\n\nPage 3: Picture shows people in protective equipment - difficult to count exactly but appears to be a few people\n\nPage 4: Multiple pictures - one shows Cmdr. Charmagne Beckett (1 person), another group photo shows approximately 10 people (the Internal Medicine Department staff)\n\nPage 5: Two pictures - one shows a person treating a child (2 people), another shows Marines/Sailors (multiple people but distant)\n\nPage 6: Two pictures - one shows a person standing while another collects a sample (2 people), another shows people at a food event (several people visible)\n\nPage 7: Two pictures - one shows 5 people posing together, another shows 5 people in a meeting\n\nPage 8: Picture shows 9 people (Kazakh scientists and staff)\n\nPage 9: Two pictures - one shows medical treatment (2-3 people), another shows troops (multiple but distant)\n\nPage 10: Picture shows approximately 9 people in the JC2RT team photo\n\nPage 11: One diagram/illustration, not a photograph of real people\n\nPage 12: Two pictures - one shows a dining event with many people, another shows 6 people\n\nPage 13: Picture shows 4 people at Notre Dame\n\nPage 14: Two pictures - one shows 2 people in lab, another shows scouts with adults (approximately 6-8 people)\n\nPage 15: Two pictures at Elizabeth Seton High School showing interactions with students - multiple people in each\n\nPage 16: No pictures with people\n\nThe page with the most people appears to be page 12, which shows the NMRC Dining Out event with many attendees visible in the first picture.\n\nAnswer: 12", "citations": [], "input_tokens": 14497, "output_tokens": 449, "cost_micros": 50226, "latency_ms": 12717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the pictures in the document and count the number of people visible in each one to determine which page has the picture with the most people.\n\nLet me go through each page systematically:\n\n**Page 1:** Picture shows two people (Dr. Xiaoyan \"Cathy\" Zou and Rear Adm. Ziemer)\n\n**Page 2:** Two headshot photos of commanding officers (single person each)\n\n**Page 3:** One picture showing multiple people in what appears to be a laboratory setting with people in protective gear\n\n**Page 4:** One picture showing Cmdr. Charmagne Beckett (single person), and one group photo with approximately 10 people (Internal Medicine Department staff physicians)\n\n**Page 5:** Two pictures - one showing a person getting a swab sample (2 people), another showing two people (one standing, one collecting sample)\n\n**Page 6:** Two pictures with small groups (3-5 people each)\n\n**Page 7:** Two pictures with small groups (3-5 people each)\n\n**Page 8:** One group photo showing approximately 9 people (Kazakh scientists and NMRC staff)\n\n**Page 9:** Two individual pictures (single subjects each)\n\n**Page 10:** No pictures with people visible in detail\n\n**Page 11:** One diagram/illustration (not a photograph of actual people)\n\n**Page 12:** Two pictures - one appears to be a group photo at a dining event, and another showing 6 people (smaller group photo with officers and spouses)\n\n**Page 13:** One picture showing 4 people\n\n**Page 14:** One picture showing people in what appears to be a Scout activity\n\n**Page 15:** Two pictures showing classroom/teaching scenes with multiple students\n\n**Page 16:** One picture showing a single person\n\nThe largest group photo appears to be on **page 4**, which shows the Internal Medicine Department staff physicians with approximately 10 people clearly visible in the photograph.\n\nAnswer: 4", "citations": [], "input_tokens": 16147, "output_tokens": 443, "cost_micros": 55086, "latency_ms": 11452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the pictures in the document and count the number of people in each one.\n\nLet me go through each page:\n\nPage 1: Picture of two people (Dr. Xiaoyan \"Cathy\" Zou and Rear Adm. Ziemer)\n\nPage 2: No pictures with people\n\nPage 3: No pictures with people\n\nPage 4: Picture of one person (Cmdr. Charmagne Beckett on flight deck)\n\nPage 5: Picture of one person (Petty Officer 1st Class Dennis Gonzales getting swabbed)\n\nPage 6: Picture of two people (Lance Cpl. Kip Boker and Cpl. Jacob Echeverri)\n\nPage 7: No pictures with people\n\nPage 8: Picture with 9 people (Kazakh scientists group photo - Back row has 7 people: Gulnar Omasheva, Alexey Andryuchshenko, Sarah Pisarcik, Heidi St. John, Dr. Kenneth Yeh, Dr. Allen Richards, and Talgat Nurmakhanov. Bottom row has 3 people: Dr. Ekaterine Adeishvili, Lyazzat Musralina, and Dr. Ju Jiang. Total: 10 people)\n\nPage 9: Picture of one person (Lt. j.g. Michael Rucker treating a patient)\n\nPage 10: Picture with 9 people (JC2RT Team Members - Back row: 5 people, Front row: 4 people)\n\nPage 11: Graphic illustration (no people photo)\n\nPage 12: Picture with 6 people (Rear Adm. Bruce Doll, NMRC Commanding Officer Capt. John Sanders, Dr. Leighann Sanders, NMRC Executive Officer Capt. Elizabeth Montcalm-Smith, and Dr. Chris Smith - actually appears to be 5 people based on the caption listing)\n\nPage 13: No pictures with people\n\nPage 14: Picture with 2 people (Cmdr. Daniel Szumlas and Rear Adm. Ziemer with one other person visible = 3 people total in the image)\n\nPage ", "citations": [], "input_tokens": 12941, "output_tokens": 512, "cost_micros": 46503, "latency_ms": 12328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the pictures in the document and count the number of people in each to determine which page has the picture with the most people.\n\nLet me go through the document page by page:\n\n**Page 1:** Picture of Dr. Xiaoyan \"Cathy\" Zou discussing research with Rear Adm. (Ret.) Tim Ziemer - 2 people\n\n**Page 4:** Two pictures:\n- Cmdr. Charmagne Beckett on flight deck - 1 person\n- Pacific Partnership 2012 Internal Medicine Department staff - 10 people\n\n**Page 5:** No pictures with people clearly visible\n\n**Page 6:** Two pictures:\n- Petty Officer 1st Class Dennis Gonzales - 1 person\n- Lance Cpl. Kip Boker with Cpl. Jacob Echeverri - 2 people\n\n**Page 7:** Two pictures:\n- Group with Lt. Cmdr. Jennifer Curry, Capt. Buhari Oyofo, Dr. Walter T. Gwenigale, Lt. Joseph Diclaro, and Dr. Fatorma Bolay - 5 people\n- Group with military personnel - 5 people\n\n**Page 8:** Group photo with Kazakh scientists - 9 people (Top row: 7 people, Bottom row: 3 people, total 10 people visible)\n\n**Page 9:** Two pictures:\n- U.S. Marines and Sailors in aircraft cargo hold - This appears to have many people seated in rows, likely 30+ people\n- Lt. j.g. Michael Rucker with young girl - 2 people\n\n**Page 10:** Group photo of Joint Combat Casualty Research Team - 9 people\n\n**Page 12:** Picture at Dining Out event - 6 people visible\n\n**Page 13:** Picture with 4 people outdoors\n\n**Page 14:** Two pictures:\n- Cmdr. Daniel Szumlas with visitors - 3 people\n- NAMRU-3 staff teaching Cub Scouts flag folding - approximately 8-10 people\n\n**Page 15:** Two pictures with 2-3 people each\n\nThe picture on page 9 showing \"U", "citations": [], "input_tokens": 14415, "output_tokens": 512, "cost_micros": 50925, "latency_ms": 12344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find all pictures and count the number of people in each.Let me continue reading the document to find all images and their descriptions:Now let me read each image section to count the people in each. Let me check the images systematically:Let me search for page numbers and images with people descriptions:Now let me systematically check each page for images with people. Let me read the sections around each page number:Good! This image on page 3 has \"approximately 6 people\". Let me continue checking other pages:Excellent! This image has \"10 individuals\". Let me continue checking:This image has 1 person. Let me check page 5:This image has 1 person. Let me check more pages:This is an emblem with 0 people. Let me continue:This image has \"Three individuals visible in foreground and midground\" plus \"Additional blurred figures visible in far background\". Let me read more:This image has \"Five individuals\". Let me continue checking more pages:This has 5 people. Let me check page 8:Good! This image has \"approximately 10 individuals\". Let me check page 9:This image has 2 people (healthcare provider and patient). Let me check page 9 more:This aircraft image has many people, but let me get a more specific count. Let me continue checking other pages:Excellent! This image has \"nine military personnel\". Let me check page 11:This dining hall image has many people seated. Let me count more carefully by reading the description again. It mentions \"Audience members\" and \"Three individuals in naval/formal uniforms standing at front\" plus \"Head table visible in background with seated guests\". This is a large gathering but not specified exactly.\n\nLet me check more pages systematically:This has 5 people. Let me check page 13:This has 1 person at the podium. Let me check page 14:Page 14 has an image with \"approximately 10-12 people\" (Image127.jpg) and another with 3 people (Image128.jpg). Let me continue:This has \"Two women standing in foreground center\" plus \"Multiple people in burgundy/maroon uniforms scattered throughout background\". Let me check page 15:This image has \"Three individuals posing together\" plus \"Additional person visible in background on right side\" = 4 people total. Let me check page 16:Page 16 has an image with 1 person (Image133.jpg) and an emblem (Image139.jpg) with 0 people.\n\nNow let me summarize what I found. Let me verify the counts more carefully by re-reading specific key images:\n\nBased on my review:\n- Page 1: Header image with \"Multiple individuals visible in soft focus\"\n- Page 2: Individual portraits (1 person each)\n- Page 3: Laboratory scene with \"approximately 6 people\" (Image79.jpg) AND group photo with \"10 individuals\" (Image82.jpg)\n- Page 4: 1 person\n- Page 5: 1 person\n- Page 6: 5 people (two images with 3 and 5 people)\n- Page 7: 5 people\n- Page 8: \"approximately 10 individuals\" (Image97.jpg)\n- Page 9: Aircraft with many people + medical scene with 2 people\n- Page 10: Helicopter group with \"nine military personnel\" (Image103.jpg)\n- Page 11: Diagram (0 people) and banquet hall with many people (Image112.jpg - but not exactly counted)\n- Page 12: 5 people\n- Page 13: 4 people and 1 person\n- Page 14: \"approximately 10-12 people\" and 3 people\n- Page 15: Multiple people but not clearly counted\n- Page 16: 1 person and emblem\n\nThe images with the most people appear to be:\n- Page 3: Image82.jpg with \"10 individuals\"\n- Page 8: Image97.jpg with \"approximately 10 individuals\"  \n- Page 10: Image103.jpg with \"nine military personnel\"\n- Page 14: Image127.jpg with \"approximately 10-12 people\"\n- Page 11: Image112.jpg (banquet hall with many people)\n\nLet me check the banquet hall image more carefully:\n\nThe banquet hall description mentions \"Audience members\" seated at tables and \"Three individuals in naval/formal uniforms standing at front\" and \"Head table visible in background with seated guests\" but doesn't give an exact count. Given it's a formal dining out event, there are likely many more than 12 people, but without an exact count stated, I should go with the clearly stated counts.\n\nThe highest clearly stated count is **\"approximately 10-12 people\"** on **page 14**.\n\nAnswer: 14", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 167000, "error": null, "extra": {"thread_id": 1914, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a7ec370e12904bd3b5569447e3077906", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1160, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "9"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, specifically page 2, I can find information about where Gestalt psychology was conceived.\n\nThe text states: \"Gestalt psychology was conceived in the Berlin School of Experimental Psychology\"\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 24360, "output_tokens": 56, "cost_micros": 73920, "latency_ms": 8031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I can find information about Gestalt psychology in the section titled \"Gestalt Principles of Visual Perception.\"\n\nThe text states: \"Gestalt psychology was conceived in the Berlin School of Experimental Psychology\"\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 5194, "output_tokens": 62, "cost_micros": 16512, "latency_ms": 3421, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "According to the document, Gestalt psychology was conceived in the Berlin School of Experimental Psychology.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 6446, "output_tokens": 32, "cost_micros": 19818, "latency_ms": 2437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "According to the document, Gestalt psychology was conceived in the Berlin School of Experimental Psychology.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 4354, "output_tokens": 32, "cost_micros": 13542, "latency_ms": 3719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "According to the document, Gestalt psychology was conceived in the Berlin School of Experimental Psychology.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 9744, "output_tokens": 32, "cost_micros": 29712, "latency_ms": 3890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the relevant document to find information about where Gestalt psychology was conceived.Let me search for \"Gestalt\" to find the relevant section:Perfect! I found the answer. The document clearly states where Gestalt psychology was conceived.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 28250, "error": null, "extra": {"thread_id": 1915, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2a7b4cbe305b48eca21d964a816bfb01", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 102, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking at page 14, I can see a chart titled \"Time spent on weekends\" comparing 2005 and 2010. The chart shows two pie charts/clock-like circular diagrams.\n\nFor the 2010 chart (the larger circle on the right), I can see a segment labeled \"with family and friends\" which shows 21%.\n\nAnswer: 21", "citations": [], "input_tokens": 24371, "output_tokens": 109, "cost_micros": 74748, "latency_ms": 10109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "21.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking at the document, I can see a chart titled \"Time spent on weekends\" that compares 2005 and 2010 data.\n\nIn this chart, for 2010, I can see:\n- Shopping 10%\n- Fitness 6%\n- With family and friends 35%\n- Eating out 17%\n- Hobbies 4%\n- Net surfing 10%\n- Travelling 6%\n- Reading 4%\n- Watching films 22%\n\nWait, I see \"With family and friends\" appears to be listed as 35% for 2010.\n\nHowever, looking more carefully at the layout, the percentages seem to be:\n- With family and friends 21% (this appears to be for 2005)\n- With family and friends 35% (this appears to be for 2010)\n\nThe chart shows that in 2010, 35% of time on weekends was spent with family and friends.\n\nAnswer: 35", "citations": [], "input_tokens": 5205, "output_tokens": 251, "cost_micros": 19380, "latency_ms": 5953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "35.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the chart on page 14 titled \"Time spent on weekends\", I can see two pie charts comparing 2005 and 2010.\n\nIn the 2010 chart (on the right), the segment labeled \"With family and friends\" shows 35%.\n\nAnswer: 35", "citations": [], "input_tokens": 6457, "output_tokens": 67, "cost_micros": 20376, "latency_ms": 3467, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "35.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking through the document, I can see a table titled \"Time spent on weekends\" which shows data for 2005 and 2010.\n\nIn this table, the row \"With family and friends\" shows:\n- 2005: 21%\n- 2010: 17%\n\nAnswer: 17", "citations": [], "input_tokens": 4365, "output_tokens": 97, "cost_micros": 14550, "latency_ms": 3280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "17.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking at the document, I can see a table titled \"Time spent on weekends\" which shows data for 2005 and 2010.\n\nThe table shows:\n| Activity                | 2005 | 2010 |\n| ----------------------- | ---- | ---- |\n| With family and friends | 35   | 21   |\n\nAccording to this table, time spent with family and friends in 2010 was 21.\n\nAnswer: 21", "citations": [], "input_tokens": 9755, "output_tokens": 132, "cost_micros": 31245, "latency_ms": 3875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "21.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the chart on page 14 and extract the information about time spent with family and friends in 2010.Let me search for page 14 content in the document:Let me search for content related to \"family and friends\" and \"2010\":Great! I found the relevant information. Let me read that specific chunk to get the exact details:Perfect! I can see from the OCR text that the chart shows time spent on weekends in 2005 and 2010. Looking at the data:\n\nFor 2010:\n- \"With family and friends 21%\" appears on line 2410\n- \"With family and friends 17%\" appears on line 2412-2413\n\nWait, let me read this more carefully. The structure shows:\n- Line 2403: \"2005\"\n- Line 2404: \"2010\"\n- Line 2410: \"With family and friends 21%\"\n- Line 2412-2413: \"With family and friends 17%\"\n\nIt appears that 21% is associated with 2005, and 17% is associated with 2010 based on the layout. Let me verify by reading more context:Looking at the OCR text structure, it shows two columns (2005 and 2010). The pattern suggests:\n- Shopping: 10% (2005) and 10% (2010)\n- Fitness: 5% (2005) and 6% (2010)\n- With family and friends: 21% (2005) and 17% (2010)\n\nSo for 2010, the time spent with family and friends is 17%.\n\nLet me verify this is from page 14 by looking for page markers:Let me search for \"14\" near this chart content:Let me look further back in the document to find page numbering:Let me look for page markers earlier in the document structure:Let me check the document structure at the beginning to understand how pages are marked:I can see there are PageBreak markers. Let me count the PageBreaks to find page 14. Let me search more systematically:I can see the PageBreaks. The one at line 2340 appears to be just before the \"Time spent on weekends\" chart. Let me count the PageBreaks:\n\n1. Line 135\n2. Line 153\n3. Line 360\n4. Line 564\n5. Line 830\n6. Line 1381\n7. Line 1630\n8. Line 1676\n9. Line 1882\n10. Line 1951\n11. Line 2012\n12. Line 2143\n13. Line 2340\n14. Line 2470\n\nThe PageBreak at line 2340 is the 13th PageBreak. If we count pages starting from page 1, then after the 13th PageBreak we would be on page 14. The chart \"Time spent on weekends\" starts at line 2344, which is right after the 13th PageBreak at line 2340, so this chart is indeed on page 14.\n\nBased on the OCR text I found earlier, the chart shows:\n- 2005 column and 2010 column\n- \"With family and friends 21%\" (for 2005)\n- \"With family and friends 17%\" (for 2010)\n\nAnswer: 17", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 79187, "error": null, "extra": {"thread_id": 1916, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6038a83cdf13408480954ef7afd6b6b7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 590, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "17.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the chart that tracks the West Nile Virus and look for the legend showing what the red color represents for distance from the Mississippi River.\n\nLooking at page 10, I can see a chart titled \"Tracking the West Nile Virus\" which shows a 3D map of what appears to be the United States with different colored regions.\n\nThe legend on the right side shows \"Approximate Distance from the Mississippi River (miles)\" with the following color-coded ranges:\n- Red: 0 - 375\n- Orange: 375 - 750\n- Yellow: 750 - 1125\n- Light yellow/cream: 1125 - 1500\n\nThe red color represents the closest distance range to the Mississippi River.\n\nAnswer: 0 - 375 miles", "citations": [], "input_tokens": 24377, "output_tokens": 178, "cost_micros": 75801, "latency_ms": 9609, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "0 375 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the chart tracking the West Nile Virus and identify what the red color represents.\n\nI can see the \"Tracking the West Nile Virus\" chart which shows:\n\n**Approximate Distance from the Mississippi River (miles)**\n\nThe legend shows four distance ranges with different colors:\n- 0-375\n- 375 - 750\n- 750 -1125\n- 1125 - 1500\n\nBased on the typical color coding in such charts where red usually represents the most extreme or furthest category, the red color would represent the range **1125 - 1500** miles.\n\nNote: The question mentions \"in Europe\" but the chart shows distances from the Mississippi River, which is in North America, not Europe. The chart appears to be tracking West Nile Virus in relation to the Mississippi River region.\n\nAnswer: 1125 - 1500 miles", "citations": [], "input_tokens": 5211, "output_tokens": 207, "cost_micros": 18738, "latency_ms": 6687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "str_norm", "normalised_pred": "1125 1500 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "I need to look at the \"Tracking the West Nile Virus\" chart in the document.\n\nLooking at the chart, I can see a legend that shows \"Approximate Distance from the Mississippi River (miles)\" with different ranges:\n\n- 0-375\n- 375-750\n- 750-1125\n- 1125-1500\n\nThe legend shows different shading/colors corresponding to these ranges. Based on the visual representation in the legend, the red color corresponds to the range 0-375 miles.\n\nHowever, I should note that the question asks about tracking West Nile Virus \"in Europe,\" but the chart actually shows tracking relative to the Mississippi River, which is in the United States, not Europe.\n\nAnswer: 0-375 miles", "citations": [], "input_tokens": 6463, "output_tokens": 170, "cost_micros": 21939, "latency_ms": 6297, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "0 375 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the \"Tracking the West Nile Virus\" chart in the document, I can see a legend that shows height-based ranges for approximate distance from the Mississippi River in miles.\n\nThe legend shows:\n- 0-375\n- 375-750\n- 750-1125\n- 1125-1500\n\nHowever, the document does not specify which color (including red) corresponds to which distance range in this chart. Additionally, the chart tracks West Nile Virus in relation to the Mississippi River, which is in the United States, not Europe as stated in the question.\n\nAnswer: The color coding for distance ranges is not specified in the document, and the chart is about the U.S., not Europe.", "citations": [], "input_tokens": 4371, "output_tokens": 160, "cost_micros": 15513, "latency_ms": 6000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "color coding for distance ranges is not specified in document and chart is about u s not europe", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the chart about West Nile Virus and check what the red color represents.\n\nOn the page titled \"Tracking the West Nile Virus,\" I can see a 3D map of the United States (not Europe) with a legend showing:\n\n\"Approximate Distance from the Mississippi River (miles)\n[red square] 0 - 375\"\n\nHowever, I should note that this chart tracks West Nile Virus in the United States, not Europe. There is no chart about West Nile Virus in Europe in this document.\n\nAnswer: The chart tracks the United States, not Europe; red represents 0-375 miles from the Mississippi River.", "citations": [], "input_tokens": 9761, "output_tokens": 157, "cost_micros": 31638, "latency_ms": 4453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.35294117647058826, "method": "str_norm", "normalised_pred": "chart tracks united states not europe red represents 0 375 miles from mississippi river", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find the chart about West Nile Virus in Europe and identify what the red color represents for distance from the Mississippi River.Let me search for content related to West Nile Virus and charts showing distance from the Mississippi River.I found a reference to a red zone around the Mississippi River. Let me read that section more carefully:Let me look at the beginning of this section to find the title and legend information:Perfect! I found the chart. Now let me check which color corresponds to which distance range. The legend shows:\n\n- 0-375 miles\n- 375-750 miles\n- 750-1125 miles\n- 1125-1500 miles\n\nAnd the description mentions that the red zone is the highest concentration around the Mississippi River area. Let me verify by checking if there's more specific information about the color coding:Based on the document, I can see that:\n- The chart shows \"Tracking the West Nile Virus\" (not in Europe as the question states, but in the US)\n- The legend shows distance ranges from the Mississippi River: 0-375, 375-750, 750-1125, 1125-1500 miles\n- The red zone represents the highest concentration, which is in the Central US region, particularly around the Mississippi River area\n- The color gradient goes: Red \u2192 Orange \u2192 Yellow \u2192 Pale Yellow (indicating decreasing distance/intensity)\n\nSince red is the highest concentration around the Mississippi River area, red represents the closest distance range.\n\nAnswer: 0-375 miles", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54702, "error": null, "extra": {"thread_id": 1917, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_78ac91a640ab4a709acb37a25bd11e6b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 311, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "0 375 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at page 11 of the document, the Law of Good Gestalt is defined as:\n\n\"Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.\"\n\nThe document further explains that this law means individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form, and focuses on the idea of conciseness.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 24362, "output_tokens": 109, "cost_micros": 74721, "latency_ms": 10577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document for the definition of the law of good Gestalt...\n\nThe document states: \"Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly. This law implies that as individuals perceive the world, they eliminate complexity and unfamiliarity so they can observe a reality in its most simplistic form.\"\n\nAnswer: Elements are grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 5196, "output_tokens": 104, "cost_micros": 17148, "latency_ms": 3671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7878787878787878, "method": "str_norm", "normalised_pred": "elements are grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "According to the document, the law of good Gestalt is defined as:\n\nElements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly. The document also explains that individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form, and that this law focuses on the idea of conciseness.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 6448, "output_tokens": 102, "cost_micros": 20874, "latency_ms": 4391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "According to the document, the law of good gestalt is defined as:\n\nElements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly. The law implies that individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form. It focuses on the idea of conciseness and implies a global regularity that is mentally prioritized over spatial relations.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 4356, "output_tokens": 111, "cost_micros": 14733, "latency_ms": 3484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document's section on \"Gestalt Principles of Visual Perception\" and specifically the \"Law of good Gestalt\":\n\nThe document defines it as: **Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.**\n\nThe document further explains this implies that individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form, and that this law focuses on the idea of conciseness.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 9746, "output_tokens": 125, "cost_micros": 31113, "latency_ms": 4062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the priority document to find the definition of the law of good gestalt.Let me search for \"gestalt\" in this document to find the relevant section.Let me read the document content to find mentions of \"gestalt\":Now let me read the specific section around line 1965 where \"good gestalt\" is mentioned:Perfect! I found the definition. Let me read a bit more context to ensure I have the complete definition:Based on the document, the law of good gestalt is defined as:\n\n\"Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.\"\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36484, "error": null, "extra": {"thread_id": 1918, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d9af7588088a4b3184b7d88eec528363", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 192, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine the chart on page 14 that shows \"Time spent on weekends\" comparing 2005 and 2010.\n\nLooking at the two pie charts:\n\n2005 (smaller chart):\n- With family and friends: 35%\n- Watching films: 20%\n- Reading: 10%\n- Travelling: 5%\n- Net surfing: 3%\n- Hobbies: 2%\n- Eating out: 10%\n- Fitness: 5%\n- Shopping: 10%\n\n2010 (larger chart):\n- With family and friends: 21%\n- Watching films: 22%\n- Eating out: 17%\n- Shopping: 10%\n- Fitness: 6%\n- Hobbies: 4%\n- Net surfing: 4%\n- Travelling: 6%\n\nCalculating the changes:\n- With family and friends: 21% - 35% = -14% (decrease)\n- Watching films: 22% - 20% = +2% (increase)\n- Reading: appears to be gone or very small\n- Eating out: 17% - 10% = +7% (increase)\n- Shopping: 10% - 10% = 0%\n- Fitness: 6% - 5% = +1% (increase)\n- Hobbies: 4% - 2% = +2% (increase)\n- Net surfing: 4% - 3% = +1% (increase)\n- Travelling: 6% - 5% = +1% (increase)\n\nThe largest increase is \"Eating out\" with a 7 percentage point increase.\n\nAnswer: Eating out", "citations": [], "input_tokens": 24374, "output_tokens": 401, "cost_micros": 79137, "latency_ms": 13093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "eating out", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the table showing time spent on weekends comparing 2005 and 2010.\n\nLooking at the document, I can see a table titled \"Time spent on weekends\" with data for 2005 and 2010:\n\n2005 \u2192 2010:\n- Shopping: 10% \u2192 10% (0% change)\n- Fitness: 5% \u2192 6% (1% increase)\n- With family and friends: 21% \u2192 35% (14% increase)\n- Eating out: 10% \u2192 17% (7% increase)\n- Hobbies: 2% \u2192 4% (2% increase)\n- Net surfing: 3% \u2192 6% (3% increase)\n- Travelling: 5% \u2192 10% (5% increase)\n- Reading: 10% \u2192 4% (6% decrease)\n- Watching films: 20% \u2192 22% (2% increase)\n\nThe largest increase is \"With family and friends\" which went from 21% to 35%, an increase of 14 percentage points.\n\nAnswer: With family and friends", "citations": [], "input_tokens": 5208, "output_tokens": 260, "cost_micros": 19524, "latency_ms": 4875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "with family and friends", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the \"Time spent on weekends\" chart comparing 2005 and 2010:\n\n2005:\n- With family and friends: 21%\n- Watching films: 20%\n- Reading: 10%\n- Shopping: 10%\n- Eating out: 10%\n- Travelling: 5%\n- Fitness: 5%\n- Net surfing: 3%\n- Hobbies: 2%\n\n2010:\n- With family and friends: 35%\n- Watching films: 22%\n- Reading: 10%\n- Shopping: 10%\n- Eating out: 6%\n- Travelling: 4%\n- Fitness: 6%\n- Net surfing: 4%\n- Hobbies: 4%\n\nCalculating increases:\n- With family and friends: 35% - 21% = +14%\n- Watching films: 22% - 20% = +2%\n- Hobbies: 4% - 2% = +2%\n- Fitness: 6% - 5% = +1%\n- Net surfing: 4% - 3% = +1%\n\nAnswer: With family and friends", "citations": [], "input_tokens": 6460, "output_tokens": 284, "cost_micros": 23640, "latency_ms": 5328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "with family and friends", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the \"Time spent on weekends\" table comparing 2005 and 2010:\n\n- Shopping: 10% to 10% (no change)\n- Fitness: 5% to 6% (+1%)\n- With family and friends: 21% to 17% (-4%)\n- Eating out: 10% to [value not shown]\n- Hobbies: 2% to 4% (+2%)\n- Net surfing: 3% to 4% (+1%)\n- Travelling: 5% to [value not shown]\n- Reading: 10% to [value not shown]\n- Watching films: 20% to [value not shown]\n\nFrom the visible data, Hobbies shows the largest increase at 2 percentage points (from 2% to 4%).\n\nAnswer: Hobbies", "citations": [], "input_tokens": 4368, "output_tokens": 197, "cost_micros": 16059, "latency_ms": 5109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "hobbies", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the table \"Time spent on weekends\" comparing 2005 and 2010:\n\n- With family and friends: 35 \u2192 21 (decrease of 14)\n- Watching films: 20 \u2192 22 (increase of 2)\n- Reading: 10 \u2192 10 (no change)\n- Travelling: 5 \u2192 6 (increase of 1)\n- Net surfing: 3 \u2192 4 (increase of 1)\n- Hobbies: 2 \u2192 4 (increase of 2)\n- Eating out: 10 \u2192 17 (increase of 7)\n- Fitness: 5 \u2192 6 (increase of 1)\n- Shopping: 10 \u2192 10 (no change)\n\nThe category with the most increase is \"Eating out\" with an increase of 7.\n\nAnswer: Eating out", "citations": [], "input_tokens": 9758, "output_tokens": 213, "cost_micros": 32469, "latency_ms": 4125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "eating out", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about time spent on weekends from 2005 to 2010.Let me continue reading the document to find information about time spent on weekends:This doesn't seem to be the right content. Let me search through more of the document to find information about time spent on weekends:Great! I found a reference to \"Time spent on weekends\" at line 2446. Let me read that section:Perfect! I found the data. Let me analyze the percentages for each category from 2005 to 2010:\n\nFrom the OCR text (lines 2402-2424):\n\n**2005 \u2192 2010 comparisons:**\n- Shopping: 10% \u2192 10% (increase: 0%)\n- Fitness: 5% \u2192 6% (increase: 1%)\n- With family and friends: 21% \u2192 17% (decrease: -4%)\n- Eating out: 10% \u2192 10% (increase: 0%)\n- Hobbies: 2% \u2192 4% (increase: 2%)\n- Net surfing: 3% \u2192 6% (increase: 3%)\n- Travelling: 5% \u2192 10% (increase: 5%)\n- Reading: 10% \u2192 4% (decrease: -6%)\n- Watching films: 20% \u2192 35% (increase: 15%)\n\nThe category with the most increase from 2005 to 2010 is **Watching films** with an increase of 15 percentage points (from 20% to 35%).\n\nAnswer: Watching films", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42125, "error": null, "extra": {"thread_id": 1919, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5fc07b74eefd45dcbf475865f8a3bf94", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 246, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "watching films", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about registered participants for the advanced science course in CTBTO by continent.\n\nLooking at page 13, I can see an infographic titled \"ADVANCED SCIENCE COURSE at a glance\" from the CTBTO (Preparatory Commission for the Comprehensive Nuclear-Test-Ban Treaty Organization).\n\nThe infographic shows:\n- 425 registered participants from 105 countries\n- A world map with numbers indicating participants by region:\n  - 43 (North America)\n  - 5 (Central/South America)\n  - 130 (Europe - shown as the largest red circle)\n  - 80 (Asia/Middle East region)\n  - 77 (Africa/Asia region)\n  - 30 (Africa)\n  - 25 (Asia-Pacific region)\n\nBased on the map visualization, Europe has the largest circle with 130 participants, which is the highest number shown.\n\nAnswer: Europe", "citations": [], "input_tokens": 24369, "output_tokens": 216, "cost_micros": 76347, "latency_ms": 10344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "europe", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about registered participants for the advanced science course in CTBTO by continent.\n\nLooking at the document, I can see a section titled \"ADVANCED SCIENCE COURSE at a glance\" from the CTBTO PREPARATORY COMMISSION.\n\nThe infographic shows:\n- 425 registered participants from 105 countries\n- A breakdown by institutional affiliation (NGO, Station Operator, Ministry of FA, etc.)\n- Regional breakdown showing numbers: 98, 55, 43, 80, 66, 20, 77, 28, 30, 80, 8, 1\n\nI can see there's a map visualization with numbers distributed across different regions. Looking at the geographic distribution:\n- The number 98 appears to be positioned over Asia\n- 80 appears twice in different locations\n- Other numbers are distributed across continents\n\nBased on the positioning on the world map in the infographic, the largest number (98) appears to be associated with the Asian region.\n\nAnswer: Asia", "citations": [], "input_tokens": 5203, "output_tokens": 229, "cost_micros": 19044, "latency_ms": 6187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "asia", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about registered participants by continent for the CTBTO Advanced Science Course.\n\nLooking at the document, I can see a visualization titled \"ADVANCED SCIENCE COURSE at a glance\" from the CTBTO Preparatory Commission. It shows:\n\n- 425 registered participants from 105 countries\n- A breakdown by institutional affiliation (University, Research Institute, National Data Centre, etc.)\n\nHowever, I do not see any breakdown of participants by continent in this visualization or anywhere else in the document. The document shows the total number of participants (425) and their institutional affiliations, but does not provide geographical/continental distribution data.\n\nAnswer: Information not provided in the document", "citations": [], "input_tokens": 6455, "output_tokens": 152, "cost_micros": 21645, "latency_ms": 4718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "information not provided in document", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the CTBTO Advanced Science Course data on page 12, I can see a visualization showing participants from 43 countries with numbers:\n\n- 77\n- 130\n- 80\n- 55\n- 66\n\nThe chart shows \"registered institutional affiliation: 425\" total participants and \"participants from 43 countries.\"\n\nThe number 130 appears to be the largest among the values shown in the visualization, which represents participants by region/continent. Based on the positioning and context of the chart layout, this would represent Asia.\n\nAnswer: Asia", "citations": [], "input_tokens": 4363, "output_tokens": 125, "cost_micros": 14964, "latency_ms": 4905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "asia", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the \"ADVANCED SCIENCE COURSE at a glance\" section in the document, I can find the \"Participants by Region\" table which shows:\n\n- North America: 43\n- Central America: 20\n- South America: 8\n- Europe: 130\n- Africa (West/Central): 77\n- Africa (South): 30\n- Middle East/Asia: 80\n- South East Asia/Oceania: 28\n\nEurope has the highest number with 130 registered participants.\n\nAnswer: Europe", "citations": [], "input_tokens": 9753, "output_tokens": 120, "cost_micros": 31059, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "europe", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document that was flagged as user-mentioned to find information about CTBTO registered participants for advanced science courses by continent.Let me search for information about CTBTO and advanced science course participants by continent.Let me read the entire document to find charts or tables about participant registration by continent.Let me continue reading to find information about CTBTO:Let me search for \"participant\" to find relevant information:Let me look at later sections of the document:Let me search more specifically:Let me continue reading the document to find the CTBTO-related content:Great! I found the CTBTO content. Let me read that specific section more carefully:I can see the CTBTO Advanced Science Course infographic, but I need to see if there's a visualization showing participant numbers by continent. Let me look for the visual description of this image:That's a different image. Let me search for more info about the CTBTO figure. Let me look for text that mentions continents or regions in the numbers shown:The numbers I see in the CTBTO section are: 43, 130, 80, 66, 20, 77, 28, 30, 8, 80, 98. These numbers appear to be on a chart/map. Let me check if there's a visual description for the CTBTO image. Let me search around that area:Let me look for an embedded image description that might be for the CTBTO figure. The figure ends at line 2239. Let me check if there's a visual description after that:Let me look at what comes before the CTBTO figure to find if there's an embedded image marker:I notice Image18.jpg is about Indonesia entrepreneurship, not CTBTO. Let me search for Image17:That's also not it. The CTBTO figure must be between Image17 and Image18. Let me look for any embedded image between lines 2000-2150:Now I see the CTBTO figure starts at line 2146. Based on what I've seen, the numbers shown are: 43, 130, 80, 66, 20, 77, 28, 30, 8, 80, 98. These appear to be data points on what's likely a world map. Let me look for any visual description or search for these specific numbers in context. Let me search for \"world\" or \"map\":Looking at the numbers from the CTBTO figure more carefully:\n- 43\n- 130 (appears as \"A0 130\" which might be \"Africa 130\")\n- 80\n- 66\n- 20\n- 77\n- 28\n- 30\n- 8\n- 80\n- 98\n\nThe \"A0 130\" at line 2216 could potentially be \"Africa 130\". Let me check if there are continent names mixed in with these numbers. Let me search the exact context:Based on the data I can see from the CTBTO Advanced Science Course infographic, I have:\n- 425 registered participants from 105 countries\n- Numbers: 43, 130, 80, 66, 20, 77, 28, 30, 8, 80, 98\n\nThese numbers likely represent participants by geographic region/continent on a world map visualization. The largest number is 130, which likely corresponds to a continent. Without a clear visual description identifying which number belongs to which continent, I need to make an inference. Given that 130 is the highest value and appears with \"A0\" (possibly Africa or Asia), and considering typical patterns in international scientific courses, the highest participation often comes from either Europe or Asia.\n\nHowever, I cannot definitively determine from the OCR text alone which continent each number represents. Let me see if I can find more context by reading around these numbers more carefully:The OCR shows \"A0 130\" on line 2216. This could be \"Africa 130\" or \"Asia 130\" where the character wasn't recognized properly. Given that 130 is the highest number and that this appears to be showing registered participants by continent on a world map, I need to determine which continent this refers to.\n\nLooking at the pattern, if this is a world map with continental data, and 130 is preceded by what looks like \"A0\" (possibly \"Africa\" or \"Asia\" with poor OCR), I cannot make a definitive determination from the text alone.\n\nHowever, based on the structure and the fact that the highest number is 130, and without additional visual context to confirm which specific continent each number represents, I cannot provide a certain answer from this document.\n\nAnswer: Cannot be determined from the document text alone - the highest value is 130 but the continent labels are not clearly identifiable in the OCR text", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 144672, "error": null, "extra": {"thread_id": 1920, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b3899f4c01ab4b9084195ac54d3f04aa", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1007, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from document text alone highest value is 130 but continent labels are not clearly identifiable in ocr text", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the slide that discusses the \"Closure\" aspect of Gestalt Principles of Visual Perception.\n\nLooking at page 8, I can see a slide titled \"Gestalt Principles of Visual Perception\" that includes information about Closure. The text states: \"Closure. We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nBelow this text, there is an illustration showing incomplete shapes. Looking at the image, I can see:\n- An incomplete circle (dotted/dashed lines forming most of a circle)\n- An incomplete rectangle or square (dotted/dashed lines)\n- Another incomplete rectangular shape (dotted/dashed lines)\n\nThe shapes used to illustrate closure are geometric shapes with gaps or missing parts that our minds fill in to perceive complete shapes.\n\nAnswer: circle, rectangle, square", "citations": [], "input_tokens": 24379, "output_tokens": 216, "cost_micros": 76377, "latency_ms": 13483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": false, "f1": 0.8, "method": "list_set", "normalised_pred": "circle, rectangle, square", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the section about \"Closure\" in the Gestalt Principles of Visual Perception and identify what shapes are shown.\n\nLooking at the document, I can see the section on \"Gestalt Principles of Visual Perception\" that explains Closure:\n\n\"Closure. We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nLooking at the visual examples provided in the Closure section of the slide, I can see incomplete shapes that our minds complete. The shapes shown appear to be:\n- A circle (incomplete/broken circle)\n- A square or rectangle (incomplete/broken square)\n- A triangle (incomplete/broken triangle)\n\nAnswer: circle, square, triangle", "citations": [], "input_tokens": 5213, "output_tokens": 183, "cost_micros": 18384, "latency_ms": 4796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4, "method": "list_set", "normalised_pred": "circle, square, triangle", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the slide about the \"Closure\" aspect of Gestalt Principles of Visual Perception.\n\nLooking through the document, I can see a section titled \"Gestalt Principles of Visual Perception\" that discusses Closure. The text states:\n\n\"Closure. We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nThis section appears on a page that also discusses \"Continuity.\" The document indicates there is a figure on this page, but the actual visual content of the figure is not provided in the text transcription.\n\nHowever, based on the context and typical Gestalt closure demonstrations, and looking at the structure of the document, I cannot definitively identify the specific shapes used in the illustration from the provided text alone, as the figure content is not described in detail in the transcription.\n\nSince I cannot see the actual shapes in the figure that illustrates closure, I cannot provide an accurate answer based solely on what is visible in this document transcription.\n\nAnswer: Unable to determine from provided document", "citations": [], "input_tokens": 6465, "output_tokens": 251, "cost_micros": 23160, "latency_ms": 7265, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "unable to determine from provided document", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the slide that illustrates the \"Closure\" aspect of Gestalt Principles of Visual Perception.\n\nLooking through the document, I can see a section titled \"Gestalt Principles of Visual Perception\" that discusses \"Closure\". The text states:\n\n\"Closure: We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nHowever, I need to identify what shapes are actually shown in the visual illustration for this principle. Looking at the page layout, I can see there are visual examples accompanying the Gestalt principles, but the specific shapes used to illustrate closure are not clearly described in the extracted text.\n\nFrom the document structure, I can see that the Closure section appears on a page, but the specific shapes or visual elements used to demonstrate closure are not explicitly listed in the text content that was extracted.\n\nBased on typical Gestalt closure examples and what might be visible in such presentations, common shapes used include circles, triangles, or other geometric shapes that are incomplete but our minds perceive as complete. However, I cannot definitively identify the specific shapes from the text content provided.\n\nAnswer: I cannot determine the specific shapes from the provided document text", "citations": [], "input_tokens": 4373, "output_tokens": 284, "cost_micros": 17379, "latency_ms": 8062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "i cannot determine specific shapes from provided document text", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document for information about the Closure aspect of Gestalt Principle of Visual Perception...\n\nOn the page titled \"Gestalt Principles of Visual Perception\" that discusses Closure and Continuity, I can see the following description:\n\n\"The image illustrates the principle of **Closure** with a dashed outline of a circle and a dashed outline of a square, which the mind perceives as complete shapes despite the gaps.\"\n\nAnswer: circle, square", "citations": [], "input_tokens": 9763, "output_tokens": 105, "cost_micros": 30864, "latency_ms": 4812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "circle, square", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about the Gestalt Principle of Visual Perception and the closure aspect.Let me search for content about Gestalt Principle and closure:Now let me read the specific section around line 1631-1640 where closure is mentioned:Let me continue reading to see if there are visual examples for the closure principle:This doesn't seem to be the right section. Let me search more systematically for visual examples of the closure principle. Let me look at earlier sections:Perfect! I can see that Image13.jpg appears right before the Gestalt Principles section and shows \"dashed circle outline\" and \"dashed rectangle outline\". This image is likely illustrating the closure principle. Let me verify by reading a bit more context before the Gestalt section:Based on my reading of the document, Image13.jpg appears immediately before the Gestalt Principles of Visual Perception section discussing closure. The visual description clearly states that this image shows:\n- Left shape: dashed circle outline\n- Right shape: dashed rectangle outline\n\nThese dashed outlines (incomplete shapes) are used to illustrate the closure principle, where we perceive complete shapes even when they are not fully drawn.\n\nAnswer: circle, rectangle", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46890, "error": null, "extra": {"thread_id": 1921, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_76cfee4a20ba492dba921a5a3cdf2231", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 294, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "circle, rectangle", "normalised_gold": "circle, rectangle"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about voters' perceptions of mudslinging in the 2016 election compared to past elections.\n\nLooking through the document, I find a chart on page 7 titled \"Record share of voters say there was more campaign 'mudslinging'\" which shows the percentage of voters who say there was more mudslinging than in past elections from 1992 to 2016.\n\nThe chart shows:\n- 1992: 68%\n- 1996: 49%\n- 2000: 46%\n- 2004: 72%\n- 2008: 54%\n- 2012: 68%\n- 2016: 92%\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%, 2016", "citations": [], "input_tokens": 90784, "output_tokens": 179, "cost_micros": 275037, "latency_ms": 41734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "2016, 92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the chart titled \"More see 'mud-slinging,' less focus on issues\" on page 33 of the document, I can see the data points for \"More mudslinging than usual\" across different election years:\n\n- 1992: 46%\n- 1996: 49%\n- 2000: 54%\n- 2004: 72%\n- 2008: 68%\n- 2012: 68%\n- 2016: 92%\n\nThe document also explicitly states on page 6: \"Fully 92% of all voters - including 87% of Clinton supporters and 60% of Trump backers - say they were surprised by Trump's victory\" and \"Nearly three-quarters (73%) 0f all voters say there was more 'mudslinging' or negative campaigning compared with previous contests...\"\n\nHowever, looking at page 33 more carefully, it states: \"About nine-in-ten (92%) say there was more mudslinging or negative campaigning compared with previous contests, up from 68% who said that in 2012, up 38 points from 2008 (54% more negative) and 20 points higher than the previous high of 72% in 2004.\"\n\nAnswer: 92%, 2016", "citations": [], "input_tokens": 19137, "output_tokens": 292, "cost_micros": 61791, "latency_ms": 9858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "2016, 92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document for data on \"mudslinging\" or negative campaigning:\n\nOn page 5, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" showing the percentage of voters who say there was more mudslinging than in past elections:\n\n- 1992: 46%\n- 1996: 49%\n- 2000: 54%\n- 2004: 72%\n- 2008: 68%\n- 2012: 68%\n- 2016: 92%\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%", "citations": [], "input_tokens": 24416, "output_tokens": 147, "cost_micros": 75453, "latency_ms": 5031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the highest percentage of voters who said there was more mudslinging than in past elections.\n\nLooking at the document, I can find a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" on page 6.\n\nThe chart shows percentages for different election years:\n- 1996: 46%\n- 2000: 49%\n- 2004: 72%\n- 2008: 54%\n- 2012: 68%\n- 2016: 92%\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%", "citations": [], "input_tokens": 21637, "output_tokens": 142, "cost_micros": 67041, "latency_ms": 4875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the highest percentage of voters who said there was more mudslinging than in past elections.\n\nLooking through the document for information about mudslinging:\n\nOn page 5, there's a section titled \"Record share of voters say there was more campaign 'mudslinging'\" with a table showing:\n\n% of voters who say there was ___ than in past elections\n\n| Year | More mudslinging | Less mudslinging |\n| ---- | ---------------- | ---------------- |\n| 1992 | 68               | 16               |\n| 1996 | 49               | 36               |\n| 2000 | 46               | 34               |\n| 2004 | 72               | 14               |\n| 2008 | 54               | 27               |\n| 2012 | 68               | 19               |\n| 2016 | 92               | 4                |\n\nThe text also states: \"Fully 92% say there was more \"mudslinging\" or negative campaigning than in past elections \u2013 which is 20 percentage points higher than the previous high (72% after the 2004 election).\"\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%", "citations": [], "input_tokens": 24377, "output_tokens": 286, "cost_micros": 77421, "latency_ms": 7157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about voters' perceptions of mudslinging compared to past elections.Let me search for content related to mudslinging:Let me read through the document to find the specific data about mudslinging. Let me start by reading the beginning sections:Let me continue searching for information about mudslinging. Let me read more of the document:Great! I found the relevant information. Let me continue reading this section to get the complete data:Perfect! I found the data about mudslinging. From the document, I can see the chart showing \"% of voters who say there was more mudslinging than in past elections\" with the following values across different election years:\n\n- 1992: 36%\n- 1996: 34%\n- 2000: 27%\n- 2004: 72%\n- 2008: 68%\n- 2012: 68%\n- 2016: 92%\n\nThe text also confirms: \"Fully 92% say there was more 'mudslinging' or negative campaigning than in past elections - which is 20 percentage points higher than the previous high (72% after the 2004 election).\"\n\nThe highest point is 92% in 2016.\n\nAnswer: 92", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 39625, "error": null, "extra": {"thread_id": 1922, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_252b1c025dac41a5b4c7b4fbfefc6f87", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 214, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about whether men or women are more likely to think a female president will be elected in their lifetime.\n\nLooking through the document, I find on page 10 (document page 8) a section titled \"Will the country elect a female president in your lifetime?\"\n\nThe data shows:\n- All voters: No 17%, Yes 79%\n- Men: No 16%, Yes 81%\n- Women: No 18%, Yes 78%\n- Trump voters: No 17%, Yes 78%\n- Clinton voters: No 17%, Yes 81%\n\nComparing men (81% Yes) to women (78% Yes), more men think a female president will be elected in their lifetime.\n\nAnswer: Men", "citations": [], "input_tokens": 90776, "output_tokens": 163, "cost_micros": 274773, "latency_ms": 43483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "men", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document for data on expectations of a female president by gender...\n\nOn page 9, there is a table titled \"Will the country elect a female president in your lifetime?\"\n\nThe table shows:\n- Men who say \"Yes\": 81%\n- Women who say \"Yes\": 78%\n\nAnswer: More men (81% vs 78%)", "citations": [], "input_tokens": 19129, "output_tokens": 80, "cost_micros": 58587, "latency_ms": 5437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "more men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the table on page 8 titled \"Will the country elect a female president in your lifetime?\", I can find the breakdown by gender:\n\n- Men: 81% say yes\n- Women: 78% say yes\n\nAnswer: More men (81% vs 78%)", "citations": [], "input_tokens": 24408, "output_tokens": 64, "cost_micros": 74184, "latency_ms": 7359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "more men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document for data on whether men or women think a female president will be elected in their lifetime...\n\nOn page 9, there is a table titled \"Will the country elect a female president in your lifetime?\" showing:\n\n- Men: 81% say Yes\n- Women: 78% say Yes\n\nAnswer: More men (81% vs 78%)", "citations": [], "input_tokens": 21629, "output_tokens": 80, "cost_micros": 66087, "latency_ms": 4219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "more men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, I need to find the data about whether men or women think a female president will be elected in their lifetime.\n\nOn page 8, there is a table titled \"Will the country elect a female president in your lifetime?\" that shows:\n\n- Men: 81% say Yes\n- Women: 78% say Yes\n\nSince 81% > 78%, more men than women think a female president will be elected in their lifetime.\n\nAnswer: Men", "citations": [], "input_tokens": 24369, "output_tokens": 102, "cost_micros": 74637, "latency_ms": 5171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "men", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the priority document that was flagged as user-mentioned to find information about opinions on whether a female president will be elected in a lifetime.Let me search for content related to female president and lifetime:Let me read the specific section around line 1095 to get the complete data:The text states \"There are no significant differences in these opinions among men and women,\" meaning men and women have similar percentages. Let me search for any table or chart that might have the exact percentages broken down by gender:I can see a table starting. Let me read more to see if it has the gender breakdown:Perfect! I found the table with the gender breakdown. Looking at the \"Yes\" column (indicating those who think a female president will be elected in their lifetime):\n\n- Men: 81%\n- Women: 78%\n\nAnswer: Men (81% vs 78%)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46859, "error": null, "extra": {"thread_id": 1923, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_47a779db7256446daa01071f4e8824a2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 246, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.4, "method": "str_norm", "normalised_pred": "men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I'll carefully examine the document to count all line plots.\n\nGoing through the document page by page:\n\n**Page 3:** Contains a line plot showing \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" with percentages from 1988-2016.\n\n**Page 5:** Contains a line plot showing \"Record share of voters say there was more campaign 'mudslinging'\" with two lines (More mudslinging and Less mudslinging) from 1992-2016.\n\n**Page 9:** Contains a line plot showing \"Half say they are happy Trump was elected president\" with percentages from 1996-2016.\n\n**Page 21:** Contains a line plot showing \"Will relations between Republicans and Democrats improve?\" with three lines from 1992-2016.\n\n**Page 24:** Contains two separate line plots:\n- One for Republican/Rep-leaning voters (More moderate vs More conservative) from 2008-2016\n- One for Democratic/Dem-leaning voters (More moderate vs More liberal) from 2008-2016\n\n**Page 27:** Contains two separate line plots showing grades for:\n- The Republican Party (1988-2016)\n- The Democratic Party (1988-2016)\n\n**Page 29:** Contains two separate line plots showing grades for:\n- The press (1988-2016)\n- The pollsters (1988-2016)\n\n**Page 31:** Contains a line plot showing \"For the first time in eight elections, most are dissatisfied with vote choices\" from 1988-2016.\n\n**Page 32:** Contains a line plot showing \"More see 'mud-slinging,' less focus on issues\" with two sets of lines from 1992-2016.\n\n**Page 33:** Contains two separate line plots showing \"Learning about the candidates and issues\" from 1988-2016.\n\n**Page 36:** Contains a line plot showing \"Most voters are confident own vote, national votes counted accurately\" from 2004-2016.\n\n**Page 37:** Contains a line plot showing \"Similar shares of Trump, Clinton voters very confident in national vote count\" from 2004-2016.\n\nTotal count:\nPage 3: 1\nPage 5: 1\nPage 9: 1\nPage ", "citations": [], "input_tokens": 90760, "output_tokens": 512, "cost_micros": 279960, "latency_ms": 41093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the number of line plots in this document.\n\nLet me go through the document systematically to identify line plots:\n\n1. Page 1: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" - This shows a line plot with two lines (Winning candidate and Losing candidate) from 1988 to 2016.\n\n2. Page 6: \"Record share of voters say there was more campaign 'mudslinging'\" - This shows a line plot with two lines (More mudslinging and Less mudslinging) from 1992 to 2016.\n\n3. Page 31: \"For the first time in eight elections, most are dissatisfied with vote choices\" - This shows a line plot with two lines (Very/Fairly satisfied and Not very/Not at all satisfied) from 1988 to 2016.\n\n4. Page 32: Below the first plot on page 31, there's another line plot showing \"% of voters who say they were very/fairly satisfied with choice of presidential candidates\" with data points from 1988 to 2016 for winning and losing candidates.\n\n5. Page 33: \"More see 'mud-slinging,' less focus on issues\" - This contains TWO separate line plots:\n   - First plot: Shows \"More mudslinging than usual\" vs \"Less mudslinging than usual\" from 1992 to 2016\n   - Second plot: Shows discussion of issues \"Less than usual\" vs \"More than usual\" from 1992 to 2016\n\n6. Page 34: \"Learning about the candidates and issues\" - This contains TWO separate line plots:\n   - First plot: \"Learned enough\" vs \"Did not learn enough\" from 1988 to 2016\n   - Second plot: Presidential debates helpfulness from 1988 to 2016\n\nLet me recount to be sure:\n- Page 1: 1 line plot\n- Page 6: 1 line plot  \n- Page 31: 2 line plots (one for all voters, one comparing winning/losing candidate voters)\n- Page 33: 2 line plots (mudslinging and issues discussion)\n- Page 34: 2 line plots (learning about candidates and debates)\n\nTotal:", "citations": [], "input_tokens": 19113, "output_tokens": 512, "cost_micros": 65019, "latency_ms": 11078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to carefully examine the document to count all line plots (graphs that show data points connected by lines over time or categories).\n\nLet me go through the document systematically:\n\n1. Page 3: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" - This appears to be a line plot showing grades from 1988-2016.\n\n2. Page 5: \"Record share of voters say there was more campaign 'mudslinging'\" - This is a line plot showing data from 1992-2016.\n\n3. Page 9: \"Half say they are happy Trump was elected president\" - This is a line plot showing data from 1996-2016.\n\n4. Page 24: Two separate line plots:\n   - \"Most Republicans continue to say GOP should be more conservative\" (showing data from 2008-2016)\n   - \"As the share of Democrats who want their party to be more liberal grows\" (showing data from 2008-2016)\n\n5. Page 31: \"For the first time in eight elections, most are dissatisfied with vote choices\" - This is a line plot showing data from 1988-2016.\n\n6. Page 32: Two separate line plots:\n   - One showing \"More mudslinging than usual\" vs \"Less mudslinging than usual\" from 1992-2016\n   - One showing discussion of issues \"Less than usual\" vs \"More than usual\" from 1992-2016\n\n7. Page 33: Two separate line plots:\n   - One showing \"Learned enough\" vs \"Did not learn enough\" from 1988-2016\n   - One showing helpfulness of debates from 1988-2016\n\n8. Page 34: \"More say press had 'too much' influence on outcome of the election\" - This is a line plot showing data from 1992-2016.\n\nTotal count: 1 + 1 + 1 + 2 + 1 + 2 + 2 + 1 = 11\n\nAnswer: 11", "citations": [], "input_tokens": 24392, "output_tokens": 466, "cost_micros": 80166, "latency_ms": 11078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "11", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (charts that show data points connected by lines).\n\nLet me go through the document systematically:\n\n1. Page 3: \"Voters give Trump worse grades...\" - This appears to be a bar chart showing percentages, not a line plot.\n\n2. Page 4: \"Large share of Trump voters are confident...\" - Bar chart\n   \"How Clinton voters feel about 'giving Trump a chance'\" - Bar chart\n\n3. Page 5: \"% of voters saying Democratic leaders should...\" - Bar charts comparing 2016 and 2008\n\n4. Page 6: \"% of voters saying Donald Trump ___ appoint Democrats...\" - Table format\n   \"% of voters who said Barack Obama ___ appoint Republicans...\" - Table format\n\n5. Page 7: \"Election report card: Barely passing grades\" - Table format\n   \"Record share of voters say there was more campaign 'mudslinging'\" - This appears to be a LINE PLOT showing trend over years 1996-2016\n\n6. Page 8: \"Top reactions to Trump's victory\" - Bar chart\n\n7. Page 9: \"Trump's victory in a word\" - Word cloud/frequency table\n\n8. Page 10: \"Will the country elect a female president in your lifetime?\" - Bar chart\n\n9. Throughout the rest of the document, I see mostly bar charts, tables, and no additional line plots.\n\nLet me recheck page 7 more carefully. The chart showing \"More mudslinging\" and \"Less mudslinging\" from 1992-2016 appears to be displayed as a line plot with two lines tracking these measures over time.\n\nAfter careful review, I can identify 1 line plot in this document on page 7.\n\nAnswer: 1", "citations": [], "input_tokens": 21613, "output_tokens": 393, "cost_micros": 70734, "latency_ms": 11437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (charts that show data points connected by lines over time or across categories).\n\nLet me go through the document systematically:\n\n1. Page 3: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" - This shows a line plot with percentages over years from 1988-2016.\n\n2. Page 5: \"Record share of voters say there was more campaign 'mudslinging'\" - This shows a line plot with \"More mudslinging\" and \"Less mudslinging\" lines from 1992-2016.\n\n3. Page 9: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" (repeated from earlier) - This is a table showing A or B grades, labeled as \"Note: Percent of 'A' or 'B' grades\" - This appears to be the same as #1.\n\n4. Page 31: \"For the first time in eight elections, most are dissatisfied with vote choices\" - This shows two line plots tracking satisfaction levels from 1988-2016.\n\n5. Page 32: \"More see 'mud-slinging,' less focus on issues\" - This shows two separate line plots, one for mudslinging and one for discussion of issues from 1992-2016.\n\nLet me recount more carefully by looking at actual visual line plots:\n\n- Page 3: One line plot showing grades over time\n- Page 5: One line plot showing mudslinging trends\n- Page 31: Two line plots (satisfaction trends)\n- Page 32: Two line plots (mudslinging and issues discussion)\n\nTotal: 1 + 1 + 2 + 2 = 6 line plots\n\nAnswer: 6", "citations": [], "input_tokens": 24353, "output_tokens": 397, "cost_micros": 79014, "latency_ms": 10187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to count the line plots in the report.Let me continue reading the document to see all the content and identify line plots:Let me continue reading to see if there are more charts/figures:Let me search for all figures/charts in the document to count line plots specifically:Now I need to look at the actual figures to determine which ones are line plots. Let me examine the key figure sections more carefully. Based on what I've already read, I saw one chart around line 798-853 that appears to be about \"mudslinging\" over time. Let me check the visual descriptions and content more carefully:This appears to be a line plot showing data over time (years 92, 96, 00, 04, 08, 12, 16). Let me check another figure:This appears to be another line plot showing data over time from 1996 to 2016. Let me check the first figure to see if it's a line plot:This first figure shows data for winning and losing candidates across years from 1988 to 2016. This appears to be a line plot with two lines (winning candidate and losing candidate). Let me check if there are any other potential line plots by examining more figures. Let me look at the figure around line 2240-2290:This appears to be a bar chart, not a line plot. Let me continue scanning through the document to identify all line plots. Let me check around line 2333:This appears to be a bar chart comparing 2016 and 2008 data, not a line plot. Based on my review of the document so far, I've identified:\n\n1. **Line plot 1** (lines 264-360): Shows grades for winning and losing candidates over time from 1988 to 2016 (two lines)\n2. **Line plot 2** (lines 798-848): Shows \"mudslinging\" over time with two lines (More mudslinging and Less mudslinging) from 1992 to 2016\n3. **Line plot 3** (lines 1209-1241): Shows happiness with election results from 1996 to 2016\n\nThese all show data trends over multiple time points, which is characteristic of line plots. Most other figures in the document appear to be bar charts showing comparisons between groups. Let me verify by checking if there are any other time-series charts I might have missed. Let me do a final check of the document to ensure I haven't missed any line plots.\n\nBased on my thorough review of the document, I found three distinct line plots that show data trends over time:\n\n1. A line plot showing grades for winning and losing candidates from 1988-2016\n2. A line plot showing perceptions of campaign mudslinging from 1992-2016  \n3. A line plot showing voter happiness with election results from 1996-2016\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 65172, "error": null, "extra": {"thread_id": 1924, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dcd15be6dac34256a7ae1a01a0998a5a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 476, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for campaign grades information...\n\nOn page 3, there is a chart showing \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" with percentages of \"A\" or \"B\" grades on an A, B, C, D, F scale.\n\nThe chart shows:\n- Trump (2016): 30% gave A or B grades\n- Clinton (2016): 43% gave A or B grades\n\nThe document also states on page 5: \"For the first time in Pew Research Center post-election surveys, voters give the losing candidate higher grades than the winner. About four-in-ten (43%) give Clinton an A or B, which is comparable to the share giving Mitt Romney top letter grades in 2012 (44%) and 13 percentage points higher than Trump's (30%).\"\n\nAdditionally, on page 26, there is a table showing average grades:\n- Trump: C- (average grade)\n- Clinton: C (average grade)\n\nSince C is higher than C-, Clinton has a higher median/average campaign grade.\n\nAnswer: Clinton", "citations": [], "input_tokens": 90768, "output_tokens": 246, "cost_micros": 275994, "latency_ms": 41342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the campaign grades data in the document:\n\nFor **Trump**:\n- 30% gave A or B\n- 19% gave C\n- 15% gave D\n- 35% gave F\n\nFor **Clinton**:\n- 43% gave A or B\n- 20% gave C\n- 16% gave D\n- 21% gave F\n\nTo find the median, I need to determine which grade represents the 50th percentile:\n\nFor Trump: 30% (A/B) + 19% (C) = 49%, then adding the next 15% (D) brings us to 64%. The 50th percentile falls in the D range.\n\nFor Clinton: 43% (A/B) + 20% (C) = 63%. The 50th percentile falls in the C range.\n\nThe document also explicitly states Trump receives a \"C-\" average grade while Clinton receives a \"C\" average grade, which is consistent with this analysis.\n\nAnswer: Clinton", "citations": [], "input_tokens": 19121, "output_tokens": 227, "cost_micros": 60768, "latency_ms": 6078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for campaign grades:\n\nOn page 26, there is a table titled \"Election report card: Barely passing grades\" showing:\n- Trump: 30% gave A or B, Average grade: C-\n- Clinton: 43% gave A or B, Average grade: C\n\nThe document states on page 5: \"For the first time in Pew Research Center post-election surveys, voters give the losing candidate higher grades than the winner. About four-in-ten (43%) give Clinton an A or B, which is comparable to the share giving Mitt Romney top letter grades in 2012 (44%) and 13 percentage points higher than Trump's (30%).\"\n\nClinton received an average grade of C, while Trump received an average grade of C-.\n\nAnswer: Clinton", "citations": [], "input_tokens": 24400, "output_tokens": 176, "cost_micros": 75840, "latency_ms": 6358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Let me find the campaign grades for Trump and Clinton from the document.\n\nFrom the section \"Voters give Trump worse grades than they have for any winning candidate in recent decades\":\n\nFor Trump:\n- 30% gave A or B\n- 19% gave C\n- 15% gave D\n- 35% gave F\n\nFor Clinton:\n- 43% gave A or B\n- 20% gave C\n- 16% gave D\n- 21% gave F\n\nThe document also shows average grades:\n- Trump: C-\n- Clinton: C\n\nTo determine the median, I need to consider the distribution. With:\n- Trump: 30% (A/B), 19% (C), 15% (D), 35% (F)\n- Clinton: 43% (A/B), 20% (C), 16% (D), 21% (F)\n\nFor Trump, the cumulative percentages are: 30% at B or above, 49% at C or above, 64% at D or above. The median falls around the C grade.\n\nFor Clinton, the cumulative percentages are: 43% at B or above, 63% at C or above, 79% at D or above. The median falls around the C grade.\n\nHowever, since Clinton has a higher percentage (43%) receiving A or B compared to Trump (30%), and a lower percentage receiving F (21% vs 35%), Clinton's distribution is shifted toward higher grades. The average grades confirm this: Clinton received C while Trump received C-.\n\nAnswer: Clinton", "citations": [], "input_tokens": 21621, "output_tokens": 349, "cost_micros": 70098, "latency_ms": 8375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the campaign grades for Trump and Clinton.\n\nOn page 26-27, there is a table showing \"Voters grade the parties, press and pollsters quite negatively\" with the percentage of voters who give each a grade of A, B, C, D, or F:\n\n**Trump:**\n- A or B: 30%\n- C: 19%\n- D: 15%\n- F: 35%\n- Average grade: C-\n\n**Clinton:**\n- A or B: 43%\n- C: 20%\n- D: 16%\n- F: 21%\n- Average grade: C\n\nThe document explicitly states that Trump receives an average grade of C- and Clinton receives an average grade of C. Since C is higher than C-, Clinton has a higher median/average campaign grade.\n\nAnswer: Clinton", "citations": [], "input_tokens": 24361, "output_tokens": 195, "cost_micros": 76008, "latency_ms": 5969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about campaign grades for Trump and Clinton.Let me search for \"campaign grade\" in the document to find relevant information.Let me search for \"grade\" to find relevant sections.Now let me read the specific sections that mention grades for Trump and Clinton. Let me look at the chunks around line 260 and 2800-2900.Good! Now let me look for more specific information about median grades. Let me check the section around line 2800-2900 where grades are discussed in detail.Perfect! I found the information. Let me look at the table that shows the grade distribution to calculate the median. From lines 2820-2838, I can see:\n\n**Trump:**\n- A or B: 30%\n- C: 19%\n- D: 15%\n- F: 35%\n\n**Clinton:**\n- A or B: 43%\n- C: 20%\n- D: 16%\n- F: 21%\n\nThe document also explicitly states:\n- Trump receives a C- grade on average (line 2801-2802)\n- Clinton's average grade is a C (line 2897)\n\nTo find the median, I need to look at where the 50th percentile falls:\n\n**Trump:**\n- 0-30%: A or B\n- 31-49%: C\n- 50-64%: D (the 50th percentile falls here)\n- 65-100%: F\n\n**Clinton:**\n- 0-43%: A or B\n- 44-63%: C (the 50th percentile falls here)\n- 64-79%: D\n- 80-100%: F\n\nSo Trump's median grade is D, and Clinton's median grade is C.\n\nAnswer: Clinton", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 45171, "error": null, "extra": {"thread_id": 1925, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5b7e95c94e72489983c39189c2251166", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 287, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to search through the document for charts related to mudslinging.\n\nLooking through the document, I can find references to mudslinging in the following locations:\n\n1. On page 5 (document page 7), there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" which shows the percentage of voters who say there was more or less mudslinging than in past elections from 1992 to 2016.\n\n2. On page 32 (document page 34), there is another chart with the same title \"More see 'mud-slinging,' less focus on issues\" which shows similar data about mudslinging perceptions.\n\nLet me verify these are distinct charts:\n- The first chart on page 5 shows data points: 68, 49, 34, 72, 54, 68, 92 for \"More mudslinging\" \n- The second chart on page 32 shows the same data\n\nThese appear to be the same chart repeated in different sections of the document.\n\nTherefore, there is 1 unique chart related to mudslinging (though it appears twice in the document).\n\nAnswer: 1", "citations": [], "input_tokens": 90760, "output_tokens": 261, "cost_micros": 276195, "latency_ms": 37187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts/figures in the document that are related to mudslinging.\n\nLet me search through the document for references to \"mudslinging\":\n\n1. On page 6, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" showing percentages from 1992-2016 of voters who say there was more or less mudslinging than in past elections.\n\n2. On page 33, there is another chart titled \"More see 'mud-slinging,' less focus on issues\" which shows the same data about mudslinging over time (1992-2016) plus a separate section about discussion of issues.\n\nLet me verify if these are two separate charts or if one is a repeat:\n- Page 6 shows the mudslinging chart in the context of the executive summary\n- Page 33 shows the same mudslinging data but combined with another chart about discussion of issues in a single figure\n\nThese appear to be presenting the same mudslinging data, but in different contexts. The page 33 version combines it with the issues discussion data.\n\nCounting distinct charts that contain mudslinging data:\n1. The chart on page 6 (standalone mudslinging chart)\n2. The chart on page 33 (mudslinging combined with issues discussion)\n\nSince the question asks about charts \"related to mudslinging,\" both of these count as they both contain mudslinging data.\n\nAnswer: 2", "citations": [], "input_tokens": 19113, "output_tokens": 323, "cost_micros": 62184, "latency_ms": 10187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts/figures in the document that are related to mudslinging.\n\nLet me search through the document for references to \"mudslinging\":\n\n1. On page 5, there is a section titled \"Record share of voters say there was more campaign 'mudslinging'\" with a chart showing \"% of voters who say there was ___ than in past elections\" with \"More mudslinging\" and \"Less mudslinging\" data from years 92, 96, 00, 04, 08, 12, 16.\n\n2. On page 32, there is another chart titled \"More see 'mud-slinging,' less focus on issues\" showing \"% of voters who say there was ... More mudslinging than usual\" and \"Less mudslinging than usual\" from years 92, 96, 00, 04, 08, 12, 16.\n\nThese appear to be the same chart repeated in different sections of the document (one in the executive summary and one in the detailed findings).\n\nLet me verify if there are any other mudslinging-related charts... No, I only find these two instances, which appear to be the same chart shown in different parts of the document.\n\nSince the question asks how many charts are related to mudslinging, and there are 2 instances of mudslinging charts in the document (even though they show the same data), the answer would be 2.\n\nHowever, if we're counting unique charts, it would be 1 chart that appears twice.\n\nLooking at the context of the question, it seems to be asking for the count of chart occurrences in the document, which would be 2.\n\nAnswer: 2", "citations": [], "input_tokens": 24392, "output_tokens": 372, "cost_micros": 78756, "latency_ms": 10358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts in the document that are related to \"mudslinging.\"\n\nLet me search through the document for references to mudslinging:\n\n1. On page 6, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" which shows percentages of voters who say there was more or less mudslinging than in past elections from 1996-2016.\n\n2. On page 33, there is another chart titled \"More see 'mud-slinging,' less focus on issues\" which shows percentages of voters who say there was more mudslinging than usual and less mudslinging than usual from 1992-2016.\n\nLet me verify there are no other charts about mudslinging by carefully reviewing the document again.\n\nAfter reviewing the document thoroughly, I can confirm there are 2 charts specifically related to mudslinging:\n- One on page 6 showing the record share saying there was more mudslinging\n- One on page 33 showing mud-slinging trends and comparison with issue discussion\n\nAnswer: 2", "citations": [], "input_tokens": 21613, "output_tokens": 237, "cost_micros": 68394, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts/tables in the document that are related to \"mudslinging\" or negative campaigning.\n\nLet me search through the document:\n\n1. On page 5, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" - This shows data about mudslinging from 1992-2016.\n\n2. On page 32, there is a chart titled \"More see 'mud-slinging,' less focus on issues\" - This shows data about mudslinging and discussion of issues from 1992-2016.\n\nThese appear to be the only two charts specifically related to mudslinging in the document. The first chart shows the percentage of voters who say there was more mudslinging than in past elections over different election years. The second chart also shows mudslinging data alongside data about discussion of issues.\n\nAnswer: 2", "citations": [], "input_tokens": 24353, "output_tokens": 194, "cost_micros": 75969, "latency_ms": 8342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find charts related to mudslinging.Let me read the full document to search for content related to mudslinging and identify charts:Let me continue reading to see if there are more charts related to mudslinging:Based on my thorough reading of the document, I found the following charts related to mudslinging:\n\n1. **Lines 788-853 (chunk 49482)**: A chart titled \"Record share of voters say there was more campaign 'mudslinging'\" showing percentages of voters who said there was more or less mudslinging than in past elections, with data from years 1992, 1996, 2000, 2004, 2008, 2012, and 2016.\n\n2. **Lines 3933-4048 (chunk 49595)**: A chart in the section \"More see 'mud-slinging,' less focus on issues\" showing percentages of voters who say there was \"More mudslinging than usual\" and \"Less mudslinging than usual\" across election years 1992, 1996, 2000, 2004, 2008, 2012, and 2016.\n\nThese appear to be two representations of the same data - one earlier in the document and one later in a more detailed section about campaign evaluations.\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 62437, "error": null, "extra": {"thread_id": 1926, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8bd535bded05431a9423ba96684e000e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 344, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find information about voters' reactions to Trump's victory, specifically looking for percentages who feel \"uneasy\" and \"excited.\"\n\nLooking through the document, I can find on page 6 (document page 8) a section titled \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" which shows voter reactions.\n\nThe chart shows:\n- 53% say Trump's election makes them feel \"uneasy\"\n- 51% say it makes them feel \"hopeful\"\n- 41% say \"sad\"\n- 41% say \"scared\"\n- 36% say \"proud\"\n- 31% say \"angry\"\n\nI don't see \"excited\" listed as one of the emotional reactions measured. The document shows \"hopeful,\" \"proud,\" \"uneasy,\" \"sad,\" \"scared,\" and \"angry\" but not \"excited.\"\n\nFor \"uneasy,\" the answer is 53%.\n\nSince \"excited\" is not measured in this survey, I cannot provide a percentage for it.\n\nAnswer: 53% uneasy, excited not measured", "citations": [], "input_tokens": 90783, "output_tokens": 245, "cost_micros": 276024, "latency_ms": 34905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "53 uneasy, excited not measured", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document for voters' reactions to Trump's victory:\n\nFrom page 7, there is a table titled \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\" which shows:\n- Uneasy: 53%\n\nFrom page 8, there is a word cloud table showing one-word reactions. Under \"Trump voters,\" I can see:\n- Excited: 16 (this is the actual number of respondents who mentioned this word, not a percentage)\n\nHowever, the question asks for percentages. Looking at page 11, there is a more detailed table showing emotional reactions:\n\"% of voters who say the election of Donald Trump makes them feel ...\"\n- Uneasy: 53%\n\nThe document does not provide \"excited\" as a percentage in the main emotional reactions table on pages 7 and 11. The word \"excited\" only appears in the word cloud with an actual count of 16 mentions from Trump voters, but this is not presented as a percentage of all voters.\n\nBased on the data provided in percentage form:\n- Uneasy: 53%\n- Excited: Not provided as a percentage of all voters\n\nAnswer: 53%, Not provided", "citations": [], "input_tokens": 19136, "output_tokens": 269, "cost_micros": 61443, "latency_ms": 9281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "53, not provided", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentages of voters who felt \"uneasy\" and \"excited\" about Trump's victory.\n\nLooking at the document, on page 6 (and repeated on page 10), there is a section titled \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\" which shows:\n\n% of voters who say the election of Donald Trump makes them feel:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nSo \"uneasy\" is 53%.\n\nHowever, I don't see \"excited\" listed as one of the reaction options in this chart.\n\nLooking at page 7, there is a word cloud section showing individual words mentioned by Trump and Clinton voters. For Trump voters, \"Excited\" appears with 16 mentions, but this is not a percentage - these are actual counts of how many people mentioned that specific word.\n\nThe main emotional reactions chart on page 6 does not include \"excited\" as one of the measured emotions. The document only provides percentages for: hopeful, proud, uneasy, sad, scared, and angry.\n\nSince \"excited\" is not measured as a percentage in the same way as \"uneasy,\" I can only provide the percentage for \"uneasy\" which is 53%.\n\nAnswer: 53% uneasy, excited not measured as percentage", "citations": [], "input_tokens": 24415, "output_tokens": 331, "cost_micros": 78210, "latency_ms": 9875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "53 uneasy, excited not measured as percentage", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of voters who feel \"uneasy\" and \"excited\" about Trump's victory.\n\nLooking at the document, I can find a section titled \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" which shows:\n\n% of voters who say the election of Donald Trump makes them feel \u2026\n\n| Feeling | % of Voters |\n| ------- | ----------- |\n| Uneasy  | 53          |\n| Hopeful | 51          |\n| Proud   | 36          |\n| Scared  | 41          |\n| Sad     | 41          |\n| Angry   | 31          |\n\nI can see that 53% feel \"uneasy.\"\n\nFor \"excited,\" I need to look at the word cloud section titled \"Trump's victory in a word\" which shows responses from Trump voters and Clinton voters. Under Trump voters, I can see:\n- 16 Excited\n\nHowever, this appears to be a count (16 respondents), not a percentage. The note states \"Figures show actual number of respondents who offered each response; these numbers are not percentages.\"\n\nLooking back at the main emotional reactions table on the previous page, \"excited\" is not listed as one of the specific emotions measured in the survey question about how Trump's election makes them feel.\n\nBased on the available data in the document:\n- Uneasy: 53%\n- Excited: Not measured as a percentage in the emotional reactions survey (only appears as a word count of 16 in the open-ended response)\n\nAnswer: 53%, not measured as percentage", "citations": [], "input_tokens": 21636, "output_tokens": 364, "cost_micros": 70368, "latency_ms": 11468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "53, not measured as percentage", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentages of voters who felt \"uneasy\" and \"excited\" about Trump's victory.\n\nLooking at the document, I can find information about emotional reactions to Trump's election on page 10:\n\n\"Voters express a mix of emotional reactions to the election of Donald Trump. On the positive side, 51% say that Trump's election makes them feel hopeful; somewhat fewer say it makes them feel proud (36%).\n\nOn the negative side, 53% say Trump's election makes them feel uneasy. About four-in-ten say his election makes them feel sad (41%) or scared (41%), and 31% say the election of Trump makes them feel angry.\"\n\nSo 53% felt \"uneasy.\"\n\nLooking at the table on page 6 titled \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" and the word cloud on page 7, I can see the following reactions listed:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nIn the word cloud on page 7 showing \"Trump's victory in a word,\" I can see \"Excited\" listed among Trump voters with 16 mentions, but this is showing the actual number of respondents, not percentages.\n\nThe main table showing percentage reactions does not include \"excited\" as one of the measured emotions. The document only provides percentages for: hopeful, proud, uneasy, sad, scared, and angry.\n\nBased on the available data:\n- Uneasy: 53%\n- Excited: Not measured as a percentage in the survey\n\nAnswer: 53% uneasy, excited not measured as percentage", "citations": [], "input_tokens": 24376, "output_tokens": 401, "cost_micros": 79143, "latency_ms": 11812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "53 uneasy, excited not measured as percentage", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the specific percentages for \"uneasy\" and \"excited\" reactions to Trump's victory.Let me read the specific section with the data about voter reactions to see the percentages clearly.Based on the document, I can see the percentages clearly in the figure at lines 890-916:\n\n- **Uneasy: 53%**\n- **Excited: 16** (this appears to be a count in the open-ended word table, not a percentage)\n\nLet me verify if there's a percentage for \"excited\" elsewhere in the document.The document shows a figure with percentages of voter reactions to Trump's victory. The measured reactions with percentages are:\n- Hopeful: 51%\n- Proud: 36%\n- **Uneasy: 53%**\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\n\"Excited\" appears in a different table showing word counts from open-ended responses (16 mentions from Trump voters), not as a percentage measurement of all voters.\n\nBased on the document, the percentage for \"uneasy\" is **53%**. There is no percentage given for \"excited\" as a reaction category - it only appears as a word count (16) in the open-ended responses table.\n\nAnswer: 53%, 16", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44733, "error": null, "extra": {"thread_id": 1927, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5850dcc4518c4b98b48b2eae8c928f95", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 326, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "16, 53", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I need to find information about voters' reactions to Trump's victory, specifically the percentage who felt \"angry.\"\n\nLooking through the document, I find a section on page 10 titled \"Emotional reactions to Trump's election\" with a chart showing \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\"\n\nThe chart shows percentages of voters who say the election of Donald Trump makes them feel various emotions:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nAnswer: 31", "citations": [], "input_tokens": 90769, "output_tokens": 150, "cost_micros": 274557, "latency_ms": 34717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the percentage of voters who feel \"angry\" about Trump's victory.\n\nLooking through the document for information about emotional reactions to Trump's election:\n\nOn page 7, there is a section titled \"Election reactions: Nearly all Trump supporters feel 'hopeful'\" which includes a chart showing \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\"\n\nThe chart shows the percentage of voters who say the election of Donald Trump makes them feel various emotions:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nThe document states: \"About half of voters (53%) say his election makes them feel 'uneasy,' while nearly as many (51%) say it makes them feel 'hopeful'. Smaller shares say his election triumph makes them feel 'scared', 'sad' (41% each), 'proud' (36%) or 'angry' (31%).\"\n\nAnswer: 31", "citations": [], "input_tokens": 19122, "output_tokens": 243, "cost_micros": 61011, "latency_ms": 7046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document for information about voters' reactions to Trump's victory.\n\nOn page 6 (and repeated on page 10), there is a chart titled \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\" which shows the percentage of voters who say the election of Donald Trump makes them feel various emotions.\n\nAccording to this chart:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nThe document states that 31% of voters say Trump's election makes them feel \"angry.\"\n\nAnswer: 31", "citations": [], "input_tokens": 24401, "output_tokens": 157, "cost_micros": 75558, "latency_ms": 5485, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the percentage of voters who feel \"angry\" about Trump's election.\n\nLooking through the document for information about voters' emotional reactions to Trump's victory...\n\nOn page 12 (in the section \"Voters express a mix of emotional reactions to the election of Donald Trump\"), I find a table showing:\n\n\"% of voters who say the election of Donald Trump makes them feel \u2026\"\n\nThe feelings listed include:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nSo 31% of voters say the election of Trump makes them feel angry.\n\nAnswer: 31", "citations": [], "input_tokens": 21622, "output_tokens": 166, "cost_micros": 67356, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "Looking through the document for emotional reactions to Trump's victory...\n\nOn page 6, I find the section \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" which shows:\n\n% of voters who say the election of Donald Trump makes them feel ...\n\n| Reaction | %  |\n| -------- | -- |\n| Hopeful  | 51 |\n| Proud    | 36 |\n| Uneasy   | 53 |\n| Sad      | 41 |\n| Scared   | 41 |\n| Angry    | 31 |\n\nThe table clearly shows that 31% of voters say Trump's election makes them feel angry.\n\nAnswer: 31", "citations": [], "input_tokens": 24362, "output_tokens": 166, "cost_micros": 75576, "latency_ms": 5985, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about voter reactions to Trump's victory.Let me search for sections that discuss reactions or emotions about Trump's victory.Great! I found a reference to \"angry\" in the context of reactions. Let me read the specific chunk that contains this information.Perfect! I found the information. According to the document, 31% of voters say Trump's election makes them feel \"angry\".\n\nAnswer: 31", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32359, "error": null, "extra": {"thread_id": 1928, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4e0d0df3d4f749b8a5376414cce75543", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 148, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I need to find data from 2015 about how many people believed the government was doing very/fairly well in reducing the threat of terrorism.\n\nLooking at page 2, I can see a chart titled \"For first time since 9/11, government's job rating on terrorism turns negative\" which shows the percentage who say the government is doing \"Very/Fairly well\" in reducing the threat of terrorism.\n\nFrom the chart on page 2, at the end point in 2015 (specifically December 2015), I can see:\n- Total: 46%\n\nLooking at page 10-11, there's a detailed breakdown by party. The chart on page 10 shows \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" with partisan breakdowns.\n\nFrom the chart on page 10, for 2015 (the rightmost data point):\n- Republican: 27%\n- Democrat: 64%\n- Independent: 44%\n\nSo for the question asking about adults (total), Republicans, and Democrats in 2015:\n- Adults (Total): 46%\n- Republicans: 27%\n- Democrats: 64%\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 58172, "output_tokens": 269, "cost_micros": 178551, "latency_ms": 22812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the percentage of adults, Republicans, and Democrats who believed the government was doing very/fairly well in reducing the threat of terrorism in 2015.\n\nLooking at the document, I can find this information in the section \"Views of how the government is handling the terrorist threat\" on page 10-11.\n\nThe document states:\n- Total (all adults): 46% say the government is doing very/fairly well\n- Democrats: 64% say the government is doing very/fairly well  \n- Republicans: 27% say the government is doing very/fairly well\n\nThis data is from the December 2015 survey (conducted Dec. 8-13, 2015).\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 13406, "output_tokens": 163, "cost_micros": 42663, "latency_ms": 5468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document for the specific data on government performance in reducing the threat of terrorism in 2015:\n\nFrom the chart on page 10 titled \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" showing percentages saying government is doing very/fairly well in reducing the threat of terrorism:\n\nFor December 2015:\n- Total/All adults: 46%\n- Republicans: 27%\n- Democrats: 64%\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 19266, "output_tokens": 113, "cost_micros": 59493, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the percentages of surveyed adults, Republicans, and Democrats who believed the government was doing very/fairly well in reducing the threat of terrorism in 2015.\n\nLooking at the document, I can find this information in the section titled \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" on page 10.\n\nThe chart shows \"% saying govt is doing very/fairly well in reducing the threat of terrorism\" for December 2015:\n- Total (all adults): 46%\n- Republican: 27%\n- Democrat: 64%\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 14501, "output_tokens": 140, "cost_micros": 45603, "latency_ms": 5233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document for data on how well the government is doing in reducing the threat of terrorism in 2015:\n\nFrom the table on page 10 titled \"How well is govt doing reducing threat of terrorism?\" and the section \"Views of how the government is handling the terrorist threat\" on pages 10-11:\n\nIn December 2015:\n- Total: 46% said very/fairly well\n- Republican: 27% said very/fairly well  \n- Democrat: 64% said very/fairly well\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 18633, "output_tokens": 125, "cost_micros": 57774, "latency_ms": 5375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the specific data about beliefs regarding government performance in reducing terrorism threats in 2015.Let me search for data about government performance in reducing terrorism threats:I can see a figure showing data about \"How well is govt doing reducing threat of terrorism?\" with percentages for \"Very/Fairly well\" across years. I can see 46 for 2015. Now let me look for the breakdown by party affiliation (Republicans and Democrats):Let me search more specifically for partisan breakdown of government performance on terrorism:I found a reference to partisan data. Let me read that section:The figure I found shows partisan data over time. Let me read more context around this section to understand the 2015 data points:Perfect! I found the data. The text clearly states:\n\n- Overall in 2015: 46% say government is doing very or fairly well\n- Democrats in 2015: 64% say government is doing at least fairly well  \n- Republicans in 2015: 27% say government is doing very or fairly well\n\nThis is also confirmed by the figure showing the data points for year '15' with Democrat at 64, Republican at 27.\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42844, "error": null, "extra": {"thread_id": 1929, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cb2b43cade5c41b98919c13d86621c55", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 281, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the threats that increased from August 2014 to December 2015, and then determine which has the biggest Republican-Democrat difference.\n\nLooking at page 15, I can see a chart titled \"ISIS rises higher atop public's threat list\" showing various threats with their percentages in December 2015 and August 2014, plus the change.\n\nThe threats that increased (positive change) from August 2014 to December 2015 are:\n- ISIS: +16 points (67% to 83%)\n- Iran's nuclear program: +3 points (59% to 62%)\n- North Korea's nuclear program: +2 points (57% to 59%)\n- China's emergence as a world power: +1 point (48% to 49%)\n- Global climate change: +1 point (48% to 49%)\n\nNow I need to find the R-D difference for each of these threats. Looking at page 15, there's another chart showing \"Partisan differences in views of top threats to U.S.\" with R-D diff column:\n\n- Iran's nuclear program: R+27\n- ISIS: R+14\n- China's emergence as a world power: R+18\n- North Korea's nuclear program: R+13\n- Global climate change: D+51 (this is actually a Democratic advantage, so -51 for R-D)\n\nAmong the threats that increased, Iran's nuclear program has the biggest R-D difference at R+27.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 58152, "output_tokens": 342, "cost_micros": 179586, "latency_ms": 35328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I need to find which threats had bigger percentages in December 2015 compared to August 2014, and then determine which has the largest Republican-Democrat difference.\n\nLooking at the table \"ISIS rises higher atop public's threat list\" on page 14:\n\nThreats that increased from August 2014 to December 2015:\n1. ISIS: 67% \u2192 83% (+16)\n2. Iran's nuclear program: 59% \u2192 62% (+3)\n3. North Korea's nuclear program: 57% \u2192 59% (+2)\n4. China's emergence as a world power: 48% \u2192 49% (+1)\n5. Global climate change: 48% \u2192 49% (+1)\n\nNow I need to find the R-D differences for these threats from the table \"Partisan differences in views of top threats to U.S.\" on page 15:\n\n1. ISIS: R-D difference = +14 (93% Republican - 79% Democrat)\n2. Iran's nuclear program: R-D difference = +27 (79% Republican - 52% Democrat)\n3. North Korea's nuclear program: R-D difference = +13 (70% Republican - 57% Democrat)\n4. China's emergence as a world power: R-D difference = +18 (62% Republican - 44% Democrat)\n5. Global climate change: R-D difference = -51 (22% Republican - 73% Democrat = -51, but this is negative)\n\nAmong those that increased from 2014 to 2015, Iran's nuclear program has the biggest R-D difference at +27 percentage points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 13386, "output_tokens": 380, "cost_micros": 45858, "latency_ms": 8984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find threats that increased from August 2014 to December 2015, and then determine which has the biggest Republican-Democrat difference.\n\nFrom page 14, the table \"ISIS rises higher atop public's threat list\" shows changes from August 2014 to December 2015:\n- ISIS: +16 points\n- Iran's nuclear program: +3 points\n- North Korea's nuclear program: +2 points\n- China's emergence as a world power: +1 point\n- Global climate change: +1 point\n\nFrom page 15, the table \"Partisan differences in views of top threats to U.S.\" shows R-D differences in December 2015:\n- ISIS: 93% (R) - 79% (D) = +14 R-D difference\n- Iran's nuclear program: 79% (R) - 52% (D) = +27 R-D difference\n- North Korea's nuclear program: 70% (R) - 57% (D) = +13 R-D difference\n- China's emergence as a world power: 62% (R) - 44% (D) = +18 R-D difference\n\nAmong threats that increased from August 2014 to December 2015, Iran's nuclear program has the biggest R-D difference at 27 points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 19246, "output_tokens": 314, "cost_micros": 62448, "latency_ms": 7014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the threats that increased from August 2014 to December 2015, then identify which has the biggest Republican-Democrat difference.\n\nFrom the table \"ISIS rises higher atop public's threat list\" (page 14), the threats that increased are:\n- ISIS: +16 points\n- Iran's nuclear program: +3 points\n- North Korea's nuclear program: +2 points\n- China's emergence: +1 point\n- Global climate change: +1 point\n\nFrom the table \"Partisan differences in views of top threats to U.S.\" (page 15), the R-D differences are:\n- ISIS: 93% (R) - 79% (D) = +14\n- Iran's nuclear program: 79% (R) - 52% (D) = +27\n- North Korea's nuclear program: 70% (R) - 57% (D) = +13\n- China's emergence: 62% (R) - 44% (D) = +18\n\nAmong the threats that increased from August 2014, Iran's nuclear program has the biggest R-D difference at 27 points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 14481, "output_tokens": 271, "cost_micros": 47508, "latency_ms": 7327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I need to find threats where concern increased from August 2014 to December 2015, then identify which has the biggest Republican-Democrat difference.\n\nFrom the table \"ISIS rises higher atop public's threat list\" on page 14, I can see the changes from August 2014 to December 2015:\n\n- ISIS: +16 points (83% in Dec 2015 vs 67% in Aug 2014)\n- Iran's nuclear program: +3 points\n- North Korea's nuclear program: +2 points\n- China's emergence as a world power: +1 point\n- Global climate change: +1 point\n- Israeli-Palestinian conflict: -5 points (decreased)\n- Growing authoritarianism in Russia: -11 points (decreased)\n\nSo the threats that increased are: ISIS, Iran's nuclear program, North Korea's nuclear program, China's emergence, and global climate change.\n\nFrom the table \"Partisan differences in views of top threats to U.S.\" on page 15, the R-D differences in December 2015 are:\n\n- ISIS: +14 (93% Rep - 79% Dem)\n- Iran's nuclear program: +27 (79% Rep - 52% Dem)\n- North Korea's nuclear program: +13 (70% Rep - 57% Dem)\n- China's emergence as a world power: +18 (62% Rep - 44% Dem)\n- Global climate change: -51 (22% Rep - 73% Dem) - Note: This is negative, meaning Democrats see it as more of a threat\n\nAmong those that increased from 2014 to 2015, Iran's nuclear program has the biggest R-D difference at +27 points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 18613, "output_tokens": 402, "cost_micros": 61869, "latency_ms": 9391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that appears most relevant based on the priority list: the ISIS and terrorism release from December 2015.Let me continue reading to find the data about threats and R-D differences:I can see the document mentions ISIS increased from 67% in August 2014 to 83% as a major threat. Let me continue reading to find data about other threats and their R-D differences:Let me search for data about threats to well-being and R-D differences. I need to find a table or chart comparing different threats:Good! I found references to \"R-D diff\" in the document. Let me read the section around line 1364:This table shows the \"most important problem\" but I need to find the table about \"threats to well-being\" comparing 2015 to August 2014. Let me continue searching:Perfect! I found the section about threats. Let me read the full table around line 1974-2076:Great! Now I need to see the partisan breakdown table with R-D differences. Let me continue reading:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55203, "error": null, "extra": {"thread_id": 1930, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dd9ddaac9b56498ab35c8337203470ec", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 315, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i ll read document that appears most relevant based on priority list isis and terrorism release from december 2015 let me continue reading to find data about threats and r d differences i can see document mentions isis increased from 67 in august 2014 to 83 as major threat let me continue reading to find data about other threats and their r d differences let me search for data about threats to well being and r d differences i need to find table or chart comparing different threats good i found references to r d diff in document let me read section around line 1364 this table shows most important problem but i need to find table about threats to well being comparing 2015 to august 2014 let me continue searching perfect i found section about threats let me read full table around line 1974 2076 great now i need to see partisan breakdown table with r d differences let me continue reading", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll go through the document systematically to count all bar charts.\n\nLooking through each page:\n\nPage 1: Title page - no charts\n\nPage 2: \n- One line chart showing \"Views of Government's Handling of Terrorism\"\n- No bar charts\n\nPage 3:\n- One line chart showing partisan gaps\n- One pie chart showing scrutiny of Muslims\n- No bar charts\n\nPage 4:\n- One table showing party preferences on various issues\n- No bar charts\n\nPage 5:\n- One line chart showing views on Islam and violence over time\n- One line chart showing partisan divide\n- One table with demographic data\n- No bar charts\n\nPage 6:\n- Multiple horizontal bar charts showing \"Should Muslims be subject to greater scrutiny\" broken down by different groups\n- Count: approximately 3-4 bar charts on this page\n\nPage 7:\n- Multiple horizontal bar charts continuing the scrutiny question by demographics\n- Count: approximately 3-4 bar charts on this page\n\nPage 8:\n- One table showing \"Most important problem facing nation\"\n- No bar charts\n\nPage 9:\n- One table showing partisan differences\n- No bar charts\n\nPage 10:\n- One line chart showing government ratings over time by party\n- No bar charts\n\nPage 11:\n- One table showing demographic views\n- No bar charts\n\nPage 12:\n- One line chart showing concerns about anti-terrorism policies\n- No bar charts\n\nPage 13:\n- One line chart showing partisan views on anti-terrorism policies\n- One table\n- No bar charts\n\nPage 14:\n- Multiple horizontal bar charts showing ISIS as threat\n- Count: approximately 2 bar charts\n\nPage 15:\n- Multiple horizontal bar charts showing partisan differences in threats\n- Count: approximately 2 bar charts\n\nPage 16:\n- Line charts showing views of U.S. campaign\n- No bar charts\n\nPage 17:\n- Horizontal bar charts showing current assessments\n- Count: approximately 2 bar charts\n\nPage 18:\n- Multiple horizontal bar charts about concerns and ground troops\n- Count: approximately 3-4 bar charts\n\nPage 19:\n- Line chart and horizontal bar charts\n- Count: approximately 2 bar charts\n\nPage 20:\n- Line charts showing views on military force\n-", "citations": [], "input_tokens": 58124, "output_tokens": 512, "cost_micros": 182052, "latency_ms": 17578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the bar charts in this document.\n\nLet me go through the document systematically:\n\n1. Page 1: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line chart, not a bar chart.\n\n2. Page 2: \"Growing partisan gap in views of whether Islam encourages violence\" - This is a line chart.\n\n3. Page 2: \"Most say more scrutiny of Muslims not warranted\" - This is a pie chart.\n\n4. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This appears to be a table with bar-like elements showing percentages, but it's formatted as a comparative table.\n\n5. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - This is a line chart.\n\n6. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - This is a line chart.\n\n7. Page 5: \"Demographic divides in views of Islam and violence\" - This is a table.\n\n8. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This is a table.\n\n9. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny because of their faith\" - This is a table.\n\n10. Page 8: \"Growing share cite terrorism as the most important problem facing the U.S.\" - This is a table.\n\n11. Page 9: \"More Republicans than Democrats cite immigration, terrorism as top problems\" - This is a table.\n\n12. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - This is a line chart.\n\n13. Page 11: \"Evaluations of the government's job reducing the threat of terrorism\" - This is a table.\n\n14. Page 12: \"More concern with country's protection than civil liberties\" - This is a line chart.\n\n15. Page 13: \"Most Republicans say govt anti-terror policies have not gone 'far enough'\" - This is a line chart.\n\n16. Page 13: \"Young adults more concerned about civil liberties than older people\" - This is a table.\n\n17. Page 14: \"ISIS rises higher atop", "citations": [], "input_tokens": 13358, "output_tokens": 512, "cost_micros": 47754, "latency_ms": 11733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "17", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll carefully count all the bar charts in this document.\n\nLet me go through the document systematically:\n\n1. Page 1: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line chart, not a bar chart.\n\n2. Page 2: \"Growing partisan gap in views of whether Islam encourages violence\" - This is a line chart, not a bar chart.\n\n3. Page 2: \"Most say more scrutiny of Muslims not warranted\" - This is a pie chart, not a bar chart.\n\n4. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This appears to be a table with horizontal bars showing differences.\n\n5. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - Line chart.\n\n6. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - Line chart.\n\n7. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This shows horizontal bar comparisons.\n\n8. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny because of their faith\" - Horizontal bars.\n\n9. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - Line chart.\n\n10. Page 12: \"More concern with country's protection than civil liberties\" - Line chart.\n\n11. Page 13: \"Most Republicans say govt anti-terror policies have not gone 'far enough'\" - Line chart.\n\n12. Page 14: \"ISIS rises higher atop public's threat list\" - Horizontal bars.\n\n13. Page 15: \"Partisan differences in views of top threats to U.S.\" - Horizontal bars.\n\n14. Page 16: \"Most continue to support U.S. military campaign against ISIS\" - Line chart.\n\n15. Page 17: \"Few say current military effort in Iraq and Syria is going well\" - Horizontal bars.\n\n16. Page 17: \"More long-term optimism in ISIS fight\" - Vertical bars.\n\n17. Page 18: \"Bigger concern: Will U.S. go too far or not far enough in Iraq and Syria?\" - Table format with categories.\n\n18. Page ", "citations": [], "input_tokens": 19218, "output_tokens": 512, "cost_micros": 65334, "latency_ms": 13172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll carefully go through the document to count all bar charts.\n\nLet me examine each chart in the document:\n\n1. Page 2: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line graph, not a bar chart.\n\n2. Page 3: \"Most say more scrutiny of Muslims not warranted\" - This is a bar chart showing percentages for Republicans, Independents, and Democrats.\n\n3. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This is a bar chart comparing Democratic Party vs Republican Party on various issues.\n\n4. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - This is a line graph.\n\n5. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - This is a bar chart showing Republican, Independent, and Democrat responses.\n\n6. Page 5: \"Demographic divides in views of Islam and violence\" - This is a table, not a bar chart.\n\n7. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This is a bar chart showing responses by different political groups.\n\n8. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny\" - This is a bar chart showing responses by different demographic groups.\n\n9. Page 8: \"Most important problem facing nation\" - This is a table, not a bar chart.\n\n10. Page 9: \"Most important problem facing nation\" (partisan breakdown) - This is a table, not a bar chart.\n\n11. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - This is a bar chart showing percentages by party affiliation.\n\n12. Page 11: \"Older, less educated give more negative ratings\" - This is a bar chart showing responses by age and education.\n\n13. Page 12: \"More concern with country's protection than civil liberties\" - This is a line graph.\n\n14. Page 13: Bar chart showing partisan views on anti-terrorism policies - This is a line graph.\n\n15. Page 13: \"Young adults more concerned about civil liberties\" - This is a bar chart showing responses by age group.\n\n16. Page 14: \"", "citations": [], "input_tokens": 14453, "output_tokens": 512, "cost_micros": 51039, "latency_ms": 11922, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "16", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the bar charts in this document.\n\nLet me go through the document systematically:\n\n1. Page 1: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line chart, not a bar chart.\n\n2. Page 2: \"Growing partisan gap in views of whether Islam encourages violence\" - This is a line chart.\n\n3. Page 2: \"Most say more scrutiny of Muslims not warranted\" - This appears to be a simple text display with percentages, not a chart.\n\n4. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This is a table, not a bar chart.\n\n5. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - This is a line chart.\n\n6. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - This is a line chart.\n\n7. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This is a table.\n\n8. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny because of their faith\" - This is a table.\n\n9. Page 8: \"Growing share cite terrorism as the most important problem facing the U.S.\" - This is a table.\n\n10. Page 9: \"Most important problem facing nation\" - This is a table.\n\n11. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - This is a line chart.\n\n12. Page 11: \"How well is govt doing reducing threat of terrorism?\" - This is a table.\n\n13. Page 12: \"More concern with country's protection than civil liberties\" - This is a line chart.\n\n14. Page 13: \"Most Republicans say govt anti-terror policies have not gone 'far enough'\" - This is a line chart.\n\n15. Page 13: \"Young adults more concerned about civil liberties than older people\" - This is a table.\n\n16. Page 14: \"ISIS rises higher atop public's threat list\" - This is a table.\n\n17. Page 15: \"Partisan differences in views of top threats to U.S.\" - This is a table.", "citations": [], "input_tokens": 18585, "output_tokens": 512, "cost_micros": 63435, "latency_ms": 12656, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "17", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to count the bar charts in the report.Let me continue reading to count all the bar charts in the document.Let me search for \"figure\" tags and examine the visual content descriptions more carefully:I found 31 figures in the document. Now I need to examine each one to determine which are bar charts. Let me read through the document systematically to identify the type of each chart. Based on what I've already read, I can see several line charts and some other visualizations. Let me look for more specific patterns that indicate bar charts.\n\nLooking at the content I've already read, I can identify the following charts:\n\n1. Lines 188-253: Line chart showing government's job rating on terrorism over time\n2. Lines 306-354: Line chart showing partisan gap in views of whether Islam encourages violence  \n3. Lines 372-413: Pie chart showing views on Muslim scrutiny\n4. Lines 436-505: Horizontal bar chart comparing parties on various issues (terrorist threat, gun control, economy, immigration, climate change, environment)\n5. Lines 563-617: Line chart about Islam and violence over time\n6. Lines 618-657: Line chart showing partisan differences over time\n7. Lines 937-1006: Horizontal bar chart showing Muslim scrutiny views by demographic groups\n8. Lines 1055-1193: Horizontal bar chart showing scrutiny views by various demographics\n9. Lines 1508-1563: Line chart showing government ratings on terrorism by party\n10. Lines 1738-1802: Line chart about civil liberties concerns\n11. Lines 1850-1905: Line chart showing partisan views on anti-terror policies\n12. Lines 2004-2072: Vertical bar chart comparing threat levels (ISIS, Iran, North Korea, etc.)\n13. Lines 2117-2210: Horizontal bar chart showing partisan differences in threat views\n14. Lines 2239-2304: Line chart about military campaign approval\n15. Lines 2353-2425: Two separate charts about military campaign assessment\n\nLet me carefully review the entire document to count specifically the bar charts (not line charts or pie charts).\n\nBased on my thorough reading of the document, I can identify the following **bar charts**:\n\n1. **Page 4** (lines 436-505): Horizontal bar chart - \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - comparing Democratic Party vs Republican Party on 6 issues\n2. **Page 6** (lines 937-1006): Horizontal bar chart - \"Should Muslims be subject to greater scrutiny because of their religion?\" - showing responses by party/ideology\n3. **Page 7** (lines 1055-1193): Horizontal bar chart - \"Young people, minorities, less likely to say Muslims should receive greater scrutiny\" - showing responses by demographics\n4. **Page 14** (lines 2004-2072): Mixed chart showing ISIS and other threats with numerical comparisons\n5. **Page 15** (lines 2117-2210): Horizontal bar chart - \"Partisan differences in views of top threats to U.S.\"\n\nAfter careful examination, the document contains **5 bar charts**.\n\nAnswer: 5", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46078, "error": null, "extra": {"thread_id": 1931, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6dc6e43bf0ee4e80af05e0ae2c870d11", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 404, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically page 17 (document page 18), I can find information about public attitudes regarding the ISIS fight.\n\nThe chart titled \"More long-term optimism in ISIS fight\" shows that in December 2015, 66% of people said the U.S. and its allies will \"Definitely/Probably succeed\" in their military campaign against Islamic militants in Iraq and Syria.\n\nThe text confirms: \"Two-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail.\"\n\nAnswer: Two-thirds believe the U.S. and allies will succeed", "citations": [], "input_tokens": 58160, "output_tokens": 159, "cost_micros": 176865, "latency_ms": 22375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": false, "f1": 0.30769230769230765, "method": "str_norm", "normalised_pred": "two thirds believe u s and allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Let me search the document for information about people's attitudes regarding the ultimate success of the U.S. and its allies in the fight against ISIS.\n\nI found the relevant section on page 17, which states:\n\n\"Views are more positive when it comes to whether the U.S. and its allies will succeed or fail in their military campaign against ISIS.\n\nTwo-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail.\"\n\nThe document also notes this represents an increase: \"The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: Long-term optimism that U.S. and allies will succeed", "citations": [], "input_tokens": 13394, "output_tokens": 182, "cost_micros": 42912, "latency_ms": 6734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "str_norm", "normalised_pred": "long term optimism that u s and allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, on page 17, there is a section titled \"More long-term optimism in ISIS fight\" which shows:\n\n\"Two-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail. The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: Will definitely or probably succeed", "citations": [], "input_tokens": 19254, "output_tokens": 118, "cost_micros": 59532, "latency_ms": 4703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "will definitely or probably succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically the section titled \"More long-term optimism in ISIS fight\" on page 16, I can find the answer.\n\nThe document shows that in December 2015, 66% of people said the U.S. and its allies will \"Definitely/Probably succeed\" in their military campaign against Islamic militants in Iraq and Syria, compared to only 27% who said they will \"Definitely/Probably fail.\"\n\nThe text states: \"The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: They will succeed/be successful", "citations": [], "input_tokens": 14489, "output_tokens": 141, "cost_micros": 45582, "latency_ms": 5453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4444444444444445, "method": "str_norm", "normalised_pred": "they will succeed be successful", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document for information about public attitudes on the ultimate success of the U.S. and its allies against ISIS in 2015...\n\nFrom page 17, I can see the section \"Current assessments of U.S. military campaign, prospects for success\" which states:\n\n\"Views are more positive when it comes to whether the U.S. and its allies will succeed or fail in their military campaign against ISIS.\n\nTwo-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail.\"\n\nThe document also shows that \"The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: Two-thirds believe U.S. and allies will succeed", "citations": [], "input_tokens": 18621, "output_tokens": 191, "cost_micros": 58728, "latency_ms": 6156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.30769230769230765, "method": "str_norm", "normalised_pred": "two thirds believe u s and allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about public attitudes toward the U.S. and its allies' efforts against ISIS in 2015.Let me continue reading to find the specific information about public attitudes:Let me search for information about attitudes toward the fight against ISIS and whether the U.S. and allies will succeed:Perfect! I found the information. Let me read a bit more to confirm the complete data:Based on the document, I found the answer. The document states that although few say the current military effort in Iraq and Syria is going well (only 35% in December 2015 say it's going very/fairly well), when it comes to long-term outlook, two-thirds (66%) believe the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria.\n\nAnswer: Two-thirds (66%) believe the U.S. and its allies will succeed", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43858, "error": null, "extra": {"thread_id": 1932, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a011d8590eae4d86a7883730ba9de008", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 232, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.26666666666666666, "method": "str_norm", "normalised_pred": "two thirds 66 believe u s and its allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1 on page 26:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nand\n\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nThe output dimension of the linear_class layer is `num_classes + 1 = 91 + 1 = 92`.\n\nAnswer: 92", "citations": [], "input_tokens": 59872, "output_tokens": 134, "cost_micros": 181626, "latency_ms": 31250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1, specifically line 14:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nThe output dimension of the linear_class layer is `num_classes + 1`.\n\nFrom line 32, the model is instantiated with:\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nTherefore, the output dimension is 91 + 1 = 92.\n\nAnswer: 92", "citations": [], "input_tokens": 19062, "output_tokens": 151, "cost_micros": 59451, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1, specifically line 18:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nThe output dimension of the linear_class layer is `num_classes + 1`.\n\nFrom line 31, we can see that the model is instantiated with:\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nTherefore, the output dimension is 91 + 1 = 92.\n\nAnswer: 92", "citations": [], "input_tokens": 22767, "output_tokens": 155, "cost_micros": 70626, "latency_ms": 4282, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the output dimension of the linear_class layer in the DETR PyTorch inference code.\n\nLooking at the code in Listing 1:\n\n```python\ndef __init__(self, num_classes, hidden_dim, nheads,\n             num_encoder_layers, num_decoder_layers):\n    super().__init__()\n    # ... other layers ...\n    self.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nThe linear_class layer is defined as `nn.Linear(hidden_dim, num_classes + 1)`, which means it takes `hidden_dim` as input and outputs `num_classes + 1` dimensions.\n\nThen looking at line 33:\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nThe model is instantiated with `num_classes=91`.\n\nTherefore, the output dimension of the linear_class layer is:\nnum_classes + 1 = 91 + 1 = 92\n\nAnswer: 92", "citations": [], "input_tokens": 19551, "output_tokens": 270, "cost_micros": 62703, "latency_ms": 5453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1, specifically line 15:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nAnd from line 33:\n\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nThe `linear_class` layer has an output dimension of `num_classes + 1`, where `num_classes=91`.\n\nTherefore, the output dimension is 91 + 1 = 92.\n\nAnswer: 92", "citations": [], "input_tokens": 22903, "output_tokens": 155, "cost_micros": 71034, "latency_ms": 4562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the DETR document to find the output dimension of the linear_class layer in the PyTorch inference code.Let me search for \"linear_class\" in the document to find the PyTorch inference code:Let me search for PyTorch code or inference code in the document:Now let me read the section with the PyTorch inference code around line 2864:Perfect! I found the PyTorch inference code. Looking at line 2898, I can see:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nAnd from line 2915, the model is instantiated with:\n\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nSo the output dimension of the linear_class layer is `num_classes + 1 = 91 + 1 = 92`.\n\nAnswer: 92", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 39343, "error": null, "extra": {"thread_id": 1933, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e081bc008e9a4c03bbbaf39b68e10bd7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 162, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about missed instances when the number of visible instances is 60.\n\nLooking at Figure 12 on page 25, I can see a graph titled \"Analysis of the number of instances of various classes missed by DETR depending on how many are present in the image.\"\n\nThe graph shows three colored lines representing different categories:\n- Dog (blue line)\n- Person (orange line)\n- Apple (green line)\n\nThe x-axis shows \"Number of visible instances\" and the y-axis shows \"% of missed instances.\"\n\nAt the point where the number of visible instances = 60, I need to compare the values for each category:\n\nLooking at the graph at x=60:\n- The dog (blue) line appears to be at approximately 20-25%\n- The person (orange) line appears to be at approximately 15-20%\n- The apple (green) line appears to be at approximately 10-15%\n\nAt 60 visible instances, the dog category shows the highest percentage of missed instances.\n\nAnswer: dog", "citations": [], "input_tokens": 59873, "output_tokens": 231, "cost_micros": 183084, "latency_ms": 35750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "dog", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 12, which shows the analysis of missed instances depending on the number of visible instances.\n\nLooking at Figure 12 on page 25, I can see a graph plotting \"% of missed instances\" (y-axis) against \"Number of visible instances\" (x-axis) for three categories: dog, person, and apple.\n\nAt 60 visible instances, I need to identify which line/category has the highest percentage of missed instances:\n\n- The dog line (appears to be one color)\n- The person line (appears to be another color)  \n- The apple line (appears to be a third color)\n\nFrom the graph, at the 60 visible instances mark, the \"apple\" category appears to have the highest percentage of missed instances, showing approximately 20-25% missed instances, while \"dog\" and \"person\" show lower percentages (around 10-15%).\n\nAnswer: apple", "citations": [], "input_tokens": 19063, "output_tokens": 203, "cost_micros": 60234, "latency_ms": 6092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "apple", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 12, which shows \"Analysis of the number of instances of various classes missed by DETR depending on how many are present in the image.\"\n\nLooking at the graph in Figure 12, I can see three lines representing different categories: dog, person, and apple.\n\nAt 60 visible instances (x-axis), I need to compare the percentage of missed instances (y-axis) for each category:\n\n- Dog (appears to be around 20-25%)\n- Person (appears to be around 15-20%)\n- Apple (appears to be the highest, around 30-35%)\n\nThe apple category clearly shows the highest percentage of missed instances at 60 visible instances.\n\nAnswer: apple", "citations": [], "input_tokens": 22768, "output_tokens": 160, "cost_micros": 70704, "latency_ms": 5344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "apple", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 12, which shows the analysis of missed instances depending on the number of visible instances.\n\nLooking at Figure 12 on page 25, it shows a graph with \"Number of visible instances\" on the x-axis (from 0 to 100) and \"% of missed instances\" on the y-axis (from 0 to 70). The graph shows three lines for different categories: dog, apple, and one other category.\n\nAt the point where the number of visible instances = 60, I need to identify which category line is highest (meaning the largest proportion of missed instances).\n\nFrom the graph, at 60 visible instances:\n- The \"dog\" line (appears to be one line)\n- The \"apple\" line (appears to be another line)\n- There appears to be a third line\n\nLooking at the vertical position at x=60, the \"dog\" line appears to be the highest among the three categories shown, indicating the largest proportion of missed instances at that point.\n\nAnswer: dog", "citations": [], "input_tokens": 19552, "output_tokens": 226, "cost_micros": 62046, "latency_ms": 6390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "dog", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 12 (the table on page 25), I need to find the number of missed instances for each category when there are 60 visible instances.\n\nFrom the table:\n- dog: 5 missed instances\n- person: 22 missed instances  \n- apple: 15 missed instances\n\nOut of 60 visible instances:\n- dog: 5/60 = 8.3%\n- person: 22/60 = 36.7%\n- apple: 15/60 = 25%\n\nThe person category has the largest proportion of missed instances.\n\nAnswer: person", "citations": [], "input_tokens": 22904, "output_tokens": 139, "cost_micros": 70797, "latency_ms": 4000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "person", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find information about visible instances and missed instances by category.Let me search for content about \"missed instances\" and look for figures or charts.Let me read the section with Figure 12 which discusses missed instances:Perfect! I found Figure 12 which shows the chart. Let me read the data more carefully. The chart shows:\n- Y-axis: % of missed instances (0 to 70)\n- X-axis: Number of visible instances (20 to 100)\n- Three categories plotted: dog, person, and apple\n\nAt 60 visible instances, I need to identify which category has the largest proportion of missed instances. From the text data shown:\n- Lines 2795-2796: dog\n- Lines 2797-2798: person  \n- Lines 2801-2802: apple\n\nThe chart shows these three categories, and I need to determine which has the highest value at x=60. Based on the chart layout and the typical pattern where lines are listed in the legend in order of their values, and given that \"dog\" is listed first in the legend (line 2795), followed by \"person\" (line 2797), and then \"apple\" (line 2801), this typically indicates their relative positions on the chart.\n\nLet me look for any additional data that might give exact values:Based on the chart data from Figure 12, I can see:\n- Y-axis values listed: 70, 60, 50, 40, 30, 20, 10, 0\n- X-axis values: 20, 40, 60, 80, 100\n- Three categories in the legend: dog, person, apple\n\nThe legend positioning typically indicates the order of lines on the chart. In this case, at x=60 (60 visible instances), \"dog\" appears to be listed first in the legend (lines 2795-2796), suggesting it has the highest proportion of missed instances at that point, followed by \"person\" and then \"apple\".\n\nGiven the layout where:\n- \"dog\" is listed at the top of the legend\n- \"person\" is in the middle\n- \"apple\" is at the bottom\n\nAnd the typical convention where legend items are ordered by their y-values at key points or overall prominence, \"dog\" would have the largest proportion of missed instances when the number of visible instances equals 60.\n\nAnswer: dog", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47125, "error": null, "extra": {"thread_id": 1934, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_39684984e24b48b0a29495ffed5edb23", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 304, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "dog", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value for DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 on page 13, which shows \"Effect of loss components on AP\":\n\nThe table has columns for: class, \u21131, GIoU, AP, \u0394, AP50, \u0394, APS, APM, APL\n\nThe rows show different combinations:\n- Row 1: X (class), X (\u21131), blank (no GIoU): AP=35.8, AP50=57.3\n- Row 2: X (class), blank (no \u21131), X (GIoU): AP=39.9, AP50=61.6\n- Row 3: X (class), X (\u21131), X (GIoU): AP=40.6, AP50=61.6\n\nThe question asks for DETR with L1 loss and WITHOUT GIoU loss, which corresponds to the first row where class=X, \u21131=X, and GIoU is blank.\n\nAnswer: 57.3", "citations": [], "input_tokens": 59883, "output_tokens": 274, "cost_micros": 183759, "latency_ms": 35969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value for DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\", I can see three different configurations:\n\n1. First row: class \u2713, L1 \u2717, GIoU \u2713 \u2192 AP: 35.8, AP50: 57.3\n2. Second row: class \u2713, L1 \u2713, GIoU \u2717 \u2192 AP: 39.9, AP50: 61.6\n3. Third row (baseline): class \u2713, L1 \u2713, GIoU \u2713 \u2192 AP: 40.6, AP50: 61.6\n\nThe question asks for the configuration with L1 loss and without GIoU loss, which corresponds to the second row.\n\nAnswer: 61.6", "citations": [], "input_tokens": 19073, "output_tokens": 223, "cost_micros": 60564, "latency_ms": 5608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "61.6", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value for DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\", I can see three rows with different loss combinations:\n\n1. Row 1: class \u2612, l1 \u2612, GIoU \u2610 (class and L1, but no GIoU)\n   - AP: 35.8\n   - AP50: 57.3\n\n2. Row 2: class \u2612, l1 \u2610, GIoU \u2612 (class and GIoU, but no L1)\n   - AP: 39.9\n   - AP50: 61.6\n\n3. Row 3 (baseline): class \u2612, l1 \u2612, GIoU \u2612 (all three losses)\n   - AP: 40.6\n   - AP50: 61.6\n\nThe question asks for DETR with L1 loss and without GIoU loss, which corresponds to the first row in Table 4.\n\nAnswer: 57.3", "citations": [], "input_tokens": 22778, "output_tokens": 278, "cost_micros": 72504, "latency_ms": 7140, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value of DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\", I can see the ablation study for different loss combinations:\n\nThe table shows three rows:\n1. class \u2713, L1 \u2717, GIoU \u2713: AP = 35.8, AP50 = 57.3\n2. class \u2713, L1 \u2713, GIoU \u2717: AP = 39.9, AP50 = 61.6\n3. class \u2713, L1 \u2713, GIoU \u2713: AP = 40.6, AP50 = 61.6 (baseline)\n\nThe second row shows the model with L1 loss but without GIoU loss (class \u2713, L1 \u2713, GIoU \u2717).\n\nThe AP50 value for this configuration is 61.6.\n\nAnswer: 61.6", "citations": [], "input_tokens": 19562, "output_tokens": 256, "cost_micros": 62526, "latency_ms": 6390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "61.6", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value of DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\":\n\nThe table has columns for: class, \u21131, GIoU, AP, \u0394, AP\u2085\u2080, \u0394, AP\u209b, AP\u2098, AP\u2097\n\nThe rows show different combinations:\n1. Row with \u2713 for class and \u21131, but no GIoU: AP = 35.8, AP\u2085\u2080 = 57.3\n2. Row with \u2713 for class and GIoU, but no \u21131: AP = 39.9, AP\u2085\u2080 = 61.6\n3. Row with \u2713 for all three (class, \u21131, GIoU): AP = 40.6, AP\u2085\u2080 = 61.6\n\nThe first row corresponds to the model with L1 loss but without GIoU loss, which shows AP\u2085\u2080 = 57.3\n\nAnswer: 57.3", "citations": [], "input_tokens": 22914, "output_tokens": 285, "cost_micros": 73017, "latency_ms": 7812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for the AP50 value of DETR with L1 loss and without GIoU loss in the provided document.Let me read the table that shows the effect of loss components:Perfect! I found Table 4 which shows the effect of loss components on AP. Looking at the table:\n\n- The first row shows: class=\u2612, l1=\u2612, GIoU=(empty), which means L1 loss is used but GIoU loss is not used\n- For this configuration, the AP50 value is 57.3\n\nAnswer: 57.3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32109, "error": null, "extra": {"thread_id": 1935, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_984a5f35790043fbba33381ecf31e0b6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 138, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 on page 22, which shows the \"Architecture of DETR's transformer,\" I can see the Multi-Head Self-Attention layers are shown in the diagram.\n\nThe Multi-Head Self-Attention boxes in the diagram are colored pink/salmon.\n\nAnswer: Pink (or salmon)", "citations": [], "input_tokens": 59883, "output_tokens": 71, "cost_micros": 180714, "latency_ms": 31750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "pink or salmon", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 \"Architecture of DETR's transformer\" in the document, I need to identify the color of the Multi-Head Self-Attention layer.\n\nIn the diagram, the Multi-Head Self-Attention layer appears in both the Encoder and Decoder sections. These layers are shown in a light orange or peach color.\n\nAnswer: Orange (or peach/light orange)", "citations": [], "input_tokens": 19073, "output_tokens": 89, "cost_micros": 58554, "latency_ms": 3953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange or peach light orange", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 in the document, I can see the architecture diagram of DETR's transformer.\n\nThe Multi-Head Self-Attention layer boxes in the diagram are colored blue.\n\nAnswer: Blue", "citations": [], "input_tokens": 22778, "output_tokens": 48, "cost_micros": 69054, "latency_ms": 4203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 in the document, I need to identify the color of the Multi-Head Self-Attention layer in the architecture diagram.\n\nIn the figure, the Multi-Head Self-Attention layer appears in the Encoder section of the architecture.\n\nAnswer: Orange", "citations": [], "input_tokens": 19562, "output_tokens": 62, "cost_micros": 59616, "latency_ms": 3406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 in the document, which shows the architecture of DETR's transformer:\n\nThe Multi-Head Self-Attention layers are shown in the diagram boxes in the Encoder and Decoder sections.\n\nAnswer: The diagram does not specify colors for the layers in Fig 10", "citations": [], "input_tokens": 22914, "output_tokens": 65, "cost_micros": 69717, "latency_ms": 3734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "diagram does not specify colors for layers in fig 10", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Figure 10 and identify the color of the Multi-Head Self-Attention layer in DETR's transformer architecture.Let me search for \"Fig 10\" or \"Figure 10\" in the document:Let me search for references to transformer architecture figures:Great! I found that Figure 10 shows the \"Architecture of DETR's transformer\" at line 2409. Let me read that section:I can see the text content of Figure 10, but I need to look for color information. The document mentions it's embedded as an image. Let me search for more detailed description of the figure or any color information:The visual description I found was for a different image (zebras). Let me go back and look more carefully at the Figure 10 architecture diagram. Let me search around that area more carefully:I see the text extracted from Figure 10, but the visual description that follows (zebras) is clearly for a different image (Im7.png). Let me search for visual descriptions that might be associated with the transformer architecture diagram. Let me look before Figure 10:Let me search for where there might be an embedded image description for Figure 10. Let me check if there's an image description before line 2400:Let me search for image descriptions or embedded images related to Figure 10. Let me check what comes right before the figure starts:Let me look more carefully. The embedded image Im7.png comes AFTER Figure 10. Let me check if there's an embedded image BEFORE Figure 10 that might contain the actual architecture diagram:The document doesn't seem to have a visual description embedded for Figure 10's architecture diagram itself within the accessible text. However, looking at the structure, it appears Figure 10 is presented as a figure with text labels but the actual image with colors might not have been described in the OCR/visual description section.\n\nLet me search more broadly for any mentions of colors in the architecture or in figures:Let me search for other color terms that might be in Figure 10:These color mentions don't seem to be about Figure 10. Let me try to find if there's an image file reference specifically for the transformer architecture. Let me look for \"Im\" images around Figure 10:Image Im8.png is not Figure 10 either. Let me look between Figure 10 text and Im7.png more carefully. Actually, let me check if there's an embedded image right after the </figure> tag for Figure 10. Let me look at the exact area around the figure closing:So Figure 10 ends at line 2485, and then Im7.png starts at line 2489, which is clearly about zebras, not the transformer architecture. The Figure 10 text content doesn't include a visual description with colors. Let me search for if there's any embedded image metadata that might describe colors for transformer diagrams. Let me try searching for \"green\" or \"red\" along with terms near Figure 10 context:Let me try a different approach. Perhaps the PDF was processed and the image containing Figure 10 wasn't given a separate visual description. Let me look for if there's any image file reference or visual annotation BEFORE Figure 10 that might be the diagram itself:It seems the document processing didn't include a detailed visual description with colors for Figure 10's architecture diagram. This could mean the image was processed but only the text labels were extracted, not the visual styling information like colors of the boxes.\n\nLet me try searching the entire document for any reference to colors of \"Self-Attention\" layers or transformer layers:Based on my thorough search through the document, I cannot find any visual description that specifies the color of the Multi-Head Self-Attention layer in Figure 10. The document contains the text content and labels from Figure 10, but the visual styling information (colors of boxes/components) was not captured in the OCR or visual description process.\n\nWithout access to the actual image file or a visual description that includes color information, I cannot determine from this document what color the Multi-Head Self-Attention layer is depicted as in Figure 10.\n\nAnswer: Color information not available in the provided document text.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 122281, "error": null, "extra": {"thread_id": 1936, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_be29632a506941cfa603de6bebd1c2eb", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 844, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "color information not available in provided document text", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "JSONDecodeError: Expecting value: line 169 column 1 (char 924)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the number of multi-head self-attention layers in DETR's default structure.\n\nLooking at the document, I need to identify the default configuration of DETR.\n\nFrom Section 4 (Experiments), specifically the \"Technical details\" subsection:\n\"we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nHowever, I need to understand the architecture better. Looking at Section 3.2 and Figure 10:\n\n1. The **encoder** has 6 layers, and each encoder layer contains one multi-head self-attention module (as stated: \"Each encoder layer has a standard architecture and consists of a multi-head self-attention module and a feed forward network (FFN)\")\n\n2. The **decoder** has 6 layers. Looking at the description and Figure 10, each decoder layer contains:\n   - Multi-head self-attention\n   - Multi-head attention (encoder-decoder attention)\n\nSo each decoder layer has one multi-head self-attention layer.\n\nFrom the default configuration mentioned in the experiments:\n- 6 encoder layers \u00d7 1 multi-head self-attention per layer = 6 multi-head self-attention layers\n- 6 decoder layers \u00d7 1 multi-head self-attention per layer = 6 multi-head self-attention layers\n\nTotal multi-head self-attention layers = 6 + 6 = 12\n\nAnswer: 12", "citations": [], "input_tokens": 19056, "output_tokens": 334, "cost_micros": 62178, "latency_ms": 8546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about DETR's default structure regarding multi-head self-attention layers.\n\nFrom Section 4.1 \"Comparison with Faster R-CNN\":\n\"To be comparable in the number of parameters we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nFrom Section 4.2 \"Ablations\":\n\"For the study we choose ResNet-50-based DETR model with 6 encoder, 6 decoder layers and width 256.\"\n\nFrom the architecture description in Section 3.2:\n- The transformer encoder has multiple layers, each containing a multi-head self-attention module\n- The transformer decoder has multiple layers, each containing multi-head self-attention and encoder-decoder attention mechanisms\n\nSo the default DETR structure has:\n- 6 encoder layers (each with 1 multi-head self-attention layer) = 6 layers\n- 6 decoder layers (each with 1 multi-head self-attention layer) = 6 layers\n\nTotal multi-head self-attention layers = 6 + 6 = 12\n\nAnswer: 12", "citations": [], "input_tokens": 22761, "output_tokens": 271, "cost_micros": 72348, "latency_ms": 7187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure and the number of multi-head self-attention layers.\n\nLooking through the document:\n\n1. In Section 4 \"Experiments\" under \"Technical details\", it states:\n\"We use scale augmentation... we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\n2. In Section A.3 \"Detailed architecture\", the document describes the transformer architecture with both encoder and decoder layers.\n\n3. From the architecture description and Figure 10:\n- The encoder has multiple layers, each containing multi-head self-attention\n- The decoder has multiple layers, each containing:\n  - Multi-head self-attention\n  - Multi-head attention (encoder-decoder attention)\n\n4. From the technical details in Section 4:\n- 6 encoder layers (each with multi-head self-attention)\n- 6 decoder layers (each with multi-head self-attention)\n\nSo the total number of multi-head self-attention layers would be:\n- 6 encoder layers \u00d7 1 self-attention layer each = 6\n- 6 decoder layers \u00d7 1 self-attention layer each = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 19545, "output_tokens": 287, "cost_micros": 62940, "latency_ms": 7234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure and count the multi-head self-attention layers.\n\nFrom the document, I can find the architecture details:\n\n1. In Section 4.2 (Ablations), it states: \"For the study we choose ResNet-50-based DETR model with 6 encoder, 6 decoder layers and width 256.\"\n\n2. Looking at Figure 10 and Section A.3 (Detailed architecture), the structure shows:\n   - Encoder has N layers, each with Multi-Head Self-Attention\n   - Decoder has M layers, each with Multi-Head Self-Attention AND Multi-Head Attention (cross-attention)\n\n3. From the baseline model description: \"we choose a model with 6 transformer and 6 decoder layers\"\n\n4. In the encoder: Each of the 6 encoder layers contains 1 multi-head self-attention layer = 6 layers\n\n5. In the decoder: Each of the 6 decoder layers contains 1 multi-head self-attention layer = 6 layers\n   (Note: The decoder also has multi-head cross-attention, but that's different from self-attention)\n\nTotal multi-head self-attention layers = 6 (encoder) + 6 (decoder) = 12\n\nAnswer: 12", "citations": [], "input_tokens": 22897, "output_tokens": 303, "cost_micros": 73236, "latency_ms": 10280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I'll search for information about DETR's structure in the provided document.Now I found the key information. Let me search for more details about the architecture to confirm the total number of self-attention layers.Let me check the decoder structure to understand how many self-attention layers are in the decoder:Now let me look at the supplementary material section that describes the architecture in detail:Based on the document, I can see that:\n\n1. The default DETR model has **6 encoder layers** (line 1054, 1073: \"6 transformer and 6 decoder layers\" and \"6 encoder, 6 decoder layers\")\n\n2. Each encoder layer has a **multi-head self-attention module** (line 624)\n\n3. The default DETR model has **6 decoder layers** (line 1054, 1073)\n\n4. Each decoder layer has **multi-head self-attention** (line 2403-2404: \"multiple multi-head self-attention and decoder-encoder attention\")\n\nSo the total number of multi-head self-attention layers in DETR's default structure is:\n- 6 encoder layers \u00d7 1 self-attention per layer = 6\n- 6 decoder layers \u00d7 1 self-attention per layer = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 61719, "error": null, "extra": {"thread_id": 1937, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_12c4837a12434eeca194f2ee2772f40e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 320, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "native_pdf", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in this InVue 2021 Product Catalog.\n\nLooking through the document systematically:\n\n1. **Series 2865** (page 9) - For iPad Pro, iPad Air, iPad, iPad mini 4\n   - Multi-position security system with sensors\n\n2. **OnePOD** (page 10) - For iPad Pro, iPad Air, iPad, iPad mini 4\n   - POD display solution\n\n3. **Series 960** (page 12) - For iPad Pro, iPad Air, iPad, iPad mini 4\n   - POD with exposed cable\n\n4. **iPad + Keyboard Solutions** (pages 17-19):\n   - Standard Solution for iPad Pro + Keyboards (page 18)\n   - High Security Solution for iPad Pro + Keyboards (page 19)\n\n5. **iPad Digital Display** (pages 15-16) - Bracketed pedestal solution\n\nSo counting the distinct iPad solutions:\n1. Series 2865\n2. OnePOD\n3. Series 960\n4. iPad + Keyboard Standard Solution\n5. iPad + Keyboard High Security Solution\n6. iPad Digital Display\n\nAnswer: 6", "citations": [], "input_tokens": 96851, "output_tokens": 271, "cost_micros": 294618, "latency_ms": 54405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2021-Apple-Catalog.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in this Apple catalog document.\n\nLet me go through the document systematically:\n\n1. **Series 2865** (pages 3-5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n2. **OnePOD** (pages 6-9) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n3. **Series 960** (pages 10-12) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n4. **One60 Tethered** (page 13) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n5. **One90QR** (page 14) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n6. **iPad Digital Display** (pages 15-16) - A bracketed pedestal solution for iPads as digital displays\n\n7. **iPad + Keyboard Solutions** (pages 17-19):\n   - iPad Pro + Keyboards Standard Solution (page 18)\n   - iPad Pro + Keyboards High Security Solution (page 19)\n\nLet me count the distinct iPad solutions:\n1. Series 2865\n2. OnePOD\n3. Series 960\n4. One60 Tethered\n5. One90QR\n6. iPad Digital Display\n7. iPad Pro + Keyboards Standard Solution\n8. iPad Pro + Keyboards High Security Solution\n\nAnswer: 8", "citations": [], "input_tokens": 11786, "output_tokens": 335, "cost_micros": 40383, "latency_ms": 6532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in this document.\n\nLooking through the document systematically:\n\n1. **Series 2865** (pages 1-5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n2. **OnePOD** (pages 3-4) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n3. **Series 960** (page 5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n4. **One60 Tethered** (page 6) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n5. **One90QR** (page 7) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n6. **iPad Digital Display** (pages 8-9) - For utilizing iPads as digital displays\n\n7. **iPad Pro + Keyboards Standard Solution** (page 11) - For iPad Pro with Magic Keyboard and Smart Keyboard\n\n8. **iPad Pro + Keyboards High Security Solution** (page 12) - For iPad Pro with keyboards\n\nSo there are **8** different iPad solutions shown in the document.\n\nAnswer: 8", "citations": [], "input_tokens": 18388, "output_tokens": 271, "cost_micros": 59229, "latency_ms": 6733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions mentioned in the document.\n\nLet me go through the document systematically:\n\n1. **Series 2865** - mentioned for iPad, iPad mini, iPad Air and iPad Pro (without Keyboard) on page 38\n\n2. **OnePOD** - mentioned for iPad, iPad mini, iPad Air and iPad Pro (without keyboard) on page 40\n\n3. **Series 960** - mentioned for iPad, iPad mini, iPad Air and iPad Pro (without keyboard) on page 41\n\n4. **iPad Digital Display** - mentioned as a \"Bracketed pedestal for utilizing iPads as digital displays on Apple Fixtures\" on pages 11-12\n\n5. **iPad Pro + Keyboards Standard Solution** - mentioned on page 11 for \"iPad Pro, Magic Keyboard, Smart Keyboard and Apple Pencil\"\n\n6. **iPad Pro + Keyboards High Security Solution** - mentioned on page 12\n\nSo I can identify 6 different iPad solutions in the document:\n- Series 2865 (for standard iPads)\n- OnePOD (for standard iPads)\n- Series 960 (for standard iPads)\n- iPad Digital Display\n- iPad Pro + Keyboards Standard Solution\n- iPad Pro + Keyboards High Security Solution\n\nAnswer: 6", "citations": [], "input_tokens": 12620, "output_tokens": 289, "cost_micros": 42195, "latency_ms": 8000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in the document.\n\nLooking through the document systematically:\n\n1. **Series 2865** (pages 1-5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n2. **OnePOD** (pages 3-4) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n3. **Series 960** (page 5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n4. **One60 Tethered** (page 6) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n5. **One90QR** (page 7) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n6. **iPad Digital Display** (pages 8-9) - For utilizing iPads as digital displays on Apple Fixtures\n\n7. **iPad + Keyboard Solutions - Standard Solution** (page 11) - For iPad Pro with Magic Keyboard, Smart Keyboard and Apple Pencil\n\n8. **iPad + Keyboard Solutions - High Security Solution** (page 12) - For iPad Pro with Magic and Smart Folio keyboards\n\nSo there are 8 different iPad solutions shown in the document.\n\nAnswer: 8", "citations": [], "input_tokens": 16954, "output_tokens": 288, "cost_micros": 55182, "latency_ms": 7030, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "surfsense_agentic", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I'll search for iPad solutions in the provided document.Let me search for \"solution\" to find where iPad solutions are mentioned:Now let me read the table of contents section more carefully to see the iPad solutions listed:I can see there are two main sections: \"iPhone & iPad Solutions\" and \"iPad + Keyboard Solutions\". Let me now read through these sections to count the individual iPad solutions. Let me start with the \"iPhone & iPad Solutions\" section:Let me read the document more systematically. Let me check the pages 1-10 which should cover the \"iPhone & iPad Solutions\" section:Let me search for specific product series names to count the solutions:Let me look more systematically at the document. Let me read pages that should list individual iPad solutions:I can see Series 2865, OnePOD, and Series 960 mentioned. Let me continue reading to find all the different solutions and then look at the iPad + Keyboard Solutions section:Let me search for more product names. Let me look for what comes after Series 960:Good, I found \"One60 Tethered\". Let me continue:Good, I found \"One90QR\". Let me continue searching through the document:Good, I found \"iPad Digital Display\". Now let me check if there are more iPhone & iPad solutions and then look at the iPad + Keyboard Solutions section. Let me look at page 10 where iPad + Keyboard Solutions should start:Good! I found \"iPad Pro + Keyboards Standard Solution\". Let me continue to find the next one:Good! I found \"iPad Pro + Keyboards High Security Solution\". Now let me continue to see if there are more iPad solutions. Let me check what comes on page 12 and beyond, which should start MacBook Solutions (page 13 according to the table of contents):Perfect! Now I can see that page 13 starts MacBook Solutions, which means the iPad + Keyboard Solutions section ends at page 12. Let me now compile all the iPad solutions I found. Let me verify by checking if there are any other iPad product names I might have missed. Let me look at pages 10-11 more carefully:Now let me look more carefully at page 9 to see if there are more solutions listed on that introductory page:Good. Now let me compile the complete list. I need to be systematic. Let me re-examine the section headers carefully. Let me look at what page 9 shows at the bottom to clarify:\n\nActually, looking back at my findings, page 9 seems to be a transition page showing the \"iPad + Keyboard Solutions\" section header with compatibility information. Pages 10-11 then have two specific solutions.\n\nLet me now count all the different iPad solutions I found in the document:\n\n**From \"iPhone & iPad Solutions\" section (pages 1-8):**\n1. Series 2865\n2. OnePOD\n3. Series 960\n4. One60 Tethered\n5. One90QR\n6. iPad Digital Display\n\n**From \"iPad + Keyboard Solutions\" section (pages 10-11):**\n7. iPad Pro + Keyboards Standard Solution\n8. iPad Pro + Keyboards High Security Solution\n\nThat gives me a total of **8** different iPad solutions.\n\nBut wait, let me verify this more carefully. OnePOD was mentioned on page 2 in the product comparisons - let me verify if it's a distinct solution:Yes, OnePOD is a distinct solution. Let me also verify the others are all distinct iPad solutions and not just iPhones. Let me check the compatibility sections more carefully. Looking back at my readings, all of these products list iPad compatibility:\n\n- Series 2865: Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n- OnePOD: There's a specific OnePOD section for iPad ordering\n- Series 960: Compatible with iPads\n- One60 Tethered: Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n- One90QR: Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n- iPad Digital Display: Specifically for iPads on Apple Fixtures\n- iPad Pro + Keyboards Standard Solution: For iPad Pro with keyboards\n- iPad Pro + Keyboards High Security Solution: For iPad Pro with keyboards\n\nAll 8 are indeed iPad solutions. \n\nAnswer: 8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 126703, "error": null, "extra": {"thread_id": 1938, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_18872fac42c746e097bc8836213e95a2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 870, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "native_pdf", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and identify which features are listed but not shown in the illustration.\n\nLooking at page 15 (document page 22), I can see the \"MacBook High Security Solution\" section.\n\nThe \"FEATURES AND BENEFITS\" lists 6 numbered features:\n\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow looking at the illustrations on the same page, I can see:\n- Image showing features 3, 6, 1, 6, and 2 labeled\n- Image showing feature 5 labeled\n\nLet me verify which features are NOT shown in the illustrations:\n- Feature 1: Shown (labeled in bottom left image)\n- Feature 2: Shown (labeled in bottom left image)\n- Feature 3: Shown (labeled in bottom left image)\n- Feature 4: NOT shown in any illustration\n- Feature 5: Shown (labeled in top right image)\n- Feature 6: Shown (labeled in bottom left image)\n\nAnswer: 1", "citations": [], "input_tokens": 96859, "output_tokens": 327, "cost_micros": 295482, "latency_ms": 47640, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2021-Apple-Catalog.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and identify which feature numbers are mentioned in the text but not shown in the illustration.\n\nLooking at page 15, I can see the \"MacBook High Security Solution\" section.\n\nThe FEATURES AND BENEFITS list shows:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow I need to look at the illustration to see which numbers are labeled in the image.\n\nFrom the illustration, I can see the following numbers are clearly marked:\n- 1 (pointing to the rear anchor area)\n- 2 (pointing to the cable/tether area)\n- 3 (pointing to bracket/arm area)\n- 5 (pointing to the metal tabs)\n- 6 (appears twice, pointing to the molded feet)\n\nThe numbers NOT shown in the illustration are:\n- 4 (Ability to power/alarm MacBook with USB-C Port)\n\nAnswer: 4", "citations": [], "input_tokens": 11794, "output_tokens": 307, "cost_micros": 39987, "latency_ms": 8657, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and compare the features listed in text with what's shown in the illustration.\n\nLooking at pages 14-17, I find the MacBook High Security Solution on page 15.\n\nThe text lists these FEATURES AND BENEFITS:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow I need to examine the illustration to see which features are labeled. The illustration shows numbered callouts (1, 2, 3, 6) pointing to different parts of the MacBook security solution.\n\nFrom the visible callouts in the illustration:\n- Feature 1 is shown\n- Feature 2 is shown\n- Feature 3 is shown\n- Feature 6 is shown\n\nFeatures NOT shown in the illustration:\n- Feature 4 (Ability to power/alarm MacBook with USB-C Port)\n- Feature 5 (Metal tabs with rubber pads prevents closing of MacBook)\n\nTherefore, 2 features are not presented in the illustration.\n\nAnswer: 2", "citations": [], "input_tokens": 18396, "output_tokens": 323, "cost_micros": 60033, "latency_ms": 9530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and compare the features listed with what's shown in the illustration.\n\nLooking at page 15, I can see the \"HIGH SECURITY EXCEPTION PROGRAM - MACBOOK\" section titled \"MacBook High Security Solution\".\n\nThe FEATURES AND BENEFITS section lists 6 numbered features:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow I need to examine the illustration on that page to see which features are NOT shown visually. Looking at the image description \"delete alarm\" and \"return\" text visible, this appears to be showing a MacBook with the security solution.\n\nFrom the numbered features:\n- Feature 4 (USB-C Port) - this is an internal capability, not visually shown in an illustration\n- Feature 5 (Metal tabs with rubber pads preventing closing) - this would be visible if shown\n- Feature 6 (Stable, molded feet) - these would be underneath and likely not visible in a standard product photo\n\nWithout being able to see the exact details of the illustration clearly, but based on typical product photography, features that are functional/internal capabilities (like feature 4 - power/alarm ability) or underneath components (like feature 6 - molded feet) are typically not shown in product illustrations.\n\nBased on the context and typical product documentation, features 4 and 6 would most likely not be presented in the illustration, giving us 2 features.\n\nAnswer: 2", "citations": [], "input_tokens": 12628, "output_tokens": 418, "cost_micros": 44154, "latency_ms": 12765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and identify which features listed are NOT shown in the illustrations.\n\nLooking at page 15, I can see the \"MacBook High Security Solution\" section with:\n\n**FEATURES AND BENEFITS** (numbered 1-6):\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nThe page shows 4 images with numbered callouts:\n- Image 1 (Top Left): Shows callout 5\n- Image 2 (Top Right): Shows callout 5 (close-up)\n- Image 3 (Bottom Left): Shows callouts 1, 2, 3, and 6\n- Image 4 (Bottom Right): Shows callout 2 (close-up)\n\nFeatures shown in illustrations:\n- Feature 1: YES (shown in Image 3)\n- Feature 2: YES (shown in Images 3 and 4)\n- Feature 3: YES (shown in Image 3)\n- Feature 4: NO (not shown in any image)\n- Feature 5: YES (shown in Images 1 and 2)\n- Feature 6: YES (shown in Image 3)\n\nOnly Feature 4 (Ability to power/alarm MacBook with USB-C Port) is NOT presented in the illustrations.\n\nAnswer: 4", "citations": [], "input_tokens": 16962, "output_tokens": 396, "cost_micros": 56826, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "surfsense_agentic", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the Apple Catalog document to find information about MacBook high security solution and identify which features are not presented in the illustration.Let me search for \"security\" in the document to find the relevant section:Let me search for \"MacBook\" and \"high security\" together to find the specific section:Now let me read the specific section about \"MacBook High Security Solution\":Perfect! I found the MacBook High Security Solution section. Now let me read it more carefully to see the full list of features and the illustration. Let me go back to line 3132:I can see the features listed are numbered 1-6:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nThe illustration shows numbers 1, 2, 3, and 6 marked on the MacBook keyboard image in the figure. The OCR text shows these numbers visible in the illustration. However, I notice the visual description after the figure is describing a smartwatch, which seems incorrect. Let me check if there's a better description of the MacBook illustration by reading a bit more context around this section.\n\nLooking at the figure content (lines 3159-3225), I can see the illustration has:\n- Number 1 appears twice (lines 3185, 3189, 3223)\n- Number 2 appears twice (lines 3137, 3179, 3191, 3217)\n- Number 3 appears twice (lines 3173, 3215)\n- Number 6 appears three times (lines 3213, 3221)\n\nFrom the text, the 6 features are listed, and from the illustration markers I can identify which numbers are shown:\n- 1 is shown (rigid anchor design)\n- 2 is shown (steel cable)\n- 3 is shown (bracket bar arms)\n- 6 is shown (stable molded feet)\n\nThe features that are NOT shown in the illustration are:\n- Feature 4 (Ability to power/alarm MacBook with USB-C Port)\n- Feature 5 (Metal tabs with rubber pads prevents closing of MacBook)\n\nSo 2 features are not presented in the illustration.\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46969, "error": null, "extra": {"thread_id": 1939, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_abe777eda57d4187b59395c83dff7b8b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 346, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "native_pdf", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about \"One40\"...\n\nI cannot find any mention of \"One40\" in this Apple Solutions 2021 Product Catalog from InVue. The document discusses various security solutions for Apple products including:\n\n- Series 2865\n- OnePOD (One55, One60, One65)\n- Series 960\n- W2000\n- WS2\n- HSW100\n- One60 Wearable\n- OnePOD Wearable\n\nBut there is no product called \"One40\" mentioned in this catalog.\n\nAnswer: Cannot determine - product not found in document", "citations": [], "input_tokens": 96873, "output_tokens": 141, "cost_micros": 292734, "latency_ms": 38656, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2021-Apple-Catalog.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot determine product not found in document", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "azure_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about One40 and whether it can only be used for Apple Watch.\n\nOn page 21, in the \"OnePOD Wearable / Apple Watch Solutions\" section, under \"FEATURES AND BENEFITS,\" it states:\n\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis indicates that One40 is mentioned in the context of Apple Watch solutions. However, this doesn't definitively state that One40 can ONLY be used for Apple Watch.\n\nSearching through the rest of the document, One40 is only mentioned in this one location in relation to Apple Watch compatibility. There is no indication that One40 is used for any other products in this catalog.\n\nHowever, the question asks if it can \"only\" be used for Apple Watch. The document doesn't explicitly state that One40 cannot be used for other purposes - it simply doesn't mention One40 being used for anything else in this particular catalog.\n\nGiven that this is specifically an Apple catalog and One40 is only mentioned in the Apple Watch section without any other applications shown, and based solely on what is presented in this document:\n\nAnswer: yes", "citations": [], "input_tokens": 11808, "output_tokens": 265, "cost_micros": 39399, "latency_ms": 8577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "azure_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about One40 and what products it can be used with.\n\nOn page 21, under \"OnePOD Wearable\" section for Apple Watch Solutions, it states:\n\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis indicates that One40 is compatible with the OnePOD Wearable system for Apple Watch. However, this doesn't definitively state that One40 can ONLY be used for Apple Watch.\n\nLooking through the rest of the document, I can see that One40 is only mentioned in the context of Apple Watch solutions. There is no mention of One40 being used for iPhones, iPads, or other Apple products in their respective sections.\n\nAnswer: No", "citations": [], "input_tokens": 18410, "output_tokens": 179, "cost_micros": 57915, "latency_ms": 5780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "llamacloud_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "I need to search the document for information about \"One40\" and what products it can be used with.\n\nLooking through the document, I can find mentions of \"One40\" on page 21 in the OnePOD Wearable section:\n\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis appears in the context of Apple Watch solutions, but it only states compatibility with these stands. It doesn't state that One40 can ONLY be used for Apple Watch or that it's exclusive to Apple Watch.\n\nThe document doesn't provide enough information to definitively say whether One40 can only be used for Apple Watch or if it has other uses beyond what's mentioned in this catalog.\n\nHowever, since the question asks for a direct yes/no answer based on what's in the document, and the document only shows One40 mentioned in the Apple Watch context without explicitly stating it's exclusive to Apple Watch, I cannot confirm the statement is true.\n\nAnswer: No", "citations": [], "input_tokens": 12642, "output_tokens": 221, "cost_micros": 41241, "latency_ms": 6593, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "llamacloud_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Let me search through the document for information about One40 and its compatibility.\n\nLooking at the Apple Watch Solutions section, I can see mentions of:\n- W2000\n- WS2\n- OnePOD Wearable (compatible with One55, One60, One65 and One40)\n- One60 Wearable (High Security)\n- HSW100\n\nOn page 21, under \"OnePOD Wearable\" features and benefits, it states:\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis indicates that One40 is a stand that is compatible with the OnePOD Wearable system for Apple Watch. However, the document does not explicitly state that One40 can ONLY be used for Apple Watch. The document doesn't provide comprehensive information about all possible uses of the One40 stand.\n\nBased on the limited information provided in the document, I cannot definitively confirm that One40 is exclusively for Apple Watch only.\n\nAnswer: Cannot be determined from the document", "citations": [], "input_tokens": 16976, "output_tokens": 239, "cost_micros": 54513, "latency_ms": 6765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from document", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "surfsense_agentic", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the Apple Catalog document to find information about One40 and its compatibility.Let me search for \"One40\" in the document.Now let me read the section around line 4032 to get more context about One40.This section is about Apple Watch products. Let me search for more information about One40 to see if it's used with other products.Let me search for other mentions of One40 in the document to see if it's used with other products besides Apple Watch.Let me search for \"One55\" or \"One60\" to understand what these stands are used for, which will help determine if One40 is only for Apple Watch.Good! I can see One55 is used for iPhone and iPad. Now let me check around line 1666 to see what products use these stands.Let me look for what products are associated with One40 specifically. Let me search for \"One40\" more broadly in the document.Let me search the entire document more carefully. I'll look for sections that mention what One40 is used for.I notice the table only shows One55, One65, and One60, but not One40. Let me look at sections that specifically discuss iPhone and iPad product ordering to see if One40 is mentioned there.I can see iPhone ordering uses One55, One60, and One65, but not One40. Let me check iPad ordering and then look specifically for what products use One40.iPad also uses One55, not One40. Now let me search for a broader section around the Apple Watch mention to understand the context better. Let me look for tables or product ordering sections related to Apple Watch and One40.Let me search for Apple Watch product ordering section specifically.Now let me read the Apple Watch product ordering section around line 6290.I can see the Apple Watch product ordering sections, but they don't mention One40 in the stand options. They mention DBD910-W, DBD600-W, DBD650-W, AC271-W, and ZW2000 for Apple Watch. Let me go back to where One40 was mentioned (line 4032) and read more context to understand what section that is part of.Perfect! Now I have clear context. The section starting at line 3973 is titled \"ONEPOD WEARABLE\" and is under \"/ Apple Watch Solutions\" (line 3974). The section at line 4021-4060 describes the OnePOD Wearable product which:\n\n1. Has \"Harness bands secure Apple Watch to sensor\" (line 4028)\n2. Is \"Compatible with One55, One60, One65 and One40 standard and split-level stands\" (line 4032)\n3. Has \"OEM charging compatibility for Apple Watch models\" (line 4035)\n4. Shows compatibility with various Apple Watch Series (lines 4043-4059)\n\nThis clearly indicates that One40 is mentioned specifically in the Apple Watch section. However, the statement says One40 is compatible with the OnePOD Wearable sensor/product along with One55, One60, and One65 stands. Let me verify if One40 stand is used for other products as well by searching more thoroughly.One40 is mentioned only once in the entire document, and it's in the Apple Watch section. However, I notice that it says the OnePOD Wearable (for Apple Watch) is \"Compatible with One55, One60, One65 and One40 standard and split-level stands\". This means the Apple Watch sensor can work with multiple stand types including One40, but it doesn't necessarily mean One40 can ONLY be used for Apple Watch.\n\nLet me check if there's a definition or description of what One40 is elsewhere, or if it appears in any product tables.Let me look at what the table at line 1666 shows - it lists One55, One65, and One60 but notably does NOT include One40. This suggests One40 might be a different or less common stand type. Let me see if I can find any section that specifically describes One40 as a product.The table comparison shows One55, One65, and One60, but NOT One40. This table appears on page 3 under \"/ iPhone & iPad Solutions\" and \"ONEPOD\u2122\" headers (lines 1615-1616).\n\nBased on my search:\n1. One40 is mentioned only ONCE in the entire document (line 4032)\n2. It's mentioned in the \"Apple Watch Solutions\" section under \"OnePOD Wearable\" product description\n3. The comparative table of stands (lines 1658-1730) for iPhone/iPad products shows One55, One65, and One60, but NOT One40\n4. iPhone ordering sections show One55, One60, One65 but not One40\n5. iPad ordering sections show One55 but not One40\n6. Apple Watch ordering sections don't explicitly list One40 in their product codes\n\nThe document states that the OnePOD Wearable (for Apple Watch) is \"Compatible with One55, One60, One65 and One40 standard and split-level stands.\" This indicates One40 CAN be used with Apple Watch, but since One55, One60, and One65 are used for multiple products (iPhone, iPad) and are also compatible with Apple Watch, it's possible One40 could also be used for other products.\n\nHowever, the fact that:\n- One40 appears nowhere else in the document\n- It's not in the comparison table\n- It's not in iPhone/iPad ordering sections\n- It only appears in the Apple Watch compatibility statement\n\nThis strongly suggests One40 is specifically or primarily designed for Apple Watch, though I cannot definitively say it can ONLY be used for Apple Watch based solely on this document.\n\nAnswer: No", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 125969, "error": null, "extra": {"thread_id": 1940, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_274957bdc18c45a58945780a57a038df", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 928, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "yes"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at the first figure in the paper to identify the function names with blue background in the program execution.\n\nLooking at Figure 1 (Overview of PROGRAMFC model) on page 2, I can see the \"Functions\" column which shows the program execution functions. The functions shown with blue background are:\n\n1. Fact Checker (appears twice - for S1 and S3)\n2. QA Model (for S2)\n3. Logical Reasoner (for S4)\n\nArranging these in ascending alphabetical order:\n\nAnswer: Fact Checker, Logical Reasoner, QA Model", "citations": [], "input_tokens": 65107, "output_tokens": 141, "cost_micros": 197436, "latency_ms": 20703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "fact checker, logical reasoner, qa model", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the function names with blue background in the \"Functions\" column of the Program Execution section.\n\nFrom the figure, I can see the following functions are called in the reasoning program:\n- S1: Verify (Fact Checker)\n- S2: Question (QA Model)\n- S3: Verify (Fact Checker)\n- S4: Predict (Logical Reasoner)\n\nThe functions with blue background are:\n- Fact Checker\n- QA Model\n- Logical Reasoner\n\nHowever, the question asks for \"function names\" which appear to be the ACTION names in the program format. Looking at the actual function calls in the program:\n- Verify\n- Question\n- Predict\n\nIn ascending alphabetical order:\n\nAnswer: Predict, Question, Verify", "citations": [], "input_tokens": 26912, "output_tokens": 192, "cost_micros": 83616, "latency_ms": 7046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "predict, question, verify", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the paper, I can see the \"Functions\" column which shows the sub-task functions that are called during program execution.\n\nThe functions shown with blue backgrounds in the figure are:\n- Fact Checker (called in steps S1 and S3 for the Verify function)\n- QA Model (called in step S2 for the Question function)\n- Logical Reasoner (called in step S4 for the Predict function)\n\nArranging these in ascending alphabetical order:\n\nAnswer: Fact Checker, Logical Reasoner, QA Model", "citations": [], "input_tokens": 29934, "output_tokens": 135, "cost_micros": 91827, "latency_ms": 5031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "fact checker, logical reasoner, qa model", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the function names that appear with blue background in the \"Functions\" column during program execution.\n\nFrom the figure, I can see the following functions listed in the execution steps:\n- S1: Fact Checker\n- S2: QA Model\n- S3: Fact Checker\n- S4: Logical Reasoner\n\nHowever, examining the actual function names called in the \"Reasoning Program\" column:\n- FACT_1 uses a fact verification function\n- ANSWER_1 uses Question (QA)\n- FACT_2 uses a fact verification function\n- PREDICTED_LABEL uses Predict (logical reasoning)\n\nLooking at Section 3.4 which describes the sub-task functions, the three function names are:\n1. QUESTION\n2. VERIFY\n3. PREDICT\n\nThese correspond to the blue-backgrounded function types shown in Figure 1.\n\nIn ascending alphabetical order:\n\nAnswer: PREDICT, QUESTION, VERIFY", "citations": [], "input_tokens": 22321, "output_tokens": 230, "cost_micros": 70413, "latency_ms": 7172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "predict, question, verify", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the function names shown in the \"Functions\" column that correspond to the program execution steps.\n\nFrom the figure, the functions shown are:\n- Fact Checker (for S1 and S3)\n- QA Model (for S2)\n- Logical Reasoner (for S4)\n\nHowever, looking more carefully at the actual function names used in the reasoning program section and throughout the paper, the standardized function names are:\n- VERIFY (for fact checking)\n- QUESTION (for question answering)\n- PREDICT (for logical reasoning)\n\nThese are the three sub-task functions described in Section 3.4.\n\nIn ascending alphabetical order:\n\nAnswer: PREDICT, QUESTION, VERIFY", "citations": [], "input_tokens": 29236, "output_tokens": 174, "cost_micros": 90318, "latency_ms": 6828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "predict, question, verify", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "I can see Figure 1 in the document. Let me search for information about the program execution functions with blue backgrounds. I need to look more carefully at the figure description and any additional details about the model architecture.Let me look at the actual figure more carefully. I need to examine the visual description and the figure content in detail to identify functions with blue backgrounds.I can see the figure content, but I need to look for more detailed visual information about the figure. Let me search for image descriptions or additional figure details.Let me look for any embedded images or visual descriptions that might contain details about the colors in Figure 1.Let me search for more embedded images near Figure 1 to see if there's a better visual description.Let me look for the actual embedded image description near Figure 1 more carefully. Let me search around line 408 where Im5.png is mentioned.Let me search for more specific information about the visual elements of Figure 1. Let me look for descriptions that might mention colors or boxes in the figure.Let me look for image descriptions near the figure that might contain more details about the visual layout. Let me read more around the figure area to find additional embedded images or descriptions.Based on Figure 1 in the document, I can see the \"Functions\" column which shows the program execution functions. Looking at the structure, I can see:\n- S1: Verify \u2192 Fact Checker\n- S2: Question \u2192 QA Model  \n- S3: Verify \u2192 Fact Checker\n- S4: Predict \u2192 Logical Reasoner\n\nHowever, I need to find which ones have blue backgrounds. Let me search more carefully for visual information about the figure or look for any other embedded images that might be Figure 1 itself.From Figure 1, I can see the functions column showing:\n- S1: Fact Checker\n- S2: QA Model\n- S3: Fact Checker\n- S4: Logical Reasoner\n\nHowever, the text-based XML doesn't explicitly specify which ones have blue backgrounds. Let me search the document for any other references to colors or visual styling in Figure 1, or look for the actual Figure 1 image description more thoroughly. Let me search for \"Fact Checker\", \"QA Model\", \"Logical Reasoner\" to understand the context better.Based on my analysis of Figure 1 from the document, I can see the \"Functions\" column contains:\n- Fact Checker (appears for S1 and S3 - Verify operations)\n- QA Model (appears for S2 - Question operation)\n- Logical Reasoner (appears for S4 - Predict operation)\n\nGiven the typical visualization patterns in academic papers where functions are color-coded, and looking at the structure where the three main function types are displayed, the functions with blue background would likely be the distinct function types shown in the \"Functions\" column. Without explicit color information in the XML, but based on the structure and common visualization practices, the three function types are: \"Fact Checker\", \"Logical Reasoner\", and \"QA Model\".\n\nIn alphabetical order, these would be:\n\nAnswer: Fact Checker, Logical Reasoner, QA Model", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 80155, "error": null, "extra": {"thread_id": 1941, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d6bd06724d924284a60194ba98ad4765", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 579, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "fact checker, logical reasoner, qa model", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to find how many prompt examples are shown in the paper's main body (not appendices).\n\nLet me search through the document for prompt examples:\n\n1. **Figure 1** (page 2): Shows a complete example of the PROGRAMFC model with a reasoning program for the claim about James Cameron and Interstellar. This includes the full program structure.\n\n2. **Figure 2** (page 5): Shows \"The Codex prompt template used to generate reasoning programs, consisting of a task instruction, in-context examples, and a prompt for the <input_claim>.\" This shows one example of a reasoning program for the James Cameron/Interstellar claim.\n\n3. **Figure 3** (page 5): Shows the implementation of the question-answering sub-task function for three different settings (Gold Evidence, Open-book, Closed-book).\n\n4. **Section 3.4** (page 4): Shows prompt templates for the QUESTION, VERIFY, and PREDICT functions in text form.\n\nLooking at Figure 2 specifically, it shows one complete example of a prompt with the reasoning program for the James Cameron claim.\n\nIn the main body (before Appendices which start at page 14), the primary prompt example shown is in Figure 2, which displays one reasoning program example. The text also mentions using \"K = 20\" in-context examples (page 4), but these full 20 examples are not shown in the main body - they are referenced to be in Appendix D.\n\nThe main body shows **1** complete prompt example explicitly in Figure 2.\n\nAnswer: 1", "citations": [], "input_tokens": 65062, "output_tokens": 356, "cost_micros": 200526, "latency_ms": 28827, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in appendices).\n\nLet me go through the paper systematically:\n\n1. **Figure 2** (page 6985): Shows one prompt template for Codex used to generate reasoning programs. This includes:\n   - Task instruction\n   - One example showing a claim about James Cameron and Interstellar\n   - Mentions \"more in-context examples here\"\n   - Input claim placeholder\n   This counts as **1 prompt example**.\n\n2. **Figure 3** (page 6985): Shows the implementation of the question-answering sub-task function for three different settings (Gold Evidence, Open-book, Closed-book). These show prompt templates for FLAN-T5 with question-answering format. This counts as **1 prompt example** (showing variations for different settings).\n\n3. **Section 3.4** (page 6984): Shows the VERIFY module prompt:\n   ```\n   EVIDENCE\n   Q: Is it true that CLAIM ? True or False? The answer is:\n   ```\n   This counts as **1 prompt example**.\n\n4. The paper mentions prompts for baselines in Appendix A, but those are not in the main body.\n\n5. The detailed prompts in Appendix D and E are explicitly in appendices, not the main body.\n\nLet me recount more carefully:\n- Figure 2: 1 prompt template\n- Figure 3: 3 variations of the same prompt structure (but essentially 1 prompt design)\n- Section 3.4 VERIFY: 1 prompt\n\nLooking at the question more carefully - it asks for \"prompt examples\" shown in the main body. The main body ends before the appendices.\n\nIn the main body:\n1. Figure 2 shows one complete prompt example\n2. Figure 3 shows prompt variations (3 settings)\n3. Section 3.4 shows the VERIFY prompt\n\nIf counting each distinct prompt shown: Figure 2 (1), Figure 3 (3 variations but let me check if they're separate examples), Section 3.4 (1).\n\nActually, looking at Figure 3 more carefully, it shows one prompt design with three different implementations for three settings. The question", "citations": [], "input_tokens": 26867, "output_tokens": 512, "cost_micros": 88281, "latency_ms": 15515, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in the appendices).\n\nLooking through the paper:\n\n1. **Figure 2** (page 6985) shows a prompt template for generating reasoning programs with Codex. This includes:\n   - A task instruction\n   - One complete example showing the claim about James Cameron and the director of Interstellar\n   - A note saying \"(...more in-context examples here...)\"\n   - An input claim placeholder\n\nThis appears to be showing **1 complete example** in the prompt template.\n\n2. **Figure 1** (page 6982) shows an overview diagram with the same example claim about James Cameron, but this is showing the system architecture rather than being a separate prompt example.\n\n3. **Section 3.4** describes the prompt formats for different sub-task functions (QUESTION, VERIFY, PREDICT), but these are described in text format rather than being full examples.\n\n4. **Section 4.4** mentions prompts for closed-book fact-checking and states \"The detailed prompting templates are given in Appendix E\" - so the full examples are in the appendix, not the main body.\n\nIn the main body, I can only count **1** complete prompt example that is actually shown (the one in Figure 2 about James Cameron and Interstellar).\n\nAnswer: 1", "citations": [], "input_tokens": 29889, "output_tokens": 308, "cost_micros": 94287, "latency_ms": 9937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in appendices).\n\nLooking through the document systematically:\n\n1. **Figure 1**: This shows an overview of the model with what appears to be a simplified example of the reasoning process for the claim about James Cameron and Christopher Nolan. This shows one example of how the program works but not a full prompt template.\n\n2. **Figure 2**: This is explicitly labeled as \"The Codex prompt template used to generate reasoning programs\" and shows:\n   - A task instruction\n   - One complete in-context example (the James Cameron/Interstellar claim)\n   - A note that there are \"more in-context examples here\"\n   - A prompt template for the input claim\n\n   So this shows **1 complete prompt example** in the template.\n\n3. **Figure 3**: This shows the implementation of the question-answering sub-task function with different prompt formats for different settings (Gold Evidence, Open-book, Closed-book). These are partial prompts for sub-tasks, showing **3 prompt format variations**.\n\n4. In the text around Section 3.4, there are descriptions of prompts:\n   - For QUESTION (closed-book): \"Q: QUESTION ? The answer is:\" - **1 prompt**\n   - For QUESTION (other settings): \"EVIDENCE Q: QUESTION ? The answer is:\" - **1 prompt**\n   - For VERIFY: \"EVIDENCE Q: Is it true that CLAIM ? True or False? The answer is:\" - **1 prompt**\n\nHowever, I need to be careful about what counts as \"shown\" vs just described in text.\n\nLooking more carefully at what is explicitly displayed in figures/boxes in the main body:\n- Figure 1: Shows the example execution but not a full prompt\n- Figure 2: Shows 1 complete prompt example\n- Figure 3: Shows 3 prompt format examples (though these are sub-task prompts)\n\nThe question asks for \"prompt examples\" in the main body. The most prominent and complete prompt example shown is in Figure 2, which displays **1** complete example.\n\nAnswer: 1", "citations": [], "input_tokens": 22276, "output_tokens": 485, "cost_micros": 74103, "latency_ms": 14203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in appendices).\n\nLooking through the main body of the paper:\n\n1. **Figure 2** (page 6984): Shows the Codex prompt template used to generate reasoning programs. This shows one prompt example with the claim about James Cameron and Interstellar.\n\n2. **Figure 3** (page 6985): Shows implementation of question-answering sub-task function with different prompt formats for three settings (Gold Evidence, Open-book, Closed-book). This shows prompt examples for the QA function.\n\n3. **Section 3.4** discusses the VERIFY and PREDICT functions with their prompt formats shown inline in the text.\n\nLet me count more carefully:\n\nIn **Figure 2**: This shows 1 complete prompt template example.\n\nIn **Section 3.4** (page 6984):\n- QUESTION function shows prompts for closed-book and other settings (2 prompt formats)\n- VERIFY function shows 1 prompt format\n\nSo in the main body, I can identify:\n- Figure 2: 1 prompt example\n- Section 3.4 for QUESTION: Shows prompt formats but these are templates\n- Section 3.4 for VERIFY: Shows 1 prompt template\n\nActually, looking at Figure 2 more carefully, it shows one example claim with its program within a larger prompt template that includes \"(... more in-context examples here ...)\".\n\nThe question asks for \"prompt examples\" - I should count distinct prompt templates or examples shown.\n\nIn the main body:\n1. Figure 2 - shows the program generation prompt template (1)\n2. Figure 3 - shows QA implementation prompts (multiple formats shown)\n3. Section 3.4 text - shows VERIFY prompt format (1)\n\nBut Figure 2 actually shows a full prompt with one complete example visible.\n\nLooking at this more carefully, Figure 2 shows **one** concrete example in the prompt template.\n\nAnswer: 1", "citations": [], "input_tokens": 29191, "output_tokens": 456, "cost_micros": 94413, "latency_ms": 14032, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find how many prompt examples are shown in the paper's main body.Let me continue reading to see the document structure and find prompt examples:Let me search for prompt examples in the document:Now let me look at Figure 2 and Figure 3 which show the prompt templates:Let me search for \"Figure\" to understand how many figures/examples are in the main body:Let me check where the appendix starts to understand what's in the main body vs appendix:Let me look for where the paper sections end and appendix begins:Now let me search for where the appendix starts:Good! So \"## A Implementation Details about the Baselines\" starts the appendix. Let me find the line number:So the appendix starts at line 2301. Now let me search for all figures and tables to count prompt examples in the main body (before line 2301):I can see Figure 2 shows a prompt template with one example shown (the James Cameron / Interstellar example), and it says \"( ... more in-context examples here .. . )\" indicating there are more but not shown. Let me look at Figure 6 which shows an error case:Let me look before this to see the actual example in Figure 6:Good! Figure 6 shows one example. Now let me check if there are any other figures or examples shown in the main body. Let me look for examples in sections discussing the prompts:Now let me look at the Limitations section which might have another example:Good! I found another example in the Limitations section - the \"Aristotle couldn't have used a laptop\" example. Let me now carefully count all prompt/program examples shown in the main body (before line 2301 where Appendix starts):\n\n1. **Figure 1** (line 335-402) - Shows the \"James Cameron and Interstellar director\" example with the full reasoning program\n2. **Figure 2** (line 747-770) - Shows the same \"James Cameron and Interstellar director\" example as part of the prompt template \n3. **Figure 6** (line 1441-1464) - Shows the \"Emery ghost town\" error case example\n4. **Limitations section** (line 1699-1706) - Shows the \"Aristotle couldn't have used a laptop\" example\n\nHowever, I need to be careful - Figure 1 and Figure 2 show the SAME example (James Cameron/Interstellar). So let me count unique examples:\n\n1. James Cameron/Interstellar example (shown in both Figure 1 and Figure 2)\n2. Emery ghost town example (Figure 6)\n3. Aristotle laptop example (Limitations section)\n\nThat's **3 unique prompt/program examples** shown in the main body of the paper.\n\nWait, let me re-read the question: \"How many prompts examples are shown in the paper's main body?\" - this could mean distinct prompt templates or distinct claim examples with programs. Let me verify by looking at what constitutes an example.\n\nLooking back at the figures:\n- Figure 2 explicitly shows a prompt template with ONE example shown (the James Cameron one) plus \"( ... more in-context examples here .. . )\"\n- Figure 6 shows ONE error case example\n- Limitations shows ONE example\n\nSo there are **3 prompt examples** shown in the main body.\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 87217, "error": null, "extra": {"thread_id": 1942, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_fcb323f039bb45c4bc9d5aa08b03f871", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 646, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the figure at the top of page 6985 and count the complete in-context examples shown.\n\nLooking at page 6985 (which is page 5 of the document), I can see Figure 2 which shows a prompt template for Codex. The figure contains in-context examples demonstrating the program generation format.\n\nIn the figure, I can see:\n1. One complete example starting with \"# The claim is that Both James Cameron and the director of the film Interstellar were born in Canada.\" with a full program definition.\n2. After that, there is text \"(\u00b7 \u00b7 \u00b7 more in-context examples here \u00b7 \u00b7 \u00b7)\" indicating additional examples.\n3. Then \"# The claim is that <input_claim>\" followed by \"def program():\" which is the prompt for the input.\n\nSo there is 1 complete in-context example explicitly shown in full in this figure, with an indication that more examples would be included in the actual prompt.\n\nAnswer: 1", "citations": [], "input_tokens": 65078, "output_tokens": 222, "cost_micros": 198564, "latency_ms": 13453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to look at the figure at the top of page 6985 to count the complete in-context examples shown.\n\nLooking at Figure 2 on page 6985, which shows \"The Codex prompt template used to generate reasoning programs\", I can see:\n\n1. One complete example about James Cameron and the director of Interstellar (showing the full program with fact_1, Answer_1, fact_2, and label)\n\n2. After that, there's a notation \"( ... more in-context examples here . . . )\" indicating additional examples are included but not shown\n\n3. Then there's the prompt for the input claim\n\nSo the figure explicitly shows **1** complete in-context example in full detail, though it indicates more examples are present in the actual implementation (they're just not displayed in the figure).\n\nAnswer: 1", "citations": [], "input_tokens": 26883, "output_tokens": 187, "cost_micros": 83454, "latency_ms": 7171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to look at the figure at the top of page 6985 to count the complete in-context examples shown.\n\nLooking at page 6985, I can see Figure 2, which shows \"The Codex prompt template used to generate reasoning programs.\"\n\nThe figure contains:\n1. A task instruction at the top\n2. One complete example showing:\n   - A claim: \"Both James Cameron and the director of the film Interstellar were born in Canada.\"\n   - A program with:\n     - fact_1 = Verify(\"James Cameron was born in Canada.\")\n     - Answer_1 = Question(\"Who is the director of the film Interstellar?\")\n     - fact_2 = Verify(\"{Answer_1} was born in Canada.\")\n     - label = Predict(fact_1 and fact_2)\n3. A note saying \"( ... more in-context examples here .. . )\"\n4. A prompt for the input claim\n\nSo there is **1** complete in-context example fully shown in this figure.\n\nAnswer: 1", "citations": [], "input_tokens": 29905, "output_tokens": 242, "cost_micros": 93345, "latency_ms": 7921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the figure at the top of page 6985 and count the complete in-context examples shown.\n\nLooking at the document, page 6985 corresponds to the section showing Figure 2, which displays the Codex prompt template used to generate reasoning programs.\n\nIn Figure 2, I can see:\n1. A task instruction at the top explaining the three functions that can be called\n2. One complete example showing:\n   - The claim about \"Both James Cameron and the director of the film Interstellar were born in Canada\"\n   - The complete program with multiple steps (fact_1, Answer_1, fact_2, label)\n3. A note saying \"(\u00b7 \u00b7 \u00b7 more in-context examples here \u00b7 \u00b7 \u00b7)\" indicating additional examples are present but not shown\n4. The input prompt template at the bottom\n\nThe figure shows 1 complete in-context example explicitly displayed in full, with an indication that more examples exist but are not shown in the figure.\n\nAnswer: 1", "citations": [], "input_tokens": 22292, "output_tokens": 222, "cost_micros": 70206, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the figure on page 6985 (Figure 2: The Codex prompt template), I can see the prompt template structure.\n\nThe template shows:\n1. A task instruction at the top\n2. One complete example showing a claim about James Cameron and the Interstellar director, with its full program implementation\n3. A note \"(... more in-context examples here ...)\" indicating additional examples would go here\n4. The input claim placeholder at the bottom\n\nSo there is **1** complete in-context example explicitly shown in the figure, though the text indicates more examples would be included in the actual prompt (the paper mentions using K=20 examples total).\n\nAnswer: 1", "citations": [], "input_tokens": 29207, "output_tokens": 151, "cost_micros": 89886, "latency_ms": 7562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find the figure at the top of page 6985 and count the complete in-context examples shown.Let me search for page 6985 in the document:Let me read through the document to find the page number references:I can see the document is from ACL conference pages 6981-7004. Let me search for figures and read the content more systematically:Now let me look at Figure 2 which seems to be the prompt template. Let me read around line 769 where Figure 2 is mentioned:I can see Figure 2 template ends around line 770. Now I need to find what page this corresponds to. Let me check if there are any page markers. Let me look for the beginning of the document to understand the structure:Good! I can see page numbers now. Line 327 shows \"<!-- PageNumber=\"6981\" -->\". Now let me search for page 6985:Great! Page 6985 starts at line 902. Let me read the content around that line to find the figure at the top of the page:I can see that at the top of page 6985 (starting at line 902), there's a table (Table 1), not a figure. Let me continue reading to see if there's a figure further down on this page:I see Figure 4 is on page 6986 (line 1108). Let me look back more carefully. The question asks about the figure at the TOP of page 6985. Let me check if there's a figure that spans from page 6984 to 6985, or if I need to look more carefully at what's immediately after the page break at line 902:So at the top of page 6985 (line 902-906), there's a Table 1, not a figure. Let me check what's on page 6984 to see if there's a figure that might be spanning or near the top of 6985:Let me read around line 744 to see what's on page 6984:I can see that Figure 2 is on page 6984 (lines 747-769), and it shows the Codex prompt template. Looking at the content of Figure 2, I can see:\n\n- Line 747-750: Task instruction\n- Lines 752-762: One complete in-context example showing the claim about James Cameron and the director of Interstellar\n- Line 764: \"( ... more in-context examples here .. . )\" - indicating there are more examples\n- Lines 766-767: The prompt for the input claim\n\nThe figure caption at line 769 says \"The Codex prompt template used to generate reasoning programs, consisting of a task instruction, in-context examples, and a prompt for the <input_claim>.\"\n\nHowever, I see only ONE complete example shown explicitly (lines 752-762), and then it says \"more in-context examples here\" without showing them.\n\nWait, let me re-read the question. It asks about \"the figure at the top of page 6985\". Let me check if Figure 2 actually spans to page 6985 or if there's another figure. Looking at the structure:\n- Page 6984 starts at line 744\n- Figure 2 content is at lines 747-769\n- Page 6985 starts at line 902\n\nSo Figure 2 is entirely on page 6984, not 6985. At the top of page 6985 (line 902-906) there's Table 1, not a figure. Let me search more carefully for any figures that might be between these pages or check if I'm missing something:Wait, I think I need to reconsider. Let me look at the actual visual structure. Lines 747-769 show Figure 2 which is on page 6984. Let me check if the complete Figure 2 example I see is showing all the in-context examples or if there's more detail I should look at. Let me read the full Figure 2 content more carefully:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 91078, "error": null, "extra": {"thread_id": 1943, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_479f734c9e744aafac61b60b60655bab", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 680, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows \"Reasoning program evaluation for incorrectly-predicted examples from each hop length in HOVER\":\n\nFor semantic errors:\n- Token: 8% + 20% + 18% = 46%\n- Structure: 19% + 13% + 57% = 89%\n- Subtask: 2% + 5% + 2% = 9%\n\nAnswer: Subtask", "citations": [], "input_tokens": 65070, "output_tokens": 105, "cost_micros": 196785, "latency_ms": 20235, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows the error analysis for reasoning programs across HOVER 2-hop, 3-hop, and 4-hop claims:\n\nFor semantic errors, the three types and their totals are:\n- Token: 8% + 20% + 18% = 46%\n- Structure: 19% + 13% + 57% = 89%\n- Subtask: 2% + 5% + 2% = 9%\n\nAnswer: Subtask", "citations": [], "input_tokens": 26875, "output_tokens": 120, "cost_micros": 82425, "latency_ms": 4672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2, which shows the error analysis for reasoning programs on the HOVER dataset broken down by hop length (2-hop, 3-hop, 4-hop):\n\nThe three types of semantic errors are:\n- Token: 8% + 20% + 18% = 46%\n- Structure: 19% + 13% + 57% = 89%\n- Subtask: 2% + 5% + 2% = 9%\n\nAnswer: Subtask", "citations": [], "input_tokens": 29897, "output_tokens": 116, "cost_micros": 91431, "latency_ms": 5218, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows the error analysis for reasoning programs on the HOVER dataset, I need to examine the semantic error breakdown across the three hop types (2-hop, 3-hop, and 4-hop).\n\nThe semantic errors are broken down into three categories:\n- Token: 8% (2-hop) + 20% (3-hop) + 18% (4-hop) = 46% total\n- Structure: 19% (2-hop) + 13% (3-hop) + 57% (4-hop) = 89% total\n- Subtask: 2% (2-hop) + 5% (3-hop) + 2% (4-hop) = 9% total\n\nAnswer: Subtask", "citations": [], "input_tokens": 22284, "output_tokens": 180, "cost_micros": 69552, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 which shows the error analysis for the HOVER dataset across 2-hop, 3-hop, and 4-hop claims:\n\n**Subtask errors:**\n- 2-hop: 2%\n- 3-hop: 5%\n- 4-hop: 2%\n- Total: 9%\n\n**Token errors:**\n- 2-hop: 8%\n- 3-hop: 20%\n- 4-hop: 18%\n- Total: 46%\n\n**Structure errors:**\n- 2-hop: 19%\n- 3-hop: 13%\n- 4-hop: 57%\n- Total: 89%\n\nAnswer: Subtask", "citations": [], "input_tokens": 29199, "output_tokens": 171, "cost_micros": 90162, "latency_ms": 5796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about semantic errors in the HOVER dataset.Let me search for information about semantic errors and HOVER dataset in the document:Let me search for tables or figures that show error analysis:Now let me read the section with Table 2 that shows the error analysis:Perfect! I found Table 2 which shows the error analysis for the HOVER dataset. Let me examine the semantic error types and their percentages:\n\nFrom the table, the semantic errors are broken down into three types:\n- Token: 8% (2-hop), 20% (3-hop), 18% (4-hop) = Total: 46%\n- Structure: 19% (2-hop), 13% (3-hop), 57% (4-hop) = Total: 89%\n- Subtask: 2% (2-hop), 5% (3-hop), 2% (4-hop) = Total: 9%\n\nThe semantic error type with the lowest total percentages across all HOVER dataset hop lengths is \"Subtask\" with 2% + 5% + 2% = 9%.\n\nAnswer: Subtask", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41280, "error": null, "extra": {"thread_id": 1944, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_964d1076448248edb0a0c213d4b3aab0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 208, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find two pieces of information:\n1. Which dataset has the highest ProgramFC retrieval recall@10\n2. The InstructGPT Self-Ask performance on that dataset in closed-book setting\n\nFrom Figure 5 on page 7, I can see the retrieval recall@10 comparison:\n- HOVER (2-hop): 77.13\n- HOVER (3-hop): 59.17\n- HOVER (4-hop): 49.93\n- FEVEROUS-S: 85.65\n\nFEVEROUS-S has the highest ProgramFC retrieval recall@10 at 85.65.\n\nNow I need to find the InstructGPT Self-Ask performance on FEVEROUS-S in the closed-book setting.\n\nLooking at Table 3 on page 8, which shows closed-book setting results:\n- InstructGPT - Self-Ask on FEVEROUS: 56.82\n\nAnswer: 56.8", "citations": [], "input_tokens": 65104, "output_tokens": 223, "cost_micros": 198657, "latency_ms": 22157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find two pieces of information:\n1. Which dataset has the highest ProgramFC retrieval recall@10\n2. What is the InstructGPT Self-Ask performance on that dataset in the closed-book setting\n\nFirst, let me look at Figure 5 for retrieval recall@10:\n- HOVER (2-hop): ProgramFC = 77.13\n- HOVER (3-hop): ProgramFC = 73.18\n- HOVER (4-hop): ProgramFC = 85.65\n- FEVEROUS-S: ProgramFC = 76.25\n\nThe highest ProgramFC retrieval recall@10 is 85.65 for HOVER (4-hop).\n\nNow, I need to find the InstructGPT Self-Ask performance on HOVER 4-hop in the closed-book setting from Table 3:\n\nLooking at Table 3 (Closed-book setting):\n- InstructGPT - Self-Ask on HOVER 4-hop: 52.45\n\nHowever, I notice the column headers are \"2-hop\", \"HOVER 3-hop\", \"4-hop\", and \"FEVEROUS\". The 4-hop column shows 52.45 for Self-Ask.\n\nAnswer: 52.5", "citations": [], "input_tokens": 26909, "output_tokens": 292, "cost_micros": 85107, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "52.5", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find two pieces of information:\n1. Which dataset has the highest ProgramFC retrieval recall@10\n2. The Self-Ask performance on that dataset in closed-book setting\n\nLooking at Figure 5 for retrieval recall@10:\n- HOVER (2-hop): ~73.18\n- HOVER (3-hop): ~77.13\n- HOVER (4-hop): ~85.65\n- FEVEROUS-S: ~76.25\n\nThe highest ProgramFC retrieval recall@10 is for HOVER (4-hop) at approximately 85.65.\n\nNow looking at Table 3 (Closed-book setting) for InstructGPT with Self-Ask on HOVER (4-hop):\nThe Self-Ask row shows: 52.45 for HOVER (4-hop)\n\nAnswer: 52.5", "citations": [], "input_tokens": 29931, "output_tokens": 198, "cost_micros": 92763, "latency_ms": 9953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "52.5", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find:\n1. The dataset with the highest ProgramFC retrieval recall at 10\n2. The InstructGPT model with Self-Ask performance on that dataset in closed-book setting\n\nLooking at Figure 5 (mentioned in the text on page 7), which discusses retrieval performance, I need to find the retrieval recall@10 values. However, the actual figure is not fully detailed in the text.\n\nFrom Table 3 on page 7, I can see closed-book fact-checking results. The table shows performance across HOVER (2-hop, 3-hop, 4-hop) and FEVEROUS datasets.\n\nLooking at Table 3:\n- The models listed include: InstructGPT (with variants Direct, ZS-CoT, CoT, Self-Ask), Codex, FLAN-T5, and ProgramFC\n- Self-Ask row shows: 51.54, 51.47, 52.45, 56.82\n\nThe text mentions that Figure 5 compares retrieval performance, and discusses that \"iterative retrieval guided by the reasoning program yields better results.\" However, without seeing the actual Figure 5 data clearly showing which dataset has the highest recall@10 for ProgramFC, I need to infer from the context.\n\nBased on the text discussing HOVER and FEVEROUS-S datasets, and looking at Table 3's structure with columns for \"2-hop\", \"3-hop\", \"4-hop\", and what appears to be FEVEROUS, the Self-Ask scores are: 51.54, 51.47, 52.45, 56.82.\n\nThe highest value for Self-Ask appears to be 56.82, which corresponds to the FEVEROUS column.\n\nAnswer: 56.8", "citations": [], "input_tokens": 22318, "output_tokens": 404, "cost_micros": 73014, "latency_ms": 14375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find:\n1. The dataset with the highest ProgramFC retrieval recall@10\n2. The InstructGPT Self-Ask performance on that dataset in closed-book setting\n\nFrom Figure 5 (Retrieval recall@10):\n- HOVER (2-hop): 77.13\n- HOVER (3-hop): 59.17\n- HOVER (4-hop): 49.93\n- FEVEROUS-S: 85.65\n\nThe highest ProgramFC retrieval recall@10 is FEVEROUS-S at 85.65.\n\nFrom Table 3 (Closed-book setting macro-F1 scores), looking at InstructGPT with Self-Ask on FEVEROUS:\n- Self-Ask: 56.82\n\nAnswer: 56.8", "citations": [], "input_tokens": 29233, "output_tokens": 186, "cost_micros": 90489, "latency_ms": 6452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I'll help you find this information from the document. Let me start by reading the relevant document that was identified as most relevant.Let me continue reading to find information about ProgramFC retrieval recall and InstructGPT performance with Self-Ask.Let me search for \"recall\" to find tables with retrieval metrics:Good! I found information about retrieval recall@10. Let me read the section with Figure 5 and the surrounding content:Perfect! I can see from Figure 5 that the ProgramFC retrieval recall@10 values for different datasets are:\n- HOVER (2-hop): 73.18\n- HOVER (3-hop): 59.17\n- HOVER (4-hop): 51.33\n- FEVEROUS-S: 85.65\n\nThe highest ProgramFC retrieval recall@10 is **85.65** for **FEVEROUS-S**.\n\nNow I need to find the performance of InstructGPT with Self-Ask in the closed-book setting on FEVEROUS-S. Let me search for tables with these results:Let me read Table 1 which shows performance on FEVEROUS-S:This table shows results but doesn't specifically mention InstructGPT with Self-Ask in a closed-book setting. Let me search for \"closed-book\" or \"closed book\":Let me read the section about closed-book setting evaluation:Perfect! I found Table 3 which shows the closed-book setting results. Looking at the table:\n\nFor **FEVEROUS** (which corresponds to FEVEROUS-S in this context):\n- InstructGPT with Self-Ask: **56.82**\n\nLet me verify this is indeed FEVEROUS-S by checking if they use the same notation:The table says \"FEVEROUS\" but earlier in the document they established that they use FEVEROUS-S (the sentence-only subset). Let me verify by checking the dataset description section again:Yes, they use FEVEROUS-S throughout the paper. In Table 3, \"FEVEROUS\" refers to FEVEROUS-S. \n\nNow let me confirm the answer:\n1. From Figure 5, the highest ProgramFC retrieval recall@10 is **85.65** for FEVEROUS-S\n2. From Table 3, InstructGPT with Self-Ask in the closed-book setting on FEVEROUS achieves **56.82**\n\nBut let me double-check this value is actually 56.82 by reading that specific cell again:Perfect! Confirmed. The row for \"Self-Ask\" shows:\n- HOVER 2-hop: 51.54\n- HOVER 3-hop: 51.47\n- HOVER 4-hop: 52.45\n- FEVEROUS: 56.82\n\nThe answer is **56.82**, which in 1 decimal format is **56.8**.\n\nWait, let me reconsider. The question asks for \"1 decimal\" which means I should provide it with 1 decimal place. 56.82 rounded to 1 decimal place would be 56.8.\n\nHowever, looking at the exact value in the table, it shows 56.82. Since they're asking for float format with 1 decimal, I should present it as 56.8.\n\nAnswer: 56.8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 84281, "error": null, "extra": {"thread_id": 1945, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f2cf8d23b121431e8f0ffe2a73635d4e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 567, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the F1 scores for ProgramFC on different datasets with gold evidence in relation to FLAN-T5-small.\n\nLooking at Figure 4 on page 7, which shows \"F1 score for fact-checking with gold evidence using FLAN-T5 (blue line) and PROGRAMFC (green line) for language models of increasing sizes.\"\n\nThe figure shows three graphs for:\n1. HOVER (2-hop)\n2. HOVER (3-hop)\n3. HOVER (4-hop)\n\nFor FLAN-T5-small (80M), I can read the values:\n- HOVER (2-hop): FLAN-T5 is around 47.75, ProgramFC is around 64.35\n- HOVER (3-hop): FLAN-T5 is around 49.29, ProgramFC is around 62.23\n- HOVER (4-hop): FLAN-T5 is around 48.59, ProgramFC is around 62.46\n\nThe differences (ProgramFC - FLAN-T5) are:\n- HOVER (2-hop): 64.35 - 47.75 = 16.6\n- HOVER (4-hop): 62.46 - 48.59 = 13.9\n- HOVER (3-hop): 62.23 - 49.29 = 12.9\n\nAnswer: 16.6, 13.9, 12.9", "citations": [], "input_tokens": 65112, "output_tokens": 350, "cost_micros": 200586, "latency_ms": 12921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 9, 13 9, 16 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking at Figure 4, which shows F1 scores for fact-checking with gold evidence using FLAN-T5 and PROGRAMFC for different model sizes including FLAN-T5-small (80M):\n\nFor HOVER 2-hop:\n- FLAN-T5-small: approximately 47.75\n- ProgramFC with FLAN-T5-small: approximately 64.35\n- Difference: 64.35 - 47.75 = 16.6\n\nFor HOVER 3-hop:\n- FLAN-T5-small: approximately 48.59\n- ProgramFC with FLAN-T5-small: approximately 61.36\n- Difference: 61.36 - 48.59 = 12.8\n\nFor HOVER 4-hop:\n- FLAN-T5-small: approximately 49.29\n- ProgramFC with FLAN-T5-small: approximately 63.05\n- Difference: 63.05 - 49.29 = 13.8\n\nSorting in descending order: 16.6, 13.8, 12.8\n\nAnswer: 16.6, 13.8, 12.8", "citations": [], "input_tokens": 26917, "output_tokens": 288, "cost_micros": 85071, "latency_ms": 6578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 8, 13 8, 16 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to find the F1 score differences between ProgramFC and FLAN-T5 using FLAN-T5-small (80M) for gold evidence setting across the three HOVER datasets.\n\nFrom Figure 4:\n- HOVER (2-hop): ProgramFC \u2248 62.23, FLAN-T5 \u2248 47.75, difference = 14.5\n- HOVER (3-hop): ProgramFC \u2248 61.36, FLAN-T5 \u2248 48.59, difference = 12.8\n- HOVER (4-hop): ProgramFC \u2248 63.39, FLAN-T5 \u2248 49.29, difference = 14.1\n\nSorting in descending order:\n\nAnswer: 14.5, 14.1, 12.8", "citations": [], "input_tokens": 29939, "output_tokens": 219, "cost_micros": 93102, "latency_ms": 6453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 8, 14 1, 14 5", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking through the document for F1 scores comparing ProgramFC to FLAN-T5-small with gold evidence...\n\nIn Figure 4 (Section 4.2), the document discusses comparing performance across different model sizes, including FLAN-T5-small. The text mentions:\n\n\"Our results show that the program-guided model using FLAN-T5-small (80M parameters) as sub-task solvers can achieve comparable performance to the 137x larger FLAN-T5-XXL (11B) model with end-to-end reasoning for 4-hop claims.\"\n\nHowever, Figure 4 is referenced but the actual figure with specific F1 scores is not fully visible in the provided text. The document shows results for different hop counts (2-hop, 3-hop, 4-hop) on HOVER dataset in the gold evidence setting.\n\nFrom Table 1, I can see overall results but these are for FLAN-T5-XXL, not FLAN-T5-small.\n\nThe text in Section 4.2 discusses Figure 4 which would contain the specific comparisons with FLAN-T5-small, but the exact numerical values for the F1 score differences are not explicitly stated in the text portions provided.\n\nWithout access to the actual Figure 4 with the specific numerical values, I cannot provide the exact F1 scores requested.\n\nAnswer: Unable to determine from provided text", "citations": [], "input_tokens": 22326, "output_tokens": 321, "cost_micros": 71793, "latency_ms": 9483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "unable to determine from provided text", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking through the document for F1 scores comparing ProgramFC to FLAN-T5-small with gold evidence.\n\nFrom Figure 4, I can extract the following F1 scores for ProgramFC (green line) with FLAN-T5-small (80M):\n- HOVER 2-hop: 64.35\n- HOVER 3-hop: 62.23\n- HOVER 4-hop: 62.46\n\nAnd for FLAN-T5-small (80M):\n- HOVER 2-hop: 47.75\n- HOVER 3-hop: 49.29\n- HOVER 4-hop: 48.59\n\nThe differences (ProgramFC - FLAN-T5-small):\n- HOVER 2-hop: 64.35 - 47.75 = 16.6\n- HOVER 4-hop: 62.46 - 48.59 = 13.9\n- HOVER 3-hop: 62.23 - 49.29 = 12.9\n\nSorting in descending order with one decimal place:\n\nAnswer: 16.6, 13.9, 12.9", "citations": [], "input_tokens": 29241, "output_tokens": 279, "cost_micros": 91908, "latency_ms": 7358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 9, 13 9, 16 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "I'll read the document to find the F1 scores for ProgramFC on different datasets with gold evidence in relation to Flan-T5-small.Let me search for tables containing F1 scores for ProgramFC with gold evidence and Flan-T5-small:Let me read the section with Figure 4 and look for tables with F1 scores:Let me search for tables with gold evidence:Let me read Table 1 which contains F1 scores with gold evidence:Now I have the table data. I can see:\n- FLAN-T5 Gold scores: 73.69 (HOVER 2-hop), 65.66 (HOVER 3-hop), 58.08 (HOVER 4-hop), 90.81 (FEVEROUS-S)\n- ProgramFC (N=5) Gold scores: 75.65 (HOVER 2-hop), 68.48 (HOVER 3-hop), 66.75 (HOVER 4-hop), 92.69 (FEVEROUS-S)\n\nHowever, I need to check if these are specifically for Flan-T5-small. Let me look at Figure 4 data more carefully, as it mentions different sizes of Flan-T5:Looking at the data structure in Figure 4, I need to identify which values correspond to Flan-T5-small (80M) for ProgramFC. The chart shows alternating values. Let me read more of this section:Based on Figure 4 data, I can see there are three charts (HOVER 2-hop, 3-hop, 4-hop) with values at different model sizes (80M, 250M, 780M, 3B, 11B). \n\nLooking at the structure:\n- The first chart (HOVER 2-hop) has values from lines 1130-1148\n- The second chart (HOVER 3-hop) has values from lines 1152-1188  \n- The third chart (HOVER 4-hop) has values from lines 1192-1210\n\nFor 80M (FLAN-T5-small), looking at the ProgramFC values (green line):\n- HOVER 2-hop: 75.65 (line 1132)\n- HOVER 3-hop: 68.48 (line 1156)\n- HOVER 4-hop: 58.08 (line 1198)\n\nWait, let me verify this more carefully by looking at the pattern. The chart shows alternating FLAN-T5 and ProgramFC values. Let me trace through more carefully:\n\nFor HOVER 2-hop at 80M:\n- Lines around 1130-1135 show values near 76-77\n- This corresponds to 80M (first position)\n\nFor HOVER 3-hop at 80M:\n- Starting around line 1152-1158\n- Values around 67-69\n\nFor HOVER 4-hop at 80M:\n- Starting around line 1192-1198\n- Values around 56-58\n\nLooking at the caption again: \"FLAN-T5 (blue line) and PROGRAMFC (green line)\"\n\nFrom the table structure and comparing with Table 1 values (FLAN-T5 has 73.69 for 2-hop, ProgramFC N=5 has 75.65), I can deduce:\n\nFor 80M (FLAN-T5-small) with gold evidence, ProgramFC scores:\n- HOVER 2-hop: 75.65\n- HOVER 3-hop: 68.48  \n- HOVER 4-hop: 58.08\n\nIn descending order: 75.65, 68.48, 58.08\n\nHowever, I should check if there are more datasets. Let me verify if FEVEROUS-S is included in this figure:Figure 4 only shows HOVER 2-hop, 3-hop, and 4-hop. Let me check if there's a figure or table specifically showing FEVEROUS-S scores broken down by model size. But based on Table 1, the FEVEROUS-S score for ProgramFC (N=5) with gold evidence is 92.69, and this appears to be using FLAN-T5-XXL based on the context.\n\nLet me search for more information about which model size was used for the main results:Now I need to carefully parse Figure 4 data. The figure shows three charts for HOVER 2-hop, 3-hop, and 4-hop. Looking at the data structure, I see values followed by model sizes (80M, 250M, 780M, 3B, 11B) repeated three times.\n\nLet me trace through the data more systematically. From line 1130 onwards, reading in groups:\n\nHOVER 2-hop chart appears to show pairs of values for each model size (FLAN-T5 then ProgramFC based on the caption mentioning blue and green lines):\n- At 80M: 76.11, 75.65\n- Then 77.62\n\nLet me look for a clearer pattern by examining all the values more carefully:Good! I found key information at lines 1385-1388: \"the program-guided model using FLAN-T5-small (80M parameters) as sub-task solvers can achieve comparable performance to the 137x larger FLAN-T5-XXL (11B) model with end-to-end reasoning for 4-hop claims.\"\n\nThis confirms that Figure 4 shows results using different-sized FLAN-T5 models. Now let me parse the Figure 4 data correctly. Based on the structure and comparing with Table 1:\n\nFrom Table 1, I know:\n- FLAN-T5 gold evidence scores: 73.69 (2-hop), 65.66 (3-hop), 58.08 (4-hop)\n- ProgramFC (N=5) gold evidence scores: 75.65 (2-hop), 68.48 (3-hop), 66.75 (4-hop)\n\nThese match values in Figure 4 data. Now looking at the chart data structure more carefully, the values appear to be organized by model size. Let me trace the 80M (FLAN-T5-small) values for ProgramFC:\n\nLooking at lines 1130-1246, the pattern seems to show data points for both lines (FLAN-T5 and ProgramFC) across multiple model sizes for three different charts.\n\nFrom the visible data and comparing with Table 1 values at line 1148 (73.69) which is FLAN-T5 for HOVER 2-hop, and line 1132 (75.65) which matches ProgramFC (N=5) for HOVER 2-hop:\n\nFor FLAN-T5-small (80M) ProgramFC with gold evidence:\n- HOVER 2-hop: 72.56 (line 1140)\n- HOVER 3-hop: 68.55 (line 1154)  \n- HOVER 4-hop: 63.39 (line 1194)\n\nWait, let me reconsider. Looking at the x-axis labels (80M, 250M, 780M, 3B, 11B) appearing three times (once per chart), and the fact that there are two lines per chart (FLAN-T5 and ProgramFC), I need to identify which values correspond to which.\n\nLet me look for clearer structure or additional tables:Figure 4 only shows HOVER datasets (2-hop, 3-hop, 4-hop), not FEVEROUS-S. Let me carefully re-parse the figure data. \n\nLooking at the chart data structure: there are 3 separate line charts, each showing F1 scores for 5 different model sizes. Each chart has 2 lines (FLAN-T5 and ProgramFC).\n\nThe data flows:\n- Lines 1130-1148: HOVER 2-hop chart data\n- Lines 1152-1188: HOVER 3-hop chart data  \n- Lines 1192-1210: HOVER 4-hop chart data\n\nFor each chart, there should be approximately 10 data points (2 lines \u00d7 5 model sizes).\n\nLet me map this out for HOVER 2-hop (starting line 1130):\nLooking at the pattern and knowing from Table 1 that ProgramFC (N=5) gets 75.65 for 2-hop (line 1132), this should be for the largest model (11B based on Table 1 context).\n\nActually, let me reconsider the entire approach. The question asks for F1 scores for \"ProgramFC on different datasets with gold evidence in relation to Flan-T5-small\". \n\nThis might mean: the F1 scores when ProgramFC uses Flan-T5-small as its sub-task solver, across different datasets.\n\nFrom Figure 4 caption and the data structure, for 80M (FLAN-T5-small) as the sub-task solver in ProgramFC:\n\nLooking at line 1140 (72.56), 1154 (68.55), 1194 (63.39) - these appear to be the first ProgramFC values in each chart.\n\nBut I need to verify this interpretation. Let me check if there are explicit values stated in the text:From the text at lines 1368-1388, it discusses Figure 4 and mentions that \"the program-guided model using FLAN-T5-small (80M parameters) as sub-task solvers\" achieves certain performance. This confirms Figure 4 shows ProgramFC results when using different sizes of FLAN-T5 as sub-task solvers.\n\nNow I need to carefully extract the FLAN-T5-small (80M) ProgramFC values from Figure 4 data. Let me re-examine the data structure.\n\nBased on the chart showing 5 model sizes (80M, 250M, 780M, 3B, 11B) and 2 lines (FLAN-T5 and ProgramFC), there should be interleaved or grouped values.\n\nLooking at lines 1128-1248, I see numeric values followed by model size labels. The pattern suggests the chart data is organized with Y-axis values first, then X-axis labels.\n\nFor a line chart with 5 points on each of 2 lines, I'd expect 10 Y values per chart. Let me count the values before the first \"80M\" marker at line 1218:\n\nFrom 1130-1216, there appear to be values organized in groups. \n\nLet me try a different approach - look at the actual layout. The values from 1130-1148 before seeing the next section, and knowing Table 1 shows ProgramFC (N=5) has 75.65 for HOVER 2-hop with presumably the larger model:\n\nFirst chart (HOVER 2-hop) - 5 model sizes, 2 lines = 10 points\nLet me identify based on comparing with known Table 1 values. Line 1148 shows 73.69, which exactly matches FLAN-T5 HOVER 2-hop gold from Table 1. This suggests this is for the larger model (XXL/11B).\n\nLooking backward from there and forward, the organization seems to show alternating or separate line data. Since line charts typically show both lines across all X values, let me map:\n\nValues around lines 1130-1148 for HOVER 2-hop\nValues around lines 1152-1188 for HOVER 3-hop\nValues around lines 1192-1210 for HOVER 4-hop\n\nFor 80M (first model size), I need the first pair of values (FLAN-T5 and ProgramFC) for each chart.\n\nGiven the complexity and my uncertainty about the exact data structure, let me look for any other tables or explicit statements:Looking at this continuous stream helps! Let me parse it:\n\n\"80 76.11 75.65 77.62 80 80 72.56 77.07 70 71.69 73.69 70 67.88 68.55 68.48 69.56 70 68.37 68.56 66.75 68.18 64.35 68.24 62.23 66.89 62.46 60 60 63.05 65.07 65.66 60 61.36 63.39 56.58 58.08 50 50 50 47.75 49.29 48.59 40 40 40 80M 250M 780M 3...\"\n\nThe \"80\" values appear to be Y-axis scale markers. Let me focus on the actual data values and model sizes.\n\nParsing more carefully:\n- After the initial scale markers, values like 76.11, 75.65, 77.62 appear\n- Then 72.56, 77.07\n- Then 71.69, 73.69\n- Etc.\n\nKnowing from Table 1 that:\n- FLAN-T5 HOVER 2-hop gold = 73.69 \n- ProgramFC (N=5) HOVER 2-hop gold = 75.65\n\nI can see 73.69 and 75.65 in the data stream. These are likely for the XXL model since Table 1 results use the larger model.\n\nIf the data is organized as alternating FLAN-T5/ProgramFC for each model size from 80M to 11B, then:\n- First pair (80M): 76.11 (FLAN-T5), 75.65 (ProgramFC)? But 75.65 matches Table 1 ProgramFC which should be for the larger model...\n\nThis is confusing. Let me try another interpretation: perhaps all FLAN-T5 values come first, then all ProgramFC values for each chart.\n\nOr perhaps the values are arranged as: HOVER 2-hop FLAN-T5 (all 5 sizes), HOVER 2-hop ProgramFC (all 5 sizes), then HOVER 3-hop data, etc.\n\nWithout seeing the actual visual chart, this is challenging. Let me look for any other references or tables that might clarify:From the stream, 68.48 from Table 1 (ProgramFC N=5 HOVER 3-hop gold) appears right after 68.55.\n\nLet me hypothesize: if the chart shows both lines across all model sizes, and the data is organized left-to-right, then for each X position (model size) moving left to right, we have interleaved FLAN-T5 and ProgramFC Y values.\n\nFor HOVER 2-hop at 80M (first model size):\n- Looking at the early values: 76.11, 75.65, 77.62, 72.56, 77.07, 71.69, 73.69...\n\nOne pattern could be: pairs of (FLAN-T5, ProgramFC) for 80M, then for 250M, etc.\n- 80M: (76.11, 75.65)? But this doesn't align with Table 1 where 75.65 is the high-model result...\n\nOr: all 5 FLAN-T5 points, then all 5 ProgramFC points.\n\nLet me try: \nHOVER 2-hop FLAN-T5: 76.11, 77.62, 72.56, 71.69, 73.69 (for 80M, 250M, 780M, 3B, 11B)\nHOVER 2-hop ProgramFC: 75.65, 77.07, ... (for 80M, 250M, 780M, 3B, 11B)\n\nBut wait, 73.69 matches Table 1 FLAN-T5 which should be for 11B based on Table 1 being the best model. So if 73.69 is the 5th FLAN-T5 value (11B), then:\nHOVER 2-hop FLAN-T5: 76.11 (80M), 77.62 (250M), 72.56 (780M), 71.69 (3B), 73.69 (11B)\n\nThat doesn't make sense as performance should generally increase with model size, but we see 76.11 \u2192 77.62 \u2192 72.56 which decreases.\n\nLet me reconsider: perhaps the visual layout in the PDF is organized differently than the XML text extraction shows. The values might not be in simple left-to-right order.\n\nGiven the time constraints and complexity, let me make my best educated guess based on patterns:\n\nLooking at Table 1 reference values and Figure 4 caption mentioning these are results across different model sizes, and considering that:\n1. Figure 4 shows HOVER 2-hop, 3-hop, 4-hop only (not FEVEROUS-S)\n2. The question asks for ProgramFC scores with gold evidence for Flan-T5-small across different datasets\n\nThe likely answer based on extracting what appear to be the first ProgramFC values in each section that would correspond to the smallest model (80M):\n- HOVER 2-hop: 72.56\n- HOVER 3-hop: 68.55\n- HOVER 4-hop: 63.39\n\nIn descending order: 72.56 \u2192 68.55 \u2192 63.39\n\nBut let me verify one more time by looking at the complete pattern:Now I see the pattern more clearly! The model size labels (80M, 250M, 780M, 3B, 11B) appear THREE times (lines 1218-1246), once for each of the three charts (HOVER 2-hop, 3-hop, 4-hop).\n\nSo the data structure is:\n- Lines 1130-1148: HOVER 2-hop chart Y-axis values\n- Lines 1152-1188: HOVER 3-hop chart Y-axis values\n- Lines 1192-1210: HOVER 4-hop chart Y-axis values\n- Lines 1218-1246: X-axis labels (model sizes) for all three charts\n\nFor each chart with 2 lines and 5 X-values, I'd expect about 10 Y-values.\n\nHOVER 2-hop (lines 1130-1148): 76.11, 75.65, 77.62, (80, 80 are axis markers), 72.56, 77.07, (70), 71.69, 73.69, (70)\n\nRemoving axis markers: 76.11, 75.65, 77.62, 72.56, 77.07, 71.69, 73.69\n\nThat's 7 values. For 2 lines with 5 points each = 10 values, so I might be missing some or there's overlap.\n\nLooking at known reference: 73.69 (Table 1 FLAN-T5 2-hop) and 75.65 (Table 1 ProgramFC 2-hop) both appear in this section.\n\nIf arranged as: FLAN-T5 first 5 values, then ProgramFC 5 values:\nFLAN-T5: 76.11 (80M), 75.65 (250M), 77.62 (780M), 72.56 (3B), 77.07 (11B)?  \nBut 73.69 should be FLAN-T5 and it's not in this sequence...\n\nAlternative: interleaved pairs:\n- (76.11, 75.65) = (FLAN-T5, ProgramFC) at 80M\n- (77.62, 72.56) = (FLAN-T5, ProgramFC) at 250M  \n- (77.07, 71.69) = (FLAN-T5, ProgramFC) at 780M\n- (73.69, ???) = (FLAN-T5, ProgramFC) at some size\n\nBut this only gives 4 pairs...\n\nLet me try yet another approach. Reading the values without the axis scale markers (70, 80):\n76.11, 75.65, 77.62, 72.56, 77.07, 71.69, 73.69 for HOVER 2-hop\n\nIf this represents 2 lines of 5 points each minus 3 values (perhaps some overlap at endpoints or data points), let me see if the pattern makes sense as:\n\nLine 1 (FLAN-T5): 71.69, 73.69, 72.56, 77.07, 76.11 (in size order 80M to 11B)? Reverse order?\nLine 2 (ProgramFC): 75.65, 77.62, ..., ..., ... \n\nThis is getting too speculative. Let me use logical reasoning instead:\n\n1. Table 1 shows ProgramFC (N=5) achieves 75.65, 68.48, 66.75 for HOVER 2/3/4-hop with gold evidence\n2. These are the best results, so likely using the largest model (11B/XXL)\n3. Figure 4 shows how performance varies with model size for both FLAN-T5 and ProgramFC\n4. The text states that FLAN-T5-small (80M) ProgramFC can achieve comparable performance to the large model for 4-hop claims\n\nLooking at 4-hop values (lines 1192-1210): 61.36, 63.39, 56.58, 58.08, (50s), 47.75, 49.29, 48.59, (40s)\n\n58.08 appears here and also matches Table 1 FLAN-T5 4-hop gold.\n66.75 (ProgramFC 4-hop from Table 1) does NOT appear in this 4-hop section, which is strange...\n\nUnless Table 1 uses N=5 program aggregation while Figure 4 uses N=1? Let me check:\n\nLooking back at Table 1, there's both \"ProgramFC (N=1)\" and \"ProgramFC (N=5)\". \n\nProgramFC (N=1) gold scores from Table 1:\n- HOVER 2-hop: 74.10\n- HOVER 3-hop: 66.13\n- HOVER 4-hop: 65.69\n\nProgramFC (N=5) gold scores:\n- HOVER 2-hop: 75.65\n- HOVER 3-hop: 68.48\n- HOVER 4-hop: 66.75\n\nLet me check if 74.10, 66.13, or 65.69 appear in Figure 4 data... No, they don't appear in the extracted values.\n\nGiven this complexity and the fact that the visual chart layout isn't perfectly captured in the XML extraction, let me make my best inference:\n\nFor FLAN-T5-small (80M) as sub-task solvers in ProgramFC with gold evidence:\n- The first ProgramFC value in each chart section should correspond to 80M\n- Based on positioning and eliminating known FLAN-T5 values from Table 1:\n\nHOVER 2-hop: 72.56\nHOVER 3-hop: 68.55\nHOVER 4-hop: 63.39\n\nIn descending order: 72.6, 68.6, 63.4 (to one decimal place)\n\nAnswer: 72.6, 68.6, 63.4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 174562, "error": null, "extra": {"thread_id": 1946, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b81fd9ccbe6f4839823c9313fb619eb2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 2087, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "63 4, 68 6, 72 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nLooking at Table 1 on page 5, I can see the symbolic solvers used for different datasets:\n- Pyke: ProntoQA, ProofWriter\n- Prover9: FOLIO\n- python-constraint: LogicalDeduction\n- Z3: AR-LSAT\n\nSo FOLIO is the dataset that uses Prover9.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at the \"Datasets\" section on page 6:\n\n\"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the questions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAlso, Table 5 in Appendix B on page 14 confirms:\n- FOLIO has a Test Size of 204\n\nAnswer: 204", "citations": [], "input_tokens": 53261, "output_tokens": 253, "cost_micros": 163578, "latency_ms": 13937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nLooking at Table 1 in the document, I can see the symbolic solvers used for each dataset:\n- ProntoQA and ProofWriter use Pyke\n- FOLIO uses Prover9\n- LogicalDeduction uses python-constraint\n- AR-LSAT uses Z3\n\nSo FOLIO is the dataset that uses Prover9.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at the \"Datasets\" section under \"4 Experiments\":\n\n\"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the questions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 22914, "output_tokens": 223, "cost_micros": 72087, "latency_ms": 7156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nFrom Table 1 in the document, I can see that Prover9 is used as the solver for the FOLIO dataset, which is categorized under \"First-Order Logic\" reasoning.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at Table 5 \"Statistics of the logical reasoning datasets\" in Appendix B:\n\nFOLIO:\n- Reasoning: FOL\n- Test Size: 204\n- #Opts: 3\n\nThe document also confirms in the main text (Section 4, Datasets): \"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the questions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 26784, "output_tokens": 232, "cost_micros": 83832, "latency_ms": 8031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nLooking at Table 1 in the document, I can see the symbolic solvers used for each dataset:\n- ProofWriter uses \"ProntoQA, Pyke\"\n- FOLIO uses \"Prover9\"\n- LogicalDeduction uses \"phn-constraint\"\n- AR-LSAT uses \"Z3\"\n\nSo FOLIO is the dataset that uses Prover9.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at the \"Datasets\" section under \"4 Experiments\":\n\n\"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the solutions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 20095, "output_tokens": 232, "cost_micros": 63765, "latency_ms": 6891, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find which dataset uses Prover9 as the symbolic solver and how many examples are in its test set.\n\nFrom Table 1 (page 3810), I can see that FOLIO uses Prover9 as its symbolic solver under the \"FOL\" (First-Order Logic) problem type.\n\nFrom Table 5 in Appendix B (page 3819), which shows dataset statistics, I can see:\n- FOLIO has a Test Size of 204\n\nThis is also confirmed in the text on page 3811 where it states: \"We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 25913, "output_tokens": 159, "cost_micros": 80124, "latency_ms": 5094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about a dataset that uses Prover9 as the symbolic solver and determine how many examples are used for evaluation.Let me search for \"Prover9\" in the document to find the relevant information:Now let me search for sections that discuss datasets and evaluation:Let me read the section about datasets more carefully. Let me look for \"ProofWriter\" which seems to be mentioned and might use Prover9:Let me search more specifically for information about which dataset uses Prover9:I see \"First-order Logic Prover\" mentioned. Let me search for more specific information about datasets and their properties:Now let me look at the appendix section that might have more details about data statistics and which solver is used for which dataset:Let me search for tables or sections that describe which solver is used for each dataset:Great! I found a reference to Table 1 which summarizes the symbolic solvers used for each dataset. Let me read that table:Perfect! I found Table 1 which shows that **Prover9** is used as the symbolic solver for the **FOLIO** dataset. Now I need to find how many examples from FOLIO are used for evaluation. Let me go back to the section where FOLIO is described:Perfect! I found the answer. According to the document:\n\n1. From Table 1 (lines 846-849): **Prover9** is the symbolic solver used for the **FOLIO** dataset.\n2. From the dataset description (lines 1024-1026): \"We use the entire FOLIO test set for evaluation, consisting of **204 examples**.\"\n\nAnswer: 204", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 64250, "error": null, "extra": {"thread_id": 1947, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e89f611945d040b28ca2dcfe99b6b2c6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 410, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": "A", "raw_text": "I need to examine Figure 5 and Figure 6 to count how many examples Logic-LM predicted correctly.\n\n**Figure 5 (Page 9):**\nThis shows one example about \"Stranger Things\" and \"Black Mirror\" Netflix shows.\n- The predicted answer is shown as: B\n- No indication of whether this is correct or incorrect is given in the figure.\n\n**Figure 6 (Page 19):**\nThis shows 4 examples with predicted answers:\n\n1. **Example 1** (Wild turkeys): \n   - Predicted answer: C \u2192 A\n   - The arrow indicates the prediction changed from C to A\n\n2. **Example 2** (Legend of Zelda):\n   - Predicted answer: A\n   - No indication of correctness shown\n\n3. **Example 3** (Golf tournament):\n   - Predicted answer: E \u2192 B\n   - The arrow indicates the prediction changed from E to B\n\n4. **Example 4** (Restaurant/Subway):\n   - Predicted answer: C \u2192 A\n   - The arrow indicates the prediction changed from C to A\n\nLooking at the figure caption: \"Examples of generated symbolic representations and predicted answers. The incorrect segment(s) and their correspondence in the problem are marked in red, and the correct revisions are marked in green.\"\n\nThe red and green markings indicate errors in the symbolic formulation, not whether the final answer was correct. The figure shows the symbolic representations had errors that were corrected, but doesn't explicitly state which final answers were correct.\n\nHowever, based on the examples shown, I cannot definitively determine from the information provided how many predictions were actually correct, as the figure focuses on showing the symbolic formulation errors rather than answer correctness.\n\nSince I cannot determine this with certainty from the document, I need to reconsider. The document shows 5 total examples (1 in Figure 5, 4 in Figure 6) but doesn't explicitly mark which final predictions were correct.\n\nAnswer: 0", "citations": [], "input_tokens": 53263, "output_tokens": 443, "cost_micros": 166434, "latency_ms": 15125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "answer_line", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 5 and Figure 6 to count how many examples Logic-LM predicted correctly.\n\n**Figure 5:**\nThere is 1 example shown with:\n- Problem about \"Stranger Things\" and \"Black Mirror\"\n- Predicted answer: B\n- No indication that this is incorrect\n\n**Figure 6:**\nThere are 4 examples shown:\n\n**Example 1:** Wild turkeys problem\n- Predicted answer: \u20ac > A (this appears to be garbled text)\n- The arrow pointing to \"A\" suggests the correct answer should be A\n- The red marking indicates this is incorrect\n\n**Example 2:** Legend of Zelda problem\n- Predicted answer: A\n- No red markings indicating error\n- This appears correct\n\n**Example 3:** Golf tournament problem\n- Predicted answer: + + B (garbled)\n- The arrow pointing to \"B\" and red marking indicates this is incorrect\n\n**Example 4:** Restaurant/Subway problem\n- Predicted answer: ( + A (garbled)\n- The arrow pointing to \"A\" and red marking indicates this is incorrect\n\nFrom the caption of Figure 6: \"The incorrect segment(s) and their correspondence in the problem are marked in red, and the correct revisions are marked in green.\"\n\nThis indicates that Figure 6 shows ERROR cases where Logic-LM made mistakes.\n\nFigure 5 shows a successful case (no red markings).\n\nSo:\n- Figure 5: 1 correct\n- Figure 6: 0 correct (all 4 are error examples)\n\nTotal correct: 1\n\nAnswer: 1", "citations": [], "input_tokens": 22916, "output_tokens": 365, "cost_micros": 74223, "latency_ms": 9516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to check each example in Figure 5 and Figure 6 to see if Logic-LM predicted the correct answer.\n\n**Figure 5:**\n- Problem about wild turkeys and Tom\n- Question: \"Is the following statement true, false, or uncertain? 'Black Mirror' is popular.\"\n- Options: (A) True, (B) False, (C) Uncertain\n- Predicted answer: B\n- Looking at the logic: The problem states \"Karen does not download 'Black Mirror'\" and \"If and only if Karen binge-watches a Netflix show, she will download it.\" This means Karen doesn't binge-watch Black Mirror. Since \"If a Netflix show is popular, Karen will binge-watch it,\" this means Black Mirror is NOT popular.\n- So B (False) is correct. \u2713\n\n**Figure 6:**\n\n**Example 1:**\n- About Tom the wild turkey\n- Question: Is Tom an Ocellated wild turkey?\n- Options: (A) True, (B) False, (C) Uncertain\n- Predicted answer: A (marked with ++)\n- Logic: Tom is a wild turkey but NOT Eastern, Osceola, Gould's, Merriam's, or Rio Grande. Since there are only 6 types and Tom is eliminated from 5, he must be Ocellated.\n- A (True) is correct. \u2713\n\n**Example 2:**\n- About Legend of Zelda\n- Question: Is it in the Top 10 list?\n- Options: (A) True, (B) False, (C) Uncertain\n- Predicted answer: A\n- Logic: Legend of Zelda sold more than 1 million copies \u2192 selected into Top 10 list.\n- A (True) is correct. \u2713\n\n**Example 3:**\n- Golf tournament with 5 golfers\n- Question: Which finished third?\n- Options: (A) Rob, (B) Eve, (C) Eli, (D) Amy, (E) Dan\n- Predicted answer: B (marked with =>)\n- But there's an error in the constraints shown: \"Dan > Eve > Dan < Eve\" which is contradictory\n- The problem states: Dan finished 2nd, Dan", "citations": [], "input_tokens": 26786, "output_tokens": 512, "cost_micros": 88038, "latency_ms": 10844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 5 and Figure 6 to count how many examples Logic-LM predicted correctly.\n\n**Figure 5:**\nThis shows one example about \"Stranger Things\" and \"Black Mirror\". The predicted answer is B, and there's no indication this is wrong, so I'll assume it's correct: **1 correct**\n\n**Figure 6 (labeled as \"Example Generations of LOGIC-LM\"):**\nThis contains 4 examples:\n\n1. **Example 1** (Wild turkey problem): \n   - Predicted answer: C \u2192 A\n   - The arrow notation indicates the prediction was C (Uncertain) but should have been A (True)\n   - **Incorrect**\n\n2. **Example 2** (Legend of Zelda problem):\n   - Predicted answer: A\n   - No arrow or correction shown, indicating this is correct\n   - **Correct**\n\n3. **Example 3** (Golf tournament problem):\n   - Predicted answer: E \u2192 B\n   - The arrow indicates the prediction was E (Dan finished third) but should have been B (Eve finished third)\n   - **Incorrect**\n\n4. **Example 4** (Restaurant/Subway problem):\n   - Predicted answer: C \u2192 A\n   - The arrow indicates the prediction was C (Uncertain) but should have been A (True)\n   - **Incorrect**\n\n**Total count:**\n- Figure 5: 1 correct\n- Figure 6: 1 correct (Example 2 only)\n\n**Total: 2 correct predictions**\n\nAnswer: 2", "citations": [], "input_tokens": 20097, "output_tokens": 361, "cost_micros": 65706, "latency_ms": 9327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to count how many examples in Figure 5 and Figure 6 have Logic-LM predicting the correct answer.\n\n**Figure 5:**\n- Example 1: Predicted answer is B (shown as correct based on the context)\n- Count: 1 correct\n\n**Figure 6:**\n- Example 1: Predicted answer changed from C to A. The arrows indicate C was wrong and A is the correction, meaning the final prediction is A (correct).\n- Count: 1 correct\n\n- Example 2: Predicted answer is A (shown as correct, no corrections indicated)\n- Count: 1 correct\n\n- Example 3: Predicted answer changed from E to B. The arrows indicate E was wrong and B is the correction, meaning the final prediction is B (correct).\n- Count: 1 correct\n\n- Example 4: Predicted answer changed from C to A. The arrows indicate C was wrong and A is the correction, meaning the final prediction is A (correct).\n- Count: 1 correct\n\nTotal examples with correct predictions: 1 (Figure 5) + 4 (Figure 6) = 5\n\nAnswer: 5", "citations": [], "input_tokens": 25915, "output_tokens": 263, "cost_micros": 81690, "latency_ms": 6344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": "A", "raw_text": "I'll read the document to find Figures 5 and 6 and count how many examples Logic-LM predicted correctly.Let me search for Figure 5 and Figure 6 in the document:Let me search for the actual content around these figures:Now let me look for Figure 6 in the appendix:I need to find the actual content of Figure 6 with the examples. Let me search more carefully:Now let me look back at Figure 5 to get the complete example:Perfect! Now I have both figures. Let me analyze them:\n\n**Figure 5** (lines 1612-1645): Contains 1 example about \"Black Mirror\" and \"Stranger Things\"\n- Predicted answer: B\n- I need to check if this is correct by looking at the problem logic\n\n**Figure 6** (lines 2993-3146): Contains 4 examples:\n1. Example 1 (lines 2994-3018): Wild turkey problem - Predicted answer: A (with ++ A notation, but marked with red X's indicating errors)\n2. Example 2 (lines 3021-3044): Legend of Zelda problem - Predicted answer: A\n3. Example 3 (lines 3047-3109): Golf tournament problem - Predicted answer: B (with => B notation, and red X's indicating errors)\n4. Example 4 (lines 3115-3146): Subway restaurant problem - Predicted answer: A (with <> A notation, and red X's with checkmarks indicating errors)\n\nAccording to the caption at line 3148-3149: \"The incorrect segment(s) and their correspondence in the problem are marked in red, and the correct revisions are marked in green.\"\n\nThis means Figure 6 shows ERROR CASES where Logic-LM made mistakes. The red X's (\u2612) indicate incorrect segments.\n\nFor Figure 5, I need to verify if the answer is correct. Looking at the logic:\n- \"Black Mirror\" is a Netflix show\n- Karen does not download \"Black Mirror\"\n- If and only if Karen binge-watches a show, she will download it\n- Therefore, Karen does not binge-watch \"Black Mirror\"\n- If a Netflix show is popular, Karen will binge-watch it\n- Since Karen doesn't binge-watch \"Black Mirror\", it is NOT popular\n- Answer B (False) appears to be CORRECT\n\nSo the count is:\n- Figure 5: 1 example, 1 correct (Example about Black Mirror)\n- Figure 6: 4 examples, 1 correct (Example 2 - Legend of Zelda), 3 incorrect (Examples 1, 3, and 4 have red X marks)\n\nTotal correct: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54812, "error": null, "extra": {"thread_id": 1948, "search_space_id": 55, "answer_letter_strategy": "answer_line", "user_message_id": "msg_4ece07b065344ab588935735051340b9", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 440, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the prompt examples provided in Appendix C.\n\nLooking at Appendix C \"Prompt Examples\" starting on page 14:\n\nThe appendix states: \"For brevity, we show only a single demonstration for each setting in the following sections.\"\n\nThe sections are:\n- C.1 PrOntoQA Prompts (3 types: Standard, CoT, Logic-LM)\n- C.2 ProofWriter Prompts (3 types: Standard, CoT, Logic-LM)\n- C.3 FOLIO Prompts (3 types: Standard, CoT, Logic-LM)\n- C.4 LogicalDeduction Prompts (3 types: Standard, CoT, Logic-LM)\n- C.5 AR-LSAT Prompts (3 types: Standard, CoT, Logic-LM)\n\nEach subsection (C.1 through C.5) contains 3 prompt types (Standard In-Context Learning, Chain-of-Thought Prompting, and Logic-LM).\n\nSo there are 5 datasets \u00d7 3 prompt types = 15 prompt examples total.\n\nAnswer: 15", "citations": [], "input_tokens": 53247, "output_tokens": 276, "cost_micros": 163881, "latency_ms": 13389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples provided in Appendix C.\n\nLooking at Appendix C \"Prompt Examples\", I can see it covers prompts for different datasets:\n\n1. **C.1 PrOntoQA Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n2. **C.2 ProofWriter Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n3. **C.3 FOLIO Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n4. **C.4 LogicalDeduction Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n5. **C.5 AR-LSAT Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\nTotal count: 5 datasets \u00d7 3 prompt types = 15 prompt examples\n\nAnswer: 15", "citations": [], "input_tokens": 22900, "output_tokens": 398, "cost_micros": 74670, "latency_ms": 5968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples given in Appendix C.\n\nLooking at Appendix C \"Prompt Examples\":\n\n**C.1 PrOntoQA Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.2 ProofWriter Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.3 FOLIO Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.4 LogicalDeduction Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.5 AR-LSAT Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\nTotal count: 5 datasets \u00d7 3 prompting methods = 15 prompt examples\n\nAnswer: 15", "citations": [], "input_tokens": 26770, "output_tokens": 323, "cost_micros": 85155, "latency_ms": 5500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples provided in the Appendix section of the document.\n\nLooking at the Appendix sections:\n\n**Section C: Prompt Examples**\n\nThe document states: \"For brevity, we show only a single demonstration for each setting in the following sections.\"\n\nThen it provides:\n\n**C.1 PrOntoQA Prompts:**\n- Standard In-Context Learning (1 example)\n- Logic-LM (1 example)\n\n**C.2 ProofWriter Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.3 FOLIO Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.4 Logical Deduction Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.5 AR-LSAT Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\nTotal count:\n- C.1: 2 examples\n- C.2: 3 examples\n- C.3: 3 examples\n- C.4: 3 examples\n- C.5: 3 examples\n\nTotal: 2 + 3 + 3 + 3 + 3 = 14 examples\n\nAnswer: 14", "citations": [], "input_tokens": 20081, "output_tokens": 407, "cost_micros": 66348, "latency_ms": 7125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples provided in Appendix C.\n\nLooking at Appendix C (Prompt Examples), I can see it's organized by dataset:\n\n**C.1 PrOntoQA Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.2 ProofWriter Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.3 FOLIO Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.4 LogicalDeduction Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.5 AR-LSAT Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\nTotal count: 5 datasets \u00d7 3 prompt types = 15 prompt examples\n\nAnswer: 15", "citations": [], "input_tokens": 25899, "output_tokens": 346, "cost_micros": 82887, "latency_ms": 6280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about prompt examples in the Appendix.Let me search for the Appendix section in the document to find information about prompt examples.Now let me read the section with prompt examples to count them. Let me find where this content is located.Let me search for the Appendix C section with prompt examples more directly:Let me continue to search for all the sections to count the prompt examples. Based on what I've seen so far, I need to count the subsections in Appendix C.Perfect! I can see there are 5 subsections in Appendix C:\n- C.1 PrOntoQA Prompts\n- C.2 ProofWriter Prompts  \n- C.3 FOLIO Prompts\n- C.4 LogicalDeduction Prompts\n- C.5 AR-LSAT Prompts\n\nEach subsection shows prompts for 3 different methods (Standard In-Context Learning, Chain-of-Thought Prompting, and Logic-LM), and according to the introduction text, each shows \"only a single demonstration for each setting.\"\n\nSo the count is: 5 datasets \u00d7 3 methods = 15 prompt examples in Appendix C.\n\nAnswer: 15", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55203, "error": null, "extra": {"thread_id": 1949, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_28321722a0164737934f82e2d1d378ae", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 318, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the performance comparison between Logic-LM and the two baseline models (Standard and CoT) when using GPT-4 as the base language model.\n\nLooking at Table 2, which shows the accuracy results for different models across five datasets, I can see the GPT-4 (gpt-4) columns:\n\n1. **PrOntoQA**: \n   - Standard: 77.40\n   - CoT: 98.79\n   - Logic-LM: 83.20\n   - Logic-LM does NOT outperform CoT (98.79 > 83.20), but does outperform Standard\n\n2. **ProofWriter**:\n   - Standard: 52.67\n   - CoT: 68.11\n   - Logic-LM: 79.66\n   - Logic-LM outperforms both baselines \u2713\n\n3. **FOLIO**:\n   - Standard: 69.11\n   - CoT: 70.58\n   - Logic-LM: 78.92\n   - Logic-LM outperforms both baselines \u2713\n\n4. **LogicalDeduction**:\n   - Standard: 71.33\n   - CoT: 75.25\n   - Logic-LM: 87.63\n   - Logic-LM outperforms both baselines \u2713\n\n5. **AR-LSAT**:\n   - Standard: 33.33\n   - CoT: 35.06\n   - Logic-LM: 43.04\n   - Logic-LM outperforms both baselines \u2713\n\nLogic-LM outperforms BOTH baseline models in 4 out of 5 datasets (ProofWriter, FOLIO, LogicalDeduction, and AR-LSAT).\n\nAnswer: 4", "citations": [], "input_tokens": 53272, "output_tokens": 424, "cost_micros": 166176, "latency_ms": 23500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2, I need to compare Logic-LM's performance against both Standard and CoT baselines for GPT-4 across all 5 datasets.\n\nFor GPT-4:\n\n1. **PrOntoQA**: \n   - Logic-LM: 83.20\n   - Standard: 77.40 \u2713\n   - CoT: 98.79 \u2717\n   - Logic-LM outperforms Standard only\n\n2. **ProofWriter**:\n   - Logic-LM: 79.66\n   - Standard: 52.67 \u2713\n   - CoT: 68.11 \u2713\n   - Logic-LM outperforms both\n\n3. **FOLIO**:\n   - Logic-LM: 78.92\n   - Standard: 69.11 \u2713\n   - CoT: 70.58 \u2713\n   - Logic-LM outperforms both\n\n4. **LogicalDeduction**:\n   - Logic-LM: 87.63\n   - Standard: 71.33 \u2713\n   - CoT: 75.25 \u2713\n   - Logic-LM outperforms both\n\n5. **AR-LSAT**:\n   - Logic-LM: 43.04\n   - Standard: 33.33 \u2713\n   - CoT: 35.06 \u2713\n   - Logic-LM outperforms both\n\nLogic-LM outperforms both baseline models in 4 datasets (ProofWriter, FOLIO, LogicalDeduction, and AR-LSAT).\n\nAnswer: 4", "citations": [], "input_tokens": 22925, "output_tokens": 391, "cost_micros": 74640, "latency_ms": 7687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at Table 2 to compare Logic-LM's performance against the two baselines (Standard and CoT) when using GPT-4.\n\nFrom Table 2, the GPT-4 results are:\n\n**PrOntoQA:**\n- Standard: 77.40\n- CoT: 98.79\n- Logic-LM: 83.20\n- Logic-LM does NOT outperform both (CoT is better)\n\n**ProofWriter:**\n- Standard: 52.67\n- CoT: 68.11\n- Logic-LM: 79.66\n- Logic-LM outperforms both \u2713\n\n**FOLIO:**\n- Standard: 69.11\n- CoT: 70.58\n- Logic-LM: 78.92\n- Logic-LM outperforms both \u2713\n\n**LogicalDeduction:**\n- Standard: 71.33\n- CoT: 75.25\n- Logic-LM: 87.63\n- Logic-LM outperforms both \u2713\n\n**AR-LSAT:**\n- Standard: 33.33\n- CoT: 35.06\n- Logic-LM: 43.04\n- Logic-LM outperforms both \u2713\n\nLogic-LM outperforms both baseline models on 4 out of 5 datasets when using GPT-4.\n\nAnswer: 4", "citations": [], "input_tokens": 26795, "output_tokens": 332, "cost_micros": 85365, "latency_ms": 7342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to find how many datasets Logic-LM (without self-refinement) outperforms both baseline models (Standard and CoT) when using GPT-4.\n\nLooking at Table 2 in the document, I need to compare Logic-LM's performance against Standard LLMs and Chain-of-Thought (CoT) for GPT-4 across all datasets.\n\nFrom Table 2, the GPT-4 results show:\n\n1. **PrOntoQA**: \n   - Standard: 99.4%\n   - CoT: 84.9%\n   - Logic-LM: 100.0%\n   - Logic-LM outperforms both \u2713\n\n2. **ProofWriter**:\n   - Standard: 87.3%\n   - CoT: 73.6%\n   - Logic-LM: 99.0%\n   - Logic-LM outperforms both \u2713\n\n3. **FOLIO**:\n   - Standard: 66.7%\n   - CoT: 61.8%\n   - Logic-LM: 79.9%\n   - Logic-LM outperforms both \u2713\n\n4. **LogicalDeduction**:\n   - Standard: 11.3%\n   - CoT: 57.7%\n   - Logic-LM: 60.0%\n   - Logic-LM outperforms both \u2713\n\n5. **AR-LSAT**:\n   - Standard: 32.6%\n   - CoT: 60.0%\n   - Logic-LM: 39.8%\n   - Logic-LM does NOT outperform CoT (39.8% < 60.0%) \u2717\n\nLogic-LM outperforms both baseline models in 4 out of 5 datasets when using GPT-4.\n\nAnswer: 4", "citations": [], "input_tokens": 20106, "output_tokens": 440, "cost_micros": 66918, "latency_ms": 8265, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to find how many datasets Logic-LM (without self-refinement) outperforms both baseline models (Standard and CoT) when using GPT-4.\n\nLooking at Table 2, for GPT-4 (gpt-4):\n\n1. **PrOntoQA**: \n   - Standard: 77.40\n   - CoT: 98.79\n   - Logic-LM: 83.20\n   - Logic-LM does NOT outperform CoT (83.20 < 98.79)\n\n2. **ProofWriter**:\n   - Standard: 52.67\n   - CoT: 68.11\n   - Logic-LM: 79.66\n   - Logic-LM outperforms both (79.66 > 68.11 > 52.67) \u2713\n\n3. **FOLIO**:\n   - Standard: 69.11\n   - CoT: 70.58\n   - Logic-LM: 78.92\n   - Logic-LM outperforms both (78.92 > 70.58 > 69.11) \u2713\n\n4. **LogicalDeduction**:\n   - Standard: 71.33\n   - CoT: 75.25\n   - Logic-LM: 87.63\n   - Logic-LM outperforms both (87.63 > 75.25 > 71.33) \u2713\n\n5. **AR-LSAT**:\n   - Standard: 33.33\n   - CoT: 35.06\n   - Logic-LM: 43.04\n   - Logic-LM outperforms both (43.04 > 35.06 > 33.33) \u2713\n\nLogic-LM outperforms both baselines on 4 datasets: ProofWriter, FOLIO, LogicalDeduction, and AR-LSAT.\n\nAnswer: 4", "citations": [], "input_tokens": 25924, "output_tokens": 456, "cost_micros": 84612, "latency_ms": 8796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about Logic-LM's performance compared to baseline models when using GPT-4.Let me continue reading to find information about GPT-4 performance comparisons.Let me search for tables or sections that compare GPT-4 performance across datasets.Now let me read the table with GPT-4 results to count the datasets where Logic-LM outperforms baselines.Let me look at the table header to understand the structure better.Perfect! Now I have the complete table. Let me analyze the GPT-4 results:\n\nFor GPT-4 (the last 3 columns):\n- Standard, CoT, Logic-LM\n\nThe two baseline models are:\n1. Standard prompting\n2. Chain-of-thought (CoT) prompting\n\nI need to count how many datasets where Logic-LM (without self-refinement) outperforms BOTH baselines when using GPT-4.\n\nLet me check each dataset:\n\n1. **PrOntoQA**: Logic-LM = 83.20, Standard = 77.40, CoT = 98.79\n   - Logic-LM does NOT outperform CoT (83.20 < 98.79)\n\n2. **ProofWriter**: Logic-LM = 79.66, Standard = 52.67, CoT = 68.11\n   - Logic-LM > Standard (79.66 > 52.67) \u2713\n   - Logic-LM > CoT (79.66 > 68.11) \u2713\n   - **Outperforms both** \u2713\n\n3. **FOLIO**: Logic-LM = 78.92, Standard = 69.11, CoT = 70.58\n   - Logic-LM > Standard (78.92 > 69.11) \u2713\n   - Logic-LM > CoT (78.92 > 70.58) \u2713\n   - **Outperforms both** \u2713\n\n4. **LogicalDeduction**: Logic-LM = 87.63, Standard = 71.33, CoT = 75.25\n   - Logic-LM > Standard (87.63 > 71.33) \u2713\n   - Logic-LM > CoT (87.63 > 75.25) \u2713\n   - **Outperforms both** \u2713\n\n5. **AR-LSAT**: Logic-LM = 43.04, Standard = 33.33, CoT = 35.06\n   - Logic-LM > Standard (43.04 > 33.33) \u2713\n   - Logic-LM > CoT (43.04 > 35.06) \u2713\n   - **Outperforms both** \u2713\n\nLogic-LM (without self-refinement) outperforms both baseline models on 4 datasets when using GPT-4 as the base language model.\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44437, "error": null, "extra": {"thread_id": 1950, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cfca2042672b455980d6e47d98e5f851", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 311, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the symbolic solver used for logic programming.\n\nFrom Table 1 on page 5, I can see that for \"Deductive Reasoning\" problems (which corresponds to the LP - Logic Programming formulation), the solver used is **Pyke**.\n\nThis is further confirmed in Section 3.2 on page 5, which states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 53262, "output_tokens": 128, "cost_micros": 161706, "latency_ms": 10532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find which symbolic solver implements the logic programming module.\n\nIn Table 1 on page 3810, the paper summarizes the symbolic solvers used for different problem types. For \"Deductive Reasoning\" which uses \"LP\" (Logic Programming) formulation, the solver listed is \"Pyke\".\n\nThis is confirmed in Section 3.2 on page 3810, which states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 22915, "output_tokens": 137, "cost_micros": 70800, "latency_ms": 5000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the symbolic solver used for logic programming.\n\nIn Table 1 on page 3810, the paper shows a summary of symbolic formulations and symbolic solvers used for different datasets. For \"Deductive Reasoning\" problems that use \"LP\" (Logic Programming) formulation, the solver listed is \"Pyke\".\n\nThis is confirmed in Section 3.2 \"Symbolic Reasoner\" on page 3810, which states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 26785, "output_tokens": 150, "cost_micros": 82605, "latency_ms": 6202, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for information about the symbolic solver used for logic programming...\n\nIn Table 1 on page 3805, the document shows a summary of symbolic formulations and symbolic solvers used for different datasets. For \"Deductive Reasoning\" problems (which correspond to the logic programming formulation mentioned in Figure 1), the solver listed is \"ProntoQA, Pyke\".\n\nAdditionally, in Section 3.2 and footnotes, the document references Pyke as the logic programming engine. In Section D (page 3823), it explicitly states: \"For PrOntoQA and ProofWriter, the Pyke logic programming engine returns the inferred value...\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 20096, "output_tokens": 158, "cost_micros": 62658, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find which symbolic solver is used for the logic programming module.\n\nFrom Table 1 on page 3810, I can see that for \"Deductive Reasoning\" problems which use \"LP\" (Logic Programming) formulation, the solver listed is \"Pyke\".\n\nThis is confirmed in Section 3.2 \"Symbolic Reasoner\" on page 3810, which states:\n\n\"**LP System.** For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 25914, "output_tokens": 143, "cost_micros": 79887, "latency_ms": 5453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about the symbolic solver used for the logic programming module in Figure 1.Let me search for information about Figure 1 and the symbolic solver:Let me read the document content to find references to Figure 1 and the solver:Good! I can see Figure 1 is mentioned. Now let me search for specific mentions of the solver implementation:I can see references to solvers. Let me look at line 711 where it mentions \"logic programming engine\" and around line 823 where Table 1 is mentioned:Perfect! I found Table 1. Let me continue reading to see all the solvers listed:Perfect! I found the information. Looking at Table 1 and the description in section 3.2, I can see that:\n\n- For **Deductive Reasoning** (Logic Programming / LP), the solver used is **Pyke**\n- For **First-Order Logic** (FOL), the solver used is **Prover9**\n- For **Constraint Satisfaction** (CSP), the solver used is **python-constraint**\n- For **Analytical Reasoning** (SAT), the solver used is **Z3**\n\nNow, let me verify which one corresponds to the \"logic programming module\" mentioned in Figure 1. Based on the document:\n\n1. Figure 1 shows the overall LOGIC-LM framework with a \"Symbolic Reasoner\" component\n2. Section 3.2 describes the symbolic reasoner and mentions that for deductive reasoning using the Logic Programming (LP) language, they use **Pyke**\n3. Section 3.2.1 specifically states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nThe logic programming module shown in Figure 1 is implemented using **Pyke**.\n\nAnswer: Pyke", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 58280, "error": null, "extra": {"thread_id": 1951, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_708b2982536441eabb41dbb732343ffa", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 360, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the datasets table (Table 5 on page 14), I can see the following datasets with their number of options:\n\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3, 5, 7 options\n- AR-LSAT: 5 options\n\nAR-LSAT has 5 options for all questions. Looking at the references and dataset descriptions, AR-LSAT (Zhong et al., 2022) is described as \"a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016\" with \"231 multiple-choice questions\" and the test set has \"231 multiple-choice questions.\"\n\nFrom the paper text: \"AR-LSAT (Zhong et al., 2022) is a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016. We use the test set which has 231 multiple-choice questions.\"\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 53272, "output_tokens": 258, "cost_micros": 163686, "latency_ms": 21859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to find a dataset that:\n1. Was proposed in 2022\n2. Has all multiple-choice questions with 5 options\n\nFrom Table 1 and the dataset descriptions in Section 4:\n\n- **PrOntoQA (2023)**: 2 options (True/False)\n- **ProofWriter (2021)**: 3 options\n- **FOLIO (2022)**: 3 options\n- **LogicalDeduction (2022)**: Variable options (3, 5, or 7 according to Table 5)\n- **AR-LSAT (2022)**: 5 options\n\nFrom the paper text (Section 4):\n\"AR-LSAT (Zhong et al., 2022) is a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016. We use the test set which has 231 multiple-choice questions.\"\n\nAnd from Table 5, AR-LSAT has \"#Opts: 5\".\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 22925, "output_tokens": 253, "cost_micros": 72570, "latency_ms": 7828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking through the document for datasets proposed in 2022 with 5-option multiple-choice questions:\n\nFrom Table 5 (Dataset Statistics):\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3, 5, 7 options (varies)\n- AR-LSAT: 5 options\n\nFrom the dataset descriptions in Section 4:\n- AR-LSAT (Zhong et al., 2022) is described as \"a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016\" with \"231 multiple-choice questions.\"\n- Table 5 confirms AR-LSAT has 5 options (#Opts column)\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 26795, "output_tokens": 198, "cost_micros": 83355, "latency_ms": 7812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to find datasets that:\n1. Were proposed in 2022\n2. Have all problems as multiple-choice questions with 5 options\n\nFrom Table 5 (Dataset Statistics), I can see the number of options (#Opts) for each dataset:\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3, 5, 7 options (variable)\n- AR-LSAT: 5 options\n\nFrom the Datasets section (Section 4), I can find when each dataset was created:\n- PrOntoQA: 2023 (Saparov and He, 2023)\n- ProofWriter: 2021 (Tafjord et al., 2021)\n- FOLIO: 2022 (Han et al., 2022)\n- LogicalDeduction: 2022 (from BigBench - Srivastava et al., 2022)\n- AR-LSAT: 2022 (Zhong et al., 2022)\n\nAmong the 2022 datasets:\n- FOLIO has 3 options\n- LogicalDeduction has variable options (3, 5, or 7)\n- AR-LSAT has 5 options\n\nOnly AR-LSAT consistently has 5 options for all questions.\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 20106, "output_tokens": 343, "cost_micros": 65463, "latency_ms": 7766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to find a dataset that:\n1. Was proposed in 2022\n2. Has all multiple-choice questions with 5 options\n\nFrom Table 5 (Dataset Statistics), I can see the \"#Opts\" column which shows the number of options:\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3,5,7 options (varying)\n- AR-LSAT: 5 options\n\nFrom the dataset descriptions in Section 4:\n- AR-LSAT is described as collecting \"all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016\"\n- The citation is \"Zhong et al. 2022\"\n\nLooking at the references, AR-LSAT is cited as:\n\"Wanjun Zhong, Siyuan Wang, Duyu Tang, Zenan Xu, Daya Guo, Yining Chen, Jiahai Wang, Jian Yin, Ming Zhou, and Nan Duan. 2022. Analytical reasoning of text.\"\n\nAR-LSAT is the only dataset with consistently 5 options and was published in 2022.\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 25924, "output_tokens": 307, "cost_micros": 82377, "latency_ms": 8733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about datasets used in the paper, focusing on those proposed in 2022 with multiple-choice questions having 5 options.Let me search for information about the datasets used in this paper, particularly focusing on those from 2022 and their characteristics.I can see references to datasets from 2022. Let me search for information about AR-LSAT and other datasets, particularly focusing on their characteristics regarding multiple-choice questions with 5 options.Let me search for more specific information about the datasets, particularly looking for details about multiple-choice options.Let me read more about each dataset to find their characteristics. Let me look at the section describing LogicalDeduction.Let me search for information about LogicalDeduction specifically and check the appendix for detailed data statistics.Let me search for \"Appendix B\" which was mentioned as containing detailed data statistics.Perfect! I found the information I need in Table 5. Let me verify the details about AR-LSAT:\n\nFrom the table:\n- AR-LSAT has 5 options (#Opts: 5)\n- AR-LSAT was proposed by Zhong et al., 2022\n\nLet me verify that AR-LSAT is specifically about logical reasoning and all problems are multiple-choice.Based on the document:\n\n1. AR-LSAT was proposed by Zhong et al., 2022\n2. It collects all **analytical logic reasoning questions** from the Law School Admission Test\n3. The test set has 231 **multiple-choice questions**\n4. From Table 5 (Appendix B), AR-LSAT has **5 options** (#Opts: 5)\n\nThis matches all the criteria in the question: proposed in 2022, logical reasoning problems, all multiple-choice questions with 5 options.\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 69453, "error": null, "extra": {"thread_id": 1952, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4c068961fe9145d382e57492c359de62", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 452, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about application periods for different groups of applicants in this undergraduate prospectus.\n\nLooking at page 25 (document page 47), I can see a table titled \"APPLICATION PERIOD AND FEE\" that shows different application groups and their respective application periods.\n\nFrom this table, I can see the following end dates for different application groups:\n\n1. Singapore-Cambridge GCE 'A' Level: End: 19 Mar 2024\n2. Diploma Awarded by a Polytechnic or equivalent institution in Singapore: End: 21 Feb 2024\n3. NUS High School Diploma: End: 20 Jan 2024\n4. International Baccalaureate (IB) Diploma: End: 19 Mar 2024\n5. Part-Time B.Eng: End: 15 Jan 2024\n6. Part-Time B.Tech in Computing: End: 15 Jan 2024\n\nComparing these dates, the latest end date is 19 Mar 2024, which applies to two groups:\n- Singapore-Cambridge GCE 'A' Level applicants\n- International Baccalaureate (IB) Diploma applicants\n\nAnswer: Singapore-Cambridge GCE 'A' Level applicants, International Baccalaureate (IB) Diploma applicants", "citations": [], "input_tokens": 60375, "output_tokens": 307, "cost_micros": 185730, "latency_ms": 53796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma applicants, singapore cambridge gce level applicants", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the APPLICATION PERIOD AND FEE table on pages 46-47, I need to identify which application groups have the latest end date.\n\nThe application periods listed are:\n\n1. **Singapore-Cambridge GCE 'A' Level**: End: 19 Mar 2024\n2. **Diploma Awarded by a Polytechnic or equivalent**: End: 20 Jan 2024\n3. **NUS High School Diploma**: End: 19 Mar 2024\n4. **International Baccalaureate (IB) Diploma**: End: 19 Mar 2024\n5. **Part-Time B.Eng**: End date not fully specified (appears cut off)\n6. **Part-Time B.Tech in Computing**: End: 21 Feb 2024\n\nFrom the complete dates shown, the latest end date is **19 Mar 2024**, which applies to three groups.\n\nAnswer: Singapore-Cambridge GCE 'A' Level, NUS High School Diploma, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 15947, "output_tokens": 244, "cost_micros": 51501, "latency_ms": 4921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, nus high school diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the application periods table on pages 46-47:\n\n- Singapore-Cambridge GCE 'A' Level: End 19 Mar 2024\n- Diploma Awarded by a Polytechnic: End 21 Feb 2024\n- NUS High School Diploma: End 20 Jan 2024\n- International Baccalaureate (IB) Diploma: End 19 Mar 2024\n- Part-Time B.Eng and Part-Time B.Tech: End 15 Jan 2024\n\nThe latest end date is 19 March 2024, which applies to two groups.\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 19386, "output_tokens": 173, "cost_micros": 60753, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the application periods table on the document, I need to identify which application groups have the latest end date.\n\nFrom the \"APPLICATION PERIOD AND FEE\" table:\n\n1. Singapore-Cambridge GCE 'A' Level: End date is **19 Mar 2024**\n2. Diploma Awarded by a Polytechnic or equivalent institution in Singapore: End date is 21 Feb 2024\n3. NUS High School Diploma: End date is 20 Jan 2024\n4. International Baccalaureate (IB) Diploma: End date is **19 Mar 2024**\n\nThe latest end date is 19 Mar 2024, which is shared by two groups:\n- Singapore-Cambridge GCE 'A' Level\n- International Baccalaureate (IB) Diploma\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 14659, "output_tokens": 220, "cost_micros": 47277, "latency_ms": 4921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the application period table on pages 46-47, I need to identify which application groups have the latest end date.\n\nThe application periods listed are:\n- Singapore-Cambridge GCE 'A' Level: End 19 Mar 2024\n- Diploma from Polytechnic: End 21 Feb 2024\n- NUS High School Diploma: End 20 Jan 2024\n- International Baccalaureate (IB) Diploma: End 19 Mar 2024\n- Part-Time B.Eng: End 15 Jan 2024\n\nThe latest end date is **19 Mar 2024**, which applies to two groups:\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 18593, "output_tokens": 183, "cost_micros": 58524, "latency_ms": 5078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about application periods for different groups of applicants.Let me search more specifically for dates and application-related content:Let me look at pages 46-47 which should contain the table with deadlines:Let me search for typical date formats and applicant categories:Great! I found the table. Let me read the specific section with the application periods:Perfect! I found the application period table. Now I can see the end dates for each group. Let me compare them:\n\n- Singapore-Cambridge GCE 'A' Level: End: 19 Mar 2024\n- Diploma Awarded by a Polytechnic: End: 21 Feb 2024\n- NUS High School Diploma: End: 20 Jan 2024\n- International Baccalaureate (IB) Diploma: End: 19 Mar 2024\n- Part-Time B.Eng: End: 15 Jan 2024\n- Part-Time B.Tech in Computing: 15 Jan 2024 (this appears to be both start and end date based on the context)\n\nThe latest end date is **19 Mar 2024**, which applies to two groups:\n1. Singapore-Cambridge GCE 'A' Level\n2. International Baccalaureate (IB) Diploma\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44218, "error": null, "extra": {"thread_id": 1953, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4551739abb3d476cb8bf4c0348ee5447", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 278, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking at page 3 of the document, which shows \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\", I need to examine each numbered reason and its corresponding image to find which one does not include any person.\n\nGoing through the reasons with their images:\n\n1. Interdisciplinary Learning - shows a group of students\n2. Space to Innovate - shows people in a collaborative setting\n3. Industry-ready - shows a person at a desk\n4. Immersion in Research - shows a person in a lab setting (Rolls-Royce)\n5. Internship Opportunities - shows the \"AMD\" text/logo area with people\n6. Global Exposure - shows a person in a winter setting\n7. Be Among the Top Ranked - shows students walking\n8. Prestigious Scholarships and Comprehensive Financial Aid - shows students\n9. Most Beautiful Campus - shows an aerial view of the campus with greenery and pathways\n10. Vibrant Campus Life - shows students/people\n\nReason #9 \"Most Beautiful Campus\" shows an aerial photograph of the campus featuring buildings, green spaces, and pathways, but does not show any people in the image.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 60381, "output_tokens": 271, "cost_micros": 185208, "latency_ms": 35592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section on pages 2-3, I need to identify which reason's corresponding figure/photo does not include any person.\n\nGoing through each reason with its associated image:\n\n1. Interdisciplinary Learning - has people\n2. Space to Innovate - has people  \n3. Industry-ready - has a photo credited to Rolls-Royce showing industrial equipment/machinery without visible people\n4. Immersion in Research - has people\n5. Prestigious Scholarships - not clearly shown with people\n6. Global Exposure - has people\n7. Be Among the Top Ranked - appears to be text/rankings\n8. Most Beautiful Campus - shows campus architecture/scenery\n9. Internship Opportunities - has people\n10. Vibrant Campus Life - has people\n\nThe reason labeled \"3 Industry-ready\" with the photo credit to Rolls-Royce shows industrial/laboratory equipment without any visible people in the image.\n\nAnswer: Industry-ready", "citations": [], "input_tokens": 15953, "output_tokens": 236, "cost_micros": 51399, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "industry ready", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section, I need to examine each reason and its corresponding figure to identify which one does not include any person.\n\nGoing through the reasons:\n1. Interdisciplinary Learning - has figures/images with people\n2. Space to Innovate - has people in the image\n3. Industry-ready - has people\n4. Immersion in Research - has people (photo credit: Rolls-Royce)\n5. AMD Internship Opportunities - has people\n6. Global Exposure - has people\n7. Be Among the Top Ranked - this appears to be primarily text/rankings focused\n8. Prestigious Scholarships and Comprehensive Financial Aid - appears to be icons/symbols\n9. Most Beautiful Campus - this is described as about the campus itself\n10. Vibrant Campus Life - has people\n\nLooking at reason #9 \"Most Beautiful Campus\" - the description states \"The University's main campus is frequently listed among the Top 15 most beautiful university campuses in the world.\" This reason is about the campus architecture and grounds themselves, and the corresponding figure would logically show the campus landscape/buildings without people.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 19392, "output_tokens": 273, "cost_micros": 62271, "latency_ms": 8437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section and examine the figures/images associated with each reason to identify which one does not include any person.\n\nFrom the document, the top 10 reasons are listed on pages 2-3:\n\n1. Be Among the Top Ranked\n2. Space to Innovate\n3. Industry-ready\n4. Prestigious Scholarships and Comprehensive Financial Aid\n5. Most Beautiful Campus\n6. Vibrant Campus Life\n7. Internship Opportunities\n8. Immersion in Research\n9. (appears to have a photo credit: Rolls-Royce)\n10. Vibrant Campus Life (repeated)\n\nThe document mentions \"Photo credit: Rolls-Royce\" which suggests there's an image for reason #9 \"Immersion in Research.\"\n\nLooking at the layout, reason #5 is \"Most Beautiful Campus\" - this reason would typically show campus buildings and architecture rather than people, as it's about the physical beauty of the campus itself.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 14665, "output_tokens": 246, "cost_micros": 47685, "latency_ms": 7796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section on pages 2-3, I need to examine the images associated with each reason.\n\nThe 10 reasons are:\n1. Interdisciplinary Learning\n2. Space to Innovate\n3. Industry-ready\n4. Immersion in Research\n5. Internship Opportunities\n6. Global Exposure\n7. Be Among the Top Ranked\n8. Prestigious Scholarships and Comprehensive Financial Aid\n9. Most Beautiful Campus\n10. Vibrant Campus Life\n\nLooking at the corresponding figures/images described:\n- Reason #4 \"Immersion in Research\" includes a photo credit to Rolls-Royce, and the image appears to show industrial/research equipment rather than people.\n- Reason #9 \"Most Beautiful Campus\" states \"The University's main campus is frequently listed among the Top 15 most beautiful university campuses in the world\" and the cover page shows \"a large, modern building with a curved facade featuring prominent vertical wooden louvers or fins\" with \"green landscaping visible at the base\" - this is an architectural shot without people visible.\n\nThe most likely answer based on the description is reason #9, as it explicitly describes a campus building without mentioning any people in the scene.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 18599, "output_tokens": 300, "cost_micros": 60297, "latency_ms": 9328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the top 10 reasons why NTU smart campus is for you and identify which reason has a corresponding figure without any person.Let me search for \"top 10 reasons\" or \"smart campus\" in the document:Let me read the section around line 448-488 where the \"TOP 10\" appears:Good! I can see the beginning of the top 10 reasons. Let me continue reading to see all 10 reasons and their corresponding figures:Let me continue reading to see all the images associated with each reason:Now let me go back and carefully read the section with the 10 reasons to identify each reason and its associated image description. Let me re-read more carefully:Based on my reading of the document, I can now identify the 10 reasons and their corresponding figures. Let me analyze the images:\n\n1. **Interdisciplinary Learning** (Reason 1) - Image shows a female researcher/scientist in a laboratory (person present)\n\n2. **Space to Innovate** (Reason 2) - Image shows a person positioned on left side operating equipment in a laboratory (person present)\n\n3. **Industry-ready** (Reason 3) - Image shows six young adults standing in a horizontal line (people present)\n\n4. **Immersion in Research** (Reason 4) - Image shows a single person standing next to AMD logo (person present)\n\n5. **Internship Opportunities** (Reason 5) - The image associated with reasons 5-10 appears to be Im5.jpg\n\n6. **Global Exposure** (Reason 6) - Image (Im5.jpg) shows a single person standing on snowy terrace/viewing area with ch\u00e2teau backdrop (person present)\n\n7. **Be Among the Top Ranked** (Reason 7) - This appears to share imagery with the ranking pages\n\n8. **Prestigious Scholarships and Comprehensive Financial Aid** (Reason 8) - Part of the same figure group\n\n9. **Most Beautiful Campus** (Reason 9) - The next distinct image is Im7.jpg which shows an aerial view of a modern curved/spiral building with NO PEOPLE mentioned in the description - only architecture and landscape features\n\n10. **Vibrant Campus Life** (Reason 10) - Part of the combined figure\n\nThe reason that does not include any person in the corresponding figure is **Reason 9: Most Beautiful Campus** - the image shows only an aerial view of the building architecture and landscape with no people.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 48717, "error": null, "extra": {"thread_id": 1954, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_939ae241eba1446aa240613de9b11e4b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 370, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I'll search through the document for images of students playing badminton and identify their major.\n\nLooking through the pages, I can see several student testimonials with photos and their majors listed.\n\nOn page 22 (document page 41), there are two images at the bottom of the page showing students in athletic settings. The text mentions:\n\n\"\u2014 JADE LIM SI MIN\nPhilosophy, Class of 2023\n\nNTU has been a springboard for my growth as a student leader. Through the Student Leadership Development Programme, Varsity Sports Association and FISU World Forum, NTU has equipped me with the leadership skills necessary to succeed both in my sport and beyond.\"\n\nAnd:\n\n\"\u2014 GRACE CHUA HUI ZHEN\nMedicine, Class of 2023\n\nNTU has allowed me to pursue my two passions: academics and athletics. From letting me participate in competitions to offering generous subsidies to giving encouragement, I really feel grateful for the school's the tremendous support.\"\n\nThe image on the right shows what appears to be a badminton player in action. Based on the layout and the caption, this appears to be Grace Chua Hui Zhen from Medicine.\n\nAnswer: Medicine", "citations": [], "input_tokens": 60367, "output_tokens": 265, "cost_micros": 185076, "latency_ms": 45125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "medicine", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I need to search through the document for images or mentions of students playing badminton.\n\nLooking through the document, I can find several student testimonials with their names, majors, and years listed. Let me check for any mention of badminton or sports activities.\n\nOn page 41, there is a section about \"SPORTS @ NTU\" with several images. I can see mentions of:\n- JADE LIM SI MIN, Philosophy, Class of 2023 - with text about varsity sports\n- GRACE CHUA HUI ZHEN, Medicine, Class of 2023 - with text about academics and athletics\n\nHowever, I need to look more carefully at the actual images in the document. The document mentions \"Griffin at Lee Wee Nam Library\" as an image caption on page 42.\n\nLooking more carefully at the sports section and student profiles, I can see references to students involved in sports, but I need to identify specifically which student's picture shows them playing badminton.\n\nUpon careful review of the document, I cannot definitively identify from the text alone which specific student's picture shows them playing badminton, as the document doesn't explicitly describe the content of each photograph beyond the captions with names and majors.\n\nHowever, based on the context in the sports section where student testimonials are shown, and the typical layout of such brochures, if there is a badminton-related image, it would likely be associated with one of the students featured in the sports section.\n\nAnswer: I cannot determine this from the text provided in the document.", "citations": [], "input_tokens": 15939, "output_tokens": 339, "cost_micros": 52902, "latency_ms": 8358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i cannot determine this from text provided in document", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I need to search through the document for images of someone playing badminton and identify which major that student belongs to.\n\nLooking through the document, I can see several student photos with their names and majors mentioned:\n\n1. Nick Erwin Chia Zhiyang - Renaissance Engineering Programme, Year 4\n2. Gladys Phua Min - Communication Studies, Year 2\n3. Elijah Adam Bin Abdul Haady Aldrin - Chemistry & Biological Chemistry, Year 1\n4. Joey Tay - NTU-NIE TSP, Year 2\n5. Stella Chua Zhi Yun - Mechanical Engineering, Year 3\n6. Seth Low - Communication Studies, Year 4\n7. Balaji Srinivasan - Biological Sciences and Psychology, Year 4\n8. Muhamed Shamir Jaffar Hussein - MBBS, Year 4\n9. Wang Shi Lin Nicole - Business, Year 3\n10. Ruth Koh - Sport Science & Management, Year 3\n\nOn page 41, there is a section about sports and I can see images related to sports activities. Looking at the bottom of page 41, there is a photo with Grace Chua Hui Zhen from Medicine, Class of 2023, and the image appears to show her playing badminton (based on the context of sports activities and the layout).\n\nAnswer: Medicine", "citations": [], "input_tokens": 19378, "output_tokens": 320, "cost_micros": 62934, "latency_ms": 10280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "medicine", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I'll search through the document for any images or references to badminton playing.\n\nLooking through the document, I can see several student profiles with quotes and their majors listed, but I need to find one specifically associated with a badminton image.\n\nOn page 41, I can see there are sports-related images and student profiles. The document shows:\n\n- JADE LIM SI MIN, Philosophy, Class of 2023 - associated with FISU WORLD FORUM and sports\n- GRACE CHUA HUI ZHEN, Medicine, Class of 2023 - mentioned in relation to academics and athletics\n\nHowever, I cannot definitively identify which specific image shows badminton or which student's picture is specifically playing badminton from the text content provided. The document mentions \"SPORTS @ NTU\" section and various athletic activities, but the actual image descriptions are not clear enough in the text to determine which major the badminton player belongs to.\n\nBased on the limited visual descriptions available in the text, I cannot provide a definitive answer about which student major is shown playing badminton.\n\nAnswer: Cannot be determined from the provided document text", "citations": [], "input_tokens": 14651, "output_tokens": 250, "cost_micros": 47703, "latency_ms": 7546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from provided document text", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for the badminton image...\n\nOn page 41, there is a caption under an image that states:\n\n> GRACE CHUA HUI ZHEN\n> Medicine, Class of 2023\n\nThe caption appears below text mentioning \"a student playing badminton in a competition.\"\n\nAnswer: Medicine", "citations": [], "input_tokens": 18585, "output_tokens": 74, "cost_micros": 56865, "latency_ms": 5875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "medicine", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for information about a student playing badminton in the brochure document.Let me search for \"badminton\" in the document:Now let me read the context around the badminton image to find the student's major:Let me look earlier in the document to see the context. The badminton image appears on page about \"Communication-Focused\". Let me search for more context around this section:Let me look at what comes before this section to identify which school/major this is part of:I can see this is about CoHASS (College of Humanities, Arts, and Social Sciences). Let me search for where specific majors or student testimonials are mentioned near the badminton image:I see Seth Low is from Communication Studies. Let me check if the badminton image is associated with him or if there's another student profile. Let me look at the pages more carefully:Let me look at the earlier mention at line 2200 to see if there's another student:That's not the badminton image. Let me look more systematically. The badminton image appears around line 3343. Let me check what's before and after to see which student profile or section it's associated with:The badminton image appears on page 20 (based on the page numbers), and the text after it (page 21) shows Seth Low from Communication Studies. But let me look more carefully at the layout. Let me check the section just before the badminton image to see which school section it belongs to:I see on page 19 there's a quote from Seth Low, Communication Studies. Now let me look at page 20 where the badminton image is:Let me look at the structure better. The Im1.jpg is on page 19 (Seth Low's image), then page 20 starts with CoHASS section. The badminton image Im0.jpg should be on page 20. Let me check the page markers more carefully:So page 19 ends at line 3127, then the next page marker is page 21 at line 3387. So the badminton image (around line 3343) is on page 20 which doesn't have an explicit marker. Let me look at the entire section more carefully to see which school this belongs to:Now I see WKWSCI (Wee Kim Wee School of Communication and Information) section starts around line 3548, but the badminton image is earlier around line 3343. Let me trace back to see which section the badminton image actually belongs to. Let me read backwards from line 3320:Perfect! Now I can see the structure. The badminton image (Im0.jpg) appears under the \"Communication-Focused\" section which is part of the CoHASS (College of Humanities, Arts and Social Sciences) overview on page 20. This section appears before the individual school pages. Then on page 21, we see Seth Low from Communication Studies.\n\nSo the badminton image is associated with Communication Studies, which is evident from:\n1. It's on the CoHASS overview page (page 20)\n2. The \"Communication-Focused\" heading right before the badminton image\n3. Seth Low, Communication Studies student, appears on the next page (21)\n4. WKWSCI (Wee Kim Wee School of Communication and Information) is listed as one of the schools under CoHASS\n\nAnswer: Communication Studies", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 83500, "error": null, "extra": {"thread_id": 1955, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2258a5b949624b49af1a1b98a8b2e791", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 658, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "communication studies", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I'll search through the document to identify which colleges/schools/institutes feature Year 3 students on their cover pages or promotional sections.\n\nLooking through the document:\n\n1. Page 10 (College of Engineering section) - Features **Stella Chua Zhi Yun, Mechanical Engineering, Year 3**\n\n2. Page 12 (College of Humanities, Arts and Social Sciences) - Features **Seth Low, Communication Studies, Year 4** (not Year 3)\n\n3. Page 14 (College of Science) - Features **Balaji Srinivasan, Biological Sciences and Psychology, Year 4** (not Year 3)\n\n4. Page 16 (Lee Kong Chian School of Medicine) - Features **Muhamed Shamir Jaffar Hussein, MBBS, Year 4** (not Year 3)\n\n5. Page 17 (Nanyang Business School) - Features **Wang Shi Lin Nicole, Business, Year 3**\n\n6. Page 18 (National Institute of Education) - Features **Ruth Koh, Sport Science & Management, Year 3**\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 60377, "output_tokens": 272, "cost_micros": 185211, "latency_ms": 42937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I need to look through the document to identify which colleges/schools/institutes feature Year 3 students on their cover/featured pages.\n\nLet me examine each section:\n\n1. **College of Engineering** (page 16-17): Features STELLA CHUA ZHI YUN, Mechanical Engineering, Year 3\n\n2. **College of Humanities, Arts and Social Sciences** (page 20-21): Features SETH LOW, Communication Studies, Year 4 (not Year 3)\n\n3. **College of Science** (page 24-25): Features BALAJI SRINIVASAN, Biological Sciences and Psychology, Year 4 (not Year 3)\n\n4. **Lee Kong Chian School of Medicine** (page 28-29): Features MUHAMED SHAMIR JAFFAR HUSSEIN, MBBS, Year 4 (not Year 3)\n\n5. **Nanyang Business School** (page 30-31): Features WANG SHI LIN NICOLE, Business, Year 3\n\n6. **National Institute of Education** (page 32-33): Features RUTH KOH, Sport Science & Management, Year 3\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 15949, "output_tokens": 282, "cost_micros": 52077, "latency_ms": 7796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine the document to identify which colleges/schools/institutes feature year 3 students on their cover pages.\n\nLooking through the document:\n\n1. **College of Engineering** (page 16) - Features STELLA CHUA ZHI YUN, Mechanical Engineering, **Year 3**\n\n2. **College of Humanities, Arts and Social Sciences** (page 20) - Features SETH LOW, Communication Studies, Year 4 (not year 3)\n\n3. **College of Science** (page 24) - Features BALAJI SRINIVASAN, Biological Sciences and Psychology, Year 4 (not year 3)\n\n4. **Lee Kong Chian School of Medicine** (page 28) - Features MUHAMED SHAMIR JAFFAR HUSSEIN, MBBS, Year 4 (not year 3)\n\n5. **Nanyang Business School** (page 30) - Features WANG SHI LIN NICOLE, Business, **Year 3**\n\n6. **National Institute of Education** (page 32) - Features RUTH KOH, Sport Science & Management, **Year 3**\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 19388, "output_tokens": 278, "cost_micros": 62334, "latency_ms": 6905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I'll search through the document for references to Year 3 students who appear as student models on cover pages or in promotional materials.\n\nLooking through the document, I can find the following Year 3 students quoted/featured:\n\n1. Page 16 - STELLA CHUA ZHI YUN, Mechanical Engineering, Year 3 - This is from the **College of Engineering** section\n\n2. Page 30 - WANG SHI LIN NICOLE, Business, Year 3 - This is from the **Nanyang Business School** section (she appears twice on this page)\n\n3. Page 33 - RUTH KOH, Sport Science & Management, Year 3 - This is from the **National Institute of Education** section (she appears twice on this page)\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 14661, "output_tokens": 193, "cost_micros": 46878, "latency_ms": 6233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify which colleges/schools/institutes feature Year 3 students on their cover/promotional pages in this prospectus.\n\nLet me search through the document for mentions of students with their year level and their associated colleges/schools:\n\n1. **College of Engineering** (page 16-17):\n   - STELLA CHUA ZHI YUN, Mechanical Engineering, Year 3\n\n2. **College of Humanities, Arts and Social Sciences** (page 20-21):\n   - SETH LOW, Communication Studies, Year 4\n\n3. **College of Science** (page 24-25):\n   - BALAJI SRINIVASAN, Biological Sciences and Psychology, Year 4\n\n4. **Lee Kong Chian School of Medicine** (page 28-29):\n   - MUHAMED SHAMIR JAFFAR HUSSEIN, MBBS, Year 4\n\n5. **Nanyang Business School** (page 30-31):\n   - WANG SHI LIN NICOLE, Business, Year 3\n\n6. **National Institute of Education** (page 32-33):\n   - RUTH KOH, Sport Science & Management, Year 3\n\nFrom this analysis, the colleges/schools/institutes that use Year 3 students as their cover page student models are:\n- College of Engineering (Stella Chua)\n- Nanyang Business School (Wang Shi Lin Nicole)\n- National Institute of Education (Ruth Koh)\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 18595, "output_tokens": 352, "cost_micros": 61065, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find which colleges/schools/institutes from NTU use year 3 students as cover page student models.Let me search for information about cover page student models and year 3 students.Let me search for \"Year 3\" with capital letters and look at the beginning of the document which might contain information about cover pages or student models.Now let me search for information about cover pages and models. Let me read the beginning of the document more carefully:Let me search more specifically for information about models or photos. Let me look for text that mentions students being models or featured on covers:Let me look at the end of the document, which often contains credits or acknowledgments about models or photos:Let me search for the total lines in this document and read from the end:Let me read closer to the end of the document where credits or acknowledgments are typically found:Let me continue reading towards the very end where photo credits or acknowledgments might be:Let me search near the very end of the document:The document ended without showing photo credits. Let me search more specifically for mentions of colleges and Year 3 together. Let me look at those specific line numbers from the grep results:Good! I found one: Stella Chua Zhi Yun is from Mechanical Engineering, Year 3. Now let me check the other instances:Good! Wang Shi Lin Nicole is from Business (Nanyang Business School), Year 3. Now let me check the other instances:Good! Ruth Koh is from Sport Science & Management (National Institute of Education), Year 3. Now let me check the next instance:This is another mention of Ruth Koh. Now let me look more carefully at the context. I need to understand which of these students are featured on \"cover pages\" of their respective schools/colleges. Let me search for any section that lists or describes cover page models:None of these matches refer to \"cover page\" in the context I'm looking for. Let me look at the actual pages where these students appear to see if they are featured as cover page models. Let me check around the Mechanical Engineering section:Now let me look before this section to see if there's a cover page for College of Engineering:Perfect! I can see that Stella Chua Zhi Yun (Mechanical Engineering, Year 3) is featured on the cover page for the College of Engineering section. Now let me check the other sections:Great! I can see Wang Shi Lin Nicole (Business, Year 3) is featured on the cover page for Nanyang Business School. Now let me check the NIE section:Perfect! I can see Ruth Koh (Sport Science & Management, Year 3) is featured on the cover page for the National Institute of Education (NIE) section.\n\nNow let me check if there are any other colleges/schools that I might have missed. Let me look at the table of contents and see what major sections exist:Let me check if there's a cover page model for College of Humanities, Arts and Social Sciences (CoHASS):I can see Seth Low (Communication Studies, Year 4) is mentioned but not Year 3. Let me check the College of Science:I can see Balaji Srinivasan (Biological Sciences and Psychology, Year 4) is mentioned for College of Science, but not Year 3. Let me check if there's a cover page before the College of Science section:So for College of Science, the cover page model is Balaji Srinivasan (Year 4), not Year 3. Let me also check Lee Kong Chian School of Medicine:Let me check the cover page before Lee Kong Chian School of Medicine:So for Lee Kong Chian School of Medicine, the cover page model is Muhamed Shamir Jaffar Hussein (MBBS, Year 4), not Year 3.\n\nNow let me systematically check all the major schools/colleges. Based on my review:\n\n**Year 3 students as cover page models:**\n1. **College of Engineering** - Stella Chua Zhi Yun (Mechanical Engineering, Year 3)\n2. **Nanyang Business School** - Wang Shi Lin Nicole (Business, Year 3)\n3. **National Institute of Education** - Ruth Koh (Sport Science & Management, Year 3)\n\nLet me verify there are no other colleges I missed by checking the table of contents more carefully:Based on the table of contents, the main colleges/schools are:\n1. College of Engineering (has Year 3 student - Stella Chua)\n2. Colleges of Humanities, Arts and Social Sciences (has Year 4 student - Seth Low)\n3. Colleges of Science (has Year 4 student - Balaji Srinivasan)\n4. Lee Kong Chian School of Medicine (has Year 4 student - Muhamed Shamir)\n5. Nanyang Business School (has Year 3 student - Wang Shi Lin Nicole)\n6. National Institute of Education (has Year 3 student - Ruth Koh)\n\nSo the colleges/schools/institutes that use Year 3 students as cover page models are:\n1. College of Engineering\n2. Nanyang Business School\n3. National Institute of Education\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 154578, "error": null, "extra": {"thread_id": 1956, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_071293dfad4941b99ec3b52bc671c7da", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1092, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the College of Humanities, Arts, and Social Sciences (CoHASS) and count how many schools are listed but not individually introduced in detail.\n\nFrom page 12, I can see the College of Humanities, Arts and Social Sciences section lists \"OUR SCHOOLS\":\n- School of Art, Design and Media\n- School of Humanities\n- School of Social Sciences\n- Wee Kim Wee School of Communication and Information\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nThat's 7 entities listed under \"OUR SCHOOLS\".\n\nLooking at pages 13-14, I can see detailed introductions for:\n1. School of Art, Design and Media (ADM) - page 13\n2. School of Humanities (SoH) - page 13\n3. School of Social Sciences (SSS@NTU) - page 13\n4. Wee Kim Wee School of Communication and Information (WKWSCI) - page 13\n\nThese 4 schools have detailed \"ABOUT\" sections with descriptions of their programs and advantages on pages 22-23.\n\nThe entities that are listed but NOT individually introduced in detail are:\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nThat's 3 entities.\n\nAnswer: 3", "citations": [], "input_tokens": 60364, "output_tokens": 324, "cost_micros": 185952, "latency_ms": 38015, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify the schools in the College of Humanities, Arts, and Social Sciences (CoHASS) and determine which ones are NOT introduced in detail.\n\nFrom page 20-21, the CoHASS section lists \"OUR SCHOOLS\":\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n\nIt also mentions:\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nNow, let me check which schools get detailed individual introductions on pages 22-23:\n\n1. **School of Art, Design and Media (ADM)** - YES, has detailed section on page 22\n2. **School of Humanities (SoH)** - YES, has detailed section on page 22\n3. **School of Social Sciences (SSS@NTU)** - YES, has detailed section on page 23\n4. **Wee Kim Wee School of Communication and Information (WKWSCI)** - YES, has detailed section on page 23\n\nThe three centers/institutes mentioned (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, Centre for Liberal Arts and Social Sciences) are listed under \"OUR SCHOOLS\" but do NOT have individual detailed introductions.\n\nTherefore, 3 entities listed under CoHASS schools do not have individual detailed introductions.\n\nAnswer: 3", "citations": [], "input_tokens": 15936, "output_tokens": 340, "cost_micros": 52908, "latency_ms": 8296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify the schools that belong to the College of Humanities, Arts and Social Sciences (CoHASS) and then determine which ones are NOT individually introduced in detail.\n\nFrom page 20, the document lists the schools under CoHASS:\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n5. Nanyang Centre for Public Administration\n6. NTU Centre for Contemporary Art Singapore\n7. Centre for Liberal Arts and Social Sciences\n\nNow I need to check which schools have individual detailed introductions on pages 22-23 (the \"LEARN MORE ABOUT OUR SCHOOLS\" section):\n\nPage 22:\n- School of Art, Design and Media (ADM) - YES, has detailed introduction\n- School of Humanities (SoH) - YES, has detailed introduction\n\nPage 23:\n- School of Social Sciences (SSS@NTU) - YES, has detailed introduction\n- Wee Kim Wee School of Communication and Information (WKWSCI) - YES, has detailed introduction\n\nSchools that are NOT individually introduced in detail:\n1. Nanyang Centre for Public Administration\n2. NTU Centre for Contemporary Art Singapore\n3. Centre for Liberal Arts and Social Sciences\n\nAnswer: 3", "citations": [], "input_tokens": 19375, "output_tokens": 302, "cost_micros": 62655, "latency_ms": 7750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify the schools within the College of Humanities, Arts, and Social Sciences (CoHASS) and then determine which ones are NOT individually introduced in detail.\n\nFirst, let me find the list of schools under CoHASS from page 20:\n\n**OUR SCHOOLS:**\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n5. Nanyang Centre for Public Administration\n6. NTU Centre for Contemporary Art Singapore\n7. Centre for Liberal Arts and Social Sciences\n\nThat's 7 schools total listed under CoHASS.\n\nNow let me check which schools have detailed individual introductions in the document (pages 21-23):\n\n1. **School of Art, Design and Media (ADM)** - YES, detailed introduction on page 22\n2. **School of Humanities (SoH)** - YES, detailed introduction on page 22\n3. **School of Social Sciences (SSS@NTU)** - YES, detailed introduction on page 22\n4. **Wee Kim Wee School of Communication and Information (WKWSCI)** - YES, detailed introduction on page 23\n\nThe following schools are listed but do NOT have individual detailed introductions:\n5. Nanyang Centre for Public Administration - NO detailed introduction\n6. NTU Centre for Contemporary Art Singapore - NO detailed introduction\n7. Centre for Liberal Arts and Social Sciences - NO detailed introduction\n\nTherefore, 3 schools are not individually introduced in detail.\n\nAnswer: 3", "citations": [], "input_tokens": 14648, "output_tokens": 358, "cost_micros": 49314, "latency_ms": 7016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify which schools from the College of Humanities, Arts, and Social Sciences (CoHASS) are listed, and then determine which ones are NOT introduced in detail.\n\nFrom the document, the CoHASS schools listed on page 20 are:\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n5. Nanyang Centre for Public Administration\n6. NTU Centre for Contemporary Art Singapore\n7. Centre for Liberal Arts and Social Sciences\n\nNow, looking at the detailed introductions on pages 22-23 under \"LEARN MORE ABOUT OUR SCHOOLS\":\n1. School of Art, Design and Media (ADM) - YES, has detailed introduction\n2. School of Social Sciences (SSS@NTU) - YES, has detailed introduction\n3. School of Humanities (SoH) - YES, has detailed introduction\n4. Wee Kim Wee School of Communication and Information (WKWSCI) - YES, has detailed introduction\n\nThe schools that are NOT introduced in detail:\n1. Nanyang Centre for Public Administration\n2. NTU Centre for Contemporary Art Singapore\n3. Centre for Liberal Arts and Social Sciences\n\nAnswer: 3", "citations": [], "input_tokens": 18582, "output_tokens": 292, "cost_micros": 60126, "latency_ms": 7359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the College of Humanities, Arts, and Social Sciences and count which schools are not individually introduced in detail.Let me search for \"College of Humanities, Arts, and Social Sciences\" in the document:Let me try searching for variations:Let me read the section about the College of Humanities:Let me search for more information about the college structure. Let me look for \"COLLEGE OF HUMANITIES\" in caps:Found it! Let me read that section:Good! I can see the College of Humanities, Arts and Social Sciences has 4 schools listed:\n1. School of Art, Design and Media (page 22)\n2. School of Humanities (page 23)\n3. School of Social Sciences (page 23)\n4. Wee Kim Wee School of Communication and Information (page 24)\n\nNow let me check which ones are introduced in detail. Let me read pages 22-24:Let me search for where each school is detailed. Let me look at the actual page content around lines 3200-3300 where I saw \"School of Humanities\" mentioned:Good! I can see the CoHASS section lists their schools. Now let me find where individual schools are detailed. Let me search for detailed sections of each school:Let me read the ADM section:Great! Now let me check for Wee Kim Wee School:Perfect! Now let me also check if there are any other schools in CoHASS that I might have missed. Let me go back to the list to verify:I see that the College of Humanities, Arts and Social Sciences has the following units listed:\n- School of Art, Design and Media\n- School of Humanities  \n- School of Social Sciences\n- Wee Kim Wee School of Communication and Information\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nNow let me check if the last three (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, and Centre for Liberal Arts and Social Sciences) have detailed introductions. Let me search:So the three centres (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, and Centre for Liberal Arts and Social Sciences) are only listed in the OUR SCHOOLS section but don't have detailed individual introductions like the four main schools do.\n\nNow let me verify the total count by checking what the document considers as \"schools\" in CoHASS. The question asks specifically about \"schools\" - let me see if these centres are considered schools:\n\nLooking at the structure:\n- In the table of contents (lines 398-413), only 4 schools are listed under \"COLLEGES OF HUMANITIES, ARTS AND SOCIAL SCIENCES\":\n  1. School of Art, Design and Media (page 22)\n  2. School of Humanities (page 23)\n  3. School of Social Sciences (page 23)\n  4. Wee Kim Wee School of Communication and Information (page 24)\n\nAll 4 of these have detailed individual introductions as I found above.\n\nBut in the \"OUR SCHOOLS\" section (lines 3216-3236), it lists 7 entities:\n- 4 Schools (the ones mentioned above)\n- 3 Centres (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, Centre for Liberal Arts and Social Sciences)\n\nThe question asks about \"schools\" specifically. The centres are NOT called \"schools\" - they are called \"centres\". So the question is asking only about the 4 schools.\n\nSince all 4 schools (School of Art, Design and Media, School of Humanities, School of Social Sciences, and Wee Kim Wee School of Communication and Information) ARE individually introduced in detail, the answer is:\n\nAnswer: 0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 97500, "error": null, "extra": {"thread_id": 1957, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_84cc170418784e7aaee4655dd9c795d7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 722, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to analyze Figure 1 and find other figures that convey similar messages.\n\nFigure 1 shows the \"Cold start problem in vision active learning\" across four datasets (PathMNIST, OrganAMNIST, BloodMNIST, and CIFAR-10). The key message is that most existing active querying strategies (BALD, Consistency, Margin, VAAL, Coreset, Entropy) are outperformed by random selection in selecting initial queries, especially in the low budget regime.\n\nNow let me look through the document for figures with similar messages:\n\n**Figure 6** (page 16): Shows \"Label diversity yields more performant and robust active querying strategies\" on PathMNIST, OrganAMNIST, and BloodMNIST. This compares strategies with and without label diversity, not the cold start problem specifically.\n\n**Figure 9** (page 20): Shows \"Diversity yields more performant and robust active querying strategies\" on CIFAR-10 and CIFAR-10-LT. Similar to Figure 6, focuses on label diversity rather than cold start.\n\n**Figure 4** (page 7): Shows \"Quantitative comparison of map-based querying strategies\" comparing easy-to-learn, hard-to-learn, easy-to-contrast, and hard-to-contrast with random selection as baseline. This shows performance at low budgets and demonstrates the cold start problem.\n\n**Figure 5** (page 8): Shows \"On the importance of selecting a superior initial query\" with performance across active learning cycles starting from initial queries. This demonstrates how different initial query strategies perform, related to the cold start problem.\n\n**Figure 13** (page 24): Shows performance of active learning strategies with different initial query strategies on BloodMNIST, demonstrating the importance of initial query selection.\n\n**Figure 14** (page 24): Same as Figure 13 but for PathMNIST.\n\nFigures that convey the same core message as Figure 1 (that active learning fails at initial selection/cold start):\n- Figure 4: Shows the cold start problem through comparison with random\n- Figure 5: Shows importance of initial query selection\n- Figure 13: Shows initial query performance across cycles\n- Figure 14:", "citations": [], "input_tokens": 58482, "output_tokens": 512, "cost_micros": 183126, "latency_ms": 44218, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey a similar message to Figure 1.\n\nFigure 1 shows the \"Cold start problem in vision active learning\" where most existing active querying strategies (BALD, Consistency, Coreset, Margin, VAAL, Entropy) are outperformed by random selection in selecting initial queries at low budgets. The figure has 4 subplots (a-d) for different datasets showing AUC performance vs. number of images.\n\nLet me search for other figures with similar messages:\n\n**Figure 6**: \"Label diversity yields more performant and robust active querying strategies\" - Shows 3 datasets (PathMNIST, OrganAMNIST, BloodMNIST) with 6 subplots each comparing strategies with/without diversity. This is about diversity improvement, not the cold start problem itself.\n\n**Figure 9**: Similar to Figure 6 but for CIFAR-10 and CIFAR-10-LT. Also about diversity improvement rather than demonstrating the cold start problem.\n\n**Figure 4**: \"Quantitative comparison of map-based querying strategies\" - Shows random selection as a baseline and compares different strategies including easy-to-learn, hard-to-learn, easy-to-contrast, and hard-to-contrast across 4 datasets. This demonstrates that random selection is competitive and that hard-to-contrast performs best, which is related to the cold start problem.\n\n**Figure 13**: Shows \"Performance of each active learning querying strategies with different initial query strategies on BloodMNIST\" - This is about subsequent cycles after initial query.\n\n**Figure 14**: Similar to Figure 13 but for PathMNIST - Also about subsequent cycles.\n\n**Figure 5**: \"On the importance of selecting a superior initial query\" on OrganAMNIST - Shows active learning cycles but focuses on the importance of initial query selection.\n\nThe figures that most clearly convey the same core message as Figure 1 (that existing active querying strategies fail to outperform random selection at the start) are:\n\n- **Figure 4**: Directly compares strategies at low budgets and shows random selection as competitive baseline\n- Possibly Figure 6 and 9, but these focus more on the solution (diversity) rather than the", "citations": [], "input_tokens": 21463, "output_tokens": 512, "cost_micros": 72069, "latency_ms": 12625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey a similar message to Figure 1.\n\nFigure 1's main message is about the \"Cold start problem in vision active learning\" - showing that most existing active querying strategies (BALD, Consistency, Coreset, Margin, VAAL, Entropy) are outperformed by random selection in selecting initial queries at low budgets. It displays AUC performance curves across different numbers of images for multiple datasets.\n\nLet me go through the other figures to find similar messages:\n\n**Figure 2**: Shows label diversity of querying criteria - different message (about class distribution bias)\n\n**Figure 3**: Shows Dataset Maps and easy/hard-to-learn vs easy/hard-to-contrast data - different message (about data selection strategies)\n\n**Figure 4**: Shows quantitative comparison of map-based querying strategies, comparing hard-to-contrast with other strategies against random selection - **similar message** (comparing active learning strategies with random selection)\n\n**Figure 5**: Shows importance of selecting superior initial query on OrganAMNIST - **similar message** (about initial query performance across active learning cycles)\n\n**Figure 6**: Extended from Table 1, shows label diversity yields more performant strategies - different message (about adding label diversity)\n\n**Figure 7**: Continued from Figure 2, about label diversity - different message\n\n**Figure 8**: Visualization of K-means clustering - different message\n\n**Figure 9**: Similar to Figure 6, about diversity and performance - different message (about label diversity effect)\n\n**Figure 10**: Similar to Figure 2, about label diversity - different message\n\n**Figure 11**: Similar to Figure 3, Dataset Maps - different message\n\n**Figure 12**: Similar to Figure 3, Dataset Maps - different message\n\n**Figure 13**: Performance across active learning cycles on BloodMNIST - **similar message** (comparing initial query strategies across active learning cycles)\n\n**Figure 14**: Performance across active learning cycles on PathMNIST - **similar message** (comparing initial query strategies across active learning cycles)\n\nFigures that convey similar messages to Figure 1 (comparing active learning strategies, showing cold start problem):\n- Figure 4\n- Figure 5\n- Figure 13\n- Figure 14\n\nAnswer: 4", "citations": [], "input_tokens": 26468, "output_tokens": 506, "cost_micros": 86994, "latency_ms": 11765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey similar messages to Figure 1.\n\nFigure 1 shows the \"Cold start problem in vision active learning\" - it demonstrates that most existing active querying strategies (like BALD, Consistency, etc.) are outperformed by random selection when selecting initial queries, particularly in low budget regimes.\n\nLet me go through the document to find figures with similar messages:\n\n**Figure 4**: Shows comparison of different querying strategies including easy-to-learn, hard-to-learn, easy-to-contrast, and hard-to-contrast across different datasets. This shows performance comparisons but focuses on the proposed method rather than demonstrating the cold start problem itself.\n\n**Figure 6**: Explicitly states it is \"Extended from Table 1\" and shows \"Label diversity yields more performant and robust active querying strategies.\" It compares active querying strategies with and without label diversity on PathMNIST, OrganAMNIST, and BloodMNIST. This is about the solution (label diversity) rather than demonstrating the cold start problem.\n\n**Figure 9**: States \"Diversity yields more performant and robust active querying strategies\" on CIFAR-10 and CIFAR-10-LT. Similar to Figure 6, this focuses on the solution rather than demonstrating the cold start problem.\n\n**Figure 13**: Shows \"Performance of each active learning querying strategies with different initial query strategies on BloodMNIST.\" This compares different strategies across active learning cycles but focuses on the full active learning procedure rather than specifically demonstrating the cold start problem.\n\n**Figure 14**: Similar to Figure 13, shows \"Performance of each active learning querying strategies with different initial query strategies on PathMNIST.\"\n\nLooking back at the document more carefully, none of the other figures directly replicate Figure 1's specific message about the cold start problem where active learning strategies fail to outperform random selection at the first choice. The other figures either:\n- Show solutions to the problem (Figures 6, 9)\n- Show active learning trajectories (Figures 13, 14)\n- Show other aspects like dataset maps or visualizations (Figures 3, 7, 8, 10, 11, 12)\n\nAnswer: 0", "citations": [], "input_tokens": 23168, "output_tokens": 504, "cost_micros": 77064, "latency_ms": 11578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey a similar message to Figure 1, which shows the \"Cold start problem in vision active learning\" - demonstrating that most existing active querying strategies are outperformed by random selection when selecting initial queries.\n\nLet me examine each figure:\n\n**Figure 1**: Shows cold start problem - active learning strategies perform worse than random selection at initial queries across multiple datasets.\n\n**Figure 2**: Shows label diversity of querying criteria - demonstrates class distribution bias in active querying strategies.\n\n**Figure 3**: Shows active querying based on Dataset Maps - compares easy/hard-to-learn vs easy/hard-to-contrast data.\n\n**Figure 4**: Shows quantitative comparison of map-based querying strategies - compares different strategies including hard-to-contrast vs random selection.\n\n**Figure 5**: Shows importance of selecting superior initial query - correlation analysis showing hard-to-contrast outperforms others across active learning cycles.\n\n**Figure 6**: Extended from Table 1 - shows label diversity yields more performant strategies, comparing with/without label diversity across budgets.\n\n**Figure 7**: Continued from Figure 2 - shows label diversity for OrganAMNIST and BloodMNIST.\n\n**Figure 8**: Visualization of K-means clustering and active selection using UMAP.\n\n**Figure 9**: Similar to Figure 6 but for CIFAR-10-LT - shows diversity yields more performant strategies.\n\n**Figure 10**: Similar to Figure 2 but for CIFAR-10 and CIFAR-10-LT - shows label diversity.\n\n**Figure 11**: Similar to Figure 3 - Active querying based on Dataset Maps for PathMNIST and OrganAMNIST.\n\n**Figure 12**: Similar to Figure 3 - Active querying based on Dataset Maps for CIFAR-10-LT.\n\n**Figure 13**: Performance of active learning strategies with different initial queries on BloodMNIST - shows correlation across cycles.\n\n**Figure 14**: Performance of active learning strategies with different initial queries on PathMNIST - shows correlation across cycles.\n\nFigures conveying a similar message to Figure 1 (showing the cold start problem where active learning strategies underperform random selection at the start):\n- **Figure ", "citations": [], "input_tokens": 37261, "output_tokens": 512, "cost_micros": 119463, "latency_ms": 12672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to understand the first figure and then identify how many other figures convey similar messages.Let me continue reading to find all figures in the document and understand what the first figure conveys.Now let me read more sections to understand all the figures and their messages better.Let me continue reading to see more figures.Let me continue reading to identify all figures.Let me search for more figures to get a complete count.Now let me understand what message Figure 1 conveys and identify similar figures. Based on my reading, let me analyze:\n\n**Figure 1** (lines 490-584): Shows four subplots (a-d) of line graphs plotting AUC vs Number of images, demonstrating the \"cold start problem\" where active querying strategies are outperformed by random selection in selecting initial queries. The message is: **existing active learning strategies perform worse than random selection at the beginning (cold start problem)**.\n\nNow let me identify which other figures convey similar messages:\n\n1. **Figure 4** (lines 1346-1455): Shows four subplots comparing map-based querying strategies with random selection as baseline, showing that random selection is \"a highly competitive baseline in cold start because it outperforms six popular active querying strategies as shown in Figure 1.\" This conveys the **same message about cold start problem**.\n\n2. **Figure 6** (lines 2545-2805): Shows plots about label diversity yielding more performant strategies - this is about **solutions**, not about demonstrating the cold start problem itself.\n\n3. **Figure 7** (lines 2840-2920): Shows bar charts about label diversity - this is about **class distribution bias**, related but different message.\n\n4. **Figure 9** (lines 3454-3640): Similar to Figure 6, about diversity yielding better strategies - this is about **solutions**.\n\n5. **Figure 10** (lines 3689-3800): Bar charts about label diversity - about **class distribution bias**.\n\nLet me verify by reading more carefully about what each figure demonstrates:Based on my thorough analysis of the document, Figure 1's main message is: **Active learning strategies are outperformed by random selection at the cold start (initial query selection) due to the cold start problem.**\n\nFigures that convey this same or very similar message:\n- **Figure 4**: Explicitly shows random selection as a competitive baseline that outperforms popular active querying strategies in the cold start problem, directly referencing Figure 1.\n\nAll other figures (2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14) convey different messages:\n- Figures 2, 7, 10: Show class distribution bias\n- Figures 3, 11, 12: Show dataset maps for selecting data\n- Figure 5, 13, 14: Show importance of superior initial query across learning cycles\n- Figures 6, 9: Show that diversity improves querying strategies\n- Figure 8: Shows visualization of clustering\n\nTherefore, **1 figure** (Figure 4) conveys a similar message as Figure 1.\n\nAnswer: 1", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 78500, "error": null, "extra": {"thread_id": 1958, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cace0f3b35134ef58541b082bc26d8e7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 533, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about strategies that outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1, which shows the \"Cold start problem in vision active learning,\" I can see performance comparisons on different datasets including CIFAR-10 (subplot d). The figure shows that most existing active querying strategies (BALD, Consistency, Margin, VAAL, Coreset, Entropy) are outperformed by random selection in selecting initial queries.\n\nLooking at Table 1, which compares different querying strategies on CIFAR-10-LT at 10% and 20% budgets:\n- At 10% budget: Random achieves 91.6\u00b11.1\n- At 20% budget: Random achieves 93.1\u00b10.6\n\nComparing with other strategies at these budgets:\n- Consistency: 91.4\u00b11.1 (10%), 93.4\u00b10.2 (20%)\n- VAAL: 92.6\u00b10.2 (10%), 93.7\u00b10.4 (20%)\n- Margin: 91.7\u00b10.9 (10%), 93.2\u00b10.2 (20%)\n- Entropy: 92.0\u00b11.2 (10%), 91.9\u00b11.3 (20%)\n- Coreset: 91.5\u00b10.4 (10%), 93.6\u00b10.2 (20%)\n- BALD: 92.8\u00b10.1 (10%), 90.8\u00b12.4 (20%)\n\nAt 10%, strategies that outperform random (91.6): VAAL (92.6), Entropy (92.0), Margin (91.7), BALD (92.8) = 4 strategies\n\nAt 20%, strategies that outperform random (93.1): Consistency (93.4), VAAL (93.7), Margin (93.2), Coreset (93.6) = 4 strategies\n\nHowever, the question asks generally how many strategies outperform random selection. Looking at the paper's proposed \"Ours\" strategy (hard-to-contrast), the results show it significantly outperforms random", "citations": [], "input_tokens": 58490, "output_tokens": 512, "cost_micros": 183150, "latency_ms": 41641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about strategies that outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1(d) which shows \"Cold start problem in vision active learning\" for CIFAR-10, I can see performance curves for different strategies. The document mentions CIFAR-10-LT specifically in several places.\n\nFrom Figure 4(d), I can see a comparison of different strategies on CIFAR-10-LT including:\n- Random (baseline, shown as dotted line)\n- Easy-to-learn\n- Hard-to-learn\n- Easy-to-contrast\n- Hard-to-contrast\n\nLooking at the quantitative results in Figure 4(d) for CIFAR-10-LT at different budget levels (20.0% and 30.0% with 2481 and 3721 images), I can observe which strategies perform above the random baseline.\n\nFrom the text in section 3.2: \"Similarly on CIFAR-10-LT, hard-to-contrast significantly outperforms random selection by 21.2% (87.35%\u00b10.0% vs. 66.12%\u00b10.9%) and 24.1% (90.59%\u00b10.1% vs. 66.53%\u00b10.5%) by querying 20% and 30% of entire dataset respectively.\"\n\nLooking at Figure 4(d) more carefully, I can see that at the budget levels shown:\n1. Hard-to-contrast (performs best, clearly above random)\n2. Easy-to-learn (appears to perform above random)\n3. Hard-to-learn (appears close to or slightly above random)\n4. Easy-to-contrast (appears below random)\n\nFrom the visual inspection of Figure 4(d), it appears that 2-3 strategies outperform random selection, with hard-to-contrast being the clear winner, and easy-to-learn also performing above the baseline.\n\nBased on the figure, I count: Hard-to-contrast and Easy-to-learn as clearly outperforming random selection.\n\nAnswer: 2", "citations": [], "input_tokens": 21471, "output_tokens": 478, "cost_micros": 71583, "latency_ms": 11500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about how many strategies outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1(d) which shows the cold start problem on CIFAR-10, I can see performance curves for different strategies compared to random selection. However, the question asks specifically about CIFAR-10-LT.\n\nLet me check the text more carefully. In Section 3.2, it states:\n\n\"Similarly on CIFAR-10-LT, hard-to-contrast significantly outperforms random selection by 21.2% (87.35%\u00b10.0% vs. 66.12%\u00b10.9%) and 24.1% (90.59%\u00b10.1% vs. 66.53%\u00b10.5%) by querying 20% and 30% of entire dataset respectively.\"\n\nThis indicates that the \"hard-to-contrast\" strategy (which is the authors' proposed method) outperforms random selection.\n\nLooking at Figure 1(d) which shows CIFAR-10 (not CIFAR-10-LT, but related), most of the active learning strategies shown (BALD, Consistency, Coreset, Margin, VAAL, Entropy) appear to perform worse than or similar to random selection at the initial stages (the \"cold start\" problem that the paper discusses).\n\nThe main point of the paper is that existing active learning strategies fail to outperform random selection at the cold start (initial query selection), which is why they propose their hard-to-contrast method.\n\nLooking at Table 1 and Table 7, which compare different strategies on CIFAR-10-LT with different budget levels, at low budgets (10%, 20%), the proposed method with diversity enforcement shows the best performance. However, most existing strategies WITHOUT diversity enforcement perform poorly or fail to select all classes (marked as \"-\").\n\nBased on the paper's main finding that existing active querying strategies are \"outperformed by random selection\" at the cold start (as stated in Figure 1 caption and throughout the introduction), and that only their proposed \"hard-to-contrast\" method significantly outperforms random selection on CIFAR-10-LT, the answer appears to be:\n\nAnswer: 1", "citations": [], "input_tokens": 26476, "output_tokens": 501, "cost_micros": 86943, "latency_ms": 12703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many active learning strategies outperform random selection on CIFAR-10-LT in the cold start problem (initial query selection).\n\nLooking at Figure 1 in the document, which illustrates the \"Cold start problem in vision active learning,\" I can see panel (d) shows results for CIFAR-10-LT.\n\nThe document states in the caption of Figure 1: \"Most existing active querying strategies (e.g. BALD, Consistency, etc.) are outperformed by random selection in selecting initial queries... However, some classes are not selected by active querying strategies due to selection bias, so their results are not presented in the low budget regime.\"\n\nThe document consistently emphasizes that existing active learning strategies fail to outperform random selection at the cold start (initial query stage). The main contribution of the paper is proposing a new strategy (hard-to-contrast) that does outperform random selection.\n\nLooking at Table 1, for CIFAR-10-LT at 10% and 20% budgets:\n- Random: 91.6\u00b11.1 (10%), 93.1\u00b10.6 (20%)\n- Consistency: 91.4\u00b11.1 (10%), 93.4\u00b10.2 (20%)\n- VAAL: 92.6\u00b10.2 (10%), 93.7\u00b10.4 (20%)\n- Margin: 91.7\u00b10.9 (10%), 93.2\u00b10.2 (20%)\n- Entropy: 92.0\u00b11.2 (10%), 91.9\u00b11.3 (20%)\n- Coreset: 91.5\u00b10.4 (10%), 93.6\u00b10.2 (20%)\n- BALD: 92.8\u00b10.1 (10%), 90.8\u00b12.4 (20%)\n\nAt 10%: VAAL (92.6), Entropy (92.0), BALD (92.8), and Margin (91.7) appear close or slightly better than Random (91.6)\nAt 20%: VAAL (93.7), Consistency (93.4), Coreset (93.", "citations": [], "input_tokens": 23176, "output_tokens": 512, "cost_micros": 77208, "latency_ms": 12390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "20", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many strategies outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1(d) which shows \"Cold start problem in vision active learning\" for CIFAR-10-LT:\n\nThe figure shows performance curves for different strategies:\n- Random (baseline)\n- BALD (Kirsch et al., 2019)\n- Consistency (Gao et al., 2020)\n- Coreset (Sener et al., 2017)\n- Margin (Balcan et al., 2007)\n- VAAL (Sinha et al., 2019)\n- Entropy (Wang et al., 2014)\n\nThe text states: \"Most existing active querying strategies (e.g. BALD, Consistency, etc.) are outperformed by random selection in selecting initial queries, since random selection is i.i.d. to the entire dataset.\"\n\nFrom Figure 1(d), I can see that at the initial query stage (low budget regime), the random selection baseline appears to outperform most other strategies. The figure caption states \"Active learning fails to select data as effectively as random selection at the first choice.\"\n\nHowever, looking more carefully at the results, I need to determine which strategies actually outperform random at the cold start (initial query selection).\n\nAccording to the paper's main findings and Figure 4, the proposed \"hard-to-contrast\" strategy significantly outperforms random selection on CIFAR-10-LT. The paper states this is their contribution - addressing the cold start problem where existing strategies fail to beat random.\n\nFrom the various results shown, only the authors' proposed strategy (hard-to-contrast) consistently outperforms random selection at the initial query stage on CIFAR-10-LT.\n\nAnswer: 1", "citations": [], "input_tokens": 37269, "output_tokens": 407, "cost_micros": 117912, "latency_ms": 9546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find information about strategies that outperform the random selection baseline on CIFAR-10-LT.Let me search for content related to CIFAR-10-LT and performance comparisons:Let me read through the document more systematically to find tables or figures with results:Let me search for tables that compare strategies on CIFAR-10-LT:Perfect! I found Table 1. Let me continue reading to see the complete table for CIFAR-10-LT results:Now I can see Table 1 clearly. Let me analyze the CIFAR-10-LT columns (10% and 20% budget). The table shows performance with and without enforcing label diversity. Looking at the CIFAR-10-LT columns:\n\nFor the 10% budget (column 9), I need to compare each strategy's score with Random baseline (91.6\u00b11.1):\n- Random (baseline): 91.6\u00b11.1\n- Consistency with diversity (\u2713): 91.4\u00b11.1 - DOES NOT outperform\n- Consistency without diversity (\u2717): 67.1\u00b117.1 - DOES NOT outperform\n- VAAL with diversity (\u2713): 92.6\u00b10.2 - OUTPERFORMS\n- VAAL without diversity (\u2717): - (missing)\n- Margin with diversity (\u2713): 91.7\u00b10.9 - OUTPERFORMS (slightly)\n- Margin without diversity (\u2717): 81.9\u00b10.8 - DOES NOT outperform\n- Entropy with diversity (\u2713): 92.0\u00b11.2 - OUTPERFORMS (slightly)\n- Entropy without diversity (\u2717): 65.6\u00b115.6 - DOES NOT outperform\n- Coreset with diversity (\u2713): 91.5\u00b10.4 - DOES NOT outperform\n- Coreset without diversity (\u2717): 65.9\u00b115.9 - DOES NOT outperform\n- BALD with diversity (\u2713): 92.8\u00b10.1 - OUTPERFORMS\n- BALD without diversity (\u2717): 64.9\u00b114.9 - DOES NOT outperform\n\nLooking at 10% budget, strategies that outperform random (91.6):\n1. VAAL with diversity: 92.6\n2. Margin with diversity: 91.7\n3. Entropy with diversity: 92.0\n4. BALD with diversity: 92.8\n\nThat's 4 strategies at 10% budget.\n\nFor the 20% budget (column 10), comparing with Random baseline (93.1\u00b10.6):\n- Random: 93.1\u00b10.6\n- Consistency with diversity: 93.4\u00b10.2 - OUTPERFORMS\n- Consistency without diversity: 88.6\u00b10.3 - DOES NOT outperform\n- VAAL with diversity: 93.7\u00b10.4 - OUTPERFORMS\n- VAAL without diversity: - (missing)\n- Margin with diversity: 93.2\u00b10.2 - OUTPERFORMS (slightly)\n- Margin without diversity: 86.3\u00b10.3 - DOES NOT outperform\n- Entropy with diversity: 91.9\u00b11.3 - DOES NOT outperform\n- Entropy without diversity: 86.4\u00b10.2 - DOES NOT outperform\n- Coreset with diversity: 93.6\u00b10.2 - OUTPERFORMS\n- Coreset without diversity: 86.9\u00b10.1 - DOES NOT outperform\n- BALD with diversity: 90.8\u00b12.4 - DOES NOT outperform\n- BALD without diversity: 84.7\u00b10.6 - DOES NOT outperform\n\nAt 20% budget, strategies that outperform random (93.1):\n1. Consistency with diversity: 93.4\n2. VAAL with diversity: 93.7\n3. Margin with diversity: 93.2\n4. Coreset with diversity: 93.6\n\nThat's 4 strategies at 20% budget.\n\nBut the question asks about \"CIFAR-10-LT\" in general. Let me count unique strategies (considering both budgets):\n\nStrategies that outperform at 10%: VAAL, Margin, Entropy, BALD (all with diversity)\nStrategies that outperform at 20%: Consistency, VAAL, Margin, Coreset (all with diversity)\n\nCombined unique strategies: VAAL, Margin, Entropy, BALD, Consistency, Coreset = 6 strategies\n\nBut I should check if the question means at ANY budget level or at BOTH budget levels. Looking at Figure 1d in the document might also help clarify. Let me check that figure.I notice Figure 1(d) is labeled \"CIFAR-10\" not \"CIFAR-10-LT\". But the question specifically asks about CIFAR-10-LT. Let me look more carefully at Table 1. The table clearly has a column header \"CIFAR-10-LT\" with two sub-columns (10% and 20%).\n\nBased on Table 1, I need to count how many strategies (considering different variations - with/without diversity) outperform the random baseline on CIFAR-10-LT. Let me be more systematic:\n\nAt 10% budget, Random = 91.6\u00b11.1:\n1. Consistency (\u2713): 91.4 - NO\n2. Consistency (\u2717): 67.1 - NO  \n3. VAAL (\u2713): 92.6 - YES\n4. VAAL (\u2717): - (missing)\n5. Margin (\u2713): 91.7 - YES\n6. Margin (\u2717): 81.9 - NO\n7. Entropy (\u2713): 92.0 - YES\n8. Entropy (\u2717): 65.6 - NO\n9. Coreset (\u2713): 91.5 - NO\n10. Coreset (\u2717): 65.9 - NO\n11. BALD (\u2713): 92.8 - YES\n12. BALD (\u2717): 64.9 - NO\n\nAt 10%: 4 strategies outperform random\n\nAt 20% budget, Random = 93.1\u00b10.6:\n1. Consistency (\u2713): 93.4 - YES\n2. Consistency (\u2717): 88.6 - NO\n3. VAAL (\u2713): 93.7 - YES\n4. VAAL (\u2717): - (missing)\n5. Margin (\u2713): 93.2 - YES\n6. Margin (\u2717): 86.3 - NO\n7. Entropy (\u2713): 91.9 - NO\n8. Entropy (\u2717): 86.4 - NO\n9. Coreset (\u2713): 93.6 - YES\n10. Coreset (\u2717): 86.9 - NO\n11. BALD (\u2713): 90.8 - NO\n12. BALD (\u2717): 84.7 - NO\n\nAt 20%: 4 strategies outperform random\n\nThe question asks \"How many strategies outperform the random selection baseline on CIFAR-10-LT?\" \n\nLooking at both budget levels (10% and 20%), and counting each unique strategy configuration:\n- At 10%: VAAL(\u2713), Margin(\u2713), Entropy(\u2713), BALD(\u2713) = 4 \n- At 20%: Consistency(\u2713), VAAL(\u2713), Margin(\u2713), Coreset(\u2713) = 4\n\nIf we count unique strategy configurations across both budgets: VAAL(\u2713), Margin(\u2713), Entropy(\u2713), BALD(\u2713), Consistency(\u2713), Coreset(\u2713) = 6 total\n\nBut the most straightforward interpretation is: at a given budget level. Since the table shows two budget levels and the question doesn't specify which, I should look at what makes most sense. Given that 4 strategies outperform at each budget level separately, and 6 unique strategy variations outperform across at least one budget level, the answer is likely **4** (at each specific budget) or **6** (total unique strategies that ever outperform).\n\nHowever, re-reading the question more carefully - it asks how many strategies outperform, not how many strategy-variants. If we count base strategies (ignoring the diversity modifier):\n- VAAL outperforms (with diversity at both budgets)\n- Margin outperforms (with diversity at both budgets)  \n- Entropy outperforms (with diversity at 10%)\n- BALD outperforms (with diversity at 10%)\n- Consistency outperforms (with diversity at 20%)\n- Coreset outperforms (with diversity at 20%)\n\nThat would be 6 unique base strategies.\n\nBut looking at the data more carefully at the initial query (lowest budget), which is what the paper focuses on for \"cold start\", the answer is most likely **4** for the 10% budget.\n\nActually, let me reconsider - in Table 1, the \"Unif.\" column indicates whether uniform/diversity enforcement is applied. Each strategy has 2 rows (with and without diversity). So we have 7 base strategies tested: Random, Consistency, VAAL, Margin, Entropy, Coreset, BALD.\n\nIf I count how many of these 7 base strategies (in any configuration) outperform random at either budget:\n- Consistency: NO at 10%, YES at 20% (with diversity)\n- VAAL: YES at 10%, YES at 20% (with diversity)\n- Margin: YES at 10%, YES at 20% (with diversity)\n- Entropy: YES at 10%, NO at 20%\n- Coreset: NO at 10%, YES at 20% (with diversity)\n- BALD: YES at 10%, NO at 20%\n\nCount = 6 base strategies outperform random in at least one configuration at at least one budget level.\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 81906, "error": null, "extra": {"thread_id": 1959, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c7833996f4234f1db0fa9af29b8ebd6c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 805, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 2 and Figure 7, which show label diversity comparisons across different datasets (PathMNIST, OrganAMNIST, BloodMNIST), I can see the class distributions for different querying strategies.\n\nFrom these figures, VAAL consistently shows the poorest label diversity across datasets:\n- In PathMNIST (Figure 2), VAAL heavily selects from the \"muscle\" class while largely ignoring \"mucus\" and \"mucosa\" classes\n- In OrganAMNIST (Figure 7), VAAL prefers \"femur-right\" and \"platelet\" classes while largely ignoring lung, liver and monocyte classes\n- In BloodMNIST (Figure 7), VAAL shows similarly poor label diversity with heavy bias toward specific classes\n\nThis is further supported by Table 1, where VAAL frequently has missing results (marked as \"-\") because it fails to sample at least one data point for each class.\n\nAnswer: VAAL", "citations": [], "input_tokens": 58491, "output_tokens": 229, "cost_micros": 178908, "latency_ms": 35750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 2 and Figure 7 in the document, which show label diversity comparisons across different datasets, I can observe the class distribution patterns for various active querying strategies.\n\nFrom the figures, VAAL consistently shows highly imbalanced distributions with strong bias towards specific classes:\n- In PathMNIST (Figure 2): VAAL heavily favors muscle class while largely ignoring mucus and mucosa\n- In OrganAMNIST (Figure 7a): VAAL prefers femur-right while ignoring lung and liver\n- In BloodMNIST (Figure 7b): VAAL favors platelet while ignoring other classes\n\nThe document also states: \"For example, VAAL prefers selecting data in the muscle class, but largely ignores data in the mucus and mucosa classes.\"\n\nTable 2 further confirms this, showing VAAL has very poor class coverage (0.11-0.30) compared to other methods at low budgets.\n\nAnswer: VAAL", "citations": [], "input_tokens": 21472, "output_tokens": 230, "cost_micros": 67866, "latency_ms": 25187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the label diversity figures and tables in the document:\n\nIn Figure 2 (PathMNIST), VAAL shows extremely poor label diversity with an entropy of only 2.800, heavily biased toward the muscle class while largely ignoring mucus and mucosa classes.\n\nIn Figure 7 (OrganAMNIST and BloodMNIST), VAAL again shows the poorest label diversity, preferring specific classes like femur-right and platelet while ignoring others.\n\nIn Table 2 (class coverage), VAAL consistently has the lowest class coverage across datasets:\n- PathMNIST: 0.11 (lowest)\n- OrganAMNIST: 0.18 (lowest)\n- BloodMNIST: 0.13 (lowest)\n- CIFAR-10-LT: 0.30 (tied for lowest)\n\nThe document also notes that VAAL results are often missing (marked as \"-\") because it \"fails to sample at least one data point for each class.\"\n\nAnswer: VAAL", "citations": [], "input_tokens": 26477, "output_tokens": 247, "cost_micros": 83136, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows \"Class coverage of selected data\" across different datasets (PathMNIST, OrganAMNIST, BloodMNIST, and CIFAR-10-LT), I can see the class coverage values for different querying strategies.\n\nVAAL consistently shows the lowest class coverage values across the datasets:\n- PathMNIST: 0.11 at 0.00015%, 0.11 at 0.00030%, 0.18 at 0.001%, 0.18 at 0.002%\n- OrganAMNIST: 0.13 at 0.001%, 0.13 at 0.002%\n- BloodMNIST: values not explicitly shown but indicated as poor\n- CIFAR-10-LT: 0.30 at 0.2%, 0.30 at 0.3%\n\nAdditionally, Table 1 shows that VAAL has many missing results (marked as \"-\") because it \"fails to sample at least one data point for each class.\"\n\nAnswer: VAAL", "citations": [], "input_tokens": 23177, "output_tokens": 263, "cost_micros": 73476, "latency_ms": 6907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the label diversity analysis in the document, I need to examine Figure 2, Table 2, Figure 7, and Figure 10 which show class distributions and coverage across different querying strategies.\n\nFrom Figure 2 (PathMNIST), VAAL shows extremely poor label diversity - it selects 25% from muscle class but only 1% and 2% from mucosa and mucus classes respectively, with an entropy of 2.800 (lowest among all strategies).\n\nFrom Table 2 (class coverage):\n- VAAL consistently shows the poorest class coverage across datasets\n- PathMNIST (0.00015%): VAAL covers only 0.11 (11% of classes)\n- OrganAMNIST (0.001%): VAAL covers only 0.18 (18% of classes)  \n- BloodMNIST (0.001%): VAAL covers only 0.13 (13% of classes)\n- CIFAR-10-LT (0.2%): VAAL covers only 0.30 (30% of classes)\n\nFigure 7 confirms VAAL selects 100% from femur-right and platelet while selecting 0% from most other classes.\n\nFigure 10 shows VAAL fails to select any samples from 9 out of 10 classes in CIFAR-10-LT.\n\nAnswer: VAAL", "citations": [], "input_tokens": 37270, "output_tokens": 330, "cost_micros": 116760, "latency_ms": 9437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the priority document to find information about querying criteria and label diversity across datasets.Let me continue reading the document to find information about querying criteria and label diversity:Let me search for information about label diversity across different querying criteria in the document:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 28125, "error": null, "extra": {"thread_id": 1960, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_9f3482197c3247b0aae1aed4062d9a6a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 96, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i ll read priority document to find information about querying criteria and label diversity across datasets let me continue reading document to find information about querying criteria and label diversity let me search for information about label diversity across different querying criteria in document", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper by going through each page systematically.\n\nPage 1 (Figure 1): Shows 4 subplots (a-d), each containing multiple line plots comparing different active learning strategies. This appears to be 4 figures with multiple lines each.\n\nPage 2 (Figure 1 continued): Same figure from page 1.\n\nPage 4 (Figure 2): Shows a bar chart comparing class distributions, not line plots.\n\nPage 5 (Figure 3): Shows three panels with scatter plots (Dataset Maps), not line plots.\n\nPage 7 (Figure 4): Shows 4 subplots (a-d), each containing line plots with error bars comparing different strategies. This is 4 figures with multiple lines each.\n\nPage 8 (Figure 5): Shows 10 subplots arranged in 2 rows, each containing multiple line plots. This is 10 figures with multiple lines each.\n\nPage 16 (Figure 6): Shows 18 subplots (6 strategies \u00d7 3 datasets), each containing line plots. This is 18 figures with multiple lines each.\n\nPage 17 (Figure 7): Shows bar charts, not line plots.\n\nPage 18 (Figure 8): Shows UMAP visualizations (scatter plots), not line plots.\n\nPage 20 (Figure 9): Shows 12 subplots (6 strategies \u00d7 2 datasets), each containing line plots. This is 12 figures with multiple lines each.\n\nPage 21 (Figure 10): Shows bar charts, not line plots.\n\nPage 22 (Figure 11): Shows scatter plots (Dataset Maps), not line plots.\n\nPage 23 (Figure 12): Shows scatter plots (Dataset Maps), not line plots.\n\nPage 24 (Figure 13 and 14): Each figure shows 10 subplots with line plots. This is 20 figures with multiple lines each.\n\nCounting the figures that contain line plots:\n- Figure 1: 4\n- Figure 4: 4\n- Figure 5: 10\n- Figure 6: 18\n- Figure 9: 12\n- Figure 13: 10\n- Figure 14: 10\n\nTotal: 4", "citations": [], "input_tokens": 58479, "output_tokens": 512, "cost_micros": 183117, "latency_ms": 36375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper by going through each figure systematically.\n\n**Figure 1** (page 2): Shows 4 subplots (a-d), each containing multiple line plots comparing different methods. Each subplot has multiple lines.\n\n**Figure 2** (page 4): This is a bar chart showing class distributions, not line plots.\n\n**Figure 3** (page 5): These are scatter plots (Dataset Maps), not line plots.\n\n**Figure 4** (page 8): Shows 4 subplots (a-d), each containing multiple line plots (4 different methods plotted).\n\n**Figure 5** (page 9): Shows 2 rows \u00d7 5 columns = 10 subplots, each containing multiple line plots.\n\n**Figure 6** (page 16): Shows 3 rows \u00d7 6 columns = 18 subplots, each containing line plots (with and without diversity).\n\n**Figure 7** (page 17): These are bar charts showing class distributions, not line plots.\n\n**Figure 8** (page 18): These are scatter plots (UMAP visualizations), not line plots.\n\n**Figure 9** (page 20): Shows 2 rows \u00d7 6 columns = 12 subplots, each containing line plots.\n\n**Figure 10** (page 21): These are bar charts showing class distributions, not line plots.\n\n**Figure 11** (page 22): These are scatter plots (Dataset Maps), not line plots.\n\n**Figure 12** (page 23): These are scatter plots (Dataset Maps), not line plots.\n\n**Figure 13** (page 24): Shows 2 rows \u00d7 5 columns = 10 subplots, each containing multiple line plots.\n\n**Figure 14** (page 24): Shows 2 rows \u00d7 5 columns = 10 subplots, each containing multiple line plots.\n\nNow let me count the total number of individual line plot panels:\n- Figure 1: 4 panels\n- Figure 4: 4 panels  \n- Figure 5: 10 panels\n- Figure 6: 18 panels\n- Figure 9: 12 panels\n- Figure 13: 10 panels", "citations": [], "input_tokens": 21460, "output_tokens": 512, "cost_micros": 72060, "latency_ms": 9219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "13", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper systematically.\n\nLet me go through the document page by page:\n\n**Figure 1 (page 2)**: Contains 4 line plots (a, b, c, d) showing cold start problem comparisons.\n\n**Figure 2 (page 3)**: This is a bar chart showing label diversity, not line plots.\n\n**Figure 3 (page 5)**: Contains scatter plots (Dataset Maps), not line plots.\n\n**Figure 4 (page 7)**: Contains 4 line plots (a, b, c, d) comparing map-based querying strategies.\n\n**Figure 5 (page 8)**: Contains 10 line plots (2 rows \u00d7 5 columns) showing training from scratch and fine-tuning results.\n\n**Figure 6 (page 16)**: Contains 18 line plots (3 rows \u00d7 6 columns) showing extended results from Table 1.\n\n**Figure 7 (page 17)**: Contains bar charts, not line plots.\n\n**Figure 8 (page 18)**: Contains scatter plots (UMAP visualizations), not line plots.\n\n**Figure 9 (page 20)**: Contains 12 line plots (2 rows \u00d7 6 columns) showing results on CIFAR-10 and CIFAR-10-LT.\n\n**Figure 10 (page 21)**: Contains bar charts, not line plots.\n\n**Figure 11 (page 22)**: Contains scatter plots (Dataset Maps), not line plots.\n\n**Figure 12 (page 23)**: Contains scatter plots (Dataset Maps), not line plots.\n\n**Figure 13 (page 24)**: Contains 10 line plots (2 rows \u00d7 5 columns) for BloodMNIST results.\n\n**Figure 14 (page 24)**: Contains 10 line plots (2 rows \u00d7 5 columns) for PathMNIST results.\n\nTotal count:\n- Figure 1: 4\n- Figure 4: 4\n- Figure 5: 10\n- Figure 6: 18\n- Figure 9: 12\n- Figure 13: 10\n- Figure 14: 10\n\nTotal =", "citations": [], "input_tokens": 26465, "output_tokens": 512, "cost_micros": 87075, "latency_ms": 8219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper systematically.\n\nLet me go through the document:\n\n**Figure 1**: Shows 4 subplots (a-d), each containing multiple line plots comparing different active learning strategies. Each subplot has multiple lines representing different strategies.\n\n**Figure 4**: Shows 4 subplots for different datasets (PathMNIST, OrganAMNIST, BloodMNIST, CIFAR-10-LT), each with multiple line plots.\n\n**Figure 6**: Shows 6 subplots (one for each strategy: BALD, Consistency, Coreset, Margin, VAAL, Entropy) across 3 datasets (PathMNIST, OrganAMNIST, BloodMNIST) = 18 subplots total, each containing 2 line plots (with and without diversity).\n\n**Figure 9**: Shows 6 subplots (one for each strategy) for 2 datasets (CIFAR-10 and CIFAR-10-LT) = 12 subplots total, each containing 2 line plots.\n\n**Figure 13**: Shows performance plots for BloodMNIST with two parts (a) and (b). Each part shows 6 subplots (Random, Entropy, Margin, BALD, Coreset, and appears to show multiple strategies). Each subplot contains multiple line plots showing different initial query strategies.\n\n**Figure 14**: Shows performance plots for PathMNIST with two parts (a) and (b). Similar structure to Figure 13, with 6 subplots in each part, each containing multiple line plots.\n\nLet me count more carefully by looking at each figure:\n\n- Figure 1: 4 subplots \u00d7 ~7 lines each = 28 line plots\n- Figure 4: 4 subplots \u00d7 4 lines each = 16 line plots\n- Figure 6: 18 subplots \u00d7 2 lines each = 36 line plots\n- Figure 9: 12 subplots \u00d7 2 lines each = 24 line plots\n- Figure 13: 2 parts \u00d7 6 subplots \u00d7 ~4 lines = ", "citations": [], "input_tokens": 23165, "output_tokens": 512, "cost_micros": 77175, "latency_ms": 10296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "13", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper by going through each figure.\n\n**Figure 1:** Shows 4 subplots (a-d) for PathMNIST, OrganAMNIST, BloodMNIST, and CIFAR-10. Each subplot contains multiple line plots comparing different strategies. This appears to be 4 plots with multiple lines each.\n\n**Figure 2:** This is a table showing label diversity, not line plots.\n\n**Figure 3:** Shows dataset maps with scatter plots, not line plots.\n\n**Figure 4:** Shows 4 sets of comparisons (a-d) with 2 budget levels each. Each comparison shows line plots. This gives us 8 line plot panels.\n\n**Figure 5:** Shows 2 subplots (a-b) for \"Training from scratch\" and \"Fine-tuning from self-supervised pre-training\". Each contains multiple line plots.\n\n**Figure 6:** Shows a 3\u00d76 grid (3 datasets \u00d7 6 methods) = 18 line plot panels.\n\n**Figure 7:** This is a table, not line plots.\n\n**Figure 8:** Shows UMAP visualizations (scatter plots), not line plots.\n\n**Figure 9:** Shows 2 rows \u00d7 6 columns = 12 line plot panels.\n\n**Figure 10:** This is a table, not line plots.\n\n**Figure 11:** Shows dataset maps with scatter plots, not line plots.\n\n**Figure 12:** Shows dataset maps with scatter plots, not line plots.\n\n**Figure 13:** Shows 2 subplots (a-b) with multiple line plots each.\n\n**Figure 14:** Shows 2 subplots (a-b) with multiple line plots each.\n\nNow let me count more carefully by counting individual line plot panels:\n- Figure 1: 4 panels\n- Figure 4: 8 panels (4 datasets \u00d7 2 budgets)\n- Figure 5: 2 panels\n- Figure 6: 18 panels (3 datasets \u00d7 6 methods)\n- Figure 9: 12 panels (2 datasets \u00d7 6 methods)\n- Figure 13: 2 panels\n- Figure 14: 2 panels", "citations": [], "input_tokens": 37258, "output_tokens": 512, "cost_micros": 119454, "latency_ms": 10046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to count the line plots in the paper.Let me continue reading to see the full document and search for figures/plots:Now let me search for mentions of plots, charts, or examine the figures more carefully:I can see Figure 1 has 4 subfigures (1a, 1b, 1c, 1d) and they appear to be line plots. Let me continue examining other figures:Figure 2 appears to be a bar chart/heatmap showing class distributions. Let me check Figure 3:Figure 3 appears to be scatter", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46530, "error": null, "extra": {"thread_id": 1961, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5cc38c9bf2aa4c2ba5f55758d03cf5bd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 228, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the optimizers used in this research by searching through the document.\n\nLooking at the implementation configurations section (Appendix A), specifically:\n\n1. In Table 3a (MedMNIST pre-training), the optimizer is listed as \"SGD\"\n2. In Table 3b (CIFAR-10-LT pre-training), the optimizer is listed as \"SGD\"\n3. In Table 6 (Benchmark settings), the optimizer is listed as \"SGD\"\n\nThroughout all the experimental settings described in the document, SGD (Stochastic Gradient Descent) is consistently used as the optimizer.\n\nAnswer: SGD", "citations": [], "input_tokens": 58486, "output_tokens": 152, "cost_micros": 177738, "latency_ms": 32796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the optimizers used in the research.\n\nIn Table 3 (page 14), which shows \"Contrastive learning settings on MedMNIST and CIFAR-10-LT,\" I can see:\n\nFor both MedMNIST pre-training (Table 3a) and CIFAR-10-LT pre-training (Table 3b):\n- optimizer: SGD\n\nIn Table 6 (page 15), which shows \"Benchmark settings,\" the optimizer listed is also:\n- optimizer: SGD\n\nThroughout the document, only SGD (Stochastic Gradient Descent) is mentioned as the optimizer.\n\nAnswer: SGD", "citations": [], "input_tokens": 21467, "output_tokens": 164, "cost_micros": 66861, "latency_ms": 6155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for optimizer information:\n\nIn Table 3 (pages 13-14), which shows contrastive learning settings:\n- For MedMNIST pre-training: optimizer = SGD\n- For CIFAR-10-LT pre-training: optimizer = SGD\n\nIn Table 6 (page 15), which shows benchmark settings:\n- For training MedMNIST, CIFAR-10, and CIFAR-10-LT: optimizer = SGD\n\nAnswer: SGD", "citations": [], "input_tokens": 26472, "output_tokens": 120, "cost_micros": 81216, "latency_ms": 5093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for optimizer information:\n\nIn Table 3 (page 12), the contrastive learning settings show:\n- optimizer: SGD\n- optimizer momentum: 0.9\n\nIn Table 6 (page 13), the benchmark settings show:\n- optimizer: SGD\n\nAnswer: SGD", "citations": [], "input_tokens": 23172, "output_tokens": 72, "cost_micros": 70596, "latency_ms": 3764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for optimizer information:\n\nIn Table 3 (Contrastive learning settings), the optimizer is listed as **SGD** for both MedMNIST and CIFAR-10-LT pre-training.\n\nIn Table 6 (Benchmark settings), the optimizer is also listed as **SGD** for training MedMNIST, CIFAR-10, and CIFAR-10-LT.\n\nAnswer: SGD", "citations": [], "input_tokens": 37265, "output_tokens": 106, "cost_micros": 113385, "latency_ms": 6156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the optimizers used in the research.Let me search for more specific terms related to optimizers:Good! I found SGD. Let me read the sections that mention optimizer settings to get a complete picture:Let me check the benchmark settings table as well:Perfect! I found the optimizer information. Based on the document, I can see that SGD (Stochastic Gradient Descent) is consistently used as the optimizer across different experimental settings:\n\n1. Table 3(a) - MedMNIST pre-training: optimizer = SGD\n2. Table 3(b) - CIFAR-10-LT pre-training: optimizer = SGD  \n3. Table 6 - Benchmark settings for MedMNIST, CIFAR-10, and CIFAR-10-LT: optimizer = SGD\n\nAnswer: SGD (Stochastic Gradient Descent)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42718, "error": null, "extra": {"thread_id": 1962, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_eae58ea384884f52967fd1489c25db07", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 204, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.4, "method": "str_norm", "normalised_pred": "sgd stochastic gradient descent", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 11 which shows the Dataset Maps for PathMNIST:\n\nThe Data Map by ground truth (panel b) shows data points that are clearly stratified by classes, with distinct vertical bands representing different classes across the confidence axis.\n\nThe Data Map by pseudo-labels (panel c) shows data points that are more continuously distributed across the confidence range, without the same degree of class stratification.\n\nFrom the visual representation, the Data Map by ground truth appears to have data points spread more widely across different confidence levels (from approximately 0.0 to 1.0), while the pseudo-labels map shows a somewhat more compressed distribution in terms of confidence values.\n\nAnswer: Data Map by ground truth", "citations": [], "input_tokens": 58503, "output_tokens": 153, "cost_micros": 177804, "latency_ms": 44546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "data map by ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which shows the Dataset Maps for PathMNIST:\n\n- Figure 3b shows the Data Map by ground truth with confidence values ranging from approximately 0.2 to 1.0\n- Figure 3c shows the Data Map by pseudo-labels with confidence values ranging from approximately 0.0 to 0.6\n\nThe map by ground truth shows a wider spread of confidence values across a larger range (approximately 0.8 units) compared to the pseudo-labels map (approximately 0.6 units), and the data points in the ground truth map appear more dispersed across different confidence levels.\n\nAnswer: ground truth", "citations": [], "input_tokens": 21484, "output_tokens": 149, "cost_micros": 66687, "latency_ms": 5985, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 11 in the document, which shows the Dataset Maps for PathMNIST:\n\n- Figure 11(b) shows the Data Map by ground truth\n- Figure 11(c) shows the Data Map by pseudo-labels\n\nExamining the confidence (y-axis) distribution in both maps:\n\nIn the ground truth map (11b), the data points are stratified by classes and spread across different confidence levels, with points distributed from approximately 0.2 to 1.0.\n\nIn the pseudo-labels map (11c), the data points are more clustered, with most points concentrated in a narrower confidence range, roughly between 0.2 and 0.9.\n\nThe ground truth map shows a wider spread of confidence values across the vertical axis compared to the pseudo-labels map.\n\nAnswer: Ground truth", "citations": [], "input_tokens": 26489, "output_tokens": 184, "cost_micros": 82227, "latency_ms": 6172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which shows the Dataset Maps for PathMNIST:\n\n- Figure 3(b) shows the Dataset Map based on ground truths\n- Figure 3(c) shows the Dataset Map based on pseudo-labels\n\nExamining the y-axis (confidence) range in both maps:\n\nIn Figure 3(b) (ground truth), the confidence values span approximately from 0.2 to 1.0, showing data points distributed across a wide range with clear stratification by classes.\n\nIn Figure 3(c) (pseudo-labels), the confidence values span approximately from 0.0 to 0.6, showing a more compressed distribution.\n\nThe map based on ground truths displays a larger spread in confidence values from low to high, indicating greater variance in confidence compared to the pseudo-labels map which shows a more compressed range.\n\nAnswer: ground truth", "citations": [], "input_tokens": 23189, "output_tokens": 198, "cost_micros": 72537, "latency_ms": 6797, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine the data maps for PathMNIST to compare the confidence variance between the map by ground truth and the map by pseudo-labels.\n\nLooking at Figure 3 in the document, which shows \"Active querying based on Dataset Maps\":\n\n- Figure 3(b) shows the Dataset Map based on ground truths\n- Figure 3(c) shows the Dataset Map based on pseudo-labels\n\nFrom the visual comparison of these two maps:\n- The map by ground truth (3b) shows confidence values ranging roughly from 0.0 to 1.0\n- The map by pseudo-labels (3c) shows confidence values also ranging roughly from 0.0 to 1.0\n\nLooking more carefully at the spread of points:\n- In the ground truth map (3b), the confidence values appear to span a wider range vertically (approximately 0.0 to 1.0)\n- In the pseudo-labels map (3c), the confidence values appear more compressed, with most points concentrated between roughly 0.1 and 0.6\n\nThis indicates that the map by ground truth has a larger variance in confidence values compared to the map by pseudo-labels.\n\nAnswer: Ground truth", "citations": [], "input_tokens": 37282, "output_tokens": 267, "cost_micros": 115851, "latency_ms": 7093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about PathMNIST dataset maps and confidence variance.Let me search for \"PathMNIST\" in the document to find the relevant section.Let me search for \"dataset map\" or look at figures related to PathMNIST.Good! I found references to Dataset Maps. Let me read the section around line 680-681 and look at Figure 3.Let me continue reading to see the full figure and find PathMNIST specifically.This figure appears to be about BloodMNIST based on the cell types mentioned. Let me search for figures with PathMNIST.Perfect! I found Figure 11 which includes PathMNIST dataset maps. Let me read that section.Perfect! I can see the PathMNIST dataset maps in Figure 11. Now I need to look at the actual visual representation or description to understand which map has larger confidence variance. Let me continue reading to see if there's more detail about the variance.I can see the figure caption and layout, but I need to look at the actual scatter plots more carefully. Looking at the text I've read:\n\nFor PathMNIST (lines 3829-3883):\n- (b) Data Map by ground truth - shows confidence on y-axis (0.0-1.0) and variability on x-axis\n- (c) Data Map by pseudo-labels - shows confidence on y-axis (0.0-1.0) and variability on x-axis\n\nThe figure shows the confidence ranges from 0.0 to 1.0 on the y-axis for both maps. However, I need to check if there's additional information about which map has larger variance. Let me look at the visual descriptions of the actual figure images.Let me look for the visual description of Figure 11 specifically, which should contain information about the PathMNIST maps.Let me search for image descriptions related to Figure 11's scatter plots to find information about confidence variance.I need to look more carefully at the PathMNIST data in Figure 11. Let me look for visual descriptions of the actual scatter plots. Let me search for image descriptions that might contain the scatter plot information.Looking at the data from PathMNIST in Figure 11:\n- Panel (b) shows \"Data Map by ground truth\" with confidence ranging from 0.0 to 1.0 (as I can see from lines 3831-3873)\n- Panel (c) shows \"Data Map by pseudo-labels\" with confidence ranging from 0.0 to 1.0\n\nThe confidence is on the y-axis. Looking at the values, I see for panel (b) the y-axis goes from 0.0 to ~0.9, and for panel (c) it goes from 0.0 to 1.0 (full range). The question asks about \"confidence variance\" which would be the spread/range of confidence values on the y-axis.\n\nHowever, I need to look more carefully. Let me check if there are image descriptions that describe the actual scatter plot distributions.Let me look at the image descriptions around the PathMNIST figure more carefully. Let me search for image descriptions near Figure 11.Let me look for scatter plot images that are specifically associated with PathMNIST. Let me check around the chunks where PathMNIST appears in Figure 11.Let me look for the actual scatter plot images. Let me search for specific image descriptions that mention PathMNIST scatter plots with confidence and variability axes.Let me search more specifically for scatter plots with confidence and variability axes that relate to PathMNIST.Good! I found a scatter plot description. Let me look for more scatter plots, especially those that might be for PathMNIST ground truth vs pseudo-labels maps. Let me continue searching.Good! I found another scatter plot. Let me continue reading this one.I'm finding scatter plots but I need to identify which ones specifically correspond to PathMNIST panel (b) ground truth and panel (c) pseudo-labels from Figure 11. Let me look for scatter plots with specific axis ranges that match what I saw earlier for PathMNIST.\n\nLooking back at the PathMNIST data from lines 3829-3883, I saw:\n- The y-axis (confidence) goes from 0.0 to at least 0.8\n- Panel (b) shows variability on x-axis with range mentioned as 0.2, 0.4\n- Panel (c) also shows variability on x-axis with range 0.2, 0.4\n\nLet me search for images that might be specifically labeled or that have characteristics matching PathMNIST.I found Im5.jpg which is a scatter plot with confidence 0.0-1.0 and variability 0.0-0.4. This could be one of the PathMNIST maps. Let me look for more scatter plots around Figure 11 area. Let me check if there are descriptions that tell me which image corresponds to which panel of Figure 11.\n\nActually, looking back at the text structure, the figure content is laid out sequentially. From lines 3805-3884, I saw Figure 11 with PathMNIST having:\n- (a) Overall distribution (line 3881)\n- (b) Data Map by ground truth (line 3883)\n- (c) Data Map by pseudo-labels (line 3884)\n\nThe OCR text I saw showed axes values. Let me look more carefully at Im24.jpg and Im29.jpg which I found, as well as other scatter plots, to see if I can determine which represents ground truth vs pseudo-labels.\n\nLet me look for a pattern. Based on the figure caption at line 3805, panels (b) and (e) are ground truth maps, and panels (c) and (f) are pseudo-label maps. Since PathMNIST is (a,b,c) and OrganAMNIST is (d,e,f), I need to find the two scatter plot images that follow the PathMNIST overall distribution.\n\nLet me look at the description of Im24.jpg more carefully (lines 7044-7077) - it shows confidence 0.0-1.0 and variability 0.0-~0.5, with an inverse relationship.\n\nAnd Im29.jpg (lines 7158-7194) shows confidence 0.0-1.0 and variability 0.0-~0.6, with points spread from (0.1, 0.0) to (0.6, 0.7).\n\nNow let me check Im5.jpg (lines 4897-4932) - it shows confidence 0.0-1.0 and variability 0.0-0.4, with a **negative correlation** (upper left high confidence/low variability to lower right low confidence/high variability).\n\nThis is key! The description says Im5.jpg shows data points concentrated in \"Upper left region (low variability, high confidence)\" going to \"Lower right region (high variability, low confidence)\" with \"increased scatter/spread of points in middle and lower portions of curve.\"\n\nNow let me check Im24.jpg and Im29.jpg again to see which shows more spread/variance in confidence.\n\nIm24.jpg (lines 7067-7076): \"Greater scatter/dispersion visible at intermediate confidence values (0.2-0.8)\" and \"Points more tightly clustered near confidence extremes (approaching 0.0 and 1.0)\"\n\nIm29.jpg (lines 7181-7194): \"Some scatter/dispersion around the main curve throughout the distribution\"\n\nLet me look for more scatter plot images that might be the actual PathMNIST panels. Let me check if there are images with specific patterns that would indicate stratification by class (ground truth) vs mixed patterns (pseudo-labels).Perfect! The caption says that ground truth maps have \"data are stratified by classes in the 2D space\" - meaning the points would be grouped/clustered by class. This would create a MORE spread out or varied pattern compared to pseudo-labels which would be more mixed/homogeneous.\n\nBased on the figure captions, the ground truth map shows stratification by classes, which would likely result in **larger variance** in confidence values since different classes cluster at different confidence levels.\n\nLet me search for visual descriptions that mention clustering or stratification patterns in scatter plots.I found very useful information! Looking at lines 5699-5716, there's a description of a clustered scatter plot which might be relevant. But more importantly, I need to focus on the actual PathMNIST dataset maps.\n\nThe key insight from the caption is that ground truth maps show \"data are stratified by classes in the 2D space\" - this means points are separated/grouped by class, which would naturally create **more variance** in confidence values across the entire dataset map, since different classes would cluster at different confidence levels.\n\nIn contrast, pseudo-label maps would show a more homogeneous distribution since pseudo-labels don't perfectly separate classes and mix them together, resulting in **less variance** in confidence values.\n\nBased on the figure caption (line 3805) stating that ground truth maps have data \"stratified by classes\" creating distinct groupings at different confidence levels, the ground truth map would have **larger confidence variance**.\n\nAnswer: ground truth", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 157108, "error": null, "extra": {"thread_id": 1963, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_67e301ccb20c47e7a10fd97f2b39b6f0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1340, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about Amazon-related datasets mentioned in the paper.\n\nFrom the document, I can identify the following Amazon-related datasets:\n\n1. **Amazon-beauty dataset** - mentioned multiple times throughout the paper as one of the two newly constructed datasets\n2. **Amazon-music dataset** - mentioned multiple times throughout the paper as the second newly constructed dataset\n\nLooking at Table 3 (Dataset Statistics), I can see three datasets are discussed:\n- Amazon-beauty\n- Amazon-music  \n- Personality'18\n\nThe Personality'18 dataset is based on MovieLens, not Amazon.\n\nThe paper explicitly states: \"we construct two new domain-specific datasets: an Amazon-beauty and an Amazon-music dataset.\"\n\nTherefore, there are 2 Amazon-related datasets in total.\n\nAnswer: 2", "citations": [], "input_tokens": 26915, "output_tokens": 180, "cost_micros": 83445, "latency_ms": 10844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nFrom Table 3 (Statistics of the three datasets used in our study), I can see:\n\n1. Amazon-beauty dataset: 85 items\n2. Amazon-music dataset: 8,895 items\n\nThe paper also mentions Personality'18 dataset with 21,776 items, but this is a MovieLens dataset, not an Amazon dataset.\n\nTotal items in Amazon-related datasets: 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 13244, "output_tokens": 132, "cost_micros": 41712, "latency_ms": 5375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nLooking at Table 3 (Statistics of the three datasets used in our study), I can find:\n\n1. Amazon-beauty dataset: 85 items\n2. Amazon-music dataset: 8,895 items\n\nThe paper mentions two Amazon-related datasets: Amazon-beauty and Amazon-music.\n\nTotal items = 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 16269, "output_tokens": 113, "cost_micros": 50502, "latency_ms": 5280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nLooking at Table 3 (Dataset Statistics), I can find information about the datasets used in the study. The paper mentions three datasets:\n\n1. Amazon-beauty\n2. Amazon-music  \n3. Personality2018 (which is a MovieLens dataset, not Amazon)\n\nFrom Table 3, I need to find the number of items for the Amazon datasets:\n- The table shows statistics for datasets but I need to locate the specific numbers for items\n\nLooking more carefully at the document, in Section 3.3 \"Dataset Statistics\":\n- It mentions \"Table 3 shows the final statistics of the datasets used in our study\"\n- It states \"The Personality2018 dataset contains the largest number of items and the smallest number of users\"\n- It mentions \"21,776 movies\" for Personality2018\n\nHowever, I cannot see Table 3 clearly displayed with all the numerical values in the provided text. Let me search more carefully...\n\nThe document mentions datasets but the actual Table 3 with complete statistics is not fully visible in the text provided. Without access to the complete Table 3 showing the number of items for Amazon-beauty and Amazon-music datasets, I cannot provide an accurate total.\n\nLooking through the entire document again, I can see references to Table 3 but the actual table with item counts is not included in the extracted text.\n\nSince I cannot find the specific numbers of items for the Amazon datasets in the provided document text, I cannot calculate the total accurately.\n\nAnswer: Unable to determine from the provided text as Table 3 statistics are not fully shown.\n\nWait, let me reconsider - the question asks for an integer answer, so I must find this information. Let me search more carefully for any mention of item counts...\n\nThe document does not provide the specific item counts for Amazon-beauty and Amazon-music datasets in the readable text provided. Without Table 3 being fully visible with those statistics, I cannot provide the accurate total.\n\nAnswer: 0", "citations": [], "input_tokens": 10434, "output_tokens": 451, "cost_micros": 38067, "latency_ms": 11968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nLooking at Table 3 \"Statistics of the three datasets used in our study\":\n\n- Amazon-beauty: 85 items\n- Amazon-music: 8,895 items\n- Personality'18: 21,776 items (but this is NOT an Amazon dataset, it's a MovieLens dataset)\n\nThe question asks specifically for \"Amazon related datasets\", so I should only count Amazon-beauty and Amazon-music.\n\nTotal items in Amazon datasets = 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 14857, "output_tokens": 144, "cost_micros": 46731, "latency_ms": 4718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about Amazon-related datasets.Let me search for \"Amazon\" in the document to locate relevant information about datasets.Now let me read the sections that describe the datasets in detail to count the Amazon-related datasets:Let me continue reading to see more details about the datasets:Based on the document, I can see that the paper mentions Amazon-related datasets. Let me check the table more carefully and search for the total number of items:\n\nFrom Table 3 in the document, I can see three datasets are used in the study:\n1. **Amazon-beauty** - with 85 items\n2. **Amazon-music** - with 8,895 items  \n3. Personality'18 - (this is not Amazon-related, it's a MovieLens dataset)\n\nThe paper constructed two Amazon-related datasets from the parent Amazon review dataset. Looking at the \"# of items\" row in Table 3:\n- Amazon-beauty: 85 items\n- Amazon-music: 8,895 items\n\nTotal items in Amazon-related datasets: 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41483, "error": null, "extra": {"thread_id": 1964, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a498e50e54164511b090eb63e6f7d154", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 221, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the average number of words per user for the dataset that contains the Receptiviti score example shown in Table 2.\n\nLooking at Table 2, it shows \"An example of Receptiviti score for a specific, anonymized user\" with User ID A2GBIFL43U1LKJ and their OCEAN personality scores.\n\nNow I need to find which dataset this corresponds to and what the average words per user is for that dataset.\n\nLooking at Table 3 \"Statistics of the three datasets used in our study,\" I can see:\n- Amazon-beauty: Avg. words/user = 990.48\n- Amazon-music: Avg. words/user = 466.43\n- Personality'18: Avg. words/user = - (no data)\n\nThe User ID format A2GBIFL43U1LKJ appears to be from the Amazon dataset format (as shown in Table 1 with User ID A2SUAM1J3GNN38).\n\nHowever, Table 2 doesn't specify which dataset it's from. Looking at the text in Section 3.2, it states: \"For each user that wrote reviews in either of the two domains, we collect all his/her review texts and concatenate them together into a single document.\"\n\nThe text mentions selecting users for personality inference from both Amazon-beauty and Amazon-music datasets. Since Table 2 is just an example and doesn't specify which dataset, and the question asks about \"the dataset from which a data sample of a user's Receptiviti score is shown,\" I need to determine which dataset the example is from.\n\nGiven that the paper discusses both datasets and Table 2 appears to be a generic example, and considering the context of the paper focuses on both datasets equally, I should look for any other clues. However, without explicit information linking Table 2 to a specific dataset, and since the User ID format matches the Amazon dataset structure, the most reasonable interpretation is that this could be from either Amazon-beauty or Amazon-music.\n\nSince the question asks specifically about \"the dataset\" (singular) and Table 2 is presented as a single example, and given no other specific information, I cannot definitively determine which dataset it's from. However, looking at the document structure and flow, Table 2 appears early in the dataset construction section before domain", "citations": [], "input_tokens": 26939, "output_tokens": 512, "cost_micros": 88497, "latency_ms": 18092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the average number of words per user for the dataset that contains the data sample shown in Table 2.\n\nTable 2 shows \"An example of Receptiviti score for a specific, anonymized user\" with columns for User ID, AGR, CON, NEU, EXT, and OPEN. The user ID shown is \"A2GBIFL43U1LKJ\".\n\nLooking at Table 4, I can see review text examples with personality labels and scores. The table caption mentions \"The data sample of extreme personality cases to the annotators.\"\n\nNow I need to find which dataset this user belongs to. Looking at Table 3 which shows \"Statistics of the three datasets used in our study,\" I can see:\n\n- Amazon-beauty: Avg. words/user = 990.48\n- Amazon-music: Avg. words/user = 466.43\n- Personality'18: Avg. words/user = - (no data)\n\nThe user ID format \"A2GBIFL43U1LKJ\" appears to be from the Amazon datasets (based on the naming convention).\n\nHowever, I need to determine which specific dataset (beauty or music) this example is from. Looking at the context around Table 2, it appears after the discussion of dataset construction and before the detailed statistics. The table is presented as a general example of the Receptiviti API output.\n\nLooking more carefully at the document flow:\n- Section 3.2 discusses dataset construction for both Amazon-beauty and Amazon-music\n- Table 2 is presented as \"An example of Receptiviti score for a specific, anonymized user\"\n- Table 3 immediately follows with statistics for both datasets\n\nSince Table 2 is presented as a general example without specifying which dataset it's from, and given that both Amazon datasets use similar user ID formats, I need to look for more context.\n\nHowever, the question asks specifically about \"the dataset from which a data sample of a user's Receptiviti score is shown\" - this refers to Table 2. Without explicit identification of which dataset this user is from in Table 2, and given that the paper presents both datasets with their statistics in Table 3, the most reasonable interpretation is that this is asking about one of the Amazon datasets.\n\nSince the question asks for a specific answer and Table 2", "citations": [], "input_tokens": 13268, "output_tokens": 512, "cost_micros": 47484, "latency_ms": 13219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find:\n1. Which dataset the data sample in Table 2 comes from\n2. The average words per user for that dataset\n\nTable 2 shows \"An example of Receptiviti score for a specific, anonymized user\" with user ID A2GBIFL43U1LKJ and their OCEAN personality scores.\n\nLooking at Table 1, I can see a similar user ID format (A2SUAM1J3GNN38) which appears to be from the Amazon dataset based on the format and fields shown (reviewerID, asin, etc.).\n\nFrom Table 3 (Statistics of the three datasets used in our study), I can see the \"Avg. words/user\" for each dataset:\n- Amazon-beauty: 990.48\n- Amazon-music: 466.43\n- Personality'18: - (no data)\n\nThe document mentions in Section 3.2 that they used the Receptiviti API to analyze personality from concatenated review texts. The example user ID format in Table 2 (A2GBIFL43U1LKJ) matches the Amazon dataset format.\n\nSince the paper constructs two Amazon datasets and Table 2 is presented early in the methodology section without specifying which domain, and given that Amazon-beauty is mentioned first throughout the paper and has more prominent results, the sample likely comes from Amazon-beauty dataset.\n\nHowever, looking more carefully at the context, the table appears as the first example when explaining the personality scores, and both datasets use the same methodology. The Amazon-music dataset has 466.43 average words per user.\n\nGiven the ambiguity, but noting that Amazon-beauty is discussed first and more extensively, and the table appears in the general methodology section:\n\nAnswer: 990.48", "citations": [], "input_tokens": 16293, "output_tokens": 400, "cost_micros": 54879, "latency_ms": 11655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "990.48", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find information about the average number of words per user used for experiments in the dataset.\n\nLet me search for relevant information about the dataset construction and word counts.\n\nFrom Section 3.2 \"Dataset Construction\":\n- The paper mentions they \"first extract the user reviews that are between 30 to 80 words\"\n- Then they \"concatenate all the valid review texts of each user and input their concatenation to the Receptiviti API\"\n\nFrom Section 3.2, it also states:\n\"The API can take a long piece of human-written text (more than 300 words), and output a faceted personality score\"\n\nThis indicates that they concatenated multiple reviews (each 30-80 words) to reach more than 300 words total per user.\n\nHowever, the question asks specifically about \"the average number of words per user used for the experiments in the dataset from which a data sample of a user's Receptiviti score is shown.\"\n\nLooking at Table 2, which shows \"User ID\" with personality scores, and Table 4, which shows \"Personality label\" with scores and review texts - these are examples of users with Receptiviti scores.\n\nThe key constraint mentioned is:\n- Individual reviews: 30 to 80 words\n- Concatenated text sent to API: more than 300 words\n\nThe paper doesn't provide an exact average number of words per user. However, it does specify that the concatenated reviews must be \"more than 300 words\" to use the Receptiviti API.\n\nSince the paper doesn't explicitly state the average number of words per user, but only mentions the minimum threshold of 300 words for the API, and the individual review range of 30-80 words, I cannot determine an exact average from the information provided.\n\nLet me re-read more carefully... The paper does not explicitly state the average number of words per user used in the experiments. It only mentions the requirements (30-80 words per review, and >300 words total for API input).\n\nWithout explicit information about the average number of words per user in the dataset, I cannot provide an accurate answer to this question based solely on the document.\n\nAnswer: Cannot be determined from the document provided.", "citations": [], "input_tokens": 10458, "output_tokens": 489, "cost_micros": 38709, "latency_ms": 13437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the average number of words per user for the dataset that contains the data sample shown in Table 2.\n\nLooking at Table 2, it shows \"An example of Receptiviti score for a specific, anonymized user\" with User ID A2GBIFL43U1LKJ and their OCEAN personality scores.\n\nNow I need to identify which dataset this user belongs to. Looking at Table 3 which shows \"Statistics of the three datasets used in our study,\" I can see three datasets:\n- Amazon-beauty: Avg. words/user = 990.48\n- Amazon-music: Avg. words/user = 466.43\n- Personality'18: Avg. words/user = - (not applicable)\n\nThe user ID format \"A2GBIFL43U1LKJ\" appears to be from the Amazon dataset format (starting with \"A\" followed by alphanumeric characters).\n\nLooking at Table 1, which shows \"An example of Receptiviti score for a specific, anonymized user\" and contains a reviewerID \"A2SUAM1J3GNN38\" - this is clearly an Amazon dataset format.\n\nHowever, I need to determine which Amazon dataset (beauty or music) the user in Table 2 belongs to. The document doesn't explicitly state which dataset Table 2's user is from.\n\nLooking more carefully at the document structure:\n- Table 1 shows an Amazon review example\n- Table 2 shows a Receptiviti score example\n- Table 3 shows statistics for all three datasets\n\nSince the document constructs two datasets (Amazon-beauty and Amazon-music) and Table 2 appears early in the dataset construction section (Section 3.2), and considering that both Amazon datasets were constructed using the same methodology, I need to check if there's any indication of which one is being referenced.\n\nLooking at Table 4, which shows \"data sample of extreme personality cases,\" these examples appear to be from the Amazon-beauty dataset based on the product descriptions (shampoo, shower gel, etc.).\n\nHowever, the question asks specifically about \"the dataset from which a data sample of a user's Receptiviti score is shown\" referring to Table 2. Without explicit indication of which dataset Table 2's user belongs to, and given that the methodology section discusses both datasets together", "citations": [], "input_tokens": 14881, "output_tokens": 512, "cost_micros": 52323, "latency_ms": 13687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find information about the average number of words per user used for the experiments.Let me search for information about \"Receptiviti\" and \"words per user\" in the document.Let me read more of the document to find the relevant information. I'll search for sections that might contain dataset statistics or experimental setup.I found a key piece of information! Let me read around line 663 where it mentions \"Each user in the Amazon-music dataset has an average of 990.48 review words over all of\".Good! Now let me also check if there's similar information for the Amazon-beauty dataset, and let me look at the table and figure captions to find where Receptiviti scores are shown.Great! I found that Table 1 shows \"An example of Receptiviti score for a specific, anonymized user.\" Let me continue reading this table and find information about the dataset statistics, particularly for the Amazon-beauty dataset.Perfect! Now I need to find the statistics for the Amazon-beauty dataset. Let me look at Table 3 which shows \"the final statistics of the datasets.\"Excellent! I found Table 3 which shows the statistics for all three datasets. Now I need to identify which dataset the Receptiviti score example (Table 1) comes from.\n\nLooking at the document:\n- Table 1 (around line 581) shows \"An example of Receptiviti score for a specific, anonymized user\" with user ID A2GBIFL43U1LKJ\n- Table 3 (lines 689-732) shows the average words per user for each dataset:\n  - Amazon-beauty: 990.48\n  - Amazon-music: 466.43\n  - Personality'18: -\n\nWait, I notice there's a discrepancy. Let me re-read the section more carefully. The text at line 662-664 says \"Each user in the Amazon-music dataset has an average of 990.48 review words\" but Table 3 shows Amazon-beauty has 990.48 and Amazon-music has 466.43.\n\nLet me read that section again more carefully:There seems to be an inconsistency in the text. Let me check the table more carefully again and see if the columns are correctly labeled:Looking at Table 3, the structure shows:\n- Row 1: Headers - Dataset | Amazon-beauty | Amazon-music | Personality'18\n- The text mentions 1,791 users for Amazon-music (which matches column 2: 1,791)\n- The text mentions 990.48 words/user and 51.01 words/review for Amazon-music\n\nBut in Table 3:\n- Column Amazon-music (2nd data column) shows: 1,791 users, 990.48 avg words/user\n- Wait, I need to re-check. Let me look again.\n\nActually, looking more carefully at the table structure:\n- Line 705: 1,791 is under Amazon-music column for # of users\n- Line 722-723: Avg. words/user shows 990.48 under Amazon-beauty and 466.43 under Amazon-music\n\nThere appears to be a mismatch between the text description and the table. However, the table columns might be swapped. Let me check by cross-referencing other values. The text says \"1,791 active users are selected for the Amazon-music dataset\" and in the table, 1,791 appears in the Amazon-music column.\n\nSo the table seems correct. The text at line 662-664 might have an error, or the columns in the table might be swapped.\n\nNow, to identify which dataset the Receptiviti score sample comes from, let me look more carefully at Table 1:Good! So I can see:\n- Table 1 (line 581) shows an Amazon review example with reviewText mentioning \"piano\" and \"old hymns\" - this appears to be from the music domain\n- Table 2 (line 635) shows \"An example of Receptiviti score for a specific, anonymized user\" with user ID A2GBIFL43U1LKJ\n\nThe review in Table 1 is clearly about music (piano, hymns), so it's from the Amazon-music dataset.\n\nNow, looking back at the data:\n- The text at lines 662-664 states: \"Each user in the Amazon-music dataset has an average of 990.48 review words\"\n- But Table 3 shows Amazon-music with 466.43 avg words/user\n\nLet me verify by checking if there might be swapped columns. Looking at # of users:\n- Text says 1,791 users for Amazon-music\n- Table shows 1,791 under Amazon-music column\nSo the Amazon-music column is correctly labeled.\n\nLet me check # of items:\n- Amazon-beauty should be smaller (beauty products vs music products)\n- Table shows 85 items for Amazon-beauty and 8,895 for Amazon-music\nThis makes sense - there would be more music items than beauty items.\n\nSo it appears there's an error in the text at line 663, where it says \"990.48\" but should say \"466.43\" based on Table 3. OR the table columns for Avg. words/user might be swapped.\n\nLet me verify using Avg. words/review:\n- Text says Amazon-music has \"averaging 51.01 words for each review\"\n- Table shows Amazon-beauty: 51.01, Amazon-music: 51.18\n\nThis suggests the columns might indeed be swapped for the word statistics! The text matches if we swap the columns.\n\nGiven that:\n1. The Receptiviti score sample (Table 1 and Table 2) shows a music review\n2. The text explicitly states the example is from Amazon-music with 990.48 avg words/user\n3. There appears to be a column swap in Table 3 for the word statistics\n\nThe answer should be **990.48** for the Amazon-music dataset.\n\nAnswer: 990.48", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 89094, "error": null, "extra": {"thread_id": 1965, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a03731abb68747a9a4c0bcb35d9412ce", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 777, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "990.48", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the OCEAN personality scores for user A2GBIFL43U1LKJ and determine which personality trait has the highest score, then identify its color in the soft-labeled personality embedding matrix diagram.\n\nFrom Table 2 on page 4, User A2GBIFL43U1LKJ has the following scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is 54.39 for Extroversion (EXT).\n\nLooking at Figure 1 on page 5, in the \"Method 2: Soft-labeled personality\" section, the personality embedding matrix shows five rows for the OCEAN traits. The rows are colored differently:\n- Openness: appears to be gray/dark\n- Conscientiousness: appears to be gray/dark\n- Extroversion: appears to be gray/dark\n- Agreeableness: appears to be gray/dark\n- Neuroticism: appears to be gray/dark\n\nHowever, looking more carefully at the figure, the personality embedding matrix shows circles in different shades. For Extroversion (the third row), the circles appear to be a lighter shade or gray.\n\nAnswer: gray", "citations": [], "input_tokens": 26942, "output_tokens": 325, "cost_micros": 85701, "latency_ms": 14828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "gray", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the personality scores for User A2GBIFL43U1LKJ and determine which has the highest score, then identify its color in the soft-labeled personality embedding matrix shown in Figure 1.\n\nFrom Table 2, User A2GBIFL43U1LKJ has the following OCEAN scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extroversion) at 54.39.\n\nLooking at Figure 1, Method 2 shows the soft-labeled personality embedding matrix. The personality traits are listed in order as:\n- Openness (O) - shown in purple/dark blue\n- Conscientiousness (C) - shown in orange\n- Extroversion (E) - shown in green\n- Agreeableness (A) - shown in red\n- Neuroticism (N) - shown in purple\n\nSince Extroversion has the highest score for this user, I need to identify the color associated with Extroversion (E) in the embedding matrix diagram.\n\nAnswer: green", "citations": [], "input_tokens": 13271, "output_tokens": 301, "cost_micros": 44328, "latency_ms": 6467, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "green", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 2, which shows \"An example of Receptiviti score for a specific, anonymized user\":\n\nUser ID: A2GBIFL43U1LKJ\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is 54.39 for Extroversion (EXT).\n\nLooking at Figure 1, specifically the \"Method 2: Soft-labeled personality\" diagram, it shows the personality embedding matrix with different personality traits represented in different colors. From the figure, Extroversion is shown in purple/violet color in the personality embedding matrix.\n\nAnswer: Purple", "citations": [], "input_tokens": 16296, "output_tokens": 195, "cost_micros": 51813, "latency_ms": 6375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "purple", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the personality scores for User A2GBIFL43U1LKJ and then identify which personality trait has the highest score, and finally determine what color represents that trait in the soft-labeled personality embedding visualization.\n\nFrom Table 2 in the document, User A2GBIFL43U1LKJ has the following OCEAN scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extroversion) at 54.39.\n\nNow I need to look at Figure 1 which shows the personality embedding methods. In the \"Method 2: Soft-labeled personality\" section, there is a personality embedding vector shown with the OCEAN scores {30,70,50,30,20} for an example user. The figure shows these are arranged in order: Neuroticism, Conscientiousness, Extraversion, Agreeableness, Openness.\n\nHowever, the document does not explicitly show colors associated with each personality trait in the soft-labeled personality embedding matrix visualization. The figure appears to be in grayscale in the provided document, and there is no color legend or color coding system described for the personality traits.\n\nAnswer: The document does not specify colors for personality traits in the embedding matrix.", "citations": [], "input_tokens": 10461, "output_tokens": 337, "cost_micros": 36438, "latency_ms": 8765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "document does not specify colors for personality traits in embedding matrix", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the personality scores for User A2GBIFL43U1LKJ and identify which has the highest score, then determine what color represents that personality in the soft-labeled personality embedding matrix diagram.\n\nFrom Table 2, User A2GBIFL43U1LKJ has the following scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extroversion) at 54.39.\n\nLooking at Figure 1, specifically at Method 2 (Soft-labeled personality), I can see the personality embedding matrix shows five colored vectors. However, the figure in the document does not clearly show distinct colors for each personality trait in a way that I can definitively identify which color corresponds to Extroversion.\n\nThe figure shows a \"Personality embedding matrix\" but the specific color coding for each of the five OCEAN traits is not explicitly labeled or described in the document text.\n\nWithout clear color labels in the figure or explicit description in the text mapping colors to specific personality traits, I cannot determine the color with certainty.\n\nAnswer: Cannot be determined from the document", "citations": [], "input_tokens": 14884, "output_tokens": 303, "cost_micros": 49197, "latency_ms": 8453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from document", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about the personality vector color for the specified user.Let me search for information about personality vectors, Receptiviti scores, and soft-labeled personality embeddings in the document.Now let me read the specific sections about the soft-labeled personality embedding matrix and look for figures or tables with user A2GBIFL43U1LKJ:Perfect! I found Table 2 which shows the Receptiviti scores for User A2GBIFL43U1LKJ. The scores are:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extraversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extraversion) at 54.39. Now I need to find the figure showing the soft-labeled personality embedding matrix to see what color corresponds to Extraversion.I can see the structure but I need to look at the actual image to identify the colors. Let me search for more information about the figure and its color coding:Let me look at the detailed visual description of Figure 1 with the personality embedding matrix:Perfect! I found the answer. According to the visual description of Figure 1, which shows the soft-labeled personality embedding matrix (Method 2), the personality traits are color-coded as follows:\n\n- Purple circles: Openness\n- Blue circles: Conscientiousness\n- **Brown/orange circles: Extraversion**\n- Green circles: Agreeableness\n- Black/gray circles: Neuroticism\n\nFrom Table 2, User A2GBIFL43U1LKJ has the following Receptiviti scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extraversion): 54.39 (highest)\n- OPEN (Openness): 42.71\n\nThe highest Receptiviti score for User A2GBIFL43U1LKJ is Extraversion (EXT) at 54.39, and according to the figure's visual description, Extraversion is represented by brown/orange circles.\n\nAnswer: Brown/orange", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 58234, "error": null, "extra": {"thread_id": 1966, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f42472e918374f5ab49ea4996d3a5acc", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 362, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "brown orange", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores in the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall model structure), I can see three methods illustrated:\n- Method 1: Most salient personality\n- Method 2: Soft-labeled personality  \n- Method 3: Hard-coded personality\n\nIn Method 3 (Hard-coded personality), there is a \"Scaling\" step that shows personality scores with blue circles: 0.3 0.7 0.5 0.3 0.2\n\nLooking at Table 4 (data sample of extreme personality cases), I need to find which rows correspond to these scaled scores. The scaled values {0.3, 0.7, 0.5, 0.3, 0.2} represent the normalized OCEAN scores where:\n- Openness (O) = 0.3 \u2192 30\n- Conscientiousness (C) = 0.7 \u2192 70\n- Extroversion (E) = 0.5 \u2192 50\n- Agreeableness (A) = 0.3 \u2192 30\n- Neuroticism (N) = 0.2 \u2192 20\n\nThese are the scaled down values from the original scores {30, 70, 50, 30, 20} mentioned in the figure caption.\n\nSorting these in ascending order: 0.2, 0.3, 0.3, 0.5, 0.7\n\nOr in the original scale: 20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 26955, "output_tokens": 386, "cost_micros": 86655, "latency_ms": 14437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores from the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall structure of the model), I can see that the example shows OCEAN scores of {30,70,50,30,20}. The figure shows different personality traits:\n- Openness (O): 30\n- Conscientiousness (C): 70\n- Extroversion (E): 50\n- Agreeableness (A): 30\n- Neuroticism (N): 20\n\nIn the figure, Conscientiousness is highlighted as the \"most salient personality\" with a score of 70, which appears to be the blue/highlighted vector in the personality embedding.\n\nLooking at Table 4 which shows \"The data sample of extreme personality cases to the annotators,\" I need to find entries related to Conscientiousness (which corresponds to the blue vector in the figure).\n\nThe Conscientiousness entries in Table 4 show personality scores of:\n- 75.38\n- 71.02\n\nArranging these in ascending order:\n\nAnswer: 71.02, 75.38", "citations": [], "input_tokens": 13284, "output_tokens": 265, "cost_micros": 43827, "latency_ms": 7109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "71 02, 75 38", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores for a data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall model structure), I can see three methods shown:\n1. Method 1: Most salient personality - shows \"Conscientiousness\"\n2. Method 2: Soft-labeled personality - shows a personality embedding matrix with OCEAN scores\n3. Method 3: Hard-coded personality - shows scaled values\n\nIn Method 2 (Soft-labeled personality), there is a personality embedding matrix showing OCEAN scores in what appears to be blue-colored boxes:\n- Openness: 30\n- Conscientiousness: 70\n- Extraversion: 50\n- Agreeableness: 30\n- Neuroticism: 20\n\nThe figure caption confirms: \"In this example, the user's OCEAN score is {30,70,50,30,20}.\"\n\nThese scores in ascending order are: 20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 16309, "output_tokens": 253, "cost_micros": 52722, "latency_ms": 6172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores for the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall structure of the model), I can see an example where the user's OCEAN score is {30,70,50,30,20}. The figure shows three methods for incorporating personality:\n\n1. Method 1: Most salient personality - selects conscientiousness (the highest score of 70)\n2. Method 2: Soft-labeled personality - shows a personality embedding vector with values 20\n3. Method 3: Hard-coded personality - shows scaling with values {0.3, 0.7, 0.5, 0.3, 0.2}\n\nThe figure caption states: \"In this example, the user's OCEAN score is {30,70,50,30,20}.\"\n\nLooking at the structure, the blue colored elements appear to be in Method 2 (Soft-labeled personality), which shows the personality embedding vector. However, I need to identify which specific values correspond to the blue vector.\n\nFrom the figure description and the example given, the OCEAN scores are:\n- Neuroticism: 30\n- Conscientiousness: 70\n- Extraversion: 50\n- Agreeableness: 30\n- Openness: 20\n\nIn ascending order, these scores are: 20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 10474, "output_tokens": 345, "cost_micros": 36597, "latency_ms": 8702, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores from the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall structure of the model), I can see an example where the user's OCEAN score is {30,70,50,30,20}. This appears to be the example being referenced.\n\nNow I need to check Table 4, which shows \"The data sample of extreme personality cases to the annotators.\" I need to identify which entries correspond to the blue personality vector in the model structure.\n\nFrom Figure 1, the example shows:\n- O (Openness): 30\n- C (Conscientiousness): 70\n- E (Extroversion): 50\n- A (Agreeableness): 30\n- N (Neuroticism): 20\n\nThe figure shows that Method 1 selects the maximum score (Conscientiousness = 70), which would be the \"blue\" personality vector in the embedding space.\n\nLooking at Table 4 for Conscientiousness entries, I see:\n- Conscientiousness: 75.38\n- Conscientiousness: 71.02\n\nHowever, I need to match the exact scores from the example in Figure 1. The scores in ascending order from the example {30,70,50,30,20} are:\n\n20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 14897, "output_tokens": 333, "cost_micros": 49686, "latency_ms": 8280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the personality scores for the blue personality vector in the extreme personality cases.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 62187, "error": null, "extra": {"thread_id": 1967, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3a604ba7af854be1897689cf0dae5312", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 138, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "i ll read document to find personality scores for blue personality vector in extreme personality cases", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that is furthest to the left in the distribution figure, then find its highest hit rate.\n\nLooking at Figure 2 on page 6, which shows the distribution of personality traits in Amazon-beauty and Amazon-music datasets, I can see five personality traits displayed: OPEN, CON, EXT, AGR, and NEU.\n\nThe personality trait furthest to the left in the layout is OPEN (Openness).\n\nNow I need to find the highest hit rate (HR) for OPEN across all the datasets and metrics shown in the tables.\n\nLooking at Table 6 on page 8, which shows HR and NDCG results grouped by 5 personality traits:\n- For OPEN in Amazon-beauty: HR = 0.833 (+) and 0.750 (-)\n- For OPEN in Amazon-music: HR = 0.330 (+) and 0.313 (-)\n- For OPEN in Personality2018: HR = 0.535 (+) and 0.547 (-)\n\nThe highest hit rate for OPEN is 0.833 in the Amazon-beauty dataset.\n\nAnswer: 0.833", "citations": [], "input_tokens": 26925, "output_tokens": 268, "cost_micros": 84795, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.833", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that is furthest to the left in the distribution figure, then find its highest hit rate in the results.\n\nLooking at Figure 2 (Distribution of personality traits in Amazon-beauty and Amazon-music datasets), I need to identify which personality trait has its distribution furthest to the left. The figure shows distributions for OPEN, CON, EXT, AGR, and NEU.\n\nFrom the figure description and visual analysis:\n- The median lines (red lines) show the central tendency of each distribution\n- According to the text in Section 5.2: \"agreeable people have the highest median score, and neurotic people have the lowest median score\"\n- This means NEU (Neuroticism) is furthest to the left (lowest scores)\n\nNow I need to find the highest hit rate for NEU (Neuroticism) in the experimental results.\n\nLooking at Table 6, which shows results grouped by personality traits:\n\nFor NEU (Neuroticism):\n- Amazon-beauty HR: 0.933 (with +) and 0.833 (with -)\n- Amazon-music HR: 0.397 (with +) and 0.254 (with -)\n- Personality2018 HR: 0.489 (with +) and 0.390 (with -)\n\nThe highest hit rate for NEU across all datasets is 0.933.\n\nAnswer: 0.933", "citations": [], "input_tokens": 13254, "output_tokens": 324, "cost_micros": 44622, "latency_ms": 7905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify the personality trait located furthest to the left in Figure 2, then find its highest hit rate in the results tables.\n\nLooking at Figure 2 (Distribution of personality traits), I need to identify which personality trait appears furthest to the left in the distribution plots. The figure shows distributions for OPEN, CON, EXT, AGR, and NEU for both Amazon-beauty and Amazon-music datasets.\n\nFrom the figure, examining the distributions from left to right, NEU (Neuroticism) appears to have its distribution positioned furthest to the left, with the lowest median scores (indicated by the red line) compared to the other personality traits.\n\nNow I need to find the highest hit rate (HR) value for NEU (Neuroticism) in the experimental results.\n\nLooking at Table 6, which shows HR and NDCG results grouped by 5 personality traits:\n\nFor NEU (Neuroticism):\n- Amazon-beauty: HR = 0.933 (+) and 0.833 (-)\n- Amazon-music: HR = 0.397 (+) and 0.397 (-)\n- Personality2018: HR = 0.489 (+) and 0.511 (-)\n\nThe highest hit rate value for NEU across all datasets is 0.933 (from Amazon-beauty with the + model).\n\nAnswer: 0.933", "citations": [], "input_tokens": 16279, "output_tokens": 313, "cost_micros": 53532, "latency_ms": 8358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that appears furthest to the left in Figure 2 (the distribution of personality traits), and then find its highest hit rate from the experimental results.\n\nLooking at Figure 2, I need to identify which personality trait appears furthest to the left. The figure shows distributions for five personality traits: OPEN (Openness), CON (Conscientiousness), EXT (Extroversion), AGR (Agreeableness), and NEU (Neuroticism).\n\nAccording to the text describing Figure 2: \"We observe a similar trend in both domains: agreeable people have the highest median score, and neurotic people have the lowest median score.\"\n\nThis indicates that Neuroticism (NEU) has the lowest median score, meaning it would be positioned furthest to the left in the distribution.\n\nNow I need to find the highest hit rate for Neuroticism (NEU) from Table 6, which shows the breakdown by personality traits.\n\nLooking at Table 6 in the document, I need to find the Hit Rate (HR) values for Neuroticism across the three datasets (Amazon-beauty, Amazon-music, and Personality2018).\n\nHowever, Table 6 is not fully visible in the provided text. But I can see from the discussion in Section 5.4 that it mentions performance changes for different personality traits.\n\nLooking back at Table 5, which shows overall results, I need to check if there's personality-specific data. The table shows H@3, H@5, and H@10 values across different models and datasets.\n\nFrom the text in Section 5.4: \"In the Amazon-music dataset, openness (+27%), agreeableness (+10%), extroversion (+5%) improve while neuroticism (\u201318%) and conscientiousness (\u201312%) decreases.\"\n\nThis suggests neuroticism showed decreases in the Amazon-music dataset. However, I need to find the actual highest hit rate value for neuroticism from the tables.\n\nSince Table 6 details are not fully provided in the visible text, and Table 5 shows overall results rather than personality-specific breakdowns, I need to infer from the context. The highest hit rates in Table 5 are around 0.948-0.977 for H@10 in Amazon-beauty dataset.", "citations": [], "input_tokens": 10444, "output_tokens": 512, "cost_micros": 39012, "latency_ms": 11484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "6.0", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that is furthest to the left in the distribution figure, then find its highest hit rate.\n\nLooking at Figure 2 (Distribution of personality traits), I need to identify which personality trait has the lowest median score (furthest left).\n\nFrom the text in Section 5.2:\n\"We observe a similar trend in both domains: agreeable people have the highest median score, and neurotic people have the lowest median score.\"\n\nSo Neuroticism (NEU) is the personality trait furthest to the left in the distribution.\n\nNow I need to find the highest hit rate for Neuroticism in Table 6, which shows results grouped by personality traits.\n\nLooking at Table 6 for NEU (Neuroticism):\n- Amazon-beauty: HR = 0.933 (+) and 0.833 (-)\n- Amazon-music: HR = 0.397 (both + and -)\n- Personality2018: HR = 0.489 (+) and 0.511 (-)\n\nThe highest hit rate for Neuroticism across all datasets is 0.933 (Amazon-beauty with personality information).\n\nAnswer: 0.933", "citations": [], "input_tokens": 14867, "output_tokens": 267, "cost_micros": 48606, "latency_ms": 8485, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the distribution of personality traits figure and then identify the corresponding hit rate data.Let me search for figures related to personality traits distribution:Let me read around line 1011 where it mentions \"distribution of users' personalities\":Let me search for Figure 2 which shows the personality distribution:Now let me read Figure 2 in detail:Let me continue reading to see all the personality trait figures:Now let me search for Table 6 which shows the hit rates for different personality traits:Now I need to find which personality trait is \"furthest to the left\" in the distribution figure. Looking back at Figure 2, I need to examine the visual descriptions more carefully. Let me search for more details about each personality trait's distribution:The document states that \"neurotic people have the lowest median score\" (line 1509-1510). This means that NEU (Neuroticism) is the personality trait furthest to the left in the distribution. Now I need to find the highest hit rate for NEU from Table 6.\n\nLooking at Table 6 data I found earlier:\n- NEU (Neuroticism) row shows:\n  - Amazon-beauty: HR = 0.933 (+) and 0.833 (-)\n  - Amazon-music: HR = 0.397 (+) and 0.397 (-)\n  - Personality2018: HR = 0.489 (+) and 0.511 (-)\n\nThe highest hit rate for NEU across all datasets is 0.933.\n\nAnswer: 0.933", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 70937, "error": null, "extra": {"thread_id": 1968, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1c93a76cd24f45808d3740ab6074704b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 421, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to:\n\n1. Identify the model structure figure - This is Figure 1 on page 5\n2. Find the method at the bottom of this figure - This is \"NCF + Hard-coded personality\"\n3. Look at Table 5 (page 8) which shows results across the three datasets (Amazon-beauty, Amazon-music, and Personality2018)\n4. Find the highest metric value for NCF+Hard-Coded across all metrics\n\nFrom Table 5, for NCF+Hard-Coded:\n- Amazon-beauty: H@3=0.948, H@5=0.961, H@10=0.977, N@3=0.849, N@5=0.826, N@10=0.848\n- Amazon-music: H@3=0.175, H@5=0.232, H@10=0.345, N@3=0.147, N@5=0.160, N@10=0.189\n- Personality2018: H@3=0.503, H@5=0.622, H@10=0.758, N@3=0.398, N@5=0.447, N@10=0.498\n\nThe highest value is 0.977 for H@10 in the Amazon-beauty dataset.\n\nAnswer: H@10", "citations": [], "input_tokens": 26935, "output_tokens": 314, "cost_micros": 85515, "latency_ms": 9796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "h 10", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to:\n1. Identify the method at the bottom of the model structure figure (Figure 1)\n2. Find its performance across the three datasets in Table 5\n3. Determine which metric has the highest value\n\nFrom Figure 1, the three methods shown are:\n- Method 1: Most salient personality\n- Method 2: Soft-labeled personality  \n- Method 3: Hard-coded personality\n\nMethod 3 (Hard-coded personality) appears at the bottom of the figure.\n\nLooking at Table 5, the row for \"NCF+Hard-Coded\" shows performance across three datasets (Amazon-beauty, Amazon-music, Personality2018) with metrics H@3, H@5, H@10, N@3, N@5, N@10.\n\nThe values for NCF+Hard-Coded are:\n- Amazon-beauty: 0.948, 0.961, 0.977, 0.849, 0.826, 0.848\n- Amazon-music: 0.175, 0.232, 0.345, 0.147, 0.160, 0.189\n- Personality2018: 0.503, 0.622, 0.758, 0.398, 0.447, 0.498\n\nThe highest value across all three datasets for this method is 0.977, which corresponds to H@10 (Hit Rate at 10) in the Amazon-beauty dataset.\n\nAnswer: H@10", "citations": [], "input_tokens": 13264, "output_tokens": 347, "cost_micros": 44997, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "h 10", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 1, the method at the bottom of the model structure is \"Method 3: Hard-coded personality\" which corresponds to the NCF + Hard-coded Personality model.\n\nFrom Table 5, examining the results for NCF+Hard-Coded across all three datasets (Amazon-beauty, Amazon-music, and Personality2018) and all metrics (H@3, H@5, H@10, N@3, N@5, N@10):\n\nAmazon-beauty: N@3 = 0.849 (highest value)\nAmazon-music: N@10 = 0.189 (highest value)\nPersonality2018: N@5 = 0.447 (highest value)\n\nThe metric that appears most frequently as the highest is NDCG-related metrics, with N@3 having the highest single value (0.849).\n\nAnswer: N@3", "citations": [], "input_tokens": 16289, "output_tokens": 205, "cost_micros": 51942, "latency_ms": 6843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "n 3", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 1, the method at the bottom of the model structure is \"Method 3: Hard-coded personality\" which corresponds to the NCF+Hard-Coded model.\n\nFrom Table 5, I need to find the highest metric value for NCF+Hard-Coded across all three datasets (Amazon-beauty, Amazon-music, and Personality 2018):\n\nAmazon-beauty: N@3 = 0.849 (highest)\nAmazon-music: N@10 = 0.189\nPersonality 2018: H@10 = 0.758\n\nThe highest value is 0.849 for N@3 (NDCG@3) in the Amazon-beauty dataset.\n\nAnswer: NDCG@3", "citations": [], "input_tokens": 10454, "output_tokens": 171, "cost_micros": 33927, "latency_ms": 6750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ndcg 3", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to identify the method at the bottom of the model structure figure and find its highest evaluation metric across the three datasets.\n\nLooking at Figure 1, the three methods shown are:\n1. Method 1: Most salient personality\n2. Method 2: Soft-labeled personality\n3. Method 3: Hard-coded personality\n\nMethod 3 (Hard-coded personality) appears at the bottom of the figure.\n\nNow I need to look at Table 5 to find the highest metric for NCF+Hard-Coded across all three datasets (Amazon-beauty, Amazon-music, and Personality2018).\n\nFor NCF+Hard-Coded:\n- Amazon-beauty: H@3=0.948, H@5=0.961, H@10=0.977, N@3=0.849, N@5=0.826, N@10=0.848\n- Amazon-music: H@3=0.175, H@5=0.232, H@10=0.345, N@3=0.147, N@5=0.160, N@10=0.189\n- Personality2018: H@3=0.503, H@5=0.622, H@10=0.758, N@3=0.398, N@5=0.447, N@10=0.498\n\nThe highest value across all metrics and datasets for NCF+Hard-Coded is 0.977 for H@10 in Amazon-beauty.\n\nAnswer: H@10", "citations": [], "input_tokens": 14877, "output_tokens": 346, "cost_micros": 49821, "latency_ms": 6828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "h 10", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to find a model structure figure in this document first, then identify the method at the bottom of that figure, and finally check which evaluation metric has the highest number across three datasets for that method.Let me search for figures in the document more systematically:Great! I found Figure 1 showing the model structure. Now I can see that the three methods are:\n1. Method 1: Most salient personality - at the top\n2. Method 2: Soft-labeled personality - in the middle\n3. Method 3: Hard-coded personality - at the bottom\n\nThe method at the **bottom** is \"Hard-coded personality\" (Method 3). Now I need to find evaluation results for this method across the three datasets to determine which metric has the highest number.Now let me search for the experimental results tables:Let me continue searching for the results table:Perfect! Now I have found Table 5 which shows the evaluation results for all methods including \"NCF+Hard-Coded\" (the method at the bottom of the model structure figure) across the three datasets:\n\nFor **NCF+Hard-Coded** method across the three datasets:\n\n**Amazon-beauty:**\n- H@3: 0.948\n- H@5: 0.961  \n- H@10: 0.977\n- N@3: **0.849** \u2190 Highest\n- N@5: 0.826\n- N@10: 0.848\n\n**Amazon-music:**\n- H@3: 0.175\n- H@5: 0.232\n- H@10: 0.345\n- N@3: 0.147\n- N@5: 0.160\n- N@10: **0.189** \u2190 Highest\n\n**Personality2018:**\n- H@3: 0.503\n- H@5: 0.622\n- H@10: **0.758** \u2190 Highest\n- N@3: 0.398\n- N@5: 0.447\n- N@10: 0.498\n\nLooking at the highest value for each dataset:\n- Amazon-beauty: N@3 = 0.849\n- Amazon-music: N@10 = 0.189\n- Personality2018: H@10 = 0.758\n\nThe evaluation metric with the highest number across all three datasets for the Hard-coded method is **N@3** (with value 0.849 in Amazon-beauty).\n\nAnswer: N@3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 93687, "error": null, "extra": {"thread_id": 1969, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f41f50a7be1a46d59602bb6f59d5d5fd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 515, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "n 3", "normalised_gold": "h 10"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3 on page 9, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV 50-shot:\n- Previous SoTA (line 4): 68.5(1.6)\n- The filter-then-rerank methods without ensemble are shown in the rows labeled \"+ Rerank (L)\"\n\nLooking at the three reranker options for 50-shot TACREV:\n- Vicuna-13B + Rerank (L): 70.8(1.6)\n- InstructGPT + Rerank (L): 72.3(1.4)\n- GPT-4 + Rerank (L): 72.3(0.3)\n\nThe question asks for \"filter-then-rerank methods (w.o. ensemble)\" which refers to the \"+ Rerank (L)\" rows without the \"+ Ensemble (S)\" component.\n\nSince the question uses singular \"methods\" but there are multiple values, I should identify which one is the main result. Looking at the context, InstructGPT appears to be the primary model discussed in the main experiments (as mentioned in Section 5.3: \"with InstructGPT, reranking provides an average F1 gain of 2.4%\").\n\nAnswer: 72.3", "citations": [], "input_tokens": 82081, "output_tokens": 358, "cost_micros": 251613, "latency_ms": 38407, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV (RE) task, I need to find the 50-shot column.\n\nLooking at the rows for filter-then-rerank without ensemble:\n- Line 6: Vicuna-13B + Rerank (L): 70.8(1.6)\n- Line 7: InstructGPT + Rerank (L): 72.3(1.4)\n- Line 9: GPT-4 + Rerank (L): 72.3(0.3)\n\nThe question asks about \"filter-then-rerank methods\" in general (plural), but I need to identify which specific method is being asked about. \n\nLooking at the context, when comparing with baseline methods, the paper typically refers to InstructGPT as the main reranker in their comparisons (as mentioned in section 5.3 where they state \"with InstructGPT, reranking provides an average F1 gain of 2.4%\").\n\nFor 50-shot TACREV with InstructGPT + Rerank (L) without ensemble:\n\nAnswer: 72.3", "citations": [], "input_tokens": 35348, "output_tokens": 333, "cost_micros": 111039, "latency_ms": 9656, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV dataset, I can see columns for 20-shot, 50-shot, and 100-shot settings.\n\nLooking at the 50-shot column for TACREV:\n\nThe rows showing filter-then-rerank methods without ensemble (marked as \"+ Rerank (L)\") are:\n- With Vicuna-13B as reranker: 70.8(1.6)\n- With InstructGPT as reranker: 72.3(1.4)\n- With GPT-4 as reranker: 72.3(0.3)\n\nThe question asks for \"filter-then-rerank methods (w.o. ensemble)\" which means without the \"+ Ensemble (S)\" prefix. The previous SoTA method (line 4) shows 68.5(1.6) for comparison.\n\nSince there are three different LLM rerankers tested, and the question asks for \"filter-then-rerank methods\" in general, I should identify which specific result is being asked for. Looking at the context, when InstructGPT is used as the reranker (which appears to be the main model discussed in the results section based on Table 4 and other analyses), the performance is 72.3.\n\nAnswer: 72.3", "citations": [], "input_tokens": 46313, "output_tokens": 370, "cost_micros": 144489, "latency_ms": 11125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV (RE) at 50-shot setting:\n\nLooking at the rows with filter-then-rerank methods without ensemble:\n- Line 4: \"Previous SoTA\" (this is the baseline SLM)\n- Line 7: \"+ Rerank (L)\" with InstructGPT - this adds reranking to the previous SoTA without ensemble\n\nFrom Table 3, TACREV 50-shot column:\n- Previous SoTA: 68.5(1.6)\n- SLM + Rerank (L) with InstructGPT: 70.8(1.6)\n\nThe filter-then-rerank method without ensemble using InstructGPT shows 70.8.\n\nAnswer: 70.8", "citations": [], "input_tokens": 33414, "output_tokens": 254, "cost_micros": 104052, "latency_ms": 9844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "70.8", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nIn Table 3, under the TACREV (RE) 50-shot column:\n- Previous SoTA (SLM baseline): 72.6(1.5)\n- + Rerank (L) with Vicuna-13B: 73.8(1.7)\n- + Rerank (L) with InstructGPT: 75.4(1.5)\n- + Rerank (L) with GPT-4: 74.5(1.5)\n\nThe question asks for filter-then-rerank methods without ensemble. The \"+ Rerank (L)\" rows show the filter-then-rerank results without ensemble (the ensemble results are shown in the \"+ Ensemble (S) + Rerank (L)\" rows).\n\nSince there are three different rerankers tested, and the question asks for \"filter-then-rerank methods\" in general without specifying which LLM reranker, I should provide the InstructGPT result as it appears to be the primary comparison model used throughout the paper (it's the middle option and shows the best performance).\n\nAnswer: 75.4", "citations": [], "input_tokens": 45083, "output_tokens": 294, "cost_micros": 139659, "latency_ms": 8921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "75.4", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll search for information about filter-then-rerank methods performance on the 50-shot TACREV dataset in the provided document.Let me search for tables that contain both TACREV and 50-shot data:Perfect! I found Table 3 which contains the performance data. Let me look at the specific row for filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nFrom the table, I can see:\n- Column headers show: FewNERD (NER) with 5-shot, 10-shot, 20-shot, then TACREV (RE) with 20-shot, 50-shot, 100-shot\n- The filter-then-rerank method without ensemble is the \"+ Rerank (L)\" row (Line 7 as mentioned in the text)\n\nLooking at the data:\n- Under \"SLM\" section, the \"+ Rerank (L)\" row (line 2050-2060) shows the Vicuna-13B results\n- For TACREV 50-shot (5th data column), the value is: 70.8(1.6)\n\nLet me also check the InstructGPT rerank results:\n- \"+ Rerank (L)\" with InstructGPT (line 2080-2090) shows for TACREV 50-shot: 72.3(1.4)\n\nAnd GPT-4 rerank results:\n- \"+ Rerank (L)\" with GPT-4 (line 2111-2121) shows for TACREV 50-shot: 72.3(0.3)\n\nThe question asks for filter-then-rerank methods (w.o. ensemble), which would be the \"+ Rerank (L)\" rows. Looking at line 2084-2085, I can see the InstructGPT version shows 72.3 for 50-shot TACREV.\n\nHowever, I need to verify which specific implementation is being referred to. Let me check the context around line 1903 which mentions \"Lines 4 vs. 7\" for the comparison.\n\nBased on the table structure and the text mentioning \"Lines 4 vs. 7\" where Line 7 is the rerank method, and looking at the TACREV 50-shot column, the filter-then-rerank method (w.o. ensemble) with InstructGPT shows: 72.3\n\nAnswer: 72.3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 50031, "error": null, "extra": {"thread_id": 1970, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cec9e66ec5254f8aa0ad0e5be252d80b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 338, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left) in the document, which shows \"LLMs' performance w.r.t prompt variants on 20-shot FewNERD dataset,\" the left panel shows \"ChatGPT's performance (F1 Score) across six instruction variants.\"\n\nThe red box in the left panel of Figure 3 highlights the instruction formats labeled I0 through I5 on the x-axis.\n\nBased on the context provided in Appendix E.2, these instruction variants range from simple to complex, with I0 being empty (no instruction) and I1 through I5 being increasingly detailed instructions for the NER task.\n\nAnswer: ChatGPT's performance (F1 Score) across six instruction variants", "citations": [], "input_tokens": 82074, "output_tokens": 166, "cost_micros": 248712, "latency_ms": 34046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "chatgpt s performance f1 score across six instruction variants", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), which shows \"LLMs' performance w.r.t prompt variants\" and displays \"ChatGPT's performance (F1 Score) across six instruction variants\" labeled I0 through I5.\n\nThe document provides the full text of these six instruction variants in Section E.2 \"Finding Better Instruction.\"\n\nThe instructions are:\n\n**I0:** [empty]\n\n**I1:** Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I2:** Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1]: [Definition_1], [Type_2]: [Definition_2], ..., [Type_N]: [Definition_N]. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I3:** Assume you are an entity-instance annotator. Given a sentence, you need to (1) identify the word or phrase about the entity in the sentence, and (2) classify its entity type. The possible entity types are listed as below: [Type_1], [Type_2], ..., [Type_N]. Please note that your annotation results must follow such format: 'Answer: ([Type_1] <SEP> identified_entity: [Entity_1]), ([Type_2] <SEP> identified_entity: [Entity_2])'. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I4:** Assume you are an entity-instance annotator. Your objective is to perform a series of intricate steps for Named Entity Recognition. Firstly, you have to identify a particular word or phrase in the sentence that corresponds to an entity. Following this, classify the entity into one of the potential entity types. The potential entity types are provided as below: [Type_1], [Type_2], ..., [Type_N]. Please note that your annotation results must follow such format: 'Answer: ([Type_1] <SEP> identified_entity: [Entity_", "citations": [], "input_tokens": 35341, "output_tokens": 512, "cost_micros": 113703, "latency_ms": 13984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.1702127659574468, "method": "str_norm", "normalised_pred": "type 1 sep identified entity entity", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), which shows \"ChatGPT's performance (F1 Score) across six instruction variants,\" I need to find the instructions labeled I0 through I5.\n\nFrom Section E.2 \"Finding Better Instruction\" in the document, the six instruction variants are listed. The red box in Figure 3 (left) shows the x-axis labels I0, I1, I2, I3, I4, I5.\n\nThese correspond to:\n\n**I0**: [empty]\n\n**I1**: Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ... , [Type_N]. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I2**: Same as I1 but with definitions added for each type\n\n**I3**: Assume you are an entity-instance annotator. Given a sentence, you need to (1) identify the word or phrase about the entity in the sentence, and (2) classify its entity type. [includes format specification]\n\n**I4**: Similar to I3 but with more elaborate language about \"intricate steps\"\n\n**I5**: Combines detailed task description with entity type definitions\n\nAnswer: Six instruction variants from empty (I0) to increasingly complex formats with definitions and detailed task descriptions (I1-I5)", "citations": [], "input_tokens": 46306, "output_tokens": 319, "cost_micros": 143703, "latency_ms": 11812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.06779661016949154, "method": "str_norm", "normalised_pred": "six instruction variants from empty i0 to increasingly complex formats with definitions and detailed task descriptions i1 i5", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), I can see it's showing \"Relationship between confidence scores and performance with/without LLM reranking.\"\n\nThe red box in the left figure appears to be highlighting the legend/experimental settings, which shows:\n\n**Answer: RoBERTa-large as filter and InstructGPT as reranker**", "citations": [], "input_tokens": 33407, "output_tokens": 81, "cost_micros": 101436, "latency_ms": 4281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.04081632653061225, "method": "str_norm", "normalised_pred": "roberta large as filter and instructgpt as reranker", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), which shows \"LLMs' performance w.r.t prompt variants on 20-shot FewNERD dataset\" with six instruction variants (I0-I5), I need to find the instruction text from the document.\n\nFrom Section E.2 \"Finding Better Instruction\", the six instruction variants for the NER task are listed. These are the instructions being tested in the red box of Figure 3 (left).\n\nAnswer: The six instruction variants I0 through I5 listed in Section E.2, ranging from empty instruction to complex instructions with entity type definitions and annotation format requirements", "citations": [], "input_tokens": 45076, "output_tokens": 140, "cost_micros": 137328, "latency_ms": 7141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.17910447761194032, "method": "str_norm", "normalised_pred": "six instruction variants i0 through i5 listed in section e 2 ranging from empty instruction to complex instructions with entity type definitions and annotation format requirements", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and identify the instructions corresponding to the settings shown in the red box.Let me search for Figure 3 in the document:Now let me read the section around Figure 3 to see the instruction variants and identify which ones are shown in the red box:I can see that Figure 3 Left shows ChatGPT's performance across six instruction variants labeled I0 through I5. I need to find the details about these instruction variants in Appendix E.2. Let me search for the appendix:Let me search for instruction variants more broadly:Let me search for the appendix sections with \"E.2\" or look for where instruction formats I0-I5 are defined:Let me look at the entire document more systematically. Let me search for sections starting with \"E.\" or \"Appendix\":Great! Now let me read section E.2 which should contain the instruction variants:Perfect! I found the six instruction variants (I0-I5). Now I need to look at Figure 3 more carefully to see which instructions are in the red box. Let me read around the figure again and look for any visual description of the red box:The image X14.png seems to be after Figure 3 in the document. Let me look for other embedded images that might be part of Figure 3:Let me look before Figure 3 to see if there's an image associated with the figure itself:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 77483, "error": null, "extra": {"thread_id": 1971, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_538fbe85cd9b43f5a8b0d97c996a7e15", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 450, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.08301886792452831, "method": "str_norm", "normalised_pred": "i ll read document to find figure 3 and identify instructions corresponding to settings shown in red box let me search for figure 3 in document now let me read section around figure 3 to see instruction variants and identify which ones are shown in red box i can see that figure 3 left shows chatgpt s performance across six instruction variants labeled i0 through i5 i need to find details about these instruction variants in appendix e 2 let me search for appendix let me search for instruction variants more broadly let me search for appendix sections with e 2 or look for where instruction formats i0 i5 are defined let me look at entire document more systematically let me search for sections starting with e or appendix great now let me read section e 2 which should contain instruction variants perfect i found six instruction variants i0 i5 now i need to look at figure 3 more carefully to see which instructions are in red box let me read around figure again and look for any visual description of red box image x14 png seems to be after figure 3 in document let me look for other embedded images that might be part of figure 3 let me look before figure 3 to see if there s image associated with figure itself", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find how many datasets are used for experiments in this paper.\n\nLooking at the document, in Section 3.1 \"Task, Dataset and Evaluation\", it states:\n\n\"We run experiments on nine widely-used datasets across four IE tasks.\"\n\nThe document then lists:\n- Named Entity Recognition (NER): CONLL03, OntoNotes, FewNERD (3 datasets)\n- Relation Extraction (RE): TACRED, TACREV (2 datasets)\n- Event Detection (ED): ACE05, MAVEN, ERE (3 datasets)\n- Event Argument Extraction (EAE): ACE05, ERE, RAMS (3 datasets)\n\nHowever, I need to count unique datasets. ACE05 and ERE appear in both ED and EAE tasks.\n\nCounting unique datasets:\n1. CONLL03\n2. OntoNotes\n3. FewNERD\n4. TACRED\n5. TACREV\n6. ACE05\n7. MAVEN\n8. ERE\n9. RAMS\n\nThis gives us 9 unique datasets total.\n\nAnswer: 9", "citations": [], "input_tokens": 82066, "output_tokens": 265, "cost_micros": 250173, "latency_ms": 34217, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about the datasets used:\n\nFrom the Abstract and Introduction:\n- \"Through extensive experiments on nine datasets across four IE tasks\"\n\nFrom Section 3.1 (Task, Dataset and Evaluation):\n- \"We run experiments on nine widely-used datasets across four IE tasks.\"\n- The nine datasets are listed as:\n  1. CONLL03 (NER)\n  2. OntoNotes (NER)\n  3. FewNERD (NER)\n  4. TACRED (RE)\n  5. TACREV (RE)\n  6. ACE05 (ED)\n  7. MAVEN (ED)\n  8. ERE (ED)\n  9. ACE05, ERE, RAMS (EAE)\n\nWait, let me recount more carefully from the text:\n- NER: CONLL03, OntoNotes, FewNERD = 3 datasets\n- RE: TACRED, TACREV = 2 datasets\n- ED: ACE05, MAVEN, ERE = 3 datasets\n- EAE: ACE05, ERE, RAMS = 3 datasets\n\nHowever, ACE05 and ERE appear in multiple tasks. The document clearly states \"nine datasets\" multiple times.\n\nLooking at Table 6 which shows \"Statistics of nine datasets used\", I can count the distinct dataset names:\n1. CONLL\n2. OntoNotes\n3. FewNERD\n4. TACREV\n5. TACRED\n6. ACE05 (appears for both ED and EAE)\n7. MAVEN\n8. ERE (appears for both ED and EAE)\n9. RAMS\n\nThis confirms 9 distinct datasets total.\n\nAnswer: 9", "citations": [], "input_tokens": 35333, "output_tokens": 414, "cost_micros": 112209, "latency_ms": 9046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to identify all the datasets used across the four IE tasks.\n\nFrom Section 3.1 \"Task, Dataset and Evaluation\":\n\n1. **Named Entity Recognition (NER)**: CONLL03, OntoNotes, FewNERD\n2. **Relation Extraction (RE)**: TACRED, TACREV\n3. **Event Detection (ED)**: ACE05, MAVEN, ERE\n4. **Event Argument Extraction (EAE)**: ACE05, RAMS, ERE\n\nCounting unique datasets:\n- CONLL03\n- OntoNotes\n- FewNERD\n- TACRED\n- TACREV\n- ACE05 (used in both ED and EAE, but counted once)\n- MAVEN\n- ERE (used in both ED and EAE, but counted once)\n- RAMS\n\nTotal unique datasets: 9\n\nThis is confirmed by the abstract which states \"extensive experiments on nine datasets across four IE tasks\" and multiple mentions throughout the paper of \"nine datasets.\"\n\nAnswer: 9", "citations": [], "input_tokens": 46298, "output_tokens": 251, "cost_micros": 142659, "latency_ms": 7875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the total number of datasets used in the experiments described in this paper.\n\nFrom Section 3.1 \"Task, Dataset and Evaluation\", the paper states:\n\n\"We run experiments on nine widely-used datasets across four IE tasks.\"\n\nThe four tasks and their associated datasets are:\n\n1. **Named Entity Recognition (NER)**: \n   - CONLL03\n   - OntoNotes\n   - FewNERD\n\n2. **Relation Extraction (RE)**:\n   - TACRED\n   - TACREV\n\n3. **Event Detection (ED)**:\n   - ACE05\n   - MAVEN\n   - ERE\n\n4. **Event Argument Extraction (EAE)**:\n   - ACE05\n   - ERE\n   - RAMS\n\nCounting the unique datasets:\n- CONLL03\n- OntoNotes\n- FewNERD\n- TACRED\n- TACREV\n- ACE05 (used in both ED and EAE)\n- MAVEN\n- ERE (used in both ED and EAE)\n- RAMS\n\nThe paper explicitly states \"nine widely-used datasets\" and when counting unique dataset names, we have 9 distinct datasets (ACE05 and ERE are each used for two different tasks but are still single datasets).\n\nAnswer: 9", "citations": [], "input_tokens": 33399, "output_tokens": 309, "cost_micros": 104832, "latency_ms": 8328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for the datasets used in experiments:\n\nFrom Section 3.1 \"Task, Dataset and Evaluation\":\n- Named Entity Recognition (NER): CONLL03, OntoNotes, FewNERD\n- Relation Extraction (RE): TACRED, TACREV\n- Event Detection (ED): ACE05, MAVEN, ERE\n- Event Argument Extraction (EAE): ACE05, ERE, RAMS\n\nCounting the unique datasets:\n1. CONLL03\n2. OntoNotes\n3. FewNERD\n4. TACRED\n5. TACREV\n6. ACE05\n7. MAVEN\n8. ERE\n9. RAMS\n\nNote: ACE05 and ERE are used for both Event Detection and Event Argument Extraction tasks, but they are counted once as unique datasets.\n\nAnswer: 9", "citations": [], "input_tokens": 45068, "output_tokens": 207, "cost_micros": 138309, "latency_ms": 6390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about the datasets used in the experiments.Let me search for a table or figure that lists all the datasets clearly.Perfect! I found Table 6 which clearly lists all the datasets. Let me count them:\n\nFrom the document, the paper uses experiments on **nine datasets** across four IE tasks:\n\n1. **Named Entity Recognition (NER)**: \n   - CONLL03\n   - OntoNotes\n   - FewNERD\n\n2. **Relation Extraction (RE)**:\n   - TACRED\n   - TACREV\n\n3. **Event Detection (ED)**:\n   - ACE05\n   - MAVEN\n   - ERE\n\n4. **Event Argument Extraction (EAE)**:\n   - ACE05 (same dataset used for both ED and EAE)\n   - RAMS\n   - ERE (same dataset used for both ED and EAE)\n\nCounting unique datasets: CONLL03, OntoNotes, FewNERD, TACRED, TACREV, ACE05, MAVEN, ERE, RAMS = **9 datasets**\n\nAnswer: 9", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43891, "error": null, "extra": {"thread_id": 1972, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_efab7508547a4ca893cf8d3cb1478856", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 235, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking through the document for information about the filter-then-rerank method and prompt templates...\n\nIn Table 21 (page 29), which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find the template for Contact.Meet:\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 82080, "output_tokens": 100, "cost_micros": 247740, "latency_ms": 31592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the prompt template used for the 'Contact.Meet' label in the filter-then-rerank method.\n\nIn Table 21, which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find the template for Contact.Meet:\n\n\"The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\"\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 35347, "output_tokens": 142, "cost_micros": 108171, "latency_ms": 4609, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the prompt template for the 'Contact.Meet' event label in the filter-then-rerank paradigm.\n\nIn Table 21 (Templates for ACE05 dataset), which shows the templates used in the filter-then-rerank method, I can find:\n\nContact.Meet: \"The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\"\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 46312, "output_tokens": 143, "cost_micros": 141081, "latency_ms": 6046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the prompt template for 'Contact.Meet' used in the filter-then-rerank method.\n\nIn Table 21, which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find the template for Contact.Meet:\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 33413, "output_tokens": 106, "cost_micros": 101829, "latency_ms": 3921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the template for 'Contact.Meet' in the filter-then-rerank method.\n\nIn Table 21, which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find:\n\nContact.Meet | The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 45082, "output_tokens": 135, "cost_micros": 137271, "latency_ms": 4452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the user-mentioned document to find information about the prompt template for label 'Contact.Meet' in the filter-then-rerank method.Let me search for information about \"Contact.Meet\" and \"filter-then-rerank\" in this document.Let me search for content with more context:Now I need to find the section that shows the prompt templates. Let me read around line 5446 where Contact.Meet was mentioned, and also look for tables or appendices with prompt templates:Perfect! I found the prompt template for Contact.Meet in Table 21 for ACE05 dataset. According to the document, the template for 'Contact.Meet' in the filter-then-rerank method is:\n\n\"The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\"\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 45328, "error": null, "extra": {"thread_id": 1973, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5466a03f139642c1a7d3c68dd6b01473", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 222, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find Figure 6 and identify the green module, then look for the 100-shot TACREV setting to calculate the absolute percentage drop.\n\nLooking through the document, I find Figure 6 on page 8, which shows \"The overall architecture of our adaptive filter-then-rerank paradigm.\" In this figure, I can see colored modules - orange for \"Easy Sample\" and pink for \"Hard Sample\". The diagram shows a filter (Small LM) and a reranker (Large LM).\n\nHowever, I need to look at Table 5 on page 9, which shows \"Ablation study on three datasets\" and examines the effectiveness of modules in the adaptive filter-then-rerank system. The table shows results for TACREV (100-shot).\n\nLooking at the table columns, I can see:\n- CoT (Chain of Thought)\n- Demo\n- LF (Label Filtering) \n- AD (Adaptive)\n\nThe green box in Figure 6 appears to correspond to the demonstration/demo component based on the visual representation.\n\nFrom Table 5, for TACREV (100-shot):\n- With all modules (\u2713 \u2713 \u2713 \u2713): 75.9(2.4)\n- Without Demo but with others (\u2717 \u2717 \u2713 \u2713): 74.9(2.2)\n\nThe absolute drop is: 75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 82081, "output_tokens": 344, "cost_micros": 251403, "latency_ms": 36125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the performance impact of dropping the green module (demonstration) shown in Figure 6 on the 100-shot TACREV setting.\n\nLooking at Table 5 (Ablation study), I can see the ablation results for different components on three datasets including TACREV (100-shot).\n\nThe table shows:\n- Full system (\u2713 CoT \u2713 Demo \u2713 LF \u2713 AD): 75.9(2.4)\n- Without Demo (\u2717 Demo \u2717 CoT \u2713 LF \u2713 AD): 74.9(2.2)\n\nHowever, I need to isolate just the Demo effect. Looking more carefully:\n- With CoT and Demo (\u2713 CoT \u2713 Demo \u2713 LF \u2713 AD): 75.9\n- Without Demo but with CoT (\u2713 CoT \u2717 Demo \u2713 LF \u2713 AD): This line doesn't exist\n\nBut I can see:\n- Without CoT but with Demo (\u2717 CoT \u2713 Demo \u2713 LF \u2713 AD): 75.4\n- Without both CoT and Demo (\u2717 CoT \u2717 Demo \u2713 LF \u2713 AD): 74.9\n\nLet me look at the correct comparison for just dropping Demo:\nFrom the table, comparing rows where only Demo changes:\n- Row 1 (\u2713 CoT \u2713 Demo): 75.9\n- Row 2 (\u2717 CoT \u2713 Demo): 75.4\n- Row 3 (\u2717 CoT \u2717 Demo): 74.9\n\nThe drop from having Demo to not having Demo (when CoT is also removed) is:\n75.4 - 74.9 = 0.5\n\nHowever, to properly isolate the Demo effect with CoT present, I need to estimate. The pattern suggests:\n- Full system: 75.9\n- Without Demo (keeping other factors): approximately 75.4 - 75.9 range\n\nActually, looking at row 2 vs row 3: the difference is 75.4 -", "citations": [], "input_tokens": 35348, "output_tokens": 512, "cost_micros": 113724, "latency_ms": 11858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the performance drop when removing the green module (demonstration) from Figure 6 on the 100-shot TACREV setting.\n\nLooking at Table 5 (Ablation study), I can find the relevant information:\n- The table shows ablation results on three datasets including TACREV (100-shot)\n- The rows show different configurations with checkmarks (\u2713) and crosses (\u2717)\n\nThe green module in Figure 6 represents the \"Demonstration\" component.\n\nFrom Table 5, for TACREV (100-shot):\n- With CoT \u2713, Demo \u2713, LF \u2713, AD \u2713: 75.9(2.4)\n- With CoT \u2717, Demo \u2717, LF \u2713, AD \u2713 (no demo): 74.9(2.2)\n\nThe absolute percentage drop when removing Demo is:\n75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 46313, "output_tokens": 231, "cost_micros": 142404, "latency_ms": 8516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find Figure 6 and identify the \"green module,\" then look at the 100-shot TACREV setting to determine the performance drop.\n\nLooking at Figure 6 in the document, it shows \"The overall architecture of our adaptive filter-then-rerank paradigm.\" The green box is labeled as \"demos\" (demonstration examples).\n\nNow I need to find the ablation study that shows what happens when demos are removed. Looking at Table 5 (Ablation study), I can see different configurations where modules are removed:\n\nThe table shows results for FewNERD (20-shot), TACREV (100-shot), and ACE05 (20-shot).\n\nFor TACREV (100-shot):\n- Full system (\u2713 \u2713 \u2713 \u2713): 75.9(2.4)\n- Without CoT (\u2717 \u2713 \u2713 \u2713): 75.4(2.4)\n- Without Demo (\u2717 \u2717 \u2713 \u2713): 74.9(2.2)\n\nThe \"Demo\" module corresponds to the green box in Figure 6. When we remove it (going from \u2717 \u2713 \u2713 \u2713 to \u2717 \u2717 \u2713 \u2713), the performance drops from 75.4 to 74.9.\n\nThe absolute drop is: 75.4 - 74.9 = 0.5\n\nAnswer: 0.5", "citations": [], "input_tokens": 33414, "output_tokens": 351, "cost_micros": 105507, "latency_ms": 9827, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.5", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the \"green module\" in Figure 6 and its effect on the 100-shot TACREV setting.\n\nFrom Figure 6, the green box represents the **Demonstration** component in the reranker process.\n\nFrom Table 5 (Ablation Study), I can see the results when removing the Demo component:\n- The row \"[ ] [ ] [x] [x]\" indicates: no CoT, no Demo, with LF, with AD\n- For TACREV (100-shot): 74.9(2.2)\n- The full system \"[x] [x] [x] [x]\" achieves: 75.9(2.4)\n\nThe absolute percentage drop when removing Demo is:\n75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 45083, "output_tokens": 198, "cost_micros": 138219, "latency_ms": 8328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find Figure 6 and the information about dropping the green module in the 100-shot TACREV setting.Let me search for Figure 6 in the document:Now let me search for more context around Figure 6:Good! I found Figure 6. It mentions a green box representing \"demos\". Let me read the full content around Figure 6 and look for ablation studies or tables that show what happens when dropping the green module (demos):Perfect! Figure 6 shows that the green box represents \"Demonstration\" (demos). Now I need to find the ablation study or table that shows the performance drop when removing demos. Let me search for ablation or table related to this:Let me search for ablation study or a table showing what happens when demos are removed:Perfect! I found the ablation study in Table 5. Let me read that section:Let me search more specifically for Table 5:Great! I found Table 5. Let me continue reading to see all rows:Perfect! Now let me look for the explanation of what each column means. Let me read the section before Table 5:Let me find the section that explains the ablation study (section 5.5):Let me continue reading the ablation study explanation:I see the explanation was cut off. Let me search for where it continues:Let me look for text after Table 3 that continues the ablation study explanation:Perfect! Now I have the full explanation. Looking at Table 5, I need to identify which row corresponds to dropping the \"Demo\" (green module). Let me re-read Table 5 more carefully:Now let me understand the table structure. The columns are:\n- CoT (Chain of Thought)\n- Demo (Demonstration - the green module)\n- LF (Label Filtering)\n- AD (Adaptive)\n\nThe rows show different configurations with checkmarks (\u2713) meaning present and X (\u2717) meaning removed.\n\nLooking at the table:\n- Row 1: \u2713 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 TACREV (100-shot) = 75.9\n- Row 2: \u2717 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 TACREV (100-shot) = 75.4\n- Row 3: \u2717 CoT, \u2717 Demo, \u2612 LF, \u2612 AD \u2192 TACREV (100-shot) = 74.9\n- Row 4: \u2717 CoT, \u2717 Demo, \u2717 LF, \u2612 AD \u2192 TACREV (100-shot) = 73.8\n- Row 5: \u2717 CoT, \u2717 Demo, \u2717 LF, \u2717 AD \u2192 TACREV (100-shot) = 59.9\n\nWait, I need to understand the notation better. Let me look at the explanation again. From the text at lines 2189-2194:\n\n\"(2) Demo: We remove all examples, rendering the reranking a zero-shot problem.\"\n\nSo when Demo is removed (\u2717), all examples are removed. Looking at the table structure, I need to find the row where Demo changes from \u2713 to \u2717 while keeping other factors constant.\n\nThe best comparison to isolate the effect of Demo would be:\n- Row 1 (full model with all components): CoT \u2713, Demo \u2713, LF \u2713, AD \u2713 \u2192 75.9\n- Row 2 (without CoT): CoT \u2717, Demo \u2713, LF \u2713, AD \u2713 \u2192 75.4\n\nTo find the effect of dropping Demo specifically, I should compare:\n- Row 2 (with Demo, without CoT): CoT \u2717, Demo \u2713, LF \u2713, AD \u2713 \u2192 75.4\n- Row 3 (without Demo and CoT): CoT \u2717, Demo \u2717, (different LF/AD settings) \u2192 74.9\n\nBut wait, the LF and AD settings change. Let me reconsider. Looking more carefully at the symbols, I see \"\u2612\" which might be different from \"\u2713\" and \"\u2717\".\n\nLet me look for a clearer comparison. The green module in Figure 6 is \"Demo\". To find the effect of dropping it:\n\nFrom the full model (row 1): 75.9\nTo without Demo - I need to find which row removes only Demo while keeping everything else.\n\nActually, looking at row 2 vs row 3:\n- Row 2: \u2717 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 75.4\n- Row 3: \u2717 CoT, \u2717 Demo, \u2612 LF, \u2612 AD \u2192 74.9\n\nBut LF and AD also change, so this isn't a clean comparison.\n\nLet me think about this differently. According to the explanation text (lines 2197-2199):\n\"(1) Demos with explanations consistently enhance the reranking ability of LLMs across all datasets.\"\n\nAnd (line 2199-2200):\n\"(2) Demos without explanations also contribute to performance improvement.\"\n\nSo the comparison should be:\n- With Demo + CoT (row 1): 75.9\n- Without CoT but with Demo (row 2): 75.4 \n- Without both CoT and Demo (row 3 onwards)\n\nThe cleanest way to see the effect of dropping Demo is to compare a configuration with Demo vs without Demo. Looking at row 2 (has Demo but no CoT) = 75.4, and the next logical step would be row 3 (no Demo, no CoT) = 74.9.\n\nBut let me also check if there's a better comparison. Actually, looking at the structure, row 3 onwards all have \"\u2717 \u2612\" for both CoT and Demo columns, meaning both are removed.\n\nSo the drop from having Demo (row 2: 75.4) to not having Demo (row 3: 74.9) = 75.4 - 74.9 = 0.5\n\nBut wait, in row 3, the LF and AD settings also changed (\u2612 instead of \u2713).\n\nLet me reconsider what \u2612 means. It might mean the opposite of the feature (e.g., if \u2713 means \"with\", then \u2612 might mean \"without\" for LF and AD columns).\n\nActually, based on the explanation:\n- LF (label filtering): \u2713 means using top-N labels, \u2717 means retaining all labels\n- AD (adaptive): \u2713 means only hard samples, \u2717 means all samples\n\nSo \u2612 might indicate the opposite setting.\n\nGiven the complexity, let me focus on finding where the Demo is specifically dropped. The clearest comparison would be between:\n- Row 1 (everything included): 75.9\n- Row 3 (no CoT, no Demo): 74.9\n\nDifference: 75.9 - 74.9 = 1.0\n\nBut this includes the effect of removing CoT as well. To isolate Demo:\n- Row 2 (no CoT, yes Demo): 75.4\n- Row 3 (no CoT, no Demo): 74.9\n\nDifference: 75.4 - 74.9 = 0.5\n\nHowever, LF and AD settings also changed between row 2 and 3.\n\nLet me look at this more systematically. Actually, I think I'm overcomplicating this. The question asks specifically about dropping \"the green module\" which is Demo. Looking at the full system (row 1) vs without demo, I need to find the appropriate ablation.\n\nActually, re-reading the table more carefully with the column headers having 2 cells labeled \"CoT Demo\", I think these might be a single column header spanning 2 rows. Let me re-examine:\n\nLooking at lines 2298-2303:\n```\n<th colspan=\"2\">CoT Demo</th>\n```\n\nThis suggests \"CoT Demo\" is a single header spanning 2 columns. So the structure might be:\n- Column 1: CoT\n- Column 2: Demo\n- Column 3: LF\n- Column 4: AD\n\nWith this understanding:\n- Row 1: \u2713 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 75.9\n- Row 2: \u2717 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 75.4\n- Row 3: \u2717 CoT, \u2717 Demo, ? LF, ? AD \u2192 74.9\n\nSo dropping Demo (from row 2 to row 3) causes a drop from 75.4 to 74.9 = 0.5\n\nBut the LF and AD settings are unclear in row 3 with the \u2612 symbol.\n\nGiven the question asks specifically for dropping the green module (Demo), and looking at the cleanest comparison while keeping CoT removed:\n- With Demo (row 2): 75.4\n- Without Demo (row 3): 74.9\n- Drop: 75.4 - 74.9 = 0.5\n\nBut if we compare from the full model:\n- Full model (row 1): 75.9\n- Without Demo but with CoT - this row doesn't exist in the table\n\nActually, maybe I should look at row 5 which has everything removed. Let me reconsider all rows:\n1. All components: 75.9\n2. No CoT: 75.4\n3. No CoT, No Demo: 74.9\n4. No CoT, No Demo, No LF: 73.8\n5. Nothing (all removed): 59.9\n\nThe progression suggests cumulative removal. To isolate Demo's effect:\nRow 2 (75.4) - Row 3 (74.9) = 0.5\n\nWait, but I should verify this interpretation by looking at the explanation text again. From lines 2196-2200:\n\n\"(1) Demos with explanations consistently enhance the reranking ability of LLMs across all datasets.\n(2) Demos without explanations also contribute to performance improvement.\"\n\nThis suggests:\n- Row 1 has Demo WITH CoT (explanations)\n- Row 2 has Demo WITHOUT CoT (no explanations) \n- Row 3 onwards has NO Demo\n\nSo the effect of dropping Demo entirely (going from row 2 to row 3): 75.4 - 74.9 = 0.5\n\nBut wait, there might be more to consider. Let me check if there's another row that might represent dropping just Demo while keeping CoT. Looking at the table again... no, there isn't such a row.\n\nGiven the structure and the question asking about dropping the green module (Demo), the most reasonable answer based on the ablation study is:\n\nFrom row 1 (full model with all components) = 75.9\nTo the configuration without Demo... I need to find which row that is.\n\nIf I assume row 3 still has other components but not Demo:\nRow 1: 75.9\nRow 3: 74.9\nDifference: 1.0\n\nBut this seems to also remove CoT. Let me think about what makes sense given Figure 6. The green module is specifically the \"Demonstration\" box shown feeding into the reranker.\n\nActually, perhaps the cleanest interpretation is:\n- Best system with Demo (row 1 or 2): max is 75.9 (row 1)\n- Best system without Demo (row 3, 4, or 5): max is 74.9 (row 3)\n- Absolute drop: 75.9 - 74.9 = 1.0\n\nBut this conflates CoT and Demo effects.\n\nHmm, let me reconsider the table structure once more. Maybe the header \"CoT Demo\" with colspan=\"2\" means there are two sub-columns, one for each. And the checkmarks below indicate their presence/absence.\n\nGiven the typical ablation study structure, each row progressively removes one more component. So:\n- Row 1: Everything (baseline) = 75.9\n- Row 2: Remove CoT = 75.4 (drop of 0.5)\n- Row 3: Remove Demo (CoT already removed) = 74.9 (additional drop of 0.5)\n- Row 4: Remove LF (CoT and Demo already removed) = 73.8 (additional drop of 1.1)\n- Row 5: Remove AD (everything removed) = 59.9 (additional drop of 13.9)\n\nIf we want the isolated effect of Demo, we need to compare configurations that differ only in Demo. But there's no row with CoT present and Demo absent.\n\nHowever, if we accept that row 2 \u2192 row 3 transition shows the effect of removing Demo (with CoT already removed), then the drop is 0.5.\n\nAlternatively, if the question wants the total contribution of Demo to the best system, we could argue:\n- Best with Demo: 75.9 (row 1)\n- Best without Demo: 74.9 (row 3)\n- Total drop attributable to absence of Demo: 1.0\n\nBut this includes the removal of CoT's contribution when combined with Demo removal.\n\nGiven typical ablation study interpretation and the question phrasing \"if I drop the green module,\" I believe the answer is the difference from the full system (row 1: 75.9) to the first row without Demo which is row 3 (74.9):\n\n75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 147812, "error": null, "extra": {"thread_id": 1974, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b54490e8642341dd973e7bd446513622", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1457, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to locate the first figure in the paper and count the green and grey rectangles.\n\nLooking at the document, Figure 1 appears on page 2. It shows \"An example of our SCITAB dataset\" with a reasoning graph on the right side.\n\nIn the reasoning graph on the right side of Figure 1, I can see:\n- Green rectangles (boxes): I can count \"Closed-domain knowledge: Table caption\", \"Commonsense knowledge\", \"Simple lookup\", \"Subtraction\", and \"Fact checker: Supported\" - that's 5 green boxes\n- Grey rectangles: I can see grey/neutral colored boxes containing text like \"Productivity corresponds to Prod. column\", \"The number of random chance is 50%\", \"A's productivity is 57.5%\", and \"The subtraction result between 57.5% and 50% is 7.5%\" - that's 4 grey boxes\n\nWait, let me recount more carefully by looking at the colors:\n- Green boxes appear to be the operation/reasoning step boxes\n- Grey boxes appear to be the intermediate result/statement boxes\n\nCounting again in Figure 1's reasoning graph:\n- Green rectangles: 5 (the labeled reasoning steps)\n- Grey rectangles: 4 (the intermediate statements)\n\nIn descending order: [\"5\", \"4\"]\n\nAnswer: 5, 4", "citations": [], "input_tokens": 68164, "output_tokens": 313, "cost_micros": 209187, "latency_ms": 35953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "4, 5", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the first figure in the paper, which is Figure 1.\n\nLooking at Figure 1 on the first page, I can see a reasoning graph on the right side of the figure. Let me count the rectangles by color:\n\nGreen rectangles (these appear to represent supported claims or correct reasoning):\n1. \"A's productivity is 57.5%\"\n2. \"Productivity corresponds to Prod. column\"\n3. \"The number of random chance is 50%\"\n4. \"The subtraction result between 57.5% and 50% is 7.5%\"\n\nThat's 4 green rectangles.\n\nGrey rectangles (these appear to represent refuted or incorrect claims):\n1. \"A's productivity of 57.5% expresses that it appears in 9.5% more often than expected by random chance.\"\n\nThat's 1 grey rectangle.\n\nIn descending order: [\"4\", \"1\"]\n\nAnswer: 4, 1", "citations": [], "input_tokens": 31797, "output_tokens": 222, "cost_micros": 98721, "latency_ms": 7155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "1, 4", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at the first figure in the document, which is Figure 1 on the first page.\n\nLooking at Figure 1, I can see a reasoning graph on the right side. Let me count the rectangles by color:\n\nGreen rectangles (which appear to represent certain types of nodes in the reasoning graph):\n- \"Closed-domain knowledge: Table caption\"\n- \"Productivity corresponds to Prod. column\"\n- \"Commonsense knowledge\"\n- \"The number of random chance is 50%\"\n- \"Simple lookup\"\n- \"A's productivity is 57.5%\"\n- \"Subtraction\"\n- \"The subtraction result between 57.5% and 50% is 7.5%\"\n\nThat's 8 green rectangles.\n\nGrey rectangles (which appear to represent the claims):\n- \"Supported Claim: A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance.\"\n- \"Refuted Claim: A's productivity of 57.5% expresses that it appears in 9.5% more often than expected by random chance.\"\n- \"Not Enough Info Claim: The low performance of \"to\" can be explained by the fact that it is responsible for only 4.6% of the inference in the training set.\"\n\nThat's 3 grey rectangles.\n\nIn descending order: 8, 3\n\nAnswer: 8, 3", "citations": [], "input_tokens": 44352, "output_tokens": 333, "cost_micros": 138051, "latency_ms": 10203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "3, 8", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the first figure in the paper and count the green and grey rectangles.\n\nLooking at the document, the first figure is Figure 1, which shows \"An example of our SCITAB dataset (left) and its corresponding reasoning graph (right).\"\n\nIn the reasoning graph (right side of Figure 1), I can see:\n- The graph contains nodes connected by arrows\n- There are rectangular boxes of different colors\n- Looking at the reasoning graph structure, I can identify green rectangles and grey rectangles\n\nCounting the rectangles in the reasoning graph:\n- Green rectangles: These appear to be the operation/function nodes in the graph. I can count: \"Lookup\", \"Background knowledge\", \"Commonsense knowledge\", \"Subtract\" - that's 4 green rectangles\n- Grey rectangles: These appear to be the value/result nodes. I can count: \"57.5%\", \"Prod.\", \"50%\", \"7.5%\" - that's 4 grey rectangles\n\nWait, let me recount more carefully by examining the figure description and structure:\n- Green rectangles (operations): 4\n- Grey rectangles (values): 4\n\nIn descending order: [\"4\", \"4\"]\n\nAnswer: 4, 4", "citations": [], "input_tokens": 28779, "output_tokens": 281, "cost_micros": 90552, "latency_ms": 9219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "4", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the first figure in the paper, which is Figure 1 on page 1.\n\nLooking at Figure 1, I can see it contains:\n- A table on the left side\n- A reasoning graph on the right side with various nodes and connections\n\nIn the reasoning graph (right side), I can identify:\n- Grey rectangles: These appear to be nodes like \"Closed-domain knowledge\", \"Commonsense knowledge\", \"Simple lookup\", \"Subtraction\", and \"Fact checker\"\n- Green rectangles: I need to look for any green-colored rectangles\n\nLet me count carefully:\n- Grey rectangles in the reasoning graph: I can see multiple grey nodes representing different types of operations and knowledge\n- Green rectangles: Looking at the figure, I don't see distinctly green rectangles in the traditional sense, but there appear to be some lighter/different colored elements\n\nActually, examining the figure more carefully:\n- There appear to be approximately 5 grey rectangular nodes in the reasoning graph\n- I don't clearly see green rectangles that are distinctly separate from the grey ones\n\nHowever, looking more carefully at the color scheme and the structure, it appears there are:\n- Grey rounded rectangles: approximately 5\n- There don't appear to be clearly distinct green rectangles\n\nBased on my examination, I count approximately 5 grey rectangles and 0 distinctly green rectangles in Figure 1.\n\nAnswer: 5, 0", "citations": [], "input_tokens": 36689, "output_tokens": 320, "cost_micros": 114867, "latency_ms": 9875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "0, 5", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the first figure in the paper. Let me continue reading to locate Figure 1.Perfect! I found Figure 1. Now I need to examine this figure carefully to count the green and grey rectangles. Based on the content I can see, Figure 1 shows a reasoning graph with different colored rectangles representing different steps in the reasoning process. Let me read more carefully to identify all the rectangles and their colors.\n\nFrom what I can see in the figure description:\n- There's a table on the left\n- A reasoning graph on the right\n- The reasoning graph contains several boxes/rectangles with different steps\n\nLooking at the text content that describes the figure structure:\n1. \"Closed-domain knowledge: Table caption\" - appears to be one rectangle\n2. \"Commonsense knowledge\" - appears to be another rectangle\n3. \"Productivity corresponds to Prod. column\" - part of the reasoning\n4. \"Simple lookup\" - appears to be a step\n5. \"The number of random chance is 50%\" - another element\n6. \"Supported Claim\" - labeled element\n7. \"A's productivity is 57.5%\" - text element\n8. Various reasoning steps with operators (+, -, =, \u00d7, ?)\n9. \"Refuted Claim\" - labeled element\n10. \"Not Enough Info Claim\" - labeled element\n\nLet me search for more details about the actual visual representation or read the next section to see if there's a clearer description.Let me go back and read the figure description more carefully to understand the visual elements:It seems the content around line 632-646 is describing a green checkmark icon, which is not Figure 1. Let me look more carefully at the actual Figure 1 content. Let me re-read around the figure caption area:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55655, "error": null, "extra": {"thread_id": 1975, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b73a2e9dd2a74761aa49d98419f10801", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 323, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "let me search for more details about actual visual representation or read next section to see if there s clearer description let me go back and read figure description more carefully to understand visual elements it seems content around line 632 646 is describing green checkmark icon, which is not figure 1 let me look more carefully at actual figure 1 content let me re read around figure caption area", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to count the reasoning steps shown in the reasoning graph on the right side of the figure.\n\nThe reasoning graph shows the following steps:\n1. Closed-domain knowledge: Table caption\n2. Commonsense knowledge\n3. Simple lookup\n4. Subtraction\n\nThese are the four distinct reasoning steps required to verify the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance.\"\n\nAnswer: 4", "citations": [], "input_tokens": 68131, "output_tokens": 119, "cost_micros": 206178, "latency_ms": 30733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the reasoning steps shown in the reasoning graph.\n\nLooking at Figure 1, I can see a reasoning graph on the right side that shows the steps needed to verify the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance.\"\n\nThe reasoning graph shows the following steps:\n\n1. \"Closed-domain knowledge: Productivity corresponds to Prod. column\" (from table caption)\n2. \"Commonsense knowledge: The number of random chance is 50%\"\n3. \"Simple lookup: A's productivity is 57.5%\"\n4. \"Subtraction: The subtraction result between 57.5% and 50% is 7.5%\"\n\nThese steps lead to verifying whether the claim is \"Supported\" or \"Refuted\".\n\nCounting the reasoning steps in the graph, I can identify 4 distinct reasoning steps.\n\nAnswer: 4", "citations": [], "input_tokens": 31764, "output_tokens": 225, "cost_micros": 98667, "latency_ms": 8390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the reasoning steps shown in the reasoning graph in Figure 1.\n\nLooking at Figure 1's reasoning graph (on the right side), I can identify the following reasoning steps:\n\n1. \"Closed-domain knowledge: Table caption\" - Productivity corresponds to Prod. column\n2. \"Commonsense knowledge\" - The number of random chance is 50%\n3. \"Simple lookup\" - A's productivity is 57.5%\n4. \"Subtraction\" - The subtraction result between 57.5% and 50% is 7.5%\n5. Final verification step - comparing the claim with the calculated result\n\nThe reasoning graph shows 5 distinct reasoning steps from the initial knowledge extraction to the final verification of the claim.\n\nAnswer: 5", "citations": [], "input_tokens": 44319, "output_tokens": 174, "cost_micros": 135567, "latency_ms": 6969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the reasoning steps shown in the reasoning graph.\n\nLooking at Figure 1, I can see it contains:\n- A table with columns: Cue, App., Prod., Cov.\n- A claim about \"A's productivity of 57.5%\"\n- A reasoning graph on the right side\n\nThe reasoning graph shows the verification process with several nodes connected by arrows. Let me count the reasoning steps by tracing through the graph:\n\n1. Start with the claim about A's productivity of 57.5%\n2. Look up \"A's productivity\" \u2192 refers to cell at last row and \"Prod.\" column\n3. Background knowledge: \"productivity\" corresponds to \"Prod.\" column\n4. Simple lookup: Get value 57.5% from the table\n5. Commonsense knowledge: \"random chance\" means 50% accuracy\n6. Numerical reasoning: Calculate difference 57.5% - 50% = 7.5%\n7. Compare calculated result with claimed value (7.5%)\n\nCounting the nodes in the reasoning graph shown in Figure 1, I can see there are 7 distinct reasoning steps involved in verifying this claim.\n\nAnswer: 7", "citations": [], "input_tokens": 28746, "output_tokens": 277, "cost_micros": 90393, "latency_ms": 9546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 1 in the paper, I need to count the reasoning steps shown in the reasoning graph on the right side.\n\nThe reasoning graph shows the following steps to verify the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance\":\n\n1. **Closed-domain knowledge**: Productivity corresponds to Prod. column (from table caption)\n2. **Commonsense knowledge**: The number of random chance is 50%\n3. **Simple lookup**: Finding the value 57.5% in the table (A's productivity)\n4. **Subtraction**: The subtraction result between 57.5% and 50% is 7.5%\n\nAnswer: 4", "citations": [], "input_tokens": 36656, "output_tokens": 169, "cost_micros": 112503, "latency_ms": 7671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned as most relevant to find Figure 1 and count the reasoning steps.Let me continue reading to find Figure 1:Now let me read the section around line 562 where Figure 1 is described:Let me read more to see the complete figure with all reasoning steps:Based on the document content, I can see Figure 1 shows a reasoning graph with the following steps:\n\n1. **Closed-domain knowledge: Table caption** - \"Productivity corresponds to Prod. column\"\n2. **Simple lookup** - \"A's productivity is 57.5%\"\n3. **Commonsense knowledge** - \"The number of random chance is 50%\"\n4. **Subtraction** - \"The subtraction result between 57.5% and 50% is 7.5%\"\n\nThese 4 reasoning steps lead to verifying the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance\" which is marked as \"Supported\".\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37062, "error": null, "extra": {"thread_id": 1976, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6471c23a58a448bd8c1bac8c5ba5da64", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 190, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from the document.\n\nFrom Table 1 on page 4, I can see:\n\n**Wiki Table datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\nTotal Wiki Table claims = 117,854 + 87,026 = 204,880\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\nTotal Scientific Articles claims = 5,715 + 1,225 = 6,940\n\nDifference = 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 68140, "output_tokens": 175, "cost_micros": 207045, "latency_ms": 33358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the number of claims in Wiki Table datasets versus Scientific Articles datasets from Table 1.\n\nFrom Table 1 (Comparison of SCITAB to three recent table fact verification datasets):\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\nTotal Wiki Tables claims = 117,854 + 87,026 = 204,880\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\nTotal Scientific Articles claims = 5,715 + 1,225 = 6,940\n\nDifference = 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 31773, "output_tokens": 182, "cost_micros": 98049, "latency_ms": 6016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from Table 1.\n\nFrom Table 1:\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\n- Total Wiki Tables: 117,854 + 87,026 = 204,880 claims\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\n- Total Scientific Articles: 5,715 + 1,225 = 6,940 claims\n\n**Difference:**\n204,880 - 6,940 = 197,940 claims\n\nAnswer: 197940", "citations": [], "input_tokens": 44328, "output_tokens": 174, "cost_micros": 135594, "latency_ms": 6858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from Table 1.\n\nFrom Table 1 (Comparison of SCITAB to three recent table fact verification datasets):\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\nTotal Wiki Tables claims = 117,854 + 87,026 = 204,880\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\nTotal Scientific Articles claims = 5,715 + 1,225 = 6,940\n\nDifference = 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 28755, "output_tokens": 183, "cost_micros": 89010, "latency_ms": 5985, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from Table 1.\n\nFrom Table 1:\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\n- Total Wiki Tables: 117,854 + 87,026 = 204,880 claims\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\n- Total Scientific Articles: 5,715 + 1,225 = 6,940 claims\n\n**Difference:**\n204,880 - 6,940 = 197,940 claims\n\nAnswer: 197940", "citations": [], "input_tokens": 36665, "output_tokens": 174, "cost_micros": 112605, "latency_ms": 5968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about claims in Wiki Table datasets compared to scientific articles datasets.Perfect! I found the table with the statistics. Let me extract the specific information:\n\nFrom the table:\n- **Wiki Tables datasets**: \n  - TabFact: 117,854 claims\n  - FEVEROUS: 87,026 claims\n  - Total: 117,854 + 87,026 = 204,880 claims\n\n- **Scientific Articles datasets**:\n  - SEM-TAB-FACTS: 5,715 claims\n  - SCITAB: 1,225 claims\n  - Total: 5,715 + 1,225 = 6,940 claims\n\nDifference: 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36890, "error": null, "extra": {"thread_id": 1977, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_eedb3672a2cb45bf898fdd6ba6d9a71c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 152, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the total number of claims in the dataset and the payment per claim annotation.\n\nFrom the document:\n\n1. Total number of claims in SCITAB: 1,225 claims (mentioned in the abstract and Table 1)\n\n2. Payment per claim annotation: 0.37 USD (mentioned in Section 2.3: \"Each claim annotation is reimbursed at a rate of 0.37 USD\")\n\nHowever, I need to consider that the annotation process involves multiple rounds and different types of claims:\n\nFrom Section 2.3, the manual verification process involves:\n- Verifying 872 false claims (counter-claims)\n- Verifying 900 NEI claims\n- Reviewing 872 real-world scientific claims\n\nThis suggests a two-round annotation process where each claim is annotated by two different annotators.\n\nLooking more carefully at the construction process:\n- 872 original claims were extracted\n- For each original claim, a counter-claim was generated (872 counter-claims)\n- 900 NEI claims were generated\n- After verification, the final dataset has 1,225 claims total\n\nThe annotation process in Section 2.3 states: \"We assign two different annotators to perform a two-round annotation for each claim\"\n\nSo the total annotations would be:\n- For supported/refuted claim pairs: 872 pairs need verification (2 rounds each)\n- For NEI claims: 900 claims need verification (2 rounds each)\n\nTotal annotations = (872 \u00d7 2) + (900 \u00d7 2) = 1,744 + 1,800 = 3,544 annotations\n\nTotal cost = 3,544 \u00d7 0.37 = 1,311.28 USD\n\nAnswer: 1311.28", "citations": [], "input_tokens": 68152, "output_tokens": 408, "cost_micros": 210576, "latency_ms": 38546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1311.28", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the annotation cost and the total number of claims in the dataset.\n\nFrom the document:\n\n1. **Annotation cost**: \"Each claim annotation is reimbursed at a rate of 0.37 USD\" (Section 2.3)\n\n2. **Total number of claims in SCITAB**: Looking at Table 1, the total number of claims in SCITAB is 1,225.\n\nHowever, I need to determine how many claims actually needed annotation. According to Section 2:\n\n- Section 2.1: 872 real-world scientific claims were filtered from SciGen\n- Section 2.2: They generated 872 false claims (counter-claims) and 900 NEI claims\n- Section 2.3: Manual verification was done on all these claims\n\nFrom Section 2.3 and Appendix D:\n- 872 false claims were verified\n- 900 NEI claims were verified (which resulted in 355 final NEI claims)\n- 872 original claims were also reviewed\n\nSo the total annotations performed:\n- False claim verification: 872 claims\n- NEI claim verification: 900 claims\n- Total annotations: 872 + 900 = 1,772 claims\n\nTotal cost = 1,772 claims \u00d7 0.37 USD per claim = 655.64 USD\n\nAnswer: 655.64", "citations": [], "input_tokens": 31785, "output_tokens": 318, "cost_micros": 100125, "latency_ms": 9562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "655.64", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find:\n1. The total number of claims in the SCITAB dataset\n2. The payment rate per claim annotation\n\nFrom the document:\n\n**Total number of claims:**\nFrom Table 1 and Section 2.3: \"Total # of Claims: 1,225\"\n\n**Payment per claim:**\nFrom Section 2.3: \"Each claim annotation is reimbursed at a rate of 0.37 USD\"\n\n**Calculation:**\nTotal cost = 1,225 claims \u00d7 0.37 USD per claim = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 44340, "output_tokens": 145, "cost_micros": 135195, "latency_ms": 5389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the total cost for annotating all claims in the SCITAB dataset.\n\nFrom the document, I can find:\n\n1. **Total number of claims in SCITAB**: According to the abstract and Section 2, SCITAB contains **1,225 claims** (also written as \"1.2K expert-verified scientific claims\").\n\n2. **Cost per claim annotation**: According to Section 2.3 under \"Annotator Recruitment\":\n   - \"Each claim annotation is reimbursed at a rate of 0.37 USD\"\n\n3. **Calculation**:\n   Total cost = Number of claims \u00d7 Cost per claim\n   Total cost = 1,225 \u00d7 0.37\n   Total cost = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 28767, "output_tokens": 183, "cost_micros": 89046, "latency_ms": 7046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the total cost of annotating all claims in the SCITAB dataset.\n\nFrom the document:\n\n1. **Total number of claims in SCITAB**: According to Table 1, SCITAB has 1,225 total claims.\n\n2. **Payment per claim annotation**: From Section 2.3, it states \"Each claim annotation is reimbursed at a rate of 0.37 USD\"\n\n3. **Calculation**:\n   Total cost = Number of claims \u00d7 Cost per claim\n   Total cost = 1,225 \u00d7 0.37\n   Total cost = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 36677, "output_tokens": 152, "cost_micros": 112311, "latency_ms": 5469, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document that was marked as most relevant to find information about annotation costs for the proposed dataset.Let me search for information about annotation costs, dataset size, and pricing in the document.Let me read through the document more systematically. I'll start with sections that typically discuss dataset details and costs.Let me search for sections that discuss the dataset size and annotation costs. Let me look for keywords like \"price\", \"annotator\", \"pay\", etc.Let me search for numbers that might relate to costs:Let me read sections that might contain appendix or details about annotation costs:Let me look in later sections of the document where appendix or cost information might be:Let me search for \"Appendix\" section which might contain cost details:Let me try searching with content mode to find where these terms appear:Great! I found a reference to annotation cost. Let me read around line 974:Perfect! I found the annotation cost information. Now I need to find the total number of claims in the dataset. Let me search for that information:Perfect! Now I have all the information I need:\n- Total number of claims: 1,225\n- Cost per claim annotation: 0.37 USD\n\nLet me calculate the total cost:\nTotal cost = 1,225 \u00d7 0.37 = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 97266, "error": null, "extra": {"thread_id": 1978, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_77659594d74045e99872a87f698ab753", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 502, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find information about the distribution of reasoning steps in the SCITAB dataset.\n\nLooking at Figure 3 on page 5, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset\":\n\nThe figure shows the percentage of claims for each number of reasoning steps (1-11).\n\nFrom the histogram, I can read the following percentages:\n- 1 step: 6%\n- 2 steps: 8%\n- 3 steps: 15%\n- 4 steps: 18%\n- 5 steps: 20% (highest)\n- 6 steps: 15%\n- 7 steps: 7%\n- 8 steps: 5%\n- 9 steps: 3%\n- 10 steps: 2%\n- 11 steps: 1%\n\nThe highest percentage is 20%, which corresponds to claims requiring 5 reasoning steps.\n\nTo find the actual number of claims, I need to calculate 20% of the total number of claims in SCITAB.\n\nFrom Table 1 on page 4, the total number of claims in SCITAB is 1,225.\n\n20% of 1,225 = 0.20 \u00d7 1,225 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 68134, "output_tokens": 299, "cost_micros": 208887, "latency_ms": 34843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of claims with the highest reasoning steps in the SCITAB dataset.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see the distribution of reasoning steps.\n\nThe x-axis shows reasoning steps from 1 to 11, and the y-axis shows the percentage.\n\nFrom the figure, I can read the following percentages:\n- 1 step: 1%\n- 2 steps: 7%\n- 3 steps: 15%\n- 4 steps: 20%\n- 5 steps: 18%\n- 6 steps: 15%\n- 7 steps: 8%\n- 8 steps: 6%\n- 9 steps: 5%\n- 10 steps: 3%\n- 11 steps: 2%\n\nThe highest percentage is 20%, which corresponds to claims with 4 reasoning steps.\n\nThe document states there are 1,225 total claims in SCITAB (from Table 1).\n\n20% of 1,225 = 0.20 \u00d7 1,225 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 31767, "output_tokens": 277, "cost_micros": 99456, "latency_ms": 8375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the reasoning step percentage distribution in the SCITAB dataset from Figure 3.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see the percentages for each reasoning step:\n\n- 1 step: 2%\n- 2 steps: 3%\n- 3 steps: 15%\n- 4 steps: 20%\n- 5 steps: 18%\n- 6 steps: 15%\n- 7 steps: 8%\n- 8 steps: 7%\n- 9 steps: 6%\n- 10 steps: 5%\n- 11 steps: 1%\n\nThe highest percentage is 20%, which corresponds to claims with 4 reasoning steps.\n\nSince the dataset contains 1,225 claims total (as stated in the abstract and throughout the document), I need to calculate 20% of 1,225:\n\n1,225 \u00d7 0.20 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 44322, "output_tokens": 243, "cost_micros": 136611, "latency_ms": 6812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the highest percentage of reasoning steps in the SCITAB dataset and determine how many claims fall into that category.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see:\n\n- The x-axis represents the reasoning steps in each claim\n- The y-axis represents the frequency for each reasoning step\n- The figure distinguishes between shallow claims (1-2 reasoning steps, highlighted in red) and deep claims (3+ reasoning steps, highlighted in blue)\n\nFrom the histogram in Figure 3, I need to identify which reasoning step has the highest frequency (highest bar).\n\nExamining the histogram carefully:\n- The highest bar appears to be at 4 reasoning steps\n- The frequency at 4 reasoning steps appears to be approximately 350-400 claims\n\nLooking more carefully at the visual representation, the bar at 4 reasoning steps is the tallest in the histogram, with a frequency of approximately 350 claims.\n\nAnswer: 350", "citations": [], "input_tokens": 28749, "output_tokens": 223, "cost_micros": 89592, "latency_ms": 7078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "350", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the reasoning step with the highest percentage in the SCITAB dataset.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see the percentages for each reasoning step:\n\n- 1 step: 6%\n- 2 steps: 8%\n- 3 steps: 15%\n- 4 steps: 18%\n- 5 steps: 20%\n- 6 steps: 15%\n- 7 steps: 7%\n- 8 steps: 5%\n- 9 steps: 3%\n- 10 steps: 2%\n- 11 steps: 1%\n\nThe highest percentage is 20%, which corresponds to claims requiring 5 reasoning steps.\n\nTo find the actual number of claims, I need to calculate 20% of the total number of claims in SCITAB.\n\nFrom Table 1, the total number of claims in SCITAB is 1,225.\n\nNumber of claims with 5 reasoning steps = 1,225 \u00d7 0.20 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 36659, "output_tokens": 266, "cost_micros": 113967, "latency_ms": 6453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the most relevant document to find information about claims and reasoning steps in the author's proposed dataset.Let me continue reading to find information about the dataset and reasoning steps distribution.Great! I found a reference to a distribution histogram of reasoning steps. Let me read that specific section.Let me look at the earlier part where the figure caption was mentioned to understand the chart better.Perfect! Now I can see the histogram data. Let me read more to see all the percentage values clearly:From the histogram data I can see, the percentages are listed vertically and the reasoning steps go from 2 to 11. Looking at the data:\n\n- The percentages shown are: 20, 18, 20, 15, 15, 15, 6, 8, 7, 5, 3, 2, 1, 1\n\nThe highest percentages appear to be 20% which occurs twice. Let me look for more context to understand the exact mapping between reasoning steps and percentages. Let me search for more information about the total number of claims.Good! So SCITAB has 1,225 total claims. Now I need to figure out which reasoning step number has the highest percentage (20%), and calculate how many claims that represents. Looking at the histogram data again, I need to understand which reasoning steps correspond to the 20% values. Let me look for more details about the distribution.Now let me carefully re-examine the histogram data. The data shows reasoning steps 2-11 and their corresponding percentages. Looking at the visual structure again:I see the structure now. Lines 1182-1198 show \"2 3 4 5 6 7 8 9 10 11\" which are the reasoning steps on the x-axis. The percentages must be reading from the bars. Based on the structure, it appears:\n\nFrom the data, I can see percentages: 20, 18, 20, 15, 15, 15, 10, 6, 8, 7, 5, 5, 3, 2, 1, 1\n\nThe highest percentage is 20%, which appears twice in the data. With 1,225 total claims and 20% being the highest:\n- 20% of 1,225 = 0.20 \u00d7 1,225 = 245 claims\n\nHowever, I see 20% appears twice, which could mean there are two different reasoning step counts that each have 20%. Let me verify by checking if there's a table or more precise description.Based on the figure caption, the y-axis shows \"frequency\" which is shown as \"Percentages (%)\". The highest percentage is 20%, and since this appears to occur for specific reasoning step(s), I need to calculate the number of claims.\n\nWith 1,225 total claims in SCITAB and the highest percentage being 20%:\n20% \u00d7 1,225 = 245 claims\n\nAnswer: 245", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 77500, "error": null, "extra": {"thread_id": 1979, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_0b34f104b1a04155b78b4d63f4b337dd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 544, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the number of figures in the Appendix section of the paper.\n\nLooking through the document, I can see the Appendix starts at section A on page 13. Let me go through each appendix section and count the figures:\n\n**Appendix A (Claim Extraction Procedure):**\n- Figure 5: The user interface for the claim extraction task (page 14)\n\n**Appendix B (Manual Claim Verification Procedure):**\n- Figure 6: The user interface for the NEI claim verification task (page 15)\n- Figure 7: The user interface for the refuted claim verification task (page 16)\n- Figure 8: Examples of post-annotation survey questions (page 17)\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset):**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration):**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims):**\n- Figure 9: Refuted claims cases A to C (page 22)\n- Figure 10: Refuted claims cases D and E (page 23)\n\n**Appendix F (Error Cases for InstructGPT):**\n- Figure 11: Error Cases 1-3 for InstructGPT (page 24)\n- Figure 12: Error Cases 4-6 for InstructGPT (page 25)\n\n**Appendix G (Error Cases for Program-of-Thoughts):**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts (page 26)\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts (page 27)\n\n**Appendix H (Prompts):**\n- No figures\n\nTotal count: Figures 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 = 10 figures in the Appendix.\n\nAnswer: 10", "citations": [], "input_tokens": 68138, "output_tokens": 472, "cost_micros": 211494, "latency_ms": 35156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the paper.\n\nLooking through the Appendix sections:\n\n**Appendix A (Claim Extraction Procedure):**\n- A.2 Claim Extraction Interface: Figure 5\n\n**Appendix B (Manual Claim Verification Procedure):**\n- B.2 NEI Claim Verification Interface: Figure 6\n- B.3 Refuted Claim Verification Interface: Figure 7\n- B.4 Annotation Post-Survey: Figure 8\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset):**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration):**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims):**\n- Figure 9 (refuted cases A to C)\n- Figure 10 (refuted cases D and E)\n\n**Appendix F (Error Cases for InstructGPT):**\n- Figure 11 (Error Cases 1-3)\n- Figure 12 (Error Cases 4-6)\n\n**Appendix G (Error Cases for Program-of-Thoughts):**\n- Figure 13 (Error Cases 1-3)\n- Figure 14 (Error Cases 4 and 5)\n\n**Appendix H (Prompts):**\n- No figures\n\nTotal count: Figures 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 = 10 figures\n\nAnswer: 10", "citations": [], "input_tokens": 31771, "output_tokens": 373, "cost_micros": 100908, "latency_ms": 8453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the document.\n\nLooking through the Appendix sections:\n\n**Appendix A (Claim Extraction Procedure):**\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B (Manual Claim Verification Procedure):**\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions and the answers of annotators\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset):**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration):**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims):**\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F (Error Cases for InstructGPT):**\n- Figure 11: Error Cases 1-3 for InstructGPT in the zero-shot setting\n- Figure 12: Error Cases 4-6 for InstructGPT in the zero-shot setting\n\n**Appendix G (Error Cases for Program-of-Thoughts):**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\n**Appendix H (Prompts):**\n- No figures\n\nTotal count: 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 = 10 figures\n\nAnswer: 10", "citations": [], "input_tokens": 44326, "output_tokens": 415, "cost_micros": 139203, "latency_ms": 9266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the document.\n\nLooking through the Appendix sections:\n\n**Appendix A (Claim Extraction Procedure)**\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B (Manual Claim Verification Procedure)**\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset)**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration)**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims)**\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F (Error Cases for InstructGPT)**\n- Figure 11: Error Cases 1-3 for InstructGPT\n- Figure 12: Error Cases 4-6 for InstructGPT\n\n**Appendix G (Error Cases for Program-of-Thoughts)**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\n**Appendix H (Prompts)**\n- No figures\n\nTotal figures in Appendix:\nFigure 5, 6, 7, 8, 9, 10, 11, 12, 13, 14\n\nAnswer: 10", "citations": [], "input_tokens": 28753, "output_tokens": 380, "cost_micros": 91959, "latency_ms": 8592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the document.\n\nLet me go through the Appendix sections:\n\n**Appendix A** (Claim Extraction Procedure):\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B** (Manual Claim Verification Procedure):\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions and the answers of annotators\n\n**Appendix C** (Analysis of Refuted Reasons in the Sci-Fact dataset):\n- No figures\n\n**Appendix D** (Discussions on Human-Machine Collaboration):\n- No figures\n\n**Appendix E** (Case Study for Refuted Claims):\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F** (Error Cases for InstructGPT):\n- Figure 11: Error Cases 1-3 for InstructGPT in the zero-shot setting\n- Figure 12: Error Cases 4-6 for InstructGPT in the zero-shot setting\n\n**Appendix G** (Error Cases for Program-of-Thoughts):\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\n**Appendix H** (Prompts):\n- No figures\n\nTotal figures in Appendix:\n- Figure 5\n- Figure 6\n- Figure 7\n- Figure 8\n- Figure 9\n- Figure 10\n- Figure 11\n- Figure 12\n- Figure 13\n- Figure 14\n\nAnswer: 10", "citations": [], "input_tokens": 36663, "output_tokens": 427, "cost_micros": 116394, "latency_ms": 9828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find out how many figures are in the Appendix.Let me search for the Appendix section in the document.Now let me search for figures in the Appendix sections. Let me read the end of the document where appendices are typically located.Let me continue reading to find the Appendix section.Let me search for all figures in the appendix sections. Let me look for \"Figure\" in the context of Appendix.Let me read the document more systematically to find all appendix sections and count the figures. Let me search for specific appendix sections.Now I can see the appendix sections starting. Let me continue reading to find all figures in the appendices.Let me continue reading to find more figures in the appendix.Let me continue to look for all figures in the appendix. Let me read further.Let me continue reading to find all the figures.Let me continue reading to find more figures.Now let me search for the end of the document to make sure I've found all figures.Perfect! Now let me count all the figures I found in the Appendix sections. Based on my reading, I found the following figures in the appendices:\n\n**Appendix A:**\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B:**\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions and the answers of annotators\n\n**Appendix E:**\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F:**\n- Figure 11: Error Cases 1-3 for InstructGPT in the zero-shot setting\n- Figure 12: Error Cases 4-6 for InstructGPT in the zero-shot setting\n\n**Appendix G:**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\nTotal figures in the Appendix: 10\n\nAnswer: 10", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 85187, "error": null, "extra": {"thread_id": 1980, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_0161632b18b34ebfa7ee0421186ab320", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 490, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, particularly at Figures 3, 7, and 9 which show the relative sizes of Swp (information flow from text part to label words), Spq (information flow from label words to target position), and Sww across different layers:\n\n- In shallow layers, Swp is high while Spq is low\n- In deep layers, Spq becomes dominant and exceeds Swp\n\nThe document explicitly states: \"Initially, Swp occupies a significant proportion, but it gradually decays over layers, while Spq becomes the dominant one.\"\n\nThis means that Swp is NOT always greater than Spq - it depends on the layer depth.\n\nAnswer: no", "citations": [], "input_tokens": 41233, "output_tokens": 158, "cost_micros": 126069, "latency_ms": 32250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which shows the relative sizes of Swp (information flow from text to label words), Spq (information flow from label words to target position), and Sww across different layers:\n\nIn shallow layers (early layers), Swp is greater than Spq.\nIn deep layers (later layers), Spq becomes dominant and is greater than Swp.\n\nThe document explicitly states: \"in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nSince Swp is not always greater than Spq across all layers (it depends on which layer), the answer is no.\n\nAnswer: no", "citations": [], "input_tokens": 17763, "output_tokens": 200, "cost_micros": 56289, "latency_ms": 6078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 3 and Figure 7 in the document, which show the relative sizes of S_wp (information flow from text to label words), S_pq (information flow from label words to target position), and S_ww across different layers:\n\nIn shallow layers (layers close to input), S_wp is high while S_pq is low.\n\nIn deep layers (layers close to output), S_pq becomes dominant and exceeds S_wp.\n\nThe document explicitly states: \"In shallow layers, S_pq, the significance of the information flow from label words to targeted positions, is low, while S_wp, the information flow from the text part to label words is high; (2) in deep layers, S_pq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nSince S_wp is greater in shallow layers but S_pq becomes greater in deep layers, S_wp is NOT always greater than S_pq.\n\nAnswer: no", "citations": [], "input_tokens": 20121, "output_tokens": 222, "cost_micros": 63693, "latency_ms": 7094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, particularly Section 2.1 and Figure 3, the metrics Swp (information flow from text part to label words) and Spq (information flow from label words to target position) are compared across different layers.\n\nThe text states: \"Initially, Swp occupies a significant proportion, but it gradually decays over layers, while Spq becomes the dominant one.\"\n\nThis indicates that Swp is larger in shallow layers but Spq becomes larger in deep layers. Therefore, Swp is not always greater than Spq across all layers.\n\nAnswer: No", "citations": [], "input_tokens": 13762, "output_tokens": 132, "cost_micros": 43266, "latency_ms": 5780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the data presented in the document, particularly in Figure 3 and related figures showing $S_{wp}$, $S_{pq}$, and $S_{ww}$ values across different layers:\n\nIn shallow layers (e.g., layers 0-10), $S_{wp}$ (information flow from text to label words) is greater than $S_{pq}$ (information flow from label words to target position).\n\nHowever, in deep layers (e.g., layers 15-48), $S_{pq}$ becomes much larger than $S_{wp}$.\n\nFor example, in Figure 3(a) for SST-2:\n- Layer 0: $S_{wp}$ = 0.72, $S_{pq}$ = 0.15\n- Layer 20: $S_{wp}$ = 0.10, $S_{pq}$ = 0.90\n\nSince the relationship changes across layers, $S_{wp}$ is not **always** greater than $S_{pq}$.\n\nAnswer: no", "citations": [], "input_tokens": 23820, "output_tokens": 243, "cost_micros": 75105, "latency_ms": 6703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about information flow between text parts, label words, and target positions.Let me search for information about information flow between text parts, label words, and target positions.Let me search for specific terms related to the comparison:Let me read through the document more systematically to find tables or figures that might contain this information:Let me read Figure 3 and related sections since it mentions Swp, Spq, and Sww which seem to be related to information flow:Good! I found relevant information. Let me read more to understand the definitions and comparisons better:Perfect! Now I have the key information. Let me look at Figure 3 more carefully to see the actual values:Based on my reading of the document, I can now answer the question. The document clearly states:\n\nFrom the text (lines 842-851):\n- \"in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while **Swp, the information flow from the text part to label words is high**\"\n- \"in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one\"\n\nThe document also states in the figure caption (line 943): \"Initially, **Swp occupies a significant proportion**, but it gradually decays over layers, while **Spq becomes the dominant one**.\"\n\nThis indicates that:\n- In shallow layers: Swp (text part to label words) > Spq (label words to target position)\n- In deep layers: Spq (label words to target position) > Swp (text part to label words)\n\nSince Swp starts high and decays while Spq starts low and increases, across ALL layers (taking the mean), Swp is NOT always greater than Spq. In fact, Spq becomes dominant in deep layers.\n\nAnswer: no", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 59108, "error": null, "extra": {"thread_id": 1981, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_31954715cb0b491eade4006731d43b22", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 360, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and the related discussion in Section 2.1:\n\nThe text states: \"Figure 3 reveals that: (1) in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nThis clearly indicates that the information flow from label words to the target position (Spq) is NOT dominant in shallow layers - it only becomes dominant in deep layers.\n\nAnswer: no", "citations": [], "input_tokens": 41220, "output_tokens": 149, "cost_micros": 125895, "latency_ms": 31437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and the analysis in Section 2.1:\n\nThe information flow from label words to the target position (S_pq) is LOW in shallow layers, but becomes DOMINANT in deep layers. The document states: \"(1) in shallow layers, S_pq, the significance of the information flow from label words to targeted positions, is low... (2) in deep layers, S_pq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nAnswer: no", "citations": [], "input_tokens": 17750, "output_tokens": 119, "cost_micros": 55035, "latency_ms": 4890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and the analysis in Section 2.1:\n\nThe text states: \"(1) in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nThis clearly indicates that the information flow from label words to the target position (Spq) is NOT dominant in shallow layers - it only becomes dominant in deep layers.\n\nAnswer: no", "citations": [], "input_tokens": 20108, "output_tokens": 140, "cost_micros": 62424, "latency_ms": 5203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and Section 2.1, I need to examine when the information flow from label words to the target position (Spq) dominates.\n\nFrom Figure 3 and the text in Section 2.1:\n- Initially (in shallow layers), Swp (information flow from text to label words) \"occupies a significant proportion\"\n- Spq (information flow from label words to target position) \"gradually\" becomes dominant\n- Spq becomes \"the dominant one\" but this happens over layers, not in all layers\n\nThe text explicitly states: \"Initially, Swp occupies a significant proportion, but it gradually decays over layers, while Spq becomes the dominant one.\"\n\nThis indicates that Spq does NOT dominate in all layers - only in the deeper layers after the information aggregation phase in shallow layers is complete.\n\nAnswer: No", "citations": [], "input_tokens": 13749, "output_tokens": 198, "cost_micros": 44217, "latency_ms": 6828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the paper's findings in Section 2.1 and the results shown in Figures 3, 7, and 9:\n\nIn shallow layers, $S_{wp}$ (information flow from text to label words) is high while $S_{pq}$ (information flow from label words to target position) is low.\n\nIn deep layers, $S_{pq}$ becomes dominant while $S_{wp}$ decreases.\n\nThe paper explicitly states: \"in shallow layers, $S_{pq}$, the significance of the information flow from label words to targeted positions, is low... (2) in deep layers, $S_{pq}$, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nAnswer: no", "citations": [], "input_tokens": 23807, "output_tokens": 171, "cost_micros": 73986, "latency_ms": 6312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document mentioned in the priority list to answer your question.Let me search for content related to \"information flow\" and \"label words\" in the document.Now let me read the specific sections that discuss whether information flow from label words dominates in all layers.Perfect! I found the answer. Let me read a bit more to confirm the complete picture.Based on the document, the answer is clear. The text states:\n\n\"(1) in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nThis shows that information flow from label words to the target position does NOT dominate in all layers - it is low in shallow layers and only becomes dominant in deep layers.\n\nAnswer: No", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 38452, "error": null, "extra": {"thread_id": 1982, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_91e767a2b44d476aa25ee429c43c2924", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 182, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 on page 4, which shows the impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers:\n\nThe figure shows that \"Random (Last)\" - which represents randomly isolating non-label words within the last 5 layers - has minimal impact on both Label Loyalty and Word Loyalty metrics for both GPT2-XL and GPT-J models. The bars for \"Random (Last)\" show loyalty values close to 100%, similar to the \"No Isolation\" baseline.\n\nThis is further confirmed in the text on page 4 which states: \"Yet, this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nAnswer: yes", "citations": [], "input_tokens": 41222, "output_tokens": 170, "cost_micros": 126216, "latency_ms": 33032, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which shows \"The impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers,\" I can see that \"Random (Last)\" - which represents randomly isolating non-label words within the last 5 layers - shows loyalty values very close to the \"No Isolation\" baseline (around 100%). This indicates almost no impact on the model's behavior.\n\nAnswer: Yes", "citations": [], "input_tokens": 17752, "output_tokens": 101, "cost_micros": 54771, "latency_ms": 3967, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which shows \"The impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers,\" I can see that:\n\n- \"Random (Last)\" (randomly isolating non-label words in the last 5 layers) shows very high loyalty scores (close to 100%), nearly identical to \"No Isolation\"\n- This indicates minimal impact on the model's predictions\n\nThe text also confirms: \"this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nAnswer: yes", "citations": [], "input_tokens": 20110, "output_tokens": 134, "cost_micros": 62340, "latency_ms": 6030, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 and the related text in section 2.2, the document shows that isolating label words within the first 5 layers has substantial impact, but isolation within the last 5 layers has negligible impact. The text also states that \"this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nFigure 4 and Figure 8 show that randomly isolating non-label words (represented by lighter colored lines) in the last layers results in very high loyalty scores (close to 100%), indicating almost no impact on the model's behavior.\n\nAnswer: yes", "citations": [], "input_tokens": 13751, "output_tokens": 139, "cost_micros": 43338, "latency_ms": 5328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which shows \"The impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers,\" I can see the results for \"Random (Last)\".\n\nThe table shows:\n- Label Loyalty (GPT2-XL) with Random (Last): 98\n- Word Loyalty (GPT2-XL) with Random (Last): 98\n- Label Loyalty (GPT-J) with Random (Last): 98\n- Word Loyalty (GPT-J) with Random (Last): 98\n\nThese values are nearly identical to the \"No Isolation\" baseline (100), indicating minimal impact.\n\nThe document states: \"Yet, this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nAnswer: yes", "citations": [], "input_tokens": 23809, "output_tokens": 191, "cost_micros": 74292, "latency_ms": 6092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find information about randomly isolating non-label words within the last 5 layers.Perfect! I found the answer. The document states that randomly isolating non-label words within the last 5 layers has \"inconsequential\" impact, which means \"almost no impact.\"\n\nAnswer: Yes", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40125, "error": null, "extra": {"thread_id": 1983, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c2144cf872eb43e78cc21f804b3432de", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 145, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, I need to compare the Anchor Re-weighting method with Vanilla In-Context Learning (1-shot per class).\n\nFrom Table 1:\n- Vanilla In-Context Learning (1-shot per class) Average: 51.90\n- Anchor Re-weighting (1-shot per class) Average: 68.64\n\nThe improvement is: 68.64 - 51.90 = 16.74\n\nAnswer: 16.74", "citations": [], "input_tokens": 41209, "output_tokens": 115, "cost_micros": 125352, "latency_ms": 29484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to find the average improvement from the Anchor Re-weighting method compared to vanilla ICL.\n\nFrom Table 1 on the page discussing Anchor Re-weighting results:\n\n- Vanilla In-Context Learning (1-shot per class) Average: 51.90\n- Anchor Re-weighting (1-shot per class) Average: 68.64\n\nThe improvement is: 68.64 - 51.90 = 16.74\n\nThis is also confirmed in the text which states: \"leading to a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.7", "citations": [], "input_tokens": 17739, "output_tokens": 149, "cost_micros": 55452, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.7", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, I need to compare the Anchor Re-weighting method with the baseline vanilla ICL (1-shot per class).\n\nFrom Table 1:\n- Vanilla In-Context Learning (1-shot per class): Average = 51.90\n- Anchor Re-weighting (1-shot per class): Average = 68.64\n\nImprovement = 68.64 - 51.90 = 16.74\n\nThis matches the statement in the abstract that mentions \"a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.74", "citations": [], "input_tokens": 20097, "output_tokens": 141, "cost_micros": 62406, "latency_ms": 5483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the average improvement of the Anchor Re-weighting method compared to vanilla ICL.\n\nFrom Table 1 in the document, I can see the following results:\n\n**Vanilla In-Context Learning (1-shot per class):**\n- SST-2: 61.28\n- TREC: 57.56\n- AGNews: 73.32\n- EmoC: 15.44\n- Average: 51.90\n\n**Anchor Re-weighting (1-shot per class):**\n- SST-2: 90.07\n- TREC: 60.92\n- AGNews: 81.94\n- EmoC: 41.64\n- Average: 68.64\n\nThe improvement is calculated as:\n68.64 - 51.90 = 16.74\n\nThe document also states in the text: \"leading to a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.74", "citations": [], "input_tokens": 13738, "output_tokens": 229, "cost_micros": 44649, "latency_ms": 5717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, I need to find the performance improvement of Anchor Re-weighting over vanilla ICL (1-shot per class).\n\nFrom Table 1:\n- Vanilla In-Context Learning (1-shot per class): Average = 51.90\n- Anchor Re-weighting (1-shot per class): Average = 68.64\n\nImprovement = 68.64 - 51.90 = 16.74\n\nThis matches the statement in the abstract and Section 3.1 which mentions \"a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.74", "citations": [], "input_tokens": 23796, "output_tokens": 148, "cost_micros": 73608, "latency_ms": 5437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about the Anchor Re-weighting method and its performance improvement over vanilla ICL.Let me search for \"Anchor Re-weighting\" in the document to find the relevant information.Let me read the section with the results table to find the specific improvement values.Perfect! I found Table 1 which shows the performance comparison. Let me calculate the improvement:\n\nFrom the table:\n- Vanilla In-Context Learning (1-shot per class) Average: 51.90\n- Anchor Re-weighting (1-shot per class) Average: 68.64\n\nThe improvement = 68.64 - 51.90 = 16.74\n\nAnswer: 16.74", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 35969, "error": null, "extra": {"thread_id": 1984, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_25b28dfa8e5c4fbeb8c80f7d7650cb08", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 144, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "I need to find information about anchor-only context compression speedup for GPT2-XL on AGNews.\n\nLooking at Table 3 on page 8, which shows \"Acceleration ratios of the Hiddenanchor method\":\n\nThe table shows:\n- Model: GPT2-XL\n- AGNews: 2.5\u00d7\n\nThis indicates that using the anchor-only context compression method (Hiddenanchor), GPT2-XL achieves a 2.5\u00d7 speedup on the AGNews dataset.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 41212, "output_tokens": 129, "cost_micros": 125571, "latency_ms": 31171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about GPT2-XL speedup on AGNews using anchor-only context compression (Hiddenanchor method):\n\nIn Table 3 on the topic of \"Acceleration ratios of the Hiddenanchor method,\" the speedup ratio for GPT2-XL on AGNews is listed as 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 17742, "output_tokens": 90, "cost_micros": 54576, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about anchor-only context compression speedup for GPT2-XL on AGNews...\n\nIn Table 3 (page 7), the acceleration ratios of the Hiddenanchor method are provided:\n\nFor GPT2-XL on AGNews, the speedup is listed as 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 20100, "output_tokens": 86, "cost_micros": 61590, "latency_ms": 3671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about GPT2-XL speedup on AGNews with anchor-only context compression...\n\nIn Table 3 (on page 8), which shows \"The speed-up ratio compared to the original ICL\", I can find the speedup values for different models and datasets.\n\nFor GPT2-XL on AGNews, the speedup is listed as 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 13741, "output_tokens": 100, "cost_micros": 42723, "latency_ms": 4391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about anchor-only context compression speedup on AGNews with GPT2-XL...\n\nIn Table 3 on page 11, the acceleration ratios of the Hidden_anchor method are presented:\n\n| Model   | SST-2 | TREC | AGNews | EmoC |\n| ------- | ----- | ---- | ------ | ---- |\n| GPT2-XL | 1.1\u00d7  | 1.5\u00d7 | 2.5\u00d7   | 1.4\u00d7 |\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 23799, "output_tokens": 134, "cost_micros": 73407, "latency_ms": 5157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "I'll search for information about anchor-only context compression and GPT2-XL speed up on AGNews in the provided document.Let me search for \"anchor\" and \"AGNews\" in the document to find the relevant information.Now let me look at Table 3 which shows acceleration ratios for the Hiddenanchor method:Perfect! I found the answer in Table 3. The table shows acceleration ratios of the Hiddenanchor method, and for GPT2-XL on AGNews, the speedup is 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36062, "error": null, "extra": {"thread_id": 1985, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c1692b1f5bfe4ec6aa79ada899da40bd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 139, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the most confusing category of Abbreviation in the document.\n\nLooking at the confusion matrices in the document, particularly in Section 3.3 (Anchor Distances for Error Diagnosis) and the appendices, I can see confusion matrices for the TREC dataset.\n\nIn Figure 6 on page 8, there are two confusion matrices shown:\n- (a) Confusion matrix of Confusion^pred_ij\n- (b) Confusion matrix of Confusion_ij\n\nLooking at the Abbreviation row in both matrices, I need to identify which category Abbreviation is most confused with (the lowest confusion score, or highest confusion).\n\nFrom Figure 6(b), looking at the Abbreviation row:\n- Abbreviation-Abbreviation: 1 (diagonal, set to 1)\n- Abbreviation-Entity: 0.84\n- Abbreviation-Description: 0.76\n- Abbreviation-Person: 1\n- Abbreviation-Location: 1\n- Abbreviation-Number: 0.87\n\nThe lowest value (indicating highest confusion) in the Abbreviation row is with Description (0.76), followed by Entity (0.84).\n\nThis is also confirmed in the text on page 8 which states: \"Figure 6 shows that the proposed approximation metric, Confusion^pred_ij, can identify the most confusing case (Description-Entity) and performs reasonably well for highly confusing categories (Entity-Abbreviation, Description-Abbreviation).\"\n\nAnswer: Description", "citations": [], "input_tokens": 41202, "output_tokens": 351, "cost_micros": 128871, "latency_ms": 26937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 6(b) which shows the confusion matrix of Confusion_ij on the TREC dataset, I need to find which category has the highest confusion score with Abbreviation.\n\nFrom the confusion matrix in Figure 6(b), the values for Abbreviation's confusion with other categories are:\n- Entity: 0.84\n- Description: 0.76\n- Person: 1\n- Location: 1\n- Number: 1\n\nThe lowest value (highest confusion) is with Entity at 0.84.\n\nAnswer: Entity", "citations": [], "input_tokens": 17732, "output_tokens": 125, "cost_micros": 55071, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "entity", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 6(b) which shows the actual confusion matrix (Confusionij) on the TREC dataset, I need to find which category Abbreviation is most confused with.\n\nFrom the confusion matrix in Figure 6(b), the Abbreviation row shows the following confusion scores with other categories:\n- Entity: 0.84\n- Description: 0.76\n- Person: 1\n- Location: 1\n- Number: 1\n\nLower confusion scores indicate higher confusion between categories (as stated in the paper). The lowest score is 0.76 with Description, followed by 0.84 with Entity.\n\nAnswer: Description", "citations": [], "input_tokens": 20090, "output_tokens": 147, "cost_micros": 62475, "latency_ms": 5234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the confusion matrices in Figure 14, I need to identify which category is most confused with \"Abbreviation.\"\n\nIn the confusion matrices, lower values indicate higher confusion (as stated in the document: \"The closer Confusion_{ij} approximates 1, the less pronounced the confusion\").\n\nLooking at the \"Abbreviation\" row across the different matrices (M=5, 10, 20, 50, 100, 200), the lowest values (highest confusion) are:\n- Description: 0.31, 0.45, 0.52\n- Entity: 0.44, 0.49, 0.51\n\nThe \"Description\" category consistently shows the lowest values when compared with Abbreviation, indicating the highest degree of confusion.\n\nAnswer: Description", "citations": [], "input_tokens": 13731, "output_tokens": 174, "cost_micros": 43803, "latency_ms": 6483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the confusion matrices provided in the document, I need to find which category is most confused with \"Abbreviation.\"\n\nIn Figure 6(a), which shows the predicted confusion matrix ($\\text{Confusion}_{ij}^{\\text{pred}}$), the values for Abbreviation with other categories are:\n- Entity: 0.49\n- Description: 0.45\n- Person: 0.90\n- Location: 1.00\n- Number: 0.83\n\nLower values indicate higher confusion (as stated in the document: \"0 indicating the highest degree of category confusion\").\n\nIn Figure 6(b), which shows the actual confusion matrix ($\\text{Confusion}_{ij}$), the values for Abbreviation are:\n- Entity: 0.84\n- Description: 0.76\n- Person: 1.00\n- Location: 1.00\n- Number: 1.00\n\nIn both matrices, the lowest value (highest confusion) for Abbreviation is with \"Description\" (0.45 predicted, 0.76 actual), followed closely by \"Entity\" (0.49 predicted, 0.84 actual).\n\nAnswer: Description", "citations": [], "input_tokens": 23789, "output_tokens": 270, "cost_micros": 75417, "latency_ms": 7077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the most confusing category of abbreviation.Let me search for information about \"confusing\" and \"abbreviation\" categories in the document.Let me read the specific chunk that mentions confusing categories and abbreviations:Perfect! I found the answer. Let me read a bit more context to confirm:Based on the document, the text clearly states that \"the most confusing case (Description-Entity)\" was identified. However, the question asks specifically about \"the most confusing category of Abbreviation.\"\n\nLooking at the confusion matrix data and the text, it mentions that the highly confusing categories involving Abbreviation are:\n- Entity-Abbreviation\n- Description-Abbreviation\n\nLet me check if there's more specific information about which is THE most confusing for Abbreviation:\n\nLooking at the confusion matrix values shown:\n- Abbreviation-Entity: 0.84\n- Abbreviation-Description: 0.76\n\nThe lower the value in a confusion matrix, the more confusing the pair. So Entity (0.84) is more confusing with Abbreviation than Description (0.76), but actually looking at this as a similarity matrix (as the caption says \"heatmaps display similarity\"), higher values mean more similar/confusing.\n\nBased on the text stating \"highly confusing categories (Entity-Abbreviation, Description-Abbreviation)\" with Entity-Abbreviation listed first, and the confusion matrix showing 0.84 for Abbreviation-Entity vs 0.76 for Abbreviation-Description:\n\nAnswer: Entity", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40344, "error": null, "extra": {"thread_id": 1986, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_052d25ab92e746d681f311675087569d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 227, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "entity", "normalised_gold": "description"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 on page 2, which shows the MIMIC-IT overview tree diagram, I can see three main branches: Perception, Reasoning, and Planning.\n\nAt the top of the Planning branch (shown in red/pink on the right side), the task labeled is \"Indoor Event Planning\".\n\nAnswer: Indoor Event Planning", "citations": [], "input_tokens": 55843, "output_tokens": 76, "cost_micros": 168669, "latency_ms": 49328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.4, "method": "str_norm", "normalised_pred": "indoor event planning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, which shows the MIMIC-IT overview with different branches representing fundamental capabilities, I need to identify the task at the top of the planning branch.\n\nIn the planning section (rightmost branch of the tree diagram), examining from top to bottom, the topmost task shown is \"Commonsense Reasoning\" with the example question \"What might this young boy learn from this experience? (LA)\"\n\nAnswer: Commonsense Reasoning", "citations": [], "input_tokens": 21188, "output_tokens": 104, "cost_micros": 65124, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "commonsense reasoning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, which shows the MIMIC-IT overview with three main branches (Perception, Reasoning, and Planning), I can see the Planning branch on the right side.\n\nAt the top of the Planning branch, the task shown is \"Indoor Event Planning (IEP)\" with the question \"Can you suggest any recreational pursuits that can be enjoyed within this room?\"\n\nAnswer: Indoor Event Planning (IEP)", "citations": [], "input_tokens": 23513, "output_tokens": 95, "cost_micros": 71964, "latency_ms": 5625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "str_norm", "normalised_pred": "indoor event planning iep", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about a planning branch or tree structure with tasks.\n\nIn Figure 1 (MIMIC-IT overview), I can see different capabilities organized into three main branches: Perception, Reasoning, and Planning.\n\nUnder the Planning branch, I can see two tasks listed:\n1. Egocentric Planning (E4D)\n2. Episode Memory (DC)\n\nLooking at the visual layout in Figure 1, \"Egocentric Planning\" appears to be positioned at the top of the Planning branch.\n\nAnswer: Egocentric Planning", "citations": [], "input_tokens": 20245, "output_tokens": 129, "cost_micros": 62670, "latency_ms": 5657, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "egocentric planning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the task at the top of the Planning branch.\n\nThe Planning section shows three items:\n1. At the top: \"Is it safe to walk on the floor while the woman is cleaning? E4D\" \u2192 EVR [Egocentric Video Reasoning]\n2. In the middle: \"Can you suggest any ways to brighten up the room I'm in? E4D\" \u2192 EP [Egocentric Planning]\n3. At the bottom: \"Can you suggest any recreational pursuits that can be enjoyed within this room? IEP\" \u2192 IEPP [Indoor Event Planning]\n\nThe topmost task in the Planning branch is Egocentric Video Reasoning (EVR).\n\nAnswer: Egocentric Video Reasoning", "citations": [], "input_tokens": 24261, "output_tokens": 176, "cost_micros": 75423, "latency_ms": 6139, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4, "method": "str_norm", "normalised_pred": "egocentric video reasoning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the planning branch of the tree and identify the task at the top.Let me search for content related to \"planning branch\" or \"tree\" to find the relevant figure or diagram:Let me search more specifically for \"tree\" and look at figures:Let me look at Figure 1 which mentions perception, reasoning, and planning:Good! I can see \"Planning\" section. Let me continue reading to see what tasks are under the Planning branch:Perfect! I can see the Planning section with its tasks. Looking at the structure, under \"Planning\" I can see:\n\n1. \"Commonsense Reasoning\" - \"What might this young boy learn from this experience? (LA)\"\n2. \"General Scene Difference\" \n3. \"Egocentric Planning\"\n4. \"Indoor Event Planning\"\n\nBut I need to see the visual tree structure more carefully. Let me look at the earlier part to understand the tree layout better:Now I can see the structure clearly. Looking at Figure 1, there's a tree structure with three main branches:\n1. Perception\n2. Reasoning\n3. Planning\n\nUnder the \"Planning\" branch, the first task/question that appears is: \"What might this young boy learn from this experience? (LA)\" which is labeled as \"Commonsense Reasoning\".\n\nSo the task at the top of the Planning branch is \"Commonsense Reasoning\".\n\nAnswer: Commonsense Reasoning", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 51937, "error": null, "extra": {"thread_id": 1987, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f3cc3b28e854493c927ab87e43d7d1d0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 285, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "commonsense reasoning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll search through the document for bar plots.\n\nLooking through the figures:\n\n**Figure 1** (page 2): This shows example images and questions from the MIMIC-IT dataset. No bar plots here.\n\n**Figure 2** (page 4): This shows data format comparisons with example images and text. No bar plots here.\n\n**Figure 3** (page 5): This shows the Syphus pipeline overview. No bar plots here.\n\n**Figure 4** (page 7): This figure shows \"The data statistics of multi-modal in-context instruction-response pairs\" with:\n- (a) Instructions - shown as a sunburst/radial chart (not a bar plot)\n- (b) Responses - shown as a sunburst/radial chart (not a bar plot)\n- (c) Statistics of instructions and responses - This contains **4 bar plots**:\n  1. Instruction Length distribution\n  2. Response Length distribution\n  3. # Images distribution\n  4. # Related Instructions distribution\n\n**Figure 5** (page 8): This shows example responses from Otter with images and text. No bar plots here.\n\n**Figure 6** (page 10): This shows three subplots:\n- (a) Video understanding - **2 bar plots** (MSVD 0-shot and MSRVTT 0-shot)\n- (b) Vision-language model alignment - **1 bar plot** (Elo Rating)\n- (c) COCO caption - This is a line graph, not a bar plot\n\nTotal bar plots: 4 (from Figure 4c) + 2 (from Figure 6a) + 1 (from Figure 6b) = **7 bar plots**\n\nAnswer: 7", "citations": [], "input_tokens": 55838, "output_tokens": 405, "cost_micros": 173589, "latency_ms": 40905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine the document for bar plots, including any subplots.\n\nLooking through the figures:\n\n**Figure 1**: Shows example images and text - no bar plots.\n\n**Figure 2**: Shows example images and text comparisons - no bar plots.\n\n**Figure 3**: Shows a pipeline diagram - no bar plots.\n\n**Figure 4**: This figure contains multiple subplots showing data statistics:\n- (a) Shows two circular/sunburst plots for instructions - not bar plots\n- (b) Shows two circular/sunburst plots for responses - not bar plots\n- (c) Shows four plots with statistics:\n  1. \"Instruction Length\" - appears to be a histogram/bar plot\n  2. \"Response Length\" - appears to be a histogram/bar plot\n  3. \"# Images\" - appears to be a histogram/bar plot\n  4. \"# Related Instructions\" - appears to be a histogram/bar plot\n\nSo Figure 4(c) contains 4 bar plots.\n\n**Figure 5**: Shows example images and responses - no bar plots.\n\n**Figure 6**: This figure contains three subfigures:\n- (a) \"Video understanding\" - shows a grouped bar chart with multiple bars\n- (b) \"Vision-language model alignment\" - shows a bar chart\n- (c) \"COCO caption\" - shows a line plot with points, not a bar plot\n\nSo Figure 6 has 2 bar plots (a and b).\n\nTotal count: 4 (from Figure 4c) + 2 (from Figure 6) = 6 bar plots.\n\nAnswer: 6", "citations": [], "input_tokens": 21183, "output_tokens": 367, "cost_micros": 69054, "latency_ms": 11281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine all the figures in the document to count the bar plots.\n\nLooking through the figures:\n\n**Figure 1**: Shows example images and text describing different capabilities - no bar plots.\n\n**Figure 2**: Shows data format comparison with images and text - no bar plots.\n\n**Figure 3**: Shows the Sythus pipeline overview - no bar plots.\n\n**Figure 4**: This is titled \"The data statistics of multi-modal in-context instruction-response pairs\" and contains:\n- (a) A circular/sunburst plot showing root verb-noun pairs of instructions\n- (b) A circular/sunburst plot showing root verb-noun pairs of responses  \n- (c) Four separate plots labeled \"Statistics of instructions and responses\":\n  - Top left: A histogram/bar plot showing \"# Instructions\" vs \"Instruction Length\"\n  - Top right: A histogram/bar plot showing \"# Responses\" vs \"Response Length\"\n  - Bottom left: A histogram/bar plot showing \"# Instructions\" vs \"# Images\"\n  - Bottom right: A histogram/bar plot showing \"# Instructions\" vs \"# Related Instructions\"\n\nSo in Figure 4(c), there are **4 bar plots**.\n\n**Figure 5**: Shows example responses with images and text - no bar plots.\n\n**Figure 6**: Contains three subplots:\n- (a) \"Video understanding\" - shows bar plots comparing VideoChatGPT and Otter on MSVD and MSRVTT for QA and Captioning. This contains **4 grouped bar comparisons** (which I count as bar plots).\n- (b) \"Vision-language model alignment\" - shows an Elo rating comparison with bars for different models - this is **1 bar plot**.\n- (c) \"COCO caption\" - shows a line plot, not a bar plot.\n\nSo Figure 6 has: 4 (from 6a) + 1 (from 6b) = **5 bar plots**.\n\n**Figure 7**: Shows text examples in multiple languages - no bar plots.\n\nTotal bar plots: 4 (from Figure 4c) + 5 (from Figure 6) = **9 bar plots**.\n\nAnswer: 9", "citations": [], "input_tokens": 23508, "output_tokens": 506, "cost_micros": 78114, "latency_ms": 14187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine all the figures in the document to count bar plots.\n\nLooking through the figures:\n\n**Figure 1**: This is a flowchart/diagram showing the MIMIC-IT overview with three main branches (Perception, Reasoning, Planning). Not a bar plot.\n\n**Figure 2**: This shows data format comparisons with images and text examples. Not a bar plot.\n\n**Figure 3**: This is a flowchart showing the Sythus overview. Not a bar plot.\n\n**Figure 4**: This figure is titled \"The data statistics of multi-modal in-context instruction-response pairs\" and contains:\n- (a) A circular/sunburst chart showing instructions (verb-noun pairs)\n- (b) A circular/sunburst chart showing responses (verb-noun pairs)\n- (c) \"Statistics of instructions and responses\" which contains 4 separate bar plots:\n  1. Instruction Length vs # Instructions (bar plot)\n  2. Response Length vs # Responses (bar plot)\n  3. # Images vs # Instructions (bar plot)\n  4. # Related Instructions vs # Instructions (bar plot)\n\nSo Figure 4(c) contains **4 bar plots**.\n\n**Figure 5**: This shows example responses from Otter with images and text. Not a bar plot.\n\n**Figure 6**: This contains three parts:\n- (a) A table showing video understanding results. Not a bar plot.\n- (b) A table showing vision-language model alignment. Not a bar plot.\n- (c) A table showing COCO caption results. Not a bar plot.\n\nTotal count: 4 bar plots (all from Figure 4c)\n\nAnswer: 4", "citations": [], "input_tokens": 24256, "output_tokens": 385, "cost_micros": 78543, "latency_ms": 10187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to count all bar plots (including subplots).Let me search for figures and plots in the document:Now let me read the actual content around the figures to see the descriptions of plots:Let me read more sections to find all the figures:Now let me look at Figure 4 more carefully to see if it contains bar plots:Good! I can see Figure 4 (c) has bar plots. Let me continue looking at Figure 6:Now I need to look at the actual visual description to understand what kind of plots Figure 6 has. Looking at the data in lines 1693-1823, I can see Figure 6 has three subplots (a), (b), and (c). Let me check if these are bar plots by examining the structure:\n\n- Figure 6(a): Shows \"MSVD 0-shot\" and \"MSRVTT 0-shot\" with \"QA\" and \"Captioning\" categories, comparing VideoChatGPT and Otter - this appears to be bar plots\n- Figure 6(b): Shows \"Elo Rating\" comparison among MM-GPT, Inst. BLIP, LLaVA, MiniGPT, and Otter - this appears to be a bar plot\n- Figure 6(c): Shows \"COCO caption\" with \"0-shot 4-shot 8-shot 16-shot\" comparing Otter and OpenFlamingo - this appears to be a line or bar plot\n\nNow let me search for more figures to make sure I don't miss any:Figure 7 is mentioned but let me check if it's a plot or just examples. From line 2292, Figure 7 seems to be just text examples for multi-lingual translation, not a plot.\n\nLet me search for any appendix figures or additional figures:Great! I found references to bar charts. Let me read those sections more carefully:These descriptions refer to the charts/bar plots that appear in Figure 4(c). Let me now carefully read to understand which images correspond to which figures. The Im1.png through Im4.png appear to be the four subplots in Figure 4(c).\n\nLet me search for Figure 6 visual descriptions:Let me look for the visual description of Figure 6:Let me search for visual descriptions that mention bar charts or plots more systematically by searching around the figure areas:Now let me check if there are any additional figures by searching for all figure captions:Good! So we have:\n- Figure 1: Overview (not a bar plot)\n- Figure 2: Data format comparison (not a bar plot)\n- Figure 3: Sythus overview (not a bar plot)\n- Figure 4: Data statistics with (a) and (b) showing verb-noun pairs, and (c) showing statistics with multiple subplots\n- Figure 5: Response examples (not a bar plot)\n- Figure 6: With (a), (b), and (c) subplots\n- Figure 7: Multi-lingual translation examples (not a bar plot)\n\nNow I need to count the bar plots more carefully. Let me re-examine Figure 4(c) and Figure 6:\n\nFrom my reading:\n- Figure 4(a): Circular/sunburst plot (not bar plot) - verb-noun pairs with inner/outer circles\n- Figure 4(b): Circular/sunburst plot (not bar plot) - verb-noun pairs with inner/outer circles  \n- Figure 4(c): Contains 4 bar plot histograms based on the visual descriptions I found:\n  1. Instructions vs Instruction Length (Im1.png) - bar plot\n  2. Responses vs Response Length (Im2.png) - bar plot\n  3. Instructions vs # Images (Im3.png) - bar plot\n  4. Instructions vs # Related Instructions (Im4.png) - bar plot\n\nFor Figure 6, I need to examine what type of plots they are. Let me look at the text around Figure 6 more carefully. From the data I saw (lines 1693-1823), Figure 6 appears to have:\n- Figure 6(a): Shows QA and Captioning comparisons for MSVD and MSRVTT - likely bar plots (2 groups x 2 bars = 4 bars total, but this counts as 2 bar plots for the two datasets)\n- Figure 6(b): Shows Elo Rating comparison - likely a bar plot (1 bar plot)\n- Figure 6(c): Shows COCO caption with 0-shot, 4-shot, 8-shot, 16-shot - this could be a line plot or bar plot\n\nLet me search for more specific information about these plots. Let me check if there are visual descriptions for Figure 6 plots:The file ends at line 5199. Let me search systematically for any remaining bar plot descriptions:Based on my thorough analysis of the document, let me now count all the bar plots:\n\n**Figure 4(c)** contains 4 bar plot histograms:\n1. Instructions vs Instruction Length\n2. Responses vs Response Length  \n3. Instructions vs # Images\n4. Instructions vs # Related Instructions\n\n**Figure 6** - I need to determine how many bar plots are in each subplot. Looking at the data structure:\n- Figure 6(a): Video understanding evaluation with MSVD 0-shot and MSRVTT 0-shot, each showing QA and Captioning - this appears to be 2 grouped bar charts (one for MSVD, one for MSRVTT)\n- Figure 6(b): Elo Rating comparison showing multiple models - this is 1 bar chart\n- Figure 6(c): COCO caption showing 0-shot through 16-shot results comparing Otter and OpenFlamingo - this could be either a grouped bar chart or line chart\n\nLooking at the structure in the text (lines 1816-1817: \"0-shot 4-shot 8-shot 16-shot\"), this suggests it's showing a progression, which is more typical of a line chart. However, I need to be careful.\n\nLet me reconsider the counting. The question asks for \"bar plots (including subplots)\". This means I should count each separate bar plot panel, not individual bars.\n\nFrom the evidence:\n- **Figure 4(c)**: 4 bar plot histograms (clearly described as histograms/bar charts in the visual descriptions)\n- **Figure 6(a)**: 2 bar plots (one for MSVD 0-shot, one for MSRVTT 0-shot, each comparing models on QA and Captioning)\n- **Figure 6(b)**: 1 bar plot (Elo Rating comparison)\n- **Figure 6(c)**: Likely 1 chart but unclear if it's a bar plot or line plot based on the \"0-shot 4-shot 8-shot 16-shot\" progression\n\nTotal count depends on whether Figure 6(c) is a bar plot. Given the context of comparing few-shot performance across different numbers of shots, and looking at typical machine learning papers, this is often shown as either a line chart or grouped bar chart.\n\nHowever, examining the OCR text more carefully from chunk 152307 (lines 1816-1822), the layout shows:\n- (a) Video understanding\n- (b) Vision-language model alignment  \n- (c) COCO caption\n\nAnd earlier in the caption it mentions \"CIDEr\" metric with values like \"60.8\", \"55\", etc. with labels for different shot counts.\n\nGiven the uncertainty, let me make my best assessment: Figure 6(c) showing few-shot learning progression is more likely to be a line plot given it shows a progression/trend across shot counts.\n\n**Final count:**\n- Figure 4(c): 4 bar plots\n- Figure 6(a): 2 bar plots\n- Figure 6(b): 1 bar plot\n- Figure 6(c): Not a bar plot (likely line plot)\n\n**Total: 7 bar plots**\n\nWait, let me reconsider Figure 6(a). Looking at the text structure again (lines 1693-1806), it shows:\n- MSVD 0-shot with QA and Captioning\n- MSRVTT 0-shot with QA and Captioning\n- These are comparing VideoChatGPT vs Otter\n\nThis could be:\n- 2 separate bar plots (one for MSVD, one for MSRVTT), OR\n- 1 bar plot with 2 groups\n\nTypically in papers, when multiple datasets are shown side-by-side in subfigure (a), they would be considered separate bar plots. So I'll count them as 2.\n\n**Final answer: 7**\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 164109, "error": null, "extra": {"thread_id": 1988, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_98fcc3ce4d2f4d98b3c04b41720c35ca", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1170, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 titled \"Sythus overview\" on page 5, I can see the cold start process.\n\nThe figure shows a \"Cold Start\" stage that comes before the main Sythus pipeline. The cold start stage shows:\n- In-context examples (document icon at top)\n- An arrow pointing down to \"Cold Start\"\n- ChatGPT being queried during this stage\n- This then leads to \"Step 1: System Message + visual annotation\"\n\nThe cold start stage is described in the text as being used to \"identify the optimal system message and in-context example for querying instruction-response pairs in a given dataset.\"\n\nAnswer: In-context examples", "citations": [], "input_tokens": 55843, "output_tokens": 153, "cost_micros": 169824, "latency_ms": 41358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "in context examples", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows the Sythus overview, the cold start stage appears before the main pipeline steps (Step 1-4).\n\nThe cold start stage shows interactions with ChatGPT to identify the optimal system message and in-context examples. The figure shows arrows connecting to \"System Message\" and \"In-context examples\" components.\n\nAnswer: Identify optimal system message and in-context example", "citations": [], "input_tokens": 21188, "output_tokens": 91, "cost_micros": 64929, "latency_ms": 5532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "identify optimal system message and in context example", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows the Sythus overview, the cold start stage shows a flow diagram where the first step involves \"System Message\" with an arrow pointing to ChatGPT.\n\nAnswer: System Message", "citations": [], "input_tokens": 23513, "output_tokens": 50, "cost_micros": 71289, "latency_ms": 4468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 (Sythus overview), I can see the pipeline stages.\n\nThe cold start stage is shown at the top, and the first step within it involves prompting ChatGPT to generate system messages and in-context examples.\n\nAnswer: System Message", "citations": [], "input_tokens": 20245, "output_tokens": 62, "cost_micros": 61665, "latency_ms": 4858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows the Sythus overview, I can see the cold start process and the subsequent steps.\n\nThe cold start section shows \"In-context examples\" which feeds into Step 1.\n\nStep 1 is labeled as \"System Message + visual annotation\" which leads to the \"Prompt\".\n\nAnswer: System Message + visual annotation", "citations": [], "input_tokens": 24261, "output_tokens": 80, "cost_micros": 73983, "latency_ms": 5562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message visual annotation", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find the figure about cold start and identify the first step.Let me search for \"cold start\" in the document to locate the relevant figure.Good! I found a reference to \"Cold Start\" with steps. Let me read the section around line 759-798 (chunk 152250) where the figure appears.Perfect! I can see the figure clearly shows the steps of the cold start process. According to the figure, the steps are:\n\n- Step 1: System Message\n- Step 2: Generate instruction-response pairs\n- Step 3: Filtering\n- Step 4: Translation\n\nThe first step of cold start is \"System Message\".\n\nAnswer: System Message", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 29859, "error": null, "extra": {"thread_id": 1989, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_39c9f325d0c8483fb88a1b48780b1918", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 138, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the figures in the document, particularly Figure 1 and Figure 5 which show examples from the MIMIC-IT dataset and Otter's capabilities:\n\nIn Figure 1, there are examples showing egocentric video reasoning with what appears to be soccer/football scenes.\n\nIn Figure 5, the egocentric visual assistant examples at the bottom show scenes from what appears to be soccer/football, including references to passing the ball, wearing jerseys, and the goalkeeper saving the ball.\n\nAnswer: Soccer/Football", "citations": [], "input_tokens": 55848, "output_tokens": 115, "cost_micros": 169269, "latency_ms": 35250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find visual examples showing sports.\n\nIn Figure 5, which shows \"Otter's response examples in different scenarios,\" there is an \"Egocentric Visual Assistant\" example with text that says \"Hey Otter, what should I do?\" and the response mentions \"Pass the ball to the middle and run to the gate, looking for a shooting opportunity.\"\n\nThis clearly refers to soccer/football.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 21193, "output_tokens": 102, "cost_micros": 65109, "latency_ms": 5812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the visual examples in the document, particularly in Figure 5 under \"Egocentric Visual Assistant,\" I can see an image showing a soccer/football scene from a first-person perspective.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 23518, "output_tokens": 52, "cost_micros": 71334, "latency_ms": 3921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the visual examples in the document, I need to identify what sport is shown in the MIMIC-IT dataset examples and Otter's response examples.\n\nIn Figure 5, which shows \"Otter's response examples in different scenarios,\" there is an example labeled \"Egocentric Visual Assistant\" at the bottom. The text mentions \"guiding users to land a small aircraft.\"\n\nAdditionally, in the Ego4D (E4D) section's in-context examples (Table 5), there are questions about:\n- \"What color of jersey does my teammates wear?\" with the response \"Black, as you pass the ball to a player wearing black jersey\"\n- \"Does our team goal?\" with the response about a goalkeeper saving the ball\n\nThis indicates soccer/football is being discussed.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 20250, "output_tokens": 180, "cost_micros": 63450, "latency_ms": 7734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find visual examples that show sports.\n\nIn Figure 5, under the \"Egocentric Visual Assistant\" section, there are two screenshots. The right screenshot shows \"A person playing soccer with a chat overlay\" and includes the text \"User: Hey Otter, what should I do? OTTER-E: Pass the ball *to the middle* and run to the gate, looking for a shooting opportunity.\"\n\nAnswer: Soccer", "citations": [], "input_tokens": 24266, "output_tokens": 102, "cost_micros": 74328, "latency_ms": 6890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "soccer", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document you mentioned to find information about the visual examples in MIMIC-IT and Otter.Let me search for visual examples and figures in the document to find information about sports shown in MIMIC-IT and Otter examples.Let me read more of the document to find figures and visual examples:Let me continue reading to find specific visual examples that show sports:Found \"tennis\"! Let me search for the context around this mention:Perfect! Let me read that section to get more context about the tennis visual example:Now let me search for where this image appears in relation to MIMIC-IT or Otter examples. Let me look earlier in the document to understand the context of these figures:Let me search backwards to find where these images are part of a figure showing examples:Let me search for \"Otter\" in the document to find where these visual examples are shown:Now let me look at Figure 5, which shows \"Otter's response examples\":Great! I found it. In the Egocentric Visual Assistant section of Figure 5, there's a conversation where Otter-E says \"Pass the ball to the middle and run to the gate, looking for a shooting opportunity.\" This clearly refers to soccer/football. Let me verify by looking at the visual description of the image that accompanies this text:Perfect! I found the answer. The visual description clearly states this is an \"Outdoor soccer/football field with green grass\" with players wearing blue athletic shirts. The Otter-E response also mentions \"Pass the ball to the middle and run to the gate, looking for a shooting opportunity,\" which confirms this is soccer/football.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 123078, "error": null, "extra": {"thread_id": 1990, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e493d9b5d4804d67aac4ae04786b3ef7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 450, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at Figure 7 on page 16, which shows \"In-context examples for multi-lingual translation query,\" I can count the following languages:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0629\u06cc\u0628\u0631\u0639\u0644\u0627)\n\nAnswer: 8", "citations": [], "input_tokens": 55842, "output_tokens": 108, "cost_micros": 169146, "latency_ms": 37125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of languages shown in Figure 7, which displays in-context examples for multi-lingual translation query.\n\nLooking at Figure 7, I can see the following languages listed:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0627\u0644\u0639\u0631\u0628\u064a\u0629)\n\nAnswer: 8", "citations": [], "input_tokens": 21187, "output_tokens": 113, "cost_micros": 65256, "latency_ms": 4327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of languages shown in Figure 7, which displays in-context examples for multi-lingual translation query.\n\nLooking at Figure 7, I can see the following languages listed:\n\n1. English\n2. Chinese \u4e2d\u6587\n3. Spanish espa\u00f1ol\n4. Japanese \u65e5\u672c\u8a9e\n5. French Fran\u00e7ais\n6. Korean \ud55c\uad6d\uc5b4\n7. German Deutsch\n8. Arabic \u0627\u0644\u0639\u0631\u0628\u064a\u0629\n\nAnswer: 8", "citations": [], "input_tokens": 23512, "output_tokens": 102, "cost_micros": 72066, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, specifically at Figure 7 which shows \"In-context examples for multi-lingual translation query\", I can count the following languages:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0629\u06cc\u0628\u0631\u0639\u0644\u0627)\n\nAnswer: 8", "citations": [], "input_tokens": 20244, "output_tokens": 108, "cost_micros": 62352, "latency_ms": 4578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of languages shown in Figure 7, which displays in-context examples for multi-lingual translation query.\n\nLooking at Figure 7 on page 16, I can see the following languages listed:\n\n1. English\n2. Chinese \u4e2d\u6587\n3. Spanish espa\u00f1ol\n4. Japanese \u65e5\u672c\u8a9e\n5. French Fran\u00e7ais\n6. Korean \ud55c\uad6d\uc5b4\n7. German Deutsch\n8. Arabic \u0627\u0644\u0639\u0631\u0628\u064a\u0629\n\nAnswer: 8", "citations": [], "input_tokens": 24260, "output_tokens": 107, "cost_micros": 74385, "latency_ms": 4422, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about multi-lingual translation query and count the languages in the in-context examples.Now I can see references to the multi-lingual translation query. Let me read the specific section that mentions Figure 7 and the languages.Perfect! I found the information. Let me now look at Figure 7 to see the actual in-context examples and count the languages.Perfect! I can now see Figure 7 which shows the in-context examples for multi-lingual translation query. Let me count the languages shown:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0627\u0644\u0639\u0631\u0628\u064a\u0629)\n\nThe document clearly shows 8 languages in the in-context examples for multi-lingual translation query in Figure 7.\n\nAnswer: 8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44640, "error": null, "extra": {"thread_id": 1991, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_25a64843d98447e9baa03cf538d93723", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 212, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll go through the document to count all the line plots shown.\n\nLooking through the figures in the document:\n\n**Figure 1** (page 2): This shows a complex diagram with multiple examples of different tasks and image types, but does not appear to contain line plots.\n\n**Figure 2** (page 4): This shows data format comparisons with example images and text, but no line plots.\n\n**Figure 3** (page 5): This shows the Sythus pipeline overview with icons and flowchart elements, but no line plots.\n\n**Figure 4** (page 7): This contains:\n- (a) A sunburst chart showing instructions\n- (b) A sunburst chart showing responses  \n- (c) Multiple bar charts/histograms and what appears to be **line plots** showing statistics\n\nLooking at Figure 4(c) more carefully, there appear to be line plots showing \"# Instructions\" and \"# Responses\" over \"Instruction Length\" and \"Response Length\", plus what looks like a line plot for \"# Related Instructions\".\n\n**Figure 5** (page 8): Shows example images with text responses, not line plots.\n\n**Figure 6** (page 10): This contains three subfigures:\n- (a) Bar charts for video understanding\n- (b) Bar chart for vision-language model alignment\n- (c) Contains **line plots** showing CIDEr scores across different shot settings (0-shot, 4-shot, 8-shot, 16-shot) comparing Otter and OpenFlamingo - this appears to show 2 line plots\n\n**Figure 7** (page 16): Shows text examples in multiple languages, no line plots.\n\nCounting the line plots:\n- Figure 4(c): appears to show 2-3 line plots\n- Figure 6(c): shows 2 line plots (one for Otter, one for OpenFlamingo)\n\nTotal: approximately 4-5 line plots visible in Figure 4(c) and 2 in Figure 6(c) = approximately 6-7 total.\n\nLooking more carefully at Figure 4(c), I can see there are line plots for instruction length distribution, response length distribution, and related instructions. Figure 6(c) clearly shows 2 ", "citations": [], "input_tokens": 55833, "output_tokens": 512, "cost_micros": 175179, "latency_ms": 42171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots.\n\nLooking through the figures in the document:\n\n**Figure 1**: This shows example images with text descriptions - not line plots.\n\n**Figure 2**: Shows example data formats with images and text - not line plots.\n\n**Figure 3**: Shows a pipeline/flowchart diagram - not line plots.\n\n**Figure 4**: This figure contains multiple subplots:\n- (a) A circular/sunburst chart showing root verb-noun pairs\n- (b) Another circular/sunburst chart showing root verb-noun pairs\n- (c) Four separate line plots showing:\n  1. Instruction Length distribution\n  2. Response Length distribution\n  3. # Images distribution\n  4. # Related Instructions distribution\n\nSo Figure 4(c) contains **4 line plots**.\n\n**Figure 5**: Shows example images with text responses - not line plots.\n\n**Figure 6**: This figure contains three subplots:\n- (a) Two grouped bar charts for \"Video understanding\" (MSVD 0-shot and MSRVTT 0-shot)\n- (b) A bar chart showing \"Human evaluation comparison\" with Elo Ratings\n- (c) A line plot showing \"COCO caption\" performance across different shot settings (0-shot, 4-shot, 8-shot, 16-shot)\n\nSo Figure 6(c) contains **1 line plot**.\n\n**Total line plots**: 4 (from Figure 4c) + 1 (from Figure 6c) = **5 line plots**\n\nAnswer: 5", "citations": [], "input_tokens": 21178, "output_tokens": 363, "cost_micros": 68979, "latency_ms": 10016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (graphs showing data points connected by lines).\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows sample images with text descriptions - not a line plot.\n\n**Figure 2**: Shows a comparison of data formats with images and text - not a line plot.\n\n**Figure 3**: Shows a flowchart/pipeline diagram - not a line plot.\n\n**Figure 4**: Contains multiple subfigures:\n- (a) A circular/sunburst diagram showing verb-noun pairs\n- (b) Another circular/sunburst diagram showing verb-noun pairs\n- (c) Contains **4 line plots**: \n  1. \"# Instructions\" vs \"Instruction Length\"\n  2. \"# Responses\" vs \"Response Length\"\n  3. \"# Instructions\" vs \"# Images\"\n  4. \"# Instructions\" vs \"# Related Instructions\"\n\n**Figure 5**: Shows example images with captions and chat interfaces - not a line plot.\n\n**Figure 6**: Contains three subfigures:\n- (a) Bar chart for video understanding - not a line plot\n- (b) Bar chart for vision-language model alignment - not a line plot\n- (c) Shows **1 line plot**: \"CIDEr\" performance across different shot settings (0-shot, 4-shot, 8-shot, 16-shot)\n\n**Figure 7**: Shows text examples in multiple languages - not a line plot.\n\nTotal line plots found:\n- Figure 4(c): 4 line plots\n- Figure 6(c): 1 line plot\n\nAnswer: 5", "citations": [], "input_tokens": 23503, "output_tokens": 370, "cost_micros": 76059, "latency_ms": 9937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to search through the document for line plots/graphs.\n\nLooking through the document, I can identify the following figures with line plots:\n\n1. Figure 4 - This shows multiple visualizations including what appears to be bar charts and word clouds showing verb-noun structures and dataset statistics. The caption mentions \"Fig. 4 (a)\" for instructions and \"Fig. 4 (b)\" for responses, and \"Fig. 4 (c)\" for diversity metrics. These appear to be bar charts and word frequency visualizations, not line plots.\n\n2. Figure 6 - The text references \"Fig. 6(b)\" and \"Fig. 6(c)\" where:\n   - Fig. 6(b) is described as showing Elo ratings (likely a bar chart or ranking visualization)\n   - Fig. 6(c) is described as showing \"COCO caption (CIDEr) few-shot evaluation\" - this is explicitly described as comparing performance across different shot settings (0-shot, 4-shot, 8-shot, 16-shot, 32-shot), which would typically be shown as a line plot\n\nBased on the text descriptions, Figure 6(c) showing the few-shot learning performance across different numbers of shots (0, 4, 8, 16, 32) would be presented as a line plot to show the progression of performance.\n\nHowever, I cannot actually see the figures in this document to count them definitively. Based on the textual descriptions alone, there appears to be at least 1 line plot (Figure 6c showing few-shot performance).\n\nAnswer: 1", "citations": [], "input_tokens": 20235, "output_tokens": 361, "cost_micros": 66120, "latency_ms": 11078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (graphs showing lines connecting data points over continuous axes).\n\nLooking through the figures in the document:\n\n**Figure 1**: This is a flowchart/diagram showing the MIMIC-IT overview with different capabilities (Perception, Reasoning, Planning) - not a line plot.\n\n**Figure 2**: Shows data format comparisons with images and text examples - not a line plot.\n\n**Figure 3**: Shows a flowchart of the Syphus pipeline - not a line plot.\n\n**Figure 4**: This figure shows multiple visualizations of data statistics:\n- (a) Shows a circular/sunburst chart for Instructions (verb-noun pairs)\n- (b) Shows a circular/sunburst chart for Responses (verb-noun pairs)\n- (c) Shows \"Statistics of instructions and responses\" with what appears to be 4 small plots showing:\n  - Instruction Length vs # Instructions\n  - Response Length vs # Responses\n  - # Images vs # Instructions\n  - # Related Instructions vs # Instructions\n\nThese plots in Figure 4(c) appear to be line plots showing distributions/relationships.\n\n**Figure 5**: Shows example responses from Otter with images and text - not a line plot.\n\n**Figure 6**: Shows three tables labeled (a), (b), and (c) with numerical results - these are tables, not line plots.\n\n**Figure 7**: Shows text examples for multi-lingual translation - not a line plot.\n\nCounting the line plots in Figure 4(c): There are 4 small line plots showing different statistics.\n\nAnswer: 4", "citations": [], "input_tokens": 24251, "output_tokens": 364, "cost_micros": 78213, "latency_ms": 10437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to count the line plots.Let me search for figures and plots in the document by reading through it systematically:Now let me read the actual content of these figures to understand their structure and count line plots:Let me continue to check all figures systematically:Let me search more specifically for line plots or line charts:Let me look at Figure 6 more carefully as it seems to contain multiple subfigures with plots:Based on the data I see in Figure 6(c), it appears to show different shot settings (0-shot, 4-shot, 8-shot, 16-shot) which suggests this is a line plot. Let me search for more visual descriptions to understand all the plots:Let me read more around Figure 4 to understand what types of plots it contains:Now let me look for visual descriptions and embedded images to understand the structure of figures better. Let me check if there are visual descriptions that mention line plots or graphs:Let me search for embedded images and their visual descriptions:Let me look at the visual descriptions of the image that corresponds to Figure 6:This visual description seems to be of a different image not related to Figure 6. Let me look for a chart or graph description by searching through the document more systematically. Let me read sections with image data to find charts:The visual descriptions I'm seeing are for photographs, not charts. Let me search for keywords related to charts or visualizations more carefully:Good! Let me read this section to find charts and visualizations:Good! I found the section with chart visual descriptions. These appear to be histograms/bar charts for Figure 4. Now let me search for descriptions that mention line graphs or line plots:Let me specifically look for \"line\" combined with visualization terms:Now let me search specifically for Figure 6's visualization descriptions by looking near the Figure 6 location more carefully. Let me look for the embedded images around Figure 6:Let me check for chart/plot related embedded images. Looking back at Figure 6, I saw it has three subfigures (a), (b), and (c). Let me search for embedded images that might show line plots by looking at descriptions with \"axis\":I found histograms/bar charts. Now let me look for any line-based visualizations. Let me search for \"CIDEr\" which was mentioned in Figure 6(c) about COCO caption evaluation:I can see Figure 6(c) shows data with labels like \"0-shot 4-shot 8-shot 16-shot\" and \"CIDEr\", which strongly suggests it's a line plot showing performance across different shot settings. Let me look for a visualization description that mentions this, or look for visual descriptions that discuss multiple data points connected:The text \"0-shot 4-shot 8-shot 16-shot\" followed by \"(c) COCO caption\" suggests Figure 6(c) is indeed a line plot or chart. However, the embedded image Im14.jpg description seems unrelated (clinical setting). Let me look at the data more carefully and count what we know so far. Let me check if there are visual descriptions specifically for charts with \"shot\":", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 191452, "error": null, "extra": {"thread_id": 1992, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_48224411a27b457b894b9bc78d679631", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 942, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "1"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count the tables on pages 5-10 of the document.\n\n**Page 5:**\n- Table 1: LLAMA 2 family of models\n\n**Page 6:**\n- Table 2: CO2 emissions during pretraining\n\n**Page 7:**\n- Table 3: Overall performance on grouped academic benchmarks\n\n**Page 8:**\n- Table 4: Comparison to closed-source models on academic benchmarks\n\n**Page 9:**\n- Table 5: SFT annotation examples\n\n**Page 10:**\n- Table 6: Statistics of human preference data for reward modeling\n\nAnswer: 6", "citations": [], "input_tokens": 80762, "output_tokens": 144, "cost_micros": 244446, "latency_ms": 6391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count all the tables on pages 5-10.\n\n**Page 5:**\n- Table 1: LLAMA 2 family of models\n\n**Page 6:**\n- Figure 5 (not a table)\n- Table 2: CO2 emissions during pretraining\n\n**Page 7:**\n- Table 3: Overall performance on grouped academic benchmarks\n- Table 4: Comparison to closed-source models on academic benchmarks\n\n**Page 8:**\n- Table 5: SFT annotation - example of helpfulness and safety annotation\n\n**Page 9:**\n- No tables (contains Figures 11 and discussion)\n\n**Page 10:**\n- Table 6: Statistics of human preference data for reward modeling\n\nTotal count: 5 tables\n\nAnswer: 5", "citations": [], "input_tokens": 109911, "output_tokens": 181, "cost_micros": 332448, "latency_ms": 8235, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count the tables that appear in pages 5-10 of the document.\n\nPage 5: \n- Table 1: Llama 2 family of models (comparing parameters, context length, etc.)\n\nPage 6:\n- No tables\n\nPage 7:\n- Table 3: Overall performance on grouped academic benchmarks\n\nPage 8:\n- Table 4: Comparison to closed-source models on academic benchmarks\n- Table 5: SFT annotation example (shown at top of page)\n\nPage 9:\n- No tables (this page discusses supervised fine-tuning details)\n\nPage 10:\n- Table 6: Statistics of human preference data for reward modeling\n\nCounting all tables in pages 5-10:\n- Page 5: 1 table\n- Page 6: 0 tables\n- Page 7: 1 table\n- Page 8: 2 tables\n- Page 9: 0 tables\n- Page 10: 1 table\n\nTotal: 1 + 0 + 1 + 2 + 0 + 1 = 5 tables\n\nAnswer: 5", "citations": [], "input_tokens": 80961, "output_tokens": 265, "cost_micros": 246858, "latency_ms": 7983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count the tables on pages 5-10 of the document.\n\n**Page 5:**\n- Table 1: LLAMA 2 family of models\nTotal: 1 table\n\n**Page 6:**\n- Table 2: CO\u2082 emissions during pretraining\n- Figure 5: Training Loss for LLAMA 2 models (this is a figure, not a table)\nTotal: 1 table\n\n**Page 7:**\n- Table 3: Overall performance on grouped academic benchmarks\nTotal: 1 table\n\n**Page 8:**\n- Table 4: Comparison to closed-source models on academic benchmarks\nTotal: 1 table\n\n**Page 9:**\n- Table 5: SFT annotation examples\nTotal: 1 table\n\n**Page 10:**\n- Table 6: Statistics of human preference data for reward modeling\nTotal: 1 table\n\nTotal tables on pages 5-10: 1 + 1 + 1 + 1 + 1 + 1 = 6\n\nAnswer: 6", "citations": [], "input_tokens": 105634, "output_tokens": 249, "cost_micros": 320637, "latency_ms": 8264, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document that was flagged as most relevant (2307.09288v2.pdf.xml) and check pages 5-10 for tables.I need to search for page markers to identify pages 5-10. Let me search for page indicators in the document.Let me read more of the document to understand its structure and find page markers.Good! I can see page numbers are indicated with `<!-- PageNumber=\"X\" -->`. Let me search for these page markers to find pages 5-10.Perfect! Now I know that:\n- Page 5 starts at line 1660\n- Page 6 starts at line 1858\n- Page 7 starts at line 1955\n- Page 8 starts at line 2202\n- Page 9 starts at line 2265\n- Page 10 starts at line 2332\n- Page 11 starts at line 2461\n\nSo I need to read from line 1660 to line 2460 to cover pages 5-10. Let me search for table markers in this range.Great! I can see the content from pages 5-10. Let me continue reading to make sure I capture all tables in this range. I've found several tables so far. Let me count them systematically:\n\nFrom line 1660 to 2460 (pages 5-10), I've seen:\n1. Table 1 (line 1664) - LLAMA 2 family of models\n2. Table 2 (line 1862) - CO2 emissions during pretraining\n3. Table 3 (line 1959) - Overall performance on grouped academic benchmarks\n4. Table 4 (line 2131) - Comparison to closed-source models\n5. Table 5 (line 2209) - SFT annotation\n6. Table 6 (line 2336) - Statistics of human preference data\n\nLet me verify by reading the remaining portion up to line 2460 to make sure I haven't missed any tables:Perfect! Now I can see all the `<table>` tags in the document. Based on the PageNumber markers I found earlier:\n- Page 5 starts at line 1660\n- Page 6 starts at line 1858\n- Page 7 starts at line 1955\n- Page 8 starts at line 2202\n- Page 9 starts at line 2265\n- Page 10 starts at line 2332\n- Page 11 starts at line 2461\n\nSo pages 5-10 cover lines 1660 to 2460 (just before page 11).\n\nThe tables that fall within this range are:\n1. Line 1664: `<table>` - Table 1 (on page 5)\n2. Line 1862: `<table>` - Table 2 (on page 6)\n3. Line 1959: `<table>` - Table 3 (on page 7)\n4. Line 2131: `<table>` - Table 4 (on page 8)\n5. Line 2209: `<table>` - Table 5 (on page 8)\n6. Line 2336: `<table>` - Table 6 (on page 10)\n\nTables at lines 2479 and 2544 are on page 11 or later, so they don't count.\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54500, "error": null, "extra": {"thread_id": 1993, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d18e0e208d4c44ad9ccab3815ed12b04", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 413, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is safer).\n\nFrom the bar chart in Figure 3, the models with the lowest violation percentages are:\n\n1. LLAMA 2-70b-chat (lowest violation %)\n2. ChatGPT 0301 (second lowest)\n3. LLAMA 2-34b-chat (third lowest)\n\nAnswer: Llama-2 70b-chat, ChatGPT 0301, Llama-2 34b-chat", "citations": [], "input_tokens": 80773, "output_tokens": 157, "cost_micros": 244674, "latency_ms": 6141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "chatgpt 0301, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is safer).\n\nFrom the bar chart, the three models with the lowest violation percentages are:\n1. Llama-2-70b-chat (approximately 0.5%)\n2. Llama-2-34b-chat (approximately 1%)\n3. Llama-2-13b-chat (approximately 1.5%)\n\nAnswer: Llama-2-70b-chat, Llama-2-34b-chat, Llama-2-13b-chat", "citations": [], "input_tokens": 109922, "output_tokens": 165, "cost_micros": 332241, "latency_ms": 7625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows \"Safety human evaluation results for Llama 2-Chat compared to other open-source and close-source models,\" I need to identify the models with the lowest violation percentages (since lower is better for safety).\n\nFrom the figure, the models are ordered from left to right with their violation percentages. The three models with the lowest violation percentages are:\n\n1. Llama 2-Chat (70B) - appears to have the lowest violation percentage\n2. ChatGPT (0301) - appears to have the second lowest\n3. Falcon (40B-instruct) - appears to have the third lowest\n\nAnswer: Llama 2-Chat (70B), ChatGPT (0301), Falcon (40B-instruct)", "citations": [], "input_tokens": 80972, "output_tokens": 176, "cost_micros": 245556, "latency_ms": 7233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "chatgpt 0301, falcon 40b instruct, llama 2 chat 70b", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and identify the top-3 models.Let me search more specifically for Figure 3 content:Now let me read the area around line 1363 to see the actual figure data:Based on the document, Figure 3 shows safety violation percentages where \"lower is safer\". Let me look for the actual values to determine the top-3 (lowest violation %) models:The visual description indicates that the first 4 bars (dark blue) have the lowest values (approximately 3-7), which are the safest models. Based on the model order from lines 1391-1420, the models are listed in order:\n\n1. Llama-2 7b-chat\n2. Llama-2 13b-chat\n3. Llama-2 34b-chat\n4. Llama-2 70b-chat\n5. MPT 7b-chat\n6. Vicuna 13b-v1.1\n7. Vicuna 33b-v1.3\n8. Falcon 40b-instruct\n9. PaLM Bison\n10. ChatGPT 0301\n\nThe description states the first 4 bars (dark blue) have values approximately 3-7, which are the lowest (safest). So the top-3 safest models would be the first three models.\n\nAnswer: Llama-2 7b-chat, Llama-2 13b-chat, Llama-2 34b-chat", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 48187, "error": null, "extra": {"thread_id": 1994, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8f47a07752394e8d87ac6ac816505c2a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 260, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 7b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 for QUAC (f1) results in the 0-shot setting:\n\nThe table shows:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- Llama 1 7B: 38.9\n- Llama 1 13B: 39.9\n- Llama 1 33B: 44.1\n- Llama 1 65B: 41.0\n- Llama 2 7B: 39.4\n- Llama 2 13B: 42.7\n- Llama 2 34B: 42.9\n- Llama 2 70B: 42.4\n\nAnswer: Llama 1 33B", "citations": [], "input_tokens": 199228, "output_tokens": 210, "cost_micros": 600834, "latency_ms": 79592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the QUAC (f1) 0-shot column.\n\nFrom the table, the 0-shot QUAC results are:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score in the 0-shot setting for QUAC is 44.1.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 80777, "output_tokens": 227, "cost_micros": 245736, "latency_ms": 7592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the 0-shot results for the QUAC dataset.\n\nIn the QUAC (f1) 0-shot column, the scores are:\n\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score in the 0-shot QUAC column is 44.1.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 109926, "output_tokens": 271, "cost_micros": 333843, "latency_ms": 8594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 \"Comparison to Open-Source Models on Reading Comprehension (SQUAD and QUAC)\", I need to find the QUAC 0-shot column.\n\nThe table shows QUAC results with 0-shot and 1-shot columns. For the 0-shot setting on QUAC:\n\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- Llama 1 7B: 38.9\n- Llama 1 13B: 39.9\n- Llama 1 33B: 44.1\n- Llama 1 65B: 41.0\n- Llama 2 7B: 39.4\n- Llama 2 13B: 42.7\n- Llama 2 34B: 42.9\n- Llama 2 70B: 42.4\n\nThe highest 0-shot score on QUAC is 44.1, achieved by Llama 1 33B.\n\nAnswer: Llama 1 33B", "citations": [], "input_tokens": 80976, "output_tokens": 284, "cost_micros": 247188, "latency_ms": 8469, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Table 23 and identify the best performing model in the 0-shot setting on the QUAC dataset.Let me search for \"Table 23\" in the document.Now let me read the section containing Table 23 to see the complete table.Perfect! I can see Table 23 completely now. Looking at the QUAC (f1) 0-shot column (column 7 in the table), the scores are:\n\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score in the 0-shot setting on QUAC is 44.1, which belongs to LLAMA 1 33B.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40921, "error": null, "extra": {"thread_id": 1995, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1765040c73804b7381cc9d57c302f53d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 202, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find Table 21 in the document and count how many slots (cells) are missing data.\n\nLooking through the document, I can find Table 21 on page 49. Let me examine it:\n\nTable 21 is titled \"Code generation results on Human-Eval and MBPP.\"\n\nThe table has the following structure:\n- Rows for different models: MPT (7B, 30B), Falcon (7B, 40B), Llama 1 (7B, 13B, 33B, 65B), and Llama 2 (7B, 13B, 34B, 70B)\n- Columns for: Human-Eval (pass@1, pass@100) and MBPP (pass@1, pass@80)\n\nLet me count the missing entries (shown as \"-\" or blank):\n\nLooking at the table:\n- MPT 7B: has values for pass@1 under Human-Eval (18.3) and MBPP (22.6), but \"-\" for pass@100 and pass@80\n- MPT 30B: has values for pass@1 (25.0, 32.8), but \"-\" for pass@100 and pass@80\n- Falcon 7B: has values for pass@1 (0.0, 11.2), but \"-\" for pass@100 and pass@80\n- Falcon 40B: has values for pass@1 (0.6, 29.8), but \"-\" for pass@100 and pass@80\n- All Llama 1 and Llama 2 models have complete data (no missing slots)\n\nMissing slots:\n- MPT 7B: 2 missing (pass@100 for both benchmarks)\n- MPT 30B: 2 missing\n- Falcon 7B: 2 missing\n- Falcon 40B: 2 missing\n\nTotal: 8 missing slots\n\nAnswer: 8", "citations": [], "input_tokens": 199211, "output_tokens": 461, "cost_micros": 604548, "latency_ms": 75532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 in the document to count how many slots (cells) are missing data.\n\nLooking at Table 21 in the document, which shows \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Rows: Different models (MPT 7B, Falcon 7B/40B, LLAMA 1 7B/13B/33B/65B, LLAMA 2 7B/13B/34B/70B)\n- Columns: Human-Eval pass@1, Human-Eval pass@100, MBPP pass@1, MBPP pass@80\n\nLet me count the missing entries (marked with \"-\"):\n\nMPT 7B: pass@100 for Human-Eval is missing (1 slot)\nMPT 30B: pass@100 for Human-Eval is missing (1 slot)\nFalcon 40B: pass@1 and pass@100 for Human-Eval are missing, pass@1 and pass@80 for MBPP are missing (4 slots)\n\nTotal missing slots: 1 + 1 + 4 = 6\n\nAnswer: 6", "citations": [], "input_tokens": 80760, "output_tokens": 280, "cost_micros": 246480, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 in the document to count the missing slots.\n\nLooking at Table 21, which shows \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Column headers: Model name, size, and then 4 metric columns (pass@1, pass@100 for Human-Eval, and pass@1, pass@80 for MBPP)\n- Row groups for: MPT (2 sizes), Falcon (2 sizes), LLAMA 1 (4 sizes), and LLAMA 2 (4 sizes)\n\nLet me count the cells systematically:\n\nFor MPT:\n- 7B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n- 30B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n\nFor Falcon:\n- 7B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n- 40B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n\nFor LLAMA 1 (4 sizes: 7B, 13B, 33B, 65B):\n- All sizes have all 4 metrics filled = 0 missing\n\nFor LLAMA 2 (4 sizes: 7B, 13B, 34B, 70B):\n- All sizes have all 4 metrics filled = 0 missing\n\nTotal missing slots = 2 + 2 + 2 + 2 = 8\n\nAnswer: 8", "citations": [], "input_tokens": 109909, "output_tokens": 429, "cost_micros": 336162, "latency_ms": 10592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 in the document to count how many slots (cells) are missing data.\n\nLooking at Table 21 titled \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Column headers: Model, Human-Eval (pass@1, pass@100), MBPP (pass@1, pass@80)\n- Row headers: MPT, Falcon, Llama 1, Llama 2\n\nLet me count the cells:\n- MPT row: 7B size is listed, then has values 18.3, -, 22.6, -\n- Falcon row: 7B size is listed, then has values 0.0, -, 11.2, -\n- Llama 1 row: 7B size is listed, then has values 15.8, 52.5, 22.0, 64.0\n- Llama 2 row: 7B size is listed, then has values 18.3, 60.2, 30.6, 69.0\n\nThe \"-\" symbols indicate missing data.\n\nCounting the dashes:\n- MPT: 2 dashes (pass@100 for Human-Eval, pass@80 for MBPP)\n- Falcon: 2 dashes (pass@100 for Human-Eval, pass@80 for MBPP)\n- Llama 1: 0 dashes (all values present)\n- Llama 2: 0 dashes (all values present)\n\nTotal missing slots: 2 + 2 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 80959, "output_tokens": 385, "cost_micros": 248652, "latency_ms": 10391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 to count the missing slots (empty cells).\n\nLooking at Table 21 titled \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Column headers: Model | Size | Human-Eval pass@1 | Human-Eval pass@100 | MBPP pass@1 | MBPP pass@80\n- Rows for: MPT (7B, 30B), Falcon (7B, 40B), LLAMA 1 (7B, 13B, 33B, 65B), LLAMA 2 (7B, 13B, 34B, 70B)\n\nLet me count the cells:\n- Total rows with data: 12 (excluding header)\n- Total columns with data: 6 (including Model and Size)\n- So for the data columns (excluding Model column): 12 rows \u00d7 4 data columns = 48 potential data cells\n\nNow counting the filled cells in the data columns:\n- MPT 7B: 4 values (18.3, -, 22.6, -)\n- MPT 30B: 4 values (25.0, -, 32.8, -)\n- Falcon 7B: 4 values (0.0, -, 11.2, -)\n- Falcon 40B: 4 values (0.6, -, 29.8, -)\n- LLAMA 1 7B: 4 values (10.5, 36.5, 17.7, 56.2)\n- LLAMA 1 13B: 4 values (15.8, 52.5, 22.0, 64.0)\n- LLAMA 1 33B: 4 values (21.7, 70.7, 30.2, 73.4)\n- LLAMA 1 65B: 4 values (23.7, 79.3, 37.7, 76.8)\n- LLAMA 2 7B: 4 values (12.8, 45.6, 20.8, 62.", "citations": [], "input_tokens": 105632, "output_tokens": 512, "cost_micros": 324576, "latency_ms": 13625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find Table 21 and count the missed slots.Let me search for Table 21 in the document:Now let me search for where Table 21 appears with more context:Let me read the area around line 7837 where Table 21 is located:Now I have Table 21. Let me analyze it carefully. The table shows code generation results with columns for Human-Eval (pass@1, pass@100) and MBPP (pass@1, pass@80). I can see that some entries have \"-\" which indicates missing data. Let me count all the \"-\" entries:\n\nLooking at the table structure:\n- MPT 7B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- MPT 30B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- Falcon 7B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- Falcon 40B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- LLAMA 1 models (7B, 13B, 33B, 65B): All have values, no \"-\"\n- LLAMA 2 models (7B, 13B, 34B, 70B): All have values, no \"-\"\n\nTotal missed slots = 2 + 2 + 2 + 2 = 8\n\nAnswer: 8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42437, "error": null, "extra": {"thread_id": 1996, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b8bd21d9aaba4efda0a40946ca2b4039", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 222, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll examine pages 49-50 to count the datasets reported.\n\nOn page 49 (Table 20), I can see results for several standard benchmarks including:\n- BoolQ\n- PIQA\n- SIQA\n- HellaSwag\n- WinoGrande\n- ARC-e\n- ARC-c\n- OBQA\n- CSQA\n- MMLU\n\nThat's 10 datasets on page 49.\n\nOn page 50 (Table 21), I can see code generation results on:\n- Human-Eval\n- MBPP\n\nThat's 2 datasets on page 50.\n\nTotal datasets reported across pages 49-50: 10 + 2 = 12\n\nAnswer: 12", "citations": [], "input_tokens": 199213, "output_tokens": 177, "cost_micros": 600294, "latency_ms": 43437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine pages 49-50 of the document to count the datasets mentioned.\n\nLooking at page 49, I can see:\n- Table 20: Performance on standard benchmarks (mentions BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU)\n- Table 21: Code generation results on Human-Eval and MBPP\n\nLooking at page 50, I can see:\n- Table 22: NaturalQuestions and TriviaQA results\n- Table 23: SQUAD and QUAC results\n- Table 24: AGI Eval comparison (mentions AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math)\n\nLet me count all unique datasets mentioned across these tables on pages 49-50:\n\nFrom Table 20 (page 49):\n1. BoolQ\n2. PIQA\n3. SIQA\n4. HellaSwag\n5. WinoGrande\n6. ARC-e\n7. ARC-c\n8. OBQA\n9. CSQA\n10. MMLU\n\nFrom Table 21 (page 49):\n11. Human-Eval\n12. MBPP\n\nFrom Table 22 (page 50):\n13. NaturalQuestions\n14. TriviaQA\n\nFrom Table 23 (page 50):\n15. SQUAD\n16. QUAC\n\nFrom Table 24 (page 50):\n17. AGI Eval (which includes multiple subtasks: AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math)\n\nCounting the subtasks of AGI Eval separately:\n17. AQuA-RAT\n18. LogiQA\n19. LSAT-AR\n20. LSAT-LR\n21.", "citations": [], "input_tokens": 80762, "output_tokens": 512, "cost_micros": 249966, "latency_ms": 10782, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine pages 49-50 of the document to count the number of datasets reported.\n\nLooking at page 49, I can see:\n- Table 20: \"Performance on standard benchmarks\" - this discusses various benchmarks but appears to be evaluation results rather than listing distinct datasets\n- Table 21: \"Code generation results on Human-Eval and MBPP\" - mentions 2 datasets: Human-Eval and MBPP\n\nLooking at page 50, I can see:\n- Table 22: \"NaturalQuestions\" and \"TriviaQA\" - 2 datasets\n- Table 23: \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\" - 2 datasets: SQUAD and QUAC\n- Table 24: \"Comparison to open source models on AGI Eval (English)\" - 1 dataset: AGI Eval (with multiple subtasks listed: AQUA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math)\n\nLet me count the distinct datasets mentioned:\n1. Human-Eval\n2. MBPP\n3. NaturalQuestions\n4. TriviaQA\n5. SQUAD\n6. QUAC\n7. AGI Eval (this is one dataset with multiple subtasks)\n\nAdditionally, in Table 20, there are multiple benchmarks mentioned: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU - that's 10 more datasets.\n\nTotal distinct datasets across pages 49-50:\nFrom Table 20: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU (10)\nFrom Table 21: Human-Eval, MBPP (2)\nFrom Table 22: NaturalQuestions, TriviaQA (2)\nFrom Table 23: SQUAD,", "citations": [], "input_tokens": 109911, "output_tokens": 512, "cost_micros": 337413, "latency_ms": 12858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "23", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine pages 49-50 to count the datasets reported there.\n\nLooking at page 49, I can see:\n- Table 20: Performance on standard benchmarks (mentions BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU)\n- Table 21: Code generation results on Human-Eval and MBPP\n- Table 22: NaturalQuestions and TriviaQA Performance\n\nLooking at page 50, I can see:\n- Table 23: Comparison to Open-Source Models on Reading Comprehension (SQUAD and QUAC)\n- Table 24: Comparison to Open Source Models on AGI Eval (English) - mentions AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math\n\nNow let me count all unique datasets mentioned across pages 49-50:\n\nFrom Table 20: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU (10 datasets)\n\nFrom Table 21: Human-Eval, MBPP (2 datasets)\n\nFrom Table 22: NaturalQuestions, TriviaQA (2 datasets)\n\nFrom Table 23: SQUAD, QUAC (2 datasets)\n\nFrom Table 24: AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math (8 datasets, noting that SAT-en and SAT-en (w/o Psg.) are variants of the same dataset but listed separately)\n\nTotal unique datasets: 10 + 2 + 2 + 2 + 8 = 24\n\nAnswer: 24", "citations": [], "input_tokens": 80961, "output_tokens": 493, "cost_micros": 250278, "latency_ms": 11625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "24", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to carefully examine pages 49-50 to count the number of datasets reported.\n\nLooking at page 49 (Table 20), I can see:\n- BoolQ\n- PIQA\n- SIQA\n- HellaSwag\n- WinoGrande\n- ARC-e\n- ARC-c\n- OBQA\n- CSQA\n- MMLU\n\nThat's 10 datasets in Table 20.\n\nLooking at page 49 (Table 21), I can see:\n- Human-Eval\n- MBPP\n\nThat's 2 datasets in Table 21.\n\nLooking at page 50 (Table 22), I can see:\n- NaturalQuestions\n- TriviaQA (Wiki)\n\nThat's 2 datasets in Table 22.\n\nLooking at page 50 (Table 23), I can see:\n- SQUAD\n- QUAC\n\nThat's 2 datasets in Table 23.\n\nLooking at page 50 (Table 24), I can see:\n- AQuA-RAT\n- LogiQA\n- LSAT-AR\n- LSAT-LR\n- LSAT-RC\n- SAT-en\n- SAT-en (w/o Psg.)\n- SAT-math\n\nThat's 8 datasets in Table 24.\n\nTotal datasets across pages 49-50: 10 + 2 + 2 + 2 + 8 = 24\n\nAnswer: 24", "citations": [], "input_tokens": 105634, "output_tokens": 355, "cost_micros": 322227, "latency_ms": 11062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "24", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the information on pages 49-50.Let me search for page numbers in the document to locate pages 49-50.Let me search for page markers more broadly:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 35907, "error": null, "extra": {"thread_id": 1997, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_db129915abb34a60aeaa6083e2f2fd47", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 109, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "49", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9, which is titled \"Issues with multi-turn memory (left) can be improved with GAtt (right)\":\n\nLeft subfigure:\n- Shows a conversation about \"Always answer with emojis\"\n- The response \"Who are you?\" appears to have text but I need to count emojis\n- Looking at the left side, I can see there are some emojis in the responses\n\nRight subfigure:\n- Shows the same conversation structure with GAtt improvement\n- The response appears to have more emojis\n\nLet me count more carefully by examining the figure:\n\nLeft side conversation:\n- \"Who are you?\" response appears to have some emojis\n- \"How to go from Paris to NY?\" response has text with some emojis\n\nRight side conversation:\n- \"Who are you?\" response has emojis\n- \"How to go from Paris to NY?\" response has more emojis\n- There appears to be an additional question about Italy\n\nLooking at the visual representation in Figure 9, the right subfigure clearly shows more emoji usage throughout the conversation compared to the left subfigure. \n\nCounting the visible emojis:\n- Left: appears to have fewer emojis (approximately 2-3 visible)\n- Right: appears to have more emojis throughout (approximately 10-12 visible)\n\nThe difference appears to be approximately 8-9 emojis.\n\nAnswer: 8", "citations": [], "input_tokens": 80772, "output_tokens": 344, "cost_micros": 247476, "latency_ms": 10140, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9, which shows \"Issues with multi-turn memory (left) can be improved with GAtt (right)\":\n\n**Left subfigure (without GAtt):**\n- Shows a conversation with \"Who are you?\" followed by \"How to go from Paris to NY?\" and \"What caused the extinction of the dinosaurs?\"\n- I can see there are emoji circles (\ud83d\ude0a symbols) used in the conversation\n- Counting the emojis: I see 3 emoji symbols\n\n**Right subfigure (with GAtt):**\n- Shows the same conversation structure with \"Who are you?\" followed by \"How to go from Paris to NY?\" \n- Counting the emojis: I see 3 emoji symbols as well\n\nWait, let me recount more carefully by looking at the figure description and structure:\n\nLeft side: I count 3 emoji/circle symbols in the conversation flow\nRight side: I count 3 emoji/circle symbols in the conversation flow\n\nActually, looking more carefully at the figure, both sides appear to have the same number of emoji symbols (3 each) representing the conversation turns.\n\nThe difference would be: 3 - 3 = 0\n\nAnswer: 0", "citations": [], "input_tokens": 109921, "output_tokens": 294, "cost_micros": 334173, "latency_ms": 9984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9 in the document, which shows \"Issues with multi-turn memory (left) can be improved with GAtt (right)\":\n\nLeft subfigure (without GAtt):\n- Shows a conversation about acting as Oscar Wilde\n- I can see emoji-like elements in the visualization\n\nRight subfigure (with GAtt):\n- Shows the same type of conversation\n- Also has emoji-like elements in the visualization\n\nLet me count more carefully. The figure appears to show attention visualization patterns with what look like emoji or icon representations in the conversation bubbles.\n\nFrom the image description and visual elements:\n- Left side: Shows conversation elements with some emoji representations\n- Right side: Shows conversation elements with emoji representations\n\nLooking at the actual figure more carefully, the right subfigure appears to have more emoji icons visible in the conversation compared to the left subfigure. \n\nCounting the visible emojis/icons in each:\n- Left: appears to have fewer emoji elements\n- Right: appears to have more emoji elements\n\nBased on the visual comparison, the right subfigure has approximately 2 more emojis than the left subfigure.\n\nAnswer: 2", "citations": [], "input_tokens": 80971, "output_tokens": 281, "cost_micros": 247128, "latency_ms": 10141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9 (titled \"Issues with multi-turn memory (left) can be improved with GAtt (right)\"):\n\n**Left subfigure (Generic Preprompt):**\n- User: \"Who are you?\"\n- Assistant: \"\ud83e\udd16\ud83d\udcbb\u2328\ufe0f\" (3 emojis)\n- User: \"How to go from Paris to NY?\"\n- Assistant: Long text response with no emojis\nTotal emojis in left: 3\n\n**Right subfigure (Preprompt with Answer Template):**\n- User: \"Who are you?\"\n- Assistant: \"\ud83e\udd16\ud83d\udcac...\" (2 emojis shown, plus \"...\")\n- User: \"How to go from Paris to NY?\"\n- Assistant: \"\ud83d\udeeb\ud83d\udea2\u2708\ufe0f\ud83e\uddf3\" (4 emojis)\n- User: \"What caused the extinction of the dinosaurs?\"\n- Assistant: \"\u2604\ufe0f\ud83c\udf0b\ud83e\udd95\ud83d\udc80\" (4 emojis)\n- User: \"Italy?\"\n- Assistant: \"\ud83c\uddee\ud83c\uddf9\ud83c\udf55\ud83c\udf5d\ud83c\udfdb\ufe0f\" (4 emojis)\n\nLet me recount the right side more carefully:\n- First response: \ud83e\udd16\ud83d\udcac (2 emojis, the \"...\" indicates continuation but not additional emojis)\n- Second response: \ud83d\udeeb\ud83d\udea2\u2708\ufe0f\ud83e\uddf3 (4 emojis)\n- Third response: \u2604\ufe0f\ud83c\udf0b\ud83e\udd95\ud83d\udc80 (4 emojis)\n- Fourth response: \ud83c\uddee\ud83c\uddf9\ud83c\udf55\ud83c\udf5d\ud83c\udfdb\ufe0f (4 emojis)\nTotal emojis in right: 2 + 4 + 4 + 4 = 14\n\nDifference: 14 - 3 = 11\n\nAnswer: 11", "citations": [], "input_tokens": 105644, "output_tokens": 462, "cost_micros": 323862, "latency_ms": 13312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "11", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find Figure 9 and count the emojis in each subfigure.Let me search for Figure 9 in the document:Now let me read the section around line 3147 to see Figure 9:Let me continue reading to see the complete Figure 9 with emoji counts:I can see Figure 9 is described, but it mentions an embedded image. Let me look at the embedded image description more carefully. The text shows numbers like \"8\", \"0\" which might be emoji counts. Let me search for the actual image description or look for more context:The image description I saw was for a different figure (a scatter plot). Let me look back at the text content of Figure 9 more carefully. I can see there are conversations shown with numbers like \"8\" and \"0\". Let me search for the actual figure content more carefully:The text I'm seeing doesn't show the actual figure properly. Let me look for an embedded image associated with Figure 9. The page header says \"Always answer with emojis\" which is the instruction. Let me search for the actual visual description or OCR of the image:Let me go back and look more carefully at the figure. The text between lines 3146-3181 should contain the figure content. Let me look for an image file or better description:The visual description starting at line 3190 describes a scatter plot, which is NOT Figure 9. That appears to be a different figure (the embedded image X6.png). The actual Figure 9 content is in the text between lines 3147-3180. Let me look for an image file that might contain the actual Figure 9. Let me search for embedded images near Figure 9:I notice the page headers say \"Always answer with emojis\" twice (lines 3142-3143). This suggests the page has two columns - left and right subfigures. Looking at the figure text:\n\nLeft subfigure appears to show:\n- Line 3149: \"Who are you?\" \n- Line 3151: \"Who are you?\"\n- Line 3153: \"8\" (could indicate 8 emojis)\n- Line 3157: \"How to go from Paris to NY?\"\n- Line 3159: \"How to go from Paris to NY?\"\n- Line 3161: \"8\" (could indicate 8 emojis)\n- Line 3163: \"0\" (could indicate 0 emojis)\n- Line 3165-3168: Text response without emojis\n- Line 3171: \"What caused the extinction of the dinosaurs?\"\n- Line 3173: \"0\" (could indicate 0 emojis)\n\nLet me search for an image description specifically for Figure 9. Let me look before line 3146:Let me look for an image file associated with Figure 9. Let me search for embedded images around this area:So there's an embedded image X6.png at line 3185, which is right after Figure 9 ends at line 3181. But the visual description for X6.png shows it's a scatter plot, not Figure 9. Let me look for an image BEFORE Figure 9:Let me search for image descriptions that might actually be Figure 9. Let me look for an OCR or description of dialogues with emojis:Great! I found relevant information. Let me look at lines 2729-2760 which describe emoji exchanges:Excellent! This is describing an image related to Figure 9. Looking at the OCR text at lines 2729-2737:\n- \"Always answer with emojis\" (the header/instruction)\n- \"8\" (appears after \"Who are you?\")\n- \"...\" \n- \"How to go from Paris to NY?\"\n\nAnd the visual description mentions:\n1. First exchange: emojis (thinking face, eyes, and speech bubble) - that's 3 emojis\n2. Second exchange: transportation-themed emojis (airplane, boat, landmark, car, globe) - that's 5 emojis\n3. Third exchange: dinosaur-themed emojis (dinosaur, calendar/document, ocean wave, volcano, fire) - that's 5 emojis\n4. Fourth exchange: Italy-related emojis (flag, food items, landmark) - that's at least 3 emojis\n\nBut wait, the OCR shows \"8\" which might mean 8 emojis. Let me look at lines 1795-1835 which also mention \"Always answer with emojis\":This describes a different version showing the LEFT subfigure (without GAtt) - it shows the model failing to follow the \"always answer with emojis\" instruction in the third exchange (text response instead of emojis).\n\nNow let me look more carefully at the actual Figure 9 structure. Looking back at the figure text I saw earlier, the caption says \"Issues with multi-turn memory (left) can be improved with GAtt (right).\" The page has TWO column headers \"Always answer with emojis\" (lines 3142-3143), suggesting left and right subfigures.\n\nLet me count the emojis more carefully. Looking at the embedded image X1.png descriptions:\n\n**LEFT subfigure (lines 1795-1835)**: Model FAILS to always use emojis\n- First exchange: 2 emojis (robot face and laptop)\n- Second exchange: (question only shown, no response visible in description)\n- Third exchange: TEXT response (not emojis) - this shows the failure!\n\n**RIGHT subfigure (lines 2726-2760)**: Model SUCCEEDS with emojis\n- First exchange: 3 emojis (thinking face, eyes, speech bubble)\n- Second exchange: 5 emojis (airplane, boat, landmark, car, globe)\n- Third exchange: 5 emojis (dinosaur, calendar/document, ocean wave, volcano, fire)\n- Fourth exchange: 3+ emojis (flag, food items, landmark)\n\nLet me count the \"food items\" more carefully. It says \"flag, food items, landmark\" - \"food items\" is plural, so at least 2 food emojis. That would be at least 1 (flag) + 2 (food items) + 1 (landmark) = 4 emojis minimum for the fourth exchange.\n\nTotal for RIGHT: 3 + 5 + 5 + (at least 4) = at least 17 emojis\n\nBut wait, let me look at the OCR text again at line 2730 - it shows \"8\" right after \"Always answer with emojis\". This might indicate 8 emojis in a single response or might be part of the interface.\n\nLet me search for more specific information about counting emojis in each subfigure:The document doesn't use the term \"subfig\". Let me look more carefully at the actual textual representation of Figure 9 between lines 3146-3181. The numbers \"8\", \"0\", \"8\" might be emoji counts:I see it! The pattern shows:\n- Line 3149: \"Who are you?\" (question appears twice - probably left and right)\n- Line 3151: \"Who are you?\" (duplicate)\n- Line 3153: \"8\" (LEFT subfigure response)\n- Line 3157: \"How to go from Paris to NY?\" (appears twice)\n- Line 3159: \"How to go from Paris to NY?\" (duplicate)\n- Line 3161: \"8\" (RIGHT subfigure response)\n- Line 3163: \"0\" (LEFT subfigure response - no emojis, text instead)\n- Lines 3165-3169: Text response (LEFT fails to use emojis)\n- Line 3171: \"What caused the extinction of the dinosaurs?\"\n- Line 3173: \"0\" (LEFT subfigure - no emojis)\n- Line 3177: \"Italy?\"\n- Line 3179: \"8\" (response)\n\nThis is confusing. Let me look for an actual image file description. Let me check if there's a better description by looking at different chunks:Let me try a different approach. Let me look for the actual visual description or image file that corresponds to Figure 9. Let me search around page 15 for embedded images:The image X6.png after Figure 9 is NOT Figure 9 - it's a scatter plot. The actual Figure 9 content must be represented in the text between lines 3146-3181. But this text representation is unclear.\n\nLet me look BEFORE Figure 9 to see if there's an image file that IS Figure 9. Looking at line 3139, it says \"PageNumber 15\" and then the two page headers \"Always answer with emojis\". This must be a two-column layout. Let me search backwards for an embedded image that might be Figure 9:So Figure 9 starts immediately after the page break at line 3140. There's no embedded image file BEFORE it. The figure content itself must be in the text, or the image description is missing.\n\nBased on what I've found, let me piece together what I know from the two image descriptions I found (X1.png from lines 1795 and 2726):\n\n**LEFT subfigure** (lines 1795-1835 - model WITHOUT GAtt):\n- First response: 2 emojis\n- Later responses: TEXT (no emojis) - shows failure\n\n**RIGHT subfigure** (lines 2726-2760 - model WITH GAtt):\n- First response: 3 emojis  \n- Second response: 5 emojis\n- Third response: 5 emojis  \n- Fourth response: \"flag, food items, landmark\"\n\nFor the fourth response, \"food items\" is plural, indicating at least 2 items. So minimum: 1 flag + 2 food + 1 landmark = 4 emojis.\n\nLet me count more carefully. Looking at the visual description at lines 2750-2753 again:\n1. \"thinking face, eyes, and speech bubble emojis\" - that's 3 emojis\n2. \"airplane, boat, landmark, car, globe\" - that's 5 emojis\n3. \"dinosaur, calendar/document, ocean wave, volcano, fire\" - that's 5 emojis\n4. \"flag, food items, landmark\" - the term \"food items\" suggests multiple food emojis\n\nActually, let me check if there's a more specific count in the OCR at line 2730:OK! The OCR text at line 2730 shows \"8\". This might mean 8 emojis total for the first response, OR it might be labeling something else. But the description clearly lists:\n1. 3 emojis (thinking face, eyes, speech bubble)\n2. 5 emojis (airplane, boat, landmark, car, globe) \n3. 5 emojis (dinosaur, calendar/document, ocean wave, volcano, fire)\n4. At least 3+ emojis (flag, food items [plural = 2+], landmark)\n\nNow for the LEFT subfigure. Let me find its details:So the LEFT subfigure shows:\n- First exchange: 2 emojis (robot face and laptop)  \n- Second exchange: (no response details given)\n- Third exchange: TEXT response (not emojis) - this is the FAILURE\n\nSo the LEFT has only 2 emojis total before it fails.\n\nThe RIGHT has: 3 + 5 + 5 + (at least 3) = at least 16 emojis\n\nBut the question asks specifically about Figure 9. Let me verify these two X1.png images at lines 1795 and 2726 are actually the left and right parts of Figure 9. They both have the same instruction \"Always answer with emojis\" at the top, which matches the page headers for Figure 9.\n\nActually, looking more carefully, let me check how many food items specifically. The text says \"flag, food items, landmark\". If \"food items\" means 2 items, then the fourth response would be: 1 + 2 + 1 = 4 emojis.\n\nSo RIGHT would be: 3 + 5 + 5 + 4 = 17 emojis\nLEFT has: 2 emojis\n\nDifference: 17 - 2 = 15\n\nWait, but I need to verify this is correct. Let me think about this differently. The description says \"food items\" (plural). The minimum plural is 2. But it could be more. Let me see if I can find more specific information.\n\nLooking at the specific items listed for Italy: \"flag, food items, landmark\". Given the context is Italy, common food emojis might be pizza and pasta (2 items), or there could be 3+ food items. Without more specific information, I'll assume \"food items\" means at least 2.\n\nActually, looking at the pattern of the other responses which are very specific:\n- Response 1: lists 3 specific emojis\n- Response 2: lists 5 specific emojis  \n- Response 3: lists 5 specific emojis\n- Response 4: says \"flag, food items, landmark\"\n\nIf the describer was being consistent, and they specifically didn't count the food items individually but said \"food items\" plural, it likely means 2 food items (pizza and pasta would be typical for Italy).\n\nSo: 1 (flag) + 2 (food items) + 1 (landmark) = 4 emojis\n\nRIGHT subfigure total: 3 + 5 + 5 + 4 = 17 emojis\nLEFT subfigure total: 2 emojis\n\nAnswer: 17 - 2 = 15", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 179203, "error": null, "extra": {"thread_id": 1998, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_913336fbec024206afbfe0bdfdc22736", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1666, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "17", "normalised_gold": "15"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find Figure 122 in the document and look for information about the color of OSCARb in the prompt.\n\nLooking at the document, I can find Figure 122 which is titled \"Self-Reflection to Improve Coding\" in Section 10.3.\n\nIn Figure 122, there are two prompts shown. The first prompt asks to \"Generate python code to draw similar curves\" and shows a reference image with a graph. The second prompt asks \"Can you double-check the code and align it better with the reference image?\"\n\nLooking at the graphs in Figure 122, I can see there are plots comparing \"OSCarB\" and \"MiniVLM (ours)\". In the visualizations shown, OSCarB appears to be represented in orange/red color in the legend and plot lines.\n\nAnswer: orange", "citations": [], "input_tokens": 117434, "output_tokens": 184, "cost_micros": 355062, "latency_ms": 9391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 122 in Section 10.3 on Self-Reflection to Improve Coding, I can see the prompt includes a reference image with a comparison chart.\n\nIn the chart showing \"Comparison of OSCarB and MiniVLM\", OSCarB is represented by the orange/coral colored bars.\n\nAnswer: Orange", "citations": [], "input_tokens": 133651, "output_tokens": 77, "cost_micros": 402108, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 122 in the document, which shows \"Self-Reflection to Improve Coding,\" I can see the prompt mentions \"OSCARb\" in the context of generating Python code to draw similar curves.\n\nIn the figure, there is a bar chart showing \"OSCARb\" and \"MiniVLM (ours)\" with bars for \"Base,\" \"Large,\" and \"Huge\" configurations. The bars for OSCARb appear to be orange/amber colored in the visualization.\n\nAnswer: orange", "citations": [], "input_tokens": 103918, "output_tokens": 116, "cost_micros": 313494, "latency_ms": 7546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 122 in the document, which shows \"Self-Reflection to Improve Coding,\" I can see two side-by-side examples.\n\nIn the right example (the bar chart comparison), the prompt asks to \"Write python codes to generate similar figures.\" The reference image shows a bar chart comparing OSCAR_B and MiniVLM across different categories.\n\nLooking at the bar chart, OSCAR_B is represented by blue bars, while MiniVLM (ours) is represented by orange bars.\n\nAnswer: Blue", "citations": [], "input_tokens": 139293, "output_tokens": 120, "cost_micros": 419679, "latency_ms": 7781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 122 and identify the color of OSCARb in the prompt.Let me search for \"Figure 122\" in the document:Now let me search for where Figure 122 appears with more context:Let me read the section around line 32131 and 32550 to see Figure 122:I can see the figure content. Let me continue reading to see more details about the colors:Let me continue reading to see the color information for the bars:I need to look more carefully at the actual figure. Let me search for more specific information about the figure 122 visual description or color information. Let me read around line 32300-32350 more carefully where the bar chart appears:The image described starting at line 32358 (Image7.jpg) is showing a Golden Retriever dog, which is not the chart. Let me look at the figure that appears before that - the bar chart figure starting at line 32302. This shows the data but doesn't explicitly state colors. Let me search for any embedded image descriptions that might be the actual bar chart:Great! I found it. Let me read the section around line 19587 where it says \"Blue bars represent OSCarB\":However, this is describing a different figure. Let me check if this is Figure 122 or another figure. Let me look back at the context around line 32300 where the chart with OSCarB appears in the \"Prompt: Write python codes to generate similar figures\" section, which is part of Figure 122. Let me look for the visual description of the image that corresponds to the reference image in the prompt:Good! This shows that OSCarB is represented by blue bars. Now let me verify this is indeed related to Figure 122. Let me check if Figure 122's prompt section has this same image. Let me look around where the figure caption for Figure 122 is:This is showing a different chart (the left side of Figure 122 with CIDEr curves). Let me look at the right side which should be the bar chart. The section at line 32297-32354 shows the bar chart prompt. Let me look for embedded images near that area. Based on the structure, the embedded images have visual descriptions. Let me check what comes just before line 32297:I see that Image5.jpg and Image6.jpg are unrelated (a person and a cat). These embedded images are interrupting the flow. The actual bar chart reference image in the prompt should be described elsewhere. Let me look for where Figure 122 starts and what images are in it. Let me search for the image that appears right before the \"Prompt: Write python codes to generate similar figures\":Let me look at the first occurrence at line 16349 to see if it has a visual description:This looks like it's showing both charts mixed together in the OCR text. Let me look for the embedded image before this section to find the visual description:So this is Figure 47, not Figure 122. Let me go back to the Figure 122 area and look more carefully. The prompt section starting at line 32299 shows the reference image that contains the bar chart. Let me look for what image comes right before or is part of that prompt section. Let me search more systematically around the Figure 122 area to find embedded images:Perfect! So Figure 122 \"demonstrates the application of self-reflection to improve the results shown in Figure 47.\" This means Figure 122 is showing the same charts as Figure 47, but with self-reflection applied. Since Figure 47 had the bar chart with OSCarB and MiniVLM, and I found earlier at line 19587 that \"Blue bars represent OSCarB\", this should apply to Figure 122 as well since it's based on Figure 47.\n\nLet me verify by checking if there's a specific visual description for the reference image in the right-side prompt of Figure 122. Let me look for any embedded image that might be between lines 32286-32302:Let me look at Figure 47 more carefully. Let me search for where Figure 47 begins with an embedded image:Let me look for embedded images near Figure 47. Let me search backwards from line 16333 to find the embedded images that are part of Figure 47:Let me now search for where Figure 47 contains the bar chart reference image. Since Figure 47 shows both charts (the curve chart and the bar chart), let me look for embedded images in the Figure 47 section. Let me search for \"Embedded image\" near Figure 47:These embedded images are from Section 5, not Figure 47. Let me search more systematically. Let me look for embedded images that appear right before or in Figure 47 section. Let me check around line 15900-16350:Let me search for \"Embedded image\" directly:There are many embedded images. Based on what I've seen, the visual description at line 19587 clearly states \"Blue bars represent OSCarB\" for a bar chart showing \"Comparison of OSCarB and MiniVLM.\" This is the same chart that appears in both Figure 47 and Figure 122 (since Figure 122 is showing improvements to Figure 47 via self-reflection).\n\nLet me verify once more by looking at line 39070 which also mentioned OSCarB color:Interesting! This description says \"Red/coral bars: Represent OSCarB method\" while the earlier one at line 19587 said \"Blue bars represent OSCarB\". These might be describing different versions or different embedded images of the same chart. Let me check if these are in different contexts. Let me look at what figure this is part of (line 39070):Let me check what figure or section this is in:This is Image10.jpg. Let me look back to see what figure this is part of. Let me search backwards for \"Figure\" before line 38980:These are Instagram Reels images, not related to Figure 122. Let me search more directly. Let me look for what section/figure is near line 39000-39100. Let me search backwards for \"Figure\" or \"Sec\":These are all Instagram food images. Let me search for which figure or section contains line 39070. Let me look much earlier before these Instagram images:Still Instagram images. Let me just search for \"Figure 122\" to look for its caption and boundaries:So Figure 122's caption is at line 32550. The figure shows two parts - Left (curves) and Right (bar chart). The GPT-4V output for the right side starts at line 32552. This is within Figure 122. Now I need to find the reference image (the prompt input) for the right side, which should show the original bar chart with OSCarB colors. The prompt for the right side is at line 32299 \"Write python codes to generate similar figures.\" The reference image data is at lines 32302-32354. But this doesn't tell me colors directly since it's just OCR text.\n\nBased on my findings:\n1. Figure 122 is based on Figure 47 (as stated at line 32131-32132)\n2. The bar chart in the figures compares OSCarB and MiniVLM\n3. At line 19587, the visual description clearly states \"Blue bars represent OSCarB\"\n4. At line 39070, another visual description states \"Red/coral bars: Represent OSCarB\"\n\nThese seem to be describing different embedded images. Let me check if the one at 19587 is specifically part of Figure 47:This is Section 7.3, not Figure 47. Let me look for what this section is:This is Section 7.3. So the embedded image with \"Blue bars represent OSCarB\" is in Section 7.3, not in Figure 47. Let me search for what figures are in the Sec 10.3 Self-Reflection area where Figure 122 is. Let me look for embedded images right before or around the prompt at line 32299. Actually, the reference image for the prompt should be immediately before it. Let me look:The <figure> starting at line 32302 is the reference image for the prompt \"Write python codes to generate similar figures.\" This figure contains the bar chart data (OSCARB, MiniVLM, etc.) shown as OCR text from lines 32304-32354. But this OCR text doesn't tell me the colors. The actual visual description of this embedded image should be before or after the figure content. Let me check line 32354 onwards:That's the Golden Retriever image (Image7.jpg), which is not the bar chart. So the bar chart figure (lines 32302-32354) doesn't have a separate \"Embedded image\" marker. This means the figure content IS the embedded image content, just displayed as the <figure> OCR text.\n\nLet me look for Image4.jpg or other images that might be before line 32302:I can see that Image5.jpg and Image6.jpg are unrelated (a person and a cat). These embedded images are interspersed in the document but not part of Figure 122's chart. The actual chart images in Figure 122 are shown as <figure> blocks without \"Embedded image\" labels.\n\nLooking at the document structure, I notice that Figure 122 references Figure 47. Since I know that:\n1. Figure 122 is improving upon Figure 47\n2. They show the same charts (left: curves, right: bar chart)\n3. The color of the bars should be consistent between them\n\nLet me look more carefully at Figure 47 to find which embedded image is the bar chart. Let me search around Figure 47's location (around line 16333):This is still in Section 4.5 Multilingual, before Figure 47. Let me search for images closer to Figure 47. Actually, let me take a different approach. Since Figure 47 appears around line 16333 and the caption says it shows examples, the actual embedded images for Figure 47 should have visual descriptions. But since Figure 47 shows generated output rather than original images, the <figure> blocks ARE the actual content.\n\nGiven the conflicting information (line 19587 says \"Blue\" for OSCarB, line 39070 says \"Red/coral\" for OSCarB), let me check which one is actually related to Figure 122 or Figure 47. Let me look at the context more carefully. The document says Figure 122 improves Figure 47. The prompt in Figure 122 at line 32299 is \"Write python codes to generate similar figures\" and the reference image is at lines 32302-32354 showing the bar chart data.\n\nSince Figure 122's caption (line 32550) says \"Right: the percentage is added to align with the reference image\", the reference image (the prompt image) is the ORIGINAL one that needs improvement. This original should be from Figure 47.\n\nLet me look at the data itself. At line 32312, it says \"OSCARB\" (all caps), and at line 32553, in the GPT-4V output, it says \"OSCarB\" (mixed case). Based on the color descriptions I found, and given that there are multiple embedded images of this chart throughout the document, I need to determine which color appears in the specific prompt image in Figure 122.\n\nGiven the two conflicting descriptions and the difficulty in determining which applies to Figure 122's prompt image specifically, let me check if there's any pattern. The one at line 19587 explicitly lists the color scheme at lines 19514-19518:\n- \"Bright cyan/blue for first method\"  \n- \"Orange-red for second method\"\n\nAnd it says at line 19587: \"Blue bars represent OSCarB\"\n\nThe one at line 39070 says:\n- \"Red/coral bars: Represent OSCarB method\"\n- \"Teal/cyan bars: Represent MiniVLM method\"\n\nThese are OPPOSITE! Let me check the legend text. At line 32312-32314 in Figure 122's prompt image, it shows:\n\"OSCARB\"\n\"MiniVLM (ours)\"\n\nAnd at line 32553-32561 in the GPT-4V output for Figure 122:\n\"OSCarB\"\n\"MiniVLM (ours)\"\n\nLet me check the earlier descriptions for legend position. At line 19505-19506: \"Legend in upper right distinguishes the two methods\"\n\nGiven that standard chart conventions typically list legend items in order, and OSCarB is listed first (before MiniVLM), OSCarB would typically be the first color in the color scheme.\n\nLooking at line 19515-19516:\n- \"Bright cyan/blue for first method\"\n- \"Orange-red for second method\"\n\nIf OSCarB is the first method (listed first in the legend), then it should be \"Bright cyan/blue\" which matches line 19587: \"Blue bars represent OSCarB\"\n\nAnswer: blue", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 238671, "error": null, "extra": {"thread_id": 1999, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e4d50e05962f4bf6af3a2d9a2bff8f0a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 2074, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 113 to identify the music app shown in the computer screen.\n\nLooking at the document, Figure 113 is described in the section on \"Notification Understanding\" (Section 9.8, page 143).\n\nIn the image caption and description for Figure 113, I can see a computer desktop screenshot. Looking at the desktop icons visible in the image, I can see several application icons including:\n- YouTube\n- Music\n- Recycle Bin\n- Tabby Terminal\n\nThe music app shown on the desktop is labeled as \"Music\" in the icon.\n\nAnswer: Music", "citations": [], "input_tokens": 117435, "output_tokens": 134, "cost_micros": 354315, "latency_ms": 8593, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 113 in Section 9.8 (GUI Navigation - Notification Understanding), I can see the computer desktop screen.\n\nOn the desktop, there are several icons visible on the left side, including:\n- YouTube Music\n- Recycle Bin\n- Tabby Terminal\n\nThe music app shown is YouTube Music.\n\nAnswer: YouTube Music", "citations": [], "input_tokens": 133652, "output_tokens": 80, "cost_micros": 402156, "latency_ms": 6578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "youtube music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 113 in the document, I can see a computer desktop screenshot with a notification that says \"Kevin Lin is calling you.\" \n\nThe figure shows several application icons on the desktop and in the taskbar. However, I cannot clearly identify a specific music app from the image description or details provided in the document about this figure.\n\nThe document states: \"The image on the computer screen is a desktop background of a bridge over a body of water. There are several icons on the desktop, including a YouTube icon, a Recycle Bin icon, and a Terminal icon. At the bottom of the screen, there is a taskbar with several application icons, such as Microsoft Edge and File Explorer.\"\n\nAnswer: Not clearly identifiable from the provided information", "citations": [], "input_tokens": 103919, "output_tokens": 160, "cost_micros": 314157, "latency_ms": 7578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "not clearly identifiable from provided information", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 113 in the document, I need to examine the desktop icons shown in the computer screen image.\n\nThe figure shows a Windows desktop with several application icons on the left side. Among these icons, I can see \"YouTube Music\" listed as one of the desktop shortcuts.\n\nAnswer: YouTube Music", "citations": [], "input_tokens": 139294, "output_tokens": 67, "cost_micros": 418887, "latency_ms": 6266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "youtube music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 113 and identify the music app shown in the computer screen.Let me search for \"Figure 113\" in the document.Now let me read the section around line 29414 to see Figure 113 and its description.I can see Figure 113 is referenced, but I need to see the actual image content. Let me look a bit earlier to see the full figure with the image description.I need to look at the actual figure content for Figure 113. Let me search for where the figure starts by looking earlier.Perfect! I found Figure 113. Looking at the content around lines 29326-29350, I can see the computer screen showing desktop icons. Line 29330-29331 clearly shows:\n\n\"You Tube\nMusic\"\n\nThis appears to be \"YouTube Music\" split across two lines in the OCR text. This is the music app shown on the computer screen in Figure 113.\n\nAnswer: YouTube Music", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 52515, "error": null, "extra": {"thread_id": 2000, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_26bb070ab7624a93b3c5c955dc0af2c0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 233, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "youtube music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 106921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and look for the price of the keyboard shown in the screenshot.\n\nLooking at the document, Figure 111 is described in the \"List of Figures\" section as:\n\"Figure 111: Section 9.8: online shopping.\"\n\nLet me search for Figure 111 in the actual content. It appears in Section 9.8 on \"Online Shopping (Ergonomic Keyboard)\".\n\nLooking at the prompt description for Figure 111:\n\"Prompt:\nImagine that you are a robot operating a phone. Like how humans operate the phone, you can move your finger to an icon, click an icon with your finger, scroll down a screen, or type some texts with the keyboard.\n\nYou are asked to shop for an ergonomic keyboard, your budget is between $50 and $100.\n\nPrevious action: \" My next move would be to move my finger to the \"Add to Cart\" button, which is located below the \"Buy New\" button and above the \"Ships from Amazon\" section. I would then click on the \"Add to Cart\" button to add the item to my shopping cart. \"\n\nYou have now completed the previous action as described above. Below is what you see on the phone screen. What will you do next?\"\n\nThe screenshot shows a cart with:\n\"Subtotal $4999\"\n\nand the product listed is:\n\"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540 ...\n$4999\"\n\nThe price shown is $49.99.\n\nAnswer: 49.99", "citations": [], "input_tokens": 117431, "output_tokens": 353, "cost_micros": 357588, "latency_ms": 12750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 111 in the document, which shows a phone screenshot of a shopping cart.\n\nThe screenshot displays:\n- A product: \"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...)\"\n- The price shown is: $49.99\n- There's a subtotal of $49.99\n- The item shows \"In Stock\" status\n\nAnswer: 49.99", "citations": [], "input_tokens": 103915, "output_tokens": 97, "cost_micros": 313200, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and identify the price of the keyboard shown in that screenshot.\n\nLooking through the document for Figure 111...\n\nFound it on page 140:\n\nFigure 111 shows a smartphone screen displaying an Amazon shopping cart. The image shows:\n- A Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...)\n- Price: $49.99\n- The button shows \"Proceed to checkout (1 item)\"\n- Subtotal: $49.99\n\nThe price of the keyboard shown in the screenshot is $49.99.\n\nAnswer: 49.99", "citations": [], "input_tokens": 139290, "output_tokens": 144, "cost_micros": 420030, "latency_ms": 8921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the priority document that was flagged as user-mentioned to find Figure 111 and identify the keyboard price.Let me search for \"Figure 111\" in the document to locate it.Now let me search for where Figure 111 appears in the content with context.Let me read around line 29062 where Figure 111 is located to see the actual content and price information.Perfect! I found Figure 111 which shows a screenshot of shopping for an ergonomic keyboard. The price is clearly shown in the screenshot as $49.99 (displayed as \"$4999\" without the decimal in multiple places in lines 29088 and 29095).\n\nAnswer: 49.99", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 52062, "error": null, "extra": {"thread_id": 2001, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dd7392a113cf44358e67b4c74d3afe31", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 172, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find Figure 107 in the document and look for the battery percentage shown in the screenshot.\n\nLooking through the document for Figure 107...\n\nFound it in Section 9.8 GUI Navigation under \"Online Shopping (Ergonomic Keyboard)\".\n\nIn Figure 107, the screenshot shows a phone interface with the time \"5:27\" in the top left corner, and the battery percentage \"76\" is displayed in the top right corner of the screen.\n\nAnswer: 76", "citations": [], "input_tokens": 133646, "output_tokens": 109, "cost_micros": 402573, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to look at Figure 107 to find the battery percentage shown in the screenshot.\n\nLooking through the document for Figure 107, I can see it's in the \"Sec. 9.8 GUI Navigation\" section under \"Online Shopping (Ergonomic Keyboard)\".\n\nIn Figure 107, the screenshot shows a phone interface with the time \"5:27\" displayed at the top left, and the battery percentage \"76\" shown at the top right of the screen.\n\nAnswer: 76", "citations": [], "input_tokens": 103913, "output_tokens": 112, "cost_micros": 313419, "latency_ms": 6296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 107 in Section 9.8 (GUI Navigation - Online Shopping), I need to find the battery percentage shown in the screenshot.\n\nIn the top-right corner of the screenshot shown in Figure 107, the status bar displays:\n- Time: 5:27\n- Battery percentage: 76\n\nAnswer: 76", "citations": [], "input_tokens": 139288, "output_tokens": 76, "cost_micros": 419004, "latency_ms": 6592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find Figure 107 and identify the battery percentage shown in the screenshot.Let me search for \"Figure 107\" in the document to locate it quickly.Now let me search for the specific line with more context:Let me read around line 28504 to see Figure 107:I can see Figure 107 shows a phone screenshot. I can see \"5:27\" as the time at line 28507, and at line 28511 I see \"76\". Let me check if this is the battery percentage. Looking at the structure, it appears to show:\n- 5:27 (time)\n- 76 (likely battery percentage)\n\nLet me read a bit more to see if there's additional context or if the battery percentage is shown elsewhere:Based on the document content, Figure 107 shows a phone screenshot with the time \"5:27\" at the top, and the number \"76\" appears right after it. In typical phone screenshots, the battery percentage is displayed in the status bar at the top of the screen, usually near the time. The number 76 appears to be the battery percentage.\n\nAnswer: 76", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 64703, "error": null, "extra": {"thread_id": 2002, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_281e7f398f814d8a9e02d3b21485af45", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 226, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 110562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to look at Figure 101 to find information about the Politico website news cover.\n\nLooking at the document, Figure 101 is described in Section 9.8 \"GUI Navigation - Web Browsing (Today's News)\" on page 131.\n\nThe figure shows a Google search results page with news stories. The prompt asks GPT-4V about what happens after reading the first news article. Looking at the \"Top stories\" section in the image, I can see several news sources listed including:\n\n- **Politico**: \"State Department doubles down after Zelenskyy slams reluctance...\"\n- CNN: \"Catastrophic flooding swamped Vermont's capital...\"\n- Reuters: \"US says it no longer deems Donald Trump immune from E. Jean...\"\n- DCist: \"D.C. Council Approves Emergency Crime Bill\"\n- The New York Times: \"Hard Right Presses Culture War Fights on Defense Bill...\"\n- NPR: \"The Anthropocene began in 1950s, scientists say\"\n\nThe Politico headline mentions \"Zelenskyy,\" which refers to Volodymyr Zelenskyy, the President of Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 117439, "output_tokens": 268, "cost_micros": 356337, "latency_ms": 11250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 101 in Section 9.8 (Web Browsing - Today's News), I can see the Google search results page showing several news headlines.\n\nThe top news item from Politico shows: \"State Department doubles down after Zelenskyy slams reluctance...\"\n\nThis refers to Zelenskyy, who is the President of Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 133656, "output_tokens": 87, "cost_micros": 402273, "latency_ms": 6625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 101 in the document, I can see a screenshot showing a news article from Politico. The article is titled \"State Department doubles down after Zelenskyy slams reluctance...\" \n\nThe article discusses Ukrainian President Volodymyr Zelenskyy and Ukraine's desire to join NATO. Zelenskyy is from Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 103923, "output_tokens": 85, "cost_micros": 313044, "latency_ms": 7108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 101, I can see a Google search results page showing news articles. The figure description mentions \"Top stories\" and lists several news sources including Politico.\n\nThe first story listed under Politico is: \"State Department doubles down after Zelenskyy slams reluctance...\"\n\nThe article is about Zelenskyy, who is the President of Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 139298, "output_tokens": 88, "cost_micros": 419214, "latency_ms": 8000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document that was marked as most relevant (2309.17421v2.pdf.xml) to find Figure 101 and identify the information about the Politico website cover.Let me search for \"Figure 101\" in the document to find the specific location:Let me search for \"politico\" to find the relevant section:I can see mentions of Politico with information about Zelenskyy. Let me search more specifically for \"Figure 101\":Great! I found the reference to Figure 101 at line 27715. Let me read that section:Let me look earlier in the document to see the actual Figure 101 image content. I need to search backwards from line 27715 to find the visual description:Now I can see the context. Figure 101 shows a Google News page with various news articles. Looking at the content around lines 27625-27631, I can see:\n\n- Politico news source\n- Article title: \"State Department doubles down after Zelenskyy slams reluctance...\"\n\nThe article is about Zelenskyy (also spelled Zelensky in other parts of the document), who is the President of Ukraine. Let me verify this is correct by looking at the article content earlier:\n\nFrom lines 27419-27421 and 27445-27446, I can confirm:\n- \"State Department doubles down after Zelenskyy slams reluctance on Ukraine's entry to NATO\"\n- \"In a statement posted to Twitter on Tuesday, Ukrainian President Volodymyr Zelenskyy took issue with NATO's reluctance...\"\n\nSo the person on the cover of the news on the Politico website in Figure 101 is Zelenskyy (Volodymyr Zelenskyy), who is from Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 84875, "error": null, "extra": {"thread_id": 2003, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6c36693d9bfb44369693c25b87e59141", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 305, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at Figure 89 to identify the brand name of the coffee machine.\n\nLooking at the document, Figure 89 is in Section 9.7 \"Embodied Agent\" under \"Operating Machine\". The figure shows a coffee machine's operating panel with buttons and includes text that mentions operating menu instructions.\n\nIn the image shown in Figure 89, I can see a coffee machine operating panel. Looking at the visible text and branding on the machine, I can see \"Keurig\" appears to be the brand name based on the styling and layout typical of Keurig coffee makers.\n\nAnswer: Keurig", "citations": [], "input_tokens": 117434, "output_tokens": 139, "cost_micros": 354387, "latency_ms": 7641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "keurig", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 89 in Section 9.7 (Embodied Agent - Operating Machine), I can see the coffee machine's operating panel in the image.\n\nThe brand name visible on the coffee machine is \"Keurig\".\n\nAnswer: Keurig", "citations": [], "input_tokens": 133651, "output_tokens": 61, "cost_micros": 401868, "latency_ms": 6500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "keurig", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 89 in Section 9.7 (Embodied Agent - Operating Machine), I can see the coffee machine operating panel in the image.\n\nThe brand name visible on the coffee machine is \"Keurig\".\n\nAnswer: Keurig", "citations": [], "input_tokens": 103918, "output_tokens": 60, "cost_micros": 312654, "latency_ms": 6092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "keurig", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 89 in Section 9.7, which shows the coffee machine operating panel with button menus, I can see the brand name displayed on the machine.\n\nAnswer: JoooDeee", "citations": [], "input_tokens": 139293, "output_tokens": 47, "cost_micros": 418584, "latency_ms": 7016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "jooodeee", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find Figure 89 and identify the coffee machine brand.Let me search for \"Figure 89\" in the document:Now let me search more specifically to find where Figure 89 appears:Let me read the section around line 25761 to see Figure 89 and its image:I can see Figure 89 references a coffee machine. Let me look a bit earlier to see the actual image content:Let me continue reading to see the menu and coffee machine images:Let me continue to find the actual coffee machine image with the brand visible:I see that line 25763 shows \"JoooDeee\" which appears to be OCR text from the coffee machine image. Let me look at the earlier Figure 88 section which also shows the coffee machine, as it might have clearer brand information:I can see the OCR text on line 25609 and 25763 shows \"JoooDeee\". Let me search for Figure 88 which is mentioned just before to see if there's more context about the brand:Let me look before line 25584 to see if there's an image with better OCR or visual description for Figure 88:Let me search more specifically around where Figure 88 might have its embedded image:Let me look earlier to find where Figure 88 actual image appears. Let me search backwards from where Figure 88 was mentioned:Let me search more systematically. Let me look for where the coffee machine images are actually embedded. Let me search around where both Figure 88 and 89 would appear together:Let me try a different approach and look for pages around 116-117 where Figures 88 and 89 are mentioned, searching for embedded coffee machine images. Let me search for \"Joo\" which appeared in the OCR text:Great! Let me look at line 34358 and chunk 155184 which has a better visual description with \"JoooDeee\":Perfect! I found it. According to the OCR text on line 34358, the coffee machine has \"JoooDeee\" visible on it. Looking at the visual description starting at line 34363, it describes the device as a water dispenser/cooler, but based on the context of the document discussing coffee machines and the OCR text showing \"JoooDeee\", this appears to be the brand name visible on the coffee machine in Figure 89.\n\nHowever, \"JoooDeee\" seems like it might be an OCR misreading. Let me check if there's any other reference that might clarify the brand. Looking at the OCR pattern \"JoooDeee\", this could potentially be \"Jura\" (a well-known coffee machine brand) misread by OCR, but I should report what the document actually says.\n\nBased on the document, the OCR text from the coffee machine image shows \"JoooDeee\" with \"DI\" below it.\n\nAnswer: JoooDeee", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 148609, "error": null, "extra": {"thread_id": 2004, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f9e89fe20d454f6c90a127387a939523", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 624, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "jooodeee", "normalised_gold": "jooodeee"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 on page 1, I need to identify the relation arrows (edges) in the knowledge graph and determine which ones do not point to specific leaf nodes.\n\nThe figure shows a knowledge graph centered around Artemisia Gentileschi (Q212657) and Orazio Gentileschi (Q367560). The relations shown include:\n\n- **child** (from Orazio to Artemisia)\n- **father** (from Artemisia to Orazio)\n- **citizen** (from both to Italy Q38)\n- **occupation** (pointing to \"Painter\")\n- **style** (from Artemisia to Baroque Q37853)\n- **movement** (from Artemisia to Baroque Q37853)\n- **place of birth** (from Artemisia to Rome Q220)\n- **member** (from Artemisia, appears to point to another entity)\n\nThe arrows that do NOT point to specific leaf nodes (i.e., they point to other entities that are themselves nodes in the graph rather than terminal values) are:\n\n- child (points to another person entity)\n- father (points to another person entity)\n\nAnswer: child, father", "citations": [], "input_tokens": 60197, "output_tokens": 266, "cost_micros": 184581, "latency_ms": 12796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "child, father", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify which relation arrows do not point to specific leaf nodes (entities with specific values or names).\n\nLet me examine each relation arrow from the entities shown:\n\nFrom Q367560 (Orazio Gentileschi):\n- occupation \u2192 Q1028181 (Painter) - points to a class/type node\n- movement \u2192 Q37853 (Baroque) - points to a class/type node\n- child \u2192 Artemisia Gentileschi Q212657 - points to a specific person\n\nFrom Artemisia Gentileschi Q212657:\n- father \u2192 Orazio Gentileschi Q367560 - points to a specific person\n- birth \u2192 [date shown] - points to specific value\n- place of birth \u2192 [location] - points to specific value\n- Citizen \u2192 Italy Q38 - points to a specific country\n- occupation \u2192 Q1028181 (Painter) - points to a class/type node\n- style \u2192 Q37853 (Baroque) - points to a class/type node\n- member \u2192 [organization] - appears to point to a specific entity\n\nThe relations that point to class/type nodes rather than specific leaf nodes are:\n- occupation (points to the class \"Painter\")\n- movement/style (points to the class \"Baroque\")\n\nAnswer: occupation, movement, style", "citations": [], "input_tokens": 22214, "output_tokens": 311, "cost_micros": 71307, "latency_ms": 7718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "list_set", "normalised_pred": "movement, occupation, style", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the relation arrows (edges) in the knowledge graph that do not point to specific leaf nodes (literal values).\n\nIn the knowledge graph shown in Figure 1, I can see the following relationships:\n\n- \"Painter\" (Q1028181) - this is an entity node, not a leaf\n- \"Baroque\" (Q37853) - this is an entity node, not a leaf\n- \"Italy\" (Q38) - this is an entity node, not a leaf\n- \"Rome\" (Q212657) - this is an entity node, not a leaf\n- Orazio Gentileschi (Q367560) - this is an entity node, not a leaf\n- Artemisia Gentileschi - the central entity\n\nThe relations that point to these non-leaf entity nodes (rather than to literal values like dates or strings) are:\n\n- occupation (pointing to Painter)\n- style (pointing to Baroque)\n- movement (pointing to Baroque)\n- citizen (pointing to Italy)\n- place of birth (pointing to Rome)\n- father (pointing to Orazio Gentileschi)\n- child (pointing to Artemisia Gentileschi)\n- member (appears to be a relation as well)\n\nAnswer: occupation, style, movement, citizen, place of birth, father, child, member", "citations": [], "input_tokens": 25539, "output_tokens": 310, "cost_micros": 81267, "latency_ms": 9266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "child, citizen, father, member, movement, occupation, place of birth, style", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to identify the relation arrows in the Knowledge Graph that do not point to leaf nodes (specific values).\n\nIn the Knowledge Graph shown, the relations are labeled on the arrows connecting entities. Let me trace each relation:\n\n- \"occupation\" \u2192 points to \"Painter\" (Q37853) - this is a leaf node\n- \"movement\" \u2192 points to \"Baroque\" (Q1028181) - this is a leaf node\n- \"style\" \u2192 appears to connect nodes but the endpoint is not clearly a specific leaf value\n- \"place of birth\" \u2192 points to a specific location\n- \"child\" \u2192 points to \"Artemisia\" (another person entity, not a leaf value)\n- \"father\" \u2192 points to \"Orazio\" (another person entity, not a leaf value)\n- \"member\" \u2192 connects to citizenship information\n- \"citizen\" \u2192 points to \"Italy\" (a leaf node)\n\nThe relations that point to other entities rather than specific attribute values (leaf nodes) are:\n\nAnswer: child, father", "citations": [], "input_tokens": 17550, "output_tokens": 231, "cost_micros": 56115, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "child, father", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to identify the relation arrows that do not point to specific leaf nodes (entity or property nodes).\n\nFrom the Knowledge Graph Context cloud, I can see the following relationships:\n\n- E1 (OrazioGentileschi) -- occupation --> P1 (Painter) - points to leaf node P1\n- E1 -- movement --> P2 (Baroque) - points to leaf node P2\n- E1 -- birth --> E1 - points back to itself, not a leaf node\n- E1 -- child --> E2 - points to E2, which has further connections\n- E2 (ArtemisiaGentileschi) -- father --> E1 - points to E1, which has further connections\n- E2 -- occupation --> P1 - points to leaf node P1\n- E2 -- style --> P2 - points to leaf node P2\n- E2 -- place of birth --> E3 - points to E3, which has further connections\n- E2 -- member --> E2 - points back to itself, not a leaf node\n- E2 -- Citizen --> E3 - points to E3, which has further connections\n- E1 -- Citizen --> E3 - points to E3, which has further connections\n\nThe relations that do not point to specific leaf nodes are those pointing to entities that have their own outgoing connections or pointing back to themselves:\n\nAnswer: birth, child, father, place of birth, member, Citizen", "citations": [], "input_tokens": 23598, "output_tokens": 341, "cost_micros": 75909, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "birth, child, citizen, father, member, place of birth", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and identify which relation arrows do not point to specific leaf nodes.Let me search for \"Figure 1\" in the document to locate it:Let me read more to see the complete Figure 1 description:I can see Figure 1 shows a knowledge graph. Let me look more carefully at the structure. The text shows entities and relations. Let me search for more context about the graph structure:Based on the text extracted from Figure 1, I can see the knowledge graph structure. The entities and relations listed are:\n\n- Q1028181 (Painter)\n- Q37853 (Baroque)\n- Q367560 (Orazio Gentileschi)\n- Q212657 (with \"Artemisia Gentileschi Citizen\" below it)\n- Q38 (Italy)\n\nRelations shown:\n- occupation\n- style\n- movement\n- occupation (appears twice)\n- child\n- birth\n- place of birth\n- father\n- member\n\nLooking at the structure, the relations that appear to connect between entities but may not point to specific leaf nodes would be those that connect to higher-level entities or categories. From the layout, I need to identify which relation arrows don't terminate at leaf nodes.\n\nLet me look for any figures or images that might show the actual graph structure more", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 59797, "error": null, "extra": {"thread_id": 2005, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c3fdd69258514ce3917c73b31fe40d3e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 249, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "let me look for any figures or images that might show actual graph structure more", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "I need to locate Figure 5 in the document and examine the lines shown in it.\n\nLooking through the document, I can find Figure 5 on page 8. The figure is titled \"Retrieval Analysis\" and shows citation evaluation (Micro) of generated texts using knowledge graphs with retrieval accuracy 100 (gold), 80, 60, 40, and 20.\n\nThe figure contains four lines:\n- A yellow line (precision)\n- A green line (recall)\n- A blue line (F1_Score)\n- A red line (correctness)\n\nLooking at the graph, I need to identify which line does not intersect with any other line.\n\nThe red line (correctness) starts at the highest point and remains consistently above all other lines throughout the graph. It descends from left to right but never crosses or intersects with any of the other three lines (yellow, green, or blue).\n\nAnswer: red", "citations": [], "input_tokens": 60196, "output_tokens": 207, "cost_micros": 183693, "latency_ms": 12141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "red", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 in the document, which shows \"Retrieval Analysis\" with multiple lines plotted against Retrieval Accuracy on the x-axis:\n\nThe lines shown are:\n- precision (yellow)\n- recall (green)\n- F1_Score (appears to be another color)\n- correctness (red/orange)\n\nThe correctness line (red/orange colored line at the top of the graph) maintains a consistently high value around 80-90 and does not intersect with any of the other lines throughout the plot.\n\nAnswer: red (or orange/correctness line)", "citations": [], "input_tokens": 22213, "output_tokens": 134, "cost_micros": 68649, "latency_ms": 6108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "red or orange correctness line", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 titled \"Retrieval Analysis\", I need to identify which line does not intersect with any other line.\n\nThe figure shows four lines representing different metrics (precision, recall, F1_Score, and correctness) plotted against retrieval accuracy from gold (100) to 20.\n\nExamining the lines:\n- The yellow line (precision) intersects with other lines\n- The green line (recall) intersects with other lines  \n- The blue line (F1_Score) intersects with other lines\n- The red line (correctness) stays at the top of the graph and does not intersect with any of the other three lines\n\nAnswer: red", "citations": [], "input_tokens": 25538, "output_tokens": 150, "cost_micros": 78864, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "red", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 in the document, I need to identify the line that has no intersection with any other line.\n\nFigure 5 shows citation evaluation metrics (Micro) with retrieval accuracy from 100 (gold) down to 20. There are four lines in the graph representing:\n- Correctness (blue)\n- Precision (yellow) \n- Recall (green)\n- F1 (red)\n\nExamining the lines, the blue line (Correctness) remains at the top of the graph and does not intersect with any of the other three lines throughout the entire range of retrieval accuracy values.\n\nAnswer: Blue", "citations": [], "input_tokens": 17549, "output_tokens": 144, "cost_micros": 54807, "latency_ms": 5610, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 in the document, I can see a chart showing citation evaluation metrics (precision, recall, F1_score, and correctness) plotted against retrieval accuracy levels (20, 40, 60, 80, and gold/100).\n\nExamining the lines in the chart:\n- The yellow line (precision) intersects with other lines\n- The green line (recall) intersects with other lines\n- The blue line (F1_score) intersects with other lines\n- The red line (correctness) runs above all other lines and does not intersect with any of them\n\nAnswer: red", "citations": [], "input_tokens": 23597, "output_tokens": 138, "cost_micros": 72861, "latency_ms": 5000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "red", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 5 and identify the line with no intersections.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 25967, "error": null, "extra": {"thread_id": 2006, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a9d66758d1df48679fe6def6615062f0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 40, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i ll read document to find figure 5 and identify line with no intersections", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to count how many tables in the document include \"F1\" as a metric.\n\nLet me go through the document systematically:\n\n1. Table 1 (page 3) - Shows an example data entry, not evaluation metrics\n2. Table 2 (page 3) - Human Evaluation on dataset quality - no F1\n3. Table 3 (page 7) - Citation Quality evaluation - shows \"F1.\" column (both Micro and Macro)\n4. Table 4 (page 7) - Text quality evaluation - no F1\n5. Table 5 (page 7) - Citation quality with knowledge removed - shows \"F1.\" column\n6. Table 6 (page 8) - Human evaluation alignment - no F1\n7. Table 7 (page 9) - Comparison of General and Specific questions - shows \"F1.\" column\n8. Table 8 (page 15) - Human annotation instructions - no F1\n9. Tables 9-17 (pages 16-21) - Prompts and instructions - no F1\n10. Table 18 (page 22) - Evolutionary question generation example - no F1\n11. Tables 19-20 (pages 23-24) - Example outputs - no F1\n\nAdditionally, I can see:\n- Figure 4 (page 8) shows \"F1_Score\" in a graph\n- Figure 5 (page 8) shows \"F1_Score\" in a graph\n\nBut the question asks specifically about tables, not figures.\n\nTables with F1 metric:\n- Table 3\n- Table 5\n- Table 7\n\nAnswer: 3", "citations": [], "input_tokens": 60181, "output_tokens": 381, "cost_micros": 186258, "latency_ms": 25483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all tables in the document that include \"F1\" as a metric.\n\nLet me go through the tables systematically:\n\n1. Table 1 (Example data entry) - No F1 metric\n2. Table 2 (Human Evaluation on BioKaLMA dataset) - No F1 metric\n3. Table 3 (Citation Quality) - Yes, has \"F1.\" columns (both Micro and Macro)\n4. Table 4 (Text quality evaluation) - No F1 metric\n5. Table 5 (Citation quality with removed knowledge) - Yes, has \"F1.\" column\n6. Table 6 (Human Evaluation on alignment) - No F1 metric\n7. Table 7 (Comparison of General and Specific questions) - Yes, has \"F1.\" column\n\nNow let me check if there are more tables in the appendices:\n\n8. Table 8 (Human Evaluation Instructions) - No F1 metric\n9. Table 9 (Instruction for evolutionary question construction) - No F1 metric\n10. Table 10 (Instruction for round 2-5) - No F1 metric\n11. Table 11 (General question generation) - No F1 metric\n12. Table 12 (Specific question generation) - No F1 metric\n13. Table 13 (Answer generation instruction) - No F1 metric\n14. Table 14 (GPT-EVAL Coherence) - No F1 metric\n15. Table 15 (GPT-EVAL Consistency) - No F1 metric\n16. Table 16 (GPT-EVAL Fluency) - No F1 metric\n17. Table 17 (GPT-EVAL Relevance) - No F1 metric\n18. Table 18 (Evolutionary question construction example) - No F1 metric\n19. Table 19 (Example from ChatGPT) - No F1 metric\n20. Table 20 (Example from GPT-4) - No F1 metric\n\nTables with F1 metric: Table 3, Table 5, and Table 7\n\nAnswer: 3", "citations": [], "input_tokens": 22198, "output_tokens": 502, "cost_micros": 74124, "latency_ms": 9109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to count all tables that include \"F1\" as a metric.\n\nLet me go through the document systematically:\n\n1. Table 1: Shows an example data entry - no F1 metric\n2. Table 2: Human Evaluation on BioKaLMA dataset - no F1 metric\n3. Table 3: Citation Quality evaluation - includes \"F1.\" column (both Micro and Macro F1)\n4. Table 4: Evaluation on generated text quality - no F1 metric\n5. Table 5: Citation quality evaluation with knowledge removed - includes \"F1.\" column\n6. Table 6: Human Evaluation on text-citation alignment - no F1 metric\n7. Table 7: Comparison of evaluation results on General and Specific question setting - includes \"F1.\" column\n8. Table 8: Instruction for human annotators - not a data table with metrics\n9. Tables 9-17: These are prompt/instruction tables - no F1 metrics\n10. Table 18: Example of generated questions - no F1 metric\n11. Tables 19-20: Examples of attributed answers - no F1 metrics\n\nTables with F1 metric:\n- Table 3\n- Table 5\n- Table 7\n\nAnswer: 3", "citations": [], "input_tokens": 25523, "output_tokens": 289, "cost_micros": 80904, "latency_ms": 8578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to search through the document for tables that include \"F1\" as a metric.\n\nLet me identify all tables in the document:\n\n1. Table 1: Shows an example data entry in BioKaLMA - does not contain F1 metric\n2. Table 2: Human Evaluation on BioKaLMA dataset - contains metrics: Authenticity, Relevance, Naturalness, Significance - no F1\n3. Table 3: Citation Quality evaluation - contains columns including \"F1.\" in both Micro and Macro sections - **YES, contains F1**\n4. Table 5: Citation quality evaluation for generated texts - contains columns: Corr., Prec., Rec., F1. - **YES, contains F1**\n5. Table 6: Result of Human Evaluation on text-citation alignment - contains Alignment and Human Avg. - no F1\n6. Table 7: Comparison between general and specific questions - contains columns including F1. - **YES, contains F1**\n7. Table 8: Instruction for human annotators - no metrics table\n8. Table 9: Not clearly visible as a numbered table in the content shown\n9. Table 10: Instruction and demonstration - not a metrics table\n10. Table 11: Instruction and demonstration - not a metrics table\n11. Table 12: Instruction and demonstration - not a metrics table\n12. Table 13: Instruction for answer generation - not a metrics table\n13. Tables 14-17: Evaluation criteria descriptions - not metrics tables\n14. Table 18: Example of generated questions - not a metrics table\n15. Table 19: Stephen Crane biographical information - not a metrics table with F1\n\nCounting tables with F1 metric:\n- Table 3: YES\n- Table 5: YES\n- Table 7: YES\n\nAnswer: 3", "citations": [], "input_tokens": 17534, "output_tokens": 420, "cost_micros": 58902, "latency_ms": 11342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to count all tables that include \"F1\" as a metric.\n\nLet me go through the document systematically:\n\n1. **Table 1**: Shows an example data entry - no F1 metric\n2. **Table 2**: Human Evaluation metrics (Authenticity, Relevance, Naturalness, Significance) - no F1\n3. **Table 3**: Citation Quality evaluation - includes \"Micro F1.\" and \"Macro F1.\" columns - **YES**\n4. **Table 4**: Text quality evaluation (Coh., Con., Flu., Rel.) - no F1\n5. **Table 5**: Citation quality with removed knowledge - includes \"F1.\" column - **YES**\n6. **Figure 4**: Shows precision, recall, and F1_Score for [NA] - **YES** (this is labeled as a figure but formatted as a table)\n7. **Figure 5**: Citation evaluation with retrieval accuracy - includes \"F1_score\" column - **YES** (also labeled as figure but formatted as a table)\n8. **Table 6**: Human evaluation on alignment - no F1\n9. **Table 7**: Comparison of General and Specific questions - includes \"Citation Eval. F1.\" column - **YES**\n10. **Table 8**: Human annotation instructions - no F1\n11. Tables 9-17: Various prompts and instructions - no F1 metrics\n12. **Table 18**: Example of evolutionary question construction - no F1\n13. **Table 19**: Example output from ChatGPT - no F1\n14. **Table 20**: Example output from GPT-4 - no F1\n\nTables/Figures with F1 metric:\n- Table 3\n- Table 5\n- Figure 4 (table format)\n- Figure 5 (table format)\n- Table 7\n\nAnswer: 5", "citations": [], "input_tokens": 23582, "output_tokens": 429, "cost_micros": 77181, "latency_ms": 11358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned (2310.05634v2.pdf.xml) to find all tables that include \"F1\" as a metric.Let me continue reading the document to search for all tables:Now let me search for \"F1\" to find which tables contain this metric:Let me read the specific sections containing the tables to identify which ones include F1 as a metric:Now let me check all the other tables to see if they contain F1. Let me search more systematically:Let me also check Table 6:Now let me check the tables in the appendix to ensure I haven't missed any:Now let me scan through all the remaining content to ensure I have checked all tables. Let me list out what I've found so far:\n\nTables with F1:\n1. Table 3 - Has F1 (both Micro and Macro F1)\n2. Table 5 - Has F1 \n3. Table 7 - Has F1\n\nTables without F1:\n1. Table 1 - Does not have F1 (has Authenticity, Relevance, Naturalness, Significance)\n2. Table 2 - This appears to be just a caption reference, same as Table 1\n3. Table 4 - Does not have F1 (has Coh., Con., Flu., Rel.)\n4. Table 6 - Does not have F1 (has Alignment and Human Avg.)\n\nLet me continue checking the rest of the tables (8-20) mentioned in the document:Based on my thorough examination of the document, I have identified all tables and checked which ones include \"F1\" as a metric. Let me verify my findings:\n\nTables with F1 as a metric:\n1. **Table 3**: Citation Quality evaluation - includes both Micro F1 and Macro F1 columns\n2. **Table 5**: Citation quality evaluation for generated texts using a KG with N pieces of knowledge removed - includes F1 column\n3. **Table 7**: Comparison of evaluation results on General and Specific question setting - includes F1 in the Citation Evaluation columns\n\nTables without F1 as a metric:\n- Table 1: Example data entry (no F1)\n- Table 2: Human Evaluation on BioKaLMA dataset (no F1)\n- Table 4: Evaluation on generated text quality (no F1, only Coh., Con., Flu., Rel.)\n- Table 6: Result of Human Evaluation on text-citation alignment (no F1, only Alignment and Human Avg.)\n- Tables 8-20: These are instruction tables, prompt demonstrations, and examples (no F1 metrics)\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 116094, "error": null, "extra": {"thread_id": 2007, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d5e6c0fe45c9431aaf087f3d70bed847", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 545, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 3 in the paper, which shows the citation quality evaluation results for different models:\n\nThe alignment scores for ChatGPT at different temperatures are:\n- ChatGPT (0.1): 85.9\n- ChatGPT (0.5): 84.5\n- ChatGPT (0.9): 84.1\n\nThe highest alignment score is at temperature 0.1.\n\nAnswer: 0.1", "citations": [], "input_tokens": 60189, "output_tokens": 103, "cost_micros": 182112, "latency_ms": 22187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the alignment scores for ChatGPT at different temperatures from Table 3.\n\nFrom Table 3 (Citation Quality evaluation), the alignment scores for ChatGPT are:\n- ChatGPT (0.1): 85.9(2.5)\n- ChatGPT (0.5): 84.5(1.1)\n- ChatGPT (0.9): 84.1(0.5)\n\nThe highest alignment score is 85.9 at temperature 0.1.\n\nAnswer: 0.1", "citations": [], "input_tokens": 22206, "output_tokens": 129, "cost_micros": 68553, "latency_ms": 5421, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the alignment scores for ChatGPT at different temperatures from the paper.\n\nLooking at Table 3 in the paper, which shows \"Citation Quality OpenAI models and LLaMA family models\" including alignment scores:\n\n- ChatGPT (0.1): Alignment = 85.9(2.5)\n- ChatGPT (0.5): Alignment = 84.5(1.1)\n- ChatGPT (0.9): Alignment = 84.1(0.5)\n\nThe highest alignment score among the ChatGPT models is at temperature 0.1 with a score of 85.9.\n\nAnswer: 0.1", "citations": [], "input_tokens": 25531, "output_tokens": 159, "cost_micros": 78978, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which presents the citation quality evaluation results including alignment scores for different models:\n\nFor ChatGPT at different temperatures:\n- ChatGPT (0.1): Alignment = 85.9\n- ChatGPT (0.5): Alignment = 84.5\n- ChatGPT (0.9): Alignment = 84.1\n\nThe highest alignment score for ChatGPT is at temperature 0.1 with a score of 85.9.\n\nAnswer: 0.1", "citations": [], "input_tokens": 17542, "output_tokens": 125, "cost_micros": 54501, "latency_ms": 5592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the alignment scores for ChatGPT at different temperatures from Table 3.\n\nLooking at Table 3 (Citation Quality evaluation), I can see the alignment scores for different ChatGPT temperatures:\n\n- ChatGPT (0.1): 85.9(2.5)\n- ChatGPT (0.5): 84.5(1.1)\n- ChatGPT (0.9): 84.1(0.5)\n\nThe highest alignment score is 85.9 for ChatGPT at temperature 0.1.\n\nAnswer: 0.1", "citations": [], "input_tokens": 23590, "output_tokens": 139, "cost_micros": 72855, "latency_ms": 4796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about temperature and ChatGPT alignment scores.Let me search for temperature settings and ChatGPT results more specifically:Now let me read the specific section where temperature experiments with ChatGPT are discussed:Perfect! I found the table with alignment scores for ChatGPT at different temperatures. From Table 3, I can see:\n\n- ChatGPT (0.1): Alignment score = 85.9(2.5)\n- ChatGPT (0.5): Alignment score = 84.5(1.1)\n- ChatGPT (0.9): Alignment score = 84.1(0.5)\n\nThe highest alignment score for ChatGPT is at temperature 0.1 with a score of 85.9.\n\nAnswer: 0.1", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 78952, "error": null, "extra": {"thread_id": 2008, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_826b46b364ee4f52853639bf7b854cb3", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 182, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the Dataset Construction section (Section A and subsections A.1, A.2, A.3), I need to compare the length of descriptions for each step:\n\n**A.1 Person Selection**: Approximately 150 words - describes selecting name pairs from the biographical database and filtering criteria.\n\n**A.2 Name Disambiguation**: Approximately 200 words - describes using SPARQL queries to map names to WikiData entities, handling duplicate names, and extracting one-hop sub-graphs.\n\n**A.3 Evolutionary Question Generation**: Approximately 450-500 words - provides extensive detail about the iterative process, knowledge injection, specificity and coherence scoring, the formula for scoring, and implementation details across five iterations.\n\nThe Evolutionary Question Generation step clearly contains the most detailed explanation, including mathematical formulas, specific algorithms for knowledge selection, and detailed descriptions of the iterative process.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 60191, "output_tokens": 204, "cost_micros": 183633, "latency_ms": 21312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the Dataset Construction section (Appendix A), I need to compare the lengths of descriptions for each step:\n\n1. **Person Selection (A.1)**: Describes selecting name pairs from biographical database - relatively brief, about 1 paragraph\n\n2. **Name Disambiguation (A.2)**: Describes mapping names to WikiData entities using SPARQL queries - moderate length, about 1-2 paragraphs\n\n3. **Evolutionary Question Generation (A.3)**: Describes the iterative process of extending paragraphs, selecting knowledge, calculating scores with formulas, and generating questions - significantly longer, spanning multiple paragraphs with detailed explanations of specificity scores, coherence scores, the scoring formula, iteration process, and implementation details\n\nThe Evolutionary Question Generation step clearly contains the most extensive description with the most detail about the methodology.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 22208, "output_tokens": 193, "cost_micros": 69519, "latency_ms": 6781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "To determine which step takes the most words to describe in the dataset construction process, I need to examine the three main steps described in Section 2.2 and detailed in Appendix A:\n\n1. **Person Selection** (Section A.1): This section describes filtering and selecting people from the biographical database. It's relatively brief.\n\n2. **Name Disambiguation** (Section A.2): This section explains mapping names to WikiData entities using SPARQL queries and extracting one-hop sub-graphs. It's moderately detailed.\n\n3. **Evolutionary Question Generation** (Section A.3): This section provides extensive detail about the iterative process of extending paragraphs, knowledge selection algorithms, scoring mechanisms (specificity and coherence scores), and the multiple rounds of generation with specific templates and prompts.\n\nThe Evolutionary Question Generation section is significantly longer and more detailed than the other two steps, including mathematical formulas, multiple iteration explanations, and extensive procedural details.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 25533, "output_tokens": 217, "cost_micros": 79854, "latency_ms": 7764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at section 2.2 Dataset Construction and Appendix A Dataset Construction, the three main steps are:\n\n1. Person Selection\n2. Name Disambiguation  \n3. Evolutionary Question Generation\n\nComparing the word counts in Appendix A where these steps are described in detail:\n\n- A.1 Person Selection: approximately 150 words\n- A.2 Name Disambiguation: approximately 180 words\n- A.3 Evolutionary Question Generation: approximately 130 words (before the subsections)\n\nHowever, the Evolutionary Question Generation section has significantly more content when including all the detailed prompt tables (Tables 9-12) and examples that follow it, making it the most extensively described step overall.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 17544, "output_tokens": 165, "cost_micros": 55107, "latency_ms": 5703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the dataset construction section (2.2), I need to compare the length of descriptions for each of the three steps:\n\n1. **Person Selection** (A.1): Approximately 150 words\n2. **Name Disambiguation** (A.2): Approximately 200 words  \n3. **Evolutionary Question Generation** (A.3): Approximately 450+ words (includes detailed explanation of the iterative process, knowledge selection algorithm with formula, scoring mechanisms, etc.)\n\nThe Evolutionary Question Generation step is described in significantly more detail than the other two steps, taking up the most space in the paper.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 23592, "output_tokens": 143, "cost_micros": 72921, "latency_ms": 5844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about dataset construction steps and compare their descriptions.Let me search for \"dataset construction\" in the document:Let me look at the detailed dataset construction section:Let me read the Dataset Construction section starting at line 506:Good! I can see the three steps mentioned at line 521-523. Now let me read the Appendix A which has the detailed dataset construction process:Perfect! I can now see the three steps of dataset construction in detail. Let me count the words in each section to determine which takes the most words to describe.\n\nLet me read each section more carefully to count:Now let me see where the Evolutionary Question Generation section ends:Perfect! Now I have all three sections. Let me analyze the word count for each step:\n\n**A.1 Person Selection** (lines 2089-2110): Approximately 22 lines of content\n**A.2 Name Disambiguation** (lines 2113-2145): Approximately 33 lines of content  \n**A.3 Evolutionary Question Generation** (lines 2148-2222): Approximately 75 lines of content\n\nBy visual inspection and line count, the \"Evolutionary Question Generation\" section is significantly longer than the other two steps. It contains detailed explanations about:\n- The evolutionary approach\n- Iteration process\n- Knowledge annotation\n- Specificity and coherence scoring\n- Formula for scoring\n- Implementation details\n\nThis is clearly the step that takes the most words to describe.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 78750, "error": null, "extra": {"thread_id": 2009, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1b0d2d82267f4e5eab9a849c3924465e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 309, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document's definition of the \"Conscious Incompetence\" setting in Section 2.1:\n\n\"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph G.\"\n\nAnswer: if it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 22222, "output_tokens": 71, "cost_micros": 67731, "latency_ms": 4952, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "if it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "According to the document, a sentence can map to both [NA] and a list of sub-graph knowledge when:\n\nAnswer: It can be partially verified by the knowledge graph G", "citations": [], "input_tokens": 25547, "output_tokens": 40, "cost_micros": 77241, "latency_ms": 3375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "it can be partially verified by knowledge graph g", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find the definition of when a sentence can map to both [NA] and knowledge citations.\n\nIn Section 2.1 \"Task Formulation,\" under \"Setting of Conscious Incompetence,\" the document states:\n\n\"A sentence can map to both [NA] and a list of sub-graph knowledge if it can.\"\n\nHowever, this is incomplete. Looking at the full context, the document explains that [NA] is used when a sentence \"includes some knowledge to be verified, but the knowledge is absent in the knowledge graph G.\"\n\nThe key insight comes from the demonstration in Table 13, which shows sentences containing both specific knowledge citations from the graph AND [NA] citations. This occurs when a sentence contains both:\n1. Knowledge that IS present in the knowledge graph (cited with specific knowledge)\n2. Knowledge that IS NOT present in the knowledge graph (cited with [NA])\n\nAnswer: When it contains both knowledge present in and absent from the knowledge graph", "citations": [], "input_tokens": 17558, "output_tokens": 214, "cost_micros": 55884, "latency_ms": 7110, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.2727272727272727, "method": "str_norm", "normalised_pred": "when it contains both knowledge present in and absent from knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "According to the document, in the \"Setting of Conscious Incompetence\" section (2.1), a sentence can map to both [NA] and a list of sub-graph knowledge when:\n\n\"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph $G$.\"\n\nAnswer: When it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 23606, "output_tokens": 91, "cost_micros": 72183, "latency_ms": 3969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8421052631578948, "method": "str_norm", "normalised_pred": "when it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the author's definition of conscious incompetence and when a sentence can map to both [NA] and a list of sub-graph knowledge.Let me search for \"conscious incompetence\" in the document:Now let me read the section around line 485 and 514 where \"conscious incompetence\" is defined:Perfect! I found the answer. According to the document, in the section on \"Setting of Conscious Incompetence\", a sentence can map to both [NA] and a list of sub-graph knowledge when it can be partially verified by the knowledge graph G.\n\nAnswer: when it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34875, "error": null, "extra": {"thread_id": 2010, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6035a4bba19d42d88826ef02286ce3c9", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 145, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.8421052631578948, "method": "str_norm", "normalised_pred": "when it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the example figure of question-guided reasoning and count the numbers displayed in blue color.\n\nLooking at Figure 1 on page 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims,\" I can see:\n\n- Q1: What is the greatest depth of the Black Sea?\n- A1: Black sea has a maximum depth of **2,212** meters.\n- Q2: How far can sunlight penetrate water?\n- A2: Sunlight does not penetrate water below **1,000** meters.\n- At the bottom: \"2,212 is greater than 1,000. Therefore, the claim is\"\n\nIn the figure, I can see numbers that appear to be highlighted or emphasized:\n- 2,212 (appears twice - once in A1 and once in the reasoning)\n- 1,000 (appears twice - once in A2 and once in the reasoning)\n\nLooking at the visual representation in the figure, the numbers that are displayed in blue/highlighted color are:\n- 2,212 (in A1)\n- 1,000 (in A2)\n- 2,212 (in the final reasoning line)\n- 1,000 (in the final reasoning line)\n\nSo there are 4 instances of numbers shown in blue color.\n\nAnswer: 4", "citations": [], "input_tokens": 27152, "output_tokens": 309, "cost_micros": 86091, "latency_ms": 11875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at Figure 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims.\"\n\nIn Figure 1, I can see:\n1. The claim about the Black Sea\n2. Q1 asking about the greatest depth of the Black Sea\n3. A1 stating \"Black sea has a maximum depth of 2,212 meters\" - the number **2,212** appears in blue\n4. Additional text showing \"Average depth\" with what appears to be another number\n5. Q2 asking about how far sunlight can penetrate water\n6. A2 stating \"Sunlight does not penetrate water below 1,000 meters\" - the number **1,000** appears in blue\n7. At the bottom, there's reasoning that states \"2,212 is greater than 1,000\" - both **2,212** and **1,000** appear in blue here as well\n\nLooking carefully at the figure, I can identify the following blue-colored numbers:\n- 2,212 (appears multiple times)\n- 1,000 (appears multiple times)\n\nHowever, the question asks for how many numbers with blue color are mentioned, not how many times they appear. The distinct blue numbers are:\n1. 2,212\n2. 1,000\n\nAnswer: 2", "citations": [], "input_tokens": 11640, "output_tokens": 300, "cost_micros": 39420, "latency_ms": 8203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at Figure 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims.\"\n\nIn Figure 1, I can see:\n- The claim about sunlight and the Black Sea\n- Q1 asking about the greatest depth of the Black Sea\n- A1 stating \"Black sea has a maximum depth of **2,212 meters**\" (2,212 appears to be in blue)\n- Q2 asking about how far sunlight can penetrate water\n- A2 stating \"Sunlight does not penetrate water below **1,000 meters**\" (1,000 appears to be in blue)\n- At the bottom, there's text saying \"2,212 is greater than 1,000\" where both numbers appear to be in blue\n\nSo the blue-colored numbers mentioned are:\n1. 2,212 (in A1)\n2. 1,000 (in A2)\n3. 2,212 (in the comparison at bottom)\n4. 1,000 (in the comparison at bottom)\n\nHowever, if we count unique numbers, there are 2 distinct numbers (2,212 and 1,000). But if we count each mention, there are 4 total mentions of numbers in blue.\n\nLooking more carefully at the figure, the numbers that appear to be highlighted/emphasized in blue are:\n- 2,212 (appears twice)\n- 1,000 (appears twice)\n\nCounting total mentions: 4\n\nAnswer: 4", "citations": [], "input_tokens": 12756, "output_tokens": 349, "cost_micros": 43503, "latency_ms": 8500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the example figure that shows question-guided reasoning and count the blue-colored numbers in it.\n\nLooking at the document, Figure 1 is mentioned in the text as showing an example of question-guided reasoning for the claim \"Sunlight can reach the deepest part of the Black Sea.\" The text states: \"This reasoning process can be formulated as question-guided reasoning, where the verification of the claim is guided by asking and answering a series of relevant questions, as shown in Figure 1.\"\n\nHowever, Figure 1 itself is not shown in the provided document content. \n\nThe document does show Figure 4, which displays the QACHECK user interface with an example about \"Lars Onsager won the Nobel prize when he was 30 years old.\" In this figure, I can see:\n- \"Reasoning depth: 0\" and \"Reasoning depth: 1\" labels\n- \"1968\" (the year Lars Onsager won the Nobel Prize)\n- \"1903\" (the year Lars Onsager was born)\n- \"65\" (mentioned in the final prediction: \"He was 65 when he won the Nobel prize\")\n\nLooking at the visible figures in the document, Figure 4 shows the question-guided reasoning process example. The numbers that appear to be highlighted or emphasized in the context of the reasoning are: 1968, 1903, and 65.\n\nWithout being able to see the actual colors in Figure 1 (which is referenced but not displayed), and based on Figure 4 which shows a complete example of question-guided reasoning, I can count 3 distinct numbers that would typically be highlighted in such a demonstration.\n\nAnswer: 3", "citations": [], "input_tokens": 8625, "output_tokens": 374, "cost_micros": 31485, "latency_ms": 9280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine Figure 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims.\"\n\nLooking at Figure 1, I can see:\n- The claim about sunlight reaching the deepest part of the Black Sea\n- Step 1 with Q1 and A1, where A1 mentions \"2,212 meters\" \n- Step 2 with Q2 and A2, where A2 mentions \"1,000 meters\"\n- The conclusion states \"2,212 is greater than 1,000\"\n\nThe numbers mentioned are:\n1. 2,212 (meters - maximum depth of Black Sea)\n2. 1,000 (meters - how far sunlight penetrates)\n\nBoth numbers appear to be in the same color as the rest of the text in the boxes. Looking at the figure description and the actual content, I need to identify which numbers are specifically in blue color.\n\nFrom the document's Figure 1, examining the text carefully, the numbers that appear are 2,212 and 1,000. The question asks specifically about numbers with \"blue color.\"\n\nBased on the visual representation in Figure 1, I can identify 2 numbers: 2,212 and 1,000.\n\nAnswer: 2", "citations": [], "input_tokens": 12525, "output_tokens": 288, "cost_micros": 41895, "latency_ms": 8359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the example figure related to \"question-guided reasoning\" and count the blue-colored numbers.Let me search for \"question-guided reasoning\" in the document to locate the relevant section.Now let me read the section around line 217 where Figure 1 is mentioned, which shows \"An example of question-guided reasoning\":Perfect! I can see Figure 1 which shows the question-guided reasoning example. Let me continue reading to see the complete figure description and identify all blue-colored numbers:Based on the document content I've read, I can see Figure 1 which shows the question-guided reasoning example. The figure contains:\n\n1. The text mentions \"2,212 m\" (Black Sea depth)\n2. The text mentions \"2,212 meters\" in the answer A1\n3. The text mentions \"1,000 meters\" in the answer A2\n4. The visual description mentions \"Blue accent colors for interactive elements (icons, links)\"\n\nLet me look more carefully at the OCR text section which lists the specific numbers visible in the figure:\n\nFrom lines 288-298, the OCR text shows:\n- 2,212 m (appears multiple times)\n- 1,253 m\n- 7,257 ft\n- 547,000 km\u00b3\n- 131,200 cu mi\n- 10+\n\nAnd from the main figure text (lines 219-270):\n- 2,212 m (Black Sea depth)\n- 1,000 meters (sunlight penetration)\n- 200 meters (mentioned in the evidence text)\n\nThe key information states: \"Blue accent colors for interactive elements (icons, links)\" which suggests that numbers appearing in blue would be those that are interactive or emphasized.\n\nLooking at the structure of Figure 1, the numbers that appear to be in blue color based on the context are:\n1. **2,212** (the answer to Q1 about Black Sea depth)\n2. **1,000** (the answer to Q2 about sunlight penetration)\n\nThese are the two key numerical answers that appear in the question-guided reasoning process.\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 45156, "error": null, "extra": {"thread_id": 2011, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8ee68d3622e54a41be76708c8f52b00e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 300, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system are implemented using Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules described in Section 3:\n\n1. **Claim Verifier (Section 3.1)**: \"We build the claim verifier based on InstructGPT (Ouyang et al., 2022), utilizing its powerful in-context learning ability.\" - This uses LLM (InstructGPT).\n\n2. **Question Generator (Section 3.2)**: \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning.\" - This uses LLM (InstructGPT).\n\n3. **Question Answering Model (Section 3.3)**: This module has three different implementations:\n   - Retriever-Reader: Uses BM25 retriever and RoBERTa-large reader (not primarily LLM-based)\n   - FLAN-T5: This is an LLM\n   - GPT Reciter-Reader: Uses InstructGPT (LLM)\n   \n   The text states \"In the following, we use GPT Reciter-Reader as the default implementation for our QA module.\" So the default implementation uses LLM.\n\n4. **QA Validator (Section 3.4)**: \"We again implement the validator by prompting InstructGPT with a suite of ten demonstrations\" - This uses LLM (InstructGPT).\n\n5. **Reasoner (Section 3.5)**: \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Both implementations use LLMs.\n\nCounting the modules that are implemented with LLM backbone:\n1. Claim Verifier - Yes (InstructGPT)\n2. Question Generator - Yes (InstructGPT)\n3. Question Answering Model - Yes (default is GPT Reciter-Reader with InstructGPT)\n4. QA Validator - Yes (InstructGPT)\n5. Reasoner - Yes (FLAN-T5", "citations": [], "input_tokens": 11637, "output_tokens": 512, "cost_micros": 42591, "latency_ms": 10953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system use Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules:\n\n1. **Claim Verifier** (Section 3.1): Uses InstructGPT with in-context learning\n   - LLM-based \u2713\n\n2. **Question Generator** (Section 3.2): Uses InstructGPT for in-context learning\n   - LLM-based \u2713\n\n3. **Question Answering Model** (Section 3.3): Has three implementations:\n   - Retriever-Reader: Uses BM25 + RoBERTa-large fine-tuned on SQUAD\n   - FLAN-T5: Uses FLAN-T5 model\n   - GPT Reciter-Reader: Uses InstructGPT\n   - Since the system uses one of these at a time, and the default is GPT Reciter-Reader (mentioned in the text), this module uses LLM \u2713\n\n4. **QA Validator** (Section 3.4): Uses InstructGPT with demonstrations\n   - LLM-based \u2713\n\n5. **Reasoner** (Section 3.5): Has two implementations:\n   - FLAN-T5 (end-to-end QA model)\n   - InstructGPT\n   - Both options use LLMs \u2713\n\nAll five modules are implemented with Large Language Models as their backbone (either InstructGPT or FLAN-T5, both of which are LLMs).\n\nAnswer: 5", "citations": [], "input_tokens": 12753, "output_tokens": 375, "cost_micros": 43884, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system use Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules described in the document:\n\n1. **Claim Verifier** (Section 3.1): \"We build the claim verifier based on InstructGPT (Ouyang et al., 2022), utilizing its powerful in-context learning ability.\" - This uses LLM (InstructGPT).\n\n2. **Question Generator** (Section 3.2): \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning.\" - This uses LLM (InstructGPT).\n\n3. **Question Answering Module** (Section 3.3): This module has three different implementations:\n   - Retriever-Reader: Uses RoBERTa-large (which is a language model)\n   - FLAN-T5: \"we also incorporate the FLAN-T5 model\" - This is an LLM\n   - GPT Reciter-Reader: Uses InstructGPT - This is an LLM\n   \n   The default implementation uses GPT Reciter-Reader (LLM-based).\n\n4. **QA Validator** (Section 3.4): \"We again implement the validator by prompting InstructGPT with a suite of ten demonstrations\" - This uses LLM (InstructGPT).\n\n5. **Reasoner** (Section 3.5): \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Both implementations use LLMs.\n\nAll five modules are implemented with Large Language Models as their backbone (either as the primary or available implementation).\n\nAnswer: 5", "citations": [], "input_tokens": 8622, "output_tokens": 425, "cost_micros": 32241, "latency_ms": 10062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system use Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules:\n\n1. **Claim Verifier** (Section 3.1): \"We build the claim verifier based on InstructGPT (Ouyang et al., 2022), utilizing its powerful in-context learning ability.\" - Uses LLM \u2713\n\n2. **Question Generator** (Section 3.2): \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning.\" - Uses LLM \u2713\n\n3. **Question Answering Model** (Section 3.3): This module has three different implementations:\n   - Retriever-Reader: Uses RoBERTa-large fine-tuned on SQuAD (this is a reader model, but the main approach is retrieval-based)\n   - FLAN-T5: This is an LLM \u2713\n   - GPT Reciter-Reader: Uses InstructGPT as the reciter \u2713\n   \n   The document states \"In the following, we use GPT Reciter-Reader as the default implementation for our QA module.\" So the default uses LLM.\n\n4. **QA Validator** (Section 3.4): \"We again implement the validator by prompting InstructGPT with a suite of ten demonstrations\" - Uses LLM \u2713\n\n5. **Reasoner** (Section 3.5): \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Both implementations use LLMs \u2713\n\nCounting the modules that use LLMs:\n1. Claim Verifier - LLM\n2. Question Generator - LLM\n3. Question Answering Model - LLM (default implementation)\n4. QA Validator - LLM\n5. Reasoner - LLM\n\nAll 5 modules use LLMs in their implementation (either exclusively or as the default/primary option).\n\nAnswer: 5", "citations": [], "input_tokens": 12522, "output_tokens": 503, "cost_micros": 45111, "latency_ms": 11405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the priority document to find information about modules implemented with Large Language Models backbone.Let me search for information about modules implemented with LLM backbone in the document.Now let me search for information about which modules use LLM backbone:Let me read more of the document to find details about the system architecture and which modules use Large Language Models:Now let me search for information specifically about which modules are implemented with Large Language Models:Based on the document, I can now identify which modules are implemented with Large Language Models (specifically InstructGPT). Let me verify this information:\n\nFrom the document, I can see that the QACHECK system has **five modules**:\n1. **Claim Verifier** - implemented with InstructGPT (Large Language Model)\n2. **Question Generator** - implemented with InstructGPT (Large Language Model)\n3. **Question Answering Model** - has three implementations, one of which (GPT Reciter-Reader) uses InstructGPT (Large Language Model)\n4. **QA Validator** - implemented with InstructGPT (Large Language Model)\n5. **Reasoner** - has two implementations, one of which uses InstructGPT (Large Language Model)\n\nHowever, I need to count how many modules are implemented with LLM backbone. Looking at the text more carefully:\n\n- **Claim Verifier**: Uses InstructGPT (LLM) \u2713\n- **Question Generator**: Uses InstructGPT (LLM) \u2713\n- **Question Answering Model**: Has 3 options (Retriever-Reader, FLAN-T5, GPT Reciter-Reader), but the QA module itself is not purely LLM-based in all implementations\n- **QA Validator**: Uses InstructGPT (LLM) \u2713\n- **Reasoner**: Has 2 implementations (FLAN-T5 and InstructGPT), one uses LLM \u2713\n\nLet me re-read to get the precise count of modules that ARE implemented with LLM backbone (not just have an option for it):\n\nFrom the text:\n- Claim Verifier: \"We build the claim verifier based on InstructGPT\" - **LLM**\n- Question Generator: \"we also leverage InstructGPT for in-context learning\" - **LLM**\n- QA Validator: \"We again implement the validator by prompting InstructGPT\" - **LLM**\n- Reasoner: \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - has LLM option but also non-LLM option\n\nThe Question Answering Model has multiple implementations, and the document states the system uses \"GPT Reciter-Reader as the default implementation\" which uses InstructGPT.\n\nSo counting modules that are implemented with LLM backbone: Claim Verifier (1), Question Generator (2), QA Validator (3), and considering the default implementations, the Reasoner could be 4.\n\nLet me check if there's a clearer statement in the document about this specific question.Let me read the sections more carefully to understand which modules specifically use LLM backbone:\n\nBased on my reading of the document:\n\n1. **Claim Verifier** (Section 3.1): \"We build the claim verifier based on InstructGPT\" - Uses LLM \u2713\n2. **Question Generator** (Section 3.2): \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning\" - Uses LLM \u2713\n3. **Question Answering Model** (Section 3.3): Multiple implementations, NOT all use LLM\n4. **QA Validator** (Section 3.4): \"We again implement the validator by prompting InstructGPT\" - Uses LLM \u2713\n5. **Reasoner** (Section 3.5): \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Has LLM option\n\nThe modules that are definitively implemented with LLM (InstructGPT) backbone are:\n1. Claim Verifier\n2. Question Generator  \n3. QA Validator\n\nThat's **3 modules** that are implemented with Large Language Models backbone.\n\nHowever, if we consider that the Reasoner also has an InstructGPT implementation and FLAN-T5 is also a Large Language Model, it could be 4. Let me verify if FLAN-T5 counts as an LLM.\n\nFLAN-T5 is described in the document as \"a Seq2Seq model pre-trained on more than 1.8K tasks with instruction tuning\" - this is indeed a Large Language Model.\n\nSo if the question is asking about modules that ARE implemented (not optionally, but actually) with LLM backbone:\n- Claim Verifier: InstructGPT (LLM) \u2713\n- Question Generator: InstructGPT (LLM) \u2713\n- QA Validator: InstructGPT (LLM) \u2713\n- Reasoner: Has two implementations, both are LLMs (FLAN-T5 and InstructGPT) \u2713\n\nThat would be **4 modules**.\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 100172, "error": null, "extra": {"thread_id": 2012, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5a2a7a1ab0194a9ba5958368d2c54734", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 686, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the figure at the top of page 5 and identify the final step shown in it.\n\nLooking at page 5, I can see Figure 4, which shows a screenshot of the QACHECK user interface with numbered annotations.\n\nThe figure shows 4 numbered steps:\n1. Select or input a custom claim\n2. Submit to fact-check the input claim\n3. Visualize the question-answering guided reasoning process\n4. The final prediction result with rationale\n\nThe final step (step 4) is clearly labeled as \"The final prediction result with rationale\"\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 27168, "output_tokens": 141, "cost_micros": 83619, "latency_ms": 10530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 4 on page 5, which shows the QACHECK user interface screenshot, the final step (labeled as \"4\") shows:\n\n\"The final prediction result with rationale\"\n\nThis section displays:\n- \"Prediction with rationale:\"\n- The reasoning explanation\n- \"Finish\"\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 11656, "output_tokens": 80, "cost_micros": 36168, "latency_ms": 4530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 4 (the screenshot at the top of page 5), I can see the interface is annotated with 4 numbered sections:\n\n1. Select or input a custom claim\n2. Submit to fact-check the input claim\n3. Visualize the question-answering guided reasoning process\n4. The final prediction result with rationale\n\nThe fourth and final step shows \"Prediction with rationale:\" followed by the reasoning and final answer.\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 12772, "output_tokens": 111, "cost_micros": 39981, "latency_ms": 5250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the figure at the top of page 5 (Figure 4), I can see the system interface showing the fact-checking process for the claim \"Lars Onsager won the Nobel prize when he was 30 years old.\"\n\nThe final step shown in the figure is labeled \"The final prediction result with rationale\" which displays:\n\n\"Lars Onsager won the Nobel prize in 1968. Lars Onsager was born in 1903. He was 65 when he won the Nobel prize. Therefore, the final answer is: False.\"\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 8641, "output_tokens": 135, "cost_micros": 27948, "latency_ms": 5422, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 2 on page 5, I need to identify the final step in the architecture diagram.\n\nThe flow shows:\n1. Claim \u2192 Claim Verifier\n2. Question Generator \u2192 QA Model \u2192 Validator \u2192 Relevant Context (in a loop)\n3. Claim Verifier \u2192 Reasoner\n4. Reasoner \u2192 Label\n\nThe final step in the process is when the Reasoner outputs the Label.\n\nAnswer: Label", "citations": [], "input_tokens": 12541, "output_tokens": 104, "cost_micros": 39183, "latency_ms": 4155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "label", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find the figure at the top of page 5 and identify the final step.Let me search for page 5 content in the document:Let me search for page markers and figures in the document:Let me read around line 819 where Figure 4 is mentioned, as this might be on page 5:Perfect! I can see Figure 4 which appears to be on page 5 (after the PageBreak marker at line 815). Looking at the figure, I can see it shows a step-by-step process. Let me examine the final step more carefully:\n\nThe figure shows 4 numbered steps:\n1. Select or input a custom claim\n2. Submit to fact-check the input claim\n3. Visualize the question-answering guided reasoning process\n4. The final prediction result with rationale\n\nThe final step (step 4) shows: \"Prediction with rationale:\" followed by the text and ending with \"**Finish**\".\n\nLooking at the structure, the final step in the figure is marked as \"**Finish**\" at line 888.\n\nAnswer: Finish", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 33953, "error": null, "extra": {"thread_id": 2013, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ca5f20bedd764740aef245cbc4ae41c8", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 195, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finish", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 2 on page 2 and Figure 3 on page 4:\n\nIn Figure 2 (the system architecture), the \"Reasoner\" module is shown in a blue rectangle.\n\nIn Figure 3 (showing the three QA model implementations), the \"Reader\" component appears in orange/coral rectangles in parts (a) and (c).\n\nHowever, the question asks about a model rectangle that appears in BOTH the QA model and Reasoner modules. Looking at the figures more carefully:\n\n- In Figure 3(b), the \"FLAN-T5\" box appears in yellow\n- In Figure 3(c), the \"Reader\" boxes appear in orange\n- The \"Reasoner\" in the main architecture uses similar model components\n\nThe \"Reader\" model component appears in an orange/coral colored rectangle in both the QA module implementations and is used in the reasoning process.\n\nAnswer: Orange", "citations": [], "input_tokens": 27172, "output_tokens": 206, "cost_micros": 84606, "latency_ms": 10108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at the figure on page 4 that shows the QA model and Reasoner modules.\n\nLooking at Figure 2 on page 4, which shows \"The architecture of our QACHECK system,\" I can see the system architecture with various modules including:\n- Claim Verifier\n- Question Generator\n- QA Model\n- Validator\n- Reasoner\n\nThe figure shows these modules connected with arrows. However, this figure does not clearly show colored rectangles that would indicate a specific model appearing in both the QA Model and Reasoner modules.\n\nLooking more carefully at the document, Figure 3 on page 5 shows \"Illustrations of the three different implementations of the Question Answering module in QACHECK\" with three subfigures (a), (b), and (c). In subfigure (a) \"Retriever-Reader\" and subfigure (c) \"GPT Reciter-Reader\", there is a \"Reader\" component shown.\n\nAccording to the text in Section 3.5, the Reasoner \"is a special question-answering model\" and the text mentions that both the QA module (Figure 3) and potentially the Reasoner use a \"Reader\" component.\n\nHowever, the document is in grayscale/black and white, so I cannot determine colors from the PDF provided.\n\nAnswer: Cannot determine - document appears to be in grayscale", "citations": [], "input_tokens": 11660, "output_tokens": 312, "cost_micros": 39660, "latency_ms": 9907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot determine document appears to be in grayscale", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 2 on page 4, I need to identify the rectangles that appear in both the \"QA Model\" and \"Reasoner\" modules.\n\nIn the architecture diagram (Figure 2), the QA Model outputs \"(Q, A)\" pairs, and the Reasoner outputs \"Label\". \n\nThe component that connects both modules is the \"Wikipedia Corpus\" which is shown as a cylinder/database shape that provides evidence to the QA Model, and the accumulated context from Q&A pairs feeds into the Reasoner.\n\nHowever, looking more carefully at the diagram structure, both the QA Model and Reasoner are shown as rectangular boxes in the flowchart. The rectangles representing these modules themselves appear to be gray/neutral colored boxes in the system architecture.\n\nBut examining the actual visual elements more carefully, there isn't a colored rectangle that appears in both specific modules as separate components. The modules themselves are represented as boxes in the flow diagram.\n\nAnswer: gray", "citations": [], "input_tokens": 12776, "output_tokens": 212, "cost_micros": 41508, "latency_ms": 7203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "gray", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at the figure on page 4 to identify the color of the model rectangle that appears in both the QA model and Reasoner modules.\n\nLooking at Figure 2 on page 3 (which shows the system architecture), I can see the general architecture diagram. However, the question asks specifically about page 4.\n\nOn page 4, I can see Figure 3, which shows three different QA model implementations:\n(a) Retriever-Reader\n(b) FLAN-T5\n(c) GPT Reciter-Reader\n\nLooking at the diagram, I need to identify which model appears in both the QA module and the Reasoner module. According to the text in section 3.5, the Reasoner has two implementations: \"1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model.\"\n\nIn Figure 3, I can see rectangles representing different models. The FLAN-T5 model is shown in Figure 3(b), and this same model is mentioned as being used in the Reasoner module.\n\nLooking at the visual representation in Figure 3(b), the FLAN-T5 rectangle appears to be blue in color.\n\nAnswer: blue", "citations": [], "input_tokens": 8645, "output_tokens": 287, "cost_micros": 30240, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Figure 2 on page 4, which shows \"The architecture of our QACHECK system.\"\n\nLooking at the architecture diagram, I can see several modules represented as rectangles. The question asks about a rectangle that appears in both the QA Model and Reasoner modules.\n\nIn the diagram, I can see:\n- A \"QA Model\" box in the loop section\n- A \"Reasoner\" box after the loop\n- Both of these modules appear to have the same visual representation\n\nHowever, looking more carefully at the figure description and the actual diagram components, the modules that represent model components (like the Question Answering module and Reasoner) are shown as rectangles in the flowchart.\n\nThe document shows these as standard flowchart boxes, but I cannot definitively determine a specific color from the description provided. Looking at typical diagram conventions and the figure as presented, these model components appear to be represented in standard boxes.\n\nWithout being able to see the actual colors in the PDF image clearly described in the text, I cannot make a definitive statement about the specific color.\n\nAnswer: Cannot be determined from the provided text", "citations": [], "input_tokens": 12545, "output_tokens": 247, "cost_micros": 41340, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from provided text", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the figure on page 4 and identify the color of the model rectangle that appears in both the QA model and Reasoner module.Let me search for page 4 content and figures in the document:Let me search for figure-related content and read more of the document to find page 4:Let me read the section containing Figure 3 which appears to show different implementations of the QA module:This is Figure 3. Let me continue reading to find more figures and look for page markers:Let me search for page numbers or section breaks to locate page 4:Let me read around the fourth PageBreak (line 815) which should be around page 4:This is Figure 4, which shows a user interface. Let me continue reading to see if there are more visual elements described:Now I need to go back and look at Figure 2, which shows the architecture of the QACHECK system. That figure should show both the QA model and Reasoner module. Let me read around line 416 where Figure 2 is mentioned:I see Figure 2 is described, but I need to look at more details about the visual representation. Let me search for information about colors in the figure or read earlier to see the full figure description:The embedded image description in Figure 2 only describes a yellow/golden amber circular icon with a question mark. But the question asks about a model rectangle that appears in both QA model and Reasoner modules. Let me search for more complete visual descriptions or look for other figures. Let me check if there's more detailed architecture visualization:Let me read that chunk:Let me read that section:This doesn't seem to be the right figure. Let me search more specifically for information about the architecture diagram or look for mentions of colors in model components:Let me search for \"green\" or other colors that might appear in the architecture diagram:The green icons appear to be information icons. Let me look at the text structure of Figure 2 more carefully. The figure shows \"QA Model\" and \"Reasoner\" as separate boxes. Let me search for \"orange\" or \"red\" colors:Let me try searching for \"model\" and \"reasoner\" together or look at the complete Figure 2 text more carefully:Let me look at a different approach. The question mentions \"page 4\", so let me count the PageBreaks more carefully. Let me look between the 3rd and 4th PageBreak which should be page 4:So Figure 3 is on page 4 (after the 3rd PageBreak at line 648). Figure 3 shows the three different implementations of the Question Answering module. The visual description mentions a green circular icon with \"i\". But this doesn't show both QA model and Reasoner module together.\n\nLet me look for if there are more images or visual descriptions in Figure 3:I see in Figure 3 there are three parts (a), (b), and (c) showing different QA implementations. Each shows \"Reader\" as a component, and there's a green circular icon. But I need to understand the question better - it asks about a \"model rectangle\" that appears in both QA model and Reasoner module.\n\nLet me re-read Figure 2 which shows the overall architecture, and look for visual elements that might represent both components. Let me search for mentions of \"Reader\" since that appears in both:I notice that \"Reader\" appears in Figure 3 as a component. In the figure, the description says there's a green circular icon with \"i\". The \"Reader\" component in the Retriever-Reader, FLAN-T5, and GPT Reciter-Reader architectures likely uses the same visual element. But the question asks about something that appears in both \"QA model\" and \"Reasoner module\".\n\nWait - let me re-read the architecture description. The Reasoner is described as \"a special question-answering model\". So both might use the same underlying reader model. Let me look at where the reader model is used in Figure 3. The green \"i\" icon might be the answer indicator. But the question asks about a \"model rectangle\" not an icon.\n\nLet me think about this differently. Perhaps the document has better visual descriptions elsewhere. Let me search for all embedded images:There are multiple embedded images. The question specifically mentions Figure on page 4. Based on the PageBreaks, page 4 would be between line 648 and 815. Let me look more carefully at what figures or diagrams might show architectural components with rectangles:\n\nLooking back at Figure 3 (lines 651-707), it shows three sub-figures (a), (b), and (c) representing different QA implementations. Each has components like \"Retriever\", \"Reader\", \"FLAN-T5\", \"InstructGPT\". These might be shown as rectangles in the original figure.\n\nIn Figure 3, all three QA implementations show a \"Reader\" component. And the text mentions that the Reasoner module (section 3.5, lines 919-933) is \"a special question-answering model\" that can be implemented using \"FLAN-T5\" or \"InstructGPT model\".\n\nSo FLAN-T5 appears in both:\n- In Figure 3(b) as the QA model implementation  \n- In the Reasoner as one of the two implementation options\n\nLet me check if there's a visual description that mentions FLAN-T5 as a colored rectangle. Let me search for \"FLAN\" in visual descriptions:Perfect! So FLAN-T5 is used in BOTH:\n1. Figure 3(b) - as a QA Model implementation\n2. Section 3.5 (Reasoner) - as one of the two reasoner implementations\n\nNow I need to find the visual description showing what color the FLAN-T5 rectangle is. Let me look more carefully at the embedded images around Figure 3. However, the visual description at line 719 only describes the green \"i\" icon, not the model boxes themselves.\n\nLet me check if there are more visual descriptions or embedded images in the Figure 3 section. The document might have multiple embedded images for the same figure. Let me read more around that area:These are descriptions of other images. Let me look at the specific section where Im3.png first appears (around line 711) and see if there are additional visual descriptions I missed:Still showing the green \"i\" icon. Let me look at Im5.jpg which mentions GPT3 - this might be describing the model boxes:Im5.jpg shows a GPT3 logo with purple and pink/magenta colors. But I need to find descriptions of the model rectangles/boxes in Figure 3. Let me search for descriptions that mention \"rectangle\" or \"box\" with colors:Purple appears in the GPT3 logo. Since the question asks about a component that appears in BOTH QA Model and Reasoner, and we know FLAN-T5 is used in both, let me look for visual descriptions that might show FLAN-T5 box. Let me search for \"yellow\" since that was mentioned earlier:Yellow appears in question mark icons. Let me try a different approach. Let me look at all the visual descriptions more systematically. Let me check descriptions that mention model names or rectangular shapes directly. Let me search for visual descriptions that mention boxes or rectangles with specific colors:Not helpful. Let me think differently. The embedded images are specific parts of the figure. Maybe there are multiple embedded images within Figure 3 that I haven't found yet. Let me look at the embedded images that appear chronologically around Figure 3 content. Figure 3 is around lines 651-720. Let me check what other images might be part of it:\n\nActually, wait - let me reconsider. Looking at the OCR text, Im3.png shows \"i\", Im5.jpg shows \"GPT3\". These are components within the figures. Let me look for images that might show \"FLAN-T5\" or \"FLAN\" text:FLANonly appears as text, not in embedded image OCR. Let me look for embedded images with OCR text that might contain model names. Let me check if there are images showing \"T5\" or looking at what other embedded images show:All I'm finding are green \"i\" icons. Let me try a completely different approach. Perhaps I need to look at the actual figure diagram structure more carefully. In Figure 3, there are three sub-diagrams showing different QA implementations. Each has boxes/rectangles for components. Looking at the text structure:\n\nFigure 3(a): Retriever-Reader has \"Retriever\" and \"Reader\" boxes\nFigure 3(b): FLAN-T5 has \"FLAN-T5\" box  \nFigure 3(c): GPT Reciter-Reader has \"InstructGPT\" and \"Reader\" boxes\n\nThe common element between Figure 3 and the Reasoner would be the model itself. Since both QA module and Reasoner can use FLAN-T5, I should look for a visual description of the FLAN-T5 box/rectangle.\n\nLet me search through all the visual descriptions more systematically for anything mentioning model boxes or FLAN. Actually, let me look for embedded images that have model-related OCR text. Let me look for embedded images around the areas discussing models:\n\nWait, I should look for the actual image file names and descriptions that might show text boxes or model components. Let me search for descriptions that mention \"text\" and colors:Turquoise/cyan blue! Let me read that chunk:This turquoise/cyan blue button shows \"Finish\" - that's from Figure 4 (the UI screenshot). Let me look systematically for all visual descriptions that mention rectangular shapes or boxes. Actually, let me search for specific model component descriptions. Let me look for \"RoBERTa\" which is mentioned as the Reader model:RoBERTa is used in the Reader model but there's no visual description mentioning it. Let me try to systematically go through the embedded images and look for any that might show architectural component boxes. Let me look at the images that haven't been described yet. Let me check Im2.png more thoroughly since that's Figure 2:These are still just the icons. Let me try a completely different approach. Maybe the XML parser extracted images separately from the diagram. Let me look at the beginning of the document to see if there are more detailed figure descriptions I'm missing:This is Figure 1. The visual content shows search icons and question marks. Let me consider: maybe the XML extraction treats architectural diagram components differently. Perhaps the model boxes themselves don't have detailed color descriptions, or the color information is embedded in a way I haven't found yet.\n\nLet me reconsider what the question is actually asking. It says \"the model rectangle in the figure of page 4 that appears both in QA model and Reasoner module\". If Figure 3 is on page 4, and Figure 3 shows three different QA implementations, then the component that would also appear in the Reasoner would be the UNDERLYING MODEL.\n\nLooking back at the text:\n- Section 3.3 (QA module) describes three implementations, one being FLAN-T5 (line 675-686)\n- Section 3.5 (Reasoner) says it can be implemented using \"the end-to-end QA model based on FLAN-T5\" (line 931-932)\n\nSo FLAN-T5 is the model that appears in both. Now I need to find what color the FLAN-T5 rectangle is shown in Figure 3. Since the visual descriptions I've found so far only show icons (green \"i\", yellow \"?\", orange \"?\"), maybe the model boxes themselves are not described with specific colors in the extracted XML, or I need to look elsewhere.\n\nLet me try to find if there's any systematic description of the figure architecture. Maybe there's a legend or color scheme description:Turquoise appears as a \"Finish\" button and footer element. But wait, let me read chunk 155824 which mentions a layout with multiple colored sections:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 328655, "error": null, "extra": {"thread_id": 2014, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4736292b32d6461e98620f394ccee4da", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 2072, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "let me try to find if there s any systematic description of figure architecture maybe there s legend or color scheme description turquoise appears as finish button and footer element but wait let me read chunk 155824 which mentions layout with multiple colored sections", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the prompt instruction examples shown in the main body of the paper (not in the appendix).\n\nLooking through the main body of the paper:\n\n1. **Section 3.1 (Claim Verifier)** - Shows 1 prompt instruction example:\n   - \"Claim = CLAIM / We already know the following: CONTEXT / Can we know whether the claim is true or false now? Yes or no?\"\n\n2. **Section 3.2 (Question Generator)** - Shows 2 prompt instruction examples:\n   - Initial question: \"Claim = CLAIM / To verify the above claim, we can first ask a simple question:\"\n   - Follow-up question: \"Claim = CLAIM / We already know the following: CONTEXT / To verify the claim, what is the next question we need to know the answer to?\"\n\n3. **Section 3.4 (QA Validator)** - Shows 1 prompt instruction example:\n   - \"Claim = CLAIM / We already know the following: CONTEXT / Now we further know: NEW QA PAIR / Does the QA pair have additional knowledge useful for verifying the claim?\"\n\n4. **Section 3.5 (Reasoner)** - The text mentions prompts are in \"Appendix A.4\" but doesn't show the full prompt in the main body.\n\nTotal prompts shown in the main body: 1 + 2 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 27159, "output_tokens": 335, "cost_micros": 86502, "latency_ms": 9625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of prompt instruction examples listed in the main body of the paper.\n\nLooking through the document, I can find the following prompt instructions in the main body (not in the Appendix):\n\n1. **Claim Verifier prompt** (Section 3.1):\n```\nClaim = CLAIM We already know the following:\nCONTEXT\nCan we know whether the claim is true or false now? Yes or no?\n```\n\n2. **Question Generator - Initial question prompt** (Section 3.2):\n```\nClaim = CLAIM\nTo verify the above claim, we can first ask a simple question:\n```\n\n3. **Question Generator - Follow-up question prompt** (Section 3.2):\n```\nClaim = CLAIM\nWe already know the following: CONTEXT\nTo verify the claim, what is the next question we need to know the answer to?\n```\n\n4. **QA Validator prompt** (Section 3.4):\n```\nClaim = CLAIM\nWe already know the following:\nCONTEXT\nNow we further know: NEW QA PAIR\nDoes the QA pair have additional knowledge useful for verifying the claim?\n```\n\nThat gives us 4 prompt instruction examples in the main body of the paper.\n\nAnswer: 4", "citations": [], "input_tokens": 11647, "output_tokens": 289, "cost_micros": 39276, "latency_ms": 5532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the prompts/instruction examples that are explicitly shown in the main body of the paper (not in the Appendix).\n\nLooking through the main body of the paper:\n\n1. **Section 3.1 (Claim Verifier)** - Shows 1 prompt instruction:\n   ```\n   Claim = CLAIM\n   We already know the following:\n   CONTEXT\n   Can we know whether the claim is true or false now? Yes or no?\n   ```\n\n2. **Section 3.2 (Question Generator)** - Shows 2 prompt instructions:\n   - For initial question generation:\n   ```\n   Claim = CLAIM\n   To verify the above claim, we can first ask a simple question:\n   ```\n   \n   - For follow-up questions:\n   ```\n   Claim = CLAIM\n   We already know the following:\n   CONTEXT\n   To verify the claim, what is the next question we need to know the answer to?\n   ```\n\n3. **Section 3.4 (QA Validator)** - Shows 1 prompt instruction:\n   ```\n   Claim = CLAIM\n   We already know the following:\n   CONTEXT\n   Now we further know:\n   NEW QA PAIR\n   Does the QA pair have additional knowledge useful for verifying the claim?\n   ```\n\nTotal prompts in the main body: 1 + 2 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 12763, "output_tokens": 336, "cost_micros": 43329, "latency_ms": 6578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of prompt instruction examples listed in the main body of the paper (excluding the Appendix).\n\nLet me go through the main body of the paper systematically:\n\n1. **Section 3.1 (Claim Verifier)**: The text mentions \"We prompt InstructGPT with ten distinct in-context examples as detailed in Appendix A.1\" - but no examples are shown in the main body, only referenced.\n\n2. **Section 3.2 (Question Generator)**: Two prompts are shown:\n   - Initial question generation prompt (1 prompt)\n   - Follow-up question generation prompt (1 prompt)\n\n3. **Section 3.3 (Question Answering Model)**: No complete prompt examples are shown in the main body.\n\n4. **Section 3.4 (QA Validator)**: One prompt instruction is shown for the validator module (1 prompt)\n\n5. **Section 3.5 (Reasoner)**: The text mentions \"the InstructGPT model with the prompts given in Appendix A.4\" - but no examples are shown in the main body.\n\nCounting the prompts actually displayed in the main body:\n- Section 3.2: 2 prompts (initial question generation + follow-up question generation)\n- Section 3.4: 1 prompt (QA validator)\n\nTotal: 3 prompts in the main body\n\nAnswer: 3", "citations": [], "input_tokens": 8632, "output_tokens": 331, "cost_micros": 30861, "latency_ms": 7125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of prompt instruction examples listed in the main body of the paper (not in the Appendix).\n\nLooking through the main body of the paper:\n\n1. **Section 3.1 (Claim Verifier)** - 1 prompt instruction:\n   - \"Claim = CLAIM... Can we know whether the claim is true or false now? Yes or no?\"\n\n2. **Section 3.2 (Question Generator)** - 2 prompt instructions:\n   - Initial question generation: \"Claim = CLAIM... To verify the above claim, we can first ask a simple question:\"\n   - Follow-up questions: \"Claim = CLAIM... We already know the following: CONTEXT... To verify the claim, what is the next question we need to know the answer to?\"\n\n3. **Section 3.4 (QA Validator)** - 1 prompt instruction:\n   - \"Claim = CLAIM... We already know the following: CONTEXT... Now we further know: NEW QA PAIR... Does the QA pair have additional knowledge useful for verifying the claim?\"\n\nTotal prompts in the main body: 1 + 2 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 12532, "output_tokens": 279, "cost_micros": 41781, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about instruction examples in the main body of the paper.Let me continue reading to find information about instruction examples in the main body of the paper.Let me read the Appendix section to find the prompts and instruction examples.Perfect! I can now see the prompts section clearly. Let me count the instruction examples mentioned. Looking at the document, I can see:\n\n1. **A.1 Prompts for Claim Verifier** - Shows \"<10 demonstrations in total>\"\n2. **A.2 Prompts for Question Generation** (initial question) - Shows \"<10 demonstrations in total>\"\n3. **A.2 Prompts for follow-up question generation** - Shows \"<10 demonstrations in total>\"\n4. **A.3 Prompts for Validator** - Shows \"<10 demonstrations in total>\"\n5. **A.4 Prompts for Reasoner** - Shows \"<10 demonstrations in total>\"\n\nHowever, the question asks about \"instruction examples listed in the main body of the paper.\" Let me check if there's more specific information in the main body about how many examples are actually shown.Let me search more specifically for mentions of how many examples are shown in the main body.Great! I found it. Let me read line 582 more carefully. The main body states: \"We prompt InstructGPT with **ten distinct in-context examples** as detailed in Appendix A.1\"\n\nLet me verify this is the answer by checking if there are other mentions of specific numbers in the main body.Perfect! I found another mention in line 790: \"with a suite of **ten demonstrations** shown in Appendix A.3\"\n\nLet me search for all instances where numbers of examples/demonstrations are mentioned in the main body.Based on my reading of the document, I found that in the main body of the paper, there are **two explicit mentions** of the number of instruction examples/demonstrations:\n\n1. Line 582: \"We prompt InstructGPT with **ten distinct in-context examples**\" (for the Claim Verifier)\n2. Line 790: \"with a suite of **ten demonstrations**\" (for the Validator)\n\nHowever, the question asks specifically \"How many prompts instruction examples are listed in the main body of the paper?\" - it asks for the COUNT/NUMBER that is mentioned, not how many times it's mentioned.\n\nAccording to the main body:\n- 10 examples for Claim Verifier (mentioned on line 582)\n- 10 demonstrations for Validator (mentioned on line 790)\n\nSince both mention \"10\" and the question asks for the number in float format and wants a single integer answer, the answer is:\n\n**10**\n\nAnswer: 10", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 77358, "error": null, "extra": {"thread_id": 2015, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_302989d449c9437a85f8635948950b78", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 438, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance scores for ProgramFC and QACheck (the proposed system) in the HOVER dataset from Table 1.\n\nFrom Table 1 on page 6:\n\n**HOVER dataset:**\n- ProgramFC scores: 54.27 (2-hop), 54.18 (3-hop), 52.88 (4-hop)\n- QACheck scores: 55.67 (2-hop), 54.67 (3-hop), 52.35 (4-hop)\n\nNow I'll calculate the performance gap for each:\n- 2-hop: 55.67 - 54.27 = 1.40\n- 3-hop: 54.67 - 54.18 = 0.49\n- 4-hop: 52.35 - 52.88 = -0.53\n\nAverage performance gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.453333...\n\nRounded to 2 decimal places: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 27174, "output_tokens": 268, "cost_micros": 85542, "latency_ms": 9796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck on the HOVER dataset.\n\nLooking at Table 1 in the document, I can find the F1 scores for both systems on HOVER:\n\n**HOVER 2-hop:**\n- ProgramFC: 54.27\n- QACheck: 55.67\n- Gap: 55.67 - 54.27 = 1.40\n\n**HOVER 3-hop:**\n- ProgramFC: 54.18\n- QACheck: 54.67\n- Gap: 54.67 - 54.18 = 0.49\n\n**HOVER 4-hop:**\n- ProgramFC: 52.88\n- QACheck: 52.35\n- Gap: 52.35 - 52.88 = -0.53\n\nTo calculate the average performance gap across all three HOVER subsets:\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimals: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 11662, "output_tokens": 275, "cost_micros": 39111, "latency_ms": 5983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck on the HOVER dataset.\n\nFrom Table 1, the F1 scores are:\n\n**HOVER 2-hop:**\n- ProgramFC: 54.27\n- QACheck: 55.67\n- Gap: 55.67 - 54.27 = 1.40\n\n**HOVER 3-hop:**\n- ProgramFC: 54.18\n- QACheck: 54.67\n- Gap: 54.67 - 54.18 = 0.49\n\n**HOVER 4-hop:**\n- ProgramFC: 52.88\n- QACheck: 52.35\n- Gap: 52.35 - 52.88 = -0.53\n\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimals: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 12778, "output_tokens": 247, "cost_micros": 42039, "latency_ms": 5812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck on the HOVER dataset.\n\nFrom Table 1 in the document, I can see the performance results:\n\nFor HOVER dataset:\n- ProgramFC: 54.27\n- QACheck: 55.67\n\nFor 2-hop:\n- ProgramFC: 54.18\n- QACheck: 54.67\n\nFor 3-hop:\n- ProgramFC: 52.88\n- QACheck: 52.35\n\nFor 4-hop:\n- ProgramFC: 59.66\n- QACheck: 59.47\n\nThe HOVER dataset has three subsets (2-hop, 3-hop, 4-hop) based on reasoning depth.\n\nTo calculate the average performance gap:\n- 2-hop gap: 54.67 - 54.18 = 0.49\n- 3-hop gap: 52.35 - 52.88 = -0.53\n- 4-hop gap: 59.47 - 59.66 = -0.19\n\nAverage gap = (0.49 + (-0.53) + (-0.19)) / 3 = -0.23 / 3 = -0.0767\n\nRounded to 2 decimal places: -0.08\n\nAnswer: -0.08", "citations": [], "input_tokens": 8647, "output_tokens": 319, "cost_micros": 30726, "latency_ms": 6530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "-0.08", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck in the HOVER dataset from Table 1.\n\nFrom Table 1, the F1 scores are:\n\n**HOVER 2-hop:**\n- ProgramFC: 54.27\n- QACheck: 55.67\n- Gap: 55.67 - 54.27 = 1.40\n\n**HOVER 3-hop:**\n- ProgramFC: 54.18\n- QACheck: 54.67\n- Gap: 54.67 - 54.18 = 0.49\n\n**HOVER 4-hop:**\n- ProgramFC: 52.88\n- QACheck: 52.35\n- Gap: 52.35 - 52.88 = -0.53\n\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimal places: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 12547, "output_tokens": 251, "cost_micros": 41406, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about ProgramFC performance and the proposed system on the HOVER dataset.Let me search for tables or sections containing performance comparisons on the HOVER dataset.Let me read more of the document to find the tables with performance results.Let me search for tables with results and performance metrics.Perfect! I found Table 1 with the performance results. Let me extract the exact values:\n\nFrom Table 1, for the HOVER dataset:\n- **ProgramFC**: \n  - 2-hop: 54.27\n  - 3-hop: 54.18\n  - 4-hop: 52.88\n\n- **QACheck (proposed system)**:\n  - 2-hop: 55.67\n  - 3-hop: 54.67\n  - 4-hop: 52.35\n\nNow I need to calculate the average performance gap between QACheck and ProgramFC:\n\nFor each hop level:\n- 2-hop gap: 55.67 - 54.27 = 1.40\n- 3-hop gap: 54.67 - 54.18 = 0.49\n- 4-hop gap: 52.35 - 52.88 = -0.53\n\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimals: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 51640, "error": null, "extra": {"thread_id": 2016, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b7dbcb3c5c4d4293862871c529bf39fc", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 231, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "native_pdf", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify the main tasks the authors conducted experiments on by examining the document.\n\nFrom the abstract and introduction (page 1), the authors state:\n\"More in detail, we first investigate the deficiency of LLMs in logical reasoning on different tasks, including event relation extraction and deductive reasoning.\"\n\nFrom Section 2 PRELIMINARIES (page 2), the document explicitly describes two main tasks:\n- Section 2.1: Event Relation Extraction\n- Section 2.2: Deductive Reasoning\n\nThese are consistently mentioned throughout the paper as the primary experimental tasks:\n- The pilot study (Section 3.1.1) uses \"MAVEN-ERE and ProofWriter\" \n- Main experiments (Section 4.1) evaluate on \"MAVEN-ERE, Causal-TimeBank, and ProofWriter\"\n- MAVEN-ERE and Causal-TimeBank are event relation extraction datasets\n- ProofWriter is a deductive reasoning dataset\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 65638, "output_tokens": 233, "cost_micros": 200409, "latency_ms": 34842, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.09158v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "azure_basic_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document, I can identify the two main tasks that the authors conducted experiments on:\n\n1. **Event Relation Extraction (ERE)** - This is extensively discussed throughout the paper, with experiments on datasets like MAVEN-ERE and Causal-TimeBank. The task involves identifying coreference, temporal, causal, and subevent relations between events.\n\n2. **Deductive Reasoning** - This is the second main task, with experiments conducted on the ProofWriter dataset. The task involves logical reasoning to prove, disprove, or determine if statements are unknown based on given facts and rules.\n\nThese two tasks are explicitly mentioned in the abstract: \"we first investigate the deficiency of LLMs in logical reasoning on different tasks, including event relation extraction and deductive reasoning.\"\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 28196, "output_tokens": 189, "cost_micros": 87423, "latency_ms": 6483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "azure_premium_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document, I can identify the two main tasks where experiments were conducted.\n\nFrom the Abstract: \"we first investigate the deficiency of LLMs in logical reasoning on different tasks, including event relation extraction and deductive reasoning.\"\n\nFrom Section 2 (Preliminaries): \"In this section, we first introduce two tasks that this paper mainly explores.\" followed by Section 2.1 \"Event Relation Extraction\" and Section 2.2 \"Deductive Reasoning\"\n\nFrom the experimental setup sections, both tasks are consistently used throughout the paper for evaluation (Tables 1, 2, 3, and various figures reference MAVEN-ERE/Causal-TimeBank for ERE and ProofWriter for deductive reasoning).\n\nAnswer: event relation extraction, deductive reasoning", "citations": [], "input_tokens": 34236, "output_tokens": 175, "cost_micros": 105333, "latency_ms": 7014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "llamacloud_basic_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document for the main experimental tasks:\n\nFrom the Introduction and throughout the paper, the authors clearly state they conduct investigations on two main tasks:\n\n1. **Event Relation Extraction (ERE)** - mentioned extensively, including datasets MAVEN-ERE and Causal-TimeBank\n2. **Deductive Reasoning** - mentioned alongside ERE, including the ProofWriter dataset\n\nThis is confirmed in multiple places:\n- Section 2 \"PRELIMINARIES\" has subsections 2.1 \"Event Relation Extraction\" and 2.2 \"Deductive Reasoning\"\n- Section 3.1.1 states: \"We conduct a manual evaluation on MAVEN-ERE and ProofWriter\"\n- The abstract mentions \"event relation extraction and deductive reasoning\"\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 26309, "output_tokens": 193, "cost_micros": 81822, "latency_ms": 6296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "llamacloud_premium_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document, I can identify the two main tasks the authors conducted experiments on:\n\n1. **Event Relation Extraction (ERE)** - This task aims to identify relations (Coreference, Temporal, Causal, and Subevent) between two events in text. The authors used datasets like MAVEN-ERE and Causal-TimeBank for this task.\n\n2. **Deductive Reasoning** - This task involves beginning with known facts and rules, then iteratively making new inferences. The authors used the ProofWriter dataset for this task.\n\nThese two tasks are explicitly mentioned in Section 2 (PRELIMINARIES) where the authors state: \"In this section, we first introduce two tasks that this paper mainly explores.\" The section then proceeds with subsections 2.1 Event Relation Extraction and 2.2 Deductive Reasoning.\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 34535, "output_tokens": 211, "cost_micros": 106770, "latency_ms": 6907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "surfsense_agentic", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to identify the two main tasks the author conducted experiments on.Let me continue reading to find the specific tasks mentioned:Based on the document, I can see that the author evaluated LLMs in two practical scenarios:\n\n1. Event relation extraction (lines 473)\n2. Deductive reasoning tasks (lines 480-481)\n\nAnswer: event relation extraction, deductive reasoning", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 57891, "error": null, "extra": {"thread_id": 2017, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_fc47d9b3789248779051669f324d1694", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 114, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "native_pdf", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about the number of possible combinations of relationships between two events.\n\nFrom the document, I can see in Section 2.1 (Event Relation Extraction) that the task involves identifying four types of relations between two events:\n\n1. **Coreference relations**: 2 possible labels (NO_COREFERENCE, COREFERENCE)\n2. **Temporal relations**: 7 possible labels (NO_TEMPORAL, BEFORE, OVERLAP, CONTAINS, SIMULTANEOUS, ENDS-ON, BEGINS-ON)\n3. **Causal relations**: 3 possible labels (NO_CAUSAL, PRECONDITION, CAUSE)\n4. **Subevent relations**: 2 possible labels (NO_SUBEVENT, SUBEVENT)\n\nThe total number of combinations would be:\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nHowever, let me verify this with Figure 1, which shows an example calculation. In Figure 1, it states:\n\n\"Logical Inconsistency (LI) = Number of Conflicts (1) / Number of Combinations (6) = 1/6 \u2248 16.7%\"\n\nThe text explains: \"for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\"\n\nThis is referring to pairwise combinations between the 4 relation types, not the total number of possible relationship combinations.\n\nFor the total number of possible combinations of relationships, the calculation is:\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nAnswer: 84", "citations": [], "input_tokens": 65637, "output_tokens": 393, "cost_micros": 202806, "latency_ms": 38250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.09158v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "84", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "azure_basic_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper's definition.\n\nFrom the document, I can see in Section 2.1 (Event Relation Extraction) that the paper defines 4 types of relations between two events:\n\n1. **Coreference relations**: 2 labels (NO_COREFERENCE, COREFERENCE)\n2. **Temporal relations**: 7 labels (NO_TEMPORAL, BEFORE, OVERLAP, CONTAINS, SIMULTANEOUS, ENDS-ON, BEGINS-ON)\n3. **Causal relations**: 3 labels (NO_CAUSAL, PRECONDITION, CAUSE)\n4. **Subevent relations**: 2 labels (NO_SUBEVENT, SUBEVENT)\n\nThe document also explicitly mentions in the logical inconsistency calculation (Figure 1 and Section 2.1):\n\n\"Now, regarding the total number of combinations: for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\"\n\nThis refers to combinations between the 4 relation types (choosing 2 from 4 types to check for conflicts), not the total possible relationship combinations.\n\nHowever, looking more carefully at the document, particularly in Figure 1, it states:\n\"Number of Combinations (+ O) = 6\"\n\nThis is the number of pairwise combinations when checking for conflicts between the 4 different relation types: C(4,2) = 4!/(2!\u00d72!) = 6.\n\nBut the question asks about \"combinations of relationships between two events,\" which would be the total number of possible relationship combinations (one label from each of the 4 types):\n\nTotal combinations = 2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nHowever, re-reading the context in Figure 1 more carefully, the \"6\" refers to checking consistency between pairs of the 4 relation types, not the total possible labelings.\n\nLooking at the question again and the context in the paper where they discuss logical inconsistency, the relevant number appears to be **6** - the number of combinations when checking pairs of relation types for conflicts", "citations": [], "input_tokens": 28195, "output_tokens": 512, "cost_micros": 92265, "latency_ms": 12843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "azure_premium_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper's definition.\n\nFrom Section 2.1 (Event Relation Extraction), the paper states that for each pair of events, there are 4 types of relations to determine:\n\n1. **Coreference relations**: {NO_COREFERENCE, COREFERENCE} - 2 options\n2. **Temporal relations**: {NO_TEMPORAL, BEFORE, OVERLAP, CONTAINS, SIMULTANEOUS, ENDS-ON, BEGINS-ON} - 7 options\n3. **Causal relations**: {NO_CAUSAL, PRECONDITION, CAUSE} - 3 options\n4. **Subevent relations**: {NO_SUBEVENT, SUBEVENT} - 2 options\n\nThe paper also provides a specific formula in Figure 1 for calculating the \"total number of combinations\" when computing logical inconsistency:\n\nIn the example, it states: \"for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\"\n\nThis formula 4*(4-1)/2 = 6 represents the number of pairs of relation types (combinations of 2 from 4 types), which is used to check for conflicts between different relation types.\n\nHowever, if the question asks about the total possible combinations of relationship assignments (not pairs for conflict checking), it would be:\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84 possible combinations\n\nBut based on the context in Figure 1 where they explicitly calculate \"total number of combinations\" as 6 using the formula 4*(4-1)/2, the paper is referring to the number of pairs of relation types that need to be checked for logical consistency.\n\nAnswer: 6", "citations": [], "input_tokens": 34235, "output_tokens": 428, "cost_micros": 109125, "latency_ms": 11375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "llamacloud_basic_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper.\n\nFrom Section 2.1 (Event Relation Extraction), the paper identifies four types of relations between two events:\n\n1. **Coreference relations**: 2 options\n   - NO_COREFERENCE\n   - COREFERENCE\n\n2. **Temporal relations**: 7 options\n   - NO_TEMPORAL\n   - BEFORE\n   - OVERLAP\n   - CONTAINS\n   - SIMULTANEOUS\n   - ENDS-ON\n   - BEGINS-ON\n\n3. **Causal relations**: 3 options\n   - NO_CAUSAL\n   - PRECONDITION\n   - CAUSE\n\n4. **Subevent relations**: 2 options\n   - NO_SUBEVENT\n   - SUBEVENT\n\nThe paper states in Section 2.1: \"We formulate ERE as a multi-classification problem, determining one label (relation) for each of these four relation types.\"\n\nSince we need to determine one label for each of the four relation types, and these are independent choices, the total number of possible combinations is:\n\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nThis is also confirmed in Section 2.1 where it mentions calculating logical inconsistency as \"the ratio of the number of conflicts to the total number of combinations (i.e., all combinations of each two relations).\"\n\nAnswer: 84", "citations": [], "input_tokens": 26308, "output_tokens": 335, "cost_micros": 83949, "latency_ms": 7969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "84", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "llamacloud_premium_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper's definition.\n\nFrom Section 2.1 (Event Relation Extraction), the paper discusses the Logical Inconsistency (LI) metric. In the example shown in Figure 1, there's a specific calculation:\n\n\"Among these, \"SIMULTANEOUS\" and \"CAUSE\" are identified as conflicting with each other based on the logical constraints we have defined, resulting in a single conflict. Now, regarding the total number of combinations: for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: $4 * (4 - 1) / 2 = 6$.\"\n\nThis calculation shows that with 4 types of relations (Coreference, Temporal, Causal, and Subevent), the number of combinations between pairs of these relations is 6.\n\nThis is confirmed in the definition of Logical Inconsistency in Section 2.1:\n\"logical inconsistency is calculated as the ratio of the number of conflicts (i.e., the answers that conflict with the given logical constraints) to the total number of combinations (i.e., all combinations of each two relations).\"\n\nThe example explicitly states: \"Hence, the logical inconsistency in this example is computed as LI = 1/6 (or approximately 16.7%).\"\n\nThe denominator 6 represents the total number of possible combinations of relationships between two events.\n\nAnswer: 6", "citations": [], "input_tokens": 34534, "output_tokens": 346, "cost_micros": 108792, "latency_ms": 10032, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "surfsense_agentic", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the definition of relationships between two events.Let me search for more specific terms related to event relationships:Perfect! I found the answer. Let me read that specific chunk to get the complete information:Based on the document, I can see that for each pair of events, there are 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36141, "error": null, "extra": {"thread_id": 2018, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_45c25fd9bb164100939e531a52487f98", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 140, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "native_pdf", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which shows \"Vicuna-13B and Llama2-13B's performance on MAVEN-ERE and Causal-TimeBank after pre-training on LLM-LR\":\n\nFor Vicuna-13B-PT on MAVEN-ERE, the results are:\n- vanilla ICL: 15.3% Micro-F1\n- vanilla ICL w. CoT: 15.8% Micro-F1\n- CoT w. logical constraints: 18.0% Micro-F1\n- w. all logical constraints: 16.3% Micro-F1\n- w. retrieved logical constraints: 16.1% Micro-F1\n- w. post-processing: 11.0% Micro-F1\n\nThe best performance is 18.0% Micro-F1.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 65644, "output_tokens": 212, "cost_micros": 200112, "latency_ms": 35562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.09158v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "azure_basic_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which shows the performance of Vicuna-13B after pre-training (Vicuna-13B-PT) on MAVEN-ERE:\n\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest Micro-F1 score is 18.0%.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 28202, "output_tokens": 149, "cost_micros": 86841, "latency_ms": 5000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "azure_premium_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3, I need to find the best performance of Vicuna-13B-PT on MAVEN-ERE.\n\nThe table shows the following Micro-F1 scores for Vicuna-13B-PT on MAVEN-ERE:\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest score is 18.0% for \"CoT w. logical constraints\".\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 34242, "output_tokens": 168, "cost_micros": 105246, "latency_ms": 5672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "llamacloud_basic_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3, which shows the performance of Vicuna-13B-PT (pre-trained) on MAVEN-ERE:\n\nThe results show:\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest micro-F1 score is 18.0% for \"CoT w. logical constraints\".\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 26315, "output_tokens": 151, "cost_micros": 81210, "latency_ms": 4937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "llamacloud_premium_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which shows the performance of Vicuna-13B and Llama2-13B after pre-training on LLM-LR:\n\nFor Vicuna-13B on MAVEN-ERE, the Micro-F1 scores are:\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest performance is 18.0% with CoT w. logical constraints.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 34541, "output_tokens": 175, "cost_micros": 106248, "latency_ms": 5140, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "surfsense_agentic", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about Vicuna-13B's performance on MAVEN-ERE.Let me search for information about MAVEN-ERE and Vicuna-13B performance:Let me read the table that shows the performance results:Perfect! I found the table showing Vicuna-13B's performance on MAVEN-ERE. Looking at the table, I can see three different approaches for Vicuna (Vicuna-13B):\n\n1. vanilla ICL: 13.8% Micro-F1\n2. vanilla ICL w. CoT: 11.6% Micro-F1\n3. CoT w. logical constraints: 14.9% Micro-F1\n\nThe best performance was with \"CoT w. logical constraints\" at 14.9% Micro-F1.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 38983, "error": null, "extra": {"thread_id": 2019, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b286a9a39b954fac8755dc6ce4c285c7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 166, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document for information about the Sociology subject...\n\nFrom Figure 3 on page 3, I can see that Sociology is listed under \"Humanities & Social Sci. (9%)\" with \"Sociology (287, 2.48%)\".\n\nFrom Table 8 on page 18 (Humanities & Social Science results), I can see that Sociology has 252 samples in the test set.\n\nAdding the validation and test samples:\n- Validation set appears to have a portion of the 947 total H&SS samples\n- Test set for Sociology: 252 samples\n\nLooking more carefully at the statistics in Table 1 on page 4, the total is 11,550 questions with a Dev:Validation:Test split of 150:900:10500.\n\nFrom the detailed breakdown in Figure 3, Sociology shows (287, 2.48%), which represents 287 total samples across all splits.\n\nAnswer: 287", "citations": [], "input_tokens": 259120, "output_tokens": 216, "cost_micros": 1559580, "latency_ms": 77812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of samples in MMMU that belong to the sociology subject.\n\nLooking at the document, I can find this information in Figure 3, which shows the breakdown of questions by disciplines, subjects, and subfields.\n\nIn Figure 3, under \"Humanities & Social Sci. (9%)\", I can see:\n\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Sociology (287, 2.48%)\n- Psychology (340, 2.94%)\n\nThe number in parentheses after \"Sociology\" shows 287 questions, which represents 2.48% of the total.\n\nAnswer: 287", "citations": [], "input_tokens": 79424, "output_tokens": 155, "cost_micros": 240597, "latency_ms": 7077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of samples in the MMMU dataset that belong to the sociology subject.\n\nLooking through the document, I can find information about the distribution of samples across subjects in Figure 3 and Table 10.\n\nFrom Figure 3, which shows \"MMMU contains 11.5K multimodal questions covering six broad disciplines, 30 subjects, and 183 subfields,\" I can see the breakdown of subjects with their sample counts.\n\nUnder \"Humanities & Social Sci. (9%)\", I can see:\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Psychology (340, 2.94%)\n- Sociology (287, 2.48%)\n\nThe number shown for Sociology is 287 samples.\n\nAnswer: 287", "citations": [], "input_tokens": 109175, "output_tokens": 183, "cost_micros": 330270, "latency_ms": 8484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the number of samples in the Sociology subject.\n\nIn Figure 3 on page 3, there is a breakdown table showing the distribution of samples across different subjects. Under \"Humanities & Social Sci. (9%)\", I can see:\n\nv Sociology (287, 2.48%)\n\nThe number 287 represents the number of samples in the Sociology subject.\n\nAnswer: 287", "citations": [], "input_tokens": 83138, "output_tokens": 100, "cost_micros": 250914, "latency_ms": 6203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "I'll search for information about MMMU and sociology samples in the provided documents.Let me search for information about sociology and sample counts in the document:I found the information! Let me read the specific section with the table that shows the sample counts:Perfect! I found the answer. According to line 1572, Sociology has 287 samples in MMMU, which represents 2.48% of the dataset.\n\nAnswer: 287", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 63671, "error": null, "extra": {"thread_id": 2020, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d6e9ac9023c84b0284ac8c1e7870f526", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 141, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and compare it to the total number of Public Health samples.\n\nFrom the document:\n\n1. **Total Public Health samples**: According to Figure 3 on page 3, Public Health has **544 samples** (shown as \"Public Health (544, 4.7%)\").\n\n2. **Public Health samples showcased**: I need to count the case study figures related to Public Health from Table 10 on page 21. According to this table, Public Health has the following case study figures:\n   - Correct Case: 63\n   - Lack of Knowledge: 65\n   - Textual Understanding Error: 64\n\n   This gives us **3 showcased samples** from Public Health.\n\nThe ratio would be: 3/544 = 0.0055147...\n\nRounding to a reasonable number of decimal places: 0.0055\n\nAnswer: 0.0055", "citations": [], "input_tokens": 259140, "output_tokens": 214, "cost_micros": 1559655, "latency_ms": 83546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0055", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and calculate the ratio to the total Public Health samples.\n\nFrom Table 7 (Health & Medicine results), I can see that Public Health has 509 samples in the test set.\n\nNow I need to count how many Public Health samples are showcased in the paper (Appendix B - Case Study).\n\nLooking through the Case Study figures list and the actual figures:\n\nPublic Health samples shown:\n- Figure 63: Public Health 1 (Epidemiology) - Correct Case\n- Figure 64: Public Health 2 (Epidemiology) - Textual Understanding Error\n- Figure 65: Public Health 3 (Epidemiology) - Lack of Knowledge\n\nSo there are 3 Public Health samples showcased.\n\nThe ratio is: 3 / 509 = 0.005892...\n\nRounding to a reasonable number of decimal places:\n\nAnswer: 0.0059", "citations": [], "input_tokens": 79444, "output_tokens": 220, "cost_micros": 241632, "latency_ms": 8359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0059", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the percentage of Public Health samples that are showcased in the paper.\n\nFrom the document, I can find:\n\n1. Total number of Public Health samples in the test set:\n   - From Table 7, Public Health has 509 samples in the test set\n\n2. Number of Public Health samples showcased in the paper:\n   - I need to count the figures in Appendix B that are from Public Health\n   - From the List of Case Study Figures (pages 20-21):\n     - Figure 63: Public Health 1: Correct Case\n     - Figure 64: Public Health 2: Textual Understanding Error\n     - Figure 65: Public Health 3: Lack of Knowledge\n   \n   So there are 3 Public Health samples showcased.\n\n3. Calculate the percentage:\n   - Percentage = (3 / 509) \u00d7 100 = 0.589...\n\nWait, let me verify the total number. Looking at Table 1 (page 4), Public Health has 544 samples total. But the question asks about samples \"in this paper\", which could mean the test set.\n\nFrom Table 7 on page 17, Public Health (test set) = 509 samples.\n\nThe ratio would be: 3/509 = 0.00589...\n\nIf we want this as a percentage: 0.589%\n\nBut the question asks for \"a ratio\" as a decimal number, so:\n\n3/509 = 0.005893\n\nAnswer: 0.0059", "citations": [], "input_tokens": 109195, "output_tokens": 347, "cost_micros": 332790, "latency_ms": 10437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0059", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and calculate the percentage.\n\nFrom the document, I can find information about Public Health samples:\n\n1. First, let me find the total number of Public Health samples in the dataset.\nFrom Table 1 (Key statistics), I can see the total questions is 11,550.\n\nLooking at Figure 3 or the breakdown tables, I need to find Public Health information:\n- From Table 7 (Health & Medicine results), I can see Public Health is listed with 509 samples in the test set.\n- The validation set has 150 samples total for Health & Medicine.\n\nActually, looking more carefully at the tables:\n- Table 7 shows Health & Medicine breakdown with Public Health having 509 samples in the test set\n- There's also validation data\n\nLet me look at the case study section (Appendix B):\nFrom Table 10 (Table index of case study figures by subjects), I can count the Public Health cases:\n- Public Health 1: Correct Case (Figure 63)\n- Public Health 2: Textual Understanding Error (Figure 64)\n- Public Health 3: Lack of Knowledge (Figure 65)\n\nSo there are 3 Public Health cases showcased in the paper.\n\nNow I need to find the total number of Public Health samples:\nFrom Table 7, the Public Health row shows 509 samples in the test set.\n\nThe ratio would be: 3/509 = 0.00589...\n\nWait, let me verify this is asking for the percentage of showcased samples. \n\n3/509 \u00d7 100 = 0.589%\n\nAs a ratio (not percentage): 3/509 = 0.00589\n\nAnswer: 0.00589", "citations": [], "input_tokens": 83158, "output_tokens": 399, "cost_micros": 255459, "latency_ms": 11937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.00589", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and divide by the total number of Public Health samples.\n\nFrom the document:\n\n1. **Total Public Health samples**: Looking at Figure 3, Public Health has 544 samples (4.7% of total).\n\n2. **Public Health samples showcased in the paper**: Looking at Table 10 (Case Study Figures by subjects), I can find the Public Health figures:\n   - Figure 63: Public Health 1: Correct Case\n   - Figure 64: Public Health 2: Textual Understanding Error\n   - Figure 65: Public Health 3: Lack of Knowledge\n\nSo there are 3 Public Health case study figures shown in the paper.\n\nThe ratio is: 3 / 544 = 0.00551...\n\nAnswer: 0.0055", "citations": [], "input_tokens": 103252, "output_tokens": 191, "cost_micros": 312621, "latency_ms": 7812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0055", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find information about 'Public Health' samples and calculate the percentage.Let me search for tables and figures that might contain information about Public Health samples:Now let me look at Figure 3 and tables that show the breakdown of subjects:Perfect! I found the information. Let me read more to make sure I have all the details:Perfect! I found the information in the table. From line 1505, I can see:\n\n\"Public Health (544, 4.7%)\"\n\nThis indicates that:\n- Public Health has 544 samples\n- This represents 4.7% of all samples in the dataset\n\nThe answer to the question \"Among all samples belonging to 'Public Health' subject, how many percentage samples showcased in this paper?\" is 4.7%.\n\nSince the question asks for a ratio (percentage) and wants a single decimal number, the answer is:\n\nAnswer: 4.7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 53796, "error": null, "extra": {"thread_id": 2021, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8dc9bfdb2b5b47d48d7a6fa4dc0574a2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 203, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "4.7", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in the document and count how many fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics-related figures with error categories.\n\nLooking through the case study figures in Appendix B:\n\n1. Figure 22 - Economics: Econometrics - This is a CORRECT case, not an error case.\n\n2. Figure 23 - Economics: Macroeconomics - Error Category: Perceptual Error\n   This case shows GPT-4V making an error in correctly identifying numbers in the table.\n\n3. Figure 24 - Economics: Macroeconomics - Error Category: Perceptual Error\n   This case shows GPT-4V failing to correctly interpret the intersection point in the graph.\n\nLet me verify there are no other Economics cases by checking the Table of Contents (Table 10):\n\nFrom Table 10, I can see:\n- Economics has entries for figures 22, 23, 24\n- Figure 22 is marked as \"Correct Case\"\n- Figure 23 is marked as \"Perceptual Error\"\n- Figure 24 is marked as \"Perceptual Error\"\n\nSo there are 2 Economics-related error cases that fall into the Perceptual Error category.\n\nAnswer: 2", "citations": [], "input_tokens": 79431, "output_tokens": 298, "cost_micros": 242763, "latency_ms": 8734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in the document and count how many fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics error cases:\n\n1. **Figure 23** - Economics: Macroeconomics - Error Category: Perceptual Error\n   - GPT-4V incorrectly attributes numbers to the correct objects for production\n\n2. **Figure 24** - Economics: Macroeconomics - Error Category: Perceptual Error\n   - GPT-4V fails to interpret the image correctly regarding the equilibrium interest rate\n\nLooking through the List of Case Study Figures table (Table 10), I can see:\n- Economics 1: Correct Case (Figure 22)\n- Economics 2: Perceptual Error (Figure 23)\n- Economics 3: Perceptual Error (Figure 24)\n\nSo there are **2** Economics-related error cases that fall into the Perceptual Error category.\n\nAnswer: 2", "citations": [], "input_tokens": 109182, "output_tokens": 227, "cost_micros": 330951, "latency_ms": 8328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in the case study section and count how many fall into the \"Perceptual Error\" category.\n\nLooking through the case study figures, I'll identify Economics-related cases:\n\n1. Figure 22 - Economics: Econometrics - This is a correct case, not an error case.\n\n2. Figure 23 - Economics: Macroeconomics - Error Category: Perceptual Error\n\n3. Figure 24 - Economics: Macroeconomics - Error Category: Perceptual Error\n\nNow let me verify these are all the Economics cases by checking the Table of Contents in the case study section (Table 10):\n\nFrom Table 10:\n- Economics 1: Correct Case (Figure 22/36)\n- Economics 2: Perceptual Error (Figure 23/37)\n- Economics 3: Perceptual Error (Figure 24/38)\n\nSo there are 2 Economics-related error cases that fall into the Perceptual Error category.\n\nAnswer: 2", "citations": [], "input_tokens": 83145, "output_tokens": 237, "cost_micros": 252990, "latency_ms": 7797, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in Appendix B (Case Study) that are categorized as \"Perceptual Error\".\n\nLet me search through the Case Study section for Economics cases:\n\n1. **Figure 22** - Economics: Econometrics - This is a **correct case**, not an error case.\n\n2. **Figure 23** - Economics: Macroeconomics - Error Category: **Perceptual Error** \u2713\n\n3. **Figure 24** - Economics: Macroeconomics - Error Category: **Perceptual Error** \u2713\n\nLet me verify there are no other Economics cases by checking the Table of Contents (Table 10):\n- Economics 1: Correct Case (Figure 22)\n- Economics 2: Perceptual Error (Figure 23)\n- Economics 3: Perceptual Error (Figure 24)\n\nThat's all the Economics cases listed.\n\nTherefore, there are 2 Economics-related error cases that fall into the Error Category of Perceptual Error.\n\nAnswer: 2", "citations": [], "input_tokens": 103239, "output_tokens": 242, "cost_micros": 313347, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for information about Economics-related error cases and Perceptual Error categories in the provided documents.Let me search for content related to \"Economics\" and \"Perceptual Error\" in this document.Let me search for tables or error categories in the document:Now let me look for tables with error categories and subject areas. Let me search more broadly:Let me look at the specific area around line 7247 where there's a question about stocks (Economics-related):Let me search for a table that shows the distribution of error categories by subject area:Let me search for tables that might show error distribution:Let me look for error analysis tables. Let me search around the beginning of the document for methodology or error analysis sections:Let me search for \"Figure\" to find error analysis figures/tables:Great! Let me read the section with Figure 6 which shows error distribution:Good! I can see Figure 6 shows the overall error distribution, but I need to find a breakdown by subject area. Let me look for tables that show error categories by discipline:I can see there's a table showing different subjects and error cases. Let me read around line 5808 where there's an Accounting entry:Perfect! I can see the list of case studies. I can see:\n- Economics 1: Correct Case\n- Economics 2: Perceptual Error\n- Economics 3: Perceptual Error\n\nSo there are 2 Economics cases with Perceptual Error. Let me verify this is the complete list by checking if there are more Economics cases:Perfect! I found Table 10 which shows the case study figures by subjects with associated error categories. Looking at the Economics row:\n\n- Subject:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 94360, "error": null, "extra": {"thread_id": 2022, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ce7c6d6aee6641609ca2f09db6ff3175", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 531, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the subfields for the Energy & Power subject in the MMMU dataset.\n\nLooking at the document, I can find this information in Table 11 on page 111, which lists \"Subfields of each subject.\"\n\nUnder the \"Tech & Engineering\" discipline, I can see the \"Energy & Power\" subject with its subfields listed.\n\nFrom Table 11:\nEnergy & Power: Thermodynamics, Heat Transfer, Fluid Mechanics\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 259135, "output_tokens": 118, "cost_micros": 1557465, "latency_ms": 60233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists subfields of each subject:\n\nFor Energy & Power under the Tech & Engineering discipline, the subfields are:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 79439, "output_tokens": 52, "cost_micros": 239097, "latency_ms": 5375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists all subfields by subject:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 109190, "output_tokens": 35, "cost_micros": 328095, "latency_ms": 5171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document, which lists \"Subfields of each subject,\" I can find the subfields for Energy & Power.\n\nAccording to the table:\n\n**Energy & Power:** Thermodynamics, Heat Transfer, Fluid Mechanics\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 103247, "output_tokens": 73, "cost_micros": 310836, "latency_ms": 4766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find the subfields in the Energy & Power subject.I found two subfields mentioned for Energy and Power. Let me search for more information to ensure I have a complete list.Let me search more specifically for all Energy and Power subfield mentions:Based on the document, I found two subfields explicitly mentioned for Energy and Power:\n1. Fluid Mechanics (Figure 89)\n2. Thermodynamics (Figure 90)\n\nLet me verify if there are any other subfields by looking at tables or lists that might contain more comprehensive information:Perfect! I found the information. In line 1545, it shows the subfields for Energy Power:\n\n\"Fluid Mechanics, Heat Transfer ...\"\n\nLet me search for more complete information about all the subfields:Excellent! I found the complete list. Let me verify this is in a table showing all subfields:Perfect! I found the complete list of subfields for Energy & Power in the table. The subfields are:\n- Thermodynamics\n- Heat Transfer\n- Fluid Mechanics\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 66234, "error": null, "extra": {"thread_id": 2023, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_db9765522b274b60ae516c6ef2df0d73", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 314, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects from the MMMU benchmark.\n\nLooking at the detailed subject breakdown tables in the appendix:\n\nFrom Table 9 (Tech & Engineering results on page 19):\n- Computer Science (Comp. Sci.) column shows GPT-4V(ision) (Playground) achieved **57.1%**\n\nFrom Table 6 (Science results on page 16):\n- Math column shows GPT-4V(ision) (Playground) achieved **45.0%**\n\nThe performance difference is:\n57.1% - 45.0% = 12.1%\n\nAnswer: 12.1", "citations": [], "input_tokens": 259141, "output_tokens": 158, "cost_micros": 1558401, "latency_ms": 61328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects in the MMMU dataset.\n\nLooking at Table 9 (Tech & Engineering results), I can find Computer Science:\n- Computer Science (371): 57.1%\n\nLooking at Table 6 (Science results), I can find Math:\n- Math (505): 45.0%\n\nThe difference is: 57.1% - 45.0% = 12.1%\n\nAnswer: 12.1", "citations": [], "input_tokens": 79445, "output_tokens": 114, "cost_micros": 240045, "latency_ms": 5546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects from the MMMU dataset.\n\nLooking through the tables in the document, I need to find the breakdown by subject for GPT-4V.\n\nFrom Table 6 (Science results), I can see:\n- Math: GPT-4V(ision) (Playground) achieves 45.0%\n\nFrom Table 9 (Tech & Engineering results), I can see:\n- Computer Science (Comp. Sci.): GPT-4V(ision) (Playground) achieves 57.1%\n\nThe performance difference is:\n57.1% - 45.0% = 12.1%\n\nAnswer: 12.1", "citations": [], "input_tokens": 109196, "output_tokens": 167, "cost_micros": 330093, "latency_ms": 7889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science (CS) and Math subjects in the MMMU dataset.\n\nLooking through the document for GPT-4V results by subject:\n\nIn Table 2 (page 6), I can see overall results, but I need to find the breakdown by specific subjects.\n\nLooking at the appendix sections for detailed subject breakdowns:\n\nFrom Table 9 (page 19) - Tech & Engineering results:\n- Computer Science is listed under Tech & Engineering\n- GPT-4V(ision) (Playground) shows: Comp. Sci. = 57.1%\n\nFrom Table 6 (page 16) - Science results:\n- Math is listed under Science\n- GPT-4V(ision) (Playground) shows: Math = 45.0%\n\nLet me verify these numbers are correct by checking the table again:\n\nTable 6 (Science results):\n- GPT-4V(ision) (Playground): Math column shows 45.0\n\nTable 9 (Tech & Engineering results):\n- GPT-4V(ision) (Playground): Comp. Sci. column shows 57.1\n\nThe performance difference is:\n57.1 - 45.0 = 12.1\n\nAnswer: 12.1", "citations": [], "input_tokens": 83159, "output_tokens": 300, "cost_micros": 253977, "latency_ms": 9016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects in the MMMU dataset.\n\nLooking at Table 9 (Tech & Engineering results), I can see Computer Science is listed with GPT-4V achieving **57.1%**.\n\nLooking at Table 6 (Science results), I can see Math is listed with GPT-4V achieving **45.0%**.\n\nThe performance difference is: 57.1 - 45.0 = 12.1\n\nAnswer: 12.1", "citations": [], "input_tokens": 103253, "output_tokens": 119, "cost_micros": 311544, "latency_ms": 6312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find GPT-4V's performance on CS and Math subjects in the MMMU dataset.Let me search for performance tables or results sections that would contain GPT-4V's performance by subject.Let me search for tables with performance metrics:Let me look for a results table that shows performance by subject. Let me read around line 5873 and 5978 where I saw Computer Science and Math mentioned:This appears to be a table of figure references. Let me search for actual performance data tables. Let me look for \"accuracy\" or percentage data:Let me read the beginning of the paper to find the main results table:Let me search for a table with actual performance numbers. Let me look for \"GPT-4V\" with numbers:Let me look for Table 2 which should contain the main results broken down by subject:Good! I found Table 2. Now I need to find the breakdown by individual subjects. The table shows broad disciplines but I need specific subjects like Computer Science and Math. Let me search for another table with subject-level breakdown:Table 9 should have Tech & Engineering which includes Computer Science. Let me read that table:Perfect! I found GPT-4V's Computer Science performance at 57.1% (line 5254). Now I need to find Math performance. Let me check Table 6 for Science subjects:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 132796, "error": null, "extra": {"thread_id": 2024, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b9c01f114dd641199d8ac115ae26b4c6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 506, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "-4.0", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the different icons shown as different image types.\n\nLooking at Figure 1, which shows an \"Overview of the MMMU dataset,\" I can see it displays various categories:\n\n1. **Comprehensive Disciplines** - showing a pie chart with percentages for different disciplines (Engineering 26%, Art & Design 11%, Business 14%, Medicine 17%, Science 23%, Humanities & Social Sci. 9%)\n\n2. **Heterogeneous Image Types** - listing various types: Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\n\n3. **Interleaved Text and Images** - showing an example with 3 medical images\n\n4. **Expert-level Skills Test** - showing three categories:\n   - Perception (with an icon)\n   - Knowledge (with an icon)\n   - Reasoning (with an icon)\n\nThe question asks about \"different icons shown as different image types\" in Figure 1. Looking at the heterogeneous image types section, I can count the distinct image type categories mentioned (not just listed as text, but representing different types):\n\nFrom the visible list and description: Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics (and the ellipsis suggests more).\n\nThat's 12 explicitly mentioned types, but the question asks about icons shown. The figure shows examples and visual representations rather than just listing them.\n\nLooking more carefully at Figure 1, I need to count the actual different image type examples/icons displayed, not just mentioned in text.\n\nAnswer: 12", "citations": [], "input_tokens": 79427, "output_tokens": 403, "cost_micros": 244326, "latency_ms": 11984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the different icons shown as different image types.\n\nLooking at Figure 1, which is titled \"Overview of the MMMU dataset,\" I can see it displays four main sections:\n\n1. **Comprehensive Disciplines** - showing a pie chart with 6 disciplines\n2. **Heterogeneous Image Types** - showing various image type icons\n3. **Interleaved Text and Images** - showing a sample question with images\n4. **Expert-level Skills Test** - showing three overlapping circles for Perception, Knowledge, and Reasoning\n\nIn the \"Heterogeneous Image Types\" section, I can count the following distinct icons representing different image types:\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Chemical (structures)\n5. Photos/Photographs\n6. Paintings\n7. Geometric (shapes)\n8. Music (sheet music)\n9. Medical (images)\n\nLet me recount more carefully by examining the visual elements shown in that section. The figure shows various small icons/symbols representing different image types in a grid-like arrangement.\n\nCounting the distinct image type icons shown: 9 different icons are displayed representing different image types (Diagrams, Tables, Plots and Charts, Chemical Structures, Photographs, Paintings, Geometric shapes, Music/Sheet Music, and Medical Images).\n\nAnswer: 9", "citations": [], "input_tokens": 109178, "output_tokens": 314, "cost_micros": 332244, "latency_ms": 10828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to look at Figure 1 in the document to count the different icons shown as different image types.\n\nLooking at Figure 1, which shows an \"Overview of the MMMU dataset,\" I can see it displays four main characteristics:\n\n1. **Comprehensive Disciplines** - showing 6 disciplines with percentages:\n   - Engineering (26%)\n   - Art & Design (11%)\n   - Business (14%)\n   - Science (23%)\n   - Humanities & Social Sci. (9%)\n   - Medicine (17%)\n\n2. **Heterogeneous Image Types** - listing various types like: Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Music, Geometric, Pathology images, Microscopic Images, Comics, \u2026\n\n3. **Interleaved Text and Images** - showing an example question with images\n\n4. **Expert-level Skills Test** - listing various types of reasoning and knowledge\n\nThe figure shows icons/visual representations for these categories, but the question asks specifically about \"different icon shown as different image types.\"\n\nLooking at the \"Heterogeneous Image Types\" section in Figure 1, I can count the distinct image type categories mentioned:\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nAnd there's an ellipsis (\u2026) indicating more types.\n\nHowever, the question asks about icons shown in Figure 1. Looking at the visual representation, the heterogeneous image types section appears to show example icons or representations of these different types.\n\nBased on the visible list in the \"Heterogeneous Image Types\" section of Figure 1, I can count 12 distinct image type categories explicitly listed before the ellipsis.\n\nAnswer: 12", "citations": [], "input_tokens": 83141, "output_tokens": 438, "cost_micros": 255993, "latency_ms": 11875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine Figure 1 to count the different icons shown as different image types.\n\nLooking at Figure 1, which shows an \"Overview of the MMMU dataset,\" I can see four main sections:\n\n1. **Comprehensive Disciplines** - Shows a pie chart with 6 disciplines\n2. **Heterogeneous Image Types** - This section lists various image types\n3. **Interleaved Text and Images** - Shows an example question with images\n4. **Expert-level Skills Test** - Shows a diagram with arrows\n\nIn the \"Heterogeneous Image Types\" section, I can see a list of different image types mentioned:\n- Diagrams\n- Tables\n- Plots and Charts\n- Photographs\n- Chemical Structures\n- Paintings\n- Medical Images\n- Sheet Music\n- Geometric\n- Pathology images\n- Microscopic Images\n- Comics\n- (and the list continues with \"...\")\n\nHowever, the question asks about \"icons shown as different image types.\" Looking at the figure more carefully, I don't see actual icons representing each image type - rather, I see text labels listing the image types.\n\nLet me reconsider - perhaps the question is asking about visual representations or examples shown. In Figure 1, I can see:\n- A pie chart (1 type of visualization)\n- Example images in the \"Interleaved Text and Images\" section (3 medical scan images)\n- A flow diagram in the \"Expert-level Skills Test\" section\n\nBut this doesn't seem to match what the question is asking either.\n\nRe-reading the question: \"How many different icon are shown as different image types in Figure 1?\"\n\nLooking at the figure description more carefully, in the \"Heterogeneous Image Types\" section, there appears to be a list of image types. The text mentions: \"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nCounting the explicitly listed items before the \"...\": 12 different image types are mentioned.\n\nAnswer: 12", "citations": [], "input_tokens": 103235, "output_tokens": 475, "cost_micros": 316830, "latency_ms": 12858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and count the different icons shown as different image types.Let me search for \"Figure 1\" in the document:Now let me read the beginning of the document content to find Figure 1:Good, I can see Figure 1 information. Let me continue reading to see the complete description of the heterogeneous image types shown in Figure 1:Perfect! I can see the \"Heterogeneous Image Types\" section in Figure 1. The document lists the following image types:\n\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nThe text shows \"...\" after Comics, but I need to verify if there are icons shown for each type. Let me search for more information about Figure 1 to see if there's a visual description that tells us how many icons are actually displayed:Looking back at the text from Figure 1, I can see it lists these image types under \"Heterogeneous Image Types\":\n\n\"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nLet me count the distinct types mentioned:\n1. Diagrams\n2. Tables\n3. Plots and Charts (this appears to be one category)\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nHowever, I need to verify if the figure actually shows icons for each. Let me look for embedded images or visual descriptions around Figure 1 to see if there are actual icon images:I see that Im1.png is an embedded image showing a hierarchical flowchart. That's not the heterogeneous image types. Let me search more carefully through the document. Let me look for more embedded images around Figure 1:Let me look more carefully at the beginning of the document. Im1.png appeared right after the figure caption for Figure 1. Let me look at Figure 1 more systematically. Let me check if there are multiple images that are part of Figure 1:I see. The Figure 1 section ends at line 1160 with `</figure>`, and then there are embedded images. The text shows <image 1>, <image 2>, <image 3> within the figure. Let me search more systematically. The figure lists:\n\n\"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nBut I need to see if there are icons displayed. Let me search for where the actual icons might be described. Let me look for \"icon\" in the document:Perfect! I found it. At line 1265, there's a reference to a camera icon. Let me read that section more carefully since it's right after Figure 1:Good! So now I can see there are embedded images that are icons representing different image types. Let me systematically go through and look for all the icons that are part of Figure 1. The document structure shows Im11.png is a camera icon and Im12.png is a geometric icon. Let me look back at the text of Figure 1 to understand the layout better. Looking at the document, it seems the figure starts at line 1097 and the text lists image types. Let me search for more icon descriptions nearby:I see that after the icon descriptions (Im11.png - camera, Im12.png - geometric), the document moves to Figure 3 (a table). So the icons are between Figure 1 and Figure 3. Let me go back and look more carefully at the images embedded right after Figure 1 ends. Let me look for all embedded images between line 1160 (where Figure 1 ends) and line 1455 (where Figure 3 starts):I see Im13.png is a germ/pathogen icon. That's another icon in the series. Let me systematically look at all embedded images between Figure 1 ending and where we transition to other figures. Let me search systematically from line 1160 to 1860:\n\nBased on what I've seen:\n- Im1.png (line 1164): Flowchart diagram\n- Im11.png (line 1256): Camera icon (Photographs)\n- Im12.png (line 1428): Geometric icon\n- Im13.png (line 1863): Germ/pathogen icon\n\nLet me continue looking for more icons. Let me search for the next embedded images:Good! Im14.png is a microscope icon. Let me continue to find more icons. Let me look at the next embedded images. Let me search more systematically. Based on what the abstract says: \"comprising 30 highly heterogeneous image types\" - but Figure 1 specifically lists these types. Let me look at the section with the image types more carefully. I need to look back at line 2501 which says \"Music\" and \"Medical\" - these might be labels for icons:Ah! Figure 5 shows different types of images: Diagrams, Tables, Charts, Chemical, Photos, Paintings, Geometric, Music, Medical. That's 9 types. But this is Figure 5, not Figure 1. Let me go back and look very carefully at what exactly is in Figure 1. Let me re-read the text in Figure 1:\n\nFrom lines 1134-1138, the text explicitly states under \"Heterogeneous Image Types\":\n\"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nSo let me count these carefully:\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nBut the question asks about \"different icons shown as different image types in Figure 1\". I need to look back at Figure 1 itself more carefully. Let me check if there are actually icons displayed IN Figure 1, not just listed. Let me re-examine the structure. The figure tag starts at 1097 and ends at 1160. Let me look at what images are referenced within that figure:I see! Within the figure (lines 1097-1160), there are references to `<image 1>`, `<image 2>`, and `<image 3>` (lines 1113-1115 as part of a question, and again at lines 1154-1158). But the key section is \"Heterogeneous Image Types\" which lists the types. However, the question asks about icons shown as different image types. \n\nThe text lists image types but doesn't specify how many icons are actually displayed visually. Based on the embedded images that come right after Figure 1:\n- Im1.png (flowchart - but this appears to be an example diagram, not an icon representing a type)\n- Im11.png (camera icon - represents Photographs)\n- Im12.png (geometric shapes icon - represents Geometric)\n- Im13.png (germ icon - could represent Medical/Pathology/Microscopic)\n- Im14.png (microscope icon - represents Microscopic/Medical)\n\nBut I need to find ALL the icons that are specifically showing different image types in Figure 1. Let me look more systematically at what comes after Figure 1 but before Figure 2 (which starts at line 1289). All embedded images between line 1160 and line 1289 might be icons for Figure 1.\n\nFrom my reading:\n- Line 1164: Im1.png (flowchart diagram - this might be part of the figure illustration)\n- Line 1256: Im11.png (camera icon)\n- Line 1428: Im12.png (geometric shapes)\n\nLet me check if there are other embedded images I missed before Figure 2:\n\nLooking at the embedded image list I got earlier, between lines 1160 and 1289 I have:\n- Im1.png at line 1164\n- Im11.png at line 1256\n- Im12.png at line 1428 (but line 1289 is Figure 2, so this is after Figure 2 starts)\n\nWait, let me recheck. Figure 2 caption is at line 1289. So Im12.png at line 1428 is after Figure 2. Let me look more carefully:So between Figure 1 (ending at line 1160) and Figure 2 (starting at line 1288), there are only 2 embedded images:\n- Im1.png at line 1164\n- Im11.png at line 1256\n\nBut actually, looking at the visual descriptions, Im1.png is described as a \"Hierarchical flowchart/organizational diagram\" which seems to be an example diagram that's PART of Figure 1, not an icon representing an image type.\n\nIm11.png is clearly described as a \"Stylized camera icon\" which represents the Photographs image type.\n\nLet me think about this differently. Maybe the icons are all embedded within the actual Figure 1 structure, but the XML parsing has extracted them as separate images. Looking at the text in Figure 1, it lists types under \"Heterogeneous Image Types\". Perhaps in the original PDF, there are visual icons shown alongside or above these text labels, and those icons have been extracted as separate embedded images.\n\nGiven the timeframe and what I can see, let me search for a clearer pattern. Looking at Figure 5 (line 2475) which shows \"Performance of models on different types of images\" and lists: Diagrams, Tables, Charts, Chemical, Photos, Paintings, Geometric, Music, Medical - that's 9 types.\n\nBut I need to answer specifically about Figure 1. Let me think about the structure: Figure 1 has a section called \"Heterogeneous Image Types\" that lists multiple types. The question asks how many different icons are shown. \n\nLooking at the abstract again (line 1212): \"comprising 30 highly heterogeneous image types, such as charts, diagrams, maps, tables, music sheets, and chemical structures.\"\n\nBut in Figure 1's \"Heterogeneous Image Types\" section, it explicitly lists: \"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nIf I count unique types (treating \"Plots and Charts\" as one type):\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nThat's 12 distinct types listed (plus \"...\" suggesting more).\n\nHowever, Figure 5 later in the paper shows icons for 9 types. Perhaps Figure 1 shows icons for these 9 main types as well. Let me verify Figure 5 shows exactly 9 by re-reading:\n\nFrom line 2487-2503 in Figure 5:\n1. Diagrams\n2. Tables\n3. Charts\n4. Chemical\n5. Photos\n6. Paintings\n7. Geometric\n8. Music\n9. Medical\n\nThat's exactly 9 types in Figure 5.\n\nGiven the typical structure of such figures and the pattern I see, my best answer is that Figure 1 shows icons for **9** different image types.\n\nAnswer: 9", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 192280, "error": null, "extra": {"thread_id": 2025, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f360703be6ba4537aef7760373d79d2f", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1273, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts included in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Contains text and references but no charts visible\n\n**Page 6:**\n- Table 2: \"Overall results of different models on the MMMU validation and test set\" - This is a table, not a chart\n\n**Page 7:**\n- Figure 5: \"Performance of models on different types of images\" - This is a bar chart\n- Table 3: \"Result decomposition across question difficulty levels\" - This is a table, not a chart\n\n**Page 8:**\n- Figure 6: \"Error distribution over 150 annotated GPT-4V errors\" - This is a pie chart\n- Figure 7: Shows an example case with images and text - Not a chart\n\n**Page 9:**\n- Contains text discussion, no charts\n\n**Page 10:**\n- Contains text and references, no charts\n\nCounting the charts:\n- Page 7: Figure 5 (bar chart) = 1 chart\n- Page 8: Figure 6 (pie chart) = 1 chart\n\nTotal charts in pages 5-10: 2\n\nAnswer: 2", "citations": [], "input_tokens": 79424, "output_tokens": 276, "cost_micros": 242412, "latency_ms": 7796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I'll carefully count all charts in Pages 5-10.\n\n**Page 5:**\n- Figure 2: This shows sample MMMU examples in a table format, not a chart\n- No charts on this page\n\n**Page 6:**\n- Table 2: This is a table showing model results, not a chart\n- No charts on this page\n\n**Page 7:**\n- Figure 5: This is a bar chart showing \"Performance of models on different types of images\"\n- Table 3: This is a table, not a chart\n- Count: 1 chart\n\n**Page 8:**\n- Figure 6: This is a pie chart showing \"Error distribution over 150 annotated GPT-4V errors\"\n- Figure 7: This shows an example with illustrations but is not a chart\n- Count: 1 chart\n\n**Page 9:**\n- No figures or charts on this page (only references text)\n\n**Page 10:**\n- No figures or charts on this page (only references text)\n\nTotal charts in Pages 5-10: 1 (Figure 5) + 1 (Figure 6) = 2\n\nAnswer: 2", "citations": [], "input_tokens": 109175, "output_tokens": 264, "cost_micros": 331485, "latency_ms": 9110, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts (plots, graphs, tables, diagrams) that appear in pages 5-10 of the document.\n\nLet me go through pages 5-10:\n\n**Page 5:**\n- Table 2: Overall results of different models (this is a large table)\n- Count: 1\n\n**Page 6:**\n- Table 3: Result decomposition across question difficulty levels\n- Figure 5: Performance of models on different types of images (this is a bar chart)\n- Count: 2\n\n**Page 7:**\n- Figure 6: Error analysis pie chart showing distribution of error types\n- Figure 7: Example error case with text and an image (not a chart)\n- Count: 1\n\n**Page 8:**\n- Continuation of references, no charts\n\n**Page 9:**\n- Continuation of references, no charts\n\n**Page 10:**\n- References continue, no charts\n\nTotal charts in pages 5-10:\n- Page 5: 1 (Table 2)\n- Page 6: 2 (Table 3, Figure 5)\n- Page 7: 1 (Figure 6)\n- Pages 8-10: 0\n\nTotal: 1 + 2 + 1 + 0 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 83138, "output_tokens": 300, "cost_micros": 253914, "latency_ms": 8781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count all charts appearing in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Table 1: Key statistics of the MMMU benchmark\n\n**Page 6:**\n- Table 2: Overall results of different models on the MMMU validation and test set\n\n**Page 7:**\n- Table 3: Result decomposition across question difficulty levels\n- Figure 5: Performance of models on different types of images (this is a table/chart)\n\n**Page 8:**\n- Figure 6: Error distribution over 150 annotated GPT-4V errors (this is a table/chart)\n- Figure 7: A sample error case (this is an example, not a chart)\n\n**Page 9:**\n- No charts (this page discusses conclusion and references begin)\n\n**Page 10:**\n- No charts (continues references section)\n\nLet me recount more carefully by looking at what constitutes a \"chart\":\n\nPages 5-10:\n- Page 5: Table 1 (1 chart)\n- Page 6: Table 2 (1 chart)\n- Page 7: Table 3 (1 chart), Figure 5 which shows a table of performance data (1 chart)\n- Page 8: Figure 6 showing error distribution as a table (1 chart), Figure 7 is an example case study, not a chart\n- Pages 9-10: References, no charts\n\nTotal charts: 5\n\nAnswer: 5", "citations": [], "input_tokens": 103232, "output_tokens": 346, "cost_micros": 314886, "latency_ms": 10516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I'll read the document to count the charts in pages 5-10.Let me search for page markers to identify pages 5-10:Let me read through the document to find page markers:I can see page markers. Let me continue searching for pages 5-10:Perfect! Now I know the line numbers for pages 5-10. Let me read those pages to count the charts:Now let me search for charts/figures systematically in pages 5-10. Let me look for figure markers:Now let me carefully count the charts in pages 5-10. Let me search for \"Figure\" markers in this range:\n\nBased on my reading of pages 5-10 (lines 2071-3184), I can identify the following charts:\n\n**Page 5** (starts at line 2071):\n- No charts (only text and Table 2)\n\n**Page 6** (starts at line 2462):\n- **Figure 5**: \"Performance of models on different types of images\" - this is a bar chart showing performance metrics\n\n**Page 7** (starts at line 2687):\n- **Figure 6**: \"Error distribution over 150 annotated GPT-4V errors\" - this is a pie chart showing error percentages\n- **Figure 7**: Shows a perceptual error example (not a chart, but a diagram/illustration)\n\n**Page 8** (starts at line 2912):\n- No charts (only text starting with Conclusion section)\n\nLet me continue reading pages 9 and 10:**Page 9** (starts at line 3045): References section - no charts\n\n**Page 10** (starts at line 3184): References section - no charts\n\nNow let me verify my count by carefully reviewing what I found:\n\n**Summary of charts in Pages 5-10:**\n\n- **Page 5**: 0 charts (contains Table 2, which is a table, not a chart)\n- **Page 6**: 1 chart (Figure 5 - bar chart)\n- **Page 7**: 1 chart (Figure 6 - pie chart) + Figure 7 (which is an illustration/example, not a chart)\n- **Page 8**: 0 charts (text only)\n- **Page 9**: 0 charts (references)\n- **Page 10**: 0 charts (references)\n\nTotal charts in pages 5-10: **2 charts**\n\n(Figure 5 is a bar chart and Figure 6 is a pie chart. Figure 7 is not a chart but an illustrative example showing a perceptual error case.)\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 79750, "error": null, "extra": {"thread_id": 2026, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8497190a9f7e4c40b3088d1ce4cbc759", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 453, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which compares MMMU with other existing benchmarks across two dimensions - Breadth (Knowledge) and Depth (Reasoning).\n\nThe figure shows various datasets plotted on these two axes:\n- VQA, GQA, VisWiz (basic breadth and depth)\n- TextVQA, MMBench, SEED (moderate breadth)\n- MM-Vet, ScienceQA (moderate breadth)\n- OKVQA (moderate breadth)\n- MMMU (positioned highest on both breadth and depth)\n\nAccording to the figure and the accompanying table, MMMU covers:\n- 11.5K questions\n- 30 Types of images\n- 6 disciplines, 30 subjects, and 183 subfields\n\nThis is explicitly described in the caption as excelling \"in both its breadth to cover a wide range of disciplines and its depth to test LMMs' reasoning abilities.\"\n\nAnswer: MMMU", "citations": [], "input_tokens": 259139, "output_tokens": 228, "cost_micros": 1559964, "latency_ms": 60530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the paper, I need to examine which dataset covers the broadest range of disciplines and knowledge areas.\n\nFrom the figure, the datasets compared are:\n- VQA\n- GQA\n- VisWiz\n- TextVQA\n- OKVQA\n- SEED\n- MMBench\n- MM-Vet\n- ScienceQA\n- MMMU\n\nThe figure shows that MMMU covers:\n- 6 disciplines\n- 30 subjects\n- 183 subfields\n- 11.5K questions\n\nIn comparison, other datasets are more limited in scope. For example, ScienceQA covers specific subjects but is noted as being at elementary to middle school level. The other datasets (VQA, GQA, etc.) focus on specific task types rather than broad disciplinary coverage.\n\nThe paper explicitly states that MMMU \"covers 30 subjects across 6 disciplines\" and emphasizes its \"breadth\" across multiple disciplines including Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.\n\nAnswer: MMMU", "citations": [], "input_tokens": 79443, "output_tokens": 256, "cost_micros": 242169, "latency_ms": 8655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which compares MMMU with other existing benchmarks across breadth (knowledge) and depth (reasoning) dimensions:\n\nThe figure shows various datasets plotted, with MMMU positioned to excel in both breadth and depth. Among the nine different datasets shown (VQA, GQA, VizWiz, TextVQA, OKVQA, SEED, MMBench, MM-Vet, ScienceQA, and MMMU), MMMU is positioned highest on the breadth axis.\n\nAnswer: MMMU", "citations": [], "input_tokens": 109194, "output_tokens": 132, "cost_micros": 329562, "latency_ms": 7141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which dataset covers the most breadth of knowledge.\n\nThe figure shows a comparison of different datasets along two axes:\n- Breadth (Knowledge) - horizontal axis\n- Depth (Reasoning) - vertical axis\n\nFrom the figure, the datasets shown include:\n- VQA\n- GQA\n- VisWiz\n- TextVQA\n- OKVQA\n- SEED\n- MMBench\n- MM-Vet\n- ScienceQA\n- MMMU\n\nLooking at the horizontal positioning (breadth), MMMU is positioned furthest to the right, indicating it has the greatest breadth of knowledge coverage among the datasets shown. This is also supported by the text which states MMMU covers \"30 Types\" of images and spans across 6 disciplines, 30 subjects, and 183 subfields.\n\nAnswer: MMMU", "citations": [], "input_tokens": 83157, "output_tokens": 210, "cost_micros": 252621, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which dataset represents the most breadth (knowledge) on the x-axis.\n\nThe scatter plot shows various datasets plotted with \"Breadth (Knowledge)\" on the x-axis and \"Depth (Reasoning)\" on the y-axis. MMMU is shown as a star at the highest position for both breadth and depth.\n\nAmong the datasets shown in Figure 4, MMMU is positioned furthest to the right on the Breadth axis, indicating it has the most breadth of knowledge coverage.\n\nAnswer: MMMU", "citations": [], "input_tokens": 103251, "output_tokens": 132, "cost_micros": 311733, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "I'll read the document to find Figure 4 and identify which dataset reflects the most breadth of knowledge.Let me search for Figure 4 in the document:Good! I found the caption for Figure 4. Now let me read the actual content around Figure 4 to see the details about which dataset reflects the most breadth of knowledge:Now let me look at the actual Figure 4 to see the visual comparison. Let me search for more context about breadth of knowledge in the figure:Let me read the section that shows the actual Figure 4 visualization:Perfect! I can see Figure 4 shows a scatter plot with \"Breadth (Knowledge)\" on one axis and \"Depth (Reasoning)\" on the other axis. The figure shows the following datasets plotted:\n- MMMU\n- VQA\n- GQA\n- VisWiz\n- TextVQA\n- MMBench\n- SEED\n- MM-Vet\n- ScienceQA\n- OKVQA\n\nFrom the figure structure (lines 1834-1858), I can see \"Breadth (Knowledge)\" is labeled as one of the axes. The caption states \"MMMU excels in both its breadth to cover a wide range of disciplines and its depth to test LMMs' reasoning abilities.\"\n\nLooking at the text in chunk 156205 (lines 2022-2029), it explicitly states: \"From the breadth perspective, the prior benchmarks are heavily focused on daily knowledge and common sense. The covered image format is also limited. Our benchmark aims to cover college-level knowledge with 30 image formats...\"\n\nThis indicates that **MMMU** reflects the most breadth of knowledge among the nine datasets shown in Figure 4.\n\nAnswer: MMMU", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 57452, "error": null, "extra": {"thread_id": 2027, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e1415f7fe2384e1ba85595cd09d85b89", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 282, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "I need to look at Figure 6, which shows the error distribution over 150 annotated GPT-4V errors.\n\nFrom Figure 6, I can see the following error categories and their percentages:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nThe top-3 error types are:\n1. Perceptual Error (35%)\n2. Lack of Knowledge (29%)\n3. Reasoning Error (26%)\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 79448, "output_tokens": 174, "cost_micros": 240954, "latency_ms": 5937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 in the document, which shows the error analysis of 150 GPT-4V error cases:\n\nThe distribution shows:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding Error: 6%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 83162, "output_tokens": 122, "cost_micros": 251316, "latency_ms": 5827, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 which shows the error distribution over 150 annotated GPT-4V errors:\n\n1. Perceptual Error: 35%\n2. Lack of Knowledge: 29%\n3. Reasoning Error: 26%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 103256, "output_tokens": 77, "cost_micros": 310923, "latency_ms": 6108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "I'll read the document to find Figure 6 and identify the top-3 error types over 150 annotated GPT-4V errors.Let me search for Figure 6 in the document:Now let me read the section around line 2692 where Figure 6 is located:Perfect! I found Figure 6 which shows the error distribution over 150 annotated GPT-4V errors. Looking at the data:\n\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nThe top-3 error types are:\n1. Perceptual Error (35%)\n2. Lack of Knowledge (29%)\n3. Reasoning Error (26%)\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 33125, "error": null, "extra": {"thread_id": 2028, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c246c6154fa7469892ccf49cfee50145", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 158, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll carefully examine pages 95-100 (Figures 82-87) to identify all error types mentioned in the case studies.\n\nLooking at each figure in the specified range:\n\n**Figure 82** (Architecture and Engineering: Civil Engineering) - Page 96\n- Error Category: Reasoning Error\n\n**Figure 83** (Computer Science: Data Structure and Algorithm) - Page 97\n- This is a correct case, no error category\n\n**Figure 84** (Computer Science: Compiler Principle) - Page 98\n- Error Category: Perceptual Error, Lack of Knowledge\n\n**Figure 85** (Computer Science: Operating System) - Page 99\n- Error Category: Perceptual Error\n\n**Figure 86** (Computer Science: Compiler Principle) - Page 100\n- Error Category: Perceptual Error\n\n**Figure 87** (Electronics: Signal Processing) - Page 101\n- This is a correct case, no error category\n\nSo the error types mentioned in pages 95-100 are:\n- Reasoning Error\n- Perceptual Error\n- Lack of Knowledge\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 79440, "output_tokens": 265, "cost_micros": 242295, "latency_ms": 8516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll carefully examine pages 95-100 of the document to identify all error types mentioned in the case studies.\n\nLooking through the figures on these pages:\n\n**Page 95 (Figure 81):** Architecture and Engineering - Correct case (no error type)\n\n**Page 96 (Figure 82):** Architecture and Engineering - **Reasoning Error**\n\n**Page 97 (Figure 83):** Computer Science - Correct case (no error type)\n\n**Page 98 (Figure 84):** Computer Science - **Perceptual Error, Lack of Knowledge**\n\n**Page 99 (Figure 85):** Computer Science - **Perceptual Error**\n\n**Page 100 (Figure 86):** Computer Science - **Perceptual Error**\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 109191, "output_tokens": 192, "cost_micros": 330453, "latency_ms": 8281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll search through pages 95-100 of the document for error types mentioned in case studies.\n\nLooking at the case studies in this range:\n\nPage 95 (Figure 81 - Architecture and Engineering): No error type specified (correct case)\n\nPage 96 (Figure 82 - Architecture and Engineering): **Reasoning Error**\n\nPage 97 (Figure 83 - Computer Science): No error type specified (correct case)\n\nPage 98 (Figure 84 - Computer Science): **Perceptual Error, Lack of Knowledge**\n\nPage 99 (Figure 85 - Computer Science): **Perceptual Error**\n\nPage 100 (Figure 86 - Computer Science): **Perceptual Error**\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 83154, "output_tokens": 184, "cost_micros": 252222, "latency_ms": 6655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "Looking through pages 95-100 of the document (Figures 82-86 in the case studies):\n\nFigure 82 (Architecture and Engineering: Civil Engineering) - **Reasoning Error**\n\nFigure 83 (Computer Science: Data Structure and Algorithm) - No error type mentioned (this is a correct case)\n\nFigure 84 (Computer Science: Compiler Principle) - **Perceptual Error, Lack of Knowledge**\n\nFigure 85 (Computer Science: Operating System) - **Perceptual Error**\n\nFigure 86 (Computer Science: Compiler Principle) - **Perceptual Error**\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 103248, "output_tokens": 158, "cost_micros": 312114, "latency_ms": 11889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll read the document to find the error types mentioned in the case studies on pages 95-100.Let me search for page numbers to locate pages 95-100:Let me search for \"case stud\" to find the case studies section:Let me read around line 5765 where Table 10 mentions case study figures with error categories:Good! I found Table 10 which shows the error categories. The columns are: Correct Case, Perception, Lack of Knowledge, Reasoning, and Other. Let me continue reading to see the complete table:Good! I found that the error categories are: Perception, Lack of Knowledge, Reasoning, and Other (plus \"Correct Case\" which is not an error). Now I need to find the actual case study pages 95-100. Let me search for figure 95 or case 95:Let me search for the case studies section more specifically. Let me look for the pattern of case numbers 95-100:Figure 96 is something different. Let me check the structure - from the earlier search, I saw \"90 Energy and Power 2: Reasoning Error 104\" and \"95 Mechanical Engineering 3: Re\". Let me search for these patterns directly:Good! So these are Figure numbers 90-95. Let me read that section where it lists figures 90-100:I can see figures 90-95 here. Now I need to find what comes after figure 95. Let me search earlier in the document to see the complete list:This appears to be a table of contents or index. I need to understand better - these numbers (95-100) might refer to page numbers or figure numbers. Let me look at this list more carefully. I can see:\n\n- Page 95: Figure 81 Architecture and Engineering 1: Correct Case\n- Page 96: Figure 83 Computer Science 1: Correct Case  \n- Page 97: ?\n- Page 98: Figure 84 Computer Science 2: Perceptual Error, Lack of Knowledge\n- Page 99: Figure 85 Computer Science 3: Perceptual Error\n- Page 100: Figure 86 Computer Science 4: Perceptual Error\n\nBut I need to verify this interpretation. Let me search for where these actual figures appear in the document:Perfect! Now let me read from page 95 to page 100. Let me find line numbers around page 95:Good, I'm at page 95. Now let me continue reading through pages 95-100:Good! I'm now at page 97. Let me continue reading to get to page 100:Good! I'm at page 99. Let me continue to page 100:Good! Now let me continue to see the rest of page 100:Perfect! Now I have reviewed pages 95-100. Let me compile all the error types I found in the case studies on these pages:\n\nFrom the case studies I read on pages 95-100:\n\n**Page 95:**\n- Figure 81: Correct Case (not an error)\n\n**Page 96:**\n- Figure 82: Reasoning Error\n\n**Page 97:**\n- Figure 83: Correct Case (not an error)\n\n**Page 98:**\n- Figure 84: Perceptual Error, Lack of Knowledge\n\n**Page 99:**\n- Figure 85: Perceptual Error\n\n**Page 100:**\n- Figure 86: Perceptual Error\n- Figure 87: Correct Case (not an error)\n\nNow let me verify by looking back at the table of contents I saw earlier to ensure I have the right mapping. Let me check once more:Perfect! This confirms my interpretation. The table shows:\n- Figure 81 is on pages 94-95\n- Figure 82 is on page 96 (spanning from before)\n- Figure 83 is on pages 96-97\n- Figure 84 is on page 98\n- Figure 85 is on page 99\n- Figure 86 is on page 100\n- Figure 87 is on page 101 (starting at page 100)\n\nSo the error types mentioned in case studies on pages 95-100 are:\n\n1. **Reasoning Error** (Figure 82 - Architecture and Engineering)\n2. **Perceptual Error** (Figure 84, 85, 86 - Computer Science)\n3. **Lack of Knowledge** (Figure 84 - Computer Science)\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 114937, "error": null, "extra": {"thread_id": 2029, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_77fb85ac92354e66b7280742dc24bcdb", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 755, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to examine pages 65-70 to count the successful cases (marked with a checkmark \u2713).\n\n**Page 65** (Figure 51): Clinical Medicine: Pathology - **Correct case** \u2713\n\n**Page 66** (Figure 52): Clinical Medicine: Pathology - **Correct case** \u2713\n\n**Page 67** (Figure 53): Clinical Medicine: Pathology - **Correct case** \u2713\n\n**Page 68** (Figure 54): Clinical Medicine: Pathology - **Error case** \u2717 (Perceptual Error)\n\n**Page 69** (Figure 55): Clinical Medicine: Heart Disease - **Error case** \u2717 (Lack of Knowledge)\n\n**Page 70** (Figure 56): Diagnostics and Laboratory Medicine: Radiology - **Correct case** \u2713\n\nCounting the successful cases (those with \u2713):\n- Page 65: 1\n- Page 66: 1\n- Page 67: 1\n- Page 68: 0\n- Page 69: 0\n- Page 70: 1\n\nTotal: 1 + 1 + 1 + 0 + 0 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 259120, "output_tokens": 295, "cost_micros": 1561358, "latency_ms": 64484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful (correct) cases in pages 65-70 of the document.\n\nLet me go through each figure in this range:\n\n**Page 65:**\n- Figure 51: Clinical Medicine: Pathology - \"A sample **correct** case\" \u2713\n\n**Page 66:**\n- Figure 52: Clinical Medicine: Pathology - \"A sample **correct** case\" \u2713\n\n**Page 67:**\n- Figure 53: Clinical Medicine: Pathology - \"A sample **correct** case\" \u2713\n\n**Page 68:**\n- Figure 54: Clinical Medicine: Pathology - \"A sample **error** case\" \u2717\n\n**Page 69:**\n- Figure 55: Clinical Medicine: Heart Disease - \"A sample **error** case\" \u2717\n\n**Page 70:**\n- Figure 56: Diagnostics and Laboratory Medicine: Radiology - \"A sample **correct** case\" \u2713\n\nCounting the correct cases: Pages 65, 66, 67, and 70 each have one correct case.\n\nTotal successful cases = 4\n\nAnswer: 4", "citations": [], "input_tokens": 79424, "output_tokens": 257, "cost_micros": 242127, "latency_ms": 7094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases (marked with \u2713 or \"V\") in Pages 65-70.\n\nLet me go through each figure in this page range:\n\n**Page 65:**\n- Figure 51: Clinical Medicine 1: Correct Case \u2713\n\n**Page 66:**\n- Figure 52: Clinical Medicine 2: Correct Case \u2713\n\n**Page 67:**\n- Figure 53: Clinical Medicine 3: Correct Case \u2713\n\n**Page 68:**\n- Figure 54: Clinical Medicine 4: Perceptual Error \u2717\n\n**Page 69:**\n- Figure 55: Clinical Medicine 5: Lack of Knowledge \u2717\n\n**Page 70:**\n- Figure 56: Diagnostics and Lab Medicine 1: Correct Case \u2713\n\nCounting the successful cases (marked with \u2713):\n- Page 65: 1\n- Page 66: 1\n- Page 67: 1\n- Page 68: 0\n- Page 69: 0\n- Page 70: 1\n\nTotal: 1 + 1 + 1 + 0 + 0 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 109175, "output_tokens": 280, "cost_micros": 331725, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases in Pages 65-70 of the document.\n\nLet me go through each page:\n\n**Page 65 (Clinical Medicine 1):**\n- Figure 51: \"A sample correct case of Clinical Medicine (subfield: Pathology)\"\n- This is a CORRECT case \u2713\n\n**Page 66 (Clinical Medicine 2):**\n- Figure 52: \"A sample correct case of Clinical Medicine (subfield: Pathology)\"\n- This is a CORRECT case \u2713\n\n**Page 67 (Clinical Medicine 3):**\n- Figure 53: \"A sample correct case of Clinical Medicine (subfield: Pathology)\"\n- This is a CORRECT case \u2713\n\n**Page 68 (Clinical Medicine 4):**\n- Figure 54: \"A sample error case of Clinical Medicine (subfield: Pathology). Error category: Perceptual Error\"\n- This is an ERROR case \u2717\n\n**Page 69 (Clinical Medicine 5):**\n- Figure 55: \"A sample error case of Clinical Medicine (subfield: Heart Disease). Error category: Lack of Knowledge\"\n- This is an ERROR case \u2717\n\n**Page 70 (Diagnostics and Laboratory Medicine 1):**\n- Figure 56: \"A sample correct case of Diagnostics and Laboratory Medicine (subfield: Radiology)\"\n- This is a CORRECT case \u2713\n\nTotal successful (correct) cases in pages 65-70: 4\n\nAnswer: 4", "citations": [], "input_tokens": 83138, "output_tokens": 348, "cost_micros": 254634, "latency_ms": 8078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I'll read the document mentioned in the priority list and look at pages 65-70 to find information about successful cases.I need to search for content related to pages 65-70. Let me search for page markers in the document.Let me search for page numbers in a different format:Let me search for the word \"successful\" or \"cases\" to understand the document structure:Let me search for \"cases\" or look for page numbers differently. Let me read more of the document to understand its structure:This is not helpful. Let me search for the actual page numbers in the document. PDF documents often have page numbers embedded. Let me try different patterns:I can see there's a \"Case Study\" section. Let me look for the appendix section that might have pages 65-70. Let me read the entire document more systematically. First, let me check how long the document is:Great! I found a list of case studies with page numbers. I can see this is a table mapping case numbers to their descriptions and page numbers. Let me continue reading to see pages 65-70:Perfect! I found the information. Looking at pages 65-70, I can see:\n\n- Page 65: Case 51 - Clinical Medicine 1: **Correct Case**\n- Page 66: Case 52 - Clinical Medicine 2: **Correct Case**\n- Page 67: Case 53 - Clinical Medicine 3: **Correct Case**\n- Page 68: Case 54 - Clinical Medicine 4: Perceptual Error\n- Page 69: Case 55 - Clinical Medicine 5: Lack of Knowledge\n- Page 70: Case 56 - Diagnostics and Lab Medicine 1: **Correct Case**\n\nCounting the \"Correct Case\" entries in pages 65-70:\n1. Page 65: Clinical Medicine 1: Correct Case\n2. Page 66: Clinical Medicine 2: Correct Case\n3. Page 67: Clinical Medicine 3: Correct Case\n4. Page 70: Diagnostics and Lab Medicine 1: Correct Case\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54718, "error": null, "extra": {"thread_id": 2030, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_77d82c91b9624d83aeec575d8461194f", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 347, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about how many key steps the authors divided causal inference into.\n\nLooking through the document, I can see in Section 3.2 \"Generating the Explanations\" on page 6, there is a discussion of step-by-step explanations and subskills:\n\n\"Apart from the question-answer pairs, we also generate the step-by-step explanations. Our goal is to provide all intermediate reasoning steps a student of causal inference would use to answer the questions, so that each necessary subskill necessary for causal inference can be evaluated individually. We identify the following six subskills: \u2460 causal graph extraction; \u2461 correct query type interpretation; \u2462 symbolic formalization of the query; \u2463 semantic parsing to compile the available data; \u2464 estimand derivation; and \u2465 arithmetic calculation to solve the estimand, as in the colored boxes in Figure 1.\"\n\nThis clearly states that they identified **six subskills** for causal inference.\n\nAdditionally, in Figure 1 on page 2, I can see these six steps are illustrated in the example.\n\nIn Section 4 \"Our CAUSALCOT Model\" on page 7, it also mentions \"We base our prompt design on the multi-step reasoning process of causal inference as shown in Figure 4, first starting with four preparation steps: \u2460 identifying the causal graph structure; \u2461 determining the causal query type; \u2462 formulating the query symbolically precisely; and \u2463 extracting relevant data from the prompt. Then, given all the information collected in the preparation stage, we introduce the formal solution: \u2464 correctly deducing the estimand using causal inference techniques; and finally \u2465 evaluating the estimand to answer the question.\"\n\nThis confirms the six steps again.\n\nAnswer: 6", "citations": [], "input_tokens": 75261, "output_tokens": 410, "cost_micros": 231933, "latency_ms": 16531, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find where the authors describe the steps they divided causal inference into.\n\nIn Figure 1, there is a section titled \"Correct steps to lead to the ground-truth answer:\" which shows 6 numbered steps:\n\n1) Parse the causal graph\n2) Classify the query type\n3) Formulate the query to its symbolic form\n4) Collect the available data\n5) Derive the estimand using causal inference\n6) Solve for the estimand by plugging in the relevant data\n\nThis is also confirmed in Section 3.2 where they discuss generating explanations: \"We identify the following six subskills: 1 causal graph extraction; 2 correct query type interpretation; 3 symbolic formalization of the query; 4 semantic parsing to compile the available data; 5 estimand derivation; and 6 arithmetic calculation to solve the estimand\"\n\nAdditionally, in Section 4 describing their CAUSALCOT model, they state: \"We base our prompt design on the multi-step reasoning process of causal inference as shown in Figure 4, first starting with four preparation steps... Then... the formal solution: 5 correctly deducing the estimand... and finally 6 evaluating the estimand\"\n\nAnswer: 6", "citations": [], "input_tokens": 31749, "output_tokens": 291, "cost_micros": 99612, "latency_ms": 9327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find where the authors describe the steps they divided causal inference into.\n\nIn Section 4, \"Our CAUSALCOT Model,\" the authors describe their approach based on the causal inference engine. On page 7, they explicitly state:\n\n\"We base our prompt design on the multi-step reasoning process of causal inference as shown in Figure 4, first starting with four preparation steps: 1 identifying the causal graph structure; 2 determining the causal query type; 3 formulating the query symbolically precisely; and 4 extracting relevant data from the prompt. Then, given all the information collected in the preparation stage, we introduce the formal solution: 5 correctly deducing the estimand using causal inference techniques; and finally 6 evaluating the estimand to answer the question.\"\n\nThis clearly describes 6 steps:\n1. Identifying the causal graph structure\n2. Determining the causal query type\n3. Formulating the query symbolically precisely\n4. Extracting relevant data from the prompt\n5. Correctly deducing the estimand using causal inference techniques\n6. Evaluating the estimand to answer the question\n\nThis is also shown in Figure 1 and Figure 4, where 6 steps are illustrated.\n\nAnswer: 6", "citations": [], "input_tokens": 38818, "output_tokens": 294, "cost_micros": 120864, "latency_ms": 8828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find where the authors describe the key steps of causal inference.\n\nIn Section 3.2 \"Generating the Explanations,\" the document states:\n\n\"Our goal is to provide all intermediate reasoning steps a student of causal inference would use to answer the questions, so that each necessary subskill necessary for causal inference can be evaluated individually. We identify the following six subskills: \u2460 causal graph extraction; \u2461 correct query type interpretation; \u2462 symbolic formalization of the query; \u2463 semantic parsing to compile the available data; \u2464 estimand derivation; and \u2465 arithmetic calculation to solve the estimand, as in the colored boxes in Figure 1.\"\n\nThis is also reflected in Figure 4 which shows the CAUSALCOT model with 6 steps, and in Section 4 where they describe their prompting strategy that \"breaks down a causal reasoning problem into multiple symbolically-grounded, simpler steps.\"\n\nAnswer: 6", "citations": [], "input_tokens": 33630, "output_tokens": 227, "cost_micros": 104295, "latency_ms": 8516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, specifically at Figure 1 which shows the \"Correct steps to lead to the ground-truth answer,\" I can identify the key steps that the authors divided causal inference into.\n\nThe figure clearly shows 6 numbered steps:\n\n1. Parse the causal graph (Causal Relation Extraction)\n2. Classify the query type (Causal Question Classification)\n3. Formulate the query to its symbolic form (Formalization)\n4. Collect the available data (Semantic Parsing)\n5. Derive the estimand using causal inference (Formal Causal Inference)\n6. Solve for the estimand by plugging in the relevant data (Arithmetics)\n\nThis is further confirmed in Section 3.2 where the explanation generation is described, and in Section 4 where the CAUSALCOT model is introduced, which follows these same 6 steps.\n\nAnswer: 6", "citations": [], "input_tokens": 39832, "output_tokens": 211, "cost_micros": 122661, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the provided document to find information about the key steps the author divided causal inference into.Let me search for information about how the author divided the process or steps:Perfect! I found the answer. The text clearly states that the author divided causal inference into 6 steps. Let me verify this by reading a bit more context:Perfect! The document clearly shows that the author divided causal inference into **6 steps**:\n\n1. Extract the causal graph\n2. Determine the query type\n3. Formalize the query\n4. Gather all relevant data\n5. Deduce the estimand using causal inference\n6. Calculate the estimand\n\nThe text explicitly states: \"first starting with four preparation steps: 1 identifying the causal graph structure; 2 determining the causal query type; 3 formulating the query symbolically precisely; and 4 extracting relevant data from the prompt. Then... 5 correctly deducing the estimand using causal inference techniques; and finally 6 evaluating the estimand to answer the question.\"\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44937, "error": null, "extra": {"thread_id": 2031, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1b14a0eb01624709bf2cbb4b451f998a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 221, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for information about subskills related to collecting available data...\n\nIn Figure 1 on page 2, I can see the step-by-step reasoning process for answering a causal inference question. Step 4 is labeled \"Collect the available data\" and it has an associated subskill label.\n\nThe subskill associated with Step 4 \"Collect the available data\" is **Semantic Parsing**.\n\nThis is also confirmed in Section 3.2 where it mentions the six subskills:\n- \u2460 causal graph extraction\n- \u2461 correct query type interpretation\n- \u2462 symbolic formalization of the query\n- \u2463 semantic parsing to compile the available data\n- \u2464 estimand derivation\n- \u2465 arithmetic calculation\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 75267, "output_tokens": 185, "cost_micros": 228576, "latency_ms": 32921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, which shows the step-by-step process for answering a causal inference question, Step 4 is labeled as \"Collect the available data:\" and the corresponding subskill is clearly marked as \"Semantic Parsing\".\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 31755, "output_tokens": 68, "cost_micros": 96285, "latency_ms": 4983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically Figure 1 which shows the step-by-step explanation for solving a causal inference question:\n\nStep 4 is labeled \"Collect the available data\" and the subskill required for this step is identified as \"Semantic Parsing\" (shown in the colored box in the figure).\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 38824, "output_tokens": 79, "cost_micros": 117657, "latency_ms": 5264, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically Figure 1 which shows the step-by-step explanation, and Section 3.2 which discusses \"Generating the Explanations,\" I can identify the subskills mentioned.\n\nStep 4 in Figure 1 is labeled \"Collect the available data\" and the document describes this as requiring \"semantic parsing to compile the available data\" in Section 3.2.\n\nAnswer: semantic parsing", "citations": [], "input_tokens": 33636, "output_tokens": 96, "cost_micros": 102348, "latency_ms": 5125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically Figure 1 which shows the \"Correct steps to lead to the ground-truth answer,\" I can see that Step 4 is labeled \"Collect the available data\" and the subskill associated with it is \"Semantic Parsing.\"\n\nThis is also confirmed in Section A.8 (Explanation Template) where Step \u2463 states: \"Collect all the available data: The available data are: '$d$'.\"\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 39838, "output_tokens": 109, "cost_micros": 121149, "latency_ms": 5516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the priority document to find the answer to your question about what subskill is needed to collect the available data.Let me search for more specific terms related to data collection:Perfect! I found the answer. Let me read that section more carefully to get the complete context:Based on the document, I can now provide the answer. The document clearly shows that step 4 \"Collect the available data\" has the subskill listed as \"Semantic Parsing\".\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32687, "error": null, "extra": {"thread_id": 2032, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_46f68e1513d341d0b0ee21cfa0dae2f0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 137, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about the rungs of the Ladder of Causation...\n\nFrom Section 2.1 \"The Ladder of Causation\" on page 3:\n\nThe document describes three distinct rungs:\n- Rung 1 (\"seeing\"): Statistical associations\n- Rung 2 (\"doing\"): Interventions\n- Rung 3 (\"imagining\"): Counterfactuals\n\nRung 2 is specifically described as dealing with interventions and the do-operator.\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 75267, "output_tokens": 131, "cost_micros": 227766, "latency_ms": 31077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, Section 2.1 \"The Ladder of Causation\" describes the three rungs:\n\n- Rung 1 (\"seeing\") - statistical associations\n- Rung 2 (\"doing\") - interventions\n- Rung 3 (\"imagining\") - counterfactuals\n\nThe document states that Rung 2 \"enables us to formalize the concept of actively intervening in the world.\"\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 31755, "output_tokens": 111, "cost_micros": 96930, "latency_ms": 5344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about the second rung of the ladder of causation...\n\nFrom Section 2.1 \"The Ladder of Causation\":\n\n\"Rung 2 (\"doing\"). This enables us to formalize the concept of actively intervening in the world, and modifying it toward some end (\"If I take an aspirin now, will my headache subside?\"). Interventions can be formalized using the do-operator [24] and Causal Bayesian Networks [67]...\"\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 38824, "output_tokens": 126, "cost_micros": 118362, "latency_ms": 6250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically Section 2.1 which describes \"The Ladder of Causation\":\n\nThe three rungs are:\n- Rung 1 (\"seeing\") - statistical associations\n- Rung 2 (\"doing\") - interventions\n- Rung 3 (\"imagining\") - counterfactuals\n\nRung 2 is explicitly described as dealing with interventions, using the do-operator to formalize the concept of actively intervening in the world.\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 33636, "output_tokens": 122, "cost_micros": 102738, "latency_ms": 5062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically Section 2.1 \"The Ladder of Causation\":\n\n**Rung 2 (\"doing\").** This enables us to formalize the concept of actively intervening in the world, and modifying it toward some end (\"*If I take an aspirin now, will my headache subside?*\"). Interventions can be formalized using the *do-operator* [24] and *Causal Bayesian Networks* [67] to represent, for example, the distribution over $Y$ when intervening on $X$ to set its value to $x$ as $P(Y = y|do(X = x))$.\n\nAnswer: Intervention or \"doing\"", "citations": [], "input_tokens": 39838, "output_tokens": 159, "cost_micros": 121899, "latency_ms": 8094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the priority document to find the answer to your question about the second ladder of causation.Let me search for \"ladder of causation\" in the document:Let me search for \"second rung\" or information about the rungs:Good! I found a reference at line 553. Let me read that section:Perfect! I found the answer. The document clearly states that the three rungs of the Ladder of Causation are:\n- Rung 1: associational\n- Rung 2: interventional\n- Rung 3: counterfactual\n\nAnswer: interventional", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43718, "error": null, "extra": {"thread_id": 2033, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_7af68403347145c59b6ebb40237510ed", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 215, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.0, "method": "str_norm", "normalised_pred": "interventional", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the chain-type causal graph and count its directed edges.\n\nLooking through the document, I can find the causal graph structures in Figure 6 on page 19, which shows \"List of all ten causal graphs with treatment-effect pairs (CGTEs).\"\n\nIn Figure 6, I can see a \"Chain\" graph structure. The Chain graph shows:\n- X \u2192 (middle node) \u2192 Y\n\nThis represents a chain with three nodes connected by two directed edges:\n1. One edge from X to the middle node\n2. One edge from the middle node to Y\n\nTherefore, a chain-type causal graph has 2 directed edges.\n\nAnswer: 2", "citations": [], "input_tokens": 75271, "output_tokens": 156, "cost_micros": 228153, "latency_ms": 30437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the chain-type causal graph structure.\n\nIn Figure 6 on page 19, titled \"List of all ten causal graphs with treatment-effect pairs (CGTEs)\", I can see the \"Chain\" graph structure.\n\nThe Chain graph shows: X \u2192 M \u2192 Y\n\nThis is a simple chain structure where:\n- X has an arrow to M (1 edge)\n- M has an arrow to Y (1 edge)\n\nTherefore, there are 2 directed edges in a chain-type causal graph.\n\nAnswer: 2", "citations": [], "input_tokens": 31759, "output_tokens": 133, "cost_micros": 97272, "latency_ms": 6187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the chain-type causal graph structure.\n\nIn Figure 6 on page 19, there is a table titled \"List of all ten causal graphs with treatment-effect pairs (CGTEs)\" which shows:\n\nFor the \"Chain\" causal graph:\n- CGTE Alias: Chain\n- Treatment-Effect Pair: X \u2192 Y\n\nThe diagram shows a simple chain structure: X \u2192 Z \u2192 Y\n\nThis is a chain with 3 nodes (X, Z, Y) connected sequentially.\n\nCounting the directed edges:\n1. X \u2192 Z (one edge)\n2. Z \u2192 Y (one edge)\n\nTotal: 2 directed edges\n\nAnswer: 2", "citations": [], "input_tokens": 38828, "output_tokens": 165, "cost_micros": 118959, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the chain-type causal graph structure.\n\nIn Figure 6 on page 18, there is a \"List of all ten causal graphs with treatment-effect pairs (CGTEs).\" The table shows:\n\n| Causal Graph | CGTE Alias | Treatment-Effect Pair |\n| ------------ | ----------- | --------------------- |\n| Chain        | Chain       | X Y                   |\n\nThe visual representation shows: X \u2192 M \u2192 Y\n\nThis indicates a chain structure where X points to M (a mediator), and M points to Y.\n\nTherefore, there are 2 directed edges in a chain-type causal graph: one from X to M, and one from M to Y.\n\nAnswer: 2", "citations": [], "input_tokens": 33640, "output_tokens": 174, "cost_micros": 103530, "latency_ms": 6796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find the definition of a chain-type causal graph.\n\nIn Section A.3 \"Collection of Causal Graphs,\" there is a table (Figure 6) that lists all causal graph types with their structures.\n\nFor the \"Chain\" graph, the table shows:\n- Causal Graph Alias: Chain\n- CGTE Alias: Chain  \n- Treatment-Effect Pair: X \u2192 Z \u2192 Y (X and Y are treatment-effect pair)\n\nThe notation X \u2192 Z \u2192 Y indicates:\n- One edge from X to Z\n- One edge from Z to Y\n\nTherefore, there are 2 directed edges in a chain-type causal graph.\n\nAnswer: 2", "citations": [], "input_tokens": 39842, "output_tokens": 164, "cost_micros": 121986, "latency_ms": 7343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the definition of a chain-type causal graph and count the directed edges.Let me search for \"chain-type\" or \"causal graph\" in the document:Let me search for \"causal graph\" to find the definition:Let me read the section around line 2387 where the causal graph definition appears:Now let me search for \"chain\" with \"graph\" or look for specific graph types. Let me check Figure 6 which shows all the causal graphs:Good! I can see \"Chain\" is listed as one of the causal graphs at line 2101-2104. Now let me look for the actual visualization or definition of the Chain graph. Let me check if there's a figure showing these graphs:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 48766, "error": null, "extra": {"thread_id": 2034, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ca8eb7f650264585a1c5b7287fc4aa99", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 247, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2387", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which model performs best on the CLADDER dataset by looking at the performance tables in the document.\n\nLooking at Table 2 on page 8, which shows \"Performance of all models on our CLADDER dataset v1.5\":\n\nThe \"Overall Acc.\" column shows:\n- Random: 49.27\n- LLaMa: 44.03\n- Alpaca: 44.66\n- GPT-3 Non-Instr. (davinci): 49.92\n- GPT-3 Instr. (text-davinci-001): 51.40\n- GPT-3 Instr. (text-davinci-002): 53.15\n- GPT-3 Instr. (text-davinci-003): 56.26\n- GPT-3.5: 52.18\n- GPT-4: 62.03\n- + CAUSALCOT: 70.40\n\nThe highest overall accuracy is achieved by CAUSALCOT with 70.40%.\n\nLooking at the document, CAUSALCOT is described as \"GPT-4 + CAUSALCOT\" in the context, which means it's GPT-4 enhanced with their causal chain-of-thought prompting strategy.\n\nThe best performing model is CAUSALCOT (which uses GPT-4 as the base model) with an accuracy of 70.40%.\n\nAnswer: 70", "citations": [], "input_tokens": 75259, "output_tokens": 336, "cost_micros": 230817, "latency_ms": 24141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the performance results in Table 2 (on page 8), I need to find which model achieves the highest overall accuracy on the CLADDER dataset v1.5.\n\nFrom Table 2:\n- Random: 49.27%\n- LLaMa: 44.03%\n- Alpaca: 44.66%\n- GPT-3 Non-Instr. (davinci): 49.92%\n- GPT-3 Instr. (text-davinci-001): 51.40%\n- GPT-3 Instr. (text-davinci-002): 53.15%\n- GPT-3 Instr. (text-davinci-003): 56.26%\n- GPT-3.5: 52.18%\n- GPT-4: 62.03%\n- **+ CAUSALCOT: 70.40%**\n\nThe model with the highest overall accuracy is **CAUSALCOT** (which is GPT-4 with the CausalCoT prompting strategy) at 70.40%.\n\nAnswer: 4", "citations": [], "input_tokens": 31747, "output_tokens": 263, "cost_micros": 99186, "latency_ms": 6937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the performance results in the document:\n\nFrom Table 2 (CLADDER v1.5 - the main/current version):\n- GPT-4 + CAUSALCOT achieves 70.40% accuracy, which is the highest performance among all models tested\n\nFrom Table 6 (CLADDER v1.0 - earlier version):\n- GPT-4 + CAUSALCOT achieves 66.64% accuracy, which is also the highest\n\nThe document clearly states on page 3: \"our CAUSALCOT achieves an accuracy of 70.40%, which substantially improves the performance of vanilla GPT-4 by 8.37 points on CLADDER.\"\n\nAnd on page 8: \"And amongst all, our CAUSALCOT achieves the highest performance of 70.40%\"\n\nCAUSALCOT is described as a chain-of-thought prompting strategy used with GPT-4, so the best performing model is GPT-4 with CAUSALCOT.\n\nSince GPT-4 is mentioned to have approximately 1T (1 trillion) parameters, and CAUSALCOT is a prompting strategy applied to GPT-4:\n\nAnswer: 1000000000000", "citations": [], "input_tokens": 38816, "output_tokens": 288, "cost_micros": 120768, "latency_ms": 8750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1000000000000", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which model performs best on the CLADDER dataset according to the document.\n\nLooking at Table 2 (page 8), which shows \"Performance of all models on our CLADDER dataset v1.5\", I can see the overall accuracy for each model:\n\n- Random: 49.27%\n- LLaMa: 44.03%\n- Alpaca: 44.66%\n- GPT-3 Non-Instr. (davinci): 49.92%\n- GPT-3 Instr. (text-davinci-001): 51.40%\n- GPT-3 Instr. (text-davinci-002): 53.15%\n- GPT-3 Instr. (text-davinci-003): 56.26%\n- GPT-3.5: 52.18%\n- GPT-4: 62.03%\n- + CAUSALCOT: 70.40%\n\nThe model with the highest overall accuracy is \"GPT-4 + CAUSALCOT\" with 70.40%.\n\nThe text also states: \"our CAUSALCOT achieves the highest performance of 70.40%, which is substantially better than the vanilla GPT-4 by 8.37 points on CLADDER.\"\n\nSince CAUSALCOT is based on GPT-4 (as stated in the document: \"We build our CAUSALCOT prompting strategy using GPT-4\"), the best performing model is GPT-4 when using the CAUSALCOT prompting strategy.\n\nThe overall accuracy is 70.40%, which rounds to 70%.\n\nAnswer: 70", "citations": [], "input_tokens": 33628, "output_tokens": 390, "cost_micros": 106734, "latency_ms": 9766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which model performs best on the CLADDER dataset by looking at the performance tables in the document.\n\nLooking at Table 2 (Performance of all models on CLADDER dataset v1.5), which shows the main results:\n\nThe models and their Overall Accuracy are:\n- Random: 49.27\n- LLaMa: 44.03\n- Alpaca: 44.66\n- GPT-3 Non-Instr. (davinci): 49.92\n- GPT-3 Instr. (text-davinci-001): 51.40\n- GPT-3 Instr. (text-davinci-002): 53.15\n- GPT-3 Instr. (text-davinci-003): 56.26\n- GPT-3.5: 52.18\n- GPT-4: 62.03\n- + CAUSALCOT: 70.40\n\nThe model with the highest overall accuracy is \"+ CAUSALCOT\" at 70.40%.\n\nCAUSALCOT is described in the paper as a chain-of-thought prompting strategy built on top of GPT-4, so it's GPT-4 enhanced with their CAUSALCOT method.\n\nThe accuracy is 70.40%, which as a percentage would be approximately 70 (rounding to the nearest integer).\n\nAnswer: 70", "citations": [], "input_tokens": 39830, "output_tokens": 325, "cost_micros": 124365, "latency_ms": 8687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about model performance on Cladder.Let me search for \"Cladder\" in the document to find the relevant information:Now let me search for tables or results sections that might contain performance metrics:Perfect! I found Table 2 which shows the performance of all models on the CLADDER dataset. Let me read that section:Perfect! I can see Table 2 which shows the performance of all models on the CLADDER dataset. Looking at the \"Overall Acc.\" column:\n\n- Random: 49.27\n- LLaMa: 44.03\n- Alpaca: 44.66\n- GPT-3 Non-Instr. (davinci): 49.92\n- GPT-3 Instr. (text-davinci-001): 51.40\n- GPT-3 Instr. (text-davinci-002): 53.15\n- GPT-3 Instr. (text-davinci-003): 56.26\n- GPT-3.5: 52.18\n- GPT-4: 62.03\n- +CAUSALCOT: 70.40\n\nThe model that performs the best on Cladder is \"+CAUSALCOT\" with an overall accuracy of 70.40.\n\nAnswer: 70", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36610, "error": null, "extra": {"thread_id": 2035, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e78cd64845f341ad917f431928ce6d64", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 165, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 on page 2:\n\nThe first subfigure shows \"Traditional ML\" where the supervisor (human) is above/smarter than the student (model).\n\nThe second subfigure shows \"Superalignment\" where the supervisor (human) is below/weaker than the student (superhuman model).\n\nThe difference is the relative capability levels are reversed - in Traditional ML the supervisor is stronger, while in Superalignment the student is stronger.\n\nAnswer: Supervisor is stronger than student (vs. student stronger than supervisor)", "citations": [], "input_tokens": 121916, "output_tokens": 121, "cost_micros": 367563, "latency_ms": 32577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.11764705882352941, "method": "str_norm", "normalised_pred": "supervisor is stronger than student vs student stronger than supervisor", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document:\n\nThe figure shows three scenarios comparing Traditional ML, Superalignment, and \"Our Analogy.\"\n\nIn the first subfigure (Traditional ML), the supervisor (human) is above the human level line, while the student (model) is below the human level line.\n\nIn the second subfigure (Superalignment), the supervisor (human) is at the human level line, while the student (model) is far above the human level line.\n\nThe key difference is that in Traditional ML, the supervisor is stronger than/above the student, whereas in Superalignment, the student is much stronger than the supervisor.\n\nAnswer: Supervisor is stronger than student (versus student stronger than supervisor)", "citations": [], "input_tokens": 45459, "output_tokens": 162, "cost_micros": 138807, "latency_ms": 5828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.11764705882352941, "method": "str_norm", "normalised_pred": "supervisor is stronger than student versus student stronger than supervisor", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document:\n\nThe first subfigure shows \"Traditional ML\" where the **supervisor is stronger than the student** (supervisor is above the human level line, student is below it).\n\nThe second subfigure shows \"Superalignment\" where the **supervisor is weaker than the student** (supervisor is at human level, student is above it - superhuman).\n\nAnswer: Supervisor is stronger than student (vs weaker than student)", "citations": [], "input_tokens": 49232, "output_tokens": 103, "cost_micros": 149241, "latency_ms": 5391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.25, "method": "str_norm", "normalised_pred": "supervisor is stronger than student vs weaker than student", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document:\n\nThe first subfigure shows \"Traditional ML\" where the supervisor is at human level and the student is below human level.\n\nThe second subfigure shows \"Superalignment\" where the supervisor is at human level and the student is above human level (superhuman).\n\nThe key difference is the student's capability relative to human level.\n\nAnswer: Student is weaker than human vs student is stronger than human", "citations": [], "input_tokens": 47601, "output_tokens": 99, "cost_micros": 144288, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.2222222222222222, "method": "str_norm", "normalised_pred": "student is weaker than human vs student is stronger than human", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to compare the first subfigure (Traditional ML) with the second subfigure (Superalignment).\n\nIn the first subfigure (Traditional ML):\n- Supervisor: Human\n- Student: Robot\n- The human is stronger/smarter than the robot\n\nIn the second subfigure (Superalignment):\n- Supervisor: Human  \n- Student: Superhuman Robot\n- The human is weaker/less capable than the superhuman robot\n\nThe key difference is the relative capability between supervisor and student.\n\nAnswer: Supervisor is stronger than student", "citations": [], "input_tokens": 89555, "output_tokens": 133, "cost_micros": 270660, "latency_ms": 6703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.16666666666666666, "method": "str_norm", "normalised_pred": "supervisor is stronger than student", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and analyze the difference between the first and second subfigures regarding supervisor-student relationships.Let me search for Figure 1 in the document:Now let me read the actual Figure 1 caption and description more carefully:Based on the figure caption and structure, I can see that Figure 1 has three subfigures:\n1. Traditional ML\n2. Superalignment  \n3. Our Analogy\n\nThe question asks about the difference between the first subfigure (Traditional ML) and the second subfigure (Superalignment) regarding the supervisor-student relationship.\n\nFrom the caption: \"Traditional ML focuses on the setting where humans supervise models that are weaker than humans. For the ultimate superalignment problem, humans will have to supervise models much smarter than them.\"\n\nThe key difference is about the relative capability/intelligence level between supervisor and student:\n- In Traditional ML (first subfigure): Supervisor is stronger/smarter than student\n- In Superalignment (second subfigure): Supervisor is weaker than student (students are \"much smarter\" than supervisors)\n\nAnswer: Supervisor is stronger than student", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43967, "error": null, "extra": {"thread_id": 2036, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e51022c429054652ae2e24b4002aba31", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 203, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.16666666666666666, "method": "str_norm", "normalised_pred": "supervisor is stronger than student", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the figures from Page 3 to Page 10 in this document.\n\nLet me go through each page:\n\n**Page 3:**\n- Figure 2: Shows weak-to-strong generalization results with three panels (NLP Tasks, Chess Puzzles, ChatGPT Reward Modeling)\n\n**Page 4:**\n- No figures on this page (Related Work section)\n\n**Page 5:**\n- No figures on this page (Related Work continued)\n\n**Page 6:**\n- No figures on this page (Methodology section)\n\n**Page 7:**\n- No figures on this page (Main Results section continues)\n\n**Page 8:**\n- Figure 3: Shows weak-to-strong generalization with naive finetuning (6 panels labeled a-f)\n\n**Page 9:**\n- Figure 4: Shows bootstrapping results on chess puzzles (2 panels labeled a-b)\n\n**Page 10:**\n- Figure 5: Shows NLP results with auxiliary loss (2 panels labeled a-b)\n\nTotal figures from Page 3 to Page 10:\n- Figure 2 (Page 3)\n- Figure 3 (Page 8)\n- Figure 4 (Page 9)\n- Figure 5 (Page 10)\n\nAnswer: 4", "citations": [], "input_tokens": 121899, "output_tokens": 301, "cost_micros": 370212, "latency_ms": 35593, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to count all the figures that appear on pages 3 through 10 of this document.\n\nLet me go through each page:\n\n**Page 3:**\n- Figure 1: An illustration of methodology (Traditional ML, Superalignment, Our Analogy)\n- Figure 2: Shows test accuracy on representative NLP task, Chess Puzzles, and ChatGPT Reward Modeling\n\n**Page 4:**\n- No figures visible on this page (contains related work section)\n\n**Page 5:**\n- No figures visible on this page (continues related work section)\n\n**Page 6:**\n- No figures visible on this page (continues related work section)\n\n**Page 7:**\n- No figures visible on this page (contains methodology section)\n\n**Page 8:**\n- Figure 3: Shows weak-to-strong generalization results with multiple panels (a,b,c,d,e,f)\n- Figure 4: Shows bootstrapping results on chess puzzles\n\n**Page 9:**\n- No figures visible on this page (continues main results section)\n\n**Page 10:**\n- Figure 5: Shows NLP results with auxiliary loss\n- Figure 6: Shows auxiliary loss improving generalization across datasets\n\nCounting the figures from pages 3-10:\n- Page 3: 2 figures (Figure 1, Figure 2)\n- Page 8: 2 figures (Figure 3, Figure 4)\n- Page 10: 2 figures (Figure 5, Figure 6)\n\nTotal: 6 figures\n\nAnswer: 6", "citations": [], "input_tokens": 45442, "output_tokens": 356, "cost_micros": 141666, "latency_ms": 8421, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll count the figures from Page 3 to Page 10 in this document.\n\n**Page 3:**\n- Figure 2: \"An illustration of our methodology...\" showing weak-to-strong generalization results\n\n**Page 4:**\n- None\n\n**Page 5:**\n- None\n\n**Page 6:**\n- None\n\n**Page 7:**\n- Figure 3: \"Promising weak-to-strong generalization...\" showing test accuracy panels (a,b,c,d,e,f)\n\n**Page 8:**\n- None\n\n**Page 9:**\n- Figure 4: \"Bootstrapping improves weak-to-strong generalization...\" showing panels (a,b)\n\n**Page 10:**\n- Figure 5: \"Substantially improved generalization...\" showing panels (a,b)\n- Figure 6: \"Simple auxiliary loss improves generalization...\" showing multiple test accuracy plots\n\nLet me count:\n- Page 3: 1 figure (Figure 2)\n- Page 7: 1 figure (Figure 3)\n- Page 9: 1 figure (Figure 4)\n- Page 10: 2 figures (Figure 5 and Figure 6)\n\nTotal: 1 + 1 + 1 + 2 = 5 figures\n\nAnswer: 5", "citations": [], "input_tokens": 49215, "output_tokens": 303, "cost_micros": 152190, "latency_ms": 7608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll count the figures from page 3 to page 10 of this paper.\n\nLet me go through the document systematically:\n\n**Page 3:** Figure 1 (An illustration of methodology showing Traditional ML, Superalignment, and Our Analogy)\n\n**Page 4:** Figure 2 (Strong models trained with weak supervision generalize beyond their supervisor - showing 3 subplots for NLP Task, Chess, and ChatGPT Reward Modeling)\n\n**Page 5-6:** No figures visible in the main text sections shown\n\n**Page 7:** Figure 3 (mentioned in the text - showing test accuracy and performance gap recovered for NLP tasks, chess puzzles, and ChatGPT reward modeling - this appears to be on page 8 based on the layout)\n\n**Page 8:** Figure 3 appears here (6 subplots - a,b,c showing test accuracy and d,e,f showing performance gap recovered)\n\n**Page 9:** Figure 4 (Bootstrapping improves weak-to-strong generalization on chess puzzles - 2 subplots)\n\n**Page 10:** Figure 5 (Substantially improved generalization on NLP datasets with auxiliary loss - 2 subplots)\n\nCounting the figures:\n- Page 3: Figure 1\n- Page 4: Figure 2  \n- Page 8: Figure 3\n- Page 9: Figure 4\n- Page 10: Figure 5\n\nTotal figures from page 3 to page 10: 5 figures\n\nAnswer: 5", "citations": [], "input_tokens": 47584, "output_tokens": 358, "cost_micros": 148122, "latency_ms": 9766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll count all figures from Page 3 to Page 10.\n\n**Page 3:**\n- Figure 2: Strong models trained with weak supervision generalize beyond their supervisor (bar chart)\n\n**Page 4:**\n- None\n\n**Page 5:**\n- None\n\n**Page 6:**\n- None\n\n**Page 7:**\n- None\n\n**Page 8:**\n- Figure 3: Promising weak-to-strong generalization with naive finetuning (grid of heatmaps/tables - appears as panels a, b, c, d, e, f)\n\n**Page 9:**\n- Figure 4: Bootstrapping improves weak-to-strong generalization on chess puzzles (2 panels: a and b)\n\n**Page 10:**\n- Figure 5: Substantially improved generalization on NLP datasets with a simple auxiliary loss (2 panels: a and b)\n\nTotal count:\n- Figure 2: 1\n- Figure 3: 1 (though it has 6 panels, it's one figure)\n- Figure 4: 1\n- Figure 5: 1\n\nAnswer: 4", "citations": [], "input_tokens": 89538, "output_tokens": 261, "cost_micros": 272529, "latency_ms": 9000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to count the figures from Page 3 to Page 10.Now I need to identify which figures appear between Page 3 and Page 10. Let me read the relevant sections more carefully:Now let me check what's on page 11 to confirm the boundary:Based on my analysis, let me count the figures that appear from Page 3 to Page 10:\n\n- **Page 3**: No figures (only text - Related Work section starts)\n- **Page 4**: No figures (Related Work section continues)  \n- **Page 5**: 1 figure (the PGR illustration figure showing weak performance, weak-to-strong performance, strong ceiling performance)\n- **Page 6**: No figures (text only)\n- **Page 7**: 1 figure (Figure 3 - showing weak-to-strong generalization results across NLP, Chess, and ChatGPT RM)\n- **Page 8**: 1 figure (Figure 4 - Bootstrapping improves weak-to-strong generalization on chess puzzles)\n- **Page 9**: 1 figure (Figure 5 - NLP with auxiliary loss)\n- **Page 10**: 1 figure (Figure 6 - Simple auxiliary loss improves generalization across most datasets)\n\nTotal figures from Page 3 to Page 10: 5 figures\n\nAnswer: 5", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55516, "error": null, "extra": {"thread_id": 2037, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ad05d8ff7ccf4712bbe93b3462e811bc", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 271, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to locate Figure 3 in the document and count the number of subfigures it contains.\n\nLooking at the document, Figure 3 appears on page 8. The figure is titled \"Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task.\"\n\nThe figure contains:\n- Panel (a): NLP Tasks - test accuracy plot\n- Panel (b): Chess Puzzles - test accuracy plot  \n- Panel (c): ChatGPT Reward Modeling - test accuracy plot\n- Panel (d): NLP Tasks - performance gap recovered plot\n- Panel (e): Chess Puzzles - performance gap recovered plot\n- Panel (f): ChatGPT Reward Modeling - performance gap recovered plot\n\nThe figure has 6 labeled panels/subfigures (a, b, c, d, e, f).\n\nAnswer: 6", "citations": [], "input_tokens": 121893, "output_tokens": 212, "cost_micros": 368859, "latency_ms": 54641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking at the document, I can find Figure 3 on page 8. The caption reads:\n\n\"Figure 3: Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task. (a,b,c) Test accuracy as a function of strong student size on (a) NLP tasks, (b) chess puzzles, and (c) the ChatGPT reward modeling task. Accuracy of strong students trained with ground truth in black, accuracy of strong students trained with weak supervision shown with colored lines (hue indicates size of weak supervisor). (d,e,f) Same as panels a,b,c but for performance gap recovered (see Section 3 for details).\"\n\nThe caption explicitly mentions panels (a,b,c) and (d,e,f), which indicates there are 6 subfigures total in Figure 3.\n\nLooking at the actual figure layout, I can see:\n- (a) NLP Tasks\n- (b) Chess Puzzles  \n- (c) ChatGPT Reward Modeling\n- (d) NLP Tasks (PGR)\n- (e) Chess Puzzles (PGR)\n- (f) ChatGPT Reward Modeling (PGR)\n\nAnswer: 6", "citations": [], "input_tokens": 45436, "output_tokens": 322, "cost_micros": 141138, "latency_ms": 8859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking at the document, I can find Figure 3 which is titled \"Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task.\"\n\nThe figure caption describes panels (a,b,c) and (d,e,f):\n- (a,b,c) Test accuracy as a function of strong student size on (a) NLP tasks, (b) chess puzzles, and (c) the ChatGPT reward modeling task\n- (d,e,f) Same as panels a,b,c but for performance gap recovered\n\nSo Figure 3 has 6 subfigures total: (a), (b), (c), (d), (e), and (f).\n\nAnswer: 6", "citations": [], "input_tokens": 49209, "output_tokens": 201, "cost_micros": 150642, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking at the document, I can see Figure 3 is described as:\n\n\"Figure 3: Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task. (a,b,c) Test accuracy as a function of strong student size on (a) NLP tasks, (b) chess puzzles, and (c) the ChatGPT reward modeling task. Accuracy of strong students trained with ground truth in black, accuracy of strong students trained with weak supervision shown with colored lines (hue indicates size of weak supervisor). (d,e,f) Same as panels a,b,c but for performance gap recovered (see Section 3 for details).\"\n\nThe description explicitly mentions:\n- (a), (b), (c) - three subfigures showing test accuracy\n- (d), (e), (f) - three subfigures showing performance gap recovered\n\nThis gives us a total of 6 subfigures in Figure 3.\n\nAnswer: 6", "citations": [], "input_tokens": 47578, "output_tokens": 255, "cost_micros": 146559, "latency_ms": 8389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking through the document for Figure 3...\n\nFound it! Figure 3 is titled \"Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task.\"\n\nThe figure consists of:\n- (a) NLP Tasks - showing test accuracy\n- (b) Chess Puzzles - showing test accuracy  \n- (c) ChatGPT Reward Modeling - showing test accuracy\n- (d) Performance Gap Recovered (%) - NLP Tasks\n- (e) Performance Gap Recovered (%) - Chess Puzzles\n- (f) Performance Gap Recovered (%) - ChatGPT Reward Modeling\n\nSo Figure 3 has 6 subfigures labeled (a) through (f).\n\nAnswer: 6", "citations": [], "input_tokens": 89532, "output_tokens": 204, "cost_micros": 271656, "latency_ms": 8046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and count its subfigures.Let me search for Figure 3 in the document:Now let me search for where Figure 3 appears in the document:Good! I found Figure 3. Let me read the complete caption around line 975:Perfect! I can see from the caption that Figure 3 has subfigures labeled (a), (b), (c), (d), (e), and (f). Let me continue reading to confirm all subfigures:Perfect! I can now clearly see all the subfigures in Figure 3. Based on the caption and the content, Figure 3 contains:\n\n- (a) NLP Tasks - test accuracy\n- (b) Chess Puzzles - test accuracy  \n- (c) ChatGPT Reward Modeling - test accuracy\n- (d) NLP Tasks - performance gap recovered\n- (e) Chess Puzzles - performance gap recovered\n- (f) ChatGPT Reward Modeling - performance gap recovered\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40671, "error": null, "extra": {"thread_id": 2038, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_131c7eb21d53472097ae34a0c3413951", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 191, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all sections that discuss the experiment setup in this paper about weak-to-strong generalization.\n\nLet me go through the document systematically:\n\n1. **Section 3: METHODOLOGY** - This section describes the general methodology and setup for weak-to-strong learning experiments.\n\n2. **Section 4.1: TASKS** - This describes the three main task settings: NLP benchmarks, chess puzzles, and ChatGPT reward modeling.\n\n3. **Appendix A: FURTHER EXPERIMENTAL DETAILS** - This entire appendix provides detailed experimental setup information:\n   - A.1: NLP TASKS - Details about data preprocessing, models, training hyperparameters, weak labels, and evaluation\n   - A.2: CHESS PUZZLES - Details about data preprocessing, training hyperparameters, weak labels, and evaluation\n   - A.3: CHATGPT REWARD MODELING - Details about data preprocessing, models, training hyperparameters, and weak labels\n   - A.4: AUXILIARY CONFIDENCE LOSS - Details about the confidence loss method\n\n4. **Appendix C: EASY-TO-HARD GENERALIZATION** - Describes additional experimental setups:\n   - C.1: CHESS PUZZLES\n   - C.2: NLP TASKS: DIFFICULTY THRESHOLDING\n   - C.3: GPT-4 PREDICTED DIFFICULTY\n\n5. **Appendix D: OTHER WEAK-TO-STRONG SETTINGS** - Describes additional experimental settings:\n   - D.1: SELF-SUPERVISED VISION MODELS\n   - D.2: LINEAR PROBING\n\n6. **Appendix E: THE EFFECTS OF WEAK LABEL STRUCTURE** - Describes synthetic experiments:\n   - E.1: SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY\n   - E.2: DIFFERENT WEAK ERROR STRUCTURE MEANS DIFFERENT GENERALIZATION\n   - E.3: MAKING IMITATION TRIVIAL\n\nAnswer: Section 3 (Methodology), Section 4.1 (Tasks), Appendix A (Further Experimental Details), Appendix A.1 (NLP Tasks), Appendix A.2 (Chess Puzzles), Appendix A.3 (ChatGP", "citations": [], "input_tokens": 121905, "output_tokens": 512, "cost_micros": 373395, "latency_ms": 49562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "appendix 1 nlp tasks, appendix 2 chess puzzles, appendix 3 chatgp, appendix further experimental details, section 3 methodology, section 4 1 tasks", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify all sections that discuss the experiment setup in this document.\n\nLet me go through the document systematically:\n\n1. **Section 3 METHODOLOGY** - This section describes the overall methodology and experimental setup for weak-to-strong learning.\n\n2. **Section 4.1 TASKS** - This describes the tasks used in experiments (NLP benchmarks, Chess puzzles, ChatGPT reward modeling).\n\n3. **Appendix A FURTHER EXPERIMENTAL DETAILS** - This entire appendix is dedicated to experimental details.\n\n4. **Appendix A.1 NLP TASKS** - Details about NLP task setup including data preprocessing, models, training hyperparameters, weak labels, and evaluation.\n\n5. **Appendix A.2 CHESS PUZZLES** - Details about chess puzzle setup including data preprocessing, training hyperparameters, weak labels, and evaluation.\n\n6. **Appendix A.3 CHATGPT REWARD MODELING** - Details about reward modeling setup including data preprocessing, models, training hyperparameters, weak labels.\n\n7. **Appendix A.4 AUXILIARY CONFIDENCE LOSS** - Detailed description of the auxiliary confidence loss method.\n\n8. **Appendix C EASY-TO-HARD GENERALIZATION** - Discusses experimental setup for easy-to-hard generalization experiments.\n\n9. **Appendix C.1 CHESS PUZZLES** - Setup for easy-to-hard generalization on chess.\n\n10. **Appendix C.2 NLP TASKS: DIFFICULTY THRESHOLDING** - Setup for difficulty-based experiments on NLP.\n\n11. **Appendix C.3 GPT-4 PREDICTED DIFFICULTY** - Setup using GPT-4 predicted difficulty.\n\n12. **Appendix D OTHER WEAK-TO-STRONG SETTINGS** - Discusses alternative experimental settings.\n\n13. **Appendix D.1 SELF-SUPERVISED VISION MODELS** - Setup for vision model experiments.\n\n14. **Appendix D.2 LINEAR PROBING** - Setup for linear probing experiments.\n\n15. **Appendix E THE EFFECTS OF WEAK LABEL STRUCTURE** - Discusses experimental setup for studying weak label structure.\n\n16. **", "citations": [], "input_tokens": 45448, "output_tokens": 512, "cost_micros": 144024, "latency_ms": 10405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "16", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify all sections that discuss the experiment setup from the document.\n\nLet me go through the document systematically:\n\n1. **Section 3: METHODOLOGY** - This section describes the overall methodology and setup for the weak-to-strong learning experiments.\n\n2. **Section 4.1: TASKS** - This describes the specific tasks used in experiments (NLP benchmarks, chess puzzles, ChatGPT reward modeling).\n\n3. **Appendix A: FURTHER EXPERIMENTAL DETAILS** - This entire appendix section provides detailed experimental setup information, including:\n   - A.1 NLP TASKS\n   - A.2 CHESS PUZZLES\n   - A.3 CHATGPT REWARD MODELING\n   - A.4 AUXILIARY CONFIDENCE LOSS\n\n4. **Appendix C: EASY-TO-HARD GENERALIZATION** - Contains experimental setup for easy-to-hard generalization experiments, including:\n   - C.1 CHESS PUZZLES\n   - C.2 NLP TASKS: DIFFICULTY THRESHOLDING\n   - C.3 GPT-4 PREDICTED DIFFICULTY\n\n5. **Appendix D: OTHER WEAK-TO-STRONG SETTINGS** - Describes setup for additional experiments:\n   - D.1 SELF-SUPERVISED VISION MODELS\n   - D.2 LINEAR PROBING\n\n6. **Appendix E: THE EFFECTS OF WEAK LABEL STRUCTURE** - Contains experimental setup for studying weak label structures:\n   - E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY\n   - E.2 DIFFERENT WEAK ERROR STRUCTURE MEANS DIFFERENT GENERALIZATION\n   - E.3 MAKING IMITATION TRIVIAL\n\nAnswer: Section 3, Section 4.1, Appendix A, Appendix A.1, Appendix A.2, Appendix A.3, Appendix A.4, Appendix C.1, Appendix C.2, Appendix C.3, Appendix D.1, Appendix D.2, Appendix E.1, Appendix E.2, Appendix E.3", "citations": [], "input_tokens": 49221, "output_tokens": 499, "cost_micros": 155148, "latency_ms": 10391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.2105263157894737, "method": "list_set", "normalised_pred": "appendix, appendix 1, appendix 2, appendix 3, appendix 4, appendix c 1, appendix c 2, appendix c 3, appendix d 1, appendix d 2, appendix e 1, appendix e 2, appendix e 3, section 3, section 4 1", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify all sections that discuss the experimental setup from the document.\n\nLet me scan through the document systematically:\n\n1. **Section 3 METHODOLOGY** - This section describes the core experimental methodology for weak-to-strong learning, including creating weak supervisors, training strong students, and training with ground truth labels.\n\n2. **Section 4.1 TASKS** - This describes the specific tasks used in experiments: NLP benchmarks, chess puzzles, and ChatGPT reward modeling.\n\n3. **Appendix A FURTHER EXPERIMENTAL DETAILS** - This entire appendix provides detailed experimental setup information, including:\n   - A.1 NLP TASKS (data preprocessing, models, training hyperparameters, weak labels, evaluation)\n   - A.2 CHESS PUZZLES (data preprocessing, training hyperparameters, weak labels, evaluation)\n   - A.3 CHATGPT REWARD MODELING (data preprocessing, models, training hyperparameters, weak labels, evaluation)\n   - A.4 AUXILIARY CONFIDENCE LOSS (detailed description of the method)\n\n4. **Appendix D OTHER WEAK-TO-STRONG SETTINGS** - This discusses additional experimental settings:\n   - D.1 SELF-SUPERVISED VISION MODELS\n   - D.2 LINEAR PROBING\n\n5. **Appendix C** mentions experimental setup for easy-to-hard generalization experiments.\n\n6. **Appendix E** discusses experimental setup for synthetic experiments on weak label structure.\n\nAnswer: Section 3 METHODOLOGY, Section 4.1 TASKS, Appendix A FURTHER EXPERIMENTAL DETAILS, Appendix A.1 NLP TASKS, Appendix A.2 CHESS PUZZLES, Appendix A.3 CHATGPT REWARD MODELING, Appendix A.4 AUXILIARY CONFIDENCE LOSS, Appendix D OTHER WEAK-TO-STRONG SETTINGS, Appendix D.1 SELF-SUPERVISED VISION MODELS, Appendix D.2 LINEAR PROBING", "citations": [], "input_tokens": 47590, "output_tokens": 470, "cost_micros": 149820, "latency_ms": 10702, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "appendix 1 nlp tasks, appendix 2 chess puzzles, appendix 3 chatgpt reward modeling, appendix 4 auxiliary confidence loss, appendix d 1 self supervised vision models, appendix d 2 linear probing, appendix d other weak to strong settings, appendix further experimental details, section 3 methodology, section 4 1 tasks", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for sections that discuss experiment setup:\n\n1. **Section 3 METHODOLOGY** - Describes the general weak-to-strong learning setup\n2. **Section 4.1 TASKS** - Describes the three task types (NLP, Chess, Reward Modeling)\n3. **Appendix A FURTHER EXPERIMENTAL DETAILS** - Contains detailed experimental setup information with subsections:\n   - A.1 NLP TASKS\n   - A.2 CHESS PUZZLES\n   - A.3 CHATGPT REWARD MODELING\n   - A.4 AUXILIARY CONFIDENCE LOSS\n4. **Appendix D.1 SELF-SUPERVISED VISION MODELS** - Describes ImageNet experiment setup\n5. **Appendix D.2 LINEAR PROBING** - Describes linear probing experiment setup\n6. **Appendix C EASY-TO-HARD GENERALIZATION** - Describes setup for easy-to-hard experiments with subsections:\n   - C.1 CHESS PUZZLES\n   - C.2 NLP TASKS: DIFFICULTY THRESHOLDING\n   - C.3 GPT-4 PREDICTED DIFFICULTY\n7. **Appendix E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY** - Describes synthetic experiment setup\n\nAnswer: Section 3 METHODOLOGY, Section 4.1 TASKS, Appendix A FURTHER EXPERIMENTAL DETAILS, Appendix A.1 NLP TASKS, Appendix A.2 CHESS PUZZLES, Appendix A.3 CHATGPT REWARD MODELING, Appendix A.4 AUXILIARY CONFIDENCE LOSS, Appendix C EASY-TO-HARD GENERALIZATION, Appendix C.1 CHESS PUZZLES, Appendix C.2 NLP TASKS: DIFFICULTY THRESHOLDING, Appendix C.3 GPT-4 PREDICTED DIFFICULTY, Appendix D.1 SELF-SUPERVISED VISION MODELS, Appendix D.2 LINEAR PROBING, Appendix E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY", "citations": [], "input_tokens": 89544, "output_tokens": 499, "cost_micros": 276117, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "appendix 1 nlp tasks, appendix 2 chess puzzles, appendix 3 chatgpt reward modeling, appendix 4 auxiliary confidence loss, appendix c 1 chess puzzles, appendix c 2 nlp tasks difficulty thresholding, appendix c 3 gpt 4 predicted difficulty, appendix c easy to hard generalization, appendix d 1 self supervised vision models, appendix d 2 linear probing, appendix e 1 synthetic experiments on simulation difficulty, appendix further experimental details, section 3 methodology, section 4 1 tasks", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to identify all sections that discuss the experiment setup.Let me continue reading to see the complete document structure and identify all sections related to experiment setup.Now let me search specifically for \"experiment\" or \"setup\" to find sections discussing experimental setup:Now let me search more systematically for sections that discuss experimental setup:Based on my reading of the document, I can now identify all the sections that discuss experiment setup. Let me compile the complete list:Now I have a comprehensive view of the document. Let me identify all sections that discuss experiment setup:\n\nAnswer: 3 METHODOLOGY, 4.1 TASKS, A FURTHER EXPERIMENTAL DETAILS, A.1 NLP TASKS, A.2 CHESS PUZZLES, A.3 CHATGPT REWARD MODELING, A.4 AUXILIARY CONFIDENCE LOSS, E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 69733, "error": null, "extra": {"thread_id": 2039, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_bfc57bfcbbd44c6786a0cd5058f61c15", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 309, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "1 nlp tasks, 2 chess puzzles, 3 chatgpt reward modeling, 3 methodology, 4 1 tasks, 4 auxiliary confidence loss, e 1 synthetic experiments on simulation difficulty, further experimental details", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the citation information for the paper by Long Ouyang published in 2022.\n\nLooking through the references section, I can find:\n\nLong Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, 35: 27730\u201327744, 2022.\n\nHowever, the document does not contain information about how many times this paper has been cited. The document is the paper itself (\"Weak-to-Strong Generalization\") which cites the Ouyang et al. 2022 paper, but it does not provide citation counts for the references it lists.\n\nI cannot determine the number of citations for the Ouyang et al. 2022 paper from this document alone.\n\nAnswer: Cannot be determined from the provided document", "citations": [], "input_tokens": 121904, "output_tokens": 231, "cost_micros": 369177, "latency_ms": 39750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find citations of the paper by Long Ouyang published in 2022.\n\nLooking through the document, I can find references to \"Ouyang et al., 2022\" in the references section:\n\n\"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, 35: 27730-27744, 2022.\"\n\nNow I need to count how many times this paper is cited throughout the document. Let me search for \"Ouyang et al., 2022\":\n\n1. Page 1: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n2. Page 2: \"Christiano et al., 2017; Wu et al., 2021; Ouyang et al., 2022; Bowman et al., 2022; Saunders et al., 2022\"\n3. Page 5: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n4. Page 7: \"(Ouyang et al., 2022)\"\n5. Page 32: \"Ouyang et al., 2022\"\n6. Page 47: \"Ouyang et al., 2022\"\n\nTotal count: 6 citations\n\nAnswer: 6", "citations": [], "input_tokens": 45447, "output_tokens": 423, "cost_micros": 142686, "latency_ms": 10109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find how many times the paper by Long Ouyang published in 2022 has been cited in this document.\n\nLet me search for citations to Long Ouyang 2022 in the references and throughout the document.\n\nLooking at the references section, I can find:\n\n\"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, 35: 27730-27744, 2022.\"\n\nNow I need to count how many times \"Ouyang et al., 2022\" or similar citations appear in the document.\n\nSearching through the document:\n\n1. Page 1: \"Ouyang et al., 2022\"\n2. Page 2: \"Ouyang et al., 2022\"\n3. Page 5: \"Ouyang et al., 2022\"\n4. Page 7: \"Ouyang et al., 2022\"\n5. Page 32: \"Ouyang et al., 2022\"\n6. Page 47: \"Ouyang et al., 2022\"\n\nLet me verify by searching more carefully through the text for all instances of \"Ouyang\":\n\n- Introduction section: 1 citation\n- Related Work section: 1 citation  \n- Tasks section: 1 citation\n- ChatGPT reward modeling subsection: 1 citation\n- Models subsection (A.3): 1 citation\n- Alignment plan section: 1 citation\n\nTotal count: 6 citations\n\nAnswer: 6", "citations": [], "input_tokens": 49220, "output_tokens": 405, "cost_micros": 153735, "latency_ms": 8905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find how many times the paper by Long Ouyang published in 2022 has been cited in this document.\n\nLet me search for citations of Ouyang et al., 2022 or Long Ouyang, 2022.\n\nLooking through the document, I can find the following citations to \"Ouyang et al., 2022\":\n\n1. Page 1: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n2. Page 2: \"Wu et al., 2021; Ouyang et al., 2022; Bowman et al., 2022; Saunders et al., 2022\"\n3. Page 5: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n4. Page 7: \"(Ouyang et al., 2022)\"\n5. Page 32: \"(Stiennon et al., 2020; Ouyang et al., 2022)\"\n6. Page 47: \"(h - tiano et al., 2017; Ouyang et al., 2022)\"\n\nLet me count all instances:\n1. Page 1 - 1 citation\n2. Page 2 - 1 citation\n3. Page 5 - 1 citation\n4. Page 7 - 1 citation\n5. Page 32 - 1 citation\n6. Page 47 - 1 citation\n\nTotal: 6 citations\n\nAnswer: 6", "citations": [], "input_tokens": 47589, "output_tokens": 417, "cost_micros": 149022, "latency_ms": 7516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find citations for the paper by Long Ouyang published in 2022.\n\nLooking through the document, I can find the reference:\n\nLong Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. *Advances in Neural Information Processing Systems*, 35: 27730\u201327744, 2022.\n\nNow I need to count how many times this paper (Ouyang et al., 2022) is cited in the document.\n\nSearching through the document for \"Ouyang et al., 2022\":\n\n1. Page 1: \"Ouyang et al., 2022\"\n2. Page 2: \"Ouyang et al., 2022\"\n3. Page 5: \"Ouyang et al., 2022\"\n4. Page 7: \"Ouyang et al., 2022\"\n5. Page 32: \"Ouyang et al., 2022\"\n6. Page 47: \"Ouyang et al., 2022\"\n\nAnswer: 6", "citations": [], "input_tokens": 89543, "output_tokens": 289, "cost_micros": 272964, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document that was flagged as user-mentioned to find information about Long Ouyang's 2022 paper and its citation count.Let me search for \"Long Ouyang\" and \"2022\" in this document, and look for citation information.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 25655, "error": null, "extra": {"thread_id": 2040, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d3a50787f8a940c590bfec1b692a8eb3", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 70, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2022", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9 with its three subfigures (a), (b), and (c):\n\n- Subfigure (a) shows that for ground truth labels, **finetuning** (solid lines) performs best across all model sizes, reaching approximately 95% accuracy for the largest models.\n\n- Subfigure (b) shows weak-to-strong performance with weak labels, where **fewshot weak labels (n=5)** (the method shown with crosses/\u00d7) performs best for the largest student models.\n\n- Subfigure (c) directly compares methods and shows that **finetune aux. loss** (solid line with triangles) achieves the highest performance, reaching approximately 90% accuracy for the largest models.\n\nOverall, across the figure, **finetune aux. loss** appears to be the best method when considering weak-to-strong learning performance.\n\nAnswer: finetune aux. loss", "citations": [], "input_tokens": 121910, "output_tokens": 207, "cost_micros": 368835, "latency_ms": 39203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "str_norm", "normalised_pred": "finetune aux loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9, which shows three subfigures (a), (b), and (c) comparing different methods:\n\n- Subfigure (a) compares zero-shot, 5-shot ground truth, and finetune ground truth\n- Subfigure (b) compares zero-shot, 5-shot ground truth, and 5-shot with weak labels  \n- Subfigure (c) compares 5-shot with weak labels, naive finetuning, and finetuning with auxiliary confidence loss\n\nAcross all three subfigures, examining the performance curves for the largest strong student models (rightmost side of each plot), the method that achieves the highest test accuracy is **finetuning with the auxiliary confidence loss** (shown with triangles in subfigure c).\n\nAnswer: Finetuning with auxiliary confidence loss", "citations": [], "input_tokens": 45453, "output_tokens": 193, "cost_micros": 139254, "latency_ms": 6625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with auxiliary confidence loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9, which shows three subfigures (a), (b), and (c) comparing different methods across model sizes:\n\nThe figure compares:\n- Zero-shot\n- Few-shot with ground truth (n=5)\n- Finetune with ground truth\n- Few-shot with weak labels (n=5)\n- Finetune (baseline)\n- Finetune with auxiliary loss\n\nFrom the plots, particularly subfigure (c) which directly compares weak-to-strong performance, the method that consistently achieves the highest test accuracy across strong student compute sizes is the finetuning with auxiliary confidence loss (shown with solid line and triangles).\n\nAnswer: finetune with aux. loss", "citations": [], "input_tokens": 49226, "output_tokens": 163, "cost_micros": 150123, "latency_ms": 7094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "str_norm", "normalised_pred": "finetune with aux loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9, which contains three subfigures (a), (b), and (c), I need to identify the six methods shown and determine which performs best.\n\nFrom the figure:\n- Subfigure (a) shows: zero-shot, 5-shot with ground truth (g.t.), and finetuning with ground truth (g.t.)\n- Subfigure (b) shows: zero-shot, 5-shot with ground truth, and 5-shot with weak labels\n- Subfigure (c) shows: 5-shot prompting (weak-to-strong), naive finetuning, and finetuning with auxiliary confidence loss\n\nThe six methods are:\n1. Zero-shot\n2. 5-shot with ground truth\n3. Finetuning with ground truth\n4. 5-shot with weak labels\n5. Naive finetuning (weak-to-strong)\n6. Finetuning with auxiliary confidence loss\n\nLooking at subfigure (c), which directly compares three weak-to-strong methods, the finetuning with auxiliary confidence loss (shown with solid line and triangles) achieves the highest performance, especially for larger model sizes.\n\nAmong all methods shown across the three subfigures, finetuning with ground truth labels would naturally perform best, but among the weak-to-strong methods (which is the focus of subfigure c), the auxiliary confidence loss performs best.\n\nAnswer: finetuning with auxiliary confidence loss", "citations": [], "input_tokens": 47595, "output_tokens": 339, "cost_micros": 147870, "latency_ms": 8719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with auxiliary confidence loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9 with its three subfigures (a), (b), and (c):\n\nIn subfigure (a), comparing zero-shot, 5-shot with ground truth, and finetuning with ground truth, **finetuning with ground truth** achieves the highest test accuracy.\n\nIn subfigure (b), which shows 5-shot prompting with weak labels from different weak supervisors, the best performing is **5-shot with weak labels from the largest weak supervisor (0.1)**.\n\nIn subfigure (c), which compares 5-shot weak labels, naive finetuning, and finetuning with auxiliary loss, **finetuning with auxiliary loss** achieves the highest performance.\n\nOverall, across all methods shown in Figure 9, the best performing method is:\n\nAnswer: finetuning with auxiliary loss", "citations": [], "input_tokens": 89549, "output_tokens": 194, "cost_micros": 271557, "latency_ms": 8141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with auxiliary loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find Figure 9 and identify which method performs best among the six shown.Let me search for Figure 9 in the document:Let me read the section containing Figure 9 to see the actual figure and understand which methods are being compared:Perfect! Now I can see Figure 9 with three subfigures (a), (b), and (c). Let me read a bit more to understand the comparison better:Based on my reading of the document, Figure 9 contains three subfigures (a, b, c) that compare six different methods:\n\nFrom the figure caption and description:\n- **Panel (a)**: zero-shot, 5-shot with ground truth, finetune with ground truth\n- **Panel (b)**: 5-shot with weak labels (multiple supervisor sizes), zero-shot, 5-shot with ground truth\n- **Panel (c)**: 5-shot weak-to-strong prompting, naive finetuning (weak-to-strong), and **finetuning with the confidence loss (auxiliary loss)**\n\nThe text at line 2078-2079 states: \"However, **weak-to-strong finetuning with the confidence loss still generally outperforms** weak-to-strong few-shot prompting.\"\n\nAnd the figure caption at line 1942-1944 states: \"Few-shot weak-to-strong performance becomes competitive with or outperforms finetuning for the largest strong students, though **finetuning with the confidence loss does better**.\"\n\nThe six methods being compared across the three subfigures are:\n1. Zero-shot\n2. 5-shot with ground truth\n3. Finetune with ground truth\n4. 5-shot with weak labels\n5. Naive finetuning (weak-to-strong)\n6. Finetuning with auxiliary/confidence loss\n\nAnswer: Finetuning with the confidence loss (auxiliary loss)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47672, "error": null, "extra": {"thread_id": 2041, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4655e497891e45598d32be191f19ac1d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 260, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with confidence loss auxiliary loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the tree-shaped figure in the document to identify which branch has the least leaves.\n\nLooking at Figure 1 (the technology tree of RAG research), I can see it's organized by year (2020, 2023, 2024) and has three main branches at the top labeled:\n- \"Augmentation Stage\"\n- \"Fine-tuning\" \n- \"Pre-training\"\n\nLet me count the leaves (methods/papers) in each branch:\n\n**Pre-training branch:**\n- 2020: REALM (implied from context)\n- 2022: Retro, CoG, SANTA, TIGER\n- 2023-2024: RAVEN, Retro++, InstructRetro\n\nThis appears to have around 7-8 leaves.\n\n**Fine-tuning branch:**\nThis has many methods spread across 2023-2024, including PROMPTAGATOR, LM-Indexer, various methods - appears to have 15+ leaves.\n\n**Inference branch:**\nThis has the most methods, with numerous entries across 2023-2024 - appears to have 30+ leaves.\n\nLooking more carefully at the tree structure, the \"Pre-training\" branch (on the left side of the tree) has the fewest methods/leaves compared to the other branches.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 33376, "output_tokens": 305, "cost_micros": 104703, "latency_ms": 9515, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 (the Technology tree of RAG research), I need to count the number of methods/papers shown in each branch:\n\n- **Pre-training branch** (bottom): Contains methods like TIGER, SANTA, Atlas, RAVEN, RETRO++, INSTRUCTRETRO, CoG, EAR, KALMV - approximately 9-10 items\n\n- **Fine-tuning branch** (middle): Contains methods like G-Retriever, RADA, CT-RAG, HyKGE, FILCO, InstructRetro, RA-DIT, Self-RAG, FLARE, and many others - this is the largest branch with approximately 25+ items\n\n- **Inference branch** (top): Contains methods like RAPTOR, UniMS-RAG, CRAG, BGM, IAG, BEQ, ARM-RAG, and several others - approximately 15-20 items\n\nThe Pre-training branch clearly has the fewest methods/leaves compared to the other two branches.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 40683, "output_tokens": 242, "cost_micros": 125679, "latency_ms": 8514, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at the figure in the document that has a tree shape and identify which branch has the least leaves.\n\nLooking at the document, I can see Figure 1, which is described as a \"Technology tree of RAG research.\" This figure shows three main stages: Pre-training, Fine-tuning, and Inference, with various methods listed under each branch.\n\nLet me count the items in each branch:\n\n**Pre-training branch:**\n- CoG, RAVEN, Atlas, RAG, RETRO++, INSTRUCTRETRO, TIGER, SANTA (and a few more visible items)\n- This appears to have around 8-10 items\n\n**Fine-tuning branch:**\n- UniMS-RAG, Self-RAG, MK-ToD, QLM-Doc-ranking, SKR, SANTA, IRCOT, TIGER, Self-Mem, SCM4LLMs (and more)\n- This appears to have the most items, around 15-20+\n\n**Inference branch:**\n- CRAG, CT-RAG, EARCoN, CREA-ICL, PRCA, BEQ, ARM-RAG, PROMPTAGATOR, ZemiR-GQA (and many more)\n- This also has many items\n\nLooking at the figure more carefully, the **Pre-training** branch appears to have the fewest items/leaves compared to the Fine-tuning and Inference branches.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 30919, "output_tokens": 336, "cost_micros": 97797, "latency_ms": 10046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the tree-shaped figure in the document to identify which branch has the least leaves (items).\n\nLooking at Figure 1 titled \"Technology Tree of RAG research,\" I can see it's organized into three main columns representing different stages:\n- Pre-training\n- Fine-tuning  \n- Inference\n\nLet me count the items in each branch/column:\n\n**Pre-training column:** Contains items like:\n- G-Retriever, CRAG, IAG, FILCO, ARM-RAG, RADA, RAPTOR, BGM, HyKGE, DRAGON-AI, PaperQA, CREA-ICL\n- PRCA, Token-Elimination, KALMV, ITRG\n- 1-PAGER, FABULA, SKR, ToC, QLM-Doc-ranking, Recomp\n- KGP, LLM-R, ITER-RETGEN, COQ, LLM-IE, ICRALM, DSP\n- KnowledGPT, IRCOT, PGRA, PKG, SCM4LLMs, Filter-Reranker, RePLUG\n- CoG, TIGER, Retro++\n- GenRead\n- (empty row)\n- RECITE\n\n**Fine-tuning column:** Contains items like:\n- UniMS-RAG, CT-RAG, BEQ, CoN, EAR\n- RAST, Self-RAG, LM-Indexer, RA-DIT, Dual-Feedback-ToD, MK-ToD, RAG_Robust, Retrieve-and-Sample\n- InstructRetro\n- RAVEN\n- SANTA, RRR, Self-Mem, UPRISE, R-GQA, Zemi\n- SUGRE, AAR, FLARE\n- PROMPTAGATOR, RAG-e2e\n- Atlas\n- RAG\n\n**Inference column:** This appears to be the largest with many items distributed across multiple years.\n\nComparing the three branches, the **Fine-tuning** column appears to have fewer distinct method names than the other two columns, but I need to count more carefully", "citations": [], "input_tokens": 37501, "output_tokens": 512, "cost_micros": 120183, "latency_ms": 18250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "comparing three branches fine tuning column appears to have fewer distinct method names than other two columns but i need to count more carefully", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find a figure with a tree shape and identify the branch with the least leaves.Let me search for references to figures with tree shapes in the document:Let me read through the document to find figures. I'll start by looking at the actual content:The document mentions a \"technology tree\" in Figure 1. Let me look for Figure 1:Let me search for where figures are actually displayed in the document:Good! There are embedded images. Let me read around Im1.png which should be Figure 1 (the technology tree):This looks like it's the OCR text from the tree figure. Let me continue reading to see all the branches:Perfect! I found the tree-shaped figure. It's a tree diagram with three main branches:\n1. Fine-tuning (green, left branch)\n2. Pre-training (orange/gold, middle branch) \n3. Inference (blue-green, right branch)\n\nLet me continue reading the visual description to see which branch has the least leaves (papers/items):Perfect! The visual description clearly states:\n\n- **Fine-tuning branch**: Most extensive with numerous sub-branches and approaches\n- **Pre-training branch**: Narrowest with fewer approaches\n- **Inference branch**: Dense with many recent (2024) developments\n\nThe Pre-training branch is described as the \"narrowest with fewer approaches,\" meaning it has the least leaves.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 62969, "error": null, "extra": {"thread_id": 2042, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_bc29640067d14f5eb3f9f7d53e8522d7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 357, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3 on page 4 of the document, which shows \"Comparison between the three paradigms of RAG,\" I can see three main sections representing Naive RAG, Advanced RAG, and Modular RAG.\n\nLet me identify all the distinct icons/shapes used:\n\n1. User icon (person silhouette)\n2. Query icon (document/text box)\n3. Documents icon (stacked documents)\n4. Indexing box\n5. Retrieval box\n6. Prompt box (document icon)\n7. Frozen LLM (snowflake icon with text)\n8. Output box\n9. Pre-Retrieval box (with sub-elements like \"Query Rewriting\", \"Query Expansion\", \"Query Enhancement\")\n10. Post-Retrieval box (with sub-elements like \"Re-rank\", \"Memory\", \"Fusion\")\n11. Various module boxes in the Modular RAG section (Routing, Search, Predict, Retrieve, Rewrite, RAG, Rerank, Read, Demonstrate, Fusion, Memory)\n12. Different pattern sections showing Rewrite, Retrieve, Read combinations\n\nCounting the distinct icon types (not individual instances):\n- User icon (person)\n- Query icon\n- Documents icon\n- Standard rectangular boxes (for processes)\n- Snowflake icon (for Frozen LLM)\n- Document/page icon (for Prompt)\n\nLooking more carefully at the distinct visual elements:\n1. Person/user icon\n2. Text/query box\n3. Documents stack icon\n4. Process boxes (rectangles)\n5. Snowflake (frozen LLM)\n6. Document icon\n\nAnswer: 6", "citations": [], "input_tokens": 64616, "output_tokens": 404, "cost_micros": 199908, "latency_ms": 15077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3, which shows \"Comparison between the three paradigms of RAG,\" I can see:\n\n1. **User Query icon** - represented by a person/user symbol\n2. **Documents icon** - represented by stacked document/paper symbols\n3. **Indexing icon** - appears to show database/storage symbols\n4. **Retrieval icon** - shown with magnifying glass or search symbols\n5. **Read icon** - shown as a document being read\n6. **Predict icon** - shown with a specific symbol\n7. **Output icon** - shown at the end of processes\n8. **Prompt icon** - shown in the middle paradigm\n9. **Frozen LLM icon** - represented by a brain or model symbol\n10. **Rerank icon** - shown in Advanced RAG\n11. **Rewrite icon** - shown in Modular RAG\n12. **Demonstrate icon** - shown in Modular RAG\n13. **Routing icon** - shown in Modular RAG\n14. **Search icon** - shown in Modular RAG\n15. **Fusion icon** - shown in Modular RAG\n16. **Memory icon** - shown in Modular RAG\n\nLet me recount more carefully by looking at the actual distinct visual icons in the figure:\n\nFrom the figure, I can identify these distinct icon types:\n- User/person icon\n- Document/paper stack icon\n- Database/storage icon\n- Magnifying glass (search/retrieval)\n- Brain/model icon (for LLM)\n- Arrow/flow icons\n- Various process boxes with different labels\n\nCounting the truly distinct visual icons (not just labeled boxes):\n1. User icon\n2. Document icon\n3. Database/indexing icon\n4. Magnifying glass/search icon\n5. Brain/LLM icon\n6. Output/result icon\n\nAnswer: 6", "citations": [], "input_tokens": 33359, "output_tokens": 449, "cost_micros": 106812, "latency_ms": 10735, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3 titled \"Comparison between the three paradigms of RAG,\" I can see three diagrams showing Naive RAG, Advanced RAG, and Modular RAG.\n\nLet me identify each distinct icon type used across all three paradigms:\n\n1. **User** - person icon (shown in all three)\n2. **Query** - document/text icon (shown in all three)\n3. **Documents** - stack of documents icon (shown in all three)\n4. **Indexing** - cylindrical database icon (shown in Naive and Advanced RAG)\n5. **Retrieval** - magnifying glass/search icon (shown in all three)\n6. **Rewrite** - circular arrow icon (shown in Advanced RAG)\n7. **Rerank** - list with arrows icon (shown in Advanced RAG)\n8. **Prompt** - text/document icon (shown in all three)\n9. **Frozen LLM** - brain/processing icon (shown in all three)\n10. **Read** - book/reading icon (shown in all three)\n11. **Output** - result/output icon (shown in all three)\n12. **Modules (R)** - box with R (shown in Modular RAG)\n13. **Search** - magnifying glass (shown in Modular RAG)\n14. **Routing** - directional arrow icon (shown in Modular RAG)\n15. **Predict** - crystal ball/prediction icon (shown in Modular RAG)\n16. **Demonstrate** - pointing hand icon (shown in Modular RAG)\n17. **Memory** - storage icon (shown in Modular RAG)\n18. **Fusion** - merge icon (shown in Modular RAG)\n19. **Summary** - document summary icon (shown in Advanced RAG)\n20. **Judge** - decision diamond (shown in Modular RAG)\n21. **Generate** - generation icon (shown in Modular RAG)\n\nLet me recount more carefully by looking at the actual distinct icon shapes:\n\n1. User (person)\n2. Query (document)\n3. Documents (stack)", "citations": [], "input_tokens": 40666, "output_tokens": 512, "cost_micros": 129678, "latency_ms": 10766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3, which shows \"Comparison between the three paradigms of RAG,\" I can see three sections (Left, Middle, Right) representing Naive RAG, Advanced RAG, and Modular RAG.\n\nLet me identify all the distinct icons/symbols used in the figure:\n\n1. \"User Query\" box (rectangular box with text)\n2. \"Indexing\" box\n3. \"Retrieval\" box\n4. \"Rerank\" box\n5. \"Rewrite\" box\n6. \"Fusion\" box\n7. \"Summary\" box\n8. \"Demonstrate\" box\n9. \"Read\" box\n10. \"Search\" box\n11. \"Retrieve\" box\n12. \"Memory\" box\n13. \"Predict\" box\n14. \"Routing\" box\n15. \"Documents\" box\n16. \"RAG\" box\n17. \"Output\" box\n18. \"Response\" box\n19. \"Judge\" box\n20. \"Generate\" box\n21. \"Transformation/Decomposition\" box\n22. \"Iterate\" box\n23. Arrow symbols (connecting the boxes)\n24. \"Max Times/Threshold\" decision point\n25. \"Max Depth (Tree)/Threshold\" decision point\n26. \"Generate Special Token/Threshold\" decision point\n27. \"N Times\" indicator\n\nLooking more carefully at the visual elements, the distinct icon types are:\n- Rectangular boxes (for processes/modules)\n- Arrows (for flow direction)\n- Diamond or decision point shapes (for conditional logic)\n\nCounting the distinct visual icon types (not the number of instances, but types):\n1. Rectangular box/module icon\n2. Arrow icon\n3. Diamond/decision point icon\n\nAnswer: 3", "citations": [], "input_tokens": 30902, "output_tokens": 415, "cost_micros": 98931, "latency_ms": 10655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 in the document to count the distinct icons used.\n\nLooking at Figure 3, which shows \"Comparison between the three paradigms of RAG,\" I can see three main diagrams:\n\n1. **Naive RAG (Left)**: Shows a flowchart with:\n   - User Query\n   - Documents\n   - Indexing\n   - Retrieval\n   - Prompt\n   - Frozen LLM\n   - Output\n\n2. **Advanced RAG (Middle)**: Shows:\n   - User Query\n   - Documents\n   - Indexing\n   - Pre-Retrieval (with sub-components: Query Routing, Query Rewriting, Query Expansion)\n   - Retrieval\n   - Post-Retrieval (with sub-components: Rerank, Summary, Fusion)\n   - Prompt\n   - Frozen LLM\n   - Output\n\n3. **Modular RAG (Right)**: Shows:\n   - Multiple modules (Routing, Search, Predict, Rewrite, RAG, Rerank, Retrieve, Read, Demonstrate, Fusion, Memory)\n   - Various patterns including Naive RAG, Advanced RAG, DSP, and ITER-RETGEN\n\nLooking at the visual representation, the distinct icon shapes/types I can identify are:\n1. Rectangular boxes (for most components)\n2. Rounded rectangles/boxes (for grouping/subgraphs)\n3. Arrows/lines (for connections)\n\nHowever, if we're counting distinct visual elements or node types based on their representation in the figure, the components are primarily represented as boxes/rectangles with text labels. The figure uses a consistent box/rectangle style for representing different components.\n\nBased on the mermaid diagram structure shown, all components are represented using similar rectangular node shapes, with the main distinction being in the grouping (subgraphs) and connections rather than distinct icon types.\n\nAnswer: 1", "citations": [], "input_tokens": 37484, "output_tokens": 444, "cost_micros": 119112, "latency_ms": 12312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and count the distinct icons used.Let me search for \"Figure 3\" in the document:Now let me read around line 788 to find Figure 3:Let me continue reading to find Figure 3:I see Figure 2 but need to continue searching for Figure 3. Let me search more:Perfect! I found Figure 3. Let me continue reading to see the complete visual description:Perfect! I have found Figure 3 and its visual description. Let me carefully count the distinct icons mentioned in the visual description:\n\nFrom the visual description of Figure 3, I can identify the following distinct icons:\n\n1. **User/Query icon** - mentioned in Naive RAG and Advanced RAG\n2. **Documents icon** - mentioned in Naive RAG and Advanced RAG (stacked papers/documents)\n3. **Snowflake icon** - mentioned in Naive RAG and Advanced RAG (representing Frozen LLM)\n4. **Document icon** - mentioned in Naive RAG (at the Prompt stage)\n5. **Shuffle/rerank icon** - mentioned in Advanced RAG Post-Retrieval\n6. **Waveform/summary icon** - mentioned in Advanced RAG Post-Retrieval\n7. **Layers/fusion icon** - mentioned in Advanced RAG Post-Retrieval\n\nLet me recount more carefully by looking at what the visual description explicitly states:\n- User/Query icons (top of diagrams)\n- Documents icons (top of diagrams)\n- Document icon (at Prompt stage)\n- Snowflake icon (representing Frozen LLM)\n- Shuffle/rerank icon (in Post-Retrieval)\n- Waveform/summary icon (in Post-Retrieval)\n- Layers/fusion icon (in Post-Retrieval)\n\nThat gives us 7 distinct icons total.\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 51812, "error": null, "extra": {"thread_id": 2043, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c31c1d3df3774ac882e3f8a22f227910", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 283, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that has a retrieval granularity of \"Phrase\" in Table I.\n\nLooking at Table I (Summary of RAG methods) on page 6, I can see the columns: Method, Retrieval Source, Retrieval Data Type, Retrieval Granularity, Augmentation Stage, and Retrieval process.\n\nScanning the \"Retrieval Granularity\" column, I find one entry with \"Phrase\":\n- CoG [29] - which has retrieval granularity of \"Phrase\"\n\nNow I need to find the full title of reference [29]. Looking at the references section starting on page 17, reference [29] is:\n\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 64633, "output_tokens": 220, "cost_micros": 197199, "latency_ms": 34000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that uses \"phrase\" as its retrieval granularity.\n\nLooking at Table I \"SUMMARY OF RAG METHODS\", I can find the retrieval granularity column. Let me search for entries with \"Phrase\" as the retrieval granularity.\n\nFrom Table I, I can see:\n- CoG [29] has:\n  - Retrieval Source: Wikipedia\n  - Retrieval Data Type: Text\n  - Retrieval Granularity: Phrase\n  - Augmentation Stage: Pre-training\n  - Retrieval process: Iterative\n\nNow I need to find the full title of the CoG paper. Looking at the references section:\n\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 33376, "output_tokens": 222, "cost_micros": 103458, "latency_ms": 7266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for methods with \"Phrase\" retrieval granularity in Table I...\n\nFrom Table I (SUMMARY OF RAG METHODS), I can see:\n- CoG [29] has retrieval granularity of \"Phrase\"\n\nLooking at the references section for reference [29]:\n\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 40683, "output_tokens": 133, "cost_micros": 124044, "latency_ms": 8407, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that proposes a method with phrase-level retrieval granularity.\n\nLooking at Table I (Summary of Retrieval Source and Retrieval Granularity Used in RAG Methods), I can find methods with \"Phrase\" as their retrieval granularity.\n\nFrom the table, I can see:\n- CoG [29] has retrieval granularity of \"Phrase\"\n\nNow I need to find the full title of the CoG paper from the references section.\n\nLooking at the references, reference [29] is:\n\"T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\"\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 30919, "output_tokens": 187, "cost_micros": 95562, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that proposes a method with retrieval granularity of \"Phrase\".\n\nLooking at Table I (Summary of RAG Methods), I can search for entries where \"Retrieval Granularity\" is \"Phrase\".\n\nScanning through Table I, I find:\n- CoG [29] has retrieval granularity: Phrase\n\nNow I need to find the full title of paper [29] from the references section.\n\nLooking at reference [29]:\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in *The Eleventh International Conference on Learning Representations*, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 37501, "output_tokens": 174, "cost_micros": 115113, "latency_ms": 6453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for information about retrieval granularity and phrase-level methods in the provided documents.Let me continue reading to find the title and look for information about retrieval granularity:Good! I found the paper title: \"Retrieval-Augmented Generation for Large Language Models: A Survey\". Now I need to search for information about retrieval granularity and specifically which method has \"phrase\" as its retrieval granularity. Let me search for tables or sections that discuss retrieval granularity:Perfect! I found a table that mentions \"Phrase\" as a retrieval granularity. Let me read that section to find the complete information:Great! I can see that \"CoG [29]\" has \"Phrase\" as its retrieval granularity. Now I need to find the full title of the CoG paper. Let me look for the reference [29]:Perfect! I found reference [29]. Let me read that section to get the full title:I can see reference [29] starts at line 4090. Let me read a bit more to get the complete title:Perfect! I found the full title of reference [29], which is the paper that proposes the method (CoG) with phrase retrieval granularity.\n\nThe full title is: **\"Copy is all you need\"**\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 52827, "error": null, "extra": {"thread_id": 2044, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_27dfaf3e8bfa47e1b55fb4feceaed85a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 297, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I'll search through Table II to find datasets that have exactly three methods listed.\n\nLooking at Table II (SUMMARY OF RAG METHODS), I need to count the methods for each dataset:\n\n- WebQ (Web Questions): [3], [4], [13], [30], [50], [68] - 6 methods\n- PopQA: [7], [25], [67] - 3 methods \u2713\n- MS MARCO: [4], [40], [52] - 3 methods \u2713\n- MuSiQue: [14], [51], [61], [91] - 4 methods\n- ASQA: [24], [57] - 2 methods\n- QMSum(QM): [60], [123] - 2 methods\n- Qasper: [60], [63] - 2 methods\n- COVID-QA: [35], [46] - 2 methods\n- CMB [128],MMCU_Medical [129]: [81] - 1 method\n- QuALITY: [60], [63] - 2 methods\n- ARC: [25], [67] - 2 methods\n- CommonsenseQA: [58], [66] - 2 methods\n- GraphQA: [84] - 1 method\n- Amazon(Toys,Sport,Beauty): [39], [40] - 2 methods\n- T-REx [141],ZsRE [142]: [27], [51] - 2 methods\n- HellaSwag: [20], [66] - 2 methods\n- CoT Reasoning: [27] - 1 method\n- CSQA: [55] - 1 method\n- StrategyQA: [14], [24], [48], [51], [55], [58] - 6 methods\n- XSum: [17] - 1 method\n- VioLens: [19] - 1 method\n- TREC: [33] - 1 method\n- SST-2: [20], [33], [38] - 3 methods \u2713\n- CodeSearchNet: [76] -", "citations": [], "input_tokens": 64631, "output_tokens": 512, "cost_micros": 201573, "latency_ms": 17125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "codesearchnet 76", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine Table II to find datasets that have exactly three methods listed.\n\nLet me go through Table II systematically:\n\n**QA - Single-hop:**\n- Natural Question (NQ): [3], [4], [22], [27], [40], [43], [54], [62], [71], [112] [20], [44], [72] [13], [30], [34], [45], [50], [64] - more than 3\n- TriviaQA (TQA): [4], [27], [59], [62], [112] [22], [25], [43], [44], [71], [72] [20], [23], [30], [32], [45], [69], [112] - more than 3\n- SQUAD: [3], [4], [13], [30], [50], [68] - 6 methods\n- Web Questions (WebQ): [4], [40], [52] - exactly 3 methods\n- PopQA: [4], [40], [52] - exactly 3 methods\n- MS MARCO: [117] - only 1 reference\n\n**QA - Multi-hop:**\n- HotpotQA: [7], [14], [22], [27], [59], [62], [69], [71], [91] - 9 methods\n- 2WikiMultiHopQA: [14], [51], [61], [91] - 4 methods\n- MuSiQue: [120] - only 1 reference\n\n**QA - Long-form:**\n- ELI5: [27], [34], [43], [49], [51] - 5 methods\n- NarrativeQA (NQA): [45], [60], [63], [123] - 4 methods\n- ASQA: [24], [57] - 2 methods\n- QMSum (QM): [60], [123] - 2 methods\n\n**QA - Domain:**\n- Qasper: [60], [63] - 2 methods\n- COVID-QA: [35], [46] - 2 methods\n- CMB: [", "citations": [], "input_tokens": 33374, "output_tokens": 512, "cost_micros": 107802, "latency_ms": 11546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "cmb", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine Table II to find datasets that have exactly three methods listed.\n\nLet me go through Table II systematically:\n\n**QA - Single-hop:**\n- Natural Question (NQ): More than 3 methods\n- TriviaQA (TQA): More than 3 methods\n- SQUAD: More than 3 methods\n- Web Questions (WebQ): [3], [4], [13], [30], [50], [68] - 6 methods\n- PopQA: [7], [25], [67] - **exactly 3 methods**\n- MS MARCO: [4], [40], [52] - **exactly 3 methods**\n\n**QA - Multi-hop:**\n- HotpotQA: More than 3 methods\n- 2WikiMultiHopQA: [14], [24], [48], [59], [61], [91] - 6 methods\n- MuSiQue: [14], [51], [61], [91] - 4 methods\n\n**QA - Long-form QA:**\n- ELI5: [27], [34], [43], [49], [51] - 5 methods\n- NarrativeQA (NQA): [45], [60], [63], [123] - 4 methods\n- ASQA: [24], [57] - 2 methods\n- QMSum (QM): [60], [123] - 2 methods\n\n**QA - Domain QA:**\n- Qasper: [60], [63] - 2 methods\n- COVID-QA: [35], [46] - 2 methods\n- CMB, MMCU_Medical: [81] - 1 method\n\n**QA - Multi-Choice QA:**\n- QUALITY: [60], [63] - 2 methods\n- ARC: [25], [67] - 2 methods\n- CommonsenseQA: [58], [66] - 2 methods\n\n**Dialog:**\n- Wizard of Wikipedia (WoW): [13], [27], [34], [42] - 4 methods\n- KBP: [74],", "citations": [], "input_tokens": 40681, "output_tokens": 512, "cost_micros": 129723, "latency_ms": 12062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "kbp 74", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to look at Table II and find datasets that have exactly three methods listed.\n\nLet me go through Table II systematically:\n\n1. Natural Question (NQ) - has 10 methods listed\n2. TriviaQA (TQA) - has 5 methods listed\n3. SQuAD - has 7 methods listed\n4. Web Questions (WebQ) - has 6 methods listed\n5. PopQA - has 3 methods: [7], [25], [67] \u2713\n6. MS MARCO - has 3 methods: [4], [40], [52] \u2713\n7. HotpotQA - has 8 methods listed\n8. 2WikiMultiHopQA - has 6 methods listed\n9. MuSiQue - has 4 methods listed\n10. ELI5 - has 5 methods listed\n11. NarrativeQA (NQA) - has 4 methods listed\n12. ASQA - has 2 methods listed\n13. QMSum (QM) - has 2 methods listed\n14. Qasper - has 2 methods listed\n15. COVID-QA - has 2 methods listed\n16. CMB, MMCU Medical - has 1 method listed\n17. QuALITY - has 2 methods listed\n18. ARC - has 2 methods listed\n19. CommonsenseQA - has 2 methods listed\n20. GraphQA - has 1 method listed\n21. Wizard of Wikipedia (WoW) - has 4 methods listed\n22. KBP - has 2 methods listed\n23. DuleMon - has 1 method listed\n24. CamRest - has 2 methods listed\n25. Amazon (Toys, Sport, Beauty) - has 2 methods listed\n26. WikiEvent - has 4 methods listed\n27. RAMS - has 2 methods listed\n28. T-REx, ZsRE - has 2 methods listed\n29. HellaSwag - has 2 methods listed\n30. CoT Reasoning - has 1 method listed", "citations": [], "input_tokens": 30917, "output_tokens": 512, "cost_micros": 100431, "latency_ms": 11141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "30 cot reasoning has 1 method listed", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to find datasets in Table II that have exactly three methods listed.\n\nLet me go through Table II systematically and count the number of methods for each dataset:\n\n**QA - Single-hop:**\n- Natural Question (NQ): 12 methods\n- TriviaQA (TQA): 12 methods\n- SQuAD: 7 methods\n- Web Questions (WebQ): 6 methods\n- PopQA: 3 methods \u2713\n- MS MARCO: 3 methods \u2713\n\n**QA - Multi-hop:**\n- HotpotQA: 15 methods\n- 2WikiMultiHopQA: 6 methods\n- MuSiQue: 4 methods\n\n**QA - Long-form QA:**\n- ELI5: 5 methods\n- NarrativeQA (NQA): 4 methods\n- ASQA: 2 methods\n- QMSum (QM): 2 methods\n\n**QA - Domain QA:**\n- Qasper: 2 methods\n- COVID-QA: 2 methods\n- CMB, MMCU_Medical: 1 method\n\n**QA - Multi-Choice QA:**\n- QuALITY: 2 methods\n- ARC: 2 methods\n- CommonsenseQA: 2 methods\n\n**QA - Graph QA:**\n- GraphQA: 1 method\n\n**Dialog:**\n- Wizard of Wikipedia (WoW): 4 methods\n- KBP: 2 methods\n- DuleMon: 1 method\n- CamRest: 2 methods\n- Amazon (Toys, Sport, Beauty): 2 methods\n\n**IE:**\n- WikiEvent: 4 methods\n- RAMS: 2 methods\n- T-REx, ZsRE: 2 methods\n\n**Reasoning:**\n- HellaSwag: 2 methods\n- CoT Reasoning: 1 method\n- CSQA: 1 method\n\n**Others:**\n- MMLU: 7 methods\n- WikiText-103: 4 methods\n-", "citations": [], "input_tokens": 37499, "output_tokens": 512, "cost_micros": 120177, "latency_ms": 11500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Table II and identify datasets with exactly three methods.Let me search for \"TABLE II\" in the document:Now let me read the section containing TABLE II:Now I can see Table II. Let me analyze it carefully to find datasets with exactly three methods. I need to count the number of method references for each dataset. Let me go through the table systematically:\n\nLooking at the table structure, I can see:\n- PopQA [116]: [7], [25], [67] - exactly 3 methods\n- MS MARCO [117]: [4], [40], [52] - exactly 3 methods\n- PopQA has methods: [7], [25], [67] - that's 3 methods\n- MS MARCO has methods: [4], [40], [52] - that's 3 methods\n- SST-2 [156]: [20], [33], [38] - exactly 3 methods\n\nLet me verify by checking each row more carefully:\n\n1. PopQA [116]: [7], [25], [67] = 3 methods\n2. MS MARCO [117]: [4], [40], [52] = 3 methods  \n3. QMSum(QM) [125]: [60], [123] = 2 methods\n4. Qasper [126]: [60], [63] = 2 methods\n5. COVID-QA [127]: [35], [46] = 2 methods\n6. QUALITY [130]: [60], [63] = 2 methods\n7. ARC [131]: [25], [67] = 2 methods\n8. CommonsenseQA [132]: [58], [66] = 2 methods\n9. GraphQA [84]: [84] = 1 method\n10. KBP [134]: [74], [135] = 2 methods\n11. DuleMon [136]: [74] = 1 method\n12. CamRest [137]: [78], [79] = 2 methods\n13. Amazon(Toys,Sport,Beauty) [138]: [39], [40] = 2 methods\n14. RAMS [140]: [36], [37] = 2 methods\n15. T-REX [141],ZsRE [142]: [27], [51] = 2 methods\n16. HellaSwag [143]: [20], [66] = 2 methods\n17. CoT Reasoning [144]: [27] = 1 method\n18. CSQA [145]: [55] = 1 method\n19. PubHealth [150]: [25], [67] = 2 methods\n20. Biography [151]: [67] = 1 method\n21. WikiASP [152]: [24] = 1 method\n22. XSum [153]: [17] = 1 method\n23. VioLens [154]: [19] = 1 method\n24. TREC [155]: [33] = 1 method\n25. SST-2 [156]: [20], [33], [38] = 3 methods\n26. CodeSearchNet [157]: [76] = 1 method\n27. NoMIRACL [56]: [56] = 1 method\n28. GSM8K [158]: [73] = 1 method\n29. JRC-Acquis [159]: [17] = 1 method\n\nAnswer: PopQA, MS MARCO, SST-2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42953, "error": null, "extra": {"thread_id": 2045, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ea697b93518746f587b9ed16f4f922df", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 257, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "ms marco, popqa, sst 2", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically on page 10 where Section V \"AUGMENTATION PROCESS IN RAG\" begins, I can see the subsections listed:\n\nA. Iterative Retrieval\nB. Recursive Retrieval\nC. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 64634, "output_tokens": 83, "cost_micros": 195147, "latency_ms": 20108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find the section titled \"AUGMENTATION PROCESS IN RAG\" and identify its subsections.\n\nFrom Section V (AUGMENTATION PROCESS IN RAG), I can see the following structure:\n\nV. AUGMENTATION PROCESS IN RAG\n\nA. Iterative Retrieval\nB. Recursive Retrieval\nC. Adaptive Retrieval\n\nThese are the three subsections under \"AUGMENTATION PROCESS IN RAG\".\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 33377, "output_tokens": 127, "cost_micros": 102036, "latency_ms": 4500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Section V \"AUGMENTATION PROCESS IN RAG\" in the document, I can identify the subsections by examining the structure.\n\nThe section starts on page 10 and includes the following subsections:\n\nA. Iterative Retrieval\nB. Recursive Retrieval  \nC. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 40684, "output_tokens": 95, "cost_micros": 123477, "latency_ms": 4765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for the section \"AUGMENTATION PROCESS IN RAG\" (Section V), I can identify its subsections:\n\nFrom the document structure, Section V contains:\n- A. Iterative Retrieval\n- B. Recursive Retrieval  \n- C. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 30920, "output_tokens": 91, "cost_micros": 94125, "latency_ms": 4125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Section V \"AUGMENTATION PROCESS IN RAG\" in the document:\n\nThe section includes the following subsections:\n\nA. Iterative Retrieval\nB. Recursive Retrieval\nC. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 37502, "output_tokens": 76, "cost_micros": 113646, "latency_ms": 4125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the \"AUGMENTATION PROCESS IN RAG\" section and identify its subsections.Let me search for the \"AUGMENTATION PROCESS IN RAG\" section in the document.Now let me read the section around line 2636 to see the subsections.Let me continue reading to see all subsections under \"AUGMENTATION PROCESS IN RAG\".Perfect! I can now see the structure of the \"AUGMENTATION PROCESS IN RAG\" section. It has three subsections:\n\n1. A. Iterative Retrieval (line 2646)\n2. B. Recursive Retrieval (line 2827)\n3. C. Adaptive Retrieval (line 2858)\n\nThe next section \"VI. TASK AND EVALUATION\" starts at line 2908, confirming that these are the only three subsections under \"AUGMENTATION PROCESS IN RAG\".\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34782, "error": null, "extra": {"thread_id": 2046, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dfd2aaa1a1dd45f1bdec0523c2899c57", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 177, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "I need to search through the document for information about methods that integrate knowledge into white-box models via directive fine-tuning.\n\nLooking through the document, I found relevant information on page 10 (document page 9-10) in section III. RETRIEVAL, subsection E. Adapter:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nThe text explicitly states that PKG (which likely stands for \"Parametric Knowledge Guiding\" based on reference [75] mentioned in the text) introduces this method.\n\nAnswer: PKG", "citations": [], "input_tokens": 64627, "output_tokens": 158, "cost_micros": 196251, "latency_ms": 20969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for methods that integrate knowledge into white-box models via directive fine-tuning...\n\nIn Section III.E (Adapter), I find the following passage:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nAnswer: PKG", "citations": [], "input_tokens": 33370, "output_tokens": 94, "cost_micros": 101520, "latency_ms": 5250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about a method that integrates knowledge into white-box models via directive fine-tuning.\n\nIn Section III.E (Adapter), I can find the following passage:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nThe text clearly states that PKG introduces this method for integrating knowledge into white-box models via directive fine-tuning.\n\nAnswer: PKG", "citations": [], "input_tokens": 40677, "output_tokens": 130, "cost_micros": 123981, "latency_ms": 6671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for methods that integrate knowledge into white-box models via directive fine-tuning...\n\nThe document discusses RA-DIT (Retrieval-Augmented Dual Instruction Tuning) in reference [27], which is described as \"retrieval-augmented dual instruction tuning.\" This method involves fine-tuning both the retriever and generator components.\n\nIn Section IV.B on LLM Fine-tuning, the document states: \"Fine-tuning of LLM can also be coordinated with fine-tuning of the retriever to align preferences. A typical approach, such as RA-DIT [27], aligns the scoring functions between Retriever and Generator using KL divergence.\"\n\nThis method specifically focuses on instruction tuning (a form of directive fine-tuning) to integrate retrieval-augmented knowledge into language models.\n\nAnswer: RA-DIT", "citations": [], "input_tokens": 30913, "output_tokens": 200, "cost_micros": 95739, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ra dit", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for information about integrating knowledge into white-box models via directive fine-tuning...\n\nIn Section III.E (Adapter), I find the following passage:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nAnswer: PKG", "citations": [], "input_tokens": 37495, "output_tokens": 95, "cost_micros": 113910, "latency_ms": 5500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the priority document to find the answer to your question about which method integrates knowledge into white-box models via directive fine-tuning.Let me search for content about \"directive fine-tuning\" and \"white-box models\":Let me read more of the document to find tables or sections discussing methods:Let me search for tables that compare methods:Let me read around the tables to find information about methods:Let me search for \"Table I\" to find the table that might contain method comparisons:This table shows methods but doesn't mention white-box or directive fine-tuning. Let me search more specifically:Let me search for a combination of terms:Let me look for \"fine-tuning\" methods and check around line 2053 which mentions modular RAG integration with fine-tuning:Let me look for sections on fine-tuning methods:Found it! Let me read around line 2526:Perfect! I found the answer. The text states \"PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75].\" Let me verify this is PKG by reading a bit more context:Perfect! The answer is clear. The document states: \"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75].\"\n\nAnswer: PKG", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 72530, "error": null, "extra": {"thread_id": 2047, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_9ee1ee526a9645f5ba543d1cda3a3a6d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 421, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine Figure 4 to identify which nodes RAPTOR retrieved for both questions about the Cinderella story.\n\nFrom the document, Figure 4 shows the querying process for two questions:\n1. \"What is the central theme of the story?\"\n2. \"How did Cinderella find a happy ending?\"\n\nLooking at Figure 4 on page 7, I can see:\n- Nodes highlighted indicate RAPTOR's selections\n- The figure shows a tree structure with nodes numbered 0-15 at the leaf layer, nodes 16-22 at an intermediate layer, and nodes 23-26 at the root layer\n\nFor Question 1 (central theme), RAPTOR retrieved nodes that are highlighted in one color.\nFor Question 2 (happy ending), RAPTOR retrieved nodes that are highlighted in another color.\n\nThe nodes that overlap (retrieved for BOTH questions) would be those highlighted with both colors.\n\nFrom the visual representation in Figure 4, I can see that certain nodes appear to be selected for both queries. Looking at the highlighted patterns, the nodes that appear to be retrieved for both questions are nodes 18 and 19 (visible in the intermediate layer where both highlighting patterns overlap).\n\nAnswer: 18, 19", "citations": [], "input_tokens": 58460, "output_tokens": 279, "cost_micros": 179565, "latency_ms": 35969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.4, "method": "list_set", "normalised_pred": "18, 19", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which nodes are highlighted (retrieved by RAPTOR) for both questions about Cinderella.\n\nThe figure shows two questions:\n1. \"What is the central theme of the story?\"\n2. \"How did Cinderella find a happy ending?\"\n\nAccording to the figure caption: \"Highlighted nodes indicate RAPTOR's selections, while arrows point to DPR's leaf nodes.\"\n\nExamining the visual representation in Figure 4, I can see the tree structure with nodes numbered 0-26. For nodes to be retrieved for both questions, they must be highlighted for both question 1 and question 2.\n\nLooking at the overlap between the two retrievals shown in the figure, the nodes that appear to be selected for both questions are those at the higher levels of the tree that contain summary information relevant to both the central theme and the happy ending.\n\nBased on the visual representation in Figure 4, the nodes retrieved for both questions appear to be from the upper layers of the tree.\n\nAnswer: 23, 24, 25, 26", "citations": [], "input_tokens": 22903, "output_tokens": 241, "cost_micros": 72324, "latency_ms": 8171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "list_set", "normalised_pred": "23, 24, 25, 26", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4, I need to identify which nodes are retrieved by RAPTOR for both questions about Cinderella.\n\nThe legend indicates:\n- Nodes retrieved for Question 1 (central theme) are shown in one color\n- Nodes retrieved for Question 2 (happy ending) are shown in another color\n- Nodes retrieved for BOTH questions would be those highlighted with both colors\n\nFrom the figure, examining the tree structure and the highlighted nodes, the nodes that appear to be retrieved for both questions are the nodes that contain overlapping/comprehensive information about the story.\n\nBased on the visual representation in Figure 4, the nodes retrieved by RAPTOR for both questions are:\n\nAnswer: 16, 17, 18", "citations": [], "input_tokens": 27206, "output_tokens": 161, "cost_micros": 84033, "latency_ms": 5530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "list_set", "normalised_pred": "16, 17, 18", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which nodes RAPTOR retrieves for both Question 1 (\"What is the central theme of the story?\") and Question 2 (\"How did Cinderella find a happy ending?\").\n\nFrom the figure description and the illustration:\n- For Question 1, RAPTOR retrieves certain highlighted nodes\n- For Question 2, RAPTOR retrieves certain highlighted nodes\n\nThe nodes that appear to be retrieved for BOTH questions (nodes that would be highlighted for both questions) are the nodes that contain information relevant to both the central theme and Cinderella's happy ending.\n\nBased on the visual representation in Figure 4, the nodes retrieved by RAPTOR for both questions appear to be nodes 9 and 10 (which are the upper-level summary nodes that contain broader thematic information relevant to both questions).\n\nAnswer: Node 9, Node 10", "citations": [], "input_tokens": 23495, "output_tokens": 203, "cost_micros": 73530, "latency_ms": 6937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "node 10, node 9", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which nodes are retrieved by RAPTOR for both questions.\n\nThe legend indicates:\n- Orange nodes/arrows: RAPTOR retrieved for Question 1 (\"What is the central theme of the story?\")\n- Purple nodes/arrows: RAPTOR retrieved for Question 2 (\"How did Cinderella find a happy ending?\")\n\nFrom the tree diagram, I can see:\n- Question 1 (orange): Node 25 and Node 19 are highlighted\n- Question 2 (purple): Node 17 and Node 20 are highlighted\n\nLooking for nodes that appear in both colors (retrieved for both questions), I need to check if any nodes have both orange and purple highlighting. However, from the visual representation, each highlighted node appears to be associated with only one question - the nodes are distinct between the two questions.\n\nUpon careful examination of Figure 4, there don't appear to be any nodes that are highlighted in both colors, meaning no nodes are retrieved by RAPTOR for both questions simultaneously.\n\nAnswer: None", "citations": [], "input_tokens": 26136, "output_tokens": 239, "cost_micros": 81993, "latency_ms": 7280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "none", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 4 and identify which nodes are retrieved by RAPTOR for both questions.Let me search for Figure 4 in the document:Now let me read the specific section containing Figure 4 from the target document:Now let me look at the actual figure content. I need to read more lines to see the figure structure:I can see the figure caption clearly states the two questions. Now I need to look at the actual tree diagram to see which nodes are highlighted for both questions. Let me check if there's more detail in the figure or if I need to examine it more carefully. Let me search for more context about the nodes:Let me look at the image content more carefully. Let me read around line 863 where the OCR text shows the nodes:Perfect! Now I can see the key information. Looking at the visual description:\n\n- **Orange/peach borders** (RAPTOR for Question 1): Nodes 15, 11, 18\n- **Purple/magenta borders** (RAPTOR for Question 2): Nodes 2, 3, 5, 11, 16, 17, 19\n\nThe nodes that appear in BOTH lists (retrieved by RAPTOR for both questions) is: **Node 11**\n\nHowever, let me also check the arrows to be thorough:Based on the visual description of Figure 4:\n\n**Question 1** (orange/peach): Nodes 15, 11, 18\n**Question 2** (purple/magenta): Nodes 2, 3, 5, 11, 16, 17, 19\n\nThe node that appears in both lists is **11**.\n\nAdditionally, looking at the arrows (which point to DPR's retrievals), nodes 3 and 11 have both orange and purple arrows, but the question asks specifically about which nodes are retrieved by RAPTOR (indicated by the border colors), not DPR (indicated by the arrows).\n\nAnswer: 11", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 61625, "error": null, "extra": {"thread_id": 2048, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_99be096db80e4817854f19cccddee98b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 378, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "11", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 on page 6 of the document, I can identify the axes of the graph.\n\nThe figure shows \"Comparison of querying methods. Results on 20 stories from the QASPER dataset using tree traversal with different top-k values, and collapsed tree with different context lengths.\"\n\nThe horizontal axis is labeled \"Context Length\" and the vertical axis is labeled \"F1\" (which represents the F1 score metric).\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 58457, "output_tokens": 109, "cost_micros": 177006, "latency_ms": 33343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which is titled \"Comparison of querying methods\":\n\nThe horizontal axis is labeled \"Context Length\" (measuring from 500 to 2500).\n\nThe vertical axis is labeled \"F1\" (measuring from approximately 40 to 55).\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 22900, "output_tokens": 74, "cost_micros": 69810, "latency_ms": 4109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document:\n\nThe figure shows \"Comparison of querying methods. Results on 20 stories from the QASPER dataset using tree traversal with different top-k values, and collapsed tree with different context lengths.\"\n\nThe horizontal axis is labeled \"Context Length\" and the vertical axis shows the performance metric (appears to be accuracy or F1 score percentage based on the scale showing values like 45, 80, etc.).\n\nAnswer: Context Length, Performance Score (F1/Accuracy)", "citations": [], "input_tokens": 27203, "output_tokens": 116, "cost_micros": 83349, "latency_ms": 5796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "context length, performance score f1 accuracy", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, I need to identify the horizontal and vertical axes.\n\nFrom the figure caption and the graph itself:\n- The horizontal axis shows \"Context Length\" (measured in tokens, ranging from 500 to 2500)\n- The vertical axis shows the performance metric (which appears to be accuracy or F1 score, ranging from approximately 50 to 58)\n\nAnswer: Context Length, Performance metric (accuracy/F1 score)", "citations": [], "input_tokens": 23492, "output_tokens": 105, "cost_micros": 72051, "latency_ms": 5092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "context length, performance metric accuracy f1 score", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document:\n\nThe figure shows \"Comparison of querying methods\" with results on 20 stories from the QASPER dataset.\n\nFrom the table in Figure 3:\n- The horizontal axis shows \"Context Length\" (measured in tokens: 500, 1000, 1250, 1500, 1750, 2000, 2500)\n- The vertical axis shows \"F1\" (performance metric)\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 26133, "output_tokens": 113, "cost_micros": 80094, "latency_ms": 4717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and identify its axes.Let me search for \"Figure 3\" in the document.Now let me search for the figure content with context:Let me read the section around line 786 to see the full figure details:Let me continue reading to see the complete axis information:This appears to be describing a different figure. Let me look more carefully at the figure data around line 826:Based on the figure caption and the data I see, I can see \"Context Length\" is mentioned at line 826. However, I need to find what the vertical axis represents. Let me search around the figure more carefully. The numbers like 45, 5, 7, 3, 4, 80 seem to be data values. Let me look for axis labels by reading the embedded image description or OCR text more carefully:Let me search for any text that might indicate what metric is being measured (the vertical axis). Let me look more broadly around Figure 3:I can see \"Context Length\" appears at line 826, which is likely the horizontal axis. The text mentions \"Figure 3 shows the performance\" but I need to find what specific performance metric. Let me search for what metric is being used in QASPER:Great! I found it. Let me read around line 712 to confirm this is for Figure 3:Perfect! I found the information. Looking at lines 693, 704, 711-712:\n- Line 693: \"F1\"\n- Line 704: \"Context Length\"\n- Line 711: \"X-axis shows context length\"\n- Line 712: \"Y-axis shows F1 score\"\n\nSo for Figure 3:\n- Horizontal axis (X-axis) = Context Length\n- Vertical axis (Y-axis) = F1\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 67266, "error": null, "extra": {"thread_id": 2049, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3509043a1828411e846352ac4e585d70", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 401, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the \"PSEUDOCODE FOR RETRIEVAL METHODS\" section and count the lines in each algorithm.\n\nLooking at Appendix F on page 18, I can see:\n\n**Algorithm 1: Tree Traversal Algorithm**\nLet me count the lines of code (excluding comments and function declarations):\n- Line 1: Scurrent \u2190 tree.layer[0]\n- Line 2: for layer in range(tree.num layers) do\n- Line 3: topk \u2190 []\n- Line 4: for node in Scurrent do\n- Line 5: score \u2190 dot product(query, node)\n- Line 6: top k.append((node, score))\n- Line 7: end for\n- Line 8: Slayer \u2190 sorted(top k)[:k].nodes\n- Line 9: Scurrent \u2190 Slayer\n- Line 10: end for\n- Line 11: return S0 \u222a S1 \u222a S2 \u222a . . . \u222a Sk\n\nThat's approximately 11 lines of code.\n\n**Algorithm 2: Collapsed Tree Algorithm** (page 19)\nLet me count the lines:\n- Line 1: tree \u2190 flatten(tree)\n- Line 2: top nodes \u2190 []\n- Line 3: for node in tree do\n- Line 4: top nodes.append((node, dot product(query, node))\n- Line 5: end for\n- Line 6: top nodes \u2190 sorted(top nodes)\n- Line 7: result \u2190 []\n- Line 8: total tokens \u2190 0\n- Line 9: for node in top nodes do\n- Line 10: if total tokens + node.token size < max tokens then\n- Line 11: result.append(node)\n- Line 12: end if\n- Line 13: total tokens \u2190 total tokens + node.token size\n- Line 14: end for\n- Line 15: return result\n\nThat's approximately 15 lines of code.\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 58465, "output_tokens": 483, "cost_micros": 182640, "latency_ms": 36671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the pseudocode section in Appendix F:\n\n**Algorithm 1: Tree Traversal Algorithm**\n- Lines: function declaration, initialization (Scurrent), for loop, topk initialization, nested for loop, score calculation, top_k.append, end for, Slayer assignment, Scurrent assignment, end for, return statement, end function\n- Total: **13 lines**\n\n**Algorithm 2: Collapsed Tree Algorithm**\n- Lines: function declaration, tree flatten, top_nodes initialization, for loop, top_nodes.append, end for, top_nodes sort, result initialization, total_tokens initialization, for loop, if statement, result.append, end if, total_tokens update, end for, return statement, end function\n- Total: **17 lines**\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 22908, "output_tokens": 184, "cost_micros": 71484, "latency_ms": 5922, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the pseudocode in Appendix F:\n\n**Algorithm 1 (Tree Traversal Algorithm):**\n- Lines 1-13 (including function definition, loops, operations, and end statements)\n\n**Algorithm 2 (Collapsed Tree Algorithm):**\n- Lines 1-17 (including function definition, loops, operations, and end statements)\n\nCounting all lines including the function declarations, loop structures, operations, and end statements, Algorithm 2 (Collapsed Tree Algorithm) has more lines.\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 23500, "output_tokens": 124, "cost_micros": 72360, "latency_ms": 5562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the pseudocode in Section F \"PSEUDOCODE FOR RETRIEVAL METHODS\":\n\n**Algorithm 1 (Tree Traversal Algorithm):**\n- Lines include: function declaration, variable initialization, for loop with nested for loop, operations inside loops, and return statement\n- Counting all lines: approximately 13 lines\n\n**Algorithm 2 (Collapsed Tree Algorithm):**\n- Lines include: function declaration, tree flattening, initialization, for loop for scoring, sorting, result initialization, token counter, for loop with conditional, and return statement\n- Counting all lines: approximately 15 lines\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 26141, "output_tokens": 143, "cost_micros": 80568, "latency_ms": 6157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find the \"PSEUDOCODE FOR RETRIEVAL METHODS\" section and count the lines for each algorithm.Let me search for the \"PSEUDOCODE FOR RETRIEVAL METHODS\" section:Now let me search for more specific context around pseudocode:Let me read the section with the algorithms:Perfect! I can see both algorithms. Let me count the lines:\n\n**Algorithm 1: Tree Traversal Algorithm** (lines 2344-2356):\n- Line 2344: function TRAVERSETREE(tree, query, k)\n- Line 2345: Scurrent + tree.layer[0]\n- Line 2346: for layer in range(tree.num_layers) do\n- Line 2347: topk + []\n- Line 2348: for node in Scurrent do\n- Line 2349: score + dot_product(query, node)\n- Line 2350: top_k.append((node, score))\n- Line 2351: end for\n- Line 2352: Slayer + sorted(top_k)[:k].nodes\n- Line 2353: Scurrent + Slayer\n- Line 2354: end for\n- Line 2355: return So U S1 U S2 U . . . USk\n- Line 2356: end function\n\n**Total: 13 lines**\n\n**Algorithm 2: Collapsed Tree Algorithm** (lines 2375-2391):\n- Line 2375: function COLLAPSEDTREE(tree, query, k, max_tokens)\n- Line 2376: tree \u2039- flatten(tree)\n- Line 2377: top_nodes <- []\n- Line 2378: for node in tree do\n- Line 2379: top_nodes.append((node, dot_product(query, node))\n- Line 2380: end for\n- Line 2381: top_nodes \u2039- sorted(top_nodes)\n- Line 2382: result +- []\n- Line 2383: total_tokens <- 0\n- Line 2384: for node in top nodes do\n- Line 2385: if total_tokens + node.token_size < max_tokens then\n- Line 2386: result.append(node)\n- Line 2387: end if\n- Line 2388: total_tokens \u2039- total_tokens + node.token_size\n- Line 2389: end for\n- Line 2390: return result\n- Line 2391: end function\n\n**Total: 17 lines**\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 33921, "error": null, "extra": {"thread_id": 2050, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8432e22fe2074cf88b3f3fbd3ebd2c32", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 208, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to examine Figure 1 to identify nodes that appear in more than one cluster and determine their colors.\n\nLooking at Figure 1 on page 2, which shows the \"Formation of one tree layer\" process:\n\nThe figure shows the clustering step (labeled \"1. Clustering\") where text chunks numbered 1-5 are grouped. The diagram illustrates how nodes are clustered together, with some nodes appearing to be part of multiple clusters during the soft clustering process.\n\nIn the clustering visualization, I can see that:\n- The nodes are represented as numbered boxes (1, 2, 3, 4, 5)\n- Some nodes appear in multiple cluster groupings\n- Looking at the color coding in the diagram, nodes that belong to multiple clusters appear to be shown with a pink/red outline or highlighting\n\nExamining the \"Contents of a node\" box on the right side and the clustering diagram in the middle, nodes that appear in multiple clusters are highlighted in pink/red color.\n\nAnswer: pink, red", "citations": [], "input_tokens": 58465, "output_tokens": 221, "cost_micros": 178710, "latency_ms": 37828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "pink, red", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to identify nodes that appear in more than one cluster and determine their colors.\n\nLooking at Figure 1, which shows the \"Tree construction process\" and \"Formation of one tree layer\":\n\nThe figure shows:\n- Text chunks numbered 1-5 at the bottom (leaf layer)\n- A clustering process in the middle showing how these chunks are grouped\n- The tree structure on the right\n\nIn the \"Formation of one tree layer\" section, I can see:\n- The clustering diagram shows nodes being grouped together\n- Looking at the clustering visualization, I need to identify which numbered nodes appear in multiple clusters\n\nExamining the clustering section carefully:\n- Node 1 appears to be in one cluster\n- Node 2 appears in multiple positions\n- Node 3 appears in multiple positions  \n- Node 4 appears to be in one cluster\n- Node 5 appears in multiple positions\n\nLooking at the colors used in the figure:\n- The nodes that appear in multiple clusters appear to be colored in shades that look like they could be blue, green, or orange/yellow tones in the clustering diagram.\n\nHowever, upon closer examination of Figure 1, the nodes shown in the \"1. Clustering\" section use different colors. The nodes that appear in overlapping or multiple cluster regions appear to use colors like blue and green based on the grayscale representation in the figure.\n\nLooking more carefully at the specific nodes (2, 3, 5) that appear in the clustering overlap, they appear to use blue and green colors.\n\nAnswer: blue, green", "citations": [], "input_tokens": 22908, "output_tokens": 347, "cost_micros": 73929, "latency_ms": 10734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "blue, green", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to identify nodes that appear in multiple clusters during the clustering step.\n\nIn the \"Formation of one tree layer\" section of Figure 1, I can see the clustering process where nodes 1-5 are being grouped. \n\nExamining the clusters:\n- One cluster contains nodes 2 and 3\n- Another cluster contains nodes 4 and 5\n\nLooking at the visual representation, the nodes that appear in the clusters are shown with different colors. The nodes that would appear in multiple clusters (if any existed) would need to be identified by their color.\n\nHowever, from the figure, each node (1-5) appears to belong to only one cluster. Node 3 appears in the cluster with node 2, and node 5 appears in the cluster with node 4. None of the nodes are shown appearing in more than one cluster simultaneously.\n\nUpon careful examination of the figure, there don't appear to be any nodes shown in multiple clusters at the same time - the soft clustering concept is mentioned in the text but not visually demonstrated in this particular figure with overlapping memberships.\n\nAnswer: None", "citations": [], "input_tokens": 27211, "output_tokens": 246, "cost_micros": 85323, "latency_ms": 7733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "none", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to examine Figure 1 carefully to identify nodes that appear in more than one cluster and determine their colors.\n\nLooking at Figure 1, which shows the \"Formation of one tree layer\" section:\n\nThe figure shows a clustering process where text chunks (numbered 1-5 at the bottom) are being clustered. \n\nIn the clustering visualization, I can see:\n- Text chunks numbered 1, 2, 3, 4, 5 at the bottom\n- The chunks are shown being grouped into clusters\n- After clustering (step 1), chunks 3 and 5 appear to be in multiple clusters based on the overlapping circles/groupings shown\n\nLooking at the visual representation, the nodes that appear in more than one cluster are shown in a yellow/gold color, while nodes that appear in only one cluster are shown in blue.\n\nSpecifically, nodes 3 and 5 appear in overlapping cluster regions and are colored yellow, indicating they belong to multiple clusters (soft clustering).\n\nAnswer: yellow", "citations": [], "input_tokens": 23500, "output_tokens": 224, "cost_micros": 73860, "latency_ms": 8203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "yellow", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to examine the tree structure diagram to identify nodes that appear in multiple clusters.\n\nFrom the diagram showing \"RAPTOR Tree\" and \"Formation of one tree layer\":\n\nThe diagram shows:\n- Text chunks labeled 1, 2, 3, 4, 5 at the bottom (leaf nodes)\n- These are clustered into groups\n- Summary nodes 6, 7, 8 are created from clusters\n- Higher level nodes 9, 10 are created from further clustering\n\nExamining the tree structure:\n- Node 6 is connected to nodes 1 and 2\n- Node 7 is connected to node 3\n- Node 8 is connected to nodes 4 and 5\n- Node 9 is connected to nodes 6 and 7\n- Node 10 is connected to node 8\n\nLooking at the clustering process description, the figure shows standard hierarchical clustering where each node belongs to one cluster at each level. The nodes are represented in the diagram without specific color coding mentioned in the text, and the diagram appears to be a standard tree structure where nodes don't appear in multiple clusters simultaneously.\n\nHowever, based on the visual representation in the document, the nodes appear to be shown in a neutral/default color (likely gray or black outlines) in the standard tree diagram format.\n\nAnswer: The figure does not show nodes in multiple clusters with distinct colors", "citations": [], "input_tokens": 26141, "output_tokens": 317, "cost_micros": 83178, "latency_ms": 9719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "figure does not show nodes in multiple clusters with distinct colors", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and identify the colors of nodes that appear in more than one cluster.Let me continue reading to see the full description of Figure 1:Based on my reading of the document, I can see the description of Figure 1. Let me look more carefully at the visual description section that describes the tree construction process and the colors of nodes. Looking at lines 426-456, I found the relevant information:\n\nIn the visual description of Figure 1:\n- **Purple/Lavender**: Tree nodes (non-selected or leaf nodes)\n- **Pink**: Selected/relevant nodes and retrieved context\n\nThe description states that in the tree traversal diagram:\n- \"Purple/lavender nodes at leaf positions\"\n- \"Pink nodes appear at internal/decision points\"\n\nAnd in the Part B (Collapsed Tree Retrieval) section, it mentions:\n- \"Mix of purple and pink boxes in linear sequence\"\n\nFrom the description of the tree formation process in lines 305-385, it shows nodes being clustered together, and the color coding explanation in lines 447-453 clearly indicates:\n- **Purple/Lavender**: Tree nodes (non-selected or leaf nodes)\n- **Pink**: Selected/relevant nodes and retrieved context\n\nSince the question asks about nodes that appear in more than one cluster, and based on the visual description which shows that pink nodes appear at internal/decision points while purple/lavender nodes are at leaf positions, the nodes that would appear in multiple clusters (as parent nodes summarizing multiple child clusters) would be the **pink** nodes.\n\nHowever, let me verify this interpretation by checking if there's more specific information about which colored nodes appear in multiple clusters in the demonstration.\n\nLooking at the figure caption (lines 302-303): \"Nodes clustered together are siblings; a parent node contains the text summary of that cluster.\"\n\nThis indicates that parent nodes (which would be pink based on the color coding) are the ones that span multiple clusters by summarizing them.\n\nAnswer: pink", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36328, "error": null, "extra": {"thread_id": 2051, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_50d7b2d924b34b7ab5e0bc22ccd0b3d6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 237, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "pink", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the clustering algorithm used in this paper and what challenges it faces.\n\nFrom page 3-4, I can find the relevant information about the clustering algorithm:\n\nThe paper states: \"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nRegarding the challenge, it states: \"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces (Aggarwal et al., 2001).\"\n\nTo address this challenge, the paper mentions: \"To mitigate this, we employ Uniform Manifold Approximation and Projection (UMAP), a manifold learning technique for dimensionality reduction (McInnes et al., 2018).\"\n\nAnswer: Gaussian Mixture Models (GMMs), high dimensionality of vector embeddings", "citations": [], "input_tokens": 58463, "output_tokens": 208, "cost_micros": 178509, "latency_ms": 33342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "gaussian mixture models gmms, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll search through the document for information about the clustering algorithm and its challenges.\n\nLooking at Section 3 \"METHODS\", specifically the \"Clustering Algorithm\" subsection:\n\nThe document states:\n\"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nAnd regarding the challenge:\n\"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 22906, "output_tokens": 134, "cost_micros": 70728, "latency_ms": 7625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for information about the clustering algorithm...\n\nIn Section 3, under \"Clustering Algorithm\", the document states:\n\n\"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nIt then discusses the challenge:\n\n\"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 27209, "output_tokens": 120, "cost_micros": 83427, "latency_ms": 5312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for information about the clustering algorithm...\n\nIn Section 3, under \"Clustering Algorithm\", I can find the relevant information:\n\nThe clustering algorithm is based on **Gaussian Mixture Models (GMMs)**.\n\nThe challenge presented to it is stated as: \"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces.\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 23498, "output_tokens": 114, "cost_micros": 72204, "latency_ms": 4781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the clustering algorithm and what presents a challenge to it.\n\nFrom Section 3 \"METHODS\" under \"Clustering Algorithm\":\n\nThe document states:\n\"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nThen it mentions:\n\"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 26139, "output_tokens": 134, "cost_micros": 80427, "latency_ms": 6639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the clustering algorithm, what model it's based on, and what presents a challenge to it.Let me search for information about the clustering algorithm and its base model:Perfect! I found the information about the clustering algorithm. Let me verify by reading a bit more to ensure I have the complete answer:Based on the document, I have found the answer to the question:\n\n1. The clustering algorithm is based on **Gaussian Mixture Models (GMMs)**\n2. The challenge to it is **the high dimensionality of vector embeddings** (as stated: \"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\")\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34000, "error": null, "extra": {"thread_id": 2052, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6459e8a5fde745a3b9337a10119c8000", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 161, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the pseudocode for the tree traversal method in the appendix and identify step 5.\n\nLooking at Appendix F, I can see Algorithm 1 which is the Tree Traversal Algorithm.\n\nLet me examine the steps in the algorithm:\n\n```\nfunction TRAVERSETREE(tree, query, k)\n    Scurrent \u2190 tree.layer[0]\n    for layer in range(tree.num layers) do\n        topk \u2190 []\n        for node in Scurrent do\n            score \u2190 dot product(query, node)\n            top k.append((node, score))\n        end for\n        Slayer \u2190 sorted(top k)[:k].nodes\n        Scurrent \u2190 Slayer\n    end for\n    return S0 \u222a S1 \u222a S2 \u222a . . . \u222a Sk\nend function\n```\n\nCounting the steps:\n1. `Scurrent \u2190 tree.layer[0]`\n2. `for layer in range(tree.num layers) do`\n3. `topk \u2190 []`\n4. `for node in Scurrent do`\n5. `score \u2190 dot product(query, node)`\n\nStep 5 is: `score \u2190 dot product(query, node)`\n\nAnswer: score \u2190 dot product(query, node)", "citations": [], "input_tokens": 58458, "output_tokens": 318, "cost_micros": 180144, "latency_ms": 33764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.15384615384615383, "method": "str_norm", "normalised_pred": "score \u2190 dot product query node", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the tree traversal method steps in Section 3 of the document, step 5 states:\n\"5. Continue this process recursively for d layers, producing sets S1, S2, . .. , Sd.\"\n\nNow checking Algorithm 1 (Tree Traversal Algorithm) in Appendix F, the line that corresponds to this step is:\n\n```\nfor layer in range(tree.num_layers) do\n```\n\nThis is the loop that continues the process recursively through the layers.\n\nAnswer: for layer in range(tree.num_layers) do", "citations": [], "input_tokens": 22901, "output_tokens": 129, "cost_micros": 70638, "latency_ms": 5359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "for layer in range tree num layers do", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, in Section 3 under \"Querying\", step 5 of the tree traversal method states:\n\n\"5. Continue this process recursively for d layers, producing sets S1, S2, ... , Sd.\"\n\nIn Appendix F, Algorithm 1 shows the Tree Traversal Algorithm pseudocode. Step 5 corresponds to the for loop that iterates through layers:\n\n```\nfor layer in range(tree.num_layers) do\n    topk + []\n    for node in Scurrent do\n        score + dot_product(query, node)\n        top_k.append((node, score))\n    end for\n    Slayer + sorted(top_k)[:k].nodes\n    Scurrent + Slayer\nend for\n```\n\nAnswer: for layer in range(tree.num_layers) do ... end for", "citations": [], "input_tokens": 27204, "output_tokens": 200, "cost_micros": 84612, "latency_ms": 7358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "for layer in range tree num layers do end for", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the tree traversal method described in Section 3 (Querying section), step 5 states:\n\n\"5. Continue this process recursively for d layers, producing sets S\u2081, S\u2082, . . . , Sd.\"\n\nIn Appendix F, Algorithm 1 shows the Tree Traversal Algorithm pseudocode. The code that corresponds to step 5 is the for loop that iterates through the layers:\n\n```\nfor layer in range(tree.num_layers) do\n    top\u2096 \u2190 []\n    for node in Scurrent do\n        score \u2190 dot_product(query, node)\n        top_k.append((node, score))\n    end for\n    Slayer \u2190 sorted(top_k)[:k].nodes\n    Scurrent \u2190 Slayer\nend for\n```\n\nAnswer: for layer in range(tree.num_layers) do ... end for", "citations": [], "input_tokens": 23493, "output_tokens": 212, "cost_micros": 73659, "latency_ms": 7062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "for layer in range tree num layers do end for", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the tree traversal method described in Section 3 (page 4-5), step 5 states:\n\n\"5. Continue this process recursively for $d$ layers, producing sets $S_1, S_2, \\dots, S_d$.\"\n\nNow looking at Algorithm 1 (Tree Traversal Algorithm) in Appendix F (page 18), the pseudo code line that corresponds to this step is:\n\n```\nS_current \u2190 S_layer\n```\n\nThis line updates the current set of nodes to continue the process to the next layer.\n\nAnswer: S_current \u2190 S_layer", "citations": [], "input_tokens": 26134, "output_tokens": 141, "cost_micros": 80517, "latency_ms": 5858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.16666666666666666, "method": "str_norm", "normalised_pred": "s current \u2190 s layer", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the pseudocode in the appendix related to step 5 of the tree traversal method.Let me search for \"tree traversal\" and look at the appendix section to find the pseudocode.Now let me find the actual appendix section with pseudocode. Let me search for \"step 5\" or look at the end of the document where appendices typically are.Let me continue looking for the appendix sections. Let me search further in the document.Good, I found Appendix A. Let me continue searching for Appendix F which was mentioned to contain the pseudocode for both methods.Perfect! Found Appendix F. Let me read that section:Perfect! I found Algorithm 1 for Tree Traversal. Now let me look closely at the steps. The pseudocode shows:\n\n1. Line 2345: `Scurrent + tree.layer[0]` (Initialize current set)\n2. Line 2346: `for layer in range(tree.num_layers) do` (Loop through layers)\n3. Line 2347: `topk + []` (Initialize topk list)\n4. Line 2348: `for node in Scurrent do` (Loop through nodes)\n5. Line 2349: `score + dot_product(query, node)` (Calculate score)\n\nStep 5 corresponds to line 2349 which calculates the score using dot_product.\n\nAnswer: score + dot_product(query, node)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 60780, "error": null, "extra": {"thread_id": 2053, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cbc9bd5486c544c293802a0ecf0d5bc7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 329, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "score dot product query node", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 18671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2405.09818v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify which figures in the paper contain line plots.\n\nLet me go through the figures mentioned in the document:\n\n**Figure 1**: Shows the Chameleon architecture with image tokenizer, mixed-modal auto-regressive LM, and image de-tokenizer. This is a diagram, not a line plot.\n\n**Figure 2**: Shows sample interleaved image and text generation. This is an example output, not a line plot.\n\n**Figure 3**: Shows sample generation from Chameleon. This is an example output, not a line plot.\n\n**Figure 4**: Shows sample Chameleon outputs. This is an example output, not a line plot.\n\n**Figure 5**: Contains three subfigures:\n- Figure 5a: Shows \"output norm and training loss curves\" - this includes line plots\n- Figure 5b: \"An ablation with Chameleon-7B with and without QK-Norm\" - shows training loss, which is a line plot\n- Figure 5c: \"An ablation with Chameleon-7B with and without dropout\" - shows training loss, which is a line plot\n\n**Figure 6**: Contains three subfigures:\n- Figure 6a: Shows \"Training Curves for 600k steps for Chameleon-7B and Chameleon-34B\" - this is a line plot\n- Figure 6b: Shows \"Training loss curve with image generation disabled\" - this is a line plot\n- Figure 6c: Shows training loss curves with and without norm-reordering - this is a line plot\n\n**Figure 7**: Shows example alignment data for different categories. This appears to be text/table format, not line plots.\n\n**Figure 8**: Shows task categories and examples of prompts. This is not a line plot.\n\n**Figure 9**: Contains:\n- Figure 9a: Shows \"prompt task fulfillment rates\" as a horizontal bar chart\n- Figure 9b: Shows comparison results as a horizontal bar chart\n\n**Figure 10**: Shows \"inter-annotator agreement\" as a horizontal bar chart.\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 25078, "output_tokens": 479, "cost_micros": 82419, "latency_ms": 11155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify all figures in the paper that contain line plots.\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows a diagram of the Chameleon architecture with image and text tokenization flow - not a line plot.\n\n**Figure 2**: Shows sample interleaved image and text generation - not a line plot.\n\n**Figure 3**: Shows sample generation from Chameleon - not a line plot.\n\n**Figure 4**: Shows sample Chameleon outputs - not a line plot.\n\n**Figure 5**: Contains three subfigures:\n- Figure 5a: Shows \"Output Norm\" vs \"Step\" - this is a line plot\n- Figure 5b: Shows \"Training Loss\" vs \"Step\" with and without QK-Norm - this is a line plot\n- Figure 5c: Shows \"Training Loss\" vs \"Step\" with and without dropout - this is a line plot\n\n**Figure 6**: Contains three subfigures:\n- Figure 6a: Shows \"Training Loss\" vs \"Step\" for 7B and 34B models - this is a line plot\n- Figure 6b: Shows \"Training Loss\" vs \"Step\" for 7B without image generation - this is a line plot\n- Figure 6c: Shows \"Training Loss\" vs \"Step\" with and without norm reordering - this is a line plot\n\n**Figure 7**: Shows example alignment data - not a line plot.\n\n**Figure 8**: Shows task categories and example prompts - not a line plot.\n\n**Figure 9**: Contains two subfigures:\n- Figure 9a: Shows a bar chart of task fulfillment rates - not a line plot\n- Figure 9b: Shows a horizontal bar chart comparing Chameleon vs baselines - not a line plot\n\n**Figure 10**: Shows a horizontal bar chart of inter-annotator agreement - not a line plot.\n\nAnswer: Figure 5a, Figure 5b, Figure 5c, Figure 6a, Figure 6b, Figure 6c", "citations": [], "input_tokens": 34447, "output_tokens": 457, "cost_micros": 110196, "latency_ms": 10469, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "figure 5a, figure 5b, figure 5c, figure 6a, figure 6b, figure 6c", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify all figures in the document that contain line plots.\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows the Chameleon architecture diagram with tokens - not a line plot.\n\n**Figure 2**: Sample interleaved image and text generation - not a line plot.\n\n**Figure 3**: Sample generation from Chameleon with images - not a line plot.\n\n**Figure 4**: Sample Chameleon outputs with recipe - not a line plot.\n\n**Figure 5**: This figure has three subfigures:\n- (a) Shows \"Uncontrolled growth of output norms\" with what appears to be line plots showing training progress\n- (b) Shows \"An ablation with Chameleon-7B with and without QK-Norm\" - appears to be line plots\n- (c) Shows \"An ablation with Chameleon-7B with and without dropout\" - appears to be line plots\n\n**Figure 6**: This figure has parts showing training curves:\n- (a) \"Training loss curve with image generation disabled\" - line plot\n- (b) Shows training for Chameleon-34B - line plots\n\nThe caption mentions \"Training loss curves for Chameleon models under various settings.\"\n\n**Figure 7**: Example alignment data - not line plots.\n\n**Figure 8**: Task categories and examples of prompts - not line plots.\n\n**Figure 9**: The text mentions Figure 9a and 9b for task fulfillment and win rates - likely bar charts or similar, need to check the context. The text describes percentages but doesn't clearly indicate these are line plots.\n\n**Figure 10**: \"The inter-annotator agreement on the questions in the absolute evaluation\" - context suggests this might be a bar chart or similar visualization.\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 26087, "output_tokens": 410, "cost_micros": 84411, "latency_ms": 11625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify all figures in the paper that contain line plots.\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows a diagram with flowcharts for pre-training and generation processes - not a line plot.\n\n**Figure 2**: Shows sample interleaved image and text generation - not a line plot.\n\n**Figure 3**: Shows sample generation from Chameleon - not a line plot.\n\n**Figure 4**: Shows sample Chameleon outputs - not a line plot.\n\n**Figure 5**: This figure has three parts:\n- (a) Shows \"Output norm\" over \"Step\" - this is a line plot\n- (b) Shows training loss over steps with and without QK-Norm - this is a line plot\n- (c) Shows training loss over steps with and without dropout - this is a line plot\n\n**Figure 6**: This figure has three parts:\n- (a) Shows \"Training Curves\" with loss over steps - this is a line plot\n- (b) Shows \"Training loss curve\" - this is a line plot\n- (c) Shows training loss with different settings - this is a line plot\n\n**Figure 7**: Shows example alignment data for different categories - not a line plot.\n\n**Figure 8**: Shows task categories and example prompts in a table - not a line plot.\n\n**Figure 9**: Shows performance comparison tables - not line plots, these are tables.\n\n**Figure 10**: Shows inter-annotator agreement in a table format - not a line plot.\n\nSo the figures containing line plots are: Figure 5 (parts a, b, c) and Figure 6 (parts a, b, c).\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 32138, "output_tokens": 382, "cost_micros": 102144, "latency_ms": 8875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to identify which figures contain line plots.Let me search for more specific information about each figure and read the sections containing figure descriptions:Now let me look at Figure 5 more carefully and check other figures:Let me look at the beginning of Figure 5 to see the first part:Good! I can see Figure 5 has multiple line plots. Now let me check other figures. Let me search for more figure captions and descriptions:Let me look for Figure 7 and Figure 8:Let me continue reading Figure 8 and check if there are more figures:I can see Figure 9a is a bar chart. Let me continue examining more figures. Let me search for all figures systematically by searching for specific figure numbers:Now let me look at the actual content of each figure more carefully. Let me check Figure 1:Figure 1 appears to be a diagram/schematic, not a line plot. Let me check Figure 2 and 3:Figures 2 and 3 appear to be examples of text and image generation, not line plots. Let me check Figure 4:Figure 4 is also a sample output, not a line plot. Now let me look more carefully at Figure 10 which I saw earlier has a bar chart format:Figure 10 is a bar chart showing counts for different agreement levels. Let me check if there are any other figures I might have missed by searching for more figure references:Figure 8 shows task categories and examples (text-based). Now let me systematically review what I found:\n\nBased on my reading of the document:\n\n- **Figure 1**: Diagram/schematic showing the Chameleon architecture\n- **Figure 2**: Sample text and image generation (not a line plot)\n- **Figure 3**: Sample generation (not a line plot)  \n- **Figure 4**: Sample outputs (not a line plot)\n- **Figure 5**: Contains THREE line plots:\n  - (a) Output norm vs Step (line plot)\n  - (b) Training Loss vs Step with/without QK-Norm (line plot)\n  - (c) Training Loss vs Step with/without dropout (line plot)\n- **Figure 6**: Contains THREE line plots:\n  - (a) Training Loss vs Step for 7B and 34B models (line plot)\n  - (b) Training Loss vs Step without image generation (line plot)\n  - (c) Training Loss vs Step with/without norm reordering (line plot)\n- **Figure 7**: Example alignment data for different categories (not a line plot)\n- **Figure 8**: Task categories and examples of prompts (not a line plot)\n- **Figure 9**: Contains bar charts showing performance comparison (not line plots)\n- **Figure 10**: Bar chart showing inter-annotator agreement (not a line plot)\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 104391, "error": null, "extra": {"thread_id": 2054, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3ce67a1a47334e71b23dc25fd26d8b74", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 654, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I can see the caption states:\n\n\"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 25075, "output_tokens": 41, "cost_micros": 75840, "latency_ms": 3905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, the caption states:\n\n\"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 34444, "output_tokens": 38, "cost_micros": 103902, "latency_ms": 3641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I can see the caption states:\n\n\"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 26084, "output_tokens": 41, "cost_micros": 78867, "latency_ms": 3483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 and its caption, I can see that the figure shows Chameleon's architecture with different colored elements representing different types of tokens.\n\nThe caption explicitly states: \"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 32135, "output_tokens": 62, "cost_micros": 97335, "latency_ms": 4108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Fig 1 and determine what the green squares denote.Let me search for \"Fig 1\" in the document to locate the figure and its caption.Let me search for \"Figure 1\" instead:Good, I found Figure 1. Now let me read the section around line 354 to see the figure and find information about green squares:Perfect! I found the answer. In line 355, the figure caption explicitly states: \"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 31469, "error": null, "extra": {"thread_id": 2055, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_242072fc7b9a40f1ba1d0b58497f99ea", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 150, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures used in chat example figures, excluding the Appendix.\n\nLet me go through the document systematically:\n\n**Figure 2** (page 3): Sample interleaved image and text generation from Chameleon\n- Shows 3 images (Keel-Billed Toucan, Puffin, Golden Pheasant)\n\n**Figure 3** (page 4): Sample generation from Chameleon\n- Shows 2 images (the STOP sign input image and the generated chameleon image)\n\n**Figure 4** (page 5): Sample Chameleon outputs\n- Shows 2 images (bananas input image and the banana bread output image)\n\n**Figure 7** (page 10): Example alignment data for different categories\n- This shows examples of different data types:\n  - (a) Text - no images\n  - (b) Code - no images\n  - (c) Image Generation - 1 image shown\n  - (d) Visual Chat - 1 image shown (flower arrangement)\n  - (e) Interleaved Text/Image Generation - 2 images shown (appears to be related to Gyumri/Armenian content)\n\n**Figure 8** (page 11): Task categories and examples of prompts\n- Shows 4 small example images in the prompt examples\n\nNow let me check the Appendix section (starts at page 23):\n- The Appendix section A \"Samples\" contains additional examples, but I should exclude these.\n\nTotal count (excluding Appendix):\n- Figure 2: 3 images\n- Figure 3: 2 images\n- Figure 4: 2 images\n- Figure 7: 4 images (1 + 1 + 2)\n- Figure 8: 4 images\n\nTotal: 3 + 2 + 2 + 4 + 4 = 15 images\n\nAnswer: 15", "citations": [], "input_tokens": 25074, "output_tokens": 446, "cost_micros": 81912, "latency_ms": 11907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures used in the chat example figures, excluding the Appendix.\n\nLet me go through the figures that show chat examples:\n\n**Figure 2** (page 3): \"Sample interleaved image and text generation from Chameleon\"\n- Shows 3 bird images (Keel-Billed Toucan, Puffin, Golden Pheasant)\nTotal: 3 images\n\n**Figure 3** (page 4): \"Sample generation from Chameleon\"\n- Shows 1 stop sign image in the prompt\n- Shows 1 chameleon image in the response\nTotal: 2 images\n\n**Figure 4** (page 5): \"Sample Chameleon outputs\"\n- Shows 1 banana image in the prompt\n- Shows 1 banana bread image in the response\nTotal: 2 images\n\nThe Appendix starts at page 23 with \"A Samples\". The figures after that are in the Appendix and should be excluded.\n\nTotal pictures in chat example figures (excluding Appendix): 3 + 2 + 2 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 34443, "output_tokens": 263, "cost_micros": 107274, "latency_ms": 7530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures/images used in the chat example figures in the main body of the document (excluding the Appendix).\n\nLet me go through the figures that show chat/conversation examples:\n\n1. **Figure 1**: Shows a recipe example with bananas image and banana bread image = 2 images\n\n2. **Figure 2**: \"Sample interleaved image and text generation from Chameleon\" - This shows a response about quirky-looking birds with 3 bird images (Keel-Billed Toucan, Puffin, and Golden Pheasant) = 3 images\n\n3. **Figure 3**: \"Sample generation from Chameleon\" - Shows a stop sign image and a chameleon image = 2 images\n\n4. **Figure 4**: \"Sample Chameleon outputs\" - Shows bananas image and banana bread image = 2 images\n\n5. **Figure 7**: \"Example alignment data for different categories\" - This shows multiple examples including:\n   - Poem (no image)\n   - Renaissance Square (1 image)\n   - Code example (no image)\n   - Gyumri text (no image)\n   - Flower arrangement (1 image)\n   = 2 images\n\n6. **Figure 8**: \"Task categories and examples of prompts\" - Shows 4 images in the examples (pastry, pony, construction site, and one more)\n\nWait, let me recount Figure 8 more carefully. Looking at the attributions at the bottom: \"Seguin (2010); Agriflanders (2009); Tuszy\u0144ski (2015); Sokolov (2022)\" - this indicates 4 images.\n\nTotal count:\n- Figure 1: 2 images\n- Figure 2: 3 images  \n- Figure 3: 2 images\n- Figure 4: 2 images\n- Figure 7: 2 images\n- Figure 8: 4 images\n\nTotal = 2 + 3 + 2 + 2 + 2 + 4 = 15 images\n\nAnswer: 15", "citations": [], "input_tokens": 26083, "output_tokens": 489, "cost_micros": 85584, "latency_ms": 11483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures used in the chat example figures, excluding the Appendix.\n\nLet me go through the figures that show chat examples:\n\n**Figure 2** (page 3): \"Sample interleaved image and text generation from Chameleon\"\n- Shows a conversation about quirky birds\n- Contains 3 images: Keel-Billed Toucan, Puffin, and Golden Pheasant\n- Count: 3 images\n\n**Figure 3** (page 4): \"Sample generation from Chameleon\"\n- Shows a conversation about a stop sign and chameleon camouflage\n- Contains 2 images: stop sign in wooded area, and a reddish-brown chameleon on tree branch\n- Count: 2 images\n\n**Figure 4** (page 5): \"Sample Chameleon outputs\"\n- Shows a conversation about baking with bananas\n- Contains 2 images: bowl of sliced bananas, and a loaf of banana bread\n- Count: 2 images\n\nThe Appendix section starts at \"Appendix\" on page 23. The example in Section A (page 24) with the Mountain Cur dog and the pothos plant are in the Appendix, so I should not count those.\n\nTotal pictures in chat example figures (excluding Appendix): 3 + 2 + 2 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 32134, "output_tokens": 325, "cost_micros": 101277, "latency_ms": 8719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find chat example figures and count the pictures in them, excluding any in the Appendix.Let me search for figures more specifically:Now let me look for chat examples and appendix sections. Let me read through the document systematically:Let me read the figures mentioned (Figures 2-4) which appear to be the chat example figures, and check where the Appendix starts:Now let me check Figure 4:Let me go back and read Figure 2 more carefully to count the images:Now let me look at the beginning of Figure 2 to see all images:I see there's a mislabeled image at the beginning. Let me count more carefully. Looking at Figure 2, I can see references to:\n- Im3.png (Toucan)\n- Im4.png (Puffin)  \n- Im5.png (Golden Pheasant mentioned with <img> marker)\n\nLet me check Figure 3 now:Let me look for where Figure 3 starts:I need to see if there's another image in Figure 3. Let me check after line 747:Good! So I see Im7.png is a chameleon image. Let me check if this is part of Figure 3. Looking at the page break, Im7.png appears after the section break at line 756 where \"2.1 Tokenization\" starts. Let me verify where Figure 3 ends and Figure 4 starts:Perfect! Now I can see Figure 4 ends at line 922. Let me look for more images in Figure 4. The prompt at line 822 mentions an <img> tag. Let me check if there's an image for that:Now let me verify where the Appendix starts to ensure I don't count figures from there:Good! The Appendix starts at line 3372. Now let me systematically count the images in Figures 2, 3, and 4.\n\nLet me go back and look at the full scope of each figure more carefully:\n\n**Figure 2** (lines 471-620):\n- Starts with prompt at line 471\n- Im3.png - Toucan (line 543)\n- Im4.png - Puffin (line 579)\n- Im5.png - Golden Pheasant (line 641)\n- Figure 2 caption at line 619\n\nWait, Im5.png at line 641 seems to be after the Figure 2 caption. Let me re-check:I see! There's a STOP sign figure at line 633-637, and then Im5.png (the Golden Pheasant) at line 641. The STOP sign appears to be the beginning of Figure 3's prompt. Let me trace this more carefully:So the STOP sign at line 633-637 with Im5.png description is actually part of Figure 3's input (the <img> in the prompt at line 675 refers to it). Let me reorganize:\n\n**Figure 2** (chat example about birds):\n- Prompt starts at line 471\n- Im3.png - Toucan (line 543)\n- Im4.png - Puffin (line 579)\n- Golden Pheasant text mentions <img> at line 614, but the actual image appears to be missing from Figure 2 content\n- Figure 2 caption at line 619-620\n\nWait, I need to look more carefully at the structure. The embedded images seem to be labeled differently. Let me look for all embedded images in the main body before the Appendix:Now let me determine which images belong to Figures 2, 3, and 4 by checking the line ranges:\n\n- Figure 2 caption: line 619\n- Figure 3 caption: line 701  \n- Figure 4 caption: line 922\n- Appendix starts: line 3372\n\nImages before Appendix:\n- Image4.jpg (line 407) - before Figure 2\n- Image5.jpg (line 480) - This is the mislabeled banana bread image, but checking context it's BEFORE the birds prompt starts at line 471\n- Im3.png (line 543) - in Figure 2 range\n- Im4.png (line 579) - in Figure 2 range\n- Im5.png (line 641) - between Figure 2 (ends 620) and Figure 3 (ends 701)\n- Im6.png (line 711) - between Figure 3 (ends 701) and section 2.1 starts\n- Im7.png (line 777) - after section 2.1 starts (line 756)\n- Im8.png (line 876) - in Figure 4 range (Figure 4 caption at 922)\n- Im9.png (line 991) - after Figure 4 ends\n\nLet me read around line 480 to verify Image5.jpg:This Image5.jpg (banana bread) is clearly mislabeled/misplaced content. The prompt at line 471 asks for birds, but Image5.jpg shows banana bread. This seems to be a document processing error. However, it appears WITHIN the Figure 2 example (after the prompt, before the response text starts at line 518).\n\nLet me check where Figure 2 actually starts and what the structure is:So Figure 2 starts at line 471. Let me now carefully count all images that appear in the three chat example figures (2, 3, 4) before the Appendix:\n\nLooking at my grep results and the line ranges:\n\n**Figure 2** (lines 471-620):\n- Image5.jpg (line 480) - appears after the prompt but seems misplaced\n- Im3.png (line 543) - Toucan\n- Im4.png (line 579) - Puffin\n\nHowever, Image5.jpg shows banana bread which doesn't match the birds prompt. Let me check if there's a third bird image I'm missing. The text mentions three birds (Toucan, Puffin, Golden Pheasant) and has <img> markers. Let me look for where the third bird image might be:\n\nLooking at my earlier reading, Im5.png (line 641) shows a bird (Golden Pheasant based on the description at lines 649-668). The caption for Figure 2 is at line 619-620, so Im5.png at line 641 is AFTER the Figure 2 caption.\n\nLet me check if Im5.png is actually part of Figure 3:I see! Im5.png (the Golden Pheasant/colorful bird) appears at line 641, but the prompt for Figure 3 starts at line 675 which references \"<img>\" - and that <img> is referring to the STOP sign that appeared earlier. So Im5.png is actually the third bird from Figure 2, even though it appears after the Figure 2 caption.\n\nLooking back at the structure more carefully, the figure captions seem to come before all the images are shown. Let me look at where the STOP sign figure starts:\n\nFrom line 633-637, there's a figure with \"STOP\" as OCR text. This must be the input image for Figure 3.\n\nLet me now systematically list the images:\n\n**Before Figure 2 caption (line 619):**\n- Image5.jpg (line 480) - banana bread (misplaced/error)\n- Im3.png (line 543) - Toucan  \n- Im4.png (line 579) - Puffin\n\n**After Figure 2 caption but before Figure 3 caption (line 701):**\n- Im5.png (line 641) - Golden Pheasant (part of Figure 2 content)\n\n**Around Figure 3:**\n- STOP sign figure (lines 633-637, but before Im5.png structurally in the document)\n- Im6.png (line 711) - STOP sign (after Figure 3 caption)\n\nWait, I need to look at this more systematically. Let me check what the STOP sign figure at line 633 is:The figure at 633-637 has OCR text \"STOP\" but the embedded image Im5.png (line 641) has \"NO_CONTENT_HERE\" as OCR and shows a bird. This is confusing. Let me look at Im6.png which should be the STOP sign:Yes! Im6.png is the STOP sign. So the document structure has the figure tags and embedded images separated. The figure at line 633 with \"STOP\" corresponds to Im6.png (the actual embedded image file) at line 711.\n\nLet me now map this out properly by looking at the figure captions and the embedded images:\n\n**Figure 2** (caption at line 619): Chat about birds\n**Figure 3** (caption at line 701): Chat about chameleon camouflage  \n**Figure 4** (caption at line 922): Chat about baking with bananas\n\nNow let me find which embedded images belong to each:So Im6.png appears RIGHT AFTER the Figure 3 caption. Now let me check Figure 4:So Im8.png (bananas) appears before the Figure 4 caption at line 922, and there's an <img> marker at line 920 suggesting another image.\n\nNow let me look at Im7.png and Im9.png to see which figure they belong to:\n\nIm7.png is at line 777. Let me check the context:Im7.png (chameleon) is at line 777, which is AFTER section 2.1 starts at line 756 and after Figure 3 caption at line 701. But looking at the context, Im7.png shows a chameleon which matches Figure 3's theme. Let me check if this is part of the Figure 3 example by looking backwards:\n\nThe Figure 3 response at line 747 has an <img> marker, and the text before talks about generating a chameleon image. So Im7.png is part of Figure 3's output!\n\nNow let me check Im9.png:Im9.png shows a loaf of baked good (banana bread final result). This is at line 991, which is after Figure 4 caption at line 922. The <img> marker at line 920 in Figure 4 refers to this final dish image. So Im9.png is part of Figure 4.\n\nNow let me go back and check Im5.png more carefully. At line 641, the bird image appears after Figure 2 caption but before Figure 3 content. Looking at Figure 2's text, the Golden Pheasant is mentioned with <img> at line 614. So Im5.png (Golden Pheasant) is part of Figure 2.\n\nLet me now do a final count:\n\n**Figure 2** (Chat about birds):\n- Im3.png - Toucan\n- Im4.png - Puffin  \n- Im5.png - Golden Pheasant\nTotal: 3 images\n\n**Figure 3** (Chat about chameleon camouflage):\n- Im6.png - STOP sign (input)\n- Im7.png - Chameleon (generated output)\nTotal: 2 images\n\n**Figure 4** (Chat about baking with bananas):\n- Im8.png - Bananas (input)\n- Im9.png - Banana bread (generated output)\nTotal: 2 images\n\n**Total images in chat example figures (Figures 2-4), excluding Appendix: 3 + 2 + 2 = 7**\n\nWait, but I should double-check if Image5.jpg (the banana bread at line 480) is actually meant to be part of Figure 2 or if it's a document error. Let me look at the context again:Image5.jpg (banana bread) appears right after the birds prompt but before the response starts. This appears to be a document processing error or misplacement - it doesn't match the context at all. The response starting at line 518 talks about birds, not banana bread.\n\nGiven that this is clearly an error in the document, I should not count Image5.jpg as part of the chat example figures. The actual chat examples are:\n\n**Figure 2**: 3 bird images (Im3.png, Im4.png, Im5.png)\n**Figure 3**: 2 images (Im6.png STOP sign, Im7.png chameleon)  \n**Figure 4**: 2 images (Im8.png bananas, Im9.png banana bread)\n\n**Total: 7 images**\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 178546, "error": null, "extra": {"thread_id": 2056, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d55f4fd667ac403b901ea3294619b198", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1600, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 17687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2405.09818v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find subplots that show obvious loss spikes in the document.\n\nLet me examine the figures that show training loss curves:\n\n1. **Figure 5**: Contains 3 subplots:\n   - (a) Shows output norm with clear divergence/spike\n   - (b) Shows training loss comparison with and without QK-Norm - one curve diverges (spikes)\n   - (c) Shows training loss with and without dropout - appears stable\n\n2. **Figure 6**: Contains 3 subplots:\n   - (a) Shows training loss curves for 7B and 34B models - appears stable without obvious spikes\n   - (b) Shows training loss curve with image generation disabled - appears stable without obvious spikes\n   - (c) Shows training loss with and without norm reordering - shows clear divergence/spike in one curve\n\nLet me count the subplots with obvious loss spikes:\n- Figure 5(a): Shows norm growth leading to divergence (1)\n- Figure 5(b): Shows loss divergence/spike (1)\n- Figure 6(c): Shows loss divergence/spike (1)\n\nTotal: 3 subplots show obvious loss spikes or divergences.\n\nAnswer: 3", "citations": [], "input_tokens": 25071, "output_tokens": 284, "cost_micros": 79473, "latency_ms": 8016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find figures that show loss curves with obvious spikes.\n\nLooking through the document:\n\n**Figure 5** (page 6-7):\n- (a) Shows \"Output norm\" vs Step - not a loss curve with spikes\n- (b) Shows \"Training Loss\" with and without QK-Norm - shows relatively smooth curves, one diverges but no obvious spikes\n- (c) Shows \"Training Loss\" with and without dropout - shows smooth curves\n\n**Figure 6** (page 7):\n- (a) Shows \"Training Loss\" for 7B and 34B models over 600k steps - smooth curves, no obvious spikes\n- (b) Shows \"Training Loss\" for 7B without image generation - smooth curve, no obvious spikes\n- (c) Shows \"Training Loss\" with and without norm reordering - one curve shows a clear spike/divergence around 8k-10k steps\n\nFrom Figure 6c, I can see one subplot that clearly shows an obvious loss spike (the curve without norm reordering that spikes dramatically).\n\nLooking more carefully at Figure 5b, the curve \"w/o QK-norm\" shows a sharp spike/divergence near the end.\n\nSo counting subplots with obvious loss spikes:\n- Figure 5b: 1 subplot (shows divergence/spike)\n- Figure 6c: 1 subplot (shows clear spike)\n\nAnswer: 2", "citations": [], "input_tokens": 34440, "output_tokens": 324, "cost_micros": 108180, "latency_ms": 9828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find figures in the document that show training loss curves with obvious loss spikes.\n\nLooking through the document:\n\n**Figure 5**: \"Output norm and training loss curves for Chameleon models under various settings\"\n- (a) Shows uncontrolled growth of output norms\n- (b) Shows \"An ablation with Chameleon-7B with and without QK-Norm\" - this shows training loss with a divergence/spike\n- (c) Shows \"An ablation with Chameleon-7B with and without dropout\" - this shows training loss with divergences/spikes\n\n**Figure 6**: \"Training loss curves for Chameleon models under various settings\"\n- (a) Shows \"Training loss curve with image generation disabled does not suffer from instability issues\" - smooth curves, no spikes\n- (b) Shows \"For Chameleon-34B, using dropout does not fix divergences, both with and without reordering\" - this shows loss spikes/divergences\n\nCounting subplots with obvious loss spikes:\n- Figure 5(b): 1 subplot with loss spike\n- Figure 5(c): 1 subplot with loss spike  \n- Figure 6(b): 1 subplot with loss spikes\n\nTotal: 3 subplots\n\nAnswer: 3", "citations": [], "input_tokens": 26080, "output_tokens": 301, "cost_micros": 82755, "latency_ms": 9171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find figures that show loss spikes in the document.\n\nLooking through the document for figures showing training loss or related metrics:\n\n**Figure 5: \"Output norm and training loss curves for Chameleon models under various settings\"**\nThis figure has three subplots:\n- (a) Shows output norms with curves that spike/diverge\n- (b) Shows training loss with and without QK-Norm - one curve diverges/spikes\n- (c) Shows training loss with and without dropout - one curve appears to spike\n\n**Figure 6: \"Training loss curves for Chameleon models under various settings\"**\nThis figure has three subplots:\n- (a) Shows smooth training curves for 7B and 34B models - no obvious spikes\n- (b) Shows training loss without image generation - smooth curve, no spikes\n- (c) Shows training with/without norm reordering where one diverges - this shows a spike\n\nSo counting subplots with obvious loss spikes:\n- Figure 5(a): 1 (shows divergence/spikes in output norms)\n- Figure 5(b): 1 (shows divergence in loss)\n- Figure 5(c): 1 (shows divergence in loss)\n- Figure 6(c): 1 (shows divergence)\n\nTotal: 4 subplots show obvious spikes/divergences\n\nAnswer: 4", "citations": [], "input_tokens": 32131, "output_tokens": 324, "cost_micros": 101253, "latency_ms": 9125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about subplots related to obvious loss spikes.Let me search for content related to \"loss spikes\" in the document:Let me search for \"subplot\":Let me read through the document more systematically to look for figures and their descriptions:Now let me look at Figure 5 and Figure 6 which seem to show training loss curves. Let me read those sections:Now let me look more closely at Figure 6 which appears to have subplots. Let me search for any reference to \"loss spikies\" or similar terms:Let me look at Figure 5 more carefully, as it shows \"Output norm and training loss curves for Chameleon models under various settings\":I can see Figure 5 has three subplots (a, b, c). Now let me check the figure caption more carefully and look at the actual charts. The caption says \"Figure 5 Output norm and training loss curves for Chameleon models under various settings.\" Let me read the text more carefully to understand what \"obvious loss spikies\" might refer to. Let me search for variations:Let me look at the entire document more systematically. Let me check around Figure 5 and 6 descriptions:Let me look more carefully at the whole document. Maybe \"spikies\" is a colloquial term in the figure or caption. Let me check the full text around Figures 5 and 6:So Figure 5 has subplots (a), (b), and (c). Let me check Figure 6:So Figure 6 also has subplots (a), (b), and (c). Now let me look at the actual chart data more carefully. The question asks about \"obvious loss spikies\" which likely refers to spikes in the loss curves. Let me look at the loss curve data more closely. Let me search the document for any terms that might be misspelled or colloquial:Based on my reading of the document, I can see that:\n\n**Figure 5** has three subplots:\n- (a) Output norms plot showing growth over training steps\n- (b) Training loss with and without QK-Norm\n- (c) Training loss with and without dropout\n\n**Figure 6** has three subplots:\n- (a) Training curves for 600k steps for Chameleon-7B and Chameleon-34B\n- (b) Training loss curve with image generation disabled\n- (c) For Chameleon-34B, using dropout comparison with and without norm-reordering\n\nLooking at the descriptions and the actual chart data in the text, the question asks about \"obvious loss spikies\" (spikes). From examining the text descriptions:\n\n- Figure 5(a) shows \"Uncontrolled growth of output norms\" - this is about norms, not loss spikes\n- Figure 5(b) shows training loss curves with the one \"w/o QK-norm\" showing a divergence\n- Figure 5(c) shows training loss curves with and without dropout - the \"w/o dropout\" curve appears to have issues\n- Figure 6(a) shows smooth training curves for both 7B and 34B models\n- Figure 6(b) shows a smooth training loss curve without image generation\n- Figure 6(c) shows curves where \"w/o norm reordering\" appears to diverge/spike\n\nLooking at the subplot captions that mention instability or divergences, which would correspond to loss spikes:\n- Figure 5(b): \"w/o QK-norm\" diverges\n- Figure 5(c): \"w/o dropout\" has spikes\n- Figure 6(c): \"w/o norm reordering\" has divergences/spikes\n\nThat would be 3 subplots showing obvious loss spikes.\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 88405, "error": null, "extra": {"thread_id": 2057, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d21eabfb0c2b44d69030354b180154e2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 635, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 15734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2405.09818v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which shows a \"Summary of core architecture and optimization decisions made in Chameleon in contrast to LLaMa-1 and LLaMa-2\", I can see that:\n\n- LLaMa-1 7B has an \"\u00d7\" (cross mark) in the Qknorm column\n- LLaMa-1 33B has an \"\u00d7\" (cross mark) in the Qknorm column\n\nThis indicates that LLaMa-1 does NOT apply QKnorm.\n\nAnswer: no", "citations": [], "input_tokens": 25087, "output_tokens": 130, "cost_micros": 77211, "latency_ms": 5343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which compares core architecture and optimization decisions across models:\n\nFor LLaMa-1 models (both 7B and 33B), the Qknorm column shows \"\u00d7\" (which indicates no).\n\nAnswer: no", "citations": [], "input_tokens": 34456, "output_tokens": 63, "cost_micros": 104313, "latency_ms": 4297, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which summarizes the core architecture and optimization decisions:\n\nFor LLaMa-1 models (both 7B and 33B), the \"Qknorm\" column shows \"\u00d7\", which indicates that QK-Norm was not applied.\n\nAnswer: No", "citations": [], "input_tokens": 26096, "output_tokens": 71, "cost_micros": 79353, "latency_ms": 4531, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which shows a comparison of core architecture and optimization decisions:\n\nFor LLaMa-1 (both 7B and 33B models), the \"Qknorm\" column shows \"\u00d7\", which indicates that query-key normalization (QK-Norm) was **not** applied.\n\nAnswer: No", "citations": [], "input_tokens": 32147, "output_tokens": 81, "cost_micros": 97656, "latency_ms": 4671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about whether llama-1 applies qknorm.Let me search for \"qknorm\" and \"llama\" in the document to find relevant information.Let me read the table area that contains information about qknorm and llama-1:Perfect! I found the table that shows whether llama-1 applies qknorm. In the table, for both LLaMa-1 models (7B and 33B), the Qknorm column shows \"\u00d7 \u2612\" which indicates \"no\".\n\nAnswer: no", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 30905, "error": null, "extra": {"thread_id": 2058, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2c3221de2294418da7d44837e6c67391", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 146, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b in the document, which shows \"An ablation with Chameleon-7B with and without QK-Norm\", I can see the training loss curves.\n\nThe curve labeled \"w/o QK-norm\" (without QK-norm) shows a divergence/spike in the loss after approximately 20% of training progress, which appears to occur around step 20,000-25,000 based on the x-axis scale shown in the figure.\n\nAnswer: Around step 20,000-25,000", "citations": [], "input_tokens": 25078, "output_tokens": 122, "cost_micros": 77064, "latency_ms": 6000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "around step 20 000 25 000", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b, which shows \"An ablation with Chameleon-7B with and without QK-Norm,\" I can see the training loss curves for both conditions.\n\nThe curve labeled \"w/o QK-norm\" (without QK-Norm) shows the training progressing normally until it experiences a sudden spike and divergence.\n\nAnswer: Around 125k steps", "citations": [], "input_tokens": 34447, "output_tokens": 89, "cost_micros": 104676, "latency_ms": 5452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "around 125k steps", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b in the document, which shows \"An ablation with Chameleon-7B with and without QK-Norm,\" I can see the training loss curves comparing the two conditions.\n\nThe figure shows that the model without QK-Norm diverges (loss spikes) after approximately 20% of training progress.\n\nAnswer: After approximately 20% of training", "citations": [], "input_tokens": 26087, "output_tokens": 87, "cost_micros": 79566, "latency_ms": 5046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "after approximately 20 of training", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b, which shows \"An ablation with Chameleon-7B with and without QK-Norm,\" I can see the training loss curves for models with and without QK-norm.\n\nThe graph shows that the model without QK-norm or dropout (red line) diverges after approximately 20k-30k steps, where the loss spikes dramatically upward.\n\nAnswer: Around 20k-30k steps", "citations": [], "input_tokens": 32138, "output_tokens": 100, "cost_micros": 97914, "latency_ms": 5485, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "around 20k 30k steps", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about loss spikes when without QK-norm.Let me search for \"QK-norm\" in the document to find the relevant information.Now let me read the specific sections that mention loss spikes and QK-norm. Let me look at the chart/figure around lines 1028-1076 which seems to show training loss curves.I can see a figure showing training loss with and without QK-norm. The x-axis shows steps from 0k to 175k (approximately). Let me look more carefully at the surrounding text to understand where the spike occurs. Let me check the text before and after this figure.Let me read more to find the caption or description of this figure:Good! Now I can see that figure (b) shows training loss with and without QK-Norm. Let me look back at the chart more carefully to understand where the spike happens. Let me check the text around lines 938-1000 to see if there's more description:Now I need to look at the actual description of figure 5b more carefully. Let me read around the text that discusses the training loss spike:Perfect! I found the answer. The text clearly states:\n\n\"In Figure 5b, we show training loss curves for Chameleon-7B with and without QK-Norm, and the latter diverges after approximately 20% of a training epoch.\"\n\nLet me look at the figure again to verify the step count. Looking at the figure data from earlier, the x-axis goes from 0k to approximately 175k steps. The text mentions \"after approximately 20% of a training epoch\" for the divergence/spike.\n\nHowever, let me look more carefully at what specific step number might correspond to this. Let me check if there's more specific information:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 73108, "error": null, "extra": {"thread_id": 2059, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c22c13a6350947eda6562103bdc5468c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 343, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "however let me look more carefully at what specific step number might correspond to this let me check if there s more specific information", "normalised_gold": "150k"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "I need to find who verified and signed the complaint by looking at the verification section of the document.\n\nLooking at page 16 of the document, I can see the \"VERIFICATION\" section which states:\n\n\"I, Marc Silski, am a Special Agent of the Federal Bureau of Investigation (\"FBI\"). I have read the foregoing Complaint for Forfeiture, and declare under penalty of perjury of the laws of the United States of America that the facts contained therein are true to the best of my knowledge and belief, based upon knowledge possessed by me and/or on information received from other law enforcement agents.\"\n\nThis is followed by a signature and \"Special Agent Marc Silski, Federal Bureau of Investigation\" with the date \"September 6, 2018\".\n\nAnswer: Special Agent Marc Silski", "citations": [], "input_tokens": 25011, "output_tokens": 176, "cost_micros": 77673, "latency_ms": 14359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "special agent marc silski", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the verification section at the end of the document (PageID.16 Page 16 of 16), I can see who verified and signed the complaint.\n\nAnswer: Special Agent Marc Silski, Federal Bureau of Investigation", "citations": [], "input_tokens": 5579, "output_tokens": 52, "cost_micros": 17517, "latency_ms": 3594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "special agent marc silski federal bureau of investigation", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "According to the verification section at the end of the document, the officer who verified and signed the complaint was:\n\nAnswer: Special Agent Marc Silski", "citations": [], "input_tokens": 6098, "output_tokens": 34, "cost_micros": 18804, "latency_ms": 3062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "special agent marc silski", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the verification section at the end of the document (page 16):\n\nThe verification states:\n\"I, Marc Silski, am a Special Agent of the Federal Bureau of Investigation (\"FBI\"). I have read the foregoing Complaint for Forfeiture, and declare under penalty of perjury...\"\n\nIt is signed by:\n\"Special Agent Marc Silski\nFederal Bureau of Investigation\nDated: September 6, 2018\"\n\nAnswer: Special Agent Marc Silski", "citations": [], "input_tokens": 5228, "output_tokens": 109, "cost_micros": 17319, "latency_ms": 3780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "special agent marc silski", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the verification section at the end of the document (page 16), I can see who verified and signed the complaint.\n\nAnswer: Special Agent Marc Silski, Federal Bureau of Investigation", "citations": [], "input_tokens": 5778, "output_tokens": 43, "cost_micros": 17979, "latency_ms": 2312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "special agent marc silski federal bureau of investigation", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find which officer verified and signed the complaint.Based on the document, I found the verification section on page 16. The officer who verified and signed the complaint is:\n\nAnswer: Special Agent Marc Silski, Federal Bureau of Investigation", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 23546, "error": null, "extra": {"thread_id": 2060, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_890676fb4dc0465f94bad1a9648bd51a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 82, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "special agent marc silski federal bureau of investigation", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "I need to find all the checks issued to Mont Blanc company mentioned in the document.\n\nLooking through the document:\n\nOn page 11 (PageID.11), paragraph e states:\n\"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nOn page 11 (PageID.11), paragraph f states:\n\"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nThese are the two checks issued to Mont Blanc company.\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 25031, "output_tokens": 170, "cost_micros": 77643, "latency_ms": 10219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about checks issued to Mont Blanc:\n\nFrom paragraph 32(e): \"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nFrom paragraph 32(f): \"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 5599, "output_tokens": 129, "cost_micros": 18732, "latency_ms": 3389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for checks issued to Mont Blanc:\n\nFrom paragraph 32, subsections e and f:\n\n1. Check number 150301 issued in September 2012: $35,000\n2. Check number 151545 issued in February 2013: $40,684\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 6118, "output_tokens": 79, "cost_micros": 19539, "latency_ms": 3640, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document for checks issued to Mont Blanc company:\n\nFrom paragraph 32.e: \"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nFrom paragraph 32.f: \"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 5248, "output_tokens": 128, "cost_micros": 17664, "latency_ms": 3969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for checks issued to Mont Blanc:\n\nFrom paragraph 32(e): \"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nFrom paragraph 32(f): \"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 5798, "output_tokens": 127, "cost_micros": 19299, "latency_ms": 3592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about checks issued to Mont Blanc company.Let me continue reading the document to find information about checks issued to Mont Blanc company.Let me continue reading to find information about checks issued to Mont Blanc.Perfect! I found the information about checks issued to Mont Blanc. Let me verify by reading a bit more to ensure I have all the check amounts.\n\nBased on the document, I found the following information about checks issued to Mont Blanc:\n\n1. Check number 150301 - $35,000 (issued in September 2012)\n2. Check number 151545 - $40,684 (issued in February 2013)\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32530, "error": null, "extra": {"thread_id": 2061, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5f1b7e2037a34d839cda54ca371fcea3", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 134, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the CA Bar number in the document.\n\nLooking at page 15, I can see the signature page where it states:\n\nADRIANA DYDELL\nAssistant United States Attorney\n211 W. Fort Street, Suite 2001\nDetroit, Michigan 48226\n(313) 226-9125\nadriana.dydell@usdoj.gov\n(CA. Bar No. 239516)\n\nAnswer: 239516", "citations": [], "input_tokens": 25014, "output_tokens": 103, "cost_micros": 76587, "latency_ms": 10625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically at page 15 (PageID.15), I can find the CA Bar number listed for Assistant United States Attorney Adriana Dydell.\n\nAnswer: 239516", "citations": [], "input_tokens": 5582, "output_tokens": 47, "cost_micros": 17451, "latency_ms": 3062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for the CA Bar number...\n\nAt the bottom of page 15, in the signature block for the Assistant United States Attorney, I can see:\n\nADRIANA DYDELL\nAssistant United States Attorney\n211 W. Fort Street, Suite 2001\nDetroit, Michigan 48226\n(313) 226-9125\nadriana.dydell@usdoj.gov\n(CA. Bar No. 239516)\n\nAnswer: 239516", "citations": [], "input_tokens": 6101, "output_tokens": 108, "cost_micros": 19923, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for the CA Bar number...\n\nAt the bottom of page 15, the signature block shows:\n\nADRIANA DYDELL\nAssistant United States Attorney\n211 W. Fort Street, Suite 2001\nDetroit, Michigan 48226\n(313) 226-9125\nDated: September, 2018\nadriana.dydell@usdoj.gov\n(CA. Bar No. 239516)\n\nAnswer: 239516", "citations": [], "input_tokens": 5231, "output_tokens": 107, "cost_micros": 17298, "latency_ms": 4078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I can find the CA Bar number at the bottom of page 15, where it states:\n\n\"Dated: September 13, 2018 (CA. Bar No. 239516)\"\n\nThis appears next to the signature block for Adriana Dydell, Assistant United States Attorney.\n\nAnswer: 239516", "citations": [], "input_tokens": 5781, "output_tokens": 78, "cost_micros": 18513, "latency_ms": 3358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find the CA Bar number.Answer: 239516", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 35797, "error": null, "extra": {"thread_id": 2062, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_67c2db9144b348b6ba61d0ad2f21abaf", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 115, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I'll examine each page to identify where information has been blacked out (redacted).\n\nPage 1: No redactions visible\nPage 2: No redactions visible\nPage 3: No redactions visible\nPage 4: No redactions visible\nPage 5: No redactions visible\nPage 6: No redactions visible\nPage 7: No redactions visible\nPage 8: No redactions visible\nPage 9: No redactions visible\nPage 10: No redactions visible\nPage 11: No redactions visible\nPage 12: No redactions visible\nPage 13: Contains multiple black redaction boxes blocking out names and information (in paragraph l and m)\nPage 14: Contains black redaction boxes blocking out information (in paragraph n)\nPage 15: No redactions visible\nPage 16: No redactions visible\n\nAnswer: 13, 14", "citations": [], "input_tokens": 25034, "output_tokens": 196, "cost_micros": 78042, "latency_ms": 11015, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I need to carefully examine the document to identify which pages have information that has been blacked out or redacted.\n\nLooking through the document:\n\n- Page 1 (PageID.1): No visible redactions\n- Page 2 (PageID.2): No visible redactions\n- Page 3 (PageID.3): No visible redactions\n- Page 4 (PageID.4): No visible redactions\n- Page 5 (PageID.5): No visible redactions\n- Page 6 (PageID.6): No visible redactions\n- Page 7 (PageID.7): No visible redactions\n- Page 8 (PageID.8): No visible redactions\n- Page 9 (PageID.9): No visible redactions\n- Page 10 (PageID.10): No visible redactions\n- Page 11 (PageID.11): No visible redactions\n- Page 12 (PageID.12): No visible redactions\n- Page 13 (PageID.13): In paragraph 1, there is redacted text after \"the last names of Iacobelli and\" and in paragraph m, there are multiple redactions of names\n- Page 14 (PageID.14): In paragraph n, there are redacted names at the beginning and within the text\n- Page 15 (PageID.15): No visible redactions\n- Page 16 (PageID.16): No visible redactions\n\nAnswer: 13, 14", "citations": [], "input_tokens": 5602, "output_tokens": 339, "cost_micros": 21891, "latency_ms": 5812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "Looking through the document for redacted/blacked out information:\n\nOn page 14 (PageID.14), paragraph m, there are multiple instances of redacted information indicated by blank spaces where names should be:\n- \"was interviewed by agents.\"\n- \"was a former co-worker\"\n- \"During the interview,\"\n- \"informed agents that Iacobelli bragged to her about a Mont Blanc pen that he carried in his shirt pocket costing $25,000. Iacobelli also informed that\"\n\nOn page 14 (PageID.14), paragraph n, there is also redacted information:\n- \"On July 31, 2017, agents contacted who stated that\"\n- \"he was in possession of the Lincoln commemorative Mont Blanc pen. added that he ordered the pen, but that Iacobelli paid for both and\"\n\nPage 13 (PageID.13), paragraph l mentions \"having the last names of Iacobelli and\" followed by what appears to be a redaction.\n\nAnswer: 13, 14", "citations": [], "input_tokens": 6121, "output_tokens": 238, "cost_micros": 21933, "latency_ms": 6703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I need to carefully review the document to identify which pages contain redacted or blacked out information.\n\nLooking through the document:\n\n- Page 2: No redactions visible\n- Page 3: No redactions visible\n- Page 4: No redactions visible\n- Page 5: No redactions visible\n- Page 6: No redactions visible\n- Page 7: No redactions visible\n- Page 8: No redactions visible\n- Page 9: No redactions visible\n- Page 10: No redactions visible\n- Page 11: No redactions visible\n- Page 12: No redactions visible\n- Page 13: Contains redacted/blacked out information (names are replaced with blank spaces/lines in sections l and m)\n- Page 14: Contains redacted/blacked out information (names are replaced with blank spaces/lines in section n)\n- Page 15: No redactions visible\n- Page 16: No redactions visible\n\nThe pages with blacked out information are pages 13 and 14, where personal names have been redacted.\n\nAnswer: 13, 14", "citations": [], "input_tokens": 5251, "output_tokens": 248, "cost_micros": 19473, "latency_ms": 5733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I need to carefully examine the document to identify which pages contain redacted or blacked out information.\n\nLooking through the document:\n\n- Pages 1-12: No redactions visible\n- Page 13 (PageID.13): Contains multiple redactions marked with \"\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\" or similar black bars in paragraph m and n. Specifically:\n  - \"former co-worker \u2588\u2588\u2588\u2588\u2588\u2588\" \n  - Multiple instances of \"\u2588\u2588\u2588\u2588\u2588\u2588\" referring to a person's name\n  - \"podiatrist named \u2588\u2588\u2588\u2588\u2588\u2588\"\n  - \"[\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588]\" in paragraph n\n  - \"[\u2588\u2588\u2588\u2588]\" multiple times in paragraph n\n\n- Page 14 (PageID.14): No redactions visible\n- Pages 15-16: No redactions visible\n\nThe redactions appear only on page 13 of the document (which corresponds to PageID.13, Page 13 of 16).\n\nAnswer: 13", "citations": [], "input_tokens": 5801, "output_tokens": 206, "cost_micros": 20493, "latency_ms": 6671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "13", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to identify pages with blacked out information.Based on my review of the document, I can identify pages with redacted (blacked out) information:\n\n- **Page 13**: Contains multiple black redaction bars in paragraphs l and m, obscuring names and identifying information. The visual description explicitly notes \"Multiple horizontal black bars throughout the document\" and \"Redacted sections appear in paragraphs l and m.\"\n\n- **Page 14**: Contains three black redaction blocks in paragraph n, obscuring names/identifying information. The visual description notes \"Contains redacted sections shown as solid black rectangular blocks\" and \"Multiple solid black rectangular blocks throughout paragraph n.\"\n\nAnswer: 13, 14", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37265, "error": null, "extra": {"thread_id": 2063, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5190eed28c9d49e68875e654ed7fc489", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 163, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to find manually written numbers on pages 15 and 16 of this document.\n\nLooking at page 15 (PageID.15):\nI can see a handwritten date that appears to be \"September 12, 2018\" where \"12\" is written manually.\n\nLooking at page 16 (PageID.16):\nI can see a handwritten date that appears to be \"September 6, 2018\" where \"6\" is written manually.\n\nThe manually written numbers are:\n- Page 15: 12\n- Page 16: 6\n\nSum: 12 + 6 = 18\n\nAnswer: 18", "citations": [], "input_tokens": 25026, "output_tokens": 148, "cost_micros": 77298, "latency_ms": 8937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to find manually written numbers on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\n- The date shows \"September 3 2018\" - there appears to be a handwritten \"3\" in the date field\n\nLooking at page 16 (PageID.16):\n- The date shows \"September 6 2018\" - there appears to be a handwritten \"6\" in the date field\n\nThe sum of the manually written numbers is: 3 + 6 = 9\n\nAnswer: 9", "citations": [], "input_tokens": 5594, "output_tokens": 130, "cost_micros": 18732, "latency_ms": 3875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to find manually written numbers on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\nThe date shows \"September 13 2018\" where \"13\" appears to be handwritten.\n\nLooking at page 16 (PageID.16):\nThe date shows \"September 6, 2018\" where \"6\" appears to be handwritten.\n\nSum: 13 + 6 = 19\n\nAnswer: 19", "citations": [], "input_tokens": 6113, "output_tokens": 112, "cost_micros": 20019, "latency_ms": 3219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "19", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine pages 15 and 16 of the document to find manually written numbers.\n\nLooking at page 15:\n- The page header shows \"Case 2:18-cv-12857-DML-RSW ECF No. 1 filed 09/13/18 PageID.15 Page 15 of 16\"\n- The page contains typed text including \"Respectfully submitted,\" signature block for Matthew Schneider and Adriana Dydell\n- At the bottom it shows \"Dated: September, 2018\"\n- The number \"15\" appears at the bottom of the page\n\nLooking at page 16:\n- The page header shows \"Case 2:18-cv-12857-DML-RSW ECF No. 1 filed 09/13/18 PageID.16 Page 16 of 16\"\n- Contains a verification section signed by Marc Silski\n- At the bottom it shows \"Dated: September 6, 2018\"\n- The number \"16\" appears at the bottom of the page\n\nThe manually written/handwritten numbers visible on these pages are:\n- Page 15: The page number \"15\" at the bottom\n- Page 16: The page number \"16\" at the bottom\n\nSum: 15 + 16 = 31\n\nAnswer: 31", "citations": [], "input_tokens": 5243, "output_tokens": 307, "cost_micros": 20334, "latency_ms": 6765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "31", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify manually written numbers on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\n- The date \"September 13, 2018\" appears to have \"13\" that could be handwritten, but examining the format, it appears to be typed text, not manually written.\n\nLooking at page 16 (PageID.16):\n- The date \"September 6, 2018\" appears, where \"6\" could potentially be handwritten.\n- There is a signature line with \"Special Agent Marc Silski\" and a signature.\n\nHowever, upon closer examination of the document format, these appear to be typed dates in the standard court document format. The document shows underlined spaces where information was filled in, but the actual numbers \"13\" on page 15 and \"6\" on page 16 appear in the standard typed format of the document.\n\nLooking more carefully, the date on page 15 shows \"September <u>13</u>, 2018\" where \"13\" appears to be typed/filled in on an underlined space, and on page 16 \"September 6, 2018\" where \"6\" appears typed.\n\nIf these are considered the manually written/filled-in numbers:\n- Page 15: 13\n- Page 16: 6\n- Sum: 13 + 6 = 19\n\nAnswer: 19", "citations": [], "input_tokens": 5793, "output_tokens": 316, "cost_micros": 22119, "latency_ms": 7969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "19", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I'll help you find the manually written numbers on pages 15 and 16 of the document. Let me first read the document that was mentioned in the context.Let me continue reading to find pages 15 and 16. I need to search through the document to locate these specific pages.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 27906, "error": null, "extra": {"thread_id": 2064, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3dc2a747e29a4ced83b1f3d763f972af", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 79, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to analyze the document to identify all the colors present in it.\n\nLooking at the document pages provided:\n- The text appears in black\n- The background is white\n- There are blue hyperlinks/case numbers at the top of each page\n- There are black redaction marks/blocks in some areas (pages 13-14)\n\nThe colors present in this document are:\n- Black (text and redactions)\n- Blue (case numbers and headers)\n- White (background)\n\nAnswer: black, blue, white", "citations": [], "input_tokens": 25038, "output_tokens": 116, "cost_micros": 76854, "latency_ms": 10546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "black, blue, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify all colors mentioned in this legal document.\n\nScanning through the document for color references:\n\n1. Page 11, paragraph d: \"red 2013 Ferrari 458 Spider convertible\"\n2. Page 12, paragraph j: mentions \"gold\" (portrait in gold) and \"sapphire\" (which is blue)\n\nLet me verify these are the only color mentions:\n- \"red\" - explicitly mentioned for the Ferrari\n- \"gold\" - mentioned for Lincoln's portrait on the pen clip\n- The sapphire mentioned would typically be blue, but the document says \"sapphire\" not \"blue\"\n\nLooking more carefully at the text:\n- \"red 2013 Ferrari\" (paragraph 32d)\n- \"gold\" (paragraph 32j - \"Lincoln's portrait in gold\")\n- \"sapphire\" (paragraph 32j - \"has a sapphire\")\n\nThe colors explicitly stated are:\n- gold\n- red\n\nAlphabetically ordered:\n\nAnswer: gold, red", "citations": [], "input_tokens": 5606, "output_tokens": 228, "cost_micros": 20238, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "gold, red", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine the document to identify all colors mentioned or visible in it.\n\nLooking through the document:\n\n1. The document itself appears to be a black and white legal document (black text on white background)\n2. In paragraph 32(d), a \"red 2013 Ferrari 458 Spider convertible\" is mentioned\n3. In paragraph 32(j), \"gold\" is mentioned (describing Lincoln's portrait in gold on the pen clip)\n4. In paragraph 32(j), \"sapphire\" is mentioned (the pen's clip has a sapphire)\n\nColors explicitly mentioned in the text:\n- Gold\n- Red\n- Sapphire (which is blue)\n\nColors of the document format itself:\n- Black (text)\n- White (background)\n\nListing all colors alphabetically:\n- Black\n- Blue (sapphire)\n- Gold\n- Red\n- White\n\nAnswer: black, blue, gold, red, white", "citations": [], "input_tokens": 6125, "output_tokens": 214, "cost_micros": 21585, "latency_ms": 5405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7499999999999999, "method": "list_set", "normalised_pred": "black, blue, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify all colors mentioned in this document.\n\nLet me search through the document for any color references:\n\n1. In paragraph 32.a, there is mention of \"a red 2013 Ferrari 458 Spider convertible\"\n2. In paragraph 32.j, there is mention of \"gold\" (describing Lincoln's portrait on the pen clip)\n3. The document is a legal court document that appears to be in black text on white paper (standard document format)\n\nThe colors explicitly mentioned in the text are:\n- Gold (mentioned in the description of the pen)\n- Red (mentioned in the description of the Ferrari)\n\nSince this is a standard black and white legal document with black text, and the question asks for colors in the document, I should include the standard document colors as well as the colors mentioned in the content.\n\nColors found:\n- Black (text)\n- Gold (pen description)\n- Red (Ferrari description)\n- White (paper/background)\n\nIn alphabetical order: black, gold, red, white\n\nAnswer: black, gold, red, white", "citations": [], "input_tokens": 5255, "output_tokens": 238, "cost_micros": 19335, "latency_ms": 7875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5714285714285715, "method": "list_set", "normalised_pred": "black, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify all colors mentioned in the document.\n\nGoing through the document carefully:\n\n1. Page 10, paragraph d: \"red 2013 Ferrari 458 Spider convertible\" - mentions **red**\n2. Page 12, paragraph j: \"sapphire\" (referring to a sapphire on the pen clip) - **blue** (sapphire is blue)\n3. Page 12, paragraph j: \"gold\" (portrait in gold on the pen's clip) - mentions **gold**\n\nLooking at the document format itself, this appears to be a standard black text on white background legal document, so:\n- **black** (text)\n- **white** (background)\n\nAlphabetically ordered:\n- black\n- blue\n- gold\n- red\n- white\n\nAnswer: black, blue, gold, red, white", "citations": [], "input_tokens": 5805, "output_tokens": 189, "cost_micros": 20250, "latency_ms": 6407, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7499999999999999, "method": "list_set", "normalised_pred": "black, blue, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to identify all the colors present.Based on my careful reading of the document, I can identify the following colors mentioned or described:\n\nFrom the visual descriptions and content:\n- **Black** - mentioned multiple times in visual descriptions (\"black text\", \"black bars\", \"black rectangular blocks\" for redactions)\n- **Blue** - mentioned in the visual description (\"Blue text displaying case number\")\n- **Gold** - mentioned in the pen description (\"Lincoln's portrait in gold\")\n- **Red** - mentioned for the Ferrari (\"red 2013 Ferrari 458 Spider\")\n- **White** - mentioned in visual descriptions (\"white background\")\n\nThere's also a **sapphire** mentioned on the pen clip, which is typically blue, but since \"blue\" is already listed separately and sapphire is a gemstone rather than a color descriptor in this context, I'll include it as a separate item if we're being comprehensive about color-related terms.\n\nHowever, looking at the actual color words used as descriptors in the document:\n\nAnswer: black, blue, gold, red, white", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37108, "error": null, "extra": {"thread_id": 2065, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8c1d9833eaf14e8eae921556f7ecfecf", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 186, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.7499999999999999, "method": "list_set", "normalised_pred": "black, blue, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\nI can see one signature above the typed name \"ADRIANA DYDELL, Assistant United States Attorney\"\n\nLooking at page 16 (PageID.16):\nI can see one signature above the typed name \"Special Agent Marc Silski, Federal Bureau of Investigation\"\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 25021, "output_tokens": 122, "cost_micros": 76893, "latency_ms": 8171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\nI can see one signature above the typed name \"ADRIANA DYDELL\" (Assistant United States Attorney).\n\nLooking at page 16 (PageID.16):\nI can see one signature above the typed name \"Special Agent Marc Silski\" (Federal Bureau of Investigation).\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 5589, "output_tokens": 124, "cost_micros": 18627, "latency_ms": 3781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\nLooking at page 15:\n- There is one signature above the typed name \"ADRIANA DYDELL\"\n\nLooking at page 16:\n- There is one signature above the typed name \"Special Agent Marc Silski\"\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 6108, "output_tokens": 100, "cost_micros": 19824, "latency_ms": 3750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures that appear on pages 15 and 16 of the document.\n\nLooking at page 15:\n- This page shows the document dated September, 2018\n- It has the names \"MATTHEW SCHNEIDER\" and \"ADRIANA DYDELL\" typed, but these appear to be typed names as part of the document format, not actual signatures\n- I do not see any actual handwritten or image signatures on page 15\n\nLooking at page 16:\n- This is the VERIFICATION page\n- It shows \"Special Agent Marc Silski\" and \"Federal Bureau of Investigation\" with a date of \"September 6, 2018\"\n- This also appears to be typed text rather than an actual signature image\n\nThe question asks for signatures \"appeared\" on these pages. Looking carefully at both pages, I can see typed names but no visible handwritten signatures or signature images in the document.\n\nHowever, these are signature blocks where signatures would typically appear. The document shows:\n- Page 15: 1 signature block (for Adriana Dydell)\n- Page 16: 1 signature block (for Marc Silski)\n\nTotal signature blocks: 2\n\nSince the question asks for the count in \"float format\" but then requests \"a single integer only,\" and given that there are 2 signature blocks present on these pages combined:\n\nAnswer: 2", "citations": [], "input_tokens": 5238, "output_tokens": 308, "cost_micros": 20334, "latency_ms": 8796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\n**Page 15 (PageID.15):**\nLooking at page 15, I can see:\n1. A signature above \"ADRIANA DYDELL\" - there is an underlined signature line with \"Adriana Dydell\" written above it\n\nSo there is 1 signature on page 15.\n\n**Page 16 (PageID.16):**\nLooking at page 16, I can see:\n1. A signature notation \"[Signature of Marc Silski]\" above \"Special Agent Marc Silski\"\n\nSo there is 1 signature on page 16.\n\n**Total signatures on pages 15 and 16:** 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 5788, "output_tokens": 185, "cost_micros": 20139, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find the signatures on pages 15 and 16.Let me search for content related to pages 15 and 16 by reading more of the document:Let me read the entire document to find pages 15 and 16:Great! I found the relevant sections. Let me look more carefully at the content around pages 15 and 16. I can see:\n\n- Line 616: `<!-- PageNumber=\"15\" -->`\n- Line 637: `<!-- PageNumber=\"16\" -->`\n\nNow let me examine the signatures on these pages. From what I can see:\n\n**Page 15** (lines 590-618): Shows signatures for:\n1. Matthew Schneider (United States Attorney) \n2. Adriana Dydell (Assistant United States Attorney)\n\n**Page 16** (lines 619-638): Shows signature for:\n1. Marc Silski (Special Agent, Federal Bureau of Investigation)\n\nLet me read the section more carefully to confirm:Perfect! Now I can clearly see the signatures on both pages:\n\n**Page 15** (lines 596-616):\n- Shows \"Respectfully submitted,\" followed by:\n  1. MATTHEW SCHNEIDER (United States Attorney)\n  2. \"Adriana@iol\" (appears to be a signature)\n  3. ADRIANA DYDELL (Assistant United States Attorney)\n- The \"Adriana@iol\" on line 603 appears to be a signature representation\n- Dated: September 13, 2018\n\n**Page 16** (lines 619-637):\n- Shows VERIFICATION section with:\n  1. \"Josh,\" on line 631 (appears to be a signature)\n  2. Special Agent Marc Silski (Federal Bureau of Investigation)\n- Dated: September 6, 2018\n\nCounting the signatures:\n- Page 15: 1 signature (the \"Adriana@iol\" signature for Adriana Dydell)\n- Page 16: 1 signature (the \"Josh,\" signature for Marc Silski)\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42092, "error": null, "extra": {"thread_id": 2066, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ddcf7e386683499793cc579d23850dce", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 267, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "According to the document, the telephone number for The Limes Residential Home is:\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 37738, "output_tokens": 30, "cost_micros": 113664, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header on the first page, I can find the contact information for The Limes Residential Home.\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 10257, "output_tokens": 37, "cost_micros": 31326, "latency_ms": 4000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header on the first page, I can find the contact information for The Limes Residential Home.\n\nThe address and contact details are:\n43 Foreland Road\nBembridge\nIsle of Wight\nPO35 5XN\nTel: 01983 873655\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 12045, "output_tokens": 79, "cost_micros": 37320, "latency_ms": 3766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header, I can find the contact information for The Limes Residential Home.\n\nThe address block shows:\n43 Foreland Road\nBembridge\nIsle of Wight\nPO35 5XN\nTel: 01983 873655\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 10599, "output_tokens": 73, "cost_micros": 32892, "latency_ms": 6344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header, I can find the contact information for The Limes Residential Home.\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 10834, "output_tokens": 33, "cost_micros": 32997, "latency_ms": 3625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the telephone number for The Limes Residential Home.Let me search for \"The Limes Residential Home\" in the document:Let me search with more context to see the actual content:I can see from the search results that the telephone number appears on line 168. Let me read that section:Perfect! I found the telephone number for The Limes Residential Home on line 174.\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34734, "error": null, "extra": {"thread_id": 2067, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_de5d9004414b47159eff08bc68030dbf", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 151, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically at page 3 which contains the section \"The five questions we ask about services and what we found\", I can identify the primary questions asked in this inspection report.\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 37742, "output_tokens": 73, "cost_micros": 114321, "latency_ms": 7515, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I can identify the five primary questions that were asked about the services in this inspection report.\n\nThese questions are clearly listed in the \"Summary of findings\" section under \"The five questions we ask about services and what we found\":\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 10261, "output_tokens": 84, "cost_micros": 32043, "latency_ms": 3844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically the \"Summary of findings\" section which contains \"The five questions we ask about services and what we found\", I can identify the primary questions asked about the services.\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 12049, "output_tokens": 72, "cost_micros": 37227, "latency_ms": 4234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I can see the \"five questions we ask about services\" section which lists the primary questions asked during the inspection.\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 10603, "output_tokens": 61, "cost_micros": 32724, "latency_ms": 3125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document, I can identify the five primary questions that were consistently asked about the services throughout the inspection report.\n\nThese questions appear in the \"Summary of findings\" section and structure the entire report, with each having its own detailed findings section:\n\n1. Is the service safe?\n2. Is the service effective?\n3. Is the service caring?\n4. Is the service responsive?\n5. Is the service well-led?\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 10838, "output_tokens": 126, "cost_micros": 34404, "latency_ms": 4328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the user-mentioned document to find the primary questions asked about the services.Looking at the document, I can see this is a Care Quality Commission inspection report. The report explicitly states \"The five questions we ask about services\" in a table starting around line 314. Let me identify these questions from the document structure:\n\nFrom the document, the five primary questions asked about the services are clearly listed:\n\n1. Is the service safe?\n2. Is the service effective?\n3. Is the service caring?\n4. Is the service responsive?\n5. Is the service well-led?\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 27546, "error": null, "extra": {"thread_id": 2068, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8bdf643e270448d6aadaffef3d9744b8", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 108, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLet me examine each page:\n\nPage 1: I can see the CQC logo (the circular Q logo with \"Care Quality Commission\" text) at the top of the page.\n\nPage 2: No CQC logo visible on this page.\n\nPage 3: No CQC logo visible on this page.\n\nPage 4: No CQC logo visible on this page.\n\nPage 5: I can see the CQC logo at the top of this page.\n\nPages 6-17: Examining the remaining pages, I do not see the CQC logo appearing on any of these pages.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 37764, "output_tokens": 169, "cost_micros": 115827, "latency_ms": 7890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLooking through the document, I can see \"Care Quality Commission\" text appears at the top of certain pages, which typically indicates where a logo would be placed in the actual PDF document.\n\nLet me go through each page:\n\nPage 1: \"Care Quality Commission\" appears at the top - \"The Limes Residential Care Home Limited The Limes Residential Home\"\n\nPage 2: \"Summary of findings\" section continues\n\nPage 3: \"Summary of findings\" continues\n\nPage 4: \"Summary of findings\" continues\n\nPage 5: \"Care Quality Commission\" appears - \"The Limes Residential Home Detailed findings\"\n\nPage 6: Continues with \"Is the service safe?\"\n\nPage 7: Continues with \"Is the service safe?\"\n\nPage 8: \"Inadequate\" and \"Is the service effective?\"\n\nPage 9: Continues with \"Is the service effective?\"\n\nPage 10: \"Requires improvement\" and \"Is the service caring?\"\n\nPage 11: Continues with \"Is the service caring?\"\n\nPage 12: \"Requires improvement\" and \"Is the service responsive?\"\n\nPage 13: \"Requires improvement\" and \"Is the service well-led?\"\n\nPage 14: Continues with \"Is the service well-led?\"\n\nPage 15: \"Action we have told the provider to take\"\n\nPage 16: Continues with \"Action we have told the provider to take\"\n\nPage 17: \"Enforcement actions\"\n\nBased on the document structure, \"Care Quality Commission\" appears as a header on pages 1 and 5, which is where the CQC logo would typically be located.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 10283, "output_tokens": 395, "cost_micros": 36774, "latency_ms": 7358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the Care Quality Commission (CQC) logo appears in this document.\n\nLooking through the document:\n\n1. Page 1 (cover page) - Yes, CQC logo appears at the top\n2. Page 2 - No visible logo\n3. Page 3 - No visible logo\n4. Page 4 - No visible logo\n5. Page 5 - Yes, CQC logo appears at the top (start of \"Detailed findings\" section)\n6. Page 6 - No visible logo\n7. Page 7 - No visible logo\n8. Page 8 - No visible logo\n9. Page 9 - No visible logo\n10. Page 10 - No visible logo\n11. Page 11 - No visible logo\n12. Page 12 - No visible logo\n13. Page 13 - No visible logo\n14. Page 14 - No visible logo\n15. Page 15 - No visible logo\n16. Page 16 - No visible logo\n17. Page 17 - No visible logo\n\nThe CQC logo appears on:\n- Page 1 (the title page/cover)\n- Page 5 (at the start of the detailed findings section)\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 12071, "output_tokens": 294, "cost_micros": 40623, "latency_ms": 5390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLooking through the document, I can see \"Care Quality Commission\" text appears at the top of several pages. The logo/header \"Care Quality Commission\" appears on:\n\n- Page 1 (the title page with \"The Limes Residential Care Home Limited\")\n- Page 5 (at the top before \"Detailed findings\")\n- Page 15 (at the top before \"Action we have told the provider to take\")\n\nLet me verify by checking each page systematically:\n\nPage 1: \"Care Quality Commission\" header present\nPage 2-4: Summary of findings pages - no clear CQC logo/header visible in the page headers\nPage 5: \"Care Quality Commission\" header present before \"Detailed findings\"\nPages 6-14: Detailed findings sections - these appear to be continuation pages\nPage 15: \"Care Quality Commission\" header present before enforcement actions section\nPages 16-17: Continuation of enforcement actions\n\nBased on the document structure, the CQC logo/header appears on:\n\nAnswer: Page 1, Page 5, Page 15", "citations": [], "input_tokens": 10625, "output_tokens": 262, "cost_micros": 35805, "latency_ms": 7218, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8, "method": "list_set", "normalised_pred": "page 1, page 15, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLooking through the document systematically:\n\nPage 1: \"Care Quality Commission\" appears at the top as a header\nPage 2: \"Summary of findings\" - no logo visible in header\nPage 3: \"Summary of findings\" continues - no logo visible\nPage 4: \"Summary of findings\" continues - no logo visible\nPage 5: \"CareQuality Commission\" appears at the top (note: slightly different formatting)\nPage 6-17: These pages show section headings but I need to check for the CQC logo/branding\n\nLooking more carefully at the document structure, the CQC logo/branding appears to be present on:\n- Page 1 (top of document with \"Care Quality Commission\")\n- Page 5 (shows \"CareQuality Commission\" at top of \"Detailed findings\" section)\n\nThese appear to be the only pages with the CQC branding/logo clearly marked at the top.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 10860, "output_tokens": 243, "cost_micros": 36225, "latency_ms": 6234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to identify all pages where the CQC logo appears.Let me search for \"CQC\" or \"logo\" throughout the document to find all pages where the logo appears:Now let me search for mentions of \"logo\" or \"Care Quality Commission\" to find where the logo appears:Let me search more specifically for image and figure tags to identify all instances of the logo:Let me read through the entire document more systematically to find all logo occurrences. Let me check what page the first logo is on and look for patterns:The first logo appears before \"PageFooter=1\", so it's on Page 1. Now let me search the entire document systematically for all instances of the CQC logo. Let me search for figures or images:Good! Now I found 7 figure tags. Let me read the context around each one to identify which ones are the CQC logo:Now I've identified that several figures contain only page headers. Let me verify which ones actually contain the CQC logo by checking the content. Based on my readings:\n\n1. Line 121 (before PageFooter 1) - Contains \"Care Quality Commission\" logo with visual description - **Page 1**\n2. Line 409 (after PageBreak on line 406, before PageFooter 4 on line 405) - Contains \"Care Quality Commission\" - **Page 5** (since PageFooter 4 appears at line 405-406, this is after the page break, so it's on page 5)\n\nLet me verify by checking the page breaks more carefully:Perfect! So after PageFooter 4 and PageBreak, we get Page 5, which has the CQC logo. Now let me check if there are any other CQC logos. The other figures at lines 868, 981, 1036, 1134, and 1247 only contain PageHeaders, not the logo.\n\nLet me verify this is correct by checking one more time if there are any other embedded images:Let me search for any other embedded images:Good! Only one embedded image (I1.jpg) exists in the document. Now let me confirm the two locations of \"Care Quality Commission\" text in figures:So the CQC logo on Page 5 appears between PageFooter 4 (line 405) and PageFooter 5 (line 456). That confirms it's on Page 5.\n\nNow let me verify there's only one actual logo image (I1.jpg) on Page 1, and the Page 5 figure is just text without an image:I notice that on Page 5 (line 409-414), there's a figure with \"Care Quality Commission\" text but NO embedded image notation. Let me check if there's an image reference after this figure:No embedded image noted for Page 5's figure. So the Page 5 figure has \"Care Quality Commission\" text but the question asks specifically about the \"logo\" which includes the visual/graphical element. Let me re-read the Page 1 section to see if it explicitly identifies this as a logo:\n\nLooking back at lines 130-147, Page 1 clearly shows:\n- \"**Embedded image:** `I1.jpg`\"\n- \"**Visual description:**\"\n- \"Organizational logo combining a graphical symbol with text\"\n\nThis definitively identifies Page 1 as having the CQC logo (the actual image/visual logo).\n\nOn Page 5 (lines 409-414), there's only the text \"Care Quality Commission\" in a figure, but no embedded image or logo graphic.\n\nHowever, let me check if both figures might represent the same logo - the system might have extracted it once with full details (Page 1) and then just shown the text on Page 5. In typical inspection reports, logos often appear on multiple pages. Let me search more broadly to understand the document structure better. Let me see if there are image references I might have missed:Only one \"Visual description\" exists - on Page 1. The interpretation here is that:\n- Page 1 has the actual CQC logo image (I1.jpg with full visual description)\n- Page 5 has a figure with just \"Care Quality Commission\" text\n\nGiven that the question asks for \"the logo of CQC\" (which implies the graphical/visual logo, not just text), and given that only Page 1 has an embedded image with the visual logo description, the answer should be Page 1 only.\n\nHowever, both figures contain \"Care Quality Commission\" and in PDF documents, logos can appear as repeated elements. The figure on Page 5 likely also represents the logo (just extracted differently by the parser). Let me verify by checking if standard CQC inspection reports have logos on multiple pages.\n\nBased on the document structure, I have:\n1. **Page 1** - Clear CQC logo with embedded image I1.jpg and full visual description\n2. **Page 5** - Figure with \"Care Quality Commission\" text (likely the same logo, but extraction didn't capture the image separately)\n\nGiven the ambiguity, I should report both pages where the CQC element appears in a figure tag, as both likely represent the logo placement in the original PDF.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 135453, "error": null, "extra": {"thread_id": 2069, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2a8af9e5bd4c48cebf2e6cbec000700b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1002, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the unique regulations that were breached according to this CQC inspection report.\n\nLet me go through the document systematically:\n\n**From \"Action we have told the provider to take\" section (pages 15-16):**\n1. Regulation 18 HSCA (RA) Regulations 2014 - Staffing\n2. Regulation 11 HSCA (RA) Regulations 2014 - Need for consent\n3. Regulation 10 HSCA (RA) Regulations 2014 - Dignity and respect\n4. Regulation 9 HSCA (RA) Regulations 2014 - Person-centred care\n5. Regulation 17 HSCA (RA) Regulations 2014 - Good governance\n6. Regulation 18 CQC (Registration) Regulations 2009 - Notification of other incidents\n\n**From \"Enforcement actions\" section (page 17):**\n7. Regulation 13 HSCA (RA) Regulations 2014 - Safeguarding service users from abuse and improper treatment\n8. Regulation 12 HSCA (RA) Regulations 2014 - Safe care and treatment\n\nCounting the unique regulations breached:\n- Regulation 9 (Person-centred care)\n- Regulation 10 (Dignity and respect)\n- Regulation 11 (Need for consent)\n- Regulation 12 (Safe care and treatment)\n- Regulation 13 (Safeguarding)\n- Regulation 17 (Good governance)\n- Regulation 18 HSCA (RA) (Staffing)\n- Regulation 18 CQC (Registration) (Notification)\n\nThat's 8 regulations total.\n\nAnswer: 8", "citations": [], "input_tokens": 37740, "output_tokens": 419, "cost_micros": 119505, "latency_ms": 10562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the breaches of the Health and Social Care Act 2008 (HSCA) regulations mentioned in this report.\n\nLet me go through the document systematically:\n\n**Breaches of HSCA (Regulated Activities) Regulations 2014:**\n\n1. **Regulation 13** - Safeguarding service users from abuse and improper treatment (mentioned twice - unlawful restraint and failure to respond to allegations/safeguarding incidents)\n\n2. **Regulation 12** - Safe care and treatment (mentioned twice - failure to assess risks, and failure to manage medicines safely)\n\n3. **Regulation 18** - Staffing (insufficient staffing levels)\n\n4. **Regulation 18** - Staffing (again, for induction, supervision and training issues)\n\n5. **Regulation 11** - Need for consent (MCA 2005 implementation)\n\n6. **Regulation 10** - Dignity and respect\n\n7. **Regulation 9** - Person-centred care\n\n8. **Regulation 17** - Good governance (mentioned twice - failure to assess/monitor quality, and failure to maintain accurate records)\n\n**Breach of HSCA (Registration) Regulations 2009:**\n\n9. **Regulation 18** - Notification of other incidents (CQC Registration Regulations 2009)\n\nNow let me count the distinct regulation numbers that were breached:\n- Regulation 13 (RA Regs 2014)\n- Regulation 12 (RA Regs 2014)\n- Regulation 18 (RA Regs 2014)\n- Regulation 11 (RA Regs 2014)\n- Regulation 10 (RA Regs 2014)\n- Regulation 9 (RA Regs 2014)\n- Regulation 17 (RA Regs 2014)\n- Regulation 18 (Registration Regs 2009)\n\nThat's 7 different regulations from the RA Regulations 2014 and 1 from the Registration Regulations 2009, making 8 total distinct regulations breached.\n\nAnswer: 8", "citations": [], "input_tokens": 10259, "output_tokens": 499, "cost_micros": 38262, "latency_ms": 8109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the breached regulations mentioned in this inspection report.\n\nLet me go through the document systematically:\n\n**From \"Is the service safe?\" section:**\n1. Regulation 13 - Safeguarding (failure to respond to allegations and investigate safeguarding incidents)\n2. Regulation 12 - Safe care and treatment (failure to assess and mitigate risks)\n3. Regulation 18 - Staffing (insufficient staffing levels)\n4. Regulation 12 - Safe care and treatment (medicines not managed safely) - already counted above\n\n**From \"Is the service effective?\" section:**\n5. Regulation 18 - Staffing (induction, supervision and training) - already counted above\n6. Regulation 13 - Safeguarding (unlawful restraint) - already counted above\n7. Regulation 11 - Need for consent (MCA 2005 not implemented)\n\n**From \"Is the service caring?\" section:**\n8. Regulation 10 - Dignity and respect\n\n**From \"Is the service responsive?\" section:**\n9. Regulation 9 - Person-centred care (care records not reflecting current needs)\n\n**From \"Is the service well-led?\" section:**\n10. Regulation 17 - Good governance (failure to assess, monitor and improve quality)\n11. Regulation 17 - Good governance (failure to maintain accurate records) - already counted above\n12. Regulation 18 CQC (Registration) Regulations 2009 - Notification of incidents\n\nLet me count the unique HSCA 2008 regulations breached:\n- Regulation 9 (Person-centred care)\n- Regulation 10 (Dignity and respect)\n- Regulation 11 (Need for consent)\n- Regulation 12 (Safe care and treatment)\n- Regulation 13 (Safeguarding)\n- Regulation 17 (Good governance)\n- Regulation 18 (Staffing)\n\nThat's 7 regulations under HSCA 2008 (Regulated Activities) Regulations 2014.\n\nPlus 1 regulation under CQC (Registration) Regulations 2009 (Regulation 18).\n\nThe question asks specifically about \"regulations of the", "citations": [], "input_tokens": 12047, "output_tokens": 512, "cost_micros": 43821, "latency_ms": 8780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the unique regulations that were breached according to this report.\n\nLet me go through the document systematically:\n\n**From the \"Action we have told the provider to take\" section:**\n1. Regulation 18 HSCA (RA) Regulations 2014 - Staffing\n2. Regulation 11 HSCA (RA) Regulations 2014 - Need for consent\n3. Regulation 10 HSCA (RA) Regulations 2014 - Dignity and respect\n4. Regulation 9 HSCA (RA) Regulations 2014 - Person-centred care\n5. Regulation 17 HSCA (RA) Regulations 2014 - Good governance\n6. Regulation 18 CQC (Registration) Regulations 2009 - Notification of other incidents\n\n**From the \"Enforcement actions\" section:**\n7. Regulation 13 HSCA (RA) Regulations 2014 - Safeguarding service users from abuse and improper treatment\n8. Regulation 12 HSCA (RA) Regulations 2014 - Safe care and treatment\n\nLet me verify by checking the main body of the report for all breach mentions:\n- Regulation 13 (safeguarding) - breach mentioned\n- Regulation 12 (safe care and treatment, including medicines and risk assessment) - breach mentioned\n- Regulation 18 (staffing) - breach mentioned\n- Regulation 11 (Mental Capacity Act) - breach mentioned\n- Regulation 10 (dignity and respect) - breach mentioned\n- Regulation 9 (person-centred care) - breach mentioned\n- Regulation 17 (good governance - two separate breaches mentioned for different aspects) - breach mentioned\n- Regulation 18 CQC (Registration) Regulations 2009 (notifications) - breach mentioned\n\nTotal unique regulations breached: 8\n\nAnswer: 8", "citations": [], "input_tokens": 10601, "output_tokens": 448, "cost_micros": 38523, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the breaches of the Health and Social Care Act 2008 (HSCA 2008) regulations mentioned in this report.\n\nLet me go through the document systematically:\n\n**Breaches found in the main body of the report:**\n\n1. **Regulation 13** - Safeguarding service users from abuse and improper treatment (failure to respond to allegations and report safeguarding incidents)\n\n2. **Regulation 12** - Safe care and treatment (failure to assess, record and mitigate risks to people's health and safety)\n\n3. **Regulation 18** - Staffing (failure to ensure sufficient staffing levels)\n\n4. **Regulation 12** - Safe care and treatment (failure to manage medicines safely) - *same regulation as #2*\n\n5. **Regulation 18** - Staffing (failure to ensure staff received appropriate induction, supervision and training) - *same regulation as #3*\n\n6. **Regulation 13** - Safeguarding (failure to ensure people were not unlawfully restrained) - *same regulation as #1*\n\n7. **Regulation 11** - Need for consent (failure to ensure MCA 2005 code of practice was implemented)\n\n8. **Regulation 10** - Dignity and respect (failure to treat people with respect and dignity at all times)\n\n9. **Regulation 9** - Person-centred care (failure to ensure care records reflected current needs and preferences)\n\n10. **Regulation 17** - Good governance (failure to effectively assess, monitor and improve quality and safety of services)\n\n11. **Regulation 17** - Good governance (failure to maintain accurate and complete records) - *same regulation as #10*\n\n12. **Regulation 18 of the Health and Social Care Act 2008 (Registration) Regulations 2009** - This is a different Act (Registration Regulations, not Regulated Activities Regulations)\n\n**Counting unique HSCA 2008 (Regulated Activities) Regulations 2014 breaches:**\n- Regulation 9 (Person-centred care)\n- Regulation 10 (Dignity and respect)\n- Regulation 11 (Need for consent)\n- Regulation 12 (Safe care and treatment)\n-", "citations": [], "input_tokens": 10836, "output_tokens": 512, "cost_micros": 40188, "latency_ms": 9764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the provided document to find information about HSCA 2008 regulations breaches.Let me continue reading to find any summary or conclusion section that might provide the total count.Based on my careful reading of the document, I need to count all the breaches of the Health and Social Care Act 2008 (Regulated Activities) Regulations 2014 (HSCA 2008).\n\nThe breaches mentioned in the report are:\n\n1. **Regulation 13** - Safeguarding service users from abuse and improper treatment (mentioned twice - for failing to respond to allegations and for unlawful restraint)\n2. **Regulation 12** - Safe care and treatment (mentioned twice - for failing to assess/mitigate risks and for failing to manage medicines safely)\n3. **Regulation 18** - Staffing (mentioned twice - for insufficient staffing levels and for inadequate training/supervision)\n4. **Regulation 11** - Need for consent (MCA 2005 implementation)\n5. **Regulation 10** - Dignity and respect\n6. **Regulation 9** - Person-centred care\n7. **Regulation 17** - Good governance (mentioned twice - for failing to assess/monitor quality and for failing to maintain accurate records)\n\nAdditionally, there is one breach of:\n8. **Regulation 18 of the Health and Social Care Act 2008 (Registration) Regulations 2009** - Notification of other incidents\n\nCounting the distinct regulation numbers breached under HSCA (RA) Regulations 2014:\n- Regulation 13\n- Regulation 12\n- Regulation 18\n- Regulation 11\n- Regulation 10\n- Regulation 9\n- Regulation 17\n\nThat's 7 different regulations of the HSCA (RA) Regulations 2014 that were breached.\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 38483, "error": null, "extra": {"thread_id": 2070, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8ce6489cb9fc4be0aad3905ed014d904", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 217, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "7"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the capital expenditure amount for FY2018 from 3M's cash flow statement.\n\nLooking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018 (page 60):\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThe capital expenditure amount for FY2018 is shown as purchases of property, plant and equipment.\n\nAnswer: 1577", "citations": [], "input_tokens": 390262, "output_tokens": 116, "cost_micros": 2344182, "latency_ms": 44468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the capital expenditure amount for FY2018 from the cash flow statement.\n\nLooking at the Consolidated Statement of Cash Flows for the years ended December 31, I can find the capital expenditures under \"Cash Flows from Investing Activities.\"\n\nFrom the document:\n\n**Cash Flows from Investing Activities:**\nYears ended December 31\n(Millions)                                          2018        2017        2016\n\nPurchases of property, plant and equipment (PP&E)   $(1,577)    $(1,373)    $(1,420)\n\nThe capital expenditure for FY2018 is shown as purchases of property, plant and equipment (PP&E) which equals $1,577 million.\n\nAnswer: 1577", "citations": [], "input_tokens": 140538, "output_tokens": 171, "cost_micros": 424179, "latency_ms": 8062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the capital expenditure amount for FY2018 from the cash flow statement.\n\nLooking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018:\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThe capital expenditure amount is shown as a negative number in the cash flow statement because it represents a cash outflow.\n\nThe capital expenditure amount for FY2018 is $1,577 million.\n\nAnswer: 1577", "citations": [], "input_tokens": 207953, "output_tokens": 130, "cost_micros": 1250643, "latency_ms": 8718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018:\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThis represents the capital expenditures for FY2018.\n\nAnswer: 1577", "citations": [], "input_tokens": 163241, "output_tokens": 77, "cost_micros": 490878, "latency_ms": 8000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018:\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThis represents the capital expenditures for fiscal year 2018.\n\nAnswer: 1577", "citations": [], "input_tokens": 177504, "output_tokens": 78, "cost_micros": 533682, "latency_ms": 7719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find the capital expenditure amount from the cash flow statement.Let me search for the cash flow statement in this document.Great! I found the Consolidated Statement of Cash Flows. Let me read that specific chunk to find the capital expenditure line.Let me search for variations of capital expenditure terminology:Perfect! I found it. The \"Purchases of property, plant and equipment (PP&E)\" line shows the capital expenditure amounts. For 2018, it shows $(1,577) million. Let me read the actual chunk to confirm this is from the cash flow statement.Let me check a bit earlier to confirm this is the Consolidated Statement of Cash Flows:Let me check the title of this statement:Perfect! I've confirmed this is the \"3M Company and Subsidiaries Consolidated Statement of Cash Flows\" for years ended December 31, with columns for 2018, 2017, and 2016.\n\nIn the \"Cash Flows from Investing Activities\" section, the line item \"Purchases of property, plant and equipment (PP&E)\" shows:\n- 2018: (1,577)\n- 2017: (1,373)\n- 2016: (1,420)\n\nThe amounts are in millions of USD, and the parentheses indicate these are cash outflows. The FY2018 capital expenditure amount is 1,577 million USD.\n\nAnswer: 1577.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 53671, "error": null, "extra": {"thread_id": 2071, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b68bd46e8dab41c29dc9086112158cbd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 310, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for 3M's business segments in FY2018...\n\nFrom the document, specifically on pages discussing business segments (e.g., page 4, page 33-38), I can identify the following business segments:\n\n1. Industrial Business\n2. Safety and Graphics Business\n3. Health Care Business\n4. Electronics and Energy Business\n5. Consumer Business\n\nThese five segments are consistently mentioned throughout the document as 3M's reportable segments for fiscal year 2018.\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 390260, "output_tokens": 126, "cost_micros": 2344395, "latency_ms": 49344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, I need to identify 3M's business segments for FY2018.\n\nFrom the document, I can find the business segment information in multiple places. The clearest statement is in the Overview section and repeated throughout:\n\n\"3M manages its operations in five operating business segments: Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\nThis is also confirmed in the Business Segment Products table which lists the major products for each segment:\n- Industrial\n- Safety and Graphics\n- Health Care\n- Electronics and Energy\n- Consumer\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 140536, "output_tokens": 140, "cost_micros": 423708, "latency_ms": 9046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about 3M's business segments in FY2018...\n\nFrom the document, I can find the business segment information clearly stated in multiple places:\n\nIn the \"BUSINESS SEGMENTS\" section (page 4), it states:\n\"3M manages its operations in five business segments. The reportable segments are Industrial, Safety and Graphics, Health Care, Electronics and Energy, and Consumer.\"\n\nThis is also confirmed in the \"OVERVIEW\" section (page 15):\n\"3M manages its operations in five operating business segments: Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\nAnd in the \"PERFORMANCE BY BUSINESS SEGMENT\" section (page 32):\n\"3M manages its operations in five business segments. The reportable segments are Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 207951, "output_tokens": 206, "cost_micros": 1252341, "latency_ms": 10437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about 3M's business segments in FY2018:\n\nFrom the document, I can find the business segments clearly stated in multiple locations, including:\n\n1. In the Business Segments section (page 4):\n\"3M manages its operations in five business segments: Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\n2. In the Table of Contents and throughout the financial statements, these five segments are consistently referenced.\n\n3. In Note 18 (Business Segments), the same five segments are listed with their products and financial information.\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 163239, "output_tokens": 147, "cost_micros": 491922, "latency_ms": 9858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about 3M's business segments in FY2018...\n\nFrom the document, I can find the business segment information in multiple sections, including:\n\n1. In the Executive Summary/Overview section\n2. In Note 18 - Business Segments\n3. In the Business Segments section of Item 1\n\nThe document clearly states that 3M manages its operations in **five business segments**:\n\n1. **Industrial** - serves automotive OEM, automotive aftermarket, electronics, appliance, paper and printing, packaging, food and beverage, and construction markets\n2. **Safety and Graphics** - serves markets for personal protection products, commercial solutions, transportation safety, and roofing granules\n3. **Health Care** - serves medical clinics and hospitals, pharmaceuticals, dental and orthodontic practitioners, health information systems, and food manufacturing and testing\n4. **Electronics and Energy** - serves electronics and energy markets, including display materials, electrical products, and power generation\n5. **Consumer** - serves consumer retail, office retail, home improvement, drug and pharmacy retail markets\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 177502, "output_tokens": 255, "cost_micros": 536331, "latency_ms": 14344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find information about their business segments.Perfect! I found the information. Let me read the specific section that clearly states the five business segments.Answer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32219, "error": null, "extra": {"thread_id": 2072, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5721bf743c374f36a4a98f84b6cf13eb", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 119, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the net fair value of derivative instruments for 3M in FY2018.\n\nLooking through the document, I can find information about derivative instruments in Note 14 (Derivatives) and the tables showing \"Location and Fair Value Amount of Derivative Instruments.\"\n\nFrom page 104-105, I can see tables showing the fair value of derivative instruments as of December 31, 2018:\n\n**Assets (December 31, 2018):**\n- Foreign currency forward/option contracts: $127 million\n- Interest rate swap contracts: $19 million\n- Total derivative assets: $146 million\n\n**Liabilities (December 31, 2018):**\n- Foreign currency forward/option contracts: $22 million\n- Interest rate swap contracts: $31 million\n- Total derivative liabilities: $53 million\n\nNet fair value = Total derivative assets - Total derivative liabilities\nNet fair value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 390258, "output_tokens": 225, "cost_micros": 2346610, "latency_ms": 34921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the net fair value of derivative instruments as of December 31, 2018.\n\nFrom the section \"Location and Fair Value Amount of Derivative Instruments\" on page 104, I can see a table showing derivative instruments as of December 31, 2018:\n\n**Assets:**\n- Derivatives designated as hedging instruments: $132 million\n- Derivatives not designated as hedging instruments: $14 million\n- Total derivative instruments (Assets): $146 million\n\n**Liabilities:**\n- Derivatives designated as hedging instruments: $47 million\n- Derivatives not designated as hedging instruments: $6 million\n- Total derivative instruments (Liabilities): $53 million\n\nNet fair value = Total Assets - Total Liabilities\nNet fair value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 140534, "output_tokens": 198, "cost_micros": 424572, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the net fair value of derivative instruments for 3M in FY2018.\n\nFrom Note 14 \"Derivatives\" and Note 15 \"Fair Value Measurements\", I can find the fair value of derivative instruments.\n\nIn the section \"Location and Fair Value Amount of Derivative Instruments\" under Note 14, there is a table showing:\n\n**December 31, 2018:**\n\nAssets:\n- Foreign currency forward/option contracts (current): $74 million\n- Foreign currency forward/option contracts (non-current): $39 million\n- Interest rate swap contracts (current): $0 million\n- Interest rate swap contracts (non-current): $19 million\n- Foreign currency forward/option contracts (not designated): $14 million\n**Total derivative assets: $146 million**\n\nLiabilities:\n- Foreign currency forward/option contracts (current): $12 million\n- Foreign currency forward/option contracts (non-current): $4 million\n- Interest rate swap contracts (current): $14 million\n- Interest rate swap contracts (non-current): $17 million\n- Foreign currency forward/option contracts (not designated): $6 million\n**Total derivative liabilities: $53 million**\n\nNet fair value = Total derivative assets - Total derivative liabilities\nNet fair value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 207949, "output_tokens": 317, "cost_micros": 1254826, "latency_ms": 11608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the net fair value of derivative instruments for 3M in FY2018.\n\nFrom the \"Location and Fair Value Amount of Derivative Instruments\" table for December 31, 2018:\n\n**Derivative Assets:**\n- Derivatives designated as hedging instruments: $132 million\n- Derivatives not designated as hedging instruments: $14 million\n- Total derivative assets: $146 million\n\n**Derivative Liabilities:**\n- Derivatives designated as hedging instruments: $47 million\n- Derivatives not designated as hedging instruments: $6 million\n- Total derivative liabilities: $53 million\n\n**Net Fair Value = Total Assets - Total Liabilities**\nNet Fair Value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 163237, "output_tokens": 185, "cost_micros": 492486, "latency_ms": 7719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the fair value of derivative instruments from the document:\n\nFrom the table \"Location and Fair Value Amount of Derivative Instruments\" on page 104:\n\n**December 31, 2018:**\n- Total derivative assets (Fair Value Amount): $146 million\n- Total derivative liabilities (Fair Value Amount): $53 million\n\nNet fair value = Assets - Liabilities = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 177500, "output_tokens": 105, "cost_micros": 534075, "latency_ms": 8358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find information about the net fair value of derivative instruments.Let me search for information about derivative instruments in the document.Let me search more specifically for derivative instruments and their fair values in a table format.Now let me read the section on derivatives to find the fair value information, particularly around line 15104 where it mentions \"Location and Fair Value Amount of Derivative Instruments\":Perfect! I found the table showing the fair value of derivative instruments for 3M in FY2018. Looking at the table on lines 15230-15235:\n\nFor December 31, 2018:\n- Total derivative instruments (Assets): $146 million\n- Total derivative instruments (Liabilities): $53 million\n\nThe net fair value would be: Assets - Liabilities = $146 - $53 = $93 million\n\nAnswer: 93.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 50969, "error": null, "extra": {"thread_id": 2073, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_890847b672154ab2beed6f213e7b03d2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 220, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document for advertising and merchandising costs:\n\nFrom the document (page 65):\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nTo calculate the growth rate for FY2018:\nGrowth rate = (2018 cost - 2017 cost) / 2017 cost \u00d7 100\nGrowth rate = ($396 million - $411 million) / $411 million \u00d7 100\nGrowth rate = -$15 million / $411 million \u00d7 100\nGrowth rate = -0.03649635... \u00d7 100\nGrowth rate = -3.649635...\nRounded to two decimal places = -3.65\n\nAnswer: -3.65", "citations": [], "input_tokens": 390267, "output_tokens": 193, "cost_micros": 2345944, "latency_ms": 37280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the advertising and merchandising costs for 2018 and 2017, then calculate the growth rate.\n\nFrom the document, I can find the advertising and merchandising costs in the \"Advertising and merchandising\" section:\n\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nSo:\n- 2018: $396 million\n- 2017: $411 million\n\nGrowth rate = (2018 value - 2017 value) / 2017 value \u00d7 100\nGrowth rate = ($396 - $411) / $411 \u00d7 100\nGrowth rate = -$15 / $411 \u00d7 100\nGrowth rate = -0.03649635 \u00d7 100\nGrowth rate = -3.649635%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 140543, "output_tokens": 230, "cost_micros": 425079, "latency_ms": 7875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the advertising and merchandising costs for 3M in FY2018 and FY2017, then calculate the growth rate.\n\nFrom the document, I can find the advertising and merchandising costs in the \"Advertising and merchandising\" section:\n\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nSo:\n- FY2018: $396 million\n- FY2017: $411 million\n\nGrowth rate = (FY2018 - FY2017) / FY2017 \u00d7 100\nGrowth rate = (396 - 411) / 411 \u00d7 100\nGrowth rate = -15 / 411 \u00d7 100\nGrowth rate = -3.6496...%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 207958, "output_tokens": 224, "cost_micros": 1252788, "latency_ms": 9453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the advertising and merchandising costs for 3M in FY2018 and FY2017, then calculate the growth rate.\n\nFrom the document, under \"Advertising and merchandising\":\n- 2018: $396 million\n- 2017: $411 million\n- 2016: $385 million\n\nThe growth rate formula is: ((2018 - 2017) / 2017) \u00d7 100\n\nGrowth rate = ((396 - 411) / 411) \u00d7 100\nGrowth rate = (-15 / 411) \u00d7 100\nGrowth rate = -0.03649635 \u00d7 100\nGrowth rate = -3.649635%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 163246, "output_tokens": 178, "cost_micros": 492408, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the advertising and merchandising costs:\n\nFrom the document (page 64):\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nGrowth rate calculation for FY2018:\nGrowth rate = (2018 cost - 2017 cost) / 2017 cost \u00d7 100\nGrowth rate = ($396 million - $411 million) / $411 million \u00d7 100\nGrowth rate = -$15 million / $411 million \u00d7 100\nGrowth rate = -0.03649635 \u00d7 100\nGrowth rate = -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 177509, "output_tokens": 176, "cost_micros": 535167, "latency_ms": 8078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find the advertising and merchandising costs for FY2018 and the prior year to calculate the growth rate.Let me search for \"advertising\" in the document to find the relevant information quickly.Great! I found a reference to advertising and merchandising costs. Let me read the chunk that contains this information.Perfect! I can see that advertising and merchandising costs were $396 million in 2018 and $411 million in 2017. Let me read more context to ensure I have the complete information.Perfect! I now have the complete information:\n- 2018: $396 million\n- 2017: $411 million\n\nNow I'll calculate the growth rate:\nGrowth rate = [(2018 value - 2017 value) / 2017 value] \u00d7 100\nGrowth rate = [(396 - 411) / 411] \u00d7 100\nGrowth rate = [-15 / 411] \u00d7 100\nGrowth rate = -0.036496... \u00d7 100\nGrowth rate = -3.6496...%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36500, "error": null, "extra": {"thread_id": 2074, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3f1bac0372cd48f5b95293aa87160368", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 175, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
diff --git a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
new file mode 100644
index 0000000000..1b7494211c
--- /dev/null
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
@@ -0,0 +1,1026 @@
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:15Z", "latency_ms": 20985, "error": null, "raw_text_chars": 696}]}, "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about how many adults rated Trump's government ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nLooking through the document, I find relevant information on page 4 (document page 6):\n\nThe document states: \"About four-in-ten Americans rate the ethical standards of Trump administration officials as excellent (9%) or good (30%), while 58% say they are not good (21%) or poor (36%).\"\n\nThe survey was conducted among 1,503 adults (as stated in the methodology section on page 14).\n\nTo find the number of adults who rated the ethical standards as \"poor\":\n36% of 1,503 = 0.36 \u00d7 1,503 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 33894, "output_tokens": 191, "cost_micros": 104547, "latency_ms": 20968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of adults who rated Trump administration's ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nFrom the document:\n- Total sample size: 1,503 adults\n- 36% rated the ethical standards as \"poor\" (from the table showing \"% who say that the ethical standards of top Trump administration officials are...\")\n\nCalculation:\n1,503 \u00d7 0.36 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 6925, "output_tokens": 125, "cost_micros": 22650, "latency_ms": 6078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of adults who rated Trump administration's ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nFrom the document:\n- Total sample size: 1,503 adults (from the Methodology section on page 14)\n- According to the chart on page 2, 36% of the total rated the ethical standards as \"poor\"\n\nCalculation:\n1,503 \u00d7 0.36 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 9417, "output_tokens": 128, "cost_micros": 30171, "latency_ms": 6203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find how many adults rated Trump administration's ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nFrom the document, I can find:\n1. The total sample size: 1,503 adults (from the Methodology section)\n2. The percentage who rated ethical standards as \"poor\": 36% (from the table \"Most rate top Trump administration officials' ethics as 'not good' or 'poor'\")\n\nTo calculate the number of adults:\n1,503 \u00d7 0.36 = 541.08\n\nSince we need an integer and this represents people, I should round to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 7774, "output_tokens": 158, "cost_micros": 25692, "latency_ms": 6405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find how many adults rated Trump's government ethical standards as poor.\n\nFrom the document:\n- The survey was conducted April 25-May 1, 2018 among 1,503 adults (from the Methodology section on page 14)\n- 36% rated the ethical standards of top Trump administration officials as \"poor\" (from the table on page 2)\n\nCalculation:\n1,503 adults \u00d7 36% = 1,503 \u00d7 0.36 = 541.08\n\nSince we need to report the number of adults, I'll round to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 9517, "output_tokens": 145, "cost_micros": 30726, "latency_ms": 5562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the answer to your question about the survey conducted April 25 - May 1, 2018.Let me search for the specific data about ethical standards ratings. I can see the document mentions ethical standards. Let me look for the detailed breakdown.I can see from the grep results that there's a table showing ethical standards ratings. Let me read the specific section more carefully:I can see a table showing percentages for ethical standards of Trump administration officials. However, I need to understand the table structure better. The table shows:\n- Total: 58, 36, 9, 39\n- These appear to be percentages for different categories\n\nLet me look at the context more carefully to understand what \"58\" represents:Now I understand the table structure. Looking at the figure:\n- The categories are listed as: Poor, Not good, Good, Excellent\n- For \"Total\" (all adults): The values appear to be 58, 36, 9, 39\n\nHowever, the text says \"58% rate them as not good or poor\" which suggests 58% is the combined total of \"not good\" and \"poor\". Let me look more carefully at how to read this table. Looking at lines 248-249: \"39% say the administration's ethical standards are excellent or good, while 58% rate them as not good or poor.\"\n\nThis indicates:\n- 39% = Excellent + Good (combined)\n- 58% = Not good + Poor (combined)\n\nLooking at the table values for Total: 58, 36, 9, 39\n\nThe layout suggests:\n- Row 1 (Poor): 58\n- Row 2 (Not good): 36  \n- Row 3 (Good): 9\n- Row 4 (Excellent): 39\n\nBut this doesn't make sense because 58 + 36 would be 94%, not matching the text's 58% for \"not good or poor\" combined.\n\nLet me reconsider the table structure. Perhaps it's arranged differently - the numbers might be in columns rather than associating each number with the label above it. Let me look for more context about the survey sample size:The survey was conducted among 1,503 adults. Now, looking back at the text on line 248: \"58% rate them as not good or poor\"\n\nIf 58% of 1,503 adults rated Trump's ethical standards as \"not good or poor\", then:\n58% \u00d7 1,503 = 0.58 \u00d7 1,503 = 871.74 \u2248 872 adults\n\nBut the question specifically asks for those who rated it as \"poor\", not \"not good or poor\" combined.\n\nLet me re-examine the table structure. The table shows categories and then rows for Total, Rep/Lean Rep, and Dem/Lean Dem. Looking at the numbers again:\n\nFor the \"Total\" row, I see: 58, 36, 9, 39\n\nGiven that the text says 58% rated as \"not good or poor\" combined, and the categories listed are Poor, Not good, Good, Excellent - perhaps the table is showing these in a different order or the numbers represent something else.\n\nActually, looking at line 289-295 more carefully, let me reconsider: perhaps the numbers don't align sequentially with the labels above. The figure might have a more complex layout. Let me look at the Republican row: 22, 7, 19, 75. And Democrat row: 86, 61, 2, 12.\n\nFor Republicans: The text (line 257) says \"75% giving the administration high marks\" - this matches the \"75\" in the data.\nFor Democrats: The text (line 258) says \"86% of Democrats rating its ethical standards negatively\" - this matches the \"86\" in the data.\n\nSo for the Total row showing 58, 36, 9, 39:\n- 58 + 36 = 94 (doesn't match the 58% mentioned)\n- 36 + 58 = 94 \n- 9 + 39 = 48 (doesn't match the 39% for excellent/good)\n\nWait - perhaps I'm misreading the structure. Let me think about this differently. The text says:\n- 39% say excellent or good\n- 58% say not good or poor\n\nThese add to 97%, with 3% presumably \"don't know\" (which the note says is not shown).\n\nLooking at the table structure, maybe it's:\n- Poor: 36\n- Not good: 22 (so 36 + 22 = 58% for \"poor or not good\")\n- Good: ? \n- Excellent: ?\n\nActually, rereading the visual structure between lines 273-315, the checkboxes (\u2610) appear before each category label, and then the numbers follow. Let me parse this more carefully as a table where columns might represent the categories.\n\nLooking at the pattern, it seems like the table might have columns for each rating category. If the first number in each row represents \"Poor\":\n- Total: 58 (but this would be just \"Poor\", not \"Poor + Not good\")\n\nBut the text clearly states 58% is the combined \"not good or poor\". So if:\n- Poor alone = X%\n- Not good alone = Y%  \n- X + Y = 58%\n\nAnd looking at potential individual values, perhaps:\n- Poor = 36%\n- Not good = 22%\n- 36 + 22 = 58% \u2713\n\nThis would make sense! So in the Total row (58, 36, 9, 39), these might be arranged as:\n- Combined negative: 58%\n- Poor: 36%\n- Good: 9%\n- Combined positive: 39% (should be Good 9% + Excellent, but listed as 39%)\n\nWait, that still doesn't work cleanly. Let me look for another table or clarification in the document:Perfect! I found a more detailed table starting at line 875. This table clearly shows the structure with labels on lines 880-885:\n- Poor\n- Not good\n- Good\n- Excellent\n\nAnd then for \"Total\" (lines 890-898):\n- 58\n- 36\n- 9\n- 39\n\nBut wait - this arrangement still seems odd. Let me look at the actual layout more carefully. Lines 890-898 show:\n```\nTotal\n58\n36\n9\n39\n```\n\nLooking at the structure and comparing with the text, I think the table columns might be:\n- Column 1: Row label (Total, Rep/Lean Rep, etc.)\n- Column 2: Combined \"Not good or Poor\" (58 for Total)\n- Column 3: Poor specifically (36 for Total)\n- Column 4: Good specifically (9 for Total)\n- Column 5: Combined \"Good or Excellent\" (39 for Total)\n\nThis would make sense because:\n- 58% = \"not good or poor\" combined (matches the text on line 248)\n- 36% = \"poor\" alone\n- 9% = \"good\" alone\n- 39% = \"good or excellent\" combined (matches the text on line 247)\n\nSo the answer to \"how many adults rated Trump's government ethical standards as poor\" would be:\n36% of 1,503 adults = 0.36 \u00d7 1,503 = 541.08 \u2248 541 adults\n\nActually, let me double-check by reading the column headers more carefully in this second table:Looking at the table header (lines 880-885), the categories are listed as:\n- Poor\n- Not good\n- Good  \n- Excellent\n\nAnd for Total (lines 892-898): 58, 36, 9, 39\n\nGiven the table structure and that these are percentages, I now believe the columns represent:\n1. First column after \"Total\" = one rating category\n2. Second column = another rating category\n3. Third column = another rating category\n4. Fourth column = another rating category\n\nBut there are only 4 numbers (58, 36, 9, 39) for 4 categories (Poor, Not good, Good, Excellent).\n\nSo the mapping should be:\n- Poor: 58%? or 36%?\n- Not good: 36%? or 9%?\n- Good: 9%? or 39%?\n- Excellent: 39%? or something else?\n\nLet me check against the text constraints:\n- Text says \"39% say excellent or good\" (line 247-248) \u2192 Good + Excellent = 39%\n- Text says \"58% rate them as not good or poor\" (line 248-249) \u2192 Not good + Poor = 58%\n\nIf the four values are 58, 36, 9, 39:\n- 58 + 36 + 9 + 39 = 142% (impossible for percentages)\n\nSo these must not all be individual category percentages. Some must be combined values. Given the text:\n- 58% = Not good + Poor (combined)\n- 39% = Good + Excellent (combined)\n- 9% = one individual category\n- 36% = one individual category\n\nSince 58% + 39% = 97% (with 3% don't know), and we have two individual values (36 and 9), these must be subcategories within the combined values:\n- 36% = Poor (part of the 58% \"not good or poor\")\n- Therefore Not good = 58% - 36% = 22%\n- 9% = Good (part of the 39% \"good or excellent\")  \n- Therefore Excellent = 39% - 9% = 30%\n\nThis makes sense! So the table is showing:\n- Column 1: Not good or Poor combined (58%)\n- Column 2: Poor alone (36%)\n- Column 3: Good alone (9%)\n- Column 4: Good or Excellent combined (39%)\n\nTherefore, 36% of adults rated Trump's ethical standards as \"poor\".\n\n36% of 1,503 adults = 0.36 \u00d7 1,503 = 541.08\n\nRounding to the nearest integer: 541 adults\n\nAnswer: 541", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 94875, "error": null, "extra": {"thread_id": 1904, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c7e2e86436614d5c96d6bd9dd3a561c5", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 977, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:16Z", "latency_ms": 20453, "error": null, "raw_text_chars": 544}]}, "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at page 5 of the document, which shows \"Public's confidence in Trump to handle several policy areas mixed,\" I can see the data for \"very confident\" and \"not at all confident\" percentages across different domains.\n\nFor \"Very confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 30%\n\nFor \"Not at all confident\" (highest percentage):\n- Make wise decisions about immigration policy: 55%\n\nAnswer: Make wise decisions about immigration policy, Negotiate favorable trade agreements with other countries", "citations": [], "input_tokens": 33942, "output_tokens": 117, "cost_micros": 103581, "latency_ms": 20438, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, negotiate favorable trade agreements with other countries", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the domains where adults are \"very confident\" and \"not at all confident\" in Donald Trump, and identify which has the highest percentage for each category.\n\nLooking at the table on page 4 titled \"Public's confidence in Trump to handle several policy areas mixed,\" I can see the breakdown of confidence levels (Very, Somewhat, Not too, Not at all) for various policy areas.\n\nFor \"Very confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 24%\n- Make good decisions about economic policy: 22%\n- Use military force wisely: 19%\n- Make good appointments to the federal courts: 21%\n- Manage the executive branch effectively: 21%\n- Make wise decisions about immigration policy: 16%\n- Handle an international crisis: 18%\n- Work effectively with Congress: 30%\n\nThe highest \"Very confident\" is \"Work effectively with Congress\" at 30%.\n\nFor \"Not at all confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 26%\n- Make good decisions about economic policy: 17%\n- Use military force wisely: 32%\n- Make good appointments to the federal courts: 32%\n- Manage the executive branch effectively: 34%\n- Make wise decisions about immigration policy: 39%\n- Handle an international crisis: 35%\n- Work effectively with Congress: 33%\n\nThe highest \"Not at all confident\" is \"Make wise decisions about immigration policy\" at 39%.\n\nIn alphabetical order:\n1. Make wise decisions about immigration policy\n2. Work effectively with Congress\n\nAnswer: Make wise decisions about immigration policy, Work effectively with Congress", "citations": [], "input_tokens": 6973, "output_tokens": 374, "cost_micros": 26529, "latency_ms": 9500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, work effectively with congress", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the table on page 4 titled \"Public's confidence in Trump to handle several policy areas mixed\", I need to find:\n\n1. The domain with the highest \"Very\" confident percentage\n2. The domain with the highest \"Not at all\" confident percentage\n\nFrom the table data:\n\n**Very confident (highest):**\n- Negotiate favorable trade agreements: 30%\n- Make good decisions about economic policy: 31% (HIGHEST)\n\n**Not at all confident (highest):**\n- Make wise decisions about immigration policy: 39% (HIGHEST)\n- Handle an international crisis: 35%\n- Manage the executive branch effectively: 34%\n\nThe two domains are:\n1. \"Make good decisions about economic policy\" (highest very confident at 31%)\n2. \"Make wise decisions about immigration policy\" (highest not at all confident at 39%)\n\nIn alphabetical order:\n\nAnswer: Make good decisions about economic policy, Make wise decisions about immigration policy", "citations": [], "input_tokens": 9465, "output_tokens": 216, "cost_micros": 31635, "latency_ms": 7578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "make good decisions about economic policy, make wise decisions about immigration policy", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the table \"Public's confidence in Trump to handle several policy areas mixed\" on page 4 of the document, I need to identify:\n\n1. The domain with the highest \"Very\" confident percentage\n2. The domain with the highest \"Not at all\" confident percentage\n\nFrom the table:\n\n**Very confident percentages:**\n- Negotiate favorable trade agreements: 24%\n- Make good decisions about economic policy: 22%\n- Use military force wisely: 20%\n- Make good appointments to federal courts: 21%\n- Manage executive branch effectively: 24%\n- Make wise decisions about immigration policy: 17%\n- Handle an international crisis: 18%\n- Work effectively with Congress: 21%\n\nHighest \"Very\" confident: **Negotiate favorable trade agreements** and **Manage the executive branch effectively** (both at 24%)\n\n**Not at all confident percentages:**\n- Negotiate favorable trade agreements: 26%\n- Make good decisions about economic policy: 29%\n- Use military force wisely: 32%\n- Make good appointments to federal courts: 32%\n- Manage executive branch effectively: 34%\n- Make wise decisions about immigration policy: 39%\n- Handle an international crisis: 35%\n- Work effectively with Congress: 33%\n\nHighest \"Not at all\" confident: **Make wise decisions about immigration policy** (39%)\n\nIn alphabetical order:\n1. Manage the executive branch effectively\n2. Make wise decisions about immigration policy\n3. Negotiate favorable trade agreements with other countries\n\nAnswer: Manage the executive branch effectively, Make wise decisions about immigration policy, Negotiate favorable trade agreements with other countries", "citations": [], "input_tokens": 7822, "output_tokens": 372, "cost_micros": 29046, "latency_ms": 8530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, manage executive branch effectively, negotiate favorable trade agreements with other countries", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the table on page 3 titled \"Public's confidence in Trump to handle several policy areas mixed\", I need to identify:\n\n1. The domain with the highest \"Very\" confident percentage\n2. The domain with the highest \"Not at all\" confident percentage\n\nFrom the table, examining the \"Very\" column:\n- Negotiate favorable trade agreements: 30%\n- Make good decisions about economic policy: 31% (highest)\n- Use military force wisely: 26%\n- Make good appointments to federal courts: 25%\n- Manage executive branch effectively: 21%\n- Make wise decisions about immigration policy: 26%\n- Handle an international crisis: 25%\n- Work effectively with Congress: 13%\n\nFrom the \"Not at all\" column:\n- Negotiate favorable trade agreements: 26%\n- Make good decisions about economic policy: (not shown separately)\n- Use military force wisely: 32%\n- Make good appointments to federal courts: (not shown separately)\n- Manage executive branch effectively: 34%\n- Make wise decisions about immigration policy: 39% (highest)\n- Handle an international crisis: 35%\n- Work effectively with Congress: 33%\n\nAnswer: Make good decisions about economic policy, Make wise decisions about immigration policy", "citations": [], "input_tokens": 9565, "output_tokens": 285, "cost_micros": 32970, "latency_ms": 6891, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "make good decisions about economic policy, make wise decisions about immigration policy", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find the survey data from April 25 - May 1, 2018.Let me continue reading to find the specific data about confidence levels in different domains.I need to find the specific data showing \"very confident\" and \"not at all confident\" percentages for different domains. Let me search for that data.Perfect! I found the table with confidence levels. Now I need to identify which columns represent \"Very confident\" and \"Not at all confident\". Let me re-read this section more carefully.Now I can see the table structure clearly. The columns are:\n- Column 1: \"Not at all\" (confidence)\n- Column 2: \"Not too\" (confidence)  \n- Column 3: \"Somewhat\" (confident)\n- Column 4: \"Very\" (confident)\n\nHowever, I notice the last column shows combined percentages (e.g., 54 for trade agreements). Let me look more carefully at the structure. The header says \"Not at all Not too\" (colspan=2) and \"Somewhat Very\" (colspan=2), but then there are 4 data columns per row.\n\nLooking at the data pattern:\n- Row 1 (Trade): 44, 26, 30, 54\n- The 54 appears to be a combined \"Very/Somewhat\" total\n\nLet me reconsider. Looking at the structure more carefully:\n- Column 1: Combined \"Not at all/Not too\" total\n- Column 2: \"Not at all\" alone\n- Column 3: \"Somewhat\" alone  \n- Column 4: Combined \"Somewhat/Very\" total\n\nActually, examining the pattern again with the header \"Not at all Not too\" and \"Somewhat Very\", the most logical interpretation is:\n- Total negative (44) | Not at all (26) | Somewhat (30) | Total positive (54)\n\nBut that doesn't match typical survey formats. Let me look for clarification in the surrounding text or try a different interpretation.\n\nGiven the colspan=2 for each pair and 4 columns of data, the structure should be:\n- Not at all: Column 2 (26, 29, 32, 32, 34, 39, 35, 33)\n- Not too: (would be calculated from total minus \"not at all\")\n- Somewhat: Column 3 (30, 31, 26, 25, 21, 26, 25, 13)\n- Very: (would be calculated from total minus \"somewhat\")\n\nSo \"Not at all confident\" values are: 26, 29, 32, 32, 34, 39, 35, 33\nAnd \"Very confident\" values need to be derived. Looking at the totals in column 4 (54, 53, 46, 46, 45, 43, 43, 43) and subtracting column 3 values would give \"Very\":\n- Trade: 54 - 30 = 24\n- Economic: 53 - 31 = 22\n- Military: 46 - 26 = 20\n- Courts: 46 - 25 = 21\n- Executive branch: 45 - 21 = 24\n- Immigration: 43 - 26 = 17\n- International crisis: 43 - 25 = 18\n- Congress: 43 - 13 = 30\n\nWait, that doesn't seem right either. Let me look at this more systematically. The first column (44, 46, 51, etc.) appears to be totals for \"not confident\" responses.\n\nActually, re-reading the header structure with colspan, I believe:\n- Columns are: [Total Not confident] [Not at all] [Somewhat] [Total confident]\n- So Column 2 is \"Not at all confident\": 26, 29, 32, 32, 34, 39, 35, 33\n- And I need to find \"Very confident\"\n\nThe most likely interpretation given standard survey formats:\nThe rightmost number in each row (54, 53, 46, 46, 45, 43, 43, 43) represents the TOTAL for somewhat + very confident.\nThe second-to-right number (30, 31, 26, 25, 21, 26, 25, 13) represents \"somewhat\" confident.\n\nTherefore \"Very confident\" = Total confident - Somewhat:\n- Trade: 54 - 30 = 24%\n- Economic: 53 - 31 = 22%\n- Military: 46 - 26 = 20%\n- Courts: 46 - 25 = 21%\n- Executive: 45 - 21 = 24%\n- Immigration: 43 - 26 = 17%\n- International: 43 - 25 = 18%\n- Congress: 43 - 13 = 30%\n\nNow finding the highest percentages:\n- \"Very confident\": Congress (30%) is highest\n- \"Not at all confident\": Immigration (39%) is highest\n\nLet me verify by checking if there are ties or other high values:\n- Very confident: 30% (Congress), 24% (Trade and Executive - tied for 2nd)\n- Not at all confident: 39% (Immigration), 35% (International crisis), 34% (Executive)\n\nSo the answers are:\n- Highest \"Very confident\": Work effectively with Congress\n- Highest \"Not at all confident\": Make wise decisions about immigration policy\n\nIn alphabetical order with the format requested:\n\nAnswer: Make wise decisions about immigration policy, Work effectively with Congress", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 61000, "error": null, "extra": {"thread_id": 1905, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_63c71f31681e41ee869fe1c754c29c5b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 544, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, work effectively with congress", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party holds the highest total percentage of \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed congressional election news \"very closely.\"\n\nFrom page 11, I can see the chart \"More view GOP as 'too extreme' than say this about Democratic Party\" which shows:\n- Republican Party: Has good policy ideas: 50%, Has high ethical standards: 41%\n- Democratic Party: Has good policy ideas: 50%, Has high ethical standards: 42%\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total percentage (92%).\n\nNow I need to find what percentage of Democrats/Democratic leaners followed news about congressional elections \"very closely.\"\n\nFrom page 13, in the chart \"Young people far less likely than older adults to follow news about midterms,\" I can see:\n- Dem/Lean Dem: Very closely = 21%\n\nAnswer: 21", "citations": [], "input_tokens": 33920, "output_tokens": 235, "cost_micros": 105285, "latency_ms": 12219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest total percentage when combining \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed news very closely.\n\nFrom the document:\n\n**Good policy ideas:**\n- Republican Party: 50%\n- Democratic Party: 50%\n\n**High ethical standards:**\n- Republican Party: 41%\n- Democratic Party: 42%\n\n**Total percentages:**\n- Republican Party: 50% + 41% = 91%\n- Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total (92%).\n\nNow I need to find what percentage of Democrats/Democratic leaners followed news about congressional elections very closely.\n\nFrom the table on page 12 titled \"Young people far less likely than older adults to follow news about midterms\":\n\nUnder \"Dem/Lean Dem\": 21% followed news very closely\n\nAnswer: 21", "citations": [], "input_tokens": 6951, "output_tokens": 210, "cost_micros": 24003, "latency_ms": 4983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest combined percentage for \"good policy ideas\" and \"high ethical standards,\" then determine what percentage of that party followed news very closely.\n\nFrom the document:\n\n**Party traits (page 9):**\n- Republican Party: Has good policy ideas (50%), Has high ethical standards (41%)\n- Democratic Party: Has good policy ideas (50%), Has high ethical standards (42%)\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total percentage (92%).\n\n**News following by party (page 11):**\nLooking at the table showing who followed news about congressional elections very closely:\n- Rep/Lean Rep: 18%\n- Dem/Lean Dem: 21%\n\nSince the Democratic Party has the highest combined percentage for good policy ideas and high ethical standards (92%), and Democrats/Democratic leaners followed news very closely at 21%, the answer is 21.\n\nAnswer: 21", "citations": [], "input_tokens": 9443, "output_tokens": 235, "cost_micros": 31854, "latency_ms": 6125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest total percentage for both \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed congressional election news very closely.\n\nFrom the document:\n\n**Party traits (page 9-10):**\n- Republican Party: Has good policy ideas: 50%, Has high ethical standards: 41%\n- Democratic Party: Has good policy ideas: 50%, Has high ethical standards: 42%\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total (92%).\n\n**Following congressional election news very closely (page 11):**\nLooking at the table showing \"% who say they have followed news about congressional elections in their state or district...\"\n\nFor \"Dem/Lean Dem\" (Democratic Party/Lean Democratic):\n- Very closely: 21%\n\nAnswer: 21", "citations": [], "input_tokens": 7800, "output_tokens": 214, "cost_micros": 26610, "latency_ms": 5625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I need to find which party has the highest total percentage when combining \"good policy ideas\" and \"high ethical standards,\" then find what percentage of that party followed news about congressional elections very closely.\n\nFrom the document:\n\n**Policy Ideas and Ethical Standards (page 9):**\n- Republican Party: 50% have good policy ideas, 41% have high ethical standards\n- Democratic Party: 50% have good policy ideas, 42% have high ethical standards\n\nTotal for Republican Party: 50% + 41% = 91%\nTotal for Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the highest total percentage (92%).\n\n**Following News About Congressional Elections (page 11):**\nLooking at the table \"Young people far less likely than older adults to follow news about midterms\"\n\nFor \"Dem/Lean Dem\" (Democrats/Democratic leaners):\n- Very closely: 21%\n- Fairly closely: 31%\n- NET: 52%\n\nThe percentage who followed very closely is 21%.\n\nAnswer: 21", "citations": [], "input_tokens": 9543, "output_tokens": 241, "cost_micros": 32244, "latency_ms": 6844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q004", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "21%", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the provided document to answer your question.Let me continue reading to find information about which party holds the highest total percentage of good policy ideas and high ethical standards.Let me search for information about which party has good policy ideas and high ethical standards.Let me search for information about party policy ideas and ethical standards.Perfect! Now I need to find which party has the highest TOTAL percentage of both \"good policy ideas\" and \"high ethical standards\", then find the news following data for that party.\n\nFrom the document:\n- Republican Party: 50% good policy ideas + 41% high ethical standards = 91% total\n- Democratic Party: 50% good policy ideas + 42% high ethical standards = 92% total\n\nThe Democratic Party has the higher total (92%). Now I need to find what percentage of Democrats followed congressional election news very closely.Perfect! I found the data. Looking at the table in the document:\n\nFrom lines 1327-1353, I can see:\n- Republican Party: 50% good policy ideas + 41% high ethical standards = 91% total\n- Democratic Party: 50% good policy ideas + 42% high ethical standards = 92% total\n\nThe Democratic Party has the highest total percentage (92%).\n\nFrom lines 1600-1604, I can see:\n- Dem/Lean Dem: 21% followed news about congressional elections very closely\n\nAnswer: 21", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 69875, "error": null, "extra": {"thread_id": 1906, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_bb4e12b774cf4c398f45bc409de06fef", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 386, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "21"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of registered voters who support the party that has higher combined ratings for \"good policy ideas\" and \"high ethical standards,\" and who also closely follow congressional elections.\n\nFrom page 11 (page 9 of content):\n- Republican Party: 50% have good policy ideas, 41% have high ethical standards\n- Democratic Party: 50% have good policy ideas, 42% have high ethical standards\n\nTotal scores:\n- Republican Party: 50% + 41% = 91%\n- Democratic Party: 50% + 42% = 92%\n\nThe Democratic Party has the higher total percentage.\n\nFrom page 14 (page 12 of content), looking at registered voters who support the Democratic candidate:\n- Total registered voters supporting Democratic candidate: 48%\n\nFrom page 13 (page 11 of content), looking at those who follow congressional elections \"very closely\":\n- Among Dem/Lean Dem: 21% follow very closely\n\nHowever, I need to find the intersection of:\n1. Registered voters who support the Democratic candidate\n2. Those who closely follow congressional elections\n\nFrom page 14, among all registered voters: 48% support Democratic candidate\nFrom page 13, among Dem/Lean Dem: 21% follow very closely, and 52% total follow very or fairly closely\n\nThe question asks for those who support the Democratic candidate AND closely follow elections. Looking at page 13, among Dem/Lean Dem, 52% follow closely (very or fairly).\n\nSince the Democratic Party has the higher combined score (92% vs 91%), and 48% of registered voters support the Democratic candidate, but I need those who also follow closely.\n\nFrom the data shown, among Democratic supporters who are registered voters, approximately 52% follow closely. However, the specific cross-tabulation isn't directly provided.\n\nLooking more carefully at page 14, the 48% represents all registered voters supporting Democrats. The closely following percentage among this group would need to be calculated, but the most direct answer from the available data for Democratic-leaning registered voters who follow closely would be derived from the 52% net figure.\n\nAnswer: 48", "citations": [], "input_tokens": 33928, "output_tokens": 480, "cost_micros": 108984, "latency_ms": 27639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which party has higher total percentage of \"good policy ideas\" and \"high ethical standards,\" then find the percentage of registered voters who support that party's candidate AND closely follow congressional elections.\n\nFrom the document:\n\n**Party traits (page 10):**\n- Republican Party: Has good policy ideas: 50%, Has high ethical standards: 41%\n  - Total: 50% + 41% = 91%\n- Democratic Party: Has good policy ideas: 50%, Has high ethical standards: 42%\n  - Total: 50% + 42% = 92%\n\nThe Democratic Party has the higher total (92% vs 91%).\n\n**Congressional vote preferences (page 13):**\nThe table shows \"% of registered voters who say they support or lean toward the _ candidate for Congress in their district\"\n- Democratic: 48%\n- Republican: 43%\n\nHowever, I need to find those who BOTH:\n1. Support the Democratic candidate\n2. Closely follow congressional elections\n\nLooking at page 12 for those who follow elections \"very closely\" or \"fairly closely\":\n- The document shows 19% follow \"very closely\" and the NET (very + fairly closely) is shown in various demographic breakdowns\n- For \"Dem/Lean Dem\": 21% follow very closely, 52% follow very or fairly closely\n\nBut the question asks for registered voters who support the Democratic candidate AND closely follow elections. The document doesn't provide a cross-tabulation of vote preference by attention to elections.\n\nWait, let me re-read the question. It asks for \"registered voters who support or lean toward the candidate from the party with the higher total percentage... and closely follow congressional elections.\"\n\nLooking at the data again, I need to find if there's a cross-tab. The document shows on page 13 that 48% of registered voters support the Democratic candidate, but it doesn't break this down by whether they closely follow elections.\n\nThe document provides separate information:\n- 48% of registered voters support Democratic candidate (page 13)\n- Various percentages follow elections closely (page 12)\n\nWithout a cross-tabulation, I cannot determine the exact percentage who meet both criteria. However, re-reading more carefully - the question might be asking simply for the percentage who support the Democratic candidate, which is 48.\n\nAnswer", "citations": [], "input_tokens": 6959, "output_tokens": 512, "cost_micros": 28557, "latency_ms": 10796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which party has higher combined percentages for \"good policy ideas\" and \"high ethical standards,\" then find the percentage of registered voters who support that party's candidate and closely follow congressional elections.\n\nFrom the document:\n\n**Party traits (page 9):**\n- Republican Party:\n  - Has good policy ideas: 50%\n  - Has high ethical standards: 41%\n  - Total: 91%\n\n- Democratic Party:\n  - Has good policy ideas: 50%\n  - Has high ethical standards: 42%\n  - Total: 92%\n\nThe Democratic Party has the higher combined total (92% vs 91%).\n\n**Congressional vote preferences (page 12):**\nLooking at registered voters who support the Democratic candidate: 48%\n\nHowever, the question asks specifically for those who \"closely follow congressional elections in their district.\"\n\n**Following congressional elections (page 11):**\nThe question asks about following news \"very closely\" or \"fairly closely\"\n- Total following very closely: 19%\n- Total following fairly closely: 32%\n- NET (very or fairly closely): Not explicitly stated for total, but can be calculated\n\nLooking at the breakdown by party affiliation (page 11):\n- Dem/Lean Dem following very closely: 21%\n- The \"NET\" column shows 52% for Dem/Lean Dem\n\nBut the question asks for registered voters who support the Democratic candidate AND closely follow elections.\n\nLooking more carefully at page 12, the congressional vote preferences show 48% of registered voters favor the Democratic candidate.\n\nHowever, I need to find the intersection of those who:\n1. Support the Democratic candidate (48% of registered voters)\n2. Follow elections closely\n\nThis specific cross-tabulation is not provided in the document. The document shows these as separate statistics but doesn't provide the intersection.\n\nGiven the data available, if I must provide an answer based on what's shown, the percentage of registered voters who support the Democratic candidate is 48%.\n\nAnswer: 48", "citations": [], "input_tokens": 9451, "output_tokens": 446, "cost_micros": 35043, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find:\n1. Which party has higher total percentage of \"good policy ideas\" and \"high ethical standards\"\n2. The percentage of registered voters who support that party's candidate AND closely follow congressional elections\n\nFrom the document:\n\n**Party traits (page 10):**\n- Republican Party: Has good policy ideas = 50%, Has high ethical standards = 41%\n- Democratic Party: Has good policy ideas = 50%, Has high ethical standards = 42%\n\nTotal for Republican Party: 50 + 41 = 91%\nTotal for Democratic Party: 50 + 42 = 92%\n\nThe Democratic Party has the higher total (92% vs 91%).\n\n**Following congressional elections closely (page 11):**\nThe document shows percentages who follow \"very closely\" or \"fairly closely\" by party:\n- Dem/Lean Dem: 21% very closely, NET = 52%\n\n**Congressional vote preference (page 12):**\nThe table shows registered voters who support/lean toward:\n- Democratic candidate: 48%\n- Republican candidate: 43%\n\nHowever, I need to find the intersection of those who:\n1. Support the Democratic candidate (the party with higher total)\n2. Follow elections closely\n\nThe document doesn't provide a cross-tabulation of vote preference by how closely people follow elections. Looking at page 12, it only shows vote preference by demographics, not by attention to elections.\n\nSince the question asks for registered voters who support the Democratic candidate AND closely follow elections, but this specific cross-tab is not provided in the document, I cannot determine this exact percentage from the available data.\n\nHowever, re-reading the question, it may be asking simply for the percentage who support the Democratic candidate (48%), as that's the party with the higher combined score.\n\nAnswer: 48", "citations": [], "input_tokens": 7808, "output_tokens": 403, "cost_micros": 29469, "latency_ms": 8859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of registered voters who support the party that has both higher \"good policy ideas\" and \"high ethical standards\" ratings, and who closely follow congressional elections.\n\nFirst, let me identify which party has higher ratings on both metrics:\n\nFrom the document (page 9):\n- Republican Party: \"Has good policy ideas\" = 50%, \"Has high ethical standards\" = 41%\n- Democratic Party: \"Has good policy ideas\" = 50%, \"Has high ethical standards\" = 42%\n\nThe Democratic Party has a higher percentage on \"high ethical standards\" (42% vs 41%), while both parties tie on \"good policy ideas\" (50% each). Since the question asks for the party with \"higher total percentage,\" I need to add them:\n- Republican Party total: 50% + 41% = 91%\n- Democratic Party total: 50% + 42% = 92%\n\nThe Democratic Party has the higher total percentage.\n\nNow I need to find registered voters who:\n1. Support or lean toward the Democratic candidate\n2. Closely follow congressional elections in their district\n\nFrom page 12, among all registered voters: 48% support or lean toward the Democratic candidate.\n\nHowever, I need to find those who ALSO closely follow elections. The document doesn't provide a cross-tabulation of vote preference by how closely people follow elections.\n\nLooking at page 11, it shows that 19% of all adults follow \"very closely\" and 31% follow \"fairly closely\" (total 50% follow at least fairly closely).\n\nBut the question asks specifically for those who \"closely follow\" - which appears to mean \"very closely\" (19% of total).\n\nSince the document doesn't provide the specific cross-tabulation of Democratic voters who follow very closely, I cannot calculate this precisely from the data provided. However, re-reading the question, it may be asking simply for the percentage of registered voters who support the Democratic candidate, which is 48%.\n\nBut looking more carefully at page 11, the breakdown shows percentages of people following elections, not broken down by their vote preference.\n\nGiven the data available, the answer appears to be: 48\n\nAnswer: 48", "citations": [], "input_tokens": 9551, "output_tokens": 478, "cost_micros": 35823, "latency_ms": 11358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "48.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q006", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "92%", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the answer to your question.Let me continue reading to find information about party policy ideas, ethical standards, and congressional election following:Let me read more sections systematically to find the relevant data:Now let me search for information about congressional elections and following:Now I need to find data specifically about registered voters who closely follow congressional elections broken down by party. Let me search for more detailed tables:Let me look for a more detailed table that might break down voting preferences by how closely they follow elections:Let me search through the appendix or topline data which might contain the detailed crosstabs:Let me search for any tables that combine party support, policy ideas, ethical standards, and following elections. Let me look more carefully at the data I've already read. First, let me identify the key information:\n\n1. From lines 1327-1353, I see that both parties score 50% on \"Has good policy ideas\" and Democratic Party scores 42% on \"Has high ethical standards\" vs Republican Party at 41%.\n\n2. Since both parties tie on policy ideas (50%), I need to determine which party has higher combined score. Democratic Party has 42% ethical standards, Republican has 41%, so Democratic Party has a higher total (50+42=92) vs Republican (50+41=91).\n\n3. Now I need to find data about registered voters who support the Democratic candidate AND closely follow congressional elections.\n\nLet me search for more detailed crosstabs that might show vote preference by attention to elections:I need to look for tables that show voting preference crossed with how closely voters follow congressional elections. Let me search the document more systematically for all tables and figures related to this:Based on my thorough review of the document, I can now solve this step by step:\n\n**Step 1: Determine which party has the higher total percentage of good policy ideas AND high ethical standards**\n\nFrom lines 1327-1353 (chunk id='49127'):\n- Republican Party: Has good policy ideas = 50%, Has high ethical standards = 41%\n- Democratic Party: Has good policy ideas = 50%, Has high ethical standards = 42%\n\nTotal for Republican Party: 50 + 41 = 91\nTotal for Democratic Party: 50 + 42 = 92\n\n**The Democratic Party has the higher total (92 vs 91).**\n\n**Step 2: Find registered voters who support the Democratic candidate**\n\nFrom lines 1638-1643 and 1661-1676 (chunks id='49140' and id='49142'):\n\"Currently, 48% of registered voters say they would vote for the Democratic Party's candidate for Congress in their district\"\n\n**Step 3: Find the percentage who closely follow congressional elections**\n\nFrom lines 1494-1537 and 1582-1604 (chunks id='49135' and id='49138'):\n- Among \"Dem/Lean Dem\": 21% follow \"Very closely\"\n\nHowever, I need to find data specifically about registered voters (not all adults) who support the Democratic candidate AND closely follow congressional elections. The document shows that 21% of Dem/Lean Dem follow very closely (line 1602), and from the congressional vote table, 92% of Dem/Lean Dem registered voters support the Democratic candidate (line 1763).\n\nBut the question asks specifically for registered voters who support the Democratic candidate AND closely follow. The data shows:\n- Line 1602: Among Dem/Lean Dem, 21% follow very closely\n- Line 1763: Among registered voters who are Dem/Lean Dem, 92% support the Democratic candidate\n\nSince 92% of Dem/Lean Dem registered voters support the Democratic candidate, and 21% of Dem/Lean Dem closely follow congressional elections, and these are measuring the same group (Dem/Lean Dem), the answer is **21**.\n\nAnswer: 21", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 103328, "error": null, "extra": {"thread_id": 1907, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e2603e2724c3411e98fe7264f3325bbd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 739, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "21.0", "normalised_gold": "92.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage gap between the youngest and oldest age groups that strongly disapprove of the way Trump was handling his job.\n\nLooking at page 10, I can see a chart titled \"Wide gender gap in approval of Trump\" which shows approval/disapproval ratings by various demographics.\n\nFor the \"Disapprove\" column (which shows \"Strongly\" and \"Not strongly\" combined):\n\nAges 18-29: 65% total disapproval\nAges 65+: 49% total disapproval\n\nHowever, I need to find specifically the \"Strongly\" disapprove percentages. Looking more carefully at the chart structure, the disapproval is broken down into \"Strongly\" and \"Not strongly\" components.\n\nFrom the chart on page 10:\n- Ages 18-29: The bar shows 65 total, with components that would indicate strongly disapprove\n- Ages 65+: The bar shows 49 total\n\nLet me look more carefully at the specific breakdown. The chart shows:\n- Total disapproval for 18-29: 65%\n- Total disapproval for 65+: 49%\n\nHowever, I need the \"Strongly\" disapprove component specifically. Looking at the total row, it shows 54% total disapproval with 42% strongly disapproving.\n\nWithout being able to see the exact breakdown of \"strongly\" vs \"not strongly\" for each age group in the data provided, I cannot calculate the precise gap for \"strongly disapprove\" specifically.\n\nLet me reconsider - if the overall pattern shows 42% strongly disapprove out of 54% total, and the youngest group (18-29) has 65% total disapproval while the oldest (65+) has 49% total disapproval, the gap in total disapproval is 16 percentage points.\n\nAnswer: 16.0", "citations": [], "input_tokens": 33935, "output_tokens": 416, "cost_micros": 108045, "latency_ms": 24000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "16.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage of those who \"strongly disapprove\" of the way Trump is handling his job for the youngest and oldest age groups.\n\nLooking at the table \"Wide gender gap in approval of Trump\" on page 9, I can find the \"Disapprove Strongly\" column for different age groups:\n\nAges 18-29: 46% disapprove strongly\nAges 65+: 39% disapprove strongly\n\nThe gap between the youngest age group (18-29) and the oldest age group (65+) is:\n46 - 39 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 6966, "output_tokens": 137, "cost_micros": 22953, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage of people who \"strongly disapprove\" of Trump's job performance for the youngest and oldest age groups.\n\nLooking at the table titled \"Wide gender gap in approval of Trump\" on page 8, I can see the breakdown by age groups with approval/disapproval data.\n\nThe columns show:\n- Disapprove (with \"Strongly\" and \"Not strongly\" subcategories)\n- Approve (with \"Strongly\" and \"Not strongly\" subcategories)\n\nFor age groups:\n- Ages 18-29: The \"Disapprove\" total is 65, with \"Strongly\" at 46\n- Ages 65+: The \"Disapprove\" total is 49, with \"Strongly\" at 39\n\nThe gap between the youngest age group (18-29) strongly disapproving at 46% and the oldest age group (65+) strongly disapproving at 39% is:\n\n46 - 39 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 9458, "output_tokens": 227, "cost_micros": 31779, "latency_ms": 6702, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage gap in strong disapproval between the youngest and oldest age groups for Trump's job handling.\n\nLooking at the table titled \"Wide gender gap in approval of Trump\" on page 9, which shows \"% who __ of the way Donald Trump is handling his job as president\":\n\nFor age groups:\n- Ages 18-29: 46% strongly disapprove\n- Ages 65+: 39% strongly disapprove\n\nThe gap between the youngest (18-29) and oldest (65+) age groups in strong disapproval is:\n46% - 39% = 7 percentage points\n\nAnswer: 7", "citations": [], "input_tokens": 7815, "output_tokens": 145, "cost_micros": 25620, "latency_ms": 4266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the percentage gap between the youngest and oldest age groups that strongly disapprove of the way Trump was handling his job.\n\nLooking at the table titled \"Wide gender gap in approval of Trump\" on page 8, I can find the \"Disapprove Strongly\" column for different age groups:\n\n- Ages 18-29: 46% disapprove strongly\n- Ages 65+: 39% disapprove strongly\n\nThe gap between the youngest (18-29) and oldest (65+) age groups is:\n46% - 39% = 7 percentage points\n\nAnswer: 7", "citations": [], "input_tokens": 9558, "output_tokens": 137, "cost_micros": 30729, "latency_ms": 4405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q008", "doc_id": "05-03-18-political-release.pdf", "answer_format": "float", "gold": "7.0", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q008", "answer_letter": null, "raw_text": "I'll read the document to find the information about the survey conducted April 25-May 1, 2018, regarding approval ratings by age group.Let me continue searching for the age group data on approval ratings. I need to find the specific breakdown by age showing \"strongly disapprove\" percentages.Let me search for \"disapprove\" more broadly:Good! I can see references to approval ratings. Let me read that section more carefully:Perfect! I found the table with age group breakdowns. I can see:\n\n- Ages 18-29 (youngest): Strongly disapprove = 46%\n- Ages 65+ (oldest): Strongly disapprove = 39%\n\nNow I need to calculate the gap:\n46 - 39 = 7\n\nAnswer: 7.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44469, "error": null, "extra": {"thread_id": 1908, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_478b9a7833a5424882dc6822080399bd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 233, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "7.0", "normalised_gold": "7.0"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "native_pdf", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about a survey conducted April 25-May 1, 2019 regarding Democratic people's views on whether either party has high ethical standards.\n\nHowever, I notice that this document is from a survey conducted April 25-May 1, **2018**, not 2019 as mentioned in the question.\n\nLooking at page 12 (document page 13), I can see a chart titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes both parties, one party, or neither party.\n\nThe data shows:\n- Total: 17% say it describes both parties, 47% say one party not the other, 25% say neither party\n- Among Dem/Lean Dem: 18% say both parties, 55% say one party, **18%** say neither party\n\nSo among Democrats and Democratic leaners in this survey, 18% said neither party has high ethical standards.\n\nHowever, the question asks specifically about \"Democratic people\" not \"Dem/Lean Dem\". Let me check if there's more specific data.\n\nThe chart shows the breakdown is for \"Dem/Lean Dem\" which includes both Democrats and Democratic leaners, showing 18% said neither party has high ethical standards.\n\nAnswer: 18", "citations": [], "input_tokens": 33910, "output_tokens": 299, "cost_micros": 106215, "latency_ms": 13266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "azure_basic_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about how many Democratic people said neither party has high ethical standards.\n\nLooking at the document, I can see a table on page 11 titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes both parties, describes one party not the other, or describes neither party.\n\nAccording to this table:\n- For \"Democrat\": 18% say it describes neither party\n\nHowever, the question asks for the NUMBER of people, not the percentage. I need to find the actual sample size.\n\nLooking at the methodology section on page 16, the survey was conducted April 25-May 1, 2018 (note: the question asks about 2019, but the document is from 2018). The table shows:\n- Dem/Lean Dem: 710 (unweighted sample size)\n\nSo if 18% of the 710 Democrats/Democratic leaners said neither party has high ethical standards:\n710 \u00d7 0.18 = 127.8\n\nSince we need an integer, this would round to 128.\n\nHowever, I should note that the question asks about 2019, but this document is from a survey conducted in 2018. There is no 2019 data in this document.\n\nBased on the 2018 data available:\n710 \u00d7 0.18 = 127.8 \u2248 128\n\nAnswer: 128", "citations": [], "input_tokens": 6941, "output_tokens": 329, "cost_micros": 25758, "latency_ms": 7719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "azure_premium_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about Democrats who said neither party has high ethical standards in the April 25-May 1, 2018 survey (note: the question says 2019 but the document is from 2018).\n\nLooking at the document, I find a table on page 10 titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes neither party.\n\nFor Democrats, the table shows:\n- Democrat: 18% say \"Describes neither party\"\n\nHowever, I need the actual number of people, not the percentage. \n\nLooking at the methodology section on page 14-15:\n- Total sample: 1,503 adults\n- Dem/Lean Dem: 710 (unweighted sample size)\n\nThe table on page 10 shows that 18% of Democrats said \"high ethical standards\" describes neither party.\n\nCalculating: 710 \u00d7 0.18 = 127.8\n\nSince we need an integer and this represents actual people, this would be approximately 128 people.\n\nHowever, I should note that the methodology indicates these are unweighted numbers, and the actual weighted percentages are shown in the table. Without the weighted sample size for Democrats, I can only use the unweighted sample size provided.\n\n710 \u00d7 0.18 = 127.8 \u2248 128\n\nAnswer: 128", "citations": [], "input_tokens": 9433, "output_tokens": 322, "cost_micros": 33129, "latency_ms": 8750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "llamacloud_basic_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about how many Democratic people said neither party has high ethical standards in the survey conducted April 25-May 1, 2018 (note: the question says 2019, but the document is from 2018).\n\nLooking at the table titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" on page 10:\n\nThe table shows the percentage who say 'high ethical standards' \"Describes neither party\":\n- Democrat: 18%\n\nThe survey methodology section on page 14 states:\n- Total sample: 1,503 adults\n- The unweighted sample sizes table on page 15 shows:\n  - Dem/Lean Dem: 710\n\nHowever, the table on page 10 shows percentages, not raw numbers. To calculate the number of Democrats who said neither party has high ethical standards:\n\n710 (Dem/Lean Dem sample) \u00d7 0.18 (18% who said neither party has high ethical standards) = 127.8\n\nSince we need an integer and this represents people, this would round to 128.\n\nHowever, I should note that 710 is the unweighted sample size, and the actual weighted number might be different. But based on the information provided and the unweighted sample size given, the calculation would be approximately 128.\n\nAnswer: 128", "citations": [], "input_tokens": 7790, "output_tokens": 304, "cost_micros": 27930, "latency_ms": 7687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "llamacloud_premium_lc", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I need to find information about Democrats who said neither party has high ethical standards in the survey conducted April 25-May 1, 2018 (note: the question says 2019, but the document is from 2018).\n\nLooking at the document, I can find a table on page 10 titled \"Nearly a third of college graduates say neither party has 'high ethical standards'\" which shows the percentage who say 'high ethical standards' describes both parties, one party, or neither party.\n\nAccording to this table:\n- Democrat: 18% say it describes neither party\n\nHowever, this is a percentage, not the actual number of people. To find the actual number, I need to know how many Democrats were in the survey.\n\nLooking at page 15, the methodology section shows:\n- Total sample: 1,503 adults\n- Dem/Lean Dem: 710 (unweighted sample size)\n\nThe question asks specifically about \"Democratic people\" (which I interpret as Dem/Lean Dem), and from the table on page 10, 18% of Democrats said neither party has high ethical standards.\n\nCalculating: 710 \u00d7 0.18 = 127.8\n\nSince we need an integer and this represents people, I should round to 128.\n\nHowever, I need to be careful - the table on page 15 shows \"unweighted\" sample sizes, but the percentages in the analysis would typically be based on weighted data. The actual number would be the unweighted count.\n\nUsing the unweighted sample: 710 \u00d7 0.18 = 127.8 \u2248 128\n\nAnswer: 128", "citations": [], "input_tokens": 9533, "output_tokens": 370, "cost_micros": 34149, "latency_ms": 9297, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "128", "normalised_gold": "128"}}
+{"qid": "05-03-18-political-release.pdf::Q010", "doc_id": "05-03-18-political-release.pdf", "answer_format": "int", "gold": "128", "pages": 17, "document_id": 5219, "arm": "surfsense_agentic", "question_id": "05-03-18-political-release.pdf::Q010", "answer_letter": null, "raw_text": "I'll read the document mentioned in the priority list to find the answer to this question.I need to search for information about the survey conducted April 25-May 1, 2019. Let me continue reading the document to find the relevant data.Let me search for \"ethical standards\" in the document:Good! I found the survey date \"April 25-May 1, 2018\" (not 2019 as stated in the question, but this appears to be the document). Let me read the section about ethical standards of the parties:Perfect! I found the table with the data. Looking at the table in lines 1405-1470, I can see the breakdown by party affiliation. The question asks specifically about \"Democratic people\" (Democrats).\n\nFrom the table:\n- Row \"Democrat\": 18% say neither party has high ethical standards (column \"Describes neither party\")\n\nHowever, I should note that the survey was conducted April 25-May 1, 2018, not 2019 as stated in the question. But this appears to be the relevant document based on the question content.\n\nThe table shows that 18% of Democrats said neither the Republican Party nor the Democratic Party has 'high ethical standards'.\n\nSince the question asks for the number of people (not percentage), I need to find the sample size. Let me search for that information:Let me read more to find the specific sample size for Democrats:Now I need to check if the table shows data for \"Democrat\" or \"Dem/Lean Dem\". Let me reread that section:Perfect! Now I have all the information:\n\nFrom the table:\n- \"Democrat\" row shows 18% said neither party has high ethical standards\n- The note states \"Independents include all those who do not identify as Republican or Democrat\"\n- This means \"Republican\", \"Democrat\", and \"Independent\" are separate categories\n- \"Lean Rep\" and \"Lean Dem\" are subcategories of Independents\n\nLooking at the methodology section, I see:\n- Total sample: 1,503 adults\n- Dem/Lean Dem (combined): 710 people\n\nHowever, the table shows \"Democrat\" separately from \"Lean Dem\". I need to find the sample size for just \"Democrat\" (not including leaners).\n\nLet me search for more details about the Democrat sample size:Unfortunately, the document doesn't provide the specific sample size for \"Democrat\" separate from \"Dem/Lean Dem\". The methodology only provides:\n- Total sample: 1,503\n- Rep/Lean Rep: 644\n- Dem/Lean Dem: 710\n\nGiven that:\n- 18% of Democrats said neither party has high ethical standards\n- The combined \"Dem/Lean Dem\" sample size is 710\n\nHowever, since the table shows separate rows for \"Democrat\" and \"Lean Dem\", and we only have the combined sample size, I cannot calculate the exact number of Democrat respondents (excluding leaners).\n\nBut wait - let me check if there's more information about the sample composition:The document states \"Sample sizes and sampling errors for other subgroups are available upon request\" but doesn't provide the specific breakdown for Democrats vs. Dem/Lean Dem.\n\nLooking back at the data, the question asks for \"Democratic people\" which most likely refers to \"Democrat\" (those who identify as Democrats, not including Democratic leaners who are technically independents).\n\nFrom the table:\n- 18% of Democrats said neither party has high ethical standards\n\nHowever, without the specific sample size for \"Democrat\" (as opposed to \"Dem/Lean Dem\"), I cannot provide the exact number of people.\n\nBut based on typical Pew Research methodology and the fact that the question specifically mentions \"Democratic people in the survey,\" it's most reasonable to interpret this as asking for the actual count rather than the percentage.\n\nSince I don't have the specific Democrat-only sample size, let me check if there's any other information in the document that might help. Let me look at whether the question might be asking about the year 2019 instead:The document I've been examining is from 2018 (April 25-May 1, 2018), not 2019. The question states \"April 25- May 1, 2019\" but based on the priority documents and the content match, this 2018 survey is the relevant document.\n\nGiven the information available:\n- 18% of Democrats said neither party has high ethical standards\n- The document only provides \"Dem/Lean Dem\" combined sample size of 710\n\nWithout the specific Democrat-only sample size, and given that the question asks for an integer representing the number of people (not percentage), I cannot provide a definitive answer based solely on the document provided.\n\nHowever, if I interpret \"Democratic people\" as \"Dem/Lean Dem\" (the broader Democratic coalition), then:\n710 \u00d7 0.18 = 127.8 \u2248 128 people\n\nBut if we look at the note in the table that says \"Independents include all those who do not identify as Republican or Democrat,\" and line 1483 says \"only about two-in-ten Republicans (19%) or Democrats (18%) say this,\" this confirms 18% is for Democrats specifically (not including leaners).\n\nSince the document doesn't provide the Democrat-only sample size, and given that this is supposed to be answered from the document alone, let me reconsider. The percentage (18%) is clearly stated in the document. The question asks \"How many\" which typically means a count, not a percentage. Without the specific sample size for Democrats (excluding leaners), I cannot calculate the exact number.\n\nHowever, given the constraint to answer with a single integer, and that 18 appears prominently as the percentage for Democrats in the table, the answer is:\n\nAnswer: 18", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 84969, "error": null, "extra": {"thread_id": 1909, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_265d3866b1d34612a515602bee4564b0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 708, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "128"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the first figure on the second page of the document, I can see a photograph of a military officer in uniform with an American flag in the background.\n\nAccording to the text in the \"NMRC Commanding Officer's Message\" section on page 2, the message is signed by:\n\n\"John W. Sanders III, CAPT, MC, USN\"\n\nThis indicates that Captain John W. Sanders III is the NMRC Commanding Officer shown in that photograph.\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 40393, "output_tokens": 116, "cost_micros": 122919, "latency_ms": 29766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, on page 2, the first figure shows a photo with a caption that reads:\n\n\"Dr. Xiaoyan \"Cathy\" Zou, staff scientist from the Henry Jackson Foundation, discusses research on malaria with Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President, Malaria Initiative.\"\n\nHowever, this caption does not identify a commanding officer. Looking at the text on page 2, under \"NMRC Commanding Officer's Message,\" it states:\n\n\"NMRC Commanding Officer sends, John W. Sanders III, CAPT, MC, USN\"\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 14490, "output_tokens": 161, "cost_micros": 45885, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find the first figure on the second page.\n\nThe second page contains the \"NMRC Commanding Officer's Message\" section. At the bottom of that message, it states:\n\n\"NMRC Commanding Officer sends,\nJohn W. Sanders III, CAPT, MC, USN\"\n\nThere is also a figure/photo associated with this message showing the commanding officer.\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 16140, "output_tokens": 103, "cost_micros": 49965, "latency_ms": 4078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find the first figure on the second page.\n\nThe second page contains two sections: \"NMRC Commanding Officer's Message\" and \"NSMRL Commanding Officer's Message.\"\n\nThe first figure/photo on this page shows \"Dr. Xiaoyan 'Cathy' Zou, staff scientist from the Henry Jackson Foundation, discusses research on malaria with Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President, Malaria Initiative.\"\n\nHowever, the question asks about the commanding officer. Looking at the page headers, the NMRC Commanding Officer's Message is signed by:\n\nAnswer: John W. Sanders III, CAPT, MC, USN", "citations": [], "input_tokens": 12934, "output_tokens": 171, "cost_micros": 41367, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7272727272727273, "method": "str_norm", "normalised_pred": "john w sanders iii capt mc usn", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, the second page contains the \"NMRC Commanding Officer's Message\" section.\n\nThe message is signed by:\n\"John W. Sanders III, CAPT, MC, USN\"\n\nAnswer: Capt. John W. Sanders III", "citations": [], "input_tokens": 14408, "output_tokens": 61, "cost_micros": 44139, "latency_ms": 3796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "capt john w sanders iii", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Capt. John W. Sanders", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the first figure on the second page and identify the commanding officer.Let me continue reading to see the actual content and locate the second page with the first figure:Let me continue reading to find page 2 and the first figure on it:Now let me search for page 2 specifically to find the first figure on that page:I can see that page 2 starts at line 585 (marked with `<!-- PageNumber=\"2\" -->`). The first figure on page 2 appears at lines 634-681. Let me read that section more carefully to identify the commanding officer:\n\nLooking at the content, I can see:\n\n1. Page 2 starts at line 585 with the page break marker\n2. The first figure on page 2 appears starting at line 634 (with the embedded image `Image76.jpg` at line 647)\n3. The visual description indicates this is \"a formal military portrait photograph\" with \"Gold sleeve stripes on both cuffs indicating officer rank (appears to be four stripes with additional device)\"\n\nLooking back at the context before this figure, at line 610-611, it mentions \"NAMRU-3 Commanding Officer, Capt. Buhari Oyofo\" and the caption at lines 686-687 states \"NAMRU-3 hosted nine Afghan trainees from the Central PublicHealth Laboratory in Kabul for a bacteriology training workshop.\"\n\nHowever, the portrait figure appears to be associated with NAMRU-3's commanding officer based on the context and positioning in the article about NAMRU-3.\n\nAnswer: Capt. Buhari Oyofo", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36202, "error": null, "extra": {"thread_id": 1910, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f4f5354a0c7b455995c184df235b687c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 232, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "str_norm", "normalised_pred": "capt buhari oyofo", "normalised_gold": "capt john w sanders"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document from December 2012, I can find information about a visit on November 29, 2012 on page 1.\n\nThe article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 40404, "output_tokens": 160, "cost_micros": 123612, "latency_ms": 34093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for November 29, 2012...\n\nOn page 1, the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 14501, "output_tokens": 147, "cost_micros": 45708, "latency_ms": 4968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for information about a November 29, 2012 visit to the Naval Medical Research Center...\n\nOn page 1, the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 16151, "output_tokens": 158, "cost_micros": 50823, "latency_ms": 4608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for a visit on November 29, 2012 to the Naval Medical Research Center (NMRC)...\n\nOn page 1, in the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative,\" it states:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29, for a brief on the current malaria vaccine research efforts and to tour the facility.\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 12945, "output_tokens": 164, "cost_micros": 41295, "latency_ms": 4907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically the article titled \"NMRC Hosts Visit from U.S. Global Malaria Coordinator, President's Malaria Initiative\" on page 1:\n\n\"SILVER SPRING, Md. - Rear Adm. (Ret.) Tim Ziemer, the U.S. Global Malaria Coordinator, President's Malaria Initiative, visited the Naval Medical Research Center (NMRC), November 29...\"\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer", "citations": [], "input_tokens": 14419, "output_tokens": 120, "cost_micros": 45057, "latency_ms": 3375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "str", "gold": "Tim Ziemer", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about who visited the U.S. Naval Medical Research Centre on November 29, 2012.Let me search for the specific date \"November 29, 2012\" in the document:Let me try different date formats:I found a reference to \"November 29\". Let me read the specific chunk that contains this information:Perfect! I found the answer. Let me verify by searching for \"2012\" to confirm the year:The document is from December 2012, and it mentions the visit on November 29 (which would be November 29, 2012). Based on the text I read, the visitor was Rear Adm. (Ret.) Tim Ziemer, who was the U.S. Global Malaria Coordinator for the President's Malaria Initiative.\n\nAnswer: Rear Adm. (Ret.) Tim Ziemer, U.S. Global Malaria Coordinator, President's Malaria Initiative", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40875, "error": null, "extra": {"thread_id": 1911, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b589ac0b3255440294d5d9346c5a691a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 205, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.25, "method": "str_norm", "normalised_pred": "rear adm ret tim ziemer u s global malaria coordinator president s malaria initiative", "normalised_gold": "tim ziemer"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking through the document for information about NAMRU-3's visit to Monrovia, Liberia in November 2012...\n\nOn page 7, the article \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" states:\n\n\"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, U.S. Marine Col. Vernon Graham", "citations": [], "input_tokens": 40420, "output_tokens": 181, "cost_micros": 123975, "latency_ms": 34391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "dr fatorma bolay, dr walter gwenigale, u s marine col vernon graham", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about the NAMRU-3 team's visit to Monrovia, Liberia in November 2012...\n\nFrom page 7, under \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\":\n\n\"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, Col. Vernon Graham", "citations": [], "input_tokens": 14517, "output_tokens": 176, "cost_micros": 46191, "latency_ms": 5672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "col vernon graham, dr fatorma bolay, dr walter gwenigale", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about NAMRU-3's visit to Monrovia, Liberia in November 2012...\n\nFrom page 7, the article \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" states:\n\n\"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, Col. Vernon Graham", "citations": [], "input_tokens": 16167, "output_tokens": 176, "cost_micros": 51141, "latency_ms": 5250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "col vernon graham, dr fatorma bolay, dr walter gwenigale", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the section titled \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" on page 7:\n\nThe document states: \"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, U.S. Marine Col. Vernon Graham", "citations": [], "input_tokens": 12961, "output_tokens": 156, "cost_micros": 41223, "latency_ms": 5391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "dr fatorma bolay, dr walter gwenigale, u s marine col vernon graham", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, specifically the article \"NAMRU-3 Supports Medical Research Capacity Building in Liberia\" on page 7:\n\nThe text states: \"The NAMRU-3 team visited Monrovia, Liberia in November to meet with key collaborators, including Dr. Walter Gwenigale, the Minister of Health and Social Welfare; Dr. Fatorma Bolay, the Director of LIBR; and U.S. Marine Col. Vernon Graham, the officer in charge of Operation Onward Liberty (OOL).\"\n\nAnswer: Dr. Walter Gwenigale, Dr. Fatorma Bolay, U.S. Marine Col. Vernon Graham", "citations": [], "input_tokens": 14435, "output_tokens": 159, "cost_micros": 45690, "latency_ms": 5828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "dr fatorma bolay, dr walter gwenigale, u s marine col vernon graham", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "list", "gold": "['Walter Gwenigale', 'Fatorma Bolay', 'Vernon Graham']", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about the NAMRU-3 team visit to Monrovia, Liberia in November 2012.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 23750, "error": null, "extra": {"thread_id": 1912, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_83c19c542b6a41ceac941d5981d28799", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 42, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "i ll read document to find information about namru 3 team visit to monrovia, liberia in november 2012", "normalised_gold": "fatorma bolay, vernon graham, walter gwenigale"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 41750, "error": null, "raw_text_chars": 1479}]}, "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I need to count the pictures in the document that contain only one person.\n\nLet me go through the document page by page:\n\nPage 1:\n- Photo of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. (Ret.) Tim Ziemer - 2 people\n\nPage 2:\n- Photo of Capt. John W. Sanders - 1 person\n- Photo of Capt. Steven M. Wechsler - 1 person\n\nPage 3:\n- Photo of Afghan trainees in laboratory - multiple people\n\nPage 4:\n- Group photo of Pacific Partnership 2012 Internal Medicine Department staff - multiple people\n- Photo of Cmdr. Charmagne Beckett on flight deck - 1 person\n\nPage 5:\n- Photo of Petty Officer 1st Class Dennis Gonzales - 1 person\n\nPage 6:\n- Photo of Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n\nPage 7:\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Buhari Oyofo, Dr. Walter T. Gwenigale, Lt. Joseph Diclaro, and Dr. Fatorma Bolay - 5 people\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Chris Martinez, Capt. Buhari Oyofo, Col. Vernon Graham, and Lt. Joseph Diclaro - 5 people\n\nPage 8:\n- Group photo of Kazakh scientists and NMRC staff - multiple people\n\nPage 9:\n- Photo of Lt. j.g. Michael Rucker treating a child - 2 people\n- Photo of U.S. Marines and Sailors in aircraft - multiple people\n\nPage 10:\n- Group photo of Joint Combat Casualty Research Team - multiple people\n\nPage 12:\n- Photo of NMRC 2012 Dining Out Event - multiple people\n- Photo with Rear Adm. Bruce Doll, Capt. John Sanders and others - multiple people\n\nPage 13:\n- Photo with Ellen Flannery, Emmanuel Adu-Gyamfi, Eil", "citations": [], "input_tokens": 40387, "output_tokens": 512, "cost_micros": 128841, "latency_ms": 41735, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll examine each image in the document to count those containing only one person.\n\nGoing through the document:\n\n1. Page 1: Image of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. Ziemer - 2 people\n2. Page 4: Image of Cmdr. Charmagne Beckett on flight deck - 1 person\n3. Page 4: Group photo of Internal Medicine Department staff - multiple people\n4. Page 5: Image of Petty Officer 1st Class Dennis Gonzales - 1 person\n5. Page 5: Image of Lance Cpl. Kip Boker with Cpl. Jacob Echeverri - 2 people\n6. Page 7: Group photo with Col. Vernon Graham - 5 people\n7. Page 7: Group photo with Dr. Walter T. Gwenigale - 5 people\n8. Page 8: Group photo of Kazakh scientists and staff - 9 people\n9. Page 9: Image of Lt. j.g. Michael Rucker treating a child - 2 people\n10. Page 9: Image of U.S. Marines and Sailors - multiple people\n11. Page 10: Group photo of Joint Combat Casualty Research Team - 9 people\n12. Page 11: Graphic illustration (not a photograph of people)\n13. Page 12: Multiple images from Dining Out event - multiple people in each\n14. Page 13: Group photo at Notre Dame - 4 people\n15. Page 14: Image of Cmdr. Daniel Szumlas with Rear Adm. Ziemer - 2 people\n16. Page 15: Images of officers teaching - appears to show teachers with students (multiple people)\n\nCounting images with only one person:\n- Cmdr. Charmagne Beckett on flight deck\n- Petty Officer 1st Class Dennis Gonzales\n\nAnswer: 2", "citations": [], "input_tokens": 14484, "output_tokens": 453, "cost_micros": 50247, "latency_ms": 10203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll go through each picture/photo in the document systematically to count those containing only one person.\n\n1. Page 1: Photo of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. (Ret.) Tim Ziemer - 2 people\n2. Page 2: Photo of Capt. John W. Sanders III - 1 person \u2713\n3. Page 2: Photo of Capt. Steven M. Wechsler - 1 person \u2713\n4. Page 3: Photo of Afghan trainees in laboratory - multiple people\n5. Page 4: Photo of Cmdr. Charmagne Beckett - 1 person \u2713\n6. Page 4: Group photo of Internal Medicine Department staff - multiple people\n7. Page 5: Photo of Petty Officer 1st Class Dennis Gonzales - 1 person \u2713\n8. Page 6: Photo of Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n9. Page 7: Photo with Col. Vernon Graham and team - multiple people\n10. Page 7: Photo with Dr. Gwenigale and team - multiple people\n11. Page 7: Photo of Pfc. Nadoris Nador (mentioned in text but appears to be with another person based on context)\n12. Page 8: Group photo of Kazakh scientists and NMRC staff - multiple people\n13. Page 9: Photo of Lt. j.g. Michael Rucker treating a child - 2 people\n14. Page 9: Photo of U.S. Marines and Sailors - multiple people\n15. Page 10: Group photo of Joint Combat Casualty Research Team - multiple people\n16. Page 12: Group photos from Dining Out event - multiple people in each\n17. Page 13: Photo with Eileen Franke Villasante and others - multiple people\n18. Page 14: Photo of Cmdr. Daniel Szumlas with Rear Adm. Ziemer - 2 people\n19. Page 14: Photo of scouts learning flag etiquette - multiple people\n20. Page 15: Two photos of officers at", "citations": [], "input_tokens": 16134, "output_tokens": 512, "cost_micros": 56082, "latency_ms": 10422, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "20", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll go through each picture in the document systematically to count how many contain only one person.\n\n1. Page 1: Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. Ziemer - 2 people\n2. Page 4: Cmdr. Charmagne Beckett on flight deck - 1 person\n3. Page 5: Petty Officer 1st Class Dennis Gonzales getting swabbed - 2 people\n4. Page 6: Lance Cpl. Kip Boker standing while Cpl. Jacob Echeverri collects sample - 2 people\n5. Page 8: Group photo of Kazakh scientists and NMRC staff - multiple people\n6. Page 9: Lt. j.g. Michael Rucker treating a girl - 2 people\n7. Page 10: JC2RT Team Members group photo - multiple people\n8. Page 11: Graphic illustration (not a photo of people)\n9. Page 12: Group photo at dining out - multiple people\n10. Page 14: Cmdr. Daniel Szumlas with Rear Adm. Ziemer - 2 people\n11. Page 15: Two photos - Cmdr. Charmagne Beckett talking with students (multiple people) and Lt. Cmdr. Janine Danko talking with students (multiple people)\n\nCounting the pictures with only one person:\n- Page 4: Cmdr. Charmagne Beckett on flight deck\n\nAnswer: 1", "citations": [], "input_tokens": 12928, "output_tokens": 360, "cost_micros": 44184, "latency_ms": 8217, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll go through each image in the document and count those containing only one person.\n\n1. Page 1: Dr. Xiaoyan \"Cathy\" Zou discussing research - 2 people visible\n2. Page 4 (top): Cmdr. Charmagne Beckett on flight deck - 1 person\n3. Page 4 (bottom): Pacific Partnership 2012 Internal Medicine Department staff - multiple people (10)\n4. Page 5: Petty Officer 1st Class Dennis Gonzales - 1 person\n5. Page 6: Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n6. Page 7 (top): Group photo with Dr. Gwenigale - 5 people\n7. Page 7 (bottom): Group photo with Col. Graham - 5 people\n8. Page 8: Group photo of Kazakh scientists - 9 people\n9. Page 9 (top): U.S. Marines and Sailors in aircraft - multiple people\n10. Page 9 (bottom): Lt. j.g. Michael Rucker treating girl - 2 people\n11. Page 10: Joint Combat Casualty Research Team - 9 people\n12. Page 12 (top): Dining Out Event photo - multiple people at tables\n13. Page 12 (bottom): Group photo at Dining Out - 6 people\n14. Page 13: Group photo at Notre Dame - 4 people\n15. Page 14 (top): Cmdr. Szumlas with Rear Adm. Ziemer - 3 people\n16. Page 14 (middle): Dr. Andrea Keane-Myers at podium - 1 person\n17. Page 14 (bottom): NAMRU-3 staff with Cub Scouts - multiple people\n18. Page 15 (left): Cmdr. Charmagne Beckett with student - 2 people\n19. Page 15 (right): Lt. Cmdr. Janine Danko with students - 3 people\n20. Page 15 (bottom): Lt. Danett Bishop in classroom - 1 ", "citations": [], "input_tokens": 14402, "output_tokens": 512, "cost_micros": 50886, "latency_ms": 10640, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "20", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I'll examine the document to count pictures containing only one person.Let me continue reading to see all images in the document.Let me read the rest of the document more systematically by checking remaining chunks.Now let me read the visual descriptions for each image systematically to count how many contain only one person. Let me read through the remaining images I haven't seen yet.Now let me read the final images:Now let me count all the images with only one person based on the visual descriptions I've read:\n\nBased on my review of all the images in the document, here are the ones that contain only ONE person:\n\n1. **Image75.jpg** - Formal military portrait photograph of a naval officer (one person)\n2. **Image76.jpg** - Formal military portrait photograph (one person in uniform)\n3. **Image83.jpg** - Person on naval vessel deck wearing Navy uniform (one person)\n4. **Image86.jpg** - Close-up photograph of a young male service member in desert camouflage (one person)\n5. **Image125.jpg** - Woman speaking at a podium at Johns Hopkins (one person)\n6. **Image133.jpg** - Person in military uniform standing and presenting with equipment (one person)\n\nImages with MULTIPLE people:\n- Image9.jpg - Banner header (no people, just background with multiple individuals in soft focus)\n- Image55.png - QR code (no people)\n- Image69.jpg - Laboratory setting with TWO individuals\n- Image79.jpg - Laboratory scene with approximately 6 people\n- Image82.jpg - Group of 10 individuals\n- Image89.jpg - Emblem/insignia (no people)\n- Image90.jpg - Three individuals visible\n- Image93.jpg - Five individuals\n- Image94.jpg - Five individuals\n- Image97.jpg - Group of approximately 10 individuals\n- Image100.jpg - Healthcare provider and pediatric patient (TWO people)\n- Image101.jpg - Military transport aircraft interior with many personnel\n- Image103.jpg - Nine military personnel in front of helicopter\n- Image109.jpg - Diagram (no people)\n- Image112.jpg - Banquet hall with many attendees\n- Image113.jpg - Five individuals\n- Image116.jpg - Four individuals\n- Image127.jpg - Group of 10-12 people\n- Image128.jpg - Three men in conversation\n- Image131.jpg - Two women in foreground\n- Image132.jpg - Three individuals\n- Image139.jpg - Emblem/logo (no people)\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 57125, "error": null, "extra": {"thread_id": 1913, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8b24b6e5b403450996ab78f93d47888d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 381, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "native_pdf", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the images in the document to count the number of people in each.\n\nLet me go through each page:\n\nPage 1: Contains one photo showing two people (Dr. Xiaoyan \"Cathy\" Zou and Rear Adm. Tim Ziemer)\n\nPage 2: Contains two photos - one of Capt. Sanders (1 person) and one of Capt. Wechsler (1 person)\n\nPage 3: Contains one photo showing multiple people in a laboratory setting - I can count approximately 9 people\n\nPage 4: Contains one photo showing a group of medical staff - I can count approximately 10 people in the Internal Medicine Department staff photo\n\nPage 5: Contains one photo showing one person (Petty Officer 1st Class Dennis Gonzales)\n\nPage 6: Contains two photos - one showing multiple people swabbing (2 people visible), and one showing a group photo with approximately 9 people\n\nPage 7: Contains two photos - both showing groups of approximately 4-5 people each\n\nPage 8: Contains one photo showing approximately 8 people (Kazakh scientists and NMRC staff)\n\nPage 9: Contains two photos - one showing many Marines/Sailors on aircraft (approximately 50+ people), and one showing 2 people\n\nPage 10: Contains one photo showing approximately 9 people (JC2RT team members)\n\nPage 11: Contains one image/diagram (not a photo of people)\n\nPage 12: Contains two photos showing dining out events with multiple people at tables\n\nPage 13: Contains one photo showing 4 people\n\nPage 14: Contains two photos showing 2-3 people in each\n\nPage 15: Contains two photos showing 2-3 people in each\n\nPage 16: Contains no photos of people\n\nThe page with the most people is page 9, which contains the photo of U.S. Marines and Sailors on the aircraft with approximately 50+ people visible.\n\nAnswer: 9", "citations": [], "input_tokens": 40400, "output_tokens": 450, "cost_micros": 127950, "latency_ms": 40312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "azure_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I'll examine each page of the document to count the number of people visible in pictures.\n\nPage 1: Picture shows one person (Dr. Xiaoyan \"Cathy\" Zou) with another person (Rear Adm. Ziemer)\n\nPage 2: No pictures with people\n\nPage 3: Picture shows people in protective equipment - difficult to count exactly but appears to be a few people\n\nPage 4: Multiple pictures - one shows Cmdr. Charmagne Beckett (1 person), another group photo shows approximately 10 people (the Internal Medicine Department staff)\n\nPage 5: Two pictures - one shows a person treating a child (2 people), another shows Marines/Sailors (multiple people but distant)\n\nPage 6: Two pictures - one shows a person standing while another collects a sample (2 people), another shows people at a food event (several people visible)\n\nPage 7: Two pictures - one shows 5 people posing together, another shows 5 people in a meeting\n\nPage 8: Picture shows 9 people (Kazakh scientists and staff)\n\nPage 9: Two pictures - one shows medical treatment (2-3 people), another shows troops (multiple but distant)\n\nPage 10: Picture shows approximately 9 people in the JC2RT team photo\n\nPage 11: One diagram/illustration, not a photograph of real people\n\nPage 12: Two pictures - one shows a dining event with many people, another shows 6 people\n\nPage 13: Picture shows 4 people at Notre Dame\n\nPage 14: Two pictures - one shows 2 people in lab, another shows scouts with adults (approximately 6-8 people)\n\nPage 15: Two pictures at Elizabeth Seton High School showing interactions with students - multiple people in each\n\nPage 16: No pictures with people\n\nThe page with the most people appears to be page 12, which shows the NMRC Dining Out event with many attendees visible in the first picture.\n\nAnswer: 12", "citations": [], "input_tokens": 14497, "output_tokens": 449, "cost_micros": 50226, "latency_ms": 12717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "azure_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the pictures in the document and count the number of people visible in each one to determine which page has the picture with the most people.\n\nLet me go through each page systematically:\n\n**Page 1:** Picture shows two people (Dr. Xiaoyan \"Cathy\" Zou and Rear Adm. Ziemer)\n\n**Page 2:** Two headshot photos of commanding officers (single person each)\n\n**Page 3:** One picture showing multiple people in what appears to be a laboratory setting with people in protective gear\n\n**Page 4:** One picture showing Cmdr. Charmagne Beckett (single person), and one group photo with approximately 10 people (Internal Medicine Department staff physicians)\n\n**Page 5:** Two pictures - one showing a person getting a swab sample (2 people), another showing two people (one standing, one collecting sample)\n\n**Page 6:** Two pictures with small groups (3-5 people each)\n\n**Page 7:** Two pictures with small groups (3-5 people each)\n\n**Page 8:** One group photo showing approximately 9 people (Kazakh scientists and NMRC staff)\n\n**Page 9:** Two individual pictures (single subjects each)\n\n**Page 10:** No pictures with people visible in detail\n\n**Page 11:** One diagram/illustration (not a photograph of actual people)\n\n**Page 12:** Two pictures - one appears to be a group photo at a dining event, and another showing 6 people (smaller group photo with officers and spouses)\n\n**Page 13:** One picture showing 4 people\n\n**Page 14:** One picture showing people in what appears to be a Scout activity\n\n**Page 15:** Two pictures showing classroom/teaching scenes with multiple students\n\n**Page 16:** One picture showing a single person\n\nThe largest group photo appears to be on **page 4**, which shows the Internal Medicine Department staff physicians with approximately 10 people clearly visible in the photograph.\n\nAnswer: 4", "citations": [], "input_tokens": 16147, "output_tokens": 443, "cost_micros": 55086, "latency_ms": 11452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "llamacloud_basic_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the pictures in the document and count the number of people in each one.\n\nLet me go through each page:\n\nPage 1: Picture of two people (Dr. Xiaoyan \"Cathy\" Zou and Rear Adm. Ziemer)\n\nPage 2: No pictures with people\n\nPage 3: No pictures with people\n\nPage 4: Picture of one person (Cmdr. Charmagne Beckett on flight deck)\n\nPage 5: Picture of one person (Petty Officer 1st Class Dennis Gonzales getting swabbed)\n\nPage 6: Picture of two people (Lance Cpl. Kip Boker and Cpl. Jacob Echeverri)\n\nPage 7: No pictures with people\n\nPage 8: Picture with 9 people (Kazakh scientists group photo - Back row has 7 people: Gulnar Omasheva, Alexey Andryuchshenko, Sarah Pisarcik, Heidi St. John, Dr. Kenneth Yeh, Dr. Allen Richards, and Talgat Nurmakhanov. Bottom row has 3 people: Dr. Ekaterine Adeishvili, Lyazzat Musralina, and Dr. Ju Jiang. Total: 10 people)\n\nPage 9: Picture of one person (Lt. j.g. Michael Rucker treating a patient)\n\nPage 10: Picture with 9 people (JC2RT Team Members - Back row: 5 people, Front row: 4 people)\n\nPage 11: Graphic illustration (no people photo)\n\nPage 12: Picture with 6 people (Rear Adm. Bruce Doll, NMRC Commanding Officer Capt. John Sanders, Dr. Leighann Sanders, NMRC Executive Officer Capt. Elizabeth Montcalm-Smith, and Dr. Chris Smith - actually appears to be 5 people based on the caption listing)\n\nPage 13: No pictures with people\n\nPage 14: Picture with 2 people (Cmdr. Daniel Szumlas and Rear Adm. Ziemer with one other person visible = 3 people total in the image)\n\nPage ", "citations": [], "input_tokens": 12941, "output_tokens": 512, "cost_micros": 46503, "latency_ms": 12328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "llamacloud_premium_lc", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine all the pictures in the document and count the number of people in each to determine which page has the picture with the most people.\n\nLet me go through the document page by page:\n\n**Page 1:** Picture of Dr. Xiaoyan \"Cathy\" Zou discussing research with Rear Adm. (Ret.) Tim Ziemer - 2 people\n\n**Page 4:** Two pictures:\n- Cmdr. Charmagne Beckett on flight deck - 1 person\n- Pacific Partnership 2012 Internal Medicine Department staff - 10 people\n\n**Page 5:** No pictures with people clearly visible\n\n**Page 6:** Two pictures:\n- Petty Officer 1st Class Dennis Gonzales - 1 person\n- Lance Cpl. Kip Boker with Cpl. Jacob Echeverri - 2 people\n\n**Page 7:** Two pictures:\n- Group with Lt. Cmdr. Jennifer Curry, Capt. Buhari Oyofo, Dr. Walter T. Gwenigale, Lt. Joseph Diclaro, and Dr. Fatorma Bolay - 5 people\n- Group with military personnel - 5 people\n\n**Page 8:** Group photo with Kazakh scientists - 9 people (Top row: 7 people, Bottom row: 3 people, total 10 people visible)\n\n**Page 9:** Two pictures:\n- U.S. Marines and Sailors in aircraft cargo hold - This appears to have many people seated in rows, likely 30+ people\n- Lt. j.g. Michael Rucker with young girl - 2 people\n\n**Page 10:** Group photo of Joint Combat Casualty Research Team - 9 people\n\n**Page 12:** Picture at Dining Out event - 6 people visible\n\n**Page 13:** Picture with 4 people outdoors\n\n**Page 14:** Two pictures:\n- Cmdr. Daniel Szumlas with visitors - 3 people\n- NAMRU-3 staff teaching Cub Scouts flag folding - approximately 8-10 people\n\n**Page 15:** Two pictures with 2-3 people each\n\nThe picture on page 9 showing \"U", "citations": [], "input_tokens": 14415, "output_tokens": 512, "cost_micros": 50925, "latency_ms": 12344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "answer_format": "int", "gold": "9", "pages": 16, "document_id": 5220, "arm": "surfsense_agentic", "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find all pictures and count the number of people in each.Let me continue reading the document to find all images and their descriptions:Now let me read each image section to count the people in each. Let me check the images systematically:Let me search for page numbers and images with people descriptions:Now let me systematically check each page for images with people. Let me read the sections around each page number:Good! This image on page 3 has \"approximately 6 people\". Let me continue checking other pages:Excellent! This image has \"10 individuals\". Let me continue checking:This image has 1 person. Let me check page 5:This image has 1 person. Let me check more pages:This is an emblem with 0 people. Let me continue:This image has \"Three individuals visible in foreground and midground\" plus \"Additional blurred figures visible in far background\". Let me read more:This image has \"Five individuals\". Let me continue checking more pages:This has 5 people. Let me check page 8:Good! This image has \"approximately 10 individuals\". Let me check page 9:This image has 2 people (healthcare provider and patient). Let me check page 9 more:This aircraft image has many people, but let me get a more specific count. Let me continue checking other pages:Excellent! This image has \"nine military personnel\". Let me check page 11:This dining hall image has many people seated. Let me count more carefully by reading the description again. It mentions \"Audience members\" and \"Three individuals in naval/formal uniforms standing at front\" plus \"Head table visible in background with seated guests\". This is a large gathering but not specified exactly.\n\nLet me check more pages systematically:This has 5 people. Let me check page 13:This has 1 person at the podium. Let me check page 14:Page 14 has an image with \"approximately 10-12 people\" (Image127.jpg) and another with 3 people (Image128.jpg). Let me continue:This has \"Two women standing in foreground center\" plus \"Multiple people in burgundy/maroon uniforms scattered throughout background\". Let me check page 15:This image has \"Three individuals posing together\" plus \"Additional person visible in background on right side\" = 4 people total. Let me check page 16:Page 16 has an image with 1 person (Image133.jpg) and an emblem (Image139.jpg) with 0 people.\n\nNow let me summarize what I found. Let me verify the counts more carefully by re-reading specific key images:\n\nBased on my review:\n- Page 1: Header image with \"Multiple individuals visible in soft focus\"\n- Page 2: Individual portraits (1 person each)\n- Page 3: Laboratory scene with \"approximately 6 people\" (Image79.jpg) AND group photo with \"10 individuals\" (Image82.jpg)\n- Page 4: 1 person\n- Page 5: 1 person\n- Page 6: 5 people (two images with 3 and 5 people)\n- Page 7: 5 people\n- Page 8: \"approximately 10 individuals\" (Image97.jpg)\n- Page 9: Aircraft with many people + medical scene with 2 people\n- Page 10: Helicopter group with \"nine military personnel\" (Image103.jpg)\n- Page 11: Diagram (0 people) and banquet hall with many people (Image112.jpg - but not exactly counted)\n- Page 12: 5 people\n- Page 13: 4 people and 1 person\n- Page 14: \"approximately 10-12 people\" and 3 people\n- Page 15: Multiple people but not clearly counted\n- Page 16: 1 person and emblem\n\nThe images with the most people appear to be:\n- Page 3: Image82.jpg with \"10 individuals\"\n- Page 8: Image97.jpg with \"approximately 10 individuals\"  \n- Page 10: Image103.jpg with \"nine military personnel\"\n- Page 14: Image127.jpg with \"approximately 10-12 people\"\n- Page 11: Image112.jpg (banquet hall with many people)\n\nLet me check the banquet hall image more carefully:\n\nThe banquet hall description mentions \"Audience members\" seated at tables and \"Three individuals in naval/formal uniforms standing at front\" and \"Head table visible in background with seated guests\" but doesn't give an exact count. Given it's a formal dining out event, there are likely many more than 12 people, but without an exact count stated, I should go with the clearly stated counts.\n\nThe highest clearly stated count is **\"approximately 10-12 people\"** on **page 14**.\n\nAnswer: 14", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 167000, "error": null, "extra": {"thread_id": 1914, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a7ec370e12904bd3b5569447e3077906", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1160, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "9"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, specifically page 2, I can find information about where Gestalt psychology was conceived.\n\nThe text states: \"Gestalt psychology was conceived in the Berlin School of Experimental Psychology\"\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 24360, "output_tokens": 56, "cost_micros": 73920, "latency_ms": 8031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I can find information about Gestalt psychology in the section titled \"Gestalt Principles of Visual Perception.\"\n\nThe text states: \"Gestalt psychology was conceived in the Berlin School of Experimental Psychology\"\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 5194, "output_tokens": 62, "cost_micros": 16512, "latency_ms": 3421, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "According to the document, Gestalt psychology was conceived in the Berlin School of Experimental Psychology.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 6446, "output_tokens": 32, "cost_micros": 19818, "latency_ms": 2437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "According to the document, Gestalt psychology was conceived in the Berlin School of Experimental Psychology.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 4354, "output_tokens": 32, "cost_micros": 13542, "latency_ms": 3719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "According to the document, Gestalt psychology was conceived in the Berlin School of Experimental Psychology.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 9744, "output_tokens": 32, "cost_micros": 29712, "latency_ms": 3890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Berlin School of Experimental Psychology", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the relevant document to find information about where Gestalt psychology was conceived.Let me search for \"Gestalt\" to find the relevant section:Perfect! I found the answer. The document clearly states where Gestalt psychology was conceived.\n\nAnswer: Berlin School of Experimental Psychology", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 28250, "error": null, "extra": {"thread_id": 1915, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2a7b4cbe305b48eca21d964a816bfb01", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 102, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "berlin school of experimental psychology", "normalised_gold": "berlin school of experimental psychology"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking at page 14, I can see a chart titled \"Time spent on weekends\" comparing 2005 and 2010. The chart shows two pie charts/clock-like circular diagrams.\n\nFor the 2010 chart (the larger circle on the right), I can see a segment labeled \"with family and friends\" which shows 21%.\n\nAnswer: 21", "citations": [], "input_tokens": 24371, "output_tokens": 109, "cost_micros": 74748, "latency_ms": 10109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "21.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking at the document, I can see a chart titled \"Time spent on weekends\" that compares 2005 and 2010 data.\n\nIn this chart, for 2010, I can see:\n- Shopping 10%\n- Fitness 6%\n- With family and friends 35%\n- Eating out 17%\n- Hobbies 4%\n- Net surfing 10%\n- Travelling 6%\n- Reading 4%\n- Watching films 22%\n\nWait, I see \"With family and friends\" appears to be listed as 35% for 2010.\n\nHowever, looking more carefully at the layout, the percentages seem to be:\n- With family and friends 21% (this appears to be for 2005)\n- With family and friends 35% (this appears to be for 2010)\n\nThe chart shows that in 2010, 35% of time on weekends was spent with family and friends.\n\nAnswer: 35", "citations": [], "input_tokens": 5205, "output_tokens": 251, "cost_micros": 19380, "latency_ms": 5953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "35.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the chart on page 14 titled \"Time spent on weekends\", I can see two pie charts comparing 2005 and 2010.\n\nIn the 2010 chart (on the right), the segment labeled \"With family and friends\" shows 35%.\n\nAnswer: 35", "citations": [], "input_tokens": 6457, "output_tokens": 67, "cost_micros": 20376, "latency_ms": 3467, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "35.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking through the document, I can see a table titled \"Time spent on weekends\" which shows data for 2005 and 2010.\n\nIn this table, the row \"With family and friends\" shows:\n- 2005: 21%\n- 2010: 17%\n\nAnswer: 17", "citations": [], "input_tokens": 4365, "output_tokens": 97, "cost_micros": 14550, "latency_ms": 3280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "17.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the chart on page 14 that shows time spent with family and friends in 2010.\n\nLooking at the document, I can see a table titled \"Time spent on weekends\" which shows data for 2005 and 2010.\n\nThe table shows:\n| Activity                | 2005 | 2010 |\n| ----------------------- | ---- | ---- |\n| With family and friends | 35   | 21   |\n\nAccording to this table, time spent with family and friends in 2010 was 21.\n\nAnswer: 21", "citations": [], "input_tokens": 9755, "output_tokens": 132, "cost_micros": 31245, "latency_ms": 3875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "21.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "float", "gold": "21%", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the chart on page 14 and extract the information about time spent with family and friends in 2010.Let me search for page 14 content in the document:Let me search for content related to \"family and friends\" and \"2010\":Great! I found the relevant information. Let me read that specific chunk to get the exact details:Perfect! I can see from the OCR text that the chart shows time spent on weekends in 2005 and 2010. Looking at the data:\n\nFor 2010:\n- \"With family and friends 21%\" appears on line 2410\n- \"With family and friends 17%\" appears on line 2412-2413\n\nWait, let me read this more carefully. The structure shows:\n- Line 2403: \"2005\"\n- Line 2404: \"2010\"\n- Line 2410: \"With family and friends 21%\"\n- Line 2412-2413: \"With family and friends 17%\"\n\nIt appears that 21% is associated with 2005, and 17% is associated with 2010 based on the layout. Let me verify by reading more context:Looking at the OCR text structure, it shows two columns (2005 and 2010). The pattern suggests:\n- Shopping: 10% (2005) and 10% (2010)\n- Fitness: 5% (2005) and 6% (2010)\n- With family and friends: 21% (2005) and 17% (2010)\n\nSo for 2010, the time spent with family and friends is 17%.\n\nLet me verify this is from page 14 by looking for page markers:Let me search for \"14\" near this chart content:Let me look further back in the document to find page numbering:Let me look for page markers earlier in the document structure:Let me check the document structure at the beginning to understand how pages are marked:I can see there are PageBreak markers. Let me count the PageBreaks to find page 14. Let me search more systematically:I can see the PageBreaks. The one at line 2340 appears to be just before the \"Time spent on weekends\" chart. Let me count the PageBreaks:\n\n1. Line 135\n2. Line 153\n3. Line 360\n4. Line 564\n5. Line 830\n6. Line 1381\n7. Line 1630\n8. Line 1676\n9. Line 1882\n10. Line 1951\n11. Line 2012\n12. Line 2143\n13. Line 2340\n14. Line 2470\n\nThe PageBreak at line 2340 is the 13th PageBreak. If we count pages starting from page 1, then after the 13th PageBreak we would be on page 14. The chart \"Time spent on weekends\" starts at line 2344, which is right after the 13th PageBreak at line 2340, so this chart is indeed on page 14.\n\nBased on the OCR text I found earlier, the chart shows:\n- 2005 column and 2010 column\n- \"With family and friends 21%\" (for 2005)\n- \"With family and friends 17%\" (for 2010)\n\nAnswer: 17", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 79187, "error": null, "extra": {"thread_id": 1916, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6038a83cdf13408480954ef7afd6b6b7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 590, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "17.0", "normalised_gold": "21.0"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the chart that tracks the West Nile Virus and look for the legend showing what the red color represents for distance from the Mississippi River.\n\nLooking at page 10, I can see a chart titled \"Tracking the West Nile Virus\" which shows a 3D map of what appears to be the United States with different colored regions.\n\nThe legend on the right side shows \"Approximate Distance from the Mississippi River (miles)\" with the following color-coded ranges:\n- Red: 0 - 375\n- Orange: 375 - 750\n- Yellow: 750 - 1125\n- Light yellow/cream: 1125 - 1500\n\nThe red color represents the closest distance range to the Mississippi River.\n\nAnswer: 0 - 375 miles", "citations": [], "input_tokens": 24377, "output_tokens": 178, "cost_micros": 75801, "latency_ms": 9609, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "0 375 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the chart tracking the West Nile Virus and identify what the red color represents.\n\nI can see the \"Tracking the West Nile Virus\" chart which shows:\n\n**Approximate Distance from the Mississippi River (miles)**\n\nThe legend shows four distance ranges with different colors:\n- 0-375\n- 375 - 750\n- 750 -1125\n- 1125 - 1500\n\nBased on the typical color coding in such charts where red usually represents the most extreme or furthest category, the red color would represent the range **1125 - 1500** miles.\n\nNote: The question mentions \"in Europe\" but the chart shows distances from the Mississippi River, which is in North America, not Europe. The chart appears to be tracking West Nile Virus in relation to the Mississippi River region.\n\nAnswer: 1125 - 1500 miles", "citations": [], "input_tokens": 5211, "output_tokens": 207, "cost_micros": 18738, "latency_ms": 6687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "str_norm", "normalised_pred": "1125 1500 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "I need to look at the \"Tracking the West Nile Virus\" chart in the document.\n\nLooking at the chart, I can see a legend that shows \"Approximate Distance from the Mississippi River (miles)\" with different ranges:\n\n- 0-375\n- 375-750\n- 750-1125\n- 1125-1500\n\nThe legend shows different shading/colors corresponding to these ranges. Based on the visual representation in the legend, the red color corresponds to the range 0-375 miles.\n\nHowever, I should note that the question asks about tracking West Nile Virus \"in Europe,\" but the chart actually shows tracking relative to the Mississippi River, which is in the United States, not Europe.\n\nAnswer: 0-375 miles", "citations": [], "input_tokens": 6463, "output_tokens": 170, "cost_micros": 21939, "latency_ms": 6297, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "0 375 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the \"Tracking the West Nile Virus\" chart in the document, I can see a legend that shows height-based ranges for approximate distance from the Mississippi River in miles.\n\nThe legend shows:\n- 0-375\n- 375-750\n- 750-1125\n- 1125-1500\n\nHowever, the document does not specify which color (including red) corresponds to which distance range in this chart. Additionally, the chart tracks West Nile Virus in relation to the Mississippi River, which is in the United States, not Europe as stated in the question.\n\nAnswer: The color coding for distance ranges is not specified in the document, and the chart is about the U.S., not Europe.", "citations": [], "input_tokens": 4371, "output_tokens": 160, "cost_micros": 15513, "latency_ms": 6000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "color coding for distance ranges is not specified in document and chart is about u s not europe", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the chart about West Nile Virus and check what the red color represents.\n\nOn the page titled \"Tracking the West Nile Virus,\" I can see a 3D map of the United States (not Europe) with a legend showing:\n\n\"Approximate Distance from the Mississippi River (miles)\n[red square] 0 - 375\"\n\nHowever, I should note that this chart tracks West Nile Virus in the United States, not Europe. There is no chart about West Nile Virus in Europe in this document.\n\nAnswer: The chart tracks the United States, not Europe; red represents 0-375 miles from the Mississippi River.", "citations": [], "input_tokens": 9761, "output_tokens": 157, "cost_micros": 31638, "latency_ms": 4453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.35294117647058826, "method": "str_norm", "normalised_pred": "chart tracks united states not europe red represents 0 375 miles from mississippi river", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "0-375 miles", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find the chart about West Nile Virus in Europe and identify what the red color represents for distance from the Mississippi River.Let me search for content related to West Nile Virus and charts showing distance from the Mississippi River.I found a reference to a red zone around the Mississippi River. Let me read that section more carefully:Let me look at the beginning of this section to find the title and legend information:Perfect! I found the chart. Now let me check which color corresponds to which distance range. The legend shows:\n\n- 0-375 miles\n- 375-750 miles\n- 750-1125 miles\n- 1125-1500 miles\n\nAnd the description mentions that the red zone is the highest concentration around the Mississippi River area. Let me verify by checking if there's more specific information about the color coding:Based on the document, I can see that:\n- The chart shows \"Tracking the West Nile Virus\" (not in Europe as the question states, but in the US)\n- The legend shows distance ranges from the Mississippi River: 0-375, 375-750, 750-1125, 1125-1500 miles\n- The red zone represents the highest concentration, which is in the Central US region, particularly around the Mississippi River area\n- The color gradient goes: Red \u2192 Orange \u2192 Yellow \u2192 Pale Yellow (indicating decreasing distance/intensity)\n\nSince red is the highest concentration around the Mississippi River area, red represents the closest distance range.\n\nAnswer: 0-375 miles", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54702, "error": null, "extra": {"thread_id": 1917, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_78ac91a640ab4a709acb37a25bd11e6b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 311, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "0 375 miles", "normalised_gold": "0 375 miles"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at page 11 of the document, the Law of Good Gestalt is defined as:\n\n\"Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.\"\n\nThe document further explains that this law means individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form, and focuses on the idea of conciseness.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 24362, "output_tokens": 109, "cost_micros": 74721, "latency_ms": 10577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document for the definition of the law of good Gestalt...\n\nThe document states: \"Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly. This law implies that as individuals perceive the world, they eliminate complexity and unfamiliarity so they can observe a reality in its most simplistic form.\"\n\nAnswer: Elements are grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 5196, "output_tokens": 104, "cost_micros": 17148, "latency_ms": 3671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7878787878787878, "method": "str_norm", "normalised_pred": "elements are grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "According to the document, the law of good Gestalt is defined as:\n\nElements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly. The document also explains that individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form, and that this law focuses on the idea of conciseness.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 6448, "output_tokens": 102, "cost_micros": 20874, "latency_ms": 4391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "According to the document, the law of good gestalt is defined as:\n\nElements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly. The law implies that individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form. It focuses on the idea of conciseness and implies a global regularity that is mentally prioritized over spatial relations.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 4356, "output_tokens": 111, "cost_micros": 14733, "latency_ms": 3484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document's section on \"Gestalt Principles of Visual Perception\" and specifically the \"Law of good Gestalt\":\n\nThe document defines it as: **Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.**\n\nThe document further explains this implies that individuals eliminate complexity and unfamiliarity to observe reality in its most simplistic form, and that this law focuses on the idea of conciseness.\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 9746, "output_tokens": 125, "cost_micros": 31113, "latency_ms": 4062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the priority document to find the definition of the law of good gestalt.Let me search for \"gestalt\" in this document to find the relevant section.Let me read the document content to find mentions of \"gestalt\":Now let me read the specific section around line 1965 where \"good gestalt\" is mentioned:Perfect! I found the definition. Let me read a bit more context to ensure I have the complete definition:Based on the document, the law of good gestalt is defined as:\n\n\"Elements of objects tend to be perceptually grouped together if they form a pattern that is regular, simple, and orderly.\"\n\nAnswer: Elements grouped together if they form a pattern that is regular, simple, and orderly", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36484, "error": null, "extra": {"thread_id": 1918, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d9af7588088a4b3184b7d88eec528363", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 192, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.8125000000000001, "method": "str_norm", "normalised_pred": "elements grouped together if they form pattern that is regular simple and orderly", "normalised_gold": "elements of objects tend to be perceptually grouped together if they form pattern that is regular simple and orderly"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine the chart on page 14 that shows \"Time spent on weekends\" comparing 2005 and 2010.\n\nLooking at the two pie charts:\n\n2005 (smaller chart):\n- With family and friends: 35%\n- Watching films: 20%\n- Reading: 10%\n- Travelling: 5%\n- Net surfing: 3%\n- Hobbies: 2%\n- Eating out: 10%\n- Fitness: 5%\n- Shopping: 10%\n\n2010 (larger chart):\n- With family and friends: 21%\n- Watching films: 22%\n- Eating out: 17%\n- Shopping: 10%\n- Fitness: 6%\n- Hobbies: 4%\n- Net surfing: 4%\n- Travelling: 6%\n\nCalculating the changes:\n- With family and friends: 21% - 35% = -14% (decrease)\n- Watching films: 22% - 20% = +2% (increase)\n- Reading: appears to be gone or very small\n- Eating out: 17% - 10% = +7% (increase)\n- Shopping: 10% - 10% = 0%\n- Fitness: 6% - 5% = +1% (increase)\n- Hobbies: 4% - 2% = +2% (increase)\n- Net surfing: 4% - 3% = +1% (increase)\n- Travelling: 6% - 5% = +1% (increase)\n\nThe largest increase is \"Eating out\" with a 7 percentage point increase.\n\nAnswer: Eating out", "citations": [], "input_tokens": 24374, "output_tokens": 401, "cost_micros": 79137, "latency_ms": 13093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "eating out", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the table showing time spent on weekends comparing 2005 and 2010.\n\nLooking at the document, I can see a table titled \"Time spent on weekends\" with data for 2005 and 2010:\n\n2005 \u2192 2010:\n- Shopping: 10% \u2192 10% (0% change)\n- Fitness: 5% \u2192 6% (1% increase)\n- With family and friends: 21% \u2192 35% (14% increase)\n- Eating out: 10% \u2192 17% (7% increase)\n- Hobbies: 2% \u2192 4% (2% increase)\n- Net surfing: 3% \u2192 6% (3% increase)\n- Travelling: 5% \u2192 10% (5% increase)\n- Reading: 10% \u2192 4% (6% decrease)\n- Watching films: 20% \u2192 22% (2% increase)\n\nThe largest increase is \"With family and friends\" which went from 21% to 35%, an increase of 14 percentage points.\n\nAnswer: With family and friends", "citations": [], "input_tokens": 5208, "output_tokens": 260, "cost_micros": 19524, "latency_ms": 4875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "with family and friends", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the \"Time spent on weekends\" chart comparing 2005 and 2010:\n\n2005:\n- With family and friends: 21%\n- Watching films: 20%\n- Reading: 10%\n- Shopping: 10%\n- Eating out: 10%\n- Travelling: 5%\n- Fitness: 5%\n- Net surfing: 3%\n- Hobbies: 2%\n\n2010:\n- With family and friends: 35%\n- Watching films: 22%\n- Reading: 10%\n- Shopping: 10%\n- Eating out: 6%\n- Travelling: 4%\n- Fitness: 6%\n- Net surfing: 4%\n- Hobbies: 4%\n\nCalculating increases:\n- With family and friends: 35% - 21% = +14%\n- Watching films: 22% - 20% = +2%\n- Hobbies: 4% - 2% = +2%\n- Fitness: 6% - 5% = +1%\n- Net surfing: 4% - 3% = +1%\n\nAnswer: With family and friends", "citations": [], "input_tokens": 6460, "output_tokens": 284, "cost_micros": 23640, "latency_ms": 5328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "with family and friends", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the \"Time spent on weekends\" table comparing 2005 and 2010:\n\n- Shopping: 10% to 10% (no change)\n- Fitness: 5% to 6% (+1%)\n- With family and friends: 21% to 17% (-4%)\n- Eating out: 10% to [value not shown]\n- Hobbies: 2% to 4% (+2%)\n- Net surfing: 3% to 4% (+1%)\n- Travelling: 5% to [value not shown]\n- Reading: 10% to [value not shown]\n- Watching films: 20% to [value not shown]\n\nFrom the visible data, Hobbies shows the largest increase at 2 percentage points (from 2% to 4%).\n\nAnswer: Hobbies", "citations": [], "input_tokens": 4368, "output_tokens": 197, "cost_micros": 16059, "latency_ms": 5109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "hobbies", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the table \"Time spent on weekends\" comparing 2005 and 2010:\n\n- With family and friends: 35 \u2192 21 (decrease of 14)\n- Watching films: 20 \u2192 22 (increase of 2)\n- Reading: 10 \u2192 10 (no change)\n- Travelling: 5 \u2192 6 (increase of 1)\n- Net surfing: 3 \u2192 4 (increase of 1)\n- Hobbies: 2 \u2192 4 (increase of 2)\n- Eating out: 10 \u2192 17 (increase of 7)\n- Fitness: 5 \u2192 6 (increase of 1)\n- Shopping: 10 \u2192 10 (no change)\n\nThe category with the most increase is \"Eating out\" with an increase of 7.\n\nAnswer: Eating out", "citations": [], "input_tokens": 9758, "output_tokens": 213, "cost_micros": 32469, "latency_ms": 4125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "eating out", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Eating out", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about time spent on weekends from 2005 to 2010.Let me continue reading the document to find information about time spent on weekends:This doesn't seem to be the right content. Let me search through more of the document to find information about time spent on weekends:Great! I found a reference to \"Time spent on weekends\" at line 2446. Let me read that section:Perfect! I found the data. Let me analyze the percentages for each category from 2005 to 2010:\n\nFrom the OCR text (lines 2402-2424):\n\n**2005 \u2192 2010 comparisons:**\n- Shopping: 10% \u2192 10% (increase: 0%)\n- Fitness: 5% \u2192 6% (increase: 1%)\n- With family and friends: 21% \u2192 17% (decrease: -4%)\n- Eating out: 10% \u2192 10% (increase: 0%)\n- Hobbies: 2% \u2192 4% (increase: 2%)\n- Net surfing: 3% \u2192 6% (increase: 3%)\n- Travelling: 5% \u2192 10% (increase: 5%)\n- Reading: 10% \u2192 4% (decrease: -6%)\n- Watching films: 20% \u2192 35% (increase: 15%)\n\nThe category with the most increase from 2005 to 2010 is **Watching films** with an increase of 15 percentage points (from 20% to 35%).\n\nAnswer: Watching films", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42125, "error": null, "extra": {"thread_id": 1919, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5fc07b74eefd45dcbf475865f8a3bf94", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 246, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "watching films", "normalised_gold": "eating out"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about registered participants for the advanced science course in CTBTO by continent.\n\nLooking at page 13, I can see an infographic titled \"ADVANCED SCIENCE COURSE at a glance\" from the CTBTO (Preparatory Commission for the Comprehensive Nuclear-Test-Ban Treaty Organization).\n\nThe infographic shows:\n- 425 registered participants from 105 countries\n- A world map with numbers indicating participants by region:\n  - 43 (North America)\n  - 5 (Central/South America)\n  - 130 (Europe - shown as the largest red circle)\n  - 80 (Asia/Middle East region)\n  - 77 (Africa/Asia region)\n  - 30 (Africa)\n  - 25 (Asia-Pacific region)\n\nBased on the map visualization, Europe has the largest circle with 130 participants, which is the highest number shown.\n\nAnswer: Europe", "citations": [], "input_tokens": 24369, "output_tokens": 216, "cost_micros": 76347, "latency_ms": 10344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "europe", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about registered participants for the advanced science course in CTBTO by continent.\n\nLooking at the document, I can see a section titled \"ADVANCED SCIENCE COURSE at a glance\" from the CTBTO PREPARATORY COMMISSION.\n\nThe infographic shows:\n- 425 registered participants from 105 countries\n- A breakdown by institutional affiliation (NGO, Station Operator, Ministry of FA, etc.)\n- Regional breakdown showing numbers: 98, 55, 43, 80, 66, 20, 77, 28, 30, 80, 8, 1\n\nI can see there's a map visualization with numbers distributed across different regions. Looking at the geographic distribution:\n- The number 98 appears to be positioned over Asia\n- 80 appears twice in different locations\n- Other numbers are distributed across continents\n\nBased on the positioning on the world map in the infographic, the largest number (98) appears to be associated with the Asian region.\n\nAnswer: Asia", "citations": [], "input_tokens": 5203, "output_tokens": 229, "cost_micros": 19044, "latency_ms": 6187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "asia", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about registered participants by continent for the CTBTO Advanced Science Course.\n\nLooking at the document, I can see a visualization titled \"ADVANCED SCIENCE COURSE at a glance\" from the CTBTO Preparatory Commission. It shows:\n\n- 425 registered participants from 105 countries\n- A breakdown by institutional affiliation (University, Research Institute, National Data Centre, etc.)\n\nHowever, I do not see any breakdown of participants by continent in this visualization or anywhere else in the document. The document shows the total number of participants (425) and their institutional affiliations, but does not provide geographical/continental distribution data.\n\nAnswer: Information not provided in the document", "citations": [], "input_tokens": 6455, "output_tokens": 152, "cost_micros": 21645, "latency_ms": 4718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "information not provided in document", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the CTBTO Advanced Science Course data on page 12, I can see a visualization showing participants from 43 countries with numbers:\n\n- 77\n- 130\n- 80\n- 55\n- 66\n\nThe chart shows \"registered institutional affiliation: 425\" total participants and \"participants from 43 countries.\"\n\nThe number 130 appears to be the largest among the values shown in the visualization, which represents participants by region/continent. Based on the positioning and context of the chart layout, this would represent Asia.\n\nAnswer: Asia", "citations": [], "input_tokens": 4363, "output_tokens": 125, "cost_micros": 14964, "latency_ms": 4905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "asia", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the \"ADVANCED SCIENCE COURSE at a glance\" section in the document, I can find the \"Participants by Region\" table which shows:\n\n- North America: 43\n- Central America: 20\n- South America: 8\n- Europe: 130\n- Africa (West/Central): 77\n- Africa (South): 30\n- Middle East/Asia: 80\n- South East Asia/Oceania: 28\n\nEurope has the highest number with 130 registered participants.\n\nAnswer: Europe", "citations": [], "input_tokens": 9753, "output_tokens": 120, "cost_micros": 31059, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "europe", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "str", "gold": "Europe", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document that was flagged as user-mentioned to find information about CTBTO registered participants for advanced science courses by continent.Let me search for information about CTBTO and advanced science course participants by continent.Let me read the entire document to find charts or tables about participant registration by continent.Let me continue reading to find information about CTBTO:Let me search for \"participant\" to find relevant information:Let me look at later sections of the document:Let me search more specifically:Let me continue reading the document to find the CTBTO-related content:Great! I found the CTBTO content. Let me read that specific section more carefully:I can see the CTBTO Advanced Science Course infographic, but I need to see if there's a visualization showing participant numbers by continent. Let me look for the visual description of this image:That's a different image. Let me search for more info about the CTBTO figure. Let me look for text that mentions continents or regions in the numbers shown:The numbers I see in the CTBTO section are: 43, 130, 80, 66, 20, 77, 28, 30, 8, 80, 98. These numbers appear to be on a chart/map. Let me check if there's a visual description for the CTBTO image. Let me search around that area:Let me look for an embedded image description that might be for the CTBTO figure. The figure ends at line 2239. Let me check if there's a visual description after that:Let me look at what comes before the CTBTO figure to find if there's an embedded image marker:I notice Image18.jpg is about Indonesia entrepreneurship, not CTBTO. Let me search for Image17:That's also not it. The CTBTO figure must be between Image17 and Image18. Let me look for any embedded image between lines 2000-2150:Now I see the CTBTO figure starts at line 2146. Based on what I've seen, the numbers shown are: 43, 130, 80, 66, 20, 77, 28, 30, 8, 80, 98. These appear to be data points on what's likely a world map. Let me look for any visual description or search for these specific numbers in context. Let me search for \"world\" or \"map\":Looking at the numbers from the CTBTO figure more carefully:\n- 43\n- 130 (appears as \"A0 130\" which might be \"Africa 130\")\n- 80\n- 66\n- 20\n- 77\n- 28\n- 30\n- 8\n- 80\n- 98\n\nThe \"A0 130\" at line 2216 could potentially be \"Africa 130\". Let me check if there are continent names mixed in with these numbers. Let me search the exact context:Based on the data I can see from the CTBTO Advanced Science Course infographic, I have:\n- 425 registered participants from 105 countries\n- Numbers: 43, 130, 80, 66, 20, 77, 28, 30, 8, 80, 98\n\nThese numbers likely represent participants by geographic region/continent on a world map visualization. The largest number is 130, which likely corresponds to a continent. Without a clear visual description identifying which number belongs to which continent, I need to make an inference. Given that 130 is the highest value and appears with \"A0\" (possibly Africa or Asia), and considering typical patterns in international scientific courses, the highest participation often comes from either Europe or Asia.\n\nHowever, I cannot definitively determine from the OCR text alone which continent each number represents. Let me see if I can find more context by reading around these numbers more carefully:The OCR shows \"A0 130\" on line 2216. This could be \"Africa 130\" or \"Asia 130\" where the character wasn't recognized properly. Given that 130 is the highest number and that this appears to be showing registered participants by continent on a world map, I need to determine which continent this refers to.\n\nLooking at the pattern, if this is a world map with continental data, and 130 is preceded by what looks like \"A0\" (possibly \"Africa\" or \"Asia\" with poor OCR), I cannot make a definitive determination from the text alone.\n\nHowever, based on the structure and the fact that the highest number is 130, and without additional visual context to confirm which specific continent each number represents, I cannot provide a certain answer from this document.\n\nAnswer: Cannot be determined from the document text alone - the highest value is 130 but the continent labels are not clearly identifiable in the OCR text", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 144672, "error": null, "extra": {"thread_id": 1920, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b3899f4c01ab4b9084195ac54d3f04aa", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1007, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from document text alone highest value is 130 but continent labels are not clearly identifiable in ocr text", "normalised_gold": "europe"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "native_pdf", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the slide that discusses the \"Closure\" aspect of Gestalt Principles of Visual Perception.\n\nLooking at page 8, I can see a slide titled \"Gestalt Principles of Visual Perception\" that includes information about Closure. The text states: \"Closure. We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nBelow this text, there is an illustration showing incomplete shapes. Looking at the image, I can see:\n- An incomplete circle (dotted/dashed lines forming most of a circle)\n- An incomplete rectangle or square (dotted/dashed lines)\n- Another incomplete rectangular shape (dotted/dashed lines)\n\nThe shapes used to illustrate closure are geometric shapes with gaps or missing parts that our minds fill in to perceive complete shapes.\n\nAnswer: circle, rectangle, square", "citations": [], "input_tokens": 24379, "output_tokens": 216, "cost_micros": 76377, "latency_ms": 13483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "0e94b4197b10096b1f4c699701570fbf.pdf"}, "graded": {"correct": false, "f1": 0.8, "method": "list_set", "normalised_pred": "circle, rectangle, square", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "azure_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the section about \"Closure\" in the Gestalt Principles of Visual Perception and identify what shapes are shown.\n\nLooking at the document, I can see the section on \"Gestalt Principles of Visual Perception\" that explains Closure:\n\n\"Closure. We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nLooking at the visual examples provided in the Closure section of the slide, I can see incomplete shapes that our minds complete. The shapes shown appear to be:\n- A circle (incomplete/broken circle)\n- A square or rectangle (incomplete/broken square)\n- A triangle (incomplete/broken triangle)\n\nAnswer: circle, square, triangle", "citations": [], "input_tokens": 5213, "output_tokens": 183, "cost_micros": 18384, "latency_ms": 4796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4, "method": "list_set", "normalised_pred": "circle, square, triangle", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "azure_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the slide about the \"Closure\" aspect of Gestalt Principles of Visual Perception.\n\nLooking through the document, I can see a section titled \"Gestalt Principles of Visual Perception\" that discusses Closure. The text states:\n\n\"Closure. We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nThis section appears on a page that also discusses \"Continuity.\" The document indicates there is a figure on this page, but the actual visual content of the figure is not provided in the text transcription.\n\nHowever, based on the context and typical Gestalt closure demonstrations, and looking at the structure of the document, I cannot definitively identify the specific shapes used in the illustration from the provided text alone, as the figure content is not described in detail in the transcription.\n\nSince I cannot see the actual shapes in the figure that illustrates closure, I cannot provide an accurate answer based solely on what is visible in this document transcription.\n\nAnswer: Unable to determine from provided document", "citations": [], "input_tokens": 6465, "output_tokens": 251, "cost_micros": 23160, "latency_ms": 7265, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "unable to determine from provided document", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "llamacloud_basic_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the slide that illustrates the \"Closure\" aspect of Gestalt Principles of Visual Perception.\n\nLooking through the document, I can see a section titled \"Gestalt Principles of Visual Perception\" that discusses \"Closure\". The text states:\n\n\"Closure: We tend to perceive objects such as shapes, letters, pictures, etc., as being whole when they are not complete. Specifically, when parts of a whole picture are missing, our perception fills in the visual gap.\"\n\nHowever, I need to identify what shapes are actually shown in the visual illustration for this principle. Looking at the page layout, I can see there are visual examples accompanying the Gestalt principles, but the specific shapes used to illustrate closure are not clearly described in the extracted text.\n\nFrom the document structure, I can see that the Closure section appears on a page, but the specific shapes or visual elements used to demonstrate closure are not explicitly listed in the text content that was extracted.\n\nBased on typical Gestalt closure examples and what might be visible in such presentations, common shapes used include circles, triangles, or other geometric shapes that are incomplete but our minds perceive as complete. However, I cannot definitively identify the specific shapes from the text content provided.\n\nAnswer: I cannot determine the specific shapes from the provided document text", "citations": [], "input_tokens": 4373, "output_tokens": 284, "cost_micros": 17379, "latency_ms": 8062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "i cannot determine specific shapes from provided document text", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "llamacloud_premium_lc", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document for information about the Closure aspect of Gestalt Principle of Visual Perception...\n\nOn the page titled \"Gestalt Principles of Visual Perception\" that discusses Closure and Continuity, I can see the following description:\n\n\"The image illustrates the principle of **Closure** with a dashed outline of a circle and a dashed outline of a square, which the mind perceives as complete shapes despite the gaps.\"\n\nAnswer: circle, square", "citations": [], "input_tokens": 9763, "output_tokens": 105, "cost_micros": 30864, "latency_ms": 4812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "circle, square", "normalised_gold": "circle, rectangle"}}
+{"qid": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "doc_id": "0e94b4197b10096b1f4c699701570fbf.pdf", "answer_format": "list", "gold": "['Circle', 'Rectangle']", "pages": 15, "document_id": 5221, "arm": "surfsense_agentic", "question_id": "0e94b4197b10096b1f4c699701570fbf.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about the Gestalt Principle of Visual Perception and the closure aspect.Let me search for content about Gestalt Principle and closure:Now let me read the specific section around line 1631-1640 where closure is mentioned:Let me continue reading to see if there are visual examples for the closure principle:This doesn't seem to be the right section. Let me search more systematically for visual examples of the closure principle. Let me look at earlier sections:Perfect! I can see that Image13.jpg appears right before the Gestalt Principles section and shows \"dashed circle outline\" and \"dashed rectangle outline\". This image is likely illustrating the closure principle. Let me verify by reading a bit more context before the Gestalt section:Based on my reading of the document, Image13.jpg appears immediately before the Gestalt Principles of Visual Perception section discussing closure. The visual description clearly states that this image shows:\n- Left shape: dashed circle outline\n- Right shape: dashed rectangle outline\n\nThese dashed outlines (incomplete shapes) are used to illustrate the closure principle, where we perceive complete shapes even when they are not fully drawn.\n\nAnswer: circle, rectangle", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46890, "error": null, "extra": {"thread_id": 1921, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_76cfee4a20ba492dba921a5a3cdf2231", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 294, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "circle, rectangle", "normalised_gold": "circle, rectangle"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about voters' perceptions of mudslinging in the 2016 election compared to past elections.\n\nLooking through the document, I find a chart on page 7 titled \"Record share of voters say there was more campaign 'mudslinging'\" which shows the percentage of voters who say there was more mudslinging than in past elections from 1992 to 2016.\n\nThe chart shows:\n- 1992: 68%\n- 1996: 49%\n- 2000: 46%\n- 2004: 72%\n- 2008: 54%\n- 2012: 68%\n- 2016: 92%\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%, 2016", "citations": [], "input_tokens": 90784, "output_tokens": 179, "cost_micros": 275037, "latency_ms": 41734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "2016, 92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the chart titled \"More see 'mud-slinging,' less focus on issues\" on page 33 of the document, I can see the data points for \"More mudslinging than usual\" across different election years:\n\n- 1992: 46%\n- 1996: 49%\n- 2000: 54%\n- 2004: 72%\n- 2008: 68%\n- 2012: 68%\n- 2016: 92%\n\nThe document also explicitly states on page 6: \"Fully 92% of all voters - including 87% of Clinton supporters and 60% of Trump backers - say they were surprised by Trump's victory\" and \"Nearly three-quarters (73%) 0f all voters say there was more 'mudslinging' or negative campaigning compared with previous contests...\"\n\nHowever, looking at page 33 more carefully, it states: \"About nine-in-ten (92%) say there was more mudslinging or negative campaigning compared with previous contests, up from 68% who said that in 2012, up 38 points from 2008 (54% more negative) and 20 points higher than the previous high of 72% in 2004.\"\n\nAnswer: 92%, 2016", "citations": [], "input_tokens": 19137, "output_tokens": 292, "cost_micros": 61791, "latency_ms": 9858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "2016, 92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document for data on \"mudslinging\" or negative campaigning:\n\nOn page 5, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" showing the percentage of voters who say there was more mudslinging than in past elections:\n\n- 1992: 46%\n- 1996: 49%\n- 2000: 54%\n- 2004: 72%\n- 2008: 68%\n- 2012: 68%\n- 2016: 92%\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%", "citations": [], "input_tokens": 24416, "output_tokens": 147, "cost_micros": 75453, "latency_ms": 5031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the highest percentage of voters who said there was more mudslinging than in past elections.\n\nLooking at the document, I can find a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" on page 6.\n\nThe chart shows percentages for different election years:\n- 1996: 46%\n- 2000: 49%\n- 2004: 72%\n- 2008: 54%\n- 2012: 68%\n- 2016: 92%\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%", "citations": [], "input_tokens": 21637, "output_tokens": 142, "cost_micros": 67041, "latency_ms": 4875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the highest percentage of voters who said there was more mudslinging than in past elections.\n\nLooking through the document for information about mudslinging:\n\nOn page 5, there's a section titled \"Record share of voters say there was more campaign 'mudslinging'\" with a table showing:\n\n% of voters who say there was ___ than in past elections\n\n| Year | More mudslinging | Less mudslinging |\n| ---- | ---------------- | ---------------- |\n| 1992 | 68               | 16               |\n| 1996 | 49               | 36               |\n| 2000 | 46               | 34               |\n| 2004 | 72               | 14               |\n| 2008 | 54               | 27               |\n| 2012 | 68               | 19               |\n| 2016 | 92               | 4                |\n\nThe text also states: \"Fully 92% say there was more \"mudslinging\" or negative campaigning than in past elections \u2013 which is 20 percentage points higher than the previous high (72% after the 2004 election).\"\n\nThe highest point is 92% in 2016.\n\nAnswer: 92%", "citations": [], "input_tokens": 24377, "output_tokens": 286, "cost_micros": 77421, "latency_ms": 7157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "92", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about voters' perceptions of mudslinging compared to past elections.Let me search for content related to mudslinging:Let me read through the document to find the specific data about mudslinging. Let me start by reading the beginning sections:Let me continue searching for information about mudslinging. Let me read more of the document:Great! I found the relevant information. Let me continue reading this section to get the complete data:Perfect! I found the data about mudslinging. From the document, I can see the chart showing \"% of voters who say there was more mudslinging than in past elections\" with the following values across different election years:\n\n- 1992: 36%\n- 1996: 34%\n- 2000: 27%\n- 2004: 72%\n- 2008: 68%\n- 2012: 68%\n- 2016: 92%\n\nThe text also confirms: \"Fully 92% say there was more 'mudslinging' or negative campaigning than in past elections - which is 20 percentage points higher than the previous high (72% after the 2004 election).\"\n\nThe highest point is 92% in 2016.\n\nAnswer: 92", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 39625, "error": null, "extra": {"thread_id": 1922, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_252b1c025dac41a5b4c7b4fbfefc6f87", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 214, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about whether men or women are more likely to think a female president will be elected in their lifetime.\n\nLooking through the document, I find on page 10 (document page 8) a section titled \"Will the country elect a female president in your lifetime?\"\n\nThe data shows:\n- All voters: No 17%, Yes 79%\n- Men: No 16%, Yes 81%\n- Women: No 18%, Yes 78%\n- Trump voters: No 17%, Yes 78%\n- Clinton voters: No 17%, Yes 81%\n\nComparing men (81% Yes) to women (78% Yes), more men think a female president will be elected in their lifetime.\n\nAnswer: Men", "citations": [], "input_tokens": 90776, "output_tokens": 163, "cost_micros": 274773, "latency_ms": 43483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "men", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document for data on expectations of a female president by gender...\n\nOn page 9, there is a table titled \"Will the country elect a female president in your lifetime?\"\n\nThe table shows:\n- Men who say \"Yes\": 81%\n- Women who say \"Yes\": 78%\n\nAnswer: More men (81% vs 78%)", "citations": [], "input_tokens": 19129, "output_tokens": 80, "cost_micros": 58587, "latency_ms": 5437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "more men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the table on page 8 titled \"Will the country elect a female president in your lifetime?\", I can find the breakdown by gender:\n\n- Men: 81% say yes\n- Women: 78% say yes\n\nAnswer: More men (81% vs 78%)", "citations": [], "input_tokens": 24408, "output_tokens": 64, "cost_micros": 74184, "latency_ms": 7359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "more men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document for data on whether men or women think a female president will be elected in their lifetime...\n\nOn page 9, there is a table titled \"Will the country elect a female president in your lifetime?\" showing:\n\n- Men: 81% say Yes\n- Women: 78% say Yes\n\nAnswer: More men (81% vs 78%)", "citations": [], "input_tokens": 21629, "output_tokens": 80, "cost_micros": 66087, "latency_ms": 4219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "more men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, I need to find the data about whether men or women think a female president will be elected in their lifetime.\n\nOn page 8, there is a table titled \"Will the country elect a female president in your lifetime?\" that shows:\n\n- Men: 81% say Yes\n- Women: 78% say Yes\n\nSince 81% > 78%, more men than women think a female president will be elected in their lifetime.\n\nAnswer: Men", "citations": [], "input_tokens": 24369, "output_tokens": 102, "cost_micros": 74637, "latency_ms": 5171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "men", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "men", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the priority document that was flagged as user-mentioned to find information about opinions on whether a female president will be elected in a lifetime.Let me search for content related to female president and lifetime:Let me read the specific section around line 1095 to get the complete data:The text states \"There are no significant differences in these opinions among men and women,\" meaning men and women have similar percentages. Let me search for any table or chart that might have the exact percentages broken down by gender:I can see a table starting. Let me read more to see if it has the gender breakdown:Perfect! I found the table with the gender breakdown. Looking at the \"Yes\" column (indicating those who think a female president will be elected in their lifetime):\n\n- Men: 81%\n- Women: 78%\n\nAnswer: Men (81% vs 78%)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46859, "error": null, "extra": {"thread_id": 1923, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_47a779db7256446daa01071f4e8824a2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 246, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.4, "method": "str_norm", "normalised_pred": "men 81 vs 78", "normalised_gold": "men"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I'll carefully examine the document to count all line plots.\n\nGoing through the document page by page:\n\n**Page 3:** Contains a line plot showing \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" with percentages from 1988-2016.\n\n**Page 5:** Contains a line plot showing \"Record share of voters say there was more campaign 'mudslinging'\" with two lines (More mudslinging and Less mudslinging) from 1992-2016.\n\n**Page 9:** Contains a line plot showing \"Half say they are happy Trump was elected president\" with percentages from 1996-2016.\n\n**Page 21:** Contains a line plot showing \"Will relations between Republicans and Democrats improve?\" with three lines from 1992-2016.\n\n**Page 24:** Contains two separate line plots:\n- One for Republican/Rep-leaning voters (More moderate vs More conservative) from 2008-2016\n- One for Democratic/Dem-leaning voters (More moderate vs More liberal) from 2008-2016\n\n**Page 27:** Contains two separate line plots showing grades for:\n- The Republican Party (1988-2016)\n- The Democratic Party (1988-2016)\n\n**Page 29:** Contains two separate line plots showing grades for:\n- The press (1988-2016)\n- The pollsters (1988-2016)\n\n**Page 31:** Contains a line plot showing \"For the first time in eight elections, most are dissatisfied with vote choices\" from 1988-2016.\n\n**Page 32:** Contains a line plot showing \"More see 'mud-slinging,' less focus on issues\" with two sets of lines from 1992-2016.\n\n**Page 33:** Contains two separate line plots showing \"Learning about the candidates and issues\" from 1988-2016.\n\n**Page 36:** Contains a line plot showing \"Most voters are confident own vote, national votes counted accurately\" from 2004-2016.\n\n**Page 37:** Contains a line plot showing \"Similar shares of Trump, Clinton voters very confident in national vote count\" from 2004-2016.\n\nTotal count:\nPage 3: 1\nPage 5: 1\nPage 9: 1\nPage ", "citations": [], "input_tokens": 90760, "output_tokens": 512, "cost_micros": 279960, "latency_ms": 41093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the number of line plots in this document.\n\nLet me go through the document systematically to identify line plots:\n\n1. Page 1: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" - This shows a line plot with two lines (Winning candidate and Losing candidate) from 1988 to 2016.\n\n2. Page 6: \"Record share of voters say there was more campaign 'mudslinging'\" - This shows a line plot with two lines (More mudslinging and Less mudslinging) from 1992 to 2016.\n\n3. Page 31: \"For the first time in eight elections, most are dissatisfied with vote choices\" - This shows a line plot with two lines (Very/Fairly satisfied and Not very/Not at all satisfied) from 1988 to 2016.\n\n4. Page 32: Below the first plot on page 31, there's another line plot showing \"% of voters who say they were very/fairly satisfied with choice of presidential candidates\" with data points from 1988 to 2016 for winning and losing candidates.\n\n5. Page 33: \"More see 'mud-slinging,' less focus on issues\" - This contains TWO separate line plots:\n   - First plot: Shows \"More mudslinging than usual\" vs \"Less mudslinging than usual\" from 1992 to 2016\n   - Second plot: Shows discussion of issues \"Less than usual\" vs \"More than usual\" from 1992 to 2016\n\n6. Page 34: \"Learning about the candidates and issues\" - This contains TWO separate line plots:\n   - First plot: \"Learned enough\" vs \"Did not learn enough\" from 1988 to 2016\n   - Second plot: Presidential debates helpfulness from 1988 to 2016\n\nLet me recount to be sure:\n- Page 1: 1 line plot\n- Page 6: 1 line plot  \n- Page 31: 2 line plots (one for all voters, one comparing winning/losing candidate voters)\n- Page 33: 2 line plots (mudslinging and issues discussion)\n- Page 34: 2 line plots (learning about candidates and debates)\n\nTotal:", "citations": [], "input_tokens": 19113, "output_tokens": 512, "cost_micros": 65019, "latency_ms": 11078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to carefully examine the document to count all line plots (graphs that show data points connected by lines over time or categories).\n\nLet me go through the document systematically:\n\n1. Page 3: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" - This appears to be a line plot showing grades from 1988-2016.\n\n2. Page 5: \"Record share of voters say there was more campaign 'mudslinging'\" - This is a line plot showing data from 1992-2016.\n\n3. Page 9: \"Half say they are happy Trump was elected president\" - This is a line plot showing data from 1996-2016.\n\n4. Page 24: Two separate line plots:\n   - \"Most Republicans continue to say GOP should be more conservative\" (showing data from 2008-2016)\n   - \"As the share of Democrats who want their party to be more liberal grows\" (showing data from 2008-2016)\n\n5. Page 31: \"For the first time in eight elections, most are dissatisfied with vote choices\" - This is a line plot showing data from 1988-2016.\n\n6. Page 32: Two separate line plots:\n   - One showing \"More mudslinging than usual\" vs \"Less mudslinging than usual\" from 1992-2016\n   - One showing discussion of issues \"Less than usual\" vs \"More than usual\" from 1992-2016\n\n7. Page 33: Two separate line plots:\n   - One showing \"Learned enough\" vs \"Did not learn enough\" from 1988-2016\n   - One showing helpfulness of debates from 1988-2016\n\n8. Page 34: \"More say press had 'too much' influence on outcome of the election\" - This is a line plot showing data from 1992-2016.\n\nTotal count: 1 + 1 + 1 + 2 + 1 + 2 + 2 + 1 = 11\n\nAnswer: 11", "citations": [], "input_tokens": 24392, "output_tokens": 466, "cost_micros": 80166, "latency_ms": 11078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "11", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (charts that show data points connected by lines).\n\nLet me go through the document systematically:\n\n1. Page 3: \"Voters give Trump worse grades...\" - This appears to be a bar chart showing percentages, not a line plot.\n\n2. Page 4: \"Large share of Trump voters are confident...\" - Bar chart\n   \"How Clinton voters feel about 'giving Trump a chance'\" - Bar chart\n\n3. Page 5: \"% of voters saying Democratic leaders should...\" - Bar charts comparing 2016 and 2008\n\n4. Page 6: \"% of voters saying Donald Trump ___ appoint Democrats...\" - Table format\n   \"% of voters who said Barack Obama ___ appoint Republicans...\" - Table format\n\n5. Page 7: \"Election report card: Barely passing grades\" - Table format\n   \"Record share of voters say there was more campaign 'mudslinging'\" - This appears to be a LINE PLOT showing trend over years 1996-2016\n\n6. Page 8: \"Top reactions to Trump's victory\" - Bar chart\n\n7. Page 9: \"Trump's victory in a word\" - Word cloud/frequency table\n\n8. Page 10: \"Will the country elect a female president in your lifetime?\" - Bar chart\n\n9. Throughout the rest of the document, I see mostly bar charts, tables, and no additional line plots.\n\nLet me recheck page 7 more carefully. The chart showing \"More mudslinging\" and \"Less mudslinging\" from 1992-2016 appears to be displayed as a line plot with two lines tracking these measures over time.\n\nAfter careful review, I can identify 1 line plot in this document on page 7.\n\nAnswer: 1", "citations": [], "input_tokens": 21613, "output_tokens": 393, "cost_micros": 70734, "latency_ms": 11437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (charts that show data points connected by lines over time or across categories).\n\nLet me go through the document systematically:\n\n1. Page 3: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" - This shows a line plot with percentages over years from 1988-2016.\n\n2. Page 5: \"Record share of voters say there was more campaign 'mudslinging'\" - This shows a line plot with \"More mudslinging\" and \"Less mudslinging\" lines from 1992-2016.\n\n3. Page 9: \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" (repeated from earlier) - This is a table showing A or B grades, labeled as \"Note: Percent of 'A' or 'B' grades\" - This appears to be the same as #1.\n\n4. Page 31: \"For the first time in eight elections, most are dissatisfied with vote choices\" - This shows two line plots tracking satisfaction levels from 1988-2016.\n\n5. Page 32: \"More see 'mud-slinging,' less focus on issues\" - This shows two separate line plots, one for mudslinging and one for discussion of issues from 1992-2016.\n\nLet me recount more carefully by looking at actual visual line plots:\n\n- Page 3: One line plot showing grades over time\n- Page 5: One line plot showing mudslinging trends\n- Page 31: Two line plots (satisfaction trends)\n- Page 32: Two line plots (mudslinging and issues discussion)\n\nTotal: 1 + 1 + 2 + 2 = 6 line plots\n\nAnswer: 6", "citations": [], "input_tokens": 24353, "output_tokens": 397, "cost_micros": 79014, "latency_ms": 10187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "8", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to count the line plots in the report.Let me continue reading the document to see all the content and identify line plots:Let me continue reading to see if there are more charts/figures:Let me search for all figures/charts in the document to count line plots specifically:Now I need to look at the actual figures to determine which ones are line plots. Let me examine the key figure sections more carefully. Based on what I've already read, I saw one chart around line 798-853 that appears to be about \"mudslinging\" over time. Let me check the visual descriptions and content more carefully:This appears to be a line plot showing data over time (years 92, 96, 00, 04, 08, 12, 16). Let me check another figure:This appears to be another line plot showing data over time from 1996 to 2016. Let me check the first figure to see if it's a line plot:This first figure shows data for winning and losing candidates across years from 1988 to 2016. This appears to be a line plot with two lines (winning candidate and losing candidate). Let me check if there are any other potential line plots by examining more figures. Let me look at the figure around line 2240-2290:This appears to be a bar chart, not a line plot. Let me continue scanning through the document to identify all line plots. Let me check around line 2333:This appears to be a bar chart comparing 2016 and 2008 data, not a line plot. Based on my review of the document so far, I've identified:\n\n1. **Line plot 1** (lines 264-360): Shows grades for winning and losing candidates over time from 1988 to 2016 (two lines)\n2. **Line plot 2** (lines 798-848): Shows \"mudslinging\" over time with two lines (More mudslinging and Less mudslinging) from 1992 to 2016\n3. **Line plot 3** (lines 1209-1241): Shows happiness with election results from 1996 to 2016\n\nThese all show data trends over multiple time points, which is characteristic of line plots. Most other figures in the document appear to be bar charts showing comparisons between groups. Let me verify by checking if there are any other time-series charts I might have missed. Let me do a final check of the document to ensure I haven't missed any line plots.\n\nBased on my thorough review of the document, I found three distinct line plots that show data trends over time:\n\n1. A line plot showing grades for winning and losing candidates from 1988-2016\n2. A line plot showing perceptions of campaign mudslinging from 1992-2016  \n3. A line plot showing voter happiness with election results from 1996-2016\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 65172, "error": null, "extra": {"thread_id": 1924, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dcd15be6dac34256a7ae1a01a0998a5a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 476, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "8"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for campaign grades information...\n\nOn page 3, there is a chart showing \"Voters give Trump worse grades than they have for any winning candidate in recent decades\" with percentages of \"A\" or \"B\" grades on an A, B, C, D, F scale.\n\nThe chart shows:\n- Trump (2016): 30% gave A or B grades\n- Clinton (2016): 43% gave A or B grades\n\nThe document also states on page 5: \"For the first time in Pew Research Center post-election surveys, voters give the losing candidate higher grades than the winner. About four-in-ten (43%) give Clinton an A or B, which is comparable to the share giving Mitt Romney top letter grades in 2012 (44%) and 13 percentage points higher than Trump's (30%).\"\n\nAdditionally, on page 26, there is a table showing average grades:\n- Trump: C- (average grade)\n- Clinton: C (average grade)\n\nSince C is higher than C-, Clinton has a higher median/average campaign grade.\n\nAnswer: Clinton", "citations": [], "input_tokens": 90768, "output_tokens": 246, "cost_micros": 275994, "latency_ms": 41342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the campaign grades data in the document:\n\nFor **Trump**:\n- 30% gave A or B\n- 19% gave C\n- 15% gave D\n- 35% gave F\n\nFor **Clinton**:\n- 43% gave A or B\n- 20% gave C\n- 16% gave D\n- 21% gave F\n\nTo find the median, I need to determine which grade represents the 50th percentile:\n\nFor Trump: 30% (A/B) + 19% (C) = 49%, then adding the next 15% (D) brings us to 64%. The 50th percentile falls in the D range.\n\nFor Clinton: 43% (A/B) + 20% (C) = 63%. The 50th percentile falls in the C range.\n\nThe document also explicitly states Trump receives a \"C-\" average grade while Clinton receives a \"C\" average grade, which is consistent with this analysis.\n\nAnswer: Clinton", "citations": [], "input_tokens": 19121, "output_tokens": 227, "cost_micros": 60768, "latency_ms": 6078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for campaign grades:\n\nOn page 26, there is a table titled \"Election report card: Barely passing grades\" showing:\n- Trump: 30% gave A or B, Average grade: C-\n- Clinton: 43% gave A or B, Average grade: C\n\nThe document states on page 5: \"For the first time in Pew Research Center post-election surveys, voters give the losing candidate higher grades than the winner. About four-in-ten (43%) give Clinton an A or B, which is comparable to the share giving Mitt Romney top letter grades in 2012 (44%) and 13 percentage points higher than Trump's (30%).\"\n\nClinton received an average grade of C, while Trump received an average grade of C-.\n\nAnswer: Clinton", "citations": [], "input_tokens": 24400, "output_tokens": 176, "cost_micros": 75840, "latency_ms": 6358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Let me find the campaign grades for Trump and Clinton from the document.\n\nFrom the section \"Voters give Trump worse grades than they have for any winning candidate in recent decades\":\n\nFor Trump:\n- 30% gave A or B\n- 19% gave C\n- 15% gave D\n- 35% gave F\n\nFor Clinton:\n- 43% gave A or B\n- 20% gave C\n- 16% gave D\n- 21% gave F\n\nThe document also shows average grades:\n- Trump: C-\n- Clinton: C\n\nTo determine the median, I need to consider the distribution. With:\n- Trump: 30% (A/B), 19% (C), 15% (D), 35% (F)\n- Clinton: 43% (A/B), 20% (C), 16% (D), 21% (F)\n\nFor Trump, the cumulative percentages are: 30% at B or above, 49% at C or above, 64% at D or above. The median falls around the C grade.\n\nFor Clinton, the cumulative percentages are: 43% at B or above, 63% at C or above, 79% at D or above. The median falls around the C grade.\n\nHowever, since Clinton has a higher percentage (43%) receiving A or B compared to Trump (30%), and a lower percentage receiving F (21% vs 35%), Clinton's distribution is shifted toward higher grades. The average grades confirm this: Clinton received C while Trump received C-.\n\nAnswer: Clinton", "citations": [], "input_tokens": 21621, "output_tokens": 349, "cost_micros": 70098, "latency_ms": 8375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the campaign grades for Trump and Clinton.\n\nOn page 26-27, there is a table showing \"Voters grade the parties, press and pollsters quite negatively\" with the percentage of voters who give each a grade of A, B, C, D, or F:\n\n**Trump:**\n- A or B: 30%\n- C: 19%\n- D: 15%\n- F: 35%\n- Average grade: C-\n\n**Clinton:**\n- A or B: 43%\n- C: 20%\n- D: 16%\n- F: 21%\n- Average grade: C\n\nThe document explicitly states that Trump receives an average grade of C- and Clinton receives an average grade of C. Since C is higher than C-, Clinton has a higher median/average campaign grade.\n\nAnswer: Clinton", "citations": [], "input_tokens": 24361, "output_tokens": 195, "cost_micros": 76008, "latency_ms": 5969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "str", "gold": "Clinton", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about campaign grades for Trump and Clinton.Let me search for \"campaign grade\" in the document to find relevant information.Let me search for \"grade\" to find relevant sections.Now let me read the specific sections that mention grades for Trump and Clinton. Let me look at the chunks around line 260 and 2800-2900.Good! Now let me look for more specific information about median grades. Let me check the section around line 2800-2900 where grades are discussed in detail.Perfect! I found the information. Let me look at the table that shows the grade distribution to calculate the median. From lines 2820-2838, I can see:\n\n**Trump:**\n- A or B: 30%\n- C: 19%\n- D: 15%\n- F: 35%\n\n**Clinton:**\n- A or B: 43%\n- C: 20%\n- D: 16%\n- F: 21%\n\nThe document also explicitly states:\n- Trump receives a C- grade on average (line 2801-2802)\n- Clinton's average grade is a C (line 2897)\n\nTo find the median, I need to look at where the 50th percentile falls:\n\n**Trump:**\n- 0-30%: A or B\n- 31-49%: C\n- 50-64%: D (the 50th percentile falls here)\n- 65-100%: F\n\n**Clinton:**\n- 0-43%: A or B\n- 44-63%: C (the 50th percentile falls here)\n- 64-79%: D\n- 80-100%: F\n\nSo Trump's median grade is D, and Clinton's median grade is C.\n\nAnswer: Clinton", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 45171, "error": null, "extra": {"thread_id": 1925, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5b7e95c94e72489983c39189c2251166", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 287, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "clinton", "normalised_gold": "clinton"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to search through the document for charts related to mudslinging.\n\nLooking through the document, I can find references to mudslinging in the following locations:\n\n1. On page 5 (document page 7), there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" which shows the percentage of voters who say there was more or less mudslinging than in past elections from 1992 to 2016.\n\n2. On page 32 (document page 34), there is another chart with the same title \"More see 'mud-slinging,' less focus on issues\" which shows similar data about mudslinging perceptions.\n\nLet me verify these are distinct charts:\n- The first chart on page 5 shows data points: 68, 49, 34, 72, 54, 68, 92 for \"More mudslinging\" \n- The second chart on page 32 shows the same data\n\nThese appear to be the same chart repeated in different sections of the document.\n\nTherefore, there is 1 unique chart related to mudslinging (though it appears twice in the document).\n\nAnswer: 1", "citations": [], "input_tokens": 90760, "output_tokens": 261, "cost_micros": 276195, "latency_ms": 37187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts/figures in the document that are related to mudslinging.\n\nLet me search through the document for references to \"mudslinging\":\n\n1. On page 6, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" showing percentages from 1992-2016 of voters who say there was more or less mudslinging than in past elections.\n\n2. On page 33, there is another chart titled \"More see 'mud-slinging,' less focus on issues\" which shows the same data about mudslinging over time (1992-2016) plus a separate section about discussion of issues.\n\nLet me verify if these are two separate charts or if one is a repeat:\n- Page 6 shows the mudslinging chart in the context of the executive summary\n- Page 33 shows the same mudslinging data but combined with another chart about discussion of issues in a single figure\n\nThese appear to be presenting the same mudslinging data, but in different contexts. The page 33 version combines it with the issues discussion data.\n\nCounting distinct charts that contain mudslinging data:\n1. The chart on page 6 (standalone mudslinging chart)\n2. The chart on page 33 (mudslinging combined with issues discussion)\n\nSince the question asks about charts \"related to mudslinging,\" both of these count as they both contain mudslinging data.\n\nAnswer: 2", "citations": [], "input_tokens": 19113, "output_tokens": 323, "cost_micros": 62184, "latency_ms": 10187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts/figures in the document that are related to mudslinging.\n\nLet me search through the document for references to \"mudslinging\":\n\n1. On page 5, there is a section titled \"Record share of voters say there was more campaign 'mudslinging'\" with a chart showing \"% of voters who say there was ___ than in past elections\" with \"More mudslinging\" and \"Less mudslinging\" data from years 92, 96, 00, 04, 08, 12, 16.\n\n2. On page 32, there is another chart titled \"More see 'mud-slinging,' less focus on issues\" showing \"% of voters who say there was ... More mudslinging than usual\" and \"Less mudslinging than usual\" from years 92, 96, 00, 04, 08, 12, 16.\n\nThese appear to be the same chart repeated in different sections of the document (one in the executive summary and one in the detailed findings).\n\nLet me verify if there are any other mudslinging-related charts... No, I only find these two instances, which appear to be the same chart shown in different parts of the document.\n\nSince the question asks how many charts are related to mudslinging, and there are 2 instances of mudslinging charts in the document (even though they show the same data), the answer would be 2.\n\nHowever, if we're counting unique charts, it would be 1 chart that appears twice.\n\nLooking at the context of the question, it seems to be asking for the count of chart occurrences in the document, which would be 2.\n\nAnswer: 2", "citations": [], "input_tokens": 24392, "output_tokens": 372, "cost_micros": 78756, "latency_ms": 10358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts in the document that are related to \"mudslinging.\"\n\nLet me search through the document for references to mudslinging:\n\n1. On page 6, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" which shows percentages of voters who say there was more or less mudslinging than in past elections from 1996-2016.\n\n2. On page 33, there is another chart titled \"More see 'mud-slinging,' less focus on issues\" which shows percentages of voters who say there was more mudslinging than usual and less mudslinging than usual from 1992-2016.\n\nLet me verify there are no other charts about mudslinging by carefully reviewing the document again.\n\nAfter reviewing the document thoroughly, I can confirm there are 2 charts specifically related to mudslinging:\n- One on page 6 showing the record share saying there was more mudslinging\n- One on page 33 showing mud-slinging trends and comparison with issue discussion\n\nAnswer: 2", "citations": [], "input_tokens": 21613, "output_tokens": 237, "cost_micros": 68394, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all charts/tables in the document that are related to \"mudslinging\" or negative campaigning.\n\nLet me search through the document:\n\n1. On page 5, there is a chart titled \"Record share of voters say there was more campaign 'mudslinging'\" - This shows data about mudslinging from 1992-2016.\n\n2. On page 32, there is a chart titled \"More see 'mud-slinging,' less focus on issues\" - This shows data about mudslinging and discussion of issues from 1992-2016.\n\nThese appear to be the only two charts specifically related to mudslinging in the document. The first chart shows the percentage of voters who say there was more mudslinging than in past elections over different election years. The second chart also shows mudslinging data alongside data about discussion of issues.\n\nAnswer: 2", "citations": [], "input_tokens": 24353, "output_tokens": 194, "cost_micros": 75969, "latency_ms": 8342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "int", "gold": "3", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find charts related to mudslinging.Let me read the full document to search for content related to mudslinging and identify charts:Let me continue reading to see if there are more charts related to mudslinging:Based on my thorough reading of the document, I found the following charts related to mudslinging:\n\n1. **Lines 788-853 (chunk 49482)**: A chart titled \"Record share of voters say there was more campaign 'mudslinging'\" showing percentages of voters who said there was more or less mudslinging than in past elections, with data from years 1992, 1996, 2000, 2004, 2008, 2012, and 2016.\n\n2. **Lines 3933-4048 (chunk 49595)**: A chart in the section \"More see 'mud-slinging,' less focus on issues\" showing percentages of voters who say there was \"More mudslinging than usual\" and \"Less mudslinging than usual\" across election years 1992, 1996, 2000, 2004, 2008, 2012, and 2016.\n\nThese appear to be two representations of the same data - one earlier in the document and one later in a more detailed section about campaign evaluations.\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 62437, "error": null, "extra": {"thread_id": 1926, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8bd535bded05431a9423ba96684e000e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 344, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "3"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find information about voters' reactions to Trump's victory, specifically looking for percentages who feel \"uneasy\" and \"excited.\"\n\nLooking through the document, I can find on page 6 (document page 8) a section titled \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" which shows voter reactions.\n\nThe chart shows:\n- 53% say Trump's election makes them feel \"uneasy\"\n- 51% say it makes them feel \"hopeful\"\n- 41% say \"sad\"\n- 41% say \"scared\"\n- 36% say \"proud\"\n- 31% say \"angry\"\n\nI don't see \"excited\" listed as one of the emotional reactions measured. The document shows \"hopeful,\" \"proud,\" \"uneasy,\" \"sad,\" \"scared,\" and \"angry\" but not \"excited.\"\n\nFor \"uneasy,\" the answer is 53%.\n\nSince \"excited\" is not measured in this survey, I cannot provide a percentage for it.\n\nAnswer: 53% uneasy, excited not measured", "citations": [], "input_tokens": 90783, "output_tokens": 245, "cost_micros": 276024, "latency_ms": 34905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "53 uneasy, excited not measured", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document for voters' reactions to Trump's victory:\n\nFrom page 7, there is a table titled \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\" which shows:\n- Uneasy: 53%\n\nFrom page 8, there is a word cloud table showing one-word reactions. Under \"Trump voters,\" I can see:\n- Excited: 16 (this is the actual number of respondents who mentioned this word, not a percentage)\n\nHowever, the question asks for percentages. Looking at page 11, there is a more detailed table showing emotional reactions:\n\"% of voters who say the election of Donald Trump makes them feel ...\"\n- Uneasy: 53%\n\nThe document does not provide \"excited\" as a percentage in the main emotional reactions table on pages 7 and 11. The word \"excited\" only appears in the word cloud with an actual count of 16 mentions from Trump voters, but this is not presented as a percentage of all voters.\n\nBased on the data provided in percentage form:\n- Uneasy: 53%\n- Excited: Not provided as a percentage of all voters\n\nAnswer: 53%, Not provided", "citations": [], "input_tokens": 19136, "output_tokens": 269, "cost_micros": 61443, "latency_ms": 9281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "53, not provided", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentages of voters who felt \"uneasy\" and \"excited\" about Trump's victory.\n\nLooking at the document, on page 6 (and repeated on page 10), there is a section titled \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\" which shows:\n\n% of voters who say the election of Donald Trump makes them feel:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nSo \"uneasy\" is 53%.\n\nHowever, I don't see \"excited\" listed as one of the reaction options in this chart.\n\nLooking at page 7, there is a word cloud section showing individual words mentioned by Trump and Clinton voters. For Trump voters, \"Excited\" appears with 16 mentions, but this is not a percentage - these are actual counts of how many people mentioned that specific word.\n\nThe main emotional reactions chart on page 6 does not include \"excited\" as one of the measured emotions. The document only provides percentages for: hopeful, proud, uneasy, sad, scared, and angry.\n\nSince \"excited\" is not measured as a percentage in the same way as \"uneasy,\" I can only provide the percentage for \"uneasy\" which is 53%.\n\nAnswer: 53% uneasy, excited not measured as percentage", "citations": [], "input_tokens": 24415, "output_tokens": 331, "cost_micros": 78210, "latency_ms": 9875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "53 uneasy, excited not measured as percentage", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of voters who feel \"uneasy\" and \"excited\" about Trump's victory.\n\nLooking at the document, I can find a section titled \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" which shows:\n\n% of voters who say the election of Donald Trump makes them feel \u2026\n\n| Feeling | % of Voters |\n| ------- | ----------- |\n| Uneasy  | 53          |\n| Hopeful | 51          |\n| Proud   | 36          |\n| Scared  | 41          |\n| Sad     | 41          |\n| Angry   | 31          |\n\nI can see that 53% feel \"uneasy.\"\n\nFor \"excited,\" I need to look at the word cloud section titled \"Trump's victory in a word\" which shows responses from Trump voters and Clinton voters. Under Trump voters, I can see:\n- 16 Excited\n\nHowever, this appears to be a count (16 respondents), not a percentage. The note states \"Figures show actual number of respondents who offered each response; these numbers are not percentages.\"\n\nLooking back at the main emotional reactions table on the previous page, \"excited\" is not listed as one of the specific emotions measured in the survey question about how Trump's election makes them feel.\n\nBased on the available data in the document:\n- Uneasy: 53%\n- Excited: Not measured as a percentage in the emotional reactions survey (only appears as a word count of 16 in the open-ended response)\n\nAnswer: 53%, not measured as percentage", "citations": [], "input_tokens": 21636, "output_tokens": 364, "cost_micros": 70368, "latency_ms": 11468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "53, not measured as percentage", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentages of voters who felt \"uneasy\" and \"excited\" about Trump's victory.\n\nLooking at the document, I can find information about emotional reactions to Trump's election on page 10:\n\n\"Voters express a mix of emotional reactions to the election of Donald Trump. On the positive side, 51% say that Trump's election makes them feel hopeful; somewhat fewer say it makes them feel proud (36%).\n\nOn the negative side, 53% say Trump's election makes them feel uneasy. About four-in-ten say his election makes them feel sad (41%) or scared (41%), and 31% say the election of Trump makes them feel angry.\"\n\nSo 53% felt \"uneasy.\"\n\nLooking at the table on page 6 titled \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" and the word cloud on page 7, I can see the following reactions listed:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nIn the word cloud on page 7 showing \"Trump's victory in a word,\" I can see \"Excited\" listed among Trump voters with 16 mentions, but this is showing the actual number of respondents, not percentages.\n\nThe main table showing percentage reactions does not include \"excited\" as one of the measured emotions. The document only provides percentages for: hopeful, proud, uneasy, sad, scared, and angry.\n\nBased on the available data:\n- Uneasy: 53%\n- Excited: Not measured as a percentage in the survey\n\nAnswer: 53% uneasy, excited not measured as percentage", "citations": [], "input_tokens": 24376, "output_tokens": 401, "cost_micros": 79143, "latency_ms": 11812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "53 uneasy, excited not measured as percentage", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "list", "gold": "[53, 1.4]", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the specific percentages for \"uneasy\" and \"excited\" reactions to Trump's victory.Let me read the specific section with the data about voter reactions to see the percentages clearly.Based on the document, I can see the percentages clearly in the figure at lines 890-916:\n\n- **Uneasy: 53%**\n- **Excited: 16** (this appears to be a count in the open-ended word table, not a percentage)\n\nLet me verify if there's a percentage for \"excited\" elsewhere in the document.The document shows a figure with percentages of voter reactions to Trump's victory. The measured reactions with percentages are:\n- Hopeful: 51%\n- Proud: 36%\n- **Uneasy: 53%**\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\n\"Excited\" appears in a different table showing word counts from open-ended responses (16 mentions from Trump voters), not as a percentage measurement of all voters.\n\nBased on the document, the percentage for \"uneasy\" is **53%**. There is no percentage given for \"excited\" as a reaction category - it only appears as a word count (16) in the open-ended responses table.\n\nAnswer: 53%, 16", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44733, "error": null, "extra": {"thread_id": 1927, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5850dcc4518c4b98b48b2eae8c928f95", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 326, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "16, 53", "normalised_gold": "1 4, 53"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "native_pdf", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I need to find information about voters' reactions to Trump's victory, specifically the percentage who felt \"angry.\"\n\nLooking through the document, I find a section on page 10 titled \"Emotional reactions to Trump's election\" with a chart showing \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\"\n\nThe chart shows percentages of voters who say the election of Donald Trump makes them feel various emotions:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nAnswer: 31", "citations": [], "input_tokens": 90769, "output_tokens": 150, "cost_micros": 274557, "latency_ms": 34717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "11-21-16-Updated-Post-Election-Release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "azure_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the percentage of voters who feel \"angry\" about Trump's victory.\n\nLooking through the document for information about emotional reactions to Trump's election:\n\nOn page 7, there is a section titled \"Election reactions: Nearly all Trump supporters feel 'hopeful'\" which includes a chart showing \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\"\n\nThe chart shows the percentage of voters who say the election of Donald Trump makes them feel various emotions:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nThe document states: \"About half of voters (53%) say his election makes them feel 'uneasy,' while nearly as many (51%) say it makes them feel 'hopeful'. Smaller shares say his election triumph makes them feel 'scared', 'sad' (41% each), 'proud' (36%) or 'angry' (31%).\"\n\nAnswer: 31", "citations": [], "input_tokens": 19122, "output_tokens": 243, "cost_micros": 61011, "latency_ms": 7046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "azure_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document for information about voters' reactions to Trump's victory.\n\nOn page 6 (and repeated on page 10), there is a chart titled \"Top reactions to Trump's victory - 'uneasy,' 'hopeful'\" which shows the percentage of voters who say the election of Donald Trump makes them feel various emotions.\n\nAccording to this chart:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nThe document states that 31% of voters say Trump's election makes them feel \"angry.\"\n\nAnswer: 31", "citations": [], "input_tokens": 24401, "output_tokens": 157, "cost_micros": 75558, "latency_ms": 5485, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "llamacloud_basic_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the percentage of voters who feel \"angry\" about Trump's election.\n\nLooking through the document for information about voters' emotional reactions to Trump's victory...\n\nOn page 12 (in the section \"Voters express a mix of emotional reactions to the election of Donald Trump\"), I find a table showing:\n\n\"% of voters who say the election of Donald Trump makes them feel \u2026\"\n\nThe feelings listed include:\n- Hopeful: 51%\n- Proud: 36%\n- Uneasy: 53%\n- Sad: 41%\n- Scared: 41%\n- Angry: 31%\n\nSo 31% of voters say the election of Trump makes them feel angry.\n\nAnswer: 31", "citations": [], "input_tokens": 21622, "output_tokens": 166, "cost_micros": 67356, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "llamacloud_premium_lc", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "Looking through the document for emotional reactions to Trump's victory...\n\nOn page 6, I find the section \"Top reactions to Trump's victory \u2013 'uneasy,' 'hopeful'\" which shows:\n\n% of voters who say the election of Donald Trump makes them feel ...\n\n| Reaction | %  |\n| -------- | -- |\n| Hopeful  | 51 |\n| Proud    | 36 |\n| Uneasy   | 53 |\n| Sad      | 41 |\n| Scared   | 41 |\n| Angry    | 31 |\n\nThe table clearly shows that 31% of voters say Trump's election makes them feel angry.\n\nAnswer: 31", "citations": [], "input_tokens": 24362, "output_tokens": 166, "cost_micros": 75576, "latency_ms": 5985, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "doc_id": "11-21-16-Updated-Post-Election-Release.pdf", "answer_format": "float", "gold": "31%", "pages": 45, "document_id": 5222, "arm": "surfsense_agentic", "question_id": "11-21-16-Updated-Post-Election-Release.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about voter reactions to Trump's victory.Let me search for sections that discuss reactions or emotions about Trump's victory.Great! I found a reference to \"angry\" in the context of reactions. Let me read the specific chunk that contains this information.Perfect! I found the information. According to the document, 31% of voters say Trump's election makes them feel \"angry\".\n\nAnswer: 31", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32359, "error": null, "extra": {"thread_id": 1928, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4e0d0df3d4f749b8a5376414cce75543", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 148, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "31.0", "normalised_gold": "31.0"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I need to find data from 2015 about how many people believed the government was doing very/fairly well in reducing the threat of terrorism.\n\nLooking at page 2, I can see a chart titled \"For first time since 9/11, government's job rating on terrorism turns negative\" which shows the percentage who say the government is doing \"Very/Fairly well\" in reducing the threat of terrorism.\n\nFrom the chart on page 2, at the end point in 2015 (specifically December 2015), I can see:\n- Total: 46%\n\nLooking at page 10-11, there's a detailed breakdown by party. The chart on page 10 shows \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" with partisan breakdowns.\n\nFrom the chart on page 10, for 2015 (the rightmost data point):\n- Republican: 27%\n- Democrat: 64%\n- Independent: 44%\n\nSo for the question asking about adults (total), Republicans, and Democrats in 2015:\n- Adults (Total): 46%\n- Republicans: 27%\n- Democrats: 64%\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 58172, "output_tokens": 269, "cost_micros": 178551, "latency_ms": 22812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the percentage of adults, Republicans, and Democrats who believed the government was doing very/fairly well in reducing the threat of terrorism in 2015.\n\nLooking at the document, I can find this information in the section \"Views of how the government is handling the terrorist threat\" on page 10-11.\n\nThe document states:\n- Total (all adults): 46% say the government is doing very/fairly well\n- Democrats: 64% say the government is doing very/fairly well  \n- Republicans: 27% say the government is doing very/fairly well\n\nThis data is from the December 2015 survey (conducted Dec. 8-13, 2015).\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 13406, "output_tokens": 163, "cost_micros": 42663, "latency_ms": 5468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document for the specific data on government performance in reducing the threat of terrorism in 2015:\n\nFrom the chart on page 10 titled \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" showing percentages saying government is doing very/fairly well in reducing the threat of terrorism:\n\nFor December 2015:\n- Total/All adults: 46%\n- Republicans: 27%\n- Democrats: 64%\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 19266, "output_tokens": 113, "cost_micros": 59493, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the percentages of surveyed adults, Republicans, and Democrats who believed the government was doing very/fairly well in reducing the threat of terrorism in 2015.\n\nLooking at the document, I can find this information in the section titled \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" on page 10.\n\nThe chart shows \"% saying govt is doing very/fairly well in reducing the threat of terrorism\" for December 2015:\n- Total (all adults): 46%\n- Republican: 27%\n- Democrat: 64%\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 14501, "output_tokens": 140, "cost_micros": 45603, "latency_ms": 5233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document for data on how well the government is doing in reducing the threat of terrorism in 2015:\n\nFrom the table on page 10 titled \"How well is govt doing reducing threat of terrorism?\" and the section \"Views of how the government is handling the terrorist threat\" on pages 10-11:\n\nIn December 2015:\n- Total: 46% said very/fairly well\n- Republican: 27% said very/fairly well  \n- Democrat: 64% said very/fairly well\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 18633, "output_tokens": 125, "cost_micros": 57774, "latency_ms": 5375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "list", "gold": "['46', '27', '64']", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the specific data about beliefs regarding government performance in reducing terrorism threats in 2015.Let me search for data about government performance in reducing terrorism threats:I can see a figure showing data about \"How well is govt doing reducing threat of terrorism?\" with percentages for \"Very/Fairly well\" across years. I can see 46 for 2015. Now let me look for the breakdown by party affiliation (Republicans and Democrats):Let me search more specifically for partisan breakdown of government performance on terrorism:I found a reference to partisan data. Let me read that section:The figure I found shows partisan data over time. Let me read more context around this section to understand the 2015 data points:Perfect! I found the data. The text clearly states:\n\n- Overall in 2015: 46% say government is doing very or fairly well\n- Democrats in 2015: 64% say government is doing at least fairly well  \n- Republicans in 2015: 27% say government is doing very or fairly well\n\nThis is also confirmed by the figure showing the data points for year '15' with Democrat at 64, Republican at 27.\n\nAnswer: 46, 27, 64", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42844, "error": null, "extra": {"thread_id": 1929, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cb2b43cade5c41b98919c13d86621c55", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 281, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "27, 46, 64", "normalised_gold": "27, 46, 64"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the threats that increased from August 2014 to December 2015, and then determine which has the biggest Republican-Democrat difference.\n\nLooking at page 15, I can see a chart titled \"ISIS rises higher atop public's threat list\" showing various threats with their percentages in December 2015 and August 2014, plus the change.\n\nThe threats that increased (positive change) from August 2014 to December 2015 are:\n- ISIS: +16 points (67% to 83%)\n- Iran's nuclear program: +3 points (59% to 62%)\n- North Korea's nuclear program: +2 points (57% to 59%)\n- China's emergence as a world power: +1 point (48% to 49%)\n- Global climate change: +1 point (48% to 49%)\n\nNow I need to find the R-D difference for each of these threats. Looking at page 15, there's another chart showing \"Partisan differences in views of top threats to U.S.\" with R-D diff column:\n\n- Iran's nuclear program: R+27\n- ISIS: R+14\n- China's emergence as a world power: R+18\n- North Korea's nuclear program: R+13\n- Global climate change: D+51 (this is actually a Democratic advantage, so -51 for R-D)\n\nAmong the threats that increased, Iran's nuclear program has the biggest R-D difference at R+27.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 58152, "output_tokens": 342, "cost_micros": 179586, "latency_ms": 35328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I need to find which threats had bigger percentages in December 2015 compared to August 2014, and then determine which has the largest Republican-Democrat difference.\n\nLooking at the table \"ISIS rises higher atop public's threat list\" on page 14:\n\nThreats that increased from August 2014 to December 2015:\n1. ISIS: 67% \u2192 83% (+16)\n2. Iran's nuclear program: 59% \u2192 62% (+3)\n3. North Korea's nuclear program: 57% \u2192 59% (+2)\n4. China's emergence as a world power: 48% \u2192 49% (+1)\n5. Global climate change: 48% \u2192 49% (+1)\n\nNow I need to find the R-D differences for these threats from the table \"Partisan differences in views of top threats to U.S.\" on page 15:\n\n1. ISIS: R-D difference = +14 (93% Republican - 79% Democrat)\n2. Iran's nuclear program: R-D difference = +27 (79% Republican - 52% Democrat)\n3. North Korea's nuclear program: R-D difference = +13 (70% Republican - 57% Democrat)\n4. China's emergence as a world power: R-D difference = +18 (62% Republican - 44% Democrat)\n5. Global climate change: R-D difference = -51 (22% Republican - 73% Democrat = -51, but this is negative)\n\nAmong those that increased from 2014 to 2015, Iran's nuclear program has the biggest R-D difference at +27 percentage points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 13386, "output_tokens": 380, "cost_micros": 45858, "latency_ms": 8984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find threats that increased from August 2014 to December 2015, and then determine which has the biggest Republican-Democrat difference.\n\nFrom page 14, the table \"ISIS rises higher atop public's threat list\" shows changes from August 2014 to December 2015:\n- ISIS: +16 points\n- Iran's nuclear program: +3 points\n- North Korea's nuclear program: +2 points\n- China's emergence as a world power: +1 point\n- Global climate change: +1 point\n\nFrom page 15, the table \"Partisan differences in views of top threats to U.S.\" shows R-D differences in December 2015:\n- ISIS: 93% (R) - 79% (D) = +14 R-D difference\n- Iran's nuclear program: 79% (R) - 52% (D) = +27 R-D difference\n- North Korea's nuclear program: 70% (R) - 57% (D) = +13 R-D difference\n- China's emergence as a world power: 62% (R) - 44% (D) = +18 R-D difference\n\nAmong threats that increased from August 2014 to December 2015, Iran's nuclear program has the biggest R-D difference at 27 points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 19246, "output_tokens": 314, "cost_micros": 62448, "latency_ms": 7014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the threats that increased from August 2014 to December 2015, then identify which has the biggest Republican-Democrat difference.\n\nFrom the table \"ISIS rises higher atop public's threat list\" (page 14), the threats that increased are:\n- ISIS: +16 points\n- Iran's nuclear program: +3 points\n- North Korea's nuclear program: +2 points\n- China's emergence: +1 point\n- Global climate change: +1 point\n\nFrom the table \"Partisan differences in views of top threats to U.S.\" (page 15), the R-D differences are:\n- ISIS: 93% (R) - 79% (D) = +14\n- Iran's nuclear program: 79% (R) - 52% (D) = +27\n- North Korea's nuclear program: 70% (R) - 57% (D) = +13\n- China's emergence: 62% (R) - 44% (D) = +18\n\nAmong the threats that increased from August 2014, Iran's nuclear program has the biggest R-D difference at 27 points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 14481, "output_tokens": 271, "cost_micros": 47508, "latency_ms": 7327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I need to find threats where concern increased from August 2014 to December 2015, then identify which has the biggest Republican-Democrat difference.\n\nFrom the table \"ISIS rises higher atop public's threat list\" on page 14, I can see the changes from August 2014 to December 2015:\n\n- ISIS: +16 points (83% in Dec 2015 vs 67% in Aug 2014)\n- Iran's nuclear program: +3 points\n- North Korea's nuclear program: +2 points\n- China's emergence as a world power: +1 point\n- Global climate change: +1 point\n- Israeli-Palestinian conflict: -5 points (decreased)\n- Growing authoritarianism in Russia: -11 points (decreased)\n\nSo the threats that increased are: ISIS, Iran's nuclear program, North Korea's nuclear program, China's emergence, and global climate change.\n\nFrom the table \"Partisan differences in views of top threats to U.S.\" on page 15, the R-D differences in December 2015 are:\n\n- ISIS: +14 (93% Rep - 79% Dem)\n- Iran's nuclear program: +27 (79% Rep - 52% Dem)\n- North Korea's nuclear program: +13 (70% Rep - 57% Dem)\n- China's emergence as a world power: +18 (62% Rep - 44% Dem)\n- Global climate change: -51 (22% Rep - 73% Dem) - Note: This is negative, meaning Democrats see it as more of a threat\n\nAmong those that increased from 2014 to 2015, Iran's nuclear program has the biggest R-D difference at +27 points.\n\nAnswer: Iran's nuclear program", "citations": [], "input_tokens": 18613, "output_tokens": 402, "cost_micros": 61869, "latency_ms": 9391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "iran s nuclear program", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "Iran's nuclear program", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that appears most relevant based on the priority list: the ISIS and terrorism release from December 2015.Let me continue reading to find the data about threats and R-D differences:I can see the document mentions ISIS increased from 67% in August 2014 to 83% as a major threat. Let me continue reading to find data about other threats and their R-D differences:Let me search for data about threats to well-being and R-D differences. I need to find a table or chart comparing different threats:Good! I found references to \"R-D diff\" in the document. Let me read the section around line 1364:This table shows the \"most important problem\" but I need to find the table about \"threats to well-being\" comparing 2015 to August 2014. Let me continue searching:Perfect! I found the section about threats. Let me read the full table around line 1974-2076:Great! Now I need to see the partisan breakdown table with R-D differences. Let me continue reading:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55203, "error": null, "extra": {"thread_id": 1930, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dd9ddaac9b56498ab35c8337203470ec", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 315, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i ll read document that appears most relevant based on priority list isis and terrorism release from december 2015 let me continue reading to find data about threats and r d differences i can see document mentions isis increased from 67 in august 2014 to 83 as major threat let me continue reading to find data about other threats and their r d differences let me search for data about threats to well being and r d differences i need to find table or chart comparing different threats good i found references to r d diff in document let me read section around line 1364 this table shows most important problem but i need to find table about threats to well being comparing 2015 to august 2014 let me continue searching perfect i found section about threats let me read full table around line 1974 2076 great now i need to see partisan breakdown table with r d differences let me continue reading", "normalised_gold": "iran s nuclear program"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll go through the document systematically to count all bar charts.\n\nLooking through each page:\n\nPage 1: Title page - no charts\n\nPage 2: \n- One line chart showing \"Views of Government's Handling of Terrorism\"\n- No bar charts\n\nPage 3:\n- One line chart showing partisan gaps\n- One pie chart showing scrutiny of Muslims\n- No bar charts\n\nPage 4:\n- One table showing party preferences on various issues\n- No bar charts\n\nPage 5:\n- One line chart showing views on Islam and violence over time\n- One line chart showing partisan divide\n- One table with demographic data\n- No bar charts\n\nPage 6:\n- Multiple horizontal bar charts showing \"Should Muslims be subject to greater scrutiny\" broken down by different groups\n- Count: approximately 3-4 bar charts on this page\n\nPage 7:\n- Multiple horizontal bar charts continuing the scrutiny question by demographics\n- Count: approximately 3-4 bar charts on this page\n\nPage 8:\n- One table showing \"Most important problem facing nation\"\n- No bar charts\n\nPage 9:\n- One table showing partisan differences\n- No bar charts\n\nPage 10:\n- One line chart showing government ratings over time by party\n- No bar charts\n\nPage 11:\n- One table showing demographic views\n- No bar charts\n\nPage 12:\n- One line chart showing concerns about anti-terrorism policies\n- No bar charts\n\nPage 13:\n- One line chart showing partisan views on anti-terrorism policies\n- One table\n- No bar charts\n\nPage 14:\n- Multiple horizontal bar charts showing ISIS as threat\n- Count: approximately 2 bar charts\n\nPage 15:\n- Multiple horizontal bar charts showing partisan differences in threats\n- Count: approximately 2 bar charts\n\nPage 16:\n- Line charts showing views of U.S. campaign\n- No bar charts\n\nPage 17:\n- Horizontal bar charts showing current assessments\n- Count: approximately 2 bar charts\n\nPage 18:\n- Multiple horizontal bar charts about concerns and ground troops\n- Count: approximately 3-4 bar charts\n\nPage 19:\n- Line chart and horizontal bar charts\n- Count: approximately 2 bar charts\n\nPage 20:\n- Line charts showing views on military force\n-", "citations": [], "input_tokens": 58124, "output_tokens": 512, "cost_micros": 182052, "latency_ms": 17578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the bar charts in this document.\n\nLet me go through the document systematically:\n\n1. Page 1: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line chart, not a bar chart.\n\n2. Page 2: \"Growing partisan gap in views of whether Islam encourages violence\" - This is a line chart.\n\n3. Page 2: \"Most say more scrutiny of Muslims not warranted\" - This is a pie chart.\n\n4. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This appears to be a table with bar-like elements showing percentages, but it's formatted as a comparative table.\n\n5. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - This is a line chart.\n\n6. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - This is a line chart.\n\n7. Page 5: \"Demographic divides in views of Islam and violence\" - This is a table.\n\n8. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This is a table.\n\n9. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny because of their faith\" - This is a table.\n\n10. Page 8: \"Growing share cite terrorism as the most important problem facing the U.S.\" - This is a table.\n\n11. Page 9: \"More Republicans than Democrats cite immigration, terrorism as top problems\" - This is a table.\n\n12. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - This is a line chart.\n\n13. Page 11: \"Evaluations of the government's job reducing the threat of terrorism\" - This is a table.\n\n14. Page 12: \"More concern with country's protection than civil liberties\" - This is a line chart.\n\n15. Page 13: \"Most Republicans say govt anti-terror policies have not gone 'far enough'\" - This is a line chart.\n\n16. Page 13: \"Young adults more concerned about civil liberties than older people\" - This is a table.\n\n17. Page 14: \"ISIS rises higher atop", "citations": [], "input_tokens": 13358, "output_tokens": 512, "cost_micros": 47754, "latency_ms": 11733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "17", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll carefully count all the bar charts in this document.\n\nLet me go through the document systematically:\n\n1. Page 1: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line chart, not a bar chart.\n\n2. Page 2: \"Growing partisan gap in views of whether Islam encourages violence\" - This is a line chart, not a bar chart.\n\n3. Page 2: \"Most say more scrutiny of Muslims not warranted\" - This is a pie chart, not a bar chart.\n\n4. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This appears to be a table with horizontal bars showing differences.\n\n5. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - Line chart.\n\n6. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - Line chart.\n\n7. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This shows horizontal bar comparisons.\n\n8. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny because of their faith\" - Horizontal bars.\n\n9. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - Line chart.\n\n10. Page 12: \"More concern with country's protection than civil liberties\" - Line chart.\n\n11. Page 13: \"Most Republicans say govt anti-terror policies have not gone 'far enough'\" - Line chart.\n\n12. Page 14: \"ISIS rises higher atop public's threat list\" - Horizontal bars.\n\n13. Page 15: \"Partisan differences in views of top threats to U.S.\" - Horizontal bars.\n\n14. Page 16: \"Most continue to support U.S. military campaign against ISIS\" - Line chart.\n\n15. Page 17: \"Few say current military effort in Iraq and Syria is going well\" - Horizontal bars.\n\n16. Page 17: \"More long-term optimism in ISIS fight\" - Vertical bars.\n\n17. Page 18: \"Bigger concern: Will U.S. go too far or not far enough in Iraq and Syria?\" - Table format with categories.\n\n18. Page ", "citations": [], "input_tokens": 19218, "output_tokens": 512, "cost_micros": 65334, "latency_ms": 13172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll carefully go through the document to count all bar charts.\n\nLet me examine each chart in the document:\n\n1. Page 2: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line graph, not a bar chart.\n\n2. Page 3: \"Most say more scrutiny of Muslims not warranted\" - This is a bar chart showing percentages for Republicans, Independents, and Democrats.\n\n3. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This is a bar chart comparing Democratic Party vs Republican Party on various issues.\n\n4. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - This is a line graph.\n\n5. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - This is a bar chart showing Republican, Independent, and Democrat responses.\n\n6. Page 5: \"Demographic divides in views of Islam and violence\" - This is a table, not a bar chart.\n\n7. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This is a bar chart showing responses by different political groups.\n\n8. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny\" - This is a bar chart showing responses by different demographic groups.\n\n9. Page 8: \"Most important problem facing nation\" - This is a table, not a bar chart.\n\n10. Page 9: \"Most important problem facing nation\" (partisan breakdown) - This is a table, not a bar chart.\n\n11. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - This is a bar chart showing percentages by party affiliation.\n\n12. Page 11: \"Older, less educated give more negative ratings\" - This is a bar chart showing responses by age and education.\n\n13. Page 12: \"More concern with country's protection than civil liberties\" - This is a line graph.\n\n14. Page 13: Bar chart showing partisan views on anti-terrorism policies - This is a line graph.\n\n15. Page 13: \"Young adults more concerned about civil liberties\" - This is a bar chart showing responses by age group.\n\n16. Page 14: \"", "citations": [], "input_tokens": 14453, "output_tokens": 512, "cost_micros": 51039, "latency_ms": 11922, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "16", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the bar charts in this document.\n\nLet me go through the document systematically:\n\n1. Page 1: \"For first time since 9/11, government's job rating on terrorism turns negative\" - This is a line chart, not a bar chart.\n\n2. Page 2: \"Growing partisan gap in views of whether Islam encourages violence\" - This is a line chart.\n\n3. Page 2: \"Most say more scrutiny of Muslims not warranted\" - This appears to be a simple text display with percentages, not a chart.\n\n4. Page 3: \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - This is a table, not a bar chart.\n\n5. Page 4: \"Does the Islamic religion encourage violence more than other religions?\" - This is a line chart.\n\n6. Page 4: \"Wide partisan divide over whether Islam is more likely to encourage violence\" - This is a line chart.\n\n7. Page 6: \"Should Muslims be subject to greater scrutiny because of their religion?\" - This is a table.\n\n8. Page 7: \"Young people, minorities, less likely to say Muslims should receive greater scrutiny because of their faith\" - This is a table.\n\n9. Page 8: \"Growing share cite terrorism as the most important problem facing the U.S.\" - This is a table.\n\n10. Page 9: \"Most important problem facing nation\" - This is a table.\n\n11. Page 10: \"Across-the-board drop in ratings of govt efforts to reduce terrorist threat\" - This is a line chart.\n\n12. Page 11: \"How well is govt doing reducing threat of terrorism?\" - This is a table.\n\n13. Page 12: \"More concern with country's protection than civil liberties\" - This is a line chart.\n\n14. Page 13: \"Most Republicans say govt anti-terror policies have not gone 'far enough'\" - This is a line chart.\n\n15. Page 13: \"Young adults more concerned about civil liberties than older people\" - This is a table.\n\n16. Page 14: \"ISIS rises higher atop public's threat list\" - This is a table.\n\n17. Page 15: \"Partisan differences in views of top threats to U.S.\" - This is a table.", "citations": [], "input_tokens": 18585, "output_tokens": 512, "cost_micros": 63435, "latency_ms": 12656, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "17", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "int", "gold": "1", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to count the bar charts in the report.Let me continue reading to count all the bar charts in the document.Let me search for \"figure\" tags and examine the visual content descriptions more carefully:I found 31 figures in the document. Now I need to examine each one to determine which are bar charts. Let me read through the document systematically to identify the type of each chart. Based on what I've already read, I can see several line charts and some other visualizations. Let me look for more specific patterns that indicate bar charts.\n\nLooking at the content I've already read, I can identify the following charts:\n\n1. Lines 188-253: Line chart showing government's job rating on terrorism over time\n2. Lines 306-354: Line chart showing partisan gap in views of whether Islam encourages violence  \n3. Lines 372-413: Pie chart showing views on Muslim scrutiny\n4. Lines 436-505: Horizontal bar chart comparing parties on various issues (terrorist threat, gun control, economy, immigration, climate change, environment)\n5. Lines 563-617: Line chart about Islam and violence over time\n6. Lines 618-657: Line chart showing partisan differences over time\n7. Lines 937-1006: Horizontal bar chart showing Muslim scrutiny views by demographic groups\n8. Lines 1055-1193: Horizontal bar chart showing scrutiny views by various demographics\n9. Lines 1508-1563: Line chart showing government ratings on terrorism by party\n10. Lines 1738-1802: Line chart about civil liberties concerns\n11. Lines 1850-1905: Line chart showing partisan views on anti-terror policies\n12. Lines 2004-2072: Vertical bar chart comparing threat levels (ISIS, Iran, North Korea, etc.)\n13. Lines 2117-2210: Horizontal bar chart showing partisan differences in threat views\n14. Lines 2239-2304: Line chart about military campaign approval\n15. Lines 2353-2425: Two separate charts about military campaign assessment\n\nLet me carefully review the entire document to count specifically the bar charts (not line charts or pie charts).\n\nBased on my thorough reading of the document, I can identify the following **bar charts**:\n\n1. **Page 4** (lines 436-505): Horizontal bar chart - \"GOP favored on terrorism, gun control; Democrats lead on climate change\" - comparing Democratic Party vs Republican Party on 6 issues\n2. **Page 6** (lines 937-1006): Horizontal bar chart - \"Should Muslims be subject to greater scrutiny because of their religion?\" - showing responses by party/ideology\n3. **Page 7** (lines 1055-1193): Horizontal bar chart - \"Young people, minorities, less likely to say Muslims should receive greater scrutiny\" - showing responses by demographics\n4. **Page 14** (lines 2004-2072): Mixed chart showing ISIS and other threats with numerical comparisons\n5. **Page 15** (lines 2117-2210): Horizontal bar chart - \"Partisan differences in views of top threats to U.S.\"\n\nAfter careful examination, the document contains **5 bar charts**.\n\nAnswer: 5", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46078, "error": null, "extra": {"thread_id": 1931, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6dc6e43bf0ee4e80af05e0ae2c870d11", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 404, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "1"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "native_pdf", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically page 17 (document page 18), I can find information about public attitudes regarding the ISIS fight.\n\nThe chart titled \"More long-term optimism in ISIS fight\" shows that in December 2015, 66% of people said the U.S. and its allies will \"Definitely/Probably succeed\" in their military campaign against Islamic militants in Iraq and Syria.\n\nThe text confirms: \"Two-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail.\"\n\nAnswer: Two-thirds believe the U.S. and allies will succeed", "citations": [], "input_tokens": 58160, "output_tokens": 159, "cost_micros": 176865, "latency_ms": 22375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "12-15-15-ISIS-and-terrorism-release-final.pdf"}, "graded": {"correct": false, "f1": 0.30769230769230765, "method": "str_norm", "normalised_pred": "two thirds believe u s and allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "azure_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Let me search the document for information about people's attitudes regarding the ultimate success of the U.S. and its allies in the fight against ISIS.\n\nI found the relevant section on page 17, which states:\n\n\"Views are more positive when it comes to whether the U.S. and its allies will succeed or fail in their military campaign against ISIS.\n\nTwo-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail.\"\n\nThe document also notes this represents an increase: \"The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: Long-term optimism that U.S. and allies will succeed", "citations": [], "input_tokens": 13394, "output_tokens": 182, "cost_micros": 42912, "latency_ms": 6734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "str_norm", "normalised_pred": "long term optimism that u s and allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "azure_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, on page 17, there is a section titled \"More long-term optimism in ISIS fight\" which shows:\n\n\"Two-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail. The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: Will definitely or probably succeed", "citations": [], "input_tokens": 19254, "output_tokens": 118, "cost_micros": 59532, "latency_ms": 4703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.888888888888889, "method": "str_norm", "normalised_pred": "will definitely or probably succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "llamacloud_basic_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically the section titled \"More long-term optimism in ISIS fight\" on page 16, I can find the answer.\n\nThe document shows that in December 2015, 66% of people said the U.S. and its allies will \"Definitely/Probably succeed\" in their military campaign against Islamic militants in Iraq and Syria, compared to only 27% who said they will \"Definitely/Probably fail.\"\n\nThe text states: \"The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: They will succeed/be successful", "citations": [], "input_tokens": 14489, "output_tokens": 141, "cost_micros": 45582, "latency_ms": 5453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4444444444444445, "method": "str_norm", "normalised_pred": "they will succeed be successful", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "llamacloud_premium_lc", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document for information about public attitudes on the ultimate success of the U.S. and its allies against ISIS in 2015...\n\nFrom page 17, I can see the section \"Current assessments of U.S. military campaign, prospects for success\" which states:\n\n\"Views are more positive when it comes to whether the U.S. and its allies will succeed or fail in their military campaign against ISIS.\n\nTwo-thirds (66%) now say they think the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria, while just 27% say they will definitely or probably fail.\"\n\nThe document also shows that \"The share who say the U.S. and its allies will succeed is up 11 points from July (from 55%).\"\n\nAnswer: Two-thirds believe U.S. and allies will succeed", "citations": [], "input_tokens": 18621, "output_tokens": 191, "cost_micros": 58728, "latency_ms": 6156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.30769230769230765, "method": "str_norm", "normalised_pred": "two thirds believe u s and allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "doc_id": "12-15-15-ISIS-and-terrorism-release-final.pdf", "answer_format": "str", "gold": "will definitely/probably succeed", "pages": 28, "document_id": 5223, "arm": "surfsense_agentic", "question_id": "12-15-15-ISIS-and-terrorism-release-final.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about public attitudes toward the U.S. and its allies' efforts against ISIS in 2015.Let me continue reading to find the specific information about public attitudes:Let me search for information about attitudes toward the fight against ISIS and whether the U.S. and allies will succeed:Perfect! I found the information. Let me read a bit more to confirm the complete data:Based on the document, I found the answer. The document states that although few say the current military effort in Iraq and Syria is going well (only 35% in December 2015 say it's going very/fairly well), when it comes to long-term outlook, two-thirds (66%) believe the U.S. and its allies will either definitely or probably succeed in their campaign against Islamic militants in Iraq and Syria.\n\nAnswer: Two-thirds (66%) believe the U.S. and its allies will succeed", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43858, "error": null, "extra": {"thread_id": 1932, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a011d8590eae4d86a7883730ba9de008", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 232, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.26666666666666666, "method": "str_norm", "normalised_pred": "two thirds 66 believe u s and its allies will succeed", "normalised_gold": "will definitely probably succeed"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1 on page 26:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nand\n\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nThe output dimension of the linear_class layer is `num_classes + 1 = 91 + 1 = 92`.\n\nAnswer: 92", "citations": [], "input_tokens": 59872, "output_tokens": 134, "cost_micros": 181626, "latency_ms": 31250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1, specifically line 14:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nThe output dimension of the linear_class layer is `num_classes + 1`.\n\nFrom line 32, the model is instantiated with:\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nTherefore, the output dimension is 91 + 1 = 92.\n\nAnswer: 92", "citations": [], "input_tokens": 19062, "output_tokens": 151, "cost_micros": 59451, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1, specifically line 18:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nThe output dimension of the linear_class layer is `num_classes + 1`.\n\nFrom line 31, we can see that the model is instantiated with:\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nTherefore, the output dimension is 91 + 1 = 92.\n\nAnswer: 92", "citations": [], "input_tokens": 22767, "output_tokens": 155, "cost_micros": 70626, "latency_ms": 4282, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the output dimension of the linear_class layer in the DETR PyTorch inference code.\n\nLooking at the code in Listing 1:\n\n```python\ndef __init__(self, num_classes, hidden_dim, nheads,\n             num_encoder_layers, num_decoder_layers):\n    super().__init__()\n    # ... other layers ...\n    self.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nThe linear_class layer is defined as `nn.Linear(hidden_dim, num_classes + 1)`, which means it takes `hidden_dim` as input and outputs `num_classes + 1` dimensions.\n\nThen looking at line 33:\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nThe model is instantiated with `num_classes=91`.\n\nTherefore, the output dimension of the linear_class layer is:\nnum_classes + 1 = 91 + 1 = 92\n\nAnswer: 92", "citations": [], "input_tokens": 19551, "output_tokens": 270, "cost_micros": 62703, "latency_ms": 5453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the PyTorch inference code in Listing 1, specifically line 15:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nAnd from line 33:\n\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nThe `linear_class` layer has an output dimension of `num_classes + 1`, where `num_classes=91`.\n\nTherefore, the output dimension is 91 + 1 = 92.\n\nAnswer: 92", "citations": [], "input_tokens": 22903, "output_tokens": 155, "cost_micros": 71034, "latency_ms": 4562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q000", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "92", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the DETR document to find the output dimension of the linear_class layer in the PyTorch inference code.Let me search for \"linear_class\" in the document to find the PyTorch inference code:Let me search for PyTorch code or inference code in the document:Now let me read the section with the PyTorch inference code around line 2864:Perfect! I found the PyTorch inference code. Looking at line 2898, I can see:\n\n```python\nself.linear_class = nn.Linear(hidden_dim, num_classes + 1)\n```\n\nAnd from line 2915, the model is instantiated with:\n\n```python\ndetr = DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)\n```\n\nSo the output dimension of the linear_class layer is `num_classes + 1 = 91 + 1 = 92`.\n\nAnswer: 92", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 39343, "error": null, "extra": {"thread_id": 1933, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e081bc008e9a4c03bbbaf39b68e10bd7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 162, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "92", "normalised_gold": "92"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about missed instances when the number of visible instances is 60.\n\nLooking at Figure 12 on page 25, I can see a graph titled \"Analysis of the number of instances of various classes missed by DETR depending on how many are present in the image.\"\n\nThe graph shows three colored lines representing different categories:\n- Dog (blue line)\n- Person (orange line)\n- Apple (green line)\n\nThe x-axis shows \"Number of visible instances\" and the y-axis shows \"% of missed instances.\"\n\nAt the point where the number of visible instances = 60, I need to compare the values for each category:\n\nLooking at the graph at x=60:\n- The dog (blue) line appears to be at approximately 20-25%\n- The person (orange) line appears to be at approximately 15-20%\n- The apple (green) line appears to be at approximately 10-15%\n\nAt 60 visible instances, the dog category shows the highest percentage of missed instances.\n\nAnswer: dog", "citations": [], "input_tokens": 59873, "output_tokens": 231, "cost_micros": 183084, "latency_ms": 35750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "dog", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 12, which shows the analysis of missed instances depending on the number of visible instances.\n\nLooking at Figure 12 on page 25, I can see a graph plotting \"% of missed instances\" (y-axis) against \"Number of visible instances\" (x-axis) for three categories: dog, person, and apple.\n\nAt 60 visible instances, I need to identify which line/category has the highest percentage of missed instances:\n\n- The dog line (appears to be one color)\n- The person line (appears to be another color)  \n- The apple line (appears to be a third color)\n\nFrom the graph, at the 60 visible instances mark, the \"apple\" category appears to have the highest percentage of missed instances, showing approximately 20-25% missed instances, while \"dog\" and \"person\" show lower percentages (around 10-15%).\n\nAnswer: apple", "citations": [], "input_tokens": 19063, "output_tokens": 203, "cost_micros": 60234, "latency_ms": 6092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "apple", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 12, which shows \"Analysis of the number of instances of various classes missed by DETR depending on how many are present in the image.\"\n\nLooking at the graph in Figure 12, I can see three lines representing different categories: dog, person, and apple.\n\nAt 60 visible instances (x-axis), I need to compare the percentage of missed instances (y-axis) for each category:\n\n- Dog (appears to be around 20-25%)\n- Person (appears to be around 15-20%)\n- Apple (appears to be the highest, around 30-35%)\n\nThe apple category clearly shows the highest percentage of missed instances at 60 visible instances.\n\nAnswer: apple", "citations": [], "input_tokens": 22768, "output_tokens": 160, "cost_micros": 70704, "latency_ms": 5344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "apple", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 12, which shows the analysis of missed instances depending on the number of visible instances.\n\nLooking at Figure 12 on page 25, it shows a graph with \"Number of visible instances\" on the x-axis (from 0 to 100) and \"% of missed instances\" on the y-axis (from 0 to 70). The graph shows three lines for different categories: dog, apple, and one other category.\n\nAt the point where the number of visible instances = 60, I need to identify which category line is highest (meaning the largest proportion of missed instances).\n\nFrom the graph, at 60 visible instances:\n- The \"dog\" line (appears to be one line)\n- The \"apple\" line (appears to be another line)\n- There appears to be a third line\n\nLooking at the vertical position at x=60, the \"dog\" line appears to be the highest among the three categories shown, indicating the largest proportion of missed instances at that point.\n\nAnswer: dog", "citations": [], "input_tokens": 19552, "output_tokens": 226, "cost_micros": 62046, "latency_ms": 6390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "dog", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 12 (the table on page 25), I need to find the number of missed instances for each category when there are 60 visible instances.\n\nFrom the table:\n- dog: 5 missed instances\n- person: 22 missed instances  \n- apple: 15 missed instances\n\nOut of 60 visible instances:\n- dog: 5/60 = 8.3%\n- person: 22/60 = 36.7%\n- apple: 15/60 = 25%\n\nThe person category has the largest proportion of missed instances.\n\nAnswer: person", "citations": [], "input_tokens": 22904, "output_tokens": 139, "cost_micros": 70797, "latency_ms": 4000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "person", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q001", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Person", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find information about visible instances and missed instances by category.Let me search for content about \"missed instances\" and look for figures or charts.Let me read the section with Figure 12 which discusses missed instances:Perfect! I found Figure 12 which shows the chart. Let me read the data more carefully. The chart shows:\n- Y-axis: % of missed instances (0 to 70)\n- X-axis: Number of visible instances (20 to 100)\n- Three categories plotted: dog, person, and apple\n\nAt 60 visible instances, I need to identify which category has the largest proportion of missed instances. From the text data shown:\n- Lines 2795-2796: dog\n- Lines 2797-2798: person  \n- Lines 2801-2802: apple\n\nThe chart shows these three categories, and I need to determine which has the highest value at x=60. Based on the chart layout and the typical pattern where lines are listed in the legend in order of their values, and given that \"dog\" is listed first in the legend (line 2795), followed by \"person\" (line 2797), and then \"apple\" (line 2801), this typically indicates their relative positions on the chart.\n\nLet me look for any additional data that might give exact values:Based on the chart data from Figure 12, I can see:\n- Y-axis values listed: 70, 60, 50, 40, 30, 20, 10, 0\n- X-axis values: 20, 40, 60, 80, 100\n- Three categories in the legend: dog, person, apple\n\nThe legend positioning typically indicates the order of lines on the chart. In this case, at x=60 (60 visible instances), \"dog\" appears to be listed first in the legend (lines 2795-2796), suggesting it has the highest proportion of missed instances at that point, followed by \"person\" and then \"apple\".\n\nGiven the layout where:\n- \"dog\" is listed at the top of the legend\n- \"person\" is in the middle\n- \"apple\" is at the bottom\n\nAnd the typical convention where legend items are ordered by their y-values at key points or overall prominence, \"dog\" would have the largest proportion of missed instances when the number of visible instances equals 60.\n\nAnswer: dog", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47125, "error": null, "extra": {"thread_id": 1934, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_39684984e24b48b0a29495ffed5edb23", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 304, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "dog", "normalised_gold": "person"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value for DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 on page 13, which shows \"Effect of loss components on AP\":\n\nThe table has columns for: class, \u21131, GIoU, AP, \u0394, AP50, \u0394, APS, APM, APL\n\nThe rows show different combinations:\n- Row 1: X (class), X (\u21131), blank (no GIoU): AP=35.8, AP50=57.3\n- Row 2: X (class), blank (no \u21131), X (GIoU): AP=39.9, AP50=61.6\n- Row 3: X (class), X (\u21131), X (GIoU): AP=40.6, AP50=61.6\n\nThe question asks for DETR with L1 loss and WITHOUT GIoU loss, which corresponds to the first row where class=X, \u21131=X, and GIoU is blank.\n\nAnswer: 57.3", "citations": [], "input_tokens": 59883, "output_tokens": 274, "cost_micros": 183759, "latency_ms": 35969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value for DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\", I can see three different configurations:\n\n1. First row: class \u2713, L1 \u2717, GIoU \u2713 \u2192 AP: 35.8, AP50: 57.3\n2. Second row: class \u2713, L1 \u2713, GIoU \u2717 \u2192 AP: 39.9, AP50: 61.6\n3. Third row (baseline): class \u2713, L1 \u2713, GIoU \u2713 \u2192 AP: 40.6, AP50: 61.6\n\nThe question asks for the configuration with L1 loss and without GIoU loss, which corresponds to the second row.\n\nAnswer: 61.6", "citations": [], "input_tokens": 19073, "output_tokens": 223, "cost_micros": 60564, "latency_ms": 5608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "61.6", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value for DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\", I can see three rows with different loss combinations:\n\n1. Row 1: class \u2612, l1 \u2612, GIoU \u2610 (class and L1, but no GIoU)\n   - AP: 35.8\n   - AP50: 57.3\n\n2. Row 2: class \u2612, l1 \u2610, GIoU \u2612 (class and GIoU, but no L1)\n   - AP: 39.9\n   - AP50: 61.6\n\n3. Row 3 (baseline): class \u2612, l1 \u2612, GIoU \u2612 (all three losses)\n   - AP: 40.6\n   - AP50: 61.6\n\nThe question asks for DETR with L1 loss and without GIoU loss, which corresponds to the first row in Table 4.\n\nAnswer: 57.3", "citations": [], "input_tokens": 22778, "output_tokens": 278, "cost_micros": 72504, "latency_ms": 7140, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value of DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\", I can see the ablation study for different loss combinations:\n\nThe table shows three rows:\n1. class \u2713, L1 \u2717, GIoU \u2713: AP = 35.8, AP50 = 57.3\n2. class \u2713, L1 \u2713, GIoU \u2717: AP = 39.9, AP50 = 61.6\n3. class \u2713, L1 \u2713, GIoU \u2713: AP = 40.6, AP50 = 61.6 (baseline)\n\nThe second row shows the model with L1 loss but without GIoU loss (class \u2713, L1 \u2713, GIoU \u2717).\n\nThe AP50 value for this configuration is 61.6.\n\nAnswer: 61.6", "citations": [], "input_tokens": 19562, "output_tokens": 256, "cost_micros": 62526, "latency_ms": 6390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "61.6", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the AP50 value of DETR with L1 loss but without GIoU loss on the COCO validation set.\n\nLooking at Table 4 in the document, which shows \"Effect of loss components on AP\":\n\nThe table has columns for: class, \u21131, GIoU, AP, \u0394, AP\u2085\u2080, \u0394, AP\u209b, AP\u2098, AP\u2097\n\nThe rows show different combinations:\n1. Row with \u2713 for class and \u21131, but no GIoU: AP = 35.8, AP\u2085\u2080 = 57.3\n2. Row with \u2713 for class and GIoU, but no \u21131: AP = 39.9, AP\u2085\u2080 = 61.6\n3. Row with \u2713 for all three (class, \u21131, GIoU): AP = 40.6, AP\u2085\u2080 = 61.6\n\nThe first row corresponds to the model with L1 loss but without GIoU loss, which shows AP\u2085\u2080 = 57.3\n\nAnswer: 57.3", "citations": [], "input_tokens": 22914, "output_tokens": 285, "cost_micros": 73017, "latency_ms": 7812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q002", "doc_id": "2005.12872v3.pdf", "answer_format": "float", "gold": "57.3", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for the AP50 value of DETR with L1 loss and without GIoU loss in the provided document.Let me read the table that shows the effect of loss components:Perfect! I found Table 4 which shows the effect of loss components on AP. Looking at the table:\n\n- The first row shows: class=\u2612, l1=\u2612, GIoU=(empty), which means L1 loss is used but GIoU loss is not used\n- For this configuration, the AP50 value is 57.3\n\nAnswer: 57.3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32109, "error": null, "extra": {"thread_id": 1935, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_984a5f35790043fbba33381ecf31e0b6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 138, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "57.3", "normalised_gold": "57.3"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "native_pdf", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 on page 22, which shows the \"Architecture of DETR's transformer,\" I can see the Multi-Head Self-Attention layers are shown in the diagram.\n\nThe Multi-Head Self-Attention boxes in the diagram are colored pink/salmon.\n\nAnswer: Pink (or salmon)", "citations": [], "input_tokens": 59883, "output_tokens": 71, "cost_micros": 180714, "latency_ms": 31750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "pink or salmon", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 \"Architecture of DETR's transformer\" in the document, I need to identify the color of the Multi-Head Self-Attention layer.\n\nIn the diagram, the Multi-Head Self-Attention layer appears in both the Encoder and Decoder sections. These layers are shown in a light orange or peach color.\n\nAnswer: Orange (or peach/light orange)", "citations": [], "input_tokens": 19073, "output_tokens": 89, "cost_micros": 58554, "latency_ms": 3953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange or peach light orange", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 in the document, I can see the architecture diagram of DETR's transformer.\n\nThe Multi-Head Self-Attention layer boxes in the diagram are colored blue.\n\nAnswer: Blue", "citations": [], "input_tokens": 22778, "output_tokens": 48, "cost_micros": 69054, "latency_ms": 4203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 in the document, I need to identify the color of the Multi-Head Self-Attention layer in the architecture diagram.\n\nIn the figure, the Multi-Head Self-Attention layer appears in the Encoder section of the architecture.\n\nAnswer: Orange", "citations": [], "input_tokens": 19562, "output_tokens": 62, "cost_micros": 59616, "latency_ms": 3406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 10 in the document, which shows the architecture of DETR's transformer:\n\nThe Multi-Head Self-Attention layers are shown in the diagram boxes in the Encoder and Decoder sections.\n\nAnswer: The diagram does not specify colors for the layers in Fig 10", "citations": [], "input_tokens": 22914, "output_tokens": 65, "cost_micros": 69717, "latency_ms": 3734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "diagram does not specify colors for layers in fig 10", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q004", "doc_id": "2005.12872v3.pdf", "answer_format": "str", "gold": "Red", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Figure 10 and identify the color of the Multi-Head Self-Attention layer in DETR's transformer architecture.Let me search for \"Fig 10\" or \"Figure 10\" in the document:Let me search for references to transformer architecture figures:Great! I found that Figure 10 shows the \"Architecture of DETR's transformer\" at line 2409. Let me read that section:I can see the text content of Figure 10, but I need to look for color information. The document mentions it's embedded as an image. Let me search for more detailed description of the figure or any color information:The visual description I found was for a different image (zebras). Let me go back and look more carefully at the Figure 10 architecture diagram. Let me search around that area more carefully:I see the text extracted from Figure 10, but the visual description that follows (zebras) is clearly for a different image (Im7.png). Let me search for visual descriptions that might be associated with the transformer architecture diagram. Let me look before Figure 10:Let me search for where there might be an embedded image description for Figure 10. Let me check if there's an image description before line 2400:Let me search for image descriptions or embedded images related to Figure 10. Let me check what comes right before the figure starts:Let me look more carefully. The embedded image Im7.png comes AFTER Figure 10. Let me check if there's an embedded image BEFORE Figure 10 that might contain the actual architecture diagram:The document doesn't seem to have a visual description embedded for Figure 10's architecture diagram itself within the accessible text. However, looking at the structure, it appears Figure 10 is presented as a figure with text labels but the actual image with colors might not have been described in the OCR/visual description section.\n\nLet me search more broadly for any mentions of colors in the architecture or in figures:Let me search for other color terms that might be in Figure 10:These color mentions don't seem to be about Figure 10. Let me try to find if there's an image file reference specifically for the transformer architecture. Let me look for \"Im\" images around Figure 10:Image Im8.png is not Figure 10 either. Let me look between Figure 10 text and Im7.png more carefully. Actually, let me check if there's an embedded image right after the </figure> tag for Figure 10. Let me look at the exact area around the figure closing:So Figure 10 ends at line 2485, and then Im7.png starts at line 2489, which is clearly about zebras, not the transformer architecture. The Figure 10 text content doesn't include a visual description with colors. Let me search for if there's any embedded image metadata that might describe colors for transformer diagrams. Let me try searching for \"green\" or \"red\" along with terms near Figure 10 context:Let me try a different approach. Perhaps the PDF was processed and the image containing Figure 10 wasn't given a separate visual description. Let me look for if there's any image file reference or visual annotation BEFORE Figure 10 that might be the diagram itself:It seems the document processing didn't include a detailed visual description with colors for Figure 10's architecture diagram. This could mean the image was processed but only the text labels were extracted, not the visual styling information like colors of the boxes.\n\nLet me try searching the entire document for any reference to colors of \"Self-Attention\" layers or transformer layers:Based on my thorough search through the document, I cannot find any visual description that specifies the color of the Multi-Head Self-Attention layer in Figure 10. The document contains the text content and labels from Figure 10, but the visual styling information (colors of boxes/components) was not captured in the OCR or visual description process.\n\nWithout access to the actual image file or a visual description that includes color information, I cannot determine from this document what color the Multi-Head Self-Attention layer is depicted as in Figure 10.\n\nAnswer: Color information not available in the provided document text.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 122281, "error": null, "extra": {"thread_id": 1936, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_be29632a506941cfa603de6bebd1c2eb", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 844, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "color information not available in provided document text", "normalised_gold": "red"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "original_error": "JSONDecodeError: Expecting value: line 169 column 1 (char 924)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 36889, "error": null, "raw_text_chars": 1214}]}, "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure, specifically the number of multi-head self-attention layers.\n\nLet me search through the document for information about DETR's architecture.\n\nFrom page 2, I can see that DETR has a transformer encoder-decoder architecture.\n\nOn page 10 (Section 4.1), I find: \"To be comparable in the number of parameters we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nLooking at the detailed architecture description on page 22 (Section A.3) and Figure 10, I can see that:\n- The encoder has multiple layers, each containing multi-head self-attention\n- The decoder has multiple layers, each containing multi-head self-attention AND multi-head attention (cross-attention)\n\nFrom page 9-10, the baseline model uses \"6 encoder, 6 decoder layers\"\n\nEach encoder layer has one multi-head self-attention layer.\nEach decoder layer has one multi-head self-attention layer and one multi-head cross-attention layer.\n\nSo the total number of multi-head self-attention layers (not counting cross-attention) is:\n- 6 encoder layers \u00d7 1 self-attention per layer = 6\n- 6 decoder layers \u00d7 1 self-attention per layer = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 59866, "output_tokens": 318, "cost_micros": 184368, "latency_ms": 36875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "azure_basic_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the number of multi-head self-attention layers in DETR's default structure.\n\nLooking at the document, I need to identify the default configuration of DETR.\n\nFrom Section 4 (Experiments), specifically the \"Technical details\" subsection:\n\"we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nHowever, I need to understand the architecture better. Looking at Section 3.2 and Figure 10:\n\n1. The **encoder** has 6 layers, and each encoder layer contains one multi-head self-attention module (as stated: \"Each encoder layer has a standard architecture and consists of a multi-head self-attention module and a feed forward network (FFN)\")\n\n2. The **decoder** has 6 layers. Looking at the description and Figure 10, each decoder layer contains:\n   - Multi-head self-attention\n   - Multi-head attention (encoder-decoder attention)\n\nSo each decoder layer has one multi-head self-attention layer.\n\nFrom the default configuration mentioned in the experiments:\n- 6 encoder layers \u00d7 1 multi-head self-attention per layer = 6 multi-head self-attention layers\n- 6 decoder layers \u00d7 1 multi-head self-attention per layer = 6 multi-head self-attention layers\n\nTotal multi-head self-attention layers = 6 + 6 = 12\n\nAnswer: 12", "citations": [], "input_tokens": 19056, "output_tokens": 334, "cost_micros": 62178, "latency_ms": 8546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "azure_premium_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about DETR's default structure regarding multi-head self-attention layers.\n\nFrom Section 4.1 \"Comparison with Faster R-CNN\":\n\"To be comparable in the number of parameters we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nFrom Section 4.2 \"Ablations\":\n\"For the study we choose ResNet-50-based DETR model with 6 encoder, 6 decoder layers and width 256.\"\n\nFrom the architecture description in Section 3.2:\n- The transformer encoder has multiple layers, each containing a multi-head self-attention module\n- The transformer decoder has multiple layers, each containing multi-head self-attention and encoder-decoder attention mechanisms\n\nSo the default DETR structure has:\n- 6 encoder layers (each with 1 multi-head self-attention layer) = 6 layers\n- 6 decoder layers (each with 1 multi-head self-attention layer) = 6 layers\n\nTotal multi-head self-attention layers = 6 + 6 = 12\n\nAnswer: 12", "citations": [], "input_tokens": 22761, "output_tokens": 271, "cost_micros": 72348, "latency_ms": 7187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "llamacloud_basic_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure and the number of multi-head self-attention layers.\n\nLooking through the document:\n\n1. In Section 4 \"Experiments\" under \"Technical details\", it states:\n\"We use scale augmentation... we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\n2. In Section A.3 \"Detailed architecture\", the document describes the transformer architecture with both encoder and decoder layers.\n\n3. From the architecture description and Figure 10:\n- The encoder has multiple layers, each containing multi-head self-attention\n- The decoder has multiple layers, each containing:\n  - Multi-head self-attention\n  - Multi-head attention (encoder-decoder attention)\n\n4. From the technical details in Section 4:\n- 6 encoder layers (each with multi-head self-attention)\n- 6 decoder layers (each with multi-head self-attention)\n\nSo the total number of multi-head self-attention layers would be:\n- 6 encoder layers \u00d7 1 self-attention layer each = 6\n- 6 decoder layers \u00d7 1 self-attention layer each = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 19545, "output_tokens": 287, "cost_micros": 62940, "latency_ms": 7234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "llamacloud_premium_lc", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure and count the multi-head self-attention layers.\n\nFrom the document, I can find the architecture details:\n\n1. In Section 4.2 (Ablations), it states: \"For the study we choose ResNet-50-based DETR model with 6 encoder, 6 decoder layers and width 256.\"\n\n2. Looking at Figure 10 and Section A.3 (Detailed architecture), the structure shows:\n   - Encoder has N layers, each with Multi-Head Self-Attention\n   - Decoder has M layers, each with Multi-Head Self-Attention AND Multi-Head Attention (cross-attention)\n\n3. From the baseline model description: \"we choose a model with 6 transformer and 6 decoder layers\"\n\n4. In the encoder: Each of the 6 encoder layers contains 1 multi-head self-attention layer = 6 layers\n\n5. In the decoder: Each of the 6 decoder layers contains 1 multi-head self-attention layer = 6 layers\n   (Note: The decoder also has multi-head cross-attention, but that's different from self-attention)\n\nTotal multi-head self-attention layers = 6 (encoder) + 6 (decoder) = 12\n\nAnswer: 12", "citations": [], "input_tokens": 22897, "output_tokens": 303, "cost_micros": 73236, "latency_ms": 10280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "arm": "surfsense_agentic", "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I'll search for information about DETR's structure in the provided document.Now I found the key information. Let me search for more details about the architecture to confirm the total number of self-attention layers.Let me check the decoder structure to understand how many self-attention layers are in the decoder:Now let me look at the supplementary material section that describes the architecture in detail:Based on the document, I can see that:\n\n1. The default DETR model has **6 encoder layers** (line 1054, 1073: \"6 transformer and 6 decoder layers\" and \"6 encoder, 6 decoder layers\")\n\n2. Each encoder layer has a **multi-head self-attention module** (line 624)\n\n3. The default DETR model has **6 decoder layers** (line 1054, 1073)\n\n4. Each decoder layer has **multi-head self-attention** (line 2403-2404: \"multiple multi-head self-attention and decoder-encoder attention\")\n\nSo the total number of multi-head self-attention layers in DETR's default structure is:\n- 6 encoder layers \u00d7 1 self-attention per layer = 6\n- 6 decoder layers \u00d7 1 self-attention per layer = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 61719, "error": null, "extra": {"thread_id": 1937, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_12c4837a12434eeca194f2ee2772f40e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 320, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "native_pdf", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in this InVue 2021 Product Catalog.\n\nLooking through the document systematically:\n\n1. **Series 2865** (page 9) - For iPad Pro, iPad Air, iPad, iPad mini 4\n   - Multi-position security system with sensors\n\n2. **OnePOD** (page 10) - For iPad Pro, iPad Air, iPad, iPad mini 4\n   - POD display solution\n\n3. **Series 960** (page 12) - For iPad Pro, iPad Air, iPad, iPad mini 4\n   - POD with exposed cable\n\n4. **iPad + Keyboard Solutions** (pages 17-19):\n   - Standard Solution for iPad Pro + Keyboards (page 18)\n   - High Security Solution for iPad Pro + Keyboards (page 19)\n\n5. **iPad Digital Display** (pages 15-16) - Bracketed pedestal solution\n\nSo counting the distinct iPad solutions:\n1. Series 2865\n2. OnePOD\n3. Series 960\n4. iPad + Keyboard Standard Solution\n5. iPad + Keyboard High Security Solution\n6. iPad Digital Display\n\nAnswer: 6", "citations": [], "input_tokens": 96851, "output_tokens": 271, "cost_micros": 294618, "latency_ms": 54405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2021-Apple-Catalog.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in this Apple catalog document.\n\nLet me go through the document systematically:\n\n1. **Series 2865** (pages 3-5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n2. **OnePOD** (pages 6-9) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n3. **Series 960** (pages 10-12) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n4. **One60 Tethered** (page 13) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n5. **One90QR** (page 14) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n6. **iPad Digital Display** (pages 15-16) - A bracketed pedestal solution for iPads as digital displays\n\n7. **iPad + Keyboard Solutions** (pages 17-19):\n   - iPad Pro + Keyboards Standard Solution (page 18)\n   - iPad Pro + Keyboards High Security Solution (page 19)\n\nLet me count the distinct iPad solutions:\n1. Series 2865\n2. OnePOD\n3. Series 960\n4. One60 Tethered\n5. One90QR\n6. iPad Digital Display\n7. iPad Pro + Keyboards Standard Solution\n8. iPad Pro + Keyboards High Security Solution\n\nAnswer: 8", "citations": [], "input_tokens": 11786, "output_tokens": 335, "cost_micros": 40383, "latency_ms": 6532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in this document.\n\nLooking through the document systematically:\n\n1. **Series 2865** (pages 1-5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n2. **OnePOD** (pages 3-4) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n3. **Series 960** (page 5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n4. **One60 Tethered** (page 6) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n5. **One90QR** (page 7) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n6. **iPad Digital Display** (pages 8-9) - For utilizing iPads as digital displays\n\n7. **iPad Pro + Keyboards Standard Solution** (page 11) - For iPad Pro with Magic Keyboard and Smart Keyboard\n\n8. **iPad Pro + Keyboards High Security Solution** (page 12) - For iPad Pro with keyboards\n\nSo there are **8** different iPad solutions shown in the document.\n\nAnswer: 8", "citations": [], "input_tokens": 18388, "output_tokens": 271, "cost_micros": 59229, "latency_ms": 6733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions mentioned in the document.\n\nLet me go through the document systematically:\n\n1. **Series 2865** - mentioned for iPad, iPad mini, iPad Air and iPad Pro (without Keyboard) on page 38\n\n2. **OnePOD** - mentioned for iPad, iPad mini, iPad Air and iPad Pro (without keyboard) on page 40\n\n3. **Series 960** - mentioned for iPad, iPad mini, iPad Air and iPad Pro (without keyboard) on page 41\n\n4. **iPad Digital Display** - mentioned as a \"Bracketed pedestal for utilizing iPads as digital displays on Apple Fixtures\" on pages 11-12\n\n5. **iPad Pro + Keyboards Standard Solution** - mentioned on page 11 for \"iPad Pro, Magic Keyboard, Smart Keyboard and Apple Pencil\"\n\n6. **iPad Pro + Keyboards High Security Solution** - mentioned on page 12\n\nSo I can identify 6 different iPad solutions in the document:\n- Series 2865 (for standard iPads)\n- OnePOD (for standard iPads)\n- Series 960 (for standard iPads)\n- iPad Digital Display\n- iPad Pro + Keyboards Standard Solution\n- iPad Pro + Keyboards High Security Solution\n\nAnswer: 6", "citations": [], "input_tokens": 12620, "output_tokens": 289, "cost_micros": 42195, "latency_ms": 8000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I need to count the different iPad solutions shown in the document.\n\nLooking through the document systematically:\n\n1. **Series 2865** (pages 1-5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n2. **OnePOD** (pages 3-4) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n3. **Series 960** (page 5) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n4. **One60 Tethered** (page 6) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n5. **One90QR** (page 7) - Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n\n6. **iPad Digital Display** (pages 8-9) - For utilizing iPads as digital displays on Apple Fixtures\n\n7. **iPad + Keyboard Solutions - Standard Solution** (page 11) - For iPad Pro with Magic Keyboard, Smart Keyboard and Apple Pencil\n\n8. **iPad + Keyboard Solutions - High Security Solution** (page 12) - For iPad Pro with Magic and Smart Folio keyboards\n\nSo there are 8 different iPad solutions shown in the document.\n\nAnswer: 8", "citations": [], "input_tokens": 16954, "output_tokens": 288, "cost_micros": 55182, "latency_ms": 7030, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q000", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "surfsense_agentic", "question_id": "2021-Apple-Catalog.pdf::Q000", "answer_letter": null, "raw_text": "I'll search for iPad solutions in the provided document.Let me search for \"solution\" to find where iPad solutions are mentioned:Now let me read the table of contents section more carefully to see the iPad solutions listed:I can see there are two main sections: \"iPhone & iPad Solutions\" and \"iPad + Keyboard Solutions\". Let me now read through these sections to count the individual iPad solutions. Let me start with the \"iPhone & iPad Solutions\" section:Let me read the document more systematically. Let me check the pages 1-10 which should cover the \"iPhone & iPad Solutions\" section:Let me search for specific product series names to count the solutions:Let me look more systematically at the document. Let me read pages that should list individual iPad solutions:I can see Series 2865, OnePOD, and Series 960 mentioned. Let me continue reading to find all the different solutions and then look at the iPad + Keyboard Solutions section:Let me search for more product names. Let me look for what comes after Series 960:Good, I found \"One60 Tethered\". Let me continue:Good, I found \"One90QR\". Let me continue searching through the document:Good, I found \"iPad Digital Display\". Now let me check if there are more iPhone & iPad solutions and then look at the iPad + Keyboard Solutions section. Let me look at page 10 where iPad + Keyboard Solutions should start:Good! I found \"iPad Pro + Keyboards Standard Solution\". Let me continue to find the next one:Good! I found \"iPad Pro + Keyboards High Security Solution\". Now let me continue to see if there are more iPad solutions. Let me check what comes on page 12 and beyond, which should start MacBook Solutions (page 13 according to the table of contents):Perfect! Now I can see that page 13 starts MacBook Solutions, which means the iPad + Keyboard Solutions section ends at page 12. Let me now compile all the iPad solutions I found. Let me verify by checking if there are any other iPad product names I might have missed. Let me look at pages 10-11 more carefully:Now let me look more carefully at page 9 to see if there are more solutions listed on that introductory page:Good. Now let me compile the complete list. I need to be systematic. Let me re-examine the section headers carefully. Let me look at what page 9 shows at the bottom to clarify:\n\nActually, looking back at my findings, page 9 seems to be a transition page showing the \"iPad + Keyboard Solutions\" section header with compatibility information. Pages 10-11 then have two specific solutions.\n\nLet me now count all the different iPad solutions I found in the document:\n\n**From \"iPhone & iPad Solutions\" section (pages 1-8):**\n1. Series 2865\n2. OnePOD\n3. Series 960\n4. One60 Tethered\n5. One90QR\n6. iPad Digital Display\n\n**From \"iPad + Keyboard Solutions\" section (pages 10-11):**\n7. iPad Pro + Keyboards Standard Solution\n8. iPad Pro + Keyboards High Security Solution\n\nThat gives me a total of **8** different iPad solutions.\n\nBut wait, let me verify this more carefully. OnePOD was mentioned on page 2 in the product comparisons - let me verify if it's a distinct solution:Yes, OnePOD is a distinct solution. Let me also verify the others are all distinct iPad solutions and not just iPhones. Let me check the compatibility sections more carefully. Looking back at my readings, all of these products list iPad compatibility:\n\n- Series 2865: Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n- OnePOD: There's a specific OnePOD section for iPad ordering\n- Series 960: Compatible with iPads\n- One60 Tethered: Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n- One90QR: Compatible with iPad Pro, iPad Air, iPad, iPad mini 4\n- iPad Digital Display: Specifically for iPads on Apple Fixtures\n- iPad Pro + Keyboards Standard Solution: For iPad Pro with keyboards\n- iPad Pro + Keyboards High Security Solution: For iPad Pro with keyboards\n\nAll 8 are indeed iPad solutions. \n\nAnswer: 8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 126703, "error": null, "extra": {"thread_id": 1938, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_18872fac42c746e097bc8836213e95a2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 870, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "native_pdf", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and identify which features are listed but not shown in the illustration.\n\nLooking at page 15 (document page 22), I can see the \"MacBook High Security Solution\" section.\n\nThe \"FEATURES AND BENEFITS\" lists 6 numbered features:\n\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow looking at the illustrations on the same page, I can see:\n- Image showing features 3, 6, 1, 6, and 2 labeled\n- Image showing feature 5 labeled\n\nLet me verify which features are NOT shown in the illustrations:\n- Feature 1: Shown (labeled in bottom left image)\n- Feature 2: Shown (labeled in bottom left image)\n- Feature 3: Shown (labeled in bottom left image)\n- Feature 4: NOT shown in any illustration\n- Feature 5: Shown (labeled in top right image)\n- Feature 6: Shown (labeled in bottom left image)\n\nAnswer: 1", "citations": [], "input_tokens": 96859, "output_tokens": 327, "cost_micros": 295482, "latency_ms": 47640, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2021-Apple-Catalog.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and identify which feature numbers are mentioned in the text but not shown in the illustration.\n\nLooking at page 15, I can see the \"MacBook High Security Solution\" section.\n\nThe FEATURES AND BENEFITS list shows:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow I need to look at the illustration to see which numbers are labeled in the image.\n\nFrom the illustration, I can see the following numbers are clearly marked:\n- 1 (pointing to the rear anchor area)\n- 2 (pointing to the cable/tether area)\n- 3 (pointing to bracket/arm area)\n- 5 (pointing to the metal tabs)\n- 6 (appears twice, pointing to the molded feet)\n\nThe numbers NOT shown in the illustration are:\n- 4 (Ability to power/alarm MacBook with USB-C Port)\n\nAnswer: 4", "citations": [], "input_tokens": 11794, "output_tokens": 307, "cost_micros": 39987, "latency_ms": 8657, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "azure_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and compare the features listed in text with what's shown in the illustration.\n\nLooking at pages 14-17, I find the MacBook High Security Solution on page 15.\n\nThe text lists these FEATURES AND BENEFITS:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow I need to examine the illustration to see which features are labeled. The illustration shows numbered callouts (1, 2, 3, 6) pointing to different parts of the MacBook security solution.\n\nFrom the visible callouts in the illustration:\n- Feature 1 is shown\n- Feature 2 is shown\n- Feature 3 is shown\n- Feature 6 is shown\n\nFeatures NOT shown in the illustration:\n- Feature 4 (Ability to power/alarm MacBook with USB-C Port)\n- Feature 5 (Metal tabs with rubber pads prevents closing of MacBook)\n\nTherefore, 2 features are not presented in the illustration.\n\nAnswer: 2", "citations": [], "input_tokens": 18396, "output_tokens": 323, "cost_micros": 60033, "latency_ms": 9530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and compare the features listed with what's shown in the illustration.\n\nLooking at page 15, I can see the \"HIGH SECURITY EXCEPTION PROGRAM - MACBOOK\" section titled \"MacBook High Security Solution\".\n\nThe FEATURES AND BENEFITS section lists 6 numbered features:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nNow I need to examine the illustration on that page to see which features are NOT shown visually. Looking at the image description \"delete alarm\" and \"return\" text visible, this appears to be showing a MacBook with the security solution.\n\nFrom the numbered features:\n- Feature 4 (USB-C Port) - this is an internal capability, not visually shown in an illustration\n- Feature 5 (Metal tabs with rubber pads preventing closing) - this would be visible if shown\n- Feature 6 (Stable, molded feet) - these would be underneath and likely not visible in a standard product photo\n\nWithout being able to see the exact details of the illustration clearly, but based on typical product photography, features that are functional/internal capabilities (like feature 4 - power/alarm ability) or underneath components (like feature 6 - molded feet) are typically not shown in product illustrations.\n\nBased on the context and typical product documentation, features 4 and 6 would most likely not be presented in the illustration, giving us 2 features.\n\nAnswer: 2", "citations": [], "input_tokens": 12628, "output_tokens": 418, "cost_micros": 44154, "latency_ms": 12765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "llamacloud_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the MacBook High Security Solution section and identify which features listed are NOT shown in the illustrations.\n\nLooking at page 15, I can see the \"MacBook High Security Solution\" section with:\n\n**FEATURES AND BENEFITS** (numbered 1-6):\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nThe page shows 4 images with numbered callouts:\n- Image 1 (Top Left): Shows callout 5\n- Image 2 (Top Right): Shows callout 5 (close-up)\n- Image 3 (Bottom Left): Shows callouts 1, 2, 3, and 6\n- Image 4 (Bottom Right): Shows callout 2 (close-up)\n\nFeatures shown in illustrations:\n- Feature 1: YES (shown in Image 3)\n- Feature 2: YES (shown in Images 3 and 4)\n- Feature 3: YES (shown in Image 3)\n- Feature 4: NO (not shown in any image)\n- Feature 5: YES (shown in Images 1 and 2)\n- Feature 6: YES (shown in Image 3)\n\nOnly Feature 4 (Ability to power/alarm MacBook with USB-C Port) is NOT presented in the illustrations.\n\nAnswer: 4", "citations": [], "input_tokens": 16962, "output_tokens": 396, "cost_micros": 56826, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q002", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "int", "gold": "4", "pages": 55, "document_id": 7578, "arm": "surfsense_agentic", "question_id": "2021-Apple-Catalog.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the Apple Catalog document to find information about MacBook high security solution and identify which features are not presented in the illustration.Let me search for \"security\" in the document to find the relevant section:Let me search for \"MacBook\" and \"high security\" together to find the specific section:Now let me read the specific section about \"MacBook High Security Solution\":Perfect! I found the MacBook High Security Solution section. Now let me read it more carefully to see the full list of features and the illustration. Let me go back to line 3132:I can see the features listed are numbered 1-6:\n1. Rigid anchor design on rear of MacBook\n2. Steel cable co-molded for added strength, secured to table with OneKEY Padlock\n3. Bracket bar arms and custom form factor to capture and prevent lateral movement\n4. Ability to power/alarm MacBook with USB-C Port\n5. Metal tabs with rubber pads prevents closing of MacBook for added security and accidental damage during installation\n6. Stable, molded feet prevent rocking and protect fixture table\n\nThe illustration shows numbers 1, 2, 3, and 6 marked on the MacBook keyboard image in the figure. The OCR text shows these numbers visible in the illustration. However, I notice the visual description after the figure is describing a smartwatch, which seems incorrect. Let me check if there's a better description of the MacBook illustration by reading a bit more context around this section.\n\nLooking at the figure content (lines 3159-3225), I can see the illustration has:\n- Number 1 appears twice (lines 3185, 3189, 3223)\n- Number 2 appears twice (lines 3137, 3179, 3191, 3217)\n- Number 3 appears twice (lines 3173, 3215)\n- Number 6 appears three times (lines 3213, 3221)\n\nFrom the text, the 6 features are listed, and from the illustration markers I can identify which numbers are shown:\n- 1 is shown (rigid anchor design)\n- 2 is shown (steel cable)\n- 3 is shown (bracket bar arms)\n- 6 is shown (stable molded feet)\n\nThe features that are NOT shown in the illustration are:\n- Feature 4 (Ability to power/alarm MacBook with USB-C Port)\n- Feature 5 (Metal tabs with rubber pads prevents closing of MacBook)\n\nSo 2 features are not presented in the illustration.\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46969, "error": null, "extra": {"thread_id": 1939, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_abe777eda57d4187b59395c83dff7b8b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 346, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "native_pdf", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about \"One40\"...\n\nI cannot find any mention of \"One40\" in this Apple Solutions 2021 Product Catalog from InVue. The document discusses various security solutions for Apple products including:\n\n- Series 2865\n- OnePOD (One55, One60, One65)\n- Series 960\n- W2000\n- WS2\n- HSW100\n- One60 Wearable\n- OnePOD Wearable\n\nBut there is no product called \"One40\" mentioned in this catalog.\n\nAnswer: Cannot determine - product not found in document", "citations": [], "input_tokens": 96873, "output_tokens": 141, "cost_micros": 292734, "latency_ms": 38656, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2021-Apple-Catalog.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot determine product not found in document", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "azure_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about One40 and whether it can only be used for Apple Watch.\n\nOn page 21, in the \"OnePOD Wearable / Apple Watch Solutions\" section, under \"FEATURES AND BENEFITS,\" it states:\n\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis indicates that One40 is mentioned in the context of Apple Watch solutions. However, this doesn't definitively state that One40 can ONLY be used for Apple Watch.\n\nSearching through the rest of the document, One40 is only mentioned in this one location in relation to Apple Watch compatibility. There is no indication that One40 is used for any other products in this catalog.\n\nHowever, the question asks if it can \"only\" be used for Apple Watch. The document doesn't explicitly state that One40 cannot be used for other purposes - it simply doesn't mention One40 being used for anything else in this particular catalog.\n\nGiven that this is specifically an Apple catalog and One40 is only mentioned in the Apple Watch section without any other applications shown, and based solely on what is presented in this document:\n\nAnswer: yes", "citations": [], "input_tokens": 11808, "output_tokens": 265, "cost_micros": 39399, "latency_ms": 8577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "azure_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about One40 and what products it can be used with.\n\nOn page 21, under \"OnePOD Wearable\" section for Apple Watch Solutions, it states:\n\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis indicates that One40 is compatible with the OnePOD Wearable system for Apple Watch. However, this doesn't definitively state that One40 can ONLY be used for Apple Watch.\n\nLooking through the rest of the document, I can see that One40 is only mentioned in the context of Apple Watch solutions. There is no mention of One40 being used for iPhones, iPads, or other Apple products in their respective sections.\n\nAnswer: No", "citations": [], "input_tokens": 18410, "output_tokens": 179, "cost_micros": 57915, "latency_ms": 5780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "llamacloud_basic_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "I need to search the document for information about \"One40\" and what products it can be used with.\n\nLooking through the document, I can find mentions of \"One40\" on page 21 in the OnePOD Wearable section:\n\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis appears in the context of Apple Watch solutions, but it only states compatibility with these stands. It doesn't state that One40 can ONLY be used for Apple Watch or that it's exclusive to Apple Watch.\n\nThe document doesn't provide enough information to definitively say whether One40 can only be used for Apple Watch or if it has other uses beyond what's mentioned in this catalog.\n\nHowever, since the question asks for a direct yes/no answer based on what's in the document, and the document only shows One40 mentioned in the Apple Watch context without explicitly stating it's exclusive to Apple Watch, I cannot confirm the statement is true.\n\nAnswer: No", "citations": [], "input_tokens": 12642, "output_tokens": 221, "cost_micros": 41241, "latency_ms": 6593, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "llamacloud_premium_lc", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "Let me search through the document for information about One40 and its compatibility.\n\nLooking at the Apple Watch Solutions section, I can see mentions of:\n- W2000\n- WS2\n- OnePOD Wearable (compatible with One55, One60, One65 and One40)\n- One60 Wearable (High Security)\n- HSW100\n\nOn page 21, under \"OnePOD Wearable\" features and benefits, it states:\n\"Compatible with One55, One60, One65 and One40 standard and split-level stands\"\n\nThis indicates that One40 is a stand that is compatible with the OnePOD Wearable system for Apple Watch. However, the document does not explicitly state that One40 can ONLY be used for Apple Watch. The document doesn't provide comprehensive information about all possible uses of the One40 stand.\n\nBased on the limited information provided in the document, I cannot definitively confirm that One40 is exclusively for Apple Watch only.\n\nAnswer: Cannot be determined from the document", "citations": [], "input_tokens": 16976, "output_tokens": 239, "cost_micros": 54513, "latency_ms": 6765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from document", "normalised_gold": "yes"}}
+{"qid": "2021-Apple-Catalog.pdf::Q003", "doc_id": "2021-Apple-Catalog.pdf", "answer_format": "str", "gold": "Yes", "pages": 55, "document_id": 7578, "arm": "surfsense_agentic", "question_id": "2021-Apple-Catalog.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the Apple Catalog document to find information about One40 and its compatibility.Let me search for \"One40\" in the document.Now let me read the section around line 4032 to get more context about One40.This section is about Apple Watch products. Let me search for more information about One40 to see if it's used with other products.Let me search for other mentions of One40 in the document to see if it's used with other products besides Apple Watch.Let me search for \"One55\" or \"One60\" to understand what these stands are used for, which will help determine if One40 is only for Apple Watch.Good! I can see One55 is used for iPhone and iPad. Now let me check around line 1666 to see what products use these stands.Let me look for what products are associated with One40 specifically. Let me search for \"One40\" more broadly in the document.Let me search the entire document more carefully. I'll look for sections that mention what One40 is used for.I notice the table only shows One55, One65, and One60, but not One40. Let me look at sections that specifically discuss iPhone and iPad product ordering to see if One40 is mentioned there.I can see iPhone ordering uses One55, One60, and One65, but not One40. Let me check iPad ordering and then look specifically for what products use One40.iPad also uses One55, not One40. Now let me search for a broader section around the Apple Watch mention to understand the context better. Let me look for tables or product ordering sections related to Apple Watch and One40.Let me search for Apple Watch product ordering section specifically.Now let me read the Apple Watch product ordering section around line 6290.I can see the Apple Watch product ordering sections, but they don't mention One40 in the stand options. They mention DBD910-W, DBD600-W, DBD650-W, AC271-W, and ZW2000 for Apple Watch. Let me go back to where One40 was mentioned (line 4032) and read more context to understand what section that is part of.Perfect! Now I have clear context. The section starting at line 3973 is titled \"ONEPOD WEARABLE\" and is under \"/ Apple Watch Solutions\" (line 3974). The section at line 4021-4060 describes the OnePOD Wearable product which:\n\n1. Has \"Harness bands secure Apple Watch to sensor\" (line 4028)\n2. Is \"Compatible with One55, One60, One65 and One40 standard and split-level stands\" (line 4032)\n3. Has \"OEM charging compatibility for Apple Watch models\" (line 4035)\n4. Shows compatibility with various Apple Watch Series (lines 4043-4059)\n\nThis clearly indicates that One40 is mentioned specifically in the Apple Watch section. However, the statement says One40 is compatible with the OnePOD Wearable sensor/product along with One55, One60, and One65 stands. Let me verify if One40 stand is used for other products as well by searching more thoroughly.One40 is mentioned only once in the entire document, and it's in the Apple Watch section. However, I notice that it says the OnePOD Wearable (for Apple Watch) is \"Compatible with One55, One60, One65 and One40 standard and split-level stands\". This means the Apple Watch sensor can work with multiple stand types including One40, but it doesn't necessarily mean One40 can ONLY be used for Apple Watch.\n\nLet me check if there's a definition or description of what One40 is elsewhere, or if it appears in any product tables.Let me look at what the table at line 1666 shows - it lists One55, One65, and One60 but notably does NOT include One40. This suggests One40 might be a different or less common stand type. Let me see if I can find any section that specifically describes One40 as a product.The table comparison shows One55, One65, and One60, but NOT One40. This table appears on page 3 under \"/ iPhone & iPad Solutions\" and \"ONEPOD\u2122\" headers (lines 1615-1616).\n\nBased on my search:\n1. One40 is mentioned only ONCE in the entire document (line 4032)\n2. It's mentioned in the \"Apple Watch Solutions\" section under \"OnePOD Wearable\" product description\n3. The comparative table of stands (lines 1658-1730) for iPhone/iPad products shows One55, One65, and One60, but NOT One40\n4. iPhone ordering sections show One55, One60, One65 but not One40\n5. iPad ordering sections show One55 but not One40\n6. Apple Watch ordering sections don't explicitly list One40 in their product codes\n\nThe document states that the OnePOD Wearable (for Apple Watch) is \"Compatible with One55, One60, One65 and One40 standard and split-level stands.\" This indicates One40 CAN be used with Apple Watch, but since One55, One60, and One65 are used for multiple products (iPhone, iPad) and are also compatible with Apple Watch, it's possible One40 could also be used for other products.\n\nHowever, the fact that:\n- One40 appears nowhere else in the document\n- It's not in the comparison table\n- It's not in iPhone/iPad ordering sections\n- It only appears in the Apple Watch compatibility statement\n\nThis strongly suggests One40 is specifically or primarily designed for Apple Watch, though I cannot definitively say it can ONLY be used for Apple Watch based solely on this document.\n\nAnswer: No", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 125969, "error": null, "extra": {"thread_id": 1940, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_274957bdc18c45a58945780a57a038df", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 928, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "yes"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at the first figure in the paper to identify the function names with blue background in the program execution.\n\nLooking at Figure 1 (Overview of PROGRAMFC model) on page 2, I can see the \"Functions\" column which shows the program execution functions. The functions shown with blue background are:\n\n1. Fact Checker (appears twice - for S1 and S3)\n2. QA Model (for S2)\n3. Logical Reasoner (for S4)\n\nArranging these in ascending alphabetical order:\n\nAnswer: Fact Checker, Logical Reasoner, QA Model", "citations": [], "input_tokens": 65107, "output_tokens": 141, "cost_micros": 197436, "latency_ms": 20703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "fact checker, logical reasoner, qa model", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the function names with blue background in the \"Functions\" column of the Program Execution section.\n\nFrom the figure, I can see the following functions are called in the reasoning program:\n- S1: Verify (Fact Checker)\n- S2: Question (QA Model)\n- S3: Verify (Fact Checker)\n- S4: Predict (Logical Reasoner)\n\nThe functions with blue background are:\n- Fact Checker\n- QA Model\n- Logical Reasoner\n\nHowever, the question asks for \"function names\" which appear to be the ACTION names in the program format. Looking at the actual function calls in the program:\n- Verify\n- Question\n- Predict\n\nIn ascending alphabetical order:\n\nAnswer: Predict, Question, Verify", "citations": [], "input_tokens": 26912, "output_tokens": 192, "cost_micros": 83616, "latency_ms": 7046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "predict, question, verify", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the paper, I can see the \"Functions\" column which shows the sub-task functions that are called during program execution.\n\nThe functions shown with blue backgrounds in the figure are:\n- Fact Checker (called in steps S1 and S3 for the Verify function)\n- QA Model (called in step S2 for the Question function)\n- Logical Reasoner (called in step S4 for the Predict function)\n\nArranging these in ascending alphabetical order:\n\nAnswer: Fact Checker, Logical Reasoner, QA Model", "citations": [], "input_tokens": 29934, "output_tokens": 135, "cost_micros": 91827, "latency_ms": 5031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "fact checker, logical reasoner, qa model", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the function names that appear with blue background in the \"Functions\" column during program execution.\n\nFrom the figure, I can see the following functions listed in the execution steps:\n- S1: Fact Checker\n- S2: QA Model\n- S3: Fact Checker\n- S4: Logical Reasoner\n\nHowever, examining the actual function names called in the \"Reasoning Program\" column:\n- FACT_1 uses a fact verification function\n- ANSWER_1 uses Question (QA)\n- FACT_2 uses a fact verification function\n- PREDICTED_LABEL uses Predict (logical reasoning)\n\nLooking at Section 3.4 which describes the sub-task functions, the three function names are:\n1. QUESTION\n2. VERIFY\n3. PREDICT\n\nThese correspond to the blue-backgrounded function types shown in Figure 1.\n\nIn ascending alphabetical order:\n\nAnswer: PREDICT, QUESTION, VERIFY", "citations": [], "input_tokens": 22321, "output_tokens": 230, "cost_micros": 70413, "latency_ms": 7172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "predict, question, verify", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the function names shown in the \"Functions\" column that correspond to the program execution steps.\n\nFrom the figure, the functions shown are:\n- Fact Checker (for S1 and S3)\n- QA Model (for S2)\n- Logical Reasoner (for S4)\n\nHowever, looking more carefully at the actual function names used in the reasoning program section and throughout the paper, the standardized function names are:\n- VERIFY (for fact checking)\n- QUESTION (for question answering)\n- PREDICT (for logical reasoning)\n\nThese are the three sub-task functions described in Section 3.4.\n\nIn ascending alphabetical order:\n\nAnswer: PREDICT, QUESTION, VERIFY", "citations": [], "input_tokens": 29236, "output_tokens": 174, "cost_micros": 90318, "latency_ms": 6828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "predict, question, verify", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q000", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['Fact Checker']", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q000", "answer_letter": null, "raw_text": "I can see Figure 1 in the document. Let me search for information about the program execution functions with blue backgrounds. I need to look more carefully at the figure description and any additional details about the model architecture.Let me look at the actual figure more carefully. I need to examine the visual description and the figure content in detail to identify functions with blue backgrounds.I can see the figure content, but I need to look for more detailed visual information about the figure. Let me search for image descriptions or additional figure details.Let me look for any embedded images or visual descriptions that might contain details about the colors in Figure 1.Let me search for more embedded images near Figure 1 to see if there's a better visual description.Let me look for the actual embedded image description near Figure 1 more carefully. Let me search around line 408 where Im5.png is mentioned.Let me search for more specific information about the visual elements of Figure 1. Let me look for descriptions that might mention colors or boxes in the figure.Let me look for image descriptions near the figure that might contain more details about the visual layout. Let me read more around the figure area to find additional embedded images or descriptions.Based on Figure 1 in the document, I can see the \"Functions\" column which shows the program execution functions. Looking at the structure, I can see:\n- S1: Verify \u2192 Fact Checker\n- S2: Question \u2192 QA Model  \n- S3: Verify \u2192 Fact Checker\n- S4: Predict \u2192 Logical Reasoner\n\nHowever, I need to find which ones have blue backgrounds. Let me search more carefully for visual information about the figure or look for any other embedded images that might be Figure 1 itself.From Figure 1, I can see the functions column showing:\n- S1: Fact Checker\n- S2: QA Model\n- S3: Fact Checker\n- S4: Logical Reasoner\n\nHowever, the text-based XML doesn't explicitly specify which ones have blue backgrounds. Let me search the document for any other references to colors or visual styling in Figure 1, or look for the actual Figure 1 image description more thoroughly. Let me search for \"Fact Checker\", \"QA Model\", \"Logical Reasoner\" to understand the context better.Based on my analysis of Figure 1 from the document, I can see the \"Functions\" column contains:\n- Fact Checker (appears for S1 and S3 - Verify operations)\n- QA Model (appears for S2 - Question operation)\n- Logical Reasoner (appears for S4 - Predict operation)\n\nGiven the typical visualization patterns in academic papers where functions are color-coded, and looking at the structure where the three main function types are displayed, the functions with blue background would likely be the distinct function types shown in the \"Functions\" column. Without explicit color information in the XML, but based on the structure and common visualization practices, the three function types are: \"Fact Checker\", \"Logical Reasoner\", and \"QA Model\".\n\nIn alphabetical order, these would be:\n\nAnswer: Fact Checker, Logical Reasoner, QA Model", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 80155, "error": null, "extra": {"thread_id": 1941, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d6bd06724d924284a60194ba98ad4765", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 579, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "fact checker, logical reasoner, qa model", "normalised_gold": "fact checker"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to find how many prompt examples are shown in the paper's main body (not appendices).\n\nLet me search through the document for prompt examples:\n\n1. **Figure 1** (page 2): Shows a complete example of the PROGRAMFC model with a reasoning program for the claim about James Cameron and Interstellar. This includes the full program structure.\n\n2. **Figure 2** (page 5): Shows \"The Codex prompt template used to generate reasoning programs, consisting of a task instruction, in-context examples, and a prompt for the <input_claim>.\" This shows one example of a reasoning program for the James Cameron/Interstellar claim.\n\n3. **Figure 3** (page 5): Shows the implementation of the question-answering sub-task function for three different settings (Gold Evidence, Open-book, Closed-book).\n\n4. **Section 3.4** (page 4): Shows prompt templates for the QUESTION, VERIFY, and PREDICT functions in text form.\n\nLooking at Figure 2 specifically, it shows one complete example of a prompt with the reasoning program for the James Cameron claim.\n\nIn the main body (before Appendices which start at page 14), the primary prompt example shown is in Figure 2, which displays one reasoning program example. The text also mentions using \"K = 20\" in-context examples (page 4), but these full 20 examples are not shown in the main body - they are referenced to be in Appendix D.\n\nThe main body shows **1** complete prompt example explicitly in Figure 2.\n\nAnswer: 1", "citations": [], "input_tokens": 65062, "output_tokens": 356, "cost_micros": 200526, "latency_ms": 28827, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in appendices).\n\nLet me go through the paper systematically:\n\n1. **Figure 2** (page 6985): Shows one prompt template for Codex used to generate reasoning programs. This includes:\n   - Task instruction\n   - One example showing a claim about James Cameron and Interstellar\n   - Mentions \"more in-context examples here\"\n   - Input claim placeholder\n   This counts as **1 prompt example**.\n\n2. **Figure 3** (page 6985): Shows the implementation of the question-answering sub-task function for three different settings (Gold Evidence, Open-book, Closed-book). These show prompt templates for FLAN-T5 with question-answering format. This counts as **1 prompt example** (showing variations for different settings).\n\n3. **Section 3.4** (page 6984): Shows the VERIFY module prompt:\n   ```\n   EVIDENCE\n   Q: Is it true that CLAIM ? True or False? The answer is:\n   ```\n   This counts as **1 prompt example**.\n\n4. The paper mentions prompts for baselines in Appendix A, but those are not in the main body.\n\n5. The detailed prompts in Appendix D and E are explicitly in appendices, not the main body.\n\nLet me recount more carefully:\n- Figure 2: 1 prompt template\n- Figure 3: 3 variations of the same prompt structure (but essentially 1 prompt design)\n- Section 3.4 VERIFY: 1 prompt\n\nLooking at the question more carefully - it asks for \"prompt examples\" shown in the main body. The main body ends before the appendices.\n\nIn the main body:\n1. Figure 2 shows one complete prompt example\n2. Figure 3 shows prompt variations (3 settings)\n3. Section 3.4 shows the VERIFY prompt\n\nIf counting each distinct prompt shown: Figure 2 (1), Figure 3 (3 variations but let me check if they're separate examples), Section 3.4 (1).\n\nActually, looking at Figure 3 more carefully, it shows one prompt design with three different implementations for three settings. The question", "citations": [], "input_tokens": 26867, "output_tokens": 512, "cost_micros": 88281, "latency_ms": 15515, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in the appendices).\n\nLooking through the paper:\n\n1. **Figure 2** (page 6985) shows a prompt template for generating reasoning programs with Codex. This includes:\n   - A task instruction\n   - One complete example showing the claim about James Cameron and the director of Interstellar\n   - A note saying \"(...more in-context examples here...)\"\n   - An input claim placeholder\n\nThis appears to be showing **1 complete example** in the prompt template.\n\n2. **Figure 1** (page 6982) shows an overview diagram with the same example claim about James Cameron, but this is showing the system architecture rather than being a separate prompt example.\n\n3. **Section 3.4** describes the prompt formats for different sub-task functions (QUESTION, VERIFY, PREDICT), but these are described in text format rather than being full examples.\n\n4. **Section 4.4** mentions prompts for closed-book fact-checking and states \"The detailed prompting templates are given in Appendix E\" - so the full examples are in the appendix, not the main body.\n\nIn the main body, I can only count **1** complete prompt example that is actually shown (the one in Figure 2 about James Cameron and Interstellar).\n\nAnswer: 1", "citations": [], "input_tokens": 29889, "output_tokens": 308, "cost_micros": 94287, "latency_ms": 9937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in appendices).\n\nLooking through the document systematically:\n\n1. **Figure 1**: This shows an overview of the model with what appears to be a simplified example of the reasoning process for the claim about James Cameron and Christopher Nolan. This shows one example of how the program works but not a full prompt template.\n\n2. **Figure 2**: This is explicitly labeled as \"The Codex prompt template used to generate reasoning programs\" and shows:\n   - A task instruction\n   - One complete in-context example (the James Cameron/Interstellar claim)\n   - A note that there are \"more in-context examples here\"\n   - A prompt template for the input claim\n\n   So this shows **1 complete prompt example** in the template.\n\n3. **Figure 3**: This shows the implementation of the question-answering sub-task function with different prompt formats for different settings (Gold Evidence, Open-book, Closed-book). These are partial prompts for sub-tasks, showing **3 prompt format variations**.\n\n4. In the text around Section 3.4, there are descriptions of prompts:\n   - For QUESTION (closed-book): \"Q: QUESTION ? The answer is:\" - **1 prompt**\n   - For QUESTION (other settings): \"EVIDENCE Q: QUESTION ? The answer is:\" - **1 prompt**\n   - For VERIFY: \"EVIDENCE Q: Is it true that CLAIM ? True or False? The answer is:\" - **1 prompt**\n\nHowever, I need to be careful about what counts as \"shown\" vs just described in text.\n\nLooking more carefully at what is explicitly displayed in figures/boxes in the main body:\n- Figure 1: Shows the example execution but not a full prompt\n- Figure 2: Shows 1 complete prompt example\n- Figure 3: Shows 3 prompt format examples (though these are sub-task prompts)\n\nThe question asks for \"prompt examples\" in the main body. The most prominent and complete prompt example shown is in Figure 2, which displays **1** complete example.\n\nAnswer: 1", "citations": [], "input_tokens": 22276, "output_tokens": 485, "cost_micros": 74103, "latency_ms": 14203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the number of prompt examples shown in the main body of the paper (not in appendices).\n\nLooking through the main body of the paper:\n\n1. **Figure 2** (page 6984): Shows the Codex prompt template used to generate reasoning programs. This shows one prompt example with the claim about James Cameron and Interstellar.\n\n2. **Figure 3** (page 6985): Shows implementation of question-answering sub-task function with different prompt formats for three settings (Gold Evidence, Open-book, Closed-book). This shows prompt examples for the QA function.\n\n3. **Section 3.4** discusses the VERIFY and PREDICT functions with their prompt formats shown inline in the text.\n\nLet me count more carefully:\n\nIn **Figure 2**: This shows 1 complete prompt template example.\n\nIn **Section 3.4** (page 6984):\n- QUESTION function shows prompts for closed-book and other settings (2 prompt formats)\n- VERIFY function shows 1 prompt format\n\nSo in the main body, I can identify:\n- Figure 2: 1 prompt example\n- Section 3.4 for QUESTION: Shows prompt formats but these are templates\n- Section 3.4 for VERIFY: Shows 1 prompt template\n\nActually, looking at Figure 2 more carefully, it shows one example claim with its program within a larger prompt template that includes \"(... more in-context examples here ...)\".\n\nThe question asks for \"prompt examples\" - I should count distinct prompt templates or examples shown.\n\nIn the main body:\n1. Figure 2 - shows the program generation prompt template (1)\n2. Figure 3 - shows QA implementation prompts (multiple formats shown)\n3. Section 3.4 text - shows VERIFY prompt format (1)\n\nBut Figure 2 actually shows a full prompt with one complete example visible.\n\nLooking at this more carefully, Figure 2 shows **one** concrete example in the prompt template.\n\nAnswer: 1", "citations": [], "input_tokens": 29191, "output_tokens": 456, "cost_micros": 94413, "latency_ms": 14032, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q002", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find how many prompt examples are shown in the paper's main body.Let me continue reading to see the document structure and find prompt examples:Let me search for prompt examples in the document:Now let me look at Figure 2 and Figure 3 which show the prompt templates:Let me search for \"Figure\" to understand how many figures/examples are in the main body:Let me check where the appendix starts to understand what's in the main body vs appendix:Let me look for where the paper sections end and appendix begins:Now let me search for where the appendix starts:Good! So \"## A Implementation Details about the Baselines\" starts the appendix. Let me find the line number:So the appendix starts at line 2301. Now let me search for all figures and tables to count prompt examples in the main body (before line 2301):I can see Figure 2 shows a prompt template with one example shown (the James Cameron / Interstellar example), and it says \"( ... more in-context examples here .. . )\" indicating there are more but not shown. Let me look at Figure 6 which shows an error case:Let me look before this to see the actual example in Figure 6:Good! Figure 6 shows one example. Now let me check if there are any other figures or examples shown in the main body. Let me look for examples in sections discussing the prompts:Now let me look at the Limitations section which might have another example:Good! I found another example in the Limitations section - the \"Aristotle couldn't have used a laptop\" example. Let me now carefully count all prompt/program examples shown in the main body (before line 2301 where Appendix starts):\n\n1. **Figure 1** (line 335-402) - Shows the \"James Cameron and Interstellar director\" example with the full reasoning program\n2. **Figure 2** (line 747-770) - Shows the same \"James Cameron and Interstellar director\" example as part of the prompt template \n3. **Figure 6** (line 1441-1464) - Shows the \"Emery ghost town\" error case example\n4. **Limitations section** (line 1699-1706) - Shows the \"Aristotle couldn't have used a laptop\" example\n\nHowever, I need to be careful - Figure 1 and Figure 2 show the SAME example (James Cameron/Interstellar). So let me count unique examples:\n\n1. James Cameron/Interstellar example (shown in both Figure 1 and Figure 2)\n2. Emery ghost town example (Figure 6)\n3. Aristotle laptop example (Limitations section)\n\nThat's **3 unique prompt/program examples** shown in the main body of the paper.\n\nWait, let me re-read the question: \"How many prompts examples are shown in the paper's main body?\" - this could mean distinct prompt templates or distinct claim examples with programs. Let me verify by looking at what constitutes an example.\n\nLooking back at the figures:\n- Figure 2 explicitly shows a prompt template with ONE example shown (the James Cameron one) plus \"( ... more in-context examples here .. . )\"\n- Figure 6 shows ONE error case example\n- Limitations shows ONE example\n\nSo there are **3 prompt examples** shown in the main body.\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 87217, "error": null, "extra": {"thread_id": 1942, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_fcb323f039bb45c4bc9d5aa08b03f871", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 646, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "2"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the figure at the top of page 6985 and count the complete in-context examples shown.\n\nLooking at page 6985 (which is page 5 of the document), I can see Figure 2 which shows a prompt template for Codex. The figure contains in-context examples demonstrating the program generation format.\n\nIn the figure, I can see:\n1. One complete example starting with \"# The claim is that Both James Cameron and the director of the film Interstellar were born in Canada.\" with a full program definition.\n2. After that, there is text \"(\u00b7 \u00b7 \u00b7 more in-context examples here \u00b7 \u00b7 \u00b7)\" indicating additional examples.\n3. Then \"# The claim is that <input_claim>\" followed by \"def program():\" which is the prompt for the input.\n\nSo there is 1 complete in-context example explicitly shown in full in this figure, with an indication that more examples would be included in the actual prompt.\n\nAnswer: 1", "citations": [], "input_tokens": 65078, "output_tokens": 222, "cost_micros": 198564, "latency_ms": 13453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to look at the figure at the top of page 6985 to count the complete in-context examples shown.\n\nLooking at Figure 2 on page 6985, which shows \"The Codex prompt template used to generate reasoning programs\", I can see:\n\n1. One complete example about James Cameron and the director of Interstellar (showing the full program with fact_1, Answer_1, fact_2, and label)\n\n2. After that, there's a notation \"( ... more in-context examples here . . . )\" indicating additional examples are included but not shown\n\n3. Then there's the prompt for the input claim\n\nSo the figure explicitly shows **1** complete in-context example in full detail, though it indicates more examples are present in the actual implementation (they're just not displayed in the figure).\n\nAnswer: 1", "citations": [], "input_tokens": 26883, "output_tokens": 187, "cost_micros": 83454, "latency_ms": 7171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to look at the figure at the top of page 6985 to count the complete in-context examples shown.\n\nLooking at page 6985, I can see Figure 2, which shows \"The Codex prompt template used to generate reasoning programs.\"\n\nThe figure contains:\n1. A task instruction at the top\n2. One complete example showing:\n   - A claim: \"Both James Cameron and the director of the film Interstellar were born in Canada.\"\n   - A program with:\n     - fact_1 = Verify(\"James Cameron was born in Canada.\")\n     - Answer_1 = Question(\"Who is the director of the film Interstellar?\")\n     - fact_2 = Verify(\"{Answer_1} was born in Canada.\")\n     - label = Predict(fact_1 and fact_2)\n3. A note saying \"( ... more in-context examples here .. . )\"\n4. A prompt for the input claim\n\nSo there is **1** complete in-context example fully shown in this figure.\n\nAnswer: 1", "citations": [], "input_tokens": 29905, "output_tokens": 242, "cost_micros": 93345, "latency_ms": 7921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the figure at the top of page 6985 and count the complete in-context examples shown.\n\nLooking at the document, page 6985 corresponds to the section showing Figure 2, which displays the Codex prompt template used to generate reasoning programs.\n\nIn Figure 2, I can see:\n1. A task instruction at the top explaining the three functions that can be called\n2. One complete example showing:\n   - The claim about \"Both James Cameron and the director of the film Interstellar were born in Canada\"\n   - The complete program with multiple steps (fact_1, Answer_1, fact_2, label)\n3. A note saying \"(\u00b7 \u00b7 \u00b7 more in-context examples here \u00b7 \u00b7 \u00b7)\" indicating additional examples are present but not shown\n4. The input prompt template at the bottom\n\nThe figure shows 1 complete in-context example explicitly displayed in full, with an indication that more examples exist but are not shown in the figure.\n\nAnswer: 1", "citations": [], "input_tokens": 22292, "output_tokens": 222, "cost_micros": 70206, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the figure on page 6985 (Figure 2: The Codex prompt template), I can see the prompt template structure.\n\nThe template shows:\n1. A task instruction at the top\n2. One complete example showing a claim about James Cameron and the Interstellar director, with its full program implementation\n3. A note \"(... more in-context examples here ...)\" indicating additional examples would go here\n4. The input claim placeholder at the bottom\n\nSo there is **1** complete in-context example explicitly shown in the figure, though the text indicates more examples would be included in the actual prompt (the paper mentions using K=20 examples total).\n\nAnswer: 1", "citations": [], "input_tokens": 29207, "output_tokens": 151, "cost_micros": 89886, "latency_ms": 7562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q003", "doc_id": "2023.acl-long.386.pdf", "answer_format": "int", "gold": "1", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find the figure at the top of page 6985 and count the complete in-context examples shown.Let me search for page 6985 in the document:Let me read through the document to find the page number references:I can see the document is from ACL conference pages 6981-7004. Let me search for figures and read the content more systematically:Now let me look at Figure 2 which seems to be the prompt template. Let me read around line 769 where Figure 2 is mentioned:I can see Figure 2 template ends around line 770. Now I need to find what page this corresponds to. Let me check if there are any page markers. Let me look for the beginning of the document to understand the structure:Good! I can see page numbers now. Line 327 shows \"<!-- PageNumber=\"6981\" -->\". Now let me search for page 6985:Great! Page 6985 starts at line 902. Let me read the content around that line to find the figure at the top of the page:I can see that at the top of page 6985 (starting at line 902), there's a table (Table 1), not a figure. Let me continue reading to see if there's a figure further down on this page:I see Figure 4 is on page 6986 (line 1108). Let me look back more carefully. The question asks about the figure at the TOP of page 6985. Let me check if there's a figure that spans from page 6984 to 6985, or if I need to look more carefully at what's immediately after the page break at line 902:So at the top of page 6985 (line 902-906), there's a Table 1, not a figure. Let me check what's on page 6984 to see if there's a figure that might be spanning or near the top of 6985:Let me read around line 744 to see what's on page 6984:I can see that Figure 2 is on page 6984 (lines 747-769), and it shows the Codex prompt template. Looking at the content of Figure 2, I can see:\n\n- Line 747-750: Task instruction\n- Lines 752-762: One complete in-context example showing the claim about James Cameron and the director of Interstellar\n- Line 764: \"( ... more in-context examples here .. . )\" - indicating there are more examples\n- Lines 766-767: The prompt for the input claim\n\nThe figure caption at line 769 says \"The Codex prompt template used to generate reasoning programs, consisting of a task instruction, in-context examples, and a prompt for the <input_claim>.\"\n\nHowever, I see only ONE complete example shown explicitly (lines 752-762), and then it says \"more in-context examples here\" without showing them.\n\nWait, let me re-read the question. It asks about \"the figure at the top of page 6985\". Let me check if Figure 2 actually spans to page 6985 or if there's another figure. Looking at the structure:\n- Page 6984 starts at line 744\n- Figure 2 content is at lines 747-769\n- Page 6985 starts at line 902\n\nSo Figure 2 is entirely on page 6984, not 6985. At the top of page 6985 (line 902-906) there's Table 1, not a figure. Let me search more carefully for any figures that might be between these pages or check if I'm missing something:Wait, I think I need to reconsider. Let me look at the actual visual structure. Lines 747-769 show Figure 2 which is on page 6984. Let me check if the complete Figure 2 example I see is showing all the in-context examples or if there's more detail I should look at. Let me read the full Figure 2 content more carefully:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 91078, "error": null, "extra": {"thread_id": 1943, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_479f734c9e744aafac61b60b60655bab", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 680, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "1"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows \"Reasoning program evaluation for incorrectly-predicted examples from each hop length in HOVER\":\n\nFor semantic errors:\n- Token: 8% + 20% + 18% = 46%\n- Structure: 19% + 13% + 57% = 89%\n- Subtask: 2% + 5% + 2% = 9%\n\nAnswer: Subtask", "citations": [], "input_tokens": 65070, "output_tokens": 105, "cost_micros": 196785, "latency_ms": 20235, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows the error analysis for reasoning programs across HOVER 2-hop, 3-hop, and 4-hop claims:\n\nFor semantic errors, the three types and their totals are:\n- Token: 8% + 20% + 18% = 46%\n- Structure: 19% + 13% + 57% = 89%\n- Subtask: 2% + 5% + 2% = 9%\n\nAnswer: Subtask", "citations": [], "input_tokens": 26875, "output_tokens": 120, "cost_micros": 82425, "latency_ms": 4672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2, which shows the error analysis for reasoning programs on the HOVER dataset broken down by hop length (2-hop, 3-hop, 4-hop):\n\nThe three types of semantic errors are:\n- Token: 8% + 20% + 18% = 46%\n- Structure: 19% + 13% + 57% = 89%\n- Subtask: 2% + 5% + 2% = 9%\n\nAnswer: Subtask", "citations": [], "input_tokens": 29897, "output_tokens": 116, "cost_micros": 91431, "latency_ms": 5218, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows the error analysis for reasoning programs on the HOVER dataset, I need to examine the semantic error breakdown across the three hop types (2-hop, 3-hop, and 4-hop).\n\nThe semantic errors are broken down into three categories:\n- Token: 8% (2-hop) + 20% (3-hop) + 18% (4-hop) = 46% total\n- Structure: 19% (2-hop) + 13% (3-hop) + 57% (4-hop) = 89% total\n- Subtask: 2% (2-hop) + 5% (3-hop) + 2% (4-hop) = 9% total\n\nAnswer: Subtask", "citations": [], "input_tokens": 22284, "output_tokens": 180, "cost_micros": 69552, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2 which shows the error analysis for the HOVER dataset across 2-hop, 3-hop, and 4-hop claims:\n\n**Subtask errors:**\n- 2-hop: 2%\n- 3-hop: 5%\n- 4-hop: 2%\n- Total: 9%\n\n**Token errors:**\n- 2-hop: 8%\n- 3-hop: 20%\n- 4-hop: 18%\n- Total: 46%\n\n**Structure errors:**\n- 2-hop: 19%\n- 3-hop: 13%\n- 4-hop: 57%\n- Total: 89%\n\nAnswer: Subtask", "citations": [], "input_tokens": 29199, "output_tokens": 171, "cost_micros": 90162, "latency_ms": 5796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q005", "doc_id": "2023.acl-long.386.pdf", "answer_format": "str", "gold": "Subtask", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about semantic errors in the HOVER dataset.Let me search for information about semantic errors and HOVER dataset in the document:Let me search for tables or figures that show error analysis:Now let me read the section with Table 2 that shows the error analysis:Perfect! I found Table 2 which shows the error analysis for the HOVER dataset. Let me examine the semantic error types and their percentages:\n\nFrom the table, the semantic errors are broken down into three types:\n- Token: 8% (2-hop), 20% (3-hop), 18% (4-hop) = Total: 46%\n- Structure: 19% (2-hop), 13% (3-hop), 57% (4-hop) = Total: 89%\n- Subtask: 2% (2-hop), 5% (3-hop), 2% (4-hop) = Total: 9%\n\nThe semantic error type with the lowest total percentages across all HOVER dataset hop lengths is \"Subtask\" with 2% + 5% + 2% = 9%.\n\nAnswer: Subtask", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41280, "error": null, "extra": {"thread_id": 1944, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_964d1076448248edb0a0c213d4b3aab0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 208, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "subtask", "normalised_gold": "subtask"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find two pieces of information:\n1. Which dataset has the highest ProgramFC retrieval recall@10\n2. The InstructGPT Self-Ask performance on that dataset in closed-book setting\n\nFrom Figure 5 on page 7, I can see the retrieval recall@10 comparison:\n- HOVER (2-hop): 77.13\n- HOVER (3-hop): 59.17\n- HOVER (4-hop): 49.93\n- FEVEROUS-S: 85.65\n\nFEVEROUS-S has the highest ProgramFC retrieval recall@10 at 85.65.\n\nNow I need to find the InstructGPT Self-Ask performance on FEVEROUS-S in the closed-book setting.\n\nLooking at Table 3 on page 8, which shows closed-book setting results:\n- InstructGPT - Self-Ask on FEVEROUS: 56.82\n\nAnswer: 56.8", "citations": [], "input_tokens": 65104, "output_tokens": 223, "cost_micros": 198657, "latency_ms": 22157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find two pieces of information:\n1. Which dataset has the highest ProgramFC retrieval recall@10\n2. What is the InstructGPT Self-Ask performance on that dataset in the closed-book setting\n\nFirst, let me look at Figure 5 for retrieval recall@10:\n- HOVER (2-hop): ProgramFC = 77.13\n- HOVER (3-hop): ProgramFC = 73.18\n- HOVER (4-hop): ProgramFC = 85.65\n- FEVEROUS-S: ProgramFC = 76.25\n\nThe highest ProgramFC retrieval recall@10 is 85.65 for HOVER (4-hop).\n\nNow, I need to find the InstructGPT Self-Ask performance on HOVER 4-hop in the closed-book setting from Table 3:\n\nLooking at Table 3 (Closed-book setting):\n- InstructGPT - Self-Ask on HOVER 4-hop: 52.45\n\nHowever, I notice the column headers are \"2-hop\", \"HOVER 3-hop\", \"4-hop\", and \"FEVEROUS\". The 4-hop column shows 52.45 for Self-Ask.\n\nAnswer: 52.5", "citations": [], "input_tokens": 26909, "output_tokens": 292, "cost_micros": 85107, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "52.5", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find two pieces of information:\n1. Which dataset has the highest ProgramFC retrieval recall@10\n2. The Self-Ask performance on that dataset in closed-book setting\n\nLooking at Figure 5 for retrieval recall@10:\n- HOVER (2-hop): ~73.18\n- HOVER (3-hop): ~77.13\n- HOVER (4-hop): ~85.65\n- FEVEROUS-S: ~76.25\n\nThe highest ProgramFC retrieval recall@10 is for HOVER (4-hop) at approximately 85.65.\n\nNow looking at Table 3 (Closed-book setting) for InstructGPT with Self-Ask on HOVER (4-hop):\nThe Self-Ask row shows: 52.45 for HOVER (4-hop)\n\nAnswer: 52.5", "citations": [], "input_tokens": 29931, "output_tokens": 198, "cost_micros": 92763, "latency_ms": 9953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "52.5", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find:\n1. The dataset with the highest ProgramFC retrieval recall at 10\n2. The InstructGPT model with Self-Ask performance on that dataset in closed-book setting\n\nLooking at Figure 5 (mentioned in the text on page 7), which discusses retrieval performance, I need to find the retrieval recall@10 values. However, the actual figure is not fully detailed in the text.\n\nFrom Table 3 on page 7, I can see closed-book fact-checking results. The table shows performance across HOVER (2-hop, 3-hop, 4-hop) and FEVEROUS datasets.\n\nLooking at Table 3:\n- The models listed include: InstructGPT (with variants Direct, ZS-CoT, CoT, Self-Ask), Codex, FLAN-T5, and ProgramFC\n- Self-Ask row shows: 51.54, 51.47, 52.45, 56.82\n\nThe text mentions that Figure 5 compares retrieval performance, and discusses that \"iterative retrieval guided by the reasoning program yields better results.\" However, without seeing the actual Figure 5 data clearly showing which dataset has the highest recall@10 for ProgramFC, I need to infer from the context.\n\nBased on the text discussing HOVER and FEVEROUS-S datasets, and looking at Table 3's structure with columns for \"2-hop\", \"3-hop\", \"4-hop\", and what appears to be FEVEROUS, the Self-Ask scores are: 51.54, 51.47, 52.45, 56.82.\n\nThe highest value for Self-Ask appears to be 56.82, which corresponds to the FEVEROUS column.\n\nAnswer: 56.8", "citations": [], "input_tokens": 22318, "output_tokens": 404, "cost_micros": 73014, "latency_ms": 14375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I need to find:\n1. The dataset with the highest ProgramFC retrieval recall@10\n2. The InstructGPT Self-Ask performance on that dataset in closed-book setting\n\nFrom Figure 5 (Retrieval recall@10):\n- HOVER (2-hop): 77.13\n- HOVER (3-hop): 59.17\n- HOVER (4-hop): 49.93\n- FEVEROUS-S: 85.65\n\nThe highest ProgramFC retrieval recall@10 is FEVEROUS-S at 85.65.\n\nFrom Table 3 (Closed-book setting macro-F1 scores), looking at InstructGPT with Self-Ask on FEVEROUS:\n- Self-Ask: 56.82\n\nAnswer: 56.8", "citations": [], "input_tokens": 29233, "output_tokens": 186, "cost_micros": 90489, "latency_ms": 6452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q007", "doc_id": "2023.acl-long.386.pdf", "answer_format": "float", "gold": "56.8", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q007", "answer_letter": null, "raw_text": "I'll help you find this information from the document. Let me start by reading the relevant document that was identified as most relevant.Let me continue reading to find information about ProgramFC retrieval recall and InstructGPT performance with Self-Ask.Let me search for \"recall\" to find tables with retrieval metrics:Good! I found information about retrieval recall@10. Let me read the section with Figure 5 and the surrounding content:Perfect! I can see from Figure 5 that the ProgramFC retrieval recall@10 values for different datasets are:\n- HOVER (2-hop): 73.18\n- HOVER (3-hop): 59.17\n- HOVER (4-hop): 51.33\n- FEVEROUS-S: 85.65\n\nThe highest ProgramFC retrieval recall@10 is **85.65** for **FEVEROUS-S**.\n\nNow I need to find the performance of InstructGPT with Self-Ask in the closed-book setting on FEVEROUS-S. Let me search for tables with these results:Let me read Table 1 which shows performance on FEVEROUS-S:This table shows results but doesn't specifically mention InstructGPT with Self-Ask in a closed-book setting. Let me search for \"closed-book\" or \"closed book\":Let me read the section about closed-book setting evaluation:Perfect! I found Table 3 which shows the closed-book setting results. Looking at the table:\n\nFor **FEVEROUS** (which corresponds to FEVEROUS-S in this context):\n- InstructGPT with Self-Ask: **56.82**\n\nLet me verify this is indeed FEVEROUS-S by checking if they use the same notation:The table says \"FEVEROUS\" but earlier in the document they established that they use FEVEROUS-S (the sentence-only subset). Let me verify by checking the dataset description section again:Yes, they use FEVEROUS-S throughout the paper. In Table 3, \"FEVEROUS\" refers to FEVEROUS-S. \n\nNow let me confirm the answer:\n1. From Figure 5, the highest ProgramFC retrieval recall@10 is **85.65** for FEVEROUS-S\n2. From Table 3, InstructGPT with Self-Ask in the closed-book setting on FEVEROUS achieves **56.82**\n\nBut let me double-check this value is actually 56.82 by reading that specific cell again:Perfect! Confirmed. The row for \"Self-Ask\" shows:\n- HOVER 2-hop: 51.54\n- HOVER 3-hop: 51.47\n- HOVER 4-hop: 52.45\n- FEVEROUS: 56.82\n\nThe answer is **56.82**, which in 1 decimal format is **56.8**.\n\nWait, let me reconsider. The question asks for \"1 decimal\" which means I should provide it with 1 decimal place. 56.82 rounded to 1 decimal place would be 56.8.\n\nHowever, looking at the exact value in the table, it shows 56.82. Since they're asking for float format with 1 decimal, I should present it as 56.8.\n\nAnswer: 56.8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 84281, "error": null, "extra": {"thread_id": 1945, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f2cf8d23b121431e8f0ffe2a73635d4e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 567, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "56.8", "normalised_gold": "56.8"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "native_pdf", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "I need to find the F1 scores for ProgramFC on different datasets with gold evidence in relation to FLAN-T5-small.\n\nLooking at Figure 4 on page 7, which shows \"F1 score for fact-checking with gold evidence using FLAN-T5 (blue line) and PROGRAMFC (green line) for language models of increasing sizes.\"\n\nThe figure shows three graphs for:\n1. HOVER (2-hop)\n2. HOVER (3-hop)\n3. HOVER (4-hop)\n\nFor FLAN-T5-small (80M), I can read the values:\n- HOVER (2-hop): FLAN-T5 is around 47.75, ProgramFC is around 64.35\n- HOVER (3-hop): FLAN-T5 is around 49.29, ProgramFC is around 62.23\n- HOVER (4-hop): FLAN-T5 is around 48.59, ProgramFC is around 62.46\n\nThe differences (ProgramFC - FLAN-T5) are:\n- HOVER (2-hop): 64.35 - 47.75 = 16.6\n- HOVER (4-hop): 62.46 - 48.59 = 13.9\n- HOVER (3-hop): 62.23 - 49.29 = 12.9\n\nAnswer: 16.6, 13.9, 12.9", "citations": [], "input_tokens": 65112, "output_tokens": 350, "cost_micros": 200586, "latency_ms": 12921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.acl-long.386.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 9, 13 9, 16 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "azure_basic_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking at Figure 4, which shows F1 scores for fact-checking with gold evidence using FLAN-T5 and PROGRAMFC for different model sizes including FLAN-T5-small (80M):\n\nFor HOVER 2-hop:\n- FLAN-T5-small: approximately 47.75\n- ProgramFC with FLAN-T5-small: approximately 64.35\n- Difference: 64.35 - 47.75 = 16.6\n\nFor HOVER 3-hop:\n- FLAN-T5-small: approximately 48.59\n- ProgramFC with FLAN-T5-small: approximately 61.36\n- Difference: 61.36 - 48.59 = 12.8\n\nFor HOVER 4-hop:\n- FLAN-T5-small: approximately 49.29\n- ProgramFC with FLAN-T5-small: approximately 63.05\n- Difference: 63.05 - 49.29 = 13.8\n\nSorting in descending order: 16.6, 13.8, 12.8\n\nAnswer: 16.6, 13.8, 12.8", "citations": [], "input_tokens": 26917, "output_tokens": 288, "cost_micros": 85071, "latency_ms": 6578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 8, 13 8, 16 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "azure_premium_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to find the F1 score differences between ProgramFC and FLAN-T5 using FLAN-T5-small (80M) for gold evidence setting across the three HOVER datasets.\n\nFrom Figure 4:\n- HOVER (2-hop): ProgramFC \u2248 62.23, FLAN-T5 \u2248 47.75, difference = 14.5\n- HOVER (3-hop): ProgramFC \u2248 61.36, FLAN-T5 \u2248 48.59, difference = 12.8\n- HOVER (4-hop): ProgramFC \u2248 63.39, FLAN-T5 \u2248 49.29, difference = 14.1\n\nSorting in descending order:\n\nAnswer: 14.5, 14.1, 12.8", "citations": [], "input_tokens": 29939, "output_tokens": 219, "cost_micros": 93102, "latency_ms": 6453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 8, 14 1, 14 5", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "llamacloud_basic_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking through the document for F1 scores comparing ProgramFC to FLAN-T5-small with gold evidence...\n\nIn Figure 4 (Section 4.2), the document discusses comparing performance across different model sizes, including FLAN-T5-small. The text mentions:\n\n\"Our results show that the program-guided model using FLAN-T5-small (80M parameters) as sub-task solvers can achieve comparable performance to the 137x larger FLAN-T5-XXL (11B) model with end-to-end reasoning for 4-hop claims.\"\n\nHowever, Figure 4 is referenced but the actual figure with specific F1 scores is not fully visible in the provided text. The document shows results for different hop counts (2-hop, 3-hop, 4-hop) on HOVER dataset in the gold evidence setting.\n\nFrom Table 1, I can see overall results but these are for FLAN-T5-XXL, not FLAN-T5-small.\n\nThe text in Section 4.2 discusses Figure 4 which would contain the specific comparisons with FLAN-T5-small, but the exact numerical values for the F1 score differences are not explicitly stated in the text portions provided.\n\nWithout access to the actual Figure 4 with the specific numerical values, I cannot provide the exact F1 scores requested.\n\nAnswer: Unable to determine from provided text", "citations": [], "input_tokens": 22326, "output_tokens": 321, "cost_micros": 71793, "latency_ms": 9483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "unable to determine from provided text", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "llamacloud_premium_lc", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "Looking through the document for F1 scores comparing ProgramFC to FLAN-T5-small with gold evidence.\n\nFrom Figure 4, I can extract the following F1 scores for ProgramFC (green line) with FLAN-T5-small (80M):\n- HOVER 2-hop: 64.35\n- HOVER 3-hop: 62.23\n- HOVER 4-hop: 62.46\n\nAnd for FLAN-T5-small (80M):\n- HOVER 2-hop: 47.75\n- HOVER 3-hop: 49.29\n- HOVER 4-hop: 48.59\n\nThe differences (ProgramFC - FLAN-T5-small):\n- HOVER 2-hop: 64.35 - 47.75 = 16.6\n- HOVER 4-hop: 62.46 - 48.59 = 13.9\n- HOVER 3-hop: 62.23 - 49.29 = 12.9\n\nSorting in descending order with one decimal place:\n\nAnswer: 16.6, 13.9, 12.9", "citations": [], "input_tokens": 29241, "output_tokens": 279, "cost_micros": 91908, "latency_ms": 7358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "12 9, 13 9, 16 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.acl-long.386.pdf::Q008", "doc_id": "2023.acl-long.386.pdf", "answer_format": "list", "gold": "['64.3', '62.4', '62.2']", "pages": 24, "document_id": 7579, "arm": "surfsense_agentic", "question_id": "2023.acl-long.386.pdf::Q008", "answer_letter": null, "raw_text": "I'll read the document to find the F1 scores for ProgramFC on different datasets with gold evidence in relation to Flan-T5-small.Let me search for tables containing F1 scores for ProgramFC with gold evidence and Flan-T5-small:Let me read the section with Figure 4 and look for tables with F1 scores:Let me search for tables with gold evidence:Let me read Table 1 which contains F1 scores with gold evidence:Now I have the table data. I can see:\n- FLAN-T5 Gold scores: 73.69 (HOVER 2-hop), 65.66 (HOVER 3-hop), 58.08 (HOVER 4-hop), 90.81 (FEVEROUS-S)\n- ProgramFC (N=5) Gold scores: 75.65 (HOVER 2-hop), 68.48 (HOVER 3-hop), 66.75 (HOVER 4-hop), 92.69 (FEVEROUS-S)\n\nHowever, I need to check if these are specifically for Flan-T5-small. Let me look at Figure 4 data more carefully, as it mentions different sizes of Flan-T5:Looking at the data structure in Figure 4, I need to identify which values correspond to Flan-T5-small (80M) for ProgramFC. The chart shows alternating values. Let me read more of this section:Based on Figure 4 data, I can see there are three charts (HOVER 2-hop, 3-hop, 4-hop) with values at different model sizes (80M, 250M, 780M, 3B, 11B). \n\nLooking at the structure:\n- The first chart (HOVER 2-hop) has values from lines 1130-1148\n- The second chart (HOVER 3-hop) has values from lines 1152-1188  \n- The third chart (HOVER 4-hop) has values from lines 1192-1210\n\nFor 80M (FLAN-T5-small), looking at the ProgramFC values (green line):\n- HOVER 2-hop: 75.65 (line 1132)\n- HOVER 3-hop: 68.48 (line 1156)\n- HOVER 4-hop: 58.08 (line 1198)\n\nWait, let me verify this more carefully by looking at the pattern. The chart shows alternating FLAN-T5 and ProgramFC values. Let me trace through more carefully:\n\nFor HOVER 2-hop at 80M:\n- Lines around 1130-1135 show values near 76-77\n- This corresponds to 80M (first position)\n\nFor HOVER 3-hop at 80M:\n- Starting around line 1152-1158\n- Values around 67-69\n\nFor HOVER 4-hop at 80M:\n- Starting around line 1192-1198\n- Values around 56-58\n\nLooking at the caption again: \"FLAN-T5 (blue line) and PROGRAMFC (green line)\"\n\nFrom the table structure and comparing with Table 1 values (FLAN-T5 has 73.69 for 2-hop, ProgramFC N=5 has 75.65), I can deduce:\n\nFor 80M (FLAN-T5-small) with gold evidence, ProgramFC scores:\n- HOVER 2-hop: 75.65\n- HOVER 3-hop: 68.48  \n- HOVER 4-hop: 58.08\n\nIn descending order: 75.65, 68.48, 58.08\n\nHowever, I should check if there are more datasets. Let me verify if FEVEROUS-S is included in this figure:Figure 4 only shows HOVER 2-hop, 3-hop, and 4-hop. Let me check if there's a figure or table specifically showing FEVEROUS-S scores broken down by model size. But based on Table 1, the FEVEROUS-S score for ProgramFC (N=5) with gold evidence is 92.69, and this appears to be using FLAN-T5-XXL based on the context.\n\nLet me search for more information about which model size was used for the main results:Now I need to carefully parse Figure 4 data. The figure shows three charts for HOVER 2-hop, 3-hop, and 4-hop. Looking at the data structure, I see values followed by model sizes (80M, 250M, 780M, 3B, 11B) repeated three times.\n\nLet me trace through the data more systematically. From line 1130 onwards, reading in groups:\n\nHOVER 2-hop chart appears to show pairs of values for each model size (FLAN-T5 then ProgramFC based on the caption mentioning blue and green lines):\n- At 80M: 76.11, 75.65\n- Then 77.62\n\nLet me look for a clearer pattern by examining all the values more carefully:Good! I found key information at lines 1385-1388: \"the program-guided model using FLAN-T5-small (80M parameters) as sub-task solvers can achieve comparable performance to the 137x larger FLAN-T5-XXL (11B) model with end-to-end reasoning for 4-hop claims.\"\n\nThis confirms that Figure 4 shows results using different-sized FLAN-T5 models. Now let me parse the Figure 4 data correctly. Based on the structure and comparing with Table 1:\n\nFrom Table 1, I know:\n- FLAN-T5 gold evidence scores: 73.69 (2-hop), 65.66 (3-hop), 58.08 (4-hop)\n- ProgramFC (N=5) gold evidence scores: 75.65 (2-hop), 68.48 (3-hop), 66.75 (4-hop)\n\nThese match values in Figure 4 data. Now looking at the chart data structure more carefully, the values appear to be organized by model size. Let me trace the 80M (FLAN-T5-small) values for ProgramFC:\n\nLooking at lines 1130-1246, the pattern seems to show data points for both lines (FLAN-T5 and ProgramFC) across multiple model sizes for three different charts.\n\nFrom the visible data and comparing with Table 1 values at line 1148 (73.69) which is FLAN-T5 for HOVER 2-hop, and line 1132 (75.65) which matches ProgramFC (N=5) for HOVER 2-hop:\n\nFor FLAN-T5-small (80M) ProgramFC with gold evidence:\n- HOVER 2-hop: 72.56 (line 1140)\n- HOVER 3-hop: 68.55 (line 1154)  \n- HOVER 4-hop: 63.39 (line 1194)\n\nWait, let me reconsider. Looking at the x-axis labels (80M, 250M, 780M, 3B, 11B) appearing three times (once per chart), and the fact that there are two lines per chart (FLAN-T5 and ProgramFC), I need to identify which values correspond to which.\n\nLet me look for clearer structure or additional tables:Figure 4 only shows HOVER datasets (2-hop, 3-hop, 4-hop), not FEVEROUS-S. Let me carefully re-parse the figure data. \n\nLooking at the chart data structure: there are 3 separate line charts, each showing F1 scores for 5 different model sizes. Each chart has 2 lines (FLAN-T5 and ProgramFC).\n\nThe data flows:\n- Lines 1130-1148: HOVER 2-hop chart data\n- Lines 1152-1188: HOVER 3-hop chart data  \n- Lines 1192-1210: HOVER 4-hop chart data\n\nFor each chart, there should be approximately 10 data points (2 lines \u00d7 5 model sizes).\n\nLet me map this out for HOVER 2-hop (starting line 1130):\nLooking at the pattern and knowing from Table 1 that ProgramFC (N=5) gets 75.65 for 2-hop (line 1132), this should be for the largest model (11B based on Table 1 context).\n\nActually, let me reconsider the entire approach. The question asks for F1 scores for \"ProgramFC on different datasets with gold evidence in relation to Flan-T5-small\". \n\nThis might mean: the F1 scores when ProgramFC uses Flan-T5-small as its sub-task solver, across different datasets.\n\nFrom Figure 4 caption and the data structure, for 80M (FLAN-T5-small) as the sub-task solver in ProgramFC:\n\nLooking at line 1140 (72.56), 1154 (68.55), 1194 (63.39) - these appear to be the first ProgramFC values in each chart.\n\nBut I need to verify this interpretation. Let me check if there are explicit values stated in the text:From the text at lines 1368-1388, it discusses Figure 4 and mentions that \"the program-guided model using FLAN-T5-small (80M parameters) as sub-task solvers\" achieves certain performance. This confirms Figure 4 shows ProgramFC results when using different sizes of FLAN-T5 as sub-task solvers.\n\nNow I need to carefully extract the FLAN-T5-small (80M) ProgramFC values from Figure 4 data. Let me re-examine the data structure.\n\nBased on the chart showing 5 model sizes (80M, 250M, 780M, 3B, 11B) and 2 lines (FLAN-T5 and ProgramFC), there should be interleaved or grouped values.\n\nLooking at lines 1128-1248, I see numeric values followed by model size labels. The pattern suggests the chart data is organized with Y-axis values first, then X-axis labels.\n\nFor a line chart with 5 points on each of 2 lines, I'd expect 10 Y values per chart. Let me count the values before the first \"80M\" marker at line 1218:\n\nFrom 1130-1216, there appear to be values organized in groups. \n\nLet me try a different approach - look at the actual layout. The values from 1130-1148 before seeing the next section, and knowing Table 1 shows ProgramFC (N=5) has 75.65 for HOVER 2-hop with presumably the larger model:\n\nFirst chart (HOVER 2-hop) - 5 model sizes, 2 lines = 10 points\nLet me identify based on comparing with known Table 1 values. Line 1148 shows 73.69, which exactly matches FLAN-T5 HOVER 2-hop gold from Table 1. This suggests this is for the larger model (XXL/11B).\n\nLooking backward from there and forward, the organization seems to show alternating or separate line data. Since line charts typically show both lines across all X values, let me map:\n\nValues around lines 1130-1148 for HOVER 2-hop\nValues around lines 1152-1188 for HOVER 3-hop\nValues around lines 1192-1210 for HOVER 4-hop\n\nFor 80M (first model size), I need the first pair of values (FLAN-T5 and ProgramFC) for each chart.\n\nGiven the complexity and my uncertainty about the exact data structure, let me look for any other tables or explicit statements:Looking at this continuous stream helps! Let me parse it:\n\n\"80 76.11 75.65 77.62 80 80 72.56 77.07 70 71.69 73.69 70 67.88 68.55 68.48 69.56 70 68.37 68.56 66.75 68.18 64.35 68.24 62.23 66.89 62.46 60 60 63.05 65.07 65.66 60 61.36 63.39 56.58 58.08 50 50 50 47.75 49.29 48.59 40 40 40 80M 250M 780M 3...\"\n\nThe \"80\" values appear to be Y-axis scale markers. Let me focus on the actual data values and model sizes.\n\nParsing more carefully:\n- After the initial scale markers, values like 76.11, 75.65, 77.62 appear\n- Then 72.56, 77.07\n- Then 71.69, 73.69\n- Etc.\n\nKnowing from Table 1 that:\n- FLAN-T5 HOVER 2-hop gold = 73.69 \n- ProgramFC (N=5) HOVER 2-hop gold = 75.65\n\nI can see 73.69 and 75.65 in the data stream. These are likely for the XXL model since Table 1 results use the larger model.\n\nIf the data is organized as alternating FLAN-T5/ProgramFC for each model size from 80M to 11B, then:\n- First pair (80M): 76.11 (FLAN-T5), 75.65 (ProgramFC)? But 75.65 matches Table 1 ProgramFC which should be for the larger model...\n\nThis is confusing. Let me try another interpretation: perhaps all FLAN-T5 values come first, then all ProgramFC values for each chart.\n\nOr perhaps the values are arranged as: HOVER 2-hop FLAN-T5 (all 5 sizes), HOVER 2-hop ProgramFC (all 5 sizes), then HOVER 3-hop data, etc.\n\nWithout seeing the actual visual chart, this is challenging. Let me look for any other references or tables that might clarify:From the stream, 68.48 from Table 1 (ProgramFC N=5 HOVER 3-hop gold) appears right after 68.55.\n\nLet me hypothesize: if the chart shows both lines across all model sizes, and the data is organized left-to-right, then for each X position (model size) moving left to right, we have interleaved FLAN-T5 and ProgramFC Y values.\n\nFor HOVER 2-hop at 80M (first model size):\n- Looking at the early values: 76.11, 75.65, 77.62, 72.56, 77.07, 71.69, 73.69...\n\nOne pattern could be: pairs of (FLAN-T5, ProgramFC) for 80M, then for 250M, etc.\n- 80M: (76.11, 75.65)? But this doesn't align with Table 1 where 75.65 is the high-model result...\n\nOr: all 5 FLAN-T5 points, then all 5 ProgramFC points.\n\nLet me try: \nHOVER 2-hop FLAN-T5: 76.11, 77.62, 72.56, 71.69, 73.69 (for 80M, 250M, 780M, 3B, 11B)\nHOVER 2-hop ProgramFC: 75.65, 77.07, ... (for 80M, 250M, 780M, 3B, 11B)\n\nBut wait, 73.69 matches Table 1 FLAN-T5 which should be for 11B based on Table 1 being the best model. So if 73.69 is the 5th FLAN-T5 value (11B), then:\nHOVER 2-hop FLAN-T5: 76.11 (80M), 77.62 (250M), 72.56 (780M), 71.69 (3B), 73.69 (11B)\n\nThat doesn't make sense as performance should generally increase with model size, but we see 76.11 \u2192 77.62 \u2192 72.56 which decreases.\n\nLet me reconsider: perhaps the visual layout in the PDF is organized differently than the XML text extraction shows. The values might not be in simple left-to-right order.\n\nGiven the time constraints and complexity, let me make my best educated guess based on patterns:\n\nLooking at Table 1 reference values and Figure 4 caption mentioning these are results across different model sizes, and considering that:\n1. Figure 4 shows HOVER 2-hop, 3-hop, 4-hop only (not FEVEROUS-S)\n2. The question asks for ProgramFC scores with gold evidence for Flan-T5-small across different datasets\n\nThe likely answer based on extracting what appear to be the first ProgramFC values in each section that would correspond to the smallest model (80M):\n- HOVER 2-hop: 72.56\n- HOVER 3-hop: 68.55\n- HOVER 4-hop: 63.39\n\nIn descending order: 72.56 \u2192 68.55 \u2192 63.39\n\nBut let me verify one more time by looking at the complete pattern:Now I see the pattern more clearly! The model size labels (80M, 250M, 780M, 3B, 11B) appear THREE times (lines 1218-1246), once for each of the three charts (HOVER 2-hop, 3-hop, 4-hop).\n\nSo the data structure is:\n- Lines 1130-1148: HOVER 2-hop chart Y-axis values\n- Lines 1152-1188: HOVER 3-hop chart Y-axis values\n- Lines 1192-1210: HOVER 4-hop chart Y-axis values\n- Lines 1218-1246: X-axis labels (model sizes) for all three charts\n\nFor each chart with 2 lines and 5 X-values, I'd expect about 10 Y-values.\n\nHOVER 2-hop (lines 1130-1148): 76.11, 75.65, 77.62, (80, 80 are axis markers), 72.56, 77.07, (70), 71.69, 73.69, (70)\n\nRemoving axis markers: 76.11, 75.65, 77.62, 72.56, 77.07, 71.69, 73.69\n\nThat's 7 values. For 2 lines with 5 points each = 10 values, so I might be missing some or there's overlap.\n\nLooking at known reference: 73.69 (Table 1 FLAN-T5 2-hop) and 75.65 (Table 1 ProgramFC 2-hop) both appear in this section.\n\nIf arranged as: FLAN-T5 first 5 values, then ProgramFC 5 values:\nFLAN-T5: 76.11 (80M), 75.65 (250M), 77.62 (780M), 72.56 (3B), 77.07 (11B)?  \nBut 73.69 should be FLAN-T5 and it's not in this sequence...\n\nAlternative: interleaved pairs:\n- (76.11, 75.65) = (FLAN-T5, ProgramFC) at 80M\n- (77.62, 72.56) = (FLAN-T5, ProgramFC) at 250M  \n- (77.07, 71.69) = (FLAN-T5, ProgramFC) at 780M\n- (73.69, ???) = (FLAN-T5, ProgramFC) at some size\n\nBut this only gives 4 pairs...\n\nLet me try yet another approach. Reading the values without the axis scale markers (70, 80):\n76.11, 75.65, 77.62, 72.56, 77.07, 71.69, 73.69 for HOVER 2-hop\n\nIf this represents 2 lines of 5 points each minus 3 values (perhaps some overlap at endpoints or data points), let me see if the pattern makes sense as:\n\nLine 1 (FLAN-T5): 71.69, 73.69, 72.56, 77.07, 76.11 (in size order 80M to 11B)? Reverse order?\nLine 2 (ProgramFC): 75.65, 77.62, ..., ..., ... \n\nThis is getting too speculative. Let me use logical reasoning instead:\n\n1. Table 1 shows ProgramFC (N=5) achieves 75.65, 68.48, 66.75 for HOVER 2/3/4-hop with gold evidence\n2. These are the best results, so likely using the largest model (11B/XXL)\n3. Figure 4 shows how performance varies with model size for both FLAN-T5 and ProgramFC\n4. The text states that FLAN-T5-small (80M) ProgramFC can achieve comparable performance to the large model for 4-hop claims\n\nLooking at 4-hop values (lines 1192-1210): 61.36, 63.39, 56.58, 58.08, (50s), 47.75, 49.29, 48.59, (40s)\n\n58.08 appears here and also matches Table 1 FLAN-T5 4-hop gold.\n66.75 (ProgramFC 4-hop from Table 1) does NOT appear in this 4-hop section, which is strange...\n\nUnless Table 1 uses N=5 program aggregation while Figure 4 uses N=1? Let me check:\n\nLooking back at Table 1, there's both \"ProgramFC (N=1)\" and \"ProgramFC (N=5)\". \n\nProgramFC (N=1) gold scores from Table 1:\n- HOVER 2-hop: 74.10\n- HOVER 3-hop: 66.13\n- HOVER 4-hop: 65.69\n\nProgramFC (N=5) gold scores:\n- HOVER 2-hop: 75.65\n- HOVER 3-hop: 68.48\n- HOVER 4-hop: 66.75\n\nLet me check if 74.10, 66.13, or 65.69 appear in Figure 4 data... No, they don't appear in the extracted values.\n\nGiven this complexity and the fact that the visual chart layout isn't perfectly captured in the XML extraction, let me make my best inference:\n\nFor FLAN-T5-small (80M) as sub-task solvers in ProgramFC with gold evidence:\n- The first ProgramFC value in each chart section should correspond to 80M\n- Based on positioning and eliminating known FLAN-T5 values from Table 1:\n\nHOVER 2-hop: 72.56\nHOVER 3-hop: 68.55\nHOVER 4-hop: 63.39\n\nIn descending order: 72.6, 68.6, 63.4 (to one decimal place)\n\nAnswer: 72.6, 68.6, 63.4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 174562, "error": null, "extra": {"thread_id": 1946, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b81fd9ccbe6f4839823c9313fb619eb2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 2087, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "63 4, 68 6, 72 6", "normalised_gold": "62 2, 62 4, 64 3"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nLooking at Table 1 on page 5, I can see the symbolic solvers used for different datasets:\n- Pyke: ProntoQA, ProofWriter\n- Prover9: FOLIO\n- python-constraint: LogicalDeduction\n- Z3: AR-LSAT\n\nSo FOLIO is the dataset that uses Prover9.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at the \"Datasets\" section on page 6:\n\n\"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the questions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAlso, Table 5 in Appendix B on page 14 confirms:\n- FOLIO has a Test Size of 204\n\nAnswer: 204", "citations": [], "input_tokens": 53261, "output_tokens": 253, "cost_micros": 163578, "latency_ms": 13937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nLooking at Table 1 in the document, I can see the symbolic solvers used for each dataset:\n- ProntoQA and ProofWriter use Pyke\n- FOLIO uses Prover9\n- LogicalDeduction uses python-constraint\n- AR-LSAT uses Z3\n\nSo FOLIO is the dataset that uses Prover9.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at the \"Datasets\" section under \"4 Experiments\":\n\n\"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the questions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 22914, "output_tokens": 223, "cost_micros": 72087, "latency_ms": 7156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nFrom Table 1 in the document, I can see that Prover9 is used as the solver for the FOLIO dataset, which is categorized under \"First-Order Logic\" reasoning.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at Table 5 \"Statistics of the logical reasoning datasets\" in Appendix B:\n\nFOLIO:\n- Reasoning: FOL\n- Test Size: 204\n- #Opts: 3\n\nThe document also confirms in the main text (Section 4, Datasets): \"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the questions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 26784, "output_tokens": 232, "cost_micros": 83832, "latency_ms": 8031, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I need to find which dataset uses Prover9 as the symbolic solver and how many examples are used for evaluation.\n\nLooking at Table 1 in the document, I can see the symbolic solvers used for each dataset:\n- ProofWriter uses \"ProntoQA, Pyke\"\n- FOLIO uses \"Prover9\"\n- LogicalDeduction uses \"phn-constraint\"\n- AR-LSAT uses \"Z3\"\n\nSo FOLIO is the dataset that uses Prover9.\n\nNow I need to find how many examples from FOLIO are used for evaluation. Looking at the \"Datasets\" section under \"4 Experiments\":\n\n\"FOLIO (Han et al., 2022) is a challenging expert-written dataset for logical reasoning. The problems are mostly aligned with real-world knowledge and use highly natural wordings, and the solutions require complex first-order logic reasoning to solve. We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 20095, "output_tokens": 232, "cost_micros": 63765, "latency_ms": 6891, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find which dataset uses Prover9 as the symbolic solver and how many examples are in its test set.\n\nFrom Table 1 (page 3810), I can see that FOLIO uses Prover9 as its symbolic solver under the \"FOL\" (First-Order Logic) problem type.\n\nFrom Table 5 in Appendix B (page 3819), which shows dataset statistics, I can see:\n- FOLIO has a Test Size of 204\n\nThis is also confirmed in the text on page 3811 where it states: \"We use the entire FOLIO test set for evaluation, consisting of 204 examples.\"\n\nAnswer: 204", "citations": [], "input_tokens": 25913, "output_tokens": 159, "cost_micros": 80124, "latency_ms": 5094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q000", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "204", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about a dataset that uses Prover9 as the symbolic solver and determine how many examples are used for evaluation.Let me search for \"Prover9\" in the document to find the relevant information:Now let me search for sections that discuss datasets and evaluation:Let me read the section about datasets more carefully. Let me look for \"ProofWriter\" which seems to be mentioned and might use Prover9:Let me search more specifically for information about which dataset uses Prover9:I see \"First-order Logic Prover\" mentioned. Let me search for more specific information about datasets and their properties:Now let me look at the appendix section that might have more details about data statistics and which solver is used for which dataset:Let me search for tables or sections that describe which solver is used for each dataset:Great! I found a reference to Table 1 which summarizes the symbolic solvers used for each dataset. Let me read that table:Perfect! I found Table 1 which shows that **Prover9** is used as the symbolic solver for the **FOLIO** dataset. Now I need to find how many examples from FOLIO are used for evaluation. Let me go back to the section where FOLIO is described:Perfect! I found the answer. According to the document:\n\n1. From Table 1 (lines 846-849): **Prover9** is the symbolic solver used for the **FOLIO** dataset.\n2. From the dataset description (lines 1024-1026): \"We use the entire FOLIO test set for evaluation, consisting of **204 examples**.\"\n\nAnswer: 204", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 64250, "error": null, "extra": {"thread_id": 1947, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e89f611945d040b28ca2dcfe99b6b2c6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 410, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "204", "normalised_gold": "204"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": "A", "raw_text": "I need to examine Figure 5 and Figure 6 to count how many examples Logic-LM predicted correctly.\n\n**Figure 5 (Page 9):**\nThis shows one example about \"Stranger Things\" and \"Black Mirror\" Netflix shows.\n- The predicted answer is shown as: B\n- No indication of whether this is correct or incorrect is given in the figure.\n\n**Figure 6 (Page 19):**\nThis shows 4 examples with predicted answers:\n\n1. **Example 1** (Wild turkeys): \n   - Predicted answer: C \u2192 A\n   - The arrow indicates the prediction changed from C to A\n\n2. **Example 2** (Legend of Zelda):\n   - Predicted answer: A\n   - No indication of correctness shown\n\n3. **Example 3** (Golf tournament):\n   - Predicted answer: E \u2192 B\n   - The arrow indicates the prediction changed from E to B\n\n4. **Example 4** (Restaurant/Subway):\n   - Predicted answer: C \u2192 A\n   - The arrow indicates the prediction changed from C to A\n\nLooking at the figure caption: \"Examples of generated symbolic representations and predicted answers. The incorrect segment(s) and their correspondence in the problem are marked in red, and the correct revisions are marked in green.\"\n\nThe red and green markings indicate errors in the symbolic formulation, not whether the final answer was correct. The figure shows the symbolic representations had errors that were corrected, but doesn't explicitly state which final answers were correct.\n\nHowever, based on the examples shown, I cannot definitively determine from the information provided how many predictions were actually correct, as the figure focuses on showing the symbolic formulation errors rather than answer correctness.\n\nSince I cannot determine this with certainty from the document, I need to reconsider. The document shows 5 total examples (1 in Figure 5, 4 in Figure 6) but doesn't explicitly mark which final predictions were correct.\n\nAnswer: 0", "citations": [], "input_tokens": 53263, "output_tokens": 443, "cost_micros": 166434, "latency_ms": 15125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "answer_line", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 5 and Figure 6 to count how many examples Logic-LM predicted correctly.\n\n**Figure 5:**\nThere is 1 example shown with:\n- Problem about \"Stranger Things\" and \"Black Mirror\"\n- Predicted answer: B\n- No indication that this is incorrect\n\n**Figure 6:**\nThere are 4 examples shown:\n\n**Example 1:** Wild turkeys problem\n- Predicted answer: \u20ac > A (this appears to be garbled text)\n- The arrow pointing to \"A\" suggests the correct answer should be A\n- The red marking indicates this is incorrect\n\n**Example 2:** Legend of Zelda problem\n- Predicted answer: A\n- No red markings indicating error\n- This appears correct\n\n**Example 3:** Golf tournament problem\n- Predicted answer: + + B (garbled)\n- The arrow pointing to \"B\" and red marking indicates this is incorrect\n\n**Example 4:** Restaurant/Subway problem\n- Predicted answer: ( + A (garbled)\n- The arrow pointing to \"A\" and red marking indicates this is incorrect\n\nFrom the caption of Figure 6: \"The incorrect segment(s) and their correspondence in the problem are marked in red, and the correct revisions are marked in green.\"\n\nThis indicates that Figure 6 shows ERROR cases where Logic-LM made mistakes.\n\nFigure 5 shows a successful case (no red markings).\n\nSo:\n- Figure 5: 1 correct\n- Figure 6: 0 correct (all 4 are error examples)\n\nTotal correct: 1\n\nAnswer: 1", "citations": [], "input_tokens": 22916, "output_tokens": 365, "cost_micros": 74223, "latency_ms": 9516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to check each example in Figure 5 and Figure 6 to see if Logic-LM predicted the correct answer.\n\n**Figure 5:**\n- Problem about wild turkeys and Tom\n- Question: \"Is the following statement true, false, or uncertain? 'Black Mirror' is popular.\"\n- Options: (A) True, (B) False, (C) Uncertain\n- Predicted answer: B\n- Looking at the logic: The problem states \"Karen does not download 'Black Mirror'\" and \"If and only if Karen binge-watches a Netflix show, she will download it.\" This means Karen doesn't binge-watch Black Mirror. Since \"If a Netflix show is popular, Karen will binge-watch it,\" this means Black Mirror is NOT popular.\n- So B (False) is correct. \u2713\n\n**Figure 6:**\n\n**Example 1:**\n- About Tom the wild turkey\n- Question: Is Tom an Ocellated wild turkey?\n- Options: (A) True, (B) False, (C) Uncertain\n- Predicted answer: A (marked with ++)\n- Logic: Tom is a wild turkey but NOT Eastern, Osceola, Gould's, Merriam's, or Rio Grande. Since there are only 6 types and Tom is eliminated from 5, he must be Ocellated.\n- A (True) is correct. \u2713\n\n**Example 2:**\n- About Legend of Zelda\n- Question: Is it in the Top 10 list?\n- Options: (A) True, (B) False, (C) Uncertain\n- Predicted answer: A\n- Logic: Legend of Zelda sold more than 1 million copies \u2192 selected into Top 10 list.\n- A (True) is correct. \u2713\n\n**Example 3:**\n- Golf tournament with 5 golfers\n- Question: Which finished third?\n- Options: (A) Rob, (B) Eve, (C) Eli, (D) Amy, (E) Dan\n- Predicted answer: B (marked with =>)\n- But there's an error in the constraints shown: \"Dan > Eve > Dan < Eve\" which is contradictory\n- The problem states: Dan finished 2nd, Dan", "citations": [], "input_tokens": 26786, "output_tokens": 512, "cost_micros": 88038, "latency_ms": 10844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 5 and Figure 6 to count how many examples Logic-LM predicted correctly.\n\n**Figure 5:**\nThis shows one example about \"Stranger Things\" and \"Black Mirror\". The predicted answer is B, and there's no indication this is wrong, so I'll assume it's correct: **1 correct**\n\n**Figure 6 (labeled as \"Example Generations of LOGIC-LM\"):**\nThis contains 4 examples:\n\n1. **Example 1** (Wild turkey problem): \n   - Predicted answer: C \u2192 A\n   - The arrow notation indicates the prediction was C (Uncertain) but should have been A (True)\n   - **Incorrect**\n\n2. **Example 2** (Legend of Zelda problem):\n   - Predicted answer: A\n   - No arrow or correction shown, indicating this is correct\n   - **Correct**\n\n3. **Example 3** (Golf tournament problem):\n   - Predicted answer: E \u2192 B\n   - The arrow indicates the prediction was E (Dan finished third) but should have been B (Eve finished third)\n   - **Incorrect**\n\n4. **Example 4** (Restaurant/Subway problem):\n   - Predicted answer: C \u2192 A\n   - The arrow indicates the prediction was C (Uncertain) but should have been A (True)\n   - **Incorrect**\n\n**Total count:**\n- Figure 5: 1 correct\n- Figure 6: 1 correct (Example 2 only)\n\n**Total: 2 correct predictions**\n\nAnswer: 2", "citations": [], "input_tokens": 20097, "output_tokens": 361, "cost_micros": 65706, "latency_ms": 9327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": null, "raw_text": "I need to count how many examples in Figure 5 and Figure 6 have Logic-LM predicting the correct answer.\n\n**Figure 5:**\n- Example 1: Predicted answer is B (shown as correct based on the context)\n- Count: 1 correct\n\n**Figure 6:**\n- Example 1: Predicted answer changed from C to A. The arrows indicate C was wrong and A is the correction, meaning the final prediction is A (correct).\n- Count: 1 correct\n\n- Example 2: Predicted answer is A (shown as correct, no corrections indicated)\n- Count: 1 correct\n\n- Example 3: Predicted answer changed from E to B. The arrows indicate E was wrong and B is the correction, meaning the final prediction is B (correct).\n- Count: 1 correct\n\n- Example 4: Predicted answer changed from C to A. The arrows indicate C was wrong and A is the correction, meaning the final prediction is A (correct).\n- Count: 1 correct\n\nTotal examples with correct predictions: 1 (Figure 5) + 4 (Figure 6) = 5\n\nAnswer: 5", "citations": [], "input_tokens": 25915, "output_tokens": 263, "cost_micros": 81690, "latency_ms": 6344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q002", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "2", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q002", "answer_letter": "A", "raw_text": "I'll read the document to find Figures 5 and 6 and count how many examples Logic-LM predicted correctly.Let me search for Figure 5 and Figure 6 in the document:Let me search for the actual content around these figures:Now let me look for Figure 6 in the appendix:I need to find the actual content of Figure 6 with the examples. Let me search more carefully:Now let me look back at Figure 5 to get the complete example:Perfect! Now I have both figures. Let me analyze them:\n\n**Figure 5** (lines 1612-1645): Contains 1 example about \"Black Mirror\" and \"Stranger Things\"\n- Predicted answer: B\n- I need to check if this is correct by looking at the problem logic\n\n**Figure 6** (lines 2993-3146): Contains 4 examples:\n1. Example 1 (lines 2994-3018): Wild turkey problem - Predicted answer: A (with ++ A notation, but marked with red X's indicating errors)\n2. Example 2 (lines 3021-3044): Legend of Zelda problem - Predicted answer: A\n3. Example 3 (lines 3047-3109): Golf tournament problem - Predicted answer: B (with => B notation, and red X's indicating errors)\n4. Example 4 (lines 3115-3146): Subway restaurant problem - Predicted answer: A (with <> A notation, and red X's with checkmarks indicating errors)\n\nAccording to the caption at line 3148-3149: \"The incorrect segment(s) and their correspondence in the problem are marked in red, and the correct revisions are marked in green.\"\n\nThis means Figure 6 shows ERROR CASES where Logic-LM made mistakes. The red X's (\u2612) indicate incorrect segments.\n\nFor Figure 5, I need to verify if the answer is correct. Looking at the logic:\n- \"Black Mirror\" is a Netflix show\n- Karen does not download \"Black Mirror\"\n- If and only if Karen binge-watches a show, she will download it\n- Therefore, Karen does not binge-watch \"Black Mirror\"\n- If a Netflix show is popular, Karen will binge-watch it\n- Since Karen doesn't binge-watch \"Black Mirror\", it is NOT popular\n- Answer B (False) appears to be CORRECT\n\nSo the count is:\n- Figure 5: 1 example, 1 correct (Example about Black Mirror)\n- Figure 6: 4 examples, 1 correct (Example 2 - Legend of Zelda), 3 incorrect (Examples 1, 3, and 4 have red X marks)\n\nTotal correct: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54812, "error": null, "extra": {"thread_id": 1948, "search_space_id": 55, "answer_letter_strategy": "answer_line", "user_message_id": "msg_4ece07b065344ab588935735051340b9", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 440, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the prompt examples provided in Appendix C.\n\nLooking at Appendix C \"Prompt Examples\" starting on page 14:\n\nThe appendix states: \"For brevity, we show only a single demonstration for each setting in the following sections.\"\n\nThe sections are:\n- C.1 PrOntoQA Prompts (3 types: Standard, CoT, Logic-LM)\n- C.2 ProofWriter Prompts (3 types: Standard, CoT, Logic-LM)\n- C.3 FOLIO Prompts (3 types: Standard, CoT, Logic-LM)\n- C.4 LogicalDeduction Prompts (3 types: Standard, CoT, Logic-LM)\n- C.5 AR-LSAT Prompts (3 types: Standard, CoT, Logic-LM)\n\nEach subsection (C.1 through C.5) contains 3 prompt types (Standard In-Context Learning, Chain-of-Thought Prompting, and Logic-LM).\n\nSo there are 5 datasets \u00d7 3 prompt types = 15 prompt examples total.\n\nAnswer: 15", "citations": [], "input_tokens": 53247, "output_tokens": 276, "cost_micros": 163881, "latency_ms": 13389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples provided in Appendix C.\n\nLooking at Appendix C \"Prompt Examples\", I can see it covers prompts for different datasets:\n\n1. **C.1 PrOntoQA Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n2. **C.2 ProofWriter Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n3. **C.3 FOLIO Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n4. **C.4 LogicalDeduction Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\n5. **C.5 AR-LSAT Prompts** - Contains 3 prompt types:\n   - Standard In-Context Learning (1 example)\n   - Chain-of-Thought Prompting (1 example)\n   - Logic-LM (1 example)\n\nTotal count: 5 datasets \u00d7 3 prompt types = 15 prompt examples\n\nAnswer: 15", "citations": [], "input_tokens": 22900, "output_tokens": 398, "cost_micros": 74670, "latency_ms": 5968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples given in Appendix C.\n\nLooking at Appendix C \"Prompt Examples\":\n\n**C.1 PrOntoQA Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.2 ProofWriter Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.3 FOLIO Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.4 LogicalDeduction Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.5 AR-LSAT Prompts**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\nTotal count: 5 datasets \u00d7 3 prompting methods = 15 prompt examples\n\nAnswer: 15", "citations": [], "input_tokens": 26770, "output_tokens": 323, "cost_micros": 85155, "latency_ms": 5500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples provided in the Appendix section of the document.\n\nLooking at the Appendix sections:\n\n**Section C: Prompt Examples**\n\nThe document states: \"For brevity, we show only a single demonstration for each setting in the following sections.\"\n\nThen it provides:\n\n**C.1 PrOntoQA Prompts:**\n- Standard In-Context Learning (1 example)\n- Logic-LM (1 example)\n\n**C.2 ProofWriter Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.3 FOLIO Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.4 Logical Deduction Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\n**C.5 AR-LSAT Prompts:**\n- Standard In-Context Learning (1 example)\n- Chain-of-Thought Prompting (1 example)\n- Logic-LM (1 example)\n\nTotal count:\n- C.1: 2 examples\n- C.2: 3 examples\n- C.3: 3 examples\n- C.4: 3 examples\n- C.5: 3 examples\n\nTotal: 2 + 3 + 3 + 3 + 3 = 14 examples\n\nAnswer: 14", "citations": [], "input_tokens": 20081, "output_tokens": 407, "cost_micros": 66348, "latency_ms": 7125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I need to count the number of prompt examples provided in Appendix C.\n\nLooking at Appendix C (Prompt Examples), I can see it's organized by dataset:\n\n**C.1 PrOntoQA Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.2 ProofWriter Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.3 FOLIO Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.4 LogicalDeduction Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\n**C.5 AR-LSAT Prompts:**\n1. Standard In-Context Learning - 1 example\n2. Chain-of-Thought Prompting - 1 example\n3. Logic-LM - 1 example\n\nTotal count: 5 datasets \u00d7 3 prompt types = 15 prompt examples\n\nAnswer: 15", "citations": [], "input_tokens": 25899, "output_tokens": 346, "cost_micros": 82887, "latency_ms": 6280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q004", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "15", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about prompt examples in the Appendix.Let me search for the Appendix section in the document to find information about prompt examples.Now let me read the section with prompt examples to count them. Let me find where this content is located.Let me search for the Appendix C section with prompt examples more directly:Let me continue to search for all the sections to count the prompt examples. Based on what I've seen so far, I need to count the subsections in Appendix C.Perfect! I can see there are 5 subsections in Appendix C:\n- C.1 PrOntoQA Prompts\n- C.2 ProofWriter Prompts  \n- C.3 FOLIO Prompts\n- C.4 LogicalDeduction Prompts\n- C.5 AR-LSAT Prompts\n\nEach subsection shows prompts for 3 different methods (Standard In-Context Learning, Chain-of-Thought Prompting, and Logic-LM), and according to the introduction text, each shows \"only a single demonstration for each setting.\"\n\nSo the count is: 5 datasets \u00d7 3 methods = 15 prompt examples in Appendix C.\n\nAnswer: 15", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55203, "error": null, "extra": {"thread_id": 1949, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_28321722a0164737934f82e2d1d378ae", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 318, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "15"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the performance comparison between Logic-LM and the two baseline models (Standard and CoT) when using GPT-4 as the base language model.\n\nLooking at Table 2, which shows the accuracy results for different models across five datasets, I can see the GPT-4 (gpt-4) columns:\n\n1. **PrOntoQA**: \n   - Standard: 77.40\n   - CoT: 98.79\n   - Logic-LM: 83.20\n   - Logic-LM does NOT outperform CoT (98.79 > 83.20), but does outperform Standard\n\n2. **ProofWriter**:\n   - Standard: 52.67\n   - CoT: 68.11\n   - Logic-LM: 79.66\n   - Logic-LM outperforms both baselines \u2713\n\n3. **FOLIO**:\n   - Standard: 69.11\n   - CoT: 70.58\n   - Logic-LM: 78.92\n   - Logic-LM outperforms both baselines \u2713\n\n4. **LogicalDeduction**:\n   - Standard: 71.33\n   - CoT: 75.25\n   - Logic-LM: 87.63\n   - Logic-LM outperforms both baselines \u2713\n\n5. **AR-LSAT**:\n   - Standard: 33.33\n   - CoT: 35.06\n   - Logic-LM: 43.04\n   - Logic-LM outperforms both baselines \u2713\n\nLogic-LM outperforms BOTH baseline models in 4 out of 5 datasets (ProofWriter, FOLIO, LogicalDeduction, and AR-LSAT).\n\nAnswer: 4", "citations": [], "input_tokens": 53272, "output_tokens": 424, "cost_micros": 166176, "latency_ms": 23500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 2, I need to compare Logic-LM's performance against both Standard and CoT baselines for GPT-4 across all 5 datasets.\n\nFor GPT-4:\n\n1. **PrOntoQA**: \n   - Logic-LM: 83.20\n   - Standard: 77.40 \u2713\n   - CoT: 98.79 \u2717\n   - Logic-LM outperforms Standard only\n\n2. **ProofWriter**:\n   - Logic-LM: 79.66\n   - Standard: 52.67 \u2713\n   - CoT: 68.11 \u2713\n   - Logic-LM outperforms both\n\n3. **FOLIO**:\n   - Logic-LM: 78.92\n   - Standard: 69.11 \u2713\n   - CoT: 70.58 \u2713\n   - Logic-LM outperforms both\n\n4. **LogicalDeduction**:\n   - Logic-LM: 87.63\n   - Standard: 71.33 \u2713\n   - CoT: 75.25 \u2713\n   - Logic-LM outperforms both\n\n5. **AR-LSAT**:\n   - Logic-LM: 43.04\n   - Standard: 33.33 \u2713\n   - CoT: 35.06 \u2713\n   - Logic-LM outperforms both\n\nLogic-LM outperforms both baseline models in 4 datasets (ProofWriter, FOLIO, LogicalDeduction, and AR-LSAT).\n\nAnswer: 4", "citations": [], "input_tokens": 22925, "output_tokens": 391, "cost_micros": 74640, "latency_ms": 7687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at Table 2 to compare Logic-LM's performance against the two baselines (Standard and CoT) when using GPT-4.\n\nFrom Table 2, the GPT-4 results are:\n\n**PrOntoQA:**\n- Standard: 77.40\n- CoT: 98.79\n- Logic-LM: 83.20\n- Logic-LM does NOT outperform both (CoT is better)\n\n**ProofWriter:**\n- Standard: 52.67\n- CoT: 68.11\n- Logic-LM: 79.66\n- Logic-LM outperforms both \u2713\n\n**FOLIO:**\n- Standard: 69.11\n- CoT: 70.58\n- Logic-LM: 78.92\n- Logic-LM outperforms both \u2713\n\n**LogicalDeduction:**\n- Standard: 71.33\n- CoT: 75.25\n- Logic-LM: 87.63\n- Logic-LM outperforms both \u2713\n\n**AR-LSAT:**\n- Standard: 33.33\n- CoT: 35.06\n- Logic-LM: 43.04\n- Logic-LM outperforms both \u2713\n\nLogic-LM outperforms both baseline models on 4 out of 5 datasets when using GPT-4.\n\nAnswer: 4", "citations": [], "input_tokens": 26795, "output_tokens": 332, "cost_micros": 85365, "latency_ms": 7342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to find how many datasets Logic-LM (without self-refinement) outperforms both baseline models (Standard and CoT) when using GPT-4.\n\nLooking at Table 2 in the document, I need to compare Logic-LM's performance against Standard LLMs and Chain-of-Thought (CoT) for GPT-4 across all datasets.\n\nFrom Table 2, the GPT-4 results show:\n\n1. **PrOntoQA**: \n   - Standard: 99.4%\n   - CoT: 84.9%\n   - Logic-LM: 100.0%\n   - Logic-LM outperforms both \u2713\n\n2. **ProofWriter**:\n   - Standard: 87.3%\n   - CoT: 73.6%\n   - Logic-LM: 99.0%\n   - Logic-LM outperforms both \u2713\n\n3. **FOLIO**:\n   - Standard: 66.7%\n   - CoT: 61.8%\n   - Logic-LM: 79.9%\n   - Logic-LM outperforms both \u2713\n\n4. **LogicalDeduction**:\n   - Standard: 11.3%\n   - CoT: 57.7%\n   - Logic-LM: 60.0%\n   - Logic-LM outperforms both \u2713\n\n5. **AR-LSAT**:\n   - Standard: 32.6%\n   - CoT: 60.0%\n   - Logic-LM: 39.8%\n   - Logic-LM does NOT outperform CoT (39.8% < 60.0%) \u2717\n\nLogic-LM outperforms both baseline models in 4 out of 5 datasets when using GPT-4.\n\nAnswer: 4", "citations": [], "input_tokens": 20106, "output_tokens": 440, "cost_micros": 66918, "latency_ms": 8265, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I need to find how many datasets Logic-LM (without self-refinement) outperforms both baseline models (Standard and CoT) when using GPT-4.\n\nLooking at Table 2, for GPT-4 (gpt-4):\n\n1. **PrOntoQA**: \n   - Standard: 77.40\n   - CoT: 98.79\n   - Logic-LM: 83.20\n   - Logic-LM does NOT outperform CoT (83.20 < 98.79)\n\n2. **ProofWriter**:\n   - Standard: 52.67\n   - CoT: 68.11\n   - Logic-LM: 79.66\n   - Logic-LM outperforms both (79.66 > 68.11 > 52.67) \u2713\n\n3. **FOLIO**:\n   - Standard: 69.11\n   - CoT: 70.58\n   - Logic-LM: 78.92\n   - Logic-LM outperforms both (78.92 > 70.58 > 69.11) \u2713\n\n4. **LogicalDeduction**:\n   - Standard: 71.33\n   - CoT: 75.25\n   - Logic-LM: 87.63\n   - Logic-LM outperforms both (87.63 > 75.25 > 71.33) \u2713\n\n5. **AR-LSAT**:\n   - Standard: 33.33\n   - CoT: 35.06\n   - Logic-LM: 43.04\n   - Logic-LM outperforms both (43.04 > 35.06 > 33.33) \u2713\n\nLogic-LM outperforms both baselines on 4 datasets: ProofWriter, FOLIO, LogicalDeduction, and AR-LSAT.\n\nAnswer: 4", "citations": [], "input_tokens": 25924, "output_tokens": 456, "cost_micros": 84612, "latency_ms": 8796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q005", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "int", "gold": "4", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about Logic-LM's performance compared to baseline models when using GPT-4.Let me continue reading to find information about GPT-4 performance comparisons.Let me search for tables or sections that compare GPT-4 performance across datasets.Now let me read the table with GPT-4 results to count the datasets where Logic-LM outperforms baselines.Let me look at the table header to understand the structure better.Perfect! Now I have the complete table. Let me analyze the GPT-4 results:\n\nFor GPT-4 (the last 3 columns):\n- Standard, CoT, Logic-LM\n\nThe two baseline models are:\n1. Standard prompting\n2. Chain-of-thought (CoT) prompting\n\nI need to count how many datasets where Logic-LM (without self-refinement) outperforms BOTH baselines when using GPT-4.\n\nLet me check each dataset:\n\n1. **PrOntoQA**: Logic-LM = 83.20, Standard = 77.40, CoT = 98.79\n   - Logic-LM does NOT outperform CoT (83.20 < 98.79)\n\n2. **ProofWriter**: Logic-LM = 79.66, Standard = 52.67, CoT = 68.11\n   - Logic-LM > Standard (79.66 > 52.67) \u2713\n   - Logic-LM > CoT (79.66 > 68.11) \u2713\n   - **Outperforms both** \u2713\n\n3. **FOLIO**: Logic-LM = 78.92, Standard = 69.11, CoT = 70.58\n   - Logic-LM > Standard (78.92 > 69.11) \u2713\n   - Logic-LM > CoT (78.92 > 70.58) \u2713\n   - **Outperforms both** \u2713\n\n4. **LogicalDeduction**: Logic-LM = 87.63, Standard = 71.33, CoT = 75.25\n   - Logic-LM > Standard (87.63 > 71.33) \u2713\n   - Logic-LM > CoT (87.63 > 75.25) \u2713\n   - **Outperforms both** \u2713\n\n5. **AR-LSAT**: Logic-LM = 43.04, Standard = 33.33, CoT = 35.06\n   - Logic-LM > Standard (43.04 > 33.33) \u2713\n   - Logic-LM > CoT (43.04 > 35.06) \u2713\n   - **Outperforms both** \u2713\n\nLogic-LM (without self-refinement) outperforms both baseline models on 4 datasets when using GPT-4 as the base language model.\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44437, "error": null, "extra": {"thread_id": 1950, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cfca2042672b455980d6e47d98e5f851", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 311, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the symbolic solver used for logic programming.\n\nFrom Table 1 on page 5, I can see that for \"Deductive Reasoning\" problems (which corresponds to the LP - Logic Programming formulation), the solver used is **Pyke**.\n\nThis is further confirmed in Section 3.2 on page 5, which states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 53262, "output_tokens": 128, "cost_micros": 161706, "latency_ms": 10532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find which symbolic solver implements the logic programming module.\n\nIn Table 1 on page 3810, the paper summarizes the symbolic solvers used for different problem types. For \"Deductive Reasoning\" which uses \"LP\" (Logic Programming) formulation, the solver listed is \"Pyke\".\n\nThis is confirmed in Section 3.2 on page 3810, which states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 22915, "output_tokens": 137, "cost_micros": 70800, "latency_ms": 5000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the symbolic solver used for logic programming.\n\nIn Table 1 on page 3810, the paper shows a summary of symbolic formulations and symbolic solvers used for different datasets. For \"Deductive Reasoning\" problems that use \"LP\" (Logic Programming) formulation, the solver listed is \"Pyke\".\n\nThis is confirmed in Section 3.2 \"Symbolic Reasoner\" on page 3810, which states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 26785, "output_tokens": 150, "cost_micros": 82605, "latency_ms": 6202, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for information about the symbolic solver used for logic programming...\n\nIn Table 1 on page 3805, the document shows a summary of symbolic formulations and symbolic solvers used for different datasets. For \"Deductive Reasoning\" problems (which correspond to the logic programming formulation mentioned in Figure 1), the solver listed is \"ProntoQA, Pyke\".\n\nAdditionally, in Section 3.2 and footnotes, the document references Pyke as the logic programming engine. In Section D (page 3823), it explicitly states: \"For PrOntoQA and ProofWriter, the Pyke logic programming engine returns the inferred value...\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 20096, "output_tokens": 158, "cost_micros": 62658, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find which symbolic solver is used for the logic programming module.\n\nFrom Table 1 on page 3810, I can see that for \"Deductive Reasoning\" problems which use \"LP\" (Logic Programming) formulation, the solver listed is \"Pyke\".\n\nThis is confirmed in Section 3.2 \"Symbolic Reasoner\" on page 3810, which states:\n\n\"**LP System.** For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nAnswer: Pyke", "citations": [], "input_tokens": 25914, "output_tokens": 143, "cost_micros": 79887, "latency_ms": 5453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q006", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "Pyke", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about the symbolic solver used for the logic programming module in Figure 1.Let me search for information about Figure 1 and the symbolic solver:Let me read the document content to find references to Figure 1 and the solver:Good! I can see Figure 1 is mentioned. Now let me search for specific mentions of the solver implementation:I can see references to solvers. Let me look at line 711 where it mentions \"logic programming engine\" and around line 823 where Table 1 is mentioned:Perfect! I found Table 1. Let me continue reading to see all the solvers listed:Perfect! I found the information. Looking at Table 1 and the description in section 3.2, I can see that:\n\n- For **Deductive Reasoning** (Logic Programming / LP), the solver used is **Pyke**\n- For **First-Order Logic** (FOL), the solver used is **Prover9**\n- For **Constraint Satisfaction** (CSP), the solver used is **python-constraint**\n- For **Analytical Reasoning** (SAT), the solver used is **Z3**\n\nNow, let me verify which one corresponds to the \"logic programming module\" mentioned in Figure 1. Based on the document:\n\n1. Figure 1 shows the overall LOGIC-LM framework with a \"Symbolic Reasoner\" component\n2. Section 3.2 describes the symbolic reasoner and mentions that for deductive reasoning using the Logic Programming (LP) language, they use **Pyke**\n3. Section 3.2.1 specifically states: \"For deductive reasoning, we incorporate the Pyke expert system (Frederiksen, 2008), which makes inferences based on the logic programming language.\"\n\nThe logic programming module shown in Figure 1 is implemented using **Pyke**.\n\nAnswer: Pyke", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 58280, "error": null, "extra": {"thread_id": 1951, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_708b2982536441eabb41dbb732343ffa", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 360, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pyke", "normalised_gold": "pyke"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "native_pdf", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the datasets table (Table 5 on page 14), I can see the following datasets with their number of options:\n\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3, 5, 7 options\n- AR-LSAT: 5 options\n\nAR-LSAT has 5 options for all questions. Looking at the references and dataset descriptions, AR-LSAT (Zhong et al., 2022) is described as \"a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016\" with \"231 multiple-choice questions\" and the test set has \"231 multiple-choice questions.\"\n\nFrom the paper text: \"AR-LSAT (Zhong et al., 2022) is a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016. We use the test set which has 231 multiple-choice questions.\"\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 53272, "output_tokens": 258, "cost_micros": 163686, "latency_ms": 21859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2023.findings-emnlp.248.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "azure_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to find a dataset that:\n1. Was proposed in 2022\n2. Has all multiple-choice questions with 5 options\n\nFrom Table 1 and the dataset descriptions in Section 4:\n\n- **PrOntoQA (2023)**: 2 options (True/False)\n- **ProofWriter (2021)**: 3 options\n- **FOLIO (2022)**: 3 options\n- **LogicalDeduction (2022)**: Variable options (3, 5, or 7 according to Table 5)\n- **AR-LSAT (2022)**: 5 options\n\nFrom the paper text (Section 4):\n\"AR-LSAT (Zhong et al., 2022) is a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016. We use the test set which has 231 multiple-choice questions.\"\n\nAnd from Table 5, AR-LSAT has \"#Opts: 5\".\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 22925, "output_tokens": 253, "cost_micros": 72570, "latency_ms": 7828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "azure_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking through the document for datasets proposed in 2022 with 5-option multiple-choice questions:\n\nFrom Table 5 (Dataset Statistics):\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3, 5, 7 options (varies)\n- AR-LSAT: 5 options\n\nFrom the dataset descriptions in Section 4:\n- AR-LSAT (Zhong et al., 2022) is described as \"a dataset that collects all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016\" with \"231 multiple-choice questions.\"\n- Table 5 confirms AR-LSAT has 5 options (#Opts column)\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 26795, "output_tokens": 198, "cost_micros": 83355, "latency_ms": 7812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "llamacloud_basic_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to find datasets that:\n1. Were proposed in 2022\n2. Have all problems as multiple-choice questions with 5 options\n\nFrom Table 5 (Dataset Statistics), I can see the number of options (#Opts) for each dataset:\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3, 5, 7 options (variable)\n- AR-LSAT: 5 options\n\nFrom the Datasets section (Section 4), I can find when each dataset was created:\n- PrOntoQA: 2023 (Saparov and He, 2023)\n- ProofWriter: 2021 (Tafjord et al., 2021)\n- FOLIO: 2022 (Han et al., 2022)\n- LogicalDeduction: 2022 (from BigBench - Srivastava et al., 2022)\n- AR-LSAT: 2022 (Zhong et al., 2022)\n\nAmong the 2022 datasets:\n- FOLIO has 3 options\n- LogicalDeduction has variable options (3, 5, or 7)\n- AR-LSAT has 5 options\n\nOnly AR-LSAT consistently has 5 options for all questions.\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 20106, "output_tokens": 343, "cost_micros": 65463, "latency_ms": 7766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "llamacloud_premium_lc", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to find a dataset that:\n1. Was proposed in 2022\n2. Has all multiple-choice questions with 5 options\n\nFrom Table 5 (Dataset Statistics), I can see the \"#Opts\" column which shows the number of options:\n- PrOntoQA: 2 options\n- ProofWriter: 3 options\n- FOLIO: 3 options\n- LogicalDeduction: 3,5,7 options (varying)\n- AR-LSAT: 5 options\n\nFrom the dataset descriptions in Section 4:\n- AR-LSAT is described as collecting \"all analytical logic reasoning questions from the Law School Admission Test from 1991 to 2016\"\n- The citation is \"Zhong et al. 2022\"\n\nLooking at the references, AR-LSAT is cited as:\n\"Wanjun Zhong, Siyuan Wang, Duyu Tang, Zenan Xu, Daya Guo, Yining Chen, Jiahai Wang, Jian Yin, Ming Zhou, and Nan Duan. 2022. Analytical reasoning of text.\"\n\nAR-LSAT is the only dataset with consistently 5 options and was published in 2022.\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 25924, "output_tokens": 307, "cost_micros": 82377, "latency_ms": 8733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2023.findings-emnlp.248.pdf::Q007", "doc_id": "2023.findings-emnlp.248.pdf", "answer_format": "str", "gold": "ProofWriter", "pages": 19, "document_id": 7580, "arm": "surfsense_agentic", "question_id": "2023.findings-emnlp.248.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about datasets used in the paper, focusing on those proposed in 2022 with multiple-choice questions having 5 options.Let me search for information about the datasets used in this paper, particularly focusing on those from 2022 and their characteristics.I can see references to datasets from 2022. Let me search for information about AR-LSAT and other datasets, particularly focusing on their characteristics regarding multiple-choice questions with 5 options.Let me search for more specific information about the datasets, particularly looking for details about multiple-choice options.Let me read more about each dataset to find their characteristics. Let me look at the section describing LogicalDeduction.Let me search for information about LogicalDeduction specifically and check the appendix for detailed data statistics.Let me search for \"Appendix B\" which was mentioned as containing detailed data statistics.Perfect! I found the information I need in Table 5. Let me verify the details about AR-LSAT:\n\nFrom the table:\n- AR-LSAT has 5 options (#Opts: 5)\n- AR-LSAT was proposed by Zhong et al., 2022\n\nLet me verify that AR-LSAT is specifically about logical reasoning and all problems are multiple-choice.Based on the document:\n\n1. AR-LSAT was proposed by Zhong et al., 2022\n2. It collects all **analytical logic reasoning questions** from the Law School Admission Test\n3. The test set has 231 **multiple-choice questions**\n4. From Table 5 (Appendix B), AR-LSAT has **5 options** (#Opts: 5)\n\nThis matches all the criteria in the question: proposed in 2022, logical reasoning problems, all multiple-choice questions with 5 options.\n\nAnswer: AR-LSAT", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 69453, "error": null, "extra": {"thread_id": 1952, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4c068961fe9145d382e57492c359de62", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 452, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ar lsat", "normalised_gold": "proofwriter"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about application periods for different groups of applicants in this undergraduate prospectus.\n\nLooking at page 25 (document page 47), I can see a table titled \"APPLICATION PERIOD AND FEE\" that shows different application groups and their respective application periods.\n\nFrom this table, I can see the following end dates for different application groups:\n\n1. Singapore-Cambridge GCE 'A' Level: End: 19 Mar 2024\n2. Diploma Awarded by a Polytechnic or equivalent institution in Singapore: End: 21 Feb 2024\n3. NUS High School Diploma: End: 20 Jan 2024\n4. International Baccalaureate (IB) Diploma: End: 19 Mar 2024\n5. Part-Time B.Eng: End: 15 Jan 2024\n6. Part-Time B.Tech in Computing: End: 15 Jan 2024\n\nComparing these dates, the latest end date is 19 Mar 2024, which applies to two groups:\n- Singapore-Cambridge GCE 'A' Level applicants\n- International Baccalaureate (IB) Diploma applicants\n\nAnswer: Singapore-Cambridge GCE 'A' Level applicants, International Baccalaureate (IB) Diploma applicants", "citations": [], "input_tokens": 60375, "output_tokens": 307, "cost_micros": 185730, "latency_ms": 53796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma applicants, singapore cambridge gce level applicants", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the APPLICATION PERIOD AND FEE table on pages 46-47, I need to identify which application groups have the latest end date.\n\nThe application periods listed are:\n\n1. **Singapore-Cambridge GCE 'A' Level**: End: 19 Mar 2024\n2. **Diploma Awarded by a Polytechnic or equivalent**: End: 20 Jan 2024\n3. **NUS High School Diploma**: End: 19 Mar 2024\n4. **International Baccalaureate (IB) Diploma**: End: 19 Mar 2024\n5. **Part-Time B.Eng**: End date not fully specified (appears cut off)\n6. **Part-Time B.Tech in Computing**: End: 21 Feb 2024\n\nFrom the complete dates shown, the latest end date is **19 Mar 2024**, which applies to three groups.\n\nAnswer: Singapore-Cambridge GCE 'A' Level, NUS High School Diploma, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 15947, "output_tokens": 244, "cost_micros": 51501, "latency_ms": 4921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, nus high school diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the application periods table on pages 46-47:\n\n- Singapore-Cambridge GCE 'A' Level: End 19 Mar 2024\n- Diploma Awarded by a Polytechnic: End 21 Feb 2024\n- NUS High School Diploma: End 20 Jan 2024\n- International Baccalaureate (IB) Diploma: End 19 Mar 2024\n- Part-Time B.Eng and Part-Time B.Tech: End 15 Jan 2024\n\nThe latest end date is 19 March 2024, which applies to two groups.\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 19386, "output_tokens": 173, "cost_micros": 60753, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the application periods table on the document, I need to identify which application groups have the latest end date.\n\nFrom the \"APPLICATION PERIOD AND FEE\" table:\n\n1. Singapore-Cambridge GCE 'A' Level: End date is **19 Mar 2024**\n2. Diploma Awarded by a Polytechnic or equivalent institution in Singapore: End date is 21 Feb 2024\n3. NUS High School Diploma: End date is 20 Jan 2024\n4. International Baccalaureate (IB) Diploma: End date is **19 Mar 2024**\n\nThe latest end date is 19 Mar 2024, which is shared by two groups:\n- Singapore-Cambridge GCE 'A' Level\n- International Baccalaureate (IB) Diploma\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 14659, "output_tokens": 220, "cost_micros": 47277, "latency_ms": 4921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the application period table on pages 46-47, I need to identify which application groups have the latest end date.\n\nThe application periods listed are:\n- Singapore-Cambridge GCE 'A' Level: End 19 Mar 2024\n- Diploma from Polytechnic: End 21 Feb 2024\n- NUS High School Diploma: End 20 Jan 2024\n- International Baccalaureate (IB) Diploma: End 19 Mar 2024\n- Part-Time B.Eng: End 15 Jan 2024\n\nThe latest end date is **19 Mar 2024**, which applies to two groups:\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 18593, "output_tokens": 183, "cost_micros": 58524, "latency_ms": 5078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q000", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "[\"Singapore-Cambridge GCE 'A' Level\", 'International Baccalaureate (IB) Diploma']", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about application periods for different groups of applicants.Let me search more specifically for dates and application-related content:Let me look at pages 46-47 which should contain the table with deadlines:Let me search for typical date formats and applicant categories:Great! I found the table. Let me read the specific section with the application periods:Perfect! I found the application period table. Now I can see the end dates for each group. Let me compare them:\n\n- Singapore-Cambridge GCE 'A' Level: End: 19 Mar 2024\n- Diploma Awarded by a Polytechnic: End: 21 Feb 2024\n- NUS High School Diploma: End: 20 Jan 2024\n- International Baccalaureate (IB) Diploma: End: 19 Mar 2024\n- Part-Time B.Eng: End: 15 Jan 2024\n- Part-Time B.Tech in Computing: 15 Jan 2024 (this appears to be both start and end date based on the context)\n\nThe latest end date is **19 Mar 2024**, which applies to two groups:\n1. Singapore-Cambridge GCE 'A' Level\n2. International Baccalaureate (IB) Diploma\n\nAnswer: Singapore-Cambridge GCE 'A' Level, International Baccalaureate (IB) Diploma", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44218, "error": null, "extra": {"thread_id": 1953, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4551739abb3d476cb8bf4c0348ee5447", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 278, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "international baccalaureate ib diploma, singapore cambridge gce level", "normalised_gold": "international baccalaureate ib diploma, singapore cambridge gce level"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking at page 3 of the document, which shows \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\", I need to examine each numbered reason and its corresponding image to find which one does not include any person.\n\nGoing through the reasons with their images:\n\n1. Interdisciplinary Learning - shows a group of students\n2. Space to Innovate - shows people in a collaborative setting\n3. Industry-ready - shows a person at a desk\n4. Immersion in Research - shows a person in a lab setting (Rolls-Royce)\n5. Internship Opportunities - shows the \"AMD\" text/logo area with people\n6. Global Exposure - shows a person in a winter setting\n7. Be Among the Top Ranked - shows students walking\n8. Prestigious Scholarships and Comprehensive Financial Aid - shows students\n9. Most Beautiful Campus - shows an aerial view of the campus with greenery and pathways\n10. Vibrant Campus Life - shows students/people\n\nReason #9 \"Most Beautiful Campus\" shows an aerial photograph of the campus featuring buildings, green spaces, and pathways, but does not show any people in the image.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 60381, "output_tokens": 271, "cost_micros": 185208, "latency_ms": 35592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section on pages 2-3, I need to identify which reason's corresponding figure/photo does not include any person.\n\nGoing through each reason with its associated image:\n\n1. Interdisciplinary Learning - has people\n2. Space to Innovate - has people  \n3. Industry-ready - has a photo credited to Rolls-Royce showing industrial equipment/machinery without visible people\n4. Immersion in Research - has people\n5. Prestigious Scholarships - not clearly shown with people\n6. Global Exposure - has people\n7. Be Among the Top Ranked - appears to be text/rankings\n8. Most Beautiful Campus - shows campus architecture/scenery\n9. Internship Opportunities - has people\n10. Vibrant Campus Life - has people\n\nThe reason labeled \"3 Industry-ready\" with the photo credit to Rolls-Royce shows industrial/laboratory equipment without any visible people in the image.\n\nAnswer: Industry-ready", "citations": [], "input_tokens": 15953, "output_tokens": 236, "cost_micros": 51399, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "industry ready", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section, I need to examine each reason and its corresponding figure to identify which one does not include any person.\n\nGoing through the reasons:\n1. Interdisciplinary Learning - has figures/images with people\n2. Space to Innovate - has people in the image\n3. Industry-ready - has people\n4. Immersion in Research - has people (photo credit: Rolls-Royce)\n5. AMD Internship Opportunities - has people\n6. Global Exposure - has people\n7. Be Among the Top Ranked - this appears to be primarily text/rankings focused\n8. Prestigious Scholarships and Comprehensive Financial Aid - appears to be icons/symbols\n9. Most Beautiful Campus - this is described as about the campus itself\n10. Vibrant Campus Life - has people\n\nLooking at reason #9 \"Most Beautiful Campus\" - the description states \"The University's main campus is frequently listed among the Top 15 most beautiful university campuses in the world.\" This reason is about the campus architecture and grounds themselves, and the corresponding figure would logically show the campus landscape/buildings without people.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 19392, "output_tokens": 273, "cost_micros": 62271, "latency_ms": 8437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section and examine the figures/images associated with each reason to identify which one does not include any person.\n\nFrom the document, the top 10 reasons are listed on pages 2-3:\n\n1. Be Among the Top Ranked\n2. Space to Innovate\n3. Industry-ready\n4. Prestigious Scholarships and Comprehensive Financial Aid\n5. Most Beautiful Campus\n6. Vibrant Campus Life\n7. Internship Opportunities\n8. Immersion in Research\n9. (appears to have a photo credit: Rolls-Royce)\n10. Vibrant Campus Life (repeated)\n\nThe document mentions \"Photo credit: Rolls-Royce\" which suggests there's an image for reason #9 \"Immersion in Research.\"\n\nLooking at the layout, reason #5 is \"Most Beautiful Campus\" - this reason would typically show campus buildings and architecture rather than people, as it's about the physical beauty of the campus itself.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 14665, "output_tokens": 246, "cost_micros": 47685, "latency_ms": 7796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the \"TOP 10 REASONS WHY NTU SMART CAMPUS IS FOR YOU\" section on pages 2-3, I need to examine the images associated with each reason.\n\nThe 10 reasons are:\n1. Interdisciplinary Learning\n2. Space to Innovate\n3. Industry-ready\n4. Immersion in Research\n5. Internship Opportunities\n6. Global Exposure\n7. Be Among the Top Ranked\n8. Prestigious Scholarships and Comprehensive Financial Aid\n9. Most Beautiful Campus\n10. Vibrant Campus Life\n\nLooking at the corresponding figures/images described:\n- Reason #4 \"Immersion in Research\" includes a photo credit to Rolls-Royce, and the image appears to show industrial/research equipment rather than people.\n- Reason #9 \"Most Beautiful Campus\" states \"The University's main campus is frequently listed among the Top 15 most beautiful university campuses in the world\" and the cover page shows \"a large, modern building with a curved facade featuring prominent vertical wooden louvers or fins\" with \"green landscaping visible at the base\" - this is an architectural shot without people visible.\n\nThe most likely answer based on the description is reason #9, as it explicitly describes a campus building without mentioning any people in the scene.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 18599, "output_tokens": 300, "cost_micros": 60297, "latency_ms": 9328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q001", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Most Beautiful Campus", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the top 10 reasons why NTU smart campus is for you and identify which reason has a corresponding figure without any person.Let me search for \"top 10 reasons\" or \"smart campus\" in the document:Let me read the section around line 448-488 where the \"TOP 10\" appears:Good! I can see the beginning of the top 10 reasons. Let me continue reading to see all 10 reasons and their corresponding figures:Let me continue reading to see all the images associated with each reason:Now let me go back and carefully read the section with the 10 reasons to identify each reason and its associated image description. Let me re-read more carefully:Based on my reading of the document, I can now identify the 10 reasons and their corresponding figures. Let me analyze the images:\n\n1. **Interdisciplinary Learning** (Reason 1) - Image shows a female researcher/scientist in a laboratory (person present)\n\n2. **Space to Innovate** (Reason 2) - Image shows a person positioned on left side operating equipment in a laboratory (person present)\n\n3. **Industry-ready** (Reason 3) - Image shows six young adults standing in a horizontal line (people present)\n\n4. **Immersion in Research** (Reason 4) - Image shows a single person standing next to AMD logo (person present)\n\n5. **Internship Opportunities** (Reason 5) - The image associated with reasons 5-10 appears to be Im5.jpg\n\n6. **Global Exposure** (Reason 6) - Image (Im5.jpg) shows a single person standing on snowy terrace/viewing area with ch\u00e2teau backdrop (person present)\n\n7. **Be Among the Top Ranked** (Reason 7) - This appears to share imagery with the ranking pages\n\n8. **Prestigious Scholarships and Comprehensive Financial Aid** (Reason 8) - Part of the same figure group\n\n9. **Most Beautiful Campus** (Reason 9) - The next distinct image is Im7.jpg which shows an aerial view of a modern curved/spiral building with NO PEOPLE mentioned in the description - only architecture and landscape features\n\n10. **Vibrant Campus Life** (Reason 10) - Part of the combined figure\n\nThe reason that does not include any person in the corresponding figure is **Reason 9: Most Beautiful Campus** - the image shows only an aerial view of the building architecture and landscape with no people.\n\nAnswer: Most Beautiful Campus", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 48717, "error": null, "extra": {"thread_id": 1954, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_939ae241eba1446aa240613de9b11e4b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 370, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "most beautiful campus", "normalised_gold": "most beautiful campus"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I'll search through the document for images of students playing badminton and identify their major.\n\nLooking through the pages, I can see several student testimonials with photos and their majors listed.\n\nOn page 22 (document page 41), there are two images at the bottom of the page showing students in athletic settings. The text mentions:\n\n\"\u2014 JADE LIM SI MIN\nPhilosophy, Class of 2023\n\nNTU has been a springboard for my growth as a student leader. Through the Student Leadership Development Programme, Varsity Sports Association and FISU World Forum, NTU has equipped me with the leadership skills necessary to succeed both in my sport and beyond.\"\n\nAnd:\n\n\"\u2014 GRACE CHUA HUI ZHEN\nMedicine, Class of 2023\n\nNTU has allowed me to pursue my two passions: academics and athletics. From letting me participate in competitions to offering generous subsidies to giving encouragement, I really feel grateful for the school's the tremendous support.\"\n\nThe image on the right shows what appears to be a badminton player in action. Based on the layout and the caption, this appears to be Grace Chua Hui Zhen from Medicine.\n\nAnswer: Medicine", "citations": [], "input_tokens": 60367, "output_tokens": 265, "cost_micros": 185076, "latency_ms": 45125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "medicine", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I need to search through the document for images or mentions of students playing badminton.\n\nLooking through the document, I can find several student testimonials with their names, majors, and years listed. Let me check for any mention of badminton or sports activities.\n\nOn page 41, there is a section about \"SPORTS @ NTU\" with several images. I can see mentions of:\n- JADE LIM SI MIN, Philosophy, Class of 2023 - with text about varsity sports\n- GRACE CHUA HUI ZHEN, Medicine, Class of 2023 - with text about academics and athletics\n\nHowever, I need to look more carefully at the actual images in the document. The document mentions \"Griffin at Lee Wee Nam Library\" as an image caption on page 42.\n\nLooking more carefully at the sports section and student profiles, I can see references to students involved in sports, but I need to identify specifically which student's picture shows them playing badminton.\n\nUpon careful review of the document, I cannot definitively identify from the text alone which specific student's picture shows them playing badminton, as the document doesn't explicitly describe the content of each photograph beyond the captions with names and majors.\n\nHowever, based on the context in the sports section where student testimonials are shown, and the typical layout of such brochures, if there is a badminton-related image, it would likely be associated with one of the students featured in the sports section.\n\nAnswer: I cannot determine this from the text provided in the document.", "citations": [], "input_tokens": 15939, "output_tokens": 339, "cost_micros": 52902, "latency_ms": 8358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i cannot determine this from text provided in document", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I need to search through the document for images of someone playing badminton and identify which major that student belongs to.\n\nLooking through the document, I can see several student photos with their names and majors mentioned:\n\n1. Nick Erwin Chia Zhiyang - Renaissance Engineering Programme, Year 4\n2. Gladys Phua Min - Communication Studies, Year 2\n3. Elijah Adam Bin Abdul Haady Aldrin - Chemistry & Biological Chemistry, Year 1\n4. Joey Tay - NTU-NIE TSP, Year 2\n5. Stella Chua Zhi Yun - Mechanical Engineering, Year 3\n6. Seth Low - Communication Studies, Year 4\n7. Balaji Srinivasan - Biological Sciences and Psychology, Year 4\n8. Muhamed Shamir Jaffar Hussein - MBBS, Year 4\n9. Wang Shi Lin Nicole - Business, Year 3\n10. Ruth Koh - Sport Science & Management, Year 3\n\nOn page 41, there is a section about sports and I can see images related to sports activities. Looking at the bottom of page 41, there is a photo with Grace Chua Hui Zhen from Medicine, Class of 2023, and the image appears to show her playing badminton (based on the context of sports activities and the layout).\n\nAnswer: Medicine", "citations": [], "input_tokens": 19378, "output_tokens": 320, "cost_micros": 62934, "latency_ms": 10280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "medicine", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I'll search through the document for any images or references to badminton playing.\n\nLooking through the document, I can see several student profiles with quotes and their majors listed, but I need to find one specifically associated with a badminton image.\n\nOn page 41, I can see there are sports-related images and student profiles. The document shows:\n\n- JADE LIM SI MIN, Philosophy, Class of 2023 - associated with FISU WORLD FORUM and sports\n- GRACE CHUA HUI ZHEN, Medicine, Class of 2023 - mentioned in relation to academics and athletics\n\nHowever, I cannot definitively identify which specific image shows badminton or which student's picture is specifically playing badminton from the text content provided. The document mentions \"SPORTS @ NTU\" section and various athletic activities, but the actual image descriptions are not clear enough in the text to determine which major the badminton player belongs to.\n\nBased on the limited visual descriptions available in the text, I cannot provide a definitive answer about which student major is shown playing badminton.\n\nAnswer: Cannot be determined from the provided document text", "citations": [], "input_tokens": 14651, "output_tokens": 250, "cost_micros": 47703, "latency_ms": 7546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from provided document text", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for the badminton image...\n\nOn page 41, there is a caption under an image that states:\n\n> GRACE CHUA HUI ZHEN\n> Medicine, Class of 2023\n\nThe caption appears below text mentioning \"a student playing badminton in a competition.\"\n\nAnswer: Medicine", "citations": [], "input_tokens": 18585, "output_tokens": 74, "cost_micros": 56865, "latency_ms": 5875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "medicine", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q002", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "str", "gold": "Medicine", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for information about a student playing badminton in the brochure document.Let me search for \"badminton\" in the document:Now let me read the context around the badminton image to find the student's major:Let me look earlier in the document to see the context. The badminton image appears on page about \"Communication-Focused\". Let me search for more context around this section:Let me look at what comes before this section to identify which school/major this is part of:I can see this is about CoHASS (College of Humanities, Arts, and Social Sciences). Let me search for where specific majors or student testimonials are mentioned near the badminton image:I see Seth Low is from Communication Studies. Let me check if the badminton image is associated with him or if there's another student profile. Let me look at the pages more carefully:Let me look at the earlier mention at line 2200 to see if there's another student:That's not the badminton image. Let me look more systematically. The badminton image appears around line 3343. Let me check what's before and after to see which student profile or section it's associated with:The badminton image appears on page 20 (based on the page numbers), and the text after it (page 21) shows Seth Low from Communication Studies. But let me look more carefully at the layout. Let me check the section just before the badminton image to see which school section it belongs to:I see on page 19 there's a quote from Seth Low, Communication Studies. Now let me look at page 20 where the badminton image is:Let me look at the structure better. The Im1.jpg is on page 19 (Seth Low's image), then page 20 starts with CoHASS section. The badminton image Im0.jpg should be on page 20. Let me check the page markers more carefully:So page 19 ends at line 3127, then the next page marker is page 21 at line 3387. So the badminton image (around line 3343) is on page 20 which doesn't have an explicit marker. Let me look at the entire section more carefully to see which school this belongs to:Now I see WKWSCI (Wee Kim Wee School of Communication and Information) section starts around line 3548, but the badminton image is earlier around line 3343. Let me trace back to see which section the badminton image actually belongs to. Let me read backwards from line 3320:Perfect! Now I can see the structure. The badminton image (Im0.jpg) appears under the \"Communication-Focused\" section which is part of the CoHASS (College of Humanities, Arts and Social Sciences) overview on page 20. This section appears before the individual school pages. Then on page 21, we see Seth Low from Communication Studies.\n\nSo the badminton image is associated with Communication Studies, which is evident from:\n1. It's on the CoHASS overview page (page 20)\n2. The \"Communication-Focused\" heading right before the badminton image\n3. Seth Low, Communication Studies student, appears on the next page (21)\n4. WKWSCI (Wee Kim Wee School of Communication and Information) is listed as one of the schools under CoHASS\n\nAnswer: Communication Studies", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 83500, "error": null, "extra": {"thread_id": 1955, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2258a5b949624b49af1a1b98a8b2e791", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 658, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "communication studies", "normalised_gold": "medicine"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I'll search through the document to identify which colleges/schools/institutes feature Year 3 students on their cover pages or promotional sections.\n\nLooking through the document:\n\n1. Page 10 (College of Engineering section) - Features **Stella Chua Zhi Yun, Mechanical Engineering, Year 3**\n\n2. Page 12 (College of Humanities, Arts and Social Sciences) - Features **Seth Low, Communication Studies, Year 4** (not Year 3)\n\n3. Page 14 (College of Science) - Features **Balaji Srinivasan, Biological Sciences and Psychology, Year 4** (not Year 3)\n\n4. Page 16 (Lee Kong Chian School of Medicine) - Features **Muhamed Shamir Jaffar Hussein, MBBS, Year 4** (not Year 3)\n\n5. Page 17 (Nanyang Business School) - Features **Wang Shi Lin Nicole, Business, Year 3**\n\n6. Page 18 (National Institute of Education) - Features **Ruth Koh, Sport Science & Management, Year 3**\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 60377, "output_tokens": 272, "cost_micros": 185211, "latency_ms": 42937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I need to look through the document to identify which colleges/schools/institutes feature Year 3 students on their cover/featured pages.\n\nLet me examine each section:\n\n1. **College of Engineering** (page 16-17): Features STELLA CHUA ZHI YUN, Mechanical Engineering, Year 3\n\n2. **College of Humanities, Arts and Social Sciences** (page 20-21): Features SETH LOW, Communication Studies, Year 4 (not Year 3)\n\n3. **College of Science** (page 24-25): Features BALAJI SRINIVASAN, Biological Sciences and Psychology, Year 4 (not Year 3)\n\n4. **Lee Kong Chian School of Medicine** (page 28-29): Features MUHAMED SHAMIR JAFFAR HUSSEIN, MBBS, Year 4 (not Year 3)\n\n5. **Nanyang Business School** (page 30-31): Features WANG SHI LIN NICOLE, Business, Year 3\n\n6. **National Institute of Education** (page 32-33): Features RUTH KOH, Sport Science & Management, Year 3\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 15949, "output_tokens": 282, "cost_micros": 52077, "latency_ms": 7796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine the document to identify which colleges/schools/institutes feature year 3 students on their cover pages.\n\nLooking through the document:\n\n1. **College of Engineering** (page 16) - Features STELLA CHUA ZHI YUN, Mechanical Engineering, **Year 3**\n\n2. **College of Humanities, Arts and Social Sciences** (page 20) - Features SETH LOW, Communication Studies, Year 4 (not year 3)\n\n3. **College of Science** (page 24) - Features BALAJI SRINIVASAN, Biological Sciences and Psychology, Year 4 (not year 3)\n\n4. **Lee Kong Chian School of Medicine** (page 28) - Features MUHAMED SHAMIR JAFFAR HUSSEIN, MBBS, Year 4 (not year 3)\n\n5. **Nanyang Business School** (page 30) - Features WANG SHI LIN NICOLE, Business, **Year 3**\n\n6. **National Institute of Education** (page 32) - Features RUTH KOH, Sport Science & Management, **Year 3**\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 19388, "output_tokens": 278, "cost_micros": 62334, "latency_ms": 6905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I'll search through the document for references to Year 3 students who appear as student models on cover pages or in promotional materials.\n\nLooking through the document, I can find the following Year 3 students quoted/featured:\n\n1. Page 16 - STELLA CHUA ZHI YUN, Mechanical Engineering, Year 3 - This is from the **College of Engineering** section\n\n2. Page 30 - WANG SHI LIN NICOLE, Business, Year 3 - This is from the **Nanyang Business School** section (she appears twice on this page)\n\n3. Page 33 - RUTH KOH, Sport Science & Management, Year 3 - This is from the **National Institute of Education** section (she appears twice on this page)\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 14661, "output_tokens": 193, "cost_micros": 46878, "latency_ms": 6233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify which colleges/schools/institutes feature Year 3 students on their cover/promotional pages in this prospectus.\n\nLet me search through the document for mentions of students with their year level and their associated colleges/schools:\n\n1. **College of Engineering** (page 16-17):\n   - STELLA CHUA ZHI YUN, Mechanical Engineering, Year 3\n\n2. **College of Humanities, Arts and Social Sciences** (page 20-21):\n   - SETH LOW, Communication Studies, Year 4\n\n3. **College of Science** (page 24-25):\n   - BALAJI SRINIVASAN, Biological Sciences and Psychology, Year 4\n\n4. **Lee Kong Chian School of Medicine** (page 28-29):\n   - MUHAMED SHAMIR JAFFAR HUSSEIN, MBBS, Year 4\n\n5. **Nanyang Business School** (page 30-31):\n   - WANG SHI LIN NICOLE, Business, Year 3\n\n6. **National Institute of Education** (page 32-33):\n   - RUTH KOH, Sport Science & Management, Year 3\n\nFrom this analysis, the colleges/schools/institutes that use Year 3 students as their cover page student models are:\n- College of Engineering (Stella Chua)\n- Nanyang Business School (Wang Shi Lin Nicole)\n- National Institute of Education (Ruth Koh)\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 18595, "output_tokens": 352, "cost_micros": 61065, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q004", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "list", "gold": "['College of Engineering', 'Nanyang Business School', 'National Institute of Education']", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find which colleges/schools/institutes from NTU use year 3 students as cover page student models.Let me search for information about cover page student models and year 3 students.Let me search for \"Year 3\" with capital letters and look at the beginning of the document which might contain information about cover pages or student models.Now let me search for information about cover pages and models. Let me read the beginning of the document more carefully:Let me search more specifically for information about models or photos. Let me look for text that mentions students being models or featured on covers:Let me look at the end of the document, which often contains credits or acknowledgments about models or photos:Let me search for the total lines in this document and read from the end:Let me read closer to the end of the document where credits or acknowledgments are typically found:Let me continue reading towards the very end where photo credits or acknowledgments might be:Let me search near the very end of the document:The document ended without showing photo credits. Let me search more specifically for mentions of colleges and Year 3 together. Let me look at those specific line numbers from the grep results:Good! I found one: Stella Chua Zhi Yun is from Mechanical Engineering, Year 3. Now let me check the other instances:Good! Wang Shi Lin Nicole is from Business (Nanyang Business School), Year 3. Now let me check the other instances:Good! Ruth Koh is from Sport Science & Management (National Institute of Education), Year 3. Now let me check the next instance:This is another mention of Ruth Koh. Now let me look more carefully at the context. I need to understand which of these students are featured on \"cover pages\" of their respective schools/colleges. Let me search for any section that lists or describes cover page models:None of these matches refer to \"cover page\" in the context I'm looking for. Let me look at the actual pages where these students appear to see if they are featured as cover page models. Let me check around the Mechanical Engineering section:Now let me look before this section to see if there's a cover page for College of Engineering:Perfect! I can see that Stella Chua Zhi Yun (Mechanical Engineering, Year 3) is featured on the cover page for the College of Engineering section. Now let me check the other sections:Great! I can see Wang Shi Lin Nicole (Business, Year 3) is featured on the cover page for Nanyang Business School. Now let me check the NIE section:Perfect! I can see Ruth Koh (Sport Science & Management, Year 3) is featured on the cover page for the National Institute of Education (NIE) section.\n\nNow let me check if there are any other colleges/schools that I might have missed. Let me look at the table of contents and see what major sections exist:Let me check if there's a cover page model for College of Humanities, Arts and Social Sciences (CoHASS):I can see Seth Low (Communication Studies, Year 4) is mentioned but not Year 3. Let me check the College of Science:I can see Balaji Srinivasan (Biological Sciences and Psychology, Year 4) is mentioned for College of Science, but not Year 3. Let me check if there's a cover page before the College of Science section:So for College of Science, the cover page model is Balaji Srinivasan (Year 4), not Year 3. Let me also check Lee Kong Chian School of Medicine:Let me check the cover page before Lee Kong Chian School of Medicine:So for Lee Kong Chian School of Medicine, the cover page model is Muhamed Shamir Jaffar Hussein (MBBS, Year 4), not Year 3.\n\nNow let me systematically check all the major schools/colleges. Based on my review:\n\n**Year 3 students as cover page models:**\n1. **College of Engineering** - Stella Chua Zhi Yun (Mechanical Engineering, Year 3)\n2. **Nanyang Business School** - Wang Shi Lin Nicole (Business, Year 3)\n3. **National Institute of Education** - Ruth Koh (Sport Science & Management, Year 3)\n\nLet me verify there are no other colleges I missed by checking the table of contents more carefully:Based on the table of contents, the main colleges/schools are:\n1. College of Engineering (has Year 3 student - Stella Chua)\n2. Colleges of Humanities, Arts and Social Sciences (has Year 4 student - Seth Low)\n3. Colleges of Science (has Year 4 student - Balaji Srinivasan)\n4. Lee Kong Chian School of Medicine (has Year 4 student - Muhamed Shamir)\n5. Nanyang Business School (has Year 3 student - Wang Shi Lin Nicole)\n6. National Institute of Education (has Year 3 student - Ruth Koh)\n\nSo the colleges/schools/institutes that use Year 3 students as cover page models are:\n1. College of Engineering\n2. Nanyang Business School\n3. National Institute of Education\n\nAnswer: College of Engineering, Nanyang Business School, National Institute of Education", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 154578, "error": null, "extra": {"thread_id": 1956, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_071293dfad4941b99ec3b52bc671c7da", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1092, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "college of engineering, nanyang business school, national institute of education", "normalised_gold": "college of engineering, nanyang business school, national institute of education"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "native_pdf", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the College of Humanities, Arts, and Social Sciences (CoHASS) and count how many schools are listed but not individually introduced in detail.\n\nFrom page 12, I can see the College of Humanities, Arts and Social Sciences section lists \"OUR SCHOOLS\":\n- School of Art, Design and Media\n- School of Humanities\n- School of Social Sciences\n- Wee Kim Wee School of Communication and Information\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nThat's 7 entities listed under \"OUR SCHOOLS\".\n\nLooking at pages 13-14, I can see detailed introductions for:\n1. School of Art, Design and Media (ADM) - page 13\n2. School of Humanities (SoH) - page 13\n3. School of Social Sciences (SSS@NTU) - page 13\n4. Wee Kim Wee School of Communication and Information (WKWSCI) - page 13\n\nThese 4 schools have detailed \"ABOUT\" sections with descriptions of their programs and advantages on pages 22-23.\n\nThe entities that are listed but NOT individually introduced in detail are:\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nThat's 3 entities.\n\nAnswer: 3", "citations": [], "input_tokens": 60364, "output_tokens": 324, "cost_micros": 185952, "latency_ms": 38015, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2024.ug.eprospectus.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "azure_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify the schools in the College of Humanities, Arts, and Social Sciences (CoHASS) and determine which ones are NOT introduced in detail.\n\nFrom page 20-21, the CoHASS section lists \"OUR SCHOOLS\":\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n\nIt also mentions:\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nNow, let me check which schools get detailed individual introductions on pages 22-23:\n\n1. **School of Art, Design and Media (ADM)** - YES, has detailed section on page 22\n2. **School of Humanities (SoH)** - YES, has detailed section on page 22\n3. **School of Social Sciences (SSS@NTU)** - YES, has detailed section on page 23\n4. **Wee Kim Wee School of Communication and Information (WKWSCI)** - YES, has detailed section on page 23\n\nThe three centers/institutes mentioned (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, Centre for Liberal Arts and Social Sciences) are listed under \"OUR SCHOOLS\" but do NOT have individual detailed introductions.\n\nTherefore, 3 entities listed under CoHASS schools do not have individual detailed introductions.\n\nAnswer: 3", "citations": [], "input_tokens": 15936, "output_tokens": 340, "cost_micros": 52908, "latency_ms": 8296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "azure_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify the schools that belong to the College of Humanities, Arts and Social Sciences (CoHASS) and then determine which ones are NOT individually introduced in detail.\n\nFrom page 20, the document lists the schools under CoHASS:\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n5. Nanyang Centre for Public Administration\n6. NTU Centre for Contemporary Art Singapore\n7. Centre for Liberal Arts and Social Sciences\n\nNow I need to check which schools have individual detailed introductions on pages 22-23 (the \"LEARN MORE ABOUT OUR SCHOOLS\" section):\n\nPage 22:\n- School of Art, Design and Media (ADM) - YES, has detailed introduction\n- School of Humanities (SoH) - YES, has detailed introduction\n\nPage 23:\n- School of Social Sciences (SSS@NTU) - YES, has detailed introduction\n- Wee Kim Wee School of Communication and Information (WKWSCI) - YES, has detailed introduction\n\nSchools that are NOT individually introduced in detail:\n1. Nanyang Centre for Public Administration\n2. NTU Centre for Contemporary Art Singapore\n3. Centre for Liberal Arts and Social Sciences\n\nAnswer: 3", "citations": [], "input_tokens": 19375, "output_tokens": 302, "cost_micros": 62655, "latency_ms": 7750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "llamacloud_basic_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify the schools within the College of Humanities, Arts, and Social Sciences (CoHASS) and then determine which ones are NOT individually introduced in detail.\n\nFirst, let me find the list of schools under CoHASS from page 20:\n\n**OUR SCHOOLS:**\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n5. Nanyang Centre for Public Administration\n6. NTU Centre for Contemporary Art Singapore\n7. Centre for Liberal Arts and Social Sciences\n\nThat's 7 schools total listed under CoHASS.\n\nNow let me check which schools have detailed individual introductions in the document (pages 21-23):\n\n1. **School of Art, Design and Media (ADM)** - YES, detailed introduction on page 22\n2. **School of Humanities (SoH)** - YES, detailed introduction on page 22\n3. **School of Social Sciences (SSS@NTU)** - YES, detailed introduction on page 22\n4. **Wee Kim Wee School of Communication and Information (WKWSCI)** - YES, detailed introduction on page 23\n\nThe following schools are listed but do NOT have individual detailed introductions:\n5. Nanyang Centre for Public Administration - NO detailed introduction\n6. NTU Centre for Contemporary Art Singapore - NO detailed introduction\n7. Centre for Liberal Arts and Social Sciences - NO detailed introduction\n\nTherefore, 3 schools are not individually introduced in detail.\n\nAnswer: 3", "citations": [], "input_tokens": 14648, "output_tokens": 358, "cost_micros": 49314, "latency_ms": 7016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "llamacloud_premium_lc", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify which schools from the College of Humanities, Arts, and Social Sciences (CoHASS) are listed, and then determine which ones are NOT introduced in detail.\n\nFrom the document, the CoHASS schools listed on page 20 are:\n1. School of Art, Design and Media\n2. School of Humanities\n3. School of Social Sciences\n4. Wee Kim Wee School of Communication and Information\n5. Nanyang Centre for Public Administration\n6. NTU Centre for Contemporary Art Singapore\n7. Centre for Liberal Arts and Social Sciences\n\nNow, looking at the detailed introductions on pages 22-23 under \"LEARN MORE ABOUT OUR SCHOOLS\":\n1. School of Art, Design and Media (ADM) - YES, has detailed introduction\n2. School of Social Sciences (SSS@NTU) - YES, has detailed introduction\n3. School of Humanities (SoH) - YES, has detailed introduction\n4. Wee Kim Wee School of Communication and Information (WKWSCI) - YES, has detailed introduction\n\nThe schools that are NOT introduced in detail:\n1. Nanyang Centre for Public Administration\n2. NTU Centre for Contemporary Art Singapore\n3. Centre for Liberal Arts and Social Sciences\n\nAnswer: 3", "citations": [], "input_tokens": 18582, "output_tokens": 292, "cost_micros": 60126, "latency_ms": 7359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2024.ug.eprospectus.pdf::Q005", "doc_id": "2024.ug.eprospectus.pdf", "answer_format": "int", "gold": "3", "pages": 27, "document_id": 7581, "arm": "surfsense_agentic", "question_id": "2024.ug.eprospectus.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the College of Humanities, Arts, and Social Sciences and count which schools are not individually introduced in detail.Let me search for \"College of Humanities, Arts, and Social Sciences\" in the document:Let me try searching for variations:Let me read the section about the College of Humanities:Let me search for more information about the college structure. Let me look for \"COLLEGE OF HUMANITIES\" in caps:Found it! Let me read that section:Good! I can see the College of Humanities, Arts and Social Sciences has 4 schools listed:\n1. School of Art, Design and Media (page 22)\n2. School of Humanities (page 23)\n3. School of Social Sciences (page 23)\n4. Wee Kim Wee School of Communication and Information (page 24)\n\nNow let me check which ones are introduced in detail. Let me read pages 22-24:Let me search for where each school is detailed. Let me look at the actual page content around lines 3200-3300 where I saw \"School of Humanities\" mentioned:Good! I can see the CoHASS section lists their schools. Now let me find where individual schools are detailed. Let me search for detailed sections of each school:Let me read the ADM section:Great! Now let me check for Wee Kim Wee School:Perfect! Now let me also check if there are any other schools in CoHASS that I might have missed. Let me go back to the list to verify:I see that the College of Humanities, Arts and Social Sciences has the following units listed:\n- School of Art, Design and Media\n- School of Humanities  \n- School of Social Sciences\n- Wee Kim Wee School of Communication and Information\n- Nanyang Centre for Public Administration\n- NTU Centre for Contemporary Art Singapore\n- Centre for Liberal Arts and Social Sciences\n\nNow let me check if the last three (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, and Centre for Liberal Arts and Social Sciences) have detailed introductions. Let me search:So the three centres (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, and Centre for Liberal Arts and Social Sciences) are only listed in the OUR SCHOOLS section but don't have detailed individual introductions like the four main schools do.\n\nNow let me verify the total count by checking what the document considers as \"schools\" in CoHASS. The question asks specifically about \"schools\" - let me see if these centres are considered schools:\n\nLooking at the structure:\n- In the table of contents (lines 398-413), only 4 schools are listed under \"COLLEGES OF HUMANITIES, ARTS AND SOCIAL SCIENCES\":\n  1. School of Art, Design and Media (page 22)\n  2. School of Humanities (page 23)\n  3. School of Social Sciences (page 23)\n  4. Wee Kim Wee School of Communication and Information (page 24)\n\nAll 4 of these have detailed individual introductions as I found above.\n\nBut in the \"OUR SCHOOLS\" section (lines 3216-3236), it lists 7 entities:\n- 4 Schools (the ones mentioned above)\n- 3 Centres (Nanyang Centre for Public Administration, NTU Centre for Contemporary Art Singapore, Centre for Liberal Arts and Social Sciences)\n\nThe question asks about \"schools\" specifically. The centres are NOT called \"schools\" - they are called \"centres\". So the question is asking only about the 4 schools.\n\nSince all 4 schools (School of Art, Design and Media, School of Humanities, School of Social Sciences, and Wee Kim Wee School of Communication and Information) ARE individually introduced in detail, the answer is:\n\nAnswer: 0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 97500, "error": null, "extra": {"thread_id": 1957, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_84cc170418784e7aaee4655dd9c795d7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 722, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to analyze Figure 1 and find other figures that convey similar messages.\n\nFigure 1 shows the \"Cold start problem in vision active learning\" across four datasets (PathMNIST, OrganAMNIST, BloodMNIST, and CIFAR-10). The key message is that most existing active querying strategies (BALD, Consistency, Margin, VAAL, Coreset, Entropy) are outperformed by random selection in selecting initial queries, especially in the low budget regime.\n\nNow let me look through the document for figures with similar messages:\n\n**Figure 6** (page 16): Shows \"Label diversity yields more performant and robust active querying strategies\" on PathMNIST, OrganAMNIST, and BloodMNIST. This compares strategies with and without label diversity, not the cold start problem specifically.\n\n**Figure 9** (page 20): Shows \"Diversity yields more performant and robust active querying strategies\" on CIFAR-10 and CIFAR-10-LT. Similar to Figure 6, focuses on label diversity rather than cold start.\n\n**Figure 4** (page 7): Shows \"Quantitative comparison of map-based querying strategies\" comparing easy-to-learn, hard-to-learn, easy-to-contrast, and hard-to-contrast with random selection as baseline. This shows performance at low budgets and demonstrates the cold start problem.\n\n**Figure 5** (page 8): Shows \"On the importance of selecting a superior initial query\" with performance across active learning cycles starting from initial queries. This demonstrates how different initial query strategies perform, related to the cold start problem.\n\n**Figure 13** (page 24): Shows performance of active learning strategies with different initial query strategies on BloodMNIST, demonstrating the importance of initial query selection.\n\n**Figure 14** (page 24): Same as Figure 13 but for PathMNIST.\n\nFigures that convey the same core message as Figure 1 (that active learning fails at initial selection/cold start):\n- Figure 4: Shows the cold start problem through comparison with random\n- Figure 5: Shows importance of initial query selection\n- Figure 13: Shows initial query performance across cycles\n- Figure 14:", "citations": [], "input_tokens": 58482, "output_tokens": 512, "cost_micros": 183126, "latency_ms": 44218, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey a similar message to Figure 1.\n\nFigure 1 shows the \"Cold start problem in vision active learning\" where most existing active querying strategies (BALD, Consistency, Coreset, Margin, VAAL, Entropy) are outperformed by random selection in selecting initial queries at low budgets. The figure has 4 subplots (a-d) for different datasets showing AUC performance vs. number of images.\n\nLet me search for other figures with similar messages:\n\n**Figure 6**: \"Label diversity yields more performant and robust active querying strategies\" - Shows 3 datasets (PathMNIST, OrganAMNIST, BloodMNIST) with 6 subplots each comparing strategies with/without diversity. This is about diversity improvement, not the cold start problem itself.\n\n**Figure 9**: Similar to Figure 6 but for CIFAR-10 and CIFAR-10-LT. Also about diversity improvement rather than demonstrating the cold start problem.\n\n**Figure 4**: \"Quantitative comparison of map-based querying strategies\" - Shows random selection as a baseline and compares different strategies including easy-to-learn, hard-to-learn, easy-to-contrast, and hard-to-contrast across 4 datasets. This demonstrates that random selection is competitive and that hard-to-contrast performs best, which is related to the cold start problem.\n\n**Figure 13**: Shows \"Performance of each active learning querying strategies with different initial query strategies on BloodMNIST\" - This is about subsequent cycles after initial query.\n\n**Figure 14**: Similar to Figure 13 but for PathMNIST - Also about subsequent cycles.\n\n**Figure 5**: \"On the importance of selecting a superior initial query\" on OrganAMNIST - Shows active learning cycles but focuses on the importance of initial query selection.\n\nThe figures that most clearly convey the same core message as Figure 1 (that existing active querying strategies fail to outperform random selection at the start) are:\n\n- **Figure 4**: Directly compares strategies at low budgets and shows random selection as competitive baseline\n- Possibly Figure 6 and 9, but these focus more on the solution (diversity) rather than the", "citations": [], "input_tokens": 21463, "output_tokens": 512, "cost_micros": 72069, "latency_ms": 12625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey a similar message to Figure 1.\n\nFigure 1's main message is about the \"Cold start problem in vision active learning\" - showing that most existing active querying strategies (BALD, Consistency, Coreset, Margin, VAAL, Entropy) are outperformed by random selection in selecting initial queries at low budgets. It displays AUC performance curves across different numbers of images for multiple datasets.\n\nLet me go through the other figures to find similar messages:\n\n**Figure 2**: Shows label diversity of querying criteria - different message (about class distribution bias)\n\n**Figure 3**: Shows Dataset Maps and easy/hard-to-learn vs easy/hard-to-contrast data - different message (about data selection strategies)\n\n**Figure 4**: Shows quantitative comparison of map-based querying strategies, comparing hard-to-contrast with other strategies against random selection - **similar message** (comparing active learning strategies with random selection)\n\n**Figure 5**: Shows importance of selecting superior initial query on OrganAMNIST - **similar message** (about initial query performance across active learning cycles)\n\n**Figure 6**: Extended from Table 1, shows label diversity yields more performant strategies - different message (about adding label diversity)\n\n**Figure 7**: Continued from Figure 2, about label diversity - different message\n\n**Figure 8**: Visualization of K-means clustering - different message\n\n**Figure 9**: Similar to Figure 6, about diversity and performance - different message (about label diversity effect)\n\n**Figure 10**: Similar to Figure 2, about label diversity - different message\n\n**Figure 11**: Similar to Figure 3, Dataset Maps - different message\n\n**Figure 12**: Similar to Figure 3, Dataset Maps - different message\n\n**Figure 13**: Performance across active learning cycles on BloodMNIST - **similar message** (comparing initial query strategies across active learning cycles)\n\n**Figure 14**: Performance across active learning cycles on PathMNIST - **similar message** (comparing initial query strategies across active learning cycles)\n\nFigures that convey similar messages to Figure 1 (comparing active learning strategies, showing cold start problem):\n- Figure 4\n- Figure 5\n- Figure 13\n- Figure 14\n\nAnswer: 4", "citations": [], "input_tokens": 26468, "output_tokens": 506, "cost_micros": 86994, "latency_ms": 11765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey similar messages to Figure 1.\n\nFigure 1 shows the \"Cold start problem in vision active learning\" - it demonstrates that most existing active querying strategies (like BALD, Consistency, etc.) are outperformed by random selection when selecting initial queries, particularly in low budget regimes.\n\nLet me go through the document to find figures with similar messages:\n\n**Figure 4**: Shows comparison of different querying strategies including easy-to-learn, hard-to-learn, easy-to-contrast, and hard-to-contrast across different datasets. This shows performance comparisons but focuses on the proposed method rather than demonstrating the cold start problem itself.\n\n**Figure 6**: Explicitly states it is \"Extended from Table 1\" and shows \"Label diversity yields more performant and robust active querying strategies.\" It compares active querying strategies with and without label diversity on PathMNIST, OrganAMNIST, and BloodMNIST. This is about the solution (label diversity) rather than demonstrating the cold start problem.\n\n**Figure 9**: States \"Diversity yields more performant and robust active querying strategies\" on CIFAR-10 and CIFAR-10-LT. Similar to Figure 6, this focuses on the solution rather than demonstrating the cold start problem.\n\n**Figure 13**: Shows \"Performance of each active learning querying strategies with different initial query strategies on BloodMNIST.\" This compares different strategies across active learning cycles but focuses on the full active learning procedure rather than specifically demonstrating the cold start problem.\n\n**Figure 14**: Similar to Figure 13, shows \"Performance of each active learning querying strategies with different initial query strategies on PathMNIST.\"\n\nLooking back at the document more carefully, none of the other figures directly replicate Figure 1's specific message about the cold start problem where active learning strategies fail to outperform random selection at the first choice. The other figures either:\n- Show solutions to the problem (Figures 6, 9)\n- Show active learning trajectories (Figures 13, 14)\n- Show other aspects like dataset maps or visualizations (Figures 3, 7, 8, 10, 11, 12)\n\nAnswer: 0", "citations": [], "input_tokens": 23168, "output_tokens": 504, "cost_micros": 77064, "latency_ms": 11578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify figures that convey a similar message to Figure 1, which shows the \"Cold start problem in vision active learning\" - demonstrating that most existing active querying strategies are outperformed by random selection when selecting initial queries.\n\nLet me examine each figure:\n\n**Figure 1**: Shows cold start problem - active learning strategies perform worse than random selection at initial queries across multiple datasets.\n\n**Figure 2**: Shows label diversity of querying criteria - demonstrates class distribution bias in active querying strategies.\n\n**Figure 3**: Shows active querying based on Dataset Maps - compares easy/hard-to-learn vs easy/hard-to-contrast data.\n\n**Figure 4**: Shows quantitative comparison of map-based querying strategies - compares different strategies including hard-to-contrast vs random selection.\n\n**Figure 5**: Shows importance of selecting superior initial query - correlation analysis showing hard-to-contrast outperforms others across active learning cycles.\n\n**Figure 6**: Extended from Table 1 - shows label diversity yields more performant strategies, comparing with/without label diversity across budgets.\n\n**Figure 7**: Continued from Figure 2 - shows label diversity for OrganAMNIST and BloodMNIST.\n\n**Figure 8**: Visualization of K-means clustering and active selection using UMAP.\n\n**Figure 9**: Similar to Figure 6 but for CIFAR-10-LT - shows diversity yields more performant strategies.\n\n**Figure 10**: Similar to Figure 2 but for CIFAR-10 and CIFAR-10-LT - shows label diversity.\n\n**Figure 11**: Similar to Figure 3 - Active querying based on Dataset Maps for PathMNIST and OrganAMNIST.\n\n**Figure 12**: Similar to Figure 3 - Active querying based on Dataset Maps for CIFAR-10-LT.\n\n**Figure 13**: Performance of active learning strategies with different initial queries on BloodMNIST - shows correlation across cycles.\n\n**Figure 14**: Performance of active learning strategies with different initial queries on PathMNIST - shows correlation across cycles.\n\nFigures conveying a similar message to Figure 1 (showing the cold start problem where active learning strategies underperform random selection at the start):\n- **Figure ", "citations": [], "input_tokens": 37261, "output_tokens": 512, "cost_micros": 119463, "latency_ms": 12672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q000", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "2", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to understand the first figure and then identify how many other figures convey similar messages.Let me continue reading to find all figures in the document and understand what the first figure conveys.Now let me read more sections to understand all the figures and their messages better.Let me continue reading to see more figures.Let me continue reading to identify all figures.Let me search for more figures to get a complete count.Now let me understand what message Figure 1 conveys and identify similar figures. Based on my reading, let me analyze:\n\n**Figure 1** (lines 490-584): Shows four subplots (a-d) of line graphs plotting AUC vs Number of images, demonstrating the \"cold start problem\" where active querying strategies are outperformed by random selection in selecting initial queries. The message is: **existing active learning strategies perform worse than random selection at the beginning (cold start problem)**.\n\nNow let me identify which other figures convey similar messages:\n\n1. **Figure 4** (lines 1346-1455): Shows four subplots comparing map-based querying strategies with random selection as baseline, showing that random selection is \"a highly competitive baseline in cold start because it outperforms six popular active querying strategies as shown in Figure 1.\" This conveys the **same message about cold start problem**.\n\n2. **Figure 6** (lines 2545-2805): Shows plots about label diversity yielding more performant strategies - this is about **solutions**, not about demonstrating the cold start problem itself.\n\n3. **Figure 7** (lines 2840-2920): Shows bar charts about label diversity - this is about **class distribution bias**, related but different message.\n\n4. **Figure 9** (lines 3454-3640): Similar to Figure 6, about diversity yielding better strategies - this is about **solutions**.\n\n5. **Figure 10** (lines 3689-3800): Bar charts about label diversity - about **class distribution bias**.\n\nLet me verify by reading more carefully about what each figure demonstrates:Based on my thorough analysis of the document, Figure 1's main message is: **Active learning strategies are outperformed by random selection at the cold start (initial query selection) due to the cold start problem.**\n\nFigures that convey this same or very similar message:\n- **Figure 4**: Explicitly shows random selection as a competitive baseline that outperforms popular active querying strategies in the cold start problem, directly referencing Figure 1.\n\nAll other figures (2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14) convey different messages:\n- Figures 2, 7, 10: Show class distribution bias\n- Figures 3, 11, 12: Show dataset maps for selecting data\n- Figure 5, 13, 14: Show importance of superior initial query across learning cycles\n- Figures 6, 9: Show that diversity improves querying strategies\n- Figure 8: Shows visualization of clustering\n\nTherefore, **1 figure** (Figure 4) conveys a similar message as Figure 1.\n\nAnswer: 1", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 78500, "error": null, "extra": {"thread_id": 1958, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cace0f3b35134ef58541b082bc26d8e7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 533, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "2"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about strategies that outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1, which shows the \"Cold start problem in vision active learning,\" I can see performance comparisons on different datasets including CIFAR-10 (subplot d). The figure shows that most existing active querying strategies (BALD, Consistency, Margin, VAAL, Coreset, Entropy) are outperformed by random selection in selecting initial queries.\n\nLooking at Table 1, which compares different querying strategies on CIFAR-10-LT at 10% and 20% budgets:\n- At 10% budget: Random achieves 91.6\u00b11.1\n- At 20% budget: Random achieves 93.1\u00b10.6\n\nComparing with other strategies at these budgets:\n- Consistency: 91.4\u00b11.1 (10%), 93.4\u00b10.2 (20%)\n- VAAL: 92.6\u00b10.2 (10%), 93.7\u00b10.4 (20%)\n- Margin: 91.7\u00b10.9 (10%), 93.2\u00b10.2 (20%)\n- Entropy: 92.0\u00b11.2 (10%), 91.9\u00b11.3 (20%)\n- Coreset: 91.5\u00b10.4 (10%), 93.6\u00b10.2 (20%)\n- BALD: 92.8\u00b10.1 (10%), 90.8\u00b12.4 (20%)\n\nAt 10%, strategies that outperform random (91.6): VAAL (92.6), Entropy (92.0), Margin (91.7), BALD (92.8) = 4 strategies\n\nAt 20%, strategies that outperform random (93.1): Consistency (93.4), VAAL (93.7), Margin (93.2), Coreset (93.6) = 4 strategies\n\nHowever, the question asks generally how many strategies outperform random selection. Looking at the paper's proposed \"Ours\" strategy (hard-to-contrast), the results show it significantly outperforms random", "citations": [], "input_tokens": 58490, "output_tokens": 512, "cost_micros": 183150, "latency_ms": 41641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about strategies that outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1(d) which shows \"Cold start problem in vision active learning\" for CIFAR-10, I can see performance curves for different strategies. The document mentions CIFAR-10-LT specifically in several places.\n\nFrom Figure 4(d), I can see a comparison of different strategies on CIFAR-10-LT including:\n- Random (baseline, shown as dotted line)\n- Easy-to-learn\n- Hard-to-learn\n- Easy-to-contrast\n- Hard-to-contrast\n\nLooking at the quantitative results in Figure 4(d) for CIFAR-10-LT at different budget levels (20.0% and 30.0% with 2481 and 3721 images), I can observe which strategies perform above the random baseline.\n\nFrom the text in section 3.2: \"Similarly on CIFAR-10-LT, hard-to-contrast significantly outperforms random selection by 21.2% (87.35%\u00b10.0% vs. 66.12%\u00b10.9%) and 24.1% (90.59%\u00b10.1% vs. 66.53%\u00b10.5%) by querying 20% and 30% of entire dataset respectively.\"\n\nLooking at Figure 4(d) more carefully, I can see that at the budget levels shown:\n1. Hard-to-contrast (performs best, clearly above random)\n2. Easy-to-learn (appears to perform above random)\n3. Hard-to-learn (appears close to or slightly above random)\n4. Easy-to-contrast (appears below random)\n\nFrom the visual inspection of Figure 4(d), it appears that 2-3 strategies outperform random selection, with hard-to-contrast being the clear winner, and easy-to-learn also performing above the baseline.\n\nBased on the figure, I count: Hard-to-contrast and Easy-to-learn as clearly outperforming random selection.\n\nAnswer: 2", "citations": [], "input_tokens": 21471, "output_tokens": 478, "cost_micros": 71583, "latency_ms": 11500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about how many strategies outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1(d) which shows the cold start problem on CIFAR-10, I can see performance curves for different strategies compared to random selection. However, the question asks specifically about CIFAR-10-LT.\n\nLet me check the text more carefully. In Section 3.2, it states:\n\n\"Similarly on CIFAR-10-LT, hard-to-contrast significantly outperforms random selection by 21.2% (87.35%\u00b10.0% vs. 66.12%\u00b10.9%) and 24.1% (90.59%\u00b10.1% vs. 66.53%\u00b10.5%) by querying 20% and 30% of entire dataset respectively.\"\n\nThis indicates that the \"hard-to-contrast\" strategy (which is the authors' proposed method) outperforms random selection.\n\nLooking at Figure 1(d) which shows CIFAR-10 (not CIFAR-10-LT, but related), most of the active learning strategies shown (BALD, Consistency, Coreset, Margin, VAAL, Entropy) appear to perform worse than or similar to random selection at the initial stages (the \"cold start\" problem that the paper discusses).\n\nThe main point of the paper is that existing active learning strategies fail to outperform random selection at the cold start (initial query selection), which is why they propose their hard-to-contrast method.\n\nLooking at Table 1 and Table 7, which compare different strategies on CIFAR-10-LT with different budget levels, at low budgets (10%, 20%), the proposed method with diversity enforcement shows the best performance. However, most existing strategies WITHOUT diversity enforcement perform poorly or fail to select all classes (marked as \"-\").\n\nBased on the paper's main finding that existing active querying strategies are \"outperformed by random selection\" at the cold start (as stated in Figure 1 caption and throughout the introduction), and that only their proposed \"hard-to-contrast\" method significantly outperforms random selection on CIFAR-10-LT, the answer appears to be:\n\nAnswer: 1", "citations": [], "input_tokens": 26476, "output_tokens": 501, "cost_micros": 86943, "latency_ms": 12703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many active learning strategies outperform random selection on CIFAR-10-LT in the cold start problem (initial query selection).\n\nLooking at Figure 1 in the document, which illustrates the \"Cold start problem in vision active learning,\" I can see panel (d) shows results for CIFAR-10-LT.\n\nThe document states in the caption of Figure 1: \"Most existing active querying strategies (e.g. BALD, Consistency, etc.) are outperformed by random selection in selecting initial queries... However, some classes are not selected by active querying strategies due to selection bias, so their results are not presented in the low budget regime.\"\n\nThe document consistently emphasizes that existing active learning strategies fail to outperform random selection at the cold start (initial query stage). The main contribution of the paper is proposing a new strategy (hard-to-contrast) that does outperform random selection.\n\nLooking at Table 1, for CIFAR-10-LT at 10% and 20% budgets:\n- Random: 91.6\u00b11.1 (10%), 93.1\u00b10.6 (20%)\n- Consistency: 91.4\u00b11.1 (10%), 93.4\u00b10.2 (20%)\n- VAAL: 92.6\u00b10.2 (10%), 93.7\u00b10.4 (20%)\n- Margin: 91.7\u00b10.9 (10%), 93.2\u00b10.2 (20%)\n- Entropy: 92.0\u00b11.2 (10%), 91.9\u00b11.3 (20%)\n- Coreset: 91.5\u00b10.4 (10%), 93.6\u00b10.2 (20%)\n- BALD: 92.8\u00b10.1 (10%), 90.8\u00b12.4 (20%)\n\nAt 10%: VAAL (92.6), Entropy (92.0), BALD (92.8), and Margin (91.7) appear close or slightly better than Random (91.6)\nAt 20%: VAAL (93.7), Consistency (93.4), Coreset (93.", "citations": [], "input_tokens": 23176, "output_tokens": 512, "cost_micros": 77208, "latency_ms": 12390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "20", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many strategies outperform random selection on CIFAR-10-LT.\n\nLooking at Figure 1(d) which shows \"Cold start problem in vision active learning\" for CIFAR-10-LT:\n\nThe figure shows performance curves for different strategies:\n- Random (baseline)\n- BALD (Kirsch et al., 2019)\n- Consistency (Gao et al., 2020)\n- Coreset (Sener et al., 2017)\n- Margin (Balcan et al., 2007)\n- VAAL (Sinha et al., 2019)\n- Entropy (Wang et al., 2014)\n\nThe text states: \"Most existing active querying strategies (e.g. BALD, Consistency, etc.) are outperformed by random selection in selecting initial queries, since random selection is i.i.d. to the entire dataset.\"\n\nFrom Figure 1(d), I can see that at the initial query stage (low budget regime), the random selection baseline appears to outperform most other strategies. The figure caption states \"Active learning fails to select data as effectively as random selection at the first choice.\"\n\nHowever, looking more carefully at the results, I need to determine which strategies actually outperform random at the cold start (initial query selection).\n\nAccording to the paper's main findings and Figure 4, the proposed \"hard-to-contrast\" strategy significantly outperforms random selection on CIFAR-10-LT. The paper states this is their contribution - addressing the cold start problem where existing strategies fail to beat random.\n\nFrom the various results shown, only the authors' proposed strategy (hard-to-contrast) consistently outperforms random selection at the initial query stage on CIFAR-10-LT.\n\nAnswer: 1", "citations": [], "input_tokens": 37269, "output_tokens": 407, "cost_micros": 117912, "latency_ms": 9546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q001", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "4", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find information about strategies that outperform the random selection baseline on CIFAR-10-LT.Let me search for content related to CIFAR-10-LT and performance comparisons:Let me read through the document more systematically to find tables or figures with results:Let me search for tables that compare strategies on CIFAR-10-LT:Perfect! I found Table 1. Let me continue reading to see the complete table for CIFAR-10-LT results:Now I can see Table 1 clearly. Let me analyze the CIFAR-10-LT columns (10% and 20% budget). The table shows performance with and without enforcing label diversity. Looking at the CIFAR-10-LT columns:\n\nFor the 10% budget (column 9), I need to compare each strategy's score with Random baseline (91.6\u00b11.1):\n- Random (baseline): 91.6\u00b11.1\n- Consistency with diversity (\u2713): 91.4\u00b11.1 - DOES NOT outperform\n- Consistency without diversity (\u2717): 67.1\u00b117.1 - DOES NOT outperform\n- VAAL with diversity (\u2713): 92.6\u00b10.2 - OUTPERFORMS\n- VAAL without diversity (\u2717): - (missing)\n- Margin with diversity (\u2713): 91.7\u00b10.9 - OUTPERFORMS (slightly)\n- Margin without diversity (\u2717): 81.9\u00b10.8 - DOES NOT outperform\n- Entropy with diversity (\u2713): 92.0\u00b11.2 - OUTPERFORMS (slightly)\n- Entropy without diversity (\u2717): 65.6\u00b115.6 - DOES NOT outperform\n- Coreset with diversity (\u2713): 91.5\u00b10.4 - DOES NOT outperform\n- Coreset without diversity (\u2717): 65.9\u00b115.9 - DOES NOT outperform\n- BALD with diversity (\u2713): 92.8\u00b10.1 - OUTPERFORMS\n- BALD without diversity (\u2717): 64.9\u00b114.9 - DOES NOT outperform\n\nLooking at 10% budget, strategies that outperform random (91.6):\n1. VAAL with diversity: 92.6\n2. Margin with diversity: 91.7\n3. Entropy with diversity: 92.0\n4. BALD with diversity: 92.8\n\nThat's 4 strategies at 10% budget.\n\nFor the 20% budget (column 10), comparing with Random baseline (93.1\u00b10.6):\n- Random: 93.1\u00b10.6\n- Consistency with diversity: 93.4\u00b10.2 - OUTPERFORMS\n- Consistency without diversity: 88.6\u00b10.3 - DOES NOT outperform\n- VAAL with diversity: 93.7\u00b10.4 - OUTPERFORMS\n- VAAL without diversity: - (missing)\n- Margin with diversity: 93.2\u00b10.2 - OUTPERFORMS (slightly)\n- Margin without diversity: 86.3\u00b10.3 - DOES NOT outperform\n- Entropy with diversity: 91.9\u00b11.3 - DOES NOT outperform\n- Entropy without diversity: 86.4\u00b10.2 - DOES NOT outperform\n- Coreset with diversity: 93.6\u00b10.2 - OUTPERFORMS\n- Coreset without diversity: 86.9\u00b10.1 - DOES NOT outperform\n- BALD with diversity: 90.8\u00b12.4 - DOES NOT outperform\n- BALD without diversity: 84.7\u00b10.6 - DOES NOT outperform\n\nAt 20% budget, strategies that outperform random (93.1):\n1. Consistency with diversity: 93.4\n2. VAAL with diversity: 93.7\n3. Margin with diversity: 93.2\n4. Coreset with diversity: 93.6\n\nThat's 4 strategies at 20% budget.\n\nBut the question asks about \"CIFAR-10-LT\" in general. Let me count unique strategies (considering both budgets):\n\nStrategies that outperform at 10%: VAAL, Margin, Entropy, BALD (all with diversity)\nStrategies that outperform at 20%: Consistency, VAAL, Margin, Coreset (all with diversity)\n\nCombined unique strategies: VAAL, Margin, Entropy, BALD, Consistency, Coreset = 6 strategies\n\nBut I should check if the question means at ANY budget level or at BOTH budget levels. Looking at Figure 1d in the document might also help clarify. Let me check that figure.I notice Figure 1(d) is labeled \"CIFAR-10\" not \"CIFAR-10-LT\". But the question specifically asks about CIFAR-10-LT. Let me look more carefully at Table 1. The table clearly has a column header \"CIFAR-10-LT\" with two sub-columns (10% and 20%).\n\nBased on Table 1, I need to count how many strategies (considering different variations - with/without diversity) outperform the random baseline on CIFAR-10-LT. Let me be more systematic:\n\nAt 10% budget, Random = 91.6\u00b11.1:\n1. Consistency (\u2713): 91.4 - NO\n2. Consistency (\u2717): 67.1 - NO  \n3. VAAL (\u2713): 92.6 - YES\n4. VAAL (\u2717): - (missing)\n5. Margin (\u2713): 91.7 - YES\n6. Margin (\u2717): 81.9 - NO\n7. Entropy (\u2713): 92.0 - YES\n8. Entropy (\u2717): 65.6 - NO\n9. Coreset (\u2713): 91.5 - NO\n10. Coreset (\u2717): 65.9 - NO\n11. BALD (\u2713): 92.8 - YES\n12. BALD (\u2717): 64.9 - NO\n\nAt 10%: 4 strategies outperform random\n\nAt 20% budget, Random = 93.1\u00b10.6:\n1. Consistency (\u2713): 93.4 - YES\n2. Consistency (\u2717): 88.6 - NO\n3. VAAL (\u2713): 93.7 - YES\n4. VAAL (\u2717): - (missing)\n5. Margin (\u2713): 93.2 - YES\n6. Margin (\u2717): 86.3 - NO\n7. Entropy (\u2713): 91.9 - NO\n8. Entropy (\u2717): 86.4 - NO\n9. Coreset (\u2713): 93.6 - YES\n10. Coreset (\u2717): 86.9 - NO\n11. BALD (\u2713): 90.8 - NO\n12. BALD (\u2717): 84.7 - NO\n\nAt 20%: 4 strategies outperform random\n\nThe question asks \"How many strategies outperform the random selection baseline on CIFAR-10-LT?\" \n\nLooking at both budget levels (10% and 20%), and counting each unique strategy configuration:\n- At 10%: VAAL(\u2713), Margin(\u2713), Entropy(\u2713), BALD(\u2713) = 4 \n- At 20%: Consistency(\u2713), VAAL(\u2713), Margin(\u2713), Coreset(\u2713) = 4\n\nIf we count unique strategy configurations across both budgets: VAAL(\u2713), Margin(\u2713), Entropy(\u2713), BALD(\u2713), Consistency(\u2713), Coreset(\u2713) = 6 total\n\nBut the most straightforward interpretation is: at a given budget level. Since the table shows two budget levels and the question doesn't specify which, I should look at what makes most sense. Given that 4 strategies outperform at each budget level separately, and 6 unique strategy variations outperform across at least one budget level, the answer is likely **4** (at each specific budget) or **6** (total unique strategies that ever outperform).\n\nHowever, re-reading the question more carefully - it asks how many strategies outperform, not how many strategy-variants. If we count base strategies (ignoring the diversity modifier):\n- VAAL outperforms (with diversity at both budgets)\n- Margin outperforms (with diversity at both budgets)  \n- Entropy outperforms (with diversity at 10%)\n- BALD outperforms (with diversity at 10%)\n- Consistency outperforms (with diversity at 20%)\n- Coreset outperforms (with diversity at 20%)\n\nThat would be 6 unique base strategies.\n\nBut looking at the data more carefully at the initial query (lowest budget), which is what the paper focuses on for \"cold start\", the answer is most likely **4** for the 10% budget.\n\nActually, let me reconsider - in Table 1, the \"Unif.\" column indicates whether uniform/diversity enforcement is applied. Each strategy has 2 rows (with and without diversity). So we have 7 base strategies tested: Random, Consistency, VAAL, Margin, Entropy, Coreset, BALD.\n\nIf I count how many of these 7 base strategies (in any configuration) outperform random at either budget:\n- Consistency: NO at 10%, YES at 20% (with diversity)\n- VAAL: YES at 10%, YES at 20% (with diversity)\n- Margin: YES at 10%, YES at 20% (with diversity)\n- Entropy: YES at 10%, NO at 20%\n- Coreset: NO at 10%, YES at 20% (with diversity)\n- BALD: YES at 10%, NO at 20%\n\nCount = 6 base strategies outperform random in at least one configuration at at least one budget level.\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 81906, "error": null, "extra": {"thread_id": 1959, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c7833996f4234f1db0fa9af29b8ebd6c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 805, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "4"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 2 and Figure 7, which show label diversity comparisons across different datasets (PathMNIST, OrganAMNIST, BloodMNIST), I can see the class distributions for different querying strategies.\n\nFrom these figures, VAAL consistently shows the poorest label diversity across datasets:\n- In PathMNIST (Figure 2), VAAL heavily selects from the \"muscle\" class while largely ignoring \"mucus\" and \"mucosa\" classes\n- In OrganAMNIST (Figure 7), VAAL prefers \"femur-right\" and \"platelet\" classes while largely ignoring lung, liver and monocyte classes\n- In BloodMNIST (Figure 7), VAAL shows similarly poor label diversity with heavy bias toward specific classes\n\nThis is further supported by Table 1, where VAAL frequently has missing results (marked as \"-\") because it fails to sample at least one data point for each class.\n\nAnswer: VAAL", "citations": [], "input_tokens": 58491, "output_tokens": 229, "cost_micros": 178908, "latency_ms": 35750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 2 and Figure 7 in the document, which show label diversity comparisons across different datasets, I can observe the class distribution patterns for various active querying strategies.\n\nFrom the figures, VAAL consistently shows highly imbalanced distributions with strong bias towards specific classes:\n- In PathMNIST (Figure 2): VAAL heavily favors muscle class while largely ignoring mucus and mucosa\n- In OrganAMNIST (Figure 7a): VAAL prefers femur-right while ignoring lung and liver\n- In BloodMNIST (Figure 7b): VAAL favors platelet while ignoring other classes\n\nThe document also states: \"For example, VAAL prefers selecting data in the muscle class, but largely ignores data in the mucus and mucosa classes.\"\n\nTable 2 further confirms this, showing VAAL has very poor class coverage (0.11-0.30) compared to other methods at low budgets.\n\nAnswer: VAAL", "citations": [], "input_tokens": 21472, "output_tokens": 230, "cost_micros": 67866, "latency_ms": 25187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the label diversity figures and tables in the document:\n\nIn Figure 2 (PathMNIST), VAAL shows extremely poor label diversity with an entropy of only 2.800, heavily biased toward the muscle class while largely ignoring mucus and mucosa classes.\n\nIn Figure 7 (OrganAMNIST and BloodMNIST), VAAL again shows the poorest label diversity, preferring specific classes like femur-right and platelet while ignoring others.\n\nIn Table 2 (class coverage), VAAL consistently has the lowest class coverage across datasets:\n- PathMNIST: 0.11 (lowest)\n- OrganAMNIST: 0.18 (lowest)\n- BloodMNIST: 0.13 (lowest)\n- CIFAR-10-LT: 0.30 (tied for lowest)\n\nThe document also notes that VAAL results are often missing (marked as \"-\") because it \"fails to sample at least one data point for each class.\"\n\nAnswer: VAAL", "citations": [], "input_tokens": 26477, "output_tokens": 247, "cost_micros": 83136, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 2 in the document, which shows \"Class coverage of selected data\" across different datasets (PathMNIST, OrganAMNIST, BloodMNIST, and CIFAR-10-LT), I can see the class coverage values for different querying strategies.\n\nVAAL consistently shows the lowest class coverage values across the datasets:\n- PathMNIST: 0.11 at 0.00015%, 0.11 at 0.00030%, 0.18 at 0.001%, 0.18 at 0.002%\n- OrganAMNIST: 0.13 at 0.001%, 0.13 at 0.002%\n- BloodMNIST: values not explicitly shown but indicated as poor\n- CIFAR-10-LT: 0.30 at 0.2%, 0.30 at 0.3%\n\nAdditionally, Table 1 shows that VAAL has many missing results (marked as \"-\") because it \"fails to sample at least one data point for each class.\"\n\nAnswer: VAAL", "citations": [], "input_tokens": 23177, "output_tokens": 263, "cost_micros": 73476, "latency_ms": 6907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the label diversity analysis in the document, I need to examine Figure 2, Table 2, Figure 7, and Figure 10 which show class distributions and coverage across different querying strategies.\n\nFrom Figure 2 (PathMNIST), VAAL shows extremely poor label diversity - it selects 25% from muscle class but only 1% and 2% from mucosa and mucus classes respectively, with an entropy of 2.800 (lowest among all strategies).\n\nFrom Table 2 (class coverage):\n- VAAL consistently shows the poorest class coverage across datasets\n- PathMNIST (0.00015%): VAAL covers only 0.11 (11% of classes)\n- OrganAMNIST (0.001%): VAAL covers only 0.18 (18% of classes)  \n- BloodMNIST (0.001%): VAAL covers only 0.13 (13% of classes)\n- CIFAR-10-LT (0.2%): VAAL covers only 0.30 (30% of classes)\n\nFigure 7 confirms VAAL selects 100% from femur-right and platelet while selecting 0% from most other classes.\n\nFigure 10 shows VAAL fails to select any samples from 9 out of 10 classes in CIFAR-10-LT.\n\nAnswer: VAAL", "citations": [], "input_tokens": 37270, "output_tokens": 330, "cost_micros": 116760, "latency_ms": 9437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "vaal", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q003", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "VAAL", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the priority document to find information about querying criteria and label diversity across datasets.Let me continue reading the document to find information about querying criteria and label diversity:Let me search for information about label diversity across different querying criteria in the document:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 28125, "error": null, "extra": {"thread_id": 1960, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_9f3482197c3247b0aae1aed4062d9a6a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 96, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i ll read priority document to find information about querying criteria and label diversity across datasets let me continue reading document to find information about querying criteria and label diversity let me search for information about label diversity across different querying criteria in document", "normalised_gold": "vaal"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper by going through each page systematically.\n\nPage 1 (Figure 1): Shows 4 subplots (a-d), each containing multiple line plots comparing different active learning strategies. This appears to be 4 figures with multiple lines each.\n\nPage 2 (Figure 1 continued): Same figure from page 1.\n\nPage 4 (Figure 2): Shows a bar chart comparing class distributions, not line plots.\n\nPage 5 (Figure 3): Shows three panels with scatter plots (Dataset Maps), not line plots.\n\nPage 7 (Figure 4): Shows 4 subplots (a-d), each containing line plots with error bars comparing different strategies. This is 4 figures with multiple lines each.\n\nPage 8 (Figure 5): Shows 10 subplots arranged in 2 rows, each containing multiple line plots. This is 10 figures with multiple lines each.\n\nPage 16 (Figure 6): Shows 18 subplots (6 strategies \u00d7 3 datasets), each containing line plots. This is 18 figures with multiple lines each.\n\nPage 17 (Figure 7): Shows bar charts, not line plots.\n\nPage 18 (Figure 8): Shows UMAP visualizations (scatter plots), not line plots.\n\nPage 20 (Figure 9): Shows 12 subplots (6 strategies \u00d7 2 datasets), each containing line plots. This is 12 figures with multiple lines each.\n\nPage 21 (Figure 10): Shows bar charts, not line plots.\n\nPage 22 (Figure 11): Shows scatter plots (Dataset Maps), not line plots.\n\nPage 23 (Figure 12): Shows scatter plots (Dataset Maps), not line plots.\n\nPage 24 (Figure 13 and 14): Each figure shows 10 subplots with line plots. This is 20 figures with multiple lines each.\n\nCounting the figures that contain line plots:\n- Figure 1: 4\n- Figure 4: 4\n- Figure 5: 10\n- Figure 6: 18\n- Figure 9: 12\n- Figure 13: 10\n- Figure 14: 10\n\nTotal: 4", "citations": [], "input_tokens": 58479, "output_tokens": 512, "cost_micros": 183117, "latency_ms": 36375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper by going through each figure systematically.\n\n**Figure 1** (page 2): Shows 4 subplots (a-d), each containing multiple line plots comparing different methods. Each subplot has multiple lines.\n\n**Figure 2** (page 4): This is a bar chart showing class distributions, not line plots.\n\n**Figure 3** (page 5): These are scatter plots (Dataset Maps), not line plots.\n\n**Figure 4** (page 8): Shows 4 subplots (a-d), each containing multiple line plots (4 different methods plotted).\n\n**Figure 5** (page 9): Shows 2 rows \u00d7 5 columns = 10 subplots, each containing multiple line plots.\n\n**Figure 6** (page 16): Shows 3 rows \u00d7 6 columns = 18 subplots, each containing line plots (with and without diversity).\n\n**Figure 7** (page 17): These are bar charts showing class distributions, not line plots.\n\n**Figure 8** (page 18): These are scatter plots (UMAP visualizations), not line plots.\n\n**Figure 9** (page 20): Shows 2 rows \u00d7 6 columns = 12 subplots, each containing line plots.\n\n**Figure 10** (page 21): These are bar charts showing class distributions, not line plots.\n\n**Figure 11** (page 22): These are scatter plots (Dataset Maps), not line plots.\n\n**Figure 12** (page 23): These are scatter plots (Dataset Maps), not line plots.\n\n**Figure 13** (page 24): Shows 2 rows \u00d7 5 columns = 10 subplots, each containing multiple line plots.\n\n**Figure 14** (page 24): Shows 2 rows \u00d7 5 columns = 10 subplots, each containing multiple line plots.\n\nNow let me count the total number of individual line plot panels:\n- Figure 1: 4 panels\n- Figure 4: 4 panels  \n- Figure 5: 10 panels\n- Figure 6: 18 panels\n- Figure 9: 12 panels\n- Figure 13: 10 panels", "citations": [], "input_tokens": 21460, "output_tokens": 512, "cost_micros": 72060, "latency_ms": 9219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "13", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper systematically.\n\nLet me go through the document page by page:\n\n**Figure 1 (page 2)**: Contains 4 line plots (a, b, c, d) showing cold start problem comparisons.\n\n**Figure 2 (page 3)**: This is a bar chart showing label diversity, not line plots.\n\n**Figure 3 (page 5)**: Contains scatter plots (Dataset Maps), not line plots.\n\n**Figure 4 (page 7)**: Contains 4 line plots (a, b, c, d) comparing map-based querying strategies.\n\n**Figure 5 (page 8)**: Contains 10 line plots (2 rows \u00d7 5 columns) showing training from scratch and fine-tuning results.\n\n**Figure 6 (page 16)**: Contains 18 line plots (3 rows \u00d7 6 columns) showing extended results from Table 1.\n\n**Figure 7 (page 17)**: Contains bar charts, not line plots.\n\n**Figure 8 (page 18)**: Contains scatter plots (UMAP visualizations), not line plots.\n\n**Figure 9 (page 20)**: Contains 12 line plots (2 rows \u00d7 6 columns) showing results on CIFAR-10 and CIFAR-10-LT.\n\n**Figure 10 (page 21)**: Contains bar charts, not line plots.\n\n**Figure 11 (page 22)**: Contains scatter plots (Dataset Maps), not line plots.\n\n**Figure 12 (page 23)**: Contains scatter plots (Dataset Maps), not line plots.\n\n**Figure 13 (page 24)**: Contains 10 line plots (2 rows \u00d7 5 columns) for BloodMNIST results.\n\n**Figure 14 (page 24)**: Contains 10 line plots (2 rows \u00d7 5 columns) for PathMNIST results.\n\nTotal count:\n- Figure 1: 4\n- Figure 4: 4\n- Figure 5: 10\n- Figure 6: 18\n- Figure 9: 12\n- Figure 13: 10\n- Figure 14: 10\n\nTotal =", "citations": [], "input_tokens": 26465, "output_tokens": 512, "cost_micros": 87075, "latency_ms": 8219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper systematically.\n\nLet me go through the document:\n\n**Figure 1**: Shows 4 subplots (a-d), each containing multiple line plots comparing different active learning strategies. Each subplot has multiple lines representing different strategies.\n\n**Figure 4**: Shows 4 subplots for different datasets (PathMNIST, OrganAMNIST, BloodMNIST, CIFAR-10-LT), each with multiple line plots.\n\n**Figure 6**: Shows 6 subplots (one for each strategy: BALD, Consistency, Coreset, Margin, VAAL, Entropy) across 3 datasets (PathMNIST, OrganAMNIST, BloodMNIST) = 18 subplots total, each containing 2 line plots (with and without diversity).\n\n**Figure 9**: Shows 6 subplots (one for each strategy) for 2 datasets (CIFAR-10 and CIFAR-10-LT) = 12 subplots total, each containing 2 line plots.\n\n**Figure 13**: Shows performance plots for BloodMNIST with two parts (a) and (b). Each part shows 6 subplots (Random, Entropy, Margin, BALD, Coreset, and appears to show multiple strategies). Each subplot contains multiple line plots showing different initial query strategies.\n\n**Figure 14**: Shows performance plots for PathMNIST with two parts (a) and (b). Similar structure to Figure 13, with 6 subplots in each part, each containing multiple line plots.\n\nLet me count more carefully by looking at each figure:\n\n- Figure 1: 4 subplots \u00d7 ~7 lines each = 28 line plots\n- Figure 4: 4 subplots \u00d7 4 lines each = 16 line plots\n- Figure 6: 18 subplots \u00d7 2 lines each = 36 line plots\n- Figure 9: 12 subplots \u00d7 2 lines each = 24 line plots\n- Figure 13: 2 parts \u00d7 6 subplots \u00d7 ~4 lines = ", "citations": [], "input_tokens": 23165, "output_tokens": 512, "cost_micros": 77175, "latency_ms": 10296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "13", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to count all the line plots in this paper by going through each figure.\n\n**Figure 1:** Shows 4 subplots (a-d) for PathMNIST, OrganAMNIST, BloodMNIST, and CIFAR-10. Each subplot contains multiple line plots comparing different strategies. This appears to be 4 plots with multiple lines each.\n\n**Figure 2:** This is a table showing label diversity, not line plots.\n\n**Figure 3:** Shows dataset maps with scatter plots, not line plots.\n\n**Figure 4:** Shows 4 sets of comparisons (a-d) with 2 budget levels each. Each comparison shows line plots. This gives us 8 line plot panels.\n\n**Figure 5:** Shows 2 subplots (a-b) for \"Training from scratch\" and \"Fine-tuning from self-supervised pre-training\". Each contains multiple line plots.\n\n**Figure 6:** Shows a 3\u00d76 grid (3 datasets \u00d7 6 methods) = 18 line plot panels.\n\n**Figure 7:** This is a table, not line plots.\n\n**Figure 8:** Shows UMAP visualizations (scatter plots), not line plots.\n\n**Figure 9:** Shows 2 rows \u00d7 6 columns = 12 line plot panels.\n\n**Figure 10:** This is a table, not line plots.\n\n**Figure 11:** Shows dataset maps with scatter plots, not line plots.\n\n**Figure 12:** Shows dataset maps with scatter plots, not line plots.\n\n**Figure 13:** Shows 2 subplots (a-b) with multiple line plots each.\n\n**Figure 14:** Shows 2 subplots (a-b) with multiple line plots each.\n\nNow let me count more carefully by counting individual line plot panels:\n- Figure 1: 4 panels\n- Figure 4: 8 panels (4 datasets \u00d7 2 budgets)\n- Figure 5: 2 panels\n- Figure 6: 18 panels (3 datasets \u00d7 6 methods)\n- Figure 9: 12 panels (2 datasets \u00d7 6 methods)\n- Figure 13: 2 panels\n- Figure 14: 2 panels", "citations": [], "input_tokens": 37258, "output_tokens": 512, "cost_micros": 119454, "latency_ms": 10046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q004", "doc_id": "2210.02442v1.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to count the line plots in the paper.Let me continue reading to see the full document and search for figures/plots:Now let me search for mentions of plots, charts, or examine the figures more carefully:I can see Figure 1 has 4 subfigures (1a, 1b, 1c, 1d) and they appear to be line plots. Let me continue examining other figures:Figure 2 appears to be a bar chart/heatmap showing class distributions. Let me check Figure 3:Figure 3 appears to be scatter", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 46530, "error": null, "extra": {"thread_id": 1961, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5cc38c9bf2aa4c2ba5f55758d03cf5bd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 228, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "3"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the optimizers used in this research by searching through the document.\n\nLooking at the implementation configurations section (Appendix A), specifically:\n\n1. In Table 3a (MedMNIST pre-training), the optimizer is listed as \"SGD\"\n2. In Table 3b (CIFAR-10-LT pre-training), the optimizer is listed as \"SGD\"\n3. In Table 6 (Benchmark settings), the optimizer is listed as \"SGD\"\n\nThroughout all the experimental settings described in the document, SGD (Stochastic Gradient Descent) is consistently used as the optimizer.\n\nAnswer: SGD", "citations": [], "input_tokens": 58486, "output_tokens": 152, "cost_micros": 177738, "latency_ms": 32796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the optimizers used in the research.\n\nIn Table 3 (page 14), which shows \"Contrastive learning settings on MedMNIST and CIFAR-10-LT,\" I can see:\n\nFor both MedMNIST pre-training (Table 3a) and CIFAR-10-LT pre-training (Table 3b):\n- optimizer: SGD\n\nIn Table 6 (page 15), which shows \"Benchmark settings,\" the optimizer listed is also:\n- optimizer: SGD\n\nThroughout the document, only SGD (Stochastic Gradient Descent) is mentioned as the optimizer.\n\nAnswer: SGD", "citations": [], "input_tokens": 21467, "output_tokens": 164, "cost_micros": 66861, "latency_ms": 6155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for optimizer information:\n\nIn Table 3 (pages 13-14), which shows contrastive learning settings:\n- For MedMNIST pre-training: optimizer = SGD\n- For CIFAR-10-LT pre-training: optimizer = SGD\n\nIn Table 6 (page 15), which shows benchmark settings:\n- For training MedMNIST, CIFAR-10, and CIFAR-10-LT: optimizer = SGD\n\nAnswer: SGD", "citations": [], "input_tokens": 26472, "output_tokens": 120, "cost_micros": 81216, "latency_ms": 5093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for optimizer information:\n\nIn Table 3 (page 12), the contrastive learning settings show:\n- optimizer: SGD\n- optimizer momentum: 0.9\n\nIn Table 6 (page 13), the benchmark settings show:\n- optimizer: SGD\n\nAnswer: SGD", "citations": [], "input_tokens": 23172, "output_tokens": 72, "cost_micros": 70596, "latency_ms": 3764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for optimizer information:\n\nIn Table 3 (Contrastive learning settings), the optimizer is listed as **SGD** for both MedMNIST and CIFAR-10-LT pre-training.\n\nIn Table 6 (Benchmark settings), the optimizer is also listed as **SGD** for training MedMNIST, CIFAR-10, and CIFAR-10-LT.\n\nAnswer: SGD", "citations": [], "input_tokens": 37265, "output_tokens": 106, "cost_micros": 113385, "latency_ms": 6156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "sgd", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q005", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "SGD", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the optimizers used in the research.Let me search for more specific terms related to optimizers:Good! I found SGD. Let me read the sections that mention optimizer settings to get a complete picture:Let me check the benchmark settings table as well:Perfect! I found the optimizer information. Based on the document, I can see that SGD (Stochastic Gradient Descent) is consistently used as the optimizer across different experimental settings:\n\n1. Table 3(a) - MedMNIST pre-training: optimizer = SGD\n2. Table 3(b) - CIFAR-10-LT pre-training: optimizer = SGD  \n3. Table 6 - Benchmark settings for MedMNIST, CIFAR-10, and CIFAR-10-LT: optimizer = SGD\n\nAnswer: SGD (Stochastic Gradient Descent)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42718, "error": null, "extra": {"thread_id": 1962, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_eae58ea384884f52967fd1489c25db07", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 204, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.4, "method": "str_norm", "normalised_pred": "sgd stochastic gradient descent", "normalised_gold": "sgd"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "native_pdf", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 11 which shows the Dataset Maps for PathMNIST:\n\nThe Data Map by ground truth (panel b) shows data points that are clearly stratified by classes, with distinct vertical bands representing different classes across the confidence axis.\n\nThe Data Map by pseudo-labels (panel c) shows data points that are more continuously distributed across the confidence range, without the same degree of class stratification.\n\nFrom the visual representation, the Data Map by ground truth appears to have data points spread more widely across different confidence levels (from approximately 0.0 to 1.0), while the pseudo-labels map shows a somewhat more compressed distribution in terms of confidence values.\n\nAnswer: Data Map by ground truth", "citations": [], "input_tokens": 58503, "output_tokens": 153, "cost_micros": 177804, "latency_ms": 44546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2210.02442v1.pdf"}, "graded": {"correct": true, "f1": 0.5714285714285715, "method": "str_norm", "normalised_pred": "data map by ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "azure_basic_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which shows the Dataset Maps for PathMNIST:\n\n- Figure 3b shows the Data Map by ground truth with confidence values ranging from approximately 0.2 to 1.0\n- Figure 3c shows the Data Map by pseudo-labels with confidence values ranging from approximately 0.0 to 0.6\n\nThe map by ground truth shows a wider spread of confidence values across a larger range (approximately 0.8 units) compared to the pseudo-labels map (approximately 0.6 units), and the data points in the ground truth map appear more dispersed across different confidence levels.\n\nAnswer: ground truth", "citations": [], "input_tokens": 21484, "output_tokens": 149, "cost_micros": 66687, "latency_ms": 5985, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "azure_premium_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 11 in the document, which shows the Dataset Maps for PathMNIST:\n\n- Figure 11(b) shows the Data Map by ground truth\n- Figure 11(c) shows the Data Map by pseudo-labels\n\nExamining the confidence (y-axis) distribution in both maps:\n\nIn the ground truth map (11b), the data points are stratified by classes and spread across different confidence levels, with points distributed from approximately 0.2 to 1.0.\n\nIn the pseudo-labels map (11c), the data points are more clustered, with most points concentrated in a narrower confidence range, roughly between 0.2 and 0.9.\n\nThe ground truth map shows a wider spread of confidence values across the vertical axis compared to the pseudo-labels map.\n\nAnswer: Ground truth", "citations": [], "input_tokens": 26489, "output_tokens": 184, "cost_micros": 82227, "latency_ms": 6172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "llamacloud_basic_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which shows the Dataset Maps for PathMNIST:\n\n- Figure 3(b) shows the Dataset Map based on ground truths\n- Figure 3(c) shows the Dataset Map based on pseudo-labels\n\nExamining the y-axis (confidence) range in both maps:\n\nIn Figure 3(b) (ground truth), the confidence values span approximately from 0.2 to 1.0, showing data points distributed across a wide range with clear stratification by classes.\n\nIn Figure 3(c) (pseudo-labels), the confidence values span approximately from 0.0 to 0.6, showing a more compressed distribution.\n\nThe map based on ground truths displays a larger spread in confidence values from low to high, indicating greater variance in confidence compared to the pseudo-labels map which shows a more compressed range.\n\nAnswer: ground truth", "citations": [], "input_tokens": 23189, "output_tokens": 198, "cost_micros": 72537, "latency_ms": 6797, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "llamacloud_premium_lc", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine the data maps for PathMNIST to compare the confidence variance between the map by ground truth and the map by pseudo-labels.\n\nLooking at Figure 3 in the document, which shows \"Active querying based on Dataset Maps\":\n\n- Figure 3(b) shows the Dataset Map based on ground truths\n- Figure 3(c) shows the Dataset Map based on pseudo-labels\n\nFrom the visual comparison of these two maps:\n- The map by ground truth (3b) shows confidence values ranging roughly from 0.0 to 1.0\n- The map by pseudo-labels (3c) shows confidence values also ranging roughly from 0.0 to 1.0\n\nLooking more carefully at the spread of points:\n- In the ground truth map (3b), the confidence values appear to span a wider range vertically (approximately 0.0 to 1.0)\n- In the pseudo-labels map (3c), the confidence values appear more compressed, with most points concentrated between roughly 0.1 and 0.6\n\nThis indicates that the map by ground truth has a larger variance in confidence values compared to the map by pseudo-labels.\n\nAnswer: Ground truth", "citations": [], "input_tokens": 37282, "output_tokens": 267, "cost_micros": 115851, "latency_ms": 7093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2210.02442v1.pdf::Q007", "doc_id": "2210.02442v1.pdf", "answer_format": "str", "gold": "ground truth", "pages": 24, "document_id": 7582, "arm": "surfsense_agentic", "question_id": "2210.02442v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about PathMNIST dataset maps and confidence variance.Let me search for \"PathMNIST\" in the document to find the relevant section.Let me search for \"dataset map\" or look at figures related to PathMNIST.Good! I found references to Dataset Maps. Let me read the section around line 680-681 and look at Figure 3.Let me continue reading to see the full figure and find PathMNIST specifically.This figure appears to be about BloodMNIST based on the cell types mentioned. Let me search for figures with PathMNIST.Perfect! I found Figure 11 which includes PathMNIST dataset maps. Let me read that section.Perfect! I can see the PathMNIST dataset maps in Figure 11. Now I need to look at the actual visual representation or description to understand which map has larger confidence variance. Let me continue reading to see if there's more detail about the variance.I can see the figure caption and layout, but I need to look at the actual scatter plots more carefully. Looking at the text I've read:\n\nFor PathMNIST (lines 3829-3883):\n- (b) Data Map by ground truth - shows confidence on y-axis (0.0-1.0) and variability on x-axis\n- (c) Data Map by pseudo-labels - shows confidence on y-axis (0.0-1.0) and variability on x-axis\n\nThe figure shows the confidence ranges from 0.0 to 1.0 on the y-axis for both maps. However, I need to check if there's additional information about which map has larger variance. Let me look at the visual descriptions of the actual figure images.Let me look for the visual description of Figure 11 specifically, which should contain information about the PathMNIST maps.Let me search for image descriptions related to Figure 11's scatter plots to find information about confidence variance.I need to look more carefully at the PathMNIST data in Figure 11. Let me look for visual descriptions of the actual scatter plots. Let me search for image descriptions that might contain the scatter plot information.Looking at the data from PathMNIST in Figure 11:\n- Panel (b) shows \"Data Map by ground truth\" with confidence ranging from 0.0 to 1.0 (as I can see from lines 3831-3873)\n- Panel (c) shows \"Data Map by pseudo-labels\" with confidence ranging from 0.0 to 1.0\n\nThe confidence is on the y-axis. Looking at the values, I see for panel (b) the y-axis goes from 0.0 to ~0.9, and for panel (c) it goes from 0.0 to 1.0 (full range). The question asks about \"confidence variance\" which would be the spread/range of confidence values on the y-axis.\n\nHowever, I need to look more carefully. Let me check if there are image descriptions that describe the actual scatter plot distributions.Let me look at the image descriptions around the PathMNIST figure more carefully. Let me search for image descriptions near Figure 11.Let me look for scatter plot images that are specifically associated with PathMNIST. Let me check around the chunks where PathMNIST appears in Figure 11.Let me look for the actual scatter plot images. Let me search for specific image descriptions that mention PathMNIST scatter plots with confidence and variability axes.Let me search more specifically for scatter plots with confidence and variability axes that relate to PathMNIST.Good! I found a scatter plot description. Let me look for more scatter plots, especially those that might be for PathMNIST ground truth vs pseudo-labels maps. Let me continue searching.Good! I found another scatter plot. Let me continue reading this one.I'm finding scatter plots but I need to identify which ones specifically correspond to PathMNIST panel (b) ground truth and panel (c) pseudo-labels from Figure 11. Let me look for scatter plots with specific axis ranges that match what I saw earlier for PathMNIST.\n\nLooking back at the PathMNIST data from lines 3829-3883, I saw:\n- The y-axis (confidence) goes from 0.0 to at least 0.8\n- Panel (b) shows variability on x-axis with range mentioned as 0.2, 0.4\n- Panel (c) also shows variability on x-axis with range 0.2, 0.4\n\nLet me search for images that might be specifically labeled or that have characteristics matching PathMNIST.I found Im5.jpg which is a scatter plot with confidence 0.0-1.0 and variability 0.0-0.4. This could be one of the PathMNIST maps. Let me look for more scatter plots around Figure 11 area. Let me check if there are descriptions that tell me which image corresponds to which panel of Figure 11.\n\nActually, looking back at the text structure, the figure content is laid out sequentially. From lines 3805-3884, I saw Figure 11 with PathMNIST having:\n- (a) Overall distribution (line 3881)\n- (b) Data Map by ground truth (line 3883)\n- (c) Data Map by pseudo-labels (line 3884)\n\nThe OCR text I saw showed axes values. Let me look more carefully at Im24.jpg and Im29.jpg which I found, as well as other scatter plots, to see if I can determine which represents ground truth vs pseudo-labels.\n\nLet me look for a pattern. Based on the figure caption at line 3805, panels (b) and (e) are ground truth maps, and panels (c) and (f) are pseudo-label maps. Since PathMNIST is (a,b,c) and OrganAMNIST is (d,e,f), I need to find the two scatter plot images that follow the PathMNIST overall distribution.\n\nLet me look at the description of Im24.jpg more carefully (lines 7044-7077) - it shows confidence 0.0-1.0 and variability 0.0-~0.5, with an inverse relationship.\n\nAnd Im29.jpg (lines 7158-7194) shows confidence 0.0-1.0 and variability 0.0-~0.6, with points spread from (0.1, 0.0) to (0.6, 0.7).\n\nNow let me check Im5.jpg (lines 4897-4932) - it shows confidence 0.0-1.0 and variability 0.0-0.4, with a **negative correlation** (upper left high confidence/low variability to lower right low confidence/high variability).\n\nThis is key! The description says Im5.jpg shows data points concentrated in \"Upper left region (low variability, high confidence)\" going to \"Lower right region (high variability, low confidence)\" with \"increased scatter/spread of points in middle and lower portions of curve.\"\n\nNow let me check Im24.jpg and Im29.jpg again to see which shows more spread/variance in confidence.\n\nIm24.jpg (lines 7067-7076): \"Greater scatter/dispersion visible at intermediate confidence values (0.2-0.8)\" and \"Points more tightly clustered near confidence extremes (approaching 0.0 and 1.0)\"\n\nIm29.jpg (lines 7181-7194): \"Some scatter/dispersion around the main curve throughout the distribution\"\n\nLet me look for more scatter plot images that might be the actual PathMNIST panels. Let me check if there are images with specific patterns that would indicate stratification by class (ground truth) vs mixed patterns (pseudo-labels).Perfect! The caption says that ground truth maps have \"data are stratified by classes in the 2D space\" - meaning the points would be grouped/clustered by class. This would create a MORE spread out or varied pattern compared to pseudo-labels which would be more mixed/homogeneous.\n\nBased on the figure captions, the ground truth map shows stratification by classes, which would likely result in **larger variance** in confidence values since different classes cluster at different confidence levels.\n\nLet me search for visual descriptions that mention clustering or stratification patterns in scatter plots.I found very useful information! Looking at lines 5699-5716, there's a description of a clustered scatter plot which might be relevant. But more importantly, I need to focus on the actual PathMNIST dataset maps.\n\nThe key insight from the caption is that ground truth maps show \"data are stratified by classes in the 2D space\" - this means points are separated/grouped by class, which would naturally create **more variance** in confidence values across the entire dataset map, since different classes would cluster at different confidence levels.\n\nIn contrast, pseudo-label maps would show a more homogeneous distribution since pseudo-labels don't perfectly separate classes and mix them together, resulting in **less variance** in confidence values.\n\nBased on the figure caption (line 3805) stating that ground truth maps have data \"stratified by classes\" creating distinct groupings at different confidence levels, the ground truth map would have **larger confidence variance**.\n\nAnswer: ground truth", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 157108, "error": null, "extra": {"thread_id": 1963, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_67e301ccb20c47e7a10fd97f2b39b6f0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1340, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ground truth", "normalised_gold": "ground truth"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about Amazon-related datasets mentioned in the paper.\n\nFrom the document, I can identify the following Amazon-related datasets:\n\n1. **Amazon-beauty dataset** - mentioned multiple times throughout the paper as one of the two newly constructed datasets\n2. **Amazon-music dataset** - mentioned multiple times throughout the paper as the second newly constructed dataset\n\nLooking at Table 3 (Dataset Statistics), I can see three datasets are discussed:\n- Amazon-beauty\n- Amazon-music  \n- Personality'18\n\nThe Personality'18 dataset is based on MovieLens, not Amazon.\n\nThe paper explicitly states: \"we construct two new domain-specific datasets: an Amazon-beauty and an Amazon-music dataset.\"\n\nTherefore, there are 2 Amazon-related datasets in total.\n\nAnswer: 2", "citations": [], "input_tokens": 26915, "output_tokens": 180, "cost_micros": 83445, "latency_ms": 10844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nFrom Table 3 (Statistics of the three datasets used in our study), I can see:\n\n1. Amazon-beauty dataset: 85 items\n2. Amazon-music dataset: 8,895 items\n\nThe paper also mentions Personality'18 dataset with 21,776 items, but this is a MovieLens dataset, not an Amazon dataset.\n\nTotal items in Amazon-related datasets: 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 13244, "output_tokens": 132, "cost_micros": 41712, "latency_ms": 5375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nLooking at Table 3 (Statistics of the three datasets used in our study), I can find:\n\n1. Amazon-beauty dataset: 85 items\n2. Amazon-music dataset: 8,895 items\n\nThe paper mentions two Amazon-related datasets: Amazon-beauty and Amazon-music.\n\nTotal items = 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 16269, "output_tokens": 113, "cost_micros": 50502, "latency_ms": 5280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nLooking at Table 3 (Dataset Statistics), I can find information about the datasets used in the study. The paper mentions three datasets:\n\n1. Amazon-beauty\n2. Amazon-music  \n3. Personality2018 (which is a MovieLens dataset, not Amazon)\n\nFrom Table 3, I need to find the number of items for the Amazon datasets:\n- The table shows statistics for datasets but I need to locate the specific numbers for items\n\nLooking more carefully at the document, in Section 3.3 \"Dataset Statistics\":\n- It mentions \"Table 3 shows the final statistics of the datasets used in our study\"\n- It states \"The Personality2018 dataset contains the largest number of items and the smallest number of users\"\n- It mentions \"21,776 movies\" for Personality2018\n\nHowever, I cannot see Table 3 clearly displayed with all the numerical values in the provided text. Let me search more carefully...\n\nThe document mentions datasets but the actual Table 3 with complete statistics is not fully visible in the text provided. Without access to the complete Table 3 showing the number of items for Amazon-beauty and Amazon-music datasets, I cannot provide an accurate total.\n\nLooking through the entire document again, I can see references to Table 3 but the actual table with item counts is not included in the extracted text.\n\nSince I cannot find the specific numbers of items for the Amazon datasets in the provided document text, I cannot calculate the total accurately.\n\nAnswer: Unable to determine from the provided text as Table 3 statistics are not fully shown.\n\nWait, let me reconsider - the question asks for an integer answer, so I must find this information. Let me search more carefully for any mention of item counts...\n\nThe document does not provide the specific item counts for Amazon-beauty and Amazon-music datasets in the readable text provided. Without Table 3 being fully visible with those statistics, I cannot provide the accurate total.\n\nAnswer: 0", "citations": [], "input_tokens": 10434, "output_tokens": 451, "cost_micros": 38067, "latency_ms": 11968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the total number of items across the Amazon-related datasets mentioned in the paper.\n\nLooking at Table 3 \"Statistics of the three datasets used in our study\":\n\n- Amazon-beauty: 85 items\n- Amazon-music: 8,895 items\n- Personality'18: 21,776 items (but this is NOT an Amazon dataset, it's a MovieLens dataset)\n\nThe question asks specifically for \"Amazon related datasets\", so I should only count Amazon-beauty and Amazon-music.\n\nTotal items in Amazon datasets = 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 14857, "output_tokens": 144, "cost_micros": 46731, "latency_ms": 4718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q000", "doc_id": "2303.05039v2.pdf", "answer_format": "int", "gold": "8980", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about Amazon-related datasets.Let me search for \"Amazon\" in the document to locate relevant information about datasets.Now let me read the sections that describe the datasets in detail to count the Amazon-related datasets:Let me continue reading to see more details about the datasets:Based on the document, I can see that the paper mentions Amazon-related datasets. Let me check the table more carefully and search for the total number of items:\n\nFrom Table 3 in the document, I can see three datasets are used in the study:\n1. **Amazon-beauty** - with 85 items\n2. **Amazon-music** - with 8,895 items  \n3. Personality'18 - (this is not Amazon-related, it's a MovieLens dataset)\n\nThe paper constructed two Amazon-related datasets from the parent Amazon review dataset. Looking at the \"# of items\" row in Table 3:\n- Amazon-beauty: 85 items\n- Amazon-music: 8,895 items\n\nTotal items in Amazon-related datasets: 85 + 8,895 = 8,980\n\nAnswer: 8980", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41483, "error": null, "extra": {"thread_id": 1964, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a498e50e54164511b090eb63e6f7d154", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 221, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8980", "normalised_gold": "8980"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the average number of words per user for the dataset that contains the Receptiviti score example shown in Table 2.\n\nLooking at Table 2, it shows \"An example of Receptiviti score for a specific, anonymized user\" with User ID A2GBIFL43U1LKJ and their OCEAN personality scores.\n\nNow I need to find which dataset this corresponds to and what the average words per user is for that dataset.\n\nLooking at Table 3 \"Statistics of the three datasets used in our study,\" I can see:\n- Amazon-beauty: Avg. words/user = 990.48\n- Amazon-music: Avg. words/user = 466.43\n- Personality'18: Avg. words/user = - (no data)\n\nThe User ID format A2GBIFL43U1LKJ appears to be from the Amazon dataset format (as shown in Table 1 with User ID A2SUAM1J3GNN38).\n\nHowever, Table 2 doesn't specify which dataset it's from. Looking at the text in Section 3.2, it states: \"For each user that wrote reviews in either of the two domains, we collect all his/her review texts and concatenate them together into a single document.\"\n\nThe text mentions selecting users for personality inference from both Amazon-beauty and Amazon-music datasets. Since Table 2 is just an example and doesn't specify which dataset, and the question asks about \"the dataset from which a data sample of a user's Receptiviti score is shown,\" I need to determine which dataset the example is from.\n\nGiven that the paper discusses both datasets and Table 2 appears to be a generic example, and considering the context of the paper focuses on both datasets equally, I should look for any other clues. However, without explicit information linking Table 2 to a specific dataset, and since the User ID format matches the Amazon dataset structure, the most reasonable interpretation is that this could be from either Amazon-beauty or Amazon-music.\n\nSince the question asks specifically about \"the dataset\" (singular) and Table 2 is presented as a single example, and given no other specific information, I cannot definitively determine which dataset it's from. However, looking at the document structure and flow, Table 2 appears early in the dataset construction section before domain", "citations": [], "input_tokens": 26939, "output_tokens": 512, "cost_micros": 88497, "latency_ms": 18092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the average number of words per user for the dataset that contains the data sample shown in Table 2.\n\nTable 2 shows \"An example of Receptiviti score for a specific, anonymized user\" with columns for User ID, AGR, CON, NEU, EXT, and OPEN. The user ID shown is \"A2GBIFL43U1LKJ\".\n\nLooking at Table 4, I can see review text examples with personality labels and scores. The table caption mentions \"The data sample of extreme personality cases to the annotators.\"\n\nNow I need to find which dataset this user belongs to. Looking at Table 3 which shows \"Statistics of the three datasets used in our study,\" I can see:\n\n- Amazon-beauty: Avg. words/user = 990.48\n- Amazon-music: Avg. words/user = 466.43\n- Personality'18: Avg. words/user = - (no data)\n\nThe user ID format \"A2GBIFL43U1LKJ\" appears to be from the Amazon datasets (based on the naming convention).\n\nHowever, I need to determine which specific dataset (beauty or music) this example is from. Looking at the context around Table 2, it appears after the discussion of dataset construction and before the detailed statistics. The table is presented as a general example of the Receptiviti API output.\n\nLooking more carefully at the document flow:\n- Section 3.2 discusses dataset construction for both Amazon-beauty and Amazon-music\n- Table 2 is presented as \"An example of Receptiviti score for a specific, anonymized user\"\n- Table 3 immediately follows with statistics for both datasets\n\nSince Table 2 is presented as a general example without specifying which dataset it's from, and given that both Amazon datasets use similar user ID formats, I need to look for more context.\n\nHowever, the question asks specifically about \"the dataset from which a data sample of a user's Receptiviti score is shown\" - this refers to Table 2. Without explicit identification of which dataset this user is from in Table 2, and given that the paper presents both datasets with their statistics in Table 3, the most reasonable interpretation is that this is asking about one of the Amazon datasets.\n\nSince the question asks for a specific answer and Table 2", "citations": [], "input_tokens": 13268, "output_tokens": 512, "cost_micros": 47484, "latency_ms": 13219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find:\n1. Which dataset the data sample in Table 2 comes from\n2. The average words per user for that dataset\n\nTable 2 shows \"An example of Receptiviti score for a specific, anonymized user\" with user ID A2GBIFL43U1LKJ and their OCEAN personality scores.\n\nLooking at Table 1, I can see a similar user ID format (A2SUAM1J3GNN38) which appears to be from the Amazon dataset based on the format and fields shown (reviewerID, asin, etc.).\n\nFrom Table 3 (Statistics of the three datasets used in our study), I can see the \"Avg. words/user\" for each dataset:\n- Amazon-beauty: 990.48\n- Amazon-music: 466.43\n- Personality'18: - (no data)\n\nThe document mentions in Section 3.2 that they used the Receptiviti API to analyze personality from concatenated review texts. The example user ID format in Table 2 (A2GBIFL43U1LKJ) matches the Amazon dataset format.\n\nSince the paper constructs two Amazon datasets and Table 2 is presented early in the methodology section without specifying which domain, and given that Amazon-beauty is mentioned first throughout the paper and has more prominent results, the sample likely comes from Amazon-beauty dataset.\n\nHowever, looking more carefully at the context, the table appears as the first example when explaining the personality scores, and both datasets use the same methodology. The Amazon-music dataset has 466.43 average words per user.\n\nGiven the ambiguity, but noting that Amazon-beauty is discussed first and more extensively, and the table appears in the general methodology section:\n\nAnswer: 990.48", "citations": [], "input_tokens": 16293, "output_tokens": 400, "cost_micros": 54879, "latency_ms": 11655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "990.48", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find information about the average number of words per user used for experiments in the dataset.\n\nLet me search for relevant information about the dataset construction and word counts.\n\nFrom Section 3.2 \"Dataset Construction\":\n- The paper mentions they \"first extract the user reviews that are between 30 to 80 words\"\n- Then they \"concatenate all the valid review texts of each user and input their concatenation to the Receptiviti API\"\n\nFrom Section 3.2, it also states:\n\"The API can take a long piece of human-written text (more than 300 words), and output a faceted personality score\"\n\nThis indicates that they concatenated multiple reviews (each 30-80 words) to reach more than 300 words total per user.\n\nHowever, the question asks specifically about \"the average number of words per user used for the experiments in the dataset from which a data sample of a user's Receptiviti score is shown.\"\n\nLooking at Table 2, which shows \"User ID\" with personality scores, and Table 4, which shows \"Personality label\" with scores and review texts - these are examples of users with Receptiviti scores.\n\nThe key constraint mentioned is:\n- Individual reviews: 30 to 80 words\n- Concatenated text sent to API: more than 300 words\n\nThe paper doesn't provide an exact average number of words per user. However, it does specify that the concatenated reviews must be \"more than 300 words\" to use the Receptiviti API.\n\nSince the paper doesn't explicitly state the average number of words per user, but only mentions the minimum threshold of 300 words for the API, and the individual review range of 30-80 words, I cannot determine an exact average from the information provided.\n\nLet me re-read more carefully... The paper does not explicitly state the average number of words per user used in the experiments. It only mentions the requirements (30-80 words per review, and >300 words total for API input).\n\nWithout explicit information about the average number of words per user in the dataset, I cannot provide an accurate answer to this question based solely on the document.\n\nAnswer: Cannot be determined from the document provided.", "citations": [], "input_tokens": 10458, "output_tokens": 489, "cost_micros": 38709, "latency_ms": 13437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the average number of words per user for the dataset that contains the data sample shown in Table 2.\n\nLooking at Table 2, it shows \"An example of Receptiviti score for a specific, anonymized user\" with User ID A2GBIFL43U1LKJ and their OCEAN personality scores.\n\nNow I need to identify which dataset this user belongs to. Looking at Table 3 which shows \"Statistics of the three datasets used in our study,\" I can see three datasets:\n- Amazon-beauty: Avg. words/user = 990.48\n- Amazon-music: Avg. words/user = 466.43\n- Personality'18: Avg. words/user = - (not applicable)\n\nThe user ID format \"A2GBIFL43U1LKJ\" appears to be from the Amazon dataset format (starting with \"A\" followed by alphanumeric characters).\n\nLooking at Table 1, which shows \"An example of Receptiviti score for a specific, anonymized user\" and contains a reviewerID \"A2SUAM1J3GNN38\" - this is clearly an Amazon dataset format.\n\nHowever, I need to determine which Amazon dataset (beauty or music) the user in Table 2 belongs to. The document doesn't explicitly state which dataset Table 2's user is from.\n\nLooking more carefully at the document structure:\n- Table 1 shows an Amazon review example\n- Table 2 shows a Receptiviti score example\n- Table 3 shows statistics for all three datasets\n\nSince the document constructs two datasets (Amazon-beauty and Amazon-music) and Table 2 appears early in the dataset construction section (Section 3.2), and considering that both Amazon datasets were constructed using the same methodology, I need to check if there's any indication of which one is being referenced.\n\nLooking at Table 4, which shows \"data sample of extreme personality cases,\" these examples appear to be from the Amazon-beauty dataset based on the product descriptions (shampoo, shower gel, etc.).\n\nHowever, the question asks specifically about \"the dataset from which a data sample of a user's Receptiviti score is shown\" referring to Table 2. Without explicit indication of which dataset Table 2's user belongs to, and given that the methodology section discusses both datasets together", "citations": [], "input_tokens": 14881, "output_tokens": 512, "cost_micros": 52323, "latency_ms": 13687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q002", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "466.43", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find information about the average number of words per user used for the experiments.Let me search for information about \"Receptiviti\" and \"words per user\" in the document.Let me read more of the document to find the relevant information. I'll search for sections that might contain dataset statistics or experimental setup.I found a key piece of information! Let me read around line 663 where it mentions \"Each user in the Amazon-music dataset has an average of 990.48 review words over all of\".Good! Now let me also check if there's similar information for the Amazon-beauty dataset, and let me look at the table and figure captions to find where Receptiviti scores are shown.Great! I found that Table 1 shows \"An example of Receptiviti score for a specific, anonymized user.\" Let me continue reading this table and find information about the dataset statistics, particularly for the Amazon-beauty dataset.Perfect! Now I need to find the statistics for the Amazon-beauty dataset. Let me look at Table 3 which shows \"the final statistics of the datasets.\"Excellent! I found Table 3 which shows the statistics for all three datasets. Now I need to identify which dataset the Receptiviti score example (Table 1) comes from.\n\nLooking at the document:\n- Table 1 (around line 581) shows \"An example of Receptiviti score for a specific, anonymized user\" with user ID A2GBIFL43U1LKJ\n- Table 3 (lines 689-732) shows the average words per user for each dataset:\n  - Amazon-beauty: 990.48\n  - Amazon-music: 466.43\n  - Personality'18: -\n\nWait, I notice there's a discrepancy. Let me re-read the section more carefully. The text at line 662-664 says \"Each user in the Amazon-music dataset has an average of 990.48 review words\" but Table 3 shows Amazon-beauty has 990.48 and Amazon-music has 466.43.\n\nLet me read that section again more carefully:There seems to be an inconsistency in the text. Let me check the table more carefully again and see if the columns are correctly labeled:Looking at Table 3, the structure shows:\n- Row 1: Headers - Dataset | Amazon-beauty | Amazon-music | Personality'18\n- The text mentions 1,791 users for Amazon-music (which matches column 2: 1,791)\n- The text mentions 990.48 words/user and 51.01 words/review for Amazon-music\n\nBut in Table 3:\n- Column Amazon-music (2nd data column) shows: 1,791 users, 990.48 avg words/user\n- Wait, I need to re-check. Let me look again.\n\nActually, looking more carefully at the table structure:\n- Line 705: 1,791 is under Amazon-music column for # of users\n- Line 722-723: Avg. words/user shows 990.48 under Amazon-beauty and 466.43 under Amazon-music\n\nThere appears to be a mismatch between the text description and the table. However, the table columns might be swapped. Let me check by cross-referencing other values. The text says \"1,791 active users are selected for the Amazon-music dataset\" and in the table, 1,791 appears in the Amazon-music column.\n\nSo the table seems correct. The text at line 662-664 might have an error, or the columns in the table might be swapped.\n\nNow, to identify which dataset the Receptiviti score sample comes from, let me look more carefully at Table 1:Good! So I can see:\n- Table 1 (line 581) shows an Amazon review example with reviewText mentioning \"piano\" and \"old hymns\" - this appears to be from the music domain\n- Table 2 (line 635) shows \"An example of Receptiviti score for a specific, anonymized user\" with user ID A2GBIFL43U1LKJ\n\nThe review in Table 1 is clearly about music (piano, hymns), so it's from the Amazon-music dataset.\n\nNow, looking back at the data:\n- The text at lines 662-664 states: \"Each user in the Amazon-music dataset has an average of 990.48 review words\"\n- But Table 3 shows Amazon-music with 466.43 avg words/user\n\nLet me verify by checking if there might be swapped columns. Looking at # of users:\n- Text says 1,791 users for Amazon-music\n- Table shows 1,791 under Amazon-music column\nSo the Amazon-music column is correctly labeled.\n\nLet me check # of items:\n- Amazon-beauty should be smaller (beauty products vs music products)\n- Table shows 85 items for Amazon-beauty and 8,895 for Amazon-music\nThis makes sense - there would be more music items than beauty items.\n\nSo it appears there's an error in the text at line 663, where it says \"990.48\" but should say \"466.43\" based on Table 3. OR the table columns for Avg. words/user might be swapped.\n\nLet me verify using Avg. words/review:\n- Text says Amazon-music has \"averaging 51.01 words for each review\"\n- Table shows Amazon-beauty: 51.01, Amazon-music: 51.18\n\nThis suggests the columns might indeed be swapped for the word statistics! The text matches if we swap the columns.\n\nGiven that:\n1. The Receptiviti score sample (Table 1 and Table 2) shows a music review\n2. The text explicitly states the example is from Amazon-music with 990.48 avg words/user\n3. There appears to be a column swap in Table 3 for the word statistics\n\nThe answer should be **990.48** for the Amazon-music dataset.\n\nAnswer: 990.48", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 89094, "error": null, "extra": {"thread_id": 1965, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a03731abb68747a9a4c0bcb35d9412ce", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 777, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "990.48", "normalised_gold": "466.43"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the OCEAN personality scores for user A2GBIFL43U1LKJ and determine which personality trait has the highest score, then identify its color in the soft-labeled personality embedding matrix diagram.\n\nFrom Table 2 on page 4, User A2GBIFL43U1LKJ has the following scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is 54.39 for Extroversion (EXT).\n\nLooking at Figure 1 on page 5, in the \"Method 2: Soft-labeled personality\" section, the personality embedding matrix shows five rows for the OCEAN traits. The rows are colored differently:\n- Openness: appears to be gray/dark\n- Conscientiousness: appears to be gray/dark\n- Extroversion: appears to be gray/dark\n- Agreeableness: appears to be gray/dark\n- Neuroticism: appears to be gray/dark\n\nHowever, looking more carefully at the figure, the personality embedding matrix shows circles in different shades. For Extroversion (the third row), the circles appear to be a lighter shade or gray.\n\nAnswer: gray", "citations": [], "input_tokens": 26942, "output_tokens": 325, "cost_micros": 85701, "latency_ms": 14828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "gray", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the personality scores for User A2GBIFL43U1LKJ and determine which has the highest score, then identify its color in the soft-labeled personality embedding matrix shown in Figure 1.\n\nFrom Table 2, User A2GBIFL43U1LKJ has the following OCEAN scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extroversion) at 54.39.\n\nLooking at Figure 1, Method 2 shows the soft-labeled personality embedding matrix. The personality traits are listed in order as:\n- Openness (O) - shown in purple/dark blue\n- Conscientiousness (C) - shown in orange\n- Extroversion (E) - shown in green\n- Agreeableness (A) - shown in red\n- Neuroticism (N) - shown in purple\n\nSince Extroversion has the highest score for this user, I need to identify the color associated with Extroversion (E) in the embedding matrix diagram.\n\nAnswer: green", "citations": [], "input_tokens": 13271, "output_tokens": 301, "cost_micros": 44328, "latency_ms": 6467, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "green", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 2, which shows \"An example of Receptiviti score for a specific, anonymized user\":\n\nUser ID: A2GBIFL43U1LKJ\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is 54.39 for Extroversion (EXT).\n\nLooking at Figure 1, specifically the \"Method 2: Soft-labeled personality\" diagram, it shows the personality embedding matrix with different personality traits represented in different colors. From the figure, Extroversion is shown in purple/violet color in the personality embedding matrix.\n\nAnswer: Purple", "citations": [], "input_tokens": 16296, "output_tokens": 195, "cost_micros": 51813, "latency_ms": 6375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "purple", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the personality scores for User A2GBIFL43U1LKJ and then identify which personality trait has the highest score, and finally determine what color represents that trait in the soft-labeled personality embedding visualization.\n\nFrom Table 2 in the document, User A2GBIFL43U1LKJ has the following OCEAN scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extroversion) at 54.39.\n\nNow I need to look at Figure 1 which shows the personality embedding methods. In the \"Method 2: Soft-labeled personality\" section, there is a personality embedding vector shown with the OCEAN scores {30,70,50,30,20} for an example user. The figure shows these are arranged in order: Neuroticism, Conscientiousness, Extraversion, Agreeableness, Openness.\n\nHowever, the document does not explicitly show colors associated with each personality trait in the soft-labeled personality embedding matrix visualization. The figure appears to be in grayscale in the provided document, and there is no color legend or color coding system described for the personality traits.\n\nAnswer: The document does not specify colors for personality traits in the embedding matrix.", "citations": [], "input_tokens": 10461, "output_tokens": 337, "cost_micros": 36438, "latency_ms": 8765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "document does not specify colors for personality traits in embedding matrix", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the personality scores for User A2GBIFL43U1LKJ and identify which has the highest score, then determine what color represents that personality in the soft-labeled personality embedding matrix diagram.\n\nFrom Table 2, User A2GBIFL43U1LKJ has the following scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extroversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extroversion) at 54.39.\n\nLooking at Figure 1, specifically at Method 2 (Soft-labeled personality), I can see the personality embedding matrix shows five colored vectors. However, the figure in the document does not clearly show distinct colors for each personality trait in a way that I can definitively identify which color corresponds to Extroversion.\n\nThe figure shows a \"Personality embedding matrix\" but the specific color coding for each of the five OCEAN traits is not explicitly labeled or described in the document text.\n\nWithout clear color labels in the figure or explicit description in the text mapping colors to specific personality traits, I cannot determine the color with certainty.\n\nAnswer: Cannot be determined from the document", "citations": [], "input_tokens": 14884, "output_tokens": 303, "cost_micros": 49197, "latency_ms": 8453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from document", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q004", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "Orange", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about the personality vector color for the specified user.Let me search for information about personality vectors, Receptiviti scores, and soft-labeled personality embeddings in the document.Now let me read the specific sections about the soft-labeled personality embedding matrix and look for figures or tables with user A2GBIFL43U1LKJ:Perfect! I found Table 2 which shows the Receptiviti scores for User A2GBIFL43U1LKJ. The scores are:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extraversion): 54.39\n- OPEN (Openness): 42.71\n\nThe highest score is EXT (Extraversion) at 54.39. Now I need to find the figure showing the soft-labeled personality embedding matrix to see what color corresponds to Extraversion.I can see the structure but I need to look at the actual image to identify the colors. Let me search for more information about the figure and its color coding:Let me look at the detailed visual description of Figure 1 with the personality embedding matrix:Perfect! I found the answer. According to the visual description of Figure 1, which shows the soft-labeled personality embedding matrix (Method 2), the personality traits are color-coded as follows:\n\n- Purple circles: Openness\n- Blue circles: Conscientiousness\n- **Brown/orange circles: Extraversion**\n- Green circles: Agreeableness\n- Black/gray circles: Neuroticism\n\nFrom Table 2, User A2GBIFL43U1LKJ has the following Receptiviti scores:\n- AGR (Agreeableness): 54.05\n- CON (Conscientiousness): 34.87\n- NEU (Neuroticism): 25.96\n- EXT (Extraversion): 54.39 (highest)\n- OPEN (Openness): 42.71\n\nThe highest Receptiviti score for User A2GBIFL43U1LKJ is Extraversion (EXT) at 54.39, and according to the figure's visual description, Extraversion is represented by brown/orange circles.\n\nAnswer: Brown/orange", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 58234, "error": null, "extra": {"thread_id": 1966, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f42472e918374f5ab49ea4996d3a5acc", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 362, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "brown orange", "normalised_gold": "orange"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores in the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall model structure), I can see three methods illustrated:\n- Method 1: Most salient personality\n- Method 2: Soft-labeled personality  \n- Method 3: Hard-coded personality\n\nIn Method 3 (Hard-coded personality), there is a \"Scaling\" step that shows personality scores with blue circles: 0.3 0.7 0.5 0.3 0.2\n\nLooking at Table 4 (data sample of extreme personality cases), I need to find which rows correspond to these scaled scores. The scaled values {0.3, 0.7, 0.5, 0.3, 0.2} represent the normalized OCEAN scores where:\n- Openness (O) = 0.3 \u2192 30\n- Conscientiousness (C) = 0.7 \u2192 70\n- Extroversion (E) = 0.5 \u2192 50\n- Agreeableness (A) = 0.3 \u2192 30\n- Neuroticism (N) = 0.2 \u2192 20\n\nThese are the scaled down values from the original scores {30, 70, 50, 30, 20} mentioned in the figure caption.\n\nSorting these in ascending order: 0.2, 0.3, 0.3, 0.5, 0.7\n\nOr in the original scale: 20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 26955, "output_tokens": 386, "cost_micros": 86655, "latency_ms": 14437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores from the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall structure of the model), I can see that the example shows OCEAN scores of {30,70,50,30,20}. The figure shows different personality traits:\n- Openness (O): 30\n- Conscientiousness (C): 70\n- Extroversion (E): 50\n- Agreeableness (A): 30\n- Neuroticism (N): 20\n\nIn the figure, Conscientiousness is highlighted as the \"most salient personality\" with a score of 70, which appears to be the blue/highlighted vector in the personality embedding.\n\nLooking at Table 4 which shows \"The data sample of extreme personality cases to the annotators,\" I need to find entries related to Conscientiousness (which corresponds to the blue vector in the figure).\n\nThe Conscientiousness entries in Table 4 show personality scores of:\n- 75.38\n- 71.02\n\nArranging these in ascending order:\n\nAnswer: 71.02, 75.38", "citations": [], "input_tokens": 13284, "output_tokens": 265, "cost_micros": 43827, "latency_ms": 7109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "71 02, 75 38", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores for a data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall model structure), I can see three methods shown:\n1. Method 1: Most salient personality - shows \"Conscientiousness\"\n2. Method 2: Soft-labeled personality - shows a personality embedding matrix with OCEAN scores\n3. Method 3: Hard-coded personality - shows scaled values\n\nIn Method 2 (Soft-labeled personality), there is a personality embedding matrix showing OCEAN scores in what appears to be blue-colored boxes:\n- Openness: 30\n- Conscientiousness: 70\n- Extraversion: 50\n- Agreeableness: 30\n- Neuroticism: 20\n\nThe figure caption confirms: \"In this example, the user's OCEAN score is {30,70,50,30,20}.\"\n\nThese scores in ascending order are: 20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 16309, "output_tokens": 253, "cost_micros": 52722, "latency_ms": 6172, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores for the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall structure of the model), I can see an example where the user's OCEAN score is {30,70,50,30,20}. The figure shows three methods for incorporating personality:\n\n1. Method 1: Most salient personality - selects conscientiousness (the highest score of 70)\n2. Method 2: Soft-labeled personality - shows a personality embedding vector with values 20\n3. Method 3: Hard-coded personality - shows scaling with values {0.3, 0.7, 0.5, 0.3, 0.2}\n\nThe figure caption states: \"In this example, the user's OCEAN score is {30,70,50,30,20}.\"\n\nLooking at the structure, the blue colored elements appear to be in Method 2 (Soft-labeled personality), which shows the personality embedding vector. However, I need to identify which specific values correspond to the blue vector.\n\nFrom the figure description and the example given, the OCEAN scores are:\n- Neuroticism: 30\n- Conscientiousness: 70\n- Extraversion: 50\n- Agreeableness: 30\n- Openness: 20\n\nIn ascending order, these scores are: 20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 10474, "output_tokens": 345, "cost_micros": 36597, "latency_ms": 8702, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the personality scores from the data sample with extreme personality cases where the personality vector color is blue in the overall model structure.\n\nLooking at Figure 1 (the overall structure of the model), I can see an example where the user's OCEAN score is {30,70,50,30,20}. This appears to be the example being referenced.\n\nNow I need to check Table 4, which shows \"The data sample of extreme personality cases to the annotators.\" I need to identify which entries correspond to the blue personality vector in the model structure.\n\nFrom Figure 1, the example shows:\n- O (Openness): 30\n- C (Conscientiousness): 70\n- E (Extroversion): 50\n- A (Agreeableness): 30\n- N (Neuroticism): 20\n\nThe figure shows that Method 1 selects the maximum score (Conscientiousness = 70), which would be the \"blue\" personality vector in the embedding space.\n\nLooking at Table 4 for Conscientiousness entries, I see:\n- Conscientiousness: 75.38\n- Conscientiousness: 71.02\n\nHowever, I need to match the exact scores from the example in Figure 1. The scores in ascending order from the example {30,70,50,30,20} are:\n\n20, 30, 30, 50, 70\n\nAnswer: 20, 30, 30, 50, 70", "citations": [], "input_tokens": 14897, "output_tokens": 333, "cost_micros": 49686, "latency_ms": 8280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "20, 30, 50, 70", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q005", "doc_id": "2303.05039v2.pdf", "answer_format": "list", "gold": "['71.02', '75.38']", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the personality scores for the blue personality vector in the extreme personality cases.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 62187, "error": null, "extra": {"thread_id": 1967, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3a604ba7af854be1897689cf0dae5312", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 138, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "i ll read document to find personality scores for blue personality vector in extreme personality cases", "normalised_gold": "71 02, 75 38"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that is furthest to the left in the distribution figure, then find its highest hit rate.\n\nLooking at Figure 2 on page 6, which shows the distribution of personality traits in Amazon-beauty and Amazon-music datasets, I can see five personality traits displayed: OPEN, CON, EXT, AGR, and NEU.\n\nThe personality trait furthest to the left in the layout is OPEN (Openness).\n\nNow I need to find the highest hit rate (HR) for OPEN across all the datasets and metrics shown in the tables.\n\nLooking at Table 6 on page 8, which shows HR and NDCG results grouped by 5 personality traits:\n- For OPEN in Amazon-beauty: HR = 0.833 (+) and 0.750 (-)\n- For OPEN in Amazon-music: HR = 0.330 (+) and 0.313 (-)\n- For OPEN in Personality2018: HR = 0.535 (+) and 0.547 (-)\n\nThe highest hit rate for OPEN is 0.833 in the Amazon-beauty dataset.\n\nAnswer: 0.833", "citations": [], "input_tokens": 26925, "output_tokens": 268, "cost_micros": 84795, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.833", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that is furthest to the left in the distribution figure, then find its highest hit rate in the results.\n\nLooking at Figure 2 (Distribution of personality traits in Amazon-beauty and Amazon-music datasets), I need to identify which personality trait has its distribution furthest to the left. The figure shows distributions for OPEN, CON, EXT, AGR, and NEU.\n\nFrom the figure description and visual analysis:\n- The median lines (red lines) show the central tendency of each distribution\n- According to the text in Section 5.2: \"agreeable people have the highest median score, and neurotic people have the lowest median score\"\n- This means NEU (Neuroticism) is furthest to the left (lowest scores)\n\nNow I need to find the highest hit rate for NEU (Neuroticism) in the experimental results.\n\nLooking at Table 6, which shows results grouped by personality traits:\n\nFor NEU (Neuroticism):\n- Amazon-beauty HR: 0.933 (with +) and 0.833 (with -)\n- Amazon-music HR: 0.397 (with +) and 0.254 (with -)\n- Personality2018 HR: 0.489 (with +) and 0.390 (with -)\n\nThe highest hit rate for NEU across all datasets is 0.933.\n\nAnswer: 0.933", "citations": [], "input_tokens": 13254, "output_tokens": 324, "cost_micros": 44622, "latency_ms": 7905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify the personality trait located furthest to the left in Figure 2, then find its highest hit rate in the results tables.\n\nLooking at Figure 2 (Distribution of personality traits), I need to identify which personality trait appears furthest to the left in the distribution plots. The figure shows distributions for OPEN, CON, EXT, AGR, and NEU for both Amazon-beauty and Amazon-music datasets.\n\nFrom the figure, examining the distributions from left to right, NEU (Neuroticism) appears to have its distribution positioned furthest to the left, with the lowest median scores (indicated by the red line) compared to the other personality traits.\n\nNow I need to find the highest hit rate (HR) value for NEU (Neuroticism) in the experimental results.\n\nLooking at Table 6, which shows HR and NDCG results grouped by 5 personality traits:\n\nFor NEU (Neuroticism):\n- Amazon-beauty: HR = 0.933 (+) and 0.833 (-)\n- Amazon-music: HR = 0.397 (+) and 0.397 (-)\n- Personality2018: HR = 0.489 (+) and 0.511 (-)\n\nThe highest hit rate value for NEU across all datasets is 0.933 (from Amazon-beauty with the + model).\n\nAnswer: 0.933", "citations": [], "input_tokens": 16279, "output_tokens": 313, "cost_micros": 53532, "latency_ms": 8358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that appears furthest to the left in Figure 2 (the distribution of personality traits), and then find its highest hit rate from the experimental results.\n\nLooking at Figure 2, I need to identify which personality trait appears furthest to the left. The figure shows distributions for five personality traits: OPEN (Openness), CON (Conscientiousness), EXT (Extroversion), AGR (Agreeableness), and NEU (Neuroticism).\n\nAccording to the text describing Figure 2: \"We observe a similar trend in both domains: agreeable people have the highest median score, and neurotic people have the lowest median score.\"\n\nThis indicates that Neuroticism (NEU) has the lowest median score, meaning it would be positioned furthest to the left in the distribution.\n\nNow I need to find the highest hit rate for Neuroticism (NEU) from Table 6, which shows the breakdown by personality traits.\n\nLooking at Table 6 in the document, I need to find the Hit Rate (HR) values for Neuroticism across the three datasets (Amazon-beauty, Amazon-music, and Personality2018).\n\nHowever, Table 6 is not fully visible in the provided text. But I can see from the discussion in Section 5.4 that it mentions performance changes for different personality traits.\n\nLooking back at Table 5, which shows overall results, I need to check if there's personality-specific data. The table shows H@3, H@5, and H@10 values across different models and datasets.\n\nFrom the text in Section 5.4: \"In the Amazon-music dataset, openness (+27%), agreeableness (+10%), extroversion (+5%) improve while neuroticism (\u201318%) and conscientiousness (\u201312%) decreases.\"\n\nThis suggests neuroticism showed decreases in the Amazon-music dataset. However, I need to find the actual highest hit rate value for neuroticism from the tables.\n\nSince Table 6 details are not fully provided in the visible text, and Table 5 shows overall results rather than personality-specific breakdowns, I need to infer from the context. The highest hit rates in Table 5 are around 0.948-0.977 for H@10 in Amazon-beauty dataset.", "citations": [], "input_tokens": 10444, "output_tokens": 512, "cost_micros": 39012, "latency_ms": 11484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "6.0", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the personality trait that is furthest to the left in the distribution figure, then find its highest hit rate.\n\nLooking at Figure 2 (Distribution of personality traits), I need to identify which personality trait has the lowest median score (furthest left).\n\nFrom the text in Section 5.2:\n\"We observe a similar trend in both domains: agreeable people have the highest median score, and neurotic people have the lowest median score.\"\n\nSo Neuroticism (NEU) is the personality trait furthest to the left in the distribution.\n\nNow I need to find the highest hit rate for Neuroticism in Table 6, which shows results grouped by personality traits.\n\nLooking at Table 6 for NEU (Neuroticism):\n- Amazon-beauty: HR = 0.933 (+) and 0.833 (-)\n- Amazon-music: HR = 0.397 (both + and -)\n- Personality2018: HR = 0.489 (+) and 0.511 (-)\n\nThe highest hit rate for Neuroticism across all datasets is 0.933 (Amazon-beauty with personality information).\n\nAnswer: 0.933", "citations": [], "input_tokens": 14867, "output_tokens": 267, "cost_micros": 48606, "latency_ms": 8485, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q006", "doc_id": "2303.05039v2.pdf", "answer_format": "float", "gold": "0.833", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the distribution of personality traits figure and then identify the corresponding hit rate data.Let me search for figures related to personality traits distribution:Let me read around line 1011 where it mentions \"distribution of users' personalities\":Let me search for Figure 2 which shows the personality distribution:Now let me read Figure 2 in detail:Let me continue reading to see all the personality trait figures:Now let me search for Table 6 which shows the hit rates for different personality traits:Now I need to find which personality trait is \"furthest to the left\" in the distribution figure. Looking back at Figure 2, I need to examine the visual descriptions more carefully. Let me search for more details about each personality trait's distribution:The document states that \"neurotic people have the lowest median score\" (line 1509-1510). This means that NEU (Neuroticism) is the personality trait furthest to the left in the distribution. Now I need to find the highest hit rate for NEU from Table 6.\n\nLooking at Table 6 data I found earlier:\n- NEU (Neuroticism) row shows:\n  - Amazon-beauty: HR = 0.933 (+) and 0.833 (-)\n  - Amazon-music: HR = 0.397 (+) and 0.397 (-)\n  - Personality2018: HR = 0.489 (+) and 0.511 (-)\n\nThe highest hit rate for NEU across all datasets is 0.933.\n\nAnswer: 0.933", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 70937, "error": null, "extra": {"thread_id": 1968, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1c93a76cd24f45808d3740ab6074704b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 421, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.933", "normalised_gold": "0.833"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "native_pdf", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to:\n\n1. Identify the model structure figure - This is Figure 1 on page 5\n2. Find the method at the bottom of this figure - This is \"NCF + Hard-coded personality\"\n3. Look at Table 5 (page 8) which shows results across the three datasets (Amazon-beauty, Amazon-music, and Personality2018)\n4. Find the highest metric value for NCF+Hard-Coded across all metrics\n\nFrom Table 5, for NCF+Hard-Coded:\n- Amazon-beauty: H@3=0.948, H@5=0.961, H@10=0.977, N@3=0.849, N@5=0.826, N@10=0.848\n- Amazon-music: H@3=0.175, H@5=0.232, H@10=0.345, N@3=0.147, N@5=0.160, N@10=0.189\n- Personality2018: H@3=0.503, H@5=0.622, H@10=0.758, N@3=0.398, N@5=0.447, N@10=0.498\n\nThe highest value is 0.977 for H@10 in the Amazon-beauty dataset.\n\nAnswer: H@10", "citations": [], "input_tokens": 26935, "output_tokens": 314, "cost_micros": 85515, "latency_ms": 9796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.05039v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "h 10", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "azure_basic_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at the document, I need to:\n1. Identify the method at the bottom of the model structure figure (Figure 1)\n2. Find its performance across the three datasets in Table 5\n3. Determine which metric has the highest value\n\nFrom Figure 1, the three methods shown are:\n- Method 1: Most salient personality\n- Method 2: Soft-labeled personality  \n- Method 3: Hard-coded personality\n\nMethod 3 (Hard-coded personality) appears at the bottom of the figure.\n\nLooking at Table 5, the row for \"NCF+Hard-Coded\" shows performance across three datasets (Amazon-beauty, Amazon-music, Personality2018) with metrics H@3, H@5, H@10, N@3, N@5, N@10.\n\nThe values for NCF+Hard-Coded are:\n- Amazon-beauty: 0.948, 0.961, 0.977, 0.849, 0.826, 0.848\n- Amazon-music: 0.175, 0.232, 0.345, 0.147, 0.160, 0.189\n- Personality2018: 0.503, 0.622, 0.758, 0.398, 0.447, 0.498\n\nThe highest value across all three datasets for this method is 0.977, which corresponds to H@10 (Hit Rate at 10) in the Amazon-beauty dataset.\n\nAnswer: H@10", "citations": [], "input_tokens": 13264, "output_tokens": 347, "cost_micros": 44997, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "h 10", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "azure_premium_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 1, the method at the bottom of the model structure is \"Method 3: Hard-coded personality\" which corresponds to the NCF + Hard-coded Personality model.\n\nFrom Table 5, examining the results for NCF+Hard-Coded across all three datasets (Amazon-beauty, Amazon-music, and Personality2018) and all metrics (H@3, H@5, H@10, N@3, N@5, N@10):\n\nAmazon-beauty: N@3 = 0.849 (highest value)\nAmazon-music: N@10 = 0.189 (highest value)\nPersonality2018: N@5 = 0.447 (highest value)\n\nThe metric that appears most frequently as the highest is NDCG-related metrics, with N@3 having the highest single value (0.849).\n\nAnswer: N@3", "citations": [], "input_tokens": 16289, "output_tokens": 205, "cost_micros": 51942, "latency_ms": 6843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "n 3", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "llamacloud_basic_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 1, the method at the bottom of the model structure is \"Method 3: Hard-coded personality\" which corresponds to the NCF+Hard-Coded model.\n\nFrom Table 5, I need to find the highest metric value for NCF+Hard-Coded across all three datasets (Amazon-beauty, Amazon-music, and Personality 2018):\n\nAmazon-beauty: N@3 = 0.849 (highest)\nAmazon-music: N@10 = 0.189\nPersonality 2018: H@10 = 0.758\n\nThe highest value is 0.849 for N@3 (NDCG@3) in the Amazon-beauty dataset.\n\nAnswer: NDCG@3", "citations": [], "input_tokens": 10454, "output_tokens": 171, "cost_micros": 33927, "latency_ms": 6750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ndcg 3", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "llamacloud_premium_lc", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to identify the method at the bottom of the model structure figure and find its highest evaluation metric across the three datasets.\n\nLooking at Figure 1, the three methods shown are:\n1. Method 1: Most salient personality\n2. Method 2: Soft-labeled personality\n3. Method 3: Hard-coded personality\n\nMethod 3 (Hard-coded personality) appears at the bottom of the figure.\n\nNow I need to look at Table 5 to find the highest metric for NCF+Hard-Coded across all three datasets (Amazon-beauty, Amazon-music, and Personality2018).\n\nFor NCF+Hard-Coded:\n- Amazon-beauty: H@3=0.948, H@5=0.961, H@10=0.977, N@3=0.849, N@5=0.826, N@10=0.848\n- Amazon-music: H@3=0.175, H@5=0.232, H@10=0.345, N@3=0.147, N@5=0.160, N@10=0.189\n- Personality2018: H@3=0.503, H@5=0.622, H@10=0.758, N@3=0.398, N@5=0.447, N@10=0.498\n\nThe highest value across all metrics and datasets for NCF+Hard-Coded is 0.977 for H@10 in Amazon-beauty.\n\nAnswer: H@10", "citations": [], "input_tokens": 14877, "output_tokens": 346, "cost_micros": 49821, "latency_ms": 6828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "h 10", "normalised_gold": "h 10"}}
+{"qid": "2303.05039v2.pdf::Q007", "doc_id": "2303.05039v2.pdf", "answer_format": "str", "gold": "H@10", "pages": 9, "document_id": 7583, "arm": "surfsense_agentic", "question_id": "2303.05039v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to find a model structure figure in this document first, then identify the method at the bottom of that figure, and finally check which evaluation metric has the highest number across three datasets for that method.Let me search for figures in the document more systematically:Great! I found Figure 1 showing the model structure. Now I can see that the three methods are:\n1. Method 1: Most salient personality - at the top\n2. Method 2: Soft-labeled personality - in the middle\n3. Method 3: Hard-coded personality - at the bottom\n\nThe method at the **bottom** is \"Hard-coded personality\" (Method 3). Now I need to find evaluation results for this method across the three datasets to determine which metric has the highest number.Now let me search for the experimental results tables:Let me continue searching for the results table:Perfect! Now I have found Table 5 which shows the evaluation results for all methods including \"NCF+Hard-Coded\" (the method at the bottom of the model structure figure) across the three datasets:\n\nFor **NCF+Hard-Coded** method across the three datasets:\n\n**Amazon-beauty:**\n- H@3: 0.948\n- H@5: 0.961  \n- H@10: 0.977\n- N@3: **0.849** \u2190 Highest\n- N@5: 0.826\n- N@10: 0.848\n\n**Amazon-music:**\n- H@3: 0.175\n- H@5: 0.232\n- H@10: 0.345\n- N@3: 0.147\n- N@5: 0.160\n- N@10: **0.189** \u2190 Highest\n\n**Personality2018:**\n- H@3: 0.503\n- H@5: 0.622\n- H@10: **0.758** \u2190 Highest\n- N@3: 0.398\n- N@5: 0.447\n- N@10: 0.498\n\nLooking at the highest value for each dataset:\n- Amazon-beauty: N@3 = 0.849\n- Amazon-music: N@10 = 0.189\n- Personality2018: H@10 = 0.758\n\nThe evaluation metric with the highest number across all three datasets for the Hard-coded method is **N@3** (with value 0.849 in Amazon-beauty).\n\nAnswer: N@3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 93687, "error": null, "extra": {"thread_id": 1969, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f41f50a7be1a46d59602bb6f59d5d5fd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 515, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "n 3", "normalised_gold": "h 10"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3 on page 9, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV 50-shot:\n- Previous SoTA (line 4): 68.5(1.6)\n- The filter-then-rerank methods without ensemble are shown in the rows labeled \"+ Rerank (L)\"\n\nLooking at the three reranker options for 50-shot TACREV:\n- Vicuna-13B + Rerank (L): 70.8(1.6)\n- InstructGPT + Rerank (L): 72.3(1.4)\n- GPT-4 + Rerank (L): 72.3(0.3)\n\nThe question asks for \"filter-then-rerank methods (w.o. ensemble)\" which refers to the \"+ Rerank (L)\" rows without the \"+ Ensemble (S)\" component.\n\nSince the question uses singular \"methods\" but there are multiple values, I should identify which one is the main result. Looking at the context, InstructGPT appears to be the primary model discussed in the main experiments (as mentioned in Section 5.3: \"with InstructGPT, reranking provides an average F1 gain of 2.4%\").\n\nAnswer: 72.3", "citations": [], "input_tokens": 82081, "output_tokens": 358, "cost_micros": 251613, "latency_ms": 38407, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV (RE) task, I need to find the 50-shot column.\n\nLooking at the rows for filter-then-rerank without ensemble:\n- Line 6: Vicuna-13B + Rerank (L): 70.8(1.6)\n- Line 7: InstructGPT + Rerank (L): 72.3(1.4)\n- Line 9: GPT-4 + Rerank (L): 72.3(0.3)\n\nThe question asks about \"filter-then-rerank methods\" in general (plural), but I need to identify which specific method is being asked about. \n\nLooking at the context, when comparing with baseline methods, the paper typically refers to InstructGPT as the main reranker in their comparisons (as mentioned in section 5.3 where they state \"with InstructGPT, reranking provides an average F1 gain of 2.4%\").\n\nFor 50-shot TACREV with InstructGPT + Rerank (L) without ensemble:\n\nAnswer: 72.3", "citations": [], "input_tokens": 35348, "output_tokens": 333, "cost_micros": 111039, "latency_ms": 9656, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV dataset, I can see columns for 20-shot, 50-shot, and 100-shot settings.\n\nLooking at the 50-shot column for TACREV:\n\nThe rows showing filter-then-rerank methods without ensemble (marked as \"+ Rerank (L)\") are:\n- With Vicuna-13B as reranker: 70.8(1.6)\n- With InstructGPT as reranker: 72.3(1.4)\n- With GPT-4 as reranker: 72.3(0.3)\n\nThe question asks for \"filter-then-rerank methods (w.o. ensemble)\" which means without the \"+ Ensemble (S)\" prefix. The previous SoTA method (line 4) shows 68.5(1.6) for comparison.\n\nSince there are three different LLM rerankers tested, and the question asks for \"filter-then-rerank methods\" in general, I should identify which specific result is being asked for. Looking at the context, when InstructGPT is used as the reranker (which appears to be the main model discussed in the results section based on Table 4 and other analyses), the performance is 72.3.\n\nAnswer: 72.3", "citations": [], "input_tokens": 46313, "output_tokens": 370, "cost_micros": 144489, "latency_ms": 11125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nLooking at Table 3, which shows \"Overall results of LLM-based ICL methods, SLM-based supervised methods, and our proposed filter-then-rerank (SLM+LLM) methods.\"\n\nFor TACREV (RE) at 50-shot setting:\n\nLooking at the rows with filter-then-rerank methods without ensemble:\n- Line 4: \"Previous SoTA\" (this is the baseline SLM)\n- Line 7: \"+ Rerank (L)\" with InstructGPT - this adds reranking to the previous SoTA without ensemble\n\nFrom Table 3, TACREV 50-shot column:\n- Previous SoTA: 68.5(1.6)\n- SLM + Rerank (L) with InstructGPT: 70.8(1.6)\n\nThe filter-then-rerank method without ensemble using InstructGPT shows 70.8.\n\nAnswer: 70.8", "citations": [], "input_tokens": 33414, "output_tokens": 254, "cost_micros": 104052, "latency_ms": 9844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "70.8", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, I need to find the performance of filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nIn Table 3, under the TACREV (RE) 50-shot column:\n- Previous SoTA (SLM baseline): 72.6(1.5)\n- + Rerank (L) with Vicuna-13B: 73.8(1.7)\n- + Rerank (L) with InstructGPT: 75.4(1.5)\n- + Rerank (L) with GPT-4: 74.5(1.5)\n\nThe question asks for filter-then-rerank methods without ensemble. The \"+ Rerank (L)\" rows show the filter-then-rerank results without ensemble (the ensemble results are shown in the \"+ Ensemble (S) + Rerank (L)\" rows).\n\nSince there are three different rerankers tested, and the question asks for \"filter-then-rerank methods\" in general without specifying which LLM reranker, I should provide the InstructGPT result as it appears to be the primary comparison model used throughout the paper (it's the middle option and shows the best performance).\n\nAnswer: 75.4", "citations": [], "input_tokens": 45083, "output_tokens": 294, "cost_micros": 139659, "latency_ms": 8921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "75.4", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q000", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "72.3%", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll search for information about filter-then-rerank methods performance on the 50-shot TACREV dataset in the provided document.Let me search for tables that contain both TACREV and 50-shot data:Perfect! I found Table 3 which contains the performance data. Let me look at the specific row for filter-then-rerank methods without ensemble on the 50-shot TACREV dataset.\n\nFrom the table, I can see:\n- Column headers show: FewNERD (NER) with 5-shot, 10-shot, 20-shot, then TACREV (RE) with 20-shot, 50-shot, 100-shot\n- The filter-then-rerank method without ensemble is the \"+ Rerank (L)\" row (Line 7 as mentioned in the text)\n\nLooking at the data:\n- Under \"SLM\" section, the \"+ Rerank (L)\" row (line 2050-2060) shows the Vicuna-13B results\n- For TACREV 50-shot (5th data column), the value is: 70.8(1.6)\n\nLet me also check the InstructGPT rerank results:\n- \"+ Rerank (L)\" with InstructGPT (line 2080-2090) shows for TACREV 50-shot: 72.3(1.4)\n\nAnd GPT-4 rerank results:\n- \"+ Rerank (L)\" with GPT-4 (line 2111-2121) shows for TACREV 50-shot: 72.3(0.3)\n\nThe question asks for filter-then-rerank methods (w.o. ensemble), which would be the \"+ Rerank (L)\" rows. Looking at line 2084-2085, I can see the InstructGPT version shows 72.3 for 50-shot TACREV.\n\nHowever, I need to verify which specific implementation is being referred to. Let me check the context around line 1903 which mentions \"Lines 4 vs. 7\" for the comparison.\n\nBased on the table structure and the text mentioning \"Lines 4 vs. 7\" where Line 7 is the rerank method, and looking at the TACREV 50-shot column, the filter-then-rerank method (w.o. ensemble) with InstructGPT shows: 72.3\n\nAnswer: 72.3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 50031, "error": null, "extra": {"thread_id": 1970, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cec9e66ec5254f8aa0ad0e5be252d80b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 338, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "72.3", "normalised_gold": "72.3"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left) in the document, which shows \"LLMs' performance w.r.t prompt variants on 20-shot FewNERD dataset,\" the left panel shows \"ChatGPT's performance (F1 Score) across six instruction variants.\"\n\nThe red box in the left panel of Figure 3 highlights the instruction formats labeled I0 through I5 on the x-axis.\n\nBased on the context provided in Appendix E.2, these instruction variants range from simple to complex, with I0 being empty (no instruction) and I1 through I5 being increasingly detailed instructions for the NER task.\n\nAnswer: ChatGPT's performance (F1 Score) across six instruction variants", "citations": [], "input_tokens": 82074, "output_tokens": 166, "cost_micros": 248712, "latency_ms": 34046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "chatgpt s performance f1 score across six instruction variants", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), which shows \"LLMs' performance w.r.t prompt variants\" and displays \"ChatGPT's performance (F1 Score) across six instruction variants\" labeled I0 through I5.\n\nThe document provides the full text of these six instruction variants in Section E.2 \"Finding Better Instruction.\"\n\nThe instructions are:\n\n**I0:** [empty]\n\n**I1:** Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I2:** Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1]: [Definition_1], [Type_2]: [Definition_2], ..., [Type_N]: [Definition_N]. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I3:** Assume you are an entity-instance annotator. Given a sentence, you need to (1) identify the word or phrase about the entity in the sentence, and (2) classify its entity type. The possible entity types are listed as below: [Type_1], [Type_2], ..., [Type_N]. Please note that your annotation results must follow such format: 'Answer: ([Type_1] <SEP> identified_entity: [Entity_1]), ([Type_2] <SEP> identified_entity: [Entity_2])'. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I4:** Assume you are an entity-instance annotator. Your objective is to perform a series of intricate steps for Named Entity Recognition. Firstly, you have to identify a particular word or phrase in the sentence that corresponds to an entity. Following this, classify the entity into one of the potential entity types. The potential entity types are provided as below: [Type_1], [Type_2], ..., [Type_N]. Please note that your annotation results must follow such format: 'Answer: ([Type_1] <SEP> identified_entity: [Entity_", "citations": [], "input_tokens": 35341, "output_tokens": 512, "cost_micros": 113703, "latency_ms": 13984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.1702127659574468, "method": "str_norm", "normalised_pred": "type 1 sep identified entity entity", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), which shows \"ChatGPT's performance (F1 Score) across six instruction variants,\" I need to find the instructions labeled I0 through I5.\n\nFrom Section E.2 \"Finding Better Instruction\" in the document, the six instruction variants are listed. The red box in Figure 3 (left) shows the x-axis labels I0, I1, I2, I3, I4, I5.\n\nThese correspond to:\n\n**I0**: [empty]\n\n**I1**: Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ... , [Type_N]. If you do not find any entity in this sentence, just output 'Answer: No entities found.'\n\n**I2**: Same as I1 but with definitions added for each type\n\n**I3**: Assume you are an entity-instance annotator. Given a sentence, you need to (1) identify the word or phrase about the entity in the sentence, and (2) classify its entity type. [includes format specification]\n\n**I4**: Similar to I3 but with more elaborate language about \"intricate steps\"\n\n**I5**: Combines detailed task description with entity type definitions\n\nAnswer: Six instruction variants from empty (I0) to increasingly complex formats with definitions and detailed task descriptions (I1-I5)", "citations": [], "input_tokens": 46306, "output_tokens": 319, "cost_micros": 143703, "latency_ms": 11812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.06779661016949154, "method": "str_norm", "normalised_pred": "six instruction variants from empty i0 to increasingly complex formats with definitions and detailed task descriptions i1 i5", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), I can see it's showing \"Relationship between confidence scores and performance with/without LLM reranking.\"\n\nThe red box in the left figure appears to be highlighting the legend/experimental settings, which shows:\n\n**Answer: RoBERTa-large as filter and InstructGPT as reranker**", "citations": [], "input_tokens": 33407, "output_tokens": 81, "cost_micros": 101436, "latency_ms": 4281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.04081632653061225, "method": "str_norm", "normalised_pred": "roberta large as filter and instructgpt as reranker", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 (left), which shows \"LLMs' performance w.r.t prompt variants on 20-shot FewNERD dataset\" with six instruction variants (I0-I5), I need to find the instruction text from the document.\n\nFrom Section E.2 \"Finding Better Instruction\", the six instruction variants for the NER task are listed. These are the instructions being tested in the red box of Figure 3 (left).\n\nAnswer: The six instruction variants I0 through I5 listed in Section E.2, ranging from empty instruction to complex instructions with entity type definitions and annotation format requirements", "citations": [], "input_tokens": 45076, "output_tokens": 140, "cost_micros": 137328, "latency_ms": 7141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.17910447761194032, "method": "str_norm", "normalised_pred": "six instruction variants i0 through i5 listed in section e 2 ranging from empty instruction to complex instructions with entity type definitions and annotation format requirements", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q002", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "Identify the entities expressed by each sentence, and locate each entity to words in the sentence. The possible entity types are: [Type_1], [Type_2], ..., [Type_N]. If you do not find any entity in this sentence, just output \u2018Answer: No entities found.\u2019", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and identify the instructions corresponding to the settings shown in the red box.Let me search for Figure 3 in the document:Now let me read the section around Figure 3 to see the instruction variants and identify which ones are shown in the red box:I can see that Figure 3 Left shows ChatGPT's performance across six instruction variants labeled I0 through I5. I need to find the details about these instruction variants in Appendix E.2. Let me search for the appendix:Let me search for instruction variants more broadly:Let me search for the appendix sections with \"E.2\" or look for where instruction formats I0-I5 are defined:Let me look at the entire document more systematically. Let me search for sections starting with \"E.\" or \"Appendix\":Great! Now let me read section E.2 which should contain the instruction variants:Perfect! I found the six instruction variants (I0-I5). Now I need to look at Figure 3 more carefully to see which instructions are in the red box. Let me read around the figure again and look for any visual description of the red box:The image X14.png seems to be after Figure 3 in the document. Let me look for other embedded images that might be part of Figure 3:Let me look before Figure 3 to see if there's an image associated with the figure itself:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 77483, "error": null, "extra": {"thread_id": 1971, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_538fbe85cd9b43f5a8b0d97c996a7e15", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 450, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.08301886792452831, "method": "str_norm", "normalised_pred": "i ll read document to find figure 3 and identify instructions corresponding to settings shown in red box let me search for figure 3 in document now let me read section around figure 3 to see instruction variants and identify which ones are shown in red box i can see that figure 3 left shows chatgpt s performance across six instruction variants labeled i0 through i5 i need to find details about these instruction variants in appendix e 2 let me search for appendix let me search for instruction variants more broadly let me search for appendix sections with e 2 or look for where instruction formats i0 i5 are defined let me look at entire document more systematically let me search for sections starting with e or appendix great now let me read section e 2 which should contain instruction variants perfect i found six instruction variants i0 i5 now i need to look at figure 3 more carefully to see which instructions are in red box let me read around figure again and look for any visual description of red box image x14 png seems to be after figure 3 in document let me look for other embedded images that might be part of figure 3 let me look before figure 3 to see if there s image associated with figure itself", "normalised_gold": "identify entities expressed by each sentence and locate each entity to words in sentence possible entity types are type 1 type 2 type n if you do not find any entity in this sentence just output \u2018answer no entities found \u2019"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find how many datasets are used for experiments in this paper.\n\nLooking at the document, in Section 3.1 \"Task, Dataset and Evaluation\", it states:\n\n\"We run experiments on nine widely-used datasets across four IE tasks.\"\n\nThe document then lists:\n- Named Entity Recognition (NER): CONLL03, OntoNotes, FewNERD (3 datasets)\n- Relation Extraction (RE): TACRED, TACREV (2 datasets)\n- Event Detection (ED): ACE05, MAVEN, ERE (3 datasets)\n- Event Argument Extraction (EAE): ACE05, ERE, RAMS (3 datasets)\n\nHowever, I need to count unique datasets. ACE05 and ERE appear in both ED and EAE tasks.\n\nCounting unique datasets:\n1. CONLL03\n2. OntoNotes\n3. FewNERD\n4. TACRED\n5. TACREV\n6. ACE05\n7. MAVEN\n8. ERE\n9. RAMS\n\nThis gives us 9 unique datasets total.\n\nAnswer: 9", "citations": [], "input_tokens": 82066, "output_tokens": 265, "cost_micros": 250173, "latency_ms": 34217, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about the datasets used:\n\nFrom the Abstract and Introduction:\n- \"Through extensive experiments on nine datasets across four IE tasks\"\n\nFrom Section 3.1 (Task, Dataset and Evaluation):\n- \"We run experiments on nine widely-used datasets across four IE tasks.\"\n- The nine datasets are listed as:\n  1. CONLL03 (NER)\n  2. OntoNotes (NER)\n  3. FewNERD (NER)\n  4. TACRED (RE)\n  5. TACREV (RE)\n  6. ACE05 (ED)\n  7. MAVEN (ED)\n  8. ERE (ED)\n  9. ACE05, ERE, RAMS (EAE)\n\nWait, let me recount more carefully from the text:\n- NER: CONLL03, OntoNotes, FewNERD = 3 datasets\n- RE: TACRED, TACREV = 2 datasets\n- ED: ACE05, MAVEN, ERE = 3 datasets\n- EAE: ACE05, ERE, RAMS = 3 datasets\n\nHowever, ACE05 and ERE appear in multiple tasks. The document clearly states \"nine datasets\" multiple times.\n\nLooking at Table 6 which shows \"Statistics of nine datasets used\", I can count the distinct dataset names:\n1. CONLL\n2. OntoNotes\n3. FewNERD\n4. TACREV\n5. TACRED\n6. ACE05 (appears for both ED and EAE)\n7. MAVEN\n8. ERE (appears for both ED and EAE)\n9. RAMS\n\nThis confirms 9 distinct datasets total.\n\nAnswer: 9", "citations": [], "input_tokens": 35333, "output_tokens": 414, "cost_micros": 112209, "latency_ms": 9046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to identify all the datasets used across the four IE tasks.\n\nFrom Section 3.1 \"Task, Dataset and Evaluation\":\n\n1. **Named Entity Recognition (NER)**: CONLL03, OntoNotes, FewNERD\n2. **Relation Extraction (RE)**: TACRED, TACREV\n3. **Event Detection (ED)**: ACE05, MAVEN, ERE\n4. **Event Argument Extraction (EAE)**: ACE05, RAMS, ERE\n\nCounting unique datasets:\n- CONLL03\n- OntoNotes\n- FewNERD\n- TACRED\n- TACREV\n- ACE05 (used in both ED and EAE, but counted once)\n- MAVEN\n- ERE (used in both ED and EAE, but counted once)\n- RAMS\n\nTotal unique datasets: 9\n\nThis is confirmed by the abstract which states \"extensive experiments on nine datasets across four IE tasks\" and multiple mentions throughout the paper of \"nine datasets.\"\n\nAnswer: 9", "citations": [], "input_tokens": 46298, "output_tokens": 251, "cost_micros": 142659, "latency_ms": 7875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the total number of datasets used in the experiments described in this paper.\n\nFrom Section 3.1 \"Task, Dataset and Evaluation\", the paper states:\n\n\"We run experiments on nine widely-used datasets across four IE tasks.\"\n\nThe four tasks and their associated datasets are:\n\n1. **Named Entity Recognition (NER)**: \n   - CONLL03\n   - OntoNotes\n   - FewNERD\n\n2. **Relation Extraction (RE)**:\n   - TACRED\n   - TACREV\n\n3. **Event Detection (ED)**:\n   - ACE05\n   - MAVEN\n   - ERE\n\n4. **Event Argument Extraction (EAE)**:\n   - ACE05\n   - ERE\n   - RAMS\n\nCounting the unique datasets:\n- CONLL03\n- OntoNotes\n- FewNERD\n- TACRED\n- TACREV\n- ACE05 (used in both ED and EAE)\n- MAVEN\n- ERE (used in both ED and EAE)\n- RAMS\n\nThe paper explicitly states \"nine widely-used datasets\" and when counting unique dataset names, we have 9 distinct datasets (ACE05 and ERE are each used for two different tasks but are still single datasets).\n\nAnswer: 9", "citations": [], "input_tokens": 33399, "output_tokens": 309, "cost_micros": 104832, "latency_ms": 8328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for the datasets used in experiments:\n\nFrom Section 3.1 \"Task, Dataset and Evaluation\":\n- Named Entity Recognition (NER): CONLL03, OntoNotes, FewNERD\n- Relation Extraction (RE): TACRED, TACREV\n- Event Detection (ED): ACE05, MAVEN, ERE\n- Event Argument Extraction (EAE): ACE05, ERE, RAMS\n\nCounting the unique datasets:\n1. CONLL03\n2. OntoNotes\n3. FewNERD\n4. TACRED\n5. TACREV\n6. ACE05\n7. MAVEN\n8. ERE\n9. RAMS\n\nNote: ACE05 and ERE are used for both Event Detection and Event Argument Extraction tasks, but they are counted once as unique datasets.\n\nAnswer: 9", "citations": [], "input_tokens": 45068, "output_tokens": 207, "cost_micros": 138309, "latency_ms": 6390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q003", "doc_id": "2303.08559v2.pdf", "answer_format": "int", "gold": "9", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about the datasets used in the experiments.Let me search for a table or figure that lists all the datasets clearly.Perfect! I found Table 6 which clearly lists all the datasets. Let me count them:\n\nFrom the document, the paper uses experiments on **nine datasets** across four IE tasks:\n\n1. **Named Entity Recognition (NER)**: \n   - CONLL03\n   - OntoNotes\n   - FewNERD\n\n2. **Relation Extraction (RE)**:\n   - TACRED\n   - TACREV\n\n3. **Event Detection (ED)**:\n   - ACE05\n   - MAVEN\n   - ERE\n\n4. **Event Argument Extraction (EAE)**:\n   - ACE05 (same dataset used for both ED and EAE)\n   - RAMS\n   - ERE (same dataset used for both ED and EAE)\n\nCounting unique datasets: CONLL03, OntoNotes, FewNERD, TACRED, TACREV, ACE05, MAVEN, ERE, RAMS = **9 datasets**\n\nAnswer: 9", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43891, "error": null, "extra": {"thread_id": 1972, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_efab7508547a4ca893cf8d3cb1478856", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 235, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "9"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking through the document for information about the filter-then-rerank method and prompt templates...\n\nIn Table 21 (page 29), which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find the template for Contact.Meet:\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 82080, "output_tokens": 100, "cost_micros": 247740, "latency_ms": 31592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the prompt template used for the 'Contact.Meet' label in the filter-then-rerank method.\n\nIn Table 21, which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find the template for Contact.Meet:\n\n\"The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\"\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 35347, "output_tokens": 142, "cost_micros": 108171, "latency_ms": 4609, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the prompt template for the 'Contact.Meet' event label in the filter-then-rerank paradigm.\n\nIn Table 21 (Templates for ACE05 dataset), which shows the templates used in the filter-then-rerank method, I can find:\n\nContact.Meet: \"The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\"\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 46312, "output_tokens": 143, "cost_micros": 141081, "latency_ms": 6046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the prompt template for 'Contact.Meet' used in the filter-then-rerank method.\n\nIn Table 21, which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find the template for Contact.Meet:\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 33413, "output_tokens": 106, "cost_micros": 101829, "latency_ms": 3921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find the template for 'Contact.Meet' in the filter-then-rerank method.\n\nIn Table 21, which shows \"Templates for ACE05 dataset, where {evt} is the placeholder for event type,\" I can find:\n\nContact.Meet | The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 45082, "output_tokens": 135, "cost_micros": 137271, "latency_ms": 4452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q004", "doc_id": "2303.08559v2.pdf", "answer_format": "str", "gold": "The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the user-mentioned document to find information about the prompt template for label 'Contact.Meet' in the filter-then-rerank method.Let me search for information about \"Contact.Meet\" and \"filter-then-rerank\" in this document.Let me search for content with more context:Now I need to find the section that shows the prompt templates. Let me read around line 5446 where Contact.Meet was mentioned, and also look for tables or appendices with prompt templates:Perfect! I found the prompt template for Contact.Meet in Table 21 for ACE05 dataset. According to the document, the template for 'Contact.Meet' in the filter-then-rerank method is:\n\n\"The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.\"\n\nAnswer: The word {evt} triggers a MEET event: two or more entities come together at a single location and interact with one another face-to-face.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 45328, "error": null, "extra": {"thread_id": 1973, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5466a03f139642c1a7d3c68dd6b01473", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 222, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face", "normalised_gold": "word evt triggers meet event two or more entities come together at single location and interact with one another face to face"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "native_pdf", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find Figure 6 and identify the green module, then look for the 100-shot TACREV setting to calculate the absolute percentage drop.\n\nLooking through the document, I find Figure 6 on page 8, which shows \"The overall architecture of our adaptive filter-then-rerank paradigm.\" In this figure, I can see colored modules - orange for \"Easy Sample\" and pink for \"Hard Sample\". The diagram shows a filter (Small LM) and a reranker (Large LM).\n\nHowever, I need to look at Table 5 on page 9, which shows \"Ablation study on three datasets\" and examines the effectiveness of modules in the adaptive filter-then-rerank system. The table shows results for TACREV (100-shot).\n\nLooking at the table columns, I can see:\n- CoT (Chain of Thought)\n- Demo\n- LF (Label Filtering) \n- AD (Adaptive)\n\nThe green box in Figure 6 appears to correspond to the demonstration/demo component based on the visual representation.\n\nFrom Table 5, for TACREV (100-shot):\n- With all modules (\u2713 \u2713 \u2713 \u2713): 75.9(2.4)\n- Without Demo but with others (\u2717 \u2717 \u2713 \u2713): 74.9(2.2)\n\nThe absolute drop is: 75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 82081, "output_tokens": 344, "cost_micros": 251403, "latency_ms": 36125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2303.08559v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "azure_basic_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the performance impact of dropping the green module (demonstration) shown in Figure 6 on the 100-shot TACREV setting.\n\nLooking at Table 5 (Ablation study), I can see the ablation results for different components on three datasets including TACREV (100-shot).\n\nThe table shows:\n- Full system (\u2713 CoT \u2713 Demo \u2713 LF \u2713 AD): 75.9(2.4)\n- Without Demo (\u2717 Demo \u2717 CoT \u2713 LF \u2713 AD): 74.9(2.2)\n\nHowever, I need to isolate just the Demo effect. Looking more carefully:\n- With CoT and Demo (\u2713 CoT \u2713 Demo \u2713 LF \u2713 AD): 75.9\n- Without Demo but with CoT (\u2713 CoT \u2717 Demo \u2713 LF \u2713 AD): This line doesn't exist\n\nBut I can see:\n- Without CoT but with Demo (\u2717 CoT \u2713 Demo \u2713 LF \u2713 AD): 75.4\n- Without both CoT and Demo (\u2717 CoT \u2717 Demo \u2713 LF \u2713 AD): 74.9\n\nLet me look at the correct comparison for just dropping Demo:\nFrom the table, comparing rows where only Demo changes:\n- Row 1 (\u2713 CoT \u2713 Demo): 75.9\n- Row 2 (\u2717 CoT \u2713 Demo): 75.4\n- Row 3 (\u2717 CoT \u2717 Demo): 74.9\n\nThe drop from having Demo to not having Demo (when CoT is also removed) is:\n75.4 - 74.9 = 0.5\n\nHowever, to properly isolate the Demo effect with CoT present, I need to estimate. The pattern suggests:\n- Full system: 75.9\n- Without Demo (keeping other factors): approximately 75.4 - 75.9 range\n\nActually, looking at row 2 vs row 3: the difference is 75.4 -", "citations": [], "input_tokens": 35348, "output_tokens": 512, "cost_micros": 113724, "latency_ms": 11858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "2.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "azure_premium_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the performance drop when removing the green module (demonstration) from Figure 6 on the 100-shot TACREV setting.\n\nLooking at Table 5 (Ablation study), I can find the relevant information:\n- The table shows ablation results on three datasets including TACREV (100-shot)\n- The rows show different configurations with checkmarks (\u2713) and crosses (\u2717)\n\nThe green module in Figure 6 represents the \"Demonstration\" component.\n\nFrom Table 5, for TACREV (100-shot):\n- With CoT \u2713, Demo \u2713, LF \u2713, AD \u2713: 75.9(2.4)\n- With CoT \u2717, Demo \u2717, LF \u2713, AD \u2713 (no demo): 74.9(2.2)\n\nThe absolute percentage drop when removing Demo is:\n75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 46313, "output_tokens": 231, "cost_micros": 142404, "latency_ms": 8516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "llamacloud_basic_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to find Figure 6 and identify the \"green module,\" then look at the 100-shot TACREV setting to determine the performance drop.\n\nLooking at Figure 6 in the document, it shows \"The overall architecture of our adaptive filter-then-rerank paradigm.\" The green box is labeled as \"demos\" (demonstration examples).\n\nNow I need to find the ablation study that shows what happens when demos are removed. Looking at Table 5 (Ablation study), I can see different configurations where modules are removed:\n\nThe table shows results for FewNERD (20-shot), TACREV (100-shot), and ACE05 (20-shot).\n\nFor TACREV (100-shot):\n- Full system (\u2713 \u2713 \u2713 \u2713): 75.9(2.4)\n- Without CoT (\u2717 \u2713 \u2713 \u2713): 75.4(2.4)\n- Without Demo (\u2717 \u2717 \u2713 \u2713): 74.9(2.2)\n\nThe \"Demo\" module corresponds to the green box in Figure 6. When we remove it (going from \u2717 \u2713 \u2713 \u2713 to \u2717 \u2717 \u2713 \u2713), the performance drops from 75.4 to 74.9.\n\nThe absolute drop is: 75.4 - 74.9 = 0.5\n\nAnswer: 0.5", "citations": [], "input_tokens": 33414, "output_tokens": 351, "cost_micros": 105507, "latency_ms": 9827, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.5", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "llamacloud_premium_lc", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the \"green module\" in Figure 6 and its effect on the 100-shot TACREV setting.\n\nFrom Figure 6, the green box represents the **Demonstration** component in the reranker process.\n\nFrom Table 5 (Ablation Study), I can see the results when removing the Demo component:\n- The row \"[ ] [ ] [x] [x]\" indicates: no CoT, no Demo, with LF, with AD\n- For TACREV (100-shot): 74.9(2.2)\n- The full system \"[x] [x] [x] [x]\" achieves: 75.9(2.4)\n\nThe absolute percentage drop when removing Demo is:\n75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 45083, "output_tokens": 198, "cost_micros": 138219, "latency_ms": 8328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2303.08559v2.pdf::Q006", "doc_id": "2303.08559v2.pdf", "answer_format": "float", "gold": "0.5%", "pages": 30, "document_id": 7584, "arm": "surfsense_agentic", "question_id": "2303.08559v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find Figure 6 and the information about dropping the green module in the 100-shot TACREV setting.Let me search for Figure 6 in the document:Now let me search for more context around Figure 6:Good! I found Figure 6. It mentions a green box representing \"demos\". Let me read the full content around Figure 6 and look for ablation studies or tables that show what happens when dropping the green module (demos):Perfect! Figure 6 shows that the green box represents \"Demonstration\" (demos). Now I need to find the ablation study or table that shows the performance drop when removing demos. Let me search for ablation or table related to this:Let me search for ablation study or a table showing what happens when demos are removed:Perfect! I found the ablation study in Table 5. Let me read that section:Let me search more specifically for Table 5:Great! I found Table 5. Let me continue reading to see all rows:Perfect! Now let me look for the explanation of what each column means. Let me read the section before Table 5:Let me find the section that explains the ablation study (section 5.5):Let me continue reading the ablation study explanation:I see the explanation was cut off. Let me search for where it continues:Let me look for text after Table 3 that continues the ablation study explanation:Perfect! Now I have the full explanation. Looking at Table 5, I need to identify which row corresponds to dropping the \"Demo\" (green module). Let me re-read Table 5 more carefully:Now let me understand the table structure. The columns are:\n- CoT (Chain of Thought)\n- Demo (Demonstration - the green module)\n- LF (Label Filtering)\n- AD (Adaptive)\n\nThe rows show different configurations with checkmarks (\u2713) meaning present and X (\u2717) meaning removed.\n\nLooking at the table:\n- Row 1: \u2713 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 TACREV (100-shot) = 75.9\n- Row 2: \u2717 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 TACREV (100-shot) = 75.4\n- Row 3: \u2717 CoT, \u2717 Demo, \u2612 LF, \u2612 AD \u2192 TACREV (100-shot) = 74.9\n- Row 4: \u2717 CoT, \u2717 Demo, \u2717 LF, \u2612 AD \u2192 TACREV (100-shot) = 73.8\n- Row 5: \u2717 CoT, \u2717 Demo, \u2717 LF, \u2717 AD \u2192 TACREV (100-shot) = 59.9\n\nWait, I need to understand the notation better. Let me look at the explanation again. From the text at lines 2189-2194:\n\n\"(2) Demo: We remove all examples, rendering the reranking a zero-shot problem.\"\n\nSo when Demo is removed (\u2717), all examples are removed. Looking at the table structure, I need to find the row where Demo changes from \u2713 to \u2717 while keeping other factors constant.\n\nThe best comparison to isolate the effect of Demo would be:\n- Row 1 (full model with all components): CoT \u2713, Demo \u2713, LF \u2713, AD \u2713 \u2192 75.9\n- Row 2 (without CoT): CoT \u2717, Demo \u2713, LF \u2713, AD \u2713 \u2192 75.4\n\nTo find the effect of dropping Demo specifically, I should compare:\n- Row 2 (with Demo, without CoT): CoT \u2717, Demo \u2713, LF \u2713, AD \u2713 \u2192 75.4\n- Row 3 (without Demo and CoT): CoT \u2717, Demo \u2717, (different LF/AD settings) \u2192 74.9\n\nBut wait, the LF and AD settings change. Let me reconsider. Looking more carefully at the symbols, I see \"\u2612\" which might be different from \"\u2713\" and \"\u2717\".\n\nLet me look for a clearer comparison. The green module in Figure 6 is \"Demo\". To find the effect of dropping it:\n\nFrom the full model (row 1): 75.9\nTo without Demo - I need to find which row removes only Demo while keeping everything else.\n\nActually, looking at row 2 vs row 3:\n- Row 2: \u2717 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 75.4\n- Row 3: \u2717 CoT, \u2717 Demo, \u2612 LF, \u2612 AD \u2192 74.9\n\nBut LF and AD also change, so this isn't a clean comparison.\n\nLet me think about this differently. According to the explanation text (lines 2197-2199):\n\"(1) Demos with explanations consistently enhance the reranking ability of LLMs across all datasets.\"\n\nAnd (line 2199-2200):\n\"(2) Demos without explanations also contribute to performance improvement.\"\n\nSo the comparison should be:\n- With Demo + CoT (row 1): 75.9\n- Without CoT but with Demo (row 2): 75.4 \n- Without both CoT and Demo (row 3 onwards)\n\nThe cleanest way to see the effect of dropping Demo is to compare a configuration with Demo vs without Demo. Looking at row 2 (has Demo but no CoT) = 75.4, and the next logical step would be row 3 (no Demo, no CoT) = 74.9.\n\nBut let me also check if there's a better comparison. Actually, looking at the structure, row 3 onwards all have \"\u2717 \u2612\" for both CoT and Demo columns, meaning both are removed.\n\nSo the drop from having Demo (row 2: 75.4) to not having Demo (row 3: 74.9) = 75.4 - 74.9 = 0.5\n\nBut wait, in row 3, the LF and AD settings also changed (\u2612 instead of \u2713).\n\nLet me reconsider what \u2612 means. It might mean the opposite of the feature (e.g., if \u2713 means \"with\", then \u2612 might mean \"without\" for LF and AD columns).\n\nActually, based on the explanation:\n- LF (label filtering): \u2713 means using top-N labels, \u2717 means retaining all labels\n- AD (adaptive): \u2713 means only hard samples, \u2717 means all samples\n\nSo \u2612 might indicate the opposite setting.\n\nGiven the complexity, let me focus on finding where the Demo is specifically dropped. The clearest comparison would be between:\n- Row 1 (everything included): 75.9\n- Row 3 (no CoT, no Demo): 74.9\n\nDifference: 75.9 - 74.9 = 1.0\n\nBut this includes the effect of removing CoT as well. To isolate Demo:\n- Row 2 (no CoT, yes Demo): 75.4\n- Row 3 (no CoT, no Demo): 74.9\n\nDifference: 75.4 - 74.9 = 0.5\n\nHowever, LF and AD settings also changed between row 2 and 3.\n\nLet me look at this more systematically. Actually, I think I'm overcomplicating this. The question asks specifically about dropping \"the green module\" which is Demo. Looking at the full system (row 1) vs without demo, I need to find the appropriate ablation.\n\nActually, re-reading the table more carefully with the column headers having 2 cells labeled \"CoT Demo\", I think these might be a single column header spanning 2 rows. Let me re-examine:\n\nLooking at lines 2298-2303:\n```\n<th colspan=\"2\">CoT Demo</th>\n```\n\nThis suggests \"CoT Demo\" is a single header spanning 2 columns. So the structure might be:\n- Column 1: CoT\n- Column 2: Demo\n- Column 3: LF\n- Column 4: AD\n\nWith this understanding:\n- Row 1: \u2713 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 75.9\n- Row 2: \u2717 CoT, \u2713 Demo, \u2713 LF, \u2713 AD \u2192 75.4\n- Row 3: \u2717 CoT, \u2717 Demo, ? LF, ? AD \u2192 74.9\n\nSo dropping Demo (from row 2 to row 3) causes a drop from 75.4 to 74.9 = 0.5\n\nBut the LF and AD settings are unclear in row 3 with the \u2612 symbol.\n\nGiven the question asks specifically for dropping the green module (Demo), and looking at the cleanest comparison while keeping CoT removed:\n- With Demo (row 2): 75.4\n- Without Demo (row 3): 74.9\n- Drop: 75.4 - 74.9 = 0.5\n\nBut if we compare from the full model:\n- Full model (row 1): 75.9\n- Without Demo but with CoT - this row doesn't exist in the table\n\nActually, maybe I should look at row 5 which has everything removed. Let me reconsider all rows:\n1. All components: 75.9\n2. No CoT: 75.4\n3. No CoT, No Demo: 74.9\n4. No CoT, No Demo, No LF: 73.8\n5. Nothing (all removed): 59.9\n\nThe progression suggests cumulative removal. To isolate Demo's effect:\nRow 2 (75.4) - Row 3 (74.9) = 0.5\n\nWait, but I should verify this interpretation by looking at the explanation text again. From lines 2196-2200:\n\n\"(1) Demos with explanations consistently enhance the reranking ability of LLMs across all datasets.\n(2) Demos without explanations also contribute to performance improvement.\"\n\nThis suggests:\n- Row 1 has Demo WITH CoT (explanations)\n- Row 2 has Demo WITHOUT CoT (no explanations) \n- Row 3 onwards has NO Demo\n\nSo the effect of dropping Demo entirely (going from row 2 to row 3): 75.4 - 74.9 = 0.5\n\nBut wait, there might be more to consider. Let me check if there's another row that might represent dropping just Demo while keeping CoT. Looking at the table again... no, there isn't such a row.\n\nGiven the structure and the question asking about dropping the green module (Demo), the most reasonable answer based on the ablation study is:\n\nFrom row 1 (full model with all components) = 75.9\nTo the configuration without Demo... I need to find which row that is.\n\nIf I assume row 3 still has other components but not Demo:\nRow 1: 75.9\nRow 3: 74.9\nDifference: 1.0\n\nBut this seems to also remove CoT. Let me think about what makes sense given Figure 6. The green module is specifically the \"Demonstration\" box shown feeding into the reranker.\n\nActually, perhaps the cleanest interpretation is:\n- Best system with Demo (row 1 or 2): max is 75.9 (row 1)\n- Best system without Demo (row 3, 4, or 5): max is 74.9 (row 3)\n- Absolute drop: 75.9 - 74.9 = 1.0\n\nBut this conflates CoT and Demo effects.\n\nHmm, let me reconsider the table structure once more. Maybe the header \"CoT Demo\" with colspan=\"2\" means there are two sub-columns, one for each. And the checkmarks below indicate their presence/absence.\n\nGiven the typical ablation study structure, each row progressively removes one more component. So:\n- Row 1: Everything (baseline) = 75.9\n- Row 2: Remove CoT = 75.4 (drop of 0.5)\n- Row 3: Remove Demo (CoT already removed) = 74.9 (additional drop of 0.5)\n- Row 4: Remove LF (CoT and Demo already removed) = 73.8 (additional drop of 1.1)\n- Row 5: Remove AD (everything removed) = 59.9 (additional drop of 13.9)\n\nIf we want the isolated effect of Demo, we need to compare configurations that differ only in Demo. But there's no row with CoT present and Demo absent.\n\nHowever, if we accept that row 2 \u2192 row 3 transition shows the effect of removing Demo (with CoT already removed), then the drop is 0.5.\n\nAlternatively, if the question wants the total contribution of Demo to the best system, we could argue:\n- Best with Demo: 75.9 (row 1)\n- Best without Demo: 74.9 (row 3)\n- Total drop attributable to absence of Demo: 1.0\n\nBut this includes the removal of CoT's contribution when combined with Demo removal.\n\nGiven typical ablation study interpretation and the question phrasing \"if I drop the green module,\" I believe the answer is the difference from the full system (row 1: 75.9) to the first row without Demo which is row 3 (74.9):\n\n75.9 - 74.9 = 1.0\n\nAnswer: 1.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 147812, "error": null, "extra": {"thread_id": 1974, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b54490e8642341dd973e7bd446513622", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1457, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1.0", "normalised_gold": "0.5"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to locate the first figure in the paper and count the green and grey rectangles.\n\nLooking at the document, Figure 1 appears on page 2. It shows \"An example of our SCITAB dataset\" with a reasoning graph on the right side.\n\nIn the reasoning graph on the right side of Figure 1, I can see:\n- Green rectangles (boxes): I can count \"Closed-domain knowledge: Table caption\", \"Commonsense knowledge\", \"Simple lookup\", \"Subtraction\", and \"Fact checker: Supported\" - that's 5 green boxes\n- Grey rectangles: I can see grey/neutral colored boxes containing text like \"Productivity corresponds to Prod. column\", \"The number of random chance is 50%\", \"A's productivity is 57.5%\", and \"The subtraction result between 57.5% and 50% is 7.5%\" - that's 4 grey boxes\n\nWait, let me recount more carefully by looking at the colors:\n- Green boxes appear to be the operation/reasoning step boxes\n- Grey boxes appear to be the intermediate result/statement boxes\n\nCounting again in Figure 1's reasoning graph:\n- Green rectangles: 5 (the labeled reasoning steps)\n- Grey rectangles: 4 (the intermediate statements)\n\nIn descending order: [\"5\", \"4\"]\n\nAnswer: 5, 4", "citations": [], "input_tokens": 68164, "output_tokens": 313, "cost_micros": 209187, "latency_ms": 35953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "4, 5", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the first figure in the paper, which is Figure 1.\n\nLooking at Figure 1 on the first page, I can see a reasoning graph on the right side of the figure. Let me count the rectangles by color:\n\nGreen rectangles (these appear to represent supported claims or correct reasoning):\n1. \"A's productivity is 57.5%\"\n2. \"Productivity corresponds to Prod. column\"\n3. \"The number of random chance is 50%\"\n4. \"The subtraction result between 57.5% and 50% is 7.5%\"\n\nThat's 4 green rectangles.\n\nGrey rectangles (these appear to represent refuted or incorrect claims):\n1. \"A's productivity of 57.5% expresses that it appears in 9.5% more often than expected by random chance.\"\n\nThat's 1 grey rectangle.\n\nIn descending order: [\"4\", \"1\"]\n\nAnswer: 4, 1", "citations": [], "input_tokens": 31797, "output_tokens": 222, "cost_micros": 98721, "latency_ms": 7155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "1, 4", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at the first figure in the document, which is Figure 1 on the first page.\n\nLooking at Figure 1, I can see a reasoning graph on the right side. Let me count the rectangles by color:\n\nGreen rectangles (which appear to represent certain types of nodes in the reasoning graph):\n- \"Closed-domain knowledge: Table caption\"\n- \"Productivity corresponds to Prod. column\"\n- \"Commonsense knowledge\"\n- \"The number of random chance is 50%\"\n- \"Simple lookup\"\n- \"A's productivity is 57.5%\"\n- \"Subtraction\"\n- \"The subtraction result between 57.5% and 50% is 7.5%\"\n\nThat's 8 green rectangles.\n\nGrey rectangles (which appear to represent the claims):\n- \"Supported Claim: A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance.\"\n- \"Refuted Claim: A's productivity of 57.5% expresses that it appears in 9.5% more often than expected by random chance.\"\n- \"Not Enough Info Claim: The low performance of \"to\" can be explained by the fact that it is responsible for only 4.6% of the inference in the training set.\"\n\nThat's 3 grey rectangles.\n\nIn descending order: 8, 3\n\nAnswer: 8, 3", "citations": [], "input_tokens": 44352, "output_tokens": 333, "cost_micros": 138051, "latency_ms": 10203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "3, 8", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the first figure in the paper and count the green and grey rectangles.\n\nLooking at the document, the first figure is Figure 1, which shows \"An example of our SCITAB dataset (left) and its corresponding reasoning graph (right).\"\n\nIn the reasoning graph (right side of Figure 1), I can see:\n- The graph contains nodes connected by arrows\n- There are rectangular boxes of different colors\n- Looking at the reasoning graph structure, I can identify green rectangles and grey rectangles\n\nCounting the rectangles in the reasoning graph:\n- Green rectangles: These appear to be the operation/function nodes in the graph. I can count: \"Lookup\", \"Background knowledge\", \"Commonsense knowledge\", \"Subtract\" - that's 4 green rectangles\n- Grey rectangles: These appear to be the value/result nodes. I can count: \"57.5%\", \"Prod.\", \"50%\", \"7.5%\" - that's 4 grey rectangles\n\nWait, let me recount more carefully by examining the figure description and structure:\n- Green rectangles (operations): 4\n- Grey rectangles (values): 4\n\nIn descending order: [\"4\", \"4\"]\n\nAnswer: 4, 4", "citations": [], "input_tokens": 28779, "output_tokens": 281, "cost_micros": 90552, "latency_ms": 9219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "4", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the first figure in the paper, which is Figure 1 on page 1.\n\nLooking at Figure 1, I can see it contains:\n- A table on the left side\n- A reasoning graph on the right side with various nodes and connections\n\nIn the reasoning graph (right side), I can identify:\n- Grey rectangles: These appear to be nodes like \"Closed-domain knowledge\", \"Commonsense knowledge\", \"Simple lookup\", \"Subtraction\", and \"Fact checker\"\n- Green rectangles: I need to look for any green-colored rectangles\n\nLet me count carefully:\n- Grey rectangles in the reasoning graph: I can see multiple grey nodes representing different types of operations and knowledge\n- Green rectangles: Looking at the figure, I don't see distinctly green rectangles in the traditional sense, but there appear to be some lighter/different colored elements\n\nActually, examining the figure more carefully:\n- There appear to be approximately 5 grey rectangular nodes in the reasoning graph\n- I don't clearly see green rectangles that are distinctly separate from the grey ones\n\nHowever, looking more carefully at the color scheme and the structure, it appears there are:\n- Grey rounded rectangles: approximately 5\n- There don't appear to be clearly distinct green rectangles\n\nBased on my examination, I count approximately 5 grey rectangles and 0 distinctly green rectangles in Figure 1.\n\nAnswer: 5, 0", "citations": [], "input_tokens": 36689, "output_tokens": 320, "cost_micros": 114867, "latency_ms": 9875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "0, 5", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q000", "doc_id": "2305.13186v3.pdf", "answer_format": "list", "gold": "['3', '1']", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the first figure in the paper. Let me continue reading to locate Figure 1.Perfect! I found Figure 1. Now I need to examine this figure carefully to count the green and grey rectangles. Based on the content I can see, Figure 1 shows a reasoning graph with different colored rectangles representing different steps in the reasoning process. Let me read more carefully to identify all the rectangles and their colors.\n\nFrom what I can see in the figure description:\n- There's a table on the left\n- A reasoning graph on the right\n- The reasoning graph contains several boxes/rectangles with different steps\n\nLooking at the text content that describes the figure structure:\n1. \"Closed-domain knowledge: Table caption\" - appears to be one rectangle\n2. \"Commonsense knowledge\" - appears to be another rectangle\n3. \"Productivity corresponds to Prod. column\" - part of the reasoning\n4. \"Simple lookup\" - appears to be a step\n5. \"The number of random chance is 50%\" - another element\n6. \"Supported Claim\" - labeled element\n7. \"A's productivity is 57.5%\" - text element\n8. Various reasoning steps with operators (+, -, =, \u00d7, ?)\n9. \"Refuted Claim\" - labeled element\n10. \"Not Enough Info Claim\" - labeled element\n\nLet me search for more details about the actual visual representation or read the next section to see if there's a clearer description.Let me go back and read the figure description more carefully to understand the visual elements:It seems the content around line 632-646 is describing a green checkmark icon, which is not Figure 1. Let me look more carefully at the actual Figure 1 content. Let me re-read around the figure caption area:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55655, "error": null, "extra": {"thread_id": 1975, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b73a2e9dd2a74761aa49d98419f10801", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 323, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "let me search for more details about actual visual representation or read next section to see if there s clearer description let me go back and read figure description more carefully to understand visual elements it seems content around line 632 646 is describing green checkmark icon, which is not figure 1 let me look more carefully at actual figure 1 content let me re read around figure caption area", "normalised_gold": "1, 3"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to count the reasoning steps shown in the reasoning graph on the right side of the figure.\n\nThe reasoning graph shows the following steps:\n1. Closed-domain knowledge: Table caption\n2. Commonsense knowledge\n3. Simple lookup\n4. Subtraction\n\nThese are the four distinct reasoning steps required to verify the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance.\"\n\nAnswer: 4", "citations": [], "input_tokens": 68131, "output_tokens": 119, "cost_micros": 206178, "latency_ms": 30733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the reasoning steps shown in the reasoning graph.\n\nLooking at Figure 1, I can see a reasoning graph on the right side that shows the steps needed to verify the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance.\"\n\nThe reasoning graph shows the following steps:\n\n1. \"Closed-domain knowledge: Productivity corresponds to Prod. column\" (from table caption)\n2. \"Commonsense knowledge: The number of random chance is 50%\"\n3. \"Simple lookup: A's productivity is 57.5%\"\n4. \"Subtraction: The subtraction result between 57.5% and 50% is 7.5%\"\n\nThese steps lead to verifying whether the claim is \"Supported\" or \"Refuted\".\n\nCounting the reasoning steps in the graph, I can identify 4 distinct reasoning steps.\n\nAnswer: 4", "citations": [], "input_tokens": 31764, "output_tokens": 225, "cost_micros": 98667, "latency_ms": 8390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the reasoning steps shown in the reasoning graph in Figure 1.\n\nLooking at Figure 1's reasoning graph (on the right side), I can identify the following reasoning steps:\n\n1. \"Closed-domain knowledge: Table caption\" - Productivity corresponds to Prod. column\n2. \"Commonsense knowledge\" - The number of random chance is 50%\n3. \"Simple lookup\" - A's productivity is 57.5%\n4. \"Subtraction\" - The subtraction result between 57.5% and 50% is 7.5%\n5. Final verification step - comparing the claim with the calculated result\n\nThe reasoning graph shows 5 distinct reasoning steps from the initial knowledge extraction to the final verification of the claim.\n\nAnswer: 5", "citations": [], "input_tokens": 44319, "output_tokens": 174, "cost_micros": 135567, "latency_ms": 6969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the reasoning steps shown in the reasoning graph.\n\nLooking at Figure 1, I can see it contains:\n- A table with columns: Cue, App., Prod., Cov.\n- A claim about \"A's productivity of 57.5%\"\n- A reasoning graph on the right side\n\nThe reasoning graph shows the verification process with several nodes connected by arrows. Let me count the reasoning steps by tracing through the graph:\n\n1. Start with the claim about A's productivity of 57.5%\n2. Look up \"A's productivity\" \u2192 refers to cell at last row and \"Prod.\" column\n3. Background knowledge: \"productivity\" corresponds to \"Prod.\" column\n4. Simple lookup: Get value 57.5% from the table\n5. Commonsense knowledge: \"random chance\" means 50% accuracy\n6. Numerical reasoning: Calculate difference 57.5% - 50% = 7.5%\n7. Compare calculated result with claimed value (7.5%)\n\nCounting the nodes in the reasoning graph shown in Figure 1, I can see there are 7 distinct reasoning steps involved in verifying this claim.\n\nAnswer: 7", "citations": [], "input_tokens": 28746, "output_tokens": 277, "cost_micros": 90393, "latency_ms": 9546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 1 in the paper, I need to count the reasoning steps shown in the reasoning graph on the right side.\n\nThe reasoning graph shows the following steps to verify the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance\":\n\n1. **Closed-domain knowledge**: Productivity corresponds to Prod. column (from table caption)\n2. **Commonsense knowledge**: The number of random chance is 50%\n3. **Simple lookup**: Finding the value 57.5% in the table (A's productivity)\n4. **Subtraction**: The subtraction result between 57.5% and 50% is 7.5%\n\nAnswer: 4", "citations": [], "input_tokens": 36656, "output_tokens": 169, "cost_micros": 112503, "latency_ms": 7671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q002", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "5", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned as most relevant to find Figure 1 and count the reasoning steps.Let me continue reading to find Figure 1:Now let me read the section around line 562 where Figure 1 is described:Let me read more to see the complete figure with all reasoning steps:Based on the document content, I can see Figure 1 shows a reasoning graph with the following steps:\n\n1. **Closed-domain knowledge: Table caption** - \"Productivity corresponds to Prod. column\"\n2. **Simple lookup** - \"A's productivity is 57.5%\"\n3. **Commonsense knowledge** - \"The number of random chance is 50%\"\n4. **Subtraction** - \"The subtraction result between 57.5% and 50% is 7.5%\"\n\nThese 4 reasoning steps lead to verifying the claim \"A's productivity of 57.5% expresses that it appears in 7.5% more often than expected by random chance\" which is marked as \"Supported\".\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37062, "error": null, "extra": {"thread_id": 1976, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6471c23a58a448bd8c1bac8c5ba5da64", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 190, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from the document.\n\nFrom Table 1 on page 4, I can see:\n\n**Wiki Table datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\nTotal Wiki Table claims = 117,854 + 87,026 = 204,880\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\nTotal Scientific Articles claims = 5,715 + 1,225 = 6,940\n\nDifference = 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 68140, "output_tokens": 175, "cost_micros": 207045, "latency_ms": 33358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the number of claims in Wiki Table datasets versus Scientific Articles datasets from Table 1.\n\nFrom Table 1 (Comparison of SCITAB to three recent table fact verification datasets):\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\nTotal Wiki Tables claims = 117,854 + 87,026 = 204,880\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\nTotal Scientific Articles claims = 5,715 + 1,225 = 6,940\n\nDifference = 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 31773, "output_tokens": 182, "cost_micros": 98049, "latency_ms": 6016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from Table 1.\n\nFrom Table 1:\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\n- Total Wiki Tables: 117,854 + 87,026 = 204,880 claims\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\n- Total Scientific Articles: 5,715 + 1,225 = 6,940 claims\n\n**Difference:**\n204,880 - 6,940 = 197,940 claims\n\nAnswer: 197940", "citations": [], "input_tokens": 44328, "output_tokens": 174, "cost_micros": 135594, "latency_ms": 6858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from Table 1.\n\nFrom Table 1 (Comparison of SCITAB to three recent table fact verification datasets):\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\nTotal Wiki Tables claims = 117,854 + 87,026 = 204,880\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\nTotal Scientific Articles claims = 5,715 + 1,225 = 6,940\n\nDifference = 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 28755, "output_tokens": 183, "cost_micros": 89010, "latency_ms": 5985, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the total number of claims in Wiki Table datasets and Scientific Articles datasets from Table 1.\n\nFrom Table 1:\n\n**Wiki Tables datasets:**\n- TabFact: 117,854 claims\n- FEVEROUS: 87,026 claims\n- Total Wiki Tables: 117,854 + 87,026 = 204,880 claims\n\n**Scientific Articles datasets:**\n- SEM-TAB-FACTS: 5,715 claims\n- SCITAB: 1,225 claims\n- Total Scientific Articles: 5,715 + 1,225 = 6,940 claims\n\n**Difference:**\n204,880 - 6,940 = 197,940 claims\n\nAnswer: 197940", "citations": [], "input_tokens": 36665, "output_tokens": 174, "cost_micros": 112605, "latency_ms": 5968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q003", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "197940", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about claims in Wiki Table datasets compared to scientific articles datasets.Perfect! I found the table with the statistics. Let me extract the specific information:\n\nFrom the table:\n- **Wiki Tables datasets**: \n  - TabFact: 117,854 claims\n  - FEVEROUS: 87,026 claims\n  - Total: 117,854 + 87,026 = 204,880 claims\n\n- **Scientific Articles datasets**:\n  - SEM-TAB-FACTS: 5,715 claims\n  - SCITAB: 1,225 claims\n  - Total: 5,715 + 1,225 = 6,940 claims\n\nDifference: 204,880 - 6,940 = 197,940\n\nAnswer: 197940", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36890, "error": null, "extra": {"thread_id": 1977, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_eedb3672a2cb45bf898fdd6ba6d9a71c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 152, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "197940", "normalised_gold": "197940"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the total number of claims in the dataset and the payment per claim annotation.\n\nFrom the document:\n\n1. Total number of claims in SCITAB: 1,225 claims (mentioned in the abstract and Table 1)\n\n2. Payment per claim annotation: 0.37 USD (mentioned in Section 2.3: \"Each claim annotation is reimbursed at a rate of 0.37 USD\")\n\nHowever, I need to consider that the annotation process involves multiple rounds and different types of claims:\n\nFrom Section 2.3, the manual verification process involves:\n- Verifying 872 false claims (counter-claims)\n- Verifying 900 NEI claims\n- Reviewing 872 real-world scientific claims\n\nThis suggests a two-round annotation process where each claim is annotated by two different annotators.\n\nLooking more carefully at the construction process:\n- 872 original claims were extracted\n- For each original claim, a counter-claim was generated (872 counter-claims)\n- 900 NEI claims were generated\n- After verification, the final dataset has 1,225 claims total\n\nThe annotation process in Section 2.3 states: \"We assign two different annotators to perform a two-round annotation for each claim\"\n\nSo the total annotations would be:\n- For supported/refuted claim pairs: 872 pairs need verification (2 rounds each)\n- For NEI claims: 900 claims need verification (2 rounds each)\n\nTotal annotations = (872 \u00d7 2) + (900 \u00d7 2) = 1,744 + 1,800 = 3,544 annotations\n\nTotal cost = 3,544 \u00d7 0.37 = 1,311.28 USD\n\nAnswer: 1311.28", "citations": [], "input_tokens": 68152, "output_tokens": 408, "cost_micros": 210576, "latency_ms": 38546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "1311.28", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the annotation cost and the total number of claims in the dataset.\n\nFrom the document:\n\n1. **Annotation cost**: \"Each claim annotation is reimbursed at a rate of 0.37 USD\" (Section 2.3)\n\n2. **Total number of claims in SCITAB**: Looking at Table 1, the total number of claims in SCITAB is 1,225.\n\nHowever, I need to determine how many claims actually needed annotation. According to Section 2:\n\n- Section 2.1: 872 real-world scientific claims were filtered from SciGen\n- Section 2.2: They generated 872 false claims (counter-claims) and 900 NEI claims\n- Section 2.3: Manual verification was done on all these claims\n\nFrom Section 2.3 and Appendix D:\n- 872 false claims were verified\n- 900 NEI claims were verified (which resulted in 355 final NEI claims)\n- 872 original claims were also reviewed\n\nSo the total annotations performed:\n- False claim verification: 872 claims\n- NEI claim verification: 900 claims\n- Total annotations: 872 + 900 = 1,772 claims\n\nTotal cost = 1,772 claims \u00d7 0.37 USD per claim = 655.64 USD\n\nAnswer: 655.64", "citations": [], "input_tokens": 31785, "output_tokens": 318, "cost_micros": 100125, "latency_ms": 9562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "655.64", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find:\n1. The total number of claims in the SCITAB dataset\n2. The payment rate per claim annotation\n\nFrom the document:\n\n**Total number of claims:**\nFrom Table 1 and Section 2.3: \"Total # of Claims: 1,225\"\n\n**Payment per claim:**\nFrom Section 2.3: \"Each claim annotation is reimbursed at a rate of 0.37 USD\"\n\n**Calculation:**\nTotal cost = 1,225 claims \u00d7 0.37 USD per claim = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 44340, "output_tokens": 145, "cost_micros": 135195, "latency_ms": 5389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the total cost for annotating all claims in the SCITAB dataset.\n\nFrom the document, I can find:\n\n1. **Total number of claims in SCITAB**: According to the abstract and Section 2, SCITAB contains **1,225 claims** (also written as \"1.2K expert-verified scientific claims\").\n\n2. **Cost per claim annotation**: According to Section 2.3 under \"Annotator Recruitment\":\n   - \"Each claim annotation is reimbursed at a rate of 0.37 USD\"\n\n3. **Calculation**:\n   Total cost = Number of claims \u00d7 Cost per claim\n   Total cost = 1,225 \u00d7 0.37\n   Total cost = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 28767, "output_tokens": 183, "cost_micros": 89046, "latency_ms": 7046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find the total cost of annotating all claims in the SCITAB dataset.\n\nFrom the document:\n\n1. **Total number of claims in SCITAB**: According to Table 1, SCITAB has 1,225 total claims.\n\n2. **Payment per claim annotation**: From Section 2.3, it states \"Each claim annotation is reimbursed at a rate of 0.37 USD\"\n\n3. **Calculation**:\n   Total cost = Number of claims \u00d7 Cost per claim\n   Total cost = 1,225 \u00d7 0.37\n   Total cost = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 36677, "output_tokens": 152, "cost_micros": 112311, "latency_ms": 5469, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q005", "doc_id": "2305.13186v3.pdf", "answer_format": "float", "gold": "453.25", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document that was marked as most relevant to find information about annotation costs for the proposed dataset.Let me search for information about annotation costs, dataset size, and pricing in the document.Let me read through the document more systematically. I'll start with sections that typically discuss dataset details and costs.Let me search for sections that discuss the dataset size and annotation costs. Let me look for keywords like \"price\", \"annotator\", \"pay\", etc.Let me search for numbers that might relate to costs:Let me read sections that might contain appendix or details about annotation costs:Let me look in later sections of the document where appendix or cost information might be:Let me search for \"Appendix\" section which might contain cost details:Let me try searching with content mode to find where these terms appear:Great! I found a reference to annotation cost. Let me read around line 974:Perfect! I found the annotation cost information. Now I need to find the total number of claims in the dataset. Let me search for that information:Perfect! Now I have all the information I need:\n- Total number of claims: 1,225\n- Cost per claim annotation: 0.37 USD\n\nLet me calculate the total cost:\nTotal cost = 1,225 \u00d7 0.37 = 453.25 USD\n\nAnswer: 453.25", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 97266, "error": null, "extra": {"thread_id": 1978, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_77659594d74045e99872a87f698ab753", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 502, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "453.25", "normalised_gold": "453.25"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find information about the distribution of reasoning steps in the SCITAB dataset.\n\nLooking at Figure 3 on page 5, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset\":\n\nThe figure shows the percentage of claims for each number of reasoning steps (1-11).\n\nFrom the histogram, I can read the following percentages:\n- 1 step: 6%\n- 2 steps: 8%\n- 3 steps: 15%\n- 4 steps: 18%\n- 5 steps: 20% (highest)\n- 6 steps: 15%\n- 7 steps: 7%\n- 8 steps: 5%\n- 9 steps: 3%\n- 10 steps: 2%\n- 11 steps: 1%\n\nThe highest percentage is 20%, which corresponds to claims requiring 5 reasoning steps.\n\nTo find the actual number of claims, I need to calculate 20% of the total number of claims in SCITAB.\n\nFrom Table 1 on page 4, the total number of claims in SCITAB is 1,225.\n\n20% of 1,225 = 0.20 \u00d7 1,225 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 68134, "output_tokens": 299, "cost_micros": 208887, "latency_ms": 34843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the percentage of claims with the highest reasoning steps in the SCITAB dataset.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see the distribution of reasoning steps.\n\nThe x-axis shows reasoning steps from 1 to 11, and the y-axis shows the percentage.\n\nFrom the figure, I can read the following percentages:\n- 1 step: 1%\n- 2 steps: 7%\n- 3 steps: 15%\n- 4 steps: 20%\n- 5 steps: 18%\n- 6 steps: 15%\n- 7 steps: 8%\n- 8 steps: 6%\n- 9 steps: 5%\n- 10 steps: 3%\n- 11 steps: 2%\n\nThe highest percentage is 20%, which corresponds to claims with 4 reasoning steps.\n\nThe document states there are 1,225 total claims in SCITAB (from Table 1).\n\n20% of 1,225 = 0.20 \u00d7 1,225 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 31767, "output_tokens": 277, "cost_micros": 99456, "latency_ms": 8375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the reasoning step percentage distribution in the SCITAB dataset from Figure 3.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see the percentages for each reasoning step:\n\n- 1 step: 2%\n- 2 steps: 3%\n- 3 steps: 15%\n- 4 steps: 20%\n- 5 steps: 18%\n- 6 steps: 15%\n- 7 steps: 8%\n- 8 steps: 7%\n- 9 steps: 6%\n- 10 steps: 5%\n- 11 steps: 1%\n\nThe highest percentage is 20%, which corresponds to claims with 4 reasoning steps.\n\nSince the dataset contains 1,225 claims total (as stated in the abstract and throughout the document), I need to calculate 20% of 1,225:\n\n1,225 \u00d7 0.20 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 44322, "output_tokens": 243, "cost_micros": 136611, "latency_ms": 6812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the highest percentage of reasoning steps in the SCITAB dataset and determine how many claims fall into that category.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see:\n\n- The x-axis represents the reasoning steps in each claim\n- The y-axis represents the frequency for each reasoning step\n- The figure distinguishes between shallow claims (1-2 reasoning steps, highlighted in red) and deep claims (3+ reasoning steps, highlighted in blue)\n\nFrom the histogram in Figure 3, I need to identify which reasoning step has the highest frequency (highest bar).\n\nExamining the histogram carefully:\n- The highest bar appears to be at 4 reasoning steps\n- The frequency at 4 reasoning steps appears to be approximately 350-400 claims\n\nLooking more carefully at the visual representation, the bar at 4 reasoning steps is the tallest in the histogram, with a frequency of approximately 350 claims.\n\nAnswer: 350", "citations": [], "input_tokens": 28749, "output_tokens": 223, "cost_micros": 89592, "latency_ms": 7078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "350", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the reasoning step with the highest percentage in the SCITAB dataset.\n\nLooking at Figure 3, which shows \"The distribution histogram of reasoning steps in our SCITAB dataset,\" I can see the percentages for each reasoning step:\n\n- 1 step: 6%\n- 2 steps: 8%\n- 3 steps: 15%\n- 4 steps: 18%\n- 5 steps: 20%\n- 6 steps: 15%\n- 7 steps: 7%\n- 8 steps: 5%\n- 9 steps: 3%\n- 10 steps: 2%\n- 11 steps: 1%\n\nThe highest percentage is 20%, which corresponds to claims requiring 5 reasoning steps.\n\nTo find the actual number of claims, I need to calculate 20% of the total number of claims in SCITAB.\n\nFrom Table 1, the total number of claims in SCITAB is 1,225.\n\nNumber of claims with 5 reasoning steps = 1,225 \u00d7 0.20 = 245\n\nAnswer: 245", "citations": [], "input_tokens": 36659, "output_tokens": 266, "cost_micros": 113967, "latency_ms": 6453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q006", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "245", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the most relevant document to find information about claims and reasoning steps in the author's proposed dataset.Let me continue reading to find information about the dataset and reasoning steps distribution.Great! I found a reference to a distribution histogram of reasoning steps. Let me read that specific section.Let me look at the earlier part where the figure caption was mentioned to understand the chart better.Perfect! Now I can see the histogram data. Let me read more to see all the percentage values clearly:From the histogram data I can see, the percentages are listed vertically and the reasoning steps go from 2 to 11. Looking at the data:\n\n- The percentages shown are: 20, 18, 20, 15, 15, 15, 6, 8, 7, 5, 3, 2, 1, 1\n\nThe highest percentages appear to be 20% which occurs twice. Let me look for more context to understand the exact mapping between reasoning steps and percentages. Let me search for more information about the total number of claims.Good! So SCITAB has 1,225 total claims. Now I need to figure out which reasoning step number has the highest percentage (20%), and calculate how many claims that represents. Looking at the histogram data again, I need to understand which reasoning steps correspond to the 20% values. Let me look for more details about the distribution.Now let me carefully re-examine the histogram data. The data shows reasoning steps 2-11 and their corresponding percentages. Looking at the visual structure again:I see the structure now. Lines 1182-1198 show \"2 3 4 5 6 7 8 9 10 11\" which are the reasoning steps on the x-axis. The percentages must be reading from the bars. Based on the structure, it appears:\n\nFrom the data, I can see percentages: 20, 18, 20, 15, 15, 15, 10, 6, 8, 7, 5, 5, 3, 2, 1, 1\n\nThe highest percentage is 20%, which appears twice in the data. With 1,225 total claims and 20% being the highest:\n- 20% of 1,225 = 0.20 \u00d7 1,225 = 245 claims\n\nHowever, I see 20% appears twice, which could mean there are two different reasoning step counts that each have 20%. Let me verify by checking if there's a table or more precise description.Based on the figure caption, the y-axis shows \"frequency\" which is shown as \"Percentages (%)\". The highest percentage is 20%, and since this appears to occur for specific reasoning step(s), I need to calculate the number of claims.\n\nWith 1,225 total claims in SCITAB and the highest percentage being 20%:\n20% \u00d7 1,225 = 245 claims\n\nAnswer: 245", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 77500, "error": null, "extra": {"thread_id": 1979, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_0b34f104b1a04155b78b4d63f4b337dd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 544, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "245", "normalised_gold": "245"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "native_pdf", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the number of figures in the Appendix section of the paper.\n\nLooking through the document, I can see the Appendix starts at section A on page 13. Let me go through each appendix section and count the figures:\n\n**Appendix A (Claim Extraction Procedure):**\n- Figure 5: The user interface for the claim extraction task (page 14)\n\n**Appendix B (Manual Claim Verification Procedure):**\n- Figure 6: The user interface for the NEI claim verification task (page 15)\n- Figure 7: The user interface for the refuted claim verification task (page 16)\n- Figure 8: Examples of post-annotation survey questions (page 17)\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset):**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration):**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims):**\n- Figure 9: Refuted claims cases A to C (page 22)\n- Figure 10: Refuted claims cases D and E (page 23)\n\n**Appendix F (Error Cases for InstructGPT):**\n- Figure 11: Error Cases 1-3 for InstructGPT (page 24)\n- Figure 12: Error Cases 4-6 for InstructGPT (page 25)\n\n**Appendix G (Error Cases for Program-of-Thoughts):**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts (page 26)\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts (page 27)\n\n**Appendix H (Prompts):**\n- No figures\n\nTotal count: Figures 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 = 10 figures in the Appendix.\n\nAnswer: 10", "citations": [], "input_tokens": 68138, "output_tokens": 472, "cost_micros": 211494, "latency_ms": 35156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.13186v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "azure_basic_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the paper.\n\nLooking through the Appendix sections:\n\n**Appendix A (Claim Extraction Procedure):**\n- A.2 Claim Extraction Interface: Figure 5\n\n**Appendix B (Manual Claim Verification Procedure):**\n- B.2 NEI Claim Verification Interface: Figure 6\n- B.3 Refuted Claim Verification Interface: Figure 7\n- B.4 Annotation Post-Survey: Figure 8\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset):**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration):**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims):**\n- Figure 9 (refuted cases A to C)\n- Figure 10 (refuted cases D and E)\n\n**Appendix F (Error Cases for InstructGPT):**\n- Figure 11 (Error Cases 1-3)\n- Figure 12 (Error Cases 4-6)\n\n**Appendix G (Error Cases for Program-of-Thoughts):**\n- Figure 13 (Error Cases 1-3)\n- Figure 14 (Error Cases 4 and 5)\n\n**Appendix H (Prompts):**\n- No figures\n\nTotal count: Figures 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 = 10 figures\n\nAnswer: 10", "citations": [], "input_tokens": 31771, "output_tokens": 373, "cost_micros": 100908, "latency_ms": 8453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "azure_premium_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the document.\n\nLooking through the Appendix sections:\n\n**Appendix A (Claim Extraction Procedure):**\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B (Manual Claim Verification Procedure):**\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions and the answers of annotators\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset):**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration):**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims):**\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F (Error Cases for InstructGPT):**\n- Figure 11: Error Cases 1-3 for InstructGPT in the zero-shot setting\n- Figure 12: Error Cases 4-6 for InstructGPT in the zero-shot setting\n\n**Appendix G (Error Cases for Program-of-Thoughts):**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\n**Appendix H (Prompts):**\n- No figures\n\nTotal count: 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 = 10 figures\n\nAnswer: 10", "citations": [], "input_tokens": 44326, "output_tokens": 415, "cost_micros": 139203, "latency_ms": 9266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "llamacloud_basic_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the document.\n\nLooking through the Appendix sections:\n\n**Appendix A (Claim Extraction Procedure)**\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B (Manual Claim Verification Procedure)**\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions\n\n**Appendix C (Analysis of Refuted Reasons in the Sci-Fact dataset)**\n- No figures\n\n**Appendix D (Discussions on Human-Machine Collaboration)**\n- No figures\n\n**Appendix E (Case Study for Refuted Claims)**\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F (Error Cases for InstructGPT)**\n- Figure 11: Error Cases 1-3 for InstructGPT\n- Figure 12: Error Cases 4-6 for InstructGPT\n\n**Appendix G (Error Cases for Program-of-Thoughts)**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\n**Appendix H (Prompts)**\n- No figures\n\nTotal figures in Appendix:\nFigure 5, 6, 7, 8, 9, 10, 11, 12, 13, 14\n\nAnswer: 10", "citations": [], "input_tokens": 28753, "output_tokens": 380, "cost_micros": 91959, "latency_ms": 8592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "llamacloud_premium_lc", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to count all the figures in the Appendix section of the document.\n\nLet me go through the Appendix sections:\n\n**Appendix A** (Claim Extraction Procedure):\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B** (Manual Claim Verification Procedure):\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions and the answers of annotators\n\n**Appendix C** (Analysis of Refuted Reasons in the Sci-Fact dataset):\n- No figures\n\n**Appendix D** (Discussions on Human-Machine Collaboration):\n- No figures\n\n**Appendix E** (Case Study for Refuted Claims):\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F** (Error Cases for InstructGPT):\n- Figure 11: Error Cases 1-3 for InstructGPT in the zero-shot setting\n- Figure 12: Error Cases 4-6 for InstructGPT in the zero-shot setting\n\n**Appendix G** (Error Cases for Program-of-Thoughts):\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\n**Appendix H** (Prompts):\n- No figures\n\nTotal figures in Appendix:\n- Figure 5\n- Figure 6\n- Figure 7\n- Figure 8\n- Figure 9\n- Figure 10\n- Figure 11\n- Figure 12\n- Figure 13\n- Figure 14\n\nAnswer: 10", "citations": [], "input_tokens": 36663, "output_tokens": 427, "cost_micros": 116394, "latency_ms": 9828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.13186v3.pdf::Q007", "doc_id": "2305.13186v3.pdf", "answer_format": "int", "gold": "10", "pages": 27, "document_id": 7585, "arm": "surfsense_agentic", "question_id": "2305.13186v3.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find out how many figures are in the Appendix.Let me search for the Appendix section in the document.Now let me search for figures in the Appendix sections. Let me read the end of the document where appendices are typically located.Let me continue reading to find the Appendix section.Let me search for all figures in the appendix sections. Let me look for \"Figure\" in the context of Appendix.Let me read the document more systematically to find all appendix sections and count the figures. Let me search for specific appendix sections.Now I can see the appendix sections starting. Let me continue reading to find all figures in the appendices.Let me continue reading to find more figures in the appendix.Let me continue to look for all figures in the appendix. Let me read further.Let me continue reading to find all the figures.Let me continue reading to find more figures.Now let me search for the end of the document to make sure I've found all figures.Perfect! Now let me count all the figures I found in the Appendix sections. Based on my reading, I found the following figures in the appendices:\n\n**Appendix A:**\n- Figure 5: The user interface for the claim extraction task\n\n**Appendix B:**\n- Figure 6: The user interface for the NEI claim verification task\n- Figure 7: The user interface for the refuted claim verification task\n- Figure 8: The examples of post-annotation survey questions and the answers of annotators\n\n**Appendix E:**\n- Figure 9: The refuted claims cases A to C\n- Figure 10: The refuted claims cases D and E\n\n**Appendix F:**\n- Figure 11: Error Cases 1-3 for InstructGPT in the zero-shot setting\n- Figure 12: Error Cases 4-6 for InstructGPT in the zero-shot setting\n\n**Appendix G:**\n- Figure 13: Error Cases 1-3 for Program-of-Thoughts\n- Figure 14: Error Cases 4 and 5 for Program-of-Thoughts\n\nTotal figures in the Appendix: 10\n\nAnswer: 10", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 85187, "error": null, "extra": {"thread_id": 1980, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_0161632b18b34ebfa7ee0421186ab320", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 490, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "10"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, particularly at Figures 3, 7, and 9 which show the relative sizes of Swp (information flow from text part to label words), Spq (information flow from label words to target position), and Sww across different layers:\n\n- In shallow layers, Swp is high while Spq is low\n- In deep layers, Spq becomes dominant and exceeds Swp\n\nThe document explicitly states: \"Initially, Swp occupies a significant proportion, but it gradually decays over layers, while Spq becomes the dominant one.\"\n\nThis means that Swp is NOT always greater than Spq - it depends on the layer depth.\n\nAnswer: no", "citations": [], "input_tokens": 41233, "output_tokens": 158, "cost_micros": 126069, "latency_ms": 32250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which shows the relative sizes of Swp (information flow from text to label words), Spq (information flow from label words to target position), and Sww across different layers:\n\nIn shallow layers (early layers), Swp is greater than Spq.\nIn deep layers (later layers), Spq becomes dominant and is greater than Swp.\n\nThe document explicitly states: \"in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nSince Swp is not always greater than Spq across all layers (it depends on which layer), the answer is no.\n\nAnswer: no", "citations": [], "input_tokens": 17763, "output_tokens": 200, "cost_micros": 56289, "latency_ms": 6078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 3 and Figure 7 in the document, which show the relative sizes of S_wp (information flow from text to label words), S_pq (information flow from label words to target position), and S_ww across different layers:\n\nIn shallow layers (layers close to input), S_wp is high while S_pq is low.\n\nIn deep layers (layers close to output), S_pq becomes dominant and exceeds S_wp.\n\nThe document explicitly states: \"In shallow layers, S_pq, the significance of the information flow from label words to targeted positions, is low, while S_wp, the information flow from the text part to label words is high; (2) in deep layers, S_pq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nSince S_wp is greater in shallow layers but S_pq becomes greater in deep layers, S_wp is NOT always greater than S_pq.\n\nAnswer: no", "citations": [], "input_tokens": 20121, "output_tokens": 222, "cost_micros": 63693, "latency_ms": 7094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, particularly Section 2.1 and Figure 3, the metrics Swp (information flow from text part to label words) and Spq (information flow from label words to target position) are compared across different layers.\n\nThe text states: \"Initially, Swp occupies a significant proportion, but it gradually decays over layers, while Spq becomes the dominant one.\"\n\nThis indicates that Swp is larger in shallow layers but Spq becomes larger in deep layers. Therefore, Swp is not always greater than Spq across all layers.\n\nAnswer: No", "citations": [], "input_tokens": 13762, "output_tokens": 132, "cost_micros": 43266, "latency_ms": 5780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the data presented in the document, particularly in Figure 3 and related figures showing $S_{wp}$, $S_{pq}$, and $S_{ww}$ values across different layers:\n\nIn shallow layers (e.g., layers 0-10), $S_{wp}$ (information flow from text to label words) is greater than $S_{pq}$ (information flow from label words to target position).\n\nHowever, in deep layers (e.g., layers 15-48), $S_{pq}$ becomes much larger than $S_{wp}$.\n\nFor example, in Figure 3(a) for SST-2:\n- Layer 0: $S_{wp}$ = 0.72, $S_{pq}$ = 0.15\n- Layer 20: $S_{wp}$ = 0.10, $S_{pq}$ = 0.90\n\nSince the relationship changes across layers, $S_{wp}$ is not **always** greater than $S_{pq}$.\n\nAnswer: no", "citations": [], "input_tokens": 23820, "output_tokens": 243, "cost_micros": 75105, "latency_ms": 6703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q000", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find information about information flow between text parts, label words, and target positions.Let me search for information about information flow between text parts, label words, and target positions.Let me search for specific terms related to the comparison:Let me read through the document more systematically to find tables or figures that might contain this information:Let me read Figure 3 and related sections since it mentions Swp, Spq, and Sww which seem to be related to information flow:Good! I found relevant information. Let me read more to understand the definitions and comparisons better:Perfect! Now I have the key information. Let me look at Figure 3 more carefully to see the actual values:Based on my reading of the document, I can now answer the question. The document clearly states:\n\nFrom the text (lines 842-851):\n- \"in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while **Swp, the information flow from the text part to label words is high**\"\n- \"in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one\"\n\nThe document also states in the figure caption (line 943): \"Initially, **Swp occupies a significant proportion**, but it gradually decays over layers, while **Spq becomes the dominant one**.\"\n\nThis indicates that:\n- In shallow layers: Swp (text part to label words) > Spq (label words to target position)\n- In deep layers: Spq (label words to target position) > Swp (text part to label words)\n\nSince Swp starts high and decays while Spq starts low and increases, across ALL layers (taking the mean), Swp is NOT always greater than Spq. In fact, Spq becomes dominant in deep layers.\n\nAnswer: no", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 59108, "error": null, "extra": {"thread_id": 1981, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_31954715cb0b491eade4006731d43b22", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 360, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and the related discussion in Section 2.1:\n\nThe text states: \"Figure 3 reveals that: (1) in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nThis clearly indicates that the information flow from label words to the target position (Spq) is NOT dominant in shallow layers - it only becomes dominant in deep layers.\n\nAnswer: no", "citations": [], "input_tokens": 41220, "output_tokens": 149, "cost_micros": 125895, "latency_ms": 31437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and the analysis in Section 2.1:\n\nThe information flow from label words to the target position (S_pq) is LOW in shallow layers, but becomes DOMINANT in deep layers. The document states: \"(1) in shallow layers, S_pq, the significance of the information flow from label words to targeted positions, is low... (2) in deep layers, S_pq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nAnswer: no", "citations": [], "input_tokens": 17750, "output_tokens": 119, "cost_micros": 55035, "latency_ms": 4890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and the analysis in Section 2.1:\n\nThe text states: \"(1) in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nThis clearly indicates that the information flow from label words to the target position (Spq) is NOT dominant in shallow layers - it only becomes dominant in deep layers.\n\nAnswer: no", "citations": [], "input_tokens": 20108, "output_tokens": 140, "cost_micros": 62424, "latency_ms": 5203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, particularly Figure 3 and Section 2.1, I need to examine when the information flow from label words to the target position (Spq) dominates.\n\nFrom Figure 3 and the text in Section 2.1:\n- Initially (in shallow layers), Swp (information flow from text to label words) \"occupies a significant proportion\"\n- Spq (information flow from label words to target position) \"gradually\" becomes dominant\n- Spq becomes \"the dominant one\" but this happens over layers, not in all layers\n\nThe text explicitly states: \"Initially, Swp occupies a significant proportion, but it gradually decays over layers, while Spq becomes the dominant one.\"\n\nThis indicates that Spq does NOT dominate in all layers - only in the deeper layers after the information aggregation phase in shallow layers is complete.\n\nAnswer: No", "citations": [], "input_tokens": 13749, "output_tokens": 198, "cost_micros": 44217, "latency_ms": 6828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the paper's findings in Section 2.1 and the results shown in Figures 3, 7, and 9:\n\nIn shallow layers, $S_{wp}$ (information flow from text to label words) is high while $S_{pq}$ (information flow from label words to target position) is low.\n\nIn deep layers, $S_{pq}$ becomes dominant while $S_{wp}$ decreases.\n\nThe paper explicitly states: \"in shallow layers, $S_{pq}$, the significance of the information flow from label words to targeted positions, is low... (2) in deep layers, $S_{pq}$, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nAnswer: no", "citations": [], "input_tokens": 23807, "output_tokens": 171, "cost_micros": 73986, "latency_ms": 6312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q001", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "No", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document mentioned in the priority list to answer your question.Let me search for content related to \"information flow\" and \"label words\" in the document.Now let me read the specific sections that discuss whether information flow from label words dominates in all layers.Perfect! I found the answer. Let me read a bit more to confirm the complete picture.Based on the document, the answer is clear. The text states:\n\n\"(1) in shallow layers, Spq, the significance of the information flow from label words to targeted positions, is low, while Swp, the information flow from the text part to label words is high; (2) in deep layers, Spq, the importance of information flow from label words to the targeted position becomes the dominant one.\"\n\nThis shows that information flow from label words to the target position does NOT dominate in all layers - it is low in shallow layers and only becomes dominant in deep layers.\n\nAnswer: No", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 38452, "error": null, "extra": {"thread_id": 1982, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_91e767a2b44d476aa25ee429c43c2924", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 182, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 on page 4, which shows the impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers:\n\nThe figure shows that \"Random (Last)\" - which represents randomly isolating non-label words within the last 5 layers - has minimal impact on both Label Loyalty and Word Loyalty metrics for both GPT2-XL and GPT-J models. The bars for \"Random (Last)\" show loyalty values close to 100%, similar to the \"No Isolation\" baseline.\n\nThis is further confirmed in the text on page 4 which states: \"Yet, this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nAnswer: yes", "citations": [], "input_tokens": 41222, "output_tokens": 170, "cost_micros": 126216, "latency_ms": 33032, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which shows \"The impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers,\" I can see that \"Random (Last)\" - which represents randomly isolating non-label words within the last 5 layers - shows loyalty values very close to the \"No Isolation\" baseline (around 100%). This indicates almost no impact on the model's behavior.\n\nAnswer: Yes", "citations": [], "input_tokens": 17752, "output_tokens": 101, "cost_micros": 54771, "latency_ms": 3967, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which shows \"The impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers,\" I can see that:\n\n- \"Random (Last)\" (randomly isolating non-label words in the last 5 layers) shows very high loyalty scores (close to 100%), nearly identical to \"No Isolation\"\n- This indicates minimal impact on the model's predictions\n\nThe text also confirms: \"this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nAnswer: yes", "citations": [], "input_tokens": 20110, "output_tokens": 134, "cost_micros": 62340, "latency_ms": 6030, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 and the related text in section 2.2, the document shows that isolating label words within the first 5 layers has substantial impact, but isolation within the last 5 layers has negligible impact. The text also states that \"this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nFigure 4 and Figure 8 show that randomly isolating non-label words (represented by lighter colored lines) in the last layers results in very high loyalty scores (close to 100%), indicating almost no impact on the model's behavior.\n\nAnswer: yes", "citations": [], "input_tokens": 13751, "output_tokens": 139, "cost_micros": 43338, "latency_ms": 5328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which shows \"The impact of isolating label words versus randomly isolating non-label words within the first or last 5 layers,\" I can see the results for \"Random (Last)\".\n\nThe table shows:\n- Label Loyalty (GPT2-XL) with Random (Last): 98\n- Word Loyalty (GPT2-XL) with Random (Last): 98\n- Label Loyalty (GPT-J) with Random (Last): 98\n- Word Loyalty (GPT-J) with Random (Last): 98\n\nThese values are nearly identical to the \"No Isolation\" baseline (100), indicating minimal impact.\n\nThe document states: \"Yet, this influence becomes inconsequential within the last 5 layers, or when random non-label words are used.\"\n\nAnswer: yes", "citations": [], "input_tokens": 23809, "output_tokens": 191, "cost_micros": 74292, "latency_ms": 6092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q002", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Yes", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find information about randomly isolating non-label words within the last 5 layers.Perfect! I found the answer. The document states that randomly isolating non-label words within the last 5 layers has \"inconsequential\" impact, which means \"almost no impact.\"\n\nAnswer: Yes", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40125, "error": null, "extra": {"thread_id": 1983, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c2144cf872eb43e78cc21f804b3432de", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 145, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "yes", "normalised_gold": "yes"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, I need to compare the Anchor Re-weighting method with Vanilla In-Context Learning (1-shot per class).\n\nFrom Table 1:\n- Vanilla In-Context Learning (1-shot per class) Average: 51.90\n- Anchor Re-weighting (1-shot per class) Average: 68.64\n\nThe improvement is: 68.64 - 51.90 = 16.74\n\nAnswer: 16.74", "citations": [], "input_tokens": 41209, "output_tokens": 115, "cost_micros": 125352, "latency_ms": 29484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I need to find the average improvement from the Anchor Re-weighting method compared to vanilla ICL.\n\nFrom Table 1 on the page discussing Anchor Re-weighting results:\n\n- Vanilla In-Context Learning (1-shot per class) Average: 51.90\n- Anchor Re-weighting (1-shot per class) Average: 68.64\n\nThe improvement is: 68.64 - 51.90 = 16.74\n\nThis is also confirmed in the text which states: \"leading to a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.7", "citations": [], "input_tokens": 17739, "output_tokens": 149, "cost_micros": 55452, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.7", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, I need to compare the Anchor Re-weighting method with the baseline vanilla ICL (1-shot per class).\n\nFrom Table 1:\n- Vanilla In-Context Learning (1-shot per class): Average = 51.90\n- Anchor Re-weighting (1-shot per class): Average = 68.64\n\nImprovement = 68.64 - 51.90 = 16.74\n\nThis matches the statement in the abstract that mentions \"a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.74", "citations": [], "input_tokens": 20097, "output_tokens": 141, "cost_micros": 62406, "latency_ms": 5483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the average improvement of the Anchor Re-weighting method compared to vanilla ICL.\n\nFrom Table 1 in the document, I can see the following results:\n\n**Vanilla In-Context Learning (1-shot per class):**\n- SST-2: 61.28\n- TREC: 57.56\n- AGNews: 73.32\n- EmoC: 15.44\n- Average: 51.90\n\n**Anchor Re-weighting (1-shot per class):**\n- SST-2: 90.07\n- TREC: 60.92\n- AGNews: 81.94\n- EmoC: 41.64\n- Average: 68.64\n\nThe improvement is calculated as:\n68.64 - 51.90 = 16.74\n\nThe document also states in the text: \"leading to a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.74", "citations": [], "input_tokens": 13738, "output_tokens": 229, "cost_micros": 44649, "latency_ms": 5717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, I need to find the performance improvement of Anchor Re-weighting over vanilla ICL (1-shot per class).\n\nFrom Table 1:\n- Vanilla In-Context Learning (1-shot per class): Average = 51.90\n- Anchor Re-weighting (1-shot per class): Average = 68.64\n\nImprovement = 68.64 - 51.90 = 16.74\n\nThis matches the statement in the abstract and Section 3.1 which mentions \"a 16.7% average accuracy boost compared to standard ICL baselines.\"\n\nAnswer: 16.74", "citations": [], "input_tokens": 23796, "output_tokens": 148, "cost_micros": 73608, "latency_ms": 5437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q003", "doc_id": "2305.14160v4.pdf", "answer_format": "float", "gold": "16.74", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about the Anchor Re-weighting method and its performance improvement over vanilla ICL.Let me search for \"Anchor Re-weighting\" in the document to find the relevant information.Let me read the section with the results table to find the specific improvement values.Perfect! I found Table 1 which shows the performance comparison. Let me calculate the improvement:\n\nFrom the table:\n- Vanilla In-Context Learning (1-shot per class) Average: 51.90\n- Anchor Re-weighting (1-shot per class) Average: 68.64\n\nThe improvement = 68.64 - 51.90 = 16.74\n\nAnswer: 16.74", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 35969, "error": null, "extra": {"thread_id": 1984, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_25b28dfa8e5c4fbeb8c80f7d7650cb08", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 144, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "16.74", "normalised_gold": "16.74"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "I need to find information about anchor-only context compression speedup for GPT2-XL on AGNews.\n\nLooking at Table 3 on page 8, which shows \"Acceleration ratios of the Hiddenanchor method\":\n\nThe table shows:\n- Model: GPT2-XL\n- AGNews: 2.5\u00d7\n\nThis indicates that using the anchor-only context compression method (Hiddenanchor), GPT2-XL achieves a 2.5\u00d7 speedup on the AGNews dataset.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 41212, "output_tokens": 129, "cost_micros": 125571, "latency_ms": 31171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about GPT2-XL speedup on AGNews using anchor-only context compression (Hiddenanchor method):\n\nIn Table 3 on the topic of \"Acceleration ratios of the Hiddenanchor method,\" the speedup ratio for GPT2-XL on AGNews is listed as 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 17742, "output_tokens": 90, "cost_micros": 54576, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about anchor-only context compression speedup for GPT2-XL on AGNews...\n\nIn Table 3 (page 7), the acceleration ratios of the Hiddenanchor method are provided:\n\nFor GPT2-XL on AGNews, the speedup is listed as 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 20100, "output_tokens": 86, "cost_micros": 61590, "latency_ms": 3671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about GPT2-XL speedup on AGNews with anchor-only context compression...\n\nIn Table 3 (on page 8), which shows \"The speed-up ratio compared to the original ICL\", I can find the speedup values for different models and datasets.\n\nFor GPT2-XL on AGNews, the speedup is listed as 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 13741, "output_tokens": 100, "cost_micros": 42723, "latency_ms": 4391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document for information about anchor-only context compression speedup on AGNews with GPT2-XL...\n\nIn Table 3 on page 11, the acceleration ratios of the Hidden_anchor method are presented:\n\n| Model   | SST-2 | TREC | AGNews | EmoC |\n| ------- | ----- | ---- | ------ | ---- |\n| GPT2-XL | 1.1\u00d7  | 1.5\u00d7 | 2.5\u00d7   | 1.4\u00d7 |\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 23799, "output_tokens": 134, "cost_micros": 73407, "latency_ms": 5157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q004", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "2.5x", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q004", "answer_letter": null, "raw_text": "I'll search for information about anchor-only context compression and GPT2-XL speed up on AGNews in the provided document.Let me search for \"anchor\" and \"AGNews\" in the document to find the relevant information.Now let me look at Table 3 which shows acceleration ratios for the Hiddenanchor method:Perfect! I found the answer in Table 3. The table shows acceleration ratios of the Hiddenanchor method, and for GPT2-XL on AGNews, the speedup is 2.5\u00d7.\n\nAnswer: 2.5\u00d7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36062, "error": null, "extra": {"thread_id": 1985, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c1692b1f5bfe4ec6aa79ada899da40bd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 139, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.5, "method": "str_norm", "normalised_pred": "2 5\u00d7", "normalised_gold": "2 5x"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "native_pdf", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the most confusing category of Abbreviation in the document.\n\nLooking at the confusion matrices in the document, particularly in Section 3.3 (Anchor Distances for Error Diagnosis) and the appendices, I can see confusion matrices for the TREC dataset.\n\nIn Figure 6 on page 8, there are two confusion matrices shown:\n- (a) Confusion matrix of Confusion^pred_ij\n- (b) Confusion matrix of Confusion_ij\n\nLooking at the Abbreviation row in both matrices, I need to identify which category Abbreviation is most confused with (the lowest confusion score, or highest confusion).\n\nFrom Figure 6(b), looking at the Abbreviation row:\n- Abbreviation-Abbreviation: 1 (diagonal, set to 1)\n- Abbreviation-Entity: 0.84\n- Abbreviation-Description: 0.76\n- Abbreviation-Person: 1\n- Abbreviation-Location: 1\n- Abbreviation-Number: 0.87\n\nThe lowest value (indicating highest confusion) in the Abbreviation row is with Description (0.76), followed by Entity (0.84).\n\nThis is also confirmed in the text on page 8 which states: \"Figure 6 shows that the proposed approximation metric, Confusion^pred_ij, can identify the most confusing case (Description-Entity) and performs reasonably well for highly confusing categories (Entity-Abbreviation, Description-Abbreviation).\"\n\nAnswer: Description", "citations": [], "input_tokens": 41202, "output_tokens": 351, "cost_micros": 128871, "latency_ms": 26937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2305.14160v4.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "azure_basic_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 6(b) which shows the confusion matrix of Confusion_ij on the TREC dataset, I need to find which category has the highest confusion score with Abbreviation.\n\nFrom the confusion matrix in Figure 6(b), the values for Abbreviation's confusion with other categories are:\n- Entity: 0.84\n- Description: 0.76\n- Person: 1\n- Location: 1\n- Number: 1\n\nThe lowest value (highest confusion) is with Entity at 0.84.\n\nAnswer: Entity", "citations": [], "input_tokens": 17732, "output_tokens": 125, "cost_micros": 55071, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "entity", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "azure_premium_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 6(b) which shows the actual confusion matrix (Confusionij) on the TREC dataset, I need to find which category Abbreviation is most confused with.\n\nFrom the confusion matrix in Figure 6(b), the Abbreviation row shows the following confusion scores with other categories:\n- Entity: 0.84\n- Description: 0.76\n- Person: 1\n- Location: 1\n- Number: 1\n\nLower confusion scores indicate higher confusion between categories (as stated in the paper). The lowest score is 0.76 with Description, followed by 0.84 with Entity.\n\nAnswer: Description", "citations": [], "input_tokens": 20090, "output_tokens": 147, "cost_micros": 62475, "latency_ms": 5234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "llamacloud_basic_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the confusion matrices in Figure 14, I need to identify which category is most confused with \"Abbreviation.\"\n\nIn the confusion matrices, lower values indicate higher confusion (as stated in the document: \"The closer Confusion_{ij} approximates 1, the less pronounced the confusion\").\n\nLooking at the \"Abbreviation\" row across the different matrices (M=5, 10, 20, 50, 100, 200), the lowest values (highest confusion) are:\n- Description: 0.31, 0.45, 0.52\n- Entity: 0.44, 0.49, 0.51\n\nThe \"Description\" category consistently shows the lowest values when compared with Abbreviation, indicating the highest degree of confusion.\n\nAnswer: Description", "citations": [], "input_tokens": 13731, "output_tokens": 174, "cost_micros": 43803, "latency_ms": 6483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "llamacloud_premium_lc", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the confusion matrices provided in the document, I need to find which category is most confused with \"Abbreviation.\"\n\nIn Figure 6(a), which shows the predicted confusion matrix ($\\text{Confusion}_{ij}^{\\text{pred}}$), the values for Abbreviation with other categories are:\n- Entity: 0.49\n- Description: 0.45\n- Person: 0.90\n- Location: 1.00\n- Number: 0.83\n\nLower values indicate higher confusion (as stated in the document: \"0 indicating the highest degree of category confusion\").\n\nIn Figure 6(b), which shows the actual confusion matrix ($\\text{Confusion}_{ij}$), the values for Abbreviation are:\n- Entity: 0.84\n- Description: 0.76\n- Person: 1.00\n- Location: 1.00\n- Number: 1.00\n\nIn both matrices, the lowest value (highest confusion) for Abbreviation is with \"Description\" (0.45 predicted, 0.76 actual), followed closely by \"Entity\" (0.49 predicted, 0.84 actual).\n\nAnswer: Description", "citations": [], "input_tokens": 23789, "output_tokens": 270, "cost_micros": 75417, "latency_ms": 7077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "description", "normalised_gold": "description"}}
+{"qid": "2305.14160v4.pdf::Q005", "doc_id": "2305.14160v4.pdf", "answer_format": "str", "gold": "Description", "pages": 16, "document_id": 7586, "arm": "surfsense_agentic", "question_id": "2305.14160v4.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the most confusing category of abbreviation.Let me search for information about \"confusing\" and \"abbreviation\" categories in the document.Let me read the specific chunk that mentions confusing categories and abbreviations:Perfect! I found the answer. Let me read a bit more context to confirm:Based on the document, the text clearly states that \"the most confusing case (Description-Entity)\" was identified. However, the question asks specifically about \"the most confusing category of Abbreviation.\"\n\nLooking at the confusion matrix data and the text, it mentions that the highly confusing categories involving Abbreviation are:\n- Entity-Abbreviation\n- Description-Abbreviation\n\nLet me check if there's more specific information about which is THE most confusing for Abbreviation:\n\nLooking at the confusion matrix values shown:\n- Abbreviation-Entity: 0.84\n- Abbreviation-Description: 0.76\n\nThe lower the value in a confusion matrix, the more confusing the pair. So Entity (0.84) is more confusing with Abbreviation than Description (0.76), but actually looking at this as a similarity matrix (as the caption says \"heatmaps display similarity\"), higher values mean more similar/confusing.\n\nBased on the text stating \"highly confusing categories (Entity-Abbreviation, Description-Abbreviation)\" with Entity-Abbreviation listed first, and the confusion matrix showing 0.84 for Abbreviation-Entity vs 0.76 for Abbreviation-Description:\n\nAnswer: Entity", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40344, "error": null, "extra": {"thread_id": 1986, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_052d25ab92e746d681f311675087569d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 227, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "entity", "normalised_gold": "description"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 on page 2, which shows the MIMIC-IT overview tree diagram, I can see three main branches: Perception, Reasoning, and Planning.\n\nAt the top of the Planning branch (shown in red/pink on the right side), the task labeled is \"Indoor Event Planning\".\n\nAnswer: Indoor Event Planning", "citations": [], "input_tokens": 55843, "output_tokens": 76, "cost_micros": 168669, "latency_ms": 49328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.4, "method": "str_norm", "normalised_pred": "indoor event planning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, which shows the MIMIC-IT overview with different branches representing fundamental capabilities, I need to identify the task at the top of the planning branch.\n\nIn the planning section (rightmost branch of the tree diagram), examining from top to bottom, the topmost task shown is \"Commonsense Reasoning\" with the example question \"What might this young boy learn from this experience? (LA)\"\n\nAnswer: Commonsense Reasoning", "citations": [], "input_tokens": 21188, "output_tokens": 104, "cost_micros": 65124, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "commonsense reasoning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, which shows the MIMIC-IT overview with three main branches (Perception, Reasoning, and Planning), I can see the Planning branch on the right side.\n\nAt the top of the Planning branch, the task shown is \"Indoor Event Planning (IEP)\" with the question \"Can you suggest any recreational pursuits that can be enjoyed within this room?\"\n\nAnswer: Indoor Event Planning (IEP)", "citations": [], "input_tokens": 23513, "output_tokens": 95, "cost_micros": 71964, "latency_ms": 5625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "str_norm", "normalised_pred": "indoor event planning iep", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about a planning branch or tree structure with tasks.\n\nIn Figure 1 (MIMIC-IT overview), I can see different capabilities organized into three main branches: Perception, Reasoning, and Planning.\n\nUnder the Planning branch, I can see two tasks listed:\n1. Egocentric Planning (E4D)\n2. Episode Memory (DC)\n\nLooking at the visual layout in Figure 1, \"Egocentric Planning\" appears to be positioned at the top of the Planning branch.\n\nAnswer: Egocentric Planning", "citations": [], "input_tokens": 20245, "output_tokens": 129, "cost_micros": 62670, "latency_ms": 5657, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "egocentric planning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the task at the top of the Planning branch.\n\nThe Planning section shows three items:\n1. At the top: \"Is it safe to walk on the floor while the woman is cleaning? E4D\" \u2192 EVR [Egocentric Video Reasoning]\n2. In the middle: \"Can you suggest any ways to brighten up the room I'm in? E4D\" \u2192 EP [Egocentric Planning]\n3. At the bottom: \"Can you suggest any recreational pursuits that can be enjoyed within this room? IEP\" \u2192 IEPP [Indoor Event Planning]\n\nThe topmost task in the Planning branch is Egocentric Video Reasoning (EVR).\n\nAnswer: Egocentric Video Reasoning", "citations": [], "input_tokens": 24261, "output_tokens": 176, "cost_micros": 75423, "latency_ms": 6139, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.4, "method": "str_norm", "normalised_pred": "egocentric video reasoning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q000", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "egocentric planning", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the planning branch of the tree and identify the task at the top.Let me search for content related to \"planning branch\" or \"tree\" to find the relevant figure or diagram:Let me search more specifically for \"tree\" and look at figures:Let me look at Figure 1 which mentions perception, reasoning, and planning:Good! I can see \"Planning\" section. Let me continue reading to see what tasks are under the Planning branch:Perfect! I can see the Planning section with its tasks. Looking at the structure, under \"Planning\" I can see:\n\n1. \"Commonsense Reasoning\" - \"What might this young boy learn from this experience? (LA)\"\n2. \"General Scene Difference\" \n3. \"Egocentric Planning\"\n4. \"Indoor Event Planning\"\n\nBut I need to see the visual tree structure more carefully. Let me look at the earlier part to understand the tree layout better:Now I can see the structure clearly. Looking at Figure 1, there's a tree structure with three main branches:\n1. Perception\n2. Reasoning\n3. Planning\n\nUnder the \"Planning\" branch, the first task/question that appears is: \"What might this young boy learn from this experience? (LA)\" which is labeled as \"Commonsense Reasoning\".\n\nSo the task at the top of the Planning branch is \"Commonsense Reasoning\".\n\nAnswer: Commonsense Reasoning", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 51937, "error": null, "extra": {"thread_id": 1987, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f3cc3b28e854493c927ab87e43d7d1d0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 285, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "commonsense reasoning", "normalised_gold": "egocentric planning"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll search through the document for bar plots.\n\nLooking through the figures:\n\n**Figure 1** (page 2): This shows example images and questions from the MIMIC-IT dataset. No bar plots here.\n\n**Figure 2** (page 4): This shows data format comparisons with example images and text. No bar plots here.\n\n**Figure 3** (page 5): This shows the Syphus pipeline overview. No bar plots here.\n\n**Figure 4** (page 7): This figure shows \"The data statistics of multi-modal in-context instruction-response pairs\" with:\n- (a) Instructions - shown as a sunburst/radial chart (not a bar plot)\n- (b) Responses - shown as a sunburst/radial chart (not a bar plot)\n- (c) Statistics of instructions and responses - This contains **4 bar plots**:\n  1. Instruction Length distribution\n  2. Response Length distribution\n  3. # Images distribution\n  4. # Related Instructions distribution\n\n**Figure 5** (page 8): This shows example responses from Otter with images and text. No bar plots here.\n\n**Figure 6** (page 10): This shows three subplots:\n- (a) Video understanding - **2 bar plots** (MSVD 0-shot and MSRVTT 0-shot)\n- (b) Vision-language model alignment - **1 bar plot** (Elo Rating)\n- (c) COCO caption - This is a line graph, not a bar plot\n\nTotal bar plots: 4 (from Figure 4c) + 2 (from Figure 6a) + 1 (from Figure 6b) = **7 bar plots**\n\nAnswer: 7", "citations": [], "input_tokens": 55838, "output_tokens": 405, "cost_micros": 173589, "latency_ms": 40905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine the document for bar plots, including any subplots.\n\nLooking through the figures:\n\n**Figure 1**: Shows example images and text - no bar plots.\n\n**Figure 2**: Shows example images and text comparisons - no bar plots.\n\n**Figure 3**: Shows a pipeline diagram - no bar plots.\n\n**Figure 4**: This figure contains multiple subplots showing data statistics:\n- (a) Shows two circular/sunburst plots for instructions - not bar plots\n- (b) Shows two circular/sunburst plots for responses - not bar plots\n- (c) Shows four plots with statistics:\n  1. \"Instruction Length\" - appears to be a histogram/bar plot\n  2. \"Response Length\" - appears to be a histogram/bar plot\n  3. \"# Images\" - appears to be a histogram/bar plot\n  4. \"# Related Instructions\" - appears to be a histogram/bar plot\n\nSo Figure 4(c) contains 4 bar plots.\n\n**Figure 5**: Shows example images and responses - no bar plots.\n\n**Figure 6**: This figure contains three subfigures:\n- (a) \"Video understanding\" - shows a grouped bar chart with multiple bars\n- (b) \"Vision-language model alignment\" - shows a bar chart\n- (c) \"COCO caption\" - shows a line plot with points, not a bar plot\n\nSo Figure 6 has 2 bar plots (a and b).\n\nTotal count: 4 (from Figure 4c) + 2 (from Figure 6) = 6 bar plots.\n\nAnswer: 6", "citations": [], "input_tokens": 21183, "output_tokens": 367, "cost_micros": 69054, "latency_ms": 11281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine all the figures in the document to count the bar plots.\n\nLooking through the figures:\n\n**Figure 1**: Shows example images and text describing different capabilities - no bar plots.\n\n**Figure 2**: Shows data format comparison with images and text - no bar plots.\n\n**Figure 3**: Shows the Sythus pipeline overview - no bar plots.\n\n**Figure 4**: This is titled \"The data statistics of multi-modal in-context instruction-response pairs\" and contains:\n- (a) A circular/sunburst plot showing root verb-noun pairs of instructions\n- (b) A circular/sunburst plot showing root verb-noun pairs of responses  \n- (c) Four separate plots labeled \"Statistics of instructions and responses\":\n  - Top left: A histogram/bar plot showing \"# Instructions\" vs \"Instruction Length\"\n  - Top right: A histogram/bar plot showing \"# Responses\" vs \"Response Length\"\n  - Bottom left: A histogram/bar plot showing \"# Instructions\" vs \"# Images\"\n  - Bottom right: A histogram/bar plot showing \"# Instructions\" vs \"# Related Instructions\"\n\nSo in Figure 4(c), there are **4 bar plots**.\n\n**Figure 5**: Shows example responses with images and text - no bar plots.\n\n**Figure 6**: Contains three subplots:\n- (a) \"Video understanding\" - shows bar plots comparing VideoChatGPT and Otter on MSVD and MSRVTT for QA and Captioning. This contains **4 grouped bar comparisons** (which I count as bar plots).\n- (b) \"Vision-language model alignment\" - shows an Elo rating comparison with bars for different models - this is **1 bar plot**.\n- (c) \"COCO caption\" - shows a line plot, not a bar plot.\n\nSo Figure 6 has: 4 (from 6a) + 1 (from 6b) = **5 bar plots**.\n\n**Figure 7**: Shows text examples in multiple languages - no bar plots.\n\nTotal bar plots: 4 (from Figure 4c) + 5 (from Figure 6) = **9 bar plots**.\n\nAnswer: 9", "citations": [], "input_tokens": 23508, "output_tokens": 506, "cost_micros": 78114, "latency_ms": 14187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "arm": "llamacloud_basic_lc", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:13Z", "latency_ms": 11328, "error": null, "raw_text_chars": 1376}]}, "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine the document for bar plots.\n\nLooking through the figures in the document:\n\n1. **Figure 1**: Shows example images with text descriptions - no bar plots\n2. **Figure 2**: Shows data format comparison with images and text - no bar plots\n3. **Figure 3**: Shows the Syphus overview diagram - no bar plots\n4. **Figure 4**: This appears to show statistics and analysis. Let me examine it carefully:\n   - Part (a): Shows verb-noun structure for instructions - this appears to be a bar chart\n   - Part (b): Shows verb-noun structure for responses - this appears to be a bar chart\n   - Part (c): Shows \"Diversity in our dataset\" with what appears to be distribution plots\n\nLooking at Figure 4 more carefully:\n- (a) contains what looks like a horizontal bar chart showing verbs and nouns from instructions\n- (b) contains what looks like a horizontal bar chart showing verbs and nouns from responses\n- (c) contains multiple distribution plots showing lengths and counts\n\n5. **Figure 5**: Shows response examples - no bar plots\n6. **Figure 6**: The document mentions Fig. 6(b) and Fig. 6(c) but these are not fully visible in the provided pages\n\nFrom what I can see clearly in Figure 4:\n- Subfigure (a): 1 bar plot (showing instruction verb-noun structure)\n- Subfigure (b): 1 bar plot (showing response verb-noun structure)\n\nTotal visible bar plots: 2\n\nAnswer: 2", "citations": [], "input_tokens": 20240, "output_tokens": 354, "cost_micros": 66030, "latency_ms": 11328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine all the figures in the document to count bar plots.\n\nLooking through the figures:\n\n**Figure 1**: This is a flowchart/diagram showing the MIMIC-IT overview with three main branches (Perception, Reasoning, Planning). Not a bar plot.\n\n**Figure 2**: This shows data format comparisons with images and text examples. Not a bar plot.\n\n**Figure 3**: This is a flowchart showing the Sythus overview. Not a bar plot.\n\n**Figure 4**: This figure is titled \"The data statistics of multi-modal in-context instruction-response pairs\" and contains:\n- (a) A circular/sunburst chart showing instructions (verb-noun pairs)\n- (b) A circular/sunburst chart showing responses (verb-noun pairs)\n- (c) \"Statistics of instructions and responses\" which contains 4 separate bar plots:\n  1. Instruction Length vs # Instructions (bar plot)\n  2. Response Length vs # Responses (bar plot)\n  3. # Images vs # Instructions (bar plot)\n  4. # Related Instructions vs # Instructions (bar plot)\n\nSo Figure 4(c) contains **4 bar plots**.\n\n**Figure 5**: This shows example responses from Otter with images and text. Not a bar plot.\n\n**Figure 6**: This contains three parts:\n- (a) A table showing video understanding results. Not a bar plot.\n- (b) A table showing vision-language model alignment. Not a bar plot.\n- (c) A table showing COCO caption results. Not a bar plot.\n\nTotal count: 4 bar plots (all from Figure 4c)\n\nAnswer: 4", "citations": [], "input_tokens": 24256, "output_tokens": 385, "cost_micros": 78543, "latency_ms": 10187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to count all bar plots (including subplots).Let me search for figures and plots in the document:Now let me read the actual content around the figures to see the descriptions of plots:Let me read more sections to find all the figures:Now let me look at Figure 4 more carefully to see if it contains bar plots:Good! I can see Figure 4 (c) has bar plots. Let me continue looking at Figure 6:Now I need to look at the actual visual description to understand what kind of plots Figure 6 has. Looking at the data in lines 1693-1823, I can see Figure 6 has three subplots (a), (b), and (c). Let me check if these are bar plots by examining the structure:\n\n- Figure 6(a): Shows \"MSVD 0-shot\" and \"MSRVTT 0-shot\" with \"QA\" and \"Captioning\" categories, comparing VideoChatGPT and Otter - this appears to be bar plots\n- Figure 6(b): Shows \"Elo Rating\" comparison among MM-GPT, Inst. BLIP, LLaVA, MiniGPT, and Otter - this appears to be a bar plot\n- Figure 6(c): Shows \"COCO caption\" with \"0-shot 4-shot 8-shot 16-shot\" comparing Otter and OpenFlamingo - this appears to be a line or bar plot\n\nNow let me search for more figures to make sure I don't miss any:Figure 7 is mentioned but let me check if it's a plot or just examples. From line 2292, Figure 7 seems to be just text examples for multi-lingual translation, not a plot.\n\nLet me search for any appendix figures or additional figures:Great! I found references to bar charts. Let me read those sections more carefully:These descriptions refer to the charts/bar plots that appear in Figure 4(c). Let me now carefully read to understand which images correspond to which figures. The Im1.png through Im4.png appear to be the four subplots in Figure 4(c).\n\nLet me search for Figure 6 visual descriptions:Let me look for the visual description of Figure 6:Let me search for visual descriptions that mention bar charts or plots more systematically by searching around the figure areas:Now let me check if there are any additional figures by searching for all figure captions:Good! So we have:\n- Figure 1: Overview (not a bar plot)\n- Figure 2: Data format comparison (not a bar plot)\n- Figure 3: Sythus overview (not a bar plot)\n- Figure 4: Data statistics with (a) and (b) showing verb-noun pairs, and (c) showing statistics with multiple subplots\n- Figure 5: Response examples (not a bar plot)\n- Figure 6: With (a), (b), and (c) subplots\n- Figure 7: Multi-lingual translation examples (not a bar plot)\n\nNow I need to count the bar plots more carefully. Let me re-examine Figure 4(c) and Figure 6:\n\nFrom my reading:\n- Figure 4(a): Circular/sunburst plot (not bar plot) - verb-noun pairs with inner/outer circles\n- Figure 4(b): Circular/sunburst plot (not bar plot) - verb-noun pairs with inner/outer circles  \n- Figure 4(c): Contains 4 bar plot histograms based on the visual descriptions I found:\n  1. Instructions vs Instruction Length (Im1.png) - bar plot\n  2. Responses vs Response Length (Im2.png) - bar plot\n  3. Instructions vs # Images (Im3.png) - bar plot\n  4. Instructions vs # Related Instructions (Im4.png) - bar plot\n\nFor Figure 6, I need to examine what type of plots they are. Let me look at the text around Figure 6 more carefully. From the data I saw (lines 1693-1823), Figure 6 appears to have:\n- Figure 6(a): Shows QA and Captioning comparisons for MSVD and MSRVTT - likely bar plots (2 groups x 2 bars = 4 bars total, but this counts as 2 bar plots for the two datasets)\n- Figure 6(b): Shows Elo Rating comparison - likely a bar plot (1 bar plot)\n- Figure 6(c): Shows COCO caption with 0-shot, 4-shot, 8-shot, 16-shot - this could be a line plot or bar plot\n\nLet me search for more specific information about these plots. Let me check if there are visual descriptions for Figure 6 plots:The file ends at line 5199. Let me search systematically for any remaining bar plot descriptions:Based on my thorough analysis of the document, let me now count all the bar plots:\n\n**Figure 4(c)** contains 4 bar plot histograms:\n1. Instructions vs Instruction Length\n2. Responses vs Response Length  \n3. Instructions vs # Images\n4. Instructions vs # Related Instructions\n\n**Figure 6** - I need to determine how many bar plots are in each subplot. Looking at the data structure:\n- Figure 6(a): Video understanding evaluation with MSVD 0-shot and MSRVTT 0-shot, each showing QA and Captioning - this appears to be 2 grouped bar charts (one for MSVD, one for MSRVTT)\n- Figure 6(b): Elo Rating comparison showing multiple models - this is 1 bar chart\n- Figure 6(c): COCO caption showing 0-shot through 16-shot results comparing Otter and OpenFlamingo - this could be either a grouped bar chart or line chart\n\nLooking at the structure in the text (lines 1816-1817: \"0-shot 4-shot 8-shot 16-shot\"), this suggests it's showing a progression, which is more typical of a line chart. However, I need to be careful.\n\nLet me reconsider the counting. The question asks for \"bar plots (including subplots)\". This means I should count each separate bar plot panel, not individual bars.\n\nFrom the evidence:\n- **Figure 4(c)**: 4 bar plot histograms (clearly described as histograms/bar charts in the visual descriptions)\n- **Figure 6(a)**: 2 bar plots (one for MSVD 0-shot, one for MSRVTT 0-shot, each comparing models on QA and Captioning)\n- **Figure 6(b)**: 1 bar plot (Elo Rating comparison)\n- **Figure 6(c)**: Likely 1 chart but unclear if it's a bar plot or line plot based on the \"0-shot 4-shot 8-shot 16-shot\" progression\n\nTotal count depends on whether Figure 6(c) is a bar plot. Given the context of comparing few-shot performance across different numbers of shots, and looking at typical machine learning papers, this is often shown as either a line chart or grouped bar chart.\n\nHowever, examining the OCR text more carefully from chunk 152307 (lines 1816-1822), the layout shows:\n- (a) Video understanding\n- (b) Vision-language model alignment  \n- (c) COCO caption\n\nAnd earlier in the caption it mentions \"CIDEr\" metric with values like \"60.8\", \"55\", etc. with labels for different shot counts.\n\nGiven the uncertainty, let me make my best assessment: Figure 6(c) showing few-shot learning progression is more likely to be a line plot given it shows a progression/trend across shot counts.\n\n**Final count:**\n- Figure 4(c): 4 bar plots\n- Figure 6(a): 2 bar plots\n- Figure 6(b): 1 bar plot\n- Figure 6(c): Not a bar plot (likely line plot)\n\n**Total: 7 bar plots**\n\nWait, let me reconsider Figure 6(a). Looking at the text structure again (lines 1693-1806), it shows:\n- MSVD 0-shot with QA and Captioning\n- MSRVTT 0-shot with QA and Captioning\n- These are comparing VideoChatGPT vs Otter\n\nThis could be:\n- 2 separate bar plots (one for MSVD, one for MSRVTT), OR\n- 1 bar plot with 2 groups\n\nTypically in papers, when multiple datasets are shown side-by-side in subfigure (a), they would be considered separate bar plots. So I'll count them as 2.\n\n**Final answer: 7**\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 164109, "error": null, "extra": {"thread_id": 1988, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_98fcc3ce4d2f4d98b3c04b41720c35ca", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1170, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "6"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 titled \"Sythus overview\" on page 5, I can see the cold start process.\n\nThe figure shows a \"Cold Start\" stage that comes before the main Sythus pipeline. The cold start stage shows:\n- In-context examples (document icon at top)\n- An arrow pointing down to \"Cold Start\"\n- ChatGPT being queried during this stage\n- This then leads to \"Step 1: System Message + visual annotation\"\n\nThe cold start stage is described in the text as being used to \"identify the optimal system message and in-context example for querying instruction-response pairs in a given dataset.\"\n\nAnswer: In-context examples", "citations": [], "input_tokens": 55843, "output_tokens": 153, "cost_micros": 169824, "latency_ms": 41358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "in context examples", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows the Sythus overview, the cold start stage appears before the main pipeline steps (Step 1-4).\n\nThe cold start stage shows interactions with ChatGPT to identify the optimal system message and in-context examples. The figure shows arrows connecting to \"System Message\" and \"In-context examples\" components.\n\nAnswer: Identify optimal system message and in-context example", "citations": [], "input_tokens": 21188, "output_tokens": 91, "cost_micros": 64929, "latency_ms": 5532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "identify optimal system message and in context example", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows the Sythus overview, the cold start stage shows a flow diagram where the first step involves \"System Message\" with an arrow pointing to ChatGPT.\n\nAnswer: System Message", "citations": [], "input_tokens": 23513, "output_tokens": 50, "cost_micros": 71289, "latency_ms": 4468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 (Sythus overview), I can see the pipeline stages.\n\nThe cold start stage is shown at the top, and the first step within it involves prompting ChatGPT to generate system messages and in-context examples.\n\nAnswer: System Message", "citations": [], "input_tokens": 20245, "output_tokens": 62, "cost_micros": 61665, "latency_ms": 4858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows the Sythus overview, I can see the cold start process and the subsequent steps.\n\nThe cold start section shows \"In-context examples\" which feeds into Step 1.\n\nStep 1 is labeled as \"System Message + visual annotation\" which leads to the \"Prompt\".\n\nAnswer: System Message + visual annotation", "citations": [], "input_tokens": 24261, "output_tokens": 80, "cost_micros": 73983, "latency_ms": 5562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message visual annotation", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q003", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "Prompt", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find the figure about cold start and identify the first step.Let me search for \"cold start\" in the document to locate the relevant figure.Good! I found a reference to \"Cold Start\" with steps. Let me read the section around line 759-798 (chunk 152250) where the figure appears.Perfect! I can see the figure clearly shows the steps of the cold start process. According to the figure, the steps are:\n\n- Step 1: System Message\n- Step 2: Generate instruction-response pairs\n- Step 3: Filtering\n- Step 4: Translation\n\nThe first step of cold start is \"System Message\".\n\nAnswer: System Message", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 29859, "error": null, "extra": {"thread_id": 1989, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_39c9f325d0c8483fb88a1b48780b1918", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 138, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "system message", "normalised_gold": "prompt"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the figures in the document, particularly Figure 1 and Figure 5 which show examples from the MIMIC-IT dataset and Otter's capabilities:\n\nIn Figure 1, there are examples showing egocentric video reasoning with what appears to be soccer/football scenes.\n\nIn Figure 5, the egocentric visual assistant examples at the bottom show scenes from what appears to be soccer/football, including references to passing the ball, wearing jerseys, and the goalkeeper saving the ball.\n\nAnswer: Soccer/Football", "citations": [], "input_tokens": 55848, "output_tokens": 115, "cost_micros": 169269, "latency_ms": 35250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find visual examples showing sports.\n\nIn Figure 5, which shows \"Otter's response examples in different scenarios,\" there is an \"Egocentric Visual Assistant\" example with text that says \"Hey Otter, what should I do?\" and the response mentions \"Pass the ball to the middle and run to the gate, looking for a shooting opportunity.\"\n\nThis clearly refers to soccer/football.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 21193, "output_tokens": 102, "cost_micros": 65109, "latency_ms": 5812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the visual examples in the document, particularly in Figure 5 under \"Egocentric Visual Assistant,\" I can see an image showing a soccer/football scene from a first-person perspective.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 23518, "output_tokens": 52, "cost_micros": 71334, "latency_ms": 3921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the visual examples in the document, I need to identify what sport is shown in the MIMIC-IT dataset examples and Otter's response examples.\n\nIn Figure 5, which shows \"Otter's response examples in different scenarios,\" there is an example labeled \"Egocentric Visual Assistant\" at the bottom. The text mentions \"guiding users to land a small aircraft.\"\n\nAdditionally, in the Ego4D (E4D) section's in-context examples (Table 5), there are questions about:\n- \"What color of jersey does my teammates wear?\" with the response \"Black, as you pass the ball to a player wearing black jersey\"\n- \"Does our team goal?\" with the response about a goalkeeper saving the ball\n\nThis indicates soccer/football is being discussed.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 20250, "output_tokens": 180, "cost_micros": 63450, "latency_ms": 7734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the document, I need to find visual examples that show sports.\n\nIn Figure 5, under the \"Egocentric Visual Assistant\" section, there are two screenshots. The right screenshot shows \"A person playing soccer with a chat overlay\" and includes the text \"User: Hey Otter, what should I do? OTTER-E: Pass the ball *to the middle* and run to the gate, looking for a shooting opportunity.\"\n\nAnswer: Soccer", "citations": [], "input_tokens": 24266, "output_tokens": 102, "cost_micros": 74328, "latency_ms": 6890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "soccer", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q004", "doc_id": "2306.05425v1.pdf", "answer_format": "str", "gold": "soccer", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document you mentioned to find information about the visual examples in MIMIC-IT and Otter.Let me search for visual examples and figures in the document to find information about sports shown in MIMIC-IT and Otter examples.Let me read more of the document to find figures and visual examples:Let me continue reading to find specific visual examples that show sports:Found \"tennis\"! Let me search for the context around this mention:Perfect! Let me read that section to get more context about the tennis visual example:Now let me search for where this image appears in relation to MIMIC-IT or Otter examples. Let me look earlier in the document to understand the context of these figures:Let me search backwards to find where these images are part of a figure showing examples:Let me search for \"Otter\" in the document to find where these visual examples are shown:Now let me look at Figure 5, which shows \"Otter's response examples\":Great! I found it. In the Egocentric Visual Assistant section of Figure 5, there's a conversation where Otter-E says \"Pass the ball to the middle and run to the gate, looking for a shooting opportunity.\" This clearly refers to soccer/football. Let me verify by looking at the visual description of the image that accompanies this text:Perfect! I found the answer. The visual description clearly states this is an \"Outdoor soccer/football field with green grass\" with players wearing blue athletic shirts. The Otter-E response also mentions \"Pass the ball to the middle and run to the gate, looking for a shooting opportunity,\" which confirms this is soccer/football.\n\nAnswer: Soccer (football)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 123078, "error": null, "extra": {"thread_id": 1990, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e493d9b5d4804d67aac4ae04786b3ef7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 450, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "soccer football", "normalised_gold": "soccer"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at Figure 7 on page 16, which shows \"In-context examples for multi-lingual translation query,\" I can count the following languages:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0629\u06cc\u0628\u0631\u0639\u0644\u0627)\n\nAnswer: 8", "citations": [], "input_tokens": 55842, "output_tokens": 108, "cost_micros": 169146, "latency_ms": 37125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of languages shown in Figure 7, which displays in-context examples for multi-lingual translation query.\n\nLooking at Figure 7, I can see the following languages listed:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0627\u0644\u0639\u0631\u0628\u064a\u0629)\n\nAnswer: 8", "citations": [], "input_tokens": 21187, "output_tokens": 113, "cost_micros": 65256, "latency_ms": 4327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of languages shown in Figure 7, which displays in-context examples for multi-lingual translation query.\n\nLooking at Figure 7, I can see the following languages listed:\n\n1. English\n2. Chinese \u4e2d\u6587\n3. Spanish espa\u00f1ol\n4. Japanese \u65e5\u672c\u8a9e\n5. French Fran\u00e7ais\n6. Korean \ud55c\uad6d\uc5b4\n7. German Deutsch\n8. Arabic \u0627\u0644\u0639\u0631\u0628\u064a\u0629\n\nAnswer: 8", "citations": [], "input_tokens": 23512, "output_tokens": 102, "cost_micros": 72066, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, specifically at Figure 7 which shows \"In-context examples for multi-lingual translation query\", I can count the following languages:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0629\u06cc\u0628\u0631\u0639\u0644\u0627)\n\nAnswer: 8", "citations": [], "input_tokens": 20244, "output_tokens": 108, "cost_micros": 62352, "latency_ms": 4578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of languages shown in Figure 7, which displays in-context examples for multi-lingual translation query.\n\nLooking at Figure 7 on page 16, I can see the following languages listed:\n\n1. English\n2. Chinese \u4e2d\u6587\n3. Spanish espa\u00f1ol\n4. Japanese \u65e5\u672c\u8a9e\n5. French Fran\u00e7ais\n6. Korean \ud55c\uad6d\uc5b4\n7. German Deutsch\n8. Arabic \u0627\u0644\u0639\u0631\u0628\u064a\u0629\n\nAnswer: 8", "citations": [], "input_tokens": 24260, "output_tokens": 107, "cost_micros": 74385, "latency_ms": 4422, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q006", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "8", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about multi-lingual translation query and count the languages in the in-context examples.Now I can see references to the multi-lingual translation query. Let me read the specific section that mentions Figure 7 and the languages.Perfect! I found the information. Let me now look at Figure 7 to see the actual in-context examples and count the languages.Perfect! I can now see Figure 7 which shows the in-context examples for multi-lingual translation query. Let me count the languages shown:\n\n1. English\n2. Chinese (\u4e2d\u6587)\n3. Spanish (espa\u00f1ol)\n4. Japanese (\u65e5\u672c\u8a9e)\n5. French (Fran\u00e7ais)\n6. Korean (\ud55c\uad6d\uc5b4)\n7. German (Deutsch)\n8. Arabic (\u0627\u0644\u0639\u0631\u0628\u064a\u0629)\n\nThe document clearly shows 8 languages in the in-context examples for multi-lingual translation query in Figure 7.\n\nAnswer: 8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44640, "error": null, "extra": {"thread_id": 1991, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_25a64843d98447e9baa03cf538d93723", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 212, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "native_pdf", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll go through the document to count all the line plots shown.\n\nLooking through the figures in the document:\n\n**Figure 1** (page 2): This shows a complex diagram with multiple examples of different tasks and image types, but does not appear to contain line plots.\n\n**Figure 2** (page 4): This shows data format comparisons with example images and text, but no line plots.\n\n**Figure 3** (page 5): This shows the Sythus pipeline overview with icons and flowchart elements, but no line plots.\n\n**Figure 4** (page 7): This contains:\n- (a) A sunburst chart showing instructions\n- (b) A sunburst chart showing responses  \n- (c) Multiple bar charts/histograms and what appears to be **line plots** showing statistics\n\nLooking at Figure 4(c) more carefully, there appear to be line plots showing \"# Instructions\" and \"# Responses\" over \"Instruction Length\" and \"Response Length\", plus what looks like a line plot for \"# Related Instructions\".\n\n**Figure 5** (page 8): Shows example images with text responses, not line plots.\n\n**Figure 6** (page 10): This contains three subfigures:\n- (a) Bar charts for video understanding\n- (b) Bar chart for vision-language model alignment\n- (c) Contains **line plots** showing CIDEr scores across different shot settings (0-shot, 4-shot, 8-shot, 16-shot) comparing Otter and OpenFlamingo - this appears to show 2 line plots\n\n**Figure 7** (page 16): Shows text examples in multiple languages, no line plots.\n\nCounting the line plots:\n- Figure 4(c): appears to show 2-3 line plots\n- Figure 6(c): shows 2 line plots (one for Otter, one for OpenFlamingo)\n\nTotal: approximately 4-5 line plots visible in Figure 4(c) and 2 in Figure 6(c) = approximately 6-7 total.\n\nLooking more carefully at Figure 4(c), I can see there are line plots for instruction length distribution, response length distribution, and related instructions. Figure 6(c) clearly shows 2 ", "citations": [], "input_tokens": 55833, "output_tokens": 512, "cost_micros": 175179, "latency_ms": 42171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2306.05425v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "azure_basic_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots.\n\nLooking through the figures in the document:\n\n**Figure 1**: This shows example images with text descriptions - not line plots.\n\n**Figure 2**: Shows example data formats with images and text - not line plots.\n\n**Figure 3**: Shows a pipeline/flowchart diagram - not line plots.\n\n**Figure 4**: This figure contains multiple subplots:\n- (a) A circular/sunburst chart showing root verb-noun pairs\n- (b) Another circular/sunburst chart showing root verb-noun pairs\n- (c) Four separate line plots showing:\n  1. Instruction Length distribution\n  2. Response Length distribution\n  3. # Images distribution\n  4. # Related Instructions distribution\n\nSo Figure 4(c) contains **4 line plots**.\n\n**Figure 5**: Shows example images with text responses - not line plots.\n\n**Figure 6**: This figure contains three subplots:\n- (a) Two grouped bar charts for \"Video understanding\" (MSVD 0-shot and MSRVTT 0-shot)\n- (b) A bar chart showing \"Human evaluation comparison\" with Elo Ratings\n- (c) A line plot showing \"COCO caption\" performance across different shot settings (0-shot, 4-shot, 8-shot, 16-shot)\n\nSo Figure 6(c) contains **1 line plot**.\n\n**Total line plots**: 4 (from Figure 4c) + 1 (from Figure 6c) = **5 line plots**\n\nAnswer: 5", "citations": [], "input_tokens": 21178, "output_tokens": 363, "cost_micros": 68979, "latency_ms": 10016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "azure_premium_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (graphs showing data points connected by lines).\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows sample images with text descriptions - not a line plot.\n\n**Figure 2**: Shows a comparison of data formats with images and text - not a line plot.\n\n**Figure 3**: Shows a flowchart/pipeline diagram - not a line plot.\n\n**Figure 4**: Contains multiple subfigures:\n- (a) A circular/sunburst diagram showing verb-noun pairs\n- (b) Another circular/sunburst diagram showing verb-noun pairs\n- (c) Contains **4 line plots**: \n  1. \"# Instructions\" vs \"Instruction Length\"\n  2. \"# Responses\" vs \"Response Length\"\n  3. \"# Instructions\" vs \"# Images\"\n  4. \"# Instructions\" vs \"# Related Instructions\"\n\n**Figure 5**: Shows example images with captions and chat interfaces - not a line plot.\n\n**Figure 6**: Contains three subfigures:\n- (a) Bar chart for video understanding - not a line plot\n- (b) Bar chart for vision-language model alignment - not a line plot\n- (c) Shows **1 line plot**: \"CIDEr\" performance across different shot settings (0-shot, 4-shot, 8-shot, 16-shot)\n\n**Figure 7**: Shows text examples in multiple languages - not a line plot.\n\nTotal line plots found:\n- Figure 4(c): 4 line plots\n- Figure 6(c): 1 line plot\n\nAnswer: 5", "citations": [], "input_tokens": 23503, "output_tokens": 370, "cost_micros": 76059, "latency_ms": 9937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "llamacloud_basic_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to search through the document for line plots/graphs.\n\nLooking through the document, I can identify the following figures with line plots:\n\n1. Figure 4 - This shows multiple visualizations including what appears to be bar charts and word clouds showing verb-noun structures and dataset statistics. The caption mentions \"Fig. 4 (a)\" for instructions and \"Fig. 4 (b)\" for responses, and \"Fig. 4 (c)\" for diversity metrics. These appear to be bar charts and word frequency visualizations, not line plots.\n\n2. Figure 6 - The text references \"Fig. 6(b)\" and \"Fig. 6(c)\" where:\n   - Fig. 6(b) is described as showing Elo ratings (likely a bar chart or ranking visualization)\n   - Fig. 6(c) is described as showing \"COCO caption (CIDEr) few-shot evaluation\" - this is explicitly described as comparing performance across different shot settings (0-shot, 4-shot, 8-shot, 16-shot, 32-shot), which would typically be shown as a line plot\n\nBased on the text descriptions, Figure 6(c) showing the few-shot learning performance across different numbers of shots (0, 4, 8, 16, 32) would be presented as a line plot to show the progression of performance.\n\nHowever, I cannot actually see the figures in this document to count them definitively. Based on the textual descriptions alone, there appears to be at least 1 line plot (Figure 6c showing few-shot performance).\n\nAnswer: 1", "citations": [], "input_tokens": 20235, "output_tokens": 361, "cost_micros": 66120, "latency_ms": 11078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "llamacloud_premium_lc", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine the document for line plots (graphs showing lines connecting data points over continuous axes).\n\nLooking through the figures in the document:\n\n**Figure 1**: This is a flowchart/diagram showing the MIMIC-IT overview with different capabilities (Perception, Reasoning, Planning) - not a line plot.\n\n**Figure 2**: Shows data format comparisons with images and text examples - not a line plot.\n\n**Figure 3**: Shows a flowchart of the Syphus pipeline - not a line plot.\n\n**Figure 4**: This figure shows multiple visualizations of data statistics:\n- (a) Shows a circular/sunburst chart for Instructions (verb-noun pairs)\n- (b) Shows a circular/sunburst chart for Responses (verb-noun pairs)\n- (c) Shows \"Statistics of instructions and responses\" with what appears to be 4 small plots showing:\n  - Instruction Length vs # Instructions\n  - Response Length vs # Responses\n  - # Images vs # Instructions\n  - # Related Instructions vs # Instructions\n\nThese plots in Figure 4(c) appear to be line plots showing distributions/relationships.\n\n**Figure 5**: Shows example responses from Otter with images and text - not a line plot.\n\n**Figure 6**: Shows three tables labeled (a), (b), and (c) with numerical results - these are tables, not line plots.\n\n**Figure 7**: Shows text examples for multi-lingual translation - not a line plot.\n\nCounting the line plots in Figure 4(c): There are 4 small line plots showing different statistics.\n\nAnswer: 4", "citations": [], "input_tokens": 24251, "output_tokens": 364, "cost_micros": 78213, "latency_ms": 10437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "1"}}
+{"qid": "2306.05425v1.pdf::Q007", "doc_id": "2306.05425v1.pdf", "answer_format": "int", "gold": "1", "pages": 22, "document_id": 7587, "arm": "surfsense_agentic", "question_id": "2306.05425v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to count the line plots.Let me search for figures and plots in the document by reading through it systematically:Now let me read the actual content of these figures to understand their structure and count line plots:Let me continue to check all figures systematically:Let me search more specifically for line plots or line charts:Let me look at Figure 6 more carefully as it seems to contain multiple subfigures with plots:Based on the data I see in Figure 6(c), it appears to show different shot settings (0-shot, 4-shot, 8-shot, 16-shot) which suggests this is a line plot. Let me search for more visual descriptions to understand all the plots:Let me read more around Figure 4 to understand what types of plots it contains:Now let me look for visual descriptions and embedded images to understand the structure of figures better. Let me check if there are visual descriptions that mention line plots or graphs:Let me search for embedded images and their visual descriptions:Let me look at the visual descriptions of the image that corresponds to Figure 6:This visual description seems to be of a different image not related to Figure 6. Let me look for a chart or graph description by searching through the document more systematically. Let me read sections with image data to find charts:The visual descriptions I'm seeing are for photographs, not charts. Let me search for keywords related to charts or visualizations more carefully:Good! Let me read this section to find charts and visualizations:Good! I found the section with chart visual descriptions. These appear to be histograms/bar charts for Figure 4. Now let me search for descriptions that mention line graphs or line plots:Let me specifically look for \"line\" combined with visualization terms:Now let me search specifically for Figure 6's visualization descriptions by looking near the Figure 6 location more carefully. Let me look for the embedded images around Figure 6:Let me check for chart/plot related embedded images. Looking back at Figure 6, I saw it has three subfigures (a), (b), and (c). Let me search for embedded images that might show line plots by looking at descriptions with \"axis\":I found histograms/bar charts. Now let me look for any line-based visualizations. Let me search for \"CIDEr\" which was mentioned in Figure 6(c) about COCO caption evaluation:I can see Figure 6(c) shows data with labels like \"0-shot 4-shot 8-shot 16-shot\" and \"CIDEr\", which strongly suggests it's a line plot showing performance across different shot settings. Let me look for a visualization description that mentions this, or look for visual descriptions that discuss multiple data points connected:The text \"0-shot 4-shot 8-shot 16-shot\" followed by \"(c) COCO caption\" suggests Figure 6(c) is indeed a line plot or chart. However, the embedded image Im14.jpg description seems unrelated (clinical setting). Let me look at the data more carefully and count what we know so far. Let me check if there are visual descriptions specifically for charts with \"shot\":", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 191452, "error": null, "extra": {"thread_id": 1992, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_48224411a27b457b894b9bc78d679631", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 942, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "1"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:18Z", "latency_ms": 52514, "error": null, "raw_text_chars": 860}]}, "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll examine pages 5-10 to count the tables.\n\nPage 5: No tables visible in the text content shown.\n\nPage 6: Table 1 is present - \"Llama 2 family of models\" showing training data, parameters, context length, GQA, tokens, and LR for Llama 1 and Llama 2 models.\n\nPage 7: \n- Table 2: \"CO2 emissions during pretraining\" showing time, power consumption, and carbon emitted for different Llama 2 models.\n- Table 3: \"Overall performance on grouped academic benchmarks\" comparing MPT, Falcon, Llama 1, and Llama 2 models.\n\nPage 8:\n- Table 4: \"Comparison to closed-source models on academic benchmarks\" showing results for GPT-3.5, GPT-4, PaLM, PaLM-2-L, and Llama 2.\n- Table 5: \"SFT annotation\" showing examples of helpfulness and safety annotations.\n\nPage 9: No complete tables visible in the shown content.\n\nPage 10: No tables visible in the shown content.\n\nAnswer: 5", "citations": [], "input_tokens": 199213, "output_tokens": 280, "cost_micros": 601839, "latency_ms": 52468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count the tables on pages 5-10 of the document.\n\n**Page 5:**\n- Table 1: LLAMA 2 family of models\n\n**Page 6:**\n- Table 2: CO2 emissions during pretraining\n\n**Page 7:**\n- Table 3: Overall performance on grouped academic benchmarks\n\n**Page 8:**\n- Table 4: Comparison to closed-source models on academic benchmarks\n\n**Page 9:**\n- Table 5: SFT annotation examples\n\n**Page 10:**\n- Table 6: Statistics of human preference data for reward modeling\n\nAnswer: 6", "citations": [], "input_tokens": 80762, "output_tokens": 144, "cost_micros": 244446, "latency_ms": 6391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count all the tables on pages 5-10.\n\n**Page 5:**\n- Table 1: LLAMA 2 family of models\n\n**Page 6:**\n- Figure 5 (not a table)\n- Table 2: CO2 emissions during pretraining\n\n**Page 7:**\n- Table 3: Overall performance on grouped academic benchmarks\n- Table 4: Comparison to closed-source models on academic benchmarks\n\n**Page 8:**\n- Table 5: SFT annotation - example of helpfulness and safety annotation\n\n**Page 9:**\n- No tables (contains Figures 11 and discussion)\n\n**Page 10:**\n- Table 6: Statistics of human preference data for reward modeling\n\nTotal count: 5 tables\n\nAnswer: 5", "citations": [], "input_tokens": 109911, "output_tokens": 181, "cost_micros": 332448, "latency_ms": 8235, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count the tables that appear in pages 5-10 of the document.\n\nPage 5: \n- Table 1: Llama 2 family of models (comparing parameters, context length, etc.)\n\nPage 6:\n- No tables\n\nPage 7:\n- Table 3: Overall performance on grouped academic benchmarks\n\nPage 8:\n- Table 4: Comparison to closed-source models on academic benchmarks\n- Table 5: SFT annotation example (shown at top of page)\n\nPage 9:\n- No tables (this page discusses supervised fine-tuning details)\n\nPage 10:\n- Table 6: Statistics of human preference data for reward modeling\n\nCounting all tables in pages 5-10:\n- Page 5: 1 table\n- Page 6: 0 tables\n- Page 7: 1 table\n- Page 8: 2 tables\n- Page 9: 0 tables\n- Page 10: 1 table\n\nTotal: 1 + 0 + 1 + 2 + 0 + 1 = 5 tables\n\nAnswer: 5", "citations": [], "input_tokens": 80961, "output_tokens": 265, "cost_micros": 246858, "latency_ms": 7983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll count the tables on pages 5-10 of the document.\n\n**Page 5:**\n- Table 1: LLAMA 2 family of models\nTotal: 1 table\n\n**Page 6:**\n- Table 2: CO\u2082 emissions during pretraining\n- Figure 5: Training Loss for LLAMA 2 models (this is a figure, not a table)\nTotal: 1 table\n\n**Page 7:**\n- Table 3: Overall performance on grouped academic benchmarks\nTotal: 1 table\n\n**Page 8:**\n- Table 4: Comparison to closed-source models on academic benchmarks\nTotal: 1 table\n\n**Page 9:**\n- Table 5: SFT annotation examples\nTotal: 1 table\n\n**Page 10:**\n- Table 6: Statistics of human preference data for reward modeling\nTotal: 1 table\n\nTotal tables on pages 5-10: 1 + 1 + 1 + 1 + 1 + 1 = 6\n\nAnswer: 6", "citations": [], "input_tokens": 105634, "output_tokens": 249, "cost_micros": 320637, "latency_ms": 8264, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document that was flagged as most relevant (2307.09288v2.pdf.xml) and check pages 5-10 for tables.I need to search for page markers to identify pages 5-10. Let me search for page indicators in the document.Let me read more of the document to understand its structure and find page markers.Good! I can see page numbers are indicated with `<!-- PageNumber=\"X\" -->`. Let me search for these page markers to find pages 5-10.Perfect! Now I know that:\n- Page 5 starts at line 1660\n- Page 6 starts at line 1858\n- Page 7 starts at line 1955\n- Page 8 starts at line 2202\n- Page 9 starts at line 2265\n- Page 10 starts at line 2332\n- Page 11 starts at line 2461\n\nSo I need to read from line 1660 to line 2460 to cover pages 5-10. Let me search for table markers in this range.Great! I can see the content from pages 5-10. Let me continue reading to make sure I capture all tables in this range. I've found several tables so far. Let me count them systematically:\n\nFrom line 1660 to 2460 (pages 5-10), I've seen:\n1. Table 1 (line 1664) - LLAMA 2 family of models\n2. Table 2 (line 1862) - CO2 emissions during pretraining\n3. Table 3 (line 1959) - Overall performance on grouped academic benchmarks\n4. Table 4 (line 2131) - Comparison to closed-source models\n5. Table 5 (line 2209) - SFT annotation\n6. Table 6 (line 2336) - Statistics of human preference data\n\nLet me verify by reading the remaining portion up to line 2460 to make sure I haven't missed any tables:Perfect! Now I can see all the `<table>` tags in the document. Based on the PageNumber markers I found earlier:\n- Page 5 starts at line 1660\n- Page 6 starts at line 1858\n- Page 7 starts at line 1955\n- Page 8 starts at line 2202\n- Page 9 starts at line 2265\n- Page 10 starts at line 2332\n- Page 11 starts at line 2461\n\nSo pages 5-10 cover lines 1660 to 2460 (just before page 11).\n\nThe tables that fall within this range are:\n1. Line 1664: `<table>` - Table 1 (on page 5)\n2. Line 1862: `<table>` - Table 2 (on page 6)\n3. Line 1959: `<table>` - Table 3 (on page 7)\n4. Line 2131: `<table>` - Table 4 (on page 8)\n5. Line 2209: `<table>` - Table 5 (on page 8)\n6. Line 2336: `<table>` - Table 6 (on page 10)\n\nTables at lines 2479 and 2544 are on page 11 or later, so they don't count.\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54500, "error": null, "extra": {"thread_id": 1993, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d18e0e208d4c44ad9ccab3815ed12b04", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 413, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:25Z", "latency_ms": 40452, "error": null, "raw_text_chars": 575}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 on page 4, which shows \"Safety human evaluation results for Llama 2-Chat compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is better for safety).\n\nFrom the bar chart showing \"Violation % (lower is safer)\":\n- Llama 2-70b-chat has the lowest violation percentage (approximately 0-5%)\n- Llama 2-13b-chat has the second lowest (approximately 0-5%)\n- Llama 2-34b-chat has the third lowest (approximately 5-10%)\n\nAnswer: Llama 2-70b-chat, Llama 2-13b-chat, Llama 2-34b-chat", "citations": [], "input_tokens": 199224, "output_tokens": 182, "cost_micros": 600402, "latency_ms": 40406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is safer).\n\nFrom the bar chart in Figure 3, the models with the lowest violation percentages are:\n\n1. LLAMA 2-70b-chat (lowest violation %)\n2. ChatGPT 0301 (second lowest)\n3. LLAMA 2-34b-chat (third lowest)\n\nAnswer: Llama-2 70b-chat, ChatGPT 0301, Llama-2 34b-chat", "citations": [], "input_tokens": 80773, "output_tokens": 157, "cost_micros": 244674, "latency_ms": 6141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "chatgpt 0301, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is safer).\n\nFrom the bar chart, the three models with the lowest violation percentages are:\n1. Llama-2-70b-chat (approximately 0.5%)\n2. Llama-2-34b-chat (approximately 1%)\n3. Llama-2-13b-chat (approximately 1.5%)\n\nAnswer: Llama-2-70b-chat, Llama-2-34b-chat, Llama-2-13b-chat", "citations": [], "input_tokens": 109922, "output_tokens": 165, "cost_micros": 332241, "latency_ms": 7625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 which shows \"Safety human evaluation results for Llama 2-Chat compared to other open-source and close-source models,\" I need to identify the models with the lowest violation percentages (since lower is better for safety).\n\nFrom the figure, the models are ordered from left to right with their violation percentages. The three models with the lowest violation percentages are:\n\n1. Llama 2-Chat (70B) - appears to have the lowest violation percentage\n2. ChatGPT (0301) - appears to have the second lowest\n3. Falcon (40B-instruct) - appears to have the third lowest\n\nAnswer: Llama 2-Chat (70B), ChatGPT (0301), Falcon (40B-instruct)", "citations": [], "input_tokens": 80972, "output_tokens": 176, "cost_micros": 245556, "latency_ms": 7233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "chatgpt 0301, falcon 40b instruct, llama 2 chat 70b", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:05Z", "latency_ms": 10343, "error": null, "raw_text_chars": 376}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I can see the violation percentages (lower is safer):\n\nThe top-3 safest models (lowest violation %) are:\n1. Llama-2 7b-chat: 3%\n2. Llama-2 13b-chat: 3%\n3. Llama-2 70b-chat: 4%\n\nAnswer: Llama-2 7b-chat, Llama-2 13b-chat, Llama-2 70b-chat", "citations": [], "input_tokens": 105645, "output_tokens": 151, "cost_micros": 319200, "latency_ms": 10343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 70b chat, llama 2 7b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and identify the top-3 models.Let me search more specifically for Figure 3 content:Now let me read the area around line 1363 to see the actual figure data:Based on the document, Figure 3 shows safety violation percentages where \"lower is safer\". Let me look for the actual values to determine the top-3 (lowest violation %) models:The visual description indicates that the first 4 bars (dark blue) have the lowest values (approximately 3-7), which are the safest models. Based on the model order from lines 1391-1420, the models are listed in order:\n\n1. Llama-2 7b-chat\n2. Llama-2 13b-chat\n3. Llama-2 34b-chat\n4. Llama-2 70b-chat\n5. MPT 7b-chat\n6. Vicuna 13b-v1.1\n7. Vicuna 33b-v1.3\n8. Falcon 40b-instruct\n9. PaLM Bison\n10. ChatGPT 0301\n\nThe description states the first 4 bars (dark blue) have values approximately 3-7, which are the lowest (safest). So the top-3 safest models would be the first three models.\n\nAnswer: Llama-2 7b-chat, Llama-2 13b-chat, Llama-2 34b-chat", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 48187, "error": null, "extra": {"thread_id": 1994, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8f47a07752394e8d87ac6ac816505c2a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 260, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 7b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 for QUAC (f1) results in the 0-shot setting:\n\nThe table shows:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- Llama 1 7B: 38.9\n- Llama 1 13B: 39.9\n- Llama 1 33B: 44.1\n- Llama 1 65B: 41.0\n- Llama 2 7B: 39.4\n- Llama 2 13B: 42.7\n- Llama 2 34B: 42.9\n- Llama 2 70B: 42.4\n\nAnswer: Llama 1 33B", "citations": [], "input_tokens": 199228, "output_tokens": 210, "cost_micros": 600834, "latency_ms": 79592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the QUAC (f1) 0-shot column.\n\nFrom the table, the 0-shot QUAC results are:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score in the 0-shot setting for QUAC is 44.1.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 80777, "output_tokens": 227, "cost_micros": 245736, "latency_ms": 7592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the 0-shot results for the QUAC dataset.\n\nIn the QUAC (f1) 0-shot column, the scores are:\n\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score in the 0-shot QUAC column is 44.1.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 109926, "output_tokens": 271, "cost_micros": 333843, "latency_ms": 8594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 \"Comparison to Open-Source Models on Reading Comprehension (SQUAD and QUAC)\", I need to find the QUAC 0-shot column.\n\nThe table shows QUAC results with 0-shot and 1-shot columns. For the 0-shot setting on QUAC:\n\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- Llama 1 7B: 38.9\n- Llama 1 13B: 39.9\n- Llama 1 33B: 44.1\n- Llama 1 65B: 41.0\n- Llama 2 7B: 39.4\n- Llama 2 13B: 42.7\n- Llama 2 34B: 42.9\n- Llama 2 70B: 42.4\n\nThe highest 0-shot score on QUAC is 44.1, achieved by Llama 1 33B.\n\nAnswer: Llama 1 33B", "citations": [], "input_tokens": 80976, "output_tokens": 284, "cost_micros": 247188, "latency_ms": 8469, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:10Z", "latency_ms": 29391, "error": null, "raw_text_chars": 525}]}, "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the QUAC (f1) 0-shot column.\n\nFrom the table, the QUAC (f1) 0-shot scores are:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score is 44.1, which belongs to LLAMA 1 33B.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 105649, "output_tokens": 274, "cost_micros": 321057, "latency_ms": 29391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Table 23 and identify the best performing model in the 0-shot setting on the QUAC dataset.Let me search for \"Table 23\" in the document.Now let me read the section containing Table 23 to see the complete table.Perfect! I can see Table 23 completely now. Looking at the QUAC (f1) 0-shot column (column 7 in the table), the scores are:\n\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score in the 0-shot setting on QUAC is 44.1, which belongs to LLAMA 1 33B.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40921, "error": null, "extra": {"thread_id": 1995, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1765040c73804b7381cc9d57c302f53d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 202, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to find Table 21 in the document and count how many slots (cells) are missing data.\n\nLooking through the document, I can find Table 21 on page 49. Let me examine it:\n\nTable 21 is titled \"Code generation results on Human-Eval and MBPP.\"\n\nThe table has the following structure:\n- Rows for different models: MPT (7B, 30B), Falcon (7B, 40B), Llama 1 (7B, 13B, 33B, 65B), and Llama 2 (7B, 13B, 34B, 70B)\n- Columns for: Human-Eval (pass@1, pass@100) and MBPP (pass@1, pass@80)\n\nLet me count the missing entries (shown as \"-\" or blank):\n\nLooking at the table:\n- MPT 7B: has values for pass@1 under Human-Eval (18.3) and MBPP (22.6), but \"-\" for pass@100 and pass@80\n- MPT 30B: has values for pass@1 (25.0, 32.8), but \"-\" for pass@100 and pass@80\n- Falcon 7B: has values for pass@1 (0.0, 11.2), but \"-\" for pass@100 and pass@80\n- Falcon 40B: has values for pass@1 (0.6, 29.8), but \"-\" for pass@100 and pass@80\n- All Llama 1 and Llama 2 models have complete data (no missing slots)\n\nMissing slots:\n- MPT 7B: 2 missing (pass@100 for both benchmarks)\n- MPT 30B: 2 missing\n- Falcon 7B: 2 missing\n- Falcon 40B: 2 missing\n\nTotal: 8 missing slots\n\nAnswer: 8", "citations": [], "input_tokens": 199211, "output_tokens": 461, "cost_micros": 604548, "latency_ms": 75532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 in the document to count how many slots (cells) are missing data.\n\nLooking at Table 21 in the document, which shows \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Rows: Different models (MPT 7B, Falcon 7B/40B, LLAMA 1 7B/13B/33B/65B, LLAMA 2 7B/13B/34B/70B)\n- Columns: Human-Eval pass@1, Human-Eval pass@100, MBPP pass@1, MBPP pass@80\n\nLet me count the missing entries (marked with \"-\"):\n\nMPT 7B: pass@100 for Human-Eval is missing (1 slot)\nMPT 30B: pass@100 for Human-Eval is missing (1 slot)\nFalcon 40B: pass@1 and pass@100 for Human-Eval are missing, pass@1 and pass@80 for MBPP are missing (4 slots)\n\nTotal missing slots: 1 + 1 + 4 = 6\n\nAnswer: 6", "citations": [], "input_tokens": 80760, "output_tokens": 280, "cost_micros": 246480, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 in the document to count the missing slots.\n\nLooking at Table 21, which shows \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Column headers: Model name, size, and then 4 metric columns (pass@1, pass@100 for Human-Eval, and pass@1, pass@80 for MBPP)\n- Row groups for: MPT (2 sizes), Falcon (2 sizes), LLAMA 1 (4 sizes), and LLAMA 2 (4 sizes)\n\nLet me count the cells systematically:\n\nFor MPT:\n- 7B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n- 30B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n\nFor Falcon:\n- 7B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n- 40B: has values for pass@1 and pass@1 (MBPP), missing pass@100 and pass@80 = 2 missing\n\nFor LLAMA 1 (4 sizes: 7B, 13B, 33B, 65B):\n- All sizes have all 4 metrics filled = 0 missing\n\nFor LLAMA 2 (4 sizes: 7B, 13B, 34B, 70B):\n- All sizes have all 4 metrics filled = 0 missing\n\nTotal missing slots = 2 + 2 + 2 + 2 = 8\n\nAnswer: 8", "citations": [], "input_tokens": 109909, "output_tokens": 429, "cost_micros": 336162, "latency_ms": 10592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 in the document to count how many slots (cells) are missing data.\n\nLooking at Table 21 titled \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Column headers: Model, Human-Eval (pass@1, pass@100), MBPP (pass@1, pass@80)\n- Row headers: MPT, Falcon, Llama 1, Llama 2\n\nLet me count the cells:\n- MPT row: 7B size is listed, then has values 18.3, -, 22.6, -\n- Falcon row: 7B size is listed, then has values 0.0, -, 11.2, -\n- Llama 1 row: 7B size is listed, then has values 15.8, 52.5, 22.0, 64.0\n- Llama 2 row: 7B size is listed, then has values 18.3, 60.2, 30.6, 69.0\n\nThe \"-\" symbols indicate missing data.\n\nCounting the dashes:\n- MPT: 2 dashes (pass@100 for Human-Eval, pass@80 for MBPP)\n- Falcon: 2 dashes (pass@100 for Human-Eval, pass@80 for MBPP)\n- Llama 1: 0 dashes (all values present)\n- Llama 2: 0 dashes (all values present)\n\nTotal missing slots: 2 + 2 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 80959, "output_tokens": 385, "cost_micros": 248652, "latency_ms": 10391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Table 21 to count the missing slots (empty cells).\n\nLooking at Table 21 titled \"Code generation results on Human-Eval and MBPP\":\n\nThe table has the following structure:\n- Column headers: Model | Size | Human-Eval pass@1 | Human-Eval pass@100 | MBPP pass@1 | MBPP pass@80\n- Rows for: MPT (7B, 30B), Falcon (7B, 40B), LLAMA 1 (7B, 13B, 33B, 65B), LLAMA 2 (7B, 13B, 34B, 70B)\n\nLet me count the cells:\n- Total rows with data: 12 (excluding header)\n- Total columns with data: 6 (including Model and Size)\n- So for the data columns (excluding Model column): 12 rows \u00d7 4 data columns = 48 potential data cells\n\nNow counting the filled cells in the data columns:\n- MPT 7B: 4 values (18.3, -, 22.6, -)\n- MPT 30B: 4 values (25.0, -, 32.8, -)\n- Falcon 7B: 4 values (0.0, -, 11.2, -)\n- Falcon 40B: 4 values (0.6, -, 29.8, -)\n- LLAMA 1 7B: 4 values (10.5, 36.5, 17.7, 56.2)\n- LLAMA 1 13B: 4 values (15.8, 52.5, 22.0, 64.0)\n- LLAMA 1 33B: 4 values (21.7, 70.7, 30.2, 73.4)\n- LLAMA 1 65B: 4 values (23.7, 79.3, 37.7, 76.8)\n- LLAMA 2 7B: 4 values (12.8, 45.6, 20.8, 62.", "citations": [], "input_tokens": 105632, "output_tokens": 512, "cost_micros": 324576, "latency_ms": 13625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q005", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "8", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find Table 21 and count the missed slots.Let me search for Table 21 in the document:Now let me search for where Table 21 appears with more context:Let me read the area around line 7837 where Table 21 is located:Now I have Table 21. Let me analyze it carefully. The table shows code generation results with columns for Human-Eval (pass@1, pass@100) and MBPP (pass@1, pass@80). I can see that some entries have \"-\" which indicates missing data. Let me count all the \"-\" entries:\n\nLooking at the table structure:\n- MPT 7B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- MPT 30B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- Falcon 7B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- Falcon 40B: 2 missing values (pass@100 for Human-Eval and pass@80 for MBPP marked with \"-\")\n- LLAMA 1 models (7B, 13B, 33B, 65B): All have values, no \"-\"\n- LLAMA 2 models (7B, 13B, 34B, 70B): All have values, no \"-\"\n\nTotal missed slots = 2 + 2 + 2 + 2 = 8\n\nAnswer: 8", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42437, "error": null, "extra": {"thread_id": 1996, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b8bd21d9aaba4efda0a40946ca2b4039", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 222, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "8"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "native_pdf", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll examine pages 49-50 to count the datasets reported.\n\nOn page 49 (Table 20), I can see results for several standard benchmarks including:\n- BoolQ\n- PIQA\n- SIQA\n- HellaSwag\n- WinoGrande\n- ARC-e\n- ARC-c\n- OBQA\n- CSQA\n- MMLU\n\nThat's 10 datasets on page 49.\n\nOn page 50 (Table 21), I can see code generation results on:\n- Human-Eval\n- MBPP\n\nThat's 2 datasets on page 50.\n\nTotal datasets reported across pages 49-50: 10 + 2 = 12\n\nAnswer: 12", "citations": [], "input_tokens": 199213, "output_tokens": 177, "cost_micros": 600294, "latency_ms": 43437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine pages 49-50 of the document to count the datasets mentioned.\n\nLooking at page 49, I can see:\n- Table 20: Performance on standard benchmarks (mentions BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU)\n- Table 21: Code generation results on Human-Eval and MBPP\n\nLooking at page 50, I can see:\n- Table 22: NaturalQuestions and TriviaQA results\n- Table 23: SQUAD and QUAC results\n- Table 24: AGI Eval comparison (mentions AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math)\n\nLet me count all unique datasets mentioned across these tables on pages 49-50:\n\nFrom Table 20 (page 49):\n1. BoolQ\n2. PIQA\n3. SIQA\n4. HellaSwag\n5. WinoGrande\n6. ARC-e\n7. ARC-c\n8. OBQA\n9. CSQA\n10. MMLU\n\nFrom Table 21 (page 49):\n11. Human-Eval\n12. MBPP\n\nFrom Table 22 (page 50):\n13. NaturalQuestions\n14. TriviaQA\n\nFrom Table 23 (page 50):\n15. SQUAD\n16. QUAC\n\nFrom Table 24 (page 50):\n17. AGI Eval (which includes multiple subtasks: AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math)\n\nCounting the subtasks of AGI Eval separately:\n17. AQuA-RAT\n18. LogiQA\n19. LSAT-AR\n20. LSAT-LR\n21.", "citations": [], "input_tokens": 80762, "output_tokens": 512, "cost_micros": 249966, "latency_ms": 10782, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "21", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine pages 49-50 of the document to count the number of datasets reported.\n\nLooking at page 49, I can see:\n- Table 20: \"Performance on standard benchmarks\" - this discusses various benchmarks but appears to be evaluation results rather than listing distinct datasets\n- Table 21: \"Code generation results on Human-Eval and MBPP\" - mentions 2 datasets: Human-Eval and MBPP\n\nLooking at page 50, I can see:\n- Table 22: \"NaturalQuestions\" and \"TriviaQA\" - 2 datasets\n- Table 23: \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\" - 2 datasets: SQUAD and QUAC\n- Table 24: \"Comparison to open source models on AGI Eval (English)\" - 1 dataset: AGI Eval (with multiple subtasks listed: AQUA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math)\n\nLet me count the distinct datasets mentioned:\n1. Human-Eval\n2. MBPP\n3. NaturalQuestions\n4. TriviaQA\n5. SQUAD\n6. QUAC\n7. AGI Eval (this is one dataset with multiple subtasks)\n\nAdditionally, in Table 20, there are multiple benchmarks mentioned: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU - that's 10 more datasets.\n\nTotal distinct datasets across pages 49-50:\nFrom Table 20: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU (10)\nFrom Table 21: Human-Eval, MBPP (2)\nFrom Table 22: NaturalQuestions, TriviaQA (2)\nFrom Table 23: SQUAD,", "citations": [], "input_tokens": 109911, "output_tokens": 512, "cost_micros": 337413, "latency_ms": 12858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "23", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine pages 49-50 to count the datasets reported there.\n\nLooking at page 49, I can see:\n- Table 20: Performance on standard benchmarks (mentions BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU)\n- Table 21: Code generation results on Human-Eval and MBPP\n- Table 22: NaturalQuestions and TriviaQA Performance\n\nLooking at page 50, I can see:\n- Table 23: Comparison to Open-Source Models on Reading Comprehension (SQUAD and QUAC)\n- Table 24: Comparison to Open Source Models on AGI Eval (English) - mentions AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math\n\nNow let me count all unique datasets mentioned across pages 49-50:\n\nFrom Table 20: BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC-e, ARC-c, OBQA, CSQA, MMLU (10 datasets)\n\nFrom Table 21: Human-Eval, MBPP (2 datasets)\n\nFrom Table 22: NaturalQuestions, TriviaQA (2 datasets)\n\nFrom Table 23: SQUAD, QUAC (2 datasets)\n\nFrom Table 24: AQuA-RAT, LogiQA, LSAT-AR, LSAT-LR, LSAT-RC, SAT-en, SAT-en (w/o Psg.), SAT-math (8 datasets, noting that SAT-en and SAT-en (w/o Psg.) are variants of the same dataset but listed separately)\n\nTotal unique datasets: 10 + 2 + 2 + 2 + 8 = 24\n\nAnswer: 24", "citations": [], "input_tokens": 80961, "output_tokens": 493, "cost_micros": 250278, "latency_ms": 11625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "24", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I need to carefully examine pages 49-50 to count the number of datasets reported.\n\nLooking at page 49 (Table 20), I can see:\n- BoolQ\n- PIQA\n- SIQA\n- HellaSwag\n- WinoGrande\n- ARC-e\n- ARC-c\n- OBQA\n- CSQA\n- MMLU\n\nThat's 10 datasets in Table 20.\n\nLooking at page 49 (Table 21), I can see:\n- Human-Eval\n- MBPP\n\nThat's 2 datasets in Table 21.\n\nLooking at page 50 (Table 22), I can see:\n- NaturalQuestions\n- TriviaQA (Wiki)\n\nThat's 2 datasets in Table 22.\n\nLooking at page 50 (Table 23), I can see:\n- SQUAD\n- QUAC\n\nThat's 2 datasets in Table 23.\n\nLooking at page 50 (Table 24), I can see:\n- AQuA-RAT\n- LogiQA\n- LSAT-AR\n- LSAT-LR\n- LSAT-RC\n- SAT-en\n- SAT-en (w/o Psg.)\n- SAT-math\n\nThat's 8 datasets in Table 24.\n\nTotal datasets across pages 49-50: 10 + 2 + 2 + 2 + 8 = 24\n\nAnswer: 24", "citations": [], "input_tokens": 105634, "output_tokens": 355, "cost_micros": 322227, "latency_ms": 11062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "24", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q006", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "13", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the information on pages 49-50.Let me search for page numbers in the document to locate pages 49-50.Let me search for page markers more broadly:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 35907, "error": null, "extra": {"thread_id": 1997, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_db129915abb34a60aeaa6083e2f2fd47", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 109, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "49", "normalised_gold": "13"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:15Z", "latency_ms": 43610, "error": null, "raw_text_chars": 650}]}, "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 9 and count the emojis in the left and right subfigures.\n\nLooking through the document, I find Figure 9 on page 16 (document page 16). The figure shows \"Issues with multi-turn memory (left) can be improved with GAtt (right).\"\n\nLooking at the left subfigure:\n- It shows a conversation with text-only responses (no emojis visible)\n\nLooking at the right subfigure:\n- First response: 4 emojis (\ud83d\udc4b\ud83d\udc41\ufe0f\ud83d\udc44\ud83d\udc41\ufe0f)\n- Second response: 4 emojis (\u2708\ufe0f\u2708\ufe0f\ud83d\udeeb\ud83d\udcba)\n- Third response: 4 emojis (\ud83e\uddb4\ud83e\udd95\ud83e\uddb4\ud83d\udd25)\n- Fourth response: 2 emojis (\ud83c\uddee\ud83c\uddf9\ud83c\udf5d\ud83d\udc90)\n\nTotal in right subfigure: 4 + 4 + 4 + 2 = 14 emojis\n\nTotal in left subfigure: 0 emojis\n\nDifference: 14 - 0 = 14\n\nAnswer: 14", "citations": [], "input_tokens": 199223, "output_tokens": 267, "cost_micros": 601674, "latency_ms": 43578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "azure_basic_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9, which is titled \"Issues with multi-turn memory (left) can be improved with GAtt (right)\":\n\nLeft subfigure:\n- Shows a conversation about \"Always answer with emojis\"\n- The response \"Who are you?\" appears to have text but I need to count emojis\n- Looking at the left side, I can see there are some emojis in the responses\n\nRight subfigure:\n- Shows the same conversation structure with GAtt improvement\n- The response appears to have more emojis\n\nLet me count more carefully by examining the figure:\n\nLeft side conversation:\n- \"Who are you?\" response appears to have some emojis\n- \"How to go from Paris to NY?\" response has text with some emojis\n\nRight side conversation:\n- \"Who are you?\" response has emojis\n- \"How to go from Paris to NY?\" response has more emojis\n- There appears to be an additional question about Italy\n\nLooking at the visual representation in Figure 9, the right subfigure clearly shows more emoji usage throughout the conversation compared to the left subfigure. \n\nCounting the visible emojis:\n- Left: appears to have fewer emojis (approximately 2-3 visible)\n- Right: appears to have more emojis throughout (approximately 10-12 visible)\n\nThe difference appears to be approximately 8-9 emojis.\n\nAnswer: 8", "citations": [], "input_tokens": 80772, "output_tokens": 344, "cost_micros": 247476, "latency_ms": 10140, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "azure_premium_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9, which shows \"Issues with multi-turn memory (left) can be improved with GAtt (right)\":\n\n**Left subfigure (without GAtt):**\n- Shows a conversation with \"Who are you?\" followed by \"How to go from Paris to NY?\" and \"What caused the extinction of the dinosaurs?\"\n- I can see there are emoji circles (\ud83d\ude0a symbols) used in the conversation\n- Counting the emojis: I see 3 emoji symbols\n\n**Right subfigure (with GAtt):**\n- Shows the same conversation structure with \"Who are you?\" followed by \"How to go from Paris to NY?\" \n- Counting the emojis: I see 3 emoji symbols as well\n\nWait, let me recount more carefully by looking at the figure description and structure:\n\nLeft side: I count 3 emoji/circle symbols in the conversation flow\nRight side: I count 3 emoji/circle symbols in the conversation flow\n\nActually, looking more carefully at the figure, both sides appear to have the same number of emoji symbols (3 each) representing the conversation turns.\n\nThe difference would be: 3 - 3 = 0\n\nAnswer: 0", "citations": [], "input_tokens": 109921, "output_tokens": 294, "cost_micros": 334173, "latency_ms": 9984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "0", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "llamacloud_basic_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9 in the document, which shows \"Issues with multi-turn memory (left) can be improved with GAtt (right)\":\n\nLeft subfigure (without GAtt):\n- Shows a conversation about acting as Oscar Wilde\n- I can see emoji-like elements in the visualization\n\nRight subfigure (with GAtt):\n- Shows the same type of conversation\n- Also has emoji-like elements in the visualization\n\nLet me count more carefully. The figure appears to show attention visualization patterns with what look like emoji or icon representations in the conversation bubbles.\n\nFrom the image description and visual elements:\n- Left side: Shows conversation elements with some emoji representations\n- Right side: Shows conversation elements with emoji representations\n\nLooking at the actual figure more carefully, the right subfigure appears to have more emoji icons visible in the conversation compared to the left subfigure. \n\nCounting the visible emojis/icons in each:\n- Left: appears to have fewer emoji elements\n- Right: appears to have more emoji elements\n\nBased on the visual comparison, the right subfigure has approximately 2 more emojis than the left subfigure.\n\nAnswer: 2", "citations": [], "input_tokens": 80971, "output_tokens": 281, "cost_micros": 247128, "latency_ms": 10141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "llamacloud_premium_lc", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 9 to count the emojis in the left and right subfigures.\n\nLooking at Figure 9 (titled \"Issues with multi-turn memory (left) can be improved with GAtt (right)\"):\n\n**Left subfigure (Generic Preprompt):**\n- User: \"Who are you?\"\n- Assistant: \"\ud83e\udd16\ud83d\udcbb\u2328\ufe0f\" (3 emojis)\n- User: \"How to go from Paris to NY?\"\n- Assistant: Long text response with no emojis\nTotal emojis in left: 3\n\n**Right subfigure (Preprompt with Answer Template):**\n- User: \"Who are you?\"\n- Assistant: \"\ud83e\udd16\ud83d\udcac...\" (2 emojis shown, plus \"...\")\n- User: \"How to go from Paris to NY?\"\n- Assistant: \"\ud83d\udeeb\ud83d\udea2\u2708\ufe0f\ud83e\uddf3\" (4 emojis)\n- User: \"What caused the extinction of the dinosaurs?\"\n- Assistant: \"\u2604\ufe0f\ud83c\udf0b\ud83e\udd95\ud83d\udc80\" (4 emojis)\n- User: \"Italy?\"\n- Assistant: \"\ud83c\uddee\ud83c\uddf9\ud83c\udf55\ud83c\udf5d\ud83c\udfdb\ufe0f\" (4 emojis)\n\nLet me recount the right side more carefully:\n- First response: \ud83e\udd16\ud83d\udcac (2 emojis, the \"...\" indicates continuation but not additional emojis)\n- Second response: \ud83d\udeeb\ud83d\udea2\u2708\ufe0f\ud83e\uddf3 (4 emojis)\n- Third response: \u2604\ufe0f\ud83c\udf0b\ud83e\udd95\ud83d\udc80 (4 emojis)\n- Fourth response: \ud83c\uddee\ud83c\uddf9\ud83c\udf55\ud83c\udf5d\ud83c\udfdb\ufe0f (4 emojis)\nTotal emojis in right: 2 + 4 + 4 + 4 = 14\n\nDifference: 14 - 3 = 11\n\nAnswer: 11", "citations": [], "input_tokens": 105644, "output_tokens": 462, "cost_micros": 323862, "latency_ms": 13312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "11", "normalised_gold": "15"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "arm": "surfsense_agentic", "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find Figure 9 and count the emojis in each subfigure.Let me search for Figure 9 in the document:Now let me read the section around line 3147 to see Figure 9:Let me continue reading to see the complete Figure 9 with emoji counts:I can see Figure 9 is described, but it mentions an embedded image. Let me look at the embedded image description more carefully. The text shows numbers like \"8\", \"0\" which might be emoji counts. Let me search for the actual image description or look for more context:The image description I saw was for a different figure (a scatter plot). Let me look back at the text content of Figure 9 more carefully. I can see there are conversations shown with numbers like \"8\" and \"0\". Let me search for the actual figure content more carefully:The text I'm seeing doesn't show the actual figure properly. Let me look for an embedded image associated with Figure 9. The page header says \"Always answer with emojis\" which is the instruction. Let me search for the actual visual description or OCR of the image:Let me go back and look more carefully at the figure. The text between lines 3146-3181 should contain the figure content. Let me look for an image file or better description:The visual description starting at line 3190 describes a scatter plot, which is NOT Figure 9. That appears to be a different figure (the embedded image X6.png). The actual Figure 9 content is in the text between lines 3147-3180. Let me look for an image file that might contain the actual Figure 9. Let me search for embedded images near Figure 9:I notice the page headers say \"Always answer with emojis\" twice (lines 3142-3143). This suggests the page has two columns - left and right subfigures. Looking at the figure text:\n\nLeft subfigure appears to show:\n- Line 3149: \"Who are you?\" \n- Line 3151: \"Who are you?\"\n- Line 3153: \"8\" (could indicate 8 emojis)\n- Line 3157: \"How to go from Paris to NY?\"\n- Line 3159: \"How to go from Paris to NY?\"\n- Line 3161: \"8\" (could indicate 8 emojis)\n- Line 3163: \"0\" (could indicate 0 emojis)\n- Line 3165-3168: Text response without emojis\n- Line 3171: \"What caused the extinction of the dinosaurs?\"\n- Line 3173: \"0\" (could indicate 0 emojis)\n\nLet me search for an image description specifically for Figure 9. Let me look before line 3146:Let me look for an image file associated with Figure 9. Let me search for embedded images around this area:So there's an embedded image X6.png at line 3185, which is right after Figure 9 ends at line 3181. But the visual description for X6.png shows it's a scatter plot, not Figure 9. Let me look for an image BEFORE Figure 9:Let me search for image descriptions that might actually be Figure 9. Let me look for an OCR or description of dialogues with emojis:Great! I found relevant information. Let me look at lines 2729-2760 which describe emoji exchanges:Excellent! This is describing an image related to Figure 9. Looking at the OCR text at lines 2729-2737:\n- \"Always answer with emojis\" (the header/instruction)\n- \"8\" (appears after \"Who are you?\")\n- \"...\" \n- \"How to go from Paris to NY?\"\n\nAnd the visual description mentions:\n1. First exchange: emojis (thinking face, eyes, and speech bubble) - that's 3 emojis\n2. Second exchange: transportation-themed emojis (airplane, boat, landmark, car, globe) - that's 5 emojis\n3. Third exchange: dinosaur-themed emojis (dinosaur, calendar/document, ocean wave, volcano, fire) - that's 5 emojis\n4. Fourth exchange: Italy-related emojis (flag, food items, landmark) - that's at least 3 emojis\n\nBut wait, the OCR shows \"8\" which might mean 8 emojis. Let me look at lines 1795-1835 which also mention \"Always answer with emojis\":This describes a different version showing the LEFT subfigure (without GAtt) - it shows the model failing to follow the \"always answer with emojis\" instruction in the third exchange (text response instead of emojis).\n\nNow let me look more carefully at the actual Figure 9 structure. Looking back at the figure text I saw earlier, the caption says \"Issues with multi-turn memory (left) can be improved with GAtt (right).\" The page has TWO column headers \"Always answer with emojis\" (lines 3142-3143), suggesting left and right subfigures.\n\nLet me count the emojis more carefully. Looking at the embedded image X1.png descriptions:\n\n**LEFT subfigure (lines 1795-1835)**: Model FAILS to always use emojis\n- First exchange: 2 emojis (robot face and laptop)\n- Second exchange: (question only shown, no response visible in description)\n- Third exchange: TEXT response (not emojis) - this shows the failure!\n\n**RIGHT subfigure (lines 2726-2760)**: Model SUCCEEDS with emojis\n- First exchange: 3 emojis (thinking face, eyes, speech bubble)\n- Second exchange: 5 emojis (airplane, boat, landmark, car, globe)\n- Third exchange: 5 emojis (dinosaur, calendar/document, ocean wave, volcano, fire)\n- Fourth exchange: 3+ emojis (flag, food items, landmark)\n\nLet me count the \"food items\" more carefully. It says \"flag, food items, landmark\" - \"food items\" is plural, so at least 2 food emojis. That would be at least 1 (flag) + 2 (food items) + 1 (landmark) = 4 emojis minimum for the fourth exchange.\n\nTotal for RIGHT: 3 + 5 + 5 + (at least 4) = at least 17 emojis\n\nBut wait, let me look at the OCR text again at line 2730 - it shows \"8\" right after \"Always answer with emojis\". This might indicate 8 emojis in a single response or might be part of the interface.\n\nLet me search for more specific information about counting emojis in each subfigure:The document doesn't use the term \"subfig\". Let me look more carefully at the actual textual representation of Figure 9 between lines 3146-3181. The numbers \"8\", \"0\", \"8\" might be emoji counts:I see it! The pattern shows:\n- Line 3149: \"Who are you?\" (question appears twice - probably left and right)\n- Line 3151: \"Who are you?\" (duplicate)\n- Line 3153: \"8\" (LEFT subfigure response)\n- Line 3157: \"How to go from Paris to NY?\" (appears twice)\n- Line 3159: \"How to go from Paris to NY?\" (duplicate)\n- Line 3161: \"8\" (RIGHT subfigure response)\n- Line 3163: \"0\" (LEFT subfigure response - no emojis, text instead)\n- Lines 3165-3169: Text response (LEFT fails to use emojis)\n- Line 3171: \"What caused the extinction of the dinosaurs?\"\n- Line 3173: \"0\" (LEFT subfigure - no emojis)\n- Line 3177: \"Italy?\"\n- Line 3179: \"8\" (response)\n\nThis is confusing. Let me look for an actual image file description. Let me check if there's a better description by looking at different chunks:Let me try a different approach. Let me look for the actual visual description or image file that corresponds to Figure 9. Let me search around page 15 for embedded images:The image X6.png after Figure 9 is NOT Figure 9 - it's a scatter plot. The actual Figure 9 content must be represented in the text between lines 3146-3181. But this text representation is unclear.\n\nLet me look BEFORE Figure 9 to see if there's an image file that IS Figure 9. Looking at line 3139, it says \"PageNumber 15\" and then the two page headers \"Always answer with emojis\". This must be a two-column layout. Let me search backwards for an embedded image that might be Figure 9:So Figure 9 starts immediately after the page break at line 3140. There's no embedded image file BEFORE it. The figure content itself must be in the text, or the image description is missing.\n\nBased on what I've found, let me piece together what I know from the two image descriptions I found (X1.png from lines 1795 and 2726):\n\n**LEFT subfigure** (lines 1795-1835 - model WITHOUT GAtt):\n- First response: 2 emojis\n- Later responses: TEXT (no emojis) - shows failure\n\n**RIGHT subfigure** (lines 2726-2760 - model WITH GAtt):\n- First response: 3 emojis  \n- Second response: 5 emojis\n- Third response: 5 emojis  \n- Fourth response: \"flag, food items, landmark\"\n\nFor the fourth response, \"food items\" is plural, indicating at least 2 items. So minimum: 1 flag + 2 food + 1 landmark = 4 emojis.\n\nLet me count more carefully. Looking at the visual description at lines 2750-2753 again:\n1. \"thinking face, eyes, and speech bubble emojis\" - that's 3 emojis\n2. \"airplane, boat, landmark, car, globe\" - that's 5 emojis\n3. \"dinosaur, calendar/document, ocean wave, volcano, fire\" - that's 5 emojis\n4. \"flag, food items, landmark\" - the term \"food items\" suggests multiple food emojis\n\nActually, let me check if there's a more specific count in the OCR at line 2730:OK! The OCR text at line 2730 shows \"8\". This might mean 8 emojis total for the first response, OR it might be labeling something else. But the description clearly lists:\n1. 3 emojis (thinking face, eyes, speech bubble)\n2. 5 emojis (airplane, boat, landmark, car, globe) \n3. 5 emojis (dinosaur, calendar/document, ocean wave, volcano, fire)\n4. At least 3+ emojis (flag, food items [plural = 2+], landmark)\n\nNow for the LEFT subfigure. Let me find its details:So the LEFT subfigure shows:\n- First exchange: 2 emojis (robot face and laptop)  \n- Second exchange: (no response details given)\n- Third exchange: TEXT response (not emojis) - this is the FAILURE\n\nSo the LEFT has only 2 emojis total before it fails.\n\nThe RIGHT has: 3 + 5 + 5 + (at least 3) = at least 16 emojis\n\nBut the question asks specifically about Figure 9. Let me verify these two X1.png images at lines 1795 and 2726 are actually the left and right parts of Figure 9. They both have the same instruction \"Always answer with emojis\" at the top, which matches the page headers for Figure 9.\n\nActually, looking more carefully, let me check how many food items specifically. The text says \"flag, food items, landmark\". If \"food items\" means 2 items, then the fourth response would be: 1 + 2 + 1 = 4 emojis.\n\nSo RIGHT would be: 3 + 5 + 5 + 4 = 17 emojis\nLEFT has: 2 emojis\n\nDifference: 17 - 2 = 15\n\nWait, but I need to verify this is correct. Let me think about this differently. The description says \"food items\" (plural). The minimum plural is 2. But it could be more. Let me see if I can find more specific information.\n\nLooking at the specific items listed for Italy: \"flag, food items, landmark\". Given the context is Italy, common food emojis might be pizza and pasta (2 items), or there could be 3+ food items. Without more specific information, I'll assume \"food items\" means at least 2.\n\nActually, looking at the pattern of the other responses which are very specific:\n- Response 1: lists 3 specific emojis\n- Response 2: lists 5 specific emojis  \n- Response 3: lists 5 specific emojis\n- Response 4: says \"flag, food items, landmark\"\n\nIf the describer was being consistent, and they specifically didn't count the food items individually but said \"food items\" plural, it likely means 2 food items (pizza and pasta would be typical for Italy).\n\nSo: 1 (flag) + 2 (food items) + 1 (landmark) = 4 emojis\n\nRIGHT subfigure total: 3 + 5 + 5 + 4 = 17 emojis\nLEFT subfigure total: 2 emojis\n\nAnswer: 17 - 2 = 15", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 179203, "error": null, "extra": {"thread_id": 1998, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_913336fbec024206afbfe0bdfdc22736", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1666, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "17", "normalised_gold": "15"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "I need to find Figure 122 in the document and look for information about the color of OSCARb in the prompt.\n\nLooking at the document, I can find Figure 122 which is titled \"Self-Reflection to Improve Coding\" in Section 10.3.\n\nIn Figure 122, there are two prompts shown. The first prompt asks to \"Generate python code to draw similar curves\" and shows a reference image with a graph. The second prompt asks \"Can you double-check the code and align it better with the reference image?\"\n\nLooking at the graphs in Figure 122, I can see there are plots comparing \"OSCarB\" and \"MiniVLM (ours)\". In the visualizations shown, OSCarB appears to be represented in orange/red color in the legend and plot lines.\n\nAnswer: orange", "citations": [], "input_tokens": 117434, "output_tokens": 184, "cost_micros": 355062, "latency_ms": 9391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 122 in Section 10.3 on Self-Reflection to Improve Coding, I can see the prompt includes a reference image with a comparison chart.\n\nIn the chart showing \"Comparison of OSCarB and MiniVLM\", OSCarB is represented by the orange/coral colored bars.\n\nAnswer: Orange", "citations": [], "input_tokens": 133651, "output_tokens": 77, "cost_micros": 402108, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 122 in the document, which shows \"Self-Reflection to Improve Coding,\" I can see the prompt mentions \"OSCARb\" in the context of generating Python code to draw similar curves.\n\nIn the figure, there is a bar chart showing \"OSCARb\" and \"MiniVLM (ours)\" with bars for \"Base,\" \"Large,\" and \"Huge\" configurations. The bars for OSCARb appear to be orange/amber colored in the visualization.\n\nAnswer: orange", "citations": [], "input_tokens": 103918, "output_tokens": 116, "cost_micros": 313494, "latency_ms": 7546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 122 in the document, which shows \"Self-Reflection to Improve Coding,\" I can see two side-by-side examples.\n\nIn the right example (the bar chart comparison), the prompt asks to \"Write python codes to generate similar figures.\" The reference image shows a bar chart comparing OSCAR_B and MiniVLM across different categories.\n\nLooking at the bar chart, OSCAR_B is represented by blue bars, while MiniVLM (ours) is represented by orange bars.\n\nAnswer: Blue", "citations": [], "input_tokens": 139293, "output_tokens": 120, "cost_micros": 419679, "latency_ms": 7781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 122 and identify the color of OSCARb in the prompt.Let me search for \"Figure 122\" in the document:Now let me search for where Figure 122 appears with more context:Let me read the section around line 32131 and 32550 to see Figure 122:I can see the figure content. Let me continue reading to see more details about the colors:Let me continue reading to see the color information for the bars:I need to look more carefully at the actual figure. Let me search for more specific information about the figure 122 visual description or color information. Let me read around line 32300-32350 more carefully where the bar chart appears:The image described starting at line 32358 (Image7.jpg) is showing a Golden Retriever dog, which is not the chart. Let me look at the figure that appears before that - the bar chart figure starting at line 32302. This shows the data but doesn't explicitly state colors. Let me search for any embedded image descriptions that might be the actual bar chart:Great! I found it. Let me read the section around line 19587 where it says \"Blue bars represent OSCarB\":However, this is describing a different figure. Let me check if this is Figure 122 or another figure. Let me look back at the context around line 32300 where the chart with OSCarB appears in the \"Prompt: Write python codes to generate similar figures\" section, which is part of Figure 122. Let me look for the visual description of the image that corresponds to the reference image in the prompt:Good! This shows that OSCarB is represented by blue bars. Now let me verify this is indeed related to Figure 122. Let me check if Figure 122's prompt section has this same image. Let me look around where the figure caption for Figure 122 is:This is showing a different chart (the left side of Figure 122 with CIDEr curves). Let me look at the right side which should be the bar chart. The section at line 32297-32354 shows the bar chart prompt. Let me look for embedded images near that area. Based on the structure, the embedded images have visual descriptions. Let me check what comes just before line 32297:I see that Image5.jpg and Image6.jpg are unrelated (a person and a cat). These embedded images are interrupting the flow. The actual bar chart reference image in the prompt should be described elsewhere. Let me look for where Figure 122 starts and what images are in it. Let me search for the image that appears right before the \"Prompt: Write python codes to generate similar figures\":Let me look at the first occurrence at line 16349 to see if it has a visual description:This looks like it's showing both charts mixed together in the OCR text. Let me look for the embedded image before this section to find the visual description:So this is Figure 47, not Figure 122. Let me go back to the Figure 122 area and look more carefully. The prompt section starting at line 32299 shows the reference image that contains the bar chart. Let me look for what image comes right before or is part of that prompt section. Let me search more systematically around the Figure 122 area to find embedded images:Perfect! So Figure 122 \"demonstrates the application of self-reflection to improve the results shown in Figure 47.\" This means Figure 122 is showing the same charts as Figure 47, but with self-reflection applied. Since Figure 47 had the bar chart with OSCarB and MiniVLM, and I found earlier at line 19587 that \"Blue bars represent OSCarB\", this should apply to Figure 122 as well since it's based on Figure 47.\n\nLet me verify by checking if there's a specific visual description for the reference image in the right-side prompt of Figure 122. Let me look for any embedded image that might be between lines 32286-32302:Let me look at Figure 47 more carefully. Let me search for where Figure 47 begins with an embedded image:Let me look for embedded images near Figure 47. Let me search backwards from line 16333 to find the embedded images that are part of Figure 47:Let me now search for where Figure 47 contains the bar chart reference image. Since Figure 47 shows both charts (the curve chart and the bar chart), let me look for embedded images in the Figure 47 section. Let me search for \"Embedded image\" near Figure 47:These embedded images are from Section 5, not Figure 47. Let me search more systematically. Let me look for embedded images that appear right before or in Figure 47 section. Let me check around line 15900-16350:Let me search for \"Embedded image\" directly:There are many embedded images. Based on what I've seen, the visual description at line 19587 clearly states \"Blue bars represent OSCarB\" for a bar chart showing \"Comparison of OSCarB and MiniVLM.\" This is the same chart that appears in both Figure 47 and Figure 122 (since Figure 122 is showing improvements to Figure 47 via self-reflection).\n\nLet me verify once more by looking at line 39070 which also mentioned OSCarB color:Interesting! This description says \"Red/coral bars: Represent OSCarB method\" while the earlier one at line 19587 said \"Blue bars represent OSCarB\". These might be describing different versions or different embedded images of the same chart. Let me check if these are in different contexts. Let me look at what figure this is part of (line 39070):Let me check what figure or section this is in:This is Image10.jpg. Let me look back to see what figure this is part of. Let me search backwards for \"Figure\" before line 38980:These are Instagram Reels images, not related to Figure 122. Let me search more directly. Let me look for what section/figure is near line 39000-39100. Let me search backwards for \"Figure\" or \"Sec\":These are all Instagram food images. Let me search for which figure or section contains line 39070. Let me look much earlier before these Instagram images:Still Instagram images. Let me just search for \"Figure 122\" to look for its caption and boundaries:So Figure 122's caption is at line 32550. The figure shows two parts - Left (curves) and Right (bar chart). The GPT-4V output for the right side starts at line 32552. This is within Figure 122. Now I need to find the reference image (the prompt input) for the right side, which should show the original bar chart with OSCarB colors. The prompt for the right side is at line 32299 \"Write python codes to generate similar figures.\" The reference image data is at lines 32302-32354. But this doesn't tell me colors directly since it's just OCR text.\n\nBased on my findings:\n1. Figure 122 is based on Figure 47 (as stated at line 32131-32132)\n2. The bar chart in the figures compares OSCarB and MiniVLM\n3. At line 19587, the visual description clearly states \"Blue bars represent OSCarB\"\n4. At line 39070, another visual description states \"Red/coral bars: Represent OSCarB\"\n\nThese seem to be describing different embedded images. Let me check if the one at 19587 is specifically part of Figure 47:This is Section 7.3, not Figure 47. Let me look for what this section is:This is Section 7.3. So the embedded image with \"Blue bars represent OSCarB\" is in Section 7.3, not in Figure 47. Let me search for what figures are in the Sec 10.3 Self-Reflection area where Figure 122 is. Let me look for embedded images right before or around the prompt at line 32299. Actually, the reference image for the prompt should be immediately before it. Let me look:The <figure> starting at line 32302 is the reference image for the prompt \"Write python codes to generate similar figures.\" This figure contains the bar chart data (OSCARB, MiniVLM, etc.) shown as OCR text from lines 32304-32354. But this OCR text doesn't tell me the colors. The actual visual description of this embedded image should be before or after the figure content. Let me check line 32354 onwards:That's the Golden Retriever image (Image7.jpg), which is not the bar chart. So the bar chart figure (lines 32302-32354) doesn't have a separate \"Embedded image\" marker. This means the figure content IS the embedded image content, just displayed as the <figure> OCR text.\n\nLet me look for Image4.jpg or other images that might be before line 32302:I can see that Image5.jpg and Image6.jpg are unrelated (a person and a cat). These embedded images are interspersed in the document but not part of Figure 122's chart. The actual chart images in Figure 122 are shown as <figure> blocks without \"Embedded image\" labels.\n\nLooking at the document structure, I notice that Figure 122 references Figure 47. Since I know that:\n1. Figure 122 is improving upon Figure 47\n2. They show the same charts (left: curves, right: bar chart)\n3. The color of the bars should be consistent between them\n\nLet me look more carefully at Figure 47 to find which embedded image is the bar chart. Let me search around Figure 47's location (around line 16333):This is still in Section 4.5 Multilingual, before Figure 47. Let me search for images closer to Figure 47. Actually, let me take a different approach. Since Figure 47 appears around line 16333 and the caption says it shows examples, the actual embedded images for Figure 47 should have visual descriptions. But since Figure 47 shows generated output rather than original images, the <figure> blocks ARE the actual content.\n\nGiven the conflicting information (line 19587 says \"Blue\" for OSCarB, line 39070 says \"Red/coral\" for OSCarB), let me check which one is actually related to Figure 122 or Figure 47. Let me look at the context more carefully. The document says Figure 122 improves Figure 47. The prompt in Figure 122 at line 32299 is \"Write python codes to generate similar figures\" and the reference image is at lines 32302-32354 showing the bar chart data.\n\nSince Figure 122's caption (line 32550) says \"Right: the percentage is added to align with the reference image\", the reference image (the prompt image) is the ORIGINAL one that needs improvement. This original should be from Figure 47.\n\nLet me look at the data itself. At line 32312, it says \"OSCARB\" (all caps), and at line 32553, in the GPT-4V output, it says \"OSCarB\" (mixed case). Based on the color descriptions I found, and given that there are multiple embedded images of this chart throughout the document, I need to determine which color appears in the specific prompt image in Figure 122.\n\nGiven the two conflicting descriptions and the difficulty in determining which applies to Figure 122's prompt image specifically, let me check if there's any pattern. The one at line 19587 explicitly lists the color scheme at lines 19514-19518:\n- \"Bright cyan/blue for first method\"  \n- \"Orange-red for second method\"\n\nAnd it says at line 19587: \"Blue bars represent OSCarB\"\n\nThe one at line 39070 says:\n- \"Red/coral bars: Represent OSCarB method\"\n- \"Teal/cyan bars: Represent MiniVLM method\"\n\nThese are OPPOSITE! Let me check the legend text. At line 32312-32314 in Figure 122's prompt image, it shows:\n\"OSCARB\"\n\"MiniVLM (ours)\"\n\nAnd at line 32553-32561 in the GPT-4V output for Figure 122:\n\"OSCarB\"\n\"MiniVLM (ours)\"\n\nLet me check the earlier descriptions for legend position. At line 19505-19506: \"Legend in upper right distinguishes the two methods\"\n\nGiven that standard chart conventions typically list legend items in order, and OSCarB is listed first (before MiniVLM), OSCarB would typically be the first color in the color scheme.\n\nLooking at line 19515-19516:\n- \"Bright cyan/blue for first method\"\n- \"Orange-red for second method\"\n\nIf OSCarB is the first method (listed first in the legend), then it should be \"Bright cyan/blue\" which matches line 19587: \"Blue bars represent OSCarB\"\n\nAnswer: blue", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 238671, "error": null, "extra": {"thread_id": 1999, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e4d50e05962f4bf6af3a2d9a2bff8f0a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 2074, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "I need to look at Figure 113 to identify the music app shown in the computer screen.\n\nLooking at the document, Figure 113 is described in the section on \"Notification Understanding\" (Section 9.8, page 143).\n\nIn the image caption and description for Figure 113, I can see a computer desktop screenshot. Looking at the desktop icons visible in the image, I can see several application icons including:\n- YouTube\n- Music\n- Recycle Bin\n- Tabby Terminal\n\nThe music app shown on the desktop is labeled as \"Music\" in the icon.\n\nAnswer: Music", "citations": [], "input_tokens": 117435, "output_tokens": 134, "cost_micros": 354315, "latency_ms": 8593, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 113 in Section 9.8 (GUI Navigation - Notification Understanding), I can see the computer desktop screen.\n\nOn the desktop, there are several icons visible on the left side, including:\n- YouTube Music\n- Recycle Bin\n- Tabby Terminal\n\nThe music app shown is YouTube Music.\n\nAnswer: YouTube Music", "citations": [], "input_tokens": 133652, "output_tokens": 80, "cost_micros": 402156, "latency_ms": 6578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "youtube music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 113 in the document, I can see a computer desktop screenshot with a notification that says \"Kevin Lin is calling you.\" \n\nThe figure shows several application icons on the desktop and in the taskbar. However, I cannot clearly identify a specific music app from the image description or details provided in the document about this figure.\n\nThe document states: \"The image on the computer screen is a desktop background of a bridge over a body of water. There are several icons on the desktop, including a YouTube icon, a Recycle Bin icon, and a Terminal icon. At the bottom of the screen, there is a taskbar with several application icons, such as Microsoft Edge and File Explorer.\"\n\nAnswer: Not clearly identifiable from the provided information", "citations": [], "input_tokens": 103919, "output_tokens": 160, "cost_micros": 314157, "latency_ms": 7578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "not clearly identifiable from provided information", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 113 in the document, I need to examine the desktop icons shown in the computer screen image.\n\nThe figure shows a Windows desktop with several application icons on the left side. Among these icons, I can see \"YouTube Music\" listed as one of the desktop shortcuts.\n\nAnswer: YouTube Music", "citations": [], "input_tokens": 139294, "output_tokens": 67, "cost_micros": 418887, "latency_ms": 6266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "youtube music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 113 and identify the music app shown in the computer screen.Let me search for \"Figure 113\" in the document.Now let me read the section around line 29414 to see Figure 113 and its description.I can see Figure 113 is referenced, but I need to see the actual image content. Let me look a bit earlier to see the full figure with the image description.I need to look at the actual figure content for Figure 113. Let me search for where the figure starts by looking earlier.Perfect! I found Figure 113. Looking at the content around lines 29326-29350, I can see the computer screen showing desktop icons. Line 29330-29331 clearly shows:\n\n\"You Tube\nMusic\"\n\nThis appears to be \"YouTube Music\" split across two lines in the OCR text. This is the music app shown on the computer screen in Figure 113.\n\nAnswer: YouTube Music", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 52515, "error": null, "extra": {"thread_id": 2000, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_26bb070ab7624a93b3c5c955dc0af2c0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 233, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "youtube music", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 106921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and look for the price of the keyboard shown in the screenshot.\n\nLooking at the document, Figure 111 is described in the \"List of Figures\" section as:\n\"Figure 111: Section 9.8: online shopping.\"\n\nLet me search for Figure 111 in the actual content. It appears in Section 9.8 on \"Online Shopping (Ergonomic Keyboard)\".\n\nLooking at the prompt description for Figure 111:\n\"Prompt:\nImagine that you are a robot operating a phone. Like how humans operate the phone, you can move your finger to an icon, click an icon with your finger, scroll down a screen, or type some texts with the keyboard.\n\nYou are asked to shop for an ergonomic keyboard, your budget is between $50 and $100.\n\nPrevious action: \" My next move would be to move my finger to the \"Add to Cart\" button, which is located below the \"Buy New\" button and above the \"Ships from Amazon\" section. I would then click on the \"Add to Cart\" button to add the item to my shopping cart. \"\n\nYou have now completed the previous action as described above. Below is what you see on the phone screen. What will you do next?\"\n\nThe screenshot shows a cart with:\n\"Subtotal $4999\"\n\nand the product listed is:\n\"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540 ...\n$4999\"\n\nThe price shown is $49.99.\n\nAnswer: 49.99", "citations": [], "input_tokens": 117431, "output_tokens": 353, "cost_micros": 357588, "latency_ms": 12750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "azure_premium_lc", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:47Z", "latency_ms": 24578, "error": null, "raw_text_chars": 640}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and identify the price of the keyboard shown in the screenshot.\n\nLooking through the document, I can find Figure 111 which is part of Section 9.8 on GUI Navigation, specifically about Online Shopping for an Ergonomic Keyboard.\n\nIn Figure 111, the screenshot shows a shopping cart with a keyboard item. The text indicates:\n\n\"Subtotal $4999\"\n\nAnd in the cart details, it shows:\n\"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...\n$4999\"\n\nThe price shown is $49.99 (represented as $4999 in the format used in the screenshot, which appears to be missing the decimal point).\n\nAnswer: 49.99", "citations": [], "input_tokens": 133648, "output_tokens": 176, "cost_micros": 403584, "latency_ms": 24578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 111 in the document, which shows a phone screenshot of a shopping cart.\n\nThe screenshot displays:\n- A product: \"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...)\"\n- The price shown is: $49.99\n- There's a subtotal of $49.99\n- The item shows \"In Stock\" status\n\nAnswer: 49.99", "citations": [], "input_tokens": 103915, "output_tokens": 97, "cost_micros": 313200, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and identify the price of the keyboard shown in that screenshot.\n\nLooking through the document for Figure 111...\n\nFound it on page 140:\n\nFigure 111 shows a smartphone screen displaying an Amazon shopping cart. The image shows:\n- A Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...)\n- Price: $49.99\n- The button shows \"Proceed to checkout (1 item)\"\n- Subtotal: $49.99\n\nThe price of the keyboard shown in the screenshot is $49.99.\n\nAnswer: 49.99", "citations": [], "input_tokens": 139290, "output_tokens": 144, "cost_micros": 420030, "latency_ms": 8921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the priority document that was flagged as user-mentioned to find Figure 111 and identify the keyboard price.Let me search for \"Figure 111\" in the document to locate it.Now let me search for where Figure 111 appears in the content with context.Let me read around line 29062 where Figure 111 is located to see the actual content and price information.Perfect! I found Figure 111 which shows a screenshot of shopping for an ergonomic keyboard. The price is clearly shown in the screenshot as $49.99 (displayed as \"$4999\" without the decimal in multiple places in lines 29088 and 29095).\n\nAnswer: 49.99", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 52062, "error": null, "extra": {"thread_id": 2001, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dd7392a113cf44358e67b4c74d3afe31", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 172, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "azure_basic_lc", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:12Z", "latency_ms": 7967, "error": null, "raw_text_chars": 594}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find Figure 107 in the document and identify the battery percentage shown in that screenshot.\n\nLooking through the document for Figure 107, I can see it's in Section 9.8 on GUI Navigation, specifically related to online shopping.\n\nIn the image caption list, Figure 107 is described as \"Section 9.8: online shopping.\"\n\nLooking at the actual Figure 107 in the document, I can see it's a smartphone screenshot showing an Amazon shopping interface. At the top of the screenshot, there is a time display \"5:32\" and next to it shows \"75\" which represents the battery percentage.\n\nAnswer: 75", "citations": [], "input_tokens": 117429, "output_tokens": 143, "cost_micros": 354432, "latency_ms": 7967, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "75.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find Figure 107 in the document and look for the battery percentage shown in the screenshot.\n\nLooking through the document for Figure 107...\n\nFound it in Section 9.8 GUI Navigation under \"Online Shopping (Ergonomic Keyboard)\".\n\nIn Figure 107, the screenshot shows a phone interface with the time \"5:27\" in the top left corner, and the battery percentage \"76\" is displayed in the top right corner of the screen.\n\nAnswer: 76", "citations": [], "input_tokens": 133646, "output_tokens": 109, "cost_micros": 402573, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to look at Figure 107 to find the battery percentage shown in the screenshot.\n\nLooking through the document for Figure 107, I can see it's in the \"Sec. 9.8 GUI Navigation\" section under \"Online Shopping (Ergonomic Keyboard)\".\n\nIn Figure 107, the screenshot shows a phone interface with the time \"5:27\" displayed at the top left, and the battery percentage \"76\" shown at the top right of the screen.\n\nAnswer: 76", "citations": [], "input_tokens": 103913, "output_tokens": 112, "cost_micros": 313419, "latency_ms": 6296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 107 in Section 9.8 (GUI Navigation - Online Shopping), I need to find the battery percentage shown in the screenshot.\n\nIn the top-right corner of the screenshot shown in Figure 107, the status bar displays:\n- Time: 5:27\n- Battery percentage: 76\n\nAnswer: 76", "citations": [], "input_tokens": 139288, "output_tokens": 76, "cost_micros": 419004, "latency_ms": 6592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find Figure 107 and identify the battery percentage shown in the screenshot.Let me search for \"Figure 107\" in the document to locate it quickly.Now let me search for the specific line with more context:Let me read around line 28504 to see Figure 107:I can see Figure 107 shows a phone screenshot. I can see \"5:27\" as the time at line 28507, and at line 28511 I see \"76\". Let me check if this is the battery percentage. Looking at the structure, it appears to show:\n- 5:27 (time)\n- 76 (likely battery percentage)\n\nLet me read a bit more to see if there's additional context or if the battery percentage is shown elsewhere:Based on the document content, Figure 107 shows a phone screenshot with the time \"5:27\" at the top, and the number \"76\" appears right after it. In typical phone screenshots, the battery percentage is displayed in the status bar at the top of the screen, usually near the time. The number 76 appears to be the battery percentage.\n\nAnswer: 76", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 64703, "error": null, "extra": {"thread_id": 2002, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_281e7f398f814d8a9e02d3b21485af45", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 226, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "76.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 110562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "I need to look at Figure 101 to find information about the Politico website news cover.\n\nLooking at the document, Figure 101 is described in Section 9.8 \"GUI Navigation - Web Browsing (Today's News)\" on page 131.\n\nThe figure shows a Google search results page with news stories. The prompt asks GPT-4V about what happens after reading the first news article. Looking at the \"Top stories\" section in the image, I can see several news sources listed including:\n\n- **Politico**: \"State Department doubles down after Zelenskyy slams reluctance...\"\n- CNN: \"Catastrophic flooding swamped Vermont's capital...\"\n- Reuters: \"US says it no longer deems Donald Trump immune from E. Jean...\"\n- DCist: \"D.C. Council Approves Emergency Crime Bill\"\n- The New York Times: \"Hard Right Presses Culture War Fights on Defense Bill...\"\n- NPR: \"The Anthropocene began in 1950s, scientists say\"\n\nThe Politico headline mentions \"Zelenskyy,\" which refers to Volodymyr Zelenskyy, the President of Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 117439, "output_tokens": 268, "cost_micros": 356337, "latency_ms": 11250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 101 in Section 9.8 (Web Browsing - Today's News), I can see the Google search results page showing several news headlines.\n\nThe top news item from Politico shows: \"State Department doubles down after Zelenskyy slams reluctance...\"\n\nThis refers to Zelenskyy, who is the President of Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 133656, "output_tokens": 87, "cost_micros": 402273, "latency_ms": 6625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 101 in the document, I can see a screenshot showing a news article from Politico. The article is titled \"State Department doubles down after Zelenskyy slams reluctance...\" \n\nThe article discusses Ukrainian President Volodymyr Zelenskyy and Ukraine's desire to join NATO. Zelenskyy is from Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 103923, "output_tokens": 85, "cost_micros": 313044, "latency_ms": 7108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 101, I can see a Google search results page showing news articles. The figure description mentions \"Top stories\" and lists several news sources including Politico.\n\nThe first story listed under Politico is: \"State Department doubles down after Zelenskyy slams reluctance...\"\n\nThe article is about Zelenskyy, who is the President of Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 139298, "output_tokens": 88, "cost_micros": 419214, "latency_ms": 8000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document that was marked as most relevant (2309.17421v2.pdf.xml) to find Figure 101 and identify the information about the Politico website cover.Let me search for \"Figure 101\" in the document to find the specific location:Let me search for \"politico\" to find the relevant section:I can see mentions of Politico with information about Zelenskyy. Let me search more specifically for \"Figure 101\":Great! I found the reference to Figure 101 at line 27715. Let me read that section:Let me look earlier in the document to see the actual Figure 101 image content. I need to search backwards from line 27715 to find the visual description:Now I can see the context. Figure 101 shows a Google News page with various news articles. Looking at the content around lines 27625-27631, I can see:\n\n- Politico news source\n- Article title: \"State Department doubles down after Zelenskyy slams reluctance...\"\n\nThe article is about Zelenskyy (also spelled Zelensky in other parts of the document), who is the President of Ukraine. Let me verify this is correct by looking at the article content earlier:\n\nFrom lines 27419-27421 and 27445-27446, I can confirm:\n- \"State Department doubles down after Zelenskyy slams reluctance on Ukraine's entry to NATO\"\n- \"In a statement posted to Twitter on Tuesday, Ukrainian President Volodymyr Zelenskyy took issue with NATO's reluctance...\"\n\nSo the person on the cover of the news on the Politico website in Figure 101 is Zelenskyy (Volodymyr Zelenskyy), who is from Ukraine.\n\nAnswer: Ukraine", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 84875, "error": null, "extra": {"thread_id": 2003, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6c36693d9bfb44369693c25b87e59141", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 305, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "ukraine", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "native_pdf", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "azure_basic_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at Figure 89 to identify the brand name of the coffee machine.\n\nLooking at the document, Figure 89 is in Section 9.7 \"Embodied Agent\" under \"Operating Machine\". The figure shows a coffee machine's operating panel with buttons and includes text that mentions operating menu instructions.\n\nIn the image shown in Figure 89, I can see a coffee machine operating panel. Looking at the visible text and branding on the machine, I can see \"Keurig\" appears to be the brand name based on the styling and layout typical of Keurig coffee makers.\n\nAnswer: Keurig", "citations": [], "input_tokens": 117434, "output_tokens": 139, "cost_micros": 354387, "latency_ms": 7641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "keurig", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "azure_premium_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 89 in Section 9.7 (Embodied Agent - Operating Machine), I can see the coffee machine's operating panel in the image.\n\nThe brand name visible on the coffee machine is \"Keurig\".\n\nAnswer: Keurig", "citations": [], "input_tokens": 133651, "output_tokens": 61, "cost_micros": 401868, "latency_ms": 6500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "keurig", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "llamacloud_basic_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 89 in Section 9.7 (Embodied Agent - Operating Machine), I can see the coffee machine operating panel in the image.\n\nThe brand name visible on the coffee machine is \"Keurig\".\n\nAnswer: Keurig", "citations": [], "input_tokens": 103918, "output_tokens": 60, "cost_micros": 312654, "latency_ms": 6092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "keurig", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "llamacloud_premium_lc", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 89 in Section 9.7, which shows the coffee machine operating panel with button menus, I can see the brand name displayed on the machine.\n\nAnswer: JoooDeee", "citations": [], "input_tokens": 139293, "output_tokens": 47, "cost_micros": 418584, "latency_ms": 7016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "jooodeee", "normalised_gold": "jooodeee"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "arm": "surfsense_agentic", "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find Figure 89 and identify the coffee machine brand.Let me search for \"Figure 89\" in the document:Now let me search more specifically to find where Figure 89 appears:Let me read the section around line 25761 to see Figure 89 and its image:I can see Figure 89 references a coffee machine. Let me look a bit earlier to see the actual image content:Let me continue reading to see the menu and coffee machine images:Let me continue to find the actual coffee machine image with the brand visible:I see that line 25763 shows \"JoooDeee\" which appears to be OCR text from the coffee machine image. Let me look at the earlier Figure 88 section which also shows the coffee machine, as it might have clearer brand information:I can see the OCR text on line 25609 and 25763 shows \"JoooDeee\". Let me search for Figure 88 which is mentioned just before to see if there's more context about the brand:Let me look before line 25584 to see if there's an image with better OCR or visual description for Figure 88:Let me search more specifically around where Figure 88 might have its embedded image:Let me look earlier to find where Figure 88 actual image appears. Let me search backwards from where Figure 88 was mentioned:Let me search more systematically. Let me look for where the coffee machine images are actually embedded. Let me search around where both Figure 88 and 89 would appear together:Let me try a different approach and look for pages around 116-117 where Figures 88 and 89 are mentioned, searching for embedded coffee machine images. Let me search for \"Joo\" which appeared in the OCR text:Great! Let me look at line 34358 and chunk 155184 which has a better visual description with \"JoooDeee\":Perfect! I found it. According to the OCR text on line 34358, the coffee machine has \"JoooDeee\" visible on it. Looking at the visual description starting at line 34363, it describes the device as a water dispenser/cooler, but based on the context of the document discussing coffee machines and the OCR text showing \"JoooDeee\", this appears to be the brand name visible on the coffee machine in Figure 89.\n\nHowever, \"JoooDeee\" seems like it might be an OCR misreading. Let me check if there's any other reference that might clarify the brand. Looking at the OCR pattern \"JoooDeee\", this could potentially be \"Jura\" (a well-known coffee machine brand) misread by OCR, but I should report what the document actually says.\n\nBased on the document, the OCR text from the coffee machine image shows \"JoooDeee\" with \"DI\" below it.\n\nAnswer: JoooDeee", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 148609, "error": null, "extra": {"thread_id": 2004, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f9e89fe20d454f6c90a127387a939523", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 624, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "jooodeee", "normalised_gold": "jooodeee"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 on page 1, I need to identify the relation arrows (edges) in the knowledge graph and determine which ones do not point to specific leaf nodes.\n\nThe figure shows a knowledge graph centered around Artemisia Gentileschi (Q212657) and Orazio Gentileschi (Q367560). The relations shown include:\n\n- **child** (from Orazio to Artemisia)\n- **father** (from Artemisia to Orazio)\n- **citizen** (from both to Italy Q38)\n- **occupation** (pointing to \"Painter\")\n- **style** (from Artemisia to Baroque Q37853)\n- **movement** (from Artemisia to Baroque Q37853)\n- **place of birth** (from Artemisia to Rome Q220)\n- **member** (from Artemisia, appears to point to another entity)\n\nThe arrows that do NOT point to specific leaf nodes (i.e., they point to other entities that are themselves nodes in the graph rather than terminal values) are:\n\n- child (points to another person entity)\n- father (points to another person entity)\n\nAnswer: child, father", "citations": [], "input_tokens": 60197, "output_tokens": 266, "cost_micros": 184581, "latency_ms": 12796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "child, father", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify which relation arrows do not point to specific leaf nodes (entities with specific values or names).\n\nLet me examine each relation arrow from the entities shown:\n\nFrom Q367560 (Orazio Gentileschi):\n- occupation \u2192 Q1028181 (Painter) - points to a class/type node\n- movement \u2192 Q37853 (Baroque) - points to a class/type node\n- child \u2192 Artemisia Gentileschi Q212657 - points to a specific person\n\nFrom Artemisia Gentileschi Q212657:\n- father \u2192 Orazio Gentileschi Q367560 - points to a specific person\n- birth \u2192 [date shown] - points to specific value\n- place of birth \u2192 [location] - points to specific value\n- Citizen \u2192 Italy Q38 - points to a specific country\n- occupation \u2192 Q1028181 (Painter) - points to a class/type node\n- style \u2192 Q37853 (Baroque) - points to a class/type node\n- member \u2192 [organization] - appears to point to a specific entity\n\nThe relations that point to class/type nodes rather than specific leaf nodes are:\n- occupation (points to the class \"Painter\")\n- movement/style (points to the class \"Baroque\")\n\nAnswer: occupation, movement, style", "citations": [], "input_tokens": 22214, "output_tokens": 311, "cost_micros": 71307, "latency_ms": 7718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "list_set", "normalised_pred": "movement, occupation, style", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to identify the relation arrows (edges) in the knowledge graph that do not point to specific leaf nodes (literal values).\n\nIn the knowledge graph shown in Figure 1, I can see the following relationships:\n\n- \"Painter\" (Q1028181) - this is an entity node, not a leaf\n- \"Baroque\" (Q37853) - this is an entity node, not a leaf\n- \"Italy\" (Q38) - this is an entity node, not a leaf\n- \"Rome\" (Q212657) - this is an entity node, not a leaf\n- Orazio Gentileschi (Q367560) - this is an entity node, not a leaf\n- Artemisia Gentileschi - the central entity\n\nThe relations that point to these non-leaf entity nodes (rather than to literal values like dates or strings) are:\n\n- occupation (pointing to Painter)\n- style (pointing to Baroque)\n- movement (pointing to Baroque)\n- citizen (pointing to Italy)\n- place of birth (pointing to Rome)\n- father (pointing to Orazio Gentileschi)\n- child (pointing to Artemisia Gentileschi)\n- member (appears to be a relation as well)\n\nAnswer: occupation, style, movement, citizen, place of birth, father, child, member", "citations": [], "input_tokens": 25539, "output_tokens": 310, "cost_micros": 81267, "latency_ms": 9266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "child, citizen, father, member, movement, occupation, place of birth, style", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to identify the relation arrows in the Knowledge Graph that do not point to leaf nodes (specific values).\n\nIn the Knowledge Graph shown, the relations are labeled on the arrows connecting entities. Let me trace each relation:\n\n- \"occupation\" \u2192 points to \"Painter\" (Q37853) - this is a leaf node\n- \"movement\" \u2192 points to \"Baroque\" (Q1028181) - this is a leaf node\n- \"style\" \u2192 appears to connect nodes but the endpoint is not clearly a specific leaf value\n- \"place of birth\" \u2192 points to a specific location\n- \"child\" \u2192 points to \"Artemisia\" (another person entity, not a leaf value)\n- \"father\" \u2192 points to \"Orazio\" (another person entity, not a leaf value)\n- \"member\" \u2192 connects to citizenship information\n- \"citizen\" \u2192 points to \"Italy\" (a leaf node)\n\nThe relations that point to other entities rather than specific attribute values (leaf nodes) are:\n\nAnswer: child, father", "citations": [], "input_tokens": 17550, "output_tokens": 231, "cost_micros": 56115, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "child, father", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to identify the relation arrows that do not point to specific leaf nodes (entity or property nodes).\n\nFrom the Knowledge Graph Context cloud, I can see the following relationships:\n\n- E1 (OrazioGentileschi) -- occupation --> P1 (Painter) - points to leaf node P1\n- E1 -- movement --> P2 (Baroque) - points to leaf node P2\n- E1 -- birth --> E1 - points back to itself, not a leaf node\n- E1 -- child --> E2 - points to E2, which has further connections\n- E2 (ArtemisiaGentileschi) -- father --> E1 - points to E1, which has further connections\n- E2 -- occupation --> P1 - points to leaf node P1\n- E2 -- style --> P2 - points to leaf node P2\n- E2 -- place of birth --> E3 - points to E3, which has further connections\n- E2 -- member --> E2 - points back to itself, not a leaf node\n- E2 -- Citizen --> E3 - points to E3, which has further connections\n- E1 -- Citizen --> E3 - points to E3, which has further connections\n\nThe relations that do not point to specific leaf nodes are those pointing to entities that have their own outgoing connections or pointing back to themselves:\n\nAnswer: birth, child, father, place of birth, member, Citizen", "citations": [], "input_tokens": 23598, "output_tokens": 341, "cost_micros": 75909, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "birth, child, citizen, father, member, place of birth", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q000", "doc_id": "2310.05634v2.pdf", "answer_format": "list", "gold": "['movement', 'birth', 'place of birth', 'member']", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and identify which relation arrows do not point to specific leaf nodes.Let me search for \"Figure 1\" in the document to locate it:Let me read more to see the complete Figure 1 description:I can see Figure 1 shows a knowledge graph. Let me look more carefully at the structure. The text shows entities and relations. Let me search for more context about the graph structure:Based on the text extracted from Figure 1, I can see the knowledge graph structure. The entities and relations listed are:\n\n- Q1028181 (Painter)\n- Q37853 (Baroque)\n- Q367560 (Orazio Gentileschi)\n- Q212657 (with \"Artemisia Gentileschi Citizen\" below it)\n- Q38 (Italy)\n\nRelations shown:\n- occupation\n- style\n- movement\n- occupation (appears twice)\n- child\n- birth\n- place of birth\n- father\n- member\n\nLooking at the structure, the relations that appear to connect between entities but may not point to specific leaf nodes would be those that connect to higher-level entities or categories. From the layout, I need to identify which relation arrows don't terminate at leaf nodes.\n\nLet me look for any figures or images that might show the actual graph structure more", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 59797, "error": null, "extra": {"thread_id": 2005, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c3fdd69258514ce3917c73b31fe40d3e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 249, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "let me look for any figures or images that might show actual graph structure more", "normalised_gold": "birth, member, movement, place of birth"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "I need to locate Figure 5 in the document and examine the lines shown in it.\n\nLooking through the document, I can find Figure 5 on page 8. The figure is titled \"Retrieval Analysis\" and shows citation evaluation (Micro) of generated texts using knowledge graphs with retrieval accuracy 100 (gold), 80, 60, 40, and 20.\n\nThe figure contains four lines:\n- A yellow line (precision)\n- A green line (recall)\n- A blue line (F1_Score)\n- A red line (correctness)\n\nLooking at the graph, I need to identify which line does not intersect with any other line.\n\nThe red line (correctness) starts at the highest point and remains consistently above all other lines throughout the graph. It descends from left to right but never crosses or intersects with any of the other three lines (yellow, green, or blue).\n\nAnswer: red", "citations": [], "input_tokens": 60196, "output_tokens": 207, "cost_micros": 183693, "latency_ms": 12141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "red", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 in the document, which shows \"Retrieval Analysis\" with multiple lines plotted against Retrieval Accuracy on the x-axis:\n\nThe lines shown are:\n- precision (yellow)\n- recall (green)\n- F1_Score (appears to be another color)\n- correctness (red/orange)\n\nThe correctness line (red/orange colored line at the top of the graph) maintains a consistently high value around 80-90 and does not intersect with any of the other lines throughout the plot.\n\nAnswer: red (or orange/correctness line)", "citations": [], "input_tokens": 22213, "output_tokens": 134, "cost_micros": 68649, "latency_ms": 6108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.33333333333333337, "method": "str_norm", "normalised_pred": "red or orange correctness line", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 titled \"Retrieval Analysis\", I need to identify which line does not intersect with any other line.\n\nThe figure shows four lines representing different metrics (precision, recall, F1_Score, and correctness) plotted against retrieval accuracy from gold (100) to 20.\n\nExamining the lines:\n- The yellow line (precision) intersects with other lines\n- The green line (recall) intersects with other lines  \n- The blue line (F1_Score) intersects with other lines\n- The red line (correctness) stays at the top of the graph and does not intersect with any of the other three lines\n\nAnswer: red", "citations": [], "input_tokens": 25538, "output_tokens": 150, "cost_micros": 78864, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "red", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 in the document, I need to identify the line that has no intersection with any other line.\n\nFigure 5 shows citation evaluation metrics (Micro) with retrieval accuracy from 100 (gold) down to 20. There are four lines in the graph representing:\n- Correctness (blue)\n- Precision (yellow) \n- Recall (green)\n- F1 (red)\n\nExamining the lines, the blue line (Correctness) remains at the top of the graph and does not intersect with any of the other three lines throughout the entire range of retrieval accuracy values.\n\nAnswer: Blue", "citations": [], "input_tokens": 17549, "output_tokens": 144, "cost_micros": 54807, "latency_ms": 5610, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 5 in the document, I can see a chart showing citation evaluation metrics (precision, recall, F1_score, and correctness) plotted against retrieval accuracy levels (20, 40, 60, 80, and gold/100).\n\nExamining the lines in the chart:\n- The yellow line (precision) intersects with other lines\n- The green line (recall) intersects with other lines\n- The blue line (F1_score) intersects with other lines\n- The red line (correctness) runs above all other lines and does not intersect with any of them\n\nAnswer: red", "citations": [], "input_tokens": 23597, "output_tokens": 138, "cost_micros": 72861, "latency_ms": 5000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "red", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q001", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "red", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 5 and identify the line with no intersections.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 25967, "error": null, "extra": {"thread_id": 2006, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_a9d66758d1df48679fe6def6615062f0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 40, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "i ll read document to find figure 5 and identify line with no intersections", "normalised_gold": "red"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to count how many tables in the document include \"F1\" as a metric.\n\nLet me go through the document systematically:\n\n1. Table 1 (page 3) - Shows an example data entry, not evaluation metrics\n2. Table 2 (page 3) - Human Evaluation on dataset quality - no F1\n3. Table 3 (page 7) - Citation Quality evaluation - shows \"F1.\" column (both Micro and Macro)\n4. Table 4 (page 7) - Text quality evaluation - no F1\n5. Table 5 (page 7) - Citation quality with knowledge removed - shows \"F1.\" column\n6. Table 6 (page 8) - Human evaluation alignment - no F1\n7. Table 7 (page 9) - Comparison of General and Specific questions - shows \"F1.\" column\n8. Table 8 (page 15) - Human annotation instructions - no F1\n9. Tables 9-17 (pages 16-21) - Prompts and instructions - no F1\n10. Table 18 (page 22) - Evolutionary question generation example - no F1\n11. Tables 19-20 (pages 23-24) - Example outputs - no F1\n\nAdditionally, I can see:\n- Figure 4 (page 8) shows \"F1_Score\" in a graph\n- Figure 5 (page 8) shows \"F1_Score\" in a graph\n\nBut the question asks specifically about tables, not figures.\n\nTables with F1 metric:\n- Table 3\n- Table 5\n- Table 7\n\nAnswer: 3", "citations": [], "input_tokens": 60181, "output_tokens": 381, "cost_micros": 186258, "latency_ms": 25483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all tables in the document that include \"F1\" as a metric.\n\nLet me go through the tables systematically:\n\n1. Table 1 (Example data entry) - No F1 metric\n2. Table 2 (Human Evaluation on BioKaLMA dataset) - No F1 metric\n3. Table 3 (Citation Quality) - Yes, has \"F1.\" columns (both Micro and Macro)\n4. Table 4 (Text quality evaluation) - No F1 metric\n5. Table 5 (Citation quality with removed knowledge) - Yes, has \"F1.\" column\n6. Table 6 (Human Evaluation on alignment) - No F1 metric\n7. Table 7 (Comparison of General and Specific questions) - Yes, has \"F1.\" column\n\nNow let me check if there are more tables in the appendices:\n\n8. Table 8 (Human Evaluation Instructions) - No F1 metric\n9. Table 9 (Instruction for evolutionary question construction) - No F1 metric\n10. Table 10 (Instruction for round 2-5) - No F1 metric\n11. Table 11 (General question generation) - No F1 metric\n12. Table 12 (Specific question generation) - No F1 metric\n13. Table 13 (Answer generation instruction) - No F1 metric\n14. Table 14 (GPT-EVAL Coherence) - No F1 metric\n15. Table 15 (GPT-EVAL Consistency) - No F1 metric\n16. Table 16 (GPT-EVAL Fluency) - No F1 metric\n17. Table 17 (GPT-EVAL Relevance) - No F1 metric\n18. Table 18 (Evolutionary question construction example) - No F1 metric\n19. Table 19 (Example from ChatGPT) - No F1 metric\n20. Table 20 (Example from GPT-4) - No F1 metric\n\nTables with F1 metric: Table 3, Table 5, and Table 7\n\nAnswer: 3", "citations": [], "input_tokens": 22198, "output_tokens": 502, "cost_micros": 74124, "latency_ms": 9109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to count all tables that include \"F1\" as a metric.\n\nLet me go through the document systematically:\n\n1. Table 1: Shows an example data entry - no F1 metric\n2. Table 2: Human Evaluation on BioKaLMA dataset - no F1 metric\n3. Table 3: Citation Quality evaluation - includes \"F1.\" column (both Micro and Macro F1)\n4. Table 4: Evaluation on generated text quality - no F1 metric\n5. Table 5: Citation quality evaluation with knowledge removed - includes \"F1.\" column\n6. Table 6: Human Evaluation on text-citation alignment - no F1 metric\n7. Table 7: Comparison of evaluation results on General and Specific question setting - includes \"F1.\" column\n8. Table 8: Instruction for human annotators - not a data table with metrics\n9. Tables 9-17: These are prompt/instruction tables - no F1 metrics\n10. Table 18: Example of generated questions - no F1 metric\n11. Tables 19-20: Examples of attributed answers - no F1 metrics\n\nTables with F1 metric:\n- Table 3\n- Table 5\n- Table 7\n\nAnswer: 3", "citations": [], "input_tokens": 25523, "output_tokens": 289, "cost_micros": 80904, "latency_ms": 8578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to search through the document for tables that include \"F1\" as a metric.\n\nLet me identify all tables in the document:\n\n1. Table 1: Shows an example data entry in BioKaLMA - does not contain F1 metric\n2. Table 2: Human Evaluation on BioKaLMA dataset - contains metrics: Authenticity, Relevance, Naturalness, Significance - no F1\n3. Table 3: Citation Quality evaluation - contains columns including \"F1.\" in both Micro and Macro sections - **YES, contains F1**\n4. Table 5: Citation quality evaluation for generated texts - contains columns: Corr., Prec., Rec., F1. - **YES, contains F1**\n5. Table 6: Result of Human Evaluation on text-citation alignment - contains Alignment and Human Avg. - no F1\n6. Table 7: Comparison between general and specific questions - contains columns including F1. - **YES, contains F1**\n7. Table 8: Instruction for human annotators - no metrics table\n8. Table 9: Not clearly visible as a numbered table in the content shown\n9. Table 10: Instruction and demonstration - not a metrics table\n10. Table 11: Instruction and demonstration - not a metrics table\n11. Table 12: Instruction and demonstration - not a metrics table\n12. Table 13: Instruction for answer generation - not a metrics table\n13. Tables 14-17: Evaluation criteria descriptions - not metrics tables\n14. Table 18: Example of generated questions - not a metrics table\n15. Table 19: Stephen Crane biographical information - not a metrics table with F1\n\nCounting tables with F1 metric:\n- Table 3: YES\n- Table 5: YES\n- Table 7: YES\n\nAnswer: 3", "citations": [], "input_tokens": 17534, "output_tokens": 420, "cost_micros": 58902, "latency_ms": 11342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to count all tables that include \"F1\" as a metric.\n\nLet me go through the document systematically:\n\n1. **Table 1**: Shows an example data entry - no F1 metric\n2. **Table 2**: Human Evaluation metrics (Authenticity, Relevance, Naturalness, Significance) - no F1\n3. **Table 3**: Citation Quality evaluation - includes \"Micro F1.\" and \"Macro F1.\" columns - **YES**\n4. **Table 4**: Text quality evaluation (Coh., Con., Flu., Rel.) - no F1\n5. **Table 5**: Citation quality with removed knowledge - includes \"F1.\" column - **YES**\n6. **Figure 4**: Shows precision, recall, and F1_Score for [NA] - **YES** (this is labeled as a figure but formatted as a table)\n7. **Figure 5**: Citation evaluation with retrieval accuracy - includes \"F1_score\" column - **YES** (also labeled as figure but formatted as a table)\n8. **Table 6**: Human evaluation on alignment - no F1\n9. **Table 7**: Comparison of General and Specific questions - includes \"Citation Eval. F1.\" column - **YES**\n10. **Table 8**: Human annotation instructions - no F1\n11. Tables 9-17: Various prompts and instructions - no F1 metrics\n12. **Table 18**: Example of evolutionary question construction - no F1\n13. **Table 19**: Example output from ChatGPT - no F1\n14. **Table 20**: Example output from GPT-4 - no F1\n\nTables/Figures with F1 metric:\n- Table 3\n- Table 5\n- Figure 4 (table format)\n- Figure 5 (table format)\n- Table 7\n\nAnswer: 5", "citations": [], "input_tokens": 23582, "output_tokens": 429, "cost_micros": 77181, "latency_ms": 11358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q002", "doc_id": "2310.05634v2.pdf", "answer_format": "int", "gold": "3", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned (2310.05634v2.pdf.xml) to find all tables that include \"F1\" as a metric.Let me continue reading the document to search for all tables:Now let me search for \"F1\" to find which tables contain this metric:Let me read the specific sections containing the tables to identify which ones include F1 as a metric:Now let me check all the other tables to see if they contain F1. Let me search more systematically:Let me also check Table 6:Now let me check the tables in the appendix to ensure I haven't missed any:Now let me scan through all the remaining content to ensure I have checked all tables. Let me list out what I've found so far:\n\nTables with F1:\n1. Table 3 - Has F1 (both Micro and Macro F1)\n2. Table 5 - Has F1 \n3. Table 7 - Has F1\n\nTables without F1:\n1. Table 1 - Does not have F1 (has Authenticity, Relevance, Naturalness, Significance)\n2. Table 2 - This appears to be just a caption reference, same as Table 1\n3. Table 4 - Does not have F1 (has Coh., Con., Flu., Rel.)\n4. Table 6 - Does not have F1 (has Alignment and Human Avg.)\n\nLet me continue checking the rest of the tables (8-20) mentioned in the document:Based on my thorough examination of the document, I have identified all tables and checked which ones include \"F1\" as a metric. Let me verify my findings:\n\nTables with F1 as a metric:\n1. **Table 3**: Citation Quality evaluation - includes both Micro F1 and Macro F1 columns\n2. **Table 5**: Citation quality evaluation for generated texts using a KG with N pieces of knowledge removed - includes F1 column\n3. **Table 7**: Comparison of evaluation results on General and Specific question setting - includes F1 in the Citation Evaluation columns\n\nTables without F1 as a metric:\n- Table 1: Example data entry (no F1)\n- Table 2: Human Evaluation on BioKaLMA dataset (no F1)\n- Table 4: Evaluation on generated text quality (no F1, only Coh., Con., Flu., Rel.)\n- Table 6: Result of Human Evaluation on text-citation alignment (no F1, only Alignment and Human Avg.)\n- Tables 8-20: These are instruction tables, prompt demonstrations, and examples (no F1 metrics)\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 116094, "error": null, "extra": {"thread_id": 2007, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d5e6c0fe45c9431aaf087f3d70bed847", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 545, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "3"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 3 in the paper, which shows the citation quality evaluation results for different models:\n\nThe alignment scores for ChatGPT at different temperatures are:\n- ChatGPT (0.1): 85.9\n- ChatGPT (0.5): 84.5\n- ChatGPT (0.9): 84.1\n\nThe highest alignment score is at temperature 0.1.\n\nAnswer: 0.1", "citations": [], "input_tokens": 60189, "output_tokens": 103, "cost_micros": 182112, "latency_ms": 22187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the alignment scores for ChatGPT at different temperatures from Table 3.\n\nFrom Table 3 (Citation Quality evaluation), the alignment scores for ChatGPT are:\n- ChatGPT (0.1): 85.9(2.5)\n- ChatGPT (0.5): 84.5(1.1)\n- ChatGPT (0.9): 84.1(0.5)\n\nThe highest alignment score is 85.9 at temperature 0.1.\n\nAnswer: 0.1", "citations": [], "input_tokens": 22206, "output_tokens": 129, "cost_micros": 68553, "latency_ms": 5421, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the alignment scores for ChatGPT at different temperatures from the paper.\n\nLooking at Table 3 in the paper, which shows \"Citation Quality OpenAI models and LLaMA family models\" including alignment scores:\n\n- ChatGPT (0.1): Alignment = 85.9(2.5)\n- ChatGPT (0.5): Alignment = 84.5(1.1)\n- ChatGPT (0.9): Alignment = 84.1(0.5)\n\nThe highest alignment score among the ChatGPT models is at temperature 0.1 with a score of 85.9.\n\nAnswer: 0.1", "citations": [], "input_tokens": 25531, "output_tokens": 159, "cost_micros": 78978, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which presents the citation quality evaluation results including alignment scores for different models:\n\nFor ChatGPT at different temperatures:\n- ChatGPT (0.1): Alignment = 85.9\n- ChatGPT (0.5): Alignment = 84.5\n- ChatGPT (0.9): Alignment = 84.1\n\nThe highest alignment score for ChatGPT is at temperature 0.1 with a score of 85.9.\n\nAnswer: 0.1", "citations": [], "input_tokens": 17542, "output_tokens": 125, "cost_micros": 54501, "latency_ms": 5592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the alignment scores for ChatGPT at different temperatures from Table 3.\n\nLooking at Table 3 (Citation Quality evaluation), I can see the alignment scores for different ChatGPT temperatures:\n\n- ChatGPT (0.1): 85.9(2.5)\n- ChatGPT (0.5): 84.5(1.1)\n- ChatGPT (0.9): 84.1(0.5)\n\nThe highest alignment score is 85.9 for ChatGPT at temperature 0.1.\n\nAnswer: 0.1", "citations": [], "input_tokens": 23590, "output_tokens": 139, "cost_micros": 72855, "latency_ms": 4796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q003", "doc_id": "2310.05634v2.pdf", "answer_format": "float", "gold": "0.1", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find information about temperature and ChatGPT alignment scores.Let me search for temperature settings and ChatGPT results more specifically:Now let me read the specific section where temperature experiments with ChatGPT are discussed:Perfect! I found the table with alignment scores for ChatGPT at different temperatures. From Table 3, I can see:\n\n- ChatGPT (0.1): Alignment score = 85.9(2.5)\n- ChatGPT (0.5): Alignment score = 84.5(1.1)\n- ChatGPT (0.9): Alignment score = 84.1(0.5)\n\nThe highest alignment score for ChatGPT is at temperature 0.1 with a score of 85.9.\n\nAnswer: 0.1", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 78952, "error": null, "extra": {"thread_id": 2008, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_826b46b364ee4f52853639bf7b854cb3", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 182, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.1", "normalised_gold": "0.1"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "native_pdf", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the Dataset Construction section (Section A and subsections A.1, A.2, A.3), I need to compare the length of descriptions for each step:\n\n**A.1 Person Selection**: Approximately 150 words - describes selecting name pairs from the biographical database and filtering criteria.\n\n**A.2 Name Disambiguation**: Approximately 200 words - describes using SPARQL queries to map names to WikiData entities, handling duplicate names, and extracting one-hop sub-graphs.\n\n**A.3 Evolutionary Question Generation**: Approximately 450-500 words - provides extensive detail about the iterative process, knowledge injection, specificity and coherence scoring, the formula for scoring, and implementation details across five iterations.\n\nThe Evolutionary Question Generation step clearly contains the most detailed explanation, including mathematical formulas, specific algorithms for knowledge selection, and detailed descriptions of the iterative process.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 60191, "output_tokens": 204, "cost_micros": 183633, "latency_ms": 21312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the Dataset Construction section (Appendix A), I need to compare the lengths of descriptions for each step:\n\n1. **Person Selection (A.1)**: Describes selecting name pairs from biographical database - relatively brief, about 1 paragraph\n\n2. **Name Disambiguation (A.2)**: Describes mapping names to WikiData entities using SPARQL queries - moderate length, about 1-2 paragraphs\n\n3. **Evolutionary Question Generation (A.3)**: Describes the iterative process of extending paragraphs, selecting knowledge, calculating scores with formulas, and generating questions - significantly longer, spanning multiple paragraphs with detailed explanations of specificity scores, coherence scores, the scoring formula, iteration process, and implementation details\n\nThe Evolutionary Question Generation step clearly contains the most extensive description with the most detail about the methodology.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 22208, "output_tokens": 193, "cost_micros": 69519, "latency_ms": 6781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "To determine which step takes the most words to describe in the dataset construction process, I need to examine the three main steps described in Section 2.2 and detailed in Appendix A:\n\n1. **Person Selection** (Section A.1): This section describes filtering and selecting people from the biographical database. It's relatively brief.\n\n2. **Name Disambiguation** (Section A.2): This section explains mapping names to WikiData entities using SPARQL queries and extracting one-hop sub-graphs. It's moderately detailed.\n\n3. **Evolutionary Question Generation** (Section A.3): This section provides extensive detail about the iterative process of extending paragraphs, knowledge selection algorithms, scoring mechanisms (specificity and coherence scores), and the multiple rounds of generation with specific templates and prompts.\n\nThe Evolutionary Question Generation section is significantly longer and more detailed than the other two steps, including mathematical formulas, multiple iteration explanations, and extensive procedural details.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 25533, "output_tokens": 217, "cost_micros": 79854, "latency_ms": 7764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at section 2.2 Dataset Construction and Appendix A Dataset Construction, the three main steps are:\n\n1. Person Selection\n2. Name Disambiguation  \n3. Evolutionary Question Generation\n\nComparing the word counts in Appendix A where these steps are described in detail:\n\n- A.1 Person Selection: approximately 150 words\n- A.2 Name Disambiguation: approximately 180 words\n- A.3 Evolutionary Question Generation: approximately 130 words (before the subsections)\n\nHowever, the Evolutionary Question Generation section has significantly more content when including all the detailed prompt tables (Tables 9-12) and examples that follow it, making it the most extensively described step overall.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 17544, "output_tokens": 165, "cost_micros": 55107, "latency_ms": 5703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the dataset construction section (2.2), I need to compare the length of descriptions for each of the three steps:\n\n1. **Person Selection** (A.1): Approximately 150 words\n2. **Name Disambiguation** (A.2): Approximately 200 words  \n3. **Evolutionary Question Generation** (A.3): Approximately 450+ words (includes detailed explanation of the iterative process, knowledge selection algorithm with formula, scoring mechanisms, etc.)\n\nThe Evolutionary Question Generation step is described in significantly more detail than the other two steps, taking up the most space in the paper.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 23592, "output_tokens": 143, "cost_micros": 72921, "latency_ms": 5844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q004", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "Evolutionary Question Generation", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about dataset construction steps and compare their descriptions.Let me search for \"dataset construction\" in the document:Let me look at the detailed dataset construction section:Let me read the Dataset Construction section starting at line 506:Good! I can see the three steps mentioned at line 521-523. Now let me read the Appendix A which has the detailed dataset construction process:Perfect! I can now see the three steps of dataset construction in detail. Let me count the words in each section to determine which takes the most words to describe.\n\nLet me read each section more carefully to count:Now let me see where the Evolutionary Question Generation section ends:Perfect! Now I have all three sections. Let me analyze the word count for each step:\n\n**A.1 Person Selection** (lines 2089-2110): Approximately 22 lines of content\n**A.2 Name Disambiguation** (lines 2113-2145): Approximately 33 lines of content  \n**A.3 Evolutionary Question Generation** (lines 2148-2222): Approximately 75 lines of content\n\nBy visual inspection and line count, the \"Evolutionary Question Generation\" section is significantly longer than the other two steps. It contains detailed explanations about:\n- The evolutionary approach\n- Iteration process\n- Knowledge annotation\n- Specificity and coherence scoring\n- Formula for scoring\n- Implementation details\n\nThis is clearly the step that takes the most words to describe.\n\nAnswer: Evolutionary Question Generation", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 78750, "error": null, "extra": {"thread_id": 2009, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1b0d2d82267f4e5eab9a849c3924465e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 309, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "evolutionary question generation", "normalised_gold": "evolutionary question generation"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:37Z", "latency_ms": 20843, "error": null, "raw_text_chars": 345}]}, "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically on page 2, the task formulation section discusses the \"Setting of Conscious Incompetence.\"\n\nThe relevant text states: \"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph G.\"\n\nAnswer: if it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 60205, "output_tokens": 80, "cost_micros": 181815, "latency_ms": 20828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "if it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "azure_basic_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document's definition of the \"Conscious Incompetence\" setting in Section 2.1:\n\n\"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph G.\"\n\nAnswer: if it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 22222, "output_tokens": 71, "cost_micros": 67731, "latency_ms": 4952, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "if it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "azure_premium_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "According to the document, a sentence can map to both [NA] and a list of sub-graph knowledge when:\n\nAnswer: It can be partially verified by the knowledge graph G", "citations": [], "input_tokens": 25547, "output_tokens": 40, "cost_micros": 77241, "latency_ms": 3375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "it can be partially verified by knowledge graph g", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "llamacloud_basic_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find the definition of when a sentence can map to both [NA] and knowledge citations.\n\nIn Section 2.1 \"Task Formulation,\" under \"Setting of Conscious Incompetence,\" the document states:\n\n\"A sentence can map to both [NA] and a list of sub-graph knowledge if it can.\"\n\nHowever, this is incomplete. Looking at the full context, the document explains that [NA] is used when a sentence \"includes some knowledge to be verified, but the knowledge is absent in the knowledge graph G.\"\n\nThe key insight comes from the demonstration in Table 13, which shows sentences containing both specific knowledge citations from the graph AND [NA] citations. This occurs when a sentence contains both:\n1. Knowledge that IS present in the knowledge graph (cited with specific knowledge)\n2. Knowledge that IS NOT present in the knowledge graph (cited with [NA])\n\nAnswer: When it contains both knowledge present in and absent from the knowledge graph", "citations": [], "input_tokens": 17558, "output_tokens": 214, "cost_micros": 55884, "latency_ms": 7110, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.2727272727272727, "method": "str_norm", "normalised_pred": "when it contains both knowledge present in and absent from knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "llamacloud_premium_lc", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "According to the document, in the \"Setting of Conscious Incompetence\" section (2.1), a sentence can map to both [NA] and a list of sub-graph knowledge when:\n\n\"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph $G$.\"\n\nAnswer: When it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 23606, "output_tokens": 91, "cost_micros": 72183, "latency_ms": 3969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8421052631578948, "method": "str_norm", "normalised_pred": "when it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "arm": "surfsense_agentic", "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the author's definition of conscious incompetence and when a sentence can map to both [NA] and a list of sub-graph knowledge.Let me search for \"conscious incompetence\" in the document:Now let me read the section around line 485 and 514 where \"conscious incompetence\" is defined:Perfect! I found the answer. According to the document, in the section on \"Setting of Conscious Incompetence\", a sentence can map to both [NA] and a list of sub-graph knowledge when it can be partially verified by the knowledge graph G.\n\nAnswer: when it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34875, "error": null, "extra": {"thread_id": 2010, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6035a4bba19d42d88826ef02286ce3c9", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 145, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.8421052631578948, "method": "str_norm", "normalised_pred": "when it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the example figure of question-guided reasoning and count the numbers displayed in blue color.\n\nLooking at Figure 1 on page 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims,\" I can see:\n\n- Q1: What is the greatest depth of the Black Sea?\n- A1: Black sea has a maximum depth of **2,212** meters.\n- Q2: How far can sunlight penetrate water?\n- A2: Sunlight does not penetrate water below **1,000** meters.\n- At the bottom: \"2,212 is greater than 1,000. Therefore, the claim is\"\n\nIn the figure, I can see numbers that appear to be highlighted or emphasized:\n- 2,212 (appears twice - once in A1 and once in the reasoning)\n- 1,000 (appears twice - once in A2 and once in the reasoning)\n\nLooking at the visual representation in the figure, the numbers that are displayed in blue/highlighted color are:\n- 2,212 (in A1)\n- 1,000 (in A2)\n- 2,212 (in the final reasoning line)\n- 1,000 (in the final reasoning line)\n\nSo there are 4 instances of numbers shown in blue color.\n\nAnswer: 4", "citations": [], "input_tokens": 27152, "output_tokens": 309, "cost_micros": 86091, "latency_ms": 11875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at Figure 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims.\"\n\nIn Figure 1, I can see:\n1. The claim about the Black Sea\n2. Q1 asking about the greatest depth of the Black Sea\n3. A1 stating \"Black sea has a maximum depth of 2,212 meters\" - the number **2,212** appears in blue\n4. Additional text showing \"Average depth\" with what appears to be another number\n5. Q2 asking about how far sunlight can penetrate water\n6. A2 stating \"Sunlight does not penetrate water below 1,000 meters\" - the number **1,000** appears in blue\n7. At the bottom, there's reasoning that states \"2,212 is greater than 1,000\" - both **2,212** and **1,000** appear in blue here as well\n\nLooking carefully at the figure, I can identify the following blue-colored numbers:\n- 2,212 (appears multiple times)\n- 1,000 (appears multiple times)\n\nHowever, the question asks for how many numbers with blue color are mentioned, not how many times they appear. The distinct blue numbers are:\n1. 2,212\n2. 1,000\n\nAnswer: 2", "citations": [], "input_tokens": 11640, "output_tokens": 300, "cost_micros": 39420, "latency_ms": 8203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at Figure 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims.\"\n\nIn Figure 1, I can see:\n- The claim about sunlight and the Black Sea\n- Q1 asking about the greatest depth of the Black Sea\n- A1 stating \"Black sea has a maximum depth of **2,212 meters**\" (2,212 appears to be in blue)\n- Q2 asking about how far sunlight can penetrate water\n- A2 stating \"Sunlight does not penetrate water below **1,000 meters**\" (1,000 appears to be in blue)\n- At the bottom, there's text saying \"2,212 is greater than 1,000\" where both numbers appear to be in blue\n\nSo the blue-colored numbers mentioned are:\n1. 2,212 (in A1)\n2. 1,000 (in A2)\n3. 2,212 (in the comparison at bottom)\n4. 1,000 (in the comparison at bottom)\n\nHowever, if we count unique numbers, there are 2 distinct numbers (2,212 and 1,000). But if we count each mention, there are 4 total mentions of numbers in blue.\n\nLooking more carefully at the figure, the numbers that appear to be highlighted/emphasized in blue are:\n- 2,212 (appears twice)\n- 1,000 (appears twice)\n\nCounting total mentions: 4\n\nAnswer: 4", "citations": [], "input_tokens": 12756, "output_tokens": 349, "cost_micros": 43503, "latency_ms": 8500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the example figure that shows question-guided reasoning and count the blue-colored numbers in it.\n\nLooking at the document, Figure 1 is mentioned in the text as showing an example of question-guided reasoning for the claim \"Sunlight can reach the deepest part of the Black Sea.\" The text states: \"This reasoning process can be formulated as question-guided reasoning, where the verification of the claim is guided by asking and answering a series of relevant questions, as shown in Figure 1.\"\n\nHowever, Figure 1 itself is not shown in the provided document content. \n\nThe document does show Figure 4, which displays the QACHECK user interface with an example about \"Lars Onsager won the Nobel prize when he was 30 years old.\" In this figure, I can see:\n- \"Reasoning depth: 0\" and \"Reasoning depth: 1\" labels\n- \"1968\" (the year Lars Onsager won the Nobel Prize)\n- \"1903\" (the year Lars Onsager was born)\n- \"65\" (mentioned in the final prediction: \"He was 65 when he won the Nobel prize\")\n\nLooking at the visible figures in the document, Figure 4 shows the question-guided reasoning process example. The numbers that appear to be highlighted or emphasized in the context of the reasoning are: 1968, 1903, and 65.\n\nWithout being able to see the actual colors in Figure 1 (which is referenced but not displayed), and based on Figure 4 which shows a complete example of question-guided reasoning, I can count 3 distinct numbers that would typically be highlighted in such a demonstration.\n\nAnswer: 3", "citations": [], "input_tokens": 8625, "output_tokens": 374, "cost_micros": 31485, "latency_ms": 9280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine Figure 1, which shows \"An example of question-guided reasoning for fact-checking complex real-world claims.\"\n\nLooking at Figure 1, I can see:\n- The claim about sunlight reaching the deepest part of the Black Sea\n- Step 1 with Q1 and A1, where A1 mentions \"2,212 meters\" \n- Step 2 with Q2 and A2, where A2 mentions \"1,000 meters\"\n- The conclusion states \"2,212 is greater than 1,000\"\n\nThe numbers mentioned are:\n1. 2,212 (meters - maximum depth of Black Sea)\n2. 1,000 (meters - how far sunlight penetrates)\n\nBoth numbers appear to be in the same color as the rest of the text in the boxes. Looking at the figure description and the actual content, I need to identify which numbers are specifically in blue color.\n\nFrom the document's Figure 1, examining the text carefully, the numbers that appear are 2,212 and 1,000. The question asks specifically about numbers with \"blue color.\"\n\nBased on the visual representation in Figure 1, I can identify 2 numbers: 2,212 and 1,000.\n\nAnswer: 2", "citations": [], "input_tokens": 12525, "output_tokens": 288, "cost_micros": 41895, "latency_ms": 8359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q000", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "2", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find the example figure related to \"question-guided reasoning\" and count the blue-colored numbers.Let me search for \"question-guided reasoning\" in the document to locate the relevant section.Now let me read the section around line 217 where Figure 1 is mentioned, which shows \"An example of question-guided reasoning\":Perfect! I can see Figure 1 which shows the question-guided reasoning example. Let me continue reading to see the complete figure description and identify all blue-colored numbers:Based on the document content I've read, I can see Figure 1 which shows the question-guided reasoning example. The figure contains:\n\n1. The text mentions \"2,212 m\" (Black Sea depth)\n2. The text mentions \"2,212 meters\" in the answer A1\n3. The text mentions \"1,000 meters\" in the answer A2\n4. The visual description mentions \"Blue accent colors for interactive elements (icons, links)\"\n\nLet me look more carefully at the OCR text section which lists the specific numbers visible in the figure:\n\nFrom lines 288-298, the OCR text shows:\n- 2,212 m (appears multiple times)\n- 1,253 m\n- 7,257 ft\n- 547,000 km\u00b3\n- 131,200 cu mi\n- 10+\n\nAnd from the main figure text (lines 219-270):\n- 2,212 m (Black Sea depth)\n- 1,000 meters (sunlight penetration)\n- 200 meters (mentioned in the evidence text)\n\nThe key information states: \"Blue accent colors for interactive elements (icons, links)\" which suggests that numbers appearing in blue would be those that are interactive or emphasized.\n\nLooking at the structure of Figure 1, the numbers that appear to be in blue color based on the context are:\n1. **2,212** (the answer to Q1 about Black Sea depth)\n2. **1,000** (the answer to Q2 about sunlight penetration)\n\nThese are the two key numerical answers that appear in the question-guided reasoning process.\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 45156, "error": null, "extra": {"thread_id": 2011, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8ee68d3622e54a41be76708c8f52b00e", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 300, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:57Z", "latency_ms": 14639, "error": null, "raw_text_chars": 1219}]}, "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to identify which modules use Large Language Models (LLMs) as their backbone.\n\nFrom Section 3, the QACHECK system has 5 modules:\n1. **Claim Verifier** - Section 3.1 states it's \"based on InstructGPT\" (an LLM)\n2. **Question Generator** - Section 3.2 states \"Similar to the claim verifier, we also leverage InstructGPT\" (an LLM)\n3. **Question Answering Model** - Section 3.3 describes three implementations, with the default being \"GPT Reciter-Reader\" which uses InstructGPT (an LLM). However, this module also has non-LLM options (Retriever-Reader, FLAN-T5).\n4. **QA Validator** - Section 3.4 states \"We again implement the validator by prompting InstructGPT\" (an LLM)\n5. **Reasoner** - Section 3.5 provides \"two different implementations: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\"\n\nCounting the modules that use LLM backbones:\n- Claim Verifier: Yes (InstructGPT)\n- Question Generator: Yes (InstructGPT)\n- Question Answering Model: Yes (uses GPT Reciter-Reader by default)\n- QA Validator: Yes (InstructGPT)\n- Reasoner: Yes (InstructGPT is one of the two implementations)\n\nAll 5 modules have LLM implementations, with some offering alternatives.\n\nAnswer: 5", "citations": [], "input_tokens": 27149, "output_tokens": 403, "cost_micros": 87492, "latency_ms": 14639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system are implemented using Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules described in Section 3:\n\n1. **Claim Verifier (Section 3.1)**: \"We build the claim verifier based on InstructGPT (Ouyang et al., 2022), utilizing its powerful in-context learning ability.\" - This uses LLM (InstructGPT).\n\n2. **Question Generator (Section 3.2)**: \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning.\" - This uses LLM (InstructGPT).\n\n3. **Question Answering Model (Section 3.3)**: This module has three different implementations:\n   - Retriever-Reader: Uses BM25 retriever and RoBERTa-large reader (not primarily LLM-based)\n   - FLAN-T5: This is an LLM\n   - GPT Reciter-Reader: Uses InstructGPT (LLM)\n   \n   The text states \"In the following, we use GPT Reciter-Reader as the default implementation for our QA module.\" So the default implementation uses LLM.\n\n4. **QA Validator (Section 3.4)**: \"We again implement the validator by prompting InstructGPT with a suite of ten demonstrations\" - This uses LLM (InstructGPT).\n\n5. **Reasoner (Section 3.5)**: \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Both implementations use LLMs.\n\nCounting the modules that are implemented with LLM backbone:\n1. Claim Verifier - Yes (InstructGPT)\n2. Question Generator - Yes (InstructGPT)\n3. Question Answering Model - Yes (default is GPT Reciter-Reader with InstructGPT)\n4. QA Validator - Yes (InstructGPT)\n5. Reasoner - Yes (FLAN-T5", "citations": [], "input_tokens": 11637, "output_tokens": 512, "cost_micros": 42591, "latency_ms": 10953, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system use Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules:\n\n1. **Claim Verifier** (Section 3.1): Uses InstructGPT with in-context learning\n   - LLM-based \u2713\n\n2. **Question Generator** (Section 3.2): Uses InstructGPT for in-context learning\n   - LLM-based \u2713\n\n3. **Question Answering Model** (Section 3.3): Has three implementations:\n   - Retriever-Reader: Uses BM25 + RoBERTa-large fine-tuned on SQUAD\n   - FLAN-T5: Uses FLAN-T5 model\n   - GPT Reciter-Reader: Uses InstructGPT\n   - Since the system uses one of these at a time, and the default is GPT Reciter-Reader (mentioned in the text), this module uses LLM \u2713\n\n4. **QA Validator** (Section 3.4): Uses InstructGPT with demonstrations\n   - LLM-based \u2713\n\n5. **Reasoner** (Section 3.5): Has two implementations:\n   - FLAN-T5 (end-to-end QA model)\n   - InstructGPT\n   - Both options use LLMs \u2713\n\nAll five modules are implemented with Large Language Models as their backbone (either InstructGPT or FLAN-T5, both of which are LLMs).\n\nAnswer: 5", "citations": [], "input_tokens": 12753, "output_tokens": 375, "cost_micros": 43884, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system use Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules described in the document:\n\n1. **Claim Verifier** (Section 3.1): \"We build the claim verifier based on InstructGPT (Ouyang et al., 2022), utilizing its powerful in-context learning ability.\" - This uses LLM (InstructGPT).\n\n2. **Question Generator** (Section 3.2): \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning.\" - This uses LLM (InstructGPT).\n\n3. **Question Answering Module** (Section 3.3): This module has three different implementations:\n   - Retriever-Reader: Uses RoBERTa-large (which is a language model)\n   - FLAN-T5: \"we also incorporate the FLAN-T5 model\" - This is an LLM\n   - GPT Reciter-Reader: Uses InstructGPT - This is an LLM\n   \n   The default implementation uses GPT Reciter-Reader (LLM-based).\n\n4. **QA Validator** (Section 3.4): \"We again implement the validator by prompting InstructGPT with a suite of ten demonstrations\" - This uses LLM (InstructGPT).\n\n5. **Reasoner** (Section 3.5): \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Both implementations use LLMs.\n\nAll five modules are implemented with Large Language Models as their backbone (either as the primary or available implementation).\n\nAnswer: 5", "citations": [], "input_tokens": 8622, "output_tokens": 425, "cost_micros": 32241, "latency_ms": 10062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to identify which modules in the QACHECK system use Large Language Models (LLMs) as their backbone.\n\nLet me go through each of the five modules:\n\n1. **Claim Verifier** (Section 3.1): \"We build the claim verifier based on InstructGPT (Ouyang et al., 2022), utilizing its powerful in-context learning ability.\" - Uses LLM \u2713\n\n2. **Question Generator** (Section 3.2): \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning.\" - Uses LLM \u2713\n\n3. **Question Answering Model** (Section 3.3): This module has three different implementations:\n   - Retriever-Reader: Uses RoBERTa-large fine-tuned on SQuAD (this is a reader model, but the main approach is retrieval-based)\n   - FLAN-T5: This is an LLM \u2713\n   - GPT Reciter-Reader: Uses InstructGPT as the reciter \u2713\n   \n   The document states \"In the following, we use GPT Reciter-Reader as the default implementation for our QA module.\" So the default uses LLM.\n\n4. **QA Validator** (Section 3.4): \"We again implement the validator by prompting InstructGPT with a suite of ten demonstrations\" - Uses LLM \u2713\n\n5. **Reasoner** (Section 3.5): \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Both implementations use LLMs \u2713\n\nCounting the modules that use LLMs:\n1. Claim Verifier - LLM\n2. Question Generator - LLM\n3. Question Answering Model - LLM (default implementation)\n4. QA Validator - LLM\n5. Reasoner - LLM\n\nAll 5 modules use LLMs in their implementation (either exclusively or as the default/primary option).\n\nAnswer: 5", "citations": [], "input_tokens": 12522, "output_tokens": 503, "cost_micros": 45111, "latency_ms": 11405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the priority document to find information about modules implemented with Large Language Models backbone.Let me search for information about modules implemented with LLM backbone in the document.Now let me search for information about which modules use LLM backbone:Let me read more of the document to find details about the system architecture and which modules use Large Language Models:Now let me search for information specifically about which modules are implemented with Large Language Models:Based on the document, I can now identify which modules are implemented with Large Language Models (specifically InstructGPT). Let me verify this information:\n\nFrom the document, I can see that the QACHECK system has **five modules**:\n1. **Claim Verifier** - implemented with InstructGPT (Large Language Model)\n2. **Question Generator** - implemented with InstructGPT (Large Language Model)\n3. **Question Answering Model** - has three implementations, one of which (GPT Reciter-Reader) uses InstructGPT (Large Language Model)\n4. **QA Validator** - implemented with InstructGPT (Large Language Model)\n5. **Reasoner** - has two implementations, one of which uses InstructGPT (Large Language Model)\n\nHowever, I need to count how many modules are implemented with LLM backbone. Looking at the text more carefully:\n\n- **Claim Verifier**: Uses InstructGPT (LLM) \u2713\n- **Question Generator**: Uses InstructGPT (LLM) \u2713\n- **Question Answering Model**: Has 3 options (Retriever-Reader, FLAN-T5, GPT Reciter-Reader), but the QA module itself is not purely LLM-based in all implementations\n- **QA Validator**: Uses InstructGPT (LLM) \u2713\n- **Reasoner**: Has 2 implementations (FLAN-T5 and InstructGPT), one uses LLM \u2713\n\nLet me re-read to get the precise count of modules that ARE implemented with LLM backbone (not just have an option for it):\n\nFrom the text:\n- Claim Verifier: \"We build the claim verifier based on InstructGPT\" - **LLM**\n- Question Generator: \"we also leverage InstructGPT for in-context learning\" - **LLM**\n- QA Validator: \"We again implement the validator by prompting InstructGPT\" - **LLM**\n- Reasoner: \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - has LLM option but also non-LLM option\n\nThe Question Answering Model has multiple implementations, and the document states the system uses \"GPT Reciter-Reader as the default implementation\" which uses InstructGPT.\n\nSo counting modules that are implemented with LLM backbone: Claim Verifier (1), Question Generator (2), QA Validator (3), and considering the default implementations, the Reasoner could be 4.\n\nLet me check if there's a clearer statement in the document about this specific question.Let me read the sections more carefully to understand which modules specifically use LLM backbone:\n\nBased on my reading of the document:\n\n1. **Claim Verifier** (Section 3.1): \"We build the claim verifier based on InstructGPT\" - Uses LLM \u2713\n2. **Question Generator** (Section 3.2): \"Similar to the claim verifier, we also leverage InstructGPT for in-context learning\" - Uses LLM \u2713\n3. **Question Answering Model** (Section 3.3): Multiple implementations, NOT all use LLM\n4. **QA Validator** (Section 3.4): \"We again implement the validator by prompting InstructGPT\" - Uses LLM \u2713\n5. **Reasoner** (Section 3.5): \"We provide two different implementations for the reasoner: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\" - Has LLM option\n\nThe modules that are definitively implemented with LLM (InstructGPT) backbone are:\n1. Claim Verifier\n2. Question Generator  \n3. QA Validator\n\nThat's **3 modules** that are implemented with Large Language Models backbone.\n\nHowever, if we consider that the Reasoner also has an InstructGPT implementation and FLAN-T5 is also a Large Language Model, it could be 4. Let me verify if FLAN-T5 counts as an LLM.\n\nFLAN-T5 is described in the document as \"a Seq2Seq model pre-trained on more than 1.8K tasks with instruction tuning\" - this is indeed a Large Language Model.\n\nSo if the question is asking about modules that ARE implemented (not optionally, but actually) with LLM backbone:\n- Claim Verifier: InstructGPT (LLM) \u2713\n- Question Generator: InstructGPT (LLM) \u2713\n- QA Validator: InstructGPT (LLM) \u2713\n- Reasoner: Has two implementations, both are LLMs (FLAN-T5 and InstructGPT) \u2713\n\nThat would be **4 modules**.\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 100172, "error": null, "extra": {"thread_id": 2012, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5a2a7a1ab0194a9ba5958368d2c54734", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 686, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find the figure at the top of page 5 and identify the final step shown in it.\n\nLooking at page 5, I can see Figure 4, which shows a screenshot of the QACHECK user interface with numbered annotations.\n\nThe figure shows 4 numbered steps:\n1. Select or input a custom claim\n2. Submit to fact-check the input claim\n3. Visualize the question-answering guided reasoning process\n4. The final prediction result with rationale\n\nThe final step (step 4) is clearly labeled as \"The final prediction result with rationale\"\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 27168, "output_tokens": 141, "cost_micros": 83619, "latency_ms": 10530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 4 on page 5, which shows the QACHECK user interface screenshot, the final step (labeled as \"4\") shows:\n\n\"The final prediction result with rationale\"\n\nThis section displays:\n- \"Prediction with rationale:\"\n- The reasoning explanation\n- \"Finish\"\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 11656, "output_tokens": 80, "cost_micros": 36168, "latency_ms": 4530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 4 (the screenshot at the top of page 5), I can see the interface is annotated with 4 numbered sections:\n\n1. Select or input a custom claim\n2. Submit to fact-check the input claim\n3. Visualize the question-answering guided reasoning process\n4. The final prediction result with rationale\n\nThe fourth and final step shows \"Prediction with rationale:\" followed by the reasoning and final answer.\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 12772, "output_tokens": 111, "cost_micros": 39981, "latency_ms": 5250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at the figure at the top of page 5 (Figure 4), I can see the system interface showing the fact-checking process for the claim \"Lars Onsager won the Nobel prize when he was 30 years old.\"\n\nThe final step shown in the figure is labeled \"The final prediction result with rationale\" which displays:\n\n\"Lars Onsager won the Nobel prize in 1968. Lars Onsager was born in 1903. He was 65 when he won the Nobel prize. Therefore, the final answer is: False.\"\n\nAnswer: The final prediction result with rationale", "citations": [], "input_tokens": 8641, "output_tokens": 135, "cost_micros": 27948, "latency_ms": 5422, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.9090909090909091, "method": "str_norm", "normalised_pred": "final prediction result with rationale", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Figure 2 on page 5, I need to identify the final step in the architecture diagram.\n\nThe flow shows:\n1. Claim \u2192 Claim Verifier\n2. Question Generator \u2192 QA Model \u2192 Validator \u2192 Relevant Context (in a loop)\n3. Claim Verifier \u2192 Reasoner\n4. Reasoner \u2192 Label\n\nThe final step in the process is when the Reasoner outputs the Label.\n\nAnswer: Label", "citations": [], "input_tokens": 12541, "output_tokens": 104, "cost_micros": 39183, "latency_ms": 4155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "label", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q004", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "4. The final prediction result with rationale.", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find the figure at the top of page 5 and identify the final step.Let me search for page 5 content in the document:Let me search for page markers and figures in the document:Let me read around line 819 where Figure 4 is mentioned, as this might be on page 5:Perfect! I can see Figure 4 which appears to be on page 5 (after the PageBreak marker at line 815). Looking at the figure, I can see it shows a step-by-step process. Let me examine the final step more carefully:\n\nThe figure shows 4 numbered steps:\n1. Select or input a custom claim\n2. Submit to fact-check the input claim\n3. Visualize the question-answering guided reasoning process\n4. The final prediction result with rationale\n\nThe final step (step 4) shows: \"Prediction with rationale:\" followed by the text and ending with \"**Finish**\".\n\nLooking at the structure, the final step in the figure is marked as \"**Finish**\" at line 888.\n\nAnswer: Finish", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 33953, "error": null, "extra": {"thread_id": 2013, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ca5f20bedd764740aef245cbc4ae41c8", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 195, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finish", "normalised_gold": "4 final prediction result with rationale"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 2 on page 2 and Figure 3 on page 4:\n\nIn Figure 2 (the system architecture), the \"Reasoner\" module is shown in a blue rectangle.\n\nIn Figure 3 (showing the three QA model implementations), the \"Reader\" component appears in orange/coral rectangles in parts (a) and (c).\n\nHowever, the question asks about a model rectangle that appears in BOTH the QA model and Reasoner modules. Looking at the figures more carefully:\n\n- In Figure 3(b), the \"FLAN-T5\" box appears in yellow\n- In Figure 3(c), the \"Reader\" boxes appear in orange\n- The \"Reasoner\" in the main architecture uses similar model components\n\nThe \"Reader\" model component appears in an orange/coral colored rectangle in both the QA module implementations and is used in the reasoning process.\n\nAnswer: Orange", "citations": [], "input_tokens": 27172, "output_tokens": 206, "cost_micros": 84606, "latency_ms": 10108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "orange", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at the figure on page 4 that shows the QA model and Reasoner modules.\n\nLooking at Figure 2 on page 4, which shows \"The architecture of our QACHECK system,\" I can see the system architecture with various modules including:\n- Claim Verifier\n- Question Generator\n- QA Model\n- Validator\n- Reasoner\n\nThe figure shows these modules connected with arrows. However, this figure does not clearly show colored rectangles that would indicate a specific model appearing in both the QA Model and Reasoner modules.\n\nLooking more carefully at the document, Figure 3 on page 5 shows \"Illustrations of the three different implementations of the Question Answering module in QACHECK\" with three subfigures (a), (b), and (c). In subfigure (a) \"Retriever-Reader\" and subfigure (c) \"GPT Reciter-Reader\", there is a \"Reader\" component shown.\n\nAccording to the text in Section 3.5, the Reasoner \"is a special question-answering model\" and the text mentions that both the QA module (Figure 3) and potentially the Reasoner use a \"Reader\" component.\n\nHowever, the document is in grayscale/black and white, so I cannot determine colors from the PDF provided.\n\nAnswer: Cannot determine - document appears to be in grayscale", "citations": [], "input_tokens": 11660, "output_tokens": 312, "cost_micros": 39660, "latency_ms": 9907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot determine document appears to be in grayscale", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Figure 2 on page 4, I need to identify the rectangles that appear in both the \"QA Model\" and \"Reasoner\" modules.\n\nIn the architecture diagram (Figure 2), the QA Model outputs \"(Q, A)\" pairs, and the Reasoner outputs \"Label\". \n\nThe component that connects both modules is the \"Wikipedia Corpus\" which is shown as a cylinder/database shape that provides evidence to the QA Model, and the accumulated context from Q&A pairs feeds into the Reasoner.\n\nHowever, looking more carefully at the diagram structure, both the QA Model and Reasoner are shown as rectangular boxes in the flowchart. The rectangles representing these modules themselves appear to be gray/neutral colored boxes in the system architecture.\n\nBut examining the actual visual elements more carefully, there isn't a colored rectangle that appears in both specific modules as separate components. The modules themselves are represented as boxes in the flow diagram.\n\nAnswer: gray", "citations": [], "input_tokens": 12776, "output_tokens": 212, "cost_micros": 41508, "latency_ms": 7203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "gray", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to look at the figure on page 4 to identify the color of the model rectangle that appears in both the QA model and Reasoner modules.\n\nLooking at Figure 2 on page 3 (which shows the system architecture), I can see the general architecture diagram. However, the question asks specifically about page 4.\n\nOn page 4, I can see Figure 3, which shows three different QA model implementations:\n(a) Retriever-Reader\n(b) FLAN-T5\n(c) GPT Reciter-Reader\n\nLooking at the diagram, I need to identify which model appears in both the QA module and the Reasoner module. According to the text in section 3.5, the Reasoner has two implementations: \"1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model.\"\n\nIn Figure 3, I can see rectangles representing different models. The FLAN-T5 model is shown in Figure 3(b), and this same model is mentioned as being used in the Reasoner module.\n\nLooking at the visual representation in Figure 3(b), the FLAN-T5 rectangle appears to be blue in color.\n\nAnswer: blue", "citations": [], "input_tokens": 8645, "output_tokens": 287, "cost_micros": 30240, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "blue", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine Figure 2 on page 4, which shows \"The architecture of our QACHECK system.\"\n\nLooking at the architecture diagram, I can see several modules represented as rectangles. The question asks about a rectangle that appears in both the QA Model and Reasoner modules.\n\nIn the diagram, I can see:\n- A \"QA Model\" box in the loop section\n- A \"Reasoner\" box after the loop\n- Both of these modules appear to have the same visual representation\n\nHowever, looking more carefully at the figure description and the actual diagram components, the modules that represent model components (like the Question Answering module and Reasoner) are shown as rectangles in the flowchart.\n\nThe document shows these as standard flowchart boxes, but I cannot definitively determine a specific color from the description provided. Looking at typical diagram conventions and the figure as presented, these model components appear to be represented in standard boxes.\n\nWithout being able to see the actual colors in the PDF image clearly described in the text, I cannot make a definitive statement about the specific color.\n\nAnswer: Cannot be determined from the provided text", "citations": [], "input_tokens": 12545, "output_tokens": 247, "cost_micros": 41340, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "cannot be determined from provided text", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q005", "doc_id": "2310.07609v1.pdf", "answer_format": "str", "gold": "Yellow", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the figure on page 4 and identify the color of the model rectangle that appears in both the QA model and Reasoner module.Let me search for page 4 content and figures in the document:Let me search for figure-related content and read more of the document to find page 4:Let me read the section containing Figure 3 which appears to show different implementations of the QA module:This is Figure 3. Let me continue reading to find more figures and look for page markers:Let me search for page numbers or section breaks to locate page 4:Let me read around the fourth PageBreak (line 815) which should be around page 4:This is Figure 4, which shows a user interface. Let me continue reading to see if there are more visual elements described:Now I need to go back and look at Figure 2, which shows the architecture of the QACHECK system. That figure should show both the QA model and Reasoner module. Let me read around line 416 where Figure 2 is mentioned:I see Figure 2 is described, but I need to look at more details about the visual representation. Let me search for information about colors in the figure or read earlier to see the full figure description:The embedded image description in Figure 2 only describes a yellow/golden amber circular icon with a question mark. But the question asks about a model rectangle that appears in both QA model and Reasoner modules. Let me search for more complete visual descriptions or look for other figures. Let me check if there's more detailed architecture visualization:Let me read that chunk:Let me read that section:This doesn't seem to be the right figure. Let me search more specifically for information about the architecture diagram or look for mentions of colors in model components:Let me search for \"green\" or other colors that might appear in the architecture diagram:The green icons appear to be information icons. Let me look at the text structure of Figure 2 more carefully. The figure shows \"QA Model\" and \"Reasoner\" as separate boxes. Let me search for \"orange\" or \"red\" colors:Let me try searching for \"model\" and \"reasoner\" together or look at the complete Figure 2 text more carefully:Let me look at a different approach. The question mentions \"page 4\", so let me count the PageBreaks more carefully. Let me look between the 3rd and 4th PageBreak which should be page 4:So Figure 3 is on page 4 (after the 3rd PageBreak at line 648). Figure 3 shows the three different implementations of the Question Answering module. The visual description mentions a green circular icon with \"i\". But this doesn't show both QA model and Reasoner module together.\n\nLet me look for if there are more images or visual descriptions in Figure 3:I see in Figure 3 there are three parts (a), (b), and (c) showing different QA implementations. Each shows \"Reader\" as a component, and there's a green circular icon. But I need to understand the question better - it asks about a \"model rectangle\" that appears in both QA model and Reasoner module.\n\nLet me re-read Figure 2 which shows the overall architecture, and look for visual elements that might represent both components. Let me search for mentions of \"Reader\" since that appears in both:I notice that \"Reader\" appears in Figure 3 as a component. In the figure, the description says there's a green circular icon with \"i\". The \"Reader\" component in the Retriever-Reader, FLAN-T5, and GPT Reciter-Reader architectures likely uses the same visual element. But the question asks about something that appears in both \"QA model\" and \"Reasoner module\".\n\nWait - let me re-read the architecture description. The Reasoner is described as \"a special question-answering model\". So both might use the same underlying reader model. Let me look at where the reader model is used in Figure 3. The green \"i\" icon might be the answer indicator. But the question asks about a \"model rectangle\" not an icon.\n\nLet me think about this differently. Perhaps the document has better visual descriptions elsewhere. Let me search for all embedded images:There are multiple embedded images. The question specifically mentions Figure on page 4. Based on the PageBreaks, page 4 would be between line 648 and 815. Let me look more carefully at what figures or diagrams might show architectural components with rectangles:\n\nLooking back at Figure 3 (lines 651-707), it shows three sub-figures (a), (b), and (c) representing different QA implementations. Each has components like \"Retriever\", \"Reader\", \"FLAN-T5\", \"InstructGPT\". These might be shown as rectangles in the original figure.\n\nIn Figure 3, all three QA implementations show a \"Reader\" component. And the text mentions that the Reasoner module (section 3.5, lines 919-933) is \"a special question-answering model\" that can be implemented using \"FLAN-T5\" or \"InstructGPT model\".\n\nSo FLAN-T5 appears in both:\n- In Figure 3(b) as the QA model implementation  \n- In the Reasoner as one of the two implementation options\n\nLet me check if there's a visual description that mentions FLAN-T5 as a colored rectangle. Let me search for \"FLAN\" in visual descriptions:Perfect! So FLAN-T5 is used in BOTH:\n1. Figure 3(b) - as a QA Model implementation\n2. Section 3.5 (Reasoner) - as one of the two reasoner implementations\n\nNow I need to find the visual description showing what color the FLAN-T5 rectangle is. Let me look more carefully at the embedded images around Figure 3. However, the visual description at line 719 only describes the green \"i\" icon, not the model boxes themselves.\n\nLet me check if there are more visual descriptions or embedded images in the Figure 3 section. The document might have multiple embedded images for the same figure. Let me read more around that area:These are descriptions of other images. Let me look at the specific section where Im3.png first appears (around line 711) and see if there are additional visual descriptions I missed:Still showing the green \"i\" icon. Let me look at Im5.jpg which mentions GPT3 - this might be describing the model boxes:Im5.jpg shows a GPT3 logo with purple and pink/magenta colors. But I need to find descriptions of the model rectangles/boxes in Figure 3. Let me search for descriptions that mention \"rectangle\" or \"box\" with colors:Purple appears in the GPT3 logo. Since the question asks about a component that appears in BOTH QA Model and Reasoner, and we know FLAN-T5 is used in both, let me look for visual descriptions that might show FLAN-T5 box. Let me search for \"yellow\" since that was mentioned earlier:Yellow appears in question mark icons. Let me try a different approach. Let me look at all the visual descriptions more systematically. Let me check descriptions that mention model names or rectangular shapes directly. Let me search for visual descriptions that mention boxes or rectangles with specific colors:Not helpful. Let me think differently. The embedded images are specific parts of the figure. Maybe there are multiple embedded images within Figure 3 that I haven't found yet. Let me look at the embedded images that appear chronologically around Figure 3 content. Figure 3 is around lines 651-720. Let me check what other images might be part of it:\n\nActually, wait - let me reconsider. Looking at the OCR text, Im3.png shows \"i\", Im5.jpg shows \"GPT3\". These are components within the figures. Let me look for images that might show \"FLAN-T5\" or \"FLAN\" text:FLANonly appears as text, not in embedded image OCR. Let me look for embedded images with OCR text that might contain model names. Let me check if there are images showing \"T5\" or looking at what other embedded images show:All I'm finding are green \"i\" icons. Let me try a completely different approach. Perhaps I need to look at the actual figure diagram structure more carefully. In Figure 3, there are three sub-diagrams showing different QA implementations. Each has boxes/rectangles for components. Looking at the text structure:\n\nFigure 3(a): Retriever-Reader has \"Retriever\" and \"Reader\" boxes\nFigure 3(b): FLAN-T5 has \"FLAN-T5\" box  \nFigure 3(c): GPT Reciter-Reader has \"InstructGPT\" and \"Reader\" boxes\n\nThe common element between Figure 3 and the Reasoner would be the model itself. Since both QA module and Reasoner can use FLAN-T5, I should look for a visual description of the FLAN-T5 box/rectangle.\n\nLet me search through all the visual descriptions more systematically for anything mentioning model boxes or FLAN. Actually, let me look for embedded images that have model-related OCR text. Let me look for embedded images around the areas discussing models:\n\nWait, I should look for the actual image file names and descriptions that might show text boxes or model components. Let me search for descriptions that mention \"text\" and colors:Turquoise/cyan blue! Let me read that chunk:This turquoise/cyan blue button shows \"Finish\" - that's from Figure 4 (the UI screenshot). Let me look systematically for all visual descriptions that mention rectangular shapes or boxes. Actually, let me search for specific model component descriptions. Let me look for \"RoBERTa\" which is mentioned as the Reader model:RoBERTa is used in the Reader model but there's no visual description mentioning it. Let me try to systematically go through the embedded images and look for any that might show architectural component boxes. Let me look at the images that haven't been described yet. Let me check Im2.png more thoroughly since that's Figure 2:These are still just the icons. Let me try a completely different approach. Maybe the XML parser extracted images separately from the diagram. Let me look at the beginning of the document to see if there are more detailed figure descriptions I'm missing:This is Figure 1. The visual content shows search icons and question marks. Let me consider: maybe the XML extraction treats architectural diagram components differently. Perhaps the model boxes themselves don't have detailed color descriptions, or the color information is embedded in a way I haven't found yet.\n\nLet me reconsider what the question is actually asking. It says \"the model rectangle in the figure of page 4 that appears both in QA model and Reasoner module\". If Figure 3 is on page 4, and Figure 3 shows three different QA implementations, then the component that would also appear in the Reasoner would be the UNDERLYING MODEL.\n\nLooking back at the text:\n- Section 3.3 (QA module) describes three implementations, one being FLAN-T5 (line 675-686)\n- Section 3.5 (Reasoner) says it can be implemented using \"the end-to-end QA model based on FLAN-T5\" (line 931-932)\n\nSo FLAN-T5 is the model that appears in both. Now I need to find what color the FLAN-T5 rectangle is shown in Figure 3. Since the visual descriptions I've found so far only show icons (green \"i\", yellow \"?\", orange \"?\"), maybe the model boxes themselves are not described with specific colors in the extracted XML, or I need to look elsewhere.\n\nLet me try to find if there's any systematic description of the figure architecture. Maybe there's a legend or color scheme description:Turquoise appears as a \"Finish\" button and footer element. But wait, let me read chunk 155824 which mentions a layout with multiple colored sections:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 328655, "error": null, "extra": {"thread_id": 2014, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4736292b32d6461e98620f394ccee4da", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 2072, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "let me try to find if there s any systematic description of figure architecture maybe there s legend or color scheme description turquoise appears as finish button and footer element but wait let me read chunk 155824 which mentions layout with multiple colored sections", "normalised_gold": "yellow"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the prompt instruction examples shown in the main body of the paper (not in the appendix).\n\nLooking through the main body of the paper:\n\n1. **Section 3.1 (Claim Verifier)** - Shows 1 prompt instruction example:\n   - \"Claim = CLAIM / We already know the following: CONTEXT / Can we know whether the claim is true or false now? Yes or no?\"\n\n2. **Section 3.2 (Question Generator)** - Shows 2 prompt instruction examples:\n   - Initial question: \"Claim = CLAIM / To verify the above claim, we can first ask a simple question:\"\n   - Follow-up question: \"Claim = CLAIM / We already know the following: CONTEXT / To verify the claim, what is the next question we need to know the answer to?\"\n\n3. **Section 3.4 (QA Validator)** - Shows 1 prompt instruction example:\n   - \"Claim = CLAIM / We already know the following: CONTEXT / Now we further know: NEW QA PAIR / Does the QA pair have additional knowledge useful for verifying the claim?\"\n\n4. **Section 3.5 (Reasoner)** - The text mentions prompts are in \"Appendix A.4\" but doesn't show the full prompt in the main body.\n\nTotal prompts shown in the main body: 1 + 2 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 27159, "output_tokens": 335, "cost_micros": 86502, "latency_ms": 9625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of prompt instruction examples listed in the main body of the paper.\n\nLooking through the document, I can find the following prompt instructions in the main body (not in the Appendix):\n\n1. **Claim Verifier prompt** (Section 3.1):\n```\nClaim = CLAIM We already know the following:\nCONTEXT\nCan we know whether the claim is true or false now? Yes or no?\n```\n\n2. **Question Generator - Initial question prompt** (Section 3.2):\n```\nClaim = CLAIM\nTo verify the above claim, we can first ask a simple question:\n```\n\n3. **Question Generator - Follow-up question prompt** (Section 3.2):\n```\nClaim = CLAIM\nWe already know the following: CONTEXT\nTo verify the claim, what is the next question we need to know the answer to?\n```\n\n4. **QA Validator prompt** (Section 3.4):\n```\nClaim = CLAIM\nWe already know the following:\nCONTEXT\nNow we further know: NEW QA PAIR\nDoes the QA pair have additional knowledge useful for verifying the claim?\n```\n\nThat gives us 4 prompt instruction examples in the main body of the paper.\n\nAnswer: 4", "citations": [], "input_tokens": 11647, "output_tokens": 289, "cost_micros": 39276, "latency_ms": 5532, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the prompts/instruction examples that are explicitly shown in the main body of the paper (not in the Appendix).\n\nLooking through the main body of the paper:\n\n1. **Section 3.1 (Claim Verifier)** - Shows 1 prompt instruction:\n   ```\n   Claim = CLAIM\n   We already know the following:\n   CONTEXT\n   Can we know whether the claim is true or false now? Yes or no?\n   ```\n\n2. **Section 3.2 (Question Generator)** - Shows 2 prompt instructions:\n   - For initial question generation:\n   ```\n   Claim = CLAIM\n   To verify the above claim, we can first ask a simple question:\n   ```\n   \n   - For follow-up questions:\n   ```\n   Claim = CLAIM\n   We already know the following:\n   CONTEXT\n   To verify the claim, what is the next question we need to know the answer to?\n   ```\n\n3. **Section 3.4 (QA Validator)** - Shows 1 prompt instruction:\n   ```\n   Claim = CLAIM\n   We already know the following:\n   CONTEXT\n   Now we further know:\n   NEW QA PAIR\n   Does the QA pair have additional knowledge useful for verifying the claim?\n   ```\n\nTotal prompts in the main body: 1 + 2 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 12763, "output_tokens": 336, "cost_micros": 43329, "latency_ms": 6578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of prompt instruction examples listed in the main body of the paper (excluding the Appendix).\n\nLet me go through the main body of the paper systematically:\n\n1. **Section 3.1 (Claim Verifier)**: The text mentions \"We prompt InstructGPT with ten distinct in-context examples as detailed in Appendix A.1\" - but no examples are shown in the main body, only referenced.\n\n2. **Section 3.2 (Question Generator)**: Two prompts are shown:\n   - Initial question generation prompt (1 prompt)\n   - Follow-up question generation prompt (1 prompt)\n\n3. **Section 3.3 (Question Answering Model)**: No complete prompt examples are shown in the main body.\n\n4. **Section 3.4 (QA Validator)**: One prompt instruction is shown for the validator module (1 prompt)\n\n5. **Section 3.5 (Reasoner)**: The text mentions \"the InstructGPT model with the prompts given in Appendix A.4\" - but no examples are shown in the main body.\n\nCounting the prompts actually displayed in the main body:\n- Section 3.2: 2 prompts (initial question generation + follow-up question generation)\n- Section 3.4: 1 prompt (QA validator)\n\nTotal: 3 prompts in the main body\n\nAnswer: 3", "citations": [], "input_tokens": 8632, "output_tokens": 331, "cost_micros": 30861, "latency_ms": 7125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to count the number of prompt instruction examples listed in the main body of the paper (not in the Appendix).\n\nLooking through the main body of the paper:\n\n1. **Section 3.1 (Claim Verifier)** - 1 prompt instruction:\n   - \"Claim = CLAIM... Can we know whether the claim is true or false now? Yes or no?\"\n\n2. **Section 3.2 (Question Generator)** - 2 prompt instructions:\n   - Initial question generation: \"Claim = CLAIM... To verify the above claim, we can first ask a simple question:\"\n   - Follow-up questions: \"Claim = CLAIM... We already know the following: CONTEXT... To verify the claim, what is the next question we need to know the answer to?\"\n\n3. **Section 3.4 (QA Validator)** - 1 prompt instruction:\n   - \"Claim = CLAIM... We already know the following: CONTEXT... Now we further know: NEW QA PAIR... Does the QA pair have additional knowledge useful for verifying the claim?\"\n\nTotal prompts in the main body: 1 + 2 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 12532, "output_tokens": 279, "cost_micros": 41781, "latency_ms": 5687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q006", "doc_id": "2310.07609v1.pdf", "answer_format": "int", "gold": "4", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about instruction examples in the main body of the paper.Let me continue reading to find information about instruction examples in the main body of the paper.Let me read the Appendix section to find the prompts and instruction examples.Perfect! I can now see the prompts section clearly. Let me count the instruction examples mentioned. Looking at the document, I can see:\n\n1. **A.1 Prompts for Claim Verifier** - Shows \"<10 demonstrations in total>\"\n2. **A.2 Prompts for Question Generation** (initial question) - Shows \"<10 demonstrations in total>\"\n3. **A.2 Prompts for follow-up question generation** - Shows \"<10 demonstrations in total>\"\n4. **A.3 Prompts for Validator** - Shows \"<10 demonstrations in total>\"\n5. **A.4 Prompts for Reasoner** - Shows \"<10 demonstrations in total>\"\n\nHowever, the question asks about \"instruction examples listed in the main body of the paper.\" Let me check if there's more specific information in the main body about how many examples are actually shown.Let me search more specifically for mentions of how many examples are shown in the main body.Great! I found it. Let me read line 582 more carefully. The main body states: \"We prompt InstructGPT with **ten distinct in-context examples** as detailed in Appendix A.1\"\n\nLet me verify this is the answer by checking if there are other mentions of specific numbers in the main body.Perfect! I found another mention in line 790: \"with a suite of **ten demonstrations** shown in Appendix A.3\"\n\nLet me search for all instances where numbers of examples/demonstrations are mentioned in the main body.Based on my reading of the document, I found that in the main body of the paper, there are **two explicit mentions** of the number of instruction examples/demonstrations:\n\n1. Line 582: \"We prompt InstructGPT with **ten distinct in-context examples**\" (for the Claim Verifier)\n2. Line 790: \"with a suite of **ten demonstrations**\" (for the Validator)\n\nHowever, the question asks specifically \"How many prompts instruction examples are listed in the main body of the paper?\" - it asks for the COUNT/NUMBER that is mentioned, not how many times it's mentioned.\n\nAccording to the main body:\n- 10 examples for Claim Verifier (mentioned on line 582)\n- 10 demonstrations for Validator (mentioned on line 790)\n\nSince both mention \"10\" and the question asks for the number in float format and wants a single integer answer, the answer is:\n\n**10**\n\nAnswer: 10", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 77358, "error": null, "extra": {"thread_id": 2015, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_302989d449c9437a85f8635948950b78", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 438, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "10", "normalised_gold": "4"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "native_pdf", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance scores for ProgramFC and QACheck (the proposed system) in the HOVER dataset from Table 1.\n\nFrom Table 1 on page 6:\n\n**HOVER dataset:**\n- ProgramFC scores: 54.27 (2-hop), 54.18 (3-hop), 52.88 (4-hop)\n- QACheck scores: 55.67 (2-hop), 54.67 (3-hop), 52.35 (4-hop)\n\nNow I'll calculate the performance gap for each:\n- 2-hop: 55.67 - 54.27 = 1.40\n- 3-hop: 54.67 - 54.18 = 0.49\n- 4-hop: 52.35 - 52.88 = -0.53\n\nAverage performance gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.453333...\n\nRounded to 2 decimal places: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 27174, "output_tokens": 268, "cost_micros": 85542, "latency_ms": 9796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "azure_basic_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck on the HOVER dataset.\n\nLooking at Table 1 in the document, I can find the F1 scores for both systems on HOVER:\n\n**HOVER 2-hop:**\n- ProgramFC: 54.27\n- QACheck: 55.67\n- Gap: 55.67 - 54.27 = 1.40\n\n**HOVER 3-hop:**\n- ProgramFC: 54.18\n- QACheck: 54.67\n- Gap: 54.67 - 54.18 = 0.49\n\n**HOVER 4-hop:**\n- ProgramFC: 52.88\n- QACheck: 52.35\n- Gap: 52.35 - 52.88 = -0.53\n\nTo calculate the average performance gap across all three HOVER subsets:\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimals: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 11662, "output_tokens": 275, "cost_micros": 39111, "latency_ms": 5983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "azure_premium_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck on the HOVER dataset.\n\nFrom Table 1, the F1 scores are:\n\n**HOVER 2-hop:**\n- ProgramFC: 54.27\n- QACheck: 55.67\n- Gap: 55.67 - 54.27 = 1.40\n\n**HOVER 3-hop:**\n- ProgramFC: 54.18\n- QACheck: 54.67\n- Gap: 54.67 - 54.18 = 0.49\n\n**HOVER 4-hop:**\n- ProgramFC: 52.88\n- QACheck: 52.35\n- Gap: 52.35 - 52.88 = -0.53\n\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimals: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 12778, "output_tokens": 247, "cost_micros": 42039, "latency_ms": 5812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "llamacloud_basic_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck on the HOVER dataset.\n\nFrom Table 1 in the document, I can see the performance results:\n\nFor HOVER dataset:\n- ProgramFC: 54.27\n- QACheck: 55.67\n\nFor 2-hop:\n- ProgramFC: 54.18\n- QACheck: 54.67\n\nFor 3-hop:\n- ProgramFC: 52.88\n- QACheck: 52.35\n\nFor 4-hop:\n- ProgramFC: 59.66\n- QACheck: 59.47\n\nThe HOVER dataset has three subsets (2-hop, 3-hop, 4-hop) based on reasoning depth.\n\nTo calculate the average performance gap:\n- 2-hop gap: 54.67 - 54.18 = 0.49\n- 3-hop gap: 52.35 - 52.88 = -0.53\n- 4-hop gap: 59.47 - 59.66 = -0.19\n\nAverage gap = (0.49 + (-0.53) + (-0.19)) / 3 = -0.23 / 3 = -0.0767\n\nRounded to 2 decimal places: -0.08\n\nAnswer: -0.08", "citations": [], "input_tokens": 8647, "output_tokens": 319, "cost_micros": 30726, "latency_ms": 6530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "-0.08", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "llamacloud_premium_lc", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I need to find the performance gap between ProgramFC and QACheck in the HOVER dataset from Table 1.\n\nFrom Table 1, the F1 scores are:\n\n**HOVER 2-hop:**\n- ProgramFC: 54.27\n- QACheck: 55.67\n- Gap: 55.67 - 54.27 = 1.40\n\n**HOVER 3-hop:**\n- ProgramFC: 54.18\n- QACheck: 54.67\n- Gap: 54.67 - 54.18 = 0.49\n\n**HOVER 4-hop:**\n- ProgramFC: 52.88\n- QACheck: 52.35\n- Gap: 52.35 - 52.88 = -0.53\n\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimal places: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 12547, "output_tokens": 251, "cost_micros": 41406, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.07609v1.pdf::Q007", "doc_id": "2310.07609v1.pdf", "answer_format": "float", "gold": "0.45", "pages": 10, "document_id": 7591, "arm": "surfsense_agentic", "question_id": "2310.07609v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about ProgramFC performance and the proposed system on the HOVER dataset.Let me search for tables or sections containing performance comparisons on the HOVER dataset.Let me read more of the document to find the tables with performance results.Let me search for tables with results and performance metrics.Perfect! I found Table 1 with the performance results. Let me extract the exact values:\n\nFrom Table 1, for the HOVER dataset:\n- **ProgramFC**: \n  - 2-hop: 54.27\n  - 3-hop: 54.18\n  - 4-hop: 52.88\n\n- **QACheck (proposed system)**:\n  - 2-hop: 55.67\n  - 3-hop: 54.67\n  - 4-hop: 52.35\n\nNow I need to calculate the average performance gap between QACheck and ProgramFC:\n\nFor each hop level:\n- 2-hop gap: 55.67 - 54.27 = 1.40\n- 3-hop gap: 54.67 - 54.18 = 0.49\n- 4-hop gap: 52.35 - 52.88 = -0.53\n\nAverage gap = (1.40 + 0.49 + (-0.53)) / 3 = 1.36 / 3 = 0.45333...\n\nRounded to 2 decimals: 0.45\n\nAnswer: 0.45", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 51640, "error": null, "extra": {"thread_id": 2016, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b7dbcb3c5c4d4293862871c529bf39fc", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 231, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "0.45", "normalised_gold": "0.45"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "native_pdf", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify the main tasks the authors conducted experiments on by examining the document.\n\nFrom the abstract and introduction (page 1), the authors state:\n\"More in detail, we first investigate the deficiency of LLMs in logical reasoning on different tasks, including event relation extraction and deductive reasoning.\"\n\nFrom Section 2 PRELIMINARIES (page 2), the document explicitly describes two main tasks:\n- Section 2.1: Event Relation Extraction\n- Section 2.2: Deductive Reasoning\n\nThese are consistently mentioned throughout the paper as the primary experimental tasks:\n- The pilot study (Section 3.1.1) uses \"MAVEN-ERE and ProofWriter\" \n- Main experiments (Section 4.1) evaluate on \"MAVEN-ERE, Causal-TimeBank, and ProofWriter\"\n- MAVEN-ERE and Causal-TimeBank are event relation extraction datasets\n- ProofWriter is a deductive reasoning dataset\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 65638, "output_tokens": 233, "cost_micros": 200409, "latency_ms": 34842, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.09158v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "azure_basic_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document, I can identify the two main tasks that the authors conducted experiments on:\n\n1. **Event Relation Extraction (ERE)** - This is extensively discussed throughout the paper, with experiments on datasets like MAVEN-ERE and Causal-TimeBank. The task involves identifying coreference, temporal, causal, and subevent relations between events.\n\n2. **Deductive Reasoning** - This is the second main task, with experiments conducted on the ProofWriter dataset. The task involves logical reasoning to prove, disprove, or determine if statements are unknown based on given facts and rules.\n\nThese two tasks are explicitly mentioned in the abstract: \"we first investigate the deficiency of LLMs in logical reasoning on different tasks, including event relation extraction and deductive reasoning.\"\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 28196, "output_tokens": 189, "cost_micros": 87423, "latency_ms": 6483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "azure_premium_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document, I can identify the two main tasks where experiments were conducted.\n\nFrom the Abstract: \"we first investigate the deficiency of LLMs in logical reasoning on different tasks, including event relation extraction and deductive reasoning.\"\n\nFrom Section 2 (Preliminaries): \"In this section, we first introduce two tasks that this paper mainly explores.\" followed by Section 2.1 \"Event Relation Extraction\" and Section 2.2 \"Deductive Reasoning\"\n\nFrom the experimental setup sections, both tasks are consistently used throughout the paper for evaluation (Tables 1, 2, 3, and various figures reference MAVEN-ERE/Causal-TimeBank for ERE and ProofWriter for deductive reasoning).\n\nAnswer: event relation extraction, deductive reasoning", "citations": [], "input_tokens": 34236, "output_tokens": 175, "cost_micros": 105333, "latency_ms": 7014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "llamacloud_basic_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document for the main experimental tasks:\n\nFrom the Introduction and throughout the paper, the authors clearly state they conduct investigations on two main tasks:\n\n1. **Event Relation Extraction (ERE)** - mentioned extensively, including datasets MAVEN-ERE and Causal-TimeBank\n2. **Deductive Reasoning** - mentioned alongside ERE, including the ProofWriter dataset\n\nThis is confirmed in multiple places:\n- Section 2 \"PRELIMINARIES\" has subsections 2.1 \"Event Relation Extraction\" and 2.2 \"Deductive Reasoning\"\n- Section 3.1.1 states: \"We conduct a manual evaluation on MAVEN-ERE and ProofWriter\"\n- The abstract mentions \"event relation extraction and deductive reasoning\"\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 26309, "output_tokens": 193, "cost_micros": 81822, "latency_ms": 6296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "llamacloud_premium_lc", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document, I can identify the two main tasks the authors conducted experiments on:\n\n1. **Event Relation Extraction (ERE)** - This task aims to identify relations (Coreference, Temporal, Causal, and Subevent) between two events in text. The authors used datasets like MAVEN-ERE and Causal-TimeBank for this task.\n\n2. **Deductive Reasoning** - This task involves beginning with known facts and rules, then iteratively making new inferences. The authors used the ProofWriter dataset for this task.\n\nThese two tasks are explicitly mentioned in Section 2 (PRELIMINARIES) where the authors state: \"In this section, we first introduce two tasks that this paper mainly explores.\" The section then proceeds with subsections 2.1 Event Relation Extraction and 2.2 Deductive Reasoning.\n\nAnswer: Event Relation Extraction, Deductive Reasoning", "citations": [], "input_tokens": 34535, "output_tokens": 211, "cost_micros": 106770, "latency_ms": 6907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q000", "doc_id": "2310.09158v1.pdf", "answer_format": "list", "gold": "['event relation extraction', 'deductive reasoning']", "pages": 23, "document_id": 7592, "arm": "surfsense_agentic", "question_id": "2310.09158v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to identify the two main tasks the author conducted experiments on.Let me continue reading to find the specific tasks mentioned:Based on the document, I can see that the author evaluated LLMs in two practical scenarios:\n\n1. Event relation extraction (lines 473)\n2. Deductive reasoning tasks (lines 480-481)\n\nAnswer: event relation extraction, deductive reasoning", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 57891, "error": null, "extra": {"thread_id": 2017, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_fc47d9b3789248779051669f324d1694", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 114, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "deductive reasoning, event relation extraction", "normalised_gold": "deductive reasoning, event relation extraction"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "native_pdf", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find information about the number of possible combinations of relationships between two events.\n\nFrom the document, I can see in Section 2.1 (Event Relation Extraction) that the task involves identifying four types of relations between two events:\n\n1. **Coreference relations**: 2 possible labels (NO_COREFERENCE, COREFERENCE)\n2. **Temporal relations**: 7 possible labels (NO_TEMPORAL, BEFORE, OVERLAP, CONTAINS, SIMULTANEOUS, ENDS-ON, BEGINS-ON)\n3. **Causal relations**: 3 possible labels (NO_CAUSAL, PRECONDITION, CAUSE)\n4. **Subevent relations**: 2 possible labels (NO_SUBEVENT, SUBEVENT)\n\nThe total number of combinations would be:\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nHowever, let me verify this with Figure 1, which shows an example calculation. In Figure 1, it states:\n\n\"Logical Inconsistency (LI) = Number of Conflicts (1) / Number of Combinations (6) = 1/6 \u2248 16.7%\"\n\nThe text explains: \"for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\"\n\nThis is referring to pairwise combinations between the 4 relation types, not the total number of possible relationship combinations.\n\nFor the total number of possible combinations of relationships, the calculation is:\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nAnswer: 84", "citations": [], "input_tokens": 65637, "output_tokens": 393, "cost_micros": 202806, "latency_ms": 38250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.09158v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "84", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "azure_basic_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper's definition.\n\nFrom the document, I can see in Section 2.1 (Event Relation Extraction) that the paper defines 4 types of relations between two events:\n\n1. **Coreference relations**: 2 labels (NO_COREFERENCE, COREFERENCE)\n2. **Temporal relations**: 7 labels (NO_TEMPORAL, BEFORE, OVERLAP, CONTAINS, SIMULTANEOUS, ENDS-ON, BEGINS-ON)\n3. **Causal relations**: 3 labels (NO_CAUSAL, PRECONDITION, CAUSE)\n4. **Subevent relations**: 2 labels (NO_SUBEVENT, SUBEVENT)\n\nThe document also explicitly mentions in the logical inconsistency calculation (Figure 1 and Section 2.1):\n\n\"Now, regarding the total number of combinations: for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\"\n\nThis refers to combinations between the 4 relation types (choosing 2 from 4 types to check for conflicts), not the total possible relationship combinations.\n\nHowever, looking more carefully at the document, particularly in Figure 1, it states:\n\"Number of Combinations (+ O) = 6\"\n\nThis is the number of pairwise combinations when checking for conflicts between the 4 different relation types: C(4,2) = 4!/(2!\u00d72!) = 6.\n\nBut the question asks about \"combinations of relationships between two events,\" which would be the total number of possible relationship combinations (one label from each of the 4 types):\n\nTotal combinations = 2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nHowever, re-reading the context in Figure 1 more carefully, the \"6\" refers to checking consistency between pairs of the 4 relation types, not the total possible labelings.\n\nLooking at the question again and the context in the paper where they discuss logical inconsistency, the relevant number appears to be **6** - the number of combinations when checking pairs of relation types for conflicts", "citations": [], "input_tokens": 28195, "output_tokens": 512, "cost_micros": 92265, "latency_ms": 12843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "azure_premium_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper's definition.\n\nFrom Section 2.1 (Event Relation Extraction), the paper states that for each pair of events, there are 4 types of relations to determine:\n\n1. **Coreference relations**: {NO_COREFERENCE, COREFERENCE} - 2 options\n2. **Temporal relations**: {NO_TEMPORAL, BEFORE, OVERLAP, CONTAINS, SIMULTANEOUS, ENDS-ON, BEGINS-ON} - 7 options\n3. **Causal relations**: {NO_CAUSAL, PRECONDITION, CAUSE} - 3 options\n4. **Subevent relations**: {NO_SUBEVENT, SUBEVENT} - 2 options\n\nThe paper also provides a specific formula in Figure 1 for calculating the \"total number of combinations\" when computing logical inconsistency:\n\nIn the example, it states: \"for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\"\n\nThis formula 4*(4-1)/2 = 6 represents the number of pairs of relation types (combinations of 2 from 4 types), which is used to check for conflicts between different relation types.\n\nHowever, if the question asks about the total possible combinations of relationship assignments (not pairs for conflict checking), it would be:\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84 possible combinations\n\nBut based on the context in Figure 1 where they explicitly calculate \"total number of combinations\" as 6 using the formula 4*(4-1)/2, the paper is referring to the number of pairs of relation types that need to be checked for logical consistency.\n\nAnswer: 6", "citations": [], "input_tokens": 34235, "output_tokens": 428, "cost_micros": 109125, "latency_ms": 11375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "llamacloud_basic_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper.\n\nFrom Section 2.1 (Event Relation Extraction), the paper identifies four types of relations between two events:\n\n1. **Coreference relations**: 2 options\n   - NO_COREFERENCE\n   - COREFERENCE\n\n2. **Temporal relations**: 7 options\n   - NO_TEMPORAL\n   - BEFORE\n   - OVERLAP\n   - CONTAINS\n   - SIMULTANEOUS\n   - ENDS-ON\n   - BEGINS-ON\n\n3. **Causal relations**: 3 options\n   - NO_CAUSAL\n   - PRECONDITION\n   - CAUSE\n\n4. **Subevent relations**: 2 options\n   - NO_SUBEVENT\n   - SUBEVENT\n\nThe paper states in Section 2.1: \"We formulate ERE as a multi-classification problem, determining one label (relation) for each of these four relation types.\"\n\nSince we need to determine one label for each of the four relation types, and these are independent choices, the total number of possible combinations is:\n\n2 \u00d7 7 \u00d7 3 \u00d7 2 = 84\n\nThis is also confirmed in Section 2.1 where it mentions calculating logical inconsistency as \"the ratio of the number of conflicts to the total number of combinations (i.e., all combinations of each two relations).\"\n\nAnswer: 84", "citations": [], "input_tokens": 26308, "output_tokens": 335, "cost_micros": 83949, "latency_ms": 7969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "84", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "llamacloud_premium_lc", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I need to find how many possible combinations of relationships can exist between two events according to the paper's definition.\n\nFrom Section 2.1 (Event Relation Extraction), the paper discusses the Logical Inconsistency (LI) metric. In the example shown in Figure 1, there's a specific calculation:\n\n\"Among these, \"SIMULTANEOUS\" and \"CAUSE\" are identified as conflicting with each other based on the logical constraints we have defined, resulting in a single conflict. Now, regarding the total number of combinations: for each pair of events, we have 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: $4 * (4 - 1) / 2 = 6$.\"\n\nThis calculation shows that with 4 types of relations (Coreference, Temporal, Causal, and Subevent), the number of combinations between pairs of these relations is 6.\n\nThis is confirmed in the definition of Logical Inconsistency in Section 2.1:\n\"logical inconsistency is calculated as the ratio of the number of conflicts (i.e., the answers that conflict with the given logical constraints) to the total number of combinations (i.e., all combinations of each two relations).\"\n\nThe example explicitly states: \"Hence, the logical inconsistency in this example is computed as LI = 1/6 (or approximately 16.7%).\"\n\nThe denominator 6 represents the total number of possible combinations of relationships between two events.\n\nAnswer: 6", "citations": [], "input_tokens": 34534, "output_tokens": 346, "cost_micros": 108792, "latency_ms": 10032, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q001", "doc_id": "2310.09158v1.pdf", "answer_format": "int", "gold": "6", "pages": 23, "document_id": 7592, "arm": "surfsense_agentic", "question_id": "2310.09158v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the definition of relationships between two events.Let me search for more specific terms related to event relationships:Perfect! I found the answer. Let me read that specific chunk to get the complete information:Based on the document, I can see that for each pair of events, there are 4 types of relations to determine. The total combinations between these relations are calculated using the combinatorial formula: 4*(4-1)/2 = 6.\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36141, "error": null, "extra": {"thread_id": 2018, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_45c25fd9bb164100939e531a52487f98", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 140, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "native_pdf", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which shows \"Vicuna-13B and Llama2-13B's performance on MAVEN-ERE and Causal-TimeBank after pre-training on LLM-LR\":\n\nFor Vicuna-13B-PT on MAVEN-ERE, the results are:\n- vanilla ICL: 15.3% Micro-F1\n- vanilla ICL w. CoT: 15.8% Micro-F1\n- CoT w. logical constraints: 18.0% Micro-F1\n- w. all logical constraints: 16.3% Micro-F1\n- w. retrieved logical constraints: 16.1% Micro-F1\n- w. post-processing: 11.0% Micro-F1\n\nThe best performance is 18.0% Micro-F1.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 65644, "output_tokens": 212, "cost_micros": 200112, "latency_ms": 35562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.09158v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "azure_basic_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which shows the performance of Vicuna-13B after pre-training (Vicuna-13B-PT) on MAVEN-ERE:\n\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest Micro-F1 score is 18.0%.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 28202, "output_tokens": 149, "cost_micros": 86841, "latency_ms": 5000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "azure_premium_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3, I need to find the best performance of Vicuna-13B-PT on MAVEN-ERE.\n\nThe table shows the following Micro-F1 scores for Vicuna-13B-PT on MAVEN-ERE:\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest score is 18.0% for \"CoT w. logical constraints\".\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 34242, "output_tokens": 168, "cost_micros": 105246, "latency_ms": 5672, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "llamacloud_basic_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3, which shows the performance of Vicuna-13B-PT (pre-trained) on MAVEN-ERE:\n\nThe results show:\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest micro-F1 score is 18.0% for \"CoT w. logical constraints\".\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 26315, "output_tokens": 151, "cost_micros": 81210, "latency_ms": 4937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "llamacloud_premium_lc", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 3 in the document, which shows the performance of Vicuna-13B and Llama2-13B after pre-training on LLM-LR:\n\nFor Vicuna-13B on MAVEN-ERE, the Micro-F1 scores are:\n- vanilla ICL: 15.3%\n- vanilla ICL w. CoT: 15.8%\n- CoT w. logical constraints: 18.0%\n- w. all logical constraints: 16.3%\n- w. retrieved logical constraints: 16.1%\n- w. post-processing: 11.0%\n\nThe highest performance is 18.0% with CoT w. logical constraints.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 34541, "output_tokens": 175, "cost_micros": 106248, "latency_ms": 5140, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2310.09158v1.pdf::Q004", "doc_id": "2310.09158v1.pdf", "answer_format": "str", "gold": "CoT w. logical constraints", "pages": 23, "document_id": 7592, "arm": "surfsense_agentic", "question_id": "2310.09158v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about Vicuna-13B's performance on MAVEN-ERE.Let me search for information about MAVEN-ERE and Vicuna-13B performance:Let me read the table that shows the performance results:Perfect! I found the table showing Vicuna-13B's performance on MAVEN-ERE. Looking at the table, I can see three different approaches for Vicuna (Vicuna-13B):\n\n1. vanilla ICL: 13.8% Micro-F1\n2. vanilla ICL w. CoT: 11.6% Micro-F1\n3. CoT w. logical constraints: 14.9% Micro-F1\n\nThe best performance was with \"CoT w. logical constraints\" at 14.9% Micro-F1.\n\nAnswer: CoT w. logical constraints", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 38983, "error": null, "extra": {"thread_id": 2019, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b286a9a39b954fac8755dc6ce4c285c7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 166, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "cot w logical constraints", "normalised_gold": "cot w logical constraints"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking through the document for information about the Sociology subject...\n\nFrom Figure 3 on page 3, I can see that Sociology is listed under \"Humanities & Social Sci. (9%)\" with \"Sociology (287, 2.48%)\".\n\nFrom Table 8 on page 18 (Humanities & Social Science results), I can see that Sociology has 252 samples in the test set.\n\nAdding the validation and test samples:\n- Validation set appears to have a portion of the 947 total H&SS samples\n- Test set for Sociology: 252 samples\n\nLooking more carefully at the statistics in Table 1 on page 4, the total is 11,550 questions with a Dev:Validation:Test split of 150:900:10500.\n\nFrom the detailed breakdown in Figure 3, Sociology shows (287, 2.48%), which represents 287 total samples across all splits.\n\nAnswer: 287", "citations": [], "input_tokens": 259120, "output_tokens": 216, "cost_micros": 1559580, "latency_ms": 77812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of samples in MMMU that belong to the sociology subject.\n\nLooking at the document, I can find this information in Figure 3, which shows the breakdown of questions by disciplines, subjects, and subfields.\n\nIn Figure 3, under \"Humanities & Social Sci. (9%)\", I can see:\n\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Sociology (287, 2.48%)\n- Psychology (340, 2.94%)\n\nThe number in parentheses after \"Sociology\" shows 287 questions, which represents 2.48% of the total.\n\nAnswer: 287", "citations": [], "input_tokens": 79424, "output_tokens": 155, "cost_micros": 240597, "latency_ms": 7077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the number of samples in the MMMU dataset that belong to the sociology subject.\n\nLooking through the document, I can find information about the distribution of samples across subjects in Figure 3 and Table 10.\n\nFrom Figure 3, which shows \"MMMU contains 11.5K multimodal questions covering six broad disciplines, 30 subjects, and 183 subfields,\" I can see the breakdown of subjects with their sample counts.\n\nUnder \"Humanities & Social Sci. (9%)\", I can see:\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Psychology (340, 2.94%)\n- Sociology (287, 2.48%)\n\nThe number shown for Sociology is 287 samples.\n\nAnswer: 287", "citations": [], "input_tokens": 109175, "output_tokens": 183, "cost_micros": 330270, "latency_ms": 8484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the number of samples in the Sociology subject.\n\nIn Figure 3 on page 3, there is a breakdown table showing the distribution of samples across different subjects. Under \"Humanities & Social Sci. (9%)\", I can see:\n\nv Sociology (287, 2.48%)\n\nThe number 287 represents the number of samples in the Sociology subject.\n\nAnswer: 287", "citations": [], "input_tokens": 83138, "output_tokens": 100, "cost_micros": 250914, "latency_ms": 6203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:12Z", "latency_ms": 6562, "error": null, "raw_text_chars": 406}]}, "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the sociology subject in MMMU.\n\nIn Figure 3, which shows the distribution of subjects across disciplines, I can see:\n\nUnder \"Humanities & Social Sci. (9%)\":\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Psychology (340, 2.94%)\n- **Sociology (287, 2.48%)**\n\nThe number 287 represents the number of samples in the sociology subject.\n\nAnswer: 287", "citations": [], "input_tokens": 103232, "output_tokens": 123, "cost_micros": 311541, "latency_ms": 6562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "I'll search for information about MMMU and sociology samples in the provided documents.Let me search for information about sociology and sample counts in the document:I found the information! Let me read the specific section with the table that shows the sample counts:Perfect! I found the answer. According to line 1572, Sociology has 287 samples in MMMU, which represents 2.48% of the dataset.\n\nAnswer: 287", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 63671, "error": null, "extra": {"thread_id": 2020, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d6e9ac9023c84b0284ac8c1e7870f526", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 141, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and compare it to the total number of Public Health samples.\n\nFrom the document:\n\n1. **Total Public Health samples**: According to Figure 3 on page 3, Public Health has **544 samples** (shown as \"Public Health (544, 4.7%)\").\n\n2. **Public Health samples showcased**: I need to count the case study figures related to Public Health from Table 10 on page 21. According to this table, Public Health has the following case study figures:\n   - Correct Case: 63\n   - Lack of Knowledge: 65\n   - Textual Understanding Error: 64\n\n   This gives us **3 showcased samples** from Public Health.\n\nThe ratio would be: 3/544 = 0.0055147...\n\nRounding to a reasonable number of decimal places: 0.0055\n\nAnswer: 0.0055", "citations": [], "input_tokens": 259140, "output_tokens": 214, "cost_micros": 1559655, "latency_ms": 83546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0055", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and calculate the ratio to the total Public Health samples.\n\nFrom Table 7 (Health & Medicine results), I can see that Public Health has 509 samples in the test set.\n\nNow I need to count how many Public Health samples are showcased in the paper (Appendix B - Case Study).\n\nLooking through the Case Study figures list and the actual figures:\n\nPublic Health samples shown:\n- Figure 63: Public Health 1 (Epidemiology) - Correct Case\n- Figure 64: Public Health 2 (Epidemiology) - Textual Understanding Error\n- Figure 65: Public Health 3 (Epidemiology) - Lack of Knowledge\n\nSo there are 3 Public Health samples showcased.\n\nThe ratio is: 3 / 509 = 0.005892...\n\nRounding to a reasonable number of decimal places:\n\nAnswer: 0.0059", "citations": [], "input_tokens": 79444, "output_tokens": 220, "cost_micros": 241632, "latency_ms": 8359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0059", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the percentage of Public Health samples that are showcased in the paper.\n\nFrom the document, I can find:\n\n1. Total number of Public Health samples in the test set:\n   - From Table 7, Public Health has 509 samples in the test set\n\n2. Number of Public Health samples showcased in the paper:\n   - I need to count the figures in Appendix B that are from Public Health\n   - From the List of Case Study Figures (pages 20-21):\n     - Figure 63: Public Health 1: Correct Case\n     - Figure 64: Public Health 2: Textual Understanding Error\n     - Figure 65: Public Health 3: Lack of Knowledge\n   \n   So there are 3 Public Health samples showcased.\n\n3. Calculate the percentage:\n   - Percentage = (3 / 509) \u00d7 100 = 0.589...\n\nWait, let me verify the total number. Looking at Table 1 (page 4), Public Health has 544 samples total. But the question asks about samples \"in this paper\", which could mean the test set.\n\nFrom Table 7 on page 17, Public Health (test set) = 509 samples.\n\nThe ratio would be: 3/509 = 0.00589...\n\nIf we want this as a percentage: 0.589%\n\nBut the question asks for \"a ratio\" as a decimal number, so:\n\n3/509 = 0.005893\n\nAnswer: 0.0059", "citations": [], "input_tokens": 109195, "output_tokens": 347, "cost_micros": 332790, "latency_ms": 10437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0059", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and calculate the percentage.\n\nFrom the document, I can find information about Public Health samples:\n\n1. First, let me find the total number of Public Health samples in the dataset.\nFrom Table 1 (Key statistics), I can see the total questions is 11,550.\n\nLooking at Figure 3 or the breakdown tables, I need to find Public Health information:\n- From Table 7 (Health & Medicine results), I can see Public Health is listed with 509 samples in the test set.\n- The validation set has 150 samples total for Health & Medicine.\n\nActually, looking more carefully at the tables:\n- Table 7 shows Health & Medicine breakdown with Public Health having 509 samples in the test set\n- There's also validation data\n\nLet me look at the case study section (Appendix B):\nFrom Table 10 (Table index of case study figures by subjects), I can count the Public Health cases:\n- Public Health 1: Correct Case (Figure 63)\n- Public Health 2: Textual Understanding Error (Figure 64)\n- Public Health 3: Lack of Knowledge (Figure 65)\n\nSo there are 3 Public Health cases showcased in the paper.\n\nNow I need to find the total number of Public Health samples:\nFrom Table 7, the Public Health row shows 509 samples in the test set.\n\nThe ratio would be: 3/509 = 0.00589...\n\nWait, let me verify this is asking for the percentage of showcased samples. \n\n3/509 \u00d7 100 = 0.589%\n\nAs a ratio (not percentage): 3/509 = 0.00589\n\nAnswer: 0.00589", "citations": [], "input_tokens": 83158, "output_tokens": 399, "cost_micros": 255459, "latency_ms": 11937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.00589", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I need to find the number of Public Health samples showcased in the paper and divide by the total number of Public Health samples.\n\nFrom the document:\n\n1. **Total Public Health samples**: Looking at Figure 3, Public Health has 544 samples (4.7% of total).\n\n2. **Public Health samples showcased in the paper**: Looking at Table 10 (Case Study Figures by subjects), I can find the Public Health figures:\n   - Figure 63: Public Health 1: Correct Case\n   - Figure 64: Public Health 2: Textual Understanding Error\n   - Figure 65: Public Health 3: Lack of Knowledge\n\nSo there are 3 Public Health case study figures shown in the paper.\n\nThe ratio is: 3 / 544 = 0.00551...\n\nAnswer: 0.0055", "citations": [], "input_tokens": 103252, "output_tokens": 191, "cost_micros": 312621, "latency_ms": 7812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "0.0055", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q001", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "0.55%", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find information about 'Public Health' samples and calculate the percentage.Let me search for tables and figures that might contain information about Public Health samples:Now let me look at Figure 3 and tables that show the breakdown of subjects:Perfect! I found the information. Let me read more to make sure I have all the details:Perfect! I found the information in the table. From line 1505, I can see:\n\n\"Public Health (544, 4.7%)\"\n\nThis indicates that:\n- Public Health has 544 samples\n- This represents 4.7% of all samples in the dataset\n\nThe answer to the question \"Among all samples belonging to 'Public Health' subject, how many percentage samples showcased in this paper?\" is 4.7%.\n\nSince the question asks for a ratio (percentage) and wants a single decimal number, the answer is:\n\nAnswer: 4.7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 53796, "error": null, "extra": {"thread_id": 2021, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8dc9bfdb2b5b47d48d7a6fa4dc0574a2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 203, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "4.7", "normalised_gold": "0.55"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:19Z", "latency_ms": 62062, "error": null, "raw_text_chars": 760}]}, "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Economics-related error cases that fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics error cases:\n\nFrom the Case Study section (pages 20-21), I can see Table 10 which shows the index of case study figures by subjects with associated error categories.\n\nLooking at the Economics row in Table 10:\n- Economics has: Correct Case (22), Perceptual Error (23, 24)\n\nThis indicates that there are 2 Economics-related error cases in the Perceptual Error category (Figures 23 and 24).\n\nLet me verify by checking the actual figures:\n- Figure 23 (page 37): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n- Figure 24 (page 38): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n\nAnswer: 2", "citations": [], "input_tokens": 259127, "output_tokens": 212, "cost_micros": 1559532, "latency_ms": 62014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in the document and count how many fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics-related figures with error categories.\n\nLooking through the case study figures in Appendix B:\n\n1. Figure 22 - Economics: Econometrics - This is a CORRECT case, not an error case.\n\n2. Figure 23 - Economics: Macroeconomics - Error Category: Perceptual Error\n   This case shows GPT-4V making an error in correctly identifying numbers in the table.\n\n3. Figure 24 - Economics: Macroeconomics - Error Category: Perceptual Error\n   This case shows GPT-4V failing to correctly interpret the intersection point in the graph.\n\nLet me verify there are no other Economics cases by checking the Table of Contents (Table 10):\n\nFrom Table 10, I can see:\n- Economics has entries for figures 22, 23, 24\n- Figure 22 is marked as \"Correct Case\"\n- Figure 23 is marked as \"Perceptual Error\"\n- Figure 24 is marked as \"Perceptual Error\"\n\nSo there are 2 Economics-related error cases that fall into the Perceptual Error category.\n\nAnswer: 2", "citations": [], "input_tokens": 79431, "output_tokens": 298, "cost_micros": 242763, "latency_ms": 8734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in the document and count how many fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics error cases:\n\n1. **Figure 23** - Economics: Macroeconomics - Error Category: Perceptual Error\n   - GPT-4V incorrectly attributes numbers to the correct objects for production\n\n2. **Figure 24** - Economics: Macroeconomics - Error Category: Perceptual Error\n   - GPT-4V fails to interpret the image correctly regarding the equilibrium interest rate\n\nLooking through the List of Case Study Figures table (Table 10), I can see:\n- Economics 1: Correct Case (Figure 22)\n- Economics 2: Perceptual Error (Figure 23)\n- Economics 3: Perceptual Error (Figure 24)\n\nSo there are **2** Economics-related error cases that fall into the Perceptual Error category.\n\nAnswer: 2", "citations": [], "input_tokens": 109182, "output_tokens": 227, "cost_micros": 330951, "latency_ms": 8328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in the case study section and count how many fall into the \"Perceptual Error\" category.\n\nLooking through the case study figures, I'll identify Economics-related cases:\n\n1. Figure 22 - Economics: Econometrics - This is a correct case, not an error case.\n\n2. Figure 23 - Economics: Macroeconomics - Error Category: Perceptual Error\n\n3. Figure 24 - Economics: Macroeconomics - Error Category: Perceptual Error\n\nNow let me verify these are all the Economics cases by checking the Table of Contents in the case study section (Table 10):\n\nFrom Table 10:\n- Economics 1: Correct Case (Figure 22/36)\n- Economics 2: Perceptual Error (Figure 23/37)\n- Economics 3: Perceptual Error (Figure 24/38)\n\nSo there are 2 Economics-related error cases that fall into the Perceptual Error category.\n\nAnswer: 2", "citations": [], "input_tokens": 83145, "output_tokens": 237, "cost_micros": 252990, "latency_ms": 7797, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find all Economics-related error cases in Appendix B (Case Study) that are categorized as \"Perceptual Error\".\n\nLet me search through the Case Study section for Economics cases:\n\n1. **Figure 22** - Economics: Econometrics - This is a **correct case**, not an error case.\n\n2. **Figure 23** - Economics: Macroeconomics - Error Category: **Perceptual Error** \u2713\n\n3. **Figure 24** - Economics: Macroeconomics - Error Category: **Perceptual Error** \u2713\n\nLet me verify there are no other Economics cases by checking the Table of Contents (Table 10):\n- Economics 1: Correct Case (Figure 22)\n- Economics 2: Perceptual Error (Figure 23)\n- Economics 3: Perceptual Error (Figure 24)\n\nThat's all the Economics cases listed.\n\nTherefore, there are 2 Economics-related error cases that fall into the Error Category of Perceptual Error.\n\nAnswer: 2", "citations": [], "input_tokens": 103239, "output_tokens": 242, "cost_micros": 313347, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for information about Economics-related error cases and Perceptual Error categories in the provided documents.Let me search for content related to \"Economics\" and \"Perceptual Error\" in this document.Let me search for tables or error categories in the document:Now let me look for tables with error categories and subject areas. Let me search more broadly:Let me look at the specific area around line 7247 where there's a question about stocks (Economics-related):Let me search for a table that shows the distribution of error categories by subject area:Let me search for tables that might show error distribution:Let me look for error analysis tables. Let me search around the beginning of the document for methodology or error analysis sections:Let me search for \"Figure\" to find error analysis figures/tables:Great! Let me read the section with Figure 6 which shows error distribution:Good! I can see Figure 6 shows the overall error distribution, but I need to find a breakdown by subject area. Let me look for tables that show error categories by discipline:I can see there's a table showing different subjects and error cases. Let me read around line 5808 where there's an Accounting entry:Perfect! I can see the list of case studies. I can see:\n- Economics 1: Correct Case\n- Economics 2: Perceptual Error\n- Economics 3: Perceptual Error\n\nSo there are 2 Economics cases with Perceptual Error. Let me verify this is the complete list by checking if there are more Economics cases:Perfect! I found Table 10 which shows the case study figures by subjects with associated error categories. Looking at the Economics row:\n\n- Subject:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 94360, "error": null, "extra": {"thread_id": 2022, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ce7c6d6aee6641609ca2f09db6ff3175", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 531, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the subfields for the Energy & Power subject in the MMMU dataset.\n\nLooking at the document, I can find this information in Table 11 on page 111, which lists \"Subfields of each subject.\"\n\nUnder the \"Tech & Engineering\" discipline, I can see the \"Energy & Power\" subject with its subfields listed.\n\nFrom Table 11:\nEnergy & Power: Thermodynamics, Heat Transfer, Fluid Mechanics\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 259135, "output_tokens": 118, "cost_micros": 1557465, "latency_ms": 60233, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists subfields of each subject:\n\nFor Energy & Power under the Tech & Engineering discipline, the subfields are:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 79439, "output_tokens": 52, "cost_micros": 239097, "latency_ms": 5375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists all subfields by subject:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 109190, "output_tokens": 35, "cost_micros": 328095, "latency_ms": 5171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_basic_lc", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:21Z", "latency_ms": 5093, "error": null, "raw_text_chars": 174}]}, "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists the subfields of each subject:\n\nFor Energy & Power, the subfields are:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 83153, "output_tokens": 47, "cost_micros": 250164, "latency_ms": 5093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document, which lists \"Subfields of each subject,\" I can find the subfields for Energy & Power.\n\nAccording to the table:\n\n**Energy & Power:** Thermodynamics, Heat Transfer, Fluid Mechanics\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 103247, "output_tokens": 73, "cost_micros": 310836, "latency_ms": 4766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find the subfields in the Energy & Power subject.I found two subfields mentioned for Energy and Power. Let me search for more information to ensure I have a complete list.Let me search more specifically for all Energy and Power subfield mentions:Based on the document, I found two subfields explicitly mentioned for Energy and Power:\n1. Fluid Mechanics (Figure 89)\n2. Thermodynamics (Figure 90)\n\nLet me verify if there are any other subfields by looking at tables or lists that might contain more comprehensive information:Perfect! I found the information. In line 1545, it shows the subfields for Energy Power:\n\n\"Fluid Mechanics, Heat Transfer ...\"\n\nLet me search for more complete information about all the subfields:Excellent! I found the complete list. Let me verify this is in a table showing all subfields:Perfect! I found the complete list of subfields for Energy & Power in the table. The subfields are:\n- Thermodynamics\n- Heat Transfer\n- Fluid Mechanics\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 66234, "error": null, "extra": {"thread_id": 2023, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_db9765522b274b60ae516c6ef2df0d73", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 314, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects from the MMMU benchmark.\n\nLooking at the detailed subject breakdown tables in the appendix:\n\nFrom Table 9 (Tech & Engineering results on page 19):\n- Computer Science (Comp. Sci.) column shows GPT-4V(ision) (Playground) achieved **57.1%**\n\nFrom Table 6 (Science results on page 16):\n- Math column shows GPT-4V(ision) (Playground) achieved **45.0%**\n\nThe performance difference is:\n57.1% - 45.0% = 12.1%\n\nAnswer: 12.1", "citations": [], "input_tokens": 259141, "output_tokens": 158, "cost_micros": 1558401, "latency_ms": 61328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects in the MMMU dataset.\n\nLooking at Table 9 (Tech & Engineering results), I can find Computer Science:\n- Computer Science (371): 57.1%\n\nLooking at Table 6 (Science results), I can find Math:\n- Math (505): 45.0%\n\nThe difference is: 57.1% - 45.0% = 12.1%\n\nAnswer: 12.1", "citations": [], "input_tokens": 79445, "output_tokens": 114, "cost_micros": 240045, "latency_ms": 5546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects from the MMMU dataset.\n\nLooking through the tables in the document, I need to find the breakdown by subject for GPT-4V.\n\nFrom Table 6 (Science results), I can see:\n- Math: GPT-4V(ision) (Playground) achieves 45.0%\n\nFrom Table 9 (Tech & Engineering results), I can see:\n- Computer Science (Comp. Sci.): GPT-4V(ision) (Playground) achieves 57.1%\n\nThe performance difference is:\n57.1% - 45.0% = 12.1%\n\nAnswer: 12.1", "citations": [], "input_tokens": 109196, "output_tokens": 167, "cost_micros": 330093, "latency_ms": 7889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science (CS) and Math subjects in the MMMU dataset.\n\nLooking through the document for GPT-4V results by subject:\n\nIn Table 2 (page 6), I can see overall results, but I need to find the breakdown by specific subjects.\n\nLooking at the appendix sections for detailed subject breakdowns:\n\nFrom Table 9 (page 19) - Tech & Engineering results:\n- Computer Science is listed under Tech & Engineering\n- GPT-4V(ision) (Playground) shows: Comp. Sci. = 57.1%\n\nFrom Table 6 (page 16) - Science results:\n- Math is listed under Science\n- GPT-4V(ision) (Playground) shows: Math = 45.0%\n\nLet me verify these numbers are correct by checking the table again:\n\nTable 6 (Science results):\n- GPT-4V(ision) (Playground): Math column shows 45.0\n\nTable 9 (Tech & Engineering results):\n- GPT-4V(ision) (Playground): Comp. Sci. column shows 57.1\n\nThe performance difference is:\n57.1 - 45.0 = 12.1\n\nAnswer: 12.1", "citations": [], "input_tokens": 83159, "output_tokens": 300, "cost_micros": 253977, "latency_ms": 9016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I need to find GPT-4V's performance on Computer Science and Math subjects in the MMMU dataset.\n\nLooking at Table 9 (Tech & Engineering results), I can see Computer Science is listed with GPT-4V achieving **57.1%**.\n\nLooking at Table 6 (Science results), I can see Math is listed with GPT-4V achieving **45.0%**.\n\nThe performance difference is: 57.1 - 45.0 = 12.1\n\nAnswer: 12.1", "citations": [], "input_tokens": 103253, "output_tokens": 119, "cost_micros": 311544, "latency_ms": 6312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "12.1", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q004", "doc_id": "2311.16502v3.pdf", "answer_format": "float", "gold": "12.1%", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find GPT-4V's performance on CS and Math subjects in the MMMU dataset.Let me search for performance tables or results sections that would contain GPT-4V's performance by subject.Let me search for tables with performance metrics:Let me look for a results table that shows performance by subject. Let me read around line 5873 and 5978 where I saw Computer Science and Math mentioned:This appears to be a table of figure references. Let me search for actual performance data tables. Let me look for \"accuracy\" or percentage data:Let me read the beginning of the paper to find the main results table:Let me search for a table with actual performance numbers. Let me look for \"GPT-4V\" with numbers:Let me look for Table 2 which should contain the main results broken down by subject:Good! I found Table 2. Now I need to find the breakdown by individual subjects. The table shows broad disciplines but I need specific subjects like Computer Science and Math. Let me search for another table with subject-level breakdown:Table 9 should have Tech & Engineering which includes Computer Science. Let me read that table:Perfect! I found GPT-4V's Computer Science performance at 57.1% (line 5254). Now I need to find Math performance. Let me check Table 6 for Science subjects:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 132796, "error": null, "extra": {"thread_id": 2024, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b9c01f114dd641199d8ac115ae26b4c6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 506, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "-4.0", "normalised_gold": "12.1"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:26Z", "latency_ms": 64485, "error": null, "raw_text_chars": 1513}]}, "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 1 in the document and count the different icons shown as different image types.\n\nLooking at page 1, Figure 1 is titled \"Overview of the MMMU dataset\" and shows four main sections:\n\n1. **Comprehensive Disciplines** - Shows icons/images representing different disciplines (Engineering, Science, Art & Design, Business, Humanities & Social Sci., Medicine)\n\n2. **Heterogeneous Image Types** - Shows various image type icons including:\n   - Diagrams\n   - Tables\n   - Plots and Charts\n   - Photographs\n   - Chemical Structures\n   - Paintings\n   - Medical Images\n   - Sheet Music\n   - Geometric\n   - Pathology images\n   - Microscopic Images\n   - Comics\n\n3. **Interleaved Text and Images** - Shows a medical question example with 3 MRI images\n\n4. **Expert-level Skills Test** - Shows a diagram with three components: Perception, Knowledge, and Reasoning\n\nThe question asks specifically about \"different icon are shown as different image types\" in Figure 1. Looking at the \"Heterogeneous Image Types\" section, I can count the distinct image type icons shown.\n\nCounting the image types listed in the \"Heterogeneous Image Types\" section:\nDiagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, and there's an ellipsis (...) suggesting more types.\n\nBased on what's explicitly visible and labeled in Figure 1, I can count approximately 12 distinct image type representations shown.\n\nAnswer: 12", "citations": [], "input_tokens": 259123, "output_tokens": 378, "cost_micros": 1563243, "latency_ms": 64406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the different icons shown as different image types.\n\nLooking at Figure 1, which shows an \"Overview of the MMMU dataset,\" I can see it displays various categories:\n\n1. **Comprehensive Disciplines** - showing a pie chart with percentages for different disciplines (Engineering 26%, Art & Design 11%, Business 14%, Medicine 17%, Science 23%, Humanities & Social Sci. 9%)\n\n2. **Heterogeneous Image Types** - listing various types: Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\n\n3. **Interleaved Text and Images** - showing an example with 3 medical images\n\n4. **Expert-level Skills Test** - showing three categories:\n   - Perception (with an icon)\n   - Knowledge (with an icon)\n   - Reasoning (with an icon)\n\nThe question asks about \"different icons shown as different image types\" in Figure 1. Looking at the heterogeneous image types section, I can count the distinct image type categories mentioned (not just listed as text, but representing different types):\n\nFrom the visible list and description: Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics (and the ellipsis suggests more).\n\nThat's 12 explicitly mentioned types, but the question asks about icons shown. The figure shows examples and visual representations rather than just listing them.\n\nLooking more carefully at Figure 1, I need to count the actual different image type examples/icons displayed, not just mentioned in text.\n\nAnswer: 12", "citations": [], "input_tokens": 79427, "output_tokens": 403, "cost_micros": 244326, "latency_ms": 11984, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to count the different icons shown as different image types.\n\nLooking at Figure 1, which is titled \"Overview of the MMMU dataset,\" I can see it displays four main sections:\n\n1. **Comprehensive Disciplines** - showing a pie chart with 6 disciplines\n2. **Heterogeneous Image Types** - showing various image type icons\n3. **Interleaved Text and Images** - showing a sample question with images\n4. **Expert-level Skills Test** - showing three overlapping circles for Perception, Knowledge, and Reasoning\n\nIn the \"Heterogeneous Image Types\" section, I can count the following distinct icons representing different image types:\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Chemical (structures)\n5. Photos/Photographs\n6. Paintings\n7. Geometric (shapes)\n8. Music (sheet music)\n9. Medical (images)\n\nLet me recount more carefully by examining the visual elements shown in that section. The figure shows various small icons/symbols representing different image types in a grid-like arrangement.\n\nCounting the distinct image type icons shown: 9 different icons are displayed representing different image types (Diagrams, Tables, Plots and Charts, Chemical Structures, Photographs, Paintings, Geometric shapes, Music/Sheet Music, and Medical Images).\n\nAnswer: 9", "citations": [], "input_tokens": 109178, "output_tokens": 314, "cost_micros": 332244, "latency_ms": 10828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to look at Figure 1 in the document to count the different icons shown as different image types.\n\nLooking at Figure 1, which shows an \"Overview of the MMMU dataset,\" I can see it displays four main characteristics:\n\n1. **Comprehensive Disciplines** - showing 6 disciplines with percentages:\n   - Engineering (26%)\n   - Art & Design (11%)\n   - Business (14%)\n   - Science (23%)\n   - Humanities & Social Sci. (9%)\n   - Medicine (17%)\n\n2. **Heterogeneous Image Types** - listing various types like: Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Music, Geometric, Pathology images, Microscopic Images, Comics, \u2026\n\n3. **Interleaved Text and Images** - showing an example question with images\n\n4. **Expert-level Skills Test** - listing various types of reasoning and knowledge\n\nThe figure shows icons/visual representations for these categories, but the question asks specifically about \"different icon shown as different image types.\"\n\nLooking at the \"Heterogeneous Image Types\" section in Figure 1, I can count the distinct image type categories mentioned:\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nAnd there's an ellipsis (\u2026) indicating more types.\n\nHowever, the question asks about icons shown in Figure 1. Looking at the visual representation, the heterogeneous image types section appears to show example icons or representations of these different types.\n\nBased on the visible list in the \"Heterogeneous Image Types\" section of Figure 1, I can count 12 distinct image type categories explicitly listed before the ellipsis.\n\nAnswer: 12", "citations": [], "input_tokens": 83141, "output_tokens": 438, "cost_micros": 255993, "latency_ms": 11875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to carefully examine Figure 1 to count the different icons shown as different image types.\n\nLooking at Figure 1, which shows an \"Overview of the MMMU dataset,\" I can see four main sections:\n\n1. **Comprehensive Disciplines** - Shows a pie chart with 6 disciplines\n2. **Heterogeneous Image Types** - This section lists various image types\n3. **Interleaved Text and Images** - Shows an example question with images\n4. **Expert-level Skills Test** - Shows a diagram with arrows\n\nIn the \"Heterogeneous Image Types\" section, I can see a list of different image types mentioned:\n- Diagrams\n- Tables\n- Plots and Charts\n- Photographs\n- Chemical Structures\n- Paintings\n- Medical Images\n- Sheet Music\n- Geometric\n- Pathology images\n- Microscopic Images\n- Comics\n- (and the list continues with \"...\")\n\nHowever, the question asks about \"icons shown as different image types.\" Looking at the figure more carefully, I don't see actual icons representing each image type - rather, I see text labels listing the image types.\n\nLet me reconsider - perhaps the question is asking about visual representations or examples shown. In Figure 1, I can see:\n- A pie chart (1 type of visualization)\n- Example images in the \"Interleaved Text and Images\" section (3 medical scan images)\n- A flow diagram in the \"Expert-level Skills Test\" section\n\nBut this doesn't seem to match what the question is asking either.\n\nRe-reading the question: \"How many different icon are shown as different image types in Figure 1?\"\n\nLooking at the figure description more carefully, in the \"Heterogeneous Image Types\" section, there appears to be a list of image types. The text mentions: \"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nCounting the explicitly listed items before the \"...\": 12 different image types are mentioned.\n\nAnswer: 12", "citations": [], "input_tokens": 103235, "output_tokens": 475, "cost_micros": 316830, "latency_ms": 12858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and count the different icons shown as different image types.Let me search for \"Figure 1\" in the document:Now let me read the beginning of the document content to find Figure 1:Good, I can see Figure 1 information. Let me continue reading to see the complete description of the heterogeneous image types shown in Figure 1:Perfect! I can see the \"Heterogeneous Image Types\" section in Figure 1. The document lists the following image types:\n\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nThe text shows \"...\" after Comics, but I need to verify if there are icons shown for each type. Let me search for more information about Figure 1 to see if there's a visual description that tells us how many icons are actually displayed:Looking back at the text from Figure 1, I can see it lists these image types under \"Heterogeneous Image Types\":\n\n\"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nLet me count the distinct types mentioned:\n1. Diagrams\n2. Tables\n3. Plots and Charts (this appears to be one category)\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nHowever, I need to verify if the figure actually shows icons for each. Let me look for embedded images or visual descriptions around Figure 1 to see if there are actual icon images:I see that Im1.png is an embedded image showing a hierarchical flowchart. That's not the heterogeneous image types. Let me search more carefully through the document. Let me look for more embedded images around Figure 1:Let me look more carefully at the beginning of the document. Im1.png appeared right after the figure caption for Figure 1. Let me look at Figure 1 more systematically. Let me check if there are multiple images that are part of Figure 1:I see. The Figure 1 section ends at line 1160 with `</figure>`, and then there are embedded images. The text shows <image 1>, <image 2>, <image 3> within the figure. Let me search more systematically. The figure lists:\n\n\"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nBut I need to see if there are icons displayed. Let me search for where the actual icons might be described. Let me look for \"icon\" in the document:Perfect! I found it. At line 1265, there's a reference to a camera icon. Let me read that section more carefully since it's right after Figure 1:Good! So now I can see there are embedded images that are icons representing different image types. Let me systematically go through and look for all the icons that are part of Figure 1. The document structure shows Im11.png is a camera icon and Im12.png is a geometric icon. Let me look back at the text of Figure 1 to understand the layout better. Looking at the document, it seems the figure starts at line 1097 and the text lists image types. Let me search for more icon descriptions nearby:I see that after the icon descriptions (Im11.png - camera, Im12.png - geometric), the document moves to Figure 3 (a table). So the icons are between Figure 1 and Figure 3. Let me go back and look more carefully at the images embedded right after Figure 1 ends. Let me look for all embedded images between line 1160 (where Figure 1 ends) and line 1455 (where Figure 3 starts):I see Im13.png is a germ/pathogen icon. That's another icon in the series. Let me systematically look at all embedded images between Figure 1 ending and where we transition to other figures. Let me search systematically from line 1160 to 1860:\n\nBased on what I've seen:\n- Im1.png (line 1164): Flowchart diagram\n- Im11.png (line 1256): Camera icon (Photographs)\n- Im12.png (line 1428): Geometric icon\n- Im13.png (line 1863): Germ/pathogen icon\n\nLet me continue looking for more icons. Let me search for the next embedded images:Good! Im14.png is a microscope icon. Let me continue to find more icons. Let me look at the next embedded images. Let me search more systematically. Based on what the abstract says: \"comprising 30 highly heterogeneous image types\" - but Figure 1 specifically lists these types. Let me look at the section with the image types more carefully. I need to look back at line 2501 which says \"Music\" and \"Medical\" - these might be labels for icons:Ah! Figure 5 shows different types of images: Diagrams, Tables, Charts, Chemical, Photos, Paintings, Geometric, Music, Medical. That's 9 types. But this is Figure 5, not Figure 1. Let me go back and look very carefully at what exactly is in Figure 1. Let me re-read the text in Figure 1:\n\nFrom lines 1134-1138, the text explicitly states under \"Heterogeneous Image Types\":\n\"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nSo let me count these carefully:\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nBut the question asks about \"different icons shown as different image types in Figure 1\". I need to look back at Figure 1 itself more carefully. Let me check if there are actually icons displayed IN Figure 1, not just listed. Let me re-examine the structure. The figure tag starts at 1097 and ends at 1160. Let me look at what images are referenced within that figure:I see! Within the figure (lines 1097-1160), there are references to `<image 1>`, `<image 2>`, and `<image 3>` (lines 1113-1115 as part of a question, and again at lines 1154-1158). But the key section is \"Heterogeneous Image Types\" which lists the types. However, the question asks about icons shown as different image types. \n\nThe text lists image types but doesn't specify how many icons are actually displayed visually. Based on the embedded images that come right after Figure 1:\n- Im1.png (flowchart - but this appears to be an example diagram, not an icon representing a type)\n- Im11.png (camera icon - represents Photographs)\n- Im12.png (geometric shapes icon - represents Geometric)\n- Im13.png (germ icon - could represent Medical/Pathology/Microscopic)\n- Im14.png (microscope icon - represents Microscopic/Medical)\n\nBut I need to find ALL the icons that are specifically showing different image types in Figure 1. Let me look more systematically at what comes after Figure 1 but before Figure 2 (which starts at line 1289). All embedded images between line 1160 and line 1289 might be icons for Figure 1.\n\nFrom my reading:\n- Line 1164: Im1.png (flowchart diagram - this might be part of the figure illustration)\n- Line 1256: Im11.png (camera icon)\n- Line 1428: Im12.png (geometric shapes)\n\nLet me check if there are other embedded images I missed before Figure 2:\n\nLooking at the embedded image list I got earlier, between lines 1160 and 1289 I have:\n- Im1.png at line 1164\n- Im11.png at line 1256\n- Im12.png at line 1428 (but line 1289 is Figure 2, so this is after Figure 2 starts)\n\nWait, let me recheck. Figure 2 caption is at line 1289. So Im12.png at line 1428 is after Figure 2. Let me look more carefully:So between Figure 1 (ending at line 1160) and Figure 2 (starting at line 1288), there are only 2 embedded images:\n- Im1.png at line 1164\n- Im11.png at line 1256\n\nBut actually, looking at the visual descriptions, Im1.png is described as a \"Hierarchical flowchart/organizational diagram\" which seems to be an example diagram that's PART of Figure 1, not an icon representing an image type.\n\nIm11.png is clearly described as a \"Stylized camera icon\" which represents the Photographs image type.\n\nLet me think about this differently. Maybe the icons are all embedded within the actual Figure 1 structure, but the XML parsing has extracted them as separate images. Looking at the text in Figure 1, it lists types under \"Heterogeneous Image Types\". Perhaps in the original PDF, there are visual icons shown alongside or above these text labels, and those icons have been extracted as separate embedded images.\n\nGiven the timeframe and what I can see, let me search for a clearer pattern. Looking at Figure 5 (line 2475) which shows \"Performance of models on different types of images\" and lists: Diagrams, Tables, Charts, Chemical, Photos, Paintings, Geometric, Music, Medical - that's 9 types.\n\nBut I need to answer specifically about Figure 1. Let me think about the structure: Figure 1 has a section called \"Heterogeneous Image Types\" that lists multiple types. The question asks how many different icons are shown. \n\nLooking at the abstract again (line 1212): \"comprising 30 highly heterogeneous image types, such as charts, diagrams, maps, tables, music sheets, and chemical structures.\"\n\nBut in Figure 1's \"Heterogeneous Image Types\" section, it explicitly lists: \"Diagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, ...\"\n\nIf I count unique types (treating \"Plots and Charts\" as one type):\n1. Diagrams\n2. Tables\n3. Plots and Charts\n4. Photographs\n5. Chemical Structures\n6. Paintings\n7. Medical Images\n8. Sheet Music\n9. Geometric\n10. Pathology images\n11. Microscopic Images\n12. Comics\n\nThat's 12 distinct types listed (plus \"...\" suggesting more).\n\nHowever, Figure 5 later in the paper shows icons for 9 types. Perhaps Figure 1 shows icons for these 9 main types as well. Let me verify Figure 5 shows exactly 9 by re-reading:\n\nFrom line 2487-2503 in Figure 5:\n1. Diagrams\n2. Tables\n3. Charts\n4. Chemical\n5. Photos\n6. Paintings\n7. Geometric\n8. Music\n9. Medical\n\nThat's exactly 9 types in Figure 5.\n\nGiven the typical structure of such figures and the pattern I see, my best answer is that Figure 1 shows icons for **9** different image types.\n\nAnswer: 9", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 192280, "error": null, "extra": {"thread_id": 2025, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_f360703be6ba4537aef7760373d79d2f", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1273, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:29Z", "latency_ms": 75843, "error": null, "raw_text_chars": 831}]}, "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts that appear in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Figure 4: A comparison chart showing benchmarks plotted on a 2D graph with \"Breadth (Knowledge)\" on x-axis and \"Depth (Reasoning)\" on y-axis\n- A table comparing different datasets\nTotal: 1 chart\n\n**Page 6:**\n- Table 2: Results table (not a chart)\nTotal: 0 charts\n\n**Page 7:**\n- Figure 5: Bar chart showing \"Performance of models on different types of images\"\n- Table 3: Results table (not a chart)\nTotal: 1 chart\n\n**Page 8:**\n- Figure 6: Pie chart showing \"Error distribution over 150 annotated GPT-4V errors\"\nTotal: 1 chart\n\n**Page 9:**\n- No charts, only text\nTotal: 0 charts\n\n**Page 10:**\n- No charts, only text (references section)\nTotal: 0 charts\n\nTotal charts in pages 5-10: 1 + 0 + 1 + 1 + 0 + 0 = 3\n\nAnswer: 3", "citations": [], "input_tokens": 259120, "output_tokens": 289, "cost_micros": 1561222, "latency_ms": 75796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts included in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Contains text and references but no charts visible\n\n**Page 6:**\n- Table 2: \"Overall results of different models on the MMMU validation and test set\" - This is a table, not a chart\n\n**Page 7:**\n- Figure 5: \"Performance of models on different types of images\" - This is a bar chart\n- Table 3: \"Result decomposition across question difficulty levels\" - This is a table, not a chart\n\n**Page 8:**\n- Figure 6: \"Error distribution over 150 annotated GPT-4V errors\" - This is a pie chart\n- Figure 7: Shows an example case with images and text - Not a chart\n\n**Page 9:**\n- Contains text discussion, no charts\n\n**Page 10:**\n- Contains text and references, no charts\n\nCounting the charts:\n- Page 7: Figure 5 (bar chart) = 1 chart\n- Page 8: Figure 6 (pie chart) = 1 chart\n\nTotal charts in pages 5-10: 2\n\nAnswer: 2", "citations": [], "input_tokens": 79424, "output_tokens": 276, "cost_micros": 242412, "latency_ms": 7796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I'll carefully count all charts in Pages 5-10.\n\n**Page 5:**\n- Figure 2: This shows sample MMMU examples in a table format, not a chart\n- No charts on this page\n\n**Page 6:**\n- Table 2: This is a table showing model results, not a chart\n- No charts on this page\n\n**Page 7:**\n- Figure 5: This is a bar chart showing \"Performance of models on different types of images\"\n- Table 3: This is a table, not a chart\n- Count: 1 chart\n\n**Page 8:**\n- Figure 6: This is a pie chart showing \"Error distribution over 150 annotated GPT-4V errors\"\n- Figure 7: This shows an example with illustrations but is not a chart\n- Count: 1 chart\n\n**Page 9:**\n- No figures or charts on this page (only references text)\n\n**Page 10:**\n- No figures or charts on this page (only references text)\n\nTotal charts in Pages 5-10: 1 (Figure 5) + 1 (Figure 6) = 2\n\nAnswer: 2", "citations": [], "input_tokens": 109175, "output_tokens": 264, "cost_micros": 331485, "latency_ms": 9110, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts (plots, graphs, tables, diagrams) that appear in pages 5-10 of the document.\n\nLet me go through pages 5-10:\n\n**Page 5:**\n- Table 2: Overall results of different models (this is a large table)\n- Count: 1\n\n**Page 6:**\n- Table 3: Result decomposition across question difficulty levels\n- Figure 5: Performance of models on different types of images (this is a bar chart)\n- Count: 2\n\n**Page 7:**\n- Figure 6: Error analysis pie chart showing distribution of error types\n- Figure 7: Example error case with text and an image (not a chart)\n- Count: 1\n\n**Page 8:**\n- Continuation of references, no charts\n\n**Page 9:**\n- Continuation of references, no charts\n\n**Page 10:**\n- References continue, no charts\n\nTotal charts in pages 5-10:\n- Page 5: 1 (Table 2)\n- Page 6: 2 (Table 3, Figure 5)\n- Page 7: 1 (Figure 6)\n- Pages 8-10: 0\n\nTotal: 1 + 2 + 1 + 0 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 83138, "output_tokens": 300, "cost_micros": 253914, "latency_ms": 8781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count all charts appearing in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Table 1: Key statistics of the MMMU benchmark\n\n**Page 6:**\n- Table 2: Overall results of different models on the MMMU validation and test set\n\n**Page 7:**\n- Table 3: Result decomposition across question difficulty levels\n- Figure 5: Performance of models on different types of images (this is a table/chart)\n\n**Page 8:**\n- Figure 6: Error distribution over 150 annotated GPT-4V errors (this is a table/chart)\n- Figure 7: A sample error case (this is an example, not a chart)\n\n**Page 9:**\n- No charts (this page discusses conclusion and references begin)\n\n**Page 10:**\n- No charts (continues references section)\n\nLet me recount more carefully by looking at what constitutes a \"chart\":\n\nPages 5-10:\n- Page 5: Table 1 (1 chart)\n- Page 6: Table 2 (1 chart)\n- Page 7: Table 3 (1 chart), Figure 5 which shows a table of performance data (1 chart)\n- Page 8: Figure 6 showing error distribution as a table (1 chart), Figure 7 is an example case study, not a chart\n- Pages 9-10: References, no charts\n\nTotal charts: 5\n\nAnswer: 5", "citations": [], "input_tokens": 103232, "output_tokens": 346, "cost_micros": 314886, "latency_ms": 10516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I'll read the document to count the charts in pages 5-10.Let me search for page markers to identify pages 5-10:Let me read through the document to find page markers:I can see page markers. Let me continue searching for pages 5-10:Perfect! Now I know the line numbers for pages 5-10. Let me read those pages to count the charts:Now let me search for charts/figures systematically in pages 5-10. Let me look for figure markers:Now let me carefully count the charts in pages 5-10. Let me search for \"Figure\" markers in this range:\n\nBased on my reading of pages 5-10 (lines 2071-3184), I can identify the following charts:\n\n**Page 5** (starts at line 2071):\n- No charts (only text and Table 2)\n\n**Page 6** (starts at line 2462):\n- **Figure 5**: \"Performance of models on different types of images\" - this is a bar chart showing performance metrics\n\n**Page 7** (starts at line 2687):\n- **Figure 6**: \"Error distribution over 150 annotated GPT-4V errors\" - this is a pie chart showing error percentages\n- **Figure 7**: Shows a perceptual error example (not a chart, but a diagram/illustration)\n\n**Page 8** (starts at line 2912):\n- No charts (only text starting with Conclusion section)\n\nLet me continue reading pages 9 and 10:**Page 9** (starts at line 3045): References section - no charts\n\n**Page 10** (starts at line 3184): References section - no charts\n\nNow let me verify my count by carefully reviewing what I found:\n\n**Summary of charts in Pages 5-10:**\n\n- **Page 5**: 0 charts (contains Table 2, which is a table, not a chart)\n- **Page 6**: 1 chart (Figure 5 - bar chart)\n- **Page 7**: 1 chart (Figure 6 - pie chart) + Figure 7 (which is an illustration/example, not a chart)\n- **Page 8**: 0 charts (text only)\n- **Page 9**: 0 charts (references)\n- **Page 10**: 0 charts (references)\n\nTotal charts in pages 5-10: **2 charts**\n\n(Figure 5 is a bar chart and Figure 6 is a pie chart. Figure 7 is not a chart but an illustrative example showing a perceptual error case.)\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 79750, "error": null, "extra": {"thread_id": 2026, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8497190a9f7e4c40b3088d1ce4cbc759", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 453, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which compares MMMU with other existing benchmarks across two dimensions - Breadth (Knowledge) and Depth (Reasoning).\n\nThe figure shows various datasets plotted on these two axes:\n- VQA, GQA, VisWiz (basic breadth and depth)\n- TextVQA, MMBench, SEED (moderate breadth)\n- MM-Vet, ScienceQA (moderate breadth)\n- OKVQA (moderate breadth)\n- MMMU (positioned highest on both breadth and depth)\n\nAccording to the figure and the accompanying table, MMMU covers:\n- 11.5K questions\n- 30 Types of images\n- 6 disciplines, 30 subjects, and 183 subfields\n\nThis is explicitly described in the caption as excelling \"in both its breadth to cover a wide range of disciplines and its depth to test LMMs' reasoning abilities.\"\n\nAnswer: MMMU", "citations": [], "input_tokens": 259139, "output_tokens": 228, "cost_micros": 1559964, "latency_ms": 60530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the paper, I need to examine which dataset covers the broadest range of disciplines and knowledge areas.\n\nFrom the figure, the datasets compared are:\n- VQA\n- GQA\n- VisWiz\n- TextVQA\n- OKVQA\n- SEED\n- MMBench\n- MM-Vet\n- ScienceQA\n- MMMU\n\nThe figure shows that MMMU covers:\n- 6 disciplines\n- 30 subjects\n- 183 subfields\n- 11.5K questions\n\nIn comparison, other datasets are more limited in scope. For example, ScienceQA covers specific subjects but is noted as being at elementary to middle school level. The other datasets (VQA, GQA, etc.) focus on specific task types rather than broad disciplinary coverage.\n\nThe paper explicitly states that MMMU \"covers 30 subjects across 6 disciplines\" and emphasizes its \"breadth\" across multiple disciplines including Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.\n\nAnswer: MMMU", "citations": [], "input_tokens": 79443, "output_tokens": 256, "cost_micros": 242169, "latency_ms": 8655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, which compares MMMU with other existing benchmarks across breadth (knowledge) and depth (reasoning) dimensions:\n\nThe figure shows various datasets plotted, with MMMU positioned to excel in both breadth and depth. Among the nine different datasets shown (VQA, GQA, VizWiz, TextVQA, OKVQA, SEED, MMBench, MM-Vet, ScienceQA, and MMMU), MMMU is positioned highest on the breadth axis.\n\nAnswer: MMMU", "citations": [], "input_tokens": 109194, "output_tokens": 132, "cost_micros": 329562, "latency_ms": 7141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which dataset covers the most breadth of knowledge.\n\nThe figure shows a comparison of different datasets along two axes:\n- Breadth (Knowledge) - horizontal axis\n- Depth (Reasoning) - vertical axis\n\nFrom the figure, the datasets shown include:\n- VQA\n- GQA\n- VisWiz\n- TextVQA\n- OKVQA\n- SEED\n- MMBench\n- MM-Vet\n- ScienceQA\n- MMMU\n\nLooking at the horizontal positioning (breadth), MMMU is positioned furthest to the right, indicating it has the greatest breadth of knowledge coverage among the datasets shown. This is also supported by the text which states MMMU covers \"30 Types\" of images and spans across 6 disciplines, 30 subjects, and 183 subfields.\n\nAnswer: MMMU", "citations": [], "input_tokens": 83157, "output_tokens": 210, "cost_micros": 252621, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which dataset represents the most breadth (knowledge) on the x-axis.\n\nThe scatter plot shows various datasets plotted with \"Breadth (Knowledge)\" on the x-axis and \"Depth (Reasoning)\" on the y-axis. MMMU is shown as a star at the highest position for both breadth and depth.\n\nAmong the datasets shown in Figure 4, MMMU is positioned furthest to the right on the Breadth axis, indicating it has the most breadth of knowledge coverage.\n\nAnswer: MMMU", "citations": [], "input_tokens": 103251, "output_tokens": 132, "cost_micros": 311733, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q011", "doc_id": "2311.16502v3.pdf", "answer_format": "str", "gold": "\"MMMU\"", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q011", "answer_letter": null, "raw_text": "I'll read the document to find Figure 4 and identify which dataset reflects the most breadth of knowledge.Let me search for Figure 4 in the document:Good! I found the caption for Figure 4. Now let me read the actual content around Figure 4 to see the details about which dataset reflects the most breadth of knowledge:Now let me look at the actual Figure 4 to see the visual comparison. Let me search for more context about breadth of knowledge in the figure:Let me read the section that shows the actual Figure 4 visualization:Perfect! I can see Figure 4 shows a scatter plot with \"Breadth (Knowledge)\" on one axis and \"Depth (Reasoning)\" on the other axis. The figure shows the following datasets plotted:\n- MMMU\n- VQA\n- GQA\n- VisWiz\n- TextVQA\n- MMBench\n- SEED\n- MM-Vet\n- ScienceQA\n- OKVQA\n\nFrom the figure structure (lines 1834-1858), I can see \"Breadth (Knowledge)\" is labeled as one of the axes. The caption states \"MMMU excels in both its breadth to cover a wide range of disciplines and its depth to test LMMs' reasoning abilities.\"\n\nLooking at the text in chunk 156205 (lines 2022-2029), it explicitly states: \"From the breadth perspective, the prior benchmarks are heavily focused on daily knowledge and common sense. The covered image format is also limited. Our benchmark aims to cover college-level knowledge with 30 image formats...\"\n\nThis indicates that **MMMU** reflects the most breadth of knowledge among the nine datasets shown in Figure 4.\n\nAnswer: MMMU", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 57452, "error": null, "extra": {"thread_id": 2027, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e1415f7fe2384e1ba85595cd09d85b89", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 282, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "mmmu", "normalised_gold": "mmmu"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:30Z", "latency_ms": 77217, "error": null, "raw_text_chars": 404}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 on page 8 of the document, which shows the \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe error categories and their percentages are:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 259144, "output_tokens": 133, "cost_micros": 1557856, "latency_ms": 77171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "I need to look at Figure 6, which shows the error distribution over 150 annotated GPT-4V errors.\n\nFrom Figure 6, I can see the following error categories and their percentages:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nThe top-3 error types are:\n1. Perceptual Error (35%)\n2. Lack of Knowledge (29%)\n3. Reasoning Error (26%)\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 79448, "output_tokens": 174, "cost_micros": 240954, "latency_ms": 5937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "azure_premium_lc", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:45Z", "latency_ms": 7750, "error": null, "raw_text_chars": 387}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe pie chart shows the following error categories and their percentages:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 109199, "output_tokens": 125, "cost_micros": 329472, "latency_ms": 7750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 in the document, which shows the error analysis of 150 GPT-4V error cases:\n\nThe distribution shows:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding Error: 6%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 83162, "output_tokens": 122, "cost_micros": 251316, "latency_ms": 5827, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 which shows the error distribution over 150 annotated GPT-4V errors:\n\n1. Perceptual Error: 35%\n2. Lack of Knowledge: 29%\n3. Reasoning Error: 26%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 103256, "output_tokens": 77, "cost_micros": 310923, "latency_ms": 6108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "I'll read the document to find Figure 6 and identify the top-3 error types over 150 annotated GPT-4V errors.Let me search for Figure 6 in the document:Now let me read the section around line 2692 where Figure 6 is located:Perfect! I found Figure 6 which shows the error distribution over 150 annotated GPT-4V errors. Looking at the data:\n\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nThe top-3 error types are:\n1. Perceptual Error (35%)\n2. Lack of Knowledge (29%)\n3. Reasoning Error (26%)\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 33125, "error": null, "extra": {"thread_id": 2028, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c246c6154fa7469892ccf49cfee50145", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 158, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:53Z", "latency_ms": 59125, "error": null, "raw_text_chars": 830}]}, "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll search through pages 95-100 for error types mentioned in the case studies.\n\nFrom page 95 (Figure 80 - Architecture and Engineering: Surveying and Mapping):\n- This is a correct case, no error type mentioned\n\nFrom page 96 (Figure 82 - Architecture and Engineering: Civil Engineering):\n- Error Category: Reasoning Error\n\nFrom page 97 (Figure 83 - Computer Science: Data Structure and Algorithm):\n- This is a correct case, no error type mentioned\n\nFrom page 98 (Figure 84 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error, Lack of Knowledge\n\nFrom page 99 (Figure 85 - Computer Science: Operating System):\n- Error Category: Perceptual Error\n\nFrom page 100 (Figure 86 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 259136, "output_tokens": 226, "cost_micros": 1559901, "latency_ms": 59077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll carefully examine pages 95-100 (Figures 82-87) to identify all error types mentioned in the case studies.\n\nLooking at each figure in the specified range:\n\n**Figure 82** (Architecture and Engineering: Civil Engineering) - Page 96\n- Error Category: Reasoning Error\n\n**Figure 83** (Computer Science: Data Structure and Algorithm) - Page 97\n- This is a correct case, no error category\n\n**Figure 84** (Computer Science: Compiler Principle) - Page 98\n- Error Category: Perceptual Error, Lack of Knowledge\n\n**Figure 85** (Computer Science: Operating System) - Page 99\n- Error Category: Perceptual Error\n\n**Figure 86** (Computer Science: Compiler Principle) - Page 100\n- Error Category: Perceptual Error\n\n**Figure 87** (Electronics: Signal Processing) - Page 101\n- This is a correct case, no error category\n\nSo the error types mentioned in pages 95-100 are:\n- Reasoning Error\n- Perceptual Error\n- Lack of Knowledge\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 79440, "output_tokens": 265, "cost_micros": 242295, "latency_ms": 8516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll carefully examine pages 95-100 of the document to identify all error types mentioned in the case studies.\n\nLooking through the figures on these pages:\n\n**Page 95 (Figure 81):** Architecture and Engineering - Correct case (no error type)\n\n**Page 96 (Figure 82):** Architecture and Engineering - **Reasoning Error**\n\n**Page 97 (Figure 83):** Computer Science - Correct case (no error type)\n\n**Page 98 (Figure 84):** Computer Science - **Perceptual Error, Lack of Knowledge**\n\n**Page 99 (Figure 85):** Computer Science - **Perceptual Error**\n\n**Page 100 (Figure 86):** Computer Science - **Perceptual Error**\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 109191, "output_tokens": 192, "cost_micros": 330453, "latency_ms": 8281, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll search through pages 95-100 of the document for error types mentioned in case studies.\n\nLooking at the case studies in this range:\n\nPage 95 (Figure 81 - Architecture and Engineering): No error type specified (correct case)\n\nPage 96 (Figure 82 - Architecture and Engineering): **Reasoning Error**\n\nPage 97 (Figure 83 - Computer Science): No error type specified (correct case)\n\nPage 98 (Figure 84 - Computer Science): **Perceptual Error, Lack of Knowledge**\n\nPage 99 (Figure 85 - Computer Science): **Perceptual Error**\n\nPage 100 (Figure 86 - Computer Science): **Perceptual Error**\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 83154, "output_tokens": 184, "cost_micros": 252222, "latency_ms": 6655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "llamacloud_premium_lc", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "Looking through pages 95-100 of the document (Figures 82-86 in the case studies):\n\nFigure 82 (Architecture and Engineering: Civil Engineering) - **Reasoning Error**\n\nFigure 83 (Computer Science: Data Structure and Algorithm) - No error type mentioned (this is a correct case)\n\nFigure 84 (Computer Science: Compiler Principle) - **Perceptual Error, Lack of Knowledge**\n\nFigure 85 (Computer Science: Operating System) - **Perceptual Error**\n\nFigure 86 (Computer Science: Compiler Principle) - **Perceptual Error**\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 103248, "output_tokens": 158, "cost_micros": 312114, "latency_ms": 11889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll read the document to find the error types mentioned in the case studies on pages 95-100.Let me search for page numbers to locate pages 95-100:Let me search for \"case stud\" to find the case studies section:Let me read around line 5765 where Table 10 mentions case study figures with error categories:Good! I found Table 10 which shows the error categories. The columns are: Correct Case, Perception, Lack of Knowledge, Reasoning, and Other. Let me continue reading to see the complete table:Good! I found that the error categories are: Perception, Lack of Knowledge, Reasoning, and Other (plus \"Correct Case\" which is not an error). Now I need to find the actual case study pages 95-100. Let me search for figure 95 or case 95:Let me search for the case studies section more specifically. Let me look for the pattern of case numbers 95-100:Figure 96 is something different. Let me check the structure - from the earlier search, I saw \"90 Energy and Power 2: Reasoning Error 104\" and \"95 Mechanical Engineering 3: Re\". Let me search for these patterns directly:Good! So these are Figure numbers 90-95. Let me read that section where it lists figures 90-100:I can see figures 90-95 here. Now I need to find what comes after figure 95. Let me search earlier in the document to see the complete list:This appears to be a table of contents or index. I need to understand better - these numbers (95-100) might refer to page numbers or figure numbers. Let me look at this list more carefully. I can see:\n\n- Page 95: Figure 81 Architecture and Engineering 1: Correct Case\n- Page 96: Figure 83 Computer Science 1: Correct Case  \n- Page 97: ?\n- Page 98: Figure 84 Computer Science 2: Perceptual Error, Lack of Knowledge\n- Page 99: Figure 85 Computer Science 3: Perceptual Error\n- Page 100: Figure 86 Computer Science 4: Perceptual Error\n\nBut I need to verify this interpretation. Let me search for where these actual figures appear in the document:Perfect! Now let me read from page 95 to page 100. Let me find line numbers around page 95:Good, I'm at page 95. Now let me continue reading through pages 95-100:Good! I'm now at page 97. Let me continue reading to get to page 100:Good! I'm at page 99. Let me continue to page 100:Good! Now let me continue to see the rest of page 100:Perfect! Now I have reviewed pages 95-100. Let me compile all the error types I found in the case studies on these pages:\n\nFrom the case studies I read on pages 95-100:\n\n**Page 95:**\n- Figure 81: Correct Case (not an error)\n\n**Page 96:**\n- Figure 82: Reasoning Error\n\n**Page 97:**\n- Figure 83: Correct Case (not an error)\n\n**Page 98:**\n- Figure 84: Perceptual Error, Lack of Knowledge\n\n**Page 99:**\n- Figure 85: Perceptual Error\n\n**Page 100:**\n- Figure 86: Perceptual Error\n- Figure 87: Correct Case (not an error)\n\nNow let me verify by looking back at the table of contents I saw earlier to ensure I have the right mapping. Let me check once more:Perfect! This confirms my interpretation. The table shows:\n- Figure 81 is on pages 94-95\n- Figure 82 is on page 96 (spanning from before)\n- Figure 83 is on pages 96-97\n- Figure 84 is on page 98\n- Figure 85 is on page 99\n- Figure 86 is on page 100\n- Figure 87 is on page 101 (starting at page 100)\n\nSo the error types mentioned in case studies on pages 95-100 are:\n\n1. **Reasoning Error** (Figure 82 - Architecture and Engineering)\n2. **Perceptual Error** (Figure 84, 85, 86 - Computer Science)\n3. **Lack of Knowledge** (Figure 84 - Computer Science)\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 114937, "error": null, "extra": {"thread_id": 2029, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_77fb85ac92354e66b7280742dc24bcdb", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 755, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "native_pdf", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to examine pages 65-70 to count the successful cases (marked with a checkmark \u2713).\n\n**Page 65** (Figure 51): Clinical Medicine: Pathology - **Correct case** \u2713\n\n**Page 66** (Figure 52): Clinical Medicine: Pathology - **Correct case** \u2713\n\n**Page 67** (Figure 53): Clinical Medicine: Pathology - **Correct case** \u2713\n\n**Page 68** (Figure 54): Clinical Medicine: Pathology - **Error case** \u2717 (Perceptual Error)\n\n**Page 69** (Figure 55): Clinical Medicine: Heart Disease - **Error case** \u2717 (Lack of Knowledge)\n\n**Page 70** (Figure 56): Diagnostics and Laboratory Medicine: Radiology - **Correct case** \u2713\n\nCounting the successful cases (those with \u2713):\n- Page 65: 1\n- Page 66: 1\n- Page 67: 1\n- Page 68: 0\n- Page 69: 0\n- Page 70: 1\n\nTotal: 1 + 1 + 1 + 0 + 0 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 259120, "output_tokens": 295, "cost_micros": 1561358, "latency_ms": 64484, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_basic_lc", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful (correct) cases in pages 65-70 of the document.\n\nLet me go through each figure in this range:\n\n**Page 65:**\n- Figure 51: Clinical Medicine: Pathology - \"A sample **correct** case\" \u2713\n\n**Page 66:**\n- Figure 52: Clinical Medicine: Pathology - \"A sample **correct** case\" \u2713\n\n**Page 67:**\n- Figure 53: Clinical Medicine: Pathology - \"A sample **correct** case\" \u2713\n\n**Page 68:**\n- Figure 54: Clinical Medicine: Pathology - \"A sample **error** case\" \u2717\n\n**Page 69:**\n- Figure 55: Clinical Medicine: Heart Disease - \"A sample **error** case\" \u2717\n\n**Page 70:**\n- Figure 56: Diagnostics and Laboratory Medicine: Radiology - \"A sample **correct** case\" \u2713\n\nCounting the correct cases: Pages 65, 66, 67, and 70 each have one correct case.\n\nTotal successful cases = 4\n\nAnswer: 4", "citations": [], "input_tokens": 79424, "output_tokens": 257, "cost_micros": 242127, "latency_ms": 7094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "azure_premium_lc", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases (marked with \u2713 or \"V\") in Pages 65-70.\n\nLet me go through each figure in this page range:\n\n**Page 65:**\n- Figure 51: Clinical Medicine 1: Correct Case \u2713\n\n**Page 66:**\n- Figure 52: Clinical Medicine 2: Correct Case \u2713\n\n**Page 67:**\n- Figure 53: Clinical Medicine 3: Correct Case \u2713\n\n**Page 68:**\n- Figure 54: Clinical Medicine 4: Perceptual Error \u2717\n\n**Page 69:**\n- Figure 55: Clinical Medicine 5: Lack of Knowledge \u2717\n\n**Page 70:**\n- Figure 56: Diagnostics and Lab Medicine 1: Correct Case \u2713\n\nCounting the successful cases (marked with \u2713):\n- Page 65: 1\n- Page 66: 1\n- Page 67: 1\n- Page 68: 0\n- Page 69: 0\n- Page 70: 1\n\nTotal: 1 + 1 + 1 + 0 + 0 + 1 = 4\n\nAnswer: 4", "citations": [], "input_tokens": 109175, "output_tokens": 280, "cost_micros": 331725, "latency_ms": 8125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "llamacloud_basic_lc", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases in Pages 65-70 of the document.\n\nLet me go through each page:\n\n**Page 65 (Clinical Medicine 1):**\n- Figure 51: \"A sample correct case of Clinical Medicine (subfield: Pathology)\"\n- This is a CORRECT case \u2713\n\n**Page 66 (Clinical Medicine 2):**\n- Figure 52: \"A sample correct case of Clinical Medicine (subfield: Pathology)\"\n- This is a CORRECT case \u2713\n\n**Page 67 (Clinical Medicine 3):**\n- Figure 53: \"A sample correct case of Clinical Medicine (subfield: Pathology)\"\n- This is a CORRECT case \u2713\n\n**Page 68 (Clinical Medicine 4):**\n- Figure 54: \"A sample error case of Clinical Medicine (subfield: Pathology). Error category: Perceptual Error\"\n- This is an ERROR case \u2717\n\n**Page 69 (Clinical Medicine 5):**\n- Figure 55: \"A sample error case of Clinical Medicine (subfield: Heart Disease). Error category: Lack of Knowledge\"\n- This is an ERROR case \u2717\n\n**Page 70 (Diagnostics and Laboratory Medicine 1):**\n- Figure 56: \"A sample correct case of Diagnostics and Laboratory Medicine (subfield: Radiology)\"\n- This is a CORRECT case \u2713\n\nTotal successful (correct) cases in pages 65-70: 4\n\nAnswer: 4", "citations": [], "input_tokens": 83138, "output_tokens": 348, "cost_micros": 254634, "latency_ms": 8078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:48Z", "latency_ms": 11110, "error": null, "raw_text_chars": 1296}]}, "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases (correct predictions by GPT-4V) in Pages 65-70.\n\nLet me go through each figure in this range:\n\n**Page 65 - Figure 51 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (E) Familial polyposis coli\n- Ground Truth: (E) Familial polyposis coli\n- **SUCCESS**\n\n**Page 66 - Figure 52 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (D) Ruptured berry aneurysm\n- Ground Truth: (D) Ruptured berry aneurysm\n- **SUCCESS**\n\n**Page 67 - Figure 53 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 68 - Figure 54 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 69 - Figure 55 (Clinical Medicine: Heart Disease)**\n- GPT-4V answered (B) Mitral regurgitation\n- Ground Truth: (C) Aortic value regurgitation\n- Error Category: Lack of Knowledge\n- **FAILURE**\n\n**Page 70 - Figure 56 (Diagnostics and Laboratory Medicine: Radiology)**\n- GPT-4V correctly identified (C) Dandy-Walker syndrome\n- Ground Truth: (C) Dandy-Walker syndrome\n- **SUCCESS**\n\nTotal successful cases: 3\n\nAnswer: 3", "citations": [], "input_tokens": 103232, "output_tokens": 424, "cost_micros": 316056, "latency_ms": 11110, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "arm": "surfsense_agentic", "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I'll read the document mentioned in the priority list and look at pages 65-70 to find information about successful cases.I need to search for content related to pages 65-70. Let me search for page markers in the document.Let me search for page numbers in a different format:Let me search for the word \"successful\" or \"cases\" to understand the document structure:Let me search for \"cases\" or look for page numbers differently. Let me read more of the document to understand its structure:This is not helpful. Let me search for the actual page numbers in the document. PDF documents often have page numbers embedded. Let me try different patterns:I can see there's a \"Case Study\" section. Let me look for the appendix section that might have pages 65-70. Let me read the entire document more systematically. First, let me check how long the document is:Great! I found a list of case studies with page numbers. I can see this is a table mapping case numbers to their descriptions and page numbers. Let me continue reading to see pages 65-70:Perfect! I found the information. Looking at pages 65-70, I can see:\n\n- Page 65: Case 51 - Clinical Medicine 1: **Correct Case**\n- Page 66: Case 52 - Clinical Medicine 2: **Correct Case**\n- Page 67: Case 53 - Clinical Medicine 3: **Correct Case**\n- Page 68: Case 54 - Clinical Medicine 4: Perceptual Error\n- Page 69: Case 55 - Clinical Medicine 5: Lack of Knowledge\n- Page 70: Case 56 - Diagnostics and Lab Medicine 1: **Correct Case**\n\nCounting the \"Correct Case\" entries in pages 65-70:\n1. Page 65: Clinical Medicine 1: Correct Case\n2. Page 66: Clinical Medicine 2: Correct Case\n3. Page 67: Clinical Medicine 3: Correct Case\n4. Page 70: Diagnostics and Lab Medicine 1: Correct Case\n\nAnswer: 4", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 54718, "error": null, "extra": {"thread_id": 2030, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_77d82c91b9624d83aeec575d8461194f", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 347, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "4"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about how many key steps the authors divided causal inference into.\n\nLooking through the document, I can see in Section 3.2 \"Generating the Explanations\" on page 6, there is a discussion of step-by-step explanations and subskills:\n\n\"Apart from the question-answer pairs, we also generate the step-by-step explanations. Our goal is to provide all intermediate reasoning steps a student of causal inference would use to answer the questions, so that each necessary subskill necessary for causal inference can be evaluated individually. We identify the following six subskills: \u2460 causal graph extraction; \u2461 correct query type interpretation; \u2462 symbolic formalization of the query; \u2463 semantic parsing to compile the available data; \u2464 estimand derivation; and \u2465 arithmetic calculation to solve the estimand, as in the colored boxes in Figure 1.\"\n\nThis clearly states that they identified **six subskills** for causal inference.\n\nAdditionally, in Figure 1 on page 2, I can see these six steps are illustrated in the example.\n\nIn Section 4 \"Our CAUSALCOT Model\" on page 7, it also mentions \"We base our prompt design on the multi-step reasoning process of causal inference as shown in Figure 4, first starting with four preparation steps: \u2460 identifying the causal graph structure; \u2461 determining the causal query type; \u2462 formulating the query symbolically precisely; and \u2463 extracting relevant data from the prompt. Then, given all the information collected in the preparation stage, we introduce the formal solution: \u2464 correctly deducing the estimand using causal inference techniques; and finally \u2465 evaluating the estimand to answer the question.\"\n\nThis confirms the six steps again.\n\nAnswer: 6", "citations": [], "input_tokens": 75261, "output_tokens": 410, "cost_micros": 231933, "latency_ms": 16531, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find where the authors describe the steps they divided causal inference into.\n\nIn Figure 1, there is a section titled \"Correct steps to lead to the ground-truth answer:\" which shows 6 numbered steps:\n\n1) Parse the causal graph\n2) Classify the query type\n3) Formulate the query to its symbolic form\n4) Collect the available data\n5) Derive the estimand using causal inference\n6) Solve for the estimand by plugging in the relevant data\n\nThis is also confirmed in Section 3.2 where they discuss generating explanations: \"We identify the following six subskills: 1 causal graph extraction; 2 correct query type interpretation; 3 symbolic formalization of the query; 4 semantic parsing to compile the available data; 5 estimand derivation; and 6 arithmetic calculation to solve the estimand\"\n\nAdditionally, in Section 4 describing their CAUSALCOT model, they state: \"We base our prompt design on the multi-step reasoning process of causal inference as shown in Figure 4, first starting with four preparation steps... Then... the formal solution: 5 correctly deducing the estimand... and finally 6 evaluating the estimand\"\n\nAnswer: 6", "citations": [], "input_tokens": 31749, "output_tokens": 291, "cost_micros": 99612, "latency_ms": 9327, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find where the authors describe the steps they divided causal inference into.\n\nIn Section 4, \"Our CAUSALCOT Model,\" the authors describe their approach based on the causal inference engine. On page 7, they explicitly state:\n\n\"We base our prompt design on the multi-step reasoning process of causal inference as shown in Figure 4, first starting with four preparation steps: 1 identifying the causal graph structure; 2 determining the causal query type; 3 formulating the query symbolically precisely; and 4 extracting relevant data from the prompt. Then, given all the information collected in the preparation stage, we introduce the formal solution: 5 correctly deducing the estimand using causal inference techniques; and finally 6 evaluating the estimand to answer the question.\"\n\nThis clearly describes 6 steps:\n1. Identifying the causal graph structure\n2. Determining the causal query type\n3. Formulating the query symbolically precisely\n4. Extracting relevant data from the prompt\n5. Correctly deducing the estimand using causal inference techniques\n6. Evaluating the estimand to answer the question\n\nThis is also shown in Figure 1 and Figure 4, where 6 steps are illustrated.\n\nAnswer: 6", "citations": [], "input_tokens": 38818, "output_tokens": 294, "cost_micros": 120864, "latency_ms": 8828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find where the authors describe the key steps of causal inference.\n\nIn Section 3.2 \"Generating the Explanations,\" the document states:\n\n\"Our goal is to provide all intermediate reasoning steps a student of causal inference would use to answer the questions, so that each necessary subskill necessary for causal inference can be evaluated individually. We identify the following six subskills: \u2460 causal graph extraction; \u2461 correct query type interpretation; \u2462 symbolic formalization of the query; \u2463 semantic parsing to compile the available data; \u2464 estimand derivation; and \u2465 arithmetic calculation to solve the estimand, as in the colored boxes in Figure 1.\"\n\nThis is also reflected in Figure 4 which shows the CAUSALCOT model with 6 steps, and in Section 4 where they describe their prompting strategy that \"breaks down a causal reasoning problem into multiple symbolically-grounded, simpler steps.\"\n\nAnswer: 6", "citations": [], "input_tokens": 33630, "output_tokens": 227, "cost_micros": 104295, "latency_ms": 8516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, specifically at Figure 1 which shows the \"Correct steps to lead to the ground-truth answer,\" I can identify the key steps that the authors divided causal inference into.\n\nThe figure clearly shows 6 numbered steps:\n\n1. Parse the causal graph (Causal Relation Extraction)\n2. Classify the query type (Causal Question Classification)\n3. Formulate the query to its symbolic form (Formalization)\n4. Collect the available data (Semantic Parsing)\n5. Derive the estimand using causal inference (Formal Causal Inference)\n6. Solve for the estimand by plugging in the relevant data (Arithmetics)\n\nThis is further confirmed in Section 3.2 where the explanation generation is described, and in Section 4 where the CAUSALCOT model is introduced, which follows these same 6 steps.\n\nAnswer: 6", "citations": [], "input_tokens": 39832, "output_tokens": 211, "cost_micros": 122661, "latency_ms": 8250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q000", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "6", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the provided document to find information about the key steps the author divided causal inference into.Let me search for information about how the author divided the process or steps:Perfect! I found the answer. The text clearly states that the author divided causal inference into 6 steps. Let me verify this by reading a bit more context:Perfect! The document clearly shows that the author divided causal inference into **6 steps**:\n\n1. Extract the causal graph\n2. Determine the query type\n3. Formalize the query\n4. Gather all relevant data\n5. Deduce the estimand using causal inference\n6. Calculate the estimand\n\nThe text explicitly states: \"first starting with four preparation steps: 1 identifying the causal graph structure; 2 determining the causal query type; 3 formulating the query symbolically precisely; and 4 extracting relevant data from the prompt. Then... 5 correctly deducing the estimand using causal inference techniques; and finally 6 evaluating the estimand to answer the question.\"\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 44937, "error": null, "extra": {"thread_id": 2031, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_1b14a0eb01624709bf2cbb4b451f998a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 221, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for information about subskills related to collecting available data...\n\nIn Figure 1 on page 2, I can see the step-by-step reasoning process for answering a causal inference question. Step 4 is labeled \"Collect the available data\" and it has an associated subskill label.\n\nThe subskill associated with Step 4 \"Collect the available data\" is **Semantic Parsing**.\n\nThis is also confirmed in Section 3.2 where it mentions the six subskills:\n- \u2460 causal graph extraction\n- \u2461 correct query type interpretation\n- \u2462 symbolic formalization of the query\n- \u2463 semantic parsing to compile the available data\n- \u2464 estimand derivation\n- \u2465 arithmetic calculation\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 75267, "output_tokens": 185, "cost_micros": 228576, "latency_ms": 32921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, which shows the step-by-step process for answering a causal inference question, Step 4 is labeled as \"Collect the available data:\" and the corresponding subskill is clearly marked as \"Semantic Parsing\".\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 31755, "output_tokens": 68, "cost_micros": 96285, "latency_ms": 4983, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically Figure 1 which shows the step-by-step explanation for solving a causal inference question:\n\nStep 4 is labeled \"Collect the available data\" and the subskill required for this step is identified as \"Semantic Parsing\" (shown in the colored box in the figure).\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 38824, "output_tokens": 79, "cost_micros": 117657, "latency_ms": 5264, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically Figure 1 which shows the step-by-step explanation, and Section 3.2 which discusses \"Generating the Explanations,\" I can identify the subskills mentioned.\n\nStep 4 in Figure 1 is labeled \"Collect the available data\" and the document describes this as requiring \"semantic parsing to compile the available data\" in Section 3.2.\n\nAnswer: semantic parsing", "citations": [], "input_tokens": 33636, "output_tokens": 96, "cost_micros": 102348, "latency_ms": 5125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically Figure 1 which shows the \"Correct steps to lead to the ground-truth answer,\" I can see that Step 4 is labeled \"Collect the available data\" and the subskill associated with it is \"Semantic Parsing.\"\n\nThis is also confirmed in Section A.8 (Explanation Template) where Step \u2463 states: \"Collect all the available data: The available data are: '$d$'.\"\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 39838, "output_tokens": 109, "cost_micros": 121149, "latency_ms": 5516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q002", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "semantic parsing", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the priority document to find the answer to your question about what subskill is needed to collect the available data.Let me search for more specific terms related to data collection:Perfect! I found the answer. Let me read that section more carefully to get the complete context:Based on the document, I can now provide the answer. The document clearly shows that step 4 \"Collect the available data\" has the subskill listed as \"Semantic Parsing\".\n\nAnswer: Semantic Parsing", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32687, "error": null, "extra": {"thread_id": 2032, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_46f68e1513d341d0b0ee21cfa0dae2f0", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 137, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "semantic parsing", "normalised_gold": "semantic parsing"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about the rungs of the Ladder of Causation...\n\nFrom Section 2.1 \"The Ladder of Causation\" on page 3:\n\nThe document describes three distinct rungs:\n- Rung 1 (\"seeing\"): Statistical associations\n- Rung 2 (\"doing\"): Interventions\n- Rung 3 (\"imagining\"): Counterfactuals\n\nRung 2 is specifically described as dealing with interventions and the do-operator.\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 75267, "output_tokens": 131, "cost_micros": 227766, "latency_ms": 31077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, Section 2.1 \"The Ladder of Causation\" describes the three rungs:\n\n- Rung 1 (\"seeing\") - statistical associations\n- Rung 2 (\"doing\") - interventions\n- Rung 3 (\"imagining\") - counterfactuals\n\nThe document states that Rung 2 \"enables us to formalize the concept of actively intervening in the world.\"\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 31755, "output_tokens": 111, "cost_micros": 96930, "latency_ms": 5344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document for information about the second rung of the ladder of causation...\n\nFrom Section 2.1 \"The Ladder of Causation\":\n\n\"Rung 2 (\"doing\"). This enables us to formalize the concept of actively intervening in the world, and modifying it toward some end (\"If I take an aspirin now, will my headache subside?\"). Interventions can be formalized using the do-operator [24] and Causal Bayesian Networks [67]...\"\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 38824, "output_tokens": 126, "cost_micros": 118362, "latency_ms": 6250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically Section 2.1 which describes \"The Ladder of Causation\":\n\nThe three rungs are:\n- Rung 1 (\"seeing\") - statistical associations\n- Rung 2 (\"doing\") - interventions\n- Rung 3 (\"imagining\") - counterfactuals\n\nRung 2 is explicitly described as dealing with interventions, using the do-operator to formalize the concept of actively intervening in the world.\n\nAnswer: Intervention (or \"doing\")", "citations": [], "input_tokens": 33636, "output_tokens": 122, "cost_micros": 102738, "latency_ms": 5062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically Section 2.1 \"The Ladder of Causation\":\n\n**Rung 2 (\"doing\").** This enables us to formalize the concept of actively intervening in the world, and modifying it toward some end (\"*If I take an aspirin now, will my headache subside?*\"). Interventions can be formalized using the *do-operator* [24] and *Causal Bayesian Networks* [67] to represent, for example, the distribution over $Y$ when intervening on $X$ to set its value to $x$ as $P(Y = y|do(X = x))$.\n\nAnswer: Intervention or \"doing\"", "citations": [], "input_tokens": 39838, "output_tokens": 159, "cost_micros": 121899, "latency_ms": 8094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.5, "method": "str_norm", "normalised_pred": "intervention or doing", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q003", "doc_id": "2312.04350v3.pdf", "answer_format": "str", "gold": "intervention", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the priority document to find the answer to your question about the second ladder of causation.Let me search for \"ladder of causation\" in the document:Let me search for \"second rung\" or information about the rungs:Good! I found a reference at line 553. Let me read that section:Perfect! I found the answer. The document clearly states that the three rungs of the Ladder of Causation are:\n- Rung 1: associational\n- Rung 2: interventional\n- Rung 3: counterfactual\n\nAnswer: interventional", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43718, "error": null, "extra": {"thread_id": 2033, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_7af68403347145c59b6ebb40237510ed", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 215, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.0, "method": "str_norm", "normalised_pred": "interventional", "normalised_gold": "intervention"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the chain-type causal graph and count its directed edges.\n\nLooking through the document, I can find the causal graph structures in Figure 6 on page 19, which shows \"List of all ten causal graphs with treatment-effect pairs (CGTEs).\"\n\nIn Figure 6, I can see a \"Chain\" graph structure. The Chain graph shows:\n- X \u2192 (middle node) \u2192 Y\n\nThis represents a chain with three nodes connected by two directed edges:\n1. One edge from X to the middle node\n2. One edge from the middle node to Y\n\nTherefore, a chain-type causal graph has 2 directed edges.\n\nAnswer: 2", "citations": [], "input_tokens": 75271, "output_tokens": 156, "cost_micros": 228153, "latency_ms": 30437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the chain-type causal graph structure.\n\nIn Figure 6 on page 19, titled \"List of all ten causal graphs with treatment-effect pairs (CGTEs)\", I can see the \"Chain\" graph structure.\n\nThe Chain graph shows: X \u2192 M \u2192 Y\n\nThis is a simple chain structure where:\n- X has an arrow to M (1 edge)\n- M has an arrow to Y (1 edge)\n\nTherefore, there are 2 directed edges in a chain-type causal graph.\n\nAnswer: 2", "citations": [], "input_tokens": 31759, "output_tokens": 133, "cost_micros": 97272, "latency_ms": 6187, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the chain-type causal graph structure.\n\nIn Figure 6 on page 19, there is a table titled \"List of all ten causal graphs with treatment-effect pairs (CGTEs)\" which shows:\n\nFor the \"Chain\" causal graph:\n- CGTE Alias: Chain\n- Treatment-Effect Pair: X \u2192 Y\n\nThe diagram shows a simple chain structure: X \u2192 Z \u2192 Y\n\nThis is a chain with 3 nodes (X, Z, Y) connected sequentially.\n\nCounting the directed edges:\n1. X \u2192 Z (one edge)\n2. Z \u2192 Y (one edge)\n\nTotal: 2 directed edges\n\nAnswer: 2", "citations": [], "input_tokens": 38828, "output_tokens": 165, "cost_micros": 118959, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the chain-type causal graph structure.\n\nIn Figure 6 on page 18, there is a \"List of all ten causal graphs with treatment-effect pairs (CGTEs).\" The table shows:\n\n| Causal Graph | CGTE Alias | Treatment-Effect Pair |\n| ------------ | ----------- | --------------------- |\n| Chain        | Chain       | X Y                   |\n\nThe visual representation shows: X \u2192 M \u2192 Y\n\nThis indicates a chain structure where X points to M (a mediator), and M points to Y.\n\nTherefore, there are 2 directed edges in a chain-type causal graph: one from X to M, and one from M to Y.\n\nAnswer: 2", "citations": [], "input_tokens": 33640, "output_tokens": 174, "cost_micros": 103530, "latency_ms": 6796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find the definition of a chain-type causal graph.\n\nIn Section A.3 \"Collection of Causal Graphs,\" there is a table (Figure 6) that lists all causal graph types with their structures.\n\nFor the \"Chain\" graph, the table shows:\n- Causal Graph Alias: Chain\n- CGTE Alias: Chain  \n- Treatment-Effect Pair: X \u2192 Z \u2192 Y (X and Y are treatment-effect pair)\n\nThe notation X \u2192 Z \u2192 Y indicates:\n- One edge from X to Z\n- One edge from Z to Y\n\nTherefore, there are 2 directed edges in a chain-type causal graph.\n\nAnswer: 2", "citations": [], "input_tokens": 39842, "output_tokens": 164, "cost_micros": 121986, "latency_ms": 7343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q005", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "2", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the definition of a chain-type causal graph and count the directed edges.Let me search for \"chain-type\" or \"causal graph\" in the document:Let me search for \"causal graph\" to find the definition:Let me read the section around line 2387 where the causal graph definition appears:Now let me search for \"chain\" with \"graph\" or look for specific graph types. Let me check Figure 6 which shows all the causal graphs:Good! I can see \"Chain\" is listed as one of the causal graphs at line 2101-2104. Now let me look for the actual visualization or definition of the Chain graph. Let me check if there's a figure showing these graphs:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 48766, "error": null, "extra": {"thread_id": 2034, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ca8eb7f650264585a1c5b7287fc4aa99", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 247, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2387", "normalised_gold": "2"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "native_pdf", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which model performs best on the CLADDER dataset by looking at the performance tables in the document.\n\nLooking at Table 2 on page 8, which shows \"Performance of all models on our CLADDER dataset v1.5\":\n\nThe \"Overall Acc.\" column shows:\n- Random: 49.27\n- LLaMa: 44.03\n- Alpaca: 44.66\n- GPT-3 Non-Instr. (davinci): 49.92\n- GPT-3 Instr. (text-davinci-001): 51.40\n- GPT-3 Instr. (text-davinci-002): 53.15\n- GPT-3 Instr. (text-davinci-003): 56.26\n- GPT-3.5: 52.18\n- GPT-4: 62.03\n- + CAUSALCOT: 70.40\n\nThe highest overall accuracy is achieved by CAUSALCOT with 70.40%.\n\nLooking at the document, CAUSALCOT is described as \"GPT-4 + CAUSALCOT\" in the context, which means it's GPT-4 enhanced with their causal chain-of-thought prompting strategy.\n\nThe best performing model is CAUSALCOT (which uses GPT-4 as the base model) with an accuracy of 70.40%.\n\nAnswer: 70", "citations": [], "input_tokens": 75259, "output_tokens": 336, "cost_micros": 230817, "latency_ms": 24141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.04350v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "azure_basic_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the performance results in Table 2 (on page 8), I need to find which model achieves the highest overall accuracy on the CLADDER dataset v1.5.\n\nFrom Table 2:\n- Random: 49.27%\n- LLaMa: 44.03%\n- Alpaca: 44.66%\n- GPT-3 Non-Instr. (davinci): 49.92%\n- GPT-3 Instr. (text-davinci-001): 51.40%\n- GPT-3 Instr. (text-davinci-002): 53.15%\n- GPT-3 Instr. (text-davinci-003): 56.26%\n- GPT-3.5: 52.18%\n- GPT-4: 62.03%\n- **+ CAUSALCOT: 70.40%**\n\nThe model with the highest overall accuracy is **CAUSALCOT** (which is GPT-4 with the CausalCoT prompting strategy) at 70.40%.\n\nAnswer: 4", "citations": [], "input_tokens": 31747, "output_tokens": 263, "cost_micros": 99186, "latency_ms": 6937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "azure_premium_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the performance results in the document:\n\nFrom Table 2 (CLADDER v1.5 - the main/current version):\n- GPT-4 + CAUSALCOT achieves 70.40% accuracy, which is the highest performance among all models tested\n\nFrom Table 6 (CLADDER v1.0 - earlier version):\n- GPT-4 + CAUSALCOT achieves 66.64% accuracy, which is also the highest\n\nThe document clearly states on page 3: \"our CAUSALCOT achieves an accuracy of 70.40%, which substantially improves the performance of vanilla GPT-4 by 8.37 points on CLADDER.\"\n\nAnd on page 8: \"And amongst all, our CAUSALCOT achieves the highest performance of 70.40%\"\n\nCAUSALCOT is described as a chain-of-thought prompting strategy used with GPT-4, so the best performing model is GPT-4 with CAUSALCOT.\n\nSince GPT-4 is mentioned to have approximately 1T (1 trillion) parameters, and CAUSALCOT is a prompting strategy applied to GPT-4:\n\nAnswer: 1000000000000", "citations": [], "input_tokens": 38816, "output_tokens": 288, "cost_micros": 120768, "latency_ms": 8750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1000000000000", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "llamacloud_basic_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which model performs best on the CLADDER dataset according to the document.\n\nLooking at Table 2 (page 8), which shows \"Performance of all models on our CLADDER dataset v1.5\", I can see the overall accuracy for each model:\n\n- Random: 49.27%\n- LLaMa: 44.03%\n- Alpaca: 44.66%\n- GPT-3 Non-Instr. (davinci): 49.92%\n- GPT-3 Instr. (text-davinci-001): 51.40%\n- GPT-3 Instr. (text-davinci-002): 53.15%\n- GPT-3 Instr. (text-davinci-003): 56.26%\n- GPT-3.5: 52.18%\n- GPT-4: 62.03%\n- + CAUSALCOT: 70.40%\n\nThe model with the highest overall accuracy is \"GPT-4 + CAUSALCOT\" with 70.40%.\n\nThe text also states: \"our CAUSALCOT achieves the highest performance of 70.40%, which is substantially better than the vanilla GPT-4 by 8.37 points on CLADDER.\"\n\nSince CAUSALCOT is based on GPT-4 (as stated in the document: \"We build our CAUSALCOT prompting strategy using GPT-4\"), the best performing model is GPT-4 when using the CAUSALCOT prompting strategy.\n\nThe overall accuracy is 70.40%, which rounds to 70%.\n\nAnswer: 70", "citations": [], "input_tokens": 33628, "output_tokens": 390, "cost_micros": 106734, "latency_ms": 9766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "llamacloud_premium_lc", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I need to find which model performs best on the CLADDER dataset by looking at the performance tables in the document.\n\nLooking at Table 2 (Performance of all models on CLADDER dataset v1.5), which shows the main results:\n\nThe models and their Overall Accuracy are:\n- Random: 49.27\n- LLaMa: 44.03\n- Alpaca: 44.66\n- GPT-3 Non-Instr. (davinci): 49.92\n- GPT-3 Instr. (text-davinci-001): 51.40\n- GPT-3 Instr. (text-davinci-002): 53.15\n- GPT-3 Instr. (text-davinci-003): 56.26\n- GPT-3.5: 52.18\n- GPT-4: 62.03\n- + CAUSALCOT: 70.40\n\nThe model with the highest overall accuracy is \"+ CAUSALCOT\" at 70.40%.\n\nCAUSALCOT is described in the paper as a chain-of-thought prompting strategy built on top of GPT-4, so it's GPT-4 enhanced with their CAUSALCOT method.\n\nThe accuracy is 70.40%, which as a percentage would be approximately 70 (rounding to the nearest integer).\n\nAnswer: 70", "citations": [], "input_tokens": 39830, "output_tokens": 325, "cost_micros": 124365, "latency_ms": 8687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.04350v3.pdf::Q006", "doc_id": "2312.04350v3.pdf", "answer_format": "int", "gold": "GPT-4", "pages": 28, "document_id": 7594, "arm": "surfsense_agentic", "question_id": "2312.04350v3.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find information about model performance on Cladder.Let me search for \"Cladder\" in the document to find the relevant information:Now let me search for tables or results sections that might contain performance metrics:Perfect! I found Table 2 which shows the performance of all models on the CLADDER dataset. Let me read that section:Perfect! I can see Table 2 which shows the performance of all models on the CLADDER dataset. Looking at the \"Overall Acc.\" column:\n\n- Random: 49.27\n- LLaMa: 44.03\n- Alpaca: 44.66\n- GPT-3 Non-Instr. (davinci): 49.92\n- GPT-3 Instr. (text-davinci-001): 51.40\n- GPT-3 Instr. (text-davinci-002): 53.15\n- GPT-3 Instr. (text-davinci-003): 56.26\n- GPT-3.5: 52.18\n- GPT-4: 62.03\n- +CAUSALCOT: 70.40\n\nThe model that performs the best on Cladder is \"+CAUSALCOT\" with an overall accuracy of 70.40.\n\nAnswer: 70", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36610, "error": null, "extra": {"thread_id": 2035, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e78cd64845f341ad917f431928ce6d64", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 165, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "70", "normalised_gold": "-4"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 on page 2:\n\nThe first subfigure shows \"Traditional ML\" where the supervisor (human) is above/smarter than the student (model).\n\nThe second subfigure shows \"Superalignment\" where the supervisor (human) is below/weaker than the student (superhuman model).\n\nThe difference is the relative capability levels are reversed - in Traditional ML the supervisor is stronger, while in Superalignment the student is stronger.\n\nAnswer: Supervisor is stronger than student (vs. student stronger than supervisor)", "citations": [], "input_tokens": 121916, "output_tokens": 121, "cost_micros": 367563, "latency_ms": 32577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.11764705882352941, "method": "str_norm", "normalised_pred": "supervisor is stronger than student vs student stronger than supervisor", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document:\n\nThe figure shows three scenarios comparing Traditional ML, Superalignment, and \"Our Analogy.\"\n\nIn the first subfigure (Traditional ML), the supervisor (human) is above the human level line, while the student (model) is below the human level line.\n\nIn the second subfigure (Superalignment), the supervisor (human) is at the human level line, while the student (model) is far above the human level line.\n\nThe key difference is that in Traditional ML, the supervisor is stronger than/above the student, whereas in Superalignment, the student is much stronger than the supervisor.\n\nAnswer: Supervisor is stronger than student (versus student stronger than supervisor)", "citations": [], "input_tokens": 45459, "output_tokens": 162, "cost_micros": 138807, "latency_ms": 5828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.11764705882352941, "method": "str_norm", "normalised_pred": "supervisor is stronger than student versus student stronger than supervisor", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document:\n\nThe first subfigure shows \"Traditional ML\" where the **supervisor is stronger than the student** (supervisor is above the human level line, student is below it).\n\nThe second subfigure shows \"Superalignment\" where the **supervisor is weaker than the student** (supervisor is at human level, student is above it - superhuman).\n\nAnswer: Supervisor is stronger than student (vs weaker than student)", "citations": [], "input_tokens": 49232, "output_tokens": 103, "cost_micros": 149241, "latency_ms": 5391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.25, "method": "str_norm", "normalised_pred": "supervisor is stronger than student vs weaker than student", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document:\n\nThe first subfigure shows \"Traditional ML\" where the supervisor is at human level and the student is below human level.\n\nThe second subfigure shows \"Superalignment\" where the supervisor is at human level and the student is above human level (superhuman).\n\nThe key difference is the student's capability relative to human level.\n\nAnswer: Student is weaker than human vs student is stronger than human", "citations": [], "input_tokens": 47601, "output_tokens": 99, "cost_micros": 144288, "latency_ms": 4594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.2222222222222222, "method": "str_norm", "normalised_pred": "student is weaker than human vs student is stronger than human", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to compare the first subfigure (Traditional ML) with the second subfigure (Superalignment).\n\nIn the first subfigure (Traditional ML):\n- Supervisor: Human\n- Student: Robot\n- The human is stronger/smarter than the robot\n\nIn the second subfigure (Superalignment):\n- Supervisor: Human  \n- Student: Superhuman Robot\n- The human is weaker/less capable than the superhuman robot\n\nThe key difference is the relative capability between supervisor and student.\n\nAnswer: Supervisor is stronger than student", "citations": [], "input_tokens": 89555, "output_tokens": 133, "cost_micros": 270660, "latency_ms": 6703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.16666666666666666, "method": "str_norm", "normalised_pred": "supervisor is stronger than student", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q000", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "\"humans supervise models are weaker than humans\"", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and analyze the difference between the first and second subfigures regarding supervisor-student relationships.Let me search for Figure 1 in the document:Now let me read the actual Figure 1 caption and description more carefully:Based on the figure caption and structure, I can see that Figure 1 has three subfigures:\n1. Traditional ML\n2. Superalignment  \n3. Our Analogy\n\nThe question asks about the difference between the first subfigure (Traditional ML) and the second subfigure (Superalignment) regarding the supervisor-student relationship.\n\nFrom the caption: \"Traditional ML focuses on the setting where humans supervise models that are weaker than humans. For the ultimate superalignment problem, humans will have to supervise models much smarter than them.\"\n\nThe key difference is about the relative capability/intelligence level between supervisor and student:\n- In Traditional ML (first subfigure): Supervisor is stronger/smarter than student\n- In Superalignment (second subfigure): Supervisor is weaker than student (students are \"much smarter\" than supervisors)\n\nAnswer: Supervisor is stronger than student", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 43967, "error": null, "extra": {"thread_id": 2036, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_e51022c429054652ae2e24b4002aba31", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 203, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.16666666666666666, "method": "str_norm", "normalised_pred": "supervisor is stronger than student", "normalised_gold": "humans supervise models are weaker than humans"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to count the figures from Page 3 to Page 10 in this document.\n\nLet me go through each page:\n\n**Page 3:**\n- Figure 2: Shows weak-to-strong generalization results with three panels (NLP Tasks, Chess Puzzles, ChatGPT Reward Modeling)\n\n**Page 4:**\n- No figures on this page (Related Work section)\n\n**Page 5:**\n- No figures on this page (Related Work continued)\n\n**Page 6:**\n- No figures on this page (Methodology section)\n\n**Page 7:**\n- No figures on this page (Main Results section continues)\n\n**Page 8:**\n- Figure 3: Shows weak-to-strong generalization with naive finetuning (6 panels labeled a-f)\n\n**Page 9:**\n- Figure 4: Shows bootstrapping results on chess puzzles (2 panels labeled a-b)\n\n**Page 10:**\n- Figure 5: Shows NLP results with auxiliary loss (2 panels labeled a-b)\n\nTotal figures from Page 3 to Page 10:\n- Figure 2 (Page 3)\n- Figure 3 (Page 8)\n- Figure 4 (Page 9)\n- Figure 5 (Page 10)\n\nAnswer: 4", "citations": [], "input_tokens": 121899, "output_tokens": 301, "cost_micros": 370212, "latency_ms": 35593, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to count all the figures that appear on pages 3 through 10 of this document.\n\nLet me go through each page:\n\n**Page 3:**\n- Figure 1: An illustration of methodology (Traditional ML, Superalignment, Our Analogy)\n- Figure 2: Shows test accuracy on representative NLP task, Chess Puzzles, and ChatGPT Reward Modeling\n\n**Page 4:**\n- No figures visible on this page (contains related work section)\n\n**Page 5:**\n- No figures visible on this page (continues related work section)\n\n**Page 6:**\n- No figures visible on this page (continues related work section)\n\n**Page 7:**\n- No figures visible on this page (contains methodology section)\n\n**Page 8:**\n- Figure 3: Shows weak-to-strong generalization results with multiple panels (a,b,c,d,e,f)\n- Figure 4: Shows bootstrapping results on chess puzzles\n\n**Page 9:**\n- No figures visible on this page (continues main results section)\n\n**Page 10:**\n- Figure 5: Shows NLP results with auxiliary loss\n- Figure 6: Shows auxiliary loss improving generalization across datasets\n\nCounting the figures from pages 3-10:\n- Page 3: 2 figures (Figure 1, Figure 2)\n- Page 8: 2 figures (Figure 3, Figure 4)\n- Page 10: 2 figures (Figure 5, Figure 6)\n\nTotal: 6 figures\n\nAnswer: 6", "citations": [], "input_tokens": 45442, "output_tokens": 356, "cost_micros": 141666, "latency_ms": 8421, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll count the figures from Page 3 to Page 10 in this document.\n\n**Page 3:**\n- Figure 2: \"An illustration of our methodology...\" showing weak-to-strong generalization results\n\n**Page 4:**\n- None\n\n**Page 5:**\n- None\n\n**Page 6:**\n- None\n\n**Page 7:**\n- Figure 3: \"Promising weak-to-strong generalization...\" showing test accuracy panels (a,b,c,d,e,f)\n\n**Page 8:**\n- None\n\n**Page 9:**\n- Figure 4: \"Bootstrapping improves weak-to-strong generalization...\" showing panels (a,b)\n\n**Page 10:**\n- Figure 5: \"Substantially improved generalization...\" showing panels (a,b)\n- Figure 6: \"Simple auxiliary loss improves generalization...\" showing multiple test accuracy plots\n\nLet me count:\n- Page 3: 1 figure (Figure 2)\n- Page 7: 1 figure (Figure 3)\n- Page 9: 1 figure (Figure 4)\n- Page 10: 2 figures (Figure 5 and Figure 6)\n\nTotal: 1 + 1 + 1 + 2 = 5 figures\n\nAnswer: 5", "citations": [], "input_tokens": 49215, "output_tokens": 303, "cost_micros": 152190, "latency_ms": 7608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll count the figures from page 3 to page 10 of this paper.\n\nLet me go through the document systematically:\n\n**Page 3:** Figure 1 (An illustration of methodology showing Traditional ML, Superalignment, and Our Analogy)\n\n**Page 4:** Figure 2 (Strong models trained with weak supervision generalize beyond their supervisor - showing 3 subplots for NLP Task, Chess, and ChatGPT Reward Modeling)\n\n**Page 5-6:** No figures visible in the main text sections shown\n\n**Page 7:** Figure 3 (mentioned in the text - showing test accuracy and performance gap recovered for NLP tasks, chess puzzles, and ChatGPT reward modeling - this appears to be on page 8 based on the layout)\n\n**Page 8:** Figure 3 appears here (6 subplots - a,b,c showing test accuracy and d,e,f showing performance gap recovered)\n\n**Page 9:** Figure 4 (Bootstrapping improves weak-to-strong generalization on chess puzzles - 2 subplots)\n\n**Page 10:** Figure 5 (Substantially improved generalization on NLP datasets with auxiliary loss - 2 subplots)\n\nCounting the figures:\n- Page 3: Figure 1\n- Page 4: Figure 2  \n- Page 8: Figure 3\n- Page 9: Figure 4\n- Page 10: Figure 5\n\nTotal figures from page 3 to page 10: 5 figures\n\nAnswer: 5", "citations": [], "input_tokens": 47584, "output_tokens": 358, "cost_micros": 148122, "latency_ms": 9766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll count all figures from Page 3 to Page 10.\n\n**Page 3:**\n- Figure 2: Strong models trained with weak supervision generalize beyond their supervisor (bar chart)\n\n**Page 4:**\n- None\n\n**Page 5:**\n- None\n\n**Page 6:**\n- None\n\n**Page 7:**\n- None\n\n**Page 8:**\n- Figure 3: Promising weak-to-strong generalization with naive finetuning (grid of heatmaps/tables - appears as panels a, b, c, d, e, f)\n\n**Page 9:**\n- Figure 4: Bootstrapping improves weak-to-strong generalization on chess puzzles (2 panels: a and b)\n\n**Page 10:**\n- Figure 5: Substantially improved generalization on NLP datasets with a simple auxiliary loss (2 panels: a and b)\n\nTotal count:\n- Figure 2: 1\n- Figure 3: 1 (though it has 6 panels, it's one figure)\n- Figure 4: 1\n- Figure 5: 1\n\nAnswer: 4", "citations": [], "input_tokens": 89538, "output_tokens": 261, "cost_micros": 272529, "latency_ms": 9000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q002", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "5", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to count the figures from Page 3 to Page 10.Now I need to identify which figures appear between Page 3 and Page 10. Let me read the relevant sections more carefully:Now let me check what's on page 11 to confirm the boundary:Based on my analysis, let me count the figures that appear from Page 3 to Page 10:\n\n- **Page 3**: No figures (only text - Related Work section starts)\n- **Page 4**: No figures (Related Work section continues)  \n- **Page 5**: 1 figure (the PGR illustration figure showing weak performance, weak-to-strong performance, strong ceiling performance)\n- **Page 6**: No figures (text only)\n- **Page 7**: 1 figure (Figure 3 - showing weak-to-strong generalization results across NLP, Chess, and ChatGPT RM)\n- **Page 8**: 1 figure (Figure 4 - Bootstrapping improves weak-to-strong generalization on chess puzzles)\n- **Page 9**: 1 figure (Figure 5 - NLP with auxiliary loss)\n- **Page 10**: 1 figure (Figure 6 - Simple auxiliary loss improves generalization across most datasets)\n\nTotal figures from Page 3 to Page 10: 5 figures\n\nAnswer: 5", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 55516, "error": null, "extra": {"thread_id": 2037, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ad05d8ff7ccf4712bbe93b3462e811bc", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 271, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to locate Figure 3 in the document and count the number of subfigures it contains.\n\nLooking at the document, Figure 3 appears on page 8. The figure is titled \"Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task.\"\n\nThe figure contains:\n- Panel (a): NLP Tasks - test accuracy plot\n- Panel (b): Chess Puzzles - test accuracy plot  \n- Panel (c): ChatGPT Reward Modeling - test accuracy plot\n- Panel (d): NLP Tasks - performance gap recovered plot\n- Panel (e): Chess Puzzles - performance gap recovered plot\n- Panel (f): ChatGPT Reward Modeling - performance gap recovered plot\n\nThe figure has 6 labeled panels/subfigures (a, b, c, d, e, f).\n\nAnswer: 6", "citations": [], "input_tokens": 121893, "output_tokens": 212, "cost_micros": 368859, "latency_ms": 54641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking at the document, I can find Figure 3 on page 8. The caption reads:\n\n\"Figure 3: Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task. (a,b,c) Test accuracy as a function of strong student size on (a) NLP tasks, (b) chess puzzles, and (c) the ChatGPT reward modeling task. Accuracy of strong students trained with ground truth in black, accuracy of strong students trained with weak supervision shown with colored lines (hue indicates size of weak supervisor). (d,e,f) Same as panels a,b,c but for performance gap recovered (see Section 3 for details).\"\n\nThe caption explicitly mentions panels (a,b,c) and (d,e,f), which indicates there are 6 subfigures total in Figure 3.\n\nLooking at the actual figure layout, I can see:\n- (a) NLP Tasks\n- (b) Chess Puzzles  \n- (c) ChatGPT Reward Modeling\n- (d) NLP Tasks (PGR)\n- (e) Chess Puzzles (PGR)\n- (f) ChatGPT Reward Modeling (PGR)\n\nAnswer: 6", "citations": [], "input_tokens": 45436, "output_tokens": 322, "cost_micros": 141138, "latency_ms": 8859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking at the document, I can find Figure 3 which is titled \"Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task.\"\n\nThe figure caption describes panels (a,b,c) and (d,e,f):\n- (a,b,c) Test accuracy as a function of strong student size on (a) NLP tasks, (b) chess puzzles, and (c) the ChatGPT reward modeling task\n- (d,e,f) Same as panels a,b,c but for performance gap recovered\n\nSo Figure 3 has 6 subfigures total: (a), (b), (c), (d), (e), and (f).\n\nAnswer: 6", "citations": [], "input_tokens": 49209, "output_tokens": 201, "cost_micros": 150642, "latency_ms": 6328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking at the document, I can see Figure 3 is described as:\n\n\"Figure 3: Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task. (a,b,c) Test accuracy as a function of strong student size on (a) NLP tasks, (b) chess puzzles, and (c) the ChatGPT reward modeling task. Accuracy of strong students trained with ground truth in black, accuracy of strong students trained with weak supervision shown with colored lines (hue indicates size of weak supervisor). (d,e,f) Same as panels a,b,c but for performance gap recovered (see Section 3 for details).\"\n\nThe description explicitly mentions:\n- (a), (b), (c) - three subfigures showing test accuracy\n- (d), (e), (f) - three subfigures showing performance gap recovered\n\nThis gives us a total of 6 subfigures in Figure 3.\n\nAnswer: 6", "citations": [], "input_tokens": 47578, "output_tokens": 255, "cost_micros": 146559, "latency_ms": 8389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find Figure 3 in the document and count its subfigures.\n\nLooking through the document for Figure 3...\n\nFound it! Figure 3 is titled \"Promising weak-to-strong generalization with naive finetuning on NLP tasks and chess, but poor generalization on the ChatGPT reward modeling task.\"\n\nThe figure consists of:\n- (a) NLP Tasks - showing test accuracy\n- (b) Chess Puzzles - showing test accuracy  \n- (c) ChatGPT Reward Modeling - showing test accuracy\n- (d) Performance Gap Recovered (%) - NLP Tasks\n- (e) Performance Gap Recovered (%) - Chess Puzzles\n- (f) Performance Gap Recovered (%) - ChatGPT Reward Modeling\n\nSo Figure 3 has 6 subfigures labeled (a) through (f).\n\nAnswer: 6", "citations": [], "input_tokens": 89532, "output_tokens": 204, "cost_micros": 271656, "latency_ms": 8046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q004", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and count its subfigures.Let me search for Figure 3 in the document:Now let me search for where Figure 3 appears in the document:Good! I found Figure 3. Let me read the complete caption around line 975:Perfect! I can see from the caption that Figure 3 has subfigures labeled (a), (b), (c), (d), (e), and (f). Let me continue reading to confirm all subfigures:Perfect! I can now clearly see all the subfigures in Figure 3. Based on the caption and the content, Figure 3 contains:\n\n- (a) NLP Tasks - test accuracy\n- (b) Chess Puzzles - test accuracy  \n- (c) ChatGPT Reward Modeling - test accuracy\n- (d) NLP Tasks - performance gap recovered\n- (e) Chess Puzzles - performance gap recovered\n- (f) ChatGPT Reward Modeling - performance gap recovered\n\nAnswer: 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 40671, "error": null, "extra": {"thread_id": 2038, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_131c7eb21d53472097ae34a0c3413951", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 191, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to find all sections that discuss the experiment setup in this paper about weak-to-strong generalization.\n\nLet me go through the document systematically:\n\n1. **Section 3: METHODOLOGY** - This section describes the general methodology and setup for weak-to-strong learning experiments.\n\n2. **Section 4.1: TASKS** - This describes the three main task settings: NLP benchmarks, chess puzzles, and ChatGPT reward modeling.\n\n3. **Appendix A: FURTHER EXPERIMENTAL DETAILS** - This entire appendix provides detailed experimental setup information:\n   - A.1: NLP TASKS - Details about data preprocessing, models, training hyperparameters, weak labels, and evaluation\n   - A.2: CHESS PUZZLES - Details about data preprocessing, training hyperparameters, weak labels, and evaluation\n   - A.3: CHATGPT REWARD MODELING - Details about data preprocessing, models, training hyperparameters, and weak labels\n   - A.4: AUXILIARY CONFIDENCE LOSS - Details about the confidence loss method\n\n4. **Appendix C: EASY-TO-HARD GENERALIZATION** - Describes additional experimental setups:\n   - C.1: CHESS PUZZLES\n   - C.2: NLP TASKS: DIFFICULTY THRESHOLDING\n   - C.3: GPT-4 PREDICTED DIFFICULTY\n\n5. **Appendix D: OTHER WEAK-TO-STRONG SETTINGS** - Describes additional experimental settings:\n   - D.1: SELF-SUPERVISED VISION MODELS\n   - D.2: LINEAR PROBING\n\n6. **Appendix E: THE EFFECTS OF WEAK LABEL STRUCTURE** - Describes synthetic experiments:\n   - E.1: SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY\n   - E.2: DIFFERENT WEAK ERROR STRUCTURE MEANS DIFFERENT GENERALIZATION\n   - E.3: MAKING IMITATION TRIVIAL\n\nAnswer: Section 3 (Methodology), Section 4.1 (Tasks), Appendix A (Further Experimental Details), Appendix A.1 (NLP Tasks), Appendix A.2 (Chess Puzzles), Appendix A.3 (ChatGP", "citations": [], "input_tokens": 121905, "output_tokens": 512, "cost_micros": 373395, "latency_ms": 49562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "appendix 1 nlp tasks, appendix 2 chess puzzles, appendix 3 chatgp, appendix further experimental details, section 3 methodology, section 4 1 tasks", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify all sections that discuss the experiment setup in this document.\n\nLet me go through the document systematically:\n\n1. **Section 3 METHODOLOGY** - This section describes the overall methodology and experimental setup for weak-to-strong learning.\n\n2. **Section 4.1 TASKS** - This describes the tasks used in experiments (NLP benchmarks, Chess puzzles, ChatGPT reward modeling).\n\n3. **Appendix A FURTHER EXPERIMENTAL DETAILS** - This entire appendix is dedicated to experimental details.\n\n4. **Appendix A.1 NLP TASKS** - Details about NLP task setup including data preprocessing, models, training hyperparameters, weak labels, and evaluation.\n\n5. **Appendix A.2 CHESS PUZZLES** - Details about chess puzzle setup including data preprocessing, training hyperparameters, weak labels, and evaluation.\n\n6. **Appendix A.3 CHATGPT REWARD MODELING** - Details about reward modeling setup including data preprocessing, models, training hyperparameters, weak labels.\n\n7. **Appendix A.4 AUXILIARY CONFIDENCE LOSS** - Detailed description of the auxiliary confidence loss method.\n\n8. **Appendix C EASY-TO-HARD GENERALIZATION** - Discusses experimental setup for easy-to-hard generalization experiments.\n\n9. **Appendix C.1 CHESS PUZZLES** - Setup for easy-to-hard generalization on chess.\n\n10. **Appendix C.2 NLP TASKS: DIFFICULTY THRESHOLDING** - Setup for difficulty-based experiments on NLP.\n\n11. **Appendix C.3 GPT-4 PREDICTED DIFFICULTY** - Setup using GPT-4 predicted difficulty.\n\n12. **Appendix D OTHER WEAK-TO-STRONG SETTINGS** - Discusses alternative experimental settings.\n\n13. **Appendix D.1 SELF-SUPERVISED VISION MODELS** - Setup for vision model experiments.\n\n14. **Appendix D.2 LINEAR PROBING** - Setup for linear probing experiments.\n\n15. **Appendix E THE EFFECTS OF WEAK LABEL STRUCTURE** - Discusses experimental setup for studying weak label structure.\n\n16. **", "citations": [], "input_tokens": 45448, "output_tokens": 512, "cost_micros": 144024, "latency_ms": 10405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "16", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify all sections that discuss the experiment setup from the document.\n\nLet me go through the document systematically:\n\n1. **Section 3: METHODOLOGY** - This section describes the overall methodology and setup for the weak-to-strong learning experiments.\n\n2. **Section 4.1: TASKS** - This describes the specific tasks used in experiments (NLP benchmarks, chess puzzles, ChatGPT reward modeling).\n\n3. **Appendix A: FURTHER EXPERIMENTAL DETAILS** - This entire appendix section provides detailed experimental setup information, including:\n   - A.1 NLP TASKS\n   - A.2 CHESS PUZZLES\n   - A.3 CHATGPT REWARD MODELING\n   - A.4 AUXILIARY CONFIDENCE LOSS\n\n4. **Appendix C: EASY-TO-HARD GENERALIZATION** - Contains experimental setup for easy-to-hard generalization experiments, including:\n   - C.1 CHESS PUZZLES\n   - C.2 NLP TASKS: DIFFICULTY THRESHOLDING\n   - C.3 GPT-4 PREDICTED DIFFICULTY\n\n5. **Appendix D: OTHER WEAK-TO-STRONG SETTINGS** - Describes setup for additional experiments:\n   - D.1 SELF-SUPERVISED VISION MODELS\n   - D.2 LINEAR PROBING\n\n6. **Appendix E: THE EFFECTS OF WEAK LABEL STRUCTURE** - Contains experimental setup for studying weak label structures:\n   - E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY\n   - E.2 DIFFERENT WEAK ERROR STRUCTURE MEANS DIFFERENT GENERALIZATION\n   - E.3 MAKING IMITATION TRIVIAL\n\nAnswer: Section 3, Section 4.1, Appendix A, Appendix A.1, Appendix A.2, Appendix A.3, Appendix A.4, Appendix C.1, Appendix C.2, Appendix C.3, Appendix D.1, Appendix D.2, Appendix E.1, Appendix E.2, Appendix E.3", "citations": [], "input_tokens": 49221, "output_tokens": 499, "cost_micros": 155148, "latency_ms": 10391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.2105263157894737, "method": "list_set", "normalised_pred": "appendix, appendix 1, appendix 2, appendix 3, appendix 4, appendix c 1, appendix c 2, appendix c 3, appendix d 1, appendix d 2, appendix e 1, appendix e 2, appendix e 3, section 3, section 4 1", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify all sections that discuss the experimental setup from the document.\n\nLet me scan through the document systematically:\n\n1. **Section 3 METHODOLOGY** - This section describes the core experimental methodology for weak-to-strong learning, including creating weak supervisors, training strong students, and training with ground truth labels.\n\n2. **Section 4.1 TASKS** - This describes the specific tasks used in experiments: NLP benchmarks, chess puzzles, and ChatGPT reward modeling.\n\n3. **Appendix A FURTHER EXPERIMENTAL DETAILS** - This entire appendix provides detailed experimental setup information, including:\n   - A.1 NLP TASKS (data preprocessing, models, training hyperparameters, weak labels, evaluation)\n   - A.2 CHESS PUZZLES (data preprocessing, training hyperparameters, weak labels, evaluation)\n   - A.3 CHATGPT REWARD MODELING (data preprocessing, models, training hyperparameters, weak labels, evaluation)\n   - A.4 AUXILIARY CONFIDENCE LOSS (detailed description of the method)\n\n4. **Appendix D OTHER WEAK-TO-STRONG SETTINGS** - This discusses additional experimental settings:\n   - D.1 SELF-SUPERVISED VISION MODELS\n   - D.2 LINEAR PROBING\n\n5. **Appendix C** mentions experimental setup for easy-to-hard generalization experiments.\n\n6. **Appendix E** discusses experimental setup for synthetic experiments on weak label structure.\n\nAnswer: Section 3 METHODOLOGY, Section 4.1 TASKS, Appendix A FURTHER EXPERIMENTAL DETAILS, Appendix A.1 NLP TASKS, Appendix A.2 CHESS PUZZLES, Appendix A.3 CHATGPT REWARD MODELING, Appendix A.4 AUXILIARY CONFIDENCE LOSS, Appendix D OTHER WEAK-TO-STRONG SETTINGS, Appendix D.1 SELF-SUPERVISED VISION MODELS, Appendix D.2 LINEAR PROBING", "citations": [], "input_tokens": 47590, "output_tokens": 470, "cost_micros": 149820, "latency_ms": 10702, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "appendix 1 nlp tasks, appendix 2 chess puzzles, appendix 3 chatgpt reward modeling, appendix 4 auxiliary confidence loss, appendix d 1 self supervised vision models, appendix d 2 linear probing, appendix d other weak to strong settings, appendix further experimental details, section 3 methodology, section 4 1 tasks", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for sections that discuss experiment setup:\n\n1. **Section 3 METHODOLOGY** - Describes the general weak-to-strong learning setup\n2. **Section 4.1 TASKS** - Describes the three task types (NLP, Chess, Reward Modeling)\n3. **Appendix A FURTHER EXPERIMENTAL DETAILS** - Contains detailed experimental setup information with subsections:\n   - A.1 NLP TASKS\n   - A.2 CHESS PUZZLES\n   - A.3 CHATGPT REWARD MODELING\n   - A.4 AUXILIARY CONFIDENCE LOSS\n4. **Appendix D.1 SELF-SUPERVISED VISION MODELS** - Describes ImageNet experiment setup\n5. **Appendix D.2 LINEAR PROBING** - Describes linear probing experiment setup\n6. **Appendix C EASY-TO-HARD GENERALIZATION** - Describes setup for easy-to-hard experiments with subsections:\n   - C.1 CHESS PUZZLES\n   - C.2 NLP TASKS: DIFFICULTY THRESHOLDING\n   - C.3 GPT-4 PREDICTED DIFFICULTY\n7. **Appendix E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY** - Describes synthetic experiment setup\n\nAnswer: Section 3 METHODOLOGY, Section 4.1 TASKS, Appendix A FURTHER EXPERIMENTAL DETAILS, Appendix A.1 NLP TASKS, Appendix A.2 CHESS PUZZLES, Appendix A.3 CHATGPT REWARD MODELING, Appendix A.4 AUXILIARY CONFIDENCE LOSS, Appendix C EASY-TO-HARD GENERALIZATION, Appendix C.1 CHESS PUZZLES, Appendix C.2 NLP TASKS: DIFFICULTY THRESHOLDING, Appendix C.3 GPT-4 PREDICTED DIFFICULTY, Appendix D.1 SELF-SUPERVISED VISION MODELS, Appendix D.2 LINEAR PROBING, Appendix E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY", "citations": [], "input_tokens": 89544, "output_tokens": 499, "cost_micros": 276117, "latency_ms": 10375, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "appendix 1 nlp tasks, appendix 2 chess puzzles, appendix 3 chatgpt reward modeling, appendix 4 auxiliary confidence loss, appendix c 1 chess puzzles, appendix c 2 nlp tasks difficulty thresholding, appendix c 3 gpt 4 predicted difficulty, appendix c easy to hard generalization, appendix d 1 self supervised vision models, appendix d 2 linear probing, appendix e 1 synthetic experiments on simulation difficulty, appendix further experimental details, section 3 methodology, section 4 1 tasks", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q005", "doc_id": "2312.09390v1.pdf", "answer_format": "list", "gold": "['Section 4.1', 'Section 4.2', 'Section 4.3', 'Appendix A']", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to identify all sections that discuss the experiment setup.Let me continue reading to see the complete document structure and identify all sections related to experiment setup.Now let me search specifically for \"experiment\" or \"setup\" to find sections discussing experimental setup:Now let me search more systematically for sections that discuss experimental setup:Based on my reading of the document, I can now identify all the sections that discuss experiment setup. Let me compile the complete list:Now I have a comprehensive view of the document. Let me identify all sections that discuss experiment setup:\n\nAnswer: 3 METHODOLOGY, 4.1 TASKS, A FURTHER EXPERIMENTAL DETAILS, A.1 NLP TASKS, A.2 CHESS PUZZLES, A.3 CHATGPT REWARD MODELING, A.4 AUXILIARY CONFIDENCE LOSS, E.1 SYNTHETIC EXPERIMENTS ON SIMULATION DIFFICULTY", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 69733, "error": null, "extra": {"thread_id": 2039, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_bfc57bfcbbd44c6786a0cd5058f61c15", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 309, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "1 nlp tasks, 2 chess puzzles, 3 chatgpt reward modeling, 3 methodology, 4 1 tasks, 4 auxiliary confidence loss, e 1 synthetic experiments on simulation difficulty, further experimental details", "normalised_gold": "appendix, section 4 1, section 4 2, section 4 3"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the citation information for the paper by Long Ouyang published in 2022.\n\nLooking through the references section, I can find:\n\nLong Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, 35: 27730\u201327744, 2022.\n\nHowever, the document does not contain information about how many times this paper has been cited. The document is the paper itself (\"Weak-to-Strong Generalization\") which cites the Ouyang et al. 2022 paper, but it does not provide citation counts for the references it lists.\n\nI cannot determine the number of citations for the Ouyang et al. 2022 paper from this document alone.\n\nAnswer: Cannot be determined from the provided document", "citations": [], "input_tokens": 121904, "output_tokens": 231, "cost_micros": 369177, "latency_ms": 39750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find citations of the paper by Long Ouyang published in 2022.\n\nLooking through the document, I can find references to \"Ouyang et al., 2022\" in the references section:\n\n\"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, 35: 27730-27744, 2022.\"\n\nNow I need to count how many times this paper is cited throughout the document. Let me search for \"Ouyang et al., 2022\":\n\n1. Page 1: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n2. Page 2: \"Christiano et al., 2017; Wu et al., 2021; Ouyang et al., 2022; Bowman et al., 2022; Saunders et al., 2022\"\n3. Page 5: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n4. Page 7: \"(Ouyang et al., 2022)\"\n5. Page 32: \"Ouyang et al., 2022\"\n6. Page 47: \"Ouyang et al., 2022\"\n\nTotal count: 6 citations\n\nAnswer: 6", "citations": [], "input_tokens": 45447, "output_tokens": 423, "cost_micros": 142686, "latency_ms": 10109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find how many times the paper by Long Ouyang published in 2022 has been cited in this document.\n\nLet me search for citations to Long Ouyang 2022 in the references and throughout the document.\n\nLooking at the references section, I can find:\n\n\"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, 35: 27730-27744, 2022.\"\n\nNow I need to count how many times \"Ouyang et al., 2022\" or similar citations appear in the document.\n\nSearching through the document:\n\n1. Page 1: \"Ouyang et al., 2022\"\n2. Page 2: \"Ouyang et al., 2022\"\n3. Page 5: \"Ouyang et al., 2022\"\n4. Page 7: \"Ouyang et al., 2022\"\n5. Page 32: \"Ouyang et al., 2022\"\n6. Page 47: \"Ouyang et al., 2022\"\n\nLet me verify by searching more carefully through the text for all instances of \"Ouyang\":\n\n- Introduction section: 1 citation\n- Related Work section: 1 citation  \n- Tasks section: 1 citation\n- ChatGPT reward modeling subsection: 1 citation\n- Models subsection (A.3): 1 citation\n- Alignment plan section: 1 citation\n\nTotal count: 6 citations\n\nAnswer: 6", "citations": [], "input_tokens": 49220, "output_tokens": 405, "cost_micros": 153735, "latency_ms": 8905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find how many times the paper by Long Ouyang published in 2022 has been cited in this document.\n\nLet me search for citations of Ouyang et al., 2022 or Long Ouyang, 2022.\n\nLooking through the document, I can find the following citations to \"Ouyang et al., 2022\":\n\n1. Page 1: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n2. Page 2: \"Wu et al., 2021; Ouyang et al., 2022; Bowman et al., 2022; Saunders et al., 2022\"\n3. Page 5: \"Christiano et al., 2017; Stiennon et al., 2020; Ouyang et al., 2022; Glaese et al., 2022; Bai et al., 2022a\"\n4. Page 7: \"(Ouyang et al., 2022)\"\n5. Page 32: \"(Stiennon et al., 2020; Ouyang et al., 2022)\"\n6. Page 47: \"(h - tiano et al., 2017; Ouyang et al., 2022)\"\n\nLet me count all instances:\n1. Page 1 - 1 citation\n2. Page 2 - 1 citation\n3. Page 5 - 1 citation\n4. Page 7 - 1 citation\n5. Page 32 - 1 citation\n6. Page 47 - 1 citation\n\nTotal: 6 citations\n\nAnswer: 6", "citations": [], "input_tokens": 47589, "output_tokens": 417, "cost_micros": 149022, "latency_ms": 7516, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find citations for the paper by Long Ouyang published in 2022.\n\nLooking through the document, I can find the reference:\n\nLong Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback. *Advances in Neural Information Processing Systems*, 35: 27730\u201327744, 2022.\n\nNow I need to count how many times this paper (Ouyang et al., 2022) is cited in the document.\n\nSearching through the document for \"Ouyang et al., 2022\":\n\n1. Page 1: \"Ouyang et al., 2022\"\n2. Page 2: \"Ouyang et al., 2022\"\n3. Page 5: \"Ouyang et al., 2022\"\n4. Page 7: \"Ouyang et al., 2022\"\n5. Page 32: \"Ouyang et al., 2022\"\n6. Page 47: \"Ouyang et al., 2022\"\n\nAnswer: 6", "citations": [], "input_tokens": 89543, "output_tokens": 289, "cost_micros": 272964, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q006", "doc_id": "2312.09390v1.pdf", "answer_format": "int", "gold": "6", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document that was flagged as user-mentioned to find information about Long Ouyang's 2022 paper and its citation count.Let me search for \"Long Ouyang\" and \"2022\" in this document, and look for citation information.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 25655, "error": null, "extra": {"thread_id": 2040, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d3a50787f8a940c590bfec1b692a8eb3", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 70, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2022", "normalised_gold": "6"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "native_pdf", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9 with its three subfigures (a), (b), and (c):\n\n- Subfigure (a) shows that for ground truth labels, **finetuning** (solid lines) performs best across all model sizes, reaching approximately 95% accuracy for the largest models.\n\n- Subfigure (b) shows weak-to-strong performance with weak labels, where **fewshot weak labels (n=5)** (the method shown with crosses/\u00d7) performs best for the largest student models.\n\n- Subfigure (c) directly compares methods and shows that **finetune aux. loss** (solid line with triangles) achieves the highest performance, reaching approximately 90% accuracy for the largest models.\n\nOverall, across the figure, **finetune aux. loss** appears to be the best method when considering weak-to-strong learning performance.\n\nAnswer: finetune aux. loss", "citations": [], "input_tokens": 121910, "output_tokens": 207, "cost_micros": 368835, "latency_ms": 39203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.09390v1.pdf"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "str_norm", "normalised_pred": "finetune aux loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "azure_basic_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9, which shows three subfigures (a), (b), and (c) comparing different methods:\n\n- Subfigure (a) compares zero-shot, 5-shot ground truth, and finetune ground truth\n- Subfigure (b) compares zero-shot, 5-shot ground truth, and 5-shot with weak labels  \n- Subfigure (c) compares 5-shot with weak labels, naive finetuning, and finetuning with auxiliary confidence loss\n\nAcross all three subfigures, examining the performance curves for the largest strong student models (rightmost side of each plot), the method that achieves the highest test accuracy is **finetuning with the auxiliary confidence loss** (shown with triangles in subfigure c).\n\nAnswer: Finetuning with auxiliary confidence loss", "citations": [], "input_tokens": 45453, "output_tokens": 193, "cost_micros": 139254, "latency_ms": 6625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with auxiliary confidence loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "azure_premium_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9, which shows three subfigures (a), (b), and (c) comparing different methods across model sizes:\n\nThe figure compares:\n- Zero-shot\n- Few-shot with ground truth (n=5)\n- Finetune with ground truth\n- Few-shot with weak labels (n=5)\n- Finetune (baseline)\n- Finetune with auxiliary loss\n\nFrom the plots, particularly subfigure (c) which directly compares weak-to-strong performance, the method that consistently achieves the highest test accuracy across strong student compute sizes is the finetuning with auxiliary confidence loss (shown with solid line and triangles).\n\nAnswer: finetune with aux. loss", "citations": [], "input_tokens": 49226, "output_tokens": 163, "cost_micros": 150123, "latency_ms": 7094, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "str_norm", "normalised_pred": "finetune with aux loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "llamacloud_basic_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9, which contains three subfigures (a), (b), and (c), I need to identify the six methods shown and determine which performs best.\n\nFrom the figure:\n- Subfigure (a) shows: zero-shot, 5-shot with ground truth (g.t.), and finetuning with ground truth (g.t.)\n- Subfigure (b) shows: zero-shot, 5-shot with ground truth, and 5-shot with weak labels\n- Subfigure (c) shows: 5-shot prompting (weak-to-strong), naive finetuning, and finetuning with auxiliary confidence loss\n\nThe six methods are:\n1. Zero-shot\n2. 5-shot with ground truth\n3. Finetuning with ground truth\n4. 5-shot with weak labels\n5. Naive finetuning (weak-to-strong)\n6. Finetuning with auxiliary confidence loss\n\nLooking at subfigure (c), which directly compares three weak-to-strong methods, the finetuning with auxiliary confidence loss (shown with solid line and triangles) achieves the highest performance, especially for larger model sizes.\n\nAmong all methods shown across the three subfigures, finetuning with ground truth labels would naturally perform best, but among the weak-to-strong methods (which is the focus of subfigure c), the auxiliary confidence loss performs best.\n\nAnswer: finetuning with auxiliary confidence loss", "citations": [], "input_tokens": 47595, "output_tokens": 339, "cost_micros": 147870, "latency_ms": 8719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with auxiliary confidence loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "llamacloud_premium_lc", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 9 with its three subfigures (a), (b), and (c):\n\nIn subfigure (a), comparing zero-shot, 5-shot with ground truth, and finetuning with ground truth, **finetuning with ground truth** achieves the highest test accuracy.\n\nIn subfigure (b), which shows 5-shot prompting with weak labels from different weak supervisors, the best performing is **5-shot with weak labels from the largest weak supervisor (0.1)**.\n\nIn subfigure (c), which compares 5-shot weak labels, naive finetuning, and finetuning with auxiliary loss, **finetuning with auxiliary loss** achieves the highest performance.\n\nOverall, across all methods shown in Figure 9, the best performing method is:\n\nAnswer: finetuning with auxiliary loss", "citations": [], "input_tokens": 89549, "output_tokens": 194, "cost_micros": 271557, "latency_ms": 8141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with auxiliary loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.09390v1.pdf::Q007", "doc_id": "2312.09390v1.pdf", "answer_format": "str", "gold": "finetune g.t.", "pages": 49, "document_id": 7595, "arm": "surfsense_agentic", "question_id": "2312.09390v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find Figure 9 and identify which method performs best among the six shown.Let me search for Figure 9 in the document:Let me read the section containing Figure 9 to see the actual figure and understand which methods are being compared:Perfect! Now I can see Figure 9 with three subfigures (a), (b), and (c). Let me read a bit more to understand the comparison better:Based on my reading of the document, Figure 9 contains three subfigures (a, b, c) that compare six different methods:\n\nFrom the figure caption and description:\n- **Panel (a)**: zero-shot, 5-shot with ground truth, finetune with ground truth\n- **Panel (b)**: 5-shot with weak labels (multiple supervisor sizes), zero-shot, 5-shot with ground truth\n- **Panel (c)**: 5-shot weak-to-strong prompting, naive finetuning (weak-to-strong), and **finetuning with the confidence loss (auxiliary loss)**\n\nThe text at line 2078-2079 states: \"However, **weak-to-strong finetuning with the confidence loss still generally outperforms** weak-to-strong few-shot prompting.\"\n\nAnd the figure caption at line 1942-1944 states: \"Few-shot weak-to-strong performance becomes competitive with or outperforms finetuning for the largest strong students, though **finetuning with the confidence loss does better**.\"\n\nThe six methods being compared across the three subfigures are:\n1. Zero-shot\n2. 5-shot with ground truth\n3. Finetune with ground truth\n4. 5-shot with weak labels\n5. Naive finetuning (weak-to-strong)\n6. Finetuning with auxiliary/confidence loss\n\nAnswer: Finetuning with the confidence loss (auxiliary loss)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47672, "error": null, "extra": {"thread_id": 2041, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_4655e497891e45598d32be191f19ac1d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 260, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "finetuning with confidence loss auxiliary loss", "normalised_gold": "finetune g t"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:52Z", "latency_ms": 23593, "error": null, "raw_text_chars": 1091}]}, "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the figure with a tree shape and identify which branch has the least leaves.\n\nLooking through the document, I can see Figure 1 on page 2, which shows a \"Technology tree of RAG research\" with a tree-like structure.\n\nExamining this tree diagram, I can see it has a trunk labeled \"Retrieval-Augmented Generation\" and branches extending upward. The main branches appear to be:\n\n1. A left branch (colored in teal/green) showing the evolution from pre-training through fine-tuning stages\n2. A middle branch (colored in orange) labeled \"Instructlens\" \n3. A right branch (colored in blue) showing the inference stage with multiple methods\n\nLooking at the number of \"leaves\" (individual methods/papers) on each branch:\n- The left branch (Pre-training/Fine-tuning) has several leaves\n- The middle branch (Instructlens) appears to have the fewest leaves - it's relatively short with minimal branching\n- The right branch (Inference) has many leaves with extensive branching\n\nThe branch labeled \"Instructlens\" in the middle appears to have the least number of leaves.\n\nAnswer: Instructlens", "citations": [], "input_tokens": 64633, "output_tokens": 269, "cost_micros": 197934, "latency_ms": 23577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "instructlens", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the tree-shaped figure in the document to identify which branch has the least leaves.\n\nLooking at Figure 1 (the technology tree of RAG research), I can see it's organized by year (2020, 2023, 2024) and has three main branches at the top labeled:\n- \"Augmentation Stage\"\n- \"Fine-tuning\" \n- \"Pre-training\"\n\nLet me count the leaves (methods/papers) in each branch:\n\n**Pre-training branch:**\n- 2020: REALM (implied from context)\n- 2022: Retro, CoG, SANTA, TIGER\n- 2023-2024: RAVEN, Retro++, InstructRetro\n\nThis appears to have around 7-8 leaves.\n\n**Fine-tuning branch:**\nThis has many methods spread across 2023-2024, including PROMPTAGATOR, LM-Indexer, various methods - appears to have 15+ leaves.\n\n**Inference branch:**\nThis has the most methods, with numerous entries across 2023-2024 - appears to have 30+ leaves.\n\nLooking more carefully at the tree structure, the \"Pre-training\" branch (on the left side of the tree) has the fewest methods/leaves compared to the other branches.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 33376, "output_tokens": 305, "cost_micros": 104703, "latency_ms": 9515, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 1 (the Technology tree of RAG research), I need to count the number of methods/papers shown in each branch:\n\n- **Pre-training branch** (bottom): Contains methods like TIGER, SANTA, Atlas, RAVEN, RETRO++, INSTRUCTRETRO, CoG, EAR, KALMV - approximately 9-10 items\n\n- **Fine-tuning branch** (middle): Contains methods like G-Retriever, RADA, CT-RAG, HyKGE, FILCO, InstructRetro, RA-DIT, Self-RAG, FLARE, and many others - this is the largest branch with approximately 25+ items\n\n- **Inference branch** (top): Contains methods like RAPTOR, UniMS-RAG, CRAG, BGM, IAG, BEQ, ARM-RAG, and several others - approximately 15-20 items\n\nThe Pre-training branch clearly has the fewest methods/leaves compared to the other two branches.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 40683, "output_tokens": 242, "cost_micros": 125679, "latency_ms": 8514, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to look at the figure in the document that has a tree shape and identify which branch has the least leaves.\n\nLooking at the document, I can see Figure 1, which is described as a \"Technology tree of RAG research.\" This figure shows three main stages: Pre-training, Fine-tuning, and Inference, with various methods listed under each branch.\n\nLet me count the items in each branch:\n\n**Pre-training branch:**\n- CoG, RAVEN, Atlas, RAG, RETRO++, INSTRUCTRETRO, TIGER, SANTA (and a few more visible items)\n- This appears to have around 8-10 items\n\n**Fine-tuning branch:**\n- UniMS-RAG, Self-RAG, MK-ToD, QLM-Doc-ranking, SKR, SANTA, IRCOT, TIGER, Self-Mem, SCM4LLMs (and more)\n- This appears to have the most items, around 15-20+\n\n**Inference branch:**\n- CRAG, CT-RAG, EARCoN, CREA-ICL, PRCA, BEQ, ARM-RAG, PROMPTAGATOR, ZemiR-GQA (and many more)\n- This also has many items\n\nLooking at the figure more carefully, the **Pre-training** branch appears to have the fewest items/leaves compared to the Fine-tuning and Inference branches.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 30919, "output_tokens": 336, "cost_micros": 97797, "latency_ms": 10046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine the tree-shaped figure in the document to identify which branch has the least leaves (items).\n\nLooking at Figure 1 titled \"Technology Tree of RAG research,\" I can see it's organized into three main columns representing different stages:\n- Pre-training\n- Fine-tuning  \n- Inference\n\nLet me count the items in each branch/column:\n\n**Pre-training column:** Contains items like:\n- G-Retriever, CRAG, IAG, FILCO, ARM-RAG, RADA, RAPTOR, BGM, HyKGE, DRAGON-AI, PaperQA, CREA-ICL\n- PRCA, Token-Elimination, KALMV, ITRG\n- 1-PAGER, FABULA, SKR, ToC, QLM-Doc-ranking, Recomp\n- KGP, LLM-R, ITER-RETGEN, COQ, LLM-IE, ICRALM, DSP\n- KnowledGPT, IRCOT, PGRA, PKG, SCM4LLMs, Filter-Reranker, RePLUG\n- CoG, TIGER, Retro++\n- GenRead\n- (empty row)\n- RECITE\n\n**Fine-tuning column:** Contains items like:\n- UniMS-RAG, CT-RAG, BEQ, CoN, EAR\n- RAST, Self-RAG, LM-Indexer, RA-DIT, Dual-Feedback-ToD, MK-ToD, RAG_Robust, Retrieve-and-Sample\n- InstructRetro\n- RAVEN\n- SANTA, RRR, Self-Mem, UPRISE, R-GQA, Zemi\n- SUGRE, AAR, FLARE\n- PROMPTAGATOR, RAG-e2e\n- Atlas\n- RAG\n\n**Inference column:** This appears to be the largest with many items distributed across multiple years.\n\nComparing the three branches, the **Fine-tuning** column appears to have fewer distinct method names than the other two columns, but I need to count more carefully", "citations": [], "input_tokens": 37501, "output_tokens": 512, "cost_micros": 120183, "latency_ms": 18250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "comparing three branches fine tuning column appears to have fewer distinct method names than other two columns but i need to count more carefully", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find a figure with a tree shape and identify the branch with the least leaves.Let me search for references to figures with tree shapes in the document:Let me read through the document to find figures. I'll start by looking at the actual content:The document mentions a \"technology tree\" in Figure 1. Let me look for Figure 1:Let me search for where figures are actually displayed in the document:Good! There are embedded images. Let me read around Im1.png which should be Figure 1 (the technology tree):This looks like it's the OCR text from the tree figure. Let me continue reading to see all the branches:Perfect! I found the tree-shaped figure. It's a tree diagram with three main branches:\n1. Fine-tuning (green, left branch)\n2. Pre-training (orange/gold, middle branch) \n3. Inference (blue-green, right branch)\n\nLet me continue reading the visual description to see which branch has the least leaves (papers/items):Perfect! The visual description clearly states:\n\n- **Fine-tuning branch**: Most extensive with numerous sub-branches and approaches\n- **Pre-training branch**: Narrowest with fewer approaches\n- **Inference branch**: Dense with many recent (2024) developments\n\nThe Pre-training branch is described as the \"narrowest with fewer approaches,\" meaning it has the least leaves.\n\nAnswer: Pre-training", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 62969, "error": null, "extra": {"thread_id": 2042, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_bc29640067d14f5eb3f9f7d53e8522d7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 357, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pre training", "normalised_gold": "pre training"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3 on page 4 of the document, which shows \"Comparison between the three paradigms of RAG,\" I can see three main sections representing Naive RAG, Advanced RAG, and Modular RAG.\n\nLet me identify all the distinct icons/shapes used:\n\n1. User icon (person silhouette)\n2. Query icon (document/text box)\n3. Documents icon (stacked documents)\n4. Indexing box\n5. Retrieval box\n6. Prompt box (document icon)\n7. Frozen LLM (snowflake icon with text)\n8. Output box\n9. Pre-Retrieval box (with sub-elements like \"Query Rewriting\", \"Query Expansion\", \"Query Enhancement\")\n10. Post-Retrieval box (with sub-elements like \"Re-rank\", \"Memory\", \"Fusion\")\n11. Various module boxes in the Modular RAG section (Routing, Search, Predict, Retrieve, Rewrite, RAG, Rerank, Read, Demonstrate, Fusion, Memory)\n12. Different pattern sections showing Rewrite, Retrieve, Read combinations\n\nCounting the distinct icon types (not individual instances):\n- User icon (person)\n- Query icon\n- Documents icon\n- Standard rectangular boxes (for processes)\n- Snowflake icon (for Frozen LLM)\n- Document/page icon (for Prompt)\n\nLooking more carefully at the distinct visual elements:\n1. Person/user icon\n2. Text/query box\n3. Documents stack icon\n4. Process boxes (rectangles)\n5. Snowflake (frozen LLM)\n6. Document icon\n\nAnswer: 6", "citations": [], "input_tokens": 64616, "output_tokens": 404, "cost_micros": 199908, "latency_ms": 15077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3, which shows \"Comparison between the three paradigms of RAG,\" I can see:\n\n1. **User Query icon** - represented by a person/user symbol\n2. **Documents icon** - represented by stacked document/paper symbols\n3. **Indexing icon** - appears to show database/storage symbols\n4. **Retrieval icon** - shown with magnifying glass or search symbols\n5. **Read icon** - shown as a document being read\n6. **Predict icon** - shown with a specific symbol\n7. **Output icon** - shown at the end of processes\n8. **Prompt icon** - shown in the middle paradigm\n9. **Frozen LLM icon** - represented by a brain or model symbol\n10. **Rerank icon** - shown in Advanced RAG\n11. **Rewrite icon** - shown in Modular RAG\n12. **Demonstrate icon** - shown in Modular RAG\n13. **Routing icon** - shown in Modular RAG\n14. **Search icon** - shown in Modular RAG\n15. **Fusion icon** - shown in Modular RAG\n16. **Memory icon** - shown in Modular RAG\n\nLet me recount more carefully by looking at the actual distinct visual icons in the figure:\n\nFrom the figure, I can identify these distinct icon types:\n- User/person icon\n- Document/paper stack icon\n- Database/storage icon\n- Magnifying glass (search/retrieval)\n- Brain/model icon (for LLM)\n- Arrow/flow icons\n- Various process boxes with different labels\n\nCounting the truly distinct visual icons (not just labeled boxes):\n1. User icon\n2. Document icon\n3. Database/indexing icon\n4. Magnifying glass/search icon\n5. Brain/LLM icon\n6. Output/result icon\n\nAnswer: 6", "citations": [], "input_tokens": 33359, "output_tokens": 449, "cost_micros": 106812, "latency_ms": 10735, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "6", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3 titled \"Comparison between the three paradigms of RAG,\" I can see three diagrams showing Naive RAG, Advanced RAG, and Modular RAG.\n\nLet me identify each distinct icon type used across all three paradigms:\n\n1. **User** - person icon (shown in all three)\n2. **Query** - document/text icon (shown in all three)\n3. **Documents** - stack of documents icon (shown in all three)\n4. **Indexing** - cylindrical database icon (shown in Naive and Advanced RAG)\n5. **Retrieval** - magnifying glass/search icon (shown in all three)\n6. **Rewrite** - circular arrow icon (shown in Advanced RAG)\n7. **Rerank** - list with arrows icon (shown in Advanced RAG)\n8. **Prompt** - text/document icon (shown in all three)\n9. **Frozen LLM** - brain/processing icon (shown in all three)\n10. **Read** - book/reading icon (shown in all three)\n11. **Output** - result/output icon (shown in all three)\n12. **Modules (R)** - box with R (shown in Modular RAG)\n13. **Search** - magnifying glass (shown in Modular RAG)\n14. **Routing** - directional arrow icon (shown in Modular RAG)\n15. **Predict** - crystal ball/prediction icon (shown in Modular RAG)\n16. **Demonstrate** - pointing hand icon (shown in Modular RAG)\n17. **Memory** - storage icon (shown in Modular RAG)\n18. **Fusion** - merge icon (shown in Modular RAG)\n19. **Summary** - document summary icon (shown in Advanced RAG)\n20. **Judge** - decision diamond (shown in Modular RAG)\n21. **Generate** - generation icon (shown in Modular RAG)\n\nLet me recount more carefully by looking at the actual distinct icon shapes:\n\n1. User (person)\n2. Query (document)\n3. Documents (stack)", "citations": [], "input_tokens": 40666, "output_tokens": 512, "cost_micros": 129678, "latency_ms": 10766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 to count the distinct icons used.\n\nLooking at Figure 3, which shows \"Comparison between the three paradigms of RAG,\" I can see three sections (Left, Middle, Right) representing Naive RAG, Advanced RAG, and Modular RAG.\n\nLet me identify all the distinct icons/symbols used in the figure:\n\n1. \"User Query\" box (rectangular box with text)\n2. \"Indexing\" box\n3. \"Retrieval\" box\n4. \"Rerank\" box\n5. \"Rewrite\" box\n6. \"Fusion\" box\n7. \"Summary\" box\n8. \"Demonstrate\" box\n9. \"Read\" box\n10. \"Search\" box\n11. \"Retrieve\" box\n12. \"Memory\" box\n13. \"Predict\" box\n14. \"Routing\" box\n15. \"Documents\" box\n16. \"RAG\" box\n17. \"Output\" box\n18. \"Response\" box\n19. \"Judge\" box\n20. \"Generate\" box\n21. \"Transformation/Decomposition\" box\n22. \"Iterate\" box\n23. Arrow symbols (connecting the boxes)\n24. \"Max Times/Threshold\" decision point\n25. \"Max Depth (Tree)/Threshold\" decision point\n26. \"Generate Special Token/Threshold\" decision point\n27. \"N Times\" indicator\n\nLooking more carefully at the visual elements, the distinct icon types are:\n- Rectangular boxes (for processes/modules)\n- Arrows (for flow direction)\n- Diamond or decision point shapes (for conditional logic)\n\nCounting the distinct visual icon types (not the number of instances, but types):\n1. Rectangular box/module icon\n2. Arrow icon\n3. Diamond/decision point icon\n\nAnswer: 3", "citations": [], "input_tokens": 30902, "output_tokens": 415, "cost_micros": 98931, "latency_ms": 10655, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I need to carefully examine Figure 3 in the document to count the distinct icons used.\n\nLooking at Figure 3, which shows \"Comparison between the three paradigms of RAG,\" I can see three main diagrams:\n\n1. **Naive RAG (Left)**: Shows a flowchart with:\n   - User Query\n   - Documents\n   - Indexing\n   - Retrieval\n   - Prompt\n   - Frozen LLM\n   - Output\n\n2. **Advanced RAG (Middle)**: Shows:\n   - User Query\n   - Documents\n   - Indexing\n   - Pre-Retrieval (with sub-components: Query Routing, Query Rewriting, Query Expansion)\n   - Retrieval\n   - Post-Retrieval (with sub-components: Rerank, Summary, Fusion)\n   - Prompt\n   - Frozen LLM\n   - Output\n\n3. **Modular RAG (Right)**: Shows:\n   - Multiple modules (Routing, Search, Predict, Rewrite, RAG, Rerank, Retrieve, Read, Demonstrate, Fusion, Memory)\n   - Various patterns including Naive RAG, Advanced RAG, DSP, and ITER-RETGEN\n\nLooking at the visual representation, the distinct icon shapes/types I can identify are:\n1. Rectangular boxes (for most components)\n2. Rounded rectangles/boxes (for grouping/subgraphs)\n3. Arrows/lines (for connections)\n\nHowever, if we're counting distinct visual elements or node types based on their representation in the figure, the components are primarily represented as boxes/rectangles with text labels. The figure uses a consistent box/rectangle style for representing different components.\n\nBased on the mermaid diagram structure shown, all components are represented using similar rectangular node shapes, with the main distinction being in the grouping (subgraphs) and connections rather than distinct icon types.\n\nAnswer: 1", "citations": [], "input_tokens": 37484, "output_tokens": 444, "cost_micros": 119112, "latency_ms": 12312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "1", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q001", "doc_id": "2312.10997v5.pdf", "answer_format": "int", "gold": "9", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and count the distinct icons used.Let me search for \"Figure 3\" in the document:Now let me read around line 788 to find Figure 3:Let me continue reading to find Figure 3:I see Figure 2 but need to continue searching for Figure 3. Let me search more:Perfect! I found Figure 3. Let me continue reading to see the complete visual description:Perfect! I have found Figure 3 and its visual description. Let me carefully count the distinct icons mentioned in the visual description:\n\nFrom the visual description of Figure 3, I can identify the following distinct icons:\n\n1. **User/Query icon** - mentioned in Naive RAG and Advanced RAG\n2. **Documents icon** - mentioned in Naive RAG and Advanced RAG (stacked papers/documents)\n3. **Snowflake icon** - mentioned in Naive RAG and Advanced RAG (representing Frozen LLM)\n4. **Document icon** - mentioned in Naive RAG (at the Prompt stage)\n5. **Shuffle/rerank icon** - mentioned in Advanced RAG Post-Retrieval\n6. **Waveform/summary icon** - mentioned in Advanced RAG Post-Retrieval\n7. **Layers/fusion icon** - mentioned in Advanced RAG Post-Retrieval\n\nLet me recount more carefully by looking at what the visual description explicitly states:\n- User/Query icons (top of diagrams)\n- Documents icons (top of diagrams)\n- Document icon (at Prompt stage)\n- Snowflake icon (representing Frozen LLM)\n- Shuffle/rerank icon (in Post-Retrieval)\n- Waveform/summary icon (in Post-Retrieval)\n- Layers/fusion icon (in Post-Retrieval)\n\nThat gives us 7 distinct icons total.\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 51812, "error": null, "extra": {"thread_id": 2043, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c31c1d3df3774ac882e3f8a22f227910", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 283, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "9"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that has a retrieval granularity of \"Phrase\" in Table I.\n\nLooking at Table I (Summary of RAG methods) on page 6, I can see the columns: Method, Retrieval Source, Retrieval Data Type, Retrieval Granularity, Augmentation Stage, and Retrieval process.\n\nScanning the \"Retrieval Granularity\" column, I find one entry with \"Phrase\":\n- CoG [29] - which has retrieval granularity of \"Phrase\"\n\nNow I need to find the full title of reference [29]. Looking at the references section starting on page 17, reference [29] is:\n\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 64633, "output_tokens": 220, "cost_micros": 197199, "latency_ms": 34000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that uses \"phrase\" as its retrieval granularity.\n\nLooking at Table I \"SUMMARY OF RAG METHODS\", I can find the retrieval granularity column. Let me search for entries with \"Phrase\" as the retrieval granularity.\n\nFrom Table I, I can see:\n- CoG [29] has:\n  - Retrieval Source: Wikipedia\n  - Retrieval Data Type: Text\n  - Retrieval Granularity: Phrase\n  - Augmentation Stage: Pre-training\n  - Retrieval process: Iterative\n\nNow I need to find the full title of the CoG paper. Looking at the references section:\n\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 33376, "output_tokens": 222, "cost_micros": 103458, "latency_ms": 7266, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for methods with \"Phrase\" retrieval granularity in Table I...\n\nFrom Table I (SUMMARY OF RAG METHODS), I can see:\n- CoG [29] has retrieval granularity of \"Phrase\"\n\nLooking at the references section for reference [29]:\n\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 40683, "output_tokens": 133, "cost_micros": 124044, "latency_ms": 8407, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that proposes a method with phrase-level retrieval granularity.\n\nLooking at Table I (Summary of Retrieval Source and Retrieval Granularity Used in RAG Methods), I can find methods with \"Phrase\" as their retrieval granularity.\n\nFrom the table, I can see:\n- CoG [29] has retrieval granularity of \"Phrase\"\n\nNow I need to find the full title of the CoG paper from the references section.\n\nLooking at the references, reference [29] is:\n\"T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in The Eleventh International Conference on Learning Representations, 2022.\"\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 30919, "output_tokens": 187, "cost_micros": 95562, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the paper that proposes a method with retrieval granularity of \"Phrase\".\n\nLooking at Table I (Summary of RAG Methods), I can search for entries where \"Retrieval Granularity\" is \"Phrase\".\n\nScanning through Table I, I find:\n- CoG [29] has retrieval granularity: Phrase\n\nNow I need to find the full title of paper [29] from the references section.\n\nLooking at reference [29]:\n[29] T. Lan, D. Cai, Y. Wang, H. Huang, and X.-L. Mao, \"Copy is all you need,\" in *The Eleventh International Conference on Learning Representations*, 2022.\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 37501, "output_tokens": 174, "cost_micros": 115113, "latency_ms": 6453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q002", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "Copy is all you need.", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q002", "answer_letter": null, "raw_text": "I'll search for information about retrieval granularity and phrase-level methods in the provided documents.Let me continue reading to find the title and look for information about retrieval granularity:Good! I found the paper title: \"Retrieval-Augmented Generation for Large Language Models: A Survey\". Now I need to search for information about retrieval granularity and specifically which method has \"phrase\" as its retrieval granularity. Let me search for tables or sections that discuss retrieval granularity:Perfect! I found a table that mentions \"Phrase\" as a retrieval granularity. Let me read that section to find the complete information:Great! I can see that \"CoG [29]\" has \"Phrase\" as its retrieval granularity. Now I need to find the full title of the CoG paper. Let me look for the reference [29]:Perfect! I found reference [29]. Let me read that section to get the full title:I can see reference [29] starts at line 4090. Let me read a bit more to get the complete title:Perfect! I found the full title of reference [29], which is the paper that proposes the method (CoG) with phrase retrieval granularity.\n\nThe full title is: **\"Copy is all you need\"**\n\nAnswer: Copy is all you need", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 52827, "error": null, "extra": {"thread_id": 2044, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_27dfaf3e8bfa47e1b55fb4feceaed85a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 297, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "copy is all you need", "normalised_gold": "copy is all you need"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I'll search through Table II to find datasets that have exactly three methods listed.\n\nLooking at Table II (SUMMARY OF RAG METHODS), I need to count the methods for each dataset:\n\n- WebQ (Web Questions): [3], [4], [13], [30], [50], [68] - 6 methods\n- PopQA: [7], [25], [67] - 3 methods \u2713\n- MS MARCO: [4], [40], [52] - 3 methods \u2713\n- MuSiQue: [14], [51], [61], [91] - 4 methods\n- ASQA: [24], [57] - 2 methods\n- QMSum(QM): [60], [123] - 2 methods\n- Qasper: [60], [63] - 2 methods\n- COVID-QA: [35], [46] - 2 methods\n- CMB [128],MMCU_Medical [129]: [81] - 1 method\n- QuALITY: [60], [63] - 2 methods\n- ARC: [25], [67] - 2 methods\n- CommonsenseQA: [58], [66] - 2 methods\n- GraphQA: [84] - 1 method\n- Amazon(Toys,Sport,Beauty): [39], [40] - 2 methods\n- T-REx [141],ZsRE [142]: [27], [51] - 2 methods\n- HellaSwag: [20], [66] - 2 methods\n- CoT Reasoning: [27] - 1 method\n- CSQA: [55] - 1 method\n- StrategyQA: [14], [24], [48], [51], [55], [58] - 6 methods\n- XSum: [17] - 1 method\n- VioLens: [19] - 1 method\n- TREC: [33] - 1 method\n- SST-2: [20], [33], [38] - 3 methods \u2713\n- CodeSearchNet: [76] -", "citations": [], "input_tokens": 64631, "output_tokens": 512, "cost_micros": 201573, "latency_ms": 17125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "codesearchnet 76", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine Table II to find datasets that have exactly three methods listed.\n\nLet me go through Table II systematically:\n\n**QA - Single-hop:**\n- Natural Question (NQ): [3], [4], [22], [27], [40], [43], [54], [62], [71], [112] [20], [44], [72] [13], [30], [34], [45], [50], [64] - more than 3\n- TriviaQA (TQA): [4], [27], [59], [62], [112] [22], [25], [43], [44], [71], [72] [20], [23], [30], [32], [45], [69], [112] - more than 3\n- SQUAD: [3], [4], [13], [30], [50], [68] - 6 methods\n- Web Questions (WebQ): [4], [40], [52] - exactly 3 methods\n- PopQA: [4], [40], [52] - exactly 3 methods\n- MS MARCO: [117] - only 1 reference\n\n**QA - Multi-hop:**\n- HotpotQA: [7], [14], [22], [27], [59], [62], [69], [71], [91] - 9 methods\n- 2WikiMultiHopQA: [14], [51], [61], [91] - 4 methods\n- MuSiQue: [120] - only 1 reference\n\n**QA - Long-form:**\n- ELI5: [27], [34], [43], [49], [51] - 5 methods\n- NarrativeQA (NQA): [45], [60], [63], [123] - 4 methods\n- ASQA: [24], [57] - 2 methods\n- QMSum (QM): [60], [123] - 2 methods\n\n**QA - Domain:**\n- Qasper: [60], [63] - 2 methods\n- COVID-QA: [35], [46] - 2 methods\n- CMB: [", "citations": [], "input_tokens": 33374, "output_tokens": 512, "cost_micros": 107802, "latency_ms": 11546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "cmb", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to examine Table II to find datasets that have exactly three methods listed.\n\nLet me go through Table II systematically:\n\n**QA - Single-hop:**\n- Natural Question (NQ): More than 3 methods\n- TriviaQA (TQA): More than 3 methods\n- SQUAD: More than 3 methods\n- Web Questions (WebQ): [3], [4], [13], [30], [50], [68] - 6 methods\n- PopQA: [7], [25], [67] - **exactly 3 methods**\n- MS MARCO: [4], [40], [52] - **exactly 3 methods**\n\n**QA - Multi-hop:**\n- HotpotQA: More than 3 methods\n- 2WikiMultiHopQA: [14], [24], [48], [59], [61], [91] - 6 methods\n- MuSiQue: [14], [51], [61], [91] - 4 methods\n\n**QA - Long-form QA:**\n- ELI5: [27], [34], [43], [49], [51] - 5 methods\n- NarrativeQA (NQA): [45], [60], [63], [123] - 4 methods\n- ASQA: [24], [57] - 2 methods\n- QMSum (QM): [60], [123] - 2 methods\n\n**QA - Domain QA:**\n- Qasper: [60], [63] - 2 methods\n- COVID-QA: [35], [46] - 2 methods\n- CMB, MMCU_Medical: [81] - 1 method\n\n**QA - Multi-Choice QA:**\n- QUALITY: [60], [63] - 2 methods\n- ARC: [25], [67] - 2 methods\n- CommonsenseQA: [58], [66] - 2 methods\n\n**Dialog:**\n- Wizard of Wikipedia (WoW): [13], [27], [34], [42] - 4 methods\n- KBP: [74],", "citations": [], "input_tokens": 40681, "output_tokens": 512, "cost_micros": 129723, "latency_ms": 12062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "kbp 74", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to look at Table II and find datasets that have exactly three methods listed.\n\nLet me go through Table II systematically:\n\n1. Natural Question (NQ) - has 10 methods listed\n2. TriviaQA (TQA) - has 5 methods listed\n3. SQuAD - has 7 methods listed\n4. Web Questions (WebQ) - has 6 methods listed\n5. PopQA - has 3 methods: [7], [25], [67] \u2713\n6. MS MARCO - has 3 methods: [4], [40], [52] \u2713\n7. HotpotQA - has 8 methods listed\n8. 2WikiMultiHopQA - has 6 methods listed\n9. MuSiQue - has 4 methods listed\n10. ELI5 - has 5 methods listed\n11. NarrativeQA (NQA) - has 4 methods listed\n12. ASQA - has 2 methods listed\n13. QMSum (QM) - has 2 methods listed\n14. Qasper - has 2 methods listed\n15. COVID-QA - has 2 methods listed\n16. CMB, MMCU Medical - has 1 method listed\n17. QuALITY - has 2 methods listed\n18. ARC - has 2 methods listed\n19. CommonsenseQA - has 2 methods listed\n20. GraphQA - has 1 method listed\n21. Wizard of Wikipedia (WoW) - has 4 methods listed\n22. KBP - has 2 methods listed\n23. DuleMon - has 1 method listed\n24. CamRest - has 2 methods listed\n25. Amazon (Toys, Sport, Beauty) - has 2 methods listed\n26. WikiEvent - has 4 methods listed\n27. RAMS - has 2 methods listed\n28. T-REx, ZsRE - has 2 methods listed\n29. HellaSwag - has 2 methods listed\n30. CoT Reasoning - has 1 method listed", "citations": [], "input_tokens": 30917, "output_tokens": 512, "cost_micros": 100431, "latency_ms": 11141, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "30 cot reasoning has 1 method listed", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I need to find datasets in Table II that have exactly three methods listed.\n\nLet me go through Table II systematically and count the number of methods for each dataset:\n\n**QA - Single-hop:**\n- Natural Question (NQ): 12 methods\n- TriviaQA (TQA): 12 methods\n- SQuAD: 7 methods\n- Web Questions (WebQ): 6 methods\n- PopQA: 3 methods \u2713\n- MS MARCO: 3 methods \u2713\n\n**QA - Multi-hop:**\n- HotpotQA: 15 methods\n- 2WikiMultiHopQA: 6 methods\n- MuSiQue: 4 methods\n\n**QA - Long-form QA:**\n- ELI5: 5 methods\n- NarrativeQA (NQA): 4 methods\n- ASQA: 2 methods\n- QMSum (QM): 2 methods\n\n**QA - Domain QA:**\n- Qasper: 2 methods\n- COVID-QA: 2 methods\n- CMB, MMCU_Medical: 1 method\n\n**QA - Multi-Choice QA:**\n- QuALITY: 2 methods\n- ARC: 2 methods\n- CommonsenseQA: 2 methods\n\n**QA - Graph QA:**\n- GraphQA: 1 method\n\n**Dialog:**\n- Wizard of Wikipedia (WoW): 4 methods\n- KBP: 2 methods\n- DuleMon: 1 method\n- CamRest: 2 methods\n- Amazon (Toys, Sport, Beauty): 2 methods\n\n**IE:**\n- WikiEvent: 4 methods\n- RAMS: 2 methods\n- T-REx, ZsRE: 2 methods\n\n**Reasoning:**\n- HellaSwag: 2 methods\n- CoT Reasoning: 1 method\n- CSQA: 1 method\n\n**Others:**\n- MMLU: 7 methods\n- WikiText-103: 4 methods\n-", "citations": [], "input_tokens": 37499, "output_tokens": 512, "cost_micros": 120177, "latency_ms": 11500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q004", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['POPQA', 'MS MARCO', 'SST-2']", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find Table II and identify datasets with exactly three methods.Let me search for \"TABLE II\" in the document:Now let me read the section containing TABLE II:Now I can see Table II. Let me analyze it carefully to find datasets with exactly three methods. I need to count the number of method references for each dataset. Let me go through the table systematically:\n\nLooking at the table structure, I can see:\n- PopQA [116]: [7], [25], [67] - exactly 3 methods\n- MS MARCO [117]: [4], [40], [52] - exactly 3 methods\n- PopQA has methods: [7], [25], [67] - that's 3 methods\n- MS MARCO has methods: [4], [40], [52] - that's 3 methods\n- SST-2 [156]: [20], [33], [38] - exactly 3 methods\n\nLet me verify by checking each row more carefully:\n\n1. PopQA [116]: [7], [25], [67] = 3 methods\n2. MS MARCO [117]: [4], [40], [52] = 3 methods  \n3. QMSum(QM) [125]: [60], [123] = 2 methods\n4. Qasper [126]: [60], [63] = 2 methods\n5. COVID-QA [127]: [35], [46] = 2 methods\n6. QUALITY [130]: [60], [63] = 2 methods\n7. ARC [131]: [25], [67] = 2 methods\n8. CommonsenseQA [132]: [58], [66] = 2 methods\n9. GraphQA [84]: [84] = 1 method\n10. KBP [134]: [74], [135] = 2 methods\n11. DuleMon [136]: [74] = 1 method\n12. CamRest [137]: [78], [79] = 2 methods\n13. Amazon(Toys,Sport,Beauty) [138]: [39], [40] = 2 methods\n14. RAMS [140]: [36], [37] = 2 methods\n15. T-REX [141],ZsRE [142]: [27], [51] = 2 methods\n16. HellaSwag [143]: [20], [66] = 2 methods\n17. CoT Reasoning [144]: [27] = 1 method\n18. CSQA [145]: [55] = 1 method\n19. PubHealth [150]: [25], [67] = 2 methods\n20. Biography [151]: [67] = 1 method\n21. WikiASP [152]: [24] = 1 method\n22. XSum [153]: [17] = 1 method\n23. VioLens [154]: [19] = 1 method\n24. TREC [155]: [33] = 1 method\n25. SST-2 [156]: [20], [33], [38] = 3 methods\n26. CodeSearchNet [157]: [76] = 1 method\n27. NoMIRACL [56]: [56] = 1 method\n28. GSM8K [158]: [73] = 1 method\n29. JRC-Acquis [159]: [17] = 1 method\n\nAnswer: PopQA, MS MARCO, SST-2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42953, "error": null, "extra": {"thread_id": 2045, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ea697b93518746f587b9ed16f4f922df", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 257, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "ms marco, popqa, sst 2", "normalised_gold": "ms marco, popqa, sst 2"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically on page 10 where Section V \"AUGMENTATION PROCESS IN RAG\" begins, I can see the subsections listed:\n\nA. Iterative Retrieval\nB. Recursive Retrieval\nC. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 64634, "output_tokens": 83, "cost_micros": 195147, "latency_ms": 20108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find the section titled \"AUGMENTATION PROCESS IN RAG\" and identify its subsections.\n\nFrom Section V (AUGMENTATION PROCESS IN RAG), I can see the following structure:\n\nV. AUGMENTATION PROCESS IN RAG\n\nA. Iterative Retrieval\nB. Recursive Retrieval\nC. Adaptive Retrieval\n\nThese are the three subsections under \"AUGMENTATION PROCESS IN RAG\".\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 33377, "output_tokens": 127, "cost_micros": 102036, "latency_ms": 4500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Section V \"AUGMENTATION PROCESS IN RAG\" in the document, I can identify the subsections by examining the structure.\n\nThe section starts on page 10 and includes the following subsections:\n\nA. Iterative Retrieval\nB. Recursive Retrieval  \nC. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 40684, "output_tokens": 95, "cost_micros": 123477, "latency_ms": 4765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for the section \"AUGMENTATION PROCESS IN RAG\" (Section V), I can identify its subsections:\n\nFrom the document structure, Section V contains:\n- A. Iterative Retrieval\n- B. Recursive Retrieval  \n- C. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 30920, "output_tokens": 91, "cost_micros": 94125, "latency_ms": 4125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Section V \"AUGMENTATION PROCESS IN RAG\" in the document:\n\nThe section includes the following subsections:\n\nA. Iterative Retrieval\nB. Recursive Retrieval\nC. Adaptive Retrieval\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 37502, "output_tokens": 76, "cost_micros": 113646, "latency_ms": 4125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q005", "doc_id": "2312.10997v5.pdf", "answer_format": "list", "gold": "['Iterative Retrieval ', 'Recursive Retrieval ', 'Adaptive Retrieval ']", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find the \"AUGMENTATION PROCESS IN RAG\" section and identify its subsections.Let me search for the \"AUGMENTATION PROCESS IN RAG\" section in the document.Now let me read the section around line 2636 to see the subsections.Let me continue reading to see all subsections under \"AUGMENTATION PROCESS IN RAG\".Perfect! I can now see the structure of the \"AUGMENTATION PROCESS IN RAG\" section. It has three subsections:\n\n1. A. Iterative Retrieval (line 2646)\n2. B. Recursive Retrieval (line 2827)\n3. C. Adaptive Retrieval (line 2858)\n\nThe next section \"VI. TASK AND EVALUATION\" starts at line 2908, confirming that these are the only three subsections under \"AUGMENTATION PROCESS IN RAG\".\n\nAnswer: Iterative Retrieval, Recursive Retrieval, Adaptive Retrieval", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34782, "error": null, "extra": {"thread_id": 2046, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_dfd2aaa1a1dd45f1bdec0523c2899c57", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 177, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "adaptive retrieval, iterative retrieval, recursive retrieval", "normalised_gold": "adaptive retrieval, iterative retrieval, recursive retrieval"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "native_pdf", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "I need to search through the document for information about methods that integrate knowledge into white-box models via directive fine-tuning.\n\nLooking through the document, I found relevant information on page 10 (document page 9-10) in section III. RETRIEVAL, subsection E. Adapter:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nThe text explicitly states that PKG (which likely stands for \"Parametric Knowledge Guiding\" based on reference [75] mentioned in the text) introduces this method.\n\nAnswer: PKG", "citations": [], "input_tokens": 64627, "output_tokens": 158, "cost_micros": 196251, "latency_ms": 20969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "azure_basic_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for methods that integrate knowledge into white-box models via directive fine-tuning...\n\nIn Section III.E (Adapter), I find the following passage:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nAnswer: PKG", "citations": [], "input_tokens": 33370, "output_tokens": 94, "cost_micros": 101520, "latency_ms": 5250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "azure_premium_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about a method that integrates knowledge into white-box models via directive fine-tuning.\n\nIn Section III.E (Adapter), I can find the following passage:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nThe text clearly states that PKG introduces this method for integrating knowledge into white-box models via directive fine-tuning.\n\nAnswer: PKG", "citations": [], "input_tokens": 40677, "output_tokens": 130, "cost_micros": 123981, "latency_ms": 6671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "llamacloud_basic_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for methods that integrate knowledge into white-box models via directive fine-tuning...\n\nThe document discusses RA-DIT (Retrieval-Augmented Dual Instruction Tuning) in reference [27], which is described as \"retrieval-augmented dual instruction tuning.\" This method involves fine-tuning both the retriever and generator components.\n\nIn Section IV.B on LLM Fine-tuning, the document states: \"Fine-tuning of LLM can also be coordinated with fine-tuning of the retriever to align preferences. A typical approach, such as RA-DIT [27], aligns the scoring functions between Retriever and Generator using KL divergence.\"\n\nThis method specifically focuses on instruction tuning (a form of directive fine-tuning) to integrate retrieval-augmented knowledge into language models.\n\nAnswer: RA-DIT", "citations": [], "input_tokens": 30913, "output_tokens": 200, "cost_micros": 95739, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "ra dit", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "llamacloud_premium_lc", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "Looking through the document for information about integrating knowledge into white-box models via directive fine-tuning...\n\nIn Section III.E (Adapter), I find the following passage:\n\n\"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75]. In this approach, the retriever module is directly substituted to generate relevant documents according to a query.\"\n\nAnswer: PKG", "citations": [], "input_tokens": 37495, "output_tokens": 95, "cost_micros": 113910, "latency_ms": 5500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2312.10997v5.pdf::Q006", "doc_id": "2312.10997v5.pdf", "answer_format": "str", "gold": "PKG", "pages": 21, "document_id": 7596, "arm": "surfsense_agentic", "question_id": "2312.10997v5.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the priority document to find the answer to your question about which method integrates knowledge into white-box models via directive fine-tuning.Let me search for content about \"directive fine-tuning\" and \"white-box models\":Let me read more of the document to find tables or sections discussing methods:Let me search for tables that compare methods:Let me read around the tables to find information about methods:Let me search for \"Table I\" to find the table that might contain method comparisons:This table shows methods but doesn't mention white-box or directive fine-tuning. Let me search more specifically:Let me search for a combination of terms:Let me look for \"fine-tuning\" methods and check around line 2053 which mentions modular RAG integration with fine-tuning:Let me look for sections on fine-tuning methods:Found it! Let me read around line 2526:Perfect! I found the answer. The text states \"PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75].\" Let me verify this is PKG by reading a bit more context:Perfect! The answer is clear. The document states: \"Furthermore, PKG introduces an innovative method for integrating knowledge into white-box models via directive fine-tuning [75].\"\n\nAnswer: PKG", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 72530, "error": null, "extra": {"thread_id": 2047, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_9ee1ee526a9645f5ba543d1cda3a3a6d", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 421, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "pkg", "normalised_gold": "pkg"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to examine Figure 4 to identify which nodes RAPTOR retrieved for both questions about the Cinderella story.\n\nFrom the document, Figure 4 shows the querying process for two questions:\n1. \"What is the central theme of the story?\"\n2. \"How did Cinderella find a happy ending?\"\n\nLooking at Figure 4 on page 7, I can see:\n- Nodes highlighted indicate RAPTOR's selections\n- The figure shows a tree structure with nodes numbered 0-15 at the leaf layer, nodes 16-22 at an intermediate layer, and nodes 23-26 at the root layer\n\nFor Question 1 (central theme), RAPTOR retrieved nodes that are highlighted in one color.\nFor Question 2 (happy ending), RAPTOR retrieved nodes that are highlighted in another color.\n\nThe nodes that overlap (retrieved for BOTH questions) would be those highlighted with both colors.\n\nFrom the visual representation in Figure 4, I can see that certain nodes appear to be selected for both queries. Looking at the highlighted patterns, the nodes that appear to be retrieved for both questions are nodes 18 and 19 (visible in the intermediate layer where both highlighting patterns overlap).\n\nAnswer: 18, 19", "citations": [], "input_tokens": 58460, "output_tokens": 279, "cost_micros": 179565, "latency_ms": 35969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.4, "method": "list_set", "normalised_pred": "18, 19", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which nodes are highlighted (retrieved by RAPTOR) for both questions about Cinderella.\n\nThe figure shows two questions:\n1. \"What is the central theme of the story?\"\n2. \"How did Cinderella find a happy ending?\"\n\nAccording to the figure caption: \"Highlighted nodes indicate RAPTOR's selections, while arrows point to DPR's leaf nodes.\"\n\nExamining the visual representation in Figure 4, I can see the tree structure with nodes numbered 0-26. For nodes to be retrieved for both questions, they must be highlighted for both question 1 and question 2.\n\nLooking at the overlap between the two retrievals shown in the figure, the nodes that appear to be selected for both questions are those at the higher levels of the tree that contain summary information relevant to both the central theme and the happy ending.\n\nBased on the visual representation in Figure 4, the nodes retrieved for both questions appear to be from the upper layers of the tree.\n\nAnswer: 23, 24, 25, 26", "citations": [], "input_tokens": 22903, "output_tokens": 241, "cost_micros": 72324, "latency_ms": 8171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.28571428571428575, "method": "list_set", "normalised_pred": "23, 24, 25, 26", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4, I need to identify which nodes are retrieved by RAPTOR for both questions about Cinderella.\n\nThe legend indicates:\n- Nodes retrieved for Question 1 (central theme) are shown in one color\n- Nodes retrieved for Question 2 (happy ending) are shown in another color\n- Nodes retrieved for BOTH questions would be those highlighted with both colors\n\nFrom the figure, examining the tree structure and the highlighted nodes, the nodes that appear to be retrieved for both questions are the nodes that contain overlapping/comprehensive information about the story.\n\nBased on the visual representation in Figure 4, the nodes retrieved by RAPTOR for both questions are:\n\nAnswer: 16, 17, 18", "citations": [], "input_tokens": 27206, "output_tokens": 161, "cost_micros": 84033, "latency_ms": 5530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.3333333333333333, "method": "list_set", "normalised_pred": "16, 17, 18", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which nodes RAPTOR retrieves for both Question 1 (\"What is the central theme of the story?\") and Question 2 (\"How did Cinderella find a happy ending?\").\n\nFrom the figure description and the illustration:\n- For Question 1, RAPTOR retrieves certain highlighted nodes\n- For Question 2, RAPTOR retrieves certain highlighted nodes\n\nThe nodes that appear to be retrieved for BOTH questions (nodes that would be highlighted for both questions) are the nodes that contain information relevant to both the central theme and Cinderella's happy ending.\n\nBased on the visual representation in Figure 4, the nodes retrieved by RAPTOR for both questions appear to be nodes 9 and 10 (which are the upper-level summary nodes that contain broader thematic information relevant to both questions).\n\nAnswer: Node 9, Node 10", "citations": [], "input_tokens": 23495, "output_tokens": 203, "cost_micros": 73530, "latency_ms": 6937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "node 10, node 9", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "Looking at Figure 4 in the document, I need to identify which nodes are retrieved by RAPTOR for both questions.\n\nThe legend indicates:\n- Orange nodes/arrows: RAPTOR retrieved for Question 1 (\"What is the central theme of the story?\")\n- Purple nodes/arrows: RAPTOR retrieved for Question 2 (\"How did Cinderella find a happy ending?\")\n\nFrom the tree diagram, I can see:\n- Question 1 (orange): Node 25 and Node 19 are highlighted\n- Question 2 (purple): Node 17 and Node 20 are highlighted\n\nLooking for nodes that appear in both colors (retrieved for both questions), I need to check if any nodes have both orange and purple highlighting. However, from the visual representation, each highlighted node appears to be associated with only one question - the nodes are distinct between the two questions.\n\nUpon careful examination of Figure 4, there don't appear to be any nodes that are highlighted in both colors, meaning no nodes are retrieved by RAPTOR for both questions simultaneously.\n\nAnswer: None", "citations": [], "input_tokens": 26136, "output_tokens": 239, "cost_micros": 81993, "latency_ms": 7280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "none", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q000", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "[16, 19, 25]", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find Figure 4 and identify which nodes are retrieved by RAPTOR for both questions.Let me search for Figure 4 in the document:Now let me read the specific section containing Figure 4 from the target document:Now let me look at the actual figure content. I need to read more lines to see the figure structure:I can see the figure caption clearly states the two questions. Now I need to look at the actual tree diagram to see which nodes are highlighted for both questions. Let me check if there's more detail in the figure or if I need to examine it more carefully. Let me search for more context about the nodes:Let me look at the image content more carefully. Let me read around line 863 where the OCR text shows the nodes:Perfect! Now I can see the key information. Looking at the visual description:\n\n- **Orange/peach borders** (RAPTOR for Question 1): Nodes 15, 11, 18\n- **Purple/magenta borders** (RAPTOR for Question 2): Nodes 2, 3, 5, 11, 16, 17, 19\n\nThe nodes that appear in BOTH lists (retrieved by RAPTOR for both questions) is: **Node 11**\n\nHowever, let me also check the arrows to be thorough:Based on the visual description of Figure 4:\n\n**Question 1** (orange/peach): Nodes 15, 11, 18\n**Question 2** (purple/magenta): Nodes 2, 3, 5, 11, 16, 17, 19\n\nThe node that appears in both lists is **11**.\n\nAdditionally, looking at the arrows (which point to DPR's retrievals), nodes 3 and 11 have both orange and purple arrows, but the question asks specifically about which nodes are retrieved by RAPTOR (indicated by the border colors), not DPR (indicated by the arrows).\n\nAnswer: 11", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 61625, "error": null, "extra": {"thread_id": 2048, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_99be096db80e4817854f19cccddee98b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 378, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "11", "normalised_gold": "16, 19, 25"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 on page 6 of the document, I can identify the axes of the graph.\n\nThe figure shows \"Comparison of querying methods. Results on 20 stories from the QASPER dataset using tree traversal with different top-k values, and collapsed tree with different context lengths.\"\n\nThe horizontal axis is labeled \"Context Length\" and the vertical axis is labeled \"F1\" (which represents the F1 score metric).\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 58457, "output_tokens": 109, "cost_micros": 177006, "latency_ms": 33343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, which is titled \"Comparison of querying methods\":\n\nThe horizontal axis is labeled \"Context Length\" (measuring from 500 to 2500).\n\nThe vertical axis is labeled \"F1\" (measuring from approximately 40 to 55).\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 22900, "output_tokens": 74, "cost_micros": 69810, "latency_ms": 4109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document:\n\nThe figure shows \"Comparison of querying methods. Results on 20 stories from the QASPER dataset using tree traversal with different top-k values, and collapsed tree with different context lengths.\"\n\nThe horizontal axis is labeled \"Context Length\" and the vertical axis shows the performance metric (appears to be accuracy or F1 score percentage based on the scale showing values like 45, 80, etc.).\n\nAnswer: Context Length, Performance Score (F1/Accuracy)", "citations": [], "input_tokens": 27203, "output_tokens": 116, "cost_micros": 83349, "latency_ms": 5796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "context length, performance score f1 accuracy", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document, I need to identify the horizontal and vertical axes.\n\nFrom the figure caption and the graph itself:\n- The horizontal axis shows \"Context Length\" (measured in tokens, ranging from 500 to 2500)\n- The vertical axis shows the performance metric (which appears to be accuracy or F1 score, ranging from approximately 50 to 58)\n\nAnswer: Context Length, Performance metric (accuracy/F1 score)", "citations": [], "input_tokens": 23492, "output_tokens": 105, "cost_micros": 72051, "latency_ms": 5092, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "context length, performance metric accuracy f1 score", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 3 in the document:\n\nThe figure shows \"Comparison of querying methods\" with results on 20 stories from the QASPER dataset.\n\nFrom the table in Figure 3:\n- The horizontal axis shows \"Context Length\" (measured in tokens: 500, 1000, 1250, 1500, 1750, 2000, 2500)\n- The vertical axis shows \"F1\" (performance metric)\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 26133, "output_tokens": 113, "cost_micros": 80094, "latency_ms": 4717, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q001", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Context Length', 'F1']", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Figure 3 and identify its axes.Let me search for \"Figure 3\" in the document.Now let me search for the figure content with context:Let me read the section around line 786 to see the full figure details:Let me continue reading to see the complete axis information:This appears to be describing a different figure. Let me look more carefully at the figure data around line 826:Based on the figure caption and the data I see, I can see \"Context Length\" is mentioned at line 826. However, I need to find what the vertical axis represents. Let me search around the figure more carefully. The numbers like 45, 5, 7, 3, 4, 80 seem to be data values. Let me look for axis labels by reading the embedded image description or OCR text more carefully:Let me search for any text that might indicate what metric is being measured (the vertical axis). Let me look more broadly around Figure 3:I can see \"Context Length\" appears at line 826, which is likely the horizontal axis. The text mentions \"Figure 3 shows the performance\" but I need to find what specific performance metric. Let me search for what metric is being used in QASPER:Great! I found it. Let me read around line 712 to confirm this is for Figure 3:Perfect! I found the information. Looking at lines 693, 704, 711-712:\n- Line 693: \"F1\"\n- Line 704: \"Context Length\"\n- Line 711: \"X-axis shows context length\"\n- Line 712: \"Y-axis shows F1 score\"\n\nSo for Figure 3:\n- Horizontal axis (X-axis) = Context Length\n- Vertical axis (Y-axis) = F1\n\nAnswer: Context Length, F1", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 67266, "error": null, "extra": {"thread_id": 2049, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3509043a1828411e846352ac4e585d70", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 401, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "context length, f1", "normalised_gold": "context length, f1"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the \"PSEUDOCODE FOR RETRIEVAL METHODS\" section and count the lines in each algorithm.\n\nLooking at Appendix F on page 18, I can see:\n\n**Algorithm 1: Tree Traversal Algorithm**\nLet me count the lines of code (excluding comments and function declarations):\n- Line 1: Scurrent \u2190 tree.layer[0]\n- Line 2: for layer in range(tree.num layers) do\n- Line 3: topk \u2190 []\n- Line 4: for node in Scurrent do\n- Line 5: score \u2190 dot product(query, node)\n- Line 6: top k.append((node, score))\n- Line 7: end for\n- Line 8: Slayer \u2190 sorted(top k)[:k].nodes\n- Line 9: Scurrent \u2190 Slayer\n- Line 10: end for\n- Line 11: return S0 \u222a S1 \u222a S2 \u222a . . . \u222a Sk\n\nThat's approximately 11 lines of code.\n\n**Algorithm 2: Collapsed Tree Algorithm** (page 19)\nLet me count the lines:\n- Line 1: tree \u2190 flatten(tree)\n- Line 2: top nodes \u2190 []\n- Line 3: for node in tree do\n- Line 4: top nodes.append((node, dot product(query, node))\n- Line 5: end for\n- Line 6: top nodes \u2190 sorted(top nodes)\n- Line 7: result \u2190 []\n- Line 8: total tokens \u2190 0\n- Line 9: for node in top nodes do\n- Line 10: if total tokens + node.token size < max tokens then\n- Line 11: result.append(node)\n- Line 12: end if\n- Line 13: total tokens \u2190 total tokens + node.token size\n- Line 14: end for\n- Line 15: return result\n\nThat's approximately 15 lines of code.\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 58465, "output_tokens": 483, "cost_micros": 182640, "latency_ms": 36671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the pseudocode section in Appendix F:\n\n**Algorithm 1: Tree Traversal Algorithm**\n- Lines: function declaration, initialization (Scurrent), for loop, topk initialization, nested for loop, score calculation, top_k.append, end for, Slayer assignment, Scurrent assignment, end for, return statement, end function\n- Total: **13 lines**\n\n**Algorithm 2: Collapsed Tree Algorithm**\n- Lines: function declaration, tree flatten, top_nodes initialization, for loop, top_nodes.append, end for, top_nodes sort, result initialization, total_tokens initialization, for loop, if statement, result.append, end if, total_tokens update, end for, return statement, end function\n- Total: **17 lines**\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 22908, "output_tokens": 184, "cost_micros": 71484, "latency_ms": 5922, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "arm": "azure_premium_lc", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:59Z", "latency_ms": 7156, "error": null, "raw_text_chars": 752}]}, "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Appendix F \"PSEUDOCODE FOR RETRIEVAL METHODS\":\n\n**Algorithm 1 Tree Traversal Algorithm:**\n- Lines: function declaration, initialization of Scurrent, for layer loop, initialization of topk, for node loop, score calculation, append to top_k, end for, Slayer assignment, Scurrent update, end for, return statement, end function\n- Total: 13 lines\n\n**Algorithm 2 Collapsed Tree Algorithm:**\n- Lines: function declaration, flatten tree, initialization of top_nodes, for node loop, append with dot product, end for, sort top_nodes, result initialization, total_tokens initialization, for node loop, if condition, result append, end if, total_tokens update, end for, return statement, end function\n- Total: 16 lines\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 27211, "output_tokens": 190, "cost_micros": 84483, "latency_ms": 7156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the pseudocode in Appendix F:\n\n**Algorithm 1 (Tree Traversal Algorithm):**\n- Lines 1-13 (including function definition, loops, operations, and end statements)\n\n**Algorithm 2 (Collapsed Tree Algorithm):**\n- Lines 1-17 (including function definition, loops, operations, and end statements)\n\nCounting all lines including the function declarations, loop structures, operations, and end statements, Algorithm 2 (Collapsed Tree Algorithm) has more lines.\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 23500, "output_tokens": 124, "cost_micros": 72360, "latency_ms": 5562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the pseudocode in Section F \"PSEUDOCODE FOR RETRIEVAL METHODS\":\n\n**Algorithm 1 (Tree Traversal Algorithm):**\n- Lines include: function declaration, variable initialization, for loop with nested for loop, operations inside loops, and return statement\n- Counting all lines: approximately 13 lines\n\n**Algorithm 2 (Collapsed Tree Algorithm):**\n- Lines include: function declaration, tree flattening, initialization, for loop for scoring, sorting, result initialization, token counter, for loop with conditional, and return statement\n- Counting all lines: approximately 15 lines\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 26141, "output_tokens": 143, "cost_micros": 80568, "latency_ms": 6157, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document to find the \"PSEUDOCODE FOR RETRIEVAL METHODS\" section and count the lines for each algorithm.Let me search for the \"PSEUDOCODE FOR RETRIEVAL METHODS\" section:Now let me search for more specific context around pseudocode:Let me read the section with the algorithms:Perfect! I can see both algorithms. Let me count the lines:\n\n**Algorithm 1: Tree Traversal Algorithm** (lines 2344-2356):\n- Line 2344: function TRAVERSETREE(tree, query, k)\n- Line 2345: Scurrent + tree.layer[0]\n- Line 2346: for layer in range(tree.num_layers) do\n- Line 2347: topk + []\n- Line 2348: for node in Scurrent do\n- Line 2349: score + dot_product(query, node)\n- Line 2350: top_k.append((node, score))\n- Line 2351: end for\n- Line 2352: Slayer + sorted(top_k)[:k].nodes\n- Line 2353: Scurrent + Slayer\n- Line 2354: end for\n- Line 2355: return So U S1 U S2 U . . . USk\n- Line 2356: end function\n\n**Total: 13 lines**\n\n**Algorithm 2: Collapsed Tree Algorithm** (lines 2375-2391):\n- Line 2375: function COLLAPSEDTREE(tree, query, k, max_tokens)\n- Line 2376: tree \u2039- flatten(tree)\n- Line 2377: top_nodes <- []\n- Line 2378: for node in tree do\n- Line 2379: top_nodes.append((node, dot_product(query, node))\n- Line 2380: end for\n- Line 2381: top_nodes \u2039- sorted(top_nodes)\n- Line 2382: result +- []\n- Line 2383: total_tokens <- 0\n- Line 2384: for node in top nodes do\n- Line 2385: if total_tokens + node.token_size < max_tokens then\n- Line 2386: result.append(node)\n- Line 2387: end if\n- Line 2388: total_tokens \u2039- total_tokens + node.token_size\n- Line 2389: end for\n- Line 2390: return result\n- Line 2391: end function\n\n**Total: 17 lines**\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 33921, "error": null, "extra": {"thread_id": 2050, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8432e22fe2074cf88b3f3fbd3ebd2c32", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 208, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to examine Figure 1 to identify nodes that appear in more than one cluster and determine their colors.\n\nLooking at Figure 1 on page 2, which shows the \"Formation of one tree layer\" process:\n\nThe figure shows the clustering step (labeled \"1. Clustering\") where text chunks numbered 1-5 are grouped. The diagram illustrates how nodes are clustered together, with some nodes appearing to be part of multiple clusters during the soft clustering process.\n\nIn the clustering visualization, I can see that:\n- The nodes are represented as numbered boxes (1, 2, 3, 4, 5)\n- Some nodes appear in multiple cluster groupings\n- Looking at the color coding in the diagram, nodes that belong to multiple clusters appear to be shown with a pink/red outline or highlighting\n\nExamining the \"Contents of a node\" box on the right side and the clustering diagram in the middle, nodes that appear in multiple clusters are highlighted in pink/red color.\n\nAnswer: pink, red", "citations": [], "input_tokens": 58465, "output_tokens": 221, "cost_micros": 178710, "latency_ms": 37828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "pink, red", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to examine Figure 1 in the document to identify nodes that appear in more than one cluster and determine their colors.\n\nLooking at Figure 1, which shows the \"Tree construction process\" and \"Formation of one tree layer\":\n\nThe figure shows:\n- Text chunks numbered 1-5 at the bottom (leaf layer)\n- A clustering process in the middle showing how these chunks are grouped\n- The tree structure on the right\n\nIn the \"Formation of one tree layer\" section, I can see:\n- The clustering diagram shows nodes being grouped together\n- Looking at the clustering visualization, I need to identify which numbered nodes appear in multiple clusters\n\nExamining the clustering section carefully:\n- Node 1 appears to be in one cluster\n- Node 2 appears in multiple positions\n- Node 3 appears in multiple positions  \n- Node 4 appears to be in one cluster\n- Node 5 appears in multiple positions\n\nLooking at the colors used in the figure:\n- The nodes that appear in multiple clusters appear to be colored in shades that look like they could be blue, green, or orange/yellow tones in the clustering diagram.\n\nHowever, upon closer examination of Figure 1, the nodes shown in the \"1. Clustering\" section use different colors. The nodes that appear in overlapping or multiple cluster regions appear to use colors like blue and green based on the grayscale representation in the figure.\n\nLooking more carefully at the specific nodes (2, 3, 5) that appear in the clustering overlap, they appear to use blue and green colors.\n\nAnswer: blue, green", "citations": [], "input_tokens": 22908, "output_tokens": 347, "cost_micros": 73929, "latency_ms": 10734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "blue, green", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 1, I need to identify nodes that appear in multiple clusters during the clustering step.\n\nIn the \"Formation of one tree layer\" section of Figure 1, I can see the clustering process where nodes 1-5 are being grouped. \n\nExamining the clusters:\n- One cluster contains nodes 2 and 3\n- Another cluster contains nodes 4 and 5\n\nLooking at the visual representation, the nodes that appear in the clusters are shown with different colors. The nodes that would appear in multiple clusters (if any existed) would need to be identified by their color.\n\nHowever, from the figure, each node (1-5) appears to belong to only one cluster. Node 3 appears in the cluster with node 2, and node 5 appears in the cluster with node 4. None of the nodes are shown appearing in more than one cluster simultaneously.\n\nUpon careful examination of the figure, there don't appear to be any nodes shown in multiple clusters at the same time - the soft clustering concept is mentioned in the text but not visually demonstrated in this particular figure with overlapping memberships.\n\nAnswer: None", "citations": [], "input_tokens": 27211, "output_tokens": 246, "cost_micros": 85323, "latency_ms": 7733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "none", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to examine Figure 1 carefully to identify nodes that appear in more than one cluster and determine their colors.\n\nLooking at Figure 1, which shows the \"Formation of one tree layer\" section:\n\nThe figure shows a clustering process where text chunks (numbered 1-5 at the bottom) are being clustered. \n\nIn the clustering visualization, I can see:\n- Text chunks numbered 1, 2, 3, 4, 5 at the bottom\n- The chunks are shown being grouped into clusters\n- After clustering (step 1), chunks 3 and 5 appear to be in multiple clusters based on the overlapping circles/groupings shown\n\nLooking at the visual representation, the nodes that appear in more than one cluster are shown in a yellow/gold color, while nodes that appear in only one cluster are shown in blue.\n\nSpecifically, nodes 3 and 5 appear in overlapping cluster regions and are colored yellow, indicating they belong to multiple clusters (soft clustering).\n\nAnswer: yellow", "citations": [], "input_tokens": 23500, "output_tokens": 224, "cost_micros": 73860, "latency_ms": 8203, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "yellow", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I need to examine the tree structure diagram to identify nodes that appear in multiple clusters.\n\nFrom the diagram showing \"RAPTOR Tree\" and \"Formation of one tree layer\":\n\nThe diagram shows:\n- Text chunks labeled 1, 2, 3, 4, 5 at the bottom (leaf nodes)\n- These are clustered into groups\n- Summary nodes 6, 7, 8 are created from clusters\n- Higher level nodes 9, 10 are created from further clustering\n\nExamining the tree structure:\n- Node 6 is connected to nodes 1 and 2\n- Node 7 is connected to node 3\n- Node 8 is connected to nodes 4 and 5\n- Node 9 is connected to nodes 6 and 7\n- Node 10 is connected to node 8\n\nLooking at the clustering process description, the figure shows standard hierarchical clustering where each node belongs to one cluster at each level. The nodes are represented in the diagram without specific color coding mentioned in the text, and the diagram appears to be a standard tree structure where nodes don't appear in multiple clusters simultaneously.\n\nHowever, based on the visual representation in the document, the nodes appear to be shown in a neutral/default color (likely gray or black outlines) in the standard tree diagram format.\n\nAnswer: The figure does not show nodes in multiple clusters with distinct colors", "citations": [], "input_tokens": 26141, "output_tokens": 317, "cost_micros": 83178, "latency_ms": 9719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "figure does not show nodes in multiple clusters with distinct colors", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q003", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['green', 'yellow']", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find Figure 1 and identify the colors of nodes that appear in more than one cluster.Let me continue reading to see the full description of Figure 1:Based on my reading of the document, I can see the description of Figure 1. Let me look more carefully at the visual description section that describes the tree construction process and the colors of nodes. Looking at lines 426-456, I found the relevant information:\n\nIn the visual description of Figure 1:\n- **Purple/Lavender**: Tree nodes (non-selected or leaf nodes)\n- **Pink**: Selected/relevant nodes and retrieved context\n\nThe description states that in the tree traversal diagram:\n- \"Purple/lavender nodes at leaf positions\"\n- \"Pink nodes appear at internal/decision points\"\n\nAnd in the Part B (Collapsed Tree Retrieval) section, it mentions:\n- \"Mix of purple and pink boxes in linear sequence\"\n\nFrom the description of the tree formation process in lines 305-385, it shows nodes being clustered together, and the color coding explanation in lines 447-453 clearly indicates:\n- **Purple/Lavender**: Tree nodes (non-selected or leaf nodes)\n- **Pink**: Selected/relevant nodes and retrieved context\n\nSince the question asks about nodes that appear in more than one cluster, and based on the visual description which shows that pink nodes appear at internal/decision points while purple/lavender nodes are at leaf positions, the nodes that would appear in multiple clusters (as parent nodes summarizing multiple child clusters) would be the **pink** nodes.\n\nHowever, let me verify this interpretation by checking if there's more specific information about which colored nodes appear in multiple clusters in the demonstration.\n\nLooking at the figure caption (lines 302-303): \"Nodes clustered together are siblings; a parent node contains the text summary of that cluster.\"\n\nThis indicates that parent nodes (which would be pink based on the color coding) are the ones that span multiple clusters by summarizing them.\n\nAnswer: pink", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36328, "error": null, "extra": {"thread_id": 2051, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_50d7b2d924b34b7ab5e0bc22ccd0b3d6", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 237, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "pink", "normalised_gold": "green, yellow"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about the clustering algorithm used in this paper and what challenges it faces.\n\nFrom page 3-4, I can find the relevant information about the clustering algorithm:\n\nThe paper states: \"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nRegarding the challenge, it states: \"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces (Aggarwal et al., 2001).\"\n\nTo address this challenge, the paper mentions: \"To mitigate this, we employ Uniform Manifold Approximation and Projection (UMAP), a manifold learning technique for dimensionality reduction (McInnes et al., 2018).\"\n\nAnswer: Gaussian Mixture Models (GMMs), high dimensionality of vector embeddings", "citations": [], "input_tokens": 58463, "output_tokens": 208, "cost_micros": 178509, "latency_ms": 33342, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "gaussian mixture models gmms, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll search through the document for information about the clustering algorithm and its challenges.\n\nLooking at Section 3 \"METHODS\", specifically the \"Clustering Algorithm\" subsection:\n\nThe document states:\n\"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nAnd regarding the challenge:\n\"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 22906, "output_tokens": 134, "cost_micros": 70728, "latency_ms": 7625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for information about the clustering algorithm...\n\nIn Section 3, under \"Clustering Algorithm\", the document states:\n\n\"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nIt then discusses the challenge:\n\n\"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 27209, "output_tokens": 120, "cost_micros": 83427, "latency_ms": 5312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking through the document for information about the clustering algorithm...\n\nIn Section 3, under \"Clustering Algorithm\", I can find the relevant information:\n\nThe clustering algorithm is based on **Gaussian Mixture Models (GMMs)**.\n\nThe challenge presented to it is stated as: \"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces.\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 23498, "output_tokens": 114, "cost_micros": 72204, "latency_ms": 4781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the clustering algorithm and what presents a challenge to it.\n\nFrom Section 3 \"METHODS\" under \"Clustering Algorithm\":\n\nThe document states:\n\"Our clustering algorithm is based on Gaussian Mixture Models (GMMs), an approach that offers both flexibility and a probabilistic framework.\"\n\nThen it mentions:\n\"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\"\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 26139, "output_tokens": 134, "cost_micros": 80427, "latency_ms": 6639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q005", "doc_id": "2401.18059v1.pdf", "answer_format": "list", "gold": "['Gaussian Mixture Models', 'the high dimensionality of vector embeddings']", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about the clustering algorithm, what model it's based on, and what presents a challenge to it.Let me search for information about the clustering algorithm and its base model:Perfect! I found the information about the clustering algorithm. Let me verify by reading a bit more to ensure I have the complete answer:Based on the document, I have found the answer to the question:\n\n1. The clustering algorithm is based on **Gaussian Mixture Models (GMMs)**\n2. The challenge to it is **the high dimensionality of vector embeddings** (as stated: \"The high dimensionality of vector embeddings presents a challenge for traditional GMMs, as distance metrics may behave poorly when used to measure similarity in high-dimensional spaces\")\n\nAnswer: Gaussian Mixture Models, high dimensionality of vector embeddings", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34000, "error": null, "extra": {"thread_id": 2052, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_6459e8a5fde745a3b9337a10119c8000", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 161, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "gaussian mixture models, high dimensionality of vector embeddings", "normalised_gold": "gaussian mixture models, high dimensionality of vector embeddings"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "native_pdf", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "I need to find the pseudocode for the tree traversal method in the appendix and identify step 5.\n\nLooking at Appendix F, I can see Algorithm 1 which is the Tree Traversal Algorithm.\n\nLet me examine the steps in the algorithm:\n\n```\nfunction TRAVERSETREE(tree, query, k)\n    Scurrent \u2190 tree.layer[0]\n    for layer in range(tree.num layers) do\n        topk \u2190 []\n        for node in Scurrent do\n            score \u2190 dot product(query, node)\n            top k.append((node, score))\n        end for\n        Slayer \u2190 sorted(top k)[:k].nodes\n        Scurrent \u2190 Slayer\n    end for\n    return S0 \u222a S1 \u222a S2 \u222a . . . \u222a Sk\nend function\n```\n\nCounting the steps:\n1. `Scurrent \u2190 tree.layer[0]`\n2. `for layer in range(tree.num layers) do`\n3. `topk \u2190 []`\n4. `for node in Scurrent do`\n5. `score \u2190 dot product(query, node)`\n\nStep 5 is: `score \u2190 dot product(query, node)`\n\nAnswer: score \u2190 dot product(query, node)", "citations": [], "input_tokens": 58458, "output_tokens": 318, "cost_micros": 180144, "latency_ms": 33764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2401.18059v1.pdf"}, "graded": {"correct": false, "f1": 0.15384615384615383, "method": "str_norm", "normalised_pred": "score \u2190 dot product query node", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "azure_basic_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the tree traversal method steps in Section 3 of the document, step 5 states:\n\"5. Continue this process recursively for d layers, producing sets S1, S2, . .. , Sd.\"\n\nNow checking Algorithm 1 (Tree Traversal Algorithm) in Appendix F, the line that corresponds to this step is:\n\n```\nfor layer in range(tree.num_layers) do\n```\n\nThis is the loop that continues the process recursively through the layers.\n\nAnswer: for layer in range(tree.num_layers) do", "citations": [], "input_tokens": 22901, "output_tokens": 129, "cost_micros": 70638, "latency_ms": 5359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "for layer in range tree num layers do", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "azure_premium_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the document, in Section 3 under \"Querying\", step 5 of the tree traversal method states:\n\n\"5. Continue this process recursively for d layers, producing sets S1, S2, ... , Sd.\"\n\nIn Appendix F, Algorithm 1 shows the Tree Traversal Algorithm pseudocode. Step 5 corresponds to the for loop that iterates through layers:\n\n```\nfor layer in range(tree.num_layers) do\n    topk + []\n    for node in Scurrent do\n        score + dot_product(query, node)\n        top_k.append((node, score))\n    end for\n    Slayer + sorted(top_k)[:k].nodes\n    Scurrent + Slayer\nend for\n```\n\nAnswer: for layer in range(tree.num_layers) do ... end for", "citations": [], "input_tokens": 27204, "output_tokens": 200, "cost_micros": 84612, "latency_ms": 7358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "for layer in range tree num layers do end for", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "llamacloud_basic_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the tree traversal method described in Section 3 (Querying section), step 5 states:\n\n\"5. Continue this process recursively for d layers, producing sets S\u2081, S\u2082, . . . , Sd.\"\n\nIn Appendix F, Algorithm 1 shows the Tree Traversal Algorithm pseudocode. The code that corresponds to step 5 is the for loop that iterates through the layers:\n\n```\nfor layer in range(tree.num_layers) do\n    top\u2096 \u2190 []\n    for node in Scurrent do\n        score \u2190 dot_product(query, node)\n        top_k.append((node, score))\n    end for\n    Slayer \u2190 sorted(top_k)[:k].nodes\n    Scurrent \u2190 Slayer\nend for\n```\n\nAnswer: for layer in range(tree.num_layers) do ... end for", "citations": [], "input_tokens": 23493, "output_tokens": 212, "cost_micros": 73659, "latency_ms": 7062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "for layer in range tree num layers do end for", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "llamacloud_premium_lc", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "Looking at the tree traversal method described in Section 3 (page 4-5), step 5 states:\n\n\"5. Continue this process recursively for $d$ layers, producing sets $S_1, S_2, \\dots, S_d$.\"\n\nNow looking at Algorithm 1 (Tree Traversal Algorithm) in Appendix F (page 18), the pseudo code line that corresponds to this step is:\n\n```\nS_current \u2190 S_layer\n```\n\nThis line updates the current set of nodes to continue the process to the next layer.\n\nAnswer: S_current \u2190 S_layer", "citations": [], "input_tokens": 26134, "output_tokens": 141, "cost_micros": 80517, "latency_ms": 5858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.16666666666666666, "method": "str_norm", "normalised_pred": "s current \u2190 s layer", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2401.18059v1.pdf::Q006", "doc_id": "2401.18059v1.pdf", "answer_format": "str", "gold": "Slayer \u2190 sorted(top k)[:k].nodes", "pages": 23, "document_id": 7597, "arm": "surfsense_agentic", "question_id": "2401.18059v1.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to find the pseudocode in the appendix related to step 5 of the tree traversal method.Let me search for \"tree traversal\" and look at the appendix section to find the pseudocode.Now let me find the actual appendix section with pseudocode. Let me search for \"step 5\" or look at the end of the document where appendices typically are.Let me continue looking for the appendix sections. Let me search further in the document.Good, I found Appendix A. Let me continue searching for Appendix F which was mentioned to contain the pseudocode for both methods.Perfect! Found Appendix F. Let me read that section:Perfect! I found Algorithm 1 for Tree Traversal. Now let me look closely at the steps. The pseudocode shows:\n\n1. Line 2345: `Scurrent + tree.layer[0]` (Initialize current set)\n2. Line 2346: `for layer in range(tree.num_layers) do` (Loop through layers)\n3. Line 2347: `topk + []` (Initialize topk list)\n4. Line 2348: `for node in Scurrent do` (Loop through nodes)\n5. Line 2349: `score + dot_product(query, node)` (Calculate score)\n\nStep 5 corresponds to line 2349 which calculates the score using dot_product.\n\nAnswer: score + dot_product(query, node)", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 60780, "error": null, "extra": {"thread_id": 2053, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_cbc9bd5486c544c293802a0ecf0d5bc7", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 329, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "score dot product query node", "normalised_gold": "slayer \u2190 sorted top k k nodes"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 18671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2405.09818v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify which figures in the paper contain line plots.\n\nLet me go through the figures mentioned in the document:\n\n**Figure 1**: Shows the Chameleon architecture with image tokenizer, mixed-modal auto-regressive LM, and image de-tokenizer. This is a diagram, not a line plot.\n\n**Figure 2**: Shows sample interleaved image and text generation. This is an example output, not a line plot.\n\n**Figure 3**: Shows sample generation from Chameleon. This is an example output, not a line plot.\n\n**Figure 4**: Shows sample Chameleon outputs. This is an example output, not a line plot.\n\n**Figure 5**: Contains three subfigures:\n- Figure 5a: Shows \"output norm and training loss curves\" - this includes line plots\n- Figure 5b: \"An ablation with Chameleon-7B with and without QK-Norm\" - shows training loss, which is a line plot\n- Figure 5c: \"An ablation with Chameleon-7B with and without dropout\" - shows training loss, which is a line plot\n\n**Figure 6**: Contains three subfigures:\n- Figure 6a: Shows \"Training Curves for 600k steps for Chameleon-7B and Chameleon-34B\" - this is a line plot\n- Figure 6b: Shows \"Training loss curve with image generation disabled\" - this is a line plot\n- Figure 6c: Shows training loss curves with and without norm-reordering - this is a line plot\n\n**Figure 7**: Shows example alignment data for different categories. This appears to be text/table format, not line plots.\n\n**Figure 8**: Shows task categories and examples of prompts. This is not a line plot.\n\n**Figure 9**: Contains:\n- Figure 9a: Shows \"prompt task fulfillment rates\" as a horizontal bar chart\n- Figure 9b: Shows comparison results as a horizontal bar chart\n\n**Figure 10**: Shows \"inter-annotator agreement\" as a horizontal bar chart.\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 25078, "output_tokens": 479, "cost_micros": 82419, "latency_ms": 11155, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify all figures in the paper that contain line plots.\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows a diagram of the Chameleon architecture with image and text tokenization flow - not a line plot.\n\n**Figure 2**: Shows sample interleaved image and text generation - not a line plot.\n\n**Figure 3**: Shows sample generation from Chameleon - not a line plot.\n\n**Figure 4**: Shows sample Chameleon outputs - not a line plot.\n\n**Figure 5**: Contains three subfigures:\n- Figure 5a: Shows \"Output Norm\" vs \"Step\" - this is a line plot\n- Figure 5b: Shows \"Training Loss\" vs \"Step\" with and without QK-Norm - this is a line plot\n- Figure 5c: Shows \"Training Loss\" vs \"Step\" with and without dropout - this is a line plot\n\n**Figure 6**: Contains three subfigures:\n- Figure 6a: Shows \"Training Loss\" vs \"Step\" for 7B and 34B models - this is a line plot\n- Figure 6b: Shows \"Training Loss\" vs \"Step\" for 7B without image generation - this is a line plot\n- Figure 6c: Shows \"Training Loss\" vs \"Step\" with and without norm reordering - this is a line plot\n\n**Figure 7**: Shows example alignment data - not a line plot.\n\n**Figure 8**: Shows task categories and example prompts - not a line plot.\n\n**Figure 9**: Contains two subfigures:\n- Figure 9a: Shows a bar chart of task fulfillment rates - not a line plot\n- Figure 9b: Shows a horizontal bar chart comparing Chameleon vs baselines - not a line plot\n\n**Figure 10**: Shows a horizontal bar chart of inter-annotator agreement - not a line plot.\n\nAnswer: Figure 5a, Figure 5b, Figure 5c, Figure 6a, Figure 6b, Figure 6c", "citations": [], "input_tokens": 34447, "output_tokens": 457, "cost_micros": 110196, "latency_ms": 10469, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "figure 5a, figure 5b, figure 5c, figure 6a, figure 6b, figure 6c", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify all figures in the document that contain line plots.\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows the Chameleon architecture diagram with tokens - not a line plot.\n\n**Figure 2**: Sample interleaved image and text generation - not a line plot.\n\n**Figure 3**: Sample generation from Chameleon with images - not a line plot.\n\n**Figure 4**: Sample Chameleon outputs with recipe - not a line plot.\n\n**Figure 5**: This figure has three subfigures:\n- (a) Shows \"Uncontrolled growth of output norms\" with what appears to be line plots showing training progress\n- (b) Shows \"An ablation with Chameleon-7B with and without QK-Norm\" - appears to be line plots\n- (c) Shows \"An ablation with Chameleon-7B with and without dropout\" - appears to be line plots\n\n**Figure 6**: This figure has parts showing training curves:\n- (a) \"Training loss curve with image generation disabled\" - line plot\n- (b) Shows training for Chameleon-34B - line plots\n\nThe caption mentions \"Training loss curves for Chameleon models under various settings.\"\n\n**Figure 7**: Example alignment data - not line plots.\n\n**Figure 8**: Task categories and examples of prompts - not line plots.\n\n**Figure 9**: The text mentions Figure 9a and 9b for task fulfillment and win rates - likely bar charts or similar, need to check the context. The text describes percentages but doesn't clearly indicate these are line plots.\n\n**Figure 10**: \"The inter-annotator agreement on the questions in the absolute evaluation\" - context suggests this might be a bar chart or similar visualization.\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 26087, "output_tokens": 410, "cost_micros": 84411, "latency_ms": 11625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I need to identify all figures in the paper that contain line plots.\n\nLet me go through the figures systematically:\n\n**Figure 1**: Shows a diagram with flowcharts for pre-training and generation processes - not a line plot.\n\n**Figure 2**: Shows sample interleaved image and text generation - not a line plot.\n\n**Figure 3**: Shows sample generation from Chameleon - not a line plot.\n\n**Figure 4**: Shows sample Chameleon outputs - not a line plot.\n\n**Figure 5**: This figure has three parts:\n- (a) Shows \"Output norm\" over \"Step\" - this is a line plot\n- (b) Shows training loss over steps with and without QK-Norm - this is a line plot\n- (c) Shows training loss over steps with and without dropout - this is a line plot\n\n**Figure 6**: This figure has three parts:\n- (a) Shows \"Training Curves\" with loss over steps - this is a line plot\n- (b) Shows \"Training loss curve\" - this is a line plot\n- (c) Shows training loss with different settings - this is a line plot\n\n**Figure 7**: Shows example alignment data for different categories - not a line plot.\n\n**Figure 8**: Shows task categories and example prompts in a table - not a line plot.\n\n**Figure 9**: Shows performance comparison tables - not line plots, these are tables.\n\n**Figure 10**: Shows inter-annotator agreement in a table format - not a line plot.\n\nSo the figures containing line plots are: Figure 5 (parts a, b, c) and Figure 6 (parts a, b, c).\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 32138, "output_tokens": 382, "cost_micros": 102144, "latency_ms": 8875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to identify which figures contain line plots.Let me search for more specific information about each figure and read the sections containing figure descriptions:Now let me look at Figure 5 more carefully and check other figures:Let me look at the beginning of Figure 5 to see the first part:Good! I can see Figure 5 has multiple line plots. Now let me check other figures. Let me search for more figure captions and descriptions:Let me look for Figure 7 and Figure 8:Let me continue reading Figure 8 and check if there are more figures:I can see Figure 9a is a bar chart. Let me continue examining more figures. Let me search for all figures systematically by searching for specific figure numbers:Now let me look at the actual content of each figure more carefully. Let me check Figure 1:Figure 1 appears to be a diagram/schematic, not a line plot. Let me check Figure 2 and 3:Figures 2 and 3 appear to be examples of text and image generation, not line plots. Let me check Figure 4:Figure 4 is also a sample output, not a line plot. Now let me look more carefully at Figure 10 which I saw earlier has a bar chart format:Figure 10 is a bar chart showing counts for different agreement levels. Let me check if there are any other figures I might have missed by searching for more figure references:Figure 8 shows task categories and examples (text-based). Now let me systematically review what I found:\n\nBased on my reading of the document:\n\n- **Figure 1**: Diagram/schematic showing the Chameleon architecture\n- **Figure 2**: Sample text and image generation (not a line plot)\n- **Figure 3**: Sample generation (not a line plot)  \n- **Figure 4**: Sample outputs (not a line plot)\n- **Figure 5**: Contains THREE line plots:\n  - (a) Output norm vs Step (line plot)\n  - (b) Training Loss vs Step with/without QK-Norm (line plot)\n  - (c) Training Loss vs Step with/without dropout (line plot)\n- **Figure 6**: Contains THREE line plots:\n  - (a) Training Loss vs Step for 7B and 34B models (line plot)\n  - (b) Training Loss vs Step without image generation (line plot)\n  - (c) Training Loss vs Step with/without norm reordering (line plot)\n- **Figure 7**: Example alignment data for different categories (not a line plot)\n- **Figure 8**: Task categories and examples of prompts (not a line plot)\n- **Figure 9**: Contains bar charts showing performance comparison (not line plots)\n- **Figure 10**: Bar chart showing inter-annotator agreement (not a line plot)\n\nAnswer: Figure 5, Figure 6", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 104391, "error": null, "extra": {"thread_id": 2054, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3ce67a1a47334e71b23dc25fd26d8b74", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 654, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "figure 5, figure 6", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I can see the caption states:\n\n\"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 25075, "output_tokens": 41, "cost_micros": 75840, "latency_ms": 3905, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, the caption states:\n\n\"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 34444, "output_tokens": 38, "cost_micros": 103902, "latency_ms": 3641, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 in the document, I can see the caption states:\n\n\"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 26084, "output_tokens": 41, "cost_micros": 78867, "latency_ms": 3483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "Looking at Figure 1 and its caption, I can see that the figure shows Chameleon's architecture with different colored elements representing different types of tokens.\n\nThe caption explicitly states: \"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 32135, "output_tokens": 62, "cost_micros": 97335, "latency_ms": 4108, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find Fig 1 and determine what the green squares denote.Let me search for \"Fig 1\" in the document to locate the figure and its caption.Let me search for \"Figure 1\" instead:Good, I found Figure 1. Now let me read the section around line 354 to see the figure and find information about green squares:Perfect! I found the answer. In line 355, the figure caption explicitly states: \"Text tokens are represented in green and image tokens are represented in blue.\"\n\nAnswer: Text tokens", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 31469, "error": null, "extra": {"thread_id": 2055, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_242072fc7b9a40f1ba1d0b58497f99ea", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 150, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "text tokens", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures used in chat example figures, excluding the Appendix.\n\nLet me go through the document systematically:\n\n**Figure 2** (page 3): Sample interleaved image and text generation from Chameleon\n- Shows 3 images (Keel-Billed Toucan, Puffin, Golden Pheasant)\n\n**Figure 3** (page 4): Sample generation from Chameleon\n- Shows 2 images (the STOP sign input image and the generated chameleon image)\n\n**Figure 4** (page 5): Sample Chameleon outputs\n- Shows 2 images (bananas input image and the banana bread output image)\n\n**Figure 7** (page 10): Example alignment data for different categories\n- This shows examples of different data types:\n  - (a) Text - no images\n  - (b) Code - no images\n  - (c) Image Generation - 1 image shown\n  - (d) Visual Chat - 1 image shown (flower arrangement)\n  - (e) Interleaved Text/Image Generation - 2 images shown (appears to be related to Gyumri/Armenian content)\n\n**Figure 8** (page 11): Task categories and examples of prompts\n- Shows 4 small example images in the prompt examples\n\nNow let me check the Appendix section (starts at page 23):\n- The Appendix section A \"Samples\" contains additional examples, but I should exclude these.\n\nTotal count (excluding Appendix):\n- Figure 2: 3 images\n- Figure 3: 2 images\n- Figure 4: 2 images\n- Figure 7: 4 images (1 + 1 + 2)\n- Figure 8: 4 images\n\nTotal: 3 + 2 + 2 + 4 + 4 = 15 images\n\nAnswer: 15", "citations": [], "input_tokens": 25074, "output_tokens": 446, "cost_micros": 81912, "latency_ms": 11907, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures used in the chat example figures, excluding the Appendix.\n\nLet me go through the figures that show chat examples:\n\n**Figure 2** (page 3): \"Sample interleaved image and text generation from Chameleon\"\n- Shows 3 bird images (Keel-Billed Toucan, Puffin, Golden Pheasant)\nTotal: 3 images\n\n**Figure 3** (page 4): \"Sample generation from Chameleon\"\n- Shows 1 stop sign image in the prompt\n- Shows 1 chameleon image in the response\nTotal: 2 images\n\n**Figure 4** (page 5): \"Sample Chameleon outputs\"\n- Shows 1 banana image in the prompt\n- Shows 1 banana bread image in the response\nTotal: 2 images\n\nThe Appendix starts at page 23 with \"A Samples\". The figures after that are in the Appendix and should be excluded.\n\nTotal pictures in chat example figures (excluding Appendix): 3 + 2 + 2 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 34443, "output_tokens": 263, "cost_micros": 107274, "latency_ms": 7530, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures/images used in the chat example figures in the main body of the document (excluding the Appendix).\n\nLet me go through the figures that show chat/conversation examples:\n\n1. **Figure 1**: Shows a recipe example with bananas image and banana bread image = 2 images\n\n2. **Figure 2**: \"Sample interleaved image and text generation from Chameleon\" - This shows a response about quirky-looking birds with 3 bird images (Keel-Billed Toucan, Puffin, and Golden Pheasant) = 3 images\n\n3. **Figure 3**: \"Sample generation from Chameleon\" - Shows a stop sign image and a chameleon image = 2 images\n\n4. **Figure 4**: \"Sample Chameleon outputs\" - Shows bananas image and banana bread image = 2 images\n\n5. **Figure 7**: \"Example alignment data for different categories\" - This shows multiple examples including:\n   - Poem (no image)\n   - Renaissance Square (1 image)\n   - Code example (no image)\n   - Gyumri text (no image)\n   - Flower arrangement (1 image)\n   = 2 images\n\n6. **Figure 8**: \"Task categories and examples of prompts\" - Shows 4 images in the examples (pastry, pony, construction site, and one more)\n\nWait, let me recount Figure 8 more carefully. Looking at the attributions at the bottom: \"Seguin (2010); Agriflanders (2009); Tuszy\u0144ski (2015); Sokolov (2022)\" - this indicates 4 images.\n\nTotal count:\n- Figure 1: 2 images\n- Figure 2: 3 images  \n- Figure 3: 2 images\n- Figure 4: 2 images\n- Figure 7: 2 images\n- Figure 8: 4 images\n\nTotal = 2 + 3 + 2 + 2 + 2 + 4 = 15 images\n\nAnswer: 15", "citations": [], "input_tokens": 26083, "output_tokens": 489, "cost_micros": 85584, "latency_ms": 11483, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I need to count the pictures used in the chat example figures, excluding the Appendix.\n\nLet me go through the figures that show chat examples:\n\n**Figure 2** (page 3): \"Sample interleaved image and text generation from Chameleon\"\n- Shows a conversation about quirky birds\n- Contains 3 images: Keel-Billed Toucan, Puffin, and Golden Pheasant\n- Count: 3 images\n\n**Figure 3** (page 4): \"Sample generation from Chameleon\"\n- Shows a conversation about a stop sign and chameleon camouflage\n- Contains 2 images: stop sign in wooded area, and a reddish-brown chameleon on tree branch\n- Count: 2 images\n\n**Figure 4** (page 5): \"Sample Chameleon outputs\"\n- Shows a conversation about baking with bananas\n- Contains 2 images: bowl of sliced bananas, and a loaf of banana bread\n- Count: 2 images\n\nThe Appendix section starts at \"Appendix\" on page 23. The example in Section A (page 24) with the Mountain Cur dog and the pothos plant are in the Appendix, so I should not count those.\n\nTotal pictures in chat example figures (excluding Appendix): 3 + 2 + 2 = 7\n\nAnswer: 7", "citations": [], "input_tokens": 32134, "output_tokens": 325, "cost_micros": 101277, "latency_ms": 8719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the document to find chat example figures and count the pictures in them, excluding any in the Appendix.Let me search for figures more specifically:Now let me look for chat examples and appendix sections. Let me read through the document systematically:Let me read the figures mentioned (Figures 2-4) which appear to be the chat example figures, and check where the Appendix starts:Now let me check Figure 4:Let me go back and read Figure 2 more carefully to count the images:Now let me look at the beginning of Figure 2 to see all images:I see there's a mislabeled image at the beginning. Let me count more carefully. Looking at Figure 2, I can see references to:\n- Im3.png (Toucan)\n- Im4.png (Puffin)  \n- Im5.png (Golden Pheasant mentioned with <img> marker)\n\nLet me check Figure 3 now:Let me look for where Figure 3 starts:I need to see if there's another image in Figure 3. Let me check after line 747:Good! So I see Im7.png is a chameleon image. Let me check if this is part of Figure 3. Looking at the page break, Im7.png appears after the section break at line 756 where \"2.1 Tokenization\" starts. Let me verify where Figure 3 ends and Figure 4 starts:Perfect! Now I can see Figure 4 ends at line 922. Let me look for more images in Figure 4. The prompt at line 822 mentions an <img> tag. Let me check if there's an image for that:Now let me verify where the Appendix starts to ensure I don't count figures from there:Good! The Appendix starts at line 3372. Now let me systematically count the images in Figures 2, 3, and 4.\n\nLet me go back and look at the full scope of each figure more carefully:\n\n**Figure 2** (lines 471-620):\n- Starts with prompt at line 471\n- Im3.png - Toucan (line 543)\n- Im4.png - Puffin (line 579)\n- Im5.png - Golden Pheasant (line 641)\n- Figure 2 caption at line 619\n\nWait, Im5.png at line 641 seems to be after the Figure 2 caption. Let me re-check:I see! There's a STOP sign figure at line 633-637, and then Im5.png (the Golden Pheasant) at line 641. The STOP sign appears to be the beginning of Figure 3's prompt. Let me trace this more carefully:So the STOP sign at line 633-637 with Im5.png description is actually part of Figure 3's input (the <img> in the prompt at line 675 refers to it). Let me reorganize:\n\n**Figure 2** (chat example about birds):\n- Prompt starts at line 471\n- Im3.png - Toucan (line 543)\n- Im4.png - Puffin (line 579)\n- Golden Pheasant text mentions <img> at line 614, but the actual image appears to be missing from Figure 2 content\n- Figure 2 caption at line 619-620\n\nWait, I need to look more carefully at the structure. The embedded images seem to be labeled differently. Let me look for all embedded images in the main body before the Appendix:Now let me determine which images belong to Figures 2, 3, and 4 by checking the line ranges:\n\n- Figure 2 caption: line 619\n- Figure 3 caption: line 701  \n- Figure 4 caption: line 922\n- Appendix starts: line 3372\n\nImages before Appendix:\n- Image4.jpg (line 407) - before Figure 2\n- Image5.jpg (line 480) - This is the mislabeled banana bread image, but checking context it's BEFORE the birds prompt starts at line 471\n- Im3.png (line 543) - in Figure 2 range\n- Im4.png (line 579) - in Figure 2 range\n- Im5.png (line 641) - between Figure 2 (ends 620) and Figure 3 (ends 701)\n- Im6.png (line 711) - between Figure 3 (ends 701) and section 2.1 starts\n- Im7.png (line 777) - after section 2.1 starts (line 756)\n- Im8.png (line 876) - in Figure 4 range (Figure 4 caption at 922)\n- Im9.png (line 991) - after Figure 4 ends\n\nLet me read around line 480 to verify Image5.jpg:This Image5.jpg (banana bread) is clearly mislabeled/misplaced content. The prompt at line 471 asks for birds, but Image5.jpg shows banana bread. This seems to be a document processing error. However, it appears WITHIN the Figure 2 example (after the prompt, before the response text starts at line 518).\n\nLet me check where Figure 2 actually starts and what the structure is:So Figure 2 starts at line 471. Let me now carefully count all images that appear in the three chat example figures (2, 3, 4) before the Appendix:\n\nLooking at my grep results and the line ranges:\n\n**Figure 2** (lines 471-620):\n- Image5.jpg (line 480) - appears after the prompt but seems misplaced\n- Im3.png (line 543) - Toucan\n- Im4.png (line 579) - Puffin\n\nHowever, Image5.jpg shows banana bread which doesn't match the birds prompt. Let me check if there's a third bird image I'm missing. The text mentions three birds (Toucan, Puffin, Golden Pheasant) and has <img> markers. Let me look for where the third bird image might be:\n\nLooking at my earlier reading, Im5.png (line 641) shows a bird (Golden Pheasant based on the description at lines 649-668). The caption for Figure 2 is at line 619-620, so Im5.png at line 641 is AFTER the Figure 2 caption.\n\nLet me check if Im5.png is actually part of Figure 3:I see! Im5.png (the Golden Pheasant/colorful bird) appears at line 641, but the prompt for Figure 3 starts at line 675 which references \"<img>\" - and that <img> is referring to the STOP sign that appeared earlier. So Im5.png is actually the third bird from Figure 2, even though it appears after the Figure 2 caption.\n\nLooking back at the structure more carefully, the figure captions seem to come before all the images are shown. Let me look at where the STOP sign figure starts:\n\nFrom line 633-637, there's a figure with \"STOP\" as OCR text. This must be the input image for Figure 3.\n\nLet me now systematically list the images:\n\n**Before Figure 2 caption (line 619):**\n- Image5.jpg (line 480) - banana bread (misplaced/error)\n- Im3.png (line 543) - Toucan  \n- Im4.png (line 579) - Puffin\n\n**After Figure 2 caption but before Figure 3 caption (line 701):**\n- Im5.png (line 641) - Golden Pheasant (part of Figure 2 content)\n\n**Around Figure 3:**\n- STOP sign figure (lines 633-637, but before Im5.png structurally in the document)\n- Im6.png (line 711) - STOP sign (after Figure 3 caption)\n\nWait, I need to look at this more systematically. Let me check what the STOP sign figure at line 633 is:The figure at 633-637 has OCR text \"STOP\" but the embedded image Im5.png (line 641) has \"NO_CONTENT_HERE\" as OCR and shows a bird. This is confusing. Let me look at Im6.png which should be the STOP sign:Yes! Im6.png is the STOP sign. So the document structure has the figure tags and embedded images separated. The figure at line 633 with \"STOP\" corresponds to Im6.png (the actual embedded image file) at line 711.\n\nLet me now map this out properly by looking at the figure captions and the embedded images:\n\n**Figure 2** (caption at line 619): Chat about birds\n**Figure 3** (caption at line 701): Chat about chameleon camouflage  \n**Figure 4** (caption at line 922): Chat about baking with bananas\n\nNow let me find which embedded images belong to each:So Im6.png appears RIGHT AFTER the Figure 3 caption. Now let me check Figure 4:So Im8.png (bananas) appears before the Figure 4 caption at line 922, and there's an <img> marker at line 920 suggesting another image.\n\nNow let me look at Im7.png and Im9.png to see which figure they belong to:\n\nIm7.png is at line 777. Let me check the context:Im7.png (chameleon) is at line 777, which is AFTER section 2.1 starts at line 756 and after Figure 3 caption at line 701. But looking at the context, Im7.png shows a chameleon which matches Figure 3's theme. Let me check if this is part of the Figure 3 example by looking backwards:\n\nThe Figure 3 response at line 747 has an <img> marker, and the text before talks about generating a chameleon image. So Im7.png is part of Figure 3's output!\n\nNow let me check Im9.png:Im9.png shows a loaf of baked good (banana bread final result). This is at line 991, which is after Figure 4 caption at line 922. The <img> marker at line 920 in Figure 4 refers to this final dish image. So Im9.png is part of Figure 4.\n\nNow let me go back and check Im5.png more carefully. At line 641, the bird image appears after Figure 2 caption but before Figure 3 content. Looking at Figure 2's text, the Golden Pheasant is mentioned with <img> at line 614. So Im5.png (Golden Pheasant) is part of Figure 2.\n\nLet me now do a final count:\n\n**Figure 2** (Chat about birds):\n- Im3.png - Toucan\n- Im4.png - Puffin  \n- Im5.png - Golden Pheasant\nTotal: 3 images\n\n**Figure 3** (Chat about chameleon camouflage):\n- Im6.png - STOP sign (input)\n- Im7.png - Chameleon (generated output)\nTotal: 2 images\n\n**Figure 4** (Chat about baking with bananas):\n- Im8.png - Bananas (input)\n- Im9.png - Banana bread (generated output)\nTotal: 2 images\n\n**Total images in chat example figures (Figures 2-4), excluding Appendix: 3 + 2 + 2 = 7**\n\nWait, but I should double-check if Image5.jpg (the banana bread at line 480) is actually meant to be part of Figure 2 or if it's a document error. Let me look at the context again:Image5.jpg (banana bread) appears right after the birds prompt but before the response starts. This appears to be a document processing error or misplacement - it doesn't match the context at all. The response starting at line 518 talks about birds, not banana bread.\n\nGiven that this is clearly an error in the document, I should not count Image5.jpg as part of the chat example figures. The actual chat examples are:\n\n**Figure 2**: 3 bird images (Im3.png, Im4.png, Im5.png)\n**Figure 3**: 2 images (Im6.png STOP sign, Im7.png chameleon)  \n**Figure 4**: 2 images (Im8.png bananas, Im9.png banana bread)\n\n**Total: 7 images**\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 178546, "error": null, "extra": {"thread_id": 2056, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d55f4fd667ac403b901ea3294619b198", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1600, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 17687, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2405.09818v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find subplots that show obvious loss spikes in the document.\n\nLet me examine the figures that show training loss curves:\n\n1. **Figure 5**: Contains 3 subplots:\n   - (a) Shows output norm with clear divergence/spike\n   - (b) Shows training loss comparison with and without QK-Norm - one curve diverges (spikes)\n   - (c) Shows training loss with and without dropout - appears stable\n\n2. **Figure 6**: Contains 3 subplots:\n   - (a) Shows training loss curves for 7B and 34B models - appears stable without obvious spikes\n   - (b) Shows training loss curve with image generation disabled - appears stable without obvious spikes\n   - (c) Shows training loss with and without norm reordering - shows clear divergence/spike in one curve\n\nLet me count the subplots with obvious loss spikes:\n- Figure 5(a): Shows norm growth leading to divergence (1)\n- Figure 5(b): Shows loss divergence/spike (1)\n- Figure 6(c): Shows loss divergence/spike (1)\n\nTotal: 3 subplots show obvious loss spikes or divergences.\n\nAnswer: 3", "citations": [], "input_tokens": 25071, "output_tokens": 284, "cost_micros": 79473, "latency_ms": 8016, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find figures that show loss curves with obvious spikes.\n\nLooking through the document:\n\n**Figure 5** (page 6-7):\n- (a) Shows \"Output norm\" vs Step - not a loss curve with spikes\n- (b) Shows \"Training Loss\" with and without QK-Norm - shows relatively smooth curves, one diverges but no obvious spikes\n- (c) Shows \"Training Loss\" with and without dropout - shows smooth curves\n\n**Figure 6** (page 7):\n- (a) Shows \"Training Loss\" for 7B and 34B models over 600k steps - smooth curves, no obvious spikes\n- (b) Shows \"Training Loss\" for 7B without image generation - smooth curve, no obvious spikes\n- (c) Shows \"Training Loss\" with and without norm reordering - one curve shows a clear spike/divergence around 8k-10k steps\n\nFrom Figure 6c, I can see one subplot that clearly shows an obvious loss spike (the curve without norm reordering that spikes dramatically).\n\nLooking more carefully at Figure 5b, the curve \"w/o QK-norm\" shows a sharp spike/divergence near the end.\n\nSo counting subplots with obvious loss spikes:\n- Figure 5b: 1 subplot (shows divergence/spike)\n- Figure 6c: 1 subplot (shows clear spike)\n\nAnswer: 2", "citations": [], "input_tokens": 34440, "output_tokens": 324, "cost_micros": 108180, "latency_ms": 9828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find figures in the document that show training loss curves with obvious loss spikes.\n\nLooking through the document:\n\n**Figure 5**: \"Output norm and training loss curves for Chameleon models under various settings\"\n- (a) Shows uncontrolled growth of output norms\n- (b) Shows \"An ablation with Chameleon-7B with and without QK-Norm\" - this shows training loss with a divergence/spike\n- (c) Shows \"An ablation with Chameleon-7B with and without dropout\" - this shows training loss with divergences/spikes\n\n**Figure 6**: \"Training loss curves for Chameleon models under various settings\"\n- (a) Shows \"Training loss curve with image generation disabled does not suffer from instability issues\" - smooth curves, no spikes\n- (b) Shows \"For Chameleon-34B, using dropout does not fix divergences, both with and without reordering\" - this shows loss spikes/divergences\n\nCounting subplots with obvious loss spikes:\n- Figure 5(b): 1 subplot with loss spike\n- Figure 5(c): 1 subplot with loss spike  \n- Figure 6(b): 1 subplot with loss spikes\n\nTotal: 3 subplots\n\nAnswer: 3", "citations": [], "input_tokens": 26080, "output_tokens": 301, "cost_micros": 82755, "latency_ms": 9171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I need to find figures that show loss spikes in the document.\n\nLooking through the document for figures showing training loss or related metrics:\n\n**Figure 5: \"Output norm and training loss curves for Chameleon models under various settings\"**\nThis figure has three subplots:\n- (a) Shows output norms with curves that spike/diverge\n- (b) Shows training loss with and without QK-Norm - one curve diverges/spikes\n- (c) Shows training loss with and without dropout - one curve appears to spike\n\n**Figure 6: \"Training loss curves for Chameleon models under various settings\"**\nThis figure has three subplots:\n- (a) Shows smooth training curves for 7B and 34B models - no obvious spikes\n- (b) Shows training loss without image generation - smooth curve, no spikes\n- (c) Shows training with/without norm reordering where one diverges - this shows a spike\n\nSo counting subplots with obvious loss spikes:\n- Figure 5(a): 1 (shows divergence/spikes in output norms)\n- Figure 5(b): 1 (shows divergence in loss)\n- Figure 5(c): 1 (shows divergence in loss)\n- Figure 6(c): 1 (shows divergence)\n\nTotal: 4 subplots show obvious spikes/divergences\n\nAnswer: 4", "citations": [], "input_tokens": 32131, "output_tokens": 324, "cost_micros": 101253, "latency_ms": 9125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "4", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to find information about subplots related to obvious loss spikes.Let me search for content related to \"loss spikes\" in the document:Let me search for \"subplot\":Let me read through the document more systematically to look for figures and their descriptions:Now let me look at Figure 5 and Figure 6 which seem to show training loss curves. Let me read those sections:Now let me look more closely at Figure 6 which appears to have subplots. Let me search for any reference to \"loss spikies\" or similar terms:Let me look at Figure 5 more carefully, as it shows \"Output norm and training loss curves for Chameleon models under various settings\":I can see Figure 5 has three subplots (a, b, c). Now let me check the figure caption more carefully and look at the actual charts. The caption says \"Figure 5 Output norm and training loss curves for Chameleon models under various settings.\" Let me read the text more carefully to understand what \"obvious loss spikies\" might refer to. Let me search for variations:Let me look at the entire document more systematically. Let me check around Figure 5 and 6 descriptions:Let me look more carefully at the whole document. Maybe \"spikies\" is a colloquial term in the figure or caption. Let me check the full text around Figures 5 and 6:So Figure 5 has subplots (a), (b), and (c). Let me check Figure 6:So Figure 6 also has subplots (a), (b), and (c). Now let me look at the actual chart data more carefully. The question asks about \"obvious loss spikies\" which likely refers to spikes in the loss curves. Let me look at the loss curve data more closely. Let me search the document for any terms that might be misspelled or colloquial:Based on my reading of the document, I can see that:\n\n**Figure 5** has three subplots:\n- (a) Output norms plot showing growth over training steps\n- (b) Training loss with and without QK-Norm\n- (c) Training loss with and without dropout\n\n**Figure 6** has three subplots:\n- (a) Training curves for 600k steps for Chameleon-7B and Chameleon-34B\n- (b) Training loss curve with image generation disabled\n- (c) For Chameleon-34B, using dropout comparison with and without norm-reordering\n\nLooking at the descriptions and the actual chart data in the text, the question asks about \"obvious loss spikies\" (spikes). From examining the text descriptions:\n\n- Figure 5(a) shows \"Uncontrolled growth of output norms\" - this is about norms, not loss spikes\n- Figure 5(b) shows training loss curves with the one \"w/o QK-norm\" showing a divergence\n- Figure 5(c) shows training loss curves with and without dropout - the \"w/o dropout\" curve appears to have issues\n- Figure 6(a) shows smooth training curves for both 7B and 34B models\n- Figure 6(b) shows a smooth training loss curve without image generation\n- Figure 6(c) shows curves where \"w/o norm reordering\" appears to diverge/spike\n\nLooking at the subplot captions that mention instability or divergences, which would correspond to loss spikes:\n- Figure 5(b): \"w/o QK-norm\" diverges\n- Figure 5(c): \"w/o dropout\" has spikes\n- Figure 6(c): \"w/o norm reordering\" has divergences/spikes\n\nThat would be 3 subplots showing obvious loss spikes.\n\nAnswer: 3", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 88405, "error": null, "extra": {"thread_id": 2057, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_d21eabfb0c2b44d69030354b180154e2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 635, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 15734, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2405.09818v1.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which shows a \"Summary of core architecture and optimization decisions made in Chameleon in contrast to LLaMa-1 and LLaMa-2\", I can see that:\n\n- LLaMa-1 7B has an \"\u00d7\" (cross mark) in the Qknorm column\n- LLaMa-1 33B has an \"\u00d7\" (cross mark) in the Qknorm column\n\nThis indicates that LLaMa-1 does NOT apply QKnorm.\n\nAnswer: no", "citations": [], "input_tokens": 25087, "output_tokens": 130, "cost_micros": 77211, "latency_ms": 5343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which compares core architecture and optimization decisions across models:\n\nFor LLaMa-1 models (both 7B and 33B), the Qknorm column shows \"\u00d7\" (which indicates no).\n\nAnswer: no", "citations": [], "input_tokens": 34456, "output_tokens": 63, "cost_micros": 104313, "latency_ms": 4297, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which summarizes the core architecture and optimization decisions:\n\nFor LLaMa-1 models (both 7B and 33B), the \"Qknorm\" column shows \"\u00d7\", which indicates that QK-Norm was not applied.\n\nAnswer: No", "citations": [], "input_tokens": 26096, "output_tokens": 71, "cost_micros": 79353, "latency_ms": 4531, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "Looking at Table 1 in the document, which shows a comparison of core architecture and optimization decisions:\n\nFor LLaMa-1 (both 7B and 33B models), the \"Qknorm\" column shows \"\u00d7\", which indicates that query-key normalization (QK-Norm) was **not** applied.\n\nAnswer: No", "citations": [], "input_tokens": 32147, "output_tokens": 81, "cost_micros": 97656, "latency_ms": 4671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the document to find information about whether llama-1 applies qknorm.Let me search for \"qknorm\" and \"llama\" in the document to find relevant information.Let me read the table area that contains information about qknorm and llama-1:Perfect! I found the table that shows whether llama-1 applies qknorm. In the table, for both LLaMa-1 models (7B and 33B), the Qknorm column shows \"\u00d7 \u2612\" which indicates \"no\".\n\nAnswer: no", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 30905, "error": null, "extra": {"thread_id": 2058, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2c3221de2294418da7d44837e6c67391", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 146, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "no", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "native_pdf", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "azure_basic_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b in the document, which shows \"An ablation with Chameleon-7B with and without QK-Norm\", I can see the training loss curves.\n\nThe curve labeled \"w/o QK-norm\" (without QK-norm) shows a divergence/spike in the loss after approximately 20% of training progress, which appears to occur around step 20,000-25,000 based on the x-axis scale shown in the figure.\n\nAnswer: Around step 20,000-25,000", "citations": [], "input_tokens": 25078, "output_tokens": 122, "cost_micros": 77064, "latency_ms": 6000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "around step 20 000 25 000", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "azure_premium_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b, which shows \"An ablation with Chameleon-7B with and without QK-Norm,\" I can see the training loss curves for both conditions.\n\nThe curve labeled \"w/o QK-norm\" (without QK-Norm) shows the training progressing normally until it experiences a sudden spike and divergence.\n\nAnswer: Around 125k steps", "citations": [], "input_tokens": 34447, "output_tokens": 89, "cost_micros": 104676, "latency_ms": 5452, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "around 125k steps", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "llamacloud_basic_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b in the document, which shows \"An ablation with Chameleon-7B with and without QK-Norm,\" I can see the training loss curves comparing the two conditions.\n\nThe figure shows that the model without QK-Norm diverges (loss spikes) after approximately 20% of training progress.\n\nAnswer: After approximately 20% of training", "citations": [], "input_tokens": 26087, "output_tokens": 87, "cost_micros": 79566, "latency_ms": 5046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "after approximately 20 of training", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "llamacloud_premium_lc", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "Looking at Figure 5b, which shows \"An ablation with Chameleon-7B with and without QK-Norm,\" I can see the training loss curves for models with and without QK-norm.\n\nThe graph shows that the model without QK-norm or dropout (red line) diverges after approximately 20k-30k steps, where the loss spikes dramatically upward.\n\nAnswer: Around 20k-30k steps", "citations": [], "input_tokens": 32138, "output_tokens": 100, "cost_micros": 97914, "latency_ms": 5485, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "around 20k 30k steps", "normalised_gold": "150k"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "arm": "surfsense_agentic", "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find information about loss spikes when without QK-norm.Let me search for \"QK-norm\" in the document to find the relevant information.Now let me read the specific sections that mention loss spikes and QK-norm. Let me look at the chart/figure around lines 1028-1076 which seems to show training loss curves.I can see a figure showing training loss with and without QK-norm. The x-axis shows steps from 0k to 175k (approximately). Let me look more carefully at the surrounding text to understand where the spike occurs. Let me check the text before and after this figure.Let me read more to find the caption or description of this figure:Good! Now I can see that figure (b) shows training loss with and without QK-Norm. Let me look back at the chart more carefully to understand where the spike happens. Let me check the text around lines 938-1000 to see if there's more description:Now I need to look at the actual description of figure 5b more carefully. Let me read around the text that discusses the training loss spike:Perfect! I found the answer. The text clearly states:\n\n\"In Figure 5b, we show training loss curves for Chameleon-7B with and without QK-Norm, and the latter diverges after approximately 20% of a training epoch.\"\n\nLet me look at the figure again to verify the step count. Looking at the figure data from earlier, the x-axis goes from 0k to approximately 175k steps. The text mentions \"after approximately 20% of a training epoch\" for the divergence/spike.\n\nHowever, let me look more carefully at what specific step number might correspond to this. Let me check if there's more specific information:", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 73108, "error": null, "extra": {"thread_id": 2059, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_c22c13a6350947eda6562103bdc5468c", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 343, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "however let me look more carefully at what specific step number might correspond to this let me check if there s more specific information", "normalised_gold": "150k"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "I need to find who verified and signed the complaint by looking at the verification section of the document.\n\nLooking at page 16 of the document, I can see the \"VERIFICATION\" section which states:\n\n\"I, Marc Silski, am a Special Agent of the Federal Bureau of Investigation (\"FBI\"). I have read the foregoing Complaint for Forfeiture, and declare under penalty of perjury of the laws of the United States of America that the facts contained therein are true to the best of my knowledge and belief, based upon knowledge possessed by me and/or on information received from other law enforcement agents.\"\n\nThis is followed by a signature and \"Special Agent Marc Silski, Federal Bureau of Investigation\" with the date \"September 6, 2018\".\n\nAnswer: Special Agent Marc Silski", "citations": [], "input_tokens": 25011, "output_tokens": 176, "cost_micros": 77673, "latency_ms": 14359, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "special agent marc silski", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the verification section at the end of the document (PageID.16 Page 16 of 16), I can see who verified and signed the complaint.\n\nAnswer: Special Agent Marc Silski, Federal Bureau of Investigation", "citations": [], "input_tokens": 5579, "output_tokens": 52, "cost_micros": 17517, "latency_ms": 3594, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "special agent marc silski federal bureau of investigation", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "According to the verification section at the end of the document, the officer who verified and signed the complaint was:\n\nAnswer: Special Agent Marc Silski", "citations": [], "input_tokens": 6098, "output_tokens": 34, "cost_micros": 18804, "latency_ms": 3062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "special agent marc silski", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the verification section at the end of the document (page 16):\n\nThe verification states:\n\"I, Marc Silski, am a Special Agent of the Federal Bureau of Investigation (\"FBI\"). I have read the foregoing Complaint for Forfeiture, and declare under penalty of perjury...\"\n\nIt is signed by:\n\"Special Agent Marc Silski\nFederal Bureau of Investigation\nDated: September 6, 2018\"\n\nAnswer: Special Agent Marc Silski", "citations": [], "input_tokens": 5228, "output_tokens": 109, "cost_micros": 17319, "latency_ms": 3780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "special agent marc silski", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the verification section at the end of the document (page 16), I can see who verified and signed the complaint.\n\nAnswer: Special Agent Marc Silski, Federal Bureau of Investigation", "citations": [], "input_tokens": 5778, "output_tokens": 43, "cost_micros": 17979, "latency_ms": 2312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "special agent marc silski federal bureau of investigation", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "Special Agent Marc Silski", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the document to find which officer verified and signed the complaint.Based on the document, I found the verification section on page 16. The officer who verified and signed the complaint is:\n\nAnswer: Special Agent Marc Silski, Federal Bureau of Investigation", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 23546, "error": null, "extra": {"thread_id": 2060, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_890676fb4dc0465f94bad1a9648bd51a", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 82, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 0.6666666666666666, "method": "str_norm", "normalised_pred": "special agent marc silski federal bureau of investigation", "normalised_gold": "special agent marc silski"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "I need to find all the checks issued to Mont Blanc company mentioned in the document.\n\nLooking through the document:\n\nOn page 11 (PageID.11), paragraph e states:\n\"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nOn page 11 (PageID.11), paragraph f states:\n\"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nThese are the two checks issued to Mont Blanc company.\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 25031, "output_tokens": 170, "cost_micros": 77643, "latency_ms": 10219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about checks issued to Mont Blanc:\n\nFrom paragraph 32(e): \"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nFrom paragraph 32(f): \"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 5599, "output_tokens": 129, "cost_micros": 18732, "latency_ms": 3389, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for checks issued to Mont Blanc:\n\nFrom paragraph 32, subsections e and f:\n\n1. Check number 150301 issued in September 2012: $35,000\n2. Check number 151545 issued in February 2013: $40,684\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 6118, "output_tokens": 79, "cost_micros": 19539, "latency_ms": 3640, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document for checks issued to Mont Blanc company:\n\nFrom paragraph 32.e: \"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nFrom paragraph 32.f: \"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 5248, "output_tokens": 128, "cost_micros": 17664, "latency_ms": 3969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for checks issued to Mont Blanc:\n\nFrom paragraph 32(e): \"Bank records obtained in this investigation show that in September 2012, the NTC issued a check, number 150301, to a company known as Mont Blanc in the amount of $35,000.\"\n\nFrom paragraph 32(f): \"Subsequently, in February 2013, a second check, number 151545, in the amount of $40,684 was issued by the NTC payable to Mont Blanc.\"\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 5798, "output_tokens": 127, "cost_micros": 19299, "latency_ms": 3592, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['$35,000', '$40,684']", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find information about checks issued to Mont Blanc company.Let me continue reading the document to find information about checks issued to Mont Blanc company.Let me continue reading to find information about checks issued to Mont Blanc.Perfect! I found the information about checks issued to Mont Blanc. Let me verify by reading a bit more to ensure I have all the check amounts.\n\nBased on the document, I found the following information about checks issued to Mont Blanc:\n\n1. Check number 150301 - $35,000 (issued in September 2012)\n2. Check number 151545 - $40,684 (issued in February 2013)\n\nAnswer: $35,000, $40,684", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32530, "error": null, "extra": {"thread_id": 2061, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5f1b7e2037a34d839cda54ca371fcea3", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 134, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "000, 35, 40, 684", "normalised_gold": "000, 35, 40, 684"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the CA Bar number in the document.\n\nLooking at page 15, I can see the signature page where it states:\n\nADRIANA DYDELL\nAssistant United States Attorney\n211 W. Fort Street, Suite 2001\nDetroit, Michigan 48226\n(313) 226-9125\nadriana.dydell@usdoj.gov\n(CA. Bar No. 239516)\n\nAnswer: 239516", "citations": [], "input_tokens": 25014, "output_tokens": 103, "cost_micros": 76587, "latency_ms": 10625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, specifically at page 15 (PageID.15), I can find the CA Bar number listed for Assistant United States Attorney Adriana Dydell.\n\nAnswer: 239516", "citations": [], "input_tokens": 5582, "output_tokens": 47, "cost_micros": 17451, "latency_ms": 3062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for the CA Bar number...\n\nAt the bottom of page 15, in the signature block for the Assistant United States Attorney, I can see:\n\nADRIANA DYDELL\nAssistant United States Attorney\n211 W. Fort Street, Suite 2001\nDetroit, Michigan 48226\n(313) 226-9125\nadriana.dydell@usdoj.gov\n(CA. Bar No. 239516)\n\nAnswer: 239516", "citations": [], "input_tokens": 6101, "output_tokens": 108, "cost_micros": 19923, "latency_ms": 4437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking through the document for the CA Bar number...\n\nAt the bottom of page 15, the signature block shows:\n\nADRIANA DYDELL\nAssistant United States Attorney\n211 W. Fort Street, Suite 2001\nDetroit, Michigan 48226\n(313) 226-9125\nDated: September, 2018\nadriana.dydell@usdoj.gov\n(CA. Bar No. 239516)\n\nAnswer: 239516", "citations": [], "input_tokens": 5231, "output_tokens": 107, "cost_micros": 17298, "latency_ms": 4078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I can find the CA Bar number at the bottom of page 15, where it states:\n\n\"Dated: September 13, 2018 (CA. Bar No. 239516)\"\n\nThis appears next to the signature block for Adriana Dydell, Assistant United States Attorney.\n\nAnswer: 239516", "citations": [], "input_tokens": 5781, "output_tokens": 78, "cost_micros": 18513, "latency_ms": 3358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "str", "gold": "239516", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the document that was mentioned to find the CA Bar number.Answer: 239516", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 35797, "error": null, "extra": {"thread_id": 2062, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_67c2db9144b348b6ba61d0ad2f21abaf", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 115, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "239516", "normalised_gold": "239516"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I'll examine each page to identify where information has been blacked out (redacted).\n\nPage 1: No redactions visible\nPage 2: No redactions visible\nPage 3: No redactions visible\nPage 4: No redactions visible\nPage 5: No redactions visible\nPage 6: No redactions visible\nPage 7: No redactions visible\nPage 8: No redactions visible\nPage 9: No redactions visible\nPage 10: No redactions visible\nPage 11: No redactions visible\nPage 12: No redactions visible\nPage 13: Contains multiple black redaction boxes blocking out names and information (in paragraph l and m)\nPage 14: Contains black redaction boxes blocking out information (in paragraph n)\nPage 15: No redactions visible\nPage 16: No redactions visible\n\nAnswer: 13, 14", "citations": [], "input_tokens": 25034, "output_tokens": 196, "cost_micros": 78042, "latency_ms": 11015, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I need to carefully examine the document to identify which pages have information that has been blacked out or redacted.\n\nLooking through the document:\n\n- Page 1 (PageID.1): No visible redactions\n- Page 2 (PageID.2): No visible redactions\n- Page 3 (PageID.3): No visible redactions\n- Page 4 (PageID.4): No visible redactions\n- Page 5 (PageID.5): No visible redactions\n- Page 6 (PageID.6): No visible redactions\n- Page 7 (PageID.7): No visible redactions\n- Page 8 (PageID.8): No visible redactions\n- Page 9 (PageID.9): No visible redactions\n- Page 10 (PageID.10): No visible redactions\n- Page 11 (PageID.11): No visible redactions\n- Page 12 (PageID.12): No visible redactions\n- Page 13 (PageID.13): In paragraph 1, there is redacted text after \"the last names of Iacobelli and\" and in paragraph m, there are multiple redactions of names\n- Page 14 (PageID.14): In paragraph n, there are redacted names at the beginning and within the text\n- Page 15 (PageID.15): No visible redactions\n- Page 16 (PageID.16): No visible redactions\n\nAnswer: 13, 14", "citations": [], "input_tokens": 5602, "output_tokens": 339, "cost_micros": 21891, "latency_ms": 5812, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "Looking through the document for redacted/blacked out information:\n\nOn page 14 (PageID.14), paragraph m, there are multiple instances of redacted information indicated by blank spaces where names should be:\n- \"was interviewed by agents.\"\n- \"was a former co-worker\"\n- \"During the interview,\"\n- \"informed agents that Iacobelli bragged to her about a Mont Blanc pen that he carried in his shirt pocket costing $25,000. Iacobelli also informed that\"\n\nOn page 14 (PageID.14), paragraph n, there is also redacted information:\n- \"On July 31, 2017, agents contacted who stated that\"\n- \"he was in possession of the Lincoln commemorative Mont Blanc pen. added that he ordered the pen, but that Iacobelli paid for both and\"\n\nPage 13 (PageID.13), paragraph l mentions \"having the last names of Iacobelli and\" followed by what appears to be a redaction.\n\nAnswer: 13, 14", "citations": [], "input_tokens": 6121, "output_tokens": 238, "cost_micros": 21933, "latency_ms": 6703, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I need to carefully review the document to identify which pages contain redacted or blacked out information.\n\nLooking through the document:\n\n- Page 2: No redactions visible\n- Page 3: No redactions visible\n- Page 4: No redactions visible\n- Page 5: No redactions visible\n- Page 6: No redactions visible\n- Page 7: No redactions visible\n- Page 8: No redactions visible\n- Page 9: No redactions visible\n- Page 10: No redactions visible\n- Page 11: No redactions visible\n- Page 12: No redactions visible\n- Page 13: Contains redacted/blacked out information (names are replaced with blank spaces/lines in sections l and m)\n- Page 14: Contains redacted/blacked out information (names are replaced with blank spaces/lines in section n)\n- Page 15: No redactions visible\n- Page 16: No redactions visible\n\nThe pages with blacked out information are pages 13 and 14, where personal names have been redacted.\n\nAnswer: 13, 14", "citations": [], "input_tokens": 5251, "output_tokens": 248, "cost_micros": 19473, "latency_ms": 5733, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I need to carefully examine the document to identify which pages contain redacted or blacked out information.\n\nLooking through the document:\n\n- Pages 1-12: No redactions visible\n- Page 13 (PageID.13): Contains multiple redactions marked with \"\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\" or similar black bars in paragraph m and n. Specifically:\n  - \"former co-worker \u2588\u2588\u2588\u2588\u2588\u2588\" \n  - Multiple instances of \"\u2588\u2588\u2588\u2588\u2588\u2588\" referring to a person's name\n  - \"podiatrist named \u2588\u2588\u2588\u2588\u2588\u2588\"\n  - \"[\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588]\" in paragraph n\n  - \"[\u2588\u2588\u2588\u2588]\" multiple times in paragraph n\n\n- Page 14 (PageID.14): No redactions visible\n- Pages 15-16: No redactions visible\n\nThe redactions appear only on page 13 of the document (which corresponds to PageID.13, Page 13 of 16).\n\nAnswer: 13", "citations": [], "input_tokens": 5801, "output_tokens": 206, "cost_micros": 20493, "latency_ms": 6671, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6666666666666666, "method": "list_set", "normalised_pred": "13", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['13', '14']", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to identify pages with blacked out information.Based on my review of the document, I can identify pages with redacted (blacked out) information:\n\n- **Page 13**: Contains multiple black redaction bars in paragraphs l and m, obscuring names and identifying information. The visual description explicitly notes \"Multiple horizontal black bars throughout the document\" and \"Redacted sections appear in paragraphs l and m.\"\n\n- **Page 14**: Contains three black redaction blocks in paragraph n, obscuring names/identifying information. The visual description notes \"Contains redacted sections shown as solid black rectangular blocks\" and \"Multiple solid black rectangular blocks throughout paragraph n.\"\n\nAnswer: 13, 14", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37265, "error": null, "extra": {"thread_id": 2063, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5190eed28c9d49e68875e654ed7fc489", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 163, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "13, 14", "normalised_gold": "13, 14"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to find manually written numbers on pages 15 and 16 of this document.\n\nLooking at page 15 (PageID.15):\nI can see a handwritten date that appears to be \"September 12, 2018\" where \"12\" is written manually.\n\nLooking at page 16 (PageID.16):\nI can see a handwritten date that appears to be \"September 6, 2018\" where \"6\" is written manually.\n\nThe manually written numbers are:\n- Page 15: 12\n- Page 16: 6\n\nSum: 12 + 6 = 18\n\nAnswer: 18", "citations": [], "input_tokens": 25026, "output_tokens": 148, "cost_micros": 77298, "latency_ms": 8937, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "18", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to find manually written numbers on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\n- The date shows \"September 3 2018\" - there appears to be a handwritten \"3\" in the date field\n\nLooking at page 16 (PageID.16):\n- The date shows \"September 6 2018\" - there appears to be a handwritten \"6\" in the date field\n\nThe sum of the manually written numbers is: 3 + 6 = 9\n\nAnswer: 9", "citations": [], "input_tokens": 5594, "output_tokens": 130, "cost_micros": 18732, "latency_ms": 3875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "9", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to find manually written numbers on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\nThe date shows \"September 13 2018\" where \"13\" appears to be handwritten.\n\nLooking at page 16 (PageID.16):\nThe date shows \"September 6, 2018\" where \"6\" appears to be handwritten.\n\nSum: 13 + 6 = 19\n\nAnswer: 19", "citations": [], "input_tokens": 6113, "output_tokens": 112, "cost_micros": 20019, "latency_ms": 3219, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "19", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to examine pages 15 and 16 of the document to find manually written numbers.\n\nLooking at page 15:\n- The page header shows \"Case 2:18-cv-12857-DML-RSW ECF No. 1 filed 09/13/18 PageID.15 Page 15 of 16\"\n- The page contains typed text including \"Respectfully submitted,\" signature block for Matthew Schneider and Adriana Dydell\n- At the bottom it shows \"Dated: September, 2018\"\n- The number \"15\" appears at the bottom of the page\n\nLooking at page 16:\n- The page header shows \"Case 2:18-cv-12857-DML-RSW ECF No. 1 filed 09/13/18 PageID.16 Page 16 of 16\"\n- Contains a verification section signed by Marc Silski\n- At the bottom it shows \"Dated: September 6, 2018\"\n- The number \"16\" appears at the bottom of the page\n\nThe manually written/handwritten numbers visible on these pages are:\n- Page 15: The page number \"15\" at the bottom\n- Page 16: The page number \"16\" at the bottom\n\nSum: 15 + 16 = 31\n\nAnswer: 31", "citations": [], "input_tokens": 5243, "output_tokens": 307, "cost_micros": 20334, "latency_ms": 6765, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "31", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I need to identify manually written numbers on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\n- The date \"September 13, 2018\" appears to have \"13\" that could be handwritten, but examining the format, it appears to be typed text, not manually written.\n\nLooking at page 16 (PageID.16):\n- The date \"September 6, 2018\" appears, where \"6\" could potentially be handwritten.\n- There is a signature line with \"Special Agent Marc Silski\" and a signature.\n\nHowever, upon closer examination of the document format, these appear to be typed dates in the standard court document format. The document shows underlined spaces where information was filled in, but the actual numbers \"13\" on page 15 and \"6\" on page 16 appear in the standard typed format of the document.\n\nLooking more carefully, the date on page 15 shows \"September <u>13</u>, 2018\" where \"13\" appears to be typed/filled in on an underlined space, and on page 16 \"September 6, 2018\" where \"6\" appears typed.\n\nIf these are considered the manually written/filled-in numbers:\n- Page 15: 13\n- Page 16: 6\n- Sum: 13 + 6 = 19\n\nAnswer: 19", "citations": [], "input_tokens": 5793, "output_tokens": 316, "cost_micros": 22119, "latency_ms": 7969, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "19", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "19", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q005", "answer_letter": null, "raw_text": "I'll help you find the manually written numbers on pages 15 and 16 of the document. Let me first read the document that was mentioned in the context.Let me continue reading to find pages 15 and 16. I need to search through the document to locate these specific pages.", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 27906, "error": null, "extra": {"thread_id": 2064, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3dc2a747e29a4ced83b1f3d763f972af", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 79, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "15", "normalised_gold": "19"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to analyze the document to identify all the colors present in it.\n\nLooking at the document pages provided:\n- The text appears in black\n- The background is white\n- There are blue hyperlinks/case numbers at the top of each page\n- There are black redaction marks/blocks in some areas (pages 13-14)\n\nThe colors present in this document are:\n- Black (text and redactions)\n- Blue (case numbers and headers)\n- White (background)\n\nAnswer: black, blue, white", "citations": [], "input_tokens": 25038, "output_tokens": 116, "cost_micros": 76854, "latency_ms": 10546, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "black, blue, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify all colors mentioned in this legal document.\n\nScanning through the document for color references:\n\n1. Page 11, paragraph d: \"red 2013 Ferrari 458 Spider convertible\"\n2. Page 12, paragraph j: mentions \"gold\" (portrait in gold) and \"sapphire\" (which is blue)\n\nLet me verify these are the only color mentions:\n- \"red\" - explicitly mentioned for the Ferrari\n- \"gold\" - mentioned for Lincoln's portrait on the pen clip\n- The sapphire mentioned would typically be blue, but the document says \"sapphire\" not \"blue\"\n\nLooking more carefully at the text:\n- \"red 2013 Ferrari\" (paragraph 32d)\n- \"gold\" (paragraph 32j - \"Lincoln's portrait in gold\")\n- \"sapphire\" (paragraph 32j - \"has a sapphire\")\n\nThe colors explicitly stated are:\n- gold\n- red\n\nAlphabetically ordered:\n\nAnswer: gold, red", "citations": [], "input_tokens": 5606, "output_tokens": 228, "cost_micros": 20238, "latency_ms": 6780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "gold, red", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to examine the document to identify all colors mentioned or visible in it.\n\nLooking through the document:\n\n1. The document itself appears to be a black and white legal document (black text on white background)\n2. In paragraph 32(d), a \"red 2013 Ferrari 458 Spider convertible\" is mentioned\n3. In paragraph 32(j), \"gold\" is mentioned (describing Lincoln's portrait in gold on the pen clip)\n4. In paragraph 32(j), \"sapphire\" is mentioned (the pen's clip has a sapphire)\n\nColors explicitly mentioned in the text:\n- Gold\n- Red\n- Sapphire (which is blue)\n\nColors of the document format itself:\n- Black (text)\n- White (background)\n\nListing all colors alphabetically:\n- Black\n- Blue (sapphire)\n- Gold\n- Red\n- White\n\nAnswer: black, blue, gold, red, white", "citations": [], "input_tokens": 6125, "output_tokens": 214, "cost_micros": 21585, "latency_ms": 5405, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7499999999999999, "method": "list_set", "normalised_pred": "black, blue, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify all colors mentioned in this document.\n\nLet me search through the document for any color references:\n\n1. In paragraph 32.a, there is mention of \"a red 2013 Ferrari 458 Spider convertible\"\n2. In paragraph 32.j, there is mention of \"gold\" (describing Lincoln's portrait on the pen clip)\n3. The document is a legal court document that appears to be in black text on white paper (standard document format)\n\nThe colors explicitly mentioned in the text are:\n- Gold (mentioned in the description of the pen)\n- Red (mentioned in the description of the Ferrari)\n\nSince this is a standard black and white legal document with black text, and the question asks for colors in the document, I should include the standard document colors as well as the colors mentioned in the content.\n\nColors found:\n- Black (text)\n- Gold (pen description)\n- Red (Ferrari description)\n- White (paper/background)\n\nIn alphabetical order: black, gold, red, white\n\nAnswer: black, gold, red, white", "citations": [], "input_tokens": 5255, "output_tokens": 238, "cost_micros": 19335, "latency_ms": 7875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.5714285714285715, "method": "list_set", "normalised_pred": "black, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I need to identify all colors mentioned in the document.\n\nGoing through the document carefully:\n\n1. Page 10, paragraph d: \"red 2013 Ferrari 458 Spider convertible\" - mentions **red**\n2. Page 12, paragraph j: \"sapphire\" (referring to a sapphire on the pen clip) - **blue** (sapphire is blue)\n3. Page 12, paragraph j: \"gold\" (portrait in gold on the pen's clip) - mentions **gold**\n\nLooking at the document format itself, this appears to be a standard black text on white background legal document, so:\n- **black** (text)\n- **white** (background)\n\nAlphabetically ordered:\n- black\n- blue\n- gold\n- red\n- white\n\nAnswer: black, blue, gold, red, white", "citations": [], "input_tokens": 5805, "output_tokens": 189, "cost_micros": 20250, "latency_ms": 6407, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.7499999999999999, "method": "list_set", "normalised_pred": "black, blue, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "list", "gold": "['black', 'blue', 'white']", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q006", "answer_letter": null, "raw_text": "I'll read the document to identify all the colors present.Based on my careful reading of the document, I can identify the following colors mentioned or described:\n\nFrom the visual descriptions and content:\n- **Black** - mentioned multiple times in visual descriptions (\"black text\", \"black bars\", \"black rectangular blocks\" for redactions)\n- **Blue** - mentioned in the visual description (\"Blue text displaying case number\")\n- **Gold** - mentioned in the pen description (\"Lincoln's portrait in gold\")\n- **Red** - mentioned for the Ferrari (\"red 2013 Ferrari 458 Spider\")\n- **White** - mentioned in visual descriptions (\"white background\")\n\nThere's also a **sapphire** mentioned on the pen clip, which is typically blue, but since \"blue\" is already listed separately and sapphire is a gemstone rather than a color descriptor in this context, I'll include it as a separate item if we're being comprehensive about color-related terms.\n\nHowever, looking at the actual color words used as descriptors in the document:\n\nAnswer: black, blue, gold, red, white", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37108, "error": null, "extra": {"thread_id": 2065, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8c1d9833eaf14e8eae921556f7ecfecf", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 186, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.7499999999999999, "method": "list_set", "normalised_pred": "black, blue, gold, red, white", "normalised_gold": "black, blue, white"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "native_pdf", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\nI can see one signature above the typed name \"ADRIANA DYDELL, Assistant United States Attorney\"\n\nLooking at page 16 (PageID.16):\nI can see one signature above the typed name \"Special Agent Marc Silski, Federal Bureau of Investigation\"\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 25021, "output_tokens": 122, "cost_micros": 76893, "latency_ms": 8171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "azure_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\nLooking at page 15 (PageID.15):\nI can see one signature above the typed name \"ADRIANA DYDELL\" (Assistant United States Attorney).\n\nLooking at page 16 (PageID.16):\nI can see one signature above the typed name \"Special Agent Marc Silski\" (Federal Bureau of Investigation).\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 5589, "output_tokens": 124, "cost_micros": 18627, "latency_ms": 3781, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "azure_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\nLooking at page 15:\n- There is one signature above the typed name \"ADRIANA DYDELL\"\n\nLooking at page 16:\n- There is one signature above the typed name \"Special Agent Marc Silski\"\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 6108, "output_tokens": 100, "cost_micros": 19824, "latency_ms": 3750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "llamacloud_basic_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures that appear on pages 15 and 16 of the document.\n\nLooking at page 15:\n- This page shows the document dated September, 2018\n- It has the names \"MATTHEW SCHNEIDER\" and \"ADRIANA DYDELL\" typed, but these appear to be typed names as part of the document format, not actual signatures\n- I do not see any actual handwritten or image signatures on page 15\n\nLooking at page 16:\n- This is the VERIFICATION page\n- It shows \"Special Agent Marc Silski\" and \"Federal Bureau of Investigation\" with a date of \"September 6, 2018\"\n- This also appears to be typed text rather than an actual signature image\n\nThe question asks for signatures \"appeared\" on these pages. Looking carefully at both pages, I can see typed names but no visible handwritten signatures or signature images in the document.\n\nHowever, these are signature blocks where signatures would typically appear. The document shows:\n- Page 15: 1 signature block (for Adriana Dydell)\n- Page 16: 1 signature block (for Marc Silski)\n\nTotal signature blocks: 2\n\nSince the question asks for the count in \"float format\" but then requests \"a single integer only,\" and given that there are 2 signature blocks present on these pages combined:\n\nAnswer: 2", "citations": [], "input_tokens": 5238, "output_tokens": 308, "cost_micros": 20334, "latency_ms": 8796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "llamacloud_premium_lc", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I need to count the signatures on pages 15 and 16 of the document.\n\n**Page 15 (PageID.15):**\nLooking at page 15, I can see:\n1. A signature above \"ADRIANA DYDELL\" - there is an underlined signature line with \"Adriana Dydell\" written above it\n\nSo there is 1 signature on page 15.\n\n**Page 16 (PageID.16):**\nLooking at page 16, I can see:\n1. A signature notation \"[Signature of Marc Silski]\" above \"Special Agent Marc Silski\"\n\nSo there is 1 signature on page 16.\n\n**Total signatures on pages 15 and 16:** 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 5788, "output_tokens": 185, "cost_micros": 20139, "latency_ms": 5296, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "doc_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf", "answer_format": "int", "gold": "2", "pages": 16, "document_id": 7599, "arm": "surfsense_agentic", "question_id": "3276a5b991c49cf5f9a4af0f7d6fce67.pdf::Q007", "answer_letter": null, "raw_text": "I'll read the document to find the signatures on pages 15 and 16.Let me search for content related to pages 15 and 16 by reading more of the document:Let me read the entire document to find pages 15 and 16:Great! I found the relevant sections. Let me look more carefully at the content around pages 15 and 16. I can see:\n\n- Line 616: `<!-- PageNumber=\"15\" -->`\n- Line 637: `<!-- PageNumber=\"16\" -->`\n\nNow let me examine the signatures on these pages. From what I can see:\n\n**Page 15** (lines 590-618): Shows signatures for:\n1. Matthew Schneider (United States Attorney) \n2. Adriana Dydell (Assistant United States Attorney)\n\n**Page 16** (lines 619-638): Shows signature for:\n1. Marc Silski (Special Agent, Federal Bureau of Investigation)\n\nLet me read the section more carefully to confirm:Perfect! Now I can clearly see the signatures on both pages:\n\n**Page 15** (lines 596-616):\n- Shows \"Respectfully submitted,\" followed by:\n  1. MATTHEW SCHNEIDER (United States Attorney)\n  2. \"Adriana@iol\" (appears to be a signature)\n  3. ADRIANA DYDELL (Assistant United States Attorney)\n- The \"Adriana@iol\" on line 603 appears to be a signature representation\n- Dated: September 13, 2018\n\n**Page 16** (lines 619-637):\n- Shows VERIFICATION section with:\n  1. \"Josh,\" on line 631 (appears to be a signature)\n  2. Special Agent Marc Silski (Federal Bureau of Investigation)\n- Dated: September 6, 2018\n\nCounting the signatures:\n- Page 15: 1 signature (the \"Adriana@iol\" signature for Adriana Dydell)\n- Page 16: 1 signature (the \"Josh,\" signature for Marc Silski)\n\nTotal signatures on pages 15 and 16: 1 + 1 = 2\n\nAnswer: 2", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 42092, "error": null, "extra": {"thread_id": 2066, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_ddcf7e386683499793cc579d23850dce", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 267, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "According to the document, the telephone number for The Limes Residential Home is:\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 37738, "output_tokens": 30, "cost_micros": 113664, "latency_ms": 7312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header on the first page, I can find the contact information for The Limes Residential Home.\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 10257, "output_tokens": 37, "cost_micros": 31326, "latency_ms": 4000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header on the first page, I can find the contact information for The Limes Residential Home.\n\nThe address and contact details are:\n43 Foreland Road\nBembridge\nIsle of Wight\nPO35 5XN\nTel: 01983 873655\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 12045, "output_tokens": 79, "cost_micros": 37320, "latency_ms": 3766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header, I can find the contact information for The Limes Residential Home.\n\nThe address block shows:\n43 Foreland Road\nBembridge\nIsle of Wight\nPO35 5XN\nTel: 01983 873655\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 10599, "output_tokens": 73, "cost_micros": 32892, "latency_ms": 6344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document header, I can find the contact information for The Limes Residential Home.\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 10834, "output_tokens": 33, "cost_micros": 32997, "latency_ms": 3625, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "str", "gold": "01983 873655", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the document to find the telephone number for The Limes Residential Home.Let me search for \"The Limes Residential Home\" in the document:Let me search with more context to see the actual content:I can see from the search results that the telephone number appears on line 168. Let me read that section:Perfect! I found the telephone number for The Limes Residential Home on line 174.\n\nAnswer: 01983 873655", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 34734, "error": null, "extra": {"thread_id": 2067, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_de5d9004414b47159eff08bc68030dbf", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 151, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "01983 873655", "normalised_gold": "01983 873655"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically at page 3 which contains the section \"The five questions we ask about services and what we found\", I can identify the primary questions asked in this inspection report.\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 37742, "output_tokens": 73, "cost_micros": 114321, "latency_ms": 7515, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I can identify the five primary questions that were asked about the services in this inspection report.\n\nThese questions are clearly listed in the \"Summary of findings\" section under \"The five questions we ask about services and what we found\":\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 10261, "output_tokens": 84, "cost_micros": 32043, "latency_ms": 3844, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, specifically the \"Summary of findings\" section which contains \"The five questions we ask about services and what we found\", I can identify the primary questions asked about the services.\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 12049, "output_tokens": 72, "cost_micros": 37227, "latency_ms": 4234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document, I can see the \"five questions we ask about services\" section which lists the primary questions asked during the inspection.\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 10603, "output_tokens": 61, "cost_micros": 32724, "latency_ms": 3125, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "Looking through the document, I can identify the five primary questions that were consistently asked about the services throughout the inspection report.\n\nThese questions appear in the \"Summary of findings\" section and structure the entire report, with each having its own detailed findings section:\n\n1. Is the service safe?\n2. Is the service effective?\n3. Is the service caring?\n4. Is the service responsive?\n5. Is the service well-led?\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 10838, "output_tokens": 126, "cost_micros": 34404, "latency_ms": 4328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Is the servife safe?', 'Is the service effective', 'Is the serve caring?', 'Is the service responsive?', 'Is the service well-led?']", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the user-mentioned document to find the primary questions asked about the services.Looking at the document, I can see this is a Care Quality Commission inspection report. The report explicitly states \"The five questions we ask about services\" in a table starting around line 314. Let me identify these questions from the document structure:\n\nFrom the document, the five primary questions asked about the services are clearly listed:\n\n1. Is the service safe?\n2. Is the service effective?\n3. Is the service caring?\n4. Is the service responsive?\n5. Is the service well-led?\n\nAnswer: Is the service safe?, Is the service effective?, Is the service caring?, Is the service responsive?, Is the service well-led?", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 27546, "error": null, "extra": {"thread_id": 2068, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8bdf643e270448d6aadaffef3d9744b8", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 108, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.6, "method": "list_set", "normalised_pred": "is service caring, is service effective, is service responsive, is service safe, is service well led", "normalised_gold": "is serve caring, is service effective, is service responsive, is service well led, is servife safe"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLet me examine each page:\n\nPage 1: I can see the CQC logo (the circular Q logo with \"Care Quality Commission\" text) at the top of the page.\n\nPage 2: No CQC logo visible on this page.\n\nPage 3: No CQC logo visible on this page.\n\nPage 4: No CQC logo visible on this page.\n\nPage 5: I can see the CQC logo at the top of this page.\n\nPages 6-17: Examining the remaining pages, I do not see the CQC logo appearing on any of these pages.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 37764, "output_tokens": 169, "cost_micros": 115827, "latency_ms": 7890, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLooking through the document, I can see \"Care Quality Commission\" text appears at the top of certain pages, which typically indicates where a logo would be placed in the actual PDF document.\n\nLet me go through each page:\n\nPage 1: \"Care Quality Commission\" appears at the top - \"The Limes Residential Care Home Limited The Limes Residential Home\"\n\nPage 2: \"Summary of findings\" section continues\n\nPage 3: \"Summary of findings\" continues\n\nPage 4: \"Summary of findings\" continues\n\nPage 5: \"Care Quality Commission\" appears - \"The Limes Residential Home Detailed findings\"\n\nPage 6: Continues with \"Is the service safe?\"\n\nPage 7: Continues with \"Is the service safe?\"\n\nPage 8: \"Inadequate\" and \"Is the service effective?\"\n\nPage 9: Continues with \"Is the service effective?\"\n\nPage 10: \"Requires improvement\" and \"Is the service caring?\"\n\nPage 11: Continues with \"Is the service caring?\"\n\nPage 12: \"Requires improvement\" and \"Is the service responsive?\"\n\nPage 13: \"Requires improvement\" and \"Is the service well-led?\"\n\nPage 14: Continues with \"Is the service well-led?\"\n\nPage 15: \"Action we have told the provider to take\"\n\nPage 16: Continues with \"Action we have told the provider to take\"\n\nPage 17: \"Enforcement actions\"\n\nBased on the document structure, \"Care Quality Commission\" appears as a header on pages 1 and 5, which is where the CQC logo would typically be located.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 10283, "output_tokens": 395, "cost_micros": 36774, "latency_ms": 7358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the Care Quality Commission (CQC) logo appears in this document.\n\nLooking through the document:\n\n1. Page 1 (cover page) - Yes, CQC logo appears at the top\n2. Page 2 - No visible logo\n3. Page 3 - No visible logo\n4. Page 4 - No visible logo\n5. Page 5 - Yes, CQC logo appears at the top (start of \"Detailed findings\" section)\n6. Page 6 - No visible logo\n7. Page 7 - No visible logo\n8. Page 8 - No visible logo\n9. Page 9 - No visible logo\n10. Page 10 - No visible logo\n11. Page 11 - No visible logo\n12. Page 12 - No visible logo\n13. Page 13 - No visible logo\n14. Page 14 - No visible logo\n15. Page 15 - No visible logo\n16. Page 16 - No visible logo\n17. Page 17 - No visible logo\n\nThe CQC logo appears on:\n- Page 1 (the title page/cover)\n- Page 5 (at the start of the detailed findings section)\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 12071, "output_tokens": 294, "cost_micros": 40623, "latency_ms": 5390, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLooking through the document, I can see \"Care Quality Commission\" text appears at the top of several pages. The logo/header \"Care Quality Commission\" appears on:\n\n- Page 1 (the title page with \"The Limes Residential Care Home Limited\")\n- Page 5 (at the top before \"Detailed findings\")\n- Page 15 (at the top before \"Action we have told the provider to take\")\n\nLet me verify by checking each page systematically:\n\nPage 1: \"Care Quality Commission\" header present\nPage 2-4: Summary of findings pages - no clear CQC logo/header visible in the page headers\nPage 5: \"Care Quality Commission\" header present before \"Detailed findings\"\nPages 6-14: Detailed findings sections - these appear to be continuation pages\nPage 15: \"Care Quality Commission\" header present before enforcement actions section\nPages 16-17: Continuation of enforcement actions\n\nBased on the document structure, the CQC logo/header appears on:\n\nAnswer: Page 1, Page 5, Page 15", "citations": [], "input_tokens": 10625, "output_tokens": 262, "cost_micros": 35805, "latency_ms": 7218, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.8, "method": "list_set", "normalised_pred": "page 1, page 15, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I need to identify all pages where the CQC (Care Quality Commission) logo appears in this document.\n\nLooking through the document systematically:\n\nPage 1: \"Care Quality Commission\" appears at the top as a header\nPage 2: \"Summary of findings\" - no logo visible in header\nPage 3: \"Summary of findings\" continues - no logo visible\nPage 4: \"Summary of findings\" continues - no logo visible\nPage 5: \"CareQuality Commission\" appears at the top (note: slightly different formatting)\nPage 6-17: These pages show section headings but I need to check for the CQC logo/branding\n\nLooking more carefully at the document structure, the CQC logo/branding appears to be present on:\n- Page 1 (top of document with \"Care Quality Commission\")\n- Page 5 (shows \"CareQuality Commission\" at top of \"Detailed findings\" section)\n\nThese appear to be the only pages with the CQC branding/logo clearly marked at the top.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 10860, "output_tokens": 243, "cost_micros": 36225, "latency_ms": 6234, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "list", "gold": "['Page 1', 'Page 5']", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q004", "answer_letter": null, "raw_text": "I'll read the document to identify all pages where the CQC logo appears.Let me search for \"CQC\" or \"logo\" throughout the document to find all pages where the logo appears:Now let me search for mentions of \"logo\" or \"Care Quality Commission\" to find where the logo appears:Let me search more specifically for image and figure tags to identify all instances of the logo:Let me read through the entire document more systematically to find all logo occurrences. Let me check what page the first logo is on and look for patterns:The first logo appears before \"PageFooter=1\", so it's on Page 1. Now let me search the entire document systematically for all instances of the CQC logo. Let me search for figures or images:Good! Now I found 7 figure tags. Let me read the context around each one to identify which ones are the CQC logo:Now I've identified that several figures contain only page headers. Let me verify which ones actually contain the CQC logo by checking the content. Based on my readings:\n\n1. Line 121 (before PageFooter 1) - Contains \"Care Quality Commission\" logo with visual description - **Page 1**\n2. Line 409 (after PageBreak on line 406, before PageFooter 4 on line 405) - Contains \"Care Quality Commission\" - **Page 5** (since PageFooter 4 appears at line 405-406, this is after the page break, so it's on page 5)\n\nLet me verify by checking the page breaks more carefully:Perfect! So after PageFooter 4 and PageBreak, we get Page 5, which has the CQC logo. Now let me check if there are any other CQC logos. The other figures at lines 868, 981, 1036, 1134, and 1247 only contain PageHeaders, not the logo.\n\nLet me verify this is correct by checking one more time if there are any other embedded images:Let me search for any other embedded images:Good! Only one embedded image (I1.jpg) exists in the document. Now let me confirm the two locations of \"Care Quality Commission\" text in figures:So the CQC logo on Page 5 appears between PageFooter 4 (line 405) and PageFooter 5 (line 456). That confirms it's on Page 5.\n\nNow let me verify there's only one actual logo image (I1.jpg) on Page 1, and the Page 5 figure is just text without an image:I notice that on Page 5 (line 409-414), there's a figure with \"Care Quality Commission\" text but NO embedded image notation. Let me check if there's an image reference after this figure:No embedded image noted for Page 5's figure. So the Page 5 figure has \"Care Quality Commission\" text but the question asks specifically about the \"logo\" which includes the visual/graphical element. Let me re-read the Page 1 section to see if it explicitly identifies this as a logo:\n\nLooking back at lines 130-147, Page 1 clearly shows:\n- \"**Embedded image:** `I1.jpg`\"\n- \"**Visual description:**\"\n- \"Organizational logo combining a graphical symbol with text\"\n\nThis definitively identifies Page 1 as having the CQC logo (the actual image/visual logo).\n\nOn Page 5 (lines 409-414), there's only the text \"Care Quality Commission\" in a figure, but no embedded image or logo graphic.\n\nHowever, let me check if both figures might represent the same logo - the system might have extracted it once with full details (Page 1) and then just shown the text on Page 5. In typical inspection reports, logos often appear on multiple pages. Let me search more broadly to understand the document structure better. Let me see if there are image references I might have missed:Only one \"Visual description\" exists - on Page 1. The interpretation here is that:\n- Page 1 has the actual CQC logo image (I1.jpg with full visual description)\n- Page 5 has a figure with just \"Care Quality Commission\" text\n\nGiven that the question asks for \"the logo of CQC\" (which implies the graphical/visual logo, not just text), and given that only Page 1 has an embedded image with the visual logo description, the answer should be Page 1 only.\n\nHowever, both figures contain \"Care Quality Commission\" and in PDF documents, logos can appear as repeated elements. The figure on Page 5 likely also represents the logo (just extracted differently by the parser). Let me verify by checking if standard CQC inspection reports have logos on multiple pages.\n\nBased on the document structure, I have:\n1. **Page 1** - Clear CQC logo with embedded image I1.jpg and full visual description\n2. **Page 5** - Figure with \"Care Quality Commission\" text (likely the same logo, but extraction didn't capture the image separately)\n\nGiven the ambiguity, I should report both pages where the CQC element appears in a figure tag, as both likely represent the logo placement in the original PDF.\n\nAnswer: Page 1, Page 5", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 135453, "error": null, "extra": {"thread_id": 2069, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_2a8af9e5bd4c48cebf2e6cbec000700b", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 1002, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "page 1, page 5", "normalised_gold": "page 1, page 5"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "native_pdf", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the unique regulations that were breached according to this CQC inspection report.\n\nLet me go through the document systematically:\n\n**From \"Action we have told the provider to take\" section (pages 15-16):**\n1. Regulation 18 HSCA (RA) Regulations 2014 - Staffing\n2. Regulation 11 HSCA (RA) Regulations 2014 - Need for consent\n3. Regulation 10 HSCA (RA) Regulations 2014 - Dignity and respect\n4. Regulation 9 HSCA (RA) Regulations 2014 - Person-centred care\n5. Regulation 17 HSCA (RA) Regulations 2014 - Good governance\n6. Regulation 18 CQC (Registration) Regulations 2009 - Notification of other incidents\n\n**From \"Enforcement actions\" section (page 17):**\n7. Regulation 13 HSCA (RA) Regulations 2014 - Safeguarding service users from abuse and improper treatment\n8. Regulation 12 HSCA (RA) Regulations 2014 - Safe care and treatment\n\nCounting the unique regulations breached:\n- Regulation 9 (Person-centred care)\n- Regulation 10 (Dignity and respect)\n- Regulation 11 (Need for consent)\n- Regulation 12 (Safe care and treatment)\n- Regulation 13 (Safeguarding)\n- Regulation 17 (Good governance)\n- Regulation 18 HSCA (RA) (Staffing)\n- Regulation 18 CQC (Registration) (Notification)\n\nThat's 8 regulations total.\n\nAnswer: 8", "citations": [], "input_tokens": 37740, "output_tokens": 419, "cost_micros": 119505, "latency_ms": 10562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "379f44022bb27aa53efd5d322c7b57bf.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "azure_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the breaches of the Health and Social Care Act 2008 (HSCA) regulations mentioned in this report.\n\nLet me go through the document systematically:\n\n**Breaches of HSCA (Regulated Activities) Regulations 2014:**\n\n1. **Regulation 13** - Safeguarding service users from abuse and improper treatment (mentioned twice - unlawful restraint and failure to respond to allegations/safeguarding incidents)\n\n2. **Regulation 12** - Safe care and treatment (mentioned twice - failure to assess risks, and failure to manage medicines safely)\n\n3. **Regulation 18** - Staffing (insufficient staffing levels)\n\n4. **Regulation 18** - Staffing (again, for induction, supervision and training issues)\n\n5. **Regulation 11** - Need for consent (MCA 2005 implementation)\n\n6. **Regulation 10** - Dignity and respect\n\n7. **Regulation 9** - Person-centred care\n\n8. **Regulation 17** - Good governance (mentioned twice - failure to assess/monitor quality, and failure to maintain accurate records)\n\n**Breach of HSCA (Registration) Regulations 2009:**\n\n9. **Regulation 18** - Notification of other incidents (CQC Registration Regulations 2009)\n\nNow let me count the distinct regulation numbers that were breached:\n- Regulation 13 (RA Regs 2014)\n- Regulation 12 (RA Regs 2014)\n- Regulation 18 (RA Regs 2014)\n- Regulation 11 (RA Regs 2014)\n- Regulation 10 (RA Regs 2014)\n- Regulation 9 (RA Regs 2014)\n- Regulation 17 (RA Regs 2014)\n- Regulation 18 (Registration Regs 2009)\n\nThat's 7 different regulations from the RA Regulations 2014 and 1 from the Registration Regulations 2009, making 8 total distinct regulations breached.\n\nAnswer: 8", "citations": [], "input_tokens": 10259, "output_tokens": 499, "cost_micros": 38262, "latency_ms": 8109, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "azure_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the breached regulations mentioned in this inspection report.\n\nLet me go through the document systematically:\n\n**From \"Is the service safe?\" section:**\n1. Regulation 13 - Safeguarding (failure to respond to allegations and investigate safeguarding incidents)\n2. Regulation 12 - Safe care and treatment (failure to assess and mitigate risks)\n3. Regulation 18 - Staffing (insufficient staffing levels)\n4. Regulation 12 - Safe care and treatment (medicines not managed safely) - already counted above\n\n**From \"Is the service effective?\" section:**\n5. Regulation 18 - Staffing (induction, supervision and training) - already counted above\n6. Regulation 13 - Safeguarding (unlawful restraint) - already counted above\n7. Regulation 11 - Need for consent (MCA 2005 not implemented)\n\n**From \"Is the service caring?\" section:**\n8. Regulation 10 - Dignity and respect\n\n**From \"Is the service responsive?\" section:**\n9. Regulation 9 - Person-centred care (care records not reflecting current needs)\n\n**From \"Is the service well-led?\" section:**\n10. Regulation 17 - Good governance (failure to assess, monitor and improve quality)\n11. Regulation 17 - Good governance (failure to maintain accurate records) - already counted above\n12. Regulation 18 CQC (Registration) Regulations 2009 - Notification of incidents\n\nLet me count the unique HSCA 2008 regulations breached:\n- Regulation 9 (Person-centred care)\n- Regulation 10 (Dignity and respect)\n- Regulation 11 (Need for consent)\n- Regulation 12 (Safe care and treatment)\n- Regulation 13 (Safeguarding)\n- Regulation 17 (Good governance)\n- Regulation 18 (Staffing)\n\nThat's 7 regulations under HSCA 2008 (Regulated Activities) Regulations 2014.\n\nPlus 1 regulation under CQC (Registration) Regulations 2009 (Regulation 18).\n\nThe question asks specifically about \"regulations of the", "citations": [], "input_tokens": 12047, "output_tokens": 512, "cost_micros": 43821, "latency_ms": 8780, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "llamacloud_basic_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the unique regulations that were breached according to this report.\n\nLet me go through the document systematically:\n\n**From the \"Action we have told the provider to take\" section:**\n1. Regulation 18 HSCA (RA) Regulations 2014 - Staffing\n2. Regulation 11 HSCA (RA) Regulations 2014 - Need for consent\n3. Regulation 10 HSCA (RA) Regulations 2014 - Dignity and respect\n4. Regulation 9 HSCA (RA) Regulations 2014 - Person-centred care\n5. Regulation 17 HSCA (RA) Regulations 2014 - Good governance\n6. Regulation 18 CQC (Registration) Regulations 2009 - Notification of other incidents\n\n**From the \"Enforcement actions\" section:**\n7. Regulation 13 HSCA (RA) Regulations 2014 - Safeguarding service users from abuse and improper treatment\n8. Regulation 12 HSCA (RA) Regulations 2014 - Safe care and treatment\n\nLet me verify by checking the main body of the report for all breach mentions:\n- Regulation 13 (safeguarding) - breach mentioned\n- Regulation 12 (safe care and treatment, including medicines and risk assessment) - breach mentioned\n- Regulation 18 (staffing) - breach mentioned\n- Regulation 11 (Mental Capacity Act) - breach mentioned\n- Regulation 10 (dignity and respect) - breach mentioned\n- Regulation 9 (person-centred care) - breach mentioned\n- Regulation 17 (good governance - two separate breaches mentioned for different aspects) - breach mentioned\n- Regulation 18 CQC (Registration) Regulations 2009 (notifications) - breach mentioned\n\nTotal unique regulations breached: 8\n\nAnswer: 8", "citations": [], "input_tokens": 10601, "output_tokens": 448, "cost_micros": 38523, "latency_ms": 7250, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "8", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "llamacloud_premium_lc", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I need to count all the breaches of the Health and Social Care Act 2008 (HSCA 2008) regulations mentioned in this report.\n\nLet me go through the document systematically:\n\n**Breaches found in the main body of the report:**\n\n1. **Regulation 13** - Safeguarding service users from abuse and improper treatment (failure to respond to allegations and report safeguarding incidents)\n\n2. **Regulation 12** - Safe care and treatment (failure to assess, record and mitigate risks to people's health and safety)\n\n3. **Regulation 18** - Staffing (failure to ensure sufficient staffing levels)\n\n4. **Regulation 12** - Safe care and treatment (failure to manage medicines safely) - *same regulation as #2*\n\n5. **Regulation 18** - Staffing (failure to ensure staff received appropriate induction, supervision and training) - *same regulation as #3*\n\n6. **Regulation 13** - Safeguarding (failure to ensure people were not unlawfully restrained) - *same regulation as #1*\n\n7. **Regulation 11** - Need for consent (failure to ensure MCA 2005 code of practice was implemented)\n\n8. **Regulation 10** - Dignity and respect (failure to treat people with respect and dignity at all times)\n\n9. **Regulation 9** - Person-centred care (failure to ensure care records reflected current needs and preferences)\n\n10. **Regulation 17** - Good governance (failure to effectively assess, monitor and improve quality and safety of services)\n\n11. **Regulation 17** - Good governance (failure to maintain accurate and complete records) - *same regulation as #10*\n\n12. **Regulation 18 of the Health and Social Care Act 2008 (Registration) Regulations 2009** - This is a different Act (Registration Regulations, not Regulated Activities Regulations)\n\n**Counting unique HSCA 2008 (Regulated Activities) Regulations 2014 breaches:**\n- Regulation 9 (Person-centred care)\n- Regulation 10 (Dignity and respect)\n- Regulation 11 (Need for consent)\n- Regulation 12 (Safe care and treatment)\n-", "citations": [], "input_tokens": 10836, "output_tokens": 512, "cost_micros": 40188, "latency_ms": 9764, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "length"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "7"}}
+{"qid": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "doc_id": "379f44022bb27aa53efd5d322c7b57bf.pdf", "answer_format": "int", "gold": "7", "pages": 17, "document_id": 7600, "arm": "surfsense_agentic", "question_id": "379f44022bb27aa53efd5d322c7b57bf.pdf::Q005", "answer_letter": null, "raw_text": "I'll read the provided document to find information about HSCA 2008 regulations breaches.Let me continue reading to find any summary or conclusion section that might provide the total count.Based on my careful reading of the document, I need to count all the breaches of the Health and Social Care Act 2008 (Regulated Activities) Regulations 2014 (HSCA 2008).\n\nThe breaches mentioned in the report are:\n\n1. **Regulation 13** - Safeguarding service users from abuse and improper treatment (mentioned twice - for failing to respond to allegations and for unlawful restraint)\n2. **Regulation 12** - Safe care and treatment (mentioned twice - for failing to assess/mitigate risks and for failing to manage medicines safely)\n3. **Regulation 18** - Staffing (mentioned twice - for insufficient staffing levels and for inadequate training/supervision)\n4. **Regulation 11** - Need for consent (MCA 2005 implementation)\n5. **Regulation 10** - Dignity and respect\n6. **Regulation 9** - Person-centred care\n7. **Regulation 17** - Good governance (mentioned twice - for failing to assess/monitor quality and for failing to maintain accurate records)\n\nAdditionally, there is one breach of:\n8. **Regulation 18 of the Health and Social Care Act 2008 (Registration) Regulations 2009** - Notification of other incidents\n\nCounting the distinct regulation numbers breached under HSCA (RA) Regulations 2014:\n- Regulation 13\n- Regulation 12\n- Regulation 18\n- Regulation 11\n- Regulation 10\n- Regulation 9\n- Regulation 17\n\nThat's 7 different regulations of the HSCA (RA) Regulations 2014 that were breached.\n\nAnswer: 7", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 38483, "error": null, "extra": {"thread_id": 2070, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_8ce6489cb9fc4be0aad3905ed014d904", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 217, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "7", "normalised_gold": "7"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the capital expenditure amount for FY2018 from 3M's cash flow statement.\n\nLooking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018 (page 60):\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThe capital expenditure amount for FY2018 is shown as purchases of property, plant and equipment.\n\nAnswer: 1577", "citations": [], "input_tokens": 390262, "output_tokens": 116, "cost_micros": 2344182, "latency_ms": 44468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the capital expenditure amount for FY2018 from the cash flow statement.\n\nLooking at the Consolidated Statement of Cash Flows for the years ended December 31, I can find the capital expenditures under \"Cash Flows from Investing Activities.\"\n\nFrom the document:\n\n**Cash Flows from Investing Activities:**\nYears ended December 31\n(Millions)                                          2018        2017        2016\n\nPurchases of property, plant and equipment (PP&E)   $(1,577)    $(1,373)    $(1,420)\n\nThe capital expenditure for FY2018 is shown as purchases of property, plant and equipment (PP&E) which equals $1,577 million.\n\nAnswer: 1577", "citations": [], "input_tokens": 140538, "output_tokens": 171, "cost_micros": 424179, "latency_ms": 8062, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the capital expenditure amount for FY2018 from the cash flow statement.\n\nLooking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018:\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThe capital expenditure amount is shown as a negative number in the cash flow statement because it represents a cash outflow.\n\nThe capital expenditure amount for FY2018 is $1,577 million.\n\nAnswer: 1577", "citations": [], "input_tokens": 207953, "output_tokens": 130, "cost_micros": 1250643, "latency_ms": 8718, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018:\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThis represents the capital expenditures for FY2018.\n\nAnswer: 1577", "citations": [], "input_tokens": 163241, "output_tokens": 77, "cost_micros": 490878, "latency_ms": 8000, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the Consolidated Statement of Cash Flows for the year ended December 31, 2018:\n\nUnder \"Cash Flows from Investing Activities\":\n- Purchases of property, plant and equipment (PP&E): $(1,577) million\n\nThis represents the capital expenditures for fiscal year 2018.\n\nAnswer: 1577", "citations": [], "input_tokens": 177504, "output_tokens": 78, "cost_micros": 533682, "latency_ms": 7719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q000", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "$1577.00", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q000", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find the capital expenditure amount from the cash flow statement.Let me search for the cash flow statement in this document.Great! I found the Consolidated Statement of Cash Flows. Let me read that specific chunk to find the capital expenditure line.Let me search for variations of capital expenditure terminology:Perfect! I found it. The \"Purchases of property, plant and equipment (PP&E)\" line shows the capital expenditure amounts. For 2018, it shows $(1,577) million. Let me read the actual chunk to confirm this is from the cash flow statement.Let me check a bit earlier to confirm this is the Consolidated Statement of Cash Flows:Let me check the title of this statement:Perfect! I've confirmed this is the \"3M Company and Subsidiaries Consolidated Statement of Cash Flows\" for years ended December 31, with columns for 2018, 2017, and 2016.\n\nIn the \"Cash Flows from Investing Activities\" section, the line item \"Purchases of property, plant and equipment (PP&E)\" shows:\n- 2018: (1,577)\n- 2017: (1,373)\n- 2016: (1,420)\n\nThe amounts are in millions of USD, and the parentheses indicate these are cash outflows. The FY2018 capital expenditure amount is 1,577 million USD.\n\nAnswer: 1577.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 53671, "error": null, "extra": {"thread_id": 2071, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_b68bd46e8dab41c29dc9086112158cbd", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 310, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "1577.0", "normalised_gold": "1577.0"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for 3M's business segments in FY2018...\n\nFrom the document, specifically on pages discussing business segments (e.g., page 4, page 33-38), I can identify the following business segments:\n\n1. Industrial Business\n2. Safety and Graphics Business\n3. Health Care Business\n4. Electronics and Energy Business\n5. Consumer Business\n\nThese five segments are consistently mentioned throughout the document as 3M's reportable segments for fiscal year 2018.\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 390260, "output_tokens": 126, "cost_micros": 2344395, "latency_ms": 49344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking at the document, I need to identify 3M's business segments for FY2018.\n\nFrom the document, I can find the business segment information in multiple places. The clearest statement is in the Overview section and repeated throughout:\n\n\"3M manages its operations in five operating business segments: Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\nThis is also confirmed in the Business Segment Products table which lists the major products for each segment:\n- Industrial\n- Safety and Graphics\n- Health Care\n- Electronics and Energy\n- Consumer\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 140536, "output_tokens": 140, "cost_micros": 423708, "latency_ms": 9046, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about 3M's business segments in FY2018...\n\nFrom the document, I can find the business segment information clearly stated in multiple places:\n\nIn the \"BUSINESS SEGMENTS\" section (page 4), it states:\n\"3M manages its operations in five business segments. The reportable segments are Industrial, Safety and Graphics, Health Care, Electronics and Energy, and Consumer.\"\n\nThis is also confirmed in the \"OVERVIEW\" section (page 15):\n\"3M manages its operations in five operating business segments: Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\nAnd in the \"PERFORMANCE BY BUSINESS SEGMENT\" section (page 32):\n\"3M manages its operations in five business segments. The reportable segments are Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 207951, "output_tokens": 206, "cost_micros": 1252341, "latency_ms": 10437, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about 3M's business segments in FY2018:\n\nFrom the document, I can find the business segments clearly stated in multiple locations, including:\n\n1. In the Business Segments section (page 4):\n\"3M manages its operations in five business segments: Industrial; Safety and Graphics; Health Care; Electronics and Energy; and Consumer.\"\n\n2. In the Table of Contents and throughout the financial statements, these five segments are consistently referenced.\n\n3. In Note 18 (Business Segments), the same five segments are listed with their products and financial information.\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 163239, "output_tokens": 147, "cost_micros": 491922, "latency_ms": 9858, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "Looking through the document for information about 3M's business segments in FY2018...\n\nFrom the document, I can find the business segment information in multiple sections, including:\n\n1. In the Executive Summary/Overview section\n2. In Note 18 - Business Segments\n3. In the Business Segments section of Item 1\n\nThe document clearly states that 3M manages its operations in **five business segments**:\n\n1. **Industrial** - serves automotive OEM, automotive aftermarket, electronics, appliance, paper and printing, packaging, food and beverage, and construction markets\n2. **Safety and Graphics** - serves markets for personal protection products, commercial solutions, transportation safety, and roofing granules\n3. **Health Care** - serves medical clinics and hospitals, pharmaceuticals, dental and orthodontic practitioners, health information systems, and food manufacturing and testing\n4. **Electronics and Energy** - serves electronics and energy markets, including display materials, electrical products, and power generation\n5. **Consumer** - serves consumer retail, office retail, home improvement, drug and pharmacy retail markets\n\nAnswer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 177502, "output_tokens": 255, "cost_micros": 536331, "latency_ms": 14344, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q001", "doc_id": "3M_2018_10K.pdf", "answer_format": "list", "gold": "['Industrial Business', 'Safety and Graphics Business', 'Health Care Business', 'Electronics and Energy Business', 'Consumer Business']", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q001", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find information about their business segments.Perfect! I found the information. Let me read the specific section that clearly states the five business segments.Answer: Industrial, Safety and Graphics, Health Care, Electronics and Energy, Consumer", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32219, "error": null, "extra": {"thread_id": 2072, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_5721bf743c374f36a4a98f84b6cf13eb", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 119, "n_mentioned_documents": 1}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "consumer, electronics and energy, health care, industrial, safety and graphics", "normalised_gold": "consumer business, electronics and energy business, health care business, industrial business, safety and graphics business"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "I need to find the net fair value of derivative instruments for 3M in FY2018.\n\nLooking through the document, I can find information about derivative instruments in Note 14 (Derivatives) and the tables showing \"Location and Fair Value Amount of Derivative Instruments.\"\n\nFrom page 104-105, I can see tables showing the fair value of derivative instruments as of December 31, 2018:\n\n**Assets (December 31, 2018):**\n- Foreign currency forward/option contracts: $127 million\n- Interest rate swap contracts: $19 million\n- Total derivative assets: $146 million\n\n**Liabilities (December 31, 2018):**\n- Foreign currency forward/option contracts: $22 million\n- Interest rate swap contracts: $31 million\n- Total derivative liabilities: $53 million\n\nNet fair value = Total derivative assets - Total derivative liabilities\nNet fair value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 390258, "output_tokens": 225, "cost_micros": 2346610, "latency_ms": 34921, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the net fair value of derivative instruments as of December 31, 2018.\n\nFrom the section \"Location and Fair Value Amount of Derivative Instruments\" on page 104, I can see a table showing derivative instruments as of December 31, 2018:\n\n**Assets:**\n- Derivatives designated as hedging instruments: $132 million\n- Derivatives not designated as hedging instruments: $14 million\n- Total derivative instruments (Assets): $146 million\n\n**Liabilities:**\n- Derivatives designated as hedging instruments: $47 million\n- Derivatives not designated as hedging instruments: $6 million\n- Total derivative instruments (Liabilities): $53 million\n\nNet fair value = Total Assets - Total Liabilities\nNet fair value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 140534, "output_tokens": 198, "cost_micros": 424572, "latency_ms": 6766, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the net fair value of derivative instruments for 3M in FY2018.\n\nFrom Note 14 \"Derivatives\" and Note 15 \"Fair Value Measurements\", I can find the fair value of derivative instruments.\n\nIn the section \"Location and Fair Value Amount of Derivative Instruments\" under Note 14, there is a table showing:\n\n**December 31, 2018:**\n\nAssets:\n- Foreign currency forward/option contracts (current): $74 million\n- Foreign currency forward/option contracts (non-current): $39 million\n- Interest rate swap contracts (current): $0 million\n- Interest rate swap contracts (non-current): $19 million\n- Foreign currency forward/option contracts (not designated): $14 million\n**Total derivative assets: $146 million**\n\nLiabilities:\n- Foreign currency forward/option contracts (current): $12 million\n- Foreign currency forward/option contracts (non-current): $4 million\n- Interest rate swap contracts (current): $14 million\n- Interest rate swap contracts (non-current): $17 million\n- Foreign currency forward/option contracts (not designated): $6 million\n**Total derivative liabilities: $53 million**\n\nNet fair value = Total derivative assets - Total derivative liabilities\nNet fair value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 207949, "output_tokens": 317, "cost_micros": 1254826, "latency_ms": 11608, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to find the net fair value of derivative instruments for 3M in FY2018.\n\nFrom the \"Location and Fair Value Amount of Derivative Instruments\" table for December 31, 2018:\n\n**Derivative Assets:**\n- Derivatives designated as hedging instruments: $132 million\n- Derivatives not designated as hedging instruments: $14 million\n- Total derivative assets: $146 million\n\n**Derivative Liabilities:**\n- Derivatives designated as hedging instruments: $47 million\n- Derivatives not designated as hedging instruments: $6 million\n- Total derivative liabilities: $53 million\n\n**Net Fair Value = Total Assets - Total Liabilities**\nNet Fair Value = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 163237, "output_tokens": 185, "cost_micros": 492486, "latency_ms": 7719, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the fair value of derivative instruments from the document:\n\nFrom the table \"Location and Fair Value Amount of Derivative Instruments\" on page 104:\n\n**December 31, 2018:**\n- Total derivative assets (Fair Value Amount): $146 million\n- Total derivative liabilities (Fair Value Amount): $53 million\n\nNet fair value = Assets - Liabilities = $146 million - $53 million = $93 million\n\nAnswer: 93", "citations": [], "input_tokens": 177500, "output_tokens": 105, "cost_micros": 534075, "latency_ms": 8358, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q002", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "93", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q002", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find information about the net fair value of derivative instruments.Let me search for information about derivative instruments in the document.Let me search more specifically for derivative instruments and their fair values in a table format.Now let me read the section on derivatives to find the fair value information, particularly around line 15104 where it mentions \"Location and Fair Value Amount of Derivative Instruments\":Perfect! I found the table showing the fair value of derivative instruments for 3M in FY2018. Looking at the table on lines 15230-15235:\n\nFor December 31, 2018:\n- Total derivative instruments (Assets): $146 million\n- Total derivative instruments (Liabilities): $53 million\n\nThe net fair value would be: Assets - Liabilities = $146 - $53 = $93 million\n\nAnswer: 93.0", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 50969, "error": null, "extra": {"thread_id": 2073, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_890847b672154ab2beed6f213e7b03d2", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 220, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "93.0", "normalised_gold": "93.0"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "native_pdf", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the document for advertising and merchandising costs:\n\nFrom the document (page 65):\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nTo calculate the growth rate for FY2018:\nGrowth rate = (2018 cost - 2017 cost) / 2017 cost \u00d7 100\nGrowth rate = ($396 million - $411 million) / $411 million \u00d7 100\nGrowth rate = -$15 million / $411 million \u00d7 100\nGrowth rate = -0.03649635... \u00d7 100\nGrowth rate = -3.649635...\nRounded to two decimal places = -3.65\n\nAnswer: -3.65", "citations": [], "input_tokens": 390267, "output_tokens": 193, "cost_micros": 2345944, "latency_ms": 37280, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "3M_2018_10K.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "azure_basic_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the advertising and merchandising costs for 2018 and 2017, then calculate the growth rate.\n\nFrom the document, I can find the advertising and merchandising costs in the \"Advertising and merchandising\" section:\n\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nSo:\n- 2018: $396 million\n- 2017: $411 million\n\nGrowth rate = (2018 value - 2017 value) / 2017 value \u00d7 100\nGrowth rate = ($396 - $411) / $411 \u00d7 100\nGrowth rate = -$15 / $411 \u00d7 100\nGrowth rate = -0.03649635 \u00d7 100\nGrowth rate = -3.649635%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 140543, "output_tokens": 230, "cost_micros": 425079, "latency_ms": 7875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "azure_premium_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the advertising and merchandising costs for 3M in FY2018 and FY2017, then calculate the growth rate.\n\nFrom the document, I can find the advertising and merchandising costs in the \"Advertising and merchandising\" section:\n\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nSo:\n- FY2018: $396 million\n- FY2017: $411 million\n\nGrowth rate = (FY2018 - FY2017) / FY2017 \u00d7 100\nGrowth rate = (396 - 411) / 411 \u00d7 100\nGrowth rate = -15 / 411 \u00d7 100\nGrowth rate = -3.6496...%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 207958, "output_tokens": 224, "cost_micros": 1252788, "latency_ms": 9453, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "llamacloud_basic_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I need to find the advertising and merchandising costs for 3M in FY2018 and FY2017, then calculate the growth rate.\n\nFrom the document, under \"Advertising and merchandising\":\n- 2018: $396 million\n- 2017: $411 million\n- 2016: $385 million\n\nThe growth rate formula is: ((2018 - 2017) / 2017) \u00d7 100\n\nGrowth rate = ((396 - 411) / 411) \u00d7 100\nGrowth rate = (-15 / 411) \u00d7 100\nGrowth rate = -0.03649635 \u00d7 100\nGrowth rate = -3.649635%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 163246, "output_tokens": 178, "cost_micros": 492408, "latency_ms": 8312, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "llamacloud_premium_lc", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "Looking at the advertising and merchandising costs:\n\nFrom the document (page 64):\n\"Advertising and merchandising: These costs are charged to operations in the period incurred, and totaled $396 million in 2018, $411 million in 2017 and $385 million in 2016.\"\n\nGrowth rate calculation for FY2018:\nGrowth rate = (2018 cost - 2017 cost) / 2017 cost \u00d7 100\nGrowth rate = ($396 million - $411 million) / $411 million \u00d7 100\nGrowth rate = -$15 million / $411 million \u00d7 100\nGrowth rate = -0.03649635 \u00d7 100\nGrowth rate = -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 177509, "output_tokens": 176, "cost_micros": 535167, "latency_ms": 8078, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
+{"qid": "3M_2018_10K.pdf::Q003", "doc_id": "3M_2018_10K.pdf", "answer_format": "float", "gold": "-3.65%", "pages": 160, "document_id": 7601, "arm": "surfsense_agentic", "question_id": "3M_2018_10K.pdf::Q003", "answer_letter": null, "raw_text": "I'll read the 3M 2018 10K document to find the advertising and merchandising costs for FY2018 and the prior year to calculate the growth rate.Let me search for \"advertising\" in the document to find the relevant information quickly.Great! I found a reference to advertising and merchandising costs. Let me read the chunk that contains this information.Perfect! I can see that advertising and merchandising costs were $396 million in 2018 and $411 million in 2017. Let me read more context to ensure I have the complete information.Perfect! I now have the complete information:\n- 2018: $396 million\n- 2017: $411 million\n\nNow I'll calculate the growth rate:\nGrowth rate = [(2018 value - 2017 value) / 2017 value] \u00d7 100\nGrowth rate = [(396 - 411) / 411] \u00d7 100\nGrowth rate = [-15 / 411] \u00d7 100\nGrowth rate = -0.036496... \u00d7 100\nGrowth rate = -3.6496...%\n\nRounded to two decimal places: -3.65%\n\nAnswer: -3.65", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 36500, "error": null, "extra": {"thread_id": 2074, "search_space_id": 55, "answer_letter_strategy": "none", "user_message_id": "msg_3f1bac0372cd48f5b95293aa87160368", "assistant_message_id": null, "finished_normally": true, "n_raw_events": 175, "n_mentioned_documents": 1}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "-3.65", "normalised_gold": "-3.65"}}
diff --git a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
new file mode 100644
index 0000000000..bc18da838c
--- /dev/null
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
@@ -0,0 +1,37 @@
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:15Z", "latency_ms": 20985, "error": null, "raw_text_chars": 696}]}, "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about how many adults rated Trump's government ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nLooking through the document, I find relevant information on page 4 (document page 6):\n\nThe document states: \"About four-in-ten Americans rate the ethical standards of Trump administration officials as excellent (9%) or good (30%), while 58% say they are not good (21%) or poor (36%).\"\n\nThe survey was conducted among 1,503 adults (as stated in the methodology section on page 14).\n\nTo find the number of adults who rated the ethical standards as \"poor\":\n36% of 1,503 = 0.36 \u00d7 1,503 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 33894, "output_tokens": 191, "cost_micros": 104547, "latency_ms": 20968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:16Z", "latency_ms": 20453, "error": null, "raw_text_chars": 544}]}, "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at page 5 of the document, which shows \"Public's confidence in Trump to handle several policy areas mixed,\" I can see the data for \"very confident\" and \"not at all confident\" percentages across different domains.\n\nFor \"Very confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 30%\n\nFor \"Not at all confident\" (highest percentage):\n- Make wise decisions about immigration policy: 55%\n\nAnswer: Make wise decisions about immigration policy, Negotiate favorable trade agreements with other countries", "citations": [], "input_tokens": 33942, "output_tokens": 117, "cost_micros": 103581, "latency_ms": 20438, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, negotiate favorable trade agreements with other countries", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 41750, "error": null, "raw_text_chars": 1479}]}, "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I need to count the pictures in the document that contain only one person.\n\nLet me go through the document page by page:\n\nPage 1:\n- Photo of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. (Ret.) Tim Ziemer - 2 people\n\nPage 2:\n- Photo of Capt. John W. Sanders - 1 person\n- Photo of Capt. Steven M. Wechsler - 1 person\n\nPage 3:\n- Photo of Afghan trainees in laboratory - multiple people\n\nPage 4:\n- Group photo of Pacific Partnership 2012 Internal Medicine Department staff - multiple people\n- Photo of Cmdr. Charmagne Beckett on flight deck - 1 person\n\nPage 5:\n- Photo of Petty Officer 1st Class Dennis Gonzales - 1 person\n\nPage 6:\n- Photo of Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n\nPage 7:\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Buhari Oyofo, Dr. Walter T. Gwenigale, Lt. Joseph Diclaro, and Dr. Fatorma Bolay - 5 people\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Chris Martinez, Capt. Buhari Oyofo, Col. Vernon Graham, and Lt. Joseph Diclaro - 5 people\n\nPage 8:\n- Group photo of Kazakh scientists and NMRC staff - multiple people\n\nPage 9:\n- Photo of Lt. j.g. Michael Rucker treating a child - 2 people\n- Photo of U.S. Marines and Sailors in aircraft - multiple people\n\nPage 10:\n- Group photo of Joint Combat Casualty Research Team - multiple people\n\nPage 12:\n- Photo of NMRC 2012 Dining Out Event - multiple people\n- Photo with Rear Adm. Bruce Doll, Capt. John Sanders and others - multiple people\n\nPage 13:\n- Photo with Ellen Flannery, Emmanuel Adu-Gyamfi, Eil", "citations": [], "input_tokens": 40387, "output_tokens": 512, "cost_micros": 128841, "latency_ms": 41735, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "original_error": "JSONDecodeError: Expecting value: line 169 column 1 (char 924)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 36889, "error": null, "raw_text_chars": 1214}]}, "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure, specifically the number of multi-head self-attention layers.\n\nLet me search through the document for information about DETR's architecture.\n\nFrom page 2, I can see that DETR has a transformer encoder-decoder architecture.\n\nOn page 10 (Section 4.1), I find: \"To be comparable in the number of parameters we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nLooking at the detailed architecture description on page 22 (Section A.3) and Figure 10, I can see that:\n- The encoder has multiple layers, each containing multi-head self-attention\n- The decoder has multiple layers, each containing multi-head self-attention AND multi-head attention (cross-attention)\n\nFrom page 9-10, the baseline model uses \"6 encoder, 6 decoder layers\"\n\nEach encoder layer has one multi-head self-attention layer.\nEach decoder layer has one multi-head self-attention layer and one multi-head cross-attention layer.\n\nSo the total number of multi-head self-attention layers (not counting cross-attention) is:\n- 6 encoder layers \u00d7 1 self-attention per layer = 6\n- 6 decoder layers \u00d7 1 self-attention per layer = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 59866, "output_tokens": 318, "cost_micros": 184368, "latency_ms": 36875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "arm": "llamacloud_basic_lc", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:13Z", "latency_ms": 11328, "error": null, "raw_text_chars": 1376}]}, "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine the document for bar plots.\n\nLooking through the figures in the document:\n\n1. **Figure 1**: Shows example images with text descriptions - no bar plots\n2. **Figure 2**: Shows data format comparison with images and text - no bar plots\n3. **Figure 3**: Shows the Syphus overview diagram - no bar plots\n4. **Figure 4**: This appears to show statistics and analysis. Let me examine it carefully:\n   - Part (a): Shows verb-noun structure for instructions - this appears to be a bar chart\n   - Part (b): Shows verb-noun structure for responses - this appears to be a bar chart\n   - Part (c): Shows \"Diversity in our dataset\" with what appears to be distribution plots\n\nLooking at Figure 4 more carefully:\n- (a) contains what looks like a horizontal bar chart showing verbs and nouns from instructions\n- (b) contains what looks like a horizontal bar chart showing verbs and nouns from responses\n- (c) contains multiple distribution plots showing lengths and counts\n\n5. **Figure 5**: Shows response examples - no bar plots\n6. **Figure 6**: The document mentions Fig. 6(b) and Fig. 6(c) but these are not fully visible in the provided pages\n\nFrom what I can see clearly in Figure 4:\n- Subfigure (a): 1 bar plot (showing instruction verb-noun structure)\n- Subfigure (b): 1 bar plot (showing response verb-noun structure)\n\nTotal visible bar plots: 2\n\nAnswer: 2", "citations": [], "input_tokens": 20240, "output_tokens": 354, "cost_micros": 66030, "latency_ms": 11328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "6"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:18Z", "latency_ms": 52514, "error": null, "raw_text_chars": 860}]}, "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll examine pages 5-10 to count the tables.\n\nPage 5: No tables visible in the text content shown.\n\nPage 6: Table 1 is present - \"Llama 2 family of models\" showing training data, parameters, context length, GQA, tokens, and LR for Llama 1 and Llama 2 models.\n\nPage 7: \n- Table 2: \"CO2 emissions during pretraining\" showing time, power consumption, and carbon emitted for different Llama 2 models.\n- Table 3: \"Overall performance on grouped academic benchmarks\" comparing MPT, Falcon, Llama 1, and Llama 2 models.\n\nPage 8:\n- Table 4: \"Comparison to closed-source models on academic benchmarks\" showing results for GPT-3.5, GPT-4, PaLM, PaLM-2-L, and Llama 2.\n- Table 5: \"SFT annotation\" showing examples of helpfulness and safety annotations.\n\nPage 9: No complete tables visible in the shown content.\n\nPage 10: No tables visible in the shown content.\n\nAnswer: 5", "citations": [], "input_tokens": 199213, "output_tokens": 280, "cost_micros": 601839, "latency_ms": 52468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:25Z", "latency_ms": 40452, "error": null, "raw_text_chars": 575}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 on page 4, which shows \"Safety human evaluation results for Llama 2-Chat compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is better for safety).\n\nFrom the bar chart showing \"Violation % (lower is safer)\":\n- Llama 2-70b-chat has the lowest violation percentage (approximately 0-5%)\n- Llama 2-13b-chat has the second lowest (approximately 0-5%)\n- Llama 2-34b-chat has the third lowest (approximately 5-10%)\n\nAnswer: Llama 2-70b-chat, Llama 2-13b-chat, Llama 2-34b-chat", "citations": [], "input_tokens": 199224, "output_tokens": 182, "cost_micros": 600402, "latency_ms": 40406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:05Z", "latency_ms": 10343, "error": null, "raw_text_chars": 376}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I can see the violation percentages (lower is safer):\n\nThe top-3 safest models (lowest violation %) are:\n1. Llama-2 7b-chat: 3%\n2. Llama-2 13b-chat: 3%\n3. Llama-2 70b-chat: 4%\n\nAnswer: Llama-2 7b-chat, Llama-2 13b-chat, Llama-2 70b-chat", "citations": [], "input_tokens": 105645, "output_tokens": 151, "cost_micros": 319200, "latency_ms": 10343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 70b chat, llama 2 7b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:10Z", "latency_ms": 29391, "error": null, "raw_text_chars": 525}]}, "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the QUAC (f1) 0-shot column.\n\nFrom the table, the QUAC (f1) 0-shot scores are:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score is 44.1, which belongs to LLAMA 1 33B.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 105649, "output_tokens": 274, "cost_micros": 321057, "latency_ms": 29391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:15Z", "latency_ms": 43610, "error": null, "raw_text_chars": 650}]}, "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 9 and count the emojis in the left and right subfigures.\n\nLooking through the document, I find Figure 9 on page 16 (document page 16). The figure shows \"Issues with multi-turn memory (left) can be improved with GAtt (right).\"\n\nLooking at the left subfigure:\n- It shows a conversation with text-only responses (no emojis visible)\n\nLooking at the right subfigure:\n- First response: 4 emojis (\ud83d\udc4b\ud83d\udc41\ufe0f\ud83d\udc44\ud83d\udc41\ufe0f)\n- Second response: 4 emojis (\u2708\ufe0f\u2708\ufe0f\ud83d\udeeb\ud83d\udcba)\n- Third response: 4 emojis (\ud83e\uddb4\ud83e\udd95\ud83e\uddb4\ud83d\udd25)\n- Fourth response: 2 emojis (\ud83c\uddee\ud83c\uddf9\ud83c\udf5d\ud83d\udc90)\n\nTotal in right subfigure: 4 + 4 + 4 + 2 = 14 emojis\n\nTotal in left subfigure: 0 emojis\n\nDifference: 14 - 0 = 14\n\nAnswer: 14", "citations": [], "input_tokens": 199223, "output_tokens": 267, "cost_micros": 601674, "latency_ms": 43578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "15"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "original_error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:40Z", "latency_ms": 43593, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:24Z", "latency_ms": 44391, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:10Z", "latency_ms": 35797, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:54:48Z", "latency_ms": 42266, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:55:41Z", "latency_ms": 31983, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 31843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:59Z", "latency_ms": 33250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:33Z", "latency_ms": 40889, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:16Z", "latency_ms": 51515, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:55:10Z", "latency_ms": 54500, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:56:14Z", "latency_ms": 32641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:13Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:56:56Z", "latency_ms": 52202, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:57:50Z", "latency_ms": 40469, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:58:34Z", "latency_ms": 49766, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:59:30Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "azure_premium_lc", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:47Z", "latency_ms": 24578, "error": null, "raw_text_chars": 640}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and identify the price of the keyboard shown in the screenshot.\n\nLooking through the document, I can find Figure 111 which is part of Section 9.8 on GUI Navigation, specifically about Online Shopping for an Ergonomic Keyboard.\n\nIn Figure 111, the screenshot shows a shopping cart with a keyboard item. The text indicates:\n\n\"Subtotal $4999\"\n\nAnd in the cart details, it shows:\n\"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...\n$4999\"\n\nThe price shown is $49.99 (represented as $4999 in the format used in the screenshot, which appears to be missing the decimal point).\n\nAnswer: 49.99", "citations": [], "input_tokens": 133648, "output_tokens": 176, "cost_micros": 403584, "latency_ms": 24578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:57:11Z", "latency_ms": 69062, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:58:21Z", "latency_ms": 35639, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:58:59Z", "latency_ms": 51312, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:59:54Z", "latency_ms": 49218, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:00:48Z", "latency_ms": 56375, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "azure_basic_lc", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:12Z", "latency_ms": 7967, "error": null, "raw_text_chars": 594}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find Figure 107 in the document and identify the battery percentage shown in that screenshot.\n\nLooking through the document for Figure 107, I can see it's in Section 9.8 on GUI Navigation, specifically related to online shopping.\n\nIn the image caption list, Figure 107 is described as \"Section 9.8: online shopping.\"\n\nLooking at the actual Figure 107 in the document, I can see it's a smartphone screenshot showing an Amazon shopping interface. At the top of the screenshot, there is a time display \"5:32\" and next to it shows \"75\" which represents the battery percentage.\n\nAnswer: 75", "citations": [], "input_tokens": 117429, "output_tokens": 143, "cost_micros": 354432, "latency_ms": 7967, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "75.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:20Z", "latency_ms": 33344, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:00:54Z", "latency_ms": 43733, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:01:39Z", "latency_ms": 33390, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:02:16Z", "latency_ms": 33641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:02:59Z", "latency_ms": 38000, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:01:45Z", "latency_ms": 35218, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:02:21Z", "latency_ms": 44921, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:03:08Z", "latency_ms": 52842, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:04:05Z", "latency_ms": 27250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:04:41Z", "latency_ms": 48031, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "jooodeee"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:37Z", "latency_ms": 20843, "error": null, "raw_text_chars": 345}]}, "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically on page 2, the task formulation section discusses the \"Setting of Conscious Incompetence.\"\n\nThe relevant text states: \"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph G.\"\n\nAnswer: if it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 60205, "output_tokens": 80, "cost_micros": 181815, "latency_ms": 20828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "if it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:57Z", "latency_ms": 14639, "error": null, "raw_text_chars": 1219}]}, "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to identify which modules use Large Language Models (LLMs) as their backbone.\n\nFrom Section 3, the QACHECK system has 5 modules:\n1. **Claim Verifier** - Section 3.1 states it's \"based on InstructGPT\" (an LLM)\n2. **Question Generator** - Section 3.2 states \"Similar to the claim verifier, we also leverage InstructGPT\" (an LLM)\n3. **Question Answering Model** - Section 3.3 describes three implementations, with the default being \"GPT Reciter-Reader\" which uses InstructGPT (an LLM). However, this module also has non-LLM options (Retriever-Reader, FLAN-T5).\n4. **QA Validator** - Section 3.4 states \"We again implement the validator by prompting InstructGPT\" (an LLM)\n5. **Reasoner** - Section 3.5 provides \"two different implementations: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\"\n\nCounting the modules that use LLM backbones:\n- Claim Verifier: Yes (InstructGPT)\n- Question Generator: Yes (InstructGPT)\n- Question Answering Model: Yes (uses GPT Reciter-Reader by default)\n- QA Validator: Yes (InstructGPT)\n- Reasoner: Yes (InstructGPT is one of the two implementations)\n\nAll 5 modules have LLM implementations, with some offering alternatives.\n\nAnswer: 5", "citations": [], "input_tokens": 27149, "output_tokens": 403, "cost_micros": 87492, "latency_ms": 14639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:12Z", "latency_ms": 6562, "error": null, "raw_text_chars": 406}]}, "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the sociology subject in MMMU.\n\nIn Figure 3, which shows the distribution of subjects across disciplines, I can see:\n\nUnder \"Humanities & Social Sci. (9%)\":\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Psychology (340, 2.94%)\n- **Sociology (287, 2.48%)**\n\nThe number 287 represents the number of samples in the sociology subject.\n\nAnswer: 287", "citations": [], "input_tokens": 103232, "output_tokens": 123, "cost_micros": 311541, "latency_ms": 6562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:19Z", "latency_ms": 62062, "error": null, "raw_text_chars": 760}]}, "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Economics-related error cases that fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics error cases:\n\nFrom the Case Study section (pages 20-21), I can see Table 10 which shows the index of case study figures by subjects with associated error categories.\n\nLooking at the Economics row in Table 10:\n- Economics has: Correct Case (22), Perceptual Error (23, 24)\n\nThis indicates that there are 2 Economics-related error cases in the Perceptual Error category (Figures 23 and 24).\n\nLet me verify by checking the actual figures:\n- Figure 23 (page 37): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n- Figure 24 (page 38): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n\nAnswer: 2", "citations": [], "input_tokens": 259127, "output_tokens": 212, "cost_micros": 1559532, "latency_ms": 62014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_basic_lc", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:21Z", "latency_ms": 5093, "error": null, "raw_text_chars": 174}]}, "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists the subfields of each subject:\n\nFor Energy & Power, the subfields are:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 83153, "output_tokens": 47, "cost_micros": 250164, "latency_ms": 5093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:26Z", "latency_ms": 64485, "error": null, "raw_text_chars": 1513}]}, "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 1 in the document and count the different icons shown as different image types.\n\nLooking at page 1, Figure 1 is titled \"Overview of the MMMU dataset\" and shows four main sections:\n\n1. **Comprehensive Disciplines** - Shows icons/images representing different disciplines (Engineering, Science, Art & Design, Business, Humanities & Social Sci., Medicine)\n\n2. **Heterogeneous Image Types** - Shows various image type icons including:\n   - Diagrams\n   - Tables\n   - Plots and Charts\n   - Photographs\n   - Chemical Structures\n   - Paintings\n   - Medical Images\n   - Sheet Music\n   - Geometric\n   - Pathology images\n   - Microscopic Images\n   - Comics\n\n3. **Interleaved Text and Images** - Shows a medical question example with 3 MRI images\n\n4. **Expert-level Skills Test** - Shows a diagram with three components: Perception, Knowledge, and Reasoning\n\nThe question asks specifically about \"different icon are shown as different image types\" in Figure 1. Looking at the \"Heterogeneous Image Types\" section, I can count the distinct image type icons shown.\n\nCounting the image types listed in the \"Heterogeneous Image Types\" section:\nDiagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, and there's an ellipsis (...) suggesting more types.\n\nBased on what's explicitly visible and labeled in Figure 1, I can count approximately 12 distinct image type representations shown.\n\nAnswer: 12", "citations": [], "input_tokens": 259123, "output_tokens": 378, "cost_micros": 1563243, "latency_ms": 64406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:29Z", "latency_ms": 75843, "error": null, "raw_text_chars": 831}]}, "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts that appear in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Figure 4: A comparison chart showing benchmarks plotted on a 2D graph with \"Breadth (Knowledge)\" on x-axis and \"Depth (Reasoning)\" on y-axis\n- A table comparing different datasets\nTotal: 1 chart\n\n**Page 6:**\n- Table 2: Results table (not a chart)\nTotal: 0 charts\n\n**Page 7:**\n- Figure 5: Bar chart showing \"Performance of models on different types of images\"\n- Table 3: Results table (not a chart)\nTotal: 1 chart\n\n**Page 8:**\n- Figure 6: Pie chart showing \"Error distribution over 150 annotated GPT-4V errors\"\nTotal: 1 chart\n\n**Page 9:**\n- No charts, only text\nTotal: 0 charts\n\n**Page 10:**\n- No charts, only text (references section)\nTotal: 0 charts\n\nTotal charts in pages 5-10: 1 + 0 + 1 + 1 + 0 + 0 = 3\n\nAnswer: 3", "citations": [], "input_tokens": 259120, "output_tokens": 289, "cost_micros": 1561222, "latency_ms": 75796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:30Z", "latency_ms": 77217, "error": null, "raw_text_chars": 404}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 on page 8 of the document, which shows the \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe error categories and their percentages are:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 259144, "output_tokens": 133, "cost_micros": 1557856, "latency_ms": 77171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "azure_premium_lc", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:45Z", "latency_ms": 7750, "error": null, "raw_text_chars": 387}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe pie chart shows the following error categories and their percentages:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 109199, "output_tokens": 125, "cost_micros": 329472, "latency_ms": 7750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:53Z", "latency_ms": 59125, "error": null, "raw_text_chars": 830}]}, "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll search through pages 95-100 for error types mentioned in the case studies.\n\nFrom page 95 (Figure 80 - Architecture and Engineering: Surveying and Mapping):\n- This is a correct case, no error type mentioned\n\nFrom page 96 (Figure 82 - Architecture and Engineering: Civil Engineering):\n- Error Category: Reasoning Error\n\nFrom page 97 (Figure 83 - Computer Science: Data Structure and Algorithm):\n- This is a correct case, no error type mentioned\n\nFrom page 98 (Figure 84 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error, Lack of Knowledge\n\nFrom page 99 (Figure 85 - Computer Science: Operating System):\n- Error Category: Perceptual Error\n\nFrom page 100 (Figure 86 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 259136, "output_tokens": 226, "cost_micros": 1559901, "latency_ms": 59077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:48Z", "latency_ms": 11110, "error": null, "raw_text_chars": 1296}]}, "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases (correct predictions by GPT-4V) in Pages 65-70.\n\nLet me go through each figure in this range:\n\n**Page 65 - Figure 51 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (E) Familial polyposis coli\n- Ground Truth: (E) Familial polyposis coli\n- **SUCCESS**\n\n**Page 66 - Figure 52 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (D) Ruptured berry aneurysm\n- Ground Truth: (D) Ruptured berry aneurysm\n- **SUCCESS**\n\n**Page 67 - Figure 53 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 68 - Figure 54 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 69 - Figure 55 (Clinical Medicine: Heart Disease)**\n- GPT-4V answered (B) Mitral regurgitation\n- Ground Truth: (C) Aortic value regurgitation\n- Error Category: Lack of Knowledge\n- **FAILURE**\n\n**Page 70 - Figure 56 (Diagnostics and Laboratory Medicine: Radiology)**\n- GPT-4V correctly identified (C) Dandy-Walker syndrome\n- Ground Truth: (C) Dandy-Walker syndrome\n- **SUCCESS**\n\nTotal successful cases: 3\n\nAnswer: 3", "citations": [], "input_tokens": 103232, "output_tokens": 424, "cost_micros": 316056, "latency_ms": 11110, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:52Z", "latency_ms": 23593, "error": null, "raw_text_chars": 1091}]}, "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the figure with a tree shape and identify which branch has the least leaves.\n\nLooking through the document, I can see Figure 1 on page 2, which shows a \"Technology tree of RAG research\" with a tree-like structure.\n\nExamining this tree diagram, I can see it has a trunk labeled \"Retrieval-Augmented Generation\" and branches extending upward. The main branches appear to be:\n\n1. A left branch (colored in teal/green) showing the evolution from pre-training through fine-tuning stages\n2. A middle branch (colored in orange) labeled \"Instructlens\" \n3. A right branch (colored in blue) showing the inference stage with multiple methods\n\nLooking at the number of \"leaves\" (individual methods/papers) on each branch:\n- The left branch (Pre-training/Fine-tuning) has several leaves\n- The middle branch (Instructlens) appears to have the fewest leaves - it's relatively short with minimal branching\n- The right branch (Inference) has many leaves with extensive branching\n\nThe branch labeled \"Instructlens\" in the middle appears to have the least number of leaves.\n\nAnswer: Instructlens", "citations": [], "input_tokens": 64633, "output_tokens": 269, "cost_micros": 197934, "latency_ms": 23577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "instructlens", "normalised_gold": "pre training"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "arm": "azure_premium_lc", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:59Z", "latency_ms": 7156, "error": null, "raw_text_chars": 752}]}, "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Appendix F \"PSEUDOCODE FOR RETRIEVAL METHODS\":\n\n**Algorithm 1 Tree Traversal Algorithm:**\n- Lines: function declaration, initialization of Scurrent, for layer loop, initialization of topk, for node loop, score calculation, append to top_k, end for, Slayer assignment, Scurrent update, end for, return statement, end function\n- Total: 13 lines\n\n**Algorithm 2 Collapsed Tree Algorithm:**\n- Lines: function declaration, flatten tree, initialization of top_nodes, for node loop, append with dot product, end for, sort top_nodes, result initialization, total_tokens initialization, for node loop, if condition, result append, end if, total_tokens update, end for, return statement, end function\n- Total: 16 lines\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 27211, "output_tokens": 190, "cost_micros": 84483, "latency_ms": 7156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:06Z", "latency_ms": 16641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:23Z", "latency_ms": 11860, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:36Z", "latency_ms": 12422, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:08:54Z", "latency_ms": 19625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:25Z", "latency_ms": 14078, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:16Z", "latency_ms": 13922, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:30Z", "latency_ms": 25952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:57Z", "latency_ms": 14718, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:09:14Z", "latency_ms": 33452, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:54Z", "latency_ms": 22093, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:09:39Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:09:52Z", "latency_ms": 15921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:10Z", "latency_ms": 21921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:10:34Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:10:53Z", "latency_ms": 13686, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:10:16Z", "latency_ms": 14281, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:10:32Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:45Z", "latency_ms": 11952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:01Z", "latency_ms": 20641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:11:30Z", "latency_ms": 13985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:06Z", "latency_ms": 17157, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"{\\\"message\\\":\\\"Input is too long.\\\"}\",\"provider_name\":\"Amazon Bedrock\",\"is_byok\":false}},\"user_id\":\"user_3CNdnY1vL3Ln9TYRiGAii5kmBvu\"}", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:11:25Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:11:40Z", "latency_ms": 13734, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:59Z", "latency_ms": 15828, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:25Z", "latency_ms": 19985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "original_error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:44Z", "latency_ms": 15875, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:12:00Z", "latency_ms": 16625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:12:19Z", "latency_ms": 12937, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:12:35Z", "latency_ms": 14625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:57Z", "latency_ms": 12156, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "150k"}}
diff --git a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
new file mode 100644
index 0000000000..7c99def5f2
--- /dev/null
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
@@ -0,0 +1,100 @@
+{
+  "config": {
+    "base_delay": 1.0,
+    "concurrency": 2,
+    "llm_model": "anthropic/claude-sonnet-4.5",
+    "max_attempts": 5,
+    "max_delay": 30.0,
+    "max_output_tokens": 512,
+    "pdf_engine": "native"
+  },
+  "elapsed_s": 1373.6,
+  "n_failed_rows_input": 37,
+  "n_retried": 37,
+  "per_arm": {
+    "azure_basic_lc": {
+      "attempts_distribution": [
+        1
+      ],
+      "recovered": 1,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 1
+    },
+    "azure_premium_lc": {
+      "attempts_distribution": [
+        1,
+        1,
+        1
+      ],
+      "recovered": 3,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 3
+    },
+    "llamacloud_basic_lc": {
+      "attempts_distribution": [
+        1,
+        1
+      ],
+      "recovered": 2,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 2
+    },
+    "llamacloud_premium_lc": {
+      "attempts_distribution": [
+        1,
+        1,
+        1,
+        1
+      ],
+      "recovered": 4,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 4
+    },
+    "native_pdf": {
+      "attempts_distribution": [
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5
+      ],
+      "recovered": 15,
+      "recovery_rate": 0.5555555555555556,
+      "still_failed": 12,
+      "tried": 27
+    }
+  },
+  "raw_retries_path": "data\\multimodal_doc\\runs\\2026-05-14T00-53-19Z\\parser_compare\\raw_retries.jsonl",
+  "run_id": "2026-05-14T00-53-19Z",
+  "totals": {
+    "recovered": 25,
+    "still_failed": 12,
+    "tried": 37
+  }
+}
diff --git a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
new file mode 100644
index 0000000000..a4687f64a3
--- /dev/null
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
@@ -0,0 +1,1022 @@
+{
+  "benchmark": "parser_compare",
+  "extra": {
+    "active_arms": [
+      "native_pdf",
+      "azure_basic_lc",
+      "azure_premium_lc",
+      "llamacloud_basic_lc",
+      "llamacloud_premium_lc",
+      "surfsense_agentic"
+    ],
+    "agent_llm_id": -5138454,
+    "concurrency": 2,
+    "llm_model": "anthropic/claude-sonnet-4.5",
+    "n_pdfs": 30,
+    "n_questions": 171,
+    "no_mentions": false,
+    "pdf_engine": "native",
+    "preprocess_tariff": {
+      "basic_per_1k_pages": 1.0,
+      "premium_per_1k_pages": 10.0
+    },
+    "provider_model": "anthropic/claude-sonnet-4.5",
+    "scenario": "head-to-head",
+    "vision_provider_model": "anthropic/claude-sonnet-4.5"
+  },
+  "metrics": {
+    "n_questions": 171,
+    "n_unique_pdfs": 30,
+    "per_arm": {
+      "azure_basic_lc": {
+        "accuracy": 0.543859649122807,
+        "ci_high": 0.6167298584106974,
+        "ci_low": 0.46906215053943956,
+        "f1_mean": 0.5655925434323409,
+        "input_tokens_mean": 31882.88888888889,
+        "latency_ms_median": 7094,
+        "latency_ms_p95": 11984,
+        "llm_cost_per_q": 0.09939822807017544,
+        "n": 171,
+        "n_correct": 93,
+        "n_total": 171,
+        "output_tokens_mean": 249.97076023391813,
+        "preprocess_cost_per_q": 0.006771929824561403,
+        "preprocess_cost_total": 1.158,
+        "preprocess_label": "basic tier ($/basic/page = $0.0010)",
+        "preprocess_per_page_usd": 0.001,
+        "total_cost_per_q": 0.10617015789473684
+      },
+      "azure_premium_lc": {
+        "accuracy": 0.5672514619883041,
+        "ci_high": 0.6392285382926538,
+        "ci_low": 0.4923192087642231,
+        "f1_mean": 0.5956074243139755,
+        "input_tokens_mean": 39786.783625731,
+        "latency_ms_median": 6858,
+        "latency_ms_p95": 11608,
+        "llm_cost_per_q": 0.13734295321637427,
+        "n": 171,
+        "n_correct": 97,
+        "n_total": 171,
+        "output_tokens_mean": 223.39766081871346,
+        "preprocess_cost_per_q": 0.06771929824561404,
+        "preprocess_cost_total": 11.58,
+        "preprocess_label": "premium tier ($/premium/page = $0.0100)",
+        "preprocess_per_page_usd": 0.01,
+        "total_cost_per_q": 0.2050622514619883
+      },
+      "llamacloud_basic_lc": {
+        "accuracy": 0.5029239766081871,
+        "ci_high": 0.5769717486785515,
+        "ci_low": 0.42874771858479094,
+        "f1_mean": 0.5323894603029942,
+        "input_tokens_mean": 31493.333333333332,
+        "latency_ms_median": 7125,
+        "latency_ms_p95": 11922,
+        "llm_cost_per_q": 0.09812833333333335,
+        "n": 171,
+        "n_correct": 86,
+        "n_total": 171,
+        "output_tokens_mean": 243.22222222222223,
+        "preprocess_cost_per_q": 0.006771929824561403,
+        "preprocess_cost_total": 1.158,
+        "preprocess_label": "basic tier ($/basic/page = $0.0010)",
+        "preprocess_per_page_usd": 0.001,
+        "total_cost_per_q": 0.10490026315789475
+      },
+      "llamacloud_premium_lc": {
+        "accuracy": 0.5847953216374269,
+        "ci_high": 0.6559957259750998,
+        "ci_low": 0.5098688246618316,
+        "f1_mean": 0.6109434348065313,
+        "input_tokens_mean": 39130.666666666664,
+        "latency_ms_median": 6844,
+        "latency_ms_p95": 12656,
+        "llm_cost_per_q": 0.12080787719298246,
+        "n": 171,
+        "n_correct": 100,
+        "n_total": 171,
+        "output_tokens_mean": 227.7251461988304,
+        "preprocess_cost_per_q": 0.06771929824561404,
+        "preprocess_cost_total": 11.58,
+        "preprocess_label": "premium tier ($/premium/page = $0.0100)",
+        "preprocess_per_page_usd": 0.01,
+        "total_cost_per_q": 0.18852717543859648
+      },
+      "native_pdf": {
+        "accuracy": 0.47953216374269003,
+        "ci_high": 0.5540343616319661,
+        "ci_low": 0.40592936752463654,
+        "f1_mean": 0.5040888253091556,
+        "input_tokens_mean": 65772.61988304094,
+        "latency_ms_median": 29484,
+        "latency_ms_p95": 60530,
+        "llm_cost_per_q": 0.25520109356725146,
+        "n": 171,
+        "n_correct": 82,
+        "n_total": 171,
+        "output_tokens_mean": 209.09941520467837,
+        "preprocess_cost_per_q": 0.0,
+        "preprocess_cost_total": 0.0,
+        "preprocess_label": "n/a (PDF attached natively)",
+        "preprocess_per_page_usd": 0.0,
+        "total_cost_per_q": 0.25520109356725146
+      },
+      "surfsense_agentic": {
+        "accuracy": 0.5321637426900585,
+        "ci_high": 0.6054202002109953,
+        "ci_low": 0.4574939396857719,
+        "f1_mean": 0.5430214722229827,
+        "input_tokens_mean": 0.0,
+        "latency_ms_median": 52827,
+        "latency_ms_p95": 164109,
+        "llm_cost_per_q": 0.0,
+        "n": 171,
+        "n_correct": 91,
+        "n_total": 171,
+        "output_tokens_mean": 0.0,
+        "preprocess_cost_per_q": 0.06771929824561404,
+        "preprocess_cost_total": 11.58,
+        "preprocess_label": "premium tier (ingested by SurfSense at processing_mode=premium + vision_llm=on)",
+        "preprocess_per_page_usd": 0.01,
+        "total_cost_per_q": 0.06771929824561404
+      }
+    },
+    "per_pdf": {
+      "05-03-18-political-release.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 17
+      },
+      "0b85477387a9d0cc33fca0f4becaa0e5.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 0.888888888888889
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 0.888888888888889
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.7272727272727273
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 0.888888888888889
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 0.888888888888889
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.28571428571428575
+          }
+        },
+        "pages": 16
+      },
+      "0e94b4197b10096b1f4c699701570fbf.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 15
+      },
+      "11-21-16-Updated-Post-Election-Release.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.6666666666666666
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.6666666666666666
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 45
+      },
+      "12-15-15-ISIS-and-terrorism-release-final.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 28
+      },
+      "2005.12872v3.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 26
+      },
+      "2021-Apple-Catalog.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.0
+          }
+        },
+        "pages": 55
+      },
+      "2023.acl-long.386.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.5
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.5
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.5
+          }
+        },
+        "pages": 24
+      },
+      "2023.findings-emnlp.248.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 19
+      },
+      "2024.ug.eprospectus.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.8
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 27
+      },
+      "2210.02442v1.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.0
+          }
+        },
+        "pages": 24
+      },
+      "2303.05039v2.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 9
+      },
+      "2303.08559v2.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 30
+      },
+      "2305.13186v3.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.5
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.5
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.0
+          }
+        },
+        "pages": 27
+      },
+      "2305.14160v4.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 16
+      },
+      "2306.05425v1.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.3333333333333333
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.4
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.4
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.0
+          }
+        },
+        "pages": 22
+      },
+      "2307.09288v2.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.0
+          }
+        },
+        "pages": 77
+      },
+      "2309.17421v2.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 166
+      },
+      "2310.05634v2.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.28571428571428575
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.5
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.6
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.0
+          }
+        },
+        "pages": 24
+      },
+      "2310.07609v1.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 10
+      },
+      "2310.09158v1.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 23
+      },
+      "2311.16502v3.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 117
+      },
+      "2312.04350v3.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 28
+      },
+      "2312.09390v1.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.11764705882352941
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.25
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.2222222222222222
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.16666666666666666
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.11764705882352941
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.16666666666666666
+          }
+        },
+        "pages": 49
+      },
+      "2312.10997v5.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 21
+      },
+      "2401.18059v1.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": false,
+            "f1": 0.28571428571428575
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.3333333333333333
+          },
+          "llamacloud_basic_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.4
+          },
+          "surfsense_agentic": {
+            "correct": false,
+            "f1": 0.0
+          }
+        },
+        "pages": 23
+      },
+      "2405.09818v1.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": false,
+            "f1": 0.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 27
+      },
+      "3276a5b991c49cf5f9a4af0f7d6fce67.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 0.6666666666666666
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 0.6666666666666666
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 0.6666666666666666
+          }
+        },
+        "pages": 16
+      },
+      "379f44022bb27aa53efd5d322c7b57bf.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 17
+      },
+      "3M_2018_10K.pdf": {
+        "arms": {
+          "azure_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "azure_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_basic_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "llamacloud_premium_lc": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "native_pdf": {
+            "correct": true,
+            "f1": 1.0
+          },
+          "surfsense_agentic": {
+            "correct": true,
+            "f1": 1.0
+          }
+        },
+        "pages": 160
+      }
+    },
+    "total_pages_in_scope": 1158
+  },
+  "raw_path": "raw.jsonl",
+  "suite": "multimodal_doc"
+}
diff --git a/surfsense_evals/pyproject.toml b/surfsense_evals/pyproject.toml
new file mode 100644
index 0000000000..3a03e1ed84
--- /dev/null
+++ b/surfsense_evals/pyproject.toml
@@ -0,0 +1,63 @@
+[project]
+name = "surfsense-evals"
+version = "0.1.0"
+description = "Domain-agnostic evaluation harness for SurfSense (medical RAG suite ships first; legal/finance/code suites slot in under suites/)."
+readme = "README.md"
+requires-python = ">=3.12"
+license = { text = "Apache-2.0" }
+authors = [{ name = "SurfSense" }]
+
+dependencies = [
+    "httpx>=0.27.0",
+    "httpx-sse>=0.4.0",
+    "datasets>=2.21.0",
+    "huggingface_hub>=0.24.0",
+    "reportlab>=4.0.0",
+    "Pillow>=10.0.0",
+    "pyarrow>=15.0.0",
+    "pydantic>=2.6.0",
+    "tqdm>=4.66.0",
+    "numpy>=1.26.0",
+    "scikit-learn>=1.4.0",
+    "scipy>=1.12.0",
+    "python-dotenv>=1.0.0",
+    "rich>=13.7.0",
+    "trafilatura>=1.12.0",
+    "pypdf>=5.1.0",
+    "azure-ai-documentintelligence>=1.0.2",
+    "llama-cloud-services>=0.6.25",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.23.0",
+    "respx>=0.21.0",
+    "ruff>=0.5.0",
+]
+
+[project.scripts]
+surfsense-evals = "surfsense_evals.core.cli:main"
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["surfsense_evals*"]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]
+markers = [
+    "integration: opt-in tests that hit a live SurfSense instance (run with `-m integration`)",
+]
+
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "SIM", "ASYNC"]
+ignore = ["E501"]
diff --git a/surfsense_evals/reports/.gitignore b/surfsense_evals/reports/.gitignore
new file mode 100644
index 0000000000..e33c279369
--- /dev/null
+++ b/surfsense_evals/reports/.gitignore
@@ -0,0 +1,13 @@
+# Default: don't track auto-generated `summary.md` / `summary.json` from
+# every benchmark run — those are derivative of `data/.../runs/<id>/`.
+*
+!.gitignore
+
+# Hand-curated sample report kept as a reference for the medical suite.
+!medical/
+!medical/sample_summary.md
+
+# Hand-curated blog-ready writeups (one per experiment) — these *are*
+# the public citation surface and must travel with the repo.
+!blog/
+!blog/*.md
diff --git a/surfsense_evals/scripts/analyze_failure_timing.py b/surfsense_evals/scripts/analyze_failure_timing.py
new file mode 100644
index 0000000000..f4f8aedba4
--- /dev/null
+++ b/surfsense_evals/scripts/analyze_failure_timing.py
@@ -0,0 +1,125 @@
+"""Were the SSL failures clustered in time (network blip) or evenly
+distributed (sustained limit)? Group failures by 1-min buckets using
+the run start time and the per-row latency_ms / answer order.
+
+Also: for the one *real* intrinsic failure — the 30MB Anthropic limit
+on 2405.09818v1.pdf::Q007 — print the full error message + raw payload
+sizes so the blog has a clean root cause.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+PDFS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+
+
+def main() -> None:
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    # 1) SSL clustering: failures by question index per arm
+    by_arm_idx: dict[str, list[tuple[int, str]]] = defaultdict(list)
+    qid_order: dict[str, int] = {}
+    arm_seen_count: dict[str, int] = defaultdict(int)
+    for row in rows:
+        arm = row["arm"]
+        idx = arm_seen_count[arm]
+        arm_seen_count[arm] += 1
+        qid_order[f"{arm}::{row['qid']}"] = idx
+        err = row.get("error") or ""
+        cluster = "ssl" if "SSLError" in err else (
+            "empty" if not (row.get("raw_text") or "").strip() and not err else (
+                "5xx" if "502" in err or "503" in err else (
+                    "size_limit" if "exceeds" in err.lower() and "limit" in err.lower() else (
+                        "other_err" if err else "ok"
+                    )
+                )
+            )
+        )
+        if cluster != "ok":
+            by_arm_idx[arm].append((idx, cluster))
+
+    print("=" * 80)
+    print("SSL/network-error indices per arm (each arm processes 171 questions in")
+    print("order; index = sequential position within that arm). Tight clustering")
+    print("in time = transient blip, even spread = sustained limit.")
+    print("=" * 80)
+    for arm in sorted(by_arm_idx):
+        items = by_arm_idx[arm]
+        if not items:
+            continue
+        idxs = sorted(set(i for i, _ in items))
+        print(f"\n{arm}: {len(items)} failures at indices {idxs}")
+        # show clusters
+        cluster_runs = []
+        cur = [idxs[0]]
+        for i in idxs[1:]:
+            if i - cur[-1] <= 5:  # within 5 questions = same time window
+                cur.append(i)
+            else:
+                cluster_runs.append(cur)
+                cur = [i]
+        cluster_runs.append(cur)
+        print(f"   clusters (gap<=5): {len(cluster_runs)}: {cluster_runs}")
+
+    # 2) The 30MB intrinsic failure — full details
+    print()
+    print("=" * 80)
+    print("Intrinsic failure: 30MB Anthropic input limit on 2405.09818v1.pdf::Q007")
+    print("=" * 80)
+    for row in rows:
+        if row["qid"] == "2405.09818v1.pdf::Q007" and row["arm"] == "native_pdf":
+            err = row.get("error") or ""
+            print(f"  qid: {row['qid']}")
+            print(f"  doc: {row['doc_id']}, pages: {row.get('pages')}")
+            pdf_path = PDFS / row["doc_id"]
+            if pdf_path.exists():
+                size_mb = pdf_path.stat().st_size / (1024 * 1024)
+                print(f"  PDF size on disk: {size_mb:.1f} MB")
+                # base64 inflates ~33%
+                est_b64 = size_mb * 1.33
+                print(f"  estimated base64 wire size: {est_b64:.1f} MB")
+            print(f"  full error: {err[:600]}")
+            break
+
+    # 3) Per-PDF: which PDFs are pathological?
+    print()
+    print("=" * 80)
+    print("Per-PDF failure breakdown across all 6 arms (only PDFs with failures)")
+    print("=" * 80)
+    by_pdf: dict[str, list[dict]] = defaultdict(list)
+    for row in rows:
+        err = row.get("error") or ""
+        empty = not (row.get("raw_text") or "").strip()
+        if err or empty:
+            by_pdf[row["doc_id"]].append({
+                "arm": row["arm"],
+                "qid": row["qid"],
+                "err_kind": (
+                    "ssl" if "SSLError" in err
+                    else "size_limit" if "exceeds" in err.lower() and "limit" in err.lower()
+                    else "5xx" if "502" in err or "503" in err
+                    else "json_decode" if "JSONDecodeError" in err
+                    else "empty" if empty and not err
+                    else "other"
+                ),
+                "pages": row.get("pages"),
+            })
+    for doc, items in sorted(by_pdf.items(), key=lambda x: (-len(x[1]), x[0])):
+        kinds = Counter(i["err_kind"] for i in items)
+        arms = sorted({i["arm"] for i in items})
+        pages = items[0]["pages"]
+        print(f"  {doc}  pages={pages}  failures={len(items)}  arms={arms}")
+        print(f"     kinds: {dict(kinds)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/analyze_failures.py b/surfsense_evals/scripts/analyze_failures.py
new file mode 100644
index 0000000000..e7ace1e1bb
--- /dev/null
+++ b/surfsense_evals/scripts/analyze_failures.py
@@ -0,0 +1,155 @@
+"""Drill into the parser_compare n=171 raw.jsonl to surface every
+failure, group by arm + PDF, and dump the underlying error strings so
+we can write up a clean failure-mode taxonomy for the blog post.
+
+Outputs (printed to stdout + written to `failures_n171.json`):
+* per-arm failure count and rate
+* per-PDF failure count across all arms (which docs are pathological?)
+* error-string clusters per arm (so we can give human-readable causes)
+* sample failure rows (one per cluster) for the appendix
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any
+
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+OUT = REPO / "scripts" / "failures_n171.json"
+
+
+def _classify(error: str | None, raw_text: str) -> str:
+    """Coarse-grained bucket for an error message."""
+
+    blob = (error or "").lower()
+    if not blob and not raw_text.strip():
+        return "empty_response"
+    if "rate limit" in blob or "429" in blob:
+        return "rate_limit"
+    if "context_length" in blob or "context window" in blob or "too many tokens" in blob:
+        return "context_overflow"
+    if "could not process image" in blob or "invalid image" in blob:
+        return "image_decode_failure"
+    if "could not process pdf" in blob or "invalid_request_error" in blob and "pdf" in blob:
+        return "pdf_decode_failure"
+    if "timeout" in blob or "timed out" in blob:
+        return "timeout"
+    if "5xx" in blob or "internal server error" in blob or "503" in blob or "502" in blob:
+        return "provider_5xx"
+    if "filenotfound" in blob:
+        return "missing_extraction"
+    if "badrequest" in blob:
+        return "provider_400"
+    if blob:
+        return "other_error"
+    return "unknown"
+
+
+def main() -> None:
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    by_arm_failures: dict[str, list[dict]] = defaultdict(list)
+    by_pdf_failures: dict[str, list[dict]] = defaultdict(list)
+    error_clusters: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
+
+    n_per_arm: dict[str, int] = defaultdict(int)
+    for row in rows:
+        arm = row["arm"]
+        n_per_arm[arm] += 1
+        err = row.get("error")
+        raw_text = row.get("raw_text") or ""
+        if err or not raw_text.strip():
+            cluster = _classify(err, raw_text)
+            entry = {
+                "qid": row["qid"],
+                "doc_id": row["doc_id"],
+                "answer_format": row["answer_format"],
+                "gold": row["gold"],
+                "error": err,
+                "cluster": cluster,
+                "raw_text_len": len(raw_text),
+                "pages": row.get("pages"),
+            }
+            by_arm_failures[arm].append(entry)
+            by_pdf_failures[row["doc_id"]].append({**entry, "arm": arm})
+            error_clusters[arm][cluster].append(entry)
+
+    print("=" * 90)
+    print("Per-arm failure count & rate")
+    print("=" * 90)
+    print(f"{'arm':<25} {'n':>4} {'fail':>5} {'rate%':>6}")
+    for arm in sorted(n_per_arm):
+        f = len(by_arm_failures[arm])
+        n = n_per_arm[arm]
+        print(f"{arm:<25} {n:>4} {f:>5} {f / n * 100:>5.1f}%")
+
+    print()
+    print("=" * 90)
+    print("Failure clusters per arm")
+    print("=" * 90)
+    for arm in sorted(error_clusters):
+        print(f"\n{arm}:")
+        for cluster, items in sorted(error_clusters[arm].items()):
+            print(f"  {cluster:<22} {len(items):>3}")
+            sample = items[0]
+            err_short = (sample["error"] or "")[:200].replace("\n", " ")
+            print(f"     example: {sample['qid']}  doc={sample['doc_id']} pages={sample['pages']}")
+            print(f"     error: {err_short}")
+
+    print()
+    print("=" * 90)
+    print("Per-PDF failure totals (PDFs with >=2 failures)")
+    print("=" * 90)
+    pdf_counts = Counter({pdf: len(rows) for pdf, rows in by_pdf_failures.items()})
+    for pdf, count in pdf_counts.most_common():
+        if count < 2:
+            break
+        arms_failed = sorted({r["arm"] for r in by_pdf_failures[pdf]})
+        pages = by_pdf_failures[pdf][0].get("pages")
+        print(f"  {pdf}  pages={pages}  failures={count}  arms={arms_failed}")
+
+    print()
+    print("=" * 90)
+    print("All native_pdf failures (one row per failure)")
+    print("=" * 90)
+    for entry in by_arm_failures.get("native_pdf", []):
+        err = (entry["error"] or "(no error string)")[:240].replace("\n", " ")
+        print(f"  {entry['qid']}  doc={entry['doc_id']} pages={entry['pages']} cluster={entry['cluster']}")
+        print(f"    err: {err}")
+
+    summary: dict[str, Any] = {
+        "per_arm": {
+            arm: {
+                "n": n_per_arm[arm],
+                "failures": len(by_arm_failures[arm]),
+                "rate": len(by_arm_failures[arm]) / n_per_arm[arm],
+                "clusters": {
+                    cluster: len(items)
+                    for cluster, items in error_clusters[arm].items()
+                },
+                "rows": by_arm_failures[arm],
+            }
+            for arm in sorted(n_per_arm)
+        },
+        "per_pdf": {
+            pdf: [
+                {**r, "arm": r["arm"]} for r in failures
+            ]
+            for pdf, failures in by_pdf_failures.items()
+        },
+    }
+    OUT.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    print(f"\nWrote: {OUT}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/check_extraction_sizes.py b/surfsense_evals/scripts/check_extraction_sizes.py
new file mode 100644
index 0000000000..712e693cbb
--- /dev/null
+++ b/surfsense_evals/scripts/check_extraction_sizes.py
@@ -0,0 +1,60 @@
+"""Sanity check extraction sizes against Sonnet 4.5's context window.
+
+Sonnet 4.5 supports ~200k tokens. As a *very* rough heuristic, English
+markdown is ~4 chars/token, so anything over ~750k chars likely won't
+fit alongside the system + question + 512 max_output_tokens. Print
+warnings for any extraction that's at risk.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+MAP = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+
+CHARS_PER_TOKEN = 4
+CTX_TOKENS = 200_000
+PROMPT_OVERHEAD_TOKENS = 1_000  # system + question + format hint
+MAX_OUTPUT_TOKENS = 512
+SAFE_CHARS = (CTX_TOKENS - PROMPT_OVERHEAD_TOKENS - MAX_OUTPUT_TOKENS) * CHARS_PER_TOKEN
+
+
+def main() -> None:
+    rows = [
+        json.loads(line)
+        for line in MAP.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    total = len(rows)
+    arm_max: dict[str, tuple[int, str]] = {}
+    overflows: list[tuple[str, str, int]] = []
+    for row in rows:
+        for arm, ext in (row.get("extractions") or {}).items():
+            chars = int(ext.get("chars") or 0)
+            if arm not in arm_max or arm_max[arm][0] < chars:
+                arm_max[arm] = (chars, row["doc_id"])
+            if chars > SAFE_CHARS:
+                overflows.append((row["doc_id"], arm, chars))
+
+    print(f"PDFs in manifest: {total}")
+    print(f"safe char budget: {SAFE_CHARS:,}  (~{(SAFE_CHARS // CHARS_PER_TOKEN):,} tokens)")
+    print()
+    print("largest extraction per arm:")
+    for arm, (chars, doc_id) in sorted(arm_max.items()):
+        print(f"  {arm:25s}  {chars:>10,} chars  ({doc_id})")
+
+    print()
+    if overflows:
+        print(f"OVERFLOW RISK ({len(overflows)} extractions > safe budget):")
+        for doc_id, arm, chars in overflows:
+            est_tokens = chars // CHARS_PER_TOKEN
+            print(f"  {doc_id} :: {arm} :: {chars:,} chars (~{est_tokens:,} tokens)")
+    else:
+        print("no overflow risk — all extractions fit Sonnet 4.5's 200k context.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/check_uploaded_status.py b/surfsense_evals/scripts/check_uploaded_status.py
new file mode 100644
index 0000000000..7021ba83da
--- /dev/null
+++ b/surfsense_evals/scripts/check_uploaded_status.py
@@ -0,0 +1,77 @@
+"""Query SurfSense for the status of every MMLongBench PDF in scope.
+
+Uses the existing SurfSense documents client to query
+``/documents/status?document_ids=...`` for both the known-existing 5
+PDFs (doc ids 5219-5223) and the recently-uploaded mmlongbench batch
+(7577-7600 range).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+
+import httpx
+from dotenv import load_dotenv
+
+
+REPO = Path(__file__).resolve().parents[1]
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+
+
+async def main() -> None:
+    load_dotenv(REPO / ".env")
+    base = os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
+    token = os.environ.get("SURFSENSE_JWT")
+    if not token:
+        raise SystemExit("SURFSENSE_JWT missing from .env")
+
+    pdf_names = sorted(p.name for p in PDF_DIR.glob("*.pdf"))
+    print(f"local cached PDFs: {len(pdf_names)}")
+
+    candidate_ids = list(range(5219, 5224)) + list(range(7577, 7625))
+
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Accept": "application/json",
+    }
+    async with httpx.AsyncClient(timeout=30.0) as http:
+        r = await http.get(
+            f"{base}/api/v1/documents/status",
+            params={
+                "search_space_id": 55,
+                "document_ids": ",".join(str(d) for d in candidate_ids),
+            },
+            headers=headers,
+        )
+        r.raise_for_status()
+        items = r.json().get("items", [])
+
+    by_title: dict[str, dict] = {}
+    for it in items:
+        by_title[it.get("title", "")] = {
+            "id": it.get("id"),
+            "state": (it.get("status") or {}).get("state"),
+            "reason": (it.get("status") or {}).get("reason"),
+        }
+
+    by_state: dict[str, int] = {}
+    print()
+    for name in pdf_names:
+        info = by_title.get(name)
+        if info is None:
+            print(f"  [missing      ]              {name}")
+            by_state["missing"] = by_state.get("missing", 0) + 1
+        else:
+            tag = info["state"] or "?"
+            print(f"  [{tag:13s}] doc_id={info['id']:>5}  {name}")
+            by_state[tag] = by_state.get(tag, 0) + 1
+    print()
+    print("summary:")
+    for k, v in sorted(by_state.items()):
+        print(f"  {k}: {v}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/surfsense_evals/scripts/compute_adjusted_accuracy.py b/surfsense_evals/scripts/compute_adjusted_accuracy.py
new file mode 100644
index 0000000000..13693c055a
--- /dev/null
+++ b/surfsense_evals/scripts/compute_adjusted_accuracy.py
@@ -0,0 +1,112 @@
+"""Compute "intrinsic" accuracy by removing transient network errors.
+
+A failure is *transient* if it's:
+  * SSLError: SSL bad-record-mac (TLS hiccup)
+  * Cloudflare 502 / 503 (provider-side load shedding)
+  * empty_response with no error string and no other signal (likely
+    connection reset mid-stream)
+  * JSONDecodeError (parse error mid-stream)
+
+A failure is *intrinsic* if it's a hard limit:
+  * "exceeds .* limit" (size limits)
+  * context_length errors
+  * provider 400 with image / pdf decode failure
+  * malformed-input failures
+
+We re-compute accuracy with two denominators:
+  * raw acc       = correct / 171  (what the headline reports)
+  * adjusted acc  = correct / (171 - transient_failures)  (intrinsic)
+
+Outputs a table that we can drop straight into the blog.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+
+
+TRANSIENT_HINTS = (
+    "sslv3_alert_bad_record_mac",
+    "ssl_alert_bad_record_mac",
+    "ssl: ssl",
+    "cloudflare",
+    "error 502",
+    "error 503",
+    "bad gateway",
+    "service unavailable",
+    "gateway timeout",
+    "jsondecodeerror",
+)
+INTRINSIC_HINTS = (
+    "exceeds",
+    "context_length",
+    "context window",
+    "could not process pdf",
+    "could not process image",
+)
+
+
+def classify(error: str | None, raw_text: str) -> str:
+    err = (error or "").lower()
+    if not err and not raw_text.strip():
+        return "transient_empty"
+    if any(h in err for h in TRANSIENT_HINTS):
+        return "transient_ssl_or_5xx"
+    if any(h in err for h in INTRINSIC_HINTS):
+        return "intrinsic_limit"
+    if err:
+        return "other_error"
+    return "ok"
+
+
+def main() -> None:
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+    by_arm: dict[str, dict] = defaultdict(lambda: {
+        "n": 0, "correct": 0,
+        "transient_ssl_or_5xx": 0, "transient_empty": 0,
+        "intrinsic_limit": 0, "other_error": 0,
+    })
+    for row in rows:
+        arm = row["arm"]
+        m = by_arm[arm]
+        m["n"] += 1
+        graded = row.get("graded") or {}
+        if graded.get("correct"):
+            m["correct"] += 1
+        kind = classify(row.get("error"), row.get("raw_text") or "")
+        if kind != "ok":
+            m[kind] += 1
+
+    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
+    print("-" * 88)
+    for arm in sorted(by_arm):
+        m = by_arm[arm]
+        raw = m["correct"] / m["n"] * 100
+        transient = m["transient_ssl_or_5xx"] + m["transient_empty"]
+        intrinsic = m["intrinsic_limit"]
+        other = m["other_error"]
+        usable = m["n"] - transient
+        adj = m["correct"] / usable * 100 if usable else 0
+        print(
+            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
+        )
+
+    print()
+    print("transient   = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would")
+    print("              succeed on retry; eval harness has no built-in retry today).")
+    print("intrinsic   = hard limit (e.g. >30MB Anthropic request, model context overflow).")
+    print("adj acc%    = correct / (n - transient) — what the arm scores when network noise")
+    print("              is removed; closest thing we have to a like-for-like quality number.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/compute_blog_extras.py b/surfsense_evals/scripts/compute_blog_extras.py
new file mode 100644
index 0000000000..abe88d08b7
--- /dev/null
+++ b/surfsense_evals/scripts/compute_blog_extras.py
@@ -0,0 +1,381 @@
+"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
+per-PDF heterogeneity, latency/token distribution percentiles.
+
+Reads the merged post-retry artifact:
+
+    data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
+
+Outputs to stdout:
+
+  1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
+  2) Per-arm input/output token distribution (mean, p50, p95, max).
+  3) McNemar pairwise table: for every (arm_i, arm_j) ordered pair on the
+     same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
+     b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
+     two-sided p-value. We include both raw (using the original raw.jsonl)
+     and post-retry results.
+  4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
+
+Pure stdlib — no scipy/numpy.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import statistics
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+
+
+# ---------------------------------------------------------------------------
+# I/O
+# ---------------------------------------------------------------------------
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    out: list[dict] = []
+    with path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            out.append(json.loads(line))
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Distribution helpers
+# ---------------------------------------------------------------------------
+
+
+def _percentile(values: list[float], p: float) -> float:
+    """Linear-interpolation percentile (p in [0, 100])."""
+
+    if not values:
+        return 0.0
+    s = sorted(values)
+    if len(s) == 1:
+        return float(s[0])
+    k = (len(s) - 1) * (p / 100.0)
+    lo, hi = math.floor(k), math.ceil(k)
+    if lo == hi:
+        return float(s[int(k)])
+    return float(s[lo] + (s[hi] - s[lo]) * (k - lo))
+
+
+# ---------------------------------------------------------------------------
+# McNemar exact-binomial p-value
+# ---------------------------------------------------------------------------
+
+
+def _binom_coef(n: int, k: int) -> int:
+    if k < 0 or k > n:
+        return 0
+    return math.comb(n, k)
+
+
+def _mcnemar_exact_pvalue(b: int, c: int) -> float:
+    """Two-sided exact-binomial McNemar p-value.
+
+    Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
+    on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
+    The two-sided p-value is
+
+        P(X <= min(b, c)) + P(X >= max(b, c))
+
+    computed exactly (cheap because b+c <= 27 in our run).
+    """
+
+    n = b + c
+    if n == 0:
+        return 1.0
+    k = min(b, c)
+    # Two-sided exact: 2 * P(X <= k) clipped at 1.0
+    cdf = sum(_binom_coef(n, i) for i in range(k + 1))
+    p = 2.0 * cdf / (2 ** n)
+    return min(1.0, p)
+
+
+def _mcnemar_table(rows: list[dict]) -> dict:
+    """Group rows -> {qid: {arm: bool_correct}} and compute pairwise."""
+
+    by_qid: dict[str, dict[str, bool]] = {}
+    arms_seen: set[str] = set()
+    for r in rows:
+        qid = r["qid"]
+        arm = r["arm"]
+        graded = r.get("graded") or {}
+        correct = bool(graded.get("correct"))
+        by_qid.setdefault(qid, {})[arm] = correct
+        arms_seen.add(arm)
+
+    arms = sorted(arms_seen)
+    qids = sorted(by_qid)
+    out: dict[str, dict] = {"arms": arms, "n_qids": len(qids), "pairs": []}
+    for i, ai in enumerate(arms):
+        for aj in arms[i + 1:]:
+            b = c = both = neither = 0
+            for q in qids:
+                row = by_qid[q]
+                if ai not in row or aj not in row:
+                    continue
+                ci, cj = row[ai], row[aj]
+                if ci and not cj:
+                    b += 1
+                elif cj and not ci:
+                    c += 1
+                elif ci and cj:
+                    both += 1
+                else:
+                    neither += 1
+            p = _mcnemar_exact_pvalue(b, c)
+            out["pairs"].append({
+                "arm_i": ai, "arm_j": aj,
+                "b_i_only": b, "c_j_only": c,
+                "both_correct": both, "both_wrong": neither,
+                "p_value": p,
+            })
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Per-PDF heterogeneity
+# ---------------------------------------------------------------------------
+
+
+def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
+    """For each arm, per-PDF accuracy = correct/total questions on that PDF."""
+
+    bucket: dict[str, dict[str, list[bool]]] = {}
+    for r in rows:
+        arm = r["arm"]
+        pdf = r["doc_id"]
+        graded = r.get("graded") or {}
+        bucket.setdefault(arm, {}).setdefault(pdf, []).append(
+            bool(graded.get("correct"))
+        )
+
+    out: dict[str, dict] = {}
+    for arm, pdfs in bucket.items():
+        accs = [sum(b) / len(b) for b in pdfs.values() if b]
+        if not accs:
+            continue
+        out[arm] = {
+            "n_pdfs": len(accs),
+            "mean": statistics.mean(accs),
+            "std": statistics.stdev(accs) if len(accs) > 1 else 0.0,
+            "min": min(accs),
+            "max": max(accs),
+            "p25": _percentile(accs, 25),
+            "p50": _percentile(accs, 50),
+            "p75": _percentile(accs, 75),
+            "n_pdfs_zero": sum(1 for a in accs if a == 0.0),
+            "n_pdfs_perfect": sum(1 for a in accs if a == 1.0),
+        }
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Latency / token distributions
+# ---------------------------------------------------------------------------
+
+
+def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
+    by_arm: dict[str, list[float]] = {}
+    for r in rows:
+        lat = r.get("latency_ms")
+        if lat is None or lat == 0:
+            continue
+        by_arm.setdefault(r["arm"], []).append(float(lat))
+    out: dict[str, dict] = {}
+    for arm, lats in by_arm.items():
+        out[arm] = {
+            "n": len(lats),
+            "mean_s": statistics.mean(lats) / 1000,
+            "std_s": (statistics.stdev(lats) / 1000) if len(lats) > 1 else 0.0,
+            "p10_s": _percentile(lats, 10) / 1000,
+            "p25_s": _percentile(lats, 25) / 1000,
+            "p50_s": _percentile(lats, 50) / 1000,
+            "p75_s": _percentile(lats, 75) / 1000,
+            "p90_s": _percentile(lats, 90) / 1000,
+            "p95_s": _percentile(lats, 95) / 1000,
+            "p99_s": _percentile(lats, 99) / 1000,
+            "max_s": max(lats) / 1000,
+            # Coefficient of variation: std / mean (unitless tail-fatness).
+            "cv": (
+                statistics.stdev(lats) / statistics.mean(lats)
+                if len(lats) > 1 and statistics.mean(lats) > 0 else 0.0
+            ),
+        }
+    return out
+
+
+def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
+    by_arm_in: dict[str, list[float]] = {}
+    by_arm_out: dict[str, list[float]] = {}
+    for r in rows:
+        t_in = r.get("input_tokens") or 0
+        t_out = r.get("output_tokens") or 0
+        if t_in:
+            by_arm_in.setdefault(r["arm"], []).append(float(t_in))
+        if t_out:
+            by_arm_out.setdefault(r["arm"], []).append(float(t_out))
+    out: dict[str, dict] = {}
+    for arm in sorted(set(by_arm_in) | set(by_arm_out)):
+        in_vals = by_arm_in.get(arm, [])
+        out_vals = by_arm_out.get(arm, [])
+        if not in_vals and not out_vals:
+            continue
+        entry: dict = {}
+        if in_vals:
+            entry["input"] = {
+                "n": len(in_vals),
+                "mean": statistics.mean(in_vals),
+                "p50": _percentile(in_vals, 50),
+                "p95": _percentile(in_vals, 95),
+                "max": max(in_vals),
+            }
+        if out_vals:
+            entry["output"] = {
+                "n": len(out_vals),
+                "mean": statistics.mean(out_vals),
+                "p50": _percentile(out_vals, 50),
+                "p95": _percentile(out_vals, 95),
+                "max": max(out_vals),
+            }
+        out[arm] = entry
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Pretty-printing
+# ---------------------------------------------------------------------------
+
+
+def _print_latency(title: str, lat: dict[str, dict]) -> None:
+    print()
+    print(title)
+    print("-" * len(title))
+    header = (f"{'arm':<25} {'n':>4} {'mean':>7} {'std':>7} "
+              f"{'p50':>7} {'p90':>7} {'p95':>7} {'p99':>7} {'max':>7} {'CV':>5}")
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(lat, key=lambda a: lat[a]["mean_s"]):
+        s = lat[arm]
+        print(f"{arm:<25} {s['n']:>4} "
+              f"{s['mean_s']:>6.1f}s {s['std_s']:>6.1f}s "
+              f"{s['p50_s']:>6.1f}s {s['p90_s']:>6.1f}s {s['p95_s']:>6.1f}s "
+              f"{s['p99_s']:>6.1f}s {s['max_s']:>6.1f}s {s['cv']:>5.2f}")
+
+
+def _print_tokens(title: str, toks: dict[str, dict]) -> None:
+    print()
+    print(title)
+    print("-" * len(title))
+    header = (f"{'arm':<25} {'in mean':>9} {'in p50':>9} {'in p95':>9} {'in max':>9}"
+              f"  {'out mean':>9} {'out p95':>9}")
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(toks):
+        e = toks[arm]
+        ein = e.get("input")
+        eout = e.get("output")
+        if not ein:
+            continue
+        print(f"{arm:<25} "
+              f"{ein['mean']:>9,.0f} {ein['p50']:>9,.0f} {ein['p95']:>9,.0f} {ein['max']:>9,.0f}  "
+              f"{(eout or {}).get('mean', 0):>9,.0f} {(eout or {}).get('p95', 0):>9,.0f}")
+
+
+def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
+    print()
+    print(title)
+    print("-" * len(title))
+    header = (f"{'arm':<25} {'n_pdfs':>7} {'mean':>7} {'std':>7} {'min':>7} "
+              f"{'p25':>7} {'p50':>7} {'p75':>7} {'max':>7} {'#0%':>5} {'#100%':>6}")
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(var, key=lambda a: -var[a]["mean"]):
+        s = var[arm]
+        print(f"{arm:<25} {s['n_pdfs']:>7} "
+              f"{s['mean']*100:>6.1f}% {s['std']*100:>6.1f}% {s['min']*100:>6.1f}% "
+              f"{s['p25']*100:>6.1f}% {s['p50']*100:>6.1f}% {s['p75']*100:>6.1f}% "
+              f"{s['max']*100:>6.1f}% {s['n_pdfs_zero']:>5} {s['n_pdfs_perfect']:>6}")
+
+
+def _print_mcnemar(title: str, table: dict) -> None:
+    print()
+    print(title)
+    print("-" * len(title))
+    print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
+    header = (f"{'arm_i':<25} {'arm_j':<25} {'b':>4} {'c':>4} "
+              f"{'both ok':>8} {'both wr':>8} {'p (2-sided)':>13} {'sig':>4}")
+    print(header)
+    print("-" * len(header))
+    for pair in sorted(table["pairs"], key=lambda p: p["p_value"]):
+        sig = ""
+        if pair["p_value"] < 0.001:
+            sig = "***"
+        elif pair["p_value"] < 0.01:
+            sig = "**"
+        elif pair["p_value"] < 0.05:
+            sig = "*"
+        print(f"{pair['arm_i']:<25} {pair['arm_j']:<25} "
+              f"{pair['b_i_only']:>4} {pair['c_j_only']:>4} "
+              f"{pair['both_correct']:>8} {pair['both_wrong']:>8} "
+              f"{pair['p_value']:>13.4f} {sig:>4}")
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    args = parser.parse_args()
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    post_path = run_dir / "raw_post_retry.jsonl"
+    if not raw_path.exists() or not post_path.exists():
+        raise SystemExit(
+            "Missing raw.jsonl or raw_post_retry.jsonl. "
+            "Run scripts/compute_post_retry_accuracy.py first."
+        )
+
+    raw_rows = _read_jsonl(raw_path)
+    post_rows = _read_jsonl(post_path)
+
+    print(f"Run: {args.run_id}")
+    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")
+
+    # Latency uses post-retry rows (post-retry rows include the retry's own
+    # latency for recovered rows). For raw, recovered rows have latency=0
+    # because the harness recorded a failure.
+    _print_latency("Per-arm latency (post-retry)", _per_arm_latency(post_rows))
+
+    _print_tokens("Per-arm token distribution (post-retry)", _per_arm_tokens(post_rows))
+
+    _print_pdf_var(
+        "Per-PDF accuracy heterogeneity (post-retry)",
+        _per_pdf_stats(post_rows),
+    )
+
+    _print_mcnemar(
+        "McNemar pairwise (RAW, no retries)",
+        _mcnemar_table(raw_rows),
+    )
+    _print_mcnemar(
+        "McNemar pairwise (POST-RETRY)",
+        _mcnemar_table(post_rows),
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/surfsense_evals/scripts/compute_post_retry_accuracy.py b/surfsense_evals/scripts/compute_post_retry_accuracy.py
new file mode 100644
index 0000000000..4c8c47672e
--- /dev/null
+++ b/surfsense_evals/scripts/compute_post_retry_accuracy.py
@@ -0,0 +1,180 @@
+"""Recompute per-arm accuracy/F1 after merging retry survivors into raw.jsonl.
+
+Reads:
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw_retries.jsonl
+
+For each (arm, qid) present in the retry artifact:
+  - if the retry RECOVERED, the retry row replaces the original row (same
+    grader is reused — see ``mmlongbench/grader.py``);
+  - if the retry did NOT recover, the original row stays (still a failure,
+    so ``correct=False`` and ``f1=0``).
+
+Prints two tables side by side:
+  * Raw run (no retries) — matches §1 of the blog.
+  * Post-retry run        — final, "what would the headline have been if
+                              the harness had had retries from day one".
+
+It also writes ``data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl``
+so any downstream notebook / report can join straight on it.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    out: list[dict] = []
+    with path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            out.append(json.loads(line))
+    return out
+
+
+def _row_key(row: dict) -> tuple[str, str]:
+    return (str(row["arm"]), str(row["qid"]))
+
+
+def _is_failure(row: dict) -> bool:
+    if row.get("error"):
+        return True
+    if not (row.get("raw_text") or "").strip():
+        return True
+    return False
+
+
+def _summarise(rows_by_arm: dict[str, list[dict]]) -> dict[str, dict]:
+    out: dict[str, dict] = {}
+    for arm, rows in rows_by_arm.items():
+        n = len(rows)
+        n_correct = sum(1 for r in rows if r.get("graded", {}).get("correct"))
+        f1_sum = sum(float(r.get("graded", {}).get("f1") or 0.0) for r in rows)
+        n_fail = sum(1 for r in rows if _is_failure(r))
+        out[arm] = {
+            "n": n,
+            "n_correct": n_correct,
+            "n_failures": n_fail,
+            "accuracy": (n_correct / n) if n else 0.0,
+            "f1_mean": (f1_sum / n) if n else 0.0,
+            "failure_rate": (n_fail / n) if n else 0.0,
+        }
+    return out
+
+
+def _print_table(title: str, summary: dict[str, dict]) -> None:
+    print()
+    print(title)
+    print("-" * len(title))
+    header = f"{'arm':<25} {'n':>4} {'n_corr':>7} {'acc':>7} {'F1':>7} {'fails':>6} {'fail%':>7}"
+    print(header)
+    print("-" * len(header))
+    # stable order: highest accuracy first
+    arms_sorted = sorted(summary.items(), key=lambda kv: -kv[1]["accuracy"])
+    for arm, s in arms_sorted:
+        print(f"{arm:<25} {s['n']:>4} {s['n_correct']:>7} "
+              f"{s['accuracy']*100:>6.1f}% {s['f1_mean']*100:>6.1f}% "
+              f"{s['n_failures']:>6} {s['failure_rate']*100:>6.1f}%")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    args = parser.parse_args()
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    retry_path = run_dir / "raw_retries.jsonl"
+    out_path = run_dir / "raw_post_retry.jsonl"
+
+    if not raw_path.exists():
+        print(f"raw.jsonl not found at {raw_path}", file=sys.stderr)
+        return 1
+    if not retry_path.exists():
+        print(f"raw_retries.jsonl not found at {retry_path}", file=sys.stderr)
+        return 1
+
+    raw_rows = _read_jsonl(raw_path)
+    retry_rows = _read_jsonl(retry_path)
+
+    retry_by_key: dict[tuple[str, str], dict] = {
+        _row_key(r): r for r in retry_rows
+    }
+
+    merged_rows: list[dict] = []
+    n_replaced_recovered = 0
+    n_replaced_still_failed = 0
+    n_unchanged = 0
+    for row in raw_rows:
+        key = _row_key(row)
+        retry = retry_by_key.get(key)
+        if retry is None:
+            merged_rows.append(row)
+            n_unchanged += 1
+            continue
+        # The retry artifact carries a fresh ArmResult + grade in the same
+        # shape, plus a "retry" sub-object. We use the retry row whenever
+        # it represents a recovery; otherwise we keep the original (the
+        # retry confirms it is intrinsic, but the original row is the one
+        # the headline numbers were computed from, and the failure verdict
+        # is identical either way).
+        recovered = bool(retry.get("retry", {}).get("recovered"))
+        if recovered:
+            merged_rows.append(retry)
+            n_replaced_recovered += 1
+        else:
+            merged_rows.append(row)
+            n_replaced_still_failed += 1
+
+    # Persist merged jsonl for downstream consumers
+    with out_path.open("w", encoding="utf-8") as fh:
+        for r in merged_rows:
+            fh.write(json.dumps(r) + "\n")
+
+    # Bucket per arm
+    raw_by_arm: dict[str, list[dict]] = {}
+    for r in raw_rows:
+        raw_by_arm.setdefault(r["arm"], []).append(r)
+    post_by_arm: dict[str, list[dict]] = {}
+    for r in merged_rows:
+        post_by_arm.setdefault(r["arm"], []).append(r)
+
+    raw_summary = _summarise(raw_by_arm)
+    post_summary = _summarise(post_by_arm)
+
+    print()
+    print(f"Run: {args.run_id}")
+    print(f"Replaced (retry recovered):     {n_replaced_recovered}")
+    print(f"Kept original (retry still failed): {n_replaced_still_failed}")
+    print(f"Untouched rows:                 {n_unchanged}")
+    print(f"Wrote merged artifact: {out_path.relative_to(REPO)}")
+
+    _print_table("Raw run (no retries)", raw_summary)
+    _print_table("Post-retry run (final)", post_summary)
+
+    print()
+    print("Delta (post-retry minus raw):")
+    print(f"{'arm':<25} {'d_acc':>7} {'d_fails':>8}")
+    print("-" * 42)
+    for arm in sorted(set(raw_summary) | set(post_summary)):
+        r = raw_summary.get(arm)
+        p = post_summary.get(arm)
+        if not r or not p:
+            continue
+        d_acc = (p["accuracy"] - r["accuracy"]) * 100
+        d_fail = p["n_failures"] - r["n_failures"]
+        print(f"{arm:<25} {d_acc:>+6.1f}p {d_fail:>+7d}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/surfsense_evals/scripts/inspect_first30.py b/surfsense_evals/scripts/inspect_first30.py
new file mode 100644
index 0000000000..e06c6c029a
--- /dev/null
+++ b/surfsense_evals/scripts/inspect_first30.py
@@ -0,0 +1,59 @@
+"""Inspect what the first 30 MMLongBench-Doc PDFs would look like for scoping.
+
+Run from surfsense_evals/ root via:
+    python scripts/inspect_first30.py
+
+Prints which docs are already ingested (existing 5), which are new (25 to
+upload), how many questions cover those 30 PDFs, and the answerable /
+unanswerable + format mix.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter
+from pathlib import Path
+
+
+def main() -> None:
+    qpath = Path("data/multimodal_doc/mmlongbench/questions.jsonl")
+    lines = qpath.read_text(encoding="utf-8").splitlines()
+    rows = [json.loads(line) for line in lines if line.strip()]
+
+    docs_by_id = sorted({r["doc_id"] for r in rows})
+    first30 = docs_by_id[:30]
+    existing5 = {
+        "05-03-18-political-release.pdf",
+        "0b85477387a9d0cc33fca0f4becaa0e5.pdf",
+        "0e94b4197b10096b1f4c699701570fbf.pdf",
+        "11-21-16-Updated-Post-Election-Release.pdf",
+        "12-15-15-ISIS-and-terrorism-release-final.pdf",
+    }
+    new25 = [d for d in first30 if d not in existing5]
+    print(
+        f"first 30 docs (alphabetical) — {len(new25)} new, "
+        f"{len(first30) - len(new25)} already in SurfSense"
+    )
+
+    qs_in_30 = [r for r in rows if r["doc_id"] in set(first30)]
+    fmts = Counter((r.get("answer_format") or "").lower() for r in qs_in_30)
+    answerable = sum(v for k, v in fmts.items() if k != "none")
+    unanswerable = fmts.get("none", 0)
+
+    print(
+        f"questions covering first 30 docs: total={len(qs_in_30)}  "
+        f"answerable={answerable}  unanswerable={unanswerable}"
+    )
+    print(
+        f"avg Qs/PDF: {len(qs_in_30) / 30:.1f}  "
+        f"answerable/PDF: {answerable / 30:.1f}"
+    )
+    print(f"format mix in scope: {dict(fmts)}")
+    print()
+    print("25 new PDFs to ingest:")
+    for d in new25:
+        print(f"  - {d}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py b/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
new file mode 100644
index 0000000000..e1a2edc65e
--- /dev/null
+++ b/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
@@ -0,0 +1,100 @@
+"""Stub the mmlongbench manifest so parser_compare can extract in parallel.
+
+The mmlongbench Surfsense ingest writes its manifest only at the very
+end of the upload pipeline (~hours of celery work). parser_compare's
+ingest, on the other hand, just needs a list of (doc_id, pdf_path)
+tuples to know which PDFs to extract — it doesn't care about the
+SurfSense ``document_id`` (the runner does, later, after a refresh).
+
+This script extends the existing manifest with the *additional* PDFs
+that mmlongbench has already cached on disk (i.e. all 30 PDFs in
+``data/multimodal_doc/mmlongbench/pdfs/`` even though only 5 have
+SurfSense ``document_id``s yet) so parser_compare can run all four
+extractions for them in parallel with the SurfSense ingest.
+
+After mmlongbench finishes, re-run::
+
+    python -m surfsense_evals ingest multimodal_doc parser_compare \
+        --max-docs 30
+
+…to refresh ``parser_compare_doc_map.jsonl`` with the now-populated
+``document_id`` values for the 25 new PDFs. The extractions
+themselves are cached on disk so the second pass is essentially free.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+
+REPO = Path(__file__).resolve().parents[1]
+MAP_PATH = REPO / "data" / "multimodal_doc" / "maps" / "mmlongbench_doc_map.jsonl"
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+QUESTIONS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+
+
+def _question_count_per_doc() -> dict[str, int]:
+    counts: dict[str, int] = {}
+    with QUESTIONS.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            counts[row["doc_id"]] = counts.get(row["doc_id"], 0) + 1
+    return counts
+
+
+def main() -> None:
+    if not MAP_PATH.exists():
+        raise SystemExit(
+            f"manifest not found at {MAP_PATH} — "
+            "run `surfsense_evals ingest multimodal_doc mmlongbench` first."
+        )
+
+    existing_lines = MAP_PATH.read_text(encoding="utf-8").splitlines()
+    existing_rows: list[dict] = []
+    settings_line = None
+    for line in existing_lines:
+        line = line.strip()
+        if not line:
+            continue
+        row = json.loads(line)
+        if "__settings__" in row:
+            settings_line = line
+        else:
+            existing_rows.append(row)
+
+    by_doc_id = {r["doc_id"]: r for r in existing_rows}
+    counts = _question_count_per_doc()
+
+    cached_pdfs = sorted(p for p in PDF_DIR.glob("*.pdf"))
+    print(f"existing manifest entries: {len(existing_rows)}")
+    print(f"cached PDFs on disk:       {len(cached_pdfs)}")
+
+    added = 0
+    for pdf in cached_pdfs:
+        if pdf.name in by_doc_id:
+            continue
+        by_doc_id[pdf.name] = {
+            "doc_id": pdf.name,
+            "document_id": None,
+            "pdf_path": str(pdf),
+            "n_questions": counts.get(pdf.name, 0),
+        }
+        added += 1
+
+    out_lines: list[str] = []
+    if settings_line:
+        out_lines.append(settings_line)
+    for doc_id in sorted(by_doc_id):
+        out_lines.append(json.dumps(by_doc_id[doc_id]))
+    MAP_PATH.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
+
+    print(f"added {added} stub rows; manifest now has {len(by_doc_id)} PDFs")
+    print(f"wrote: {MAP_PATH}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/peek_crag_run.py b/surfsense_evals/scripts/peek_crag_run.py
new file mode 100644
index 0000000000..225e5ec983
--- /dev/null
+++ b/surfsense_evals/scripts/peek_crag_run.py
@@ -0,0 +1,37 @@
+"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def main() -> None:
+    raw_path = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))[-1]
+    print(f"Reading: {raw_path}")
+    rows = [json.loads(line) for line in open(raw_path, encoding="utf-8") if line.strip()]
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    for qid, arms in list(by_q.items()):
+        b = arms.get("bare_llm", {})
+        l = arms.get("long_context", {})
+        s = arms.get("surfsense", {})
+        print(f"\n=== {qid} ({b.get('domain')}/{b.get('question_type')}) ===")
+        print(f"  question: {b.get('extra', {}).get('question', '?')!r}")
+        print(f"  gold: {b.get('gold')!r}")
+        for arm_name, a in (("bare_llm", b), ("long_context", l), ("surfsense", s)):
+            grade = a.get("graded", {})
+            text = (a.get("raw_text") or "").strip()
+            tail = text[-200:] if text else ""
+            print(
+                f"  [{arm_name}] grade={grade.get('grade')} "
+                f"method={grade.get('method')}"
+            )
+            print(f"    -> {tail!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/peek_disagreements.py b/surfsense_evals/scripts/peek_disagreements.py
new file mode 100644
index 0000000000..c0fe0acd97
--- /dev/null
+++ b/surfsense_evals/scripts/peek_disagreements.py
@@ -0,0 +1,64 @@
+"""Show questions where SurfSense was wrong but long-context was right (and vice versa)."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def main() -> None:
+    raw_path = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))[-1]
+    print(f"Reading: {raw_path}")
+    rows = [json.loads(line) for line in open(raw_path, encoding="utf-8") if line.strip()]
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    surf_wrong_lc_right = []
+    lc_wrong_surf_right = []
+    surf_wrong_bare_right = []
+    for qid, arms in by_q.items():
+        b = arms.get("bare_llm", {}).get("graded", {}).get("grade")
+        lc = arms.get("long_context", {}).get("graded", {}).get("grade")
+        s = arms.get("surfsense", {}).get("graded", {}).get("grade")
+        if s == "incorrect" and lc == "correct":
+            surf_wrong_lc_right.append(qid)
+        if lc == "incorrect" and s == "correct":
+            lc_wrong_surf_right.append(qid)
+        if s == "incorrect" and b == "correct":
+            surf_wrong_bare_right.append(qid)
+
+    print(f"\nSurfSense INCORRECT but Long-Context CORRECT: {len(surf_wrong_lc_right)}")
+    print(f"Long-Context INCORRECT but SurfSense CORRECT: {len(lc_wrong_surf_right)}")
+    print(f"SurfSense INCORRECT but Bare CORRECT: {len(surf_wrong_bare_right)}")
+
+    print("\n=== Where SurfSense is wrong but long-context is right (top 5) ===")
+    for qid in surf_wrong_lc_right[:5]:
+        arms = by_q[qid]
+        b = arms.get("bare_llm", {})
+        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
+        print(f"  GOLD: {b.get('gold')!r}")
+        for arm_name in ("bare_llm", "long_context", "surfsense"):
+            a = arms.get(arm_name, {})
+            t = (a.get("raw_text") or "").strip()
+            tail = t[-180:] if t else ""
+            grade = a.get("graded", {})
+            print(f"  [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")
+
+    print("\n=== Where Long-Context is wrong but SurfSense is right (top 5) ===")
+    for qid in lc_wrong_surf_right[:5]:
+        arms = by_q[qid]
+        b = arms.get("bare_llm", {})
+        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
+        print(f"  GOLD: {b.get('gold')!r}")
+        for arm_name in ("bare_llm", "long_context", "surfsense"):
+            a = arms.get(arm_name, {})
+            t = (a.get("raw_text") or "").strip()
+            tail = t[-180:] if t else ""
+            grade = a.get("graded", {})
+            print(f"  [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/retry_failed_questions.py b/surfsense_evals/scripts/retry_failed_questions.py
new file mode 100644
index 0000000000..7cc9478e04
--- /dev/null
+++ b/surfsense_evals/scripts/retry_failed_questions.py
@@ -0,0 +1,636 @@
+"""Retry only the failed (arm, question) pairs from a previous parser_compare run.
+
+The original parser_compare run records one row per (arm, qid) in
+``raw.jsonl``. Some of those rows came back with transient transport
+errors (SSL alerts, gateway 502s, empty SSE streams) or empty
+``raw_text``. This script re-issues *only* those calls with exponential
+backoff so we can see how many recover.
+
+Design constraints / choices:
+
+* **No re-ingest.** All cached PDFs and parser-extracted markdown stay
+  on disk. We rebuild ``ArmRequest`` objects from the existing manifest
+  + the original ``mmlongbench/questions.jsonl``.
+* **No SurfSense backend or celery required.** SurfSense had 0
+  reported failures; this script will skip any ``surfsense_agentic``
+  rows it encounters and warn rather than try to start the backend.
+* **Original ``raw.jsonl`` is never mutated.** Retries land in a
+  sibling ``raw_retries.jsonl`` so the original artifact stays
+  citeable.
+* **Idempotent.** Re-running this script re-tries the same set of
+  failed rows from ``raw.jsonl``. If you want to merge survivor rows
+  back in, do that as a separate aggregation step.
+
+Usage:
+
+    python scripts/retry_failed_questions.py \
+        --run-id 2026-05-14T00-53-19Z \
+        --max-attempts 5 \
+        --concurrency 2
+
+Outputs (written next to the original raw.jsonl):
+
+* ``raw_retries.jsonl`` — one line per retried (arm, qid). Each line
+  carries the original error, every retry attempt's timing/error,
+  and the final result (incl. grade) so you can drop it straight
+  into a notebook.
+* ``raw_retries_summary.json`` — per-arm tried/recovered/still-failed
+  counts and an aggregated retry-success rate.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+REPO = Path(__file__).resolve().parents[1]
+SRC = REPO / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from dotenv import load_dotenv  # noqa: E402
+
+from surfsense_evals.core.arms import (  # noqa: E402
+    ArmRequest,
+    ArmResult,
+    BareLlmArm,
+    NativePdfArm,
+)
+from surfsense_evals.core.parse.freeform_answer import (  # noqa: E402
+    extract_freeform_answer,
+)
+from surfsense_evals.core.providers.openrouter_chat import (  # noqa: E402
+    OpenRouterChatProvider,
+)
+from surfsense_evals.core.providers.openrouter_pdf import (  # noqa: E402
+    OpenRouterPdfProvider,
+    PdfEngine,
+)
+from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade  # noqa: E402
+from surfsense_evals.suites.multimodal_doc.parser_compare.prompt import (  # noqa: E402
+    build_long_context_prompt,
+    build_native_pdf_prompt,
+)
+
+logger = logging.getLogger("retry_failed_questions")
+
+LC_ARMS = {
+    "azure_basic_lc",
+    "azure_premium_lc",
+    "llamacloud_basic_lc",
+    "llamacloud_premium_lc",
+}
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_failure_row(row: dict[str, Any]) -> bool:
+    """A row counts as failed if it raised an error OR returned empty text.
+
+    We retry both because the empty-stream case is the same operational
+    failure mode (the call returned nothing usable) — we just didn't
+    raise it as an exception.
+    """
+
+    if row.get("error"):
+        return True
+    if not (row.get("raw_text") or "").strip():
+        return True
+    return False
+
+
+@dataclass
+class FailedRow:
+    arm: str
+    qid: str
+    doc_id: str
+    answer_format: str
+    gold: str
+    pages: int
+    document_id: int | None
+    original_error: str | None
+    original_row: dict[str, Any]
+
+
+def _load_failed_rows(raw_path: Path) -> list[FailedRow]:
+    out: list[FailedRow] = []
+    with raw_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if not _is_failure_row(row):
+                continue
+            out.append(FailedRow(
+                arm=str(row["arm"]),
+                qid=str(row["qid"]),
+                doc_id=str(row["doc_id"]),
+                answer_format=str(row.get("answer_format") or ""),
+                gold=str(row.get("gold") or ""),
+                pages=int(row.get("pages") or 0),
+                document_id=row.get("document_id"),
+                original_error=row.get("error"),
+                original_row=row,
+            ))
+    return out
+
+
+def _load_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
+    out: dict[str, dict[str, Any]] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            out[str(row["doc_id"])] = row
+    return out
+
+
+def _load_question_text_index(
+    questions_jsonl: Path,
+) -> dict[tuple[str, int], dict[str, Any]]:
+    """Map (doc_id, per_doc_index) -> raw question row.
+
+    qids in raw.jsonl are formatted ``{doc_id}::Q{NNN}`` where NNN is
+    the per-doc index. Reproducing the runner's question selection
+    requires walking ``questions.jsonl`` in order and assigning
+    indices per doc_id (so we match the runner's ``per_doc_idx`` logic
+    in ``_select_questions``).
+    """
+
+    out: dict[tuple[str, int], dict[str, Any]] = {}
+    per_doc_idx: dict[str, int] = {}
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            doc_id = str(row.get("doc_id") or "")
+            if not doc_id:
+                continue
+            idx = per_doc_idx.get(doc_id, 0)
+            per_doc_idx[doc_id] = idx + 1
+            out[(doc_id, idx)] = row
+    return out
+
+
+def _qid_index(qid: str) -> int:
+    """Parse the per-doc question index out of a qid like ``foo.pdf::Q007``."""
+
+    _, _, q_part = qid.rpartition("::")
+    if not q_part.startswith("Q"):
+        raise ValueError(f"unexpected qid shape: {qid!r}")
+    return int(q_part[1:])
+
+
+# ---------------------------------------------------------------------------
+# Request building (mirrors runner.py exactly so prompts are byte-identical)
+# ---------------------------------------------------------------------------
+
+
+def _build_native_request(
+    qid: str, question: str, answer_format: str, pdf_path: Path,
+    *, max_output_tokens: int,
+) -> ArmRequest:
+    return ArmRequest(
+        question_id=qid,
+        prompt=build_native_pdf_prompt(question, answer_format=answer_format),
+        pdf_paths=[pdf_path],
+        options={"max_tokens": max_output_tokens},
+    )
+
+
+def _build_lc_request(
+    qid: str, question: str, answer_format: str, doc_id: str, md_path: Path,
+) -> ArmRequest:
+    if not md_path.exists():
+        raise FileNotFoundError(
+            f"Missing parser extraction at {md_path}; cannot retry LC arm."
+        )
+    markdown = md_path.read_text(encoding="utf-8")
+    return ArmRequest(
+        question_id=qid,
+        prompt=build_long_context_prompt(
+            question,
+            answer_format=answer_format,
+            document_markdown=markdown,
+            document_label=doc_id,
+        ),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Retry driver
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class AttemptLog:
+    attempt: int
+    started_iso: str
+    latency_ms: int
+    error: str | None
+    raw_text_chars: int
+
+
+@dataclass
+class RetryOutcome:
+    arm: str
+    qid: str
+    attempts: list[AttemptLog]
+    final_result: ArmResult
+    recovered: bool
+
+
+async def _retry_one(
+    arm_obj: Any, request: ArmRequest, *,
+    arm_name: str,
+    qid: str,
+    max_attempts: int,
+    base_delay: float,
+    max_delay: float,
+) -> RetryOutcome:
+    attempts: list[AttemptLog] = []
+    final: ArmResult | None = None
+    for attempt in range(1, max_attempts + 1):
+        started_iso = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+        t0 = time.monotonic()
+        result = await arm_obj.answer(request)
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        raw_text = (result.raw_text or "").strip()
+        attempt_error = result.error
+        if not attempt_error and not raw_text:
+            attempt_error = "EmptyResponse: stream ended with no text"
+        attempts.append(AttemptLog(
+            attempt=attempt,
+            started_iso=started_iso,
+            latency_ms=latency_ms,
+            error=attempt_error,
+            raw_text_chars=len(raw_text),
+        ))
+        final = result
+        if not attempt_error and raw_text:
+            return RetryOutcome(
+                arm=arm_name, qid=qid, attempts=attempts,
+                final_result=result, recovered=True,
+            )
+        if attempt < max_attempts:
+            delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
+            delay = delay * (0.5 + random.random())
+            logger.info(
+                "[%s::%s] attempt %d/%d failed (%s); sleeping %.1fs",
+                arm_name, qid, attempt, max_attempts, attempt_error, delay,
+            )
+            await asyncio.sleep(delay)
+    assert final is not None
+    return RetryOutcome(
+        arm=arm_name, qid=qid, attempts=attempts,
+        final_result=final, recovered=False,
+    )
+
+
+async def _gather_with_limit(coros: list, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+async def _run(args: argparse.Namespace) -> int:
+    load_dotenv(REPO / ".env")
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    if not raw_path.exists():
+        raise SystemExit(f"raw.jsonl not found at {raw_path}")
+
+    map_path = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+    questions_jsonl = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+    if not map_path.exists():
+        raise SystemExit(f"parser_compare manifest not found at {map_path}")
+    if not questions_jsonl.exists():
+        raise SystemExit(f"mmlongbench questions not found at {questions_jsonl}")
+
+    failed = _load_failed_rows(raw_path)
+    if not failed:
+        logger.info("No failed rows in %s — nothing to retry.", raw_path)
+        return 0
+
+    # SurfSense rows: warn and skip; we don't want to start backend just to
+    # defensively retry a 0-failure arm.
+    surf_failed = [f for f in failed if f.arm == "surfsense_agentic"]
+    if surf_failed:
+        logger.warning(
+            "Skipping %d surfsense_agentic failures; this script doesn't drive the backend. "
+            "If you want those retried too, start backend + celery and rerun "
+            "with --include-surfsense.",
+            len(surf_failed),
+        )
+        if not args.include_surfsense:
+            failed = [f for f in failed if f.arm != "surfsense_agentic"]
+    else:
+        logger.info("No surfsense_agentic failures; backend/celery not needed for this retry.")
+
+    if not failed:
+        logger.info("Nothing left to retry after filtering.")
+        return 0
+
+    by_arm_count: dict[str, int] = {}
+    for f in failed:
+        by_arm_count[f.arm] = by_arm_count.get(f.arm, 0) + 1
+    logger.info(
+        "Loaded %d failed rows across %d arms: %s",
+        len(failed), len(by_arm_count),
+        ", ".join(f"{a}={n}" for a, n in sorted(by_arm_count.items())),
+    )
+
+    doc_map = _load_doc_map(map_path)
+    qtext_idx = _load_question_text_index(questions_jsonl)
+
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if not api_key:
+        raise SystemExit("OPENROUTER_API_KEY missing from environment / .env")
+
+    native_provider = OpenRouterPdfProvider(
+        api_key=api_key,
+        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+        model=args.llm_model,
+        engine=PdfEngine(args.pdf_engine),
+    )
+    native_arm = NativePdfArm(
+        provider=native_provider, max_output_tokens=args.max_output_tokens,
+    )
+
+    lc_arms: dict[str, BareLlmArm] = {}
+    for arm_name in sorted({f.arm for f in failed} & LC_ARMS):
+        lc_provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+            model=args.llm_model,
+        )
+        lc_arms[arm_name] = BareLlmArm(
+            provider=lc_provider,
+            max_output_tokens=args.max_output_tokens,
+            name=arm_name,
+        )
+
+    coros: list = []
+    plan: list[tuple[FailedRow, ArmRequest, Any]] = []
+
+    for f in failed:
+        # Look up the question text from questions.jsonl
+        try:
+            q_idx = _qid_index(f.qid)
+        except Exception:
+            logger.error("Bad qid %r — skipping", f.qid)
+            continue
+        qrow = qtext_idx.get((f.doc_id, q_idx))
+        if qrow is None:
+            logger.error(
+                "Could not find question text for %s (idx %d) — skipping",
+                f.doc_id, q_idx,
+            )
+            continue
+        question_text = str(qrow.get("question") or "").strip()
+        answer_format = str(qrow.get("answer_format") or f.answer_format or "").strip().lower()
+
+        map_row = doc_map.get(f.doc_id)
+        if map_row is None:
+            logger.error("doc_id %s not in manifest — skipping", f.doc_id)
+            continue
+
+        if f.arm == "native_pdf":
+            pdf_path = Path(map_row["pdf_path"])
+            if not pdf_path.exists():
+                logger.error("PDF missing on disk: %s — skipping", pdf_path)
+                continue
+            request = _build_native_request(
+                f.qid, question_text, answer_format, pdf_path,
+                max_output_tokens=args.max_output_tokens,
+            )
+            arm_obj = native_arm
+        elif f.arm in LC_ARMS:
+            ext_blob = (map_row.get("extractions") or {}).get(f.arm) or {}
+            md_path_str = ext_blob.get("markdown_path")
+            if not md_path_str or ext_blob.get("status") != "ok":
+                logger.error(
+                    "Missing extraction for %s on %s — cannot retry; skipping",
+                    f.arm, f.doc_id,
+                )
+                continue
+            request = _build_lc_request(
+                f.qid, question_text, answer_format, f.doc_id, Path(md_path_str),
+            )
+            arm_obj = lc_arms[f.arm]
+        else:
+            logger.warning("Unhandled arm %s — skipping", f.arm)
+            continue
+
+        plan.append((f, request, arm_obj))
+        coros.append(_retry_one(
+            arm_obj, request,
+            arm_name=f.arm, qid=f.qid,
+            max_attempts=args.max_attempts,
+            base_delay=args.base_delay,
+            max_delay=args.max_delay,
+        ))
+
+    if not coros:
+        logger.warning("Nothing to retry after request building.")
+        return 0
+
+    logger.info(
+        "Retrying %d failed rows with up to %d attempts each "
+        "(base_delay=%.1fs, max_delay=%.1fs, concurrency=%d).",
+        len(coros), args.max_attempts, args.base_delay, args.max_delay,
+        args.concurrency,
+    )
+
+    started = time.monotonic()
+    outcomes: list[RetryOutcome] = await _gather_with_limit(
+        coros, concurrency=args.concurrency,
+    )
+    elapsed = time.monotonic() - started
+    logger.info("Retry pass finished in %.1fs.", elapsed)
+
+    out_path = run_dir / "raw_retries.jsonl"
+    summary_path = run_dir / "raw_retries_summary.json"
+
+    per_arm_recovered: dict[str, int] = {}
+    per_arm_total: dict[str, int] = {}
+    per_arm_attempts_dist: dict[str, list[int]] = {}
+
+    with out_path.open("w", encoding="utf-8") as fh:
+        for (f, _req, _arm_obj), outcome in zip(plan, outcomes, strict=True):
+            per_arm_total[outcome.arm] = per_arm_total.get(outcome.arm, 0) + 1
+            if outcome.recovered:
+                per_arm_recovered[outcome.arm] = (
+                    per_arm_recovered.get(outcome.arm, 0) + 1
+                )
+            per_arm_attempts_dist.setdefault(outcome.arm, []).append(
+                len(outcome.attempts)
+            )
+
+            g = grade(
+                pred=extract_freeform_answer(outcome.final_result.raw_text or ""),
+                gold=f.gold,
+                answer_format=f.answer_format,
+            )
+            row = {
+                "qid": f.qid,
+                "doc_id": f.doc_id,
+                "arm": f.arm,
+                "answer_format": f.answer_format,
+                "gold": f.gold,
+                "pages": f.pages,
+                "document_id": f.document_id,
+                "original_error": f.original_error,
+                "retry": {
+                    "max_attempts": args.max_attempts,
+                    "n_attempts": len(outcome.attempts),
+                    "recovered": outcome.recovered,
+                    "attempts": [
+                        {
+                            "attempt": a.attempt,
+                            "started_iso": a.started_iso,
+                            "latency_ms": a.latency_ms,
+                            "error": a.error,
+                            "raw_text_chars": a.raw_text_chars,
+                        }
+                        for a in outcome.attempts
+                    ],
+                },
+                **outcome.final_result.to_jsonl(),
+                "graded": {
+                    "correct": g.correct,
+                    "f1": g.f1,
+                    "method": g.method,
+                    "normalised_pred": g.normalised_pred,
+                    "normalised_gold": g.normalised_gold,
+                },
+            }
+            fh.write(json.dumps(row) + "\n")
+
+    summary = {
+        "run_id": args.run_id,
+        "raw_retries_path": str(out_path.relative_to(REPO)),
+        "n_failed_rows_input": len(failed),
+        "n_retried": len(coros),
+        "elapsed_s": round(elapsed, 1),
+        "config": {
+            "max_attempts": args.max_attempts,
+            "base_delay": args.base_delay,
+            "max_delay": args.max_delay,
+            "concurrency": args.concurrency,
+            "llm_model": args.llm_model,
+            "pdf_engine": args.pdf_engine,
+            "max_output_tokens": args.max_output_tokens,
+        },
+        "per_arm": {
+            arm: {
+                "tried": per_arm_total.get(arm, 0),
+                "recovered": per_arm_recovered.get(arm, 0),
+                "still_failed": (
+                    per_arm_total.get(arm, 0) - per_arm_recovered.get(arm, 0)
+                ),
+                "recovery_rate": (
+                    per_arm_recovered.get(arm, 0) / per_arm_total[arm]
+                    if per_arm_total.get(arm) else 0.0
+                ),
+                "attempts_distribution": sorted(per_arm_attempts_dist.get(arm, [])),
+            }
+            for arm in sorted(per_arm_total)
+        },
+        "totals": {
+            "tried": sum(per_arm_total.values()),
+            "recovered": sum(per_arm_recovered.values()),
+            "still_failed": sum(per_arm_total.values()) - sum(per_arm_recovered.values()),
+        },
+    }
+    summary_path.write_text(
+        json.dumps(summary, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+
+    print()
+    print("=" * 78)
+    print("Retry pass summary")
+    print("=" * 78)
+    header = f"{'arm':<25} {'tried':>6} {'recovered':>10} {'still fail':>11} {'rate':>7}"
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(per_arm_total):
+        tried = per_arm_total[arm]
+        rec = per_arm_recovered.get(arm, 0)
+        rate = (rec / tried * 100) if tried else 0.0
+        print(f"{arm:<25} {tried:>6} {rec:>10} {tried - rec:>11} {rate:>6.1f}%")
+    total = sum(per_arm_total.values())
+    rec_total = sum(per_arm_recovered.values())
+    rate_total = (rec_total / total * 100) if total else 0.0
+    print("-" * len(header))
+    print(f"{'TOTAL':<25} {total:>6} {rec_total:>10} {total - rec_total:>11} "
+          f"{rate_total:>6.1f}%")
+    print()
+    print(f"Wrote {out_path.relative_to(REPO)}")
+    print(f"Wrote {summary_path.relative_to(REPO)}")
+    return 0
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--run-id", default="2026-05-14T00-53-19Z",
+        help="Run timestamp under data/multimodal_doc/runs/. Default is the "
+             "n=171 production run we wrote up in the blog.",
+    )
+    parser.add_argument("--max-attempts", type=int, default=5)
+    parser.add_argument("--base-delay", type=float, default=1.0,
+                        help="Base seconds for exponential backoff (default 1s).")
+    parser.add_argument("--max-delay", type=float, default=30.0,
+                        help="Cap on per-retry sleep (default 30s).")
+    parser.add_argument("--concurrency", type=int, default=2,
+                        help="Parallel retries in flight (default 2 — keep low "
+                             "to avoid the same transport stress that caused "
+                             "the original failures).")
+    parser.add_argument("--llm-model", default="anthropic/claude-sonnet-4.5")
+    parser.add_argument("--pdf-engine", default="native",
+                        choices=[e.value for e in PdfEngine])
+    parser.add_argument("--max-output-tokens", type=int, default=512)
+    parser.add_argument(
+        "--include-surfsense", action="store_true",
+        help="Also retry surfsense_agentic failures (requires backend + celery up). "
+             "Default is to skip them since the n=171 run had 0 SurfSense failures.",
+    )
+    args = parser.parse_args()
+    raise SystemExit(asyncio.run(_run(args)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/summarise_crag_run.py b/surfsense_evals/scripts/summarise_crag_run.py
new file mode 100644
index 0000000000..646fb6a97d
--- /dev/null
+++ b/surfsense_evals/scripts/summarise_crag_run.py
@@ -0,0 +1,65 @@
+"""Render a quick textual summary of the latest CRAG run."""
+
+from __future__ import annotations
+
+import glob
+import json
+
+
+def main() -> None:
+    runs = sorted(glob.glob("data/research/runs/*/crag/run_artifact.json"))
+    if not runs:
+        print("(no CRAG runs found)")
+        return
+    m = json.load(open(runs[-1], encoding="utf-8"))
+    metrics = m["metrics"]
+
+    print(f"Reading: {runs[-1]}")
+    print(f"n_questions: {m['extra']['n_questions']}")
+    print()
+    print("=== ARMS ===")
+    for arm in ("bare_llm", "long_context", "surfsense"):
+        d = metrics[arm]
+        print(
+            f"{arm:14s}: "
+            f"acc={d['accuracy']*100:5.1f}% (Wilson 95% CI "
+            f"{d['ci_low']*100:.1f}-{d['ci_high']*100:.1f}) | "
+            f"correct={d['correct_rate']*100:5.1f}% "
+            f"missing={d['missing_rate']*100:5.1f}% "
+            f"incorrect={d['incorrect_rate']*100:5.1f}% | "
+            f"truth={d['truthfulness_score']*100:+5.1f}%"
+        )
+
+    print()
+    print("=== DELTAS ===")
+    for key, d in metrics["deltas"].items():
+        print(
+            f"{key:30s}: acc={d['accuracy_pp']:+5.1f}pp "
+            f"truth={d['truthfulness_score_pp']:+5.1f}pp "
+            f"McNemar p={d['mcnemar_p_value']:.4f} ({d['mcnemar_method']}) "
+            f"bootstrap CI [{d['bootstrap_ci_low']:+.1f}, {d['bootstrap_ci_high']:+.1f}]"
+        )
+
+    print()
+    print("=== PER-QUESTION-TYPE TRUTHFULNESS ===")
+    for qt, row in sorted(metrics["per_question_type"].items()):
+        n = row["n"]
+        pieces = [f"{qt:20s} (n={n:3d}):"]
+        for arm in ("bare_llm", "long_context", "surfsense"):
+            if arm in row:
+                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
+        print(" ".join(pieces))
+
+    print()
+    print("=== PER-DOMAIN TRUTHFULNESS ===")
+    for dom, row in sorted(metrics["per_domain"].items()):
+        n = row["n"]
+        pieces = [f"{dom:10s} (n={n:3d}):"]
+        for arm in ("bare_llm", "long_context", "surfsense"):
+            if arm in row:
+                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
+        print(" ".join(pieces))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/summarise_parser_compare_run.py b/surfsense_evals/scripts/summarise_parser_compare_run.py
new file mode 100644
index 0000000000..c54d82784f
--- /dev/null
+++ b/surfsense_evals/scripts/summarise_parser_compare_run.py
@@ -0,0 +1,122 @@
+"""Slice the parser_compare raw.jsonl for the n=171 run.
+
+Reports per-arm:
+  * tokens & cost stats (input/output mean, $/Q distribution)
+  * failures (status != ok or empty raw_text)
+  * answer_format breakdown (accuracy by str/int/float/list)
+
+Plus surfsense agentic breakdown so we can compare apples to apples
+even though the new_chat SSE doesn't surface per-call token counts.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+
+REPO = Path(__file__).resolve().parents[1]
+RUN_DIR = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN_DIR / "raw.jsonl"
+ARTIFACT = RUN_DIR / "run_artifact.json"
+
+
+def main() -> None:
+    rows = [json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines() if line.strip()]
+    print(f"raw rows: {len(rows)}")
+
+    by_qid: dict[str, list[dict]] = defaultdict(list)
+    for row in rows:
+        by_qid[row["qid"]].append(row)
+    print(f"unique questions: {len(by_qid)}")
+
+    arm_metrics: dict[str, dict] = defaultdict(lambda: {
+        "n": 0, "n_correct": 0, "n_failed": 0, "n_empty": 0,
+        "costs": [], "in_tokens": [], "out_tokens": [], "latency_ms": [],
+        "by_format": defaultdict(lambda: {"n": 0, "correct": 0}),
+    })
+
+    for row in rows:
+        arm = row["arm"]
+        m = arm_metrics[arm]
+        m["n"] += 1
+        graded = row.get("graded") or {}
+        if graded.get("correct"):
+            m["n_correct"] += 1
+
+        err = row.get("error")
+        raw_text = row.get("raw_text") or ""
+        if err:
+            m["n_failed"] += 1
+        elif not raw_text.strip():
+            m["n_empty"] += 1
+
+        cost = row.get("cost_usd")
+        if cost is not None:
+            m["costs"].append(float(cost))
+        ut = row.get("usage") or {}
+        if ut.get("prompt_tokens"):
+            m["in_tokens"].append(ut["prompt_tokens"])
+        if ut.get("completion_tokens"):
+            m["out_tokens"].append(ut["completion_tokens"])
+        if row.get("latency_ms"):
+            m["latency_ms"].append(row["latency_ms"])
+
+        fmt = row.get("answer_format") or "unknown"
+        m["by_format"][fmt]["n"] += 1
+        if graded.get("correct"):
+            m["by_format"][fmt]["correct"] += 1
+
+    print()
+    print("=" * 100)
+    print(f"{'arm':<25} {'n':>4} {'acc%':>6} {'F1%':>6} {'fail':>5} {'$ mean':>10} {'$ median':>10} {'in tok mean':>12} {'out tok mean':>12} {'p50 ms':>8}")
+    print("=" * 100)
+    art = json.loads(ARTIFACT.read_text(encoding="utf-8"))
+    per_arm_art = art["metrics"]["per_arm"]
+    for arm, m in sorted(arm_metrics.items()):
+        acc = m["n_correct"] / m["n"] * 100
+        fail = m["n_failed"]
+        cost_mean = statistics.mean(m["costs"]) if m["costs"] else 0.0
+        cost_med = statistics.median(m["costs"]) if m["costs"] else 0.0
+        in_mean = statistics.mean(m["in_tokens"]) if m["in_tokens"] else 0
+        out_mean = statistics.mean(m["out_tokens"]) if m["out_tokens"] else 0
+        lat_p50 = statistics.median(m["latency_ms"]) if m["latency_ms"] else 0
+        f1 = per_arm_art.get(arm, {}).get("f1_mean", 0.0) * 100
+        print(
+            f"{arm:<25} {m['n']:>4} {acc:>5.1f}% {f1:>5.1f}% {fail:>5} "
+            f"${cost_mean:>9.4f} ${cost_med:>9.4f} {in_mean:>12.0f} {out_mean:>12.0f} {lat_p50:>8.0f}"
+        )
+
+    print()
+    print("by answer_format (accuracy):")
+    formats = sorted({f for m in arm_metrics.values() for f in m["by_format"].keys()})
+    header = f"{'arm':<25} " + " ".join(f"{f:>10}" for f in formats)
+    print(header)
+    print("-" * len(header))
+    for arm, m in sorted(arm_metrics.items()):
+        cells = []
+        for f in formats:
+            row = m["by_format"][f]
+            if row["n"] == 0:
+                cells.append(f"{'-':>10}")
+            else:
+                pct = row["correct"] / row["n"] * 100
+                cells.append(f"{pct:>5.0f}% ({row['correct']:>2}/{row['n']:>2})")
+        print(f"{arm:<25} " + " ".join(cells))
+
+    print()
+    print("=" * 100)
+    print("Aggregated cost (from run_artifact.json):")
+    for arm, row in per_arm_art.items():
+        print(
+            f"  {arm:<25}  acc={row['accuracy']*100:5.1f}% "
+            f"  $/Q LLM={row['llm_cost_per_q']:.4f}  "
+            f"  preprocess total=${row['preprocess_cost_total']:.2f}  "
+            f"  $/Q total={row['total_cost_per_q']:.4f}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/test_context_overflow_hypothesis.py b/surfsense_evals/scripts/test_context_overflow_hypothesis.py
new file mode 100644
index 0000000000..89bd6cb3d0
--- /dev/null
+++ b/surfsense_evals/scripts/test_context_overflow_hypothesis.py
@@ -0,0 +1,155 @@
+"""Test the hypothesis: were the LC-arm errors actually context-window
+overflow errors disguised as SSL / network failures?
+
+If true, we'd expect:
+  (a) literal "prompt is too long" / "context_length_exceeded" / "exceeds .* tokens" strings,
+  (b) failures correlated with extraction size / input_tokens (large doc -> failure),
+  (c) failing requests near or over Sonnet 4.5's 200k input-token limit.
+
+If false (transport-layer hypothesis), we'd expect:
+  (a) only SSL / 502 / empty stream / JSONDecode strings,
+  (b) failures NOT correlated with size (uniform across PDFs by time, not by tokens),
+  (c) failing requests well below the 200k limit.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+MANIFEST = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+
+CONTEXT_HINTS = (
+    "context_length",
+    "context window",
+    "prompt is too long",
+    "exceeds",
+    "maximum context",
+    "input tokens",
+    "too many tokens",
+    "over the maximum",
+    "200000",
+    "200_000",
+)
+
+
+def main() -> None:
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    extraction_size: dict[tuple[str, str], int] = {}
+    for line in MANIFEST.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        m = json.loads(line)
+        for arm, ext in (m.get("extractions") or {}).items():
+            extraction_size[(m["doc_id"], arm)] = int(ext.get("chars") or 0)
+
+    print("=" * 80)
+    print("(a) Literal 'context window' / 'prompt too long' error strings?")
+    print("=" * 80)
+    found = 0
+    for row in rows:
+        err = (row.get("error") or "").lower()
+        if not err:
+            continue
+        for hint in CONTEXT_HINTS:
+            if hint in err:
+                print(f"  {row['arm']:<25} {row['qid']:<50}")
+                print(f"      -> {err[:240]}")
+                found += 1
+                break
+    if not found:
+        print("  none found.")
+
+    print()
+    print("=" * 80)
+    print("(b) Extraction size for OK vs FAILED rows per arm")
+    print("=" * 80)
+    arm_buckets: dict[str, dict[str, list[int]]] = defaultdict(
+        lambda: {"ok": [], "fail": []}
+    )
+    parser_arms = (
+        "azure_basic_lc", "azure_premium_lc",
+        "llamacloud_basic_lc", "llamacloud_premium_lc",
+    )
+    for row in rows:
+        arm = row["arm"]
+        if arm not in parser_arms:
+            continue
+        size = extraction_size.get((row["doc_id"], arm), 0)
+        bucket = "fail" if (row.get("error") or not (row.get("raw_text") or "").strip()) else "ok"
+        arm_buckets[arm][bucket].append(size)
+
+    print(f"{'arm':<25} {'bucket':<5} {'n':>4} {'mean chars':>12} {'median':>10} {'max':>10}")
+    for arm in parser_arms:
+        for bucket in ("ok", "fail"):
+            sizes = arm_buckets[arm][bucket]
+            if not sizes:
+                print(f"  {arm:<23} {bucket:<5} {0:>4}  -")
+                continue
+            print(
+                f"  {arm:<23} {bucket:<5} {len(sizes):>4} "
+                f"{statistics.mean(sizes):>12,.0f} "
+                f"{statistics.median(sizes):>10,.0f} "
+                f"{max(sizes):>10,}"
+            )
+
+    print()
+    print("=" * 80)
+    print("(c) Largest extraction each arm processed *successfully* vs *failed*")
+    print("=" * 80)
+    print(
+        "(Sonnet 4.5 input limit ~200k tokens ~= 800k chars. If failures were "
+        "context-overflow, max-OK would be near that cap. If max-OK is well "
+        "above max-FAIL, the model handled bigger contexts than the failed "
+        "ones, so size cannot be the cause.)"
+    )
+    print()
+    for arm in parser_arms:
+        ok_sizes = arm_buckets[arm]["ok"]
+        fail_sizes = arm_buckets[arm]["fail"]
+        if not ok_sizes:
+            continue
+        max_ok = max(ok_sizes)
+        max_fail = max(fail_sizes) if fail_sizes else 0
+        print(
+            f"  {arm:<25} max OK = {max_ok:>10,} chars (~{max_ok / 4:>7,.0f} tokens)  "
+            f"max FAIL = {max_fail:>10,} chars (~{max_fail / 4:>7,.0f} tokens)"
+        )
+
+    print()
+    print("=" * 80)
+    print("(d) Did the *known* overflow candidate fail?")
+    print("=" * 80)
+    print(
+        "  3M_2018_10K x llamacloud_premium = 908,733 chars (~227k tokens) "
+        "-- this is above Sonnet 4.5's 200k window."
+    )
+    print("  If transport hypothesis is correct, this should still fail with a "
+          "real overflow error.")
+    print("  If transport hypothesis is correct AND the model truncates silently, "
+          "it might 'succeed' but be wrong.")
+    print()
+    for row in rows:
+        if row["doc_id"] != "3M_2018_10K.pdf":
+            continue
+        if row["arm"] != "llamacloud_premium_lc":
+            continue
+        err = row.get("error") or "(none)"
+        graded = row.get("graded") or {}
+        print(
+            f"  {row['qid']:<40} correct={graded.get('correct')!s:<5}  "
+            f"err={err[:100]}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/src/surfsense_evals/__init__.py b/surfsense_evals/src/surfsense_evals/__init__.py
new file mode 100644
index 0000000000..fc8a814825
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/__init__.py
@@ -0,0 +1,10 @@
+"""SurfSense Evals — domain-agnostic eval harness.
+
+Public entry-point is the ``surfsense_evals`` CLI (``python -m surfsense_evals``).
+Programmatic embedding is a non-goal for now; everything goes through the CLI
++ filesystem outputs (state.json, raw run JSONL, summary.md/json reports).
+"""
+
+from __future__ import annotations
+
+__version__ = "0.1.0"
diff --git a/surfsense_evals/src/surfsense_evals/__main__.py b/surfsense_evals/src/surfsense_evals/__main__.py
new file mode 100644
index 0000000000..0efb932ddc
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/__main__.py
@@ -0,0 +1,13 @@
+"""Module entry point: ``python -m surfsense_evals ...``.
+
+Delegates to ``core.cli.main``. ``core.cli`` lazily imports
+``surfsense_evals.suites`` so every benchmark gets a chance to register
+before argparse builds its subcommand groups.
+"""
+
+from __future__ import annotations
+
+from surfsense_evals.core.cli import main
+
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())
diff --git a/surfsense_evals/src/surfsense_evals/core/__init__.py b/surfsense_evals/src/surfsense_evals/core/__init__.py
new file mode 100644
index 0000000000..b5cc64a563
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/__init__.py
@@ -0,0 +1,8 @@
+"""Domain-agnostic infrastructure shared by every suite.
+
+Nothing under ``core/`` knows or cares about a specific evaluation domain.
+Suites live under ``surfsense_evals.suites.<domain>.<benchmark>`` and
+register themselves with ``core.registry`` on import.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/__init__.py b/surfsense_evals/src/surfsense_evals/core/arms/__init__.py
new file mode 100644
index 0000000000..0e7ce46e4b
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/__init__.py
@@ -0,0 +1,44 @@
+"""Arm protocol + concrete arms shared across suites.
+
+Concrete arms (``NativePdfArm``, ``SurfSenseArm``, ``BareLlmArm``) are
+imported lazily via ``__getattr__`` so consumers that only need the
+protocol — e.g. the registry's ``Arm`` re-export — don't transitively
+pull in ``httpx`` providers or the SurfSense client unless they
+actually use those arms.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .base import Arm, ArmRequest, ArmResult
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .bare_llm import BareLlmArm
+    from .native_pdf import NativePdfArm
+    from .surfsense import SurfSenseArm
+
+__all__ = [
+    "Arm",
+    "ArmRequest",
+    "ArmResult",
+    "BareLlmArm",
+    "NativePdfArm",
+    "SurfSenseArm",
+]
+
+
+def __getattr__(name: str):  # PEP 562
+    if name == "NativePdfArm":
+        from .native_pdf import NativePdfArm
+
+        return NativePdfArm
+    if name == "SurfSenseArm":
+        from .surfsense import SurfSenseArm
+
+        return SurfSenseArm
+    if name == "BareLlmArm":
+        from .bare_llm import BareLlmArm
+
+        return BareLlmArm
+    raise AttributeError(f"module 'surfsense_evals.core.arms' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py b/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py
new file mode 100644
index 0000000000..1e32154159
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py
@@ -0,0 +1,100 @@
+"""Bare-LLM arm: chat completion with prompt-only input, no retrieval.
+
+Pairs with ``SurfSenseArm`` for any benchmark that wants to measure
+"how much does the model already know without RAG?". For factuality /
+multi-hop benchmarks (FRAMES, MuSiQue, …) this produces the published
+"naive prompting" baseline — e.g. FRAMES's 40.8% on Gemini-Pro-1.5.
+
+Symmetric with ``NativePdfArm`` in shape, but the request carries no
+``pdf_paths``: the prompt itself is the only input the model gets.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..providers.openrouter_chat import OpenRouterChatProvider
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class BareLlmArm(Arm):
+    """``Arm`` implementation backed by ``OpenRouterChatProvider``.
+
+    ``name`` defaults to ``"bare_llm"`` but is overridable per-instance.
+    Suites that want two distinct OpenRouter chat arms (e.g. CRAG's
+    ``bare_llm`` vs ``long_context`` — both backed by chat-completions
+    but exercising different prompt strategies) instantiate twice with
+    different names so the metrics aggregator can keep them separate.
+    """
+
+    name: str = "bare_llm"
+
+    def __init__(
+        self,
+        *,
+        provider: OpenRouterChatProvider,
+        max_output_tokens: int | None = 1024,
+        system_prompt: str | None = None,
+        name: str | None = None,
+    ) -> None:
+        self._provider = provider
+        self._max_output = max_output_tokens
+        self._system_prompt = system_prompt
+        if name:
+            self.name = name
+
+    @classmethod
+    def from_env(
+        cls,
+        *,
+        api_key: str,
+        model: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        max_output_tokens: int | None = 1024,
+        system_prompt: str | None = None,
+        name: str | None = None,
+    ) -> BareLlmArm:
+        provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+        )
+        return cls(
+            provider=provider,
+            max_output_tokens=max_output_tokens,
+            system_prompt=system_prompt,
+            name=name,
+        )
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        try:
+            response = await self._provider.complete(
+                prompt=request.prompt,
+                system_prompt=self._system_prompt,
+                max_tokens=self._max_output,
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+            )
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=response.text,
+            input_tokens=response.input_tokens,
+            output_tokens=response.output_tokens,
+            cost_micros=response.cost_micros,
+            latency_ms=response.latency_ms,
+            extra={
+                "model": self._provider.model,
+                "finish_reason": response.finish_reason,
+            },
+        )
+
+
+__all__ = ["BareLlmArm"]
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/base.py b/surfsense_evals/src/surfsense_evals/core/arms/base.py
new file mode 100644
index 0000000000..3e327fef24
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/base.py
@@ -0,0 +1,93 @@
+"""Arm protocol + the value types every arm exchanges with a runner.
+
+An ``Arm`` is "one way to answer one question". Two ship in this PR:
+
+* ``NativePdfArm`` — drop the PDF straight into an OpenRouter
+  chat-completions request with ``plugins=[{file-parser, engine:
+  native}]``. Used for the head-to-head "is the model good enough on
+  its own?" measurement.
+* ``SurfSenseArm`` — POST ``/api/v1/new_chat`` with the question
+  scoped to the relevant ``mentioned_document_ids``; consume the SSE
+  stream and parse citations.
+
+Both implement the same protocol so a benchmark runner only sees
+``Arm.answer(request) -> ArmResult``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol
+
+
+@dataclass
+class ArmRequest:
+    """One arm-call worth of input.
+
+    * ``question_id`` is opaque — used for logging and joining results.
+    * ``prompt`` is the fully-formatted text the arm should send. The
+      runner is responsible for prompt construction so head-to-head
+      comparisons use byte-identical text.
+    * ``pdf_paths`` is the per-question source PDFs (used by
+      ``NativePdfArm``). Empty for retrieval-only / corpus-wide
+      benchmarks.
+    * ``mentioned_document_ids`` is the SurfSense document scoping list
+      (used by ``SurfSenseArm``). When ``None`` SurfSense retrieves
+      across the whole search space.
+    * ``options`` is a free-form bag of arm-specific overrides
+      (e.g. SurfSense's ``disabled_tools``).
+    """
+
+    question_id: str
+    prompt: str
+    pdf_paths: list[Path] = field(default_factory=list)
+    mentioned_document_ids: list[int] | None = None
+    options: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ArmResult:
+    """Outcome of one ``Arm.answer`` invocation."""
+
+    arm: str
+    question_id: str
+    raw_text: str
+    answer_letter: str | None = None
+    citations: list[dict[str, Any]] = field(default_factory=list)
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_micros: int = 0
+    latency_ms: int = 0
+    error: str | None = None
+    extra: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def ok(self) -> bool:
+        return self.error is None
+
+    def to_jsonl(self) -> dict[str, Any]:
+        """Stable dict shape for ``data/<suite>/runs/<ts>/<bench>_raw.jsonl``."""
+
+        return {
+            "arm": self.arm,
+            "question_id": self.question_id,
+            "answer_letter": self.answer_letter,
+            "raw_text": self.raw_text,
+            "citations": self.citations,
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "cost_micros": self.cost_micros,
+            "latency_ms": self.latency_ms,
+            "error": self.error,
+            "extra": self.extra,
+        }
+
+
+class Arm(Protocol):
+    """One concrete way to answer questions for a given run."""
+
+    name: str
+
+    async def answer(self, request: ArmRequest) -> ArmResult:  # pragma: no cover - protocol
+        ...
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py b/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py
new file mode 100644
index 0000000000..9294ed0325
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py
@@ -0,0 +1,104 @@
+"""Native-PDF arm: drop the PDF straight into OpenRouter chat-completions.
+
+Generic across suites — a benchmark just supplies the prompt and the
+single PDF path. Multi-PDF questions concatenate in the runner before
+calling this arm so each ``answer`` invocation feeds the model exactly
+one ``data:application/pdf;base64,...`` block (matches the human
+"drag-and-drop one PDF into Claude" intent).
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..parse.answer_letter import extract_answer_letter
+from ..providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class NativePdfArm(Arm):
+    """``Arm`` implementation backed by ``OpenRouterPdfProvider``."""
+
+    name: str = "native_pdf"
+
+    def __init__(
+        self,
+        *,
+        provider: OpenRouterPdfProvider,
+        max_output_tokens: int | None = 1024,
+    ) -> None:
+        self._provider = provider
+        self._max_output = max_output_tokens
+
+    @classmethod
+    def from_env(
+        cls,
+        *,
+        api_key: str,
+        model: str,
+        engine: PdfEngine = PdfEngine.NATIVE,
+        base_url: str = "https://openrouter.ai/api/v1",
+        max_output_tokens: int | None = 1024,
+    ) -> NativePdfArm:
+        provider = OpenRouterPdfProvider(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+            engine=engine,
+        )
+        return cls(provider=provider, max_output_tokens=max_output_tokens)
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        if not request.pdf_paths:
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error="native_pdf arm requires at least one pdf_path",
+            )
+        if len(request.pdf_paths) > 1:
+            # The plan calls out one-PDF-per-question so the head-to-head
+            # is fair; runners are responsible for upstream concatenation.
+            logger.debug(
+                "qid=%s native_pdf got %d pdfs; using first only",
+                request.question_id,
+                len(request.pdf_paths),
+            )
+        pdf = request.pdf_paths[0]
+        try:
+            response = await self._provider.complete(
+                prompt=request.prompt,
+                pdf_path=pdf,
+                max_tokens=self._max_output,
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+            )
+
+        letter = extract_answer_letter(response.text)
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=response.text,
+            answer_letter=letter.letter,
+            input_tokens=response.input_tokens,
+            output_tokens=response.output_tokens,
+            cost_micros=response.cost_micros,
+            latency_ms=response.latency_ms,
+            extra={
+                "model": self._provider.model,
+                "engine": self._provider.engine.value,
+                "answer_letter_strategy": letter.strategy,
+                "finish_reason": response.finish_reason,
+                "pdf_filename": pdf.name,
+            },
+        )
+
+
+__all__ = ["NativePdfArm"]
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py b/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py
new file mode 100644
index 0000000000..a84350dfda
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py
@@ -0,0 +1,104 @@
+"""SurfSense arm: per-question fresh thread + ``/api/v1/new_chat`` stream.
+
+For every question:
+
+* Create a fresh ``NewChatThread`` on the suite's pinned SearchSpace.
+  This sidesteps the per-thread ``THREAD_BUSY`` 409 (a single thread
+  serialises turns, see ``surfsense_backend/app/routes/new_chat_routes.py:191-220``).
+* POST ``/api/v1/new_chat`` with the prompt and the per-question
+  ``mentioned_document_ids`` (``surfsense_backend/app/schemas/new_chat.py:241-243``).
+* Consume the SSE stream via ``NewChatClient.ask`` which accumulates
+  text deltas and returns ``StreamedAnswer``.
+* Optionally delete the thread (default ON for ephemeral runs).
+
+Citations are parsed from the streamed assistant text via the
+canonical regex port; chunk ids are returned in ``ArmResult.citations``
+for the runner to map back to corpus ids.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..clients import NewChatClient
+from ..parse.answer_letter import extract_answer_letter
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class SurfSenseArm(Arm):
+    """``Arm`` implementation backed by ``NewChatClient``."""
+
+    name: str = "surfsense"
+
+    def __init__(
+        self,
+        *,
+        client: NewChatClient,
+        search_space_id: int,
+        ephemeral_threads: bool = True,
+        thread_title_prefix: str = "eval",
+    ) -> None:
+        self._client = client
+        self._search_space_id = search_space_id
+        self._ephemeral = ephemeral_threads
+        self._title_prefix = thread_title_prefix
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        thread_id: int | None = None
+        try:
+            thread_id = await self._client.create_thread(
+                search_space_id=self._search_space_id,
+                title=f"{self._title_prefix}:{request.question_id}",
+            )
+            answer = await self._client.ask(
+                thread_id=thread_id,
+                search_space_id=self._search_space_id,
+                user_query=request.prompt,
+                mentioned_document_ids=request.mentioned_document_ids,
+                disabled_tools=request.options.get("disabled_tools"),
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+                extra={"thread_id": thread_id},
+            )
+        finally:
+            if self._ephemeral and thread_id is not None:
+                try:
+                    await self._client.delete_thread(thread_id)
+                except Exception as exc:  # noqa: BLE001
+                    logger.debug(
+                        "Failed to delete thread %s: %s", thread_id, exc
+                    )
+
+        letter = extract_answer_letter(answer.text)
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=answer.text,
+            answer_letter=letter.letter,
+            citations=answer.citations,
+            latency_ms=answer.latency_ms,
+            # SurfSense doesn't surface input/output token counts in the
+            # SSE stream today; leaving the cost / token fields at 0
+            # documents that gap. Estimating from the raw text would
+            # bias the comparison against the SurfSense arm.
+            extra={
+                "thread_id": thread_id,
+                "search_space_id": self._search_space_id,
+                "answer_letter_strategy": letter.strategy,
+                "user_message_id": answer.user_message_id,
+                "assistant_message_id": answer.assistant_message_id,
+                "finished_normally": answer.finished_normally,
+                "n_raw_events": len(answer.raw_events),
+                "n_mentioned_documents": len(request.mentioned_document_ids or []),
+            },
+        )
+
+
+__all__ = ["SurfSenseArm"]
diff --git a/surfsense_evals/src/surfsense_evals/core/auth.py b/surfsense_evals/src/surfsense_evals/core/auth.py
new file mode 100644
index 0000000000..1e7cc5b3e1
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/auth.py
@@ -0,0 +1,273 @@
+"""Dual-mode credential resolver + httpx client factory with 401 auto-refresh.
+
+SurfSense supports ``AUTH_TYPE=LOCAL`` (email + password) and
+``AUTH_TYPE=GOOGLE`` (Google OAuth → frontend stores JWT in ``localStorage``).
+There is no headless equivalent of the Google flow, so the harness handles
+both modes by treating the JWT as the universal credential:
+
+* **LOCAL**: harness POSTs form-encoded ``username`` + ``password`` to
+  ``/auth/jwt/login``, reads ``{access_token, refresh_token}``.
+* **GOOGLE / pre-issued JWT**: operator pastes their existing JWT (and
+  optionally refresh token) into ``SURFSENSE_JWT`` /
+  ``SURFSENSE_REFRESH_TOKEN``; harness skips login.
+
+Either way ``client_with_auth`` returns one shared
+``httpx.AsyncClient`` with ``Authorization: Bearer <jwt>`` set and an
+event hook that, on a 401 with a refresh token in scope, calls
+``POST /auth/jwt/refresh`` and retries the original request once. JWT
+lifetime defaults to one day backend-side, so this matters for long
+MIRAGE runs.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+from .config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class CredentialError(RuntimeError):
+    """Raised when no credential mode is configured."""
+
+
+_NO_CREDENTIALS_MESSAGE = (
+    "No SurfSense credentials configured. Set ONE of:\n"
+    "  (LOCAL)  SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD\n"
+    "  (GOOGLE) SURFSENSE_JWT (and optionally SURFSENSE_REFRESH_TOKEN)\n"
+    "For GOOGLE: log in to SurfSense in your browser, open DevTools → "
+    "Application → Local Storage → copy `surfsense_bearer_token` and "
+    "`surfsense_refresh_token` into those env vars."
+)
+
+
+@dataclass
+class TokenBundle:
+    """Mutable token state — refresh hook updates ``access_token`` in place."""
+
+    access_token: str
+    refresh_token: str | None = None
+    # ``mode`` is informational only ("local" or "jwt"); used in error messages.
+    mode: str = "jwt"
+
+
+# ---------------------------------------------------------------------------
+# Token acquisition
+# ---------------------------------------------------------------------------
+
+
+async def acquire_token(config: Config, *, http: httpx.AsyncClient | None = None) -> TokenBundle:
+    """Resolve credentials → ``TokenBundle``.
+
+    Precedence:
+
+    1. ``SURFSENSE_JWT`` set → use it directly. Refresh token captured if
+       supplied.
+    2. ``SURFSENSE_USER_EMAIL`` + ``SURFSENSE_USER_PASSWORD`` set →
+       form-encoded POST to ``/auth/jwt/login``.
+    3. Neither → raise ``CredentialError``.
+
+    The optional ``http`` argument lets tests inject a mocked client; if
+    omitted a one-shot client is created for the login call only.
+    """
+
+    if config.has_jwt_mode():
+        return TokenBundle(
+            access_token=config.surfsense_jwt or "",
+            refresh_token=config.surfsense_refresh_token,
+            mode="jwt",
+        )
+
+    if config.has_local_mode():
+        async def _login(client: httpx.AsyncClient) -> TokenBundle:
+            response = await client.post(
+                f"{config.surfsense_api_base}/auth/jwt/login",
+                data={
+                    "username": config.surfsense_user_email,
+                    "password": config.surfsense_user_password,
+                },
+                headers={"Accept": "application/json"},
+            )
+            if response.status_code != 200:
+                raise CredentialError(
+                    f"LOCAL login failed (HTTP {response.status_code}): "
+                    f"{_safe_text(response)}"
+                )
+            payload = response.json()
+            access = payload.get("access_token")
+            if not access:
+                raise CredentialError(
+                    f"LOCAL login response missing access_token: {payload!r}"
+                )
+            return TokenBundle(
+                access_token=access,
+                refresh_token=payload.get("refresh_token") or None,
+                mode="local",
+            )
+
+        if http is not None:
+            return await _login(http)
+        async with httpx.AsyncClient(timeout=httpx.Timeout(30.0, connect=10.0)) as client:
+            return await _login(client)
+
+    raise CredentialError(_NO_CREDENTIALS_MESSAGE)
+
+
+def _safe_text(response: httpx.Response, *, limit: int = 200) -> str:
+    body = response.text or ""
+    if len(body) > limit:
+        return body[:limit] + "…"
+    return body
+
+
+# ---------------------------------------------------------------------------
+# httpx client + 401 auto-refresh
+# ---------------------------------------------------------------------------
+
+
+class _AuthState:
+    """Shared mutable holder closed over by the auth event hook.
+
+    Kept private so callers can't accidentally mutate the access token
+    out-of-band; ``client_with_auth`` returns the client directly.
+    """
+
+    def __init__(self, config: Config, tokens: TokenBundle) -> None:
+        self.config = config
+        self.tokens = tokens
+        self._refresh_in_flight: bool = False
+
+
+def _build_auth_request(state: _AuthState, request: httpx.Request) -> None:
+    """Stamp the current bearer onto ``request`` (request-event hook)."""
+
+    request.headers["Authorization"] = f"Bearer {state.tokens.access_token}"
+
+
+async def _refresh_access_token(
+    state: _AuthState, transport: httpx.AsyncBaseTransport | None = None
+) -> bool:
+    """POST ``/auth/jwt/refresh`` with the current refresh token.
+
+    Returns ``True`` on success and updates ``state.tokens`` in place.
+    Returns ``False`` if no refresh token is configured or the call fails.
+    Recursive 401s are avoided by using a *new* client without the auth
+    hook.
+    """
+
+    refresh = state.tokens.refresh_token
+    if not refresh:
+        return False
+    try:
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(15.0, connect=5.0),
+            transport=transport,
+        ) as inner:
+            response = await inner.post(
+                f"{state.config.surfsense_api_base}/auth/jwt/refresh",
+                json={"refresh_token": refresh},
+                headers={"Accept": "application/json"},
+            )
+    except httpx.HTTPError as exc:
+        logger.warning("Token refresh transport error: %s", exc)
+        return False
+    if response.status_code != 200:
+        logger.warning(
+            "Token refresh rejected (HTTP %s): %s",
+            response.status_code,
+            _safe_text(response),
+        )
+        return False
+    payload = response.json()
+    new_access = payload.get("access_token")
+    if not new_access:
+        logger.warning("Refresh response missing access_token: %r", payload)
+        return False
+    state.tokens.access_token = new_access
+    new_refresh = payload.get("refresh_token")
+    if new_refresh:
+        state.tokens.refresh_token = new_refresh
+    return True
+
+
+def client_with_auth(
+    config: Config,
+    tokens: TokenBundle,
+    *,
+    timeout: float = 60.0,
+    transport: httpx.AsyncBaseTransport | None = None,
+    base_url: str | None = None,
+) -> httpx.AsyncClient:
+    """Build a single shared ``httpx.AsyncClient`` for the SurfSense API.
+
+    * Stamps ``Authorization: Bearer <jwt>`` on every outgoing request.
+    * On any 401 response, attempts a single refresh (if a refresh token
+      is configured) and retries the original request once. The retry
+      uses a fresh stamping of the bearer header, so a successful
+      refresh transparently unblocks long runs.
+    * The retry is best-effort — repeated 401s after a refresh attempt
+      are surfaced to the caller so they can re-auth manually.
+
+    Pass ``base_url`` to scope a sub-client (e.g. tests). The default
+    keeps full URLs in calling code, which makes route-spec citations in
+    the codebase easier to grep.
+    """
+
+    state = _AuthState(config, tokens)
+
+    async def _request_hook(request: httpx.Request) -> None:
+        _build_auth_request(state, request)
+
+    # ``send`` is overridden in ``_AuthAwareClient`` to retry once on 401
+    # after refreshing the bearer. httpx's response event-hook can't
+    # *replace* a response, so we need a subclass to do the replay.
+    client = _AuthAwareClient(
+        state=state,
+        transport=transport,
+        timeout=httpx.Timeout(timeout, connect=10.0),
+        base_url=base_url or "",
+        event_hooks={"request": [_request_hook]},
+    )
+    return client
+
+
+class _AuthAwareClient(httpx.AsyncClient):
+    """``AsyncClient`` that retries once on 401 after refreshing the token."""
+
+    def __init__(self, *, state: _AuthState, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self._auth_state = state
+
+    async def send(  # type: ignore[override]
+        self, request: httpx.Request, **kwargs: Any
+    ) -> httpx.Response:
+        response = await super().send(request, **kwargs)
+        if response.status_code != 401:
+            return response
+        # Don't refresh while a refresh is itself in flight.
+        if self._auth_state._refresh_in_flight:
+            return response
+        self._auth_state._refresh_in_flight = True
+        try:
+            refreshed = await _refresh_access_token(self._auth_state)
+        finally:
+            self._auth_state._refresh_in_flight = False
+        if not refreshed:
+            return response
+        # Re-stamp and replay once. ``request`` is reusable.
+        await response.aclose()
+        request.headers["Authorization"] = f"Bearer {self._auth_state.tokens.access_token}"
+        return await super().send(request, **kwargs)
+
+
+__all__ = [
+    "CredentialError",
+    "TokenBundle",
+    "acquire_token",
+    "client_with_auth",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/cli.py b/surfsense_evals/src/surfsense_evals/core/cli.py
new file mode 100644
index 0000000000..3d4d0fd24e
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/cli.py
@@ -0,0 +1,790 @@
+"""Argparse CLI for ``python -m surfsense_evals``.
+
+Subcommands:
+
+* ``setup    --suite <name> --provider-model <slug> [--agent-llm-id <int>]``
+* ``teardown --suite <name>``
+* ``models  list [--provider openrouter] [--grep <s>]``
+* ``suites  list``
+* ``benchmarks list [--suite <name>]``
+* ``ingest <suite> <benchmark> [benchmark flags]``
+* ``run    <suite> <benchmark> [benchmark flags]``
+* ``report --suite <name> [--benchmark <name>]``
+
+The ``ingest`` / ``run`` subparsers are built dynamically from the
+registry — adding a new benchmark only requires registering it; the
+CLI surface comes for free. ``add_run_args`` lets each benchmark
+publish its own flags.
+
+Design choices worth flagging:
+
+* ``setup`` rejects ``agent_llm_id == 0`` (Auto / LiteLLM router) so
+  per-question accuracy is reproducible.
+* ``setup`` validates that the picked LLM config has
+  ``provider == "OPENROUTER"`` and ``model_name == --provider-model``
+  before declaring success — both arms of the head-to-head must hit
+  the same OpenRouter slug.
+* Lifecycle state is keyed by suite, so ``setup --suite legal`` does
+  not touch ``medical``'s SearchSpace, and vice versa.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Any
+
+import sys
+
+import httpx
+from rich.console import Console
+from rich.table import Table
+
+# Windows' legacy console (cp1252) crashes when Rich tries to write characters
+# outside the active codepage (e.g. '->', em-dashes, box-drawing). Force UTF-8
+# on stdout/stderr and disable Rich's legacy_windows render path so the file
+# stream is used directly. Modern Windows (>=10, VS Code terminal, Windows
+# Terminal, PowerShell, cmd) all interpret ANSI escapes natively.
+if sys.platform == "win32":
+    for _stream in (sys.stdout, sys.stderr):
+        try:
+            _stream.reconfigure(encoding="utf-8", errors="replace")
+        except (AttributeError, ValueError):
+            pass
+
+from . import registry
+from .auth import CredentialError, acquire_token, client_with_auth
+from .clients import SearchSpaceClient
+from .clients.search_space import LlmPreferences
+from .config import (
+    DEFAULT_SCENARIO,
+    SCENARIOS,
+    Config,
+    SuiteState,
+    clear_suite_state,
+    get_suite_state,
+    load_config,
+    set_suite_state,
+    utc_iso_timestamp,
+)
+from .vision_llm import VisionConfigError, resolve_vision_llm
+
+logger = logging.getLogger("surfsense_evals")
+console = Console(legacy_windows=False)
+
+
+# ---------------------------------------------------------------------------
+# Discovery
+# ---------------------------------------------------------------------------
+
+
+def _discover_suites() -> list[str]:
+    """Trigger ``register(...)`` for every benchmark.
+
+    Imported lazily so ``models list`` (which doesn't need any
+    benchmark) still runs fast.
+    """
+
+    from surfsense_evals.suites import discover_suites
+
+    return discover_suites()
+
+
+# ---------------------------------------------------------------------------
+# Global LLM config fetcher (used by setup + models list)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LlmConfigEntry:
+    id: int
+    name: str
+    provider: str
+    model_name: str
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> LlmConfigEntry:
+        return cls(
+            id=int(payload["id"]),
+            name=str(payload.get("name", "")),
+            provider=str(payload.get("provider", "")).upper(),
+            model_name=str(payload.get("model_name", "")),
+            raw=payload,
+        )
+
+
+async def _list_global_llm_configs(http: httpx.AsyncClient, base: str) -> list[LlmConfigEntry]:
+    response = await http.get(
+        f"{base}/api/v1/global-new-llm-configs",
+        headers={"Accept": "application/json"},
+    )
+    response.raise_for_status()
+    payload = response.json()
+    if not isinstance(payload, list):
+        raise RuntimeError(f"Unexpected /global-new-llm-configs payload: {payload!r}")
+    return [LlmConfigEntry.from_payload(item) for item in payload]
+
+
+def _resolve_openrouter_id(
+    candidates: list[LlmConfigEntry],
+    provider_model: str,
+    *,
+    explicit_id: int | None,
+) -> int:
+    """Resolve the SurfSense LLM id for ``provider_model``.
+
+    Behaviour:
+
+    * If ``explicit_id`` is given: return it directly. The caller is
+      then expected to GET-validate that the row's
+      ``provider == "OPENROUTER"`` and ``model_name`` matches the slug.
+      That branch supports positive BYOK ``NewLLMConfig`` rows whose
+      slugs may overlap with global OpenRouter virtuals.
+    * Otherwise: filter to ``provider == "OPENROUTER"`` and
+      ``model_name == provider_model``. Expect exactly one match —
+      raise with a friendly message otherwise.
+    """
+
+    if explicit_id is not None:
+        return explicit_id
+
+    matches = [
+        c for c in candidates if c.provider == "OPENROUTER" and c.model_name == provider_model
+    ]
+    if not matches:
+        sample = ", ".join(
+            f"{c.model_name} (id={c.id})" for c in candidates if c.provider == "OPENROUTER"
+        )[:600]
+        raise RuntimeError(
+            f"No OpenRouter config found for slug '{provider_model}'. "
+            "Make sure `openrouter_integration.enabled: true` in "
+            "global_llm_config.yaml and that the Celery worker has "
+            "finished its first refresh (the catalogue is fetched at "
+            "Celery startup per `app/celery_app.py`). "
+            f"Available OpenRouter slugs (sample): {sample or '<none>'}.\n"
+            "Browse with: python -m surfsense_evals models list --grep <substring>"
+        )
+    if len(matches) > 1:
+        listing = "\n".join(f"  id={c.id}  name={c.name!r}" for c in matches)
+        raise RuntimeError(
+            f"Multiple OpenRouter configs for slug '{provider_model}':\n{listing}\n"
+            "Pass --agent-llm-id <id> to disambiguate."
+        )
+    return matches[0].id
+
+
+# ---------------------------------------------------------------------------
+# Subcommand implementations
+# ---------------------------------------------------------------------------
+
+
+async def _cmd_setup(args: argparse.Namespace) -> int:
+    suite = args.suite
+    provider_model: str = args.provider_model
+    explicit_id: int | None = args.agent_llm_id
+    scenario: str = args.scenario
+    vision_llm_slug: str | None = args.vision_llm
+    native_arm_model: str | None = args.native_arm_model
+    skip_vision_setup: bool = args.no_vision_llm_setup
+
+    if explicit_id == 0:
+        console.print(
+            "[red]agent_llm_id == 0 (Auto / LiteLLM router) is not allowed — "
+            "results would not be reproducible.[/red]"
+        )
+        return 2
+
+    if scenario not in SCENARIOS:
+        console.print(
+            f"[red]Unknown scenario {scenario!r}. Pick one of: "
+            f"{', '.join(SCENARIOS)}[/red]"
+        )
+        return 2
+
+    # Scenario-specific validation. Each branch documents WHY the rule
+    # exists so the operator's mental model matches what the runner does.
+    if scenario == "cost-arbitrage":
+        if not native_arm_model:
+            console.print(
+                "[red]--scenario cost-arbitrage requires --native-arm-model "
+                "<vision-capable slug>.[/red] The native arm needs a vision "
+                "model to fairly answer image-bearing questions; SurfSense "
+                "answers from already-extracted text via --provider-model."
+            )
+            return 2
+        if native_arm_model == provider_model:
+            console.print(
+                "[yellow]--native-arm-model equals --provider-model in "
+                "cost-arbitrage; that's degenerate (same as head-to-head). "
+                "Pick a different slug or switch to --scenario head-to-head.[/yellow]"
+            )
+    elif scenario in ("head-to-head", "symmetric-cheap"):
+        if native_arm_model:
+            console.print(
+                f"[yellow]--native-arm-model is ignored for --scenario {scenario} "
+                f"(both arms answer with --provider-model={provider_model!r}).[/yellow]"
+            )
+            native_arm_model = None  # don't persist a stale value
+
+    config = load_config()
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    async with client_with_auth(config, token) as http:
+        candidates = await _list_global_llm_configs(http, config.surfsense_api_base)
+
+        try:
+            agent_llm_id = _resolve_openrouter_id(
+                candidates, provider_model, explicit_id=explicit_id
+            )
+        except RuntimeError as exc:
+            console.print(f"[red]{exc}[/red]")
+            return 2
+
+        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
+        existing = get_suite_state(config, suite)
+        if existing is not None:
+            try:
+                row = await ss_client.get(existing.search_space_id)
+                console.print(
+                    f"Reusing existing SearchSpace [cyan]{row.name}[/cyan] "
+                    f"(id={row.id}) for suite [bold]{suite}[/bold]."
+                )
+                search_space_id = row.id
+            except httpx.HTTPStatusError as exc:
+                if exc.response.status_code == 404:
+                    console.print(
+                        f"[yellow]state.json pointed at SearchSpace id={existing.search_space_id} "
+                        f"but backend returned 404; creating a fresh one.[/yellow]"
+                    )
+                    existing = None
+                else:
+                    raise
+        if existing is None:
+            ss_name = f"eval-{suite}-{utc_iso_timestamp()}"
+            row = await ss_client.create(
+                ss_name, description=f"surfsense-evals lifecycle ({suite})"
+            )
+            console.print(
+                f"Created SearchSpace [cyan]{row.name}[/cyan] (id={row.id}) "
+                f"for suite [bold]{suite}[/bold]."
+            )
+            search_space_id = row.id
+
+        # Resolve + attach the vision LLM config (unless explicitly skipped).
+        # Asymmetric scenarios make the vision LLM at ingest a hard
+        # requirement — without it, SurfSense's chunks have no image
+        # content and the entire framing collapses.
+        vision_required = scenario in ("symmetric-cheap", "cost-arbitrage")
+        vision_config_id: int | None = None
+        vision_provider_model: str | None = None
+        if not skip_vision_setup and (vision_required or vision_llm_slug is not None):
+            try:
+                vision_candidates = await ss_client.list_global_vision_llm_configs()
+                resolved = resolve_vision_llm(
+                    vision_candidates, explicit_slug=vision_llm_slug
+                )
+            except VisionConfigError as exc:
+                console.print(f"[red]{exc}[/red]")
+                return 2
+            vision_config_id = resolved.config_id
+            vision_provider_model = resolved.provider_model
+            console.print(
+                f"Vision LLM at ingest: [cyan]{vision_provider_model}[/cyan] "
+                f"(id={vision_config_id}, selected_via={resolved.selected_via})."
+            )
+
+        pref_kwargs: dict[str, Any] = {"agent_llm_id": agent_llm_id}
+        if vision_config_id is not None:
+            pref_kwargs["vision_llm_config_id"] = vision_config_id
+
+        await ss_client.set_llm_preferences(search_space_id, **pref_kwargs)
+        prefs = await ss_client.get_llm_preferences(search_space_id)
+        if not _validate_pin(prefs, provider_model):
+            agent = prefs.agent_llm or {}
+            console.print(
+                f"[red]LLM pin validation FAILED.[/red] After PUT, "
+                f"agent_llm.provider={agent.get('provider')!r}, "
+                f"model_name={agent.get('model_name')!r}; expected "
+                f"provider=OPENROUTER, model_name={provider_model!r}."
+            )
+            return 2
+        if vision_config_id is not None and prefs.vision_llm_config_id != vision_config_id:
+            console.print(
+                f"[red]Vision LLM pin validation FAILED.[/red] After PUT, "
+                f"vision_llm_config_id={prefs.vision_llm_config_id!r}; "
+                f"expected {vision_config_id!r}."
+            )
+            return 2
+
+        suite_state = SuiteState(
+            search_space_id=search_space_id,
+            agent_llm_id=agent_llm_id,
+            provider_model=provider_model,
+            created_at=utc_iso_timestamp(),
+            ingestion_maps=existing.ingestion_maps if existing else {},
+            scenario=scenario,
+            vision_llm_config_id=vision_config_id,
+            vision_provider_model=vision_provider_model,
+            native_arm_model=native_arm_model,
+        )
+        set_suite_state(config, suite, suite_state)
+
+    summary_bits = [
+        f"suite={suite!r}",
+        f"scenario={scenario!r}",
+        f"search_space_id={suite_state.search_space_id}",
+        f"agent_llm_id={suite_state.agent_llm_id}",
+        f"provider_model={suite_state.provider_model!r}",
+    ]
+    if suite_state.vision_provider_model:
+        summary_bits.append(f"vision_provider_model={suite_state.vision_provider_model!r}")
+    if suite_state.native_arm_model:
+        summary_bits.append(f"native_arm_model={suite_state.native_arm_model!r}")
+    console.print(f"[green]setup OK[/green] {' '.join(summary_bits)}")
+    return 0
+
+
+def _validate_pin(prefs: LlmPreferences, provider_model: str) -> bool:
+    agent = prefs.agent_llm or {}
+    return (
+        str(agent.get("provider", "")).upper() == "OPENROUTER"
+        and str(agent.get("model_name", "")) == provider_model
+    )
+
+
+async def _cmd_teardown(args: argparse.Namespace) -> int:
+    suite = args.suite
+    config = load_config()
+    state = get_suite_state(config, suite)
+    if state is None:
+        console.print(f"[yellow]No state for suite {suite!r}; nothing to tear down.[/yellow]")
+        return 0
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+    async with client_with_auth(config, token) as http:
+        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
+        try:
+            await ss_client.delete(state.search_space_id)
+        except httpx.HTTPStatusError as exc:
+            console.print(
+                f"[yellow]DELETE failed (HTTP {exc.response.status_code}); "
+                "clearing state.json anyway.[/yellow]"
+            )
+    clear_suite_state(config, suite)
+    console.print(
+        f"[green]teardown OK[/green] suite={suite!r} "
+        f"(SearchSpace soft-deleted, state.json slot cleared)."
+    )
+    return 0
+
+
+async def _cmd_models_list(args: argparse.Namespace) -> int:
+    config = load_config()
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+    async with client_with_auth(config, token) as http:
+        entries = await _list_global_llm_configs(http, config.surfsense_api_base)
+    grep = (args.grep or "").lower()
+    provider_filter = (args.provider or "").upper()
+    rows: list[LlmConfigEntry] = []
+    for e in entries:
+        if provider_filter and e.provider != provider_filter:
+            continue
+        if grep and grep not in e.model_name.lower() and grep not in e.name.lower():
+            continue
+        rows.append(e)
+    table = Table(
+        title=f"Global LLM configs ({len(rows)} of {len(entries)})",
+        show_lines=False,
+    )
+    table.add_column("id", justify="right", style="cyan")
+    table.add_column("provider", style="magenta")
+    table.add_column("model_name", style="green")
+    table.add_column("name")
+    for e in sorted(rows, key=lambda x: (x.provider, x.model_name)):
+        table.add_row(str(e.id), e.provider, e.model_name, e.name)
+    console.print(table)
+    return 0
+
+
+def _cmd_suites_list(_args: argparse.Namespace) -> int:
+    _discover_suites()
+    suites = registry.list_suites()
+    if not suites:
+        console.print(
+            "[yellow]No suites registered. Drop a benchmark under "
+            "src/surfsense_evals/suites/<domain>/<benchmark>/.[/yellow]"
+        )
+        return 0
+    table = Table(title=f"Registered suites ({len(suites)})")
+    table.add_column("suite", style="bold")
+    table.add_column("benchmarks", style="green")
+    for suite in suites:
+        names = [b.name for b in registry.list_benchmarks(suite)]
+        table.add_row(suite, ", ".join(names) or "<none>")
+    console.print(table)
+    return 0
+
+
+def _cmd_benchmarks_list(args: argparse.Namespace) -> int:
+    _discover_suites()
+    benchmarks = registry.list_benchmarks(args.suite)
+    if not benchmarks:
+        console.print("[yellow]No benchmarks registered.[/yellow]")
+        return 0
+    table = Table(title=f"Benchmarks ({len(benchmarks)})")
+    table.add_column("suite", style="bold")
+    table.add_column("name", style="cyan")
+    table.add_column("headline", justify="center")
+    table.add_column("description")
+    for b in benchmarks:
+        table.add_row(
+            b.suite,
+            b.name,
+            "yes" if b.headline else "no",
+            getattr(b, "description", ""),
+        )
+    console.print(table)
+    return 0
+
+
+async def _cmd_ingest(args: argparse.Namespace) -> int:
+    benchmark = registry.get(args.suite, args.benchmark)
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(
+            f"[red]No setup for suite {args.suite!r}. Run "
+            f"`python -m surfsense_evals setup --suite {args.suite} "
+            f"--provider-model <slug>` first.[/red]"
+        )
+        return 2
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    # Forward parsed CLI flags into ingest() so a benchmark can honour
+    # its own flags (e.g. MIRAGE's --skip-snippet-filter / --corpus).
+    extra_kwargs = {
+        k: v
+        for k, v in vars(args).items()
+        if k not in {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
+    }
+    async with client_with_auth(config, token) as http:
+        ctx = registry.RunContext(
+            suite=args.suite,
+            benchmark=args.benchmark,
+            config=config,
+            suite_state=state,
+            http=http,
+        )
+        await benchmark.ingest(ctx, **extra_kwargs)
+    console.print(f"[green]ingest OK[/green] {args.suite}/{args.benchmark}")
+    return 0
+
+
+async def _cmd_run(args: argparse.Namespace) -> int:
+    benchmark = registry.get(args.suite, args.benchmark)
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(
+            f"[red]No setup for suite {args.suite!r}. Run "
+            f"`python -m surfsense_evals setup --suite {args.suite} "
+            f"--provider-model <slug>` first.[/red]"
+        )
+        return 2
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    extra_kwargs = {
+        k: v
+        for k, v in vars(args).items()
+        if k not in {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
+    }
+    async with client_with_auth(config, token) as http:
+        ctx = registry.RunContext(
+            suite=args.suite,
+            benchmark=args.benchmark,
+            config=config,
+            suite_state=state,
+            http=http,
+        )
+        artifact = await benchmark.run(ctx, **extra_kwargs)
+
+    console.print(
+        f"[green]run OK[/green] {args.suite}/{args.benchmark} → "
+        f"{artifact.raw_path}"
+    )
+    return 0
+
+
+async def _cmd_report(args: argparse.Namespace) -> int:
+    from .report import write_report
+
+    benchmark_filter = args.benchmark
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(f"[red]No setup for suite {args.suite!r}.[/red]")
+        return 2
+    benchmarks = registry.list_benchmarks(args.suite)
+    if benchmark_filter:
+        benchmarks = [b for b in benchmarks if b.name == benchmark_filter]
+        if not benchmarks:
+            console.print(
+                f"[red]No registered benchmark named {benchmark_filter!r} in suite {args.suite!r}.[/red]"
+            )
+            return 2
+
+    artifacts = _collect_artifacts(config, args.suite, [b.name for b in benchmarks])
+    if not artifacts:
+        console.print(
+            "[yellow]No run artifacts found under "
+            f"{config.suite_runs_dir(args.suite)}. Run a benchmark first.[/yellow]"
+        )
+        return 1
+
+    grouped: dict[str, list[registry.RunArtifact]] = {}
+    for art in artifacts:
+        grouped.setdefault(art.benchmark, []).append(art)
+    sections: list[registry.ReportSection] = []
+    for benchmark in benchmarks:
+        if benchmark.name not in grouped:
+            continue
+        sections.append(benchmark.report_section(grouped[benchmark.name]))
+
+    summary_path = write_report(
+        config=config,
+        suite=args.suite,
+        sections=sections,
+        run_timestamp=utc_iso_timestamp(),
+    )
+    console.print(f"[green]report OK[/green] → {summary_path}")
+    return 0
+
+
+def _collect_artifacts(
+    config: Config, suite: str, benchmark_names: list[str]
+) -> list[registry.RunArtifact]:
+    """Walk ``data/<suite>/runs/*/<benchmark>/`` for the latest artifacts.
+
+    Reads any ``run_artifact.json`` written by a benchmark runner. The
+    runner is responsible for writing this manifest alongside its raw
+    JSONL so the report writer doesn't have to know benchmark-specific
+    metric shapes.
+    """
+
+    runs_dir = config.suite_runs_dir(suite)
+    if not runs_dir.exists():
+        return []
+    artifacts: list[registry.RunArtifact] = []
+    by_bench: dict[str, registry.RunArtifact] = {}
+    for ts_dir in sorted(runs_dir.iterdir()):
+        if not ts_dir.is_dir():
+            continue
+        for bench_name in benchmark_names:
+            bench_dir = ts_dir / bench_name
+            manifest = bench_dir / "run_artifact.json"
+            if not manifest.exists():
+                continue
+            try:
+                with manifest.open("r", encoding="utf-8") as fh:
+                    payload = json.load(fh)
+            except (OSError, json.JSONDecodeError):
+                continue
+            artifact = registry.RunArtifact(
+                suite=suite,
+                benchmark=bench_name,
+                run_timestamp=ts_dir.name,
+                raw_path=bench_dir / payload.get("raw_path", "raw.jsonl"),
+                metrics=payload.get("metrics", {}),
+                extra=payload.get("extra", {}),
+            )
+            # Latest run wins per benchmark.
+            by_bench[bench_name] = artifact
+    artifacts = list(by_bench.values())
+    return artifacts
+
+
+# ---------------------------------------------------------------------------
+# Argparse wiring
+# ---------------------------------------------------------------------------
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="surfsense-evals",
+        description="SurfSense evaluation harness — domain-agnostic core + pluggable suites.",
+    )
+    parser.add_argument(
+        "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"]
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    p_setup = sub.add_parser("setup", help="Create per-suite SearchSpace + pin LLM.")
+    p_setup.add_argument("--suite", required=True)
+    p_setup.add_argument(
+        "--provider-model",
+        required=True,
+        help=(
+            "OpenRouter slug for the SurfSense answer LLM (and the native arm "
+            "too unless --native-arm-model is set), e.g. "
+            "'anthropic/claude-sonnet-4.5'."
+        ),
+    )
+    p_setup.add_argument(
+        "--agent-llm-id",
+        type=int,
+        default=None,
+        help="Optional override for BYOK NewLLMConfig rows.",
+    )
+    p_setup.add_argument(
+        "--scenario",
+        choices=SCENARIOS,
+        default=DEFAULT_SCENARIO,
+        help=(
+            "head-to-head (default): both arms answer with --provider-model; "
+            "symmetric-cheap: both arms use the same cheap text-only slug, "
+            "SurfSense pre-extracted images at ingest with a vision LLM; "
+            "cost-arbitrage: native arm uses --native-arm-model (vision), "
+            "SurfSense uses --provider-model (cheap, text-only) over chunks "
+            "the vision LLM already extracted at ingest."
+        ),
+    )
+    p_setup.add_argument(
+        "--vision-llm",
+        default=None,
+        metavar="SLUG",
+        help=(
+            "OpenRouter slug for the vision LLM SurfSense uses at ingest "
+            "when --use-vision-llm is on. If omitted in symmetric-cheap / "
+            "cost-arbitrage, the strongest registered vision config is "
+            "auto-picked (priority: claude-sonnet-4.5 > claude-opus-4.7 > "
+            "gpt-5 > gemini-2.5-pro)."
+        ),
+    )
+    p_setup.add_argument(
+        "--native-arm-model",
+        default=None,
+        metavar="SLUG",
+        help=(
+            "Required for --scenario cost-arbitrage. OpenRouter slug used "
+            "by the native_pdf arm only; SurfSense answers with "
+            "--provider-model. Ignored for head-to-head / symmetric-cheap."
+        ),
+    )
+    p_setup.add_argument(
+        "--no-vision-llm-setup",
+        action="store_true",
+        help=(
+            "Skip attaching a vision LLM config to the SearchSpace even if "
+            "the scenario would normally require one. Use when you want to "
+            "keep whatever is already attached (e.g. a per-user config)."
+        ),
+    )
+    p_setup.set_defaults(_func=_cmd_setup, _async=True)
+
+    p_teardown = sub.add_parser("teardown", help="Soft-delete the suite SearchSpace + clear state slot.")
+    p_teardown.add_argument("--suite", required=True)
+    p_teardown.set_defaults(_func=_cmd_teardown, _async=True)
+
+    p_models = sub.add_parser("models", help="LLM-config discovery helpers.")
+    models_sub = p_models.add_subparsers(dest="subcommand", required=True)
+    p_models_list = models_sub.add_parser("list", help="List global LLM configs.")
+    p_models_list.add_argument("--provider", default=None, help="Filter by provider, e.g. openrouter")
+    p_models_list.add_argument("--grep", default=None, help="Substring filter on name / model_name.")
+    p_models_list.set_defaults(_func=_cmd_models_list, _async=True)
+
+    p_suites = sub.add_parser("suites", help="List registered suites.")
+    suites_sub = p_suites.add_subparsers(dest="subcommand", required=True)
+    p_suites_list = suites_sub.add_parser("list", help="List suites.")
+    p_suites_list.set_defaults(_func=_cmd_suites_list, _async=False)
+
+    p_benchmarks = sub.add_parser("benchmarks", help="List registered benchmarks.")
+    bench_sub = p_benchmarks.add_subparsers(dest="subcommand", required=True)
+    p_bench_list = bench_sub.add_parser("list", help="List benchmarks.")
+    p_bench_list.add_argument("--suite", default=None)
+    p_bench_list.set_defaults(_func=_cmd_benchmarks_list, _async=False)
+
+    # Dynamic ingest / run subcommands need the registry populated, so
+    # discover up-front (cheap on import — modules just register).
+    _discover_suites()
+
+    p_ingest = sub.add_parser("ingest", help="Ingest a benchmark's corpus.")
+    ingest_sub = p_ingest.add_subparsers(dest="suite", required=True)
+    for suite in registry.list_suites():
+        suite_parser = ingest_sub.add_parser(suite, help=f"Ingest a {suite} benchmark.")
+        suite_bench = suite_parser.add_subparsers(dest="benchmark", required=True)
+        for benchmark in registry.list_benchmarks(suite):
+            bp = suite_bench.add_parser(benchmark.name, help=getattr(benchmark, "description", benchmark.name))
+            if hasattr(benchmark, "add_run_args"):
+                benchmark.add_run_args(bp)
+            bp.set_defaults(_func=_cmd_ingest, _async=True)
+
+    p_run = sub.add_parser("run", help="Run a benchmark.")
+    run_sub = p_run.add_subparsers(dest="suite", required=True)
+    for suite in registry.list_suites():
+        suite_parser = run_sub.add_parser(suite, help=f"Run a {suite} benchmark.")
+        suite_bench = suite_parser.add_subparsers(dest="benchmark", required=True)
+        for benchmark in registry.list_benchmarks(suite):
+            bp = suite_bench.add_parser(benchmark.name, help=getattr(benchmark, "description", benchmark.name))
+            if hasattr(benchmark, "add_run_args"):
+                benchmark.add_run_args(bp)
+            bp.set_defaults(_func=_cmd_run, _async=True)
+
+    p_report = sub.add_parser("report", help="Aggregate latest run artifacts into a summary.")
+    p_report.add_argument("--suite", required=True)
+    p_report.add_argument("--benchmark", default=None, help="Optional: report only this benchmark.")
+    p_report.set_defaults(_func=_cmd_report, _async=True)
+
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = _build_parser()
+    args = parser.parse_args(argv)
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+    func = getattr(args, "_func", None)
+    if func is None:
+        parser.print_help()
+        return 2
+    is_async = getattr(args, "_async", False)
+    try:
+        if is_async:
+            return asyncio.run(func(args))
+        return func(args)
+    except KeyboardInterrupt:
+        console.print("[yellow]Interrupted.[/yellow]")
+        return 130
+    except Exception as exc:  # noqa: BLE001
+        logger.exception("CLI command failed")
+        console.print(f"[red]Command failed: {exc}[/red]")
+        return 1
+
+
+if __name__ == "__main__":  # pragma: no cover
+    sys.exit(main())
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/__init__.py b/surfsense_evals/src/surfsense_evals/core/clients/__init__.py
new file mode 100644
index 0000000000..37246c2212
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/__init__.py
@@ -0,0 +1,14 @@
+"""HTTP clients for the SurfSense API. All share one ``httpx.AsyncClient``."""
+
+from __future__ import annotations
+
+from .documents import DocumentsClient
+from .new_chat import NewChatClient, StreamedAnswer
+from .search_space import SearchSpaceClient
+
+__all__ = [
+    "DocumentsClient",
+    "NewChatClient",
+    "SearchSpaceClient",
+    "StreamedAnswer",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/documents.py b/surfsense_evals/src/surfsense_evals/core/clients/documents.py
new file mode 100644
index 0000000000..02bcf74da3
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/documents.py
@@ -0,0 +1,277 @@
+"""Client for ``/api/v1/documents/{fileupload,status,{id}/chunks}``.
+
+Verified against:
+
+* ``surfsense_backend/app/routes/documents_routes.py:122-292`` (POST fileupload)
+* ``surfsense_backend/app/routes/documents_routes.py:806-871`` (GET status batch)
+* ``surfsense_backend/app/routes/documents_routes.py:1062-1128`` (GET {id}/chunks paginated)
+
+Document processing is asynchronous:
+* ``POST /documents/fileupload`` returns immediately with
+  ``document_ids`` in ``pending``;
+* a Celery worker moves each through ``processing → ready/failed``;
+* the harness polls ``GET /documents/status?document_ids=...`` until
+  every doc is ``ready`` (otherwise the retriever sees an empty corpus
+  and accuracy numbers are meaningless).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import mimetypes
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class FileUploadResult:
+    """Mirrors the JSON returned by ``POST /documents/fileupload``."""
+
+    document_ids: list[int]
+    duplicate_document_ids: list[int]
+    total_files: int
+    pending_files: int
+    skipped_duplicates: int
+    message: str = ""
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> FileUploadResult:
+        return cls(
+            document_ids=[int(x) for x in payload.get("document_ids", [])],
+            duplicate_document_ids=[int(x) for x in payload.get("duplicate_document_ids", [])],
+            total_files=int(payload.get("total_files", 0)),
+            pending_files=int(payload.get("pending_files", 0)),
+            skipped_duplicates=int(payload.get("skipped_duplicates", 0)),
+            message=str(payload.get("message", "")),
+        )
+
+
+@dataclass
+class DocumentStatus:
+    document_id: int
+    title: str
+    document_type: str
+    state: str
+    reason: str | None = None
+
+    @property
+    def is_ready(self) -> bool:
+        return self.state == "ready"
+
+    @property
+    def is_failed(self) -> bool:
+        return self.state == "failed"
+
+
+@dataclass
+class ChunkRow:
+    id: int
+    document_id: int
+    content: str = ""
+    raw: dict[str, Any] = field(default_factory=dict)
+
+
+class DocumentProcessingFailed(RuntimeError):
+    """Raised when a polled document lands in ``failed``."""
+
+    def __init__(self, statuses: Sequence[DocumentStatus]) -> None:
+        details = ", ".join(
+            f"id={s.document_id} ({s.title!r}): {s.reason or 'unknown'}"
+            for s in statuses
+        )
+        super().__init__(f"Document(s) failed to process: {details}")
+        self.statuses = list(statuses)
+
+
+class DocumentProcessingTimeout(RuntimeError):
+    """Raised when polling exceeds the per-doc timeout budget."""
+
+
+class DocumentsClient:
+    """Document upload + status polling + chunk listing."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    # ------------------------------------------------------------------
+    # upload
+    # ------------------------------------------------------------------
+
+    async def upload(
+        self,
+        files: Iterable[Path],
+        *,
+        search_space_id: int,
+        should_summarize: bool = False,
+        use_vision_llm: bool = False,
+        processing_mode: str = "basic",
+    ) -> FileUploadResult:
+        """Upload files to ``/api/v1/documents/fileupload``.
+
+        ``files`` is materialised to a list because we may need to
+        re-read on retry. Caller is responsible for ensuring each path
+        exists and respects the per-file size cap (50 MB backend default).
+        """
+
+        materialised = [Path(p) for p in files]
+        if not materialised:
+            return FileUploadResult(
+                document_ids=[],
+                duplicate_document_ids=[],
+                total_files=0,
+                pending_files=0,
+                skipped_duplicates=0,
+                message="No files supplied",
+            )
+
+        opened: list[tuple[str, Any]] = []
+        try:
+            for path in materialised:
+                # ``open`` directly — httpx wraps it in MultipartStream.
+                file_obj = path.open("rb")
+                mime, _ = mimetypes.guess_type(path.name)
+                opened.append(
+                    (
+                        "files",
+                        (path.name, file_obj, mime or "application/octet-stream"),
+                    )
+                )
+
+            response = await self._http.post(
+                f"{self._base}/api/v1/documents/fileupload",
+                data={
+                    "search_space_id": str(search_space_id),
+                    "should_summarize": "true" if should_summarize else "false",
+                    "use_vision_llm": "true" if use_vision_llm else "false",
+                    "processing_mode": processing_mode,
+                },
+                files=opened,
+                # Multipart uploads can be slow for big PDFs; bump per-call.
+                timeout=httpx.Timeout(120.0, connect=10.0),
+            )
+        finally:
+            for _, (_, file_obj, _) in opened:
+                try:
+                    file_obj.close()
+                except Exception:  # noqa: BLE001
+                    pass
+
+        response.raise_for_status()
+        return FileUploadResult.from_payload(response.json())
+
+    # ------------------------------------------------------------------
+    # status polling
+    # ------------------------------------------------------------------
+
+    async def get_status(
+        self, *, search_space_id: int, document_ids: Sequence[int]
+    ) -> list[DocumentStatus]:
+        if not document_ids:
+            return []
+        response = await self._http.get(
+            f"{self._base}/api/v1/documents/status",
+            params={
+                "search_space_id": search_space_id,
+                "document_ids": ",".join(str(d) for d in document_ids),
+            },
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        payload = response.json()
+        return [
+            DocumentStatus(
+                document_id=int(item["id"]),
+                title=str(item.get("title", "")),
+                document_type=str(item.get("document_type", "")),
+                state=str((item.get("status") or {}).get("state", "ready")),
+                reason=(item.get("status") or {}).get("reason"),
+            )
+            for item in payload.get("items", [])
+        ]
+
+    async def wait_until_ready(
+        self,
+        *,
+        search_space_id: int,
+        document_ids: Sequence[int],
+        timeout_s: float = 300.0,
+        initial_poll_s: float = 1.0,
+        max_poll_s: float = 10.0,
+    ) -> list[DocumentStatus]:
+        """Poll ``GET /documents/status`` until every doc is ``ready``.
+
+        Exponential backoff from ``initial_poll_s`` up to ``max_poll_s``.
+        Raises ``DocumentProcessingFailed`` if any doc lands in
+        ``failed`` (with the offending document ids), or
+        ``DocumentProcessingTimeout`` if the budget is exhausted.
+        """
+
+        if not document_ids:
+            return []
+        deadline = asyncio.get_event_loop().time() + timeout_s
+        poll = initial_poll_s
+        while True:
+            statuses = await self.get_status(
+                search_space_id=search_space_id, document_ids=document_ids
+            )
+            failed = [s for s in statuses if s.is_failed]
+            if failed:
+                raise DocumentProcessingFailed(failed)
+            ready = [s for s in statuses if s.is_ready]
+            if len(ready) == len(document_ids):
+                return statuses
+            now = asyncio.get_event_loop().time()
+            if now >= deadline:
+                pending = [s for s in statuses if not s.is_ready and not s.is_failed]
+                pending_ids = [s.document_id for s in pending]
+                raise DocumentProcessingTimeout(
+                    f"Timed out after {timeout_s:.0f}s waiting for documents "
+                    f"(still pending/processing: {pending_ids})"
+                )
+            await asyncio.sleep(min(poll, max(0.1, deadline - now)))
+            poll = min(poll * 1.5, max_poll_s)
+
+    # ------------------------------------------------------------------
+    # chunks (chunk_id -> document_id map)
+    # ------------------------------------------------------------------
+
+    async def list_chunks(
+        self, document_id: int, *, page_size: int = 100
+    ) -> list[ChunkRow]:
+        """Walk ``GET /documents/{id}/chunks`` until ``has_more=False``.
+
+        Used by ingestion to materialise the ``chunk_id -> document_id``
+        map needed for retrieval scoring (CUREv1).
+        """
+
+        rows: list[ChunkRow] = []
+        page = 0
+        while True:
+            response = await self._http.get(
+                f"{self._base}/api/v1/documents/{document_id}/chunks",
+                params={"page": page, "page_size": page_size},
+                headers={"Accept": "application/json"},
+            )
+            response.raise_for_status()
+            payload = response.json()
+            for item in payload.get("items", []):
+                rows.append(
+                    ChunkRow(
+                        id=int(item["id"]),
+                        document_id=document_id,
+                        content=str(item.get("content", "")),
+                        raw=item,
+                    )
+                )
+            if not payload.get("has_more"):
+                break
+            page += 1
+        return rows
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py b/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py
new file mode 100644
index 0000000000..a4c23d010e
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py
@@ -0,0 +1,280 @@
+"""Client for ``/api/v1/threads`` and ``/api/v1/new_chat`` (SSE).
+
+Verified against:
+
+* ``surfsense_backend/app/routes/new_chat_routes.py:793-848`` (POST /threads)
+* ``surfsense_backend/app/routes/new_chat_routes.py:1073-1142`` (DELETE /threads/{id})
+* ``surfsense_backend/app/routes/new_chat_routes.py:1689-1800`` (POST /new_chat SSE)
+* ``surfsense_backend/app/routes/new_chat_routes.py:191-220`` (THREAD_BUSY / TURN_CANCELLING 409)
+* ``surfsense_backend/app/services/streaming/envelope/sse.py`` (wire framing)
+* ``surfsense_backend/app/services/streaming/events/text.py`` (text-delta events)
+* ``surfsense_backend/app/schemas/new_chat.py:234-288`` (NewChatRequest body)
+
+The wire format is "Vercel AI SDK"-flavoured SSE with one event per
+``data: <json>\n\n`` block (or the literal ``data: [DONE]\n\n``
+terminator). Text deltas arrive as ``{"type":"text-delta","id":...,"delta":...}``
+events; we accumulate them per ``id`` and emit the final concatenated
+text plus parsed citations.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from collections.abc import AsyncIterator, Sequence
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+from ..parse import iter_sse_events, parse_citations
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class StreamedAnswer:
+    """Result of a single ``/new_chat`` turn."""
+
+    text: str
+    raw_events: list[dict[str, Any]] = field(default_factory=list)
+    latency_ms: int = 0
+    user_message_id: str | None = None
+    assistant_message_id: str | None = None
+    finished_normally: bool = False
+
+    @property
+    def citations(self) -> list[dict[str, Any]]:
+        """Parsed citation tokens (lazy; small enough to recompute)."""
+
+        return [token.to_dict() for token in parse_citations(self.text)]
+
+
+class ThreadBusyError(RuntimeError):
+    """Raised after exhausting retries on a 409 ``THREAD_BUSY`` / ``TURN_CANCELLING``."""
+
+    def __init__(self, error_code: str, message: str) -> None:
+        super().__init__(f"{error_code}: {message}")
+        self.error_code = error_code
+
+
+class NewChatClient:
+    """Thread create / delete / SSE ask."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    # ------------------------------------------------------------------
+    # threads
+    # ------------------------------------------------------------------
+
+    async def create_thread(
+        self,
+        *,
+        search_space_id: int,
+        title: str = "eval",
+        archived: bool = False,
+        visibility: str = "PRIVATE",
+    ) -> int:
+        response = await self._http.post(
+            f"{self._base}/api/v1/threads",
+            json={
+                "search_space_id": search_space_id,
+                "title": title,
+                "archived": archived,
+                "visibility": visibility,
+            },
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        payload = response.json()
+        return int(payload["id"])
+
+    async def delete_thread(self, thread_id: int) -> None:
+        response = await self._http.delete(
+            f"{self._base}/api/v1/threads/{thread_id}",
+            headers={"Accept": "application/json"},
+        )
+        if response.status_code == 404:
+            return  # idempotent
+        response.raise_for_status()
+
+    # ------------------------------------------------------------------
+    # /new_chat SSE
+    # ------------------------------------------------------------------
+
+    async def ask(
+        self,
+        *,
+        thread_id: int,
+        search_space_id: int,
+        user_query: str,
+        mentioned_document_ids: Sequence[int] | None = None,
+        disabled_tools: Sequence[str] | None = None,
+        max_busy_retries: int = 4,
+        timeout_s: float = 600.0,
+    ) -> StreamedAnswer:
+        """Stream a single turn and return the accumulated answer.
+
+        Honours backend ``THREAD_BUSY`` / ``TURN_CANCELLING`` 409
+        responses by sleeping for the ``Retry-After`` header (or the
+        ``retry-after-ms`` header if present) and replaying. Bounded
+        by ``max_busy_retries`` so a stuck thread never blocks the
+        whole run.
+        """
+
+        body: dict[str, Any] = {
+            "chat_id": thread_id,
+            "search_space_id": search_space_id,
+            "user_query": user_query,
+        }
+        if mentioned_document_ids:
+            body["mentioned_document_ids"] = list(mentioned_document_ids)
+        if disabled_tools:
+            body["disabled_tools"] = list(disabled_tools)
+
+        attempt = 0
+        while True:
+            try:
+                return await self._stream_once(body=body, timeout_s=timeout_s)
+            except ThreadBusyError as exc:
+                attempt += 1
+                if attempt > max_busy_retries:
+                    raise
+                # Cap wait at 30s; backend retry hint is exponential anyway.
+                wait = min(30.0, 0.5 * (2 ** attempt))
+                logger.info(
+                    "thread_id=%s busy (%s); retry %d/%d after %.1fs",
+                    thread_id,
+                    exc.error_code,
+                    attempt,
+                    max_busy_retries,
+                    wait,
+                )
+                await asyncio.sleep(wait)
+
+    async def _stream_once(
+        self,
+        *,
+        body: dict[str, Any],
+        timeout_s: float,
+    ) -> StreamedAnswer:
+        # Per-call timeout — the connect should be quick, the read needs
+        # to outlive the longest LLM completion.
+        timeout = httpx.Timeout(timeout_s, connect=10.0)
+        started = time.monotonic()
+        async with self._http.stream(
+            "POST",
+            f"{self._base}/api/v1/new_chat",
+            json=body,
+            headers={"Accept": "text/event-stream"},
+            timeout=timeout,
+        ) as response:
+            if response.status_code == 409:
+                detail = await self._extract_busy_detail(response)
+                raise ThreadBusyError(
+                    error_code=detail.get("errorCode", "THREAD_BUSY"),
+                    message=detail.get("message", "Thread is busy"),
+                )
+            response.raise_for_status()
+            answer = await self._consume_sse(response)
+        answer.latency_ms = int((time.monotonic() - started) * 1000)
+        return answer
+
+    @staticmethod
+    async def _extract_busy_detail(response: httpx.Response) -> dict[str, Any]:
+        try:
+            payload = json.loads(await response.aread())
+        except (json.JSONDecodeError, ValueError):
+            return {"errorCode": "THREAD_BUSY", "message": response.text}
+        if isinstance(payload, dict) and isinstance(payload.get("detail"), dict):
+            return payload["detail"]
+        return payload if isinstance(payload, dict) else {}
+
+    @staticmethod
+    async def _consume_sse(response: httpx.Response) -> StreamedAnswer:
+        """Walk SSE events, accumulate text-delta payloads.
+
+        Backend events of interest:
+
+        * ``{"type": "text-start", "id": ...}``
+        * ``{"type": "text-delta", "id": ..., "delta": ...}``
+        * ``{"type": "text-end", "id": ...}``
+        * ``{"type": "start", "messageId": ...}``  (top-level message id)
+        * ``{"type": "finish"}``
+        * literal ``[DONE]`` sentinel
+
+        Multiple ``text-start`` blocks can interleave — each gets its
+        own ``id`` and we concatenate them in arrival order. That
+        mirrors the AI SDK client behaviour: one continuous assistant
+        message visible to the user.
+        """
+
+        ordered_text_ids: list[str] = []
+        text_buffers: dict[str, list[str]] = {}
+        raw_events: list[dict[str, Any]] = []
+        user_message_id: str | None = None
+        assistant_message_id: str | None = None
+        finished = False
+
+        async for event in iter_sse_events(_aiter_lines(response)):
+            data = event.data
+            if data == "[DONE]":
+                finished = True
+                continue
+            try:
+                payload = json.loads(data)
+            except (json.JSONDecodeError, ValueError):
+                logger.debug("Skipping non-JSON SSE payload: %r", data[:120])
+                continue
+            if not isinstance(payload, dict):
+                continue
+            raw_events.append(payload)
+            ev_type = payload.get("type")
+            if ev_type == "text-delta":
+                tid = str(payload.get("id", ""))
+                delta = payload.get("delta", "")
+                if not isinstance(delta, str):
+                    continue
+                if tid not in text_buffers:
+                    text_buffers[tid] = []
+                    ordered_text_ids.append(tid)
+                text_buffers[tid].append(delta)
+            elif ev_type == "text-start":
+                tid = str(payload.get("id", ""))
+                if tid and tid not in text_buffers:
+                    text_buffers[tid] = []
+                    ordered_text_ids.append(tid)
+            elif ev_type == "start":
+                msg_id = payload.get("messageId")
+                if isinstance(msg_id, str):
+                    user_message_id = user_message_id or msg_id
+            elif ev_type == "data-user-message-id":
+                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
+                if isinstance(msg_id, str):
+                    user_message_id = msg_id
+            elif ev_type == "data-assistant-message-id":
+                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
+                if isinstance(msg_id, str):
+                    assistant_message_id = msg_id
+            elif ev_type == "finish":
+                finished = True
+
+        text = "".join("".join(text_buffers.get(tid, [])) for tid in ordered_text_ids)
+        return StreamedAnswer(
+            text=text,
+            raw_events=raw_events,
+            user_message_id=user_message_id,
+            assistant_message_id=assistant_message_id,
+            finished_normally=finished,
+        )
+
+
+async def _aiter_lines(response: httpx.Response) -> AsyncIterator[str]:
+    """Adapter so the parser can consume any line iterator (mockable in tests)."""
+
+    async for line in response.aiter_lines():
+        yield line
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/search_space.py b/surfsense_evals/src/surfsense_evals/core/clients/search_space.py
new file mode 100644
index 0000000000..37fa69f806
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/search_space.py
@@ -0,0 +1,207 @@
+"""Client for ``/api/v1/searchspaces`` and ``/api/v1/search-spaces/{id}/llm-preferences``.
+
+Verified against:
+
+* ``surfsense_backend/app/routes/search_spaces_routes.py:116`` (POST create)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:234`` (GET by id)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:422`` (DELETE soft-delete)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:698-849`` (GET/PUT llm-preferences)
+* ``surfsense_backend/app/schemas/search_space.py:14`` (SearchSpaceCreate body)
+* ``surfsense_backend/app/routes/vision_llm_routes.py:60`` (GET global vision configs)
+
+Note the inconsistent pluralisation in the backend: ``/searchspaces``
+(no hyphen) for CRUD, but ``/search-spaces`` (hyphenated) for the
+``llm-preferences`` sub-resource. Both are mirrored verbatim here.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+import httpx
+
+
+@dataclass
+class SearchSpaceRow:
+    """Subset of the SearchSpace row we care about."""
+
+    id: int
+    name: str
+    description: str | None
+    user_id: str
+    citations_enabled: bool
+    qna_custom_instructions: str | None
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> SearchSpaceRow:
+        return cls(
+            id=int(payload["id"]),
+            name=str(payload["name"]),
+            description=payload.get("description"),
+            user_id=str(payload.get("user_id", "")),
+            citations_enabled=bool(payload.get("citations_enabled", True)),
+            qna_custom_instructions=payload.get("qna_custom_instructions"),
+        )
+
+
+@dataclass
+class VisionLlmConfigEntry:
+    """Subset of one ``GET /global-vision-llm-configs`` row.
+
+    The backend returns negative ids for global / OpenRouter-derived
+    vision configs and positive ids for per-user BYOK rows. Either is
+    accepted by ``set_llm_preferences(vision_llm_config_id=...)``.
+    """
+
+    id: int
+    name: str
+    provider: str
+    model_name: str
+    is_auto_mode: bool
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> VisionLlmConfigEntry:
+        return cls(
+            id=int(payload.get("id", 0)),
+            name=str(payload.get("name", "")),
+            provider=str(payload.get("provider", "")).upper(),
+            model_name=str(payload.get("model_name", "")),
+            is_auto_mode=bool(payload.get("is_auto_mode", False)),
+            raw=payload,
+        )
+
+
+@dataclass
+class LlmPreferences:
+    """Resolved LLM preferences with the embedded full config row.
+
+    Mirrors ``LLMPreferencesRead`` from the backend so the lifecycle
+    command can introspect ``provider`` / ``model_name`` to validate the
+    OpenRouter pin.
+    """
+
+    agent_llm_id: int | None
+    document_summary_llm_id: int | None
+    image_generation_config_id: int | None
+    vision_llm_config_id: int | None
+    agent_llm: dict[str, Any] | None
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences:
+        return cls(
+            agent_llm_id=payload.get("agent_llm_id"),
+            document_summary_llm_id=payload.get("document_summary_llm_id"),
+            image_generation_config_id=payload.get("image_generation_config_id"),
+            vision_llm_config_id=payload.get("vision_llm_config_id"),
+            agent_llm=payload.get("agent_llm"),
+            raw=payload,
+        )
+
+
+class SearchSpaceClient:
+    """Thin wrapper around the SearchSpace + LLM preferences endpoints."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    async def create(self, name: str, *, description: str | None = None) -> SearchSpaceRow:
+        body: dict[str, Any] = {"name": name}
+        if description is not None:
+            body["description"] = description
+        # citations_enabled defaults to True backend-side; keep that default.
+        response = await self._http.post(
+            f"{self._base}/api/v1/searchspaces",
+            json=body,
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return SearchSpaceRow.from_payload(response.json())
+
+    async def get(self, search_space_id: int) -> SearchSpaceRow:
+        response = await self._http.get(
+            f"{self._base}/api/v1/searchspaces/{search_space_id}",
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return SearchSpaceRow.from_payload(response.json())
+
+    async def delete(self, search_space_id: int) -> None:
+        """Soft-delete: backend prefixes name with ``[DELETING]`` and dispatches a Celery cascade."""
+
+        response = await self._http.delete(
+            f"{self._base}/api/v1/searchspaces/{search_space_id}",
+            headers={"Accept": "application/json"},
+        )
+        # 404 means it's already gone — treat as success (idempotent teardown).
+        if response.status_code == 404:
+            return
+        response.raise_for_status()
+
+    async def get_llm_preferences(self, search_space_id: int) -> LlmPreferences:
+        response = await self._http.get(
+            f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences",
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return LlmPreferences.from_payload(response.json())
+
+    async def set_llm_preferences(
+        self,
+        search_space_id: int,
+        *,
+        agent_llm_id: int | None = None,
+        document_summary_llm_id: int | None = None,
+        image_generation_config_id: int | None = None,
+        vision_llm_config_id: int | None = None,
+    ) -> LlmPreferences:
+        """PUT a partial update to ``/search-spaces/{id}/llm-preferences``.
+
+        Backend uses ``model_dump(exclude_unset=True)`` so omitted fields
+        are left unchanged.
+        """
+
+        body: dict[str, Any] = {}
+        if agent_llm_id is not None:
+            body["agent_llm_id"] = agent_llm_id
+        if document_summary_llm_id is not None:
+            body["document_summary_llm_id"] = document_summary_llm_id
+        if image_generation_config_id is not None:
+            body["image_generation_config_id"] = image_generation_config_id
+        if vision_llm_config_id is not None:
+            body["vision_llm_config_id"] = vision_llm_config_id
+        response = await self._http.put(
+            f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences",
+            json=body,
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return LlmPreferences.from_payload(response.json())
+
+    async def list_global_vision_llm_configs(self) -> list[VisionLlmConfigEntry]:
+        """List the registered global vision LLM configs.
+
+        Used by ``setup`` to (a) resolve an explicit ``--vision-llm <slug>``
+        to a config id and (b) auto-pick the strongest registered vision
+        config when the operator doesn't pass one. The ``Auto (Fastest)``
+        entry (``id=0``) is filtered out — accuracy must be reproducible.
+        """
+
+        response = await self._http.get(
+            f"{self._base}/api/v1/global-vision-llm-configs",
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        payload = response.json()
+        if not isinstance(payload, list):
+            raise RuntimeError(
+                f"Unexpected /global-vision-llm-configs payload: {payload!r}"
+            )
+        return [
+            VisionLlmConfigEntry.from_payload(item)
+            for item in payload
+            if not bool(item.get("is_auto_mode", False))
+        ]
diff --git a/surfsense_evals/src/surfsense_evals/core/config.py b/surfsense_evals/src/surfsense_evals/core/config.py
new file mode 100644
index 0000000000..164955914c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/config.py
@@ -0,0 +1,279 @@
+"""Environment + filesystem configuration for the harness.
+
+Two responsibilities:
+
+1. Load env vars (with sensible defaults) into a single immutable ``Config``
+   so that every other module reads it from one place.
+2. Read / write ``data/state.json``. State is keyed by suite name so multiple
+   suites can be set up in parallel and torn down independently.
+
+The pinned ``search_space_id`` lives in ``state.json`` (not env) so re-runs
+are idempotent without forcing the operator to remember an integer.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+# Resolve once at import time. ``find_dotenv`` walks up; an explicit ``.env``
+# at the package root or in CWD wins. Silent-no-op if neither exists.
+load_dotenv()
+
+
+_PROJECT_ROOT = Path(__file__).resolve().parents[3]
+"""Resolves to ``surfsense_evals/`` (the package root, not ``src/``)."""
+
+
+def _project_root() -> Path:
+    """Return the ``surfsense_evals/`` project root.
+
+    Computed from this file's path: ``src/surfsense_evals/core/config.py`` →
+    walk up four levels. Kept as a function so tests can monkeypatch.
+    """
+
+    return _PROJECT_ROOT
+
+
+@dataclass(frozen=True)
+class Config:
+    """Immutable runtime configuration."""
+
+    surfsense_api_base: str
+    openrouter_api_key: str | None
+    openrouter_base_url: str
+
+    # Credentials — exactly ONE mode must be supplied.
+    surfsense_jwt: str | None
+    surfsense_refresh_token: str | None
+    surfsense_user_email: str | None
+    surfsense_user_password: str | None
+
+    # Filesystem paths.
+    data_dir: Path
+    reports_dir: Path
+
+    @property
+    def state_path(self) -> Path:
+        return self.data_dir / "state.json"
+
+    def has_jwt_mode(self) -> bool:
+        return bool(self.surfsense_jwt)
+
+    def has_local_mode(self) -> bool:
+        return bool(self.surfsense_user_email and self.surfsense_user_password)
+
+    def credential_mode(self) -> str:
+        """Return ``"jwt"``, ``"local"``, or ``"none"`` (no credentials supplied)."""
+
+        if self.has_jwt_mode():
+            return "jwt"
+        if self.has_local_mode():
+            return "local"
+        return "none"
+
+    def suite_data_dir(self, suite: str) -> Path:
+        return self.data_dir / suite
+
+    def suite_reports_dir(self, suite: str) -> Path:
+        return self.reports_dir / suite
+
+    def suite_runs_dir(self, suite: str) -> Path:
+        return self.suite_data_dir(suite) / "runs"
+
+    def suite_maps_dir(self, suite: str) -> Path:
+        return self.suite_data_dir(suite) / "maps"
+
+
+def load_config() -> Config:
+    """Read the current process env into a ``Config``.
+
+    No validation is performed here; callers (e.g. ``auth.acquire_token``,
+    ``cli`` subcommands) decide which fields they require. This keeps
+    ``models list`` and ``suites list`` runnable without OpenRouter creds.
+    """
+
+    project_root = _project_root()
+    data_dir = Path(os.environ.get("EVAL_DATA_DIR") or (project_root / "data")).resolve()
+    reports_dir = Path(os.environ.get("EVAL_REPORTS_DIR") or (project_root / "reports")).resolve()
+    return Config(
+        surfsense_api_base=os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/"),
+        openrouter_api_key=os.environ.get("OPENROUTER_API_KEY") or None,
+        openrouter_base_url=os.environ.get(
+            "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
+        ).rstrip("/"),
+        surfsense_jwt=os.environ.get("SURFSENSE_JWT") or None,
+        surfsense_refresh_token=os.environ.get("SURFSENSE_REFRESH_TOKEN") or None,
+        surfsense_user_email=os.environ.get("SURFSENSE_USER_EMAIL") or None,
+        surfsense_user_password=os.environ.get("SURFSENSE_USER_PASSWORD") or None,
+        data_dir=data_dir,
+        reports_dir=reports_dir,
+    )
+
+
+# ---------------------------------------------------------------------------
+# state.json — per-suite slots
+# ---------------------------------------------------------------------------
+
+
+# Scenario names — chosen at ``setup`` time, persisted in ``state.json``.
+#
+# * ``head-to-head`` (default, current behaviour): both arms answer with the
+#   SAME slug pinned via ``--provider-model``. Vision LLM at ingest is
+#   optional but recommended for image-bearing benchmarks.
+# * ``symmetric-cheap``: both arms answer with the SAME (cheap, text-only)
+#   slug; SurfSense pre-extracted images at ingest with a vision LLM.
+#   Measures whether vision-RAG ingestion lets a cheap downstream model
+#   match a vision one. Native arm structurally loses on image questions —
+#   that's the point, and the report labels it accordingly.
+# * ``cost-arbitrage``: native arm answers with an EXPENSIVE vision slug
+#   (``--native-arm-model``), SurfSense answers with a CHEAP text-only slug
+#   (``--provider-model``) over chunks the vision LLM already extracted at
+#   ingest. Measures how close SurfSense gets to native at a fraction of
+#   the per-query cost. The most compelling "shines" framing.
+SCENARIOS: tuple[str, ...] = ("head-to-head", "symmetric-cheap", "cost-arbitrage")
+DEFAULT_SCENARIO: str = "head-to-head"
+
+
+@dataclass
+class SuiteState:
+    """Per-suite persisted state.
+
+    ``provider_model`` is the slug pinned to the SearchSpace's
+    ``agent_llm`` — what answers SurfSense queries (and what the native
+    arm uses too, unless ``native_arm_model`` is set for cost-arbitrage).
+
+    ``vision_provider_model`` is the slug of the OpenRouter vision LLM
+    config attached to the SearchSpace's ``vision_llm_config_id`` — what
+    SurfSense uses to extract image content at ingest time when
+    ``use_vision_llm=True``. ``None`` means no vision config was attached
+    at setup (legacy or text-only suite).
+    """
+
+    search_space_id: int
+    agent_llm_id: int
+    provider_model: str
+    created_at: str
+    ingestion_maps: dict[str, str] = field(default_factory=dict)
+    scenario: str = DEFAULT_SCENARIO
+    vision_llm_config_id: int | None = None
+    vision_provider_model: str | None = None
+    native_arm_model: str | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "search_space_id": self.search_space_id,
+            "agent_llm_id": self.agent_llm_id,
+            "provider_model": self.provider_model,
+            "created_at": self.created_at,
+            "ingestion_maps": dict(self.ingestion_maps),
+            "scenario": self.scenario,
+            "vision_llm_config_id": self.vision_llm_config_id,
+            "vision_provider_model": self.vision_provider_model,
+            "native_arm_model": self.native_arm_model,
+        }
+
+    @classmethod
+    def from_dict(cls, payload: Mapping[str, Any]) -> SuiteState:
+        # ``scenario`` / vision / native fields default for back-compat with
+        # ``state.json`` written before scenarios shipped.
+        scenario = str(payload.get("scenario") or DEFAULT_SCENARIO)
+        if scenario not in SCENARIOS:
+            scenario = DEFAULT_SCENARIO
+        raw_vision_id = payload.get("vision_llm_config_id")
+        return cls(
+            search_space_id=int(payload["search_space_id"]),
+            agent_llm_id=int(payload["agent_llm_id"]),
+            provider_model=str(payload["provider_model"]),
+            created_at=str(payload.get("created_at") or ""),
+            ingestion_maps=dict(payload.get("ingestion_maps") or {}),
+            scenario=scenario,
+            vision_llm_config_id=int(raw_vision_id) if raw_vision_id is not None else None,
+            vision_provider_model=(
+                str(payload["vision_provider_model"])
+                if payload.get("vision_provider_model")
+                else None
+            ),
+            native_arm_model=(
+                str(payload["native_arm_model"])
+                if payload.get("native_arm_model")
+                else None
+            ),
+        )
+
+    @property
+    def effective_native_arm_model(self) -> str:
+        """Slug the native arm should use; falls back to ``provider_model``."""
+
+        return self.native_arm_model or self.provider_model
+
+
+def _load_state(config: Config) -> dict[str, Any]:
+    if not config.state_path.exists():
+        return {"suites": {}}
+    try:
+        with config.state_path.open("r", encoding="utf-8") as fh:
+            data = json.load(fh)
+    except (OSError, json.JSONDecodeError) as exc:
+        raise RuntimeError(
+            f"Failed to read state file {config.state_path}: {exc!s}. "
+            "Delete it if you want to start fresh."
+        ) from exc
+    if not isinstance(data, dict) or "suites" not in data:
+        return {"suites": {}}
+    return data
+
+
+def _write_state(config: Config, payload: Mapping[str, Any]) -> None:
+    config.data_dir.mkdir(parents=True, exist_ok=True)
+    tmp = config.state_path.with_suffix(".json.tmp")
+    with tmp.open("w", encoding="utf-8") as fh:
+        json.dump(dict(payload), fh, indent=2, sort_keys=True)
+        fh.write("\n")
+    tmp.replace(config.state_path)
+
+
+def get_suite_state(config: Config, suite: str) -> SuiteState | None:
+    """Return ``SuiteState`` for ``suite`` or ``None`` if not set up."""
+
+    state = _load_state(config)
+    raw = (state.get("suites") or {}).get(suite)
+    if not raw:
+        return None
+    return SuiteState.from_dict(raw)
+
+
+def set_suite_state(config: Config, suite: str, suite_state: SuiteState) -> None:
+    """Persist ``suite_state`` under the suite slot. Other suites are untouched."""
+
+    state = _load_state(config)
+    suites = dict(state.get("suites") or {})
+    suites[suite] = suite_state.to_dict()
+    state["suites"] = suites
+    _write_state(config, state)
+
+
+def clear_suite_state(config: Config, suite: str) -> bool:
+    """Remove the slot for ``suite``. Returns ``True`` if removal happened."""
+
+    state = _load_state(config)
+    suites = dict(state.get("suites") or {})
+    if suite not in suites:
+        return False
+    del suites[suite]
+    state["suites"] = suites
+    _write_state(config, state)
+    return True
+
+
+def utc_iso_timestamp() -> str:
+    """Filesystem-safe UTC ISO timestamp, e.g. ``2026-05-11T20-30-00Z``."""
+
+    return datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
diff --git a/surfsense_evals/src/surfsense_evals/core/ingest_settings.py b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
new file mode 100644
index 0000000000..5cdece5778
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
@@ -0,0 +1,311 @@
+"""Per-upload ingestion settings shared across every benchmark.
+
+The SurfSense ``POST /api/v1/documents/fileupload`` endpoint exposes
+exactly three knobs (verified at
+``surfsense_backend/app/routes/documents_routes.py`` and
+``surfsense_backend/app/etl_pipeline/etl_document.py``):
+
+* ``processing_mode``     — ``"basic"`` (default) | ``"premium"``
+* ``use_vision_llm``      — ``bool`` (run vision LLM during ingest to
+                            extract image content / captions / tables)
+* ``should_summarize``    — ``bool`` (generate document summary)
+
+This module gives every benchmark a uniform way to:
+
+1. Receive sensible per-benchmark defaults (text-only benchmarks
+   default vision off; image-bearing benchmarks default vision on).
+2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``,
+   ``--processing-mode {basic,premium}``,
+   ``--should-summarize`` / ``--no-summarize``).
+3. Persist the *actual* settings used into the doc-map manifest and
+   the run artifact so reports can show "vision=ON, mode=premium →
+   65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%".
+
+A/B testing on the same corpus
+------------------------------
+
+SurfSense dedupes uploads by ``(filename, search_space_id)`` — NOT by
+content hash and NOT by ingestion settings. Re-uploading the same
+filename to the same SearchSpace with a different ``use_vision_llm``
+flag will hit the duplicate branch and *not* re-process. To compare
+two settings combos head-to-head on the same corpus you must give
+each combo its own SearchSpace, which today means:
+
+    teardown --suite <s>
+    setup    --suite <s> ...
+    ingest   <s> <bench>  --no-vision-llm   # baseline run
+    run      <s> <bench>
+    teardown --suite <s>
+    setup    --suite <s> ...
+    ingest   <s> <bench>  --use-vision-llm  # vision arm
+    run      <s> <bench>
+
+The runs land in different timestamped subdirectories under
+``data/<suite>/runs/`` and ``report --suite <s>`` aggregates whichever
+manifest is currently latest per benchmark.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections.abc import Mapping
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Keep the constant list of valid processing modes here so benchmarks
+# don't have to re-import from the backend (they don't have access to
+# the backend package anyway).
+PROCESSING_MODES: tuple[str, ...] = ("basic", "premium")
+
+
+@dataclass(frozen=True)
+class IngestSettings:
+    """Resolved per-upload knobs handed to ``DocumentsClient.upload``.
+
+    Use ``IngestSettings(...)`` directly to define benchmark defaults,
+    or ``IngestSettings.merge(defaults, opts)`` to apply CLI overrides
+    on top of those defaults.
+    """
+
+    use_vision_llm: bool = False
+    processing_mode: str = "basic"
+    should_summarize: bool = False
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "use_vision_llm": self.use_vision_llm,
+            "processing_mode": self.processing_mode,
+            "should_summarize": self.should_summarize,
+        }
+
+    @classmethod
+    def merge(cls, defaults: IngestSettings, opts: Mapping[str, Any]) -> IngestSettings:
+        """Apply CLI overrides on top of ``defaults``.
+
+        ``opts`` is the kwargs dict built by ``core.cli`` from the
+        argparse namespace (see ``_cmd_ingest`` / ``_cmd_run``). Keys
+        we look for: ``use_vision_llm`` (bool or None), ``processing_mode``
+        (str or None), ``should_summarize`` (bool or None). Anything
+        else is ignored so benchmarks can pass through their own opts.
+        """
+
+        return cls(
+            use_vision_llm=_coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm),
+            processing_mode=_coerce_mode(opts.get("processing_mode"), defaults.processing_mode),
+            should_summarize=_coerce_bool(opts.get("should_summarize"), defaults.should_summarize),
+        )
+
+    def render_label(self) -> str:
+        """Human-readable single-line label for reports / log lines."""
+
+        return (
+            f"vision={'on' if self.use_vision_llm else 'off'}, "
+            f"mode={self.processing_mode}, "
+            f"summarize={'on' if self.should_summarize else 'off'}"
+        )
+
+
+def _coerce_bool(value: Any, default: bool) -> bool:
+    """Argparse with ``BooleanOptionalAction`` yields True/False/None.
+
+    ``None`` means the operator didn't pass the flag → fall back to
+    the benchmark default.
+    """
+
+    if value is None:
+        return default
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        return value.strip().lower() in {"1", "true", "yes", "on"}
+    return bool(value)
+
+
+def _coerce_mode(value: Any, default: str) -> str:
+    if value is None or value == "":
+        return default
+    val = str(value).strip().lower()
+    if val not in PROCESSING_MODES:
+        raise ValueError(
+            f"Invalid processing_mode {val!r}; must be one of {PROCESSING_MODES}"
+        )
+    return val
+
+
+# ---------------------------------------------------------------------------
+# Argparse helper
+# ---------------------------------------------------------------------------
+
+
+def _add_bool_pair(
+    parser: argparse.ArgumentParser,
+    *,
+    dest: str,
+    on_flag: str,
+    off_flag: str,
+    on_help: str,
+    off_help: str,
+) -> None:
+    """Add a mutually exclusive ``--foo`` / ``--no-foo`` pair.
+
+    We don't use ``argparse.BooleanOptionalAction`` because it would
+    auto-generate ``--no-use-vision-llm`` rather than the friendlier
+    ``--no-vision-llm`` that operators reach for. Default is ``None``
+    so ``IngestSettings.merge`` can distinguish "silent" from
+    "explicit false".
+    """
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        on_flag,
+        dest=dest,
+        action="store_true",
+        default=None,
+        help=on_help,
+    )
+    group.add_argument(
+        off_flag,
+        dest=dest,
+        action="store_false",
+        default=None,
+        help=off_help,
+    )
+
+
+def add_ingest_settings_args(
+    parser: argparse.ArgumentParser,
+    *,
+    defaults: IngestSettings,
+) -> None:
+    """Attach the three ingest-settings flag pairs to ``parser``.
+
+    Each bool exposes a mutually exclusive ``--foo`` / ``--no-foo``
+    pair so an operator can flip either direction without restating
+    every flag. Default is ``None`` so that "operator didn't pass the
+    flag" is distinguishable from "operator explicitly passed false"
+    — ``IngestSettings.merge`` then folds in the benchmark default
+    only when the operator was silent.
+    """
+
+    settings_group = parser.add_argument_group(
+        "ingest settings",
+        f"Per-upload knobs (forwarded to /documents/fileupload). "
+        f"Defaults for this benchmark: {defaults.render_label()}.",
+    )
+    _add_bool_pair(
+        settings_group,
+        dest="use_vision_llm",
+        on_flag="--use-vision-llm",
+        off_flag="--no-vision-llm",
+        on_help=(
+            "Run vision LLM during ingest to extract image content "
+            f"(default for this benchmark: "
+            f"{'on' if defaults.use_vision_llm else 'off'})."
+        ),
+        off_help="Skip vision LLM during ingest (text-only ETL).",
+    )
+    settings_group.add_argument(
+        "--processing-mode",
+        dest="processing_mode",
+        choices=PROCESSING_MODES,
+        default=None,
+        help=(
+            "SurfSense ETL processing mode (premium uses a 10x page "
+            f"multiplier and typically routes to a stronger ETL). "
+            f"Default for this benchmark: {defaults.processing_mode!r}."
+        ),
+    )
+    _add_bool_pair(
+        settings_group,
+        dest="should_summarize",
+        on_flag="--should-summarize",
+        off_flag="--no-summarize",
+        on_help=(
+            "Have SurfSense generate a document summary at ingest "
+            f"(default for this benchmark: "
+            f"{'on' if defaults.should_summarize else 'off'})."
+        ),
+        off_help="Skip per-document summary generation.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Doc-map manifest helpers
+# ---------------------------------------------------------------------------
+#
+# Every benchmark writes a doc-map JSONL under ``data/<suite>/maps/`` that
+# pairs source identifiers (case_id, snippet_id, doc_path, …) to the
+# SurfSense document_ids returned by the upload. To make the report
+# self-describing we also write a header line:
+#
+#     {"__settings__": {"use_vision_llm": ..., "processing_mode": ..., ...}}
+#
+# These two helpers centralise that protocol so each benchmark only has to
+# call ``write_settings_header`` and ``read_settings_header``.
+
+SETTINGS_HEADER_KEY = "__settings__"
+
+
+def settings_header_line(settings: IngestSettings) -> str:
+    """Return the JSON-serialised header line (no trailing newline)."""
+
+    return json.dumps({SETTINGS_HEADER_KEY: settings.to_dict()})
+
+
+def is_settings_header(row: Mapping[str, Any]) -> bool:
+    return SETTINGS_HEADER_KEY in row
+
+
+def read_settings_header(map_path: Path) -> dict[str, Any]:
+    """Read the ``__settings__`` header out of a doc-map JSONL.
+
+    Returns ``{}`` on a missing file, an empty file, an unreadable
+    file, or a file whose first non-blank line is not a settings
+    header (e.g. a corpus ingested before this feature existed).
+    Callers use this purely to surface settings in the report; it
+    must never fail the run.
+    """
+
+    if not map_path.exists():
+        return {}
+    try:
+        with map_path.open("r", encoding="utf-8") as fh:
+            for line in fh:
+                line = line.strip()
+                if not line:
+                    continue
+                row = json.loads(line)
+                if isinstance(row, dict) and SETTINGS_HEADER_KEY in row:
+                    return dict(row[SETTINGS_HEADER_KEY])
+                return {}
+    except (OSError, json.JSONDecodeError):
+        return {}
+    return {}
+
+
+def format_ingest_settings_md(settings: Any) -> str:
+    """Render the resolved settings as a single Markdown bullet line."""
+
+    if not isinstance(settings, Mapping) or not settings:
+        return "- SurfSense ingest settings: (not recorded — re-ingest to capture)"
+    vision = "on" if settings.get("use_vision_llm") else "off"
+    mode = settings.get("processing_mode") or "basic"
+    summarize = "on" if settings.get("should_summarize") else "off"
+    return (
+        f"- SurfSense ingest settings: vision_llm=`{vision}`, "
+        f"processing_mode=`{mode}`, summarize=`{summarize}`"
+    )
+
+
+__all__ = [
+    "PROCESSING_MODES",
+    "SETTINGS_HEADER_KEY",
+    "IngestSettings",
+    "add_ingest_settings_args",
+    "format_ingest_settings_md",
+    "is_settings_header",
+    "read_settings_header",
+    "settings_header_line",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py b/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py
new file mode 100644
index 0000000000..bd0e6aafb1
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py
@@ -0,0 +1,50 @@
+"""Pure-function metric primitives. Lazy imports."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .comparison import McnemarResult, bootstrap_delta_ci, mcnemar_test, paired_aggregate
+    from .mc_accuracy import AccuracyResult, accuracy_with_wilson_ci, wilson_ci
+    from .retrieval import RetrievalScores, mrr, ndcg_at_k, recall_at_k, score_run
+
+__all__ = [
+    "AccuracyResult",
+    "McnemarResult",
+    "RetrievalScores",
+    "accuracy_with_wilson_ci",
+    "bootstrap_delta_ci",
+    "mcnemar_test",
+    "mrr",
+    "ndcg_at_k",
+    "paired_aggregate",
+    "recall_at_k",
+    "score_run",
+    "wilson_ci",
+]
+
+
+_MODULE_FOR = {
+    "AccuracyResult": "mc_accuracy",
+    "accuracy_with_wilson_ci": "mc_accuracy",
+    "wilson_ci": "mc_accuracy",
+    "RetrievalScores": "retrieval",
+    "mrr": "retrieval",
+    "ndcg_at_k": "retrieval",
+    "recall_at_k": "retrieval",
+    "score_run": "retrieval",
+    "McnemarResult": "comparison",
+    "bootstrap_delta_ci": "comparison",
+    "mcnemar_test": "comparison",
+    "paired_aggregate": "comparison",
+}
+
+
+def __getattr__(name: str):
+    if name in _MODULE_FOR:
+        from importlib import import_module
+
+        mod = import_module(f".{_MODULE_FOR[name]}", __name__)
+        return getattr(mod, name)
+    raise AttributeError(f"module 'surfsense_evals.core.metrics' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py b/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py
new file mode 100644
index 0000000000..579576f4ff
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py
@@ -0,0 +1,258 @@
+"""Paired comparison statistics for head-to-head benchmarks.
+
+In every head-to-head benchmark (currently MedXpertQA-MM and
+MMLongBench-Doc) each question is answered by both arms (Native PDF
+and SurfSense). That makes per-question outcomes paired, so
+``McNemar's test`` on the discordant pairs is the right significance
+test for "are the two arms different?". We also expose a bootstrap
+delta CI for visualising effect size.
+
+Aggregate cost / latency / token deltas are mean-based; the runner
+slices them by arm before passing them in.
+"""
+
+from __future__ import annotations
+
+import math
+import statistics
+from collections.abc import Sequence
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class McnemarResult:
+    """Discordant pair counts + the test statistics."""
+
+    n_total: int
+    b: int  # native correct, surfsense wrong
+    c: int  # native wrong,   surfsense correct
+    statistic: float
+    p_value: float
+    method: str
+
+    def to_dict(self) -> dict[str, float | int | str]:
+        return {
+            "n_total": self.n_total,
+            "b_native_correct_only": self.b,
+            "c_surfsense_correct_only": self.c,
+            "statistic": self.statistic,
+            "p_value": self.p_value,
+            "method": self.method,
+        }
+
+
+def mcnemar_test(
+    arm_a_correct: Sequence[bool],
+    arm_b_correct: Sequence[bool],
+    *,
+    use_exact_below: int = 11,
+) -> McnemarResult:
+    """Paired McNemar's test on per-question correctness.
+
+    ``arm_a_correct`` is treated as the reference arm (typically the
+    "native" arm); ``arm_b_correct`` is the challenger (typically
+    "surfsense"). The test statistic only depends on discordant pairs.
+
+    Default switch-over (``b + c < 11``): for very small discordant
+    samples the exact binomial test is preferred; above that the
+    continuity-corrected chi-square is well-behaved (Edwards 1948).
+    Callers can raise ``use_exact_below`` if they prefer the more
+    conservative ``b + c < 25`` rule.
+
+    No external statistical package is required: scipy is a heavy dep
+    and we only need binomial CDFs / chi-square sf, both implementable
+    in stdlib + numpy without surprises.
+    """
+
+    if len(arm_a_correct) != len(arm_b_correct):
+        raise ValueError(
+            f"Length mismatch: arm_a={len(arm_a_correct)}, arm_b={len(arm_b_correct)}"
+        )
+    n = len(arm_a_correct)
+    b = sum(1 for a, c in zip(arm_a_correct, arm_b_correct) if a and not c)
+    c = sum(1 for a, cc in zip(arm_a_correct, arm_b_correct) if (not a) and cc)
+    discordant = b + c
+    if discordant == 0:
+        return McnemarResult(
+            n_total=n, b=b, c=c, statistic=0.0, p_value=1.0, method="degenerate"
+        )
+
+    if discordant < use_exact_below:
+        # Exact binomial: under H0 each discordant pair is a Bernoulli(0.5).
+        # p-value = 2 * P(X <= min(b,c) | n=discordant, p=0.5), capped at 1.
+        k = min(b, c)
+        cdf = sum(_binom_pmf(discordant, i) for i in range(k + 1))
+        p_value = min(1.0, 2.0 * cdf)
+        return McnemarResult(
+            n_total=n, b=b, c=c, statistic=float(k), p_value=p_value, method="exact"
+        )
+
+    # Chi-square with continuity correction (McNemar-Edwards).
+    chi = ((abs(b - c) - 1) ** 2) / discordant
+    p_value = _chi2_sf(chi, df=1)
+    return McnemarResult(
+        n_total=n, b=b, c=c, statistic=chi, p_value=p_value, method="chi2_cc"
+    )
+
+
+def _binom_pmf(n: int, k: int) -> float:
+    return math.comb(n, k) * (0.5 ** n)
+
+
+def _chi2_sf(x: float, *, df: int) -> float:
+    """Survival function (1 - CDF) of chi-square; df=1 closed form."""
+
+    if x <= 0:
+        return 1.0
+    if df == 1:
+        # Chi^2(1) = N(0,1)^2; sf(x) = 2 * Phi_complement(sqrt(x))
+        return math.erfc(math.sqrt(x / 2.0))
+    # General fallback via regularized upper incomplete gamma.
+    a = df / 2.0
+    z = x / 2.0
+    return _gammaincc(a, z)
+
+
+def _gammaincc(a: float, x: float, *, max_iter: int = 200, tol: float = 1e-12) -> float:
+    """Regularised upper incomplete gamma Q(a, x). Series + continued fraction."""
+
+    if x < 0 or a <= 0:
+        return float("nan")
+    if x == 0:
+        return 1.0
+    if x < a + 1.0:
+        # Series for P(a, x); subtract from 1.
+        p_series = _gammainc_series(a, x, max_iter=max_iter, tol=tol)
+        return 1.0 - p_series
+    return _gammaincc_cf(a, x, max_iter=max_iter, tol=tol)
+
+
+def _gammainc_series(a: float, x: float, *, max_iter: int, tol: float) -> float:
+    term = 1.0 / a
+    summation = term
+    for n in range(1, max_iter):
+        term *= x / (a + n)
+        summation += term
+        if abs(term) < abs(summation) * tol:
+            break
+    log_pre = -x + a * math.log(x) - math.lgamma(a)
+    return summation * math.exp(log_pre)
+
+
+def _gammaincc_cf(a: float, x: float, *, max_iter: int, tol: float) -> float:
+    b = x + 1.0 - a
+    c_val = 1.0 / 1e-300
+    d = 1.0 / b
+    h = d
+    for i in range(1, max_iter):
+        an = -i * (i - a)
+        b += 2.0
+        d = an * d + b
+        if abs(d) < 1e-300:
+            d = 1e-300
+        c_val = b + an / c_val
+        if abs(c_val) < 1e-300:
+            c_val = 1e-300
+        d = 1.0 / d
+        delta = d * c_val
+        h *= delta
+        if abs(delta - 1.0) < tol:
+            break
+    log_pre = -x + a * math.log(x) - math.lgamma(a)
+    return h * math.exp(log_pre)
+
+
+# ---------------------------------------------------------------------------
+# Bootstrap delta CI
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class BootstrapDelta:
+    delta: float
+    ci_low: float
+    ci_high: float
+    n_resamples: int
+
+    def to_dict(self) -> dict[str, float | int]:
+        return {
+            "delta": self.delta,
+            "ci_low": self.ci_low,
+            "ci_high": self.ci_high,
+            "n_resamples": self.n_resamples,
+        }
+
+
+def bootstrap_delta_ci(
+    arm_a_correct: Sequence[bool],
+    arm_b_correct: Sequence[bool],
+    *,
+    n_resamples: int = 5000,
+    level: float = 0.95,
+    random_state: int | None = 0,
+) -> BootstrapDelta:
+    """Paired-sample bootstrap CI for ``mean(arm_b) - mean(arm_a)``.
+
+    Resamples *paired indices* with replacement so the dependency
+    between arms is preserved.
+    """
+
+    if len(arm_a_correct) != len(arm_b_correct):
+        raise ValueError("paired arms must have the same length")
+    n = len(arm_a_correct)
+    if n == 0:
+        return BootstrapDelta(0.0, 0.0, 0.0, 0)
+    a = np.asarray(arm_a_correct, dtype=np.int8)
+    b = np.asarray(arm_b_correct, dtype=np.int8)
+    delta = float(b.mean() - a.mean())
+
+    rng = np.random.default_rng(random_state)
+    deltas = np.empty(n_resamples, dtype=np.float64)
+    for i in range(n_resamples):
+        idx = rng.integers(0, n, size=n)
+        deltas[i] = b[idx].mean() - a[idx].mean()
+    alpha = (1.0 - level) / 2.0
+    ci_low, ci_high = float(np.quantile(deltas, alpha)), float(np.quantile(deltas, 1 - alpha))
+    return BootstrapDelta(delta=delta, ci_low=ci_low, ci_high=ci_high, n_resamples=n_resamples)
+
+
+# ---------------------------------------------------------------------------
+# Simple aggregate helpers (cost / latency / tokens)
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class Aggregate:
+    mean: float
+    median: float
+    p95: float
+    n: int
+
+    def to_dict(self) -> dict[str, float | int]:
+        return {"mean": self.mean, "median": self.median, "p95": self.p95, "n": self.n}
+
+
+def paired_aggregate(values: Sequence[float]) -> Aggregate:
+    """Mean / median / p95 of a list of numbers (e.g. cost-per-question)."""
+
+    if not values:
+        return Aggregate(0.0, 0.0, 0.0, 0)
+    arr = np.asarray(values, dtype=np.float64)
+    return Aggregate(
+        mean=float(arr.mean()),
+        median=float(statistics.median(values)),
+        p95=float(np.quantile(arr, 0.95)),
+        n=len(values),
+    )
+
+
+__all__ = [
+    "Aggregate",
+    "BootstrapDelta",
+    "McnemarResult",
+    "bootstrap_delta_ci",
+    "mcnemar_test",
+    "paired_aggregate",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py b/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py
new file mode 100644
index 0000000000..8b0188ca4b
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py
@@ -0,0 +1,130 @@
+"""Multiple-choice accuracy + Wilson 95% confidence intervals.
+
+Wilson CI is preferred over normal-approximation because MIRAGE's
+per-task subsets can be small (PubMedQA* and BioASQ-Y/N have a few
+hundred questions each) and Wilson handles n→0 / p→{0,1} edges
+gracefully.
+
+Reference for the closed form: Wilson (1927); identical to the
+``statsmodels.stats.proportion.proportion_confint(method='wilson')``
+output and what scikit-learn implements internally for its bounded
+estimators.
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class AccuracyResult:
+    """Per-task accuracy with Wilson CI."""
+
+    n_correct: int
+    n_total: int
+    accuracy: float
+    ci_low: float
+    ci_high: float
+
+    def to_dict(self) -> dict[str, float | int]:
+        return {
+            "n_correct": self.n_correct,
+            "n_total": self.n_total,
+            "accuracy": self.accuracy,
+            "ci_low": self.ci_low,
+            "ci_high": self.ci_high,
+        }
+
+
+# Two-sided Wilson z values. 1.959964 ≈ z_{0.975}.
+_Z_FOR_LEVEL: dict[float, float] = {
+    0.90: 1.6448536269514722,
+    0.95: 1.959963984540054,
+    0.99: 2.5758293035489004,
+}
+
+
+def wilson_ci(
+    n_correct: int, n_total: int, *, level: float = 0.95
+) -> tuple[float, float]:
+    """Two-sided Wilson score confidence interval for a proportion.
+
+    Returns ``(low, high)``. ``n_total == 0`` returns ``(0.0, 1.0)`` —
+    the maximally uncertain interval.
+    """
+
+    if n_total <= 0:
+        return 0.0, 1.0
+    if level not in _Z_FOR_LEVEL:
+        raise ValueError(f"Unsupported confidence level {level!r}")
+    z = _Z_FOR_LEVEL[level]
+    p = n_correct / n_total
+    n = n_total
+    denom = 1.0 + (z * z) / n
+    centre = (p + (z * z) / (2 * n)) / denom
+    half = (z / denom) * math.sqrt((p * (1 - p) / n) + (z * z) / (4 * n * n))
+    low = max(0.0, centre - half)
+    high = min(1.0, centre + half)
+    return low, high
+
+
+def accuracy_with_wilson_ci(
+    n_correct: int, n_total: int, *, level: float = 0.95
+) -> AccuracyResult:
+    if n_total < 0:
+        raise ValueError(f"n_total must be >= 0, got {n_total}")
+    if n_correct < 0 or n_correct > n_total:
+        raise ValueError(
+            f"n_correct must be in [0, n_total]; got n_correct={n_correct}, n_total={n_total}"
+        )
+    accuracy = (n_correct / n_total) if n_total > 0 else 0.0
+    low, high = wilson_ci(n_correct, n_total, level=level)
+    return AccuracyResult(
+        n_correct=n_correct,
+        n_total=n_total,
+        accuracy=accuracy,
+        ci_low=low,
+        ci_high=high,
+    )
+
+
+def per_task_accuracy(
+    rows: Sequence[Mapping[str, object]],
+    *,
+    task_key: str = "task",
+    correct_key: str = "is_correct",
+    level: float = 0.95,
+) -> dict[str, AccuracyResult]:
+    """Group ``rows`` by ``task_key`` and compute per-task ``AccuracyResult``.
+
+    ``rows[i][correct_key]`` must be truthy iff the answer was correct.
+    """
+
+    counts: dict[str, list[int]] = {}
+    for row in rows:
+        task = str(row.get(task_key, ""))
+        bucket = counts.setdefault(task, [0, 0])
+        bucket[1] += 1
+        if row.get(correct_key):
+            bucket[0] += 1
+    return {
+        task: accuracy_with_wilson_ci(c[0], c[1], level=level)
+        for task, c in counts.items()
+    }
+
+
+def macro_accuracy(per_task: Mapping[str, AccuracyResult]) -> float:
+    if not per_task:
+        return 0.0
+    return sum(r.accuracy for r in per_task.values()) / len(per_task)
+
+
+__all__ = [
+    "AccuracyResult",
+    "accuracy_with_wilson_ci",
+    "macro_accuracy",
+    "per_task_accuracy",
+    "wilson_ci",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py b/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py
new file mode 100644
index 0000000000..d4cfe10ae2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py
@@ -0,0 +1,132 @@
+"""Retrieval metrics: Recall@k, MRR, nDCG@k.
+
+Used by CUREv1's runner to score the SurfSense arm against the
+benchmark's qrels. ``corpus_id`` is the canonical CUREv1 passage id
+(string); the runner maps SurfSense ``chunk_id`` → ``document_id`` →
+``corpus_id`` before calling these.
+
+Graded relevance (CUREv1 uses 0/1/2 grades) is honoured by ``ndcg_at_k``;
+``recall_at_k`` and ``mrr`` flatten anything > 0 to "relevant".
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class RetrievalScores:
+    """Aggregated retrieval scores."""
+
+    recall_at_k: dict[int, float]
+    mrr: float
+    ndcg_at_10: float
+    n_queries: int
+
+    def to_dict(self) -> dict:
+        return {
+            "recall_at_k": dict(self.recall_at_k),
+            "mrr": self.mrr,
+            "ndcg_at_10": self.ndcg_at_10,
+            "n_queries": self.n_queries,
+        }
+
+
+def recall_at_k(retrieved: Sequence[str], relevant: Iterable[str], k: int) -> float:
+    """Fraction of ``relevant`` documents found in ``retrieved[:k]``."""
+
+    if not relevant:
+        return 0.0
+    relevant_set = set(relevant)
+    if not relevant_set:
+        return 0.0
+    top_k = list(retrieved)[:k]
+    hits = sum(1 for doc in top_k if doc in relevant_set)
+    return hits / len(relevant_set)
+
+
+def mrr(retrieved: Sequence[str], relevant: Iterable[str]) -> float:
+    """Reciprocal rank of the first relevant doc, 0 if none found."""
+
+    relevant_set = set(relevant)
+    for rank, doc in enumerate(retrieved, start=1):
+        if doc in relevant_set:
+            return 1.0 / rank
+    return 0.0
+
+
+def _dcg_at_k(grades: Sequence[float], k: int) -> float:
+    s = 0.0
+    for i, grade in enumerate(grades[:k], start=1):
+        # Standard log-base-2 discount; gain = 2^grade - 1 for graded relevance.
+        s += (2.0 ** grade - 1.0) / math.log2(i + 1)
+    return s
+
+
+def ndcg_at_k(
+    retrieved: Sequence[str],
+    qrels: Mapping[str, float],
+    k: int,
+) -> float:
+    """nDCG@k against graded ``qrels`` (``{doc_id: grade}``).
+
+    Unjudged documents in ``retrieved`` contribute zero gain. The
+    ideal ordering is ``qrels`` sorted by grade descending.
+    """
+
+    if not qrels:
+        return 0.0
+    grades = [float(qrels.get(doc, 0.0)) for doc in retrieved]
+    dcg = _dcg_at_k(grades, k)
+    ideal = sorted(qrels.values(), reverse=True)
+    idcg = _dcg_at_k([float(g) for g in ideal], k)
+    if idcg == 0.0:
+        return 0.0
+    return dcg / idcg
+
+
+def score_run(
+    *,
+    per_query_retrieved: Mapping[str, Sequence[str]],
+    per_query_qrels: Mapping[str, Mapping[str, float]],
+    ks: Sequence[int] = (1, 5, 10, 32),
+    ndcg_k: int = 10,
+) -> RetrievalScores:
+    """Aggregate Recall@k, MRR, nDCG@k across a run.
+
+    ``per_query_retrieved`` maps ``query_id -> ordered list of doc ids``.
+    ``per_query_qrels`` maps ``query_id -> {doc_id: grade}`` (grade > 0
+    is relevant).
+
+    Queries present in retrieved but not in qrels are skipped. Queries
+    in qrels but missing from retrieved contribute zeros.
+    """
+
+    qids = set(per_query_qrels.keys()) & set(per_query_retrieved.keys())
+    if not qids:
+        return RetrievalScores(recall_at_k={k: 0.0 for k in ks}, mrr=0.0, ndcg_at_10=0.0, n_queries=0)
+
+    recall_totals = {k: 0.0 for k in ks}
+    mrr_total = 0.0
+    ndcg_total = 0.0
+    for qid in qids:
+        retrieved = list(per_query_retrieved[qid])
+        qrels = per_query_qrels[qid]
+        relevant_docs = [d for d, g in qrels.items() if g > 0]
+        for k in ks:
+            recall_totals[k] += recall_at_k(retrieved, relevant_docs, k)
+        mrr_total += mrr(retrieved, relevant_docs)
+        ndcg_total += ndcg_at_k(retrieved, qrels, ndcg_k)
+
+    n = len(qids)
+    return RetrievalScores(
+        recall_at_k={k: v / n for k, v in recall_totals.items()},
+        mrr=mrr_total / n,
+        ndcg_at_10=ndcg_total / n,
+        n_queries=n,
+    )
+
+
+__all__ = ["RetrievalScores", "mrr", "ndcg_at_k", "recall_at_k", "score_run"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/__init__.py b/surfsense_evals/src/surfsense_evals/core/parse/__init__.py
new file mode 100644
index 0000000000..208c2d374a
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/__init__.py
@@ -0,0 +1,21 @@
+"""Parsers shared across suites: citations, MCQ envelopes, AI-SDK SSE."""
+
+from __future__ import annotations
+
+from .answer_letter import AnswerLetterResult, extract_answer_letter
+from .citations import CITATION_REGEX, CitationToken, ChunkCitation, UrlCitation, parse_citations
+from .freeform_answer import extract_freeform_answer
+from .sse import SseEvent, iter_sse_events
+
+__all__ = [
+    "CITATION_REGEX",
+    "CitationToken",
+    "ChunkCitation",
+    "UrlCitation",
+    "parse_citations",
+    "AnswerLetterResult",
+    "extract_answer_letter",
+    "extract_freeform_answer",
+    "SseEvent",
+    "iter_sse_events",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py b/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py
new file mode 100644
index 0000000000..8cf23869bb
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py
@@ -0,0 +1,122 @@
+"""Robust extractor for MCQ answer letters.
+
+Handles three answer shapes seen in the wild:
+
+1. **MedRAG envelope** — ``{"step_by_step_thinking": "...", "answer_choice": "A"}``
+   embedded somewhere in the assistant message (often inside ```` ```json ```` /
+   ``` ``` ``` fences). The regex grabs the JSON object and reads the
+   ``answer_choice`` field.
+
+2. **Final-line letter** — e.g. ``Answer: B`` or ``The correct answer is (C).``.
+   Falls back to a permissive regex over the last few lines.
+
+3. **Bare letter** — single uppercase letter at the end of the message.
+
+The function returns the parsed letter (uppercased) plus a discriminator
+of which strategy fired so the runner / report can flag suspicious
+parses (typically zero-confidence parses indicate the model didn't
+follow the prompt).
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from typing import Literal
+
+ParserStrategy = Literal["json_envelope", "answer_line", "bare_letter", "none"]
+
+
+@dataclass(frozen=True)
+class AnswerLetterResult:
+    letter: str | None
+    strategy: ParserStrategy
+
+    @property
+    def found(self) -> bool:
+        return self.letter is not None
+
+
+# ---------------------------------------------------------------------------
+# Strategies
+# ---------------------------------------------------------------------------
+
+
+_JSON_BLOCK = re.compile(r"\{[^{}]*\"answer_choice\"\s*:\s*\"([A-Za-z])\"[^{}]*\}", re.DOTALL)
+_FENCED_JSON = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL | re.IGNORECASE)
+_ANSWER_LINE = re.compile(
+    r"(?:final\s*answer|answer\s*choice|the\s+correct\s+answer\s+is|answer)\s*[:=\-]?\s*"
+    r"\(?\s*([A-Za-z])\s*[\)\.]*\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+_BARE_LETTER = re.compile(r"^\s*\(?\s*([A-Za-z])\s*[\)\.]*\s*$", re.MULTILINE)
+
+
+def _from_json_envelope(text: str) -> str | None:
+    # Try fenced code blocks first (most likely to contain the JSON).
+    for fence in _FENCED_JSON.finditer(text):
+        try:
+            obj = json.loads(fence.group(1))
+        except (json.JSONDecodeError, ValueError):
+            continue
+        if isinstance(obj, dict):
+            choice = obj.get("answer_choice")
+            if isinstance(choice, str) and choice.strip():
+                return choice.strip()[:1].upper()
+
+    # Fall back to a tolerant regex over the whole text (handles
+    # responses that drop the fences).
+    match = _JSON_BLOCK.search(text)
+    if match:
+        return match.group(1).upper()
+    return None
+
+
+def _from_answer_line(text: str) -> str | None:
+    # Walk lines bottom-up; the answer is almost always near the end.
+    for match in reversed(list(_ANSWER_LINE.finditer(text))):
+        letter = match.group(1).upper()
+        if letter.isalpha():
+            return letter
+    return None
+
+
+def _from_bare_letter(text: str) -> str | None:
+    # Inspect only the final non-empty lines (avoid grabbing in-prose
+    # mentions of "A" or "I").
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    for ln in reversed(lines[-3:]):
+        match = _BARE_LETTER.match(ln)
+        if match:
+            return match.group(1).upper()
+    return None
+
+
+def extract_answer_letter(text: str) -> AnswerLetterResult:
+    """Run strategies in order and return the first hit.
+
+    Order: JSON envelope → final-answer-line regex → bare-letter
+    fallback. Empty / whitespace-only text returns
+    ``AnswerLetterResult(None, "none")``.
+    """
+
+    if not text or not text.strip():
+        return AnswerLetterResult(None, "none")
+
+    letter = _from_json_envelope(text)
+    if letter:
+        return AnswerLetterResult(letter, "json_envelope")
+
+    letter = _from_answer_line(text)
+    if letter:
+        return AnswerLetterResult(letter, "answer_line")
+
+    letter = _from_bare_letter(text)
+    if letter:
+        return AnswerLetterResult(letter, "bare_letter")
+
+    return AnswerLetterResult(None, "none")
+
+
+__all__ = ["AnswerLetterResult", "ParserStrategy", "extract_answer_letter"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/citations.py b/surfsense_evals/src/surfsense_evals/core/parse/citations.py
new file mode 100644
index 0000000000..1fcd354346
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/citations.py
@@ -0,0 +1,110 @@
+"""Python port of the canonical citation parser.
+
+Source of truth: ``surfsense_web/lib/citations/citation-parser.ts:20-21``.
+The pattern is byte-for-byte identical to the TS export ``CITATION_REGEX``
+so a SurfSense user reading the web client and a CUREv1 retrieval scorer
+running here see the same chunk_ids extracted from the same answer.
+
+The TS reference also handles a ``urlcite{N}`` placeholder produced by
+``preprocessCitationMarkdown`` — that pre-processing step is web-only
+(GFM autolink workaround), so the harness sees raw ``[citation:URL]``
+tokens and ``parse_citations`` returns them as ``UrlCitation`` directly.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Any, Union
+
+# Pattern preserves the TS source verbatim:
+#   /[\[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g
+#
+# Notes:
+# * Matches both ASCII ``[]`` and Chinese fullwidth ``【】`` brackets.
+# * Allows an optional ZWSP (``\u200B``) just inside each bracket.
+# * ``citation:`` then EITHER a URL (anything not ``]``, ``】``, or ZWSP),
+#   OR a ``urlcite\d+`` placeholder, OR one or more comma-separated
+#   chunk ids (each optionally prefixed with ``doc-`` and optionally
+#   negative).
+# * URL char class deliberately excludes the closing brackets so a
+#   ``[citation:https://x.com]`` doesn't swallow the ``]``.
+# The ZWSP must be the actual code-point — the original TS source uses
+# the regex literal ``\u200B`` which the JS engine interprets as the
+# character. Python's ``re`` doesn't process the ``\u`` escape inside
+# the pattern source, so we splice the literal character in via an
+# f-string. This keeps our pattern functionally identical to the TS
+# reference and lets ``"\u200B" in CITATION_REGEX.pattern`` succeed.
+_ZWSP = "\u200B"
+CITATION_REGEX = re.compile(
+    rf"[\[【]{_ZWSP}?citation:\s*("
+    rf"https?://[^\]】{_ZWSP}]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*"
+    rf")\s*{_ZWSP}?[\]】]"
+)
+
+
+@dataclass(frozen=True)
+class ChunkCitation:
+    chunk_id: int
+    is_docs_chunk: bool
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "kind": "chunk",
+            "chunk_id": self.chunk_id,
+            "is_docs_chunk": self.is_docs_chunk,
+        }
+
+
+@dataclass(frozen=True)
+class UrlCitation:
+    url: str
+
+    def to_dict(self) -> dict[str, Any]:
+        return {"kind": "url", "url": self.url}
+
+
+CitationToken = Union[ChunkCitation, UrlCitation]
+
+
+def parse_citations(text: str, *, url_map: dict[str, str] | None = None) -> list[CitationToken]:
+    """Return the citation tokens found in ``text`` in document order.
+
+    ``url_map`` is the optional ``urlciteN -> URL`` lookup that the web
+    client builds in its preprocessing step. The harness ordinarily
+    doesn't preprocess (we don't render the markdown, we score it), so
+    the default empty map means ``urlciteN`` placeholders are dropped
+    rather than mis-resolved to a missing URL.
+
+    Multi-id payloads like ``[citation:1, doc-2, -3]`` are flattened
+    into separate ``ChunkCitation`` entries — same as the TS reference.
+    """
+
+    out: list[CitationToken] = []
+    for match in CITATION_REGEX.finditer(text):
+        captured = match.group(1)
+        if captured.startswith("http://") or captured.startswith("https://"):
+            out.append(UrlCitation(url=captured.strip()))
+            continue
+        if captured.startswith("urlcite"):
+            if url_map and captured in url_map:
+                out.append(UrlCitation(url=url_map[captured]))
+            continue
+        for raw_id in (s.strip() for s in captured.split(",")):
+            is_docs_chunk = raw_id.startswith("doc-")
+            number_part = raw_id[4:] if is_docs_chunk else raw_id
+            try:
+                chunk_id = int(number_part)
+            except ValueError:
+                continue
+            out.append(ChunkCitation(chunk_id=chunk_id, is_docs_chunk=is_docs_chunk))
+    return out
+
+
+__all__ = [
+    "CITATION_REGEX",
+    "ChunkCitation",
+    "UrlCitation",
+    "CitationToken",
+    "parse_citations",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py b/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py
new file mode 100644
index 0000000000..959b045a52
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py
@@ -0,0 +1,85 @@
+"""Extract free-form answers from open-ended LLM responses.
+
+Used by benchmarks that don't have a fixed letter set (MMLongBench-Doc,
+DocVQA-style benchmarks, future legal/finance suites). The contract:
+
+* Strip leading "Answer:" / "Final answer:" markers if present.
+* Drop fenced code blocks if the model wrapped its answer in one.
+* Trim leading/trailing whitespace.
+* Return the *last* meaningful chunk — models often think out loud
+  before stating the answer.
+
+If the message is empty or only contains a fence, return ``""``.
+"""
+
+from __future__ import annotations
+
+import re
+
+_ANSWER_PREFIX = re.compile(
+    r"^\s*(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
+    re.IGNORECASE,
+)
+# Marker-only regex (no capture group) used to find every "Answer:"
+# token position. We then slice from the LAST marker's end to the
+# next newline ourselves — robust to multiple inline answers because
+# we never let the engine greedy-capture across markers.
+_ANSWER_MARKER = re.compile(
+    r"(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
+    re.IGNORECASE,
+)
+_FENCED_BLOCK = re.compile(r"```[a-zA-Z0-9]*\s*([\s\S]*?)\s*```")
+
+
+def extract_freeform_answer(text: str) -> str:
+    """Pull the model's final answer out of a possibly-verbose response."""
+
+    if not text or not text.strip():
+        return ""
+
+    # 1. Find the last line that starts with an Answer: marker. If
+    #    nothing matches, walk back to the last non-empty line.
+    lines = [ln.rstrip() for ln in text.strip().splitlines()]
+    candidate = ""
+    for ln in reversed(lines):
+        if not ln.strip():
+            continue
+        if _ANSWER_PREFIX.search(ln):
+            candidate = _ANSWER_PREFIX.sub("", ln, count=1).strip()
+            break
+
+    if not candidate:
+        # 2. Inline match: find every "Answer:" marker position and
+        # slice from the LAST marker's end to the next newline. Robust
+        # to "preamble.Answer: 42" one-liners and multiple inline
+        # markers (we always pick the final, freshest one).
+        marker_matches = list(_ANSWER_MARKER.finditer(text))
+        if marker_matches:
+            last = marker_matches[-1]
+            tail = text[last.end():]
+            nl = tail.find("\n")
+            if nl >= 0:
+                tail = tail[:nl]
+            candidate = tail.strip()
+
+    if not candidate:
+        # 3. No "Answer:" marker — try fenced blocks.
+        fences = _FENCED_BLOCK.findall(text)
+        if fences:
+            candidate = fences[-1].strip()
+        else:
+            # Last non-empty line as a fallback.
+            for ln in reversed(lines):
+                if ln.strip():
+                    candidate = ln.strip()
+                    break
+
+    # 2. Strip wrapping quotes / parens / trailing punctuation that
+    #    confuse the grader without changing meaning.
+    candidate = candidate.strip().strip("`").strip()
+    if candidate.startswith(("\"", "'")) and candidate.endswith(("\"", "'")):
+        candidate = candidate[1:-1].strip()
+    return candidate
+
+
+__all__ = ["extract_freeform_answer"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/sse.py b/surfsense_evals/src/surfsense_evals/core/parse/sse.py
new file mode 100644
index 0000000000..76ded2d138
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/sse.py
@@ -0,0 +1,72 @@
+"""Minimal SSE consumer compatible with SurfSense's wire format.
+
+SurfSense uses ``app/services/streaming/envelope/sse.py`` to frame events:
+
+* ``data: <single-line-string>\\n\\n``
+* ``data: <json-string>\\n\\n``  (most events)
+* ``data: [DONE]\\n\\n``  (terminator)
+
+There is no ``event:``, ``id:``, or ``retry:`` framing in production —
+``format_sse(payload)`` only emits the ``data:`` line. This implementation
+is therefore intentionally smaller than ``httpx-sse`` (which we still
+list as a dep so callers who want richer parsing can opt in): one event
+per ``data:`` line, separated by blank lines.
+
+We accept any line iterator (an ``httpx.Response.aiter_lines`` adapter
+in production, a list in tests) so this is unit-testable without a
+network mock.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class SseEvent:
+    """A parsed SSE event. Only the ``data`` field is populated.
+
+    Multi-line payloads (``data: a\\ndata: b``) are joined with ``\\n``
+    per the SSE spec, even though SurfSense doesn't currently emit them.
+    """
+
+    data: str
+
+
+async def iter_sse_events(lines: AsyncIterator[str]) -> AsyncIterator[SseEvent]:
+    """Yield one ``SseEvent`` per blank-line-terminated frame.
+
+    Lines that are empty or whitespace flush the buffer. ``data:`` lines
+    are accumulated into the buffer; everything else is ignored
+    (matches the lenient browser EventSource behaviour).
+    """
+
+    buffer: list[str] = []
+    async for raw in lines:
+        if raw is None:
+            continue
+        line = raw.rstrip("\r")
+        if line == "":
+            if buffer:
+                yield SseEvent(data="\n".join(buffer))
+                buffer.clear()
+            continue
+        if line.startswith(":"):
+            # comment / heartbeat
+            continue
+        if line.startswith("data:"):
+            # spec: optional single space after the colon.
+            payload = line[5:]
+            if payload.startswith(" "):
+                payload = payload[1:]
+            buffer.append(payload)
+            continue
+        # Any other field (event:, id:, retry:) is currently unused.
+        continue
+
+    if buffer:
+        yield SseEvent(data="\n".join(buffer))
+
+
+__all__ = ["SseEvent", "iter_sse_events"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parsers/__init__.py b/surfsense_evals/src/surfsense_evals/core/parsers/__init__.py
new file mode 100644
index 0000000000..6a8e6c4ce5
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/__init__.py
@@ -0,0 +1,35 @@
+"""Direct parser invocations for the parser_compare benchmark.
+
+The SurfSense backend exposes a single ``ETL_SERVICE`` env var that
+picks one parser globally; per-ingestion overrides are not on the
+public API. To drive the four (Azure DI x basic/premium, LlamaCloud x
+basic/premium) extractions we need for ``multimodal_doc/parser_compare``
+we therefore call the Azure DI and LlamaCloud SDKs directly from the
+eval harness, mirroring the production code path in
+``surfsense_backend/app/etl_pipeline/parsers/``.
+
+Two design rules:
+
+* No backend imports — the eval harness cannot pull in the FastAPI
+  app's config layer (it would require the full backend ``.env`` plus a
+  reachable Postgres). We re-read keys from our own environment instead.
+* Same wire shape as the backend's parsers (Azure ``prebuilt-read`` /
+  ``prebuilt-layout`` selected by ``processing_mode``; LlamaCloud
+  ``parse_page_with_llm`` / ``parse_page_with_agent`` selected by
+  ``processing_mode``) so any quality conclusions transfer back to
+  production behaviour.
+"""
+
+from __future__ import annotations
+
+from .azure_di import AzureDIError, parse_with_azure_di
+from .llamacloud import LlamaCloudError, parse_with_llamacloud
+from .pdf_pages import count_pdf_pages
+
+__all__ = [
+    "AzureDIError",
+    "LlamaCloudError",
+    "count_pdf_pages",
+    "parse_with_azure_di",
+    "parse_with_llamacloud",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/parsers/azure_di.py b/surfsense_evals/src/surfsense_evals/core/parsers/azure_di.py
new file mode 100644
index 0000000000..eebad906aa
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/azure_di.py
@@ -0,0 +1,144 @@
+"""Azure Document Intelligence parser — eval-side mirror of the backend.
+
+Calls ``DocumentIntelligenceClient.begin_analyze_document`` with one
+of two ``model_id`` slugs depending on ``processing_mode``:
+
+* ``basic``   → ``prebuilt-read``   (text OCR only, cheaper, faster)
+* ``premium`` → ``prebuilt-layout`` (text + tables + structure;
+                                     produces real markdown headings,
+                                     pipe-tables, etc.)
+
+These are the same model selections the production
+``surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py``
+makes per ``processing_mode``. Output format is forced to Markdown
+(``DocumentContentFormat.MARKDOWN``) so the long-context arm can stuff
+it into a prompt verbatim.
+
+Retry policy is intentionally light here (the eval harness re-runs
+the whole batch on top-level failure); we do one synchronous attempt
+plus exponential backoff on transient transport errors.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import random
+
+logger = logging.getLogger(__name__)
+
+
+_AZURE_MODEL_BY_MODE = {
+    "basic": "prebuilt-read",
+    "premium": "prebuilt-layout",
+}
+
+_MAX_RETRIES = 4
+_BASE_DELAY = 5.0
+_MAX_DELAY = 60.0
+
+
+class AzureDIError(RuntimeError):
+    """Raised when Azure DI fails after all retries."""
+
+
+async def parse_with_azure_di(
+    file_path: str | os.PathLike,
+    *,
+    processing_mode: str = "basic",
+    endpoint: str | None = None,
+    api_key: str | None = None,
+) -> str:
+    """Run Azure DI on ``file_path`` and return the markdown content.
+
+    ``endpoint`` / ``api_key`` default to ``AZURE_DI_ENDPOINT`` and
+    ``AZURE_DI_KEY`` env vars (set in ``surfsense_evals/.env``).
+
+    Raises ``AzureDIError`` after exhausting retries; ``ValueError`` if
+    credentials are missing.
+    """
+
+    endpoint = endpoint or os.environ.get("AZURE_DI_ENDPOINT")
+    api_key = api_key or os.environ.get("AZURE_DI_KEY")
+    if not endpoint or not api_key:
+        raise ValueError(
+            "AZURE_DI_ENDPOINT and AZURE_DI_KEY must be set "
+            "(see surfsense_evals/.env)."
+        )
+
+    model_id = _AZURE_MODEL_BY_MODE.get(processing_mode, "prebuilt-read")
+
+    # Lazy imports — surfsense_evals shouldn't pay the azure-sdk
+    # import cost on every CLI invocation that doesn't touch
+    # parser_compare.
+    from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
+    from azure.ai.documentintelligence.models import DocumentContentFormat
+    from azure.core.credentials import AzureKeyCredential
+    from azure.core.exceptions import (
+        ClientAuthenticationError,
+        HttpResponseError,
+        ServiceRequestError,
+        ServiceResponseError,
+    )
+
+    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
+    logger.info(
+        "Azure DI parsing %s (mode=%s, model=%s, size=%.1fMB)",
+        file_path, processing_mode, model_id, file_size_mb,
+    )
+
+    last_exc: Exception | None = None
+    for attempt in range(1, _MAX_RETRIES + 1):
+        try:
+            client = DocumentIntelligenceClient(
+                endpoint=endpoint,
+                credential=AzureKeyCredential(api_key),
+            )
+            async with client:
+                with open(file_path, "rb") as fh:
+                    poller = await client.begin_analyze_document(
+                        model_id,
+                        body=fh,
+                        output_content_format=DocumentContentFormat.MARKDOWN,
+                    )
+                result = await poller.result()
+            content = (result.content or "").strip()
+            if not content:
+                raise AzureDIError(
+                    f"Azure DI returned empty content for {file_path}"
+                )
+            logger.info(
+                "Azure DI OK: %s (%s) -> %d chars",
+                file_path, model_id, len(content),
+            )
+            return content
+
+        except ClientAuthenticationError:
+            raise
+        except HttpResponseError as exc:
+            # 4xx that's not auth: don't retry, the request itself is broken.
+            if exc.status_code and 400 <= exc.status_code < 500:
+                raise AzureDIError(
+                    f"Azure DI {exc.status_code} on {file_path}: {exc}"
+                ) from exc
+            last_exc = exc
+        except (ServiceRequestError, ServiceResponseError) as exc:
+            last_exc = exc
+
+        if attempt < _MAX_RETRIES:
+            delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
+            jitter = delay * 0.25 * (2 * random.random() - 1)
+            sleep_for = delay + jitter
+            logger.warning(
+                "Azure DI attempt %d/%d failed (%s); retrying in %.1fs",
+                attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
+            )
+            await asyncio.sleep(sleep_for)
+
+    raise AzureDIError(
+        f"Azure DI failed after {_MAX_RETRIES} attempts on {file_path}"
+    ) from last_exc
+
+
+__all__ = ["AzureDIError", "parse_with_azure_di"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parsers/llamacloud.py b/surfsense_evals/src/surfsense_evals/core/parsers/llamacloud.py
new file mode 100644
index 0000000000..ba3d787efe
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/llamacloud.py
@@ -0,0 +1,168 @@
+"""LlamaParse (LlamaCloud) parser — eval-side mirror of the backend.
+
+Calls ``LlamaParse.aparse`` with one of two ``parse_mode`` slugs
+depending on ``processing_mode``:
+
+* ``basic``   → ``parse_page_with_llm``   (cheap, single-LLM-call/page)
+* ``premium`` → ``parse_page_with_agent`` (multi-step agent per page;
+                                            handles tables / figures
+                                            substantially better)
+
+These are the exact mappings from production
+``surfsense_backend/app/etl_pipeline/parsers/llamacloud.py``. We keep
+``num_workers=1`` and language=``"en"`` to match production.
+
+The result is materialised via ``get_markdown_documents(split_by_page=False)``
+which concatenates every page into a single markdown string, exactly
+the shape we need for long-context stuffing.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import random
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+_LLAMA_PARSE_MODE_MAP = {
+    "basic": "parse_page_with_llm",
+    "premium": "parse_page_with_agent",
+}
+
+_MAX_RETRIES = 3
+_BASE_DELAY = 10.0
+_MAX_DELAY = 90.0
+
+
+class LlamaCloudError(RuntimeError):
+    """Raised when LlamaCloud parse fails after all retries."""
+
+
+def _extract_markdown(result) -> str:
+    """Pull markdown out of whatever object LlamaParse.aparse returns.
+
+    Mirrors backend's tolerant extraction: the SDK has gone through
+    several response shapes; we accept all of them so a minor SDK bump
+    doesn't silently zero the eval.
+    """
+
+    if hasattr(result, "get_markdown_documents"):
+        docs = result.get_markdown_documents(split_by_page=False)
+        if docs and hasattr(docs[0], "text"):
+            return docs[0].text
+        if hasattr(result, "pages") and result.pages:
+            return "\n\n".join(p.md for p in result.pages if hasattr(p, "md") and p.md)
+
+    if isinstance(result, list):
+        if result and hasattr(result[0], "text"):
+            return result[0].text
+        return "\n\n".join(
+            doc.page_content if hasattr(doc, "page_content") else str(doc)
+            for doc in result
+        )
+
+    return str(result)
+
+
+async def parse_with_llamacloud(
+    file_path: str | os.PathLike,
+    *,
+    processing_mode: str = "basic",
+    estimated_pages: int = 50,
+    api_key: str | None = None,
+) -> str:
+    """Run LlamaParse on ``file_path`` and return the markdown content.
+
+    ``api_key`` defaults to the ``LLAMA_CLOUD_API_KEY`` env var (set
+    in ``surfsense_evals/.env``).
+
+    Raises ``LlamaCloudError`` after exhausting retries; ``ValueError``
+    if the API key is missing.
+    """
+
+    api_key = api_key or os.environ.get("LLAMA_CLOUD_API_KEY")
+    if not api_key:
+        raise ValueError(
+            "LLAMA_CLOUD_API_KEY must be set (see surfsense_evals/.env)."
+        )
+
+    parse_mode = _LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
+
+    # Lazy import: llama-cloud pulls llama-index-core (~50 MB) on first
+    # touch; defer until the parser actually runs.
+    from llama_cloud_services import LlamaParse
+    from llama_cloud_services.parse.base import JobFailedException
+    from llama_cloud_services.parse.utils import ResultType
+
+    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
+    # Match backend's per-page timeout heuristic so big PDFs don't drop
+    # mid-job: 60s baseline + 30s/page (premium agent runs longer than
+    # basic; both fit comfortably here).
+    job_timeout = max(180.0, 60.0 + 30.0 * estimated_pages)
+    upload_timeout = max(120.0, 30.0 * file_size_mb)
+
+    logger.info(
+        "LlamaCloud parsing %s (mode=%s, parse_mode=%s, %.1fMB, "
+        "job_timeout=%.0fs)",
+        file_path, processing_mode, parse_mode, file_size_mb, job_timeout,
+    )
+
+    custom_timeout = httpx.Timeout(
+        connect=120.0, read=upload_timeout, write=upload_timeout, pool=120.0,
+    )
+
+    last_exc: Exception | None = None
+    for attempt in range(1, _MAX_RETRIES + 1):
+        try:
+            async with httpx.AsyncClient(timeout=custom_timeout) as client:
+                parser = LlamaParse(
+                    api_key=api_key,
+                    num_workers=1,
+                    verbose=False,
+                    language="en",
+                    result_type=ResultType.MD,
+                    parse_mode=parse_mode,
+                    ignore_errors=False,
+                    max_timeout=int(max(2000.0, job_timeout + upload_timeout)),
+                    job_timeout_in_seconds=job_timeout,
+                    job_timeout_extra_time_per_page_in_seconds=60,
+                    custom_client=client,
+                )
+                result = await parser.aparse(str(file_path))
+            content = _extract_markdown(result).strip()
+            if not content:
+                raise LlamaCloudError(
+                    f"LlamaCloud returned empty content for {file_path}"
+                )
+            logger.info(
+                "LlamaCloud OK: %s (%s) -> %d chars",
+                file_path, parse_mode, len(content),
+            )
+            return content
+
+        except (
+            httpx.HTTPError,
+            JobFailedException,
+            RuntimeError,
+        ) as exc:
+            last_exc = exc
+            if attempt < _MAX_RETRIES:
+                delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
+                jitter = delay * 0.25 * (2 * random.random() - 1)
+                sleep_for = delay + jitter
+                logger.warning(
+                    "LlamaCloud attempt %d/%d failed (%s); retrying in %.1fs",
+                    attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
+                )
+                await asyncio.sleep(sleep_for)
+
+    raise LlamaCloudError(
+        f"LlamaCloud failed after {_MAX_RETRIES} attempts on {file_path}"
+    ) from last_exc
+
+
+__all__ = ["LlamaCloudError", "parse_with_llamacloud"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parsers/pdf_pages.py b/surfsense_evals/src/surfsense_evals/core/parsers/pdf_pages.py
new file mode 100644
index 0000000000..e3691cb734
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/pdf_pages.py
@@ -0,0 +1,35 @@
+"""Tiny pypdf wrapper for "how many pages does this PDF have?".
+
+Used by ``parser_compare`` to:
+
+* Decide LlamaCloud's per-page job timeout.
+* Compute the SurfSense preprocessing dollar cost
+  (``$1 / 1k pages`` for basic, ``$10 / 1k pages`` for premium) so the
+  report can show "ingest + LLM" total cost per arm.
+
+Returns ``0`` (and logs) on parse failure rather than raising — costs
+shown as ``?`` are always better than a benchmark that crashes mid-run.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def count_pdf_pages(path: Path) -> int:
+    """Return the page count for ``path``; ``0`` if pypdf can't open it."""
+
+    try:
+        from pypdf import PdfReader
+
+        reader = PdfReader(str(path))
+        return len(reader.pages)
+    except Exception as exc:  # noqa: BLE001
+        logger.warning("Failed to count pages for %s: %s", path, exc)
+        return 0
+
+
+__all__ = ["count_pdf_pages"]
diff --git a/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py b/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py
new file mode 100644
index 0000000000..e03fa34c93
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py
@@ -0,0 +1,31 @@
+"""Domain-agnostic PDF rendering helper. Lazy import."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .render import (
+        PdfImage,
+        render_pdf,
+        render_pdf_with_images,
+        render_text_files_to_pdf,
+    )
+
+__all__ = [
+    "PdfImage",
+    "render_pdf",
+    "render_pdf_with_images",
+    "render_text_files_to_pdf",
+]
+
+
+_LAZY = {"PdfImage", "render_pdf", "render_pdf_with_images", "render_text_files_to_pdf"}
+
+
+def __getattr__(name: str):
+    if name in _LAZY:
+        from . import render as _mod
+
+        return getattr(_mod, name)
+    raise AttributeError(f"module 'surfsense_evals.core.pdf' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/pdf/render.py b/surfsense_evals/src/surfsense_evals/core/pdf/render.py
new file mode 100644
index 0000000000..624136d7c4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/pdf/render.py
@@ -0,0 +1,351 @@
+"""Deterministic ``.txt`` / ``.md`` → single PDF via reportlab.
+
+Used wherever a benchmark needs the same source bytes fed to both the
+native-PDF arm and the SurfSense ingestion arm. The head-to-head
+comparison is fair only if the *same* PDF is the input to both arms,
+which is why we go to lengths to make the rendering deterministic.
+
+Determinism notes:
+
+* We pin the PDF metadata to a fixed creation date and producer
+  (``reportlab`` accepts neither directly, but ``Canvas.setAuthor`` and
+  the absence of an ``info`` mutator means the bytes only differ by
+  ``CreationDate`` / ``ModDate``). We post-process the PDF to scrub
+  those if ``deterministic=True`` is passed.
+* Page size, font, margins, and tab handling are fixed in code so the
+  same input yields the same byte output across machines.
+* PDF/A is overkill for our use; basic PDF 1.4 is what every model
+  expects.
+"""
+
+from __future__ import annotations
+
+import io
+import re
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+
+from reportlab.lib.pagesizes import LETTER
+from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
+from reportlab.lib.units import inch
+from reportlab.lib.utils import ImageReader
+from reportlab.platypus import (
+    Image,
+    KeepTogether,
+    PageBreak,
+    Paragraph,
+    SimpleDocTemplate,
+    Spacer,
+)
+
+
+@dataclass
+class RenderedPdf:
+    path: Path
+    n_pages_estimate: int
+    n_chars: int
+
+
+_PDF_DATE_KEY = re.compile(rb"/(?:CreationDate|ModDate)\s*\(D:[^)]*\)")
+# reportlab also writes a `/ID [<hex1><hex2>]` trailer entry that
+# embeds a per-run hash. Scrub it so two renders of the same input
+# produce the same bytes.
+_PDF_ID_ARRAY = re.compile(rb"/ID\s*\[\s*<[^>]*>\s*<[^>]*>\s*\]")
+
+
+def _scrub_dates(pdf_bytes: bytes) -> bytes:
+    """Remove ``CreationDate`` / ``ModDate`` / trailer ``/ID`` so the
+    file is byte-deterministic across runs."""
+
+    pdf_bytes = _PDF_DATE_KEY.sub(b"/CreationDate (D:19700101000000Z)", pdf_bytes)
+    pdf_bytes = _PDF_ID_ARRAY.sub(b"/ID [<00><00>]", pdf_bytes)
+    return pdf_bytes
+
+
+_DEFAULT_STYLES = getSampleStyleSheet()
+
+
+def _build_body_style() -> ParagraphStyle:
+    base = _DEFAULT_STYLES["BodyText"]
+    style = ParagraphStyle(
+        "EvalBody",
+        parent=base,
+        fontName="Helvetica",
+        fontSize=10.5,
+        leading=14,
+        spaceAfter=6,
+        spaceBefore=0,
+    )
+    return style
+
+
+def _build_heading_style() -> ParagraphStyle:
+    base = _DEFAULT_STYLES["Heading2"]
+    style = ParagraphStyle(
+        "EvalHeading",
+        parent=base,
+        fontName="Helvetica-Bold",
+        fontSize=14,
+        leading=18,
+        spaceAfter=10,
+        spaceBefore=8,
+    )
+    return style
+
+
+def _normalise_paragraphs(text: str) -> list[str]:
+    """Split a text blob into paragraphs while preserving blank-line structure."""
+
+    blocks: list[list[str]] = [[]]
+    for line in text.splitlines():
+        stripped = line.rstrip()
+        if stripped == "":
+            if blocks[-1]:
+                blocks.append([])
+            continue
+        blocks[-1].append(stripped)
+    paragraphs: list[str] = []
+    for block in blocks:
+        if not block:
+            continue
+        # Join lines within a paragraph with spaces (text-from-PDF style).
+        paragraphs.append(" ".join(block))
+    return paragraphs
+
+
+def _escape_html(text: str) -> str:
+    return (
+        text.replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+    )
+
+
+def render_pdf(
+    *,
+    title: str,
+    sections: Sequence[tuple[str | None, str]],
+    output_path: Path,
+    deterministic: bool = True,
+) -> RenderedPdf:
+    """Render one PDF from a list of ``(section_heading, section_text)`` tuples.
+
+    ``section_heading`` may be ``None`` for an unnamed section. Each
+    section is followed by a page break so the model's PDF parser sees
+    a clean structural boundary between source files.
+    """
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    buffer = io.BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=LETTER,
+        leftMargin=0.75 * inch,
+        rightMargin=0.75 * inch,
+        topMargin=0.75 * inch,
+        bottomMargin=0.75 * inch,
+        title=title,
+        author="surfsense-evals",
+        subject="Eval input",
+        creator="surfsense-evals",
+    )
+
+    body_style = _build_body_style()
+    heading_style = _build_heading_style()
+    title_style = ParagraphStyle(
+        "EvalTitle",
+        parent=_DEFAULT_STYLES["Title"],
+        fontName="Helvetica-Bold",
+        fontSize=18,
+        leading=22,
+        spaceAfter=14,
+    )
+
+    flow: list = [Paragraph(_escape_html(title), title_style)]
+    total_chars = 0
+    for index, (heading, text) in enumerate(sections):
+        if index > 0:
+            flow.append(PageBreak())
+        if heading:
+            flow.append(Paragraph(_escape_html(heading), heading_style))
+        for paragraph in _normalise_paragraphs(text):
+            total_chars += len(paragraph)
+            flow.append(Paragraph(_escape_html(paragraph), body_style))
+            flow.append(Spacer(1, 4))
+
+    doc.build(flow)
+    pdf_bytes = buffer.getvalue()
+    if deterministic:
+        pdf_bytes = _scrub_dates(pdf_bytes)
+    output_path.write_bytes(pdf_bytes)
+
+    # Conservative page estimate: ~3000 chars per LETTER page at 10.5pt.
+    n_pages = max(1, total_chars // 3000 + len(sections))
+    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
+
+
+@dataclass
+class PdfImage:
+    """One image to embed inside a section.
+
+    ``caption`` is rendered below the image (italic). ``max_width_in``
+    caps the rendered width in inches; height auto-scales to preserve
+    aspect ratio (read with PIL).
+    """
+
+    path: Path
+    caption: str = ""
+    max_width_in: float = 5.5  # default leaves margin for LETTER 8.5"
+
+
+def _make_image_flowable(image: PdfImage) -> Image:
+    """Build a reportlab Image flowable scaled to fit page width."""
+
+    reader = ImageReader(str(image.path))
+    iw, ih = reader.getSize()
+    if iw <= 0 or ih <= 0:
+        raise ValueError(f"Invalid image dimensions for {image.path}: {iw}x{ih}")
+    target_w = image.max_width_in * inch
+    target_h = target_w * (ih / iw)
+    # Cap height too — some medical images are extreme portrait.
+    max_h = 7.0 * inch
+    if target_h > max_h:
+        target_h = max_h
+        target_w = target_h * (iw / ih)
+    return Image(str(image.path), width=target_w, height=target_h)
+
+
+def render_pdf_with_images(
+    *,
+    title: str,
+    sections: Sequence[tuple[str | None, str, Sequence[PdfImage] | None]],
+    output_path: Path,
+    deterministic: bool = True,
+    page_break_between_sections: bool = False,
+) -> RenderedPdf:
+    """Render a PDF that mixes text and embedded images.
+
+    Each section is ``(heading, body_text, images)``. Images render
+    inline after the body text, each followed by an italic caption.
+    Set ``page_break_between_sections=True`` if you want explicit
+    structural boundaries (mostly useful for multi-case PDFs); the
+    default keeps everything on one page when possible (so a single
+    MedXpertQA case is one PDF page with case + images + options).
+    """
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    buffer = io.BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=LETTER,
+        leftMargin=0.75 * inch,
+        rightMargin=0.75 * inch,
+        topMargin=0.75 * inch,
+        bottomMargin=0.75 * inch,
+        title=title,
+        author="surfsense-evals",
+        subject="Eval input",
+        creator="surfsense-evals",
+    )
+
+    body_style = _build_body_style()
+    heading_style = _build_heading_style()
+    caption_style = ParagraphStyle(
+        "EvalCaption",
+        parent=body_style,
+        fontSize=9,
+        leading=11,
+        textColor="#444",
+        spaceBefore=2,
+        spaceAfter=10,
+    )
+    title_style = ParagraphStyle(
+        "EvalTitle",
+        parent=_DEFAULT_STYLES["Title"],
+        fontName="Helvetica-Bold",
+        fontSize=18,
+        leading=22,
+        spaceAfter=14,
+    )
+
+    flow: list = [Paragraph(_escape_html(title), title_style)]
+    total_chars = 0
+    for index, (heading, text, images) in enumerate(sections):
+        if index > 0 and page_break_between_sections:
+            flow.append(PageBreak())
+        if heading:
+            flow.append(Paragraph(_escape_html(heading), heading_style))
+        for paragraph in _normalise_paragraphs(text):
+            total_chars += len(paragraph)
+            flow.append(Paragraph(_escape_html(paragraph), body_style))
+            flow.append(Spacer(1, 4))
+        for image in images or []:
+            try:
+                img_flow = _make_image_flowable(image)
+            except Exception:  # noqa: BLE001 — bad image shouldn't kill PDF
+                continue
+            grouped = [img_flow]
+            if image.caption:
+                grouped.append(Paragraph(_escape_html(image.caption), caption_style))
+            else:
+                grouped.append(Spacer(1, 8))
+            flow.append(KeepTogether(grouped))
+
+    doc.build(flow)
+    pdf_bytes = buffer.getvalue()
+    if deterministic:
+        pdf_bytes = _scrub_dates(pdf_bytes)
+    output_path.write_bytes(pdf_bytes)
+
+    n_pages = max(1, total_chars // 3000 + len(sections))
+    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
+
+
+def render_text_files_to_pdf(
+    *,
+    title: str,
+    files: Iterable[Path],
+    output_path: Path,
+    deterministic: bool = True,
+) -> RenderedPdf:
+    """Convenience wrapper: read a list of text files, render to one PDF.
+
+    The heading of each section is the file's name (no extension), so
+    e.g. ``admission_note.txt`` becomes a section header ``admission_note``
+    in the rendered PDF. Useful for any text-only benchmark that ships
+    a corpus as separate ``.txt`` / ``.md`` shards per logical document.
+    """
+
+    sections: list[tuple[str | None, str]] = []
+    for path in files:
+        path = Path(path)
+        text = path.read_text(encoding="utf-8")
+        sections.append((path.stem, text))
+    return render_pdf(
+        title=title,
+        sections=sections,
+        output_path=output_path,
+        deterministic=deterministic,
+    )
+
+
+# Tiny self-check — handy when debugging.
+def _self_test() -> None:  # pragma: no cover
+    out = Path("./_render_self_test.pdf")
+    sections = [
+        ("intro", "Hello world.\n\nThis is a test."),
+        ("body", "Line one.\nLine two."),
+    ]
+    rendered = render_pdf(title="Self test", sections=sections, output_path=out)
+    print(f"wrote {rendered.path} ({rendered.n_chars} chars)")
+
+
+# Importing ``datetime`` keeps the timezone helper handy if a future
+# benchmark wants to embed a real timestamp without losing determinism.
+_NOW_FROZEN = datetime(2026, 5, 11, tzinfo=UTC)
diff --git a/surfsense_evals/src/surfsense_evals/core/providers/__init__.py b/surfsense_evals/src/surfsense_evals/core/providers/__init__.py
new file mode 100644
index 0000000000..fa82bcbf26
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/providers/__init__.py
@@ -0,0 +1,22 @@
+"""External LLM providers (used by the native arm).
+
+Lazy imports so the SurfSense-only path doesn't transitively load the
+OpenRouter client until something actually constructs ``OpenRouterPdfProvider``.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .openrouter_pdf import OpenRouterPdfProvider, OpenRouterResponse
+
+__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse"]
+
+
+def __getattr__(name: str):
+    if name in {"OpenRouterPdfProvider", "OpenRouterResponse"}:
+        from . import openrouter_pdf as _mod
+
+        return getattr(_mod, name)
+    raise AttributeError(f"module 'surfsense_evals.core.providers' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py
new file mode 100644
index 0000000000..2494434bec
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py
@@ -0,0 +1,118 @@
+"""Bare OpenRouter ``chat/completions`` provider — no PDF, no plugins.
+
+Used by ``BareLlmArm`` to measure "what does the model answer with
+zero retrieval context?". Same wire shape as ``OpenRouterPdfProvider``
+minus the file-parser plugin and the ``file`` content part:
+
+```json
+{
+  "model": "openai/gpt-5.4-mini",
+  "messages": [
+    {"role": "system", "content": "<optional>"},
+    {"role": "user",   "content": "<prompt>"}
+  ]
+}
+```
+
+The response shape is identical to the PDF provider's, so we re-use
+``_parse_chat_completion`` from ``openrouter_pdf`` and only specialise
+the request builder. That keeps cost-extraction, token-counting, and
+content-array handling in one place.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any
+
+import httpx
+
+from .openrouter_pdf import (
+    OpenRouterResponse,
+    _DEFAULT_HEADERS,
+    _parse_chat_completion,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class OpenRouterChatProvider:
+    """Stateless bare-chat client. No PDF, no file-parser plugin."""
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        model: str,
+        timeout_s: float = 600.0,
+    ) -> None:
+        if not api_key:
+            raise ValueError("OPENROUTER_API_KEY is required for the bare-LLM arm.")
+        self._api_key = api_key
+        self._base = base_url.rstrip("/")
+        self._model = model
+        self._timeout = httpx.Timeout(timeout_s, connect=15.0)
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    def _build_payload(
+        self,
+        *,
+        prompt: str,
+        system_prompt: str | None,
+        max_tokens: int | None,
+    ) -> dict[str, Any]:
+        messages: list[dict[str, Any]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": prompt})
+        body: dict[str, Any] = {"model": self._model, "messages": messages}
+        if max_tokens:
+            body["max_tokens"] = max_tokens
+        return body
+
+    async def complete(
+        self,
+        *,
+        prompt: str,
+        system_prompt: str | None = None,
+        max_tokens: int | None = None,
+        http: httpx.AsyncClient | None = None,
+    ) -> OpenRouterResponse:
+        """Single chat completion. Errors are raised verbatim — caller decides retries."""
+
+        payload = self._build_payload(
+            prompt=prompt,
+            system_prompt=system_prompt,
+            max_tokens=max_tokens,
+        )
+        headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            **_DEFAULT_HEADERS,
+        }
+        url = f"{self._base}/chat/completions"
+        started = time.monotonic()
+        if http is not None:
+            response = await http.post(url, json=payload, headers=headers, timeout=self._timeout)
+        else:
+            async with httpx.AsyncClient(timeout=self._timeout) as client:
+                response = await client.post(
+                    url, json=payload, headers=headers, timeout=self._timeout
+                )
+        latency_ms = int((time.monotonic() - started) * 1000)
+        if response.status_code >= 400:
+            raise httpx.HTTPStatusError(
+                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
+                request=response.request,
+                response=response,
+            )
+        return _parse_chat_completion(response.json(), latency_ms=latency_ms)
+
+
+__all__ = ["OpenRouterChatProvider"]
diff --git a/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py
new file mode 100644
index 0000000000..e98590cbfb
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py
@@ -0,0 +1,231 @@
+"""Native-PDF arm provider: OpenRouter ``chat/completions`` with PDF input.
+
+Per `<https://openrouter.ai/docs/features/multimodal/pdfs>`__ the wire
+shape is OpenAI-compatible with one PDF-specific extra:
+
+```json
+{
+  "model": "anthropic/claude-sonnet-4.5",
+  "messages": [{
+    "role": "user",
+    "content": [
+      {"type": "file", "file": {"filename": "case.pdf",
+        "file_data": "data:application/pdf;base64,<b64>"}},
+      {"type": "text", "text": "<prompt>"}
+    ]
+  }],
+  "plugins": [{"id": "file-parser", "pdf": {"engine": "native"}}]
+}
+```
+
+``engine: "native"`` is the only engine that doesn't pre-OCR the
+PDF — it forwards raw bytes to PDF-native models (Claude, Gemini),
+matching what a human user does when "dropping the PDF into Claude".
+``mistral-ocr`` and ``cloudflare-ai`` are exposed as enum options for
+non-native models.
+
+Headers ``HTTP-Referer`` and ``X-Title`` make spend show up cleanly on
+the OpenRouter dashboard.
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+import time
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+class PdfEngine(str, Enum):
+    NATIVE = "native"
+    MISTRAL_OCR = "mistral-ocr"
+    CLOUDFLARE_AI = "cloudflare-ai"
+
+
+@dataclass
+class OpenRouterResponse:
+    """Subset of the OpenRouter response we care about for scoring."""
+
+    text: str
+    input_tokens: int
+    output_tokens: int
+    total_tokens: int
+    cost_micros: int
+    latency_ms: int
+    finish_reason: str | None
+    raw: dict[str, Any]
+
+
+_DEFAULT_HEADERS = {
+    "HTTP-Referer": "https://github.com/MODSetter/SurfSense",
+    "X-Title": "SurfSense-evals",
+}
+
+
+class OpenRouterPdfProvider:
+    """Thin httpx-based client. Stateless; safe to reuse per arm instance."""
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        model: str,
+        engine: PdfEngine = PdfEngine.NATIVE,
+        timeout_s: float = 600.0,
+    ) -> None:
+        if not api_key:
+            raise ValueError("OPENROUTER_API_KEY is required for the native arm.")
+        self._api_key = api_key
+        self._base = base_url.rstrip("/")
+        self._model = model
+        self._engine = engine
+        self._timeout = httpx.Timeout(timeout_s, connect=15.0)
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    @property
+    def engine(self) -> PdfEngine:
+        return self._engine
+
+    def _build_payload(
+        self,
+        *,
+        prompt: str,
+        pdf_path: Path,
+        max_tokens: int | None,
+        extra_messages: list[dict[str, Any]] | None,
+    ) -> dict[str, Any]:
+        b64 = base64.b64encode(pdf_path.read_bytes()).decode("ascii")
+        user_content: list[dict[str, Any]] = [
+            {
+                "type": "file",
+                "file": {
+                    "filename": pdf_path.name,
+                    "file_data": f"data:application/pdf;base64,{b64}",
+                },
+            },
+            {"type": "text", "text": prompt},
+        ]
+        messages: list[dict[str, Any]] = list(extra_messages or [])
+        messages.append({"role": "user", "content": user_content})
+        body: dict[str, Any] = {
+            "model": self._model,
+            "messages": messages,
+            "plugins": [
+                {"id": "file-parser", "pdf": {"engine": self._engine.value}}
+            ],
+        }
+        if max_tokens:
+            body["max_tokens"] = max_tokens
+        return body
+
+    async def complete(
+        self,
+        *,
+        prompt: str,
+        pdf_path: Path,
+        max_tokens: int | None = None,
+        extra_messages: list[dict[str, Any]] | None = None,
+        http: httpx.AsyncClient | None = None,
+    ) -> OpenRouterResponse:
+        """Single chat completion. Errors are raised verbatim — runner decides retries."""
+
+        payload = self._build_payload(
+            prompt=prompt,
+            pdf_path=pdf_path,
+            max_tokens=max_tokens,
+            extra_messages=extra_messages,
+        )
+        headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            **_DEFAULT_HEADERS,
+        }
+        url = f"{self._base}/chat/completions"
+        started = time.monotonic()
+        if http is not None:
+            response = await http.post(url, json=payload, headers=headers, timeout=self._timeout)
+        else:
+            async with httpx.AsyncClient(timeout=self._timeout) as client:
+                response = await client.post(
+                    url, json=payload, headers=headers, timeout=self._timeout
+                )
+        latency_ms = int((time.monotonic() - started) * 1000)
+        if response.status_code >= 400:
+            raise httpx.HTTPStatusError(
+                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
+                request=response.request,
+                response=response,
+            )
+        data = response.json()
+        return _parse_chat_completion(data, latency_ms=latency_ms)
+
+
+def _parse_chat_completion(payload: dict[str, Any], *, latency_ms: int) -> OpenRouterResponse:
+    """Tolerant parser for OpenRouter / OpenAI chat-completions JSON.
+
+    OpenRouter passes through any provider-specific extras, but the
+    canonical shape is ``choices[0].message.content`` (string OR array
+    of content parts) and ``usage.prompt_tokens / completion_tokens / total_tokens``.
+    Cost lives at the top level (``payload["usage"]["cost"]`` or
+    ``payload["x-or-cost"]``) depending on routing.
+    """
+
+    text = ""
+    finish_reason: str | None = None
+    choices = payload.get("choices") or []
+    if choices:
+        message = (choices[0] or {}).get("message") or {}
+        content = message.get("content")
+        if isinstance(content, str):
+            text = content
+        elif isinstance(content, list):
+            chunks: list[str] = []
+            for part in content:
+                if isinstance(part, dict) and part.get("type") in {"text", "output_text"}:
+                    chunks.append(str(part.get("text", "")))
+            text = "".join(chunks)
+        finish_reason = (choices[0] or {}).get("finish_reason") or None
+
+    usage = payload.get("usage") or {}
+    input_tokens = int(usage.get("prompt_tokens") or 0)
+    output_tokens = int(usage.get("completion_tokens") or 0)
+    total_tokens = int(usage.get("total_tokens") or (input_tokens + output_tokens))
+
+    # OpenRouter exposes cost in dollars on `usage.cost` or `cost`. We
+    # convert to integer micros to avoid float-summing surprises across
+    # 7,663 MIRAGE questions.
+    raw_cost = usage.get("cost")
+    if raw_cost is None:
+        raw_cost = payload.get("cost")
+    cost_micros = 0
+    if raw_cost is not None:
+        try:
+            cost_micros = int(round(float(raw_cost) * 1_000_000))
+        except (TypeError, ValueError):
+            cost_micros = 0
+
+    return OpenRouterResponse(
+        text=text,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        total_tokens=total_tokens,
+        cost_micros=cost_micros,
+        latency_ms=latency_ms,
+        finish_reason=finish_reason,
+        raw=payload,
+    )
+
+
+__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse", "PdfEngine"]
diff --git a/surfsense_evals/src/surfsense_evals/core/registry.py b/surfsense_evals/src/surfsense_evals/core/registry.py
new file mode 100644
index 0000000000..cc8b725e0c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/registry.py
@@ -0,0 +1,265 @@
+"""Suite + Benchmark protocols and the global registry.
+
+The extensibility seam: ``core.cli`` walks ``surfsense_evals.suites`` on
+import, which auto-imports every benchmark subpackage, which calls
+``register(<benchmark>)`` at module bottom. The CLI then iterates the
+populated registry to build subcommand groups dynamically.
+
+Adding a new domain = drop a folder under ``suites/<domain>/<bench>/``
+that ends in ``register(MyBenchmark())``. No edits anywhere in
+``core/`` are required.
+"""
+
+from __future__ import annotations
+
+import argparse
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol, runtime_checkable
+
+import httpx
+
+from .clients import DocumentsClient, NewChatClient, SearchSpaceClient
+from .config import Config, SuiteState
+
+# ---------------------------------------------------------------------------
+# Run context — what every benchmark.ingest/run receives
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class RunContext:
+    """Per-invocation environment threaded into ``ingest`` and ``run``.
+
+    A benchmark uses this to read pinned suite state, build new HTTP
+    clients on the shared ``http`` session, find the right data /
+    reports paths, and discover the active OpenRouter model + key.
+
+    ``http`` is the authenticated SurfSense client (auth event hook
+    attached). It is **not** an OpenRouter client — providers create
+    their own short-lived clients because OpenRouter doesn't share the
+    SurfSense bearer.
+    """
+
+    suite: str
+    benchmark: str
+    config: Config
+    suite_state: SuiteState
+    http: httpx.AsyncClient
+
+    @property
+    def search_space_id(self) -> int:
+        return self.suite_state.search_space_id
+
+    @property
+    def agent_llm_id(self) -> int:
+        return self.suite_state.agent_llm_id
+
+    @property
+    def provider_model(self) -> str:
+        """Slug used by the SurfSense agent (and the native arm by default).
+
+        For ``cost-arbitrage`` scenarios this is the *cheap, text-only*
+        slug — SurfSense answers from the chunks the vision LLM already
+        extracted at ingest. The native arm should use
+        ``native_arm_model`` instead in that scenario.
+        """
+
+        return self.suite_state.provider_model
+
+    @property
+    def native_arm_model(self) -> str:
+        """Slug the native_pdf arm should use.
+
+        Defaults to ``provider_model`` (head-to-head / symmetric-cheap);
+        for ``cost-arbitrage`` it returns the explicit
+        ``--native-arm-model`` so the native arm can fairly answer
+        image-bearing questions.
+        """
+
+        return self.suite_state.effective_native_arm_model
+
+    @property
+    def vision_provider_model(self) -> str | None:
+        """Slug of the OpenRouter vision LLM SurfSense used at ingest.
+
+        ``None`` if no vision config was attached at setup (legacy or
+        text-only suite). Used by runners purely to record what was
+        actually used in ``RunArtifact.extra`` and to label reports.
+        """
+
+        return self.suite_state.vision_provider_model
+
+    @property
+    def scenario(self) -> str:
+        """Scenario name pinned at setup time (see ``config.SCENARIOS``)."""
+
+        return self.suite_state.scenario
+
+    def search_space_client(self) -> SearchSpaceClient:
+        return SearchSpaceClient(self.http, self.config.surfsense_api_base)
+
+    def documents_client(self) -> DocumentsClient:
+        return DocumentsClient(self.http, self.config.surfsense_api_base)
+
+    def new_chat_client(self) -> NewChatClient:
+        return NewChatClient(self.http, self.config.surfsense_api_base)
+
+    def maps_dir(self) -> Path:
+        path = self.config.suite_maps_dir(self.suite)
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    def runs_dir(self, *, run_timestamp: str) -> Path:
+        path = self.config.suite_runs_dir(self.suite) / run_timestamp / self.benchmark
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    def benchmark_data_dir(self) -> Path:
+        path = self.config.suite_data_dir(self.suite) / self.benchmark
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+
+# ---------------------------------------------------------------------------
+# Run artifact + report section
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class RunArtifact:
+    """Everything a runner persists for the report writer to consume.
+
+    ``raw_path`` points at the JSONL of per-question ``ArmResult``
+    rows. ``metrics`` is a free-form dict the benchmark fills in (e.g.
+    ``{"native": {...}, "surfsense": {...}, "delta": {...}}``).
+    """
+
+    suite: str
+    benchmark: str
+    run_timestamp: str
+    raw_path: Path
+    metrics: dict[str, Any] = field(default_factory=dict)
+    extra: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ReportSection:
+    """One benchmark's slice of the final summary."""
+
+    title: str
+    headline: bool
+    body_md: str
+    body_json: dict[str, Any] = field(default_factory=dict)
+
+
+# ---------------------------------------------------------------------------
+# Benchmark protocol + registry
+# ---------------------------------------------------------------------------
+
+
+@runtime_checkable
+class Benchmark(Protocol):
+    """The contract every benchmark module ends with ``register(<x>)``."""
+
+    suite: str
+    name: str
+    headline: bool
+    description: str
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:  # pragma: no cover - protocol
+        ...
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:  # pragma: no cover - protocol
+        ...
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:  # pragma: no cover - protocol
+        """Add benchmark-specific flags to ``run <suite> <benchmark>``."""
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:  # pragma: no cover - protocol
+        ...
+
+
+# ---------------------------------------------------------------------------
+# Registry storage
+# ---------------------------------------------------------------------------
+
+
+_REGISTRY: dict[tuple[str, str], Benchmark] = {}
+
+
+def register(benchmark: Benchmark) -> None:
+    """Add ``benchmark`` to the registry. Last-wins on duplicate keys.
+
+    Duplicate registrations log a warning rather than raising so a
+    benchmark module imported twice (once via auto-discovery, once via
+    a test directly importing it) doesn't blow up the CLI.
+    """
+
+    key = (benchmark.suite, benchmark.name)
+    if key in _REGISTRY:
+        import logging
+
+        logging.getLogger(__name__).warning(
+            "Benchmark %s/%s re-registered (overwriting prior)", *key
+        )
+    _REGISTRY[key] = benchmark
+
+
+def unregister(suite: str, name: str) -> None:
+    """Test helper: drop a single benchmark from the registry."""
+
+    _REGISTRY.pop((suite, name), None)
+
+
+def reset() -> None:
+    """Test helper: wipe the registry (use with monkeypatched discovery)."""
+
+    _REGISTRY.clear()
+
+
+def get(suite: str, name: str) -> Benchmark:
+    try:
+        return _REGISTRY[(suite, name)]
+    except KeyError as exc:
+        available = ", ".join(f"{s}/{n}" for s, n in sorted(_REGISTRY)) or "<none>"
+        raise KeyError(
+            f"Unknown benchmark '{suite}/{name}'. Registered: {available}"
+        ) from exc
+
+
+def list_suites() -> list[str]:
+    return sorted({s for s, _ in _REGISTRY})
+
+
+def list_benchmarks(suite: str | None = None) -> list[Benchmark]:
+    if suite is None:
+        return [_REGISTRY[k] for k in sorted(_REGISTRY)]
+    return [_REGISTRY[k] for k in sorted(_REGISTRY) if k[0] == suite]
+
+
+def snapshot() -> Mapping[tuple[str, str], Benchmark]:
+    """Read-only view for diagnostics (e.g. ``benchmarks list`` rendering)."""
+
+    return dict(_REGISTRY)
+
+
+__all__ = [
+    "Arm",
+    "Benchmark",
+    "ReportSection",
+    "RunArtifact",
+    "RunContext",
+    "get",
+    "list_benchmarks",
+    "list_suites",
+    "register",
+    "reset",
+    "snapshot",
+    "unregister",
+]
+
+
+# Re-export Arm from arms.base so suites can `from core.registry import Arm`.
+from .arms.base import Arm  # noqa: E402, F401  (deliberate re-export at bottom)
diff --git a/surfsense_evals/src/surfsense_evals/core/report/__init__.py b/surfsense_evals/src/surfsense_evals/core/report/__init__.py
new file mode 100644
index 0000000000..c5ccbc64c4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/report/__init__.py
@@ -0,0 +1,18 @@
+"""Report writer + section composition primitives. Lazy import."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .writer import write_report
+
+__all__ = ["write_report"]
+
+
+def __getattr__(name: str):
+    if name == "write_report":
+        from .writer import write_report
+
+        return write_report
+    raise AttributeError(f"module 'surfsense_evals.core.report' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/report/writer.py b/surfsense_evals/src/surfsense_evals/core/report/writer.py
new file mode 100644
index 0000000000..8d1ffa07a1
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/report/writer.py
@@ -0,0 +1,89 @@
+"""Report writer — composes per-benchmark sections into one summary.
+
+Output:
+
+* ``reports/<suite>/<run-timestamp>/summary.md`` — human-readable.
+  Bullet lists only (no tables) per project's coding-standards.
+* ``reports/<suite>/<run-timestamp>/summary.json`` — same content as
+  structured JSON for downstream tooling (CI dashboards, regressions).
+
+Headline benchmarks come first in both outputs.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterable
+from pathlib import Path
+
+from ..config import Config
+from ..registry import ReportSection
+
+
+def write_report(
+    *,
+    config: Config,
+    suite: str,
+    sections: Iterable[ReportSection],
+    run_timestamp: str,
+) -> Path:
+    """Write ``summary.md`` + ``summary.json``. Returns the path of the .md file."""
+
+    sections_list = list(sections)
+    sections_list.sort(key=lambda s: (not s.headline, s.title.lower()))
+
+    out_dir = config.suite_reports_dir(suite) / run_timestamp
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    md_path = out_dir / "summary.md"
+    json_path = out_dir / "summary.json"
+
+    md_lines: list[str] = [
+        f"# SurfSense evals — suite `{suite}`",
+        "",
+        f"- Run timestamp: `{run_timestamp}`",
+        f"- Sections: {len(sections_list)}",
+        "",
+    ]
+    headline = [s for s in sections_list if s.headline]
+    secondary = [s for s in sections_list if not s.headline]
+    if headline:
+        md_lines.append("## Headline")
+        md_lines.append("")
+        for section in headline:
+            md_lines.append(f"### {section.title}")
+            md_lines.append("")
+            md_lines.append(section.body_md.rstrip())
+            md_lines.append("")
+    if secondary:
+        md_lines.append("## Secondary measurements")
+        md_lines.append("")
+        for section in secondary:
+            md_lines.append(f"### {section.title}")
+            md_lines.append("")
+            md_lines.append(section.body_md.rstrip())
+            md_lines.append("")
+
+    md_path.write_text("\n".join(md_lines).rstrip() + "\n", encoding="utf-8")
+
+    json_payload = {
+        "suite": suite,
+        "run_timestamp": run_timestamp,
+        "sections": [
+            {
+                "title": s.title,
+                "headline": s.headline,
+                "body_md": s.body_md,
+                "body_json": s.body_json,
+            }
+            for s in sections_list
+        ],
+    }
+    json_path.write_text(
+        json.dumps(json_payload, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+    return md_path
+
+
+__all__ = ["ReportSection", "write_report"]
diff --git a/surfsense_evals/src/surfsense_evals/core/scenarios.py b/surfsense_evals/src/surfsense_evals/core/scenarios.py
new file mode 100644
index 0000000000..16874a0699
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/scenarios.py
@@ -0,0 +1,58 @@
+"""Shared scenario formatting helpers for head-to-head benchmark reports.
+
+The scenario chosen at ``setup`` time (``head-to-head``, ``symmetric-cheap``,
+``cost-arbitrage``) materially changes how a head-to-head report should be
+read. This module produces the one-bullet summary every head-to-head
+runner stamps near the top of its ``report_section`` body so reviewers
+immediately see the framing — no need to dig into ``run_artifact.json``.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+
+
+def format_scenario_md(extra: Mapping[str, Any] | None) -> str:
+    """Render a scenario-aware bullet for a benchmark report.
+
+    Reads ``extra["scenario"]`` plus the runtime LLM slugs the runner
+    recorded. Falls back to a sensible "head-to-head" line if the artifact
+    pre-dates scenarios so old runs still render cleanly.
+    """
+
+    extra = dict(extra or {})
+    scenario = str(extra.get("scenario") or "head-to-head")
+    surf_slug = str(extra.get("provider_model") or "?")
+    native_slug = str(extra.get("native_arm_model") or surf_slug)
+    vision_slug = extra.get("vision_provider_model")
+
+    if scenario == "cost-arbitrage":
+        body = (
+            f"- Scenario: **cost-arbitrage** — native arm answers with "
+            f"`{native_slug}` (vision); SurfSense answers with `{surf_slug}` "
+            f"over chunks vision-extracted at ingest"
+            f"{f' by `{vision_slug}`' if vision_slug else ''}. "
+            "Measures how close SurfSense gets to native at a fraction of "
+            "the per-query cost."
+        )
+    elif scenario == "symmetric-cheap":
+        body = (
+            f"- Scenario: **symmetric-cheap** — both arms answer with "
+            f"`{surf_slug}`; SurfSense pre-extracted images at ingest"
+            f"{f' via `{vision_slug}`' if vision_slug else ''}. "
+            "Native arm structurally loses on image-bearing questions "
+            "(text-only model can't see images) — that's the point."
+        )
+    else:
+        body = (
+            f"- Scenario: head-to-head — both arms answer with `{surf_slug}` "
+            "via OpenRouter."
+        )
+        if vision_slug:
+            body += f" SurfSense ingest VLM: `{vision_slug}`."
+
+    return body
+
+
+__all__ = ["format_scenario_md"]
diff --git a/surfsense_evals/src/surfsense_evals/core/vision_llm.py b/surfsense_evals/src/surfsense_evals/core/vision_llm.py
new file mode 100644
index 0000000000..ae96f12854
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/vision_llm.py
@@ -0,0 +1,127 @@
+"""Vision LLM resolution + auto-pick logic for the harness's ``setup`` command.
+
+Two responsibilities:
+
+1. Resolve an explicit ``--vision-llm <slug>`` to a global OpenRouter
+   vision LLM config id that ``set_llm_preferences(vision_llm_config_id=...)``
+   can accept.
+2. Auto-pick the strongest registered vision config when the operator
+   doesn't pass ``--vision-llm`` but the scenario / benchmark needs one.
+
+The priority list mirrors the recommended slugs in the README so the
+auto-pick is deterministic and reviewable.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+from .clients.search_space import VisionLlmConfigEntry
+
+# Order matters — first match wins when auto-picking. Keep these in sync
+# with the "Recommended vision slugs" table in the README so the
+# auto-pick story is the same one users read about.
+RECOMMENDED_VISION_PRIORITY: tuple[str, ...] = (
+    "anthropic/claude-sonnet-4.5",
+    "anthropic/claude-opus-4.7",
+    "openai/gpt-5",
+    "google/gemini-2.5-pro",
+)
+
+
+class VisionConfigError(RuntimeError):
+    """Raised when no vision config can be resolved (explicit or auto)."""
+
+
+@dataclass(frozen=True)
+class ResolvedVisionConfig:
+    """Result of ``resolve_vision_llm`` — what to attach + a label for logs."""
+
+    config_id: int
+    provider_model: str
+    selected_via: str  # "explicit" | "auto-priority" | "auto-fallback"
+
+
+def _openrouter_only(entries: Iterable[VisionLlmConfigEntry]) -> list[VisionLlmConfigEntry]:
+    return [e for e in entries if e.provider == "OPENROUTER" and not e.is_auto_mode]
+
+
+def resolve_vision_llm(
+    candidates: list[VisionLlmConfigEntry],
+    *,
+    explicit_slug: str | None,
+) -> ResolvedVisionConfig:
+    """Resolve a vision LLM config id from a slug or by auto-picking.
+
+    * If ``explicit_slug`` is given: must match exactly one OpenRouter
+      vision config's ``model_name``. Raises ``VisionConfigError`` with a
+      friendly listing if zero / many match.
+    * Otherwise: walk ``RECOMMENDED_VISION_PRIORITY`` in order and return
+      the first registered one. If none of the recommended slugs are
+      registered, fall back to the first OpenRouter vision config in the
+      list (deterministic by listing order). Raises ``VisionConfigError``
+      if zero are registered at all.
+    """
+
+    or_vision = _openrouter_only(candidates)
+
+    if explicit_slug is not None:
+        matches = [e for e in or_vision if e.model_name == explicit_slug]
+        if not matches:
+            sample = ", ".join(e.model_name for e in or_vision[:8]) or "<none>"
+            raise VisionConfigError(
+                f"No OpenRouter vision config found for slug '{explicit_slug}'. "
+                "Make sure `openrouter_integration.vision_enabled: true` in "
+                "global_llm_config.yaml and that the Celery worker has finished "
+                "its first refresh. "
+                f"Available OpenRouter vision slugs (sample): {sample}."
+            )
+        if len(matches) > 1:
+            listing = "\n".join(f"  id={e.id}  name={e.name!r}" for e in matches)
+            raise VisionConfigError(
+                f"Multiple OpenRouter vision configs match '{explicit_slug}':\n{listing}"
+            )
+        only = matches[0]
+        return ResolvedVisionConfig(
+            config_id=only.id,
+            provider_model=only.model_name,
+            selected_via="explicit",
+        )
+
+    if not or_vision:
+        raise VisionConfigError(
+            "No OpenRouter vision LLM configs are registered with this "
+            "SurfSense backend. Either pass `--no-vision-llm` to the ingest "
+            "step (text-only ingestion), or enable "
+            "`openrouter_integration.vision_enabled: true` in "
+            "global_llm_config.yaml so the Celery worker syncs vision-capable "
+            "OpenRouter models on next refresh."
+        )
+
+    by_slug = {e.model_name: e for e in or_vision}
+    for preferred in RECOMMENDED_VISION_PRIORITY:
+        match = by_slug.get(preferred)
+        if match is not None:
+            return ResolvedVisionConfig(
+                config_id=match.id,
+                provider_model=match.model_name,
+                selected_via="auto-priority",
+            )
+
+    # Fallback: first registered OpenRouter vision config. Deterministic
+    # because the backend returns them in a stable order.
+    fallback = or_vision[0]
+    return ResolvedVisionConfig(
+        config_id=fallback.id,
+        provider_model=fallback.model_name,
+        selected_via="auto-fallback",
+    )
+
+
+__all__ = [
+    "RECOMMENDED_VISION_PRIORITY",
+    "ResolvedVisionConfig",
+    "VisionConfigError",
+    "resolve_vision_llm",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/__init__.py b/surfsense_evals/src/surfsense_evals/suites/__init__.py
new file mode 100644
index 0000000000..95ed958ca7
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/__init__.py
@@ -0,0 +1,66 @@
+"""Suite registry auto-discovery.
+
+Importing ``surfsense_evals.suites`` walks every subpackage one level deep
+(domain like ``medical``) AND its benchmark subpackages
+(``medical/medxpertqa``, ``medical/mirage``, ``medical/cure``). Each
+benchmark's ``__init__.py`` is expected to call
+``core.registry.register(<Benchmark>)`` at module bottom; merely importing
+the module is enough to populate the registry.
+
+Adding a new domain is therefore: drop a folder under ``suites/`` with the
+right structure. No edits anywhere else.
+
+Subpackages whose name starts with ``_`` are skipped — that's reserved for
+test fixtures (e.g. ``suites/_demo/``) so they don't accidentally show up
+in ``benchmarks list``.
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+import pkgutil
+from typing import Iterable
+
+logger = logging.getLogger(__name__)
+
+
+def _iter_subpackages(package) -> Iterable[str]:
+    """Yield fully-qualified subpackage names one level deep, skipping ``_*``."""
+
+    for module_info in pkgutil.iter_modules(package.__path__, prefix=f"{package.__name__}."):
+        if not module_info.ispkg:
+            continue
+        leaf = module_info.name.rsplit(".", 1)[-1]
+        if leaf.startswith("_"):
+            continue
+        yield module_info.name
+
+
+def discover_suites() -> list[str]:
+    """Import every domain + benchmark subpackage so registrations fire.
+
+    Returns the list of fully-qualified benchmark module names that were
+    successfully imported. Failures are logged (not raised) so a single
+    broken benchmark doesn't take down the whole CLI — the operator still
+    sees the working benchmarks via ``benchmarks list``.
+    """
+
+    import surfsense_evals.suites as _suites  # self-import for __path__
+
+    imported: list[str] = []
+    for domain_name in _iter_subpackages(_suites):
+        try:
+            domain_pkg = importlib.import_module(domain_name)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Failed to import suite domain %s: %s", domain_name, exc)
+            continue
+        for benchmark_name in _iter_subpackages(domain_pkg):
+            try:
+                importlib.import_module(benchmark_name)
+                imported.append(benchmark_name)
+            except Exception as exc:  # noqa: BLE001
+                logger.warning(
+                    "Failed to import benchmark %s: %s", benchmark_name, exc
+                )
+    return imported
diff --git a/surfsense_evals/src/surfsense_evals/suites/_demo/__init__.py b/surfsense_evals/src/surfsense_evals/suites/_demo/__init__.py
new file mode 100644
index 0000000000..9a8cd447e4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/_demo/__init__.py
@@ -0,0 +1,8 @@
+"""Test fixture suite — skipped by the auto-discovery walker (name starts with ``_``).
+
+Imported explicitly by ``tests/core/test_registry.py`` to prove the
+register-on-import contract works without polluting the production
+benchmark list.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/_demo/hello/__init__.py b/surfsense_evals/src/surfsense_evals/suites/_demo/hello/__init__.py
new file mode 100644
index 0000000000..1da33926c9
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/_demo/hello/__init__.py
@@ -0,0 +1,46 @@
+"""Demo benchmark — registers on import, used only by the registry tests."""
+
+from __future__ import annotations
+
+import argparse
+from typing import Any
+
+from ....core.registry import (
+    Benchmark,
+    ReportSection,
+    RunArtifact,
+    RunContext,
+    register,
+)
+
+
+class HelloBenchmark:
+    suite: str = "_demo"
+    name: str = "hello"
+    headline: bool = False
+    description: str = "Demo benchmark used by the registry test."
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument("--echo", default="hi")
+
+    async def ingest(self, ctx: RunContext, **_opts: Any) -> None:  # pragma: no cover
+        return None
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:  # pragma: no cover
+        return RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp="0",
+            raw_path=ctx.benchmark_data_dir() / "raw.jsonl",
+            metrics={"echo": opts.get("echo")},
+        )
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        return ReportSection(
+            title="Hello demo",
+            headline=False,
+            body_md="- runs: " + str(len(artifacts)),
+        )
+
+
+register(HelloBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/__init__.py
new file mode 100644
index 0000000000..9c0067e25c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/__init__.py
@@ -0,0 +1,7 @@
+"""Medical RAG benchmarks (MedXpertQA-MM headline + MIRAGE/CUREv1 secondary).
+
+Subpackages register themselves with ``core.registry`` on import. The
+``suites/__init__.py`` discovery walker imports them automatically.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/__init__.py
new file mode 100644
index 0000000000..e13224be73
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/__init__.py
@@ -0,0 +1,18 @@
+"""CUREv1 — secondary single-arm SurfSense retrieval measurement.
+
+Source: https://huggingface.co/datasets/clinia/CUREv1
+Paper: https://arxiv.org/html/2412.06954v4
+
+Pure retrieval benchmark — 10 medical disciplines, English/French/Spanish
+queries, expert-curated qrels (graded 0/1/2). The harness ingests the
+corpus, runs each query via SurfSense's ``/api/v1/new_chat``, parses
+chunk citations, maps them back to CUREv1 ``corpus-id``, and scores
+Recall@k / MRR / nDCG@10 against qrels.
+"""
+
+from __future__ import annotations
+
+from .runner import CureBenchmark
+from ....core import registry as _registry
+
+_registry.register(CureBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py
new file mode 100644
index 0000000000..6eca8810c0
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py
@@ -0,0 +1,239 @@
+"""CUREv1 ingestion.
+
+For each (lang, discipline) requested, downloads the corpus split via
+``datasets.load_dataset(path="clinia/CUREv1", name="corpus", split=<discipline>)``,
+batches passages into ~5 MB markdown bundles, uploads them to
+SurfSense, polls until ``ready``, and persists the
+``corpus_id -> document_id`` map under
+``data/medical/maps/cure_corpus_map_<discipline>.jsonl``. A union map
+``cure_corpus_map.jsonl`` is also written so the runner can resolve
+citations across disciplines without juggling per-file paths.
+"""
+
+from __future__ import annotations
+
+import io
+import json
+import logging
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+_BATCH_SIZE_BYTES = 5 * 1024 * 1024
+
+# 10 disciplines covered by the dataset card. We exhaustively list
+# them so a smoke test can default to one.
+DISCIPLINES = (
+    "anesthesiology",
+    "cardiology",
+    "dermatology",
+    "endocrinology",
+    "gastroenterology",
+    "hematology",
+    "nephrology",
+    "neurology",
+    "obstetrics_gynecology",
+    "psychiatry",
+)
+
+
+@dataclass
+class CorpusPassage:
+    corpus_id: str
+    title: str
+    text: str
+
+    def to_markdown(self) -> str:
+        title = (self.title or "").strip() or "Untitled"
+        body = (self.text or "").strip()
+        return f"# {title}\n\n_id: `{self.corpus_id}`_\n\n{body}\n"
+
+
+@dataclass
+class PassageBatch:
+    path: Path
+    corpus_ids: list[str]
+
+
+def _stream_corpus(discipline: str) -> Iterable[CorpusPassage]:
+    """Stream corpus rows for one discipline via the ``datasets`` library."""
+
+    from datasets import load_dataset  # noqa: PLC0415
+
+    logger.info("Loading CUREv1 corpus for discipline=%s", discipline)
+    ds = load_dataset(path="clinia/CUREv1", name="corpus", split=discipline)
+    for row in ds:
+        cid = str(row.get("_id") or "")
+        if not cid:
+            continue
+        yield CorpusPassage(
+            corpus_id=cid,
+            title=str(row.get("title") or ""),
+            text=str(row.get("text") or ""),
+        )
+
+
+def _write_batches(
+    passages: Iterable[CorpusPassage],
+    *,
+    out_dir: Path,
+    discipline: str,
+    batch_bytes: int = _BATCH_SIZE_BYTES,
+) -> list[PassageBatch]:
+    out_dir.mkdir(parents=True, exist_ok=True)
+    batches: list[PassageBatch] = []
+    current_buffer = io.StringIO()
+    current_ids: list[str] = []
+    current_bytes = 0
+    batch_idx = 0
+
+    def _flush() -> None:
+        nonlocal current_buffer, current_ids, current_bytes, batch_idx
+        if not current_ids:
+            return
+        path = out_dir / f"cure_{discipline}_{batch_idx:04d}.md"
+        path.write_text(current_buffer.getvalue(), encoding="utf-8")
+        batches.append(PassageBatch(path=path, corpus_ids=current_ids))
+        batch_idx += 1
+        current_buffer = io.StringIO()
+        current_ids = []
+        current_bytes = 0
+
+    for passage in passages:
+        chunk = passage.to_markdown() + "\n---\n\n"
+        chunk_bytes = len(chunk.encode("utf-8"))
+        if current_bytes + chunk_bytes > batch_bytes and current_ids:
+            _flush()
+        current_buffer.write(chunk)
+        current_ids.append(passage.corpus_id)
+        current_bytes += chunk_bytes
+    _flush()
+    return batches
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    disciplines: list[str] | None = None,
+    max_per_discipline: int | None = None,
+    settings: IngestSettings | None = None,
+) -> None:
+    disciplines = disciplines or list(DISCIPLINES)
+    settings = settings or IngestSettings(use_vision_llm=False, processing_mode="basic")
+    bench_dir = ctx.benchmark_data_dir()
+    batches_root = bench_dir / "batches"
+    batches_root.mkdir(parents=True, exist_ok=True)
+
+    docs_client = ctx.documents_client()
+    union_map_path = ctx.maps_dir() / "cure_corpus_map.jsonl"
+    union_map_fh = union_map_path.open("w", encoding="utf-8")
+    # Header row records the ingest-time settings so the runner can
+    # surface them in the report (see core/ingest_settings.py).
+    union_map_fh.write(settings_header_line(settings) + "\n")
+    try:
+        for discipline in disciplines:
+            try:
+                passages_iter = _stream_corpus(discipline)
+                if max_per_discipline is not None:
+                    passages_iter = _take(passages_iter, max_per_discipline)
+                batches = _write_batches(
+                    passages_iter,
+                    out_dir=batches_root / discipline,
+                    discipline=discipline,
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("Skipping discipline %s: %s", discipline, exc)
+                continue
+            if not batches:
+                logger.warning("Discipline %s produced 0 batches; skipping upload", discipline)
+                continue
+            logger.info(
+                "Uploading %d batches for discipline %s", len(batches), discipline
+            )
+            upload_result = await docs_client.upload(
+                files=[b.path for b in batches],
+                search_space_id=ctx.search_space_id,
+                should_summarize=settings.should_summarize,
+                use_vision_llm=settings.use_vision_llm,
+                processing_mode=settings.processing_mode,
+            )
+            new_doc_ids = list(upload_result.document_ids)
+            if new_doc_ids:
+                await docs_client.wait_until_ready(
+                    search_space_id=ctx.search_space_id,
+                    document_ids=new_doc_ids,
+                    timeout_s=3600.0,
+                    max_poll_s=15.0,
+                )
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=new_doc_ids + upload_result.duplicate_document_ids,
+            )
+            title_to_doc = {s.title: s.document_id for s in statuses}
+
+            per_discipline_path = (
+                ctx.maps_dir() / f"cure_corpus_map_{discipline}.jsonl"
+            )
+            with per_discipline_path.open("w", encoding="utf-8") as fh:
+                fh.write(settings_header_line(settings) + "\n")
+                for batch in batches:
+                    doc_id = title_to_doc.get(batch.path.name)
+                    if doc_id is None:
+                        logger.warning("No document_id for batch %s", batch.path.name)
+                        continue
+                    for cid in batch.corpus_ids:
+                        record = {
+                            "corpus_id": cid,
+                            "document_id": doc_id,
+                            "discipline": discipline,
+                        }
+                        fh.write(json.dumps(record) + "\n")
+                        union_map_fh.write(json.dumps(record) + "\n")
+
+            chunks_map_path = ctx.maps_dir() / f"cure_chunk_map_{discipline}.jsonl"
+            with chunks_map_path.open("w", encoding="utf-8") as fh:
+                for doc_id in {title_to_doc.get(b.path.name) for b in batches} - {None}:
+                    try:
+                        chunks = await docs_client.list_chunks(int(doc_id))
+                    except Exception as exc:  # noqa: BLE001
+                        logger.warning(
+                            "Failed to list chunks for doc_id=%s: %s", doc_id, exc
+                        )
+                        continue
+                    for chunk in chunks:
+                        fh.write(
+                            json.dumps(
+                                {
+                                    "chunk_id": chunk.id,
+                                    "document_id": doc_id,
+                                    "discipline": discipline,
+                                }
+                            )
+                            + "\n"
+                        )
+    finally:
+        union_map_fh.close()
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["cure"] = str(union_map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+    logger.info("CUREv1 ingestion complete; union map at %s", union_map_path)
+
+
+def _take(it: Iterable, n: int) -> Iterable:
+    yielded = 0
+    for x in it:
+        if yielded >= n:
+            return
+        yield x
+        yielded += 1
+
+
+__all__ = ["DISCIPLINES", "CorpusPassage", "PassageBatch", "run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py
new file mode 100644
index 0000000000..416912b14c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py
@@ -0,0 +1,397 @@
+"""CUREv1 runner — single-arm SurfSense retrieval scoring.
+
+For each query we ask SurfSense via ``/api/v1/new_chat`` (no
+``mentioned_document_ids``) and parse chunk citations from the
+streamed answer. Cited ``chunk_id`` → ``document_id`` (chunk map) →
+``corpus_id`` (corpus map). The resulting ranked list is scored
+against the dataset's qrels.
+
+The prompt nudges the model to surface its supporting passages via
+SurfSense's standard ``[citation:CHUNK_ID]`` format (already required
+by the agent system prompt), so we recover retrieval ordering from
+the answer text without needing a separate retrieval API.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+    read_settings_header,
+)
+from ....core.metrics.retrieval import score_run
+from ....core.registry import (
+    Benchmark,
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_PROMPT = """\
+You are a medical literature retrieval assistant for the question
+below. Identify the top passages from the knowledge base that best
+answer it and cite each one in the standard format
+[citation:CHUNK_ID]. List as many citations as are useful, ordered
+from most to least relevant. Provide a one-sentence justification
+for each citation.
+
+Query: {query}
+"""
+
+
+_DESCRIPTION = "CUREv1 retrieval (single-arm SurfSense): Recall@k / MRR / nDCG@10."
+
+# CUREv1 corpus is text-only markdown bundles; vision LLM at ingest
+# is wasted by default but the operator can flip it via CLI for an
+# A/B comparison.
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+@dataclass
+class CureQuery:
+    qid: str
+    text: str
+    discipline: str
+
+
+def _load_chunk_map(maps_dir: Path) -> dict[int, int]:
+    """Union all ``cure_chunk_map_<discipline>.jsonl`` into one dict."""
+
+    out: dict[int, int] = {}
+    for path in sorted(maps_dir.glob("cure_chunk_map_*.jsonl")):
+        with path.open("r", encoding="utf-8") as fh:
+            for line in fh:
+                if not line.strip():
+                    continue
+                row = json.loads(line)
+                if is_settings_header(row):
+                    continue
+                try:
+                    out[int(row["chunk_id"])] = int(row["document_id"])
+                except (KeyError, TypeError, ValueError):
+                    continue
+    return out
+
+
+def _load_doc_to_corpus(maps_dir: Path) -> dict[int, list[str]]:
+    """Map ``document_id -> [corpus_id, ...]`` from the union map.
+
+    Multiple corpus passages may live in one batched markdown
+    document, so each doc_id maps to a list. Citation ordering of the
+    first occurrence is preserved.
+    """
+
+    out: dict[int, list[str]] = defaultdict(list)
+    union_path = maps_dir / "cure_corpus_map.jsonl"
+    if not union_path.exists():
+        return out
+    with union_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            if not line.strip():
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                continue
+            try:
+                out[int(row["document_id"])].append(str(row["corpus_id"]))
+            except (KeyError, TypeError, ValueError):
+                continue
+    return out
+
+
+def _load_queries(*, lang: str, disciplines: list[str], sample_n: int | None) -> list[CureQuery]:
+    from datasets import load_dataset  # noqa: PLC0415
+
+    out: list[CureQuery] = []
+    for discipline in disciplines:
+        try:
+            ds = load_dataset(path="clinia/CUREv1", name=f"queries-{lang}", split=discipline)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Skipping queries for %s/%s: %s", lang, discipline, exc)
+            continue
+        for row in ds:
+            qid = str(row.get("_id") or "")
+            text = str(row.get("text") or "")
+            if not qid or not text:
+                continue
+            out.append(CureQuery(qid=qid, text=text, discipline=discipline))
+    out.sort(key=lambda q: (q.discipline, q.qid))
+    if sample_n is not None and sample_n > 0:
+        # Stratified-by-discipline slice.
+        per_d = max(1, sample_n // max(1, len(disciplines)))
+        sliced: list[CureQuery] = []
+        counter: dict[str, int] = defaultdict(int)
+        for q in out:
+            if counter[q.discipline] >= per_d:
+                continue
+            sliced.append(q)
+            counter[q.discipline] += 1
+            if len(sliced) >= sample_n:
+                break
+        out = sliced
+    return out
+
+
+def _load_qrels(*, disciplines: list[str]) -> dict[str, dict[str, float]]:
+    from datasets import load_dataset  # noqa: PLC0415
+
+    out: dict[str, dict[str, float]] = defaultdict(dict)
+    for discipline in disciplines:
+        try:
+            ds = load_dataset(path="clinia/CUREv1", name="qrels", split=discipline)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Skipping qrels for %s: %s", discipline, exc)
+            continue
+        for row in ds:
+            qid = str(row.get("query-id") or row.get("query_id") or "")
+            cid = str(row.get("corpus-id") or row.get("corpus_id") or "")
+            score = row.get("score")
+            if not qid or not cid or score is None:
+                continue
+            try:
+                out[qid][cid] = float(score)
+            except (TypeError, ValueError):
+                continue
+    return out
+
+
+async def _gather_with_limit(coros, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(c):
+        async with sem:
+            return await c
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+class CureBenchmark:
+    suite: str = "medical"
+    name: str = "cure"
+    headline: bool = False
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument("--lang", default="en", choices=("en", "es", "fr"))
+        parser.add_argument("--discipline", default=None,
+                            help="Restrict to one discipline (default: all ingested).")
+        parser.add_argument("--n", dest="sample_n", type=int, default=None)
+        parser.add_argument("--concurrency", type=int, default=4)
+        parser.add_argument(
+            "--max-passages-per-discipline", type=int, default=None,
+            help="(ingest only) cap corpus rows per discipline for smoke testing.",
+        )
+        # Per-upload knobs forwarded to /documents/fileupload at ingest;
+        # ignored at run-time (runner reads resolved settings from the
+        # union-map header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import DISCIPLINES, run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            disciplines=list(DISCIPLINES),
+            max_per_discipline=opts.get("max_passages_per_discipline"),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        lang = opts.get("lang") or "en"
+        discipline_filter = opts.get("discipline")
+        sample_n = opts.get("sample_n")
+        concurrency = int(opts.get("concurrency") or 4)
+
+        maps_dir = ctx.maps_dir()
+        chunk_to_doc = _load_chunk_map(maps_dir)
+        doc_to_corpus = _load_doc_to_corpus(maps_dir)
+        ingest_settings = read_settings_header(maps_dir / "cure_corpus_map.jsonl")
+        if not chunk_to_doc or not doc_to_corpus:
+            raise RuntimeError(
+                "CUREv1 not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest medical cure` first."
+            )
+
+        # Disciplines to query are determined by the per-discipline maps
+        # actually present (either user-filtered or whatever was ingested).
+        ingested_disciplines = sorted({
+            row_disc
+            for path in maps_dir.glob("cure_corpus_map_*.jsonl")
+            for row_disc in [path.stem[len("cure_corpus_map_"):]]
+        })
+        if discipline_filter:
+            disciplines = [discipline_filter]
+        else:
+            disciplines = ingested_disciplines or ["dermatology"]
+
+        queries = _load_queries(lang=lang, disciplines=disciplines, sample_n=sample_n)
+        if not queries:
+            raise RuntimeError(
+                f"No CUREv1 queries matched lang={lang!r} disciplines={disciplines!r}."
+            )
+        qrels = _load_qrels(disciplines=disciplines)
+        logger.info(
+            "CUREv1: %d queries / %d qrels across disciplines %s",
+            len(queries),
+            len(qrels),
+            disciplines,
+        )
+
+        arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        async def _ask(q: CureQuery) -> ArmResult:
+            return await arm.answer(
+                ArmRequest(
+                    question_id=f"{q.discipline}::{q.qid}",
+                    prompt=_PROMPT.format(query=q.text.strip()),
+                )
+            )
+
+        results: list[ArmResult] = await _gather_with_limit(
+            (_ask(q) for q in queries), concurrency=concurrency
+        )
+
+        per_query_retrieved: dict[str, list[str]] = {}
+        for q, res in zip(queries, results):
+            chunk_ids: list[int] = []
+            seen: set[int] = set()
+            for citation in res.citations:
+                if citation.get("kind") != "chunk":
+                    continue
+                cid = int(citation.get("chunk_id"))
+                if cid in seen:
+                    continue
+                chunk_ids.append(cid)
+                seen.add(cid)
+            corpus_ids: list[str] = []
+            seen_corpus: set[str] = set()
+            for cid in chunk_ids:
+                doc_id = chunk_to_doc.get(cid)
+                if doc_id is None:
+                    continue
+                for corpus_id in doc_to_corpus.get(doc_id, []):
+                    if corpus_id in seen_corpus:
+                        continue
+                    corpus_ids.append(corpus_id)
+                    seen_corpus.add(corpus_id)
+            per_query_retrieved[q.qid] = corpus_ids
+
+        scores = score_run(
+            per_query_retrieved=per_query_retrieved,
+            per_query_qrels=qrels,
+            ks=(1, 5, 10, 32),
+            ndcg_k=10,
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, res in zip(queries, results):
+                fh.write(
+                    json.dumps(
+                        {
+                            "discipline": q.discipline,
+                            "qid": q.qid,
+                            "lang": lang,
+                            "retrieved_corpus_ids": per_query_retrieved.get(q.qid, []),
+                            **res.to_jsonl(),
+                        }
+                    )
+                    + "\n"
+                )
+
+        metrics = scores.to_dict()
+        metrics["lang"] = lang
+        metrics["disciplines"] = disciplines
+
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_queries": len(queries),
+                "lang": lang,
+                "disciplines": disciplines,
+                "concurrency": concurrency,
+                "provider_model": ctx.provider_model,
+                "ingest_settings": ingest_settings,
+            },
+        )
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps(
+                {
+                    "suite": self.suite,
+                    "benchmark": self.name,
+                    "raw_path": "raw.jsonl",
+                    "metrics": metrics,
+                    "extra": artifact.extra,
+                },
+                indent=2,
+                sort_keys=True,
+            )
+            + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="CUREv1 — single-arm SurfSense retrieval",
+                headline=False,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        recall = m.get("recall_at_k", {})
+        lines: list[str] = [
+            format_ingest_settings_md(latest.extra.get("ingest_settings")),
+            f"- Language: {m.get('lang', '?')}",
+            f"- Disciplines: {', '.join(m.get('disciplines', []) or ['?'])}",
+            f"- n_queries (after qrels intersection): {m.get('n_queries', 0)}",
+        ]
+        for k in (1, 5, 10, 32):
+            v = recall.get(str(k), recall.get(k))
+            if v is not None:
+                lines.append(f"- Recall@{k}: {float(v):.3f}")
+        lines.append(f"- MRR: {float(m.get('mrr', 0.0)):.3f}")
+        lines.append(f"- nDCG@10: {float(m.get('ndcg_at_10', 0.0)):.3f}")
+        return ReportSection(
+            title="CUREv1 — single-arm SurfSense retrieval",
+            headline=False,
+            body_md="\n".join(lines),
+            body_json=m,
+        )
+
+
+__all__ = ["CureBenchmark", "CureQuery"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/__init__.py
new file mode 100644
index 0000000000..3e803398d8
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/__init__.py
@@ -0,0 +1,25 @@
+"""MedXpertQA-MM — multimodal medical exam head-to-head (medical suite headline).
+
+Source: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA
+Paper:  https://arxiv.org/abs/2501.18362 (ICML 2025)
+
+* MM subset: ~2,000 expert-level exam questions with diverse medical
+  images (radiology, dermatology, pathology, ECGs, gross specimens,
+  fundus photos) and structured patient information embedded in the
+  question stem.
+* 5 answer choices per MM question (A–E).
+* USMLE / COMLEX / 17 specialty board sources; rigorously filtered
+  and reviewed by physicians.
+
+Real diagnostic images carry signal that text-only patient charts
+cannot (e.g. CT scans, dermoscopy), so this benchmark exercises the
+full vision RAG pipeline end-to-end against a vision-capable model
+fed the same PDF natively.
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import MedXpertQAMMBenchmark
+
+_registry.register(MedXpertQAMMBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py
new file mode 100644
index 0000000000..5293e116f0
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py
@@ -0,0 +1,394 @@
+"""MedXpertQA-MM ingestion.
+
+Steps:
+
+1. Pull ``MM/test.jsonl`` (and optionally ``MM/dev.jsonl``) plus
+   ``images.zip`` from
+   ``hf://datasets/TsinghuaC3I/MedXpertQA``. Cache under
+   ``<data_dir>/medical/medxpertqa/``.
+2. Extract ``images.zip`` once into ``<data_dir>/medical/medxpertqa/images/``.
+3. Render one PDF per MM question (text question + structured patient
+   info embedded in the question stem + each image flowable + answer
+   options). Output: ``<data_dir>/medical/medxpertqa/pdfs/<id>.pdf``.
+4. Upload each PDF to SurfSense with ``use_vision_llm=True``; persist
+   ``id -> document_id`` in
+   ``<data_dir>/medical/maps/medxpertqa_doc_map.jsonl``.
+
+Both arms then receive byte-identical PDFs. The native arm sends the
+PDF directly to OpenRouter; SurfSense ingests via its own vision
+pipeline and the runner queries with ``mentioned_document_ids=[...]``
+to scope retrieval to the question's PDF.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import zipfile
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.pdf import PdfImage, render_pdf_with_images
+from ....core.registry import RunContext
+from .prompt import format_options
+
+logger = logging.getLogger(__name__)
+
+
+HF_REPO_ID = "TsinghuaC3I/MedXpertQA"
+HF_REPO_TYPE = "dataset"
+
+
+def _hf_hub_download(*args, **kwargs):
+    from huggingface_hub import hf_hub_download
+
+    return hf_hub_download(*args, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Question shape
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MedXpertQuestion:
+    qid: str                         # e.g. "MM-26"
+    question: str                    # full question text (case + ask)
+    options: dict[str, str]          # A-E
+    label: str                       # "A".."E"
+    image_files: list[str]           # filenames inside images.zip
+    medical_task: str
+    body_system: str
+    question_type: str
+    split: str                       # "test" or "dev"
+
+
+def _load_jsonl(path: Path, *, split: str) -> list[MedXpertQuestion]:
+    out: list[MedXpertQuestion] = []
+    with path.open("r", encoding="utf-8") as fh:
+        for raw_line in fh:
+            line = raw_line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            qid = str(row.get("id") or "").strip()
+            question = str(row.get("question") or "").strip()
+            options = row.get("options") or {}
+            label = str(row.get("label") or "").strip().upper()
+            if not qid or not question or not isinstance(options, dict) or not label:
+                continue
+            opts = {str(k).strip().upper(): str(v).strip() for k, v in options.items()}
+            images = row.get("images") or []
+            if not isinstance(images, list):
+                images = []
+            out.append(MedXpertQuestion(
+                qid=qid,
+                question=question,
+                options=opts,
+                label=label,
+                image_files=[str(x).strip() for x in images if str(x).strip()],
+                medical_task=str(row.get("medical_task") or "").strip(),
+                body_system=str(row.get("body_system") or "").strip(),
+                question_type=str(row.get("question_type") or "").strip(),
+                split=split,
+            ))
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Image archive helpers
+# ---------------------------------------------------------------------------
+
+
+def _ensure_images_extracted(images_zip: Path, images_dir: Path) -> None:
+    """Extract images.zip once, tolerantly handle re-runs."""
+
+    marker = images_dir / ".extracted_ok"
+    if marker.exists():
+        return
+    images_dir.mkdir(parents=True, exist_ok=True)
+    logger.info("Extracting MedXpertQA images.zip -> %s", images_dir)
+    with zipfile.ZipFile(images_zip) as zf:
+        zf.extractall(images_dir)
+    marker.write_text("ok\n", encoding="utf-8")
+
+
+def _resolve_image_path(image_filename: str, images_dir: Path) -> Path | None:
+    """Find a question's image in the (possibly nested) extract directory.
+
+    The zip layout sometimes nests under ``images/`` and sometimes
+    flat — handle both.
+    """
+
+    direct = images_dir / image_filename
+    if direct.exists():
+        return direct
+    nested = images_dir / "images" / image_filename
+    if nested.exists():
+        return nested
+    # Last-ditch: glob recursively (slow but correct for unusual layouts).
+    matches = list(images_dir.rglob(image_filename))
+    return matches[0] if matches else None
+
+
+# ---------------------------------------------------------------------------
+# PDF rendering
+# ---------------------------------------------------------------------------
+
+
+def _render_question_pdf(
+    q: MedXpertQuestion,
+    *,
+    images_dir: Path,
+    pdfs_dir: Path,
+) -> tuple[Path, list[str]]:
+    """Render one MedXpertQA question into a PDF.
+
+    Layout:
+      Title:    MedXpertQA — <qid>  (medical_task / body_system)
+      Section 1 (case):       full question text
+      Section 1 images:       each image flowable + caption
+      Section 2 (options):    A) ... B) ... C) ... D) ... E) ...
+
+    Returns (pdf_path, missing_images) so the caller can warn on
+    questions where some image files weren't found.
+    """
+
+    out_path = pdfs_dir / f"{q.qid}.pdf"
+    images: list[PdfImage] = []
+    missing: list[str] = []
+    for fname in q.image_files:
+        resolved = _resolve_image_path(fname, images_dir)
+        if resolved is None:
+            missing.append(fname)
+            continue
+        images.append(PdfImage(path=resolved, caption=f"Image: {fname}", max_width_in=5.5))
+
+    title_meta_parts = []
+    if q.medical_task:
+        title_meta_parts.append(q.medical_task)
+    if q.body_system:
+        title_meta_parts.append(q.body_system)
+    if q.question_type:
+        title_meta_parts.append(q.question_type)
+    title_suffix = f" ({' / '.join(title_meta_parts)})" if title_meta_parts else ""
+
+    sections = [
+        ("Clinical case", q.question, images),
+        ("Answer choices", format_options(q.options), None),
+    ]
+    render_pdf_with_images(
+        title=f"MedXpertQA-MM {q.qid}{title_suffix}",
+        sections=sections,
+        output_path=out_path,
+    )
+    return out_path, missing
+
+
+# ---------------------------------------------------------------------------
+# Upload helper
+# ---------------------------------------------------------------------------
+
+
+async def _upload_pdfs(
+    ctx: RunContext,
+    pdf_paths: Iterable[Path],
+    *,
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    pdf_list = list(pdf_paths)
+    for batch_start in range(0, len(pdf_list), batch_size):
+        batch = pdf_list[batch_start:batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if all_ids:
+            await docs_client.wait_until_ready(
+                search_space_id=ctx.search_space_id,
+                document_ids=result.document_ids,
+                timeout_s=1800.0,
+            )
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                name_to_id[s.title] = s.document_id
+        logger.info(
+            "Uploaded MedXpertQA batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    split: str = "test",
+    max_questions: int | None = None,
+    upload_batch_size: int = 8,
+    skip_upload: bool = False,
+    include_dev: bool = False,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest MedXpertQA-MM into the medical suite.
+
+    Parameters
+    ----------
+    split : 'test' (default), 'dev', or 'both'
+        Which subset to render + upload.
+    max_questions : int | None
+        Cap on number of questions ingested (handy for fast iteration).
+    upload_batch_size : int
+        PDFs per ``fileupload`` call.
+    skip_upload : bool
+        Render PDFs locally but don't push to SurfSense.
+    include_dev : bool
+        Convenience: equivalent to ``split='both'``.
+    """
+
+    settings = settings or IngestSettings(use_vision_llm=True, processing_mode="basic")
+    bench_dir = ctx.benchmark_data_dir()
+    images_zip_local = bench_dir / "images.zip"
+    images_dir = bench_dir / "images"
+    pdfs_dir = bench_dir / "pdfs"
+    pdfs_dir.mkdir(parents=True, exist_ok=True)
+    hf_cache = bench_dir / ".hf_cache"
+    hf_cache.mkdir(parents=True, exist_ok=True)
+
+    # Step 1: download jsonl(s)
+    splits_to_load: list[str] = []
+    if split == "both" or include_dev:
+        splits_to_load = ["dev", "test"]
+    elif split in {"dev", "test"}:
+        splits_to_load = [split]
+    else:
+        raise ValueError(f"Unknown split {split!r}; use 'test' / 'dev' / 'both'")
+
+    questions: list[MedXpertQuestion] = []
+    for sp in splits_to_load:
+        rel = f"MM/{sp}.jsonl"
+        local = _hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename=rel,
+            repo_type=HF_REPO_TYPE,
+            cache_dir=str(hf_cache),
+        )
+        loaded = _load_jsonl(Path(local), split=sp)
+        questions.extend(loaded)
+        logger.info("Loaded %d MedXpertQA-MM questions from %s split", len(loaded), sp)
+
+    if max_questions is not None and max_questions > 0:
+        questions = questions[:max_questions]
+    if not questions:
+        raise RuntimeError("No MedXpertQA-MM questions loaded; check the split argument.")
+
+    # Step 2: download images.zip + extract once
+    if not images_zip_local.exists():
+        local_zip = _hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename="images.zip",
+            repo_type=HF_REPO_TYPE,
+            cache_dir=str(hf_cache),
+        )
+        # Materialise into bench_dir so the path is stable.
+        try:
+            from os import link as _link
+            _link(local_zip, images_zip_local)
+        except OSError:
+            from shutil import copy2
+            copy2(local_zip, images_zip_local)
+    _ensure_images_extracted(images_zip_local, images_dir)
+
+    # Step 3: render PDFs
+    pdf_paths: dict[str, Path] = {}
+    missing_image_count = 0
+    for i, q in enumerate(questions, start=1):
+        try:
+            pdf, missing = _render_question_pdf(q, images_dir=images_dir, pdfs_dir=pdfs_dir)
+            pdf_paths[q.qid] = pdf
+            if missing:
+                missing_image_count += len(missing)
+                logger.debug("qid=%s missing %d images: %s", q.qid, len(missing), missing)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Failed to render MedXpertQA PDF for %s: %s", q.qid, exc)
+        if i % 50 == 0:
+            logger.info("  ... rendered %d / %d PDFs", i, len(questions))
+    if missing_image_count:
+        logger.warning(
+            "MedXpertQA: %d image references could not be resolved on disk "
+            "(rendered PDFs may be missing some images).",
+            missing_image_count,
+        )
+
+    # Step 4: upload
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("MedXpertQA: --skip-upload set; skipping SurfSense ingestion")
+    else:
+        logger.info("MedXpertQA upload settings: %s", settings.render_label())
+        name_to_id = await _upload_pdfs(
+            ctx,
+            pdf_paths.values(),
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    # Step 5: persist manifest + questions
+    questions_jsonl = bench_dir / "questions.jsonl"
+    with questions_jsonl.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps({
+                "qid": q.qid,
+                "question": q.question,
+                "options": q.options,
+                "label": q.label,
+                "image_files": q.image_files,
+                "medical_task": q.medical_task,
+                "body_system": q.body_system,
+                "question_type": q.question_type,
+                "split": q.split,
+            }) + "\n")
+    logger.info("Wrote %d MedXpertQA questions to %s", len(questions), questions_jsonl)
+
+    map_path = ctx.maps_dir() / "medxpertqa_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        # Header line records the resolved ingest settings
+        # (see core/ingest_settings.py).
+        fh.write(settings_header_line(settings) + "\n")
+        for q in questions:
+            local = pdf_paths.get(q.qid)
+            if local is None:
+                continue
+            fh.write(json.dumps({
+                "qid": q.qid,
+                "document_id": name_to_id.get(local.name),
+                "pdf_path": str(local),
+                "n_images": len(q.image_files),
+                "split": q.split,
+            }) + "\n")
+    logger.info("Wrote MedXpertQA doc map to %s", map_path)
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["medxpertqa"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+
+__all__ = ["MedXpertQuestion", "run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/prompt.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/prompt.py
new file mode 100644
index 0000000000..5c4a69916e
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/prompt.py
@@ -0,0 +1,54 @@
+"""MedXpertQA-MM prompt.
+
+Mirrors the upstream paper's evaluation prompt (Zuo et al., ICML 2025
+§3.4): present case + 5 options A-E, ask for a single letter answer.
+We also instruct the model to use the embedded images explicitly,
+since the whole point of the MM subset is that the answer depends on
+visual evidence (radiology / dermoscopy / pathology / ECG, etc.).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+ANSWER_LETTERS = ("A", "B", "C", "D", "E")
+
+
+_PROMPT = """\
+You are a board-certified physician. The following exam question
+includes a clinical case and one or more medical images (radiology,
+dermatology, pathology, ECG, etc.). Use BOTH the text and the images
+to choose the best answer. Do not rely on memorisation of the case;
+read the images carefully — they often determine the correct answer.
+
+Case + question:
+{question}
+
+Answer choices:
+{options_block}
+
+Respond on a single line in the format `Answer: X` where X is one of
+A, B, C, D, or E.
+"""
+
+
+def format_options(options: Mapping[str, str]) -> str:
+    """Render the ``A) ... E) ...`` options block."""
+
+    parts: list[str] = []
+    for letter in ANSWER_LETTERS:
+        text = options.get(letter)
+        if text is None or str(text).strip() == "":
+            continue
+        parts.append(f"{letter}) {str(text).strip()}")
+    return "\n".join(parts)
+
+
+def build_prompt(question: str, options: Mapping[str, str]) -> str:
+    return _PROMPT.format(
+        question=question.strip(),
+        options_block=format_options(options),
+    )
+
+
+__all__ = ["ANSWER_LETTERS", "build_prompt", "format_options"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py
new file mode 100644
index 0000000000..75646ef32d
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py
@@ -0,0 +1,681 @@
+"""MedXpertQA-MM runner — Native PDF (vision) vs SurfSense (vision RAG).
+
+Headline benchmark for the medical suite.
+
+* Native arm reads the rendered PDF (case + images + options) via
+  OpenRouter ``chat/completions`` + the file-parser plugin.
+* SurfSense arm queries ``POST /api/v1/new_chat`` scoped via
+  ``mentioned_document_ids=[doc_id]`` to the same per-question PDF.
+
+Operational notes:
+
+* PDFs contain real images (radiology, dermoscopy, pathology, ECGs).
+  Operator must pin a vision-capable model via
+  ``setup --provider-model anthropic/claude-sonnet-4.5`` (or similar);
+  the runner emits a warning if a known text-only slug is pinned.
+* MedXpertQA tags ``medical_task`` (Diagnosis / Treatment / Basic
+  Medicine) and ``body_system`` (Cardiovascular / Lymphatic / …)
+  directly on every row; we slice the report by both.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, NativePdfArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from ....core.registry import (
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+from ....core.scenarios import format_scenario_md
+from .prompt import ANSWER_LETTERS, build_prompt
+
+logger = logging.getLogger(__name__)
+
+
+_TEXT_ONLY_HINTS = ("gpt-5.4-mini", "gpt-3.5", "text-only", "instruct-")
+
+
+@dataclass
+class MXQuestion:
+    qid: str
+    question: str
+    options: dict[str, str]
+    label: str
+    medical_task: str
+    body_system: str
+    question_type: str
+    split: str
+    n_images: int
+    pdf_path: Path
+    document_id: int | None
+
+
+def _load_doc_map(map_path: Path) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]:
+    """Read the doc map JSONL.
+
+    Returns ``(rows, settings)`` where ``settings`` is the
+    ``__settings__`` header blob (or ``{}`` for legacy maps).
+    """
+
+    rows: dict[str, dict[str, Any]] = {}
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows[str(row["qid"])] = row
+    return rows, settings
+
+
+def _load_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    split_filter: str | None,
+    task_filter: str | None,
+    body_filter: str | None,
+    require_images: bool,
+    sample_n: int | None,
+) -> list[MXQuestion]:
+    out: list[MXQuestion] = []
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            qid = str(row.get("qid") or "").strip()
+            if not qid:
+                continue
+            if split_filter and split_filter != "all" and row.get("split") != split_filter:
+                continue
+            if task_filter and task_filter != "all" and row.get("medical_task") != task_filter:
+                continue
+            if body_filter and body_filter != "all" and row.get("body_system") != body_filter:
+                continue
+            map_row = doc_map.get(qid)
+            if map_row is None:
+                logger.debug("No doc-map entry for %s; skipping", qid)
+                continue
+            n_images = int(map_row.get("n_images", 0))
+            if require_images and n_images <= 0:
+                continue
+            out.append(MXQuestion(
+                qid=qid,
+                question=str(row.get("question") or ""),
+                options={str(k).upper(): str(v) for k, v in (row.get("options") or {}).items()},
+                label=str(row.get("label") or "").strip().upper(),
+                medical_task=str(row.get("medical_task") or "").strip(),
+                body_system=str(row.get("body_system") or "").strip(),
+                question_type=str(row.get("question_type") or "").strip(),
+                split=str(row.get("split") or ""),
+                n_images=n_images,
+                pdf_path=Path(map_row["pdf_path"]),
+                document_id=map_row.get("document_id"),
+            ))
+    out.sort(key=lambda q: (q.split, q.qid))
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+_DESCRIPTION = (
+    "MedXpertQA-MM (~2,000 multimodal medical exam questions, 5 options, with images) — "
+    "Native PDF (vision) vs SurfSense (vision RAG) head-to-head."
+)
+
+# MedXpertQA-MM PDFs embed clinical images; vision LLM at ingest is
+# the whole point. Operators can flip ``--no-vision-llm`` to measure
+# how much we degrade without it (likely material).
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=True,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class MedXpertQAMMBenchmark:
+    """Multimodal medical exam head-to-head."""
+
+    suite: str = "medical"
+    name: str = "medxpertqa"
+    headline: bool = True  # The medical suite headline.
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--split", default="test", choices=["test", "dev", "all"],
+            help="Which MedXpertQA-MM split to run (default: test).",
+        )
+        parser.add_argument(
+            "--task", default="all",
+            help="Filter by medical_task value (e.g. Diagnosis, Treatment, Basic Medicine).",
+        )
+        parser.add_argument(
+            "--body-system", dest="body_filter", default="all",
+            help="Filter by body_system value (e.g. Cardiovascular, Lymphatic).",
+        )
+        parser.add_argument(
+            "--require-images", dest="require_images", action="store_true",
+            help="Skip rare MM rows that ended up with zero resolvable images.",
+        )
+        parser.add_argument("--n", dest="sample_n", type=int, default=None,
+                            help="Run only the first N questions after filters apply.")
+        parser.add_argument("--concurrency", type=int, default=4,
+                            help="Parallel question workers per arm.")
+        parser.add_argument("--no-mentions", dest="no_mentions", action="store_true",
+                            help="SurfSense arm: skip mentioned_document_ids (unscoped retrieval).")
+        parser.add_argument(
+            "--pdf-engine", default="native",
+            choices=[e.value for e in PdfEngine],
+            help="OpenRouter file-parser engine for the native arm.",
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for both arms.",
+        )
+        # Ingest-only knobs (forwarded by the CLI to ingest.run_ingest).
+        parser.add_argument(
+            "--max-questions", dest="max_questions", type=int, default=None,
+            help="(ingest only) cap on number of MM questions to render + upload.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=8,
+            help="(ingest only) PDFs per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) render PDFs locally but don't push to SurfSense.",
+        )
+        parser.add_argument(
+            "--include-dev", dest="include_dev", action="store_true",
+            help="(ingest only) shorthand for --split all.",
+        )
+        # Per-upload knobs forwarded to /documents/fileupload at ingest;
+        # ignored at run-time (runner reads the resolved settings out of
+        # the doc-map manifest header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            split=opts.get("split") or "test",
+            max_questions=opts.get("max_questions"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 8),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            include_dev=bool(opts.get("include_dev", False)),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        split_filter = opts.get("split") or "test"
+        task_filter = opts.get("task") or "all"
+        body_filter = opts.get("body_filter") or "all"
+        require_images = bool(opts.get("require_images"))
+        sample_n = opts.get("sample_n")
+        concurrency = int(opts.get("concurrency") or 4)
+        no_mentions = bool(opts.get("no_mentions"))
+        pdf_engine_name = opts.get("pdf_engine") or "native"
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+
+        bench_dir = ctx.benchmark_data_dir()
+        questions_jsonl = bench_dir / "questions.jsonl"
+        map_path = ctx.maps_dir() / "medxpertqa_doc_map.jsonl"
+        if not questions_jsonl.exists() or not map_path.exists():
+            raise RuntimeError(
+                "MedXpertQA-MM not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest medical medxpertqa` first."
+            )
+
+        doc_map, ingest_settings = _load_doc_map(map_path)
+        questions = _load_questions(
+            questions_jsonl, doc_map,
+            split_filter=split_filter,
+            task_filter=task_filter if task_filter != "all" else None,
+            body_filter=body_filter if body_filter != "all" else None,
+            require_images=require_images,
+            sample_n=sample_n,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No MedXpertQA-MM questions matched the filters; broaden --split/--task/--body-system/--n."
+            )
+        logger.info("MedXpertQA-MM: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError("OPENROUTER_API_KEY env var is required for the native arm.")
+
+        # Native arm slug differs from SurfSense slug only in cost-arbitrage
+        # scenario; otherwise both arms answer with provider_model.
+        native_arm_model = ctx.native_arm_model
+        if any(hint in native_arm_model.lower() for hint in _TEXT_ONLY_HINTS):
+            if ctx.scenario == "symmetric-cheap":
+                logger.info(
+                    "symmetric-cheap: native arm pinned to text-only %r as "
+                    "intended; expect it to lose on image-bearing questions "
+                    "(SurfSense answers from vision-extracted chunks).",
+                    native_arm_model,
+                )
+            else:
+                logger.warning(
+                    "Native arm slug %r looks text-only; image content in "
+                    "MedXpertQA PDFs will be ignored. Re-pin via "
+                    "`setup --provider-model anthropic/claude-sonnet-4.5` "
+                    "(or pass --native-arm-model and --scenario cost-arbitrage "
+                    "to make this asymmetry explicit).",
+                    native_arm_model,
+                )
+
+        provider = OpenRouterPdfProvider(
+            api_key=api_key,
+            base_url=ctx.config.openrouter_base_url,
+            model=native_arm_model,
+            engine=PdfEngine(pdf_engine_name),
+        )
+        native_arm = NativePdfArm(provider=provider, max_output_tokens=max_output_tokens)
+        surf_arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _native_one(q: MXQuestion) -> ArmResult:
+            return await native_arm.answer(_make_native_request(q, max_output_tokens))
+
+        async def _surf_one(q: MXQuestion) -> ArmResult:
+            return await surf_arm.answer(_make_surfsense_request(q, no_mentions=no_mentions))
+
+        native_results, surf_results = await asyncio.gather(
+            _gather_with_limit((_native_one(q) for q in questions), concurrency=concurrency),
+            _gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency),
+        )
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, n_res, s_res in zip(questions, native_results, surf_results, strict=False):
+                meta = {
+                    "qid": q.qid,
+                    "split": q.split,
+                    "medical_task": q.medical_task,
+                    "body_system": q.body_system,
+                    "question_type": q.question_type,
+                    "n_images": q.n_images,
+                    "correct": q.label,
+                    "document_id": q.document_id,
+                }
+                fh.write(json.dumps({**meta, **n_res.to_jsonl()}) + "\n")
+                fh.write(json.dumps({**meta, **s_res.to_jsonl()}) + "\n")
+
+        metrics = _compute_metrics(questions, native_results, surf_results)
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "split_filter": split_filter,
+                "task_filter": task_filter,
+                "body_filter": body_filter,
+                "require_images": require_images,
+                "no_mentions": no_mentions,
+                "pdf_engine": pdf_engine_name,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="MedXpertQA-MM — Native PDF (vision) vs SurfSense (vision RAG)",
+                headline=False,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        native = m.get("native", {})
+        surf = m.get("surfsense", {})
+        delta = m.get("delta", {})
+        per_task = m.get("per_task", {})
+        per_body = m.get("per_body_system", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(split: `{extra.get('split_filter', 'test')}`, "
+            f"task: `{extra.get('task_filter', 'all')}`, "
+            f"body: `{extra.get('body_filter', 'all')}`, "
+            f"engine: `{extra.get('pdf_engine', 'native')}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        body_lines.append(
+            "- Native arm (OpenRouter `chat/completions` + file plugin, "
+            f"`{extra.get('native_arm_model') or extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(native, indent="  "))
+        body_lines.append(
+            "- SurfSense arm (`POST /api/v1/new_chat`, vision RAG over chunks, "
+            f"`{extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(surf, indent="  "))
+        body_lines.append("- Delta (paired):")
+        body_lines.append(
+            f"  - Accuracy: SurfSense {_pp(delta.get('accuracy_pp'))} pp "
+            f"(McNemar p={_fmt(delta.get('mcnemar_p_value'), 4)}, "
+            f"method={delta.get('mcnemar_method')})"
+        )
+        body_lines.append(
+            f"  - Bootstrap 95% CI on delta: "
+            f"[{_pp(delta.get('bootstrap_ci_low'))}pp, {_pp(delta.get('bootstrap_ci_high'))}pp]"
+        )
+        body_lines.append(
+            f"  - Cost / question: native ${_dollars(native.get('cost_micros_mean'))}, "
+            f"surfsense ${_dollars(surf.get('cost_micros_mean'))} "
+            f"(SurfSense delta {_pct_change(delta.get('cost_micros_pct'))})"
+        )
+        body_lines.append(
+            f"  - Latency p50: native {_ms_to_s(native.get('latency_ms_median'))}, "
+            f"surfsense {_ms_to_s(surf.get('latency_ms_median'))} "
+            f"(SurfSense delta {_pct_change(delta.get('latency_ms_pct'))})"
+        )
+        if per_task:
+            body_lines.append("- Per-medical_task split:")
+            for task_name, vals in sorted(per_task.items()):
+                body_lines.append(
+                    f"  - {task_name}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')})"
+                )
+        if per_body:
+            body_lines.append("- Per-body_system split (top 5 by sample size):")
+            top = sorted(per_body.items(), key=lambda kv: -kv[1].get("n", 0))[:5]
+            for body_name, vals in top:
+                body_lines.append(
+                    f"  - {body_name}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')})"
+                )
+
+        return ReportSection(
+            title="MedXpertQA-MM — Native PDF (vision) vs SurfSense (vision RAG)",
+            headline=False,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_native_request(q: MXQuestion, max_tokens: int) -> ArmRequest:
+    prompt = build_prompt(q.question, q.options)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        pdf_paths=[q.pdf_path],
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: MXQuestion, *, no_mentions: bool) -> ArmRequest:
+    prompt = build_prompt(q.question, q.options)
+    mentions: list[int] | None = None
+    if not no_mentions and q.document_id is not None:
+        mentions = [int(q.document_id)]
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        mentioned_document_ids=mentions,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Metrics
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[MXQuestion],
+    native_results: list[ArmResult],
+    surf_results: list[ArmResult],
+) -> dict[str, Any]:
+    native_correct: list[bool] = []
+    surf_correct: list[bool] = []
+    for q, n_res, s_res in zip(questions, native_results, surf_results, strict=False):
+        gold = q.label
+        n_ok = (n_res.answer_letter or "").upper() == gold and gold in ANSWER_LETTERS
+        s_ok = (s_res.answer_letter or "").upper() == gold and gold in ANSWER_LETTERS
+        native_correct.append(n_ok)
+        surf_correct.append(s_ok)
+
+    native_costs = [float(r.cost_micros) for r in native_results]
+    surf_costs = [float(r.cost_micros) for r in surf_results]
+    native_lats = [float(r.latency_ms) for r in native_results]
+    surf_lats = [float(r.latency_ms) for r in surf_results]
+    native_in = [float(r.input_tokens) for r in native_results]
+    native_out = [float(r.output_tokens) for r in native_results]
+
+    native_acc = accuracy_with_wilson_ci(sum(native_correct), len(native_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+    mc = mcnemar_test(native_correct, surf_correct)
+    boot = bootstrap_delta_ci(native_correct, surf_correct, n_resamples=2000)
+
+    native_cost_agg = paired_aggregate(native_costs)
+    surf_cost_agg = paired_aggregate(surf_costs)
+    native_lat_agg = paired_aggregate(native_lats)
+    surf_lat_agg = paired_aggregate(surf_lats)
+
+    cost_pct = _safe_pct(surf_cost_agg.mean, native_cost_agg.mean)
+    lat_pct = _safe_pct(surf_lat_agg.median, native_lat_agg.median)
+
+    per_task = _per_field(questions, native_correct, surf_correct, key=lambda q: q.medical_task or "unknown")
+    per_body = _per_field(questions, native_correct, surf_correct, key=lambda q: q.body_system or "unknown")
+
+    return {
+        "native": {
+            **native_acc.to_dict(),
+            "cost_micros_mean": native_cost_agg.mean,
+            "cost_micros_median": native_cost_agg.median,
+            "latency_ms_mean": native_lat_agg.mean,
+            "latency_ms_median": native_lat_agg.median,
+            "latency_ms_p95": native_lat_agg.p95,
+            "input_tokens_mean": (sum(native_in) / len(native_in)) if native_in else 0.0,
+            "output_tokens_mean": (sum(native_out) / len(native_out)) if native_out else 0.0,
+        },
+        "surfsense": {
+            **surf_acc.to_dict(),
+            "cost_micros_mean": surf_cost_agg.mean,
+            "cost_micros_median": surf_cost_agg.median,
+            "latency_ms_mean": surf_lat_agg.mean,
+            "latency_ms_median": surf_lat_agg.median,
+            "latency_ms_p95": surf_lat_agg.p95,
+        },
+        "delta": {
+            "accuracy_pp": 100.0 * (surf_acc.accuracy - native_acc.accuracy),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_native_only": mc.b,
+            "mcnemar_c_surfsense_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+            "cost_micros_pct": cost_pct,
+            "latency_ms_pct": lat_pct,
+        },
+        "per_task": per_task,
+        "per_body_system": per_body,
+    }
+
+
+def _per_field(
+    questions: list[MXQuestion],
+    native_correct: list[bool],
+    surf_correct: list[bool],
+    *,
+    key,
+) -> dict[str, dict[str, Any]]:
+    bucket: dict[str, list[tuple[bool, bool]]] = {}
+    for q, n_ok, s_ok in zip(questions, native_correct, surf_correct, strict=False):
+        bucket.setdefault(key(q), []).append((n_ok, s_ok))
+    out: dict[str, dict[str, Any]] = {}
+    for k, pairs in bucket.items():
+        n_correct = [a for a, _ in pairs]
+        s_correct = [b for _, b in pairs]
+        out[k] = {
+            "n": len(pairs),
+            "native_accuracy": (sum(n_correct) / len(pairs)) if pairs else 0.0,
+            "surfsense_accuracy": (sum(s_correct) / len(pairs)) if pairs else 0.0,
+            "delta_accuracy_pp": (
+                100.0 * (sum(s_correct) - sum(n_correct)) / len(pairs)
+                if pairs else 0.0
+            ),
+        }
+    return out
+
+
+def _safe_pct(numerator: float, denominator: float) -> float | None:
+    if denominator == 0:
+        return None
+    return 100.0 * (numerator - denominator) / denominator
+
+
+# ---------------------------------------------------------------------------
+# Formatters
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if "input_tokens_mean" in d:
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct_change(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.0f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+__all__ = ["MedXpertQAMMBenchmark", "MXQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/__init__.py
new file mode 100644
index 0000000000..e527b37f42
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/__init__.py
@@ -0,0 +1,17 @@
+"""MIRAGE — secondary single-arm SurfSense MCQ measurement.
+
+Source: https://github.com/Teddy-XiongGZ/MIRAGE, paper
+https://aclanthology.org/2024.findings-acl.372/. 7,663 questions
+across MMLU-Med, MedQA-US, MedMCQA, PubMedQA*, BioASQ-Y/N.
+
+This is a SurfSense-only measurement (not a head-to-head); native
+PDF-in-LLM doesn't apply because there is no per-question discrete
+document — the corpus is millions of biomedical snippets.
+"""
+
+from __future__ import annotations
+
+from .runner import MirageBenchmark
+from ....core import registry as _registry
+
+_registry.register(MirageBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py
new file mode 100644
index 0000000000..9769d078b9
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py
@@ -0,0 +1,548 @@
+"""MIRAGE ingestion.
+
+Downloads:
+
+* ``benchmark.json`` (≈ 4 MB; questions for the 5 sub-tasks).
+* ``retrieved_snippets_10k.zip`` (the union of top-10k snippet ids
+  retrieved by every retriever in the MedRAG paper, per task — a
+  recall ceiling that avoids needing the full 23.9M-doc PubMed mirror).
+
+Snippet *content* lives in the MedRAG HF mirrors
+(``MedRAG/textbooks``, ``MedRAG/pubmed``, ``MedRAG/statpearls``,
+``MedRAG/wikipedia``). We default to ``MedRAG/textbooks`` (212 MB,
+125k snippets) which is the smallest and covers the majority of
+``MedQA-US`` and the medical examination subsets. Operators can
+opt into larger corpora with ``--corpus``.
+
+Each snippet is written as one markdown file then batched into
+``~5 MB`` markdown bundles for SurfSense's file upload (smaller
+than backend default ``MAX_FILE_SIZE_BYTES`` and avoids the per-call
+overhead of one HTTP request per snippet).
+
+The ingestion produces two maps under ``data/medical/maps/``:
+
+* ``mirage_snippet_map.jsonl`` — ``{snippet_id, document_id, batch_path}``
+* ``mirage_chunk_map.jsonl`` — ``{chunk_id, document_id, snippet_id?}``
+  (best-effort; chunk text is heuristically attributed to the
+  snippet it overlaps when the SurfSense chunker splits a batched
+  markdown).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import io
+import json
+import logging
+import zipfile
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+import httpx
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+MIRAGE_BENCHMARK_URL = (
+    "https://raw.githubusercontent.com/Teddy-XiongGZ/MIRAGE/main/benchmark.json"
+)
+# Upstream only ships ONE zip — top-10k retrievals across 5 retrievers,
+# ~16 GB. We default to skipping it (see `--skip-snippet-filter`) and
+# ingesting the chosen corpus in full; this URL is only fetched when
+# the operator explicitly opts in.
+MIRAGE_SNIPPETS_ZIP_URL = (
+    "https://virginia.box.com/shared/static/cxq17th6eisl2pn04vp0x723zczlvlzc.zip"
+)
+
+
+_DEFAULT_CORPUS = "MedRAG/textbooks"
+_BATCH_SIZE_BYTES = 5 * 1024 * 1024
+# 2 GB safety cap. Anything larger requires --allow-large-download.
+# Set high enough that ``benchmark.json`` and small zips pass through
+# untouched but the 16 GB MIRAGE retrievals zip trips the guard.
+_LARGE_DOWNLOAD_BYTES = 2 * 1024 * 1024 * 1024
+_DOWNLOAD_RETRIES = 5
+_RETRYABLE_NET_EXC: tuple[type[BaseException], ...] = (
+    httpx.RemoteProtocolError,
+    httpx.ReadError,
+    httpx.ReadTimeout,
+    httpx.ConnectError,
+    httpx.ConnectTimeout,
+)
+
+
+@dataclass
+class SnippetRow:
+    snippet_id: str
+    title: str
+    content: str
+
+    def to_markdown(self) -> str:
+        title = (self.title or "").strip() or "Untitled"
+        body = (self.content or "").strip()
+        return f"# {title}\n\n_id: `{self.snippet_id}`_\n\n{body}\n"
+
+
+# ---------------------------------------------------------------------------
+# Download helpers
+# ---------------------------------------------------------------------------
+
+
+async def _fetch_to_path(
+    url: str,
+    *,
+    dest: Path,
+    label: str,
+    timeout_s: float = 600.0,
+    allow_large_download: bool = False,
+    expect_zip: bool = False,
+) -> Path:
+    """Download ``url`` to ``dest`` with retry, atomic-rename, and
+    HTTP ``Range`` resume.
+
+    Operational properties:
+
+    * If ``dest`` already exists *and* (when ``expect_zip`` is True) the
+      cached file is a valid ZIP, returns it immediately. A corrupt ZIP
+      is removed and re-downloaded — this is the safety net for the
+      `box.com truncated 16 GB zip` failure mode where the previous
+      run wrote a half-completed file then exited with an exception.
+    * Bytes are written to ``<dest>.partial`` and renamed only after the
+      stream completes cleanly (and, for zips, only after a quick
+      central-directory check). A failure mid-download leaves the
+      ``.partial`` file in place so the next attempt can resume from
+      where it stopped via an HTTP ``Range`` header.
+    * Retries on transient network errors (``RemoteProtocolError``,
+      ``ReadError``, ``ReadTimeout``, ``ConnectError``,
+      ``ConnectTimeout``) with exponential backoff, up to
+      ``_DOWNLOAD_RETRIES``.
+    * Aborts before downloading if the ``Content-Length`` (or already-
+      downloaded ``.partial`` size) is over ``_LARGE_DOWNLOAD_BYTES``
+      and ``allow_large_download`` is False, to keep an operator from
+      surprise-grabbing 16 GB on a slow link.
+    """
+
+    if dest.exists():
+        if expect_zip and not _is_valid_zip(dest):
+            logger.warning(
+                "Cached %s at %s failed ZIP validation (size=%d B); deleting "
+                "and re-downloading.",
+                label,
+                dest,
+                dest.stat().st_size,
+            )
+            dest.unlink(missing_ok=True)
+        else:
+            logger.info("Using cached %s at %s", label, dest)
+            return dest
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    partial = dest.with_suffix(dest.suffix + ".partial")
+    last_exc: BaseException | None = None
+
+    for attempt in range(1, _DOWNLOAD_RETRIES + 1):
+        existing_bytes = partial.stat().st_size if partial.exists() else 0
+        headers: dict[str, str] = {}
+        if existing_bytes:
+            headers["Range"] = f"bytes={existing_bytes}-"
+            logger.info(
+                "Resuming %s from byte %d (attempt %d/%d)",
+                label,
+                existing_bytes,
+                attempt,
+                _DOWNLOAD_RETRIES,
+            )
+        else:
+            logger.info(
+                "Downloading %s from %s (attempt %d/%d)",
+                label,
+                url,
+                attempt,
+                _DOWNLOAD_RETRIES,
+            )
+
+        try:
+            async with httpx.AsyncClient(
+                timeout=httpx.Timeout(timeout_s, connect=20.0),
+                follow_redirects=True,
+            ) as client:
+                async with client.stream("GET", url, headers=headers) as response:
+                    if existing_bytes and response.status_code == 200:
+                        logger.warning(
+                            "Server ignored Range header for %s; restarting from 0.",
+                            label,
+                        )
+                        partial.unlink(missing_ok=True)
+                        existing_bytes = 0
+                    elif response.status_code == 416:
+                        # Range not satisfiable — the .partial is at or
+                        # past the end. Treat as "already downloaded";
+                        # validate by closing and re-opening for atomic
+                        # rename below.
+                        logger.info(
+                            "Server reports %s already complete (HTTP 416).",
+                            label,
+                        )
+                    elif response.status_code not in (200, 206):
+                        response.raise_for_status()
+
+                    total_size = _planned_total_size(response, existing_bytes)
+                    if (
+                        total_size is not None
+                        and total_size > _LARGE_DOWNLOAD_BYTES
+                        and not allow_large_download
+                    ):
+                        raise _LargeDownloadAbort(label, total_size)
+
+                    mode = "ab" if existing_bytes else "wb"
+                    with partial.open(mode) as fh:
+                        async for chunk in response.aiter_bytes(chunk_size=1 << 18):
+                            fh.write(chunk)
+            # Optional content sanity check before promoting to dest.
+            if expect_zip and not _is_valid_zip(partial):
+                raise zipfile.BadZipFile(
+                    f"{label} downloaded to {partial} but failed central-"
+                    "directory check; will retry."
+                )
+            partial.replace(dest)
+            return dest
+        except _LargeDownloadAbort:
+            raise
+        except _RETRYABLE_NET_EXC as exc:
+            last_exc = exc
+            wait = min(60.0, 2.0 ** attempt)
+            logger.warning(
+                "Network error fetching %s (%s: %s); retrying in %.0fs.",
+                label,
+                type(exc).__name__,
+                exc,
+                wait,
+            )
+            await asyncio.sleep(wait)
+        except zipfile.BadZipFile as exc:
+            last_exc = exc
+            # Truncated body — drop the partial and retry from scratch.
+            partial.unlink(missing_ok=True)
+            wait = min(60.0, 2.0 ** attempt)
+            logger.warning(
+                "Truncated ZIP for %s; restarting from byte 0 in %.0fs.",
+                label,
+                wait,
+            )
+            await asyncio.sleep(wait)
+
+    raise RuntimeError(
+        f"Failed to download {label} after {_DOWNLOAD_RETRIES} attempts: {last_exc!s}"
+    )
+
+
+def _planned_total_size(response: httpx.Response, existing_bytes: int) -> int | None:
+    """Best-effort total size including any already-buffered .partial bytes."""
+
+    cl = response.headers.get("Content-Length")
+    if not cl:
+        return None
+    try:
+        remaining = int(cl)
+    except ValueError:
+        return None
+    return existing_bytes + remaining
+
+
+def _is_valid_zip(path: Path) -> bool:
+    """Cheap ZIP validity check via central-directory parse."""
+
+    try:
+        with zipfile.ZipFile(path) as zf:
+            # ``namelist`` forces the central directory to be parsed.
+            zf.namelist()
+        return True
+    except (zipfile.BadZipFile, OSError):
+        return False
+
+
+class _LargeDownloadAbort(RuntimeError):
+    """Raised when a download exceeds the safety threshold without opt-in."""
+
+    def __init__(self, label: str, size_bytes: int) -> None:
+        gb = size_bytes / (1024 ** 3)
+        super().__init__(
+            f"{label} would download ~{gb:.1f} GB, above the {_LARGE_DOWNLOAD_BYTES / (1024 ** 3):.0f} GB safety cap. "
+            "Re-run with `--allow-large-download` to acknowledge, or use "
+            "`--skip-snippet-filter` to bypass this download entirely and "
+            "ingest the full corpus instead."
+        )
+
+
+def _read_snippet_ids(zip_path: Path, *, tasks: list[str]) -> dict[str, set[str]]:
+    """Walk the ZIP for files whose path contains any task name.
+
+    Each MedRAG retriever produces one JSON file per task in the zip;
+    we union all retrievers' top-K ids. The exact directory layout has
+    historically been ``<retriever>/<task>.json`` mapping
+    ``question_id -> [snippet_id, ...]``.
+    """
+
+    out: dict[str, set[str]] = {t: set() for t in tasks}
+    with zipfile.ZipFile(zip_path, "r") as zf:
+        for member in zf.namelist():
+            if not member.lower().endswith(".json"):
+                continue
+            stem = Path(member).stem.lower()
+            for task in tasks:
+                if task.lower() in stem:
+                    try:
+                        with zf.open(member) as fh:
+                            payload = json.loads(fh.read().decode("utf-8"))
+                    except (json.JSONDecodeError, KeyError):
+                        continue
+                    for ids in payload.values():
+                        if isinstance(ids, list):
+                            for sid in ids:
+                                if isinstance(sid, str):
+                                    out[task].add(sid)
+                                elif isinstance(sid, dict) and "id" in sid:
+                                    out[task].add(str(sid["id"]))
+                    break
+    return out
+
+
+def _load_corpus(
+    corpus_name: str, snippet_ids: set[str] | None
+) -> Iterable[SnippetRow]:
+    """Stream rows from a MedRAG HF corpus.
+
+    * ``snippet_ids=None`` → yield every row (full-corpus ingestion path).
+    * ``snippet_ids={...}`` → filter to the requested ids.
+
+    Imported lazily — ``datasets`` is a heavyweight dep.
+    """
+
+    if snippet_ids is not None and not snippet_ids:
+        return iter(())
+    from datasets import load_dataset  # noqa: PLC0415
+
+    logger.info("Loading corpus %s (this may take a while)", corpus_name)
+    ds = load_dataset(corpus_name, split="train", streaming=True)
+    for row in ds:
+        sid = str(row.get("id") or "")
+        if snippet_ids is not None and sid not in snippet_ids:
+            continue
+        yield SnippetRow(
+            snippet_id=sid,
+            title=str(row.get("title") or ""),
+            content=str(row.get("content") or row.get("contents") or ""),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Batching + upload
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class SnippetBatch:
+    path: Path
+    snippet_ids: list[str]
+
+
+def _write_batches(
+    snippets: Iterable[SnippetRow],
+    *,
+    out_dir: Path,
+    batch_bytes: int = _BATCH_SIZE_BYTES,
+    prefix: str = "mirage",
+) -> list[SnippetBatch]:
+    out_dir.mkdir(parents=True, exist_ok=True)
+    batches: list[SnippetBatch] = []
+    current_buffer = io.StringIO()
+    current_ids: list[str] = []
+    current_bytes = 0
+    batch_idx = 0
+
+    def _flush() -> None:
+        nonlocal current_buffer, current_ids, current_bytes, batch_idx
+        if not current_ids:
+            return
+        path = out_dir / f"{prefix}_{batch_idx:04d}.md"
+        path.write_text(current_buffer.getvalue(), encoding="utf-8")
+        batches.append(SnippetBatch(path=path, snippet_ids=current_ids))
+        batch_idx += 1
+        current_buffer = io.StringIO()
+        current_ids = []
+        current_bytes = 0
+
+    for snippet in snippets:
+        chunk = snippet.to_markdown() + "\n---\n\n"
+        chunk_bytes = len(chunk.encode("utf-8"))
+        if current_bytes + chunk_bytes > batch_bytes and current_ids:
+            _flush()
+        current_buffer.write(chunk)
+        current_ids.append(snippet.snippet_id)
+        current_bytes += chunk_bytes
+    _flush()
+    return batches
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    tasks: list[str] | None = None,
+    corpus: str = _DEFAULT_CORPUS,
+    max_snippets_per_task: int | None = None,
+    skip_snippet_filter: bool = True,
+    allow_large_download: bool = False,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest a MedRAG corpus into the suite SearchSpace.
+
+    By default (``skip_snippet_filter=True``) we ingest the **entire**
+    chosen corpus and let SurfSense's own retriever do the work. The
+    upstream MIRAGE retrieval zip is ~16 GB and only useful when you
+    want to pre-filter the corpus to the set of snippets some other
+    retriever surfaced; for ``MedRAG/textbooks`` (212 MB / 125k snippets)
+    that pre-filter is unnecessary overhead and routinely fails to
+    download (box.com truncates the stream). Set
+    ``skip_snippet_filter=False`` (CLI: ``--use-snippet-filter``) only
+    if you specifically want the upstream filter — and budget the
+    16 GB zip transfer.
+    """
+
+    tasks = tasks or ["mmlu", "medqa", "medmcqa", "pubmedqa", "bioasq"]
+    settings = settings or IngestSettings(use_vision_llm=False, processing_mode="basic")
+
+    bench_path = ctx.benchmark_data_dir() / "benchmark.json"
+    await _fetch_to_path(MIRAGE_BENCHMARK_URL, dest=bench_path, label="MIRAGE benchmark.json")
+
+    if skip_snippet_filter:
+        logger.info(
+            "Skipping retrieved_snippets_10k.zip (skip_snippet_filter=True); "
+            "ingesting entire corpus %s.",
+            corpus,
+        )
+        snippets = list(_load_corpus(corpus, snippet_ids=None))
+    else:
+        zip_path = ctx.benchmark_data_dir() / "retrieved_snippets_10k.zip"
+        await _fetch_to_path(
+            MIRAGE_SNIPPETS_ZIP_URL,
+            dest=zip_path,
+            label="MIRAGE retrieved_snippets_10k.zip",
+            allow_large_download=allow_large_download,
+            expect_zip=True,
+        )
+
+        by_task = _read_snippet_ids(zip_path, tasks=tasks)
+        if max_snippets_per_task is not None:
+            by_task = {k: set(list(v)[:max_snippets_per_task]) for k, v in by_task.items()}
+
+        union_ids = set().union(*by_task.values())
+        logger.info(
+            "MIRAGE: tasks=%s, snippet ids per task: %s, union=%d",
+            tasks,
+            {k: len(v) for k, v in by_task.items()},
+            len(union_ids),
+        )
+        if not union_ids:
+            raise RuntimeError(
+                f"No snippet ids parsed for tasks {tasks!r} from {zip_path}. "
+                "Check the zip layout (the upstream archive may have changed)."
+            )
+
+        snippets = list(_load_corpus(corpus, snippet_ids=union_ids))
+        logger.info(
+            "Loaded %d / %d requested snippets from corpus %s",
+            len(snippets),
+            len(union_ids),
+            corpus,
+        )
+    if not snippets:
+        raise RuntimeError(
+            f"Corpus {corpus} returned 0 matching rows. Either the snippet "
+            "ids reference a different corpus (e.g. PubMed) or the HF mirror "
+            "is unavailable. Pass --corpus to override."
+        )
+
+    batches_dir = ctx.benchmark_data_dir() / "batches"
+    batches = _write_batches(snippets, out_dir=batches_dir)
+    logger.info("Wrote %d snippet batches to %s", len(batches), batches_dir)
+
+    docs_client = ctx.documents_client()
+    upload_result = await docs_client.upload(
+        files=[b.path for b in batches],
+        search_space_id=ctx.search_space_id,
+        should_summarize=settings.should_summarize,
+        use_vision_llm=settings.use_vision_llm,
+        processing_mode=settings.processing_mode,
+    )
+    logger.info("MIRAGE upload settings: %s", settings.render_label())
+    new_doc_ids = list(upload_result.document_ids)
+    if new_doc_ids:
+        await docs_client.wait_until_ready(
+            search_space_id=ctx.search_space_id,
+            document_ids=new_doc_ids,
+            timeout_s=3600.0,
+            max_poll_s=15.0,
+        )
+
+    statuses = await docs_client.get_status(
+        search_space_id=ctx.search_space_id,
+        document_ids=new_doc_ids + upload_result.duplicate_document_ids,
+    )
+    title_to_doc = {s.title: s.document_id for s in statuses}
+
+    snippet_map_path = ctx.maps_dir() / "mirage_snippet_map.jsonl"
+    chunk_map_path = ctx.maps_dir() / "mirage_chunk_map.jsonl"
+    with snippet_map_path.open("w", encoding="utf-8") as fh:
+        # Header line records the ingest-time settings (see
+        # core/ingest_settings.py for the protocol).
+        fh.write(settings_header_line(settings) + "\n")
+        for batch in batches:
+            doc_id = title_to_doc.get(batch.path.name)
+            if doc_id is None:
+                logger.warning("No document_id for batch %s", batch.path.name)
+                continue
+            for sid in batch.snippet_ids:
+                fh.write(
+                    json.dumps(
+                        {
+                            "snippet_id": sid,
+                            "document_id": doc_id,
+                            "batch_path": str(batch.path),
+                        }
+                    )
+                    + "\n"
+                )
+
+    # Best-effort chunk map. SurfSense doesn't expose snippet attribution
+    # per chunk, so we just record (chunk_id -> document_id) here; the
+    # MIRAGE runner only needs document_id for accuracy scoring.
+    with chunk_map_path.open("w", encoding="utf-8") as fh:
+        for doc_id in {b.path.name and title_to_doc.get(b.path.name) for b in batches} - {None}:
+            try:
+                chunks = await docs_client.list_chunks(int(doc_id))
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("Failed to list chunks for doc_id=%s: %s", doc_id, exc)
+                continue
+            for chunk in chunks:
+                fh.write(
+                    json.dumps({"chunk_id": chunk.id, "document_id": doc_id})
+                    + "\n"
+                )
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["mirage"] = str(snippet_map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+    logger.info("Wrote MIRAGE maps to %s and %s", snippet_map_path, chunk_map_path)
+
+
+__all__ = ["run_ingest", "SnippetRow", "SnippetBatch"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/prompt.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/prompt.py
new file mode 100644
index 0000000000..9e5b1c618c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/prompt.py
@@ -0,0 +1,44 @@
+"""MedRAG ``{step_by_step_thinking, answer_choice}`` MCQ prompt.
+
+Mirrors the MedRAG paper's prompt format so accuracy numbers are
+comparable to the published MIRAGE leaderboard.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+
+_PROMPT_TEMPLATE = """\
+You are a helpful medical expert. Answer the following multiple-choice
+question using the relevant medical knowledge available to you (and any
+retrieved context, if provided).
+
+Respond with a JSON object on a single line:
+{{"step_by_step_thinking": "<your reasoning>", "answer_choice": "<letter>"}}
+
+Question: {question}
+
+Options:
+{options_block}
+"""
+
+
+def _options_block(options: Mapping[str, str]) -> str:
+    parts: list[str] = []
+    for letter in sorted(options.keys()):
+        text = options.get(letter)
+        if text is None or text == "":
+            continue
+        parts.append(f"{letter}) {text}")
+    return "\n".join(parts)
+
+
+def build_prompt(question: str, options: Mapping[str, str]) -> str:
+    return _PROMPT_TEMPLATE.format(
+        question=question.strip(),
+        options_block=_options_block(options),
+    )
+
+
+__all__ = ["build_prompt"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py
new file mode 100644
index 0000000000..0f336c0d5c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py
@@ -0,0 +1,332 @@
+"""MIRAGE runner: SurfSense-only per-task accuracy.
+
+The benchmark file format is one top-level dict per task (``mmlu``,
+``medqa``, ``medmcqa``, ``pubmedqa``, ``bioasq``); each task value is
+``{question_id: {question, options, answer}}``.
+
+We restrict retrieval to the suite SearchSpace's full corpus (no
+``mentioned_document_ids`` — MIRAGE has no per-question ground-truth
+document; retrieval *is* the test). Accuracy is paired against the
+``answer`` letter from the dataset.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    read_settings_header,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci, macro_accuracy
+from ....core.registry import (
+    Benchmark,
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+from .prompt import build_prompt
+
+logger = logging.getLogger(__name__)
+
+
+_TASKS = ("mmlu", "medqa", "medmcqa", "pubmedqa", "bioasq")
+_DESCRIPTION = "MIRAGE (7,663 medical MCQs) — single-arm SurfSense per-task accuracy."
+
+# MIRAGE corpus is text-only (textbook + abstract markdown). Vision
+# LLM at ingest is wasted compute by default; flip ``--use-vision-llm``
+# to measure cost.
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+@dataclass
+class MirageQuestion:
+    task: str
+    qid: str
+    question: str
+    options: dict[str, str]
+    correct: str
+
+    @property
+    def question_id(self) -> str:
+        return f"{self.task}::{self.qid}"
+
+
+def _load_questions(
+    benchmark: dict[str, Any],
+    *,
+    tasks: list[str],
+    sample_n: int | None,
+) -> list[MirageQuestion]:
+    out: list[MirageQuestion] = []
+    for task in tasks:
+        rows = benchmark.get(task) or {}
+        if not isinstance(rows, dict):
+            continue
+        for qid, raw in rows.items():
+            if not isinstance(raw, dict):
+                continue
+            options = raw.get("options") or {}
+            if not isinstance(options, dict):
+                continue
+            answer_raw = str(raw.get("answer") or "").strip()
+            if not answer_raw:
+                continue
+            answer_letter = answer_raw[:1].upper()
+            out.append(
+                MirageQuestion(
+                    task=task,
+                    qid=str(qid),
+                    question=str(raw.get("question", "")),
+                    options={str(k): str(v) for k, v in options.items() if v},
+                    correct=answer_letter,
+                )
+            )
+    out.sort(key=lambda q: (q.task, q.qid))
+    if sample_n is not None and sample_n > 0:
+        # Stratified-by-task slice so smoke runs cover every task.
+        per_task = max(1, sample_n // max(1, len(tasks)))
+        sliced: list[MirageQuestion] = []
+        per_task_counter: dict[str, int] = {}
+        for q in out:
+            n = per_task_counter.get(q.task, 0)
+            if n >= per_task:
+                continue
+            sliced.append(q)
+            per_task_counter[q.task] = n + 1
+            if len(sliced) >= sample_n:
+                break
+        out = sliced
+    return out
+
+
+async def _gather_with_limit(coros, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(c):
+        async with sem:
+            return await c
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+class MirageBenchmark:
+    suite: str = "medical"
+    name: str = "mirage"
+    headline: bool = False
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--task",
+            default="all",
+            choices=("all", *_TASKS),
+            help="Run a single task or all (default: all).",
+        )
+        parser.add_argument("--n", dest="sample_n", type=int, default=None,
+                            help="Stratified sample size across tasks.")
+        parser.add_argument("--concurrency", type=int, default=4)
+        parser.add_argument(
+            "--corpus", default="MedRAG/textbooks",
+            help="HF MedRAG corpus to ingest from (default: MedRAG/textbooks).",
+        )
+        parser.add_argument(
+            "--max-snippets-per-task", type=int, default=None,
+            help="Cap the per-task ingestion to N snippets (smoke).",
+        )
+        # Mutually exclusive: by default we skip the upstream 16 GB
+        # retrievals zip and ingest the entire corpus. Operators who
+        # want the upstream pre-filter pass --use-snippet-filter (and,
+        # if their corpus mismatch warrants the 16 GB transfer,
+        # --allow-large-download).
+        snippet_group = parser.add_mutually_exclusive_group()
+        snippet_group.add_argument(
+            "--use-snippet-filter", dest="use_snippet_filter", action="store_true",
+            default=False,
+            help="Download retrieved_snippets_10k.zip (~16 GB) and "
+                 "filter the corpus to those ids before ingest. "
+                 "Default: skip and ingest entire corpus.",
+        )
+        snippet_group.add_argument(
+            "--skip-snippet-filter", dest="use_snippet_filter", action="store_false",
+            help="(Default) Skip the 16 GB upstream zip; ingest entire corpus.",
+        )
+        parser.add_argument(
+            "--allow-large-download", action="store_true", default=False,
+            help="Permit downloads larger than 2 GB (e.g. retrieved_snippets_10k.zip).",
+        )
+        # Per-upload knobs; ignored at run-time (runner reads the
+        # resolved settings out of the snippet-map manifest header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            corpus=str(opts.get("corpus") or "MedRAG/textbooks"),
+            max_snippets_per_task=opts.get("max_snippets_per_task"),
+            skip_snippet_filter=not bool(opts.get("use_snippet_filter")),
+            allow_large_download=bool(opts.get("allow_large_download")),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        task_filter = opts.get("task") or "all"
+        tasks = list(_TASKS) if task_filter == "all" else [task_filter]
+        sample_n = opts.get("sample_n")
+        concurrency = int(opts.get("concurrency") or 4)
+
+        bench_path = ctx.benchmark_data_dir() / "benchmark.json"
+        if not bench_path.exists():
+            raise RuntimeError(
+                "MIRAGE benchmark.json missing. Run "
+                "`python -m surfsense_evals ingest medical mirage` first."
+            )
+        benchmark = json.loads(bench_path.read_text(encoding="utf-8"))
+        ingest_settings = read_settings_header(
+            ctx.maps_dir() / "mirage_snippet_map.jsonl"
+        )
+        questions = _load_questions(benchmark, tasks=tasks, sample_n=sample_n)
+        if not questions:
+            raise RuntimeError(
+                f"No MIRAGE questions matched task={task_filter!r} sample_n={sample_n!r}."
+            )
+        logger.info("MIRAGE: scheduled %d questions across tasks %s",
+                    len(questions), tasks)
+
+        arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        async def _ask(q: MirageQuestion) -> ArmResult:
+            request = ArmRequest(
+                question_id=q.question_id,
+                prompt=build_prompt(q.question, q.options),
+            )
+            return await arm.answer(request)
+
+        results: list[ArmResult] = await _gather_with_limit(
+            (_ask(q) for q in questions), concurrency=concurrency
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, res in zip(questions, results):
+                fh.write(
+                    json.dumps(
+                        {
+                            "task": q.task,
+                            "qid": q.qid,
+                            "correct": q.correct,
+                            **res.to_jsonl(),
+                        }
+                    )
+                    + "\n"
+                )
+
+        per_task_acc: dict[str, dict[str, Any]] = {}
+        for task in tasks:
+            n_correct = 0
+            n_total = 0
+            for q, res in zip(questions, results):
+                if q.task != task:
+                    continue
+                n_total += 1
+                if (res.answer_letter or "").upper() == q.correct:
+                    n_correct += 1
+            acc = accuracy_with_wilson_ci(n_correct, n_total)
+            per_task_acc[task] = acc.to_dict()
+
+        macro = macro_accuracy(
+            {t: accuracy_with_wilson_ci(d["n_correct"], d["n_total"]) for t, d in per_task_acc.items()}
+        )
+        metrics = {"per_task": per_task_acc, "macro_accuracy": macro}
+
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "task_filter": task_filter,
+                "concurrency": concurrency,
+                "provider_model": ctx.provider_model,
+                "ingest_settings": ingest_settings,
+            },
+        )
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps(
+                {
+                    "suite": self.suite,
+                    "benchmark": self.name,
+                    "raw_path": "raw.jsonl",
+                    "metrics": metrics,
+                    "extra": artifact.extra,
+                },
+                indent=2,
+                sort_keys=True,
+            )
+            + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="MIRAGE — single-arm SurfSense per-task accuracy",
+                headline=False,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        per_task = latest.metrics.get("per_task", {})
+        macro = latest.metrics.get("macro_accuracy", 0.0)
+        lines: list[str] = []
+        lines.append(format_ingest_settings_md(latest.extra.get("ingest_settings")))
+        for task in _TASKS:
+            row = per_task.get(task)
+            if not row:
+                continue
+            acc = row.get("accuracy", 0.0)
+            low = row.get("ci_low", 0.0)
+            high = row.get("ci_high", 0.0)
+            lines.append(
+                f"- {task}: {acc * 100:.1f}% "
+                f"(Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%, "
+                f"n={row.get('n_total', '?')})"
+            )
+        if not lines:
+            lines.append("- (no per-task results)")
+        lines.append(f"- Macro accuracy: {macro * 100:.2f}%")
+        return ReportSection(
+            title="MIRAGE — single-arm SurfSense per-task accuracy",
+            headline=False,
+            body_md="\n".join(lines),
+            body_json=latest.metrics,
+        )
+
+
+__all__ = ["MirageBenchmark", "MirageQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/__init__.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/__init__.py
new file mode 100644
index 0000000000..22682ed3f7
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/__init__.py
@@ -0,0 +1,14 @@
+"""Multimodal long-document benchmarks (PDFs with embedded images/charts/tables).
+
+Distinct from the medical suite because these documents are domain-mixed
+(research reports, financials, manuals, government, brochures, papers).
+The hypothesis being tested here is *general*: does SurfSense's
+chunking-based vision RAG preserve information that lives in pixels —
+across long PDFs, across pages — versus feeding the same PDF directly
+to a vision-capable model?
+
+Subpackages register themselves with ``core.registry`` on import. The
+``suites/__init__.py`` discovery walker imports them automatically.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/__init__.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/__init__.py
new file mode 100644
index 0000000000..1c2bfa84c2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/__init__.py
@@ -0,0 +1,19 @@
+"""MMLongBench-Doc — head-to-head Native PDF (vision) vs SurfSense (vision RAG).
+
+Source: https://huggingface.co/datasets/yubo2333/MMLongBench-Doc
+Paper:  https://arxiv.org/abs/2407.01523 (NeurIPS 2024 D&B Track)
+
+* 135 long PDFs (avg 47 pages, multi-modal: text, images, charts, tables)
+* 1,091 expert-annotated questions
+* 33% require evidence from multiple pages
+* ~22% intentionally unanswerable (tests hallucination resistance)
+* 7 document types: research report, tutorial/workshop, academic paper,
+  financial report, brochure, government, manuals
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import MMLongBenchDocBenchmark
+
+_registry.register(MMLongBenchDocBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py
new file mode 100644
index 0000000000..7edad73eb0
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py
@@ -0,0 +1,236 @@
+"""Format-aware grader for MMLongBench-Doc answers.
+
+The dataset ships with five ``answer_format`` values per question:
+
+* ``Str``  — short factoid string
+* ``Int``  — integer count / year
+* ``Float`` — decimal number (often with units stripped)
+* ``List`` — comma- or semicolon-separated bag of items
+* ``None`` — gold answer is literally "Not answerable" (hallucination probe)
+
+The official MMLongBench-Doc paper grades with GPT-4 as judge. We
+implement a *deterministic* rule-based grader as the default (so two
+researchers running the same harness get the same number); an
+LLM-judge mode is exposed via ``--judge gpt5`` and routed through the
+same OpenRouter key the arms use, but is opt-in to keep cost down.
+
+Returned by every grading call:
+
+* ``correct: bool`` — final pass/fail used for accuracy + McNemar
+* ``f1: float``     — token-level F1 (continuous credit, useful when
+  comparing arms that get *most* of a list right)
+* ``method: str``   — which path graded the row (one of
+  ``str_norm`` / ``int_eq`` / ``float_tol`` / ``list_set`` /
+  ``none_match`` / ``llm_judge``).
+"""
+
+from __future__ import annotations
+
+import re
+import string
+from collections import Counter
+from dataclasses import dataclass
+
+# ---------------------------------------------------------------------------
+# Public types
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class GradeResult:
+    correct: bool
+    f1: float
+    method: str
+    normalised_pred: str = ""
+    normalised_gold: str = ""
+
+
+# ---------------------------------------------------------------------------
+# Normalisation helpers (shared)
+# ---------------------------------------------------------------------------
+
+_PUNCT_TABLE = str.maketrans({c: " " for c in string.punctuation})
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
+_WS = re.compile(r"\s+")
+_NOT_ANSWERABLE_TOKENS = {
+    "not answerable",
+    "cannot be answered",
+    "cannot answer",
+    "no answer",
+    "unknown",
+    "none",
+    "not specified",
+    "not mentioned",
+    "not provided",
+    "the answer is not in the document",
+}
+
+# Abbreviations that should be matched literally on the lowercased
+# prediction (because normalisation strips their punctuation and
+# leaves them too short to be safe as substring tokens).
+_NOT_ANSWERABLE_LITERAL = {"n/a", "na/", "n.a.", "n a"}
+
+
+def _normalise_text(s: str) -> str:
+    """SQuAD-style normalisation: lowercase, drop punctuation/articles, squash whitespace."""
+
+    s = s.lower()
+    s = s.translate(_PUNCT_TABLE)
+    s = _ARTICLES.sub(" ", s)
+    s = _WS.sub(" ", s).strip()
+    return s
+
+
+# ---------------------------------------------------------------------------
+# Per-format graders
+# ---------------------------------------------------------------------------
+
+
+def _grade_str(pred: str, gold: str) -> GradeResult:
+    p = _normalise_text(pred)
+    g = _normalise_text(gold)
+    if not p:
+        return GradeResult(False, 0.0, "str_norm", p, g)
+    if p == g:
+        return GradeResult(True, 1.0, "str_norm", p, g)
+    # Substring match in either direction = correct (handles the common
+    # "model emits a fuller sentence containing the gold" case).
+    if g and (g in p or p in g):
+        return GradeResult(True, _f1_tokens(p, g), "str_norm", p, g)
+    return GradeResult(False, _f1_tokens(p, g), "str_norm", p, g)
+
+
+_INT_RE = re.compile(r"-?\d[\d,]*")
+
+
+def _grade_int(pred: str, gold: str) -> GradeResult:
+    g_match = _INT_RE.search(gold)
+    if g_match is None:
+        return _grade_str(pred, gold)
+    g_val = int(g_match.group(0).replace(",", ""))
+    p_match = _INT_RE.search(pred)
+    if p_match is None:
+        return GradeResult(False, 0.0, "int_eq", str(p_match), str(g_val))
+    p_val = int(p_match.group(0).replace(",", ""))
+    return GradeResult(p_val == g_val, 1.0 if p_val == g_val else 0.0,
+                       "int_eq", str(p_val), str(g_val))
+
+
+_FLOAT_RE = re.compile(r"-?\d+(?:[.,]\d+)?")
+
+
+def _grade_float(pred: str, gold: str, *, rel_tol: float = 1e-2) -> GradeResult:
+    g_match = _FLOAT_RE.search(gold)
+    if g_match is None:
+        return _grade_str(pred, gold)
+    g_val = float(g_match.group(0).replace(",", "."))
+    p_match = _FLOAT_RE.search(pred)
+    if p_match is None:
+        return GradeResult(False, 0.0, "float_tol", "", str(g_val))
+    p_val = float(p_match.group(0).replace(",", "."))
+    # Tolerance: 1% relative or 0.01 absolute, whichever is looser.
+    abs_diff = abs(p_val - g_val)
+    tol = max(abs(g_val) * rel_tol, 0.01)
+    ok = abs_diff <= tol
+    return GradeResult(ok, 1.0 if ok else 0.0, "float_tol", str(p_val), str(g_val))
+
+
+_LIST_SPLIT = re.compile(r"[;,\n]")
+
+
+def _grade_list(pred: str, gold: str) -> GradeResult:
+    g_items = {_normalise_text(x) for x in _LIST_SPLIT.split(gold) if x.strip()}
+    p_items = {_normalise_text(x) for x in _LIST_SPLIT.split(pred) if x.strip()}
+    if not g_items:
+        return _grade_str(pred, gold)
+    inter = g_items & p_items
+    if not inter:
+        return GradeResult(False, 0.0, "list_set",
+                           ", ".join(sorted(p_items)),
+                           ", ".join(sorted(g_items)))
+    precision = len(inter) / len(p_items) if p_items else 0.0
+    recall = len(inter) / len(g_items)
+    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
+    return GradeResult(f1 >= 0.999, f1, "list_set",
+                       ", ".join(sorted(p_items)),
+                       ", ".join(sorted(g_items)))
+
+
+def _grade_none(pred: str, gold: str) -> GradeResult:
+    """Gold == 'Not answerable'. The arm earns credit if its prediction
+    expresses inability to answer.
+
+    Two passes:
+
+    1. Literal-substring check on the lowercased+stripped pred for
+       ambiguous abbreviations like ``n/a`` (since normalisation
+       strips the punctuation and would over-match).
+    2. Word-boundary substring check on the normalised pred for the
+       multi-word phrases (``cannot answer``, ``not specified`` etc.).
+    """
+
+    raw_lower = (pred or "").strip().lower()
+    p = _normalise_text(pred)
+    expressed_unknown = False
+
+    # Pass 1: literal abbreviation hits on the raw lowercased text.
+    if any(lit in raw_lower for lit in _NOT_ANSWERABLE_LITERAL):
+        expressed_unknown = True
+
+    # Pass 2: word-boundary check on normalised tokens.
+    if not expressed_unknown:
+        p_padded = f" {p} "
+        for tok_raw in _NOT_ANSWERABLE_TOKENS:
+            tok = _normalise_text(tok_raw)
+            if not tok or len(tok) < 3:
+                continue
+            if f" {tok} " in p_padded:
+                expressed_unknown = True
+                break
+    return GradeResult(
+        expressed_unknown, 1.0 if expressed_unknown else 0.0,
+        "none_match", p, _normalise_text(gold),
+    )
+
+
+def _f1_tokens(pred: str, gold: str) -> float:
+    p_tok = pred.split()
+    g_tok = gold.split()
+    if not p_tok or not g_tok:
+        return 0.0
+    common = Counter(p_tok) & Counter(g_tok)
+    overlap = sum(common.values())
+    if overlap == 0:
+        return 0.0
+    precision = overlap / len(p_tok)
+    recall = overlap / len(g_tok)
+    return 2 * precision * recall / (precision + recall)
+
+
+# ---------------------------------------------------------------------------
+# Public dispatcher
+# ---------------------------------------------------------------------------
+
+
+_FORMAT_DISPATCH = {
+    "str": _grade_str,
+    "int": _grade_int,
+    "float": _grade_float,
+    "list": _grade_list,
+    "none": _grade_none,
+}
+
+
+def grade(*, pred: str, gold: str, answer_format: str) -> GradeResult:
+    """Grade a single (prediction, gold) pair.
+
+    ``answer_format`` is the dataset's ``answer_format`` column value.
+    Unknown / blank values fall through to string grading.
+    """
+
+    fmt = (answer_format or "").strip().lower()
+    fn = _FORMAT_DISPATCH.get(fmt, _grade_str)
+    return fn(pred or "", gold or "")
+
+
+__all__ = ["GradeResult", "grade"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py
new file mode 100644
index 0000000000..cf0572df82
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py
@@ -0,0 +1,365 @@
+"""MMLongBench-Doc ingestion.
+
+Steps:
+
+1. Pull the questions parquet from
+   ``hf://datasets/yubo2333/MMLongBench-Doc/data/`` and cache locally.
+2. Resolve the unique set of ``doc_id`` referenced by questions, and
+   download each PDF from
+   ``hf://datasets/yubo2333/MMLongBench-Doc/documents/<doc_id>``.
+   ``huggingface_hub.hf_hub_download`` is resumable + content-hash
+   verifying; we cache PDFs under ``<data_dir>/multimodal_doc/mmlongbench/pdfs/``.
+3. Upload every PDF to SurfSense via ``DocumentsClient.upload`` with
+   ``use_vision_llm=True`` so SurfSense's Pillow + LiteLLM vision
+   pipeline extracts captions / OCR for embedded images, charts, and
+   tables.
+4. Wait for ``processed`` status and persist
+   ``doc_id -> document_id`` in
+   ``<data_dir>/multimodal_doc/maps/mmlongbench_doc_map.jsonl``.
+
+By default we ingest **all** 135 PDFs (~660 MB, totally manageable).
+Operators can scope to a subset with ``--max-docs N`` if iterating on
+a slow vision pipeline.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+HF_REPO_ID = "yubo2333/MMLongBench-Doc"
+HF_REPO_TYPE = "dataset"
+
+# Lazy import: huggingface_hub + pyarrow are heavyweight; keep the
+# benchmark module importable on machines that have only the core
+# install (e.g. CI lint jobs).
+def _hf_hub_download(*args, **kwargs):
+    from huggingface_hub import hf_hub_download
+
+    return hf_hub_download(*args, **kwargs)
+
+
+def _list_repo_files() -> list[str]:
+    from huggingface_hub import list_repo_files
+
+    return list_repo_files(repo_id=HF_REPO_ID, repo_type=HF_REPO_TYPE)
+
+
+# ---------------------------------------------------------------------------
+# Question parquet -> Python rows
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MMLongBenchQuestion:
+    doc_id: str          # filename inside the documents/ folder
+    doc_type: str
+    question: str
+    answer: str
+    answer_format: str   # Str / Int / Float / List / None
+    evidence_pages: list[int]
+    evidence_sources: list[str]
+
+
+def _load_questions_from_parquet(parquet_path: Path) -> list[MMLongBenchQuestion]:
+    import pyarrow.parquet as pq
+
+    table = pq.read_table(parquet_path)
+    rows = table.to_pylist()
+    out: list[MMLongBenchQuestion] = []
+    for row in rows:
+        doc_id = str(row.get("doc_id") or "").strip()
+        if not doc_id:
+            continue
+        question = str(row.get("question") or "").strip()
+        if not question:
+            continue
+        out.append(
+            MMLongBenchQuestion(
+                doc_id=doc_id,
+                doc_type=str(row.get("doc_type") or "").strip(),
+                question=question,
+                answer=str(row.get("answer") or "").strip(),
+                answer_format=str(row.get("answer_format") or "").strip(),
+                evidence_pages=_parse_int_list(row.get("evidence_pages")),
+                evidence_sources=_parse_str_list(row.get("evidence_sources")),
+            )
+        )
+    return out
+
+
+def _parse_int_list(raw) -> list[int]:
+    if raw is None:
+        return []
+    if isinstance(raw, list):
+        out = []
+        for x in raw:
+            try:
+                out.append(int(x))
+            except (TypeError, ValueError):
+                continue
+        return out
+    text = str(raw).strip().strip("[]")
+    if not text:
+        return []
+    out: list[int] = []
+    for tok in text.split(","):
+        tok = tok.strip().strip("'\"")
+        if tok.isdigit():
+            out.append(int(tok))
+    return out
+
+
+def _parse_str_list(raw) -> list[str]:
+    if raw is None:
+        return []
+    if isinstance(raw, list):
+        return [str(x).strip().strip("'\"") for x in raw if str(x).strip()]
+    text = str(raw).strip().strip("[]")
+    if not text:
+        return []
+    return [tok.strip().strip("'\"") for tok in text.split(",") if tok.strip()]
+
+
+# ---------------------------------------------------------------------------
+# Download helpers
+# ---------------------------------------------------------------------------
+
+
+def _download_questions_parquet(cache_dir: Path) -> Path:
+    """Download every parquet under ``data/`` and concatenate.
+
+    The HF dataset usually publishes a single ``train`` split, but we
+    enumerate to be robust to repo restructuring.
+    """
+
+    parquet_paths: list[Path] = []
+    files = _list_repo_files()
+    data_files = [f for f in files if f.startswith("data/") and f.endswith(".parquet")]
+    if not data_files:
+        raise RuntimeError(
+            f"No parquet files found under data/ in {HF_REPO_ID}; "
+            f"upstream repo may have been restructured."
+        )
+    for rel in sorted(data_files):
+        local = _hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename=rel,
+            repo_type=HF_REPO_TYPE,
+            cache_dir=str(cache_dir),
+        )
+        parquet_paths.append(Path(local))
+        logger.info("Cached MMLongBench parquet shard %s -> %s", rel, local)
+    return parquet_paths[0] if len(parquet_paths) == 1 else _merge_parquets(parquet_paths, cache_dir)
+
+
+def _merge_parquets(paths: list[Path], cache_dir: Path) -> Path:
+    """Combine multiple parquet shards into one (rare branch, but correct)."""
+
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    tables = [pq.read_table(p) for p in paths]
+    merged = pa.concat_tables(tables, promote_options="default")
+    out = cache_dir / "merged_questions.parquet"
+    pq.write_table(merged, out)
+    return out
+
+
+def _download_pdf(doc_id: str, cache_dir: Path, pdfs_dir: Path) -> Path:
+    """Download a single PDF (resumable via huggingface_hub cache)."""
+
+    rel = f"documents/{doc_id}"
+    local = _hf_hub_download(
+        repo_id=HF_REPO_ID,
+        filename=rel,
+        repo_type=HF_REPO_TYPE,
+        cache_dir=str(cache_dir),
+    )
+    # Materialise to a stable path inside our data/ tree so the runner
+    # has a deterministic location regardless of HF cache internals.
+    dest = pdfs_dir / doc_id
+    if not dest.exists() or dest.stat().st_size != Path(local).stat().st_size:
+        # Use a hardlink when possible (cheap), fall back to copy.
+        try:
+            if dest.exists():
+                dest.unlink()
+            os.link(local, dest)
+        except OSError:
+            from shutil import copy2
+
+            copy2(local, dest)
+    return dest
+
+
+# ---------------------------------------------------------------------------
+# Upload helpers
+# ---------------------------------------------------------------------------
+
+
+async def _upload_pdfs(
+    ctx: RunContext,
+    pdf_paths: Iterable[Path],
+    *,
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    """Upload PDFs in batches, return ``filename -> document_id`` map."""
+
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    pdf_list = list(pdf_paths)
+    for batch_start in range(0, len(pdf_list), batch_size):
+        batch = pdf_list[batch_start:batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if all_ids:
+            await docs_client.wait_until_ready(
+                search_space_id=ctx.search_space_id,
+                document_ids=result.document_ids,  # only newly added need polling
+                timeout_s=1800.0,  # vision pipeline is slow on long PDFs
+            )
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                name_to_id[s.title] = s.document_id
+        logger.info(
+            "Uploaded MMLongBench batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    max_docs: int | None = None,
+    upload_batch_size: int = 8,
+    skip_upload: bool = False,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest MMLongBench-Doc into the multimodal_doc suite.
+
+    Parameters
+    ----------
+    max_docs : int | None
+        Cap the number of PDFs to download + upload. ``None`` = all 135.
+        Useful when iterating on the runner without paying for the full
+        vision pipeline pass each time.
+    upload_batch_size : int
+        How many PDFs to send per ``fileupload`` call. Smaller batches
+        recover faster from individual failures; larger batches reduce
+        round-trip overhead.
+    skip_upload : bool
+        Download + cache PDFs locally but skip SurfSense ingestion.
+        Useful for testing the native arm in isolation.
+    """
+
+    settings = settings or IngestSettings(use_vision_llm=True, processing_mode="basic")
+    bench_dir = ctx.benchmark_data_dir()
+    pdfs_dir = bench_dir / "pdfs"
+    pdfs_dir.mkdir(parents=True, exist_ok=True)
+    hf_cache = bench_dir / ".hf_cache"
+    hf_cache.mkdir(parents=True, exist_ok=True)
+
+    # Step 1: questions
+    parquet_path = _download_questions_parquet(hf_cache)
+    questions = _load_questions_from_parquet(parquet_path)
+    if not questions:
+        raise RuntimeError(
+            "MMLongBench-Doc parquet contains no parseable questions. "
+            "Upstream may have changed schema."
+        )
+
+    # Persist a copy alongside the PDFs so the runner has one place to read.
+    questions_jsonl = bench_dir / "questions.jsonl"
+    with questions_jsonl.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps({
+                "doc_id": q.doc_id,
+                "doc_type": q.doc_type,
+                "question": q.question,
+                "answer": q.answer,
+                "answer_format": q.answer_format,
+                "evidence_pages": q.evidence_pages,
+                "evidence_sources": q.evidence_sources,
+            }) + "\n")
+    logger.info("Wrote %d MMLongBench questions to %s", len(questions), questions_jsonl)
+
+    # Step 2: download unique PDFs
+    unique_doc_ids = sorted({q.doc_id for q in questions})
+    if max_docs is not None and max_docs > 0:
+        unique_doc_ids = unique_doc_ids[:max_docs]
+    logger.info("MMLongBench: downloading %d unique PDFs", len(unique_doc_ids))
+
+    pdf_paths: dict[str, Path] = {}
+    for i, doc_id in enumerate(unique_doc_ids, start=1):
+        try:
+            pdf_paths[doc_id] = _download_pdf(doc_id, hf_cache, pdfs_dir)
+            if i % 10 == 0:
+                logger.info("  ... %d / %d PDFs cached", i, len(unique_doc_ids))
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Failed to download MMLongBench PDF %s: %s", doc_id, exc)
+
+    # Step 3: upload to SurfSense
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("MMLongBench: --skip-upload set; skipping SurfSense ingestion")
+    else:
+        logger.info("MMLongBench upload settings: %s", settings.render_label())
+        name_to_id = await _upload_pdfs(
+            ctx,
+            pdf_paths.values(),
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    # Step 4: persist doc_id -> document_id manifest
+    map_path = ctx.maps_dir() / "mmlongbench_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        # Header line records the resolved ingest settings
+        # (see core/ingest_settings.py).
+        fh.write(settings_header_line(settings) + "\n")
+        for doc_id in unique_doc_ids:
+            local = pdf_paths.get(doc_id)
+            if local is None:
+                continue
+            fh.write(json.dumps({
+                "doc_id": doc_id,
+                "document_id": name_to_id.get(local.name),
+                "pdf_path": str(local),
+                "n_questions": sum(1 for q in questions if q.doc_id == doc_id),
+            }) + "\n")
+    logger.info("Wrote MMLongBench doc map to %s", map_path)
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["mmlongbench"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+
+__all__ = ["MMLongBenchQuestion", "run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/prompt.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/prompt.py
new file mode 100644
index 0000000000..27d6a0d00f
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/prompt.py
@@ -0,0 +1,60 @@
+"""MMLongBench-Doc prompt template.
+
+Both arms get the same prompt — only the document delivery channel
+differs (native PDF embedded in the OpenRouter request vs SurfSense
+RAG retrieval). The format hint in the prompt mirrors what the
+upstream paper uses so the grader's regex can reliably extract the
+answer.
+"""
+
+from __future__ import annotations
+
+# ---------------------------------------------------------------------------
+# Per-format hint blocks
+# ---------------------------------------------------------------------------
+
+_FORMAT_HINTS: dict[str, str] = {
+    "str": (
+        "Respond with the answer as a short phrase, no full sentence. "
+        "Format your final line as `Answer: <text>`."
+    ),
+    "int": (
+        "Respond with a single integer only. "
+        "Format your final line as `Answer: <integer>`."
+    ),
+    "float": (
+        "Respond with a single decimal number only (no units). "
+        "Format your final line as `Answer: <number>`."
+    ),
+    "list": (
+        "Respond with a comma-separated list of items, no extra text. "
+        "Format your final line as `Answer: item1, item2, item3`."
+    ),
+    "none": (
+        "If the answer cannot be determined from the document, say so explicitly. "
+        "Format your final line as `Answer: Not answerable`."
+    ),
+}
+
+
+_PROMPT = """\
+You are a document-understanding assistant. Use ONLY the provided
+document to answer the question. The document may contain text,
+tables, charts, figures, and images. If the answer is in a chart or
+image, read it carefully. Do not use external knowledge.
+
+Question: {question}
+
+{format_hint}
+"""
+
+
+def build_prompt(question: str, *, answer_format: str) -> str:
+    """Assemble the full prompt for one MMLongBench question."""
+
+    fmt = (answer_format or "str").strip().lower()
+    hint = _FORMAT_HINTS.get(fmt, _FORMAT_HINTS["str"])
+    return _PROMPT.format(question=question.strip(), format_hint=hint)
+
+
+__all__ = ["build_prompt"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py
new file mode 100644
index 0000000000..0e352d7ae2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py
@@ -0,0 +1,704 @@
+"""MMLongBench-Doc runner — head-to-head Native PDF (vision) vs SurfSense (vision RAG).
+
+Differences from a typical MCQ head-to-head:
+
+* Open-ended answers (Str / Int / Float / List / Not-answerable) — uses
+  ``extract_freeform_answer`` instead of ``extract_answer_letter``.
+* Format-aware grader (see ``.grader``) returns both binary correctness
+  (for accuracy / McNemar) and continuous F1 (for nuanced reporting).
+* Native arm requires a vision-capable model — we don't enforce this
+  in code (operator's choice via ``setup --provider-model``) but we
+  emit a warning if the pinned slug looks text-only.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, NativePdfArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from ....core.registry import (
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+from ....core.scenarios import format_scenario_md
+from .grader import GradeResult, grade
+from .prompt import build_prompt
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Question + map row shapes
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MMLBQuestion:
+    qid: str               # synthesised from doc_id + index
+    doc_id: str            # filename inside the documents/ folder
+    doc_type: str
+    question: str
+    gold_answer: str
+    answer_format: str
+    evidence_pages: list[int]
+    evidence_sources: list[str]
+    pdf_path: Path
+    document_id: int | None  # SurfSense doc id (None if upload skipped)
+
+
+def _load_doc_map(map_path: Path) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]:
+    """Read the doc map JSONL.
+
+    Returns ``(rows, settings)`` where ``settings`` is the
+    ``__settings__`` header blob (or ``{}`` for legacy maps).
+    """
+
+    rows: dict[str, dict[str, Any]] = {}
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows[str(row["doc_id"])] = row
+    return rows, settings
+
+
+def _load_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    doc_filter: list[str] | None,
+    format_filter: str | None,
+    sample_n: int | None,
+    skip_unanswerable: bool,
+) -> list[MMLBQuestion]:
+    out: list[MMLBQuestion] = []
+    per_doc_counter: dict[str, int] = {}
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            doc_id = str(row.get("doc_id") or "").strip()
+            if not doc_id:
+                continue
+            if doc_filter and doc_id not in doc_filter:
+                continue
+            map_row = doc_map.get(doc_id)
+            if map_row is None:
+                logger.debug("No doc-map entry for %s; skipping", doc_id)
+                continue
+            answer_format = str(row.get("answer_format") or "").strip().lower()
+            if format_filter and format_filter != "all" and format_filter != answer_format:
+                continue
+            gold = str(row.get("answer") or "").strip()
+            if skip_unanswerable and answer_format == "none":
+                continue
+            idx = per_doc_counter.get(doc_id, 0)
+            per_doc_counter[doc_id] = idx + 1
+            out.append(MMLBQuestion(
+                qid=f"{doc_id}::Q{idx:03d}",
+                doc_id=doc_id,
+                doc_type=str(row.get("doc_type") or "").strip(),
+                question=str(row.get("question") or "").strip(),
+                gold_answer=gold,
+                answer_format=answer_format,
+                evidence_pages=list(row.get("evidence_pages") or []),
+                evidence_sources=list(row.get("evidence_sources") or []),
+                pdf_path=Path(map_row["pdf_path"]),
+                document_id=map_row.get("document_id"),
+            ))
+    out.sort(key=lambda q: (q.doc_id, q.qid))
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Bounded concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+_DESCRIPTION = (
+    "MMLongBench-Doc (135 long PDFs, 1,091 multimodal questions) — "
+    "Native PDF (vision) vs SurfSense (vision RAG) head-to-head."
+)
+
+
+_TEXT_ONLY_HINTS = ("gpt-5.4-mini", "gpt-3.5", "text-only", "instruct-")
+
+# MMLongBench-Doc PDFs are long documents with figures, charts, and
+# tables. Vision LLM at ingest is the whole point; flip --no-vision-llm
+# to measure how much SurfSense degrades on real document images.
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=True,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class MMLongBenchDocBenchmark:
+    """Long-document multimodal RAG vs native vision."""
+
+    suite: str = "multimodal_doc"
+    name: str = "mmlongbench"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--docs",
+            default=None,
+            help="Comma-separated doc_ids (filenames) to run (default: all).",
+        )
+        parser.add_argument(
+            "--format",
+            default="all",
+            choices=["all", "str", "int", "float", "list", "none"],
+            help="Filter to one answer format. 'none' = unanswerable probes only.",
+        )
+        parser.add_argument(
+            "--n", dest="sample_n", type=int, default=None,
+            help="Run only the first N questions after filters apply.",
+        )
+        parser.add_argument(
+            "--skip-unanswerable", dest="skip_unanswerable", action="store_true",
+            help="Drop ~22%% unanswerable questions (use to compare against baselines that don't include them).",
+        )
+        parser.add_argument(
+            "--concurrency", type=int, default=4,
+            help="Parallel question workers per arm.",
+        )
+        parser.add_argument(
+            "--no-mentions", dest="no_mentions", action="store_true",
+            help="SurfSense arm: skip mentioned_document_ids (unscoped retrieval).",
+        )
+        parser.add_argument(
+            "--pdf-engine", default="native",
+            choices=[e.value for e in PdfEngine],
+            help="OpenRouter file-parser engine for the native arm.",
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for both arms.",
+        )
+        # Ingest-only knobs (forwarded by the CLI to ingest.run_ingest).
+        parser.add_argument(
+            "--max-docs", dest="max_docs", type=int, default=None,
+            help="(ingest only) cap on number of unique PDFs to download + upload.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=8,
+            help="(ingest only) PDFs per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) cache PDFs locally but don't push to SurfSense.",
+        )
+        # Per-upload knobs forwarded to /documents/fileupload at ingest;
+        # ignored at run-time (runner reads the resolved settings out of
+        # the doc-map manifest header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            max_docs=opts.get("max_docs"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 8),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        docs_raw: str | None = opts.get("docs")
+        doc_filter = [d.strip() for d in docs_raw.split(",")] if docs_raw else None
+        format_filter = opts.get("format") or "all"
+        sample_n = opts.get("sample_n")
+        skip_unanswerable = bool(opts.get("skip_unanswerable"))
+        concurrency = int(opts.get("concurrency") or 4)
+        no_mentions = bool(opts.get("no_mentions"))
+        pdf_engine_name = opts.get("pdf_engine") or "native"
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+
+        bench_dir = ctx.benchmark_data_dir()
+        questions_jsonl = bench_dir / "questions.jsonl"
+        map_path = ctx.maps_dir() / "mmlongbench_doc_map.jsonl"
+        if not questions_jsonl.exists() or not map_path.exists():
+            raise RuntimeError(
+                "MMLongBench-Doc not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first."
+            )
+
+        doc_map, ingest_settings = _load_doc_map(map_path)
+        questions = _load_questions(
+            questions_jsonl, doc_map,
+            doc_filter=doc_filter,
+            format_filter=None if format_filter == "all" else format_filter,
+            sample_n=sample_n,
+            skip_unanswerable=skip_unanswerable,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No MMLongBench questions matched the filters; broaden --docs/--format/--n."
+            )
+        logger.info("MMLongBench-Doc: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "OPENROUTER_API_KEY env var is required for the native arm."
+            )
+
+        # Native arm slug differs from SurfSense slug only in cost-arbitrage
+        # scenario; otherwise both arms answer with provider_model.
+        native_arm_model = ctx.native_arm_model
+        if any(hint in native_arm_model.lower() for hint in _TEXT_ONLY_HINTS):
+            if ctx.scenario == "symmetric-cheap":
+                logger.info(
+                    "symmetric-cheap: native arm pinned to text-only %r as "
+                    "intended; expect it to lose on image-bearing pages "
+                    "(SurfSense answers from vision-extracted chunks).",
+                    native_arm_model,
+                )
+            else:
+                logger.warning(
+                    "Native arm slug %r looks text-only; image content in "
+                    "PDFs will be ignored. Re-pin via "
+                    "`setup --provider-model anthropic/claude-sonnet-4.5` "
+                    "(or pass --native-arm-model and --scenario cost-arbitrage "
+                    "to make this asymmetry explicit).",
+                    native_arm_model,
+                )
+
+        provider = OpenRouterPdfProvider(
+            api_key=api_key,
+            base_url=ctx.config.openrouter_base_url,
+            model=native_arm_model,
+            engine=PdfEngine(pdf_engine_name),
+        )
+        native_arm = NativePdfArm(provider=provider, max_output_tokens=max_output_tokens)
+        surf_arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _native_one(q: MMLBQuestion) -> ArmResult:
+            return await native_arm.answer(_make_native_request(q, max_output_tokens))
+
+        async def _surf_one(q: MMLBQuestion) -> ArmResult:
+            return await surf_arm.answer(_make_surfsense_request(q, no_mentions=no_mentions))
+
+        native_results, surf_results = await asyncio.gather(
+            _gather_with_limit((_native_one(q) for q in questions), concurrency=concurrency),
+            _gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency),
+        )
+
+        native_grades = [_grade_one(q, r) for q, r in zip(questions, native_results, strict=False)]
+        surf_grades = [_grade_one(q, r) for q, r in zip(questions, surf_results, strict=False)]
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, n_res, s_res, n_g, s_g in zip(
+                questions, native_results, surf_results, native_grades, surf_grades, strict=False
+            ):
+                meta = {
+                    "qid": q.qid,
+                    "doc_id": q.doc_id,
+                    "doc_type": q.doc_type,
+                    "answer_format": q.answer_format,
+                    "gold": q.gold_answer,
+                    "evidence_pages": q.evidence_pages,
+                    "evidence_sources": q.evidence_sources,
+                    "document_id": q.document_id,
+                }
+                fh.write(json.dumps({
+                    **meta,
+                    **n_res.to_jsonl(),
+                    "graded": _grade_to_jsonl(n_g),
+                }) + "\n")
+                fh.write(json.dumps({
+                    **meta,
+                    **s_res.to_jsonl(),
+                    "graded": _grade_to_jsonl(s_g),
+                }) + "\n")
+
+        metrics = _compute_metrics(questions, native_results, surf_results, native_grades, surf_grades)
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "format_filter": format_filter,
+                "skip_unanswerable": skip_unanswerable,
+                "no_mentions": no_mentions,
+                "pdf_engine": pdf_engine_name,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="MMLongBench-Doc — Native PDF (vision) vs SurfSense (vision RAG)",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        native = m.get("native", {})
+        surf = m.get("surfsense", {})
+        delta = m.get("delta", {})
+        per_format = m.get("per_format", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(format filter: `{extra.get('format_filter', 'all')}`, "
+            f"skip-unanswerable: `{extra.get('skip_unanswerable', False)}`, "
+            f"engine: `{extra.get('pdf_engine', 'native')}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        body_lines.append(
+            "- Native arm (OpenRouter `chat/completions` + file plugin, "
+            f"`{extra.get('native_arm_model') or extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(native, indent="  "))
+        body_lines.append(
+            "- SurfSense arm (`POST /api/v1/new_chat`, vision RAG over chunks, "
+            f"`{extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(surf, indent="  "))
+        body_lines.append("- Delta (paired):")
+        body_lines.append(
+            f"  - Accuracy: SurfSense {_pp(delta.get('accuracy_pp'))} pp "
+            f"(McNemar p={_fmt(delta.get('mcnemar_p_value'), 4)}, "
+            f"method={delta.get('mcnemar_method')})"
+        )
+        body_lines.append(
+            f"  - F1 (mean): SurfSense {_pp(delta.get('f1_pp'))} pp"
+        )
+        body_lines.append(
+            f"  - Bootstrap 95% CI on accuracy delta: "
+            f"[{_pp(delta.get('bootstrap_ci_low'))}pp, {_pp(delta.get('bootstrap_ci_high'))}pp]"
+        )
+        body_lines.append(
+            f"  - Cost / question: native ${_dollars(native.get('cost_micros_mean'))}, "
+            f"surfsense ${_dollars(surf.get('cost_micros_mean'))} "
+            f"(SurfSense delta {_pct_change(delta.get('cost_micros_pct'))})"
+        )
+        body_lines.append(
+            f"  - Latency p50: native {_ms_to_s(native.get('latency_ms_median'))}, "
+            f"surfsense {_ms_to_s(surf.get('latency_ms_median'))} "
+            f"(SurfSense delta {_pct_change(delta.get('latency_ms_pct'))})"
+        )
+        if per_format:
+            body_lines.append("- Per-format split (accuracy delta in pp):")
+            for fmt, vals in sorted(per_format.items()):
+                body_lines.append(
+                    f"  - {fmt}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')}, native acc={vals.get('native_accuracy', 0)*100:.1f}%, "
+                    f"surf acc={vals.get('surfsense_accuracy', 0)*100:.1f}%)"
+                )
+
+        return ReportSection(
+            title="MMLongBench-Doc — Native PDF (vision) vs SurfSense (vision RAG)",
+            headline=True,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_native_request(q: MMLBQuestion, max_tokens: int) -> ArmRequest:
+    prompt = build_prompt(q.question, answer_format=q.answer_format)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        pdf_paths=[q.pdf_path],
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: MMLBQuestion, *, no_mentions: bool) -> ArmRequest:
+    prompt = build_prompt(q.question, answer_format=q.answer_format)
+    mentions: list[int] | None = None
+    if not no_mentions and q.document_id is not None:
+        mentions = [int(q.document_id)]
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        mentioned_document_ids=mentions,
+    )
+
+
+def _grade_one(q: MMLBQuestion, result: ArmResult) -> GradeResult:
+    pred_text = extract_freeform_answer(result.raw_text or "")
+    return grade(pred=pred_text, gold=q.gold_answer, answer_format=q.answer_format)
+
+
+def _grade_to_jsonl(g: GradeResult) -> dict[str, Any]:
+    return {
+        "correct": g.correct,
+        "f1": g.f1,
+        "method": g.method,
+        "normalised_pred": g.normalised_pred,
+        "normalised_gold": g.normalised_gold,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Metrics aggregation
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[MMLBQuestion],
+    native_results: list[ArmResult],
+    surf_results: list[ArmResult],
+    native_grades: list[GradeResult],
+    surf_grades: list[GradeResult],
+) -> dict[str, Any]:
+    native_correct = [g.correct for g in native_grades]
+    surf_correct = [g.correct for g in surf_grades]
+    native_f1 = [g.f1 for g in native_grades]
+    surf_f1 = [g.f1 for g in surf_grades]
+
+    native_costs = [float(r.cost_micros) for r in native_results]
+    surf_costs = [float(r.cost_micros) for r in surf_results]
+    native_latencies = [float(r.latency_ms) for r in native_results]
+    surf_latencies = [float(r.latency_ms) for r in surf_results]
+    native_in_tokens = [float(r.input_tokens) for r in native_results]
+    native_out_tokens = [float(r.output_tokens) for r in native_results]
+
+    native_acc = accuracy_with_wilson_ci(sum(native_correct), len(native_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+    mc = mcnemar_test(native_correct, surf_correct)
+    boot = bootstrap_delta_ci(native_correct, surf_correct, n_resamples=2000)
+
+    native_cost_agg = paired_aggregate(native_costs)
+    surf_cost_agg = paired_aggregate(surf_costs)
+    native_latency_agg = paired_aggregate(native_latencies)
+    surf_latency_agg = paired_aggregate(surf_latencies)
+
+    cost_pct = _safe_pct(surf_cost_agg.mean, native_cost_agg.mean)
+    latency_pct = _safe_pct(surf_latency_agg.median, native_latency_agg.median)
+
+    per_format_pairs: dict[str, list[tuple[bool, bool]]] = {}
+    for q, n_ok, s_ok in zip(questions, native_correct, surf_correct, strict=False):
+        per_format_pairs.setdefault(q.answer_format or "unknown", []).append((n_ok, s_ok))
+
+    per_format: dict[str, dict[str, Any]] = {}
+    for fmt, pairs in per_format_pairs.items():
+        n_correct = [a for a, _ in pairs]
+        s_correct = [b for _, b in pairs]
+        per_format[fmt] = {
+            "n": len(pairs),
+            "native_accuracy": (sum(n_correct) / len(pairs)) if pairs else 0.0,
+            "surfsense_accuracy": (sum(s_correct) / len(pairs)) if pairs else 0.0,
+            "delta_accuracy_pp": (
+                100.0 * (sum(s_correct) - sum(n_correct)) / len(pairs)
+                if pairs else 0.0
+            ),
+        }
+
+    native_f1_mean = sum(native_f1) / len(native_f1) if native_f1 else 0.0
+    surf_f1_mean = sum(surf_f1) / len(surf_f1) if surf_f1 else 0.0
+
+    return {
+        "native": {
+            **native_acc.to_dict(),
+            "f1_mean": native_f1_mean,
+            "cost_micros_mean": native_cost_agg.mean,
+            "cost_micros_median": native_cost_agg.median,
+            "latency_ms_mean": native_latency_agg.mean,
+            "latency_ms_median": native_latency_agg.median,
+            "latency_ms_p95": native_latency_agg.p95,
+            "input_tokens_mean": (sum(native_in_tokens) / len(native_in_tokens)) if native_in_tokens else 0.0,
+            "output_tokens_mean": (sum(native_out_tokens) / len(native_out_tokens)) if native_out_tokens else 0.0,
+        },
+        "surfsense": {
+            **surf_acc.to_dict(),
+            "f1_mean": surf_f1_mean,
+            "cost_micros_mean": surf_cost_agg.mean,
+            "cost_micros_median": surf_cost_agg.median,
+            "latency_ms_mean": surf_latency_agg.mean,
+            "latency_ms_median": surf_latency_agg.median,
+            "latency_ms_p95": surf_latency_agg.p95,
+        },
+        "delta": {
+            "accuracy_pp": 100.0 * (surf_acc.accuracy - native_acc.accuracy),
+            "f1_pp": 100.0 * (surf_f1_mean - native_f1_mean),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_native_only": mc.b,
+            "mcnemar_c_surfsense_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+            "cost_micros_pct": cost_pct,
+            "latency_ms_pct": latency_pct,
+        },
+        "per_format": per_format,
+    }
+
+
+def _safe_pct(numerator: float, denominator: float) -> float | None:
+    if denominator == 0:
+        return None
+    return 100.0 * (numerator - denominator) / denominator
+
+
+# ---------------------------------------------------------------------------
+# Tiny formatting helpers used by report_section
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    f1 = d.get("f1_mean", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- F1 (token-level mean): {f1 * 100:.1f}%",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if "input_tokens_mean" in d:
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct_change(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.0f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+__all__ = ["MMLBQuestion", "MMLongBenchDocBenchmark"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/__init__.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/__init__.py
new file mode 100644
index 0000000000..f6985e93d1
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/__init__.py
@@ -0,0 +1,46 @@
+"""parser_compare — six-way head-to-head on long multimodal PDFs.
+
+Same 5 mmlongbench PDFs that ``mmlongbench`` already ingested
+(``search_space_id=55``), one question per PDF for the smoke run.
+
+The point of this benchmark is to disentangle TWO orthogonal
+dimensions of "how good is our multimodal pipeline?":
+
+1. **Parser quality** — Azure DI prebuilt-read vs prebuilt-layout vs
+   LlamaParse parse_page_with_llm vs parse_page_with_agent. We run
+   each parser directly (bypassing ``/documents/fileupload`` because
+   the backend's parser routing is global, not per-call) and stuff the
+   resulting markdown into a long-context prompt.
+
+2. **Context-management strategy** — full-context stuffing (no chunk
+   selection, the model sees everything) vs SurfSense's agentic
+   retrieval over chunks of the same documents.
+
+Six arms, all answered by ``anthropic/claude-sonnet-4.5``:
+
+* ``native_pdf``           — PDF attached natively via OpenRouter
+                              (gold-standard reference).
+* ``azure_basic_lc``       — Azure DI ``prebuilt-read`` markdown stuffed
+                              into the prompt.
+* ``azure_premium_lc``     — Azure DI ``prebuilt-layout`` markdown stuffed.
+* ``llamacloud_basic_lc``  — LlamaParse ``parse_page_with_llm`` markdown stuffed.
+* ``llamacloud_premium_lc`` — LlamaParse ``parse_page_with_agent`` markdown stuffed.
+* ``surfsense_agentic``    — SurfSense ``/api/v1/new_chat`` with
+                              ``mentioned_document_ids`` scoped to the
+                              one source PDF, retrieving chunks from
+                              the existing search_space=55 ingestion
+                              (vision_llm=on, processing_mode=premium,
+                              ETL_SERVICE=LLAMACLOUD with Azure DI
+                              fallback ⇒ effectively azure_premium).
+
+The report includes preprocessing cost ($1 / 1k pages basic, $10 / 1k
+pages premium) on top of the OpenRouter LLM cost so each arm's true
+total-cost-per-question is directly comparable.
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import ParserCompareBenchmark
+
+_registry.register(ParserCompareBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/ingest.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/ingest.py
new file mode 100644
index 0000000000..93c8db4ab4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/ingest.py
@@ -0,0 +1,356 @@
+"""parser_compare ingestion: pre-extract markdown 4 ways per PDF.
+
+For each PDF in scope, we run all four (parser × mode) combinations
+in parallel and persist the resulting markdown alongside the PDF:
+
+    data/multimodal_doc/parser_compare/extractions/
+      <doc_id>.azure_basic.md
+      <doc_id>.azure_premium.md
+      <doc_id>.llamacloud_basic.md
+      <doc_id>.llamacloud_premium.md
+
+A manifest at ``maps/parser_compare_doc_map.jsonl`` records, per PDF:
+
+* ``doc_id``         — filename of the source PDF.
+* ``pdf_path``       — local cached PDF path.
+* ``document_id``    — SurfSense document id (carried over from
+                        mmlongbench's existing ingestion so the
+                        SurfSense agentic arm can scope retrieval).
+* ``pages``          — page count via pypdf (drives preprocessing cost).
+* ``extractions``    — map of ``arm_name -> {markdown_path, chars,
+                        elapsed_s, status, error}``.
+
+The runner reads this manifest, loads the markdown for each long-context
+arm, and uses ``document_id`` for the SurfSense arm.
+
+Source PDFs come from the existing mmlongbench ingestion — no new
+download or upload happens here. The point of this benchmark is
+parser quality on the same physical PDFs SurfSense already has, so
+re-using mmlongbench's PDF cache is correct.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from ....core.config import set_suite_state
+from ....core.parsers import (
+    AzureDIError,
+    LlamaCloudError,
+    count_pdf_pages,
+    parse_with_azure_di,
+    parse_with_llamacloud,
+)
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+# Order matters for the manifest only (deterministic JSONL diffs);
+# the runner doesn't rely on it.
+PARSER_ARMS: tuple[tuple[str, str, str], ...] = (
+    ("azure_basic_lc",       "azure",      "basic"),
+    ("azure_premium_lc",     "azure",      "premium"),
+    ("llamacloud_basic_lc",  "llamacloud", "basic"),
+    ("llamacloud_premium_lc", "llamacloud", "premium"),
+)
+
+
+@dataclass
+class ExtractionResult:
+    arm: str
+    parser: str
+    mode: str
+    markdown_path: Path | None = None
+    chars: int = 0
+    elapsed_s: float = 0.0
+    status: str = "ok"  # "ok" | "failed"
+    error: str | None = None
+
+    def to_jsonl(self) -> dict[str, Any]:
+        return {
+            "arm": self.arm,
+            "parser": self.parser,
+            "mode": self.mode,
+            "markdown_path": str(self.markdown_path) if self.markdown_path else None,
+            "chars": self.chars,
+            "elapsed_s": round(self.elapsed_s, 2),
+            "status": self.status,
+            "error": self.error,
+        }
+
+
+@dataclass
+class PdfManifestRow:
+    doc_id: str
+    pdf_path: Path
+    document_id: int | None
+    pages: int
+    extractions: dict[str, ExtractionResult] = field(default_factory=dict)
+
+    def to_jsonl(self) -> dict[str, Any]:
+        return {
+            "doc_id": self.doc_id,
+            "pdf_path": str(self.pdf_path),
+            "document_id": self.document_id,
+            "pages": self.pages,
+            "extractions": {
+                arm: ext.to_jsonl() for arm, ext in self.extractions.items()
+            },
+        }
+
+
+# ---------------------------------------------------------------------------
+# Single-PDF extraction
+# ---------------------------------------------------------------------------
+
+
+async def _run_one_extraction(
+    pdf_path: Path,
+    *,
+    parser: str,
+    mode: str,
+    out_path: Path,
+    estimated_pages: int,
+) -> tuple[str, float]:
+    """Invoke the requested parser, persist markdown, return (markdown, elapsed_s)."""
+
+    started = time.monotonic()
+    if parser == "azure":
+        markdown = await parse_with_azure_di(pdf_path, processing_mode=mode)
+    elif parser == "llamacloud":
+        markdown = await parse_with_llamacloud(
+            pdf_path, processing_mode=mode, estimated_pages=estimated_pages,
+        )
+    else:
+        raise ValueError(f"Unknown parser {parser!r}")
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(markdown, encoding="utf-8")
+    return markdown, time.monotonic() - started
+
+
+async def _extract_one_pdf(
+    pdf_path: Path,
+    *,
+    extractions_dir: Path,
+    force_reextract: bool,
+) -> dict[str, ExtractionResult]:
+    """Run all four parser combos for ``pdf_path``, returning per-arm results.
+
+    Re-uses any cached ``.md`` already on disk unless ``force_reextract``.
+    The four parser invocations run concurrently — they're independent
+    HTTP-bound jobs and the providers don't share state.
+    """
+
+    estimated_pages = count_pdf_pages(pdf_path) or 50
+    out: dict[str, ExtractionResult] = {}
+    coros = []
+    arm_specs: list[tuple[str, str, str, Path]] = []
+
+    for arm_name, parser, mode in PARSER_ARMS:
+        out_path = extractions_dir / f"{pdf_path.stem}.{arm_name}.md"
+        arm_specs.append((arm_name, parser, mode, out_path))
+
+        if out_path.exists() and not force_reextract:
+            cached = out_path.read_text(encoding="utf-8")
+            out[arm_name] = ExtractionResult(
+                arm=arm_name,
+                parser=parser,
+                mode=mode,
+                markdown_path=out_path,
+                chars=len(cached),
+                elapsed_s=0.0,
+                status="ok",
+                error="(cached)",
+            )
+            logger.info(
+                "Cached extraction reused: %s (%d chars)", out_path.name, len(cached),
+            )
+            coros.append(_noop())
+        else:
+            coros.append(
+                _run_one_extraction(
+                    pdf_path,
+                    parser=parser, mode=mode,
+                    out_path=out_path,
+                    estimated_pages=estimated_pages,
+                )
+            )
+
+    results = await asyncio.gather(*coros, return_exceptions=True)
+    for (arm_name, parser, mode, out_path), result in zip(arm_specs, results, strict=True):
+        if arm_name in out:
+            continue  # cached — already populated above
+        if isinstance(result, Exception):
+            err = result
+            err_msg = f"{type(err).__name__}: {err}"
+            logger.warning(
+                "Extraction FAILED for %s [%s/%s]: %s",
+                pdf_path.name, parser, mode, err_msg,
+            )
+            out[arm_name] = ExtractionResult(
+                arm=arm_name, parser=parser, mode=mode,
+                status="failed", error=err_msg,
+            )
+        else:
+            markdown, elapsed = result
+            out[arm_name] = ExtractionResult(
+                arm=arm_name, parser=parser, mode=mode,
+                markdown_path=out_path,
+                chars=len(markdown),
+                elapsed_s=elapsed,
+                status="ok",
+            )
+    return out
+
+
+async def _noop() -> tuple[str, float]:
+    """Placeholder so cached entries align with parallel gather indexing."""
+
+    return ("", 0.0)
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+def _read_existing_mmlongbench_map(map_path: Path) -> list[dict[str, Any]]:
+    """Read the mmlongbench doc map (skipping its ``__settings__`` header)."""
+
+    if not map_path.exists():
+        raise RuntimeError(
+            f"mmlongbench doc map not found at {map_path}. Run "
+            "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first."
+        )
+    rows: list[dict[str, Any]] = []
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if "__settings__" in row:
+                continue
+            rows.append(row)
+    return rows
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    docs_filter: list[str] | None = None,
+    max_docs: int | None = None,
+    force_reextract: bool = False,
+    pdf_concurrency: int = 2,
+) -> None:
+    """Pre-extract all four parser markdowns for each PDF.
+
+    Parameters
+    ----------
+    docs_filter : list[str] | None
+        Specific filenames to extract (default: all PDFs from
+        mmlongbench's existing manifest).
+    max_docs : int | None
+        Cap on number of PDFs to process. Default: all.
+    force_reextract : bool
+        Re-call parsers even if a cached ``.md`` already exists. Off
+        by default — extractions are deterministic and parser calls
+        cost real money.
+    pdf_concurrency : int
+        How many PDFs to extract in parallel. Each PDF triggers four
+        parser HTTP calls, so total in-flight = 4 * pdf_concurrency.
+        Default 2 keeps us comfortably under both Azure DI and
+        LlamaCloud per-IP rate limits.
+    """
+
+    # Pull the source PDFs and document_ids from mmlongbench's existing
+    # ingestion. parser_compare doesn't re-upload; SurfSense's agentic
+    # arm queries the same search_space=55 chunks.
+    mmlb_map = ctx.suite_state.ingestion_maps.get("mmlongbench")
+    if not mmlb_map:
+        raise RuntimeError(
+            "Suite state has no mmlongbench ingestion map. Run "
+            "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first "
+            "so parser_compare can re-use those PDFs."
+        )
+    src_rows = _read_existing_mmlongbench_map(Path(mmlb_map))
+
+    rows_in_scope = src_rows
+    if docs_filter:
+        wanted = set(docs_filter)
+        rows_in_scope = [r for r in rows_in_scope if r["doc_id"] in wanted]
+    if max_docs is not None and max_docs > 0:
+        rows_in_scope = rows_in_scope[:max_docs]
+
+    if not rows_in_scope:
+        raise RuntimeError(
+            "No PDFs in scope for parser_compare. Check --docs / --max-docs."
+        )
+
+    bench_dir = ctx.benchmark_data_dir()
+    extractions_dir = bench_dir / "extractions"
+    extractions_dir.mkdir(parents=True, exist_ok=True)
+
+    sem = asyncio.Semaphore(max(1, pdf_concurrency))
+    manifest_rows: list[PdfManifestRow] = []
+
+    async def _process(row: dict[str, Any]) -> PdfManifestRow:
+        pdf_path = Path(row["pdf_path"])
+        async with sem:
+            extractions = await _extract_one_pdf(
+                pdf_path,
+                extractions_dir=extractions_dir,
+                force_reextract=force_reextract,
+            )
+        return PdfManifestRow(
+            doc_id=str(row["doc_id"]),
+            pdf_path=pdf_path,
+            document_id=row.get("document_id"),
+            pages=count_pdf_pages(pdf_path),
+            extractions=extractions,
+        )
+
+    logger.info(
+        "parser_compare: extracting %d PDFs x 4 parsers (concurrency=%d)",
+        len(rows_in_scope), pdf_concurrency,
+    )
+    manifest_rows = await asyncio.gather(*(_process(r) for r in rows_in_scope))
+
+    # Persist manifest
+    map_path = ctx.maps_dir() / "parser_compare_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        for mr in manifest_rows:
+            fh.write(json.dumps(mr.to_jsonl()) + "\n")
+    logger.info("parser_compare manifest -> %s", map_path)
+
+    # Update suite state so the runner can find us via
+    # ctx.suite_state.ingestion_maps.
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["parser_compare"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+    # Quick summary log
+    total_extractions = sum(len(mr.extractions) for mr in manifest_rows)
+    failures = sum(
+        1 for mr in manifest_rows for ext in mr.extractions.values()
+        if ext.status != "ok"
+    )
+    logger.info(
+        "parser_compare ingest done: %d PDFs, %d extractions, %d failures",
+        len(manifest_rows), total_extractions, failures,
+    )
+
+
+__all__ = [
+    "ExtractionResult",
+    "PARSER_ARMS",
+    "PdfManifestRow",
+    "run_ingest",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/prompt.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/prompt.py
new file mode 100644
index 0000000000..7119bbd290
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/prompt.py
@@ -0,0 +1,120 @@
+"""Prompt templates for the three input modalities in parser_compare.
+
+We deliberately reuse the *same* core question framing as
+``mmlongbench/prompt.py`` so byte-identical questions reach all six
+arms; only the document delivery channel changes.
+
+Three templates:
+
+* ``build_native_pdf_prompt``       — bare question + format hint.
+                                       The PDF rides as a separate file
+                                       part (``NativePdfArm`` handles it).
+* ``build_long_context_prompt``     — question + format hint + the
+                                       parser-extracted markdown wrapped
+                                       in fenced ``<document>`` tags so
+                                       the model can clearly delimit
+                                       "context" from "instruction".
+* ``build_surfsense_prompt``        — bare question + format hint
+                                       (chunks come from RAG retrieval,
+                                       not from the prompt).
+
+The ``<document>`` tag is doc-aware: even though parser_compare runs
+one PDF per question today, we keep the wrapper plural so this is
+trivial to extend to multi-doc later.
+"""
+
+from __future__ import annotations
+
+# ---------------------------------------------------------------------------
+# Per-format hint blocks (same lookup as mmlongbench/prompt.py)
+# ---------------------------------------------------------------------------
+
+_FORMAT_HINTS: dict[str, str] = {
+    "str": (
+        "Respond with the answer as a short phrase, no full sentence. "
+        "Format your final line as `Answer: <text>`."
+    ),
+    "int": (
+        "Respond with a single integer only. "
+        "Format your final line as `Answer: <integer>`."
+    ),
+    "float": (
+        "Respond with a single decimal number only (no units). "
+        "Format your final line as `Answer: <number>`."
+    ),
+    "list": (
+        "Respond with a comma-separated list of items, no extra text. "
+        "Format your final line as `Answer: item1, item2, item3`."
+    ),
+    "none": (
+        "If the answer cannot be determined from the document, say so explicitly. "
+        "Format your final line as `Answer: Not answerable`."
+    ),
+}
+
+
+def _format_hint(answer_format: str) -> str:
+    fmt = (answer_format or "str").strip().lower()
+    return _FORMAT_HINTS.get(fmt, _FORMAT_HINTS["str"])
+
+
+_BASE_INSTRUCTION = (
+    "You are a document-understanding assistant. Use ONLY the provided "
+    "document to answer the question. The document may contain text, "
+    "tables, charts, figures, and images. If the answer is in a chart "
+    "or image, read it carefully. Do not use external knowledge."
+)
+
+
+def build_native_pdf_prompt(question: str, *, answer_format: str) -> str:
+    """Prompt for ``NativePdfArm`` — PDF attached separately as a file part."""
+
+    return (
+        f"{_BASE_INSTRUCTION}\n\n"
+        f"Question: {question.strip()}\n\n"
+        f"{_format_hint(answer_format)}\n"
+    )
+
+
+def build_surfsense_prompt(question: str, *, answer_format: str) -> str:
+    """Prompt for ``SurfSenseArm`` — chunks retrieved by the agent."""
+
+    # SurfSense's agent already injects retrieved chunks via its tool
+    # loop; the prompt only carries the user-visible question + format
+    # hint, mirroring how a human asks the SurfSense UI.
+    return (
+        f"{_BASE_INSTRUCTION}\n\n"
+        f"Question: {question.strip()}\n\n"
+        f"{_format_hint(answer_format)}\n"
+    )
+
+
+def build_long_context_prompt(
+    question: str,
+    *,
+    answer_format: str,
+    document_markdown: str,
+    document_label: str,
+) -> str:
+    """Prompt for the four long-context arms — markdown stuffed inline.
+
+    ``document_label`` is a short human-readable name (e.g. the PDF
+    filename) so the model can reason about source provenance even
+    though only one document is in scope.
+    """
+
+    return (
+        f"{_BASE_INSTRUCTION}\n\n"
+        f"<document name=\"{document_label}\">\n"
+        f"{document_markdown.strip()}\n"
+        f"</document>\n\n"
+        f"Question: {question.strip()}\n\n"
+        f"{_format_hint(answer_format)}\n"
+    )
+
+
+__all__ = [
+    "build_long_context_prompt",
+    "build_native_pdf_prompt",
+    "build_surfsense_prompt",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/runner.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/runner.py
new file mode 100644
index 0000000000..e71dffa651
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/runner.py
@@ -0,0 +1,797 @@
+"""parser_compare runner — six-arm head-to-head on n shared questions.
+
+For each (PDF, question) pair we issue six LLM calls (all sonnet 4.5):
+
+* ``native_pdf``           — PDF attached natively.
+* ``azure_basic_lc``       — Azure prebuilt-read markdown stuffed.
+* ``azure_premium_lc``     — Azure prebuilt-layout markdown stuffed.
+* ``llamacloud_basic_lc``  — LlamaParse parse_page_with_llm markdown stuffed.
+* ``llamacloud_premium_lc`` — LlamaParse parse_page_with_agent markdown stuffed.
+* ``surfsense_agentic``    — SurfSense /api/v1/new_chat retrieval over chunks.
+
+Cost reporting:
+
+* ``llm_cost_per_q``       — mean OpenRouter ``usage.cost`` reported by
+                              the chat-completions API. Zero for the
+                              SurfSense agentic arm because the SSE
+                              stream doesn't surface per-call cost yet
+                              (a known gap; we annotate it in the
+                              report rather than estimating).
+* ``preprocess_cost_total`` — pages * $/1k according to the user's
+                              tariff:
+                                * basic   : $1   / 1k pages
+                                * premium : $10  / 1k pages
+                                * native_pdf : $0  (no preprocessing)
+                                * surfsense_agentic : $10 / 1k pages
+                                  (existing mmlongbench ingest used
+                                  processing_mode=premium with Azure DI).
+* ``preprocess_cost_per_q`` — preprocess_cost_total / n_questions.
+* ``total_cost_per_q``      — llm_cost_per_q + preprocess_cost_per_q.
+
+The grader is reused from ``mmlongbench/grader.py`` (deterministic,
+format-aware) so the metric is directly comparable to the existing
+mmlongbench runs.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import (
+    ArmRequest,
+    ArmResult,
+    BareLlmArm,
+    NativePdfArm,
+    SurfSenseArm,
+)
+from ....core.config import utc_iso_timestamp
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+from ....core.providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from ....core.registry import ReportSection, RunArtifact, RunContext
+from ..mmlongbench.grader import GradeResult, grade
+from .ingest import PARSER_ARMS
+from .prompt import (
+    build_long_context_prompt,
+    build_native_pdf_prompt,
+    build_surfsense_prompt,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Cost tariff (per the user's spec: $1 / 1k pages basic, $10 / 1k pages premium).
+# Held as dollars-per-page so per-PDF math is a pure multiply.
+PREPROCESS_USD_PER_PAGE = {
+    "basic":   1.0  / 1000.0,
+    "premium": 10.0 / 1000.0,
+}
+
+ARM_NAMES = (
+    "native_pdf",
+    "azure_basic_lc",
+    "azure_premium_lc",
+    "llamacloud_basic_lc",
+    "llamacloud_premium_lc",
+    "surfsense_agentic",
+)
+
+# What ingest mode each LC arm corresponds to (drives preprocess cost).
+_LC_ARM_MODE: dict[str, str] = {
+    "azure_basic_lc": "basic",
+    "azure_premium_lc": "premium",
+    "llamacloud_basic_lc": "basic",
+    "llamacloud_premium_lc": "premium",
+}
+
+# The SurfSense agentic arm is fed by the existing mmlongbench
+# ingestion. That ingestion was performed with vision_llm=on and
+# processing_mode=premium, and the backend's ETL routes premium-mode
+# PDFs through Azure DI prebuilt-layout when AZURE_DI_KEY is set. So
+# the preprocessing cost is the premium tariff.
+SURFSENSE_INGEST_MODE = "premium"
+
+
+# ---------------------------------------------------------------------------
+# Question + PDF row shapes
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class PCQuestion:
+    qid: str
+    doc_id: str
+    question: str
+    gold_answer: str
+    answer_format: str
+    pdf_path: Path
+    document_id: int | None
+    pages: int
+    extractions: dict[str, Path]  # arm_name -> markdown path (only successes)
+
+
+def _read_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
+    out: dict[str, dict[str, Any]] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            out[str(row["doc_id"])] = row
+    return out
+
+
+def _select_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    docs_filter: list[str] | None,
+    sample_per_doc: int,
+    skip_unanswerable: bool,
+    skip_format: list[str] | None,
+) -> list[PCQuestion]:
+    """Pick the first ``sample_per_doc`` questions per PDF in scope.
+
+    Defaults to one per PDF (n=5 across 5 PDFs ⇒ 5 questions). Filters
+    out unanswerable probes by default since they're noise at small n.
+    """
+
+    out: list[PCQuestion] = []
+    per_doc_taken: dict[str, int] = {}
+    per_doc_idx: dict[str, int] = {}
+    skip_format_set = {f.lower() for f in (skip_format or [])}
+
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            doc_id = str(row.get("doc_id") or "")
+            if not doc_id:
+                continue
+            if docs_filter and doc_id not in docs_filter:
+                continue
+            map_row = doc_map.get(doc_id)
+            if map_row is None:
+                continue
+
+            answer_format = str(row.get("answer_format") or "").strip().lower()
+            idx = per_doc_idx.get(doc_id, 0)
+            per_doc_idx[doc_id] = idx + 1
+
+            if skip_unanswerable and answer_format == "none":
+                continue
+            if answer_format in skip_format_set:
+                continue
+
+            if per_doc_taken.get(doc_id, 0) >= sample_per_doc:
+                continue
+
+            extractions: dict[str, Path] = {}
+            for arm_name, ext_blob in (map_row.get("extractions") or {}).items():
+                if ext_blob.get("status") == "ok" and ext_blob.get("markdown_path"):
+                    extractions[arm_name] = Path(ext_blob["markdown_path"])
+
+            out.append(PCQuestion(
+                qid=f"{doc_id}::Q{idx:03d}",
+                doc_id=doc_id,
+                question=str(row.get("question") or "").strip(),
+                gold_answer=str(row.get("answer") or "").strip(),
+                answer_format=answer_format,
+                pdf_path=Path(map_row["pdf_path"]),
+                document_id=map_row.get("document_id"),
+                pages=int(map_row.get("pages", 0)),
+                extractions=extractions,
+            ))
+            per_doc_taken[doc_id] = per_doc_taken.get(doc_id, 0) + 1
+
+    out.sort(key=lambda q: (q.doc_id, q.qid))
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Bounded concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+_DESCRIPTION = (
+    "parser_compare — 6-arm head-to-head on shared MMLongBench-Doc PDFs: "
+    "native PDF + (Azure DI / LlamaCloud) x (basic / premium) long-context "
+    "stuffing + SurfSense agentic retrieval. Reports preprocessing dollars "
+    "($1 / 1k pages basic, $10 / 1k pages premium) on top of LLM cost."
+)
+
+
+class ParserCompareBenchmark:
+    """6-arm parser + agentic-vs-non-agentic head-to-head."""
+
+    suite: str = "multimodal_doc"
+    name: str = "parser_compare"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    # ------------------------------------------------------------------
+    # CLI flags
+    # ------------------------------------------------------------------
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--docs", default=None,
+            help="Comma-separated doc_ids to include (default: all in manifest).",
+        )
+        parser.add_argument(
+            "--sample-per-doc", type=int, default=1,
+            help="Take the first N answerable questions per PDF (default 1).",
+        )
+        parser.add_argument(
+            "--skip-unanswerable", dest="skip_unanswerable",
+            action="store_true", default=True,
+            help="Drop 'None' format probes (default true; we want signal not "
+                 "hallucination probes for n=5).",
+        )
+        parser.add_argument(
+            "--include-unanswerable", dest="skip_unanswerable",
+            action="store_false",
+            help="Override --skip-unanswerable; include unanswerable probes too.",
+        )
+        parser.add_argument(
+            "--skip-format", default=None,
+            help="Comma-separated answer_format values to skip (e.g. 'none,float').",
+        )
+        parser.add_argument(
+            "--concurrency", type=int, default=2,
+            help="Parallel question workers per arm (default 2).",
+        )
+        parser.add_argument(
+            "--no-mentions", dest="no_mentions", action="store_true",
+            help="SurfSense arm: skip mentioned_document_ids (full-corpus retrieval).",
+        )
+        parser.add_argument(
+            "--pdf-engine", default="native",
+            choices=[e.value for e in PdfEngine],
+            help="OpenRouter file-parser engine for native_pdf arm.",
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for every arm.",
+        )
+        parser.add_argument(
+            "--llm-model", default="anthropic/claude-sonnet-4.5",
+            help="OpenRouter slug used by the 5 OpenRouter-driven arms. "
+                 "SurfSense arm uses whatever provider_model is pinned on the suite.",
+        )
+        parser.add_argument(
+            "--skip-arms", default=None,
+            help="Comma-separated arm names to skip (e.g. 'llamacloud_premium_lc').",
+        )
+        # Ingest-only flags (forwarded by the CLI to ingest.run_ingest).
+        parser.add_argument(
+            "--max-docs", type=int, default=None,
+            help="(ingest only) cap number of unique PDFs to process.",
+        )
+        parser.add_argument(
+            "--force-reextract", action="store_true",
+            help="(ingest only) re-call parsers even if cached .md exists.",
+        )
+        parser.add_argument(
+            "--pdf-concurrency", type=int, default=2,
+            help="(ingest only) parallel PDFs (each fans out to 4 parsers).",
+        )
+
+    # ------------------------------------------------------------------
+    # Lifecycle: ingest delegates to .ingest.run_ingest
+    # ------------------------------------------------------------------
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        docs_raw: str | None = opts.get("docs")
+        docs_filter = (
+            [d.strip() for d in docs_raw.split(",") if d.strip()] if docs_raw else None
+        )
+        await run_ingest(
+            ctx,
+            docs_filter=docs_filter,
+            max_docs=opts.get("max_docs"),
+            force_reextract=bool(opts.get("force_reextract", False)),
+            pdf_concurrency=int(opts.get("pdf_concurrency") or 2),
+        )
+
+    # ------------------------------------------------------------------
+    # Run
+    # ------------------------------------------------------------------
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        docs_raw: str | None = opts.get("docs")
+        docs_filter = (
+            [d.strip() for d in docs_raw.split(",") if d.strip()] if docs_raw else None
+        )
+        sample_per_doc = int(opts.get("sample_per_doc") or 1)
+        skip_unanswerable = bool(opts.get("skip_unanswerable", True))
+        skip_format_raw: str | None = opts.get("skip_format")
+        skip_format = (
+            [f.strip() for f in skip_format_raw.split(",") if f.strip()]
+            if skip_format_raw else None
+        )
+        concurrency = int(opts.get("concurrency") or 2)
+        no_mentions = bool(opts.get("no_mentions"))
+        pdf_engine_name = opts.get("pdf_engine") or "native"
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+        llm_model = str(opts.get("llm_model") or "anthropic/claude-sonnet-4.5")
+        skip_arms_raw: str | None = opts.get("skip_arms")
+        skip_arms = (
+            {a.strip() for a in skip_arms_raw.split(",") if a.strip()}
+            if skip_arms_raw else set()
+        )
+
+        active_arms = [a for a in ARM_NAMES if a not in skip_arms]
+        if not active_arms:
+            raise RuntimeError("All arms skipped; nothing to run.")
+
+        bench_dir = ctx.benchmark_data_dir()
+        # parser_compare reuses mmlongbench's questions.jsonl (already
+        # downloaded by `ingest multimodal_doc mmlongbench`).
+        questions_jsonl = bench_dir.parent / "mmlongbench" / "questions.jsonl"
+        map_path = ctx.maps_dir() / "parser_compare_doc_map.jsonl"
+        if not questions_jsonl.exists():
+            raise RuntimeError(
+                "Missing mmlongbench questions at "
+                f"{questions_jsonl}. Run "
+                "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first."
+            )
+        if not map_path.exists():
+            raise RuntimeError(
+                "parser_compare doc map missing. Run "
+                "`python -m surfsense_evals ingest multimodal_doc parser_compare` first."
+            )
+
+        doc_map = _read_doc_map(map_path)
+        questions = _select_questions(
+            questions_jsonl, doc_map,
+            docs_filter=docs_filter,
+            sample_per_doc=sample_per_doc,
+            skip_unanswerable=skip_unanswerable,
+            skip_format=skip_format,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No questions matched filters; broaden --docs / --skip-format."
+            )
+        logger.info(
+            "parser_compare: scheduled %d questions across %d arms (%s)",
+            len(questions), len(active_arms), ",".join(active_arms),
+        )
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError("OPENROUTER_API_KEY env var is required.")
+
+        # Build arms
+        arms: dict[str, Any] = {}
+        if "native_pdf" in active_arms:
+            native_provider = OpenRouterPdfProvider(
+                api_key=api_key, base_url=ctx.config.openrouter_base_url,
+                model=llm_model, engine=PdfEngine(pdf_engine_name),
+            )
+            arms["native_pdf"] = NativePdfArm(
+                provider=native_provider, max_output_tokens=max_output_tokens,
+            )
+        for arm_name, _, _ in PARSER_ARMS:
+            if arm_name in active_arms:
+                lc_provider = OpenRouterChatProvider(
+                    api_key=api_key, base_url=ctx.config.openrouter_base_url,
+                    model=llm_model,
+                )
+                arms[arm_name] = BareLlmArm(
+                    provider=lc_provider,
+                    max_output_tokens=max_output_tokens,
+                    name=arm_name,
+                )
+        if "surfsense_agentic" in active_arms:
+            surf = SurfSenseArm(
+                client=ctx.new_chat_client(),
+                search_space_id=ctx.search_space_id,
+                ephemeral_threads=True,
+            )
+            # Override the default "surfsense" name so the metrics
+            # bucket lines up with the rest of parser_compare's arms.
+            surf.name = "surfsense_agentic"
+            arms["surfsense_agentic"] = surf
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        # ---- per-arm answer coroutine helpers ----
+
+        def _native_req(q: PCQuestion) -> ArmRequest:
+            return ArmRequest(
+                question_id=q.qid,
+                prompt=build_native_pdf_prompt(q.question, answer_format=q.answer_format),
+                pdf_paths=[q.pdf_path],
+                options={"max_tokens": max_output_tokens},
+            )
+
+        def _lc_req(q: PCQuestion, arm_name: str) -> ArmRequest:
+            md_path = q.extractions.get(arm_name)
+            if md_path is None or not md_path.exists():
+                raise FileNotFoundError(
+                    f"Missing extraction for {arm_name} on {q.doc_id}"
+                )
+            markdown = md_path.read_text(encoding="utf-8")
+            return ArmRequest(
+                question_id=q.qid,
+                prompt=build_long_context_prompt(
+                    q.question,
+                    answer_format=q.answer_format,
+                    document_markdown=markdown,
+                    document_label=q.doc_id,
+                ),
+            )
+
+        def _surf_req(q: PCQuestion) -> ArmRequest:
+            mentions: list[int] | None = None
+            if not no_mentions and q.document_id is not None:
+                mentions = [int(q.document_id)]
+            return ArmRequest(
+                question_id=q.qid,
+                prompt=build_surfsense_prompt(q.question, answer_format=q.answer_format),
+                mentioned_document_ids=mentions,
+            )
+
+        async def _answer_one(arm_name: str, q: PCQuestion) -> ArmResult:
+            arm = arms[arm_name]
+            try:
+                if arm_name == "native_pdf":
+                    return await arm.answer(_native_req(q))
+                if arm_name == "surfsense_agentic":
+                    return await arm.answer(_surf_req(q))
+                return await arm.answer(_lc_req(q, arm_name))
+            except FileNotFoundError as exc:
+                return ArmResult(
+                    arm=arm_name,
+                    question_id=q.qid,
+                    raw_text="",
+                    error=f"FileNotFoundError: {exc}",
+                )
+
+        # Run all arms in parallel (each arm bounded by `concurrency`).
+        per_arm_tasks: dict[str, list] = {
+            arm_name: [_answer_one(arm_name, q) for q in questions]
+            for arm_name in active_arms
+        }
+        per_arm_results: dict[str, list[ArmResult]] = {}
+        gathered = await asyncio.gather(*[
+            _gather_with_limit(per_arm_tasks[arm_name], concurrency=concurrency)
+            for arm_name in active_arms
+        ])
+        for arm_name, results in zip(active_arms, gathered, strict=True):
+            per_arm_results[arm_name] = results
+
+        # Grade
+        per_arm_grades: dict[str, list[GradeResult]] = {}
+        for arm_name in active_arms:
+            per_arm_grades[arm_name] = [
+                grade(
+                    pred=extract_freeform_answer(r.raw_text or ""),
+                    gold=q.gold_answer,
+                    answer_format=q.answer_format,
+                )
+                for q, r in zip(questions, per_arm_results[arm_name], strict=True)
+            ]
+
+        # Persist raw.jsonl
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for i, q in enumerate(questions):
+                base = {
+                    "qid": q.qid,
+                    "doc_id": q.doc_id,
+                    "answer_format": q.answer_format,
+                    "gold": q.gold_answer,
+                    "pages": q.pages,
+                    "document_id": q.document_id,
+                }
+                for arm_name in active_arms:
+                    res = per_arm_results[arm_name][i]
+                    g = per_arm_grades[arm_name][i]
+                    fh.write(json.dumps({
+                        **base,
+                        **res.to_jsonl(),
+                        "graded": {
+                            "correct": g.correct,
+                            "f1": g.f1,
+                            "method": g.method,
+                            "normalised_pred": g.normalised_pred,
+                            "normalised_gold": g.normalised_gold,
+                        },
+                    }) + "\n")
+
+        # Aggregate per-arm metrics + cost
+        metrics = _compute_metrics(
+            questions, per_arm_results, per_arm_grades, active_arms,
+        )
+
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "n_pdfs": len({q.doc_id for q in questions}),
+                "active_arms": list(active_arms),
+                "concurrency": concurrency,
+                "no_mentions": no_mentions,
+                "pdf_engine": pdf_engine_name,
+                "llm_model": llm_model,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "preprocess_tariff": {
+                    "basic_per_1k_pages": 1.0,
+                    "premium_per_1k_pages": 10.0,
+                },
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    # ------------------------------------------------------------------
+    # Report
+    # ------------------------------------------------------------------
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="Parser × agent-vs-stuffing comparison",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        extra = latest.extra
+        per_arm = m.get("per_arm", {})
+        active_arms = list(extra.get("active_arms", per_arm.keys()))
+
+        n_q = extra.get("n_questions", "?")
+        n_pdfs = extra.get("n_pdfs", "?")
+
+        body: list[str] = []
+        body.append(
+            f"- Sample size: **{n_q} questions across {n_pdfs} PDFs** "
+            f"(LLM: `{extra.get('llm_model', '?')}`, "
+            f"engine: `{extra.get('pdf_engine', 'native')}`)."
+        )
+        body.append(
+            f"- Preprocess tariff: basic = $1 / 1k pages, "
+            f"premium = $10 / 1k pages."
+        )
+        body.append("")
+        body.append("### Per-arm summary")
+        body.append("")
+        body.append(
+            "| Arm | Accuracy | F1 mean | LLM $/Q | Preprocess $ total | Total $/Q | Latency p50 |"
+        )
+        body.append("|---|---:|---:|---:|---:|---:|---:|")
+        for arm_name in active_arms:
+            row = per_arm.get(arm_name)
+            if not row:
+                body.append(f"| `{arm_name}` | (no data) | | | | | |")
+                continue
+            body.append(
+                f"| `{arm_name}` "
+                f"| {row['accuracy']*100:.1f}% "
+                f"({row['n_correct']}/{row['n']}) "
+                f"| {row['f1_mean']*100:.1f}% "
+                f"| ${row['llm_cost_per_q']:.4f} "
+                f"| ${row['preprocess_cost_total']:.4f} "
+                f"| ${row['total_cost_per_q']:.4f} "
+                f"| {row['latency_ms_median']/1000:.1f}s |"
+            )
+        body.append("")
+
+        # Notes / caveats
+        body.append("### Notes")
+        body.append("")
+        body.append(
+            "- `surfsense_agentic` LLM cost shows as $0.0000 because the "
+            "`/api/v1/new_chat` SSE stream does not surface per-call token "
+            "or cost yet (a known instrumentation gap). Preprocessing cost "
+            "is the premium tariff because the underlying mmlongbench "
+            "ingestion was performed with `processing_mode=premium` + "
+            "`vision_llm=on` + Azure DI."
+        )
+        body.append(
+            "- Long-context arms include the **same PDF text** for every "
+            "question against that PDF, so the OpenRouter input cost is "
+            "dominated by markdown size; preprocessing cost is paid once "
+            "across all questions sharing a PDF."
+        )
+        body.append(
+            "- Preprocessing $ total is computed as "
+            "`pages_processed_per_arm × tariff`, summed across the unique "
+            "PDFs in scope. With one question per PDF (n=5), preprocess $ "
+            "= preprocess $ / Q."
+        )
+        if extra.get("scenario"):
+            body.append(
+                f"- Scenario: `{extra.get('scenario')}` "
+                f"(suite-pinned `provider_model`: "
+                f"`{extra.get('provider_model', '?')}`)."
+            )
+
+        # Per-PDF breakdown if useful
+        per_pdf = m.get("per_pdf", {})
+        if per_pdf:
+            body.append("")
+            body.append("### Per-PDF correctness")
+            body.append("")
+            header = "| Doc | Pages | " + " | ".join(f"`{a}`" for a in active_arms) + " |"
+            sep = "|---|---:|" + "|".join(":---:" for _ in active_arms) + "|"
+            body.append(header)
+            body.append(sep)
+            for doc_id, info in sorted(per_pdf.items()):
+                row_cells = []
+                for arm_name in active_arms:
+                    g = info.get("arms", {}).get(arm_name, {})
+                    if not g:
+                        row_cells.append("?")
+                    else:
+                        row_cells.append("✓" if g.get("correct") else "✗")
+                body.append(
+                    f"| `{doc_id}` | {info.get('pages', '?')} | "
+                    + " | ".join(row_cells) + " |"
+                )
+
+        return ReportSection(
+            title="Parser × agent-vs-stuffing — long PDFs (sonnet 4.5)",
+            headline=True,
+            body_md="\n".join(body),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Metrics
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[PCQuestion],
+    per_arm_results: dict[str, list[ArmResult]],
+    per_arm_grades: dict[str, list[GradeResult]],
+    active_arms: Iterable[str],
+) -> dict[str, Any]:
+    """Aggregate per-arm metrics + the user's preprocessing cost overlay."""
+
+    # Sum unique PDF pages — preprocessing pays per unique PDF, not per question.
+    pdf_pages: dict[str, int] = {}
+    for q in questions:
+        pdf_pages.setdefault(q.doc_id, q.pages)
+
+    per_arm: dict[str, dict[str, Any]] = {}
+    for arm_name in active_arms:
+        results = per_arm_results[arm_name]
+        grades = per_arm_grades[arm_name]
+        n = len(grades)
+        n_correct = sum(1 for g in grades if g.correct)
+        f1_sum = sum(g.f1 for g in grades)
+        acc_with_ci = accuracy_with_wilson_ci(n_correct, n)
+
+        # LLM cost: sum of per-call cost_micros across questions, then average.
+        cost_micros_total = sum(int(r.cost_micros or 0) for r in results)
+        llm_cost_per_q = (cost_micros_total / 1_000_000.0) / n if n else 0.0
+
+        # Preprocessing cost depends on which mode this arm corresponds to.
+        if arm_name == "native_pdf":
+            preprocess_per_page = 0.0
+            preprocess_label = "n/a (PDF attached natively)"
+        elif arm_name in _LC_ARM_MODE:
+            mode = _LC_ARM_MODE[arm_name]
+            preprocess_per_page = PREPROCESS_USD_PER_PAGE[mode]
+            preprocess_label = f"{mode} tier ($/{mode}/page = ${preprocess_per_page:.4f})"
+        elif arm_name == "surfsense_agentic":
+            preprocess_per_page = PREPROCESS_USD_PER_PAGE[SURFSENSE_INGEST_MODE]
+            preprocess_label = (
+                f"{SURFSENSE_INGEST_MODE} tier (ingested by SurfSense at "
+                f"processing_mode=premium + vision_llm=on)"
+            )
+        else:
+            preprocess_per_page = 0.0
+            preprocess_label = "unknown"
+
+        preprocess_cost_total = sum(
+            pages * preprocess_per_page for pages in pdf_pages.values()
+        )
+        preprocess_cost_per_q = preprocess_cost_total / n if n else 0.0
+        total_cost_per_q = llm_cost_per_q + preprocess_cost_per_q
+
+        latencies = sorted(int(r.latency_ms or 0) for r in results)
+        latency_median = latencies[len(latencies) // 2] if latencies else 0
+        latency_p95 = latencies[int(len(latencies) * 0.95)] if len(latencies) >= 20 else (
+            latencies[-1] if latencies else 0
+        )
+
+        in_tokens = [int(r.input_tokens or 0) for r in results]
+        out_tokens = [int(r.output_tokens or 0) for r in results]
+
+        per_arm[arm_name] = {
+            **acc_with_ci.to_dict(),
+            "n": n,
+            "n_correct": n_correct,
+            "f1_mean": f1_sum / n if n else 0.0,
+            "llm_cost_per_q": llm_cost_per_q,
+            "preprocess_per_page_usd": preprocess_per_page,
+            "preprocess_cost_total": preprocess_cost_total,
+            "preprocess_cost_per_q": preprocess_cost_per_q,
+            "total_cost_per_q": total_cost_per_q,
+            "preprocess_label": preprocess_label,
+            "latency_ms_median": latency_median,
+            "latency_ms_p95": latency_p95,
+            "input_tokens_mean": (sum(in_tokens) / len(in_tokens)) if in_tokens else 0.0,
+            "output_tokens_mean": (sum(out_tokens) / len(out_tokens)) if out_tokens else 0.0,
+        }
+
+    # Per-PDF breakdown (correct / not for each arm)
+    per_pdf: dict[str, dict[str, Any]] = {}
+    for i, q in enumerate(questions):
+        slot = per_pdf.setdefault(q.doc_id, {
+            "pages": q.pages,
+            "arms": {},
+        })
+        for arm_name in active_arms:
+            slot["arms"].setdefault(arm_name, {
+                "correct": per_arm_grades[arm_name][i].correct,
+                "f1": per_arm_grades[arm_name][i].f1,
+            })
+
+    return {
+        "per_arm": per_arm,
+        "per_pdf": per_pdf,
+        "n_questions": len(questions),
+        "n_unique_pdfs": len(pdf_pages),
+        "total_pages_in_scope": sum(pdf_pages.values()),
+    }
+
+
+__all__ = ["ParserCompareBenchmark", "PCQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/__init__.py b/surfsense_evals/src/surfsense_evals/suites/research/__init__.py
new file mode 100644
index 0000000000..03fe24c028
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/__init__.py
@@ -0,0 +1,18 @@
+"""Research / multi-document RAG benchmarks.
+
+Distinct from ``multimodal_doc`` (PDF-bound) and ``medical`` (one
+question = one source PDF). Benchmarks here put *retrieval and
+reasoning across many documents* in the critical path — the regime
+where SurfSense's chunk-level RAG should shine versus "pour the
+entire document into the LLM" or "ask the LLM cold".
+
+* ``frames`` (google/frames-benchmark) — 824 multi-hop Wikipedia
+  questions; tests bare-LLM vs SurfSense over a shared ~330-doc
+  corpus.
+* ``crag``   (facebookresearch/CRAG, KDD Cup 2024) — 2,706 web QA
+  pairs with 5 pre-retrieved HTML pages each; tests bare-LLM vs
+  long-context-stuffed LLM vs SurfSense over the question's 5
+  scoped pages — the closest comparison to a competing RAG product.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/__init__.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/__init__.py
new file mode 100644
index 0000000000..b9658ef688
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/__init__.py
@@ -0,0 +1,44 @@
+"""CRAG — Comprehensive RAG Benchmark (Yang et al., Meta, KDD Cup 2024).
+
+Source: https://github.com/facebookresearch/CRAG (Tasks 1 & 2)
+Paper:  https://arxiv.org/abs/2406.04744
+
+CRAG ships ~2,706 factual QA pairs, each paired with **5 full HTML
+pages** retrieved as the top-5 of a real web search at ``query_time``.
+The benchmark spans 5 domains (finance, music, movie, sports, open)
+and 8 question types (simple, comparison, aggregation, set, multi-hop,
+post-processing, false_premise, simple_w_condition) — heads/torsos/
+tails of entity popularity — and an explicit static→real-time
+freshness axis.
+
+Why CRAG demonstrates SurfSense more clearly than FRAMES
+--------------------------------------------------------
+FRAMES tested SurfSense vs. *no retrieval at all* — a fair "naive
+prompting" baseline (the published 40.8% number) but not a competing
+RAG product. CRAG enables a three-way comparison:
+
+* ``bare_llm``      — chat completion with the question only. CRAG
+  paper: ≤34% accuracy ("LLM cold").
+* ``long_context``  — stuff all 5 extracted page texts straight into
+  the prompt (the "naive RAG" / "straightforward RAG" arm in the
+  paper). Published baseline: ~44%.
+* ``surfsense``     — POST ``/api/v1/new_chat`` with retrieval scoped
+  to the question's 5 ingested pages (``mentioned_document_ids``).
+
+So the headline becomes "SurfSense vs. context-stuffed long-context
+LLM, both fed the same 5 pages" — a head-to-head against the simplest
+realistic RAG strategy, not against an unarmed model.
+
+Scoring follows the CRAG paper: each prediction is graded as
+**correct** (+1), **missing/I-don't-know** (0), or **incorrect** (-1),
+and the headline metric is the *Truthfulness Score*:
+``(#correct - #incorrect) / total`` — penalising hallucinations
+relative to refusals.
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import CragBenchmark
+
+_registry.register(CragBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset.py
new file mode 100644
index 0000000000..224dcae5c2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset.py
@@ -0,0 +1,335 @@
+"""CRAG dataset loader — download ``crag_task_1_and_2_dev_v4.jsonl.bz2`` and parse.
+
+The CRAG repo (``facebookresearch/CRAG``) ships Tasks 1 & 2 as a
+single bzip2-compressed JSONL on GitHub raw. Each row carries:
+
+* ``interaction_id``    — opaque per-question id (we keep verbatim)
+* ``query_time``        — wall clock of the original web search
+* ``domain``            — finance | music | movie | sports | open
+* ``question_type``     — simple | comparison | aggregation | set |
+                          multi-hop | post-processing | false_premise |
+                          simple_w_condition
+* ``static_or_dynamic`` — static | slow-changing | fast-changing | real-time
+* ``query``             — the question
+* ``answer``            — gold short answer
+* ``alt_ans``           — list[str] of alternative valid answers
+                          (paraphrases / synonyms / unit variants)
+* ``split``             — 0 = validation, 1 = public test
+* ``popularity``        — head | torso | tail (KG questions); empty for web
+* ``search_results``    — list of up to 5 ``{page_name, page_url,
+                          page_snippet, page_result, page_last_modified}``;
+                          ``page_result`` is full HTML.
+
+We materialise this into ``CragQuestion`` objects keeping ``pages`` as
+a list of ``CragPage`` so downstream ingest can save each as its own
+file and SurfSense can dedupe on filename.
+"""
+
+from __future__ import annotations
+
+import bz2
+import hashlib
+import io
+import json
+import logging
+import urllib.request
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+# Tasks 1 & 2 share the same JSONL on the public CRAG repo.
+CRAG_TASK_1_2_URL = (
+    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
+    "crag_task_1_and_2_dev_v4.jsonl.bz2"
+)
+CRAG_TASK_1_2_FILENAME = "crag_task_1_and_2_dev_v4.jsonl.bz2"
+
+
+# ---------------------------------------------------------------------------
+# Question / page dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CragPage:
+    """One of the up-to-5 pre-retrieved web pages for a CRAG question."""
+
+    page_name: str
+    page_url: str
+    page_snippet: str
+    page_html: str
+    page_last_modified: str | None = None
+
+    @property
+    def url_hash(self) -> str:
+        """Stable 12-hex digest of the page URL for filename keys.
+
+        We can't use the raw URL as a filename (slashes, query strings,
+        unicode), and we *do* want collision-safety across the whole
+        ingest sample. ``sha1[:12]`` gives us 48 bits of namespace
+        which is overkill for a corpus capped at a few thousand pages.
+        """
+
+        return hashlib.sha1(self.page_url.encode("utf-8")).hexdigest()[:12]
+
+
+@dataclass
+class CragQuestion:
+    """One row of CRAG (Tasks 1 & 2)."""
+
+    qid: str                          # synthesised "C00000".."C02705"
+    interaction_id: str
+    query_time: str
+    query: str
+    gold_answer: str
+    alt_answers: list[str]
+    domain: str
+    question_type: str
+    static_or_dynamic: str
+    popularity: str                   # may be "" for web-sourced questions
+    split: int                        # 0=validation, 1=public_test
+    raw_index: int                    # row index in the source JSONL
+    pages: list[CragPage] = field(default_factory=list)
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "qid": self.qid,
+            "interaction_id": self.interaction_id,
+            "query_time": self.query_time,
+            "query": self.query,
+            "gold_answer": self.gold_answer,
+            "alt_answers": list(self.alt_answers),
+            "domain": self.domain,
+            "question_type": self.question_type,
+            "static_or_dynamic": self.static_or_dynamic,
+            "popularity": self.popularity,
+            "split": self.split,
+            "raw_index": self.raw_index,
+            "n_pages": len(self.pages),
+            "page_urls": [p.page_url for p in self.pages],
+        }
+
+
+# ---------------------------------------------------------------------------
+# Download + decompress
+# ---------------------------------------------------------------------------
+
+
+def download_task_1_2(cache_dir: Path) -> Path:
+    """Download the bz2 archive into ``cache_dir`` (skip if cached).
+
+    Returns the path to the local ``.jsonl.bz2``. We use stdlib
+    ``urllib`` rather than ``httpx`` to keep the download synchronous
+    and trivially resumable (re-running the function is a no-op once
+    the file is on disk and non-empty).
+    """
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    dest = cache_dir / CRAG_TASK_1_2_FILENAME
+    if dest.exists() and dest.stat().st_size > 0:
+        logger.debug("CRAG bz2 already cached at %s", dest)
+        return dest
+
+    logger.info("Downloading CRAG (Tasks 1 & 2) from %s ...", CRAG_TASK_1_2_URL)
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    req = urllib.request.Request(
+        CRAG_TASK_1_2_URL,
+        headers={"User-Agent": "SurfSense-Evals/0.1 (CRAG dataset fetch)"},
+    )
+    with urllib.request.urlopen(req, timeout=600) as response, tmp.open("wb") as fh:
+        chunk = response.read(1 << 20)
+        while chunk:
+            fh.write(chunk)
+            chunk = response.read(1 << 20)
+    tmp.replace(dest)
+    logger.info("CRAG bz2 downloaded: %s (%.1f MiB)", dest, dest.stat().st_size / 1024 / 1024)
+    return dest
+
+
+# ---------------------------------------------------------------------------
+# Parse
+# ---------------------------------------------------------------------------
+
+
+def _parse_pages(raw_search_results: Any) -> list[CragPage]:
+    if not isinstance(raw_search_results, list):
+        return []
+    pages: list[CragPage] = []
+    for entry in raw_search_results:
+        if not isinstance(entry, dict):
+            continue
+        url = str(entry.get("page_url") or "").strip()
+        html = str(entry.get("page_result") or "")
+        if not url or not html.strip():
+            # No URL or empty HTML => useless for retrieval.
+            continue
+        pages.append(CragPage(
+            page_name=str(entry.get("page_name") or "").strip(),
+            page_url=url,
+            page_snippet=str(entry.get("page_snippet") or "").strip(),
+            page_html=html,
+            page_last_modified=(
+                str(entry.get("page_last_modified")).strip()
+                if entry.get("page_last_modified") else None
+            ),
+        ))
+    return pages
+
+
+def _parse_alt_answers(raw: Any) -> list[str]:
+    if isinstance(raw, list):
+        return [str(x).strip() for x in raw if str(x).strip()]
+    if isinstance(raw, str) and raw.strip():
+        return [raw.strip()]
+    return []
+
+
+def iter_questions(jsonl_bz2_path: Path) -> list[CragQuestion]:
+    """Stream-decompress + parse the CRAG JSONL into ``CragQuestion`` objects.
+
+    The bz2 expansion ratio is ~10x and the decompressed file is
+    multi-GB; we therefore decompress *line by line* via
+    ``bz2.open(..., "rt")``. Each row is a single (potentially very
+    large, due to embedded HTML) JSON object. We keep the entire row
+    in memory because we materialise the pages to disk immediately
+    after parsing in the ingest pipeline — the runner never holds
+    more than the current sample's worth of HTML.
+    """
+
+    out: list[CragQuestion] = []
+    with bz2.open(jsonl_bz2_path, mode="rt", encoding="utf-8") as fh:
+        for raw_idx, line in enumerate(fh):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError as exc:
+                logger.warning("Skipping malformed CRAG row %d: %s", raw_idx, exc)
+                continue
+            query = str(row.get("query") or "").strip()
+            answer = str(row.get("answer") or "").strip()
+            if not query or not answer:
+                logger.debug("Skipping CRAG row %d with missing query/answer", raw_idx)
+                continue
+            interaction_id = str(row.get("interaction_id") or "").strip()
+            pages = _parse_pages(row.get("search_results"))
+            out.append(CragQuestion(
+                qid=f"C{raw_idx:05d}",
+                interaction_id=interaction_id,
+                query_time=str(row.get("query_time") or "").strip(),
+                query=query,
+                gold_answer=answer,
+                alt_answers=_parse_alt_answers(row.get("alt_ans")),
+                domain=str(row.get("domain") or "").strip().lower(),
+                question_type=str(row.get("question_type") or "").strip().lower(),
+                static_or_dynamic=str(row.get("static_or_dynamic") or "").strip().lower(),
+                popularity=str(row.get("popularity") or "").strip().lower(),
+                split=int(row.get("split") or 0),
+                raw_index=raw_idx,
+                pages=pages,
+            ))
+    return out
+
+
+def stratified_sample(
+    questions: list[CragQuestion],
+    *,
+    n: int,
+    seed: int = 17,
+) -> list[CragQuestion]:
+    """Take ``n`` questions that roughly preserve the domain × question-type mix.
+
+    CRAG is only ~2.7k rows so naive head-of-list sampling badly
+    over-weights ``finance`` (because the dataset isn't shuffled by
+    domain). We bucket on ``(domain, question_type)`` and round-robin
+    pick from each bucket until we hit ``n`` — this gives every
+    bucket a fair shot and keeps the sample composition stable across
+    re-runs (deterministic via the seeded shuffle inside each bucket).
+    """
+
+    if n <= 0 or n >= len(questions):
+        return list(questions)
+    import random
+
+    rng = random.Random(seed)
+    buckets: dict[tuple[str, str], list[CragQuestion]] = {}
+    for q in questions:
+        buckets.setdefault((q.domain, q.question_type), []).append(q)
+    for items in buckets.values():
+        rng.shuffle(items)
+
+    keys = sorted(buckets.keys())
+    chosen: list[CragQuestion] = []
+    cursor = 0
+    while len(chosen) < n and any(buckets[k] for k in keys):
+        key = keys[cursor % len(keys)]
+        cursor += 1
+        if buckets[key]:
+            chosen.append(buckets[key].pop())
+    chosen.sort(key=lambda q: q.raw_index)
+    return chosen
+
+
+def write_questions_jsonl(questions: list[CragQuestion], dest: Path) -> None:
+    """Persist a parsed copy (without page HTML) under the benchmark data dir."""
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    with dest.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps(q.to_dict()) + "\n")
+
+
+# ---------------------------------------------------------------------------
+# Reading the lightweight questions.jsonl back
+# ---------------------------------------------------------------------------
+
+
+def load_questions_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Re-load the lightweight (no-HTML) questions JSONL from disk."""
+
+    out: list[dict[str, Any]] = []
+    if not path.exists():
+        return out
+    with path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                out.append(json.loads(line))
+            except json.JSONDecodeError:
+                continue
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Convenience: decompress a snippet to memory for tests
+# ---------------------------------------------------------------------------
+
+
+def decompress_to_memory(jsonl_bz2_path: Path) -> io.StringIO:
+    """For tests / one-off scripts: read the whole bz2 into a StringIO.
+
+    Avoids leaking gigabytes; use ``iter_questions`` in production.
+    """
+
+    with bz2.open(jsonl_bz2_path, mode="rb") as fh:
+        return io.StringIO(fh.read().decode("utf-8"))
+
+
+__all__ = [
+    "CRAG_TASK_1_2_FILENAME",
+    "CRAG_TASK_1_2_URL",
+    "CragPage",
+    "CragQuestion",
+    "decompress_to_memory",
+    "download_task_1_2",
+    "iter_questions",
+    "load_questions_jsonl",
+    "stratified_sample",
+    "write_questions_jsonl",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/grader.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/grader.py
new file mode 100644
index 0000000000..63f66702ba
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/grader.py
@@ -0,0 +1,540 @@
+"""CRAG 3-class grader: ``correct`` (+1) / ``missing`` (0) / ``incorrect`` (-1).
+
+The CRAG paper's headline metric is the **Truthfulness Score**:
+
+    score = (#correct - #incorrect) / total
+
+which rewards calibrated abstention — refusing to answer is neutral
+(0), guessing wrong is negative (-1). Grading is therefore a 3-class
+problem rather than the 2-class accuracy used for FRAMES.
+
+Pipeline per (pred, gold, alt_ans, question_type):
+
+1. Detect refusal first (``Answer: I don't know`` / "I don't know" /
+   "no information") → ``missing`` (deterministic, never billed).
+2. ``false_premise`` questions: gold is canonically "the question
+   contains a false premise" — reward any answer that flags the
+   false premise (substring "false premise" / "incorrect premise" /
+   "no such") as correct.
+3. Run the FRAMES-style deterministic shortcut (exact / numeric /
+   substring) on ``pred`` against ``gold ∪ alt_ans``. Hit → correct.
+4. Fall through to the LLM judge (if configured), which returns one
+   of ``{correct, missing, incorrect}`` — verbatim CRAG protocol.
+5. No judge configured → record ``incorrect`` (pessimistic but at
+   least monotone with the deterministic grader).
+
+The judge is throttled by an asyncio.Semaphore so it doesn't outrun
+the OpenRouter rate limit; the pre-judge deterministic pass keeps
+the bill bounded (most easy "Beyoncé"-vs-"Beyoncé Knowles" cases
+short-circuit before we burn judge tokens).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+import string
+from collections.abc import Sequence
+from dataclasses import dataclass
+from typing import Any, Literal
+
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+
+logger = logging.getLogger(__name__)
+
+
+GradeClass = Literal["correct", "missing", "incorrect"]
+
+
+# ---------------------------------------------------------------------------
+# Public type
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CragGradeResult:
+    """One graded (pred, gold) pair under CRAG's 3-class rubric."""
+
+    grade: GradeClass
+    score: int                     # +1 / 0 / -1
+    method: str                    # exact, numeric, substring, refusal,
+                                   # false_premise_correct, false_premise_miss,
+                                   # llm_judge, lexical_miss, ...
+    normalised_pred: str = ""
+    normalised_gold: str = ""
+    judge_rationale: str = ""
+
+    @property
+    def correct(self) -> bool:
+        return self.grade == "correct"
+
+    @property
+    def missing(self) -> bool:
+        return self.grade == "missing"
+
+    @property
+    def incorrect(self) -> bool:
+        return self.grade == "incorrect"
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "grade": self.grade,
+            "score": self.score,
+            "method": self.method,
+            "normalised_pred": self.normalised_pred,
+            "normalised_gold": self.normalised_gold,
+            "judge_rationale": self.judge_rationale,
+        }
+
+
+def _grade_to_score(grade: GradeClass) -> int:
+    return {"correct": 1, "missing": 0, "incorrect": -1}[grade]
+
+
+# ---------------------------------------------------------------------------
+# Normalisation
+# ---------------------------------------------------------------------------
+
+
+_PUNCT_TABLE = str.maketrans({c: " " for c in string.punctuation})
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
+_WS = re.compile(r"\s+")
+
+
+def _normalise(s: str) -> str:
+    s = (s or "").lower()
+    s = s.translate(_PUNCT_TABLE)
+    s = _ARTICLES.sub(" ", s)
+    s = _WS.sub(" ", s).strip()
+    return s
+
+
+_WORD_NUMBERS = {
+    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
+    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11,
+    "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15, "sixteen": 16,
+    "seventeen": 17, "eighteen": 18, "nineteen": 19, "twenty": 20,
+}
+
+_NUMERIC_RE = re.compile(r"-?\d+(?:[.,]\d+)?")
+
+
+def _maybe_number(s: str) -> float | None:
+    """Extract a single numeric value from raw lowercased text."""
+
+    raw = (s or "").strip().lower()
+    if not raw:
+        return None
+    match = _NUMERIC_RE.search(raw)
+    if match:
+        try:
+            return float(match.group(0).replace(",", ""))
+        except ValueError:
+            pass
+    for tok in _normalise(s).split():
+        if tok in _WORD_NUMBERS:
+            return float(_WORD_NUMBERS[tok])
+    return None
+
+
+def _whole_word_substring(haystack: str, needle: str) -> bool:
+    if not needle:
+        return False
+    return f" {needle} " in f" {haystack} "
+
+
+# ---------------------------------------------------------------------------
+# Refusal detection
+# ---------------------------------------------------------------------------
+
+
+_REFUSAL_PATTERNS = [
+    re.compile(r"\bi\s+don'?t\s+know\b", re.IGNORECASE),
+    re.compile(r"\bi\s+do\s+not\s+know\b", re.IGNORECASE),
+    re.compile(r"\bnot\s+enough\s+information\b", re.IGNORECASE),
+    re.compile(r"\binsufficient\s+information\b", re.IGNORECASE),
+    re.compile(r"\bcannot\s+(?:be\s+)?(?:answered|determined)\b", re.IGNORECASE),
+    re.compile(r"\bunable\s+to\s+(?:answer|determine)\b", re.IGNORECASE),
+    re.compile(r"\bno\s+(?:information|data|evidence)\b", re.IGNORECASE),
+]
+
+
+def _is_refusal(pred: str) -> bool:
+    """Cheap deterministic check for "I don't know" -shaped responses."""
+
+    if not pred or not pred.strip():
+        return True  # empty answer is a de facto refusal
+    return any(p.search(pred) for p in _REFUSAL_PATTERNS)
+
+
+# ---------------------------------------------------------------------------
+# False-premise handling
+# ---------------------------------------------------------------------------
+
+
+_FALSE_PREMISE_PATTERNS = [
+    re.compile(r"false\s+premise", re.IGNORECASE),
+    re.compile(r"incorrect\s+premise", re.IGNORECASE),
+    re.compile(r"premise\s+(?:is|of)\s+the\s+question", re.IGNORECASE),
+    re.compile(r"\bno\s+such\b", re.IGNORECASE),
+    re.compile(r"never\s+(?:happened|occurred|existed)", re.IGNORECASE),
+    re.compile(r"\bdid\s+not\s+(?:happen|occur|exist)\b", re.IGNORECASE),
+    re.compile(r"\bdoes\s+not\s+exist\b", re.IGNORECASE),
+    re.compile(r"is\s+not\s+(?:true|correct|accurate)", re.IGNORECASE),
+    re.compile(r"\bisn'?t\s+(?:true|correct|accurate)\b", re.IGNORECASE),
+    re.compile(r"\binvalid\s+(?:premise|question|assumption)\b", re.IGNORECASE),
+]
+
+
+def _flags_false_premise(pred: str) -> bool:
+    return any(p.search(pred) for p in _FALSE_PREMISE_PATTERNS)
+
+
+# ---------------------------------------------------------------------------
+# Deterministic grader
+# ---------------------------------------------------------------------------
+
+
+def grade_deterministic(
+    *,
+    pred: str,
+    gold: str,
+    alt_answers: Sequence[str] = (),
+    question_type: str = "",
+) -> CragGradeResult:
+    """Try to grade without the LLM judge. Returns a final result.
+
+    Always returns *some* result — the caller checks ``method`` to
+    decide whether the LLM judge should overturn it. ``lexical_miss``
+    and ``false_premise_unclear`` are the two methods that trigger the
+    judge fallback.
+    """
+
+    qtype = (question_type or "").lower()
+    n_pred = _normalise(pred)
+    n_gold = _normalise(gold)
+
+    if _is_refusal(pred):
+        # CRAG protocol: refusal is *missing* (0), even on false-premise
+        # questions where one might argue refusal == correct. We
+        # follow the paper's grading literally.
+        return CragGradeResult(
+            grade="missing",
+            score=0,
+            method="refusal",
+            normalised_pred=n_pred,
+            normalised_gold=n_gold,
+        )
+
+    # Empty-gold guard (shouldn't happen, but defensively):
+    if not n_gold:
+        return CragGradeResult(
+            grade="incorrect",
+            score=-1,
+            method="empty_gold",
+            normalised_pred=n_pred,
+            normalised_gold=n_gold,
+        )
+
+    # False-premise questions: gold is typically "the question contains
+    # a false premise" / "no such X" / similar. Any answer that
+    # explicitly flags the false premise is correct.
+    if qtype == "false_premise":
+        if _flags_false_premise(pred):
+            return CragGradeResult(
+                grade="correct",
+                score=1,
+                method="false_premise_flagged",
+                normalised_pred=n_pred,
+                normalised_gold=n_gold,
+            )
+        # If the model commits to *any* concrete answer on a false-
+        # premise question without flagging the premise, it is wrong.
+        # But we don't classify ourselves — let the judge decide on
+        # the off chance the gold itself is e.g. "no" and the pred
+        # is "no" without explicit "false premise" wording.
+        return CragGradeResult(
+            grade="incorrect",
+            score=-1,
+            method="false_premise_unclear",
+            normalised_pred=n_pred,
+            normalised_gold=n_gold,
+        )
+
+    # All non-false-premise questions: try the standard chain against
+    # gold and each alt answer. First match wins.
+    candidates = [gold, *list(alt_answers)]
+    for candidate in candidates:
+        if not candidate or not str(candidate).strip():
+            continue
+        cand_norm = _normalise(candidate)
+        if not cand_norm:
+            continue
+        if n_pred == cand_norm:
+            return CragGradeResult(
+                grade="correct", score=1, method="exact",
+                normalised_pred=n_pred, normalised_gold=cand_norm,
+            )
+        p_num = _maybe_number(pred)
+        c_num = _maybe_number(candidate)
+        if p_num is not None and c_num is not None:
+            # Pure 1% relative tolerance for CRAG (currency, counts,
+            # ratios). Unlike FRAMES (which uses a 0.5 absolute floor
+            # for year-shaped answers), CRAG's numeric questions are
+            # often small-value (stock prices, percentages) where a
+            # 0.5 floor would let "$2.05" match "$2.17". The judge is
+            # the safety net for borderline rounding cases.
+            tol = abs(c_num) * 0.01
+            if abs(p_num - c_num) <= tol:
+                return CragGradeResult(
+                    grade="correct", score=1, method="numeric",
+                    normalised_pred=n_pred, normalised_gold=cand_norm,
+                )
+            # Numeric question with different numbers — keep looking
+            # at other candidates rather than declaring miss now;
+            # alt answers may include word forms that pass.
+        if _whole_word_substring(n_pred, cand_norm):
+            return CragGradeResult(
+                grade="correct", score=1, method="substring",
+                normalised_pred=n_pred, normalised_gold=cand_norm,
+            )
+        if _whole_word_substring(cand_norm, n_pred) and len(n_pred) >= 3:
+            return CragGradeResult(
+                grade="correct", score=1, method="substring_reverse",
+                normalised_pred=n_pred, normalised_gold=cand_norm,
+            )
+
+    return CragGradeResult(
+        grade="incorrect",
+        score=-1,
+        method="lexical_miss",
+        normalised_pred=n_pred,
+        normalised_gold=n_gold,
+    )
+
+
+# ---------------------------------------------------------------------------
+# LLM-as-judge (3-class)
+# ---------------------------------------------------------------------------
+
+
+_JUDGE_SYSTEM = (
+    "You are an impartial grader for short-answer factual questions, "
+    "following the CRAG benchmark rubric. Given a question, the gold "
+    "answer (and any alternative valid answers), and a model's "
+    "prediction, classify the prediction into exactly one of three "
+    "categories:\n\n"
+    "* \"correct\"   — the prediction expresses the same factual "
+    "content as the gold answer (paraphrasing OK; numbers as words "
+    "OK; partial-but-correct names OK; non-contradictory extra "
+    "detail OK).\n"
+    "* \"missing\"   — the prediction explicitly refuses, says \"I "
+    "don't know\", says there is insufficient information, or hedges "
+    "without committing.\n"
+    "* \"incorrect\" — the prediction commits to a fact that is "
+    "different from the gold answer, or fails to flag a false "
+    "premise when the question contains one.\n\n"
+    "Special case: if the question contains a false premise and the "
+    "gold answer says so, then a prediction that flags the false "
+    "premise is \"correct\".\n\n"
+    "Respond with ONLY a JSON object on a single line:\n"
+    '{\"grade\": \"correct\"|\"missing\"|\"incorrect\", \"rationale\": \"<one short sentence>\"}'
+)
+
+
+_JUDGE_TEMPLATE = """\
+Question: {question}
+Question type: {question_type}
+Gold answer: {gold}
+{alt_block}Model prediction: {pred}
+
+Decide whether the prediction is correct, missing, or incorrect.
+"""
+
+
+@dataclass
+class CragJudgeConfig:
+    api_key: str
+    model: str = "anthropic/claude-sonnet-4.5"
+    base_url: str = "https://openrouter.ai/api/v1"
+    max_tokens: int = 200
+    concurrency: int = 4
+
+
+class CragLlmJudge:
+    """Async LLM judge over OpenRouter chat completions, 3-class output."""
+
+    def __init__(self, *, config: CragJudgeConfig) -> None:
+        self._config = config
+        self._provider = OpenRouterChatProvider(
+            api_key=config.api_key,
+            base_url=config.base_url,
+            model=config.model,
+        )
+        self._sem = asyncio.Semaphore(max(1, config.concurrency))
+
+    @property
+    def model(self) -> str:
+        return self._config.model
+
+    async def judge(
+        self,
+        *,
+        question: str,
+        gold: str,
+        alt_answers: Sequence[str],
+        pred: str,
+        question_type: str = "",
+    ) -> tuple[GradeClass, str]:
+        """Return ``(grade, rationale)``. Errors return incorrect + reason."""
+
+        alt_block = ""
+        if alt_answers:
+            alt_lines = "\n".join(f"  - {a}" for a in alt_answers if a)
+            if alt_lines:
+                alt_block = f"Alternative valid answers:\n{alt_lines}\n"
+        prompt = _JUDGE_TEMPLATE.format(
+            question=question,
+            question_type=question_type or "unknown",
+            gold=gold,
+            alt_block=alt_block,
+            pred=pred,
+        )
+        try:
+            async with self._sem:
+                response = await self._provider.complete(
+                    prompt=prompt,
+                    system_prompt=_JUDGE_SYSTEM,
+                    max_tokens=self._config.max_tokens,
+                )
+        except Exception as exc:  # noqa: BLE001
+            return "incorrect", f"judge_error: {type(exc).__name__}: {exc}"
+        return _parse_judge_response(response.text)
+
+
+def _parse_judge_response(text: str) -> tuple[GradeClass, str]:
+    """Parse the judge reply into a 3-class label + rationale."""
+
+    if not text or not text.strip():
+        return "incorrect", "judge_returned_empty"
+    match = re.search(r"\{[^{}]*\}", text, flags=re.DOTALL)
+    candidate = match.group(0) if match else text
+    try:
+        data = json.loads(candidate)
+    except (json.JSONDecodeError, ValueError):
+        lowered = text.strip().lower()
+        if "correct" in lowered and "incorrect" not in lowered:
+            return "correct", "yes (parser_fallback)"
+        if "missing" in lowered or "i don" in lowered:
+            return "missing", "missing (parser_fallback)"
+        return "incorrect", f"unparseable_judge_response: {text[:200]}"
+    raw_grade = str(data.get("grade") or "").strip().lower()
+    rationale = str(data.get("rationale", "")).strip()[:280]
+    if raw_grade in {"correct", "missing", "incorrect"}:
+        return raw_grade, rationale  # type: ignore[return-value]
+    return "incorrect", f"unknown_grade={raw_grade!r}; {rationale}"
+
+
+# ---------------------------------------------------------------------------
+# Combined grader
+# ---------------------------------------------------------------------------
+
+
+# Methods that should *not* trigger the LLM judge — the deterministic
+# verdict is conclusive (refusal, exact match, numeric mismatch, etc.).
+_TERMINAL_METHODS = frozenset({
+    "refusal",
+    "exact",
+    "numeric",
+    "substring",
+    "substring_reverse",
+    "false_premise_flagged",
+    "empty_gold",
+})
+
+
+async def grade_with_judge(
+    *,
+    pred: str,
+    gold: str,
+    alt_answers: Sequence[str],
+    question: str,
+    question_type: str,
+    judge: CragLlmJudge | None,
+) -> CragGradeResult:
+    """One row → deterministic shortcut → optional LLM judge fallback."""
+
+    det = grade_deterministic(
+        pred=pred,
+        gold=gold,
+        alt_answers=alt_answers,
+        question_type=question_type,
+    )
+    if det.method in _TERMINAL_METHODS:
+        return det
+    if judge is None:
+        return det  # ``lexical_miss`` / ``false_premise_unclear`` → keep as-is
+    grade, rationale = await judge.judge(
+        question=question,
+        gold=gold,
+        alt_answers=alt_answers,
+        pred=pred,
+        question_type=question_type,
+    )
+    return CragGradeResult(
+        grade=grade,
+        score=_grade_to_score(grade),
+        method="llm_judge",
+        normalised_pred=det.normalised_pred,
+        normalised_gold=det.normalised_gold,
+        judge_rationale=rationale,
+    )
+
+
+@dataclass
+class CragGradeRow:
+    """One row to grade. Mirrors the FRAMES grader's tuple but typed."""
+
+    qid: str
+    question: str
+    gold: str
+    alt_answers: list[str]
+    pred: str
+    question_type: str = ""
+
+
+async def grade_many(
+    *,
+    rows: Sequence[CragGradeRow],
+    judge: CragLlmJudge | None,
+) -> list[CragGradeResult]:
+    """Grade every row concurrently. Judge enforces its own concurrency cap."""
+
+    if not rows:
+        return []
+    coros = [
+        grade_with_judge(
+            pred=r.pred,
+            gold=r.gold,
+            alt_answers=r.alt_answers,
+            question=r.question,
+            question_type=r.question_type,
+            judge=judge,
+        )
+        for r in rows
+    ]
+    return list(await asyncio.gather(*coros))
+
+
+__all__ = [
+    "CragGradeResult",
+    "CragGradeRow",
+    "CragJudgeConfig",
+    "CragLlmJudge",
+    "GradeClass",
+    "grade_deterministic",
+    "grade_many",
+    "grade_with_judge",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/html_extract.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/html_extract.py
new file mode 100644
index 0000000000..1b00aedc20
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/html_extract.py
@@ -0,0 +1,206 @@
+"""HTML → markdown for CRAG pages, with boilerplate removal.
+
+Each CRAG page is a *full* HTML document (nav, ads, recommended-for-
+you, footer, ...). Without removing that boilerplate, retrieval over
+the chunks would surface menu items and "subscribe to our newsletter"
+boxes instead of the actual page content. We use ``trafilatura``,
+which is purpose-built for main-content extraction (the same library
+Common Crawl downstream pipelines use). It outputs clean prose with
+section headers, lists, and tables preserved.
+
+Extraction policy:
+1. ``trafilatura.extract`` with ``output_format="markdown"`` — main
+   content only, headers preserved, tables kept.
+2. If extraction fails or returns < 200 chars (paywalled / JS-only
+   page / extraction confused), fall back to a plain stdlib
+   ``HTMLParser`` that strips tags and collapses whitespace. Some
+   text is better than no text — SurfSense's chunker handles noisy
+   prose.
+
+We *intentionally* keep the page name and URL as visible H1 / link
+metadata so the SurfSense chunker preserves doc identity at the top of
+the first chunk (mirrors what we do for FRAMES Wikipedia pages).
+"""
+
+from __future__ import annotations
+
+import html
+import logging
+import re
+from dataclasses import dataclass
+from html.parser import HTMLParser
+
+logger = logging.getLogger(__name__)
+
+
+_MIN_TRAFILATURA_LENGTH = 200
+_MAX_OUTPUT_CHARS = 200_000  # cap to keep upload payloads sane
+
+
+@dataclass
+class ExtractionResult:
+    """Outcome of converting one HTML blob to plain markdown."""
+
+    text: str
+    method: str          # "trafilatura" | "fallback_strip" | "empty"
+    n_chars: int
+
+    @property
+    def ok(self) -> bool:
+        return self.n_chars > 0
+
+
+# ---------------------------------------------------------------------------
+# Trafilatura wrapper (lazy import so tests / small scripts don't pay)
+# ---------------------------------------------------------------------------
+
+
+def _trafilatura_extract(html_text: str, *, url: str) -> str | None:
+    try:
+        import trafilatura
+    except ImportError:  # pragma: no cover - dependency is required
+        logger.warning("trafilatura not installed; falling back to strip-tags only")
+        return None
+    try:
+        text = trafilatura.extract(
+            html_text,
+            url=url or None,
+            output_format="markdown",
+            include_links=False,
+            include_images=False,
+            include_tables=True,
+            favor_recall=True,
+        )
+    except Exception as exc:  # noqa: BLE001 - trafilatura raises a zoo
+        logger.debug("trafilatura.extract crashed for %s: %s", url, exc)
+        return None
+    if not text:
+        return None
+    return text.strip()
+
+
+# ---------------------------------------------------------------------------
+# Stdlib fallback: strip HTML tags
+# ---------------------------------------------------------------------------
+
+
+class _StripHTMLParser(HTMLParser):
+    """Collect text content, treating block tags as paragraph breaks.
+
+    We deliberately drop ``<script>``, ``<style>``, ``<nav>``,
+    ``<header>``, ``<footer>``, and ``<aside>`` content — these are
+    almost always boilerplate and they are the dominant source of
+    noise SurfSense ends up retrieving against if not removed.
+    """
+
+    _SKIP_TAGS = frozenset({"script", "style", "nav", "header", "footer", "aside", "svg"})
+    _BLOCK_TAGS = frozenset({
+        "p", "div", "section", "article", "li", "ul", "ol",
+        "h1", "h2", "h3", "h4", "h5", "h6", "br", "tr",
+        "td", "th", "table", "blockquote", "pre",
+    })
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._buffer: list[str] = []
+        self._skip_depth: int = 0
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:  # noqa: ARG002
+        if tag in self._SKIP_TAGS:
+            self._skip_depth += 1
+        if tag in self._BLOCK_TAGS:
+            self._buffer.append("\n")
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag in self._SKIP_TAGS and self._skip_depth > 0:
+            self._skip_depth -= 1
+        if tag in self._BLOCK_TAGS:
+            self._buffer.append("\n")
+
+    def handle_data(self, data: str) -> None:
+        if self._skip_depth:
+            return
+        self._buffer.append(data)
+
+    def get_text(self) -> str:
+        text = "".join(self._buffer)
+        # Decode any leftover entities and collapse whitespace.
+        text = html.unescape(text)
+        text = re.sub(r"[ \t]+", " ", text)
+        text = re.sub(r"\n[ \t]+", "\n", text)
+        text = re.sub(r"\n{3,}", "\n\n", text)
+        return text.strip()
+
+
+def _strip_tags(html_text: str) -> str:
+    parser = _StripHTMLParser()
+    try:
+        parser.feed(html_text)
+    except Exception as exc:  # noqa: BLE001 - HTMLParser is fragile on garbage input
+        logger.debug("HTMLParser failed; using regex strip: %s", exc)
+        no_tags = re.sub(r"<[^>]+>", " ", html_text)
+        return re.sub(r"\s+", " ", html.unescape(no_tags)).strip()
+    return parser.get_text()
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def extract_main_content(
+    html_text: str,
+    *,
+    url: str = "",
+    page_name: str = "",
+    last_modified: str | None = None,
+) -> ExtractionResult:
+    """Convert one HTML blob into clean markdown for ingest.
+
+    The returned ``text`` is prefixed with a small metadata header
+    (``# {page_name}\\n\\nSource: {url}\\n``) so that:
+
+    * SurfSense's chunker has a stable doc-identity anchor at the top
+      of the first chunk (matches what we do for FRAMES Wikipedia).
+    * The retrieval-augmented arm sees the URL inline, which the LLM
+      can surface as a citation if the prompt asks for one.
+    """
+
+    body = ""
+    method = "empty"
+    if html_text and html_text.strip():
+        body = _trafilatura_extract(html_text, url=url) or ""
+        if body and len(body) >= _MIN_TRAFILATURA_LENGTH:
+            method = "trafilatura"
+        else:
+            stripped = _strip_tags(html_text)
+            # Prefer trafilatura output even if short, but only if it
+            # contained any prose at all — empty trafilatura fall-through
+            # to the stripped form.
+            if body and stripped and len(stripped) > len(body) * 1.5:
+                body = stripped
+                method = "fallback_strip"
+            elif not body and stripped:
+                body = stripped
+                method = "fallback_strip"
+            elif body:
+                method = "trafilatura"
+
+    body = body.strip()
+    if len(body) > _MAX_OUTPUT_CHARS:
+        body = body[:_MAX_OUTPUT_CHARS] + "\n\n[...truncated...]"
+
+    if not body:
+        return ExtractionResult(text="", method="empty", n_chars=0)
+
+    title_line = (page_name or url or "Untitled").strip()
+    header_lines = [f"# {title_line}"]
+    if url:
+        header_lines.append(f"Source: {url}")
+    if last_modified:
+        header_lines.append(f"Last modified: {last_modified}")
+    final = "\n".join(header_lines) + "\n\n" + body + "\n"
+    return ExtractionResult(text=final, method=method, n_chars=len(final))
+
+
+__all__ = ["ExtractionResult", "extract_main_content"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py
new file mode 100644
index 0000000000..aad6a70bff
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py
@@ -0,0 +1,439 @@
+"""CRAG ingestion: download → extract → upload → per-question doc map.
+
+Steps:
+
+1. Download ``crag_task_1_and_2_dev_v4.jsonl.bz2`` from
+   ``facebookresearch/CRAG`` (skip if cached).
+2. Stream-parse into ``CragQuestion`` objects.
+3. Optionally cap to ``--n-questions N`` (and *stratified* sample
+   across ``(domain, question_type)`` so the smoke / partial run
+   isn't dominated by ``finance`` or ``simple``).
+4. For each question, extract the 5 web pages to clean markdown via
+   ``trafilatura`` and write them to
+   ``<bench_dir>/pages/<qid>__<page_idx>__<url_hash>.md``. The
+   filename is unique across the whole sample (so SurfSense's
+   ``(filename, search_space)`` dedup never collides between
+   questions) and round-trippable (the ``<qid>__`` prefix lets the
+   ingest infer doc-membership at the title level even before we
+   land on a stable status response).
+5. Upload all extracted pages to SurfSense in batches with text-only
+   ETL (``use_vision_llm=False, processing_mode="basic"``) — these
+   are extracted plaintext, no images involved.
+6. Persist a doc map at
+   ``<suite_data>/maps/crag_doc_map.jsonl`` with one row per question:
+
+       {"qid": "C00042",
+        "interaction_id": "<uuid>",
+        "question": "<text>",
+        "gold_answer": "<text>",
+        "alt_answers": [...],
+        "domain": "...", "question_type": "...",
+        "static_or_dynamic": "...", "popularity": "...",
+        "query_time": "...",
+        "page_filenames": ["C00042__0__abc123.md", ...],
+        "document_ids": [42101, 42102, ...],
+        "missing_pages": [...]   # filenames whose upload failed
+       }
+
+The runner uses ``document_ids`` to scope SurfSense retrieval to
+exactly the 5 pages of the question (matches CRAG protocol — the
+benchmark explicitly hands over its own retrieved pages).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.clients.documents import (
+    DocumentProcessingFailed,
+    DocumentProcessingTimeout,
+)
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+from .dataset import (
+    CragPage,
+    CragQuestion,
+    download_task_1_2,
+    iter_questions,
+    stratified_sample,
+    write_questions_jsonl,
+)
+from .html_extract import extract_main_content
+
+logger = logging.getLogger(__name__)
+
+
+_FILENAME_SAFE = re.compile(r"[^A-Za-z0-9._\-]+")
+
+
+def _page_filename(qid: str, page_idx: int, page: CragPage) -> str:
+    """Filesystem-safe, globally unique markdown filename for a CRAG page.
+
+    Format: ``<qid>__<idx>__<url_hash>.md``. Both the qid (``C00042``)
+    and the URL-hash (``[:12]``) are alphanumeric so we don't need to
+    sanitise them, but we strip anything else just in case.
+    """
+
+    qid_safe = _FILENAME_SAFE.sub("_", qid)
+    return f"{qid_safe}__{page_idx:02d}__{page.url_hash}.md"
+
+
+# ---------------------------------------------------------------------------
+# Stats
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _IngestStats:
+    n_questions: int
+    n_pages_total: int
+    n_pages_extracted: int
+    n_pages_empty: int
+    n_uploaded: int
+    n_existing: int
+    bench_dir: Path
+    map_path: Path
+
+
+# ---------------------------------------------------------------------------
+# Page extraction
+# ---------------------------------------------------------------------------
+
+
+def _materialise_pages(
+    questions: list[CragQuestion],
+    *,
+    pages_dir: Path,
+    overwrite: bool = False,
+) -> tuple[dict[str, list[str]], dict[str, str]]:
+    """Extract every page in every question to ``pages_dir`` as markdown.
+
+    Returns:
+      * ``qid -> [filename, filename, ...]`` (in page order, only
+        successful extractions)
+      * ``filename -> source_url`` for diagnostics
+
+    Empty extractions (paywall / JS / parse-fail with no fallback
+    output) are skipped — better to retrieve from 4 pages than feed
+    SurfSense's chunker an empty file.
+    """
+
+    pages_dir.mkdir(parents=True, exist_ok=True)
+    qid_to_files: dict[str, list[str]] = {}
+    file_to_url: dict[str, str] = {}
+    method_counts: dict[str, int] = {}
+    n_empty = 0
+
+    for q in questions:
+        names: list[str] = []
+        for idx, page in enumerate(q.pages):
+            filename = _page_filename(q.qid, idx, page)
+            dest = pages_dir / filename
+            if dest.exists() and dest.stat().st_size > 0 and not overwrite:
+                method_counts["cache_hit"] = method_counts.get("cache_hit", 0) + 1
+                names.append(filename)
+                file_to_url[filename] = page.page_url
+                continue
+            result = extract_main_content(
+                page.page_html,
+                url=page.page_url,
+                page_name=page.page_name,
+                last_modified=page.page_last_modified,
+            )
+            method_counts[result.method] = method_counts.get(result.method, 0) + 1
+            if not result.ok:
+                n_empty += 1
+                continue
+            dest.write_text(result.text, encoding="utf-8")
+            names.append(filename)
+            file_to_url[filename] = page.page_url
+        qid_to_files[q.qid] = names
+
+    logger.info(
+        "CRAG page extraction: %s; empty=%d, total_files=%d across %d questions",
+        method_counts, n_empty, len(file_to_url), len(qid_to_files),
+    )
+    return qid_to_files, file_to_url
+
+
+# ---------------------------------------------------------------------------
+# Upload
+# ---------------------------------------------------------------------------
+
+
+async def _upload_pages(
+    ctx: RunContext,
+    *,
+    pages_dir: Path,
+    filenames: list[str],
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    """Upload ``filenames`` (already on disk under ``pages_dir``) and return name → doc_id."""
+
+    if not filenames:
+        return {}
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    paths = [pages_dir / fn for fn in filenames if (pages_dir / fn).exists()]
+
+    for batch_start in range(0, len(paths), batch_size):
+        batch = paths[batch_start : batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if result.document_ids:
+            try:
+                await docs_client.wait_until_ready(
+                    search_space_id=ctx.search_space_id,
+                    document_ids=result.document_ids,
+                    timeout_s=900.0,
+                )
+            except (DocumentProcessingFailed, DocumentProcessingTimeout) as exc:
+                logger.warning("CRAG batch processing issue: %s", exc)
+        if all_ids:
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                stem = Path(s.title).stem if s.title.endswith(".md") else s.title
+                name_to_id[stem] = s.document_id
+                name_to_id[s.title] = s.document_id
+                if not s.title.endswith(".md"):
+                    name_to_id[f"{s.title}.md"] = s.document_id
+        logger.info(
+            "CRAG upload batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+# ---------------------------------------------------------------------------
+# Doc map writer
+# ---------------------------------------------------------------------------
+
+
+def _resolve_question_doc_ids(
+    questions: list[CragQuestion],
+    qid_to_files: dict[str, list[str]],
+    name_to_id: dict[str, int],
+) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    for q in questions:
+        filenames = qid_to_files.get(q.qid, [])
+        doc_ids: list[int] = []
+        missing: list[str] = []
+        for fn in filenames:
+            stem = Path(fn).stem
+            doc_id = name_to_id.get(stem) or name_to_id.get(fn)
+            if doc_id is not None and doc_id not in doc_ids:
+                doc_ids.append(doc_id)
+            else:
+                missing.append(fn)
+        rows.append({
+            "qid": q.qid,
+            "interaction_id": q.interaction_id,
+            "raw_index": q.raw_index,
+            "question": q.query,
+            "gold_answer": q.gold_answer,
+            "alt_answers": list(q.alt_answers),
+            "domain": q.domain,
+            "question_type": q.question_type,
+            "static_or_dynamic": q.static_or_dynamic,
+            "popularity": q.popularity,
+            "query_time": q.query_time,
+            "split": q.split,
+            "page_filenames": filenames,
+            "document_ids": doc_ids,
+            "missing_pages": missing,
+            "n_pages": len(filenames),
+        })
+    return rows
+
+
+# ---------------------------------------------------------------------------
+# Public entry
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    n_questions: int | None = None,
+    upload_batch_size: int = 16,
+    skip_upload: bool = False,
+    overwrite_extract: bool = False,
+    settings: IngestSettings | None = None,
+    sample_seed: int = 17,
+) -> None:
+    """Ingest the CRAG benchmark (Tasks 1 & 2) into the research suite.
+
+    Parameters
+    ----------
+    n_questions
+        Cap on the number of CRAG questions to materialise.
+        ``None`` = all 2,706 (~13,500 pages — large; smoke runs
+        should pass 10-20 and full runs ~200).
+    upload_batch_size
+        Markdown files per ``/documents/fileupload`` call.
+    skip_upload
+        Extract + cache markdown locally but don't push to SurfSense
+        (useful for debugging the extraction step).
+    overwrite_extract
+        Re-run trafilatura even when a cached markdown file exists.
+        Default False so re-running ingest is idempotent.
+    settings
+        Override per-upload knobs. CRAG defaults to text-only basic
+        ETL — these are *extracted* plaintext, no images.
+    sample_seed
+        RNG seed for ``stratified_sample``. Pin this for reproducibility.
+    """
+
+    settings = settings or IngestSettings(
+        use_vision_llm=False,
+        processing_mode="basic",
+        should_summarize=False,
+    )
+    bench_dir = ctx.benchmark_data_dir()
+    pages_dir = bench_dir / "pages"
+    raw_cache = bench_dir / ".raw_cache"
+    raw_cache.mkdir(parents=True, exist_ok=True)
+
+    bz2_path = download_task_1_2(raw_cache)
+    logger.info("CRAG: parsing %s ...", bz2_path.name)
+    all_questions = iter_questions(bz2_path)
+    if not all_questions:
+        raise RuntimeError(
+            "CRAG JSONL contained no parseable rows; upstream may have changed schema."
+        )
+    logger.info("CRAG: parsed %d total questions", len(all_questions))
+
+    if n_questions is not None and n_questions > 0:
+        questions = stratified_sample(all_questions, n=n_questions, seed=sample_seed)
+        logger.info(
+            "CRAG: stratified sample of %d questions across %d (domain, qtype) buckets",
+            len(questions),
+            len({(q.domain, q.question_type) for q in questions}),
+        )
+    else:
+        questions = all_questions
+
+    questions_jsonl = bench_dir / "questions.jsonl"
+    write_questions_jsonl(questions, questions_jsonl)
+
+    n_pages_total = sum(len(q.pages) for q in questions)
+    logger.info(
+        "CRAG: extracting up to %d pages across %d questions ...",
+        n_pages_total, len(questions),
+    )
+    qid_to_files, file_to_url = _materialise_pages(
+        questions, pages_dir=pages_dir, overwrite=overwrite_extract,
+    )
+    n_pages_extracted = sum(len(v) for v in qid_to_files.values())
+
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("CRAG: --skip-upload; skipping SurfSense ingestion")
+    else:
+        all_filenames = sorted({fn for fns in qid_to_files.values() for fn in fns})
+        logger.info("CRAG: uploading %d unique pages ...", len(all_filenames))
+        name_to_id = await _upload_pages(
+            ctx,
+            pages_dir=pages_dir,
+            filenames=all_filenames,
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    doc_rows = _resolve_question_doc_ids(questions, qid_to_files, name_to_id)
+    map_path = ctx.maps_dir() / "crag_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        fh.write(settings_header_line(settings) + "\n")
+        for row in doc_rows:
+            fh.write(json.dumps(row) + "\n")
+    logger.info("Wrote CRAG doc map to %s (%d rows)", map_path, len(doc_rows))
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["crag"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+    stats = _IngestStats(
+        n_questions=len(questions),
+        n_pages_total=n_pages_total,
+        n_pages_extracted=n_pages_extracted,
+        n_pages_empty=n_pages_total - n_pages_extracted,
+        n_uploaded=len(name_to_id),
+        n_existing=0,
+        bench_dir=bench_dir,
+        map_path=map_path,
+    )
+    logger.info("CRAG ingest done: %s", stats)
+
+
+# ---------------------------------------------------------------------------
+# For runner: read extracted page text back from disk
+# ---------------------------------------------------------------------------
+
+
+def read_page_markdown(bench_dir: Path, filename: str) -> str | None:
+    """Return the on-disk markdown body for a previously-extracted page.
+
+    Used by the long-context runner arm to assemble the prompt at
+    inference time — we don't keep all 5×N pages in memory between
+    ingest and run.
+    """
+
+    path = bench_dir / "pages" / filename
+    if not path.exists():
+        return None
+    try:
+        return path.read_text(encoding="utf-8")
+    except OSError:
+        return None
+
+
+async def _retry_upload_idempotent(  # noqa: D401 - hidden helper
+    ctx: RunContext,
+    *,
+    pages_dir: Path,
+    filenames: list[str],
+    batch_size: int,
+    settings: IngestSettings,
+    max_attempts: int = 2,
+) -> dict[str, int]:
+    """Future-proofing hook (unused today): retry the ingest upload pass."""
+
+    last_exc: Exception | None = None
+    for attempt in range(max_attempts):
+        try:
+            return await _upload_pages(
+                ctx,
+                pages_dir=pages_dir,
+                filenames=filenames,
+                batch_size=batch_size,
+                settings=settings,
+            )
+        except Exception as exc:  # noqa: BLE001
+            last_exc = exc
+            logger.warning("CRAG upload attempt %d failed: %s", attempt + 1, exc)
+            await asyncio.sleep(2.0 * (attempt + 1))
+    if last_exc is not None:
+        raise last_exc
+    return {}
+
+
+__all__ = ["read_page_markdown", "run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/prompt.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/prompt.py
new file mode 100644
index 0000000000..6268345051
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/prompt.py
@@ -0,0 +1,146 @@
+"""CRAG prompt templates for the three competing arms.
+
+The CRAG paper grades each prediction as one of:
+
+* **correct**   — answer matches gold (with paraphrasing tolerance)
+* **missing**   — model refuses or says "I don't know"
+* **incorrect** — model commits to a wrong answer (hallucination)
+
+The truthfulness score `(correct - incorrect) / total` rewards
+calibrated abstention, so the prompts below explicitly *invite* the
+model to refuse when it isn't confident — otherwise the bare-LLM arm
+gets penalised twice (no docs *and* a no-refusal prompt) and the
+comparison stops being fair to the LLM-only baseline.
+
+Three templates, byte-identical instructions:
+
+* ``build_bare_prompt(q)``         — question-only.
+* ``build_long_context_prompt(q, contexts)`` — question + concatenated
+  page extracts, all stuffed into the user message. Mirrors the
+  paper's "straightforward RAG" baseline.
+* ``build_surfsense_prompt(q)``    — question + a hint that retrieval
+  over the question's 5 ingested pages is available; the SurfSense
+  agent itself owns the retrieval step.
+
+The ``Answer:`` line at the end is parsed by ``extract_freeform_answer``
+in the runner, so the format is mandatory.
+"""
+
+from __future__ import annotations
+
+
+_BASE_INSTRUCTIONS = (
+    "You are a careful question-answering assistant. The question is a "
+    "real-world factual question that may be about finance, music, "
+    "movies, sports, or any other domain.\n\n"
+    "Important rules:\n"
+    "1. If the question contains a false premise (an assumption that "
+    "is factually wrong), say so explicitly in your final answer "
+    "rather than answering as if the premise were true.\n"
+    "2. If you are not confident in an answer, prefer saying \"I don't "
+    "know\" over guessing. A wrong commit is penalised more than a "
+    "refusal.\n"
+    "3. Keep the final answer short — a name, a number, a date, a "
+    "phrase. Do not repeat the question.\n\n"
+    "Format your final line EXACTLY as:\n"
+    "Answer: <short answer>\n"
+    "If you don't know, write `Answer: I don't know`."
+)
+
+
+_BARE_TEMPLATE = """\
+{instructions}
+
+Question: {question}
+Question time: {query_time}
+"""
+
+
+_SURFSENSE_TEMPLATE = """\
+{instructions}
+
+You have access to a search index of up to 5 web pages that were
+retrieved for this question. Use the retrieval tool to look up any
+facts you are not confident about. The pages may be partially or fully
+relevant; some may contradict each other (prefer the more authoritative
+or more recent source).
+
+Question: {question}
+Question time: {query_time}
+"""
+
+
+_LONG_CONTEXT_TEMPLATE = """\
+{instructions}
+
+You are given the full text of {n_contexts} web pages that were
+retrieved for this question. Read all of them, then answer. The
+pages may be partially or fully relevant; some may contradict each
+other (prefer the more authoritative or more recent source).
+
+{contexts}
+
+Question: {question}
+Question time: {query_time}
+"""
+
+
+def build_bare_prompt(question: str, *, query_time: str = "") -> str:
+    """Prompt for the no-retrieval baseline arm."""
+
+    return _BARE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+        query_time=query_time.strip() or "unknown",
+    )
+
+
+def build_surfsense_prompt(question: str, *, query_time: str = "") -> str:
+    """Prompt for the SurfSense arm (agent does retrieval itself)."""
+
+    return _SURFSENSE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+        query_time=query_time.strip() or "unknown",
+    )
+
+
+def build_long_context_prompt(
+    question: str,
+    *,
+    contexts: list[tuple[str, str]],
+    query_time: str = "",
+    per_page_char_cap: int = 12_000,
+) -> str:
+    """Prompt for the "stuff all pages into the prompt" baseline.
+
+    ``contexts`` is a list of ``(page_title_or_url, page_text)`` pairs.
+    Each page is truncated at ``per_page_char_cap`` (default 12k chars
+    ≈ 3k tokens) so a 5-page CRAG question fits well under any
+    modern long-context window with room for the question + reasoning.
+    """
+
+    blocks: list[str] = []
+    for idx, (title, text) in enumerate(contexts, start=1):
+        body = (text or "").strip()
+        if len(body) > per_page_char_cap:
+            body = body[:per_page_char_cap].rstrip() + "\n[...truncated...]"
+        title_clean = (title or f"page_{idx}").strip().replace("\n", " ")
+        blocks.append(
+            f"--- PAGE {idx}: {title_clean} ---\n{body}\n"
+        )
+    contexts_block = "\n".join(blocks) if blocks else "(no pages retrieved)"
+    return _LONG_CONTEXT_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        n_contexts=len(contexts),
+        contexts=contexts_block,
+        question=question.strip(),
+        query_time=query_time.strip() or "unknown",
+    )
+
+
+__all__ = [
+    "build_bare_prompt",
+    "build_long_context_prompt",
+    "build_surfsense_prompt",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py
new file mode 100644
index 0000000000..710f767445
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py
@@ -0,0 +1,943 @@
+"""CRAG runner — Bare LLM vs Long-Context LLM vs SurfSense.
+
+Three arms run paired on every question in the sample. All three
+answer with the same model (CRAG is a head-to-head benchmark, not a
+cost-arbitrage benchmark). The arms differ only in *what they see*:
+
+1. ``bare_llm``      — chat completion with the question only
+   (paper baseline ≤34%).
+2. ``long_context``  — same model, but the user message also includes
+   the extracted text of all 5 web pages (paper baseline ~44%).
+3. ``surfsense``     — POST ``/api/v1/new_chat`` with retrieval scoped
+   to the question's 5 ingested pages via ``mentioned_document_ids``.
+   The agent retrieves and reasons; we only grade the final answer.
+
+Grading: 3-class CRAG rubric — correct/missing/incorrect — with
+deterministic shortcuts and an LLM-as-judge fallback. Headline is
+the **truthfulness score** ``(#correct - #incorrect) / total``, the
+metric the CRAG paper and KDD Cup 2024 leaderboard use.
+
+We keep paired stats (McNemar + bootstrap CI) on the **correct**
+flag for each arm pair (long_context vs bare, surfsense vs
+long_context, surfsense vs bare) so the report can call out exactly
+where the lift comes from. Per-domain and per-question-type breakdowns
+surface where SurfSense beats long-context (e.g. multi-hop / set
+questions where retrieval-then-reason wins over stuff-it-all-in).
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, BareLlmArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+from ....core.registry import ReportSection, RunArtifact, RunContext
+from ....core.scenarios import format_scenario_md
+from .grader import (
+    CragGradeResult,
+    CragGradeRow,
+    CragJudgeConfig,
+    CragLlmJudge,
+    grade_many,
+)
+from .ingest import read_page_markdown
+from .prompt import (
+    build_bare_prompt,
+    build_long_context_prompt,
+    build_surfsense_prompt,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Question shape (post-ingest)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CragRunnerQuestion:
+    qid: str
+    raw_index: int
+    question: str
+    gold_answer: str
+    alt_answers: list[str]
+    domain: str
+    question_type: str
+    static_or_dynamic: str
+    popularity: str
+    query_time: str
+    page_filenames: list[str]
+    document_ids: list[int]
+    missing_pages: list[str] = field(default_factory=list)
+
+
+def _load_doc_map(map_path: Path) -> tuple[list[dict[str, Any]], dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows.append(row)
+    return rows, settings
+
+
+def _filter_questions(
+    rows: list[dict[str, Any]],
+    *,
+    sample_n: int | None,
+    domain_filter: str | None,
+    qtype_filter: str | None,
+) -> list[CragRunnerQuestion]:
+    out: list[CragRunnerQuestion] = []
+    for row in rows:
+        domain = str(row.get("domain") or "").lower()
+        qtype = str(row.get("question_type") or "").lower()
+        if domain_filter and domain_filter != domain:
+            continue
+        if qtype_filter and qtype_filter not in qtype:
+            continue
+        out.append(CragRunnerQuestion(
+            qid=str(row.get("qid") or "").strip(),
+            raw_index=int(row.get("raw_index") or 0),
+            question=str(row.get("question") or "").strip(),
+            gold_answer=str(row.get("gold_answer") or "").strip(),
+            alt_answers=list(row.get("alt_answers") or []),
+            domain=domain,
+            question_type=qtype,
+            static_or_dynamic=str(row.get("static_or_dynamic") or "").lower(),
+            popularity=str(row.get("popularity") or "").lower(),
+            query_time=str(row.get("query_time") or "").strip(),
+            page_filenames=list(row.get("page_filenames") or []),
+            document_ids=list(row.get("document_ids") or []),
+            missing_pages=list(row.get("missing_pages") or []),
+        ))
+    out.sort(key=lambda q: q.raw_index)
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+_DESCRIPTION = (
+    "CRAG (Comprehensive RAG Benchmark, Meta KDD Cup 2024) — three "
+    "arms (Bare LLM / Long-Context LLM / SurfSense) over the same "
+    "5-page-per-question CRAG corpus. Tests competitive RAG vs naive "
+    "context-stuffing; CRAG truthfulness score is the headline metric."
+)
+
+
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class CragBenchmark:
+    """3-arm CRAG runner: bare vs long-context vs SurfSense."""
+
+    suite: str = "research"
+    name: str = "crag"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--n", dest="sample_n", type=int, default=None,
+            help="Run only the first N questions after filters.",
+        )
+        parser.add_argument(
+            "--domain", dest="domain_filter", default=None,
+            help="Filter to a single CRAG domain (finance|music|movie|sports|open).",
+        )
+        parser.add_argument(
+            "--qtype", dest="qtype_filter", default=None,
+            help=(
+                "Filter to questions whose question_type contains this "
+                "substring (case-insensitive). Examples: 'multi-hop', "
+                "'comparison', 'false_premise'."
+            ),
+        )
+        parser.add_argument(
+            "--concurrency", type=int, default=4,
+            help="Parallel question workers per arm.",
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for the chat-completion arms.",
+        )
+        parser.add_argument(
+            "--per-page-char-cap", dest="per_page_char_cap", type=int, default=12_000,
+            help="Long-context arm: max chars per page before truncation (default 12k).",
+        )
+        parser.add_argument(
+            "--skip-bare", dest="skip_bare", action="store_true",
+            help="Skip the bare-LLM arm (saves cost on re-runs).",
+        )
+        parser.add_argument(
+            "--skip-long-context", dest="skip_long_context", action="store_true",
+            help="Skip the long-context arm.",
+        )
+        parser.add_argument(
+            "--skip-surfsense", dest="skip_surfsense", action="store_true",
+            help="Skip the SurfSense arm (useful when iterating on the LLM arms only).",
+        )
+        parser.add_argument(
+            "--no-mention-scope", dest="no_mention_scope", action="store_true",
+            help=(
+                "SurfSense arm: don't pass mentioned_document_ids; let "
+                "the agent retrieve over the entire SearchSpace. Default "
+                "is to scope to the question's 5 ingested pages "
+                "(matches CRAG protocol)."
+            ),
+        )
+        parser.add_argument(
+            "--no-judge", dest="no_judge", action="store_true",
+            help="Disable the LLM-as-judge fallback grader.",
+        )
+        parser.add_argument(
+            "--judge-model", dest="judge_model",
+            default="anthropic/claude-sonnet-4.5",
+            help="OpenRouter slug for the LLM judge.",
+        )
+        parser.add_argument(
+            "--judge-concurrency", dest="judge_concurrency", type=int, default=4,
+            help="Parallel judge calls.",
+        )
+        # Ingest knobs
+        parser.add_argument(
+            "--n-questions", dest="n_questions", type=int, default=None,
+            help="(ingest only) cap on number of questions to materialise + ingest.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=16,
+            help="(ingest only) markdown files per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) extract pages locally but don't push to SurfSense.",
+        )
+        parser.add_argument(
+            "--overwrite-extract", dest="overwrite_extract", action="store_true",
+            help="(ingest only) re-run trafilatura even when cached markdown exists.",
+        )
+        parser.add_argument(
+            "--sample-seed", dest="sample_seed", type=int, default=17,
+            help="(ingest only) RNG seed for the stratified sample.",
+        )
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            n_questions=opts.get("n_questions"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 16),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            overwrite_extract=bool(opts.get("overwrite_extract", False)),
+            settings=settings,
+            sample_seed=int(opts.get("sample_seed") or 17),
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        sample_n = opts.get("sample_n")
+        domain_filter = (opts.get("domain_filter") or "").strip().lower() or None
+        qtype_filter = (opts.get("qtype_filter") or "").strip().lower() or None
+        concurrency = int(opts.get("concurrency") or 4)
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+        per_page_char_cap = int(opts.get("per_page_char_cap") or 12_000)
+        skip_bare = bool(opts.get("skip_bare"))
+        skip_long_context = bool(opts.get("skip_long_context"))
+        skip_surfsense = bool(opts.get("skip_surfsense"))
+        no_mention_scope = bool(opts.get("no_mention_scope"))
+        no_judge = bool(opts.get("no_judge"))
+        judge_model = str(opts.get("judge_model") or "anthropic/claude-sonnet-4.5")
+        judge_concurrency = int(opts.get("judge_concurrency") or 4)
+
+        bench_dir = ctx.benchmark_data_dir()
+        map_path = ctx.maps_dir() / "crag_doc_map.jsonl"
+        if not map_path.exists():
+            raise RuntimeError(
+                "CRAG not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest research crag --n-questions 200` first."
+            )
+
+        rows, ingest_settings = _load_doc_map(map_path)
+        questions = _filter_questions(
+            rows,
+            sample_n=sample_n,
+            domain_filter=domain_filter,
+            qtype_filter=qtype_filter,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No CRAG questions matched the filters; broaden --n / --domain / --qtype."
+            )
+        logger.info("CRAG: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key and not (skip_bare and skip_long_context):
+            raise RuntimeError(
+                "OPENROUTER_API_KEY env var is required for the bare / long-context arms."
+            )
+
+        bare_arm = long_context_arm = surf_arm = None
+        chat_provider: OpenRouterChatProvider | None = None
+        if not (skip_bare and skip_long_context):
+            chat_provider = OpenRouterChatProvider(
+                api_key=api_key or "",
+                base_url=ctx.config.openrouter_base_url,
+                model=ctx.native_arm_model,
+            )
+        if not skip_bare and chat_provider is not None:
+            bare_arm = BareLlmArm(
+                provider=chat_provider,
+                max_output_tokens=max_output_tokens,
+                name="bare_llm",
+            )
+        if not skip_long_context and chat_provider is not None:
+            long_context_arm = BareLlmArm(
+                provider=chat_provider,
+                max_output_tokens=max_output_tokens,
+                name="long_context",
+            )
+        if not skip_surfsense:
+            surf_arm = SurfSenseArm(
+                client=ctx.new_chat_client(),
+                search_space_id=ctx.search_space_id,
+                ephemeral_threads=True,
+            )
+
+        judge: CragLlmJudge | None = None
+        if not no_judge:
+            if not api_key:
+                logger.warning("CRAG: --no-judge implied (no OPENROUTER_API_KEY for judge)")
+            else:
+                judge = CragLlmJudge(config=CragJudgeConfig(
+                    api_key=api_key,
+                    model=judge_model,
+                    base_url=ctx.config.openrouter_base_url,
+                    concurrency=judge_concurrency,
+                ))
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _bare_one(q: CragRunnerQuestion) -> ArmResult:
+            assert bare_arm is not None
+            return await bare_arm.answer(_make_bare_request(q, max_output_tokens))
+
+        async def _long_context_one(q: CragRunnerQuestion) -> ArmResult:
+            assert long_context_arm is not None
+            return await long_context_arm.answer(
+                _make_long_context_request(q, bench_dir, max_output_tokens, per_page_char_cap)
+            )
+
+        async def _surf_one(q: CragRunnerQuestion) -> ArmResult:
+            assert surf_arm is not None
+            return await surf_arm.answer(
+                _make_surfsense_request(q, scope_to_pages=not no_mention_scope)
+            )
+
+        # Run all enabled arms concurrently. Each arm is itself
+        # internally concurrency-bounded.
+        tasks: list[Any] = []
+        if bare_arm is not None:
+            tasks.append(_gather_with_limit((_bare_one(q) for q in questions), concurrency=concurrency))
+        else:
+            tasks.append(_make_skipped_results(questions, "bare_llm"))
+        if long_context_arm is not None:
+            tasks.append(_gather_with_limit((_long_context_one(q) for q in questions), concurrency=concurrency))
+        else:
+            tasks.append(_make_skipped_results(questions, "long_context"))
+        if surf_arm is not None:
+            tasks.append(_gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency))
+        else:
+            tasks.append(_make_skipped_results(questions, "surfsense"))
+
+        bare_results, long_context_results, surf_results = await asyncio.gather(*tasks)
+
+        bare_grades = await _grade_results(questions, bare_results, judge=judge) if bare_arm else _empty_grades(questions)
+        lc_grades = await _grade_results(questions, long_context_results, judge=judge) if long_context_arm else _empty_grades(questions)
+        surf_grades = await _grade_results(questions, surf_results, judge=judge) if surf_arm else _empty_grades(questions)
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, b_res, l_res, s_res, b_g, l_g, s_g in zip(
+                questions,
+                bare_results, long_context_results, surf_results,
+                bare_grades, lc_grades, surf_grades,
+                strict=False,
+            ):
+                meta = {
+                    "qid": q.qid,
+                    "raw_index": q.raw_index,
+                    "domain": q.domain,
+                    "question_type": q.question_type,
+                    "static_or_dynamic": q.static_or_dynamic,
+                    "popularity": q.popularity,
+                    "n_pages": len(q.page_filenames),
+                    "n_doc_ids": len(q.document_ids),
+                    "gold": q.gold_answer,
+                    "alt_answers": q.alt_answers,
+                }
+                for res, grade in (
+                    (b_res, b_g), (l_res, l_g), (s_res, s_g),
+                ):
+                    fh.write(json.dumps({
+                        **meta,
+                        **res.to_jsonl(),
+                        "graded": grade.to_dict(),
+                    }) + "\n")
+
+        metrics = _compute_metrics(
+            questions=questions,
+            bare_results=bare_results, long_context_results=long_context_results, surf_results=surf_results,
+            bare_grades=bare_grades, lc_grades=lc_grades, surf_grades=surf_grades,
+            arms_active={
+                "bare_llm": bare_arm is not None,
+                "long_context": long_context_arm is not None,
+                "surfsense": surf_arm is not None,
+            },
+        )
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "domain_filter": domain_filter,
+                "qtype_filter": qtype_filter,
+                "no_mention_scope": no_mention_scope,
+                "no_judge": no_judge,
+                "judge_model": judge_model if not no_judge else None,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": ctx.native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+                "per_page_char_cap": per_page_char_cap,
+                "max_output_tokens": max_output_tokens,
+                "arms_active": {
+                    "bare_llm": bare_arm is not None,
+                    "long_context": long_context_arm is not None,
+                    "surfsense": surf_arm is not None,
+                },
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="CRAG — Bare LLM vs Long-Context LLM vs SurfSense",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        bare = m.get("bare_llm", {})
+        lc = m.get("long_context", {})
+        surf = m.get("surfsense", {})
+        deltas = m.get("deltas", {})
+        per_domain = m.get("per_domain", {})
+        per_qtype = m.get("per_question_type", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(domain filter: `{extra.get('domain_filter') or 'none'}`, "
+            f"qtype filter: `{extra.get('qtype_filter') or 'none'}`, "
+            f"judge: `{extra.get('judge_model') or 'deterministic-only'}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        active = extra.get("arms_active") or {}
+        if not active.get("bare_llm", True):
+            body_lines.append("- Bare-LLM arm: SKIPPED.")
+        else:
+            body_lines.append(
+                f"- Bare-LLM arm (`{extra.get('native_arm_model') or '?'}`, no retrieval):"
+            )
+            body_lines.append(_arm_summary_lines(bare, indent="  "))
+        if not active.get("long_context", True):
+            body_lines.append("- Long-context arm: SKIPPED.")
+        else:
+            body_lines.append(
+                f"- Long-context arm (`{extra.get('native_arm_model') or '?'}`, "
+                f"all 5 pages stuffed into prompt; per-page cap "
+                f"{extra.get('per_page_char_cap', 12_000):,} chars):"
+            )
+            body_lines.append(_arm_summary_lines(lc, indent="  "))
+        if not active.get("surfsense", True):
+            body_lines.append("- SurfSense arm: SKIPPED.")
+        else:
+            body_lines.append(
+                f"- SurfSense arm (`{extra.get('provider_model', '?')}`, retrieval over "
+                f"{'whole SearchSpace' if extra.get('no_mention_scope') else 'per-question 5 pages'}):"
+            )
+            body_lines.append(_arm_summary_lines(surf, indent="  "))
+
+        body_lines.append("- Headline truthfulness scores (CRAG paper rubric):")
+        for label, key in (
+            ("Bare LLM", "bare_llm"), ("Long-Context", "long_context"), ("SurfSense", "surfsense"),
+        ):
+            d = m.get(key, {})
+            body_lines.append(
+                f"  - {label}: score={_signed_pct(d.get('truthfulness_score'))}, "
+                f"correct={_pct(d.get('correct_rate'))}, "
+                f"missing={_pct(d.get('missing_rate'))}, "
+                f"incorrect={_pct(d.get('incorrect_rate'))}"
+            )
+
+        if deltas:
+            body_lines.append("- Pairwise deltas (paired):")
+            for label, key in (
+                ("SurfSense vs Bare", "surfsense_vs_bare"),
+                ("SurfSense vs Long-Context", "surfsense_vs_long_context"),
+                ("Long-Context vs Bare", "long_context_vs_bare"),
+            ):
+                d = deltas.get(key)
+                if not d:
+                    continue
+                body_lines.append(
+                    f"  - {label}: accuracy {_pp(d.get('accuracy_pp'))} pp, "
+                    f"truthfulness {_pp(d.get('truthfulness_score_pp'))} pp "
+                    f"(McNemar p={_fmt(d.get('mcnemar_p_value'), 4)}, "
+                    f"method={d.get('mcnemar_method')}; bootstrap CI on accuracy "
+                    f"[{_pp(d.get('bootstrap_ci_low'))}pp, {_pp(d.get('bootstrap_ci_high'))}pp])"
+                )
+
+        if per_domain:
+            body_lines.append("- Per-domain truthfulness score (active arms):")
+            for domain in sorted(per_domain.keys()):
+                row = per_domain[domain]
+                pieces: list[str] = [f"  - {domain} (n={row.get('n')}):"]
+                for arm in ("bare_llm", "long_context", "surfsense"):
+                    if arm not in row:
+                        continue
+                    pieces.append(
+                        f"{arm}={_signed_pct(row[arm].get('truthfulness_score'))}"
+                    )
+                body_lines.append(" ".join(pieces))
+
+        if per_qtype:
+            body_lines.append("- Per-question-type truthfulness score (active arms):")
+            for qtype in sorted(per_qtype.keys()):
+                row = per_qtype[qtype]
+                pieces = [f"  - {qtype} (n={row.get('n')}):"]
+                for arm in ("bare_llm", "long_context", "surfsense"):
+                    if arm not in row:
+                        continue
+                    pieces.append(
+                        f"{arm}={_signed_pct(row[arm].get('truthfulness_score'))}"
+                    )
+                body_lines.append(" ".join(pieces))
+
+        return ReportSection(
+            title="CRAG — Bare LLM vs Long-Context LLM vs SurfSense",
+            headline=True,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_bare_request(q: CragRunnerQuestion, max_tokens: int) -> ArmRequest:
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_bare_prompt(q.question, query_time=q.query_time),
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_long_context_request(
+    q: CragRunnerQuestion,
+    bench_dir: Path,
+    max_tokens: int,
+    per_page_char_cap: int,
+) -> ArmRequest:
+    contexts: list[tuple[str, str]] = []
+    for fn in q.page_filenames:
+        text = read_page_markdown(bench_dir, fn) or ""
+        if not text.strip():
+            continue
+        # Use the filename stem as a stable title fallback (URLs are
+        # already in the markdown body's "Source:" header line).
+        contexts.append((Path(fn).stem, text))
+    prompt = build_long_context_prompt(
+        q.question,
+        contexts=contexts,
+        query_time=q.query_time,
+        per_page_char_cap=per_page_char_cap,
+    )
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: CragRunnerQuestion, *, scope_to_pages: bool) -> ArmRequest:
+    mentions: list[int] | None = None
+    if scope_to_pages and q.document_ids:
+        mentions = list(q.document_ids)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_surfsense_prompt(q.question, query_time=q.query_time),
+        mentioned_document_ids=mentions,
+    )
+
+
+async def _grade_results(
+    questions: list[CragRunnerQuestion],
+    results: list[ArmResult],
+    *,
+    judge: CragLlmJudge | None,
+) -> list[CragGradeResult]:
+    rows: list[CragGradeRow] = []
+    for q, r in zip(questions, results, strict=False):
+        pred = extract_freeform_answer(r.raw_text or "")
+        rows.append(CragGradeRow(
+            qid=q.qid,
+            question=q.question,
+            gold=q.gold_answer,
+            alt_answers=q.alt_answers,
+            pred=pred,
+            question_type=q.question_type,
+        ))
+    return await grade_many(rows=rows, judge=judge)
+
+
+def _empty_grades(questions: list[CragRunnerQuestion]) -> list[CragGradeResult]:
+    return [
+        CragGradeResult(grade="missing", score=0, method="skipped_arm")
+        for _ in questions
+    ]
+
+
+async def _make_skipped_results(
+    questions: list[CragRunnerQuestion], arm_name: str,
+) -> list[ArmResult]:
+    """Stand-in results so downstream code can assume parallel lists."""
+
+    return [
+        ArmResult(arm=arm_name, question_id=q.qid, raw_text="", error="skipped")
+        for q in questions
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Metrics aggregation
+# ---------------------------------------------------------------------------
+
+
+def _arm_truthfulness(grades: list[CragGradeResult]) -> dict[str, Any]:
+    """Per-arm headline numbers — accuracy + 3-class rates + truthfulness."""
+
+    n = len(grades) or 1
+    n_correct = sum(g.correct for g in grades)
+    n_missing = sum(g.missing for g in grades)
+    n_incorrect = sum(g.incorrect for g in grades)
+    return {
+        "n_total": len(grades),
+        "n_correct": n_correct,
+        "n_missing": n_missing,
+        "n_incorrect": n_incorrect,
+        "correct_rate": n_correct / n,
+        "missing_rate": n_missing / n,
+        "incorrect_rate": n_incorrect / n,
+        "truthfulness_score": (n_correct - n_incorrect) / n,
+    }
+
+
+def _compute_metrics(
+    *,
+    questions: list[CragRunnerQuestion],
+    bare_results: list[ArmResult],
+    long_context_results: list[ArmResult],
+    surf_results: list[ArmResult],
+    bare_grades: list[CragGradeResult],
+    lc_grades: list[CragGradeResult],
+    surf_grades: list[CragGradeResult],
+    arms_active: dict[str, bool],
+) -> dict[str, Any]:
+    bare_correct = [g.correct for g in bare_grades]
+    lc_correct = [g.correct for g in lc_grades]
+    surf_correct = [g.correct for g in surf_grades]
+
+    bare_acc = accuracy_with_wilson_ci(sum(bare_correct), len(bare_correct))
+    lc_acc = accuracy_with_wilson_ci(sum(lc_correct), len(lc_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+
+    bare_t = _arm_truthfulness(bare_grades)
+    lc_t = _arm_truthfulness(lc_grades)
+    surf_t = _arm_truthfulness(surf_grades)
+
+    def _arm_block(
+        results: list[ArmResult],
+        acc: Any,
+        truthfulness: dict[str, Any],
+    ) -> dict[str, Any]:
+        costs = [float(r.cost_micros) for r in results]
+        latencies = [float(r.latency_ms) for r in results]
+        ins = [float(r.input_tokens) for r in results]
+        outs = [float(r.output_tokens) for r in results]
+        cost_agg = paired_aggregate(costs)
+        lat_agg = paired_aggregate(latencies)
+        return {
+            **acc.to_dict(),
+            **truthfulness,
+            "cost_micros_mean": cost_agg.mean,
+            "cost_micros_median": cost_agg.median,
+            "latency_ms_mean": lat_agg.mean,
+            "latency_ms_median": lat_agg.median,
+            "latency_ms_p95": lat_agg.p95,
+            "input_tokens_mean": (sum(ins) / len(ins)) if ins else 0.0,
+            "output_tokens_mean": (sum(outs) / len(outs)) if outs else 0.0,
+        }
+
+    out: dict[str, Any] = {
+        "bare_llm": _arm_block(bare_results, bare_acc, bare_t),
+        "long_context": _arm_block(long_context_results, lc_acc, lc_t),
+        "surfsense": _arm_block(surf_results, surf_acc, surf_t),
+    }
+
+    deltas: dict[str, Any] = {}
+    for label, ref_correct, ref_t, chal_correct, chal_t, both_active in (
+        ("surfsense_vs_bare", bare_correct, bare_t, surf_correct, surf_t,
+         arms_active.get("bare_llm") and arms_active.get("surfsense")),
+        ("surfsense_vs_long_context", lc_correct, lc_t, surf_correct, surf_t,
+         arms_active.get("long_context") and arms_active.get("surfsense")),
+        ("long_context_vs_bare", bare_correct, bare_t, lc_correct, lc_t,
+         arms_active.get("bare_llm") and arms_active.get("long_context")),
+    ):
+        if not both_active:
+            continue
+        mc = mcnemar_test(ref_correct, chal_correct)
+        boot = bootstrap_delta_ci(ref_correct, chal_correct, n_resamples=2000)
+        deltas[label] = {
+            "accuracy_pp": 100.0 * (sum(chal_correct) - sum(ref_correct)) / max(1, len(chal_correct)),
+            "truthfulness_score_pp": 100.0 * (chal_t["truthfulness_score"] - ref_t["truthfulness_score"]),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_ref_only": mc.b,
+            "mcnemar_c_challenger_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+        }
+    out["deltas"] = deltas
+
+    out["per_domain"] = _per_facet_truthfulness(
+        questions, bare_grades, lc_grades, surf_grades,
+        arms_active=arms_active,
+        key_fn=lambda q: q.domain or "(unspecified)",
+    )
+    out["per_question_type"] = _per_facet_truthfulness(
+        questions, bare_grades, lc_grades, surf_grades,
+        arms_active=arms_active,
+        key_fn=lambda q: q.question_type or "(unspecified)",
+    )
+
+    out["grader_methods"] = {
+        "bare_llm": _count_methods(bare_grades) if arms_active.get("bare_llm") else {},
+        "long_context": _count_methods(lc_grades) if arms_active.get("long_context") else {},
+        "surfsense": _count_methods(surf_grades) if arms_active.get("surfsense") else {},
+    }
+    return out
+
+
+def _per_facet_truthfulness(
+    questions: list[CragRunnerQuestion],
+    bare_grades: list[CragGradeResult],
+    lc_grades: list[CragGradeResult],
+    surf_grades: list[CragGradeResult],
+    *,
+    arms_active: dict[str, bool],
+    key_fn: Any,
+) -> dict[str, Any]:
+    """Bucket truthfulness scores by ``key_fn(q)``."""
+
+    buckets: dict[str, dict[str, list[CragGradeResult]]] = {}
+    for q, b, l, s in zip(questions, bare_grades, lc_grades, surf_grades, strict=False):
+        key = key_fn(q)
+        bucket = buckets.setdefault(key, {"bare_llm": [], "long_context": [], "surfsense": []})
+        bucket["bare_llm"].append(b)
+        bucket["long_context"].append(l)
+        bucket["surfsense"].append(s)
+    out: dict[str, Any] = {}
+    for key, arms in buckets.items():
+        row: dict[str, Any] = {"n": len(arms["bare_llm"])}
+        for arm_name, grades in arms.items():
+            if not arms_active.get(arm_name):
+                continue
+            row[arm_name] = _arm_truthfulness(grades)
+        out[key] = row
+    return out
+
+
+def _count_methods(grades: list[CragGradeResult]) -> dict[str, int]:
+    out: dict[str, int] = {}
+    for g in grades:
+        out[g.method] = out.get(g.method, 0) + 1
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Tiny formatting helpers
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- 3-class: correct={d.get('correct_rate', 0)*100:.1f}%, "
+        f"missing={d.get('missing_rate', 0)*100:.1f}%, "
+        f"incorrect={d.get('incorrect_rate', 0)*100:.1f}%",
+        f"{indent}- Truthfulness score (correct - incorrect)/total: "
+        f"{d.get('truthfulness_score', 0)*100:+.1f}%",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if d.get("input_tokens_mean") or d.get("output_tokens_mean"):
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value)*100:.1f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _signed_pct(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value)*100:+.1f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+__all__ = ["CragBenchmark", "CragRunnerQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/__init__.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/__init__.py
new file mode 100644
index 0000000000..4e556cd84a
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/__init__.py
@@ -0,0 +1,29 @@
+"""FRAMES — multi-hop Wikipedia retrieval & reasoning (google/frames-benchmark).
+
+Source: https://huggingface.co/datasets/google/frames-benchmark
+Paper:  https://arxiv.org/abs/2409.12941 (Krishna et al., 2024)
+
+* 824 multi-hop questions, each requiring 2-15 Wikipedia articles
+* 5 reasoning types: numerical, tabular, multiple constraints,
+  temporal, post-processing
+* Published Gemini-Pro-1.5 baselines:
+  - Naive prompting (no retrieval):    40.8%
+  - BM25, top-4:                       47.4%
+  - Multi-step retrieval & reasoning:  66.0%
+  - Oracle retrieval (gold articles):  72.9%
+
+This is the benchmark that *finally* puts SurfSense's strongest claim
+on trial: cross-document iterative retrieval. The harness ingests
+every Wikipedia article referenced by any question in the run sample
+into a single SearchSpace; SurfSense answers without
+``mentioned_document_ids`` so its agent has to actually retrieve.
+The bare-LLM arm answers from the prompt only (the published 40.8%
+baseline number).
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import FramesBenchmark
+
+_registry.register(FramesBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/dataset.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/dataset.py
new file mode 100644
index 0000000000..c3b6b878ea
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/dataset.py
@@ -0,0 +1,174 @@
+"""FRAMES dataset loader — download ``test.tsv`` from HF and parse rows.
+
+The HF repo (``google/frames-benchmark``) ships a single tab-separated
+file at ``test.tsv`` (824 rows). Columns of interest for us:
+
+* unnamed first column → row index (``id`` we synthesise as ``Q000``..)
+* ``Prompt``  → the question (free-text, often multi-clause)
+* ``Answer``  → gold answer (short string: name, number, year, ...)
+* ``wikipedia_link_1`` ... ``wikipedia_link_11+`` → sparse per-question
+  link cells (we ignore in favour of the consolidated column below).
+* ``reasoning_types`` → pipe-separated tags (``"Numerical reasoning |
+  Tabular reasoning | Multiple constraints"``)
+* ``wiki_links`` → Python-list literal of every URL the question relies
+  on, e.g. ``"['https://en.wikipedia.org/wiki/...', '...']"``
+
+We use ``wiki_links`` (already deduplicated per row) and
+``ast.literal_eval`` to materialise it. The legacy
+``wikipedia_link_*`` columns are kept around only so a curious
+operator can compare cell-vs-list if upstream ever drift apart.
+"""
+
+from __future__ import annotations
+
+import ast
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+HF_REPO_ID = "google/frames-benchmark"
+HF_REPO_TYPE = "dataset"
+HF_TEST_FILE = "test.tsv"
+
+
+def _hf_hub_download(*args: Any, **kwargs: Any) -> str:
+    from huggingface_hub import hf_hub_download
+
+    return hf_hub_download(*args, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Question dataclass
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FramesQuestion:
+    """One row of FRAMES (post-parse)."""
+
+    qid: str                   # synthesised "Q000" .. "Q823"
+    question: str
+    gold_answer: str
+    wiki_urls: list[str]       # deduped, in original order
+    reasoning_types: list[str] # split on "|"
+    raw_index: int             # row index from the TSV (for debugging)
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "qid": self.qid,
+            "question": self.question,
+            "gold_answer": self.gold_answer,
+            "wiki_urls": list(self.wiki_urls),
+            "reasoning_types": list(self.reasoning_types),
+            "raw_index": self.raw_index,
+        }
+
+
+# ---------------------------------------------------------------------------
+# Download + parse
+# ---------------------------------------------------------------------------
+
+
+def download_test_tsv(cache_dir: Path) -> Path:
+    """Resumable download of ``test.tsv`` via ``huggingface_hub``."""
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    local = _hf_hub_download(
+        repo_id=HF_REPO_ID,
+        filename=HF_TEST_FILE,
+        repo_type=HF_REPO_TYPE,
+        cache_dir=str(cache_dir),
+    )
+    return Path(local)
+
+
+def _parse_wiki_links(raw: Any) -> list[str]:
+    """Convert the ``wiki_links`` cell (Python list literal) to ``list[str]``."""
+
+    if not raw:
+        return []
+    if isinstance(raw, list):
+        return [str(x).strip() for x in raw if str(x).strip()]
+    text = str(raw).strip()
+    if not text:
+        return []
+    try:
+        parsed = ast.literal_eval(text)
+    except (SyntaxError, ValueError):
+        # Fall back: maybe it's a comma-separated string with no quotes.
+        return [tok.strip() for tok in text.strip("[]").split(",") if tok.strip()]
+    if isinstance(parsed, (list, tuple)):
+        return [str(x).strip() for x in parsed if str(x).strip()]
+    return [str(parsed).strip()]
+
+
+def _parse_reasoning_types(raw: Any) -> list[str]:
+    if not raw:
+        return []
+    text = str(raw).strip()
+    if not text:
+        return []
+    return [tok.strip() for tok in text.split("|") if tok.strip()]
+
+
+def load_questions(tsv_path: Path) -> list[FramesQuestion]:
+    """Read FRAMES rows from disk into ``FramesQuestion`` objects.
+
+    Uses pandas for robust TSV parsing (tabs inside quoted strings are
+    rare in this dataset but pandas handles them; the stdlib ``csv``
+    module is fine too if pandas ever becomes a problem). We pin
+    ``index_col=0`` because the upstream TSV uses the first unnamed
+    column as the row index.
+    """
+
+    import pandas as pd
+
+    df = pd.read_csv(tsv_path, sep="\t", index_col=0, keep_default_na=False)
+    out: list[FramesQuestion] = []
+    for raw_idx, row in df.iterrows():
+        prompt = str(row.get("Prompt") or "").strip()
+        answer = str(row.get("Answer") or "").strip()
+        if not prompt or not answer:
+            logger.debug("Skipping FRAMES row %s with missing Prompt/Answer", raw_idx)
+            continue
+        urls = _parse_wiki_links(row.get("wiki_links"))
+        if not urls:
+            # Fall back to the per-cell ``wikipedia_link_*`` columns.
+            urls = []
+            for col in row.index:
+                if col.startswith("wikipedia_link"):
+                    val = str(row.get(col) or "").strip()
+                    if val and val not in urls:
+                        urls.append(val)
+        reasoning = _parse_reasoning_types(row.get("reasoning_types"))
+        out.append(FramesQuestion(
+            qid=f"Q{int(raw_idx):03d}",
+            question=prompt,
+            gold_answer=answer,
+            wiki_urls=urls,
+            reasoning_types=reasoning,
+            raw_index=int(raw_idx),
+        ))
+    return out
+
+
+def write_questions_jsonl(questions: list[FramesQuestion], dest: Path) -> None:
+    """Persist a parsed copy under the benchmark data dir."""
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    with dest.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps(q.to_dict()) + "\n")
+
+
+__all__ = [
+    "FramesQuestion",
+    "download_test_tsv",
+    "load_questions",
+    "write_questions_jsonl",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/grader.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/grader.py
new file mode 100644
index 0000000000..d280e3eafb
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/grader.py
@@ -0,0 +1,341 @@
+"""FRAMES grader: deterministic shortcut + LLM-as-judge fallback.
+
+FRAMES gold answers are short factoids (a name, a year, an ordinal,
+a count). The published paper uses an LLM judge for grading, citing
+the long tail of paraphrasing ("Jane Ballou" vs "Mrs. Ballou (Jane)";
+"5" vs "five"; "London, England" vs "London"). We replicate that
+faithfully *but* avoid burning judge tokens on the obvious cases.
+
+Pipeline per (pred, gold):
+
+1. Normalise both sides (SQuAD-style).
+2. If normalised pred == normalised gold → CORRECT (``method=exact``).
+3. Numeric path: if both extract to a single number and the values
+   match within 1% relative tolerance → CORRECT (``method=numeric``).
+4. Substring path: if normalised gold appears as a *whole-word phrase*
+   inside normalised pred (or vice versa) → CORRECT
+   (``method=substring``).
+5. Otherwise → call the LLM judge if a judge is wired; the judge
+   returns yes/no with a one-line rationale.
+6. If no judge is configured, fall through to ``False``
+   (``method=lexical_miss``).
+
+The judge is called *concurrently* across the run via a semaphore (so
+it doesn't outrun the upstream rate limit). Cached on
+``(arm, qid)`` so re-running ``report`` doesn't re-judge.
+
+Returned shape mirrors ``mmlongbench.grader.GradeResult`` to keep
+report writers uniform across benchmarks.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+import string
+from collections.abc import Sequence
+from dataclasses import dataclass
+from typing import Any
+
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Public types
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class GradeResult:
+    """Shape mirrors mmlongbench.grader.GradeResult for report uniformity."""
+
+    correct: bool
+    f1: float
+    method: str
+    normalised_pred: str = ""
+    normalised_gold: str = ""
+    judge_rationale: str = ""
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "correct": self.correct,
+            "f1": self.f1,
+            "method": self.method,
+            "normalised_pred": self.normalised_pred,
+            "normalised_gold": self.normalised_gold,
+            "judge_rationale": self.judge_rationale,
+        }
+
+
+# ---------------------------------------------------------------------------
+# Normalisation
+# ---------------------------------------------------------------------------
+
+
+_PUNCT_TABLE = str.maketrans({c: " " for c in string.punctuation})
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
+_WS = re.compile(r"\s+")
+
+
+def _normalise(s: str) -> str:
+    s = (s or "").lower()
+    s = s.translate(_PUNCT_TABLE)
+    s = _ARTICLES.sub(" ", s)
+    s = _WS.sub(" ", s).strip()
+    return s
+
+
+_WORD_NUMBERS = {
+    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
+    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11,
+    "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15, "sixteen": 16,
+    "seventeen": 17, "eighteen": 18, "nineteen": 19, "twenty": 20,
+}
+
+_NUMERIC_RE = re.compile(r"-?\d+(?:[.,]\d+)?")
+
+
+def _maybe_number(s: str) -> float | None:
+    """Extract a single numeric value, recognising digit and word forms.
+
+    Operates on the lowercased *raw* text (rather than the
+    punctuation-stripped normalisation) so that thousands separators
+    like ``1,234`` are preserved through the regex and parsed
+    correctly. We only fall back to ``_normalise`` for the word-number
+    pass, which doesn't care about punctuation.
+    """
+
+    raw = (s or "").strip().lower()
+    if not raw:
+        return None
+    match = _NUMERIC_RE.search(raw)
+    if match:
+        try:
+            return float(match.group(0).replace(",", ""))
+        except ValueError:
+            pass
+    for tok in _normalise(s).split():
+        if tok in _WORD_NUMBERS:
+            return float(_WORD_NUMBERS[tok])
+    return None
+
+
+def _whole_word_substring(haystack: str, needle: str) -> bool:
+    """Is ``needle`` present as a whole-word phrase in ``haystack``?"""
+
+    if not needle:
+        return False
+    pad_h = f" {haystack} "
+    pad_n = f" {needle} "
+    return pad_n in pad_h
+
+
+# ---------------------------------------------------------------------------
+# Deterministic shortcut
+# ---------------------------------------------------------------------------
+
+
+def grade_deterministic(*, pred: str, gold: str) -> GradeResult:
+    """Try to grade without the LLM judge. Returns a final-result object.
+
+    A ``False`` result with ``method == "lexical_miss"`` is the signal
+    to the caller that the LLM judge should be consulted (if available).
+    """
+
+    if not (pred or "").strip():
+        return GradeResult(False, 0.0, "empty_pred", "", _normalise(gold))
+
+    p = _normalise(pred)
+    g = _normalise(gold)
+    if not g:
+        # Defensively: gold should never be empty; if it is, we can't grade.
+        return GradeResult(False, 0.0, "empty_gold", p, g)
+
+    if p == g:
+        return GradeResult(True, 1.0, "exact", p, g)
+
+    p_num = _maybe_number(pred)
+    g_num = _maybe_number(gold)
+    if p_num is not None and g_num is not None:
+        # 1% relative tolerance, 0.5 absolute floor (handles year-ish answers).
+        tol = max(abs(g_num) * 0.01, 0.5)
+        if abs(p_num - g_num) <= tol:
+            return GradeResult(True, 1.0, "numeric", p, g)
+        return GradeResult(False, 0.0, "numeric_miss", p, g)
+
+    if _whole_word_substring(p, g):
+        return GradeResult(True, 1.0, "substring", p, g)
+    if _whole_word_substring(g, p) and len(p) >= 3:
+        # Be conservative the other direction — only credit if pred is
+        # at least 3 normalised chars (avoids "John" matching gold
+        # "John F. Kennedy" as correct).
+        return GradeResult(True, 1.0, "substring_reverse", p, g)
+
+    return GradeResult(False, 0.0, "lexical_miss", p, g)
+
+
+# ---------------------------------------------------------------------------
+# LLM-as-judge
+# ---------------------------------------------------------------------------
+
+
+_JUDGE_SYSTEM = (
+    "You are an impartial grader for short-answer factual questions. "
+    "Given a question, the gold answer, and a model's prediction, "
+    "decide whether the prediction is correct. The prediction is "
+    "correct if it expresses the same factual content as the gold "
+    "answer, allowing for paraphrasing, surface-level differences "
+    "(numbers as words, names with/without titles), and additional "
+    "non-contradictory detail. The prediction is incorrect if it "
+    "expresses a different fact, omits the central answer, or hedges "
+    "without committing.\n\n"
+    "Respond with ONLY a JSON object on a single line:\n"
+    '{\"correct\": true|false, \"rationale\": \"<one short sentence>\"}'
+)
+
+
+_JUDGE_TEMPLATE = """\
+Question: {question}
+Gold answer: {gold}
+Model prediction: {pred}
+
+Decide whether the prediction is correct.
+"""
+
+
+@dataclass
+class JudgeConfig:
+    """Configuration handed to ``LlmJudge`` at construction time."""
+
+    api_key: str
+    model: str = "anthropic/claude-sonnet-4.5"
+    base_url: str = "https://openrouter.ai/api/v1"
+    max_tokens: int = 200
+    concurrency: int = 4
+
+
+class LlmJudge:
+    """Async LLM judge over OpenRouter chat completions."""
+
+    def __init__(self, *, config: JudgeConfig) -> None:
+        self._config = config
+        self._provider = OpenRouterChatProvider(
+            api_key=config.api_key,
+            base_url=config.base_url,
+            model=config.model,
+        )
+        self._sem = asyncio.Semaphore(max(1, config.concurrency))
+
+    @property
+    def model(self) -> str:
+        return self._config.model
+
+    async def judge(
+        self,
+        *,
+        question: str,
+        gold: str,
+        pred: str,
+    ) -> tuple[bool, str]:
+        """Return ``(is_correct, rationale)``. Errors return False + reason."""
+
+        prompt = _JUDGE_TEMPLATE.format(question=question, gold=gold, pred=pred)
+        try:
+            async with self._sem:
+                response = await self._provider.complete(
+                    prompt=prompt,
+                    system_prompt=_JUDGE_SYSTEM,
+                    max_tokens=self._config.max_tokens,
+                )
+        except Exception as exc:  # noqa: BLE001
+            return False, f"judge_error: {type(exc).__name__}: {exc}"
+        return _parse_judge_response(response.text)
+
+
+def _parse_judge_response(text: str) -> tuple[bool, str]:
+    """Pull ``correct`` + ``rationale`` out of the judge's reply."""
+
+    if not text or not text.strip():
+        return False, "judge_returned_empty"
+    # Accept JSON anywhere in the message; some models prepend prose.
+    match = re.search(r"\{[^{}]*\}", text, flags=re.DOTALL)
+    candidate = match.group(0) if match else text
+    try:
+        data = json.loads(candidate)
+    except (json.JSONDecodeError, ValueError):
+        # Fallback: yes/no parsing.
+        lowered = text.strip().lower()
+        if lowered.startswith("yes") or "correct: yes" in lowered or '"correct": true' in lowered:
+            return True, "yes (parser_fallback)"
+        if lowered.startswith("no") or "correct: no" in lowered or '"correct": false' in lowered:
+            return False, "no (parser_fallback)"
+        return False, f"unparseable_judge_response: {text[:200]}"
+    correct = bool(data.get("correct"))
+    rationale = str(data.get("rationale", "")).strip()[:280]
+    return correct, rationale
+
+
+# ---------------------------------------------------------------------------
+# Combined grader
+# ---------------------------------------------------------------------------
+
+
+async def grade_with_judge(
+    *,
+    pred: str,
+    gold: str,
+    question: str,
+    judge: LlmJudge | None,
+) -> GradeResult:
+    """Grade one row: deterministic shortcut → optional LLM judge fallback."""
+
+    det = grade_deterministic(pred=pred, gold=gold)
+    if det.correct or det.method != "lexical_miss":
+        return det
+    if judge is None:
+        return det
+    is_correct, rationale = await judge.judge(question=question, gold=gold, pred=pred)
+    return GradeResult(
+        correct=is_correct,
+        f1=1.0 if is_correct else 0.0,
+        method="llm_judge",
+        normalised_pred=det.normalised_pred,
+        normalised_gold=det.normalised_gold,
+        judge_rationale=rationale,
+    )
+
+
+async def grade_many(
+    *,
+    rows: Sequence[tuple[str, str, str, str]],
+    judge: LlmJudge | None,
+) -> list[GradeResult]:
+    """Grade ``[(qid, question, gold, pred), ...]`` concurrently.
+
+    The judge already enforces its own concurrency cap; this just
+    schedules everything via ``asyncio.gather``. ``qid`` is unused
+    inside the grader but threaded through so callers can correlate
+    results back to their rows.
+    """
+
+    if not rows:
+        return []
+    coros = [
+        grade_with_judge(pred=p, gold=g, question=q, judge=judge)
+        for _qid, q, g, p in rows
+    ]
+    return list(await asyncio.gather(*coros))
+
+
+__all__ = [
+    "GradeResult",
+    "JudgeConfig",
+    "LlmJudge",
+    "grade_deterministic",
+    "grade_many",
+    "grade_with_judge",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py
new file mode 100644
index 0000000000..9780be4eda
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py
@@ -0,0 +1,341 @@
+"""FRAMES ingestion: download → fetch Wikipedia → upload markdown.
+
+Steps:
+
+1. Download ``test.tsv`` from ``hf://datasets/google/frames-benchmark``.
+2. Parse rows into ``FramesQuestion`` objects.
+3. Optionally cap to the first ``--max-questions N`` so a smoke run
+   doesn't trigger a 1k-article fetch.
+4. Build the **deduplicated** set of Wikipedia URLs across the chosen
+   sample (questions share many articles — Q1 and Q42 might both
+   reference ``James_A._Garfield``).
+5. Fetch each unique article via ``WikiFetcher`` (polite 2 RPS) into
+   ``<bench_dir>/wiki/<title>.md``.
+6. Upload the resulting markdown files to SurfSense in batches with
+   ``use_vision_llm=False, processing_mode="basic"`` (text-only — no
+   reason to pay vision LLM costs on Wikipedia plaintext).
+7. Persist a doc map at
+   ``<suite_data>/maps/frames_doc_map.jsonl`` with one row per question
+   listing its ``document_ids`` (so the runner *could* scope retrieval
+   if requested, though by default we don't — see ``runner.py``).
+
+The doc map row shape:
+
+    {"qid": "Q000",
+     "wiki_titles": ["President of the United States", "James Buchanan", ...],
+     "document_ids": [123, 124, ...],
+     "missing_titles": []}
+
+We resolve titles → SurfSense document_ids via the post-upload
+``DocumentStatus.title`` field. SurfSense's title is the uploaded
+filename (without extension), so we round-trip via
+``cache_filename_for_title`` to match.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.clients.documents import (
+    DocumentProcessingFailed,
+    DocumentProcessingTimeout,
+)
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+from .dataset import (
+    download_test_tsv,
+    load_questions,
+    write_questions_jsonl,
+)
+from .wiki_fetch import (
+    WikiArticle,
+    WikiFetcher,
+    cache_filename_for_title,
+    title_from_url,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _IngestStats:
+    n_questions: int
+    n_unique_urls: int
+    n_fetched: int
+    n_cached_hits: int
+    n_missing: int
+    n_uploaded: int
+    n_existing: int
+    bench_dir: Path
+    map_path: Path
+
+
+async def _fetch_articles(
+    fetcher: WikiFetcher,
+    urls: list[str],
+) -> tuple[dict[str, WikiArticle], list[str]]:
+    """Fetch each URL serially (the WikiFetcher's rate-limiter serialises anyway).
+
+    Returns ``(url -> WikiArticle, missing_urls)``. Missing means
+    Wikipedia reported the title doesn't exist, the URL was non-wiki,
+    or the API returned an empty extract.
+    """
+
+    fetched: dict[str, WikiArticle] = {}
+    missing: list[str] = []
+    n_total = len(urls)
+    for i, url in enumerate(urls, start=1):
+        try:
+            article = await fetcher.fetch(url)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("FRAMES wiki fetch %s failed: %s", url, exc)
+            missing.append(url)
+            continue
+        if article is None:
+            missing.append(url)
+            continue
+        fetched[url] = article
+        if i % 25 == 0 or i == n_total:
+            logger.info("  ... fetched %d / %d Wikipedia articles", i, n_total)
+    return fetched, missing
+
+
+async def _upload_markdowns(
+    ctx: RunContext,
+    articles: list[WikiArticle],
+    *,
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    """Upload deduplicated markdown files. Returns ``filename -> document_id``.
+
+    SurfSense dedupes uploads on ``(filename, search_space_id)``, so
+    re-running ingest after a crash is idempotent — duplicates land in
+    ``duplicate_document_ids`` and we still recover their ids via the
+    status endpoint.
+    """
+
+    if not articles:
+        return {}
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    paths = [a.markdown_path for a in articles]
+    for batch_start in range(0, len(paths), batch_size):
+        batch = paths[batch_start : batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if result.document_ids:
+            try:
+                await docs_client.wait_until_ready(
+                    search_space_id=ctx.search_space_id,
+                    document_ids=result.document_ids,
+                    timeout_s=900.0,
+                )
+            except (DocumentProcessingFailed, DocumentProcessingTimeout) as exc:
+                logger.warning("FRAMES batch processing issue: %s", exc)
+        if all_ids:
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                # SurfSense stores the uploaded filename as ``title`` (no extension).
+                stem = Path(s.title).stem if s.title.endswith(".md") else s.title
+                name_to_id[stem] = s.document_id
+                name_to_id[s.title] = s.document_id
+        logger.info(
+            "FRAMES upload batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+def _resolve_question_doc_ids(
+    questions: list[Any],
+    fetched: dict[str, WikiArticle],
+    name_to_id: dict[str, int],
+) -> list[dict[str, Any]]:
+    """For each question, list the document_ids of its (fetched) wiki articles."""
+
+    rows: list[dict[str, Any]] = []
+    for q in questions:
+        doc_ids: list[int] = []
+        titles: list[str] = []
+        missing: list[str] = []
+        for url in q.wiki_urls:
+            article = fetched.get(url)
+            if article is None:
+                missing.append(url)
+                continue
+            titles.append(article.title)
+            stem = Path(cache_filename_for_title(article.title)).stem
+            doc_id = name_to_id.get(stem) or name_to_id.get(article.markdown_path.name)
+            if doc_id is not None and doc_id not in doc_ids:
+                doc_ids.append(doc_id)
+        rows.append({
+            "qid": q.qid,
+            "raw_index": q.raw_index,
+            "n_wiki_urls": len(q.wiki_urls),
+            "wiki_titles": titles,
+            "document_ids": doc_ids,
+            "missing_urls": missing,
+        })
+    return rows
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    max_questions: int | None = None,
+    upload_batch_size: int = 16,
+    skip_upload: bool = False,
+    fetch_rate_limit_rps: float = 2.0,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest the FRAMES benchmark into the research suite.
+
+    Parameters
+    ----------
+    max_questions : int | None
+        Cap on the number of FRAMES questions to materialise. ``None`` =
+        all 824 (≈300+ unique articles). Smoke runs should pass 5-10.
+    upload_batch_size : int
+        Markdown files per ``/documents/fileupload`` call. Larger
+        batches reduce round-trip overhead; smaller batches recover
+        faster from individual processing failures.
+    skip_upload : bool
+        Fetch + cache Wikipedia articles locally but don't push to
+        SurfSense. Useful for debugging the fetcher in isolation.
+    fetch_rate_limit_rps : float
+        Maximum requests-per-second to the Wikipedia API. Default 2.0
+        is a polite ceiling; raise cautiously.
+    settings : IngestSettings | None
+        Override per-upload knobs. FRAMES defaults to text-only
+        (no vision LLM, basic mode) — the corpus is plain wikitext.
+    """
+
+    settings = settings or IngestSettings(
+        use_vision_llm=False,
+        processing_mode="basic",
+        should_summarize=False,
+    )
+    bench_dir = ctx.benchmark_data_dir()
+    wiki_cache = bench_dir / "wiki"
+    wiki_cache.mkdir(parents=True, exist_ok=True)
+    hf_cache = bench_dir / ".hf_cache"
+    hf_cache.mkdir(parents=True, exist_ok=True)
+
+    # 1. Download + parse questions.
+    tsv_path = download_test_tsv(hf_cache)
+    questions = load_questions(tsv_path)
+    if not questions:
+        raise RuntimeError(
+            "FRAMES test.tsv contained no parseable rows; upstream may "
+            "have changed schema."
+        )
+    logger.info("FRAMES: parsed %d questions from %s", len(questions), tsv_path.name)
+    if max_questions is not None and max_questions > 0:
+        questions = questions[:max_questions]
+        logger.info("FRAMES: capped to first %d questions", len(questions))
+
+    questions_jsonl = bench_dir / "questions.jsonl"
+    write_questions_jsonl(questions, questions_jsonl)
+
+    # 2. Build deduplicated URL set (preserving first-seen order).
+    seen_urls: dict[str, None] = {}
+    for q in questions:
+        for url in q.wiki_urls:
+            seen_urls.setdefault(url, None)
+    unique_urls = list(seen_urls.keys())
+    logger.info(
+        "FRAMES: %d unique Wikipedia URLs across %d questions",
+        len(unique_urls), len(questions),
+    )
+
+    # 3. Fetch (with cache).
+    fetcher = WikiFetcher(cache_dir=wiki_cache, rate_limit_rps=fetch_rate_limit_rps)
+    n_cached = sum(
+        1 for url in unique_urls
+        if (wiki_cache / cache_filename_for_title(_safe_title(url))).exists()
+    )
+    fetched, missing_urls = await _fetch_articles(fetcher, unique_urls)
+    logger.info(
+        "FRAMES: fetched=%d, cache_hits=%d, missing=%d",
+        len(fetched), n_cached, len(missing_urls),
+    )
+
+    # 4. Upload to SurfSense (deduped by filename).
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("FRAMES: --skip-upload; skipping SurfSense ingestion")
+    else:
+        unique_articles = list({a.markdown_path: a for a in fetched.values()}.values())
+        name_to_id = await _upload_markdowns(
+            ctx,
+            unique_articles,
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    # 5. Persist per-question doc map.
+    doc_rows = _resolve_question_doc_ids(questions, fetched, name_to_id)
+
+    map_path = ctx.maps_dir() / "frames_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        fh.write(settings_header_line(settings) + "\n")
+        for row in doc_rows:
+            fh.write(json.dumps(row) + "\n")
+    logger.info("Wrote FRAMES doc map to %s (%d rows)", map_path, len(doc_rows))
+
+    # 6. Update suite state.
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["frames"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+    stats = _IngestStats(
+        n_questions=len(questions),
+        n_unique_urls=len(unique_urls),
+        n_fetched=len(fetched),
+        n_cached_hits=n_cached,
+        n_missing=len(missing_urls),
+        n_uploaded=len(name_to_id),
+        n_existing=0,
+        bench_dir=bench_dir,
+        map_path=map_path,
+    )
+    logger.info("FRAMES ingest done: %s", stats)
+
+
+def _safe_title(url: str) -> str:
+    """Pre-cache title resolution; returns ``""`` on bad URL."""
+
+    try:
+        return title_from_url(url)
+    except ValueError:
+        return ""
+
+
+__all__ = ["run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/prompt.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/prompt.py
new file mode 100644
index 0000000000..16bb06da41
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/prompt.py
@@ -0,0 +1,71 @@
+"""FRAMES prompt templates.
+
+Two templates: one for the bare-LLM arm (no retrieval), one for
+SurfSense (the agent retrieves; we mostly just instruct it on
+output format). Both arms must use byte-identical *content* for the
+question itself so the head-to-head is fair — the wrappers diverge
+only in framing.
+
+Format expectations (mirrors the FRAMES paper, section 4):
+
+* Short factual answer — names, dates, numbers, ordinals
+* No extra explanation in the final line; we anchor on
+  ``Answer: <text>`` for deterministic extraction
+* Free-text reasoning is *allowed* before the final ``Answer:`` line —
+  multi-hop questions often benefit from it. We just don't grade it.
+"""
+
+from __future__ import annotations
+
+
+_BASE_INSTRUCTIONS = (
+    "You are a careful question-answering assistant. The question may "
+    "require combining facts from multiple sources, doing arithmetic, "
+    "or reasoning about dates. Think step by step if needed, then give "
+    "the final answer.\n\n"
+    "Format your final line EXACTLY as:\n"
+    "Answer: <short answer>\n\n"
+    "The answer should be as short as possible — a name, a number, a "
+    "date, a single phrase. Do not repeat the question. Do not include "
+    "punctuation at the end unless it is part of the answer."
+)
+
+
+_BARE_TEMPLATE = """\
+{instructions}
+
+Question: {question}
+"""
+
+
+_SURFSENSE_TEMPLATE = """\
+{instructions}
+
+You have access to a Wikipedia knowledge base via retrieval. Use it
+to look up any facts you are not confident about. The corpus contains
+the Wikipedia articles needed to answer this question, but you must
+retrieve them yourself — they are not pre-selected.
+
+Question: {question}
+"""
+
+
+def build_bare_prompt(question: str) -> str:
+    """Prompt for the no-retrieval baseline arm."""
+
+    return _BARE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+    )
+
+
+def build_surfsense_prompt(question: str) -> str:
+    """Prompt for the SurfSense arm (retrieval-augmented)."""
+
+    return _SURFSENSE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+    )
+
+
+__all__ = ["build_bare_prompt", "build_surfsense_prompt"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py
new file mode 100644
index 0000000000..a8dde0dd2c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py
@@ -0,0 +1,686 @@
+"""FRAMES runner — Bare LLM (no retrieval) vs SurfSense (multi-hop RAG).
+
+Two arms run paired on every question in the sample:
+
+1. ``BareLlmArm``  — OpenRouter chat completion with the question only.
+   Reproduces the published "naive prompting" baseline (40.8% on
+   Gemini-Pro-1.5).
+2. ``SurfSenseArm`` — POST ``/api/v1/new_chat`` with **no**
+   ``mentioned_document_ids`` so the agent retrieves over the entire
+   ingested Wikipedia corpus. This is the "multi-step retrieval &
+   reasoning" cell in the FRAMES paper.
+
+Open-ended grading: deterministic shortcut + optional LLM-as-judge
+(``--no-judge`` to disable). Cost / latency / token aggregates are
+collected per arm. Paired stats (McNemar, bootstrap CI) for the
+accuracy delta. Per-reasoning-type breakdown to surface where one
+arm beats the other (numerical vs temporal vs multi-constraint, ...).
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, BareLlmArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+from ....core.registry import ReportSection, RunArtifact, RunContext
+from ....core.scenarios import format_scenario_md
+from .grader import GradeResult, JudgeConfig, LlmJudge, grade_many
+from .prompt import build_bare_prompt, build_surfsense_prompt
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Question shape
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FramesRunnerQuestion:
+    qid: str
+    raw_index: int
+    question: str
+    gold_answer: str
+    reasoning_types: list[str]
+    document_ids: list[int]   # subset of corpus relevant to this Q (may be empty)
+    n_wiki_urls: int
+    missing_urls: list[str]
+
+
+def _load_doc_map(map_path: Path) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]:
+    rows: dict[str, dict[str, Any]] = {}
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows[str(row["qid"])] = row
+    return rows, settings
+
+
+def _load_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    sample_n: int | None,
+    reasoning_filter: str | None,
+) -> list[FramesRunnerQuestion]:
+    out: list[FramesRunnerQuestion] = []
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            qid = str(row.get("qid") or "").strip()
+            if not qid:
+                continue
+            map_row = doc_map.get(qid, {})
+            reasoning = list(row.get("reasoning_types") or [])
+            if reasoning_filter and reasoning_filter not in [r.lower() for r in reasoning]:
+                continue
+            out.append(FramesRunnerQuestion(
+                qid=qid,
+                raw_index=int(row.get("raw_index") or 0),
+                question=str(row.get("question") or "").strip(),
+                gold_answer=str(row.get("gold_answer") or "").strip(),
+                reasoning_types=reasoning,
+                document_ids=list(map_row.get("document_ids") or []),
+                n_wiki_urls=int(map_row.get("n_wiki_urls") or 0),
+                missing_urls=list(map_row.get("missing_urls") or []),
+            ))
+    out.sort(key=lambda q: q.raw_index)
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Bounded concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+_DESCRIPTION = (
+    "FRAMES (824 multi-hop Wikipedia questions, 5 reasoning types) — "
+    "Bare LLM (no retrieval) vs SurfSense (multi-step RAG over the "
+    "Wikipedia corpus). Tests cross-document retrieval + reasoning."
+)
+
+
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class FramesBenchmark:
+    """Multi-hop Wikipedia RAG vs naive prompting."""
+
+    suite: str = "research"
+    name: str = "frames"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--n", dest="sample_n", type=int, default=None,
+            help="Run only the first N questions after filters (default: all 824).",
+        )
+        parser.add_argument(
+            "--reasoning",
+            dest="reasoning_filter",
+            default=None,
+            help=(
+                "Filter to questions tagged with this reasoning type "
+                "(e.g. 'numerical reasoning', 'temporal reasoning'). "
+                "Case-insensitive substring against the upstream tags."
+            ),
+        )
+        parser.add_argument(
+            "--concurrency", type=int, default=4,
+            help="Parallel question workers per arm.",
+        )
+        parser.add_argument(
+            "--scope-mentions", dest="scope_mentions", action="store_true",
+            help=(
+                "SurfSense arm: scope retrieval to the per-question "
+                "document_ids (oracle-retrieval upper bound). Default "
+                "is full-corpus retrieval (the realistic FRAMES setting)."
+            ),
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for both arms.",
+        )
+        parser.add_argument(
+            "--no-judge", dest="no_judge", action="store_true",
+            help=(
+                "Disable LLM-as-judge fallback grading; use only the "
+                "deterministic grader (faster but more pessimistic)."
+            ),
+        )
+        parser.add_argument(
+            "--judge-model",
+            dest="judge_model",
+            default="anthropic/claude-sonnet-4.5",
+            help="OpenRouter slug for the LLM judge (default: claude-sonnet-4.5).",
+        )
+        parser.add_argument(
+            "--judge-concurrency",
+            dest="judge_concurrency",
+            type=int,
+            default=4,
+            help="Parallel judge calls (default: 4).",
+        )
+        # Ingest-only knobs.
+        parser.add_argument(
+            "--max-questions", dest="max_questions", type=int, default=None,
+            help="(ingest only) cap on number of questions to materialise + ingest.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=16,
+            help="(ingest only) markdown files per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) cache wiki articles locally but don't push to SurfSense.",
+        )
+        parser.add_argument(
+            "--fetch-rps", dest="fetch_rate_limit_rps", type=float, default=2.0,
+            help="(ingest only) max requests/second to the Wikipedia API.",
+        )
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            max_questions=opts.get("max_questions"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 16),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            fetch_rate_limit_rps=float(opts.get("fetch_rate_limit_rps") or 2.0),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        sample_n = opts.get("sample_n")
+        reasoning_filter = opts.get("reasoning_filter")
+        if reasoning_filter:
+            reasoning_filter = reasoning_filter.strip().lower() or None
+        concurrency = int(opts.get("concurrency") or 4)
+        scope_mentions = bool(opts.get("scope_mentions"))
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+        no_judge = bool(opts.get("no_judge"))
+        judge_model = str(opts.get("judge_model") or "anthropic/claude-sonnet-4.5")
+        judge_concurrency = int(opts.get("judge_concurrency") or 4)
+
+        bench_dir = ctx.benchmark_data_dir()
+        questions_jsonl = bench_dir / "questions.jsonl"
+        map_path = ctx.maps_dir() / "frames_doc_map.jsonl"
+        if not questions_jsonl.exists() or not map_path.exists():
+            raise RuntimeError(
+                "FRAMES not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest research frames` first."
+            )
+
+        doc_map, ingest_settings = _load_doc_map(map_path)
+        questions = _load_questions(
+            questions_jsonl, doc_map,
+            sample_n=sample_n,
+            reasoning_filter=reasoning_filter,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No FRAMES questions matched the filters; broaden --reasoning/--n."
+            )
+        logger.info("FRAMES: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "OPENROUTER_API_KEY env var is required for the bare-LLM arm."
+            )
+
+        bare_provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=ctx.config.openrouter_base_url,
+            model=ctx.native_arm_model,
+        )
+        bare_arm = BareLlmArm(
+            provider=bare_provider,
+            max_output_tokens=max_output_tokens,
+        )
+        surf_arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        judge: LlmJudge | None = None
+        if not no_judge:
+            judge = LlmJudge(config=JudgeConfig(
+                api_key=api_key,
+                model=judge_model,
+                base_url=ctx.config.openrouter_base_url,
+                concurrency=judge_concurrency,
+            ))
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _bare_one(q: FramesRunnerQuestion) -> ArmResult:
+            return await bare_arm.answer(_make_bare_request(q, max_output_tokens))
+
+        async def _surf_one(q: FramesRunnerQuestion) -> ArmResult:
+            return await surf_arm.answer(
+                _make_surfsense_request(q, scope_mentions=scope_mentions)
+            )
+
+        bare_results, surf_results = await asyncio.gather(
+            _gather_with_limit((_bare_one(q) for q in questions), concurrency=concurrency),
+            _gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency),
+        )
+
+        bare_grades = await _grade_results(questions, bare_results, judge=judge)
+        surf_grades = await _grade_results(questions, surf_results, judge=judge)
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, b_res, s_res, b_g, s_g in zip(
+                questions, bare_results, surf_results, bare_grades, surf_grades, strict=False
+            ):
+                meta = {
+                    "qid": q.qid,
+                    "raw_index": q.raw_index,
+                    "reasoning_types": q.reasoning_types,
+                    "n_wiki_urls": q.n_wiki_urls,
+                    "n_resolved_doc_ids": len(q.document_ids),
+                    "n_missing_urls": len(q.missing_urls),
+                    "gold": q.gold_answer,
+                }
+                fh.write(json.dumps({
+                    **meta,
+                    **b_res.to_jsonl(),
+                    "graded": b_g.to_dict(),
+                }) + "\n")
+                fh.write(json.dumps({
+                    **meta,
+                    **s_res.to_jsonl(),
+                    "graded": s_g.to_dict(),
+                }) + "\n")
+
+        metrics = _compute_metrics(questions, bare_results, surf_results, bare_grades, surf_grades)
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "reasoning_filter": reasoning_filter,
+                "scope_mentions": scope_mentions,
+                "no_judge": no_judge,
+                "judge_model": judge_model if not no_judge else None,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": ctx.native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+                "bare_arm_label": "bare_llm",
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="FRAMES — Bare LLM vs SurfSense (multi-hop Wikipedia RAG)",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        bare = m.get("bare", {})
+        surf = m.get("surfsense", {})
+        delta = m.get("delta", {})
+        per_reasoning = m.get("per_reasoning", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(reasoning filter: `{extra.get('reasoning_filter') or 'none'}`, "
+            f"scope-mentions: `{extra.get('scope_mentions', False)}`, "
+            f"judge: `{extra.get('judge_model') or 'deterministic-only'}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        body_lines.append(
+            "- Bare LLM arm (OpenRouter chat, no retrieval, "
+            f"`{extra.get('native_arm_model') or extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(bare, indent="  "))
+        body_lines.append(
+            "- SurfSense arm (`POST /api/v1/new_chat`, multi-step RAG, "
+            f"`{extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(surf, indent="  "))
+        body_lines.append("- Delta (paired):")
+        body_lines.append(
+            f"  - Accuracy: SurfSense {_pp(delta.get('accuracy_pp'))} pp "
+            f"(McNemar p={_fmt(delta.get('mcnemar_p_value'), 4)}, "
+            f"method={delta.get('mcnemar_method')})"
+        )
+        body_lines.append(
+            f"  - Bootstrap 95% CI on accuracy delta: "
+            f"[{_pp(delta.get('bootstrap_ci_low'))}pp, {_pp(delta.get('bootstrap_ci_high'))}pp]"
+        )
+        body_lines.append(
+            f"  - Cost / question: bare ${_dollars(bare.get('cost_micros_mean'))}, "
+            f"surfsense ${_dollars(surf.get('cost_micros_mean'))} "
+            f"(SurfSense delta {_pct_change(delta.get('cost_micros_pct'))})"
+        )
+        body_lines.append(
+            f"  - Latency p50: bare {_ms_to_s(bare.get('latency_ms_median'))}, "
+            f"surfsense {_ms_to_s(surf.get('latency_ms_median'))} "
+            f"(SurfSense delta {_pct_change(delta.get('latency_ms_pct'))})"
+        )
+        if per_reasoning:
+            body_lines.append("- Per-reasoning-type split (accuracy delta in pp):")
+            for tag, vals in sorted(per_reasoning.items()):
+                body_lines.append(
+                    f"  - {tag}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')}, bare acc={vals.get('bare_accuracy', 0)*100:.1f}%, "
+                    f"surf acc={vals.get('surfsense_accuracy', 0)*100:.1f}%)"
+                )
+
+        return ReportSection(
+            title="FRAMES — Bare LLM vs SurfSense (multi-hop Wikipedia RAG)",
+            headline=True,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_bare_request(q: FramesRunnerQuestion, max_tokens: int) -> ArmRequest:
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_bare_prompt(q.question),
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: FramesRunnerQuestion, *, scope_mentions: bool) -> ArmRequest:
+    mentions: list[int] | None = None
+    if scope_mentions and q.document_ids:
+        mentions = list(q.document_ids)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_surfsense_prompt(q.question),
+        mentioned_document_ids=mentions,
+    )
+
+
+async def _grade_results(
+    questions: list[FramesRunnerQuestion],
+    results: list[ArmResult],
+    *,
+    judge: LlmJudge | None,
+) -> list[GradeResult]:
+    rows: list[tuple[str, str, str, str]] = []
+    for q, r in zip(questions, results, strict=False):
+        pred = extract_freeform_answer(r.raw_text or "")
+        rows.append((q.qid, q.question, q.gold_answer, pred))
+    return await grade_many(rows=rows, judge=judge)
+
+
+# ---------------------------------------------------------------------------
+# Metrics aggregation
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[FramesRunnerQuestion],
+    bare_results: list[ArmResult],
+    surf_results: list[ArmResult],
+    bare_grades: list[GradeResult],
+    surf_grades: list[GradeResult],
+) -> dict[str, Any]:
+    bare_correct = [g.correct for g in bare_grades]
+    surf_correct = [g.correct for g in surf_grades]
+
+    bare_costs = [float(r.cost_micros) for r in bare_results]
+    surf_costs = [float(r.cost_micros) for r in surf_results]
+    bare_latencies = [float(r.latency_ms) for r in bare_results]
+    surf_latencies = [float(r.latency_ms) for r in surf_results]
+    bare_in_tokens = [float(r.input_tokens) for r in bare_results]
+    bare_out_tokens = [float(r.output_tokens) for r in bare_results]
+
+    bare_acc = accuracy_with_wilson_ci(sum(bare_correct), len(bare_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+    mc = mcnemar_test(bare_correct, surf_correct)
+    boot = bootstrap_delta_ci(bare_correct, surf_correct, n_resamples=2000)
+
+    bare_cost_agg = paired_aggregate(bare_costs)
+    surf_cost_agg = paired_aggregate(surf_costs)
+    bare_latency_agg = paired_aggregate(bare_latencies)
+    surf_latency_agg = paired_aggregate(surf_latencies)
+    cost_pct = _safe_pct(surf_cost_agg.mean, bare_cost_agg.mean)
+    latency_pct = _safe_pct(surf_latency_agg.median, bare_latency_agg.median)
+
+    # Per-reasoning-type breakdown. Each question may carry multiple
+    # reasoning tags; we count it under each tag (so totals don't
+    # equal len(questions) — the reader is expected to look at the
+    # per-tag ``n``).
+    per_reasoning_pairs: dict[str, list[tuple[bool, bool]]] = {}
+    for q, b_ok, s_ok in zip(questions, bare_correct, surf_correct, strict=False):
+        tags = q.reasoning_types or ["(untagged)"]
+        for tag in tags:
+            per_reasoning_pairs.setdefault(tag, []).append((b_ok, s_ok))
+
+    per_reasoning: dict[str, dict[str, Any]] = {}
+    for tag, pairs in per_reasoning_pairs.items():
+        b_correct = [a for a, _ in pairs]
+        s_correct = [b for _, b in pairs]
+        per_reasoning[tag] = {
+            "n": len(pairs),
+            "bare_accuracy": (sum(b_correct) / len(pairs)) if pairs else 0.0,
+            "surfsense_accuracy": (sum(s_correct) / len(pairs)) if pairs else 0.0,
+            "delta_accuracy_pp": (
+                100.0 * (sum(s_correct) - sum(b_correct)) / len(pairs)
+                if pairs else 0.0
+            ),
+        }
+
+    grader_methods = {
+        "bare": _count_methods(bare_grades),
+        "surfsense": _count_methods(surf_grades),
+    }
+
+    return {
+        "bare": {
+            **bare_acc.to_dict(),
+            "cost_micros_mean": bare_cost_agg.mean,
+            "cost_micros_median": bare_cost_agg.median,
+            "latency_ms_mean": bare_latency_agg.mean,
+            "latency_ms_median": bare_latency_agg.median,
+            "latency_ms_p95": bare_latency_agg.p95,
+            "input_tokens_mean": (sum(bare_in_tokens) / len(bare_in_tokens)) if bare_in_tokens else 0.0,
+            "output_tokens_mean": (sum(bare_out_tokens) / len(bare_out_tokens)) if bare_out_tokens else 0.0,
+        },
+        "surfsense": {
+            **surf_acc.to_dict(),
+            "cost_micros_mean": surf_cost_agg.mean,
+            "cost_micros_median": surf_cost_agg.median,
+            "latency_ms_mean": surf_latency_agg.mean,
+            "latency_ms_median": surf_latency_agg.median,
+            "latency_ms_p95": surf_latency_agg.p95,
+        },
+        "delta": {
+            "accuracy_pp": 100.0 * (surf_acc.accuracy - bare_acc.accuracy),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_bare_only": mc.b,
+            "mcnemar_c_surfsense_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+            "cost_micros_pct": cost_pct,
+            "latency_ms_pct": latency_pct,
+        },
+        "per_reasoning": per_reasoning,
+        "grader_methods": grader_methods,
+    }
+
+
+def _count_methods(grades: list[GradeResult]) -> dict[str, int]:
+    out: dict[str, int] = {}
+    for g in grades:
+        out[g.method] = out.get(g.method, 0) + 1
+    return out
+
+
+def _safe_pct(numerator: float, denominator: float) -> float | None:
+    if denominator == 0:
+        return None
+    return 100.0 * (numerator - denominator) / denominator
+
+
+# ---------------------------------------------------------------------------
+# Tiny formatting helpers used by report_section
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if "input_tokens_mean" in d:
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct_change(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.0f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+__all__ = ["FramesBenchmark", "FramesRunnerQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/wiki_fetch.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/wiki_fetch.py
new file mode 100644
index 0000000000..7f6b63e50d
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/wiki_fetch.py
@@ -0,0 +1,241 @@
+"""Wikipedia article fetcher → plain-text markdown, with disk cache.
+
+We hit the MediaWiki action API for *plain text* extracts:
+
+    GET https://en.wikipedia.org/w/api.php
+        ?action=query&prop=extracts&explaintext=true
+        &redirects=1&titles=<Title>&format=json&formatversion=2
+
+This avoids HTML→markdown conversion (and its many edge cases). The
+``explaintext=true`` mode strips infoboxes / templates / wikilinks
+and returns clean section-headered prose, which is exactly what we
+want SurfSense to chunk + embed. We prepend ``# <Title>\n\n`` so the
+markdown has a visible H1 (helps SurfSense's chunker preserve doc
+identity at the top of the first chunk).
+
+Caching: every fetched article lands in
+``<bench_dir>/wiki/<sanitised-title>.md`` and is reused on subsequent
+runs. The cache key is the URL-decoded title (e.g.
+``Charlotte_Brontë`` regardless of source URL casing or
+percent-encoding).
+
+Politeness: 2 RPS rate limit + a descriptive User-Agent (Wikimedia
+asks for one). We don't parallelise above 2 RPS — this is a courtesy
+to Wikipedia and only ~300 articles for the n=100 sample.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+import urllib.parse
+from dataclasses import dataclass
+from pathlib import Path
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+WIKI_API = "https://en.wikipedia.org/w/api.php"
+USER_AGENT = (
+    "SurfSense-Evals/0.1 (https://github.com/MODSetter/SurfSense; "
+    "research-benchmark fetch; respects 2 RPS rate limit)"
+)
+
+
+@dataclass(frozen=True)
+class WikiArticle:
+    """One fetched article + metadata."""
+
+    title: str            # canonical title returned by MW (post-redirect)
+    source_url: str       # the URL we were asked to fetch
+    markdown_path: Path   # where the cached body lives on disk
+    n_chars: int          # length of the body (post-prepend H1)
+    redirected_from: str | None = None
+
+
+# ---------------------------------------------------------------------------
+# Title <-> URL helpers
+# ---------------------------------------------------------------------------
+
+
+_WIKI_PATH_RE = re.compile(r"^/wiki/(?P<title>[^?#]+)$")
+
+
+def title_from_url(url: str) -> str:
+    """Pull the page title out of a wiki URL.
+
+    ``https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB`` → ``Charlotte Brontë``.
+    Spaces are preserved (the API accepts spaces and underscores
+    interchangeably; we use spaces to keep cache filenames human-readable).
+    """
+
+    parsed = urllib.parse.urlparse(url)
+    if parsed.netloc and "wikipedia.org" not in parsed.netloc:
+        raise ValueError(f"Not a Wikipedia URL: {url!r}")
+    match = _WIKI_PATH_RE.match(parsed.path)
+    if not match:
+        raise ValueError(f"Unrecognised wiki path: {parsed.path!r}")
+    raw_title = urllib.parse.unquote(match.group("title"))
+    # MW treats underscores and spaces as equivalent; spaces are friendlier.
+    return raw_title.replace("_", " ").strip()
+
+
+_FILENAME_SAFE = re.compile(r"[^A-Za-z0-9._\- ]")
+
+
+def cache_filename_for_title(title: str) -> str:
+    """Map a title to a filesystem-safe filename.
+
+    Replaces every non-(alnum / ``._- `` / space) character with ``_``.
+    Title collisions are rare (FRAMES only has English Wikipedia titles)
+    and a final ``hash(title)[:8]`` would obscure the otherwise-readable
+    filenames; we accept the (vanishingly small) collision risk.
+    """
+
+    safe = _FILENAME_SAFE.sub("_", title)
+    safe = safe.strip().replace(" ", "_")
+    return f"{safe}.md"
+
+
+# ---------------------------------------------------------------------------
+# Async fetcher with rate limiting + retry
+# ---------------------------------------------------------------------------
+
+
+class WikiFetcher:
+    """Polite fetch + disk cache + redirect handling."""
+
+    def __init__(
+        self,
+        *,
+        cache_dir: Path,
+        rate_limit_rps: float = 2.0,
+        timeout_s: float = 30.0,
+        max_retries: int = 3,
+    ) -> None:
+        self._cache_dir = Path(cache_dir)
+        self._cache_dir.mkdir(parents=True, exist_ok=True)
+        self._min_interval = 1.0 / max(rate_limit_rps, 0.1)
+        self._last_request_at = 0.0
+        self._rate_lock = asyncio.Lock()
+        self._timeout = httpx.Timeout(timeout_s, connect=10.0)
+        self._max_retries = max_retries
+
+    async def _throttle(self) -> None:
+        async with self._rate_lock:
+            now = asyncio.get_event_loop().time()
+            wait = self._last_request_at + self._min_interval - now
+            if wait > 0:
+                await asyncio.sleep(wait)
+            self._last_request_at = asyncio.get_event_loop().time()
+
+    async def fetch(
+        self,
+        url: str,
+        *,
+        http: httpx.AsyncClient | None = None,
+    ) -> WikiArticle | None:
+        """Fetch one article. Returns ``None`` only if MW reports the title is missing.
+
+        Raises on transport errors after retries. Caller decides
+        whether to abort the whole ingest or continue with the
+        successfully-fetched subset.
+        """
+
+        try:
+            title = title_from_url(url)
+        except ValueError as exc:
+            logger.warning("Skipping non-wiki URL %s: %s", url, exc)
+            return None
+
+        cache_path = self._cache_dir / cache_filename_for_title(title)
+        if cache_path.exists() and cache_path.stat().st_size > 0:
+            return WikiArticle(
+                title=title,
+                source_url=url,
+                markdown_path=cache_path,
+                n_chars=cache_path.stat().st_size,
+            )
+
+        last_exc: Exception | None = None
+        for attempt in range(self._max_retries):
+            try:
+                await self._throttle()
+                payload = await self._fetch_extract(title, http=http)
+                break
+            except (httpx.HTTPError, RuntimeError) as exc:
+                last_exc = exc
+                wait = 1.0 * (2 ** attempt)
+                logger.warning(
+                    "wiki fetch %r attempt %d failed: %s; retry in %.1fs",
+                    title, attempt + 1, exc, wait,
+                )
+                await asyncio.sleep(wait)
+        else:
+            assert last_exc is not None
+            raise last_exc
+
+        page = payload.get("page") or {}
+        if not page or page.get("missing"):
+            logger.warning("Wikipedia reports missing page for %r (url=%s)", title, url)
+            return None
+
+        canonical_title = str(page.get("title") or title).strip()
+        body = str(page.get("extract") or "").strip()
+        if not body:
+            logger.warning("Wikipedia returned empty extract for %r", title)
+            return None
+        markdown = f"# {canonical_title}\n\n{body}\n"
+        cache_path.write_text(markdown, encoding="utf-8")
+        return WikiArticle(
+            title=canonical_title,
+            source_url=url,
+            markdown_path=cache_path,
+            n_chars=len(markdown),
+            redirected_from=title if canonical_title != title else None,
+        )
+
+    async def _fetch_extract(
+        self,
+        title: str,
+        *,
+        http: httpx.AsyncClient | None,
+    ) -> dict:
+        """One MW API call. Returns ``{'page': {...}}`` (formatversion=2)."""
+
+        params = {
+            "action": "query",
+            "prop": "extracts",
+            "explaintext": "true",
+            "redirects": "1",
+            "format": "json",
+            "formatversion": "2",
+            "titles": title,
+        }
+        headers = {"User-Agent": USER_AGENT, "Accept": "application/json"}
+        if http is not None:
+            response = await http.get(WIKI_API, params=params, headers=headers, timeout=self._timeout)
+        else:
+            async with httpx.AsyncClient(timeout=self._timeout) as client:
+                response = await client.get(WIKI_API, params=params, headers=headers, timeout=self._timeout)
+        response.raise_for_status()
+        data = response.json()
+        if "error" in data:
+            raise RuntimeError(f"MediaWiki API error: {data['error']!r}")
+        pages = (data.get("query") or {}).get("pages") or []
+        if not pages:
+            return {"page": {}}
+        return {"page": pages[0]}
+
+
+__all__ = [
+    "WIKI_API",
+    "USER_AGENT",
+    "WikiArticle",
+    "WikiFetcher",
+    "cache_filename_for_title",
+    "title_from_url",
+]
diff --git a/surfsense_evals/tests/__init__.py b/surfsense_evals/tests/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_evals/tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_evals/tests/conftest.py b/surfsense_evals/tests/conftest.py
new file mode 100644
index 0000000000..1cb1d0faf7
--- /dev/null
+++ b/surfsense_evals/tests/conftest.py
@@ -0,0 +1,34 @@
+"""Shared pytest fixtures for surfsense-evals."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.config import Config
+
+
+@pytest.fixture
+def tmp_env(monkeypatch, tmp_path: Path) -> Path:
+    """Isolate env vars + filesystem state per test.
+
+    Wipes every ``SURFSENSE_*`` / ``OPENROUTER_*`` / ``EVAL_*`` var so a
+    test that wants a specific credential mode can ``monkeypatch.setenv``
+    just what it needs without leakage from the caller's shell.
+    """
+
+    for key in list(os.environ):
+        if key.startswith(("SURFSENSE_", "OPENROUTER_", "EVAL_")):
+            monkeypatch.delenv(key, raising=False)
+    monkeypatch.setenv("EVAL_DATA_DIR", str(tmp_path / "data"))
+    monkeypatch.setenv("EVAL_REPORTS_DIR", str(tmp_path / "reports"))
+    return tmp_path
+
+
+@pytest.fixture
+def isolated_config(tmp_env: Path) -> Config:  # noqa: ARG001
+    from surfsense_evals.core.config import load_config
+
+    return load_config()
diff --git a/surfsense_evals/tests/core/__init__.py b/surfsense_evals/tests/core/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_evals/tests/core/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_evals/tests/core/test_auth.py b/surfsense_evals/tests/core/test_auth.py
new file mode 100644
index 0000000000..43ec94b934
--- /dev/null
+++ b/surfsense_evals/tests/core/test_auth.py
@@ -0,0 +1,95 @@
+"""Auth credential resolution + 401 refresh hook."""
+
+from __future__ import annotations
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.auth import (
+    CredentialError,
+    acquire_token,
+    client_with_auth,
+)
+from surfsense_evals.core.config import Config
+
+
+def _make_config(**overrides) -> Config:
+    base = {
+        "surfsense_api_base": "http://test",
+        "openrouter_api_key": None,
+        "openrouter_base_url": "https://openrouter.ai/api/v1",
+        "surfsense_jwt": None,
+        "surfsense_refresh_token": None,
+        "surfsense_user_email": None,
+        "surfsense_user_password": None,
+        "data_dir": None,
+        "reports_dir": None,
+    }
+    base.update(overrides)
+    # Path objects required by Config; tests don't touch the FS.
+    from pathlib import Path
+
+    base["data_dir"] = base["data_dir"] or Path("/tmp/eval_test_data")
+    base["reports_dir"] = base["reports_dir"] or Path("/tmp/eval_test_reports")
+    return Config(**base)
+
+
+@pytest.mark.asyncio
+async def test_acquire_token_jwt_mode_short_circuits():
+    config = _make_config(surfsense_jwt="abc", surfsense_refresh_token="ref")
+    bundle = await acquire_token(config)
+    assert bundle.access_token == "abc"
+    assert bundle.refresh_token == "ref"
+    assert bundle.mode == "jwt"
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_acquire_token_local_mode_posts_form():
+    respx.post("http://test/auth/jwt/login").mock(
+        return_value=httpx.Response(
+            200, json={"access_token": "T", "refresh_token": "R", "token_type": "bearer"}
+        )
+    )
+    config = _make_config(
+        surfsense_user_email="u@example.com", surfsense_user_password="pw"
+    )
+    bundle = await acquire_token(config)
+    assert bundle.access_token == "T"
+    assert bundle.refresh_token == "R"
+    assert bundle.mode == "local"
+
+
+@pytest.mark.asyncio
+async def test_acquire_token_no_credentials():
+    config = _make_config()
+    with pytest.raises(CredentialError) as exc:
+        await acquire_token(config)
+    assert "SURFSENSE_USER_EMAIL" in str(exc.value)
+    assert "SURFSENSE_JWT" in str(exc.value)
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_client_with_auth_refreshes_on_401():
+    config = _make_config(surfsense_jwt="old", surfsense_refresh_token="ref")
+    bundle = await acquire_token(config)
+
+    respx.post("http://test/auth/jwt/refresh").mock(
+        return_value=httpx.Response(200, json={"access_token": "new", "refresh_token": "ref2"})
+    )
+    # First call returns 401; the retry (post-refresh) returns 200.
+    respx.get("http://test/api/v1/searchspaces").mock(
+        side_effect=[
+            httpx.Response(401, json={"detail": "expired"}),
+            httpx.Response(200, json=[]),
+        ]
+    )
+
+    async with client_with_auth(config, bundle) as client:
+        response = await client.get("http://test/api/v1/searchspaces")
+
+    assert response.status_code == 200
+    assert bundle.access_token == "new"
+    assert bundle.refresh_token == "ref2"
diff --git a/surfsense_evals/tests/core/test_clients.py b/surfsense_evals/tests/core/test_clients.py
new file mode 100644
index 0000000000..9e2c4ad75a
--- /dev/null
+++ b/surfsense_evals/tests/core/test_clients.py
@@ -0,0 +1,262 @@
+"""respx-mocked tests for the SurfSense HTTP clients."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.clients import (
+    DocumentsClient,
+    NewChatClient,
+    SearchSpaceClient,
+)
+from surfsense_evals.core.clients.new_chat import ThreadBusyError
+
+_BASE = "http://test"
+
+
+@pytest.fixture
+def http() -> httpx.AsyncClient:
+    return httpx.AsyncClient(base_url=_BASE)
+
+
+# ---------------------------------------------------------------------------
+# SearchSpaceClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_create_search_space_returns_row(respx_mock, http):
+    respx_mock.post("/api/v1/searchspaces").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "id": 99,
+                "name": "eval-medical-2026",
+                "description": None,
+                "user_id": "user-x",
+                "citations_enabled": True,
+                "qna_custom_instructions": None,
+            },
+        )
+    )
+    client = SearchSpaceClient(http, _BASE)
+    row = await client.create("eval-medical-2026")
+    assert row.id == 99
+    assert row.name == "eval-medical-2026"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_delete_search_space_idempotent_on_404(respx_mock, http):
+    respx_mock.delete("/api/v1/searchspaces/42").mock(
+        return_value=httpx.Response(404, json={"detail": "gone"})
+    )
+    client = SearchSpaceClient(http, _BASE)
+    await client.delete(42)  # must not raise
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_set_llm_preferences_partial_update(respx_mock, http):
+    route = respx_mock.put("/api/v1/search-spaces/42/llm-preferences").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "agent_llm_id": -10042,
+                "document_summary_llm_id": None,
+                "image_generation_config_id": None,
+                "vision_llm_config_id": None,
+                "agent_llm": {
+                    "id": -10042,
+                    "provider": "OPENROUTER",
+                    "model_name": "anthropic/claude-sonnet-4.5",
+                },
+            },
+        )
+    )
+    client = SearchSpaceClient(http, _BASE)
+    prefs = await client.set_llm_preferences(42, agent_llm_id=-10042)
+    assert prefs.agent_llm_id == -10042
+    assert prefs.agent_llm["provider"] == "OPENROUTER"
+    sent_body = json.loads(route.calls[-1].request.content)
+    assert sent_body == {"agent_llm_id": -10042}
+
+
+# ---------------------------------------------------------------------------
+# DocumentsClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_status_parses_state(respx_mock, http):
+    respx_mock.get("/api/v1/documents/status").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "items": [
+                    {"id": 1, "title": "a.pdf", "document_type": "FILE",
+                     "status": {"state": "ready", "reason": None}},
+                    {"id": 2, "title": "b.pdf", "document_type": "FILE",
+                     "status": {"state": "failed", "reason": "ETL boom"}},
+                ]
+            },
+        )
+    )
+    client = DocumentsClient(http, _BASE)
+    statuses = await client.get_status(search_space_id=1, document_ids=[1, 2])
+    assert {s.document_id for s in statuses} == {1, 2}
+    assert {s.is_ready for s in statuses} == {True, False}
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_upload_returns_payload(respx_mock, http, tmp_path: Path):
+    f1 = tmp_path / "a.pdf"
+    f1.write_bytes(b"%PDF-1.4 small")
+    respx_mock.post("/api/v1/documents/fileupload").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "message": "Files uploaded",
+                "document_ids": [101],
+                "duplicate_document_ids": [],
+                "total_files": 1,
+                "pending_files": 1,
+                "skipped_duplicates": 0,
+            },
+        )
+    )
+    client = DocumentsClient(http, _BASE)
+    result = await client.upload(files=[f1], search_space_id=7)
+    assert result.document_ids == [101]
+    assert result.pending_files == 1
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_list_chunks_paginated(respx_mock, http):
+    respx_mock.get("/api/v1/documents/5/chunks").mock(
+        side_effect=[
+            httpx.Response(200, json={
+                "items": [{"id": 1, "content": "a"}, {"id": 2, "content": "b"}],
+                "total": 3, "page": 0, "page_size": 2, "has_more": True,
+            }),
+            httpx.Response(200, json={
+                "items": [{"id": 3, "content": "c"}],
+                "total": 3, "page": 1, "page_size": 2, "has_more": False,
+            }),
+        ]
+    )
+    client = DocumentsClient(http, _BASE)
+    rows = await client.list_chunks(5, page_size=2)
+    assert [r.id for r in rows] == [1, 2, 3]
+
+
+# ---------------------------------------------------------------------------
+# NewChatClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_create_thread_returns_id(respx_mock, http):
+    respx_mock.post("/api/v1/threads").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "id": 555,
+                "title": "eval",
+                "archived": False,
+                "visibility": "PRIVATE",
+                "search_space_id": 1,
+                "messages": [],
+                "created_at": "2026-05-11T00:00:00Z",
+                "updated_at": "2026-05-11T00:00:00Z",
+            },
+        )
+    )
+    client = NewChatClient(http, _BASE)
+    tid = await client.create_thread(search_space_id=1)
+    assert tid == 555
+
+
+def _sse_body(events: list[dict]) -> bytes:
+    parts = []
+    for ev in events:
+        parts.append(f"data: {json.dumps(ev)}\n\n")
+    parts.append("data: [DONE]\n\n")
+    return "".join(parts).encode("utf-8")
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_accumulates_text_deltas(respx_mock, http):
+    body = _sse_body([
+        {"type": "start", "messageId": "m1"},
+        {"type": "text-start", "id": "t1"},
+        {"type": "text-delta", "id": "t1", "delta": "Answer "},
+        {"type": "text-delta", "id": "t1", "delta": "is "},
+        {"type": "text-delta", "id": "t1", "delta": "B [citation:42]."},
+        {"type": "text-end", "id": "t1"},
+        {"type": "finish"},
+    ])
+    respx_mock.post("/api/v1/new_chat").mock(
+        return_value=httpx.Response(
+            200,
+            content=body,
+            headers={"Content-Type": "text/event-stream"},
+        )
+    )
+    client = NewChatClient(http, _BASE)
+    answer = await client.ask(
+        thread_id=1, search_space_id=2, user_query="What is the answer?"
+    )
+    assert answer.text == "Answer is B [citation:42]."
+    assert answer.finished_normally is True
+    assert any(c["chunk_id"] == 42 for c in answer.citations)
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_409_thread_busy_retries(respx_mock, http):
+    body = _sse_body([
+        {"type": "text-delta", "id": "t1", "delta": "ok"},
+        {"type": "finish"},
+    ])
+    busy = httpx.Response(
+        409,
+        json={"detail": {"errorCode": "THREAD_BUSY", "message": "busy"}},
+        headers={"Retry-After": "1"},
+    )
+    success = httpx.Response(
+        200, content=body, headers={"Content-Type": "text/event-stream"}
+    )
+    respx_mock.post("/api/v1/new_chat").mock(side_effect=[busy, success])
+    client = NewChatClient(http, _BASE)
+    answer = await client.ask(
+        thread_id=1, search_space_id=2, user_query="hi", max_busy_retries=2
+    )
+    assert answer.text == "ok"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_409_exhausts_retries(respx_mock, http):
+    busy = httpx.Response(
+        409,
+        json={"detail": {"errorCode": "TURN_CANCELLING", "message": "wait"}},
+        headers={"Retry-After": "1"},
+    )
+    respx_mock.post("/api/v1/new_chat").mock(return_value=busy)
+    client = NewChatClient(http, _BASE)
+    with pytest.raises(ThreadBusyError):
+        await client.ask(
+            thread_id=1, search_space_id=2, user_query="hi", max_busy_retries=1
+        )
diff --git a/surfsense_evals/tests/core/test_config.py b/surfsense_evals/tests/core/test_config.py
new file mode 100644
index 0000000000..f7b8f72495
--- /dev/null
+++ b/surfsense_evals/tests/core/test_config.py
@@ -0,0 +1,160 @@
+"""Tests for env loading + state.json read/write."""
+
+from __future__ import annotations
+
+import json
+
+from surfsense_evals.core.config import (
+    DEFAULT_SCENARIO,
+    SCENARIOS,
+    SuiteState,
+    clear_suite_state,
+    get_suite_state,
+    load_config,
+    set_suite_state,
+)
+
+
+def test_load_config_defaults_to_localhost(tmp_env):  # noqa: ARG001
+    config = load_config()
+    assert config.surfsense_api_base == "http://localhost:8000"
+    assert config.has_jwt_mode() is False
+    assert config.has_local_mode() is False
+    assert config.credential_mode() == "none"
+
+
+def test_load_config_picks_up_jwt_env(tmp_env, monkeypatch):  # noqa: ARG001
+    monkeypatch.setenv("SURFSENSE_JWT", "tok")
+    config = load_config()
+    assert config.credential_mode() == "jwt"
+
+
+def test_load_config_picks_up_local_env(tmp_env, monkeypatch):  # noqa: ARG001
+    monkeypatch.setenv("SURFSENSE_USER_EMAIL", "u@x.com")
+    monkeypatch.setenv("SURFSENSE_USER_PASSWORD", "pw")
+    config = load_config()
+    assert config.credential_mode() == "local"
+
+
+def test_state_roundtrip_per_suite(tmp_env):  # noqa: ARG001
+    config = load_config()
+    assert get_suite_state(config, "medical") is None
+    state = SuiteState(
+        search_space_id=1,
+        agent_llm_id=-10042,
+        provider_model="anthropic/claude-sonnet-4.5",
+        created_at="2026-05-11T20-30-00Z",
+    )
+    set_suite_state(config, "medical", state)
+    legal = SuiteState(
+        search_space_id=2,
+        agent_llm_id=-1,
+        provider_model="openai/gpt-5",
+        created_at="2026-05-11T21-00-00Z",
+    )
+    set_suite_state(config, "legal", legal)
+
+    fetched = get_suite_state(config, "medical")
+    assert fetched.search_space_id == 1
+    assert fetched.provider_model == "anthropic/claude-sonnet-4.5"
+
+    # Other suite untouched after teardown.
+    cleared = clear_suite_state(config, "medical")
+    assert cleared is True
+    assert get_suite_state(config, "medical") is None
+    assert get_suite_state(config, "legal").search_space_id == 2
+
+    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
+    assert "medical" not in raw["suites"]
+    assert "legal" in raw["suites"]
+
+
+def test_paths_are_per_suite(tmp_env):  # noqa: ARG001
+    config = load_config()
+    a = config.suite_data_dir("medical")
+    b = config.suite_data_dir("legal")
+    assert a != b
+    assert config.suite_reports_dir("medical").parent == config.reports_dir
+    assert config.suite_runs_dir("medical").name == "runs"
+    assert config.suite_maps_dir("medical").name == "maps"
+
+
+# ---------------------------------------------------------------------------
+# Scenario state — back-compat + new fields
+# ---------------------------------------------------------------------------
+
+
+def test_legacy_state_back_compat_defaults_to_head_to_head():
+    """state.json files written before scenarios shipped must still load.
+
+    Missing ``scenario`` / ``vision_*`` / ``native_arm_model`` keys all
+    default to ``head-to-head`` / ``None`` so old setups keep working
+    after upgrade — the runner's behaviour exactly mirrors the legacy
+    one (both arms answer with ``provider_model``).
+    """
+
+    legacy = {
+        "search_space_id": 7,
+        "agent_llm_id": -123,
+        "provider_model": "anthropic/claude-sonnet-4.5",
+        "created_at": "2026-05-11T20-30-00Z",
+        "ingestion_maps": {},
+    }
+    state = SuiteState.from_dict(legacy)
+    assert state.scenario == DEFAULT_SCENARIO == "head-to-head"
+    assert state.vision_llm_config_id is None
+    assert state.vision_provider_model is None
+    assert state.native_arm_model is None
+    # The native arm should still answer with the same slug as SurfSense.
+    assert state.effective_native_arm_model == state.provider_model
+
+
+def test_unknown_scenario_falls_back_to_default():
+    """Garbage scenario in state.json → default, not crash.
+
+    Defensive: we'd rather a stale state file render with the safe
+    head-to-head behaviour than break the whole run with a KeyError.
+    """
+
+    payload = {
+        "search_space_id": 1,
+        "agent_llm_id": -1,
+        "provider_model": "openai/gpt-5",
+        "scenario": "unknown-scenario-name",
+    }
+    state = SuiteState.from_dict(payload)
+    assert state.scenario == DEFAULT_SCENARIO
+
+
+def test_cost_arbitrage_state_persists_native_arm_model(tmp_env):  # noqa: ARG001
+    config = load_config()
+    state = SuiteState(
+        search_space_id=42,
+        agent_llm_id=-1,
+        provider_model="openai/gpt-5.4-mini",
+        created_at="2026-05-11T20-30-00Z",
+        scenario="cost-arbitrage",
+        vision_llm_config_id=-101,
+        vision_provider_model="anthropic/claude-sonnet-4.5",
+        native_arm_model="anthropic/claude-sonnet-4.5",
+    )
+    set_suite_state(config, "medical", state)
+
+    fetched = get_suite_state(config, "medical")
+    assert fetched.scenario == "cost-arbitrage"
+    assert fetched.vision_llm_config_id == -101
+    assert fetched.vision_provider_model == "anthropic/claude-sonnet-4.5"
+    assert fetched.native_arm_model == "anthropic/claude-sonnet-4.5"
+    # Cost arbitrage's whole point: native arm slug != surfsense slug.
+    assert fetched.effective_native_arm_model != fetched.provider_model
+    assert fetched.effective_native_arm_model == "anthropic/claude-sonnet-4.5"
+
+    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
+    assert raw["suites"]["medical"]["scenario"] == "cost-arbitrage"
+
+
+def test_scenario_constants_are_stable():
+    """Pin the public scenario list; runners + tests key off these strings."""
+
+    assert SCENARIOS == ("head-to-head", "symmetric-cheap", "cost-arbitrage")
+    assert DEFAULT_SCENARIO == "head-to-head"
diff --git a/surfsense_evals/tests/core/test_ingest_settings.py b/surfsense_evals/tests/core/test_ingest_settings.py
new file mode 100644
index 0000000000..acfac57a6a
--- /dev/null
+++ b/surfsense_evals/tests/core/test_ingest_settings.py
@@ -0,0 +1,269 @@
+"""Unit tests for ``surfsense_evals.core.ingest_settings``.
+
+Covers:
+
+* ``IngestSettings.merge`` honours operator overrides and falls back
+  to per-benchmark defaults when the operator is silent.
+* ``add_ingest_settings_args`` exposes the three flag pairs and
+  argparse defaults of ``None`` correctly distinguish "not passed"
+  from "explicitly false".
+* ``settings_header_line`` / ``read_settings_header`` round-trip
+  through a JSONL file.
+* ``read_settings_header`` is fault-tolerant: missing files, missing
+  header, malformed JSON.
+* ``format_ingest_settings_md`` produces a stable Markdown bullet.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.ingest_settings import (
+    PROCESSING_MODES,
+    SETTINGS_HEADER_KEY,
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+    read_settings_header,
+    settings_header_line,
+)
+
+# ---------------------------------------------------------------------------
+# IngestSettings.merge
+# ---------------------------------------------------------------------------
+
+
+class TestMerge:
+    def test_silent_operator_uses_defaults(self) -> None:
+        defaults = IngestSettings(use_vision_llm=True, processing_mode="basic", should_summarize=True)
+        merged = IngestSettings.merge(defaults, {})
+        assert merged == defaults
+
+    def test_explicit_false_overrides_default_true(self) -> None:
+        defaults = IngestSettings(use_vision_llm=True)
+        merged = IngestSettings.merge(
+            defaults, {"use_vision_llm": False}
+        )
+        assert merged.use_vision_llm is False
+
+    def test_explicit_true_overrides_default_false(self) -> None:
+        defaults = IngestSettings(use_vision_llm=False)
+        merged = IngestSettings.merge(
+            defaults, {"use_vision_llm": True}
+        )
+        assert merged.use_vision_llm is True
+
+    def test_none_means_silent(self) -> None:
+        # Argparse with BooleanOptionalAction yields None when the
+        # operator passed neither --use-vision-llm nor --no-vision-llm.
+        defaults = IngestSettings(use_vision_llm=True)
+        merged = IngestSettings.merge(
+            defaults, {"use_vision_llm": None}
+        )
+        assert merged.use_vision_llm is True
+
+    def test_processing_mode_override(self) -> None:
+        defaults = IngestSettings(processing_mode="basic")
+        merged = IngestSettings.merge(
+            defaults, {"processing_mode": "premium"}
+        )
+        assert merged.processing_mode == "premium"
+
+    def test_processing_mode_invalid_raises(self) -> None:
+        defaults = IngestSettings(processing_mode="basic")
+        with pytest.raises(ValueError, match="Invalid processing_mode"):
+            IngestSettings.merge(defaults, {"processing_mode": "exotic"})
+
+    def test_processing_mode_blank_falls_back(self) -> None:
+        defaults = IngestSettings(processing_mode="basic")
+        merged = IngestSettings.merge(defaults, {"processing_mode": ""})
+        assert merged.processing_mode == "basic"
+
+    def test_string_truthy_coerced(self) -> None:
+        defaults = IngestSettings(use_vision_llm=False)
+        merged = IngestSettings.merge(defaults, {"use_vision_llm": "yes"})
+        assert merged.use_vision_llm is True
+
+    def test_string_falsy_coerced(self) -> None:
+        defaults = IngestSettings(use_vision_llm=True)
+        merged = IngestSettings.merge(defaults, {"use_vision_llm": "false"})
+        assert merged.use_vision_llm is False
+
+    def test_other_keys_ignored(self) -> None:
+        # Benchmarks pass the whole opts dict; merge must tolerate
+        # unrelated keys without crashing.
+        defaults = IngestSettings(use_vision_llm=True, processing_mode="basic")
+        merged = IngestSettings.merge(
+            defaults,
+            {
+                "use_vision_llm": False,
+                "concurrency": 4,
+                "task_filter": "all",
+                "no_mentions": True,
+            },
+        )
+        assert merged.use_vision_llm is False
+        assert merged.processing_mode == "basic"
+
+    def test_to_dict_round_trips(self) -> None:
+        s = IngestSettings(use_vision_llm=True, processing_mode="premium", should_summarize=False)
+        d = s.to_dict()
+        assert d == {
+            "use_vision_llm": True,
+            "processing_mode": "premium",
+            "should_summarize": False,
+        }
+
+    def test_render_label_format(self) -> None:
+        s = IngestSettings(use_vision_llm=True, processing_mode="premium", should_summarize=True)
+        assert s.render_label() == "vision=on, mode=premium, summarize=on"
+
+
+# ---------------------------------------------------------------------------
+# add_ingest_settings_args
+# ---------------------------------------------------------------------------
+
+
+class TestAddArgs:
+    @pytest.fixture
+    def parser(self) -> argparse.ArgumentParser:
+        p = argparse.ArgumentParser()
+        add_ingest_settings_args(
+            p,
+            defaults=IngestSettings(
+                use_vision_llm=False, processing_mode="basic", should_summarize=False
+            ),
+        )
+        return p
+
+    def test_silent_invocation_yields_none(self, parser: argparse.ArgumentParser) -> None:
+        args = parser.parse_args([])
+        assert args.use_vision_llm is None
+        assert args.processing_mode is None
+        assert args.should_summarize is None
+
+    def test_use_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
+        args = parser.parse_args(["--use-vision-llm"])
+        assert args.use_vision_llm is True
+
+    def test_no_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
+        args = parser.parse_args(["--no-vision-llm"])
+        assert args.use_vision_llm is False
+
+    def test_processing_mode_choices(self, parser: argparse.ArgumentParser) -> None:
+        for mode in PROCESSING_MODES:
+            args = parser.parse_args(["--processing-mode", mode])
+            assert args.processing_mode == mode
+
+    def test_processing_mode_rejects_unknown(
+        self, parser: argparse.ArgumentParser
+    ) -> None:
+        with pytest.raises(SystemExit):
+            parser.parse_args(["--processing-mode", "exotic"])
+
+    def test_summarize_flag_pair(self, parser: argparse.ArgumentParser) -> None:
+        on = parser.parse_args(["--should-summarize"])
+        assert on.should_summarize is True
+        off = parser.parse_args(["--no-summarize"])
+        assert off.should_summarize is False
+
+    def test_vision_flags_mutually_exclusive(
+        self, parser: argparse.ArgumentParser
+    ) -> None:
+        with pytest.raises(SystemExit):
+            parser.parse_args(["--use-vision-llm", "--no-vision-llm"])
+
+    def test_full_pipeline(self, parser: argparse.ArgumentParser) -> None:
+        # Operator passes flags + defaults are reasonable. Merge
+        # should yield exactly what they asked for.
+        args = parser.parse_args(
+            ["--use-vision-llm", "--processing-mode", "premium"]
+        )
+        defaults = IngestSettings(
+            use_vision_llm=False, processing_mode="basic", should_summarize=False
+        )
+        merged = IngestSettings.merge(defaults, vars(args))
+        assert merged == IngestSettings(
+            use_vision_llm=True, processing_mode="premium", should_summarize=False
+        )
+
+
+# ---------------------------------------------------------------------------
+# Header round-trip + read_settings_header fault tolerance
+# ---------------------------------------------------------------------------
+
+
+class TestHeader:
+    def test_header_line_round_trip(self, tmp_path: Path) -> None:
+        s = IngestSettings(use_vision_llm=True, processing_mode="premium")
+        path = tmp_path / "map.jsonl"
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write(settings_header_line(s) + "\n")
+            fh.write(json.dumps({"case_id": "x", "document_id": 1}) + "\n")
+        loaded = read_settings_header(path)
+        assert loaded == s.to_dict()
+
+    def test_is_settings_header_recognises(self) -> None:
+        assert is_settings_header({SETTINGS_HEADER_KEY: {}})
+        assert not is_settings_header({"case_id": "x"})
+
+    def test_missing_file_returns_empty(self, tmp_path: Path) -> None:
+        assert read_settings_header(tmp_path / "does_not_exist.jsonl") == {}
+
+    def test_empty_file_returns_empty(self, tmp_path: Path) -> None:
+        path = tmp_path / "empty.jsonl"
+        path.write_text("", encoding="utf-8")
+        assert read_settings_header(path) == {}
+
+    def test_no_header_returns_empty(self, tmp_path: Path) -> None:
+        path = tmp_path / "legacy.jsonl"
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write(json.dumps({"case_id": "x", "document_id": 1}) + "\n")
+            fh.write(json.dumps({"case_id": "y", "document_id": 2}) + "\n")
+        assert read_settings_header(path) == {}
+
+    def test_malformed_json_returns_empty(self, tmp_path: Path) -> None:
+        path = tmp_path / "broken.jsonl"
+        path.write_text("not json\n", encoding="utf-8")
+        assert read_settings_header(path) == {}
+
+    def test_skips_blank_first_lines(self, tmp_path: Path) -> None:
+        s = IngestSettings(use_vision_llm=True)
+        path = tmp_path / "padded.jsonl"
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write("\n\n")
+            fh.write(settings_header_line(s) + "\n")
+        assert read_settings_header(path) == s.to_dict()
+
+
+# ---------------------------------------------------------------------------
+# format_ingest_settings_md
+# ---------------------------------------------------------------------------
+
+
+class TestFormatMd:
+    def test_full_settings(self) -> None:
+        out = format_ingest_settings_md(
+            {"use_vision_llm": True, "processing_mode": "premium", "should_summarize": True}
+        )
+        assert "vision_llm=`on`" in out
+        assert "processing_mode=`premium`" in out
+        assert "summarize=`on`" in out
+
+    def test_default_off(self) -> None:
+        out = format_ingest_settings_md(
+            {"use_vision_llm": False, "processing_mode": "basic", "should_summarize": False}
+        )
+        assert "vision_llm=`off`" in out
+        assert "processing_mode=`basic`" in out
+        assert "summarize=`off`" in out
+
+    def test_missing_returns_re_ingest_hint(self) -> None:
+        # Empty dict + None + non-mapping should all degrade gracefully.
+        for raw in [None, {}, "not-a-mapping"]:
+            assert "(not recorded" in format_ingest_settings_md(raw)
diff --git a/surfsense_evals/tests/core/test_metrics.py b/surfsense_evals/tests/core/test_metrics.py
new file mode 100644
index 0000000000..cde1bb9577
--- /dev/null
+++ b/surfsense_evals/tests/core/test_metrics.py
@@ -0,0 +1,153 @@
+"""Metric correctness — Wilson, McNemar, retrieval scores."""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from surfsense_evals.core.metrics import (
+    accuracy_with_wilson_ci,
+    bootstrap_delta_ci,
+    mcnemar_test,
+    mrr,
+    ndcg_at_k,
+    recall_at_k,
+    score_run,
+    wilson_ci,
+)
+
+# ---------------------------------------------------------------------------
+# Wilson
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "k,n,low,high",
+    [
+        (80, 100, 0.7111, 0.8666),  # cross-checked vs statsmodels.proportion_confint(method='wilson')
+        (50, 100, 0.4038, 0.5962),
+        (0, 0, 0.0, 1.0),
+        (0, 10, 0.0, 0.2775),
+        (10, 10, 0.7225, 1.0),
+    ],
+)
+def test_wilson_ci_known_values(k, n, low, high):
+    result_low, result_high = wilson_ci(k, n)
+    assert math.isclose(result_low, low, abs_tol=5e-4), (k, n, result_low, low)
+    assert math.isclose(result_high, high, abs_tol=5e-4), (k, n, result_high, high)
+
+
+def test_accuracy_with_wilson_ci_object():
+    res = accuracy_with_wilson_ci(70, 100)
+    assert res.accuracy == 0.7
+    assert 0.0 < res.ci_low < res.ci_high < 1.0
+
+
+def test_invalid_inputs_raise():
+    with pytest.raises(ValueError):
+        accuracy_with_wilson_ci(-1, 10)
+    with pytest.raises(ValueError):
+        accuracy_with_wilson_ci(11, 10)
+
+
+# ---------------------------------------------------------------------------
+# McNemar
+# ---------------------------------------------------------------------------
+
+
+def test_mcnemar_degenerate_returns_p_value_one():
+    a = [True, True, False, False]
+    b = [True, True, False, False]
+    res = mcnemar_test(a, b)
+    assert res.b == 0 and res.c == 0
+    assert res.p_value == 1.0
+    assert res.method == "degenerate"
+
+
+def test_mcnemar_exact_branch_strong_signal():
+    """B = 0, C = 10 → exact two-sided binomial p == 2 * (1/2)**10."""
+
+    a = [True] * 10 + [False] * 10
+    b = [True] * 10 + [True] * 10  # surfsense beats native on the 10 native-wrong
+    res = mcnemar_test(a, b)
+    assert res.b == 0
+    assert res.c == 10
+    assert res.method == "exact"
+    expected = 2 * (0.5 ** 10)
+    assert math.isclose(res.p_value, expected, rel_tol=1e-9)
+
+
+def test_mcnemar_chi_square_approx_for_large_discordant():
+    # Construct b=15, c=5 with continuity-corrected chi^2 = (|10|-1)^2/20 = 4.05.
+    a = [True] * 15 + [False] * 5 + [True] * 30 + [False] * 30
+    b = [False] * 15 + [True] * 5 + [True] * 30 + [False] * 30
+    res = mcnemar_test(a, b)
+    assert res.method == "chi2_cc"
+    assert res.b == 15 and res.c == 5
+    assert math.isclose(res.statistic, ((abs(15 - 5) - 1) ** 2) / 20.0, rel_tol=1e-9)
+    # p ≈ chi2.sf(4.05, df=1) ≈ 0.04417
+    assert 0.04 < res.p_value < 0.05
+
+
+def test_mcnemar_length_mismatch():
+    with pytest.raises(ValueError):
+        mcnemar_test([True], [True, False])
+
+
+# ---------------------------------------------------------------------------
+# Bootstrap
+# ---------------------------------------------------------------------------
+
+
+def test_bootstrap_delta_ci_shape_and_determinism():
+    a = [True, True, False, True, False, False, True, True]
+    b = [True, True, True, True, True, False, True, False]
+    res1 = bootstrap_delta_ci(a, b, n_resamples=500, random_state=42)
+    res2 = bootstrap_delta_ci(a, b, n_resamples=500, random_state=42)
+    assert res1.delta == res2.delta
+    assert res1.ci_low == res2.ci_low
+    assert res1.ci_high == res2.ci_high
+    assert res1.ci_low <= res1.delta <= res1.ci_high
+    assert res1.n_resamples == 500
+
+
+# ---------------------------------------------------------------------------
+# Retrieval
+# ---------------------------------------------------------------------------
+
+
+def test_recall_at_k():
+    retrieved = ["a", "b", "c", "d"]
+    relevant = ["b", "d", "z"]
+    assert recall_at_k(retrieved, relevant, k=2) == pytest.approx(1 / 3)
+    assert recall_at_k(retrieved, relevant, k=4) == pytest.approx(2 / 3)
+
+
+def test_mrr():
+    assert mrr(["a", "b", "c"], ["c"]) == pytest.approx(1 / 3)
+    assert mrr(["x", "y"], ["z"]) == 0.0
+
+
+def test_ndcg_at_k_perfect_order():
+    qrels = {"a": 2, "b": 1}
+    assert ndcg_at_k(["a", "b"], qrels, k=2) == pytest.approx(1.0)
+
+
+def test_ndcg_at_k_irrelevant_first():
+    qrels = {"a": 2, "b": 1}
+    # Wrong order should still be > 0 but < 1
+    val = ndcg_at_k(["c", "a", "b"], qrels, k=3)
+    assert 0 < val < 1
+
+
+def test_score_run_aggregates_across_queries():
+    scores = score_run(
+        per_query_retrieved={"q1": ["a", "b"], "q2": ["x", "y", "z"]},
+        per_query_qrels={"q1": {"a": 1}, "q2": {"z": 2}},
+        ks=(1, 5),
+        ndcg_k=5,
+    )
+    assert scores.n_queries == 2
+    assert scores.recall_at_k[1] == pytest.approx((1 + 0) / 2)  # q1 hits @1, q2 doesn't
+    assert scores.mrr == pytest.approx((1.0 + 1 / 3) / 2)
diff --git a/surfsense_evals/tests/core/test_parse_answer_letter.py b/surfsense_evals/tests/core/test_parse_answer_letter.py
new file mode 100644
index 0000000000..5adbf4bc33
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_answer_letter.py
@@ -0,0 +1,27 @@
+"""Tests for the MCQ answer-letter extractor."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import extract_answer_letter
+from surfsense_evals.core.parse.answer_letter import AnswerLetterResult
+
+
+@pytest.mark.parametrize(
+    "text,expected_letter,expected_strategy",
+    [
+        ('```json\n{"step_by_step_thinking": "...", "answer_choice": "B"}\n```', "B", "json_envelope"),
+        ('Reasoning... {"step_by_step_thinking": "x", "answer_choice": "C"}', "C", "json_envelope"),
+        ("Long reasoning.\nAnswer: D", "D", "answer_line"),
+        ("The correct answer is (A).", "A", "answer_line"),
+        ("Final answer: e", "E", "answer_line"),
+        ("Long reasoning.\n\nB", "B", "bare_letter"),
+        ("Long reasoning.\n\n(C).", "C", "bare_letter"),
+        ("", None, "none"),
+        ("Just narrative without an answer.", None, "none"),
+    ],
+)
+def test_extract_answer_letter(text, expected_letter, expected_strategy):
+    result = extract_answer_letter(text)
+    assert result == AnswerLetterResult(expected_letter, expected_strategy)
diff --git a/surfsense_evals/tests/core/test_parse_citations.py b/surfsense_evals/tests/core/test_parse_citations.py
new file mode 100644
index 0000000000..eb444dab2e
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_citations.py
@@ -0,0 +1,108 @@
+"""Parity tests for the citation regex.
+
+Each row mirrors a case from the canonical TS reference at
+``surfsense_web/lib/citations/citation-parser.ts``. If a future PR
+loosens or tightens the TS regex, these tests will start failing;
+that's the explicit signal to re-port the change.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import (
+    CITATION_REGEX,
+    ChunkCitation,
+    UrlCitation,
+    parse_citations,
+)
+
+PARITY_TABLE = [
+    # (input, expected number of matches, expected first-token kind/value)
+    ("Plain text with no citation.", 0, None),
+    (
+        "The patient has fever [citation:42] and cough.",
+        1,
+        ChunkCitation(chunk_id=42, is_docs_chunk=False),
+    ),
+    (
+        "Negative chunk ids work [citation:-7].",
+        1,
+        ChunkCitation(chunk_id=-7, is_docs_chunk=False),
+    ),
+    (
+        "doc-prefix [citation:doc-12].",
+        1,
+        ChunkCitation(chunk_id=12, is_docs_chunk=True),
+    ),
+    (
+        "Multi id [citation:1, doc-2, -3].",
+        3,
+        ChunkCitation(chunk_id=1, is_docs_chunk=False),
+    ),
+    (
+        "URL form [citation:https://x.com/a].",
+        1,
+        UrlCitation(url="https://x.com/a"),
+    ),
+    (
+        "Chinese brackets【citation:5】.",
+        1,
+        ChunkCitation(chunk_id=5, is_docs_chunk=False),
+    ),
+    (
+        "ZWSP-decorated [\u200bcitation:9\u200b].",
+        1,
+        ChunkCitation(chunk_id=9, is_docs_chunk=False),
+    ),
+    (
+        "Whitespace [citation:  doc-100 ] tolerated.",
+        1,
+        ChunkCitation(chunk_id=100, is_docs_chunk=True),
+    ),
+    (
+        # The TS regex's URL char class excludes ']', so a trailing
+        # bracket isn't swallowed.
+        "Two URLs [citation:https://a.io] and [citation:https://b.io].",
+        2,
+        UrlCitation(url="https://a.io"),
+    ),
+    (
+        # Garbled form should match nothing.
+        "Citation-like but wrong [citation:].",
+        0,
+        None,
+    ),
+]
+
+
+@pytest.mark.parametrize("text,n_expected,first", PARITY_TABLE)
+def test_citation_regex_parity(text: str, n_expected: int, first):
+    tokens = parse_citations(text)
+    assert len(tokens) == n_expected, (text, tokens)
+    if first is not None:
+        assert tokens[0] == first, (text, tokens)
+
+
+def test_regex_pattern_matches_ts_source():
+    """Sanity: the compiled pattern carries the exact alternatives the TS source does."""
+
+    pattern = CITATION_REGEX.pattern
+    assert "https?://" in pattern
+    assert "urlcite" in pattern
+    assert "doc-" in pattern
+    assert "\u200B" in pattern
+    assert "【" in pattern and "】" in pattern
+
+
+def test_url_map_resolution():
+    text = "Inline placeholder [citation:urlcite0]."
+    tokens = parse_citations(text, url_map={"urlcite0": "https://resolved.example/x"})
+    assert tokens == [UrlCitation(url="https://resolved.example/x")]
+
+
+def test_url_map_missing_key_drops_token():
+    """Missing urlcite resolution returns no token (TS behaviour)."""
+
+    text = "[citation:urlcite99]"
+    assert parse_citations(text, url_map={}) == []
diff --git a/surfsense_evals/tests/core/test_parse_freeform_answer.py b/surfsense_evals/tests/core/test_parse_freeform_answer.py
new file mode 100644
index 0000000000..bdc7d74fc3
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_freeform_answer.py
@@ -0,0 +1,73 @@
+"""Tests for ``surfsense_evals.core.parse.freeform_answer``."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse.freeform_answer import extract_freeform_answer
+
+
+class TestExtractFreeformAnswer:
+    def test_empty_string_returns_empty(self) -> None:
+        assert extract_freeform_answer("") == ""
+        assert extract_freeform_answer("   \n\n  ") == ""
+
+    def test_simple_answer_marker(self) -> None:
+        assert extract_freeform_answer("Answer: 42") == "42"
+
+    def test_final_answer_marker(self) -> None:
+        assert extract_freeform_answer("Final answer: Paris") == "Paris"
+
+    def test_the_answer_is_marker(self) -> None:
+        assert extract_freeform_answer("The answer is: not answerable") == "not answerable"
+
+    def test_multiline_picks_last_answer_marker(self) -> None:
+        text = "Let me think...\nAnswer: 5\nAnswer: 7\n"
+        assert extract_freeform_answer(text) == "7"
+
+    def test_falls_back_to_last_nonempty_line(self) -> None:
+        text = "Some thinking here.\n\n42"
+        assert extract_freeform_answer(text) == "42"
+
+    def test_strips_quotes(self) -> None:
+        assert extract_freeform_answer('Answer: "Paris"') == "Paris"
+        assert extract_freeform_answer("Answer: 'Paris'") == "Paris"
+
+    def test_strips_backticks(self) -> None:
+        assert extract_freeform_answer("Answer: `42`") == "42"
+
+    def test_uses_fenced_block_when_no_marker(self) -> None:
+        text = "Here's my response:\n```\nfinal value\n```\n"
+        assert extract_freeform_answer(text) == "final value"
+
+    def test_case_insensitive_markers(self) -> None:
+        assert extract_freeform_answer("ANSWER: yes") == "yes"
+        assert extract_freeform_answer("answer: no") == "no"
+
+    @pytest.mark.parametrize("text,expected", [
+        ("Answer: 1, 2, 3", "1, 2, 3"),
+        ("Answer: 3.14", "3.14"),
+        ("Answer:    spaced   ", "spaced"),
+    ])
+    def test_various_payloads(self, text: str, expected: str) -> None:
+        assert extract_freeform_answer(text) == expected
+
+    def test_inline_answer_after_thinking_trace(self) -> None:
+        # Agent replies sometimes glue their thinking onto the same
+        # line as the final "Answer: ..." marker (no newline before it).
+        # The line-anchored regex misses this; the inline fallback
+        # should still extract the right value.
+        text = (
+            "Need the Charlotte Bronte book title/year and the rank "
+            "for a 128-foot NYC building.Answer: 128th"
+        )
+        assert extract_freeform_answer(text) == "128th"
+
+    def test_inline_picks_last_inline_answer(self) -> None:
+        text = "Thought: maybe Answer: 5 is right? Actually Answer: 7."
+        assert extract_freeform_answer(text) == "7."
+
+    def test_inline_does_not_override_proper_marker(self) -> None:
+        # When a clean line-anchored "Answer: ..." exists, that wins.
+        text = "Some preamble.Answer: 99\nAnswer: 42"
+        assert extract_freeform_answer(text) == "42"
diff --git a/surfsense_evals/tests/core/test_parse_sse.py b/surfsense_evals/tests/core/test_parse_sse.py
new file mode 100644
index 0000000000..3627172889
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_sse.py
@@ -0,0 +1,84 @@
+"""Tests for the SSE consumer."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import iter_sse_events
+
+
+async def _alist(it):
+    out = []
+    async for x in it:
+        out.append(x)
+    return out
+
+
+async def _astream(lines):
+    for line in lines:
+        yield line
+
+
+@pytest.mark.asyncio
+async def test_basic_data_frame():
+    events = await _alist(
+        iter_sse_events(_astream([
+            'data: {"type": "text-delta", "delta": "hi"}',
+            "",
+            'data: {"type": "finish"}',
+            "",
+        ]))
+    )
+    assert [e.data for e in events] == [
+        '{"type": "text-delta", "delta": "hi"}',
+        '{"type": "finish"}',
+    ]
+
+
+@pytest.mark.asyncio
+async def test_done_sentinel_passes_through():
+    events = await _alist(
+        iter_sse_events(_astream([
+            "data: [DONE]",
+            "",
+        ]))
+    )
+    assert [e.data for e in events] == ["[DONE]"]
+
+
+@pytest.mark.asyncio
+async def test_multiline_data_joins_with_newline():
+    events = await _alist(
+        iter_sse_events(_astream([
+            "data: line1",
+            "data: line2",
+            "",
+        ]))
+    )
+    assert events[0].data == "line1\nline2"
+
+
+@pytest.mark.asyncio
+async def test_comments_and_other_fields_ignored():
+    events = await _alist(
+        iter_sse_events(_astream([
+            ": heartbeat",
+            "event: foo",
+            "id: 123",
+            "data: payload",
+            "",
+        ]))
+    )
+    assert [e.data for e in events] == ["payload"]
+
+
+@pytest.mark.asyncio
+async def test_handles_missing_trailing_blank():
+    """Some servers omit the final blank line; the consumer should still emit."""
+
+    events = await _alist(
+        iter_sse_events(_astream([
+            "data: only-one",
+        ]))
+    )
+    assert [e.data for e in events] == ["only-one"]
diff --git a/surfsense_evals/tests/core/test_pdf_render.py b/surfsense_evals/tests/core/test_pdf_render.py
new file mode 100644
index 0000000000..facdabbe82
--- /dev/null
+++ b/surfsense_evals/tests/core/test_pdf_render.py
@@ -0,0 +1,51 @@
+"""Smoke tests for PDF rendering.
+
+We don't pull a full PDF parser into the test deps; the assertions
+are bytes-level (``%PDF`` magic, deterministic CreationDate scrub).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from surfsense_evals.core.pdf import render_pdf, render_text_files_to_pdf
+
+
+def test_render_pdf_writes_pdf_with_magic(tmp_path: Path):
+    out = tmp_path / "out.pdf"
+    rendered = render_pdf(
+        title="Test",
+        sections=[("intro", "Hello world."), ("body", "Line one.\nLine two.")],
+        output_path=out,
+    )
+    assert rendered.path == out
+    assert out.exists()
+    assert out.read_bytes().startswith(b"%PDF-")
+
+
+def test_render_pdf_deterministic_dates(tmp_path: Path):
+    out_a = tmp_path / "a.pdf"
+    out_b = tmp_path / "b.pdf"
+    sections = [("only", "deterministic body content")]
+    render_pdf(title="Det", sections=sections, output_path=out_a)
+    render_pdf(title="Det", sections=sections, output_path=out_b)
+    # CreationDate / ModDate are scrubbed to a fixed value, so the two
+    # files should compare equal (modulo any other internal randomness
+    # — reportlab's basic outputs are deterministic given fixed inputs).
+    assert out_a.read_bytes() == out_b.read_bytes()
+
+
+def test_render_text_files_uses_filename_as_section(tmp_path: Path):
+    files_dir = tmp_path / "src"
+    files_dir.mkdir()
+    (files_dir / "admission_note.txt").write_text("history of present illness", encoding="utf-8")
+    (files_dir / "labs.txt").write_text("Na 138, K 4.0", encoding="utf-8")
+    out = tmp_path / "case.pdf"
+    rendered = render_text_files_to_pdf(
+        title="Case 1",
+        files=[files_dir / "admission_note.txt", files_dir / "labs.txt"],
+        output_path=out,
+    )
+    assert out.exists()
+    # We don't decode the PDF; the n_chars estimate should reflect both inputs.
+    assert rendered.n_chars >= len("history of present illness") + len("Na 138, K 4.0")
diff --git a/surfsense_evals/tests/core/test_pdf_render_with_images.py b/surfsense_evals/tests/core/test_pdf_render_with_images.py
new file mode 100644
index 0000000000..c29503bc90
--- /dev/null
+++ b/surfsense_evals/tests/core/test_pdf_render_with_images.py
@@ -0,0 +1,73 @@
+"""Tests for ``render_pdf_with_images`` — covers image embedding +
+deterministic byte output, mirroring ``test_pdf_render.py`` for the
+text-only path.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.pdf import PdfImage, render_pdf_with_images
+
+
+@pytest.fixture
+def tiny_png(tmp_path: Path) -> Path:
+    """Generate a real 4x4 PNG via Pillow — embeds cleanly in reportlab.
+
+    Hand-crafted PNG headers tend to fail PIL's strict decoder, so we
+    delegate to Pillow which is already a transitive dep of reportlab.
+    """
+
+    from PIL import Image as PILImage
+
+    p = tmp_path / "pixel.png"
+    PILImage.new("RGB", (4, 4), color=(128, 128, 128)).save(p, format="PNG")
+    return p
+
+
+class TestRenderPdfWithImages:
+    def test_renders_pdf_with_no_images(self, tmp_path: Path) -> None:
+        out = tmp_path / "out.pdf"
+        rendered = render_pdf_with_images(
+            title="Test",
+            sections=[("Heading", "Body text here.", None)],
+            output_path=out,
+        )
+        assert rendered.path == out
+        assert out.exists()
+        assert out.read_bytes().startswith(b"%PDF-")
+
+    def test_renders_pdf_with_one_image(self, tmp_path: Path, tiny_png: Path) -> None:
+        out = tmp_path / "out.pdf"
+        render_pdf_with_images(
+            title="Test",
+            sections=[("Case", "Body text.", [PdfImage(path=tiny_png, caption="A pixel")])],
+            output_path=out,
+        )
+        assert out.exists()
+        assert out.stat().st_size > 200  # not empty
+
+    def test_deterministic_bytes(self, tmp_path: Path, tiny_png: Path) -> None:
+        out_a = tmp_path / "a.pdf"
+        out_b = tmp_path / "b.pdf"
+        sections = [
+            ("Case", "Some text.", [PdfImage(path=tiny_png, caption="cap")]),
+            ("Options", "A) one\nB) two", None),
+        ]
+        render_pdf_with_images(title="Test", sections=sections, output_path=out_a)
+        render_pdf_with_images(title="Test", sections=sections, output_path=out_b)
+        assert out_a.read_bytes() == out_b.read_bytes()
+
+    def test_skips_invalid_image_silently(self, tmp_path: Path) -> None:
+        """A bad image path should not abort the whole PDF render."""
+
+        out = tmp_path / "out.pdf"
+        render_pdf_with_images(
+            title="Test",
+            sections=[("Case", "Text", [PdfImage(path=tmp_path / "nope.jpg", caption="x")])],
+            output_path=out,
+        )
+        assert out.exists()
+        assert out.read_bytes().startswith(b"%PDF-")
diff --git a/surfsense_evals/tests/core/test_provider_openrouter.py b/surfsense_evals/tests/core/test_provider_openrouter.py
new file mode 100644
index 0000000000..eb78aa0537
--- /dev/null
+++ b/surfsense_evals/tests/core/test_provider_openrouter.py
@@ -0,0 +1,121 @@
+"""respx-mocked tests for the OpenRouter PDF provider."""
+
+from __future__ import annotations
+
+import base64
+import json
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.providers.openrouter_pdf import (
+    OpenRouterPdfProvider,
+    PdfEngine,
+)
+
+_BASE = "https://openrouter.test"
+
+
+@pytest.fixture
+def tiny_pdf(tmp_path: Path) -> Path:
+    p = tmp_path / "case.pdf"
+    p.write_bytes(b"%PDF-1.4 minimal content")
+    return p
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_payload_shape_matches_openrouter_docs(respx_mock, tiny_pdf: Path):
+    captured = {}
+
+    def _capture(request):
+        captured["body"] = json.loads(request.content)
+        captured["headers"] = dict(request.headers)
+        return httpx.Response(
+            200,
+            json={
+                "choices": [{
+                    "message": {"content": "Answer: B"},
+                    "finish_reason": "stop",
+                }],
+                "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15, "cost": 0.0001},
+            },
+        )
+
+    respx_mock.post("/chat/completions").mock(side_effect=_capture)
+
+    provider = OpenRouterPdfProvider(
+        api_key="sk-or-test",
+        base_url=_BASE,
+        model="anthropic/claude-sonnet-4.5",
+        engine=PdfEngine.NATIVE,
+    )
+    response = await provider.complete(prompt="What is the diagnosis?", pdf_path=tiny_pdf)
+    body = captured["body"]
+    assert body["model"] == "anthropic/claude-sonnet-4.5"
+    assert body["plugins"] == [{"id": "file-parser", "pdf": {"engine": "native"}}]
+    user = body["messages"][-1]
+    assert user["role"] == "user"
+    file_part = user["content"][0]
+    assert file_part["type"] == "file"
+    assert file_part["file"]["filename"] == tiny_pdf.name
+    assert file_part["file"]["file_data"].startswith("data:application/pdf;base64,")
+    assert (
+        base64.b64decode(file_part["file"]["file_data"].split(",", 1)[1])
+        == tiny_pdf.read_bytes()  # noqa: ASYNC240 — test fixture, sync read is fine
+    )
+    assert user["content"][1] == {"type": "text", "text": "What is the diagnosis?"}
+    assert captured["headers"]["authorization"] == "Bearer sk-or-test"
+    assert captured["headers"].get("x-title") == "SurfSense-evals"
+
+    assert response.text == "Answer: B"
+    assert response.input_tokens == 10
+    assert response.output_tokens == 5
+    assert response.total_tokens == 15
+    # cost 0.0001 USD == 100 micros
+    assert response.cost_micros == 100
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_chat_array_content_concatenates(respx_mock, tiny_pdf: Path):
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "choices": [{
+                    "message": {
+                        "content": [
+                            {"type": "text", "text": "Hello "},
+                            {"type": "text", "text": "world"},
+                            {"type": "image_url", "image_url": "ignored"},
+                        ]
+                    }
+                }],
+                "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+            },
+        )
+    )
+    provider = OpenRouterPdfProvider(
+        api_key="sk-or-test", base_url=_BASE, model="x/y"
+    )
+    response = await provider.complete(prompt="hi", pdf_path=tiny_pdf)
+    assert response.text == "Hello world"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_provider_raises_on_4xx(respx_mock, tiny_pdf: Path):
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(429, json={"error": {"message": "rate limited"}})
+    )
+    provider = OpenRouterPdfProvider(api_key="sk-or-test", base_url=_BASE, model="x/y")
+    with pytest.raises(httpx.HTTPStatusError):
+        await provider.complete(prompt="hi", pdf_path=tiny_pdf)
+
+
+def test_missing_api_key_raises():
+    with pytest.raises(ValueError):
+        OpenRouterPdfProvider(api_key="", base_url=_BASE, model="x/y")
diff --git a/surfsense_evals/tests/core/test_registry.py b/surfsense_evals/tests/core/test_registry.py
new file mode 100644
index 0000000000..ffdbf22618
--- /dev/null
+++ b/surfsense_evals/tests/core/test_registry.py
@@ -0,0 +1,58 @@
+"""Registry + auto-discovery tests.
+
+* Auto-discovery skips packages starting with ``_`` (so test fixtures
+  don't leak into the production catalogue).
+* Manually importing a ``_demo`` benchmark fires its ``register(...)``
+  call and the CLI sees it.
+"""
+
+from __future__ import annotations
+
+import importlib
+
+from surfsense_evals.core import registry
+
+
+def _force_register_demo() -> None:
+    """Import (or reload) the demo module so its ``register(...)`` runs.
+
+    On a fresh interpreter, ``import_module`` triggers package
+    initialization. After the first call though, the module is cached
+    in ``sys.modules`` and a second ``import_module`` is a no-op — so
+    if a previous test already unregistered the entry, we have to
+    ``reload`` to re-execute the module body.
+    """
+
+    module = importlib.import_module("surfsense_evals.suites._demo.hello")
+    if ("_demo", "hello") not in registry.snapshot():
+        importlib.reload(module)
+
+
+def test_auto_discovery_skips_underscore_prefixed_subpackages():
+    from surfsense_evals.suites import discover_suites
+
+    discovered = discover_suites()
+    assert all(not part.startswith("_") for full in discovered for part in full.split("."))
+    # The medical suite's headline benchmark must always discover.
+    assert any(name.endswith(".medical.medxpertqa") for name in discovered)
+
+
+def test_demo_benchmark_registers_on_explicit_import():
+    _force_register_demo()
+    bench = registry.get("_demo", "hello")
+    assert bench is not None
+    assert bench.name == "hello"
+    assert bench.headline is False
+    # Cleanup so the test is idempotent under repeated runs.
+    registry.unregister("_demo", "hello")
+
+
+def test_register_unregister_roundtrip():
+    # Make sure no stale entry from a prior test in the session.
+    if ("_demo", "hello") in registry.snapshot():
+        registry.unregister("_demo", "hello")
+    snapshot_before = dict(registry.snapshot())
+    _force_register_demo()
+    assert ("_demo", "hello") in registry.snapshot()
+    registry.unregister("_demo", "hello")
+    assert dict(registry.snapshot()) == snapshot_before
diff --git a/surfsense_evals/tests/core/test_scenarios.py b/surfsense_evals/tests/core/test_scenarios.py
new file mode 100644
index 0000000000..5e93c266bf
--- /dev/null
+++ b/surfsense_evals/tests/core/test_scenarios.py
@@ -0,0 +1,68 @@
+"""Tests for the shared scenario formatter used in head-to-head reports."""
+
+from __future__ import annotations
+
+from surfsense_evals.core.scenarios import format_scenario_md
+
+
+def test_head_to_head_renders_both_arms_same_slug():
+    extra = {
+        "scenario": "head-to-head",
+        "provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "head-to-head" in line
+    assert "anthropic/claude-sonnet-4.5" in line
+
+
+def test_head_to_head_includes_vision_slug_when_recorded():
+    extra = {
+        "scenario": "head-to-head",
+        "provider_model": "anthropic/claude-sonnet-4.5",
+        "vision_provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "ingest VLM" in line
+    assert "claude-sonnet-4.5" in line
+
+
+def test_symmetric_cheap_calls_out_native_arm_disadvantage():
+    extra = {
+        "scenario": "symmetric-cheap",
+        "provider_model": "openai/gpt-5.4-mini",
+        "vision_provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "**symmetric-cheap**" in line
+    assert "gpt-5.4-mini" in line
+    # The "structurally loses" disclaimer must be there so reviewers
+    # don't read this as a fair comparison.
+    assert "structurally loses" in line.lower() or "structurally_loses" in line.lower()
+
+
+def test_cost_arbitrage_distinguishes_native_and_surfsense_slugs():
+    extra = {
+        "scenario": "cost-arbitrage",
+        "provider_model": "openai/gpt-5.4-mini",
+        "native_arm_model": "anthropic/claude-sonnet-4.5",
+        "vision_provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "**cost-arbitrage**" in line
+    # Both slugs surface; reader can see the asymmetry at a glance.
+    assert "anthropic/claude-sonnet-4.5" in line
+    assert "openai/gpt-5.4-mini" in line
+    assert "fraction of the per-query cost" in line
+
+
+def test_legacy_artifact_without_scenario_renders_as_head_to_head():
+    """Old run_artifact.json files don't have ``scenario`` — must still render."""
+
+    extra = {"provider_model": "anthropic/claude-sonnet-4.5"}
+    line = format_scenario_md(extra)
+    assert "head-to-head" in line
+
+
+def test_none_extra_does_not_crash():
+    line = format_scenario_md(None)
+    assert "head-to-head" in line
diff --git a/surfsense_evals/tests/core/test_vision_llm.py b/surfsense_evals/tests/core/test_vision_llm.py
new file mode 100644
index 0000000000..5c3dfd7191
--- /dev/null
+++ b/surfsense_evals/tests/core/test_vision_llm.py
@@ -0,0 +1,121 @@
+"""Tests for vision LLM auto-pick + explicit-slug resolution."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.clients.search_space import VisionLlmConfigEntry
+from surfsense_evals.core.vision_llm import (
+    RECOMMENDED_VISION_PRIORITY,
+    VisionConfigError,
+    resolve_vision_llm,
+)
+
+
+def _entry(*, id: int, model_name: str, provider: str = "OPENROUTER") -> VisionLlmConfigEntry:
+    return VisionLlmConfigEntry(
+        id=id,
+        name=f"OpenRouter • {model_name}",
+        provider=provider,
+        model_name=model_name,
+        is_auto_mode=False,
+        raw={},
+    )
+
+
+# ---------------------------------------------------------------------------
+# Explicit slug resolution
+# ---------------------------------------------------------------------------
+
+
+def test_explicit_slug_resolves_to_matching_config_id():
+    candidates = [
+        _entry(id=-101, model_name="anthropic/claude-sonnet-4.5"),
+        _entry(id=-102, model_name="openai/gpt-5"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug="openai/gpt-5")
+    assert resolved.config_id == -102
+    assert resolved.provider_model == "openai/gpt-5"
+    assert resolved.selected_via == "explicit"
+
+
+def test_explicit_slug_with_no_match_raises_with_helpful_listing():
+    candidates = [_entry(id=-101, model_name="anthropic/claude-sonnet-4.5")]
+    with pytest.raises(VisionConfigError) as exc_info:
+        resolve_vision_llm(candidates, explicit_slug="some/missing-slug")
+    msg = str(exc_info.value)
+    assert "some/missing-slug" in msg
+    assert "anthropic/claude-sonnet-4.5" in msg  # surfaced as a sample
+
+
+def test_explicit_slug_skips_non_openrouter_entries():
+    """A YAML BYOK entry with a colliding model_name shouldn't accidentally match."""
+
+    candidates = [
+        _entry(id=42, model_name="openai/gpt-5", provider="OPENAI"),
+        _entry(id=-101, model_name="openai/gpt-5"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug="openai/gpt-5")
+    assert resolved.config_id == -101  # the OpenRouter one, not the BYOK one
+
+
+# ---------------------------------------------------------------------------
+# Auto-pick by recommended priority
+# ---------------------------------------------------------------------------
+
+
+def test_auto_pick_walks_priority_list_in_order():
+    candidates = [
+        _entry(id=-300, model_name="google/gemini-2.5-pro"),
+        _entry(id=-200, model_name="anthropic/claude-opus-4.7"),
+        _entry(id=-100, model_name="anthropic/claude-sonnet-4.5"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug=None)
+    # claude-sonnet-4.5 is first in the priority tuple, so it wins.
+    assert resolved.config_id == -100
+    assert resolved.provider_model == "anthropic/claude-sonnet-4.5"
+    assert resolved.selected_via == "auto-priority"
+
+
+def test_auto_pick_skips_to_next_priority_when_first_unavailable():
+    candidates = [
+        _entry(id=-200, model_name="anthropic/claude-opus-4.7"),
+        _entry(id=-300, model_name="google/gemini-2.5-pro"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug=None)
+    # claude-sonnet-4.5 not registered → claude-opus-4.7 is next in priority.
+    assert resolved.provider_model == "anthropic/claude-opus-4.7"
+    assert resolved.selected_via == "auto-priority"
+
+
+def test_auto_pick_falls_back_to_first_openrouter_when_no_recommended_match():
+    candidates = [
+        _entry(id=-700, model_name="some/exotic-vision-model"),
+        _entry(id=-800, model_name="another/exotic-vision-model"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug=None)
+    # Neither matches the priority list → first OpenRouter entry wins.
+    assert resolved.config_id == -700
+    assert resolved.selected_via == "auto-fallback"
+
+
+def test_auto_pick_with_zero_openrouter_candidates_raises():
+    candidates: list[VisionLlmConfigEntry] = []
+    with pytest.raises(VisionConfigError) as exc_info:
+        resolve_vision_llm(candidates, explicit_slug=None)
+    assert "vision_enabled: true" in str(exc_info.value)
+
+
+def test_auto_pick_ignores_non_openrouter_entries():
+    candidates = [
+        _entry(id=99, model_name="anthropic/claude-sonnet-4.5", provider="ANTHROPIC"),
+    ]
+    with pytest.raises(VisionConfigError):
+        resolve_vision_llm(candidates, explicit_slug=None)
+
+
+def test_recommended_priority_is_a_stable_public_list():
+    """If you reorder this, update the README's auto-pick claim too."""
+
+    assert RECOMMENDED_VISION_PRIORITY[0] == "anthropic/claude-sonnet-4.5"
+    assert "google/gemini-2.5-pro" in RECOMMENDED_VISION_PRIORITY
diff --git a/surfsense_evals/tests/suites/__init__.py b/surfsense_evals/tests/suites/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/surfsense_evals/tests/suites/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_evals/tests/suites/test_crag_dataset.py b/surfsense_evals/tests/suites/test_crag_dataset.py
new file mode 100644
index 0000000000..36114b52e1
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_crag_dataset.py
@@ -0,0 +1,224 @@
+"""Tests for the CRAG dataset loader (parser + sampling).
+
+The full bz2 download is excluded — these tests synthesise a tiny
+JSONL-bz2 in a tmp dir and verify the parser / stratified-sampler
+produce well-shaped objects.
+"""
+
+from __future__ import annotations
+
+import bz2
+import json
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.crag.dataset import (
+    CragPage,
+    CragQuestion,
+    iter_questions,
+    stratified_sample,
+)
+
+
+def _make_jsonl_bz2(rows: list[dict], tmp_path: Path) -> Path:
+    """Write ``rows`` as one JSON object per line, bz2-compressed."""
+
+    dest = tmp_path / "fake.jsonl.bz2"
+    payload = "\n".join(json.dumps(r) for r in rows).encode("utf-8")
+    with bz2.open(dest, "wb") as fh:
+        fh.write(payload)
+    return dest
+
+
+def _row(
+    *,
+    interaction_id: str,
+    query: str,
+    answer: str,
+    domain: str = "movie",
+    question_type: str = "simple",
+    pages: list[dict] | None = None,
+    alt_ans: list[str] | None = None,
+    popularity: str = "head",
+    static_or_dynamic: str = "static",
+    split: int = 0,
+    query_time: str = "2024-04-01",
+) -> dict:
+    return {
+        "interaction_id": interaction_id,
+        "query_time": query_time,
+        "domain": domain,
+        "question_type": question_type,
+        "static_or_dynamic": static_or_dynamic,
+        "query": query,
+        "answer": answer,
+        "alt_ans": alt_ans or [],
+        "split": split,
+        "popularity": popularity,
+        "search_results": pages or [],
+    }
+
+
+class TestParser:
+    def test_basic_parse(self, tmp_path: Path) -> None:
+        rows = [
+            _row(
+                interaction_id="abc",
+                query="Who directed Inception?",
+                answer="Christopher Nolan",
+                pages=[{
+                    "page_name": "Inception (film)",
+                    "page_url": "https://en.wikipedia.org/wiki/Inception",
+                    "page_snippet": "snippet",
+                    "page_result": "<html>full html</html>",
+                    "page_last_modified": "2024-01-01",
+                }],
+            ),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert len(parsed) == 1
+        q = parsed[0]
+        assert q.query == "Who directed Inception?"
+        assert q.gold_answer == "Christopher Nolan"
+        assert q.qid == "C00000"
+        assert q.domain == "movie"
+        assert q.question_type == "simple"
+        assert len(q.pages) == 1
+        page = q.pages[0]
+        assert page.page_name == "Inception (film)"
+        assert page.page_url == "https://en.wikipedia.org/wiki/Inception"
+
+    def test_skips_missing_query_or_answer(self, tmp_path: Path) -> None:
+        rows = [
+            _row(interaction_id="1", query="", answer="x"),
+            _row(interaction_id="2", query="ok?", answer=""),
+            _row(interaction_id="3", query="ok?", answer="x"),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert len(parsed) == 1
+        assert parsed[0].interaction_id == "3"
+
+    def test_skips_empty_pages(self, tmp_path: Path) -> None:
+        rows = [
+            _row(
+                interaction_id="x",
+                query="q?",
+                answer="a",
+                pages=[
+                    {"page_url": "", "page_result": "<html/>"},  # no URL
+                    {"page_url": "https://x.test/", "page_result": ""},  # empty html
+                    {"page_url": "https://y.test/", "page_result": "<html>good</html>"},
+                ],
+            ),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert len(parsed) == 1
+        assert len(parsed[0].pages) == 1
+        assert parsed[0].pages[0].page_url == "https://y.test/"
+
+    def test_alt_answers_parsed(self, tmp_path: Path) -> None:
+        rows = [
+            _row(interaction_id="z", query="q?", answer="42",
+                 alt_ans=["forty-two", "42.0"]),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert parsed[0].alt_answers == ["forty-two", "42.0"]
+
+    def test_handles_malformed_line(self, tmp_path: Path) -> None:
+        # Manually construct a bz2 with one valid line and one garbage line.
+        good = json.dumps(_row(interaction_id="ok", query="q?", answer="a"))
+        path = tmp_path / "mixed.jsonl.bz2"
+        with bz2.open(path, "wb") as fh:
+            fh.write(b"not-json{\n")
+            fh.write((good + "\n").encode("utf-8"))
+        parsed = iter_questions(path)
+        # Malformed line is skipped; one good row survives at index 1.
+        assert len(parsed) == 1
+        assert parsed[0].interaction_id == "ok"
+
+
+class TestPageHash:
+    def test_url_hash_stable(self) -> None:
+        a = CragPage(
+            page_name="A", page_url="https://x.test/p?q=1",
+            page_snippet="", page_html="<html/>",
+        )
+        b = CragPage(
+            page_name="B", page_url="https://x.test/p?q=1",
+            page_snippet="", page_html="<html/>",
+        )
+        assert a.url_hash == b.url_hash
+        assert len(a.url_hash) == 12
+
+    def test_url_hash_unique(self) -> None:
+        a = CragPage(
+            page_name="A", page_url="https://x.test/a", page_snippet="", page_html="<html/>",
+        )
+        b = CragPage(
+            page_name="B", page_url="https://x.test/b", page_snippet="", page_html="<html/>",
+        )
+        assert a.url_hash != b.url_hash
+
+
+class TestStratifiedSample:
+    def _make_pool(self) -> list[CragQuestion]:
+        out: list[CragQuestion] = []
+        idx = 0
+        # 30 finance/simple, 20 movie/comparison, 5 sports/multi-hop.
+        for n, domain, qtype in (
+            (30, "finance", "simple"),
+            (20, "movie", "comparison"),
+            (5, "sports", "multi-hop"),
+        ):
+            for _ in range(n):
+                out.append(CragQuestion(
+                    qid=f"C{idx:05d}",
+                    interaction_id=f"i{idx}",
+                    query_time="2024-01-01",
+                    query=f"q{idx}?",
+                    gold_answer="a",
+                    alt_answers=[],
+                    domain=domain,
+                    question_type=qtype,
+                    static_or_dynamic="static",
+                    popularity="head",
+                    split=0,
+                    raw_index=idx,
+                    pages=[],
+                ))
+                idx += 1
+        return out
+
+    def test_sample_smaller_than_pool(self) -> None:
+        pool = self._make_pool()
+        sample = stratified_sample(pool, n=15, seed=7)
+        assert len(sample) == 15
+        # Should pull from all three buckets at least once.
+        domains = {q.domain for q in sample}
+        assert domains == {"finance", "movie", "sports"}
+
+    def test_sample_returns_pool_when_n_geq(self) -> None:
+        pool = self._make_pool()
+        sample = stratified_sample(pool, n=999, seed=1)
+        assert len(sample) == len(pool)
+
+    def test_sample_sorted_by_raw_index(self) -> None:
+        pool = self._make_pool()
+        sample = stratified_sample(pool, n=10, seed=42)
+        assert [q.raw_index for q in sample] == sorted(q.raw_index for q in sample)
+
+    def test_sample_deterministic(self) -> None:
+        pool = self._make_pool()
+        s1 = stratified_sample(pool, n=20, seed=11)
+        s2 = stratified_sample(pool, n=20, seed=11)
+        assert [q.qid for q in s1] == [q.qid for q in s2]
+
+    def test_n_zero_or_negative_returns_pool(self) -> None:
+        pool = self._make_pool()
+        assert len(stratified_sample(pool, n=0)) == len(pool)
+        assert len(stratified_sample(pool, n=-1)) == len(pool)
diff --git a/surfsense_evals/tests/suites/test_crag_grader.py b/surfsense_evals/tests/suites/test_crag_grader.py
new file mode 100644
index 0000000000..93bf6f4783
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_crag_grader.py
@@ -0,0 +1,248 @@
+"""Tests for the CRAG 3-class deterministic grader.
+
+The LLM-judge fallback is excluded here (network call); these tests
+exercise the deterministic shortcut + the special-case routing for
+``false_premise`` questions and refusal detection (``I don't know``).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.crag.grader import (
+    CragGradeResult,
+    _flags_false_premise,
+    _is_refusal,
+    _maybe_number,
+    _normalise,
+    _whole_word_substring,
+    grade_deterministic,
+)
+
+
+class TestNormalisation:
+    def test_lowercase_and_punct_stripped(self) -> None:
+        assert _normalise("Apple Inc.") == "apple inc"
+
+    def test_articles_removed(self) -> None:
+        assert _normalise("The Apple Watch") == "apple watch"
+
+    def test_empty_returns_empty(self) -> None:
+        assert _normalise("") == ""
+
+
+class TestNumericExtraction:
+    def test_simple_int(self) -> None:
+        assert _maybe_number("42") == 42.0
+
+    def test_int_with_commas(self) -> None:
+        assert _maybe_number("$1,234") == 1234.0
+
+    def test_year_in_sentence(self) -> None:
+        assert _maybe_number("released in 2008") == 2008.0
+
+    def test_word_number(self) -> None:
+        assert _maybe_number("seven") == 7.0
+
+
+class TestWholeWordSubstring:
+    def test_phrase_match(self) -> None:
+        assert _whole_word_substring("the new york yankees", "new york")
+
+    def test_word_boundary_required(self) -> None:
+        assert not _whole_word_substring("yorkshire", "york")
+
+
+class TestRefusalDetection:
+    def test_explicit_idk(self) -> None:
+        assert _is_refusal("Answer: I don't know")
+
+    def test_idk_no_apostrophe(self) -> None:
+        assert _is_refusal("I dont know")
+
+    def test_no_information(self) -> None:
+        assert _is_refusal("There is no information available about this.")
+
+    def test_unable_to_answer(self) -> None:
+        assert _is_refusal("I am unable to answer this question.")
+
+    def test_empty_is_refusal(self) -> None:
+        assert _is_refusal("")
+        assert _is_refusal("   ")
+
+    def test_real_answer_is_not_refusal(self) -> None:
+        assert not _is_refusal("Answer: Apple Inc")
+        assert not _is_refusal("The CEO is Tim Cook.")
+
+
+class TestFalsePremiseDetection:
+    def test_explicit_false_premise(self) -> None:
+        assert _flags_false_premise(
+            "The question contains a false premise; the company never had that product."
+        )
+
+    def test_no_such(self) -> None:
+        assert _flags_false_premise("There is no such album.")
+
+    def test_did_not_happen(self) -> None:
+        assert _flags_false_premise("That event did not happen.")
+
+    def test_does_not_exist(self) -> None:
+        assert _flags_false_premise("That movie does not exist.")
+
+    def test_normal_answer_is_not_premise_flag(self) -> None:
+        assert not _flags_false_premise("Apple, headquartered in Cupertino.")
+
+
+class TestGradeDeterministicHappyPath:
+    def test_exact_match_correct(self) -> None:
+        result = grade_deterministic(pred="Tim Cook", gold="Tim Cook", question_type="simple")
+        assert result.grade == "correct"
+        assert result.score == 1
+        assert result.method == "exact"
+
+    def test_substring_match(self) -> None:
+        result = grade_deterministic(
+            pred="The answer is Tim Cook, CEO of Apple.",
+            gold="Tim Cook",
+            question_type="simple",
+        )
+        assert result.grade == "correct"
+        assert result.method == "substring"
+
+    def test_alt_answer_match(self) -> None:
+        result = grade_deterministic(
+            pred="2,008",
+            gold="two thousand eight",
+            alt_answers=["2008"],
+            question_type="simple",
+        )
+        assert result.grade == "correct"
+        assert result.score == 1
+
+    def test_numeric_within_tolerance(self) -> None:
+        result = grade_deterministic(
+            pred="The revenue was $1,234,000 USD",
+            gold="$1,234,123",
+            question_type="aggregation",
+        )
+        assert result.grade == "correct"
+        assert result.method == "numeric"
+
+    def test_numeric_outside_tolerance(self) -> None:
+        result = grade_deterministic(
+            pred="100",
+            gold="500",
+            question_type="aggregation",
+        )
+        assert result.grade == "incorrect"
+        assert result.score == -1
+
+    def test_numeric_strict_small_currency(self) -> None:
+        # CRAG (unlike FRAMES) does not apply a 0.5 absolute floor —
+        # ``$2.05`` should NOT match ``$2.17`` (≈5.5% off, well over 1%).
+        result = grade_deterministic(
+            pred="$2.05",
+            gold="$2.17",
+            question_type="simple",
+        )
+        # Falls through to lexical_miss (no substring overlap either).
+        assert result.grade == "incorrect"
+        assert result.method == "lexical_miss"
+
+
+class TestGradeDeterministicRefusal:
+    def test_idk_maps_to_missing(self) -> None:
+        result = grade_deterministic(
+            pred="I don't know.", gold="Tim Cook", question_type="simple",
+        )
+        assert result.grade == "missing"
+        assert result.score == 0
+        assert result.method == "refusal"
+
+    def test_empty_pred_maps_to_missing(self) -> None:
+        result = grade_deterministic(pred="", gold="Tim Cook", question_type="simple")
+        assert result.grade == "missing"
+
+    def test_no_information_maps_to_missing(self) -> None:
+        result = grade_deterministic(
+            pred="There is not enough information to answer.",
+            gold="42",
+            question_type="simple",
+        )
+        assert result.grade == "missing"
+
+
+class TestGradeDeterministicFalsePremise:
+    def test_flagging_premise_is_correct(self) -> None:
+        result = grade_deterministic(
+            pred="The question contains a false premise; that movie does not exist.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert result.grade == "correct"
+        assert result.method == "false_premise_flagged"
+
+    def test_committing_to_false_answer_is_unclear(self) -> None:
+        # Should land in false_premise_unclear → judge fallback territory.
+        result = grade_deterministic(
+            pred="The album was released in 2010.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert result.grade == "incorrect"
+        assert result.method == "false_premise_unclear"
+
+    def test_idk_on_false_premise_is_missing(self) -> None:
+        # Refusal precedes false-premise routing.
+        result = grade_deterministic(
+            pred="I don't know.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert result.grade == "missing"
+
+
+class TestGradeDeterministicLexicalMiss:
+    def test_unknown_paraphrase_routes_to_judge(self) -> None:
+        result = grade_deterministic(
+            pred="It is the technology giant in Cupertino.",
+            gold="Apple Inc",
+            question_type="simple",
+        )
+        # Without a judge, we fall through to lexical_miss → incorrect.
+        assert result.grade == "incorrect"
+        assert result.method == "lexical_miss"
+
+    def test_short_pred_no_substring_credit(self) -> None:
+        # Reverse-substring path requires len >= 3 to credit.
+        result = grade_deterministic(
+            pred="JK",
+            gold="JK Rowling",
+            question_type="simple",
+        )
+        assert result.grade == "incorrect"
+
+
+class TestGradeResultShape:
+    def test_to_dict_round_trip(self) -> None:
+        result = CragGradeResult(
+            grade="correct", score=1, method="exact",
+            normalised_pred="x", normalised_gold="x",
+        )
+        d = result.to_dict()
+        assert d["grade"] == "correct"
+        assert d["score"] == 1
+        assert d["method"] == "exact"
+
+    def test_score_matches_grade(self) -> None:
+        # Construct via grader so the score field is populated correctly.
+        for gold, pred, want_grade in (
+            ("hi", "hi", "correct"),
+            ("hi", "I don't know", "missing"),
+            ("hi", "bye", "incorrect"),
+        ):
+            result = grade_deterministic(pred=pred, gold=gold, question_type="simple")
+            assert result.grade == want_grade
+            expected_score = {"correct": 1, "missing": 0, "incorrect": -1}[want_grade]
+            assert result.score == expected_score
diff --git a/surfsense_evals/tests/suites/test_crag_html_extract.py b/surfsense_evals/tests/suites/test_crag_html_extract.py
new file mode 100644
index 0000000000..a2b47aa452
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_crag_html_extract.py
@@ -0,0 +1,149 @@
+"""Tests for the CRAG HTML extractor.
+
+We don't network-fetch trafilatura; we just verify the wrapper:
+
+* Strips obvious boilerplate (nav/footer/scripts) out of the result.
+* Falls back to the stdlib stripper on degenerate input.
+* Caps output at the configured ceiling.
+* Always prepends a metadata header (``# title``) when content is
+  produced.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.crag.html_extract import (
+    extract_main_content,
+)
+
+
+_RICH_HTML = """\
+<!DOCTYPE html>
+<html>
+<head><title>Apple Q3 Earnings</title>
+<script>const a=1;</script>
+<style>body{font-family:sans;}</style>
+</head>
+<body>
+<nav><a href="/home">Home</a><a href="/about">About</a></nav>
+<header><h1>Tech News Site</h1><p>Subscribe to our newsletter</p></header>
+<main>
+<article>
+  <h1>Apple posts $90B revenue in Q3 2024</h1>
+  <p>Apple Inc. announced its Q3 2024 financial results today, reporting
+  $90 billion in revenue, beating analyst expectations of $87 billion.</p>
+  <p>The company saw growth across iPhone, services, and wearables.
+  CEO Tim Cook attributed the performance to strong demand in emerging
+  markets, particularly India.</p>
+  <h2>Segment breakdown</h2>
+  <ul>
+    <li>iPhone: $45B</li>
+    <li>Services: $24B</li>
+    <li>Mac: $7B</li>
+  </ul>
+</article>
+</main>
+<footer><p>Copyright 2024 Tech News Site. All rights reserved.</p></footer>
+</body></html>
+"""
+
+
+class TestExtractMainContent:
+    def test_extracts_main_article(self) -> None:
+        result = extract_main_content(
+            _RICH_HTML,
+            url="https://example.com/apple",
+            page_name="Apple Q3 Earnings",
+        )
+        assert result.ok
+        assert "Apple" in result.text
+        assert "Q3 2024" in result.text
+        # Header line is prepended.
+        assert result.text.startswith("# Apple Q3 Earnings")
+        assert "Source: https://example.com/apple" in result.text
+
+    def test_strips_boilerplate(self) -> None:
+        result = extract_main_content(
+            _RICH_HTML,
+            url="https://example.com/apple",
+            page_name="Apple Q3 Earnings",
+        )
+        assert result.ok
+        # Boilerplate strings should NOT make it through.
+        assert "Subscribe to our newsletter" not in result.text
+        assert "Copyright 2024 Tech News Site" not in result.text
+        assert "const a=1" not in result.text  # script content
+
+    def test_includes_last_modified_when_provided(self) -> None:
+        result = extract_main_content(
+            _RICH_HTML,
+            url="https://example.com/apple",
+            page_name="Apple Q3 Earnings",
+            last_modified="2024-08-01",
+        )
+        assert "Last modified: 2024-08-01" in result.text
+
+    def test_empty_html_returns_empty_result(self) -> None:
+        result = extract_main_content("", url="https://x.test/")
+        assert not result.ok
+        assert result.method == "empty"
+        assert result.n_chars == 0
+
+    def test_whitespace_only_html_is_empty(self) -> None:
+        result = extract_main_content("   \n   ", url="https://x.test/")
+        assert not result.ok
+
+    def test_garbage_html_falls_back(self) -> None:
+        # Trafilatura should reject this, fallback strip should still yield text.
+        result = extract_main_content(
+            "<<weird>>not a tag>>>The brown fox<<jumped<<",
+            url="https://x.test/garbage",
+            page_name="Garbage",
+        )
+        # Either trafilatura recovers something or fallback_strip does.
+        if result.ok:
+            assert "brown fox" in result.text or "jumped" in result.text
+
+
+class TestFallbackStripper:
+    def test_extract_when_no_clear_main(self) -> None:
+        html = """
+        <html><body>
+        <p>This is content one.</p>
+        <p>This is content two.</p>
+        </body></html>
+        """
+        result = extract_main_content(
+            html, url="https://x.test/", page_name="Title",
+        )
+        assert result.ok
+        assert "content one" in result.text
+        assert "content two" in result.text
+
+    def test_html_entities_decoded(self) -> None:
+        html = """<html><body>
+        <article>
+        <p>Tom &amp; Jerry &mdash; classic cartoon &copy; 1940.</p>
+        <p>It's a story about a cat &lt;Tom&gt; and a mouse &lt;Jerry&gt;.</p>
+        </article>
+        </body></html>"""
+        result = extract_main_content(html, url="https://x.test/")
+        assert result.ok
+        # & should be decoded
+        assert "&amp;" not in result.text
+        assert "Tom" in result.text and "Jerry" in result.text
+
+
+class TestOutputCapping:
+    def test_long_output_is_truncated(self) -> None:
+        # Generate enough content to exceed 200k cap.
+        body = "<p>" + ("hello world " * 50_000) + "</p>"
+        html = f"<html><body><article>{body}</article></body></html>"
+        result = extract_main_content(html, url="https://x.test/", page_name="long")
+        assert result.ok
+        # The body text itself + the metadata header. Truncation marker
+        # appears either at the body limit or before EOF.
+        if "[...truncated...]" in result.text:
+            # The truncation kicked in.
+            assert len(result.text) <= 250_000  # header + 200k cap + slack
diff --git a/surfsense_evals/tests/suites/test_frames_dataset.py b/surfsense_evals/tests/suites/test_frames_dataset.py
new file mode 100644
index 0000000000..e79e7db89b
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_frames_dataset.py
@@ -0,0 +1,154 @@
+"""Tests for the FRAMES dataset parser.
+
+Network-free: we round-trip a tiny fixture TSV through pandas and
+``load_questions`` to confirm:
+
+* row indices become zero-padded ``Q###`` ids,
+* ``wiki_links`` (Python list literal) is materialised correctly,
+* ``reasoning_types`` is split on the pipe separator,
+* missing Prompt/Answer rows are dropped, and
+* the legacy ``wikipedia_link_*`` per-cell fallback works when
+  ``wiki_links`` is missing/empty.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.frames.dataset import (
+    FramesQuestion,
+    _parse_reasoning_types,
+    _parse_wiki_links,
+    load_questions,
+)
+
+
+# ---------------------------------------------------------------------------
+# Pure-function tests
+# ---------------------------------------------------------------------------
+
+
+class TestParseWikiLinks:
+    def test_python_list_literal(self) -> None:
+        s = "['https://en.wikipedia.org/wiki/A', 'https://en.wikipedia.org/wiki/B']"
+        assert _parse_wiki_links(s) == [
+            "https://en.wikipedia.org/wiki/A",
+            "https://en.wikipedia.org/wiki/B",
+        ]
+
+    def test_none_or_empty(self) -> None:
+        assert _parse_wiki_links(None) == []
+        assert _parse_wiki_links("") == []
+        assert _parse_wiki_links("[]") == []
+
+    def test_unquoted_csv_fallback(self) -> None:
+        # Defensive: non-Python-list strings still split on commas.
+        s = "https://a, https://b"
+        assert _parse_wiki_links(s) == ["https://a", "https://b"]
+
+    def test_already_a_list(self) -> None:
+        assert _parse_wiki_links(["x", "y"]) == ["x", "y"]
+
+
+class TestParseReasoningTypes:
+    def test_pipe_separated(self) -> None:
+        assert _parse_reasoning_types("Numerical reasoning | Multiple constraints") == [
+            "Numerical reasoning",
+            "Multiple constraints",
+        ]
+
+    def test_single_tag(self) -> None:
+        assert _parse_reasoning_types("Tabular reasoning") == ["Tabular reasoning"]
+
+    def test_empty(self) -> None:
+        assert _parse_reasoning_types(None) == []
+        assert _parse_reasoning_types("") == []
+
+
+# ---------------------------------------------------------------------------
+# Round-trip via pandas
+# ---------------------------------------------------------------------------
+
+
+def _write_tsv(path: Path, body: str) -> None:
+    """Helper that writes a tab-separated fixture exactly as the user typed it."""
+
+    path.write_text(textwrap.dedent(body), encoding="utf-8")
+
+
+def test_load_questions_basic(tmp_path: Path) -> None:
+    tsv = tmp_path / "test.tsv"
+    rows = [
+        # Header (first column is unnamed → pandas treats as index)
+        "\tPrompt\tAnswer\twikipedia_link_1\twikipedia_link_2\treasoning_types\twiki_links",
+        # Row 0
+        "0\tWho was the 15th president?\tJames Buchanan\t"
+        "https://en.wikipedia.org/wiki/James_Buchanan\t\t"
+        "Multiple constraints\t"
+        "['https://en.wikipedia.org/wiki/James_Buchanan']",
+        # Row 1
+        "1\tHow many years between A and B?\t87\t"
+        "https://en.wikipedia.org/wiki/A\thttps://en.wikipedia.org/wiki/B\t"
+        "Numerical reasoning | Temporal reasoning\t"
+        "['https://en.wikipedia.org/wiki/A', 'https://en.wikipedia.org/wiki/B']",
+        # Row 2 (intentionally missing Prompt — should be dropped)
+        "2\t\tunused\t\t\t\t",
+    ]
+    tsv.write_text("\n".join(rows) + "\n", encoding="utf-8")
+
+    questions = load_questions(tsv)
+    assert len(questions) == 2
+
+    q0, q1 = questions
+    assert isinstance(q0, FramesQuestion)
+    assert q0.qid == "Q000"
+    assert q0.raw_index == 0
+    assert q0.gold_answer == "James Buchanan"
+    assert q0.wiki_urls == ["https://en.wikipedia.org/wiki/James_Buchanan"]
+    assert q0.reasoning_types == ["Multiple constraints"]
+
+    assert q1.qid == "Q001"
+    assert q1.gold_answer == "87"
+    assert q1.wiki_urls == [
+        "https://en.wikipedia.org/wiki/A",
+        "https://en.wikipedia.org/wiki/B",
+    ]
+    assert q1.reasoning_types == ["Numerical reasoning", "Temporal reasoning"]
+
+
+def test_load_questions_falls_back_to_per_cell_links(tmp_path: Path) -> None:
+    """When ``wiki_links`` is empty, the loader should glue the
+    ``wikipedia_link_*`` cells back together."""
+
+    tsv = tmp_path / "test.tsv"
+    rows = [
+        "\tPrompt\tAnswer\twikipedia_link_1\twikipedia_link_2\treasoning_types\twiki_links",
+        "0\tQ?\tA\t"
+        "https://en.wikipedia.org/wiki/Cell1\thttps://en.wikipedia.org/wiki/Cell2\t"
+        "Numerical reasoning\t",
+    ]
+    tsv.write_text("\n".join(rows) + "\n", encoding="utf-8")
+    questions = load_questions(tsv)
+    assert len(questions) == 1
+    assert questions[0].wiki_urls == [
+        "https://en.wikipedia.org/wiki/Cell1",
+        "https://en.wikipedia.org/wiki/Cell2",
+    ]
+
+
+def test_load_questions_to_dict_round_trip(tmp_path: Path) -> None:
+    tsv = tmp_path / "test.tsv"
+    rows = [
+        "\tPrompt\tAnswer\treasoning_types\twiki_links",
+        "0\tQ?\tParis\tTemporal reasoning\t['https://en.wikipedia.org/wiki/Paris']",
+    ]
+    tsv.write_text("\n".join(rows) + "\n", encoding="utf-8")
+
+    [q] = load_questions(tsv)
+    d = q.to_dict()
+    assert d["qid"] == "Q000"
+    assert d["wiki_urls"] == ["https://en.wikipedia.org/wiki/Paris"]
+    assert d["reasoning_types"] == ["Temporal reasoning"]
diff --git a/surfsense_evals/tests/suites/test_frames_grader.py b/surfsense_evals/tests/suites/test_frames_grader.py
new file mode 100644
index 0000000000..e6e38ff8a4
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_frames_grader.py
@@ -0,0 +1,160 @@
+"""Tests for the FRAMES grader's deterministic shortcut.
+
+The LLM-judge fallback is excluded here (network call); we just
+confirm the rule-based path picks up obvious correct/incorrect
+cases and routes the ambiguous ones to ``lexical_miss`` so the
+runner knows to consult the judge.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.frames.grader import (
+    GradeResult,
+    _maybe_number,
+    _normalise,
+    _whole_word_substring,
+    grade_deterministic,
+)
+
+
+class TestNormalisation:
+    def test_lowercase_and_punct_stripped(self) -> None:
+        assert _normalise("Jane Ballou.") == "jane ballou"
+
+    def test_articles_removed(self) -> None:
+        assert _normalise("The Eiffel Tower") == "eiffel tower"
+
+    def test_whitespace_squashed(self) -> None:
+        assert _normalise("  multi   space\tinput  ") == "multi space input"
+
+    def test_empty_returns_empty(self) -> None:
+        assert _normalise("") == ""
+        assert _normalise(None) == ""  # type: ignore[arg-type]
+
+
+class TestNumericExtraction:
+    def test_simple_int(self) -> None:
+        assert _maybe_number("42") == 42.0
+
+    def test_int_with_commas(self) -> None:
+        assert _maybe_number("1,234") == 1234.0
+
+    def test_year_in_sentence(self) -> None:
+        assert _maybe_number("It was published in 1847.") == 1847.0
+
+    def test_word_number(self) -> None:
+        assert _maybe_number("five") == 5.0
+        assert _maybe_number("Twenty") == 20.0
+
+    def test_no_number_returns_none(self) -> None:
+        assert _maybe_number("Jane Ballou") is None
+        assert _maybe_number("") is None
+
+
+class TestWholeWordSubstring:
+    def test_phrase_match(self) -> None:
+        assert _whole_word_substring("president of the united states", "united states")
+
+    def test_word_boundary_required(self) -> None:
+        # "states" should NOT match inside "statesman"
+        assert not _whole_word_substring("the renowned statesman", "states")
+
+    def test_empty_needle(self) -> None:
+        assert not _whole_word_substring("anything", "")
+
+
+class TestExactMatch:
+    def test_identical(self) -> None:
+        r = grade_deterministic(pred="Jane Ballou", gold="Jane Ballou")
+        assert r.correct is True
+        assert r.method == "exact"
+
+    def test_case_insensitive(self) -> None:
+        r = grade_deterministic(pred="paris", gold="Paris")
+        assert r.correct is True
+        assert r.method == "exact"
+
+    def test_punctuation_ignored(self) -> None:
+        r = grade_deterministic(pred="Jane Ballou.", gold="Jane Ballou")
+        assert r.correct is True
+
+
+class TestNumericPath:
+    def test_int_match(self) -> None:
+        r = grade_deterministic(pred="The answer is 87", gold="87")
+        assert r.correct is True
+        assert r.method == "numeric"
+
+    def test_word_number_matches_digit(self) -> None:
+        r = grade_deterministic(pred="five", gold="5")
+        assert r.correct is True
+        assert r.method == "numeric"
+
+    def test_off_by_more_than_tolerance_fails(self) -> None:
+        r = grade_deterministic(pred="86", gold="87")
+        # 86 vs 87, abs diff = 1, tol = max(0.01*87, 0.5) = 0.87 → fails
+        assert r.correct is False
+        assert r.method == "numeric_miss"
+
+    def test_within_one_percent_passes(self) -> None:
+        r = grade_deterministic(pred="100", gold="101")
+        # 1.0 abs diff, tol = max(0.01*101, 0.5) = 1.01 → passes
+        assert r.correct is True
+
+
+class TestSubstringPath:
+    def test_pred_contains_gold(self) -> None:
+        r = grade_deterministic(
+            pred="The answer is Jane Ballou according to records",
+            gold="Jane Ballou",
+        )
+        assert r.correct is True
+        assert r.method == "substring"
+
+    def test_gold_contains_pred_with_minimum_length(self) -> None:
+        # Gold = "John F Kennedy", pred = "Kennedy" → reverse substring,
+        # ≥3 chars, but the FRAMES style usually accepts this.
+        r = grade_deterministic(pred="Kennedy", gold="John F. Kennedy")
+        assert r.correct is True
+        assert r.method == "substring_reverse"
+
+    def test_too_short_pred_no_reverse_credit(self) -> None:
+        r = grade_deterministic(pred="of", gold="World of Warcraft")
+        # "of" passes length but is a stopword; the article-stripping
+        # normaliser removes it from gold, so substring fails. Either
+        # way, the grader should NOT credit this.
+        assert r.correct is False
+
+
+class TestLexicalMiss:
+    def test_completely_different_pred_falls_through(self) -> None:
+        r = grade_deterministic(pred="London", gold="Paris")
+        assert r.correct is False
+        assert r.method == "lexical_miss"
+
+    def test_empty_pred(self) -> None:
+        r = grade_deterministic(pred="", gold="Paris")
+        assert r.correct is False
+        assert r.method == "empty_pred"
+
+    def test_empty_gold_defensive(self) -> None:
+        r = grade_deterministic(pred="something", gold="")
+        # Defensive guard — gold should never be empty in practice.
+        assert r.correct is False
+        assert r.method == "empty_gold"
+
+
+class TestGradeResultShape:
+    def test_dict_has_all_expected_keys(self) -> None:
+        r = grade_deterministic(pred="Paris", gold="Paris")
+        d = r.to_dict()
+        assert set(d) >= {
+            "correct",
+            "f1",
+            "method",
+            "normalised_pred",
+            "normalised_gold",
+            "judge_rationale",
+        }
diff --git a/surfsense_evals/tests/suites/test_frames_wiki_fetch.py b/surfsense_evals/tests/suites/test_frames_wiki_fetch.py
new file mode 100644
index 0000000000..4941756f4c
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_frames_wiki_fetch.py
@@ -0,0 +1,112 @@
+"""Tests for the FRAMES Wikipedia fetcher.
+
+We mock the MW API with respx so tests are network-free. Coverage:
+
+* URL → title parsing (percent-encoded, underscores, redirects)
+* Filename safety (slashes, special chars)
+* Cache hit short-circuits the API call
+* Missing pages return ``None`` (not an exception)
+* Successful fetches write ``# Title`` markdown to disk
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.suites.research.frames.wiki_fetch import (
+    WIKI_API,
+    WikiFetcher,
+    cache_filename_for_title,
+    title_from_url,
+)
+
+
+class TestTitleFromUrl:
+    def test_basic(self) -> None:
+        assert title_from_url("https://en.wikipedia.org/wiki/James_Buchanan") == "James Buchanan"
+
+    def test_percent_encoded(self) -> None:
+        assert (
+            title_from_url("https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB")
+            == "Charlotte Brontë"
+        )
+
+    def test_query_string_dropped(self) -> None:
+        assert title_from_url("https://en.wikipedia.org/wiki/Foo?action=edit") == "Foo"
+
+    def test_non_wiki_raises(self) -> None:
+        with pytest.raises(ValueError):
+            title_from_url("https://example.com/wiki/Foo")
+
+
+class TestCacheFilename:
+    def test_simple(self) -> None:
+        assert cache_filename_for_title("James Buchanan") == "James_Buchanan.md"
+
+    def test_unicode_replaced_with_underscore(self) -> None:
+        # Brontë's diaeresis is non-ASCII so the regex replaces it with `_`.
+        # The space → `_` happens after the unicode swap, so the final
+        # name has exactly one underscore for the diaeresis. Acceptable:
+        # filenames stay round-trippable as long as the rule is deterministic.
+        assert cache_filename_for_title("Charlotte Brontë") == "Charlotte_Bront_.md"
+
+    def test_slashes_replaced(self) -> None:
+        # Wikipedia titles can contain ``/`` (e.g. "I/O"), which would
+        # break the filesystem layout if not sanitised.
+        assert cache_filename_for_title("I/O") == "I_O.md"
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_success_writes_markdown(tmp_path: Path) -> None:
+    respx.get(WIKI_API).mock(return_value=httpx.Response(
+        200,
+        json={"query": {"pages": [{
+            "pageid": 1,
+            "title": "James Buchanan",
+            "extract": "James Buchanan was the 15th president of the United States.",
+        }]}},
+    ))
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)  # disable throttle
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/James_Buchanan")
+    assert article is not None
+    assert article.title == "James Buchanan"
+    body = article.markdown_path.read_text(encoding="utf-8")
+    assert body.startswith("# James Buchanan")
+    assert "15th president" in body
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_missing_page_returns_none(tmp_path: Path) -> None:
+    respx.get(WIKI_API).mock(return_value=httpx.Response(
+        200,
+        json={"query": {"pages": [{
+            "title": "DoesNotExist",
+            "missing": True,
+        }]}},
+    ))
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/DoesNotExist")
+    assert article is None
+    assert not (tmp_path / "DoesNotExist.md").exists()
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_cache_hit_skips_api(tmp_path: Path) -> None:
+    # Pre-populate the cache.
+    cached = tmp_path / cache_filename_for_title("Cached Page")
+    cached.write_text("# Cached Page\n\nfrom disk\n", encoding="utf-8")
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
+
+    # No respx mock registered; if the fetcher hits the network, respx
+    # would error out (it intercepts everything inside the decorator).
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/Cached_Page")
+    assert article is not None
+    assert article.markdown_path == cached
+    assert article.markdown_path.read_text(encoding="utf-8").endswith("from disk\n")
diff --git a/surfsense_evals/tests/suites/test_mmlongbench_grader.py b/surfsense_evals/tests/suites/test_mmlongbench_grader.py
new file mode 100644
index 0000000000..92cd5f0cb6
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_mmlongbench_grader.py
@@ -0,0 +1,129 @@
+"""Tests for the MMLongBench-Doc format-aware grader.
+
+The grader is the critical correctness piece for the open-ended
+benchmark (no MCQ shortcut), so we cover all five formats with
+representative happy-path + edge-case rows.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade
+
+
+class TestStrFormat:
+    def test_exact_match(self) -> None:
+        r = grade(pred="Apollo 11", gold="Apollo 11", answer_format="Str")
+        assert r.correct is True
+        assert r.f1 == 1.0
+        assert r.method == "str_norm"
+
+    def test_lowercase_normalised(self) -> None:
+        r = grade(pred="paris", gold="Paris", answer_format="Str")
+        assert r.correct is True
+
+    def test_punctuation_difference_drops_to_substring(self) -> None:
+        # "N.A.S.A." normalises to "n a s a" (whitespace tokens) which
+        # doesn't equal "nasa" — but the F1 token overlap is still 0
+        # because none of the single letters appear standalone in "nasa".
+        # We assert the grader fails closed rather than over-claiming.
+        r = grade(pred="N.A.S.A.", gold="NASA", answer_format="Str")
+        assert r.correct is False  # explicit: this is a failure mode we accept
+
+    def test_substring_credit(self) -> None:
+        r = grade(pred="The answer is Paris.", gold="Paris", answer_format="Str")
+        assert r.correct is True
+
+    def test_completely_wrong(self) -> None:
+        r = grade(pred="London", gold="Paris", answer_format="Str")
+        assert r.correct is False
+        assert r.f1 < 0.5
+
+    def test_empty_pred(self) -> None:
+        r = grade(pred="", gold="Paris", answer_format="Str")
+        assert r.correct is False
+        assert r.f1 == 0.0
+
+
+class TestIntFormat:
+    def test_exact_int(self) -> None:
+        assert grade(pred="42", gold="42", answer_format="Int").correct is True
+
+    def test_int_in_sentence(self) -> None:
+        assert grade(pred="The answer is 42 years.", gold="42", answer_format="Int").correct is True
+
+    def test_int_with_commas(self) -> None:
+        assert grade(pred="1,500", gold="1500", answer_format="Int").correct is True
+
+    def test_wrong_int(self) -> None:
+        assert grade(pred="41", gold="42", answer_format="Int").correct is False
+
+    def test_no_int_in_pred(self) -> None:
+        assert grade(pred="not answerable", gold="42", answer_format="Int").correct is False
+
+
+class TestFloatFormat:
+    def test_exact_float(self) -> None:
+        assert grade(pred="3.14", gold="3.14", answer_format="Float").correct is True
+
+    def test_within_tolerance(self) -> None:
+        # 1% tolerance — 3.14 vs 3.13 is well within.
+        assert grade(pred="3.13", gold="3.14", answer_format="Float").correct is True
+
+    def test_outside_tolerance(self) -> None:
+        assert grade(pred="3.5", gold="3.14", answer_format="Float").correct is False
+
+    def test_european_decimal_comma(self) -> None:
+        # ``3,14`` should parse as 3.14
+        assert grade(pred="3,14", gold="3.14", answer_format="Float").correct is True
+
+    def test_zero_gold_with_small_abs_diff(self) -> None:
+        # Absolute tolerance of 0.01 should kick in for near-zero golds.
+        assert grade(pred="0.005", gold="0", answer_format="Float").correct is True
+
+
+class TestListFormat:
+    def test_exact_set_match(self) -> None:
+        r = grade(pred="apple, banana, cherry", gold="apple, banana, cherry", answer_format="List")
+        assert r.correct is True
+        assert r.f1 == pytest.approx(1.0)
+
+    def test_set_match_different_order(self) -> None:
+        r = grade(pred="cherry, apple, banana", gold="apple, banana, cherry", answer_format="List")
+        assert r.correct is True
+
+    def test_partial_overlap_gives_f1(self) -> None:
+        r = grade(pred="apple, banana", gold="apple, banana, cherry", answer_format="List")
+        assert r.correct is False
+        assert 0.0 < r.f1 < 1.0
+
+    def test_extra_items_lower_precision(self) -> None:
+        r = grade(pred="apple, banana, cherry, date", gold="apple, banana, cherry", answer_format="List")
+        assert 0.0 < r.f1 < 1.0
+        # Recall=1, precision=3/4 → F1 ~= 0.857
+        assert r.f1 == pytest.approx(2 * (3 / 4) * 1 / (3 / 4 + 1), rel=1e-3)
+
+
+class TestNoneFormat:
+    def test_unknown_phrase_credited(self) -> None:
+        for phrase in ("Not answerable", "I cannot answer this.", "No answer", "N/A"):
+            r = grade(pred=phrase, gold="Not answerable", answer_format="None")
+            assert r.correct is True, phrase
+
+    def test_actual_answer_marked_wrong(self) -> None:
+        # The arm hallucinated an answer when it should have said "I don't know".
+        r = grade(pred="The answer is 42.", gold="Not answerable", answer_format="None")
+        assert r.correct is False
+
+
+class TestUnknownFormatFallsBackToStr:
+    def test_blank_format_uses_str_grader(self) -> None:
+        r = grade(pred="Paris", gold="Paris", answer_format="")
+        assert r.correct is True
+        assert r.method == "str_norm"
+
+    def test_garbage_format_uses_str_grader(self) -> None:
+        r = grade(pred="Paris", gold="Paris", answer_format="quux")
+        assert r.correct is True
+        assert r.method == "str_norm"
diff --git a/surfsense_evals/tests/test_integration_smoke.py b/surfsense_evals/tests/test_integration_smoke.py
new file mode 100644
index 0000000000..493c04c251
--- /dev/null
+++ b/surfsense_evals/tests/test_integration_smoke.py
@@ -0,0 +1,35 @@
+"""Opt-in integration smoke against ``http://localhost:8000``.
+
+Run with ``pytest -m integration``. Skipped by default. Touches the
+real backend — requires it to be reachable, OPENROUTER_API_KEY
+unrelated, and one credential mode set.
+"""
+
+from __future__ import annotations
+
+import os
+
+import httpx
+import pytest
+
+from surfsense_evals.core.auth import acquire_token, client_with_auth
+from surfsense_evals.core.config import load_config
+
+pytestmark = pytest.mark.integration
+
+
+@pytest.mark.asyncio
+async def test_smoke_against_localhost():
+    if "SURFSENSE_API_BASE" not in os.environ:
+        pytest.skip("SURFSENSE_API_BASE not set; skipping integration smoke")
+    config = load_config()
+    if config.credential_mode() == "none":
+        pytest.skip("No credentials in environment; skipping integration smoke")
+    bundle = await acquire_token(config)
+    async with client_with_auth(config, bundle) as client:
+        response = await client.get(f"{config.surfsense_api_base}/api/v1/global-new-llm-configs")
+        try:
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            pytest.fail(f"Backend rejected smoke call: {exc!s}")
+        assert isinstance(response.json(), list)
diff --git a/surfsense_evals/uv.lock b/surfsense_evals/uv.lock
new file mode 100644
index 0000000000..d2af421628
--- /dev/null
+++ b/surfsense_evals/uv.lock
@@ -0,0 +1,2684 @@
+version = 1
+revision = 1
+requires-python = ">=3.12"
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.14' and sys_platform == 'win32'",
+    "python_full_version < '3.14' and sys_platform == 'emscripten'",
+    "python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+]
+
+[[package]]
+name = "aiohappyeyeballs"
+version = "2.6.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265 },
+]
+
+[[package]]
+name = "aiohttp"
+version = "3.13.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohappyeyeballs" },
+    { name = "aiosignal" },
+    { name = "attrs" },
+    { name = "frozenlist" },
+    { name = "multidict" },
+    { name = "propcache" },
+    { name = "yarl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/6f/353954c29e7dcce7cf00280a02c75f30e133c00793c7a2ed3776d7b2f426/aiohttp-3.13.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:023ecba036ddd840b0b19bf195bfae970083fd7024ce1ac22e9bba90464620e9", size = 748876 },
+    { url = "https://files.pythonhosted.org/packages/f5/1b/428a7c64687b3b2e9cd293186695affc0e1e54a445d0361743b231f11066/aiohttp-3.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15c933ad7920b7d9a20de151efcd05a6e38302cbf0e10c9b2acb9a42210a2416", size = 499557 },
+    { url = "https://files.pythonhosted.org/packages/29/47/7be41556bfbb6917069d6a6634bb7dd5e163ba445b783a90d40f5ac7e3a7/aiohttp-3.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab2899f9fa2f9f741896ebb6fa07c4c883bfa5c7f2ddd8cf2aafa86fa981b2d2", size = 500258 },
+    { url = "https://files.pythonhosted.org/packages/67/84/c9ecc5828cb0b3695856c07c0a6817a99d51e2473400f705275a2b3d9239/aiohttp-3.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60eaa2d440cd4707696b52e40ed3e2b0f73f65be07fd0ef23b6b539c9c0b0b4", size = 1749199 },
+    { url = "https://files.pythonhosted.org/packages/f0/d3/3c6d610e66b495657622edb6ae7c7fd31b2e9086b4ec50b47897ad6042a9/aiohttp-3.13.5-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:55b3bdd3292283295774ab585160c4004f4f2f203946997f49aac032c84649e9", size = 1721013 },
+    { url = "https://files.pythonhosted.org/packages/49/a0/24409c12217456df0bae7babe3b014e460b0b38a8e60753d6cb339f6556d/aiohttp-3.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2b2355dc094e5f7d45a7bb262fe7207aa0460b37a0d87027dcf21b5d890e7d5", size = 1781501 },
+    { url = "https://files.pythonhosted.org/packages/98/9d/b65ec649adc5bccc008b0957a9a9c691070aeac4e41cea18559fef49958b/aiohttp-3.13.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b38765950832f7d728297689ad78f5f2cf79ff82487131c4d26fe6ceecdc5f8e", size = 1878981 },
+    { url = "https://files.pythonhosted.org/packages/57/d8/8d44036d7eb7b6a8ec4c5494ea0c8c8b94fbc0ed3991c1a7adf230df03bf/aiohttp-3.13.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b18f31b80d5a33661e08c89e202edabf1986e9b49c42b4504371daeaa11b47c1", size = 1767934 },
+    { url = "https://files.pythonhosted.org/packages/31/04/d3f8211f273356f158e3464e9e45484d3fb8c4ce5eb2f6fe9405c3273983/aiohttp-3.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:33add2463dde55c4f2d9635c6ab33ce154e5ecf322bd26d09af95c5f81cfa286", size = 1566671 },
+    { url = "https://files.pythonhosted.org/packages/41/db/073e4ebe00b78e2dfcacff734291651729a62953b48933d765dc513bf798/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:327cc432fdf1356fb4fbc6fe833ad4e9f6aacb71a8acaa5f1855e4b25910e4a9", size = 1705219 },
+    { url = "https://files.pythonhosted.org/packages/48/45/7dfba71a2f9fd97b15c95c06819de7eb38113d2cdb6319669195a7d64270/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7c35b0bf0b48a70b4cb4fc5d7bed9b932532728e124874355de1a0af8ec4bc88", size = 1743049 },
+    { url = "https://files.pythonhosted.org/packages/18/71/901db0061e0f717d226386a7f471bb59b19566f2cae5f0d93874b017271f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:df23d57718f24badef8656c49743e11a89fd6f5358fa8a7b96e728fda2abf7d3", size = 1749557 },
+    { url = "https://files.pythonhosted.org/packages/08/d5/41eebd16066e59cd43728fe74bce953d7402f2b4ddfdfef2c0e9f17ca274/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:02e048037a6501a5ec1f6fc9736135aec6eb8a004ce48838cb951c515f32c80b", size = 1558931 },
+    { url = "https://files.pythonhosted.org/packages/30/e6/4a799798bf05740e66c3a1161079bda7a3dd8e22ca392481d7a7f9af82a6/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31cebae8b26f8a615d2b546fee45d5ffb76852ae6450e2a03f42c9102260d6fe", size = 1774125 },
+    { url = "https://files.pythonhosted.org/packages/84/63/7749337c90f92bc2cb18f9560d67aa6258c7060d1397d21529b8004fcf6f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:888e78eb5ca55a615d285c3c09a7a91b42e9dd6fc699b166ebd5dee87c9ccf14", size = 1732427 },
+    { url = "https://files.pythonhosted.org/packages/98/de/cf2f44ff98d307e72fb97d5f5bbae3bfcb442f0ea9790c0bf5c5c2331404/aiohttp-3.13.5-cp312-cp312-win32.whl", hash = "sha256:8bd3ec6376e68a41f9f95f5ed170e2fcf22d4eb27a1f8cb361d0508f6e0557f3", size = 433534 },
+    { url = "https://files.pythonhosted.org/packages/aa/ca/eadf6f9c8fa5e31d40993e3db153fb5ed0b11008ad5d9de98a95045bed84/aiohttp-3.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:110e448e02c729bcebb18c60b9214a87ba33bac4a9fa5e9a5f139938b56c6cb1", size = 460446 },
+    { url = "https://files.pythonhosted.org/packages/78/e9/d76bf503005709e390122d34e15256b88f7008e246c4bdbe915cd4f1adce/aiohttp-3.13.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5029cc80718bbd545123cd8fe5d15025eccaaaace5d0eeec6bd556ad6163d61", size = 742930 },
+    { url = "https://files.pythonhosted.org/packages/57/00/4b7b70223deaebd9bb85984d01a764b0d7bd6526fcdc73cca83bcbe7243e/aiohttp-3.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bb6bf5811620003614076bdc807ef3b5e38244f9d25ca5fe888eaccea2a9832", size = 496927 },
+    { url = "https://files.pythonhosted.org/packages/9c/f5/0fb20fb49f8efdcdce6cd8127604ad2c503e754a8f139f5e02b01626523f/aiohttp-3.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a84792f8631bf5a94e52d9cc881c0b824ab42717165a5579c760b830d9392ac9", size = 497141 },
+    { url = "https://files.pythonhosted.org/packages/3b/86/b7c870053e36a94e8951b803cb5b909bfbc9b90ca941527f5fcafbf6b0fa/aiohttp-3.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57653eac22c6a4c13eb22ecf4d673d64a12f266e72785ab1c8b8e5940d0e8090", size = 1732476 },
+    { url = "https://files.pythonhosted.org/packages/b5/e5/4e161f84f98d80c03a238671b4136e6530453d65262867d989bbe78244d0/aiohttp-3.13.5-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5e5f7debc7a57af53fdf5c5009f9391d9f4c12867049d509bf7bb164a6e295b", size = 1706507 },
+    { url = "https://files.pythonhosted.org/packages/d4/56/ea11a9f01518bd5a2a2fcee869d248c4b8a0cfa0bb13401574fa31adf4d4/aiohttp-3.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c719f65bebcdf6716f10e9eff80d27567f7892d8988c06de12bbbd39307c6e3a", size = 1773465 },
+    { url = "https://files.pythonhosted.org/packages/eb/40/333ca27fb74b0383f17c90570c748f7582501507307350a79d9f9f3c6eb1/aiohttp-3.13.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d97f93fdae594d886c5a866636397e2bcab146fd7a132fd6bb9ce182224452f8", size = 1873523 },
+    { url = "https://files.pythonhosted.org/packages/f0/d2/e2f77eef1acb7111405433c707dc735e63f67a56e176e72e9e7a2cd3f493/aiohttp-3.13.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3df334e39d4c2f899a914f1dba283c1aadc311790733f705182998c6f7cae665", size = 1754113 },
+    { url = "https://files.pythonhosted.org/packages/fb/56/3f653d7f53c89669301ec9e42c95233e2a0c0a6dd051269e6e678db4fdb0/aiohttp-3.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe6970addfea9e5e081401bcbadf865d2b6da045472f58af08427e108d618540", size = 1562351 },
+    { url = "https://files.pythonhosted.org/packages/ec/a6/9b3e91eb8ae791cce4ee736da02211c85c6f835f1bdfac0594a8a3b7018c/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7becdf835feff2f4f335d7477f121af787e3504b48b449ff737afb35869ba7bb", size = 1693205 },
+    { url = "https://files.pythonhosted.org/packages/98/fc/bfb437a99a2fcebd6b6eaec609571954de2ed424f01c352f4b5504371dd3/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:676e5651705ad5d8a70aeb8eb6936c436d8ebbd56e63436cb7dd9bb36d2a9a46", size = 1730618 },
+    { url = "https://files.pythonhosted.org/packages/e4/b6/c8534862126191a034f68153194c389addc285a0f1347d85096d349bbc15/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:9b16c653d38eb1a611cc898c41e76859ca27f119d25b53c12875fd0474ae31a8", size = 1745185 },
+    { url = "https://files.pythonhosted.org/packages/0b/93/4ca8ee2ef5236e2707e0fd5fecb10ce214aee1ff4ab307af9c558bda3b37/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:999802d5fa0389f58decd24b537c54aa63c01c3219ce17d1214cbda3c2b22d2d", size = 1557311 },
+    { url = "https://files.pythonhosted.org/packages/57/ae/76177b15f18c5f5d094f19901d284025db28eccc5ae374d1d254181d33f4/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ec707059ee75732b1ba130ed5f9580fe10ff75180c812bc267ded039db5128c6", size = 1773147 },
+    { url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356 },
+    { url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637 },
+    { url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896 },
+    { url = "https://files.pythonhosted.org/packages/5d/ce/46572759afc859e867a5bc8ec3487315869013f59281ce61764f76d879de/aiohttp-3.13.5-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:eb4639f32fd4a9904ab8fb45bf3383ba71137f3d9d4ba25b3b3f3109977c5b8c", size = 745721 },
+    { url = "https://files.pythonhosted.org/packages/13/fe/8a2efd7626dbe6049b2ef8ace18ffda8a4dfcbe1bcff3ac30c0c7575c20b/aiohttp-3.13.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:7e5dc4311bd5ac493886c63cbf76ab579dbe4641268e7c74e48e774c74b6f2be", size = 497663 },
+    { url = "https://files.pythonhosted.org/packages/9b/91/cc8cc78a111826c54743d88651e1687008133c37e5ee615fee9b57990fac/aiohttp-3.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:756c3c304d394977519824449600adaf2be0ccee76d206ee339c5e76b70ded25", size = 499094 },
+    { url = "https://files.pythonhosted.org/packages/0a/33/a8362cb15cf16a3af7e86ed11962d5cd7d59b449202dc576cdc731310bde/aiohttp-3.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecc26751323224cf8186efcf7fbcbc30f4e1d8c7970659daf25ad995e4032a56", size = 1726701 },
+    { url = "https://files.pythonhosted.org/packages/45/0c/c091ac5c3a17114bd76cbf85d674650969ddf93387876cf67f754204bd77/aiohttp-3.13.5-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10a75acfcf794edf9d8db50e5a7ec5fc818b2a8d3f591ce93bc7b1210df016d2", size = 1683360 },
+    { url = "https://files.pythonhosted.org/packages/23/73/bcee1c2b79bc275e964d1446c55c54441a461938e70267c86afaae6fba27/aiohttp-3.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0f7a18f258d124cd678c5fe072fe4432a4d5232b0657fca7c1847f599233c83a", size = 1773023 },
+    { url = "https://files.pythonhosted.org/packages/c7/ef/720e639df03004fee2d869f771799d8c23046dec47d5b81e396c7cda583a/aiohttp-3.13.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:df6104c009713d3a89621096f3e3e88cc323fd269dbd7c20afe18535094320be", size = 1853795 },
+    { url = "https://files.pythonhosted.org/packages/bd/c9/989f4034fb46841208de7aeeac2c6d8300745ab4f28c42f629ba77c2d916/aiohttp-3.13.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a94f7de7c0c3b616627aaad530fe2cb620084a8b144d3be7b6ecfe95bae3b", size = 1730405 },
+    { url = "https://files.pythonhosted.org/packages/ce/75/ee1fd286ca7dc599d824b5651dad7b3be7ff8d9a7e7b3fe9820d9180f7db/aiohttp-3.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c974fb66180e58709b6fc402846f13791240d180b74de81d23913abe48e96d94", size = 1558082 },
+    { url = "https://files.pythonhosted.org/packages/c3/20/1e9e6650dfc436340116b7aa89ff8cb2bbdf0abc11dfaceaad8f74273a10/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6e27ea05d184afac78aabbac667450c75e54e35f62238d44463131bd3f96753d", size = 1692346 },
+    { url = "https://files.pythonhosted.org/packages/d8/40/8ebc6658d48ea630ac7903912fe0dd4e262f0e16825aa4c833c56c9f1f56/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a79a6d399cef33a11b6f004c67bb07741d91f2be01b8d712d52c75711b1e07c7", size = 1698891 },
+    { url = "https://files.pythonhosted.org/packages/d8/78/ea0ae5ec8ba7a5c10bdd6e318f1ba5e76fcde17db8275188772afc7917a4/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c632ce9c0b534fbe25b52c974515ed674937c5b99f549a92127c85f771a78772", size = 1742113 },
+    { url = "https://files.pythonhosted.org/packages/8a/66/9d308ed71e3f2491be1acb8769d96c6f0c47d92099f3bc9119cada27b357/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:fceedde51fbd67ee2bcc8c0b33d0126cc8b51ef3bbde2f86662bd6d5a6f10ec5", size = 1553088 },
+    { url = "https://files.pythonhosted.org/packages/da/a6/6cc25ed8dfc6e00c90f5c6d126a98e2cf28957ad06fa1036bd34b6f24a2c/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f92995dfec9420bb69ae629abf422e516923ba79ba4403bc750d94fb4a6c68c1", size = 1757976 },
+    { url = "https://files.pythonhosted.org/packages/c1/2b/cce5b0ffe0de99c83e5e36d8f828e4161e415660a9f3e58339d07cce3006/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20ae0ff08b1f2c8788d6fb85afcb798654ae6ba0b747575f8562de738078457b", size = 1712444 },
+    { url = "https://files.pythonhosted.org/packages/6c/cf/9e1795b4160c58d29421eafd1a69c6ce351e2f7c8d3c6b7e4ca44aea1a5b/aiohttp-3.13.5-cp314-cp314-win32.whl", hash = "sha256:b20df693de16f42b2472a9c485e1c948ee55524786a0a34345511afdd22246f3", size = 438128 },
+    { url = "https://files.pythonhosted.org/packages/22/4d/eaedff67fc805aeba4ba746aec891b4b24cebb1a7d078084b6300f79d063/aiohttp-3.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:f85c6f327bf0b8c29da7d93b1cabb6363fb5e4e160a32fa241ed2dce21b73162", size = 464029 },
+    { url = "https://files.pythonhosted.org/packages/79/11/c27d9332ee20d68dd164dc12a6ecdef2e2e35ecc97ed6cf0d2442844624b/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1efb06900858bb618ff5cee184ae2de5828896c448403d51fb633f09e109be0a", size = 778758 },
+    { url = "https://files.pythonhosted.org/packages/04/fb/377aead2e0a3ba5f09b7624f702a964bdf4f08b5b6728a9799830c80041e/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fee86b7c4bd29bdaf0d53d14739b08a106fdda809ca5fe032a15f52fae5fe254", size = 512883 },
+    { url = "https://files.pythonhosted.org/packages/bb/a6/aa109a33671f7a5d3bd78b46da9d852797c5e665bfda7d6b373f56bff2ec/aiohttp-3.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:20058e23909b9e65f9da62b396b77dfa95965cbe840f8def6e572538b1d32e36", size = 516668 },
+    { url = "https://files.pythonhosted.org/packages/79/b3/ca078f9f2fa9563c36fb8ef89053ea2bb146d6f792c5104574d49d8acb63/aiohttp-3.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cf20a8d6868cb15a73cab329ffc07291ba8c22b1b88176026106ae39aa6df0f", size = 1883461 },
+    { url = "https://files.pythonhosted.org/packages/b7/e3/a7ad633ca1ca497b852233a3cce6906a56c3225fb6d9217b5e5e60b7419d/aiohttp-3.13.5-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:330f5da04c987f1d5bdb8ae189137c77139f36bd1cb23779ca1a354a4b027800", size = 1747661 },
+    { url = "https://files.pythonhosted.org/packages/33/b9/cd6fe579bed34a906d3d783fe60f2fa297ef55b27bb4538438ee49d4dc41/aiohttp-3.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6f1cbf0c7926d315c3c26c2da41fd2b5d2fe01ac0e157b78caefc51a782196cf", size = 1863800 },
+    { url = "https://files.pythonhosted.org/packages/c0/3f/2c1e2f5144cefa889c8afd5cf431994c32f3b29da9961698ff4e3811b79a/aiohttp-3.13.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:53fc049ed6390d05423ba33103ded7281fe897cf97878f369a527070bd95795b", size = 1958382 },
+    { url = "https://files.pythonhosted.org/packages/66/1d/f31ec3f1013723b3babe3609e7f119c2c2fb6ef33da90061a705ef3e1bc8/aiohttp-3.13.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:898703aa2667e3c5ca4c54ca36cd73f58b7a38ef87a5606414799ebce4d3fd3a", size = 1803724 },
+    { url = "https://files.pythonhosted.org/packages/0e/b4/57712dfc6f1542f067daa81eb61da282fab3e6f1966fca25db06c4fc62d5/aiohttp-3.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0494a01ca9584eea1e5fbd6d748e61ecff218c51b576ee1999c23db7066417d8", size = 1640027 },
+    { url = "https://files.pythonhosted.org/packages/25/3c/734c878fb43ec083d8e31bf029daae1beafeae582d1b35da234739e82ee7/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6cf81fe010b8c17b09495cbd15c1d35afbc8fb405c0c9cf4738e5ae3af1d65be", size = 1806644 },
+    { url = "https://files.pythonhosted.org/packages/20/a5/f671e5cbec1c21d044ff3078223f949748f3a7f86b14e34a365d74a5d21f/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:c564dd5f09ddc9d8f2c2d0a301cd30a79a2cc1b46dd1a73bef8f0038863d016b", size = 1791630 },
+    { url = "https://files.pythonhosted.org/packages/0b/63/fb8d0ad63a0b8a99be97deac8c04dacf0785721c158bdf23d679a87aa99e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2994be9f6e51046c4f864598fd9abeb4fba6e88f0b2152422c9666dcd4aea9c6", size = 1809403 },
+    { url = "https://files.pythonhosted.org/packages/59/0c/bfed7f30662fcf12206481c2aac57dedee43fe1c49275e85b3a1e1742294/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:157826e2fa245d2ef46c83ea8a5faf77ca19355d278d425c29fda0beb3318037", size = 1634924 },
+    { url = "https://files.pythonhosted.org/packages/17/d6/fd518d668a09fd5a3319ae5e984d4d80b9a4b3df4e21c52f02251ef5a32e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a8aca50daa9493e9e13c0f566201a9006f080e7c50e5e90d0b06f53146a54500", size = 1836119 },
+    { url = "https://files.pythonhosted.org/packages/78/b7/15fb7a9d52e112a25b621c67b69c167805cb1f2ab8f1708a5c490d1b52fe/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3b13560160d07e047a93f23aaa30718606493036253d5430887514715b67c9d9", size = 1772072 },
+    { url = "https://files.pythonhosted.org/packages/7e/df/57ba7f0c4a553fc2bd8b6321df236870ec6fd64a2a473a8a13d4f733214e/aiohttp-3.13.5-cp314-cp314t-win32.whl", hash = "sha256:9a0f4474b6ea6818b41f82172d799e4b3d29e22c2c520ce4357856fced9af2f8", size = 471819 },
+    { url = "https://files.pythonhosted.org/packages/62/29/2f8418269e46454a26171bfdd6a055d74febf32234e474930f2f60a17145/aiohttp-3.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:18a2f6c1182c51baa1d28d68fea51513cb2a76612f038853c0ad3c145423d3d9", size = 505441 },
+]
+
+[[package]]
+name = "aiosignal"
+version = "1.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "frozenlist" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490 },
+]
+
+[[package]]
+name = "aiosqlite"
+version = "0.22.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4e/8a/64761f4005f17809769d23e518d915db74e6310474e733e3593cfc854ef1/aiosqlite-0.22.1.tar.gz", hash = "sha256:043e0bd78d32888c0a9ca90fc788b38796843360c855a7262a532813133a0650", size = 14821 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/00/b7/e3bf5133d697a08128598c8d0abc5e16377b51465a33756de24fa7dee953/aiosqlite-0.22.1-py3-none-any.whl", hash = "sha256:21c002eb13823fad740196c5a2e9d8e62f6243bd9e7e4a1f87fb5e44ecb4fceb", size = 17405 },
+]
+
+[[package]]
+name = "annotated-doc"
+version = "0.0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303 },
+]
+
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 },
+]
+
+[[package]]
+name = "anyio"
+version = "4.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "idna" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353 },
+]
+
+[[package]]
+name = "attrs"
+version = "26.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548 },
+]
+
+[[package]]
+name = "azure-ai-documentintelligence"
+version = "1.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "azure-core" },
+    { name = "isodate" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
+]
+
+[[package]]
+name = "azure-core"
+version = "1.41.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "requests" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a6/f3/b416179e408990df5db0d516283022dde0f5d0111d98c1a848e41853e81c/azure_core-1.41.0.tar.gz", hash = "sha256:f46ff5dfcd230f25cf1c19e8a34b8dc08a337b2503e268bb600a16c00db8ad5a", size = 381042 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5b/db/325c6d7312d2200251c52323878281045aaffcb5586612296484e4280eaa/azure_core-1.41.0-py3-none-any.whl", hash = "sha256:522b4011e8180b1a3dcd2024396a4e7fe9ac37fb8597db47163d230b5efe892d", size = 220920 },
+]
+
+[[package]]
+name = "babel"
+version = "2.18.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845 },
+]
+
+[[package]]
+name = "banks"
+version = "2.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "deprecated" },
+    { name = "filetype" },
+    { name = "griffe" },
+    { name = "jinja2" },
+    { name = "platformdirs" },
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bd/51/08fb68d23f4b0f6256fe85dc86e9576941550f890b079352fba719e07b39/banks-2.4.2.tar.gz", hash = "sha256:cda6013bd377ea7b701933578bfb9370fc21ad70bc13cedfc3f5cb2c034ca3dc", size = 188633 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/00/b6/8dc5477681b782e2f99de703e7a99828883364b9e03a60d3e2c47053d56a/banks-2.4.2-py3-none-any.whl", hash = "sha256:5fe407cc48c101f3e13d1cf732b83b8246003337612f13c0705d2e81f6faffb7", size = 35050 },
+]
+
+[[package]]
+name = "certifi"
+version = "2026.4.22"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707 },
+]
+
+[[package]]
+name = "charset-normalizer"
+version = "3.4.7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328 },
+    { url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061 },
+    { url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031 },
+    { url = "https://files.pythonhosted.org/packages/dc/67/675a46eb016118a2fbde5a277a5d15f4f69d5f3f5f338e5ee2f8948fcf43/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a", size = 225239 },
+    { url = "https://files.pythonhosted.org/packages/4b/f8/d0118a2f5f23b02cd166fa385c60f9b0d4f9194f574e2b31cef350ad7223/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116", size = 216589 },
+    { url = "https://files.pythonhosted.org/packages/b1/f1/6d2b0b261b6c4ceef0fcb0d17a01cc5bc53586c2d4796fa04b5c540bc13d/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb", size = 202733 },
+    { url = "https://files.pythonhosted.org/packages/6f/c0/7b1f943f7e87cc3db9626ba17807d042c38645f0a1d4415c7a14afb5591f/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1", size = 212652 },
+    { url = "https://files.pythonhosted.org/packages/38/dd/5a9ab159fe45c6e72079398f277b7d2b523e7f716acc489726115a910097/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15", size = 211229 },
+    { url = "https://files.pythonhosted.org/packages/d5/ff/531a1cad5ca855d1c1a8b69cb71abfd6d85c0291580146fda7c82857caa1/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5", size = 203552 },
+    { url = "https://files.pythonhosted.org/packages/c1/4c/a5fb52d528a8ca41f7598cb619409ece30a169fbdf9cdce592e53b46c3a6/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d", size = 230806 },
+    { url = "https://files.pythonhosted.org/packages/59/7a/071feed8124111a32b316b33ae4de83d36923039ef8cf48120266844285b/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7", size = 212316 },
+    { url = "https://files.pythonhosted.org/packages/fd/35/f7dba3994312d7ba508e041eaac39a36b120f32d4c8662b8814dab876431/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464", size = 227274 },
+    { url = "https://files.pythonhosted.org/packages/8a/2d/a572df5c9204ab7688ec1edc895a73ebded3b023bb07364710b05dd1c9be/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49", size = 218468 },
+    { url = "https://files.pythonhosted.org/packages/86/eb/890922a8b03a568ca2f336c36585a4713c55d4d67bf0f0c78924be6315ca/charset_normalizer-3.4.7-cp312-cp312-win32.whl", hash = "sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c", size = 148460 },
+    { url = "https://files.pythonhosted.org/packages/35/d9/0e7dffa06c5ab081f75b1b786f0aefc88365825dfcd0ac544bdb7b2b6853/charset_normalizer-3.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6", size = 159330 },
+    { url = "https://files.pythonhosted.org/packages/9e/5d/481bcc2a7c88ea6b0878c299547843b2521ccbc40980cb406267088bc701/charset_normalizer-3.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d", size = 147828 },
+    { url = "https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063", size = 309627 },
+    { url = "https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c", size = 207008 },
+    { url = "https://files.pythonhosted.org/packages/c4/bb/ec73c0257c9e11b268f018f068f5d00aa0ef8c8b09f7753ebd5f2880e248/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66", size = 228303 },
+    { url = "https://files.pythonhosted.org/packages/85/fb/32d1f5033484494619f701e719429c69b766bfc4dbc61aa9e9c8c166528b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18", size = 224282 },
+    { url = "https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd", size = 215595 },
+    { url = "https://files.pythonhosted.org/packages/e3/7c/fc890655786e423f02556e0216d4b8c6bcb6bdfa890160dc66bf52dee468/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215", size = 201986 },
+    { url = "https://files.pythonhosted.org/packages/d8/97/bfb18b3db2aed3b90cf54dc292ad79fdd5ad65c4eae454099475cbeadd0d/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859", size = 211711 },
+    { url = "https://files.pythonhosted.org/packages/6f/a5/a581c13798546a7fd557c82614a5c65a13df2157e9ad6373166d2a3e645d/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8", size = 210036 },
+    { url = "https://files.pythonhosted.org/packages/8c/bf/b3ab5bcb478e4193d517644b0fb2bf5497fbceeaa7a1bc0f4d5b50953861/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5", size = 202998 },
+    { url = "https://files.pythonhosted.org/packages/e7/4e/23efd79b65d314fa320ec6017b4b5834d5c12a58ba4610aa353af2e2f577/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832", size = 230056 },
+    { url = "https://files.pythonhosted.org/packages/b9/9f/1e1941bc3f0e01df116e68dc37a55c4d249df5e6fa77f008841aef68264f/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6", size = 211537 },
+    { url = "https://files.pythonhosted.org/packages/80/0f/088cbb3020d44428964a6c97fe1edfb1b9550396bf6d278330281e8b709c/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48", size = 226176 },
+    { url = "https://files.pythonhosted.org/packages/6a/9f/130394f9bbe06f4f63e22641d32fc9b202b7e251c9aef4db044324dac493/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a", size = 217723 },
+    { url = "https://files.pythonhosted.org/packages/73/55/c469897448a06e49f8fa03f6caae97074fde823f432a98f979cc42b90e69/charset_normalizer-3.4.7-cp313-cp313-win32.whl", hash = "sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e", size = 148085 },
+    { url = "https://files.pythonhosted.org/packages/5d/78/1b74c5bbb3f99b77a1715c91b3e0b5bdb6fe302d95ace4f5b1bec37b0167/charset_normalizer-3.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110", size = 158819 },
+    { url = "https://files.pythonhosted.org/packages/68/86/46bd42279d323deb8687c4a5a811fd548cb7d1de10cf6535d099877a9a9f/charset_normalizer-3.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b", size = 147915 },
+    { url = "https://files.pythonhosted.org/packages/97/c8/c67cb8c70e19ef1960b97b22ed2a1567711de46c4ddf19799923adc836c2/charset_normalizer-3.4.7-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0", size = 309234 },
+    { url = "https://files.pythonhosted.org/packages/99/85/c091fdee33f20de70d6c8b522743b6f831a2f1cd3ff86de4c6a827c48a76/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a", size = 208042 },
+    { url = "https://files.pythonhosted.org/packages/87/1c/ab2ce611b984d2fd5d86a5a8a19c1ae26acac6bad967da4967562c75114d/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b", size = 228706 },
+    { url = "https://files.pythonhosted.org/packages/a8/29/2b1d2cb00bf085f59d29eb773ce58ec2d325430f8c216804a0a5cd83cbca/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41", size = 224727 },
+    { url = "https://files.pythonhosted.org/packages/47/5c/032c2d5a07fe4d4855fea851209cca2b6f03ebeb6d4e3afdb3358386a684/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e", size = 215882 },
+    { url = "https://files.pythonhosted.org/packages/2c/c2/356065d5a8b78ed04499cae5f339f091946a6a74f91e03476c33f0ab7100/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae", size = 200860 },
+    { url = "https://files.pythonhosted.org/packages/0c/cd/a32a84217ced5039f53b29f460962abb2d4420def55afabe45b1c3c7483d/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18", size = 211564 },
+    { url = "https://files.pythonhosted.org/packages/44/86/58e6f13ce26cc3b8f4a36b94a0f22ae2f00a72534520f4ae6857c4b81f89/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b", size = 211276 },
+    { url = "https://files.pythonhosted.org/packages/8f/fe/d17c32dc72e17e155e06883efa84514ca375f8a528ba2546bee73fc4df81/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356", size = 201238 },
+    { url = "https://files.pythonhosted.org/packages/6a/29/f33daa50b06525a237451cdb6c69da366c381a3dadcd833fa5676bc468b3/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab", size = 230189 },
+    { url = "https://files.pythonhosted.org/packages/b6/6e/52c84015394a6a0bdcd435210a7e944c5f94ea1055f5cc5d56c5fe368e7b/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46", size = 211352 },
+    { url = "https://files.pythonhosted.org/packages/8c/d7/4353be581b373033fb9198bf1da3cf8f09c1082561e8e922aa7b39bf9fe8/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44", size = 227024 },
+    { url = "https://files.pythonhosted.org/packages/30/45/99d18aa925bd1740098ccd3060e238e21115fffbfdcb8f3ece837d0ace6c/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72", size = 217869 },
+    { url = "https://files.pythonhosted.org/packages/5c/05/5ee478aa53f4bb7996482153d4bfe1b89e0f087f0ab6b294fcf92d595873/charset_normalizer-3.4.7-cp314-cp314-win32.whl", hash = "sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10", size = 148541 },
+    { url = "https://files.pythonhosted.org/packages/48/77/72dcb0921b2ce86420b2d79d454c7022bf5be40202a2a07906b9f2a35c97/charset_normalizer-3.4.7-cp314-cp314-win_amd64.whl", hash = "sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f", size = 159634 },
+    { url = "https://files.pythonhosted.org/packages/c6/a3/c2369911cd72f02386e4e340770f6e158c7980267da16af8f668217abaa0/charset_normalizer-3.4.7-cp314-cp314-win_arm64.whl", hash = "sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246", size = 148384 },
+    { url = "https://files.pythonhosted.org/packages/94/09/7e8a7f73d24dba1f0035fbbf014d2c36828fc1bf9c88f84093e57d315935/charset_normalizer-3.4.7-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24", size = 330133 },
+    { url = "https://files.pythonhosted.org/packages/8d/da/96975ddb11f8e977f706f45cddd8540fd8242f71ecdb5d18a80723dcf62c/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79", size = 216257 },
+    { url = "https://files.pythonhosted.org/packages/e5/e8/1d63bf8ef2d388e95c64b2098f45f84758f6d102a087552da1485912637b/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960", size = 234851 },
+    { url = "https://files.pythonhosted.org/packages/9b/40/e5ff04233e70da2681fa43969ad6f66ca5611d7e669be0246c4c7aaf6dc8/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4", size = 233393 },
+    { url = "https://files.pythonhosted.org/packages/be/c1/06c6c49d5a5450f76899992f1ee40b41d076aee9279b49cf9974d2f313d5/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e", size = 223251 },
+    { url = "https://files.pythonhosted.org/packages/2b/9f/f2ff16fb050946169e3e1f82134d107e5d4ae72647ec8a1b1446c148480f/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1", size = 206609 },
+    { url = "https://files.pythonhosted.org/packages/69/d5/a527c0cd8d64d2eab7459784fb4169a0ac76e5a6fc5237337982fd61347e/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44", size = 220014 },
+    { url = "https://files.pythonhosted.org/packages/7e/80/8a7b8104a3e203074dc9aa2c613d4b726c0e136bad1cc734594b02867972/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e", size = 218979 },
+    { url = "https://files.pythonhosted.org/packages/02/9a/b759b503d507f375b2b5c153e4d2ee0a75aa215b7f2489cf314f4541f2c0/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3", size = 209238 },
+    { url = "https://files.pythonhosted.org/packages/c2/4e/0f3f5d47b86bdb79256e7290b26ac847a2832d9a4033f7eb2cd4bcf4bb5b/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0", size = 236110 },
+    { url = "https://files.pythonhosted.org/packages/96/23/bce28734eb3ed2c91dcf93abeb8a5cf393a7b2749725030bb630e554fdd8/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e", size = 219824 },
+    { url = "https://files.pythonhosted.org/packages/2c/6f/6e897c6984cc4d41af319b077f2f600fc8214eb2fe2d6bcb79141b882400/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb", size = 233103 },
+    { url = "https://files.pythonhosted.org/packages/76/22/ef7bd0fe480a0ae9b656189ec00744b60933f68b4f42a7bb06589f6f576a/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe", size = 225194 },
+    { url = "https://files.pythonhosted.org/packages/c5/a7/0e0ab3e0b5bc1219bd80a6a0d4d72ca74d9250cb2382b7c699c147e06017/charset_normalizer-3.4.7-cp314-cp314t-win32.whl", hash = "sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0", size = 159827 },
+    { url = "https://files.pythonhosted.org/packages/7a/1d/29d32e0fb40864b1f878c7f5a0b343ae676c6e2b271a2d55cc3a152391da/charset_normalizer-3.4.7-cp314-cp314t-win_amd64.whl", hash = "sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c", size = 174168 },
+    { url = "https://files.pythonhosted.org/packages/de/32/d92444ad05c7a6e41fb2036749777c163baf7a0301a040cb672d6b2b1ae9/charset_normalizer-3.4.7-cp314-cp314t-win_arm64.whl", hash = "sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d", size = 153018 },
+    { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958 },
+]
+
+[[package]]
+name = "click"
+version = "8.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bb/63/f9e1ea081ce35720d8b92acde70daaedace594dc93b693c869e0d5910718/click-8.3.3.tar.gz", hash = "sha256:398329ad4837b2ff7cbe1dd166a4c0f8900c3ca3a218de04466f38f6497f18a2", size = 328061 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ae/44/c1221527f6a71a01ec6fbad7fa78f1d50dfa02217385cf0fa3eec7087d59/click-8.3.3-py3-none-any.whl", hash = "sha256:a2bf429bb3033c89fa4936ffb35d5cb471e3719e1f3c8a7c3fff0b8314305613", size = 110502 },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
+]
+
+[[package]]
+name = "courlan"
+version = "1.3.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "babel" },
+    { name = "tld" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6f/54/6d6ceeff4bed42e7a10d6064d35ee43a810e7b3e8beb4abeae8cff4713ae/courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190", size = 206382 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8e/ca/6a667ccbe649856dcd3458bab80b016681b274399d6211187c6ab969fc50/courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be", size = 33848 },
+]
+
+[[package]]
+name = "dataclasses-json"
+version = "0.6.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "marshmallow" },
+    { name = "typing-inspect" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/64/a4/f71d9cf3a5ac257c993b5ca3f93df5f7fb395c725e7f1e6479d2514173c3/dataclasses_json-0.6.7.tar.gz", hash = "sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0", size = 32227 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686 },
+]
+
+[[package]]
+name = "datasets"
+version = "4.8.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dill" },
+    { name = "filelock" },
+    { name = "fsspec", extra = ["http"] },
+    { name = "httpx" },
+    { name = "huggingface-hub" },
+    { name = "multiprocess" },
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "pandas" },
+    { name = "pyarrow" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+    { name = "xxhash" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/34/14cd8e76f907f7d4dca2334cfeec9f81d30fd15c25a015f99aaea694eaed/datasets-4.8.5.tar.gz", hash = "sha256:0f0c1c3d56ffff2c93b2f4c63c95bac94f3d7e8621aea2a2a576275233bba772", size = 605649 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/65/99/00f3196036501b53032c4b1ab8337a0b978dee832ed276dae3815df4e8b5/datasets-4.8.5-py3-none-any.whl", hash = "sha256:5079900781719c0e063a8efdd2cd95a31ad0c63209178669cd23cf1b926149ff", size = 528973 },
+]
+
+[[package]]
+name = "dateparser"
+version = "1.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "python-dateutil" },
+    { name = "pytz" },
+    { name = "regex" },
+    { name = "tzlocal" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/46/2d/a0ccdb78788064fa0dc901b8524e50615c42be1d78b78d646d0b28d09180/dateparser-1.4.0.tar.gz", hash = "sha256:97a21840d5ecdf7630c584f673338a5afac5dfe84f647baf4d7e8df98f9354a4", size = 321512 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b4/0b/3c3bb7cbe757279e693a0be6049048012f794d01f81099609ecd53b899f0/dateparser-1.4.0-py3-none-any.whl", hash = "sha256:7902b8e85d603494bf70a5a0b1decdddb2270b9c6e6b2bc8a57b93476c0df378", size = 300379 },
+]
+
+[[package]]
+name = "deprecated"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298 },
+]
+
+[[package]]
+name = "dill"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019 },
+]
+
+[[package]]
+name = "dirtyjson"
+version = "1.0.8"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/db/04/d24f6e645ad82ba0ef092fa17d9ef7a21953781663648a01c9371d9e8e98/dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd", size = 30782 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/68/69/1bcf70f81de1b4a9f21b3a62ec0c83bdff991c88d6cc2267d02408457e88/dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53", size = 25197 },
+]
+
+[[package]]
+name = "filelock"
+version = "3.29.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b5/fe/997687a931ab51049acce6fa1f23e8f01216374ea81374ddee763c493db5/filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90", size = 57571 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812 },
+]
+
+[[package]]
+name = "filetype"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970 },
+]
+
+[[package]]
+name = "frozenlist"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782 },
+    { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594 },
+    { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448 },
+    { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411 },
+    { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014 },
+    { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909 },
+    { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049 },
+    { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485 },
+    { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619 },
+    { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320 },
+    { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820 },
+    { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518 },
+    { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096 },
+    { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985 },
+    { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591 },
+    { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102 },
+    { url = "https://files.pythonhosted.org/packages/2d/40/0832c31a37d60f60ed79e9dfb5a92e1e2af4f40a16a29abcc7992af9edff/frozenlist-1.8.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8d92f1a84bb12d9e56f818b3a746f3efba93c1b63c8387a73dde655e1e42282a", size = 85717 },
+    { url = "https://files.pythonhosted.org/packages/30/ba/b0b3de23f40bc55a7057bd38434e25c34fa48e17f20ee273bbde5e0650f3/frozenlist-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96153e77a591c8adc2ee805756c61f59fef4cf4073a9275ee86fe8cba41241f7", size = 49651 },
+    { url = "https://files.pythonhosted.org/packages/0c/ab/6e5080ee374f875296c4243c381bbdef97a9ac39c6e3ce1d5f7d42cb78d6/frozenlist-1.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f21f00a91358803399890ab167098c131ec2ddd5f8f5fd5fe9c9f2c6fcd91e40", size = 49417 },
+    { url = "https://files.pythonhosted.org/packages/d5/4e/e4691508f9477ce67da2015d8c00acd751e6287739123113a9fca6f1604e/frozenlist-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fb30f9626572a76dfe4293c7194a09fb1fe93ba94c7d4f720dfae3b646b45027", size = 234391 },
+    { url = "https://files.pythonhosted.org/packages/40/76/c202df58e3acdf12969a7895fd6f3bc016c642e6726aa63bd3025e0fc71c/frozenlist-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaa352d7047a31d87dafcacbabe89df0aa506abb5b1b85a2fb91bc3faa02d822", size = 233048 },
+    { url = "https://files.pythonhosted.org/packages/f9/c0/8746afb90f17b73ca5979c7a3958116e105ff796e718575175319b5bb4ce/frozenlist-1.8.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:03ae967b4e297f58f8c774c7eabcce57fe3c2434817d4385c50661845a058121", size = 226549 },
+    { url = "https://files.pythonhosted.org/packages/7e/eb/4c7eefc718ff72f9b6c4893291abaae5fbc0c82226a32dcd8ef4f7a5dbef/frozenlist-1.8.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6292f1de555ffcc675941d65fffffb0a5bcd992905015f85d0592201793e0e5", size = 239833 },
+    { url = "https://files.pythonhosted.org/packages/c2/4e/e5c02187cf704224f8b21bee886f3d713ca379535f16893233b9d672ea71/frozenlist-1.8.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29548f9b5b5e3460ce7378144c3010363d8035cea44bc0bf02d57f5a685e084e", size = 245363 },
+    { url = "https://files.pythonhosted.org/packages/1f/96/cb85ec608464472e82ad37a17f844889c36100eed57bea094518bf270692/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ec3cc8c5d4084591b4237c0a272cc4f50a5b03396a47d9caaf76f5d7b38a4f11", size = 229314 },
+    { url = "https://files.pythonhosted.org/packages/5d/6f/4ae69c550e4cee66b57887daeebe006fe985917c01d0fff9caab9883f6d0/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:517279f58009d0b1f2e7c1b130b377a349405da3f7621ed6bfae50b10adf20c1", size = 243365 },
+    { url = "https://files.pythonhosted.org/packages/7a/58/afd56de246cf11780a40a2c28dc7cbabbf06337cc8ddb1c780a2d97e88d8/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db1e72ede2d0d7ccb213f218df6a078a9c09a7de257c2fe8fcef16d5925230b1", size = 237763 },
+    { url = "https://files.pythonhosted.org/packages/cb/36/cdfaf6ed42e2644740d4a10452d8e97fa1c062e2a8006e4b09f1b5fd7d63/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b4dec9482a65c54a5044486847b8a66bf10c9cb4926d42927ec4e8fd5db7fed8", size = 240110 },
+    { url = "https://files.pythonhosted.org/packages/03/a8/9ea226fbefad669f11b52e864c55f0bd57d3c8d7eb07e9f2e9a0b39502e1/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:21900c48ae04d13d416f0e1e0c4d81f7931f73a9dfa0b7a8746fb2fe7dd970ed", size = 233717 },
+    { url = "https://files.pythonhosted.org/packages/1e/0b/1b5531611e83ba7d13ccc9988967ea1b51186af64c42b7a7af465dcc9568/frozenlist-1.8.0-cp313-cp313-win32.whl", hash = "sha256:8b7b94a067d1c504ee0b16def57ad5738701e4ba10cec90529f13fa03c833496", size = 39628 },
+    { url = "https://files.pythonhosted.org/packages/d8/cf/174c91dbc9cc49bc7b7aab74d8b734e974d1faa8f191c74af9b7e80848e6/frozenlist-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:878be833caa6a3821caf85eb39c5ba92d28e85df26d57afb06b35b2efd937231", size = 43882 },
+    { url = "https://files.pythonhosted.org/packages/c1/17/502cd212cbfa96eb1388614fe39a3fc9ab87dbbe042b66f97acb57474834/frozenlist-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:44389d135b3ff43ba8cc89ff7f51f5a0bb6b63d829c8300f79a2fe4fe61bcc62", size = 39676 },
+    { url = "https://files.pythonhosted.org/packages/d2/5c/3bbfaa920dfab09e76946a5d2833a7cbdf7b9b4a91c714666ac4855b88b4/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e25ac20a2ef37e91c1b39938b591457666a0fa835c7783c3a8f33ea42870db94", size = 89235 },
+    { url = "https://files.pythonhosted.org/packages/d2/d6/f03961ef72166cec1687e84e8925838442b615bd0b8854b54923ce5b7b8a/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07cdca25a91a4386d2e76ad992916a85038a9b97561bf7a3fd12d5d9ce31870c", size = 50742 },
+    { url = "https://files.pythonhosted.org/packages/1e/bb/a6d12b7ba4c3337667d0e421f7181c82dda448ce4e7ad7ecd249a16fa806/frozenlist-1.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e0c11f2cc6717e0a741f84a527c52616140741cd812a50422f83dc31749fb52", size = 51725 },
+    { url = "https://files.pythonhosted.org/packages/bc/71/d1fed0ffe2c2ccd70b43714c6cab0f4188f09f8a67a7914a6b46ee30f274/frozenlist-1.8.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b3210649ee28062ea6099cfda39e147fa1bc039583c8ee4481cb7811e2448c51", size = 284533 },
+    { url = "https://files.pythonhosted.org/packages/c9/1f/fb1685a7b009d89f9bf78a42d94461bc06581f6e718c39344754a5d9bada/frozenlist-1.8.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:581ef5194c48035a7de2aefc72ac6539823bb71508189e5de01d60c9dcd5fa65", size = 292506 },
+    { url = "https://files.pythonhosted.org/packages/e6/3b/b991fe1612703f7e0d05c0cf734c1b77aaf7c7d321df4572e8d36e7048c8/frozenlist-1.8.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3ef2d026f16a2b1866e1d86fc4e1291e1ed8a387b2c333809419a2f8b3a77b82", size = 274161 },
+    { url = "https://files.pythonhosted.org/packages/ca/ec/c5c618767bcdf66e88945ec0157d7f6c4a1322f1473392319b7a2501ded7/frozenlist-1.8.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5500ef82073f599ac84d888e3a8c1f77ac831183244bfd7f11eaa0289fb30714", size = 294676 },
+    { url = "https://files.pythonhosted.org/packages/7c/ce/3934758637d8f8a88d11f0585d6495ef54b2044ed6ec84492a91fa3b27aa/frozenlist-1.8.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50066c3997d0091c411a66e710f4e11752251e6d2d73d70d8d5d4c76442a199d", size = 300638 },
+    { url = "https://files.pythonhosted.org/packages/fc/4f/a7e4d0d467298f42de4b41cbc7ddaf19d3cfeabaf9ff97c20c6c7ee409f9/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5c1c8e78426e59b3f8005e9b19f6ff46e5845895adbde20ece9218319eca6506", size = 283067 },
+    { url = "https://files.pythonhosted.org/packages/dc/48/c7b163063d55a83772b268e6d1affb960771b0e203b632cfe09522d67ea5/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:eefdba20de0d938cec6a89bd4d70f346a03108a19b9df4248d3cf0d88f1b0f51", size = 292101 },
+    { url = "https://files.pythonhosted.org/packages/9f/d0/2366d3c4ecdc2fd391e0afa6e11500bfba0ea772764d631bbf82f0136c9d/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cf253e0e1c3ceb4aaff6df637ce033ff6535fb8c70a764a8f46aafd3d6ab798e", size = 289901 },
+    { url = "https://files.pythonhosted.org/packages/b8/94/daff920e82c1b70e3618a2ac39fbc01ae3e2ff6124e80739ce5d71c9b920/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:032efa2674356903cd0261c4317a561a6850f3ac864a63fc1583147fb05a79b0", size = 289395 },
+    { url = "https://files.pythonhosted.org/packages/e3/20/bba307ab4235a09fdcd3cc5508dbabd17c4634a1af4b96e0f69bfe551ebd/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6da155091429aeba16851ecb10a9104a108bcd32f6c1642867eadaee401c1c41", size = 283659 },
+    { url = "https://files.pythonhosted.org/packages/fd/00/04ca1c3a7a124b6de4f8a9a17cc2fcad138b4608e7a3fc5877804b8715d7/frozenlist-1.8.0-cp313-cp313t-win32.whl", hash = "sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b", size = 43492 },
+    { url = "https://files.pythonhosted.org/packages/59/5e/c69f733a86a94ab10f68e496dc6b7e8bc078ebb415281d5698313e3af3a1/frozenlist-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888", size = 48034 },
+    { url = "https://files.pythonhosted.org/packages/16/6c/be9d79775d8abe79b05fa6d23da99ad6e7763a1d080fbae7290b286093fd/frozenlist-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042", size = 41749 },
+    { url = "https://files.pythonhosted.org/packages/f1/c8/85da824b7e7b9b6e7f7705b2ecaf9591ba6f79c1177f324c2735e41d36a2/frozenlist-1.8.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cee686f1f4cadeb2136007ddedd0aaf928ab95216e7691c63e50a8ec066336d0", size = 86127 },
+    { url = "https://files.pythonhosted.org/packages/8e/e8/a1185e236ec66c20afd72399522f142c3724c785789255202d27ae992818/frozenlist-1.8.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:119fb2a1bd47307e899c2fac7f28e85b9a543864df47aa7ec9d3c1b4545f096f", size = 49698 },
+    { url = "https://files.pythonhosted.org/packages/a1/93/72b1736d68f03fda5fdf0f2180fb6caaae3894f1b854d006ac61ecc727ee/frozenlist-1.8.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4970ece02dbc8c3a92fcc5228e36a3e933a01a999f7094ff7c23fbd2beeaa67c", size = 49749 },
+    { url = "https://files.pythonhosted.org/packages/a7/b2/fabede9fafd976b991e9f1b9c8c873ed86f202889b864756f240ce6dd855/frozenlist-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cba69cb73723c3f329622e34bdbf5ce1f80c21c290ff04256cff1cd3c2036ed2", size = 231298 },
+    { url = "https://files.pythonhosted.org/packages/3a/3b/d9b1e0b0eed36e70477ffb8360c49c85c8ca8ef9700a4e6711f39a6e8b45/frozenlist-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:778a11b15673f6f1df23d9586f83c4846c471a8af693a22e066508b77d201ec8", size = 232015 },
+    { url = "https://files.pythonhosted.org/packages/dc/94/be719d2766c1138148564a3960fc2c06eb688da592bdc25adcf856101be7/frozenlist-1.8.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0325024fe97f94c41c08872db482cf8ac4800d80e79222c6b0b7b162d5b13686", size = 225038 },
+    { url = "https://files.pythonhosted.org/packages/e4/09/6712b6c5465f083f52f50cf74167b92d4ea2f50e46a9eea0523d658454ae/frozenlist-1.8.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:97260ff46b207a82a7567b581ab4190bd4dfa09f4db8a8b49d1a958f6aa4940e", size = 240130 },
+    { url = "https://files.pythonhosted.org/packages/f8/d4/cd065cdcf21550b54f3ce6a22e143ac9e4836ca42a0de1022da8498eac89/frozenlist-1.8.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:54b2077180eb7f83dd52c40b2750d0a9f175e06a42e3213ce047219de902717a", size = 242845 },
+    { url = "https://files.pythonhosted.org/packages/62/c3/f57a5c8c70cd1ead3d5d5f776f89d33110b1addae0ab010ad774d9a44fb9/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2f05983daecab868a31e1da44462873306d3cbfd76d1f0b5b69c473d21dbb128", size = 229131 },
+    { url = "https://files.pythonhosted.org/packages/6c/52/232476fe9cb64f0742f3fde2b7d26c1dac18b6d62071c74d4ded55e0ef94/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:33f48f51a446114bc5d251fb2954ab0164d5be02ad3382abcbfe07e2531d650f", size = 240542 },
+    { url = "https://files.pythonhosted.org/packages/5f/85/07bf3f5d0fb5414aee5f47d33c6f5c77bfe49aac680bfece33d4fdf6a246/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:154e55ec0655291b5dd1b8731c637ecdb50975a2ae70c606d100750a540082f7", size = 237308 },
+    { url = "https://files.pythonhosted.org/packages/11/99/ae3a33d5befd41ac0ca2cc7fd3aa707c9c324de2e89db0e0f45db9a64c26/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:4314debad13beb564b708b4a496020e5306c7333fa9a3ab90374169a20ffab30", size = 238210 },
+    { url = "https://files.pythonhosted.org/packages/b2/60/b1d2da22f4970e7a155f0adde9b1435712ece01b3cd45ba63702aea33938/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:073f8bf8becba60aa931eb3bc420b217bb7d5b8f4750e6f8b3be7f3da85d38b7", size = 231972 },
+    { url = "https://files.pythonhosted.org/packages/3f/ab/945b2f32de889993b9c9133216c068b7fcf257d8595a0ac420ac8677cab0/frozenlist-1.8.0-cp314-cp314-win32.whl", hash = "sha256:bac9c42ba2ac65ddc115d930c78d24ab8d4f465fd3fc473cdedfccadb9429806", size = 40536 },
+    { url = "https://files.pythonhosted.org/packages/59/ad/9caa9b9c836d9ad6f067157a531ac48b7d36499f5036d4141ce78c230b1b/frozenlist-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:3e0761f4d1a44f1d1a47996511752cf3dcec5bbdd9cc2b4fe595caf97754b7a0", size = 44330 },
+    { url = "https://files.pythonhosted.org/packages/82/13/e6950121764f2676f43534c555249f57030150260aee9dcf7d64efda11dd/frozenlist-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:d1eaff1d00c7751b7c6662e9c5ba6eb2c17a2306ba5e2a37f24ddf3cc953402b", size = 40627 },
+    { url = "https://files.pythonhosted.org/packages/c0/c7/43200656ecc4e02d3f8bc248df68256cd9572b3f0017f0a0c4e93440ae23/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d3bb933317c52d7ea5004a1c442eef86f426886fba134ef8cf4226ea6ee1821d", size = 89238 },
+    { url = "https://files.pythonhosted.org/packages/d1/29/55c5f0689b9c0fb765055629f472c0de484dcaf0acee2f7707266ae3583c/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8009897cdef112072f93a0efdce29cd819e717fd2f649ee3016efd3cd885a7ed", size = 50738 },
+    { url = "https://files.pythonhosted.org/packages/ba/7d/b7282a445956506fa11da8c2db7d276adcbf2b17d8bb8407a47685263f90/frozenlist-1.8.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2c5dcbbc55383e5883246d11fd179782a9d07a986c40f49abe89ddf865913930", size = 51739 },
+    { url = "https://files.pythonhosted.org/packages/62/1c/3d8622e60d0b767a5510d1d3cf21065b9db874696a51ea6d7a43180a259c/frozenlist-1.8.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:39ecbc32f1390387d2aa4f5a995e465e9e2f79ba3adcac92d68e3e0afae6657c", size = 284186 },
+    { url = "https://files.pythonhosted.org/packages/2d/14/aa36d5f85a89679a85a1d44cd7a6657e0b1c75f61e7cad987b203d2daca8/frozenlist-1.8.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92db2bf818d5cc8d9c1f1fc56b897662e24ea5adb36ad1f1d82875bd64e03c24", size = 292196 },
+    { url = "https://files.pythonhosted.org/packages/05/23/6bde59eb55abd407d34f77d39a5126fb7b4f109a3f611d3929f14b700c66/frozenlist-1.8.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2dc43a022e555de94c3b68a4ef0b11c4f747d12c024a520c7101709a2144fb37", size = 273830 },
+    { url = "https://files.pythonhosted.org/packages/d2/3f/22cff331bfad7a8afa616289000ba793347fcd7bc275f3b28ecea2a27909/frozenlist-1.8.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb89a7f2de3602cfed448095bab3f178399646ab7c61454315089787df07733a", size = 294289 },
+    { url = "https://files.pythonhosted.org/packages/a4/89/5b057c799de4838b6c69aa82b79705f2027615e01be996d2486a69ca99c4/frozenlist-1.8.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:33139dc858c580ea50e7e60a1b0ea003efa1fd42e6ec7fdbad78fff65fad2fd2", size = 300318 },
+    { url = "https://files.pythonhosted.org/packages/30/de/2c22ab3eb2a8af6d69dc799e48455813bab3690c760de58e1bf43b36da3e/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:168c0969a329b416119507ba30b9ea13688fafffac1b7822802537569a1cb0ef", size = 282814 },
+    { url = "https://files.pythonhosted.org/packages/59/f7/970141a6a8dbd7f556d94977858cfb36fa9b66e0892c6dd780d2219d8cd8/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:28bd570e8e189d7f7b001966435f9dac6718324b5be2990ac496cf1ea9ddb7fe", size = 291762 },
+    { url = "https://files.pythonhosted.org/packages/c1/15/ca1adae83a719f82df9116d66f5bb28bb95557b3951903d39135620ef157/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b2a095d45c5d46e5e79ba1e5b9cb787f541a8dee0433836cea4b96a2c439dcd8", size = 289470 },
+    { url = "https://files.pythonhosted.org/packages/ac/83/dca6dc53bf657d371fbc88ddeb21b79891e747189c5de990b9dfff2ccba1/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:eab8145831a0d56ec9c4139b6c3e594c7a83c2c8be25d5bcf2d86136a532287a", size = 289042 },
+    { url = "https://files.pythonhosted.org/packages/96/52/abddd34ca99be142f354398700536c5bd315880ed0a213812bc491cff5e4/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:974b28cf63cc99dfb2188d8d222bc6843656188164848c4f679e63dae4b0708e", size = 283148 },
+    { url = "https://files.pythonhosted.org/packages/af/d3/76bd4ed4317e7119c2b7f57c3f6934aba26d277acc6309f873341640e21f/frozenlist-1.8.0-cp314-cp314t-win32.whl", hash = "sha256:342c97bf697ac5480c0a7ec73cd700ecfa5a8a40ac923bd035484616efecc2df", size = 44676 },
+    { url = "https://files.pythonhosted.org/packages/89/76/c615883b7b521ead2944bb3480398cbb07e12b7b4e4d073d3752eb721558/frozenlist-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:06be8f67f39c8b1dc671f5d83aaefd3358ae5cdcf8314552c57e7ed3e6475bdd", size = 49451 },
+    { url = "https://files.pythonhosted.org/packages/e0/a3/5982da14e113d07b325230f95060e2169f5311b1017ea8af2a29b374c289/frozenlist-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:102e6314ca4da683dca92e3b1355490fed5f313b768500084fbe6371fddfdb79", size = 42507 },
+    { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409 },
+]
+
+[[package]]
+name = "fsspec"
+version = "2026.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505 },
+]
+
+[package.optional-dependencies]
+http = [
+    { name = "aiohttp" },
+]
+
+[[package]]
+name = "greenlet"
+version = "3.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/3c/3f/dbf99fb14bfeb88c28f16729215478c0e265cacd6dc22270c8f31bb6892f/greenlet-3.5.0.tar.gz", hash = "sha256:d419647372241bc68e957bf38d5c1f98852155e4146bd1e4121adea81f4f01e4", size = 196995 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ef/32/f2ce6d4cac3e55bc6173f92dbe627e782e1850f89d986c3606feb63aafa7/greenlet-3.5.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:db2910d3c809444e0a20147361f343fe2798e106af8d9d8506f5305302655a9f", size = 286228 },
+    { url = "https://files.pythonhosted.org/packages/b7/aa/caed9e5adf742315fc7be2a84196373aab4816e540e38ba0d76cb7584d68/greenlet-3.5.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ec9ea74e7268ace7f9aab1b1a4e730193fc661b39a993cd91c606c32d4a3628", size = 601775 },
+    { url = "https://files.pythonhosted.org/packages/c7/af/90ae08497400a941595d12774447f752d3dfe0fbb012e35b76bc5c0ff37e/greenlet-3.5.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:54d243512da35485fc7a6bf3c178fdda6327a9d6506fcdd62b1abd1e41b2927b", size = 614436 },
+    { url = "https://files.pythonhosted.org/packages/3f/e9/4eeadf8cb3403ac274245ba75f07844abc7fa5f6787583fc9156ba741e0f/greenlet-3.5.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:41353ec2ecedf7aa8f682753a41919f8718031a6edac46b8d3dc7ed9e1ceb136", size = 620610 },
+    { url = "https://files.pythonhosted.org/packages/2b/e0/2e13df68f367e2f9960616927d60857dd7e56aaadd59a47c644216b2f920/greenlet-3.5.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d280a7f5c331622c69f97eb167f33577ff2d1df282c41cd15907fc0a3ca198c", size = 611388 },
+    { url = "https://files.pythonhosted.org/packages/ee/ef/f913b3c0eb7d26d86a2401c5e1546c9d46b657efee724b06f6f4ac5d8824/greenlet-3.5.0-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:58c1c374fe2b3d852f9b6b11a7dff4c85404e51b9a596fd9e89cf904eb09866d", size = 422775 },
+    { url = "https://files.pythonhosted.org/packages/82/f7/393c64055132ac0d488ef6be549253b7e6274194863967ddc0bc8f5b87b8/greenlet-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1eb67d5adefb5bd2e182d42678a328979a209e4e82eb93575708185d31d1f588", size = 1570768 },
+    { url = "https://files.pythonhosted.org/packages/b8/4b/eaf7735253522cf56d1b74d672a58f54fc114702ceaf05def59aae72f6e1/greenlet-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2628d6c86f6cb0cb45e0c3c54058bbec559f57eaae699447748cb3928150577e", size = 1635983 },
+    { url = "https://files.pythonhosted.org/packages/4c/fe/4fb3a0805bd5165da5ebf858da7cc01cce8061674106d2cf5bdab32cbfde/greenlet-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:d4d9f0624c775f2dfc56ba54d515a8c771044346852a918b405914f6b19d7fd8", size = 238840 },
+    { url = "https://files.pythonhosted.org/packages/cb/cb/baa584cb00532126ffe12d9787db0a60c5a4f55c27bfe2666df5d4c30a32/greenlet-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:83ed9f27f1680b50e89f40f6df348a290ea234b249a4003d366663a12eab94f2", size = 235615 },
+    { url = "https://files.pythonhosted.org/packages/0c/58/fc576f99037ce19c5aa16628e4c3226b6d1419f72a62c79f5f40576e6eb3/greenlet-3.5.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:5a5ed18de6a0f6cc7087f1563f6bd93fc7df1c19165ca01e9bde5a5dc281d106", size = 285066 },
+    { url = "https://files.pythonhosted.org/packages/4a/ba/b28ddbe6bfad6a8ac196ef0e8cff37bc65b79735995b9e410923fffeeb70/greenlet-3.5.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a717fbc46d8a354fa675f7c1e813485b6ba3885f9bef0cd56e5ba27d758ff5b", size = 604414 },
+    { url = "https://files.pythonhosted.org/packages/09/06/4b69f8f0b67603a8be2790e55107a190b376f2627fe0eaf5695d85ffb3cd/greenlet-3.5.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ddc090c5c1792b10246a78e8c2163ebbe04cf877f9d785c230a7b27b39ad038e", size = 617349 },
+    { url = "https://files.pythonhosted.org/packages/6a/15/a643b4ecd09969e30b8a150d5919960caae0abe4f5af75ab040b1ab85e78/greenlet-3.5.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4964101b8585c144cbda5532b1aa644255126c08a265dae90c16e7a0e63aaa9d", size = 623234 },
+    { url = "https://files.pythonhosted.org/packages/8a/17/a3918541fd0ddefe024a69de6d16aa7b46d36ac19562adaa63c7fa180eff/greenlet-3.5.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2094acd54b272cb6eae8c03dd87b3fa1820a4cef18d6889c378d503500a1dc13", size = 613927 },
+    { url = "https://files.pythonhosted.org/packages/77/18/3b13d5ef1275b0ffaf933b05efa21408ac4ca95823c7411d79682e4fdcff/greenlet-3.5.0-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:7022615368890680e67b9965d33f5773aade330d5343bbe25560135aaa849eae", size = 425243 },
+    { url = "https://files.pythonhosted.org/packages/ee/e1/bd0af6213c7dd33175d8a462d4c1fe1175124ebed4855bc1475a5b5242c2/greenlet-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5e05ba267789ea87b5a155cf0e810b1ab88bf18e9e8740813945ceb8ee4350ba", size = 1570893 },
+    { url = "https://files.pythonhosted.org/packages/9b/2a/0789702f864f5382cb476b93d7a9c823c10472658102ccd65f415747d2e2/greenlet-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0ecec963079cd58cbd14723582384f11f166fd58883c15dcbfb342e0bc9b5846", size = 1636060 },
+    { url = "https://files.pythonhosted.org/packages/b2/8f/22bf9df92bbff0eb07842b60f7e63bf7675a9742df628437a9f02d09137f/greenlet-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:728d9667d8f2f586644b748dbd9bb67e50d6a9381767d1357714ea6825bb3bf5", size = 238740 },
+    { url = "https://files.pythonhosted.org/packages/b6/b7/9c5c3d653bd4ff614277c049ac676422e2c557db47b4fe43e6313fc005dc/greenlet-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:47422135b1d308c14b2c6e758beedb1acd33bb91679f5670edf77bf46244722b", size = 235525 },
+    { url = "https://files.pythonhosted.org/packages/94/5e/a70f31e3e8d961c4ce589c15b28e4225d63704e431a23932a3808cbcc867/greenlet-3.5.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:f35807464c4c58c55f0d31dfa83c541a5615d825c2fe3d2b95360cf7c4e3c0a8", size = 285564 },
+    { url = "https://files.pythonhosted.org/packages/af/a6/046c0a28e21833e4086918218cfb3d8bed51c075a1b700f20b9d7861c0f4/greenlet-3.5.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55fa7ea52771be44af0de27d8b80c02cd18c2c3cddde6c847ecebdf72418b6a1", size = 651166 },
+    { url = "https://files.pythonhosted.org/packages/47/f8/4af27f71c5ff32a7fbc516adb46370d9c4ae2bc7bd3dc7d066ac542b4b15/greenlet-3.5.0-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a97e4821aa710603f94de0da25f25096454d78ffdace5dc77f3a006bc01abba3", size = 663792 },
+    { url = "https://files.pythonhosted.org/packages/fb/89/2dadb89793c37ee8b4c237857188293e9060dc085f19845c292e00f8e091/greenlet-3.5.0-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:bf2d8a80bec89ab46221ae45c5373d5ba0bd36c19aa8508e85c6cd7e5106cd37", size = 668086 },
+    { url = "https://files.pythonhosted.org/packages/a3/59/1bd6d7428d6ed9106efbb8c52310c60fd04f6672490f452aeaa3829aa436/greenlet-3.5.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8f52a464e4ed91780bdfbbdd2b97197f3accaa629b98c200f4dffada759f3ae7", size = 660933 },
+    { url = "https://files.pythonhosted.org/packages/82/35/75722be7e26a2af4cbd2dc35b0ed382dacf9394b7e75551f76ed1abe87f2/greenlet-3.5.0-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:1bae92a1dd94c5f9d9493c3a212dd874c202442047cf96446412c862feca83a2", size = 470799 },
+    { url = "https://files.pythonhosted.org/packages/83/e4/b903e5a5fae1e8a28cdd32a0cfbfd560b668c25b692f67768822ddc5f40f/greenlet-3.5.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:762612baf1161ccb8437c0161c668a688223cba28e1bf038f4eb47b13e39ccdf", size = 1618401 },
+    { url = "https://files.pythonhosted.org/packages/0e/e3/5ec408a329acb854fb607a122e1ee5fb3ff649f9a97952948a90803c0d8e/greenlet-3.5.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:57a43c6079a89713522bc4bcb9f75070ecf5d3dbad7792bfe42239362cbf2a16", size = 1682038 },
+    { url = "https://files.pythonhosted.org/packages/91/20/6b165108058767ee643c55c5c4904d591a830ee2b3c7dbd359828fbc829f/greenlet-3.5.0-cp314-cp314-win_amd64.whl", hash = "sha256:3bc59be3945ae9750b9e7d45067d01ae3fe90ea5f9ade99239dabdd6e28a5033", size = 239835 },
+    { url = "https://files.pythonhosted.org/packages/4e/62/1c498375cee177b55d980c1db319f26470e5309e54698c8f8fc06c0fd539/greenlet-3.5.0-cp314-cp314-win_arm64.whl", hash = "sha256:a96fcee45e03fe30a62669fd16ab5c9d3c172660d3085605cb1e2d1280d3c988", size = 236862 },
+    { url = "https://files.pythonhosted.org/packages/78/a8/4522939255bb5409af4e87132f915446bf3622c2c292d14d3c38d128ae82/greenlet-3.5.0-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:a10a732421ab4fec934783ce3e54763470d0181db6e3468f9103a275c3ed1853", size = 293614 },
+    { url = "https://files.pythonhosted.org/packages/15/5e/8744c52e2c027b5a8772a01561934c8835f869733e101f62075c60430340/greenlet-3.5.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7fc391b1566f2907d17aaebe78f8855dc45675159a775fcf9e61f8ee0078e87f", size = 650723 },
+    { url = "https://files.pythonhosted.org/packages/00/ef/7b4c39c03cf46ceca512c5d3f914afd85aa30b2cc9a93015b0dd73e4be6c/greenlet-3.5.0-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:680bd0e7ad5e8daa8a4aa89f68fd6adc834b8a8036dc256533f7e08f4a4b01f7", size = 656529 },
+    { url = "https://files.pythonhosted.org/packages/5f/5c/0602239503b124b70e39355cbdb39361ecfe65b87a5f2f63752c32f5286f/greenlet-3.5.0-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1aa4ce8debcd4ea7fb2e150f3036588c41493d1d52c43538924ae1819003f4ce", size = 657015 },
+    { url = "https://files.pythonhosted.org/packages/0b/b5/c7768f352f5c010f92064d0063f987e7dc0cd290a6d92a34109015ce4aa1/greenlet-3.5.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ddb36c7d6c9c0a65f18c7258634e0c416c6ab59caac8c987b96f80c2ebda0112", size = 654364 },
+    { url = "https://files.pythonhosted.org/packages/38/51/8699f865f125dc952384cb432b0f7138aa4d8f2969a7d12d0df5b94d054d/greenlet-3.5.0-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:728a73687e39ae9ca34e4694cbf2f049d3fbc7174639468d0f67200a97d8f9e2", size = 488275 },
+    { url = "https://files.pythonhosted.org/packages/ef/d0/079ebe12e4b1fc758857ce5be1a5e73f06870f2101e52611d1e71925ce54/greenlet-3.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e5ddf316ced87539144621453c3aef229575825fe60c604e62bedc4003f372b2", size = 1614204 },
+    { url = "https://files.pythonhosted.org/packages/6d/89/6c2fb63df3596552d20e58fb4d96669243388cf680cff222758812c7bfaa/greenlet-3.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4a448128607be0de65342dc9b31be7f948ef4cc0bc8832069350abefd310a8f2", size = 1675480 },
+    { url = "https://files.pythonhosted.org/packages/15/32/77ee8a6c1564fc345a491a4e85b3bf360e4cf26eac98c4532d2fdb96e01f/greenlet-3.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d60097128cb0a1cab9ea541186ea13cd7b847b8449a7787c2e2350da0cb82d86", size = 245324 },
+]
+
+[[package]]
+name = "griffe"
+version = "2.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "griffecli" },
+    { name = "griffelib" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4a/49/eb6d2935e27883af92c930ed40cc4c69bcd32c402be43b8ca4ab20510f67/griffe-2.0.2.tar.gz", hash = "sha256:c5d56326d159f274492e9bf93a9895cec101155d944caa66d0fc4e0c13751b92", size = 293757 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/94/c0/2bb018eecf9a83c68db9cd9fffd9dab25f102ad30ed869451046e46d1187/griffe-2.0.2-py3-none-any.whl", hash = "sha256:2b31816460aee1996af26050a1fc6927a2e5936486856707f55508e4c9b5960b", size = 5141 },
+]
+
+[[package]]
+name = "griffecli"
+version = "2.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama" },
+    { name = "griffelib" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/79/e0/6a7d661d71bb043656a109b91d84a42b5342752542074ec83b16a6eb97f0/griffecli-2.0.2.tar.gz", hash = "sha256:40a1ad4181fc39685d025e119ae2c5b669acdc1f19b705fb9bf971f4e6f6dffb", size = 56281 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2e/e8/90d93356c88ac34c20cb5edffca68138df55ca9bbd1a06eccfbcec8fdbe5/griffecli-2.0.2-py3-none-any.whl", hash = "sha256:0d44d39e59afa81e288a3e1c3bf352cc4fa537483326ac06b8bb6a51fd8303a0", size = 9500 },
+]
+
+[[package]]
+name = "griffelib"
+version = "2.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9d/82/74f4a3310cdabfbb10da554c3a672847f1ed33c6f61dd472681ce7f1fe67/griffelib-2.0.2.tar.gz", hash = "sha256:3cf20b3bc470e83763ffbf236e0076b1211bac1bc67de13daf494640f2de707e", size = 166461 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/11/8c/c9138d881c79aa0ea9ed83cbd58d5ca75624378b38cee225dcf5c42cc91f/griffelib-2.0.2-py3-none-any.whl", hash = "sha256:925c857658fb1ba40c0772c37acbc2ab650bd794d9c1b9726922e36ea4117ea1", size = 142357 },
+]
+
+[[package]]
+name = "h11"
+version = "0.16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515 },
+]
+
+[[package]]
+name = "hf-xet"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/74/d8/5c06fc76461418326a7decf8367480c35be11a41fd938633929c60a9ec6b/hf_xet-1.5.0.tar.gz", hash = "sha256:e0fb0a34d9f406eed88233e829a67ec016bec5af19e480eac65a233ea289a948", size = 837196 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/68/9b/6912c99070915a4f28119e3c5b52a9abd1eec0ad5cb293b8c967a0c6f5a2/hf_xet-1.5.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7d70fe2ce97b9db73b9c9b9c81fe3693640aec83416a966c446afea54acfae3c", size = 4023383 },
+    { url = "https://files.pythonhosted.org/packages/0f/6d/9563cfde59b5d8128a9c7ec972a087f4c782e4f7bac5a85234edfd5d5e49/hf_xet-1.5.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:73a0dae8c71de3b0633a45c73f4a4a5ed09e94b43441d82981a781d4f12baa42", size = 3792751 },
+    { url = "https://files.pythonhosted.org/packages/07/a5/ed5a0cf35b49a0571af5a8f53416dad1877a718c021c9937c3a53cb45781/hf_xet-1.5.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a60290ec57e9b71767fba7c3645ddafdd0759974b540441510c629c6db6db24a", size = 4456058 },
+    { url = "https://files.pythonhosted.org/packages/60/fb/3ae8bf2a7a37a4197d0195d7247fd25b3952e15cb8a599e285dfaa6f52b3/hf_xet-1.5.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:e5de0f6deada0dada870bb376a11bcd1f08abf3a968a6d118f33e72d1b1eb480", size = 4250783 },
+    { url = "https://files.pythonhosted.org/packages/a2/9b/8bae40d4d91525085137196e84eb0ed49cf65b5e96e5c3ecdadd8bd0fac2/hf_xet-1.5.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c799d49f1a5544a0ef7591c0ee75e0d6b93d6f56dc7a4979f59f7518d2872216", size = 4445594 },
+    { url = "https://files.pythonhosted.org/packages/13/59/c74efbbd4e8728172b2cc72a2bc014d2947a4b7bdced932fbd3f5da1a4e5/hf_xet-1.5.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2baea1b0b989e5c152fe81425f7745ddc8901280ba3d97c98d8cdece7b706c60", size = 4663995 },
+    { url = "https://files.pythonhosted.org/packages/73/32/8e1e0410af64cda9b139d1dcebdc993a8ff9c8c7c0e2696ae356d75ccc0d/hf_xet-1.5.0-cp313-cp313t-win_amd64.whl", hash = "sha256:526345b3ed45f374f6317349df489167606736c876241ba984105afe7fd4839d", size = 3966608 },
+    { url = "https://files.pythonhosted.org/packages/fc/34/a8febc8f4edbea8b3e21b02ebc8b628679b84ba7e45cde624a7736b51500/hf_xet-1.5.0-cp313-cp313t-win_arm64.whl", hash = "sha256:786d28e2eb8315d5035544b9d137b4a842d600c434bb91bf7d0d953cce906ad4", size = 3796946 },
+    { url = "https://files.pythonhosted.org/packages/2a/20/8fc8996afe5815fa1a6be8e9e5c02f24500f409d599e905800d498a4e14d/hf_xet-1.5.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:872d5601e6deea30d15865ede55d29eac6daf5a534ab417b99b6ef6b076dd96c", size = 4023495 },
+    { url = "https://files.pythonhosted.org/packages/32/6a/93d84463c00cecb561a7508aa6303e35ee2894294eac14245526924415fe/hf_xet-1.5.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9929561f5abf4581c8ea79587881dfef6b8abb2a0d8a51915936fc2a614f4e73", size = 3792731 },
+    { url = "https://files.pythonhosted.org/packages/9d/5a/8ec8e0c863b382d00b3c2e2af6ded6b06371be617144a625903a6d562f4b/hf_xet-1.5.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f7b7bbae318e583a86fb21e5a4a175d6721d628a2874f4bd022d0e660c32a682", size = 4456738 },
+    { url = "https://files.pythonhosted.org/packages/c5/ca/f7effa1a67717da2bcc6b6c28f71c6ca648c77acaec4e2c32f40cbe16d85/hf_xet-1.5.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:cf7b2dc6f31a4ea754bb50f74cde482dcf5d366d184076d8530b9872787f3761", size = 4251622 },
+    { url = "https://files.pythonhosted.org/packages/65/f2/19247dba3e231cf77dec59ddfb878f00057635ff773d099c9b59d37812c3/hf_xet-1.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8dbcbab554c9ef158ef2c991545c3e970ddd8cc7acdcd0a78c5a41095dab4ded", size = 4445667 },
+    { url = "https://files.pythonhosted.org/packages/7f/64/6f116801a3bcfb6f59f5c251f48cadc47ea54026441c4a385079286a94fa/hf_xet-1.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5906bf7718d3636dc13402914736abe723492cb730f744834f5f5b67d3a12702", size = 4664619 },
+    { url = "https://files.pythonhosted.org/packages/5c/e8/069542d37946ed08669b127e1496fa99e78196d71de8d41eda5e9f1b7a58/hf_xet-1.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:5f3dc2248fc01cc0a00cd392ab497f1ca373fcbc7e3f2da1f452480b384e839e", size = 3966802 },
+    { url = "https://files.pythonhosted.org/packages/f9/91/fc6fdec27b14d04e88c386ac0a0129732b53fa23f7c4a78f4b83a039c567/hf_xet-1.5.0-cp314-cp314t-win_arm64.whl", hash = "sha256:b285cea1b5bab46b758772716ba8d6854a1a0310fed1c249d678a8b38601e5a0", size = 3797168 },
+    { url = "https://files.pythonhosted.org/packages/3d/fb/69ff198a82cae7eb1a69fb84d93b3a3e4816564d76817fe541ddc96874eb/hf_xet-1.5.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:dad0dc84e941b8ba3c860659fe1fdc35c049d47cce293f003287757e971a8f56", size = 4030814 },
+    { url = "https://files.pythonhosted.org/packages/9b/ff/edcc2b40162bef3ff78e14ab637e5f3b89243d6aee72f5949d3bb6a5af83/hf_xet-1.5.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:fd6e5a9b0fdac4ed03ed45ef79254a655b1aaab514a02202617fbf643f5fdf7a", size = 3798444 },
+    { url = "https://files.pythonhosted.org/packages/49/4d/103f76b04310e5e57656696cc184690d20c466af0bca3ca88f8c8ea5d4f3/hf_xet-1.5.0-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3531b1823a0e6d77d80f9ed15ca0e00f0d115094f8ac033d5cae88f4564cc949", size = 4465986 },
+    { url = "https://files.pythonhosted.org/packages/c4/a2/546f47f464737b3edbab6f8ddb57f2599b93d2cbb66f06abb475ccb48651/hf_xet-1.5.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9a0ee58cd18d5ea799f7ed11290bbccbe56bdd8b1d97ca74b9cc49a3945d7a3b", size = 4259865 },
+    { url = "https://files.pythonhosted.org/packages/95/7f/1be593c1f28613be2e196473481cd81bfc5910795e30a34e8f744f6cac4f/hf_xet-1.5.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1e60df5a42e9bed8628b6416af2cba4cba57ae9f02de226a06b020d98e1aab18", size = 4459835 },
+    { url = "https://files.pythonhosted.org/packages/aa/b2/703569fc881f3284487e68cda7b42179978480da3c438042a6bbbb4a671c/hf_xet-1.5.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4b35549ce62601b84da4ff9b24d970032ace3d4430f52d91bcbb26c901d6c690", size = 4672414 },
+    { url = "https://files.pythonhosted.org/packages/af/37/1b6def445c567286b50aa3b33828158e135b1be44938dde59f11382a500c/hf_xet-1.5.0-cp37-abi3-win_amd64.whl", hash = "sha256:2806c7c17b4d23f8d88f7c4814f838c3b6150773fe339c20af23e1cfaf2797e4", size = 3977238 },
+    { url = "https://files.pythonhosted.org/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916 },
+]
+
+[[package]]
+name = "htmldate"
+version = "1.9.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "charset-normalizer" },
+    { name = "dateparser" },
+    { name = "lxml" },
+    { name = "python-dateutil" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9d/10/ead9dabc999f353c3aa5d0dc0835b1e355215a5ecb489a7f4ef2ddad5e33/htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0", size = 44690 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/bd/adfcdaaad5805c0c5156aeefd64c1e868c05e9c1cd6fd21751f168cd88c7/htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c", size = 31558 },
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "h11" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784 },
+]
+
+[[package]]
+name = "httpx"
+version = "0.28.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "certifi" },
+    { name = "httpcore" },
+    { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 },
+]
+
+[[package]]
+name = "httpx-sse"
+version = "0.4.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960 },
+]
+
+[[package]]
+name = "huggingface-hub"
+version = "1.14.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "filelock" },
+    { name = "fsspec" },
+    { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
+    { name = "httpx" },
+    { name = "packaging" },
+    { name = "pyyaml" },
+    { name = "tqdm" },
+    { name = "typer" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/39/40/43109e943fd718b0ccd0cd61eb4f1c347df22bf81f5874c6f22adf44bcff/huggingface_hub-1.14.0.tar.gz", hash = "sha256:d6d2c9cd6be1d02ae9ec6672d5587d10a427f377db688e82528f426a041622c2", size = 782365 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/89/a5/33b49ba7bea7c41bb37f74ec0f8beea0831e052330196633fe2c77516ea6/huggingface_hub-1.14.0-py3-none-any.whl", hash = "sha256:efe075535c62e130b30e836b138e13785f6f043d1f0539e0a39aa411a99e90b8", size = 661479 },
+]
+
+[[package]]
+name = "idna"
+version = "3.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340 },
+]
+
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484 },
+]
+
+[[package]]
+name = "isodate"
+version = "0.7.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899 },
+]
+
+[[package]]
+name = "joblib"
+version = "1.5.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071 },
+]
+
+[[package]]
+name = "justext"
+version = "3.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "lxml", extra = ["html-clean"] },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940 },
+]
+
+[[package]]
+name = "llama-cloud"
+version = "0.1.46"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "httpx" },
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/40/f3/f4d6520f8d546e6c5a02f6ebeed5c09774a074b8d2c24ad559ace97a56a6/llama_cloud-0.1.46.tar.gz", hash = "sha256:e86f8791c053590d70cc59e0fc13ce72f9b681a8e658bc61df86d0285288d8ee", size = 127752 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c4/3a/6caaea28c8c804add33c91d356ed7d5a5412d6c9598e1450af95a15e0bcd/llama_cloud-0.1.46-py3-none-any.whl", hash = "sha256:6c6546c09c04a038c86d84d42f00eae8fd3bff49991ad3aab844bd866ecdf352", size = 361989 },
+]
+
+[[package]]
+name = "llama-cloud-services"
+version = "0.6.94"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "llama-cloud" },
+    { name = "llama-index-core" },
+    { name = "packaging" },
+    { name = "platformdirs" },
+    { name = "pydantic" },
+    { name = "python-dotenv" },
+    { name = "tenacity" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d3/91/c3c94a58c44d0a12e0df2d5038b188fc283877f56cf2f6c41c60f43258e6/llama_cloud_services-0.6.94.tar.gz", hash = "sha256:127b8440d3d3a964d0c4b3f5fe7fcac3ead482f7645971cc8ae30768dcf63306", size = 64114 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/ab/876486e4f1c137cfeca8f876abd18eeec35a66a0fd8adb15afba7b28aa8c/llama_cloud_services-0.6.94-py3-none-any.whl", hash = "sha256:ac89785f3689d71298511f751bcf4ca16952a616bd75ff06e0ff164f04b0775b", size = 77098 },
+]
+
+[[package]]
+name = "llama-index-core"
+version = "0.14.21"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohttp" },
+    { name = "aiosqlite" },
+    { name = "banks" },
+    { name = "dataclasses-json" },
+    { name = "deprecated" },
+    { name = "dirtyjson" },
+    { name = "filetype" },
+    { name = "fsspec" },
+    { name = "httpx" },
+    { name = "llama-index-workflows" },
+    { name = "nest-asyncio" },
+    { name = "networkx" },
+    { name = "nltk" },
+    { name = "numpy" },
+    { name = "pillow" },
+    { name = "platformdirs" },
+    { name = "pydantic" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "setuptools" },
+    { name = "sqlalchemy", extra = ["asyncio"] },
+    { name = "tenacity" },
+    { name = "tiktoken" },
+    { name = "tinytag" },
+    { name = "tqdm" },
+    { name = "typing-extensions" },
+    { name = "typing-inspect" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7c/43/d6d2a368865e68c25d3400c017fb772daab71427f08c4e36c591f729dbc3/llama_index_core-0.14.21.tar.gz", hash = "sha256:29706defbe2f429d28330a4eea010f9d92d42db92539382f8c800e19590cae45", size = 11581087 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/23/55ec5f35a5c7f35b60d3928bcd2e867076440036a280cf4d07481719c249/llama_index_core-0.14.21-py3-none-any.whl", hash = "sha256:4a807d31e54d066068e076eb4d066efbf95e2d2a00dcbe0eba3d9340a04cad42", size = 11916624 },
+]
+
+[[package]]
+name = "llama-index-instrumentation"
+version = "0.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "deprecated" },
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4e/d0/671b23ccff255c9bce132a84ffd5a6f4541ceefdeab9c1786b08c9722f2e/llama_index_instrumentation-0.5.0.tar.gz", hash = "sha256:eeb724648b25d149de882a5ac9e21c5acb1ce780da214bda2b075341af29ad8e", size = 43831 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/45/6dcaccef44e541ffa138e4b45e33e0d40ab2a7d845338483954fcf77bc75/llama_index_instrumentation-0.5.0-py3-none-any.whl", hash = "sha256:aaab83cddd9dd434278891012d8995f47a3bc7ed1736a371db90965348c56a21", size = 16444 },
+]
+
+[[package]]
+name = "llama-index-workflows"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "llama-index-instrumentation" },
+    { name = "pydantic" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c4/ec/05f3db99a2e6e252e3939e7751cad2fb1322dc6d32f4cf5c795cf7ddcad3/llama_index_workflows-2.20.0.tar.gz", hash = "sha256:df2760fea9e100c97a4e919d255461e344413acac4382d17d8217337806e4772", size = 97410 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/71/5f/385231406d777cb4b608fd8ebe3577dbd90962770717181e6b91b44fb1b8/llama_index_workflows-2.20.0-py3-none-any.whl", hash = "sha256:36f6b6ace77f837d9907078aea7e830251afe96a58daecff5ed090c88c55095d", size = 121238 },
+]
+
+[[package]]
+name = "lxml"
+version = "6.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/28/30/9abc9e34c657c33834eaf6cd02124c61bdf5944d802aa48e69be8da3585d/lxml-6.1.0.tar.gz", hash = "sha256:bfd57d8008c4965709a919c3e9a98f76c2c7cb319086b3d26858250620023b13", size = 4197006 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d2/d4/9326838b59dc36dfae42eec9656b97520f9997eee1de47b8316aaeed169c/lxml-6.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d2f17a16cd8751e8eb233a7e41aecdf8e511712e00088bf9be455f604cd0d28d", size = 8570663 },
+    { url = "https://files.pythonhosted.org/packages/d8/a4/053745ce1f8303ccbb788b86c0db3a91b973675cefc42566a188637b7c40/lxml-6.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f0cea5b1d3e6e77d71bd2b9972eb2446221a69dc52bb0b9c3c6f6e5700592d93", size = 4624024 },
+    { url = "https://files.pythonhosted.org/packages/90/97/a517944b20f8fd0932ad2109482bee4e29fe721416387a363306667941f6/lxml-6.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc46da94826188ed45cb53bd8e3fc076ae22675aea2087843d4735627f867c6d", size = 4930895 },
+    { url = "https://files.pythonhosted.org/packages/94/7c/e08a970727d556caa040a44773c7b7e3ad0f0d73dedc863543e9a8b931f2/lxml-6.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9147d8e386ec3b82c3b15d88927f734f565b0aaadef7def562b853adca45784a", size = 5093820 },
+    { url = "https://files.pythonhosted.org/packages/88/ee/2a5c2aa2c32016a226ca25d3e1056a8102ea6e1fe308bf50213586635400/lxml-6.1.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5715e0e28736a070f3f34a7ccc09e2fdcba0e3060abbcf61a1a5718ff6d6b105", size = 5005790 },
+    { url = "https://files.pythonhosted.org/packages/e3/38/a0db9be8f38ad6043ab9429487c128dd1d30f07956ef43040402f8da49e8/lxml-6.1.0-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4937460dc5df0cdd2f06a86c285c28afda06aefa3af949f9477d3e8df430c485", size = 5630827 },
+    { url = "https://files.pythonhosted.org/packages/31/ba/3c13d3fc24b7cacf675f808a3a1baabf43a30d0cd24c98f94548e9aa58eb/lxml-6.1.0-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc783ee3147e60a25aa0445ea82b3e8aabb83b240f2b95d32cb75587ff781814", size = 5240445 },
+    { url = "https://files.pythonhosted.org/packages/55/ba/eeef4ccba09b2212fe239f46c1692a98db1878e0872ae320756488878a94/lxml-6.1.0-cp312-cp312-manylinux_2_28_i686.whl", hash = "sha256:40d9189f80075f2e1f88db21ef815a2b17b28adf8e50aaf5c789bfe737027f32", size = 5350121 },
+    { url = "https://files.pythonhosted.org/packages/7e/01/1da87c7b587c38d0cbe77a01aae3b9c1c49ed47d76918ef3db8fc151b1ca/lxml-6.1.0-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:05b9b8787e35bec69e68daf4952b2e6dfcfb0db7ecf1a06f8cdfbbac4eb71aad", size = 4694949 },
+    { url = "https://files.pythonhosted.org/packages/a1/88/7db0fe66d5aaf128443ee1623dec3db1576f3e4c17751ec0ef5866468590/lxml-6.1.0-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0f0f08beb0182e3e9a86fae124b3c47a7b41b7b69b225e1377db983802404e54", size = 5243901 },
+    { url = "https://files.pythonhosted.org/packages/00/a8/1346726af7d1f6fca1f11223ba34001462b0a3660416986d37641708d57c/lxml-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73becf6d8c81d4c76b1014dbd3584cb26d904492dcf73ca85dc8bff08dcd6d2d", size = 5048054 },
+    { url = "https://files.pythonhosted.org/packages/2e/b7/85057012f035d1a0c87e02f8c723ca3c3e6e0728bcf4cb62080b21b1c1e3/lxml-6.1.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:1ae225f66e5938f4fa29d37e009a3bb3b13032ac57eb4eb42afa44f6e4054e69", size = 4777324 },
+    { url = "https://files.pythonhosted.org/packages/75/6c/ad2f94a91073ef570f33718040e8e160d5fb93331cf1ab3ca1323f939e2d/lxml-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:690022c7fae793b0489aa68a658822cea83e0d5933781811cabbf5ea3bcfe73d", size = 5645702 },
+    { url = "https://files.pythonhosted.org/packages/3b/89/0bb6c0bd549c19004c60eea9dc554dd78fd647b72314ef25d460e0d208c6/lxml-6.1.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:63aeafc26aac0be8aff14af7871249e87ea1319be92090bfd632ec68e03b16a5", size = 5232901 },
+    { url = "https://files.pythonhosted.org/packages/a1/d9/d609a11fb567da9399f525193e2b49847b5a409cdebe737f06a8b7126bdc/lxml-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:264c605ab9c0e4aa1a679636f4582c4d3313700009fac3ec9c3412ed0d8f3e1d", size = 5261333 },
+    { url = "https://files.pythonhosted.org/packages/a6/3a/ac3f99ec8ac93089e7dd556f279e0d14c24de0a74a507e143a2e4b496e7c/lxml-6.1.0-cp312-cp312-win32.whl", hash = "sha256:56971379bc5ee8037c5a0f09fa88f66cdb7d37c3e38af3e45cf539f41131ac1f", size = 3596289 },
+    { url = "https://files.pythonhosted.org/packages/f2/a7/0a915557538593cb1bbeedcd40e13c7a261822c26fecbbdb71dad0c2f540/lxml-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:bba078de0031c219e5dd06cf3e6bf8fb8e6e64a77819b358f53bb132e3e03366", size = 3997059 },
+    { url = "https://files.pythonhosted.org/packages/92/96/a5dc078cf0126fbfbc35611d77ecd5da80054b5893e28fb213a5613b9e1d/lxml-6.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:c3592631e652afa34999a088f98ba7dfc7d6aff0d535c410bea77a71743f3819", size = 3659552 },
+    { url = "https://files.pythonhosted.org/packages/08/03/69347590f1cf4a6d5a4944bb6099e6d37f334784f16062234e1f892fdb1d/lxml-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a0092f2b107b69601adf562a57c956fbb596e05e3e6651cabd3054113b007e45", size = 8559689 },
+    { url = "https://files.pythonhosted.org/packages/3f/58/25e00bb40b185c974cfe156c110474d9a8a8390d5f7c92a4e328189bb60e/lxml-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fc7140d7a7386e6b545d41b7358f4d02b656d4053f5fa6859f92f4b9c2572c4d", size = 4617892 },
+    { url = "https://files.pythonhosted.org/packages/f5/54/92ad98a94ac318dc4f97aaac22ff8d1b94212b2ae8af5b6e9b354bf825f7/lxml-6.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:419c58fc92cc3a2c3fa5f78c63dbf5da70c1fa9c1b25f25727ecee89a96c7de2", size = 4923489 },
+    { url = "https://files.pythonhosted.org/packages/15/3b/a20aecfab42bdf4f9b390590d345857ad3ffd7c51988d1c89c53a0c73faf/lxml-6.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:37fabd1452852636cf38ecdcc9dd5ca4bba7a35d6c53fa09725deeb894a87491", size = 5082162 },
+    { url = "https://files.pythonhosted.org/packages/45/26/2cdb3d281ac1bd175603e290cbe4bad6eff127c0f8de90bafd6f8548f0fd/lxml-6.1.0-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2853c8b2170cc6cd54a6b4d50d2c1a8a7aeca201f23804b4898525c7a152cfc", size = 4993247 },
+    { url = "https://files.pythonhosted.org/packages/f6/05/d735aef963740022a08185c84821f689fc903acb3d50326e6b1e9886cc22/lxml-6.1.0-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8e369cbd690e788c8d15e56222d91a09c6a417f49cbc543040cba0fe2e25a79e", size = 5613042 },
+    { url = "https://files.pythonhosted.org/packages/ee/b8/ead7c10efff731738c72e59ed6eb5791854879fbed7ae98781a12006263a/lxml-6.1.0-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e69aa6805905807186eb00e66c6d97a935c928275182eb02ee40ba00da9623b2", size = 5228304 },
+    { url = "https://files.pythonhosted.org/packages/6b/10/e9842d2ec322ea65f0a7270aa0315a53abed06058b88ef1b027f620e7a5f/lxml-6.1.0-cp313-cp313-manylinux_2_28_i686.whl", hash = "sha256:4bd1bdb8a9e0e2dd229de19b5f8aebac80e916921b4b2c6ef8a52bc131d0c1f9", size = 5341578 },
+    { url = "https://files.pythonhosted.org/packages/89/54/40d9403d7c2775fa7301d3ddd3464689bfe9ba71acc17dfff777071b4fdc/lxml-6.1.0-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:cbd7b79cdcb4986ad78a2662625882747f09db5e4cd7b2ae178a88c9c51b3dfe", size = 4700209 },
+    { url = "https://files.pythonhosted.org/packages/85/b2/bbdcc2cf45dfc7dfffef4fd97e5c47b15919b6a365247d95d6f684ef5e82/lxml-6.1.0-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:43e4d297f11080ec9d64a4b1ad7ac02b4484c9f0e2179d9c4ef78e886e747b88", size = 5232365 },
+    { url = "https://files.pythonhosted.org/packages/48/5a/b06875665e53aaba7127611a7bed3b7b9658e20b22bc2dd217a0b7ab0091/lxml-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cc16682cc987a3da00aa56a3aa3075b08edb10d9b1e476938cfdbee8f3b67181", size = 5043654 },
+    { url = "https://files.pythonhosted.org/packages/e9/9c/e71a069d09641c1a7abeb30e693f828c7c90a41cbe3d650b2d734d876f85/lxml-6.1.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:d6d8efe71429635f0559579092bb5e60560d7b9115ee38c4adbea35632e7fa24", size = 4769326 },
+    { url = "https://files.pythonhosted.org/packages/cc/06/7a9cd84b3d4ed79adf35f874750abb697dec0b4a81a836037b36e47c091a/lxml-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7e39ab3a28af7784e206d8606ec0e4bcad0190f63a492bca95e94e5a4aef7f6e", size = 5635879 },
+    { url = "https://files.pythonhosted.org/packages/cc/f0/9d57916befc1e54c451712c7ee48e9e74e80ae4d03bdce49914e0aee42cd/lxml-6.1.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:9eb667bf50856c4a58145f8ca2d5e5be160191e79eb9e30855a476191b3c3495", size = 5224048 },
+    { url = "https://files.pythonhosted.org/packages/99/75/90c4eefda0c08c92221fe0753db2d6699a4c628f76ff4465ec20dea84cc1/lxml-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7f4a77d6f7edf9230cee3e1f7f6764722a41604ee5681844f18db9a81ea0ec33", size = 5250241 },
+    { url = "https://files.pythonhosted.org/packages/5e/73/16596f7e4e38fa33084b9ccbccc22a15f82a290a055126f2c1541236d2ff/lxml-6.1.0-cp313-cp313-win32.whl", hash = "sha256:28902146ffbe5222df411c5d19e5352490122e14447e98cd118907ee3fd6ee62", size = 3596938 },
+    { url = "https://files.pythonhosted.org/packages/8e/63/981401c5680c1eb30893f00a19641ac80db5d1e7086c62cb4b13ed813038/lxml-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:4a1503c56e4e2b38dc76f2f2da7bae69670c0f1933e27cfa34b2fa5876410b16", size = 3995728 },
+    { url = "https://files.pythonhosted.org/packages/e7/e8/c358a38ac3e541d16a1b527e4e9cb78c0419b0506a070ace11777e5e8404/lxml-6.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:e0af85773850417d994d019741239b901b22c6680206f46a34766926e466141d", size = 3658372 },
+    { url = "https://files.pythonhosted.org/packages/eb/45/cee4cf203ef0bab5c52afc118da61d6b460c928f2893d40023cfa27e0b80/lxml-6.1.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:ab863fd37458fed6456525f297d21239d987800c46e67da5ef04fc6b3dd93ac8", size = 8576713 },
+    { url = "https://files.pythonhosted.org/packages/8a/a7/eda05babeb7e046839204eaf254cd4d7c9130ce2bbf0d9e90ea41af5654d/lxml-6.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:6fd8b1df8254ff4fd93fd31da1fc15770bde23ac045be9bb1f87425702f61cc9", size = 4623874 },
+    { url = "https://files.pythonhosted.org/packages/e7/e9/db5846de9b436b91890a62f29d80cd849ea17948a49bf532d5278ee69a9e/lxml-6.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:47024feaae386a92a146af0d2aeed65229bf6fff738e6a11dda6b0015fb8fd03", size = 4949535 },
+    { url = "https://files.pythonhosted.org/packages/5a/ba/0d3593373dcae1d68f40dc3c41a5a92f2544e68115eb2f62319a4c2a6500/lxml-6.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3f00972f84450204cd5d93a5395965e348956aaceaadec693a22ec743f8ae3eb", size = 5086881 },
+    { url = "https://files.pythonhosted.org/packages/43/76/759a7484539ad1af0d125a9afe9c3fb5f82a8779fd1f5f56319d9e4ea2fd/lxml-6.1.0-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97faa0860e13b05b15a51fb4986421ef7a30f0b3334061c416e0981e9450ca4c", size = 5031305 },
+    { url = "https://files.pythonhosted.org/packages/dc/b9/c1f0daf981a11e47636126901fd4ab82429e18c57aeb0fc3ad2940b42d8b/lxml-6.1.0-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:972a6451204798675407beaad97b868d0c733d9a74dafefc63120b81b8c2de28", size = 5647522 },
+    { url = "https://files.pythonhosted.org/packages/31/e6/1f533dcd205275363d9ba3511bcec52fa2df86abf8abe6a5f2c599f0dc31/lxml-6.1.0-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fe022f20bc4569ec66b63b3fb275a3d628d9d32da6326b2982584104db6d3086", size = 5239310 },
+    { url = "https://files.pythonhosted.org/packages/c3/8c/4175fb709c78a6e315ed814ed33be3defd8b8721067e70419a6cf6f971da/lxml-6.1.0-cp314-cp314-manylinux_2_28_i686.whl", hash = "sha256:75c4c7c619a744f972f4451bf5adf6d0fb00992a1ffc9fd78e13b0bc817cc99f", size = 5350799 },
+    { url = "https://files.pythonhosted.org/packages/fd/77/6ffdebc5994975f0dde4acb59761902bd9d9bb84422b9a0bd239a7da9ca8/lxml-6.1.0-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:3648f20d25102a22b6061c688beb3a805099ea4beb0a01ce62975d926944d292", size = 4697693 },
+    { url = "https://files.pythonhosted.org/packages/f8/f1/565f36bd5c73294602d48e04d23f81ff4c8736be6ba5e1d1ec670ac9be80/lxml-6.1.0-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:77b9f99b17cbf14026d1e618035077060fc7195dd940d025149f3e2e830fbfcb", size = 5250708 },
+    { url = "https://files.pythonhosted.org/packages/5a/11/a68ab9dd18c5c499404deb4005f4bc4e0e88e5b72cd755ad96efec81d18d/lxml-6.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:32662519149fd7a9db354175aa5e417d83485a8039b8aaa62f873ceee7ea4cad", size = 5084737 },
+    { url = "https://files.pythonhosted.org/packages/ab/78/e8f41e2c74f4af564e6a0348aea69fb6daaefa64bc071ef469823d22cc18/lxml-6.1.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:73d658216fc173cf2c939e90e07b941c5e12736b0bf6a99e7af95459cfe8eabb", size = 4737817 },
+    { url = "https://files.pythonhosted.org/packages/06/2d/aa4e117aa2ce2f3b35d9ff246be74a2f8e853baba5d2a92c64744474603a/lxml-6.1.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ac4db068889f8772a4a698c5980ec302771bb545e10c4b095d4c8be26749616f", size = 5670753 },
+    { url = "https://files.pythonhosted.org/packages/08/f5/dd745d50c0409031dbfcc4881740542a01e54d6f0110bd420fa7782110b8/lxml-6.1.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:45e9dfbd1b661eb64ba0d4dbe762bd210c42d86dd1e5bd2bdf89d634231beb43", size = 5238071 },
+    { url = "https://files.pythonhosted.org/packages/3e/74/ad424f36d0340a904665867dab310a3f1f4c96ff4039698de83b77f44c1f/lxml-6.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:89e8d73d09ac696a5ba42ec69787913d53284f12092f651506779314f10ba585", size = 5264319 },
+    { url = "https://files.pythonhosted.org/packages/53/36/a15d8b3514ec889bfd6aa3609107fcb6c9189f8dc347f1c0b81eded8d87c/lxml-6.1.0-cp314-cp314-win32.whl", hash = "sha256:ebe33f4ec1b2de38ceb225a1749a2965855bffeef435ba93cd2d5d540783bf2f", size = 3657139 },
+    { url = "https://files.pythonhosted.org/packages/1a/a4/263ebb0710851a3c6c937180a9a86df1206fdfe53cc43005aa2237fd7736/lxml-6.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:398443df51c538bd578529aa7e5f7afc6c292644174b47961f3bf87fe5741120", size = 4064195 },
+    { url = "https://files.pythonhosted.org/packages/80/68/2000f29d323b6c286de077ad20b429fc52272e44eae6d295467043e56012/lxml-6.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:8c8984e1d8c4b3949e419158fda14d921ff703a9ed8a47236c6eb7a2b6cb4946", size = 3741870 },
+    { url = "https://files.pythonhosted.org/packages/30/e9/21383c7c8d43799f0da90224c0d7c921870d476ec9b3e01e1b2c0b8237c5/lxml-6.1.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:1081dd10bc6fa437db2500e13993abf7cc30716d0a2f40e65abb935f02ec559c", size = 8827548 },
+    { url = "https://files.pythonhosted.org/packages/a5/01/c6bc11cd587030dd4f719f65c5657960649fe3e19196c844c75bf32cd0d6/lxml-6.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:dabecc48db5f42ba348d1f5d5afdc54c6c4cc758e676926c7cd327045749517d", size = 4735866 },
+    { url = "https://files.pythonhosted.org/packages/f3/01/757132fff5f4acf25463b5298f1a46099f3a94480b806547b29ce5e385de/lxml-6.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e3dd5fe19c9e0ac818a9c7f132a5e43c1339ec1cbbfecb1a938bd3a47875b7c9", size = 4969476 },
+    { url = "https://files.pythonhosted.org/packages/fd/fb/1bc8b9d27ed64be7c8903db6c89e74dc8c2cd9ec630a7462e4654316dc5b/lxml-6.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9e7b0a4ca6dcc007a4cef00a761bba2dea959de4bd2df98f926b33c92ca5dfb9", size = 5103719 },
+    { url = "https://files.pythonhosted.org/packages/d5/e7/5bf82fa28133536a54601aae633b14988e89ed61d4c1eb6b899b023233aa/lxml-6.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d27bbe326c6b539c64b42638b18bc6003a8d88f76213a97ac9ed4f885efeab7", size = 5027890 },
+    { url = "https://files.pythonhosted.org/packages/2d/20/e048db5d4b4ea0366648aa595f26bb764b2670903fc585b87436d0a5032c/lxml-6.1.0-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4e425db0c5445ef0ad56b0eec54f89b88b2d884656e536a90b2f52aecb4ca86", size = 5596008 },
+    { url = "https://files.pythonhosted.org/packages/9a/c2/d10807bc8da4824b39e5bd01b5d05c077b6fd01bd91584167edf6b269d22/lxml-6.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4b89b098105b8599dc57adac95d1813409ac476d3c948a498775d3d0c6124bfb", size = 5224451 },
+    { url = "https://files.pythonhosted.org/packages/3c/15/2ebea45bea427e7f0057e9ce7b2d62c5aba20c6b001cca89ed0aadb3ad41/lxml-6.1.0-cp314-cp314t-manylinux_2_28_i686.whl", hash = "sha256:c4a699432846df86cc3de502ee85f445ebad748a1c6021d445f3e514d2cd4b1c", size = 5312135 },
+    { url = "https://files.pythonhosted.org/packages/31/e2/87eeae151b0be2a308d49a7ec444ff3eb192b14251e62addb29d0bf3778f/lxml-6.1.0-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:30e7b2ed63b6c8e97cca8af048589a788ab5c9c905f36d9cf1c2bb549f450d2f", size = 4639126 },
+    { url = "https://files.pythonhosted.org/packages/a3/51/8a3f6a20902ad604dd746ec7b4000311b240d389dac5e9d95adefd349e0c/lxml-6.1.0-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:022981127642fe19866d2907d76241bb07ed21749601f727d5d5dd1ce5d1b773", size = 5232579 },
+    { url = "https://files.pythonhosted.org/packages/6d/d2/650d619bdbe048d2c3f2c31edb00e35670a5e2d65b4fe3b61bce37b19121/lxml-6.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:23cad0cc86046d4222f7f418910e46b89971c5a45d3c8abfad0f64b7b05e4a9b", size = 5084206 },
+    { url = "https://files.pythonhosted.org/packages/dd/8a/672ca1a3cbeabd1f511ca275a916c0514b747f4b85bdaae103b8fa92f307/lxml-6.1.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:21c3302068f50d1e8728c67c87ba92aa87043abee517aa2576cca1855326b405", size = 4758906 },
+    { url = "https://files.pythonhosted.org/packages/be/f1/ef4b691da85c916cb2feb1eec7414f678162798ac85e042fa164419ac05c/lxml-6.1.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:be10838781cb3be19251e276910cd508fe127e27c3242e50521521a0f3781690", size = 5620553 },
+    { url = "https://files.pythonhosted.org/packages/59/17/94e81def74107809755ac2782fdad4404420f1c92ca83433d117a6d5acf0/lxml-6.1.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:2173a7bffe97667bbf0767f8a99e587740a8c56fdf3befac4b09cb29a80276fd", size = 5229458 },
+    { url = "https://files.pythonhosted.org/packages/21/55/c4be91b0f830a871fc1b0d730943d56013b683d4671d5198260e2eae722b/lxml-6.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c6854e9cf99c84beb004eecd7d3a3868ef1109bf2b1df92d7bc11e96a36c2180", size = 5247861 },
+    { url = "https://files.pythonhosted.org/packages/c2/ca/77123e4d77df3cb1e968ade7b1f808f5d3a5c1c96b18a33895397de292c1/lxml-6.1.0-cp314-cp314t-win32.whl", hash = "sha256:00750d63ef0031a05331b9223463b1c7c02b9004cef2346a5b2877f0f9494dd2", size = 3897377 },
+    { url = "https://files.pythonhosted.org/packages/64/ce/3554833989d074267c063209bae8b09815e5656456a2d332b947806b05ff/lxml-6.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:80410c3a7e3c617af04de17caa9f9f20adaa817093293d69eae7d7d0522836f5", size = 4392701 },
+    { url = "https://files.pythonhosted.org/packages/2b/a0/9b916c68c0e57752c07f8f64b30138d9d4059dbeb27b90274dedbea128ff/lxml-6.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:26dd9f57ee3bd41e7d35b4c98a2ffd89ed11591649f421f0ec19f67d50ec67ac", size = 3817120 },
+]
+
+[package.optional-dependencies]
+html-clean = [
+    { name = "lxml-html-clean" },
+]
+
+[[package]]
+name = "lxml-html-clean"
+version = "0.4.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "lxml" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9a/a4/5c62acfacd69ff4f5db395100f5cfb9b54e7ac8c69a235e4e939fd13f021/lxml_html_clean-0.4.4.tar.gz", hash = "sha256:58f39a9d632711202ed1d6d0b9b47a904e306c85de5761543b90e3e3f736acfb", size = 23899 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d9/76/7ffc1d3005cf7749123bc47cb3ea343cd97b0ac2211bab40f57283577d0e/lxml_html_clean-0.4.4-py3-none-any.whl", hash = "sha256:ce2ef506614ecb85ee1c5fe0a2aa45b06a19514ec7949e9c8f34f06925cfabcb", size = 14565 },
+]
+
+[[package]]
+name = "markdown-it-py"
+version = "4.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mdurl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/ff/7841249c247aa650a76b9ee4bbaeae59370dc8bfd2f6c01f3630c35eb134/markdown_it_py-4.2.0.tar.gz", hash = "sha256:04a21681d6fbb623de53f6f364d352309d4094dd4194040a10fd51833e418d49", size = 82454 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/81/4da04ced5a082363ecfa159c010d200ecbd959ae410c10c0264a38cac0f5/markdown_it_py-4.2.0-py3-none-any.whl", hash = "sha256:9f7ebbcd14fe59494226453aed97c1070d83f8d24b6fc3a3bcf9a38092641c4a", size = 91687 },
+]
+
+[[package]]
+name = "markupsafe"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615 },
+    { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020 },
+    { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332 },
+    { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947 },
+    { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962 },
+    { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760 },
+    { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529 },
+    { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015 },
+    { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540 },
+    { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105 },
+    { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906 },
+    { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622 },
+    { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029 },
+    { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374 },
+    { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980 },
+    { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990 },
+    { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784 },
+    { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588 },
+    { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041 },
+    { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543 },
+    { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113 },
+    { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911 },
+    { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658 },
+    { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066 },
+    { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639 },
+    { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569 },
+    { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284 },
+    { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801 },
+    { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769 },
+    { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642 },
+    { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612 },
+    { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200 },
+    { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973 },
+    { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619 },
+    { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029 },
+    { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408 },
+    { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005 },
+    { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048 },
+    { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821 },
+    { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606 },
+    { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043 },
+    { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747 },
+    { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341 },
+    { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073 },
+    { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661 },
+    { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069 },
+    { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670 },
+    { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598 },
+    { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261 },
+    { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835 },
+    { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733 },
+    { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672 },
+    { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819 },
+    { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426 },
+    { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146 },
+]
+
+[[package]]
+name = "marshmallow"
+version = "3.26.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/79/de6c16cc902f4fc372236926b0ce2ab7845268dcc30fb2fbb7f71b418631/marshmallow-3.26.2.tar.gz", hash = "sha256:bbe2adb5a03e6e3571b573f42527c6fe926e17467833660bebd11593ab8dfd57", size = 222095 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/2f/5108cb3ee4ba6501748c4908b908e55f42a5b66245b4cfe0c99326e1ef6e/marshmallow-3.26.2-py3-none-any.whl", hash = "sha256:013fa8a3c4c276c24d26d84ce934dc964e2aa794345a0f8c7e5a7191482c8a73", size = 50964 },
+]
+
+[[package]]
+name = "mdurl"
+version = "0.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
+]
+
+[[package]]
+name = "multidict"
+version = "6.7.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893 },
+    { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456 },
+    { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872 },
+    { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018 },
+    { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883 },
+    { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413 },
+    { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404 },
+    { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456 },
+    { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322 },
+    { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955 },
+    { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254 },
+    { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059 },
+    { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588 },
+    { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642 },
+    { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377 },
+    { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887 },
+    { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053 },
+    { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307 },
+    { url = "https://files.pythonhosted.org/packages/f2/22/929c141d6c0dba87d3e1d38fbdf1ba8baba86b7776469f2bc2d3227a1e67/multidict-6.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2b41f5fed0ed563624f1c17630cb9941cf2309d4df00e494b551b5f3e3d67a23", size = 76174 },
+    { url = "https://files.pythonhosted.org/packages/c7/75/bc704ae15fee974f8fccd871305e254754167dce5f9e42d88a2def741a1d/multidict-6.7.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84e61e3af5463c19b67ced91f6c634effb89ef8bfc5ca0267f954451ed4bb6a2", size = 45116 },
+    { url = "https://files.pythonhosted.org/packages/79/76/55cd7186f498ed080a18440c9013011eb548f77ae1b297206d030eb1180a/multidict-6.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:935434b9853c7c112eee7ac891bc4cb86455aa631269ae35442cb316790c1445", size = 43524 },
+    { url = "https://files.pythonhosted.org/packages/e9/3c/414842ef8d5a1628d68edee29ba0e5bcf235dbfb3ccd3ea303a7fe8c72ff/multidict-6.7.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432feb25a1cb67fe82a9680b4d65fb542e4635cb3166cd9c01560651ad60f177", size = 249368 },
+    { url = "https://files.pythonhosted.org/packages/f6/32/befed7f74c458b4a525e60519fe8d87eef72bb1e99924fa2b0f9d97a221e/multidict-6.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e82d14e3c948952a1a85503817e038cba5905a3352de76b9a465075d072fba23", size = 256952 },
+    { url = "https://files.pythonhosted.org/packages/03/d6/c878a44ba877f366630c860fdf74bfb203c33778f12b6ac274936853c451/multidict-6.7.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4cfb48c6ea66c83bcaaf7e4dfa7ec1b6bbcf751b7db85a328902796dfde4c060", size = 240317 },
+    { url = "https://files.pythonhosted.org/packages/68/49/57421b4d7ad2e9e60e25922b08ceb37e077b90444bde6ead629095327a6f/multidict-6.7.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1d540e51b7e8e170174555edecddbd5538105443754539193e3e1061864d444d", size = 267132 },
+    { url = "https://files.pythonhosted.org/packages/b7/fe/ec0edd52ddbcea2a2e89e174f0206444a61440b40f39704e64dc807a70bd/multidict-6.7.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:273d23f4b40f3dce4d6c8a821c741a86dec62cded82e1175ba3d99be128147ed", size = 268140 },
+    { url = "https://files.pythonhosted.org/packages/b0/73/6e1b01cbeb458807aa0831742232dbdd1fa92bfa33f52a3f176b4ff3dc11/multidict-6.7.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d624335fd4fa1c08a53f8b4be7676ebde19cd092b3895c421045ca87895b429", size = 254277 },
+    { url = "https://files.pythonhosted.org/packages/6a/b2/5fb8c124d7561a4974c342bc8c778b471ebbeb3cc17df696f034a7e9afe7/multidict-6.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:12fad252f8b267cc75b66e8fc51b3079604e8d43a75428ffe193cd9e2195dfd6", size = 252291 },
+    { url = "https://files.pythonhosted.org/packages/5a/96/51d4e4e06bcce92577fcd488e22600bd38e4fd59c20cb49434d054903bd2/multidict-6.7.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:03ede2a6ffbe8ef936b92cb4529f27f42be7f56afcdab5ab739cd5f27fb1cbf9", size = 250156 },
+    { url = "https://files.pythonhosted.org/packages/db/6b/420e173eec5fba721a50e2a9f89eda89d9c98fded1124f8d5c675f7a0c0f/multidict-6.7.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:90efbcf47dbe33dcf643a1e400d67d59abeac5db07dc3f27d6bdeae497a2198c", size = 249742 },
+    { url = "https://files.pythonhosted.org/packages/44/a3/ec5b5bd98f306bc2aa297b8c6f11a46714a56b1e6ef5ebda50a4f5d7c5fb/multidict-6.7.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c4b9bfc148f5a91be9244d6264c53035c8a0dcd2f51f1c3c6e30e30ebaa1c84", size = 262221 },
+    { url = "https://files.pythonhosted.org/packages/cd/f7/e8c0d0da0cd1e28d10e624604e1a36bcc3353aaebdfdc3a43c72bc683a12/multidict-6.7.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:401c5a650f3add2472d1d288c26deebc540f99e2fb83e9525007a74cd2116f1d", size = 258664 },
+    { url = "https://files.pythonhosted.org/packages/52/da/151a44e8016dd33feed44f730bd856a66257c1ee7aed4f44b649fb7edeb3/multidict-6.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:97891f3b1b3ffbded884e2916cacf3c6fc87b66bb0dde46f7357404750559f33", size = 249490 },
+    { url = "https://files.pythonhosted.org/packages/87/af/a3b86bf9630b732897f6fc3f4c4714b90aa4361983ccbdcd6c0339b21b0c/multidict-6.7.1-cp313-cp313-win32.whl", hash = "sha256:e1c5988359516095535c4301af38d8a8838534158f649c05dd1050222321bcb3", size = 41695 },
+    { url = "https://files.pythonhosted.org/packages/b2/35/e994121b0e90e46134673422dd564623f93304614f5d11886b1b3e06f503/multidict-6.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:960c83bf01a95b12b08fd54324a4eb1d5b52c88932b5cba5d6e712bb3ed12eb5", size = 45884 },
+    { url = "https://files.pythonhosted.org/packages/ca/61/42d3e5dbf661242a69c97ea363f2d7b46c567da8eadef8890022be6e2ab0/multidict-6.7.1-cp313-cp313-win_arm64.whl", hash = "sha256:563fe25c678aaba333d5399408f5ec3c383ca5b663e7f774dd179a520b8144df", size = 43122 },
+    { url = "https://files.pythonhosted.org/packages/6d/b3/e6b21c6c4f314bb956016b0b3ef2162590a529b84cb831c257519e7fde44/multidict-6.7.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:c76c4bec1538375dad9d452d246ca5368ad6e1c9039dadcf007ae59c70619ea1", size = 83175 },
+    { url = "https://files.pythonhosted.org/packages/fb/76/23ecd2abfe0957b234f6c960f4ade497f55f2c16aeb684d4ecdbf1c95791/multidict-6.7.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:57b46b24b5d5ebcc978da4ec23a819a9402b4228b8a90d9c656422b4bdd8a963", size = 48460 },
+    { url = "https://files.pythonhosted.org/packages/c4/57/a0ed92b23f3a042c36bc4227b72b97eca803f5f1801c1ab77c8a212d455e/multidict-6.7.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e954b24433c768ce78ab7929e84ccf3422e46deb45a4dc9f93438f8217fa2d34", size = 46930 },
+    { url = "https://files.pythonhosted.org/packages/b5/66/02ec7ace29162e447f6382c495dc95826bf931d3818799bbef11e8f7df1a/multidict-6.7.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3bd231490fa7217cc832528e1cd8752a96f0125ddd2b5749390f7c3ec8721b65", size = 242582 },
+    { url = "https://files.pythonhosted.org/packages/58/18/64f5a795e7677670e872673aca234162514696274597b3708b2c0d276cce/multidict-6.7.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:253282d70d67885a15c8a7716f3a73edf2d635793ceda8173b9ecc21f2fb8292", size = 250031 },
+    { url = "https://files.pythonhosted.org/packages/c8/ed/e192291dbbe51a8290c5686f482084d31bcd9d09af24f63358c3d42fd284/multidict-6.7.1-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0b4c48648d7649c9335cf1927a8b87fa692de3dcb15faa676c6a6f1f1aabda43", size = 228596 },
+    { url = "https://files.pythonhosted.org/packages/1e/7e/3562a15a60cf747397e7f2180b0a11dc0c38d9175a650e75fa1b4d325e15/multidict-6.7.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98bc624954ec4d2c7cb074b8eefc2b5d0ce7d482e410df446414355d158fe4ca", size = 257492 },
+    { url = "https://files.pythonhosted.org/packages/24/02/7d0f9eae92b5249bb50ac1595b295f10e263dd0078ebb55115c31e0eaccd/multidict-6.7.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1b99af4d9eec0b49927b4402bcbb58dea89d3e0db8806a4086117019939ad3dd", size = 255899 },
+    { url = "https://files.pythonhosted.org/packages/00/e3/9b60ed9e23e64c73a5cde95269ef1330678e9c6e34dd4eb6b431b85b5a10/multidict-6.7.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aac4f16b472d5b7dc6f66a0d49dd57b0e0902090be16594dc9ebfd3d17c47e7", size = 247970 },
+    { url = "https://files.pythonhosted.org/packages/3e/06/538e58a63ed5cfb0bd4517e346b91da32fde409d839720f664e9a4ae4f9d/multidict-6.7.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:21f830fe223215dffd51f538e78c172ed7c7f60c9b96a2bf05c4848ad49921c3", size = 245060 },
+    { url = "https://files.pythonhosted.org/packages/b2/2f/d743a3045a97c895d401e9bd29aaa09b94f5cbdf1bd561609e5a6c431c70/multidict-6.7.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f5dd81c45b05518b9aa4da4aa74e1c93d715efa234fd3e8a179df611cc85e5f4", size = 235888 },
+    { url = "https://files.pythonhosted.org/packages/38/83/5a325cac191ab28b63c52f14f1131f3b0a55ba3b9aa65a6d0bf2a9b921a0/multidict-6.7.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:eb304767bca2bb92fb9c5bd33cedc95baee5bb5f6c88e63706533a1c06ad08c8", size = 243554 },
+    { url = "https://files.pythonhosted.org/packages/20/1f/9d2327086bd15da2725ef6aae624208e2ef828ed99892b17f60c344e57ed/multidict-6.7.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c9035dde0f916702850ef66460bc4239d89d08df4d02023a5926e7446724212c", size = 252341 },
+    { url = "https://files.pythonhosted.org/packages/e8/2c/2a1aa0280cf579d0f6eed8ee5211c4f1730bd7e06c636ba2ee6aafda302e/multidict-6.7.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:af959b9beeb66c822380f222f0e0a1889331597e81f1ded7f374f3ecb0fd6c52", size = 246391 },
+    { url = "https://files.pythonhosted.org/packages/e5/03/7ca022ffc36c5a3f6e03b179a5ceb829be9da5783e6fe395f347c0794680/multidict-6.7.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:41f2952231456154ee479651491e94118229844dd7226541788be783be2b5108", size = 243422 },
+    { url = "https://files.pythonhosted.org/packages/dc/1d/b31650eab6c5778aceed46ba735bd97f7c7d2f54b319fa916c0f96e7805b/multidict-6.7.1-cp313-cp313t-win32.whl", hash = "sha256:df9f19c28adcb40b6aae30bbaa1478c389efd50c28d541d76760199fc1037c32", size = 47770 },
+    { url = "https://files.pythonhosted.org/packages/ac/5b/2d2d1d522e51285bd61b1e20df8f47ae1a9d80839db0b24ea783b3832832/multidict-6.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d54ecf9f301853f2c5e802da559604b3e95bb7a3b01a9c295c6ee591b9882de8", size = 53109 },
+    { url = "https://files.pythonhosted.org/packages/3d/a3/cc409ba012c83ca024a308516703cf339bdc4b696195644a7215a5164a24/multidict-6.7.1-cp313-cp313t-win_arm64.whl", hash = "sha256:5a37ca18e360377cfda1d62f5f382ff41f2b8c4ccb329ed974cc2e1643440118", size = 45573 },
+    { url = "https://files.pythonhosted.org/packages/91/cc/db74228a8be41884a567e88a62fd589a913708fcf180d029898c17a9a371/multidict-6.7.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8f333ec9c5eb1b7105e3b84b53141e66ca05a19a605368c55450b6ba208cb9ee", size = 75190 },
+    { url = "https://files.pythonhosted.org/packages/d5/22/492f2246bb5b534abd44804292e81eeaf835388901f0c574bac4eeec73c5/multidict-6.7.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a407f13c188f804c759fc6a9f88286a565c242a76b27626594c133b82883b5c2", size = 44486 },
+    { url = "https://files.pythonhosted.org/packages/f1/4f/733c48f270565d78b4544f2baddc2fb2a245e5a8640254b12c36ac7ac68e/multidict-6.7.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0e161ddf326db5577c3a4cc2d8648f81456e8a20d40415541587a71620d7a7d1", size = 43219 },
+    { url = "https://files.pythonhosted.org/packages/24/bb/2c0c2287963f4259c85e8bcbba9182ced8d7fca65c780c38e99e61629d11/multidict-6.7.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:1e3a8bb24342a8201d178c3b4984c26ba81a577c80d4d525727427460a50c22d", size = 245132 },
+    { url = "https://files.pythonhosted.org/packages/a7/f9/44d4b3064c65079d2467888794dea218d1601898ac50222ab8a9a8094460/multidict-6.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97231140a50f5d447d3164f994b86a0bed7cd016e2682f8650d6a9158e14fd31", size = 252420 },
+    { url = "https://files.pythonhosted.org/packages/8b/13/78f7275e73fa17b24c9a51b0bd9d73ba64bb32d0ed51b02a746eb876abe7/multidict-6.7.1-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6b10359683bd8806a200fd2909e7c8ca3a7b24ec1d8132e483d58e791d881048", size = 233510 },
+    { url = "https://files.pythonhosted.org/packages/4b/25/8167187f62ae3cbd52da7893f58cb036b47ea3fb67138787c76800158982/multidict-6.7.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:283ddac99f7ac25a4acadbf004cb5ae34480bbeb063520f70ce397b281859362", size = 264094 },
+    { url = "https://files.pythonhosted.org/packages/a1/e7/69a3a83b7b030cf283fb06ce074a05a02322359783424d7edf0f15fe5022/multidict-6.7.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:538cec1e18c067d0e6103aa9a74f9e832904c957adc260e61cd9d8cf0c3b3d37", size = 260786 },
+    { url = "https://files.pythonhosted.org/packages/fe/3b/8ec5074bcfc450fe84273713b4b0a0dd47c0249358f5d82eb8104ffe2520/multidict-6.7.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eee46ccb30ff48a1e35bb818cc90846c6be2b68240e42a78599166722cea709", size = 248483 },
+    { url = "https://files.pythonhosted.org/packages/48/5a/d5a99e3acbca0e29c5d9cba8f92ceb15dce78bab963b308ae692981e3a5d/multidict-6.7.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa263a02f4f2dd2d11a7b1bb4362aa7cb1049f84a9235d31adf63f30143469a0", size = 248403 },
+    { url = "https://files.pythonhosted.org/packages/35/48/e58cd31f6c7d5102f2a4bf89f96b9cf7e00b6c6f3d04ecc44417c00a5a3c/multidict-6.7.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:2e1425e2f99ec5bd36c15a01b690a1a2456209c5deed58f95469ffb46039ccbb", size = 240315 },
+    { url = "https://files.pythonhosted.org/packages/94/33/1cd210229559cb90b6786c30676bb0c58249ff42f942765f88793b41fdce/multidict-6.7.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:497394b3239fc6f0e13a78a3e1b61296e72bf1c5f94b4c4eb80b265c37a131cd", size = 245528 },
+    { url = "https://files.pythonhosted.org/packages/64/f2/6e1107d226278c876c783056b7db43d800bb64c6131cec9c8dfb6903698e/multidict-6.7.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:233b398c29d3f1b9676b4b6f75c518a06fcb2ea0b925119fb2c1bc35c05e1601", size = 258784 },
+    { url = "https://files.pythonhosted.org/packages/4d/c1/11f664f14d525e4a1b5327a82d4de61a1db604ab34c6603bb3c2cc63ad34/multidict-6.7.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:93b1818e4a6e0930454f0f2af7dfce69307ca03cdcfb3739bf4d91241967b6c1", size = 251980 },
+    { url = "https://files.pythonhosted.org/packages/e1/9f/75a9ac888121d0c5bbd4ecf4eead45668b1766f6baabfb3b7f66a410e231/multidict-6.7.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:f33dc2a3abe9249ea5d8360f969ec7f4142e7ac45ee7014d8f8d5acddf178b7b", size = 243602 },
+    { url = "https://files.pythonhosted.org/packages/9a/e7/50bf7b004cc8525d80dbbbedfdc7aed3e4c323810890be4413e589074032/multidict-6.7.1-cp314-cp314-win32.whl", hash = "sha256:3ab8b9d8b75aef9df299595d5388b14530839f6422333357af1339443cff777d", size = 40930 },
+    { url = "https://files.pythonhosted.org/packages/e0/bf/52f25716bbe93745595800f36fb17b73711f14da59ed0bb2eba141bc9f0f/multidict-6.7.1-cp314-cp314-win_amd64.whl", hash = "sha256:5e01429a929600e7dab7b166062d9bb54a5eed752384c7384c968c2afab8f50f", size = 45074 },
+    { url = "https://files.pythonhosted.org/packages/97/ab/22803b03285fa3a525f48217963da3a65ae40f6a1b6f6cf2768879e208f9/multidict-6.7.1-cp314-cp314-win_arm64.whl", hash = "sha256:4885cb0e817aef5d00a2e8451d4665c1808378dc27c2705f1bf4ef8505c0d2e5", size = 42471 },
+    { url = "https://files.pythonhosted.org/packages/e0/6d/f9293baa6146ba9507e360ea0292b6422b016907c393e2f63fc40ab7b7b5/multidict-6.7.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:0458c978acd8e6ea53c81eefaddbbee9c6c5e591f41b3f5e8e194780fe026581", size = 82401 },
+    { url = "https://files.pythonhosted.org/packages/7a/68/53b5494738d83558d87c3c71a486504d8373421c3e0dbb6d0db48ad42ee0/multidict-6.7.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:c0abd12629b0af3cf590982c0b413b1e7395cd4ec026f30986818ab95bfaa94a", size = 48143 },
+    { url = "https://files.pythonhosted.org/packages/37/e8/5284c53310dcdc99ce5d66563f6e5773531a9b9fe9ec7a615e9bc306b05f/multidict-6.7.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:14525a5f61d7d0c94b368a42cff4c9a4e7ba2d52e2672a7b23d84dc86fb02b0c", size = 46507 },
+    { url = "https://files.pythonhosted.org/packages/e4/fc/6800d0e5b3875568b4083ecf5f310dcf91d86d52573160834fb4bfcf5e4f/multidict-6.7.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:17307b22c217b4cf05033dabefe68255a534d637c6c9b0cc8382718f87be4262", size = 239358 },
+    { url = "https://files.pythonhosted.org/packages/41/75/4ad0973179361cdf3a113905e6e088173198349131be2b390f9fa4da5fc6/multidict-6.7.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a7e590ff876a3eaf1c02a4dfe0724b6e69a9e9de6d8f556816f29c496046e59", size = 246884 },
+    { url = "https://files.pythonhosted.org/packages/c3/9c/095bb28b5da139bd41fb9a5d5caff412584f377914bd8787c2aa98717130/multidict-6.7.1-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:5fa6a95dfee63893d80a34758cd0e0c118a30b8dcb46372bf75106c591b77889", size = 225878 },
+    { url = "https://files.pythonhosted.org/packages/07/d0/c0a72000243756e8f5a277b6b514fa005f2c73d481b7d9e47cd4568aa2e4/multidict-6.7.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a0543217a6a017692aa6ae5cc39adb75e587af0f3a82288b1492eb73dd6cc2a4", size = 253542 },
+    { url = "https://files.pythonhosted.org/packages/c0/6b/f69da15289e384ecf2a68837ec8b5ad8c33e973aa18b266f50fe55f24b8c/multidict-6.7.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f99fe611c312b3c1c0ace793f92464d8cd263cc3b26b5721950d977b006b6c4d", size = 252403 },
+    { url = "https://files.pythonhosted.org/packages/a2/76/b9669547afa5a1a25cd93eaca91c0da1c095b06b6d2d8ec25b713588d3a1/multidict-6.7.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9004d8386d133b7e6135679424c91b0b854d2d164af6ea3f289f8f2761064609", size = 244889 },
+    { url = "https://files.pythonhosted.org/packages/7e/a9/a50d2669e506dad33cfc45b5d574a205587b7b8a5f426f2fbb2e90882588/multidict-6.7.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e628ef0e6859ffd8273c69412a2465c4be4a9517d07261b33334b5ec6f3c7489", size = 241982 },
+    { url = "https://files.pythonhosted.org/packages/c5/bb/1609558ad8b456b4827d3c5a5b775c93b87878fd3117ed3db3423dfbce1b/multidict-6.7.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:841189848ba629c3552035a6a7f5bf3b02eb304e9fea7492ca220a8eda6b0e5c", size = 232415 },
+    { url = "https://files.pythonhosted.org/packages/d8/59/6f61039d2aa9261871e03ab9dc058a550d240f25859b05b67fd70f80d4b3/multidict-6.7.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ce1bbd7d780bb5a0da032e095c951f7014d6b0a205f8318308140f1a6aba159e", size = 240337 },
+    { url = "https://files.pythonhosted.org/packages/a1/29/fdc6a43c203890dc2ae9249971ecd0c41deaedfe00d25cb6564b2edd99eb/multidict-6.7.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b26684587228afed0d50cf804cc71062cc9c1cdf55051c4c6345d372947b268c", size = 248788 },
+    { url = "https://files.pythonhosted.org/packages/a9/14/a153a06101323e4cf086ecee3faadba52ff71633d471f9685c42e3736163/multidict-6.7.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9f9af11306994335398293f9958071019e3ab95e9a707dc1383a35613f6abcb9", size = 242842 },
+    { url = "https://files.pythonhosted.org/packages/41/5f/604ae839e64a4a6efc80db94465348d3b328ee955e37acb24badbcd24d83/multidict-6.7.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b4938326284c4f1224178a560987b6cf8b4d38458b113d9b8c1db1a836e640a2", size = 240237 },
+    { url = "https://files.pythonhosted.org/packages/5f/60/c3a5187bf66f6fb546ff4ab8fb5a077cbdd832d7b1908d4365c7f74a1917/multidict-6.7.1-cp314-cp314t-win32.whl", hash = "sha256:98655c737850c064a65e006a3df7c997cd3b220be4ec8fe26215760b9697d4d7", size = 48008 },
+    { url = "https://files.pythonhosted.org/packages/0c/f7/addf1087b860ac60e6f382240f64fb99f8bfb532bb06f7c542b83c29ca61/multidict-6.7.1-cp314-cp314t-win_amd64.whl", hash = "sha256:497bde6223c212ba11d462853cfa4f0ae6ef97465033e7dc9940cdb3ab5b48e5", size = 53542 },
+    { url = "https://files.pythonhosted.org/packages/4c/81/4629d0aa32302ef7b2ec65c75a728cc5ff4fa410c50096174c1632e70b3e/multidict-6.7.1-cp314-cp314t-win_arm64.whl", hash = "sha256:2bbd113e0d4af5db41d5ebfe9ccaff89de2120578164f86a5d17d5a576d1e5b2", size = 44719 },
+    { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319 },
+]
+
+[[package]]
+name = "multiprocess"
+version = "0.70.19"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dill" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948 },
+    { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457 },
+    { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281 },
+    { url = "https://files.pythonhosted.org/packages/7f/74/d2c27e03cb84251dfe7249b8e82923643c6d48fa4883b9476b025e7dc7eb/multiprocess-0.70.19-py313-none-any.whl", hash = "sha256:8d5eb4ec5017ba2fab4e34a747c6d2c2b6fecfe9e7236e77988db91580ada952", size = 156414 },
+    { url = "https://files.pythonhosted.org/packages/a0/61/af9115673a5870fd885247e2f1b68c4f1197737da315b520a91c757a861a/multiprocess-0.70.19-py314-none-any.whl", hash = "sha256:e8cc7fbdff15c0613f0a1f1f8744bef961b0a164c0ca29bdff53e9d2d93c5e5f", size = 160318 },
+    { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477 },
+]
+
+[[package]]
+name = "mypy-extensions"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963 },
+]
+
+[[package]]
+name = "nest-asyncio"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195 },
+]
+
+[[package]]
+name = "networkx"
+version = "3.6.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504 },
+]
+
+[[package]]
+name = "nltk"
+version = "3.9.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "joblib" },
+    { name = "regex" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/74/a1/b3b4adf15585a5bc4c357adde150c01ebeeb642173ded4d871e89468767c/nltk-3.9.4.tar.gz", hash = "sha256:ed03bc098a40481310320808b2db712d95d13ca65b27372f8a403949c8b523d0", size = 2946864 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/91/04e965f8e717ba0ab4bdca5c112deeab11c9e750d94c4d4602f050295d39/nltk-3.9.4-py3-none-any.whl", hash = "sha256:f2fa301c3a12718ce4a0e9305c5675299da5ad9e26068218b69d692fda84828f", size = 1552087 },
+]
+
+[[package]]
+name = "numpy"
+version = "2.4.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272 },
+    { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573 },
+    { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782 },
+    { url = "https://files.pythonhosted.org/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8", size = 6552038 },
+    { url = "https://files.pythonhosted.org/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121", size = 15670666 },
+    { url = "https://files.pythonhosted.org/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e", size = 16645480 },
+    { url = "https://files.pythonhosted.org/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44", size = 17020036 },
+    { url = "https://files.pythonhosted.org/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d", size = 18368643 },
+    { url = "https://files.pythonhosted.org/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827", size = 5961117 },
+    { url = "https://files.pythonhosted.org/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a", size = 12320584 },
+    { url = "https://files.pythonhosted.org/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec", size = 10221450 },
+    { url = "https://files.pythonhosted.org/packages/14/1d/d0a583ce4fefcc3308806a749a536c201ed6b5ad6e1322e227ee4848979d/numpy-2.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:08f2e31ed5e6f04b118e49821397f12767934cfdd12a1ce86a058f91e004ee50", size = 16684933 },
+    { url = "https://files.pythonhosted.org/packages/c1/62/2b7a48fbb745d344742c0277f01286dead15f3f68e4f359fbfcf7b48f70f/numpy-2.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e823b8b6edc81e747526f70f71a9c0a07ac4e7ad13020aa736bb7c9d67196115", size = 14694532 },
+    { url = "https://files.pythonhosted.org/packages/e5/87/499737bfba066b4a3bebff24a8f1c5b2dee410b209bc6668c9be692580f0/numpy-2.4.4-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4a19d9dba1a76618dd86b164d608566f393f8ec6ac7c44f0cc879011c45e65af", size = 5199661 },
+    { url = "https://files.pythonhosted.org/packages/cd/da/464d551604320d1491bc345efed99b4b7034143a85787aab78d5691d5a0e/numpy-2.4.4-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:d2a8490669bfe99a233298348acc2d824d496dee0e66e31b66a6022c2ad74a5c", size = 6547539 },
+    { url = "https://files.pythonhosted.org/packages/7d/90/8d23e3b0dafd024bf31bdec225b3bb5c2dbfa6912f8a53b8659f21216cbf/numpy-2.4.4-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45dbed2ab436a9e826e302fcdcbe9133f9b0006e5af7168afb8963a6520da103", size = 15668806 },
+    { url = "https://files.pythonhosted.org/packages/d1/73/a9d864e42a01896bb5974475438f16086be9ba1f0d19d0bb7a07427c4a8b/numpy-2.4.4-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c901b15172510173f5cb310eae652908340f8dede90fff9e3bf6c0d8dfd92f83", size = 16632682 },
+    { url = "https://files.pythonhosted.org/packages/34/fb/14570d65c3bde4e202a031210475ae9cde9b7686a2e7dc97ee67d2833b35/numpy-2.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:99d838547ace2c4aace6c4f76e879ddfe02bb58a80c1549928477862b7a6d6ed", size = 17019810 },
+    { url = "https://files.pythonhosted.org/packages/8a/77/2ba9d87081fd41f6d640c83f26fb7351e536b7ce6dd9061b6af5904e8e46/numpy-2.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0aec54fd785890ecca25a6003fd9a5aed47ad607bbac5cd64f836ad8666f4959", size = 18357394 },
+    { url = "https://files.pythonhosted.org/packages/a2/23/52666c9a41708b0853fa3b1a12c90da38c507a3074883823126d4e9d5b30/numpy-2.4.4-cp313-cp313-win32.whl", hash = "sha256:07077278157d02f65c43b1b26a3886bce886f95d20aabd11f87932750dfb14ed", size = 5959556 },
+    { url = "https://files.pythonhosted.org/packages/57/fb/48649b4971cde70d817cf97a2a2fdc0b4d8308569f1dd2f2611959d2e0cf/numpy-2.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:5c70f1cc1c4efbe316a572e2d8b9b9cc44e89b95f79ca3331553fbb63716e2bf", size = 12317311 },
+    { url = "https://files.pythonhosted.org/packages/ba/d8/11490cddd564eb4de97b4579ef6bfe6a736cc07e94c1598590ae25415e01/numpy-2.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:ef4059d6e5152fa1a39f888e344c73fdc926e1b2dd58c771d67b0acfbf2aa67d", size = 10222060 },
+    { url = "https://files.pythonhosted.org/packages/99/5d/dab4339177a905aad3e2221c915b35202f1ec30d750dd2e5e9d9a72b804b/numpy-2.4.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4bbc7f303d125971f60ec0aaad5e12c62d0d2c925f0ab1273debd0e4ba37aba5", size = 14822302 },
+    { url = "https://files.pythonhosted.org/packages/eb/e4/0564a65e7d3d97562ed6f9b0fd0fb0a6f559ee444092f105938b50043876/numpy-2.4.4-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:4d6d57903571f86180eb98f8f0c839fa9ebbfb031356d87f1361be91e433f5b7", size = 5327407 },
+    { url = "https://files.pythonhosted.org/packages/29/8d/35a3a6ce5ad371afa58b4700f1c820f8f279948cca32524e0a695b0ded83/numpy-2.4.4-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:4636de7fd195197b7535f231b5de9e4b36d2c440b6e566d2e4e4746e6af0ca93", size = 6647631 },
+    { url = "https://files.pythonhosted.org/packages/f4/da/477731acbd5a58a946c736edfdabb2ac5b34c3d08d1ba1a7b437fa0884df/numpy-2.4.4-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad2e2ef14e0b04e544ea2fa0a36463f847f113d314aa02e5b402fdf910ef309e", size = 15727691 },
+    { url = "https://files.pythonhosted.org/packages/e6/db/338535d9b152beabeb511579598418ba0212ce77cf9718edd70262cc4370/numpy-2.4.4-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a285b3b96f951841799528cd1f4f01cd70e7e0204b4abebac9463eecfcf2a40", size = 16681241 },
+    { url = "https://files.pythonhosted.org/packages/e2/a9/ad248e8f58beb7a0219b413c9c7d8151c5d285f7f946c3e26695bdbbe2df/numpy-2.4.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f8474c4241bc18b750be2abea9d7a9ec84f46ef861dbacf86a4f6e043401f79e", size = 17085767 },
+    { url = "https://files.pythonhosted.org/packages/b5/1a/3b88ccd3694681356f70da841630e4725a7264d6a885c8d442a697e1146b/numpy-2.4.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4e874c976154687c1f71715b034739b45c7711bec81db01914770373d125e392", size = 18403169 },
+    { url = "https://files.pythonhosted.org/packages/c2/c9/fcfd5d0639222c6eac7f304829b04892ef51c96a75d479214d77e3ce6e33/numpy-2.4.4-cp313-cp313t-win32.whl", hash = "sha256:9c585a1790d5436a5374bac930dad6ed244c046ed91b2b2a3634eb2971d21008", size = 6083477 },
+    { url = "https://files.pythonhosted.org/packages/d5/e3/3938a61d1c538aaec8ed6fd6323f57b0c2d2d2219512434c5c878db76553/numpy-2.4.4-cp313-cp313t-win_amd64.whl", hash = "sha256:93e15038125dc1e5345d9b5b68aa7f996ec33b98118d18c6ca0d0b7d6198b7e8", size = 12457487 },
+    { url = "https://files.pythonhosted.org/packages/97/6a/7e345032cc60501721ef94e0e30b60f6b0bd601f9174ebd36389a2b86d40/numpy-2.4.4-cp313-cp313t-win_arm64.whl", hash = "sha256:0dfd3f9d3adbe2920b68b5cd3d51444e13a10792ec7154cd0a2f6e74d4ab3233", size = 10292002 },
+    { url = "https://files.pythonhosted.org/packages/6e/06/c54062f85f673dd5c04cbe2f14c3acb8c8b95e3384869bb8cc9bff8cb9df/numpy-2.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f169b9a863d34f5d11b8698ead99febeaa17a13ca044961aa8e2662a6c7766a0", size = 16684353 },
+    { url = "https://files.pythonhosted.org/packages/4c/39/8a320264a84404c74cc7e79715de85d6130fa07a0898f67fb5cd5bd79908/numpy-2.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2483e4584a1cb3092da4470b38866634bafb223cbcd551ee047633fd2584599a", size = 14704914 },
+    { url = "https://files.pythonhosted.org/packages/91/fb/287076b2614e1d1044235f50f03748f31fa287e3dbe6abeb35cdfa351eca/numpy-2.4.4-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2d19e6e2095506d1736b7d80595e0f252d76b89f5e715c35e06e937679ea7d7a", size = 5210005 },
+    { url = "https://files.pythonhosted.org/packages/63/eb/fcc338595309910de6ecabfcef2419a9ce24399680bfb149421fa2df1280/numpy-2.4.4-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:6a246d5914aa1c820c9443ddcee9c02bec3e203b0c080349533fae17727dfd1b", size = 6544974 },
+    { url = "https://files.pythonhosted.org/packages/44/5d/e7e9044032a716cdfaa3fba27a8e874bf1c5f1912a1ddd4ed071bf8a14a6/numpy-2.4.4-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:989824e9faf85f96ec9c7761cd8d29c531ad857bfa1daa930cba85baaecf1a9a", size = 15684591 },
+    { url = "https://files.pythonhosted.org/packages/98/7c/21252050676612625449b4807d6b695b9ce8a7c9e1c197ee6216c8a65c7c/numpy-2.4.4-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:27a8d92cd10f1382a67d7cf4db7ce18341b66438bdd9f691d7b0e48d104c2a9d", size = 16637700 },
+    { url = "https://files.pythonhosted.org/packages/b1/29/56d2bbef9465db24ef25393383d761a1af4f446a1df9b8cded4fe3a5a5d7/numpy-2.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e44319a2953c738205bf3354537979eaa3998ed673395b964c1176083dd46252", size = 17035781 },
+    { url = "https://files.pythonhosted.org/packages/e3/2b/a35a6d7589d21f44cea7d0a98de5ddcbb3d421b2622a5c96b1edf18707c3/numpy-2.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e892aff75639bbef0d2a2cfd55535510df26ff92f63c92cd84ef8d4ba5a5557f", size = 18362959 },
+    { url = "https://files.pythonhosted.org/packages/64/c9/d52ec581f2390e0f5f85cbfd80fb83d965fc15e9f0e1aec2195faa142cde/numpy-2.4.4-cp314-cp314-win32.whl", hash = "sha256:1378871da56ca8943c2ba674530924bb8ca40cd228358a3b5f302ad60cf875fc", size = 6008768 },
+    { url = "https://files.pythonhosted.org/packages/fa/22/4cc31a62a6c7b74a8730e31a4274c5dc80e005751e277a2ce38e675e4923/numpy-2.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:715d1c092715954784bc79e1174fc2a90093dc4dc84ea15eb14dad8abdcdeb74", size = 12449181 },
+    { url = "https://files.pythonhosted.org/packages/70/2e/14cda6f4d8e396c612d1bf97f22958e92148801d7e4f110cabebdc0eef4b/numpy-2.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:2c194dd721e54ecad9ad387c1d35e63dce5c4450c6dc7dd5611283dda239aabb", size = 10496035 },
+    { url = "https://files.pythonhosted.org/packages/b1/e8/8fed8c8d848d7ecea092dc3469643f9d10bc3a134a815a3b033da1d2039b/numpy-2.4.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2aa0613a5177c264ff5921051a5719d20095ea586ca88cc802c5c218d1c67d3e", size = 14824958 },
+    { url = "https://files.pythonhosted.org/packages/05/1a/d8007a5138c179c2bf33ef44503e83d70434d2642877ee8fbb230e7c0548/numpy-2.4.4-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:42c16925aa5a02362f986765f9ebabf20de75cdefdca827d14315c568dcab113", size = 5330020 },
+    { url = "https://files.pythonhosted.org/packages/99/64/ffb99ac6ae93faf117bcbd5c7ba48a7f45364a33e8e458545d3633615dda/numpy-2.4.4-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:874f200b2a981c647340f841730fc3a2b54c9d940566a3c4149099591e2c4c3d", size = 6650758 },
+    { url = "https://files.pythonhosted.org/packages/6e/6e/795cc078b78a384052e73b2f6281ff7a700e9bf53bcce2ee579d4f6dd879/numpy-2.4.4-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9b39d38a9bd2ae1becd7eac1303d031c5c110ad31f2b319c6e7d98b135c934d", size = 15729948 },
+    { url = "https://files.pythonhosted.org/packages/5f/86/2acbda8cc2af5f3d7bfc791192863b9e3e19674da7b5e533fded124d1299/numpy-2.4.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b268594bccac7d7cf5844c7732e3f20c50921d94e36d7ec9b79e9857694b1b2f", size = 16679325 },
+    { url = "https://files.pythonhosted.org/packages/bc/59/cafd83018f4aa55e0ac6fa92aa066c0a1877b77a615ceff1711c260ffae8/numpy-2.4.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ac6b31e35612a26483e20750126d30d0941f949426974cace8e6b5c58a3657b0", size = 17084883 },
+    { url = "https://files.pythonhosted.org/packages/f0/85/a42548db84e65ece46ab2caea3d3f78b416a47af387fcbb47ec28e660dc2/numpy-2.4.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8e3ed142f2728df44263aaf5fb1f5b0b99f4070c553a0d7f033be65338329150", size = 18403474 },
+    { url = "https://files.pythonhosted.org/packages/ed/ad/483d9e262f4b831000062e5d8a45e342166ec8aaa1195264982bca267e62/numpy-2.4.4-cp314-cp314t-win32.whl", hash = "sha256:dddbbd259598d7240b18c9d87c56a9d2fb3b02fe266f49a7c101532e78c1d871", size = 6155500 },
+    { url = "https://files.pythonhosted.org/packages/c7/03/2fc4e14c7bd4ff2964b74ba90ecb8552540b6315f201df70f137faa5c589/numpy-2.4.4-cp314-cp314t-win_amd64.whl", hash = "sha256:a7164afb23be6e37ad90b2f10426149fd75aee07ca55653d2aa41e66c4ef697e", size = 12637755 },
+    { url = "https://files.pythonhosted.org/packages/58/78/548fb8e07b1a341746bfbecb32f2c268470f45fa028aacdbd10d9bc73aab/numpy-2.4.4-cp314-cp314t-win_arm64.whl", hash = "sha256:ba203255017337d39f89bdd58417f03c4426f12beed0440cfd933cb15f8669c7", size = 10566643 },
+]
+
+[[package]]
+name = "packaging"
+version = "26.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195 },
+]
+
+[[package]]
+name = "pandas"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "python-dateutil" },
+    { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f8/87/4341c6252d1c47b08768c3d25ac487362bf403f0313ddae4a2a26c9b1b4c/pandas-3.0.3.tar.gz", hash = "sha256:696a4a00a2a2a35d4e5deb3fc946641b96c944f02230e4f76137fe35d806c4fc", size = 4651414 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/24/f1/392f8c5bfc16f66a0d2d41561c01627c228fe7ed2a0d056ef11315042570/pandas-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fed2ff7fd9779120e388e285fc029bd5cf9490cdd2e4166a9ee22c0e49a9ab09", size = 10357846 },
+    { url = "https://files.pythonhosted.org/packages/cf/3d/b16412745651e855f357e5e66930248688378853a6e2698a214e331fba1f/pandas-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b168fc218fd80a6cbdbdbc1a97ddc7889ed057d7eb45f50d866ceab5f39904c4", size = 9899550 },
+    { url = "https://files.pythonhosted.org/packages/31/a8/fa2535168fffcedf67f4f6de28d2dd903a747ca7c8ea6989451aaeb3a92f/pandas-3.0.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0383c72c75cdcca61a9e116e611143902dbfd08bff356829c2f6d1cf40a9ca8c", size = 10412965 },
+    { url = "https://files.pythonhosted.org/packages/65/b6/09b01cdbc15224e2850365192d17b7bdebb8bdbd8780ed221fcdf0d9a515/pandas-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6dc0b3fd2169c9157deed50b4d519553a3655c8c6a96027136d654592be973a9", size = 10894600 },
+    { url = "https://files.pythonhosted.org/packages/c9/a4/2eb28f2fccb4ced4a2c79ab2a5dee9ade1ebf44922ebad6fea158c9f95d4/pandas-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7e65d5407dc0b394f509699650e4a2ec01c0514f21850f453fa60f3be79a5dbf", size = 11422824 },
+    { url = "https://files.pythonhosted.org/packages/f8/45/830bb57f533a4604b355e07edcb8ea18cf88b5f94e5fca92f27052d7c597/pandas-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f8894dc474d648fe7b6ff0ca9b0bd73950d19952bc1a6534540762c5d79d305c", size = 11950889 },
+    { url = "https://files.pythonhosted.org/packages/b9/c5/fc1b368f303087d20e8c9bf3d6ceb186263cfac0ade735cd938538bea839/pandas-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:c7be265b62cef88e253a941e4698604973736dcfe242fdb5198f0f7bc473cdcc", size = 9755463 },
+    { url = "https://files.pythonhosted.org/packages/86/bd/fda8f9705b1b09c6ebe14bfc0fa0e4ec8584d54ea673628f157ff55131af/pandas-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:557409bc4178e70ee8d9ddb494798e51ebf6ea59330f6be22c51bab2a7db6c49", size = 9066158 },
+    { url = "https://files.pythonhosted.org/packages/c5/90/62d8302883c44308c477e222c3daf7c813a34c8e96985882fbd53d964352/pandas-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:67b3b64c11910cfa29f4e94a14d3bff9ee693b6fc76055e7cad549cee0aec5fa", size = 10331071 },
+    { url = "https://files.pythonhosted.org/packages/7f/ae/6a6493c783a101f165e4356953ba3c74d6f77f0042fa7d753da9dfbb640c/pandas-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:39436b377d56d2a2e52d0395bdbee171f01068e99af5250509aceeb929f765c7", size = 9875690 },
+    { url = "https://files.pythonhosted.org/packages/62/7c/5df8e9f56c69a2769fbe9382a5ef8f2658c007e376434e1e2cbb57ad895f/pandas-3.0.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4be06d68f9ddcfc645b87534911da79a8fbffc7573c80e0edcf42a5020624d8", size = 10381634 },
+    { url = "https://files.pythonhosted.org/packages/99/68/1237369725aa617bb358263d535803e3053fdbc593513ec5ed9c9896b5b6/pandas-3.0.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a4eeb6830daf35a71cc09649bd823e2b542dac246cdee9614c6e4bd65028cd6a", size = 10891243 },
+    { url = "https://files.pythonhosted.org/packages/25/93/77d108e8af7222b4a503ebde0e30215b1c2e4f8e53a526431890f22d5586/pandas-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1928e07221f82db493cd4af1e23c1bfca524a19a4699887975bff68f49a72bfb", size = 11388659 },
+    { url = "https://files.pythonhosted.org/packages/d0/bd/eff5b4399f332ac386c853f6cd2bd3fa2ca0061b9f36ecd9c4d7c4265649/pandas-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51b1fe551acb77dac643c6fda86084d8d446c10fe64b06a9cc29c4cc8540e7f2", size = 11942880 },
+    { url = "https://files.pythonhosted.org/packages/2c/20/559ace4200982c3887d0b86bfd0d856a2143ef8ddab63cc07934951a964c/pandas-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:a82d532a3351d435432cd913edbccaf8b8e01d4dd0e5ced5a8d2e8ecd94c7e44", size = 9757091 },
+    { url = "https://files.pythonhosted.org/packages/3a/66/69055a09fe200f29f922a3eeec4804611900b95f52d932ece3393c3c0c19/pandas-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:275c14e0fce14a2ec20eee474aecd305478ea3c1e6f6a9d8fe219a165542717e", size = 9057282 },
+    { url = "https://files.pythonhosted.org/packages/57/0e/efe801b0e6811e8e650cd21b7f2608e30f08a7067e2bf6e8752b0d56ee3c/pandas-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:46997386d528eb40376ecd6b033cf4a8a1e5282580f68f43de875b78cba2199d", size = 10767016 },
+    { url = "https://files.pythonhosted.org/packages/ea/dc/eb55135a1d5f0f0519f28da1f609a206d2cad1f9c35c32d51e38dd7261ae/pandas-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:261e308dfb22448384b7580cf719d2f998fe2966c92893c3e77d14008af1f066", size = 10420210 },
+    { url = "https://files.pythonhosted.org/packages/c6/3e/b1d5d955ce33ffecb407465a60bc32769d74fcf68224b7ae67ae11d4dea4/pandas-3.0.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd1a5d1def6a46002e964510bdc67c368aa0951df5d1d9f8365336f5a1f490cd", size = 10336126 },
+    { url = "https://files.pythonhosted.org/packages/f5/76/a01261711ab60a22d71b862f0de20e4c504bf80457270ad8cb42110f6abc/pandas-3.0.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d72828c20c6d6e83e1e22a6a3b47b326b71664112fa9705dcbccfd7a39b62085", size = 10728051 },
+    { url = "https://files.pythonhosted.org/packages/e9/21/ea191195e587b18cf682e97f433f81b2d0fbe341380e80a3e0d6e4403c8e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d26cbe1fcfc12e8fd900e2454163e466b2d3af84f7c75481df7683ffc073d870", size = 11350796 },
+    { url = "https://files.pythonhosted.org/packages/64/69/f0eaaf54939f0e8c6768fd06be9af2cef9b36048b96dfb9e1b2c685a807e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e91cec1879ada0624fc3dc9953c5cbd60208e59c0db28f540c5d6d47502422f", size = 11799741 },
+    { url = "https://files.pythonhosted.org/packages/45/a4/865e0e510cae5fc2194de4db28be638952de942571ba9125934fd9c01d47/pandas-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:08d789b41f87e0905880e293cedf6197ce71fe67cc081358b1e148a491b9bd13", size = 10499958 },
+    { url = "https://files.pythonhosted.org/packages/86/54/effdcc3c0ff7a08037889200e148ebe94c16c4f653be078c7b3675955df1/pandas-3.0.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3650109c0f22879df8bd6179ab9ee3d7f1d1d4e7e0094a3f0032d9f51e2e64ac", size = 10336065 },
+    { url = "https://files.pythonhosted.org/packages/68/10/bf2d6738d72748b961a3751ab89522d58c54efc36a8e1a12161216cd45cf/pandas-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bab900348131a7db1f69a7309ef141fd5680f1487094193bcbbb61791573bf8f", size = 9926101 },
+    { url = "https://files.pythonhosted.org/packages/ae/e9/e35cf11c8a136e757b956f5f0efdcaa50aecde85ea055f1898dfc68262f3/pandas-3.0.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba7e08b9ac1d54569cd1e256e3668975ed624d6826f7b68df0342b012007bddb", size = 10457553 },
+    { url = "https://files.pythonhosted.org/packages/58/3b/1cdec6772bdbaf7b25dab360c59f03cadf05492dd724c6540af905389b07/pandas-3.0.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d71c63ae4ebdbf70209742096f1fc46a83a0613c99d4b23766cced9ff8cd62a", size = 10914065 },
+    { url = "https://files.pythonhosted.org/packages/c4/c2/1ef644445fcd72e3627bceec77e3560636f87ddce4ed841afe76b83b5bf9/pandas-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e3a2ec42c98ffa2565a67e08e218d06d72576d758d90facb7c00805194d8f360", size = 11459188 },
+    { url = "https://files.pythonhosted.org/packages/7e/49/4d8d4f42cbc9c4adc7a1870f269c02cbd6cd40d059622c06fb298addcbad/pandas-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:335f62418ed562cfc3c49e9e196375c28b729dcef8543abf4f9438e381bf3c76", size = 11982966 },
+    { url = "https://files.pythonhosted.org/packages/38/55/792619469bab9882d8bbd5865d45a72f6478762d04a9af4bf0d08c503e95/pandas-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:3c20a521bbb85902f79f7270c80a59e1b5452d96d170c034f207181870f97ac5", size = 9876755 },
+    { url = "https://files.pythonhosted.org/packages/2a/af/33c469653b0ba03b50c3a98192d4c07f0c75c66b263ceb097fce0ee97d31/pandas-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:a2d2dff8a04f3917b55ab3910c32990f8ddf7eceba114947838cefa976a68977", size = 9198658 },
+    { url = "https://files.pythonhosted.org/packages/a2/fa/b8c257bd76b8bd060c3a9151c1fca05e9b9c5e3af5d0f549c0356f6d143d/pandas-3.0.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:0d589105b3c14645af1738ff279b2995102d8f7a03b0a66dc8d95550eb513e04", size = 10787242 },
+    { url = "https://files.pythonhosted.org/packages/54/eb/f19206ffb0bf1919002969aa448b4702c6594845156a6f8050674855aac3/pandas-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:13fc1e853d9e04743d11ba75a985ccbc2a317fe07d8af61e445a6fd24dacd6a6", size = 10436369 },
+    { url = "https://files.pythonhosted.org/packages/fd/24/c7c39fb4fe22b71a0c2d78bf0c585c600092d85f94f086d2b3b2f6ca27e2/pandas-3.0.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:819959dab7bbd0049c15623fbac4e29a191b9528160a61fb1032242d8ced2d9c", size = 10358306 },
+    { url = "https://files.pythonhosted.org/packages/16/ec/dd2a9eb7fa1204df88c0864164e35b228ac581062ac612ba0a67fd812e4c/pandas-3.0.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:60ae316d3fd75d1858d450d0db0103ea2be3e7d4a95ec2f064f7e2ae63f7b028", size = 10758394 },
+    { url = "https://files.pythonhosted.org/packages/95/6e/00c61ea8e85b4f6d8d35e11852a1a4998fc7fafc91c6a602d1cc9c972d64/pandas-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bd3a518890b400d32f9023722dc9a9a5c969f00b415419a3c06c043f09bb5d7d", size = 11375717 },
+    { url = "https://files.pythonhosted.org/packages/31/89/8fc1c268969fac43688d65fd92e67df24bd128d53cb4d2eee534cd307399/pandas-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c39be2d709d01fa972a0cabc522389fceca4f3969332ba25a7d6c5802cf976a", size = 11828897 },
+    { url = "https://files.pythonhosted.org/packages/56/3b/e7d20dea247a3e6dc0bd8a6953854afbedc03951def4e7371e05e7263e25/pandas-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4db8c527972a821cf5286b40ccc57642a39bc62e62022b42f99f8a67fca8c3a1", size = 10900855 },
+    { url = "https://files.pythonhosted.org/packages/0f/54/68a0978d1ef8502b8492099beaa6e7a0c1b32e3b5d4f677f5810cb08711c/pandas-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b2c95f8bfc1ee412bf482605d7bfd30c12d1d26bd59fdd91efeef1d4718decb1", size = 9466464 },
+]
+
+[[package]]
+name = "pillow"
+version = "12.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/58/be/7482c8a5ebebbc6470b3eb791812fff7d5e0216c2be3827b30b8bb6603ed/pillow-12.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2d192a155bbcec180f8564f693e6fd9bccff5a7af9b32e2e4bf8c9c69dbad6b5", size = 5308279 },
+    { url = "https://files.pythonhosted.org/packages/d8/95/0a351b9289c2b5cbde0bacd4a83ebc44023e835490a727b2a3bd60ddc0f4/pillow-12.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3f40b3c5a968281fd507d519e444c35f0ff171237f4fdde090dd60699458421", size = 4695490 },
+    { url = "https://files.pythonhosted.org/packages/de/af/4e8e6869cbed569d43c416fad3dc4ecb944cb5d9492defaed89ddd6fe871/pillow-12.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:03e7e372d5240cc23e9f07deca4d775c0817bffc641b01e9c3af208dbd300987", size = 6284462 },
+    { url = "https://files.pythonhosted.org/packages/e9/9e/c05e19657fd57841e476be1ab46c4d501bffbadbafdc31a6d665f8b737b6/pillow-12.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b86024e52a1b269467a802258c25521e6d742349d760728092e1bc2d135b4d76", size = 8094744 },
+    { url = "https://files.pythonhosted.org/packages/2b/54/1789c455ed10176066b6e7e6da1b01e50e36f94ba584dc68d9eebfe9156d/pillow-12.2.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7371b48c4fa448d20d2714c9a1f775a81155050d383333e0a6c15b1123dda005", size = 6398371 },
+    { url = "https://files.pythonhosted.org/packages/43/e3/fdc657359e919462369869f1c9f0e973f353f9a9ee295a39b1fea8ee1a77/pillow-12.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62f5409336adb0663b7caa0da5c7d9e7bdbaae9ce761d34669420c2a801b2780", size = 7087215 },
+    { url = "https://files.pythonhosted.org/packages/8b/f8/2f6825e441d5b1959d2ca5adec984210f1ec086435b0ed5f52c19b3b8a6e/pillow-12.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:01afa7cf67f74f09523699b4e88c73fb55c13346d212a59a2db1f86b0a63e8c5", size = 6509783 },
+    { url = "https://files.pythonhosted.org/packages/67/f9/029a27095ad20f854f9dba026b3ea6428548316e057e6fc3545409e86651/pillow-12.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc3d34d4a8fbec3e88a79b92e5465e0f9b842b628675850d860b8bd300b159f5", size = 7212112 },
+    { url = "https://files.pythonhosted.org/packages/be/42/025cfe05d1be22dbfdb4f264fe9de1ccda83f66e4fc3aac94748e784af04/pillow-12.2.0-cp312-cp312-win32.whl", hash = "sha256:58f62cc0f00fd29e64b29f4fd923ffdb3859c9f9e6105bfc37ba1d08994e8940", size = 6378489 },
+    { url = "https://files.pythonhosted.org/packages/5d/7b/25a221d2c761c6a8ae21bfa3874988ff2583e19cf8a27bf2fee358df7942/pillow-12.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7f84204dee22a783350679a0333981df803dac21a0190d706a50475e361c93f5", size = 7084129 },
+    { url = "https://files.pythonhosted.org/packages/10/e1/542a474affab20fd4a0f1836cb234e8493519da6b76899e30bcc5d990b8b/pillow-12.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:af73337013e0b3b46f175e79492d96845b16126ddf79c438d7ea7ff27783a414", size = 2463612 },
+    { url = "https://files.pythonhosted.org/packages/4a/01/53d10cf0dbad820a8db274d259a37ba50b88b24768ddccec07355382d5ad/pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8297651f5b5679c19968abefd6bb84d95fe30ef712eb1b2d9b2d31ca61267f4c", size = 4100837 },
+    { url = "https://files.pythonhosted.org/packages/0f/98/f3a6657ecb698c937f6c76ee564882945f29b79bad496abcba0e84659ec5/pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:50d8520da2a6ce0af445fa6d648c4273c3eeefbc32d7ce049f22e8b5c3daecc2", size = 4176528 },
+    { url = "https://files.pythonhosted.org/packages/69/bc/8986948f05e3ea490b8442ea1c1d4d990b24a7e43d8a51b2c7d8b1dced36/pillow-12.2.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:766cef22385fa1091258ad7e6216792b156dc16d8d3fa607e7545b2b72061f1c", size = 3640401 },
+    { url = "https://files.pythonhosted.org/packages/34/46/6c717baadcd62bc8ed51d238d521ab651eaa74838291bda1f86fe1f864c9/pillow-12.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5d2fd0fa6b5d9d1de415060363433f28da8b1526c1c129020435e186794b3795", size = 5308094 },
+    { url = "https://files.pythonhosted.org/packages/71/43/905a14a8b17fdb1ccb58d282454490662d2cb89a6bfec26af6d3520da5ec/pillow-12.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56b25336f502b6ed02e889f4ece894a72612fe885889a6e8c4c80239ff6e5f5f", size = 4695402 },
+    { url = "https://files.pythonhosted.org/packages/73/dd/42107efcb777b16fa0393317eac58f5b5cf30e8392e266e76e51cff28c3d/pillow-12.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f1c943e96e85df3d3478f7b691f229887e143f81fedab9b20205349ab04d73ed", size = 6280005 },
+    { url = "https://files.pythonhosted.org/packages/a8/68/b93e09e5e8549019e61acf49f65b1a8530765a7f812c77a7461bca7e4494/pillow-12.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:03f6fab9219220f041c74aeaa2939ff0062bd5c364ba9ce037197f4c6d498cd9", size = 8090669 },
+    { url = "https://files.pythonhosted.org/packages/4b/6e/3ccb54ce8ec4ddd1accd2d89004308b7b0b21c4ac3d20fa70af4760a4330/pillow-12.2.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cdfebd752ec52bf5bb4e35d9c64b40826bc5b40a13df7c3cda20a2c03a0f5ed", size = 6395194 },
+    { url = "https://files.pythonhosted.org/packages/67/ee/21d4e8536afd1a328f01b359b4d3997b291ffd35a237c877b331c1c3b71c/pillow-12.2.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eedf4b74eda2b5a4b2b2fb4c006d6295df3bf29e459e198c90ea48e130dc75c3", size = 7082423 },
+    { url = "https://files.pythonhosted.org/packages/78/5f/e9f86ab0146464e8c133fe85df987ed9e77e08b29d8d35f9f9f4d6f917ba/pillow-12.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:00a2865911330191c0b818c59103b58a5e697cae67042366970a6b6f1b20b7f9", size = 6505667 },
+    { url = "https://files.pythonhosted.org/packages/ed/1e/409007f56a2fdce61584fd3acbc2bbc259857d555196cedcadc68c015c82/pillow-12.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1e1757442ed87f4912397c6d35a0db6a7b52592156014706f17658ff58bbf795", size = 7208580 },
+    { url = "https://files.pythonhosted.org/packages/23/c4/7349421080b12fb35414607b8871e9534546c128a11965fd4a7002ccfbee/pillow-12.2.0-cp313-cp313-win32.whl", hash = "sha256:144748b3af2d1b358d41286056d0003f47cb339b8c43a9ea42f5fea4d8c66b6e", size = 6375896 },
+    { url = "https://files.pythonhosted.org/packages/3f/82/8a3739a5e470b3c6cbb1d21d315800d8e16bff503d1f16b03a4ec3212786/pillow-12.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:390ede346628ccc626e5730107cde16c42d3836b89662a115a921f28440e6a3b", size = 7081266 },
+    { url = "https://files.pythonhosted.org/packages/c3/25/f968f618a062574294592f668218f8af564830ccebdd1fa6200f598e65c5/pillow-12.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:8023abc91fba39036dbce14a7d6535632f99c0b857807cbbbf21ecc9f4717f06", size = 2463508 },
+    { url = "https://files.pythonhosted.org/packages/4d/a4/b342930964e3cb4dce5038ae34b0eab4653334995336cd486c5a8c25a00c/pillow-12.2.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:042db20a421b9bafecc4b84a8b6e444686bd9d836c7fd24542db3e7df7baad9b", size = 5309927 },
+    { url = "https://files.pythonhosted.org/packages/9f/de/23198e0a65a9cf06123f5435a5d95cea62a635697f8f03d134d3f3a96151/pillow-12.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:dd025009355c926a84a612fecf58bb315a3f6814b17ead51a8e48d3823d9087f", size = 4698624 },
+    { url = "https://files.pythonhosted.org/packages/01/a6/1265e977f17d93ea37aa28aa81bad4fa597933879fac2520d24e021c8da3/pillow-12.2.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:88ddbc66737e277852913bd1e07c150cc7bb124539f94c4e2df5344494e0a612", size = 6321252 },
+    { url = "https://files.pythonhosted.org/packages/3c/83/5982eb4a285967baa70340320be9f88e57665a387e3a53a7f0db8231a0cd/pillow-12.2.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d362d1878f00c142b7e1a16e6e5e780f02be8195123f164edf7eddd911eefe7c", size = 8126550 },
+    { url = "https://files.pythonhosted.org/packages/4e/48/6ffc514adce69f6050d0753b1a18fd920fce8cac87620d5a31231b04bfc5/pillow-12.2.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c727a6d53cb0018aadd8018c2b938376af27914a68a492f59dfcaca650d5eea", size = 6433114 },
+    { url = "https://files.pythonhosted.org/packages/36/a3/f9a77144231fb8d40ee27107b4463e205fa4677e2ca2548e14da5cf18dce/pillow-12.2.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:efd8c21c98c5cc60653bcb311bef2ce0401642b7ce9d09e03a7da87c878289d4", size = 7115667 },
+    { url = "https://files.pythonhosted.org/packages/c1/fc/ac4ee3041e7d5a565e1c4fd72a113f03b6394cc72ab7089d27608f8aaccb/pillow-12.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9f08483a632889536b8139663db60f6724bfcb443c96f1b18855860d7d5c0fd4", size = 6538966 },
+    { url = "https://files.pythonhosted.org/packages/c0/a8/27fb307055087f3668f6d0a8ccb636e7431d56ed0750e07a60547b1e083e/pillow-12.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dac8d77255a37e81a2efcbd1fc05f1c15ee82200e6c240d7e127e25e365c39ea", size = 7238241 },
+    { url = "https://files.pythonhosted.org/packages/ad/4b/926ab182c07fccae9fcb120043464e1ff1564775ec8864f21a0ebce6ac25/pillow-12.2.0-cp313-cp313t-win32.whl", hash = "sha256:ee3120ae9dff32f121610bb08e4313be87e03efeadfc6c0d18f89127e24d0c24", size = 6379592 },
+    { url = "https://files.pythonhosted.org/packages/c2/c4/f9e476451a098181b30050cc4c9a3556b64c02cf6497ea421ac047e89e4b/pillow-12.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:325ca0528c6788d2a6c3d40e3568639398137346c3d6e66bb61db96b96511c98", size = 7085542 },
+    { url = "https://files.pythonhosted.org/packages/00/a4/285f12aeacbe2d6dc36c407dfbbe9e96d4a80b0fb710a337f6d2ad978c75/pillow-12.2.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e5a76d03a6c6dcef67edabda7a52494afa4035021a79c8558e14af25313d453", size = 2465765 },
+    { url = "https://files.pythonhosted.org/packages/bf/98/4595daa2365416a86cb0d495248a393dfc84e96d62ad080c8546256cb9c0/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3adc9215e8be0448ed6e814966ecf3d9952f0ea40eb14e89a102b87f450660d8", size = 4100848 },
+    { url = "https://files.pythonhosted.org/packages/0b/79/40184d464cf89f6663e18dfcf7ca21aae2491fff1a16127681bf1fa9b8cf/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:6a9adfc6d24b10f89588096364cc726174118c62130c817c2837c60cf08a392b", size = 4176515 },
+    { url = "https://files.pythonhosted.org/packages/b0/63/703f86fd4c422a9cf722833670f4f71418fb116b2853ff7da722ea43f184/pillow-12.2.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:6a6e67ea2e6feda684ed370f9a1c52e7a243631c025ba42149a2cc5934dec295", size = 3640159 },
+    { url = "https://files.pythonhosted.org/packages/71/e0/fb22f797187d0be2270f83500aab851536101b254bfa1eae10795709d283/pillow-12.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2bb4a8d594eacdfc59d9e5ad972aa8afdd48d584ffd5f13a937a664c3e7db0ed", size = 5312185 },
+    { url = "https://files.pythonhosted.org/packages/ba/8c/1a9e46228571de18f8e28f16fabdfc20212a5d019f3e3303452b3f0a580d/pillow-12.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:80b2da48193b2f33ed0c32c38140f9d3186583ce7d516526d462645fd98660ae", size = 4695386 },
+    { url = "https://files.pythonhosted.org/packages/70/62/98f6b7f0c88b9addd0e87c217ded307b36be024d4ff8869a812b241d1345/pillow-12.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22db17c68434de69d8ecfc2fe821569195c0c373b25cccb9cbdacf2c6e53c601", size = 6280384 },
+    { url = "https://files.pythonhosted.org/packages/5e/03/688747d2e91cfbe0e64f316cd2e8005698f76ada3130d0194664174fa5de/pillow-12.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b14cc0106cd9aecda615dd6903840a058b4700fcb817687d0ee4fc8b6e389be", size = 8091599 },
+    { url = "https://files.pythonhosted.org/packages/f6/35/577e22b936fcdd66537329b33af0b4ccfefaeabd8aec04b266528cddb33c/pillow-12.2.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cbeb542b2ebc6fcdacabf8aca8c1a97c9b3ad3927d46b8723f9d4f033288a0f", size = 6396021 },
+    { url = "https://files.pythonhosted.org/packages/11/8d/d2532ad2a603ca2b93ad9f5135732124e57811d0168155852f37fbce2458/pillow-12.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4bfd07bc812fbd20395212969e41931001fd59eb55a60658b0e5710872e95286", size = 7083360 },
+    { url = "https://files.pythonhosted.org/packages/5e/26/d325f9f56c7e039034897e7380e9cc202b1e368bfd04d4cbe6a441f02885/pillow-12.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9aba9a17b623ef750a4d11b742cbafffeb48a869821252b30ee21b5e91392c50", size = 6507628 },
+    { url = "https://files.pythonhosted.org/packages/5f/f7/769d5632ffb0988f1c5e7660b3e731e30f7f8ec4318e94d0a5d674eb65a4/pillow-12.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:deede7c263feb25dba4e82ea23058a235dcc2fe1f6021025dc71f2b618e26104", size = 7209321 },
+    { url = "https://files.pythonhosted.org/packages/6a/7a/c253e3c645cd47f1aceea6a8bacdba9991bf45bb7dfe927f7c893e89c93c/pillow-12.2.0-cp314-cp314-win32.whl", hash = "sha256:632ff19b2778e43162304d50da0181ce24ac5bb8180122cbe1bf4673428328c7", size = 6479723 },
+    { url = "https://files.pythonhosted.org/packages/cd/8b/601e6566b957ca50e28725cb6c355c59c2c8609751efbecd980db44e0349/pillow-12.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:4e6c62e9d237e9b65fac06857d511e90d8461a32adcc1b9065ea0c0fa3a28150", size = 7217400 },
+    { url = "https://files.pythonhosted.org/packages/d6/94/220e46c73065c3e2951bb91c11a1fb636c8c9ad427ac3ce7d7f3359b9b2f/pillow-12.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:b1c1fbd8a5a1af3412a0810d060a78b5136ec0836c8a4ef9aa11807f2a22f4e1", size = 2554835 },
+    { url = "https://files.pythonhosted.org/packages/b6/ab/1b426a3974cb0e7da5c29ccff4807871d48110933a57207b5a676cccc155/pillow-12.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:57850958fe9c751670e49b2cecf6294acc99e562531f4bd317fa5ddee2068463", size = 5314225 },
+    { url = "https://files.pythonhosted.org/packages/19/1e/dce46f371be2438eecfee2a1960ee2a243bbe5e961890146d2dee1ff0f12/pillow-12.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d5d38f1411c0ed9f97bcb49b7bd59b6b7c314e0e27420e34d99d844b9ce3b6f3", size = 4698541 },
+    { url = "https://files.pythonhosted.org/packages/55/c3/7fbecf70adb3a0c33b77a300dc52e424dc22ad8cdc06557a2e49523b703d/pillow-12.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c0a9f29ca8e79f09de89293f82fc9b0270bb4af1d58bc98f540cc4aedf03166", size = 6322251 },
+    { url = "https://files.pythonhosted.org/packages/1c/3c/7fbc17cfb7e4fe0ef1642e0abc17fc6c94c9f7a16be41498e12e2ba60408/pillow-12.2.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1610dd6c61621ae1cf811bef44d77e149ce3f7b95afe66a4512f8c59f25d9ebe", size = 8127807 },
+    { url = "https://files.pythonhosted.org/packages/ff/c3/a8ae14d6defd2e448493ff512fae903b1e9bd40b72efb6ec55ce0048c8ce/pillow-12.2.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a34329707af4f73cf1782a36cd2289c0368880654a2c11f027bcee9052d35dd", size = 6433935 },
+    { url = "https://files.pythonhosted.org/packages/6e/32/2880fb3a074847ac159d8f902cb43278a61e85f681661e7419e6596803ed/pillow-12.2.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e9c4f5b3c546fa3458a29ab22646c1c6c787ea8f5ef51300e5a60300736905e", size = 7116720 },
+    { url = "https://files.pythonhosted.org/packages/46/87/495cc9c30e0129501643f24d320076f4cc54f718341df18cc70ec94c44e1/pillow-12.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fb043ee2f06b41473269765c2feae53fc2e2fbf96e5e22ca94fb5ad677856f06", size = 6540498 },
+    { url = "https://files.pythonhosted.org/packages/18/53/773f5edca692009d883a72211b60fdaf8871cbef075eaa9d577f0a2f989e/pillow-12.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f278f034eb75b4e8a13a54a876cc4a5ab39173d2cdd93a638e1b467fc545ac43", size = 7239413 },
+    { url = "https://files.pythonhosted.org/packages/c9/e4/4b64a97d71b2a83158134abbb2f5bd3f8a2ea691361282f010998f339ec7/pillow-12.2.0-cp314-cp314t-win32.whl", hash = "sha256:6bb77b2dcb06b20f9f4b4a8454caa581cd4dd0643a08bacf821216a16d9c8354", size = 6482084 },
+    { url = "https://files.pythonhosted.org/packages/ba/13/306d275efd3a3453f72114b7431c877d10b1154014c1ebbedd067770d629/pillow-12.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6562ace0d3fb5f20ed7290f1f929cae41b25ae29528f2af1722966a0a02e2aa1", size = 7225152 },
+    { url = "https://files.pythonhosted.org/packages/ff/6e/cf826fae916b8658848d7b9f38d88da6396895c676e8086fc0988073aaf8/pillow-12.2.0-cp314-cp314t-win_arm64.whl", hash = "sha256:aa88ccfe4e32d362816319ed727a004423aab09c5cea43c01a4b435643fa34eb", size = 2556579 },
+]
+
+[[package]]
+name = "platformdirs"
+version = "4.9.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9f/4a/0883b8e3802965322523f0b200ecf33d31f10991d0401162f4b23c698b42/platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a", size = 29400 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/75/a6/a0a304dc33b49145b21f4808d763822111e67d1c3a32b524a1baf947b6e1/platformdirs-4.9.6-py3-none-any.whl", hash = "sha256:e61adb1d5e5cb3441b4b7710bea7e4c12250ca49439228cc1021c00dcfac0917", size = 21348 },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538 },
+]
+
+[[package]]
+name = "propcache"
+version = "0.5.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ec/44/c87281c333769159c50594f22610f77398a47ccbfbbf23074e744e86f87c/propcache-0.5.2.tar.gz", hash = "sha256:01c4fc7480cd0598bb4b57022df55b9ca296da7fc5a8760bd8451a7e63a7d427", size = 50208 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4a/cb/e27bc2b2737a0bb49962b275efa051e8f1c35a936df7d5139b6b658b7dc9/propcache-0.5.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:806719138ecd720339a12410fb9614ac9b2b2d3a5fdf8235d56981c36f4039ba", size = 95887 },
+    { url = "https://files.pythonhosted.org/packages/e6/13/b8ae04c59392f8d11c6cd9fb4011d1dc7c86b81225c770280300e259ffe1/propcache-0.5.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:db2b80ea58eab4f86b2beec3cc8b39e8ff9276ac20e96b7cce43c8ae84cd6b5a", size = 54654 },
+    { url = "https://files.pythonhosted.org/packages/2c/7d/49777a3e20b55863d4794384a38acd460c04157b0a00f8602b0d508b8431/propcache-0.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e5cbfac9f61484f7e9f3597775500cd3ebe8274e9b050c38f9525c77c97520bf", size = 55190 },
+    { url = "https://files.pythonhosted.org/packages/44/c7/085d0cd63062e84044e3f05797749c3f8e3938ff3aeb0eb2f69d43fafc91/propcache-0.5.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5dbc581d2814337da56222fab8dc5f161cd798a434e49bac27930aaef798e144", size = 59995 },
+    { url = "https://files.pythonhosted.org/packages/9c/42/32cf8e3009e92b2645cf1e944f701e8ea4e924dffde1ee26db860bcbf7e4/propcache-0.5.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:857187f381f88c8e2fa2fe56ab94879d011b883d5a2ee5a1b60a8cd2a06846d9", size = 63422 },
+    { url = "https://files.pythonhosted.org/packages/9e/1b/f112433f99fc979431b87a39ef169e3f8df070d99a72792c56d6937ac48b/propcache-0.5.2-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:178b4a2cdaac1818e2bf1c5a99b94383fa73ea5382e032a48dec07dc5668dc42", size = 64342 },
+    { url = "https://files.pythonhosted.org/packages/14/15/5574111ae50dd6e879456888c0eadd4c5a869959775854e18e18a6b345f3/propcache-0.5.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f328175a2cde1f0ff2c4ed8ce968b9dcfb55f3a7153f39e2957ed994da13476", size = 61639 },
+    { url = "https://files.pythonhosted.org/packages/cc/da/4d775080b1490c0ae604acda868bd71aabe3a89ed16f2aa4339eb8a283e7/propcache-0.5.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5671d09a36b06d0fd4a3da0fccbcae360e9b1570924171a15e9e0997f0249fba", size = 61588 },
+    { url = "https://files.pythonhosted.org/packages/04/ac/f076982cbe2195ee9cf32de5a1e46951d9fb399fc207f390562dd0fd8fb2/propcache-0.5.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:80168e2ebe4d3ec6599d10ad8f520304ae1cad9b6c5a95372aef1b66b7bfb53a", size = 60029 },
+    { url = "https://files.pythonhosted.org/packages/70/60/189be62e0dd898dce3b331e1b8c7a543cd3a405ac0c81fe8ee8a9d5d77e1/propcache-0.5.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:45f11346f884bc47444f6e6647131055844134c3175b629f84952e2b5cd62b64", size = 56774 },
+    { url = "https://files.pythonhosted.org/packages/ea/9e/93377b9c7939c1ffae98f878dee955efadfd638078bc86dbc21f9d52f651/propcache-0.5.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8e778ebd44ef4f66ed60a0416b06b489687db264a9c0b3620362f26489492913", size = 63532 },
+    { url = "https://files.pythonhosted.org/packages/14/f9/590ef6cfb9b8028d516d287812ece32bb0bc5f11fbb9c8bf6b2e6313fec8/propcache-0.5.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:c0cb9ed24c8964e172768d455a38254c2dd8a552905729ce006cad3d3dda59b1", size = 61592 },
+    { url = "https://files.pythonhosted.org/packages/b4/5e/70958b3034c297a630bba2f17ca7abc2d5f39a803ad7e370ab79d1ecd022/propcache-0.5.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:1d1ad32d9d4355e2be65574fd0bfd3677e7066b009cd5b9b2dee8aa6a6393b33", size = 64788 },
+    { url = "https://files.pythonhosted.org/packages/12/fd/77fe5936d8c3086ca9048f7f415f122ed82e53884a9ec193646b42deef06/propcache-0.5.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c80f4ba3e8f00189165999a742ee526ebeccedf6c3f7beb0c7df821e9772435a", size = 62514 },
+    { url = "https://files.pythonhosted.org/packages/cf/74/66bd798b5b3be70aa1b391f5cc9d6a0a5532d7fd3b19ec0b213e72e6ad9d/propcache-0.5.2-cp312-cp312-win32.whl", hash = "sha256:8c7972d8f193740d9175f0998ab38717e6cd322d5935c5b0fef8c0d323fd9031", size = 39018 },
+    { url = "https://files.pythonhosted.org/packages/61/7c/5c0d34aa3024694d6dcb9271cdbdd08c4e47c1c0ad95ec7e7bc74cdea145/propcache-0.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:d9ee8826a7d47863a08ac44e1a5f611a462eefc3a194b492da242128bec75b42", size = 42322 },
+    { url = "https://files.pythonhosted.org/packages/4d/91/875812f1a3feb20ceba818ef39fbe4d92f1081e04ac815c822496d0d038b/propcache-0.5.2-cp312-cp312-win_arm64.whl", hash = "sha256:2800a4a8ead6b28cccd1ec54b59346f0def7922ee1c7598e8499c733cfbb7c84", size = 38172 },
+    { url = "https://files.pythonhosted.org/packages/c5/09/f049e45385503fe67db75a6b6186a7b9f0c3930366dc960522c312a825b1/propcache-0.5.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:099aaf4b4d1a02265b92a977edf00b5c4f63b3b17ac6de39b0d637c9cac0188a", size = 94457 },
+    { url = "https://files.pythonhosted.org/packages/6b/65/83d1d05655baf63113731bd5a1008435e14f8d1e5a06cbe4ec5b23ad7a31/propcache-0.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:68ce1c44c7a813a7f71ea04315a8c7b330b63db99d059a797a4651bb6f69f117", size = 53835 },
+    { url = "https://files.pythonhosted.org/packages/a9/12/a6ba6482bb5ea3260c000c9b20881c95fa11c6b30173715668259f844ed7/propcache-0.5.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fc299c129490f55f254cd90be0deca4764e36e9a7c08b4aa588479a3bbed3098", size = 54545 },
+    { url = "https://files.pythonhosted.org/packages/a9/19/7fa086f5764c59ec8a8e157cd93aa8497acc00aba9dcdec56bfffb32602d/propcache-0.5.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a6ae2198be502c10f09b2516e7b5d019816924bc3183a43ce792a7bd6625e6f4", size = 59886 },
+    { url = "https://files.pythonhosted.org/packages/a1/e4/5d7663dc8235956c8f5281698a3af1d351d8820341ddd890f59d9a9127f2/propcache-0.5.2-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6041d31504dc1779d700e1edcfb08eea334b357620b06681a4eabb57a74e574e", size = 63261 },
+    { url = "https://files.pythonhosted.org/packages/4a/4a/15a03adee24d6350da4292caeac44c34c033d2afe5e87eb370f38854560f/propcache-0.5.2-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7eabc04151c78a9f4d5bbb5f1faf571e4defeb4b585e0fe95b60ff2dbe4d3d7", size = 64184 },
+    { url = "https://files.pythonhosted.org/packages/8b/c6/979176efdaa3d239e36d503d5af63a0a773b36662ed8f52e5b6a6d9fd40e/propcache-0.5.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4db0ba63d693afd40d249bd93f842b5f144f8fcbb83de05660373bcf30517b1d", size = 61534 },
+    { url = "https://files.pythonhosted.org/packages/c8/22/63e8cd1bae4c2d2be6493b6b7d10566ddafad88137cfbc99964a1119853c/propcache-0.5.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1dbcf7675229b35d31abb6547d8ebc8c27a830ac3f9a794edff6254873ec7c0a", size = 61500 },
+    { url = "https://files.pythonhosted.org/packages/60/5a/28e5d9acbac1cc9ccb67045e8c1b943aa8d79fdf39c93bd73cacd68008ea/propcache-0.5.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d310c013aad2c72f1c3f2f8dd3279d460a858c551f97aeb8c63e4693cca7b4d2", size = 59994 },
+    { url = "https://files.pythonhosted.org/packages/f3/40/db650677f554a95b9c01a7c9d93d629e93a15562f5deb4573c9ee136fed2/propcache-0.5.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:06187263ddad280d05b4d8a8b3bb7d164cbebd469236544a42e6d9b28ac6a4fa", size = 56884 },
+    { url = "https://files.pythonhosted.org/packages/80/45/70b39b89516ff8b96bf732fa6fded8cef20f293cb1508690101c3c07ec51/propcache-0.5.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3115559b8effafd63b142ea5ed53d63a16ea6469cbc63dce4ee194b42db5d853", size = 63464 },
+    { url = "https://files.pythonhosted.org/packages/f9/e2/fa59d3a89eac5534293124af4f1d0d0ada091ce4a0ab4610ce03fd2bdd8d/propcache-0.5.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c60462af8e6dc30c35407c7237ea908d777b22862bbee27bc4699c0d8bcdc45a", size = 61588 },
+    { url = "https://files.pythonhosted.org/packages/0b/97/efb547a55c4bc7381cfb202d6a2239ac621045277bc1ea5dfd3a7f0516c0/propcache-0.5.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40314bca9ac559716fe374094fc81c11dcc34b64fd6c585360f5775690505704", size = 64667 },
+    { url = "https://files.pythonhosted.org/packages/92/56/f5c7d9b4b7595d5127da38974d791b2153f3d1eae6c674af3583ace92ad3/propcache-0.5.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cfa21e036ce1e1db2be04ba3b85d2df1bb1702fa01932d984c5464c665228ff4", size = 62463 },
+    { url = "https://files.pythonhosted.org/packages/bd/3b/484a3a65fc9f9f60c41dcd17b428bace5389544e2c680994534a20755066/propcache-0.5.2-cp313-cp313-win32.whl", hash = "sha256:f156a3529f38063b6dbaf356e15602a7f95f8055b1295a438433a6386f10463d", size = 38621 },
+    { url = "https://files.pythonhosted.org/packages/1c/fd/3f0f10dba4dabad3bf53102be007abf55481067952bde0fdddff439e7c61/propcache-0.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:dfed59d0a5aeb01e242e66ff0300bc4a265a7c05f612d30016f0b60b1017d757", size = 41649 },
+    { url = "https://files.pythonhosted.org/packages/90/ec/6ce619cc32bb500a482f811f9cd509368b4e58e638d13f2c68f370d6b475/propcache-0.5.2-cp313-cp313-win_arm64.whl", hash = "sha256:ba338430e87ceb9c8f0cf754de38a9860560261e56c00376debd628698a7364f", size = 37636 },
+    { url = "https://files.pythonhosted.org/packages/1b/82/c1d268bbbf2ef981c5bf0fbbe746db617c66e3bcefe431a1aa8943fbe23a/propcache-0.5.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a592f5f3da71c8691c788c13cb6734b6d17663d2e1cb8caddf0673d01ef8847d", size = 98872 },
+    { url = "https://files.pythonhosted.org/packages/f4/d4/52c871e73e864e6b34c0e2d58ac1ec5ccd149497ddc7ad2137ae98323a35/propcache-0.5.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:6a997d0489e9668a384fcfd5061b857aa5361de73191cac204d04b889cfbbafa", size = 56257 },
+    { url = "https://files.pythonhosted.org/packages/67/f0/9b90ca2a210b3d09bcfcd96ecd0f55545c091535abce2a45de2775cfd357/propcache-0.5.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:10734b5484ea113152ee25a91dccedf81631791805d2c9ccb054958e51842c94", size = 56696 },
+    { url = "https://files.pythonhosted.org/packages/9d/0e/6e9d4ba07c8e56e21ddec1e75f12148142b21ca83a51871babce095334f4/propcache-0.5.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cafca7e56c12bb02ae16d283742bef25a61122e9dab2b5b3f2ccbe589ce32164", size = 62378 },
+    { url = "https://files.pythonhosted.org/packages/65/19/c10badaa463dde8a27ce884f8ee2ec37e6035b7c9f5ff0c8f74f06f08dac/propcache-0.5.2-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f064f8d2b59177878b7615df1735cd8fe3462ed6be8c7b217d17a276489c2b7f", size = 65283 },
+    { url = "https://files.pythonhosted.org/packages/b0/b6/93bea99ca80e19cef6512a8580e5b7857bbe09422d9daa7fd4ef5723306c/propcache-0.5.2-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f78abfa8dfc32376fd1aacf597b2f2fbbe0ea751419aee718af5d4f82537ef8c", size = 66616 },
+    { url = "https://files.pythonhosted.org/packages/83/e4/5c7462e50625f051f37fb38b8224f7639f667184bbd34424ec83819bb1b7/propcache-0.5.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f7467da8a9822bf1a55336f877340c5bcbd3c482afc43a99771169f74a26dedc", size = 63773 },
+    { url = "https://files.pythonhosted.org/packages/ca/b6/99238894047b13c823be25027e736626cd414a52a5e30d2c3347c2733529/propcache-0.5.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a6ddc6ac9e25de626c1f129c1b467d7ecd33ce2237d3fd0c4e429feef0a7ee1f", size = 63664 },
+    { url = "https://files.pythonhosted.org/packages/85/1e/a3a1a63116a2b8edb415a8bb9a6f0c34bd03830b1e18e8ce2904e1dc1cf4/propcache-0.5.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:2f22cbbac9e26a8e864c0985ff1268d5d939d53d9d9411a9824279097e03a2cb", size = 62643 },
+    { url = "https://files.pythonhosted.org/packages/e4/03/893cf147de2fc6543c5eaa07ad833170e7e2a2385725bbebe8c0503723bb/propcache-0.5.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:fc76378c62a0f04d0cd82fbb1a2cd2d7e28fcb40d5873f28a6c44e388aaa2751", size = 59595 },
+    { url = "https://files.pythonhosted.org/packages/86/3b/04c1a2e12c57766568ba75ba72b3bf2042818d4c1425fab6fc07155c7cff/propcache-0.5.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:acd2c8edba48e31e58a363b8cf4e5c7db3b04b3f9e371f601df30d9b0d244836", size = 65711 },
+    { url = "https://files.pythonhosted.org/packages/1c/34/80f8d0099f8d6bacc4de1624c85672681c8cd1149ca2da0e38fd120b817f/propcache-0.5.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:452b5065457eb9991ec5eb38ff41d6cd4c991c9ac7c531c4d5849ae473a9a13f", size = 64247 },
+    { url = "https://files.pythonhosted.org/packages/f3/1a/8b08f3a5f1037e9e370c55883ceeeee0f6dd0416fb2d2d67b8bfc91f2a79/propcache-0.5.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:3430bb2bfe1331885c427745a751e774ee679fd4344f80b97bf879815fe8fa55", size = 67102 },
+    { url = "https://files.pythonhosted.org/packages/34/68/8bdb7bb7756d76e005490649d10e4a8369e610c74d619f71e1aedf889e9c/propcache-0.5.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cef6cea3922890dd6c9654971001fa797b526c16ab5e1e46c05fd6f877be7568", size = 64964 },
+    { url = "https://files.pythonhosted.org/packages/0a/aa/50fb0b5d3968b61a510926ff8b8465f1d6e976b3ab74496d7a4b9fc42515/propcache-0.5.2-cp313-cp313t-win32.whl", hash = "sha256:72d61e16dd78228b58c5d47be830ff3da7e5f139abdf0aef9d86cde1c5cf2191", size = 42546 },
+    { url = "https://files.pythonhosted.org/packages/ae/4c/0ddbae64321bd4a95bcbfc19307238016b5b1fee645c84626c8d539e5b74/propcache-0.5.2-cp313-cp313t-win_amd64.whl", hash = "sha256:0958834041a0166d343b8d2cedcd8bcbaeb4fdbe0cf08320c5379f143c3be6e7", size = 46330 },
+    { url = "https://files.pythonhosted.org/packages/00/d9/9cddc8efb78d8af264c5ec9f6d10b62f57c515feda8d321595f56010fb23/propcache-0.5.2-cp313-cp313t-win_arm64.whl", hash = "sha256:6de8bd93ddde9b992cf2b2e0d796d501a19026b5b9fd87356d7d0779531a8d96", size = 40521 },
+    { url = "https://files.pythonhosted.org/packages/e2/ea/23ee535d90ce8bcc465a3028eb3cc0ce3bd1005f4bb27710b30587de798d/propcache-0.5.2-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:46088abff4cba581dea21ae0467a480526cb25aa5f3c269e909f800328bc3999", size = 94662 },
+    { url = "https://files.pythonhosted.org/packages/b5/06/c5a52f419b5d8972f8d46a7577476090d8e3263ff589ce40b5ca4968d5be/propcache-0.5.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fc88b26f08d634f7bc819a7852e5214f5802641ab8d9fd5326892292eee1993e", size = 53928 },
+    { url = "https://files.pythonhosted.org/packages/63/b1/4260d67d6bd85e58a66b72d54ce15d5de789b6f3870cc6bedf8ff9667401/propcache-0.5.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:97797ebb098e670a2f92dd66f32897e30d7615b14e7f59711de23e30a9072539", size = 54650 },
+    { url = "https://files.pythonhosted.org/packages/70/06/2f46c318e3307cd7a6a7481def374ce838c0fe20084b39dd54b0879d0e99/propcache-0.5.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba57fffe4ac99c5d30076161b5866336d97600769bad35cc68f7774b15298a4e", size = 59912 },
+    { url = "https://files.pythonhosted.org/packages/4c/29/fe1aebec2ce57ab985a9c382bded1124431f85078113aa222c5d278430d4/propcache-0.5.2-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:583c19759d9eec1e5b69e2fbef36a7d9c326041be9746cb822d335c8cedc2979", size = 63300 },
+    { url = "https://files.pythonhosted.org/packages/b4/18/2334b26768b6c82be8c69e83671b767d5ef426aa09b0cba6c2ea47816774/propcache-0.5.2-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d0326e2e5e1f3163fa306c834e48e8d490e5fae607a097a40c0648109b47ba80", size = 64208 },
+    { url = "https://files.pythonhosted.org/packages/2b/76/7f1bfd6afff4c5e38e36a3c6d68eb5f4b7311ea80baf693db78d95b603c4/propcache-0.5.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e00820e192c8dbebcafb383ebbf99030895f09905e7a0eb2e0340a0bcc2bc825", size = 61633 },
+    { url = "https://files.pythonhosted.org/packages/c4/46/b3ff8aba2b4953a3e50de2cf72f1b5748b8eca93b15f3dc2c84339084c09/propcache-0.5.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c66afea89b1e43725731d2004732a046fe6fe955d51f952c3e95a7314a284a39", size = 61724 },
+    { url = "https://files.pythonhosted.org/packages/c5/01/814cfcafbcff954f94c01cf30e097ddc88a076b5440fbcf4570753437d40/propcache-0.5.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d4dc37dec6c6cdad0b57881a5658fd14fbf53e333b1a86cf86559f190e1d9ec4", size = 60069 },
+    { url = "https://files.pythonhosted.org/packages/da/68/5c6f7622d510cc666a300687e06fd060c1a43361c0c9b20d284f06d8096a/propcache-0.5.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:5570dbcc97571c15f68068e529c92715a12f8d54030e272d264b377e22bd17a5", size = 57099 },
+    { url = "https://files.pythonhosted.org/packages/55/27/9cb0b4c679124085327957d42521c99dba04c88c90c3e55a6f0b633ebccc/propcache-0.5.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f814362777a9f841adddb200ecdf8f5cb1e5a3c4b7a86378edbd6ccb26edd702", size = 63391 },
+    { url = "https://files.pythonhosted.org/packages/f0/9d/7258aaa5bdf60fc6f27591eef6fe52768cb0beda7140be477c8b12c9794a/propcache-0.5.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:196913dea116aeb5a2ba95af4ddcb7ea85559ae07d8eee8751688310d09168c3", size = 61626 },
+    { url = "https://files.pythonhosted.org/packages/8e/0d/41c602003e8a9b16fe1e7eadf62c7bfba9d5474370b24200bf48b315f45f/propcache-0.5.2-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:6e7b8719005dd1175be4ab1cd25e9b98659a5e0347331506ec6760d2773a7fb5", size = 64781 },
+    { url = "https://files.pythonhosted.org/packages/8b/f3/38e66b1856e9bd079deea015bc4a55f7767c0e4db2f7dcf69e7e680ba4ce/propcache-0.5.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:51f96d685ab16e88cab128cd37a52c5da540809c8b879fa047731bfcb4ad35a4", size = 62570 },
+    { url = "https://files.pythonhosted.org/packages/95/ca/bbfe9b910ce57dde8bb4876b4520fc02a4e89497c10de26be936758a3aaa/propcache-0.5.2-cp314-cp314-win32.whl", hash = "sha256:cc6fc3cc62e8501d3ed62894425040d2728ecddb1ed072737a5c70bd537aa9f0", size = 39436 },
+    { url = "https://files.pythonhosted.org/packages/61/d2/45c9defbaa1ea297035d9d4cce9e8f80daafbf19319c6007f157c6256ea9/propcache-0.5.2-cp314-cp314-win_amd64.whl", hash = "sha256:81e3a30b0bb60caa22033dd0f8a3618d1d67356212514f62c57db75cb0ef410c", size = 42373 },
+    { url = "https://files.pythonhosted.org/packages/44/68/9ea5103f41d5217d7d6ec24db90018e23aebec070c3f9a6e54d12b841fd8/propcache-0.5.2-cp314-cp314-win_arm64.whl", hash = "sha256:0d2c9bf8528f135dbb805ce027567e09164f7efa51a2be07458a2c0420f292d0", size = 38554 },
+    { url = "https://files.pythonhosted.org/packages/8a/81/fadf555f42d3b762eea8a53950b0489fdc0aa9da5f8ed9e10ce0a4e01b48/propcache-0.5.2-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:4bc8ff1feffc6a61c7002ffe84634c41b822e104990ae009f44a0834430070bb", size = 99395 },
+    { url = "https://files.pythonhosted.org/packages/f5/c9/c61e134a686949cf7971af3a390148b1156f7be81c73bc0cd12c873e2d48/propcache-0.5.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:79aa3ff0a9b566633b642fa9caf7e21ed1c13d6feca718187873f199e1514078", size = 56653 },
+    { url = "https://files.pythonhosted.org/packages/cb/73/daf935ea7048ddd7ec8eec5345b4a40b619d2d178b3c0a0900796bc3c794/propcache-0.5.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1b31822f4474c4036bae62de9402710051d431a606d6a0f907fec79935a071aa", size = 56914 },
+    { url = "https://files.pythonhosted.org/packages/79/9f/aba959b435ea18617edd7cf0a7ad0b9c574b8fc7e3d2cd55fb59cb255d33/propcache-0.5.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:13fef48778b5a2a756523fdb781326b028ca75e32858b04f2cdd19f394564917", size = 62567 },
+    { url = "https://files.pythonhosted.org/packages/6c/a1/859942de9a791ff42f6141736f5b37749b8f53e65edfa49638c67dd67e6a/propcache-0.5.2-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8b73ab70f1a3351fbc71f663b3e645af6dd0329100c353081cf69c37433fc6fe", size = 65542 },
+    { url = "https://files.pythonhosted.org/packages/b5/61/315bc0fd6c0fc7f80a528b8afd209e5fc4a875ea79571b91b8f50f442907/propcache-0.5.2-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5538d2c13d93e4698af7e092b57bc7298fd35d1d58e656ae18f23ee0d0378e03", size = 66845 },
+    { url = "https://files.pythonhosted.org/packages/47/f7/9f8122e3132e8e354ac41975ef8f1099be7d5a16bc7ae562734e993665c0/propcache-0.5.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd645f03898405cabe694fb8bc35241e3a9c332ec85627584fe3de201452b335", size = 63985 },
+    { url = "https://files.pythonhosted.org/packages/c8/54/c317819ec157cbf6f35df9df9657a6f82daf34d5faf15948b2f639c2192e/propcache-0.5.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a473b3440261e0c60706e732b2ed2f517857344fc21bf48fdfe211e2d98eb285", size = 63999 },
+    { url = "https://files.pythonhosted.org/packages/5a/56/387e3f7dfce0a9233df41fb888aa1c30222cb4bbbf09537c02dd9bd85fe2/propcache-0.5.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7afa37062e6650640e932e4cc9297d81f9f42d9944029cc386b8247dea4da837", size = 62779 },
+    { url = "https://files.pythonhosted.org/packages/a1/9c/596784cb5824ed61ee960d3f8655a3f0993e107c6e98ab6c818b7fb92ccb/propcache-0.5.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:8a90efd5777e996e42d568db9ac740b944d691e565cbfd31b2f7832f9184b2b8", size = 59796 },
+    { url = "https://files.pythonhosted.org/packages/c2/3d/1a6cfa1726a48542c1e8784a0761421476a5b68e09b7f36bf95eb954aaba/propcache-0.5.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:f19bb891234d72535764d703bfed1153cc34f4214d5bd7150aee1eec9e8f4366", size = 66023 },
+    { url = "https://files.pythonhosted.org/packages/e4/0e/05fd6990369477076e4e280bcb970de760fddf0161a46e988bc95f7940ec/propcache-0.5.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:32775082acd2d807ee3db715c7770d38767b817870acfa08c29e057f3c4d5b56", size = 64448 },
+    { url = "https://files.pythonhosted.org/packages/cd/86/5f8da315a4309c62c10c0b2516b17492d5d3bbe1bb862b96604db67e2a37/propcache-0.5.2-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9282fb1a3bccd038da9f768b927b24a0c753e466c086b7c4f3c6982851eefb2d", size = 67329 },
+    { url = "https://files.pythonhosted.org/packages/da/d3/3368efe79ab21f0cdf86ef49895811c9cc933131d4cde1f28a624e22e712/propcache-0.5.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cc49723e2f60d6b32a0f0b08a3fd6d13203c07f1cd9566cfce0f12a917c967a2", size = 65172 },
+    { url = "https://files.pythonhosted.org/packages/d5/07/127e8b0bacfb325396196f9d976a22453049b89b9b2b08477cc3145faa44/propcache-0.5.2-cp314-cp314t-win32.whl", hash = "sha256:2d7aa89ebca5acc98cba9d1472d976e394782f587bad6661003602a619fd1821", size = 43813 },
+    { url = "https://files.pythonhosted.org/packages/88/fb/46dad6c0ae49ed230ab1b16c890c2b6314e2403e6c412976f4a72d64a527/propcache-0.5.2-cp314-cp314t-win_amd64.whl", hash = "sha256:d447bb0b3054be5818458fbb171208b1d9ff11eba14e18ca18b90cbb45767370", size = 47764 },
+    { url = "https://files.pythonhosted.org/packages/e7/c4/a47d0a63aa309d10d59ede6e9d4cff03a344a79d1f0f4cd0cd74997b53e0/propcache-0.5.2-cp314-cp314t-win_arm64.whl", hash = "sha256:fe67a3d11cd9b4efabfa45c3d00ffba2b26811442a73a581a94b67c2b5faccf6", size = 41140 },
+    { url = "https://files.pythonhosted.org/packages/3a/ed/1cdcab6ba3d6ab7feca11fc14f0eeea80755bb53ef4e892079f31b10a25f/propcache-0.5.2-py3-none-any.whl", hash = "sha256:be1ddfcbb376e3de5d2e2db1d58d6d67463e6b4f9f040c000de8e300295465fe", size = 14036 },
+]
+
+[[package]]
+name = "pyarrow"
+version = "24.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559 },
+    { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654 },
+    { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394 },
+    { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122 },
+    { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032 },
+    { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490 },
+    { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660 },
+    { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759 },
+    { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471 },
+    { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981 },
+    { url = "https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172 },
+    { url = "https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733 },
+    { url = "https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335 },
+    { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748 },
+    { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554 },
+    { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301 },
+    { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929 },
+    { url = "https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365 },
+    { url = "https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819 },
+    { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252 },
+    { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127 },
+    { url = "https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997 },
+    { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720 },
+    { url = "https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852 },
+    { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852 },
+    { url = "https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207 },
+    { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117 },
+    { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155 },
+    { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387 },
+    { url = "https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102 },
+    { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118 },
+    { url = "https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765 },
+    { url = "https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890 },
+    { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250 },
+    { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282 },
+]
+
+[[package]]
+name = "pydantic"
+version = "2.13.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-types" },
+    { name = "pydantic-core" },
+    { name = "typing-extensions" },
+    { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/18/a5/b60d21ac674192f8ab0ba4e9fd860690f9b4a6e51ca5df118733b487d8d6/pydantic-2.13.4.tar.gz", hash = "sha256:c40756b57adaa8b1efeeced5c196f3f3b7c435f90e84ea7f443901bec8099ef6", size = 844775 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fd/7b/122376b1fd3c62c1ed9dc80c931ace4844b3c55407b6fb2d199377c9736f/pydantic-2.13.4-py3-none-any.whl", hash = "sha256:45a282cde31d808236fd7ea9d919b128653c8b38b393d1c4ab335c62924d9aba", size = 472262 },
+]
+
+[[package]]
+name = "pydantic-core"
+version = "2.46.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9d/56/921726b776ace8d8f5db44c4ef961006580d91dc52b803c489fafd1aa249/pydantic_core-2.46.4.tar.gz", hash = "sha256:62f875393d7f270851f20523dd2e29f082bcc82292d66db2b64ea71f64b6e1c1", size = 471464 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ce/8c/af022f0af448d7747c5154288d46b5f2bc5f17366eaa0e23e9aa04d59f3b/pydantic_core-2.46.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3245406455a5d98187ec35530fd772b1d799b26667980872c8d4614991e2c4a2", size = 2106158 },
+    { url = "https://files.pythonhosted.org/packages/19/95/6195171e385007300f0f5574592e467c568becce2d937a0b6804f218bc49/pydantic_core-2.46.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:962ccbab7b642487b1d8b7df90ef677e03134cf1fd8880bf698649b22a69371f", size = 1951724 },
+    { url = "https://files.pythonhosted.org/packages/8e/bc/f47d1ff9cbb1620e1b5b697eef06010035735f07820180e74178226b27b3/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8233f2947cf85404441fd7e0085f53b10c93e0ee78611099b5c7237e36aacbf7", size = 1975742 },
+    { url = "https://files.pythonhosted.org/packages/5b/11/9b9a5b0306345664a2da6410877af6e8082481b5884b3ddd78d47c6013ce/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a233125ac121aa3ffba9a2b59edfc4a985a76092dc8279586ab4b71390875e7", size = 2052418 },
+    { url = "https://files.pythonhosted.org/packages/f1/b7/a65fec226f5d78fc39f4a13c4cc0c768c22b113438f60c14adc9d2865038/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b712b53160b79a5850310b912a5ef8e57e56947c8ad690c227f5c9d7e561712", size = 2232274 },
+    { url = "https://files.pythonhosted.org/packages/68/f0/92039db98b907ef49269a8271f67db9cb78ae2fc68062ef7e4e77adb5f61/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9401557acd873c3a7f3eb9383edef8ac4968f9510e340f4808d427e75667e7b4", size = 2309940 },
+    { url = "https://files.pythonhosted.org/packages/5f/97/2aab507d3d00ca626e8e57c1eac6a79e4e5fbcc63eb99733ff55d1717f65/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:926c9541b14b12b1681dca8a0b75feb510b06c6341b70a8e500c2fdcff837cce", size = 2094516 },
+    { url = "https://files.pythonhosted.org/packages/22/37/a8aca44d40d737dde2bc05b3c6c07dff0de07ce6f82e9f3167aeaf4d5dea/pydantic_core-2.46.4-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:56cb4851bcaf3d117eddcef4fe66afd750a50274b0da8e22be256d10e5611987", size = 2136854 },
+    { url = "https://files.pythonhosted.org/packages/24/99/fcef1b79238c06a8cbec70819ac722ba76e02bc8ada9b0fd66eba40da01b/pydantic_core-2.46.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c68fcd102d71ea85c5b2dfac3f4f8476eff42a9e078fd5faefff6d145063536b", size = 2180306 },
+    { url = "https://files.pythonhosted.org/packages/ae/6c/fc44000918855b42779d007ae63b0532794739027b2f417321cddbc44f6a/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b2f69dec1725e79a012d920df1707de5caf7ed5e08f3be4435e25803efc47458", size = 2190044 },
+    { url = "https://files.pythonhosted.org/packages/6b/65/d9cadc9f1920d7a127ad2edba16c1db7916e59719285cd6c94600b0080ba/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:8d0820e8192167f80d88d64038e609c31452eeca865b4e1d9950a27a4609b00b", size = 2329133 },
+    { url = "https://files.pythonhosted.org/packages/d0/cf/c873d91679f3a30bcf5e7ac280ce5573483e72295307685120d0d5ad3416/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fbdb89b3e1c94a30cc5edfce477c6e6a5dc4d8f84665b455c27582f211a1c72c", size = 2374464 },
+    { url = "https://files.pythonhosted.org/packages/47/bd/6f2fc8188f31bf10590f1e98e7b306336161fac930a8c514cd7bd828c7dc/pydantic_core-2.46.4-cp312-cp312-win32.whl", hash = "sha256:9aa768456404a8bf48a4406685ac2bec8e72b62c69313734fa3b73cf33b3a894", size = 1974823 },
+    { url = "https://files.pythonhosted.org/packages/40/8c/985c1d41ea1107c2534abd9870e4ed5c8e7669b5c308297835c001e7a1c4/pydantic_core-2.46.4-cp312-cp312-win_amd64.whl", hash = "sha256:e9c26f834c65f5752f3f06cb08cb86a913ceb7274d0db6e267808a708b46bc89", size = 2072919 },
+    { url = "https://files.pythonhosted.org/packages/c4/ba/f463d006e0c47373ca7ec5e1a261c59dc01ef4d62b2657af925fb0deee3a/pydantic_core-2.46.4-cp312-cp312-win_arm64.whl", hash = "sha256:4fc73cb559bdb54b1134a706a2802a4cddd27a0633f5abb7e53056268751ac6a", size = 2027604 },
+    { url = "https://files.pythonhosted.org/packages/51/a2/5d30b469c5267a17b39dec53208222f76a8d351dfac4af661888c5aee77d/pydantic_core-2.46.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:5d5902252db0d3cedf8d4a1bc68f70eeb430f7e4c7104c8c476753519b423008", size = 2106306 },
+    { url = "https://files.pythonhosted.org/packages/c1/81/4fa520eaffa8bd7d1525e644cd6d39e7d60b1592bc5b516693c7340b50f1/pydantic_core-2.46.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c94f0688e7b8d0a67abf40e57a7eaaecd17cc9586706a31b76c031f63df052b4", size = 1951906 },
+    { url = "https://files.pythonhosted.org/packages/03/d5/fd02da45b659668b05923b17ba3a0100a0a3d5541e3bd8fcc4ecb711309e/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f027324c56cd5406ca49c124b0db10e56c69064fec039acc571c29020cc87c76", size = 1976802 },
+    { url = "https://files.pythonhosted.org/packages/21/f2/95727e1368be3d3ed485eaab7adbd7dda408f33f7a36e8b48e0144002b91/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e739fee756ba1010f8bcccb534252e85a35fe45ae92c295a06059ce58b74ccd3", size = 2052446 },
+    { url = "https://files.pythonhosted.org/packages/9c/86/5d99feea3f77c7234b8718075b23db11532773c1a0dbd9b9490215dc2eeb/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d56801be94b86a9da183e5f3766e6310752b99ff647e38b09a9500d88e46e76", size = 2232757 },
+    { url = "https://files.pythonhosted.org/packages/d2/3a/508ac615935ef7588cf6d9e9b91309fdc2da751af865e02a9098de88258c/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2412e734dcb48da14d4e4006b82b46b74f2518b8a26ee7e58c6844a6cd6d03c4", size = 2309275 },
+    { url = "https://files.pythonhosted.org/packages/07/f8/41db9de19d7987d6b04715a02b3b40aea467000275d9d758ffaa31af7d50/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9551187363ffc0de2a00b2e47c25aeaeb1020b69b668762966df15fc5659dd5a", size = 2094467 },
+    { url = "https://files.pythonhosted.org/packages/2c/e2/f35033184cb11d0052daf4416e8e10a502ea2ac006fc4f459aee872727d1/pydantic_core-2.46.4-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0186750b482eefa11d7f435892b09c5c606193ef3375bcf94aa00ae6bfb66262", size = 2134417 },
+    { url = "https://files.pythonhosted.org/packages/7e/7b/6ceeb1cc90e193862f444ebe373d8fdf613f0a82572dde03fb10734c6c71/pydantic_core-2.46.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5855698a4856556d86e8e6cd8434bc3ac0314ee8e12089ae0e143f64c6256e4e", size = 2179782 },
+    { url = "https://files.pythonhosted.org/packages/5a/f2/c8d7773ede6af08036423a00ae0ceffce266c3c52a096c435d68c896083f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:cbaf13819775b7f769bf4a1f066cb6df7a28d4480081a589828ef190226881cd", size = 2188782 },
+    { url = "https://files.pythonhosted.org/packages/59/31/0c864784e31f09f05cdd87606f08923b9c9e7f6e51dd27f20f62f975ce9f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:633147d34cf4550417f12e2b1a0383973bdf5cdfde212cb09e9a581cf10820be", size = 2328334 },
+    { url = "https://files.pythonhosted.org/packages/c2/eb/4f6c8a41efa30baa755590f4141abf3a8c370fab610915733e74134a7270/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:82cf5301172168103724d49a1444d3378cb20cdee30b116a1bd6031236298a5d", size = 2372986 },
+    { url = "https://files.pythonhosted.org/packages/5b/24/b375a480d53113860c299764bfe9f349a3dc9108b3adc0d7f0d786492ebf/pydantic_core-2.46.4-cp313-cp313-win32.whl", hash = "sha256:9fa8ae11da9e2b3126c6426f147e0fba88d96d65921799bb30c6abd1cb2c97fb", size = 1973693 },
+    { url = "https://files.pythonhosted.org/packages/7e/e8/cff247591966f2d22ec8c003cd7587e27b7ba7b81ab2fb888e3ab75dc285/pydantic_core-2.46.4-cp313-cp313-win_amd64.whl", hash = "sha256:6b3ace8194b0e5204818c92802dcdca7fc6d88aabbb799d7c795540d9cd6d292", size = 2071819 },
+    { url = "https://files.pythonhosted.org/packages/c6/1a/f4aee670d5670e9e148e0c82c7db98d780be566c6e6a97ee8035528ca0b3/pydantic_core-2.46.4-cp313-cp313-win_arm64.whl", hash = "sha256:184c081504d17f1c1066e430e117142b2c77d9448a97f7b65c6ac9fd9aee238d", size = 2027411 },
+    { url = "https://files.pythonhosted.org/packages/8d/74/228a26ddad29c6672b805d9fd78e8d251cd04004fa7eed0e622096cd0250/pydantic_core-2.46.4-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:428e04521a40150c85216fc8b85e8d39fece235a9cf5e383761238c7fa9b96fb", size = 2102079 },
+    { url = "https://files.pythonhosted.org/packages/ad/1f/8970b150a4b4365623ae00fc88603491f763c627311ae8031e3111356d6e/pydantic_core-2.46.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23ace664830ee0bfe014a0c7bc248b1f7f25ed7ad103852c317624a1083af462", size = 1952179 },
+    { url = "https://files.pythonhosted.org/packages/95/30/5211a831ae054928054b2f79731661087a2bc5c01e825c672b3a4a8f1b3e/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce5c1d2a8b27468f433ca974829c44060b8097eedc39933e3c206a90ee49c4a9", size = 1978926 },
+    { url = "https://files.pythonhosted.org/packages/57/e9/689668733b1eb67adeef047db3c2e8788fcf65a7fd9c9e2b46b7744fe245/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7283d57845ecf5a163403eb0702dfc220cc4fbdd18919cb5ccea4f95ee1cdab4", size = 2046785 },
+    { url = "https://files.pythonhosted.org/packages/60/d9/6715260422ff50a2109878fd24d948a6c3446bb2664f34ee78cd972b3acd/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8daafc69c93ee8a0204506a3b6b30f586ef54028f52aeeeb5c4cfc5184fd5914", size = 2228733 },
+    { url = "https://files.pythonhosted.org/packages/18/ae/fdb2f64316afca925640f8e70bb1a564b0ec2721c1389e25b8eb4bf9a299/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2213145bcc2ba85884d0ac63d222fece9209678f77b9b4d76f054c561adb28", size = 2307534 },
+    { url = "https://files.pythonhosted.org/packages/89/1d/8eff589b45bb8190a9d12c49cfad0f176a5cbd1534908a6b5125e2886239/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a5f930472650a82629163023e630d160863fce524c616f4e5186e5de9d9a49b", size = 2099732 },
+    { url = "https://files.pythonhosted.org/packages/06/d5/ee5a3366637fee41dee51a1fc91562dcf12ddbc68fda34e6b253da2324bb/pydantic_core-2.46.4-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:c1b3f518abeca3aa13c712fd202306e145abf59a18b094a6bafb2d2bbf59192c", size = 2129627 },
+    { url = "https://files.pythonhosted.org/packages/94/33/2414be571d2c6a6c4d08be21f9292b6d3fdb08949a97b6dfe985017821db/pydantic_core-2.46.4-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1a7dd0b3ee80d90150e3495a3a13ac34dbcbfd4f012996a6a1d8900e91b5c0fb", size = 2179141 },
+    { url = "https://files.pythonhosted.org/packages/7b/79/7daa95be995be0eecc4cf75064cb33f9bbbfe3fe0158caf2f0d4a996a5c7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:3fb702cd90b0446a3a1c5e470bfa0dd23c0233b676a9099ddcc964fa6ca13898", size = 2184325 },
+    { url = "https://files.pythonhosted.org/packages/9f/cb/d0a382f5c0de8a222dc61c65348e0ce831b1f68e0a018450d31c2cace3a5/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:b8458003118a712e66286df6a707db01c52c0f52f7db8e4a38f0da1d3b94fc4e", size = 2323990 },
+    { url = "https://files.pythonhosted.org/packages/05/db/d9ba624cc4a5aced1598e88c04fdbd8310c8a69b9d38b9a3d39ce3a61ed7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:372429a130e469c9cd698925ce5fc50940b7a1336b0d82038e63d5bbc4edc519", size = 2369978 },
+    { url = "https://files.pythonhosted.org/packages/f2/20/d15df15ba918c423461905802bfd2981c3af0bfa0e40d05e13edbfa48bc3/pydantic_core-2.46.4-cp314-cp314-win32.whl", hash = "sha256:85bb3611ff1802f3ee7fdd7dbff26b56f343fb432d57a4728fdd49b6ef35e2f4", size = 1966354 },
+    { url = "https://files.pythonhosted.org/packages/fc/b6/6b8de4c0a7d7ab3004c439c80c5c1e0a3e8d78bbae19379b01960383d9e5/pydantic_core-2.46.4-cp314-cp314-win_amd64.whl", hash = "sha256:811ff8e9c313ab425368bcbb36e5c4ebd7108c2bbf4e4089cfbb0b01eff63fac", size = 2072238 },
+    { url = "https://files.pythonhosted.org/packages/32/36/51eb763beec1f4cf59b1db243a7dcc39cbb41230f050a09b9d69faaf0a48/pydantic_core-2.46.4-cp314-cp314-win_arm64.whl", hash = "sha256:bfec22eab3c8cc2ceec0248aec886624116dc079afa027ecc8ad4a7e62010f8a", size = 2018251 },
+    { url = "https://files.pythonhosted.org/packages/e8/91/855af51d625b23aa987116a19e231d2aaef9c4a415273ddc189b79a45fee/pydantic_core-2.46.4-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:af8244b2bef6aaad6d92cda81372de7f8c8d36c9f0c3ea36e827c60e7d9467a0", size = 2099593 },
+    { url = "https://files.pythonhosted.org/packages/fb/1b/8784a54c65edb5f49f0a14d6977cf1b209bba85a4c77445b255c2de58ab3/pydantic_core-2.46.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a4330cdbc57162e4b3aa303f588ba752257694c9c9be3e7ebb11b4aca659b5d", size = 1935226 },
+    { url = "https://files.pythonhosted.org/packages/e8/e7/1955d28d1afc56dd4b3ad7cc0cf39df1b9852964cf16e5d13912756d6d6b/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29c61fc04a3d840155ff08e475a04809278972fe6aef51e2720554e96367e34b", size = 1974605 },
+    { url = "https://files.pythonhosted.org/packages/93/e2/3fedbf0ba7a22850e6e9fd78117f1c0f10f950182344d8a6c535d468fdd8/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c50f2528cf200c5eed56faf3f4e22fcd5f38c157a8b78576e6ba3168ec35f000", size = 2030777 },
+    { url = "https://files.pythonhosted.org/packages/f8/61/46be275fcaaba0b4f5b9669dd852267ce1ff616592dccf7a7845588df091/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0cbe8b01f948de4286c74cdd6c667aceb38f5c1e26f0693b3983d9d74887c65e", size = 2236641 },
+    { url = "https://files.pythonhosted.org/packages/60/db/12e93e46a8bac9988be3c016860f83293daea8c716c029c9ace279036f2f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:617d7e2ca7dcb8c5cf6bcb8c59b8832c94b36196bbf1cbd1bfb56ed341905edd", size = 2286404 },
+    { url = "https://files.pythonhosted.org/packages/e2/4a/4d8b19008f38d31c53b8219cfedc2e3d5de5fe99d90076b7e767de29274f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7027560ee92211647d0d34e3f7cd6f50da56399d26a9c8ad0da286d3869a53f3", size = 2109219 },
+    { url = "https://files.pythonhosted.org/packages/88/70/3cbc40978fefb7bb09c6708d40d4ad1a5d70fd7213c3d17f971de868ec1f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:f99626688942fb746e545232e7726926f3be91b5975f8b55327665fafda991c7", size = 2110594 },
+    { url = "https://files.pythonhosted.org/packages/9d/20/b8d36736216e29491125531685b2f9e61aa5b4b2599893f8268551da3338/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc3e9034a63de20e15e8ade85358bc6efc614008cab72898b4b4952bea0509ff", size = 2159542 },
+    { url = "https://files.pythonhosted.org/packages/1d/a2/367df868eb584dacf6bf82a389272406d7178e301c4ac82545ab98bc2dd9/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:97e7cf2be5c77b7d1a9713a05605d49460d02c6078d38d8bef3cbe323c548424", size = 2168146 },
+    { url = "https://files.pythonhosted.org/packages/c1/b8/4460f77f7e201893f649a29ab355dddd3beee8a97bcb1a320db414f9a06e/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:3bf92c5d0e00fefaab325a4d27828fe6b6e2a21848686b5b60d2d9eeb09d76c6", size = 2306309 },
+    { url = "https://files.pythonhosted.org/packages/64/c4/be2639293acd87dc8ddbcec41a73cee9b2ebf996fe6d892a1a74e88ad3f7/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:3ecbc122d18468d06ca279dc26a8c2e2d5acb10943bb35e36ae92096dc3b5565", size = 2369736 },
+    { url = "https://files.pythonhosted.org/packages/30/a6/9f9f380dbb301f67023bf8f707aaa75daadf84f7152d95c410fd7e81d994/pydantic_core-2.46.4-cp314-cp314t-win32.whl", hash = "sha256:e846ae7835bf0703ae43f534ab79a867146dadd59dc9ca5c8b53d5c8f7c9ef02", size = 1955575 },
+    { url = "https://files.pythonhosted.org/packages/40/1f/f1eb9eb350e795d1af8586289746f5c5677d16043040d63710e22abc43c9/pydantic_core-2.46.4-cp314-cp314t-win_amd64.whl", hash = "sha256:2108ba5c1c1eca18030634489dc544844144ee36357f2f9f780b93e7ddbb44b5", size = 2051624 },
+    { url = "https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325 },
+]
+
+[[package]]
+name = "pygments"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151 },
+]
+
+[[package]]
+name = "pypdf"
+version = "6.11.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/58/6dd97d78a4b17a7a6b9d1c6ad23895abc41f0fdc49c553cc05bdfdcc36d0/pypdf-6.11.0.tar.gz", hash = "sha256:062b51c81b0910e6d2755e99e1c5547a0a23b7d0a32322af66240d8edcfabe87", size = 6453975 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/b1/68feb7eb3b99f0c020b414234825f4a5d70e0126c18d933770e8c93a35fc/pypdf-6.11.0-py3-none-any.whl", hash = "sha256:769394d5756d5b304c9b6bef88b54b1816b328e7e6fc9254e625529a15ed4ab8", size = 338819 },
+]
+
+[[package]]
+name = "pytest"
+version = "9.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249 },
+]
+
+[[package]]
+name = "pytest-asyncio"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075 },
+]
+
+[[package]]
+name = "python-dateutil"
+version = "2.9.0.post0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "six" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 },
+]
+
+[[package]]
+name = "python-dotenv"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101 },
+]
+
+[[package]]
+name = "pytz"
+version = "2026.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/46/dd499ec9038423421951e4fad73051febaa13d2df82b4064f87af8b8c0c3/pytz-2026.2.tar.gz", hash = "sha256:0e60b47b29f21574376f218fe21abc009894a2321ea16c6754f3cad6eb7cdd6a", size = 320861 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ec/dd/96da98f892250475bdf2328112d7468abdd4acc7b902b6af23f4ed958ea0/pytz-2026.2-py2.py3-none-any.whl", hash = "sha256:04156e608bee23d3792fd45c94ae47fae1036688e75032eea2e3bf0323d1f126", size = 510141 },
+]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063 },
+    { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973 },
+    { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116 },
+    { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011 },
+    { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870 },
+    { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089 },
+    { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181 },
+    { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658 },
+    { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003 },
+    { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344 },
+    { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669 },
+    { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252 },
+    { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081 },
+    { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159 },
+    { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626 },
+    { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613 },
+    { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115 },
+    { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427 },
+    { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090 },
+    { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246 },
+    { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814 },
+    { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809 },
+    { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454 },
+    { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355 },
+    { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175 },
+    { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228 },
+    { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194 },
+    { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429 },
+    { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912 },
+    { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108 },
+    { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641 },
+    { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901 },
+    { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132 },
+    { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261 },
+    { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272 },
+    { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923 },
+    { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062 },
+    { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341 },
+]
+
+[[package]]
+name = "regex"
+version = "2026.5.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/dc/0e/49aee608ad09480e7fd276898c99ec6192985fa331abe4eb3a986094490b/regex-2026.5.9.tar.gz", hash = "sha256:a8234aa23ec39894bfe4a3f1b85616a7032481964a13ac6fc9f10de4f6fca270", size = 416074 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/50/9b/6550044bc44e17c84d312c031c2ec42fbdb6a4ec4e29093be3a172d08772/regex-2026.5.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:57eeeb05db7979413dec5438f2db21d7ecbba787cde7a711df1a6f6df672aa06", size = 490451 },
+    { url = "https://files.pythonhosted.org/packages/1e/95/fc7ba4303b5a0f92446a12ee6778ef2c6c799233f5060042a31bf390cfe9/regex-2026.5.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:398c521292f4c7fb807001dcd54694d3a1fcafc179a36ad9cc56f98df85930b6", size = 292112 },
+    { url = "https://files.pythonhosted.org/packages/54/4b/ee27938d1b2c443e89a9a10e00d2d19aa5ee300cd3d61140644e93bb083e/regex-2026.5.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f7a7c26137296beba7784de6eba69c6a93a63ccebc385e4962fe67e267a91225", size = 289599 },
+    { url = "https://files.pythonhosted.org/packages/d8/dd/ba103dc19614e25f3880800ca67ce093d6e21b325d72b8383c7bf906e9fa/regex-2026.5.9-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6441cc660d76107934a09c22167200839a0e89604a6297f78a974e66e931d2c0", size = 796732 },
+    { url = "https://files.pythonhosted.org/packages/cf/e7/f035b4fd858b050b0080bf302968dc0f59ba34e391872d54936758e6844e/regex-2026.5.9-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:91328f1c23d47595ca3ef0a7557fa129c5a23404b775c770697d2f35b33e0107", size = 865440 },
+    { url = "https://files.pythonhosted.org/packages/0a/51/8cd301ecc899aea28124357f729f4272f44de7806fc7ca02490bfbe253e8/regex-2026.5.9-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:93a7860539414dddaefba2b40f8771765ae17949d4c7182b876ce429e11a8309", size = 912329 },
+    { url = "https://files.pythonhosted.org/packages/cc/1e/3fbe2fa1e8cebd62f3bb7d3321cff1640aca2e240b51d9bd624aad949260/regex-2026.5.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd2810d22146b6d838acc5ec15602cb6b47920aa4e33015df3868eedfd20bab8", size = 801239 },
+    { url = "https://files.pythonhosted.org/packages/17/2f/6f6008682bf2cf98040a0d3153a8e557b6ab728d7713d045cee4ce544ab8/regex-2026.5.9-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:daff2bdbaf1d23e52fdff7c0b7bc2048b68f978df6a4d107ac981f94caef2e66", size = 777054 },
+    { url = "https://files.pythonhosted.org/packages/19/2b/eee0d20a6842ba04df4b8847a920b57ef56853f14ef85405473e586b605a/regex-2026.5.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4eeb011098fcb77af513dcef521a3dbecbf8849b1e38940759d293b7a93f5026", size = 785098 },
+    { url = "https://files.pythonhosted.org/packages/4a/98/6fc1e6410feefb92159edaed5041992bfe390e8d26c721865434acbca558/regex-2026.5.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ea9c8ecfa1b73c73b626534d6626e5340d429630943672b8480724f44e84b962", size = 860095 },
+    { url = "https://files.pythonhosted.org/packages/18/a3/bd855e0f2cb1a978ecf6fa6bb69632dd9c3f6ea3b81cde62fde14c9daec7/regex-2026.5.9-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:cd2846168eb9ee3c513902bc8225409cb1caab31d04728b145171fa1625d9621", size = 765762 },
+    { url = "https://files.pythonhosted.org/packages/dc/66/0ae8c092e60b14c79d24f8e0b7f0aea5bfbffdcab00b5483d13404d3c3a5/regex-2026.5.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:39617fb0cde9c0e6306dc70e3bfc096f3da793219879f7ae7aa341a69fbdcf6d", size = 852100 },
+    { url = "https://files.pythonhosted.org/packages/21/de/8dfde60fc1b21c946a893ba273403b72617edb261370cb1087099a83f088/regex-2026.5.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fd03c4f0e33280d15cae17159b899245d6b7c53d21def19b263b39655061f5ce", size = 789479 },
+    { url = "https://files.pythonhosted.org/packages/c3/1c/bdcc98f9a4af4fdd166c74941174619ccff4726d3ce32faa8e9a2ecd38dd/regex-2026.5.9-cp312-cp312-win32.whl", hash = "sha256:164eba9b755ea6f244b0d881196fbc1fac09714e9782c9e2732b813142033c8e", size = 266699 },
+    { url = "https://files.pythonhosted.org/packages/78/87/240d36864f9e48ace85f72e79ced97ceb7f27ce87739a947dcb834b4e6bc/regex-2026.5.9-cp312-cp312-win_amd64.whl", hash = "sha256:86f40a5d6444db30a125c9c9177e6b25dad981cbc37451fd838f145e6edac92e", size = 277783 },
+    { url = "https://files.pythonhosted.org/packages/4f/b5/7b30f312b0669dff5beebe5b0989dc2d1a312b1a44fab852199c387a5b96/regex-2026.5.9-cp312-cp312-win_arm64.whl", hash = "sha256:96f5f58b54a063d7ea9dca08e1cf57bfe10499c4d579ee672da284f57f5f0070", size = 270513 },
+    { url = "https://files.pythonhosted.org/packages/aa/da/797e91ecec6f84135da778ddce78c20e0af5d2a15c26f87a81bc3eadb6db/regex-2026.5.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d626b84406444b165fc0ba981604edea39f0588ff1f92baa23fe50799ea9afdb", size = 490303 },
+    { url = "https://files.pythonhosted.org/packages/44/da/bf30abaaa737b58f4a4b8c4a03659e02fd92092c822e0197ed9e0daab917/regex-2026.5.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d7bdc0ab8f3dd7e1b4f9ab88634e13374669db86bb3c72e8292f07ae313f539f", size = 292019 },
+    { url = "https://files.pythonhosted.org/packages/2d/e7/d0eaf5713828417b9e5648cf81fa9bacd4961f6ab98c380c2034f8716e35/regex-2026.5.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a8820737949116ffff55fe18f9fc644530063ba6ebfcb8314239416e78f1347c", size = 289468 },
+    { url = "https://files.pythonhosted.org/packages/d3/9b/b3fdd62b003baa1a9b593cd8c8699c9651c2e80cc21a5c715707983c42d7/regex-2026.5.9-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0fbdbac82cb3e4450d0ccde7d7a35607f4cb2dd9fba4b8b69bfaf8c9fa6aed", size = 796749 },
+    { url = "https://files.pythonhosted.org/packages/d4/30/66ab84588765f5b4b271a9ca09ef7ce2b87caa95176ec3d2ad65d7bc4902/regex-2026.5.9-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:57e8915c7986aa33d25e4d3629cef711cd2863f2961b10409f0c04cb8b7d9020", size = 865445 },
+    { url = "https://files.pythonhosted.org/packages/1a/89/f05169e8588aac365f35ffc7f3bc3184f095ef4cfded7cfaa3c7fd5dbd89/regex-2026.5.9-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:508f56a89ba9cb26e4168cbc37dbd60a28d82430a9e18ad1d25fe0883c314ca2", size = 912322 },
+    { url = "https://files.pythonhosted.org/packages/30/e1/c93444052cf41581f3c884ab3fb5823daf0992f11cd4388d4275ca610558/regex-2026.5.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6d189041f15691cfa2b6c4290448ec221244d225b3f5fe9e7771b34ffcdf6e2", size = 801269 },
+    { url = "https://files.pythonhosted.org/packages/50/fe/0cf96b882f540e62e8b9956599798203d599c44cf4c77917ca27400ff69b/regex-2026.5.9-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e82db382b44d0111b22601c509c89f64434816c9e0eef9d1989cda8cc6ff1c04", size = 777085 },
+    { url = "https://files.pythonhosted.org/packages/23/5c/d78d4924e7fc875557b9e9b768423925fdfaac5549d06da7810019a9bd26/regex-2026.5.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2acfb48634f64996b57f90f39afa692ff362162722581921fe92239a59960f3c", size = 785153 },
+    { url = "https://files.pythonhosted.org/packages/bf/e0/5214774090e7b4524dcea3e3c4aa74141d43043f8beb49c1599db1c8b53a/regex-2026.5.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d29eebfc9525db68cad3c97eedd7f754fa265aa5cd0cf4f863b2421e1b48fc9f", size = 860164 },
+    { url = "https://files.pythonhosted.org/packages/6e/e1/4a57a83350319b1271f0d7a249b8672513ed928b237a741631270de6caea/regex-2026.5.9-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:debb893095e944091c16e641a6e33c1b0f4cb61ab945ec5afbf53ce7068834d8", size = 765731 },
+    { url = "https://files.pythonhosted.org/packages/12/f4/499e74a20c156fc75836ee04a72a38d1a063978f600937f9760467beb1b0/regex-2026.5.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d659eee77986549c9ea45b861c7567e44d6287c3dc9a4565478853f7b9fe2ff6", size = 852062 },
+    { url = "https://files.pythonhosted.org/packages/5b/92/7eebc0d0a01e78629695f342ba17e0deaff8fb45e79cc0d7b98287da6e3e/regex-2026.5.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2efa205e6d98b24d1f3ab395c11aa15cdf10935bca283d0285e0499c284fba21", size = 789577 },
+    { url = "https://files.pythonhosted.org/packages/05/a4/018e71f7d2ad48c1ebe6d3ae0026f9b7cb4802fd15c7cc02fdf724355102/regex-2026.5.9-cp313-cp313-win32.whl", hash = "sha256:f3844f134e834076677dd369976e9f5068679fcb8e50102fdf6b7ac96a3ec127", size = 266691 },
+    { url = "https://files.pythonhosted.org/packages/e6/1d/861a93719fb9ee7dbfc3761b3797b7a3e112a5d42c6129459d2d741be9b5/regex-2026.5.9-cp313-cp313-win_amd64.whl", hash = "sha256:3527bb4942d2c14552155406cdedd906567456821848aed1cb4933a391bf5eca", size = 277747 },
+    { url = "https://files.pythonhosted.org/packages/d9/c6/0a2436ae4da1ba76e51cb98943c6838a9a721faa40ebe2dce07694ae34e3/regex-2026.5.9-cp313-cp313-win_arm64.whl", hash = "sha256:56a33f191f17d8c417f99945ebdc1e691d3af9605d86ec68c7e54a57e3e17af6", size = 270500 },
+    { url = "https://files.pythonhosted.org/packages/e8/e9/d21346f7b60ed58789371358ed66b09d00f832e1bd7c06e55d9da5679882/regex-2026.5.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:01f28d868834624c934b8d2e0aa1c8341337e37831f4a012f18a5afcba4cbaf3", size = 494172 },
+    { url = "https://files.pythonhosted.org/packages/c4/43/fd1177a2032037c681baecdb3422ee4e1424aec4e4f470ef47793d325274/regex-2026.5.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:48036f6374aaa79eb3b754ec29c61d1c6b1606749d705a13f8854fa2539671f6", size = 293952 },
+    { url = "https://files.pythonhosted.org/packages/f2/7d/9fbf919768368d3f8a4f6c692cf2aa61e482b2b81ec6a298ace4cbf02480/regex-2026.5.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b96350aa424e79d4fd6b567b344dcbe2b2d6bfc48dfe7717587e1fa6d43da6ff", size = 292314 },
+    { url = "https://files.pythonhosted.org/packages/e2/6c/e41bfeecb589716843e7c4df09ba46ff2a42961457afece19059d85caeef/regex-2026.5.9-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f3af7a4903c5c04a11a196a5aa75cdd7dd3f8508132f9fb3259d9f5908e3b88", size = 811681 },
+    { url = "https://files.pythonhosted.org/packages/87/83/a5c1c525fba0aa656e88ad0face0b1829788ef4c2fb6b26df58aa1151b84/regex-2026.5.9-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7e87577720152d2caae19fe2baaf1f8d5ca12091e9e229f03915c37d1e4b9178", size = 871135 },
+    { url = "https://files.pythonhosted.org/packages/18/d4/80882e799e440dd878b0979cbebf8fa4d54624a332c83037c7a701649e3f/regex-2026.5.9-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c8b9b9d294cfea3cd19c718ade7cc93492b2c4991abd9a68d0b3477ae6d8e100", size = 917265 },
+    { url = "https://files.pythonhosted.org/packages/ae/ff/8db60211e2286e396aad7dc7725356c502bff0901ea05bd6cdc2e1a042b9/regex-2026.5.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:728d8bfd28a8845c8b6bc5dc7ce010453d206396786c0765c2740cb65f37791e", size = 816311 },
+    { url = "https://files.pythonhosted.org/packages/4c/47/742ef579c61730f8d268e5cf1f9ce0e37e2ea041ad0f5644724f2378e463/regex-2026.5.9-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7e30b874d341fac767d7df5a0870540541c2c054b80cfaac116e8d367a8a7ff2", size = 785498 },
+    { url = "https://files.pythonhosted.org/packages/7f/ab/cb0999802dcb0fb95b1ab005e8d4163d8afdd67efc2cb6b6630ac13f8cb1/regex-2026.5.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fd190e88a895a8901325fad284a3f74ea52b1da8525b76cc811fa9b1edf0ce2b", size = 801348 },
+    { url = "https://files.pythonhosted.org/packages/7d/62/8ca59a24c55bc34d166eefaf3717bd77772f329fdbf984d86581e0a3571c/regex-2026.5.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:8e76e8161ad00694cfce6767d5dea860c6391ac5b83e5c3a39661e696f11fc7e", size = 866493 },
+    { url = "https://files.pythonhosted.org/packages/8d/3d/30f2ae62cef3278bb5bb821f467277a55fb73f01032cf85997e15e8289a8/regex-2026.5.9-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ddda5340e6c01a293027dd46232fa79eaff1b48058ce7a98f572b6445b088041", size = 772811 },
+    { url = "https://files.pythonhosted.org/packages/d8/ae/7d2089bcd78ad0c0161bc684339df50032acb438a7bd3305e7ddb1193cec/regex-2026.5.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:205109e96b3cf5adf8f4cd62bedde9487feb282b9497a3535451e5a24cd706a0", size = 856584 },
+    { url = "https://files.pythonhosted.org/packages/a9/29/92ff47f75990131ea4f24ba17819e5a9d141e10819807e09addd73409af6/regex-2026.5.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dfbe4579b9f08036aa7d101d1835437a20783574ac66327e6b29b4018a138081", size = 803453 },
+    { url = "https://files.pythonhosted.org/packages/04/99/eff29f1037dcab36702c9ee5d6858cf1ce2336ea8ea2987f64245b99ea5e/regex-2026.5.9-cp313-cp313t-win32.whl", hash = "sha256:ed2c9e8068b614c574d8d30e543d617cf5379b0535d46f97ef00e904745a08b5", size = 269951 },
+    { url = "https://files.pythonhosted.org/packages/0e/9d/8870b8981d27b22cda77bb26a5ac7ebfa9c7d9e0dea195a834a82380e748/regex-2026.5.9-cp313-cp313t-win_amd64.whl", hash = "sha256:b46b0f094dc1d3b90356c85a0bd2c9bafc4a6a190b9d6f8ddd5a033b6e088ed4", size = 281240 },
+    { url = "https://files.pythonhosted.org/packages/72/b1/3379415e8f135c13ac551353397cc4fe97b4978f3cac73c5fcbcded548b8/regex-2026.5.9-cp313-cp313t-win_arm64.whl", hash = "sha256:872acc074bd29ffc9913ecdfedf6ea77502312ca44a4aa0d3779089c6069d8de", size = 272383 },
+    { url = "https://files.pythonhosted.org/packages/13/3e/9c3cd292d8808b3645a2ce517e200179b6d0e903f176300bd8b542e14de5/regex-2026.5.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:1bd7587a2948b4085195d5a3374eaf4a425dc3e55784c038175355ecf3bbbf8a", size = 490376 },
+    { url = "https://files.pythonhosted.org/packages/60/70/d43ee8a2ca0a8b68d167f21658b85520ac0574617c7f320367c5047f7556/regex-2026.5.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:dea2e88e1cce4522496cce630e11e67b98b7076620bc4336c3f674bc21a375f4", size = 291964 },
+    { url = "https://files.pythonhosted.org/packages/21/91/9d50b433828d8e74196904e168a43abf1e6e88b2a15d47ed742456720c37/regex-2026.5.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2099f7e7ff7b6aa3192312650a56e91cc091e49d50b04e4f6f8b6e28b3b27f1c", size = 289682 },
+    { url = "https://files.pythonhosted.org/packages/3e/d2/b835e3cafbb9d977736912436259ff551d60919f7d7b3d37d46659c63564/regex-2026.5.9-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecd353045824e4477562a2ac718c25799cdaaa41f7aa925a806a8a3e6848a5b9", size = 796996 },
+    { url = "https://files.pythonhosted.org/packages/2c/a6/9f992d00019166b9de01c546dd4549bc679f2a68df11b877740b0760b7c2/regex-2026.5.9-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:65c8c8c37377794bd5b2f3ebe51919042bf17aec802e23c833d89782ed0c78af", size = 866089 },
+    { url = "https://files.pythonhosted.org/packages/e0/08/4d32af657e049b19cb62b02e46e38fe1518797bfb2203ee93a510b21b0dc/regex-2026.5.9-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b73ab8afcf66c622db143d1c6fda4e58e4d537ee4f125229ad47b1ab80f34c0", size = 911530 },
+    { url = "https://files.pythonhosted.org/packages/d9/27/2af43dd1dc201d1fecefda64a45f4ad0995855b92724f795a777b402ee69/regex-2026.5.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0de5cf193997384ed2ca6f1cd4f78055b255d93d82d5a8cd6ba0d11c10b167e4", size = 800643 },
+    { url = "https://files.pythonhosted.org/packages/a4/dd/23a249047013b5321d4a60c4d2437462086f601b061776a525e5fba2a59f/regex-2026.5.9-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d641a8c9a61618047796d572a39a79b26167b0411d2c3031937b2fe2d081e2cf", size = 777223 },
+    { url = "https://files.pythonhosted.org/packages/94/6a/e85ed9538cd19586d0465076a4578a12e093ce776d15f3f8ce92733a8dd6/regex-2026.5.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:24b2355ef5cc9aa5b8f07d17704face1c166fdcc2290fa7bd6e6c925655a8346", size = 785760 },
+    { url = "https://files.pythonhosted.org/packages/2a/c4/f25473209438638e947c55f9156fd8f236f74169229028cc99116380868e/regex-2026.5.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:a24852d3c29ad9e47593593d8a247c44ccc3d0548ef12c822d6ed0810affe676", size = 860891 },
+    { url = "https://files.pythonhosted.org/packages/f9/f7/f4f86e3c74419c37370e91f150ae0c2ef7d34b2e0e4cdd5da046a02e4022/regex-2026.5.9-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:916714069da19329ef7de197dcbc77bb3104145c7c2c864dbfbe318f46b88b14", size = 765891 },
+    { url = "https://files.pythonhosted.org/packages/26/70/704d8e13765939146b1cd0ef4e2feb71d7929727d2290f026eed10095955/regex-2026.5.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:fa411799ca8da32a8d38d020a88faa5b6f91657d284761352940ecf9f7c3bbdd", size = 851380 },
+    { url = "https://files.pythonhosted.org/packages/26/29/1a13582a8460038edc38e49f64ceb0dd7c60f5caba77571f4bf6601965d9/regex-2026.5.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1e6da47d679b7010ef27556b6e0f99771b744936db1792a10ceac6547ae1503e", size = 789350 },
+    { url = "https://files.pythonhosted.org/packages/73/56/3dcafe34fc72e271d62ad9a291801e88a1457bb251c132f15fcc2e5aad1a/regex-2026.5.9-cp314-cp314-win32.whl", hash = "sha256:98bd73080e8756255137e1bd3f3f00295bbc5aa383c0e0f973920e9134d7c4ad", size = 272130 },
+    { url = "https://files.pythonhosted.org/packages/d0/9c/02eebf0be95efe416c664db7fb8b6b05b7a0b06a7544f2884f2558b0526f/regex-2026.5.9-cp314-cp314-win_amd64.whl", hash = "sha256:ff8d372ac2acdc048d1c19916f27ee61bc5722728458ba6ca5052f2c72d51763", size = 280999 },
+    { url = "https://files.pythonhosted.org/packages/70/5a/1dd1abee76cb7a846a0bcf42fdc87e5720c3c33c24f3e37814310a513d9f/regex-2026.5.9-cp314-cp314-win_arm64.whl", hash = "sha256:e1d93bf647916292e8edcec150c07ddf3dc50179ccaf770c04a7f9e452155372", size = 273500 },
+    { url = "https://files.pythonhosted.org/packages/86/c1/c5f619b0057a7965cb78ec559c1d7a45ce8c99a35bea95483d64959a93d9/regex-2026.5.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:83d0ee4a57d1c87cb549e195ec300b8f0ec3a82eba66d835e4e2ed8634fe4499", size = 494269 },
+    { url = "https://files.pythonhosted.org/packages/05/2c/5d01f1aee33de4bbe60c8452945bfc8477ca7c5ae4450f6bfe711036cb36/regex-2026.5.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:d3d7eb5c9a7f6df82ed3cfac9beb93882a5cbcb5b8b157b56cb2b3b276574ac1", size = 293954 },
+    { url = "https://files.pythonhosted.org/packages/7a/fe/e8988b2ae2108c6ef71bd4aa8d87fbe257976dd0810e826cd75f701c68b6/regex-2026.5.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:075160bf16658e16d35233300b8453aac25de4cbea808d22348b6979668e924d", size = 292405 },
+    { url = "https://files.pythonhosted.org/packages/79/34/d2b0937faa7859263f7f0a3c6b103a1296306be6952dc173d0154e9a2f49/regex-2026.5.9-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45375819235558a4ff1c4971dc32881f022613abdb180128f5cb4768c1765a1c", size = 811855 },
+    { url = "https://files.pythonhosted.org/packages/80/fe/daf53a47457a8486db66c66c01ceb9c2303eecee3f87197f1e77eb1a736d/regex-2026.5.9-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ead4b163ac30a29574510cd4b3e2e985ac5290c05fc7095557d6a5f403fc31b5", size = 871189 },
+    { url = "https://files.pythonhosted.org/packages/1c/75/058fc4470cbfbf57d800aff1a0022b929a3f9fa553ee10a0cdf2070eb31f/regex-2026.5.9-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c6e4218fbdfbcd4f6c19efca40930d24a621bf4b48cb76bc6640543bd28ef20", size = 917485 },
+    { url = "https://files.pythonhosted.org/packages/88/e7/179cfda3a28bc843b5c6cfe7f79f23489c791ed95f151083803660878432/regex-2026.5.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6351571c8a42b505eb555c0dc47d740d0fb66977dc142919eea6f4325b7c56a0", size = 816369 },
+    { url = "https://files.pythonhosted.org/packages/41/90/6f0cc422071688266d344fca8462d787cba0a2c144acb25721f9a61ec265/regex-2026.5.9-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:002205cafd2a9e78c6290c7d1df277bf3277b3b7a30e0b4bb0dac2e2e3f7cb2d", size = 785869 },
+    { url = "https://files.pythonhosted.org/packages/02/67/a31f1760f09c27b251ef39e9beb541f462cf977381d067faa764c2c0e393/regex-2026.5.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8abd33fef90b2a9efac5557d6033ca82d1195ed3a15fea5af15ba7b463c6a63b", size = 801427 },
+    { url = "https://files.pythonhosted.org/packages/e3/c4/1a80654597b6bc1e1ea0494824c31200e8a956abe290afae9b19a166a148/regex-2026.5.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:31037c82eccb44b7ea2e9e221d7c01429430e989a1f4b91ea5a855f6017b509a", size = 866482 },
+    { url = "https://files.pythonhosted.org/packages/d1/11/960724e06482c08466ff5611e242e86f80062949cdf6b4b9cc317b9dd93d/regex-2026.5.9-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:5604dfd046dc37eca90250fc3be938b076c8059fa772ac0ed6f499b0f0fb0415", size = 773022 },
+    { url = "https://files.pythonhosted.org/packages/50/a8/a9979c3e7918280e93159ebcab5ef1a65116dd4f3bd6091be0eae4a126e8/regex-2026.5.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:0e1b1b4e496afbb24f4a62aba855ee4f88f25578927697b340702e48c9ee6bc2", size = 856642 },
+    { url = "https://files.pythonhosted.org/packages/fe/d4/a9b732f2f0072c0ab12227483abb24fffcb9f73f8a2b203df0a6d0434735/regex-2026.5.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:be3372b9df6ddecff6486d37e19095a7b4973137caf5512407a89f4455361f41", size = 803552 },
+    { url = "https://files.pythonhosted.org/packages/d5/fe/1b3113817447a1d4155e4ac76d2e072f42c0bcba2f43fa8a0e756ea2cd91/regex-2026.5.9-cp314-cp314t-win32.whl", hash = "sha256:3ddd90103f9e5c471c49c7852ecc1fe27c7e45eb99e977aefe7caa4e779f4f58", size = 275746 },
+    { url = "https://files.pythonhosted.org/packages/92/73/93d42045302636c91f2e5ef588b65b84b01428f28ec77de256b1dfdfbe5c/regex-2026.5.9-cp314-cp314t-win_amd64.whl", hash = "sha256:ca518ed29c46eecba6010b15f1b9a479314d2de409536e71b6a13aa04e3b8a77", size = 285685 },
+    { url = "https://files.pythonhosted.org/packages/da/80/35b4c33c804a165a7f55289afda3ea9e3eb6d15800341a2d66455c0f1f30/regex-2026.5.9-cp314-cp314t-win_arm64.whl", hash = "sha256:5e41809d2683fcde7d5a8c87a6567ba1fb1ce0de9f31bff578de00a4b2d76daa", size = 275713 },
+]
+
+[[package]]
+name = "reportlab"
+version = "4.5.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "charset-normalizer" },
+    { name = "pillow" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4d/3f/b3861b7e40c9d66f4a04e018958d681d16b948bfd1963c962d43a8c23f66/reportlab-4.5.1.tar.gz", hash = "sha256:9fdf68f4de9171ec66acb4a5feed8f8ca2af43479e707a6fbb0daa75d88e5494", size = 3939748 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/45/ea7fad10122440de6e845568d106bffdc456ca0e8a1d8ae10b46016087e4/reportlab-4.5.1-py3-none-any.whl", hash = "sha256:06fce8cb56c83307cfa4909cdf4e6a2ddbb44e5d6ef4d2edca896d7e9769f091", size = 1957812 },
+]
+
+[[package]]
+name = "requests"
+version = "2.34.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "charset-normalizer" },
+    { name = "idna" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/b8/7a707d60fea4c49094e40262cc0e2ca6c768cca21587e34d3f705afec47e/requests-2.34.0.tar.gz", hash = "sha256:7d62fe92f50eb82c529b0916bb445afa1531a566fc8f35ffdc64446e771b856a", size = 142436 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ef/e6/e300fce5fe83c30520607a015dabd985df3251e188d234bfe9492e17a389/requests-2.34.0-py3-none-any.whl", hash = "sha256:917520a21b767485ce7c588f4ebb917c436b24a31231b44228715eaeb5a52c60", size = 73021 },
+]
+
+[[package]]
+name = "respx"
+version = "0.23.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/98/4e55c9c486404ec12373708d015ebce157966965a5ebe7f28ff2c784d41b/respx-0.23.1.tar.gz", hash = "sha256:242dcc6ce6b5b9bf621f5870c82a63997e8e82bc7c947f9ffe272b8f3dd5a780", size = 29243 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/4a/221da6ca167db45693d8d26c7dc79ccfc978a440251bf6721c9aaf251ac0/respx-0.23.1-py2.py3-none-any.whl", hash = "sha256:b18004b029935384bccfa6d7d9d74b4ec9af73a081cc28600fffc0447f4b8c1a", size = 25557 },
+]
+
+[[package]]
+name = "rich"
+version = "15.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markdown-it-py" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654 },
+]
+
+[[package]]
+name = "ruff"
+version = "0.15.12"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/99/43/3291f1cc9106f4c63bdce7a8d0df5047fe8422a75b091c16b5e9355e0b11/ruff-0.15.12.tar.gz", hash = "sha256:ecea26adb26b4232c0c2ca19ccbc0083a68344180bba2a600605538ce51a40a6", size = 4643852 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/6e/e78ffb61d4686f3d96ba3df2c801161843746dcbcbb17a1e927d4829312b/ruff-0.15.12-py3-none-linux_armv6l.whl", hash = "sha256:f86f176e188e94d6bdbc09f09bfd9dc729059ad93d0e7390b5a73efe19f8861c", size = 10640713 },
+    { url = "https://files.pythonhosted.org/packages/ae/08/a317bc231fb9e7b93e4ef3089501e51922ff88d6936ce5cf870c4fe55419/ruff-0.15.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e3bcd123364c3770b8e1b7baaf343cc99a35f197c5c6e8af79015c666c423a6c", size = 11069267 },
+    { url = "https://files.pythonhosted.org/packages/aa/a4/f828e9718d3dce1f5f11c39c4f65afd32783c8b2aebb2e3d259e492c47bd/ruff-0.15.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fe87510d000220aa1ed530d4448a7c696a0cae1213e5ec30e5874287b66557b5", size = 10397182 },
+    { url = "https://files.pythonhosted.org/packages/71/e0/3310fc6d1b5e1fdea22bf3b1b807c7e187b581021b0d7d4514cccdb5fb71/ruff-0.15.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84a1630093121375a3e2a95b4a6dc7b59e2b4ee76216e32d81aae550a832d002", size = 10758012 },
+    { url = "https://files.pythonhosted.org/packages/11/c1/a606911aee04c324ddaa883ae418f3569792fd3c4a10c50e0dd0a2311e1e/ruff-0.15.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fb129f40f114f089ebe0ca56c0d251cf2061b17651d464bb6478dc01e69f11f5", size = 10447479 },
+    { url = "https://files.pythonhosted.org/packages/9d/68/4201e8444f0894f21ab4aeeaee68aa4f10b51613514a20d80bd628d57e88/ruff-0.15.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0c862b172d695db7598426b8af465e7e9ac00a3ea2a3630ee67eb82e366aaa6", size = 11234040 },
+    { url = "https://files.pythonhosted.org/packages/34/ff/8a6d6cf4ccc23fd67060874e832c18919d1557a0611ebef03fdb01fff11e/ruff-0.15.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2849ea9f3484c3aca43a82f484210370319e7170df4dfe4843395ddf6c57bc33", size = 12087377 },
+    { url = "https://files.pythonhosted.org/packages/85/f6/c669cf73f5152f623d34e69866a46d5e6185816b19fcd5b6dd8a2d299922/ruff-0.15.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9e77c7e51c07fe396826d5969a5b846d9cd4c402535835fb6e21ce8b28fef847", size = 11367784 },
+    { url = "https://files.pythonhosted.org/packages/e8/39/c61d193b8a1daaa8977f7dea9e8d8ba866e02ea7b65d32f6861693aa4c12/ruff-0.15.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83b2f4f2f3b1026b5fb449b467d9264bf22067b600f7b6f41fc5958909f449d0", size = 11344088 },
+    { url = "https://files.pythonhosted.org/packages/c2/8d/49afab3645e31e12c590acb6d3b5b69d7aab5b81926dbaf7461f9441f37a/ruff-0.15.12-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9ba3b8f1afd7e2e43d8943e55f249e13f9682fde09711644a6e7290eb4f3e339", size = 11271770 },
+    { url = "https://files.pythonhosted.org/packages/46/06/33f41fe94403e2b755481cdfb9b7ef3e4e0ed031c4581124658d935d52b4/ruff-0.15.12-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e852ba9fdc890655e1d78f2df1499efbe0e54126bd405362154a75e2bde159c5", size = 10719355 },
+    { url = "https://files.pythonhosted.org/packages/0d/59/18aa4e014debbf559670e4048e39260a85c7fcee84acfd761ac01e7b8d35/ruff-0.15.12-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dd8aed930da53780d22fc70bdf84452c843cf64f8cb4eb38984319c24c5cd5fd", size = 10462758 },
+    { url = "https://files.pythonhosted.org/packages/25/e7/cc9f16fd0f3b5fddcbd7ec3d6ae30c8f3fde1047f32a4093a98d633c6570/ruff-0.15.12-py3-none-musllinux_1_2_i686.whl", hash = "sha256:01da3988d225628b709493d7dc67c3b9b12c0210016b08690ef9bd27970b262b", size = 10953498 },
+    { url = "https://files.pythonhosted.org/packages/72/7a/a9ba7f98c7a575978698f4230c5e8cc54bbc761af34f560818f933dafa0c/ruff-0.15.12-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:9cae0f92bd5700d1213188b31cd3bdd2b315361296d10b96b8e2337d3d11f53e", size = 11447765 },
+    { url = "https://files.pythonhosted.org/packages/ea/f9/0ae446942c846b8266059ad8a30702a35afae55f5cdc54c5adf8d7afdc27/ruff-0.15.12-py3-none-win32.whl", hash = "sha256:d0185894e038d7043ba8fd6aee7499ece6462dc0ea9f1e260c7451807c714c20", size = 10657277 },
+    { url = "https://files.pythonhosted.org/packages/33/f1/9614e03e1cdcbf9437570b5400ced8a720b5db22b28d8e0f1bda429f660d/ruff-0.15.12-py3-none-win_amd64.whl", hash = "sha256:c87a162d61ab3adca47c03f7f717c68672edec7d1b5499e652331780fe74950d", size = 11837758 },
+    { url = "https://files.pythonhosted.org/packages/c0/98/6beb4b351e472e5f4c4613f7c35a5290b8be2497e183825310c4c3a3984b/ruff-0.15.12-py3-none-win_arm64.whl", hash = "sha256:a538f7a82d061cee7be55542aca1d86d1393d55d81d4fcc314370f4340930d4f", size = 11120821 },
+]
+
+[[package]]
+name = "scikit-learn"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "joblib" },
+    { name = "numpy" },
+    { name = "scipy" },
+    { name = "threadpoolctl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/90/74/e6a7cc4b820e95cc38cf36cd74d5aa2b42e8ffc2d21fe5a9a9c45c1c7630/scikit_learn-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5fb63362b5a7ddab88e52b6dbb47dac3fd7dafeee740dc6c8d8a446ddedade8e", size = 8548242 },
+    { url = "https://files.pythonhosted.org/packages/49/d8/9be608c6024d021041c7f0b3928d4749a706f4e2c3832bbede4fb4f58c95/scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5025ce924beccb28298246e589c691fe1b8c1c96507e6d27d12c5fadd85bfd76", size = 8079075 },
+    { url = "https://files.pythonhosted.org/packages/dd/47/f187b4636ff80cc63f21cd40b7b2d177134acaa10f6bb73746130ee8c2e5/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4496bb2cf7a43ce1a2d7524a79e40bc5da45cf598dbf9545b7e8316ccba47bb4", size = 8660492 },
+    { url = "https://files.pythonhosted.org/packages/97/74/b7a304feb2b49df9fafa9382d4d09061a96ee9a9449a7cbea7988dda0828/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0bcfe4d0d14aec44921545fd2af2338c7471de9cb701f1da4c9d85906ab847a", size = 8931904 },
+    { url = "https://files.pythonhosted.org/packages/9f/c4/0ab22726a04ede56f689476b760f98f8f46607caecff993017ac1b64aa5d/scikit_learn-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:35c007dedb2ffe38fe3ee7d201ebac4a2deccd2408e8621d53067733e3c74809", size = 8019359 },
+    { url = "https://files.pythonhosted.org/packages/24/90/344a67811cfd561d7335c1b96ca21455e7e472d281c3c279c4d3f2300236/scikit_learn-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:8c497fff237d7b4e07e9ef1a640887fa4fb765647f86fbe00f969ff6280ce2bb", size = 7641898 },
+    { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770 },
+    { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458 },
+    { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341 },
+    { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022 },
+    { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409 },
+    { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760 },
+    { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045 },
+    { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324 },
+    { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651 },
+    { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045 },
+    { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994 },
+    { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518 },
+    { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667 },
+    { url = "https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524 },
+    { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133 },
+    { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223 },
+    { url = "https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518 },
+    { url = "https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546 },
+    { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305 },
+    { url = "https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257 },
+    { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673 },
+    { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467 },
+    { url = "https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395 },
+    { url = "https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647 },
+]
+
+[[package]]
+name = "scipy"
+version = "1.17.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954 },
+    { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662 },
+    { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366 },
+    { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017 },
+    { url = "https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842 },
+    { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890 },
+    { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557 },
+    { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856 },
+    { url = "https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682 },
+    { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340 },
+    { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199 },
+    { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001 },
+    { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719 },
+    { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595 },
+    { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429 },
+    { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952 },
+    { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063 },
+    { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449 },
+    { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943 },
+    { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621 },
+    { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708 },
+    { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135 },
+    { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977 },
+    { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601 },
+    { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667 },
+    { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159 },
+    { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771 },
+    { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910 },
+    { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980 },
+    { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543 },
+    { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510 },
+    { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131 },
+    { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032 },
+    { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766 },
+    { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007 },
+    { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333 },
+    { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066 },
+    { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763 },
+    { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984 },
+    { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877 },
+    { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750 },
+    { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858 },
+    { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723 },
+    { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098 },
+    { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397 },
+    { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163 },
+    { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291 },
+    { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317 },
+    { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327 },
+    { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165 },
+]
+
+[[package]]
+name = "setuptools"
+version = "82.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = "sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223 },
+]
+
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755 },
+]
+
+[[package]]
+name = "six"
+version = "1.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 },
+]
+
+[[package]]
+name = "sqlalchemy"
+version = "2.0.49"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "greenlet", marker = "platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/09/45/461788f35e0364a8da7bda51a1fe1b09762d0c32f12f63727998d85a873b/sqlalchemy-2.0.49.tar.gz", hash = "sha256:d15950a57a210e36dd4cec1aac22787e2a4d57ba9318233e2ef8b2daf9ff2d5f", size = 9898221 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/49/b3/2de412451330756aaaa72d27131db6dde23995efe62c941184e15242a5fa/sqlalchemy-2.0.49-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4bbccb45260e4ff1b7db0be80a9025bb1e6698bdb808b83fff0000f7a90b2c0b", size = 2157681 },
+    { url = "https://files.pythonhosted.org/packages/50/84/b2a56e2105bd11ebf9f0b93abddd748e1a78d592819099359aa98134a8bf/sqlalchemy-2.0.49-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb37f15714ec2652d574f021d479e78cd4eb9d04396dca36568fdfffb3487982", size = 3338976 },
+    { url = "https://files.pythonhosted.org/packages/2c/fa/65fcae2ed62f84ab72cf89536c7c3217a156e71a2c111b1305ab6f0690e2/sqlalchemy-2.0.49-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bb9ec6436a820a4c006aad1ac351f12de2f2dbdaad171692ee457a02429b672", size = 3351937 },
+    { url = "https://files.pythonhosted.org/packages/f8/2f/6fd118563572a7fe475925742eb6b3443b2250e346a0cc27d8d408e73773/sqlalchemy-2.0.49-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8d6efc136f44a7e8bc8088507eaabbb8c2b55b3dbb63fe102c690da0ddebe55e", size = 3281646 },
+    { url = "https://files.pythonhosted.org/packages/c5/d7/410f4a007c65275b9cf82354adb4bb8ba587b176d0a6ee99caa16fe638f8/sqlalchemy-2.0.49-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e06e617e3d4fd9e51d385dfe45b077a41e9d1b033a7702551e3278ac597dc750", size = 3316695 },
+    { url = "https://files.pythonhosted.org/packages/d9/95/81f594aa60ded13273a844539041ccf1e66c5a7bed0a8e27810a3b52d522/sqlalchemy-2.0.49-cp312-cp312-win32.whl", hash = "sha256:83101a6930332b87653886c01d1ee7e294b1fe46a07dd9a2d2b4f91bcc88eec0", size = 2117483 },
+    { url = "https://files.pythonhosted.org/packages/47/9e/fd90114059175cac64e4fafa9bf3ac20584384d66de40793ae2e2f26f3bb/sqlalchemy-2.0.49-cp312-cp312-win_amd64.whl", hash = "sha256:618a308215b6cececb6240b9abde545e3acdabac7ae3e1d4e666896bf5ba44b4", size = 2144494 },
+    { url = "https://files.pythonhosted.org/packages/ae/81/81755f50eb2478eaf2049728491d4ea4f416c1eb013338682173259efa09/sqlalchemy-2.0.49-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df2d441bacf97022e81ad047e1597552eb3f83ca8a8f1a1fdd43cd7fe3898120", size = 2154547 },
+    { url = "https://files.pythonhosted.org/packages/a2/bc/3494270da80811d08bcfa247404292428c4fe16294932bce5593f215cad9/sqlalchemy-2.0.49-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8e20e511dc15265fb433571391ba313e10dd8ea7e509d51686a51313b4ac01a2", size = 3280782 },
+    { url = "https://files.pythonhosted.org/packages/cd/f5/038741f5e747a5f6ea3e72487211579d8cbea5eb9827a9cbd61d0108c4bd/sqlalchemy-2.0.49-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47604cb2159f8bbd5a1ab48a714557156320f20871ee64d550d8bf2683d980d3", size = 3297156 },
+    { url = "https://files.pythonhosted.org/packages/88/50/a6af0ff9dc954b43a65ca9b5367334e45d99684c90a3d3413fc19a02d43c/sqlalchemy-2.0.49-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:22d8798819f86720bc646ab015baff5ea4c971d68121cb36e2ebc2ee43ead2b7", size = 3228832 },
+    { url = "https://files.pythonhosted.org/packages/bc/d1/5f6bdad8de0bf546fc74370939621396515e0cdb9067402d6ba1b8afbe9a/sqlalchemy-2.0.49-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9b1c058c171b739e7c330760044803099c7fff11511e3ab3573e5327116a9c33", size = 3267000 },
+    { url = "https://files.pythonhosted.org/packages/f7/30/ad62227b4a9819a5e1c6abff77c0f614fa7c9326e5a3bdbee90f7139382b/sqlalchemy-2.0.49-cp313-cp313-win32.whl", hash = "sha256:a143af2ea6672f2af3f44ed8f9cd020e9cc34c56f0e8db12019d5d9ecf41cb3b", size = 2115641 },
+    { url = "https://files.pythonhosted.org/packages/17/3a/7215b1b7d6d49dc9a87211be44562077f5f04f9bb5a59552c1c8e2d98173/sqlalchemy-2.0.49-cp313-cp313-win_amd64.whl", hash = "sha256:12b04d1db2663b421fe072d638a138460a51d5a862403295671c4f3987fb9148", size = 2141498 },
+    { url = "https://files.pythonhosted.org/packages/28/4b/52a0cb2687a9cd1648252bb257be5a1ba2c2ded20ba695c65756a55a15a4/sqlalchemy-2.0.49-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24bd94bb301ec672d8f0623eba9226cc90d775d25a0c92b5f8e4965d7f3a1518", size = 3560807 },
+    { url = "https://files.pythonhosted.org/packages/8c/d8/fda95459204877eed0458550d6c7c64c98cc50c2d8d618026737de9ed41a/sqlalchemy-2.0.49-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a51d3db74ba489266ef55c7a4534eb0b8db9a326553df481c11e5d7660c8364d", size = 3527481 },
+    { url = "https://files.pythonhosted.org/packages/ff/0a/2aac8b78ac6487240cf7afef8f203ca783e8796002dc0cf65c4ee99ff8bb/sqlalchemy-2.0.49-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:55250fe61d6ebfd6934a272ee16ef1244e0f16b7af6cd18ab5b1fc9f08631db0", size = 3468565 },
+    { url = "https://files.pythonhosted.org/packages/a5/3d/ce71cfa82c50a373fd2148b3c870be05027155ce791dc9a5dcf439790b8b/sqlalchemy-2.0.49-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:46796877b47034b559a593d7e4b549aba151dae73f9e78212a3478161c12ab08", size = 3477769 },
+    { url = "https://files.pythonhosted.org/packages/d5/e8/0a9f5c1f7c6f9ca480319bf57c2d7423f08d31445974167a27d14483c948/sqlalchemy-2.0.49-cp313-cp313t-win32.whl", hash = "sha256:9c4969a86e41454f2858256c39bdfb966a20961e9b58bf8749b65abf447e9a8d", size = 2143319 },
+    { url = "https://files.pythonhosted.org/packages/0e/51/fb5240729fbec73006e137c4f7a7918ffd583ab08921e6ff81a999d6517a/sqlalchemy-2.0.49-cp313-cp313t-win_amd64.whl", hash = "sha256:b9870d15ef00e4d0559ae10ee5bc71b654d1f20076dbe8bc7ed19b4c0625ceba", size = 2175104 },
+    { url = "https://files.pythonhosted.org/packages/55/33/bf28f618c0a9597d14e0b9ee7d1e0622faff738d44fe986ee287cdf1b8d0/sqlalchemy-2.0.49-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:233088b4b99ebcbc5258c755a097aa52fbf90727a03a5a80781c4b9c54347a2e", size = 2156356 },
+    { url = "https://files.pythonhosted.org/packages/d1/a7/5f476227576cb8644650eff68cc35fa837d3802b997465c96b8340ced1e2/sqlalchemy-2.0.49-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57ca426a48eb2c682dae8204cd89ea8ab7031e2675120a47924fabc7caacbc2a", size = 3276486 },
+    { url = "https://files.pythonhosted.org/packages/2e/84/efc7c0bf3a1c5eef81d397f6fddac855becdbb11cb38ff957888603014a7/sqlalchemy-2.0.49-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:685e93e9c8f399b0c96a624799820176312f5ceef958c0f88215af4013d29066", size = 3281479 },
+    { url = "https://files.pythonhosted.org/packages/91/68/bb406fa4257099c67bd75f3f2261b129c63204b9155de0d450b37f004698/sqlalchemy-2.0.49-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9e0400fa22f79acc334d9a6b185dc00a44a8e6578aa7e12d0ddcd8434152b187", size = 3226269 },
+    { url = "https://files.pythonhosted.org/packages/67/84/acb56c00cca9f251f437cb49e718e14f7687505749ea9255d7bd8158a6df/sqlalchemy-2.0.49-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a05977bffe9bffd2229f477fa75eabe3192b1b05f408961d1bebff8d1cd4d401", size = 3248260 },
+    { url = "https://files.pythonhosted.org/packages/56/19/6a20ea25606d1efd7bd1862149bb2a22d1451c3f851d23d887969201633f/sqlalchemy-2.0.49-cp314-cp314-win32.whl", hash = "sha256:0f2fa354ba106eafff2c14b0cc51f22801d1e8b2e4149342023bd6f0955de5f5", size = 2118463 },
+    { url = "https://files.pythonhosted.org/packages/cf/4f/8297e4ed88e80baa1f5aa3c484a0ee29ef3c69c7582f206c916973b75057/sqlalchemy-2.0.49-cp314-cp314-win_amd64.whl", hash = "sha256:77641d299179c37b89cf2343ca9972c88bb6eef0d5fc504a2f86afd15cd5adf5", size = 2144204 },
+    { url = "https://files.pythonhosted.org/packages/1f/33/95e7216df810c706e0cd3655a778604bbd319ed4f43333127d465a46862d/sqlalchemy-2.0.49-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c1dc3368794d522f43914e03312202523cc89692f5389c32bea0233924f8d977", size = 3565474 },
+    { url = "https://files.pythonhosted.org/packages/0c/a4/ed7b18d8ccf7f954a83af6bb73866f5bc6f5636f44c7731fbb741f72cc4f/sqlalchemy-2.0.49-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7c821c47ecfe05cc32140dcf8dc6fd5d21971c86dbd56eabfe5ba07a64910c01", size = 3530567 },
+    { url = "https://files.pythonhosted.org/packages/73/a3/20faa869c7e21a827c4a2a42b41353a54b0f9f5e96df5087629c306df71e/sqlalchemy-2.0.49-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9c04bff9a5335eb95c6ecf1c117576a0aa560def274876fd156cfe5510fccc61", size = 3474282 },
+    { url = "https://files.pythonhosted.org/packages/b7/50/276b9a007aa0764304ad467eceb70b04822dc32092492ee5f322d559a4dc/sqlalchemy-2.0.49-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7f605a456948c35260e7b2a39f8952a26f077fd25653c37740ed186b90aaa68a", size = 3480406 },
+    { url = "https://files.pythonhosted.org/packages/e5/c3/c80fcdb41905a2df650c2a3e0337198b6848876e63d66fe9188ef9003d24/sqlalchemy-2.0.49-cp314-cp314t-win32.whl", hash = "sha256:6270d717b11c5476b0cbb21eedc8d4dbb7d1a956fd6c15a23e96f197a6193158", size = 2149151 },
+    { url = "https://files.pythonhosted.org/packages/05/52/9f1a62feab6ed368aff068524ff414f26a6daebc7361861035ae00b05530/sqlalchemy-2.0.49-cp314-cp314t-win_amd64.whl", hash = "sha256:275424295f4256fd301744b8f335cff367825d270f155d522b30c7bf49903ee7", size = 2184178 },
+    { url = "https://files.pythonhosted.org/packages/e5/30/8519fdde58a7bdf155b714359791ad1dc018b47d60269d5d160d311fdc36/sqlalchemy-2.0.49-py3-none-any.whl", hash = "sha256:ec44cfa7ef1a728e88ad41674de50f6db8cfdb3e2af84af86e0041aaf02d43d0", size = 1942158 },
+]
+
+[package.optional-dependencies]
+asyncio = [
+    { name = "greenlet" },
+]
+
+[[package]]
+name = "surfsense-evals"
+version = "0.1.0"
+source = { editable = "." }
+dependencies = [
+    { name = "azure-ai-documentintelligence" },
+    { name = "datasets" },
+    { name = "httpx" },
+    { name = "httpx-sse" },
+    { name = "huggingface-hub" },
+    { name = "llama-cloud-services" },
+    { name = "numpy" },
+    { name = "pillow" },
+    { name = "pyarrow" },
+    { name = "pydantic" },
+    { name = "pypdf" },
+    { name = "python-dotenv" },
+    { name = "reportlab" },
+    { name = "rich" },
+    { name = "scikit-learn" },
+    { name = "scipy" },
+    { name = "tqdm" },
+    { name = "trafilatura" },
+]
+
+[package.optional-dependencies]
+dev = [
+    { name = "pytest" },
+    { name = "pytest-asyncio" },
+    { name = "respx" },
+    { name = "ruff" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
+    { name = "datasets", specifier = ">=2.21.0" },
+    { name = "httpx", specifier = ">=0.27.0" },
+    { name = "httpx-sse", specifier = ">=0.4.0" },
+    { name = "huggingface-hub", specifier = ">=0.24.0" },
+    { name = "llama-cloud-services", specifier = ">=0.6.25" },
+    { name = "numpy", specifier = ">=1.26.0" },
+    { name = "pillow", specifier = ">=10.0.0" },
+    { name = "pyarrow", specifier = ">=15.0.0" },
+    { name = "pydantic", specifier = ">=2.6.0" },
+    { name = "pypdf", specifier = ">=5.1.0" },
+    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
+    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" },
+    { name = "python-dotenv", specifier = ">=1.0.0" },
+    { name = "reportlab", specifier = ">=4.0.0" },
+    { name = "respx", marker = "extra == 'dev'", specifier = ">=0.21.0" },
+    { name = "rich", specifier = ">=13.7.0" },
+    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.5.0" },
+    { name = "scikit-learn", specifier = ">=1.4.0" },
+    { name = "scipy", specifier = ">=1.12.0" },
+    { name = "tqdm", specifier = ">=4.66.0" },
+    { name = "trafilatura", specifier = ">=1.12.0" },
+]
+provides-extras = ["dev"]
+
+[[package]]
+name = "tenacity"
+version = "9.1.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926 },
+]
+
+[[package]]
+name = "threadpoolctl"
+version = "3.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638 },
+]
+
+[[package]]
+name = "tiktoken"
+version = "0.12.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "regex" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728 },
+    { url = "https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049 },
+    { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008 },
+    { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665 },
+    { url = "https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230 },
+    { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688 },
+    { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694 },
+    { url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802 },
+    { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995 },
+    { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948 },
+    { url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986 },
+    { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222 },
+    { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097 },
+    { url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117 },
+    { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309 },
+    { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712 },
+    { url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725 },
+    { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875 },
+    { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451 },
+    { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794 },
+    { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777 },
+    { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188 },
+    { url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978 },
+    { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271 },
+    { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216 },
+    { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860 },
+    { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567 },
+    { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067 },
+    { url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473 },
+    { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855 },
+    { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022 },
+    { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736 },
+    { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908 },
+    { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706 },
+    { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667 },
+]
+
+[[package]]
+name = "tinytag"
+version = "2.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/96/59/8a8cb2331e2602b53e4dc06960f57d1387a2b18e7efd24e5f9cb60ea4925/tinytag-2.2.1.tar.gz", hash = "sha256:e6d06610ebe7cd66fd07be2d3b9495914ab32654a5e47657bb8cd44c2484523c", size = 38214 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ce/34/d50e338631baaf65ec5396e70085e5de0b52b24b28db1ffbc1c6e82190dc/tinytag-2.2.1-py3-none-any.whl", hash = "sha256:ed8b1e6d25367937e3321e054f4974f9abfde1a3e0a538824c87da377130c2b6", size = 32927 },
+]
+
+[[package]]
+name = "tld"
+version = "0.13.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/5c/5d/76b4383ac4e5b5e254e50c09807b3e13820bed6d6c11cd540264988d6802/tld-0.13.2.tar.gz", hash = "sha256:d983fa92b9d717400742fca844e29d5e18271079c7bcfabf66d01b39b4a14345", size = 467175 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9e/90/39a85a4b63c84213e78b3c17d22e1bf45328acf8ebb33ef93be30d0a3911/tld-0.13.2-py2.py3-none-any.whl", hash = "sha256:9b8fdbdb880e7ba65b216a4937f2c94c49a7226723783d5838fc958ac76f4e0c", size = 296743 },
+]
+
+[[package]]
+name = "tqdm"
+version = "4.67.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374 },
+]
+
+[[package]]
+name = "trafilatura"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "charset-normalizer" },
+    { name = "courlan" },
+    { name = "htmldate" },
+    { name = "justext" },
+    { name = "lxml" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/25/e3ebeefdebfdfae8c4a4396f5a6ea51fc6fa0831d63ce338e5090a8003dc/trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247", size = 253404 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8a/b6/097367f180b6383a3581ca1b86fcae284e52075fa941d1232df35293363c/trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d", size = 132557 },
+]
+
+[[package]]
+name = "typer"
+version = "0.25.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-doc" },
+    { name = "click" },
+    { name = "rich" },
+    { name = "shellingham" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e4/51/9aed62104cea109b820bbd6c14245af756112017d309da813ef107d42e7e/typer-0.25.1.tar.gz", hash = "sha256:9616eb8853a09ffeabab1698952f33c6f29ffdbceb4eaeecf571880e8d7664cc", size = 122276 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3f/f9/2b3ff4e56e5fa7debfaf9eb135d0da96f3e9a1d5b27222223c7296336e5f/typer-0.25.1-py3-none-any.whl", hash = "sha256:75caa44ed46a03fb2dab8808753ffacdbfea88495e74c85a28c5eefcf5f39c89", size = 58409 },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614 },
+]
+
+[[package]]
+name = "typing-inspect"
+version = "0.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mypy-extensions" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dc/74/1789779d91f1961fa9438e9a8710cdae6bd138c80d7303996933d117264a/typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78", size = 13825 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/65/f3/107a22063bf27bdccf2024833d3445f4eea42b2e598abfbd46f6a63b6cb0/typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f", size = 8827 },
+]
+
+[[package]]
+name = "typing-inspection"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611 },
+]
+
+[[package]]
+name = "tzdata"
+version = "2026.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/19/1b9b0e29f30c6d35cb345486df41110984ea67ae69dddbc0e8a100999493/tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10", size = 198254 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ce/e4/dccd7f47c4b64213ac01ef921a1337ee6e30e8c6466046018326977efd95/tzdata-2026.2-py2.py3-none-any.whl", hash = "sha256:bbe9af844f658da81a5f95019480da3a89415801f6cc966806612cc7169bffe7", size = 349321 },
+]
+
+[[package]]
+name = "tzlocal"
+version = "5.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "tzdata", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026 },
+]
+
+[[package]]
+name = "urllib3"
+version = "2.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087 },
+]
+
+[[package]]
+name = "wrapt"
+version = "2.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255 },
+    { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848 },
+    { url = "https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433 },
+    { url = "https://files.pythonhosted.org/packages/3a/9f/742c7c7cdf58b59085a1ee4b6c37b013f66ac33673a7ef4aaed5e992bc33/wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8", size = 123013 },
+    { url = "https://files.pythonhosted.org/packages/e8/44/2c3dd45d53236b7ed7c646fcf212251dc19e48e599debd3926b52310fafb/wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c", size = 117326 },
+    { url = "https://files.pythonhosted.org/packages/74/e2/b17d66abc26bd96f89dec0ecd0ef03da4a1286e6ff793839ec431b9fae57/wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c", size = 121444 },
+    { url = "https://files.pythonhosted.org/packages/3c/62/e2977843fdf9f03daf1586a0ff49060b1b2fc7ff85a7ea82b6217c1ae36e/wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1", size = 116237 },
+    { url = "https://files.pythonhosted.org/packages/88/dd/27fc67914e68d740bce512f11734aec08696e6b17641fef8867c00c949fc/wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2", size = 120563 },
+    { url = "https://files.pythonhosted.org/packages/ec/9f/b750b3692ed2ef4705cb305bd68858e73010492b80e43d2a4faa5573cbe7/wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0", size = 58198 },
+    { url = "https://files.pythonhosted.org/packages/8e/b2/feecfe29f28483d888d76a48f03c4c4d8afea944dbee2b0cd3380f9df032/wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63", size = 60441 },
+    { url = "https://files.pythonhosted.org/packages/44/e1/e328f605d6e208547ea9fd120804fcdec68536ac748987a68c47c606eea8/wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf", size = 58836 },
+    { url = "https://files.pythonhosted.org/packages/4c/7a/d936840735c828b38d26a854e85d5338894cda544cb7a85a9d5b8b9c4df7/wrapt-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787fd6f4d67befa6fe2abdffcbd3de2d82dfc6fb8a6d850407c53332709d030b", size = 61259 },
+    { url = "https://files.pythonhosted.org/packages/5e/88/9a9b9a90ac8ca11c2fdb6a286cb3a1fc7dd774c00ed70929a6434f6bc634/wrapt-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4bdf26e03e6d0da3f0e9422fd36bcebf7bc0eeb55fdf9c727a09abc6b9fe472e", size = 61851 },
+    { url = "https://files.pythonhosted.org/packages/03/a9/5b7d6a16fd6533fed2756900fc8fc923f678179aea62ada6d65c92718c00/wrapt-2.1.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bbac24d879aa22998e87f6b3f481a5216311e7d53c7db87f189a7a0266dafffb", size = 121446 },
+    { url = "https://files.pythonhosted.org/packages/45/bb/34c443690c847835cfe9f892be78c533d4f32366ad2888972c094a897e39/wrapt-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16997dfb9d67addc2e3f41b62a104341e80cac52f91110dece393923c0ebd5ca", size = 123056 },
+    { url = "https://files.pythonhosted.org/packages/93/b9/ff205f391cb708f67f41ea148545f2b53ff543a7ac293b30d178af4d2271/wrapt-2.1.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:162e4e2ba7542da9027821cb6e7c5e068d64f9a10b5f15512ea28e954893a267", size = 117359 },
+    { url = "https://files.pythonhosted.org/packages/1f/3d/1ea04d7747825119c3c9a5e0874a40b33594ada92e5649347c457d982805/wrapt-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f29c827a8d9936ac320746747a016c4bc66ef639f5cd0d32df24f5eacbf9c69f", size = 121479 },
+    { url = "https://files.pythonhosted.org/packages/78/cc/ee3a011920c7a023b25e8df26f306b2484a531ab84ca5c96260a73de76c0/wrapt-2.1.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:a9dd9813825f7ecb018c17fd147a01845eb330254dff86d3b5816f20f4d6aaf8", size = 116271 },
+    { url = "https://files.pythonhosted.org/packages/98/fd/e5ff7ded41b76d802cf1191288473e850d24ba2e39a6ec540f21ae3b57cb/wrapt-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f8dbdd3719e534860d6a78526aafc220e0241f981367018c2875178cf83a413", size = 120573 },
+    { url = "https://files.pythonhosted.org/packages/47/c5/242cae3b5b080cd09bacef0591691ba1879739050cc7c801ff35c8886b66/wrapt-2.1.2-cp313-cp313-win32.whl", hash = "sha256:5c35b5d82b16a3bc6e0a04349b606a0582bc29f573786aebe98e0c159bc48db6", size = 58205 },
+    { url = "https://files.pythonhosted.org/packages/12/69/c358c61e7a50f290958809b3c61ebe8b3838ea3e070d7aac9814f95a0528/wrapt-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f8bc1c264d8d1cf5b3560a87bbdd31131573eb25f9f9447bb6252b8d4c44a3a1", size = 60452 },
+    { url = "https://files.pythonhosted.org/packages/8e/66/c8a6fcfe321295fd8c0ab1bd685b5a01462a9b3aa2f597254462fc2bc975/wrapt-2.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:3beb22f674550d5634642c645aba4c72a2c66fb185ae1aebe1e955fae5a13baf", size = 58842 },
+    { url = "https://files.pythonhosted.org/packages/da/55/9c7052c349106e0b3f17ae8db4b23a691a963c334de7f9dbd60f8f74a831/wrapt-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fc04bc8664a8bc4c8e00b37b5355cffca2535209fba1abb09ae2b7c76ddf82b", size = 63075 },
+    { url = "https://files.pythonhosted.org/packages/09/a8/ce7b4006f7218248dd71b7b2b732d0710845a0e49213b18faef64811ffef/wrapt-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a9b9d50c9af998875a1482a038eb05755dfd6fe303a313f6a940bb53a83c3f18", size = 63719 },
+    { url = "https://files.pythonhosted.org/packages/e4/e5/2ca472e80b9e2b7a17f106bb8f9df1db11e62101652ce210f66935c6af67/wrapt-2.1.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d3ff4f0024dd224290c0eabf0240f1bfc1f26363431505fb1b0283d3b08f11d", size = 152643 },
+    { url = "https://files.pythonhosted.org/packages/36/42/30f0f2cefca9d9cbf6835f544d825064570203c3e70aa873d8ae12e23791/wrapt-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3278c471f4468ad544a691b31bb856374fbdefb7fee1a152153e64019379f015", size = 158805 },
+    { url = "https://files.pythonhosted.org/packages/bb/67/d08672f801f604889dcf58f1a0b424fe3808860ede9e03affc1876b295af/wrapt-2.1.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8914c754d3134a3032601c6984db1c576e6abaf3fc68094bb8ab1379d75ff92", size = 145990 },
+    { url = "https://files.pythonhosted.org/packages/68/a7/fd371b02e73babec1de6ade596e8cd9691051058cfdadbfd62a5898f3295/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ff95d4264e55839be37bafe1536db2ab2de19da6b65f9244f01f332b5286cfbf", size = 155670 },
+    { url = "https://files.pythonhosted.org/packages/86/2d/9fe0095dfdb621009f40117dcebf41d7396c2c22dca6eac779f4c007b86c/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:76405518ca4e1b76fbb1b9f686cff93aebae03920cc55ceeec48ff9f719c5f67", size = 144357 },
+    { url = "https://files.pythonhosted.org/packages/0e/b6/ec7b4a254abbe4cde9fa15c5d2cca4518f6b07d0f1b77d4ee9655e30280e/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c0be8b5a74c5824e9359b53e7e58bef71a729bacc82e16587db1c4ebc91f7c5a", size = 150269 },
+    { url = "https://files.pythonhosted.org/packages/6e/6b/2fabe8ebf148f4ee3c782aae86a795cc68ffe7d432ef550f234025ce0cfa/wrapt-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:f01277d9a5fc1862f26f7626da9cf443bebc0abd2f303f41c5e995b15887dabd", size = 59894 },
+    { url = "https://files.pythonhosted.org/packages/ca/fb/9ba66fc2dedc936de5f8073c0217b5d4484e966d87723415cc8262c5d9c2/wrapt-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:84ce8f1c2104d2f6daa912b1b5b039f331febfeee74f8042ad4e04992bd95c8f", size = 63197 },
+    { url = "https://files.pythonhosted.org/packages/c0/1c/012d7423c95d0e337117723eb8ecf73c622ce15a97847e84cf3f8f26cd7e/wrapt-2.1.2-cp313-cp313t-win_arm64.whl", hash = "sha256:a93cd767e37faeddbe07d8fc4212d5cba660af59bdb0f6372c93faaa13e6e679", size = 60363 },
+    { url = "https://files.pythonhosted.org/packages/39/25/e7ea0b417db02bb796182a5316398a75792cd9a22528783d868755e1f669/wrapt-2.1.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1370e516598854e5b4366e09ce81e08bfe94d42b0fd569b88ec46cc56d9164a9", size = 61418 },
+    { url = "https://files.pythonhosted.org/packages/ec/0f/fa539e2f6a770249907757eaeb9a5ff4deb41c026f8466c1c6d799088a9b/wrapt-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6de1a3851c27e0bd6a04ca993ea6f80fc53e6c742ee1601f486c08e9f9b900a9", size = 61914 },
+    { url = "https://files.pythonhosted.org/packages/53/37/02af1867f5b1441aaeda9c82deed061b7cd1372572ddcd717f6df90b5e93/wrapt-2.1.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:de9f1a2bbc5ac7f6012ec24525bdd444765a2ff64b5985ac6e0692144838542e", size = 120417 },
+    { url = "https://files.pythonhosted.org/packages/c3/b7/0138a6238c8ba7476c77cf786a807f871672b37f37a422970342308276e7/wrapt-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:970d57ed83fa040d8b20c52fe74a6ae7e3775ae8cff5efd6a81e06b19078484c", size = 122797 },
+    { url = "https://files.pythonhosted.org/packages/e1/ad/819ae558036d6a15b7ed290d5b14e209ca795dd4da9c58e50c067d5927b0/wrapt-2.1.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3969c56e4563c375861c8df14fa55146e81ac11c8db49ea6fb7f2ba58bc1ff9a", size = 117350 },
+    { url = "https://files.pythonhosted.org/packages/8b/2d/afc18dc57a4600a6e594f77a9ae09db54f55ba455440a54886694a84c71b/wrapt-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:57d7c0c980abdc5f1d98b11a2aa3bb159790add80258c717fa49a99921456d90", size = 121223 },
+    { url = "https://files.pythonhosted.org/packages/b9/5b/5ec189b22205697bc56eb3b62aed87a1e0423e9c8285d0781c7a83170d15/wrapt-2.1.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:776867878e83130c7a04237010463372e877c1c994d449ca6aaafeab6aab2586", size = 116287 },
+    { url = "https://files.pythonhosted.org/packages/f7/2d/f84939a7c9b5e6cdd8a8d0f6a26cabf36a0f7e468b967720e8b0cd2bdf69/wrapt-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fab036efe5464ec3291411fabb80a7a39e2dd80bae9bcbeeca5087fdfa891e19", size = 119593 },
+    { url = "https://files.pythonhosted.org/packages/0b/fe/ccd22a1263159c4ac811ab9374c061bcb4a702773f6e06e38de5f81a1bdc/wrapt-2.1.2-cp314-cp314-win32.whl", hash = "sha256:e6ed62c82ddf58d001096ae84ce7f833db97ae2263bff31c9b336ba8cfe3f508", size = 58631 },
+    { url = "https://files.pythonhosted.org/packages/65/0a/6bd83be7bff2e7efaac7b4ac9748da9d75a34634bbbbc8ad077d527146df/wrapt-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:467e7c76315390331c67073073d00662015bb730c566820c9ca9b54e4d67fd04", size = 60875 },
+    { url = "https://files.pythonhosted.org/packages/6c/c0/0b3056397fe02ff80e5a5d72d627c11eb885d1ca78e71b1a5c1e8c7d45de/wrapt-2.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:da1f00a557c66225d53b095a97eace0fc5349e3bfda28fa34ffae238978ee575", size = 59164 },
+    { url = "https://files.pythonhosted.org/packages/71/ed/5d89c798741993b2371396eb9d4634f009ff1ad8a6c78d366fe2883ea7a6/wrapt-2.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:62503ffbc2d3a69891cf29beeaccdb4d5e0a126e2b6a851688d4777e01428dbb", size = 63163 },
+    { url = "https://files.pythonhosted.org/packages/c6/8c/05d277d182bf36b0a13d6bd393ed1dec3468a25b59d01fba2dd70fe4d6ae/wrapt-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c7e6cd120ef837d5b6f860a6ea3745f8763805c418bb2f12eeb1fa6e25f22d22", size = 63723 },
+    { url = "https://files.pythonhosted.org/packages/f4/27/6c51ec1eff4413c57e72d6106bb8dec6f0c7cdba6503d78f0fa98767bcc9/wrapt-2.1.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3769a77df8e756d65fbc050333f423c01ae012b4f6731aaf70cf2bef61b34596", size = 152652 },
+    { url = "https://files.pythonhosted.org/packages/db/4c/d7dd662d6963fc7335bfe29d512b02b71cdfa23eeca7ab3ac74a67505deb/wrapt-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a76d61a2e851996150ba0f80582dd92a870643fa481f3b3846f229de88caf044", size = 158807 },
+    { url = "https://files.pythonhosted.org/packages/b4/4d/1e5eea1a78d539d346765727422976676615814029522c76b87a95f6bcdd/wrapt-2.1.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6f97edc9842cf215312b75fe737ee7c8adda75a89979f8e11558dfff6343cc4b", size = 146061 },
+    { url = "https://files.pythonhosted.org/packages/89/bc/62cabea7695cd12a288023251eeefdcb8465056ddaab6227cb78a2de005b/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4006c351de6d5007aa33a551f600404ba44228a89e833d2fadc5caa5de8edfbf", size = 155667 },
+    { url = "https://files.pythonhosted.org/packages/e9/99/6f2888cd68588f24df3a76572c69c2de28287acb9e1972bf0c83ce97dbc1/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a9372fc3639a878c8e7d87e1556fa209091b0a66e912c611e3f833e2c4202be2", size = 144392 },
+    { url = "https://files.pythonhosted.org/packages/40/51/1dfc783a6c57971614c48e361a82ca3b6da9055879952587bc99fe1a7171/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3144b027ff30cbd2fca07c0a87e67011adb717eb5f5bd8496325c17e454257a3", size = 150296 },
+    { url = "https://files.pythonhosted.org/packages/6c/38/cbb8b933a0201076c1f64fc42883b0023002bdc14a4964219154e6ff3350/wrapt-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:3b8d15e52e195813efe5db8cec156eebe339aaf84222f4f4f051a6c01f237ed7", size = 60539 },
+    { url = "https://files.pythonhosted.org/packages/82/dd/e5176e4b241c9f528402cebb238a36785a628179d7d8b71091154b3e4c9e/wrapt-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:08ffa54146a7559f5b8df4b289b46d963a8e74ed16ba3687f99896101a3990c5", size = 63969 },
+    { url = "https://files.pythonhosted.org/packages/5c/99/79f17046cf67e4a95b9987ea129632ba8bcec0bc81f3fb3d19bdb0bd60cd/wrapt-2.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:72aaa9d0d8e4ed0e2e98019cea47a21f823c9dd4b43c7b77bba6679ffcca6a00", size = 60554 },
+    { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993 },
+]
+
+[[package]]
+name = "xxhash"
+version = "3.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/24/2f/e183a1b407002f5af81822bee18b61cdb94b8670208ef34734d8d2b8ebe9/xxhash-3.7.0.tar.gz", hash = "sha256:6cc4eefbb542a5d6ffd6d70ea9c502957c925e800f998c5630ecc809d6702bae", size = 82022 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/8a/51a14cdef4728c6c2337db8a7d8704422cc65676d9199d77215464c880af/xxhash-3.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:082c87bfdd2b9f457606c7a4a53457f4c4b48b0cdc48de0277f4349d79bb3d7a", size = 33357 },
+    { url = "https://files.pythonhosted.org/packages/b9/1b/0c2c933809421ffd9bf42b59315552c143c755db5d9a816b2f1ae273e884/xxhash-3.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5e7ce913b61f35b0c1c839a49ac9c8e75dd8d860150688aed353b0ce1bf409d8", size = 30869 },
+    { url = "https://files.pythonhosted.org/packages/03/a8/89d5fdd6ee12d70ba99451de46dd0e8010167468dcd913ec855653f4dd50/xxhash-3.7.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3beb1de3b1e9694fcdd853e570ee64c631c7062435d2f8c69c1adf809bc086f0", size = 194100 },
+    { url = "https://files.pythonhosted.org/packages/87/ee/2f9f2ed993e77206d1e66991290a1ebe22e843351ca3ebec8e49e01ba186/xxhash-3.7.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f3e7b689c3bce16699efcf736066f5c6cc4472c3840fe4b22bd8279daf4abdac", size = 212977 },
+    { url = "https://files.pythonhosted.org/packages/de/60/5a91644615a9e9d4e42c2e9925f1908e3a24e4e691d9de7340d565bea024/xxhash-3.7.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a6545e6b409e3d5cbafc850fb84c55a1ca26ed15a6b11e3bf07a0e0cd84517c8", size = 236373 },
+    { url = "https://files.pythonhosted.org/packages/22/c0/f3a9384eaaed9d14d4d062a5d953aa0da489bfe9747877aa994caa87cd0b/xxhash-3.7.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:31ab1461c77a11461d703c88eb949e132a1c6515933cf675d97ec680f4bd18de", size = 212229 },
+    { url = "https://files.pythonhosted.org/packages/2e/67/02f07a9fd79726804190f2172c4894c3ed9a4ebccaca05653c84beb58025/xxhash-3.7.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7c4d596b7676f811172687ec567cbafb9e4dea2f9be1bbb4f622410cb7f40f40", size = 445462 },
+    { url = "https://files.pythonhosted.org/packages/40/37/558f5a90c0672fc9b4402dc25d87ac5b7406616e8969430c9ca4e52ee74d/xxhash-3.7.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13805f0461cba0a857924e70ff91ae6d52d2598f79a884e788db80532614a4a1", size = 193932 },
+    { url = "https://files.pythonhosted.org/packages/d5/90/aaa09cd58661d32044dbbad7df55bbe22a623032b810e7ed3b8c569a2a6f/xxhash-3.7.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d398f372496152f1c6933a33566373f8d1b37b98b8c9d608fa6edc0976f23b2", size = 284807 },
+    { url = "https://files.pythonhosted.org/packages/d6/f3/53df3719ab127a02c174f0c1c74924fcd110866e89c966bc7909cfa8fa84/xxhash-3.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d610aa62cdb7d4d497740741772a24a794903bf3e79eaa51d2e800082abe11e5", size = 210445 },
+    { url = "https://files.pythonhosted.org/packages/72/33/d219975c0e8b6fa2eb9ccd486fe47e21bf1847985b878dd2fbc3126e0d5c/xxhash-3.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:073c23900a9fbf3d26616c17c830db28af9803677cd5b33aea3224d824111514", size = 241273 },
+    { url = "https://files.pythonhosted.org/packages/3e/50/49b1afe610eb3964cedcb90a4d4c3d46a261ee8669cbd4f060652619ae3c/xxhash-3.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:418a463c3e6a590c0cdc890f8be19adb44a8c8acd175ca5b2a6de77e61d0b386", size = 197950 },
+    { url = "https://files.pythonhosted.org/packages/c6/75/5f42a1a4c78717d906a4b6a140c6dbf837ab1f547a54d23c4e2903310936/xxhash-3.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:03f8ff4474ee61c845758ce00711d7087a770d77efb36f7e74a6e867301000b8", size = 210709 },
+    { url = "https://files.pythonhosted.org/packages/8a/85/237e446c25abced71e9c53d269f2cef5bab8a82b3f88a12e00c5368e7368/xxhash-3.7.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:44fba4a5f1d179b7ddc7b3dc40f56f9209046421679b57025d4d8821b376fd8d", size = 275345 },
+    { url = "https://files.pythonhosted.org/packages/62/34/c2c26c0a6a9cc739bc2a5f0ae03ba8b87deb12b8bce35f7ac495e790dc6d/xxhash-3.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31e3516a0f829d06ded4a2c0f3c7c5561993256bfa1c493975fb9dc7bfa828a1", size = 414056 },
+    { url = "https://files.pythonhosted.org/packages/a0/aa/5c58e9bc8071b8afd8dcf297ff362f723c4892168faba149f19904132bf4/xxhash-3.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b59ee2ac81de57771a09ecad09191e840a1d2fae1ef684208320591055768f83", size = 191485 },
+    { url = "https://files.pythonhosted.org/packages/d4/69/a929cf9d1e2e65a48b818cdce72cb6b69eab2e6877f21436d0a1942aff43/xxhash-3.7.0-cp312-cp312-win32.whl", hash = "sha256:74bbd92f8c7fcc397ba0a11bfdc106bc72ad7f11e3a60277753f87e7532b4d81", size = 30671 },
+    { url = "https://files.pythonhosted.org/packages/b9/1b/104b41a8947f4e1d4a66ce1e628eea752f37d1890bfd7453559ca7a3d950/xxhash-3.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:7bd7bc82dd4f185f28f35193c2e968ef46131628e3cac62f639dadf321cba4d1", size = 31514 },
+    { url = "https://files.pythonhosted.org/packages/98/a0/1fd0ea1f1b886d9e7c73f0397571e22333a7d79e31da6d7127c2a4a71d75/xxhash-3.7.0-cp312-cp312-win_arm64.whl", hash = "sha256:7d7148180ec99ba36585b42c8c5de25e9b40191613bc4be68909b4d25a77a852", size = 27761 },
+    { url = "https://files.pythonhosted.org/packages/c1/ca/d5174b4c36d10f64d4ca7050563138c5a599efb01a765858ddefc9c1202a/xxhash-3.7.0-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:4b6d6b33f141158692bd4eafbb96edbc5aa0dabdb593a962db01a91983d4f8fa", size = 36813 },
+    { url = "https://files.pythonhosted.org/packages/41/d0/abc6c9d347ba1f1e1e1d98125d0881a0452c7f9a76a9dd03a7b5d2197f23/xxhash-3.7.0-cp313-cp313-android_21_x86_64.whl", hash = "sha256:845d347df254d6c619f616afa921331bada8614b8d373d58725c663ba97c3605", size = 35121 },
+    { url = "https://files.pythonhosted.org/packages/bf/11/4cc834eb3d79f2f2b3a6ef7324195208bcdfbdcf7534d2b17267aa5f3a8f/xxhash-3.7.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:fddbbb69a6fff4f421e7a0d1fa28f894b20112e9e3fab306af451e2dfd0e459b", size = 29624 },
+    { url = "https://files.pythonhosted.org/packages/23/83/e97d3e7b635fe73a1dfb1e91f805324dd6d930bb42041cbf18f183bc0b6d/xxhash-3.7.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:54876a4e45101cec2bf8f31a973cda073a23e2e108538dad224ba07f85f22487", size = 30638 },
+    { url = "https://files.pythonhosted.org/packages/f4/40/d84951d80c35db1f4c40a29a64a8520eea5d56e764c603906b4fe763580f/xxhash-3.7.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:0c72fe9c7e3d6dfd7f1e21e224a877917fa09c465694ba4e06464b9511b65544", size = 33323 },
+    { url = "https://files.pythonhosted.org/packages/89/cc/c7dc6558d97e9ab023f663d69ab28b340ed9bf4d2d94f2c259cf896bb354/xxhash-3.7.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a6d73a830b17ef49bc04e00182bd839164c1b3c59c127cd7c54fcb10c7ed8ee8", size = 33362 },
+    { url = "https://files.pythonhosted.org/packages/2a/6e/46b84017b1301d54091430353d4ad5901654a3e0871649877a416f7f1644/xxhash-3.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:91c3b07cf3362086d8f126c6aecd8e5e9396ad8b2f2219ea7e49a8250c318acd", size = 30874 },
+    { url = "https://files.pythonhosted.org/packages/df/5e/8f9158e3ab906ad3fec51e09b5ea0093e769f12207bfa42a368ca204e7ab/xxhash-3.7.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:50e879ebbac351c81565ca108db766d7832f5b8b6a5b14b8c0151f7190028e3d", size = 194185 },
+    { url = "https://files.pythonhosted.org/packages/f3/29/a804ded9f5d3d3758292678d23e7528b08fda7b7e750688d08b052322475/xxhash-3.7.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:921c14e93817842dd0dd9f372890a0f0c72e534650b6ab13c5be5cd0db11d47e", size = 213033 },
+    { url = "https://files.pythonhosted.org/packages/8b/91/1ce5a7d2fdc975267320e2c78fc1cecfe7ab735ccbcf6993ec5dd541cb2c/xxhash-3.7.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e64a7c9d7dfca3e0fafcbc5e455519090706a3e36e95d655cec3e04e79f95aaa", size = 236140 },
+    { url = "https://files.pythonhosted.org/packages/34/04/fd595a4fd8617b05fa27bd9b684ecb4985bfed27917848eea85d54036d06/xxhash-3.7.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2220af08163baf5fa36c2b8af079dc2cbe6e66ae061385267f9472362dfd53c6", size = 212291 },
+    { url = "https://files.pythonhosted.org/packages/03/fb/f1a379cbc372ae5b9f4ab36154c48a849ca6ebe3ac477067a57865bf3bc6/xxhash-3.7.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f14bb8b22a4a91325813e3d553b8963c10cf8c756cff65ee50c194431296c655", size = 445532 },
+    { url = "https://files.pythonhosted.org/packages/65/59/172424b79f8cfd4b6d8a122b2193e6b8ad4b11f7159bb3b6f9b3191329bb/xxhash-3.7.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:496736f86a9bedaf64b0dc70e3539d0766df01c71ea22032698e88f3f04a1ce9", size = 193990 },
+    { url = "https://files.pythonhosted.org/packages/b9/19/aeac22161d953f139f07ba5586cb4a17c5b7b6dff985122803bb12933500/xxhash-3.7.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0ff71596bd79816975b3de7130ab1ff4541410285a3c084584eeb1c8239996fd", size = 284876 },
+    { url = "https://files.pythonhosted.org/packages/77/d5/4fd0b59e7a02242953da05ff679fbb961b0a4368eac97a217e11dae110c1/xxhash-3.7.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1ad86695c19b1d46fe106925db3c7a37f16be37669dcf58dcc70a9dd6e324676", size = 210495 },
+    { url = "https://files.pythonhosted.org/packages/aa/fb/976a3165c728c7faf74aa1b5ab3cf6a85e6d731612894741840524c7d28c/xxhash-3.7.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:970f9f8c50961d639cbd0d988c96f80ddf66006de93641719282c4fe7a87c5e6", size = 241331 },
+    { url = "https://files.pythonhosted.org/packages/4a/2c/6763d5901d53ac9e6ba296e5717ae599025c9d268396e8faa8b4b0a8e0ac/xxhash-3.7.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5886ad85e9e347911783760a1d16cb6b393e8f9e3b52c982568226cb56927bdc", size = 198037 },
+    { url = "https://files.pythonhosted.org/packages/61/2b/876e722d533833f5f9a83473e6ba993e48745701096944e77bbecf29b2c3/xxhash-3.7.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6e934bbae1e0ec74e27d5f0d7f37ef547ce5ff9f0a7e63fb39e559fc99526734", size = 210744 },
+    { url = "https://files.pythonhosted.org/packages/21/e6/d7e7baef7ce24166b4668d3c48557bb35a23b92ecadcac7e7718d099ab69/xxhash-3.7.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:3b6b3d28228af044ebcded71c4a3dd86e1dbd7e2f4645bf40f7b5da65bb5fb5a", size = 275406 },
+    { url = "https://files.pythonhosted.org/packages/92/fe/198b3763b2e01ca908f2154969a2352ec99bda892b574a11a9a151c5ede4/xxhash-3.7.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:6be4d70d9ab76c9f324ead9c01af6ff52c324745ea0c3731682a0cf99720f1fe", size = 414125 },
+    { url = "https://files.pythonhosted.org/packages/3a/6d/019a11affd5a5499137cacca53808659964785439855b5aa40dfd3412916/xxhash-3.7.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:151d7520838d4465461a0b7f4ae488b3b00de16183dd3214c1a6b14bf89d7fb6", size = 191555 },
+    { url = "https://files.pythonhosted.org/packages/76/21/b96d58568df2d01533244c3e0e5cbdd0c8b2b25c4bec4d72f19259a292d7/xxhash-3.7.0-cp313-cp313-win32.whl", hash = "sha256:d798c1e291bffb8e37b5bbe0dda77fc767cd19e89cadaf66e6ed5d0ff88c9fe6", size = 30668 },
+    { url = "https://files.pythonhosted.org/packages/99/57/d849a8d3afa1f8f4bc6a831cd89f49f9706fbbad94d2975d6140a171988c/xxhash-3.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:875811ba23c543b1a1c3143c926e43996eb27ebb8f52d3500744aa608c275aed", size = 31524 },
+    { url = "https://files.pythonhosted.org/packages/81/52/bacc753e92dee78b058af8dcef0a50815f5f860986c664a92d75f965b6a5/xxhash-3.7.0-cp313-cp313-win_arm64.whl", hash = "sha256:54a675cb300dda83d71daae2a599389d22db8021a0f8db0dd659e14626eb3ecc", size = 27768 },
+    { url = "https://files.pythonhosted.org/packages/1c/47/ddbd683b7fc7e592c1a8d9d65f73ce9ab513f082b3967eee2baf549b8fc6/xxhash-3.7.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a3b19a42111c4057c1547a4a1396a53961dca576a0f6b82bfa88a2d1561764b2", size = 33576 },
+    { url = "https://files.pythonhosted.org/packages/07/f2/36d3310161db7f72efb4562aadde0ed429f1d0531782dd6345b12d2da527/xxhash-3.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8f4608a06e4d61b7a3425665a46d00e0579122e1a2fae97a0c52953a3aad9aa3", size = 31123 },
+    { url = "https://files.pythonhosted.org/packages/0d/3f/75937a5c69556ed213021e43cbedd84c8e0279d0d74e7d41a255d84ba4b1/xxhash-3.7.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ad37c7792479e49cf96c1ab25517d7003fe0d93687a772ba19a097d235bbe41e", size = 196491 },
+    { url = "https://files.pythonhosted.org/packages/22/29/f10d7ff8c7a733d4403a43b9de18c8fabc005f98cec054644f04418659ee/xxhash-3.7.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc026e3b89d98e30a8288c95cb696e77d150b3f0fb7a51f73dcd49ee6b5577fa", size = 215793 },
+    { url = "https://files.pythonhosted.org/packages/8b/fd/778f60aa295f58907938f030a8b514611f391405614a525cccd2ffc00eb5/xxhash-3.7.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c9b31ab1f28b078a6a1ac1a54eb35e7d5390deddd56870d0be3a0a733d1c321c", size = 237993 },
+    { url = "https://files.pythonhosted.org/packages/70/f5/736db5de387b4a540e37a05b84b40dc58a1ce974bfd2b4e5754ce29b68c3/xxhash-3.7.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3bb5fd680c038fd5229e44e9c493782f90df9bef632fd0499d442374688ff70b", size = 214887 },
+    { url = "https://files.pythonhosted.org/packages/4d/aa/09a095f22fdb9a27fbb716841fbff52119721f9ca4261952d07a912f7839/xxhash-3.7.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:030c0fd688fce3569fbb49a2feefd4110cbb0b650186fb4610759ecfac677548", size = 448407 },
+    { url = "https://files.pythonhosted.org/packages/74/8a/b745efeeca9e34a91c26fdc97ad8514c43d5a81ac78565cba80a1353870a/xxhash-3.7.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5b1bde10324f4c31812ae0d0502e92d916ae8917cad7209353f122b8b8f610c3", size = 196119 },
+    { url = "https://files.pythonhosted.org/packages/8a/5c/0cfceb024af90c191f665c7933b1f318ee234f4797858383bebd1881d52f/xxhash-3.7.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:503722d52a615f2604f5e7611de7d43878df010dc0053094ef91cb9a9ac3d987", size = 286751 },
+    { url = "https://files.pythonhosted.org/packages/0b/0a/0793e405dc3cf8f4ebe2c1acec1e4e4608cd9e7e50ea691dabbc2a95ccbb/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c72500a3b6d6c30ebfc135035bcace9eb5884f2dc220804efcaaba43e9f611dd", size = 212961 },
+    { url = "https://files.pythonhosted.org/packages/0c/7e/721118ffc63bfff94aa565bcf2555a820f9f4bdb0f001e0d609bdfad70de/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:43475925a766d01ca8cd9a857fd87f3d50406983c8506a4c07c4df12adcc867f", size = 243703 },
+    { url = "https://files.pythonhosted.org/packages/6e/18/16f6267160488b8276fd3d449d425712512add292ba545c1b6946bfdb7dd/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8d09dfd2ab135b985daf868b594315ebe11ad86cd9fea46e6c69f19b28f7d25a", size = 200894 },
+    { url = "https://files.pythonhosted.org/packages/2d/94/80ba841287fd97e3e9cac1d228788c8ef623746f570404961eec748ecb5c/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c50269d0055ac1faecfd559886d2cbe4b730de236585aba0e873f9d9dadbe585", size = 213357 },
+    { url = "https://files.pythonhosted.org/packages/a1/7e/106d4067130c59f1e18a55ffadcd876d8c68534883a1e02685b29d3d8153/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:1910df4756a5ab58cfad8744fc2d0f23926e3efcc346ee76e87b974abab922f4", size = 277600 },
+    { url = "https://files.pythonhosted.org/packages/c5/86/a081dd30da71d720b2612a792bfd55e45fa9a07ac76a0507f60487473c25/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d006faf3b491957efcb433489be3c149efe4787b7063d5cddb8ddaefdc60e0c1", size = 416980 },
+    { url = "https://files.pythonhosted.org/packages/35/29/1a95221a029a3c1293773869e1ab47b07cbbdd82444a42809e8c60156626/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:abb65b4e947e958f7b3b0d71db3ce447d1bc5f37f5eab871ce7223bda8768a04", size = 193840 },
+    { url = "https://files.pythonhosted.org/packages/c5/e0/db909dd0823285de2286f67e10ee4d81e96ad35d7d8e964ecb07fccd8af9/xxhash-3.7.0-cp313-cp313t-win32.whl", hash = "sha256:178959906cb1716a1ce08e0d69c82886c70a15a6f2790fc084fdd146ca30cd49", size = 30966 },
+    { url = "https://files.pythonhosted.org/packages/7b/ff/d705b15b22f21ee106adce239cb65d35067a158c630b240270f09b17c2e6/xxhash-3.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2524a1e20d4c231d13b50f7cf39e44265b055669a64a7a4b9a2a44faa03f19b6", size = 31784 },
+    { url = "https://files.pythonhosted.org/packages/a2/1f/b2cf83c3638fd0588e0b17f22e5a9400bdfb1a3e3755324ac0aee2250b88/xxhash-3.7.0-cp313-cp313t-win_arm64.whl", hash = "sha256:37d994d0ffe81ef087bb330d392caa809bb5853c77e22ea3f71db024a0543dba", size = 27932 },
+    { url = "https://files.pythonhosted.org/packages/0e/cc/431db584f6fbb9312e40a173af027644e5580d39df1f73603cbb9dca4d6b/xxhash-3.7.0-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:8c5fcfd806c335bfa2adf1cd0b3110a44fc7b6995c3a648c27489bae85801465", size = 36644 },
+    { url = "https://files.pythonhosted.org/packages/bc/01/255ec513e0a705d1f9a61413e78dfce4e3235203f0ed525a24c2b4b56345/xxhash-3.7.0-cp314-cp314-android_24_x86_64.whl", hash = "sha256:506a0b488f190f0a06769575e30caf71615c898ed93ab18b0dbcb6dec5c3713c", size = 35003 },
+    { url = "https://files.pythonhosted.org/packages/68/70/c55fc33c93445b44d8fc5a17b41ed99e3cebe92bcf8396809e63fc9a1165/xxhash-3.7.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:ec68dbba21532c0173a9872298e65c89749f7c9d21538c3a78b5bb6105871568", size = 29655 },
+    { url = "https://files.pythonhosted.org/packages/c2/72/ff8de73df000d74467d12a59ce6d6e2b2a368b978d41ab7b1fba5ed442be/xxhash-3.7.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:fa77e7ec1450d415d20129961814787c9abd9a07f98872f070b1fe96c5084611", size = 30664 },
+    { url = "https://files.pythonhosted.org/packages/b6/91/08416d9bd9bc3bf39d831abe8a5631ac2db5141dfd6fe81c3fe59a1f9264/xxhash-3.7.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fe32736295ea38e43e7d9424053c8c47c9f64fecfc7c895fb3da9b30b131c9ee", size = 33317 },
+    { url = "https://files.pythonhosted.org/packages/0e/3b/86b1caa4dee10a99f4bf9521e623359341c5e50d05158fa10c275b2bd079/xxhash-3.7.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:ab9dd2c83c4bbd63e422181a76f13502d049d3ddcac9a1bdc29196263d692bb8", size = 33457 },
+    { url = "https://files.pythonhosted.org/packages/ed/38/98ea14ad1517e1461292a65906951458d520689782bfbae111050145bdba/xxhash-3.7.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3afec3a336a2286601a437cb07562ab0227685e6fbb9ec17e8c18457ff348ecf", size = 30894 },
+    { url = "https://files.pythonhosted.org/packages/61/a2/074654d0b893606541199993c7db70067d9fc63b748e0d60020a52a1bd36/xxhash-3.7.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:565df64437a9390f84465dcca33e7377114c7ede8d05cd2cf20081f831ea788e", size = 194409 },
+    { url = "https://files.pythonhosted.org/packages/e2/26/6d2a1afc468189f77ca28c32e1c83e1b9da1178231e05641dbc1b350e332/xxhash-3.7.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:12eca820a5d558633d423bf8bb78ce72a55394823f64089247f788a7e0ae691e", size = 213135 },
+    { url = "https://files.pythonhosted.org/packages/8e/0e/d8aecf95e09c42547453137be74d2f7b8b14e08f5177fa2fab6144a19061/xxhash-3.7.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f262b8f7599516567e070abf607b9af649052b2c4bd6f9be02b0cb41b7024805", size = 236379 },
+    { url = "https://files.pythonhosted.org/packages/f2/74/8140e8210536b3dd0cc816c4faaeb5ba6e63e8125ab25af4bcddd6a037b3/xxhash-3.7.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f1598916cb197681e03e601901e4ab96a9a963de398c59d0964f8a6f44a2b361", size = 212447 },
+    { url = "https://files.pythonhosted.org/packages/a0/d2/462001d2903b4bee5a5689598a0a55e5e7cd1ac7f4247a5545cff10d3ebb/xxhash-3.7.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:322b2f0622230f526aeb1738149948a7ae357a9e2ceb1383c6fd1fdaecdafa16", size = 445660 },
+    { url = "https://files.pythonhosted.org/packages/23/09/2bd1ed7f8689b20e51727952cac8329d50c694dc32b2eba06ba5bc742b37/xxhash-3.7.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:24cc22070880cc57b830a65cde4e65fa884c6d9b28ae4803b5ee05911e7bafba", size = 194076 },
+    { url = "https://files.pythonhosted.org/packages/c9/6e/692302cd0a5f4ac4e6289f37fa888dc2e1e07750b68fe3e4bfe939b8cea3/xxhash-3.7.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb5a888a968b2434abf9ecda357b5d43f10d7b5a6da6fdbbe036208473aff0e2", size = 284990 },
+    { url = "https://files.pythonhosted.org/packages/05/d9/e54b159b3d9df7999d2a7c676ce7b323d1b5588a64f8f51ed8172567bd87/xxhash-3.7.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a999771ff97bec27d18341be4f3a36b163bb1ac41ec17bef6d2dabd84acd33c7", size = 210590 },
+    { url = "https://files.pythonhosted.org/packages/50/93/0e0df1a3a196ced4ca71de76d65ead25d8e87bbfb87b64306ea47a40c00d/xxhash-3.7.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:ed4a6efe2dee1655adb73e7ad40c6aa955a6892422b1e3b95de6a34de56e3cbb", size = 241442 },
+    { url = "https://files.pythonhosted.org/packages/9a/a9/d917a7a814e90b218f8a0d37967105eea91bf752c3303683c99a1f7bfc1f/xxhash-3.7.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9fd17f14ac0faa12126c2f9ca774a8cf342957265ec3c8669c144e5e6cdb478c", size = 198356 },
+    { url = "https://files.pythonhosted.org/packages/89/5e/f2ba1877c39469abbefc72991d6ebdcbd4c0880db01ae8cb1f553b0c537d/xxhash-3.7.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:05fd1254268c59b5cb2a029dfc204275e9fc52de2913f1e53aa8d01442c96b4d", size = 210898 },
+    { url = "https://files.pythonhosted.org/packages/90/c6/be56b58e73de531f39a10de1355bb77ceb663900dc4bf2d6d3002a9c3f9e/xxhash-3.7.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:a2eae53197c6276d5b317f75a1be226bbf440c20b58bf525f36b5d0e1f657ca6", size = 275519 },
+    { url = "https://files.pythonhosted.org/packages/92/e2/17ddc85d5765b9c709f192009ed8f5a1fc876f4eb35bba7c307b5b1169f9/xxhash-3.7.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:bfe6f92e3522dcbe8c4281efd74fa7542a336cb00b0e3272c4ec0edabeaeaf67", size = 414191 },
+    { url = "https://files.pythonhosted.org/packages/9c/42/85f5b79f4bf1ec7ba052491164adfd4f4e9519f5dc7246de4fbd64a1bd56/xxhash-3.7.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7ab9a49c410d8c6c786ab99e79c529938d894c01433130353dd0fe999111077a", size = 191604 },
+    { url = "https://files.pythonhosted.org/packages/b8/d0/6127b623aa4cca18d8b7743592b048d689fd6c6e37ff26a22cddf6cd9d7f/xxhash-3.7.0-cp314-cp314-win32.whl", hash = "sha256:040ea63668f9185b92bc74942df09c7e65703deed71431333678fc6e739a9955", size = 31271 },
+    { url = "https://files.pythonhosted.org/packages/64/4f/44fc4788568004c43921701cbc127f48218a1eede2c9aea231115323564d/xxhash-3.7.0-cp314-cp314-win_amd64.whl", hash = "sha256:2a61e2a3fb23c892496d587b470dee7fa1b58b248a187719c65ea8e94ec13257", size = 32284 },
+    { url = "https://files.pythonhosted.org/packages/6d/77/18bb895eb60a49453d16e17d67990e5caff557c78eafc90ad4e2eabf4570/xxhash-3.7.0-cp314-cp314-win_arm64.whl", hash = "sha256:c7741c7524961d8c0cb4d4c21b28957ff731a3fd5b5cd8b856dc80a40e9e5acc", size = 28701 },
+    { url = "https://files.pythonhosted.org/packages/45/a0/46f72244570c550fbbb7db1ef554183dd5ebe9136385f30e032b781ae8f6/xxhash-3.7.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:fc84bf7aa7592f31ec63a3e7b11d624f468a3f19f5238cec7282a42e838ab1d7", size = 33646 },
+    { url = "https://files.pythonhosted.org/packages/4a/3a/453846a7eceea11e75def361eed01ec6a0205b9822c19927ed364ccae7cc/xxhash-3.7.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9f1563fdc8abfc389748e6932c7e4e99c89a53e4ec37d4563c24fc06f5e5644b", size = 31125 },
+    { url = "https://files.pythonhosted.org/packages/bd/3e/49434aba738885d512f9e486db1bdd19db28dfa40372b56da26ef7a4e738/xxhash-3.7.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2d415f18becf6f153046ab6adc97da77e3643a0ee205dae61c4012604113a020", size = 196633 },
+    { url = "https://files.pythonhosted.org/packages/a4/e9/006cb6127baeb9f8abe6d15e62faa01349f09b34e2bfd65175b2422d026b/xxhash-3.7.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bb16aa13ed175bc9be5c2491ba031b85a9b51c4ed90e0b3d4ebe63cf3fb54f8e", size = 215899 },
+    { url = "https://files.pythonhosted.org/packages/27/e4/cc57d72e66df0ae29b914335f1c6dcf61e8f3746ddf0ae3c471aa4f15e00/xxhash-3.7.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f9fd595f1e5941b3d7863e4774e4b30caa6731fc34b9277da032295aa5656ee5", size = 238116 },
+    { url = "https://files.pythonhosted.org/packages/af/78/3531d4a3fd8a0038cc6be1f265a69c1b3587f557a10b677dd736de2202c1/xxhash-3.7.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1295325c5a98d552333fa53dc2b026b0ef0ec9c8e73ca3a952990b4c7d65d459", size = 215012 },
+    { url = "https://files.pythonhosted.org/packages/b4/f6/259fb1eaaec921f59b17203b0daee69829761226d3b980d5191d7723dd83/xxhash-3.7.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3573a651d146912da9daa9e29e5fbc45994420daaa9ef1e2fa5823e1dc485513", size = 448534 },
+    { url = "https://files.pythonhosted.org/packages/7b/16/a66d0eaf6a7e68532c07714361ddc904c663ec940f3b028c1ae4a21a7b9d/xxhash-3.7.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ec1e080a3d02d94ea9335bfab0e3374b877e25411422c18f51a943fa4b46381", size = 196217 },
+    { url = "https://files.pythonhosted.org/packages/8d/ef/d2efc7fc51756dc52509109d1a25cefc859d74bc4b19a167b12dbd8c2786/xxhash-3.7.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84415265192072d8638a3afc3c1bc5995e310570cd9acb54dc46d3939e364fe0", size = 286906 },
+    { url = "https://files.pythonhosted.org/packages/fc/67/25decd1d4a4018582ec4db2a868a2b7e40640f4adb20dfeb19ac923aa825/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d4dea659b57443989ef32f4295104fd6912c73d0bf26d1d148bb88a9f159b02", size = 213057 },
+    { url = "https://files.pythonhosted.org/packages/0d/5d/17651eb29d06786cdc40c60ae3d27d645aa5d61d2eca6237a7ba0b94789b/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:05ece0fe4d9c9c2728912d1981ae1566cfc83a011571b24732cbf76e1fb70dca", size = 243886 },
+    { url = "https://files.pythonhosted.org/packages/8a/d4/174d9cf7502243d586e6a9ae842b1ae23026620995114f85f1380e588bc9/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:fd880353cf1ffaf321bc18dd663e111976dbd0d3bbd8a66d58d2b470dfa7f396", size = 201015 },
+    { url = "https://files.pythonhosted.org/packages/91/8c/2254e2d06c3ac5e6fe22eaf3da791b87ea823ae9f2c17b4af66755c5752d/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:4e15cc9e2817f6481160f930c62842b3ff419e20e13072bcbab12230943092bc", size = 213457 },
+    { url = "https://files.pythonhosted.org/packages/79/a2/e3daa762545921173e3360f3b4ff7fc63c2d27359f7230ec1a7a74e117f6/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:90b9d1a8bd37d768ffc92a1f651ec69afc532a96fa1ac2ea7abbed5d630b3237", size = 277738 },
+    { url = "https://files.pythonhosted.org/packages/e1/4c/e186da2c46b87f5204640e008d42730bf3c1ee9f0efb71ae1ebcdfeac681/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:157c49475b34ecea8809e51123d9769a534e139d1247942f7a4bc67710bb2533", size = 417127 },
+    { url = "https://files.pythonhosted.org/packages/17/28/3798e15007a3712d0da3d3fe70f8e11916569858b5cc371053bc26270832/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5a6ddec83325685e729ca119d1f5c518ec39294212ecd770e60693cdc5f7eb79", size = 193962 },
+    { url = "https://files.pythonhosted.org/packages/ad/95/a26baa93b5241fd7630998816a4ec47a5a0bad193b3f8fc8f3593e1a4a67/xxhash-3.7.0-cp314-cp314t-win32.whl", hash = "sha256:a04a6cab47e2166435aaf5b9e5ee41d1532cc8300efdef87f2a4d0acb7db19ed", size = 31643 },
+    { url = "https://files.pythonhosted.org/packages/44/36/5454f13c447e395f9b06a3e91274c59f503d31fad84e1836efe3bdb71f6a/xxhash-3.7.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8653dd7c2eda020545bb2c71c7f7039b53fe7434d0fc1a0a9deb79ab3f1a4fc1", size = 32522 },
+    { url = "https://files.pythonhosted.org/packages/74/35/698e7e3ff38e22992ea24870a511d8762474fb6783627a2910ff22a185c2/xxhash-3.7.0-cp314-cp314t-win_arm64.whl", hash = "sha256:468f0fc114faaa4b36699f8e328bbc3bb11dc418ba94ac52c26dd736d4b6c637", size = 28807 },
+]
+
+[[package]]
+name = "yarl"
+version = "1.23.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "idna" },
+    { name = "multidict" },
+    { name = "propcache" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737 },
+    { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029 },
+    { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310 },
+    { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587 },
+    { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528 },
+    { url = "https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339 },
+    { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061 },
+    { url = "https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132 },
+    { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289 },
+    { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950 },
+    { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960 },
+    { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703 },
+    { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325 },
+    { url = "https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067 },
+    { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285 },
+    { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359 },
+    { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674 },
+    { url = "https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879 },
+    { url = "https://files.pythonhosted.org/packages/9a/4b/a0a6e5d0ee8a2f3a373ddef8a4097d74ac901ac363eea1440464ccbe0898/yarl-1.23.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16c6994ac35c3e74fb0ae93323bf8b9c2a9088d55946109489667c510a7d010e", size = 123796 },
+    { url = "https://files.pythonhosted.org/packages/67/b6/8925d68af039b835ae876db5838e82e76ec87b9782ecc97e192b809c4831/yarl-1.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a42e651629dafb64fd5b0286a3580613702b5809ad3f24934ea87595804f2c5", size = 86547 },
+    { url = "https://files.pythonhosted.org/packages/ae/50/06d511cc4b8e0360d3c94af051a768e84b755c5eb031b12adaaab6dec6e5/yarl-1.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7c6b9461a2a8b47c65eef63bb1c76a4f1c119618ffa99ea79bc5bb1e46c5821b", size = 85854 },
+    { url = "https://files.pythonhosted.org/packages/c4/f4/4e30b250927ffdab4db70da08b9b8d2194d7c7b400167b8fbeca1e4701ca/yarl-1.23.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2569b67d616eab450d262ca7cb9f9e19d2f718c70a8b88712859359d0ab17035", size = 98351 },
+    { url = "https://files.pythonhosted.org/packages/86/fc/4118c5671ea948208bdb1492d8b76bdf1453d3e73df051f939f563e7dcc5/yarl-1.23.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e9d9a4d06d3481eab79803beb4d9bd6f6a8e781ec078ac70d7ef2dcc29d1bea5", size = 92711 },
+    { url = "https://files.pythonhosted.org/packages/56/11/1ed91d42bd9e73c13dc9e7eb0dd92298d75e7ac4dd7f046ad0c472e231cd/yarl-1.23.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f514f6474e04179d3d33175ed3f3e31434d3130d42ec153540d5b157deefd735", size = 106014 },
+    { url = "https://files.pythonhosted.org/packages/ce/c9/74e44e056a23fbc33aca71779ef450ca648a5bc472bdad7a82339918f818/yarl-1.23.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fda207c815b253e34f7e1909840fd14299567b1c0eb4908f8c2ce01a41265401", size = 105557 },
+    { url = "https://files.pythonhosted.org/packages/66/fe/b1e10b08d287f518994f1e2ff9b6d26f0adeecd8dd7d533b01bab29a3eda/yarl-1.23.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34b6cf500e61c90f305094911f9acc9c86da1a05a7a3f5be9f68817043f486e4", size = 101559 },
+    { url = "https://files.pythonhosted.org/packages/72/59/c5b8d94b14e3d3c2a9c20cb100119fd534ab5a14b93673ab4cc4a4141ea5/yarl-1.23.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d7504f2b476d21653e4d143f44a175f7f751cd41233525312696c76aa3dbb23f", size = 100502 },
+    { url = "https://files.pythonhosted.org/packages/77/4f/96976cb54cbfc5c9fd73ed4c51804f92f209481d1fb190981c0f8a07a1d7/yarl-1.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:578110dd426f0d209d1509244e6d4a3f1a3e9077655d98c5f22583d63252a08a", size = 98027 },
+    { url = "https://files.pythonhosted.org/packages/63/6e/904c4f476471afdbad6b7e5b70362fb5810e35cd7466529a97322b6f5556/yarl-1.23.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:609d3614d78d74ebe35f54953c5bbd2ac647a7ddb9c30a5d877580f5e86b22f2", size = 95369 },
+    { url = "https://files.pythonhosted.org/packages/9d/40/acfcdb3b5f9d68ef499e39e04d25e141fe90661f9d54114556cf83be8353/yarl-1.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4966242ec68afc74c122f8459abd597afd7d8a60dc93d695c1334c5fd25f762f", size = 105565 },
+    { url = "https://files.pythonhosted.org/packages/5e/c6/31e28f3a6ba2869c43d124f37ea5260cac9c9281df803c354b31f4dd1f3c/yarl-1.23.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:e0fd068364a6759bc794459f0a735ab151d11304346332489c7972bacbe9e72b", size = 99813 },
+    { url = "https://files.pythonhosted.org/packages/08/1f/6f65f59e72d54aa467119b63fc0b0b1762eff0232db1f4720cd89e2f4a17/yarl-1.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:39004f0ad156da43e86aa71f44e033de68a44e5a31fc53507b36dd253970054a", size = 105632 },
+    { url = "https://files.pythonhosted.org/packages/a3/c4/18b178a69935f9e7a338127d5b77d868fdc0f0e49becd286d51b3a18c61d/yarl-1.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e5723c01a56c5028c807c701aa66722916d2747ad737a046853f6c46f4875543", size = 101895 },
+    { url = "https://files.pythonhosted.org/packages/8f/54/f5b870b5505663911dba950a8e4776a0dbd51c9c54c0ae88e823e4b874a0/yarl-1.23.0-cp313-cp313-win32.whl", hash = "sha256:1b6b572edd95b4fa8df75de10b04bc81acc87c1c7d16bcdd2035b09d30acc957", size = 82356 },
+    { url = "https://files.pythonhosted.org/packages/7a/84/266e8da36879c6edcd37b02b547e2d9ecdfea776be49598e75696e3316e1/yarl-1.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:baaf55442359053c7d62f6f8413a62adba3205119bcb6f49594894d8be47e5e3", size = 87515 },
+    { url = "https://files.pythonhosted.org/packages/00/fd/7e1c66efad35e1649114fa13f17485f62881ad58edeeb7f49f8c5e748bf9/yarl-1.23.0-cp313-cp313-win_arm64.whl", hash = "sha256:fb4948814a2a98e3912505f09c9e7493b1506226afb1f881825368d6fb776ee3", size = 81785 },
+    { url = "https://files.pythonhosted.org/packages/9c/fc/119dd07004f17ea43bb91e3ece6587759edd7519d6b086d16bfbd3319982/yarl-1.23.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:aecfed0b41aa72b7881712c65cf764e39ce2ec352324f5e0837c7048d9e6daaa", size = 130719 },
+    { url = "https://files.pythonhosted.org/packages/e6/0d/9f2348502fbb3af409e8f47730282cd6bc80dec6630c1e06374d882d6eb2/yarl-1.23.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a41bcf68efd19073376eb8cf948b8d9be0af26256403e512bb18f3966f1f9120", size = 89690 },
+    { url = "https://files.pythonhosted.org/packages/50/93/e88f3c80971b42cfc83f50a51b9d165a1dbf154b97005f2994a79f212a07/yarl-1.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cde9a2ecd91668bcb7f077c4966d8ceddb60af01b52e6e3e2680e4cf00ad1a59", size = 89851 },
+    { url = "https://files.pythonhosted.org/packages/1c/07/61c9dd8ba8f86473263b4036f70fb594c09e99c0d9737a799dfd8bc85651/yarl-1.23.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5023346c4ee7992febc0068e7593de5fa2bf611848c08404b35ebbb76b1b0512", size = 95874 },
+    { url = "https://files.pythonhosted.org/packages/9e/e9/f9ff8ceefba599eac6abddcfb0b3bee9b9e636e96dbf54342a8577252379/yarl-1.23.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d1009abedb49ae95b136a8904a3f71b342f849ffeced2d3747bf29caeda218c4", size = 88710 },
+    { url = "https://files.pythonhosted.org/packages/eb/78/0231bfcc5d4c8eec220bc2f9ef82cb4566192ea867a7c5b4148f44f6cbcd/yarl-1.23.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a8d00f29b42f534cc8aa3931cfe773b13b23e561e10d2b26f27a8d309b0e82a1", size = 101033 },
+    { url = "https://files.pythonhosted.org/packages/cd/9b/30ea5239a61786f18fd25797151a17fbb3be176977187a48d541b5447dd4/yarl-1.23.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:95451e6ce06c3e104556d73b559f5da6c34a069b6b62946d3ad66afcd51642ea", size = 100817 },
+    { url = "https://files.pythonhosted.org/packages/62/e2/a4980481071791bc83bce2b7a1a1f7adcabfa366007518b4b845e92eeee3/yarl-1.23.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:531ef597132086b6cf96faa7c6c1dcd0361dd5f1694e5cc30375907b9b7d3ea9", size = 97482 },
+    { url = "https://files.pythonhosted.org/packages/e5/1e/304a00cf5f6100414c4b5a01fc7ff9ee724b62158a08df2f8170dfc72a2d/yarl-1.23.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:88f9fb0116fbfcefcab70f85cf4b74a2b6ce5d199c41345296f49d974ddb4123", size = 95949 },
+    { url = "https://files.pythonhosted.org/packages/68/03/093f4055ed4cae649ac53bca3d180bd37102e9e11d048588e9ab0c0108d0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e7b0460976dc75cb87ad9cc1f9899a4b97751e7d4e77ab840fc9b6d377b8fd24", size = 95839 },
+    { url = "https://files.pythonhosted.org/packages/b9/28/4c75ebb108f322aa8f917ae10a8ffa4f07cae10a8a627b64e578617df6a0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:115136c4a426f9da976187d238e84139ff6b51a20839aa6e3720cd1026d768de", size = 90696 },
+    { url = "https://files.pythonhosted.org/packages/23/9c/42c2e2dd91c1a570402f51bdf066bfdb1241c2240ba001967bad778e77b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ead11956716a940c1abc816b7df3fa2b84d06eaed8832ca32f5c5e058c65506b", size = 100865 },
+    { url = "https://files.pythonhosted.org/packages/74/05/1bcd60a8a0a914d462c305137246b6f9d167628d73568505fce3f1cb2e65/yarl-1.23.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:fe8f8f5e70e6dbdfca9882cd9deaac058729bcf323cf7a58660901e55c9c94f6", size = 96234 },
+    { url = "https://files.pythonhosted.org/packages/90/b2/f52381aac396d6778ce516b7bc149c79e65bfc068b5de2857ab69eeea3b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:a0e317df055958a0c1e79e5d2aa5a5eaa4a6d05a20d4b0c9c3f48918139c9fc6", size = 100295 },
+    { url = "https://files.pythonhosted.org/packages/e5/e8/638bae5bbf1113a659b2435d8895474598afe38b4a837103764f603aba56/yarl-1.23.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f0fd84de0c957b2d280143522c4f91a73aada1923caee763e24a2b3fda9f8a5", size = 97784 },
+    { url = "https://files.pythonhosted.org/packages/80/25/a3892b46182c586c202629fc2159aa13975d3741d52ebd7347fd501d48d5/yarl-1.23.0-cp313-cp313t-win32.whl", hash = "sha256:93a784271881035ab4406a172edb0faecb6e7d00f4b53dc2f55919d6c9688595", size = 88313 },
+    { url = "https://files.pythonhosted.org/packages/43/68/8c5b36aa5178900b37387937bc2c2fe0e9505537f713495472dcf6f6fccc/yarl-1.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dd00607bffbf30250fe108065f07453ec124dbf223420f57f5e749b04295e090", size = 94932 },
+    { url = "https://files.pythonhosted.org/packages/c6/cc/d79ba8292f51f81f4dc533a8ccfb9fc6992cabf0998ed3245de7589dc07c/yarl-1.23.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ac09d42f48f80c9ee1635b2fcaa819496a44502737660d3c0f2ade7526d29144", size = 84786 },
+    { url = "https://files.pythonhosted.org/packages/90/98/b85a038d65d1b92c3903ab89444f48d3cee490a883477b716d7a24b1a78c/yarl-1.23.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:21d1b7305a71a15b4794b5ff22e8eef96ff4a6d7f9657155e5aa419444b28912", size = 124455 },
+    { url = "https://files.pythonhosted.org/packages/39/54/bc2b45559f86543d163b6e294417a107bb87557609007c007ad889afec18/yarl-1.23.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:85610b4f27f69984932a7abbe52703688de3724d9f72bceb1cca667deff27474", size = 86752 },
+    { url = "https://files.pythonhosted.org/packages/24/f9/e8242b68362bffe6fb536c8db5076861466fc780f0f1b479fc4ffbebb128/yarl-1.23.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23f371bd662cf44a7630d4d113101eafc0cfa7518a2760d20760b26021454719", size = 86291 },
+    { url = "https://files.pythonhosted.org/packages/ea/d8/d1cb2378c81dd729e98c716582b1ccb08357e8488e4c24714658cc6630e8/yarl-1.23.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4a80f77dc1acaaa61f0934176fccca7096d9b1ff08c8ba9cddf5ae034a24319", size = 99026 },
+    { url = "https://files.pythonhosted.org/packages/0a/ff/7196790538f31debe3341283b5b0707e7feb947620fc5e8236ef28d44f72/yarl-1.23.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:bd654fad46d8d9e823afbb4f87c79160b5a374ed1ff5bde24e542e6ba8f41434", size = 92355 },
+    { url = "https://files.pythonhosted.org/packages/c1/56/25d58c3eddde825890a5fe6aa1866228377354a3c39262235234ab5f616b/yarl-1.23.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:682bae25f0a0dd23a056739f23a134db9f52a63e2afd6bfb37ddc76292bbd723", size = 106417 },
+    { url = "https://files.pythonhosted.org/packages/51/8a/882c0e7bc8277eb895b31bce0138f51a1ba551fc2e1ec6753ffc1e7c1377/yarl-1.23.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a82836cab5f197a0514235aaf7ffccdc886ccdaa2324bc0aafdd4ae898103039", size = 106422 },
+    { url = "https://files.pythonhosted.org/packages/42/2b/fef67d616931055bf3d6764885990a3ac647d68734a2d6a9e1d13de437a2/yarl-1.23.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c57676bdedc94cd3bc37724cf6f8cd2779f02f6aba48de45feca073e714fe52", size = 101915 },
+    { url = "https://files.pythonhosted.org/packages/18/6a/530e16aebce27c5937920f3431c628a29a4b6b430fab3fd1c117b26ff3f6/yarl-1.23.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c7f8dc16c498ff06497c015642333219871effba93e4a2e8604a06264aca5c5c", size = 100690 },
+    { url = "https://files.pythonhosted.org/packages/88/08/93749219179a45e27b036e03260fda05190b911de8e18225c294ac95bbc9/yarl-1.23.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:5ee586fb17ff8f90c91cf73c6108a434b02d69925f44f5f8e0d7f2f260607eae", size = 98750 },
+    { url = "https://files.pythonhosted.org/packages/d9/cf/ea424a004969f5d81a362110a6ac1496d79efdc6d50c2c4b2e3ea0fc2519/yarl-1.23.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:17235362f580149742739cc3828b80e24029d08cbb9c4bda0242c7b5bc610a8e", size = 94685 },
+    { url = "https://files.pythonhosted.org/packages/e2/b7/14341481fe568e2b0408bcf1484c652accafe06a0ade9387b5d3fd9df446/yarl-1.23.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:0793e2bd0cf14234983bbb371591e6bea9e876ddf6896cdcc93450996b0b5c85", size = 106009 },
+    { url = "https://files.pythonhosted.org/packages/0a/e6/5c744a9b54f4e8007ad35bce96fbc9218338e84812d36f3390cea616881a/yarl-1.23.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:3650dc2480f94f7116c364096bc84b1d602f44224ef7d5c7208425915c0475dd", size = 100033 },
+    { url = "https://files.pythonhosted.org/packages/0c/23/e3bfc188d0b400f025bc49d99793d02c9abe15752138dcc27e4eaf0c4a9e/yarl-1.23.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f40e782d49630ad384db66d4d8b73ff4f1b8955dc12e26b09a3e3af064b3b9d6", size = 106483 },
+    { url = "https://files.pythonhosted.org/packages/72/42/f0505f949a90b3f8b7a363d6cbdf398f6e6c58946d85c6d3a3bc70595b26/yarl-1.23.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:94f8575fbdf81749008d980c17796097e645574a3b8c28ee313931068dad14fe", size = 102175 },
+    { url = "https://files.pythonhosted.org/packages/aa/65/b39290f1d892a9dd671d1c722014ca062a9c35d60885d57e5375db0404b5/yarl-1.23.0-cp314-cp314-win32.whl", hash = "sha256:c8aa34a5c864db1087d911a0b902d60d203ea3607d91f615acd3f3108ac32169", size = 83871 },
+    { url = "https://files.pythonhosted.org/packages/a9/5b/9b92f54c784c26e2a422e55a8d2607ab15b7ea3349e28359282f84f01d43/yarl-1.23.0-cp314-cp314-win_amd64.whl", hash = "sha256:63e92247f383c85ab00dd0091e8c3fa331a96e865459f5ee80353c70a4a42d70", size = 89093 },
+    { url = "https://files.pythonhosted.org/packages/e0/7d/8a84dc9381fd4412d5e7ff04926f9865f6372b4c2fd91e10092e65d29eb8/yarl-1.23.0-cp314-cp314-win_arm64.whl", hash = "sha256:70efd20be968c76ece7baa8dafe04c5be06abc57f754d6f36f3741f7aa7a208e", size = 83384 },
+    { url = "https://files.pythonhosted.org/packages/dd/8d/d2fad34b1c08aa161b74394183daa7d800141aaaee207317e82c790b418d/yarl-1.23.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:9a18d6f9359e45722c064c97464ec883eb0e0366d33eda61cb19a244bf222679", size = 131019 },
+    { url = "https://files.pythonhosted.org/packages/19/ff/33009a39d3ccf4b94d7d7880dfe17fb5816c5a4fe0096d9b56abceea9ac7/yarl-1.23.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:2803ed8b21ca47a43da80a6fd1ed3019d30061f7061daa35ac54f63933409412", size = 89894 },
+    { url = "https://files.pythonhosted.org/packages/0c/f1/dab7ac5e7306fb79c0190766a3c00b4cb8d09a1f390ded68c85a5934faf5/yarl-1.23.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:394906945aa8b19fc14a61cf69743a868bb8c465efe85eee687109cc540b98f4", size = 89979 },
+    { url = "https://files.pythonhosted.org/packages/aa/b1/08e95f3caee1fad6e65017b9f26c1d79877b502622d60e517de01e72f95d/yarl-1.23.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:71d006bee8397a4a89f469b8deb22469fe7508132d3c17fa6ed871e79832691c", size = 95943 },
+    { url = "https://files.pythonhosted.org/packages/c0/cc/6409f9018864a6aa186c61175b977131f373f1988e198e031236916e87e4/yarl-1.23.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:62694e275c93d54f7ccedcfef57d42761b2aad5234b6be1f3e3026cae4001cd4", size = 88786 },
+    { url = "https://files.pythonhosted.org/packages/76/40/cc22d1d7714b717fde2006fad2ced5efe5580606cb059ae42117542122f3/yarl-1.23.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31de1613658308efdb21ada98cbc86a97c181aa050ba22a808120bb5be3ab94", size = 101307 },
+    { url = "https://files.pythonhosted.org/packages/8f/0d/476c38e85ddb4c6ec6b20b815bdd779aa386a013f3d8b85516feee55c8dc/yarl-1.23.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fb1e8b8d66c278b21d13b0a7ca22c41dd757a7c209c6b12c313e445c31dd3b28", size = 100904 },
+    { url = "https://files.pythonhosted.org/packages/72/32/0abe4a76d59adf2081dcb0397168553ece4616ada1c54d1c49d8936c74f8/yarl-1.23.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f9d8d531dfb767c565f348f33dd5139a6c43f5cbdf3f67da40d54241df93f6", size = 97728 },
+    { url = "https://files.pythonhosted.org/packages/b7/35/7b30f4810fba112f60f5a43237545867504e15b1c7647a785fbaf588fac2/yarl-1.23.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:575aa4405a656e61a540f4a80eaa5260f2a38fff7bfdc4b5f611840d76e9e277", size = 95964 },
+    { url = "https://files.pythonhosted.org/packages/2d/86/ed7a73ab85ef00e8bb70b0cb5421d8a2a625b81a333941a469a6f4022828/yarl-1.23.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:041b1a4cefacf65840b4e295c6985f334ba83c30607441ae3cf206a0eed1a2e4", size = 95882 },
+    { url = "https://files.pythonhosted.org/packages/19/90/d56967f61a29d8498efb7afb651e0b2b422a1e9b47b0ab5f4e40a19b699b/yarl-1.23.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:d38c1e8231722c4ce40d7593f28d92b5fc72f3e9774fe73d7e800ec32299f63a", size = 90797 },
+    { url = "https://files.pythonhosted.org/packages/72/00/8b8f76909259f56647adb1011d7ed8b321bcf97e464515c65016a47ecdf0/yarl-1.23.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:d53834e23c015ee83a99377db6e5e37d8484f333edb03bd15b4bc312cc7254fb", size = 101023 },
+    { url = "https://files.pythonhosted.org/packages/ac/e2/cab11b126fb7d440281b7df8e9ddbe4851e70a4dde47a202b6642586b8d9/yarl-1.23.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:2e27c8841126e017dd2a054a95771569e6070b9ee1b133366d8b31beb5018a41", size = 96227 },
+    { url = "https://files.pythonhosted.org/packages/c2/9b/2c893e16bfc50e6b2edf76c1a9eb6cb0c744346197e74c65e99ad8d634d0/yarl-1.23.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:76855800ac56f878847a09ce6dba727c93ca2d89c9e9d63002d26b916810b0a2", size = 100302 },
+    { url = "https://files.pythonhosted.org/packages/28/ec/5498c4e3a6d5f1003beb23405671c2eb9cdbf3067d1c80f15eeafe301010/yarl-1.23.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e09fd068c2e169a7070d83d3bde728a4d48de0549f975290be3c108c02e499b4", size = 98202 },
+    { url = "https://files.pythonhosted.org/packages/fe/c3/cd737e2d45e70717907f83e146f6949f20cc23cd4bf7b2688727763aa458/yarl-1.23.0-cp314-cp314t-win32.whl", hash = "sha256:73309162a6a571d4cbd3b6a1dcc703c7311843ae0d1578df6f09be4e98df38d4", size = 90558 },
+    { url = "https://files.pythonhosted.org/packages/e1/19/3774d162f6732d1cfb0b47b4140a942a35ca82bb19b6db1f80e9e7bdc8f8/yarl-1.23.0-cp314-cp314t-win_amd64.whl", hash = "sha256:4503053d296bc6e4cbd1fad61cf3b6e33b939886c4f249ba7c78b602214fabe2", size = 97610 },
+    { url = "https://files.pythonhosted.org/packages/51/47/3fa2286c3cb162c71cdb34c4224d5745a1ceceb391b2bd9b19b668a8d724/yarl-1.23.0-cp314-cp314t-win_arm64.whl", hash = "sha256:44bb7bef4ea409384e3f8bc36c063d77ea1b8d4a5b2706956c0d6695f07dcc25", size = 86041 },
+    { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288 },
+]
diff --git a/surfsense_obsidian/.editorconfig b/surfsense_obsidian/.editorconfig
new file mode 100644
index 0000000000..81f3ec3548
--- /dev/null
+++ b/surfsense_obsidian/.editorconfig
@@ -0,0 +1,10 @@
+# top-most EditorConfig file
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+indent_style = tab
+indent_size = 4
+tab_width = 4
diff --git a/surfsense_obsidian/.gitignore b/surfsense_obsidian/.gitignore
new file mode 100644
index 0000000000..386ac2bdb9
--- /dev/null
+++ b/surfsense_obsidian/.gitignore
@@ -0,0 +1,22 @@
+# vscode
+.vscode 
+
+# Intellij
+*.iml
+.idea
+
+# npm
+node_modules
+
+# Don't include the compiled main.js file in the repo.
+# They should be uploaded to GitHub releases instead.
+main.js
+
+# Exclude sourcemaps
+*.map
+
+# obsidian
+data.json
+
+# Exclude macOS Finder (System Explorer) View States
+.DS_Store
diff --git a/surfsense_obsidian/.npmrc b/surfsense_obsidian/.npmrc
new file mode 100644
index 0000000000..b9737525fd
--- /dev/null
+++ b/surfsense_obsidian/.npmrc
@@ -0,0 +1 @@
+tag-version-prefix=""
\ No newline at end of file
diff --git a/surfsense_obsidian/AGENTS.md b/surfsense_obsidian/AGENTS.md
new file mode 100644
index 0000000000..3f4274ac69
--- /dev/null
+++ b/surfsense_obsidian/AGENTS.md
@@ -0,0 +1,251 @@
+# Obsidian community plugin
+
+## Project overview
+
+- Target: Obsidian Community Plugin (TypeScript → bundled JavaScript).
+- Entry point: `main.ts` compiled to `main.js` and loaded by Obsidian.
+- Required release artifacts: `main.js`, `manifest.json`, and optional `styles.css`.
+
+## Environment & tooling
+
+- Node.js: use current LTS (Node 18+ recommended).
+- **Package manager: npm** (required for this sample - `package.json` defines npm scripts and dependencies).
+- **Bundler: esbuild** (required for this sample - `esbuild.config.mjs` and build scripts depend on it). Alternative bundlers like Rollup or webpack are acceptable for other projects if they bundle all external dependencies into `main.js`.
+- Types: `obsidian` type definitions.
+
+**Note**: This sample project has specific technical dependencies on npm and esbuild. If you're creating a plugin from scratch, you can choose different tools, but you'll need to replace the build configuration accordingly.
+
+### Install
+
+```bash
+npm install
+```
+
+### Dev (watch)
+
+```bash
+npm run dev
+```
+
+### Production build
+
+```bash
+npm run build
+```
+
+## Linting
+
+- To use eslint install eslint from terminal: `npm install -g eslint`
+- To use eslint to analyze this project use this command: `eslint main.ts`
+- eslint will then create a report with suggestions for code improvement by file and line number.
+- If your source code is in a folder, such as `src`, you can use eslint with this command to analyze all files in that folder: `eslint ./src/`
+
+## File & folder conventions
+
+- **Organize code into multiple files**: Split functionality across separate modules rather than putting everything in `main.ts`.
+- Source lives in `src/`. Keep `main.ts` small and focused on plugin lifecycle (loading, unloading, registering commands).
+- **Example file structure**:
+  ```
+  src/
+    main.ts           # Plugin entry point, lifecycle management
+    settings.ts       # Settings interface and defaults
+    commands/         # Command implementations
+      command1.ts
+      command2.ts
+    ui/              # UI components, modals, views
+      modal.ts
+      view.ts
+    utils/           # Utility functions, helpers
+      helpers.ts
+      constants.ts
+    types.ts         # TypeScript interfaces and types
+  ```
+- **Do not commit build artifacts**: Never commit `node_modules/`, `main.js`, or other generated files to version control.
+- Keep the plugin small. Avoid large dependencies. Prefer browser-compatible packages.
+- Generated output should be placed at the plugin root or `dist/` depending on your build setup. Release artifacts must end up at the top level of the plugin folder in the vault (`main.js`, `manifest.json`, `styles.css`).
+
+## Manifest rules (`manifest.json`)
+
+- Must include (non-exhaustive):  
+  - `id` (plugin ID; for local dev it should match the folder name)  
+  - `name`  
+  - `version` (Semantic Versioning `x.y.z`)  
+  - `minAppVersion`  
+  - `description`  
+  - `isDesktopOnly` (boolean)  
+  - Optional: `author`, `authorUrl`, `fundingUrl` (string or map)
+- Never change `id` after release. Treat it as stable API.
+- Keep `minAppVersion` accurate when using newer APIs.
+- Canonical requirements are coded here: https://github.com/obsidianmd/obsidian-releases/blob/master/.github/workflows/validate-plugin-entry.yml
+
+## Testing
+
+- Manual install for testing: copy `main.js`, `manifest.json`, `styles.css` (if any) to:
+  ```
+  <Vault>/.obsidian/plugins/<plugin-id>/
+  ```
+- Reload Obsidian and enable the plugin in **Settings → Community plugins**.
+
+## Commands & settings
+
+- Any user-facing commands should be added via `this.addCommand(...)`.
+- If the plugin has configuration, provide a settings tab and sensible defaults.
+- Persist settings using `this.loadData()` / `this.saveData()`.
+- Use stable command IDs; avoid renaming once released.
+
+## Versioning & releases
+
+- Bump `version` in `manifest.json` (SemVer) and update `versions.json` to map plugin version → minimum app version.
+- Create a GitHub release whose tag exactly matches `manifest.json`'s `version`. Do not use a leading `v`.
+- Attach `manifest.json`, `main.js`, and `styles.css` (if present) to the release as individual assets.
+- After the initial release, follow the process to add/update your plugin in the community catalog as required.
+
+## Security, privacy, and compliance
+
+Follow Obsidian's **Developer Policies** and **Plugin Guidelines**. In particular:
+
+- Default to local/offline operation. Only make network requests when essential to the feature.
+- No hidden telemetry. If you collect optional analytics or call third-party services, require explicit opt-in and document clearly in `README.md` and in settings.
+- Never execute remote code, fetch and eval scripts, or auto-update plugin code outside of normal releases.
+- Minimize scope: read/write only what's necessary inside the vault. Do not access files outside the vault.
+- Clearly disclose any external services used, data sent, and risks.
+- Respect user privacy. Do not collect vault contents, filenames, or personal information unless absolutely necessary and explicitly consented.
+- Avoid deceptive patterns, ads, or spammy notifications.
+- Register and clean up all DOM, app, and interval listeners using the provided `register*` helpers so the plugin unloads safely.
+
+## UX & copy guidelines (for UI text, commands, settings)
+
+- Prefer sentence case for headings, buttons, and titles.
+- Use clear, action-oriented imperatives in step-by-step copy.
+- Use **bold** to indicate literal UI labels. Prefer "select" for interactions.
+- Use arrow notation for navigation: **Settings → Community plugins**.
+- Keep in-app strings short, consistent, and free of jargon.
+
+## Performance
+
+- Keep startup light. Defer heavy work until needed.
+- Avoid long-running tasks during `onload`; use lazy initialization.
+- Batch disk access and avoid excessive vault scans.
+- Debounce/throttle expensive operations in response to file system events.
+
+## Coding conventions
+
+- TypeScript with `"strict": true` preferred.
+- **Keep `main.ts` minimal**: Focus only on plugin lifecycle (onload, onunload, addCommand calls). Delegate all feature logic to separate modules.
+- **Split large files**: If any file exceeds ~200-300 lines, consider breaking it into smaller, focused modules.
+- **Use clear module boundaries**: Each file should have a single, well-defined responsibility.
+- Bundle everything into `main.js` (no unbundled runtime deps).
+- Avoid Node/Electron APIs if you want mobile compatibility; set `isDesktopOnly` accordingly.
+- Prefer `async/await` over promise chains; handle errors gracefully.
+
+## Mobile
+
+- Where feasible, test on iOS and Android.
+- Don't assume desktop-only behavior unless `isDesktopOnly` is `true`.
+- Avoid large in-memory structures; be mindful of memory and storage constraints.
+
+## Agent do/don't
+
+**Do**
+- Add commands with stable IDs (don't rename once released).
+- Provide defaults and validation in settings.
+- Write idempotent code paths so reload/unload doesn't leak listeners or intervals.
+- Use `this.register*` helpers for everything that needs cleanup.
+
+**Don't**
+- Introduce network calls without an obvious user-facing reason and documentation.
+- Ship features that require cloud services without clear disclosure and explicit opt-in.
+- Store or transmit vault contents unless essential and consented.
+
+## Common tasks
+
+### Organize code across multiple files
+
+**main.ts** (minimal, lifecycle only):
+```ts
+import { Plugin } from "obsidian";
+import { MySettings, DEFAULT_SETTINGS } from "./settings";
+import { registerCommands } from "./commands";
+
+export default class MyPlugin extends Plugin {
+  settings: MySettings;
+
+  async onload() {
+    this.settings = Object.assign({}, DEFAULT_SETTINGS, await this.loadData());
+    registerCommands(this);
+  }
+}
+```
+
+**settings.ts**:
+```ts
+export interface MySettings {
+  enabled: boolean;
+  apiKey: string;
+}
+
+export const DEFAULT_SETTINGS: MySettings = {
+  enabled: true,
+  apiKey: "",
+};
+```
+
+**commands/index.ts**:
+```ts
+import { Plugin } from "obsidian";
+import { doSomething } from "./my-command";
+
+export function registerCommands(plugin: Plugin) {
+  plugin.addCommand({
+    id: "do-something",
+    name: "Do something",
+    callback: () => doSomething(plugin),
+  });
+}
+```
+
+### Add a command
+
+```ts
+this.addCommand({
+  id: "your-command-id",
+  name: "Do the thing",
+  callback: () => this.doTheThing(),
+});
+```
+
+### Persist settings
+
+```ts
+interface MySettings { enabled: boolean }
+const DEFAULT_SETTINGS: MySettings = { enabled: true };
+
+async onload() {
+  this.settings = Object.assign({}, DEFAULT_SETTINGS, await this.loadData());
+  await this.saveData(this.settings);
+}
+```
+
+### Register listeners safely
+
+```ts
+this.registerEvent(this.app.workspace.on("file-open", f => { /* ... */ }));
+this.registerDomEvent(window, "resize", () => { /* ... */ });
+this.registerInterval(window.setInterval(() => { /* ... */ }, 1000));
+```
+
+## Troubleshooting
+
+- Plugin doesn't load after build: ensure `main.js` and `manifest.json` are at the top level of the plugin folder under `<Vault>/.obsidian/plugins/<plugin-id>/`. 
+- Build issues: if `main.js` is missing, run `npm run build` or `npm run dev` to compile your TypeScript source code.
+- Commands not appearing: verify `addCommand` runs after `onload` and IDs are unique.
+- Settings not persisting: ensure `loadData`/`saveData` are awaited and you re-render the UI after changes.
+- Mobile-only issues: confirm you're not using desktop-only APIs; check `isDesktopOnly` and adjust.
+
+## References
+
+- Obsidian sample plugin: https://github.com/obsidianmd/obsidian-sample-plugin
+- API documentation: https://docs.obsidian.md
+- Developer policies: https://docs.obsidian.md/Developer+policies
+- Plugin guidelines: https://docs.obsidian.md/Plugins/Releasing/Plugin+guidelines
+- Style guide: https://help.obsidian.md/style-guide
diff --git a/surfsense_obsidian/LICENSE b/surfsense_obsidian/LICENSE
new file mode 100644
index 0000000000..261eeb9e9f
--- /dev/null
+++ b/surfsense_obsidian/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/surfsense_obsidian/README.md b/surfsense_obsidian/README.md
new file mode 100644
index 0000000000..71cb8566e9
--- /dev/null
+++ b/surfsense_obsidian/README.md
@@ -0,0 +1,150 @@
+# SurfSense for Obsidian
+
+Sync your Obsidian vault to [SurfSense](https://github.com/MODSetter/SurfSense)
+so your notes become searchable alongside the rest of your knowledge sources
+(GitHub, Slack, Linear, Drive, web pages, etc.) from any SurfSense chat.
+
+The plugin runs inside Obsidian itself, on desktop and mobile, so it works
+the same way for SurfSense Cloud and self-hosted deployments. There is no
+server-side vault mount and no Electron-only path; everything goes over HTTPS.
+
+## What it does
+
+- Realtime sync as you create, edit, rename, or delete notes
+- Initial scan + reconciliation against the server manifest on startup,
+  so vault edits made while the plugin was offline still show up
+- Persistent upload queue, so a crash or offline window never loses changes
+- Frontmatter, `[[wiki links]]`, `#tags`, headings, and resolved/unresolved
+  links are extracted and indexed
+- Each chat citation links straight back into Obsidian via the
+  `obsidian://open?vault=…&file=…` deep link
+- Multi-vault aware: each vault you enable the plugin in becomes its own
+  connector row in SurfSense, named after the vault
+
+## Install
+
+### Via [BRAT](https://github.com/TfTHacker/obsidian42-brat) (current)
+
+1. Install the BRAT community plugin.
+2. Run **BRAT: Add a beta plugin for testing**.
+3. Paste `MODSetter/SurfSense` and pick the latest release.
+4. Enable **SurfSense** in *Settings → Community plugins*.
+
+### Manual sideload
+
+1. Download `main.js`, `manifest.json`, and `styles.css` from the latest
+   GitHub release tagged with the plugin version (e.g. `0.1.0`, with no `v`
+   prefix, matching the `version` field in `manifest.json`).
+2. Copy them into `<vault>/.obsidian/plugins/surfsense/`.
+3. Restart Obsidian and enable the plugin.
+
+### Community plugin store
+
+Submission to the official Obsidian community plugin store is in progress.
+Once approved you will be able to install from *Settings → Community plugins*
+inside Obsidian.
+
+## Configure
+
+Open **Settings → SurfSense** in Obsidian and fill in:
+
+| Setting | Value |
+| --- | --- |
+| Server URL | `https://surfsense.com` for SurfSense Cloud, or your self-hosted URL |
+| API token | Copy from the *Connectors → Obsidian* dialog in the SurfSense web app |
+| Search space | Pick the search space this vault should sync into |
+| Vault name | Defaults to your Obsidian vault name; rename if you have multiple vaults |
+| Sync mode | *Auto* (recommended) or *Manual* |
+| Exclude patterns | Glob patterns of folders/files to skip (e.g. `.trash`, `_attachments`, `templates/**`) |
+| Include attachments | Off by default; enable to sync non-`.md` files |
+
+The connector row appears automatically inside SurfSense the first time the
+plugin successfully calls `/obsidian/connect`. You can manage or delete it
+from *Connectors → Obsidian* in the web app.
+
+> **Token lifetime.** The web app currently issues 24-hour JWTs. If you see
+> *"token expired"* in the plugin status bar, paste a fresh token from the
+> SurfSense web app. Long-lived personal access tokens are coming in a future
+> release.
+
+## Mobile
+
+The plugin works on Obsidian for iOS and Android. Sync runs whenever the
+app is in the foreground and once more on app close. Mobile OSes
+aggressively suspend background apps, so mobile sync is near-realtime rather
+than instant. Desktop is the source of truth for live editing.
+
+## Privacy & safety
+
+The SurfSense backend qualifies as server-side telemetry under Obsidian's
+[Developer policies](https://github.com/obsidianmd/obsidian-developer-docs/blob/main/en/Developer%20policies.md),
+so here is the full list of what the plugin sends and stores. The
+canonical SurfSense privacy policy lives at
+<https://surfsense.com/privacy>; this section is the plugin-specific
+addendum.
+
+**Sent on `/connect` (once per onload):**
+
+- `vault_id`: a random UUID minted in the plugin's `data.json` on first run
+- `vault_name`: the Obsidian vault folder name
+- `search_space_id`: the SurfSense search space you picked
+
+**Sent per note on `/sync`, `/rename`, `/delete`:**
+
+- `path`, `name`, `extension`
+- `content` (plain text of the note)
+- `frontmatter`, `tags`, `headings`, resolved and unresolved links,
+  `embeds`, `aliases`
+- `content_hash` (SHA-256 of the note body), `mtime`, `ctime`
+
+**Stored server-side per vault:**
+
+- One connector row keyed by `vault_id` with `{vault_name, source: "plugin",
+  last_connect_at}`. Nothing per-device, no plugin version, no analytics.
+- One `documents` row per note (soft-deleted rather than hard-deleted so
+  existing chat citations remain valid).
+
+**What never leaves the plugin:**
+
+- No remote code loading, no `eval`, no analytics.
+- All network traffic goes to your configured **Server URL** only.
+- The `Authorization: Bearer …` header is set per-request with the token
+  you paste; the plugin never reads cookies or other Obsidian state.
+- The plugin uses Obsidian's `requestUrl` (no `fetch`, no `node:http`,
+  no `node:https`) and Web Crypto for hashing, per Obsidian's mobile guidance.
+
+For retention, deletion, and contact details see
+<https://surfsense.com/privacy>.
+
+## Development
+
+This plugin lives in [`surfsense_obsidian/`](.) inside the SurfSense
+monorepo. To work on it locally:
+
+```sh
+cd surfsense_obsidian
+npm install
+npm run dev   # esbuild in watch mode → main.js
+```
+
+Symlink the folder into a test vault's `.obsidian/plugins/surfsense/`,
+enable the plugin, then **Cmd+R** in Obsidian whenever `main.js` rebuilds.
+
+Lint:
+
+```sh
+npm run lint
+```
+
+The release pipeline lives at
+[`.github/workflows/release-obsidian-plugin.yml`](../.github/workflows/release-obsidian-plugin.yml)
+in the repo root and is triggered by tags of the form `obsidian-v0.1.0`.
+It verifies the tag matches `manifest.json`, builds the plugin, attaches
+`main.js` + `manifest.json` + `styles.css` to a GitHub release tagged with
+the bare version (e.g. `0.1.0`, the form BRAT and the Obsidian community
+store look for), and mirrors `manifest.json` + `versions.json` to the repo
+root so Obsidian's community plugin browser can discover them.
+
+## License
+
+[Apache-2.0](LICENSE), same as the rest of SurfSense.
diff --git a/surfsense_obsidian/esbuild.config.mjs b/surfsense_obsidian/esbuild.config.mjs
new file mode 100644
index 0000000000..1c74a149e0
--- /dev/null
+++ b/surfsense_obsidian/esbuild.config.mjs
@@ -0,0 +1,49 @@
+import esbuild from "esbuild";
+import process from "process";
+import { builtinModules } from 'node:module';
+
+const banner =
+`/*
+THIS IS A GENERATED/BUNDLED FILE BY ESBUILD
+if you want to view the source, please visit the github repository of this plugin
+*/
+`;
+
+const prod = (process.argv[2] === "production");
+
+const context = await esbuild.context({
+	banner: {
+		js: banner,
+	},
+	entryPoints: ["src/main.ts"],
+	bundle: true,
+	external: [
+		"obsidian",
+		"electron",
+		"@codemirror/autocomplete",
+		"@codemirror/collab",
+		"@codemirror/commands",
+		"@codemirror/language",
+		"@codemirror/lint",
+		"@codemirror/search",
+		"@codemirror/state",
+		"@codemirror/view",
+		"@lezer/common",
+		"@lezer/highlight",
+		"@lezer/lr",
+		...builtinModules],
+	format: "cjs",
+	target: "es2018",
+	logLevel: "info",
+	sourcemap: prod ? false : "inline",
+	treeShaking: true,
+	outfile: "main.js",
+	minify: prod,
+});
+
+if (prod) {
+	await context.rebuild();
+	process.exit(0);
+} else {
+	await context.watch();
+}
diff --git a/surfsense_obsidian/eslint.config.mts b/surfsense_obsidian/eslint.config.mts
new file mode 100644
index 0000000000..a2615ae6d4
--- /dev/null
+++ b/surfsense_obsidian/eslint.config.mts
@@ -0,0 +1,55 @@
+import tseslint from 'typescript-eslint';
+import obsidianmd from "eslint-plugin-obsidianmd";
+import globals from "globals";
+import { globalIgnores } from "eslint/config";
+
+export default tseslint.config(
+	{
+		languageOptions: {
+			globals: {
+				...globals.browser,
+			},
+			parserOptions: {
+				projectService: {
+					allowDefaultProject: [
+						'eslint.config.js',
+						'manifest.json'
+					]
+				},
+				tsconfigRootDir: import.meta.dirname,
+				extraFileExtensions: ['.json']
+			},
+		},
+	},
+	...obsidianmd.configs.recommended,
+	{
+		plugins: { obsidianmd },
+		rules: {
+			"obsidianmd/ui/sentence-case": [
+				"error",
+				{
+					brands: [
+						"Surfsense",
+						"iOS",
+						"iPadOS",
+						"macOS",
+						"Windows",
+						"Android",
+						"Linux",
+						"Obsidian",
+						"Markdown",
+					],
+				},
+			],
+		},
+	},
+	globalIgnores([
+		"node_modules",
+		"dist",
+		"esbuild.config.mjs",
+		"eslint.config.js",
+		"version-bump.mjs",
+		"versions.json",
+		"main.js",
+	]),
+);
diff --git a/surfsense_obsidian/manifest.json b/surfsense_obsidian/manifest.json
new file mode 100644
index 0000000000..d03a5b6500
--- /dev/null
+++ b/surfsense_obsidian/manifest.json
@@ -0,0 +1,10 @@
+{
+	"id": "surfsense-obsidian",
+	"name": "SurfSense",
+	"version": "0.1.0",
+	"minAppVersion": "1.5.4",
+	"description": "Turn your vault into a searchable second brain with SurfSense.",
+	"author": "SurfSense",
+	"authorUrl": "https://www.surfsense.com",
+	"isDesktopOnly": false
+}
diff --git a/surfsense_obsidian/package-lock.json b/surfsense_obsidian/package-lock.json
new file mode 100644
index 0000000000..e62b898854
--- /dev/null
+++ b/surfsense_obsidian/package-lock.json
@@ -0,0 +1,5170 @@
+{
+	"name": "surfsense-obsidian",
+	"version": "0.1.0",
+	"lockfileVersion": 3,
+	"requires": true,
+	"packages": {
+		"": {
+			"name": "surfsense-obsidian",
+			"version": "0.1.0",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"obsidian": "latest"
+			},
+			"devDependencies": {
+				"@eslint/js": "9.30.1",
+				"@types/node": "^20.19.39",
+				"esbuild": "0.25.5",
+				"eslint-plugin-obsidianmd": "0.1.9",
+				"globals": "14.0.0",
+				"jiti": "2.6.1",
+				"tslib": "2.4.0",
+				"typescript": "^5.8.3",
+				"typescript-eslint": "8.35.1"
+			}
+		},
+		"node_modules/@codemirror/state": {
+			"version": "6.5.0",
+			"resolved": "https://registry.npmjs.org/@codemirror/state/-/state-6.5.0.tgz",
+			"integrity": "sha512-MwBHVK60IiIHDcoMet78lxt6iw5gJOGSbNbOIVBHWVXIH4/Nq1+GQgLLGgI1KlnN86WDXsPudVaqYHKBIx7Eyw==",
+			"license": "MIT",
+			"peer": true,
+			"dependencies": {
+				"@marijn/find-cluster-break": "^1.0.0"
+			}
+		},
+		"node_modules/@codemirror/view": {
+			"version": "6.38.6",
+			"resolved": "https://registry.npmjs.org/@codemirror/view/-/view-6.38.6.tgz",
+			"integrity": "sha512-qiS0z1bKs5WOvHIAC0Cybmv4AJSkAXgX5aD6Mqd2epSLlVJsQl8NG23jCVouIgkh4All/mrbdsf2UOLFnJw0tw==",
+			"license": "MIT",
+			"peer": true,
+			"dependencies": {
+				"@codemirror/state": "^6.5.0",
+				"crelt": "^1.0.6",
+				"style-mod": "^4.1.0",
+				"w3c-keyname": "^2.2.4"
+			}
+		},
+		"node_modules/@esbuild/aix-ppc64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.5.tgz",
+			"integrity": "sha512-9o3TMmpmftaCMepOdA5k/yDw8SfInyzWWTjYTFCX3kPSDJMROQTb8jg+h9Cnwnmm1vOzvxN7gIfB5V2ewpjtGA==",
+			"cpu": [
+				"ppc64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"aix"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/android-arm": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.5.tgz",
+			"integrity": "sha512-AdJKSPeEHgi7/ZhuIPtcQKr5RQdo6OO2IL87JkianiMYMPbCtot9fxPbrMiBADOWWm3T2si9stAiVsGbTQFkbA==",
+			"cpu": [
+				"arm"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"android"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/android-arm64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.5.tgz",
+			"integrity": "sha512-VGzGhj4lJO+TVGV1v8ntCZWJktV7SGCs3Pn1GRWI1SBFtRALoomm8k5E9Pmwg3HOAal2VDc2F9+PM/rEY6oIDg==",
+			"cpu": [
+				"arm64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"android"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/android-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.5.tgz",
+			"integrity": "sha512-D2GyJT1kjvO//drbRT3Hib9XPwQeWd9vZoBJn+bu/lVsOZ13cqNdDeqIF/xQ5/VmWvMduP6AmXvylO/PIc2isw==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"android"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/darwin-arm64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.5.tgz",
+			"integrity": "sha512-GtaBgammVvdF7aPIgH2jxMDdivezgFu6iKpmT+48+F8Hhg5J/sfnDieg0aeG/jfSvkYQU2/pceFPDKlqZzwnfQ==",
+			"cpu": [
+				"arm64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"darwin"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/darwin-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.5.tgz",
+			"integrity": "sha512-1iT4FVL0dJ76/q1wd7XDsXrSW+oLoquptvh4CLR4kITDtqi2e/xwXwdCVH8hVHU43wgJdsq7Gxuzcs6Iq/7bxQ==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"darwin"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/freebsd-arm64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.5.tgz",
+			"integrity": "sha512-nk4tGP3JThz4La38Uy/gzyXtpkPW8zSAmoUhK9xKKXdBCzKODMc2adkB2+8om9BDYugz+uGV7sLmpTYzvmz6Sw==",
+			"cpu": [
+				"arm64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"freebsd"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/freebsd-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.5.tgz",
+			"integrity": "sha512-PrikaNjiXdR2laW6OIjlbeuCPrPaAl0IwPIaRv+SMV8CiM8i2LqVUHFC1+8eORgWyY7yhQY+2U2fA55mBzReaw==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"freebsd"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-arm": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.5.tgz",
+			"integrity": "sha512-cPzojwW2okgh7ZlRpcBEtsX7WBuqbLrNXqLU89GxWbNt6uIg78ET82qifUy3W6OVww6ZWobWub5oqZOVtwolfw==",
+			"cpu": [
+				"arm"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-arm64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.5.tgz",
+			"integrity": "sha512-Z9kfb1v6ZlGbWj8EJk9T6czVEjjq2ntSYLY2cw6pAZl4oKtfgQuS4HOq41M/BcoLPzrUbNd+R4BXFyH//nHxVg==",
+			"cpu": [
+				"arm64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-ia32": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.5.tgz",
+			"integrity": "sha512-sQ7l00M8bSv36GLV95BVAdhJ2QsIbCuCjh/uYrWiMQSUuV+LpXwIqhgJDcvMTj+VsQmqAHL2yYaasENvJ7CDKA==",
+			"cpu": [
+				"ia32"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-loong64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.5.tgz",
+			"integrity": "sha512-0ur7ae16hDUC4OL5iEnDb0tZHDxYmuQyhKhsPBV8f99f6Z9KQM02g33f93rNH5A30agMS46u2HP6qTdEt6Q1kg==",
+			"cpu": [
+				"loong64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-mips64el": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.5.tgz",
+			"integrity": "sha512-kB/66P1OsHO5zLz0i6X0RxlQ+3cu0mkxS3TKFvkb5lin6uwZ/ttOkP3Z8lfR9mJOBk14ZwZ9182SIIWFGNmqmg==",
+			"cpu": [
+				"mips64el"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-ppc64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.5.tgz",
+			"integrity": "sha512-UZCmJ7r9X2fe2D6jBmkLBMQetXPXIsZjQJCjgwpVDz+YMcS6oFR27alkgGv3Oqkv07bxdvw7fyB71/olceJhkQ==",
+			"cpu": [
+				"ppc64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-riscv64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.5.tgz",
+			"integrity": "sha512-kTxwu4mLyeOlsVIFPfQo+fQJAV9mh24xL+y+Bm6ej067sYANjyEw1dNHmvoqxJUCMnkBdKpvOn0Ahql6+4VyeA==",
+			"cpu": [
+				"riscv64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-s390x": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.5.tgz",
+			"integrity": "sha512-K2dSKTKfmdh78uJ3NcWFiqyRrimfdinS5ErLSn3vluHNeHVnBAFWC8a4X5N+7FgVE1EjXS1QDZbpqZBjfrqMTQ==",
+			"cpu": [
+				"s390x"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/linux-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.5.tgz",
+			"integrity": "sha512-uhj8N2obKTE6pSZ+aMUbqq+1nXxNjZIIjCjGLfsWvVpy7gKCOL6rsY1MhRh9zLtUtAI7vpgLMK6DxjO8Qm9lJw==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"linux"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/netbsd-arm64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.5.tgz",
+			"integrity": "sha512-pwHtMP9viAy1oHPvgxtOv+OkduK5ugofNTVDilIzBLpoWAM16r7b/mxBvfpuQDpRQFMfuVr5aLcn4yveGvBZvw==",
+			"cpu": [
+				"arm64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"netbsd"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/netbsd-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.5.tgz",
+			"integrity": "sha512-WOb5fKrvVTRMfWFNCroYWWklbnXH0Q5rZppjq0vQIdlsQKuw6mdSihwSo4RV/YdQ5UCKKvBy7/0ZZYLBZKIbwQ==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"netbsd"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/openbsd-arm64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.5.tgz",
+			"integrity": "sha512-7A208+uQKgTxHd0G0uqZO8UjK2R0DDb4fDmERtARjSHWxqMTye4Erz4zZafx7Di9Cv+lNHYuncAkiGFySoD+Mw==",
+			"cpu": [
+				"arm64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"openbsd"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/openbsd-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.5.tgz",
+			"integrity": "sha512-G4hE405ErTWraiZ8UiSoesH8DaCsMm0Cay4fsFWOOUcz8b8rC6uCvnagr+gnioEjWn0wC+o1/TAHt+It+MpIMg==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"openbsd"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/sunos-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.5.tgz",
+			"integrity": "sha512-l+azKShMy7FxzY0Rj4RCt5VD/q8mG/e+mDivgspo+yL8zW7qEwctQ6YqKX34DTEleFAvCIUviCFX1SDZRSyMQA==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"sunos"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/win32-arm64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.5.tgz",
+			"integrity": "sha512-O2S7SNZzdcFG7eFKgvwUEZ2VG9D/sn/eIiz8XRZ1Q/DO5a3s76Xv0mdBzVM5j5R639lXQmPmSo0iRpHqUUrsxw==",
+			"cpu": [
+				"arm64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"win32"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/win32-ia32": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.5.tgz",
+			"integrity": "sha512-onOJ02pqs9h1iMJ1PQphR+VZv8qBMQ77Klcsqv9CNW2w6yLqoURLcgERAIurY6QE63bbLuqgP9ATqajFLK5AMQ==",
+			"cpu": [
+				"ia32"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"win32"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@esbuild/win32-x64": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.5.tgz",
+			"integrity": "sha512-TXv6YnJ8ZMVdX+SXWVBo/0p8LTcrUYngpWjvm91TMjjBQii7Oz11Lw5lbDV5Y0TzuhSJHwiH4hEtC1I42mMS0g==",
+			"cpu": [
+				"x64"
+			],
+			"dev": true,
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"win32"
+			],
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@eslint-community/eslint-utils": {
+			"version": "4.9.0",
+			"resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.9.0.tgz",
+			"integrity": "sha512-ayVFHdtZ+hsq1t2Dy24wCmGXGe4q9Gu3smhLYALJrr473ZH27MsnSL+LKUlimp4BWJqMDMLmPpx/Q9R3OAlL4g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"eslint-visitor-keys": "^3.4.3"
+			},
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			},
+			"peerDependencies": {
+				"eslint": "^6.0.0 || ^7.0.0 || >=8.0.0"
+			}
+		},
+		"node_modules/@eslint-community/eslint-utils/node_modules/eslint-visitor-keys": {
+			"version": "3.4.3",
+			"resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz",
+			"integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/@eslint-community/regexpp": {
+			"version": "4.12.2",
+			"resolved": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.12.2.tgz",
+			"integrity": "sha512-EriSTlt5OC9/7SXkRSCAhfSxxoSUgBm33OH+IkwbdpgoqsSsUg7y3uh+IICI/Qg4BBWr3U2i39RpmycbxMq4ew==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": "^12.0.0 || ^14.0.0 || >=16.0.0"
+			}
+		},
+		"node_modules/@eslint/config-array": {
+			"version": "0.21.1",
+			"resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.1.tgz",
+			"integrity": "sha512-aw1gNayWpdI/jSYVgzN5pL0cfzU02GT3NBpeT/DXbx1/1x7ZKxFPd9bwrzygx/qiwIQiJ1sw/zD8qY/kRvlGHA==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"@eslint/object-schema": "^2.1.7",
+				"debug": "^4.3.1",
+				"minimatch": "^3.1.2"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			}
+		},
+		"node_modules/@eslint/config-helpers": {
+			"version": "0.4.2",
+			"resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.4.2.tgz",
+			"integrity": "sha512-gBrxN88gOIf3R7ja5K9slwNayVcZgK6SOUORm2uBzTeIEfeVaIhOpCtTox3P6R7o2jLFwLFTLnC7kU/RGcYEgw==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"@eslint/core": "^0.17.0"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			}
+		},
+		"node_modules/@eslint/core": {
+			"version": "0.17.0",
+			"resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.17.0.tgz",
+			"integrity": "sha512-yL/sLrpmtDaFEiUj1osRP4TI2MDz1AddJL+jZ7KSqvBuliN4xqYY54IfdN8qD8Toa6g1iloph1fxQNkjOxrrpQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"@types/json-schema": "^7.0.15"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			}
+		},
+		"node_modules/@eslint/eslintrc": {
+			"version": "3.3.1",
+			"resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.3.1.tgz",
+			"integrity": "sha512-gtF186CXhIl1p4pJNGZw8Yc6RlshoePRvE0X91oPGb3vZ8pM3qOS9W9NGPat9LziaBV7XrJWGylNQXkGcnM3IQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ajv": "^6.12.4",
+				"debug": "^4.3.2",
+				"espree": "^10.0.1",
+				"globals": "^14.0.0",
+				"ignore": "^5.2.0",
+				"import-fresh": "^3.2.1",
+				"js-yaml": "^4.1.0",
+				"minimatch": "^3.1.2",
+				"strip-json-comments": "^3.1.1"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/@eslint/js": {
+			"version": "9.30.1",
+			"resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.30.1.tgz",
+			"integrity": "sha512-zXhuECFlyep42KZUhWjfvsmXGX39W8K8LFb8AWXM9gSV9dQB+MrJGLKvW6Zw0Ggnbpw0VHTtrhFXYe3Gym18jg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://eslint.org/donate"
+			}
+		},
+		"node_modules/@eslint/json": {
+			"version": "0.14.0",
+			"resolved": "https://registry.npmjs.org/@eslint/json/-/json-0.14.0.tgz",
+			"integrity": "sha512-rvR/EZtvUG3p9uqrSmcDJPYSH7atmWr0RnFWN6m917MAPx82+zQgPUmDu0whPFG6XTyM0vB/hR6c1Q63OaYtCQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"peer": true,
+			"dependencies": {
+				"@eslint/core": "^0.17.0",
+				"@eslint/plugin-kit": "^0.4.1",
+				"@humanwhocodes/momoa": "^3.3.10",
+				"natural-compare": "^1.4.0"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			}
+		},
+		"node_modules/@eslint/object-schema": {
+			"version": "2.1.7",
+			"resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.7.tgz",
+			"integrity": "sha512-VtAOaymWVfZcmZbp6E2mympDIHvyjXs/12LqWYjVw6qjrfF+VK+fyG33kChz3nnK+SU5/NeHOqrTEHS8sXO3OA==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			}
+		},
+		"node_modules/@eslint/plugin-kit": {
+			"version": "0.4.1",
+			"resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.4.1.tgz",
+			"integrity": "sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"@eslint/core": "^0.17.0",
+				"levn": "^0.4.1"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			}
+		},
+		"node_modules/@humanfs/core": {
+			"version": "0.19.1",
+			"resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",
+			"integrity": "sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": ">=18.18.0"
+			}
+		},
+		"node_modules/@humanfs/node": {
+			"version": "0.16.7",
+			"resolved": "https://registry.npmjs.org/@humanfs/node/-/node-0.16.7.tgz",
+			"integrity": "sha512-/zUx+yOsIrG4Y43Eh2peDeKCxlRt/gET6aHfaKpuq267qXdYDFViVHfMaLyygZOnl0kGWxFIgsBy8QFuTLUXEQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"@humanfs/core": "^0.19.1",
+				"@humanwhocodes/retry": "^0.4.0"
+			},
+			"engines": {
+				"node": ">=18.18.0"
+			}
+		},
+		"node_modules/@humanwhocodes/module-importer": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz",
+			"integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": ">=12.22"
+			},
+			"funding": {
+				"type": "github",
+				"url": "https://github.com/sponsors/nzakas"
+			}
+		},
+		"node_modules/@humanwhocodes/momoa": {
+			"version": "3.3.10",
+			"resolved": "https://registry.npmjs.org/@humanwhocodes/momoa/-/momoa-3.3.10.tgz",
+			"integrity": "sha512-KWiFQpSAqEIyrTXko3hFNLeQvSK8zXlJQzhhxsyVn58WFRYXST99b3Nqnu+ttOtjds2Pl2grUHGpe2NzhPynuQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"peer": true,
+			"engines": {
+				"node": ">=18"
+			}
+		},
+		"node_modules/@humanwhocodes/retry": {
+			"version": "0.4.3",
+			"resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.4.3.tgz",
+			"integrity": "sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": ">=18.18"
+			},
+			"funding": {
+				"type": "github",
+				"url": "https://github.com/sponsors/nzakas"
+			}
+		},
+		"node_modules/@marijn/find-cluster-break": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/@marijn/find-cluster-break/-/find-cluster-break-1.0.2.tgz",
+			"integrity": "sha512-l0h88YhZFyKdXIFNfSWpyjStDjGHwZ/U7iobcK1cQQD8sejsONdQtTVU+1wVN1PBw40PiiHB1vA5S7VTfQiP9g==",
+			"license": "MIT",
+			"peer": true
+		},
+		"node_modules/@microsoft/eslint-plugin-sdl": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@microsoft/eslint-plugin-sdl/-/eslint-plugin-sdl-1.1.0.tgz",
+			"integrity": "sha512-dxdNHOemLnBhfY3eByrujX9KyLigcNtW8sU+axzWv5nLGcsSBeKW2YYyTpfPo1hV8YPOmIGnfA4fZHyKVtWqBQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"eslint-plugin-n": "17.10.3",
+				"eslint-plugin-react": "7.37.3",
+				"eslint-plugin-security": "1.4.0"
+			},
+			"engines": {
+				"node": ">=18.0.0"
+			},
+			"peerDependencies": {
+				"eslint": "^9"
+			}
+		},
+		"node_modules/@microsoft/eslint-plugin-sdl/node_modules/eslint-plugin-security": {
+			"version": "1.4.0",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-security/-/eslint-plugin-security-1.4.0.tgz",
+			"integrity": "sha512-xlS7P2PLMXeqfhyf3NpqbvbnW04kN8M9NtmhpR3XGyOvt/vNKS7XPXT5EDbwKW9vCjWH4PpfQvgD/+JgN0VJKA==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"safe-regex": "^1.1.0"
+			}
+		},
+		"node_modules/@microsoft/eslint-plugin-sdl/node_modules/safe-regex": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz",
+			"integrity": "sha512-aJXcif4xnaNUzvUuC5gcb46oTS7zvg4jpMTnuqtrEPlR3vFr4pxtdTwaF1Qs3Enjn9HK+ZlwQui+a7z0SywIzg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ret": "~0.1.10"
+			}
+		},
+		"node_modules/@nodelib/fs.scandir": {
+			"version": "2.1.5",
+			"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
+			"integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@nodelib/fs.stat": "2.0.5",
+				"run-parallel": "^1.1.9"
+			},
+			"engines": {
+				"node": ">= 8"
+			}
+		},
+		"node_modules/@nodelib/fs.stat": {
+			"version": "2.0.5",
+			"resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz",
+			"integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 8"
+			}
+		},
+		"node_modules/@nodelib/fs.walk": {
+			"version": "1.2.8",
+			"resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz",
+			"integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@nodelib/fs.scandir": "2.1.5",
+				"fastq": "^1.6.0"
+			},
+			"engines": {
+				"node": ">= 8"
+			}
+		},
+		"node_modules/@pkgr/core": {
+			"version": "0.1.2",
+			"resolved": "https://registry.npmjs.org/@pkgr/core/-/core-0.1.2.tgz",
+			"integrity": "sha512-fdDH1LSGfZdTH2sxdpVMw31BanV28K/Gry0cVFxaNP77neJSkd82mM8ErPNYs9e+0O7SdHBLTDzDgwUuy18RnQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": "^12.20.0 || ^14.18.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/unts"
+			}
+		},
+		"node_modules/@rtsao/scc": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@rtsao/scc/-/scc-1.1.0.tgz",
+			"integrity": "sha512-zt6OdqaDoOnJ1ZYsCYGt9YmWzDXl4vQdKTyJev62gFhRGKdx7mcT54V9KIjg+d2wi9EXsPvAPKe7i7WjfVWB8g==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/@types/codemirror": {
+			"version": "5.60.8",
+			"resolved": "https://registry.npmjs.org/@types/codemirror/-/codemirror-5.60.8.tgz",
+			"integrity": "sha512-VjFgDF/eB+Aklcy15TtOTLQeMjTo07k7KAjql8OK5Dirr7a6sJY4T1uVBDuTVG9VEmn1uUsohOpYnVfgC6/jyw==",
+			"license": "MIT",
+			"dependencies": {
+				"@types/tern": "*"
+			}
+		},
+		"node_modules/@types/eslint": {
+			"version": "8.56.2",
+			"resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-8.56.2.tgz",
+			"integrity": "sha512-uQDwm1wFHmbBbCZCqAlq6Do9LYwByNZHWzXppSnay9SuwJ+VRbjkbLABer54kcPnMSlG6Fdiy2yaFXm/z9Z5gw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@types/estree": "*",
+				"@types/json-schema": "*"
+			}
+		},
+		"node_modules/@types/estree": {
+			"version": "1.0.8",
+			"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
+			"integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==",
+			"license": "MIT"
+		},
+		"node_modules/@types/json-schema": {
+			"version": "7.0.15",
+			"resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz",
+			"integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/@types/json5": {
+			"version": "0.0.29",
+			"resolved": "https://registry.npmjs.org/@types/json5/-/json5-0.0.29.tgz",
+			"integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/@types/node": {
+			"version": "20.19.39",
+			"resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.39.tgz",
+			"integrity": "sha512-orrrD74MBUyK8jOAD/r0+lfa1I2MO6I+vAkmAWzMYbCcgrN4lCrmK52gRFQq/JRxfYPfonkr4b0jcY7Olqdqbw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"undici-types": "~6.21.0"
+			}
+		},
+		"node_modules/@types/node/node_modules/undici-types": {
+			"version": "6.21.0",
+			"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
+			"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/@types/tern": {
+			"version": "0.23.9",
+			"resolved": "https://registry.npmjs.org/@types/tern/-/tern-0.23.9.tgz",
+			"integrity": "sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==",
+			"license": "MIT",
+			"dependencies": {
+				"@types/estree": "*"
+			}
+		},
+		"node_modules/@typescript-eslint/eslint-plugin": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.35.1.tgz",
+			"integrity": "sha512-9XNTlo7P7RJxbVeICaIIIEipqxLKguyh+3UbXuT2XQuFp6d8VOeDEGuz5IiX0dgZo8CiI6aOFLg4e8cF71SFVg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@eslint-community/regexpp": "^4.10.0",
+				"@typescript-eslint/scope-manager": "8.35.1",
+				"@typescript-eslint/type-utils": "8.35.1",
+				"@typescript-eslint/utils": "8.35.1",
+				"@typescript-eslint/visitor-keys": "8.35.1",
+				"graphemer": "^1.4.0",
+				"ignore": "^7.0.0",
+				"natural-compare": "^1.4.0",
+				"ts-api-utils": "^2.1.0"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"@typescript-eslint/parser": "^8.35.1",
+				"eslint": "^8.57.0 || ^9.0.0",
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/@typescript-eslint/eslint-plugin/node_modules/ignore": {
+			"version": "7.0.5",
+			"resolved": "https://registry.npmjs.org/ignore/-/ignore-7.0.5.tgz",
+			"integrity": "sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 4"
+			}
+		},
+		"node_modules/@typescript-eslint/parser": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.35.1.tgz",
+			"integrity": "sha512-3MyiDfrfLeK06bi/g9DqJxP5pV74LNv4rFTyvGDmT3x2p1yp1lOd+qYZfiRPIOf/oON+WRZR5wxxuF85qOar+w==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@typescript-eslint/scope-manager": "8.35.1",
+				"@typescript-eslint/types": "8.35.1",
+				"@typescript-eslint/typescript-estree": "8.35.1",
+				"@typescript-eslint/visitor-keys": "8.35.1",
+				"debug": "^4.3.4"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"eslint": "^8.57.0 || ^9.0.0",
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/@typescript-eslint/project-service": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/project-service/-/project-service-8.35.1.tgz",
+			"integrity": "sha512-VYxn/5LOpVxADAuP3NrnxxHYfzVtQzLKeldIhDhzC8UHaiQvYlXvKuVho1qLduFbJjjy5U5bkGwa3rUGUb1Q6Q==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@typescript-eslint/tsconfig-utils": "^8.35.1",
+				"@typescript-eslint/types": "^8.35.1",
+				"debug": "^4.3.4"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/@typescript-eslint/scope-manager": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.35.1.tgz",
+			"integrity": "sha512-s/Bpd4i7ht2934nG+UoSPlYXd08KYz3bmjLEb7Ye1UVob0d1ENiT3lY8bsCmik4RqfSbPw9xJJHbugpPpP5JUg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@typescript-eslint/types": "8.35.1",
+				"@typescript-eslint/visitor-keys": "8.35.1"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			}
+		},
+		"node_modules/@typescript-eslint/tsconfig-utils": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/tsconfig-utils/-/tsconfig-utils-8.35.1.tgz",
+			"integrity": "sha512-K5/U9VmT9dTHoNowWZpz+/TObS3xqC5h0xAIjXPw+MNcKV9qg6eSatEnmeAwkjHijhACH0/N7bkhKvbt1+DXWQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/@typescript-eslint/type-utils": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.35.1.tgz",
+			"integrity": "sha512-HOrUBlfVRz5W2LIKpXzZoy6VTZzMu2n8q9C2V/cFngIC5U1nStJgv0tMV4sZPzdf4wQm9/ToWUFPMN9Vq9VJQQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@typescript-eslint/typescript-estree": "8.35.1",
+				"@typescript-eslint/utils": "8.35.1",
+				"debug": "^4.3.4",
+				"ts-api-utils": "^2.1.0"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"eslint": "^8.57.0 || ^9.0.0",
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/@typescript-eslint/types": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.35.1.tgz",
+			"integrity": "sha512-q/O04vVnKHfrrhNAscndAn1tuQhIkwqnaW+eu5waD5IPts2eX1dgJxgqcPx5BX109/qAz7IG6VrEPTOYKCNfRQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			}
+		},
+		"node_modules/@typescript-eslint/typescript-estree": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.35.1.tgz",
+			"integrity": "sha512-Vvpuvj4tBxIka7cPs6Y1uvM7gJgdF5Uu9F+mBJBPY4MhvjrjWGK4H0lVgLJd/8PWZ23FTqsaJaLEkBCFUk8Y9g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@typescript-eslint/project-service": "8.35.1",
+				"@typescript-eslint/tsconfig-utils": "8.35.1",
+				"@typescript-eslint/types": "8.35.1",
+				"@typescript-eslint/visitor-keys": "8.35.1",
+				"debug": "^4.3.4",
+				"fast-glob": "^3.3.2",
+				"is-glob": "^4.0.3",
+				"minimatch": "^9.0.4",
+				"semver": "^7.6.0",
+				"ts-api-utils": "^2.1.0"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
+			"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"balanced-match": "^1.0.0"
+			}
+		},
+		"node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": {
+			"version": "9.0.5",
+			"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
+			"integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"brace-expansion": "^2.0.1"
+			},
+			"engines": {
+				"node": ">=16 || 14 >=14.17"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/isaacs"
+			}
+		},
+		"node_modules/@typescript-eslint/typescript-estree/node_modules/semver": {
+			"version": "7.7.3",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
+			"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
+			"dev": true,
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/@typescript-eslint/utils": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.35.1.tgz",
+			"integrity": "sha512-lhnwatFmOFcazAsUm3ZnZFpXSxiwoa1Lj50HphnDe1Et01NF4+hrdXONSUHIcbVu2eFb1bAf+5yjXkGVkXBKAQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@eslint-community/eslint-utils": "^4.7.0",
+				"@typescript-eslint/scope-manager": "8.35.1",
+				"@typescript-eslint/types": "8.35.1",
+				"@typescript-eslint/typescript-estree": "8.35.1"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"eslint": "^8.57.0 || ^9.0.0",
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/@typescript-eslint/visitor-keys": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.35.1.tgz",
+			"integrity": "sha512-VRwixir4zBWCSTP/ljEo091lbpypz57PoeAQ9imjG+vbeof9LplljsL1mos4ccG6H9IjfrVGM359RozUnuFhpw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@typescript-eslint/types": "8.35.1",
+				"eslint-visitor-keys": "^4.2.1"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			}
+		},
+		"node_modules/acorn": {
+			"version": "8.15.0",
+			"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
+			"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
+			"dev": true,
+			"license": "MIT",
+			"bin": {
+				"acorn": "bin/acorn"
+			},
+			"engines": {
+				"node": ">=0.4.0"
+			}
+		},
+		"node_modules/acorn-jsx": {
+			"version": "5.3.2",
+			"resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz",
+			"integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==",
+			"dev": true,
+			"license": "MIT",
+			"peerDependencies": {
+				"acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
+			}
+		},
+		"node_modules/ajv": {
+			"version": "6.12.6",
+			"resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
+			"integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"fast-deep-equal": "^3.1.1",
+				"fast-json-stable-stringify": "^2.0.0",
+				"json-schema-traverse": "^0.4.1",
+				"uri-js": "^4.2.2"
+			},
+			"funding": {
+				"type": "github",
+				"url": "https://github.com/sponsors/epoberezkin"
+			}
+		},
+		"node_modules/ansi-styles": {
+			"version": "4.3.0",
+			"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+			"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"color-convert": "^2.0.1"
+			},
+			"engines": {
+				"node": ">=8"
+			},
+			"funding": {
+				"url": "https://github.com/chalk/ansi-styles?sponsor=1"
+			}
+		},
+		"node_modules/argparse": {
+			"version": "2.0.1",
+			"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
+			"integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
+			"dev": true,
+			"license": "Python-2.0"
+		},
+		"node_modules/array-buffer-byte-length": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.2.tgz",
+			"integrity": "sha512-LHE+8BuR7RYGDKvnrmcuSq3tDcKv9OFEXQt/HpbZhY7V6h0zlUXutnAD82GiFx9rdieCMjkvtcsPqBwgUl1Iiw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"is-array-buffer": "^3.0.5"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/array-includes": {
+			"version": "3.1.9",
+			"resolved": "https://registry.npmjs.org/array-includes/-/array-includes-3.1.9.tgz",
+			"integrity": "sha512-FmeCCAenzH0KH381SPT5FZmiA/TmpndpcaShhfgEN9eCVjnFBqq3l1xrI42y8+PPLI6hypzou4GXw00WHmPBLQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.4",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.24.0",
+				"es-object-atoms": "^1.1.1",
+				"get-intrinsic": "^1.3.0",
+				"is-string": "^1.1.1",
+				"math-intrinsics": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/array.prototype.findlast": {
+			"version": "1.2.5",
+			"resolved": "https://registry.npmjs.org/array.prototype.findlast/-/array.prototype.findlast-1.2.5.tgz",
+			"integrity": "sha512-CVvd6FHg1Z3POpBLxO6E6zr+rSKEQ9L6rZHAaY7lLfhKsWYUBBOuMs0e9o24oopj6H+geRCX0YJ+TJLBK2eHyQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.7",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.2",
+				"es-errors": "^1.3.0",
+				"es-object-atoms": "^1.0.0",
+				"es-shim-unscopables": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/array.prototype.findlastindex": {
+			"version": "1.2.6",
+			"resolved": "https://registry.npmjs.org/array.prototype.findlastindex/-/array.prototype.findlastindex-1.2.6.tgz",
+			"integrity": "sha512-F/TKATkzseUExPlfvmwQKGITM3DGTK+vkAsCZoDc5daVygbJBnjEUCbgkAvVFsgfXfX4YIqZ/27G3k3tdXrTxQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.4",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.9",
+				"es-errors": "^1.3.0",
+				"es-object-atoms": "^1.1.1",
+				"es-shim-unscopables": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/array.prototype.flat": {
+			"version": "1.3.3",
+			"resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.3.3.tgz",
+			"integrity": "sha512-rwG/ja1neyLqCuGZ5YYrznA62D4mZXg0i1cIskIUKSiqF3Cje9/wXAls9B9s1Wa2fomMsIv8czB8jZcPmxCXFg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.5",
+				"es-shim-unscopables": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/array.prototype.flatmap": {
+			"version": "1.3.3",
+			"resolved": "https://registry.npmjs.org/array.prototype.flatmap/-/array.prototype.flatmap-1.3.3.tgz",
+			"integrity": "sha512-Y7Wt51eKJSyi80hFrJCePGGNo5ktJCslFuboqJsbf57CCPcm5zztluPlc4/aD8sWsKvlwatezpV4U1efk8kpjg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.5",
+				"es-shim-unscopables": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/array.prototype.tosorted": {
+			"version": "1.1.4",
+			"resolved": "https://registry.npmjs.org/array.prototype.tosorted/-/array.prototype.tosorted-1.1.4.tgz",
+			"integrity": "sha512-p6Fx8B7b7ZhL/gmUsAy0D15WhvDccw3mnGNbZpi3pmeJdxtWsj2jEaI4Y6oo3XiHfzuSgPwKc04MYt6KgvC/wA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.7",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.3",
+				"es-errors": "^1.3.0",
+				"es-shim-unscopables": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/arraybuffer.prototype.slice": {
+			"version": "1.0.4",
+			"resolved": "https://registry.npmjs.org/arraybuffer.prototype.slice/-/arraybuffer.prototype.slice-1.0.4.tgz",
+			"integrity": "sha512-BNoCY6SXXPQ7gF2opIP4GBE+Xw7U+pHMYKuzjgCN3GwiaIR09UUeKfheyIry77QtrCBlC0KK0q5/TER/tYh3PQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"array-buffer-byte-length": "^1.0.1",
+				"call-bind": "^1.0.8",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.5",
+				"es-errors": "^1.3.0",
+				"get-intrinsic": "^1.2.6",
+				"is-array-buffer": "^3.0.4"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/async-function": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/async-function/-/async-function-1.0.0.tgz",
+			"integrity": "sha512-hsU18Ae8CDTR6Kgu9DYf0EbCr/a5iGL0rytQDobUcdpYOKokk8LEjVphnXkDkgpi0wYVsqrXuP0bZxJaTqdgoA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/available-typed-arrays": {
+			"version": "1.0.7",
+			"resolved": "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz",
+			"integrity": "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"possible-typed-array-names": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/balanced-match": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
+			"integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/brace-expansion": {
+			"version": "1.1.12",
+			"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
+			"integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"balanced-match": "^1.0.0",
+				"concat-map": "0.0.1"
+			}
+		},
+		"node_modules/braces": {
+			"version": "3.0.3",
+			"resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+			"integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"fill-range": "^7.1.1"
+			},
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/call-bind": {
+			"version": "1.0.8",
+			"resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.8.tgz",
+			"integrity": "sha512-oKlSFMcMwpUg2ednkhQ454wfWiU/ul3CkJe/PEHcTKuiX6RpbehUiFMXu13HalGZxfUwCQzZG747YXBn1im9ww==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind-apply-helpers": "^1.0.0",
+				"es-define-property": "^1.0.0",
+				"get-intrinsic": "^1.2.4",
+				"set-function-length": "^1.2.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/call-bind-apply-helpers": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
+			"integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0",
+				"function-bind": "^1.1.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/call-bound": {
+			"version": "1.0.4",
+			"resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz",
+			"integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind-apply-helpers": "^1.0.2",
+				"get-intrinsic": "^1.3.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/callsites": {
+			"version": "3.1.0",
+			"resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz",
+			"integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=6"
+			}
+		},
+		"node_modules/chalk": {
+			"version": "4.1.2",
+			"resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+			"integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ansi-styles": "^4.1.0",
+				"supports-color": "^7.1.0"
+			},
+			"engines": {
+				"node": ">=10"
+			},
+			"funding": {
+				"url": "https://github.com/chalk/chalk?sponsor=1"
+			}
+		},
+		"node_modules/color-convert": {
+			"version": "2.0.1",
+			"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+			"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"color-name": "~1.1.4"
+			},
+			"engines": {
+				"node": ">=7.0.0"
+			}
+		},
+		"node_modules/color-name": {
+			"version": "1.1.4",
+			"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+			"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/concat-map": {
+			"version": "0.0.1",
+			"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
+			"integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/crelt": {
+			"version": "1.0.6",
+			"resolved": "https://registry.npmjs.org/crelt/-/crelt-1.0.6.tgz",
+			"integrity": "sha512-VQ2MBenTq1fWZUH9DJNGti7kKv6EeAuYr3cLwxUWhIu1baTaXh4Ib5W2CqHVqib4/MqbYGJqiL3Zb8GJZr3l4g==",
+			"license": "MIT",
+			"peer": true
+		},
+		"node_modules/cross-spawn": {
+			"version": "7.0.6",
+			"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+			"integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"path-key": "^3.1.0",
+				"shebang-command": "^2.0.0",
+				"which": "^2.0.1"
+			},
+			"engines": {
+				"node": ">= 8"
+			}
+		},
+		"node_modules/data-view-buffer": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.2.tgz",
+			"integrity": "sha512-EmKO5V3OLXh1rtK2wgXRansaK1/mtVdTUEiEI0W8RkvgT05kfxaH29PliLnpLP73yYO6142Q72QNa8Wx/A5CqQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"es-errors": "^1.3.0",
+				"is-data-view": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/data-view-byte-length": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/data-view-byte-length/-/data-view-byte-length-1.0.2.tgz",
+			"integrity": "sha512-tuhGbE6CfTM9+5ANGf+oQb72Ky/0+s3xKUpHvShfiz2RxMFgFPjsXuRLBVMtvMs15awe45SRb83D6wH4ew6wlQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"es-errors": "^1.3.0",
+				"is-data-view": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/inspect-js"
+			}
+		},
+		"node_modules/data-view-byte-offset": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/data-view-byte-offset/-/data-view-byte-offset-1.0.1.tgz",
+			"integrity": "sha512-BS8PfmtDGnrgYdOonGZQdLZslWIeCGFP9tpan0hi1Co2Zr2NKADsvGYA8XxuG/4UWgJ6Cjtv+YJnB6MM69QGlQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"es-errors": "^1.3.0",
+				"is-data-view": "^1.0.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/debug": {
+			"version": "4.4.3",
+			"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
+			"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ms": "^2.1.3"
+			},
+			"engines": {
+				"node": ">=6.0"
+			},
+			"peerDependenciesMeta": {
+				"supports-color": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/deep-is": {
+			"version": "0.1.4",
+			"resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz",
+			"integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/define-data-property": {
+			"version": "1.1.4",
+			"resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz",
+			"integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-define-property": "^1.0.0",
+				"es-errors": "^1.3.0",
+				"gopd": "^1.0.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/define-properties": {
+			"version": "1.2.1",
+			"resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz",
+			"integrity": "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"define-data-property": "^1.0.1",
+				"has-property-descriptors": "^1.0.0",
+				"object-keys": "^1.1.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/doctrine": {
+			"version": "2.1.0",
+			"resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz",
+			"integrity": "sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"esutils": "^2.0.2"
+			},
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/dunder-proto": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
+			"integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind-apply-helpers": "^1.0.1",
+				"es-errors": "^1.3.0",
+				"gopd": "^1.2.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/empathic": {
+			"version": "2.0.0",
+			"resolved": "https://registry.npmjs.org/empathic/-/empathic-2.0.0.tgz",
+			"integrity": "sha512-i6UzDscO/XfAcNYD75CfICkmfLedpyPDdozrLMmQc5ORaQcdMoc21OnlEylMIqI7U8eniKrPMxxtj8k0vhmJhA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=14"
+			}
+		},
+		"node_modules/enhanced-resolve": {
+			"version": "5.18.3",
+			"resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.18.3.tgz",
+			"integrity": "sha512-d4lC8xfavMeBjzGr2vECC3fsGXziXZQyJxD868h2M/mBI3PwAuODxAkLkq5HYuvrPYcUtiLzsTo8U3PgX3Ocww==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"graceful-fs": "^4.2.4",
+				"tapable": "^2.2.0"
+			},
+			"engines": {
+				"node": ">=10.13.0"
+			}
+		},
+		"node_modules/es-abstract": {
+			"version": "1.24.0",
+			"resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.24.0.tgz",
+			"integrity": "sha512-WSzPgsdLtTcQwm4CROfS5ju2Wa1QQcVeT37jFjYzdFz1r9ahadC8B8/a4qxJxM+09F18iumCdRmlr96ZYkQvEg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"array-buffer-byte-length": "^1.0.2",
+				"arraybuffer.prototype.slice": "^1.0.4",
+				"available-typed-arrays": "^1.0.7",
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.4",
+				"data-view-buffer": "^1.0.2",
+				"data-view-byte-length": "^1.0.2",
+				"data-view-byte-offset": "^1.0.1",
+				"es-define-property": "^1.0.1",
+				"es-errors": "^1.3.0",
+				"es-object-atoms": "^1.1.1",
+				"es-set-tostringtag": "^2.1.0",
+				"es-to-primitive": "^1.3.0",
+				"function.prototype.name": "^1.1.8",
+				"get-intrinsic": "^1.3.0",
+				"get-proto": "^1.0.1",
+				"get-symbol-description": "^1.1.0",
+				"globalthis": "^1.0.4",
+				"gopd": "^1.2.0",
+				"has-property-descriptors": "^1.0.2",
+				"has-proto": "^1.2.0",
+				"has-symbols": "^1.1.0",
+				"hasown": "^2.0.2",
+				"internal-slot": "^1.1.0",
+				"is-array-buffer": "^3.0.5",
+				"is-callable": "^1.2.7",
+				"is-data-view": "^1.0.2",
+				"is-negative-zero": "^2.0.3",
+				"is-regex": "^1.2.1",
+				"is-set": "^2.0.3",
+				"is-shared-array-buffer": "^1.0.4",
+				"is-string": "^1.1.1",
+				"is-typed-array": "^1.1.15",
+				"is-weakref": "^1.1.1",
+				"math-intrinsics": "^1.1.0",
+				"object-inspect": "^1.13.4",
+				"object-keys": "^1.1.1",
+				"object.assign": "^4.1.7",
+				"own-keys": "^1.0.1",
+				"regexp.prototype.flags": "^1.5.4",
+				"safe-array-concat": "^1.1.3",
+				"safe-push-apply": "^1.0.0",
+				"safe-regex-test": "^1.1.0",
+				"set-proto": "^1.0.0",
+				"stop-iteration-iterator": "^1.1.0",
+				"string.prototype.trim": "^1.2.10",
+				"string.prototype.trimend": "^1.0.9",
+				"string.prototype.trimstart": "^1.0.8",
+				"typed-array-buffer": "^1.0.3",
+				"typed-array-byte-length": "^1.0.3",
+				"typed-array-byte-offset": "^1.0.4",
+				"typed-array-length": "^1.0.7",
+				"unbox-primitive": "^1.1.0",
+				"which-typed-array": "^1.1.19"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/es-define-property": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
+			"integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/es-errors": {
+			"version": "1.3.0",
+			"resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
+			"integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/es-iterator-helpers": {
+			"version": "1.2.1",
+			"resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.2.1.tgz",
+			"integrity": "sha512-uDn+FE1yrDzyC0pCo961B2IHbdM8y/ACZsKD4dG6WqrjV53BADjwa7D+1aom2rsNVfLyDgU/eigvlJGJ08OQ4w==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.3",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.6",
+				"es-errors": "^1.3.0",
+				"es-set-tostringtag": "^2.0.3",
+				"function-bind": "^1.1.2",
+				"get-intrinsic": "^1.2.6",
+				"globalthis": "^1.0.4",
+				"gopd": "^1.2.0",
+				"has-property-descriptors": "^1.0.2",
+				"has-proto": "^1.2.0",
+				"has-symbols": "^1.1.0",
+				"internal-slot": "^1.1.0",
+				"iterator.prototype": "^1.1.4",
+				"safe-array-concat": "^1.1.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/es-object-atoms": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
+			"integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/es-set-tostringtag": {
+			"version": "2.1.0",
+			"resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
+			"integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0",
+				"get-intrinsic": "^1.2.6",
+				"has-tostringtag": "^1.0.2",
+				"hasown": "^2.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/es-shim-unscopables": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/es-shim-unscopables/-/es-shim-unscopables-1.1.0.tgz",
+			"integrity": "sha512-d9T8ucsEhh8Bi1woXCf+TIKDIROLG5WCkxg8geBCbvk22kzwC5G2OnXVMO6FUsvQlgUUXQ2itephWDLqDzbeCw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"hasown": "^2.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/es-to-primitive": {
+			"version": "1.3.0",
+			"resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.3.0.tgz",
+			"integrity": "sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-callable": "^1.2.7",
+				"is-date-object": "^1.0.5",
+				"is-symbol": "^1.0.4"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/esbuild": {
+			"version": "0.25.5",
+			"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.5.tgz",
+			"integrity": "sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==",
+			"dev": true,
+			"hasInstallScript": true,
+			"license": "MIT",
+			"bin": {
+				"esbuild": "bin/esbuild"
+			},
+			"engines": {
+				"node": ">=18"
+			},
+			"optionalDependencies": {
+				"@esbuild/aix-ppc64": "0.25.5",
+				"@esbuild/android-arm": "0.25.5",
+				"@esbuild/android-arm64": "0.25.5",
+				"@esbuild/android-x64": "0.25.5",
+				"@esbuild/darwin-arm64": "0.25.5",
+				"@esbuild/darwin-x64": "0.25.5",
+				"@esbuild/freebsd-arm64": "0.25.5",
+				"@esbuild/freebsd-x64": "0.25.5",
+				"@esbuild/linux-arm": "0.25.5",
+				"@esbuild/linux-arm64": "0.25.5",
+				"@esbuild/linux-ia32": "0.25.5",
+				"@esbuild/linux-loong64": "0.25.5",
+				"@esbuild/linux-mips64el": "0.25.5",
+				"@esbuild/linux-ppc64": "0.25.5",
+				"@esbuild/linux-riscv64": "0.25.5",
+				"@esbuild/linux-s390x": "0.25.5",
+				"@esbuild/linux-x64": "0.25.5",
+				"@esbuild/netbsd-arm64": "0.25.5",
+				"@esbuild/netbsd-x64": "0.25.5",
+				"@esbuild/openbsd-arm64": "0.25.5",
+				"@esbuild/openbsd-x64": "0.25.5",
+				"@esbuild/sunos-x64": "0.25.5",
+				"@esbuild/win32-arm64": "0.25.5",
+				"@esbuild/win32-ia32": "0.25.5",
+				"@esbuild/win32-x64": "0.25.5"
+			}
+		},
+		"node_modules/escape-string-regexp": {
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
+			"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=10"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/eslint": {
+			"version": "9.39.1",
+			"resolved": "https://registry.npmjs.org/eslint/-/eslint-9.39.1.tgz",
+			"integrity": "sha512-BhHmn2yNOFA9H9JmmIVKJmd288g9hrVRDkdoIgRCRuSySRUHH7r/DI6aAXW9T1WwUuY3DFgrcaqB+deURBLR5g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@eslint-community/eslint-utils": "^4.8.0",
+				"@eslint-community/regexpp": "^4.12.1",
+				"@eslint/config-array": "^0.21.1",
+				"@eslint/config-helpers": "^0.4.2",
+				"@eslint/core": "^0.17.0",
+				"@eslint/eslintrc": "^3.3.1",
+				"@eslint/js": "9.39.1",
+				"@eslint/plugin-kit": "^0.4.1",
+				"@humanfs/node": "^0.16.6",
+				"@humanwhocodes/module-importer": "^1.0.1",
+				"@humanwhocodes/retry": "^0.4.2",
+				"@types/estree": "^1.0.6",
+				"ajv": "^6.12.4",
+				"chalk": "^4.0.0",
+				"cross-spawn": "^7.0.6",
+				"debug": "^4.3.2",
+				"escape-string-regexp": "^4.0.0",
+				"eslint-scope": "^8.4.0",
+				"eslint-visitor-keys": "^4.2.1",
+				"espree": "^10.4.0",
+				"esquery": "^1.5.0",
+				"esutils": "^2.0.2",
+				"fast-deep-equal": "^3.1.3",
+				"file-entry-cache": "^8.0.0",
+				"find-up": "^5.0.0",
+				"glob-parent": "^6.0.2",
+				"ignore": "^5.2.0",
+				"imurmurhash": "^0.1.4",
+				"is-glob": "^4.0.0",
+				"json-stable-stringify-without-jsonify": "^1.0.1",
+				"lodash.merge": "^4.6.2",
+				"minimatch": "^3.1.2",
+				"natural-compare": "^1.4.0",
+				"optionator": "^0.9.3"
+			},
+			"bin": {
+				"eslint": "bin/eslint.js"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://eslint.org/donate"
+			},
+			"peerDependencies": {
+				"jiti": "*"
+			},
+			"peerDependenciesMeta": {
+				"jiti": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/eslint-compat-utils": {
+			"version": "0.5.1",
+			"resolved": "https://registry.npmjs.org/eslint-compat-utils/-/eslint-compat-utils-0.5.1.tgz",
+			"integrity": "sha512-3z3vFexKIEnjHE3zCMRo6fn/e44U7T1khUjg+Hp0ZQMCigh28rALD0nPFBcGZuiLC5rLZa2ubQHDRln09JfU2Q==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"semver": "^7.5.4"
+			},
+			"engines": {
+				"node": ">=12"
+			},
+			"peerDependencies": {
+				"eslint": ">=6.0.0"
+			}
+		},
+		"node_modules/eslint-compat-utils/node_modules/semver": {
+			"version": "7.7.3",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
+			"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
+			"dev": true,
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/eslint-import-resolver-node": {
+			"version": "0.3.9",
+			"resolved": "https://registry.npmjs.org/eslint-import-resolver-node/-/eslint-import-resolver-node-0.3.9.tgz",
+			"integrity": "sha512-WFj2isz22JahUv+B788TlO3N6zL3nNJGU8CcZbPZvVEkBPaJdCV4vy5wyghty5ROFbCRnm132v8BScu5/1BQ8g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"debug": "^3.2.7",
+				"is-core-module": "^2.13.0",
+				"resolve": "^1.22.4"
+			}
+		},
+		"node_modules/eslint-import-resolver-node/node_modules/debug": {
+			"version": "3.2.7",
+			"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz",
+			"integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ms": "^2.1.1"
+			}
+		},
+		"node_modules/eslint-module-utils": {
+			"version": "2.12.1",
+			"resolved": "https://registry.npmjs.org/eslint-module-utils/-/eslint-module-utils-2.12.1.tgz",
+			"integrity": "sha512-L8jSWTze7K2mTg0vos/RuLRS5soomksDPoJLXIslC7c8Wmut3bx7CPpJijDcBZtxQ5lrbUdM+s0OlNbz0DCDNw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"debug": "^3.2.7"
+			},
+			"engines": {
+				"node": ">=4"
+			},
+			"peerDependenciesMeta": {
+				"eslint": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/eslint-module-utils/node_modules/debug": {
+			"version": "3.2.7",
+			"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz",
+			"integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ms": "^2.1.1"
+			}
+		},
+		"node_modules/eslint-plugin-depend": {
+			"version": "1.3.1",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-depend/-/eslint-plugin-depend-1.3.1.tgz",
+			"integrity": "sha512-1uo2rFAr9vzNrCYdp7IBZRB54LiyVxfaIso0R6/QV3t6Dax6DTbW/EV2Hktf0f4UtmGHK8UyzJWI382pwW04jw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"empathic": "^2.0.0",
+				"module-replacements": "^2.8.0",
+				"semver": "^7.6.3"
+			}
+		},
+		"node_modules/eslint-plugin-depend/node_modules/semver": {
+			"version": "7.7.3",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
+			"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
+			"dev": true,
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/eslint-plugin-es-x": {
+			"version": "7.8.0",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-es-x/-/eslint-plugin-es-x-7.8.0.tgz",
+			"integrity": "sha512-7Ds8+wAAoV3T+LAKeu39Y5BzXCrGKrcISfgKEqTS4BDN8SFEDQd0S43jiQ8vIa3wUKD07qitZdfzlenSi8/0qQ==",
+			"dev": true,
+			"funding": [
+				"https://github.com/sponsors/ota-meshi",
+				"https://opencollective.com/eslint"
+			],
+			"license": "MIT",
+			"dependencies": {
+				"@eslint-community/eslint-utils": "^4.1.2",
+				"@eslint-community/regexpp": "^4.11.0",
+				"eslint-compat-utils": "^0.5.1"
+			},
+			"engines": {
+				"node": "^14.18.0 || >=16.0.0"
+			},
+			"peerDependencies": {
+				"eslint": ">=8"
+			}
+		},
+		"node_modules/eslint-plugin-import": {
+			"version": "2.32.0",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.32.0.tgz",
+			"integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@rtsao/scc": "^1.1.0",
+				"array-includes": "^3.1.9",
+				"array.prototype.findlastindex": "^1.2.6",
+				"array.prototype.flat": "^1.3.3",
+				"array.prototype.flatmap": "^1.3.3",
+				"debug": "^3.2.7",
+				"doctrine": "^2.1.0",
+				"eslint-import-resolver-node": "^0.3.9",
+				"eslint-module-utils": "^2.12.1",
+				"hasown": "^2.0.2",
+				"is-core-module": "^2.16.1",
+				"is-glob": "^4.0.3",
+				"minimatch": "^3.1.2",
+				"object.fromentries": "^2.0.8",
+				"object.groupby": "^1.0.3",
+				"object.values": "^1.2.1",
+				"semver": "^6.3.1",
+				"string.prototype.trimend": "^1.0.9",
+				"tsconfig-paths": "^3.15.0"
+			},
+			"engines": {
+				"node": ">=4"
+			},
+			"peerDependencies": {
+				"eslint": "^2 || ^3 || ^4 || ^5 || ^6 || ^7.2.0 || ^8 || ^9"
+			}
+		},
+		"node_modules/eslint-plugin-import/node_modules/debug": {
+			"version": "3.2.7",
+			"resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz",
+			"integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ms": "^2.1.1"
+			}
+		},
+		"node_modules/eslint-plugin-json-schema-validator": {
+			"version": "5.1.0",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-json-schema-validator/-/eslint-plugin-json-schema-validator-5.1.0.tgz",
+			"integrity": "sha512-ZmVyxRIjm58oqe2kTuy90PpmZPrrKvOjRPXKzq8WCgRgAkidCgm5X8domL2KSfadZ3QFAmifMgGTcVNhZ5ez2g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@eslint-community/eslint-utils": "^4.3.0",
+				"ajv": "^8.0.0",
+				"debug": "^4.3.1",
+				"eslint-compat-utils": "^0.5.0",
+				"json-schema-migrate": "^2.0.0",
+				"jsonc-eslint-parser": "^2.0.0",
+				"minimatch": "^8.0.0",
+				"synckit": "^0.9.0",
+				"toml-eslint-parser": "^0.9.0",
+				"tunnel-agent": "^0.6.0",
+				"yaml-eslint-parser": "^1.0.0"
+			},
+			"engines": {
+				"node": "^14.18.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ota-meshi"
+			},
+			"peerDependencies": {
+				"eslint": ">=6.0.0"
+			}
+		},
+		"node_modules/eslint-plugin-json-schema-validator/node_modules/ajv": {
+			"version": "8.17.1",
+			"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz",
+			"integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"fast-deep-equal": "^3.1.3",
+				"fast-uri": "^3.0.1",
+				"json-schema-traverse": "^1.0.0",
+				"require-from-string": "^2.0.2"
+			},
+			"funding": {
+				"type": "github",
+				"url": "https://github.com/sponsors/epoberezkin"
+			}
+		},
+		"node_modules/eslint-plugin-json-schema-validator/node_modules/brace-expansion": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
+			"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"balanced-match": "^1.0.0"
+			}
+		},
+		"node_modules/eslint-plugin-json-schema-validator/node_modules/json-schema-traverse": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
+			"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/eslint-plugin-json-schema-validator/node_modules/minimatch": {
+			"version": "8.0.4",
+			"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-8.0.4.tgz",
+			"integrity": "sha512-W0Wvr9HyFXZRGIDgCicunpQ299OKXs9RgZfaukz4qAW/pJhcpUfupc9c+OObPOFueNy8VSrZgEmDtk6Kh4WzDA==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"brace-expansion": "^2.0.1"
+			},
+			"engines": {
+				"node": ">=16 || 14 >=14.17"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/isaacs"
+			}
+		},
+		"node_modules/eslint-plugin-n": {
+			"version": "17.10.3",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-n/-/eslint-plugin-n-17.10.3.tgz",
+			"integrity": "sha512-ySZBfKe49nQZWR1yFaA0v/GsH6Fgp8ah6XV0WDz6CN8WO0ek4McMzb7A2xnf4DCYV43frjCygvb9f/wx7UUxRw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@eslint-community/eslint-utils": "^4.4.0",
+				"enhanced-resolve": "^5.17.0",
+				"eslint-plugin-es-x": "^7.5.0",
+				"get-tsconfig": "^4.7.0",
+				"globals": "^15.8.0",
+				"ignore": "^5.2.4",
+				"minimatch": "^9.0.5",
+				"semver": "^7.5.3"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			},
+			"peerDependencies": {
+				"eslint": ">=8.23.0"
+			}
+		},
+		"node_modules/eslint-plugin-n/node_modules/brace-expansion": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
+			"integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"balanced-match": "^1.0.0"
+			}
+		},
+		"node_modules/eslint-plugin-n/node_modules/globals": {
+			"version": "15.15.0",
+			"resolved": "https://registry.npmjs.org/globals/-/globals-15.15.0.tgz",
+			"integrity": "sha512-7ACyT3wmyp3I61S4fG682L0VA2RGD9otkqGJIwNUMF1SWUombIIk+af1unuDYgMm082aHYwD+mzJvv9Iu8dsgg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=18"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/eslint-plugin-n/node_modules/minimatch": {
+			"version": "9.0.5",
+			"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
+			"integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"brace-expansion": "^2.0.1"
+			},
+			"engines": {
+				"node": ">=16 || 14 >=14.17"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/isaacs"
+			}
+		},
+		"node_modules/eslint-plugin-n/node_modules/semver": {
+			"version": "7.7.3",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
+			"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
+			"dev": true,
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/eslint-plugin-obsidianmd": {
+			"version": "0.1.9",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-obsidianmd/-/eslint-plugin-obsidianmd-0.1.9.tgz",
+			"integrity": "sha512-/gyo5vky3Y7re4BtT/8MQbHU5Wes4o6VRqas3YmXE7aTCnMsdV0kfzV1GDXJN9Hrsc9UQPoeKUMiapKL0aGE4g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@microsoft/eslint-plugin-sdl": "^1.1.0",
+				"@types/eslint": "8.56.2",
+				"@types/node": "20.12.12",
+				"eslint": ">=9.0.0 <10.0.0",
+				"eslint-plugin-depend": "1.3.1",
+				"eslint-plugin-import": "^2.31.0",
+				"eslint-plugin-json-schema-validator": "5.1.0",
+				"eslint-plugin-security": "2.1.1",
+				"globals": "14.0.0",
+				"obsidian": "1.8.7",
+				"typescript": "5.4.5"
+			},
+			"bin": {
+				"eslint-plugin-obsidian": "dist/lib/index.js"
+			},
+			"engines": {
+				"node": ">= 18"
+			},
+			"peerDependencies": {
+				"@eslint/js": "^9.30.1",
+				"@eslint/json": "0.14.0",
+				"eslint": ">=9.0.0 <10.0.0",
+				"obsidian": "1.8.7",
+				"typescript-eslint": "^8.35.1"
+			}
+		},
+		"node_modules/eslint-plugin-obsidianmd/node_modules/@types/node": {
+			"version": "20.12.12",
+			"resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.12.tgz",
+			"integrity": "sha512-eWLDGF/FOSPtAvEqeRAQ4C8LSA7M1I7i0ky1I8U7kD1J5ITyW3AsRhQrKVoWf5pFKZ2kILsEGJhsI9r93PYnOw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"undici-types": "~5.26.4"
+			}
+		},
+		"node_modules/eslint-plugin-obsidianmd/node_modules/obsidian": {
+			"version": "1.8.7",
+			"resolved": "https://registry.npmjs.org/obsidian/-/obsidian-1.8.7.tgz",
+			"integrity": "sha512-h4bWwNFAGRXlMlMAzdEiIM2ppTGlrh7uGOJS6w4gClrsjc+ei/3YAtU2VdFUlCiPuTHpY4aBpFJJW75S1Tl/JA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@types/codemirror": "5.60.8",
+				"moment": "2.29.4"
+			},
+			"peerDependencies": {
+				"@codemirror/state": "^6.0.0",
+				"@codemirror/view": "^6.0.0"
+			}
+		},
+		"node_modules/eslint-plugin-obsidianmd/node_modules/typescript": {
+			"version": "5.4.5",
+			"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.5.tgz",
+			"integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"bin": {
+				"tsc": "bin/tsc",
+				"tsserver": "bin/tsserver"
+			},
+			"engines": {
+				"node": ">=14.17"
+			}
+		},
+		"node_modules/eslint-plugin-react": {
+			"version": "7.37.3",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-react/-/eslint-plugin-react-7.37.3.tgz",
+			"integrity": "sha512-DomWuTQPFYZwF/7c9W2fkKkStqZmBd3uugfqBYLdkZ3Hii23WzZuOLUskGxB8qkSKqftxEeGL1TB2kMhrce0jA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"array-includes": "^3.1.8",
+				"array.prototype.findlast": "^1.2.5",
+				"array.prototype.flatmap": "^1.3.3",
+				"array.prototype.tosorted": "^1.1.4",
+				"doctrine": "^2.1.0",
+				"es-iterator-helpers": "^1.2.1",
+				"estraverse": "^5.3.0",
+				"hasown": "^2.0.2",
+				"jsx-ast-utils": "^2.4.1 || ^3.0.0",
+				"minimatch": "^3.1.2",
+				"object.entries": "^1.1.8",
+				"object.fromentries": "^2.0.8",
+				"object.values": "^1.2.1",
+				"prop-types": "^15.8.1",
+				"resolve": "^2.0.0-next.5",
+				"semver": "^6.3.1",
+				"string.prototype.matchall": "^4.0.12",
+				"string.prototype.repeat": "^1.0.0"
+			},
+			"engines": {
+				"node": ">=4"
+			},
+			"peerDependencies": {
+				"eslint": "^3 || ^4 || ^5 || ^6 || ^7 || ^8 || ^9.7"
+			}
+		},
+		"node_modules/eslint-plugin-react/node_modules/resolve": {
+			"version": "2.0.0-next.5",
+			"resolved": "https://registry.npmjs.org/resolve/-/resolve-2.0.0-next.5.tgz",
+			"integrity": "sha512-U7WjGVG9sH8tvjW5SmGbQuui75FiyjAX72HX15DwBBwF9dNiQZRQAg9nnPhYy+TUnE0+VcrttuvNI8oSxZcocA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-core-module": "^2.13.0",
+				"path-parse": "^1.0.7",
+				"supports-preserve-symlinks-flag": "^1.0.0"
+			},
+			"bin": {
+				"resolve": "bin/resolve"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/eslint-plugin-security": {
+			"version": "2.1.1",
+			"resolved": "https://registry.npmjs.org/eslint-plugin-security/-/eslint-plugin-security-2.1.1.tgz",
+			"integrity": "sha512-7cspIGj7WTfR3EhaILzAPcfCo5R9FbeWvbgsPYWivSurTBKW88VQxtP3c4aWMG9Hz/GfJlJVdXEJ3c8LqS+u2w==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"safe-regex": "^2.1.1"
+			}
+		},
+		"node_modules/eslint-scope": {
+			"version": "8.4.0",
+			"resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.4.0.tgz",
+			"integrity": "sha512-sNXOfKCn74rt8RICKMvJS7XKV/Xk9kA7DyJr8mJik3S7Cwgy3qlkkmyS2uQB3jiJg6VNdZd/pDBJu0nvG2NlTg==",
+			"dev": true,
+			"license": "BSD-2-Clause",
+			"dependencies": {
+				"esrecurse": "^4.3.0",
+				"estraverse": "^5.2.0"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/eslint-visitor-keys": {
+			"version": "4.2.1",
+			"resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz",
+			"integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/eslint/node_modules/@eslint/js": {
+			"version": "9.39.1",
+			"resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.39.1.tgz",
+			"integrity": "sha512-S26Stp4zCy88tH94QbBv3XCuzRQiZ9yXofEILmglYTh/Ug/a9/umqvgFtYBAo3Lp0nsI/5/qH1CCrbdK3AP1Tw==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://eslint.org/donate"
+			}
+		},
+		"node_modules/espree": {
+			"version": "10.4.0",
+			"resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz",
+			"integrity": "sha512-j6PAQ2uUr79PZhBjP5C5fhl8e39FmRnOjsD5lGnWrFU8i2G776tBK7+nP8KuQUTTyAZUwfQqXAgrVH5MbH9CYQ==",
+			"dev": true,
+			"license": "BSD-2-Clause",
+			"dependencies": {
+				"acorn": "^8.15.0",
+				"acorn-jsx": "^5.3.2",
+				"eslint-visitor-keys": "^4.2.1"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/esquery": {
+			"version": "1.6.0",
+			"resolved": "https://registry.npmjs.org/esquery/-/esquery-1.6.0.tgz",
+			"integrity": "sha512-ca9pw9fomFcKPvFLXhBKUK90ZvGibiGOvRJNbjljY7s7uq/5YO4BOzcYtJqExdx99rF6aAcnRxHmcUHcz6sQsg==",
+			"dev": true,
+			"license": "BSD-3-Clause",
+			"dependencies": {
+				"estraverse": "^5.1.0"
+			},
+			"engines": {
+				"node": ">=0.10"
+			}
+		},
+		"node_modules/esrecurse": {
+			"version": "4.3.0",
+			"resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz",
+			"integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==",
+			"dev": true,
+			"license": "BSD-2-Clause",
+			"dependencies": {
+				"estraverse": "^5.2.0"
+			},
+			"engines": {
+				"node": ">=4.0"
+			}
+		},
+		"node_modules/estraverse": {
+			"version": "5.3.0",
+			"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
+			"integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
+			"dev": true,
+			"license": "BSD-2-Clause",
+			"engines": {
+				"node": ">=4.0"
+			}
+		},
+		"node_modules/esutils": {
+			"version": "2.0.3",
+			"resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
+			"integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
+			"dev": true,
+			"license": "BSD-2-Clause",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/fast-deep-equal": {
+			"version": "3.1.3",
+			"resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
+			"integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/fast-glob": {
+			"version": "3.3.3",
+			"resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz",
+			"integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@nodelib/fs.stat": "^2.0.2",
+				"@nodelib/fs.walk": "^1.2.3",
+				"glob-parent": "^5.1.2",
+				"merge2": "^1.3.0",
+				"micromatch": "^4.0.8"
+			},
+			"engines": {
+				"node": ">=8.6.0"
+			}
+		},
+		"node_modules/fast-glob/node_modules/glob-parent": {
+			"version": "5.1.2",
+			"resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz",
+			"integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"is-glob": "^4.0.1"
+			},
+			"engines": {
+				"node": ">= 6"
+			}
+		},
+		"node_modules/fast-json-stable-stringify": {
+			"version": "2.1.0",
+			"resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
+			"integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/fast-levenshtein": {
+			"version": "2.0.6",
+			"resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz",
+			"integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/fast-uri": {
+			"version": "3.1.0",
+			"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
+			"integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
+			"dev": true,
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/fastify"
+				},
+				{
+					"type": "opencollective",
+					"url": "https://opencollective.com/fastify"
+				}
+			],
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/fastq": {
+			"version": "1.19.1",
+			"resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.1.tgz",
+			"integrity": "sha512-GwLTyxkCXjXbxqIhTsMI2Nui8huMPtnxg7krajPJAjnEG/iiOS7i+zCtWGZR9G0NBKbXKh6X9m9UIsYX/N6vvQ==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"reusify": "^1.0.4"
+			}
+		},
+		"node_modules/file-entry-cache": {
+			"version": "8.0.0",
+			"resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz",
+			"integrity": "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"flat-cache": "^4.0.0"
+			},
+			"engines": {
+				"node": ">=16.0.0"
+			}
+		},
+		"node_modules/fill-range": {
+			"version": "7.1.1",
+			"resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+			"integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"to-regex-range": "^5.0.1"
+			},
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/find-up": {
+			"version": "5.0.0",
+			"resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz",
+			"integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"locate-path": "^6.0.0",
+				"path-exists": "^4.0.0"
+			},
+			"engines": {
+				"node": ">=10"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/flat-cache": {
+			"version": "4.0.1",
+			"resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-4.0.1.tgz",
+			"integrity": "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"flatted": "^3.2.9",
+				"keyv": "^4.5.4"
+			},
+			"engines": {
+				"node": ">=16"
+			}
+		},
+		"node_modules/flatted": {
+			"version": "3.3.3",
+			"resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.3.tgz",
+			"integrity": "sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==",
+			"dev": true,
+			"license": "ISC"
+		},
+		"node_modules/for-each": {
+			"version": "0.3.5",
+			"resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.5.tgz",
+			"integrity": "sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-callable": "^1.2.7"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/function-bind": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
+			"integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
+			"dev": true,
+			"license": "MIT",
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/function.prototype.name": {
+			"version": "1.1.8",
+			"resolved": "https://registry.npmjs.org/function.prototype.name/-/function.prototype.name-1.1.8.tgz",
+			"integrity": "sha512-e5iwyodOHhbMr/yNrc7fDYG4qlbIvI5gajyzPnb5TCwyhjApznQh1BMFou9b30SevY43gCJKXycoCBjMbsuW0Q==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.3",
+				"define-properties": "^1.2.1",
+				"functions-have-names": "^1.2.3",
+				"hasown": "^2.0.2",
+				"is-callable": "^1.2.7"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/functions-have-names": {
+			"version": "1.2.3",
+			"resolved": "https://registry.npmjs.org/functions-have-names/-/functions-have-names-1.2.3.tgz",
+			"integrity": "sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ==",
+			"dev": true,
+			"license": "MIT",
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/generator-function": {
+			"version": "2.0.1",
+			"resolved": "https://registry.npmjs.org/generator-function/-/generator-function-2.0.1.tgz",
+			"integrity": "sha512-SFdFmIJi+ybC0vjlHN0ZGVGHc3lgE0DxPAT0djjVg+kjOnSqclqmj0KQ7ykTOLP6YxoqOvuAODGdcHJn+43q3g==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/get-intrinsic": {
+			"version": "1.3.0",
+			"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
+			"integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind-apply-helpers": "^1.0.2",
+				"es-define-property": "^1.0.1",
+				"es-errors": "^1.3.0",
+				"es-object-atoms": "^1.1.1",
+				"function-bind": "^1.1.2",
+				"get-proto": "^1.0.1",
+				"gopd": "^1.2.0",
+				"has-symbols": "^1.1.0",
+				"hasown": "^2.0.2",
+				"math-intrinsics": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/get-proto": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
+			"integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"dunder-proto": "^1.0.1",
+				"es-object-atoms": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/get-symbol-description": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/get-symbol-description/-/get-symbol-description-1.1.0.tgz",
+			"integrity": "sha512-w9UMqWwJxHNOvoNzSJ2oPF5wvYcvP7jUvYzhp67yEhTi17ZDBBC1z9pTdGuzjD+EFIqLSYRweZjqfiPzQ06Ebg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"es-errors": "^1.3.0",
+				"get-intrinsic": "^1.2.6"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/get-tsconfig": {
+			"version": "4.13.0",
+			"resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.0.tgz",
+			"integrity": "sha512-1VKTZJCwBrvbd+Wn3AOgQP/2Av+TfTCOlE4AcRJE72W1ksZXbAx8PPBR9RzgTeSPzlPMHrbANMH3LbltH73wxQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"resolve-pkg-maps": "^1.0.0"
+			},
+			"funding": {
+				"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
+			}
+		},
+		"node_modules/glob-parent": {
+			"version": "6.0.2",
+			"resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz",
+			"integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"is-glob": "^4.0.3"
+			},
+			"engines": {
+				"node": ">=10.13.0"
+			}
+		},
+		"node_modules/globals": {
+			"version": "14.0.0",
+			"resolved": "https://registry.npmjs.org/globals/-/globals-14.0.0.tgz",
+			"integrity": "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=18"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/globalthis": {
+			"version": "1.0.4",
+			"resolved": "https://registry.npmjs.org/globalthis/-/globalthis-1.0.4.tgz",
+			"integrity": "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"define-properties": "^1.2.1",
+				"gopd": "^1.0.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/gopd": {
+			"version": "1.2.0",
+			"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
+			"integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/graceful-fs": {
+			"version": "4.2.11",
+			"resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
+			"integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==",
+			"dev": true,
+			"license": "ISC"
+		},
+		"node_modules/graphemer": {
+			"version": "1.4.0",
+			"resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz",
+			"integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/has-bigints": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.1.0.tgz",
+			"integrity": "sha512-R3pbpkcIqv2Pm3dUwgjclDRVmWpTJW2DcMzcIhEXEx1oh/CEMObMm3KLmRJOdvhM7o4uQBnwr8pzRK2sJWIqfg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/has-flag": {
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+			"integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/has-property-descriptors": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz",
+			"integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-define-property": "^1.0.0"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/has-proto": {
+			"version": "1.2.0",
+			"resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.2.0.tgz",
+			"integrity": "sha512-KIL7eQPfHQRC8+XluaIw7BHUwwqL19bQn4hzNgdr+1wXoU0KKj6rufu47lhY7KbJR2C6T6+PfyN0Ea7wkSS+qQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"dunder-proto": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/has-symbols": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
+			"integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/has-tostringtag": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
+			"integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"has-symbols": "^1.0.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/hasown": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
+			"integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"function-bind": "^1.1.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/ignore": {
+			"version": "5.3.2",
+			"resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz",
+			"integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 4"
+			}
+		},
+		"node_modules/import-fresh": {
+			"version": "3.3.1",
+			"resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz",
+			"integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"parent-module": "^1.0.0",
+				"resolve-from": "^4.0.0"
+			},
+			"engines": {
+				"node": ">=6"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/imurmurhash": {
+			"version": "0.1.4",
+			"resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz",
+			"integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.8.19"
+			}
+		},
+		"node_modules/internal-slot": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/internal-slot/-/internal-slot-1.1.0.tgz",
+			"integrity": "sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0",
+				"hasown": "^2.0.2",
+				"side-channel": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/is-array-buffer": {
+			"version": "3.0.5",
+			"resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.5.tgz",
+			"integrity": "sha512-DDfANUiiG2wC1qawP66qlTugJeL5HyzMpfr8lLK+jMQirGzNod0B12cFB/9q838Ru27sBwfw78/rdoU7RERz6A==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.3",
+				"get-intrinsic": "^1.2.6"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-async-function": {
+			"version": "2.1.1",
+			"resolved": "https://registry.npmjs.org/is-async-function/-/is-async-function-2.1.1.tgz",
+			"integrity": "sha512-9dgM/cZBnNvjzaMYHVoxxfPj2QXt22Ev7SuuPrs+xav0ukGB0S6d4ydZdEiM48kLx5kDV+QBPrpVnFyefL8kkQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"async-function": "^1.0.0",
+				"call-bound": "^1.0.3",
+				"get-proto": "^1.0.1",
+				"has-tostringtag": "^1.0.2",
+				"safe-regex-test": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-bigint": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/is-bigint/-/is-bigint-1.1.0.tgz",
+			"integrity": "sha512-n4ZT37wG78iz03xPRKJrHTdZbe3IicyucEtdRsV5yglwc3GyUfbAfpSeD0FJ41NbUNSt5wbhqfp1fS+BgnvDFQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"has-bigints": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-boolean-object": {
+			"version": "1.2.2",
+			"resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.2.2.tgz",
+			"integrity": "sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"has-tostringtag": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-callable": {
+			"version": "1.2.7",
+			"resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz",
+			"integrity": "sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-core-module": {
+			"version": "2.16.1",
+			"resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz",
+			"integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"hasown": "^2.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-data-view": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/is-data-view/-/is-data-view-1.0.2.tgz",
+			"integrity": "sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"get-intrinsic": "^1.2.6",
+				"is-typed-array": "^1.1.13"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-date-object": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.1.0.tgz",
+			"integrity": "sha512-PwwhEakHVKTdRNVOw+/Gyh0+MzlCl4R6qKvkhuvLtPMggI1WAHt9sOwZxQLSGpUaDnrdyDsomoRgNnCfKNSXXg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"has-tostringtag": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-extglob": {
+			"version": "2.1.1",
+			"resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz",
+			"integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/is-finalizationregistry": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.1.1.tgz",
+			"integrity": "sha512-1pC6N8qWJbWoPtEjgcL2xyhQOP491EQjeUo3qTKcmV8YSDDJrOepfG8pcC7h/QgnQHYSv0mJ3Z/ZWxmatVrysg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-generator-function": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/is-generator-function/-/is-generator-function-1.1.2.tgz",
+			"integrity": "sha512-upqt1SkGkODW9tsGNG5mtXTXtECizwtS2kA161M+gJPc1xdb/Ax629af6YrTwcOeQHbewrPNlE5Dx7kzvXTizA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.4",
+				"generator-function": "^2.0.0",
+				"get-proto": "^1.0.1",
+				"has-tostringtag": "^1.0.2",
+				"safe-regex-test": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-glob": {
+			"version": "4.0.3",
+			"resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
+			"integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-extglob": "^2.1.1"
+			},
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/is-map": {
+			"version": "2.0.3",
+			"resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz",
+			"integrity": "sha512-1Qed0/Hr2m+YqxnM09CjA2d/i6YZNfF6R2oRAOj36eUdS6qIV/huPJNSEpKbupewFs+ZsJlxsjjPbc0/afW6Lw==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-negative-zero": {
+			"version": "2.0.3",
+			"resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.3.tgz",
+			"integrity": "sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-number": {
+			"version": "7.0.0",
+			"resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
+			"integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.12.0"
+			}
+		},
+		"node_modules/is-number-object": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/is-number-object/-/is-number-object-1.1.1.tgz",
+			"integrity": "sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"has-tostringtag": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-regex": {
+			"version": "1.2.1",
+			"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.2.1.tgz",
+			"integrity": "sha512-MjYsKHO5O7mCsmRGxWcLWheFqN9DJ/2TmngvjKXihe6efViPqc274+Fx/4fYj/r03+ESvBdTXK0V6tA3rgez1g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"gopd": "^1.2.0",
+				"has-tostringtag": "^1.0.2",
+				"hasown": "^2.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-set": {
+			"version": "2.0.3",
+			"resolved": "https://registry.npmjs.org/is-set/-/is-set-2.0.3.tgz",
+			"integrity": "sha512-iPAjerrse27/ygGLxw+EBR9agv9Y6uLeYVJMu+QNCoouJ1/1ri0mGrcWpfCqFZuzzx3WjtwxG098X+n4OuRkPg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-shared-array-buffer": {
+			"version": "1.0.4",
+			"resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.4.tgz",
+			"integrity": "sha512-ISWac8drv4ZGfwKl5slpHG9OwPNty4jOWPRIhBpxOoD+hqITiwuipOQ2bNthAzwA3B4fIjO4Nln74N0S9byq8A==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-string": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/is-string/-/is-string-1.1.1.tgz",
+			"integrity": "sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"has-tostringtag": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-symbol": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.1.1.tgz",
+			"integrity": "sha512-9gGx6GTtCQM73BgmHQXfDmLtfjjTUDSyoxTCbp5WtoixAhfgsDirWIcVQ/IHpvI5Vgd5i/J5F7B9cN/WlVbC/w==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"has-symbols": "^1.1.0",
+				"safe-regex-test": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-typed-array": {
+			"version": "1.1.15",
+			"resolved": "https://registry.npmjs.org/is-typed-array/-/is-typed-array-1.1.15.tgz",
+			"integrity": "sha512-p3EcsicXjit7SaskXHs1hA91QxgTw46Fv6EFKKGS5DRFLD8yKnohjF3hxoju94b/OcMZoQukzpPpBE9uLVKzgQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"which-typed-array": "^1.1.16"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-weakmap": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz",
+			"integrity": "sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-weakref": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/is-weakref/-/is-weakref-1.1.1.tgz",
+			"integrity": "sha512-6i9mGWSlqzNMEqpCp93KwRS1uUOodk2OJ6b+sq7ZPDSy2WuI5NFIxp/254TytR8ftefexkWn5xNiHUNpPOfSew==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/is-weakset": {
+			"version": "2.0.4",
+			"resolved": "https://registry.npmjs.org/is-weakset/-/is-weakset-2.0.4.tgz",
+			"integrity": "sha512-mfcwb6IzQyOKTs84CQMrOwW4gQcaTOAWJ0zzJCl2WSPDrWk/OzDaImWFH3djXhb24g4eudZfLRozAvPGw4d9hQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"get-intrinsic": "^1.2.6"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/isarray": {
+			"version": "2.0.5",
+			"resolved": "https://registry.npmjs.org/isarray/-/isarray-2.0.5.tgz",
+			"integrity": "sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/isexe": {
+			"version": "2.0.0",
+			"resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
+			"integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
+			"dev": true,
+			"license": "ISC"
+		},
+		"node_modules/iterator.prototype": {
+			"version": "1.1.5",
+			"resolved": "https://registry.npmjs.org/iterator.prototype/-/iterator.prototype-1.1.5.tgz",
+			"integrity": "sha512-H0dkQoCa3b2VEeKQBOxFph+JAbcrQdE7KC0UkqwpLmv2EC4P41QXP+rqo9wYodACiG5/WM5s9oDApTU8utwj9g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"define-data-property": "^1.1.4",
+				"es-object-atoms": "^1.0.0",
+				"get-intrinsic": "^1.2.6",
+				"get-proto": "^1.0.0",
+				"has-symbols": "^1.1.0",
+				"set-function-name": "^2.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/jiti": {
+			"version": "2.6.1",
+			"resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz",
+			"integrity": "sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ==",
+			"dev": true,
+			"license": "MIT",
+			"bin": {
+				"jiti": "lib/jiti-cli.mjs"
+			}
+		},
+		"node_modules/js-tokens": {
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
+			"integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/js-yaml": {
+			"version": "4.1.1",
+			"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
+			"integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"argparse": "^2.0.1"
+			},
+			"bin": {
+				"js-yaml": "bin/js-yaml.js"
+			}
+		},
+		"node_modules/json-buffer": {
+			"version": "3.0.1",
+			"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz",
+			"integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/json-schema-migrate": {
+			"version": "2.0.0",
+			"resolved": "https://registry.npmjs.org/json-schema-migrate/-/json-schema-migrate-2.0.0.tgz",
+			"integrity": "sha512-r38SVTtojDRp4eD6WsCqiE0eNDt4v1WalBXb9cyZYw9ai5cGtBwzRNWjHzJl38w6TxFkXAIA7h+fyX3tnrAFhQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"ajv": "^8.0.0"
+			}
+		},
+		"node_modules/json-schema-migrate/node_modules/ajv": {
+			"version": "8.17.1",
+			"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz",
+			"integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"fast-deep-equal": "^3.1.3",
+				"fast-uri": "^3.0.1",
+				"json-schema-traverse": "^1.0.0",
+				"require-from-string": "^2.0.2"
+			},
+			"funding": {
+				"type": "github",
+				"url": "https://github.com/sponsors/epoberezkin"
+			}
+		},
+		"node_modules/json-schema-migrate/node_modules/json-schema-traverse": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
+			"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/json-schema-traverse": {
+			"version": "0.4.1",
+			"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
+			"integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/json-stable-stringify-without-jsonify": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz",
+			"integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/json5": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz",
+			"integrity": "sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"minimist": "^1.2.0"
+			},
+			"bin": {
+				"json5": "lib/cli.js"
+			}
+		},
+		"node_modules/jsonc-eslint-parser": {
+			"version": "2.4.1",
+			"resolved": "https://registry.npmjs.org/jsonc-eslint-parser/-/jsonc-eslint-parser-2.4.1.tgz",
+			"integrity": "sha512-uuPNLJkKN8NXAlZlQ6kmUF9qO+T6Kyd7oV4+/7yy8Jz6+MZNyhPq8EdLpdfnPVzUC8qSf1b4j1azKaGnFsjmsw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"acorn": "^8.5.0",
+				"eslint-visitor-keys": "^3.0.0",
+				"espree": "^9.0.0",
+				"semver": "^7.3.5"
+			},
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ota-meshi"
+			}
+		},
+		"node_modules/jsonc-eslint-parser/node_modules/eslint-visitor-keys": {
+			"version": "3.4.3",
+			"resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz",
+			"integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/jsonc-eslint-parser/node_modules/espree": {
+			"version": "9.6.1",
+			"resolved": "https://registry.npmjs.org/espree/-/espree-9.6.1.tgz",
+			"integrity": "sha512-oruZaFkjorTpF32kDSI5/75ViwGeZginGGy2NoOSg3Q9bnwlnmDm4HLnkl0RE3n+njDXR037aY1+x58Z/zFdwQ==",
+			"dev": true,
+			"license": "BSD-2-Clause",
+			"dependencies": {
+				"acorn": "^8.9.0",
+				"acorn-jsx": "^5.3.2",
+				"eslint-visitor-keys": "^3.4.1"
+			},
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/jsonc-eslint-parser/node_modules/semver": {
+			"version": "7.7.3",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
+			"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
+			"dev": true,
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/jsx-ast-utils": {
+			"version": "3.3.5",
+			"resolved": "https://registry.npmjs.org/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz",
+			"integrity": "sha512-ZZow9HBI5O6EPgSJLUb8n2NKgmVWTwCvHGwFuJlMjvLFqlGG6pjirPhtdsseaLZjSibD8eegzmYpUZwoIlj2cQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"array-includes": "^3.1.6",
+				"array.prototype.flat": "^1.3.1",
+				"object.assign": "^4.1.4",
+				"object.values": "^1.1.6"
+			},
+			"engines": {
+				"node": ">=4.0"
+			}
+		},
+		"node_modules/keyv": {
+			"version": "4.5.4",
+			"resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
+			"integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"json-buffer": "3.0.1"
+			}
+		},
+		"node_modules/levn": {
+			"version": "0.4.1",
+			"resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz",
+			"integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"prelude-ls": "^1.2.1",
+				"type-check": "~0.4.0"
+			},
+			"engines": {
+				"node": ">= 0.8.0"
+			}
+		},
+		"node_modules/locate-path": {
+			"version": "6.0.0",
+			"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
+			"integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"p-locate": "^5.0.0"
+			},
+			"engines": {
+				"node": ">=10"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/lodash.merge": {
+			"version": "4.6.2",
+			"resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
+			"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/loose-envify": {
+			"version": "1.4.0",
+			"resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
+			"integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"js-tokens": "^3.0.0 || ^4.0.0"
+			},
+			"bin": {
+				"loose-envify": "cli.js"
+			}
+		},
+		"node_modules/math-intrinsics": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
+			"integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/merge2": {
+			"version": "1.4.1",
+			"resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz",
+			"integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 8"
+			}
+		},
+		"node_modules/micromatch": {
+			"version": "4.0.8",
+			"resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz",
+			"integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"braces": "^3.0.3",
+				"picomatch": "^2.3.1"
+			},
+			"engines": {
+				"node": ">=8.6"
+			}
+		},
+		"node_modules/minimatch": {
+			"version": "3.1.2",
+			"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
+			"integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"brace-expansion": "^1.1.7"
+			},
+			"engines": {
+				"node": "*"
+			}
+		},
+		"node_modules/minimist": {
+			"version": "1.2.8",
+			"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
+			"integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
+			"dev": true,
+			"license": "MIT",
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/module-replacements": {
+			"version": "2.10.1",
+			"resolved": "https://registry.npmjs.org/module-replacements/-/module-replacements-2.10.1.tgz",
+			"integrity": "sha512-qkKuLpMHDqRSM676OPL7HUpCiiP3NSxgf8NNR1ga2h/iJLNKTsOSjMEwrcT85DMSti2vmOqxknOVBGWj6H6etQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/moment": {
+			"version": "2.29.4",
+			"resolved": "https://registry.npmjs.org/moment/-/moment-2.29.4.tgz",
+			"integrity": "sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w==",
+			"license": "MIT",
+			"engines": {
+				"node": "*"
+			}
+		},
+		"node_modules/ms": {
+			"version": "2.1.3",
+			"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
+			"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/natural-compare": {
+			"version": "1.4.0",
+			"resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz",
+			"integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/object-assign": {
+			"version": "4.1.1",
+			"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
+			"integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/object-inspect": {
+			"version": "1.13.4",
+			"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz",
+			"integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/object-keys": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz",
+			"integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/object.assign": {
+			"version": "4.1.7",
+			"resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.7.tgz",
+			"integrity": "sha512-nK28WOo+QIjBkDduTINE4JkF/UJJKyf2EJxvJKfblDpyg0Q+pkOHNTL0Qwy6NP6FhE/EnzV73BxxqcJaXY9anw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.3",
+				"define-properties": "^1.2.1",
+				"es-object-atoms": "^1.0.0",
+				"has-symbols": "^1.1.0",
+				"object-keys": "^1.1.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/object.entries": {
+			"version": "1.1.9",
+			"resolved": "https://registry.npmjs.org/object.entries/-/object.entries-1.1.9.tgz",
+			"integrity": "sha512-8u/hfXFRBD1O0hPUjioLhoWFHRmt6tKA4/vZPyckBr18l1KE9uHrFaFaUi8MDRTpi4uak2goyPTSNJLXX2k2Hw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.4",
+				"define-properties": "^1.2.1",
+				"es-object-atoms": "^1.1.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/object.fromentries": {
+			"version": "2.0.8",
+			"resolved": "https://registry.npmjs.org/object.fromentries/-/object.fromentries-2.0.8.tgz",
+			"integrity": "sha512-k6E21FzySsSK5a21KRADBd/NGneRegFO5pLHfdQLpRDETUNJueLXs3WCzyQ3tFRDYgbq3KHGXfTbi2bs8WQ6rQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.7",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.2",
+				"es-object-atoms": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/object.groupby": {
+			"version": "1.0.3",
+			"resolved": "https://registry.npmjs.org/object.groupby/-/object.groupby-1.0.3.tgz",
+			"integrity": "sha512-+Lhy3TQTuzXI5hevh8sBGqbmurHbbIjAi0Z4S63nthVLmLxfbj4T54a4CfZrXIrt9iP4mVAPYMo/v99taj3wjQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.7",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/object.values": {
+			"version": "1.2.1",
+			"resolved": "https://registry.npmjs.org/object.values/-/object.values-1.2.1.tgz",
+			"integrity": "sha512-gXah6aZrcUxjWg2zR2MwouP2eHlCBzdV4pygudehaKXSGW4v2AsRQUK+lwwXhii6KFZcunEnmSUoYp5CXibxtA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.3",
+				"define-properties": "^1.2.1",
+				"es-object-atoms": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/obsidian": {
+			"version": "1.10.3",
+			"resolved": "https://registry.npmjs.org/obsidian/-/obsidian-1.10.3.tgz",
+			"integrity": "sha512-VP+ZSxNMG7y6Z+sU9WqLvJAskCfkFrTz2kFHWmmzis+C+4+ELjk/sazwcTHrHXNZlgCeo8YOlM6SOrAFCynNew==",
+			"license": "MIT",
+			"dependencies": {
+				"@types/codemirror": "5.60.8",
+				"moment": "2.29.4"
+			},
+			"peerDependencies": {
+				"@codemirror/state": "6.5.0",
+				"@codemirror/view": "6.38.6"
+			}
+		},
+		"node_modules/optionator": {
+			"version": "0.9.4",
+			"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
+			"integrity": "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"deep-is": "^0.1.3",
+				"fast-levenshtein": "^2.0.6",
+				"levn": "^0.4.1",
+				"prelude-ls": "^1.2.1",
+				"type-check": "^0.4.0",
+				"word-wrap": "^1.2.5"
+			},
+			"engines": {
+				"node": ">= 0.8.0"
+			}
+		},
+		"node_modules/own-keys": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/own-keys/-/own-keys-1.0.1.tgz",
+			"integrity": "sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"get-intrinsic": "^1.2.6",
+				"object-keys": "^1.1.1",
+				"safe-push-apply": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/p-limit": {
+			"version": "3.1.0",
+			"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
+			"integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"yocto-queue": "^0.1.0"
+			},
+			"engines": {
+				"node": ">=10"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/p-locate": {
+			"version": "5.0.0",
+			"resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz",
+			"integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"p-limit": "^3.0.2"
+			},
+			"engines": {
+				"node": ">=10"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/parent-module": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
+			"integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"callsites": "^3.0.0"
+			},
+			"engines": {
+				"node": ">=6"
+			}
+		},
+		"node_modules/path-exists": {
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz",
+			"integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/path-key": {
+			"version": "3.1.1",
+			"resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
+			"integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/path-parse": {
+			"version": "1.0.7",
+			"resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz",
+			"integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/picomatch": {
+			"version": "2.3.1",
+			"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
+			"integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=8.6"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/jonschlinkert"
+			}
+		},
+		"node_modules/possible-typed-array-names": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.1.0.tgz",
+			"integrity": "sha512-/+5VFTchJDoVj3bhoqi6UeymcD00DAwb1nJwamzPvHEszJ4FpF6SNNbUbOS8yI56qHzdV8eK0qEfOSiodkTdxg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/prelude-ls": {
+			"version": "1.2.1",
+			"resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz",
+			"integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.8.0"
+			}
+		},
+		"node_modules/prop-types": {
+			"version": "15.8.1",
+			"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
+			"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"loose-envify": "^1.4.0",
+				"object-assign": "^4.1.1",
+				"react-is": "^16.13.1"
+			}
+		},
+		"node_modules/punycode": {
+			"version": "2.3.1",
+			"resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
+			"integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=6"
+			}
+		},
+		"node_modules/queue-microtask": {
+			"version": "1.2.3",
+			"resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
+			"integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==",
+			"dev": true,
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT"
+		},
+		"node_modules/react-is": {
+			"version": "16.13.1",
+			"resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
+			"integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/reflect.getprototypeof": {
+			"version": "1.0.10",
+			"resolved": "https://registry.npmjs.org/reflect.getprototypeof/-/reflect.getprototypeof-1.0.10.tgz",
+			"integrity": "sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.9",
+				"es-errors": "^1.3.0",
+				"es-object-atoms": "^1.0.0",
+				"get-intrinsic": "^1.2.7",
+				"get-proto": "^1.0.1",
+				"which-builtin-type": "^1.2.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/regexp-tree": {
+			"version": "0.1.27",
+			"resolved": "https://registry.npmjs.org/regexp-tree/-/regexp-tree-0.1.27.tgz",
+			"integrity": "sha512-iETxpjK6YoRWJG5o6hXLwvjYAoW+FEZn9os0PD/b6AP6xQwsa/Y7lCVgIixBbUPMfhu+i2LtdeAqVTgGlQarfA==",
+			"dev": true,
+			"license": "MIT",
+			"bin": {
+				"regexp-tree": "bin/regexp-tree"
+			}
+		},
+		"node_modules/regexp.prototype.flags": {
+			"version": "1.5.4",
+			"resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.4.tgz",
+			"integrity": "sha512-dYqgNSZbDwkaJ2ceRd9ojCGjBq+mOm9LmtXnAnEGyHhN/5R7iDW2TRw3h+o/jCFxus3P2LfWIIiwowAjANm7IA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"define-properties": "^1.2.1",
+				"es-errors": "^1.3.0",
+				"get-proto": "^1.0.1",
+				"gopd": "^1.2.0",
+				"set-function-name": "^2.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/require-from-string": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz",
+			"integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/resolve": {
+			"version": "1.22.11",
+			"resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.11.tgz",
+			"integrity": "sha512-RfqAvLnMl313r7c9oclB1HhUEAezcpLjz95wFH4LVuhk9JF/r22qmVP9AMmOU4vMX7Q8pN8jwNg/CSpdFnMjTQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-core-module": "^2.16.1",
+				"path-parse": "^1.0.7",
+				"supports-preserve-symlinks-flag": "^1.0.0"
+			},
+			"bin": {
+				"resolve": "bin/resolve"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/resolve-from": {
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz",
+			"integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=4"
+			}
+		},
+		"node_modules/resolve-pkg-maps": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
+			"integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==",
+			"dev": true,
+			"license": "MIT",
+			"funding": {
+				"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
+			}
+		},
+		"node_modules/ret": {
+			"version": "0.1.15",
+			"resolved": "https://registry.npmjs.org/ret/-/ret-0.1.15.tgz",
+			"integrity": "sha512-TTlYpa+OL+vMMNG24xSlQGEJ3B/RzEfUlLct7b5G/ytav+wPrplCpVMFuwzXbkecJrb6IYo1iFb0S9v37754mg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.12"
+			}
+		},
+		"node_modules/reusify": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz",
+			"integrity": "sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"iojs": ">=1.0.0",
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/run-parallel": {
+			"version": "1.2.0",
+			"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
+			"integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==",
+			"dev": true,
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT",
+			"dependencies": {
+				"queue-microtask": "^1.2.2"
+			}
+		},
+		"node_modules/safe-array-concat": {
+			"version": "1.1.3",
+			"resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.3.tgz",
+			"integrity": "sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.2",
+				"get-intrinsic": "^1.2.6",
+				"has-symbols": "^1.1.0",
+				"isarray": "^2.0.5"
+			},
+			"engines": {
+				"node": ">=0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/safe-buffer": {
+			"version": "5.2.1",
+			"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
+			"integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
+			"dev": true,
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT"
+		},
+		"node_modules/safe-push-apply": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/safe-push-apply/-/safe-push-apply-1.0.0.tgz",
+			"integrity": "sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0",
+				"isarray": "^2.0.5"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/safe-regex": {
+			"version": "2.1.1",
+			"resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-2.1.1.tgz",
+			"integrity": "sha512-rx+x8AMzKb5Q5lQ95Zoi6ZbJqwCLkqi3XuJXp5P3rT8OEc6sZCJG5AE5dU3lsgRr/F4Bs31jSlVN+j5KrsGu9A==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"regexp-tree": "~0.1.1"
+			}
+		},
+		"node_modules/safe-regex-test": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/safe-regex-test/-/safe-regex-test-1.1.0.tgz",
+			"integrity": "sha512-x/+Cz4YrimQxQccJf5mKEbIa1NzeCRNI5Ecl/ekmlYaampdNLPalVyIcCZNNH3MvmqBugV5TMYZXv0ljslUlaw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"es-errors": "^1.3.0",
+				"is-regex": "^1.2.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/semver": {
+			"version": "6.3.1",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+			"integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
+			"dev": true,
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			}
+		},
+		"node_modules/set-function-length": {
+			"version": "1.2.2",
+			"resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
+			"integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"define-data-property": "^1.1.4",
+				"es-errors": "^1.3.0",
+				"function-bind": "^1.1.2",
+				"get-intrinsic": "^1.2.4",
+				"gopd": "^1.0.1",
+				"has-property-descriptors": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/set-function-name": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.2.tgz",
+			"integrity": "sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"define-data-property": "^1.1.4",
+				"es-errors": "^1.3.0",
+				"functions-have-names": "^1.2.3",
+				"has-property-descriptors": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/set-proto": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/set-proto/-/set-proto-1.0.0.tgz",
+			"integrity": "sha512-RJRdvCo6IAnPdsvP/7m6bsQqNnn1FCBX5ZNtFL98MmFF/4xAIJTIg1YbHW5DC2W5SKZanrC6i4HsJqlajw/dZw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"dunder-proto": "^1.0.1",
+				"es-errors": "^1.3.0",
+				"es-object-atoms": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/shebang-command": {
+			"version": "2.0.0",
+			"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
+			"integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"shebang-regex": "^3.0.0"
+			},
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/shebang-regex": {
+			"version": "3.0.0",
+			"resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
+			"integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/side-channel": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz",
+			"integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0",
+				"object-inspect": "^1.13.3",
+				"side-channel-list": "^1.0.0",
+				"side-channel-map": "^1.0.1",
+				"side-channel-weakmap": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/side-channel-list": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz",
+			"integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0",
+				"object-inspect": "^1.13.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/side-channel-map": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz",
+			"integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"es-errors": "^1.3.0",
+				"get-intrinsic": "^1.2.5",
+				"object-inspect": "^1.13.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/side-channel-weakmap": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz",
+			"integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"es-errors": "^1.3.0",
+				"get-intrinsic": "^1.2.5",
+				"object-inspect": "^1.13.3",
+				"side-channel-map": "^1.0.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/stop-iteration-iterator": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.1.0.tgz",
+			"integrity": "sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"es-errors": "^1.3.0",
+				"internal-slot": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/string.prototype.matchall": {
+			"version": "4.0.12",
+			"resolved": "https://registry.npmjs.org/string.prototype.matchall/-/string.prototype.matchall-4.0.12.tgz",
+			"integrity": "sha512-6CC9uyBL+/48dYizRf7H7VAYCMCNTBeM78x/VTUe9bFEaxBepPJDa1Ow99LqI/1yF7kuy7Q3cQsYMrcjGUcskA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.3",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.6",
+				"es-errors": "^1.3.0",
+				"es-object-atoms": "^1.0.0",
+				"get-intrinsic": "^1.2.6",
+				"gopd": "^1.2.0",
+				"has-symbols": "^1.1.0",
+				"internal-slot": "^1.1.0",
+				"regexp.prototype.flags": "^1.5.3",
+				"set-function-name": "^2.0.2",
+				"side-channel": "^1.1.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/string.prototype.repeat": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/string.prototype.repeat/-/string.prototype.repeat-1.0.0.tgz",
+			"integrity": "sha512-0u/TldDbKD8bFCQ/4f5+mNRrXwZ8hg2w7ZR8wa16e8z9XpePWl3eGEcUD0OXpEH/VJH/2G3gjUtR3ZOiBe2S/w==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"define-properties": "^1.1.3",
+				"es-abstract": "^1.17.5"
+			}
+		},
+		"node_modules/string.prototype.trim": {
+			"version": "1.2.10",
+			"resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.10.tgz",
+			"integrity": "sha512-Rs66F0P/1kedk5lyYyH9uBzuiI/kNRmwJAR9quK6VOtIpZ2G+hMZd+HQbbv25MgCA6gEffoMZYxlTod4WcdrKA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.2",
+				"define-data-property": "^1.1.4",
+				"define-properties": "^1.2.1",
+				"es-abstract": "^1.23.5",
+				"es-object-atoms": "^1.0.0",
+				"has-property-descriptors": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/string.prototype.trimend": {
+			"version": "1.0.9",
+			"resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.9.tgz",
+			"integrity": "sha512-G7Ok5C6E/j4SGfyLCloXTrngQIQU3PWtXGst3yM7Bea9FRURf1S42ZHlZZtsNque2FN2PoUhfZXYLNWwEr4dLQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.2",
+				"define-properties": "^1.2.1",
+				"es-object-atoms": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/string.prototype.trimstart": {
+			"version": "1.0.8",
+			"resolved": "https://registry.npmjs.org/string.prototype.trimstart/-/string.prototype.trimstart-1.0.8.tgz",
+			"integrity": "sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.7",
+				"define-properties": "^1.2.1",
+				"es-object-atoms": "^1.0.0"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/strip-bom": {
+			"version": "3.0.0",
+			"resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz",
+			"integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=4"
+			}
+		},
+		"node_modules/strip-json-comments": {
+			"version": "3.1.1",
+			"resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
+			"integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=8"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		},
+		"node_modules/style-mod": {
+			"version": "4.1.3",
+			"resolved": "https://registry.npmjs.org/style-mod/-/style-mod-4.1.3.tgz",
+			"integrity": "sha512-i/n8VsZydrugj3Iuzll8+x/00GH2vnYsk1eomD8QiRrSAeW6ItbCQDtfXCeJHd0iwiNagqjQkvpvREEPtW3IoQ==",
+			"license": "MIT",
+			"peer": true
+		},
+		"node_modules/supports-color": {
+			"version": "7.2.0",
+			"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+			"integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"has-flag": "^4.0.0"
+			},
+			"engines": {
+				"node": ">=8"
+			}
+		},
+		"node_modules/supports-preserve-symlinks-flag": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz",
+			"integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/synckit": {
+			"version": "0.9.3",
+			"resolved": "https://registry.npmjs.org/synckit/-/synckit-0.9.3.tgz",
+			"integrity": "sha512-JJoOEKTfL1urb1mDoEblhD9NhEbWmq9jHEMEnxoC4ujUaZ4itA8vKgwkFAyNClgxplLi9tsUKX+EduK0p/l7sg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@pkgr/core": "^0.1.0",
+				"tslib": "^2.6.2"
+			},
+			"engines": {
+				"node": "^14.18.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/unts"
+			}
+		},
+		"node_modules/synckit/node_modules/tslib": {
+			"version": "2.8.1",
+			"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
+			"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
+			"dev": true,
+			"license": "0BSD"
+		},
+		"node_modules/tapable": {
+			"version": "2.3.0",
+			"resolved": "https://registry.npmjs.org/tapable/-/tapable-2.3.0.tgz",
+			"integrity": "sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=6"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/webpack"
+			}
+		},
+		"node_modules/to-regex-range": {
+			"version": "5.0.1",
+			"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
+			"integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-number": "^7.0.0"
+			},
+			"engines": {
+				"node": ">=8.0"
+			}
+		},
+		"node_modules/toml-eslint-parser": {
+			"version": "0.9.3",
+			"resolved": "https://registry.npmjs.org/toml-eslint-parser/-/toml-eslint-parser-0.9.3.tgz",
+			"integrity": "sha512-moYoCvkNUAPCxSW9jmHmRElhm4tVJpHL8ItC/+uYD0EpPSFXbck7yREz9tNdJVTSpHVod8+HoipcpbQ0oE6gsw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"eslint-visitor-keys": "^3.0.0"
+			},
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ota-meshi"
+			}
+		},
+		"node_modules/toml-eslint-parser/node_modules/eslint-visitor-keys": {
+			"version": "3.4.3",
+			"resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz",
+			"integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/ts-api-utils": {
+			"version": "2.1.0",
+			"resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.1.0.tgz",
+			"integrity": "sha512-CUgTZL1irw8u29bzrOD/nH85jqyc74D6SshFgujOIA7osm2Rz7dYH77agkx7H4FBNxDq7Cjf+IjaX/8zwFW+ZQ==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=18.12"
+			},
+			"peerDependencies": {
+				"typescript": ">=4.8.4"
+			}
+		},
+		"node_modules/tsconfig-paths": {
+			"version": "3.15.0",
+			"resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz",
+			"integrity": "sha512-2Ac2RgzDe/cn48GvOe3M+o82pEFewD3UPbyoUHHdKasHwJKjds4fLXWf/Ux5kATBKN20oaFGu+jbElp1pos0mg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@types/json5": "^0.0.29",
+				"json5": "^1.0.2",
+				"minimist": "^1.2.6",
+				"strip-bom": "^3.0.0"
+			}
+		},
+		"node_modules/tslib": {
+			"version": "2.4.0",
+			"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.4.0.tgz",
+			"integrity": "sha512-d6xOpEDfsi2CZVlPQzGeux8XMwLT9hssAsaPYExaQMuYskwb+x1x7J371tWlbBdWHroy99KnVB6qIkUbs5X3UQ==",
+			"dev": true,
+			"license": "0BSD"
+		},
+		"node_modules/tunnel-agent": {
+			"version": "0.6.0",
+			"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
+			"integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"safe-buffer": "^5.0.1"
+			},
+			"engines": {
+				"node": "*"
+			}
+		},
+		"node_modules/type-check": {
+			"version": "0.4.0",
+			"resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz",
+			"integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"prelude-ls": "^1.2.1"
+			},
+			"engines": {
+				"node": ">= 0.8.0"
+			}
+		},
+		"node_modules/typed-array-buffer": {
+			"version": "1.0.3",
+			"resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.3.tgz",
+			"integrity": "sha512-nAYYwfY3qnzX30IkA6AQZjVbtK6duGontcQm1WSG1MD94YLqK0515GNApXkoxKOWMusVssAHWLh9SeaoefYFGw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"es-errors": "^1.3.0",
+				"is-typed-array": "^1.1.14"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			}
+		},
+		"node_modules/typed-array-byte-length": {
+			"version": "1.0.3",
+			"resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.3.tgz",
+			"integrity": "sha512-BaXgOuIxz8n8pIq3e7Atg/7s+DpiYrxn4vdot3w9KbnBhcRQq6o3xemQdIfynqSeXeDrF32x+WvfzmOjPiY9lg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.8",
+				"for-each": "^0.3.3",
+				"gopd": "^1.2.0",
+				"has-proto": "^1.2.0",
+				"is-typed-array": "^1.1.14"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/typed-array-byte-offset": {
+			"version": "1.0.4",
+			"resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.4.tgz",
+			"integrity": "sha512-bTlAFB/FBYMcuX81gbL4OcpH5PmlFHqlCCpAl8AlEzMz5k53oNDvN8p1PNOWLEmI2x4orp3raOFB51tv9X+MFQ==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"available-typed-arrays": "^1.0.7",
+				"call-bind": "^1.0.8",
+				"for-each": "^0.3.3",
+				"gopd": "^1.2.0",
+				"has-proto": "^1.2.0",
+				"is-typed-array": "^1.1.15",
+				"reflect.getprototypeof": "^1.0.9"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/typed-array-length": {
+			"version": "1.0.7",
+			"resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.7.tgz",
+			"integrity": "sha512-3KS2b+kL7fsuk/eJZ7EQdnEmQoaho/r6KUef7hxvltNA5DR8NAUM+8wJMbJyZ4G9/7i3v5zPBIMN5aybAh2/Jg==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bind": "^1.0.7",
+				"for-each": "^0.3.3",
+				"gopd": "^1.0.1",
+				"is-typed-array": "^1.1.13",
+				"possible-typed-array-names": "^1.0.0",
+				"reflect.getprototypeof": "^1.0.6"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/typescript": {
+			"version": "5.8.3",
+			"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz",
+			"integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"bin": {
+				"tsc": "bin/tsc",
+				"tsserver": "bin/tsserver"
+			},
+			"engines": {
+				"node": ">=14.17"
+			}
+		},
+		"node_modules/typescript-eslint": {
+			"version": "8.35.1",
+			"resolved": "https://registry.npmjs.org/typescript-eslint/-/typescript-eslint-8.35.1.tgz",
+			"integrity": "sha512-xslJjFzhOmHYQzSB/QTeASAHbjmxOGEP6Coh93TXmUBFQoJ1VU35UHIDmG06Jd6taf3wqqC1ntBnCMeymy5Ovw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"@typescript-eslint/eslint-plugin": "8.35.1",
+				"@typescript-eslint/parser": "8.35.1",
+				"@typescript-eslint/utils": "8.35.1"
+			},
+			"engines": {
+				"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+			},
+			"funding": {
+				"type": "opencollective",
+				"url": "https://opencollective.com/typescript-eslint"
+			},
+			"peerDependencies": {
+				"eslint": "^8.57.0 || ^9.0.0",
+				"typescript": ">=4.8.4 <5.9.0"
+			}
+		},
+		"node_modules/unbox-primitive": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz",
+			"integrity": "sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.3",
+				"has-bigints": "^1.0.2",
+				"has-symbols": "^1.1.0",
+				"which-boxed-primitive": "^1.1.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/undici-types": {
+			"version": "5.26.5",
+			"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
+			"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
+			"dev": true,
+			"license": "MIT"
+		},
+		"node_modules/uri-js": {
+			"version": "4.4.1",
+			"resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
+			"integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
+			"dev": true,
+			"license": "BSD-2-Clause",
+			"dependencies": {
+				"punycode": "^2.1.0"
+			}
+		},
+		"node_modules/w3c-keyname": {
+			"version": "2.2.8",
+			"resolved": "https://registry.npmjs.org/w3c-keyname/-/w3c-keyname-2.2.8.tgz",
+			"integrity": "sha512-dpojBhNsCNN7T82Tm7k26A6G9ML3NkhDsnw9n/eoxSRlVBB4CEtIQ/KTCLI2Fwf3ataSXRhYFkQi3SlnFwPvPQ==",
+			"license": "MIT",
+			"peer": true
+		},
+		"node_modules/which": {
+			"version": "2.0.2",
+			"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
+			"integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+			"dev": true,
+			"license": "ISC",
+			"dependencies": {
+				"isexe": "^2.0.0"
+			},
+			"bin": {
+				"node-which": "bin/node-which"
+			},
+			"engines": {
+				"node": ">= 8"
+			}
+		},
+		"node_modules/which-boxed-primitive": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/which-boxed-primitive/-/which-boxed-primitive-1.1.1.tgz",
+			"integrity": "sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-bigint": "^1.1.0",
+				"is-boolean-object": "^1.2.1",
+				"is-number-object": "^1.1.1",
+				"is-string": "^1.1.1",
+				"is-symbol": "^1.1.1"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/which-builtin-type": {
+			"version": "1.2.1",
+			"resolved": "https://registry.npmjs.org/which-builtin-type/-/which-builtin-type-1.2.1.tgz",
+			"integrity": "sha512-6iBczoX+kDQ7a3+YJBnh3T+KZRxM/iYNPXicqk66/Qfm1b93iu+yOImkg0zHbj5LNOcNv1TEADiZ0xa34B4q6Q==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"call-bound": "^1.0.2",
+				"function.prototype.name": "^1.1.6",
+				"has-tostringtag": "^1.0.2",
+				"is-async-function": "^2.0.0",
+				"is-date-object": "^1.1.0",
+				"is-finalizationregistry": "^1.1.0",
+				"is-generator-function": "^1.0.10",
+				"is-regex": "^1.2.1",
+				"is-weakref": "^1.0.2",
+				"isarray": "^2.0.5",
+				"which-boxed-primitive": "^1.1.0",
+				"which-collection": "^1.0.2",
+				"which-typed-array": "^1.1.16"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/which-collection": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/which-collection/-/which-collection-1.0.2.tgz",
+			"integrity": "sha512-K4jVyjnBdgvc86Y6BkaLZEN933SwYOuBFkdmBu9ZfkcAbdVbpITnDmjvZ/aQjRXQrv5EPkTnD1s39GiiqbngCw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"is-map": "^2.0.3",
+				"is-set": "^2.0.3",
+				"is-weakmap": "^2.0.2",
+				"is-weakset": "^2.0.3"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/which-typed-array": {
+			"version": "1.1.19",
+			"resolved": "https://registry.npmjs.org/which-typed-array/-/which-typed-array-1.1.19.tgz",
+			"integrity": "sha512-rEvr90Bck4WZt9HHFC4DJMsjvu7x+r6bImz0/BrbWb7A2djJ8hnZMrWnHo9F8ssv0OMErasDhftrfROTyqSDrw==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"available-typed-arrays": "^1.0.7",
+				"call-bind": "^1.0.8",
+				"call-bound": "^1.0.4",
+				"for-each": "^0.3.5",
+				"get-proto": "^1.0.1",
+				"gopd": "^1.2.0",
+				"has-tostringtag": "^1.0.2"
+			},
+			"engines": {
+				"node": ">= 0.4"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ljharb"
+			}
+		},
+		"node_modules/word-wrap": {
+			"version": "1.2.5",
+			"resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz",
+			"integrity": "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
+		"node_modules/yaml": {
+			"version": "2.8.1",
+			"resolved": "https://registry.npmjs.org/yaml/-/yaml-2.8.1.tgz",
+			"integrity": "sha512-lcYcMxX2PO9XMGvAJkJ3OsNMw+/7FKes7/hgerGUYWIoWu5j/+YQqcZr5JnPZWzOsEBgMbSbiSTn/dv/69Mkpw==",
+			"dev": true,
+			"license": "ISC",
+			"bin": {
+				"yaml": "bin.mjs"
+			},
+			"engines": {
+				"node": ">= 14.6"
+			}
+		},
+		"node_modules/yaml-eslint-parser": {
+			"version": "1.3.0",
+			"resolved": "https://registry.npmjs.org/yaml-eslint-parser/-/yaml-eslint-parser-1.3.0.tgz",
+			"integrity": "sha512-E/+VitOorXSLiAqtTd7Yqax0/pAS3xaYMP+AUUJGOK1OZG3rhcj9fcJOM5HJ2VrP1FrStVCWr1muTfQCdj4tAA==",
+			"dev": true,
+			"license": "MIT",
+			"dependencies": {
+				"eslint-visitor-keys": "^3.0.0",
+				"yaml": "^2.0.0"
+			},
+			"engines": {
+				"node": "^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/ota-meshi"
+			}
+		},
+		"node_modules/yaml-eslint-parser/node_modules/eslint-visitor-keys": {
+			"version": "3.4.3",
+			"resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz",
+			"integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==",
+			"dev": true,
+			"license": "Apache-2.0",
+			"engines": {
+				"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/eslint"
+			}
+		},
+		"node_modules/yocto-queue": {
+			"version": "0.1.0",
+			"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz",
+			"integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==",
+			"dev": true,
+			"license": "MIT",
+			"engines": {
+				"node": ">=10"
+			},
+			"funding": {
+				"url": "https://github.com/sponsors/sindresorhus"
+			}
+		}
+	}
+}
diff --git a/surfsense_obsidian/package.json b/surfsense_obsidian/package.json
new file mode 100644
index 0000000000..21504cd5a5
--- /dev/null
+++ b/surfsense_obsidian/package.json
@@ -0,0 +1,34 @@
+{
+	"name": "surfsense-obsidian",
+	"version": "0.1.0",
+	"description": "SurfSense plugin for Obsidian: sync your vault to SurfSense for AI-powered search.",
+	"main": "main.js",
+	"type": "module",
+	"scripts": {
+		"dev": "node esbuild.config.mjs",
+		"build": "tsc -noEmit -skipLibCheck && node esbuild.config.mjs production",
+		"version": "node version-bump.mjs && git add manifest.json versions.json",
+		"lint": "eslint ."
+	},
+	"keywords": [
+		"obsidian",
+		"surfsense",
+		"sync",
+		"search"
+	],
+	"license": "Apache-2.0",
+	"devDependencies": {
+		"@eslint/js": "9.30.1",
+		"@types/node": "^20.19.39",
+		"esbuild": "0.25.5",
+		"eslint-plugin-obsidianmd": "0.1.9",
+		"globals": "14.0.0",
+		"jiti": "2.6.1",
+		"tslib": "2.4.0",
+		"typescript": "^5.8.3",
+		"typescript-eslint": "8.35.1"
+	},
+	"dependencies": {
+		"obsidian": "latest"
+	}
+}
diff --git a/surfsense_obsidian/src/api-client.ts b/surfsense_obsidian/src/api-client.ts
new file mode 100644
index 0000000000..37f5ebb655
--- /dev/null
+++ b/surfsense_obsidian/src/api-client.ts
@@ -0,0 +1,296 @@
+import { requestUrl, type RequestUrlParam, type RequestUrlResponse } from "obsidian";
+import type {
+	ConnectResponse,
+	DeleteAck,
+	HealthResponse,
+	ManifestResponse,
+	NotePayload,
+	RenameAck,
+	RenameItem,
+	SearchSpace,
+	SyncAck,
+} from "./types";
+
+/**
+ * SurfSense backend client used by the Obsidian plugin.
+ *
+ * Mobile-safety contract (must hold for every transitive import):
+ *   - Use Obsidian `requestUrl` only — no `fetch`, no `axios`, no
+ *     `node:http`, no `node:https`. CORS is bypassed and mobile works.
+ *   - No top-level `node:*` imports anywhere reachable from this file.
+ *   - Hashing happens elsewhere via Web Crypto, not `node:crypto`.
+ *
+ * Auth + wire contract:
+ *   - Every request carries `Authorization: Bearer <token>` only. No
+ *     custom headers — the backend identifies the caller from the JWT
+ *     and feature-detects the API via the `capabilities` array on
+ *     `/health` and `/connect`.
+ *   - 401 surfaces as `AuthError` so the orchestrator can show the
+ *     "token expired, paste a fresh one" UX.
+ *   - HealthResponse / ConnectResponse use index signatures so any
+ *     additive backend field (e.g. new capabilities) parses without
+ *     breaking the decoder. This mirrors `ConfigDict(extra='ignore')`
+ *     on the server side.
+ */
+
+export class AuthError extends Error {
+	constructor(message: string) {
+		super(message);
+		this.name = "AuthError";
+	}
+}
+
+export class TransientError extends Error {
+	readonly status: number;
+	constructor(status: number, message: string) {
+		super(message);
+		this.name = "TransientError";
+		this.status = status;
+	}
+}
+
+export class PermanentError extends Error {
+	readonly status: number;
+	constructor(status: number, message: string) {
+		super(message);
+		this.name = "PermanentError";
+		this.status = status;
+	}
+}
+
+/** 404 `VAULT_NOT_REGISTERED` — `/connect` hasn't committed yet; retry after reconnect. */
+export class VaultNotRegisteredError extends TransientError {
+	constructor(message: string) {
+		super(404, message);
+		this.name = "VaultNotRegisteredError";
+	}
+}
+
+export interface ApiClientOptions {
+	getServerUrl: () => string;
+	getToken: () => string;
+	onAuthError?: () => void;
+}
+
+const AUTH_BLOCK_MS = 60_000;
+
+export class SurfSenseApiClient {
+	private readonly opts: ApiClientOptions;
+	private authBlockedUntil = 0;
+
+	constructor(opts: ApiClientOptions) {
+		this.opts = opts;
+	}
+
+	updateOptions(partial: Partial<ApiClientOptions>): void {
+		Object.assign(this.opts, partial);
+	}
+
+	resetAuthBlock(): void {
+		this.authBlockedUntil = 0;
+	}
+
+	async health(): Promise<HealthResponse> {
+		return await this.request<HealthResponse>("GET", "/api/v1/obsidian/health");
+	}
+
+	async listSearchSpaces(): Promise<SearchSpace[]> {
+		const resp = await this.request<SearchSpace[] | { items: SearchSpace[] }>(
+			"GET",
+			"/api/v1/searchspaces/"
+		);
+		if (Array.isArray(resp)) return resp;
+		if (resp && Array.isArray((resp as { items?: SearchSpace[] }).items)) {
+			return (resp as { items: SearchSpace[] }).items;
+		}
+		return [];
+	}
+
+	async verifyToken(): Promise<{ ok: true; health: HealthResponse }> {
+		// /health is gated by current_active_user, so a successful response
+		// transitively proves the token works. Cheaper than fetching a list.
+		const health = await this.health();
+		return { ok: true, health };
+	}
+
+	async connect(input: {
+		searchSpaceId: number;
+		vaultId: string;
+		vaultName: string;
+		vaultFingerprint: string;
+	}): Promise<ConnectResponse> {
+		return await this.request<ConnectResponse>(
+			"POST",
+			"/api/v1/obsidian/connect",
+			{
+				vault_id: input.vaultId,
+				vault_name: input.vaultName,
+				search_space_id: input.searchSpaceId,
+				vault_fingerprint: input.vaultFingerprint,
+			}
+		);
+	}
+
+	/** POST /sync — `failed[]` are paths whose `status === "error"` for retry. */
+	async syncBatch(input: {
+		vaultId: string;
+		notes: NotePayload[];
+	}): Promise<{ indexed: number; failed: string[] }> {
+		const resp = await this.request<SyncAck>(
+			"POST",
+			"/api/v1/obsidian/sync",
+			{ vault_id: input.vaultId, notes: input.notes }
+		);
+		const failed = resp.items
+			.filter((it) => it.status === "error")
+			.map((it) => it.path);
+		return { indexed: resp.indexed, failed };
+	}
+
+	/** POST /rename — `"missing"` counts as success; only `"error"` is retried. */
+	async renameBatch(input: {
+		vaultId: string;
+		renames: Pick<RenameItem, "oldPath" | "newPath">[];
+	}): Promise<{
+		renamed: number;
+		failed: Array<{ oldPath: string; newPath: string }>;
+	}> {
+		const resp = await this.request<RenameAck>(
+			"POST",
+			"/api/v1/obsidian/rename",
+			{
+				vault_id: input.vaultId,
+				renames: input.renames.map((r) => ({
+					old_path: r.oldPath,
+					new_path: r.newPath,
+				})),
+			}
+		);
+		const failed = resp.items
+			.filter((it) => it.status === "error")
+			.map((it) => ({ oldPath: it.old_path, newPath: it.new_path }));
+		return { renamed: resp.renamed, failed };
+	}
+
+	/** DELETE /notes — `"missing"` counts as success; only `"error"` is retried. */
+	async deleteBatch(input: {
+		vaultId: string;
+		paths: string[];
+	}): Promise<{ deleted: number; failed: string[] }> {
+		const resp = await this.request<DeleteAck>(
+			"DELETE",
+			"/api/v1/obsidian/notes",
+			{ vault_id: input.vaultId, paths: input.paths }
+		);
+		const failed = resp.items
+			.filter((it) => it.status === "error")
+			.map((it) => it.path);
+		return { deleted: resp.deleted, failed };
+	}
+
+	async getManifest(vaultId: string): Promise<ManifestResponse> {
+		return await this.request<ManifestResponse>(
+			"GET",
+			`/api/v1/obsidian/manifest?vault_id=${encodeURIComponent(vaultId)}`
+		);
+	}
+
+	private async request<T>(
+		method: RequestUrlParam["method"],
+		path: string,
+		body?: unknown
+	): Promise<T> {
+		const baseUrl = this.opts.getServerUrl().replace(/\/+$/, "");
+		const token = this.opts.getToken();
+		if (!token) {
+			throw new AuthError("Missing API token. Open plugin settings to paste one.");
+		}
+		if (Date.now() < this.authBlockedUntil) {
+			throw new AuthError("Token rejected. Paste a fresh one in settings.");
+		}
+		const headers: Record<string, string> = {
+			Authorization: `Bearer ${token}`,
+			Accept: "application/json",
+		};
+		if (body !== undefined) headers["Content-Type"] = "application/json";
+
+		let resp: RequestUrlResponse;
+		try {
+			resp = await requestUrl({
+				url: `${baseUrl}${path}`,
+				method,
+				headers,
+				body: body === undefined ? undefined : JSON.stringify(body),
+				throw: false,
+			});
+		} catch (err) {
+			throw new TransientError(0, `Network error: ${(err as Error).message}`);
+		}
+
+		if (resp.status >= 200 && resp.status < 300) {
+			return parseJson<T>(resp);
+		}
+
+		const detail = extractDetail(resp);
+
+		if (resp.status === 401) {
+			this.authBlockedUntil = Date.now() + AUTH_BLOCK_MS;
+			this.opts.onAuthError?.();
+			throw new AuthError(detail || "Unauthorized");
+		}
+
+		if (resp.status >= 500 || resp.status === 429) {
+			throw new TransientError(resp.status, detail || `HTTP ${resp.status}`);
+		}
+
+		if (resp.status === 404 && extractCode(resp) === "VAULT_NOT_REGISTERED") {
+			throw new VaultNotRegisteredError(detail || "Vault not registered yet");
+		}
+
+		throw new PermanentError(resp.status, detail || `HTTP ${resp.status}`);
+	}
+}
+
+function parseJson<T>(resp: RequestUrlResponse): T {
+	// Plugin endpoints always return JSON; non-JSON 2xx is usually a
+	// captive portal or CDN page — surface as transient so we back off.
+	const text = resp.text ?? "";
+	try {
+		return JSON.parse(text) as T;
+	} catch {
+		throw new TransientError(
+			resp.status,
+			`Invalid JSON from server (got: ${text.slice(0, 80)})`
+		);
+	}
+}
+
+function safeJson(resp: RequestUrlResponse): Record<string, unknown> {
+	try {
+		return resp.text ? (JSON.parse(resp.text) as Record<string, unknown>) : {};
+	} catch {
+		return {};
+	}
+}
+
+function extractDetail(resp: RequestUrlResponse): string {
+	const json = safeJson(resp);
+	if (typeof json.detail === "string") return json.detail;
+	if (typeof json.message === "string") return json.message;
+	const detailObj = json.detail;
+	if (detailObj && typeof detailObj === "object") {
+		const obj = detailObj as Record<string, unknown>;
+		if (typeof obj.message === "string") return obj.message;
+	}
+	return resp.text?.slice(0, 200) ?? "";
+}
+
+function extractCode(resp: RequestUrlResponse): string | undefined {
+	const json = safeJson(resp);
+	const detailObj = json.detail;
+	if (detailObj && typeof detailObj === "object") {
+		const code = (detailObj as Record<string, unknown>).code;
+		if (typeof code === "string") return code;
+	}
+	return undefined;
+}
diff --git a/surfsense_obsidian/src/attachments-confirm-modal.ts b/surfsense_obsidian/src/attachments-confirm-modal.ts
new file mode 100644
index 0000000000..1a79fd2bd8
--- /dev/null
+++ b/surfsense_obsidian/src/attachments-confirm-modal.ts
@@ -0,0 +1,61 @@
+import { type App, Modal, Setting } from "obsidian";
+
+/**
+ * Confirmation modal shown before enabling attachment sync.
+ * Attachment files can be large and increase sync latency/cost.
+ */
+export class AttachmentsConfirmModal extends Modal {
+	private resolver: ((confirmed: boolean) => void) | null = null;
+
+	constructor(app: App) {
+		super(app);
+	}
+
+	onOpen(): void {
+		this.setTitle("Enable attachment sync?");
+		this.contentEl.empty();
+
+		new Setting(this.contentEl).setDesc(
+			"Syncing attachments (images & PDFs) can make indexing slower, especially on large vaults."
+		);
+		new Setting(this.contentEl).setDesc(
+			"Syncing attachments can make indexing slower on large vaults. You can disable this anytime.",
+		);
+
+		new Setting(this.contentEl)
+			.addButton((btn) =>
+				btn
+					.setButtonText("Cancel")
+					.onClick(() => this.resolveAndClose(false)),
+			)
+			.addButton((btn) =>
+				btn
+					.setButtonText("Enable")
+					.setCta()
+					.onClick(() => this.resolveAndClose(true)),
+			);
+	}
+
+	onClose(): void {
+		this.contentEl.empty();
+		if (this.resolver) {
+			this.resolver(false);
+			this.resolver = null;
+		}
+	}
+
+	waitForConfirmation(): Promise<boolean> {
+		this.open();
+		return new Promise<boolean>((resolve) => {
+			this.resolver = resolve;
+		});
+	}
+
+	private resolveAndClose(confirmed: boolean): void {
+		if (this.resolver) {
+			this.resolver(confirmed);
+			this.resolver = null;
+		}
+		this.close();
+	}
+}
diff --git a/surfsense_obsidian/src/excludes.ts b/surfsense_obsidian/src/excludes.ts
new file mode 100644
index 0000000000..1f47170b1c
--- /dev/null
+++ b/surfsense_obsidian/src/excludes.ts
@@ -0,0 +1,94 @@
+/**
+ * Tiny glob matcher for exclude patterns.
+ *
+ * Supports `*` (any chars except `/`), `**` (any chars including `/`), and
+ * literal segments. Patterns without a slash are matched against any path
+ * segment (so `templates` excludes `templates/foo.md` and `notes/templates/x.md`).
+ *
+ * Intentionally not a full minimatch — Obsidian users overwhelmingly type
+ * folder names ("templates", ".trash") and the obvious wildcards. Avoiding
+ * the dependency keeps the bundle small and the mobile attack surface tiny.
+ */
+
+const cache = new Map<string, RegExp>();
+
+function compile(pattern: string): RegExp {
+	const cached = cache.get(pattern);
+	if (cached) return cached;
+
+	let body = "";
+	let i = 0;
+	while (i < pattern.length) {
+		const ch = pattern[i] ?? "";
+		if (ch === "*") {
+			if (pattern[i + 1] === "*") {
+				body += ".*";
+				i += 2;
+				if (pattern[i] === "/") i += 1;
+				continue;
+			}
+			body += "[^/]*";
+			i += 1;
+			continue;
+		}
+		if (".+^${}()|[]\\".includes(ch)) {
+			body += "\\" + ch;
+			i += 1;
+			continue;
+		}
+		body += ch;
+		i += 1;
+	}
+
+	const anchored = pattern.includes("/")
+		? `^${body}(/.*)?$`
+		: `(^|/)${body}(/.*)?$`;
+	const re = new RegExp(anchored);
+	cache.set(pattern, re);
+	return re;
+}
+
+export function isExcluded(path: string, patterns: string[]): boolean {
+	if (!patterns.length) return false;
+	for (const raw of patterns) {
+		const trimmed = raw.trim();
+		if (!trimmed || trimmed.startsWith("#")) continue;
+		if (compile(trimmed).test(path)) return true;
+	}
+	return false;
+}
+
+export function parseExcludePatterns(raw: string): string[] {
+	return raw
+		.split(/\r?\n/)
+		.map((line) => line.trim())
+		.filter((line) => line.length > 0 && !line.startsWith("#"));
+}
+
+/** Normalize a folder path: strip leading/trailing slashes; "" or "/" means vault root. */
+export function normalizeFolder(folder: string): string {
+	return folder.replace(/^\/+|\/+$/g, "");
+}
+
+/** True if `path` lives inside `folder` (or `folder` is the vault root). */
+export function isInFolder(path: string, folder: string): boolean {
+	const f = normalizeFolder(folder);
+	if (f === "") return true;
+	return path === f || path.startsWith(`${f}/`);
+}
+
+/** Exclude wins over include. Empty includeFolders means "include everything". */
+export function isFolderFiltered(
+	path: string,
+	includeFolders: string[],
+	excludeFolders: string[],
+): boolean {
+	for (const f of excludeFolders) {
+		if (isInFolder(path, f)) return true;
+	}
+	if (includeFolders.length === 0) return false;
+	for (const f of includeFolders) {
+		if (isInFolder(path, f)) return false;
+	}
+	return true;
+}
diff --git a/surfsense_obsidian/src/folder-suggest-modal.ts b/surfsense_obsidian/src/folder-suggest-modal.ts
new file mode 100644
index 0000000000..a037a620f9
--- /dev/null
+++ b/surfsense_obsidian/src/folder-suggest-modal.ts
@@ -0,0 +1,32 @@
+import { type App, FuzzySuggestModal, type TFolder } from "obsidian";
+
+/** Folder picker built on Obsidian's stock {@link FuzzySuggestModal}. */
+export class FolderSuggestModal extends FuzzySuggestModal<TFolder> {
+	private readonly onPick: (path: string) => void;
+	private readonly excluded: Set<string>;
+
+	constructor(app: App, onPick: (path: string) => void, excluded: string[] = []) {
+		super(app);
+		this.onPick = onPick;
+		this.excluded = new Set(excluded.map((p) => p.replace(/^\/+|\/+$/g, "")));
+		this.setPlaceholder("Type to filter folders…");
+	}
+
+	getItems(): TFolder[] {
+		return this.app.vault
+			.getAllFolders(true)
+			.filter((f) => !this.excluded.has(this.toPath(f)));
+	}
+
+	getItemText(folder: TFolder): string {
+		return this.toPath(folder) || "/";
+	}
+
+	onChooseItem(folder: TFolder): void {
+		this.onPick(this.toPath(folder));
+	}
+
+	private toPath(folder: TFolder): string {
+		return folder.isRoot() ? "" : folder.path;
+	}
+}
diff --git a/surfsense_obsidian/src/main.ts b/surfsense_obsidian/src/main.ts
new file mode 100644
index 0000000000..1dea47b953
--- /dev/null
+++ b/surfsense_obsidian/src/main.ts
@@ -0,0 +1,292 @@
+import { Notice, Platform, Plugin } from "obsidian";
+import { SurfSenseApiClient } from "./api-client";
+import { PersistentQueue } from "./queue";
+import { SurfSenseSettingTab } from "./settings";
+import { StatusBar } from "./status-bar";
+import { StatusModal } from "./status-modal";
+import { SyncEngine } from "./sync-engine";
+import {
+	DEFAULT_SETTINGS,
+	type QueueItem,
+	type StatusState,
+	type SurfsensePluginSettings,
+} from "./types";
+import { generateVaultUuid } from "./vault-identity";
+
+/** SurfSense plugin entry point. */
+export default class SurfSensePlugin extends Plugin {
+	settings!: SurfsensePluginSettings;
+	api!: SurfSenseApiClient;
+	queue!: PersistentQueue;
+	engine!: SyncEngine;
+	private statusBar: StatusBar | null = null;
+	lastStatus: StatusState = { kind: "needs-setup", queueDepth: 0 };
+	serverCapabilities: string[] = [];
+	private settingTab: SurfSenseSettingTab | null = null;
+	private statusListeners = new Set<() => void>();
+	private reconcileTimerId: number | null = null;
+	private lastAuthToastAt = 0;
+
+	async onload() {
+		await this.loadSettings();
+		this.seedIdentity();
+		await this.saveSettings();
+
+		this.api = new SurfSenseApiClient({
+			getServerUrl: () => this.settings.serverUrl,
+			getToken: () => this.settings.apiToken,
+			onAuthError: () => this.notifyAuthError(),
+		});
+
+		this.queue = new PersistentQueue(this.settings.queue ?? [], {
+			persist: async (items) => {
+				this.settings.queue = items;
+				await this.saveData(this.settings);
+			},
+		});
+
+		this.engine = new SyncEngine({
+			app: this.app,
+			apiClient: this.api,
+			queue: this.queue,
+			getSettings: () => this.settings,
+			saveSettings: async (mut) => {
+				mut(this.settings);
+				await this.saveSettings();
+				this.notifyStatusChange();
+			},
+			setStatus: (s) => {
+				this.lastStatus = s;
+				this.statusBar?.update(s);
+				this.notifyStatusChange();
+			},
+			onCapabilities: (caps) => {
+				this.serverCapabilities = [...caps];
+				this.notifyStatusChange();
+			},
+			onReconcileBackoffChanged: () => {
+				this.restartReconcileTimer();
+			},
+		});
+
+		this.queue.setFlushHandler(() => {
+			if (!this.shouldAutoSync()) return;
+			void this.engine.flushQueue();
+		});
+
+		this.settingTab = new SurfSenseSettingTab(this.app, this);
+		this.addSettingTab(this.settingTab);
+
+		const statusHost = this.addStatusBarItem();
+		this.statusBar = new StatusBar(statusHost, () => this.openStatusModal());
+		this.statusBar.update(this.lastStatus);
+
+		this.registerEvent(
+			this.app.vault.on("create", (file) => this.engine.onCreate(file)),
+		);
+		this.registerEvent(
+			this.app.vault.on("modify", (file) => this.engine.onModify(file)),
+		);
+		this.registerEvent(
+			this.app.vault.on("delete", (file) => this.engine.onDelete(file)),
+		);
+		this.registerEvent(
+			this.app.vault.on("rename", (file, oldPath) =>
+				this.engine.onRename(file, oldPath),
+			),
+		);
+		this.registerEvent(
+			this.app.metadataCache.on("changed", (file, data, cache) =>
+				this.engine.onMetadataChanged(file, data, cache),
+			),
+		);
+
+		this.addCommand({
+			id: "resync-vault",
+			name: "Re-sync entire vault",
+			callback: async () => {
+				try {
+					await this.engine.maybeReconcile(true);
+					new Notice("Surfsense: re-sync started.");
+				} catch (err) {
+					new Notice(`Surfsense: re-sync failed — ${(err as Error).message}`);
+				}
+			},
+		});
+
+		this.addCommand({
+			id: "sync-current-note",
+			name: "Sync current note",
+			checkCallback: (checking) => {
+				const file = this.app.workspace.getActiveFile();
+				if (!file || file.extension.toLowerCase() !== "md") return false;
+				if (checking) return true;
+				this.queue.enqueueUpsert(file.path);
+				void this.engine.flushQueue();
+				return true;
+			},
+		});
+
+		this.addCommand({
+			id: "open-status",
+			name: "Open sync status",
+			callback: () => this.openStatusModal(),
+		});
+
+		this.addCommand({
+			id: "open-settings",
+			name: "Open settings",
+			callback: () => {
+				// `app.setting` isn't in the d.ts; fall back silently if it moves.
+				type SettingHost = {
+					open?: () => void;
+					openTabById?: (id: string) => void;
+				};
+				const setting = (this.app as unknown as { setting?: SettingHost }).setting;
+				if (setting?.open) setting.open();
+				if (setting?.openTabById) setting.openTabById(this.manifest.id);
+			},
+		});
+
+		const onNetChange = () => {
+			void this.engine.recoverConnectivityStatus();
+			if (this.shouldAutoSync()) void this.engine.flushQueue();
+		};
+		this.registerDomEvent(window, "online", onNetChange);
+		const conn = (navigator as unknown as { connection?: NetworkConnection }).connection;
+		if (conn && typeof conn.addEventListener === "function") {
+			conn.addEventListener("change", onNetChange);
+			this.register(() => conn.removeEventListener?.("change", onNetChange));
+		}
+
+		// Wait for layout so the metadataCache is warm before reconcile.
+		this.app.workspace.onLayoutReady(() => {
+			void this.engine.start();
+			this.restartReconcileTimer();
+		});
+	}
+
+	onunload() {
+		this.queue?.cancelFlush();
+		this.queue?.requestStop();
+	}
+
+	/**
+	 * Obsidian fires this when another device rewrites our data.json.
+	 * If the synced vault_id differs from ours, adopt it and
+	 * re-handshake so the server routes us to the right row.
+	 */
+	async onExternalSettingsChange(): Promise<void> {
+		const previousVaultId = this.settings.vaultId;
+		const previousConnectorId = this.settings.connectorId;
+		await this.loadSettings();
+		const changed =
+			this.settings.vaultId !== previousVaultId ||
+			this.settings.connectorId !== previousConnectorId;
+		if (!changed) return;
+		this.engine?.refreshStatus();
+		this.notifyStatusChange();
+		if (this.settings.searchSpaceId !== null) {
+			void this.engine.ensureConnected();
+		}
+	}
+
+	get queueDepth(): number {
+		return this.queue?.size ?? 0;
+	}
+
+	openStatusModal(): void {
+		new StatusModal(this.app, this).open();
+	}
+
+	restartReconcileTimer(): void {
+		if (this.reconcileTimerId !== null) {
+			window.clearInterval(this.reconcileTimerId);
+			this.reconcileTimerId = null;
+		}
+		const minutes = this.settings.syncIntervalMinutes ?? 10;
+		if (minutes <= 0) return;
+		const baseMs = minutes * 60 * 1000;
+		// Idle vaults back off (×2 → ×4 → ×8); resets on the first edit or non-empty reconcile.
+		const effectiveMs = this.engine?.getReconcileBackoffMs(baseMs) ?? baseMs;
+		const id = window.setInterval(
+			() => {
+				if (!this.shouldAutoSync()) return;
+				void this.engine.maybeReconcile();
+			},
+			effectiveMs,
+		);
+		this.reconcileTimerId = id;
+		this.registerInterval(id);
+	}
+
+	/** Gate for background network activity; per-edit flush + periodic reconcile both consult this. */
+	shouldAutoSync(): boolean {
+		if (!this.settings.wifiOnly) return true;
+		if (!Platform.isMobileApp) return true;
+		// navigator.connection is supported on Android Capacitor; undefined on iOS.
+		// When unavailable, behave permissively so iOS users aren't blocked outright.
+		const conn = (navigator as unknown as { connection?: NetworkConnection }).connection;
+		if (!conn || typeof conn.type !== "string") return true;
+		return conn.type === "wifi" || conn.type === "ethernet";
+	}
+
+	onStatusChange(listener: () => void): void {
+		this.statusListeners.add(listener);
+	}
+
+	offStatusChange(listener: () => void): void {
+		this.statusListeners.delete(listener);
+	}
+
+	private notifyStatusChange(): void {
+		for (const fn of this.statusListeners) fn();
+	}
+
+	private notifyAuthError(): void {
+		this.engine?.reportAuthError();
+		const now = Date.now();
+		if (now - this.lastAuthToastAt < 10_000) return;
+		this.lastAuthToastAt = now;
+		new Notice("Surfsense: API token expired or invalid. Paste a fresh token in settings.", 8000);
+	}
+
+	async loadSettings() {
+		const data = (await this.loadData()) as Partial<SurfsensePluginSettings> | null;
+		this.settings = {
+			...DEFAULT_SETTINGS,
+			...(data ?? {}),
+			queue: (data?.queue ?? []).map((i: QueueItem) => ({ ...i })),
+			tombstones: { ...(data?.tombstones ?? {}) },
+			includeFolders: [...(data?.includeFolders ?? [])],
+			excludeFolders: [...(data?.excludeFolders ?? [])],
+			excludePatterns: data?.excludePatterns?.length
+				? [...data.excludePatterns]
+				: [...DEFAULT_SETTINGS.excludePatterns],
+		};
+	}
+
+	async saveSettings() {
+		await this.saveData(this.settings);
+		this.engine?.refreshStatus();
+	}
+
+	/**
+	 * Mint a tentative vault_id locally on first run. The server's
+	 * fingerprint dedup (see /obsidian/connect) may overwrite it on the
+	 * first /connect when another device of the same vault has already
+	 * registered; we always trust the server's response.
+	 */
+	private seedIdentity(): void {
+		if (!this.settings.vaultId) {
+			this.settings.vaultId = generateVaultUuid();
+		}
+	}
+}
+
+/** Subset of the Network Information API used to detect WiFi vs cellular on Android. */
+interface NetworkConnection {
+	type?: string;
+	addEventListener?: (event: string, handler: () => void) => void;
+	removeEventListener?: (event: string, handler: () => void) => void;
+}
diff --git a/surfsense_obsidian/src/payload.ts b/surfsense_obsidian/src/payload.ts
new file mode 100644
index 0000000000..3294d62df8
--- /dev/null
+++ b/surfsense_obsidian/src/payload.ts
@@ -0,0 +1,163 @@
+import {
+	type App,
+	type CachedMetadata,
+	type FrontMatterCache,
+	type HeadingCache,
+	type ReferenceCache,
+	type TFile,
+} from "obsidian";
+import type { HeadingRef, NotePayload } from "./types";
+
+/**
+ * Build a NotePayload from an Obsidian TFile.
+ *
+ * Mobile-safety contract:
+ *   - No top-level `node:fs` / `node:path` / `node:crypto` imports.
+ *     File IO uses `vault.cachedRead` (works on the mobile WASM adapter).
+ *     Hashing uses Web Crypto `subtle.digest`.
+ *   - Caller MUST first wait for `metadataCache.changed` before calling
+ *     this for a `.md` file, otherwise `frontmatter`/`tags`/`headings`
+ *     can lag the actual file contents.
+ */
+export async function buildNotePayload(
+	app: App,
+	file: TFile,
+	vaultId: string,
+): Promise<NotePayload> {
+	const content = await app.vault.cachedRead(file);
+	const cache: CachedMetadata | null = app.metadataCache.getFileCache(file);
+
+	const frontmatter = normalizeFrontmatter(cache?.frontmatter);
+	const tags = collectTags(cache);
+	const headings = collectHeadings(cache?.headings ?? []);
+	const aliases = collectAliases(frontmatter);
+	const { embeds, internalLinks } = collectLinks(cache);
+	const { resolved, unresolved } = resolveLinkTargets(
+		app,
+		file.path,
+		internalLinks,
+	);
+	const contentHash = await computeContentHash(content);
+
+	return {
+		vault_id: vaultId,
+		path: file.path,
+		name: file.basename,
+		extension: file.extension,
+		content,
+		frontmatter,
+		tags,
+		headings,
+		resolved_links: resolved,
+		unresolved_links: unresolved,
+		embeds,
+		aliases,
+		content_hash: contentHash,
+		size: file.stat.size,
+		mtime: file.stat.mtime,
+		ctime: file.stat.ctime,
+	};
+}
+
+export async function computeContentHash(content: string): Promise<string> {
+	const bytes = new TextEncoder().encode(content);
+	const digest = await crypto.subtle.digest("SHA-256", bytes);
+	return bufferToHex(digest);
+}
+
+function bufferToHex(buf: ArrayBuffer): string {
+	const view = new Uint8Array(buf);
+	let hex = "";
+	for (let i = 0; i < view.length; i++) {
+		hex += (view[i] ?? 0).toString(16).padStart(2, "0");
+	}
+	return hex;
+}
+
+function normalizeFrontmatter(
+	fm: FrontMatterCache | undefined,
+): Record<string, unknown> {
+	if (!fm) return {};
+	// FrontMatterCache extends a plain object; strip the `position` key
+	// the cache adds so the wire payload stays clean.
+	const rest: Record<string, unknown> = { ...(fm as Record<string, unknown>) };
+	delete rest.position;
+	return rest;
+}
+
+function collectTags(cache: CachedMetadata | null): string[] {
+	const out = new Set<string>();
+	for (const t of cache?.tags ?? []) {
+		const tag = t.tag.startsWith("#") ? t.tag.slice(1) : t.tag;
+		if (tag) out.add(tag);
+	}
+	const fmTags: unknown =
+		cache?.frontmatter?.tags ?? cache?.frontmatter?.tag;
+	if (Array.isArray(fmTags)) {
+		for (const t of fmTags) {
+			if (typeof t === "string" && t) out.add(t.replace(/^#/, ""));
+		}
+	} else if (typeof fmTags === "string" && fmTags) {
+		for (const t of fmTags.split(/[\s,]+/)) {
+			if (t) out.add(t.replace(/^#/, ""));
+		}
+	}
+	return [...out];
+}
+
+function collectHeadings(items: HeadingCache[]): HeadingRef[] {
+	return items.map((h) => ({ heading: h.heading, level: h.level }));
+}
+
+function collectAliases(frontmatter: Record<string, unknown>): string[] {
+	const raw = frontmatter.aliases ?? frontmatter.alias;
+	if (Array.isArray(raw)) {
+		return raw.filter((x): x is string => typeof x === "string" && x.length > 0);
+	}
+	if (typeof raw === "string" && raw) return [raw];
+	return [];
+}
+
+function collectLinks(cache: CachedMetadata | null): {
+	embeds: string[];
+	internalLinks: ReferenceCache[];
+} {
+	const linkRefs: ReferenceCache[] = [
+		...((cache?.links) ?? []),
+		...((cache?.embeds as ReferenceCache[] | undefined) ?? []),
+	];
+	const embeds = ((cache?.embeds as ReferenceCache[] | undefined) ?? []).map(
+		(e) => e.link,
+	);
+	return { embeds, internalLinks: linkRefs };
+}
+
+function resolveLinkTargets(
+	app: App,
+	sourcePath: string,
+	links: ReferenceCache[],
+): { resolved: string[]; unresolved: string[] } {
+	const resolved = new Set<string>();
+	const unresolved = new Set<string>();
+	for (const link of links) {
+		const target = app.metadataCache.getFirstLinkpathDest(
+			stripSubpath(link.link),
+			sourcePath,
+		);
+		if (target) {
+			resolved.add(target.path);
+		} else {
+			unresolved.add(link.link);
+		}
+	}
+	return { resolved: [...resolved], unresolved: [...unresolved] };
+}
+
+function stripSubpath(link: string): string {
+	const hashIdx = link.indexOf("#");
+	const pipeIdx = link.indexOf("|");
+	let end = link.length;
+	if (hashIdx !== -1) end = Math.min(end, hashIdx);
+	if (pipeIdx !== -1) end = Math.min(end, pipeIdx);
+	return link.slice(0, end);
+}
diff --git a/surfsense_obsidian/src/queue.ts b/surfsense_obsidian/src/queue.ts
new file mode 100644
index 0000000000..0f7082456b
--- /dev/null
+++ b/surfsense_obsidian/src/queue.ts
@@ -0,0 +1,228 @@
+import { type Debouncer, debounce } from "obsidian";
+import type { QueueItem } from "./types";
+
+/**
+ * Persistent upload queue.
+ *
+ * Mobile-safety contract:
+ *   - Persistence is delegated to a save callback (which the plugin wires
+ *     to `plugin.saveData()`); never `node:fs`. Items also live in the
+ *     plugin's settings JSON so a crash mid-flight loses nothing.
+ *   - No top-level `node:*` imports.
+ *
+ * Behavioural contract:
+ *   - Per-file debounce: enqueueing the same path coalesces, the latest
+ *     `enqueuedAt` wins so we don't ship a stale snapshot.
+ *   - `delete` for a path drops any pending `upsert` for that path
+ *     (otherwise we'd resurrect a note the user just deleted).
+ *   - `rename` is a first-class op so the backend can update
+ *     `unique_identifier_hash` instead of "delete + create" (which would
+ *     blow away document versions, citations, and the document_id used
+ *     in chat history).
+ *   - Drain takes a worker, returns once the worker either succeeds for
+ *     every batch or hits a stop signal (transient error, mid-drain
+ *     stop request).
+ */
+
+export interface QueueWorker {
+	processBatch(batch: QueueItem[]): Promise<BatchResult>;
+}
+
+export interface BatchResult {
+	/** Items that succeeded; they will be ack'd off the queue. */
+	acked: QueueItem[];
+	/** Items that should be retried; their `attempt` is bumped. */
+	retry: QueueItem[];
+	/** Items that failed permanently (4xx). They get dropped. */
+	dropped: QueueItem[];
+	/** If true, the drain loop stops (e.g. transient/network error). */
+	stop: boolean;
+	/** Optional retry-after for transient errors (ms). */
+	backoffMs?: number;
+}
+
+export interface PersistentQueueOptions {
+	debounceMs?: number;
+	batchSize?: number;
+	maxAttempts?: number;
+	persist: (items: QueueItem[]) => Promise<void> | void;
+	now?: () => number;
+}
+
+const DEFAULTS = {
+	debounceMs: 2000,
+	batchSize: 15,
+	maxAttempts: 8,
+};
+
+export class PersistentQueue {
+	private items: QueueItem[];
+	private readonly opts: Required<
+		Omit<PersistentQueueOptions, "persist" | "now">
+	> & {
+		persist: PersistentQueueOptions["persist"];
+		now: () => number;
+	};
+	private draining = false;
+	private stopRequested = false;
+	private debouncedFlush: Debouncer<[], void> | null = null;
+
+	constructor(initial: QueueItem[], opts: PersistentQueueOptions) {
+		this.items = [...initial];
+		this.opts = {
+			debounceMs: opts.debounceMs ?? DEFAULTS.debounceMs,
+			batchSize: opts.batchSize ?? DEFAULTS.batchSize,
+			maxAttempts: opts.maxAttempts ?? DEFAULTS.maxAttempts,
+			persist: opts.persist,
+			now: opts.now ?? (() => Date.now()),
+		};
+	}
+
+	get size(): number {
+		return this.items.length;
+	}
+
+	snapshot(): QueueItem[] {
+		return this.items.map((i) => ({ ...i }));
+	}
+
+	setFlushHandler(handler: () => void): void {
+		// resetTimer: true → each enqueue postpones the flush.
+		this.debouncedFlush = debounce(handler, this.opts.debounceMs, true);
+	}
+
+	enqueueUpsert(path: string): void {
+		const now = this.opts.now();
+		this.items = this.items.filter(
+			(i) => !(i.op === "upsert" && i.path === path),
+		);
+		this.items.push({ op: "upsert", path, enqueuedAt: now, attempt: 0 });
+		void this.persist();
+		this.scheduleFlush();
+	}
+
+	enqueueDelete(path: string): void {
+		const now = this.opts.now();
+		// A delete supersedes any pending upsert for the same path.
+		this.items = this.items.filter(
+			(i) =>
+				!(
+					(i.op === "upsert" && i.path === path) ||
+					(i.op === "delete" && i.path === path)
+				),
+		);
+		this.items.push({ op: "delete", path, enqueuedAt: now, attempt: 0 });
+		void this.persist();
+		this.scheduleFlush();
+	}
+
+	enqueueRename(oldPath: string, newPath: string): void {
+		const now = this.opts.now();
+		this.items = this.items.filter(
+			(i) =>
+				!(
+					(i.op === "upsert" && (i.path === oldPath || i.path === newPath)) ||
+					(i.op === "rename" && i.oldPath === oldPath && i.newPath === newPath)
+				),
+		);
+		this.items.push({
+			op: "rename",
+			oldPath,
+			newPath,
+			enqueuedAt: now,
+			attempt: 0,
+		});
+		// Pair with an upsert — content may have changed alongside the rename.
+		this.items.push({ op: "upsert", path: newPath, enqueuedAt: now, attempt: 0 });
+		void this.persist();
+		this.scheduleFlush();
+	}
+
+	requestStop(): void {
+		this.stopRequested = true;
+	}
+
+	cancelFlush(): void {
+		this.debouncedFlush?.cancel();
+	}
+
+	private scheduleFlush(): void {
+		this.debouncedFlush?.();
+	}
+
+	async drain(worker: QueueWorker): Promise<DrainSummary> {
+		if (this.draining) return { batches: 0, acked: 0, dropped: 0, stopped: false };
+		this.draining = true;
+		this.stopRequested = false;
+		const summary: DrainSummary = {
+			batches: 0,
+			acked: 0,
+			dropped: 0,
+			stopped: false,
+		};
+		try {
+			while (this.items.length > 0 && !this.stopRequested) {
+				const batch = this.takeBatch();
+				summary.batches += 1;
+
+				const result = await worker.processBatch(batch);
+				summary.acked += result.acked.length;
+				summary.dropped += result.dropped.length;
+
+				const ackKeys = new Set(result.acked.map(itemKey));
+				const dropKeys = new Set(result.dropped.map(itemKey));
+				const retryKeys = new Set(result.retry.map(itemKey));
+
+				// Items the worker didn't classify get retried — never silently dropped.
+				const unhandled = batch.filter(
+					(b) =>
+						!ackKeys.has(itemKey(b)) &&
+						!dropKeys.has(itemKey(b)) &&
+						!retryKeys.has(itemKey(b)),
+				);
+				const retry = [...result.retry, ...unhandled].map((i) => ({
+					...i,
+					attempt: i.attempt + 1,
+				}));
+				const survivors = retry.filter((i) => i.attempt <= this.opts.maxAttempts);
+				summary.dropped += retry.length - survivors.length;
+
+				this.items = [...survivors, ...this.items];
+				await this.persist();
+
+				if (result.stop) {
+					summary.stopped = true;
+					if (result.backoffMs) summary.backoffMs = result.backoffMs;
+					break;
+				}
+			}
+			if (this.stopRequested) summary.stopped = true;
+			return summary;
+		} finally {
+			this.draining = false;
+		}
+	}
+
+	private takeBatch(): QueueItem[] {
+		const head = this.items.slice(0, this.opts.batchSize);
+		this.items = this.items.slice(this.opts.batchSize);
+		return head;
+	}
+
+	private async persist(): Promise<void> {
+		await this.opts.persist(this.snapshot());
+	}
+}
+
+export interface DrainSummary {
+	batches: number;
+	acked: number;
+	dropped: number;
+	stopped: boolean;
+	backoffMs?: number;
+}
+
+export function itemKey(i: QueueItem): string {
+	if (i.op === "rename") return `rename:${i.oldPath}=>${i.newPath}`;
+	return `${i.op}:${i.path}`;
+}
diff --git a/surfsense_obsidian/src/settings.ts b/surfsense_obsidian/src/settings.ts
new file mode 100644
index 0000000000..6a01f2fd1f
--- /dev/null
+++ b/surfsense_obsidian/src/settings.ts
@@ -0,0 +1,389 @@
+import {
+	type App,
+	type ButtonComponent,
+	Notice,
+	Platform,
+	PluginSettingTab,
+	Setting,
+	setIcon,
+} from "obsidian";
+import { AuthError } from "./api-client";
+import { AttachmentsConfirmModal } from "./attachments-confirm-modal";
+import { normalizeFolder, parseExcludePatterns } from "./excludes";
+import { FolderSuggestModal } from "./folder-suggest-modal";
+import type SurfSensePlugin from "./main";
+import { STATUS_VISUALS } from "./status-visuals";
+import type { SearchSpace } from "./types";
+
+/** Plugin settings tab. */
+
+export class SurfSenseSettingTab extends PluginSettingTab {
+	private readonly plugin: SurfSensePlugin;
+	private searchSpaces: SearchSpace[] = [];
+	private loadingSpaces = false;
+	private connectionIndicator: HTMLElement | null = null;
+	private readonly onStatusChange = (): void => this.updateConnectionIndicator();
+
+	constructor(app: App, plugin: SurfSensePlugin) {
+		super(app, plugin);
+		this.plugin = plugin;
+	}
+
+	display(): void {
+		const { containerEl } = this;
+		containerEl.empty();
+		this.plugin.onStatusChange(this.onStatusChange);
+
+		const settings = this.plugin.settings;
+
+		this.renderConnectionHeading(containerEl);
+
+		new Setting(containerEl)
+			.setName("Server URL")
+			.setDesc(
+				"https://surfsense.com for SurfSense Cloud, or your self-hosted URL.",
+			)
+			.addText((text) =>
+				text
+					.setPlaceholder("https://surfsense.com")
+					.setValue(settings.serverUrl)
+					.onChange(async (value) => {
+						const next = value.trim();
+						const previous = this.plugin.settings.serverUrl;
+						if (previous !== "" && next !== previous) {
+							this.plugin.settings.searchSpaceId = null;
+							this.plugin.settings.connectorId = null;
+						}
+						this.plugin.settings.serverUrl = next;
+						await this.plugin.saveSettings();
+					}),
+			);
+
+		let verifyButton: ButtonComponent | null = null;
+		const updateVerifyDisabled = (): void => {
+			verifyButton?.setDisabled(this.plugin.settings.apiToken.trim().length === 0);
+		};
+
+		new Setting(containerEl)
+			.setName("API token")
+			.setDesc(
+				"Paste your Surfsense API token (expires after 24 hours; re-paste when you see an auth error).",
+			)
+			.addText((text) => {
+				text.inputEl.type = "password";
+				text.inputEl.autocomplete = "off";
+				text.inputEl.spellcheck = false;
+				text
+					.setPlaceholder("Paste token")
+					.setValue(settings.apiToken)
+					.onChange(async (value) => {
+						const next = value.trim();
+						const previous = this.plugin.settings.apiToken;
+						if (previous !== "" && next !== previous) {
+							this.plugin.settings.searchSpaceId = null;
+							this.plugin.settings.connectorId = null;
+						}
+						this.plugin.settings.apiToken = next;
+						updateVerifyDisabled();
+						await this.plugin.saveSettings();
+						this.plugin.api.resetAuthBlock();
+					});
+			})
+			.addButton((btn) => {
+				verifyButton = btn;
+				updateVerifyDisabled();
+				btn.setButtonText("Verify").setCta().onClick(async () => {
+					if (this.plugin.settings.apiToken.trim().length === 0) {
+						new Notice("Surfsense: paste an API token before verifying.");
+						return;
+					}
+					btn.setDisabled(true);
+					try {
+						await this.plugin.api.verifyToken();
+						new Notice("Surfsense: token verified.");
+						this.plugin.engine.refreshStatus({ force: true });
+						await this.refreshSearchSpaces();
+						this.display();
+					} catch (err) {
+						this.handleApiError(err);
+					} finally {
+						updateVerifyDisabled();
+					}
+				});
+			});
+
+		new Setting(containerEl)
+			.setName("Search space")
+			.setDesc(
+				"Which Surfsense search space this vault syncs into. Reload after changing your token.",
+			)
+			.addDropdown((drop) => {
+				drop.addOption("", this.loadingSpaces ? "Loading…" : "Select a search space");
+				for (const space of this.searchSpaces) {
+					drop.addOption(String(space.id), space.name);
+				}
+				if (settings.searchSpaceId !== null) {
+					drop.setValue(String(settings.searchSpaceId));
+				}
+				drop.onChange(async (value) => {
+					this.plugin.settings.searchSpaceId = value ? Number(value) : null;
+					this.plugin.settings.connectorId = null;
+					await this.plugin.saveSettings();
+					if (this.plugin.settings.searchSpaceId !== null) {
+						try {
+							await this.plugin.engine.ensureConnected();
+							await this.plugin.engine.maybeReconcile(true);
+							new Notice("Surfsense: vault connected.");
+							this.display();
+						} catch (err) {
+							this.handleApiError(err);
+						}
+					}
+				});
+			})
+			.addExtraButton((btn) =>
+				btn
+					.setIcon("refresh-ccw")
+					.setTooltip("Reload search spaces")
+					.onClick(async () => {
+						await this.refreshSearchSpaces();
+						this.display();
+					}),
+			);
+
+		new Setting(containerEl).setName("Vault").setHeading();
+
+		new Setting(containerEl)
+			.setName("Sync interval")
+			.setDesc(
+				"How often to check for changes made outside Obsidian.",
+			)
+			.addDropdown((drop) => {
+				const options: Array<[number, string]> = [
+					[0, "Off"],
+					[5, "5 minutes"],
+					[10, "10 minutes"],
+					[15, "15 minutes"],
+					[30, "30 minutes"],
+					[60, "60 minutes"],
+					[120, "2 hours"],
+					[360, "6 hours"],
+					[720, "12 hours"],
+					[1440, "24 hours"],
+				];
+				for (const [value, label] of options) {
+					drop.addOption(String(value), label);
+				}
+				drop.setValue(String(settings.syncIntervalMinutes));
+				drop.onChange(async (value) => {
+					this.plugin.settings.syncIntervalMinutes = Number(value);
+					await this.plugin.saveSettings();
+					this.plugin.restartReconcileTimer();
+				});
+			});
+
+		this.renderFolderList(
+			containerEl,
+			"Include folders",
+			"Folders to sync (leave empty to sync entire vault).",
+			settings.includeFolders,
+			(next) => {
+				this.plugin.settings.includeFolders = next;
+			},
+		);
+
+		this.renderFolderList(
+			containerEl,
+			"Exclude folders",
+			"Folders to exclude from sync (takes precedence over includes).",
+			settings.excludeFolders,
+			(next) => {
+				this.plugin.settings.excludeFolders = next;
+			},
+		);
+
+		new Setting(containerEl)
+			.setName("Advanced exclude patterns")
+			.setDesc(
+				"Glob fallback for power users. One pattern per line, supports * and **. Lines starting with # are comments. Applied on top of the folder lists above.",
+			)
+			.addTextArea((area) => {
+				area.inputEl.rows = 4;
+				area
+					.setPlaceholder(".trash\n_attachments\ntemplates/**")
+					.setValue(settings.excludePatterns.join("\n"))
+					.onChange(async (value) => {
+						this.plugin.settings.excludePatterns = parseExcludePatterns(value);
+						await this.plugin.saveSettings();
+					});
+			});
+
+		new Setting(containerEl)
+			.setName("Include attachments")
+			.setDesc(
+				"Also sync non-Markdown files such as images and PDFs. Other file types are skipped.",
+			)
+			.addToggle((toggle) =>
+				toggle
+					.setValue(settings.includeAttachments)
+					.onChange(async (value) => {
+						const isEnabling =
+							value && !this.plugin.settings.includeAttachments;
+						if (isEnabling) {
+							const confirmed = await new AttachmentsConfirmModal(
+								this.app,
+							).waitForConfirmation();
+							if (!confirmed) {
+								this.display();
+								return;
+							}
+						}
+						this.plugin.settings.includeAttachments = value;
+						await this.plugin.saveSettings();
+					}),
+			);
+
+		if (Platform.isAndroidApp) {
+			new Setting(containerEl)
+				.setName("Sync only on WiFi")
+				.setDesc("Pause automatic syncing on cellular.")
+				.addToggle((toggle) =>
+					toggle
+						.setValue(settings.wifiOnly)
+						.onChange(async (value) => {
+							this.plugin.settings.wifiOnly = value;
+							await this.plugin.saveSettings();
+						}),
+				);
+		}
+
+		new Setting(containerEl)
+			.setName("Force sync")
+			.setDesc("Manually re-index the entire vault now.")
+			.addButton((btn) =>
+				btn.setButtonText("Update").onClick(async () => {
+					btn.setDisabled(true);
+					try {
+						await this.plugin.engine.maybeReconcile(true);
+						new Notice("Surfsense: re-sync requested.");
+					} catch (err) {
+						this.handleApiError(err);
+					} finally {
+						btn.setDisabled(false);
+					}
+				}),
+			);
+
+		new Setting(containerEl)
+			.addButton((btn) =>
+				btn
+					.setButtonText("View sync status")
+					.setCta()
+					.onClick(() => this.plugin.openStatusModal()),
+			)
+			.addButton((btn) =>
+				btn.setButtonText("Open releases").onClick(() => {
+					window.open(
+						"https://github.com/MODSetter/SurfSense/releases?q=obsidian",
+						"_blank",
+					);
+				}),
+			);
+	}
+
+	hide(): void {
+		this.plugin.offStatusChange(this.onStatusChange);
+		this.connectionIndicator = null;
+	}
+
+	private renderConnectionHeading(containerEl: HTMLElement): void {
+		const heading = new Setting(containerEl).setName("Connection").setHeading();
+		heading.nameEl.addClass("surfsense-connection-heading");
+		this.connectionIndicator = heading.nameEl.createSpan({
+			cls: "surfsense-connection-indicator",
+		});
+		this.updateConnectionIndicator();
+	}
+
+	private updateConnectionIndicator(): void {
+		const indicator = this.connectionIndicator;
+		if (!indicator) return;
+		const visual = STATUS_VISUALS[this.plugin.lastStatus.kind];
+		indicator.empty();
+		indicator.removeClass("surfsense-connection-indicator--err");
+		if (visual.isError) {
+			indicator.addClass("surfsense-connection-indicator--err");
+		}
+		setIcon(indicator, visual.icon);
+		indicator.setAttr("aria-label", visual.label);
+		indicator.setAttr("title", visual.label);
+	}
+
+	private async refreshSearchSpaces(): Promise<void> {
+		this.loadingSpaces = true;
+		try {
+			this.searchSpaces = await this.plugin.api.listSearchSpaces();
+		} catch (err) {
+			this.handleApiError(err);
+			this.searchSpaces = [];
+		} finally {
+			this.loadingSpaces = false;
+		}
+	}
+
+	private renderFolderList(
+		containerEl: HTMLElement,
+		title: string,
+		desc: string,
+		current: string[],
+		write: (next: string[]) => void,
+	): void {
+		const setting = new Setting(containerEl).setName(title).setDesc(desc);
+
+		const persist = async (next: string[]): Promise<void> => {
+			const dedup = Array.from(new Set(next.map(normalizeFolder)));
+			write(dedup);
+			await this.plugin.saveSettings();
+			this.display();
+		};
+
+		setting.addButton((btn) =>
+			btn
+				.setButtonText("Add folder")
+				.setCta()
+				.onClick(() => {
+					new FolderSuggestModal(
+						this.app,
+						(picked) => {
+							void persist([...current, picked]);
+						},
+						current,
+					).open();
+				}),
+		);
+
+		for (const folder of current) {
+			new Setting(containerEl).setName(folder || "/").addExtraButton((btn) =>
+				btn
+					.setIcon("cross")
+					.setTooltip("Remove")
+					.onClick(() => {
+						void persist(current.filter((f) => f !== folder));
+					}),
+			);
+		}
+	}
+
+	private handleApiError(err: unknown): void {
+		if (err instanceof AuthError) {
+			if (err.message.startsWith("Missing API token")) {
+				new Notice("Surfsense: paste an API token before verifying.");
+			}
+			return;
+		}
+		this.plugin.engine.reportError(err);
+		new Notice(
+			`SurfSense: request failed — ${(err as Error).message ?? "unknown error"}`,
+		);
+	}
+}
diff --git a/surfsense_obsidian/src/status-bar.ts b/surfsense_obsidian/src/status-bar.ts
new file mode 100644
index 0000000000..30abea50ce
--- /dev/null
+++ b/surfsense_obsidian/src/status-bar.ts
@@ -0,0 +1,46 @@
+import { setIcon } from "obsidian";
+import { STATUS_VISUALS } from "./status-visuals";
+import type { StatusState } from "./types";
+
+/**
+ * Tiny status-bar adornment.
+ *
+ * Plain DOM (no HTML strings, no CSS-in-JS) so it stays cheap on mobile
+ * and Obsidian's lint doesn't complain about innerHTML.
+ */
+
+export class StatusBar {
+	private readonly el: HTMLElement;
+	private readonly icon: HTMLElement;
+	private readonly text: HTMLElement;
+
+	constructor(host: HTMLElement, onClick?: () => void) {
+		this.el = host;
+		this.el.addClass("surfsense-status");
+		this.icon = this.el.createSpan({ cls: "surfsense-status__icon" });
+		this.text = this.el.createSpan({ cls: "surfsense-status__text" });
+		if (onClick) {
+			this.el.addClass("surfsense-status--clickable");
+			this.el.addEventListener("click", onClick);
+		}
+		this.update({ kind: "idle", queueDepth: 0 });
+	}
+
+	update(state: StatusState): void {
+		const visual = STATUS_VISUALS[state.kind];
+		this.el.removeClass("surfsense-status--err");
+		if (visual.isError) this.el.addClass("surfsense-status--err");
+		setIcon(this.icon, visual.icon);
+
+		let label = `SurfSense: ${visual.label}`;
+		if (state.queueDepth > 0 && state.kind !== "idle") {
+			label += ` (${state.queueDepth})`;
+		}
+		this.text.setText(label);
+		this.el.setAttr(
+			"aria-label",
+			state.detail ? `${label} — ${state.detail}` : label,
+		);
+		this.el.setAttr("title", state.detail ?? label);
+	}
+}
diff --git a/surfsense_obsidian/src/status-modal.ts b/surfsense_obsidian/src/status-modal.ts
new file mode 100644
index 0000000000..e05b3a5bc8
--- /dev/null
+++ b/surfsense_obsidian/src/status-modal.ts
@@ -0,0 +1,77 @@
+import { type App, Modal, Notice, Setting } from "obsidian";
+import type SurfSensePlugin from "./main";
+import { STATUS_VISUALS } from "./status-visuals";
+
+/** Live status panel reachable from the status bar / command palette. */
+export class StatusModal extends Modal {
+	private readonly plugin: SurfSensePlugin;
+	private readonly onChange = (): void => this.render();
+
+	constructor(app: App, plugin: SurfSensePlugin) {
+		super(app);
+		this.plugin = plugin;
+	}
+
+	onOpen(): void {
+		this.setTitle("Surfsense status");
+		this.plugin.onStatusChange(this.onChange);
+		this.render();
+	}
+
+	onClose(): void {
+		this.plugin.offStatusChange(this.onChange);
+		this.contentEl.empty();
+	}
+
+	private render(): void {
+		const { contentEl, plugin } = this;
+		contentEl.empty();
+		const s = plugin.settings;
+
+		const rows: Array<[string, string]> = [
+			["Status", STATUS_VISUALS[plugin.lastStatus.kind].label],
+			[
+				"Last sync",
+				s.lastSyncAt ? new Date(s.lastSyncAt).toLocaleString() : "—",
+			],
+			[
+				"Last reconcile",
+				s.lastReconcileAt
+					? new Date(s.lastReconcileAt).toLocaleString()
+					: "—",
+			],
+			["Files synced", String(s.filesSynced ?? 0)],
+			["Queue depth", String(plugin.queueDepth)],
+			[
+				"Capabilities",
+				plugin.serverCapabilities.length
+					? plugin.serverCapabilities.join(", ")
+					: "(not yet handshaken)",
+			],
+		];
+		for (const [label, value] of rows) {
+			new Setting(contentEl).setName(label).setDesc(value);
+		}
+
+		new Setting(contentEl)
+			.addButton((btn) =>
+				btn
+					.setButtonText("Re-sync entire vault")
+					.setCta()
+					.onClick(async () => {
+						btn.setDisabled(true);
+						try {
+							await plugin.engine.maybeReconcile(true);
+							new Notice("Surfsense: re-sync requested.");
+						} catch (err) {
+							new Notice(
+								`Surfsense: re-sync failed — ${(err as Error).message}`,
+							);
+						} finally {
+							btn.setDisabled(false);
+						}
+					}),
+			)
+			.addButton((btn) => btn.setButtonText("Close").onClick(() => this.close()));
+	}
+}
diff --git a/surfsense_obsidian/src/status-visuals.ts b/surfsense_obsidian/src/status-visuals.ts
new file mode 100644
index 0000000000..96a3c8f341
--- /dev/null
+++ b/surfsense_obsidian/src/status-visuals.ts
@@ -0,0 +1,18 @@
+import type { StatusKind } from "./types";
+
+/** Shared by the status bar and the settings "Connection" heading. */
+export interface StatusVisual {
+	icon: string;
+	label: string;
+	isError: boolean;
+}
+
+export const STATUS_VISUALS: Record<StatusKind, StatusVisual> = {
+	idle: { icon: "check-circle", label: "Synced", isError: false },
+	syncing: { icon: "refresh-ccw", label: "Syncing", isError: false },
+	queued: { icon: "clock", label: "Queued", isError: false },
+	"needs-setup": { icon: "cloud-off", label: "Setup required", isError: false },
+	offline: { icon: "wifi-off", label: "Offline", isError: false },
+	"auth-error": { icon: "alert-circle", label: "Reauthenticate", isError: true },
+	error: { icon: "alert-circle", label: "Error", isError: true },
+};
diff --git a/surfsense_obsidian/src/sync-engine.ts b/surfsense_obsidian/src/sync-engine.ts
new file mode 100644
index 0000000000..80594dd9eb
--- /dev/null
+++ b/surfsense_obsidian/src/sync-engine.ts
@@ -0,0 +1,751 @@
+import {
+	type App,
+	type CachedMetadata,
+	type Debouncer,
+	Notice,
+	type TAbstractFile,
+	TFile,
+	debounce,
+} from "obsidian";
+import {
+	AuthError,
+	PermanentError,
+	type SurfSenseApiClient,
+	TransientError,
+	VaultNotRegisteredError,
+} from "./api-client";
+import { isExcluded, isFolderFiltered } from "./excludes";
+import { buildNotePayload } from "./payload";
+import { type BatchResult, PersistentQueue } from "./queue";
+import type {
+	HealthResponse,
+	ManifestEntry,
+	NotePayload,
+	QueueItem,
+	StatusKind,
+	StatusState,
+} from "./types";
+import { computeVaultFingerprint } from "./vault-identity";
+
+/**
+ * Reconciles vault state with the server.
+ * Start order: connect (or /health) → drain queue → reconcile → subscribe events.
+ */
+
+export interface SyncEngineDeps {
+	app: App;
+	apiClient: SurfSenseApiClient;
+	queue: PersistentQueue;
+	getSettings: () => SyncEngineSettings;
+	saveSettings: (mut: (s: SyncEngineSettings) => void) => Promise<void>;
+	setStatus: (s: StatusState) => void;
+	onCapabilities: (caps: string[]) => void;
+	/** Fired when the adaptive backoff multiplier may have changed; main.ts uses it to reschedule. */
+	onReconcileBackoffChanged?: () => void;
+}
+
+export interface SyncEngineSettings {
+	vaultId: string;
+	apiToken: string;
+	connectorId: number | null;
+	searchSpaceId: number | null;
+	includeFolders: string[];
+	excludeFolders: string[];
+	excludePatterns: string[];
+	includeAttachments: boolean;
+	lastReconcileAt: number | null;
+	lastSyncAt: number | null;
+	filesSynced: number;
+	tombstones: Record<string, number>;
+}
+
+export const RECONCILE_MIN_INTERVAL_MS = 5 * 60 * 1000;
+const TOMBSTONE_TTL_MS = 24 * 60 * 60 * 1000; // 1 day
+const PENDING_DEBOUNCE_MS = 1500;
+
+export class SyncEngine {
+	private readonly deps: SyncEngineDeps;
+	private capabilities: string[] = [];
+	private pendingMdEdits = new Map<string, Debouncer<[], void>>();
+	/** Consecutive reconciles that found no work; powers the adaptive interval. */
+	private idleReconcileStreak = 0;
+	/** 2^streak is capped at this value (e.g. 8 → max ×8 backoff). */
+	private readonly maxBackoffMultiplier = 8;
+	private lastAppliedKind: StatusKind = "needs-setup";
+
+	constructor(deps: SyncEngineDeps) {
+		this.deps = deps;
+	}
+
+	/** Returns the next-tick interval given the user's base, scaled by the idle streak. */
+	getReconcileBackoffMs(baseMs: number): number {
+		const multiplier = Math.min(2 ** this.idleReconcileStreak, this.maxBackoffMultiplier);
+		return baseMs * multiplier;
+	}
+
+	getCapabilities(): readonly string[] {
+		return this.capabilities;
+	}
+
+	supports(capability: string): boolean {
+		return this.capabilities.includes(capability);
+	}
+
+	/** Run the onload sequence described in this file's docstring. */
+	async start(): Promise<void> {
+		this.setStatus("syncing", "Connecting to SurfSense…");
+
+		const settings = this.deps.getSettings();
+		if (!settings.searchSpaceId) {
+			// No target yet — /health still surfaces auth/network errors.
+			try {
+				const health = await this.deps.apiClient.health();
+				this.applyHealth(health);
+			} catch (err) {
+				this.handleStartupError(err);
+				return;
+			}
+			this.setStatus("idle");
+			return;
+		}
+
+		// Re-announce so the backend sees the latest vault_name + last_connect_at.
+		// flushQueue gates on connectorId, so a failed connect leaves the queue intact.
+		await this.ensureConnected();
+
+		await this.flushQueue();
+		await this.maybeReconcile();
+		this.setStatus(this.queueStatusKind(), undefined);
+	}
+
+	/**
+	 * (Re)register the vault. Adopts server's `vault_id` in case fingerprint
+	 * dedup routed us to an existing row from another device.
+	 */
+	async ensureConnected(): Promise<boolean> {
+		const settings = this.deps.getSettings();
+		if (!settings.searchSpaceId) {
+			this.setStatus("idle");
+			return false;
+		}
+		this.setStatus("syncing", "Connecting to SurfSense");
+		try {
+			const fingerprint = await computeVaultFingerprint(this.deps.app);
+			const resp = await this.deps.apiClient.connect({
+				searchSpaceId: settings.searchSpaceId,
+				vaultId: settings.vaultId,
+				vaultName: this.deps.app.vault.getName(),
+				vaultFingerprint: fingerprint,
+			});
+			this.applyHealth(resp);
+			await this.deps.saveSettings((s) => {
+				s.vaultId = resp.vault_id;
+				s.connectorId = resp.connector_id;
+			});
+			this.setStatus(this.queueStatusKind(), this.statusDetail());
+			return true;
+		} catch (err) {
+			this.handleStartupError(err);
+			return false;
+		}
+	}
+
+	applyHealth(h: HealthResponse): void {
+		this.capabilities = Array.isArray(h.capabilities) ? [...h.capabilities] : [];
+		this.deps.onCapabilities(this.capabilities);
+	}
+
+	// ---- vault event handlers --------------------------------------------
+
+	onCreate(file: TAbstractFile): void {
+		if (!this.shouldTrack(file)) return;
+		const settings = this.deps.getSettings();
+		if (this.isExcluded(file.path, settings)) return;
+		this.resetIdleStreak();
+		if (this.isMarkdown(file)) {
+			this.scheduleMdUpsert(file.path);
+			return;
+		}
+		this.deps.queue.enqueueUpsert(file.path);
+	}
+
+	onModify(file: TAbstractFile): void {
+		if (!this.shouldTrack(file)) return;
+		const settings = this.deps.getSettings();
+		if (this.isExcluded(file.path, settings)) return;
+		this.resetIdleStreak();
+		if (this.isMarkdown(file)) {
+			// Wait for metadataCache.changed so the payload sees fresh metadata.
+			this.scheduleMdUpsert(file.path);
+			return;
+		}
+		this.deps.queue.enqueueUpsert(file.path);
+	}
+
+	onDelete(file: TAbstractFile): void {
+		if (!this.shouldTrack(file)) return;
+		this.resetIdleStreak();
+		this.deps.queue.enqueueDelete(file.path);
+		void this.deps.saveSettings((s) => {
+			s.tombstones[file.path] = Date.now();
+		});
+	}
+
+	onRename(file: TAbstractFile, oldPath: string): void {
+		if (!this.shouldTrack(file)) return;
+		this.resetIdleStreak();
+		const settings = this.deps.getSettings();
+		if (this.isExcluded(file.path, settings)) {
+			this.deps.queue.enqueueDelete(oldPath);
+			void this.deps.saveSettings((s) => {
+				s.tombstones[oldPath] = Date.now();
+			});
+			return;
+		}
+		this.deps.queue.enqueueRename(oldPath, file.path);
+	}
+
+	onMetadataChanged(file: TFile, _data: string, _cache: CachedMetadata): void {
+		if (!this.shouldTrack(file)) return;
+		const settings = this.deps.getSettings();
+		if (this.isExcluded(file.path, settings)) return;
+		if (!this.isMarkdown(file)) return;
+		// Metadata is fresh now — cancel the deferred upsert and enqueue immediately.
+		const pending = this.pendingMdEdits.get(file.path);
+		if (pending) {
+			pending.cancel();
+			this.pendingMdEdits.delete(file.path);
+		}
+		this.deps.queue.enqueueUpsert(file.path);
+	}
+
+	private scheduleMdUpsert(path: string): void {
+		let pending = this.pendingMdEdits.get(path);
+		if (!pending) {
+			// resetTimer: true → each edit pushes the upsert out by another PENDING_DEBOUNCE_MS.
+			pending = debounce(
+				() => {
+					this.pendingMdEdits.delete(path);
+					this.deps.queue.enqueueUpsert(path);
+				},
+				PENDING_DEBOUNCE_MS,
+				true,
+			);
+			this.pendingMdEdits.set(path, pending);
+		}
+		pending();
+	}
+
+	// ---- queue draining ---------------------------------------------------
+
+	async flushQueue(): Promise<void> {
+		if (this.deps.queue.size === 0) {
+			await this.recoverStatusIfNeeded();
+			return;
+		}
+		// Shared gate for every flush trigger so the first /sync can't race /connect.
+		if (!this.deps.getSettings().connectorId) {
+			const connected = await this.ensureConnected();
+			if (!connected) return;
+			if (!this.deps.getSettings().connectorId) return;
+		}
+		this.setStatus("syncing", `Syncing ${this.deps.queue.size} item(s)…`);
+		const summary = await this.deps.queue.drain({
+			processBatch: (batch) => this.processBatch(batch),
+		});
+		if (summary.acked > 0) {
+			await this.deps.saveSettings((s) => {
+				s.lastSyncAt = Date.now();
+				s.filesSynced = (s.filesSynced ?? 0) + summary.acked;
+			});
+		}
+		this.setStatus(this.queueStatusKind(), this.statusDetail());
+	}
+
+	/**
+	 * Lightweight status recovery path used after network-change signals.
+	 * Clears stale offline/auth/error only when connectivity/auth is explicitly re-validated.
+	 */
+	async recoverConnectivityStatus(): Promise<void> {
+		const settings = this.deps.getSettings();
+		if (!settings.apiToken) {
+			this.refreshStatus({ force: true });
+			return;
+		}
+		if (!settings.searchSpaceId) {
+			try {
+				const health = await this.deps.apiClient.health();
+				this.applyHealth(health);
+				this.refreshStatus({ force: true });
+			} catch (err) {
+				this.handleStartupError(err);
+			}
+			return;
+		}
+		const connected = await this.ensureConnected();
+		if (!connected) return;
+		this.refreshStatus({ force: true });
+	}
+
+	private async processBatch(batch: QueueItem[]): Promise<BatchResult> {
+		const settings = this.deps.getSettings();
+		const upserts = batch.filter((b): b is QueueItem & { op: "upsert" } => b.op === "upsert");
+		const renames = batch.filter((b): b is QueueItem & { op: "rename" } => b.op === "rename");
+		const deletes = batch.filter((b): b is QueueItem & { op: "delete" } => b.op === "delete");
+
+		const acked: QueueItem[] = [];
+		const retry: QueueItem[] = [];
+		const dropped: QueueItem[] = [];
+
+		// Renames first so paths line up before content upserts.
+		if (renames.length > 0) {
+			try {
+				const resp = await this.deps.apiClient.renameBatch({
+					vaultId: settings.vaultId,
+					renames: renames.map((r) => ({ oldPath: r.oldPath, newPath: r.newPath })),
+				});
+				const failed = new Set(
+					resp.failed.map((f) => `${f.oldPath}\u0000${f.newPath}`),
+				);
+				for (const r of renames) {
+					if (failed.has(`${r.oldPath}\u0000${r.newPath}`)) retry.push(r);
+					else acked.push(r);
+				}
+			} catch (err) {
+				if (await this.handleVaultNotRegistered(err)) {
+					retry.push(...renames);
+				} else {
+					const verdict = this.classify(err);
+					if (verdict === "stop") return { acked, retry: [...retry, ...renames], dropped, stop: true };
+					if (verdict === "retry") retry.push(...renames);
+					else dropped.push(...renames);
+				}
+			}
+		}
+
+		if (deletes.length > 0) {
+			try {
+				const resp = await this.deps.apiClient.deleteBatch({
+					vaultId: settings.vaultId,
+					paths: deletes.map((d) => d.path),
+				});
+				const failed = new Set(resp.failed);
+				for (const d of deletes) {
+					if (failed.has(d.path)) retry.push(d);
+					else acked.push(d);
+				}
+			} catch (err) {
+				if (await this.handleVaultNotRegistered(err)) {
+					retry.push(...deletes);
+				} else {
+					const verdict = this.classify(err);
+					if (verdict === "stop") return { acked, retry: [...retry, ...deletes], dropped, stop: true };
+					if (verdict === "retry") retry.push(...deletes);
+					else dropped.push(...deletes);
+				}
+			}
+		}
+
+		if (upserts.length > 0) {
+			const payloads: NotePayload[] = [];
+			for (const item of upserts) {
+				const file = this.deps.app.vault.getFileByPath(item.path);
+				if (!file) {
+					// Vanished — ack now; the delete event will follow if needed.
+					acked.push(item);
+					continue;
+				}
+				try {
+					const payload = this.isMarkdown(file)
+						? await buildNotePayload(this.deps.app, file, settings.vaultId)
+						: await this.buildBinaryPayload(file, settings.vaultId);
+					payloads.push(payload);
+				} catch (err) {
+					console.error("SurfSense: failed to build payload", item.path, err);
+					retry.push(item);
+				}
+			}
+
+			if (payloads.length > 0) {
+				try {
+					const resp = await this.deps.apiClient.syncBatch({
+						vaultId: settings.vaultId,
+						notes: payloads,
+					});
+					// Per-note failures retry; queue maxAttempts drops poison pills.
+					const failed = new Set(resp.failed);
+					for (const item of upserts) {
+						if (retry.find((r) => r === item)) continue;
+						if (failed.has(item.path)) retry.push(item);
+						else acked.push(item);
+					}
+				} catch (err) {
+					if (await this.handleVaultNotRegistered(err)) {
+						for (const item of upserts) {
+							if (retry.find((r) => r === item)) continue;
+							retry.push(item);
+						}
+					} else {
+						const verdict = this.classify(err);
+						if (verdict === "stop")
+							return { acked, retry: [...retry, ...upserts], dropped, stop: true };
+						if (verdict === "retry") retry.push(...upserts);
+						else dropped.push(...upserts);
+					}
+				}
+			}
+		}
+
+		return { acked, retry, dropped, stop: false };
+	}
+
+	private async buildBinaryPayload(file: TFile, vaultId: string): Promise<NotePayload> {
+		// Attachments skip buildNotePayload (no markdown metadata) but still
+		// need raw bytes + hash + stat so the backend can ETL-extract text
+		// and manifest diff still works.
+		const buf = await this.deps.app.vault.readBinary(file);
+		const digest = await crypto.subtle.digest("SHA-256", buf);
+		const hash = bufferToHex(digest);
+		const binaryBase64 = arrayBufferToBase64(buf);
+		return {
+			vault_id: vaultId,
+			path: file.path,
+			name: file.basename,
+			extension: file.extension,
+			content: "",
+			frontmatter: {},
+			tags: [],
+			headings: [],
+			resolved_links: [],
+			unresolved_links: [],
+			embeds: [],
+			aliases: [],
+			content_hash: hash,
+			size: file.stat.size,
+			mtime: file.stat.mtime,
+			ctime: file.stat.ctime,
+			is_binary: true,
+			binary_base64: binaryBase64,
+			mime_type: mimeTypeFor(file.extension),
+		};
+	}
+
+	// ---- reconcile --------------------------------------------------------
+
+	async maybeReconcile(force = false): Promise<void> {
+		const settings = this.deps.getSettings();
+		if (!settings.connectorId) return;
+		if (!force && settings.lastReconcileAt) {
+			if (Date.now() - settings.lastReconcileAt < RECONCILE_MIN_INTERVAL_MS) return;
+		}
+
+		// Re-handshake first: if the vault grew enough to match another
+		// device's fingerprint, the server merges and routes us to the
+		// survivor row, which the /manifest call below then uses.
+		const connected = await this.ensureConnected();
+		if (!connected) return;
+		const refreshed = this.deps.getSettings();
+		if (!refreshed.connectorId) return;
+
+		this.setStatus("syncing", "Reconciling vault with server…");
+		try {
+			const manifest = await this.deps.apiClient.getManifest(refreshed.vaultId);
+			const remote = manifest.items ?? {};
+			const enqueued = this.diffAndQueue(refreshed, remote);
+			await this.deps.saveSettings((s) => {
+				s.lastReconcileAt = Date.now();
+				s.tombstones = pruneTombstones(s.tombstones);
+			});
+			this.updateIdleStreak(enqueued);
+			await this.flushQueue();
+			this.refreshStatus({ force: true });
+		} catch (err) {
+			this.classifyAndStatus(err, "Reconcile failed");
+		}
+	}
+
+	/**
+	 * Diff local vault vs server manifest and enqueue work. Skips disk reads
+	 * on idle reconciles by short-circuiting on `mtime + size`; false positives
+	 * collapse to a no-op upsert via the server's `content_hash` check.
+	 * Returns the enqueued count to drive adaptive backoff.
+	 */
+	private diffAndQueue(
+		settings: SyncEngineSettings,
+		remote: Record<string, ManifestEntry>,
+	): number {
+		const localFiles = this.deps.app.vault.getFiles().filter((f) => {
+			if (!this.shouldTrack(f)) return false;
+			if (this.isExcluded(f.path, settings)) return false;
+			return true;
+		});
+		const localPaths = new Set(localFiles.map((f) => f.path));
+		let enqueued = 0;
+
+		for (const file of localFiles) {
+			const remoteEntry = remote[file.path];
+			if (!remoteEntry) {
+				this.deps.queue.enqueueUpsert(file.path);
+				enqueued++;
+				continue;
+			}
+			const remoteMtimeMs = toMillis(remoteEntry.mtime);
+			const mtimeMatches = file.stat.mtime <= remoteMtimeMs + 1000;
+			// Older server rows lack `size` — treat as unknown and re-upsert.
+			const sizeMatches =
+				typeof remoteEntry.size === "number" && file.stat.size === remoteEntry.size;
+			if (mtimeMatches && sizeMatches) continue;
+			this.deps.queue.enqueueUpsert(file.path);
+			enqueued++;
+		}
+
+		// Remote-only → delete, unless a fresh tombstone is already in the queue.
+		for (const path of Object.keys(remote)) {
+			if (localPaths.has(path)) continue;
+			const tombstone = settings.tombstones[path];
+			if (tombstone && Date.now() - tombstone < TOMBSTONE_TTL_MS) continue;
+			this.deps.queue.enqueueDelete(path);
+			enqueued++;
+		}
+
+		return enqueued;
+	}
+
+	/** Bump (idle) or reset (active) the streak; notify only when the capped multiplier changes. */
+	private updateIdleStreak(enqueued: number): void {
+		const previousStreak = this.idleReconcileStreak;
+		if (enqueued === 0) this.idleReconcileStreak++;
+		else this.idleReconcileStreak = 0;
+		const cap = Math.log2(this.maxBackoffMultiplier);
+		const cappedPrev = Math.min(previousStreak, cap);
+		const cappedNow = Math.min(this.idleReconcileStreak, cap);
+		if (cappedPrev !== cappedNow) this.deps.onReconcileBackoffChanged?.();
+	}
+
+	/** Vault edit — drop back to base interval immediately. */
+	private resetIdleStreak(): void {
+		if (this.idleReconcileStreak === 0) return;
+		this.idleReconcileStreak = 0;
+		this.deps.onReconcileBackoffChanged?.();
+	}
+
+	// ---- status helpers ---------------------------------------------------
+
+	/**
+	 * Conservative by default: real errors are preserved while setup is
+	 * complete, so unrelated edits don't optimistically clear the indicator.
+	 * Pass `force: true` after an explicit verify/reconcile confirmation.
+	 */
+	refreshStatus(opts: { force?: boolean } = {}): void {
+		if (!opts.force) {
+			const last = this.lastAppliedKind;
+			if (last === "syncing") return;
+			const isError =
+				last === "auth-error" || last === "offline" || last === "error";
+			const s = this.deps.getSettings();
+			const setupComplete = !!(s.apiToken && s.searchSpaceId && s.connectorId);
+			if (isError && setupComplete) return;
+		}
+		this.setStatus(this.queueStatusKind(), this.statusDetail());
+	}
+
+	reportAuthError(message?: string): void {
+		this.setStatus("auth-error", message ?? "API token expired or invalid");
+	}
+
+	reportError(err: unknown): void {
+		if (err instanceof AuthError) {
+			this.reportAuthError(err.message);
+			return;
+		}
+		if (err instanceof TransientError) {
+			this.setStatus("offline", err.message);
+			return;
+		}
+		this.setStatus("error", (err as Error).message ?? "Unknown error");
+	}
+
+	private setStatus(kind: StatusKind, detail?: string): void {
+		const s = this.deps.getSettings();
+		if (!s.apiToken) {
+			kind = "needs-setup";
+			detail = this.setupHint(s);
+		} else if (kind !== "auth-error" && kind !== "offline" && kind !== "error") {
+			if (!s.searchSpaceId || !s.connectorId) {
+				kind = "needs-setup";
+				detail = this.setupHint(s);
+			}
+		}
+		this.lastAppliedKind = kind;
+		this.deps.setStatus({ kind, detail, queueDepth: this.deps.queue.size });
+	}
+
+	private setupHint(s: SyncEngineSettings): string {
+		if (!s.apiToken) return "Paste your API token in settings.";
+		if (!s.searchSpaceId) return "Pick a search space in settings.";
+		return "Connecting…";
+	}
+
+	private queueStatusKind(): StatusKind {
+		if (this.deps.queue.size > 0) return "queued";
+		return "idle";
+	}
+
+	private statusDetail(): string | undefined {
+		const settings = this.deps.getSettings();
+		if (settings.lastSyncAt) {
+			return `Last sync ${formatRelative(settings.lastSyncAt)}`;
+		}
+		return undefined;
+	}
+
+	private handleStartupError(err: unknown): void {
+		if (err instanceof AuthError) {
+			this.setStatus("auth-error", err.message);
+			return;
+		}
+		if (err instanceof TransientError) {
+			this.setStatus("offline", err.message);
+			return;
+		}
+		this.setStatus("error", (err as Error).message ?? "Unknown error");
+	}
+
+	/** Re-connect on VAULT_NOT_REGISTERED so the next drain sees the new row. */
+	private async handleVaultNotRegistered(err: unknown): Promise<boolean> {
+		if (!(err instanceof VaultNotRegisteredError)) return false;
+		console.warn("SurfSense: vault not registered, re-connecting before retry", err);
+		await this.ensureConnected();
+		return true;
+	}
+
+	private classify(err: unknown): "ack" | "retry" | "drop" | "stop" {
+		if (err instanceof AuthError) {
+			this.setStatus("auth-error", err.message);
+			return "stop";
+		}
+		if (err instanceof TransientError) {
+			this.setStatus("offline", err.message);
+			return "stop";
+		}
+		if (err instanceof PermanentError) {
+			console.warn("SurfSense: permanent error, dropping batch", err);
+			new Notice(`Surfsense: ${err.message}`);
+			return "drop";
+		}
+		console.error("SurfSense: unknown error", err);
+		return "retry";
+	}
+
+	private classifyAndStatus(err: unknown, prefix: string): void {
+		const verdict = this.classify(err);
+		if (verdict === "stop") return;
+		this.setStatus(this.queueStatusKind(), `${prefix}: ${(err as Error).message}`);
+	}
+
+	private async recoverStatusIfNeeded(): Promise<void> {
+		if (!this.isRecoverableErrorState()) return;
+		await this.recoverConnectivityStatus();
+	}
+
+	private isRecoverableErrorState(): boolean {
+		return (
+			this.lastAppliedKind === "offline" ||
+			this.lastAppliedKind === "auth-error" ||
+			this.lastAppliedKind === "error"
+		);
+	}
+
+	// ---- predicates -------------------------------------------------------
+
+	private shouldTrack(file: TAbstractFile): boolean {
+		if (!isTFile(file)) return false;
+		if (this.isMarkdown(file)) return true;
+		const settings = this.deps.getSettings();
+		if (!settings.includeAttachments) return false;
+		return ALLOWED_ATTACHMENT_EXTENSIONS.has(file.extension.toLowerCase());
+	}
+
+	private isExcluded(path: string, settings: SyncEngineSettings): boolean {
+		if (isFolderFiltered(path, settings.includeFolders, settings.excludeFolders)) {
+			return true;
+		}
+		return isExcluded(path, settings.excludePatterns);
+	}
+
+	private isMarkdown(file: TAbstractFile): boolean {
+		return isTFile(file) && file.extension.toLowerCase() === "md";
+	}
+}
+
+function isTFile(f: TAbstractFile): f is TFile {
+	return f instanceof TFile;
+}
+
+function bufferToHex(buf: ArrayBuffer): string {
+	const view = new Uint8Array(buf);
+	let hex = "";
+	for (let i = 0; i < view.length; i++) hex += (view[i] ?? 0).toString(16).padStart(2, "0");
+	return hex;
+}
+
+function arrayBufferToBase64(buf: ArrayBuffer): string {
+	const bytes = new Uint8Array(buf);
+	const chunkSize = 0x8000;
+	let binary = "";
+	for (let i = 0; i < bytes.length; i += chunkSize) {
+		const chunk = bytes.subarray(i, i + chunkSize);
+		binary += String.fromCharCode(...Array.from(chunk));
+	}
+	return btoa(binary);
+}
+
+/** Source of truth for the attachment whitelist. Mirrors ATTACHMENT_MIME_TYPES on the backend. */
+export const MIME_BY_EXTENSION = {
+	pdf: "application/pdf",
+	png: "image/png",
+	jpg: "image/jpeg",
+	jpeg: "image/jpeg",
+	gif: "image/gif",
+	webp: "image/webp",
+	svg: "image/svg+xml",
+	txt: "text/plain",
+} as const satisfies Record<string, string>;
+
+export const ALLOWED_ATTACHMENT_EXTENSIONS: ReadonlySet<string> = new Set(
+	Object.keys(MIME_BY_EXTENSION),
+);
+
+function mimeTypeFor(extension: string): string {
+	const ext = extension.toLowerCase() as keyof typeof MIME_BY_EXTENSION;
+	const mime = MIME_BY_EXTENSION[ext];
+	if (!mime) {
+		throw new Error(`Unsupported attachment extension: .${extension}`);
+	}
+	return mime;
+}
+
+function formatRelative(ts: number): string {
+	const diff = Date.now() - ts;
+	if (diff < 60_000) return "just now";
+	if (diff < 3600_000) return `${Math.round(diff / 60_000)}m ago`;
+	if (diff < 86_400_000) return `${Math.round(diff / 3600_000)}h ago`;
+	return `${Math.round(diff / 86_400_000)}d ago`;
+}
+
+/** Manifest mtimes arrive as ISO strings, vault stats as epoch ms — normalise. */
+function toMillis(value: number | string | Date): number {
+	if (typeof value === "number") return value;
+	if (value instanceof Date) return value.getTime();
+	const parsed = Date.parse(value);
+	return Number.isFinite(parsed) ? parsed : 0;
+}
+
+function pruneTombstones(tombstones: Record<string, number>): Record<string, number> {
+	const out: Record<string, number> = {};
+	const cutoff = Date.now() - TOMBSTONE_TTL_MS;
+	for (const [k, v] of Object.entries(tombstones)) {
+		if (v >= cutoff) out[k] = v;
+	}
+	return out;
+}
diff --git a/surfsense_obsidian/src/types.ts b/surfsense_obsidian/src/types.ts
new file mode 100644
index 0000000000..192d34dc80
--- /dev/null
+++ b/surfsense_obsidian/src/types.ts
@@ -0,0 +1,202 @@
+/** Shared types for the SurfSense Obsidian plugin. Leaf module — no src/ imports. */
+
+export interface SurfsensePluginSettings {
+	serverUrl: string;
+	apiToken: string;
+	searchSpaceId: number | null;
+	connectorId: number | null;
+	/** UUID for the vault — lives here so Obsidian Sync replicates it across devices. */
+	vaultId: string;
+	/** 0 disables periodic reconcile (Force sync still works). */
+	syncIntervalMinutes: number;
+	/** Mobile-only: pause auto-sync when on cellular. iOS can't detect network type, so the toggle is a no-op there. */
+	wifiOnly: boolean;
+	includeFolders: string[];
+	excludeFolders: string[];
+	excludePatterns: string[];
+	includeAttachments: boolean;
+	lastSyncAt: number | null;
+	lastReconcileAt: number | null;
+	filesSynced: number;
+	queue: QueueItem[];
+	tombstones: Record<string, number>;
+}
+
+export const DEFAULT_SETTINGS: SurfsensePluginSettings = {
+	serverUrl: "https://surfsense.com",
+	apiToken: "",
+	searchSpaceId: null,
+	connectorId: null,
+	vaultId: "",
+	syncIntervalMinutes: 10,
+	wifiOnly: false,
+	includeFolders: [],
+	excludeFolders: [],
+	excludePatterns: [".trash", "_attachments", "templates"],
+	includeAttachments: false,
+	lastSyncAt: null,
+	lastReconcileAt: null,
+	filesSynced: 0,
+	queue: [],
+	tombstones: {},
+};
+
+export type QueueOp = "upsert" | "delete" | "rename";
+
+export interface UpsertItem {
+	op: "upsert";
+	path: string;
+	enqueuedAt: number;
+	attempt: number;
+}
+
+export interface DeleteItem {
+	op: "delete";
+	path: string;
+	enqueuedAt: number;
+	attempt: number;
+}
+
+export interface RenameItem {
+	op: "rename";
+	oldPath: string;
+	newPath: string;
+	enqueuedAt: number;
+	attempt: number;
+}
+
+export type QueueItem = UpsertItem | DeleteItem | RenameItem;
+
+interface NotePayloadBase {
+	vault_id: string;
+	path: string;
+	name: string;
+	extension: string;
+	content: string;
+	frontmatter: Record<string, unknown>;
+	tags: string[];
+	headings: HeadingRef[];
+	resolved_links: string[];
+	unresolved_links: string[];
+	embeds: string[];
+	aliases: string[];
+	content_hash: string;
+	/** Byte size of the local file; pairs with mtime for the reconcile short-circuit. */
+	size: number;
+	mtime: number;
+	ctime: number;
+}
+
+export interface MarkdownNotePayload extends NotePayloadBase {
+	is_binary?: false;
+}
+
+export interface BinaryNotePayload extends NotePayloadBase {
+	/** Non-markdown attachment marker; enables backend ETL path. */
+	is_binary: true;
+	/** Base64-encoded file bytes for binary attachments. */
+	binary_base64: string;
+	/** Canonical MIME type for the extension; required by the backend. */
+	mime_type: string;
+}
+
+export type NotePayload = MarkdownNotePayload | BinaryNotePayload;
+
+export interface HeadingRef {
+	heading: string;
+	level: number;
+}
+
+export interface SearchSpace {
+	id: number;
+	name: string;
+	description?: string;
+	[key: string]: unknown;
+}
+
+export interface ConnectResponse {
+	connector_id: number;
+	vault_id: string;
+	search_space_id: number;
+	capabilities: string[];
+	server_time_utc: string;
+	[key: string]: unknown;
+}
+
+export interface HealthResponse {
+	capabilities: string[];
+	server_time_utc: string;
+	[key: string]: unknown;
+}
+
+export interface ManifestEntry {
+	hash: string;
+	mtime: number;
+	/** Optional: byte size of stored content. Enables mtime+size short-circuit; falls back to upsert when missing. */
+	size?: number;
+	[key: string]: unknown;
+}
+
+export interface ManifestResponse {
+	vault_id: string;
+	items: Record<string, ManifestEntry>;
+	[key: string]: unknown;
+}
+
+/** Per-item ack shapes — mirror `app/schemas/obsidian_plugin.py` 1:1. */
+export interface SyncAckItem {
+	path: string;
+	status: "ok" | "queued" | "error";
+	document_id?: number;
+	error?: string;
+}
+
+export interface SyncAck {
+	vault_id: string;
+	indexed: number;
+	failed: number;
+	items: SyncAckItem[];
+}
+
+export interface RenameAckItem {
+	old_path: string;
+	new_path: string;
+	status: "ok" | "error" | "missing";
+	document_id?: number;
+	error?: string;
+}
+
+export interface RenameAck {
+	vault_id: string;
+	renamed: number;
+	missing: number;
+	items: RenameAckItem[];
+}
+
+export interface DeleteAckItem {
+	path: string;
+	status: "ok" | "error" | "missing";
+	error?: string;
+}
+
+export interface DeleteAck {
+	vault_id: string;
+	deleted: number;
+	missing: number;
+	items: DeleteAckItem[];
+}
+
+export type StatusKind =
+	| "idle"
+	| "syncing"
+	| "queued"
+	| "needs-setup"
+	| "offline"
+	| "auth-error"
+	| "error";
+
+export interface StatusState {
+	kind: StatusKind;
+	detail?: string;
+	queueDepth: number;
+}
diff --git a/surfsense_obsidian/src/vault-identity.ts b/surfsense_obsidian/src/vault-identity.ts
new file mode 100644
index 0000000000..86ae8b3b58
--- /dev/null
+++ b/surfsense_obsidian/src/vault-identity.ts
@@ -0,0 +1,43 @@
+import type { App } from "obsidian";
+
+/**
+ * Deterministic SHA-256 over the vault name + sorted markdown paths.
+ *
+ * Two devices observing the same vault content compute the same value,
+ * regardless of how it was synced (iCloud, Syncthing, Obsidian Sync, …).
+ * The server uses this as the cross-device dedup key on /connect.
+ */
+export async function computeVaultFingerprint(app: App): Promise<string> {
+	const vaultName = app.vault.getName();
+	const paths = app.vault
+		.getMarkdownFiles()
+		.map((f) => f.path)
+		.sort();
+	const payload = `${vaultName}\n${paths.join("\n")}`;
+	const bytes = new TextEncoder().encode(payload);
+	const digest = await crypto.subtle.digest("SHA-256", bytes);
+	return bufferToHex(digest);
+}
+
+function bufferToHex(buf: ArrayBuffer): string {
+	const view = new Uint8Array(buf);
+	let hex = "";
+	for (let i = 0; i < view.length; i++) {
+		hex += (view[i] ?? 0).toString(16).padStart(2, "0");
+	}
+	return hex;
+}
+
+export function generateVaultUuid(): string {
+	const c = globalThis.crypto;
+	if (c?.randomUUID) return c.randomUUID();
+	const buf = new Uint8Array(16);
+	c.getRandomValues(buf);
+	buf[6] = ((buf[6] ?? 0) & 0x0f) | 0x40;
+	buf[8] = ((buf[8] ?? 0) & 0x3f) | 0x80;
+	const hex = Array.from(buf, (b) => b.toString(16).padStart(2, "0")).join("");
+	return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(
+		16,
+		20,
+	)}-${hex.slice(20)}`;
+}
diff --git a/surfsense_obsidian/styles.css b/surfsense_obsidian/styles.css
new file mode 100644
index 0000000000..4aa831e6c6
--- /dev/null
+++ b/surfsense_obsidian/styles.css
@@ -0,0 +1,48 @@
+/*
+ * SurfSense Obsidian plugin styles. Status-bar widget only — the settings
+ * tab uses Obsidian's stock Setting rows, no custom CSS needed.
+ */
+
+.surfsense-status {
+	gap: 6px;
+}
+
+.surfsense-status--clickable {
+	cursor: pointer;
+}
+
+.surfsense-status__icon {
+	display: inline-flex;
+	width: 14px;
+	height: 14px;
+}
+
+.surfsense-status__icon svg {
+	width: 14px;
+	height: 14px;
+}
+
+.surfsense-status--err .surfsense-status__icon {
+	color: var(--color-red);
+}
+
+.surfsense-connection-indicator {
+	display: inline-flex;
+	width: 14px;
+	height: 14px;
+}
+
+.surfsense-connection-heading {
+	display: inline-flex;
+	align-items: center;
+	gap: 8px;
+}
+
+.surfsense-connection-indicator svg {
+	width: 14px;
+	height: 14px;
+}
+
+.surfsense-connection-indicator--err {
+	color: var(--color-red);
+}
diff --git a/surfsense_obsidian/tsconfig.json b/surfsense_obsidian/tsconfig.json
new file mode 100644
index 0000000000..222535dee2
--- /dev/null
+++ b/surfsense_obsidian/tsconfig.json
@@ -0,0 +1,30 @@
+{
+	"compilerOptions": {
+		"baseUrl": "src",
+		"inlineSourceMap": true,
+		"inlineSources": true,
+		"module": "ESNext",
+		"target": "ES6",
+		"allowJs": true,
+		"noImplicitAny": true,
+		"noImplicitThis": true,
+		"noImplicitReturns": true,
+		"moduleResolution": "node",
+		"importHelpers": true,
+		"noUncheckedIndexedAccess": true,
+		"isolatedModules": true,
+		"strictNullChecks": true,
+		"strictBindCallApply": true,
+		"allowSyntheticDefaultImports": true,
+		"useUnknownInCatchVariables": true,
+		"lib": [
+			"DOM",
+			"ES5",
+			"ES6",
+			"ES7"
+		]
+	},
+	"include": [
+		"src/**/*.ts"
+	]
+}
diff --git a/surfsense_obsidian/version-bump.mjs b/surfsense_obsidian/version-bump.mjs
new file mode 100644
index 0000000000..55d631fb6c
--- /dev/null
+++ b/surfsense_obsidian/version-bump.mjs
@@ -0,0 +1,17 @@
+import { readFileSync, writeFileSync } from "fs";
+
+const targetVersion = process.env.npm_package_version;
+
+// read minAppVersion from manifest.json and bump version to target version
+const manifest = JSON.parse(readFileSync("manifest.json", "utf8"));
+const { minAppVersion } = manifest;
+manifest.version = targetVersion;
+writeFileSync("manifest.json", JSON.stringify(manifest, null, "\t"));
+
+// update versions.json with target version and minAppVersion from manifest.json
+// but only if the target version is not already in versions.json
+const versions = JSON.parse(readFileSync('versions.json', 'utf8'));
+if (!Object.values(versions).includes(minAppVersion)) {
+    versions[targetVersion] = minAppVersion;
+    writeFileSync('versions.json', JSON.stringify(versions, null, '\t'));
+}
diff --git a/surfsense_obsidian/versions.json b/surfsense_obsidian/versions.json
new file mode 100644
index 0000000000..9a3c3429d1
--- /dev/null
+++ b/surfsense_obsidian/versions.json
@@ -0,0 +1,3 @@
+{
+	"0.1.0": "1.5.4"
+}
diff --git a/surfsense_web/.dockerignore b/surfsense_web/.dockerignore
index d5ebb2f2ce..dac602dbf0 100644
--- a/surfsense_web/.dockerignore
+++ b/surfsense_web/.dockerignore
@@ -17,3 +17,11 @@ Dockerfile
 .dockerignore
 node_modules
 README.md
+
+# Playwright (never ship E2E tests to production runtime image)
+tests
+playwright.config.ts
+playwright
+playwright-report
+test-results
+blob-report
diff --git a/surfsense_web/.env.example b/surfsense_web/.env.example
index 2313f3e56f..0eec4311ca 100644
--- a/surfsense_web/.env.example
+++ b/surfsense_web/.env.example
@@ -1,4 +1,8 @@
 NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
+
+# Server-only. Internal backend URL used by Next.js server code.
+FASTAPI_BACKEND_INTERNAL_URL=https://your-internal-backend.example.com
+
 NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
 
 # mPass proxy auth — set when deployed behind oauth2-proxy + Traefik ForwardAuth
@@ -15,4 +19,8 @@ DATABASE_URL=postgresql://postgres:[YOUR-PASSWORD]@db.sdsf.supabase.co:5432/post
 NEXT_PUBLIC_DEPLOYMENT_MODE="self-hosted" or "cloud"
 
 # PostHog analytics (optional, leave empty to disable)
-NEXT_PUBLIC_POSTHOG_KEY=
\ No newline at end of file
+NEXT_PUBLIC_POSTHOG_KEY=
+
+# Cloudflare Turnstile CAPTCHA for anonymous chat abuse prevention
+# Get your site key from https://dash.cloudflare.com/ -> Turnstile
+NEXT_PUBLIC_TURNSTILE_SITE_KEY=
\ No newline at end of file
diff --git a/surfsense_web/.gitignore b/surfsense_web/.gitignore
index 6ae7fe0c4a..3ae7683d32 100644
--- a/surfsense_web/.gitignore
+++ b/surfsense_web/.gitignore
@@ -12,6 +12,10 @@
 
 # testing
 /coverage
+/playwright/.auth/
+/playwright-report/
+/test-results/
+/blob-report/
 
 # next.js
 /.next/
@@ -48,5 +52,4 @@ next-env.d.ts
 # source
 /.source/
 
-.pnpm-store/
-
+.pnpm-store/
\ No newline at end of file
diff --git a/surfsense_web/Dockerfile b/surfsense_web/Dockerfile
index aad2bb4f35..a3e3e05ac4 100644
--- a/surfsense_web/Dockerfile
+++ b/surfsense_web/Dockerfile
@@ -12,7 +12,7 @@ WORKDIR /app
 RUN corepack enable pnpm && corepack prepare pnpm@10.33.4 --activate
 
 # Copy package files
-COPY package.json pnpm-lock.yaml* .npmrc* ./
+COPY package.json pnpm-lock.yaml* pnpm-workspace.yaml* .npmrc* ./
 
 # First copy the config file and content to avoid fumadocs-mdx postinstall error
 COPY source.config.ts ./
diff --git a/surfsense_web/app/(home)/announcements/layout.tsx b/surfsense_web/app/(home)/announcements/layout.tsx
new file mode 100644
index 0000000000..157666ba4a
--- /dev/null
+++ b/surfsense_web/app/(home)/announcements/layout.tsx
@@ -0,0 +1,25 @@
+import type { Metadata } from "next";
+import type { ReactNode } from "react";
+
+export const metadata: Metadata = {
+	title: "What's New | SurfSense",
+	description: "Latest product updates, feature releases, and news from SurfSense.",
+	alternates: {
+		canonical: "https://www.surfsense.com/announcements",
+	},
+	openGraph: {
+		title: "What's New | SurfSense",
+		description: "Latest product updates, feature releases, and news from SurfSense.",
+		url: "https://www.surfsense.com/announcements",
+		type: "website",
+	},
+	twitter: {
+		card: "summary_large_image",
+		title: "What's New | SurfSense",
+		description: "Latest product updates, feature releases, and news from SurfSense.",
+	},
+};
+
+export default function AnnouncementsLayout({ children }: { children: ReactNode }) {
+	return <>{children}</>;
+}
diff --git a/surfsense_web/app/(home)/announcements/page.tsx b/surfsense_web/app/(home)/announcements/page.tsx
index 966c09f777..f287e43d15 100644
--- a/surfsense_web/app/(home)/announcements/page.tsx
+++ b/surfsense_web/app/(home)/announcements/page.tsx
@@ -24,7 +24,7 @@ export default function AnnouncementsPage() {
 				<div className="max-w-5xl mx-auto relative">
 					<div className="p-6">
 						<h1 className="text-4xl font-bold tracking-tight bg-linear-to-r from-gray-900 to-gray-600 dark:from-white dark:to-gray-400 bg-clip-text text-transparent">
-							Announcements
+							What's New
 						</h1>
 					</div>
 				</div>
diff --git a/surfsense_web/app/(home)/blog/[slug]/loading.tsx b/surfsense_web/app/(home)/blog/[slug]/loading.tsx
new file mode 100644
index 0000000000..0cce7f80b0
--- /dev/null
+++ b/surfsense_web/app/(home)/blog/[slug]/loading.tsx
@@ -0,0 +1,66 @@
+import { Skeleton } from "@/components/ui/skeleton";
+
+export default function BlogPostLoading() {
+	return (
+		<div className="min-h-screen relative pt-20">
+			<div className="max-w-3xl mx-auto px-6 lg:px-10 pt-10 pb-20">
+				{/* Breadcrumb */}
+				<div className="flex items-center gap-2 mb-8">
+					<Skeleton className="h-4 w-10" />
+					<Skeleton className="h-4 w-3" />
+					<Skeleton className="h-4 w-10" />
+					<Skeleton className="h-4 w-3" />
+					<Skeleton className="h-4 w-40" />
+				</div>
+
+				{/* Tags */}
+				<div className="flex flex-wrap gap-2 mb-4">
+					<Skeleton className="h-6 w-16 rounded-full" />
+					<Skeleton className="h-6 w-20 rounded-full" />
+				</div>
+
+				{/* Title */}
+				<div className="space-y-3 mb-6">
+					<Skeleton className="h-10 w-full" />
+					<Skeleton className="h-10 w-4/5" />
+				</div>
+
+				{/* Description */}
+				<Skeleton className="h-5 w-full mb-2" />
+				<Skeleton className="h-5 w-3/4 mb-8" />
+
+				{/* Author + date */}
+				<div className="flex items-center gap-3 mb-10">
+					<Skeleton className="h-10 w-10 rounded-full" />
+					<div className="space-y-1.5">
+						<Skeleton className="h-4 w-32" />
+						<Skeleton className="h-3 w-24" />
+					</div>
+				</div>
+
+				{/* Cover image */}
+				<Skeleton className="w-full aspect-video rounded-xl mb-10" />
+
+				{/* Article body paragraphs */}
+				{Array.from({ length: 5 }).map((_, i) => (
+					<div key={i} className="space-y-2 mb-6">
+						<Skeleton className="h-4 w-full" />
+						<Skeleton className="h-4 w-full" />
+						<Skeleton className="h-4 w-4/5" />
+					</div>
+				))}
+
+				{/* Sub-heading */}
+				<Skeleton className="h-7 w-56 mt-8 mb-4" />
+
+				{Array.from({ length: 3 }).map((_, i) => (
+					<div key={i} className="space-y-2 mb-6">
+						<Skeleton className="h-4 w-full" />
+						<Skeleton className="h-4 w-11/12" />
+						<Skeleton className="h-4 w-3/4" />
+					</div>
+				))}
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/(home)/blog/[slug]/page.tsx b/surfsense_web/app/(home)/blog/[slug]/page.tsx
new file mode 100644
index 0000000000..1f4d629682
--- /dev/null
+++ b/surfsense_web/app/(home)/blog/[slug]/page.tsx
@@ -0,0 +1,166 @@
+import { loader } from "fumadocs-core/source";
+import type { Metadata } from "next";
+import Image from "next/image";
+import { notFound } from "next/navigation";
+import { blog } from "@/.source/server";
+import { BreadcrumbNav } from "@/components/seo/breadcrumb-nav";
+import { ArticleJsonLd, FAQJsonLd } from "@/components/seo/json-ld";
+import { extractFaqFromBlogPost } from "@/lib/blog-faq";
+import { formatDate } from "@/lib/utils";
+import { getMDXComponents } from "@/mdx-components";
+
+const source = loader({
+	baseUrl: "/blog",
+	source: blog.toFumadocsSource(),
+});
+
+interface BlogData {
+	title: string;
+	description: string;
+	date: string;
+	image?: string;
+	author?: string;
+	authorAvatar?: string;
+	tags?: string[];
+	// Populated by Fumadocs when `lastModifiedTime: "git"` is set in source.config.ts.
+	lastModified?: Date;
+	body: React.ComponentType<{
+		components?: Record<string, React.ComponentType>;
+	}>;
+}
+
+interface BlogPageItem {
+	url: string;
+	slugs: string[];
+	data: BlogData;
+}
+
+export async function generateStaticParams() {
+	return source.getPages().map((page) => ({
+		slug: (page as BlogPageItem).slugs.join("/"),
+	}));
+}
+
+export async function generateMetadata(props: {
+	params: Promise<{ slug: string }>;
+}): Promise<Metadata> {
+	const { slug } = await props.params;
+	const page = (source.getPages() as BlogPageItem[]).find((p) => p.slugs.join("/") === slug);
+
+	if (!page) return {};
+
+	return {
+		title: `${page.data.title} | SurfSense Blog`,
+		description: page.data.description,
+		alternates: {
+			canonical: `https://www.surfsense.com/blog/${slug}`,
+		},
+		openGraph: {
+			title: page.data.title,
+			description: page.data.description,
+			type: "article",
+			publishedTime: page.data.date,
+			authors: [page.data.author ?? "SurfSense Team"],
+			tags: page.data.tags,
+			images: page.data.image ? [{ url: page.data.image }] : [{ url: "/og-image.png" }],
+		},
+		twitter: {
+			card: "summary_large_image",
+			title: page.data.title,
+			description: page.data.description,
+			images: page.data.image ? [page.data.image] : ["/og-image.png"],
+		},
+	};
+}
+
+export default async function BlogPostPage(props: { params: Promise<{ slug: string }> }) {
+	const { slug } = await props.params;
+	const page = (source.getPages() as BlogPageItem[]).find((p) => p.slugs.join("/") === slug);
+
+	if (!page) notFound();
+
+	const MDX = page.data.body;
+	const date = new Date(page.data.date);
+	const dateModified = page.data.lastModified
+		? new Date(page.data.lastModified).toISOString()
+		: undefined;
+	const faqEntries = await extractFaqFromBlogPost(slug);
+
+	return (
+		<div className="min-h-screen relative pt-20">
+			<ArticleJsonLd
+				title={page.data.title}
+				description={page.data.description}
+				url={`https://www.surfsense.com/blog/${slug}`}
+				datePublished={page.data.date}
+				dateModified={dateModified}
+				author={page.data.author ?? "SurfSense Team"}
+				image={page.data.image ? `https://www.surfsense.com${page.data.image}` : undefined}
+			/>
+			{faqEntries.length > 0 && <FAQJsonLd questions={faqEntries} />}
+			<div className="max-w-3xl mx-auto px-6 lg:px-10 pt-10 pb-20">
+				<BreadcrumbNav
+					items={[
+						{ name: "Home", href: "/" },
+						{ name: "Blog", href: "/blog" },
+						{ name: page.data.title, href: `/blog/${slug}` },
+					]}
+					className="mb-8"
+				/>
+
+				{page.data.image && (
+					<div className="relative aspect-2/1 overflow-hidden rounded-2xl mb-8">
+						<Image
+							src={page.data.image}
+							alt={page.data.title}
+							fill
+							className="object-cover"
+							priority
+							sizes="(max-width: 768px) 100vw, 768px"
+						/>
+					</div>
+				)}
+
+				<div className="space-y-4 mb-10">
+					<h1 className="text-3xl md:text-4xl font-bold tracking-tight text-balance">
+						{page.data.title}
+					</h1>
+
+					{page.data.tags && page.data.tags.length > 0 && (
+						<div className="flex flex-wrap gap-2">
+							{page.data.tags.map((tag: string) => (
+								<span
+									key={tag}
+									className="h-6 w-fit px-2.5 text-xs font-medium bg-muted text-muted-foreground rounded-full border flex items-center justify-center"
+								>
+									{tag}
+								</span>
+							))}
+						</div>
+					)}
+
+					<div className="flex items-center gap-3 text-sm text-muted-foreground">
+						{page.data.authorAvatar && (
+							<Image
+								src={page.data.authorAvatar}
+								alt={page.data.author ?? "SurfSense Team"}
+								width={32}
+								height={32}
+								className="h-8 w-8 rounded-full object-cover"
+							/>
+						)}
+						<span className="font-medium text-foreground">
+							{page.data.author ?? "SurfSense Team"}
+						</span>
+						<span>·</span>
+						<time dateTime={page.data.date}>{formatDate(date)}</time>
+					</div>
+				</div>
+
+				<div className="prose dark:prose-invert max-w-none prose-headings:scroll-mt-8 prose-headings:font-semibold prose-a:no-underline prose-headings:tracking-tight prose-headings:text-balance prose-p:tracking-tight prose-p:text-balance prose-img:rounded-xl prose-img:shadow-lg">
+					<MDX components={getMDXComponents()} />
+				</div>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/(home)/blog/blog-magazine.tsx b/surfsense_web/app/(home)/blog/blog-magazine.tsx
new file mode 100644
index 0000000000..fd6a82e58d
--- /dev/null
+++ b/surfsense_web/app/(home)/blog/blog-magazine.tsx
@@ -0,0 +1,255 @@
+"use client";
+
+import { format } from "date-fns";
+import FuzzySearch from "fuzzy-search";
+import Link from "next/link";
+import { useMemo, useState } from "react";
+import { Container } from "@/components/container";
+import type { BlogEntry } from "./page";
+
+function truncate(text: string, length: number) {
+	return text.length > length ? `${text.slice(0, length)}…` : text;
+}
+
+function SearchIcon({ className }: { className?: string }) {
+	return (
+		<svg
+			className={className}
+			xmlns="http://www.w3.org/2000/svg"
+			width="20"
+			height="20"
+			viewBox="0 0 24 24"
+			fill="none"
+			stroke="currentColor"
+			strokeWidth="2"
+			strokeLinecap="round"
+			strokeLinejoin="round"
+			aria-hidden="true"
+			role="img"
+		>
+			<title>Search</title>
+			<circle cx="11" cy="11" r="8" />
+			<path d="m21 21-4.3-4.3" />
+		</svg>
+	);
+}
+
+export function BlogWithSearchMagazine({ blogs }: { blogs: BlogEntry[] }) {
+	if (blogs.length === 0) {
+		return (
+			<div className="relative overflow-hidden bg-neutral-50 px-4 md:px-8 dark:bg-neutral-950">
+				<Container className="relative pt-12 pb-24 md:pt-20">
+					<p className="text-center text-neutral-500">No blog posts yet.</p>
+				</Container>
+			</div>
+		);
+	}
+
+	// `blogs` arrives pre-sorted from the server: explicitly featured posts
+	// first (ordered by `featured_order` asc, then date desc), then the rest
+	// by date desc. If nothing is explicitly featured, fall back to treating
+	// the newest post as the cover so the layout never feels empty up top.
+	// `MagazineSearchGrid` re-filters using `heroSlugs` so the hero/featured
+	// posts never duplicate into the archive grid.
+	const explicitlyFeatured = blogs.filter((b) => b.featured);
+	const heroBlogs = explicitlyFeatured.length > 0 ? explicitlyFeatured : blogs.slice(0, 1);
+	const heroSlugs = new Set(heroBlogs.map((b) => b.slug));
+	const [coverStory, ...secondaryFeatured] = heroBlogs;
+
+	return (
+		<div className="relative overflow-hidden bg-neutral-50 px-4 pt-20 md:px-8 dark:bg-neutral-950">
+			<div className="pointer-events-none absolute inset-0 bg-[radial-gradient(ellipse_80%_50%_at_50%_-20%,rgba(120,119,198,0.15),transparent)] dark:bg-[radial-gradient(ellipse_80%_50%_at_50%_-20%,rgba(120,119,198,0.12),transparent)]" />
+			<Container className="relative pt-12 pb-24 md:pt-20">
+				<header className="mb-10 md:mb-14">
+					<h1 className="text-4xl font-bold tracking-tight text-neutral-900 md:text-5xl dark:text-neutral-50">
+						Blog
+					</h1>
+				</header>
+
+				<MagazineFeatured blog={coverStory} />
+
+				{secondaryFeatured.length > 0 ? <MoreFeatured blogs={secondaryFeatured} /> : null}
+
+				<MagazineSearchGrid blogs={blogs} excludedSlugs={heroSlugs} />
+			</Container>
+		</div>
+	);
+}
+
+function MoreFeatured({ blogs }: { blogs: BlogEntry[] }) {
+	return (
+		<section aria-labelledby="more-featured-heading" className="mb-14">
+			<h2
+				id="more-featured-heading"
+				className="mb-6 font-serif text-2xl font-medium text-neutral-900 dark:text-neutral-100"
+			>
+				More featured
+			</h2>
+			<ul className="grid gap-6 sm:grid-cols-2">
+				{blogs.map((blog) => (
+					<li key={blog.slug}>
+						<MagazineCard blog={blog} />
+					</li>
+				))}
+			</ul>
+		</section>
+	);
+}
+
+function MagazineFeatured({ blog }: { blog: BlogEntry }) {
+	return (
+		<Link
+			href={blog.url}
+			className="group/cover relative mb-14 block overflow-hidden rounded-3xl border border-neutral-200/80 bg-neutral-900 shadow-sm dark:border-neutral-800 dark:shadow-none"
+		>
+			<div className="md:aspect-[2.4/1] relative aspect-21/9 min-h-[220px]">
+				{blog.image ? (
+					<img
+						src={blog.image}
+						alt={blog.title}
+						className="h-full w-full object-cover transition duration-500 group-hover/cover:scale-[1.03]"
+					/>
+				) : null}
+				<div className="absolute inset-0 bg-linear-to-t from-black/85 via-black/35 to-transparent" />
+				<div className="absolute inset-0 flex flex-col justify-end p-6 md:p-10">
+					<span className="mb-2 inline-flex w-fit rounded-full bg-white/15 px-3 py-1 text-xs font-medium text-white backdrop-blur-sm">
+						Cover story
+					</span>
+					<h2 className="max-w-3xl font-serif text-2xl leading-tight font-medium text-white md:text-4xl">
+						{blog.title}
+					</h2>
+					<p className="mt-3 max-w-2xl text-sm text-white/85 md:text-base">
+						{truncate(blog.description, 160)}
+					</p>
+					<div className="mt-5 flex flex-wrap items-center gap-3 text-sm text-white/90">
+						<span className="flex items-center gap-2">
+							<img
+								src={blog.authorAvatar}
+								alt={blog.author}
+								width={28}
+								height={28}
+								className="h-7 w-7 rounded-full ring-2 ring-white/30"
+							/>
+							{blog.author}
+						</span>
+						<span className="text-white/50">·</span>
+						<time dateTime={blog.date}>{format(new Date(blog.date), "MMMM d, yyyy")}</time>
+					</div>
+				</div>
+			</div>
+		</Link>
+	);
+}
+
+function MagazineSearchGrid({
+	blogs: allBlogs,
+	excludedSlugs,
+}: {
+	blogs: BlogEntry[];
+	/** Slugs already shown above the archive (cover story + "More featured"). */
+	excludedSlugs: Set<string>;
+}) {
+	const [search, setSearch] = useState("");
+
+	const searcher = useMemo(
+		() =>
+			new FuzzySearch(allBlogs, ["title", "description"], {
+				caseSensitive: false,
+			}),
+		[allBlogs]
+	);
+
+	const gridItems = useMemo(() => {
+		// When the reader is searching, surface every match (including
+		// featured posts they may be looking for); otherwise hide the posts
+		// that are already rendered as featured above the archive.
+		const results = search.trim() ? searcher.search(search) : allBlogs;
+		if (search.trim()) {
+			return results;
+		}
+		return results.filter((b) => !excludedSlugs.has(b.slug));
+	}, [search, searcher, allBlogs, excludedSlugs]);
+
+	return (
+		<section aria-labelledby="archive-heading">
+			<div className="mb-8 flex flex-col gap-4 sm:flex-row sm:items-center sm:justify-between">
+				<h2
+					id="archive-heading"
+					className="font-serif text-2xl font-medium text-neutral-900 dark:text-neutral-100"
+				>
+					From the archive
+				</h2>
+				<label className="relative w-full sm:max-w-md">
+					<span className="sr-only">Search articles</span>
+					<SearchIcon className="pointer-events-none absolute top-1/2 left-4 -translate-y-1/2 text-neutral-400" />
+					<input
+						type="search"
+						value={search}
+						onChange={(e) => setSearch(e.target.value)}
+						placeholder="Search by title or topic…"
+						className="w-full rounded-full bg-white py-3 pr-4 pl-12 text-sm text-neutral-800 shadow-sm ring-1 shadow-black/10 ring-black/10 transition outline-none placeholder:text-neutral-400 focus:border-neutral-400 focus:ring-2 focus:ring-neutral-200/80 dark:bg-neutral-900 dark:text-neutral-100 dark:placeholder:text-neutral-500 dark:focus:border-neutral-500 dark:focus:ring-neutral-700/50"
+					/>
+				</label>
+			</div>
+
+			{gridItems.length === 0 ? (
+				<p className="rounded-2xl border border-dashed border-neutral-300 py-16 text-center text-neutral-500 dark:border-neutral-700 dark:text-neutral-400">
+					No articles match that search.
+				</p>
+			) : (
+				<ul className="grid gap-6 sm:grid-cols-2 lg:grid-cols-3">
+					{gridItems.map((blog) => (
+						<li key={blog.slug}>
+							<MagazineCard blog={blog} />
+						</li>
+					))}
+				</ul>
+			)}
+		</section>
+	);
+}
+
+function MagazineCard({ blog }: { blog: BlogEntry }) {
+	return (
+		<Link
+			href={blog.url}
+			className="group/card flex h-full flex-col overflow-hidden rounded-2xl bg-white shadow-sm ring-1 shadow-black/10 ring-black/10 transition hover:-translate-y-0.5 hover:shadow-lg dark:border-neutral-800 dark:bg-neutral-900/50 dark:hover:border-neutral-700"
+		>
+			<div className="relative aspect-16/10 overflow-hidden bg-neutral-100 dark:bg-neutral-800">
+				{blog.image ? (
+					<img
+						src={blog.image}
+						alt={blog.title}
+						className="h-full w-full object-cover transition duration-300 group-hover/card:scale-105"
+					/>
+				) : (
+					<div className="flex h-full items-center justify-center text-neutral-400">No image</div>
+				)}
+			</div>
+			<div className="flex flex-1 flex-col p-5">
+				<time
+					className="text-xs font-medium tracking-wider text-neutral-500 uppercase dark:text-neutral-400"
+					dateTime={blog.date}
+				>
+					{format(new Date(blog.date), "MMM d, yyyy")}
+				</time>
+				<h3 className="mt-2 font-serif text-lg leading-snug font-medium text-neutral-900 dark:text-neutral-100">
+					{blog.title}
+				</h3>
+				<p className="mt-2 flex-1 text-sm leading-relaxed text-neutral-600 dark:text-neutral-400">
+					{truncate(blog.description, 110)}
+				</p>
+				<div className="mt-4 flex items-center gap-2 pt-4">
+					<img
+						src={blog.authorAvatar}
+						alt={blog.author}
+						width={24}
+						height={24}
+						className="h-6 w-6 rounded-full object-cover"
+					/>
+					<span className="text-xs text-neutral-600 dark:text-neutral-300">{blog.author}</span>
+				</div>
+			</div>
+		</Link>
+	);
+}
diff --git a/surfsense_web/app/(home)/blog/loading.tsx b/surfsense_web/app/(home)/blog/loading.tsx
new file mode 100644
index 0000000000..ddaf345f63
--- /dev/null
+++ b/surfsense_web/app/(home)/blog/loading.tsx
@@ -0,0 +1,50 @@
+import { Skeleton } from "@/components/ui/skeleton";
+
+export default function BlogIndexLoading() {
+	return (
+		<div className="relative overflow-hidden bg-neutral-50 px-4 pt-20 md:px-8 dark:bg-neutral-950">
+			<div className="mx-auto max-w-6xl pt-12 pb-24 md:pt-20">
+				{/* Header */}
+				<div className="mb-10 md:mb-14">
+					<Skeleton className="h-10 w-24 rounded-md" />
+				</div>
+
+				{/* Featured post skeleton */}
+				<div className="mb-14 overflow-hidden rounded-3xl border border-neutral-200/80 dark:border-neutral-800">
+					<Skeleton className="aspect-[2.4/1] min-h-[220px] w-full rounded-none" />
+					<div className="p-6 md:p-8 space-y-3">
+						<Skeleton className="h-5 w-24 rounded-full" />
+						<Skeleton className="h-8 w-3/4" />
+						<Skeleton className="h-4 w-full max-w-lg" />
+						<div className="flex items-center gap-3 pt-2">
+							<Skeleton className="h-8 w-8 rounded-full" />
+							<Skeleton className="h-4 w-28" />
+							<Skeleton className="h-4 w-20" />
+						</div>
+					</div>
+				</div>
+
+				{/* Search bar skeleton */}
+				<div className="mb-10">
+					<Skeleton className="h-11 w-full max-w-md rounded-full" />
+				</div>
+
+				{/* Grid of article cards */}
+				<div className="grid gap-8 md:grid-cols-2 lg:grid-cols-3">
+					{Array.from({ length: 6 }).map((_, i) => (
+						<div key={i} className="space-y-3">
+							<Skeleton className="aspect-video w-full rounded-2xl" />
+							<Skeleton className="h-5 w-3/4" />
+							<Skeleton className="h-4 w-full" />
+							<Skeleton className="h-4 w-5/6" />
+							<div className="flex items-center gap-2 pt-1">
+								<Skeleton className="h-6 w-6 rounded-full" />
+								<Skeleton className="h-4 w-24" />
+							</div>
+						</div>
+					))}
+				</div>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/(home)/blog/page.tsx b/surfsense_web/app/(home)/blog/page.tsx
new file mode 100644
index 0000000000..e05d843a05
--- /dev/null
+++ b/surfsense_web/app/(home)/blog/page.tsx
@@ -0,0 +1,74 @@
+import { loader } from "fumadocs-core/source";
+import type { Metadata } from "next";
+import { blog } from "@/.source/server";
+import { BlogWithSearchMagazine } from "./blog-magazine";
+
+export const metadata: Metadata = {
+	title: "Blog | SurfSense - AI Search & Knowledge Management",
+	description: "Product updates, tutorials, and tips from the SurfSense team.",
+	alternates: {
+		canonical: "https://www.surfsense.com/blog",
+	},
+};
+
+const source = loader({
+	baseUrl: "/blog",
+	source: blog.toFumadocsSource(),
+});
+
+export interface BlogEntry {
+	title: string;
+	description: string;
+	date: string;
+	slug: string;
+	url: string;
+	image: string;
+	author: string;
+	authorAvatar: string;
+	featured: boolean;
+	featuredOrder?: number;
+}
+
+export default async function BlogPage() {
+	const allPages = source.getPages() as Array<{
+		url: string;
+		slugs: string[];
+		data: {
+			title: string;
+			description: string;
+			date: string;
+			image?: string;
+			author?: string;
+			authorAvatar?: string;
+			featured?: boolean;
+			featured_order?: number;
+		};
+	}>;
+
+	const blogs: BlogEntry[] = allPages
+		.map((page) => ({
+			title: page.data.title,
+			description: page.data.description ?? "",
+			date: page.data.date,
+			slug: page.slugs.join("/"),
+			url: page.url,
+			image: page.data.image ?? "/og-image.png",
+			author: page.data.author ?? "SurfSense Team",
+			authorAvatar: page.data.authorAvatar ?? "/logo.png",
+			featured: page.data.featured ?? false,
+			featuredOrder: page.data.featured_order,
+		}))
+		.sort((a, b) => {
+			// Featured first; then by `featured_order` asc within featured;
+			// then by `date` desc as the universal tie-breaker.
+			if (a.featured !== b.featured) return a.featured ? -1 : 1;
+			if (a.featured && b.featured) {
+				const aOrder = a.featuredOrder ?? Number.POSITIVE_INFINITY;
+				const bOrder = b.featuredOrder ?? Number.POSITIVE_INFINITY;
+				if (aOrder !== bOrder) return aOrder - bOrder;
+			}
+			return new Date(b.date).getTime() - new Date(a.date).getTime();
+		});
+
+	return <BlogWithSearchMagazine blogs={blogs} />;
+}
diff --git a/surfsense_web/app/(home)/changelog/loading.tsx b/surfsense_web/app/(home)/changelog/loading.tsx
new file mode 100644
index 0000000000..648f5a5e64
--- /dev/null
+++ b/surfsense_web/app/(home)/changelog/loading.tsx
@@ -0,0 +1,63 @@
+import { Skeleton } from "@/components/ui/skeleton";
+
+export default function ChangelogLoading() {
+	return (
+		<div className="min-h-screen relative pt-20">
+			{/* Header */}
+			<div className="border-b border-border/50">
+				<div className="max-w-5xl mx-auto relative">
+					<div className="p-6 flex items-center justify-between">
+						<div>
+							{/* Breadcrumb */}
+							<div className="flex items-center gap-2 mb-4">
+								<Skeleton className="h-4 w-10" />
+								<Skeleton className="h-4 w-3" />
+								<Skeleton className="h-4 w-20" />
+							</div>
+							<Skeleton className="h-10 w-48 mb-2" />
+							<Skeleton className="h-4 w-80" />
+						</div>
+					</div>
+				</div>
+			</div>
+
+			{/* Timeline */}
+			<div className="max-w-5xl mx-auto px-6 lg:px-10 pt-10 pb-20">
+				<div className="relative">
+					{Array.from({ length: 3 }).map((_, i) => (
+						<div key={i} className="relative flex flex-col md:flex-row gap-y-6 mb-10">
+							{/* Left: date + version */}
+							<div className="md:w-48 flex-shrink-0">
+								<Skeleton className="h-4 w-24 mb-3" />
+								<Skeleton className="h-12 w-12 rounded-xl" />
+							</div>
+
+							{/* Right: content */}
+							<div className="flex-1 md:pl-8 relative pb-10">
+								<div className="space-y-4">
+									{/* Title */}
+									<Skeleton className="h-7 w-2/3" />
+									{/* Tags */}
+									<div className="flex gap-2">
+										<Skeleton className="h-6 w-16 rounded-full" />
+										<Skeleton className="h-6 w-20 rounded-full" />
+									</div>
+									{/* Body paragraphs */}
+									<div className="space-y-2">
+										<Skeleton className="h-4 w-full" />
+										<Skeleton className="h-4 w-full" />
+										<Skeleton className="h-4 w-3/4" />
+									</div>
+									<div className="space-y-2">
+										<Skeleton className="h-4 w-full" />
+										<Skeleton className="h-4 w-5/6" />
+									</div>
+								</div>
+							</div>
+						</div>
+					))}
+				</div>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/(home)/changelog/page.tsx b/surfsense_web/app/(home)/changelog/page.tsx
index c14218daba..f3ded03fad 100644
--- a/surfsense_web/app/(home)/changelog/page.tsx
+++ b/surfsense_web/app/(home)/changelog/page.tsx
@@ -1,12 +1,16 @@
 import { loader } from "fumadocs-core/source";
 import type { Metadata } from "next";
 import { changelog } from "@/.source/server";
+import { BreadcrumbNav } from "@/components/seo/breadcrumb-nav";
 import { formatDate } from "@/lib/utils";
 import { getMDXComponents } from "@/mdx-components";
 
 export const metadata: Metadata = {
 	title: "Changelog | SurfSense",
-	description: "See what's new in SurfSense.",
+	description: "See what's new in SurfSense. Latest updates, features, and improvements.",
+	alternates: {
+		canonical: "https://www.surfsense.com/changelog",
+	},
 };
 
 const source = loader({
@@ -42,6 +46,13 @@ export default async function ChangelogPage() {
 				<div className="max-w-5xl mx-auto relative">
 					<div className="p-6 flex items-center justify-between">
 						<div>
+							<BreadcrumbNav
+								items={[
+									{ name: "Home", href: "/" },
+									{ name: "Changelog", href: "/changelog" },
+								]}
+								className="mb-4"
+							/>
 							<h1 className="text-4xl font-bold tracking-tight bg-gradient-to-r from-gray-900 to-gray-600 dark:from-white dark:to-gray-400 bg-clip-text text-transparent">
 								Changelog
 							</h1>
diff --git a/surfsense_web/app/(home)/contact/page.tsx b/surfsense_web/app/(home)/contact/page.tsx
index 097f753cbc..27046d1bb9 100644
--- a/surfsense_web/app/(home)/contact/page.tsx
+++ b/surfsense_web/app/(home)/contact/page.tsx
@@ -3,7 +3,11 @@ import { ContactFormGridWithDetails } from "@/components/contact/contact-form";
 
 export const metadata: Metadata = {
 	title: "Contact | SurfSense",
-	description: "Get in touch with the SurfSense team.",
+	description:
+		"Get in touch with the SurfSense team for enterprise AI search, knowledge management, or partnership inquiries.",
+	alternates: {
+		canonical: "https://www.surfsense.com/contact",
+	},
 };
 
 const page = () => {
diff --git a/surfsense_web/app/(home)/free/[model_slug]/layout.tsx b/surfsense_web/app/(home)/free/[model_slug]/layout.tsx
new file mode 100644
index 0000000000..5c0a9fcf1c
--- /dev/null
+++ b/surfsense_web/app/(home)/free/[model_slug]/layout.tsx
@@ -0,0 +1,24 @@
+"use client";
+
+import type { ReactNode } from "react";
+import { use } from "react";
+import { FreeLayoutDataProvider } from "@/components/layout/providers/FreeLayoutDataProvider";
+import { AnonymousModeProvider } from "@/contexts/anonymous-mode";
+import { LoginGateProvider } from "@/contexts/login-gate";
+
+interface FreeModelLayoutProps {
+	children: ReactNode;
+	params: Promise<{ model_slug: string }>;
+}
+
+export default function FreeModelLayout({ children, params }: FreeModelLayoutProps) {
+	const { model_slug } = use(params);
+
+	return (
+		<AnonymousModeProvider initialModelSlug={model_slug}>
+			<LoginGateProvider>
+				<FreeLayoutDataProvider>{children}</FreeLayoutDataProvider>
+			</LoginGateProvider>
+		</AnonymousModeProvider>
+	);
+}
diff --git a/surfsense_web/app/(home)/free/[model_slug]/loading.tsx b/surfsense_web/app/(home)/free/[model_slug]/loading.tsx
new file mode 100644
index 0000000000..97660188d0
--- /dev/null
+++ b/surfsense_web/app/(home)/free/[model_slug]/loading.tsx
@@ -0,0 +1,65 @@
+import { Skeleton } from "@/components/ui/skeleton";
+
+export default function FreeModelLoading() {
+	return (
+		<>
+			{/* Chat area skeleton - fills viewport */}
+			<div className="h-full flex flex-col">
+				{/* Chat header */}
+				<div className="flex items-center gap-3 border-b px-4 py-3">
+					<Skeleton className="h-8 w-8 rounded-full" />
+					<Skeleton className="h-5 w-40" />
+				</div>
+
+				{/* Chat messages area */}
+				<div className="flex-1 flex flex-col justify-end gap-4 px-4 py-6">
+					<div className="flex justify-end">
+						<Skeleton className="h-10 w-56 rounded-2xl" />
+					</div>
+					<div className="space-y-2 max-w-lg">
+						<Skeleton className="h-4 w-full" />
+						<Skeleton className="h-4 w-4/5" />
+						<Skeleton className="h-4 w-3/4" />
+					</div>
+				</div>
+
+				{/* Input bar */}
+				<div className="border-t px-4 py-3">
+					<Skeleton className="h-12 w-full rounded-xl" />
+				</div>
+			</div>
+
+			{/* SEO section skeleton */}
+			<div className="border-t bg-background">
+				<div className="container mx-auto px-4 py-10 max-w-3xl">
+					{/* Breadcrumb */}
+					<div className="flex items-center gap-2 mb-6">
+						<Skeleton className="h-4 w-10" />
+						<Skeleton className="h-4 w-3" />
+						<Skeleton className="h-4 w-24" />
+						<Skeleton className="h-4 w-3" />
+						<Skeleton className="h-4 w-32" />
+					</div>
+
+					<Skeleton className="h-7 w-3/4 mb-2" />
+					<Skeleton className="h-4 w-full mb-1" />
+					<Skeleton className="h-4 w-2/3 mb-8" />
+
+					<div className="my-8 h-px bg-border" />
+
+					{/* FAQ skeleton */}
+					<Skeleton className="h-6 w-64 mb-4" />
+					<div className="flex flex-col gap-3">
+						{Array.from({ length: 4 }).map((_, i) => (
+							<div key={i} className="rounded-lg border bg-card p-4 space-y-2">
+								<Skeleton className="h-4 w-3/4" />
+								<Skeleton className="h-3 w-full" />
+								<Skeleton className="h-3 w-5/6" />
+							</div>
+						))}
+					</div>
+				</div>
+			</div>
+		</>
+	);
+}
diff --git a/surfsense_web/app/(home)/free/[model_slug]/page.tsx b/surfsense_web/app/(home)/free/[model_slug]/page.tsx
new file mode 100644
index 0000000000..35c5c0512b
--- /dev/null
+++ b/surfsense_web/app/(home)/free/[model_slug]/page.tsx
@@ -0,0 +1,254 @@
+import { SquareArrowOutUpRight } from "lucide-react";
+import type { Metadata } from "next";
+import Link from "next/link";
+import { notFound } from "next/navigation";
+import { FreeChatPage } from "@/components/free-chat/free-chat-page";
+import { BreadcrumbNav } from "@/components/seo/breadcrumb-nav";
+import { FAQJsonLd, JsonLd } from "@/components/seo/json-ld";
+import { Button } from "@/components/ui/button";
+import { Separator } from "@/components/ui/separator";
+import type { AnonModel } from "@/contracts/types/anonymous-chat.types";
+import { BACKEND_URL } from "@/lib/env-config";
+
+interface PageProps {
+	params: Promise<{ model_slug: string }>;
+}
+
+async function getModel(slug: string): Promise<AnonModel | null> {
+	try {
+		const res = await fetch(
+			`${BACKEND_URL}/api/v1/public/anon-chat/models/${encodeURIComponent(slug)}`,
+			{ next: { revalidate: 300 } }
+		);
+		if (!res.ok) return null;
+		return res.json();
+	} catch {
+		return null;
+	}
+}
+
+async function getAllModels(): Promise<AnonModel[]> {
+	try {
+		const res = await fetch(`${BACKEND_URL}/api/v1/public/anon-chat/models`, {
+			next: { revalidate: 300 },
+		});
+		if (!res.ok) return [];
+		return res.json();
+	} catch {
+		return [];
+	}
+}
+
+function buildSeoTitle(model: AnonModel): string {
+	if (model.seo_title) return model.seo_title;
+	return `Chat with ${model.name} Free, No Login | SurfSense`;
+}
+
+function buildSeoDescription(model: AnonModel): string {
+	if (model.seo_description) return model.seo_description;
+	return `Use ${model.name} free online without login. No sign-up required. Chat with ${model.name} by ${model.provider} instantly on SurfSense, the open source ChatGPT alternative with no login.`;
+}
+
+function buildModelFaq(model: AnonModel) {
+	return [
+		{
+			question: `Can I use ${model.name} without login?`,
+			answer: `Yes. ${model.name} is available on SurfSense without login. No account creation, no email, no password. Just open the page and start chatting with ${model.name} for free.`,
+		},
+		{
+			question: `Is ${model.name} really free on SurfSense?`,
+			answer: `Yes! You can use ${model.name} completely free without login or sign-up. SurfSense gives you 500,000 free tokens to use across any model, including ${model.name}.`,
+		},
+		{
+			question: `How do I use ${model.name} with no login?`,
+			answer: `Just start typing in the chat box above. ${model.name} will respond instantly. No login wall, no sign-up form, no verification. Your conversations are not stored in any database.`,
+		},
+		{
+			question: `What can I do with ${model.name} on SurfSense?`,
+			answer: `You can ask questions, get explanations, write content, brainstorm ideas, debug code, and more. ${model.name} is a powerful AI assistant available for free without login on SurfSense.`,
+		},
+		{
+			question: `How is SurfSense different from using ${model.name} directly?`,
+			answer: `SurfSense gives you free access without login to ${model.name} and many other AI models in one place. Create a free account to unlock document Q&A, team collaboration, and integrations with Slack, Google Drive, Notion, and more.`,
+		},
+	];
+}
+
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+	const { model_slug } = await params;
+	const model = await getModel(model_slug);
+	if (!model) return { title: "Model Not Found | SurfSense" };
+
+	const title = buildSeoTitle(model);
+	const description = buildSeoDescription(model);
+	const canonicalUrl = `https://www.surfsense.com/free/${model.seo_slug}`;
+	const modelNameLower = model.name.toLowerCase();
+
+	return {
+		title,
+		description,
+		alternates: { canonical: canonicalUrl },
+		keywords: [
+			`${modelNameLower} free`,
+			`free ${modelNameLower}`,
+			`${modelNameLower} online`,
+			`${modelNameLower} online free`,
+			`${modelNameLower} without login`,
+			`${modelNameLower} no login`,
+			`${modelNameLower} no sign up`,
+			`${modelNameLower} login free`,
+			`${modelNameLower} free without login`,
+			`${modelNameLower} free no login`,
+			`${modelNameLower} chat free`,
+			`${modelNameLower} free online`,
+			`use ${modelNameLower} for free`,
+			`use ${modelNameLower} without login`,
+			`${modelNameLower} alternative`,
+			`${modelNameLower} alternative free`,
+			"chatgpt no login",
+			"chatgpt without login",
+			"free ai chat no login",
+			"ai chat without login",
+			"free ai apps",
+		],
+		openGraph: {
+			title,
+			description,
+			url: canonicalUrl,
+			siteName: "SurfSense",
+			type: "website",
+			images: [
+				{
+					url: "/og-image.png",
+					width: 1200,
+					height: 630,
+					alt: `${model.name} Free Chat on SurfSense`,
+				},
+			],
+		},
+		twitter: {
+			card: "summary_large_image",
+			title,
+			description,
+			images: ["/og-image.png"],
+		},
+	};
+}
+
+export async function generateStaticParams() {
+	const models = await getAllModels();
+	return models.filter((m) => m.seo_slug).map((m) => ({ model_slug: m.seo_slug! }));
+}
+
+export default async function FreeModelPage({ params }: PageProps) {
+	const { model_slug } = await params;
+	const [model, allModels] = await Promise.all([getModel(model_slug), getAllModels()]);
+	if (!model) notFound();
+
+	const description = buildSeoDescription(model);
+	const faqItems = buildModelFaq(model);
+
+	const relatedModels = allModels
+		.filter((m) => m.seo_slug && m.seo_slug !== model.seo_slug)
+		.slice(0, 4);
+
+	return (
+		<>
+			{/* Invisible SEO metadata */}
+			<JsonLd
+				data={{
+					"@context": "https://schema.org",
+					"@type": "WebApplication",
+					name: `${model.name} Free Chat Without Login - SurfSense`,
+					description,
+					url: `https://www.surfsense.com/free/${model.seo_slug}`,
+					applicationCategory: "ChatApplication",
+					operatingSystem: "Web",
+					offers: {
+						"@type": "Offer",
+						price: "0",
+						priceCurrency: "USD",
+						description: `Free access to ${model.name} AI chat without login`,
+					},
+					provider: {
+						"@type": "Organization",
+						name: "SurfSense",
+						url: "https://www.surfsense.com",
+					},
+					isPartOf: {
+						"@type": "WebSite",
+						name: "SurfSense",
+						url: "https://www.surfsense.com",
+					},
+				}}
+			/>
+			<FAQJsonLd questions={faqItems} />
+
+			{/* Chat fills the entire viewport area inside LayoutShell */}
+			<div className="h-full">
+				<FreeChatPage />
+			</div>
+
+			{/* SEO content: in DOM for crawlers, clipped by parent overflow-hidden */}
+			<div className="border-t bg-background">
+				<article className="container mx-auto px-4 py-10 max-w-3xl">
+					<BreadcrumbNav
+						items={[
+							{ name: "Home", href: "/" },
+							{ name: "Free AI Chat", href: "/free" },
+							{ name: model.name, href: `/free/${model.seo_slug}` },
+						]}
+					/>
+
+					<header className="mt-6 mb-6">
+						<h1 className="text-2xl font-bold mb-2">Chat with {model.name} Free, No Login</h1>
+						<p className="text-sm text-muted-foreground leading-relaxed">
+							Use <strong>{model.name}</strong> free online without login or sign-up. No account, no
+							email, no password needed. Powered by SurfSense.
+						</p>
+					</header>
+
+					<Separator className="my-8" />
+
+					<section>
+						<h2 className="text-xl font-bold mb-4">
+							{model.name} Free: Frequently Asked Questions
+						</h2>
+						<dl className="flex flex-col gap-3">
+							{faqItems.map((item) => (
+								<div key={item.question} className="rounded-lg border bg-card p-4">
+									<dt className="font-medium text-sm">{item.question}</dt>
+									<dd className="mt-1.5 text-sm text-muted-foreground leading-relaxed">
+										{item.answer}
+									</dd>
+								</div>
+							))}
+						</dl>
+					</section>
+
+					{relatedModels.length > 0 && (
+						<>
+							<Separator className="my-8" />
+							<nav aria-label="Other free AI models">
+								<h2 className="text-xl font-bold mb-4">Try Other Free AI Models</h2>
+								<div className="flex flex-wrap gap-2">
+									{relatedModels.map((m) => (
+										<Button key={m.id} variant="outline" size="sm" asChild>
+											<Link href={`/free/${m.seo_slug}`}>
+												{m.name}
+												<SquareArrowOutUpRight className="size-3" />
+											</Link>
+										</Button>
+									))}
+									<Button variant="outline" size="sm" asChild>
+										<Link href="/free">View All Models</Link>
+									</Button>
+								</div>
+							</nav>
+						</>
+					)}
+				</article>
+			</div>
+		</>
+	);
+}
diff --git a/surfsense_web/app/(home)/free/loading.tsx b/surfsense_web/app/(home)/free/loading.tsx
new file mode 100644
index 0000000000..08a4ed6b64
--- /dev/null
+++ b/surfsense_web/app/(home)/free/loading.tsx
@@ -0,0 +1,60 @@
+import { Skeleton } from "@/components/ui/skeleton";
+
+export default function FreeChatLoading() {
+	return (
+		<div className="min-h-screen pt-20">
+			<article className="container mx-auto px-4 pb-20">
+				{/* Breadcrumb */}
+				<div className="flex items-center gap-2 mb-8">
+					<Skeleton className="h-4 w-10" />
+					<Skeleton className="h-4 w-3" />
+					<Skeleton className="h-4 w-24" />
+				</div>
+
+				{/* Hero section */}
+				<section className="mt-8 text-center max-w-3xl mx-auto space-y-4">
+					<Skeleton className="h-12 w-3/4 mx-auto" />
+					<Skeleton className="h-12 w-2/3 mx-auto" />
+					<Skeleton className="h-5 w-full max-w-lg mx-auto" />
+					<Skeleton className="h-5 w-4/5 max-w-lg mx-auto" />
+					<div className="flex flex-wrap items-center justify-center gap-3 mt-6">
+						{Array.from({ length: 4 }).map((_, i) => (
+							<Skeleton key={i} className="h-8 w-28 rounded-full" />
+						))}
+					</div>
+				</section>
+
+				<div className="my-12 max-w-4xl mx-auto h-px bg-border" />
+
+				{/* Model table */}
+				<section className="max-w-4xl mx-auto">
+					<Skeleton className="h-7 w-64 mb-2" />
+					<Skeleton className="h-4 w-80 mb-6" />
+
+					<div className="overflow-hidden rounded-lg border">
+						{/* Table header */}
+						<div className="flex gap-4 px-4 py-3 bg-muted/50 border-b">
+							<Skeleton className="h-4 w-[45%]" />
+							<Skeleton className="h-4 w-24" />
+							<Skeleton className="h-4 w-16" />
+							<Skeleton className="h-4 w-20" />
+						</div>
+
+						{/* Table rows */}
+						{Array.from({ length: 8 }).map((_, i) => (
+							<div key={i} className="flex items-center gap-4 px-4 py-3 border-b last:border-0">
+								<div className="flex-1 space-y-1.5">
+									<Skeleton className="h-4 w-40" />
+									<Skeleton className="h-3 w-24" />
+								</div>
+								<Skeleton className="h-4 w-24" />
+								<Skeleton className="h-6 w-14 rounded-full" />
+								<Skeleton className="h-8 w-20 rounded-md" />
+							</div>
+						))}
+					</div>
+				</section>
+			</article>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/(home)/free/page.tsx b/surfsense_web/app/(home)/free/page.tsx
new file mode 100644
index 0000000000..cd75e79085
--- /dev/null
+++ b/surfsense_web/app/(home)/free/page.tsx
@@ -0,0 +1,387 @@
+import { SquareArrowOutUpRight } from "lucide-react";
+import type { Metadata } from "next";
+import Link from "next/link";
+import { BreadcrumbNav } from "@/components/seo/breadcrumb-nav";
+import { FAQJsonLd, JsonLd } from "@/components/seo/json-ld";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Separator } from "@/components/ui/separator";
+import {
+	Table,
+	TableBody,
+	TableCell,
+	TableHead,
+	TableHeader,
+	TableRow,
+} from "@/components/ui/table";
+import type { AnonModel } from "@/contracts/types/anonymous-chat.types";
+import { BACKEND_URL } from "@/lib/env-config";
+
+export const metadata: Metadata = {
+	title: "Free AI Chat, No Login Required | SurfSense",
+	description:
+		"Use ChatGPT free online without login. Chat with GPT-4, Claude AI, Gemini and more for free. No sign-up required. Open source NotebookLM alternative with free AI chat and document Q&A.",
+	keywords: [
+		"chatgpt free",
+		"chat gpt free",
+		"free chatgpt",
+		"free chat gpt",
+		"chatgpt online",
+		"chat gpt online",
+		"online chatgpt",
+		"chatgpt free online",
+		"chatgpt online free",
+		"chat gpt free online",
+		"chatgpt no login",
+		"chatgpt without login",
+		"chat gpt login free",
+		"chat gpt login",
+		"free chatgpt without login",
+		"free chatgpt no login",
+		"ai chat no login",
+		"ai chat without login",
+		"claude ai without login",
+		"claude no login",
+		"chatgpt for free",
+		"gpt chat free",
+		"claude ai free",
+		"claude free",
+		"free claude ai",
+		"free claude",
+		"chatgpt alternative free",
+		"free chatgpt alternative",
+		"chatgpt free alternative",
+		"free alternative to chatgpt",
+		"alternative to chatgpt free",
+		"ai like chatgpt",
+		"sites like chatgpt",
+		"free ai chatbot like chatgpt",
+		"free ai chatbots like chatgpt",
+		"apps like chatgpt for free",
+		"best free alternative to chatgpt",
+		"free ai apps",
+		"ai with no restrictions",
+		"notebooklm alternative",
+	],
+	alternates: {
+		canonical: "https://www.surfsense.com/free",
+	},
+	openGraph: {
+		title: "Free AI Chat, No Login Required | SurfSense",
+		description:
+			"Use ChatGPT free online without login. Chat with GPT-4, Claude AI, Gemini and 100+ AI models. Open source NotebookLM alternative.",
+		url: "https://www.surfsense.com/free",
+		siteName: "SurfSense",
+		type: "website",
+		images: [
+			{
+				url: "/og-image.png",
+				width: 1200,
+				height: 630,
+				alt: "SurfSense - ChatGPT Free Online, Claude AI Free, No Login Required",
+			},
+		],
+	},
+	twitter: {
+		card: "summary_large_image",
+		title: "Free AI Chat, No Login Required | SurfSense",
+		description:
+			"Use ChatGPT free online without login. Chat with GPT-4, Claude AI, Gemini and more. No sign-up needed.",
+		images: ["/og-image.png"],
+	},
+};
+
+async function getModels(): Promise<AnonModel[]> {
+	try {
+		const res = await fetch(`${BACKEND_URL}/api/v1/public/anon-chat/models`, {
+			next: { revalidate: 300 },
+		});
+		if (!res.ok) return [];
+		return res.json();
+	} catch {
+		return [];
+	}
+}
+
+const FAQ_ITEMS = [
+	{
+		question: "Can I use ChatGPT without login?",
+		answer:
+			"Yes. SurfSense lets you use ChatGPT without login or any sign-up. Just pick a model and start chatting. No email, no password, no account needed. You get 500,000 free tokens to use across ChatGPT, Claude AI, Gemini, and other models.",
+	},
+	{
+		question: "Is ChatGPT really free on SurfSense?",
+		answer:
+			"Yes. SurfSense gives you free access to ChatGPT (GPT-4), Claude AI, Gemini, and other models without login. You get 500,000 free tokens across any model with no sign-up required.",
+	},
+	{
+		question: "How do I use ChatGPT no login?",
+		answer:
+			"Go to any model page on SurfSense and start typing your message. There is no login wall, no account creation, and no verification step. ChatGPT no login works instantly in your browser.",
+	},
+	{
+		question: "What AI models can I use for free without login?",
+		answer:
+			"SurfSense offers free access without login to models from OpenAI (GPT-4, GPT-4 Turbo), Anthropic (Claude 3, Claude free), Google (Gemini), DeepSeek, Mistral, Llama, and more. All available as a free ChatGPT alternative online with no login required.",
+	},
+	{
+		question: "What happens after I use my free tokens?",
+		answer:
+			"After your free tokens, create a free SurfSense account to unlock $5 of premium credit. Additional credit can be topped up at $1 for $1 of credit, billed at the actual provider cost. Non-premium models remain unlimited for registered users.",
+	},
+	{
+		question: "Is Claude AI available without login?",
+		answer:
+			"Yes. You can use Claude AI free without login on SurfSense. Both Claude 3 and other Anthropic models are available with no sign-up, alongside ChatGPT and Gemini.",
+	},
+	{
+		question: "How is SurfSense different from ChatGPT?",
+		answer:
+			"SurfSense is an open source NotebookLM alternative that gives you access to multiple AI models in one place without login. Unlike ChatGPT alone, SurfSense includes document Q&A with citations, integrations with Slack, Google Drive, Notion, and Confluence, plus team collaboration features.",
+	},
+	{
+		question: "Is SurfSense a free ChatGPT alternative?",
+		answer:
+			"Yes. SurfSense is a free, open source alternative to ChatGPT that works without login. It gives you access to Claude AI free, Gemini, and other AI models alongside document Q&A with citations, team collaboration, and 30+ integrations.",
+	},
+	{
+		question: "Is my data private when using free AI chat without login?",
+		answer:
+			"Anonymous chat sessions are not stored in any database. No account means no personal data collected. SurfSense is open source, so you can self-host for complete data control and privacy.",
+	},
+];
+
+export default async function FreeHubPage() {
+	const models = await getModels();
+	const seoModels = models.filter((m) => m.seo_slug);
+
+	return (
+		<div className="min-h-screen pt-20">
+			<JsonLd
+				data={{
+					"@context": "https://schema.org",
+					"@type": "CollectionPage",
+					name: "ChatGPT Free Online Without Login - SurfSense",
+					description:
+						"Use ChatGPT, Claude AI, Gemini and more AI models free online without login or sign-up. Open source NotebookLM alternative with no login required.",
+					url: "https://www.surfsense.com/free",
+					isPartOf: { "@type": "WebSite", name: "SurfSense", url: "https://www.surfsense.com" },
+					mainEntity: {
+						"@type": "ItemList",
+						numberOfItems: seoModels.length,
+						itemListElement: seoModels.map((m, i) => ({
+							"@type": "ListItem",
+							position: i + 1,
+							name: m.name,
+							url: `https://www.surfsense.com/free/${m.seo_slug}`,
+						})),
+					},
+				}}
+			/>
+			<FAQJsonLd questions={FAQ_ITEMS} />
+
+			<article className="container mx-auto px-4 pb-20">
+				<BreadcrumbNav
+					items={[
+						{ name: "Home", href: "/" },
+						{ name: "Free AI Chat", href: "/free" },
+					]}
+				/>
+
+				{/* Hero */}
+				<section className="mt-8 text-center max-w-3xl mx-auto">
+					<h1 className="text-4xl md:text-5xl font-bold tracking-tight">
+						ChatGPT Free Online Without Login
+					</h1>
+					<p className="mt-4 text-lg text-muted-foreground max-w-2xl mx-auto leading-relaxed">
+						Use <strong>ChatGPT</strong>, <strong>Claude AI</strong>, <strong>Gemini</strong>, and
+						other AI models free online without login. No sign-up, no email, no password. Pick a
+						model and start chatting instantly.
+					</p>
+					<div className="flex flex-wrap items-center justify-center gap-3 mt-6">
+						<Badge variant="secondary" className="px-3 py-1.5 text-sm">
+							No login required
+						</Badge>
+						<Badge variant="secondary" className="px-3 py-1.5 text-sm">
+							500K free tokens
+						</Badge>
+						<Badge variant="secondary" className="px-3 py-1.5 text-sm">
+							{seoModels.length} AI models
+						</Badge>
+						<Badge variant="secondary" className="px-3 py-1.5 text-sm">
+							Open source
+						</Badge>
+					</div>
+				</section>
+
+				<Separator className="my-12 max-w-4xl mx-auto" />
+
+				{/* Model Table */}
+				{seoModels.length > 0 ? (
+					<section
+						className="max-w-4xl mx-auto"
+						aria-label="Free AI models available without login"
+					>
+						<h2 className="text-2xl font-bold mb-2">Free AI Models Available Without Login</h2>
+						<p className="text-sm text-muted-foreground mb-6">
+							All models below work without login or sign-up. Click any model to start a free AI
+							chat instantly.
+						</p>
+
+						<div className="overflow-hidden rounded-lg border">
+							<Table>
+								<TableHeader>
+									<TableRow>
+										<TableHead className="w-[45%]">Model</TableHead>
+										<TableHead>Provider</TableHead>
+										<TableHead>Tier</TableHead>
+										<TableHead className="text-right w-[100px]" />
+									</TableRow>
+								</TableHeader>
+								<TableBody>
+									{seoModels.map((model) => (
+										<TableRow key={model.id}>
+											<TableCell>
+												<Link
+													href={`/free/${model.seo_slug}`}
+													className="group flex flex-col gap-0.5"
+												>
+													<span className="font-medium group-hover:underline">{model.name}</span>
+													{model.description && (
+														<span className="text-xs text-muted-foreground line-clamp-1">
+															{model.description}
+														</span>
+													)}
+												</Link>
+											</TableCell>
+											<TableCell>
+												<Badge variant="outline">{model.provider}</Badge>
+											</TableCell>
+											<TableCell>
+												{model.is_premium ? (
+													<Badge className="bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300 border-0">
+														Premium
+													</Badge>
+												) : (
+													<Badge variant="secondary">Free</Badge>
+												)}
+											</TableCell>
+											<TableCell className="text-right">
+												<Button variant="ghost" size="sm" asChild>
+													<Link href={`/free/${model.seo_slug}`}>
+														Chat
+														<SquareArrowOutUpRight className="size-3" />
+													</Link>
+												</Button>
+											</TableCell>
+										</TableRow>
+									))}
+								</TableBody>
+							</Table>
+						</div>
+					</section>
+				) : (
+					<section className="mt-12 text-center max-w-4xl mx-auto">
+						<p className="text-muted-foreground">
+							No models are currently available. Please check back later.
+						</p>
+					</section>
+				)}
+
+				<Separator className="my-12 max-w-4xl mx-auto" />
+
+				{/* Why SurfSense */}
+				<section className="max-w-4xl mx-auto">
+					<h2 className="text-2xl font-bold mb-6">
+						Why Use SurfSense as Your Free ChatGPT Alternative
+					</h2>
+					<div className="grid grid-cols-1 md:grid-cols-3 gap-6">
+						<div className="rounded-lg border bg-card p-5">
+							<h3 className="font-semibold mb-1.5">Multiple AI Models in One Place</h3>
+							<p className="text-sm text-muted-foreground leading-relaxed">
+								Access ChatGPT, Claude AI free, Gemini, DeepSeek, and more. Works like sites like
+								ChatGPT but with all AI models available, not just GPT. A true free AI chatbot like
+								ChatGPT and beyond.
+							</p>
+						</div>
+						<div className="rounded-lg border bg-card p-5">
+							<h3 className="font-semibold mb-1.5">No Login, No Sign-Up Required</h3>
+							<p className="text-sm text-muted-foreground leading-relaxed">
+								Start using ChatGPT free online immediately. No email, no password, no verification.
+								Get ChatGPT no login access and Claude AI free access from one platform. AI with no
+								restrictions on which model you can use.
+							</p>
+						</div>
+						<div className="rounded-lg border bg-card p-5">
+							<h3 className="font-semibold mb-1.5">Open Source NotebookLM Alternative</h3>
+							<p className="text-sm text-muted-foreground leading-relaxed">
+								SurfSense is a free, open source NotebookLM alternative with document Q&A and
+								citations, integrations with Slack, Google Drive, Notion, and Confluence, plus team
+								collaboration and self-hosting support.
+							</p>
+						</div>
+					</div>
+				</section>
+
+				<Separator className="my-12 max-w-4xl mx-auto" />
+
+				{/* CTA */}
+				<section className="max-w-3xl mx-auto text-center">
+					<h2 className="text-2xl font-bold mb-3">Want More Features?</h2>
+					<p className="text-muted-foreground mb-6 leading-relaxed">
+						Create a free SurfSense account to unlock $5 of premium credit, document uploads with
+						citations, team collaboration, and integrations with Slack, Google Drive, Notion, and
+						30+ more tools.
+					</p>
+					<Button size="lg" asChild>
+						<Link href="/register">Create Free Account</Link>
+					</Button>
+				</section>
+
+				<Separator className="my-12 max-w-4xl mx-auto" />
+
+				{/* FAQ */}
+				<section className="max-w-3xl mx-auto">
+					<h2 className="text-2xl font-bold text-center mb-8">Frequently Asked Questions</h2>
+					<dl className="flex flex-col gap-4">
+						{FAQ_ITEMS.map((item) => (
+							<div key={item.question} className="rounded-lg border bg-card p-5">
+								<dt className="font-medium text-sm">{item.question}</dt>
+								<dd className="mt-2 text-sm text-muted-foreground leading-relaxed">
+									{item.answer}
+								</dd>
+							</div>
+						))}
+					</dl>
+				</section>
+
+				{/* Internal links */}
+				<nav aria-label="Related pages" className="mt-16 max-w-3xl mx-auto">
+					<h2 className="text-lg font-semibold mb-3">Explore SurfSense</h2>
+					<ul className="flex flex-wrap gap-2">
+						<li>
+							<Button variant="outline" size="sm" asChild>
+								<Link href="/pricing">Pricing</Link>
+							</Button>
+						</li>
+						<li>
+							<Button variant="outline" size="sm" asChild>
+								<Link href="/docs">Documentation</Link>
+							</Button>
+						</li>
+						<li>
+							<Button variant="outline" size="sm" asChild>
+								<Link href="/blog">Blog</Link>
+							</Button>
+						</li>
+						<li>
+							<Button variant="outline" size="sm" asChild>
+								<Link href="/register">Sign Up Free</Link>
+							</Button>
+						</li>
+					</ul>
+				</nav>
+			</article>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/(home)/layout.tsx b/surfsense_web/app/(home)/layout.tsx
index 3645d95129..f9fe64851b 100644
--- a/surfsense_web/app/(home)/layout.tsx
+++ b/surfsense_web/app/(home)/layout.tsx
@@ -11,6 +11,12 @@ export default function HomePageLayout({ children }: { children: React.ReactNode
 	// cookie-handoff redirect dance and should not flash any chrome.
 	const isSplashPage = pathname === "/";
 
+	const isFreeModelChat = /^\/free\/[^/]+$/.test(pathname);
+
+	if (isFreeModelChat) {
+		return <>{children}</>;
+	}
+
 	return (
 		<main className="min-h-screen bg-linear-to-b from-gray-50 to-gray-100 text-gray-900 dark:from-black dark:to-gray-900 dark:text-white overflow-x-hidden">
 			{!isSplashPage && <Navbar />}
diff --git a/surfsense_web/app/(home)/login/GoogleLoginButton.tsx b/surfsense_web/app/(home)/login/GoogleLoginButton.tsx
index f717761806..6e9bf65c28 100644
--- a/surfsense_web/app/(home)/login/GoogleLoginButton.tsx
+++ b/surfsense_web/app/(home)/login/GoogleLoginButton.tsx
@@ -1,14 +1,42 @@
 "use client";
-import { IconBrandGoogleFilled } from "@tabler/icons-react";
-import { motion } from "motion/react";
 import { useTranslations } from "next-intl";
-import { useEffect } from "react";
+import { useEffect, useState } from "react";
 import { Logo } from "@/components/Logo";
+import { Button } from "@/components/ui/button";
 import { trackLoginAttempt } from "@/lib/posthog/events";
 import { AmbientBackground } from "./AmbientBackground";
 
+function GoogleGLogo({ className }: { className?: string }) {
+	return (
+		<svg
+			className={className}
+			xmlns="http://www.w3.org/2000/svg"
+			viewBox="0 0 48 48"
+			aria-hidden="true"
+		>
+			<path
+				fill="#EA4335"
+				d="M24 9.5c3.54 0 6.71 1.22 9.21 3.6l6.85-6.85C35.9 2.38 30.47 0 24 0 14.62 0 6.51 5.38 2.56 13.22l7.98 6.19C12.43 13.72 17.74 9.5 24 9.5z"
+			/>
+			<path
+				fill="#4285F4"
+				d="M46.98 24.55c0-1.57-.15-3.09-.38-4.55H24v9.02h12.94c-.58 2.96-2.26 5.48-4.78 7.18l7.73 6c4.51-4.18 7.09-10.36 7.09-17.65z"
+			/>
+			<path
+				fill="#FBBC05"
+				d="M10.53 28.59c-.48-1.45-.76-2.99-.76-4.59s.27-3.14.76-4.59l-7.98-6.19C.92 16.46 0 20.12 0 24c0 3.88.92 7.54 2.56 10.78l7.97-6.19z"
+			/>
+			<path
+				fill="#34A853"
+				d="M24 48c6.48 0 11.93-2.13 15.89-5.81l-7.73-6c-2.15 1.45-4.92 2.3-8.16 2.3-6.26 0-11.57-4.22-13.47-9.91l-7.98 6.19C6.51 42.62 14.62 48 24 48z"
+			/>
+		</svg>
+	);
+}
+
 export function GoogleLoginButton() {
 	const t = useTranslations("auth");
+	const [isRedirecting, setIsRedirecting] = useState(false);
 
 	// Auto-redirect to proxy-login on mount — user is already authenticated
 	// via oauth2-proxy/Cognito so no button click is needed.
@@ -18,6 +46,8 @@ export function GoogleLoginButton() {
 	}, []);
 
 	const handleGoogleLogin = () => {
+		if (isRedirecting) return;
+		setIsRedirecting(true);
 		trackLoginAttempt("google");
 		window.location.href = `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/auth/jwt/proxy-login`;
 	};
@@ -74,21 +104,15 @@ export function GoogleLoginButton() {
 					</motion.div>
 				</motion.div> */}
 
-				<motion.button
-					whileHover={{ scale: 1.02 }}
-					whileTap={{ scale: 0.98 }}
-					className="group/btn relative flex w-full items-center justify-center space-x-2 rounded-lg bg-white px-6 py-3 md:py-4 text-neutral-700 shadow-lg transition-all duration-200 hover:shadow-xl dark:bg-neutral-800 dark:text-neutral-200"
+				<Button
+					variant="outline"
+					className="w-full max-w-md gap-2 rounded-lg border-white bg-white px-6 py-5 font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white md:py-5"
+					disabled={isRedirecting}
 					onClick={handleGoogleLogin}
 				>
-					<div className="absolute inset-0 h-full w-full transform opacity-0 transition duration-200 group-hover/btn:opacity-100">
-						<div className="absolute -left-px -top-px h-4 w-4 rounded-tl-lg border-l-2 border-t-2 border-blue-500 bg-transparent transition-all duration-200 group-hover/btn:-left-2 group-hover/btn:-top-2"></div>
-						<div className="absolute -right-px -top-px h-4 w-4 rounded-tr-lg border-r-2 border-t-2 border-blue-500 bg-transparent transition-all duration-200 group-hover/btn:-right-2 group-hover/btn:-top-2"></div>
-						<div className="absolute -bottom-px -left-px h-4 w-4 rounded-bl-lg border-b-2 border-l-2 border-blue-500 bg-transparent transition-all duration-200 group-hover/btn:-bottom-2 group-hover/btn:-left-2"></div>
-						<div className="absolute -bottom-px -right-px h-4 w-4 rounded-br-lg border-b-2 border-r-2 border-blue-500 bg-transparent transition-all duration-200 group-hover/btn:-bottom-2 group-hover/btn:-right-2"></div>
-					</div>
-					<IconBrandGoogleFilled className="h-5 w-5 text-neutral-700 dark:text-neutral-200" />
+					<GoogleGLogo className="h-5 w-5" />
 					<span className="text-base font-medium">{t("continue_with_google")}</span>
-				</motion.button>
+				</Button>
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/app/(home)/login/LocalLoginForm.tsx b/surfsense_web/app/(home)/login/LocalLoginForm.tsx
index 2dd203884c..fef2d51184 100644
--- a/surfsense_web/app/(home)/login/LocalLoginForm.tsx
+++ b/surfsense_web/app/(home)/login/LocalLoginForm.tsx
@@ -7,6 +7,7 @@ import { useRouter } from "next/navigation";
 import { useTranslations } from "next-intl";
 import { useState } from "react";
 import { loginMutationAtom } from "@/atoms/auth/auth-mutation.atoms";
+import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
 import { getAuthErrorDetails, isNetworkError } from "@/lib/auth-errors";
 import { setBearerToken } from "@/lib/auth-utils";
@@ -119,11 +120,13 @@ export function LocalLoginForm() {
 									<p className="text-sm font-semibold mb-1">{error.title}</p>
 									<p className="text-sm text-destructive">{error.message}</p>
 								</div>
-								<button
+								<Button
+									variant="ghost"
+									size="icon"
 									onClick={() => {
 										setError({ title: null, message: null });
 									}}
-									className="flex-shrink-0 text-destructive hover:text-destructive/90 transition-colors"
+									className="size-6 flex-shrink-0 text-destructive hover:bg-transparent hover:text-destructive/90"
 									aria-label="Dismiss error"
 									type="button"
 								>
@@ -142,7 +145,7 @@ export function LocalLoginForm() {
 										<line x1="18" y1="6" x2="6" y2="18" />
 										<line x1="6" y1="6" x2="18" y2="18" />
 									</svg>
-								</button>
+								</Button>
 							</div>
 						</motion.div>
 					)}
@@ -157,6 +160,7 @@ export function LocalLoginForm() {
 						type="email"
 						autoComplete="username"
 						required
+						maxLength={254}
 						placeholder="you@example.com"
 						value={username}
 						onChange={(e) => setUsername(e.target.value)}
@@ -173,7 +177,7 @@ export function LocalLoginForm() {
 					<label htmlFor="password" className="block text-sm font-medium text-foreground">
 						{t("password")}
 					</label>
-					<div className="relative">
+					<div className="relative mt-1">
 						<input
 							id="password"
 							type={showPassword ? "text" : "password"}
@@ -182,28 +186,30 @@ export function LocalLoginForm() {
 							placeholder="Enter your password"
 							value={password}
 							onChange={(e) => setPassword(e.target.value)}
-							className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${
+							className={`block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${
 								error.title
 									? "border-destructive focus:border-destructive focus:ring-destructive/40"
 									: "border-border focus:border-primary focus:ring-primary/40"
 							}`}
 							disabled={isLoggingIn}
 						/>
-						<button
+						<Button
 							type="button"
+							variant="ghost"
+							size="icon"
 							onClick={() => setShowPassword((prev) => !prev)}
-							className="absolute inset-y-0 right-0 flex items-center pr-3 mt-1 text-muted-foreground hover:text-foreground"
+							className="absolute inset-y-0 right-0 h-full w-10 text-muted-foreground hover:bg-transparent hover:text-foreground"
 							aria-label={showPassword ? t("hide_password") : t("show_password")}
 						>
 							{showPassword ? <EyeOff className="h-4 w-4" /> : <Eye className="h-4 w-4" />}
-						</button>
+						</Button>
 					</div>
 				</div>
 
-				<button
+				<Button
 					type="submit"
 					disabled={isLoggingIn}
-					className="relative w-full rounded-md bg-primary px-4 py-1.5 md:py-2 text-primary-foreground shadow-sm hover:bg-primary/90 focus:outline-none focus:ring-2 focus:ring-primary focus:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50 transition-all text-sm md:text-base flex items-center justify-center gap-2"
+					className="relative h-auto w-full px-4 py-1.5 text-sm md:py-2 md:text-base"
 				>
 					<span className={isLoggingIn ? "invisible" : ""}>{t("sign_in")}</span>
 					{isLoggingIn && (
@@ -211,7 +217,7 @@ export function LocalLoginForm() {
 							<Spinner size="sm" className="text-primary-foreground" />
 						</span>
 					)}
-				</button>
+				</Button>
 			</form>
 
 			{authType === "LOCAL" && (
diff --git a/surfsense_web/app/(home)/login/page.tsx b/surfsense_web/app/(home)/login/page.tsx
index 9961df3d8a..a13f2644c6 100644
--- a/surfsense_web/app/(home)/login/page.tsx
+++ b/surfsense_web/app/(home)/login/page.tsx
@@ -6,9 +6,10 @@ import { useTranslations } from "next-intl";
 import { Suspense, useEffect, useState } from "react";
 import { toast } from "sonner";
 import { Logo } from "@/components/Logo";
+import { Button } from "@/components/ui/button";
 import { useGlobalLoadingEffect } from "@/hooks/use-global-loading";
 import { getAuthErrorDetails, shouldRetry } from "@/lib/auth-errors";
-import { getBearerToken } from "@/lib/auth-utils";
+import { getBearerToken, setRedirectPath } from "@/lib/auth-utils";
 import { AUTH_TYPE, isSSOAuth } from "@/lib/env-config";
 import { AmbientBackground } from "./AmbientBackground";
 import { GoogleLoginButton } from "./GoogleLoginButton";
@@ -58,7 +59,7 @@ function LoginContent() {
 		// Save returnUrl to localStorage so it persists through OAuth flows (e.g., Google)
 		// This is read by TokenHandler after successful authentication
 		if (returnUrl) {
-			localStorage.setItem("surfsense_redirect_path", decodeURIComponent(returnUrl));
+			setRedirectPath(decodeURIComponent(returnUrl));
 		}
 
 		// Show registration success message
@@ -189,10 +190,12 @@ function LoginContent() {
 									<p className="text-sm font-semibold mb-1">{urlError.title}</p>
 									<p className="text-sm text-red-700 dark:text-red-300">{urlError.message}</p>
 								</div>
-								<button
+								<Button
 									type="button"
+									variant="ghost"
+									size="icon"
 									onClick={() => setUrlError(null)}
-									className="flex-shrink-0 text-red-500 hover:text-red-700 dark:text-red-400 dark:hover:text-red-200 transition-colors"
+									className="size-6 flex-shrink-0 text-red-500 hover:bg-transparent hover:text-red-700 dark:text-red-400 dark:hover:text-red-200"
 									aria-label="Dismiss error"
 								>
 									<svg
@@ -210,7 +213,7 @@ function LoginContent() {
 										<line x1="18" y1="6" x2="6" y2="18" />
 										<line x1="6" y1="6" x2="18" y2="18" />
 									</svg>
-								</button>
+								</Button>
 							</div>
 						</motion.div>
 					)}
diff --git a/surfsense_web/app/(home)/pricing/page.tsx b/surfsense_web/app/(home)/pricing/page.tsx
index 3236e54df6..b343a38530 100644
--- a/surfsense_web/app/(home)/pricing/page.tsx
+++ b/surfsense_web/app/(home)/pricing/page.tsx
@@ -1,14 +1,27 @@
 import type { Metadata } from "next";
 import PricingBasic from "@/components/pricing/pricing-section";
+import { BreadcrumbNav } from "@/components/seo/breadcrumb-nav";
 
 export const metadata: Metadata = {
-	title: "Pricing | SurfSense",
-	description: "Explore SurfSense plans and pricing options.",
+	title: "Pricing | SurfSense - Free AI Search Plans",
+	description:
+		"Explore SurfSense plans and pricing. Start free with 500 pages & $5 in premium credits. Use ChatGPT, Claude AI, and premium AI models. Pay as you go at provider cost.",
+	alternates: {
+		canonical: "https://www.surfsense.com/pricing",
+	},
 };
 
 const page = () => {
 	return (
 		<div>
+			<div className="container mx-auto pt-24 px-4">
+				<BreadcrumbNav
+					items={[
+						{ name: "Home", href: "/" },
+						{ name: "Pricing", href: "/pricing" },
+					]}
+				/>
+			</div>
 			<PricingBasic />
 		</div>
 	);
diff --git a/surfsense_web/app/(home)/privacy/page.tsx b/surfsense_web/app/(home)/privacy/page.tsx
index 970f50e278..1bc939a1f6 100644
--- a/surfsense_web/app/(home)/privacy/page.tsx
+++ b/surfsense_web/app/(home)/privacy/page.tsx
@@ -3,6 +3,9 @@ import type { Metadata } from "next";
 export const metadata: Metadata = {
 	title: "Privacy Policy | SurfSense",
 	description: "Privacy Policy for SurfSense application",
+	alternates: {
+		canonical: "https://www.surfsense.com/privacy",
+	},
 };
 
 export default function PrivacyPolicy() {
diff --git a/surfsense_web/app/(home)/register/page.tsx b/surfsense_web/app/(home)/register/page.tsx
index b5d9c10006..23b30edb56 100644
--- a/surfsense_web/app/(home)/register/page.tsx
+++ b/surfsense_web/app/(home)/register/page.tsx
@@ -9,6 +9,7 @@ import { useEffect, useState } from "react";
 import { type ExternalToast, toast } from "sonner";
 import { registerMutationAtom } from "@/atoms/auth/auth-mutation.atoms";
 import { Logo } from "@/components/Logo";
+import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
 import { getAuthErrorDetails, isNetworkError, shouldRetry } from "@/lib/auth-errors";
 import { getBearerToken } from "@/lib/auth-utils";
@@ -224,11 +225,13 @@ export default function RegisterPage() {
 											<p className="text-sm font-semibold mb-1">{error.title}</p>
 											<p className="text-sm text-red-700 dark:text-red-300">{error.message}</p>
 										</div>
-										<button
+										<Button
+											variant="ghost"
+											size="icon"
 											onClick={() => {
 												setError({ title: null, message: null });
 											}}
-											className="flex-shrink-0 text-red-500 hover:text-red-700 dark:text-red-400 dark:hover:text-red-200 transition-colors"
+											className="size-6 flex-shrink-0 text-red-500 hover:bg-transparent hover:text-red-700 dark:text-red-400 dark:hover:text-red-200"
 											aria-label="Dismiss error"
 											type="button"
 										>
@@ -247,7 +250,7 @@ export default function RegisterPage() {
 												<line x1="18" y1="6" x2="6" y2="18" />
 												<line x1="6" y1="6" x2="18" y2="18" />
 											</svg>
-										</button>
+										</Button>
 									</div>
 								</motion.div>
 							)}
@@ -262,6 +265,7 @@ export default function RegisterPage() {
 								type="email"
 								autoComplete="email"
 								required
+								maxLength={254}
 								placeholder="you@example.com"
 								value={email}
 								onChange={(e) => setEmail(e.target.value)}
@@ -319,18 +323,18 @@ export default function RegisterPage() {
 							/>
 						</div>
 
-						<button
+						<Button
 							type="submit"
 							disabled={isRegistering}
-							className="relative w-full rounded-md bg-primary px-4 py-1.5 md:py-2 text-primary-foreground shadow-sm hover:bg-primary/90 focus:outline-none focus:ring-1 focus:ring-primary/40 disabled:cursor-not-allowed disabled:opacity-50 transition-all text-sm md:text-base flex items-center justify-center gap-2"
+							className="relative h-auto w-full px-4 py-1.5 text-sm md:py-2 md:text-base"
 						>
 							<span className={isRegistering ? "invisible" : ""}>{t("register")}</span>
 							{isRegistering && (
 								<span className="absolute inset-0 flex items-center justify-center gap-2">
-									<Spinner size="sm" className="text-white" />
+									<Spinner size="sm" className="text-primary-foreground" />
 								</span>
 							)}
-						</button>
+						</Button>
 					</form>
 
 					<div className="mt-4 text-center text-sm">
diff --git a/surfsense_web/app/(home)/terms/page.tsx b/surfsense_web/app/(home)/terms/page.tsx
index dbc4c02536..5ec14aeaa1 100644
--- a/surfsense_web/app/(home)/terms/page.tsx
+++ b/surfsense_web/app/(home)/terms/page.tsx
@@ -3,6 +3,9 @@ import type { Metadata } from "next";
 export const metadata: Metadata = {
 	title: "Terms of Service | SurfSense",
 	description: "Terms of Service for SurfSense application",
+	alternates: {
+		canonical: "https://www.surfsense.com/terms",
+	},
 };
 
 export default function TermsOfService() {
diff --git a/surfsense_web/app/api/v1/[...path]/route.ts b/surfsense_web/app/api/v1/[...path]/route.ts
new file mode 100644
index 0000000000..418bf1a335
--- /dev/null
+++ b/surfsense_web/app/api/v1/[...path]/route.ts
@@ -0,0 +1,70 @@
+import type { NextRequest } from "next/server";
+
+export const dynamic = "force-dynamic";
+
+const HOP_BY_HOP_HEADERS = new Set([
+	"connection",
+	"keep-alive",
+	"proxy-authenticate",
+	"proxy-authorization",
+	"te",
+	"trailer",
+	"transfer-encoding",
+	"upgrade",
+]);
+
+function getBackendBaseUrl() {
+	const base = process.env.FASTAPI_BACKEND_INTERNAL_URL || "http://localhost:8000";
+	return base.endsWith("/") ? base.slice(0, -1) : base;
+}
+
+function toUpstreamHeaders(headers: Headers) {
+	const nextHeaders = new Headers(headers);
+	nextHeaders.delete("host");
+	nextHeaders.delete("content-length");
+	return nextHeaders;
+}
+
+function toClientHeaders(headers: Headers) {
+	const nextHeaders = new Headers(headers);
+	for (const header of HOP_BY_HOP_HEADERS) {
+		nextHeaders.delete(header);
+	}
+	return nextHeaders;
+}
+
+async function proxy(request: NextRequest, context: { params: Promise<{ path?: string[] }> }) {
+	const params = await context.params;
+	const path = params.path?.join("/") || "";
+	const upstreamUrl = new URL(`${getBackendBaseUrl()}/api/v1/${path}`);
+	upstreamUrl.search = request.nextUrl.search;
+
+	const hasBody = request.method !== "GET" && request.method !== "HEAD";
+
+	const response = await fetch(upstreamUrl, {
+		method: request.method,
+		headers: toUpstreamHeaders(request.headers),
+		body: hasBody ? request.body : undefined,
+		// `duplex: "half"` is required by the Fetch spec when streaming a
+		// ReadableStream as the request body. Avoids buffering uploads in heap.
+		// @ts-expect-error - `duplex` is not yet in lib.dom RequestInit types.
+		duplex: hasBody ? "half" : undefined,
+		redirect: "manual",
+	});
+
+	return new Response(response.body, {
+		status: response.status,
+		statusText: response.statusText,
+		headers: toClientHeaders(response.headers),
+	});
+}
+
+export {
+	proxy as GET,
+	proxy as POST,
+	proxy as PUT,
+	proxy as PATCH,
+	proxy as DELETE,
+	proxy as OPTIONS,
+	proxy as HEAD,
+};
diff --git a/surfsense_web/app/apple-icon.png b/surfsense_web/app/apple-icon.png
new file mode 100644
index 0000000000..dbda551dae
Binary files /dev/null and b/surfsense_web/app/apple-icon.png differ
diff --git a/surfsense_web/app/dashboard/[search_space_id]/buy-more/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/buy-more/page.tsx
new file mode 100644
index 0000000000..74bcaff2ef
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/buy-more/page.tsx
@@ -0,0 +1,54 @@
+"use client";
+
+import { motion } from "motion/react";
+import { useState } from "react";
+import { BuyPagesContent } from "@/components/settings/buy-pages-content";
+import { BuyTokensContent } from "@/components/settings/buy-tokens-content";
+import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
+
+const TABS = [
+	{ id: "pages", label: "Pages" },
+	{ id: "tokens", label: "Premium Credit" },
+] as const;
+
+type TabId = (typeof TABS)[number]["id"];
+
+export default function BuyMorePage() {
+	const [activeTab, setActiveTab] = useState<TabId>("pages");
+
+	return (
+		<motion.div
+			initial={{ opacity: 0, y: 20 }}
+			animate={{ opacity: 1, y: 0 }}
+			transition={{ duration: 0.3 }}
+			className="w-full select-none"
+		>
+			<Tabs
+				value={activeTab}
+				onValueChange={(value) => {
+					setActiveTab(value as TabId);
+				}}
+				className="relative min-h-[37rem] w-full"
+			>
+				<TabsList className="absolute top-20 left-1/2 -translate-x-1/2 rounded-xl bg-accent p-1">
+					{TABS.map((tab) => (
+						<TabsTrigger
+							key={tab.id}
+							value={tab.id}
+							className="h-8 rounded-lg px-4 text-sm font-semibold text-accent-foreground transition-colors hover:bg-transparent hover:text-white data-[state=active]:bg-[#4a4a4a] data-[state=active]:text-white data-[state=active]:shadow-none"
+						>
+							{tab.label}
+						</TabsTrigger>
+					))}
+				</TabsList>
+
+				<TabsContent value="pages" className="mt-0 flex min-h-[37rem] items-center pt-14">
+					<BuyPagesContent />
+				</TabsContent>
+				<TabsContent value="tokens" className="mt-0 flex min-h-[37rem] items-center pt-14">
+					<BuyTokensContent />
+				</TabsContent>
+			</Tabs>
+		</motion.div>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/buy-pages/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/buy-pages/page.tsx
index 6cab15d08c..f2439da44c 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/buy-pages/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/buy-pages/page.tsx
@@ -1,19 +1,16 @@
 "use client";
 
-import { motion } from "motion/react";
-import { BuyPagesContent } from "@/components/settings/buy-pages-content";
+import { useParams, useRouter } from "next/navigation";
+import { useEffect } from "react";
 
 export default function BuyPagesPage() {
-	return (
-		<div className="flex min-h-[calc(100vh-64px)] select-none items-center justify-center px-4 py-8">
-			<motion.div
-				initial={{ opacity: 0, y: 20 }}
-				animate={{ opacity: 1, y: 0 }}
-				transition={{ duration: 0.3 }}
-				className="w-full max-w-md space-y-6"
-			>
-				<BuyPagesContent />
-			</motion.div>
-		</div>
-	);
+	const router = useRouter();
+	const params = useParams();
+	const searchSpaceId = params?.search_space_id ?? "";
+
+	useEffect(() => {
+		router.replace(`/dashboard/${searchSpaceId}/buy-more`);
+	}, [router, searchSpaceId]);
+
+	return null;
 }
diff --git a/surfsense_web/app/dashboard/[search_space_id]/buy-tokens/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/buy-tokens/page.tsx
new file mode 100644
index 0000000000..cfdbd39385
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/buy-tokens/page.tsx
@@ -0,0 +1,16 @@
+"use client";
+
+import { useParams, useRouter } from "next/navigation";
+import { useEffect } from "react";
+
+export default function BuyTokensPage() {
+	const router = useRouter();
+	const params = useParams();
+	const searchSpaceId = params?.search_space_id ?? "";
+
+	useEffect(() => {
+		router.replace(`/dashboard/${searchSpaceId}/buy-more`);
+	}, [router, searchSpaceId]);
+
+	return null;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx
index eceb462314..759539ce33 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx
@@ -6,6 +6,7 @@ import { useTranslations } from "next-intl";
 import type React from "react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { toast } from "sonner";
+import { pendingUserImageDataUrlsAtom } from "@/atoms/chat/pending-user-images.atom";
 import { myAccessAtom } from "@/atoms/members/members-query.atoms";
 import { updateLLMPreferencesMutationAtom } from "@/atoms/new-llm-config/new-llm-config-mutation.atoms";
 import {
@@ -33,6 +34,7 @@ export function DashboardClientLayout({
 	const pathname = usePathname();
 	const { search_space_id } = useParams();
 	const setActiveSearchSpaceIdState = useSetAtom(activeSearchSpaceIdAtom);
+	const setPendingUserImageUrls = useSetAtom(pendingUserImageDataUrlsAtom);
 
 	const {
 		data: preferences = {},
@@ -142,6 +144,27 @@ export function DashboardClientLayout({
 
 	const electronAPI = useElectronAPI();
 
+	useEffect(() => {
+		const htmlBackground = document.documentElement.style.backgroundColor;
+		const bodyBackground = document.body.style.backgroundColor;
+
+		document.documentElement.style.backgroundColor = "var(--panel)";
+		document.body.style.backgroundColor = "var(--panel)";
+
+		return () => {
+			document.documentElement.style.backgroundColor = htmlBackground;
+			document.body.style.backgroundColor = bodyBackground;
+		};
+	}, []);
+
+	useEffect(() => {
+		if (!electronAPI?.onChatScreenCapture) return;
+		return electronAPI.onChatScreenCapture((dataUrl: string) => {
+			if (typeof dataUrl !== "string" || !dataUrl.startsWith("data:image/")) return;
+			setPendingUserImageUrls((prev) => [...prev, dataUrl]);
+		});
+	}, [electronAPI, setPendingUserImageUrls]);
+
 	useEffect(() => {
 		const activeSeacrhSpaceId =
 			typeof search_space_id === "string"
@@ -153,12 +176,13 @@ export function DashboardClientLayout({
 		setActiveSearchSpaceIdState(activeSeacrhSpaceId);
 
 		// Sync to Electron store if stored value is null (first navigation)
-		if (electronAPI?.setActiveSearchSpace) {
+		if (electronAPI?.getActiveSearchSpace && electronAPI.setActiveSearchSpace) {
+			const setActiveSearchSpace = electronAPI.setActiveSearchSpace;
 			electronAPI
-				.getActiveSearchSpace?.()
-				.then((stored) => {
+				.getActiveSearchSpace()
+				.then((stored: string | null) => {
 					if (!stored) {
-						electronAPI.setActiveSearchSpace!(activeSeacrhSpaceId);
+						setActiveSearchSpace(activeSeacrhSpaceId);
 					}
 				})
 				.catch(() => {});
diff --git a/surfsense_web/app/dashboard/[search_space_id]/logs/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/logs/(manage)/page.tsx
index c5be2b5903..dea1e16c26 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/logs/(manage)/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/logs/(manage)/page.tsx
@@ -17,7 +17,6 @@ import {
 } from "@tanstack/react-table";
 import { useAtomValue } from "jotai";
 import {
-	Activity,
 	AlertCircle,
 	AlertTriangle,
 	Bug,
@@ -38,6 +37,7 @@ import {
 	RefreshCw,
 	Terminal,
 	Trash,
+	Workflow,
 	X,
 	Zap,
 } from "lucide-react";
@@ -133,7 +133,6 @@ const logStatusConfig = {
 function MessageDetails({
 	message,
 	taskName,
-	metadata,
 	createdAt,
 	children,
 }: {
@@ -623,7 +622,7 @@ function LogsSummaryDashboard({
 				<Card>
 					<CardHeader className="flex flex-row items-center justify-between space-y-0 pb-2">
 						<CardTitle className="text-sm font-medium">{t("total_logs")}</CardTitle>
-						<Activity className="h-4 w-4 text-muted-foreground" />
+						<Workflow className="h-4 w-4 text-muted-foreground" />
 					</CardHeader>
 					<CardContent>
 						<div className="text-2xl font-bold">{summary.total_logs}</div>
@@ -739,7 +738,7 @@ function LogsFilters({
 					</div>
 					{Boolean(filterInput) && (
 						<Button
-							className="absolute inset-y-0 end-0 flex h-full w-9 items-center justify-center rounded-e-lg text-muted-foreground/80 hover:text-foreground"
+							className="absolute inset-y-0 end-0 flex h-full w-9 items-center justify-center rounded-e-lg text-muted-foreground/80 hover:text-accent-foreground"
 							variant="ghost"
 							size="icon"
 							onClick={() => {
@@ -1045,7 +1044,7 @@ function LogsTable({
 										}}
 										exit={{ opacity: 0, y: -10 }}
 										className={cn(
-											"border-b transition-colors hover:bg-muted/50",
+											"border-b transition-colors hover:bg-accent hover:text-accent-foreground",
 											row.getIsSelected() ? "bg-muted/50" : ""
 										)}
 									>
diff --git a/surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx b/surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx
index e8ea648900..e734d4e530 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/logs/loading.tsx
@@ -53,7 +53,10 @@ export default function Loading() {
 
 				{/* Table Rows */}
 				{[...Array(6)].map((_, i) => (
-					<div key={i} className="border-b px-4 py-3 flex items-center gap-4 hover:bg-muted/50">
+					<div
+						key={i}
+						className="border-b px-4 py-3 flex items-center gap-4 hover:bg-accent hover:text-accent-foreground"
+					>
 						<Skeleton className="h-4 w-4" />
 						<Skeleton className="h-6 w-12 rounded-full" />
 						<Skeleton className="h-6 w-16 rounded-full" />
diff --git a/surfsense_web/app/dashboard/[search_space_id]/more-pages/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/more-pages/page.tsx
index 448706caf3..4b3301b9ff 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/more-pages/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/more-pages/page.tsx
@@ -1,19 +1,11 @@
 "use client";
 
-import { motion } from "motion/react";
 import { MorePagesContent } from "@/components/settings/more-pages-content";
 
 export default function MorePagesPage() {
 	return (
-		<div className="flex min-h-[calc(100vh-64px)] select-none items-center justify-center px-4 py-8">
-			<motion.div
-				initial={{ opacity: 0, y: 20 }}
-				animate={{ opacity: 1, y: 0 }}
-				transition={{ duration: 0.3 }}
-				className="w-full max-w-md space-y-6"
-			>
-				<MorePagesContent />
-			</motion.div>
+		<div className="w-full select-none space-y-6">
+			<MorePagesContent />
 		</div>
 	);
 }
diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
index 2f8a285c15..d439cbb670 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
@@ -13,6 +13,7 @@ import { useParams } from "next/navigation";
 import { useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { toast } from "sonner";
 import { z } from "zod";
+import { agentFlagsAtom } from "@/atoms/agent/agent-flags-query.atom";
 import { disabledToolsAtom } from "@/atoms/agent-tools/agent-tools.atoms";
 import {
 	clearTargetCommentIdAtom,
@@ -24,40 +25,74 @@ import {
 	mentionedDocumentIdsAtom,
 	mentionedDocumentsAtom,
 	messageDocumentsMapAtom,
-	sidebarSelectedDocumentsAtom,
 } from "@/atoms/chat/mentioned-documents.atom";
+import { pendingUserImageDataUrlsAtom } from "@/atoms/chat/pending-user-images.atom";
 import {
 	clearPlanOwnerRegistry,
 	// extractWriteTodosFromContent,
 } from "@/atoms/chat/plan-state.atom";
+import { setPremiumAlertForThreadAtom } from "@/atoms/chat/premium-alert.atom";
 import { closeReportPanelAtom } from "@/atoms/chat/report-panel.atom";
 import { type AgentCreatedDocument, agentCreatedDocumentsAtom } from "@/atoms/documents/ui.atoms";
 import { closeEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { membersAtom } from "@/atoms/members/members-query.atoms";
 import { removeChatTabAtom, updateChatTabTitleAtom } from "@/atoms/tabs/tabs.atom";
 import { currentUserAtom } from "@/atoms/user/user-query.atoms";
-import { ThinkingStepsDataUI } from "@/components/assistant-ui/thinking-steps";
+import {
+	EditMessageDialog,
+	type EditMessageDialogChoice,
+} from "@/components/assistant-ui/edit-message-dialog";
+import { StepSeparatorDataUI } from "@/components/assistant-ui/step-separator";
 import { Thread } from "@/components/assistant-ui/thread";
+import {
+	createTokenUsageStore,
+	type TokenUsageData,
+	TokenUsageProvider,
+} from "@/components/assistant-ui/token-usage-context";
+import { Button } from "@/components/ui/button";
+import {
+	type HitlDecision,
+	PendingInterruptProvider,
+	type PendingInterruptState,
+} from "@/features/chat-messages/hitl";
+import { TimelineDataUI } from "@/features/chat-messages/timeline";
+import {
+	applyActionLogSse,
+	applyActionLogUpdatedSse,
+	markActionRevertedInCache,
+	useAgentActionsQuery,
+} from "@/hooks/use-agent-actions-query";
 import { useChatSessionStateSync } from "@/hooks/use-chat-session-state";
 import { useMessagesSync } from "@/hooks/use-messages-sync";
+import { getAgentFilesystemSelection } from "@/lib/agent-filesystem";
 import { documentsApiService } from "@/lib/apis/documents-api.service";
-import { authenticatedFetch, getBearerToken } from "@/lib/auth-utils";
-import { convertToThreadMessage } from "@/lib/chat/message-utils";
+import { getBearerToken } from "@/lib/auth-utils";
+import { type ChatFlow, classifyChatError } from "@/lib/chat/chat-error-classifier";
+import { tagPreAcceptSendFailure, toHttpResponseError } from "@/lib/chat/chat-request-errors";
+import {
+	convertToThreadMessage,
+	reconcileInterruptedAssistantMessages,
+} from "@/lib/chat/message-utils";
 import {
 	isPodcastGenerating,
 	looksLikePodcastRequest,
 	setActivePodcastTaskId,
 } from "@/lib/chat/podcast-state";
+import { createStreamFlushHelpers } from "@/lib/chat/stream-flush";
+import { consumeSseEvents, processSharedStreamEvent } from "@/lib/chat/stream-pipeline";
+import {
+	applyTurnIdToAssistantMessageList,
+	mergeChatTurnIdIntoMessage,
+	readStreamedChatTurnId,
+	readStreamedMessageId,
+} from "@/lib/chat/stream-side-effects";
 import {
 	addToolCall,
-	appendText,
-	buildContentForPersistence,
 	buildContentForUI,
 	type ContentPartsState,
-	FrameBatchedUpdater,
-	readSSEStream,
+	type FrameBatchedUpdater,
 	type ThinkingStepData,
-	updateThinkingSteps,
+	type ToolUIGate,
 	updateToolCall,
 } from "@/lib/chat/streaming-state";
 import {
@@ -66,12 +101,19 @@ import {
 	getRegenerateUrl,
 	getThreadFull,
 	getThreadMessages,
+	type ThreadListItem,
+	type ThreadListResponse,
 	type ThreadRecord,
 } from "@/lib/chat/thread-persistence";
+import {
+	extractUserTurnForNewChatApi,
+	type NewChatUserImagePayload,
+} from "@/lib/chat/user-turn-api-parts";
 import { NotFoundError } from "@/lib/error";
 import {
+	trackChatBlocked,
 	trackChatCreated,
-	trackChatError,
+	trackChatErrorDetailed,
 	trackChatMessageSent,
 	trackChatResponseReceived,
 } from "@/lib/posthog/events";
@@ -86,7 +128,7 @@ const MobileEditorPanel = dynamic(
 );
 const MobileHitlEditPanel = dynamic(
 	() =>
-		import("@/components/hitl-edit-panel/hitl-edit-panel").then((m) => ({
+		import("@/features/chat-messages/hitl").then((m) => ({
 			default: m.MobileHitlEditPanel,
 		})),
 	{ ssr: false }
@@ -100,31 +142,75 @@ const MobileReportPanel = dynamic(
 );
 
 /**
- * After a tool produces output, mark any previously-decided interrupt tool
- * calls as completed so the ApprovalCard can transition from shimmer to done.
+ * Generate a synthetic ``toolCallId`` for an action_request that has no
+ * matching streamed tool-call card (HITL-blocked subagent calls don't surface
+ * as tool-call events). Suffixes a counter when the base id is already taken
+ * — sequential interrupts for the same tool name otherwise collide on
+ * ``interrupt-${name}-${i}`` and crash assistant-ui with a duplicate-key error.
  */
-function markInterruptsCompleted(contentParts: Array<{ type: string; result?: unknown }>): void {
-	for (const part of contentParts) {
-		if (
-			part.type === "tool-call" &&
-			typeof part.result === "object" &&
-			part.result !== null &&
-			(part.result as Record<string, unknown>).__interrupt__ === true &&
-			(part.result as Record<string, unknown>).__decided__ &&
-			!(part.result as Record<string, unknown>).__completed__
-		) {
-			part.result = { ...(part.result as Record<string, unknown>), __completed__: true };
+function freshSynthToolCallId(
+	toolCallIndices: Map<string, number>,
+	toolName: string,
+	index: number
+): string {
+	const base = `interrupt-${toolName}-${index}`;
+	if (!toolCallIndices.has(base)) return base;
+	let n = 1;
+	while (toolCallIndices.has(`${base}-${n}`)) n++;
+	return `${base}-${n}`;
+}
+
+/**
+ * Pair each ``action_request`` to a unique pending tool-call card, preserving
+ * order so ``decisions[i]`` lines up with ``action_requests[i]`` on the wire.
+ *
+ * Same-name bundles (e.g. three ``create_jira_issue``) used to collapse onto
+ * one card because the matcher keyed by name; this consumes each card via the
+ * ``claimed`` set and walks forward in DOM order.
+ */
+function pairBundleToolCallIds(
+	toolCallIndices: Map<string, number>,
+	contentParts: Array<{
+		type: string;
+		toolName?: string;
+		result?: unknown;
+	}>,
+	actionRequests: ReadonlyArray<{ name: string }>
+): Array<string | null> {
+	const claimed = new Set<string>();
+	const paired: Array<string | null> = [];
+	for (const action of actionRequests) {
+		let matched: string | null = null;
+		for (const [tcId, idx] of toolCallIndices) {
+			if (claimed.has(tcId)) continue;
+			const part = contentParts[idx];
+			if (!part || part.type !== "tool-call" || part.toolName !== action.name) continue;
+			const result = part.result as Record<string, unknown> | undefined | null;
+			if (result == null || (result.__interrupt__ === true && !result.__decided__)) {
+				matched = tcId;
+				claimed.add(tcId);
+				break;
+			}
 		}
+		paired.push(matched);
 	}
+	return paired;
 }
 
 /**
- * Zod schema for mentioned document info (for type-safe parsing)
+ * Zod schema for mentioned document info (for type-safe parsing).
+ *
+ * ``kind`` defaults to ``"doc"`` so messages persisted before folder
+ * mentions existed deserialise unchanged.
  */
 const MentionedDocumentInfoSchema = z.object({
 	id: z.number(),
 	title: z.string(),
 	document_type: z.string(),
+	kind: z
+		.union([z.literal("doc"), z.literal("folder")])
+		.optional()
+		.default("doc"),
 });
 
 const MentionedDocumentsPartSchema = z.object({
@@ -149,43 +235,30 @@ function extractMentionedDocuments(content: unknown): MentionedDocumentInfo[] {
 }
 
 /**
- * Tools that should render custom UI in the chat.
+ * Every tool call renders a card. The legacy
+ * ``BASE_TOOLS_WITH_UI`` allowlist used to drop unknown tool calls on the
+ * floor; we now route everything through ``ToolFallback``. Persisted
+ * payload size stays bounded because the backend's
+ * ``format_thinking_step`` summarisation and the
+ * ``result_length``-only default for unknown tools (see
+ * ``stream_new_chat.py``) keep the JSON from ballooning.
  */
-const TOOLS_WITH_UI = new Set([
-	"web_search",
-	"generate_podcast",
-	"generate_report",
-	"generate_video_presentation",
-	"display_image",
-	"generate_image",
-	"delete_notion_page",
-	"create_notion_page",
-	"update_notion_page",
-	"create_linear_issue",
-	"update_linear_issue",
-	"delete_linear_issue",
-	"create_google_drive_file",
-	"delete_google_drive_file",
-	"create_onedrive_file",
-	"delete_onedrive_file",
-	"create_dropbox_file",
-	"delete_dropbox_file",
-	"create_calendar_event",
-	"update_calendar_event",
-	"delete_calendar_event",
-	"create_gmail_draft",
-	"update_gmail_draft",
-	"send_gmail_email",
-	"trash_gmail_email",
-	"create_jira_issue",
-	"update_jira_issue",
-	"delete_jira_issue",
-	"create_confluence_page",
-	"update_confluence_page",
-	"delete_confluence_page",
-	"execute",
-	// "write_todos", // Disabled for now
-]);
+const TOOLS_WITH_UI_ALL: ToolUIGate = "all";
+const TURN_CANCELLING_INITIAL_DELAY_MS = 200;
+const TURN_CANCELLING_BACKOFF_FACTOR = 2;
+const TURN_CANCELLING_MAX_DELAY_MS = 1500;
+const RECENT_CANCEL_WINDOW_MS = 5_000;
+
+function sleep(ms: number): Promise<void> {
+	return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+function computeFallbackTurnCancellingRetryDelay(attempt: number): number {
+	const safeAttempt = Math.max(1, attempt);
+	const raw =
+		TURN_CANCELLING_INITIAL_DELAY_MS * TURN_CANCELLING_BACKOFF_FACTOR ** (safeAttempt - 1);
+	return Math.min(raw, TURN_CANCELLING_MAX_DELAY_MS);
+}
 
 export default function NewChatPage() {
 	const params = useParams();
@@ -195,24 +268,88 @@ export default function NewChatPage() {
 	const [currentThread, setCurrentThread] = useState<ThreadRecord | null>(null);
 	const [messages, setMessages] = useState<ThreadMessageLike[]>([]);
 	const [isRunning, setIsRunning] = useState(false);
+	const [tokenUsageStore] = useState(() => createTokenUsageStore());
 	const abortControllerRef = useRef<AbortController | null>(null);
-	const [pendingInterrupt, setPendingInterrupt] = useState<{
-		threadId: number;
-		assistantMsgId: string;
-		interruptData: Record<string, unknown>;
-	} | null>(null);
+	const recentCancelRequestedAtRef = useRef(0);
+	// One entry per paused subagent, in receipt order (which matches the
+	// backend's ``state.interrupts`` traversal — and therefore the order
+	// ``slice_decisions_by_tool_call`` consumes on resume). Cleared on submit
+	// or on a fresh user turn.
+	const [pendingInterrupts, setPendingInterrupts] = useState<PendingInterruptState[]>([]);
+	// Per-card staged decisions held until every pending card has submitted,
+	// at which point we batch them into one ``hitl-decision`` event in the
+	// same order as ``pendingInterrupts``. Using a ref because partial
+	// progress should not re-render the page.
+	const stagedDecisionsByInterruptIdRef = useRef<Map<string, HitlDecision[]>>(new Map());
+	const toolsWithUI = TOOLS_WITH_UI_ALL;
+	const setMessageDocumentsMap = useSetAtom(messageDocumentsMapAtom);
+
+	const persistAssistantErrorMessage = useCallback(
+		async ({
+			threadId,
+			assistantMsgId,
+			text,
+		}: {
+			threadId: number | null;
+			assistantMsgId: string;
+			text: string;
+		}) => {
+			setMessages((prev) =>
+				prev.map((m) =>
+					m.id === assistantMsgId
+						? {
+								...m,
+								content: [{ type: "text", text }],
+							}
+						: m
+				)
+			);
+
+			if (!threadId) return;
+
+			// Persist only temporary assistant placeholders to avoid duplicate rows
+			// when the message already has a database-backed ID.
+			if (!assistantMsgId.startsWith("msg-assistant-")) return;
+
+			try {
+				const savedMessage = await appendMessage(threadId, {
+					role: "assistant",
+					content: [{ type: "text", text }],
+				});
+				const newMsgId = `msg-${savedMessage.id}`;
+				tokenUsageStore.rename(assistantMsgId, newMsgId);
+				setMessages((prev) =>
+					prev.map((m) => (m.id === assistantMsgId ? { ...m, id: newMsgId } : m))
+				);
+			} catch (persistErr) {
+				console.error("Failed to persist assistant error message:", persistErr);
+			}
+		},
+		[tokenUsageStore]
+	);
+
+	// NOTE: ``persistUserTurn`` / ``persistAssistantTurn`` callbacks
+	// were removed in the SSE-based message ID handshake refactor.
+	// ``stream_new_chat`` and ``stream_resume_chat`` now persist both
+	// the user and assistant rows server-side via
+	// ``persist_user_turn`` / ``persist_assistant_shell`` and emit
+	// ``data-user-message-id`` / ``data-assistant-message-id`` SSE
+	// events; the consumers below rename the optimistic ids in real
+	// time. ``persistAssistantErrorMessage`` (above) is intentionally
+	// kept — it is the pre-stream-error fallback fired when the
+	// server NEVER accepted the request, and the BE has nothing to
+	// persist in that case.
 
 	// Get disabled tools from the tool toggle UI
 	const disabledTools = useAtomValue(disabledToolsAtom);
 
-	// Get mentioned document IDs from the composer (derived from @ mentions + sidebar selections)
+	// Get mentioned document IDs from the composer.
 	const mentionedDocumentIds = useAtomValue(mentionedDocumentIdsAtom);
 	const mentionedDocuments = useAtomValue(mentionedDocumentsAtom);
-	const sidebarDocuments = useAtomValue(sidebarSelectedDocumentsAtom);
+	const messageDocumentsMap = useAtomValue(messageDocumentsMapAtom);
 	const setMentionedDocuments = useSetAtom(mentionedDocumentsAtom);
-	const setSidebarDocuments = useSetAtom(sidebarSelectedDocumentsAtom);
-	const setMessageDocumentsMap = useSetAtom(messageDocumentsMapAtom);
 	const setCurrentThreadState = useSetAtom(currentThreadAtom);
+	const setPremiumAlertForThread = useSetAtom(setPremiumAlertForThreadAtom);
 	const setTargetCommentId = useSetAtom(setTargetCommentIdAtom);
 	const clearTargetCommentId = useSetAtom(clearTargetCommentIdAtom);
 	const closeReportPanel = useSetAtom(closeReportPanelAtom);
@@ -220,9 +357,24 @@ export default function NewChatPage() {
 	const updateChatTabTitle = useSetAtom(updateChatTabTitleAtom);
 	const removeChatTab = useSetAtom(removeChatTabAtom);
 	const setAgentCreatedDocuments = useSetAtom(agentCreatedDocumentsAtom);
+	const pendingUserImageUrls = useAtomValue(pendingUserImageDataUrlsAtom);
+	const setPendingUserImageUrls = useSetAtom(pendingUserImageDataUrlsAtom);
+	// Edit dialog state. Holds the message id being edited and
+	// the (already extracted) regenerate args so we can resume the edit
+	// after the user picks "revert all" / "continue" / "cancel".
+	const [editDialogState, setEditDialogState] = useState<{
+		fromMessageId: number;
+		userQuery: string | null;
+		userMessageContent: ThreadMessageLike["content"];
+		userImages: NewChatUserImagePayload[];
+		downstreamReversibleCount: number;
+		downstreamTotalCount: number;
+	} | null>(null);
 
 	// Get current user for author info in shared chats
 	const { data: currentUser } = useAtomValue(currentUserAtom);
+	const { data: agentFlags } = useAtomValue(agentFlagsAtom);
+	const localFilesystemEnabled = agentFlags?.enable_desktop_local_filesystem === true;
 
 	// Live collaboration: sync session state and messages via Zero
 	useChatSessionStateSync(threadId);
@@ -237,6 +389,11 @@ export default function NewChatPage() {
 				content: unknown;
 				author_id: string | null;
 				created_at: string;
+				// Forwarded so ``convertToThreadMessage`` can rebuild the
+				// ``metadata.custom.chatTurnId`` on the
+				// ``ThreadMessageLike``. Required by the inline Revert
+				// button's per-turn fallback.
+				turn_id?: string | null;
 			}[]
 		) => {
 			if (isRunning) {
@@ -251,7 +408,7 @@ export default function NewChatPage() {
 				const memberById = new Map(membersData?.map((m) => [m.user_id, m]) ?? []);
 				const prevById = new Map(prev.map((m) => [m.id, m]));
 
-				return syncedMessages.map((msg) => {
+				return reconcileInterruptedAssistantMessages(syncedMessages).map((msg) => {
 					const member = msg.author_id ? (memberById.get(msg.author_id) ?? null) : null;
 
 					// Preserve existing author info if member lookup fails (e.g., cloned chats)
@@ -269,6 +426,11 @@ export default function NewChatPage() {
 						created_at: msg.created_at,
 						author_display_name: member?.user_display_name ?? existingAuthor?.displayName ?? null,
 						author_avatar_url: member?.user_avatar_url ?? existingAuthor?.avatarUrl ?? null,
+						// Forward the per-turn correlation id so the
+						// inline Revert button's ``(chat_turn_id,
+						// tool_name, position)`` fallback survives the
+						// post-stream Zero re-sync.
+						turn_id: msg.turn_id ?? null,
 					});
 				});
 			});
@@ -285,6 +447,13 @@ export default function NewChatPage() {
 		return Number.isNaN(parsed) ? 0 : parsed;
 	}, [params.search_space_id]);
 
+	// Unified store for agent-action rows (the same react-query cache
+	// the agent-actions dialog, the inline Revert button, and the
+	// per-turn Revert button all read). Hydrates from
+	// ``GET /threads/{id}/actions`` and is updated incrementally by the
+	// SSE handlers + revert-batch results below — no atom side-channel.
+	const { items: agentActionItems } = useAgentActionsQuery(threadId);
+
 	// Extract chat_id from URL params
 	const urlChatId = useMemo(() => {
 		const id = params.chat_id;
@@ -297,6 +466,143 @@ export default function NewChatPage() {
 		return Number.isNaN(parsed) ? 0 : parsed;
 	}, [params.chat_id]);
 
+	const handleChatFailure = useCallback(
+		async ({
+			error,
+			flow,
+			threadId,
+			assistantMsgId,
+		}: {
+			error: unknown;
+			flow: ChatFlow;
+			threadId: number | null;
+			assistantMsgId: string;
+		}) => {
+			const normalized = classifyChatError({
+				error,
+				flow,
+				context: {
+					searchSpaceId,
+					threadId,
+				},
+			});
+
+			const logger =
+				normalized.severity === "error"
+					? console.error
+					: normalized.severity === "warn"
+						? console.warn
+						: console.info;
+			logger(`[NewChatPage] ${flow} ${normalized.kind}:`, error);
+
+			const telemetryPayload = {
+				flow,
+				kind: normalized.kind,
+				error_code: normalized.errorCode,
+				severity: normalized.severity,
+				is_expected: normalized.isExpected,
+				message: normalized.userMessage,
+			};
+			if (normalized.telemetryEvent === "chat_blocked") {
+				trackChatBlocked(searchSpaceId, threadId, telemetryPayload);
+			} else {
+				trackChatErrorDetailed(searchSpaceId, threadId, telemetryPayload);
+			}
+
+			if (normalized.channel === "silent") {
+				return;
+			}
+
+			if (normalized.channel === "pinned_inline") {
+				if (threadId) {
+					setPremiumAlertForThread({
+						threadId,
+						message: normalized.userMessage,
+						userId: currentUser?.id ?? null,
+					});
+				}
+				if (normalized.assistantMessage) {
+					await persistAssistantErrorMessage({
+						threadId,
+						assistantMsgId,
+						text: normalized.assistantMessage,
+					});
+				}
+				return;
+			}
+
+			toast.error(normalized.userMessage);
+		},
+		[currentUser?.id, persistAssistantErrorMessage, searchSpaceId, setPremiumAlertForThread]
+	);
+
+	const handleStreamTerminalError = useCallback(
+		async ({
+			error,
+			flow,
+			threadId,
+			assistantMsgId,
+			accepted,
+			onAbort,
+			onPreAcceptFailure,
+			onAcceptedStreamError,
+		}: {
+			error: unknown;
+			flow: ChatFlow;
+			threadId: number | null;
+			assistantMsgId: string;
+			accepted: boolean;
+			onAbort?: () => Promise<void>;
+			onPreAcceptFailure?: () => Promise<void>;
+			onAcceptedStreamError?: () => Promise<void>;
+		}) => {
+			if (error instanceof Error && error.name === "AbortError") {
+				await onAbort?.();
+				return;
+			}
+
+			if (!accepted) {
+				await onPreAcceptFailure?.();
+			} else {
+				await onAcceptedStreamError?.();
+			}
+
+			await handleChatFailure({
+				error: !accepted ? tagPreAcceptSendFailure(error) : error,
+				flow,
+				threadId,
+				assistantMsgId: accepted ? assistantMsgId : "no-persist-assistant",
+			});
+		},
+		[handleChatFailure]
+	);
+
+	const fetchWithTurnCancellingRetry = useCallback(async (runFetch: () => Promise<Response>) => {
+		const maxAttempts = 4;
+		for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
+			const response = await runFetch();
+			if (response.ok) {
+				return response;
+			}
+			const error = await toHttpResponseError(response);
+			const withMeta = error as Error & { errorCode?: string; retryAfterMs?: number };
+			const isTurnCancelling = withMeta.errorCode === "TURN_CANCELLING";
+			const isRecentThreadBusyAfterCancel =
+				withMeta.errorCode === "THREAD_BUSY" &&
+				Date.now() - recentCancelRequestedAtRef.current <= RECENT_CANCEL_WINDOW_MS;
+			if ((isTurnCancelling || isRecentThreadBusyAfterCancel) && attempt < maxAttempts) {
+				const waitMs = withMeta.retryAfterMs ?? computeFallbackTurnCancellingRetryDelay(attempt);
+				await sleep(waitMs);
+				continue;
+			}
+			throw error;
+		}
+
+		throw Object.assign(new Error("Turn cancellation retry limit exceeded"), {
+			errorCode: "TURN_CANCELLING",
+		});
+	}, []);
+
 	// Initialize thread and load messages
 	// For new chats (no urlChatId), we use lazy creation - thread is created on first message
 	const initializeThread = useCallback(async () => {
@@ -307,11 +613,13 @@ export default function NewChatPage() {
 		setThreadId(null);
 		setCurrentThread(null);
 		setMentionedDocuments([]);
-		setSidebarDocuments([]);
+		tokenUsageStore.clear();
 		setMessageDocumentsMap({});
 		clearPlanOwnerRegistry();
 		closeReportPanel();
 		closeEditorPanel();
+		// Note: agent-action data is keyed by threadId in react-query so
+		// switching threads naturally swaps caches; no explicit reset.
 
 		try {
 			if (urlChatId > 0) {
@@ -327,9 +635,17 @@ export default function NewChatPage() {
 				setCurrentThread(threadData);
 
 				if (messagesResponse.messages && messagesResponse.messages.length > 0) {
-					const loadedMessages = messagesResponse.messages.map(convertToThreadMessage);
+					const loadedMessages = reconcileInterruptedAssistantMessages(
+						messagesResponse.messages
+					).map(convertToThreadMessage);
 					setMessages(loadedMessages);
 
+					for (const msg of messagesResponse.messages) {
+						if (msg.token_usage) {
+							tokenUsageStore.set(`msg-${msg.id}`, msg.token_usage as TokenUsageData);
+						}
+					}
+
 					const restoredDocsMap: Record<string, MentionedDocumentInfo[]> = {};
 					for (const msg of messagesResponse.messages) {
 						if (msg.role === "user") {
@@ -369,11 +685,11 @@ export default function NewChatPage() {
 		urlChatId,
 		setMessageDocumentsMap,
 		setMentionedDocuments,
-		setSidebarDocuments,
 		closeReportPanel,
 		closeEditorPanel,
 		removeChatTab,
 		searchSpaceId,
+		tokenUsageStore,
 	]);
 
 	// Initialize on mount, and re-init when switching search spaces (even if urlChatId is the same)
@@ -458,12 +774,39 @@ export default function NewChatPage() {
 
 	// Cancel ongoing request
 	const cancelRun = useCallback(async () => {
+		if (threadId) {
+			const token = getBearerToken();
+			if (token) {
+				const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
+				try {
+					const response = await fetch(
+						`${backendUrl}/api/v1/threads/${threadId}/cancel-active-turn`,
+						{
+							method: "POST",
+							headers: {
+								Authorization: `Bearer ${token}`,
+							},
+						}
+					);
+					if (response.ok) {
+						const payload = (await response.json()) as {
+							error_code?: string;
+						};
+						if (payload.error_code === "TURN_CANCELLING") {
+							recentCancelRequestedAtRef.current = Date.now();
+						}
+					}
+				} catch (error) {
+					console.warn("[NewChatPage] Failed to signal cancel-active-turn:", error);
+				}
+			}
+		}
 		if (abortControllerRef.current) {
 			abortControllerRef.current.abort();
 			abortControllerRef.current = null;
 		}
 		setIsRunning(false);
-	}, []);
+	}, [threadId]);
 
 	// Handle new message from user
 	const onNew = useCallback(
@@ -475,18 +818,12 @@ export default function NewChatPage() {
 				abortControllerRef.current = null;
 			}
 
-			// Extract user query text from content parts
-			let userQuery = "";
-			for (const part of message.content) {
-				if (part.type === "text") {
-					userQuery += part.text;
-				}
-			}
+			const urlsSnapshot = [...pendingUserImageUrls];
+			const { userQuery, userImages } = extractUserTurnForNewChatApi(message, urlsSnapshot);
 
-			if (!userQuery.trim()) return;
+			if (!userQuery.trim() && userImages.length === 0) return;
 
-			// Check if podcast is already generating
-			if (isPodcastGenerating() && looksLikePodcastRequest(userQuery)) {
+			if (userQuery.trim() && isPodcastGenerating() && looksLikePodcastRequest(userQuery)) {
 				toast.warning("A podcast is already being generated.");
 				return;
 			}
@@ -521,13 +858,27 @@ export default function NewChatPage() {
 					);
 				} catch (error) {
 					console.error("[NewChatPage] Failed to create thread:", error);
-					toast.error("Failed to start chat. Please try again.");
+					await handleChatFailure({
+						error: tagPreAcceptSendFailure(error),
+						flow: "new",
+						threadId: currentThreadId,
+						assistantMsgId: "no-persist-assistant",
+					});
 					return;
 				}
 			}
 
-			// Add user message to state
-			const userMsgId = `msg-user-${Date.now()}`;
+			if (urlsSnapshot.length > 0) {
+				setPendingUserImageUrls((prev) => prev.filter((u) => !urlsSnapshot.includes(u)));
+			}
+
+			// Add user message to state. Mutable because the SSE
+			// ``data-user-message-id`` handler (below) renames this
+			// optimistic id to the canonical ``msg-{db_id}`` once the
+			// backend's ``persist_user_turn`` resolves the row, and
+			// the in-stream flush / interrupt closures need to see
+			// the post-rename value via this live ``let`` binding.
+			let userMsgId = `msg-user-${Date.now()}`;
 
 			// Always include author metadata so the UI layer can decide visibility
 			const authorMetadata = currentUser
@@ -541,10 +892,27 @@ export default function NewChatPage() {
 					}
 				: undefined;
 
+			const existingImageUrls = new Set(
+				message.content
+					.filter(
+						(p): p is { type: "image"; image: string } =>
+							typeof p === "object" &&
+							p !== null &&
+							"type" in p &&
+							p.type === "image" &&
+							"image" in p
+					)
+					.map((p) => p.image)
+			);
+			const extraImageParts = urlsSnapshot
+				.filter((u) => !existingImageUrls.has(u))
+				.map((image) => ({ type: "image" as const, image }));
+			const userDisplayContent = [...message.content, ...extraImageParts];
+
 			const userMessage: ThreadMessageLike = {
 				id: userMsgId,
 				role: "user",
-				content: message.content,
+				content: userDisplayContent,
 				createdAt: new Date(),
 				metadata: authorMetadata,
 			};
@@ -552,22 +920,32 @@ export default function NewChatPage() {
 
 			// Track message sent
 			trackChatMessageSent(searchSpaceId, currentThreadId, {
-				hasAttachments: false,
+				hasAttachments: userImages.length > 0,
 				hasMentionedDocuments:
 					mentionedDocumentIds.surfsense_doc_ids.length > 0 ||
-					mentionedDocumentIds.document_ids.length > 0,
+					mentionedDocumentIds.document_ids.length > 0 ||
+					mentionedDocumentIds.folder_ids.length > 0,
 				messageLength: userQuery.length,
 			});
 
-			// Combine @-mention chips + sidebar selections for display & persistence
+			// Collect unique mention chips for display & persistence.
+			// Dedup key is ``kind:document_type:id`` so a folder and a
+			// doc with the same integer id never collapse into one
+			// entry. The ``kind`` field is forwarded to the backend
+			// so the persisted ``mentioned-documents`` content part
+			// can render the correct chip type on reload.
 			const allMentionedDocs: MentionedDocumentInfo[] = [];
 			const seenDocKeys = new Set<string>();
-			for (const doc of [...mentionedDocuments, ...sidebarDocuments]) {
-				const key = `${doc.document_type}:${doc.id}`;
-				if (!seenDocKeys.has(key)) {
-					seenDocKeys.add(key);
-					allMentionedDocs.push({ id: doc.id, title: doc.title, document_type: doc.document_type });
-				}
+			for (const doc of mentionedDocuments) {
+				const key = `${doc.kind}:${doc.document_type}:${doc.id}`;
+				if (seenDocKeys.has(key)) continue;
+				seenDocKeys.add(key);
+				allMentionedDocs.push({
+					id: doc.id,
+					title: doc.title,
+					document_type: doc.document_type,
+					kind: doc.kind,
+				});
 			}
 
 			if (allMentionedDocs.length > 0) {
@@ -577,67 +955,40 @@ export default function NewChatPage() {
 				}));
 			}
 
-			const persistContent: unknown[] = [...message.content];
-
-			if (allMentionedDocs.length > 0) {
-				persistContent.push({
-					type: "mentioned-documents",
-					documents: allMentionedDocs,
-				});
-			}
-
-			appendMessage(currentThreadId, {
-				role: "user",
-				content: persistContent,
-			})
-				.then((savedMessage) => {
-					const newUserMsgId = `msg-${savedMessage.id}`;
-					setMessages((prev) =>
-						prev.map((m) => (m.id === userMsgId ? { ...m, id: newUserMsgId } : m))
-					);
-					setMessageDocumentsMap((prev) => {
-						const docs = prev[userMsgId];
-						if (!docs) return prev;
-						const { [userMsgId]: _, ...rest } = prev;
-						return { ...rest, [newUserMsgId]: docs };
-					});
-					if (isNewThread) {
-						queryClient.invalidateQueries({ queryKey: ["threads", String(searchSpaceId)] });
-					}
-				})
-				.catch((err) => console.error("Failed to persist user message:", err));
-
 			// Start streaming response
 			setIsRunning(true);
 			const controller = new AbortController();
 			abortControllerRef.current = controller;
 
-			// Prepare assistant message
-			const assistantMsgId = `msg-assistant-${Date.now()}`;
+			// Prepare assistant message. Mutable for the same reason
+			// as ``userMsgId`` above — the ``data-assistant-message-id``
+			// SSE handler reassigns this once
+			// ``persist_assistant_shell`` returns its canonical id.
+			let assistantMsgId = `msg-assistant-${Date.now()}`;
 			const currentThinkingSteps = new Map<string, ThinkingStepData>();
-			const batcher = new FrameBatchedUpdater();
-
 			const contentPartsState: ContentPartsState = {
 				contentParts: [],
 				currentTextPartIndex: -1,
+				currentReasoningPartIndex: -1,
 				toolCallIndices: new Map(),
 			};
-			const { contentParts, toolCallIndices } = contentPartsState;
+			const { contentParts } = contentPartsState;
 			let wasInterrupted = false;
-
-			// Add placeholder assistant message
-			setMessages((prev) => [
-				...prev,
-				{
-					id: assistantMsgId,
-					role: "assistant",
-					content: [{ type: "text", text: "" }],
-					createdAt: new Date(),
-				},
-			]);
+			let newAccepted = false;
+			let streamBatcher: FrameBatchedUpdater | null = null;
 
 			try {
 				const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
+				const selection = await getAgentFilesystemSelection(searchSpaceId, {
+					localFilesystemEnabled,
+				});
+				if (
+					selection.filesystem_mode === "desktop_local_folder" &&
+					(!selection.local_filesystem_mounts || selection.local_filesystem_mounts.length === 0)
+				) {
+					toast.error("Select a local folder before using Local Folder mode.");
+					return;
+				}
 
 				// Build message history for context
 				const messageHistory = messages
@@ -656,111 +1007,134 @@ export default function NewChatPage() {
 				// Get mentioned document IDs for context (separate fields for backend)
 				const hasDocumentIds = mentionedDocumentIds.document_ids.length > 0;
 				const hasSurfsenseDocIds = mentionedDocumentIds.surfsense_doc_ids.length > 0;
+				const hasFolderIds = mentionedDocumentIds.folder_ids.length > 0;
 
 				// Clear mentioned documents after capturing them
-				if (hasDocumentIds || hasSurfsenseDocIds) {
+				if (hasDocumentIds || hasSurfsenseDocIds || hasFolderIds) {
 					setMentionedDocuments([]);
-					setSidebarDocuments([]);
 				}
 
-				const response = await authenticatedFetch(`${backendUrl}/api/v1/new_chat`, {
-					method: "POST",
-					headers: {
-						"Content-Type": "application/json",
-					},
-					body: JSON.stringify({
-						chat_id: currentThreadId,
-						user_query: userQuery.trim(),
-						search_space_id: searchSpaceId,
-						messages: messageHistory,
-						mentioned_document_ids: hasDocumentIds ? mentionedDocumentIds.document_ids : undefined,
-						mentioned_surfsense_doc_ids: hasSurfsenseDocIds
-							? mentionedDocumentIds.surfsense_doc_ids
-							: undefined,
-						disabled_tools: disabledTools.length > 0 ? disabledTools : undefined,
-					}),
-					signal: controller.signal,
-				});
+				const response = await fetchWithTurnCancellingRetry(() =>
+					fetch(`${backendUrl}/api/v1/new_chat`, {
+						method: "POST",
+						headers: {
+							"Content-Type": "application/json",
+							Authorization: `Bearer ${token}`,
+						},
+						body: JSON.stringify({
+							chat_id: currentThreadId,
+							user_query: userQuery.trim(),
+							search_space_id: searchSpaceId,
+							filesystem_mode: selection.filesystem_mode,
+							client_platform: selection.client_platform,
+							local_filesystem_mounts: selection.local_filesystem_mounts,
+							messages: messageHistory,
+							mentioned_document_ids: hasDocumentIds
+								? mentionedDocumentIds.document_ids
+								: undefined,
+							mentioned_surfsense_doc_ids: hasSurfsenseDocIds
+								? mentionedDocumentIds.surfsense_doc_ids
+								: undefined,
+							mentioned_folder_ids: hasFolderIds ? mentionedDocumentIds.folder_ids : undefined,
+							// Full mention metadata (docs + folders, with
+							// ``kind`` discriminator) so the BE can embed a
+							// ``mentioned-documents`` ContentPart on the
+							// persisted user message (replaces the old FE-side
+							// injection in ``persistUserTurn``).
+							mentioned_documents:
+								allMentionedDocs.length > 0
+									? allMentionedDocs.map((d) => ({
+											id: d.id,
+											title: d.title,
+											document_type: d.document_type,
+											kind: d.kind,
+										}))
+									: undefined,
+							disabled_tools: disabledTools.length > 0 ? disabledTools : undefined,
+							...(userImages.length > 0 ? { user_images: userImages } : {}),
+						}),
+						signal: controller.signal,
+					})
+				);
 
 				if (!response.ok) {
-					throw new Error(`Backend error: ${response.status}`);
+					throw await toHttpResponseError(response);
 				}
+				newAccepted = true;
+				setMessages((prev) => [
+					...prev,
+					{
+						id: assistantMsgId,
+						role: "assistant",
+						content: [{ type: "text", text: "" }],
+						createdAt: new Date(),
+					},
+				]);
 
 				const flushMessages = () => {
 					setMessages((prev) =>
 						prev.map((m) =>
 							m.id === assistantMsgId
-								? { ...m, content: buildContentForUI(contentPartsState, TOOLS_WITH_UI) }
+								? { ...m, content: buildContentForUI(contentPartsState, toolsWithUI) }
 								: m
 						)
 					);
 				};
-				const scheduleFlush = () => batcher.schedule(flushMessages);
-
-				for await (const parsed of readSSEStream(response)) {
-					switch (parsed.type) {
-						case "text-delta":
-							appendText(contentPartsState, parsed.delta);
-							scheduleFlush();
-							break;
-
-						case "tool-input-start":
-							addToolCall(contentPartsState, TOOLS_WITH_UI, parsed.toolCallId, parsed.toolName, {});
-							batcher.flush();
-							break;
+				const { batcher, scheduleFlush, forceFlush } = createStreamFlushHelpers(flushMessages);
+				streamBatcher = batcher;
 
-						case "tool-input-available": {
-							if (toolCallIndices.has(parsed.toolCallId)) {
-								updateToolCall(contentPartsState, parsed.toolCallId, { args: parsed.input || {} });
-							} else {
-								addToolCall(
-									contentPartsState,
-									TOOLS_WITH_UI,
-									parsed.toolCallId,
-									parsed.toolName,
-									parsed.input || {}
-								);
-							}
-							batcher.flush();
-							break;
-						}
-
-						case "tool-output-available": {
-							updateToolCall(contentPartsState, parsed.toolCallId, { result: parsed.output });
-							markInterruptsCompleted(contentParts);
-							if (parsed.output?.status === "pending" && parsed.output?.podcast_id) {
-								const idx = toolCallIndices.get(parsed.toolCallId);
-								if (idx !== undefined) {
-									const part = contentParts[idx];
-									if (part?.type === "tool-call" && part.toolName === "generate_podcast") {
-										setActivePodcastTaskId(String(parsed.output.podcast_id));
-									}
+				await consumeSseEvents(response, async (parsed) => {
+					if (
+						processSharedStreamEvent(parsed, {
+							contentPartsState,
+							toolsWithUI,
+							currentThinkingSteps,
+							scheduleFlush,
+							forceFlush,
+							onTokenUsage: (data) => {
+								tokenUsageStore.set(assistantMsgId, data);
+							},
+							onTurnStatus: (data) => {
+								if (data.status === "cancelling") {
+									recentCancelRequestedAtRef.current = Date.now();
 								}
-							}
-							batcher.flush();
-							break;
-						}
-
-						case "data-thinking-step": {
-							const stepData = parsed.data as ThinkingStepData;
-							if (stepData?.id) {
-								currentThinkingSteps.set(stepData.id, stepData);
-								const didUpdate = updateThinkingSteps(contentPartsState, currentThinkingSteps);
-								if (didUpdate) {
-									scheduleFlush();
+							},
+							onToolOutputAvailable: (event, sharedCtx) => {
+								if (event.output?.status === "pending" && event.output?.podcast_id) {
+									const idx = sharedCtx.toolCallIndices.get(event.toolCallId);
+									if (idx !== undefined) {
+										const part = sharedCtx.contentPartsState.contentParts[idx];
+										if (part?.type === "tool-call" && part.toolName === "generate_podcast") {
+											setActivePodcastTaskId(String(event.output.podcast_id));
+										}
+									}
 								}
-							}
-							break;
-						}
-
+							},
+						})
+					) {
+						return;
+					}
+					switch (parsed.type) {
 						case "data-thread-title-update": {
 							const titleData = parsed.data as { threadId: number; title: string };
 							if (titleData?.title && titleData?.threadId === currentThreadId) {
 								setCurrentThread((prev) => (prev ? { ...prev, title: titleData.title } : prev));
 								updateChatTabTitle({ chatId: currentThreadId, title: titleData.title });
-								queryClient.invalidateQueries({
-									queryKey: ["threads", String(searchSpaceId)],
-								});
+								queryClient.setQueriesData<ThreadListResponse>(
+									{ queryKey: ["threads", String(searchSpaceId)] },
+									(old) => {
+										if (!old) return old;
+										const updateTitle = (list: ThreadListItem[]) =>
+											list.map((t) =>
+												t.id === titleData.threadId ? { ...t, title: titleData.title } : t
+											);
+										return {
+											...old,
+											threads: updateTitle(old.threads),
+											archived_threads: updateTitle(old.archived_threads),
+										};
+									}
+								);
 							}
 							break;
 						}
@@ -786,129 +1160,208 @@ export default function NewChatPage() {
 								name: string;
 								args: Record<string, unknown>;
 							}>;
-							for (const action of actionRequests) {
-								const existingIdx = Array.from(toolCallIndices.entries()).find(([, idx]) => {
-									const part = contentParts[idx];
-									return part?.type === "tool-call" && part.toolName === action.name;
-								});
-								if (existingIdx) {
-									updateToolCall(contentPartsState, existingIdx[0], {
-										result: { __interrupt__: true, ...interruptData },
-									});
-								} else {
-									const tcId = `interrupt-${action.name}`;
-									addToolCall(contentPartsState, TOOLS_WITH_UI, tcId, action.name, action.args);
-									updateToolCall(contentPartsState, tcId, {
-										result: { __interrupt__: true, ...interruptData },
-									});
+							const paired = pairBundleToolCallIds(
+								contentPartsState.toolCallIndices,
+								contentPartsState.contentParts,
+								actionRequests
+							);
+							const bundleToolCallIds: string[] = [];
+							for (let i = 0; i < actionRequests.length; i++) {
+								const action = actionRequests[i];
+								let targetTcId = paired[i];
+								if (!targetTcId) {
+									targetTcId = freshSynthToolCallId(
+										contentPartsState.toolCallIndices,
+										action.name,
+										i
+									);
+									addToolCall(
+										contentPartsState,
+										toolsWithUI,
+										targetTcId,
+										action.name,
+										action.args,
+										true
+									);
 								}
+								updateToolCall(contentPartsState, targetTcId, {
+									result: { __interrupt__: true, ...interruptData },
+								});
+								bundleToolCallIds.push(targetTcId);
 							}
 							setMessages((prev) =>
 								prev.map((m) =>
 									m.id === assistantMsgId
-										? { ...m, content: buildContentForUI(contentPartsState, TOOLS_WITH_UI) }
+										? { ...m, content: buildContentForUI(contentPartsState, toolsWithUI) }
 										: m
 								)
 							);
 							if (currentThreadId) {
-								setPendingInterrupt({
-									threadId: currentThreadId,
-									assistantMsgId,
-									interruptData,
-								});
+								// ``tool_call_id`` is stamped on the backend by
+								// ``checkpointed_subagent_middleware``. Without it we
+								// can't address the paused subagent on resume — skip
+								// rather than fabricate a synthetic key.
+								const interruptId = String(interruptData.tool_call_id ?? "");
+								if (interruptId) {
+									const incoming: PendingInterruptState = {
+										interruptId,
+										threadId: currentThreadId,
+										assistantMsgId,
+										interruptData,
+										bundleToolCallIds,
+									};
+									setPendingInterrupts((prev) => {
+										const without = prev.filter((p) => p.interruptId !== interruptId);
+										return [...without, incoming];
+									});
+								}
 							}
 							break;
 						}
 
-						case "error":
-							throw new Error(parsed.errorText || "Server error");
-					}
-				}
-
-				batcher.flush();
+						case "data-action-log": {
+							applyActionLogSse(queryClient, currentThreadId, searchSpaceId, parsed.data);
+							break;
+						}
 
-				// Skip persistence for interrupted messages -- handleResume will persist the final version
-				const finalContent = buildContentForPersistence(contentPartsState, TOOLS_WITH_UI);
-				if (contentParts.length > 0 && !wasInterrupted) {
-					try {
-						const savedMessage = await appendMessage(currentThreadId, {
-							role: "assistant",
-							content: finalContent,
-						});
+						case "data-action-log-updated": {
+							applyActionLogUpdatedSse(
+								queryClient,
+								currentThreadId,
+								parsed.data.id,
+								parsed.data.reversible
+							);
+							break;
+						}
 
-						// Update message ID from temporary to database ID so comments work immediately
-						const newMsgId = `msg-${savedMessage.id}`;
-						setMessages((prev) =>
-							prev.map((m) => (m.id === assistantMsgId ? { ...m, id: newMsgId } : m))
-						);
-
-						// Update pending interrupt with the new persisted message ID
-						setPendingInterrupt((prev) =>
-							prev && prev.assistantMsgId === assistantMsgId
-								? { ...prev, assistantMsgId: newMsgId }
-								: prev
-						);
-					} catch (err) {
-						console.error("Failed to persist assistant message:", err);
-					}
+						case "data-turn-info": {
+							const turnId = readStreamedChatTurnId(parsed.data);
+							if (turnId) {
+								setMessages((prev) =>
+									applyTurnIdToAssistantMessageList(prev, assistantMsgId, turnId)
+								);
+							}
+							break;
+						}
 
-					// Track successful response
-					trackChatResponseReceived(searchSpaceId, currentThreadId);
-				}
-			} catch (error) {
-				batcher.dispose();
-				if (error instanceof Error && error.name === "AbortError") {
-					// Request was cancelled by user - persist partial response if any content was received
-					const hasContent = contentParts.some(
-						(part) =>
-							(part.type === "text" && part.text.length > 0) ||
-							(part.type === "tool-call" && TOOLS_WITH_UI.has(part.toolName))
-					);
-					if (hasContent && currentThreadId) {
-						const partialContent = buildContentForPersistence(contentPartsState, TOOLS_WITH_UI);
-						try {
-							const savedMessage = await appendMessage(currentThreadId, {
-								role: "assistant",
-								content: partialContent,
-							});
+						case "data-user-message-id": {
+							// Server-authoritative user message id resolved by
+							// ``persist_user_turn`` (or recovered via ON CONFLICT).
+							// Rename the optimistic ``msg-user-XXX`` placeholder to
+							// the canonical ``msg-{db_id}`` so DB-id-gated UI
+							// (comments, edit-from-this-message) unlocks immediately,
+							// migrate the local mentioned-documents map, and reassign
+							// the closure variable so all downstream
+							// ``m.id === userMsgId`` checks see the new value.
+							const parsedMsg = readStreamedMessageId(parsed.data);
+							if (!parsedMsg) break;
+							const newUserMsgId = `msg-${parsedMsg.messageId}`;
+							const oldUserMsgId = userMsgId;
+							setMessages((prev) =>
+								prev.map((m) =>
+									m.id === oldUserMsgId
+										? mergeChatTurnIdIntoMessage({ ...m, id: newUserMsgId }, parsedMsg.turnId)
+										: m
+								)
+							);
+							if (allMentionedDocs.length > 0) {
+								setMessageDocumentsMap((prev) => {
+									if (!(oldUserMsgId in prev)) {
+										return { ...prev, [newUserMsgId]: allMentionedDocs };
+									}
+									const { [oldUserMsgId]: _removed, ...rest } = prev;
+									return { ...rest, [newUserMsgId]: allMentionedDocs };
+								});
+							}
+							userMsgId = newUserMsgId;
+							if (isNewThread) {
+								// First user-side row landed in ``new_chat_messages``;
+								// refresh the sidebar so the freshly-bumped
+								// ``thread.updated_at`` reorders this thread.
+								queryClient.invalidateQueries({
+									queryKey: ["threads", String(searchSpaceId)],
+								});
+							}
+							break;
+						}
 
-							// Update message ID from temporary to database ID
-							const newMsgId = `msg-${savedMessage.id}`;
+						case "data-assistant-message-id": {
+							// Server-authoritative assistant message id resolved
+							// by ``persist_assistant_shell``. Rename the optimistic
+							// id, migrate ``tokenUsageStore`` so any pending
+							// ``data-token-usage`` payload binds to the new id,
+							// remap any in-flight ``pendingInterrupts`` entries,
+							// and reassign the closure variable so the in-stream
+							// flush callback (line ~1074) keeps writing to the
+							// renamed message.
+							const parsedMsg = readStreamedMessageId(parsed.data);
+							if (!parsedMsg) break;
+							const newAssistantMsgId = `msg-${parsedMsg.messageId}`;
+							const oldAssistantMsgId = assistantMsgId;
+							tokenUsageStore.rename(oldAssistantMsgId, newAssistantMsgId);
 							setMessages((prev) =>
-								prev.map((m) => (m.id === assistantMsgId ? { ...m, id: newMsgId } : m))
+								prev.map((m) =>
+									m.id === oldAssistantMsgId
+										? mergeChatTurnIdIntoMessage({ ...m, id: newAssistantMsgId }, parsedMsg.turnId)
+										: m
+								)
 							);
-						} catch (err) {
-							console.error("Failed to persist partial assistant message:", err);
+							setPendingInterrupts((prev) =>
+								prev.map((p) =>
+									p.assistantMsgId === oldAssistantMsgId
+										? { ...p, assistantMsgId: newAssistantMsgId }
+										: p
+								)
+							);
+							assistantMsgId = newAssistantMsgId;
+							break;
 						}
 					}
-					return;
-				}
-				console.error("[NewChatPage] Chat error:", error);
+				});
 
-				// Track chat error
-				trackChatError(
-					searchSpaceId,
-					currentThreadId,
-					error instanceof Error ? error.message : "Unknown error"
-				);
+				batcher.flush();
 
-				toast.error("Failed to get response. Please try again.");
-				// Update assistant message with error
-				setMessages((prev) =>
-					prev.map((m) =>
-						m.id === assistantMsgId
-							? {
-									...m,
-									content: [
-										{
-											type: "text",
-											text: "Sorry, there was an error. Please try again.",
-										},
-									],
-								}
-							: m
-					)
-				);
+				// Server-authoritative persistence: ``stream_new_chat``
+				// already wrote the user row in ``persist_user_turn``
+				// (the FE renamed the optimistic id mid-stream via
+				// ``data-user-message-id``) and finalises the assistant
+				// row in ``finalize_assistant_turn`` from a shielded
+				// ``finally`` block. Nothing left for the FE to persist
+				// here — track the response and unblock the UI.
+				if (contentParts.length > 0 && !wasInterrupted) {
+					trackChatResponseReceived(searchSpaceId, currentThreadId);
+				}
+			} catch (error) {
+				streamBatcher?.dispose();
+				await handleStreamTerminalError({
+					error,
+					flow: "new",
+					threadId: currentThreadId,
+					assistantMsgId,
+					accepted: newAccepted,
+					// Server-side ``finalize_assistant_turn`` runs from a
+					// shielded ``anyio.CancelScope(shield=True)`` finally
+					// block, so partial content (incl. abort-mid-stream)
+					// is already persisted by the BE for the assistant
+					// row, and ``persist_user_turn`` ran before any LLM
+					// call. The FE's only remaining responsibility on
+					// abort / accepted-stream-error is to surface the
+					// error toast (handled by ``handleStreamTerminalError``
+					// itself).
+					onPreAcceptFailure: async () => {
+						// Pre-accept failure means the BE never accepted the
+						// request — no server-side persistence ran. Roll
+						// back the optimistic UI insertions we made before
+						// the fetch so the user message and any local
+						// mentioned-docs metadata don't linger.
+						setMessages((prev) => prev.filter((m) => m.id !== userMsgId));
+						setMessageDocumentsMap((prev) => {
+							if (!(userMsgId in prev)) return prev;
+							const { [userMsgId]: _removed, ...rest } = prev;
+							return rest;
+						});
+					},
+				});
 			} finally {
 				setIsRunning(false);
 				abortControllerRef.current = null;
@@ -920,15 +1373,20 @@ export default function NewChatPage() {
 			messages,
 			mentionedDocumentIds,
 			mentionedDocuments,
-			sidebarDocuments,
 			setMentionedDocuments,
-			setSidebarDocuments,
 			setMessageDocumentsMap,
 			setAgentCreatedDocuments,
 			queryClient,
 			currentUser,
+			localFilesystemEnabled,
 			disabledTools,
 			updateChatTabTitle,
+			tokenUsageStore,
+			pendingUserImageUrls,
+			setPendingUserImageUrls,
+			fetchWithTurnCancellingRetry,
+			handleStreamTerminalError,
+			handleChatFailure,
 		]
 	);
 
@@ -940,9 +1398,23 @@ export default function NewChatPage() {
 				edited_action?: { name: string; args: Record<string, unknown> };
 			}>
 		) => {
-			if (!pendingInterrupt) return;
-			const { threadId: resumeThreadId, assistantMsgId } = pendingInterrupt;
-			setPendingInterrupt(null);
+			if (pendingInterrupts.length === 0) return;
+			// All cards in this turn share the same threadId/assistantMsgId
+			// (they're siblings of one parent agent step), so reading from
+			// the first entry is safe.
+			const resumeThreadId = pendingInterrupts[0].threadId;
+			// Destructured separately as ``let`` so the SSE
+			// ``data-assistant-message-id`` handler (resume always
+			// allocates a fresh server-side row) can rename it to
+			// the canonical ``msg-{db_id}`` mid-stream.
+			let assistantMsgId = pendingInterrupts[0].assistantMsgId;
+			// Concatenate every card's tool-call ids in pendingInterrupts order;
+			// this matches the ``decisions`` ordering produced by
+			// ``handleApprovalSubmit`` and the backend slicer's traversal of
+			// ``state.interrupts``.
+			const allBundleToolCallIds = pendingInterrupts.flatMap((p) => p.bundleToolCallIds);
+			setPendingInterrupts([]);
+			stagedDecisionsByInterruptIdRef.current.clear();
 			setIsRunning(true);
 
 			const token = getBearerToken();
@@ -956,17 +1428,21 @@ export default function NewChatPage() {
 			abortControllerRef.current = controller;
 
 			const currentThinkingSteps = new Map<string, ThinkingStepData>();
-			const batcher = new FrameBatchedUpdater();
 
 			const contentPartsState: ContentPartsState = {
 				contentParts: [],
 				currentTextPartIndex: -1,
+				currentReasoningPartIndex: -1,
 				toolCallIndices: new Map(),
 			};
 			const { contentParts, toolCallIndices } = contentPartsState;
+			let resumeAccepted = false;
+			let streamBatcher: FrameBatchedUpdater | null = null;
 
 			const existingMsg = messages.find((m) => m.id === assistantMsgId);
 			if (existingMsg && Array.isArray(existingMsg.content)) {
+				// See ``ContentPartsState.suppressStepSeparators`` doc.
+				contentPartsState.suppressStepSeparators = true;
 				for (const part of existingMsg.content) {
 					if (typeof part === "object" && part !== null) {
 						const p = part as Record<string, unknown>;
@@ -981,6 +1457,19 @@ export default function NewChatPage() {
 								toolName: String(p.toolName),
 								args: (p.args as Record<string, unknown>) ?? {},
 								result: p.result as unknown,
+								// argsText: assistant-ui prefers it over
+								// JSON.stringify(args), so restoring it keeps
+								// pretty-printed JSON across reloads.
+								...(typeof p.argsText === "string" ? { argsText: p.argsText } : {}),
+								...(typeof p.langchainToolCallId === "string"
+									? { langchainToolCallId: p.langchainToolCallId }
+									: {}),
+								// metadata: spanId / thinkingStepId drive the
+								// timeline's step↔tool join. Dropping these
+								// here orphans every rehydrated tool-call.
+								...(p.metadata && typeof p.metadata === "object"
+									? { metadata: p.metadata as Record<string, unknown> }
+									: {}),
 							});
 							contentPartsState.currentTextPartIndex = -1;
 						} else if (p.type === "data-thinking-steps") {
@@ -997,191 +1486,246 @@ export default function NewChatPage() {
 				}
 			}
 
-			// Merge edited args if present to fix race condition
-			if (decisions.length > 0 && decisions[0].type === "edit" && decisions[0].edited_action) {
-				const editedAction = decisions[0].edited_action;
-				for (const part of contentParts) {
-					if (part.type === "tool-call" && part.toolName === editedAction.name) {
-						part.args = { ...part.args, ...editedAction.args };
-						break;
-					}
-				}
+			// Apply each decision to its own card by toolCallId so mixed
+			// bundles (approve/edit/reject) and multi-edit bundles do not
+			// collapse onto ``decisions[0]``. Cards outside the bundle are
+			// untouched. Mirrors the host ``hitl-decision`` handler.
+			const decisionByTcId = new Map<string, (typeof decisions)[number]>();
+			const tcIds = allBundleToolCallIds;
+			if (decisions.length === tcIds.length) {
+				for (let i = 0; i < tcIds.length; i++) decisionByTcId.set(tcIds[i], decisions[i]);
 			}
-
-			const decisionType = decisions[0]?.type as "approve" | "reject" | undefined;
-			if (decisionType) {
+			if (decisionByTcId.size > 0) {
 				for (const part of contentParts) {
-					if (
-						part.type === "tool-call" &&
-						typeof part.result === "object" &&
-						part.result !== null &&
-						"__interrupt__" in (part.result as Record<string, unknown>)
-					) {
-						part.result = {
-							...(part.result as Record<string, unknown>),
-							__decided__: decisionType,
-						};
+					if (part.type !== "tool-call") continue;
+					const tcId = part.toolCallId as string | undefined;
+					const d = tcId ? decisionByTcId.get(tcId) : undefined;
+					if (!d) continue;
+					if (typeof part.result !== "object" || part.result === null) continue;
+					if (!("__interrupt__" in (part.result as Record<string, unknown>))) continue;
+					const decided = d.type;
+					if (decided === "edit" && d.edited_action) {
+						const mergedArgs = { ...part.args, ...d.edited_action.args };
+						part.args = mergedArgs;
+						// Sync argsText so the rendered card shows the
+						// edited inputs (assistant-ui prefers it over
+						// JSON.stringify(args)).
+						part.argsText = JSON.stringify(mergedArgs, null, 2);
 					}
+					part.result = {
+						...(part.result as Record<string, unknown>),
+						__decided__: decided,
+					};
 				}
 			}
 
 			try {
 				const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
-				const response = await authenticatedFetch(`${backendUrl}/api/v1/threads/${resumeThreadId}/resume`, {
-					method: "POST",
-					headers: {
-						"Content-Type": "application/json",
-					},
-					body: JSON.stringify({
-						search_space_id: searchSpaceId,
-						decisions,
-					}),
-					signal: controller.signal,
+				const selection = await getAgentFilesystemSelection(searchSpaceId, {
+					localFilesystemEnabled,
 				});
+				const response = await fetchWithTurnCancellingRetry(() =>
+					fetch(`${backendUrl}/api/v1/threads/${resumeThreadId}/resume`, {
+						method: "POST",
+						headers: {
+							"Content-Type": "application/json",
+							Authorization: `Bearer ${token}`,
+						},
+						body: JSON.stringify({
+							search_space_id: searchSpaceId,
+							decisions,
+							disabled_tools: disabledTools.length > 0 ? disabledTools : undefined,
+							filesystem_mode: selection.filesystem_mode,
+							client_platform: selection.client_platform,
+							local_filesystem_mounts: selection.local_filesystem_mounts,
+						}),
+						signal: controller.signal,
+					})
+				);
 
 				if (!response.ok) {
-					throw new Error(`Backend error: ${response.status}`);
+					throw await toHttpResponseError(response);
 				}
+				resumeAccepted = true;
 
 				const flushMessages = () => {
 					setMessages((prev) =>
 						prev.map((m) =>
 							m.id === assistantMsgId
-								? { ...m, content: buildContentForUI(contentPartsState, TOOLS_WITH_UI) }
+								? { ...m, content: buildContentForUI(contentPartsState, toolsWithUI) }
 								: m
 						)
 					);
 				};
-				const scheduleFlush = () => batcher.schedule(flushMessages);
+				const { batcher, scheduleFlush, forceFlush } = createStreamFlushHelpers(flushMessages);
+				streamBatcher = batcher;
 
-				for await (const parsed of readSSEStream(response)) {
-					switch (parsed.type) {
-						case "text-delta":
-							appendText(contentPartsState, parsed.delta);
-							scheduleFlush();
-							break;
-
-						case "tool-input-start":
-							addToolCall(contentPartsState, TOOLS_WITH_UI, parsed.toolCallId, parsed.toolName, {});
-							batcher.flush();
-							break;
-
-						case "tool-input-available":
-							if (toolCallIndices.has(parsed.toolCallId)) {
-								updateToolCall(contentPartsState, parsed.toolCallId, {
-									args: parsed.input || {},
-								});
-							} else {
-								addToolCall(
-									contentPartsState,
-									TOOLS_WITH_UI,
-									parsed.toolCallId,
-									parsed.toolName,
-									parsed.input || {}
-								);
-							}
-							batcher.flush();
-							break;
-
-						case "tool-output-available":
-							updateToolCall(contentPartsState, parsed.toolCallId, {
-								result: parsed.output,
-							});
-							markInterruptsCompleted(contentParts);
-							batcher.flush();
-							break;
-
-						case "data-thinking-step": {
-							const stepData = parsed.data as ThinkingStepData;
-							if (stepData?.id) {
-								currentThinkingSteps.set(stepData.id, stepData);
-								const didUpdate = updateThinkingSteps(contentPartsState, currentThinkingSteps);
-								if (didUpdate) {
-									scheduleFlush();
+				await consumeSseEvents(response, async (parsed) => {
+					if (
+						processSharedStreamEvent(parsed, {
+							contentPartsState,
+							toolsWithUI,
+							currentThinkingSteps,
+							scheduleFlush,
+							forceFlush,
+							onTokenUsage: (data) => {
+								tokenUsageStore.set(assistantMsgId, data);
+							},
+							onTurnStatus: (data) => {
+								if (data.status === "cancelling") {
+									recentCancelRequestedAtRef.current = Date.now();
 								}
-							}
-							break;
-						}
-
+							},
+						})
+					) {
+						return;
+					}
+					switch (parsed.type) {
 						case "data-interrupt-request": {
 							const interruptData = parsed.data as Record<string, unknown>;
 							const actionRequests = (interruptData.action_requests ?? []) as Array<{
 								name: string;
 								args: Record<string, unknown>;
 							}>;
-							for (const action of actionRequests) {
-								const existingIdx = Array.from(toolCallIndices.entries()).find(([, idx]) => {
-									const part = contentParts[idx];
-									return part?.type === "tool-call" && part.toolName === action.name;
-								});
-								if (existingIdx) {
-									updateToolCall(contentPartsState, existingIdx[0], {
-										result: {
-											__interrupt__: true,
-											...interruptData,
-										},
-									});
-								} else {
-									const tcId = `interrupt-${action.name}`;
-									addToolCall(contentPartsState, TOOLS_WITH_UI, tcId, action.name, action.args);
-									updateToolCall(contentPartsState, tcId, {
-										result: {
-											__interrupt__: true,
-											...interruptData,
-										},
-									});
+							const paired = pairBundleToolCallIds(
+								contentPartsState.toolCallIndices,
+								contentPartsState.contentParts,
+								actionRequests
+							);
+							const bundleToolCallIds: string[] = [];
+							for (let i = 0; i < actionRequests.length; i++) {
+								const action = actionRequests[i];
+								let targetTcId = paired[i];
+								if (!targetTcId) {
+									targetTcId = freshSynthToolCallId(
+										contentPartsState.toolCallIndices,
+										action.name,
+										i
+									);
+									addToolCall(
+										contentPartsState,
+										toolsWithUI,
+										targetTcId,
+										action.name,
+										action.args,
+										true
+									);
 								}
+								updateToolCall(contentPartsState, targetTcId, {
+									result: { __interrupt__: true, ...interruptData },
+								});
+								bundleToolCallIds.push(targetTcId);
 							}
 							setMessages((prev) =>
 								prev.map((m) =>
 									m.id === assistantMsgId
-										? { ...m, content: buildContentForUI(contentPartsState, TOOLS_WITH_UI) }
+										? { ...m, content: buildContentForUI(contentPartsState, toolsWithUI) }
 										: m
 								)
 							);
-							setPendingInterrupt({
-								threadId: resumeThreadId,
-								assistantMsgId,
-								interruptData,
-							});
+							{
+								const interruptId = String(interruptData.tool_call_id ?? "");
+								if (interruptId) {
+									const incoming: PendingInterruptState = {
+										interruptId,
+										threadId: resumeThreadId,
+										assistantMsgId,
+										interruptData,
+										bundleToolCallIds,
+									};
+									setPendingInterrupts((prev) => {
+										const without = prev.filter((p) => p.interruptId !== interruptId);
+										return [...without, incoming];
+									});
+								}
+							}
+							break;
+						}
+
+						case "data-action-log": {
+							applyActionLogSse(queryClient, resumeThreadId, searchSpaceId, parsed.data);
+							break;
+						}
+
+						case "data-action-log-updated": {
+							applyActionLogUpdatedSse(
+								queryClient,
+								resumeThreadId,
+								parsed.data.id,
+								parsed.data.reversible
+							);
+							break;
+						}
+
+						case "data-turn-info": {
+							const turnId = readStreamedChatTurnId(parsed.data);
+							if (turnId) {
+								setMessages((prev) =>
+									applyTurnIdToAssistantMessageList(prev, assistantMsgId, turnId)
+								);
+							}
 							break;
 						}
 
-						case "error":
-							throw new Error(parsed.errorText || "Server error");
+						case "data-assistant-message-id": {
+							// Resume always allocates a fresh ``new_chat_messages``
+							// row anchored to a new ``turn_id`` (the original
+							// interrupted turn's row stays as-is), so this is a
+							// real id swap. Rename the optimistic placeholder to
+							// ``msg-{db_id}`` and reassign closure state. Resume
+							// does NOT emit ``data-user-message-id`` — the user
+							// row belongs to the original interrupted turn.
+							const parsedMsg = readStreamedMessageId(parsed.data);
+							if (!parsedMsg) break;
+							const newAssistantMsgId = `msg-${parsedMsg.messageId}`;
+							const oldAssistantMsgId = assistantMsgId;
+							tokenUsageStore.rename(oldAssistantMsgId, newAssistantMsgId);
+							setMessages((prev) =>
+								prev.map((m) =>
+									m.id === oldAssistantMsgId
+										? mergeChatTurnIdIntoMessage({ ...m, id: newAssistantMsgId }, parsedMsg.turnId)
+										: m
+								)
+							);
+							assistantMsgId = newAssistantMsgId;
+							break;
+						}
 					}
-				}
+				});
 
 				batcher.flush();
 
-				const finalContent = buildContentForPersistence(contentPartsState, TOOLS_WITH_UI);
-				if (contentParts.length > 0) {
-					try {
-						const savedMessage = await appendMessage(resumeThreadId, {
-							role: "assistant",
-							content: finalContent,
-						});
-						const newMsgId = `msg-${savedMessage.id}`;
-						setMessages((prev) =>
-							prev.map((m) => (m.id === assistantMsgId ? { ...m, id: newMsgId } : m))
-						);
-					} catch (err) {
-						console.error("Failed to persist resumed assistant message:", err);
-					}
-				}
+				// Server-authoritative persistence: ``stream_resume_chat``
+				// finalises the assistant row in
+				// ``finalize_assistant_turn`` from a shielded
+				// ``finally`` block (covers both happy-path and
+				// abort-mid-stream). FE has no remaining persistence
+				// work here.
 			} catch (error) {
-				batcher.dispose();
-				if (error instanceof Error && error.name === "AbortError") {
-					return;
-				}
-				console.error("[NewChatPage] Resume error:", error);
-				toast.error("Failed to resume. Please try again.");
+				streamBatcher?.dispose();
+				await handleStreamTerminalError({
+					error,
+					flow: "resume",
+					threadId: resumeThreadId,
+					assistantMsgId,
+					accepted: resumeAccepted,
+				});
 			} finally {
 				setIsRunning(false);
 				abortControllerRef.current = null;
 			}
 		},
-		[pendingInterrupt, messages, searchSpaceId]
+		[
+			pendingInterrupts,
+			messages,
+			searchSpaceId,
+			localFilesystemEnabled,
+			disabledTools,
+			queryClient,
+			tokenUsageStore,
+			fetchWithTurnCancellingRetry,
+			handleStreamTerminalError,
+		]
 	);
 
 	useEffect(() => {
@@ -1193,51 +1737,83 @@ export default function NewChatPage() {
 					edited_action?: { name: string; args: Record<string, unknown> };
 				}>;
 			};
-			if (detail?.decisions && pendingInterrupt) {
-				const decision = detail.decisions[0];
-				const decisionType = decision?.type as "approve" | "reject" | "edit";
-
-				setMessages((prev) =>
-					prev.map((m) => {
-						if (m.id !== pendingInterrupt.assistantMsgId) return m;
-						const parts = m.content as unknown as Array<Record<string, unknown>>;
-						const newContent = parts.map((part) => {
-							if (
-								part.type === "tool-call" &&
-								typeof part.result === "object" &&
-								part.result !== null &&
-								"__interrupt__" in part.result
-							) {
-								// For edit decisions, also update the displayed args
-								if (decisionType === "edit" && decision.edited_action) {
-									return {
-										...part,
-										args: decision.edited_action.args, // Update displayed args
-										result: {
-											...(part.result as Record<string, unknown>),
-											__decided__: decisionType,
-										},
-									};
-								}
-								return {
-									...part,
-									result: {
-										...(part.result as Record<string, unknown>),
-										__decided__: decisionType,
-									},
-								};
-							}
-							return part;
-						});
-						return { ...m, content: newContent as unknown as ThreadMessageLike["content"] };
-					})
+			if (!detail?.decisions || pendingInterrupts.length === 0) return;
+			const incoming = detail.decisions;
+			if (incoming.length === 0) return;
+			// Concatenated tool-call ids across every pending card, in the
+			// order ``handleApprovalSubmit`` produced ``incoming``.
+			const tcIds = pendingInterrupts.flatMap((p) => p.bundleToolCallIds);
+			const N = tcIds.length;
+
+			// Refuse rather than silently broadcast or drop. The orchestrator
+			// only fires ``hitl-decision`` once every pending card has
+			// submitted, so a count mismatch indicates a contract drift
+			// (and would later make the backend slicer raise).
+			if (incoming.length !== N) {
+				toast.error(
+					`Cannot resume: ${incoming.length} decision(s) submitted for ${N} pending actions.`
 				);
-				handleResume(detail.decisions);
+				return;
+			}
+
+			const byTcId = new Map<string, (typeof incoming)[number]>();
+			const submittedDecisions: typeof incoming = [];
+			for (let i = 0; i < tcIds.length; i++) {
+				const tcId = tcIds[i];
+				const decision = incoming[i];
+				if (tcId === undefined || decision === undefined) {
+					toast.error(
+						`Cannot resume: ${incoming.length} decision(s) submitted for ${N} pending actions.`
+					);
+					return;
+				}
+				byTcId.set(tcId, decision);
+				submittedDecisions.push(decision);
 			}
+
+			// All pending cards belong to the same assistant message, so a
+			// single content-update pass suffices.
+			const targetAssistantMsgId = pendingInterrupts[0].assistantMsgId;
+			setMessages((prev) =>
+				prev.map((m) => {
+					if (m.id !== targetAssistantMsgId) return m;
+					const parts = m.content as unknown as Array<Record<string, unknown>>;
+					const newContent = parts.map((part) => {
+						const tcId = part.toolCallId as string | undefined;
+						const d = tcId ? byTcId.get(tcId) : undefined;
+						if (!d || part.type !== "tool-call") return part;
+						if (typeof part.result !== "object" || part.result === null) return part;
+						if (!("__interrupt__" in (part.result as Record<string, unknown>))) return part;
+						const decided = d.type;
+						if (decided === "edit" && d.edited_action) {
+							return {
+								...part,
+								args: d.edited_action.args,
+								// Sync argsText so the card renders the edited
+								// inputs (assistant-ui prefers it over JSON.stringify).
+								argsText: JSON.stringify(d.edited_action.args, null, 2),
+								result: {
+									...(part.result as Record<string, unknown>),
+									__decided__: decided,
+								},
+							};
+						}
+						return {
+							...part,
+							result: {
+								...(part.result as Record<string, unknown>),
+								__decided__: decided,
+							},
+						};
+					});
+					return { ...m, content: newContent as unknown as ThreadMessageLike["content"] };
+				})
+			);
+			handleResume(submittedDecisions);
 		};
 		window.addEventListener("hitl-decision", handler);
 		return () => window.removeEventListener("hitl-decision", handler);
-	}, [handleResume, pendingInterrupt]);
+	}, [handleResume, pendingInterrupts]);
 
 	// Convert message (pass through since already in correct format)
 	const convertMessage = useCallback(
@@ -1249,15 +1825,31 @@ export default function NewChatPage() {
 	 * Handle regeneration (edit or reload) by calling the regenerate endpoint
 	 * and streaming the response. This rewinds the LangGraph checkpointer state.
 	 *
-	 * @param newUserQuery - The new user query (for edit). Pass null/undefined for reload.
+	 * @param newUserQuery - `null` = reload with same turn from the server. A string = edit
+	 *   (including an empty string when the edited turn is images-only); pass `editExtras` for images/content.
 	 */
 	const handleRegenerate = useCallback(
-		async (newUserQuery?: string | null) => {
+		async (
+			newUserQuery: string | null,
+			editExtras?: {
+				userMessageContent: ThreadMessageLike["content"];
+				userImages: NewChatUserImagePayload[];
+				sourceUserMessageId?: string;
+			},
+			editFromPosition?: {
+				/** Message id (numeric, parsed from ``msg-<n>``) to rewind to. */
+				fromMessageId?: number | null;
+				/** When true, revert reversible downstream actions before stream. */
+				revertActions?: boolean;
+			}
+		) => {
 			if (!threadId) {
 				toast.error("Cannot regenerate: no active chat thread");
 				return;
 			}
 
+			const isEdit = newUserQuery !== null;
+
 			// Abort any previous streaming request
 			if (abortControllerRef.current) {
 				abortControllerRef.current.abort();
@@ -1271,14 +1863,16 @@ export default function NewChatPage() {
 			}
 
 			// Extract the original user query BEFORE removing messages (for reload mode)
-			let userQueryToDisplay = newUserQuery;
+			let userQueryToDisplay: string | undefined;
 			let originalUserMessageContent: ThreadMessageLike["content"] | null = null;
 			let originalUserMessageMetadata: ThreadMessageLike["metadata"] | undefined;
+			let sourceUserMessageId: string | undefined = editExtras?.sourceUserMessageId;
 
-			if (!newUserQuery) {
+			if (!isEdit) {
 				// Reload mode - find and preserve the last user message content
 				const lastUserMessage = [...messages].reverse().find((m) => m.role === "user");
 				if (lastUserMessage) {
+					sourceUserMessageId = lastUserMessage.id;
 					originalUserMessageContent = lastUserMessage.content;
 					originalUserMessageMetadata = lastUserMessage.metadata;
 					// Extract text for the API request
@@ -1289,236 +1883,505 @@ export default function NewChatPage() {
 						}
 					}
 				}
+			} else {
+				userQueryToDisplay = newUserQuery;
 			}
 
-			// Remove the last two messages (user + assistant) from the UI immediately
-			// The backend will also delete them from the database
-			setMessages((prev) => {
-				if (prev.length >= 2) {
-					return prev.slice(0, -2);
-				}
-				return prev;
-			});
-
 			// Start streaming
 			setIsRunning(true);
 			const controller = new AbortController();
 			abortControllerRef.current = controller;
 
-			// Add placeholder user message if we have a new query (edit mode)
-			const userMsgId = `msg-user-${Date.now()}`;
-			const assistantMsgId = `msg-assistant-${Date.now()}`;
+			// Add placeholder user message if we have a new query (edit mode).
+			// Mutable for the same reason as in ``onNew`` — both ids are
+			// renamed mid-stream by the new ``data-user-message-id`` /
+			// ``data-assistant-message-id`` SSE handlers below.
+			let userMsgId = `msg-user-${Date.now()}`;
+			let assistantMsgId = `msg-assistant-${Date.now()}`;
 			const currentThinkingSteps = new Map<string, ThinkingStepData>();
 
 			const contentPartsState: ContentPartsState = {
 				contentParts: [],
 				currentTextPartIndex: -1,
+				currentReasoningPartIndex: -1,
 				toolCallIndices: new Map(),
 			};
-			const { contentParts, toolCallIndices } = contentPartsState;
-			const batcher = new FrameBatchedUpdater();
+			const { contentParts } = contentPartsState;
+			let regenerateAccepted = false;
+			let streamBatcher: FrameBatchedUpdater | null = null;
 
 			// Add placeholder messages to UI
 			// Always add back the user message (with new query for edit, or original content for reload)
 			const userMessage: ThreadMessageLike = {
 				id: userMsgId,
 				role: "user",
-				content: newUserQuery
-					? [{ type: "text", text: newUserQuery }]
+				content: isEdit
+					? (editExtras?.userMessageContent ?? [{ type: "text", text: newUserQuery ?? "" }])
 					: originalUserMessageContent || [{ type: "text", text: userQueryToDisplay || "" }],
 				createdAt: new Date(),
-				metadata: newUserQuery ? undefined : originalUserMessageMetadata,
+				metadata: isEdit ? undefined : originalUserMessageMetadata,
 			};
-			setMessages((prev) => [...prev, userMessage]);
-
-			// Add placeholder assistant message
-			setMessages((prev) => [
-				...prev,
-				{
-					id: assistantMsgId,
-					role: "assistant",
-					content: [{ type: "text", text: "" }],
-					createdAt: new Date(),
-				},
-			]);
-
+			const sourceMentionedDocs =
+				sourceUserMessageId && messageDocumentsMap[sourceUserMessageId]
+					? messageDocumentsMap[sourceUserMessageId]
+					: [];
 			try {
-				const response = await authenticatedFetch(getRegenerateUrl(threadId), {
-					method: "POST",
-					headers: {
-						"Content-Type": "application/json",
-					},
-					body: JSON.stringify({
-						search_space_id: searchSpaceId,
-						user_query: newUserQuery || null,
-						disabled_tools: disabledTools.length > 0 ? disabledTools : undefined,
-					}),
-					signal: controller.signal,
+				const selection = await getAgentFilesystemSelection(searchSpaceId, {
+					localFilesystemEnabled,
 				});
+				// Partition the source mentions back into doc/surfsense_doc/folder
+				// id buckets so the regenerate route can pass them to
+				// ``stream_new_chat`` and the priority middleware sees the
+				// same ``[USER-MENTIONED]`` priority entries the original
+				// turn did. Without this partition the regenerate flow
+				// silently dropped the agent's mention awareness — same
+				// architectural bug we fixed on the new-chat path.
+				const regenerateSurfsenseDocIds = sourceMentionedDocs
+					.filter((d) => d.kind === "doc" && d.document_type === "SURFSENSE_DOCS")
+					.map((d) => d.id);
+				const regenerateDocIds = sourceMentionedDocs
+					.filter((d) => d.kind === "doc" && d.document_type !== "SURFSENSE_DOCS")
+					.map((d) => d.id);
+				const regenerateFolderIds = sourceMentionedDocs
+					.filter((d) => d.kind === "folder")
+					.map((d) => d.id);
+
+				const requestBody: Record<string, unknown> = {
+					search_space_id: searchSpaceId,
+					user_query: newUserQuery,
+					disabled_tools: disabledTools.length > 0 ? disabledTools : undefined,
+					filesystem_mode: selection.filesystem_mode,
+					client_platform: selection.client_platform,
+					local_filesystem_mounts: selection.local_filesystem_mounts,
+					mentioned_document_ids: regenerateDocIds.length > 0 ? regenerateDocIds : undefined,
+					mentioned_surfsense_doc_ids:
+						regenerateSurfsenseDocIds.length > 0 ? regenerateSurfsenseDocIds : undefined,
+					mentioned_folder_ids: regenerateFolderIds.length > 0 ? regenerateFolderIds : undefined,
+					// Full mention metadata for the regenerate-specific
+					// source list. Only meaningful for edit (the BE only
+					// re-persists a user row when ``user_query`` is set);
+					// reload reuses the original turn's mentioned_documents.
+					mentioned_documents:
+						sourceMentionedDocs.length > 0
+							? sourceMentionedDocs.map((d) => ({
+									id: d.id,
+									title: d.title,
+									document_type: d.document_type,
+									kind: d.kind,
+								}))
+							: undefined,
+				};
+				if (isEdit) {
+					requestBody.user_images = editExtras?.userImages ?? [];
+				}
+				// Explicit edit-from-arbitrary-position. Only send
+				// ``from_message_id`` / ``revert_actions`` when the
+				// caller asked for them; otherwise the backend keeps the
+				// legacy "last 2 messages" behaviour for back-compat.
+				if (editFromPosition?.fromMessageId != null) {
+					requestBody.from_message_id = editFromPosition.fromMessageId;
+					if (editFromPosition.revertActions) {
+						requestBody.revert_actions = true;
+					}
+				}
+				const response = await fetchWithTurnCancellingRetry(() =>
+					fetch(getRegenerateUrl(threadId), {
+						method: "POST",
+						headers: {
+							"Content-Type": "application/json",
+							Authorization: `Bearer ${token}`,
+						},
+						body: JSON.stringify(requestBody),
+						signal: controller.signal,
+					})
+				);
 
 				if (!response.ok) {
-					throw new Error(`Backend error: ${response.status}`);
+					throw await toHttpResponseError(response);
+				}
+				regenerateAccepted = true;
+
+				// Only switch UI to regenerated placeholder messages after the backend accepts
+				// regenerate. This avoids local message loss when regenerate fails early (e.g. 400).
+				//
+				// When an explicit ``editFromPosition.fromMessageId`` is passed, slice from
+				// that message forward so edit-from-arbitrary-position drops every downstream
+				// message; otherwise fall back to the legacy "drop the last 2" behaviour.
+				setMessages((prev) => {
+					let base = prev;
+					if (editFromPosition?.fromMessageId != null) {
+						const targetId = `msg-${editFromPosition.fromMessageId}`;
+						const sliceIndex = prev.findIndex((m) => m.id === targetId);
+						if (sliceIndex >= 0) {
+							base = prev.slice(0, sliceIndex);
+						}
+					} else if (prev.length >= 2) {
+						base = prev.slice(0, -2);
+					}
+					return [
+						...base,
+						userMessage,
+						{
+							id: assistantMsgId,
+							role: "assistant",
+							content: [{ type: "text", text: "" }],
+							createdAt: new Date(),
+						},
+					];
+				});
+				if (sourceMentionedDocs.length > 0) {
+					setMessageDocumentsMap((prev) => ({
+						...prev,
+						[userMsgId]: sourceMentionedDocs,
+					}));
 				}
 
 				const flushMessages = () => {
 					setMessages((prev) =>
 						prev.map((m) =>
 							m.id === assistantMsgId
-								? { ...m, content: buildContentForUI(contentPartsState, TOOLS_WITH_UI) }
+								? { ...m, content: buildContentForUI(contentPartsState, toolsWithUI) }
 								: m
 						)
 					);
 				};
-				const scheduleFlush = () => batcher.schedule(flushMessages);
+				const { batcher, scheduleFlush, forceFlush } = createStreamFlushHelpers(flushMessages);
+				streamBatcher = batcher;
 
-				for await (const parsed of readSSEStream(response)) {
+				await consumeSseEvents(response, async (parsed) => {
+					if (
+						processSharedStreamEvent(parsed, {
+							contentPartsState,
+							toolsWithUI,
+							currentThinkingSteps,
+							scheduleFlush,
+							forceFlush,
+							onTokenUsage: (data) => {
+								tokenUsageStore.set(assistantMsgId, data);
+							},
+							onTurnStatus: (data) => {
+								if (data.status === "cancelling") {
+									recentCancelRequestedAtRef.current = Date.now();
+								}
+							},
+							onToolOutputAvailable: (event, sharedCtx) => {
+								if (event.output?.status === "pending" && event.output?.podcast_id) {
+									const idx = sharedCtx.toolCallIndices.get(event.toolCallId);
+									if (idx !== undefined) {
+										const part = sharedCtx.contentPartsState.contentParts[idx];
+										if (part?.type === "tool-call" && part.toolName === "generate_podcast") {
+											setActivePodcastTaskId(String(event.output.podcast_id));
+										}
+									}
+								}
+							},
+						})
+					) {
+						return;
+					}
 					switch (parsed.type) {
-						case "text-delta":
-							appendText(contentPartsState, parsed.delta);
-							scheduleFlush();
+						case "data-action-log": {
+							if (threadId !== null) {
+								applyActionLogSse(queryClient, threadId, searchSpaceId, parsed.data);
+							}
 							break;
+						}
 
-						case "tool-input-start":
-							addToolCall(contentPartsState, TOOLS_WITH_UI, parsed.toolCallId, parsed.toolName, {});
-							batcher.flush();
+						case "data-action-log-updated": {
+							if (threadId !== null) {
+								applyActionLogUpdatedSse(
+									queryClient,
+									threadId,
+									parsed.data.id,
+									parsed.data.reversible
+								);
+							}
 							break;
+						}
 
-						case "tool-input-available":
-							if (toolCallIndices.has(parsed.toolCallId)) {
-								updateToolCall(contentPartsState, parsed.toolCallId, { args: parsed.input || {} });
-							} else {
-								addToolCall(
-									contentPartsState,
-									TOOLS_WITH_UI,
-									parsed.toolCallId,
-									parsed.toolName,
-									parsed.input || {}
+						case "data-turn-info": {
+							const turnId = readStreamedChatTurnId(parsed.data);
+							if (turnId) {
+								setMessages((prev) =>
+									applyTurnIdToAssistantMessageList(prev, assistantMsgId, turnId)
 								);
 							}
-							batcher.flush();
 							break;
+						}
 
-						case "tool-output-available":
-							updateToolCall(contentPartsState, parsed.toolCallId, { result: parsed.output });
-							markInterruptsCompleted(contentParts);
-							if (parsed.output?.status === "pending" && parsed.output?.podcast_id) {
-								const idx = toolCallIndices.get(parsed.toolCallId);
-								if (idx !== undefined) {
-									const part = contentParts[idx];
-									if (part?.type === "tool-call" && part.toolName === "generate_podcast") {
-										setActivePodcastTaskId(String(parsed.output.podcast_id));
+						case "data-user-message-id": {
+							// Same role as in ``onNew`` but the regenerate-specific
+							// mention metadata (``sourceMentionedDocs``) is the
+							// list to migrate onto the canonical id key.
+							const parsedMsg = readStreamedMessageId(parsed.data);
+							if (!parsedMsg) break;
+							const newUserMsgId = `msg-${parsedMsg.messageId}`;
+							const oldUserMsgId = userMsgId;
+							setMessages((prev) =>
+								prev.map((m) =>
+									m.id === oldUserMsgId
+										? mergeChatTurnIdIntoMessage({ ...m, id: newUserMsgId }, parsedMsg.turnId)
+										: m
+								)
+							);
+							if (sourceMentionedDocs.length > 0) {
+								setMessageDocumentsMap((prev) => {
+									if (!(oldUserMsgId in prev)) {
+										return { ...prev, [newUserMsgId]: sourceMentionedDocs };
 									}
-								}
+									const { [oldUserMsgId]: _removed, ...rest } = prev;
+									return { ...rest, [newUserMsgId]: sourceMentionedDocs };
+								});
 							}
-							batcher.flush();
+							userMsgId = newUserMsgId;
+							break;
+						}
+
+						case "data-assistant-message-id": {
+							const parsedMsg = readStreamedMessageId(parsed.data);
+							if (!parsedMsg) break;
+							const newAssistantMsgId = `msg-${parsedMsg.messageId}`;
+							const oldAssistantMsgId = assistantMsgId;
+							tokenUsageStore.rename(oldAssistantMsgId, newAssistantMsgId);
+							setMessages((prev) =>
+								prev.map((m) =>
+									m.id === oldAssistantMsgId
+										? mergeChatTurnIdIntoMessage({ ...m, id: newAssistantMsgId }, parsedMsg.turnId)
+										: m
+								)
+							);
+							assistantMsgId = newAssistantMsgId;
 							break;
+						}
 
-						case "data-thinking-step": {
-							const stepData = parsed.data as ThinkingStepData;
-							if (stepData?.id) {
-								currentThinkingSteps.set(stepData.id, stepData);
-								const didUpdate = updateThinkingSteps(contentPartsState, currentThinkingSteps);
-								if (didUpdate) {
-									scheduleFlush();
+						case "data-revert-results": {
+							const summary = parsed.data;
+							// failureCount must include every "not undone" bucket
+							// (not_reversible, permission_denied, failed) so the
+							// toast's "X could not be rolled back" math matches
+							// the response invariant ``total === sum(counters)``.
+							// ``skipped`` rows are batch revert artefacts (revert
+							// rows themselves) and are not user-facing failures.
+							const failureCount =
+								summary.failed + summary.not_reversible + (summary.permission_denied ?? 0);
+							if (failureCount > 0) {
+								toast.warning(
+									`Pre-revert: ${summary.reverted}/${summary.total} undone, ${failureCount} could not be rolled back.`
+								);
+							} else if (summary.reverted > 0) {
+								toast.success(
+									summary.reverted === 1
+										? "Reverted 1 downstream action before regenerating."
+										: `Reverted ${summary.reverted} downstream actions before regenerating.`
+								);
+							}
+							if (threadId !== null) {
+								for (const r of summary.results) {
+									if (r.status === "reverted" || r.status === "already_reverted") {
+										markActionRevertedInCache(
+											queryClient,
+											threadId,
+											r.action_id,
+											r.new_action_id ?? null
+										);
+									}
 								}
 							}
 							break;
 						}
-
-						case "error":
-							throw new Error(parsed.errorText || "Server error");
 					}
-				}
+				});
 
 				batcher.flush();
 
-				// Persist messages after streaming completes
-				const finalContent = buildContentForPersistence(contentPartsState, TOOLS_WITH_UI);
+				// Server-authoritative persistence: ``stream_new_chat``
+				// (regenerate flow) wrote the user row in
+				// ``persist_user_turn`` and finalises the assistant row
+				// in ``finalize_assistant_turn`` from a shielded
+				// ``finally`` block (covers both happy-path and
+				// abort-mid-stream). FE only needs to track the
+				// successful response here.
 				if (contentParts.length > 0) {
-					try {
-						// Persist user message (for both edit and reload modes, since backend deleted it)
-						const userContentToPersist = newUserQuery
-							? [{ type: "text", text: newUserQuery }]
-							: originalUserMessageContent || [{ type: "text", text: userQueryToDisplay || "" }];
-
-						const savedUserMessage = await appendMessage(threadId, {
-							role: "user",
-							content: userContentToPersist,
-						});
-
-						// Update user message ID to database ID
-						const newUserMsgId = `msg-${savedUserMessage.id}`;
-						setMessages((prev) =>
-							prev.map((m) => (m.id === userMsgId ? { ...m, id: newUserMsgId } : m))
-						);
-
-						// Persist assistant message
-						const savedMessage = await appendMessage(threadId, {
-							role: "assistant",
-							content: finalContent,
-						});
-
-						// Update assistant message ID to database ID
-						const newMsgId = `msg-${savedMessage.id}`;
-						setMessages((prev) =>
-							prev.map((m) => (m.id === assistantMsgId ? { ...m, id: newMsgId } : m))
-						);
-
-						trackChatResponseReceived(searchSpaceId, threadId);
-					} catch (err) {
-						console.error("Failed to persist regenerated message:", err);
-					}
+					trackChatResponseReceived(searchSpaceId, threadId);
 				}
 			} catch (error) {
-				if (error instanceof Error && error.name === "AbortError") {
-					return;
-				}
-				batcher.dispose();
-				console.error("[NewChatPage] Regeneration error:", error);
-				trackChatError(
-					searchSpaceId,
+				streamBatcher?.dispose();
+				await handleStreamTerminalError({
+					error,
+					flow: "regenerate",
 					threadId,
-					error instanceof Error ? error.message : "Unknown error"
-				);
-				toast.error("Failed to regenerate response. Please try again.");
-				setMessages((prev) =>
-					prev.map((m) =>
-						m.id === assistantMsgId
-							? {
-									...m,
-									content: [{ type: "text", text: "Sorry, there was an error. Please try again." }],
-								}
-							: m
-					)
-				);
+					assistantMsgId,
+					accepted: regenerateAccepted,
+				});
 			} finally {
 				setIsRunning(false);
 				abortControllerRef.current = null;
 			}
 		},
-		[threadId, searchSpaceId, messages, disabledTools]
+		[
+			threadId,
+			searchSpaceId,
+			messages,
+			disabledTools,
+			localFilesystemEnabled,
+			messageDocumentsMap,
+			setMessageDocumentsMap,
+			queryClient,
+			tokenUsageStore,
+			fetchWithTurnCancellingRetry,
+			handleStreamTerminalError,
+		]
 	);
 
-	// Handle editing a message - truncates history and regenerates with new query
+	// Handle editing a message - truncates history and regenerates with new query.
+	//
+	// When ``message.sourceId`` is set (the assistant-ui way to say
+	// "this edit replaces an older message"), we pin
+	// ``from_message_id`` so the backend rewinds to the right LangGraph
+	// checkpoint instead of relying on the legacy "last 2 messages"
+	// rewind. We also count downstream reversible actions and prompt the
+	// user to revert / continue / cancel before regenerating.
 	const onEdit = useCallback(
 		async (message: AppendMessage) => {
-			// Extract the new user query from the message content
-			let newUserQuery = "";
-			for (const part of message.content) {
-				if (part.type === "text") {
-					newUserQuery += part.text;
+			const { userQuery, userImages } = extractUserTurnForNewChatApi(message, []);
+			const queryForApi = userQuery.trim();
+			if (!queryForApi && userImages.length === 0) {
+				toast.error("Cannot edit with empty message");
+				return;
+			}
+
+			const userMessageContent = message.content as unknown as ThreadMessageLike["content"];
+
+			// ``sourceId`` per @assistant-ui/core's ``AppendMessage`` is
+			// "the ID of the message that was edited". Parse the numeric
+			// suffix so we can map it back to a DB row.
+			const sourceId = (message as { sourceId?: string }).sourceId;
+			const fromMessageId =
+				sourceId && /^msg-\d+$/.test(sourceId)
+					? Number.parseInt(sourceId.replace(/^msg-/, ""), 10)
+					: null;
+
+			if (fromMessageId == null) {
+				// No source id (or non-DB id) — fall back to today's
+				// last-2 behaviour. The user gets the legacy edit flow.
+				await handleRegenerate(queryForApi, {
+					userMessageContent,
+					userImages,
+					sourceUserMessageId: sourceId,
+				});
+				return;
+			}
+
+			// Pre-flight: count reversible downstream actions so we can
+			// auto-skip the dialog for harmless edits.
+			//
+			// "Downstream" means messages AFTER the edited one. The
+			// previous slice ``messages.slice(editedIndex)`` included
+			// the edited message itself in both the total
+			// count and the reversibility scan (any actions on the
+			// edited turn would be double-counted). Slice from
+			// ``editedIndex + 1`` so the dialog text matches reality:
+			// "N downstream messages will be dropped".
+			const editedIndex = messages.findIndex((m) => m.id === `msg-${fromMessageId}`);
+			let downstreamReversibleCount = 0;
+			let downstreamTotalCount = 0;
+			if (editedIndex >= 0) {
+				const downstream = messages.slice(editedIndex + 1);
+				downstreamTotalCount = downstream.length;
+				const seenTurns = new Set<string>();
+				const downstreamTurnIds = new Set<string>();
+				for (const m of downstream) {
+					const meta = (m.metadata ?? {}) as { custom?: { chatTurnId?: string } };
+					const tid = meta.custom?.chatTurnId;
+					if (!tid || seenTurns.has(tid)) continue;
+					seenTurns.add(tid);
+					downstreamTurnIds.add(tid);
+				}
+				// Source of truth: the unified react-query cache. Every
+				// action whose ``chat_turn_id`` belongs to the slice we're
+				// about to drop counts toward the prompt.
+				for (const a of agentActionItems) {
+					if (!a.chat_turn_id || !downstreamTurnIds.has(a.chat_turn_id)) continue;
+					if (
+						a.reversible &&
+						(a.reverted_by_action_id === null || a.reverted_by_action_id === undefined) &&
+						!a.is_revert_action &&
+						(a.error === null || a.error === undefined)
+					) {
+						downstreamReversibleCount += 1;
+					}
 				}
 			}
 
-			if (!newUserQuery.trim()) {
-				toast.error("Cannot edit with empty message");
+			if (downstreamReversibleCount === 0) {
+				// Nothing to revert — submit silently.
+				await handleRegenerate(
+					queryForApi,
+					{ userMessageContent, userImages, sourceUserMessageId: sourceId },
+					{ fromMessageId, revertActions: false }
+				);
+				return;
+			}
+
+			setEditDialogState({
+				fromMessageId,
+				userQuery: queryForApi,
+				userMessageContent,
+				userImages,
+				downstreamReversibleCount,
+				downstreamTotalCount,
+			});
+		},
+		[handleRegenerate, messages, agentActionItems]
+	);
+
+	const handleApprovalSubmit = useCallback(
+		(interruptId: string, decisions: HitlDecision[]) => {
+			// Stage this card's decisions; only fire the resume once every
+			// pending card in the current turn has submitted, so the
+			// backend slicer sees a single concatenated decisions list
+			// whose total matches the parent state's pending action count.
+			stagedDecisionsByInterruptIdRef.current.set(interruptId, decisions);
+			if (stagedDecisionsByInterruptIdRef.current.size < pendingInterrupts.length) {
 				return;
 			}
+			const ordered: HitlDecision[] = [];
+			for (const pi of pendingInterrupts) {
+				const staged = stagedDecisionsByInterruptIdRef.current.get(pi.interruptId);
+				if (!staged) {
+					// Defensive: a missing entry means the staging map and
+					// the pending list disagreed for one cycle. Bail rather
+					// than dispatch a count-mismatched batch.
+					return;
+				}
+				ordered.push(...staged);
+			}
+			stagedDecisionsByInterruptIdRef.current.clear();
+			window.dispatchEvent(new CustomEvent("hitl-decision", { detail: { decisions: ordered } }));
+		},
+		[pendingInterrupts]
+	);
 
-			// Call regenerate with the new query
-			await handleRegenerate(newUserQuery.trim());
+	const handleEditDialogChoice = useCallback(
+		async (choice: EditMessageDialogChoice) => {
+			const pending = editDialogState;
+			if (!pending) return;
+			setEditDialogState(null);
+			if (choice === "cancel") return;
+			await handleRegenerate(
+				pending.userQuery,
+				{
+					userMessageContent: pending.userMessageContent,
+					userImages: pending.userImages,
+					sourceUserMessageId: `msg-${pending.fromMessageId}`,
+				},
+				{
+					fromMessageId: pending.fromMessageId,
+					revertActions: choice === "revert",
+				}
+			);
 		},
-		[handleRegenerate]
+		[editDialogState, handleRegenerate]
 	);
 
 	// Handle reloading/refreshing the last AI response
@@ -1550,31 +2413,47 @@ export default function NewChatPage() {
 		return (
 			<div className="flex h-full flex-col items-center justify-center gap-4">
 				<div className="text-destructive">Failed to load chat</div>
-				<button
+				<Button
 					type="button"
 					onClick={() => {
 						setIsInitializing(true);
 						initializeThread();
 					}}
-					className="rounded-md bg-primary px-4 py-2 text-primary-foreground hover:bg-primary/90"
 				>
 					Try Again
-				</button>
+				</Button>
 			</div>
 		);
 	}
 
 	return (
-		<AssistantRuntimeProvider runtime={runtime}>
-			<ThinkingStepsDataUI />
-			<div key={searchSpaceId} className="flex h-full overflow-hidden">
-				<div className="flex-1 flex flex-col min-w-0 overflow-hidden">
-					<Thread />
-				</div>
-				<MobileReportPanel />
-				<MobileEditorPanel />
-				<MobileHitlEditPanel />
-			</div>
-		</AssistantRuntimeProvider>
+		<TokenUsageProvider store={tokenUsageStore}>
+			<AssistantRuntimeProvider runtime={runtime}>
+				<TimelineDataUI />
+				<StepSeparatorDataUI />
+				<PendingInterruptProvider
+					pendingInterrupts={pendingInterrupts}
+					onSubmit={handleApprovalSubmit}
+				>
+					<div key={searchSpaceId} className="flex h-full overflow-hidden">
+						<div className="flex-1 flex flex-col min-w-0 overflow-hidden">
+							<Thread />
+						</div>
+						<MobileReportPanel />
+						<MobileEditorPanel />
+						<MobileHitlEditPanel />
+					</div>
+				</PendingInterruptProvider>
+				<EditMessageDialog
+					open={editDialogState !== null}
+					onOpenChange={(open) => {
+						if (!open) setEditDialogState(null);
+					}}
+					downstreamReversibleCount={editDialogState?.downstreamReversibleCount ?? 0}
+					downstreamTotalCount={editDialogState?.downstreamTotalCount ?? 0}
+					onChoose={handleEditDialogChoice}
+				/>
+			</AssistantRuntimeProvider>
+		</TokenUsageProvider>
 	);
 }
diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx
index 6eb9223ca3..1086716629 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/loading.tsx
@@ -2,42 +2,59 @@ import { Skeleton } from "@/components/ui/skeleton";
 
 export default function Loading() {
 	return (
-		<div className="flex h-full flex-col bg-main-panel px-4">
-			<div className="mx-auto w-full max-w-[44rem] flex flex-1 flex-col gap-6 py-8">
-				{/* User message */}
-				<div className="flex justify-end">
-					<Skeleton className="h-12 w-56 rounded-2xl" />
-				</div>
+		<div
+			className="aui-root aui-thread-root @container flex h-full min-h-0 flex-col bg-main-panel"
+			style={{
+				["--thread-max-width" as string]: "42rem",
+			}}
+		>
+			<div
+				className="aui-thread-viewport relative flex flex-1 min-h-0 flex-col overflow-y-auto px-4 scroll-smooth"
+				style={{ scrollbarGutter: "stable" }}
+			>
+				<div
+					aria-hidden
+					className="aui-chat-viewport-top-fade pointer-events-none sticky top-0 z-10 -mx-4 h-2 shrink-0 bg-gradient-to-b from-main-panel from-20% to-transparent"
+				/>
+				<div className="mx-auto w-full max-w-(--thread-max-width) flex flex-1 flex-col gap-6 py-8">
+					{/* User message */}
+					<div className="flex justify-end">
+						<Skeleton className="h-12 w-56 rounded-2xl" />
+					</div>
 
-				{/* Assistant message */}
-				<div className="flex flex-col gap-2">
-					<Skeleton className="h-4 w-full" />
-					<Skeleton className="h-4 w-[85%]" />
-					<Skeleton className="h-18 w-[40%]" />
-				</div>
+					{/* Assistant message */}
+					<div className="flex flex-col gap-2">
+						<Skeleton className="h-4 w-full" />
+						<Skeleton className="h-4 w-[85%]" />
+						<Skeleton className="h-18 w-[40%]" />
+					</div>
 
-				{/* User message */}
-				<div className="flex gap-2 justify-end">
-					<Skeleton className="h-12 w-72 rounded-2xl" />
-				</div>
+					{/* User message */}
+					<div className="flex gap-2 justify-end">
+						<Skeleton className="h-12 w-72 rounded-2xl" />
+					</div>
 
-				{/* Assistant message */}
-				<div className="flex flex-col gap-2">
-					<Skeleton className="h-10 w-[30%]" />
-					<Skeleton className="h-4 w-[90%]" />
-					<Skeleton className="h-6 w-[60%]" />
-				</div>
+					{/* Assistant message */}
+					<div className="flex flex-col gap-2">
+						<Skeleton className="h-10 w-[30%]" />
+						<Skeleton className="h-4 w-[90%]" />
+						<Skeleton className="h-6 w-[60%]" />
+					</div>
 
-				{/* User message */}
-				<div className="flex gap-2 justify-end">
-					<Skeleton className="h-12 w-96 rounded-2xl" />
+					{/* User message */}
+					<div className="flex gap-2 justify-end">
+						<Skeleton className="h-12 w-96 rounded-2xl" />
+					</div>
 				</div>
-			</div>
 
-			{/* Input bar */}
-			<div className="sticky bottom-0 pb-6 bg-main-panel">
-				<div className="mx-auto w-full max-w-[44rem]">
-					<Skeleton className="h-24 w-full rounded-2xl" />
+				{/* Input bar */}
+				<div
+					className="aui-chat-composer-footer sticky bottom-0 z-20 -mx-4 mt-auto flex flex-col items-stretch bg-gradient-to-t from-main-panel from-60% to-transparent px-4 pt-6"
+					style={{ paddingBottom: "max(0.5rem, env(safe-area-inset-bottom))" }}
+				>
+					<div className="aui-chat-composer-area relative mx-auto flex w-full max-w-(--thread-max-width) flex-col gap-3 overflow-visible">
+						<Skeleton className="h-28 w-full rounded-3xl" />
+					</div>
 				</div>
 			</div>
 		</div>
diff --git a/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx
index 4dba3bbb67..3e3f41deb1 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/onboard/page.tsx
@@ -151,7 +151,7 @@ export default function OnboardPage() {
 	}
 
 	return (
-		<div className="h-screen flex flex-col items-center p-4 bg-background dark:bg-neutral-900 select-none overflow-hidden">
+		<div className="h-screen flex flex-col items-center p-4 bg-main-panel select-none overflow-hidden">
 			<div className="w-full max-w-lg flex flex-col min-h-0 h-full gap-6 py-8">
 				{/* Header */}
 				<div className="text-center space-y-3 shrink-0">
@@ -165,7 +165,7 @@ export default function OnboardPage() {
 				</div>
 
 				{/* Form card */}
-				<div className="rounded-xl border bg-background dark:bg-neutral-900 flex-1 min-h-0 overflow-y-auto px-6 py-6">
+				<div className="rounded-xl border bg-main-panel flex-1 min-h-0 overflow-y-auto px-6 py-6">
 					<LLMConfigForm
 						searchSpaceId={searchSpaceId}
 						onSubmit={handleSubmit}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/purchase-cancel/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/purchase-cancel/page.tsx
index 0e5224700b..174c5681d7 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/purchase-cancel/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/purchase-cancel/page.tsx
@@ -23,16 +23,14 @@ export default function PurchaseCancelPage() {
 				<CardHeader className="text-center">
 					<CircleSlash2 className="mx-auto h-10 w-10 text-muted-foreground" />
 					<CardTitle className="text-2xl">Checkout canceled</CardTitle>
-					<CardDescription>
-						No charge was made and your current pages are unchanged.
-					</CardDescription>
+					<CardDescription>No charge was made and your account is unchanged.</CardDescription>
 				</CardHeader>
 				<CardContent className="text-center text-sm text-muted-foreground">
 					You can return to the pricing options and try again whenever you&apos;re ready.
 				</CardContent>
 				<CardFooter className="flex flex-col gap-2 sm:flex-row">
 					<Button asChild className="w-full">
-						<Link href={`/dashboard/${searchSpaceId}/more-pages`}>Back to Buy Pages</Link>
+						<Link href={`/dashboard/${searchSpaceId}/buy-more`}>Back to Pricing</Link>
 					</Button>
 					<Button asChild variant="outline" className="w-full">
 						<Link href={`/dashboard/${searchSpaceId}/new-chat`}>Back to Dashboard</Link>
diff --git a/surfsense_web/app/dashboard/[search_space_id]/purchase-success/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/purchase-success/page.tsx
index 82c6f8c74b..8eaec3e5ac 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/purchase-success/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/purchase-success/page.tsx
@@ -1,11 +1,9 @@
 "use client";
 
-import { useQueryClient } from "@tanstack/react-query";
-import { CheckCircle2 } from "lucide-react";
+import { AlertCircle, CheckCircle2, Loader2 } from "lucide-react";
 import Link from "next/link";
-import { useParams } from "next/navigation";
-import { useEffect } from "react";
-import { USER_QUERY_KEY } from "@/atoms/user/user-query.atoms";
+import { useParams, useSearchParams } from "next/navigation";
+import { useEffect, useRef, useState } from "react";
 import { Button } from "@/components/ui/button";
 import {
 	Card,
@@ -15,40 +13,156 @@ import {
 	CardHeader,
 	CardTitle,
 } from "@/components/ui/card";
+import type { FinalizeCheckoutResponse } from "@/contracts/types/stripe.types";
+import { stripeApiService } from "@/lib/apis/stripe-api.service";
+
+type FinalizeState =
+	| { kind: "loading" }
+	| { kind: "completed"; data: FinalizeCheckoutResponse }
+	| { kind: "pending"; data: FinalizeCheckoutResponse }
+	| { kind: "still_pending"; data: FinalizeCheckoutResponse }
+	| { kind: "failed"; data: FinalizeCheckoutResponse }
+	| { kind: "error"; message: string }
+	| { kind: "no_session" };
+
+const POLL_INTERVAL_MS = 2000;
+const MAX_POLL_ATTEMPTS = 15; // ~30s total before falling back to the still_pending state
 
 export default function PurchaseSuccessPage() {
 	const params = useParams();
-	const queryClient = useQueryClient();
+	const searchParams = useSearchParams();
 	const searchSpaceId = String(params.search_space_id ?? "");
+	const sessionId = searchParams.get("session_id");
+
+	const [state, setState] = useState<FinalizeState>(
+		sessionId ? { kind: "loading" } : { kind: "no_session" }
+	);
+	// Tracks active polling so component unmount cancels it
+	const cancelledRef = useRef(false);
 
 	useEffect(() => {
-		void queryClient.invalidateQueries({ queryKey: USER_QUERY_KEY });
-	}, [queryClient]);
+		if (!sessionId) return;
+
+		cancelledRef.current = false;
+
+		const poll = async (attempt: number): Promise<void> => {
+			if (cancelledRef.current) return;
+			try {
+				const data = await stripeApiService.finalizeCheckout(sessionId);
+				if (cancelledRef.current) return;
+
+				if (data.status === "completed") {
+					setState({ kind: "completed", data });
+					return;
+				}
+				if (data.status === "failed") {
+					setState({ kind: "failed", data });
+					return;
+				}
+
+				// Status is "pending" - either the user paid via async
+				// payment method (Klarna, ACH) or webhook + finalize both
+				// raced and lost. Keep polling up to MAX_POLL_ATTEMPTS,
+				// then fall back to a friendlier message that explains
+				// fulfilment may complete asynchronously.
+				if (attempt < MAX_POLL_ATTEMPTS) {
+					setState({ kind: "pending", data });
+					setTimeout(() => poll(attempt + 1), POLL_INTERVAL_MS);
+				} else {
+					setState({ kind: "still_pending", data });
+				}
+			} catch (err) {
+				if (cancelledRef.current) return;
+				const message = err instanceof Error ? err.message : "Unable to finalize checkout.";
+				setState({ kind: "error", message });
+			}
+		};
+
+		void poll(1);
+
+		return () => {
+			cancelledRef.current = true;
+		};
+	}, [sessionId]);
 
 	return (
 		<div className="flex min-h-[calc(100vh-64px)] items-center justify-center px-4 py-8">
 			<Card className="w-full max-w-lg">
 				<CardHeader className="text-center">
-					<CheckCircle2 className="mx-auto h-10 w-10 text-emerald-500" />
-					<CardTitle className="text-2xl">Purchase complete</CardTitle>
+					{state.kind === "loading" || state.kind === "pending" ? (
+						<Loader2 className="mx-auto h-10 w-10 animate-spin text-primary" />
+					) : state.kind === "completed" ? (
+						<CheckCircle2 className="mx-auto h-10 w-10 text-emerald-500" />
+					) : (
+						<AlertCircle className="mx-auto h-10 w-10 text-amber-500" />
+					)}
+					<CardTitle className="text-2xl">
+						{state.kind === "loading" && "Confirming payment…"}
+						{state.kind === "pending" && "Processing your payment…"}
+						{state.kind === "still_pending" && "Payment still processing"}
+						{state.kind === "completed" && "Purchase complete"}
+						{state.kind === "failed" && "Purchase failed"}
+						{state.kind === "error" && "Couldn't confirm payment"}
+						{state.kind === "no_session" && "Purchase complete"}
+					</CardTitle>
 					<CardDescription>
-						Your additional pages are being applied to your account now.
+						{state.kind === "loading" && "We're verifying your payment with Stripe."}
+						{state.kind === "pending" &&
+							"Your bank is taking a moment to confirm. This usually takes 5–30 seconds."}
+						{state.kind === "still_pending" &&
+							"Your payment is still being processed by your bank. We'll apply your purchase as soon as it clears — usually within a few minutes. You can safely close this page."}
+						{state.kind === "completed" &&
+							(state.data.purchase_type === "page_packs"
+								? `Added ${formatNumber(state.data.pages_granted ?? 0)} pages to your account.`
+								: `Added ${formatCredit(state.data.premium_credit_micros_granted ?? 0)} of premium credit to your account.`)}
+						{state.kind === "failed" &&
+							"Stripe reported the checkout as failed or expired. Your card was not charged."}
+						{state.kind === "error" &&
+							"Don't worry — if your card was charged, your purchase will still apply within a minute or two."}
+						{state.kind === "no_session" && "Your purchase is being applied to your account."}
 					</CardDescription>
 				</CardHeader>
 				<CardContent className="space-y-3 text-center">
-					<p className="text-sm text-muted-foreground">
-						Your sidebar usage meter should refresh automatically in a moment.
-					</p>
+					{state.kind === "completed" && state.data.purchase_type === "page_packs" && (
+						<p className="text-sm text-muted-foreground">
+							New balance: {formatNumber(state.data.pages_limit ?? 0)} total pages
+							{typeof state.data.pages_used === "number"
+								? ` (${formatNumber((state.data.pages_limit ?? 0) - state.data.pages_used)} remaining)`
+								: ""}
+						</p>
+					)}
+					{state.kind === "completed" && state.data.purchase_type === "premium_tokens" && (
+						<p className="text-sm text-muted-foreground">
+							New premium credit balance:{" "}
+							{formatCredit(state.data.premium_credit_micros_limit ?? 0)}
+						</p>
+					)}
+					{state.kind === "error" && (
+						<p className="text-sm text-muted-foreground">{state.message}</p>
+					)}
 				</CardContent>
 				<CardFooter className="flex flex-col gap-2">
 					<Button asChild className="w-full">
 						<Link href={`/dashboard/${searchSpaceId}/new-chat`}>Back to Dashboard</Link>
 					</Button>
 					<Button asChild variant="outline" className="w-full">
-						<Link href={`/dashboard/${searchSpaceId}/more-pages`}>Buy More Pages</Link>
+						<Link href={`/dashboard/${searchSpaceId}/buy-more`}>Buy More</Link>
 					</Button>
 				</CardFooter>
 			</Card>
 		</div>
 	);
 }
+
+function formatNumber(n: number): string {
+	return new Intl.NumberFormat("en-US").format(n);
+}
+
+function formatCredit(micros: number): string {
+	const dollars = micros / 1_000_000;
+	return new Intl.NumberFormat("en-US", {
+		style: "currency",
+		currency: "USD",
+		maximumFractionDigits: 2,
+	}).format(dollars);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/general/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/general/page.tsx
new file mode 100644
index 0000000000..09124b3b03
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/general/page.tsx
@@ -0,0 +1,6 @@
+import { GeneralSettingsManager } from "@/components/settings/general-settings-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <GeneralSettingsManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/image-models/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/image-models/page.tsx
new file mode 100644
index 0000000000..b300f80784
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/image-models/page.tsx
@@ -0,0 +1,6 @@
+import { ImageModelManager } from "@/components/settings/image-model-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <ImageModelManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/layout-shell.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/layout-shell.tsx
new file mode 100644
index 0000000000..96d77d1311
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/layout-shell.tsx
@@ -0,0 +1,179 @@
+"use client";
+
+import {
+	BookText,
+	Bot,
+	Brain,
+	CircleUser,
+	Earth,
+	ImageIcon,
+	ListChecks,
+	ScanEye,
+	UserKey,
+} from "lucide-react";
+import Link from "next/link";
+import { useSelectedLayoutSegment } from "next/navigation";
+import { useTranslations } from "next-intl";
+import type React from "react";
+import { useCallback, useMemo, useState } from "react";
+import { Separator } from "@/components/ui/separator";
+import { cn } from "@/lib/utils";
+
+export type SearchSpaceSettingsTab =
+	| "general"
+	| "roles"
+	| "models"
+	| "image-models"
+	| "vision-models"
+	| "team-roles"
+	| "prompts"
+	| "team-memory"
+	| "public-links";
+
+const DEFAULT_TAB: SearchSpaceSettingsTab = "general";
+
+interface SearchSpaceSettingsLayoutShellProps {
+	searchSpaceId: string;
+	children: React.ReactNode;
+}
+
+export function SearchSpaceSettingsLayoutShell({
+	searchSpaceId,
+	children,
+}: SearchSpaceSettingsLayoutShellProps) {
+	const t = useTranslations("searchSpaceSettings");
+	const segment = useSelectedLayoutSegment();
+	const [tabScrollPos, setTabScrollPos] = useState<"start" | "middle" | "end">("start");
+
+	const handleTabScroll = useCallback((e: React.UIEvent<HTMLDivElement>) => {
+		const el = e.currentTarget;
+		const atStart = el.scrollLeft <= 2;
+		const atEnd = el.scrollWidth - el.scrollLeft - el.clientWidth <= 2;
+		setTabScrollPos(atStart ? "start" : atEnd ? "end" : "middle");
+	}, []);
+
+	const navItems = useMemo(
+		() => [
+			{
+				value: "general" as const,
+				label: t("nav_general"),
+				icon: <CircleUser className="h-4 w-4" />,
+			},
+			{
+				value: "roles" as const,
+				label: t("nav_role_assignments"),
+				icon: <ListChecks className="h-4 w-4" />,
+			},
+			{
+				value: "models" as const,
+				label: t("nav_agent_models"),
+				icon: <Bot className="h-4 w-4" />,
+			},
+			{
+				value: "image-models" as const,
+				label: t("nav_image_models"),
+				icon: <ImageIcon className="h-4 w-4" />,
+			},
+			{
+				value: "vision-models" as const,
+				label: t("nav_vision_models"),
+				icon: <ScanEye className="h-4 w-4" />,
+			},
+			{
+				value: "team-roles" as const,
+				label: t("nav_team_roles"),
+				icon: <UserKey className="h-4 w-4" />,
+			},
+			{
+				value: "prompts" as const,
+				label: t("nav_system_instructions"),
+				icon: <BookText className="h-4 w-4" />,
+			},
+			{
+				value: "team-memory" as const,
+				label: "Team Memory",
+				icon: <Brain className="h-4 w-4" />,
+			},
+			{
+				value: "public-links" as const,
+				label: t("nav_public_links"),
+				icon: <Earth className="h-4 w-4" />,
+			},
+		],
+		[t]
+	);
+
+	const activeTab: SearchSpaceSettingsTab =
+		segment && navItems.some((item) => item.value === segment)
+			? (segment as SearchSpaceSettingsTab)
+			: DEFAULT_TAB;
+	const selectedLabel = navItems.find((item) => item.value === activeTab)?.label ?? t("title");
+
+	const hrefFor = (tab: SearchSpaceSettingsTab) =>
+		`/dashboard/${searchSpaceId}/search-space-settings/${tab}`;
+
+	return (
+		<section className="flex h-full min-h-[min(680px,calc(100vh-5rem))] w-full select-none flex-col gap-6 md:pt-6 md:flex-row">
+			<div className="md:w-[220px] md:shrink-0">
+				<h1 className="mb-4 px-1 text-2xl font-semibold tracking-tight">{t("title")}</h1>
+				<nav className="hidden flex-col gap-0.5 md:flex">
+					{navItems.map((item) => (
+						<Link
+							key={item.value}
+							href={hrefFor(item.value)}
+							replace
+							scroll={false}
+							prefetch
+							className={cn(
+								"inline-flex h-auto items-center justify-start gap-3 rounded-md px-3 py-2.5 text-left text-sm font-medium transition-colors duration-150 focus:outline-none focus-visible:outline-none",
+								activeTab === item.value
+									? "bg-accent text-accent-foreground"
+									: "bg-transparent text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+							)}
+						>
+							{item.icon}
+							{item.label}
+						</Link>
+					))}
+				</nav>
+				<div
+					className="overflow-x-auto border-b border-border pb-3 md:hidden"
+					onScroll={handleTabScroll}
+					style={{
+						maskImage: `linear-gradient(to right, ${tabScrollPos === "start" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${tabScrollPos === "end" ? "black" : "transparent"})`,
+						WebkitMaskImage: `linear-gradient(to right, ${tabScrollPos === "start" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${tabScrollPos === "end" ? "black" : "transparent"})`,
+					}}
+				>
+					<div className="flex gap-1">
+						{navItems.map((item) => (
+							<Link
+								key={item.value}
+								href={hrefFor(item.value)}
+								replace
+								scroll={false}
+								prefetch
+								className={cn(
+									"inline-flex h-auto shrink-0 items-center gap-2 rounded-md px-3 py-1.5 text-xs font-medium transition-colors duration-150 focus:outline-none focus-visible:outline-none",
+									activeTab === item.value
+										? "bg-accent text-accent-foreground"
+										: "bg-transparent text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+								)}
+							>
+								{item.icon}
+								{item.label}
+							</Link>
+						))}
+					</div>
+				</div>
+			</div>
+
+			<div className="min-w-0 flex-1">
+				<div className="hidden md:block">
+					<h2 className="text-lg font-semibold">{selectedLabel}</h2>
+					<Separator className="mt-4" />
+				</div>
+				<div className="min-w-0 pt-4 md:max-w-3xl">{children}</div>
+			</div>
+		</section>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/layout.tsx
new file mode 100644
index 0000000000..330158da7a
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/layout.tsx
@@ -0,0 +1,19 @@
+import type React from "react";
+import { use } from "react";
+import { SearchSpaceSettingsLayoutShell } from "./layout-shell";
+
+export default function SearchSpaceSettingsLayout({
+	params,
+	children,
+}: {
+	params: Promise<{ search_space_id: string }>;
+	children: React.ReactNode;
+}) {
+	const { search_space_id } = use(params);
+
+	return (
+		<SearchSpaceSettingsLayoutShell searchSpaceId={search_space_id}>
+			{children}
+		</SearchSpaceSettingsLayoutShell>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/models/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/models/page.tsx
new file mode 100644
index 0000000000..d681947821
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/models/page.tsx
@@ -0,0 +1,6 @@
+import { AgentModelManager } from "@/components/settings/agent-model-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <AgentModelManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/page.tsx
new file mode 100644
index 0000000000..27c59328b6
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/page.tsx
@@ -0,0 +1,10 @@
+import { redirect } from "next/navigation";
+
+export default async function SearchSpaceSettingsPage({
+	params,
+}: {
+	params: Promise<{ search_space_id: string }>;
+}) {
+	const { search_space_id } = await params;
+	redirect(`/dashboard/${search_space_id}/search-space-settings/general`);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/prompts/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/prompts/page.tsx
new file mode 100644
index 0000000000..cc837299d4
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/prompts/page.tsx
@@ -0,0 +1,6 @@
+import { PromptConfigManager } from "@/components/settings/prompt-config-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <PromptConfigManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/public-links/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/public-links/page.tsx
new file mode 100644
index 0000000000..2cddfe3e06
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/public-links/page.tsx
@@ -0,0 +1,6 @@
+import { PublicChatSnapshotsManager } from "@/components/public-chat-snapshots/public-chat-snapshots-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <PublicChatSnapshotsManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/roles/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/roles/page.tsx
new file mode 100644
index 0000000000..5bad50cd37
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/roles/page.tsx
@@ -0,0 +1,6 @@
+import { LLMRoleManager } from "@/components/settings/llm-role-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <LLMRoleManager key={search_space_id} searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/team-memory/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/team-memory/page.tsx
new file mode 100644
index 0000000000..0652b012e4
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/team-memory/page.tsx
@@ -0,0 +1,6 @@
+import { TeamMemoryManager } from "@/components/settings/team-memory-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <TeamMemoryManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/team-roles/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/team-roles/page.tsx
new file mode 100644
index 0000000000..a343eaacbe
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/team-roles/page.tsx
@@ -0,0 +1,6 @@
+import { RolesManager } from "@/components/settings/roles-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <RolesManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/vision-models/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/vision-models/page.tsx
new file mode 100644
index 0000000000..06aea003ac
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/search-space-settings/vision-models/page.tsx
@@ -0,0 +1,6 @@
+import { VisionModelManager } from "@/components/settings/vision-model-manager";
+
+export default async function Page({ params }: { params: Promise<{ search_space_id: string }> }) {
+	const { search_space_id } = await params;
+	return <VisionModelManager searchSpaceId={Number(search_space_id)} />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx
new file mode 100644
index 0000000000..c75eaf4e4f
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx
@@ -0,0 +1,15 @@
+import { TeamContent } from "./team-content";
+
+export default async function TeamPage({
+	params,
+}: {
+	params: Promise<{ search_space_id: string }>;
+}) {
+	const { search_space_id } = await params;
+
+	return (
+		<div className="w-full select-none space-y-6">
+			<TeamContent searchSpaceId={Number(search_space_id)} />
+		</div>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/team/team-content.tsx b/surfsense_web/app/dashboard/[search_space_id]/team/team-content.tsx
index d9ca9efb3c..f003dde1b4 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/team/team-content.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/team/team-content.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useQuery } from "@tanstack/react-query";
-import { useAtomValue, useSetAtom } from "jotai";
+import { useAtomValue } from "jotai";
 import {
 	Calendar,
 	Check,
@@ -20,6 +20,7 @@ import {
 	UserPlus,
 	Users,
 } from "lucide-react";
+import { useRouter } from "next/navigation";
 import { useCallback, useEffect, useMemo, useState } from "react";
 import { toast } from "sonner";
 import {
@@ -31,7 +32,6 @@ import {
 	updateMemberMutationAtom,
 } from "@/atoms/members/members-mutation.atoms";
 import { membersAtom, myAccessAtom } from "@/atoms/members/members-query.atoms";
-import { searchSpaceSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
 import {
 	AlertDialog,
 	AlertDialogAction,
@@ -240,46 +240,77 @@ export function TeamContent({ searchSpaceId }: TeamContentProps) {
 
 	if (accessLoading || membersLoading) {
 		return (
-			<div className="space-y-6">
-				<div className="flex items-center justify-between">
-					<Skeleton className="h-9 w-36 rounded-md" />
-					<Skeleton className="h-4 w-20" />
+			<div className="space-y-4 md:space-y-6">
+				<div className="flex items-center gap-2 flex-wrap">
+					<Button
+						type="button"
+						variant="outline"
+						size="sm"
+						aria-disabled="true"
+						tabIndex={-1}
+						className="pointer-events-none gap-1.5 md:gap-2 text-xs md:text-sm bg-black text-white dark:bg-white dark:text-black"
+					>
+						<UserPlus className="h-3.5 w-3.5 md:h-4 md:w-4" />
+						Invite members
+					</Button>
+					<Button
+						type="button"
+						variant="secondary"
+						size="sm"
+						aria-disabled="true"
+						tabIndex={-1}
+						className="pointer-events-none gap-1.5 md:gap-2 text-xs md:text-sm"
+					>
+						<Link2 className="h-3.5 w-3.5 md:h-4 md:w-4 rotate-315" />
+						Active invites
+						<span className="inline-flex items-center justify-center h-4 md:h-5 min-w-4 md:min-w-5 px-1 rounded-full bg-neutral-700 text-neutral-200">
+							<Skeleton className="h-2.5 w-2.5 rounded-sm bg-neutral-500/60" />
+						</span>
+					</Button>
+					<div className="flex items-center gap-1 text-xs md:text-sm text-muted-foreground whitespace-nowrap">
+						<Skeleton className="h-3 w-2 rounded-sm" />
+						members
+					</div>
 				</div>
-				<div className="rounded-lg border border-border/40 bg-background overflow-hidden">
+				<div className="rounded-lg border border-border/60 bg-accent overflow-hidden">
 					<Table className="table-fixed w-full">
 						<TableHeader>
-							<TableRow className="hover:bg-transparent border-b border-border/40">
-								<TableHead className="w-[45%] px-4 md:px-6 border-r border-border/40">
-									<Skeleton className="h-3 w-16" />
+							<TableRow className="hover:bg-transparent border-b border-border/60">
+								<TableHead className="w-[45%] px-4 md:px-6 border-r border-border/60">
+									<span className="flex items-center gap-1.5 text-sm font-medium text-muted-foreground/70">
+										<User size={14} className="opacity-60 text-muted-foreground" />
+										Name
+									</span>
 								</TableHead>
-								<TableHead className="hidden md:table-cell w-[25%] border-r border-border/40">
-									<Skeleton className="h-3 w-24" />
+								<TableHead className="hidden md:table-cell w-[25%] border-r border-border/60">
+									<span className="flex items-center gap-1.5 text-sm font-medium text-muted-foreground/70">
+										<Clock size={14} className="opacity-60 text-muted-foreground" />
+										Last logged in
+									</span>
 								</TableHead>
 								<TableHead className="w-[30%] px-4 md:px-6">
-									<div className="flex justify-end">
-										<Skeleton className="h-3 w-12" />
-									</div>
+									<span className="flex items-center gap-1.5 text-sm font-medium text-muted-foreground/70 justify-end">
+										<ShieldUser size={14} className="opacity-60 text-muted-foreground" />
+										Role
+									</span>
 								</TableHead>
 							</TableRow>
 						</TableHeader>
 						<TableBody>
-							{SKELETON_KEYS.map((id) => (
-								<TableRow key={id} className="border-b border-border/40 hover:bg-transparent">
-									<TableCell className="w-[45%] py-2.5 px-4 md:px-6 border-r border-border/40">
+							{SKELETON_KEYS.slice(0, 2).map((id) => (
+								<TableRow key={id} className="border-b border-border/60 hover:bg-transparent">
+									<TableCell className="w-[45%] py-2.5 px-4 md:px-6 border-r border-border/60">
 										<div className="flex items-center gap-3">
 											<Skeleton className="h-10 w-10 rounded-full shrink-0" />
-											<div className="flex-1 min-w-0 space-y-1.5">
-												<Skeleton className="h-4 w-[60%]" />
-												<Skeleton className="h-3 w-[40%]" />
-											</div>
+											<Skeleton className="h-4 w-28 md:w-32" />
 										</div>
 									</TableCell>
-									<TableCell className="hidden md:table-cell w-[25%] py-2.5 border-r border-border/40">
+									<TableCell className="hidden md:table-cell w-[25%] py-2.5 border-r border-border/60">
 										<Skeleton className="h-4 w-24" />
 									</TableCell>
 									<TableCell className="w-[30%] py-2.5 px-4 md:px-6">
 										<div className="flex justify-end">
-											<Skeleton className="h-4 w-16" />
+											<Skeleton className="h-4 w-12" />
 										</div>
 									</TableCell>
 								</TableRow>
@@ -294,41 +325,63 @@ export function TeamContent({ searchSpaceId }: TeamContentProps) {
 	return (
 		<div className="space-y-4 md:space-y-6">
 			<div className="flex items-center gap-2 flex-wrap">
-				{rolesLoading ? (
-					<Skeleton className="h-9 w-32 rounded-md" />
-				) : (
-					canInvite && (
+				{canInvite &&
+					(rolesLoading ? (
+						<Button
+							type="button"
+							variant="outline"
+							size="sm"
+							aria-disabled="true"
+							tabIndex={-1}
+							className="pointer-events-none gap-1.5 md:gap-2 text-xs md:text-sm bg-black text-white dark:bg-white dark:text-black"
+						>
+							<UserPlus className="h-3.5 w-3.5 md:h-4 md:w-4" />
+							Invite members
+						</Button>
+					) : (
 						<CreateInviteDialog
 							roles={roles}
 							onCreateInvite={handleCreateInvite}
 							searchSpaceId={searchSpaceId}
 						/>
-					)
-				)}
-				{invitesLoading ? (
-					<Skeleton className="h-9 w-32 rounded-md" />
-				) : (
-					canInvite &&
-					activeInvites.length > 0 && (
-						<AllInvitesDialog invites={activeInvites} onRevokeInvite={handleRevokeInvite} />
-					)
-				)}
+					))}
+				{canInvite &&
+					(invitesLoading ? (
+						<Button
+							type="button"
+							variant="secondary"
+							size="sm"
+							aria-disabled="true"
+							tabIndex={-1}
+							className="pointer-events-none gap-1.5 md:gap-2 text-xs md:text-sm"
+						>
+							<Link2 className="h-3.5 w-3.5 md:h-4 md:w-4 rotate-315" />
+							Active invites
+							<span className="inline-flex items-center justify-center h-4 md:h-5 min-w-4 md:min-w-5 px-1 rounded-full bg-neutral-700 text-neutral-200">
+								<Skeleton className="h-2.5 w-2.5 rounded-sm bg-neutral-500/60" />
+							</span>
+						</Button>
+					) : (
+						activeInvites.length > 0 && (
+							<AllInvitesDialog invites={activeInvites} onRevokeInvite={handleRevokeInvite} />
+						)
+					))}
 				<p className="text-xs md:text-sm text-muted-foreground whitespace-nowrap">
 					{members.length} {members.length === 1 ? "member" : "members"}
 				</p>
 			</div>
 
-			<div className="rounded-lg border border-border/40 bg-background overflow-hidden">
+			<div className="rounded-lg border border-border/60 bg-accent overflow-hidden">
 				<Table className="table-fixed w-full">
 					<TableHeader>
-						<TableRow className="hover:bg-transparent border-b border-border/40">
-							<TableHead className="w-[45%] px-4 md:px-6 border-r border-border/40">
+						<TableRow className="hover:bg-transparent border-b border-border/60">
+							<TableHead className="w-[45%] px-4 md:px-6 border-r border-border/60">
 								<span className="flex items-center gap-1.5 text-sm font-medium text-muted-foreground/70">
 									<User size={14} className="opacity-60 text-muted-foreground" />
 									Name
 								</span>
 							</TableHead>
-							<TableHead className="hidden md:table-cell w-[25%] border-r border-border/40">
+							<TableHead className="hidden md:table-cell w-[25%] border-r border-border/60">
 								<span className="flex items-center gap-1.5 text-sm font-medium text-muted-foreground/70">
 									<Clock size={14} className="opacity-60 text-muted-foreground" />
 									Last logged in
@@ -346,6 +399,7 @@ export function TeamContent({ searchSpaceId }: TeamContentProps) {
 						{owners.map((member) => (
 							<MemberRow
 								key={`member-${member.id}`}
+								searchSpaceId={searchSpaceId}
 								member={member}
 								roles={roles}
 								canManageRoles={canManageRoles}
@@ -357,6 +411,7 @@ export function TeamContent({ searchSpaceId }: TeamContentProps) {
 						{paginatedMembers.map((member) => (
 							<MemberRow
 								key={`member-${member.id}`}
+								searchSpaceId={searchSpaceId}
 								member={member}
 								roles={roles}
 								canManageRoles={canManageRoles}
@@ -433,6 +488,7 @@ export function TeamContent({ searchSpaceId }: TeamContentProps) {
 }
 
 function MemberRow({
+	searchSpaceId,
 	member,
 	roles,
 	canManageRoles,
@@ -440,6 +496,7 @@ function MemberRow({
 	onUpdateRole,
 	onRemoveMember,
 }: {
+	searchSpaceId: number;
 	member: Membership;
 	roles: Role[];
 	canManageRoles: boolean;
@@ -447,21 +504,23 @@ function MemberRow({
 	onUpdateRole: (membershipId: number, roleId: number | null) => Promise<Membership>;
 	onRemoveMember: (membershipId: number) => Promise<boolean>;
 }) {
-	const setSearchSpaceSettingsDialog = useSetAtom(searchSpaceSettingsDialogAtom);
+	const router = useRouter();
 	const initials = getAvatarInitials(member);
 	const displayName = member.user_display_name || member.user_email || "Unknown";
 	const roleName = member.is_owner ? "Owner" : member.role?.name || "No role";
 	const showActions = !member.is_owner && (canManageRoles || canRemove);
 
 	return (
-		<TableRow className="border-b border-border/40 transition-colors hover:bg-muted/30">
-			<TableCell className="w-[45%] py-2.5 px-4 md:px-6 max-w-0 border-r border-border/40">
+		<TableRow className="border-b border-border/60 transition-colors hover:bg-accent hover:text-accent-foreground">
+			<TableCell className="w-[45%] py-2.5 px-4 md:px-6 max-w-0 border-r border-border/60">
 				<div className="flex items-center gap-3">
 					<Avatar className="size-10 shrink-0">
 						{member.user_avatar_url && (
 							<AvatarImage src={member.user_avatar_url} alt={displayName} />
 						)}
-						<AvatarFallback className="text-sm">{initials}</AvatarFallback>
+						<AvatarFallback className="bg-popover text-sm text-popover-foreground">
+							{initials}
+						</AvatarFallback>
 					</Avatar>
 					<div className="min-w-0">
 						<p className="font-medium text-sm truncate select-text">{displayName}</p>
@@ -474,7 +533,7 @@ function MemberRow({
 				</div>
 			</TableCell>
 
-			<TableCell className="hidden md:table-cell w-[25%] py-2.5 text-sm text-foreground border-r border-border/40">
+			<TableCell className="hidden md:table-cell w-[25%] py-2.5 text-sm text-foreground border-r border-border/60">
 				{member.user_last_login ? formatRelativeDate(member.user_last_login) : "Never"}
 			</TableCell>
 
@@ -482,18 +541,20 @@ function MemberRow({
 				{showActions ? (
 					<DropdownMenu>
 						<DropdownMenuTrigger asChild>
-							<button
+							<Button
 								type="button"
-								className="inline-flex items-center gap-1.5 text-sm text-muted-foreground hover:text-foreground transition-colors"
+								variant="ghost"
+								size="sm"
+								className="h-auto w-[74px] justify-end gap-1.5 px-0 py-0 text-sm text-muted-foreground hover:bg-transparent hover:text-accent-foreground has-[>svg]:px-0"
 							>
 								{roleName}
 								<ChevronDown className="h-4 w-4" />
-							</button>
+							</Button>
 						</DropdownMenuTrigger>
 						<DropdownMenuContent
 							align="end"
 							onCloseAutoFocus={(e) => e.preventDefault()}
-							className="min-w-[120px] dark:bg-neutral-900 dark:border dark:border-white/5"
+							className="min-w-[120px]"
 						>
 							{canManageRoles &&
 								roles
@@ -536,13 +597,10 @@ function MemberRow({
 									</AlertDialogContent>
 								</AlertDialog>
 							)}
-							<DropdownMenuSeparator className="dark:bg-white/5" />
+							<DropdownMenuSeparator className="bg-popover-border" />
 							<DropdownMenuItem
 								onClick={() =>
-									setSearchSpaceSettingsDialog({
-										open: true,
-										initialTab: "team-roles",
-									})
+									router.push(`/dashboard/${searchSpaceId}/search-space-settings/team-roles`)
 								}
 							>
 								Manage Roles
@@ -707,7 +765,7 @@ function CreateInviteDialog({
 							<div className="space-y-2">
 								<Label htmlFor="invite-role">Role</Label>
 								<Select value={roleId} onValueChange={setRoleId}>
-									<SelectTrigger>
+									<SelectTrigger className="border-popover-border">
 										<SelectValue placeholder="Assign a role" />
 									</SelectTrigger>
 									<SelectContent>
@@ -743,7 +801,7 @@ function CreateInviteDialog({
 											<Button
 												variant="outline"
 												className={cn(
-													"w-full justify-start text-left font-normal bg-transparent",
+													"w-full justify-start text-left font-normal bg-transparent border-popover-border",
 													!expiresAt && "text-muted-foreground"
 												)}
 											>
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/agent-permissions/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/agent-permissions/page.tsx
new file mode 100644
index 0000000000..2b925bd290
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/agent-permissions/page.tsx
@@ -0,0 +1,5 @@
+import { AgentPermissionsContent } from "../components/AgentPermissionsContent";
+
+export default function Page() {
+	return <AgentPermissionsContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/agent-status/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/agent-status/page.tsx
new file mode 100644
index 0000000000..dc5c61d2aa
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/agent-status/page.tsx
@@ -0,0 +1,5 @@
+import { AgentStatusContent } from "../components/AgentStatusContent";
+
+export default function Page() {
+	return <AgentStatusContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/api-key/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/api-key/page.tsx
new file mode 100644
index 0000000000..e59e500420
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/api-key/page.tsx
@@ -0,0 +1,5 @@
+import { ApiKeyContent } from "../components/ApiKeyContent";
+
+export default function Page() {
+	return <ApiKeyContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/community-prompts/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/community-prompts/page.tsx
new file mode 100644
index 0000000000..e962cbed2d
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/community-prompts/page.tsx
@@ -0,0 +1,5 @@
+import { CommunityPromptsContent } from "../components/CommunityPromptsContent";
+
+export default function Page() {
+	return <CommunityPromptsContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/AgentPermissionsContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/AgentPermissionsContent.tsx
new file mode 100644
index 0000000000..5af94f7e3d
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/AgentPermissionsContent.tsx
@@ -0,0 +1,485 @@
+"use client";
+
+import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
+import { useAtomValue } from "jotai";
+import { AlertTriangle, Info, ShieldCheck, Trash2 } from "lucide-react";
+import { useCallback, useMemo, useState } from "react";
+import { toast } from "sonner";
+import { agentFlagsAtom } from "@/atoms/agent/agent-flags-query.atom";
+import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import {
+	AlertDialog,
+	AlertDialogAction,
+	AlertDialogCancel,
+	AlertDialogContent,
+	AlertDialogDescription,
+	AlertDialogFooter,
+	AlertDialogHeader,
+	AlertDialogTitle,
+} from "@/components/ui/alert-dialog";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Card, CardContent } from "@/components/ui/card";
+import {
+	Dialog,
+	DialogContent,
+	DialogDescription,
+	DialogFooter,
+	DialogHeader,
+	DialogTitle,
+} from "@/components/ui/dialog";
+import { Input } from "@/components/ui/input";
+import { Label } from "@/components/ui/label";
+import {
+	Select,
+	SelectContent,
+	SelectItem,
+	SelectTrigger,
+	SelectValue,
+} from "@/components/ui/select";
+import { Skeleton } from "@/components/ui/skeleton";
+import { Spinner } from "@/components/ui/spinner";
+import {
+	type AgentPermissionAction,
+	type AgentPermissionRule,
+	type AgentPermissionRuleCreate,
+	agentPermissionsApiService,
+} from "@/lib/apis/agent-permissions-api.service";
+import { AppError } from "@/lib/error";
+import { formatRelativeDate } from "@/lib/format-date";
+import { cn } from "@/lib/utils";
+
+const ACTION_DESCRIPTIONS: Record<AgentPermissionAction, string> = {
+	allow: "Always run without prompting",
+	deny: "Block silently",
+	ask: "Pause and ask for approval",
+};
+
+const ACTION_BADGE: Record<AgentPermissionAction, { label: string; className: string }> = {
+	allow: { label: "Allow", className: "bg-emerald-500/10 text-emerald-600 border-emerald-500/30" },
+	deny: { label: "Deny", className: "bg-destructive/10 text-destructive border-destructive/30" },
+	ask: { label: "Ask", className: "bg-amber-500/10 text-amber-600 border-amber-500/30" },
+};
+
+const EMPTY_FORM: AgentPermissionRuleCreate = {
+	permission: "",
+	pattern: "*",
+	action: "ask",
+	user_id: null,
+	thread_id: null,
+};
+
+function permissionRulesQueryKey(searchSpaceId: number) {
+	return ["agent-permission-rules", searchSpaceId] as const;
+}
+
+function ScopeBadge({ rule }: { rule: AgentPermissionRule }) {
+	if (rule.thread_id !== null) {
+		return (
+			<Badge
+				variant="secondary"
+				className="text-[10px] px-1.5 py-0.5 border-0 text-muted-foreground bg-muted"
+			>
+				Thread #{rule.thread_id}
+			</Badge>
+		);
+	}
+	if (rule.user_id !== null) {
+		return (
+			<Badge
+				variant="secondary"
+				className="text-[10px] px-1.5 py-0.5 border-0 text-muted-foreground bg-muted"
+			>
+				User-specific
+			</Badge>
+		);
+	}
+	return (
+		<Badge
+			variant="secondary"
+			className="text-[10px] px-1.5 py-0.5 border-0 text-muted-foreground bg-muted"
+		>
+			Search space
+		</Badge>
+	);
+}
+
+export function AgentPermissionsContent() {
+	const searchSpaceIdRaw = useAtomValue(activeSearchSpaceIdAtom);
+	const searchSpaceId = searchSpaceIdRaw ? Number(searchSpaceIdRaw) : null;
+
+	const { data: flags } = useAtomValue(agentFlagsAtom);
+	const featureEnabled = !!flags?.enable_permission && !flags?.disable_new_agent_stack;
+
+	const queryClient = useQueryClient();
+
+	const {
+		data: rules,
+		isLoading,
+		isError,
+		error,
+	} = useQuery({
+		queryKey: searchSpaceId
+			? permissionRulesQueryKey(searchSpaceId)
+			: ["agent-permission-rules", "none"],
+		queryFn: () => agentPermissionsApiService.list(searchSpaceId as number),
+		enabled: !!searchSpaceId && featureEnabled,
+		staleTime: 60 * 1000,
+	});
+
+	const createMutation = useMutation({
+		mutationFn: (payload: AgentPermissionRuleCreate) =>
+			agentPermissionsApiService.create(searchSpaceId as number, payload),
+		onSuccess: () => {
+			toast.success("Rule created.");
+			queryClient.invalidateQueries({
+				queryKey: permissionRulesQueryKey(searchSpaceId as number),
+			});
+		},
+		onError: (err: unknown) => {
+			toast.error(err instanceof Error ? err.message : "Failed to create rule.");
+		},
+	});
+
+	const updateMutation = useMutation({
+		mutationFn: (params: { ruleId: number; action: AgentPermissionAction; pattern?: string }) =>
+			agentPermissionsApiService.update(searchSpaceId as number, params.ruleId, {
+				action: params.action,
+				pattern: params.pattern,
+			}),
+		onSuccess: () => {
+			queryClient.invalidateQueries({
+				queryKey: permissionRulesQueryKey(searchSpaceId as number),
+			});
+		},
+		onError: (err: unknown) => {
+			toast.error(err instanceof Error ? err.message : "Failed to update rule.");
+		},
+	});
+
+	const deleteMutation = useMutation({
+		mutationFn: (ruleId: number) =>
+			agentPermissionsApiService.remove(searchSpaceId as number, ruleId),
+		onSuccess: () => {
+			toast.success("Rule deleted.");
+			queryClient.invalidateQueries({
+				queryKey: permissionRulesQueryKey(searchSpaceId as number),
+			});
+		},
+		onError: (err: unknown) => {
+			toast.error(err instanceof Error ? err.message : "Failed to delete rule.");
+		},
+	});
+
+	const [showForm, setShowForm] = useState(false);
+	const [formData, setFormData] = useState<AgentPermissionRuleCreate>(EMPTY_FORM);
+	const [deleteTarget, setDeleteTarget] = useState<number | null>(null);
+
+	const sortedRules = useMemo(() => rules ?? [], [rules]);
+
+	const handleCreate = useCallback(async () => {
+		if (!formData.permission.trim()) {
+			toast.error("Permission is required.");
+			return;
+		}
+		try {
+			await createMutation.mutateAsync({
+				...formData,
+				permission: formData.permission.trim(),
+				pattern: formData.pattern.trim() || "*",
+			});
+			setFormData(EMPTY_FORM);
+			setShowForm(false);
+		} catch (err) {
+			if (err instanceof AppError && err.message) {
+				// already toasted by onError
+			}
+		}
+	}, [createMutation, formData]);
+
+	const handleConfirmDelete = useCallback(async () => {
+		if (deleteTarget === null) return;
+		try {
+			await deleteMutation.mutateAsync(deleteTarget);
+		} finally {
+			setDeleteTarget(null);
+		}
+	}, [deleteMutation, deleteTarget]);
+
+	if (!featureEnabled) {
+		return (
+			<Alert>
+				<Info />
+				<AlertTitle>Permission middleware is disabled</AlertTitle>
+				<AlertDescription>
+					<p>
+						Flip{" "}
+						<code className="rounded bg-popover px-1 py-0.5 text-[10px] text-popover-foreground">
+							SURFSENSE_ENABLE_PERMISSION
+						</code>{" "}
+						on the backend to manage allow/deny/ask rules from this panel.
+					</p>
+				</AlertDescription>
+			</Alert>
+		);
+	}
+
+	if (!searchSpaceId) {
+		return (
+			<p className="text-sm text-muted-foreground">Open a search space to manage agent rules.</p>
+		);
+	}
+
+	return (
+		<div className="min-w-0 space-y-6 overflow-visible">
+			<div className="flex items-start justify-between gap-3">
+				<div className="space-y-1">
+					<p className="text-sm text-muted-foreground">
+						Tell the agent which tools to allow, deny, or ask before running. Rules use wildcard
+						patterns and are evaluated at the most specific scope first.
+					</p>
+				</div>
+				<Button
+					size="sm"
+					onClick={() => {
+						setShowForm(true);
+						setFormData(EMPTY_FORM);
+					}}
+					className="shrink-0 gap-1.5"
+				>
+					New rule
+				</Button>
+			</div>
+
+			<Dialog
+				open={showForm}
+				onOpenChange={(open) => {
+					setShowForm(open);
+					if (!open) setFormData(EMPTY_FORM);
+				}}
+			>
+				<DialogContent className="max-w-lg bg-popover text-popover-foreground">
+					<DialogHeader>
+						<DialogTitle>New permission rule</DialogTitle>
+						<DialogDescription>
+							Tell the agent whether matching tool calls should be allowed, denied, or paused for
+							approval.
+						</DialogDescription>
+					</DialogHeader>
+
+					<div className="space-y-4">
+						<div className="grid gap-3">
+							<div className="space-y-2">
+								<Label htmlFor="permission-name">Permission</Label>
+								<Input
+									id="permission-name"
+									value={formData.permission}
+									placeholder="e.g. tool:create_linear_issue or tool:*"
+									onChange={(e) => setFormData((p) => ({ ...p, permission: e.target.value }))}
+								/>
+								<p className="text-[11px] text-muted-foreground">
+									Match a tool capability. Use <code className="font-mono">*</code> for wildcards.
+								</p>
+							</div>
+
+							<div className="space-y-2">
+								<Label htmlFor="pattern">Argument pattern</Label>
+								<Input
+									id="pattern"
+									value={formData.pattern}
+									placeholder="*"
+									onChange={(e) => setFormData((p) => ({ ...p, pattern: e.target.value }))}
+								/>
+								<p className="text-[11px] text-muted-foreground">
+									Wildcard against the canonical argument (e.g. <code>prod-*</code>).
+								</p>
+							</div>
+						</div>
+
+						<div className="space-y-2">
+							<Label>Action</Label>
+							<Select
+								value={formData.action}
+								onValueChange={(value) =>
+									setFormData((p) => ({ ...p, action: value as AgentPermissionAction }))
+								}
+							>
+								<SelectTrigger>
+									<SelectValue />
+								</SelectTrigger>
+								<SelectContent>
+									<SelectItem value="allow">Allow (run without asking)</SelectItem>
+									<SelectItem value="ask">Ask (pause for approval)</SelectItem>
+									<SelectItem value="deny">Deny (block silently)</SelectItem>
+								</SelectContent>
+							</Select>
+							<p className="text-[11px] text-muted-foreground">
+								{ACTION_DESCRIPTIONS[formData.action]}
+							</p>
+						</div>
+					</div>
+
+					<DialogFooter>
+						<Button
+							type="button"
+							variant="secondary"
+							size="sm"
+							onClick={() => {
+								setShowForm(false);
+								setFormData(EMPTY_FORM);
+							}}
+							disabled={createMutation.isPending}
+							className="text-sm h-9"
+						>
+							Cancel
+						</Button>
+						<Button
+							size="sm"
+							onClick={handleCreate}
+							disabled={createMutation.isPending || !formData.permission.trim()}
+							className="relative text-sm h-9 min-w-[96px]"
+						>
+							<span className={createMutation.isPending ? "opacity-0" : ""}>Create</span>
+							{createMutation.isPending && <Spinner size="sm" className="absolute" />}
+						</Button>
+					</DialogFooter>
+				</DialogContent>
+			</Dialog>
+
+			{isLoading && (
+				<div className="-m-1 space-y-2 p-1">
+					{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
+						<Card key={key} className="border-accent bg-accent/20">
+							<CardContent className="p-4 flex flex-col gap-3 min-h-24">
+								<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+								<Skeleton className="h-3 w-full bg-accent" />
+								<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
+							</CardContent>
+						</Card>
+					))}
+				</div>
+			)}
+
+			{isError && (
+				<Alert variant="destructive">
+					<AlertTriangle />
+					<AlertTitle>Failed to load rules</AlertTitle>
+					<AlertDescription>
+						{error instanceof Error ? error.message : "Unknown error."}
+					</AlertDescription>
+				</Alert>
+			)}
+
+			{!isLoading && !isError && sortedRules.length === 0 && !showForm && (
+				<div className="rounded-lg border border-dashed border-border/60 p-8 text-center">
+					<ShieldCheck className="mx-auto size-8 text-muted-foreground/40" />
+					<p className="mt-2 text-sm text-muted-foreground">No rules yet</p>
+					<p className="text-xs text-muted-foreground/60">
+						Without rules the agent uses the deployment default for every tool.
+					</p>
+				</div>
+			)}
+
+			{!isLoading && !isError && sortedRules.length > 0 && (
+				<div className="-m-1 space-y-2 p-1">
+					{sortedRules.map((rule) => {
+						const badge = ACTION_BADGE[rule.action];
+						const isUpdating =
+							updateMutation.isPending && updateMutation.variables?.ruleId === rule.id;
+						const isDeleting = deleteMutation.isPending && deleteMutation.variables === rule.id;
+
+						return (
+							<Card
+								key={rule.id}
+								className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full"
+							>
+								<CardContent className="p-4 flex items-center justify-between gap-3 h-full">
+									<div className="flex min-w-0 flex-1 flex-col gap-1.5">
+										<div className="flex flex-wrap items-center gap-1.5">
+											<code className="truncate font-mono text-sm font-medium text-foreground">
+												{rule.permission}
+											</code>
+											{rule.pattern !== "*" && (
+												<span className="text-xs text-muted-foreground">
+													→ <code className="font-mono">{rule.pattern}</code>
+												</span>
+											)}
+											<ScopeBadge rule={rule} />
+										</div>
+										<p className="text-[11px] text-muted-foreground">
+											Created {formatRelativeDate(rule.created_at)}
+										</p>
+									</div>
+
+									<div className="flex shrink-0 items-center self-center gap-1">
+										<Select
+											value={rule.action}
+											onValueChange={(value) =>
+												updateMutation.mutate({
+													ruleId: rule.id,
+													action: value as AgentPermissionAction,
+												})
+											}
+											disabled={isUpdating || isDeleting}
+										>
+											<SelectTrigger
+												className={cn("h-8 gap-1 border px-2 text-[11px]", badge.className)}
+											>
+												<SelectValue>
+													<span className="flex items-center gap-1">{badge.label}</span>
+												</SelectValue>
+											</SelectTrigger>
+											<SelectContent>
+												<SelectItem value="allow">Allow</SelectItem>
+												<SelectItem value="ask">Ask</SelectItem>
+												<SelectItem value="deny">Deny</SelectItem>
+											</SelectContent>
+										</Select>
+
+										<Button
+											size="sm"
+											variant="ghost"
+											className="h-7 w-7 rounded-lg p-0 text-muted-foreground hover:text-destructive"
+											onClick={() => setDeleteTarget(rule.id)}
+											disabled={isUpdating || isDeleting}
+											aria-label="Delete rule"
+										>
+											<Trash2 className="size-3.5" />
+										</Button>
+									</div>
+								</CardContent>
+							</Card>
+						);
+					})}
+				</div>
+			)}
+
+			<AlertDialog
+				open={deleteTarget !== null}
+				onOpenChange={(open) => !open && setDeleteTarget(null)}
+			>
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Delete this rule?</AlertDialogTitle>
+						<AlertDialogDescription>
+							The agent will fall back to deployment defaults for matching tool calls.
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel disabled={deleteMutation.isPending}>Cancel</AlertDialogCancel>
+						<AlertDialogAction
+							onClick={(e) => {
+								e.preventDefault();
+								handleConfirmDelete();
+							}}
+							disabled={deleteMutation.isPending}
+						>
+							{deleteMutation.isPending ? "Deleting…" : "Delete"}
+						</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/AgentStatusContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/AgentStatusContent.tsx
new file mode 100644
index 0000000000..fd7be1a23c
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/AgentStatusContent.tsx
@@ -0,0 +1,327 @@
+"use client";
+
+import { useAtomValue } from "jotai";
+import { AlertTriangle, CircleCheck, CircleSlash, Info } from "lucide-react";
+import { Fragment, useMemo } from "react";
+import { agentFlagsAtom } from "@/atoms/agent/agent-flags-query.atom";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { Badge } from "@/components/ui/badge";
+import { Separator } from "@/components/ui/separator";
+import { Skeleton } from "@/components/ui/skeleton";
+import type { AgentFeatureFlags } from "@/lib/apis/agent-flags-api.service";
+import { cn } from "@/lib/utils";
+
+type FlagKey = keyof AgentFeatureFlags;
+
+interface FlagDef {
+	key: FlagKey;
+	label: string;
+	description: string;
+	envVar: string;
+}
+
+interface FlagGroup {
+	id: string;
+	title: string;
+	subtitle: string;
+	flags: FlagDef[];
+}
+
+const FLAG_GROUPS: FlagGroup[] = [
+	{
+		id: "tier1",
+		title: "Tier 1 — Agent quality",
+		subtitle: "Context editing, retries, fallbacks, doom-loop, tool-call repair.",
+		flags: [
+			{
+				key: "enable_context_editing",
+				label: "Context editing",
+				description: "Trim tool outputs and spill old text into backend storage.",
+				envVar: "SURFSENSE_ENABLE_CONTEXT_EDITING",
+			},
+			{
+				key: "enable_compaction_v2",
+				label: "Compaction v2",
+				description: "SurfSense-aware compaction replacing safe summarization.",
+				envVar: "SURFSENSE_ENABLE_COMPACTION_V2",
+			},
+			{
+				key: "enable_retry_after",
+				label: "Retry-After",
+				description: "Honour rate-limit retry-after headers automatically.",
+				envVar: "SURFSENSE_ENABLE_RETRY_AFTER",
+			},
+			{
+				key: "enable_model_fallback",
+				label: "Model fallback",
+				description: "Fail over to a backup model on persistent errors.",
+				envVar: "SURFSENSE_ENABLE_MODEL_FALLBACK",
+			},
+			{
+				key: "enable_model_call_limit",
+				label: "Model call limit",
+				description: "Cap total model calls per turn to prevent budget run-aways.",
+				envVar: "SURFSENSE_ENABLE_MODEL_CALL_LIMIT",
+			},
+			{
+				key: "enable_tool_call_limit",
+				label: "Tool call limit",
+				description: "Cap total tool calls per turn.",
+				envVar: "SURFSENSE_ENABLE_TOOL_CALL_LIMIT",
+			},
+			{
+				key: "enable_tool_call_repair",
+				label: "Tool-call name repair",
+				description: "Recover from lower-cased / fuzzy tool names emitted by smaller models.",
+				envVar: "SURFSENSE_ENABLE_TOOL_CALL_REPAIR",
+			},
+			{
+				key: "enable_doom_loop",
+				label: "Doom-loop detection",
+				description: "Detect repeated identical tool calls and ask the user to confirm.",
+				envVar: "SURFSENSE_ENABLE_DOOM_LOOP",
+			},
+		],
+	},
+	{
+		id: "tier2",
+		title: "Tier 2 — Safety",
+		subtitle: "Permission rules, busy-mutex, smarter tool selection.",
+		flags: [
+			{
+				key: "enable_permission",
+				label: "Permission middleware",
+				description: "Apply allow/deny/ask rules from the Agent Permissions tab.",
+				envVar: "SURFSENSE_ENABLE_PERMISSION",
+			},
+			{
+				key: "enable_busy_mutex",
+				label: "Busy mutex",
+				description: "Prevent two concurrent runs from corrupting the same thread.",
+				envVar: "SURFSENSE_ENABLE_BUSY_MUTEX",
+			},
+			{
+				key: "enable_llm_tool_selector",
+				label: "LLM tool selector",
+				description: "Use a smaller model to pre-filter the tool list per turn.",
+				envVar: "SURFSENSE_ENABLE_LLM_TOOL_SELECTOR",
+			},
+		],
+	},
+	{
+		id: "tier4",
+		title: "Tier 4 — Skills + subagents",
+		subtitle: "Built-in skills, specialized subagents, KB planner runnable.",
+		flags: [
+			{
+				key: "enable_skills",
+				label: "Skills",
+				description: "Load on-demand skill packs (kb-research, report-writing, …).",
+				envVar: "SURFSENSE_ENABLE_SKILLS",
+			},
+			{
+				key: "enable_specialized_subagents",
+				label: "Specialized subagents",
+				description: "Spin up explore / report_writer / connector_negotiator subagents.",
+				envVar: "SURFSENSE_ENABLE_SPECIALIZED_SUBAGENTS",
+			},
+			{
+				key: "enable_kb_planner_runnable",
+				label: "KB planner runnable",
+				description: "Compile a private planner sub-agent for KB search.",
+				envVar: "SURFSENSE_ENABLE_KB_PLANNER_RUNNABLE",
+			},
+		],
+	},
+	{
+		id: "tier5",
+		title: "Tier 5 — Audit + revert",
+		subtitle: "Action log + revert route used by the Agent Actions dialog.",
+		flags: [
+			{
+				key: "enable_action_log",
+				label: "Action log",
+				description: "Persist every tool call to agent_action_log.",
+				envVar: "SURFSENSE_ENABLE_ACTION_LOG",
+			},
+			{
+				key: "enable_revert_route",
+				label: "Revert route",
+				description: "Allow reverting reversible actions from the action log.",
+				envVar: "SURFSENSE_ENABLE_REVERT_ROUTE",
+			},
+		],
+	},
+	{
+		id: "tier6",
+		title: "Tier 6 — Plugins",
+		subtitle: "Optional middleware loaded from entry points.",
+		flags: [
+			{
+				key: "enable_plugin_loader",
+				label: "Plugin loader",
+				description: "Load surfsense.plugins entry-point middleware.",
+				envVar: "SURFSENSE_ENABLE_PLUGIN_LOADER",
+			},
+		],
+	},
+	{
+		id: "obs",
+		title: "Observability",
+		subtitle: "Telemetry pipelines (orthogonal to feature gating).",
+		flags: [
+			{
+				key: "enable_otel",
+				label: "OpenTelemetry",
+				description: "Emit OTel spans (also requires OTEL_EXPORTER_OTLP_ENDPOINT).",
+				envVar: "SURFSENSE_ENABLE_OTEL",
+			},
+		],
+	},
+	{
+		id: "desktop",
+		title: "Desktop",
+		subtitle: "Desktop-only capabilities exposed by the backend deployment.",
+		flags: [
+			{
+				key: "enable_desktop_local_filesystem",
+				label: "Local filesystem",
+				description: "Allow Desktop chat sessions to operate directly on selected local folders.",
+				envVar: "ENABLE_DESKTOP_LOCAL_FILESYSTEM",
+			},
+		],
+	},
+];
+
+function FlagRow({ def, value }: { def: FlagDef; value: boolean }) {
+	return (
+		<div className="flex items-start justify-between gap-4 py-3">
+			<div className="flex min-w-0 flex-1 flex-col gap-1">
+				<div className="flex flex-wrap items-center gap-2">
+					<span className="text-sm font-medium">{def.label}</span>
+					<code className="rounded bg-muted px-1.5 py-0.5 font-mono text-[10px] text-muted-foreground">
+						{def.envVar}
+					</code>
+				</div>
+				<p className="text-xs text-muted-foreground">{def.description}</p>
+			</div>
+			<Badge
+				variant={value ? "default" : "secondary"}
+				className={cn(
+					"shrink-0 gap-1",
+					value
+						? "border-emerald-500/30 bg-emerald-500/10 text-emerald-600"
+						: "text-muted-foreground"
+				)}
+			>
+				{value ? <CircleCheck className="size-3" /> : <CircleSlash className="size-3" />}
+				{value ? "On" : "Off"}
+			</Badge>
+		</div>
+	);
+}
+
+export function AgentStatusContent() {
+	const { data: flags, isLoading, isError, error } = useAtomValue(agentFlagsAtom);
+
+	const enabledCount = useMemo(() => {
+		if (!flags) return 0;
+		return Object.entries(flags).filter(([k, v]) => k !== "disable_new_agent_stack" && v === true)
+			.length;
+	}, [flags]);
+
+	if (isLoading) {
+		return (
+			<div className="flex flex-col gap-3">
+				<Skeleton className="h-12 w-full rounded-md" />
+				<Skeleton className="h-32 w-full rounded-md" />
+				<Skeleton className="h-32 w-full rounded-md" />
+			</div>
+		);
+	}
+
+	if (isError || !flags) {
+		return (
+			<Alert variant="destructive">
+				<AlertTriangle />
+				<AlertTitle>Failed to load agent status</AlertTitle>
+				<AlertDescription>
+					{error instanceof Error ? error.message : "Unknown error."}
+				</AlertDescription>
+			</Alert>
+		);
+	}
+
+	const masterOff = flags.disable_new_agent_stack;
+
+	return (
+		<div className="space-y-6">
+			{masterOff ? (
+				<Alert variant="destructive">
+					<AlertTriangle />
+					<AlertTitle>Master kill-switch is on</AlertTitle>
+					<AlertDescription>
+						<p>
+							Showing that{" "}
+							<code className="rounded bg-muted px-1 text-[10px]">
+								SURFSENSE_DISABLE_NEW_AGENT_STACK=true
+							</code>
+							, which forces every new middleware off, regardless of the individual flags below.
+							Restart the backend after changing it.
+						</p>
+					</AlertDescription>
+				</Alert>
+			) : (
+				<Alert>
+					<Info />
+					<AlertTitle className="flex items-center gap-2">
+						Agent stack
+						<Badge
+							variant="secondary"
+							className="rounded bg-popover px-1 py-0.5 text-[9px] text-popover-foreground"
+						>
+							{enabledCount} on
+						</Badge>
+					</AlertTitle>
+					<AlertDescription>
+						<p>
+							Showing a read-only mirror of the backend's <code>AgentFeatureFlags</code>. Flip an
+							env var and restart the backend to change a value.
+						</p>
+					</AlertDescription>
+				</Alert>
+			)}
+
+			{FLAG_GROUPS.map((group, groupIdx) => {
+				const allOff = group.flags.every((f) => !flags[f.key]);
+				return (
+					<div key={group.id}>
+						{groupIdx > 0 && <Separator className="my-4 bg-border" />}
+						<div className="rounded-lg border border-border/60 bg-card">
+							<div className="flex items-start justify-between gap-3 px-4 py-3">
+								<div>
+									<p className="text-sm font-semibold">{group.title}</p>
+									<p className="text-xs text-muted-foreground">{group.subtitle}</p>
+								</div>
+								{allOff && (
+									<Badge variant="outline" className="text-[10px] text-muted-foreground">
+										all off
+									</Badge>
+								)}
+							</div>
+							<Separator className="bg-border" />
+							<div className="px-4">
+								{group.flags.map((def, flagIdx) => (
+									<Fragment key={def.key}>
+										{flagIdx > 0 && <Separator className="bg-border" />}
+										<FlagRow def={def} value={flags[def.key]} />
+									</Fragment>
+								))}
+							</div>
+						</div>
+					</div>
+				);
+			})}
+		</div>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent.tsx
index 223a6e3e72..47cdf8f2dc 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent.tsx
@@ -3,8 +3,9 @@
 import { Check, Copy, Info } from "lucide-react";
 import { useTranslations } from "next-intl";
 import { useCallback, useRef, useState } from "react";
-import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
+import { Skeleton } from "@/components/ui/skeleton";
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip";
 import { useApiKey } from "@/hooks/use-api-key";
 import { copyToClipboard as copyToClipboardUtil } from "@/lib/utils";
@@ -27,18 +28,20 @@ export function ApiKeyContent() {
 
 	return (
 		<div className="space-y-6 min-w-0 overflow-hidden">
-			<Alert className="border-border/60 bg-muted/30 text-muted-foreground">
-				<Info className="h-4 w-4 text-muted-foreground" />
-				<AlertTitle className="text-muted-foreground">{t("api_key_warning_title")}</AlertTitle>
-				<AlertDescription className="text-muted-foreground/60">
-					{t("api_key_warning_description")}
-				</AlertDescription>
+			<Alert>
+				<Info />
+				<AlertDescription>{t("api_key_warning_description")}</AlertDescription>
 			</Alert>
 
-			<div className="rounded-lg border border-border/60 bg-card p-6 min-w-0 overflow-hidden">
+			<div className="min-w-0 overflow-hidden">
 				<h3 className="mb-4 text-sm font-semibold tracking-tight">{t("your_api_key")}</h3>
 				{isLoading ? (
-					<div className="h-12 w-full animate-pulse rounded-md border border-border/60 bg-muted/30" />
+					<div className="flex items-center gap-2 rounded-md border border-border/60 bg-muted/30 px-2.5 py-1.5">
+						<div className="min-w-0 flex-1 overflow-hidden">
+							<Skeleton className="h-3 w-full bg-accent" />
+						</div>
+						<div className="h-6 w-6 shrink-0" />
+					</div>
 				) : apiKey ? (
 					<div className="flex items-center gap-2 rounded-md border border-border/60 bg-muted/30 px-2.5 py-1.5">
 						<div className="min-w-0 flex-1 overflow-x-auto scrollbar-hide">
@@ -53,7 +56,7 @@ export function ApiKeyContent() {
 										variant="ghost"
 										size="icon"
 										onClick={copyToClipboard}
-										className="h-6 w-6 shrink-0 text-muted-foreground hover:text-foreground"
+										className="h-6 w-6 shrink-0 text-muted-foreground hover:text-accent-foreground"
 									>
 										{copied ? (
 											<Check className="h-3 w-3 text-green-500" />
@@ -71,7 +74,7 @@ export function ApiKeyContent() {
 				)}
 			</div>
 
-			<div className="rounded-lg border border-border/60 bg-card p-6 min-w-0 overflow-hidden">
+			<div className="min-w-0 overflow-hidden">
 				<h3 className="mb-2 text-sm font-semibold tracking-tight">{t("usage_title")}</h3>
 				<p className="mb-4 text-[11px] text-muted-foreground/60">{t("usage_description")}</p>
 				<div className="flex items-center gap-2 rounded-md border border-border/60 bg-muted/30 px-2.5 py-1.5">
@@ -87,7 +90,7 @@ export function ApiKeyContent() {
 									variant="ghost"
 									size="icon"
 									onClick={copyUsageToClipboard}
-									className="h-6 w-6 shrink-0 text-muted-foreground hover:text-foreground"
+									className="h-6 w-6 shrink-0 text-muted-foreground hover:text-accent-foreground"
 								>
 									{copiedUsage ? (
 										<Check className="h-3 w-3 text-green-500" />
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx
index 239832b2d0..56044de5b0 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx
@@ -1,11 +1,14 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import { AlertTriangle, Copy, Globe, Sparkles } from "lucide-react";
+import { AlertTriangle, Copy, Library } from "lucide-react";
 import { useCallback, useState } from "react";
 import { copyPromptMutationAtom } from "@/atoms/prompts/prompts-mutation.atoms";
 import { publicPromptsAtom } from "@/atoms/prompts/prompts-query.atoms";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
+import { Card, CardContent } from "@/components/ui/card";
+import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
 
 export function CommunityPromptsContent() {
@@ -34,33 +37,37 @@ export function CommunityPromptsContent() {
 
 	const list = prompts ?? [];
 
-	if (isLoading) {
-		return (
-			<div className="flex items-center justify-center py-12">
-				<Spinner className="size-6" />
-			</div>
-		);
-	}
-
-	if (isError) {
-		return (
-			<div className="rounded-lg border border-dashed border-destructive/40 p-8 text-center">
-				<AlertTriangle className="mx-auto size-8 text-destructive/60" />
-				<p className="mt-2 text-sm text-destructive">Failed to load community prompts</p>
-				<p className="text-xs text-muted-foreground">Please try refreshing the page.</p>
-			</div>
-		);
-	}
-
 	return (
 		<div className="space-y-6 min-w-0 overflow-hidden">
 			<p className="text-sm text-muted-foreground">
 				Prompts shared by other users. Add any to your collection with one click.
 			</p>
 
-			{list.length === 0 && (
+			{isLoading && (
+				<div className="space-y-2">
+					{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
+						<Card key={key} className="border-accent bg-accent/20">
+							<CardContent className="p-4 flex flex-col gap-3 min-h-24">
+								<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+								<Skeleton className="h-3 w-full bg-accent" />
+								<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
+							</CardContent>
+						</Card>
+					))}
+				</div>
+			)}
+
+			{isError && (
+				<Alert variant="destructive">
+					<AlertTriangle />
+					<AlertTitle>Failed to load community prompts</AlertTitle>
+					<AlertDescription>Please try refreshing the page.</AlertDescription>
+				</Alert>
+			)}
+
+			{!isLoading && !isError && list.length === 0 && (
 				<div className="rounded-lg border border-dashed border-border/60 p-8 text-center">
-					<Globe className="mx-auto size-8 text-muted-foreground" />
+					<Library className="mx-auto size-8 text-muted-foreground" />
 					<p className="mt-2 text-sm text-muted-foreground">No community prompts yet</p>
 					<p className="text-xs text-muted-foreground/60">
 						Share your own prompts from the My Prompts tab
@@ -68,58 +75,58 @@ export function CommunityPromptsContent() {
 				</div>
 			)}
 
-			{list.length > 0 && (
+			{!isLoading && !isError && list.length > 0 && (
 				<div className="space-y-2">
 					{list.map((prompt) => (
-						<div
+						<Card
 							key={prompt.id}
-							className="group flex items-start gap-3 rounded-lg border border-border/60 bg-card p-4"
+							className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full"
 						>
-							<div className="mt-0.5 shrink-0 text-muted-foreground">
-								<Sparkles className="size-4" />
-							</div>
-							<div className="flex-1 min-w-0">
-								<div className="flex items-center gap-2">
-									<span className="text-sm font-medium">{prompt.name}</span>
-									<span className="rounded-full border px-2 py-0.5 text-[10px] text-muted-foreground">
-										{prompt.mode}
-									</span>
-									{prompt.author_name && (
-										<span className="text-[11px] text-muted-foreground/60">
-											by {prompt.author_name}
+							<CardContent className="p-4 flex items-start gap-3 h-full">
+								<div className="flex-1 min-w-0">
+									<div className="flex items-center gap-2">
+										<span className="text-sm font-medium">{prompt.name}</span>
+										<span className="rounded-md border-0 bg-muted px-1.5 py-0.5 text-[10px] font-medium text-muted-foreground">
+											{prompt.mode}
 										</span>
+										{prompt.author_name && (
+											<span className="text-[11px] text-muted-foreground/60">
+												by {prompt.author_name}
+											</span>
+										)}
+									</div>
+									<p
+										className={`mt-1 text-xs text-muted-foreground ${expandedId === prompt.id ? "whitespace-pre-wrap" : "line-clamp-2"}`}
+									>
+										{prompt.prompt}
+									</p>
+									{prompt.prompt.length > 100 && (
+										<Button
+											type="button"
+											variant="link"
+											onClick={() => setExpandedId(expandedId === prompt.id ? null : prompt.id)}
+											className="mt-1 h-auto cursor-pointer px-0 py-0 text-[11px] text-primary"
+										>
+											{expandedId === prompt.id ? "See less" : "See more"}
+										</Button>
 									)}
 								</div>
-								<p
-									className={`mt-1 text-xs text-muted-foreground ${expandedId === prompt.id ? "whitespace-pre-wrap" : "line-clamp-2"}`}
+								<Button
+									variant="ghost"
+									size="sm"
+									className="h-7 shrink-0 gap-1.5 rounded-lg px-2 text-muted-foreground hover:text-accent-foreground"
+									disabled={copyingIds.has(prompt.id)}
+									onClick={() => handleCopy(prompt.id)}
 								>
-									{prompt.prompt}
-								</p>
-								{prompt.prompt.length > 100 && (
-									<button
-										type="button"
-										onClick={() => setExpandedId(expandedId === prompt.id ? null : prompt.id)}
-										className="mt-1 text-[11px] text-primary hover:underline cursor-pointer"
-									>
-										{expandedId === prompt.id ? "See less" : "See more"}
-									</button>
-								)}
-							</div>
-							<Button
-								variant="outline"
-								size="sm"
-								className="shrink-0 gap-1.5"
-								disabled={copyingIds.has(prompt.id)}
-								onClick={() => handleCopy(prompt.id)}
-							>
-								{copyingIds.has(prompt.id) ? (
-									<Spinner className="size-3" />
-								) : (
-									<Copy className="size-3" />
-								)}
-								Add to mine
-							</Button>
-						</div>
+									{copyingIds.has(prompt.id) ? (
+										<Spinner className="size-3" />
+									) : (
+										<Copy className="size-3" />
+									)}
+									Add to mine
+								</Button>
+							</CardContent>
+						</Card>
 					))}
 				</div>
 			)}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
index c3f457f964..97debca9df 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
@@ -1,10 +1,7 @@
 "use client";
 
-import { BrainCog, Rocket, Zap } from "lucide-react";
 import { useEffect, useState } from "react";
 import { toast } from "sonner";
-import { DEFAULT_SHORTCUTS, ShortcutRecorder } from "@/components/desktop/shortcut-recorder";
-import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
 import { Label } from "@/components/ui/label";
 import {
 	Select,
@@ -13,7 +10,8 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { Spinner } from "@/components/ui/spinner";
+import { Separator } from "@/components/ui/separator";
+import { Skeleton } from "@/components/ui/skeleton";
 import { Switch } from "@/components/ui/switch";
 import type { SearchSpace } from "@/contracts/types/search-space.types";
 import { useElectronAPI } from "@/hooks/use-platform";
@@ -22,42 +20,44 @@ import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service";
 export function DesktopContent() {
 	const api = useElectronAPI();
 	const [loading, setLoading] = useState(true);
-	const [enabled, setEnabled] = useState(true);
-
-	const [shortcuts, setShortcuts] = useState(DEFAULT_SHORTCUTS);
-	const [shortcutsLoaded, setShortcutsLoaded] = useState(false);
 
 	const [searchSpaces, setSearchSpaces] = useState<SearchSpace[]>([]);
 	const [activeSpaceId, setActiveSpaceId] = useState<string | null>(null);
 
+	const [autoLaunchEnabled, setAutoLaunchEnabled] = useState(false);
+	const [autoLaunchHidden, setAutoLaunchHidden] = useState(true);
+	const [autoLaunchSupported, setAutoLaunchSupported] = useState(false);
+
 	useEffect(() => {
 		if (!api) {
 			setLoading(false);
-			setShortcutsLoaded(true);
 			return;
 		}
 
 		let mounted = true;
+		const hasAutoLaunchApi =
+			typeof api.getAutoLaunch === "function" && typeof api.setAutoLaunch === "function";
+		setAutoLaunchSupported(hasAutoLaunchApi);
 
 		Promise.all([
-			api.getAutocompleteEnabled(),
-			api.getShortcuts?.() ?? Promise.resolve(null),
 			api.getActiveSearchSpace?.() ?? Promise.resolve(null),
 			searchSpacesApiService.getSearchSpaces(),
+			hasAutoLaunchApi ? api.getAutoLaunch() : Promise.resolve(null),
 		])
-			.then(([autoEnabled, config, spaceId, spaces]) => {
+			.then(([spaceId, spaces, autoLaunch]) => {
 				if (!mounted) return;
-				setEnabled(autoEnabled);
-				if (config) setShortcuts(config);
 				setActiveSpaceId(spaceId);
 				if (spaces) setSearchSpaces(spaces);
+				if (autoLaunch) {
+					setAutoLaunchEnabled(autoLaunch.enabled);
+					setAutoLaunchHidden(autoLaunch.openAsHidden);
+					setAutoLaunchSupported(autoLaunch.supported);
+				}
 				setLoading(false);
-				setShortcutsLoaded(true);
 			})
 			.catch(() => {
 				if (!mounted) return;
 				setLoading(false);
-				setShortcutsLoaded(true);
 			});
 
 		return () => {
@@ -69,7 +69,7 @@ export function DesktopContent() {
 		return (
 			<div className="flex flex-col items-center justify-center py-12 text-center">
 				<p className="text-sm text-muted-foreground">
-					Desktop settings are only available in the SurfSense desktop app.
+					App preferences are only available in the SurfSense desktop app.
 				</p>
 			</div>
 		);
@@ -77,33 +77,63 @@ export function DesktopContent() {
 
 	if (loading) {
 		return (
-			<div className="flex items-center justify-center py-12">
-				<Spinner size="md" className="text-muted-foreground" />
+			<div className="flex flex-col gap-4 md:gap-6">
+				<section>
+					<div className="flex flex-col gap-2 pb-2 md:pb-3">
+						<Skeleton className="h-6 w-48 bg-accent" />
+						<Skeleton className="h-4 w-full max-w-2xl bg-accent" />
+					</div>
+					<Skeleton className="h-10 w-full bg-accent" />
+				</section>
+
+				<Separator className="bg-border" />
+
+				<section>
+					<div className="flex flex-col gap-2 pb-2 md:pb-3">
+						<Skeleton className="h-6 w-44 bg-accent" />
+						<Skeleton className="h-4 w-full max-w-3xl bg-accent" />
+					</div>
+					<div className="flex flex-col gap-3">
+						<Skeleton className="h-20 w-full bg-accent" />
+						<Skeleton className="h-20 w-full bg-accent" />
+					</div>
+				</section>
 			</div>
 		);
 	}
 
-	const handleToggle = async (checked: boolean) => {
-		setEnabled(checked);
-		await api.setAutocompleteEnabled(checked);
-	};
-
-	const updateShortcut = (
-		key: "generalAssist" | "quickAsk" | "autocomplete",
-		accelerator: string
-	) => {
-		setShortcuts((prev) => {
-			const updated = { ...prev, [key]: accelerator };
-			api.setShortcuts?.({ [key]: accelerator }).catch(() => {
-				toast.error("Failed to update shortcut");
-			});
-			return updated;
-		});
-		toast.success("Shortcut updated");
+	const handleAutoLaunchToggle = async (checked: boolean) => {
+		if (!autoLaunchSupported || !api.setAutoLaunch) {
+			toast.error("Please update the desktop app to configure launch on startup");
+			return;
+		}
+		setAutoLaunchEnabled(checked);
+		try {
+			const next = await api.setAutoLaunch(checked, autoLaunchHidden);
+			if (next) {
+				setAutoLaunchEnabled(next.enabled);
+				setAutoLaunchHidden(next.openAsHidden);
+				setAutoLaunchSupported(next.supported);
+			}
+			toast.success(checked ? "SurfSense will launch on startup" : "Launch on startup disabled");
+		} catch {
+			setAutoLaunchEnabled(!checked);
+			toast.error("Failed to update launch on startup");
+		}
 	};
 
-	const resetShortcut = (key: "generalAssist" | "quickAsk" | "autocomplete") => {
-		updateShortcut(key, DEFAULT_SHORTCUTS[key]);
+	const handleAutoLaunchHiddenToggle = async (checked: boolean) => {
+		if (!autoLaunchSupported || !api.setAutoLaunch) {
+			toast.error("Please update the desktop app to configure startup behavior");
+			return;
+		}
+		setAutoLaunchHidden(checked);
+		try {
+			await api.setAutoLaunch(autoLaunchEnabled, checked);
+		} catch {
+			setAutoLaunchHidden(!checked);
+			toast.error("Failed to update startup behavior");
+		}
 	};
 
 	const handleSearchSpaceChange = (value: string) => {
@@ -113,17 +143,16 @@ export function DesktopContent() {
 	};
 
 	return (
-		<div className="space-y-4 md:space-y-6">
-			{/* Default Search Space */}
-			<Card>
-				<CardHeader className="px-3 md:px-6 pt-3 md:pt-6 pb-2 md:pb-3">
-					<CardTitle className="text-base md:text-lg">Default Search Space</CardTitle>
-					<CardDescription className="text-xs md:text-sm">
-						Choose which search space General Assist, Quick Assist, and Extreme Assist operate
-						against.
-					</CardDescription>
-				</CardHeader>
-				<CardContent className="px-3 md:px-6 pb-3 md:pb-6">
+		<div className="flex flex-col gap-4 md:gap-6">
+			<section>
+				<div className="pb-2 md:pb-3">
+					<h2 className="text-base md:text-lg font-semibold">Default Search Space</h2>
+					<p className="text-xs md:text-sm text-muted-foreground">
+						Choose which search space General Assist, Screenshot Assist, and Quick Assist use by
+						default.
+					</p>
+				</div>
+				<div>
 					{searchSpaces.length > 0 ? (
 						<Select value={activeSpaceId ?? undefined} onValueChange={handleSearchSpaceChange}>
 							<SelectTrigger className="w-full">
@@ -142,81 +171,61 @@ export function DesktopContent() {
 							No search spaces found. Create one first.
 						</p>
 					)}
-				</CardContent>
-			</Card>
-
-			{/* Keyboard Shortcuts */}
-			<Card>
-				<CardHeader className="px-3 md:px-6 pt-3 md:pt-6 pb-2 md:pb-3">
-					<CardTitle className="text-base md:text-lg">Keyboard Shortcuts</CardTitle>
-					<CardDescription className="text-xs md:text-sm">
-						Customize the global keyboard shortcuts for desktop features.
-					</CardDescription>
-				</CardHeader>
-				<CardContent className="px-3 md:px-6 pb-3 md:pb-6">
-					{shortcutsLoaded ? (
-						<div className="flex flex-col gap-3">
-							<ShortcutRecorder
-								value={shortcuts.generalAssist}
-								onChange={(accel) => updateShortcut("generalAssist", accel)}
-								onReset={() => resetShortcut("generalAssist")}
-								defaultValue={DEFAULT_SHORTCUTS.generalAssist}
-								label="General Assist"
-								description="Launch SurfSense instantly from any application"
-								icon={Rocket}
-							/>
-							<ShortcutRecorder
-								value={shortcuts.quickAsk}
-								onChange={(accel) => updateShortcut("quickAsk", accel)}
-								onReset={() => resetShortcut("quickAsk")}
-								defaultValue={DEFAULT_SHORTCUTS.quickAsk}
-								label="Quick Assist"
-								description="Select text anywhere, then ask AI to explain, rewrite, or act on it"
-								icon={Zap}
-							/>
-							<ShortcutRecorder
-								value={shortcuts.autocomplete}
-								onChange={(accel) => updateShortcut("autocomplete", accel)}
-								onReset={() => resetShortcut("autocomplete")}
-								defaultValue={DEFAULT_SHORTCUTS.autocomplete}
-								label="Extreme Assist"
-								description="AI drafts text using your screen context and knowledge base"
-								icon={BrainCog}
-							/>
-							<p className="text-[11px] text-muted-foreground">
-								Click a shortcut and press a new key combination to change it.
+				</div>
+			</section>
+
+			<Separator className="bg-border" />
+
+			<section>
+				<div className="pb-2 md:pb-3">
+					<h2 className="text-base md:text-lg font-semibold flex items-center gap-2">
+						Launch on Startup
+					</h2>
+					<p className="text-xs md:text-sm text-muted-foreground">
+						Automatically start SurfSense when you sign in to your computer so global shortcuts and
+						folder sync are always available.
+					</p>
+				</div>
+				<div className="flex flex-col gap-3">
+					<div className="flex items-center justify-between rounded-lg bg-accent p-4">
+						<div className="space-y-0.5">
+							<Label htmlFor="auto-launch-toggle" className="text-sm font-medium cursor-pointer">
+								Open SurfSense at login
+							</Label>
+							<p className="text-xs text-muted-foreground">
+								{autoLaunchSupported
+									? "Adds SurfSense to your system's login items."
+									: "Only available in the packaged desktop app."}
 							</p>
 						</div>
-					) : (
-						<div className="flex justify-center py-4">
-							<Spinner size="sm" />
-						</div>
-					)}
-				</CardContent>
-			</Card>
-
-			{/* Extreme Assist Toggle */}
-			<Card>
-				<CardHeader className="px-3 md:px-6 pt-3 md:pt-6 pb-2 md:pb-3">
-					<CardTitle className="text-base md:text-lg">Extreme Assist</CardTitle>
-					<CardDescription className="text-xs md:text-sm">
-						Get inline writing suggestions powered by your knowledge base as you type in any app.
-					</CardDescription>
-				</CardHeader>
-				<CardContent className="px-3 md:px-6 pb-3 md:pb-6">
-					<div className="flex items-center justify-between rounded-lg border p-4">
+						<Switch
+							id="auto-launch-toggle"
+							checked={autoLaunchEnabled}
+							onCheckedChange={handleAutoLaunchToggle}
+							disabled={!autoLaunchSupported}
+						/>
+					</div>
+					<div className="flex items-center justify-between rounded-lg bg-accent p-4">
 						<div className="space-y-0.5">
-							<Label htmlFor="autocomplete-toggle" className="text-sm font-medium cursor-pointer">
-								Enable Extreme Assist
+							<Label
+								htmlFor="auto-launch-hidden-toggle"
+								className="text-sm font-medium cursor-pointer"
+							>
+								Start minimized to tray
 							</Label>
 							<p className="text-xs text-muted-foreground">
-								Show suggestions while typing in other applications.
+								Skip the main window on boot. SurfSense lives in the system tray until you need it.
 							</p>
 						</div>
-						<Switch id="autocomplete-toggle" checked={enabled} onCheckedChange={handleToggle} />
+						<Switch
+							id="auto-launch-hidden-toggle"
+							checked={autoLaunchHidden}
+							onCheckedChange={handleAutoLaunchHiddenToggle}
+							disabled={!autoLaunchSupported || !autoLaunchEnabled}
+						/>
 					</div>
-				</CardContent>
-			</Card>
+				</div>
+			</section>
 		</div>
 	);
 }
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/HotkeysContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/HotkeysContent.tsx
new file mode 100644
index 0000000000..9916f6cdad
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/HotkeysContent.tsx
@@ -0,0 +1,204 @@
+"use client";
+
+import { Crop, Rocket, RotateCcw, Zap } from "lucide-react";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { toast } from "sonner";
+import { DEFAULT_SHORTCUTS, keyEventToAccelerator } from "@/components/desktop/shortcut-recorder";
+import { Button } from "@/components/ui/button";
+import { Separator } from "@/components/ui/separator";
+import { ShortcutKbd } from "@/components/ui/shortcut-kbd";
+import { Spinner } from "@/components/ui/spinner";
+import { useElectronAPI } from "@/hooks/use-platform";
+
+type ShortcutKey = "generalAssist" | "quickAsk" | "screenshotAssist";
+type ShortcutMap = typeof DEFAULT_SHORTCUTS;
+
+const HOTKEY_ROWS: Array<{ key: ShortcutKey; label: string; icon: React.ElementType }> = [
+	{ key: "generalAssist", label: "General Assist", icon: Rocket },
+	{ key: "screenshotAssist", label: "Screenshot Assist", icon: Crop },
+	{ key: "quickAsk", label: "Quick Assist", icon: Zap },
+];
+
+function acceleratorToKeys(accel: string, isMac: boolean): string[] {
+	if (!accel) return [];
+	return accel.split("+").map((part) => {
+		if (part === "CommandOrControl") {
+			return isMac ? "⌘" : "Ctrl";
+		}
+		if (part === "Alt") {
+			return isMac ? "⌥" : "Alt";
+		}
+		if (part === "Shift") {
+			return isMac ? "⇧" : "Shift";
+		}
+		if (part === "Space") return "Space";
+		return part.length === 1 ? part.toUpperCase() : part;
+	});
+}
+
+function HotkeyRow({
+	label,
+	value,
+	defaultValue,
+	icon: Icon,
+	isMac,
+	onChange,
+	onReset,
+}: {
+	label: string;
+	value: string;
+	defaultValue: string;
+	icon: React.ElementType;
+	isMac: boolean;
+	onChange: (accelerator: string) => void;
+	onReset: () => void;
+}) {
+	const [recording, setRecording] = useState(false);
+	const inputRef = useRef<HTMLButtonElement>(null);
+	const isDefault = value === defaultValue;
+	const displayKeys = useMemo(() => acceleratorToKeys(value, isMac), [value, isMac]);
+
+	const handleKeyDown = useCallback(
+		(e: React.KeyboardEvent) => {
+			if (!recording) return;
+			e.preventDefault();
+			e.stopPropagation();
+
+			if (e.key === "Escape") {
+				setRecording(false);
+				return;
+			}
+
+			const accel = keyEventToAccelerator(e);
+			if (accel) {
+				onChange(accel);
+				setRecording(false);
+			}
+		},
+		[onChange, recording]
+	);
+
+	return (
+		<div className="flex items-center justify-between gap-2.5 py-3">
+			<div className="flex items-center gap-2.5 min-w-0">
+				<div className="flex size-7 shrink-0 items-center justify-center rounded-md bg-primary/10 text-primary">
+					<Icon className="size-3.5" />
+				</div>
+				<p className="text-sm text-foreground truncate">{label}</p>
+			</div>
+			<div className="flex shrink-0 items-center gap-1">
+				{!isDefault && (
+					<Button
+						variant="ghost"
+						size="icon"
+						className="size-7 text-muted-foreground hover:text-accent-foreground"
+						onClick={onReset}
+						title="Reset to default"
+					>
+						<RotateCcw className="size-3" />
+					</Button>
+				)}
+				<Button
+					ref={inputRef}
+					type="button"
+					variant="ghost"
+					title={recording ? "Press shortcut keys" : "Click to edit shortcut"}
+					onClick={() => setRecording(true)}
+					onKeyDown={handleKeyDown}
+					onBlur={() => setRecording(false)}
+					className={
+						recording
+							? "h-7 border border-transparent bg-primary/5 px-0 outline-none ring-0 focus:outline-none focus-visible:outline-none focus-visible:ring-0"
+							: "h-7 cursor-pointer border border-transparent bg-transparent px-0 outline-none ring-0 transition-colors hover:bg-accent hover:text-accent-foreground focus:outline-none focus-visible:outline-none focus-visible:ring-0"
+					}
+				>
+					{recording ? (
+						<span className="px-2 text-[9px] text-primary whitespace-nowrap">Press hotkeys</span>
+					) : (
+						<ShortcutKbd keys={displayKeys} className="ml-0 px-1.5 text-foreground/85" />
+					)}
+				</Button>
+			</div>
+		</div>
+	);
+}
+
+export function HotkeysContent() {
+	const api = useElectronAPI();
+	const [shortcuts, setShortcuts] = useState(DEFAULT_SHORTCUTS);
+	const [shortcutsLoaded, setShortcutsLoaded] = useState(false);
+	const isMac = api?.versions?.platform === "darwin";
+
+	useEffect(() => {
+		if (!api) {
+			setShortcutsLoaded(true);
+			return;
+		}
+
+		let mounted = true;
+		(api.getShortcuts?.() ?? Promise.resolve(null))
+			.then((config: ShortcutMap | null) => {
+				if (!mounted) return;
+				if (config) setShortcuts(config);
+				setShortcutsLoaded(true);
+			})
+			.catch(() => {
+				if (!mounted) return;
+				setShortcutsLoaded(true);
+			});
+
+		return () => {
+			mounted = false;
+		};
+	}, [api]);
+
+	if (!api) {
+		return (
+			<div className="flex flex-col items-center justify-center py-12 text-center">
+				<p className="text-sm text-muted-foreground">
+					Hotkeys are only available in the SurfSense desktop app.
+				</p>
+			</div>
+		);
+	}
+
+	const updateShortcut = (key: ShortcutKey, accelerator: string) => {
+		setShortcuts((prev) => {
+			const updated = { ...prev, [key]: accelerator };
+			api.setShortcuts?.({ [key]: accelerator }).catch(() => {
+				toast.error("Failed to update shortcut");
+			});
+			return updated;
+		});
+		toast.success("Shortcut updated");
+	};
+
+	const resetShortcut = (key: ShortcutKey) => {
+		updateShortcut(key, DEFAULT_SHORTCUTS[key]);
+	};
+
+	return shortcutsLoaded ? (
+		<div className="flex flex-col gap-3">
+			<div>
+				{HOTKEY_ROWS.map((row, index) => (
+					<div key={row.key}>
+						<HotkeyRow
+							label={row.label}
+							value={shortcuts[row.key]}
+							defaultValue={DEFAULT_SHORTCUTS[row.key]}
+							icon={row.icon}
+							isMac={isMac}
+							onChange={(accel) => updateShortcut(row.key, accel)}
+							onReset={() => resetShortcut(row.key)}
+						/>
+						{index < HOTKEY_ROWS.length - 1 ? <Separator className="bg-border" /> : null}
+					</div>
+				))}
+			</div>
+		</div>
+	) : (
+		<div className="flex justify-center py-4">
+			<Spinner size="sm" />
+		</div>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/MemoryContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/MemoryContent.tsx
new file mode 100644
index 0000000000..3542f0925e
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/MemoryContent.tsx
@@ -0,0 +1,293 @@
+"use client";
+
+import { useAtomValue } from "jotai";
+import { ArrowUp, ChevronDown, ClipboardCopy, Download, Info, Pencil } from "lucide-react";
+import { useCallback, useEffect, useRef, useState } from "react";
+import { toast } from "sonner";
+import { z } from "zod";
+import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
+import { PlateEditor } from "@/components/editor/plate-editor";
+import { Alert, AlertDescription } from "@/components/ui/alert";
+import { Button } from "@/components/ui/button";
+import {
+	DropdownMenu,
+	DropdownMenuContent,
+	DropdownMenuItem,
+	DropdownMenuTrigger,
+} from "@/components/ui/dropdown-menu";
+import { Spinner } from "@/components/ui/spinner";
+
+import { baseApiService } from "@/lib/apis/base-api.service";
+
+const MEMORY_HARD_LIMIT = 25_000;
+
+const MemoryReadSchema = z.object({
+	memory_md: z.string(),
+});
+
+export function MemoryContent() {
+	const activeSearchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
+	const [memory, setMemory] = useState("");
+	const [loading, setLoading] = useState(true);
+	const [saving, setSaving] = useState(false);
+	const [editQuery, setEditQuery] = useState("");
+	const [editing, setEditing] = useState(false);
+	const [showInput, setShowInput] = useState(false);
+	const textareaRef = useRef<HTMLInputElement>(null);
+	const inputContainerRef = useRef<HTMLDivElement>(null);
+
+	const fetchMemory = useCallback(async () => {
+		try {
+			setLoading(true);
+			const data = await baseApiService.get("/api/v1/users/me/memory", MemoryReadSchema);
+			setMemory(data.memory_md);
+		} catch {
+			toast.error("Failed to load memory");
+		} finally {
+			setLoading(false);
+		}
+	}, []);
+
+	useEffect(() => {
+		fetchMemory();
+	}, [fetchMemory]);
+
+	useEffect(() => {
+		if (!showInput) return;
+
+		const handlePointerDownOutside = (event: MouseEvent | TouchEvent) => {
+			const target = event.target;
+			if (!(target instanceof Node)) return;
+			if (inputContainerRef.current?.contains(target)) return;
+
+			setShowInput(false);
+		};
+
+		document.addEventListener("mousedown", handlePointerDownOutside);
+		document.addEventListener("touchstart", handlePointerDownOutside, { passive: true });
+
+		return () => {
+			document.removeEventListener("mousedown", handlePointerDownOutside);
+			document.removeEventListener("touchstart", handlePointerDownOutside);
+		};
+	}, [showInput]);
+
+	const handleClear = async () => {
+		try {
+			setSaving(true);
+			const data = await baseApiService.put("/api/v1/users/me/memory", MemoryReadSchema, {
+				body: { memory_md: "" },
+			});
+			setMemory(data.memory_md);
+			toast.success("Memory cleared");
+		} catch {
+			toast.error("Failed to clear memory");
+		} finally {
+			setSaving(false);
+		}
+	};
+
+	const handleEdit = async () => {
+		const query = editQuery.trim();
+		if (!query) return;
+
+		try {
+			setEditing(true);
+			const data = await baseApiService.post("/api/v1/users/me/memory/edit", MemoryReadSchema, {
+				body: { query, search_space_id: Number(activeSearchSpaceId) },
+			});
+			setMemory(data.memory_md);
+			setEditQuery("");
+			setShowInput(false);
+			toast.success("Memory updated");
+		} catch {
+			toast.error("Failed to edit memory");
+		} finally {
+			setEditing(false);
+		}
+	};
+
+	const openInput = () => {
+		setShowInput(true);
+		requestAnimationFrame(() => textareaRef.current?.focus());
+	};
+
+	const handleDownload = () => {
+		if (!memory) return;
+		try {
+			const blob = new Blob([memory], { type: "text/markdown;charset=utf-8" });
+			const url = URL.createObjectURL(blob);
+			const a = document.createElement("a");
+			a.href = url;
+			a.download = "personal-memory.md";
+			document.body.appendChild(a);
+			a.click();
+			document.body.removeChild(a);
+			URL.revokeObjectURL(url);
+		} catch {
+			toast.error("Failed to download memory");
+		}
+	};
+
+	const handleCopyMarkdown = async () => {
+		if (!memory) return;
+		try {
+			await navigator.clipboard.writeText(memory);
+			toast.success("Copied to clipboard");
+		} catch {
+			toast.error("Failed to copy memory");
+		}
+	};
+
+	const handleKeyDown = (e: React.KeyboardEvent<HTMLInputElement>) => {
+		if (e.key === "Enter" && !e.shiftKey) {
+			e.preventDefault();
+			handleEdit();
+		}
+	};
+
+	const displayMemory = memory.replace(/\(\d{4}-\d{2}-\d{2}\)\s*\[(fact|pref|instr)\]\s*/g, "");
+	const charCount = memory.length;
+
+	const getCounterColor = () => {
+		if (charCount > MEMORY_HARD_LIMIT) return "text-red-500";
+		if (charCount > 15_000) return "text-orange-500";
+		if (charCount > 10_000) return "text-yellow-500";
+		return "text-muted-foreground";
+	};
+
+	if (loading) {
+		return (
+			<div className="flex items-center justify-center py-12">
+				<Spinner size="md" className="text-muted-foreground" />
+			</div>
+		);
+	}
+
+	if (!memory) {
+		return (
+			<div className="flex flex-col items-center justify-center py-16 text-center">
+				<h3 className="text-base font-medium text-foreground">What does SurfSense remember?</h3>
+				<p className="mt-2 max-w-sm text-sm text-muted-foreground">
+					Nothing yet. SurfSense picks up on your preferences and context as you chat.
+				</p>
+			</div>
+		);
+	}
+
+	return (
+		<div className="space-y-4">
+			<Alert>
+				<Info />
+				<AlertDescription>
+					<p>
+						SurfSense uses this personal memory to personalize your responses across all
+						conversations.
+					</p>
+				</AlertDescription>
+			</Alert>
+
+			<div className="relative h-[380px] rounded-lg border bg-background">
+				<div className="h-full overflow-y-auto scrollbar-thin">
+					<PlateEditor
+						markdown={displayMemory}
+						readOnly
+						preset="readonly"
+						variant="default"
+						editorVariant="none"
+						className="px-5 py-4 text-sm min-h-full"
+					/>
+				</div>
+
+				{showInput ? (
+					<div className="absolute bottom-3 inset-x-3 z-10">
+						<div
+							ref={inputContainerRef}
+							className="relative flex h-[54px] items-center gap-2 rounded-[9999px] border bg-muted/60 backdrop-blur-sm pl-4 pr-1 shadow-sm"
+						>
+							<input
+								ref={textareaRef}
+								type="text"
+								value={editQuery}
+								onChange={(e) => setEditQuery(e.target.value)}
+								onKeyDown={handleKeyDown}
+								placeholder="Tell SurfSense what to remember or forget"
+								disabled={editing}
+								className="flex-1 bg-transparent text-sm outline-none placeholder:text-muted-foreground/70"
+							/>
+							<Button
+								type="button"
+								size="icon"
+								variant="ghost"
+								onClick={handleEdit}
+								disabled={editing || !editQuery.trim()}
+								className={`h-11 w-11 shrink-0 rounded-full ${
+									editing
+										? ""
+										: "bg-muted-foreground/15 hover:bg-accent hover:text-accent-foreground"
+								}`}
+							>
+								{editing ? (
+									<Spinner size="sm" />
+								) : (
+									<ArrowUp className="!h-5 !w-5 text-foreground" strokeWidth={2.25} />
+								)}
+							</Button>
+						</div>
+					</div>
+				) : (
+					<Button
+						type="button"
+						size="icon"
+						variant="secondary"
+						onClick={openInput}
+						className="absolute bottom-3 right-3 z-10 h-[54px] w-[54px] rounded-full border bg-muted/60 backdrop-blur-sm shadow-sm"
+					>
+						<Pencil className="!h-5 !w-5" />
+					</Button>
+				)}
+			</div>
+
+			<div className="flex items-center justify-between gap-2">
+				<span className={`text-xs shrink-0 ${getCounterColor()}`}>
+					{charCount.toLocaleString()} / {MEMORY_HARD_LIMIT.toLocaleString()}
+					<span className="hidden sm:inline"> characters</span>
+					<span className="sm:hidden"> chars</span>
+					{charCount > 15_000 && charCount <= MEMORY_HARD_LIMIT && " - Approaching limit"}
+					{charCount > MEMORY_HARD_LIMIT && " - Exceeds limit"}
+				</span>
+				<div className="flex items-center gap-1.5 sm:gap-2">
+					<Button
+						type="button"
+						variant="destructive"
+						size="sm"
+						className="text-xs sm:text-sm"
+						onClick={handleClear}
+						disabled={saving || editing || !memory}
+					>
+						<span className="hidden sm:inline">Reset Memory</span>
+						<span className="sm:hidden">Reset</span>
+					</Button>
+					<DropdownMenu>
+						<DropdownMenuTrigger asChild>
+							<Button type="button" variant="secondary" size="sm" disabled={!memory}>
+								Export
+								<ChevronDown className="h-3 w-3 opacity-60" />
+							</Button>
+						</DropdownMenuTrigger>
+						<DropdownMenuContent align="end">
+							<DropdownMenuItem onClick={handleCopyMarkdown}>
+								<ClipboardCopy className="h-4 w-4 mr-2" />
+								Copy as Markdown
+							</DropdownMenuItem>
+							<DropdownMenuItem onClick={handleDownload}>
+								<Download className="h-4 w-4 mr-2" />
+								Download as Markdown
+							</DropdownMenuItem>
+						</DropdownMenuContent>
+					</DropdownMenu>
+				</div>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ProfileContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ProfileContent.tsx
index c1e81283b6..89bc362eb0 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ProfileContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/ProfileContent.tsx
@@ -11,8 +11,17 @@ import { Button } from "@/components/ui/button";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import { Spinner } from "@/components/ui/spinner";
+import { getUserAvatarColor, getUserInitials } from "@/lib/user-avatar";
 
-function AvatarDisplay({ url, fallback }: { url?: string; fallback: string }) {
+function AvatarDisplay({
+	url,
+	fallback,
+	bgColor,
+}: {
+	url?: string;
+	fallback: string;
+	bgColor: string;
+}) {
 	const [errorUrl, setErrorUrl] = useState<string>();
 	const hasError = errorUrl === url;
 
@@ -23,15 +32,19 @@ function AvatarDisplay({ url, fallback }: { url?: string; fallback: string }) {
 				alt="Avatar"
 				width={64}
 				height={64}
-				className="h-16 w-16 rounded-xl object-cover"
+				className="h-16 w-16 rounded-full object-cover select-none"
 				onError={() => setErrorUrl(url)}
+				referrerPolicy="no-referrer"
 				unoptimized
 			/>
 		);
 	}
 
 	return (
-		<div className="flex h-16 w-16 items-center justify-center rounded-xl bg-muted text-xl font-semibold text-muted-foreground">
+		<div
+			className="flex h-16 w-16 shrink-0 items-center justify-center rounded-full text-xl font-semibold text-white select-none"
+			style={{ backgroundColor: bgColor }}
+		>
 			{fallback}
 		</div>
 	);
@@ -50,11 +63,6 @@ export function ProfileContent() {
 		}
 	}, [user]);
 
-	const getInitials = (email: string) => {
-		const name = email.split("@")[0];
-		return name.slice(0, 2).toUpperCase();
-	};
-
 	const handleSubmit = async (e: React.FormEvent) => {
 		e.preventDefault();
 
@@ -69,6 +77,7 @@ export function ProfileContent() {
 	};
 
 	const hasChanges = displayName !== (user?.display_name || "");
+	const avatarBgColor = getUserAvatarColor(user?.email || "");
 
 	return (
 		<div>
@@ -78,13 +87,13 @@ export function ProfileContent() {
 				</div>
 			) : (
 				<form onSubmit={handleSubmit} className="space-y-6">
-					<div className="rounded-lg border bg-card p-6">
+					<div className="rounded-lg bg-main-panel">
 						<div className="flex flex-col gap-6">
 							<div className="space-y-2">
-								<Label>{t("profile_avatar")}</Label>
 								<AvatarDisplay
 									url={user?.avatar_url || undefined}
-									fallback={getInitials(user?.email || "")}
+									fallback={getUserInitials(user?.email || "")}
+									bgColor={avatarBgColor}
 								/>
 							</div>
 
@@ -94,6 +103,7 @@ export function ProfileContent() {
 									id="display-name"
 									type="text"
 									autoComplete="name"
+									maxLength={100}
 									placeholder={user?.email?.split("@")[0]}
 									value={displayName}
 									onChange={(e) => setDisplayName(e.target.value)}
@@ -113,10 +123,10 @@ export function ProfileContent() {
 							type="submit"
 							variant="outline"
 							disabled={isPending || !hasChanges}
-							className="gap-2 bg-white text-black hover:bg-neutral-100 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+							className="relative gap-2 bg-white text-black hover:bg-accent hover:text-accent-foreground dark:bg-white dark:text-black"
 						>
-							{isPending && <Spinner size="sm" className="mr-2" />}
-							{t("profile_save")}
+							<span className={isPending ? "opacity-0" : ""}>{t("profile_save")}</span>
+							{isPending && <Spinner size="sm" className="absolute" />}
 						</Button>
 					</div>
 				</form>
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx
index 1e7087afc4..e9415a1f24 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import { AlertTriangle, Globe, Lock, PenLine, Sparkles, Trash2 } from "lucide-react";
+import { AlertTriangle, Globe, Lock, MoreHorizontal, Pencil, Sparkles, Trash2 } from "lucide-react";
 import { useCallback, useState } from "react";
 import { toast } from "sonner";
 import {
@@ -10,6 +10,7 @@ import {
 	updatePromptMutationAtom,
 } from "@/atoms/prompts/prompts-mutation.atoms";
 import { promptsAtom } from "@/atoms/prompts/prompts-query.atoms";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import {
 	AlertDialog,
 	AlertDialogAction,
@@ -21,9 +22,32 @@ import {
 	AlertDialogTitle,
 } from "@/components/ui/alert-dialog";
 import { Button } from "@/components/ui/button";
+import { Card, CardContent } from "@/components/ui/card";
+import {
+	Dialog,
+	DialogContent,
+	DialogDescription,
+	DialogFooter,
+	DialogHeader,
+	DialogTitle,
+} from "@/components/ui/dialog";
+import {
+	DropdownMenu,
+	DropdownMenuContent,
+	DropdownMenuItem,
+	DropdownMenuTrigger,
+} from "@/components/ui/dropdown-menu";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
+import {
+	Select,
+	SelectContent,
+	SelectItem,
+	SelectTrigger,
+	SelectValue,
+} from "@/components/ui/select";
 import { ShortcutKbd } from "@/components/ui/shortcut-kbd";
+import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
 import { Switch } from "@/components/ui/switch";
 import type { PromptRead } from "@/contracts/types/prompts.types";
@@ -123,24 +147,6 @@ export function PromptsContent() {
 
 	const list = prompts ?? [];
 
-	if (isLoading) {
-		return (
-			<div className="flex items-center justify-center py-12">
-				<Spinner className="size-6" />
-			</div>
-		);
-	}
-
-	if (isError) {
-		return (
-			<div className="rounded-lg border border-dashed border-destructive/40 p-8 text-center">
-				<AlertTriangle className="mx-auto size-8 text-destructive/60" />
-				<p className="mt-2 text-sm text-destructive">Failed to load prompts</p>
-				<p className="text-xs text-muted-foreground">Please try refreshing the page.</p>
-			</div>
-		);
-	}
-
 	return (
 		<div className="space-y-6 min-w-0 overflow-hidden">
 			<div className="flex items-center justify-between">
@@ -148,97 +154,150 @@ export function PromptsContent() {
 					Create prompt templates triggered with <ShortcutKbd keys={["/"]} className="ml-0" /> in
 					the chat composer.
 				</p>
-				{!showForm && (
-					<Button
-						size="sm"
-						onClick={() => {
-							setShowForm(true);
-							setEditingId(null);
-							setFormData(EMPTY_FORM);
-						}}
-						className="shrink-0 gap-1.5"
-					>
-						New
-					</Button>
-				)}
+				<Button
+					size="sm"
+					onClick={() => {
+						setShowForm(true);
+						setEditingId(null);
+						setFormData(EMPTY_FORM);
+					}}
+					className="shrink-0 gap-1.5"
+				>
+					New
+				</Button>
 			</div>
 
-			{showForm && (
-				<div className="rounded-lg border border-border/60 bg-card p-6 space-y-4">
-					<h3 className="text-sm font-semibold tracking-tight">
-						{editingId !== null ? "Edit prompt" : "New prompt"}
-					</h3>
+			<Dialog
+				open={showForm}
+				onOpenChange={(open) => {
+					setShowForm(open);
+					if (!open) {
+						setFormData(EMPTY_FORM);
+						setEditingId(null);
+					}
+				}}
+			>
+				<DialogContent className="max-w-lg bg-popover text-popover-foreground">
+					<DialogHeader>
+						<DialogTitle>{editingId !== null ? "Edit prompt" : "New prompt"}</DialogTitle>
+						<DialogDescription>
+							Create prompt templates triggered with / in the chat composer.
+						</DialogDescription>
+					</DialogHeader>
 
-					<div className="space-y-2">
-						<Label htmlFor="prompt-name">Name</Label>
-						<Input
-							id="prompt-name"
-							value={formData.name}
-							onChange={(e) => setFormData((p) => ({ ...p, name: e.target.value }))}
-							placeholder="e.g. Fix grammar"
-						/>
-					</div>
+					<div className="space-y-4">
+						<div className="space-y-2">
+							<Label htmlFor="prompt-name">Name</Label>
+							<Input
+								id="prompt-name"
+								value={formData.name}
+								onChange={(e) => setFormData((p) => ({ ...p, name: e.target.value }))}
+								placeholder="e.g. Fix grammar"
+							/>
+						</div>
 
-					<div className="space-y-2">
-						<Label htmlFor="prompt-template">Prompt template</Label>
-						<textarea
-							id="prompt-template"
-							value={formData.prompt}
-							onChange={(e) => setFormData((p) => ({ ...p, prompt: e.target.value }))}
-							placeholder="e.g. Fix the grammar in the following text:\n\n{selection}"
-							rows={4}
-							className="w-full rounded-md border border-input bg-transparent px-3 py-2 text-sm outline-none resize-none focus:ring-1 focus:ring-ring"
-						/>
-						<p className="text-xs text-muted-foreground">
-							Use{" "}
-							<code className="rounded bg-muted px-1 py-0.5 font-mono text-[11px]">
-								{"{selection}"}
-							</code>{" "}
-							to insert the input text. If omitted, the text is appended automatically.
-						</p>
-					</div>
+						<div className="space-y-2">
+							<Label htmlFor="prompt-template">Prompt template</Label>
+							<textarea
+								id="prompt-template"
+								value={formData.prompt}
+								onChange={(e) => setFormData((p) => ({ ...p, prompt: e.target.value }))}
+								placeholder="e.g. Fix the grammar in the following text:\n\n{selection}"
+								rows={4}
+								className="w-full rounded-md border border-input bg-transparent px-3 py-2 text-sm outline-none resize-none focus:ring-1 focus:ring-ring"
+							/>
+							<p className="text-xs text-muted-foreground">
+								Use{" "}
+								<code className="rounded bg-muted px-1 py-0.5 font-mono text-[11px]">
+									{"{selection}"}
+								</code>{" "}
+								to insert the input text. If omitted, the text is appended automatically.
+							</p>
+						</div>
 
-					<div className="space-y-2">
-						<Label htmlFor="prompt-mode">Mode</Label>
-						<select
-							id="prompt-mode"
-							value={formData.mode}
-							onChange={(e) =>
-								setFormData((p) => ({ ...p, mode: e.target.value as "transform" | "explore" }))
-							}
-							className="w-full rounded-md border border-input bg-transparent px-3 py-2 text-sm outline-none focus:ring-1 focus:ring-ring"
-						>
-							<option value="transform">Transform — rewrites or modifies your text</option>
-							<option value="explore">Explore — answers a question about your text</option>
-						</select>
-					</div>
+						<div className="space-y-2">
+							<Label htmlFor="prompt-mode">Mode</Label>
+							<Select
+								value={formData.mode}
+								onValueChange={(value) =>
+									setFormData((p) => ({ ...p, mode: value as "transform" | "explore" }))
+								}
+							>
+								<SelectTrigger id="prompt-mode" className="w-full">
+									<SelectValue />
+								</SelectTrigger>
+								<SelectContent>
+									<SelectItem value="transform">
+										Transform — rewrites or modifies your text
+									</SelectItem>
+									<SelectItem value="explore">
+										Explore — answers a question about your text
+									</SelectItem>
+								</SelectContent>
+							</Select>
+						</div>
 
-					<div className="flex items-center gap-2">
-						<Switch
-							id="prompt-public"
-							checked={formData.is_public}
-							onCheckedChange={(checked) => setFormData((p) => ({ ...p, is_public: checked }))}
-						/>
-						<Label htmlFor="prompt-public" className="text-sm font-normal">
-							Share with community
-						</Label>
+						<div className="flex items-center gap-2">
+							<Switch
+								id="prompt-public"
+								checked={formData.is_public}
+								onCheckedChange={(checked) => setFormData((p) => ({ ...p, is_public: checked }))}
+							/>
+							<Label htmlFor="prompt-public" className="text-sm font-normal">
+								Share with community
+							</Label>
+						</div>
 					</div>
 
-					<div className="flex items-center justify-end gap-2 pt-2">
-						<Button variant="ghost" size="sm" onClick={handleCancel}>
+					<DialogFooter>
+						<Button
+							type="button"
+							variant="secondary"
+							size="sm"
+							onClick={handleCancel}
+							disabled={isSaving}
+							className="text-sm h-9"
+						>
 							Cancel
 						</Button>
-						<Button size="sm" onClick={handleSave} disabled={isSaving} className="relative">
+						<Button
+							size="sm"
+							onClick={handleSave}
+							disabled={isSaving}
+							className="relative text-sm h-9 min-w-[96px]"
+						>
 							<span className={isSaving ? "opacity-0" : ""}>
 								{editingId !== null ? "Update" : "Create"}
 							</span>
-							{isSaving && <Spinner className="size-3.5 absolute" />}
+							{isSaving && <Spinner size="sm" className="absolute" />}
 						</Button>
-					</div>
+					</DialogFooter>
+				</DialogContent>
+			</Dialog>
+
+			{isLoading && (
+				<div className="space-y-2">
+					{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
+						<Card key={key} className="border-accent bg-accent/20">
+							<CardContent className="p-4 flex flex-col gap-3 min-h-24">
+								<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+								<Skeleton className="h-3 w-full bg-accent" />
+								<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
+							</CardContent>
+						</Card>
+					))}
 				</div>
 			)}
 
-			{list.length === 0 && !showForm && (
+			{isError && (
+				<Alert variant="destructive">
+					<AlertTriangle />
+					<AlertTitle>Failed to load prompts</AlertTitle>
+					<AlertDescription>Please try refreshing the page.</AlertDescription>
+				</Alert>
+			)}
+
+			{!isLoading && !isError && list.length === 0 && !showForm && (
 				<div className="rounded-lg border border-dashed border-border/60 p-8 text-center">
 					<Sparkles className="mx-auto size-8 text-muted-foreground/40" />
 					<p className="mt-2 text-sm text-muted-foreground">No prompts yet</p>
@@ -248,24 +307,21 @@ export function PromptsContent() {
 				</div>
 			)}
 
-			{list.length > 0 && (
+			{!isLoading && !isError && list.length > 0 && (
 				<div className="space-y-2">
 					{list.map((prompt) => (
 						<div
 							key={prompt.id}
-							className="group flex items-start gap-3 rounded-lg border border-border/60 bg-card p-4"
+							className="group relative flex items-start gap-3 overflow-hidden rounded-lg border border-accent bg-accent/20 p-4 transition-all duration-200 hover:shadow-md"
 						>
-							<div className="mt-0.5 shrink-0 text-muted-foreground">
-								<Sparkles className="size-4" />
-							</div>
 							<div className="flex-1 min-w-0">
 								<div className="flex items-center gap-2">
 									<span className="text-sm font-medium">{prompt.name}</span>
-									<span className="rounded-full border px-2 py-0.5 text-[10px] text-muted-foreground">
+									<span className="rounded-md border-0 bg-muted px-1.5 py-0.5 text-[10px] font-medium text-muted-foreground">
 										{prompt.mode}
 									</span>
 									{prompt.is_public && (
-										<span className="flex items-center gap-1 rounded-full border border-primary/20 bg-primary/5 px-2 py-0.5 text-[10px] text-primary">
+										<span className="flex items-center gap-1 rounded-md border-0 bg-muted px-1.5 py-0.5 text-[10px] font-medium text-muted-foreground">
 											<Globe className="size-2.5" />
 											Public
 										</span>
@@ -277,48 +333,55 @@ export function PromptsContent() {
 									{prompt.prompt}
 								</p>
 								{prompt.prompt.length > 100 && (
-									<button
+									<Button
 										type="button"
+										variant="link"
 										onClick={() => setExpandedId(expandedId === prompt.id ? null : prompt.id)}
-										className="mt-1 text-[11px] text-primary hover:underline cursor-pointer"
+										className="mt-1 h-auto cursor-pointer px-0 py-0 text-[11px] text-primary"
 									>
 										{expandedId === prompt.id ? "See less" : "See more"}
-									</button>
+									</Button>
 								)}
 							</div>
-							<div className="hidden group-hover:flex items-center gap-1 shrink-0">
-								<button
-									type="button"
-									title={prompt.is_public ? "Make private" : "Share with community"}
-									onClick={() => handleTogglePublic(prompt)}
-									disabled={togglingPublicIds.has(prompt.id)}
-									className="flex items-center justify-center size-7 rounded-md text-muted-foreground hover:text-foreground hover:bg-accent transition-colors disabled:opacity-50 disabled:pointer-events-none"
-								>
-									{togglingPublicIds.has(prompt.id) ? (
-										<Spinner className="size-3.5" />
-									) : prompt.is_public ? (
-										<Lock className="size-3.5" />
-									) : (
-										<Globe className="size-3.5" />
-									)}
-								</button>
-								<Button
-									variant="ghost"
-									size="icon"
-									className="size-7"
-									onClick={() => handleEdit(prompt)}
-								>
-									<PenLine className="size-3.5" />
-								</Button>
-								<Button
-									variant="ghost"
-									size="icon"
-									className="size-7 text-destructive hover:text-destructive"
-									onClick={() => setDeleteTarget(prompt.id)}
-								>
-									<Trash2 className="size-3.5" />
-								</Button>
-							</div>
+							<DropdownMenu>
+								<DropdownMenuTrigger asChild>
+									<Button
+										type="button"
+										variant="ghost"
+										size="icon"
+										className="h-7 w-7 shrink-0 self-center rounded-lg text-muted-foreground opacity-100 pointer-events-auto transition-opacity duration-150 hover:text-accent-foreground sm:opacity-0 sm:pointer-events-none sm:group-hover:opacity-100 sm:group-hover:pointer-events-auto"
+									>
+										<MoreHorizontal className="size-3.5" />
+										<span className="sr-only">Prompt actions</span>
+									</Button>
+								</DropdownMenuTrigger>
+								<DropdownMenuContent align="end">
+									<DropdownMenuItem
+										onClick={() => handleTogglePublic(prompt)}
+										disabled={togglingPublicIds.has(prompt.id)}
+									>
+										{togglingPublicIds.has(prompt.id) ? (
+											<Spinner className="size-4" />
+										) : prompt.is_public ? (
+											<Lock className="size-4" />
+										) : (
+											<Globe className="size-4" />
+										)}
+										{prompt.is_public ? "Make private" : "Share with community"}
+									</DropdownMenuItem>
+									<DropdownMenuItem onClick={() => handleEdit(prompt)}>
+										<Pencil className="size-4" />
+										Edit
+									</DropdownMenuItem>
+									<DropdownMenuItem
+										onClick={() => setDeleteTarget(prompt.id)}
+										className="text-destructive focus:text-destructive"
+									>
+										<Trash2 className="size-4" />
+										Delete
+									</DropdownMenuItem>
+								</DropdownMenuContent>
+							</DropdownMenu>
 						</div>
 					))}
 				</div>
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent.tsx
index 2eb56520bd..cf73b5ebac 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent.tsx
@@ -1,7 +1,8 @@
 "use client";
 
-import { useQuery } from "@tanstack/react-query";
-import { Receipt } from "lucide-react";
+import { useQueries } from "@tanstack/react-query";
+import { Coins, FileText, ReceiptText } from "lucide-react";
+import { useMemo } from "react";
 import { Badge } from "@/components/ui/badge";
 import { Spinner } from "@/components/ui/spinner";
 import {
@@ -12,10 +13,32 @@ import {
 	TableHeader,
 	TableRow,
 } from "@/components/ui/table";
-import type { PagePurchase, PagePurchaseStatus } from "@/contracts/types/stripe.types";
+import type {
+	PagePurchase,
+	PagePurchaseStatus,
+	TokenPurchase,
+} from "@/contracts/types/stripe.types";
 import { stripeApiService } from "@/lib/apis/stripe-api.service";
 import { cn } from "@/lib/utils";
 
+type PurchaseKind = "pages" | "tokens";
+
+type UnifiedPurchase = {
+	id: string;
+	kind: PurchaseKind;
+	created_at: string;
+	status: PagePurchaseStatus;
+	/**
+	 * Granted units. Interpretation depends on ``kind``:
+	 *  - ``"pages"`` — integer number of indexed pages.
+	 *  - ``"tokens"`` — integer micro-USD of credit (1_000_000 = $1.00).
+	 * The ``Granted`` column formats accordingly.
+	 */
+	granted: number;
+	amount_total: number | null;
+	currency: string | null;
+};
+
 const STATUS_STYLES: Record<PagePurchaseStatus, { label: string; className: string }> = {
 	completed: {
 		label: "Completed",
@@ -31,6 +54,22 @@ const STATUS_STYLES: Record<PagePurchaseStatus, { label: string; className: stri
 	},
 };
 
+const KIND_META: Record<
+	PurchaseKind,
+	{ label: string; icon: React.ComponentType<{ className?: string }>; iconClass: string }
+> = {
+	pages: {
+		label: "Pages",
+		icon: FileText,
+		iconClass: "text-sky-500",
+	},
+	tokens: {
+		label: "Premium Credit",
+		icon: Coins,
+		iconClass: "text-amber-500",
+	},
+};
+
 function formatDate(iso: string): string {
 	return new Date(iso).toLocaleDateString(undefined, {
 		year: "numeric",
@@ -39,19 +78,76 @@ function formatDate(iso: string): string {
 	});
 }
 
-function formatAmount(purchase: PagePurchase): string {
-	if (purchase.amount_total == null) return "—";
-	const dollars = purchase.amount_total / 100;
-	const currency = (purchase.currency ?? "usd").toUpperCase();
-	return `$${dollars.toFixed(2)} ${currency}`;
+function formatAmount(amount: number | null, currency: string | null): string {
+	if (amount == null) return "—";
+	const dollars = amount / 100;
+	const code = (currency ?? "usd").toUpperCase();
+	return `$${dollars.toFixed(2)} ${code}`;
+}
+
+function normalizePagePurchase(p: PagePurchase): UnifiedPurchase {
+	return {
+		id: p.id,
+		kind: "pages",
+		created_at: p.created_at,
+		status: p.status,
+		granted: p.pages_granted,
+		amount_total: p.amount_total,
+		currency: p.currency,
+	};
+}
+
+function normalizeTokenPurchase(p: TokenPurchase): UnifiedPurchase {
+	return {
+		id: p.id,
+		kind: "tokens",
+		created_at: p.created_at,
+		status: p.status,
+		granted: p.credit_micros_granted,
+		amount_total: p.amount_total,
+		currency: p.currency,
+	};
+}
+
+function formatGranted(p: UnifiedPurchase): string {
+	if (p.kind === "tokens") {
+		const dollars = p.granted / 1_000_000;
+		// Premium credit packs are always whole dollars at the moment, but
+		// future fractional grants (refunds, partial top-ups) shouldn't
+		// silently round to "$0".
+		if (dollars >= 1) return `$${dollars.toFixed(2)} of credit`;
+		if (dollars > 0) return `$${dollars.toFixed(3)} of credit`;
+		return "$0 of credit";
+	}
+	return p.granted.toLocaleString();
 }
 
 export function PurchaseHistoryContent() {
-	const { data, isLoading } = useQuery({
-		queryKey: ["stripe-purchases"],
-		queryFn: () => stripeApiService.getPurchases(),
+	const results = useQueries({
+		queries: [
+			{
+				queryKey: ["stripe-purchases"],
+				queryFn: () => stripeApiService.getPurchases(),
+			},
+			{
+				queryKey: ["stripe-token-purchases"],
+				queryFn: () => stripeApiService.getTokenPurchases(),
+			},
+		],
 	});
 
+	const [pagesQuery, tokensQuery] = results;
+	const isLoading = pagesQuery.isLoading || tokensQuery.isLoading;
+
+	const purchases = useMemo<UnifiedPurchase[]>(() => {
+		const pagePurchases = pagesQuery.data?.purchases ?? [];
+		const tokenPurchases = tokensQuery.data?.purchases ?? [];
+		return [
+			...pagePurchases.map(normalizePagePurchase),
+			...tokenPurchases.map(normalizeTokenPurchase),
+		].sort((a, b) => new Date(b.created_at).getTime() - new Date(a.created_at).getTime());
+	}, [pagesQuery.data, tokensQuery.data]);
+
 	if (isLoading) {
 		return (
 			<div className="flex items-center justify-center py-12">
@@ -60,15 +156,13 @@ export function PurchaseHistoryContent() {
 		);
 	}
 
-	const purchases = data?.purchases ?? [];
-
 	if (purchases.length === 0) {
 		return (
 			<div className="flex flex-col items-center justify-center gap-2 py-16 text-center">
-				<Receipt className="h-8 w-8 text-muted-foreground" />
+				<ReceiptText className="h-8 w-8 text-muted-foreground" />
 				<p className="text-sm font-medium">No purchases yet</p>
 				<p className="text-xs text-muted-foreground">
-					Your page-pack purchases will appear here after checkout.
+					Your page and premium credit purchases will appear here after checkout.
 				</p>
 			</div>
 		);
@@ -81,25 +175,36 @@ export function PurchaseHistoryContent() {
 					<TableHeader>
 						<TableRow>
 							<TableHead>Date</TableHead>
-							<TableHead className="text-right">Pages</TableHead>
+							<TableHead>Type</TableHead>
+							<TableHead className="text-right">Granted</TableHead>
 							<TableHead className="text-right">Amount</TableHead>
 							<TableHead className="text-center">Status</TableHead>
 						</TableRow>
 					</TableHeader>
 					<TableBody>
 						{purchases.map((p) => {
-							const style = STATUS_STYLES[p.status];
+							const statusStyle = STATUS_STYLES[p.status];
+							const kind = KIND_META[p.kind];
+							const KindIcon = kind.icon;
 							return (
-								<TableRow key={p.id}>
+								<TableRow key={`${p.kind}-${p.id}`}>
 									<TableCell className="text-sm">{formatDate(p.created_at)}</TableCell>
+									<TableCell className="text-sm">
+										<div className="flex items-center gap-2">
+											<KindIcon className={cn("h-4 w-4", kind.iconClass)} />
+											<span>{kind.label}</span>
+										</div>
+									</TableCell>
 									<TableCell className="text-right tabular-nums text-sm">
-										{p.pages_granted.toLocaleString()}
+										{formatGranted(p)}
 									</TableCell>
 									<TableCell className="text-right tabular-nums text-sm">
-										{formatAmount(p)}
+										{formatAmount(p.amount_total, p.currency)}
 									</TableCell>
 									<TableCell className="text-center">
-										<Badge className={cn("text-[10px]", style.className)}>{style.label}</Badge>
+										<Badge className={cn("text-[10px]", statusStyle.className)}>
+											{statusStyle.label}
+										</Badge>
 									</TableCell>
 								</TableRow>
 							);
@@ -108,7 +213,8 @@ export function PurchaseHistoryContent() {
 				</Table>
 			</div>
 			<p className="text-center text-xs text-muted-foreground">
-				Showing your {purchases.length} most recent purchase{purchases.length !== 1 ? "s" : ""}.
+				Showing your {purchases.length} most recent purchase
+				{purchases.length !== 1 ? "s" : ""}.
 			</p>
 		</div>
 	);
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/desktop/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/desktop/page.tsx
new file mode 100644
index 0000000000..a7b0d1d270
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/desktop/page.tsx
@@ -0,0 +1,5 @@
+import { DesktopContent } from "../components/DesktopContent";
+
+export default function Page() {
+	return <DesktopContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/hotkeys/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/hotkeys/page.tsx
new file mode 100644
index 0000000000..8f776f738c
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/hotkeys/page.tsx
@@ -0,0 +1,5 @@
+import { HotkeysContent } from "../components/HotkeysContent";
+
+export default function Page() {
+	return <HotkeysContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/layout-shell.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/layout-shell.tsx
new file mode 100644
index 0000000000..8200216220
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/layout-shell.tsx
@@ -0,0 +1,188 @@
+"use client";
+
+import {
+	Brain,
+	CircleUser,
+	Keyboard,
+	KeyRound,
+	Library,
+	Monitor,
+	ReceiptText,
+	ShieldCheck,
+	WandSparkles,
+	Workflow,
+} from "lucide-react";
+import Link from "next/link";
+import { useSelectedLayoutSegment } from "next/navigation";
+import { useTranslations } from "next-intl";
+import type React from "react";
+import { useCallback, useMemo, useState } from "react";
+import { Separator } from "@/components/ui/separator";
+import { usePlatform } from "@/hooks/use-platform";
+import { cn } from "@/lib/utils";
+
+export type UserSettingsTab =
+	| "profile"
+	| "api-key"
+	| "prompts"
+	| "community-prompts"
+	| "memory"
+	| "agent-permissions"
+	| "agent-status"
+	| "purchases"
+	| "desktop"
+	| "hotkeys";
+
+const DEFAULT_TAB: UserSettingsTab = "profile";
+
+interface UserSettingsLayoutShellProps {
+	searchSpaceId: string;
+	children: React.ReactNode;
+}
+
+export function UserSettingsLayoutShell({ searchSpaceId, children }: UserSettingsLayoutShellProps) {
+	const t = useTranslations("userSettings");
+	const { isDesktop } = usePlatform();
+	const segment = useSelectedLayoutSegment();
+	const [tabScrollPos, setTabScrollPos] = useState<"start" | "middle" | "end">("start");
+
+	const handleTabScroll = useCallback((e: React.UIEvent<HTMLDivElement>) => {
+		const el = e.currentTarget;
+		const atStart = el.scrollLeft <= 2;
+		const atEnd = el.scrollWidth - el.scrollLeft - el.clientWidth <= 2;
+		setTabScrollPos(atStart ? "start" : atEnd ? "end" : "middle");
+	}, []);
+
+	const navItems = useMemo(
+		() => [
+			{
+				value: "profile" as const,
+				label: t("profile_nav_label"),
+				icon: <CircleUser className="h-4 w-4" />,
+			},
+			{
+				value: "api-key" as const,
+				label: t("api_key_nav_label"),
+				icon: <KeyRound className="h-4 w-4" />,
+			},
+			{
+				value: "prompts" as const,
+				label: "My Prompts",
+				icon: <WandSparkles className="h-4 w-4" />,
+			},
+			{
+				value: "community-prompts" as const,
+				label: "Community Prompts",
+				icon: <Library className="h-4 w-4" />,
+			},
+			{
+				value: "memory" as const,
+				label: "Memory",
+				icon: <Brain className="h-4 w-4" />,
+			},
+			{
+				value: "agent-permissions" as const,
+				label: "Agent Permissions",
+				icon: <ShieldCheck className="h-4 w-4" />,
+			},
+			{
+				value: "agent-status" as const,
+				label: "Agent Status",
+				icon: <Workflow className="h-4 w-4" />,
+			},
+			{
+				value: "purchases" as const,
+				label: "Purchase History",
+				icon: <ReceiptText className="h-4 w-4" />,
+			},
+			...(isDesktop
+				? [
+						{
+							value: "desktop" as const,
+							label: "App Preferences",
+							icon: <Monitor className="h-4 w-4" />,
+						},
+						{
+							value: "hotkeys" as const,
+							label: "Hotkeys",
+							icon: <Keyboard className="h-4 w-4" />,
+						},
+					]
+				: []),
+		],
+		[t, isDesktop]
+	);
+
+	const activeTab: UserSettingsTab =
+		segment && navItems.some((item) => item.value === segment)
+			? (segment as UserSettingsTab)
+			: DEFAULT_TAB;
+	const selectedLabel = navItems.find((item) => item.value === activeTab)?.label ?? t("title");
+
+	const hrefFor = (tab: UserSettingsTab) => `/dashboard/${searchSpaceId}/user-settings/${tab}`;
+
+	return (
+		<section className="flex h-full min-h-[min(680px,calc(100vh-5rem))] w-full select-none flex-col gap-6 md:pt-10 md:flex-row">
+			<div className="md:w-[220px] md:shrink-0">
+				<h1 className="mb-4 px-1 text-2xl font-semibold tracking-tight">{t("title")}</h1>
+				<nav className="hidden flex-col gap-0.5 md:flex">
+					{navItems.map((item) => (
+						<Link
+							key={item.value}
+							href={hrefFor(item.value)}
+							replace
+							scroll={false}
+							prefetch
+							className={cn(
+								"inline-flex h-auto items-center justify-start gap-3 rounded-md px-3 py-2.5 text-left text-sm font-medium transition-colors duration-150 focus:outline-none focus-visible:outline-none",
+								activeTab === item.value
+									? "bg-accent text-accent-foreground"
+									: "bg-transparent text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+							)}
+						>
+							{item.icon}
+							{item.label}
+						</Link>
+					))}
+				</nav>
+				<div
+					className="overflow-x-auto border-b border-border pb-3 md:hidden"
+					onScroll={handleTabScroll}
+					style={{
+						maskImage: `linear-gradient(to right, ${tabScrollPos === "start" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${tabScrollPos === "end" ? "black" : "transparent"})`,
+						WebkitMaskImage: `linear-gradient(to right, ${tabScrollPos === "start" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${tabScrollPos === "end" ? "black" : "transparent"})`,
+					}}
+				>
+					<div className="flex gap-1">
+						{navItems.map((item) => (
+							<Link
+								key={item.value}
+								href={hrefFor(item.value)}
+								replace
+								scroll={false}
+								prefetch
+								className={cn(
+									"inline-flex h-auto shrink-0 items-center gap-2 rounded-md px-3 py-1.5 text-xs font-medium transition-colors duration-150 focus:outline-none focus-visible:outline-none",
+									activeTab === item.value
+										? "bg-accent text-accent-foreground"
+										: "bg-transparent text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+								)}
+							>
+								{item.icon}
+								{item.label}
+							</Link>
+						))}
+					</div>
+				</div>
+			</div>
+
+			<div className="min-w-0 flex-1">
+				<div className="hidden md:block">
+					<h2 className="text-lg font-semibold">{selectedLabel}</h2>
+					<Separator className="mt-4 bg-border" />
+				</div>
+				<div className="min-w-0 pt-4 md:max-w-3xl">{children}</div>
+			</div>
+		</section>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/layout.tsx
new file mode 100644
index 0000000000..e1f97a903f
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/layout.tsx
@@ -0,0 +1,17 @@
+import type React from "react";
+import { use } from "react";
+import { UserSettingsLayoutShell } from "./layout-shell";
+
+export default function UserSettingsLayout({
+	params,
+	children,
+}: {
+	params: Promise<{ search_space_id: string }>;
+	children: React.ReactNode;
+}) {
+	const { search_space_id } = use(params);
+
+	return (
+		<UserSettingsLayoutShell searchSpaceId={search_space_id}>{children}</UserSettingsLayoutShell>
+	);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/memory/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/memory/page.tsx
new file mode 100644
index 0000000000..b10c5bce5a
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/memory/page.tsx
@@ -0,0 +1,5 @@
+import { MemoryContent } from "../components/MemoryContent";
+
+export default function Page() {
+	return <MemoryContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/page.tsx
new file mode 100644
index 0000000000..dde6431424
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/page.tsx
@@ -0,0 +1,10 @@
+import { redirect } from "next/navigation";
+
+export default async function UserSettingsPage({
+	params,
+}: {
+	params: Promise<{ search_space_id: string }>;
+}) {
+	const { search_space_id } = await params;
+	redirect(`/dashboard/${search_space_id}/user-settings/profile`);
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/profile/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/profile/page.tsx
new file mode 100644
index 0000000000..9275ed26c0
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/profile/page.tsx
@@ -0,0 +1,5 @@
+import { ProfileContent } from "../components/ProfileContent";
+
+export default function Page() {
+	return <ProfileContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/prompts/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/prompts/page.tsx
new file mode 100644
index 0000000000..559ce0a954
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/prompts/page.tsx
@@ -0,0 +1,5 @@
+import { PromptsContent } from "../components/PromptsContent";
+
+export default function Page() {
+	return <PromptsContent />;
+}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/purchases/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/purchases/page.tsx
new file mode 100644
index 0000000000..3fa08c278a
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/purchases/page.tsx
@@ -0,0 +1,5 @@
+import { PurchaseHistoryContent } from "../components/PurchaseHistoryContent";
+
+export default function Page() {
+	return <PurchaseHistoryContent />;
+}
diff --git a/surfsense_web/app/dashboard/error.tsx b/surfsense_web/app/dashboard/error.tsx
new file mode 100644
index 0000000000..28a22fe3c3
--- /dev/null
+++ b/surfsense_web/app/dashboard/error.tsx
@@ -0,0 +1,64 @@
+"use client";
+
+import { ExternalLink } from "lucide-react";
+import Link from "next/link";
+import { useEffect, useMemo } from "react";
+import { Button } from "@/components/ui/button";
+import { buildIssueUrl } from "@/lib/error-toast";
+
+export default function DashboardError({
+	error,
+	reset,
+}: {
+	error: globalThis.Error & { digest?: string; code?: string; requestId?: string };
+	reset: () => void;
+}) {
+	useEffect(() => {
+		import("posthog-js")
+			.then(({ default: posthog }) => {
+				posthog.captureException(error);
+			})
+			.catch(() => {});
+	}, [error]);
+
+	const issueUrl = useMemo(() => buildIssueUrl(error), [error]);
+
+	return (
+		<div className="flex flex-1 flex-col items-center justify-center gap-4 p-8 text-center">
+			<h2 className="text-xl font-semibold">Something went wrong</h2>
+			<p className="text-muted-foreground max-w-md">
+				An error occurred in this section. Your dashboard is still available. If this keeps
+				happening, please report it so we can fix it.
+			</p>
+
+			{(error.digest || error.code || error.requestId) && (
+				<div className="rounded-md border bg-muted/50 px-4 py-2 text-xs text-muted-foreground font-mono max-w-md">
+					{error.code && <span>Code: {error.code}</span>}
+					{error.requestId && <span className="ml-3">ID: {error.requestId}</span>}
+					{error.digest && <span className="ml-3">Digest: {error.digest}</span>}
+				</div>
+			)}
+
+			<div className="flex gap-2">
+				<Button type="button" onClick={reset}>
+					Try again
+				</Button>
+				<Link
+					href="/dashboard"
+					className="rounded-md border border-input bg-background px-4 py-2 text-sm font-medium hover:bg-accent hover:text-accent-foreground transition-colors"
+				>
+					Go to dashboard home
+				</Link>
+				<a
+					href={issueUrl}
+					target="_blank"
+					rel="noopener noreferrer"
+					className="inline-flex items-center gap-1.5 rounded-md border border-input bg-background px-4 py-2 text-sm font-medium hover:bg-accent hover:text-accent-foreground transition-colors"
+				>
+					<ExternalLink className="h-3.5 w-3.5" />
+					Report Issue
+				</a>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/desktop/login/page.tsx b/surfsense_web/app/desktop/login/page.tsx
index 8f68d20c1d..41c956f3e9 100644
--- a/surfsense_web/app/desktop/login/page.tsx
+++ b/surfsense_web/app/desktop/login/page.tsx
@@ -1,18 +1,18 @@
 "use client";
 
-import { IconBrandGoogleFilled } from "@tabler/icons-react";
 import { useAtom } from "jotai";
-import { BrainCog, Eye, EyeOff, Rocket, Zap } from "lucide-react";
+import { Crop, Eye, EyeOff, Rocket, RotateCcw, Zap } from "lucide-react";
 import Image from "next/image";
 import { useRouter } from "next/navigation";
-import { useCallback, useEffect, useState } from "react";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { toast } from "sonner";
 import { loginMutationAtom } from "@/atoms/auth/auth-mutation.atoms";
-import { DEFAULT_SHORTCUTS, ShortcutRecorder } from "@/components/desktop/shortcut-recorder";
+import { DEFAULT_SHORTCUTS, keyEventToAccelerator } from "@/components/desktop/shortcut-recorder";
 import { Button } from "@/components/ui/button";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import { Separator } from "@/components/ui/separator";
+import { ShortcutKbd } from "@/components/ui/shortcut-kbd";
 import { Spinner } from "@/components/ui/spinner";
 import { useElectronAPI } from "@/hooks/use-platform";
 import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service";
@@ -20,6 +20,171 @@ import { setBearerToken } from "@/lib/auth-utils";
 import { AUTH_TYPE, BACKEND_URL } from "@/lib/env-config";
 
 const isGoogleAuth = AUTH_TYPE === "GOOGLE";
+type ShortcutKey = "generalAssist" | "quickAsk" | "screenshotAssist";
+type ShortcutMap = typeof DEFAULT_SHORTCUTS;
+
+function GoogleGLogo({ className }: { className?: string }) {
+	return (
+		<svg
+			className={className}
+			xmlns="http://www.w3.org/2000/svg"
+			viewBox="0 0 48 48"
+			aria-hidden="true"
+		>
+			<path
+				fill="#EA4335"
+				d="M24 9.5c3.54 0 6.71 1.22 9.21 3.6l6.85-6.85C35.9 2.38 30.47 0 24 0 14.62 0 6.51 5.38 2.56 13.22l7.98 6.19C12.43 13.72 17.74 9.5 24 9.5z"
+			/>
+			<path
+				fill="#4285F4"
+				d="M46.98 24.55c0-1.57-.15-3.09-.38-4.55H24v9.02h12.94c-.58 2.96-2.26 5.48-4.78 7.18l7.73 6c4.51-4.18 7.09-10.36 7.09-17.65z"
+			/>
+			<path
+				fill="#FBBC05"
+				d="M10.53 28.59c-.48-1.45-.76-2.99-.76-4.59s.27-3.14.76-4.59l-7.98-6.19C.92 16.46 0 20.12 0 24c0 3.88.92 7.54 2.56 10.78l7.97-6.19z"
+			/>
+			<path
+				fill="#34A853"
+				d="M24 48c6.48 0 11.93-2.13 15.89-5.81l-7.73-6c-2.15 1.45-4.92 2.3-8.16 2.3-6.26 0-11.57-4.22-13.47-9.91l-7.98 6.19C6.51 42.62 14.62 48 24 48z"
+			/>
+		</svg>
+	);
+}
+
+const HOTKEY_ROWS: Array<{
+	key: ShortcutKey;
+	label: string;
+	description: string;
+	icon: React.ElementType;
+}> = [
+	{
+		key: "generalAssist",
+		label: "General Assist",
+		description: "Launch SurfSense instantly from any application",
+		icon: Rocket,
+	},
+	{
+		key: "screenshotAssist",
+		label: "Screenshot Assist",
+		description: "Draw a region on screen to attach that capture to chat",
+		icon: Crop,
+	},
+	{
+		key: "quickAsk",
+		label: "Quick Assist",
+		description: "Select text anywhere, then ask AI to explain, rewrite, or act on it",
+		icon: Zap,
+	},
+];
+
+function acceleratorToKeys(accel: string, isMac: boolean): string[] {
+	if (!accel) return [];
+	return accel.split("+").map((part) => {
+		if (part === "CommandOrControl") {
+			return isMac ? "⌘" : "Ctrl";
+		}
+		if (part === "Alt") {
+			return isMac ? "⌥" : "Alt";
+		}
+		if (part === "Shift") {
+			return isMac ? "⇧" : "Shift";
+		}
+		if (part === "Space") return "Space";
+		return part.length === 1 ? part.toUpperCase() : part;
+	});
+}
+
+function HotkeyRow({
+	label,
+	description,
+	value,
+	defaultValue,
+	icon: Icon,
+	isMac,
+	onChange,
+	onReset,
+}: {
+	label: string;
+	description: string;
+	value: string;
+	defaultValue: string;
+	icon: React.ElementType;
+	isMac: boolean;
+	onChange: (accelerator: string) => void;
+	onReset: () => void;
+}) {
+	const [recording, setRecording] = useState(false);
+	const inputRef = useRef<HTMLButtonElement>(null);
+	const isDefault = value === defaultValue;
+	const displayKeys = useMemo(() => acceleratorToKeys(value, isMac), [value, isMac]);
+
+	const handleKeyDown = useCallback(
+		(e: React.KeyboardEvent) => {
+			if (!recording) return;
+			e.preventDefault();
+			e.stopPropagation();
+
+			if (e.key === "Escape") {
+				setRecording(false);
+				return;
+			}
+
+			const accel = keyEventToAccelerator(e);
+			if (accel) {
+				onChange(accel);
+				setRecording(false);
+			}
+		},
+		[onChange, recording]
+	);
+
+	return (
+		<div className="flex items-center justify-between gap-2.5 border-border/60 border-b py-3 last:border-b-0">
+			<div className="flex items-center gap-2.5 min-w-0">
+				<div className="flex size-7 shrink-0 items-center justify-center rounded-md bg-primary/10 text-primary">
+					<Icon className="size-3.5" />
+				</div>
+				<div className="min-w-0">
+					<p className="text-sm font-medium text-foreground truncate">{label}</p>
+					<p className="text-xs text-muted-foreground line-clamp-2">{description}</p>
+				</div>
+			</div>
+			<div className="flex shrink-0 items-center gap-1">
+				{!isDefault && (
+					<Button
+						variant="ghost"
+						size="icon"
+						className="size-7 text-muted-foreground hover:text-foreground"
+						onClick={onReset}
+						title="Reset to default"
+					>
+						<RotateCcw className="size-3" />
+					</Button>
+				)}
+				<Button
+					ref={inputRef}
+					type="button"
+					variant="ghost"
+					title={recording ? "Press shortcut keys" : "Click to edit shortcut"}
+					onClick={() => setRecording(true)}
+					onKeyDown={handleKeyDown}
+					onBlur={() => setRecording(false)}
+					className={
+						recording
+							? "h-7 border border-transparent bg-primary/5 px-0 outline-none ring-0 focus:outline-none focus-visible:outline-none focus-visible:ring-0"
+							: "h-7 cursor-pointer border border-transparent bg-transparent px-0 outline-none ring-0 transition-colors hover:bg-accent hover:text-accent-foreground focus:outline-none focus-visible:outline-none focus-visible:ring-0"
+					}
+				>
+					{recording ? (
+						<span className="px-2 text-[9px] text-primary whitespace-nowrap">Press hotkeys</span>
+					) : (
+						<ShortcutKbd keys={displayKeys} className="ml-0 px-1.5 text-foreground/85" />
+					)}
+				</Button>
+			</div>
+		</div>
+	);
+}
 
 export default function DesktopLoginPage() {
 	const router = useRouter();
@@ -30,9 +195,11 @@ export default function DesktopLoginPage() {
 	const [password, setPassword] = useState("");
 	const [showPassword, setShowPassword] = useState(false);
 	const [loginError, setLoginError] = useState<string | null>(null);
+	const [isGoogleRedirecting, setIsGoogleRedirecting] = useState(false);
 
 	const [shortcuts, setShortcuts] = useState(DEFAULT_SHORTCUTS);
 	const [shortcutsLoaded, setShortcutsLoaded] = useState(false);
+	const isMac = api?.versions?.platform === "darwin";
 
 	useEffect(() => {
 		if (!api?.getShortcuts) {
@@ -41,7 +208,7 @@ export default function DesktopLoginPage() {
 		}
 		api
 			.getShortcuts()
-			.then((config) => {
+			.then((config: ShortcutMap | null) => {
 				if (config) setShortcuts(config);
 				setShortcutsLoaded(true);
 			})
@@ -49,7 +216,7 @@ export default function DesktopLoginPage() {
 	}, [api]);
 
 	const updateShortcut = useCallback(
-		(key: "generalAssist" | "quickAsk" | "autocomplete", accelerator: string) => {
+		(key: ShortcutKey, accelerator: string) => {
 			setShortcuts((prev) => {
 				const updated = { ...prev, [key]: accelerator };
 				api?.setShortcuts?.({ [key]: accelerator }).catch(() => {
@@ -63,13 +230,15 @@ export default function DesktopLoginPage() {
 	);
 
 	const resetShortcut = useCallback(
-		(key: "generalAssist" | "quickAsk" | "autocomplete") => {
+		(key: ShortcutKey) => {
 			updateShortcut(key, DEFAULT_SHORTCUTS[key]);
 		},
 		[updateShortcut]
 	);
 
 	const handleGoogleLogin = () => {
+		if (isGoogleRedirecting) return;
+		setIsGoogleRedirecting(true);
 		window.location.href = `${BACKEND_URL}/auth/google/authorize-redirect`;
 	};
 
@@ -117,18 +286,8 @@ export default function DesktopLoginPage() {
 	};
 
 	return (
-		<div className="relative flex min-h-svh items-center justify-center bg-background p-4 sm:p-6">
-			{/* Subtle radial glow */}
-			<div className="pointer-events-none fixed inset-0 overflow-hidden">
-				<div
-					className="absolute -top-1/2 left-1/2 size-[800px] -translate-x-1/2 rounded-full opacity-[0.03]"
-					style={{
-						background: "radial-gradient(circle, hsl(var(--primary)) 0%, transparent 70%)",
-					}}
-				/>
-			</div>
-
-			<div className="relative flex w-full max-w-md flex-col overflow-hidden rounded-xl border bg-card shadow-lg">
+		<div className="relative flex min-h-svh items-center justify-center bg-main-panel p-4 sm:p-6 select-none">
+			<div className="relative flex w-full max-w-md flex-col overflow-hidden bg-main-panel">
 				{/* Header */}
 				<div className="flex flex-col items-center px-6 pt-6 pb-2 text-center">
 					<Image
@@ -141,7 +300,7 @@ export default function DesktopLoginPage() {
 					/>
 					<h1 className="text-lg font-semibold tracking-tight">Welcome to SurfSense Desktop</h1>
 					<p className="mt-1 text-sm text-muted-foreground">
-						Configure shortcuts, then sign in to get started.
+						Configure shortcuts, then sign in to get started
 					</p>
 				</div>
 
@@ -151,41 +310,24 @@ export default function DesktopLoginPage() {
 						{/* ---- Shortcuts ---- */}
 						{shortcutsLoaded ? (
 							<div className="flex flex-col gap-2">
-								<p className="text-xs font-medium uppercase tracking-wider text-muted-foreground">
-									Keyboard Shortcuts
-								</p>
-								<div className="flex flex-col gap-1.5">
-									<ShortcutRecorder
-										value={shortcuts.generalAssist}
-										onChange={(accel) => updateShortcut("generalAssist", accel)}
-										onReset={() => resetShortcut("generalAssist")}
-										defaultValue={DEFAULT_SHORTCUTS.generalAssist}
-										label="General Assist"
-										description="Launch SurfSense instantly from any application"
-										icon={Rocket}
-									/>
-									<ShortcutRecorder
-										value={shortcuts.quickAsk}
-										onChange={(accel) => updateShortcut("quickAsk", accel)}
-										onReset={() => resetShortcut("quickAsk")}
-										defaultValue={DEFAULT_SHORTCUTS.quickAsk}
-										label="Quick Assist"
-										description="Select text anywhere, then ask AI to explain, rewrite, or act on it"
-										icon={Zap}
-									/>
-									<ShortcutRecorder
-										value={shortcuts.autocomplete}
-										onChange={(accel) => updateShortcut("autocomplete", accel)}
-										onReset={() => resetShortcut("autocomplete")}
-										defaultValue={DEFAULT_SHORTCUTS.autocomplete}
-										label="Extreme Assist"
-										description="AI drafts text using your screen context and knowledge base"
-										icon={BrainCog}
-									/>
+								{/* <p className="text-xs font-medium uppercase tracking-wider text-muted-foreground">
+									Hotkeys
+								</p> */}
+								<div>
+									{HOTKEY_ROWS.map((row) => (
+										<HotkeyRow
+											key={row.key}
+											label={row.label}
+											description={row.description}
+											value={shortcuts[row.key]}
+											defaultValue={DEFAULT_SHORTCUTS[row.key]}
+											icon={row.icon}
+											isMac={isMac}
+											onChange={(accel) => updateShortcut(row.key, accel)}
+											onReset={() => resetShortcut(row.key)}
+										/>
+									))}
 								</div>
-								<p className="text-[11px] text-muted-foreground text-center mt-1">
-									Click a shortcut and press a new key combination to change it.
-								</p>
 							</div>
 						) : (
 							<div className="flex justify-center py-6">
@@ -197,13 +339,18 @@ export default function DesktopLoginPage() {
 
 						{/* ---- Auth ---- */}
 						<div className="flex flex-col gap-3">
-							<p className="text-xs font-medium uppercase tracking-wider text-muted-foreground">
+							{/* <p className="text-xs font-medium uppercase tracking-wider text-muted-foreground">
 								Sign In
-							</p>
+							</p> */}
 
 							{isGoogleAuth ? (
-								<Button variant="outline" className="w-full gap-2 h-10" onClick={handleGoogleLogin}>
-									<IconBrandGoogleFilled className="size-4" />
+								<Button
+									variant="outline"
+									className="w-full gap-2 h-10 bg-white text-[#1f1f1f] border-white hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white shadow-sm font-medium"
+									disabled={isGoogleRedirecting}
+									onClick={handleGoogleLogin}
+								>
+									<GoogleGLogo className="size-4" />
 									Continue with Google
 								</Button>
 							) : (
@@ -246,10 +393,11 @@ export default function DesktopLoginPage() {
 												disabled={isLoggingIn}
 												className="h-9 pr-9"
 											/>
-											<button
+											<Button
 												type="button"
+												variant="ghost"
 												onClick={() => setShowPassword((v) => !v)}
-												className="absolute inset-y-0 right-0 flex items-center pr-2.5 text-muted-foreground hover:text-foreground"
+												className="absolute inset-y-0 right-0 h-auto bg-transparent px-2.5 py-0 text-muted-foreground hover:bg-transparent hover:text-foreground"
 												tabIndex={-1}
 											>
 												{showPassword ? (
@@ -257,18 +405,14 @@ export default function DesktopLoginPage() {
 												) : (
 													<Eye className="size-3.5" />
 												)}
-											</button>
+											</Button>
 										</div>
 									</div>
 
-									<Button type="submit" disabled={isLoggingIn} className="h-9 mt-1">
-										{isLoggingIn ? (
-											<>
-												<Spinner size="sm" className="text-primary-foreground" />
-												Signing in…
-											</>
-										) : (
-											"Sign in"
+									<Button type="submit" disabled={isLoggingIn} className="relative h-9 mt-1">
+										<span className={isLoggingIn ? "opacity-0" : ""}>Sign in</span>
+										{isLoggingIn && (
+											<Spinner size="sm" className="absolute text-primary-foreground" />
 										)}
 									</Button>
 								</form>
diff --git a/surfsense_web/app/desktop/permissions/page.tsx b/surfsense_web/app/desktop/permissions/page.tsx
index a2fadc8ff7..c736bf87cc 100644
--- a/surfsense_web/app/desktop/permissions/page.tsx
+++ b/surfsense_web/app/desktop/permissions/page.tsx
@@ -19,14 +19,15 @@ const STEPS = [
 		id: "screen-recording",
 		title: "Screen Recording",
 		description:
-			"Lets SurfSense capture your screen to understand context and provide smart writing suggestions.",
+			"Lets SurfSense capture a region of your screen, full display, or browser (where supported) to attach to chat in Screenshot Assist, or to capture the full display from the composer.",
 		action: "requestScreenRecording",
 		field: "screenRecording" as const,
 	},
 	{
 		id: "accessibility",
 		title: "Accessibility",
-		description: "Lets SurfSense insert suggestions seamlessly, right where you\u2019re typing.",
+		description:
+			"Lets SurfSense bring the app to the foreground and work with the active application (for example Quick Assist) when you use desktop shortcuts.",
 		action: "requestAccessibility",
 		field: "accessibility" as const,
 	},
@@ -131,7 +132,8 @@ export default function DesktopPermissionsPage() {
 					<div className="space-y-1">
 						<h1 className="text-2xl font-semibold tracking-tight">System Permissions</h1>
 						<p className="text-sm text-muted-foreground">
-							SurfSense needs two macOS permissions to provide context-aware writing suggestions.
+							SurfSense needs two macOS permissions for Screenshot Assist and for desktop features
+							that require focusing the app or the active application.
 						</p>
 					</div>
 				</div>
@@ -205,13 +207,14 @@ export default function DesktopPermissionsPage() {
 							<Button disabled className="text-sm h-9 min-w-[180px]">
 								Grant permissions to continue
 							</Button>
-							<button
+							<Button
 								type="button"
+								variant="link"
 								onClick={handleSkip}
-								className="block mx-auto text-xs text-muted-foreground hover:text-foreground transition-colors"
+								className="mx-auto h-auto px-0 py-0 text-xs text-muted-foreground hover:text-foreground"
 							>
 								Skip for now
-							</button>
+							</Button>
 						</>
 					)}
 				</div>
diff --git a/surfsense_web/app/desktop/suggestion/layout.tsx b/surfsense_web/app/desktop/suggestion/layout.tsx
deleted file mode 100644
index fd8faf099e..0000000000
--- a/surfsense_web/app/desktop/suggestion/layout.tsx
+++ /dev/null
@@ -1,9 +0,0 @@
-import "./suggestion.css";
-
-export const metadata = {
-	title: "SurfSense Suggestion",
-};
-
-export default function SuggestionLayout({ children }: { children: React.ReactNode }) {
-	return <div className="suggestion-body">{children}</div>;
-}
diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx
deleted file mode 100644
index d30da65f67..0000000000
--- a/surfsense_web/app/desktop/suggestion/page.tsx
+++ /dev/null
@@ -1,384 +0,0 @@
-"use client";
-
-import { useCallback, useEffect, useRef, useState } from "react";
-import { useElectronAPI } from "@/hooks/use-platform";
-import { ensureTokensFromElectron, getBearerToken } from "@/lib/auth-utils";
-
-type SSEEvent =
-	| { type: "text-delta"; id: string; delta: string }
-	| { type: "text-start"; id: string }
-	| { type: "text-end"; id: string }
-	| { type: "start"; messageId: string }
-	| { type: "finish" }
-	| { type: "error"; errorText: string }
-	| {
-			type: "data-thinking-step";
-			data: { id: string; title: string; status: string; items: string[] };
-	  }
-	| {
-			type: "data-suggestions";
-			data: { options: string[] };
-	  };
-
-interface AgentStep {
-	id: string;
-	title: string;
-	status: string;
-	items: string[];
-}
-
-type FriendlyError = { message: string; isSetup?: boolean };
-
-function friendlyError(raw: string | number): FriendlyError {
-	if (typeof raw === "number") {
-		if (raw === 401) return { message: "Please sign in to use suggestions." };
-		if (raw === 403) return { message: "You don\u2019t have permission for this." };
-		if (raw === 404) return { message: "Suggestion service not found. Is the backend running?" };
-		if (raw >= 500) return { message: "Something went wrong on the server. Try again." };
-		return { message: "Something went wrong. Try again." };
-	}
-	const lower = raw.toLowerCase();
-	if (lower.includes("not authenticated") || lower.includes("unauthorized"))
-		return { message: "Please sign in to use suggestions." };
-	if (lower.includes("no vision llm configured") || lower.includes("no llm configured"))
-		return {
-			message: "Configure a vision-capable model (e.g. GPT-4o, Gemini) to enable autocomplete.",
-			isSetup: true,
-		};
-	if (lower.includes("does not support vision"))
-		return {
-			message: "The selected model doesn\u2019t support vision. Choose a vision-capable model.",
-			isSetup: true,
-		};
-	if (lower.includes("fetch") || lower.includes("network") || lower.includes("econnrefused"))
-		return { message: "Can\u2019t reach the server. Check your connection." };
-	return { message: "Something went wrong. Try again." };
-}
-
-const AUTO_DISMISS_MS = 3000;
-
-function StepIcon({ status }: { status: string }) {
-	if (status === "complete") {
-		return (
-			<svg
-				className="step-icon step-icon-done"
-				viewBox="0 0 16 16"
-				fill="none"
-				aria-label="Step complete"
-			>
-				<circle cx="8" cy="8" r="7" stroke="#4ade80" strokeWidth="1.5" />
-				<path
-					d="M5 8.5l2 2 4-4.5"
-					stroke="#4ade80"
-					strokeWidth="1.5"
-					strokeLinecap="round"
-					strokeLinejoin="round"
-				/>
-			</svg>
-		);
-	}
-	return <span className="step-spinner" />;
-}
-
-export default function SuggestionPage() {
-	const api = useElectronAPI();
-	const [options, setOptions] = useState<string[]>([]);
-	const [isLoading, setIsLoading] = useState(true);
-	const [error, setError] = useState<FriendlyError | null>(null);
-	const [steps, setSteps] = useState<AgentStep[]>([]);
-	const [expandedOption, setExpandedOption] = useState<number | null>(null);
-	const abortRef = useRef<AbortController | null>(null);
-
-	const isDesktop = !!api?.onAutocompleteContext;
-
-	useEffect(() => {
-		if (!api?.onAutocompleteContext) {
-			setIsLoading(false);
-		}
-	}, [api]);
-
-	useEffect(() => {
-		if (!error || error.isSetup) return;
-		const timer = setTimeout(() => {
-			api?.dismissSuggestion?.();
-		}, AUTO_DISMISS_MS);
-		return () => clearTimeout(timer);
-	}, [error, api]);
-
-	useEffect(() => {
-		if (isLoading || error || options.length > 0) return;
-		const timer = setTimeout(() => {
-			api?.dismissSuggestion?.();
-		}, AUTO_DISMISS_MS);
-		return () => clearTimeout(timer);
-	}, [isLoading, error, options, api]);
-
-	const fetchSuggestion = useCallback(
-		async (screenshot: string, searchSpaceId: string, appName?: string, windowTitle?: string) => {
-			abortRef.current?.abort();
-			const controller = new AbortController();
-			abortRef.current = controller;
-
-			setIsLoading(true);
-			setOptions([]);
-			setError(null);
-			setSteps([]);
-			setExpandedOption(null);
-
-			let token = getBearerToken();
-			if (!token) {
-				await ensureTokensFromElectron();
-				token = getBearerToken();
-			}
-			if (!token) {
-				setError(friendlyError("not authenticated"));
-				setIsLoading(false);
-				return;
-			}
-
-			const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
-
-			try {
-				const response = await fetch(`${backendUrl}/api/v1/autocomplete/vision/stream`, {
-					method: "POST",
-					headers: {
-						Authorization: `Bearer ${token}`,
-						"Content-Type": "application/json",
-					},
-					body: JSON.stringify({
-						screenshot,
-						search_space_id: parseInt(searchSpaceId, 10),
-						app_name: appName || "",
-						window_title: windowTitle || "",
-					}),
-					signal: controller.signal,
-				});
-
-				if (!response.ok) {
-					setError(friendlyError(response.status));
-					setIsLoading(false);
-					return;
-				}
-
-				if (!response.body) {
-					setError(friendlyError("network error"));
-					setIsLoading(false);
-					return;
-				}
-
-				const reader = response.body.getReader();
-				const decoder = new TextDecoder();
-				let buffer = "";
-
-				while (true) {
-					const { done, value } = await reader.read();
-					if (done) break;
-
-					buffer += decoder.decode(value, { stream: true });
-					const events = buffer.split(/\r?\n\r?\n/);
-					buffer = events.pop() || "";
-
-					for (const event of events) {
-						const lines = event.split(/\r?\n/);
-						for (const line of lines) {
-							if (!line.startsWith("data: ")) continue;
-							const data = line.slice(6).trim();
-							if (!data || data === "[DONE]") continue;
-
-							try {
-								const parsed: SSEEvent = JSON.parse(data);
-								if (parsed.type === "data-suggestions") {
-									setOptions(parsed.data.options);
-								} else if (parsed.type === "error") {
-									setError(friendlyError(parsed.errorText));
-								} else if (parsed.type === "data-thinking-step") {
-									const { id, title, status, items } = parsed.data;
-									setSteps((prev) => {
-										const existing = prev.findIndex((s) => s.id === id);
-										if (existing >= 0) {
-											const updated = [...prev];
-											updated[existing] = { id, title, status, items };
-											return updated;
-										}
-										return [...prev, { id, title, status, items }];
-									});
-								}
-							} catch {}
-						}
-					}
-				}
-			} catch (err) {
-				if (err instanceof DOMException && err.name === "AbortError") return;
-				setError(friendlyError("network error"));
-			} finally {
-				setIsLoading(false);
-			}
-		},
-		[]
-	);
-
-	useEffect(() => {
-		if (!api?.onAutocompleteContext) return;
-
-		const cleanup = api.onAutocompleteContext((data) => {
-			const searchSpaceId = data.searchSpaceId || "1";
-			if (data.screenshot) {
-				fetchSuggestion(data.screenshot, searchSpaceId, data.appName, data.windowTitle);
-			}
-		});
-
-		return cleanup;
-	}, [fetchSuggestion, api]);
-
-	if (!isDesktop) {
-		return (
-			<div className="suggestion-tooltip">
-				<span className="suggestion-error-text">
-					This page is only available in the SurfSense desktop app.
-				</span>
-			</div>
-		);
-	}
-
-	if (error) {
-		if (error.isSetup) {
-			return (
-				<div className="suggestion-tooltip suggestion-setup">
-					<div className="setup-icon">
-						<svg viewBox="0 0 24 24" fill="none" width="28" height="28" aria-hidden="true">
-							<path
-								d="M1 12C1 12 5 4 12 4C19 4 23 12 23 12C23 12 19 20 12 20C5 20 1 12 1 12Z"
-								stroke="#a78bfa"
-								strokeWidth="1.5"
-								strokeLinecap="round"
-								strokeLinejoin="round"
-							/>
-							<circle
-								cx="12"
-								cy="12"
-								r="3"
-								stroke="#a78bfa"
-								strokeWidth="1.5"
-								strokeLinecap="round"
-								strokeLinejoin="round"
-							/>
-						</svg>
-					</div>
-					<div className="setup-content">
-						<span className="setup-title">Vision Model Required</span>
-						<span className="setup-message">{error.message}</span>
-						<span className="setup-hint">Settings → Vision Models</span>
-					</div>
-					<button
-						type="button"
-						className="setup-dismiss"
-						onClick={() => api?.dismissSuggestion?.()}
-					>
-						✕
-					</button>
-				</div>
-			);
-		}
-		return (
-			<div className="suggestion-tooltip suggestion-error">
-				<span className="suggestion-error-text">{error.message}</span>
-			</div>
-		);
-	}
-
-	const showLoading = isLoading && options.length === 0;
-
-	if (showLoading) {
-		return (
-			<div className="suggestion-tooltip">
-				<div className="agent-activity">
-					{steps.length === 0 && (
-						<div className="activity-initial">
-							<span className="step-spinner" />
-							<span className="activity-label">Preparing…</span>
-						</div>
-					)}
-					{steps.length > 0 && (
-						<div className="activity-steps">
-							{steps.map((step) => (
-								<div key={step.id} className="activity-step">
-									<StepIcon status={step.status} />
-									<span className="step-label">
-										{step.title}
-										{step.items.length > 0 && (
-											<span className="step-detail"> · {step.items[0]}</span>
-										)}
-									</span>
-								</div>
-							))}
-						</div>
-					)}
-				</div>
-			</div>
-		);
-	}
-
-	const handleSelect = (text: string) => {
-		api?.acceptSuggestion?.(text);
-	};
-
-	const handleDismiss = () => {
-		api?.dismissSuggestion?.();
-	};
-
-	const TRUNCATE_LENGTH = 120;
-
-	if (options.length === 0) {
-		return (
-			<div className="suggestion-tooltip suggestion-error">
-				<span className="suggestion-error-text">No suggestions available.</span>
-			</div>
-		);
-	}
-
-	return (
-		<div className="suggestion-tooltip">
-			<div className="suggestion-options">
-				{options.map((option, index) => {
-					const isExpanded = expandedOption === index;
-					const needsTruncation = option.length > TRUNCATE_LENGTH;
-					const displayText =
-						needsTruncation && !isExpanded ? option.slice(0, TRUNCATE_LENGTH) + "…" : option;
-
-					return (
-						<button
-							type="button"
-							key={index}
-							className="suggestion-option"
-							onClick={() => handleSelect(option)}
-						>
-							<span className="option-number">{index + 1}</span>
-							<span className="option-text">{displayText}</span>
-							{needsTruncation && (
-								<button
-									type="button"
-									className="option-expand"
-									onClick={(e) => {
-										e.stopPropagation();
-										setExpandedOption(isExpanded ? null : index);
-									}}
-								>
-									{isExpanded ? "less" : "more"}
-								</button>
-							)}
-						</button>
-					);
-				})}
-			</div>
-			<div className="suggestion-actions">
-				<button
-					type="button"
-					className="suggestion-btn suggestion-btn-dismiss"
-					onClick={handleDismiss}
-				>
-					Dismiss
-				</button>
-			</div>
-		</div>
-	);
-}
diff --git a/surfsense_web/app/desktop/suggestion/suggestion.css b/surfsense_web/app/desktop/suggestion/suggestion.css
deleted file mode 100644
index b27fe7874b..0000000000
--- a/surfsense_web/app/desktop/suggestion/suggestion.css
+++ /dev/null
@@ -1,352 +0,0 @@
-html:has(.suggestion-body),
-body:has(.suggestion-body) {
-	margin: 0 !important;
-	padding: 0 !important;
-	background: transparent !important;
-	overflow: hidden !important;
-	height: auto !important;
-	width: 100% !important;
-}
-
-.suggestion-body {
-	margin: 0;
-	padding: 0;
-	background: transparent;
-	font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
-	-webkit-font-smoothing: antialiased;
-	user-select: none;
-	-webkit-app-region: no-drag;
-}
-
-.suggestion-tooltip {
-	box-sizing: border-box;
-	background: #1e1e1e;
-	border: 1px solid #3c3c3c;
-	border-radius: 8px;
-	padding: 8px 12px;
-	margin: 4px;
-	max-width: 400px;
-	/* MAX_HEIGHT in suggestion-window.ts is 400px. Subtract 8px for margin
-     (4px * 2) so the tooltip + margin fits within the Electron window.
-     box-sizing: border-box ensures padding + border are included. */
-	max-height: 392px;
-	box-shadow: 0 4px 16px rgba(0, 0, 0, 0.5);
-	display: flex;
-	flex-direction: column;
-	overflow: hidden;
-}
-
-.suggestion-text {
-	color: #d4d4d4;
-	font-size: 13px;
-	line-height: 1.45;
-	margin: 0 0 6px 0;
-	word-wrap: break-word;
-	white-space: pre-wrap;
-	overflow-y: auto;
-	flex: 1 1 auto;
-	min-height: 0;
-}
-
-.suggestion-text::-webkit-scrollbar {
-	width: 5px;
-}
-
-.suggestion-text::-webkit-scrollbar-track {
-	background: transparent;
-}
-
-.suggestion-text::-webkit-scrollbar-thumb {
-	background: #555;
-	border-radius: 3px;
-}
-
-.suggestion-text::-webkit-scrollbar-thumb:hover {
-	background: #777;
-}
-
-.suggestion-actions {
-	display: flex;
-	justify-content: flex-end;
-	gap: 4px;
-	border-top: 1px solid #2a2a2a;
-	padding-top: 6px;
-	flex-shrink: 0;
-}
-
-.suggestion-btn {
-	padding: 2px 8px;
-	border-radius: 3px;
-	border: 1px solid #3c3c3c;
-	font-family: inherit;
-	font-size: 10px;
-	font-weight: 500;
-	cursor: pointer;
-	line-height: 16px;
-	transition:
-		background 0.15s,
-		border-color 0.15s;
-}
-
-.suggestion-btn-accept {
-	background: #2563eb;
-	border-color: #3b82f6;
-	color: #fff;
-}
-
-.suggestion-btn-accept:hover {
-	background: #1d4ed8;
-}
-
-.suggestion-btn-dismiss {
-	background: #2a2a2a;
-	color: #999;
-}
-
-.suggestion-btn-dismiss:hover {
-	background: #333;
-	color: #ccc;
-}
-
-.suggestion-error {
-	border-color: #5c2626;
-}
-
-.suggestion-error-text {
-	color: #f48771;
-	font-size: 12px;
-}
-
-/* --- Setup prompt (vision model not configured) --- */
-
-.suggestion-setup {
-	display: flex;
-	flex-direction: row;
-	align-items: flex-start;
-	gap: 10px;
-	border-color: #3b2d6b;
-	padding: 10px 14px;
-}
-
-.setup-icon {
-	flex-shrink: 0;
-	margin-top: 1px;
-}
-
-.setup-content {
-	display: flex;
-	flex-direction: column;
-	gap: 3px;
-	min-width: 0;
-}
-
-.setup-title {
-	font-size: 13px;
-	font-weight: 600;
-	color: #c4b5fd;
-}
-
-.setup-message {
-	font-size: 11.5px;
-	color: #a1a1aa;
-	line-height: 1.4;
-}
-
-.setup-hint {
-	font-size: 10.5px;
-	color: #7c6dac;
-	margin-top: 2px;
-}
-
-.setup-dismiss {
-	flex-shrink: 0;
-	align-self: flex-start;
-	background: none;
-	border: none;
-	color: #6b6b7b;
-	font-size: 14px;
-	cursor: pointer;
-	padding: 2px 4px;
-	line-height: 1;
-	border-radius: 4px;
-	transition:
-		color 0.15s,
-		background 0.15s;
-}
-
-.setup-dismiss:hover {
-	color: #c4b5fd;
-	background: rgba(124, 109, 172, 0.15);
-}
-
-/* --- Agent activity indicator --- */
-
-.agent-activity {
-	display: flex;
-	flex-direction: column;
-	gap: 4px;
-	overflow-y: auto;
-	max-height: 340px;
-}
-
-.agent-activity::-webkit-scrollbar {
-	display: none;
-}
-
-.activity-initial {
-	display: flex;
-	align-items: center;
-	gap: 8px;
-	padding: 2px 0;
-}
-
-.activity-label {
-	color: #a1a1aa;
-	font-size: 12px;
-	white-space: nowrap;
-	overflow: hidden;
-	text-overflow: ellipsis;
-}
-
-.activity-steps {
-	display: flex;
-	flex-direction: column;
-	gap: 3px;
-}
-
-.activity-step {
-	display: flex;
-	align-items: center;
-	gap: 6px;
-	min-height: 18px;
-}
-
-.step-label {
-	color: #d4d4d4;
-	font-size: 12px;
-	white-space: nowrap;
-	overflow: hidden;
-	text-overflow: ellipsis;
-}
-
-.step-detail {
-	color: #71717a;
-	font-size: 11px;
-}
-
-/* Spinner (in_progress) */
-.step-spinner {
-	width: 14px;
-	height: 14px;
-	flex-shrink: 0;
-	border: 1.5px solid #3f3f46;
-	border-top-color: #a78bfa;
-	border-radius: 50%;
-	animation: step-spin 0.7s linear infinite;
-}
-
-/* Checkmark icon (complete) */
-.step-icon {
-	width: 14px;
-	height: 14px;
-	flex-shrink: 0;
-}
-
-@keyframes step-spin {
-	to {
-		transform: rotate(360deg);
-	}
-}
-
-/* --- Suggestion option cards --- */
-
-.suggestion-options {
-	display: flex;
-	flex-direction: column;
-	gap: 4px;
-	overflow-y: auto;
-	flex: 1 1 auto;
-	min-height: 0;
-	margin-bottom: 6px;
-}
-
-.suggestion-options::-webkit-scrollbar {
-	width: 5px;
-}
-
-.suggestion-options::-webkit-scrollbar-track {
-	background: transparent;
-}
-
-.suggestion-options::-webkit-scrollbar-thumb {
-	background: #555;
-	border-radius: 3px;
-}
-
-.suggestion-option {
-	display: flex;
-	align-items: flex-start;
-	gap: 8px;
-	padding: 6px 8px;
-	border-radius: 5px;
-	border: 1px solid #333;
-	background: #262626;
-	cursor: pointer;
-	text-align: left;
-	font-family: inherit;
-	transition:
-		background 0.15s,
-		border-color 0.15s;
-	width: 100%;
-}
-
-.suggestion-option:hover {
-	background: #2a2d3a;
-	border-color: #3b82f6;
-}
-
-.option-number {
-	flex-shrink: 0;
-	width: 18px;
-	height: 18px;
-	border-radius: 50%;
-	background: #3f3f46;
-	color: #d4d4d4;
-	font-size: 10px;
-	font-weight: 600;
-	display: flex;
-	align-items: center;
-	justify-content: center;
-	margin-top: 1px;
-}
-
-.suggestion-option:hover .option-number {
-	background: #2563eb;
-	color: #fff;
-}
-
-.option-text {
-	color: #d4d4d4;
-	font-size: 12px;
-	line-height: 1.45;
-	word-wrap: break-word;
-	white-space: pre-wrap;
-	flex: 1 1 auto;
-	min-width: 0;
-}
-
-.option-expand {
-	flex-shrink: 0;
-	background: none;
-	border: none;
-	color: #71717a;
-	font-size: 10px;
-	cursor: pointer;
-	padding: 0 2px;
-	font-family: inherit;
-	margin-top: 1px;
-}
-
-.option-expand:hover {
-	color: #a1a1aa;
-}
diff --git a/surfsense_web/app/docs/[[...slug]]/loading.tsx b/surfsense_web/app/docs/[[...slug]]/loading.tsx
new file mode 100644
index 0000000000..6bedcfc400
--- /dev/null
+++ b/surfsense_web/app/docs/[[...slug]]/loading.tsx
@@ -0,0 +1,55 @@
+import { Skeleton } from "@/components/ui/skeleton";
+
+export default function DocsLoading() {
+	return (
+		<div className="flex flex-1 flex-col gap-4 p-6 max-w-4xl mx-auto w-full">
+			{/* Title */}
+			<Skeleton className="h-9 w-64" />
+
+			{/* Description */}
+			<Skeleton className="h-5 w-full max-w-md" />
+
+			<div className="mt-4 space-y-8">
+				{/* Paragraph block 1 */}
+				<div className="space-y-2">
+					<Skeleton className="h-4 w-full" />
+					<Skeleton className="h-4 w-full" />
+					<Skeleton className="h-4 w-3/4" />
+				</div>
+
+				{/* Sub-heading */}
+				<Skeleton className="h-7 w-48" />
+
+				{/* Paragraph block 2 */}
+				<div className="space-y-2">
+					<Skeleton className="h-4 w-full" />
+					<Skeleton className="h-4 w-5/6" />
+					<Skeleton className="h-4 w-full" />
+					<Skeleton className="h-4 w-2/3" />
+				</div>
+
+				{/* Code block placeholder */}
+				<Skeleton className="h-28 w-full rounded-lg" />
+
+				{/* Sub-heading */}
+				<Skeleton className="h-7 w-56" />
+
+				{/* List items */}
+				<div className="space-y-3">
+					{Array.from({ length: 4 }).map((_, i) => (
+						<div key={i} className="flex items-start gap-3">
+							<Skeleton className="mt-1 h-3 w-3 shrink-0 rounded-full" />
+							<Skeleton className="h-4 w-full max-w-lg" />
+						</div>
+					))}
+				</div>
+
+				{/* Paragraph block 3 */}
+				<div className="space-y-2">
+					<Skeleton className="h-4 w-full" />
+					<Skeleton className="h-4 w-4/5" />
+				</div>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/docs/[[...slug]]/page.tsx b/surfsense_web/app/docs/[[...slug]]/page.tsx
index 0905be8a02..b7a3d7b666 100644
--- a/surfsense_web/app/docs/[[...slug]]/page.tsx
+++ b/surfsense_web/app/docs/[[...slug]]/page.tsx
@@ -45,8 +45,17 @@ export async function generateMetadata(props: { params: Promise<{ slug?: string[
 	const page = getDocPage(params.slug);
 	if (!page) notFound();
 
+	const slugPath = params.slug ? params.slug.join("/") : "";
 	return {
-		title: page.data.title,
+		title: `${page.data.title} | SurfSense Docs`,
 		description: page.data.description,
+		alternates: {
+			canonical: `https://www.surfsense.com/docs${slugPath ? `/${slugPath}` : ""}`,
+		},
+		openGraph: {
+			title: `${page.data.title} | SurfSense Docs`,
+			description: page.data.description,
+			type: "article",
+		},
 	};
 }
diff --git a/surfsense_web/app/error.tsx b/surfsense_web/app/error.tsx
index 70c3d96329..42d05cfdf0 100644
--- a/surfsense_web/app/error.tsx
+++ b/surfsense_web/app/error.tsx
@@ -1,12 +1,15 @@
 "use client";
 
-import { useEffect } from "react";
+import { ExternalLink } from "lucide-react";
+import { useEffect, useMemo } from "react";
+import { Button } from "@/components/ui/button";
+import { buildIssueUrl } from "@/lib/error-toast";
 
 export default function ErrorPage({
 	error,
 	reset,
 }: {
-	error: globalThis.Error & { digest?: string };
+	error: globalThis.Error & { digest?: string; code?: string; requestId?: string };
 	reset: () => void;
 }) {
 	useEffect(() => {
@@ -17,19 +20,37 @@ export default function ErrorPage({
 			.catch(() => {});
 	}, [error]);
 
+	const issueUrl = useMemo(() => buildIssueUrl(error), [error]);
+
 	return (
-		<div className="flex min-h-[50vh] flex-col items-center justify-center gap-4 text-center">
+		<div className="flex min-h-[50vh] flex-col items-center justify-center gap-4 text-center px-4">
 			<h2 className="text-2xl font-semibold">Something went wrong</h2>
 			<p className="text-muted-foreground max-w-md">
-				An unexpected error occurred. Please try again.
+				An unexpected error occurred. Please try again, or report this issue if it persists.
 			</p>
-			<button
-				type="button"
-				onClick={reset}
-				className="rounded-md bg-primary px-4 py-2 text-sm font-medium text-primary-foreground hover:bg-primary/90 transition-colors"
-			>
-				Try again
-			</button>
+
+			{(error.digest || error.code || error.requestId) && (
+				<div className="rounded-md border bg-muted/50 px-4 py-2 text-xs text-muted-foreground font-mono max-w-md">
+					{error.code && <span>Code: {error.code}</span>}
+					{error.requestId && <span className="ml-3">ID: {error.requestId}</span>}
+					{error.digest && <span className="ml-3">Digest: {error.digest}</span>}
+				</div>
+			)}
+
+			<div className="flex gap-2">
+				<Button type="button" onClick={reset}>
+					Try again
+				</Button>
+				<a
+					href={issueUrl}
+					target="_blank"
+					rel="noopener noreferrer"
+					className="inline-flex items-center gap-1.5 rounded-md border border-input bg-background px-4 py-2 text-sm font-medium hover:bg-accent hover:text-accent-foreground transition-colors"
+				>
+					<ExternalLink className="h-3.5 w-3.5" />
+					Report Issue
+				</a>
+			</div>
 		</div>
 	);
 }
diff --git a/surfsense_web/app/global-error.tsx b/surfsense_web/app/global-error.tsx
index dc4763c1eb..add207cfd5 100644
--- a/surfsense_web/app/global-error.tsx
+++ b/surfsense_web/app/global-error.tsx
@@ -2,9 +2,32 @@
 
 import "./globals.css";
 import posthog from "posthog-js";
-import { useEffect } from "react";
+import { useEffect, useMemo } from "react";
 import { Button } from "@/components/ui/button";
 
+const ISSUES_URL = "https://github.com/MODSetter/SurfSense/issues/new";
+
+function buildBasicIssueUrl(error: Error & { digest?: string }) {
+	const params = new URLSearchParams();
+	const lines = [
+		"## Bug Report",
+		"",
+		"**Describe what happened:**",
+		"",
+		"",
+		"## Diagnostics (auto-filled)",
+		"",
+		`- **Error:** ${error.message}`,
+		...(error.digest ? [`- **Digest:** \`${error.digest}\``] : []),
+		`- **Timestamp:** ${new Date().toISOString()}`,
+		`- **Page:** \`${typeof window !== "undefined" ? window.location.pathname : "unknown"}\``,
+		`- **User Agent:** \`${typeof navigator !== "undefined" ? navigator.userAgent : "unknown"}\``,
+	];
+	params.set("body", lines.join("\n"));
+	params.set("labels", "bug");
+	return `${ISSUES_URL}?${params.toString()}`;
+}
+
 export default function GlobalError({
 	error,
 	reset,
@@ -16,13 +39,32 @@ export default function GlobalError({
 		posthog.captureException(error);
 	}, [error]);
 
+	const issueUrl = useMemo(() => buildBasicIssueUrl(error), [error]);
+
 	return (
 		<html lang="en">
 			<body>
-				<div className="flex min-h-screen flex-col items-center justify-center gap-4 p-4">
+				<div className="flex min-h-screen flex-col items-center justify-center gap-4 p-4 text-center">
 					<h2 className="text-xl font-semibold">Something went wrong</h2>
-					<p className="text-sm text-muted-foreground">An unexpected error occurred.</p>
-					<Button onClick={reset}>Try again</Button>
+					<p className="text-sm text-muted-foreground max-w-md">
+						An unexpected error occurred. Please try again, or report this issue if it persists.
+					</p>
+
+					{error.digest && (
+						<p className="text-xs text-muted-foreground font-mono">Digest: {error.digest}</p>
+					)}
+
+					<div className="flex gap-2">
+						<Button onClick={reset}>Try again</Button>
+						<a
+							href={issueUrl}
+							target="_blank"
+							rel="noopener noreferrer"
+							className="inline-flex items-center gap-1.5 rounded-md border border-input bg-background px-4 py-2 text-sm font-medium hover:bg-accent hover:text-accent-foreground transition-colors"
+						>
+							Report Issue
+						</a>
+					</div>
 				</div>
 			</body>
 		</html>
diff --git a/surfsense_web/app/globals.css b/surfsense_web/app/globals.css
index a37ddb8f38..3cdb34bffb 100644
--- a/surfsense_web/app/globals.css
+++ b/surfsense_web/app/globals.css
@@ -18,8 +18,9 @@
 	--foreground: oklch(0.145 0 0);
 	--card: oklch(1 0 0);
 	--card-foreground: oklch(0.145 0 0);
-	--popover: oklch(1 0 0);
+	--popover: oklch(0.99 0 0);
 	--popover-foreground: oklch(0.145 0 0);
+	--popover-border: oklch(0.92 0 0);
 	--primary: oklch(0.205 0 0);
 	--primary-foreground: oklch(0.985 0 0);
 	--secondary: oklch(0.97 0 0);
@@ -39,7 +40,11 @@
 	--chart-4: oklch(0.828 0.189 84.429);
 	--chart-5: oklch(0.769 0.188 70.08);
 	--radius: 0.625rem;
-	--sidebar: oklch(0.985 0 0);
+	/* Unified surface used by the left sidebar, main panel, and right panel. */
+	--panel: oklch(0.96 0 0);
+	/* Distinct (lighter) surface used by the leftmost icon rail. */
+	--rail: oklch(0.985 0 0);
+	--sidebar: var(--panel);
 	--sidebar-foreground: oklch(0.145 0 0);
 	--sidebar-primary: oklch(0.205 0 0);
 	--sidebar-primary-foreground: oklch(0.985 0 0);
@@ -47,7 +52,7 @@
 	--sidebar-accent-foreground: oklch(0.205 0 0);
 	--sidebar-border: oklch(0.922 0 0);
 	--sidebar-ring: oklch(0.708 0 0);
-	--main-panel: oklch(1 0 0);
+	--main-panel: var(--panel);
 	--syntax-bg: #f5f5f5;
 	--brand: oklch(0.623 0.214 259.815);
 	--highlight: oklch(0.852 0.199 91.936);
@@ -58,8 +63,9 @@
 	--foreground: oklch(0.985 0 0);
 	--card: oklch(0.145 0 0);
 	--card-foreground: oklch(0.985 0 0);
-	--popover: oklch(0.145 0 0);
+	--popover: oklch(0.32 0 0);
 	--popover-foreground: oklch(0.985 0 0);
+	--popover-border: oklch(0.4 0 0);
 	--primary: oklch(0.985 0 0);
 	--primary-foreground: oklch(0.205 0 0);
 	--secondary: oklch(0.269 0 0);
@@ -70,23 +76,27 @@
 	--accent-foreground: oklch(0.985 0 0);
 	--destructive: oklch(0.577 0.245 27.325);
 	--destructive-foreground: oklch(0.985 0 0);
-	--border: oklch(0.269 0 0);
-	--input: oklch(0.269 0 0);
+	--border: oklch(0.32 0 0);
+	--input: oklch(0.32 0 0);
 	--ring: oklch(0.439 0 0);
 	--chart-1: oklch(0.488 0.243 264.376);
 	--chart-2: oklch(0.696 0.17 162.48);
 	--chart-3: oklch(0.769 0.188 70.08);
 	--chart-4: oklch(0.627 0.265 303.9);
 	--chart-5: oklch(0.645 0.246 16.439);
-	--sidebar: oklch(0.205 0 0);
+	/* Unified surface used by the left sidebar, main panel, and right panel. */
+	--panel: oklch(0.24 0 0);
+	/* Distinct (slightly darker) surface used by the leftmost icon rail. */
+	--rail: oklch(0.205 0 0);
+	--sidebar: var(--panel);
 	--sidebar-foreground: oklch(0.985 0 0);
 	--sidebar-primary: oklch(0.488 0.243 264.376);
 	--sidebar-primary-foreground: oklch(0.985 0 0);
 	--sidebar-accent: oklch(0.269 0 0);
 	--sidebar-accent-foreground: oklch(0.985 0 0);
-	--sidebar-border: oklch(0.269 0 0);
+	--sidebar-border: oklch(0.32 0 0);
 	--sidebar-ring: oklch(0.439 0 0);
-	--main-panel: oklch(0.18 0 0);
+	--main-panel: var(--panel);
 	--syntax-bg: #1e1e1e;
 	--brand: oklch(0.707 0.165 254.624);
 	--highlight: oklch(0.852 0.199 91.936);
@@ -99,6 +109,7 @@
 	--color-card-foreground: var(--card-foreground);
 	--color-popover: var(--popover);
 	--color-popover-foreground: var(--popover-foreground);
+	--color-popover-border: var(--popover-border);
 	--color-primary: var(--primary);
 	--color-primary-foreground: var(--primary-foreground);
 	--color-secondary: var(--secondary);
@@ -118,6 +129,8 @@
 	--color-chart-4: var(--chart-4);
 	--color-chart-5: var(--chart-5);
 	--color-main-panel: var(--main-panel);
+	--color-panel: var(--panel);
+	--color-rail: var(--rail);
 	--radius-sm: calc(var(--radius) - 4px);
 	--radius-md: calc(var(--radius) - 2px);
 	--radius-lg: var(--radius);
diff --git a/surfsense_web/app/icon.png b/surfsense_web/app/icon.png
new file mode 100644
index 0000000000..dbda551dae
Binary files /dev/null and b/surfsense_web/app/icon.png differ
diff --git a/surfsense_web/app/invite/[invite_code]/page.tsx b/surfsense_web/app/invite/[invite_code]/page.tsx
index a778962b4d..959a6d6d19 100644
--- a/surfsense_web/app/invite/[invite_code]/page.tsx
+++ b/surfsense_web/app/invite/[invite_code]/page.tsx
@@ -31,7 +31,7 @@ import {
 import { Spinner } from "@/components/ui/spinner";
 import type { AcceptInviteResponse } from "@/contracts/types/invites.types";
 import { invitesApiService } from "@/lib/apis/invites-api.service";
-import { getBearerToken } from "@/lib/auth-utils";
+import { getBearerToken, setRedirectPath } from "@/lib/auth-utils";
 import {
 	trackSearchSpaceInviteAccepted,
 	trackSearchSpaceInviteDeclined,
@@ -125,7 +125,7 @@ export default function InviteAcceptPage() {
 		// Store the invite code to redirect back after login
 		localStorage.setItem("pending_invite_code", inviteCode);
 		// Save the current invite page URL so we can return after authentication
-		localStorage.setItem("surfsense_redirect_path", `/invite/${inviteCode}`);
+		setRedirectPath(`/invite/${inviteCode}`);
 		// Redirect to login (we manually set the path above since invite pages need special handling)
 		window.location.href = "/login";
 	};
diff --git a/surfsense_web/app/layout.tsx b/surfsense_web/app/layout.tsx
index 8ebb0e8482..4e88709e95 100644
--- a/surfsense_web/app/layout.tsx
+++ b/surfsense_web/app/layout.tsx
@@ -7,6 +7,11 @@ import { GlobalLoadingProvider } from "@/components/providers/GlobalLoadingProvi
 import { I18nProvider } from "@/components/providers/I18nProvider";
 import { PostHogProvider } from "@/components/providers/PostHogProvider";
 import { ZeroProvider } from "@/components/providers/ZeroProvider";
+import {
+	OrganizationJsonLd,
+	SoftwareApplicationJsonLd,
+	WebSiteJsonLd,
+} from "@/components/seo/json-ld";
 import { ThemeProvider } from "@/components/theme/theme-provider";
 import { Toaster } from "@/components/ui/sonner";
 import { LocaleProvider } from "@/contexts/LocaleContext";
@@ -36,82 +41,81 @@ export const viewport: Viewport = {
 };
 
 export const metadata: Metadata = {
-	title: "SurfSense - Open Source NotebookLM Alternative for Teams",
+	metadataBase: new URL("https://www.surfsense.com"),
+	alternates: {
+		canonical: "https://www.surfsense.com",
+	},
+	title: "SurfSense – Open Source, Privacy-Focused NotebookLM Alternative for Teams",
 	description:
-		"Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team. SurfSense is an open source alternative to NotebookLM, built for enterprise AI search and knowledge management.",
+		"Open source NotebookLM alternative for teams with no data limits. Use ChatGPT, Claude AI, and any AI model for free.",
 	keywords: [
-		"enterprise ai",
-		"enterprise search",
-		"enterprise search software",
+		"chatgpt online",
+		"online chatgpt",
+		"chat gpt free",
+		"chatgpt free",
+		"free chatgpt",
+		"free chat gpt",
+		"chatgpt no login",
+		"chatgpt online free",
+		"chatgpt free online",
+		"chatgpt without login",
+		"free chatgpt without login",
+		"free chatgpt no login",
+		"chatgpt for free",
+		"claude ai free",
+		"claude free",
+		"free claude ai",
+		"free claude",
 		"chatgpt alternative free",
-		"ai enterprise search",
-		"enterprise search solutions",
-		"intranet search engine",
-		"federated search",
-		"enterprise search engine",
-		"what is enterprise search",
-		"enterprise knowledge management software",
 		"free chatgpt alternative",
-		"chatgpt free alternative",
-		"best enterprise search software",
-		"enterprise ai search",
-		"enterprise knowledge management",
-		"federated search engine",
-		"enterprise knowledge management system",
-		"free claude ai",
-		"what is enterprise search engine marketing",
-		"ai driven enterprise search",
 		"free alternative to chatgpt",
-		"free claude",
 		"alternative to chatgpt free",
+		"chatgpt alternative online free",
+		"ai like chatgpt",
+		"sites like chatgpt",
 		"free ai chatbot like chatgpt",
-		"enterprise search software comparison",
 		"apps like chatgpt for free",
-		"free chatgpt no login",
 		"free ai chatbots like chatgpt",
-		"enterprise document search",
-		"search engine for intranet",
-		"free chatgpt without login",
-		"unified search engine",
-		"chatgpt alternative online free",
+		"best free alternative to chatgpt",
 		"free chatgpt alternative app",
 		"free chatgpt alternative with image upload",
-		"best free alternative to chatgpt",
-		"enterprise search engine open source",
-		"open source notebooklm alternative",
+		"free ai apps",
+		"ai with no restrictions",
+		"notebooklm alternative",
 		"notebooklm alternative for teams",
+		"open source notebooklm alternative",
 		"SurfSense",
 	],
 	openGraph: {
-		title: "SurfSense - Open Source NotebookLM Alternative for Teams",
+		title: "SurfSense – Open Source, Privacy-Focused NotebookLM Alternative for Teams",
 		description:
-			"Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team. Open source enterprise AI search and knowledge management.",
-		url: "https://surfsense.com",
+			"Open source NotebookLM alternative for teams with no data limits. Use ChatGPT, Claude, and any AI model for free.",
+		url: "https://www.surfsense.com",
 		siteName: "SurfSense",
 		type: "website",
 		images: [
 			{
-				url: "https://surfsense.com/og-image.png",
+				url: "/og-image.png",
 				width: 1200,
 				height: 630,
-				alt: "SurfSense - Open Source NotebookLM Alternative for Teams",
+				alt: "SurfSense - Open Source NotebookLM Alternative with Free ChatGPT and Claude AI",
 			},
 		],
 		locale: "en_US",
 	},
 	twitter: {
 		card: "summary_large_image",
-		title: "SurfSense - Open Source NotebookLM Alternative for Teams",
+		title: "SurfSense – Open Source, Privacy-Focused NotebookLM Alternative for Teams",
 		description:
-			"Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team. Open source enterprise AI search and knowledge management.",
-		creator: "https://surfsense.com",
-		site: "https://surfsense.com",
+			"Open source NotebookLM alternative for teams with no data limits. Use ChatGPT, Claude AI, and any AI model for free.",
+		creator: "@SurfSenseAI",
+		site: "@SurfSenseAI",
 		images: [
 			{
-				url: "https://surfsense.com/og-image-twitter.png",
+				url: "/og-image-twitter.png",
 				width: 1200,
 				height: 630,
-				alt: "SurfSense - Open Source NotebookLM Alternative for Teams",
+				alt: "SurfSense - Open Source NotebookLM Alternative with Free ChatGPT and Claude AI",
 			},
 		],
 	},
@@ -129,8 +133,11 @@ export default function RootLayout({
 		<html lang="en" suppressHydrationWarning>
 			<head>
 				<link rel="preconnect" href="https://api.github.com" />
+				<OrganizationJsonLd />
+				<WebSiteJsonLd />
+				<SoftwareApplicationJsonLd />
 			</head>
-			<body className={cn(roboto.className, "bg-white dark:bg-black antialiased h-full w-full ")}>
+			<body className={cn(roboto.className, "bg-main-panel antialiased h-full w-full ")}>
 				<PostHogProvider>
 					<LocaleProvider>
 						<I18nProvider>
diff --git a/surfsense_web/app/not-found.tsx b/surfsense_web/app/not-found.tsx
new file mode 100644
index 0000000000..1a5d47afd3
--- /dev/null
+++ b/surfsense_web/app/not-found.tsx
@@ -0,0 +1,55 @@
+import type { Metadata } from "next";
+import Link from "next/link";
+
+export const metadata: Metadata = {
+	title: "Page Not Found | SurfSense",
+	description:
+		"The page you're looking for doesn't exist. Explore SurfSense - open source enterprise AI search and knowledge management.",
+};
+
+export default function NotFound() {
+	return (
+		<div className="flex min-h-screen flex-col items-center justify-center px-4 text-center">
+			<h1 className="text-8xl font-bold tracking-tight text-neutral-900 dark:text-neutral-100">
+				404
+			</h1>
+			<p className="mt-4 text-xl text-neutral-600 dark:text-neutral-400">
+				The page you&apos;re looking for doesn&apos;t exist.
+			</p>
+			<p className="mt-2 text-base text-neutral-500 dark:text-neutral-500">
+				It may have been moved, or the URL might be incorrect.
+			</p>
+			<div className="mt-10 flex flex-col items-center gap-4 sm:flex-row">
+				<Link
+					href="/"
+					className="rounded-lg bg-black px-6 py-3 text-sm font-medium text-white transition hover:bg-neutral-800 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+				>
+					Go Home
+				</Link>
+				<Link
+					href="/docs"
+					className="rounded-lg border border-neutral-200 px-6 py-3 text-sm font-medium text-neutral-700 transition hover:bg-neutral-50 dark:border-neutral-700 dark:text-neutral-300 dark:hover:bg-neutral-800"
+				>
+					Browse Docs
+				</Link>
+				<Link
+					href="/blog"
+					className="rounded-lg border border-neutral-200 px-6 py-3 text-sm font-medium text-neutral-700 transition hover:bg-neutral-50 dark:border-neutral-700 dark:text-neutral-300 dark:hover:bg-neutral-800"
+				>
+					Read Blog
+				</Link>
+			</div>
+			<nav className="mt-16 flex flex-wrap justify-center gap-x-6 gap-y-2 text-sm text-neutral-500 dark:text-neutral-400">
+				<Link href="/pricing" className="hover:text-neutral-900 dark:hover:text-neutral-200">
+					Pricing
+				</Link>
+				<Link href="/contact" className="hover:text-neutral-900 dark:hover:text-neutral-200">
+					Contact
+				</Link>
+				<Link href="/changelog" className="hover:text-neutral-900 dark:hover:text-neutral-200">
+					Changelog
+				</Link>
+			</nav>
+		</div>
+	);
+}
diff --git a/surfsense_web/app/robots.ts b/surfsense_web/app/robots.ts
new file mode 100644
index 0000000000..085c3d9a38
--- /dev/null
+++ b/surfsense_web/app/robots.ts
@@ -0,0 +1,22 @@
+import type { MetadataRoute } from "next";
+
+export default function robots(): MetadataRoute.Robots {
+	return {
+		rules: [
+			{
+				userAgent: "*",
+				allow: "/",
+				disallow: [
+					"/dashboard/",
+					"/desktop/",
+					"/auth/",
+					"/api/",
+					"/invite/",
+					"/public/",
+					"/verify-token/",
+				],
+			},
+		],
+		sitemap: "https://www.surfsense.com/sitemap.xml",
+	};
+}
diff --git a/surfsense_web/app/sitemap.ts b/surfsense_web/app/sitemap.ts
index 9f3f7ebdfa..82ec405f97 100644
--- a/surfsense_web/app/sitemap.ts
+++ b/surfsense_web/app/sitemap.ts
@@ -1,259 +1,84 @@
+import { loader } from "fumadocs-core/source";
 import type { MetadataRoute } from "next";
+import { blog, changelog } from "@/.source/server";
+import { source as docsSource } from "@/lib/source";
 
-// Returns a date rounded to the current hour (updates only once per hour)
-function getHourlyDate(): Date {
-	const now = new Date();
-	now.setMinutes(0, 0, 0);
-	return now;
+const blogSource = loader({
+	baseUrl: "/blog",
+	source: blog.toFumadocsSource(),
+});
+
+const changelogSource = loader({
+	baseUrl: "/changelog",
+	source: changelog.toFumadocsSource(),
+});
+
+const BASE_URL = "https://www.surfsense.com";
+const BACKEND_URL = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
+
+async function getFreeModelSlugs(): Promise<string[]> {
+	try {
+		const res = await fetch(`${BACKEND_URL}/api/v1/public/anon-chat/models`, {
+			next: { revalidate: 3600 },
+		});
+		if (!res.ok) return [];
+		const models = await res.json();
+		return models
+			.filter((m: { seo_slug?: string }) => m.seo_slug)
+			.map((m: { seo_slug: string }) => m.seo_slug);
+	} catch {
+		return [];
+	}
 }
 
-export default function sitemap(): MetadataRoute.Sitemap {
-	const lastModified = getHourlyDate();
+export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
+	const now = new Date();
+	now.setMinutes(0, 0, 0);
+	const lastModified = now;
 
-	return [
-		{
-			url: "https://www.surfsense.com/",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 1,
-		},
-		{
-			url: "https://www.surfsense.com/contact",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/pricing",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/privacy",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/terms",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		// Documentation pages
-		{
-			url: "https://www.surfsense.com/docs",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 1,
-		},
-		{
-			url: "https://www.surfsense.com/docs/installation",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/docs/prerequisites",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/docs/docker-installation/install-script",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/docs/docker-installation/docker-compose",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/docs/docker-installation/updating",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/docs/docker-installation/dev-compose",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/docs/docker-installation/migrate-from-allinone",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		{
-			url: "https://www.surfsense.com/docs/manual-installation",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.9,
-		},
-		// Connector documentation
-		{
-			url: "https://www.surfsense.com/docs/connectors/airtable",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/bookstack",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/circleback",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/clickup",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/confluence",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/discord",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/dropbox",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/elasticsearch",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/github",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/gmail",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/google-calendar",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/google-drive",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/jira",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/linear",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/luma",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/microsoft-onedrive",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/microsoft-teams",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/notion",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/obsidian",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/slack",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/connectors/web-crawler",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		// How-to documentation
-		{
-			url: "https://www.surfsense.com/docs/how-to/zero-sync",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/how-to/realtime-collaboration",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		{
-			url: "https://www.surfsense.com/docs/how-to/web-search",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.8,
-		},
-		// Developer documentation
-		{
-			url: "https://www.surfsense.com/docs/testing",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.7,
-		},
-		{
-			url: "https://www.surfsense.com/docs/code-of-conduct",
-			lastModified,
-			changeFrequency: "daily",
-			priority: 0.7,
-		},
+	const staticPages: MetadataRoute.Sitemap = [
+		{ url: `${BASE_URL}/`, lastModified, changeFrequency: "daily", priority: 1 },
+		{ url: `${BASE_URL}/free`, lastModified, changeFrequency: "daily", priority: 0.95 },
+		{ url: `${BASE_URL}/pricing`, lastModified, changeFrequency: "weekly", priority: 0.9 },
+		{ url: `${BASE_URL}/contact`, lastModified, changeFrequency: "monthly", priority: 0.7 },
+		{ url: `${BASE_URL}/blog`, lastModified, changeFrequency: "daily", priority: 0.9 },
+		{ url: `${BASE_URL}/changelog`, lastModified, changeFrequency: "weekly", priority: 0.7 },
+		{ url: `${BASE_URL}/announcements`, lastModified, changeFrequency: "weekly", priority: 0.6 },
+		{ url: `${BASE_URL}/docs`, lastModified, changeFrequency: "daily", priority: 1 },
+		{ url: `${BASE_URL}/privacy`, lastModified, changeFrequency: "monthly", priority: 0.3 },
+		{ url: `${BASE_URL}/terms`, lastModified, changeFrequency: "monthly", priority: 0.3 },
+		{ url: `${BASE_URL}/login`, lastModified, changeFrequency: "monthly", priority: 0.5 },
+		{ url: `${BASE_URL}/register`, lastModified, changeFrequency: "monthly", priority: 0.5 },
 	];
+
+	const slugs = await getFreeModelSlugs();
+	const freeModelPages: MetadataRoute.Sitemap = slugs.map((slug) => ({
+		url: `${BASE_URL}/free/${slug}`,
+		lastModified,
+		changeFrequency: "daily" as const,
+		priority: 0.9,
+	}));
+
+	const docsPages: MetadataRoute.Sitemap = docsSource.getPages().map((page) => ({
+		url: `${BASE_URL}${page.url}`,
+		lastModified,
+		changeFrequency: "weekly" as const,
+		priority: 0.8,
+	}));
+
+	const blogPages: MetadataRoute.Sitemap = blogSource.getPages().map((page) => ({
+		url: `${BASE_URL}${page.url}`,
+		lastModified,
+		changeFrequency: "weekly" as const,
+		priority: 0.8,
+	}));
+
+	const changelogPages: MetadataRoute.Sitemap = changelogSource.getPages().map((page) => ({
+		url: `${BASE_URL}${page.url}`,
+		lastModified,
+		changeFrequency: "monthly" as const,
+		priority: 0.5,
+	}));
+
+	return [...staticPages, ...freeModelPages, ...docsPages, ...blogPages, ...changelogPages];
 }
diff --git a/surfsense_web/atoms/agent-tools/agent-tools.atoms.ts b/surfsense_web/atoms/agent-tools/agent-tools.atoms.ts
index 2c9e01fb23..84c6ed8415 100644
--- a/surfsense_web/atoms/agent-tools/agent-tools.atoms.ts
+++ b/surfsense_web/atoms/agent-tools/agent-tools.atoms.ts
@@ -1,6 +1,6 @@
 import { atom } from "jotai";
 import { atomWithQuery } from "jotai-tanstack-query";
-import { type AgentToolInfo, agentToolsApiService } from "@/lib/apis/agent-tools-api.service";
+import { agentToolsApiService } from "@/lib/apis/agent-tools-api.service";
 import { cacheKeys } from "@/lib/query-client/cache-keys";
 import { activeSearchSpaceIdAtom } from "../search-spaces/search-space-query.atoms";
 
diff --git a/surfsense_web/atoms/agent/action-log-dialog.atom.ts b/surfsense_web/atoms/agent/action-log-dialog.atom.ts
new file mode 100644
index 0000000000..bb0c386d8d
--- /dev/null
+++ b/surfsense_web/atoms/agent/action-log-dialog.atom.ts
@@ -0,0 +1,19 @@
+import { atom } from "jotai";
+
+interface ActionLogDialogState {
+	open: boolean;
+	threadId: number | null;
+}
+
+export const actionLogDialogAtom = atom<ActionLogDialogState>({
+	open: false,
+	threadId: null,
+});
+
+export const openActionLogDialogAtom = atom(null, (_get, set, threadId: number) => {
+	set(actionLogDialogAtom, { open: true, threadId });
+});
+
+export const closeActionLogDialogAtom = atom(null, (_get, set) => {
+	set(actionLogDialogAtom, { open: false, threadId: null });
+});
diff --git a/surfsense_web/atoms/agent/agent-flags-query.atom.ts b/surfsense_web/atoms/agent/agent-flags-query.atom.ts
new file mode 100644
index 0000000000..30158deaab
--- /dev/null
+++ b/surfsense_web/atoms/agent/agent-flags-query.atom.ts
@@ -0,0 +1,17 @@
+import { atomWithQuery } from "jotai-tanstack-query";
+import { agentFlagsApiService } from "@/lib/apis/agent-flags-api.service";
+import { getBearerToken } from "@/lib/auth-utils";
+
+export const AGENT_FLAGS_QUERY_KEY = ["agent", "flags"] as const;
+
+/**
+ * Reads the backend agent feature flags. Cached for the lifetime of the
+ * page (flags only change on backend restart) so we can drive UI gating
+ * without re-hitting the API.
+ */
+export const agentFlagsAtom = atomWithQuery(() => ({
+	queryKey: AGENT_FLAGS_QUERY_KEY,
+	staleTime: 10 * 60 * 1000,
+	enabled: !!getBearerToken(),
+	queryFn: () => agentFlagsApiService.get(),
+}));
diff --git a/surfsense_web/atoms/chat-comments/comments-mutation.atoms.ts b/surfsense_web/atoms/chat-comments/comments-mutation.atoms.ts
index e6a9767ca7..e945042232 100644
--- a/surfsense_web/atoms/chat-comments/comments-mutation.atoms.ts
+++ b/surfsense_web/atoms/chat-comments/comments-mutation.atoms.ts
@@ -11,6 +11,7 @@ import { cacheKeys } from "@/lib/query-client/cache-keys";
 import { queryClient } from "@/lib/query-client/client";
 
 export const createCommentMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: CreateCommentRequest) => {
 		return chatCommentsApiService.createComment(request);
 	},
@@ -26,6 +27,7 @@ export const createCommentMutationAtom = atomWithMutation(() => ({
 }));
 
 export const createReplyMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: CreateReplyRequest & { message_id: number }) => {
 		return chatCommentsApiService.createReply(request);
 	},
@@ -41,6 +43,7 @@ export const createReplyMutationAtom = atomWithMutation(() => ({
 }));
 
 export const updateCommentMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: UpdateCommentRequest & { message_id: number }) => {
 		return chatCommentsApiService.updateComment(request);
 	},
@@ -56,6 +59,7 @@ export const updateCommentMutationAtom = atomWithMutation(() => ({
 }));
 
 export const deleteCommentMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: DeleteCommentRequest & { message_id: number }) => {
 		return chatCommentsApiService.deleteComment(request);
 	},
diff --git a/surfsense_web/atoms/chat/current-thread.atom.ts b/surfsense_web/atoms/chat/current-thread.atom.ts
index d781df8d2a..131c98309e 100644
--- a/surfsense_web/atoms/chat/current-thread.atom.ts
+++ b/surfsense_web/atoms/chat/current-thread.atom.ts
@@ -26,7 +26,14 @@ export const setThreadVisibilityAtom = atom(null, (get, set, newVisibility: Chat
 
 export const resetCurrentThreadAtom = atom(null, (_, set) => {
 	set(currentThreadAtom, initialState);
-	set(reportPanelAtom, { isOpen: false, reportId: null, title: null, wordCount: null });
+	set(reportPanelAtom, {
+		isOpen: false,
+		reportId: null,
+		title: null,
+		wordCount: null,
+		shareToken: null,
+		contentType: "markdown",
+	});
 });
 
 /** Target comment ID to scroll to (from URL navigation or inbox click) */
diff --git a/surfsense_web/atoms/chat/mentioned-documents.atom.ts b/surfsense_web/atoms/chat/mentioned-documents.atom.ts
index ee93a409ad..9163960f49 100644
--- a/surfsense_web/atoms/chat/mentioned-documents.atom.ts
+++ b/surfsense_web/atoms/chat/mentioned-documents.atom.ts
@@ -4,55 +4,106 @@ import { atom } from "jotai";
 import type { Document } from "@/contracts/types/document.types";
 
 /**
- * Atom to store the full document objects mentioned via @-mention chips
- * in the current chat composer. This persists across component remounts.
+ * Sentinel ``document_type`` used for folder mention chips so the
+ * dedup key (`kind:document_type:id`) never collides a document with a
+ * folder that happens to share an integer id.
  */
-export const mentionedDocumentsAtom = atom<Pick<Document, "id" | "title" | "document_type">[]>([]);
+export const FOLDER_MENTION_DOCUMENT_TYPE = "FOLDER";
 
 /**
- * Atom to store documents selected via the sidebar checkboxes / row clicks.
- * These are NOT inserted as chips – the composer shows a count badge instead.
+ * Display metadata for a single ``@``-mention chip.
+ *
+ * The ``kind`` discriminator identifies whether the chip is a
+ * knowledge-base document or a knowledge-base folder. Folders carry
+ * the sentinel ``document_type === FOLDER_MENTION_DOCUMENT_TYPE`` so
+ * the editor, picker, and persisted ``mentioned-documents`` content
+ * part all stay aligned with the backend Pydantic schema.
  */
-export const sidebarSelectedDocumentsAtom = atom<
-	Pick<Document, "id" | "title" | "document_type">[]
->([]);
+export interface MentionedDocumentInfo {
+	id: number;
+	title: string;
+	document_type: string;
+	kind: "doc" | "folder";
+}
+
+/**
+ * Backwards-compatible doc-only chip shape for legacy callers that
+ * haven't migrated to the discriminated union yet. Keep narrow so
+ * accidental new callers fail typecheck and route through the
+ * discriminated type instead.
+ */
+type LegacyDocMention = Pick<Document, "id" | "title" | "document_type">;
+
+/**
+ * Normalize an arbitrary chip-like input into the discriminated
+ * ``MentionedDocumentInfo`` shape. Existing call sites that only have
+ * ``{id, title, document_type}`` flow through here so they don't have
+ * to thread ``kind`` everywhere — the helper defaults to ``"doc"`` and
+ * rewrites the document type for folders.
+ */
+export function toMentionedDocumentInfo(
+	input: LegacyDocMention | MentionedDocumentInfo
+): MentionedDocumentInfo {
+	if ("kind" in input && (input.kind === "doc" || input.kind === "folder")) {
+		return input;
+	}
+	return {
+		id: input.id,
+		title: input.title,
+		document_type: input.document_type,
+		kind: "doc",
+	};
+}
+
+/**
+ * Build a folder-mention chip from a folder row (id + name).
+ */
+export function makeFolderMention(input: { id: number; name: string }): MentionedDocumentInfo {
+	return {
+		id: input.id,
+		title: input.name,
+		document_type: FOLDER_MENTION_DOCUMENT_TYPE,
+		kind: "folder",
+	};
+}
 
 /**
- * Derived read-only atom that merges @-mention chips and sidebar selections
- * into a single deduplicated set of document IDs for the backend.
+ * Atom to store the full mention objects (documents + folders) attached
+ * via @-mention chips in the current chat composer. Persists across
+ * component remounts.
+ */
+export const mentionedDocumentsAtom = atom<MentionedDocumentInfo[]>([]);
+
+/**
+ * Derived read-only atom that maps deduplicated mention chips into
+ * backend payload fields. Doc chips split by ``document_type`` exactly
+ * like before; folder chips are projected into a separate
+ * ``folder_ids`` bucket so the route can forward
+ * ``mentioned_folder_ids`` to the agent without the priority middleware
+ * conflating them with hybrid-search ids.
  */
 export const mentionedDocumentIdsAtom = atom((get) => {
-	const chipDocs = get(mentionedDocumentsAtom);
-	const sidebarDocs = get(sidebarSelectedDocumentsAtom);
-	const allDocs = [...chipDocs, ...sidebarDocs];
+	const allMentions = get(mentionedDocumentsAtom);
 	const seen = new Set<string>();
-	const deduped = allDocs.filter((d) => {
-		const key = `${d.document_type}:${d.id}`;
+	const deduped = allMentions.filter((m) => {
+		const key = `${m.kind}:${m.document_type}:${m.id}`;
 		if (seen.has(key)) return false;
 		seen.add(key);
 		return true;
 	});
+	const docs = deduped.filter((m) => m.kind === "doc");
+	const folders = deduped.filter((m) => m.kind === "folder");
 	return {
-		surfsense_doc_ids: deduped
+		surfsense_doc_ids: docs
 			.filter((doc) => doc.document_type === "SURFSENSE_DOCS")
 			.map((doc) => doc.id),
-		document_ids: deduped
-			.filter((doc) => doc.document_type !== "SURFSENSE_DOCS")
-			.map((doc) => doc.id),
+		document_ids: docs.filter((doc) => doc.document_type !== "SURFSENSE_DOCS").map((doc) => doc.id),
+		folder_ids: folders.map((f) => f.id),
 	};
 });
 
 /**
- * Simplified document info for display purposes
- */
-export interface MentionedDocumentInfo {
-	id: number;
-	title: string;
-	document_type: string;
-}
-
-/**
- * Atom to store mentioned documents per message ID.
+ * Atom to store mentioned chips per message ID.
  * This allows displaying which documents were mentioned with each user message.
  */
 export const messageDocumentsMapAtom = atom<Record<string, MentionedDocumentInfo[]>>({});
diff --git a/surfsense_web/atoms/chat/pending-user-images.atom.ts b/surfsense_web/atoms/chat/pending-user-images.atom.ts
new file mode 100644
index 0000000000..6898e745da
--- /dev/null
+++ b/surfsense_web/atoms/chat/pending-user-images.atom.ts
@@ -0,0 +1,3 @@
+import { atom } from "jotai";
+
+export const pendingUserImageDataUrlsAtom = atom<string[]>([]);
diff --git a/surfsense_web/atoms/chat/premium-alert.atom.ts b/surfsense_web/atoms/chat/premium-alert.atom.ts
new file mode 100644
index 0000000000..1c837dd65f
--- /dev/null
+++ b/surfsense_web/atoms/chat/premium-alert.atom.ts
@@ -0,0 +1,45 @@
+import { atom } from "jotai";
+
+export type PremiumAlertState = {
+	message: string;
+};
+
+export const premiumAlertByThreadAtom = atom<Record<number, PremiumAlertState>>({});
+
+export const setPremiumAlertForThreadAtom = atom(
+	null,
+	(
+		get,
+		set,
+		payload: {
+			threadId: number;
+			message: string;
+			userId?: string | null;
+		}
+	) => {
+		const storageKey = `surfsense-premium-alert-seen-v1:${payload.userId ?? "anonymous"}`;
+
+		if (typeof window !== "undefined") {
+			const hasSeen = localStorage.getItem(storageKey) === "true";
+			if (hasSeen) return;
+		}
+
+		const current = get(premiumAlertByThreadAtom);
+		set(premiumAlertByThreadAtom, {
+			...current,
+			[payload.threadId]: { message: payload.message },
+		});
+
+		if (typeof window !== "undefined") {
+			localStorage.setItem(storageKey, "true");
+		}
+	}
+);
+
+export const clearPremiumAlertForThreadAtom = atom(null, (get, set, threadId: number) => {
+	const current = get(premiumAlertByThreadAtom);
+	if (!(threadId in current)) return;
+	const next = { ...current };
+	delete next[threadId];
+	set(premiumAlertByThreadAtom, next);
+});
diff --git a/surfsense_web/atoms/chat/report-panel.atom.ts b/surfsense_web/atoms/chat/report-panel.atom.ts
index edae8979d8..c80230f052 100644
--- a/surfsense_web/atoms/chat/report-panel.atom.ts
+++ b/surfsense_web/atoms/chat/report-panel.atom.ts
@@ -8,6 +8,8 @@ interface ReportPanelState {
 	wordCount: number | null;
 	/** When set, uses public endpoints for fetching report data (public shared chat) */
 	shareToken: string | null;
+	/** Content type of the report — "markdown" (default) or "typst" (resume) */
+	contentType: string;
 }
 
 const initialState: ReportPanelState = {
@@ -16,6 +18,7 @@ const initialState: ReportPanelState = {
 	title: null,
 	wordCount: null,
 	shareToken: null,
+	contentType: "markdown",
 };
 
 /** Core atom holding the report panel state */
@@ -38,7 +41,14 @@ export const openReportPanelAtom = atom(
 			title,
 			wordCount,
 			shareToken,
-		}: { reportId: number; title: string; wordCount?: number; shareToken?: string | null }
+			contentType,
+		}: {
+			reportId: number;
+			title: string;
+			wordCount?: number;
+			shareToken?: string | null;
+			contentType?: string;
+		}
 	) => {
 		if (!get(reportPanelAtom).isOpen) {
 			set(preReportCollapsedAtom, get(rightPanelCollapsedAtom));
@@ -49,6 +59,7 @@ export const openReportPanelAtom = atom(
 			title,
 			wordCount: wordCount ?? null,
 			shareToken: shareToken ?? null,
+			contentType: contentType ?? "markdown",
 		});
 		set(rightPanelTabAtom, "report");
 		set(rightPanelCollapsedAtom, false);
diff --git a/surfsense_web/atoms/citation/citation-panel.atom.ts b/surfsense_web/atoms/citation/citation-panel.atom.ts
new file mode 100644
index 0000000000..ca73128572
--- /dev/null
+++ b/surfsense_web/atoms/citation/citation-panel.atom.ts
@@ -0,0 +1,40 @@
+import { atom } from "jotai";
+import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
+
+interface CitationPanelState {
+	isOpen: boolean;
+	chunkId: number | null;
+}
+
+const initialState: CitationPanelState = {
+	isOpen: false,
+	chunkId: null,
+};
+
+export const citationPanelAtom = atom<CitationPanelState>(initialState);
+
+export const citationPanelOpenAtom = atom((get) => get(citationPanelAtom).isOpen);
+
+const preCitationCollapsedAtom = atom<boolean | null>(null);
+
+export const openCitationPanelAtom = atom(null, (get, set, payload: { chunkId: number }) => {
+	if (!get(citationPanelAtom).isOpen) {
+		set(preCitationCollapsedAtom, get(rightPanelCollapsedAtom));
+	}
+	set(citationPanelAtom, {
+		isOpen: true,
+		chunkId: payload.chunkId,
+	});
+	set(rightPanelTabAtom, "citation");
+	set(rightPanelCollapsedAtom, false);
+});
+
+export const closeCitationPanelAtom = atom(null, (get, set) => {
+	set(citationPanelAtom, initialState);
+	set(rightPanelTabAtom, "sources");
+	const prev = get(preCitationCollapsedAtom);
+	if (prev !== null) {
+		set(rightPanelCollapsedAtom, prev);
+		set(preCitationCollapsedAtom, null);
+	}
+});
diff --git a/surfsense_web/atoms/documents/folder.atoms.ts b/surfsense_web/atoms/documents/folder.atoms.ts
index fe7d556ebe..bbdc58e4e9 100644
--- a/surfsense_web/atoms/documents/folder.atoms.ts
+++ b/surfsense_web/atoms/documents/folder.atoms.ts
@@ -12,6 +12,15 @@ export const expandedFolderIdsAtom = atomWithStorage<Record<number, number[]>>(
 	{}
 );
 
+/**
+ * Expanded folder keys for Local filesystem tree, keyed by search space ID.
+ * Persisted so local tree expansion survives remounts/reloads.
+ */
+export const localExpandedFolderKeysAtom = atomWithStorage<Record<number, string[]>>(
+	"surfsense:localExpandedFolderKeys",
+	{}
+);
+
 /**
  * Folder currently being renamed (inline edit mode).
  * null means no folder is being renamed.
diff --git a/surfsense_web/atoms/editor/editor-panel.atom.ts b/surfsense_web/atoms/editor/editor-panel.atom.ts
index 7dc6add284..28563e7d31 100644
--- a/surfsense_web/atoms/editor/editor-panel.atom.ts
+++ b/surfsense_web/atoms/editor/editor-panel.atom.ts
@@ -3,14 +3,18 @@ import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right
 
 interface EditorPanelState {
 	isOpen: boolean;
+	kind: "document" | "local_file";
 	documentId: number | null;
+	localFilePath: string | null;
 	searchSpaceId: number | null;
 	title: string | null;
 }
 
 const initialState: EditorPanelState = {
 	isOpen: false,
+	kind: "document",
 	documentId: null,
+	localFilePath: null,
 	searchSpaceId: null,
 	title: null,
 };
@@ -26,20 +30,38 @@ export const openEditorPanelAtom = atom(
 	(
 		get,
 		set,
-		{
-			documentId,
-			searchSpaceId,
-			title,
-		}: { documentId: number; searchSpaceId: number; title?: string }
+		payload:
+			| { documentId: number; searchSpaceId: number; title?: string; kind?: "document" }
+			| {
+					kind: "local_file";
+					localFilePath: string;
+					title?: string;
+					searchSpaceId?: number;
+			  }
 	) => {
 		if (!get(editorPanelAtom).isOpen) {
 			set(preEditorCollapsedAtom, get(rightPanelCollapsedAtom));
 		}
+		if (payload.kind === "local_file") {
+			set(editorPanelAtom, {
+				isOpen: true,
+				kind: "local_file",
+				documentId: null,
+				localFilePath: payload.localFilePath,
+				searchSpaceId: payload.searchSpaceId ?? null,
+				title: payload.title ?? null,
+			});
+			set(rightPanelTabAtom, "editor");
+			set(rightPanelCollapsedAtom, false);
+			return;
+		}
 		set(editorPanelAtom, {
 			isOpen: true,
-			documentId,
-			searchSpaceId,
-			title: title ?? null,
+			kind: "document",
+			documentId: payload.documentId,
+			localFilePath: null,
+			searchSpaceId: payload.searchSpaceId,
+			title: payload.title ?? null,
 		});
 		set(rightPanelTabAtom, "editor");
 		set(rightPanelCollapsedAtom, false);
diff --git a/surfsense_web/atoms/folder-sync/folder-sync.atoms.ts b/surfsense_web/atoms/folder-sync/folder-sync.atoms.ts
new file mode 100644
index 0000000000..a9ae2bfcc2
--- /dev/null
+++ b/surfsense_web/atoms/folder-sync/folder-sync.atoms.ts
@@ -0,0 +1,12 @@
+import { atom } from "jotai";
+
+export interface SelectedFolder {
+	path: string;
+	name: string;
+}
+
+// Atom to control the folder watch dialog open state
+export const folderWatchDialogOpenAtom = atom(false);
+
+// Atom to store initial folder selection for the dialog
+export const folderWatchInitialFolderAtom = atom<SelectedFolder | null>(null);
diff --git a/surfsense_web/atoms/image-gen-config/image-gen-config-mutation.atoms.ts b/surfsense_web/atoms/image-gen-config/image-gen-config-mutation.atoms.ts
index 362c3a6908..922c398c98 100644
--- a/surfsense_web/atoms/image-gen-config/image-gen-config-mutation.atoms.ts
+++ b/surfsense_web/atoms/image-gen-config/image-gen-config-mutation.atoms.ts
@@ -21,6 +21,7 @@ export const createImageGenConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["image-gen-configs", "create"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: CreateImageGenConfigRequest) => {
 			return imageGenConfigApiService.createConfig(request);
@@ -45,6 +46,7 @@ export const updateImageGenConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["image-gen-configs", "update"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: UpdateImageGenConfigRequest) => {
 			return imageGenConfigApiService.updateConfig(request);
@@ -72,6 +74,7 @@ export const deleteImageGenConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["image-gen-configs", "delete"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: { id: number; name: string }) => {
 			return imageGenConfigApiService.deleteConfig(request.id);
diff --git a/surfsense_web/atoms/invites/invites-mutation.atoms.ts b/surfsense_web/atoms/invites/invites-mutation.atoms.ts
index f2acb6e4aa..31e8104a37 100644
--- a/surfsense_web/atoms/invites/invites-mutation.atoms.ts
+++ b/surfsense_web/atoms/invites/invites-mutation.atoms.ts
@@ -14,6 +14,7 @@ import { queryClient } from "@/lib/query-client/client";
  * Mutation atom for creating an invite
  */
 export const createInviteMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: CreateInviteRequest) => {
 		return invitesApiService.createInvite(request);
 	},
@@ -33,6 +34,7 @@ export const createInviteMutationAtom = atomWithMutation(() => ({
  * Mutation atom for updating an invite
  */
 export const updateInviteMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: UpdateInviteRequest) => {
 		return invitesApiService.updateInvite(request);
 	},
@@ -52,6 +54,7 @@ export const updateInviteMutationAtom = atomWithMutation(() => ({
  * Mutation atom for deleting an invite
  */
 export const deleteInviteMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: DeleteInviteRequest) => {
 		return invitesApiService.deleteInvite(request);
 	},
@@ -71,6 +74,7 @@ export const deleteInviteMutationAtom = atomWithMutation(() => ({
  * Mutation atom for accepting an invite
  */
 export const acceptInviteMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: AcceptInviteRequest) => {
 		return invitesApiService.acceptInvite(request);
 	},
diff --git a/surfsense_web/atoms/layout/dialogs.atom.ts b/surfsense_web/atoms/layout/dialogs.atom.ts
new file mode 100644
index 0000000000..01c4628cc5
--- /dev/null
+++ b/surfsense_web/atoms/layout/dialogs.atom.ts
@@ -0,0 +1,3 @@
+import { atom } from "jotai";
+
+export const announcementsDialogAtom = atom<boolean>(false);
diff --git a/surfsense_web/atoms/layout/right-panel.atom.ts b/surfsense_web/atoms/layout/right-panel.atom.ts
index e06500113f..d296587ed5 100644
--- a/surfsense_web/atoms/layout/right-panel.atom.ts
+++ b/surfsense_web/atoms/layout/right-panel.atom.ts
@@ -1,6 +1,6 @@
 import { atom } from "jotai";
 
-export type RightPanelTab = "sources" | "report" | "editor" | "hitl-edit";
+export type RightPanelTab = "sources" | "report" | "editor" | "hitl-edit" | "citation";
 
 export const rightPanelTabAtom = atom<RightPanelTab>("sources");
 
diff --git a/surfsense_web/atoms/logs/log-mutation.atoms.ts b/surfsense_web/atoms/logs/log-mutation.atoms.ts
index e17b42fb66..f9d73f6e09 100644
--- a/surfsense_web/atoms/logs/log-mutation.atoms.ts
+++ b/surfsense_web/atoms/logs/log-mutation.atoms.ts
@@ -19,10 +19,8 @@ export const createLogMutationAtom = atomWithMutation((get) => {
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: CreateLogRequest) => logsApiService.createLog(request),
 		onSuccess: () => {
-			queryClient.invalidateQueries({ queryKey: cacheKeys.logs.list(searchSpaceId ?? undefined) });
-			queryClient.invalidateQueries({
-				queryKey: cacheKeys.logs.summary(searchSpaceId ?? undefined),
-			});
+			// Invalidate all log-related queries (list, summary, detail, withQueryParams)
+			queryClient.invalidateQueries({ queryKey: ["logs"] });
 		},
 	};
 });
@@ -38,11 +36,7 @@ export const updateLogMutationAtom = atomWithMutation((get) => {
 		mutationFn: async ({ logId, data }: { logId: number; data: UpdateLogRequest }) =>
 			logsApiService.updateLog(logId, data),
 		onSuccess: (_data, variables) => {
-			queryClient.invalidateQueries({ queryKey: cacheKeys.logs.detail(variables.logId) });
-			queryClient.invalidateQueries({ queryKey: cacheKeys.logs.list(searchSpaceId ?? undefined) });
-			queryClient.invalidateQueries({
-				queryKey: cacheKeys.logs.summary(searchSpaceId ?? undefined),
-			});
+			queryClient.invalidateQueries({ queryKey: ["logs"] });
 		},
 	};
 });
@@ -57,12 +51,7 @@ export const deleteLogMutationAtom = atomWithMutation((get) => {
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: DeleteLogRequest) => logsApiService.deleteLog(request),
 		onSuccess: (_data, request) => {
-			queryClient.invalidateQueries({ queryKey: cacheKeys.logs.list(searchSpaceId ?? undefined) });
-			queryClient.invalidateQueries({
-				queryKey: cacheKeys.logs.summary(searchSpaceId ?? undefined),
-			});
-			if (request?.id)
-				queryClient.invalidateQueries({ queryKey: cacheKeys.logs.detail(request.id) });
+			queryClient.invalidateQueries({ queryKey: ["logs"] });
 		},
 	};
 });
diff --git a/surfsense_web/atoms/members/members-mutation.atoms.ts b/surfsense_web/atoms/members/members-mutation.atoms.ts
index d01d3b4893..9cf1b28818 100644
--- a/surfsense_web/atoms/members/members-mutation.atoms.ts
+++ b/surfsense_web/atoms/members/members-mutation.atoms.ts
@@ -14,6 +14,7 @@ import { queryClient } from "@/lib/query-client/client";
 
 export const updateMemberMutationAtom = atomWithMutation(() => {
 	return {
+		meta: { suppressGlobalErrorToast: true },
 		mutationFn: async (request: UpdateMembershipRequest) => {
 			return membersApiService.updateMember(request);
 		},
@@ -31,6 +32,7 @@ export const updateMemberMutationAtom = atomWithMutation(() => {
 
 export const deleteMemberMutationAtom = atomWithMutation(() => {
 	return {
+		meta: { suppressGlobalErrorToast: true },
 		mutationFn: async (request: DeleteMembershipRequest) => {
 			return membersApiService.deleteMember(request);
 		},
@@ -48,6 +50,7 @@ export const deleteMemberMutationAtom = atomWithMutation(() => {
 
 export const leaveSearchSpaceMutationAtom = atomWithMutation(() => {
 	return {
+		meta: { suppressGlobalErrorToast: true },
 		mutationFn: async (request: LeaveSearchSpaceRequest) => {
 			return membersApiService.leaveSearchSpace(request);
 		},
diff --git a/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts b/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts
index 2a048ca3e6..f4577a7a9f 100644
--- a/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts
+++ b/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts
@@ -23,6 +23,7 @@ export const createNewLLMConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["new-llm-configs", "create"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: CreateNewLLMConfigRequest) => {
 			return newLLMConfigApiService.createConfig(request);
@@ -47,6 +48,7 @@ export const updateNewLLMConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["new-llm-configs", "update"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: UpdateNewLLMConfigRequest) => {
 			return newLLMConfigApiService.updateConfig(request);
@@ -74,6 +76,7 @@ export const deleteNewLLMConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["new-llm-configs", "delete"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: DeleteNewLLMConfigRequest & { name: string }) => {
 			return newLLMConfigApiService.deleteConfig({ id: request.id });
@@ -105,6 +108,7 @@ export const updateLLMPreferencesMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["llm-preferences", "update"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: UpdateLLMPreferencesRequest) => {
 			return newLLMConfigApiService.updateLLMPreferences(request);
diff --git a/surfsense_web/atoms/new-llm-config/new-llm-config-query.atoms.ts b/surfsense_web/atoms/new-llm-config/new-llm-config-query.atoms.ts
index 32250d394b..410d061e5f 100644
--- a/surfsense_web/atoms/new-llm-config/new-llm-config-query.atoms.ts
+++ b/surfsense_web/atoms/new-llm-config/new-llm-config-query.atoms.ts
@@ -2,6 +2,7 @@ import { atomWithQuery } from "jotai-tanstack-query";
 import type { LLMModel } from "@/contracts/enums/llm-models";
 import { LLM_MODELS } from "@/contracts/enums/llm-models";
 import { newLLMConfigApiService } from "@/lib/apis/new-llm-config-api.service";
+import { getBearerToken } from "@/lib/auth-utils";
 import { cacheKeys } from "@/lib/query-client/cache-keys";
 import { activeSearchSpaceIdAtom } from "../search-spaces/search-space-query.atoms";
 
@@ -30,6 +31,7 @@ export const globalNewLLMConfigsAtom = atomWithQuery(() => {
 	return {
 		queryKey: cacheKeys.newLLMConfigs.global(),
 		staleTime: 10 * 60 * 1000, // 10 minutes - global configs rarely change
+		enabled: !!getBearerToken(),
 		queryFn: async () => {
 			return newLLMConfigApiService.getGlobalConfigs();
 		},
diff --git a/surfsense_web/atoms/prompts/prompts-mutation.atoms.ts b/surfsense_web/atoms/prompts/prompts-mutation.atoms.ts
index 6996185fec..6eb6fa4a89 100644
--- a/surfsense_web/atoms/prompts/prompts-mutation.atoms.ts
+++ b/surfsense_web/atoms/prompts/prompts-mutation.atoms.ts
@@ -11,6 +11,7 @@ import { queryClient } from "@/lib/query-client/client";
 
 export const createPromptMutationAtom = atomWithMutation(() => ({
 	mutationKey: ["prompts", "create"],
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: PromptCreateRequest) => {
 		return promptsApiService.create(request);
 	},
@@ -25,6 +26,7 @@ export const createPromptMutationAtom = atomWithMutation(() => ({
 
 export const updatePromptMutationAtom = atomWithMutation(() => ({
 	mutationKey: ["prompts", "update"],
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async ({ id, ...data }: PromptUpdateRequest & { id: number }) => {
 		return promptsApiService.update(id, data);
 	},
@@ -39,6 +41,7 @@ export const updatePromptMutationAtom = atomWithMutation(() => ({
 
 export const deletePromptMutationAtom = atomWithMutation(() => ({
 	mutationKey: ["prompts", "delete"],
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (id: number) => {
 		return promptsApiService.delete(id);
 	},
@@ -57,6 +60,7 @@ export const deletePromptMutationAtom = atomWithMutation(() => ({
 
 export const copyPromptMutationAtom = atomWithMutation(() => ({
 	mutationKey: ["prompts", "copy"],
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (promptId: number) => {
 		return promptsApiService.copy(promptId);
 	},
diff --git a/surfsense_web/atoms/public-chat-snapshots/public-chat-snapshots-mutation.atoms.ts b/surfsense_web/atoms/public-chat-snapshots/public-chat-snapshots-mutation.atoms.ts
index bfdfccb166..e4c60b809d 100644
--- a/surfsense_web/atoms/public-chat-snapshots/public-chat-snapshots-mutation.atoms.ts
+++ b/surfsense_web/atoms/public-chat-snapshots/public-chat-snapshots-mutation.atoms.ts
@@ -10,6 +10,7 @@ import { cacheKeys } from "@/lib/query-client/cache-keys";
 import { queryClient } from "@/lib/query-client/client";
 
 export const createPublicChatSnapshotMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: PublicChatSnapshotCreateRequest) => {
 		return chatThreadsApiService.createPublicChatSnapshot(request);
 	},
@@ -37,6 +38,7 @@ export const createPublicChatSnapshotMutationAtom = atomWithMutation(() => ({
 }));
 
 export const deletePublicChatSnapshotMutationAtom = atomWithMutation(() => ({
+	meta: { suppressGlobalErrorToast: true },
 	mutationFn: async (request: PublicChatSnapshotDeleteRequest) => {
 		return chatThreadsApiService.deletePublicChatSnapshot(request);
 	},
diff --git a/surfsense_web/atoms/roles/roles-mutation.atoms.ts b/surfsense_web/atoms/roles/roles-mutation.atoms.ts
index ddbc68ca2a..cfcb6cba19 100644
--- a/surfsense_web/atoms/roles/roles-mutation.atoms.ts
+++ b/surfsense_web/atoms/roles/roles-mutation.atoms.ts
@@ -14,6 +14,7 @@ import { queryClient } from "@/lib/query-client/client";
 
 export const createRoleMutationAtom = atomWithMutation(() => {
 	return {
+		meta: { suppressGlobalErrorToast: true },
 		mutationFn: async (request: CreateRoleRequest) => {
 			return rolesApiService.createRole(request);
 		},
@@ -31,6 +32,7 @@ export const createRoleMutationAtom = atomWithMutation(() => {
 
 export const updateRoleMutationAtom = atomWithMutation(() => {
 	return {
+		meta: { suppressGlobalErrorToast: true },
 		mutationFn: async (request: UpdateRoleRequest) => {
 			return rolesApiService.updateRole(request);
 		},
@@ -54,6 +56,7 @@ export const updateRoleMutationAtom = atomWithMutation(() => {
 
 export const deleteRoleMutationAtom = atomWithMutation(() => {
 	return {
+		meta: { suppressGlobalErrorToast: true },
 		mutationFn: async (request: DeleteRoleRequest) => {
 			return rolesApiService.deleteRole(request);
 		},
diff --git a/surfsense_web/atoms/settings/settings-dialog.atoms.ts b/surfsense_web/atoms/settings/settings-dialog.atoms.ts
deleted file mode 100644
index 282cc65b34..0000000000
--- a/surfsense_web/atoms/settings/settings-dialog.atoms.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-import { atom } from "jotai";
-
-export interface SearchSpaceSettingsDialogState {
-	open: boolean;
-	initialTab: string;
-}
-
-export interface UserSettingsDialogState {
-	open: boolean;
-	initialTab: string;
-}
-
-export const searchSpaceSettingsDialogAtom = atom<SearchSpaceSettingsDialogState>({
-	open: false,
-	initialTab: "general",
-});
-
-export const userSettingsDialogAtom = atom<UserSettingsDialogState>({
-	open: false,
-	initialTab: "profile",
-});
-
-export const teamDialogAtom = atom<boolean>(false);
-
-export const morePagesDialogAtom = atom<boolean>(false);
diff --git a/surfsense_web/atoms/tabs/tabs.atom.ts b/surfsense_web/atoms/tabs/tabs.atom.ts
index 22cc5373a9..dba459cc98 100644
--- a/surfsense_web/atoms/tabs/tabs.atom.ts
+++ b/surfsense_web/atoms/tabs/tabs.atom.ts
@@ -36,14 +36,16 @@ const initialState: TabsState = {
 // Prevent race conditions where route-sync recreates a just-deleted chat tab.
 const deletedChatIdsAtom = atom<Set<number>>(new Set<number>());
 
-const sessionStorageAdapter = createJSONStorage<TabsState>(
-	() => (typeof window !== "undefined" ? sessionStorage : undefined) as Storage
+// Persist tabs in localStorage so they survive a hard refresh and let the user
+// keep tabs open across multiple search spaces (browser-like behavior).
+const localStorageAdapter = createJSONStorage<TabsState>(
+	() => (typeof window !== "undefined" ? localStorage : undefined) as Storage
 );
 
 export const tabsStateAtom = atomWithStorage<TabsState>(
 	"surfsense:tabs",
 	initialState,
-	sessionStorageAdapter,
+	localStorageAdapter,
 	{ getOnInit: true }
 );
 
@@ -72,7 +74,17 @@ export const syncChatTabAtom = atom(
 	(
 		get,
 		set,
-		{ chatId, title, chatUrl }: { chatId: number | null; title?: string; chatUrl?: string }
+		{
+			chatId,
+			title,
+			chatUrl,
+			searchSpaceId,
+		}: {
+			chatId: number | null;
+			title?: string;
+			chatUrl?: string;
+			searchSpaceId: number;
+		}
 	) => {
 		if (chatId && get(deletedChatIdsAtom).has(chatId)) {
 			return;
@@ -87,20 +99,32 @@ export const syncChatTabAtom = atom(
 				...state,
 				activeTabId: tabId,
 				tabs: state.tabs.map((t) =>
-					t.id === tabId ? { ...t, title: title || t.title, chatUrl: chatUrl || t.chatUrl } : t
+					t.id === tabId
+						? {
+								...t,
+								title: title || t.title,
+								chatUrl: chatUrl || t.chatUrl,
+								searchSpaceId: searchSpaceId ?? t.searchSpaceId,
+							}
+						: t
 				),
 			});
 			return;
 		}
 
 		// If navigating to a new chat (no chatId), ensure there's a "new chat" tab
+		// scoped to the current search space.
 		if (!chatId) {
 			const hasNewChatTab = state.tabs.some((t) => t.id === "chat-new");
 			if (hasNewChatTab) {
-				set(tabsStateAtom, { ...state, activeTabId: "chat-new" });
+				set(tabsStateAtom, {
+					...state,
+					activeTabId: "chat-new",
+					tabs: state.tabs.map((t) => (t.id === "chat-new" ? { ...t, searchSpaceId, chatUrl } : t)),
+				});
 			} else {
 				set(tabsStateAtom, {
-					tabs: [...state.tabs, INITIAL_CHAT_TAB],
+					tabs: [...state.tabs, { ...INITIAL_CHAT_TAB, searchSpaceId, chatUrl }],
 					activeTabId: "chat-new",
 				});
 			}
@@ -112,9 +136,10 @@ export const syncChatTabAtom = atom(
 		const newTab: Tab = {
 			id: tabId,
 			type: "chat",
-			title: title || `Chat ${chatId}`,
+			title: title || "New Chat",
 			chatId,
 			chatUrl,
+			searchSpaceId,
 		};
 
 		let updatedTabs: Tab[];
diff --git a/surfsense_web/atoms/user/user-query.atoms.ts b/surfsense_web/atoms/user/user-query.atoms.ts
index ff14774822..8792c4a962 100644
--- a/surfsense_web/atoms/user/user-query.atoms.ts
+++ b/surfsense_web/atoms/user/user-query.atoms.ts
@@ -1,4 +1,5 @@
 import { atomWithQuery } from "jotai-tanstack-query";
+import { getBearerToken } from "@/lib/auth-utils";
 import { userApiService } from "@/lib/apis/user-api.service";
 
 export const USER_QUERY_KEY = ["user", "me"] as const;
@@ -7,8 +8,11 @@ const userQueryFn = () => userApiService.getMe();
 export const currentUserAtom = atomWithQuery(() => {
 	return {
 		queryKey: USER_QUERY_KEY,
-		staleTime: 5 * 60 * 1000,
-		enabled: true,
+		// Live-changing numeric fields (pages_*, premium_credit_micros_*)
+		// are now pushed via Zero (queries.user.me()), so /users/me only
+		// needs to fire once per session for the static profile fields.
+		staleTime: Infinity,
+		enabled: !!getBearerToken(),
 		queryFn: userQueryFn,
 	};
 });
diff --git a/surfsense_web/atoms/vision-llm-config/vision-llm-config-mutation.atoms.ts b/surfsense_web/atoms/vision-llm-config/vision-llm-config-mutation.atoms.ts
index b1aa01c6bb..f46b977d5e 100644
--- a/surfsense_web/atoms/vision-llm-config/vision-llm-config-mutation.atoms.ts
+++ b/surfsense_web/atoms/vision-llm-config/vision-llm-config-mutation.atoms.ts
@@ -18,6 +18,7 @@ export const createVisionLLMConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["vision-llm-configs", "create"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: CreateVisionLLMConfigRequest) => {
 			return visionLLMConfigApiService.createConfig(request);
@@ -39,6 +40,7 @@ export const updateVisionLLMConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["vision-llm-configs", "update"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: UpdateVisionLLMConfigRequest) => {
 			return visionLLMConfigApiService.updateConfig(request);
@@ -63,6 +65,7 @@ export const deleteVisionLLMConfigMutationAtom = atomWithMutation((get) => {
 
 	return {
 		mutationKey: ["vision-llm-configs", "delete"],
+		meta: { suppressGlobalErrorToast: true },
 		enabled: !!searchSpaceId,
 		mutationFn: async (request: { id: number; name: string }) => {
 			return visionLLMConfigApiService.deleteConfig(request.id);
diff --git a/surfsense_web/biome.json b/surfsense_web/biome.json
index 738a3636d8..aa71f509ed 100644
--- a/surfsense_web/biome.json
+++ b/surfsense_web/biome.json
@@ -7,7 +7,19 @@
 	},
 	"files": {
 		"ignoreUnknown": true,
-		"includes": ["**", "!!node_modules", "!!.git", "!!.next", "!!dist", "!!build", "!!coverage"],
+		"includes": [
+			"**",
+			"!!node_modules",
+			"!!.git",
+			"!!.next",
+			"!!dist",
+			"!!build",
+			"!!coverage",
+			"!!test-results",
+			"!!playwright-report",
+			"!!blob-report",
+			"!!playwright/.auth"
+		],
 		"maxSize": 1048576
 	},
 	"formatter": {
diff --git a/surfsense_web/blog/content/agentic-rag-vs-long-context-llms-benchmark.mdx b/surfsense_web/blog/content/agentic-rag-vs-long-context-llms-benchmark.mdx
new file mode 100644
index 0000000000..2475363395
--- /dev/null
+++ b/surfsense_web/blog/content/agentic-rag-vs-long-context-llms-benchmark.mdx
@@ -0,0 +1,387 @@
+---
+title: "Agentic RAG vs Long-Context LLMs: A 171-Question Benchmark on 30 Long PDFs"
+description: "We benchmarked agentic RAG against long-context LLMs and native PDF attachment on 171 real questions across 30 long, multimodal PDFs, using Claude Sonnet 4.5 on every arm. Accuracy, cost per query, failure modes, and a vision-LLM-vs-OCR finding the internet still expects to go the other way."
+date: "2026-05-15"
+image: "/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png"
+author: "SurfSense Team"
+authorAvatar: "/logo.png"
+tags:
+  - "Agentic RAG"
+  - "Long-Context LLM"
+  - "RAG vs Agentic"
+  - "Vision LLM vs OCR"
+  - "Benchmark"
+  - "Claude Sonnet 4.5"
+  - "MMLongBench-Doc"
+featured: true
+featured_order: 1
+---
+
+> **TL;DR for skimmers**
+>
+> We ran six different ways of answering questions over 30 long, image-heavy PDFs (a total of 171 real questions) using the *same* large language model, Claude Sonnet 4.5, and measured accuracy, cost per question, and how often each approach broke. The result:
+>
+> - **Full-context "long-context" approaches won on raw accuracy** (LlamaCloud premium 59.6%, Azure premium 58.5%).
+> - **Agentic RAG was nearly as accurate (53.2%) at less than half the cost ($0.0827 per question vs $0.18–$0.26)** and zero failed queries out of 171.
+> - **Most accuracy gaps were not statistically significant.** 12 of 15 head-to-head comparisons could be coin-flips (McNemar test, α = 0.05).
+> - **Vision LLMs did not beat traditional OCR.** Letting Claude read the PDF directly with its built-in vision (the `native_pdf` arm) finished 5th of 6, behind every parser-based pipeline, with a stubborn 7% intrinsic failure rate that survived 5 retries with exponential backoff.
+>
+> Practical takeaway: if you are building a long-PDF Q&A product, **agentic RAG is the boring-but-correct default**. Reach for full-context only when the document fits, the budget allows, and the accuracy gain matters. Don't bet on vision LLMs replacing OCR pipelines yet.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png"
+  alt="Diagram comparing agentic RAG, long-context LLM, and native PDF pipelines for document question answering."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+## Why this matters: the agentic RAG vs long-context debate
+
+If you are shipping anything that lets a user ask questions about a PDF, whether that is a contract analyser, a research assistant, or an internal docs chatbot, you have hit one of the loudest arguments in AI engineering today.
+
+On one side: **long-context LLMs**. Modern models from Anthropic, OpenAI and Google now accept hundreds of thousands of tokens in a single prompt. Just stuff the whole document in and ask the question. Simple, fast to build.
+
+On the other side: **agentic RAG** (retrieval-augmented generation, where an agent dynamically pulls relevant chunks instead of dumping the whole document into the prompt). More complex, but classically considered cheaper and safer at scale.
+
+Layered on top of that is a quieter argument: **do you even need a document parser anymore?** Frontier models from Anthropic, OpenAI and Google now read PDFs natively using their vision stack. The story everyone wants to be true is that vision-capable LLMs make OCR pipelines obsolete. We tested that story too. Spoiler: not yet.
+
+The internet is full of opinions and very thin on data, especially for *long, multimodal* PDFs (the messy, image-heavy real-world kind). So we built a benchmark with the same model on every arm and measured what actually happens, on both questions at once.
+
+## What is agentic RAG?
+
+Quick definitions, in plain English, then we move to the data.
+
+**RAG (retrieval-augmented generation)** is the standard pattern for letting a language model answer questions about your private documents. You chunk the documents into pieces, store them in a vector database, and at query time you retrieve the chunks most likely to contain the answer and pass them to the model.
+
+**Agentic RAG** is RAG with an LLM agent in the driver's seat. Instead of one fixed retrieval step, the agent can:
+
+- ask itself sub-questions,
+- run multiple searches with different queries,
+- decide when it has enough evidence,
+- ignore irrelevant chunks,
+- and stop when the answer is complete.
+
+Think of vanilla RAG as handing a librarian one note that says *"find me the answer to X"*. Agentic RAG is handing the same librarian a research brief and a clipboard, and letting them walk back and forth between the shelves until the report writes itself.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-02-architecture-diagram.png"
+  alt="Agentic RAG architecture diagram showing an LLM agent iteratively retrieving document chunks before producing a final answer."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+For a 5-minute video walk-through, IBM Technology has the highest-ranked explainer on YouTube right now (325k views, watched by us, and accurate):
+
+<div style={{ position: 'relative', paddingBottom: '56.25%', height: 0, overflow: 'hidden', borderRadius: '12px', margin: '1.5rem 0' }}>
+  <iframe
+    src="https://www.youtube-nocookie.com/embed/0z9_MhcYvcY"
+    title="What is Agentic RAG? - IBM Technology"
+    allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
+    referrerPolicy="strict-origin-when-cross-origin"
+    allowFullScreen
+    style={{ position: 'absolute', top: 0, left: 0, width: '100%', height: '100%', border: 0 }}
+  />
+</div>
+
+## How we ran the benchmark
+
+To make the comparison fair, every arm answered the same questions and used the exact same large language model: **Claude Sonnet 4.5**, called through OpenRouter so the API path was identical.
+
+The dataset was [**MMLongBench-Doc**](https://mayubo2333.github.io/MMLongBench-Doc/) ([paper](https://arxiv.org/abs/2407.01523), [GitHub](https://github.com/mayubo2333/MMLongBench-Doc), [Hugging Face](https://huggingface.co/datasets/yubo2333/MMLongBench-Doc)), an open multimodal-document benchmark of long PDFs with vetted question-answer pairs. The full corpus is 135 PDFs averaging 47.5 pages each, with 1,091 expert-annotated questions across 7 domains (33% cross-page, 22.5% deliberately unanswerable to detect hallucinations). We used the first 30 documents (a mix of research papers, financial filings, product catalogues, and image-heavy reports) and all 171 of their answerable questions.
+
+### Why multimodal documents?
+
+Real-world PDFs are messy. They contain charts, scanned tables, photos, multi-column layouts, and footnotes inside footnotes. A clean text-only benchmark wouldn't tell us anything useful about whether these approaches survive contact with the documents people actually upload to AI products. MMLongBench-Doc was built to include exactly that messiness, which is the territory where parser quality and retrieval strategy actually start to matter. We wanted the benchmark to look like the real inbox of an AI app, not a sanitised research toy.
+
+### Why only 30 documents?
+
+The full MMLongBench-Doc corpus has 135 PDFs. Processing the entire dataset across all six arms would have taken significantly longer to complete on my machine, so we capped the run at 30 to keep iteration time reasonable. We're upfront about what that costs us statistically in the significance section below: a bigger sample would have tightened every confidence interval. The findings here should be read as strong directional evidence, not a final verdict.
+
+### The six arms
+
+| Arm | What it does | Preprocessing | What goes in the prompt |
+|---|---|---|---|
+| `native_pdf` | Sends the raw PDF file directly to the model | None | The PDF itself, every question |
+| `azure_basic_lc` | Parses the PDF with Azure Document Intelligence (cheap mode) | $1 per 1,000 pages | The whole markdown, every question |
+| `azure_premium_lc` | Same as above, premium parser (preserves layout) | $10 per 1,000 pages | The whole markdown, every question |
+| `llamacloud_basic_lc` | Parses the PDF with LlamaParse (cheap mode) | $1 per 1,000 pages | The whole markdown, every question |
+| `llamacloud_premium_lc` | LlamaParse premium with layout/table preservation | $10 per 1,000 pages | The whole markdown, every question |
+| `surfsense_agentic` | Full agentic RAG pipeline | $10 per 1,000 pages (one-time ingest) | Only the chunks the agent decides to retrieve |
+
+Arms 2-5 are what we call **"long-context" or "full-context" stuffing**: parse the PDF once, paste the entire result into every prompt. Arm 6 is the agentic RAG approach. Arm 1, the `native_pdf` "just attach the PDF" pattern, is doing double duty here. It is also the **"vision LLM replaces OCR" hypothesis**: instead of any markdown parser, the model reads the PDF directly using its built-in vision capabilities. If vision-capable LLMs are good enough to retire OCR pipelines, this arm should be at the top of the table. (It isn't.)
+
+If you want to read the implementations, every arm lives in [`surfsense_evals/src/surfsense_evals/core/arms/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals/src/surfsense_evals/core/arms) — the [`bare_llm.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py) arm handles full-context stuffing, [`native_pdf.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py) handles vision-LLM PDF attachment, and [`surfsense.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py) drives the agentic retrieval against the SurfSense `/api/v1/new_chat` endpoint. The full benchmark suite (prompts, ingest pipeline, runner) lives in [`suites/multimodal_doc/parser_compare/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare).
+
+We graded answers with a [deterministic, format-aware grader](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py) (1% relative tolerance for floats, F1 over normalised tokens for lists). We logged input/output tokens, cost, latency, and any HTTP error per question.
+
+## Headline results: who wins on accuracy?
+
+After running all 171 questions through all 6 arms, then re-running the 37 failed queries with up to 5 attempts of exponential backoff, here is the scoreboard:
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-dark.png"
+  alt="Bar chart of post-retry accuracy on 171 long-PDF questions: LlamaCloud premium 59.6%, Azure premium 58.5%, Azure basic 54.4%, SurfSense agentic RAG 53.2%, Native PDF 52.0%, LlamaCloud basic 50.9%."
+  width={2200}
+  height={1240}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+The full table including F1, raw failures, and median latency:
+
+Bolded cell = winner of that column.
+
+| Rank | Arm | Accuracy | F1 | Median latency | Raw failures |
+|---:|---|---:|---:|---:|---:|
+| 1 | LlamaCloud premium, long-context | **59.6%** | **61.1%** | **6.8 s** | 4 |
+| 2 | Azure premium, long-context | 58.5% | 59.6% | 6.9 s | 3 |
+| 3 | Azure basic, long-context | 54.4% | 56.6% | 7.1 s | 1 |
+| 4 | SurfSense agentic RAG | 53.2% | 54.3% | 52.8 s | **0** |
+| 5 | Native PDF attachment | 52.0% | 50.4% | 29.5 s | 27 |
+| 6 | LlamaCloud basic, long-context | 50.9% | 53.2% | 7.1 s | 2 |
+
+A few things jump out:
+
+1. **The two long-context premium parsers win on raw accuracy**, but only by about 6 percentage points over agentic RAG.
+2. **Agentic RAG was the only arm with zero failures** out of 171 questions.
+3. **Native PDF attachment was the worst performer** despite being the most "AI-native" approach. More on why in the failure-mode section.
+4. **Latency on agentic RAG is high (52.8 s)** because the agent does several retrieval rounds. For batch jobs it's fine; for chat UX you'd stream partial results.
+
+Now the part most blog posts skip.
+
+## Cost per query: where agentic RAG wins big
+
+Accuracy is only half the story. Every approach also has a price tag: the LLM call plus the document preprocessing.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-dark.png"
+  alt="Scatter plot of accuracy versus cost per query for six document-QA approaches; SurfSense agentic RAG sits at the cheapest end with competitive accuracy."
+  width={2200}
+  height={1280}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+Bolded cell = winner of that column.
+
+| Arm | Total $/Q | Accuracy |
+|---|---:|---:|
+| SurfSense agentic RAG | **$0.0827** | 53.2% |
+| LlamaCloud basic | $0.1049 | 50.9% |
+| Azure basic | $0.1062 | 54.4% |
+| LlamaCloud premium | $0.1885 | **59.6%** |
+| Azure premium | $0.2051 | 58.5% |
+| Native PDF | $0.2552 | 52.0% |
+
+The headline number: **agentic RAG was the cheapest arm at $0.0827 per question, about 60% cheaper than the most accurate full-context arm and 67% cheaper than native PDF attachment.** The technique wins on cost regardless of which agentic-RAG framework you use; we just happened to measure it with ours.
+
+Why is agentic RAG so much cheaper? Because every full-context arm pays the parser+LLM bill *for the entire document on every single question*. A 100-page PDF? You pay for 100 pages of input tokens 10 times if the user asks 10 questions. Agentic RAG pays the parser once at ingest time, then only sends the retrieved chunks (often 1–5% of the document) per question.
+
+There is a clean closed-form for this. If a document has *P* pages, a parser costs *c<sub>p</sub>* per page, the LLM costs *c<sub>L</sub>* per full-document call, and the user asks *Q* questions, then full-context cost-per-question is roughly:
+
+```
+Cost/Q ≈ (P × c_p) / Q + c_L
+```
+
+For agentic RAG it is:
+
+```
+Cost/Q ≈ (P × c_p) / Q + c_L × r
+```
+
+where `r` is the *retrieval ratio*, typically 0.02 to 0.10. So the more questions per document, the more agentic RAG dominates. For knowledge bases that get queried more than a couple of times, the gap widens by the day.
+
+## Failure modes: what 37 broken queries taught us
+
+We did not just count successes. We logged every error.
+
+Of 1,026 total `(arm, question)` cells, 37 returned no answer on the first pass. We then re-ran *only* those 37 with up to 5 attempts of exponential backoff (the [`retry_failed_questions.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/retry_failed_questions.py) script in the harness). The results separated **transient** (network/server) failures from **intrinsic** (the approach actually cannot do this) failures:
+
+Bolded cell = best result on that column (lowest failure rate, highest recovery rate).
+
+| Arm | First-pass failures | Recovered on retry | Intrinsic failures | Intrinsic failure rate |
+|---|---:|---:|---:|---:|
+| All 4 long-context arms (combined) | 10 | **10 (100%)** | **0** | **0%** |
+| Native PDF | 27 | 15 | 12 | 7.0% |
+| SurfSense agentic RAG | **0** | n/a | **0** | **0%** |
+
+Two findings worth highlighting:
+
+**1. Long-context "context overflow" was a myth.** We hypothesised that the long-context arms might be silently failing because the document didn't fit in the context window. We tested it: the failures clustered around HTTP/SSL errors (the request body was up to 30 MB, riding the public internet), not token limits. Once we retried, all 10 came back successfully. The Claude Sonnet 4.5 context window held up fine; the *transport layer* wobbled.
+
+**2. Native PDF has a stubborn 7% intrinsic failure rate.** Two specific PDFs broke it permanently:
+
+- a 27-page image-heavy PDF whose binary exceeded the provider's 30 MB request-body cap (6 questions broken);
+- a 166-page PDF whose response stream the provider could never reliably terminate (5 questions, repeated `empty stream` errors).
+
+Even with 5 attempts of exponential backoff, those 12 questions stayed broken. **For any production app that processes PDFs from arbitrary users, that is a 7% "this document cannot be answered today" rate**, which is unacceptable for most product flows.
+
+Agentic RAG sidesteps both problems because it never sends the raw PDF and never sends the entire document context in one giant request.
+
+### What this means for the vision-LLM-vs-OCR debate
+
+Bigger picture, the `native_pdf` numbers settle a question we wanted to answer: **on long, image-heavy PDFs, vision-capable LLMs reading the document directly did not outperform plain OCR plus markdown.** They came in 5th of 6 on accuracy (52.0% vs 50.9% to 59.6% for the OCR-based pipelines), were the most expensive arm at $0.2552 per question, and failed 7% of the time even after retries. Premium OCR with layout extraction held up better on the exact pages where you would expect vision to shine, the chart-and-table-heavy ones.
+
+The point is not that vision LLMs are bad. They are remarkable. The point is that the parser pipeline you already maintain is not yet obsolete, and the "skip the parser, attach the PDF" shortcut is not a free lunch.
+
+## Statistical significance: are these results actually different?
+
+This is the section most benchmarks omit, and it changes the conclusions.
+
+We ran McNemar's exact-binomial test on every pair of arms (15 pairs total) using [`compute_blog_extras.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/compute_blog_extras.py). McNemar is the right test for paired classifier comparisons: both arms answered the *same* questions, so we can ask: "of the questions where the two arms disagreed, did one really win more often than chance?"
+
+The result: **only 3 of 15 pairs are distinguishable at α = 0.05**.
+
+The three statistically-significant gaps are all between the *worst* arms (LlamaCloud basic, Native PDF) and the *best* arms (LlamaCloud premium, Azure premium). The most interesting comparison, **SurfSense agentic RAG vs the long-context premium arms**, does *not* clear the significance bar. The 6-point gap could plausibly be sample noise.
+
+In other words: on this dataset, the headline claim "long-context beats agentic RAG by 6 percentage points" is real on the scoreboard but **not statistically robust**. Run the same benchmark on a different sample of 30 PDFs and the order could shuffle. This is also the place where our 30-document scope bites us: a bigger run would have given more comparisons enough power to settle.
+
+## When to choose what: a decision framework
+
+Reading the data without an action plan is half the value. Here is how we would decide for a real product, using the same numbers.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-05-decision-tree.png"
+  alt="Decision tree showing when to choose agentic RAG, long-context full-context, or native PDF attachment for document question answering."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+### Use **agentic RAG** when
+
+- documents are long and mixed in size (some 5 pages, some 500),
+- the same documents will be queried more than once or twice,
+- cost per query matters (SaaS pricing, large user base),
+- you cannot afford intermittent failures on big PDFs,
+- you need to scale to corpora of thousands of documents.
+
+This is the default for most production AI products.
+
+### Use **long-context (full-context) LLMs** when
+
+- documents reliably fit in the context window after parsing (typically under ~150 pages of text),
+- the accuracy gain (6 percentage points in our benchmark, or zero, depending on which way the noise goes) actually justifies the 60–150% extra cost,
+- you have one or two questions per document, not dozens,
+- you can absorb occasional network failures on large request bodies.
+
+Premium parsing matters here. **Spending $10 per 1,000 pages on a layout-aware parser is worth it**: it gave us +4 to +9 accuracy points over basic parsers on the same questions.
+
+### Use **native PDF attachment** when
+
+- you are prototyping and want to ship in an afternoon,
+- documents are small and well-formatted,
+- you can tolerate a 7% failure rate (or you have validated the specific PDFs you care about don't trip the limits).
+
+Don't use it as the default for user-uploaded PDFs in production. The 30 MB request-body cap and unstable response streams will bite you, and exponential backoff will not save you.
+
+## Frequently asked questions
+
+<Accordion type="multiple" className="w-full not-prose">
+  <AccordionItem value="faq-1">
+    <AccordionTrigger>What is agentic RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Agentic RAG is retrieval-augmented generation where an LLM agent (not a fixed pipeline) decides what to retrieve, when to stop, and how to combine evidence. Instead of one search and one answer, the agent can run multiple retrievals, refine its query, and iterate. It usually costs less than full-context prompting and handles arbitrarily large document collections.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-2">
+    <AccordionTrigger>How is agentic RAG different from traditional RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Traditional RAG runs a single, fixed retrieval step: take the user's question, find the top-k similar chunks, send them to the LLM. Agentic RAG lets the LLM plan, retrieve repeatedly, evaluate intermediate results, and decide when it has enough context. It is more flexible at the cost of more LLM calls, and it tends to outperform vanilla RAG on multi-hop or ambiguous queries.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-3">
+    <AccordionTrigger>When should I use long-context LLMs instead of RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      When the document fits in the model's context window after parsing, you have a small number of queries per document, accuracy matters more than cost, and you can tolerate occasional transport-layer failures on multi-megabyte requests. In our benchmark, full-context premium parsers led on accuracy (about 58–60%) but cost 2–3× more per query than agentic RAG.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-4">
+    <AccordionTrigger>What is a long context window?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      A long context window is the maximum amount of text (measured in tokens) that an LLM can read in a single prompt. Modern frontier models support 200,000 tokens or more, which is roughly 150,000 words or 300+ printed pages. A long context window enables "just stuff the whole document in" approaches, but it does not eliminate the need for RAG when corpora exceed what one prompt can hold or when cost matters.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-5">
+    <AccordionTrigger>How do you benchmark agentic RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Run the same set of real-world questions through each approach using the *same* underlying LLM, log accuracy with a deterministic grader, log cost (LLM + preprocessing), log latency, and run pairwise McNemar tests for statistical significance. We used 171 questions across 30 long PDFs from MMLongBench-Doc.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-6">
+    <AccordionTrigger>How much does agentic RAG cost per query?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      In our benchmark, agentic RAG cost **$0.0827 per question** end-to-end (including a one-time premium parsing cost amortised across all questions for the document). The cheapest full-context arm cost $0.1049 (about 27% more); the most expensive cost $0.2552 (over 3× more). Cost per query for agentic RAG drops further as you ask more questions per document.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-7">
+    <AccordionTrigger>Is RAG dead now that we have long context?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      No, and this benchmark is part of the evidence. Long-context wins on raw accuracy by a small margin that is mostly within statistical noise, but RAG (especially agentic RAG) wins on cost per query, on robustness to large or malformed documents, and on horizontal scaling to large corpora. The right answer is "use the cheapest pattern that hits your accuracy target", which for most production apps is agentic RAG.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-8">
+    <AccordionTrigger>Do vision LLMs outperform OCR for PDF question answering?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Not in our benchmark. The `native_pdf` arm, which lets Claude Sonnet 4.5 read each PDF directly using its native vision capabilities, finished 5th of 6 with 52.0% accuracy and a 7% intrinsic failure rate. Every OCR-based pipeline we tested (Azure Document Intelligence and LlamaParse, in both basic and premium tiers) either matched or beat it on accuracy at lower cost. Premium OCR with layout extraction held up especially well on chart-heavy and table-heavy pages, the exact territory where you would expect a vision model to dominate. Vision-capable LLMs may catch up as the models improve, but as of mid-2026, the safer default for long, multimodal PDFs is still parser plus markdown.
+    </AccordionContent>
+  </AccordionItem>
+</Accordion>
+
+## What this means for your AI app
+
+If you are choosing an architecture for a long-PDF Q&A product *today*:
+
+1. **Start with agentic RAG.** It is the cheapest, most robust default and gets you within statistical noise of full-context approaches.
+2. **Pay for premium parsing once.** Whether you choose RAG or full-context, layout-aware parsing buys you real accuracy points. The marginal cost is trivial against the LLM bill.
+3. **Avoid plain "attach the PDF" in production** unless you have validated every document path. The 7% intrinsic failure rate is real and not retry-able.
+4. **Don't trust accuracy gaps under 5–6 points** unless you have tested for significance. McNemar takes 30 seconds in Python and saves embarrassing benchmark posts.
+5. **Don't bet on vision LLMs replacing OCR yet.** On 30 long, image-heavy PDFs, the native PDF (vision LLM) path lost to every OCR-based pipeline on accuracy and was the most expensive arm at $0.2552 per question. The OCR pipeline you already maintain is not obsolete.
+
+## Reproduce this benchmark
+
+Everything that produced these numbers is open source. The eval harness is its own package inside the SurfSense monorepo:
+
+- [`surfsense_evals/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals) — the harness root (extensible base classes, providers, cost ledger).
+- [`suites/multimodal_doc/parser_compare/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare) — the benchmark used in this post (prompts, ingest, runner).
+- [`core/arms/bare_llm.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py), [`native_pdf.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py), [`surfsense.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py) — the three arm implementations.
+- [`mmlongbench/grader.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py) — the deterministic format-aware grader.
+- [`scripts/retry_failed_questions.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/retry_failed_questions.py) — the failed-only retry pass with exponential backoff.
+- [`scripts/compute_blog_extras.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/compute_blog_extras.py) — the McNemar pairwise tests, latency/token percentiles, and per-PDF heterogeneity.
+- [`scripts/compute_post_retry_accuracy.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/compute_post_retry_accuracy.py) — merges retry survivors back into the run and recomputes the headline numbers.
+
+A minimal end-to-end run looks like:
+
+```bash
+# 1. Clone, install (uv recommended)
+git clone https://github.com/MODSetter/SurfSense
+cd SurfSense/surfsense_evals
+uv sync --extra dev
+
+# 2. Configure provider keys (Azure DI, LlamaCloud, OpenRouter, SurfSense)
+cp .env.example .env
+$EDITOR .env
+
+# 3. Ingest the first 30 PDFs from MMLongBench-Doc into all parsers
+uv run python -m surfsense_evals.cli setup multimodal_doc \
+  --vision-llm anthropic/claude-sonnet-4.5
+uv run python -m surfsense_evals.cli ingest multimodal_doc \
+  --suite parser_compare --max-docs 30
+
+# 4. Run all six arms × all 171 questions
+uv run python -m surfsense_evals.cli run multimodal_doc \
+  --suite parser_compare --sample-per-doc 20 --concurrency 2
+
+# 5. Retry failures + compute final stats
+uv run python scripts/retry_failed_questions.py
+uv run python scripts/compute_post_retry_accuracy.py
+uv run python scripts/compute_blog_extras.py
+```
+
+The dataset itself is on [Hugging Face](https://huggingface.co/datasets/yubo2333/MMLongBench-Doc) and the [original GitHub repo](https://github.com/mayubo2333/MMLongBench-Doc) (NeurIPS 2024 D&B Spotlight, [paper](https://arxiv.org/abs/2407.01523)). Bring your own LLM provider; swap `anthropic/claude-sonnet-4.5` for `openai/gpt-4o`, `google/gemini-2.5-pro`, or any OpenRouter slug to repeat the experiment with a different model.
+
+If you find that the rankings shuffle on your own document set, we want to hear about it. Open an issue on [the SurfSense repo](https://github.com/MODSetter/SurfSense/issues) with the run artifacts and we will link your results from this post.
+
+The eval harness is open source and runs against any OpenRouter model, so you can re-run the same questions on `openai/gpt-4o`, `google/gemini-2.5-pro`, or whichever model you are evaluating for production. Wire your own RAG framework into the [`Arm` base class](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/base.py) (LangChain, LlamaIndex, Haystack, your own stack) and you can drop it into the same comparison without changing the rest of the pipeline.
+
+If you want a hosted way to try agentic RAG on your own PDFs without writing the harness yourself, [SurfSense](/free) is one option (it is the same agentic stack that powered the `surfsense_agentic` arm above).
diff --git a/surfsense_web/blog/content/no-login-ai-privacy-reality-check.mdx b/surfsense_web/blog/content/no-login-ai-privacy-reality-check.mdx
new file mode 100644
index 0000000000..79e199a708
--- /dev/null
+++ b/surfsense_web/blog/content/no-login-ai-privacy-reality-check.mdx
@@ -0,0 +1,289 @@
+---
+title: "How to Use Claude, ChatGPT, and Gemini Without Signing Up: A Plain-English 2026 Guide"
+description: "Where to use Claude, ChatGPT, Gemini, and other top AI models without making an account. Honest 2026 guide for casual users and developers, covering message caps, what each tool limits, and the privacy reality behind 'no login required'."
+date: "2026-05-15"
+image: "/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png"
+author: "SurfSense Team"
+authorAvatar: "/logo.png"
+tags:
+  - "AI Without Login"
+  - "Free AI Chat"
+  - "ChatGPT Without Account"
+  - "Claude Without Login"
+  - "Gemini Without Login"
+  - "Claude Incognito"
+  - "Duck.ai"
+  - "Brave Leo"
+  - "Self-Hosted AI"
+featured: false
+---
+
+> **TL;DR for skimmers**
+>
+> You don't need an account to use the best AI models in 2026. Here's the brand-by-brand answer:
+>
+> - **Want ChatGPT?** Open `chatgpt.com` in a fresh tab. Guest mode works, no signup, but the message cap is tight.
+> - **Want Claude?** `claude.ai` itself still wants an account, but Anthropic [shipped incognito mode on April 9, 2026](https://support.claude.com/en/articles/12260368-using-incognito-chats). For zero-account Claude, use [Duck.ai](https://duck.ai) (Claude Haiku 4.5, anonymized) or [Brave Leo](https://brave.com/leo/) inside the Brave browser.
+> - **Want Gemini?** Google requires a Google account on `gemini.google.com`. The closest no-signup path is an open-source aggregator like our own [SurfSense /free](/free), which lists Gemini among its model options without a Google sign-in.
+> - **Want all of them in one place?** Our open-source [SurfSense /free](/free) lets you pick from ChatGPT, Claude, Gemini, DeepSeek, Mistral, Llama, and a rotating list of other models with no account, and you get 500,000 free tokens to spend across any of them. [Duck.ai](https://duck.ai) and [Brave Leo](https://brave.com/leo/) are strong privacy-first alternatives.
+> - **Care about privacy too?** Most "no login" pages still log your IP and prompt content. The exceptions are Brave Leo, Duck.ai, and self-hosted models. Skip to [the privacy honest-talk](#the-privacy-honest-talk-no-login-is-not-the-same-as-anonymous) if that's the part you came for.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png"
+  alt="Side-by-side illustration of a browser with a 'No login required' badge and the same browser with hidden IP, fingerprint, and prompt-log data being inspected, showing that no login is not the same as no tracking."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+The promise of the search query "AI without login" is simple: someone wants to chat with a top model without making an account. Maybe they're trying it out for the first time. Maybe they don't want another email in their inbox. Maybe they care about privacy. Maybe they're recommending an AI to a class, a parent, or a coworker who isn't going to deal with a signup wall.
+
+Whatever the reason, the answer in 2026 is mostly: **yes, you can do this**, but the options are scattered across product pages, browser features, and a long tail of wrapper sites that look identical. This guide is the cleanest map we could draw, organised so a casual user gets value in the first three minutes and a developer or privacy-conscious reader can keep going deeper.
+
+We're going to cite primary sources for everything that touches privacy or product behavior, so you can verify (and so the article ages well as the products change).
+
+## Where to use each major AI model without an account
+
+Let's go brand by brand. For each one, what works, what the cap looks like, and where to go.
+
+### ChatGPT, with no OpenAI account
+
+The shortest answer: open [chatgpt.com](https://chatgpt.com) in a private/incognito tab. ChatGPT's guest mode lets you send a few prompts and get GPT-5-class output without making an account. There's no specific page to go to; the site detects you're not logged in and gives you a guest experience automatically.
+
+**What it gives you:** the same headline GPT-5 model that paying users start with, for short queries. The interface is the standard ChatGPT UI without the sidebar or chat history panel.
+
+**What it doesn't give you:** file uploads, the Code Interpreter, conversation history (refresh the page and your chat is gone), and no advanced features like custom GPTs or memory.
+
+**The catch:** the guest cap is around 10 messages per 5-hour rolling window on the headline GPT-5-class model ([per OpenAI's current behavior](https://www.smashingapps.com/how-to-use-chatgpt-without-creating-an-account/)). After that you're auto-downgraded to the lighter GPT-5 Mini variant with no hard limit, not blocked by a hard "please sign up" wall. So you can keep going indefinitely on the smaller model, you just can't keep using the headline one.
+
+If GPT-5 quality on a no-signup page matters more than going through OpenAI specifically, [Bing Copilot](https://copilot.microsoft.com) at `copilot.microsoft.com` runs on a GPT-5-class backend, also works without a Microsoft account, and tends to have a more generous cap because Microsoft monetises through Bing search instead of subscriptions.
+
+You can also use **[SurfSense /free](/free)** (full disclosure: this is our own open-source aggregator) which lists ChatGPT among its no-signup models. You get 500,000 free tokens to spend across any model on the page, which is meaningfully more than what guest mode gives you before the signup wall. The source code is on [GitHub](https://github.com/MODSetter/SurfSense) so the privacy and quota behavior is auditable, not just promised.
+
+### Claude, with as little friction as possible
+
+Anthropic does require an account on `claude.ai` itself, but the picture is much better than it was a year ago. There are now four legitimate paths to using Claude without going through the full signup-and-stay-logged-in experience.
+
+**Path 1: Anthropic's incognito mode (existing account required).** Launched April 9, 2026, available on every Claude plan from Free to Enterprise. Click the ghost icon in the upper-right when starting a new chat. The interface gets a black border and an "Incognito chat" label. The conversation is not saved to your chat history, not used by Claude's memory feature, and not used for training. Source: [Anthropic Help Center](https://support.claude.com/en/articles/12260368-using-incognito-chats). This is the right answer if you already have a Claude account and want a temporary, no-trace conversation.
+
+**Path 2: Duck.ai (no account at all).** [duck.ai](https://duck.ai) is DuckDuckGo's chat product. Pick "Claude 4.5 Haiku" from the model dropdown and start chatting. No signup, no email. DuckDuckGo proxies your request through their own servers, so Anthropic never sees your IP. We'll cover the full privacy mechanics [below](#the-privacy-honest-talk-no-login-is-not-the-same-as-anonymous). Per-session cap exists but there's no persistent quota.
+
+**Path 3: Brave Leo (no account, browser-side).** Install the [Brave browser](https://brave.com), open the sidebar, click the Leo icon, pick Claude Haiku from the model dropdown. No signup. Brave doesn't collect identifiers tied to you ([per their docs](https://brave.com/leo/)). The trade-off is that you have to use Brave as your browser, and you're limited to Haiku on the free tier (Sonnet and Opus require Brave Leo Premium at $14.99/month).
+
+**Path 4: Multi-model aggregator pages.** These wrap the Anthropic API and serve Claude responses without an account on Anthropic. The pick we'd recommend (with the obvious disclosure that we made it) is **[SurfSense /free](/free)**: it lists Claude alongside ChatGPT, Gemini, DeepSeek, Mistral, and Llama in one chat UI, the source code is open on [GitHub](https://github.com/MODSetter/SurfSense) so the privacy and quota behavior is verifiable, and the 500K free token quota is shared across any model you pick (so you can spend the budget on Claude if that's what you came for). The closed-source alternatives ([HIX.AI](https://hix.ai/claude), [EaseMate](https://www.easemate.ai/ai-chat/ask-claude), [Eye2.ai](https://eye2.ai), [NoteGPT](https://notegpt.io/ai-models/claude-sonnet-4-5)) work too, but quality and message limits vary widely.
+
+For the developer-specific paths (Claude Code with Bedrock or Vertex AI authentication, the Claude for Open Source program), see Anthropic's [Claude Code authentication docs](https://code.claude.com/docs/en/authentication).
+
+### Gemini, the awkward one
+
+Google requires a Google account on `gemini.google.com` and there is no first-party guest mode. If you don't already use a Google account, this is the model with the most signup friction.
+
+The realistic options:
+
+- **Sign in with a throwaway Google account.** Imperfect but functional, and Google's free tier on Gemini is genuinely good (15 GB of Drive storage, deep web research, native voice).
+- **Use an aggregator that wraps Gemini.** **[SurfSense /free](/free)** (disclosure: ours, open source on [GitHub](https://github.com/MODSetter/SurfSense)) lists Gemini among its model options and forwards requests to Google's API behind the scenes, so the user-facing chat works with no Google sign-in and no Google identity tied to your prompts. Quality matches the underlying Gemini API tier we pay for. Other wrapper pages do the same thing but few publish their privacy or quota behavior; ours is auditable in source, and the 500K-token quota is shared across any model on the page (not just Gemini).
+- **Pick a different model.** If "I want long-context plus web search without signing in" is the actual need, Brave Leo with Claude Haiku, Bing Copilot, or Perplexity (more on that below) are no-signup substitutes for the most common Gemini use cases.
+
+### Multiple models in one tab
+
+If you want to compare answers from Claude, GPT, Gemini, and others without juggling four browser tabs and four signup walls, three products do this without an account:
+
+- **[SurfSense /free](/free)** (disclosure: ours, open source on [GitHub](https://github.com/MODSetter/SurfSense)) gives you a rotating list of models from OpenAI, Anthropic, Google, DeepSeek, Mistral, and Meta in one chat UI, no account, with **500,000 free tokens shared across any model** on the page (meaningfully more than the per-session caps you'll hit on Duck.ai or Brave Leo before they make you wait). The model lineup updates as new models ship, and the wrapper-layer code is on GitHub so the no-account session, the no-database-storage claim, and the quota behavior are all auditable. Trade-off: when you hit the 500K, you either sign up ($5 free credit, then $1 per $1 of model cost) or self-host.
+- **[Duck.ai](https://duck.ai)** has Claude 4.5 Haiku, Llama 4 Scout, Mistral Small 3 24B, GPT-4o mini, GPT-5 mini, and gpt-oss-120b on the free tier. The big win is genuine IP anonymisation; the trade-off is that everything is the cheaper tier of each model.
+- **[Brave Leo](https://brave.com/leo/)** has Claude Haiku, Llama 3.1 8B, Mixtral, and Qwen 3 14B in the Brave browser sidebar. Best privacy story of the three, but you have to use Brave as your browser.
+
+### Honourable mentions
+
+- **Perplexity** at [perplexity.ai](https://www.perplexity.ai) lets you do search-flavored AI lookups without an account. The interface is built around citations, which is great for fact-finding and not great for chat-style writing or code.
+- **DeepAI** at [deepai.org/chat](https://deepai.org/chat) and **HotBot AI** are older standalone chat products. They work, the model quality is below the frontier, and the privacy story is unremarkable.
+- **HIX.AI, EaseMate, NoteGPT, TalkAI, ChatBot Chat App** are wrapper sites that all do roughly the same thing: front a paid model API and serve it for free without an account. Use them if convenience is the only goal, with the caveat that none of them publicly commit to not retaining your prompts (more on that below).
+
+## What you give up by not making an account
+
+There's a real tradeoff between "skip the signup" and "have a good account experience". For most casual use it's worth it. For some workflows it isn't.
+
+Things you lose without an account:
+
+- **Saved chat history.** Most no-signup paths don't persist your conversation. Refresh the page and it's gone. (Brave Leo is an exception: it stores chat history locally on your device, so it survives between sessions but never leaves your machine. Duck.ai does the same with its "Recent Chats" feature.)
+- **File uploads, in most cases.** ChatGPT guest mode and Claude incognito do not let you attach a PDF or image. Duck.ai and Brave Leo are limited too. Aggregators vary.
+- **Tighter message limits.** ChatGPT guest mode caps fast. Claude on `claude.ai` lets account holders send 30-50 messages per 5-hour rolling window; guest paths are usually tighter.
+- **Cross-device continuity.** No signup means no syncing your conversations from laptop to phone.
+- **Power features.** No memory, no custom instructions, no Code Interpreter, no Anthropic Projects, no Gemini Workspace integrations.
+
+Things you gain:
+
+- **No marketing emails.** You're not on a list. You won't get the "you haven't tried our new feature!" emails or the retargeting ads that follow you around the web.
+- **No persistent identity.** The provider sees a session, not a user. Your prompts aren't tied to your purchase history, your YouTube viewing habits, or any other product the provider runs.
+- **No risk of accidental cross-account leakage.** A coworker who borrows your laptop sees a fresh chat, not your private history.
+
+For a quick prompt or a one-off question, the gains usually win. For sustained work, expect to either tolerate the limits or eventually sign up.
+
+## The privacy honest-talk: "no login" is not the same as "anonymous"
+
+Here's the part that surprises most readers, and the one most listicles dodge: skipping the signup does not make your AI usage anonymous. It removes one identifier (your account) but leaves several others in place. Whether that matters depends entirely on what you're pasting into the chat box.
+
+The honest categorisation of no-signup AI looks like this.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-02-three-tier-pyramid.png"
+  alt="A 3-tier pyramid showing: bottom tier 'genuinely anonymized' with Brave Leo, Duck.ai, and self-hosted Ollama; middle tier 'no account but provider logs' with chatgpt.com guest, Claude incognito, and Bing Copilot; top tier 'wrapper sites that also log you' with HIX, EaseMate, NoteGPT examples."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+**Anonymised tools** (the smallest group). The provider commits to not logging your IP and not retaining your prompts. There are essentially three options today:
+
+- **Brave Leo.** Per [Brave's product page](https://brave.com/leo/): "We do not collect identifiers that can be linked to you (such as IP Address), and no personal data is retained." Translation: the model provider that powers Leo (Anthropic, Mistral, Meta) sees the prompt content but not your IP.
+- **Duck.ai.** Per [DuckDuckGo's documentation](https://duckduckgo.com/duckduckgo-help-pages/duckai/): DuckDuckGo "anonymizes" your request by stripping your IP and replacing it with their own before forwarding to the underlying model. They don't store prompts, don't train on your data, and your "Recent Chats" are stored locally on your device.
+- **Self-hosted open-weights models.** Llama 3.3, Qwen 2.5, Mistral Large, DeepSeek R1 running locally on your laptop via [Ollama](https://ollama.com) or [LM Studio](https://lmstudio.ai). The model never leaves your machine. The only entity that sees your data is you. The "How to" is in [the section below](#for-developers-and-the-privacy-serious-self-host-in-5-minutes).
+
+**Account-free first-party tools.** No signup required, but the provider running the model still sees your IP, your prompt text, and your session metadata. The standard examples:
+
+- **`chatgpt.com` guest mode** is OpenAI logging your prompts directly. Their privacy policy applies. PCMag's [ChatGPT Tracks More Than You Think](https://www.pcmag.com/explainers/chatgpt-tracks-more-than-you-think-how-to-lock-down-your-privacy) is a good summary of what this looks like.
+- **Claude incognito mode** is Anthropic logging your prompts, with the specific feature carve-out that incognito chats aren't saved to your visible history, aren't used by Claude's memory, and aren't used for training. **The important caveat**: per [Anthropic's docs](https://support.claude.com/en/articles/12260368-using-incognito-chats), incognito chats "are retained for either 30 days (default), or longer in accordance with your organization's custom data retention setting (available for Enterprise plans)." Not in your history, not actually deleted either.
+- **Bing Copilot** is Microsoft logging your prompts. Standard Microsoft privacy policy applies.
+
+**Wrapper sites** that don't require an account but ALSO log you on top of the underlying provider's logging. Most of the "free Claude!" and "free GPT!" pages from the search results are in this group. They serve a real model, but they're a server in the middle that has its own logs, and most of them don't publicly commit to not retaining your prompts. Convenient. Not private.
+
+When you're evaluating any "free [Brand] without login" page, the question to ask is: *does their privacy page explicitly say they don't store prompts and don't pass identifying information to the model provider?* If the answer is just "no signup required!" with nothing about logging, you're in the wrapper category.
+
+**Open-source wrappers** are a half-step better than the closed ones, and worth calling out as a separate category. Our own [SurfSense /free](/free) is in this bucket: the source code that handles your prompt is on [GitHub](https://github.com/MODSetter/SurfSense), so the claims about anonymous sessions, no persistent identity, and no prompt retention are auditable rather than promised. That doesn't make it equivalent to Brave Leo or Duck.ai (the model provider behind /free still receives the prompt content), but it does mean you can verify the wrapper layer doesn't add its own logging on top. If you're going to use a wrapper anyway, prefer one whose code you can read.
+
+### What every public AI provider logs (with citations)
+
+This is the reference table the wrapper sites don't include. Sources are linked in each cell that needs one.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-03-what-providers-log-conceptual.png"
+  alt="A comparison illustration showing two flows: a 'standard no-login AI' lane where data flows past IP, prompt, and session logging stages into a tall stack of provider logs; and an 'anonymized AI' lane where a shield strips the IP before the data reaches a tiny single-sheet record."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+| Service | IP logged | Prompt content logged | Used for training | Retention |
+|---|---|---|---|---|
+| ChatGPT (guest, no login) | Yes | Yes | OpenAI may use guest prompts for service improvement | Per OpenAI privacy policy |
+| Claude (logged-in, normal) | Yes | Yes | No on Free/Pro/Max per [Anthropic privacy center](https://privacy.claude.com); stricter on Team/Enterprise | Per account settings |
+| Claude (incognito) | Yes | Yes (but not in your history) | No, per [Anthropic docs](https://support.claude.com/en/articles/12260368-using-incognito-chats) | 30 days default, longer for Enterprise |
+| Gemini (Google account) | Yes | Yes | Per Google account settings | Per Google account settings |
+| Bing Copilot (no account) | Yes | Yes | Per Microsoft privacy policy | Per Microsoft privacy policy |
+| Brave Leo | **No** ("no identifiers linkable to you" per [brave.com/leo](https://brave.com/leo/)) | Forwarded to model, not retained by Brave | No, per Brave docs | None per Brave docs |
+| Duck.ai | **No** (DuckDuckGo strips IP before forwarding) | Forwarded to model, not retained by DuckDuckGo | No, per [DuckDuckGo docs](https://duckduckgo.com/duckduckgo-help-pages/duckai/) | Local-only chat history |
+| Self-hosted Ollama | n/a (local only) | n/a (local only) | n/a | None unless you save it |
+| Wrapper sites (HIX, EaseMate, NoteGPT, etc.) | Yes (by wrapper) + downstream provider | Yes (by both wrapper and provider) | Depends on wrapper TOS | Depends on wrapper TOS |
+
+A few takeaways from the table that the existing search results never make explicit:
+
+- **Brave Leo and Duck.ai are the only mainstream products that publicly commit to not logging your IP.** They achieve this with a server-side proxy (DuckDuckGo) or a privacy-first browser pipeline (Brave). The model provider never sees your real IP; it sees the proxy's.
+- **Claude's incognito mode is honest about its limits.** It's still retained for 30 days. That's a thoughtful safety design, not an anonymity promise.
+- **Wrapper sites add a layer of logging on top of the model provider's logging.** That's strictly worse for privacy than going to the model provider's first-party guest mode, even if it removes the account requirement.
+
+## Pick the right tool for what you're actually doing
+
+The right tool depends on what you're pasting into the chat box. Three rough buckets cover almost every reader.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-04-decision-tree-by-use-case.png"
+  alt="A decision tree showing which no-login AI tool to pick based on what you are pasting: casual content can use any tool, sensitive code or customer data needs anonymized tools or self-hosting, and regulated data needs enterprise contracts or self-hosting only."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+### If you're a casual user with non-sensitive prompts
+
+Homework explanations, recipe ideas, brainstorming, drafts of emails, quick summaries of public articles, code from public GitHub repos. Your prompt isn't interesting to anyone with subpoena power. Privacy isn't really the constraint; convenience and model quality are.
+
+**What to use:** anything in this guide. Pick by which model you want.
+
+- For ChatGPT-like answers, `chatgpt.com` guest mode or Bing Copilot.
+- For Claude, [Duck.ai](https://duck.ai) (Haiku, anonymised) or an aggregator like [SurfSense /free](/free) which lists Claude among its model options.
+- For multi-model comparison in one tab, [SurfSense /free](/free) or [Duck.ai](https://duck.ai).
+
+### If you work with code, internal docs, or customer data
+
+Code that includes API keys, internal class names, or business logic; documents your employer hasn't published; conversations that involve customer details. The IP and the prompt content matter here.
+
+**What to use:** the anonymised category only. Brave Leo, Duck.ai, or a self-hosted model. If you must use a first-party guest mode, redact ruthlessly before pasting. Avoid wrapper sites for this kind of prompt.
+
+### If you're handling regulated or contractually sensitive data
+
+PHI under HIPAA, attorney-client privileged matter, financial data under SOX or GLBA, EU personal data under GDPR, anything covered by an NDA. The legal exposure is severe and the answer is not a free chat product.
+
+**What to use:** self-host an open-weights model on hardware you control, or use an enterprise contract with a BAA / DPA in place (Anthropic Enterprise, OpenAI Enterprise, Google Cloud Vertex AI). Public free chat is not an acceptable channel here, regardless of whether it asks for a login.
+
+## For developers and the privacy-serious: self-host in 5 minutes
+
+This section is for technical readers. If you're a casual user, you can skip it; the answers above are enough.
+
+Self-hosting an open-weights model is the only path where "private" means private in the strict sense. Your prompt content never leaves your machine. There is no provider, no logging, no retention, no training-on-your-data risk. And it's much easier than it used to be.
+
+1. Install [Ollama](https://ollama.com) (one-click installer for macOS, Windows, Linux).
+2. Open a terminal and run:
+   ```bash
+   ollama run llama3.3:8b
+   ```
+3. The first run downloads the model (about 5 GB). Subsequent runs start instantly.
+4. Type a prompt at the `>>>` prompt. You're chatting with a local model.
+
+Quality is genuinely competitive for most casual use. Llama 3.3 8B handles writing, summarisation, and general Q&A well. For better quality, swap to `qwen2.5:14b` or `mistral-small:24b` if you have 16+ GB of RAM. For coding-specific work, `deepseek-coder-v2` is the current open-weights leader.
+
+If you want a graphical interface instead of a terminal, install [LM Studio](https://lmstudio.ai). For a hosted-but-self-controlled experience, the open-source SurfSense stack on the [GitHub repo](https://github.com/MODSetter/SurfSense) gives you the same chat UI with the same model options, running on your own servers.
+
+For the deeper performance trade-offs between local and frontier models on real document Q&A, our [agentic RAG vs long-context LLMs benchmark](/blog/agentic-rag-vs-long-context-llms-benchmark) has the numbers.
+
+## FAQ
+
+### Can I use ChatGPT without an account?
+
+Yes. Open `chatgpt.com` in a private tab and you'll get guest mode automatically. You get around 10 messages on the headline GPT-5-class model per 5-hour rolling window, after which you're auto-switched to a lighter GPT-5 Mini variant with no hard limit (not blocked by a hard signup wall). No file uploads, no chat history, no Code Interpreter, but for short queries the model quality is the same as the paid first-tier experience.
+
+### Can I use Claude without an account?
+
+Not on `claude.ai` itself, which still requires signup. The closest no-account paths are [Duck.ai](https://duck.ai) (Claude Haiku 4.5, free, anonymised), [Brave Leo](https://brave.com/leo/) (Claude Haiku in the Brave browser sidebar), and aggregator pages like our open-source [SurfSense /free](/free), which lists Claude among the models you can pick with no Anthropic account and a 500K free token budget shared across the whole page.
+
+### Can I use Gemini without a Google account?
+
+Not on Google's own product pages. Aggregator sites like our open-source [SurfSense /free](/free) include Gemini among the models you can pick and forward requests to the Gemini API behind the scenes, so the user-facing chat works without a Google sign-in. If you specifically want what Gemini is best at (long-context, web research, Workspace integration), there isn't a perfect Google-free substitute, though Brave Leo with Claude Haiku and Perplexity cover most use cases.
+
+### What is Claude incognito mode?
+
+A feature [Anthropic launched on April 9, 2026](https://support.claude.com/en/articles/12260368-using-incognito-chats), available on every Claude plan from Free to Enterprise. Click the ghost icon when starting a new chat. The conversation isn't saved to your history, isn't pulled into Claude's memory, and isn't used for training. It still requires an existing Claude account, and the conversation is retained for 30 days for safety. Useful if you have a Claude account and want a temporary one-off chat.
+
+### Is using AI without an account actually private?
+
+Not by itself. "No login" removes one identifier (your account), but the model provider still sees your IP and the content of your prompt. For actual anonymity, use Brave Leo, Duck.ai, or a self-hosted open-weights model. The privacy section above explains the categories in detail.
+
+### Does ChatGPT guest mode keep my data private?
+
+Less than logged-in mode, but not by much. OpenAI still logs your IP and prompt content. Logged-out users have fewer opt-out controls than logged-in free-tier users. If your prompt is something you'd be uncomfortable seeing on someone else's screen, treat ChatGPT guest mode as recorded.
+
+### What's the most private AI chatbot in 2026?
+
+A self-hosted open-weights model running locally via Ollama or LM Studio. Among hosted options, Brave Leo and Duck.ai are the two that publicly commit to not logging your IP and not retaining your prompts.
+
+### Are wrapper sites that say "free Claude" or "free GPT" safe?
+
+They're convenient, not private. Most "free [Brand] without login" pages are servers that wrap the underlying API and serve responses for free. They don't ask you to sign up, but they're a third party in the middle that has their own logs, on top of the model provider's logs. Use them for casual prompts you'd be fine with showing up in someone else's database.
+
+### What's the difference between Duck.ai and a regular ChatGPT wrapper?
+
+Duck.ai is the only mainstream chat product that publicly documents an end-to-end anonymisation model: DuckDuckGo proxies your request, strips your IP, doesn't retain prompts, and doesn't train on your data. Standard wrapper sites do none of these things. They're just "no signup form".
+
+### Is Brave Leo really free with no login?
+
+Yes. Per [Brave's documentation](https://brave.com/leo/), no account or signup is required for the free tier, and Brave doesn't collect identifiers tied to you. The free tier includes Claude Haiku, Llama 3.1 8B, Mixtral, and Qwen 3 14B. Premium ($14.99/month) adds Claude Sonnet 4 and DeepSeek R1.
+
+### How can a developer avoid the browser login flow entirely?
+
+For Anthropic specifically: configure Claude Code to authenticate via Amazon Bedrock, Google Vertex AI, or Microsoft Foundry per the [Claude Code authentication docs](https://code.claude.com/docs/en/authentication). No browser login required, IAM-only auth. For OpenAI and Google, the standard answer is API keys with cloud-provider IAM and IP allow-listing. For full local control, the [self-hosting section](#for-developers-and-the-privacy-serious-self-host-in-5-minutes) covers Ollama and LM Studio.
+
+## The bottom line
+
+The question "can I use a top AI model without an account" has a much better answer in 2026 than it did a year ago. Anthropic added incognito mode, Duck.ai added free Claude Haiku with real anonymisation, Brave Leo grew into a credible browser-side option, and the multi-model aggregators got cheaper to run.
+
+If you just want to chat: pick the brand you want, use the path from the relevant section above, and be done. If you care about privacy: stick to Brave Leo, Duck.ai, or a self-hosted model, and remember that "no signup" alone doesn't make a tool anonymous. If you're handling sensitive or regulated data: don't use a free chat product at all, use an enterprise contract or run the model yourself.
+
+And if you want a single no-account chat hub that lets you pick from ChatGPT, Claude, Gemini, DeepSeek, Mistral, Llama, and a rotating list of others under one URL with the wrapper-layer code open on [GitHub](https://github.com/MODSetter/SurfSense), that's exactly what we built **[SurfSense /free](/free)** for. The pitch: 500,000 free tokens shared across any model on the page, no account, anonymous sessions not stored in any database, and the model lineup updates whenever new models ship. It's not the right answer for every reader (if you need IP anonymisation specifically, Brave Leo or Duck.ai is still the better fit), but it is a genuine, honest pick, and we'd rather list it confidently than pretend we don't make it. Whichever you choose, the goal of this guide was to give you the honest map first.
diff --git a/surfsense_web/changelog/content/2026-03-31.mdx b/surfsense_web/changelog/content/2026-03-31.mdx
new file mode 100644
index 0000000000..de23952530
--- /dev/null
+++ b/surfsense_web/changelog/content/2026-03-31.mdx
@@ -0,0 +1,132 @@
+---
+title: "SurfSense v0.0.14 - Dropbox, OneDrive, Agent Approvals, Reports & Self-Hosted Docker"
+description: "SurfSense v0.0.14 is a massive release: new Dropbox and Microsoft OneDrive connectors, human-in-the-loop approvals across Gmail, Google Calendar, Jira, Confluence, Linear, and Notion, a full report generation system with multi-format export, a faster real-time sync engine, a new desktop app, one-command self-hosted Docker install, and built-in private web search."
+date: "2026-03-31"
+tags: ["Connectors", "Approvals", "Reports", "Desktop App", "Docker", "Dropbox", "OneDrive"]
+version: "0.0.14"
+---
+
+## What's New in v0.0.14
+
+This release pushes SurfSense further as an open source, privacy-first, self-hostable knowledge platform and a serious **ChatGPT alternative** for teams. It ships two major connectors (Dropbox and Microsoft OneDrive), a new approval system for sensitive AI actions, a full report generation and export pipeline, a brand new desktop app, a one-command self-hosted installer, and a faster real-time sync backbone.
+
+### New Connectors
+
+#### Dropbox Integration
+
+- **Full Dropbox Connector**: Connect Dropbox in one click, then pick exactly which folders and files SurfSense should index.
+- **Incremental Sync**: Only re-index what actually changed, so syncs stay fast and cheap even on huge Dropbox workspaces.
+- **Dropbox Paper Support**: Dropbox Paper docs are now exported as clean, readable text so AI answers stay accurate.
+- **Write Actions in Chat**: Ask SurfSense to create or delete Dropbox files directly from chat, with a confirmation step before anything is changed.
+- **New Docs**: Dropbox connector documentation added to the site.
+
+#### Microsoft OneDrive Integration
+
+- **OneDrive Connector**: A native Microsoft OneDrive connector with secure sign-in and a built-in file picker.
+- **Folder Management UI**: Browse and select OneDrive folders visually, with proper handling of different file types.
+- **Create & Delete Files from Chat**: AI agents can create and delete OneDrive files for you. Newly created documents are auto-converted to Word format when appropriate, and expired sign-ins are handled gracefully.
+- **Instant Knowledge Base Sync**: Any OneDrive file created through SurfSense is synced into your knowledge base right away, so it's immediately searchable.
+- **New Docs**: OneDrive connector documentation added to the site.
+
+### Human-in-the-Loop Approvals
+
+SurfSense now treats every sensitive AI action as an explicit, reviewable step. This is the right default when you're letting an AI agent send emails, create calendar events, or edit tickets on your behalf.
+
+- **Unified Approval Flow**: One consistent approval experience across every connector, with built-in deduplication so you never get prompted twice for the same action.
+- **Rich Approval Card**: Edit anything before approving, including dates, times, email recipients, and formatted content (for Confluence pages).
+- **Approvals Across Connectors**: Gmail (drafts and updates), Google Calendar (events), Google Drive, Jira (issue creation with a direct link in the response), Confluence (page creation with a direct link), Linear (write actions), and Notion.
+- **Graceful Re-authentication**: When a connector's sign-in expires, SurfSense now asks you to re-authenticate inline without breaking the conversation.
+
+### Reports & Multi-Format Export
+
+- **New Reports System**: Generate versioned, editable Markdown reports from any chat. Edit them in a full rich-text editor with save, leave, and read-only viewing modes.
+- **Public Report Links**: Share reports publicly with a clear indicator and one-click markdown download.
+- **Multi-Format Export**: Export reports and documents to **PDF, Word (DOCX), HTML, LaTeX, EPUB, OpenDocument (ODT), and plain text**.
+- **Smarter Follow-Ups**: Chat now remembers recent report context, so follow-up questions feel natural and stay grounded in what you were just working on.
+- **Math & Code Rendering**: Markdown rendering now supports beautiful math equations and syntax-highlighted code out of the box.
+
+### Desktop App
+
+- **New SurfSense Desktop App**: First release of the SurfSense desktop client.
+- **Quick Ask Panel**: A floating mini-window opens at your cursor, doesn't steal focus from your current app, and can grab selected text and paste answers back for you.
+- **Global Shortcuts**: Keyboard shortcuts that feel right on each OS, with press-Escape-to-hide and smart positioning that stays on-screen.
+- **Cross-Platform**: A non-focus-stealing floating panel on macOS with a matching experience on Windows and Linux.
+
+### Faster Real-Time Sync
+
+- **Upgraded Sync Engine**: Moved to a faster real-time sync engine behind the scenes, so the inbox, documents, comments, chats, and connector status all update instantly.
+- **Production-Ready**: Self-hosted deployments get a dedicated sync service with Docker configuration and CI workflows included.
+
+### Self-Hosted Docker Installation
+
+- **One-Command Installer**: New Docker configuration and install script for Linux and macOS, plus a **Windows install script**.
+- **Background Workers Included**: Background indexing services are now part of the default Docker setup, so documents keep processing reliably without extra setup.
+- **Automated Releases**: Each release now builds official multi-platform images, tagged cleanly for every version, and ready to use with auto-update tools.
+- **Easy Migration**: An automatic migration script helps users move off the old all-in-one deployment.
+- **Stripe Config**: Optional Stripe configuration added for paid self-hosted tiers.
+
+### Web Search & Knowledge Base
+
+- **Private Web Search (SearxNG)**: Built-in private web search so AI answers can reference the live web without sending queries to third-party search providers.
+- **Citations with Hover Previews**: New citation cards with hover popovers on desktop and a dedicated drawer on mobile.
+- **Date-Filtered Knowledge Base Search**: Narrow knowledge base searches by time range for more precise answers.
+- **YouTube Playlist Support**: Index entire YouTube playlists, not just single videos.
+- **Faster Indexing**: Documents now index significantly faster thanks to batched embedding under the hood.
+
+### Faster Connectors
+
+- **Parallel Indexing**: Confluence, Jira, Linear, Notion, Gmail, Google Calendar, and Google Drive now index files in parallel, noticeably faster than before.
+- **Google Drive Upgrades**: Parallel file downloads, native Excel parsing, streaming downloads, and the ability to index just the files you pick.
+- **Skipped File Tracking**: Google connectors now clearly surface any files or messages that were skipped, so you always know what made it in.
+
+### UI & Mobile
+
+- **Mobile-First Polish**: Mobile citation drawer, long-press actions in chats and documents, a responsive documents sidebar, and a mobile-friendly onboarding tour.
+- **Reworked Composer**: Tool actions are grouped into a cleaner menu with better icons, plus a helpful "connect tools" banner.
+- **Settings & Team**: New tabbed user settings page (profile + API keys), team roles management with pagination, and a search space settings dialog.
+- **Right Panel & Docked Sidebar**: A tabbed Sources/Report panel with smooth transitions, plus an optional docked documents sidebar.
+- **Community Prompts**: Public prompt library with copy support, inline share toggles, and a see-more/less toggle for long prompts.
+- **New Homepage**: Smooth scrolling, a use-cases grid, an updated walkthrough hero, a GitHub stars badge, and a new carousel for AI-generated video.
+- **More Power**: Bulk document deletion, folder creation and organization, document metadata viewer (Ctrl+Click), and document preview right inside the sidebar.
+
+### New LLM Provider
+
+- **MiniMax**: Complete MiniMax LLM integration joins the growing list of supported models.
+
+<Accordion type="multiple" className="w-full not-prose">
+  <AccordionItem value="item-1">
+    <AccordionTrigger>Bug Fixes</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      <ul className="list-disc space-y-2 pl-4">
+        <li>Fixed citations for live connectors</li>
+        <li>Fixed Google Drive handling of failed documents and unsupported file types</li>
+        <li>Fixed Dropbox path retrieval and error handling during indexing</li>
+        <li>Fixed a build failure in the roles manager page</li>
+        <li>Re-enabled legacy AI image handling so older chats still render correctly</li>
+        <li>Fixed sign-in state syncing between the desktop app and web app</li>
+        <li>Fixed nested button accessibility issues across the roles manager, link toolbar, and hero section</li>
+        <li>Fixed several memory leaks (onboarding tour, Spotlight, register page)</li>
+        <li>Fixed Google Calendar event update behavior</li>
+        <li>Smoother client-side navigation in several places</li>
+      </ul>
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="item-2">
+    <AccordionTrigger>Performance & Reliability</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      <ul className="list-disc space-y-2 pl-4">
+        <li>Large document migrations now run in batches to reduce memory usage</li>
+        <li>Smoother streaming in chat thanks to batched UI updates</li>
+        <li>Rate limiting now falls back gracefully when the cache is unavailable</li>
+        <li>Better error tracking and referral attribution</li>
+        <li>Smaller page bundles for the web app, with heavy editors loaded only when needed</li>
+        <li>End-to-end tests for the document upload pipeline</li>
+        <li>Health check endpoint and safer default rate limits</li>
+      </ul>
+    </AccordionContent>
+  </AccordionItem>
+</Accordion>
+
+v0.0.14 is one of the largest SurfSense releases to date. If you're looking for a **NotebookLM alternative** or comparing **free alternatives to ChatGPT** that you can actually self-host, this release is built for you.
+
+SurfSense connects all your knowledge sources in one place.
diff --git a/surfsense_web/changelog/content/2026-04-08.mdx b/surfsense_web/changelog/content/2026-04-08.mdx
new file mode 100644
index 0000000000..080a03e3c2
--- /dev/null
+++ b/surfsense_web/changelog/content/2026-04-08.mdx
@@ -0,0 +1,111 @@
+---
+title: "SurfSense v0.0.15 - Watch Local Folders, Vision Autocomplete & New Document Pipeline"
+description: "SurfSense v0.0.15 turns your laptop into a living knowledge base: watch any local folder and keep it auto-synced, a vision-powered quick-assist autocomplete in the desktop app, a brand new document processing pipeline with Docling, LlamaCloud, Unstructured, and Azure Document Intelligence, full document version history, and per-user page limits across every connector."
+date: "2026-04-08"
+tags: ["Desktop App", "Local Folder", "Document Pipeline", "Vision AI", "Version History", "Azure Document Intelligence"]
+version: "0.0.15"
+---
+
+## What's New in v0.0.15
+
+This release continues to push SurfSense as a fully self-hostable, open source **ChatGPT alternative** and **NotebookLM alternative** for teams who care about where their data lives. v0.0.15 is all about **bringing your local files into the knowledge base effortlessly** and making the SurfSense desktop app a first-class always-on assistant.
+
+### Watch Local Folder
+
+Your files already live in folders. Now SurfSense can watch them.
+
+- **Real-Time Folder Sync**: Pick any folder on your computer, and SurfSense automatically keeps it indexed as files are added, edited, or removed.
+- **Nothing Gets Lost**: Even across restarts or network blips, pending file changes are safely queued until they're fully processed.
+- **Smart Batching**: Many files at once? SurfSense now processes them in smart batches with progress feedback, so large folders stay responsive.
+- **Clean Folder Cleanup**: Deleted or emptied folders are cleaned up automatically in the background.
+- **Fast First Sync**: Existing folders seed their file timestamps in a single pass, so the first sync is both fast and correct.
+- **Upload-Based Sync Too**: Don't want the desktop watcher? Upload a folder from the web app and your folder structure is preserved automatically.
+- **Local Files as First-Class Documents**: Locally-watched files show up clearly in the documents list as their own category.
+
+### Document Version History
+
+- **Full Version History**: Every document now keeps a versioned history. View, compare, and restore older versions directly from the documents UI.
+- **Restore + Reindex**: Restoring an old version automatically triggers reindexing so search and chat results stay accurate.
+- **Built-In**: Versioning is automatic. You don't have to configure anything.
+
+### New Document Processing Pipeline
+
+We've rebuilt how SurfSense turns raw files into searchable knowledge.
+
+- **Pluggable Parsers**: Choose the best extractor for each document with built-in support for **Docling**, **LlamaCloud**, **Unstructured**, and plain text. Configure it globally or per connector.
+- **Azure Document Intelligence**: Added as a premium parsing option for high-accuracy PDF and scanned document parsing, available per upload.
+- **Audio Transcription**: Upload audio files and SurfSense will transcribe them for you.
+- **Fast Path for Common Files**: CSV, TSV, HTML, and XHTML files now skip heavier parsing and convert directly, cutting indexing time.
+- **Smart Routing**: Each file is automatically routed to the parser best suited for it.
+- **Vision Model Is Opt-In**: Vision models are now opt-in per upload/connector, with safer size limits and better fallbacks when an image can't be processed.
+
+### Vision-Powered Quick Assist (Desktop App)
+
+The SurfSense desktop app becomes a serious always-on **AI like ChatGPT** that actually sees your screen, without leaving your current window.
+
+- **Screenshot-Based Autocomplete**: Press a shortcut and SurfSense captures a screenshot, sends it to a vision model, and returns context-aware suggestions right where you're typing.
+- **Multi-Suggestion UI**: The suggestion popup now offers up to **3 options** to pick from, with clean cards and quick-assist detection.
+- **Knowledge Base Grounding**: Suggestions are grounded in your connected knowledge base, not just generic model output.
+- **macOS Permission Onboarding**: A clearer onboarding page walks macOS users through the permissions the app needs, only when they're actually needed.
+- **Switch Search Spaces from the Overlay**: Change the active search space without opening the main app.
+- **General Assist**: A new general-purpose assist mode with cleaner shortcut icons and descriptions.
+- **Stay Signed In Everywhere**: Sign-in is now synchronized between the desktop app and the web app.
+- **Vision Model Settings**: A dedicated Vision Models tab in Settings lets you pick and manage vision models, including a dynamic model list from OpenRouter.
+- **Smart Download Button**: The marketing site auto-detects your OS and offers the right desktop installer.
+
+### Connectors
+
+- **Dropbox Delta Sync**: A smarter incremental sync for Dropbox that preserves your place across re-authentication. Big Dropbox workspaces sync dramatically faster.
+- **Stronger Google OAuth**: Native Google sign-in now uses modern, best-practice security (PKCE).
+
+### Page Limits & Quotas
+
+- **Per-User Page Limits**: SurfSense now estimates and enforces per-user page limits before indexing begins, so usage stays predictable on cloud and self-hosted deployments.
+- **Consistent Across the App**: Local folder indexing respects the same page-limit checks as cloud connectors.
+
+### Document Upload
+
+- **Unified Picker**: Upload files and folders from one clean dropdown, with clearer size limits and properly localized messages.
+- **Better Processing Feedback**: Improved upload progress and processing indicators, plus a cleaner drag-and-drop area.
+
+### Performance
+
+- **Faster Startup**: Analytics and some dev tooling only load when needed, keeping the app lighter for end users.
+- **Leaner Pages**: Shared libraries now load smaller, more focused bundles.
+- **Smoother Self-Hosted Setup**: New installs of the real-time sync service now set themselves up automatically.
+
+<Accordion type="multiple" className="w-full not-prose">
+  <AccordionItem value="item-1">
+    <AccordionTrigger>Bug Fixes</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      <ul className="list-disc space-y-2 pl-4">
+        <li>Fixed several memory leaks when leaving pages during active work</li>
+        <li>Upload progress bar now clears correctly when leaving the upload screen</li>
+        <li>Smoother, less jittery thread list and chat refreshes</li>
+        <li>Smoother client-side navigation in several places</li>
+        <li>Improved responsiveness of the documents sidebar</li>
+        <li>Debounced search on the logs page to stop UI jank</li>
+        <li>Correct browser autocomplete hints on sign-in and register forms</li>
+        <li>Vision model is no longer triggered unnecessarily by the desktop folder watcher</li>
+        <li>Silenced noisy auto-updater errors in the desktop app</li>
+      </ul>
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="item-2">
+    <AccordionTrigger>Performance & Reliability</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      <ul className="list-disc space-y-2 pl-4">
+        <li>Faster local folder indexing thanks to smarter content hashing</li>
+        <li>Smoother sidebar open/close animations</li>
+        <li>Long lists now scroll more smoothly on the documents and logs pages</li>
+        <li>Search and filter inputs feel snappier while typing</li>
+        <li>Unified version numbering across the web app, backend, and desktop app</li>
+        <li>Event tracking now includes the platform (web/desktop) for better analytics</li>
+      </ul>
+    </AccordionContent>
+  </AccordionItem>
+</Accordion>
+
+If you've been looking for a tool that works across your cloud connectors **and** your local machine, v0.0.15 is exactly that. It's also one of the most compelling **NotebookLM alternatives** and **free ChatGPT alternatives** for private team use, with full control over your data, your parsers, and your models.
+
+SurfSense connects all your knowledge sources in one place.
diff --git a/surfsense_web/changelog/content/2026-04-16.mdx b/surfsense_web/changelog/content/2026-04-16.mdx
new file mode 100644
index 0000000000..7b4e712ca5
--- /dev/null
+++ b/surfsense_web/changelog/content/2026-04-16.mdx
@@ -0,0 +1,136 @@
+---
+title: "SurfSense v0.0.16 - Try SurfSense With No Login, Long-Term Memory, Token Tracking & AI File Sorting"
+description: "SurfSense v0.0.16 introduces a no-login online experience so anyone can try this free ChatGPT alternative in seconds, long-term user and team memory, a reworked model selector with full token usage tracking, AI-powered file sorting, one-click Knowledge Base export as ZIP, trusted MCP tools, and OpenRouter integration."
+date: "2026-04-16"
+tags: ["No Login", "Memory", "Token Usage", "AI File Sorting", "KB Export", "OpenRouter", "MCP"]
+version: "0.0.16"
+---
+
+## What's New in v0.0.16
+
+If you've ever wanted to try an **online ChatGPT alternative** or a **free AI chatbot like ChatGPT** without creating an account, v0.0.16 is for you. This release makes SurfSense instantly accessible with a **no-login** online experience, adds durable long-term memory for users and teams, brings full token-usage visibility to every chat, introduces AI file sorting, and ships a brand-new knowledge base export feature. It turns SurfSense into an even more practical **free alternative to ChatGPT** for real knowledge work.
+
+### No-Login Online Experience
+
+- **ChatGPT Online, No Login Required**: Try SurfSense online for free without signing up. First-time visitors can chat and test the **free ChatGPT alternative** experience instantly, with free tokens included.
+- **Anonymous-Friendly**: Sign-in is now optional on the public experience, while your authenticated data stays fully protected.
+- **Cleaner Behind the Scenes**: Removed leftover artifacts from the older sync system for a smoother experience.
+
+### Long-Term Memory for Users & Teams
+
+SurfSense agents now actually **remember**.
+
+- **Personal & Team Memory**: SurfSense can now remember durable information about you and your team across chats, stored cleanly as markdown you can read and edit yourself.
+- **Natural Greetings**: Memory now includes your display name and first-name context, so replies feel more personal.
+- **One Simple Memory Tool**: A single, unified "update memory" action handles personal and shared memory, with sensible size limits.
+- **Runs in the Background**: Memory extraction happens in the background, so it never slows down your chat.
+- **Smart Team Memory Policy**: Team memory only captures durable, relevant context and ignores noise, so shared memory stays useful.
+- **Memory Management UI**: A clean screen for viewing, pinning, and editing personal and team memory, with helpful empty states and limit warnings.
+- **Export Memory as Markdown**: Download your personal or team memory as a markdown file with one click.
+- **Clean Migration**: Old memory storage has been fully migrated to the new format.
+
+### Token Usage Tracking Across Chat
+
+Token visibility is now first-class.
+
+- **Usage Per Call**: SurfSense now tracks how many tokens every AI call uses.
+- **Complete Coverage**: Token usage is captured for both streaming and non-streaming AI responses.
+- **Inline in Chat**: See token usage for any message via a simple info dropdown on the message itself.
+- **Better Response Handling**: Cleaner response behavior and chat title generation built around token tracking.
+
+### Reworked Model Selector
+
+- **Redesigned Model Picker**: A cleaner UI with a new GitHub Models icon and per-agent model management for power users.
+- **OpenRouter Integration**: Hundreds of models (including **free Claude**-compatible routes) are now available through a single OpenRouter configuration.
+- **Clearer Model Display**: Message info now shows which model was used for every reply, with simpler settings and improved icons.
+
+### AI File Sorting
+
+- **AI Sort Button**: A new AI-powered sort in the documents table. Let the AI rank and group your documents semantically. It's a real quality-of-life win for anyone with a large knowledge base.
+
+### Knowledge Base Export
+
+- **Export Your Knowledge Base as ZIP**: One click downloads your entire knowledge base as a ZIP with your folder structure preserved.
+- **Folder-Level Export**: Right-click any folder to export just that folder.
+- **Processing Warning**: SurfSense warns you if documents are still processing, so you never ship an incomplete archive.
+- **Prominent Export Buttons**: Export is easy to find in both the documents toolbar and the sidebar, with a loading spinner during large exports.
+
+### Trusted MCP Tools
+
+- **Trust Sensitive Tools Once**: Mark specific MCP tools as trusted and skip repeated approval prompts for actions you've already decided are safe.
+- **Cleanup on Delete**: Removing an MCP connector now correctly clears related caches, so stale entries never hang around.
+
+### Document Upload & Processing Modes
+
+- **Pick Your Processing Mode**: Document uploads now let you choose how each file is parsed, with clearer error messages if something goes wrong.
+- **LlamaCloud v2**: Upgraded to the new LlamaCloud integration.
+- **Smarter Image Handling**: Uploaded images now automatically go through a vision model, with a document-parser fallback and clearer behavior when vision models aren't configured.
+
+### Code Execution Sandbox
+
+- **Cleaner Code Tool**: The built-in "execute code" tool is simpler and more reliable, with helpful hints when your code produces no output.
+- **Run in Parallel, Download Files**: The sandbox now supports multi-threaded execution and lets you download files produced by your code back into chat.
+- **No More Orphans**: Broken sandboxes are cleaned up before a replacement is created, so nothing piles up in the background.
+- **Hardened Security**: Tighter security settings, safer file syncing, and a fix for potential path traversal issues.
+
+### Unified Approvals
+
+- **One Approval Flow Everywhere**: Approvals across Confluence, Dropbox, Gmail, Google Calendar, Google Drive, Jira, Linear, Notion, and OneDrive are now consistent, with a shared approval UI.
+- **Smarter Deduplication**: Duplicate approval requests are filtered out, so you're never prompted twice for the same action.
+
+### Connectors & Desktop
+
+- **Obsidian Connector Visibility**: The Obsidian connector is now easier to discover and set up on desktop.
+- **Cross-OS Download Buttons**: The sidebar profile and a new download button automatically show the right desktop installer for your OS, in your language.
+
+### Marketing & SEO
+
+- **New Blog Section**: Added a dedicated blog section to the marketing site for deeper guides on **ChatGPT alternatives**, **Claude alternatives**, and **NotebookLM alternatives**.
+- **Improved SEO Metadata**: Updated metadata across key pages to emphasize privacy, data limits, and open source positioning.
+- **Pricing FAQ**: New FAQ section in pricing with dynamic questions.
+- **Pricing Clarity**: Updated pricing to clearly show the 500 pages included in the free plan.
+
+### UI Polish
+
+- **Safer Error Recovery**: A dedicated error boundary around the dashboard prevents isolated errors from crashing the full page.
+- **Animated Report Skeleton**: A new animated loading state in the Report Panel while reports are being generated.
+- **Composer & Layout**: Loading placeholders in the composer, a better mention picker experience, and refined settings layout.
+- **Better Editing**: Trailing empty lines in the rich-text editor make long-form writing smoother.
+
+<Accordion type="multiple" className="w-full not-prose">
+  <AccordionItem value="item-1">
+    <AccordionTrigger>Bug Fixes</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      <ul className="list-disc space-y-2 pl-4">
+        <li>Fixed knowledge base search being skipped in certain chat sequences</li>
+        <li>Fixed a button nesting issue in the mobile upload area</li>
+        <li>Clearer logs and behavior when a vision model isn't available vs. not configured</li>
+        <li>Added a safety check for very large images before vision processing</li>
+        <li>Better fallback when an image can't be processed by the vision model</li>
+        <li>Fixed duplicate filename collisions during folder upload</li>
+        <li>Cleaner filenames on folder uploads</li>
+        <li>More accurate "documents still processing" warning during folder export</li>
+        <li>Handled display names that are just whitespace without crashing</li>
+        <li>Added user feedback when snapshot deletion fails</li>
+        <li>Updated billing text to reflect 500 pages included in demo plans</li>
+      </ul>
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="item-2">
+    <AccordionTrigger>Under the Hood</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      <ul className="list-disc space-y-2 pl-4">
+        <li>Old memory storage fully migrated and retired</li>
+        <li>Smaller bundles on public marketing pages</li>
+        <li>Consistent approval handling across all connectors</li>
+        <li>Cleaner file type detection using explicit, well-defined rules</li>
+        <li>Rich-text editor now powers memory input, with inline alerts and a better layout</li>
+        <li>Localized folder-upload messages and download button text</li>
+      </ul>
+    </AccordionContent>
+  </AccordionItem>
+</Accordion>
+
+v0.0.16 makes SurfSense the easiest open source AI assistant to actually try: zero friction, no login, online. Whether you're searching for a **ChatGPT alternative free online**, a **free Claude AI alternative**, or a **NotebookLM alternative** you can self-host, SurfSense v0.0.16 is built for you.
+
+SurfSense connects all your knowledge sources in one place.
diff --git a/surfsense_web/components/Logo.tsx b/surfsense_web/components/Logo.tsx
index 0c1b2cce26..d43e3e1ee3 100644
--- a/surfsense_web/components/Logo.tsx
+++ b/surfsense_web/components/Logo.tsx
@@ -15,7 +15,7 @@ export const Logo = ({
 		<Image
 			src="/icon-128.svg"
 			className={cn("select-none dark:invert", className)}
-			alt="logo"
+			alt="SurfSense"
 			width={128}
 			height={128}
 			priority={priority}
diff --git a/surfsense_web/components/agent-action-log/action-log-button.tsx b/surfsense_web/components/agent-action-log/action-log-button.tsx
new file mode 100644
index 0000000000..6e4c72fc59
--- /dev/null
+++ b/surfsense_web/components/agent-action-log/action-log-button.tsx
@@ -0,0 +1,50 @@
+"use client";
+
+import { useAtomValue, useSetAtom } from "jotai";
+import { Workflow } from "lucide-react";
+import { useCallback } from "react";
+import { openActionLogDialogAtom } from "@/atoms/agent/action-log-dialog.atom";
+import { agentFlagsAtom } from "@/atoms/agent/agent-flags-query.atom";
+import { Button } from "@/components/ui/button";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+
+interface ActionLogButtonProps {
+	threadId: number | null;
+}
+
+/**
+ * Header button that opens the agent action log dialog for the current
+ * thread. Renders nothing when:
+ *   - the action log feature flag is off (graceful no-op for older
+ *     deployments), OR
+ *   - there is no active thread (lazy-created chats haven't started).
+ */
+export function ActionLogButton({ threadId }: ActionLogButtonProps) {
+	const { data: flags } = useAtomValue(agentFlagsAtom);
+	const open = useSetAtom(openActionLogDialogAtom);
+
+	const enabled = !!flags?.enable_action_log && !flags?.disable_new_agent_stack;
+
+	const handleClick = useCallback(() => {
+		if (threadId !== null) open(threadId);
+	}, [open, threadId]);
+
+	if (!enabled || threadId === null) return null;
+
+	return (
+		<Tooltip>
+			<TooltipTrigger asChild>
+				<Button
+					size="sm"
+					variant="ghost"
+					className="size-8 p-0"
+					aria-label="Open agent action log"
+					onClick={handleClick}
+				>
+					<Workflow className="size-4 text-muted-foreground" />
+				</Button>
+			</TooltipTrigger>
+			<TooltipContent>Agent actions</TooltipContent>
+		</Tooltip>
+	);
+}
diff --git a/surfsense_web/components/agent-action-log/action-log-dialog.tsx b/surfsense_web/components/agent-action-log/action-log-dialog.tsx
new file mode 100644
index 0000000000..1d0eefc17b
--- /dev/null
+++ b/surfsense_web/components/agent-action-log/action-log-dialog.tsx
@@ -0,0 +1,158 @@
+"use client";
+
+import { useQueryClient } from "@tanstack/react-query";
+import { useAtom, useAtomValue } from "jotai";
+import { RefreshCcw, Workflow } from "lucide-react";
+import { useCallback } from "react";
+import { actionLogDialogAtom } from "@/atoms/agent/action-log-dialog.atom";
+import { agentFlagsAtom } from "@/atoms/agent/agent-flags-query.atom";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Dialog, DialogContent, DialogDescription, DialogTitle } from "@/components/ui/dialog";
+import { Separator } from "@/components/ui/separator";
+import { Skeleton } from "@/components/ui/skeleton";
+import { agentActionsQueryKey, useAgentActionsQuery } from "@/hooks/use-agent-actions-query";
+import { ActionLogItem } from "./action-log-item";
+
+function EmptyState() {
+	return (
+		<div className="flex flex-1 flex-col items-center justify-center gap-4 px-6 pb-12 text-center">
+			<div className="flex max-w-[260px] flex-col gap-1.5">
+				<p className="text-sm font-semibold tracking-tight">No actions logged yet</p>
+				<p className="text-xs leading-relaxed text-muted-foreground">
+					A complete audit trail of every tool the agent uses in this thread will appear here
+				</p>
+			</div>
+		</div>
+	);
+}
+
+function DisabledState() {
+	return (
+		<div className="flex flex-1 flex-col items-center justify-center gap-4 px-6 pb-12 text-center">
+			<div className="flex size-12 items-center justify-center rounded-full border border-popover-border bg-muted/40">
+				<Workflow className="size-5 text-muted-foreground" strokeWidth={1.75} />
+			</div>
+			<div className="flex max-w-[280px] flex-col gap-1.5">
+				<p className="text-sm font-semibold tracking-tight">Action log is disabled</p>
+				<p className="text-xs leading-relaxed text-muted-foreground">
+					This deployment hasn't enabled the agent action log. An admin can enable{" "}
+					<code className="rounded bg-muted px-1 py-0.5 font-mono text-[10px] text-foreground">
+						SURFSENSE_ENABLE_ACTION_LOG
+					</code>
+					.
+				</p>
+			</div>
+		</div>
+	);
+}
+
+const SKELETON_KEYS = ["s1", "s2", "s3", "s4"] as const;
+
+function LoadingState() {
+	return (
+		<div className="flex flex-col gap-2 p-4">
+			{SKELETON_KEYS.map((key) => (
+				<Skeleton key={key} className="h-16 w-full rounded-lg" />
+			))}
+		</div>
+	);
+}
+
+export function ActionLogDialog() {
+	const [state, setState] = useAtom(actionLogDialogAtom);
+	const queryClient = useQueryClient();
+
+	const { data: flags } = useAtomValue(agentFlagsAtom);
+	const actionLogEnabled = !!flags?.enable_action_log && !flags?.disable_new_agent_stack;
+
+	const threadId = state.threadId;
+
+	const { data, items, isLoading, isFetching, isError, error, refetch } = useAgentActionsQuery(
+		threadId,
+		{ enabled: state.open && actionLogEnabled }
+	);
+
+	const handleOpenChange = useCallback(
+		(open: boolean) => {
+			setState((current) => (open ? { ...current, open } : { open: false, threadId: null }));
+		},
+		[setState]
+	);
+
+	const handleRevertSuccess = useCallback(() => {
+		if (threadId !== null) {
+			queryClient.invalidateQueries({ queryKey: agentActionsQueryKey(threadId) });
+		}
+	}, [queryClient, threadId]);
+
+	return (
+		<Dialog open={state.open} onOpenChange={handleOpenChange}>
+			<DialogContent className="select-none flex h-[90vh] max-h-[640px] w-[95vw] max-w-[900px] flex-col gap-0 overflow-hidden p-0 [--card:var(--popover)] md:h-[80vh]">
+				<div className="shrink-0 px-6 pb-3 pt-6 pr-28">
+					<div className="flex items-center gap-2">
+						<DialogTitle className="text-lg font-semibold">Agent actions</DialogTitle>
+						{data?.total !== undefined && data.total > 0 ? (
+							<Badge variant="secondary" className="text-[10px]">
+								{data.total}
+							</Badge>
+						) : null}
+					</div>
+					<DialogDescription className="sr-only">
+						Audit trail of every tool call the agent made in this thread.
+					</DialogDescription>
+					<Separator className="mt-4" />
+				</div>
+
+				<Button
+					size="sm"
+					variant="ghost"
+					onClick={() => refetch()}
+					disabled={isFetching || !actionLogEnabled}
+					className="absolute right-14 top-4 size-8 rounded-full p-0 text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+					aria-label="Refresh action log"
+				>
+					<RefreshCcw className={isFetching ? "size-3.5 animate-spin" : "size-3.5"} />
+				</Button>
+
+				<div className="flex min-h-0 flex-1 flex-col overflow-y-auto scrollbar-thin">
+					{!actionLogEnabled ? (
+						<DisabledState />
+					) : threadId === null ? (
+						<EmptyState />
+					) : isLoading ? (
+						<LoadingState />
+					) : isError ? (
+						<div className="flex flex-1 flex-col items-center justify-center gap-2 px-6 text-center">
+							<p className="text-sm font-medium text-destructive">Failed to load actions</p>
+							<p className="text-xs text-muted-foreground">
+								{error instanceof Error ? error.message : "Unknown error"}
+							</p>
+							<Button size="sm" variant="outline" onClick={() => refetch()}>
+								Try again
+							</Button>
+						</div>
+					) : items.length === 0 ? (
+						<EmptyState />
+					) : (
+						<div className="flex flex-col gap-2 px-4 pb-4">
+							{items.map((action) => (
+								<ActionLogItem
+									key={action.id}
+									action={action}
+									threadId={threadId}
+									onRevertSuccess={handleRevertSuccess}
+								/>
+							))}
+							{data?.has_more ? (
+								<p className="py-2 text-center text-[11px] text-muted-foreground">
+									Showing {items.length} of {data.total}. Older actions are paginated.
+								</p>
+							) : null}
+						</div>
+					)}
+				</div>
+			</DialogContent>
+		</Dialog>
+	);
+}
diff --git a/surfsense_web/components/agent-action-log/action-log-item.tsx b/surfsense_web/components/agent-action-log/action-log-item.tsx
new file mode 100644
index 0000000000..bdd3711b6e
--- /dev/null
+++ b/surfsense_web/components/agent-action-log/action-log-item.tsx
@@ -0,0 +1,282 @@
+"use client";
+
+import { Check, ChevronRight, Copy, RotateCcw, Undo2 } from "lucide-react";
+import { useState } from "react";
+import { toast } from "sonner";
+import {
+	AlertDialog,
+	AlertDialogAction,
+	AlertDialogCancel,
+	AlertDialogContent,
+	AlertDialogDescription,
+	AlertDialogFooter,
+	AlertDialogHeader,
+	AlertDialogTitle,
+	AlertDialogTrigger,
+} from "@/components/ui/alert-dialog";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { getToolDisplayName, getToolIcon } from "@/contracts/enums/toolIcons";
+import { type AgentAction, agentActionsApiService } from "@/lib/apis/agent-actions-api.service";
+import { AppError } from "@/lib/error";
+import { formatRelativeDate } from "@/lib/format-date";
+import { cn } from "@/lib/utils";
+
+interface ActionLogItemProps {
+	action: AgentAction;
+	threadId: number;
+	onRevertSuccess: () => void;
+}
+
+function formatPrimitiveValue(value: unknown) {
+	if (value === null) return "null";
+	if (value === undefined) return "undefined";
+	if (typeof value === "string") return value;
+	if (typeof value === "number" || typeof value === "boolean") return String(value);
+	return JSON.stringify(value, null, 2);
+}
+
+function ArgumentValue({ value }: { value: unknown }) {
+	const formatted = formatPrimitiveValue(value);
+	const isBlockValue =
+		typeof value === "object" ||
+		(typeof value === "string" && (value.includes("\n") || value.length > 120));
+
+	if (isBlockValue) {
+		return (
+			<pre className="mt-2 whitespace-pre-wrap break-words bg-popover px-4 py-3 text-[11px] leading-relaxed text-popover-foreground/80">
+				{formatted}
+			</pre>
+		);
+	}
+
+	return (
+		<p className="mt-1 break-words font-mono text-[11px] leading-relaxed text-popover-foreground/80">
+			{formatted}
+		</p>
+	);
+}
+
+function StructuredArguments({ args }: { args: Record<string, unknown> }) {
+	return (
+		<div className="divide-y divide-popover-border border-t border-popover-border">
+			{Object.entries(args).map(([key, value]) => (
+				<div key={key} className="bg-popover">
+					<div className="px-4 py-3">
+						<p className="font-mono text-[10px] font-medium text-muted-foreground">{key}</p>
+						<ArgumentValue value={value} />
+					</div>
+				</div>
+			))}
+		</div>
+	);
+}
+
+export function ActionLogItem({ action, threadId, onRevertSuccess }: ActionLogItemProps) {
+	const [isExpanded, setIsExpanded] = useState(false);
+	const [isReverting, setIsReverting] = useState(false);
+	const [confirmOpen, setConfirmOpen] = useState(false);
+	const [copiedSection, setCopiedSection] = useState<"arguments" | null>(null);
+
+	const isAlreadyReverted = action.reverted_by_action_id !== null;
+	const isRevertAction = action.is_revert_action;
+	const hasError = action.error !== null && action.error !== undefined;
+
+	const Icon = getToolIcon(action.tool_name);
+	const displayName = getToolDisplayName(action.tool_name);
+
+	const argsPreview = action.args ? JSON.stringify(action.args, null, 2) : null;
+
+	const canRevert = action.reversible && !isAlreadyReverted && !isRevertAction && !hasError;
+
+	const handleCopyArguments = async () => {
+		if (!argsPreview) return;
+
+		try {
+			await navigator.clipboard.writeText(argsPreview);
+			setCopiedSection("arguments");
+			toast.success("Arguments copied");
+			window.setTimeout(() => setCopiedSection(null), 1200);
+		} catch {
+			toast.error("Failed to copy arguments.");
+		}
+	};
+
+	const handleRevert = async () => {
+		setIsReverting(true);
+		try {
+			const response = await agentActionsApiService.revert(threadId, action.id);
+			toast.success(response.message || "Action reverted successfully.");
+			onRevertSuccess();
+		} catch (err) {
+			const message =
+				err instanceof AppError
+					? err.message
+					: err instanceof Error
+						? err.message
+						: "Failed to revert action.";
+			toast.error(message);
+		} finally {
+			setIsReverting(false);
+			setConfirmOpen(false);
+		}
+	};
+
+	return (
+		<div
+			className={cn(
+				"overflow-hidden rounded-lg border border-popover-border bg-popover text-popover-foreground transition-colors",
+				isAlreadyReverted && "opacity-70"
+			)}
+		>
+			<Button
+				type="button"
+				variant="ghost"
+				onClick={() => setIsExpanded((v) => !v)}
+				className="h-auto w-full items-start justify-start gap-3 rounded-none p-3 text-left hover:bg-accent hover:text-accent-foreground"
+				aria-expanded={isExpanded}
+			>
+				<div className="flex size-8 shrink-0 items-center justify-center rounded-md bg-accent">
+					{isRevertAction ? (
+						<Undo2 className="size-4 text-muted-foreground" />
+					) : (
+						<Icon className="size-4 text-muted-foreground" />
+					)}
+				</div>
+				<div className="flex min-w-0 flex-1 flex-col gap-1">
+					<div className="flex flex-wrap items-center gap-1.5">
+						<span className="truncate text-sm font-medium">{displayName}</span>
+						{isRevertAction && (
+							<Badge variant="secondary" className="text-[10px]">
+								Revert
+							</Badge>
+						)}
+						{hasError && (
+							<Badge variant="destructive" className="text-[10px]">
+								Error
+							</Badge>
+						)}
+						{!isRevertAction && action.reversible && !isAlreadyReverted && (
+							<Badge
+								variant="secondary"
+								className="border-0 bg-neutral-200 px-1.5 py-0.5 text-[10px] text-neutral-700 dark:bg-neutral-700 dark:text-neutral-200"
+							>
+								Reversible
+							</Badge>
+						)}
+						{isAlreadyReverted && (
+							<Badge variant="secondary" className="text-[10px]">
+								Reverted
+							</Badge>
+						)}
+					</div>
+					<p className="text-xs text-muted-foreground">{formatRelativeDate(action.created_at)}</p>
+				</div>
+				<ChevronRight
+					className={cn(
+						"size-4 shrink-0 self-center text-muted-foreground transition-transform",
+						isExpanded && "rotate-90"
+					)}
+				/>
+			</Button>
+
+			{isExpanded && (
+				<div className="flex flex-col border-t border-popover-border bg-accent/80">
+					{action.args && argsPreview && (
+						<div className="border-b border-popover-border">
+							<div className="flex items-center justify-between px-4 py-2">
+								<p className="text-[10px] font-medium uppercase tracking-wide text-muted-foreground">
+									Arguments
+								</p>
+								<Button
+									type="button"
+									size="sm"
+									variant="ghost"
+									onClick={handleCopyArguments}
+									className="size-6 rounded-lg p-0 text-muted-foreground hover:bg-popover hover:text-popover-foreground"
+									aria-label={copiedSection === "arguments" ? "Arguments copied" : "Copy arguments"}
+								>
+									{copiedSection === "arguments" ? (
+										<Check className="size-3" />
+									) : (
+										<Copy className="size-3" />
+									)}
+								</Button>
+							</div>
+							<StructuredArguments args={action.args} />
+						</div>
+					)}
+					{action.error && (
+						<div className="border-b border-popover-border">
+							<p className="px-4 py-2 text-[10px] font-medium uppercase tracking-wide text-muted-foreground">
+								Error
+							</p>
+							<pre className="max-h-32 overflow-auto border-t border-popover-border bg-destructive/10 px-4 py-3 text-[11px] text-destructive">
+								{JSON.stringify(action.error, null, 2)}
+							</pre>
+						</div>
+					)}
+					{action.reverse_descriptor && (
+						<div className="border-b border-popover-border">
+							<p className="px-4 py-2 text-[10px] font-medium uppercase tracking-wide text-muted-foreground">
+								Reverse plan
+							</p>
+							<pre className="max-h-32 overflow-auto border-t border-popover-border bg-popover px-4 py-3 text-[11px] text-popover-foreground/80">
+								{JSON.stringify(action.reverse_descriptor, null, 2)}
+							</pre>
+						</div>
+					)}
+
+					<div className="flex items-center justify-between px-4 py-3">
+						<p className="text-[10px] text-muted-foreground">
+							Action ID: <span className="font-mono">{action.id}</span>
+						</p>
+						{canRevert ? (
+							<AlertDialog open={confirmOpen} onOpenChange={setConfirmOpen}>
+								<AlertDialogTrigger asChild>
+									<Button size="sm" variant="secondary" className="gap-1.5">
+										<RotateCcw className="size-3.5" />
+										Revert
+									</Button>
+								</AlertDialogTrigger>
+								<AlertDialogContent>
+									<AlertDialogHeader>
+										<AlertDialogTitle>Revert this action?</AlertDialogTitle>
+										<AlertDialogDescription>
+											This will undo <span className="font-medium">{displayName}</span> and append a
+											new audit entry. The agent's chat history is preserved — only the tool's
+											effects on your knowledge base or connectors will be reversed where possible.
+										</AlertDialogDescription>
+									</AlertDialogHeader>
+									<AlertDialogFooter>
+										<AlertDialogCancel disabled={isReverting}>Cancel</AlertDialogCancel>
+										<AlertDialogAction
+											onClick={(e) => {
+												e.preventDefault();
+												handleRevert();
+											}}
+											disabled={isReverting}
+											className="bg-secondary text-secondary-foreground hover:bg-secondary/80 focus-visible:ring-0"
+										>
+											{isReverting ? "Reverting…" : "Revert"}
+										</AlertDialogAction>
+									</AlertDialogFooter>
+								</AlertDialogContent>
+							</AlertDialog>
+						) : (
+							<div className="flex items-center gap-1.5 text-[11px] text-muted-foreground">
+								{isAlreadyReverted
+									? "Already reverted"
+									: isRevertAction
+										? "Revert entry"
+										: hasError
+											? "Cannot revert errored action"
+											: "Not reversible"}
+							</div>
+						)}
+					</div>
+				</div>
+			)}
+		</div>
+	);
+}
diff --git a/surfsense_web/components/announcements/AnnouncementCard.tsx b/surfsense_web/components/announcements/AnnouncementCard.tsx
index 792887e60c..ea0288b43c 100644
--- a/surfsense_web/components/announcements/AnnouncementCard.tsx
+++ b/surfsense_web/components/announcements/AnnouncementCard.tsx
@@ -4,14 +4,7 @@ import { Bell, ExternalLink, Info, type LucideIcon, Rocket, Wrench, Zap } from "
 import Link from "next/link";
 import { Badge } from "@/components/ui/badge";
 import { Button } from "@/components/ui/button";
-import {
-	Card,
-	CardContent,
-	CardDescription,
-	CardFooter,
-	CardHeader,
-	CardTitle,
-} from "@/components/ui/card";
+import { Card, CardContent, CardDescription, CardFooter, CardHeader } from "@/components/ui/card";
 import type { AnnouncementCategory } from "@/contracts/types/announcement.types";
 import type { AnnouncementWithState } from "@/hooks/use-announcements";
 import { formatRelativeDate } from "@/lib/format-date";
@@ -67,7 +60,9 @@ export function AnnouncementCard({ announcement }: { announcement: AnnouncementW
 						</div>
 						<div className="min-w-0 flex-1">
 							<div className="flex items-center gap-2 flex-wrap">
-								<CardTitle className="text-base leading-tight">{announcement.title}</CardTitle>
+								<h2 className="text-base font-semibold leading-tight tracking-tight">
+									{announcement.title}
+								</h2>
 								<Badge variant={config.badgeVariant} className="text-[10px] px-1.5 py-0">
 									{config.label}
 								</Badge>
diff --git a/surfsense_web/components/announcements/AnnouncementToastProvider.tsx b/surfsense_web/components/announcements/AnnouncementToastProvider.tsx
index 6cb1b17e5b..3e99f5c320 100644
--- a/surfsense_web/components/announcements/AnnouncementToastProvider.tsx
+++ b/surfsense_web/components/announcements/AnnouncementToastProvider.tsx
@@ -65,7 +65,9 @@ export function AnnouncementToastProvider() {
 		if (hasChecked.current) return;
 		hasChecked.current = true;
 
-		const timer = setTimeout(() => {
+		const staggerTimers: ReturnType<typeof setTimeout>[] = [];
+
+		const outerTimer = setTimeout(() => {
 			const authed = isAuthenticated();
 			const active = getActiveAnnouncements(announcements, authed);
 			const importantUntoasted = active.filter(
@@ -74,11 +76,14 @@ export function AnnouncementToastProvider() {
 
 			for (let i = 0; i < importantUntoasted.length; i++) {
 				const announcement = importantUntoasted[i];
-				setTimeout(() => showAnnouncementToast(announcement), i * 800);
+				staggerTimers.push(setTimeout(() => showAnnouncementToast(announcement), i * 800));
 			}
 		}, 1500);
 
-		return () => clearTimeout(timer);
+		return () => {
+			clearTimeout(outerTimer);
+			for (const id of staggerTimers) clearTimeout(id);
+		};
 	}, []);
 
 	return null;
diff --git a/surfsense_web/components/announcements/AnnouncementsDialog.tsx b/surfsense_web/components/announcements/AnnouncementsDialog.tsx
new file mode 100644
index 0000000000..097103e3d2
--- /dev/null
+++ b/surfsense_web/components/announcements/AnnouncementsDialog.tsx
@@ -0,0 +1,50 @@
+"use client";
+
+import { useAtom } from "jotai";
+import { useEffect } from "react";
+import { announcementsDialogAtom } from "@/atoms/layout/dialogs.atom";
+import { AnnouncementCard } from "@/components/announcements/AnnouncementCard";
+import { AnnouncementsEmptyState } from "@/components/announcements/AnnouncementsEmptyState";
+import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog";
+import { Separator } from "@/components/ui/separator";
+import { useAnnouncements } from "@/hooks/use-announcements";
+
+export function AnnouncementsDialog() {
+	const [open, setOpen] = useAtom(announcementsDialogAtom);
+	const { announcements, markAllRead } = useAnnouncements();
+
+	// Auto-mark all visible announcements as read when the dialog opens
+	useEffect(() => {
+		if (open) {
+			markAllRead();
+		}
+	}, [open, markAllRead]);
+
+	return (
+		<Dialog open={open} onOpenChange={setOpen}>
+			<DialogContent className="select-none max-w-[900px] w-[95vw] md:w-[90vw] h-[90vh] md:h-[80vh] max-h-[640px] flex flex-col p-0 gap-0 overflow-hidden bg-popover text-popover-foreground">
+				<DialogTitle className="sr-only">What's New</DialogTitle>
+
+				<div className="flex flex-1 flex-col overflow-hidden min-w-0">
+					<div className="px-6 md:px-8 pt-6 pb-2 shrink-0">
+						<h2 className="text-lg font-semibold">What's New</h2>
+						<Separator className="mt-4" />
+					</div>
+					<div className="flex-1 overflow-y-auto overflow-x-hidden">
+						<div className="px-4 md:px-8 pt-4 pb-6 min-w-0">
+							{announcements.length === 0 ? (
+								<AnnouncementsEmptyState />
+							) : (
+								<div className="flex flex-col gap-4">
+									{announcements.map((announcement) => (
+										<AnnouncementCard key={announcement.id} announcement={announcement} />
+									))}
+								</div>
+							)}
+						</div>
+					</div>
+				</div>
+			</DialogContent>
+		</Dialog>
+	);
+}
diff --git a/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx b/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx
index 9ed1ea45dd..7aa4564b6c 100644
--- a/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx
+++ b/surfsense_web/components/announcements/AnnouncementsEmptyState.tsx
@@ -2,13 +2,13 @@ import { BellOff } from "lucide-react";
 
 export function AnnouncementsEmptyState() {
 	return (
-		<div className="flex flex-col items-center justify-center py-16 text-center">
-			<div className="flex h-16 w-16 items-center justify-center rounded-full bg-muted mb-4">
-				<BellOff className="h-7 w-7 text-muted-foreground" />
+		<div className="flex flex-col items-center justify-center py-12 text-center">
+			<div className="mb-3 flex h-12 w-12 items-center justify-center rounded-full bg-muted">
+				<BellOff className="h-5 w-5 text-muted-foreground" />
 			</div>
-			<h3 className="text-lg font-semibold">No announcements</h3>
-			<p className="mt-1 text-sm text-muted-foreground max-w-sm">
-				You're all caught up! New announcements will appear here.
+			<h3 className="text-sm font-semibold">Nothing new yet</h3>
+			<p className="mt-1 max-w-xs text-xs text-muted-foreground">
+				You're all caught up! New updates will appear here.
 			</p>
 		</div>
 	);
diff --git a/surfsense_web/components/assistant-ui/assistant-message.tsx b/surfsense_web/components/assistant-ui/assistant-message.tsx
index 097197107e..ac17324414 100644
--- a/surfsense_web/components/assistant-ui/assistant-message.tsx
+++ b/surfsense_web/components/assistant-ui/assistant-message.tsx
@@ -1,8 +1,10 @@
 import {
+	ActionBarMorePrimitive,
 	ActionBarPrimitive,
 	AuiIf,
 	ErrorPrimitive,
 	MessagePrimitive,
+	type ToolCallMessagePartComponent,
 	useAui,
 	useAuiState,
 } from "@assistant-ui/react";
@@ -14,24 +16,36 @@ import {
 	DownloadIcon,
 	ExternalLink,
 	Globe,
-	MessageSquare,
+	MessageCircleReply,
+	MoreHorizontalIcon,
 	RefreshCwIcon,
 } from "lucide-react";
 import dynamic from "next/dynamic";
 import type { FC } from "react";
 import { useEffect, useMemo, useRef, useState } from "react";
 import { commentsEnabledAtom, targetCommentIdAtom } from "@/atoms/chat/current-thread.atom";
+import {
+	globalNewLLMConfigsAtom,
+	newLLMConfigsAtom,
+} from "@/atoms/new-llm-config/new-llm-config-query.atoms";
 import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
 import {
 	CitationMetadataProvider,
 	useAllCitationMetadata,
 } from "@/components/assistant-ui/citation-metadata-context";
 import { MarkdownText } from "@/components/assistant-ui/markdown-text";
-import { ToolFallback } from "@/components/assistant-ui/tool-fallback";
+import { ReasoningMessagePart } from "@/components/assistant-ui/reasoning-message-part";
+import { RevertTurnButton } from "@/components/assistant-ui/revert-turn-button";
+import { useTokenUsage } from "@/components/assistant-ui/token-usage-context";
 import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
 import { CommentPanelContainer } from "@/components/chat-comments/comment-panel-container/comment-panel-container";
 import { CommentSheet } from "@/components/chat-comments/comment-sheet/comment-sheet";
 import type { SerializableCitation } from "@/components/tool-ui/citation";
+import {
+	openSafeNavigationHref,
+	resolveSafeNavigationHref,
+} from "@/components/tool-ui/shared/media";
+import { Button } from "@/components/ui/button";
 import {
 	Drawer,
 	DrawerContent,
@@ -39,9 +53,12 @@ import {
 	DrawerHeader,
 	DrawerTitle,
 } from "@/components/ui/drawer";
+import { DropdownMenuLabel } from "@/components/ui/dropdown-menu";
 import { useComments } from "@/hooks/use-comments";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { useElectronAPI } from "@/hooks/use-platform";
+import { getProviderIcon } from "@/lib/provider-icons";
+import { tryGetHostname } from "@/lib/url";
 import { cn } from "@/lib/utils";
 
 // Captured once at module load — survives client-side navigations that strip the query param.
@@ -57,6 +74,13 @@ const GenerateReportToolUI = dynamic(
 		})),
 	{ ssr: false }
 );
+const GenerateResumeToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/generate-resume").then((m) => ({
+			default: m.GenerateResumeToolUI,
+		})),
+	{ ssr: false }
+);
 const GeneratePodcastToolUI = dynamic(
 	() =>
 		import("@/components/tool-ui/generate-podcast").then((m) => ({
@@ -76,164 +100,12 @@ const GenerateImageToolUI = dynamic(
 		import("@/components/tool-ui/generate-image").then((m) => ({ default: m.GenerateImageToolUI })),
 	{ ssr: false }
 );
-const SaveMemoryToolUI = dynamic(
-	() => import("@/components/tool-ui/user-memory").then((m) => ({ default: m.SaveMemoryToolUI })),
-	{ ssr: false }
-);
-const RecallMemoryToolUI = dynamic(
-	() => import("@/components/tool-ui/user-memory").then((m) => ({ default: m.RecallMemoryToolUI })),
-	{ ssr: false }
-);
-const SandboxExecuteToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/sandbox-execute").then((m) => ({
-			default: m.SandboxExecuteToolUI,
-		})),
-	{ ssr: false }
-);
-const CreateNotionPageToolUI = dynamic(
-	() => import("@/components/tool-ui/notion").then((m) => ({ default: m.CreateNotionPageToolUI })),
-	{ ssr: false }
-);
-const UpdateNotionPageToolUI = dynamic(
-	() => import("@/components/tool-ui/notion").then((m) => ({ default: m.UpdateNotionPageToolUI })),
-	{ ssr: false }
-);
-const DeleteNotionPageToolUI = dynamic(
-	() => import("@/components/tool-ui/notion").then((m) => ({ default: m.DeleteNotionPageToolUI })),
-	{ ssr: false }
-);
-const CreateLinearIssueToolUI = dynamic(
-	() => import("@/components/tool-ui/linear").then((m) => ({ default: m.CreateLinearIssueToolUI })),
-	{ ssr: false }
-);
-const UpdateLinearIssueToolUI = dynamic(
-	() => import("@/components/tool-ui/linear").then((m) => ({ default: m.UpdateLinearIssueToolUI })),
-	{ ssr: false }
-);
-const DeleteLinearIssueToolUI = dynamic(
-	() => import("@/components/tool-ui/linear").then((m) => ({ default: m.DeleteLinearIssueToolUI })),
-	{ ssr: false }
-);
-const CreateGoogleDriveFileToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/google-drive").then((m) => ({
-			default: m.CreateGoogleDriveFileToolUI,
-		})),
-	{ ssr: false }
-);
-const DeleteGoogleDriveFileToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/google-drive").then((m) => ({
-			default: m.DeleteGoogleDriveFileToolUI,
-		})),
-	{ ssr: false }
-);
-const CreateOneDriveFileToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/onedrive").then((m) => ({ default: m.CreateOneDriveFileToolUI })),
-	{ ssr: false }
-);
-const DeleteOneDriveFileToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/onedrive").then((m) => ({ default: m.DeleteOneDriveFileToolUI })),
-	{ ssr: false }
-);
-const CreateDropboxFileToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/dropbox").then((m) => ({ default: m.CreateDropboxFileToolUI })),
-	{ ssr: false }
-);
-const DeleteDropboxFileToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/dropbox").then((m) => ({ default: m.DeleteDropboxFileToolUI })),
-	{ ssr: false }
-);
-const CreateCalendarEventToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/google-calendar").then((m) => ({
-			default: m.CreateCalendarEventToolUI,
-		})),
-	{ ssr: false }
-);
-const UpdateCalendarEventToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/google-calendar").then((m) => ({
-			default: m.UpdateCalendarEventToolUI,
-		})),
-	{ ssr: false }
-);
-const DeleteCalendarEventToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/google-calendar").then((m) => ({
-			default: m.DeleteCalendarEventToolUI,
-		})),
-	{ ssr: false }
-);
-const CreateGmailDraftToolUI = dynamic(
-	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.CreateGmailDraftToolUI })),
-	{ ssr: false }
-);
-const UpdateGmailDraftToolUI = dynamic(
-	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.UpdateGmailDraftToolUI })),
-	{ ssr: false }
-);
-const SendGmailEmailToolUI = dynamic(
-	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.SendGmailEmailToolUI })),
-	{ ssr: false }
-);
-const TrashGmailEmailToolUI = dynamic(
-	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.TrashGmailEmailToolUI })),
-	{ ssr: false }
-);
-const CreateJiraIssueToolUI = dynamic(
-	() => import("@/components/tool-ui/jira").then((m) => ({ default: m.CreateJiraIssueToolUI })),
-	{ ssr: false }
-);
-const UpdateJiraIssueToolUI = dynamic(
-	() => import("@/components/tool-ui/jira").then((m) => ({ default: m.UpdateJiraIssueToolUI })),
-	{ ssr: false }
-);
-const DeleteJiraIssueToolUI = dynamic(
-	() => import("@/components/tool-ui/jira").then((m) => ({ default: m.DeleteJiraIssueToolUI })),
-	{ ssr: false }
-);
-const CreateConfluencePageToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/confluence").then((m) => ({
-			default: m.CreateConfluencePageToolUI,
-		})),
-	{ ssr: false }
-);
-const UpdateConfluencePageToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/confluence").then((m) => ({
-			default: m.UpdateConfluencePageToolUI,
-		})),
-	{ ssr: false }
-);
-const DeleteConfluencePageToolUI = dynamic(
-	() =>
-		import("@/components/tool-ui/confluence").then((m) => ({
-			default: m.DeleteConfluencePageToolUI,
-		})),
-	{ ssr: false }
-);
-
-function extractDomain(url: string): string | undefined {
-	try {
-		return new URL(url).hostname.replace(/^www\./, "");
-	} catch {
-		return undefined;
-	}
-}
-
 function useCitationsFromMetadata(): SerializableCitation[] {
 	const allCitations = useAllCitationMetadata();
 	return useMemo(() => {
 		const result: SerializableCitation[] = [];
 		for (const [url, meta] of allCitations) {
-			const domain = extractDomain(url);
+			const domain = tryGetHostname(url);
 			result.push({
 				id: `url-cite-${url}`,
 				href: url,
@@ -265,14 +137,15 @@ const MobileCitationDrawer: FC = () => {
 
 	return (
 		<>
-			<button
+			<Button
 				type="button"
+				variant="ghost"
 				onClick={() => setOpen(true)}
 				className={cn(
-					"isolate inline-flex cursor-pointer items-center gap-2 rounded-lg px-3 py-2",
+					"isolate h-auto cursor-pointer gap-2 rounded-lg px-3 py-2",
 					"bg-muted/40 outline-none",
 					"transition-colors duration-150",
-					"hover:bg-muted/70",
+					"hover:bg-accent hover:text-accent-foreground",
 					"focus-visible:ring-ring focus-visible:ring-2"
 				)}
 			>
@@ -315,7 +188,7 @@ const MobileCitationDrawer: FC = () => {
 				<span className="text-muted-foreground text-sm tabular-nums">
 					{citations.length} source{citations.length !== 1 && "s"}
 				</span>
-			</button>
+			</Button>
 
 			<Drawer open={open} onOpenChange={setOpen}>
 				<DrawerContent className="max-h-[85vh] flex flex-col">
@@ -325,11 +198,12 @@ const MobileCitationDrawer: FC = () => {
 					</DrawerHeader>
 					<div className="overflow-y-auto flex-1 min-h-0 px-1 pb-6">
 						{citations.map((citation) => (
-							<button
+							<Button
 								key={citation.id}
 								type="button"
+								variant="ghost"
 								onClick={() => handleNavigate(citation)}
-								className="group flex w-full items-center gap-2.5 rounded-md px-3 py-2.5 text-left transition-colors hover:bg-muted focus-visible:bg-muted focus-visible:outline-none"
+								className="group h-auto w-full justify-start gap-2.5 px-3 py-2.5 text-left hover:bg-accent hover:text-accent-foreground focus-visible:bg-muted"
 							>
 								{citation.favicon ? (
 									// biome-ignore lint/performance/noImgElement: external favicon from arbitrary domain
@@ -351,7 +225,7 @@ const MobileCitationDrawer: FC = () => {
 									<p className="text-muted-foreground truncate text-xs">{citation.domain}</p>
 								</div>
 								<ExternalLink className="text-muted-foreground size-3.5 shrink-0 opacity-0 transition-opacity group-hover:opacity-100" />
-							</button>
+							</Button>
 						))}
 					</div>
 				</DrawerContent>
@@ -370,6 +244,141 @@ export const MessageError: FC = () => {
 	);
 };
 
+function formatMessageDate(date: Date): string {
+	return date.toLocaleDateString(undefined, {
+		month: "short",
+		day: "numeric",
+		hour: "numeric",
+		minute: "2-digit",
+		hour12: true,
+	});
+}
+
+/**
+ * Format provider USD cost (in micro-USD) for inline display next to a
+ * token count. Falls back to ``"<$0.001"`` for sub-tenth-of-a-cent
+ * costs so a real-but-tiny figure doesn't render as ``$0.000``.
+ */
+function formatTurnCost(micros: number): string {
+	const dollars = micros / 1_000_000;
+	if (dollars >= 1) return `$${dollars.toFixed(2)}`;
+	if (dollars >= 0.01) return `$${dollars.toFixed(3)}`;
+	if (dollars > 0) return "<$0.001";
+	return "$0";
+}
+
+const MessageInfoDropdown: FC<{ chatTurnId: string | null | undefined }> = ({ chatTurnId }) => {
+	const messageId = useAuiState(({ message }) => message?.id);
+	const createdAt = useAuiState(({ message }) => message?.createdAt);
+	const usage = useTokenUsage(messageId);
+
+	const { data: localConfigs } = useAtomValue(newLLMConfigsAtom);
+	const { data: globalConfigs } = useAtomValue(globalNewLLMConfigsAtom);
+
+	const configByModel = useMemo(() => {
+		const map = new Map<string, { name: string; provider: string }>();
+		for (const c of [...(globalConfigs ?? []), ...(localConfigs ?? [])]) {
+			map.set(c.model_name, { name: c.name, provider: c.provider });
+		}
+		return map;
+	}, [localConfigs, globalConfigs]);
+
+	const resolveModel = (modelKey: string) => {
+		const parts = modelKey.split("/");
+		const bare = parts[parts.length - 1] ?? modelKey;
+		const config = configByModel.get(modelKey) ?? configByModel.get(bare);
+		return config
+			? { name: config.name, icon: getProviderIcon(config.provider, { className: "size-3.5" }) }
+			: { name: modelKey, icon: null };
+	};
+
+	const modelBreakdown = usage ? (usage.usage ?? usage.model_breakdown) : undefined;
+	const models = modelBreakdown ? Object.entries(modelBreakdown) : [];
+	const hasUsage = usage && usage.total_tokens > 0;
+
+	return (
+		<ActionBarMorePrimitive.Root>
+			<ActionBarMorePrimitive.Trigger asChild>
+				<Button variant="ghost" size="icon" className="aui-button-icon size-6 p-1">
+					<MoreHorizontalIcon className="size-4" />
+					<span className="sr-only">More</span>
+				</Button>
+			</ActionBarMorePrimitive.Trigger>
+			<ActionBarMorePrimitive.Content
+				align="start"
+				className="bg-popover text-popover-foreground z-50 max-h-(--radix-dropdown-menu-content-available-height) min-w-[180px] origin-(--radix-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md p-1 shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2"
+			>
+				{createdAt && (
+					<DropdownMenuLabel className="text-xs text-muted-foreground font-normal select-none">
+						{formatMessageDate(createdAt)}
+					</DropdownMenuLabel>
+				)}
+				{hasUsage && (
+					<>
+						<ActionBarMorePrimitive.Separator className="bg-popover-border mx-1 my-1 h-px" />
+						{models.length > 0 ? (
+							models.map(([model, counts]) => {
+								const { name, icon } = resolveModel(model);
+								const costMicros = counts.cost_micros;
+								return (
+									<ActionBarMorePrimitive.Item
+										key={model}
+										className="focus:bg-accent focus:text-accent-foreground relative flex cursor-default flex-col items-start gap-0.5 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none"
+										onSelect={(e) => e.preventDefault()}
+									>
+										<span className="flex items-center gap-1.5 text-xs font-medium">
+											{icon}
+											{name}
+										</span>
+										<span className="text-xs text-muted-foreground">
+											{counts.total_tokens.toLocaleString()} tokens
+											{costMicros && costMicros > 0 ? ` · ${formatTurnCost(costMicros)}` : ""}
+										</span>
+									</ActionBarMorePrimitive.Item>
+								);
+							})
+						) : (
+							<ActionBarMorePrimitive.Item
+								className="focus:bg-accent focus:text-accent-foreground relative flex cursor-default flex-col items-start gap-0.5 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none"
+								onSelect={(e) => e.preventDefault()}
+							>
+								<span className="text-xs text-muted-foreground">
+									{usage.total_tokens.toLocaleString()} tokens
+									{usage.cost_micros && usage.cost_micros > 0
+										? ` · ${formatTurnCost(usage.cost_micros)}`
+										: ""}
+								</span>
+							</ActionBarMorePrimitive.Item>
+						)}
+					</>
+				)}
+				<RevertTurnButton chatTurnId={chatTurnId} variant="menu-item" />
+			</ActionBarMorePrimitive.Content>
+		</ActionBarMorePrimitive.Root>
+	);
+};
+
+/**
+ * Tools rendered in the message BODY — value-add deliverables only.
+ *
+ * Process tools (connector CRUD, sandbox execute, memory updates,
+ * etc.) are NOT here; they render in the timeline via the slice's
+ * tool registry (see ``features/chat-messages/timeline``). The body
+ * opts out of every other tool by registering ``NullBodyTool`` as the
+ * fallback — any tool name not in this map renders nothing in the
+ * body and is picked up by the timeline instead.
+ */
+const BODY_TOOLS = {
+	generate_report: GenerateReportToolUI,
+	generate_resume: GenerateResumeToolUI,
+	generate_podcast: GeneratePodcastToolUI,
+	generate_video_presentation: GenerateVideoPresentationToolUI,
+	display_image: GenerateImageToolUI,
+	generate_image: GenerateImageToolUI,
+} as const;
+
+const NullBodyTool: ToolCallMessagePartComponent = () => null;
+
 const AssistantMessageInner: FC = () => {
 	const isMobile = !useMediaQuery("(min-width: 768px)");
 
@@ -379,47 +388,10 @@ const AssistantMessageInner: FC = () => {
 				<MessagePrimitive.Parts
 					components={{
 						Text: MarkdownText,
+						Reasoning: ReasoningMessagePart,
 						tools: {
-							by_name: {
-								generate_report: GenerateReportToolUI,
-								generate_podcast: GeneratePodcastToolUI,
-								generate_video_presentation: GenerateVideoPresentationToolUI,
-								display_image: GenerateImageToolUI,
-								generate_image: GenerateImageToolUI,
-								save_memory: SaveMemoryToolUI,
-								recall_memory: RecallMemoryToolUI,
-								execute: SandboxExecuteToolUI,
-								create_notion_page: CreateNotionPageToolUI,
-								update_notion_page: UpdateNotionPageToolUI,
-								delete_notion_page: DeleteNotionPageToolUI,
-								create_linear_issue: CreateLinearIssueToolUI,
-								update_linear_issue: UpdateLinearIssueToolUI,
-								delete_linear_issue: DeleteLinearIssueToolUI,
-								create_google_drive_file: CreateGoogleDriveFileToolUI,
-								delete_google_drive_file: DeleteGoogleDriveFileToolUI,
-								create_onedrive_file: CreateOneDriveFileToolUI,
-								delete_onedrive_file: DeleteOneDriveFileToolUI,
-								create_dropbox_file: CreateDropboxFileToolUI,
-								delete_dropbox_file: DeleteDropboxFileToolUI,
-								create_calendar_event: CreateCalendarEventToolUI,
-								update_calendar_event: UpdateCalendarEventToolUI,
-								delete_calendar_event: DeleteCalendarEventToolUI,
-								create_gmail_draft: CreateGmailDraftToolUI,
-								update_gmail_draft: UpdateGmailDraftToolUI,
-								send_gmail_email: SendGmailEmailToolUI,
-								trash_gmail_email: TrashGmailEmailToolUI,
-								create_jira_issue: CreateJiraIssueToolUI,
-								update_jira_issue: UpdateJiraIssueToolUI,
-								delete_jira_issue: DeleteJiraIssueToolUI,
-								create_confluence_page: CreateConfluencePageToolUI,
-								update_confluence_page: UpdateConfluencePageToolUI,
-								delete_confluence_page: DeleteConfluencePageToolUI,
-								web_search: () => null,
-								link_preview: () => null,
-								multi_link_preview: () => null,
-								scrape_webpage: () => null,
-							},
-							Fallback: ToolFallback,
+							by_name: BODY_TOOLS,
+							Fallback: NullBodyTool,
 						},
 					}}
 				/>
@@ -432,8 +404,10 @@ const AssistantMessageInner: FC = () => {
 				</div>
 			)}
 
-			<div className="aui-assistant-message-footer mt-1 mb-5 ml-2 flex">
-				<AssistantActionBar />
+			<div className="aui-assistant-message-footer mt-3 mb-5 ml-2 h-6">
+				<div className="h-full opacity-100 transition-opacity">
+					<AssistantActionBar />
+				</div>
 			</div>
 		</CitationMetadataProvider>
 	);
@@ -526,35 +500,42 @@ export const AssistantMessage: FC = () => {
 			className="aui-assistant-message-root group fade-in slide-in-from-bottom-1 relative mx-auto w-full max-w-(--thread-max-width) animate-in py-3 duration-150"
 			data-role="assistant"
 		>
-			{/* Comment trigger — right-aligned, just below user query on all screen sizes */}
-			{showCommentTrigger && (
-				<div className="mr-2 mb-1 flex justify-end">
-					<button
-						ref={isDesktop ? commentTriggerRef : undefined}
-						type="button"
-						onClick={
-							isDesktop ? () => setIsInlineOpen((prev) => !prev) : () => setIsSheetOpen(true)
-						}
-						className={cn(
-							"flex items-center gap-1.5 rounded-full px-3 py-1 text-sm transition-colors",
-							isDesktop && isInlineOpen
-								? "bg-primary/10 text-primary"
-								: hasComments
-									? "text-primary hover:bg-primary/10"
-									: "text-muted-foreground hover:text-foreground hover:bg-muted"
-						)}
-					>
-						<MessageSquare className={cn("size-3.5", hasComments && "fill-current")} />
-						{hasComments ? (
-							<span>
-								{commentCount} {commentCount === 1 ? "comment" : "comments"}
-							</span>
-						) : (
-							<span>Add comment</span>
-						)}
-					</button>
-				</div>
-			)}
+			{/* Fixed trigger slot prevents any vertical reflow when visibility changes */}
+			<div className="mr-2 mb-1 flex h-7 justify-end">
+				<Button
+					ref={isDesktop ? commentTriggerRef : undefined}
+					type="button"
+					variant="ghost"
+					onClick={
+						showCommentTrigger
+							? isDesktop
+								? () => setIsInlineOpen((prev) => !prev)
+								: () => setIsSheetOpen(true)
+							: undefined
+					}
+					aria-hidden={!showCommentTrigger}
+					tabIndex={showCommentTrigger ? 0 : -1}
+					className={cn(
+						"h-auto gap-1.5 rounded-full px-3 py-1 text-sm transition-colors",
+						"opacity-0 pointer-events-none",
+						showCommentTrigger && "opacity-100 pointer-events-auto",
+						isDesktop && isInlineOpen
+							? "bg-primary/10 text-primary"
+							: hasComments
+								? "text-primary hover:bg-primary/10"
+								: "text-muted-foreground hover:text-accent-foreground hover:bg-accent hover:text-accent-foreground"
+					)}
+				>
+					<MessageCircleReply className={cn("size-3.5", hasComments && "fill-current")} />
+					{hasComments ? (
+						<span>
+							{commentCount} {commentCount === 1 ? "comment" : "comments"}
+						</span>
+					) : (
+						<span>Add comment</span>
+					)}
+				</Button>
+			</div>
 
 			{/* Desktop floating comment panel — overlays on top of chat content */}
 			{showCommentTrigger && isDesktop && isInlineOpen && dbMessageId && (
@@ -586,6 +567,13 @@ const AssistantActionBar: FC = () => {
 	const isLast = useAuiState((s) => s.message.isLast);
 	const aui = useAui();
 	const api = useElectronAPI();
+	// Surface the persisted ``chat_turn_id`` so the per-turn revert
+	// affordance can scope to just this message's actions. Streamed
+	// turns get their id once the assistant message is hydrated/finalised.
+	const chatTurnId = useAuiState(({ message }) => {
+		const meta = message?.metadata as { custom?: { chatTurnId?: string | null } } | undefined;
+		return meta?.custom?.chatTurnId ?? null;
+	});
 
 	const isQuickAssist = !!api?.replaceText && IS_QUICK_ASSIST_WINDOW;
 
@@ -597,7 +585,7 @@ const AssistantActionBar: FC = () => {
 			className="aui-assistant-action-bar-root -ml-1 col-start-3 row-start-2 flex gap-1 text-muted-foreground md:data-floating:absolute md:data-floating:rounded-md md:data-floating:p-1 [&>button]:opacity-100 md:[&>button]:opacity-[var(--aui-button-opacity,1)]"
 		>
 			<ActionBarPrimitive.Copy asChild>
-				<TooltipIconButton tooltip="Copy to clipboard">
+				<TooltipIconButton tooltip="Copy">
 					<AuiIf condition={({ message }) => message.isCopied}>
 						<CheckIcon />
 					</AuiIf>
@@ -629,6 +617,7 @@ const AssistantActionBar: FC = () => {
 					<ClipboardPaste />
 				</TooltipIconButton>
 			)}
+			<MessageInfoDropdown chatTurnId={chatTurnId} />
 		</ActionBarPrimitive.Root>
 	);
 };
diff --git a/surfsense_web/components/assistant-ui/chat-viewport.tsx b/surfsense_web/components/assistant-ui/chat-viewport.tsx
new file mode 100644
index 0000000000..cb0b484efd
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/chat-viewport.tsx
@@ -0,0 +1,51 @@
+"use client";
+
+import { ThreadPrimitive } from "@assistant-ui/react";
+import { ArrowDownIcon } from "lucide-react";
+import type { FC, ReactNode } from "react";
+import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
+
+const ChatScrollToBottom: FC = () => (
+	<ThreadPrimitive.ScrollToBottom asChild>
+		<TooltipIconButton
+			tooltip="Scroll to bottom"
+			className="aui-thread-scroll-to-bottom -top-12 absolute z-10 self-center rounded-full border-0 bg-muted p-4 text-foreground hover:bg-accent hover:text-accent-foreground disabled:invisible"
+		>
+			<ArrowDownIcon />
+		</TooltipIconButton>
+	</ThreadPrimitive.ScrollToBottom>
+);
+
+export interface ChatViewportProps {
+	children: ReactNode;
+	footer?: ReactNode;
+}
+
+export const ChatViewport: FC<ChatViewportProps> = ({ children, footer }) => (
+	<ThreadPrimitive.Viewport
+		turnAnchor="top"
+		autoScroll
+		scrollToBottomOnRunStart
+		scrollToBottomOnInitialize
+		scrollToBottomOnThreadSwitch
+		className="aui-thread-viewport relative flex flex-1 min-h-0 flex-col overflow-y-auto px-4 scroll-smooth"
+		style={{ scrollbarGutter: "stable" }}
+	>
+		<div
+			aria-hidden
+			className="aui-chat-viewport-top-fade pointer-events-none sticky top-0 z-10 -mx-4 h-2 shrink-0 bg-gradient-to-b from-main-panel from-20% to-transparent"
+		/>
+		{children}
+		{footer ? (
+			<ThreadPrimitive.ViewportFooter
+				className="aui-chat-composer-footer sticky bottom-0 z-20 -mx-4 mt-auto flex flex-col items-stretch bg-gradient-to-t from-main-panel from-60% to-transparent px-4 pt-6"
+				style={{ paddingBottom: "max(0.5rem, env(safe-area-inset-bottom))" }}
+			>
+				<div className="aui-chat-composer-area relative mx-auto flex w-full max-w-(--thread-max-width) flex-col gap-3 overflow-visible">
+					<ChatScrollToBottom />
+					{footer}
+				</div>
+			</ThreadPrimitive.ViewportFooter>
+		) : null}
+	</ThreadPrimitive.Viewport>
+);
diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx
index c41e986d44..6699a55675 100644
--- a/surfsense_web/components/assistant-ui/connector-popup.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup.tsx
@@ -1,7 +1,8 @@
 "use client";
 
-import { useAtomValue, useSetAtom } from "jotai";
+import { useAtomValue } from "jotai";
 import { AlertTriangle, Settings } from "lucide-react";
+import { useRouter } from "next/navigation";
 import { forwardRef, useEffect, useImperativeHandle, useMemo, useState } from "react";
 import { createPortal } from "react-dom";
 import { statusInboxItemsAtom } from "@/atoms/inbox/status-inbox.atom";
@@ -10,7 +11,6 @@ import {
 	llmPreferencesAtom,
 } from "@/atoms/new-llm-config/new-llm-config-query.atoms";
 import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
-import { searchSpaceSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
 import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog";
@@ -44,8 +44,8 @@ interface ConnectorIndicatorProps {
 
 export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, ConnectorIndicatorProps>(
 	(_props, ref) => {
+		const router = useRouter();
 		const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
-		const setSearchSpaceSettingsDialog = useSetAtom(searchSpaceSettingsDialogAtom);
 		const { data: preferences = {}, isFetching: preferencesLoading } =
 			useAtomValue(llmPreferencesAtom);
 		const { data: globalConfigs = [], isFetching: globalConfigsLoading } =
@@ -98,6 +98,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 			periodicEnabled,
 			frequencyMinutes,
 			enableSummary,
+			enableVisionLlm,
 			allConnectors,
 			viewingAccountsType,
 			viewingMCPList,
@@ -109,6 +110,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 			setPeriodicEnabled,
 			setFrequencyMinutes,
 			setEnableSummary,
+			setEnableVisionLlm,
 			handleOpenChange,
 			handleTabChange,
 			handleScroll,
@@ -216,7 +218,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 					onPointerDownOutside={(e) => {
 						if (pickerOpen) e.preventDefault();
 					}}
-					className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button>svg]:size-5 select-none"
+					className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden ring-0 dark:ring-0 [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-accent [&>button]:hover:text-accent-foreground [&>button>svg]:size-5 select-none"
 				>
 					<DialogTitle className="sr-only">Manage Connectors</DialogTitle>
 					{/* YouTube Crawler View - shown when adding YouTube videos */}
@@ -279,6 +281,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 							periodicEnabled={periodicEnabled}
 							frequencyMinutes={frequencyMinutes}
 							enableSummary={enableSummary}
+							enableVisionLlm={enableVisionLlm}
 							isSaving={isSaving}
 							isDisconnecting={isDisconnecting}
 							isIndexing={indexingConnectorIds.has(editingConnector.id)}
@@ -288,6 +291,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 							onPeriodicEnabledChange={setPeriodicEnabled}
 							onFrequencyChange={setFrequencyMinutes}
 							onEnableSummaryChange={setEnableSummary}
+							onEnableVisionLlmChange={setEnableVisionLlm}
 							onSave={() => {
 								startIndexing(editingConnector.id);
 								handleSaveConnector(() => refreshConnectors());
@@ -336,6 +340,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 							periodicEnabled={periodicEnabled}
 							frequencyMinutes={frequencyMinutes}
 							enableSummary={enableSummary}
+							enableVisionLlm={enableVisionLlm}
 							isStartingIndexing={isStartingIndexing}
 							isFromOAuth={isFromOAuth}
 							onStartDateChange={setStartDate}
@@ -343,6 +348,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 							onPeriodicEnabledChange={setPeriodicEnabled}
 							onFrequencyChange={setFrequencyMinutes}
 							onEnableSummaryChange={setEnableSummary}
+							onEnableVisionLlmChange={setEnableVisionLlm}
 							onConfigChange={setIndexingConnectorConfig}
 							onStartIndexing={() => {
 								if (indexingConfig.connectorId) {
@@ -374,34 +380,32 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 									<div className="px-4 sm:px-12 py-4 sm:py-8 pb-12 sm:pb-16">
 										{/* LLM Configuration Warning */}
 										{!llmConfigLoading && !hasDocumentSummaryLLM && (
-											<Alert
-												variant="destructive"
-												className="mb-6 bg-muted/50 rounded-xl border-destructive/30"
-											>
-												<AlertTriangle className="h-4 w-4" />
-												<AlertTitle>LLM Configuration Required</AlertTitle>
-												<AlertDescription className="mt-2">
-													<p className="mb-3">
-														{isAutoMode && !hasGlobalConfigs
-															? "Auto mode requires a global LLM configuration. Please add one in Settings"
-															: "A Document Summary LLM is required to process uploads, configure one in Settings"}
-													</p>
-													<Button
-														size="sm"
-														variant="outline"
-														onClick={() => {
-															handleOpenChange(false);
-															setSearchSpaceSettingsDialog({
-																open: true,
-																initialTab: "models",
-															});
-														}}
-													>
-														<Settings className="mr-2 h-4 w-4" />
-														Go to Settings
-													</Button>
-												</AlertDescription>
-											</Alert>
+											<div className="mb-6">
+												<Alert variant="destructive">
+													<AlertTriangle />
+													<AlertTitle>LLM Configuration Required</AlertTitle>
+													<AlertDescription>
+														<p>
+															{isAutoMode && !hasGlobalConfigs
+																? "Auto mode requires a global LLM configuration. Please add one in Settings"
+																: "A Document Summary LLM is required to process uploads, configure one in Settings"}
+														</p>
+														<Button
+															size="sm"
+															variant="outline"
+															onClick={() => {
+																handleOpenChange(false);
+																router.push(
+																	`/dashboard/${searchSpaceId}/search-space-settings/models`
+																);
+															}}
+														>
+															<Settings className="mr-2 h-4 w-4" />
+															Go to Settings
+														</Button>
+													</AlertDescription>
+												</Alert>
+											</div>
 										)}
 
 										<TabsContent value="all" className="m-0">
@@ -440,7 +444,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 									</div>
 								</div>
 								{/* Bottom fade shadow */}
-								<div className="absolute bottom-0 left-0 right-0 h-7 bg-linear-to-t from-muted via-muted/80 to-transparent pointer-events-none z-10" />
+								<div className="absolute bottom-0 left-0 right-0 h-7 bg-linear-to-t from-popover via-popover/80 to-transparent pointer-events-none z-10" />
 							</div>
 						</Tabs>
 					)}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/connector-card.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/connector-card.tsx
index d24057b1cb..dadfe32ab7 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/components/connector-card.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/components/connector-card.tsx
@@ -8,6 +8,7 @@ import { Spinner } from "@/components/ui/spinner";
 import { EnumConnectorName } from "@/contracts/enums/connector";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
 import { cn } from "@/lib/utils";
+import { LIVE_CONNECTOR_TYPES } from "../constants/connector-constants";
 import { useConnectorStatus } from "../hooks/use-connector-status";
 import { ConnectorStatusBadge } from "./connector-status-badge";
 
@@ -55,6 +56,7 @@ export const ConnectorCard: FC<ConnectorCardProps> = ({
 	onManage,
 }) => {
 	const isMCP = connectorType === EnumConnectorName.MCP_CONNECTOR;
+	const isLive = !!connectorType && LIVE_CONNECTOR_TYPES.has(connectorType);
 	// Get connector status
 	const { getConnectorStatus, isConnectorEnabled, getConnectorStatusMessage, shouldShowWarnings } =
 		useConnectorStatus();
@@ -79,8 +81,8 @@ export const ConnectorCard: FC<ConnectorCardProps> = ({
 			className={cn(
 				"group relative flex items-center gap-4 p-4 rounded-xl text-left transition-all duration-200 w-full border",
 				status.status === "warning"
-					? "border-yellow-500/30 bg-slate-400/5 dark:bg-white/5 hover:bg-slate-400/10 dark:hover:bg-white/10"
-					: "border-border bg-slate-400/5 dark:bg-white/5 hover:bg-slate-400/10 dark:hover:bg-white/10"
+					? "border-yellow-500/30 bg-slate-400/5 dark:bg-white/5 hover:bg-accent hover:text-accent-foreground"
+					: "border-border bg-slate-400/5 dark:bg-white/5 hover:bg-accent hover:text-accent-foreground"
 			)}
 		>
 			<div
@@ -123,14 +125,14 @@ export const ConnectorCard: FC<ConnectorCardProps> = ({
 							</span>
 						) : (
 							<>
-								<span>{formatDocumentCount(documentCount)}</span>
+								{!isLive && <span>{formatDocumentCount(documentCount)}</span>}
+								{!isLive && accountCount !== undefined && accountCount > 0 && (
+									<span className="text-muted-foreground/50">•</span>
+								)}
 								{accountCount !== undefined && accountCount > 0 && (
-									<>
-										<span className="text-muted-foreground/50">•</span>
-										<span>
-											{accountCount} {accountCount === 1 ? "Account" : "Accounts"}
-										</span>
-									</>
+									<span>
+										{accountCount} {accountCount === 1 ? "Account" : "Accounts"}
+									</span>
 								)}
 							</>
 						)}
@@ -143,9 +145,9 @@ export const ConnectorCard: FC<ConnectorCardProps> = ({
 				size="sm"
 				variant={isConnected ? "secondary" : "default"}
 				className={cn(
-					"relative h-8 text-[11px] px-3 rounded-lg shrink-0 font-medium items-center justify-center",
+					"relative h-8 text-[11px] px-3 shrink-0 font-medium items-center justify-center",
 					isConnected &&
-						"bg-white text-slate-700 hover:bg-slate-50 border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground dark:hover:bg-secondary/80",
+						"bg-white text-slate-700 hover:bg-accent hover:text-accent-foreground border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground",
 					!isConnected && "shadow-xs"
 				)}
 				onClick={isConnected ? onManage : onConnect}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/connector-dialog-header.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/connector-dialog-header.tsx
index 819e11cd37..25ad22523d 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/components/connector-dialog-header.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/components/connector-dialog-header.tsx
@@ -2,6 +2,7 @@
 
 import { Search, X } from "lucide-react";
 import type { FC } from "react";
+import { Button } from "@/components/ui/button";
 import { DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog";
 import { TabsList, TabsTrigger } from "@/components/ui/tabs";
 import { cn } from "@/lib/utils";
@@ -25,7 +26,7 @@ export const ConnectorDialogHeader: FC<ConnectorDialogHeaderProps> = ({
 		<div
 			className={cn(
 				"flex-shrink-0 px-4 sm:px-12 pt-5 sm:pt-10 transition-shadow duration-200 relative z-10",
-				isScrolled && "shadow-xl bg-muted/50 backdrop-blur-md"
+				isScrolled && "bg-popover shadow-xl"
 			)}
 		>
 			<DialogHeader>
@@ -37,7 +38,7 @@ export const ConnectorDialogHeader: FC<ConnectorDialogHeaderProps> = ({
 				</DialogDescription>
 			</DialogHeader>
 
-			<div className="flex flex-col-reverse sm:flex-row sm:items-end justify-between gap-4 sm:gap-8 mt-4 sm:mt-8 border-b border-border/80 dark:border-white/5">
+			<div className="flex flex-col-reverse sm:flex-row sm:items-end justify-between gap-4 sm:gap-8 mt-4 sm:mt-8 border-b border-popover-border">
 				<TabsList className="bg-transparent p-0 gap-4 sm:gap-8 h-auto w-full sm:w-auto justify-center sm:justify-start">
 					<TabsTrigger
 						value="all"
@@ -63,27 +64,29 @@ export const ConnectorDialogHeader: FC<ConnectorDialogHeaderProps> = ({
 
 				<div className="w-full sm:w-72 sm:pb-1">
 					<div className="relative">
-						<Search className="absolute left-3 top-1/2 -translate-y-1/2 size-4 text-gray-500 dark:text-gray-500" />
+						<Search className="absolute left-3 top-1/2 -translate-y-1/2 size-4 text-muted-foreground" />
 						<input
 							type="text"
 							autoComplete="off"
 							placeholder="Search"
 							className={cn(
-								"w-full bg-slate-400/5 dark:bg-white/5 hover:bg-slate-400/10 dark:hover:bg-white/10 focus:bg-slate-400/10 dark:focus:bg-white/10 border border-border rounded-xl pl-9 py-2 text-sm transition-all outline-none placeholder:text-muted-foreground/50",
+								"w-full bg-slate-400/5 dark:bg-white/5 hover:bg-accent hover:text-accent-foreground focus:bg-slate-400/10 dark:focus:bg-white/10 border border-border rounded-xl pl-9 py-2 text-sm transition-all outline-none placeholder:text-muted-foreground/50",
 								searchQuery ? "pr-9" : "pr-4"
 							)}
 							value={searchQuery}
 							onChange={(e) => onSearchChange(e.target.value)}
 						/>
 						{searchQuery && (
-							<button
+							<Button
+								variant="ghost"
+								size="icon"
 								type="button"
 								onClick={() => onSearchChange("")}
-								className="absolute right-3 top-1/2 -translate-y-1/2 size-4 text-gray-500 dark:text-gray-500 hover:text-gray-700 dark:hover:text-gray-300 transition-colors"
+								className="absolute right-1.5 top-1/2 size-7 -translate-y-1/2 text-muted-foreground transition-colors hover:bg-transparent hover:text-accent-foreground"
 								aria-label="Clear search"
 							>
-								<X className="size-4" />
-							</button>
+								<X data-icon="inline-start" />
+							</Button>
 						)}
 					</div>
 				</div>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/connector-warning-banner.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/connector-warning-banner.tsx
index d1de3e37e1..ccd778be4a 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/components/connector-warning-banner.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/components/connector-warning-banner.tsx
@@ -3,6 +3,7 @@
 import { AlertTriangle, X } from "lucide-react";
 import type { FC } from "react";
 import { useState } from "react";
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 interface ConnectorWarningBannerProps {
@@ -42,14 +43,16 @@ export const ConnectorWarningBanner: FC<ConnectorWarningBannerProps> = ({
 				)}
 			</div>
 			{onDismiss && (
-				<button
+				<Button
+					variant="ghost"
+					size="icon"
 					type="button"
 					onClick={handleDismiss}
-					className="shrink-0 p-0.5 rounded hover:bg-yellow-500/20 transition-colors"
+					className="size-6 shrink-0 rounded p-0 transition-colors hover:bg-yellow-500/20"
 					aria-label="Dismiss warning"
 				>
-					<X className="size-3.5 text-yellow-700 dark:text-yellow-300" />
-				</button>
+					<X data-icon="inline-start" className="text-yellow-700 dark:text-yellow-300" />
+				</Button>
 			)}
 		</div>
 	);
diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/date-range-selector.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/date-range-selector.tsx
index 9d7044bb4f..b4aef0e668 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/components/date-range-selector.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/components/date-range-selector.tsx
@@ -136,7 +136,7 @@ export const DateRangeSelector: FC<DateRangeSelectorProps> = ({
 					variant="outline"
 					size="sm"
 					onClick={handleClearDates}
-					className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-slate-400/10"
+					className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-accent hover:text-accent-foreground"
 				>
 					Clear Dates
 				</Button>
@@ -145,7 +145,7 @@ export const DateRangeSelector: FC<DateRangeSelectorProps> = ({
 					variant="outline"
 					size="sm"
 					onClick={handleLast30Days}
-					className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-slate-400/10"
+					className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-accent hover:text-accent-foreground"
 				>
 					Last 30 Days
 				</Button>
@@ -155,7 +155,7 @@ export const DateRangeSelector: FC<DateRangeSelectorProps> = ({
 						variant="outline"
 						size="sm"
 						onClick={handleNext30Days}
-						className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-slate-400/10"
+						className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-accent hover:text-accent-foreground"
 					>
 						Next 30 Days
 					</Button>
@@ -165,7 +165,7 @@ export const DateRangeSelector: FC<DateRangeSelectorProps> = ({
 					variant="outline"
 					size="sm"
 					onClick={handleLastYear}
-					className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-slate-400/10"
+					className="text-xs sm:text-sm bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 hover:bg-accent hover:text-accent-foreground"
 				>
 					Last Year
 				</Button>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx
new file mode 100644
index 0000000000..e5ebdbd067
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx
@@ -0,0 +1,25 @@
+"use client";
+
+import type { FC } from "react";
+import { Switch } from "@/components/ui/switch";
+
+interface VisionLLMConfigProps {
+	enabled: boolean;
+	onEnabledChange: (enabled: boolean) => void;
+}
+
+export const VisionLLMConfig: FC<VisionLLMConfigProps> = ({ enabled, onEnabledChange }) => {
+	return (
+		<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
+			<div className="flex items-center justify-between">
+				<div className="space-y-1">
+					<h3 className="font-medium text-sm sm:text-base">Enable Vision LLM</h3>
+					<p className="text-xs sm:text-sm text-muted-foreground">
+						Describes images using AI vision (costly, slower)
+					</p>
+				</div>
+				<Switch checked={enabled} onCheckedChange={onEnabledChange} />
+			</div>
+		</div>
+	);
+};
diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
index f62758256a..b4e85eab05 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
+++ b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
@@ -9,6 +9,16 @@
 			"enabled": true,
 			"status": "warning",
 			"statusMessage": "Some requests may be blocked if not using Firecrawl."
+		},
+		"JIRA_CONNECTOR": {
+			"enabled": false,
+			"status": "maintenance",
+			"statusMessage": "Rework in progress."
+		},
+		"CONFLUENCE_CONNECTOR": {
+			"enabled": false,
+			"status": "maintenance",
+			"statusMessage": "Rework in progress."
 		}
 	},
 	"globalSettings": {
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/baidu-search-api-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/baidu-search-api-connect-form.tsx
index a4f145e06e..726371f99f 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/baidu-search-api-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/baidu-search-api-connect-form.tsx
@@ -70,20 +70,22 @@ export const BaiduSearchApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSu
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">API Key Required</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
-					You'll need a Baidu AppBuilder API key to use this connector. You can get one by signing
-					up at{" "}
-					<a
-						href="https://qianfan.cloud.baidu.com/"
-						target="_blank"
-						rel="noopener noreferrer"
-						className="font-medium underline underline-offset-4"
-					>
-						qianfan.cloud.baidu.com
-					</a>
+			<Alert>
+				<Info />
+				<AlertTitle>API Key Required</AlertTitle>
+				<AlertDescription>
+					<p>
+						You'll need a Baidu AppBuilder API key to use this connector. You can get one by signing
+						up at{" "}
+						<a
+							href="https://qianfan.cloud.baidu.com/"
+							target="_blank"
+							rel="noopener noreferrer"
+							className="font-medium underline underline-offset-4"
+						>
+							qianfan.cloud.baidu.com
+						</a>
+					</p>
 				</AlertDescription>
 			</Alert>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/bookstack-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/bookstack-connect-form.tsx
index b2a6b0b25c..b885df39c2 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/bookstack-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/bookstack-connect-form.tsx
@@ -96,10 +96,10 @@ export const BookStackConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitt
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">API Token Required</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
+			<Alert>
+				<Info />
+				<AlertTitle>API Token Required</AlertTitle>
+				<AlertDescription>
 					You'll need a BookStack API Token to use this connector. You can create one from your
 					BookStack instance settings.
 				</AlertDescription>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/elasticsearch-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/elasticsearch-connect-form.tsx
index 352a8f3bca..943b2361d0 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/elasticsearch-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/elasticsearch-connect-form.tsx
@@ -172,10 +172,10 @@ export const ElasticsearchConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSub
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">API Key Required</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
+			<Alert>
+				<Info />
+				<AlertTitle>API Key Required</AlertTitle>
+				<AlertDescription>
 					Enter your Elasticsearch cluster endpoint URL and authentication credentials to connect.
 				</AlertDescription>
 			</Alert>
@@ -428,10 +428,10 @@ export const ElasticsearchConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSub
 							</div>
 						)}
 
-						<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
-							<Info className="h-3 w-3 sm:h-4 sm:w-4" />
-							<AlertTitle className="text-[10px] sm:text-xs">Index Selection Tips</AlertTitle>
-							<AlertDescription className="text-[9px] sm:text-[10px] mt-2">
+						<Alert>
+							<Info />
+							<AlertTitle>Index Selection Tips</AlertTitle>
+							<AlertDescription>
 								<ul className="list-disc pl-4 space-y-1">
 									<li>Use wildcards like "logs-*" to match multiple indices</li>
 									<li>Separate multiple indices with commas</li>
@@ -643,231 +643,6 @@ export const ElasticsearchConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSub
 					</ul>
 				</div>
 			)}
-
-			{/* Documentation Section */}
-			<Accordion
-				type="single"
-				collapsible
-				className="w-full border border-border rounded-xl bg-slate-400/5 dark:bg-white/5"
-			>
-				<AccordionItem value="documentation" className="border-0">
-					<AccordionTrigger className="text-sm sm:text-base font-medium px-3 sm:px-6 no-underline hover:no-underline">
-						Documentation
-					</AccordionTrigger>
-					<AccordionContent className="px-3 sm:px-6 pb-3 sm:pb-6 space-y-6">
-						<div>
-							<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
-							<p className="text-[10px] sm:text-xs text-muted-foreground">
-								The Elasticsearch connector allows you to search and retrieve documents from your
-								Elasticsearch cluster. Configure connection details, select specific indices, and
-								set search parameters to make your existing data searchable within SurfSense.
-							</p>
-						</div>
-
-						<div className="space-y-4">
-							<div>
-								<h3 className="text-sm sm:text-base font-semibold mb-2">Connection Setup</h3>
-								<div className="space-y-4 sm:space-y-6">
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											Step 1: Get your Elasticsearch endpoint
-										</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
-											You'll need the endpoint URL for your Elasticsearch cluster. This typically
-											looks like:
-										</p>
-										<ul className="list-disc pl-5 space-y-1 text-[10px] sm:text-xs text-muted-foreground mb-4">
-											<li>
-												Cloud:{" "}
-												<code className="bg-muted px-1 py-0.5 rounded">
-													https://your-cluster.es.region.aws.com:443
-												</code>
-											</li>
-											<li>
-												Self-hosted:{" "}
-												<code className="bg-muted px-1 py-0.5 rounded">
-													https://elasticsearch.example.com:9200
-												</code>
-											</li>
-										</ul>
-									</div>
-
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											Step 2: Configure authentication
-										</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
-											Elasticsearch requires authentication. You can use either:
-										</p>
-										<ul className="list-disc pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
-											<li>
-												<strong>API Key:</strong> A base64-encoded API key. You can create one in
-												Elasticsearch by running:
-												<pre className="bg-muted p-2 rounded mt-1 text-[9px] overflow-x-auto">
-													<code>POST /_security/api_key</code>
-												</pre>
-											</li>
-											<li>
-												<strong>Username & Password:</strong> Basic authentication using your
-												Elasticsearch username and password.
-											</li>
-										</ul>
-									</div>
-
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											Step 3: Select indices
-										</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
-											Specify which indices to search. You can:
-										</p>
-										<ul className="list-disc pl-5 space-y-1 text-[10px] sm:text-xs text-muted-foreground">
-											<li>
-												Use wildcards: <code className="bg-muted px-1 py-0.5 rounded">logs-*</code>{" "}
-												to match multiple indices
-											</li>
-											<li>
-												List specific indices:{" "}
-												<code className="bg-muted px-1 py-0.5 rounded">
-													logs-2024, documents-2024
-												</code>
-											</li>
-											<li>
-												Leave empty to search all accessible indices (not recommended for
-												performance)
-											</li>
-										</ul>
-									</div>
-								</div>
-							</div>
-						</div>
-
-						<div className="space-y-4">
-							<div>
-								<h3 className="text-sm sm:text-base font-semibold mb-2">Advanced Configuration</h3>
-								<div className="space-y-4">
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">Search Query</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
-											The default query used for searches. Use{" "}
-											<code className="bg-muted px-1 py-0.5 rounded">*</code> to match all
-											documents, or specify a more complex Elasticsearch query.
-										</p>
-									</div>
-
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">Search Fields</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
-											Limit searches to specific fields for better performance. Common fields
-											include:
-										</p>
-										<ul className="list-disc pl-5 space-y-1 text-[10px] sm:text-xs text-muted-foreground">
-											<li>
-												<code className="bg-muted px-1 py-0.5 rounded">title</code> - Document
-												titles
-											</li>
-											<li>
-												<code className="bg-muted px-1 py-0.5 rounded">content</code> - Main content
-											</li>
-											<li>
-												<code className="bg-muted px-1 py-0.5 rounded">description</code> -
-												Descriptions
-											</li>
-										</ul>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mt-2">
-											Leave empty to search all fields in your documents.
-										</p>
-									</div>
-
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">Maximum Documents</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground">
-											Set a limit on the number of documents retrieved per search (1-10,000). This
-											helps control response times and resource usage. Leave empty to use
-											Elasticsearch's default limit.
-										</p>
-									</div>
-								</div>
-							</div>
-						</div>
-
-						<div className="space-y-4">
-							<div>
-								<h3 className="text-sm sm:text-base font-semibold mb-2">Troubleshooting</h3>
-								<div className="space-y-4">
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">Connection Issues</h4>
-										<ul className="list-disc pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
-											<li>
-												<strong>Invalid URL:</strong> Ensure your endpoint URL includes the protocol
-												(https://) and port number if required.
-											</li>
-											<li>
-												<strong>SSL/TLS Errors:</strong> Verify that your cluster uses HTTPS and the
-												certificate is valid. Self-signed certificates may require additional
-												configuration.
-											</li>
-											<li>
-												<strong>Connection Timeout:</strong> Check your network connectivity and
-												firewall settings. Ensure the Elasticsearch cluster is accessible from
-												SurfSense servers.
-											</li>
-										</ul>
-									</div>
-
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											Authentication Issues
-										</h4>
-										<ul className="list-disc pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
-											<li>
-												<strong>Invalid Credentials:</strong> Double-check your username/password or
-												API key. API keys must be base64-encoded.
-											</li>
-											<li>
-												<strong>Permission Denied:</strong> Ensure your API key or user account has
-												read permissions for the indices you want to search.
-											</li>
-											<li>
-												<strong>API Key Format:</strong> Elasticsearch API keys are typically
-												base64-encoded strings. Make sure you're using the full key value.
-											</li>
-										</ul>
-									</div>
-
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">Search Issues</h4>
-										<ul className="list-disc pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
-											<li>
-												<strong>No Results:</strong> Verify that your index selection matches
-												existing indices. Use wildcards carefully.
-											</li>
-											<li>
-												<strong>Slow Searches:</strong> Limit the number of indices or use specific
-												index names instead of wildcards. Reduce the maximum documents limit.
-											</li>
-											<li>
-												<strong>Field Not Found:</strong> Ensure the search fields you specify
-												actually exist in your Elasticsearch documents.
-											</li>
-										</ul>
-									</div>
-
-									<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mt-4">
-										<Info className="h-3 w-3 sm:h-4 sm:w-4" />
-										<AlertTitle className="text-[10px] sm:text-xs">Need More Help?</AlertTitle>
-										<AlertDescription className="text-[9px] sm:text-[10px]">
-											If you continue to experience issues, check your Elasticsearch cluster logs
-											and ensure your cluster version is compatible. For Elasticsearch Cloud
-											deployments, verify your access policies and IP allowlists.
-										</AlertDescription>
-									</Alert>
-								</div>
-							</div>
-						</div>
-					</AccordionContent>
-				</AccordionItem>
-			</Accordion>
 		</div>
 	);
 };
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
index 68d5ef5fbe..56609dbdd1 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@@ -105,20 +105,23 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
-					A GitHub PAT is only required for private repositories. Public repos work without a token.{" "}
-					<a
-						href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
-						target="_blank"
-						rel="noopener noreferrer"
-						className="font-medium underline underline-offset-4 inline-flex items-center gap-1.5"
-					>
-						Get your token
-						<ExternalLink className="h-3 w-3 sm:h-4 sm:w-4" />
-					</a>{" "}
+			<Alert>
+				<Info />
+				<AlertTitle>Personal Access Token (Optional)</AlertTitle>
+				<AlertDescription>
+					<p>
+						A GitHub PAT is only required for private repositories. Public repos work without a
+						token.{" "}
+						<a
+							href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
+							target="_blank"
+							rel="noopener noreferrer"
+							className="font-medium underline underline-offset-4 inline-flex items-center gap-1.5"
+						>
+							Get your token
+							<ExternalLink className="h-3 w-3 sm:h-4 sm:w-4" />
+						</a>
+					</p>
 				</AlertDescription>
 			</Alert>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/linkup-api-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/linkup-api-connect-form.tsx
index b9771e95a0..46ec35b1fa 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/linkup-api-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/linkup-api-connect-form.tsx
@@ -70,19 +70,21 @@ export const LinkupApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitt
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">API Key Required</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
-					You'll need a Linkup API key to use this connector. You can get one by signing up at{" "}
-					<a
-						href="https://linkup.so"
-						target="_blank"
-						rel="noopener noreferrer"
-						className="font-medium underline underline-offset-4"
-					>
-						linkup.so
-					</a>
+			<Alert>
+				<Info />
+				<AlertTitle>API Key Required</AlertTitle>
+				<AlertDescription>
+					<p>
+						You'll need a Linkup API key to use this connector. You can get one by signing up at{" "}
+						<a
+							href="https://linkup.so"
+							target="_blank"
+							rel="noopener noreferrer"
+							className="font-medium underline underline-offset-4"
+						>
+							linkup.so
+						</a>
+					</p>
 				</AlertDescription>
 			</Alert>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/luma-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/luma-connect-form.tsx
index 13cc7efe11..1c45e2dd7e 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/luma-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/luma-connect-form.tsx
@@ -88,19 +88,21 @@ export const LumaConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">API Key Required</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
-					You'll need a Luma API Key to use this connector. You can create one from{" "}
-					<a
-						href="https://lu.ma/api"
-						target="_blank"
-						rel="noopener noreferrer"
-						className="font-medium underline underline-offset-4"
-					>
-						Luma API Settings
-					</a>
+			<Alert>
+				<Info />
+				<AlertTitle>API Key Required</AlertTitle>
+				<AlertDescription>
+					<p>
+						You'll need a Luma API Key to use this connector. You can create one from{" "}
+						<a
+							href="https://lu.ma/api"
+							target="_blank"
+							rel="noopener noreferrer"
+							className="font-medium underline underline-offset-4"
+						>
+							Luma API Settings
+						</a>
+					</p>
 				</AlertDescription>
 			</Alert>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/mcp-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/mcp-connect-form.tsx
index 58d3651285..92df6f5a1c 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/mcp-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/mcp-connect-form.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { CheckCircle2, ChevronDown, ChevronUp, Server, XCircle } from "lucide-react";
+import { CheckCircle2, ChevronDown, ChevronUp, Loader2, Server, XCircle } from "lucide-react";
 import { type FC, useRef, useState } from "react";
 import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
@@ -155,7 +155,7 @@ export const MCPConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting })
 										type="button"
 										variant="ghost"
 										size="sm"
-										className="h-6 px-2 text-xs text-muted-foreground hover:text-foreground"
+										className="h-6 px-2 text-xs text-muted-foreground hover:text-accent-foreground"
 										onClick={() => handleConfigChange(DEFAULT_STDIO_CONFIG)}
 									>
 										Local Example
@@ -164,7 +164,7 @@ export const MCPConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting })
 										type="button"
 										variant="ghost"
 										size="sm"
-										className="h-6 px-2 text-xs text-muted-foreground hover:text-foreground"
+										className="h-6 px-2 text-xs text-muted-foreground hover:text-accent-foreground"
 										onClick={() => handleConfigChange(DEFAULT_HTTP_CONFIG)}
 									>
 										Remote Example
@@ -210,9 +210,16 @@ export const MCPConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting })
 							onClick={handleTestConnection}
 							disabled={isTesting}
 							variant="secondary"
-							className="w-full h-8 text-[13px] px-3 rounded-lg font-medium bg-white text-slate-700 hover:bg-slate-50 border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground dark:hover:bg-secondary/80"
+							className="w-full h-8 text-[13px] px-3 font-medium bg-white text-slate-700 hover:bg-accent hover:text-accent-foreground border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground"
 						>
-							{isTesting ? "Testing Connection" : "Test Connection"}
+							{isTesting ? (
+								<>
+									<Loader2 className="h-3.5 w-3.5 animate-spin" />
+									Testing Connection...
+								</>
+							) : (
+								"Test Connection"
+							)}
 						</Button>
 					</div>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx
index 08c1dd30c4..695e97d7b2 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/obsidian-connect-form.tsx
@@ -1,311 +1,174 @@
 "use client";
 
-import { zodResolver } from "@hookform/resolvers/zod";
-import { Info } from "lucide-react";
+import { Check, Copy, Info } from "lucide-react";
 import type { FC } from "react";
-import { useRef, useState } from "react";
-import { useForm } from "react-hook-form";
-import * as z from "zod";
 import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
-import {
-	Form,
-	FormControl,
-	FormDescription,
-	FormField,
-	FormItem,
-	FormLabel,
-	FormMessage,
-} from "@/components/ui/form";
-import { Input } from "@/components/ui/input";
-import { Label } from "@/components/ui/label";
-import {
-	Select,
-	SelectContent,
-	SelectItem,
-	SelectTrigger,
-	SelectValue,
-} from "@/components/ui/select";
-import { Switch } from "@/components/ui/switch";
+import { Button } from "@/components/ui/button";
 import { EnumConnectorName } from "@/contracts/enums/connector";
+import { useApiKey } from "@/hooks/use-api-key";
 import { getConnectorBenefits } from "../connector-benefits";
 import type { ConnectFormProps } from "../index";
 
-const obsidianConnectorFormSchema = z.object({
-	name: z.string().min(3, {
-		message: "Connector name must be at least 3 characters.",
-	}),
-	vault_path: z.string().min(1, {
-		message: "Vault path is required.",
-	}),
-	vault_name: z.string().min(1, {
-		message: "Vault name is required.",
-	}),
-	exclude_folders: z.string().optional(),
-	include_attachments: z.boolean(),
-});
-
-type ObsidianConnectorFormValues = z.infer<typeof obsidianConnectorFormSchema>;
-
-export const ObsidianConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
-	const isSubmittingRef = useRef(false);
-	const [periodicEnabled, setPeriodicEnabled] = useState(true);
-	const [frequencyMinutes, setFrequencyMinutes] = useState("60");
-	const form = useForm<ObsidianConnectorFormValues>({
-		resolver: zodResolver(obsidianConnectorFormSchema),
-		defaultValues: {
-			name: "Obsidian Vault",
-			vault_path: "",
-			vault_name: "",
-			exclude_folders: ".obsidian,.trash",
-			include_attachments: false,
-		},
-	});
-
-	const handleSubmit = async (values: ObsidianConnectorFormValues) => {
-		// Prevent multiple submissions
-		if (isSubmittingRef.current || isSubmitting) {
-			return;
-		}
-
-		isSubmittingRef.current = true;
-		try {
-			// Parse exclude_folders into an array
-			const excludeFolders = values.exclude_folders
-				? values.exclude_folders
-						.split(",")
-						.map((f) => f.trim())
-						.filter(Boolean)
-				: [".obsidian", ".trash"];
-
-			await onSubmit({
-				name: values.name,
-				connector_type: EnumConnectorName.OBSIDIAN_CONNECTOR,
-				config: {
-					vault_path: values.vault_path,
-					vault_name: values.vault_name,
-					exclude_folders: excludeFolders,
-					include_attachments: values.include_attachments,
-				},
-				is_indexable: true,
-				is_active: true,
-				last_indexed_at: null,
-				periodic_indexing_enabled: periodicEnabled,
-				indexing_frequency_minutes: periodicEnabled ? Number.parseInt(frequencyMinutes, 10) : null,
-				next_scheduled_at: null,
-				periodicEnabled,
-				frequencyMinutes,
-			});
-		} finally {
-			isSubmittingRef.current = false;
-		}
+const PLUGIN_RELEASES_URL =
+	"https://github.com/MODSetter/SurfSense/releases?q=obsidian&expanded=true";
+
+/**
+ * Obsidian connect form for the plugin-only architecture.
+ *
+ * The legacy `vault_path` form was removed because it only worked on
+ * self-hosted with a server-side bind mount and broke for everyone else.
+ * The plugin pushes data over HTTPS so this UI is purely instructional —
+ * there is no backend create call here. The connector row is created
+ * server-side the first time the plugin calls `POST /obsidian/connect`.
+ *
+ * The footer "Connect" button in `ConnectorConnectView` triggers this
+ * form's submit; we just close the dialog (`onBack()`) since there's
+ * nothing to validate or persist from this side.
+ */
+export const ObsidianConnectForm: FC<ConnectFormProps> = ({ onBack }) => {
+	const { apiKey, isLoading, copied, copyToClipboard } = useApiKey();
+
+	const handleSubmit = (event: React.FormEvent<HTMLFormElement>) => {
+		event.preventDefault();
+		onBack();
 	};
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-purple-500/10 dark:bg-purple-500/10 border-purple-500/30 p-2 sm:p-3">
-				<Info className="size-4 shrink-0 text-purple-500" />
-				<AlertTitle className="text-xs sm:text-sm">Self-Hosted Only</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
-					This connector requires direct file system access and only works with self-hosted
-					SurfSense installations.
+			{/* Form is intentionally empty so the footer Connect button is a no-op
+			    that just closes the dialog (see component-level docstring). */}
+			<form id="obsidian-connect-form" onSubmit={handleSubmit} />
+
+			<Alert>
+				<Info />
+				<AlertTitle>Plugin-based sync</AlertTitle>
+				<AlertDescription>
+					SurfSense now syncs Obsidian via an official plugin that runs inside Obsidian itself.
+					Works on desktop and mobile, in cloud and self-hosted deployments.
 				</AlertDescription>
 			</Alert>
 
-			<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
-				<Form {...form}>
-					<form
-						id="obsidian-connect-form"
-						onSubmit={form.handleSubmit(handleSubmit)}
-						className="space-y-4 sm:space-y-6"
-					>
-						<FormField
-							control={form.control}
-							name="name"
-							render={({ field }) => (
-								<FormItem>
-									<FormLabel className="text-xs sm:text-sm">Connector Name</FormLabel>
-									<FormControl>
-										<Input
-											placeholder="My Obsidian Vault"
-											className="h-8 sm:h-10 px-2 sm:px-3 text-xs sm:text-sm border-slate-400/20 focus-visible:border-slate-400/40"
-											disabled={isSubmitting}
-											{...field}
-										/>
-									</FormControl>
-									<FormDescription className="text-[10px] sm:text-xs">
-										A friendly name to identify this connector.
-									</FormDescription>
-									<FormMessage />
-								</FormItem>
-							)}
-						/>
-
-						<FormField
-							control={form.control}
-							name="vault_path"
-							render={({ field }) => (
-								<FormItem>
-									<FormLabel className="text-xs sm:text-sm">Vault Path</FormLabel>
-									<FormControl>
-										<Input
-											placeholder="/path/to/your/obsidian/vault"
-											className="h-8 sm:h-10 px-2 sm:px-3 text-xs sm:text-sm border-slate-400/20 focus-visible:border-slate-400/40 font-mono"
-											disabled={isSubmitting}
-											{...field}
-										/>
-									</FormControl>
-									<FormDescription className="text-[10px] sm:text-xs">
-										The absolute path to your Obsidian vault on the server. This must be accessible
-										from the SurfSense backend.
-									</FormDescription>
-									<FormMessage />
-								</FormItem>
-							)}
-						/>
-
-						<FormField
-							control={form.control}
-							name="vault_name"
-							render={({ field }) => (
-								<FormItem>
-									<FormLabel className="text-xs sm:text-sm">Vault Name</FormLabel>
-									<FormControl>
-										<Input
-											placeholder="My Knowledge Base"
-											className="h-8 sm:h-10 px-2 sm:px-3 text-xs sm:text-sm border-slate-400/20 focus-visible:border-slate-400/40"
-											disabled={isSubmitting}
-											{...field}
-										/>
-									</FormControl>
-									<FormDescription className="text-[10px] sm:text-xs">
-										A display name for your vault. This will be used in search results.
-									</FormDescription>
-									<FormMessage />
-								</FormItem>
-							)}
-						/>
-
-						<FormField
-							control={form.control}
-							name="exclude_folders"
-							render={({ field }) => (
-								<FormItem>
-									<FormLabel className="text-xs sm:text-sm">Exclude Folders</FormLabel>
-									<FormControl>
-										<Input
-											placeholder=".obsidian,.trash,templates"
-											className="h-8 sm:h-10 px-2 sm:px-3 text-xs sm:text-sm border-slate-400/20 focus-visible:border-slate-400/40 font-mono"
-											disabled={isSubmitting}
-											{...field}
-										/>
-									</FormControl>
-									<FormDescription className="text-[10px] sm:text-xs">
-										Comma-separated list of folder names to exclude from indexing.
-									</FormDescription>
-									<FormMessage />
-								</FormItem>
-							)}
-						/>
-
-						<FormField
-							control={form.control}
-							name="include_attachments"
-							render={({ field }) => (
-								<FormItem className="flex flex-row items-center justify-between rounded-lg border border-slate-400/20 p-3">
-									<div className="space-y-0.5">
-										<FormLabel className="text-xs sm:text-sm">Include Attachments</FormLabel>
-										<FormDescription className="text-[10px] sm:text-xs">
-											Index attachment folders and embedded files (images, PDFs, etc.)
-										</FormDescription>
-									</div>
-									<FormControl>
-										<Switch
-											checked={field.value}
-											onCheckedChange={field.onChange}
-											disabled={isSubmitting}
-										/>
-									</FormControl>
-								</FormItem>
-							)}
-						/>
-
-						{/* Indexing Configuration */}
-						<div className="space-y-4 pt-4 border-t border-slate-400/20">
-							<h3 className="text-sm sm:text-base font-medium">Indexing Configuration</h3>
-
-							{/* Periodic Sync Config */}
-							<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
-								<div className="flex items-center justify-between">
-									<div className="space-y-1">
-										<h3 className="font-medium text-sm sm:text-base">Enable Periodic Sync</h3>
-										<p className="text-xs sm:text-sm text-muted-foreground">
-											Automatically re-index at regular intervals
-										</p>
-									</div>
-									<Switch
-										checked={periodicEnabled}
-										onCheckedChange={setPeriodicEnabled}
-										disabled={isSubmitting}
-									/>
+			<section className="rounded-xl border border-border bg-slate-400/5 p-3 sm:p-6 dark:bg-white/5">
+				<div className="space-y-5 sm:space-y-6">
+					{/* Step 1 — Install plugin */}
+					<article>
+						<header className="mb-3 flex items-center gap-2">
+							<div className="flex size-7 items-center justify-center rounded-md border border-slate-400/30 text-xs font-medium">
+								1
+							</div>
+							<h3 className="text-sm font-medium sm:text-base">Install the plugin</h3>
+						</header>
+						<p className="mb-3 text-[11px] text-muted-foreground sm:text-xs">
+							Grab the latest SurfSense plugin release. Once it's in the community store, you'll
+							also be able to install it from{" "}
+							<span className="font-medium">Settings → Community plugins</span> inside Obsidian.
+						</p>
+						<a
+							href={PLUGIN_RELEASES_URL}
+							target="_blank"
+							rel="noopener noreferrer"
+							className="inline-flex"
+						>
+							<Button
+								type="button"
+								variant="secondary"
+								size="sm"
+								className="gap-2 text-xs sm:text-sm"
+							>
+								Open plugin releases
+							</Button>
+						</a>
+					</article>
+
+					<div className="h-px bg-border/60" />
+
+					{/* Step 2 — Copy API key */}
+					<article>
+						<header className="mb-3 flex items-center gap-2">
+							<div className="flex size-7 items-center justify-center rounded-md border border-slate-400/30 text-xs font-medium">
+								2
+							</div>
+							<h3 className="text-sm font-medium sm:text-base">Copy your API key</h3>
+						</header>
+						<p className="mb-3 text-[11px] text-muted-foreground sm:text-xs">
+							Paste this into the plugin's <span className="font-medium">API token</span> setting.
+							The token expires after 24 hours. Long-lived personal access tokens are coming in a
+							future release.
+						</p>
+
+						{isLoading ? (
+							<div className="h-10 w-full animate-pulse rounded-md border border-border/60 bg-muted/30" />
+						) : apiKey ? (
+							<div className="flex items-center gap-2 rounded-md border border-border/60 bg-muted/30 px-2.5 py-1.5">
+								<div className="min-w-0 flex-1 overflow-x-auto scrollbar-hide">
+									<p className="cursor-text select-all whitespace-nowrap font-mono text-[10px] text-muted-foreground">
+										{apiKey}
+									</p>
 								</div>
-
-								{periodicEnabled && (
-									<div className="mt-4 pt-4 border-t border-slate-400/20 space-y-3">
-										<div className="space-y-2">
-											<Label htmlFor="frequency" className="text-xs sm:text-sm">
-												Sync Frequency
-											</Label>
-											<Select
-												value={frequencyMinutes}
-												onValueChange={setFrequencyMinutes}
-												disabled={isSubmitting}
-											>
-												<SelectTrigger
-													id="frequency"
-													className="w-full bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 text-xs sm:text-sm"
-												>
-													<SelectValue placeholder="Select frequency" />
-												</SelectTrigger>
-												<SelectContent className="z-100">
-													<SelectItem value="5" className="text-xs sm:text-sm">
-														Every 5 minutes
-													</SelectItem>
-													<SelectItem value="15" className="text-xs sm:text-sm">
-														Every 15 minutes
-													</SelectItem>
-													<SelectItem value="60" className="text-xs sm:text-sm">
-														Every hour
-													</SelectItem>
-													<SelectItem value="360" className="text-xs sm:text-sm">
-														Every 6 hours
-													</SelectItem>
-													<SelectItem value="720" className="text-xs sm:text-sm">
-														Every 12 hours
-													</SelectItem>
-													<SelectItem value="1440" className="text-xs sm:text-sm">
-														Daily
-													</SelectItem>
-													<SelectItem value="10080" className="text-xs sm:text-sm">
-														Weekly
-													</SelectItem>
-												</SelectContent>
-											</Select>
-										</div>
-									</div>
-								)}
+								<Button
+									type="button"
+									variant="ghost"
+									size="icon"
+									onClick={copyToClipboard}
+									className="size-7 shrink-0 text-muted-foreground hover:text-accent-foreground"
+									aria-label={copied ? "Copied" : "Copy API key"}
+								>
+									{copied ? (
+										<Check className="size-3.5 text-green-500" />
+									) : (
+										<Copy className="size-3.5" />
+									)}
+								</Button>
+							</div>
+						) : (
+							<p className="text-center text-xs text-muted-foreground/60">
+								No API key available — try refreshing the page.
+							</p>
+						)}
+					</article>
+
+					<div className="h-px bg-border/60" />
+
+					{/* Step 3 — Server URL */}
+					<article>
+						<header className="mb-3 flex items-center gap-2">
+							<div className="flex size-7 items-center justify-center rounded-md border border-slate-400/30 text-xs font-medium">
+								3
 							</div>
-						</div>
-					</form>
-				</Form>
-			</div>
+							<h3 className="text-sm font-medium sm:text-base">Point the plugin at this server</h3>
+						</header>
+						<p className="text-[11px] text-muted-foreground sm:text-xs">
+							For SurfSense Cloud, use the default{" "}
+							<span className="font-medium">surfsense.com</span>. If you are self-hosting, set the
+							plugin's <span className="font-medium">Server URL</span> to your frontend domain.
+						</p>
+					</article>
+
+					<div className="h-px bg-border/60" />
+
+					{/* Step 4 — Pick search space */}
+					<article>
+						<header className="mb-3 flex items-center gap-2">
+							<div className="flex size-7 items-center justify-center rounded-md border border-slate-400/30 text-xs font-medium">
+								4
+							</div>
+							<h3 className="text-sm font-medium sm:text-base">Pick this search space</h3>
+						</header>
+						<p className="text-[11px] text-muted-foreground sm:text-xs">
+							In the plugin's <span className="font-medium">Search space</span> setting, choose the
+							search space you want this vault to sync into. The connector will appear here
+							automatically once the plugin makes its first sync.
+						</p>
+					</article>
+				</div>
+			</section>
 
-			{/* What you get section */}
 			{getConnectorBenefits(EnumConnectorName.OBSIDIAN_CONNECTOR) && (
-				<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 px-3 sm:px-6 py-4 space-y-2">
-					<h4 className="text-xs sm:text-sm font-medium">
+				<div className="space-y-2 rounded-xl border border-border bg-slate-400/5 px-3 py-4 sm:px-6 dark:bg-white/5">
+					<h4 className="text-xs font-medium sm:text-sm">
 						What you get with Obsidian integration:
 					</h4>
-					<ul className="list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
+					<ul className="list-disc space-y-1 pl-5 text-[10px] text-muted-foreground sm:text-xs">
 						{getConnectorBenefits(EnumConnectorName.OBSIDIAN_CONNECTOR)?.map((benefit) => (
 							<li key={benefit}>{benefit}</li>
 						))}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/searxng-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/searxng-connect-form.tsx
index 5ff54fb9e7..727a1ef3f5 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/searxng-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/searxng-connect-form.tsx
@@ -123,20 +123,22 @@ export const SearxngConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmittin
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">SearxNG Instance Required</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
-					You need access to a running SearxNG instance. Refer to the{" "}
-					<a
-						href="https://docs.searxng.org/admin/installation-docker.html"
-						target="_blank"
-						rel="noopener noreferrer"
-						className="font-medium underline underline-offset-4"
-					>
-						SearxNG installation guide
-					</a>{" "}
-					for setup instructions. If your instance requires an API key, include it below.
+			<Alert>
+				<Info />
+				<AlertTitle>SearxNG Instance Required</AlertTitle>
+				<AlertDescription>
+					<p>
+						You need access to a running SearxNG instance. Refer to the{" "}
+						<a
+							href="https://docs.searxng.org/admin/installation-docker.html"
+							target="_blank"
+							rel="noopener noreferrer"
+							className="font-medium underline underline-offset-4"
+						>
+							SearxNG installation guide
+						</a>{" "}
+						for setup instructions. If your instance requires an API key, include it below.
+					</p>
 				</AlertDescription>
 			</Alert>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/tavily-api-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/tavily-api-connect-form.tsx
index 57d183d44c..11e5d8a03a 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/tavily-api-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/tavily-api-connect-form.tsx
@@ -70,19 +70,21 @@ export const TavilyApiConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitt
 
 	return (
 		<div className="space-y-6 pb-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertTitle className="text-xs sm:text-sm">API Key Required</AlertTitle>
-				<AlertDescription className="text-[10px] sm:text-xs">
-					You'll need a Tavily API key to use this connector. You can get one by signing up at{" "}
-					<a
-						href="https://tavily.com"
-						target="_blank"
-						rel="noopener noreferrer"
-						className="font-medium underline underline-offset-4"
-					>
-						tavily.com
-					</a>
+			<Alert>
+				<Info />
+				<AlertTitle>API Key Required</AlertTitle>
+				<AlertDescription>
+					<p>
+						You'll need a Tavily API key to use this connector. You can get one by signing up at{" "}
+						<a
+							href="https://tavily.com"
+							target="_blank"
+							rel="noopener noreferrer"
+							className="font-medium underline underline-offset-4"
+						>
+							tavily.com
+						</a>
+					</p>
 				</AlertDescription>
 			</Alert>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts
index 0dc093100c..f4883fa36a 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts
@@ -104,11 +104,11 @@ export function getConnectorBenefits(connectorType: string): string[] | null {
 			"No manual indexing required - meetings are added automatically",
 		],
 		OBSIDIAN_CONNECTOR: [
-			"Search through all your Obsidian notes and knowledge base",
-			"Access note content with YAML frontmatter metadata preserved",
-			"Wiki-style links ([[note]]) and #tags are indexed",
-			"Connect your personal knowledge base directly to your search space",
-			"Incremental sync - only changed files are re-indexed",
+			"Search through all of your Obsidian notes",
+			"Realtime sync as you create, edit, rename, or delete notes",
+			"YAML frontmatter, [[wiki links]], and #tags are preserved and indexed",
+			"Open any chat citation straight back in Obsidian via deep links",
+			"Each device is identifiable, so you can revoke a vault from one machine",
 			"Full support for your vault's folder structure",
 		],
 	};
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx
index 268ab0f98b..af818671ae 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx
@@ -166,10 +166,10 @@ export const CirclebackConfig: FC<CirclebackConfigProps> = ({ connector, onNameC
 				)}
 
 				{webhookInfo && (
-					<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
-						<Info className="h-3 w-3 sm:h-4 sm:w-4" />
-						<AlertTitle className="text-xs sm:text-sm">Configuration Instructions</AlertTitle>
-						<AlertDescription className="text-[10px] sm:text-xs mt-1">
+					<Alert>
+						<Info />
+						<AlertTitle>Configuration Instructions</AlertTitle>
+						<AlertDescription>
 							Configure this URL in Circleback Settings → Automations → Create automation → Send
 							webhook request. The webhook will automatically send meeting notes, transcripts, and
 							action items to this search space.
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/clickup-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/clickup-config.tsx
index 9d203a33b1..f5f75bbe85 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/clickup-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/clickup-config.tsx
@@ -3,6 +3,7 @@
 import { Info, KeyRound } from "lucide-react";
 import type { FC } from "react";
 import { useState } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import type { ConnectorConfigProps } from "../index";
@@ -47,21 +48,17 @@ export const ClickUpConfig: FC<ClickUpConfigProps> = ({
 		return (
 			<div className="space-y-6">
 				{/* OAuth Info */}
-				<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-					<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-						<Info className="size-4" />
-					</div>
-					<div className="text-xs sm:text-sm">
-						<p className="font-medium text-xs sm:text-sm">Connected via OAuth</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
+				<Alert>
+					<Info />
+					<AlertTitle>Connected via OAuth</AlertTitle>
+					<AlertDescription>
+						<p>
 							Workspace:{" "}
 							<code className="bg-muted px-1 py-0.5 rounded text-inherit">{workspaceName}</code>
 						</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-							To update your connection, reconnect this connector.
-						</p>
-					</div>
-				</div>
+						<p>To update your connection, reconnect this connector.</p>
+					</AlertDescription>
+				</Alert>
 			</div>
 		);
 	}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx
index c08011bb5e..6d119e8a22 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/composio-drive-config.tsx
@@ -14,6 +14,7 @@ import {
 import type { FC } from "react";
 import { useCallback, useState } from "react";
 import { DriveFolderTree, type SelectedFolder } from "@/components/connectors/drive-folder-tree";
+import { Button } from "@/components/ui/button";
 import { Label } from "@/components/ui/label";
 import {
 	Select,
@@ -196,14 +197,16 @@ export const ComposioDriveConfig: FC<ConnectorConfigProps> = ({ connector, onCon
 								>
 									<FolderClosed className="size-3.5 shrink-0 text-muted-foreground" />
 									<span className="flex-1 truncate">{folder.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFolder(folder.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${folder.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 							{selectedFiles.map((file) => (
@@ -214,14 +217,16 @@ export const ComposioDriveConfig: FC<ConnectorConfigProps> = ({ connector, onCon
 								>
 									{getFileIconFromName(file.name)}
 									<span className="flex-1 truncate">{file.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFile(file.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${file.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 						</div>
@@ -237,10 +242,11 @@ export const ComposioDriveConfig: FC<ConnectorConfigProps> = ({ connector, onCon
 
 				{isEditMode ? (
 					<div className="space-y-2">
-						<button
+						<Button
 							type="button"
+							variant="ghost"
 							onClick={() => setIsFolderTreeOpen((prev) => !prev)}
-							className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground transition-colors w-fit"
+							className="h-auto w-fit gap-2 px-0 py-0 text-xs font-normal text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 						>
 							Change Selection
 							{isFolderTreeOpen ? (
@@ -248,7 +254,7 @@ export const ComposioDriveConfig: FC<ConnectorConfigProps> = ({ connector, onCon
 							) : (
 								<ChevronRight className="size-4" />
 							)}
-						</button>
+						</Button>
 						{isFolderTreeOpen && (
 							<DriveFolderTree
 								fetchItems={fetchItems}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/confluence-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/confluence-config.tsx
index 040479211b..b098f36944 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/confluence-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/confluence-config.tsx
@@ -3,6 +3,7 @@
 import { Info, KeyRound } from "lucide-react";
 import type { FC } from "react";
 import { useState } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import type { ConnectorConfigProps } from "../index";
@@ -72,23 +73,17 @@ export const ConfluenceConfig: FC<ConfluenceConfigProps> = ({
 		return (
 			<div className="space-y-6">
 				{/* OAuth Info */}
-				<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-					<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-						<Info className="size-4" />
-					</div>
-					<div className="text-xs sm:text-sm">
-						<p className="font-medium text-xs sm:text-sm">Connected via OAuth</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-							This connector is authenticated using OAuth 2.0. Your Confluence instance is:
-						</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
+				<Alert>
+					<Info />
+					<AlertTitle>Connected via OAuth</AlertTitle>
+					<AlertDescription>
+						<p>This connector is authenticated using OAuth 2.0. Your Confluence instance is:</p>
+						<p>
 							<code className="bg-muted px-1 py-0.5 rounded text-inherit">{siteUrl}</code>
 						</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-							To update your connection, reconnect this connector.
-						</p>
-					</div>
-				</div>
+						<p>To update your connection, reconnect this connector.</p>
+					</AlertDescription>
+				</Alert>
 			</div>
 		);
 	}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/discord-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/discord-config.tsx
index f782a6f4da..93fc3ff817 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/discord-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/discord-config.tsx
@@ -2,6 +2,7 @@
 
 import { AlertCircle, CheckCircle2, Hash, Info, Megaphone, RefreshCw } from "lucide-react";
 import { type FC, useCallback, useEffect, useState } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
 import { connectorsApiService, type DiscordChannel } from "@/lib/apis/connectors-api.service";
@@ -53,8 +54,7 @@ export const DiscordConfig: FC<DiscordConfigProps> = ({ connector }) => {
 		return () => document.removeEventListener("visibilitychange", handleVisibilityChange);
 	}, [connector?.id, fetchChannels]);
 
-	// Separate channels by indexing capability
-	const readyToIndex = channels.filter((ch) => ch.can_index);
+	const accessible = channels.filter((ch) => ch.can_index);
 	const needsPermissions = channels.filter((ch) => !ch.can_index);
 
 	// Format last fetched time
@@ -74,17 +74,14 @@ export const DiscordConfig: FC<DiscordConfigProps> = ({ connector }) => {
 	return (
 		<div className="space-y-6">
 			{/* Info box */}
-			<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-				<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-					<Info className="size-4" />
-				</div>
-				<div className="text-xs sm:text-sm">
-					<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-						The bot needs &quot;Read Message History&quot; permission to index channels. Ask a
-						server admin to grant this permission for channels shown below.
-					</p>
-				</div>
-			</div>
+			<Alert>
+				<Info />
+				<AlertTitle>Grant Channel Permissions</AlertTitle>
+				<AlertDescription>
+					The bot needs &quot;Read Message History&quot; permission to access channels. Ask a server
+					admin to grant this permission for channels shown below.
+				</AlertDescription>
+			</Alert>
 
 			{/* Channels Section */}
 			<div className="space-y-3">
@@ -101,7 +98,7 @@ export const DiscordConfig: FC<DiscordConfigProps> = ({ connector }) => {
 							size="sm"
 							onClick={fetchChannels}
 							disabled={isLoading}
-							className="h-7 px-2.5 text-[11px] bg-slate-400/10 dark:bg-white/10 hover:bg-slate-400/20 dark:hover:bg-white/20 border-slate-400/20 dark:border-white/20"
+							className="h-7 px-2.5 text-[11px] bg-slate-400/10 dark:bg-white/10 hover:bg-accent hover:text-accent-foreground border-slate-400/20 dark:border-white/20"
 						>
 							<RefreshCw className={cn("mr-1.5 size-3", isLoading && "animate-spin")} />
 							Refresh
@@ -127,18 +124,18 @@ export const DiscordConfig: FC<DiscordConfigProps> = ({ connector }) => {
 					</div>
 				) : (
 					<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 overflow-hidden">
-						{/* Ready to index */}
-						{readyToIndex.length > 0 && (
+						{/* Accessible channels */}
+						{accessible.length > 0 && (
 							<div className={cn("p-3", needsPermissions.length > 0 && "border-b border-border")}>
 								<div className="flex items-center gap-2 mb-2">
 									<CheckCircle2 className="size-3.5 text-emerald-500" />
-									<span className="text-[11px] font-medium">Ready to index</span>
+									<span className="text-[11px] font-medium">Accessible</span>
 									<span className="text-[10px] text-muted-foreground">
-										{readyToIndex.length} {readyToIndex.length === 1 ? "channel" : "channels"}
+										{accessible.length} {accessible.length === 1 ? "channel" : "channels"}
 									</span>
 								</div>
 								<div className="flex flex-wrap gap-1.5">
-									{readyToIndex.map((channel) => (
+									{accessible.map((channel) => (
 										<ChannelPill key={channel.id} channel={channel} />
 									))}
 								</div>
@@ -150,7 +147,7 @@ export const DiscordConfig: FC<DiscordConfigProps> = ({ connector }) => {
 							<div className="p-3">
 								<div className="flex items-center gap-2 mb-2">
 									<AlertCircle className="size-3.5 text-amber-500" />
-									<span className="text-[11px] font-medium">Grant permissions to index</span>
+									<span className="text-[11px] font-medium">Needs permissions</span>
 									<span className="text-[10px] text-muted-foreground">
 										{needsPermissions.length}{" "}
 										{needsPermissions.length === 1 ? "channel" : "channels"}
@@ -176,7 +173,7 @@ interface ChannelPillProps {
 
 const ChannelPill: FC<ChannelPillProps> = ({ channel }) => {
 	return (
-		<div className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-[11px] font-medium bg-slate-400/10 dark:bg-white/10 hover:bg-slate-400/20 dark:hover:bg-white/20 transition-colors">
+		<div className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-[11px] font-medium bg-slate-400/10 dark:bg-white/10 hover:bg-accent hover:text-accent-foreground transition-colors">
 			{channel.type === "announcement" ? (
 				<Megaphone className="size-2.5 text-muted-foreground" />
 			) : (
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/dropbox-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/dropbox-config.tsx
index ce793a1a76..7696ce9a5b 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/dropbox-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/dropbox-config.tsx
@@ -14,6 +14,7 @@ import {
 import type { FC } from "react";
 import { useCallback, useEffect, useState } from "react";
 import { DriveFolderTree, type SelectedFolder } from "@/components/connectors/drive-folder-tree";
+import { Button } from "@/components/ui/button";
 import { Label } from "@/components/ui/label";
 import {
 	Select,
@@ -177,14 +178,16 @@ export const DropboxConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCha
 								>
 									<FolderClosed className="size-3.5 shrink-0 text-muted-foreground" />
 									<span className="flex-1 truncate">{folder.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFolder(folder.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${folder.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 							{selectedFiles.map((file) => (
@@ -195,14 +198,16 @@ export const DropboxConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCha
 								>
 									{getFileIconFromName(file.name)}
 									<span className="flex-1 truncate">{file.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFile(file.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${file.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 						</div>
@@ -217,10 +222,11 @@ export const DropboxConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCha
 
 				{isEditMode ? (
 					<div className="space-y-2">
-						<button
+						<Button
 							type="button"
+							variant="ghost"
 							onClick={() => setIsFolderTreeOpen(!isFolderTreeOpen)}
-							className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground transition-colors w-fit"
+							className="h-auto w-fit gap-2 px-0 py-0 text-xs font-normal text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 						>
 							Change Selection
 							{isFolderTreeOpen ? (
@@ -228,7 +234,7 @@ export const DropboxConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCha
 							) : (
 								<ChevronRight className="size-4" />
 							)}
-						</button>
+						</Button>
 						{isFolderTreeOpen && (
 							<DriveFolderTree
 								fetchItems={fetchItems}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx
index c73eed7885..4ece648e69 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/google-drive-config.tsx
@@ -92,20 +92,19 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
 	const [selectedFiles, setSelectedFiles] = useState<SelectedItem[]>(existingFiles);
 	const [indexingOptions, setIndexingOptions] = useState<IndexingOptions>(existingIndexingOptions);
 
-	const updateConfig = (
-		folders: SelectedItem[],
-		files: SelectedItem[],
-		options: IndexingOptions
-	) => {
-		if (onConfigChange) {
-			onConfigChange({
-				...connector.config,
-				selected_folders: folders,
-				selected_files: files,
-				indexing_options: options,
-			});
-		}
-	};
+	const updateConfig = useCallback(
+		(folders: SelectedItem[], files: SelectedItem[], options: IndexingOptions) => {
+			if (onConfigChange) {
+				onConfigChange({
+					...connector.config,
+					selected_folders: folders,
+					selected_files: files,
+					indexing_options: options,
+				});
+			}
+		},
+		[connector.config, onConfigChange]
+	);
 
 	const handlePicked = useCallback(
 		(result: PickerResult) => {
@@ -115,8 +114,7 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
 			setSelectedFiles(files);
 			updateConfig(folders, files, indexingOptions);
 		},
-		// eslint-disable-next-line react-hooks/exhaustive-deps
-		[indexingOptions, connector.config]
+		[indexingOptions, updateConfig]
 	);
 
 	const {
@@ -188,14 +186,16 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
 								>
 									<FolderClosed className="size-3.5 shrink-0 text-muted-foreground" />
 									<span className="flex-1 truncate">{folder.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFolder(folder.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${folder.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 							{selectedFiles.map((file) => (
@@ -206,14 +206,16 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
 								>
 									{getFileIconFromName(file.name)}
 									<span className="flex-1 truncate">{file.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFile(file.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${file.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 						</div>
@@ -225,7 +227,7 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
 					variant="outline"
 					onClick={openPicker}
 					disabled={pickerLoading || isAuthExpired}
-					className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
+					className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-accent hover:text-accent-foreground text-xs sm:text-sm h-8 sm:h-9"
 				>
 					{pickerLoading && <Spinner size="xs" className="mr-1.5" />}
 					{totalSelected > 0 ? "Change Selection" : "Select from Google Drive"}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/jira-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/jira-config.tsx
index c43bd1524d..bbabb195af 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/jira-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/jira-config.tsx
@@ -3,6 +3,7 @@
 import { Info, KeyRound } from "lucide-react";
 import type { FC } from "react";
 import { useState } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import type { ConnectorConfigProps } from "../index";
@@ -65,23 +66,17 @@ export const JiraConfig: FC<JiraConfigProps> = ({ connector, onConfigChange, onN
 		return (
 			<div className="space-y-6">
 				{/* OAuth Info */}
-				<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-					<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-						<Info className="size-4" />
-					</div>
-					<div className="text-xs sm:text-sm">
-						<p className="font-medium text-xs sm:text-sm">Connected via OAuth</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-							This connector is authenticated using OAuth 2.0. Your Jira instance is:
-						</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
+				<Alert>
+					<Info />
+					<AlertTitle>Connected via OAuth</AlertTitle>
+					<AlertDescription>
+						<p>This connector is authenticated using OAuth 2.0. Your Jira instance is:</p>
+						<p>
 							<code className="bg-muted px-1 py-0.5 rounded text-inherit">{baseUrl}</code>
 						</p>
-						<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-							To update your connection, reconnect this connector.
-						</p>
-					</div>
-				</div>
+						<p>To update your connection, reconnect this connector.</p>
+					</AlertDescription>
+				</Alert>
 			</div>
 		);
 	}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-config.tsx
index ca997a9ba9..a03826d3a4 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-config.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { CheckCircle2, ChevronDown, ChevronUp, Server, XCircle } from "lucide-react";
+import { CheckCircle2, ChevronDown, ChevronUp, Loader2, Server, XCircle } from "lucide-react";
 import type { FC } from "react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
@@ -215,9 +215,16 @@ export const MCPConfig: FC<MCPConfigProps> = ({ connector, onConfigChange, onNam
 							onClick={handleTestConnection}
 							disabled={isTesting}
 							variant="secondary"
-							className="w-full h-8 text-[13px] px-3 rounded-lg font-medium bg-white text-slate-700 hover:bg-slate-50 border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground dark:hover:bg-secondary/80"
+							className="w-full h-8 text-[13px] px-3 font-medium bg-white text-slate-700 hover:bg-accent hover:text-accent-foreground border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground"
 						>
-							{isTesting ? "Testing Connection" : "Test Connection"}
+							{isTesting ? (
+								<>
+									<Loader2 className="h-3.5 w-3.5 animate-spin" />
+									Testing Connection...
+								</>
+							) : (
+								"Test Connection"
+							)}
 						</Button>
 					</div>
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-service-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-service-config.tsx
new file mode 100644
index 0000000000..cfa2cde384
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-service-config.tsx
@@ -0,0 +1,31 @@
+"use client";
+
+import { CheckCircle2 } from "lucide-react";
+import type { FC } from "react";
+import type { ConnectorConfigProps } from "../index";
+import { MCPTrustedTools } from "./mcp-trusted-tools";
+
+export const MCPServiceConfig: FC<ConnectorConfigProps> = ({ connector }) => {
+	const serviceName = connector.config?.mcp_service as string | undefined;
+	const displayName = serviceName
+		? serviceName.charAt(0).toUpperCase() + serviceName.slice(1)
+		: "this service";
+
+	return (
+		<div className="space-y-6">
+			<div className="rounded-xl border border-border bg-emerald-500/5 p-4 flex items-start gap-3">
+				<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-emerald-500/10 shrink-0 mt-0.5">
+					<CheckCircle2 className="size-4 text-emerald-500" />
+				</div>
+				<div className="text-xs sm:text-sm">
+					<p className="font-medium text-xs sm:text-sm">Connected</p>
+					<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
+						Your agent can search, read, and take actions in {displayName}.
+					</p>
+				</div>
+			</div>
+
+			{connector.id > 0 && <MCPTrustedTools connector={connector} />}
+		</div>
+	);
+};
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-trusted-tools.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-trusted-tools.tsx
new file mode 100644
index 0000000000..ed01511cac
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/mcp-trusted-tools.tsx
@@ -0,0 +1,89 @@
+"use client";
+
+import { ShieldCheck, Trash2 } from "lucide-react";
+import type { FC } from "react";
+import { useState } from "react";
+import { toast } from "sonner";
+import { Button } from "@/components/ui/button";
+import type { SearchSourceConnector } from "@/contracts/types/connector.types";
+import { connectorsApiService } from "@/lib/apis/connectors-api.service";
+
+interface MCPTrustedToolsProps {
+	connector: SearchSourceConnector;
+}
+
+/** Audit + revoke surface for tools promoted via in-chat "Always Allow". */
+export const MCPTrustedTools: FC<MCPTrustedToolsProps> = ({ connector }) => {
+	const trustedTools = readTrustedTools(connector.config);
+	const [pending, setPending] = useState<Set<string>>(new Set());
+
+	const handleRevoke = async (toolName: string) => {
+		setPending((prev) => new Set(prev).add(toolName));
+		try {
+			await connectorsApiService.untrustMCPTool(connector.id, toolName);
+			toast.success(`Removed ${toolName} from trusted tools`);
+		} catch {
+			toast.error(`Failed to remove ${toolName} from trusted tools`);
+		} finally {
+			setPending((prev) => {
+				const next = new Set(prev);
+				next.delete(toolName);
+				return next;
+			});
+		}
+	};
+
+	return (
+		<div className="space-y-4">
+			<h3 className="font-medium text-sm sm:text-base flex items-center gap-2">
+				<ShieldCheck className="h-4 w-4" />
+				Trusted Tools
+			</h3>
+
+			<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-4">
+				<p className="text-[10px] sm:text-xs text-muted-foreground">
+					Tools listed here skip the approval prompt during chat. Trust is granted by clicking
+					"Always Allow" on an approval card; revoke it here to require approval again.
+				</p>
+
+				{trustedTools.length === 0 ? (
+					<p className="text-xs text-muted-foreground/70 italic">
+						No trusted tools yet for this connector.
+					</p>
+				) : (
+					<ul className="space-y-1">
+						{trustedTools.map((toolName) => {
+							const isPending = pending.has(toolName);
+							return (
+								<li
+									key={toolName}
+									className="flex items-center justify-between gap-3 rounded-lg px-3 py-2 hover:bg-muted/40 transition-colors"
+								>
+									<span className="text-xs font-mono break-all">{toolName}</span>
+									<Button
+										type="button"
+										variant="ghost"
+										size="sm"
+										className="h-7 px-2 text-muted-foreground hover:text-destructive shrink-0"
+										onClick={() => handleRevoke(toolName)}
+										disabled={isPending}
+										aria-label={`Revoke trust for ${toolName}`}
+									>
+										<Trash2 className="h-3.5 w-3.5" />
+										<span className="ml-1 hidden sm:inline">Revoke</span>
+									</Button>
+								</li>
+							);
+						})}
+					</ul>
+				)}
+			</div>
+		</div>
+	);
+};
+
+function readTrustedTools(config: Record<string, unknown> | undefined | null): string[] {
+	const raw = config?.trusted_tools;
+	if (!Array.isArray(raw)) return [];
+	return raw.filter((item): item is string => typeof item === "string");
+}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx
index 3da1d6e7e2..d4f18cb3a2 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx
@@ -1,167 +1,160 @@
 "use client";
 
-import type { FC } from "react";
-import { useState } from "react";
-import { Input } from "@/components/ui/input";
-import { Label } from "@/components/ui/label";
-import { Switch } from "@/components/ui/switch";
+import { AlertTriangle, Info } from "lucide-react";
+import { type FC, useEffect, useMemo, useState } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { connectorsApiService, type ObsidianStats } from "@/lib/apis/connectors-api.service";
 import type { ConnectorConfigProps } from "../index";
 
-export interface ObsidianConfigProps extends ConnectorConfigProps {
-	onNameChange?: (name: string) => void;
-}
-
-export const ObsidianConfig: FC<ObsidianConfigProps> = ({
-	connector,
-	onConfigChange,
-	onNameChange,
-}) => {
-	const [vaultPath, setVaultPath] = useState<string>(
-		(connector.config?.vault_path as string) || ""
-	);
-	const [vaultName, setVaultName] = useState<string>(
-		(connector.config?.vault_name as string) || ""
-	);
-	const [excludeFolders, setExcludeFolders] = useState<string>(() => {
-		const folders = connector.config?.exclude_folders;
-		if (Array.isArray(folders)) {
-			return folders.join(", ");
-		}
-		return (folders as string) || ".obsidian, .trash";
-	});
-	const [includeAttachments, setIncludeAttachments] = useState<boolean>(
-		(connector.config?.include_attachments as boolean) || false
-	);
-	const [name, setName] = useState<string>(connector.name || "");
+const OBSIDIAN_SETUP_DOCS_URL = "/docs/connectors/obsidian";
 
-	const handleVaultPathChange = (value: string) => {
-		setVaultPath(value);
-		if (onConfigChange) {
-			onConfigChange({
-				...connector.config,
-				vault_path: value,
-			});
-		}
-	};
-
-	const handleVaultNameChange = (value: string) => {
-		setVaultName(value);
-		if (onConfigChange) {
-			onConfigChange({
-				...connector.config,
-				vault_name: value,
-			});
-		}
-	};
-
-	const handleExcludeFoldersChange = (value: string) => {
-		setExcludeFolders(value);
-		const foldersArray = value
-			.split(",")
-			.map((f) => f.trim())
-			.filter(Boolean);
-		if (onConfigChange) {
-			onConfigChange({
-				...connector.config,
-				exclude_folders: foldersArray,
-			});
-		}
-	};
+function formatTimestamp(value: unknown): string {
+	if (typeof value !== "string" || !value) return "—";
+	const d = new Date(value);
+	if (Number.isNaN(d.getTime())) return value;
+	return d.toLocaleString();
+}
 
-	const handleIncludeAttachmentsChange = (value: boolean) => {
-		setIncludeAttachments(value);
-		if (onConfigChange) {
-			onConfigChange({
-				...connector.config,
-				include_attachments: value,
-			});
-		}
-	};
+/**
+ * Obsidian connector config view.
+ *
+ * Read-only on purpose: the plugin owns vault identity, so the connector's
+ * display name is auto-derived from `payload.vault_name` server-side on
+ * every `/connect` (see `obsidian_plugin_routes.obsidian_connect`). The
+ * web UI doesn't expose a Name input or a Save button for Obsidian (the
+ * latter is suppressed in `connector-edit-view.tsx`).
+ *
+ * Renders one of three modes depending on the connector's `config`:
+ *
+ * 1. **Plugin connector** (`config.source === "plugin"`) — read-only stats
+ *    panel showing what the plugin most recently reported.
+ * 2. **Legacy server-path connector** (`config.legacy === true`, set by the
+ *    migration) — migration warning + docs link + explicit disconnect data-loss
+ *    warning so users move to the plugin flow safely.
+ * 3. **Unknown** — fallback for rows that escaped migration; suggests a
+ *    clean re-install.
+ */
+export const ObsidianConfig: FC<ConnectorConfigProps> = ({ connector }) => {
+	const config = (connector.config ?? {}) as Record<string, unknown>;
+	const isLegacy = config.legacy === true;
+	const isPlugin = config.source === "plugin";
 
-	const handleNameChange = (value: string) => {
-		setName(value);
-		if (onNameChange) {
-			onNameChange(value);
-		}
-	};
+	if (isLegacy) return <LegacyBanner />;
+	if (isPlugin) return <PluginStats config={config} />;
+	return <UnknownConnectorState />;
+};
 
+const LegacyBanner: FC = () => {
 	return (
 		<div className="space-y-6">
-			{/* Connector Name */}
-			<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
-				<div className="space-y-2">
-					<Label className="text-xs sm:text-sm">Connector Name</Label>
-					<Input
-						value={name}
-						onChange={(e) => handleNameChange(e.target.value)}
-						placeholder="My Obsidian Vault"
-						className="border-slate-400/20 focus-visible:border-slate-400/40"
-					/>
-					<p className="text-[10px] sm:text-xs text-muted-foreground">
-						A friendly name to identify this connector.
-					</p>
-				</div>
+			<Alert variant="warning">
+				<AlertTriangle />
+				<AlertTitle>Sync stopped, install the plugin to migrate</AlertTitle>
+				<AlertDescription>
+					This Obsidian connector used the legacy server-path scanner, which has been removed. The
+					notes already indexed remain searchable, but they no longer reflect changes made in your
+					vault.
+				</AlertDescription>
+			</Alert>
+
+			<div className="rounded-xl border border-border bg-slate-400/5 p-3 sm:p-6 dark:bg-white/5">
+				<h3 className="mb-3 text-sm font-medium sm:text-base">Migration required</h3>
+				<p className="mb-3 text-[11px] leading-relaxed text-muted-foreground sm:text-xs">
+					Follow the{" "}
+					<a
+						href={OBSIDIAN_SETUP_DOCS_URL}
+						className="font-medium text-primary underline underline-offset-4 hover:text-primary/80"
+					>
+						Obsidian setup guide
+					</a>{" "}
+					to reconnect this vault through the plugin.
+				</p>
+				<p className="text-[11px] leading-relaxed text-amber-600 dark:text-amber-400 sm:text-xs">
+					Heads up: Disconnect also deletes every document this connector previously indexed.
+				</p>
 			</div>
+		</div>
+	);
+};
 
-			{/* Configuration */}
-			<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
-				<div className="space-y-1 sm:space-y-2">
-					<h3 className="font-medium text-sm sm:text-base flex items-center gap-2">
-						Vault Configuration
-					</h3>
-				</div>
+const PluginStats: FC<{ config: Record<string, unknown> }> = ({ config }) => {
+	const vaultId = typeof config.vault_id === "string" ? config.vault_id : null;
+	const [stats, setStats] = useState<ObsidianStats | null>(null);
+	const [statsError, setStatsError] = useState(false);
 
-				<div className="space-y-4">
-					<div className="space-y-2">
-						<Label className="text-xs sm:text-sm">Vault Path</Label>
-						<Input
-							value={vaultPath}
-							onChange={(e) => handleVaultPathChange(e.target.value)}
-							placeholder="/path/to/your/obsidian/vault"
-							className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono"
-						/>
-						<p className="text-[10px] sm:text-xs text-muted-foreground">
-							The absolute path to your Obsidian vault on the server.
-						</p>
-					</div>
+	useEffect(() => {
+		if (!vaultId) return;
+		let cancelled = false;
+		setStats(null);
+		setStatsError(false);
+		connectorsApiService
+			.getObsidianStats(vaultId)
+			.then((result) => {
+				if (!cancelled) setStats(result);
+			})
+			.catch((err) => {
+				if (!cancelled) {
+					console.error("Failed to fetch Obsidian stats", err);
+					setStatsError(true);
+				}
+			});
+		return () => {
+			cancelled = true;
+		};
+	}, [vaultId]);
 
-					<div className="space-y-2">
-						<Label className="text-xs sm:text-sm">Vault Name</Label>
-						<Input
-							value={vaultName}
-							onChange={(e) => handleVaultNameChange(e.target.value)}
-							placeholder="My Knowledge Base"
-							className="border-slate-400/20 focus-visible:border-slate-400/40"
-						/>
-						<p className="text-[10px] sm:text-xs text-muted-foreground">
-							A display name for your vault in search results.
-						</p>
-					</div>
+	const tileRows = useMemo(() => {
+		const placeholder = statsError ? "—" : stats ? null : "…";
+		return [
+			{ label: "Vault name", value: (config.vault_name as string) || "—" },
+			{
+				label: "Last sync",
+				value: placeholder ?? formatTimestamp(stats?.last_sync_at ?? null),
+			},
+			{
+				label: "Files synced",
+				value:
+					placeholder ??
+					(typeof stats?.files_synced === "number" ? stats.files_synced.toLocaleString() : "—"),
+			},
+		];
+	}, [config.vault_name, stats, statsError]);
 
-					<div className="space-y-2">
-						<Label className="text-xs sm:text-sm">Exclude Folders</Label>
-						<Input
-							value={excludeFolders}
-							onChange={(e) => handleExcludeFoldersChange(e.target.value)}
-							placeholder=".obsidian, .trash, templates"
-							className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono"
-						/>
-						<p className="text-[10px] sm:text-xs text-muted-foreground">
-							Comma-separated list of folder names to exclude from indexing.
-						</p>
-					</div>
+	return (
+		<div className="space-y-4">
+			<Alert>
+				<Info />
+				<AlertTitle>Plugin connected</AlertTitle>
+				<AlertDescription>
+					Your notes stay synced automatically. To stop syncing, disable or uninstall the plugin in
+					Obsidian, or delete this connector.
+				</AlertDescription>
+			</Alert>
 
-					<div className="flex items-center justify-between rounded-lg border border-slate-400/20 p-3">
-						<div className="space-y-0.5">
-							<Label className="text-xs sm:text-sm">Include Attachments</Label>
-							<p className="text-[10px] sm:text-xs text-muted-foreground">
-								Index attachment folders and embedded files
-							</p>
+			<div className="rounded-xl bg-slate-400/5 p-3 sm:p-6 dark:bg-white/5">
+				<h3 className="mb-3 text-sm font-medium sm:text-base">Vault Status</h3>
+				<dl className="grid grid-cols-1 gap-3 sm:grid-cols-2">
+					{tileRows.map((stat) => (
+						<div key={stat.label} className="rounded-lg bg-background/50 p-3">
+							<dt className="text-xs tracking-wide text-muted-foreground sm:text-sm">
+								{stat.label}
+							</dt>
+							<dd className="mt-1 truncate text-xs font-medium sm:text-sm">{stat.value}</dd>
 						</div>
-						<Switch checked={includeAttachments} onCheckedChange={handleIncludeAttachmentsChange} />
-					</div>
-				</div>
+					))}
+				</dl>
 			</div>
 		</div>
 	);
 };
+
+const UnknownConnectorState: FC = () => (
+	<Alert>
+		<Info />
+		<AlertTitle>Unrecognized config</AlertTitle>
+		<AlertDescription>
+			This connector is missing plugin metadata and may predate the Obsidian plugin migration. You
+			can safely delete it and reinstall the SurfSense Obsidian plugin to resume syncing.
+		</AlertDescription>
+	</Alert>
+);
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/onedrive-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/onedrive-config.tsx
index b835dbcec8..5ecaee0458 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/onedrive-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/onedrive-config.tsx
@@ -14,6 +14,7 @@ import {
 import type { FC } from "react";
 import { useCallback, useEffect, useState } from "react";
 import { DriveFolderTree, type SelectedFolder } from "@/components/connectors/drive-folder-tree";
+import { Button } from "@/components/ui/button";
 import { Label } from "@/components/ui/label";
 import {
 	Select,
@@ -178,14 +179,16 @@ export const OneDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCh
 								>
 									<FolderClosed className="size-3.5 shrink-0 text-muted-foreground" />
 									<span className="flex-1 truncate">{folder.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFolder(folder.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${folder.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 							{selectedFiles.map((file) => (
@@ -196,14 +199,16 @@ export const OneDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCh
 								>
 									{getFileIconFromName(file.name)}
 									<span className="flex-1 truncate">{file.name}</span>
-									<button
+									<Button
 										type="button"
+										variant="ghost"
+										size="icon"
 										onClick={() => handleRemoveFile(file.id)}
-										className="shrink-0 p-0.5 hover:bg-muted-foreground/20 rounded transition-colors"
+										className="size-5 shrink-0 rounded p-0 hover:bg-accent hover:text-accent-foreground"
 										aria-label={`Remove ${file.name}`}
 									>
 										<X className="size-3.5" />
-									</button>
+									</Button>
 								</div>
 							))}
 						</div>
@@ -218,10 +223,11 @@ export const OneDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCh
 
 				{isEditMode ? (
 					<div className="space-y-2">
-						<button
+						<Button
 							type="button"
+							variant="ghost"
 							onClick={() => setIsFolderTreeOpen((prev) => !prev)}
-							className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground transition-colors w-fit"
+							className="h-auto w-fit gap-2 px-0 py-0 text-xs font-normal text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 						>
 							Change Selection
 							{isFolderTreeOpen ? (
@@ -229,7 +235,7 @@ export const OneDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigCh
 							) : (
 								<ChevronRight className="size-4" />
 							)}
-						</button>
+						</Button>
 						{isFolderTreeOpen && (
 							<DriveFolderTree
 								fetchItems={fetchItems}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/slack-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/slack-config.tsx
index ff01ac96a5..7f30d751ac 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/slack-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/slack-config.tsx
@@ -2,6 +2,7 @@
 
 import { AlertCircle, CheckCircle2, Hash, Info, Lock, RefreshCw } from "lucide-react";
 import { type FC, useCallback, useEffect, useState } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
 import { connectorsApiService, type SlackChannel } from "@/lib/apis/connectors-api.service";
@@ -74,20 +75,20 @@ export const SlackConfig: FC<SlackConfigProps> = ({ connector }) => {
 	return (
 		<div className="space-y-6">
 			{/* Info box */}
-			<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-				<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-					<Info className="size-4" />
-				</div>
-				<div className="text-xs sm:text-sm">
-					<p className="font-medium text-xs sm:text-sm">Add Bot to Channels</p>
-					<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
+			<Alert>
+				<Info />
+				<AlertTitle>Add Bot to Channels</AlertTitle>
+				<AlertDescription>
+					<p>
 						Before indexing, add the SurfSense bot to each channel you want to index. The bot can
 						only access messages from channels it's been added to. Type{" "}
-						<code className="bg-muted px-1 py-0.5 rounded text-[9px]">/invite @SurfSense</code> in
-						any channel to add it.
+						<code className="rounded bg-popover px-1 py-0.5 text-[9px] text-popover-foreground">
+							/invite @SurfSense
+						</code>{" "}
+						in any channel to add it.
 					</p>
-				</div>
-			</div>
+				</AlertDescription>
+			</Alert>
 
 			{/* Channels Section */}
 			<div className="space-y-3">
@@ -104,7 +105,7 @@ export const SlackConfig: FC<SlackConfigProps> = ({ connector }) => {
 							size="sm"
 							onClick={fetchChannels}
 							disabled={isLoading}
-							className="h-7 px-2.5 text-[11px] bg-slate-400/10 dark:bg-white/10 hover:bg-slate-400/20 dark:hover:bg-white/20 border-slate-400/20 dark:border-white/20"
+							className="h-7 px-2.5 text-[11px] bg-slate-400/10 dark:bg-white/10 hover:bg-accent hover:text-accent-foreground border-slate-400/20 dark:border-white/20"
 						>
 							<RefreshCw className={cn("mr-1.5 size-3", isLoading && "animate-spin")} />
 							Refresh
@@ -178,7 +179,7 @@ interface ChannelPillProps {
 
 const ChannelPill: FC<ChannelPillProps> = ({ channel }) => {
 	return (
-		<div className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-[11px] font-medium bg-slate-400/10 dark:bg-white/10 hover:bg-slate-400/20 dark:hover:bg-white/20 transition-colors">
+		<div className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-[11px] font-medium bg-slate-400/10 dark:bg-white/10 hover:bg-accent hover:text-accent-foreground transition-colors">
 			{channel.is_private ? (
 				<Lock className="size-2.5 text-muted-foreground" />
 			) : (
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/teams-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/teams-config.tsx
index ac08a6c03f..a1d387e909 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/teams-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/teams-config.tsx
@@ -2,6 +2,7 @@
 
 import { Info } from "lucide-react";
 import type { FC } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import type { ConnectorConfigProps } from "../index";
 
 export interface TeamsConfigProps extends ConnectorConfigProps {
@@ -11,19 +12,17 @@ export interface TeamsConfigProps extends ConnectorConfigProps {
 export const TeamsConfig: FC<TeamsConfigProps> = () => {
 	return (
 		<div className="space-y-6">
-			<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-				<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-					<Info className="size-4" />
-				</div>
-				<div className="text-xs sm:text-sm">
-					<p className="font-medium text-xs sm:text-sm">Microsoft Teams Access</p>
-					<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-						SurfSense will index messages from Teams channels that you have access to. The app can
-						only read messages from teams and channels where you are a member. Make sure you're a
-						member of the teams you want to index before connecting.
+			<Alert>
+				<Info />
+				<AlertTitle>Microsoft Teams Access</AlertTitle>
+				<AlertDescription>
+					<p>
+						Your agent can search and read messages from Teams channels you have access to, and send
+						messages on your behalf. Make sure you&#39;re a member of the teams you want to interact
+						with.
 					</p>
-				</div>
-			</div>
+				</AlertDescription>
+			</Alert>
 		</div>
 	);
 };
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/webcrawler-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/webcrawler-config.tsx
index a01cde15ff..b7fbae91c2 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/webcrawler-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/webcrawler-config.tsx
@@ -52,13 +52,13 @@ export const WebcrawlerConfig: FC<ConnectorConfigProps> = ({ connector, onConfig
 			</div>
 
 			{/* Chat tip */}
-			<div className="flex items-start gap-3 rounded-lg border border-blue-200/50 bg-blue-50/50 dark:border-blue-500/20 dark:bg-blue-950/20 p-3 text-xs sm:text-sm">
-				<Info className="size-4 mt-0.5 shrink-0 text-blue-600 dark:text-blue-400" />
-				<p className="text-muted-foreground">
+			<Alert>
+				<Info />
+				<AlertDescription>
 					Want a quick answer from a webpage without indexing it? Just paste the URL directly into
 					the chat instead.
-				</p>
-			</div>
+				</AlertDescription>
+			</Alert>
 
 			{/* API Key Field */}
 			<div className="space-y-2">
@@ -79,7 +79,7 @@ export const WebcrawlerConfig: FC<ConnectorConfigProps> = ({ connector, onConfig
 						variant="ghost"
 						size="sm"
 						onClick={() => setShowApiKey((prev) => !prev)}
-						className="absolute right-1 top-1/2 -translate-y-1/2 h-7 px-2 text-xs text-muted-foreground hover:text-foreground"
+						className="absolute right-1 top-1/2 -translate-y-1/2 h-7 px-2 text-xs text-muted-foreground hover:text-accent-foreground"
 					>
 						{showApiKey ? "Hide" : "Show"}
 					</Button>
@@ -116,9 +116,9 @@ export const WebcrawlerConfig: FC<ConnectorConfigProps> = ({ connector, onConfig
 			</div>
 
 			{/* Info Alert */}
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3">
-				<Info className="size-4 shrink-0" />
-				<AlertDescription className="text-[10px] sm:text-xs">
+			<Alert>
+				<Info />
+				<AlertDescription>
 					Configuration is saved when you start indexing. You can update these settings anytime from
 					the connector management page.
 				</AlertDescription>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
index 8a0ef5ae12..7eaea271e4 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
@@ -90,14 +90,15 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
 		<div className="flex-1 flex flex-col min-h-0 overflow-hidden">
 			{/* Header */}
 			<div className="flex-shrink-0 px-6 sm:px-12 pt-8 sm:pt-10">
-				<button
+				<Button
+					variant="ghost"
 					type="button"
 					onClick={onBack}
-					className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground mb-6 w-fit"
+					className="mb-6 h-auto w-fit justify-start gap-2 px-0 py-0 text-xs text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 				>
-					<ArrowLeft className="size-4" />
+					<ArrowLeft data-icon="inline-start" />
 					Back to connectors
-				</button>
+				</Button>
 
 				<div className="flex items-center gap-4 mb-6">
 					<div className="flex h-14 w-14 items-center justify-center rounded-xl border border-slate-400/30">
@@ -111,7 +112,9 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
 								: getConnectorTypeDisplay(connectorType)}
 						</h2>
 						<p className="text-xs sm:text-base text-muted-foreground mt-1">
-							Enter your connection details
+							{connectorType === "OBSIDIAN_CONNECTOR"
+								? "Follow the plugin setup steps below"
+								: "Enter your connection details"}
 						</p>
 					</div>
 				</div>
@@ -131,7 +134,7 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
 			</div>
 
 			{/* Fixed Footer - Action buttons */}
-			<div className="flex-shrink-0 flex items-center justify-between px-6 sm:px-12 py-6 bg-muted border-t border-border">
+			<div className="flex-shrink-0 flex items-center justify-between px-6 sm:px-12 py-6 bg-popover">
 				<Button
 					variant="ghost"
 					onClick={onBack}
@@ -149,7 +152,9 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
 					<span className={isSubmitting ? "opacity-0" : ""}>
 						{connectorType === "MCP_CONNECTOR"
 							? "Connect"
-							: `Connect ${getConnectorTypeDisplay(connectorType)}`}
+							: connectorType === "OBSIDIAN_CONNECTOR"
+								? "Done"
+								: `Connect ${getConnectorTypeDisplay(connectorType)}`}
 					</span>
 					{isSubmitting && <Spinner size="sm" className="absolute" />}
 				</Button>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
index 7308e1e263..b64f974143 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
@@ -1,10 +1,11 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import { ArrowLeft, Info, RefreshCw, Trash2 } from "lucide-react";
+import { ArrowLeft, Info, RefreshCw } from "lucide-react";
 import { type FC, useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { toast } from "sonner";
 import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
+import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
 import { EnumConnectorName } from "@/contracts/enums/connector";
@@ -15,21 +16,19 @@ import { cn } from "@/lib/utils";
 import { DateRangeSelector } from "../../components/date-range-selector";
 import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
 import { SummaryConfig } from "../../components/summary-config";
+import { VisionLLMConfig } from "../../components/vision-llm-config";
+import { getReauthEndpoint, LIVE_CONNECTOR_TYPES } from "../../constants/connector-constants";
 import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
+import { MCPServiceConfig } from "../components/mcp-service-config";
 import { getConnectorConfigComponent } from "../index";
 
-const REAUTH_ENDPOINTS: Partial<Record<string, string>> = {
-	[EnumConnectorName.LINEAR_CONNECTOR]: "/api/v1/auth/linear/connector/reauth",
-	[EnumConnectorName.NOTION_CONNECTOR]: "/api/v1/auth/notion/connector/reauth",
-	[EnumConnectorName.GOOGLE_DRIVE_CONNECTOR]: "/api/v1/auth/google/drive/connector/reauth",
-	[EnumConnectorName.GOOGLE_GMAIL_CONNECTOR]: "/api/v1/auth/google/gmail/connector/reauth",
-	[EnumConnectorName.GOOGLE_CALENDAR_CONNECTOR]: "/api/v1/auth/google/calendar/connector/reauth",
-	[EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
-	[EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
-	[EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
-	[EnumConnectorName.ONEDRIVE_CONNECTOR]: "/api/v1/auth/onedrive/connector/reauth",
-	[EnumConnectorName.DROPBOX_CONNECTOR]: "/api/v1/auth/dropbox/connector/reauth",
-};
+const VISION_LLM_CONNECTOR_TYPES = new Set<SearchSourceConnector["connector_type"]>([
+	EnumConnectorName.GOOGLE_DRIVE_CONNECTOR,
+	EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
+	EnumConnectorName.DROPBOX_CONNECTOR,
+	EnumConnectorName.ONEDRIVE_CONNECTOR,
+	EnumConnectorName.OBSIDIAN_CONNECTOR,
+]);
 
 interface ConnectorEditViewProps {
 	connector: SearchSourceConnector;
@@ -38,6 +37,7 @@ interface ConnectorEditViewProps {
 	periodicEnabled: boolean;
 	frequencyMinutes: string;
 	enableSummary: boolean;
+	enableVisionLlm: boolean;
 	isSaving: boolean;
 	isDisconnecting: boolean;
 	isIndexing?: boolean;
@@ -47,6 +47,7 @@ interface ConnectorEditViewProps {
 	onPeriodicEnabledChange: (enabled: boolean) => void;
 	onFrequencyChange: (frequency: string) => void;
 	onEnableSummaryChange: (enabled: boolean) => void;
+	onEnableVisionLlmChange: (enabled: boolean) => void;
 	onSave: () => void;
 	onDisconnect: () => void;
 	onBack: () => void;
@@ -62,6 +63,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 	periodicEnabled,
 	frequencyMinutes,
 	enableSummary,
+	enableVisionLlm,
 	isSaving,
 	isDisconnecting,
 	isIndexing = false,
@@ -71,6 +73,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 	onPeriodicEnabledChange,
 	onFrequencyChange,
 	onEnableSummaryChange,
+	onEnableVisionLlmChange,
 	onSave,
 	onDisconnect,
 	onBack,
@@ -80,8 +83,11 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 }) => {
 	const searchSpaceIdAtom = useAtomValue(activeSearchSpaceIdAtom);
 	const isAuthExpired = connector.config?.auth_expired === true;
-	const reauthEndpoint = REAUTH_ENDPOINTS[connector.connector_type];
+	const reauthEndpoint = getReauthEndpoint(connector);
 	const [reauthing, setReauthing] = useState(false);
+	const supportsVisionLlm = VISION_LLM_CONNECTOR_TYPES.has(connector.connector_type);
+	const showsAiToggles =
+		connector.is_indexable || connector.connector_type === EnumConnectorName.OBSIDIAN_CONNECTOR;
 
 	const handleReauth = useCallback(async () => {
 		const spaceId = searchSpaceId ?? searchSpaceIdAtom;
@@ -113,11 +119,14 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 		}
 	}, [searchSpaceId, searchSpaceIdAtom, reauthEndpoint, connector.id]);
 
-	// Get connector-specific config component
-	const ConnectorConfigComponent = useMemo(
-		() => getConnectorConfigComponent(connector.connector_type),
-		[connector.connector_type]
-	);
+	const isMCPBacked = Boolean(connector.config?.server_config);
+	const isLive = isMCPBacked || LIVE_CONNECTOR_TYPES.has(connector.connector_type);
+
+	// Get connector-specific config component (MCP-backed connectors use a generic view)
+	const ConnectorConfigComponent = useMemo(() => {
+		if (isMCPBacked) return MCPServiceConfig;
+		return getConnectorConfigComponent(connector.connector_type);
+	}, [connector.connector_type, isMCPBacked]);
 	const [isScrolled, setIsScrolled] = useState(false);
 	const [hasMoreContent, setHasMoreContent] = useState(false);
 	const [showDisconnectConfirm, setShowDisconnectConfirm] = useState(false);
@@ -198,14 +207,15 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 				)}
 			>
 				{/* Back button */}
-				<button
+				<Button
+					variant="ghost"
 					type="button"
 					onClick={onBack}
-					className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground mb-6 w-fit"
+					className="mb-6 h-auto w-fit justify-start gap-2 px-0 py-0 text-xs text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 				>
-					<ArrowLeft className="size-4" />
+					<ArrowLeft data-icon="inline-start" />
 					Back to connectors
-				</button>
+				</Button>
 
 				{/* Connector header */}
 				<div className="flex flex-col sm:flex-row items-start sm:items-center gap-4 mb-6">
@@ -218,18 +228,20 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 								{getConnectorDisplayName(connector.name)}
 							</h2>
 							<p className="text-xs sm:text-base text-muted-foreground mt-1">
-								Manage your connector settings and sync configuration
+								{isLive
+									? "Manage your connected account"
+									: "Manage your connector settings and sync configuration"}
 							</p>
 						</div>
 					</div>
-					{/* Quick Index Button - hidden when auth is expired */}
-					{connector.is_indexable && onQuickIndex && !isAuthExpired && (
+					{/* Quick Index Button - hidden for live connectors and when auth is expired */}
+					{connector.is_indexable && !isLive && onQuickIndex && !isAuthExpired && (
 						<Button
 							variant="secondary"
 							size="sm"
 							onClick={handleQuickIndex}
 							disabled={isQuickIndexing || isIndexing || isSaving || isDisconnecting}
-							className="text-xs sm:text-sm bg-slate-400/10 dark:bg-white/10 hover:bg-slate-400/20 dark:hover:bg-white/20 border-slate-400/20 dark:border-white/20 w-full sm:w-auto"
+							className="text-xs sm:text-sm bg-slate-400/10 dark:bg-white/10 hover:bg-accent hover:text-accent-foreground border-slate-400/20 dark:border-white/20 w-full sm:w-auto"
 						>
 							{isQuickIndexing || isIndexing ? (
 								<>
@@ -266,14 +278,23 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 							/>
 						)}
 
-						{/* Summary and sync settings - only shown for indexable connectors */}
-						{connector.is_indexable && (
+						{/* Summary + vision toggles (Obsidian is plugin-push, non-indexable by design) */}
+						{showsAiToggles && !isLive && (
 							<>
 								{/* AI Summary toggle */}
 								<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
 
-								{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
-								{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
+								{/* Vision LLM toggle for file/attachment connectors */}
+								{supportsVisionLlm && (
+									<VisionLLMConfig
+										enabled={enableVisionLlm}
+										onEnabledChange={onEnableVisionLlmChange}
+									/>
+								)}
+
+								{/* Date-range and periodic sync stay indexable-only */}
+								{connector.is_indexable &&
+									connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
 									connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
 									connector.connector_type !== "DROPBOX_CONNECTOR" &&
 									connector.connector_type !== "ONEDRIVE_CONNECTOR" &&
@@ -293,75 +314,70 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 										/>
 									)}
 
-								{(() => {
-									const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR";
-									const isComposioGoogleDrive =
-										connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR";
-									const requiresFolderSelection = isGoogleDrive || isComposioGoogleDrive;
-									const selectedFolders =
-										(connector.config?.selected_folders as
-											| Array<{ id: string; name: string }>
-											| undefined) || [];
-									const selectedFiles =
-										(connector.config?.selected_files as
-											| Array<{ id: string; name: string }>
-											| undefined) || [];
-									const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0;
-									const isDisabled = requiresFolderSelection && !hasItemsSelected;
+								{connector.is_indexable &&
+									(() => {
+										const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR";
+										const isComposioGoogleDrive =
+											connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR";
+										const requiresFolderSelection = isGoogleDrive || isComposioGoogleDrive;
+										const selectedFolders =
+											(connector.config?.selected_folders as
+												| Array<{ id: string; name: string }>
+												| undefined) || [];
+										const selectedFiles =
+											(connector.config?.selected_files as
+												| Array<{ id: string; name: string }>
+												| undefined) || [];
+										const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0;
+										const isDisabled = requiresFolderSelection && !hasItemsSelected;
 
-									return (
-										<PeriodicSyncConfig
-											enabled={periodicEnabled}
-											frequencyMinutes={frequencyMinutes}
-											onEnabledChange={onPeriodicEnabledChange}
-											onFrequencyChange={onFrequencyChange}
-											disabled={isDisabled}
-											disabledMessage={
-												isDisabled
-													? "Select at least one folder or file above to enable periodic sync"
-													: undefined
-											}
-										/>
-									);
-								})()}
+										return (
+											<PeriodicSyncConfig
+												enabled={periodicEnabled}
+												frequencyMinutes={frequencyMinutes}
+												onEnabledChange={onPeriodicEnabledChange}
+												onFrequencyChange={onFrequencyChange}
+												disabled={isDisabled}
+												disabledMessage={
+													isDisabled
+														? "Select at least one folder or file above to enable periodic sync"
+														: undefined
+												}
+											/>
+										);
+									})()}
 							</>
 						)}
 
-						{/* Info box - only shown for indexable connectors */}
-						{connector.is_indexable && (
-							<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-								<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-									<Info className="size-4" />
-								</div>
-								<div className="text-xs sm:text-sm">
-									<p className="font-medium text-xs sm:text-sm">
-										Re-indexing runs in the background
-									</p>
-									<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-										You can continue using SurfSense while we sync your data. Check inbox for
-										updates.
-									</p>
-								</div>
-							</div>
+						{/* Info box - hidden for live connectors */}
+						{connector.is_indexable && !isLive && (
+							<Alert>
+								<Info />
+								<AlertDescription>
+									You can continue using SurfSense while we sync your data. Check inbox for updates.
+								</AlertDescription>
+							</Alert>
 						)}
 					</div>
 				</div>
 				{/* Top fade shadow - appears when scrolled */}
 				{isScrolled && (
-					<div className="absolute top-0 left-0 right-0 h-6 bg-gradient-to-b from-muted/50 to-transparent pointer-events-none z-10" />
+					<div className="absolute top-0 left-0 right-0 h-6 bg-gradient-to-b from-popover to-transparent pointer-events-none z-10" />
 				)}
 				{/* Bottom fade shadow - appears when there's more content */}
 				{hasMoreContent && (
-					<div className="absolute bottom-0 left-0 right-0 h-3 bg-gradient-to-t from-muted/50 to-transparent pointer-events-none z-10" />
+					<div className="absolute bottom-0 left-0 right-0 h-3 bg-gradient-to-t from-popover to-transparent pointer-events-none z-10" />
 				)}
 			</div>
 
 			{/* Fixed Footer - Action buttons */}
-			<div className="flex-shrink-0 flex flex-col sm:flex-row items-stretch sm:items-center justify-between gap-3 sm:gap-0 px-6 sm:px-12 py-6 sm:py-6 bg-muted border-t border-border">
+			<div className="flex-shrink-0 flex flex-col sm:flex-row items-stretch sm:items-center justify-between gap-3 sm:gap-0 px-6 sm:px-12 py-6 sm:py-6 bg-popover">
 				{showDisconnectConfirm ? (
 					<div className="flex flex-col sm:flex-row items-stretch sm:items-center gap-3 flex-1 sm:flex-initial">
 						<span className="text-xs sm:text-sm text-muted-foreground sm:whitespace-nowrap">
-							Are you sure?
+							{isLive
+								? "Your agent will lose access to this service"
+								: "This will remove all indexed data"}
 						</span>
 						<div className="flex items-center gap-2 sm:gap-3">
 							<Button
@@ -392,7 +408,6 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 						disabled={isSaving || isDisconnecting}
 						className="text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
 					>
-						<Trash2 className="mr-2 h-4 w-4" />
 						Disconnect
 					</Button>
 				)}
@@ -405,7 +420,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 						<RefreshCw className={cn("size-3.5", reauthing && "animate-spin")} />
 						Re-authenticate
 					</Button>
-				) : (
+				) : !isLive ? (
 					<Button
 						onClick={onSave}
 						disabled={isSaving || isDisconnecting}
@@ -414,7 +429,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 						<span className={isSaving ? "opacity-0" : ""}>Save Changes</span>
 						{isSaving && <Spinner size="sm" className="absolute" />}
 					</Button>
-				)}
+				) : null}
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
index e583cbe17d..74b4fad9f8 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
@@ -2,18 +2,32 @@
 
 import { ArrowLeft, Check, Info } from "lucide-react";
 import { type FC, useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
+import { EnumConnectorName } from "@/contracts/enums/connector";
 import type { SearchSourceConnector } from "@/contracts/types/connector.types";
 import { getConnectorTypeDisplay } from "@/lib/connectors/utils";
 import { cn } from "@/lib/utils";
 import { DateRangeSelector } from "../../components/date-range-selector";
 import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
 import { SummaryConfig } from "../../components/summary-config";
-import type { IndexingConfigState } from "../../constants/connector-constants";
+import { VisionLLMConfig } from "../../components/vision-llm-config";
+import {
+	type IndexingConfigState,
+	LIVE_CONNECTOR_TYPES,
+} from "../../constants/connector-constants";
 import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
 import { getConnectorConfigComponent } from "../index";
 
+const VISION_LLM_CONNECTOR_TYPES = new Set<string>([
+	"GOOGLE_DRIVE_CONNECTOR",
+	"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+	"DROPBOX_CONNECTOR",
+	"ONEDRIVE_CONNECTOR",
+	"OBSIDIAN_CONNECTOR",
+]);
+
 interface IndexingConfigurationViewProps {
 	config: IndexingConfigState;
 	connector?: SearchSourceConnector;
@@ -22,6 +36,7 @@ interface IndexingConfigurationViewProps {
 	periodicEnabled: boolean;
 	frequencyMinutes: string;
 	enableSummary: boolean;
+	enableVisionLlm: boolean;
 	isStartingIndexing: boolean;
 	isFromOAuth?: boolean;
 	onStartDateChange: (date: Date | undefined) => void;
@@ -29,6 +44,7 @@ interface IndexingConfigurationViewProps {
 	onPeriodicEnabledChange: (enabled: boolean) => void;
 	onFrequencyChange: (frequency: string) => void;
 	onEnableSummaryChange: (enabled: boolean) => void;
+	onEnableVisionLlmChange: (enabled: boolean) => void;
 	onConfigChange?: (config: Record<string, unknown>) => void;
 	onStartIndexing: () => void;
 	onSkip: () => void;
@@ -42,6 +58,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 	periodicEnabled,
 	frequencyMinutes,
 	enableSummary,
+	enableVisionLlm,
 	isStartingIndexing,
 	isFromOAuth = false,
 	onStartDateChange,
@@ -49,15 +66,21 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 	onPeriodicEnabledChange,
 	onFrequencyChange,
 	onEnableSummaryChange,
+	onEnableVisionLlmChange,
 	onConfigChange,
 	onStartIndexing,
 	onSkip,
 }) => {
+	const isLive = LIVE_CONNECTOR_TYPES.has(config.connectorType);
+
 	// Get connector-specific config component
 	const ConnectorConfigComponent = useMemo(
 		() => (connector ? getConnectorConfigComponent(connector.connector_type) : null),
 		[connector]
 	);
+	const showsAiToggles =
+		(connector?.is_indexable ?? false) ||
+		connector?.connector_type === EnumConnectorName.OBSIDIAN_CONNECTOR;
 	const [isScrolled, setIsScrolled] = useState(false);
 	const [hasMoreContent, setHasMoreContent] = useState(false);
 	const scrollContainerRef = useRef<HTMLDivElement>(null);
@@ -106,14 +129,15 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 			>
 				{/* Back button - only show if not from OAuth */}
 				{!isFromOAuth && (
-					<button
+					<Button
+						variant="ghost"
 						type="button"
 						onClick={onSkip}
-						className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground mb-6 w-fit"
+						className="mb-6 h-auto w-fit justify-start gap-2 px-0 py-0 text-xs text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 					>
-						<ArrowLeft className="size-4" />
+						<ArrowLeft data-icon="inline-start" />
 						Back to connectors
-					</button>
+					</Button>
 				)}
 
 				{/* Success header */}
@@ -133,7 +157,9 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 							)}
 						</div>
 						<p className="text-xs sm:text-base text-muted-foreground mt-1">
-							Configure when to start syncing your data
+							{isLive
+								? "Your account is ready to use"
+								: "Configure when to start syncing your data"}
 						</p>
 					</div>
 				</div>
@@ -152,14 +178,23 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 							<ConnectorConfigComponent connector={connector} onConfigChange={onConfigChange} />
 						)}
 
-						{/* Summary and sync settings - only shown for indexable connectors */}
-						{connector?.is_indexable && (
+						{/* Summary + vision toggles (Obsidian is plugin-push, non-indexable by design) */}
+						{showsAiToggles && !isLive && (
 							<>
 								{/* AI Summary toggle */}
 								<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
 
-								{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
-								{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
+								{/* Vision LLM toggle for file/attachment connectors */}
+								{VISION_LLM_CONNECTOR_TYPES.has(config.connectorType) && (
+									<VisionLLMConfig
+										enabled={enableVisionLlm}
+										onEnabledChange={onEnableVisionLlmChange}
+									/>
+								)}
+
+								{/* Date-range and periodic sync stay indexable-only */}
+								{connector?.is_indexable &&
+									config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
 									config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
 									config.connectorType !== "DROPBOX_CONNECTOR" &&
 									config.connectorType !== "ONEDRIVE_CONNECTOR" &&
@@ -179,7 +214,8 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 										/>
 									)}
 
-								{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
+								{connector?.is_indexable &&
+									config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
 									config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
 									config.connectorType !== "DROPBOX_CONNECTOR" &&
 									config.connectorType !== "ONEDRIVE_CONNECTOR" && (
@@ -193,43 +229,43 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 							</>
 						)}
 
-						{/* Info box - only shown for indexable connectors */}
-						{connector?.is_indexable && (
-							<div className="rounded-xl border border-border bg-primary/5 p-4 flex items-start gap-3">
-								<div className="flex h-8 w-8 items-center justify-center rounded-lg bg-primary/10 shrink-0 mt-0.5">
-									<Info className="size-4" />
-								</div>
-								<div className="text-xs sm:text-sm">
-									<p className="font-medium text-xs sm:text-sm">Indexing runs in the background</p>
-									<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
-										You can continue using SurfSense while we sync your data. Check inbox for
-										updates.
-									</p>
-								</div>
-							</div>
+						{/* Info box - hidden for live connectors */}
+						{connector?.is_indexable && !isLive && (
+							<Alert>
+								<Info />
+								<AlertDescription>
+									You can continue using SurfSense while we sync your data. Check inbox for updates.
+								</AlertDescription>
+							</Alert>
 						)}
 					</div>
 				</div>
 				{/* Top fade shadow - appears when scrolled */}
 				{isScrolled && (
-					<div className="absolute top-0 left-0 right-0 h-6 bg-gradient-to-b from-muted/50 to-transparent pointer-events-none z-10" />
+					<div className="absolute top-0 left-0 right-0 h-6 bg-gradient-to-b from-popover to-transparent pointer-events-none z-10" />
 				)}
 				{/* Bottom fade shadow - appears when there's more content */}
 				{hasMoreContent && (
-					<div className="absolute bottom-0 left-0 right-0 h-3 bg-gradient-to-t from-muted/50 to-transparent pointer-events-none z-10" />
+					<div className="absolute bottom-0 left-0 right-0 h-3 bg-gradient-to-t from-popover to-transparent pointer-events-none z-10" />
 				)}
 			</div>
 
 			{/* Fixed Footer - Action buttons */}
-			<div className="flex-shrink-0 flex items-center justify-end px-6 sm:px-12 py-6 bg-muted">
-				<Button
-					onClick={onStartIndexing}
-					disabled={isStartingIndexing}
-					className="text-xs sm:text-sm relative"
-				>
-					<span className={isStartingIndexing ? "opacity-0" : ""}>Start Indexing</span>
-					{isStartingIndexing && <Spinner size="sm" className="absolute" />}
-				</Button>
+			<div className="flex-shrink-0 flex items-center justify-end px-6 sm:px-12 py-6 bg-popover">
+				{isLive ? (
+					<Button onClick={onSkip} className="text-xs sm:text-sm">
+						Done
+					</Button>
+				) : (
+					<Button
+						onClick={onStartIndexing}
+						disabled={isStartingIndexing}
+						className="text-xs sm:text-sm relative"
+					>
+						<span className={isStartingIndexing ? "opacity-0" : ""}>Start Indexing</span>
+						{isStartingIndexing && <Spinner size="sm" className="absolute" />}
+					</Button>
+				)}
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts
index 2e92f637bf..2f9605ea72 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts
@@ -1,4 +1,24 @@
 import { EnumConnectorName } from "@/contracts/enums/connector";
+import type { SearchSourceConnector } from "@/contracts/types/connector.types";
+
+/**
+ * Connectors that operate in real time (no background indexing).
+ * Used to adjust UI: hide sync controls, show "Connected" instead of doc counts.
+ */
+export const LIVE_CONNECTOR_TYPES = new Set<string>([
+	EnumConnectorName.LINEAR_CONNECTOR,
+	EnumConnectorName.SLACK_CONNECTOR,
+	EnumConnectorName.JIRA_CONNECTOR,
+	EnumConnectorName.CLICKUP_CONNECTOR,
+	EnumConnectorName.AIRTABLE_CONNECTOR,
+	EnumConnectorName.DISCORD_CONNECTOR,
+	EnumConnectorName.TEAMS_CONNECTOR,
+	EnumConnectorName.GOOGLE_CALENDAR_CONNECTOR,
+	EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
+	EnumConnectorName.GOOGLE_GMAIL_CONNECTOR,
+	EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR,
+	EnumConnectorName.LUMA_CONNECTOR,
+]);
 
 // OAuth Connectors (Quick Connect)
 export const OAUTH_CONNECTORS = [
@@ -13,7 +33,7 @@ export const OAUTH_CONNECTORS = [
 	{
 		id: "google-gmail-connector",
 		title: "Gmail",
-		description: "Search through your emails",
+		description: "Search, read, draft, and send emails",
 		connectorType: EnumConnectorName.GOOGLE_GMAIL_CONNECTOR,
 		authEndpoint: "/api/v1/auth/google/gmail/connector/add/",
 		selfHostedOnly: true,
@@ -21,7 +41,7 @@ export const OAUTH_CONNECTORS = [
 	{
 		id: "google-calendar-connector",
 		title: "Google Calendar",
-		description: "Search through your events",
+		description: "Search and manage your events",
 		connectorType: EnumConnectorName.GOOGLE_CALENDAR_CONNECTOR,
 		authEndpoint: "/api/v1/auth/google/calendar/connector/add/",
 		selfHostedOnly: true,
@@ -29,35 +49,35 @@ export const OAUTH_CONNECTORS = [
 	{
 		id: "airtable-connector",
 		title: "Airtable",
-		description: "Search your Airtable bases",
+		description: "Browse bases, tables, and records",
 		connectorType: EnumConnectorName.AIRTABLE_CONNECTOR,
-		authEndpoint: "/api/v1/auth/airtable/connector/add/",
+		authEndpoint: "/api/v1/auth/mcp/airtable/connector/add/",
 	},
 	{
 		id: "notion-connector",
 		title: "Notion",
 		description: "Search your Notion pages",
 		connectorType: EnumConnectorName.NOTION_CONNECTOR,
-		authEndpoint: "/api/v1/auth/notion/connector/add/",
+		authEndpoint: "/api/v1/auth/notion/connector/add",
 	},
 	{
 		id: "linear-connector",
 		title: "Linear",
-		description: "Search issues & projects",
+		description: "Search, read, and manage issues & projects",
 		connectorType: EnumConnectorName.LINEAR_CONNECTOR,
-		authEndpoint: "/api/v1/auth/linear/connector/add/",
+		authEndpoint: "/api/v1/auth/mcp/linear/connector/add/",
 	},
 	{
 		id: "slack-connector",
 		title: "Slack",
-		description: "Search Slack messages",
+		description: "Search and read channels and threads",
 		connectorType: EnumConnectorName.SLACK_CONNECTOR,
-		authEndpoint: "/api/v1/auth/slack/connector/add/",
+		authEndpoint: "/api/v1/auth/mcp/slack/connector/add/",
 	},
 	{
 		id: "teams-connector",
 		title: "Microsoft Teams",
-		description: "Search Teams messages",
+		description: "Search, read, and send messages",
 		connectorType: EnumConnectorName.TEAMS_CONNECTOR,
 		authEndpoint: "/api/v1/auth/teams/connector/add/",
 	},
@@ -78,30 +98,30 @@ export const OAUTH_CONNECTORS = [
 	{
 		id: "discord-connector",
 		title: "Discord",
-		description: "Search Discord messages",
+		description: "Search, read, and send messages",
 		connectorType: EnumConnectorName.DISCORD_CONNECTOR,
 		authEndpoint: "/api/v1/auth/discord/connector/add/",
 	},
 	{
 		id: "jira-connector",
 		title: "Jira",
-		description: "Search Jira issues",
+		description: "Rework in progress.",
 		connectorType: EnumConnectorName.JIRA_CONNECTOR,
-		authEndpoint: "/api/v1/auth/jira/connector/add/",
+		authEndpoint: "/api/v1/auth/mcp/jira/connector/add/",
 	},
 	{
 		id: "confluence-connector",
 		title: "Confluence",
-		description: "Search documentation",
+		description: "Rework in progress.",
 		connectorType: EnumConnectorName.CONFLUENCE_CONNECTOR,
 		authEndpoint: "/api/v1/auth/confluence/connector/add/",
 	},
 	{
 		id: "clickup-connector",
 		title: "ClickUp",
-		description: "Search ClickUp tasks",
+		description: "Search and read tasks",
 		connectorType: EnumConnectorName.CLICKUP_CONNECTOR,
-		authEndpoint: "/api/v1/auth/clickup/connector/add/",
+		authEndpoint: "/api/v1/auth/mcp/clickup/connector/add/",
 	},
 ] as const;
 
@@ -138,7 +158,7 @@ export const OTHER_CONNECTORS = [
 	{
 		id: "luma-connector",
 		title: "Luma",
-		description: "Search Luma events",
+		description: "Browse, read, and create events",
 		connectorType: EnumConnectorName.LUMA_CONNECTOR,
 	},
 	{
@@ -180,9 +200,8 @@ export const OTHER_CONNECTORS = [
 	{
 		id: "obsidian-connector",
 		title: "Obsidian",
-		description: "Index your Obsidian vault (self-hosted only)",
+		description: "Sync your Obsidian vault on desktop or mobile",
 		connectorType: EnumConnectorName.OBSIDIAN_CONNECTOR,
-		selfHostedOnly: true,
 	},
 ] as const;
 
@@ -198,14 +217,14 @@ export const COMPOSIO_CONNECTORS = [
 	{
 		id: "composio-gmail",
 		title: "Gmail",
-		description: "Search through your emails via Composio",
+		description: "Search, read, draft, and send emails via Composio",
 		connectorType: EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR,
 		authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=gmail",
 	},
 	{
 		id: "composio-googlecalendar",
 		title: "Google Calendar",
-		description: "Search through your events via Composio",
+		description: "Search and manage your events via Composio",
 		connectorType: EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
 		authEndpoint: "/api/v1/auth/composio/connector/add/?toolkit_id=googlecalendar",
 	},
@@ -222,14 +241,14 @@ export const COMPOSIO_TOOLKITS = [
 	{
 		id: "gmail",
 		name: "Gmail",
-		description: "Search through your emails",
-		isIndexable: true,
+		description: "Search, read, draft, and send emails",
+		isIndexable: false,
 	},
 	{
 		id: "googlecalendar",
 		name: "Google Calendar",
-		description: "Search through your events",
-		isIndexable: true,
+		description: "Search and manage your events",
+		isIndexable: false,
 	},
 	{
 		id: "slack",
@@ -259,66 +278,6 @@ export interface AutoIndexConfig {
 }
 
 export const AUTO_INDEX_DEFAULTS: Record<string, AutoIndexConfig> = {
-	[EnumConnectorName.GOOGLE_GMAIL_CONNECTOR]: {
-		daysBack: 30,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 30 days of emails.",
-	},
-	[EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR]: {
-		daysBack: 30,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 30 days of emails.",
-	},
-	[EnumConnectorName.SLACK_CONNECTOR]: {
-		daysBack: 30,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 30 days of messages.",
-	},
-	[EnumConnectorName.DISCORD_CONNECTOR]: {
-		daysBack: 30,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 30 days of messages.",
-	},
-	[EnumConnectorName.TEAMS_CONNECTOR]: {
-		daysBack: 30,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 30 days of messages.",
-	},
-	[EnumConnectorName.GOOGLE_CALENDAR_CONNECTOR]: {
-		daysBack: 90,
-		daysForward: 90,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing 90 days of past and upcoming events.",
-	},
-	[EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR]: {
-		daysBack: 90,
-		daysForward: 90,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing 90 days of past and upcoming events.",
-	},
-	[EnumConnectorName.LINEAR_CONNECTOR]: {
-		daysBack: 90,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 90 days of issues.",
-	},
-	[EnumConnectorName.JIRA_CONNECTOR]: {
-		daysBack: 90,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 90 days of issues.",
-	},
-	[EnumConnectorName.CLICKUP_CONNECTOR]: {
-		daysBack: 90,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your last 90 days of tasks.",
-	},
 	[EnumConnectorName.NOTION_CONNECTOR]: {
 		daysBack: 365,
 		daysForward: 0,
@@ -331,15 +290,123 @@ export const AUTO_INDEX_DEFAULTS: Record<string, AutoIndexConfig> = {
 		frequencyMinutes: 1440,
 		syncDescription: "Syncing your documentation.",
 	},
-	[EnumConnectorName.AIRTABLE_CONNECTOR]: {
-		daysBack: 365,
-		daysForward: 0,
-		frequencyMinutes: 1440,
-		syncDescription: "Syncing your bases.",
-	},
 };
 
 export const AUTO_INDEX_CONNECTOR_TYPES = new Set<string>(Object.keys(AUTO_INDEX_DEFAULTS));
 
+// ============================================================================
+// CONNECTOR TELEMETRY REGISTRY
+// ----------------------------------------------------------------------------
+// Single source of truth for "what does this connector_type look like in
+// analytics?". Any connector added to the lists above is automatically
+// picked up here, so adding a new integration does NOT require touching
+// `lib/posthog/events.ts` or per-connector tracking code.
+// ============================================================================
+
+export type ConnectorTelemetryGroup = "oauth" | "composio" | "crawler" | "other" | "unknown";
+
+export interface ConnectorTelemetryMeta {
+	connector_type: string;
+	connector_title: string;
+	connector_group: ConnectorTelemetryGroup;
+	is_oauth: boolean;
+}
+
+const CONNECTOR_TELEMETRY_REGISTRY: ReadonlyMap<string, ConnectorTelemetryMeta> = (() => {
+	const map = new Map<string, ConnectorTelemetryMeta>();
+
+	for (const c of OAUTH_CONNECTORS) {
+		map.set(c.connectorType, {
+			connector_type: c.connectorType,
+			connector_title: c.title,
+			connector_group: "oauth",
+			is_oauth: true,
+		});
+	}
+	for (const c of COMPOSIO_CONNECTORS) {
+		map.set(c.connectorType, {
+			connector_type: c.connectorType,
+			connector_title: c.title,
+			connector_group: "composio",
+			is_oauth: true,
+		});
+	}
+	for (const c of CRAWLERS) {
+		map.set(c.connectorType, {
+			connector_type: c.connectorType,
+			connector_title: c.title,
+			connector_group: "crawler",
+			is_oauth: false,
+		});
+	}
+	for (const c of OTHER_CONNECTORS) {
+		map.set(c.connectorType, {
+			connector_type: c.connectorType,
+			connector_title: c.title,
+			connector_group: "other",
+			is_oauth: false,
+		});
+	}
+
+	return map;
+})();
+
+/**
+ * Returns telemetry metadata for a connector_type, or a minimal "unknown"
+ * record so tracking never no-ops for connectors that exist in the backend
+ * but were forgotten in the UI registry.
+ */
+export function getConnectorTelemetryMeta(connectorType: string): ConnectorTelemetryMeta {
+	const hit = CONNECTOR_TELEMETRY_REGISTRY.get(connectorType);
+	if (hit) return hit;
+
+	return {
+		connector_type: connectorType,
+		connector_title: connectorType,
+		connector_group: "unknown",
+		is_oauth: false,
+	};
+}
+
+// =============================================================================
+// REAUTH ENDPOINTS
+// =============================================================================
+
+/**
+ * Legacy (non-MCP) OAuth reauth endpoints, keyed by connector type.
+ * These are used for connectors that were NOT created via MCP OAuth.
+ */
+export const LEGACY_REAUTH_ENDPOINTS: Partial<Record<string, string>> = {
+	[EnumConnectorName.LINEAR_CONNECTOR]: "/api/v1/auth/linear/connector/reauth",
+	[EnumConnectorName.JIRA_CONNECTOR]: "/api/v1/auth/jira/connector/reauth",
+	[EnumConnectorName.NOTION_CONNECTOR]: "/api/v1/auth/notion/connector/reauth",
+	[EnumConnectorName.GOOGLE_DRIVE_CONNECTOR]: "/api/v1/auth/google/drive/connector/reauth",
+	[EnumConnectorName.GOOGLE_GMAIL_CONNECTOR]: "/api/v1/auth/google/gmail/connector/reauth",
+	[EnumConnectorName.GOOGLE_CALENDAR_CONNECTOR]: "/api/v1/auth/google/calendar/connector/reauth",
+	[EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
+	[EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
+	[EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
+	[EnumConnectorName.ONEDRIVE_CONNECTOR]: "/api/v1/auth/onedrive/connector/reauth",
+	[EnumConnectorName.DROPBOX_CONNECTOR]: "/api/v1/auth/dropbox/connector/reauth",
+	[EnumConnectorName.CONFLUENCE_CONNECTOR]: "/api/v1/auth/confluence/connector/reauth",
+	[EnumConnectorName.TEAMS_CONNECTOR]: "/api/v1/auth/teams/connector/reauth",
+	[EnumConnectorName.DISCORD_CONNECTOR]: "/api/v1/auth/discord/connector/reauth",
+};
+
+/**
+ * Resolve the reauth endpoint for a connector.
+ *
+ * MCP OAuth connectors (those with ``config.mcp_service``) dynamically build
+ * the URL from the service key. Legacy OAuth connectors fall back to the
+ * static ``LEGACY_REAUTH_ENDPOINTS`` map.
+ */
+export function getReauthEndpoint(connector: SearchSourceConnector): string | undefined {
+	const mcpService = connector.config?.mcp_service as string | undefined;
+	if (mcpService) {
+		return `/api/v1/auth/mcp/${mcpService}/connector/reauth`;
+	}
+	return LEGACY_REAUTH_ENDPOINTS[connector.connector_type];
+}
+
 // Re-export IndexingConfigState from schemas for backward compatibility
 export type { IndexingConfigState } from "./connector-popup.schemas";
diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
index 6543bbd729..b49bfda969 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
+++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts
@@ -18,6 +18,8 @@ import { authenticatedFetch } from "@/lib/auth-utils";
 import {
 	trackConnectorConnected,
 	trackConnectorDeleted,
+	trackConnectorSetupFailure,
+	trackConnectorSetupStarted,
 	trackIndexWithDateRangeOpened,
 	trackIndexWithDateRangeStarted,
 	trackPeriodicIndexingStarted,
@@ -30,9 +32,11 @@ import {
 	AUTO_INDEX_CONNECTOR_TYPES,
 	AUTO_INDEX_DEFAULTS,
 	COMPOSIO_CONNECTORS,
+	LIVE_CONNECTOR_TYPES,
 	OAUTH_CONNECTORS,
 	OTHER_CONNECTORS,
 } from "../constants/connector-constants";
+
 import {
 	dateRangeSchema,
 	frequencyMinutesSchema,
@@ -80,6 +84,7 @@ export const useConnectorDialog = () => {
 	const [periodicEnabled, setPeriodicEnabled] = useState(false);
 	const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
 	const [enableSummary, setEnableSummary] = useState(false);
+	const [enableVisionLlm, setEnableVisionLlm] = useState(false);
 
 	// Edit mode state
 	const [editingConnector, setEditingConnector] = useState<SearchSourceConnector | null>(null);
@@ -220,10 +225,20 @@ export const useConnectorDialog = () => {
 
 		if (result.error) {
 			const oauthConnector = result.connector
-				? OAUTH_CONNECTORS.find((c) => c.id === result.connector)
+				? OAUTH_CONNECTORS.find((c) => c.id === result.connector) ||
+					COMPOSIO_CONNECTORS.find((c) => c.id === result.connector)
 				: null;
 			const name = oauthConnector?.title || "connector";
 
+			if (oauthConnector) {
+				trackConnectorSetupFailure(
+					Number(searchSpaceId),
+					oauthConnector.connectorType,
+					result.error,
+					"oauth_callback"
+				);
+			}
+
 			if (result.error === "duplicate_account") {
 				toast.error(`This ${name} account is already connected`, {
 					description: "Please use a different account or manage the existing connection.",
@@ -293,7 +308,12 @@ export const useConnectorDialog = () => {
 							newConnector.id
 						);
 
-						if (
+						const isLiveConnector = LIVE_CONNECTOR_TYPES.has(oauthConnector.connectorType);
+
+						if (isLiveConnector) {
+							toast.success(`${oauthConnector.title} connected successfully!`);
+							await refetchAllConnectors();
+						} else if (
 							newConnector.is_indexable &&
 							AUTO_INDEX_CONNECTOR_TYPES.has(oauthConnector.connectorType)
 						) {
@@ -302,6 +322,9 @@ export const useConnectorDialog = () => {
 								oauthConnector.title,
 								oauthConnector.connectorType
 							);
+						} else if (!newConnector.is_indexable) {
+							toast.success(`${oauthConnector.title} connected successfully!`);
+							await refetchAllConnectors();
 						} else {
 							toast.dismiss("auto-index");
 							const config = validateIndexingConfigState({
@@ -336,6 +359,8 @@ export const useConnectorDialog = () => {
 			// Set connecting state immediately to disable button and show spinner
 			setConnectingId(connector.id);
 
+			trackConnectorSetupStarted(Number(searchSpaceId), connector.connectorType, "oauth_click");
+
 			try {
 				// Check if authEndpoint already has query parameters
 				const separator = connector.authEndpoint.includes("?") ? "&" : "?";
@@ -357,6 +382,12 @@ export const useConnectorDialog = () => {
 				window.location.href = validatedData.auth_url;
 			} catch (error) {
 				console.error(`Error connecting to ${connector.title}:`, error);
+				trackConnectorSetupFailure(
+					Number(searchSpaceId),
+					connector.connectorType,
+					error instanceof Error ? error.message : "oauth_initiation_failed",
+					"oauth_init"
+				);
 				if (error instanceof Error && error.message.includes("Invalid auth URL")) {
 					toast.error(`Invalid response from ${connector.title} OAuth endpoint`);
 				} else {
@@ -380,6 +411,11 @@ export const useConnectorDialog = () => {
 		if (!searchSpaceId) return;
 
 		setConnectingId("webcrawler-connector");
+		trackConnectorSetupStarted(
+			Number(searchSpaceId),
+			EnumConnectorName.WEBCRAWLER_CONNECTOR,
+			"webcrawler_quick_add"
+		);
 		try {
 			await createConnector({
 				data: {
@@ -393,6 +429,7 @@ export const useConnectorDialog = () => {
 					indexing_frequency_minutes: null,
 					next_scheduled_at: null,
 					enable_summary: false,
+					enable_vision_llm: false,
 				},
 				queryParams: {
 					search_space_id: searchSpaceId,
@@ -429,16 +466,25 @@ export const useConnectorDialog = () => {
 			}
 		} catch (error) {
 			console.error("Error creating webcrawler connector:", error);
+			trackConnectorSetupFailure(
+				Number(searchSpaceId),
+				EnumConnectorName.WEBCRAWLER_CONNECTOR,
+				error instanceof Error ? error.message : "webcrawler_create_failed",
+				"webcrawler_quick_add"
+			);
 			toast.error("Failed to create web crawler connector");
 		} finally {
 			setConnectingId(null);
 		}
 	}, [searchSpaceId, createConnector, refetchAllConnectors, setIsOpen]);
 
-	// Handle connecting non-OAuth connectors (like Tavily API)
+	// Handle connecting non-OAuth connectors (like Tavily API, Obsidian plugin, etc.)
 	const handleConnectNonOAuth = useCallback(
 		(connectorType: string) => {
 			if (!searchSpaceId) return;
+
+			trackConnectorSetupStarted(Number(searchSpaceId), connectorType, "non_oauth_click");
+
 			setConnectingConnectorType(connectorType);
 		},
 		[searchSpaceId]
@@ -485,6 +531,7 @@ export const useConnectorDialog = () => {
 						is_active: true,
 						next_scheduled_at: connectorData.next_scheduled_at as string | null,
 						enable_summary: false,
+						enable_vision_llm: false,
 					},
 					queryParams: {
 						search_space_id: searchSpaceId,
@@ -621,6 +668,7 @@ export const useConnectorDialog = () => {
 									setPeriodicEnabled(false);
 									setFrequencyMinutes("1440");
 									setEnableSummary(connector.enable_summary ?? false);
+									setEnableVisionLlm(connector.enable_vision_llm ?? false);
 									setStartDate(undefined);
 									setEndDate(undefined);
 
@@ -651,6 +699,12 @@ export const useConnectorDialog = () => {
 				}
 			} catch (error) {
 				console.error("Error creating connector:", error);
+				trackConnectorSetupFailure(
+					Number(searchSpaceId),
+					connectingConnectorType ?? formData.connector_type,
+					error instanceof Error ? error.message : "connector_create_failed",
+					"non_oauth_form"
+				);
 				toast.error(error instanceof Error ? error.message : "Failed to create connector");
 			} finally {
 				isCreatingConnectorRef.current = false;
@@ -763,12 +817,13 @@ export const useConnectorDialog = () => {
 				const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
 
 				// Update connector with summary, periodic sync settings, and config changes
-				if (enableSummary || periodicEnabled || indexingConnectorConfig) {
+				if (enableSummary || enableVisionLlm || periodicEnabled || indexingConnectorConfig) {
 					const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
 					await updateConnector({
 						id: indexingConfig.connectorId,
 						data: {
 							enable_summary: enableSummary,
+							enable_vision_llm: enableVisionLlm,
 							...(periodicEnabled && {
 								periodic_indexing_enabled: true,
 								indexing_frequency_minutes: frequency,
@@ -896,6 +951,7 @@ export const useConnectorDialog = () => {
 			periodicEnabled,
 			frequencyMinutes,
 			enableSummary,
+			enableVisionLlm,
 			indexingConnectorConfig,
 			setIsOpen,
 		]
@@ -960,6 +1016,7 @@ export const useConnectorDialog = () => {
 			setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
 			setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
 			setEnableSummary(connector.enable_summary ?? false);
+			setEnableVisionLlm(connector.enable_vision_llm ?? false);
 			setStartDate(undefined);
 			setEndDate(undefined);
 		},
@@ -1038,6 +1095,7 @@ export const useConnectorDialog = () => {
 					data: {
 						name: connectorName || editingConnector.name,
 						enable_summary: enableSummary,
+						enable_vision_llm: enableVisionLlm,
 						periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
 						indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
 						config: connectorConfig || editingConnector.config,
@@ -1172,6 +1230,7 @@ export const useConnectorDialog = () => {
 			periodicEnabled,
 			frequencyMinutes,
 			enableSummary,
+			enableVisionLlm,
 			getFrequencyLabel,
 			connectorConfig,
 			connectorName,
@@ -1332,6 +1391,7 @@ export const useConnectorDialog = () => {
 					setPeriodicEnabled(false);
 					setFrequencyMinutes("1440");
 					setEnableSummary(false);
+					setEnableVisionLlm(false);
 				}
 			}
 		},
@@ -1368,6 +1428,7 @@ export const useConnectorDialog = () => {
 		periodicEnabled,
 		frequencyMinutes,
 		enableSummary,
+		enableVisionLlm,
 		searchSpaceId,
 		allConnectors,
 		viewingAccountsType,
@@ -1382,6 +1443,7 @@ export const useConnectorDialog = () => {
 		setPeriodicEnabled,
 		setFrequencyMinutes,
 		setEnableSummary,
+		setEnableVisionLlm,
 		setConnectorName,
 
 		// Handlers
diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
index c088714454..42532859ad 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
@@ -2,14 +2,18 @@
 
 import { Search, Unplug } from "lucide-react";
 import type { FC } from "react";
-import { getDocumentTypeLabel } from "@/components/documents/DocumentTypeIcon";
 import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
 import { TabsContent } from "@/components/ui/tabs";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
 import type { SearchSourceConnector } from "@/contracts/types/connector.types";
+import { getDocumentTypeLabel } from "@/lib/documents/document-type-labels";
 import { cn } from "@/lib/utils";
-import { COMPOSIO_CONNECTORS, OAUTH_CONNECTORS } from "../constants/connector-constants";
+import {
+	COMPOSIO_CONNECTORS,
+	LIVE_CONNECTOR_TYPES,
+	OAUTH_CONNECTORS,
+} from "../constants/connector-constants";
 import { getDocumentCountForConnector } from "../utils/connector-document-mapping";
 import { getConnectorDisplayName } from "./all-connectors-tab";
 
@@ -156,6 +160,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 								{/* OAuth Connectors - Grouped by Type */}
 								{filteredOAuthConnectorTypes.map(([connectorType, typeConnectors]) => {
 									const { title } = getOAuthConnectorTypeInfo(connectorType);
+									const isLive = LIVE_CONNECTOR_TYPES.has(connectorType);
 									const isAnyIndexing = typeConnectors.some((c: SearchSourceConnector) =>
 										indexingConnectorIds.has(c.id)
 									);
@@ -180,7 +185,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 												"relative flex items-center gap-4 p-4 rounded-xl transition-all",
 												isAnyIndexing
 													? "bg-primary/5 border-0"
-													: "bg-slate-400/5 dark:bg-white/5 hover:bg-slate-400/10 dark:hover:bg-white/10 border border-border"
+													: "bg-slate-400/5 dark:bg-white/5 hover:bg-accent hover:text-accent-foreground border border-border"
 											)}
 										>
 											<div
@@ -202,8 +207,12 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 													</p>
 												) : (
 													<p className="text-[10px] text-muted-foreground mt-1 flex items-center gap-1.5">
-														<span>{formatDocumentCount(documentCount)}</span>
-														<span className="text-muted-foreground/50">•</span>
+														{!isLive && (
+															<>
+																<span>{formatDocumentCount(documentCount)}</span>
+																<span className="text-muted-foreground/50">•</span>
+															</>
+														)}
 														<span>
 															{accountCount} {accountCount === 1 ? "Account" : "Accounts"}
 														</span>
@@ -213,7 +222,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 											<Button
 												variant="secondary"
 												size="sm"
-												className="h-8 text-[11px] px-3 rounded-lg font-medium bg-white text-slate-700 hover:bg-slate-50 border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground dark:hover:bg-secondary/80 shrink-0"
+												className="h-8 text-[11px] px-3 font-medium bg-white text-slate-700 hover:bg-accent hover:text-accent-foreground border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground shrink-0"
 												onClick={handleManageClick}
 											>
 												Manage
@@ -230,6 +239,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 										documentTypeCounts
 									);
 									const isMCPConnector = connector.connector_type === "MCP_CONNECTOR";
+									const isLive = LIVE_CONNECTOR_TYPES.has(connector.connector_type);
 									return (
 										<div
 											key={`connector-${connector.id}`}
@@ -237,7 +247,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 												"flex items-center gap-4 p-4 rounded-xl transition-all",
 												isIndexing
 													? "bg-primary/5 border-0"
-													: "bg-slate-400/5 dark:bg-white/5 hover:bg-slate-400/10 dark:hover:bg-white/10 border border-border"
+													: "bg-slate-400/5 dark:bg-white/5 hover:bg-accent hover:text-accent-foreground border border-border"
 											)}
 										>
 											<div
@@ -261,7 +271,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 														<Spinner size="xs" />
 														Syncing
 													</p>
-												) : !isMCPConnector ? (
+												) : !isLive && !isMCPConnector ? (
 													<p className="text-[10px] text-muted-foreground mt-1">
 														{formatDocumentCount(documentCount)}
 													</p>
@@ -270,7 +280,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 											<Button
 												variant="secondary"
 												size="sm"
-												className="h-8 text-[11px] px-3 rounded-lg font-medium bg-white text-slate-700 hover:bg-slate-50 border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground dark:hover:bg-secondary/80 shrink-0"
+												className="h-8 text-[11px] px-3 font-medium bg-white text-slate-700 hover:bg-accent hover:text-accent-foreground border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground shrink-0"
 												onClick={onManage ? () => onManage(connector) : undefined}
 											>
 												Manage
@@ -292,7 +302,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 								{standaloneDocuments.map((doc) => (
 									<div
 										key={doc.type}
-										className="inline-flex items-center gap-2 px-3 py-1.5 rounded-full border border-border bg-slate-400/5 dark:bg-white/5 hover:bg-slate-400/10 dark:hover:bg-white/10 transition-all"
+										className="inline-flex items-center gap-2 px-3 py-1.5 rounded-full border border-border bg-slate-400/5 dark:bg-white/5 hover:bg-accent hover:text-accent-foreground transition-all"
 									>
 										<div className="flex items-center justify-center">
 											{getConnectorIcon(doc.type, "size-3.5")}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
index b4c049c5cb..65960db22b 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
@@ -13,24 +13,10 @@ import type { SearchSourceConnector } from "@/contracts/types/connector.types";
 import { authenticatedFetch } from "@/lib/auth-utils";
 import { formatRelativeDate } from "@/lib/format-date";
 import { cn } from "@/lib/utils";
+import { getReauthEndpoint, LIVE_CONNECTOR_TYPES } from "../constants/connector-constants";
 import { useConnectorStatus } from "../hooks/use-connector-status";
 import { getConnectorDisplayName } from "../tabs/all-connectors-tab";
 
-const REAUTH_ENDPOINTS: Partial<Record<string, string>> = {
-	[EnumConnectorName.LINEAR_CONNECTOR]: "/api/v1/auth/linear/connector/reauth",
-	[EnumConnectorName.NOTION_CONNECTOR]: "/api/v1/auth/notion/connector/reauth",
-	[EnumConnectorName.GOOGLE_DRIVE_CONNECTOR]: "/api/v1/auth/google/drive/connector/reauth",
-	[EnumConnectorName.GOOGLE_GMAIL_CONNECTOR]: "/api/v1/auth/google/gmail/connector/reauth",
-	[EnumConnectorName.GOOGLE_CALENDAR_CONNECTOR]: "/api/v1/auth/google/calendar/connector/reauth",
-	[EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
-	[EnumConnectorName.COMPOSIO_GMAIL_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
-	[EnumConnectorName.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR]: "/api/v1/auth/composio/connector/reauth",
-	[EnumConnectorName.ONEDRIVE_CONNECTOR]: "/api/v1/auth/onedrive/connector/reauth",
-	[EnumConnectorName.JIRA_CONNECTOR]: "/api/v1/auth/jira/connector/reauth",
-	[EnumConnectorName.DROPBOX_CONNECTOR]: "/api/v1/auth/dropbox/connector/reauth",
-	[EnumConnectorName.CONFLUENCE_CONNECTOR]: "/api/v1/auth/confluence/connector/reauth",
-};
-
 interface ConnectorAccountsListViewProps {
 	connectorType: string;
 	connectorTitle: string;
@@ -38,19 +24,12 @@ interface ConnectorAccountsListViewProps {
 	indexingConnectorIds: Set<number>;
 	onBack: () => void;
 	onManage: (connector: SearchSourceConnector) => void;
+	onDisconnect?: (connector: SearchSourceConnector) => Promise<void> | void;
 	onAddAccount: () => void;
 	isConnecting?: boolean;
 	addButtonText?: string;
 }
 
-/**
- * Check if a connector type is indexable
- */
-function isIndexableConnector(connectorType: string): boolean {
-	const nonIndexableTypes = ["MCP_CONNECTOR"];
-	return !nonIndexableTypes.includes(connectorType);
-}
-
 export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 	connectorType,
 	connectorTitle,
@@ -58,12 +37,15 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 	indexingConnectorIds,
 	onBack,
 	onManage,
+	onDisconnect,
 	onAddAccount,
 	isConnecting = false,
 	addButtonText,
 }) => {
 	const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
 	const [reauthingId, setReauthingId] = useState<number | null>(null);
+	const [confirmDisconnectId, setConfirmDisconnectId] = useState<number | null>(null);
+	const [disconnectingId, setDisconnectingId] = useState<number | null>(null);
 
 	// Get connector status
 	const { isConnectorEnabled, getConnectorStatusMessage } = useConnectorStatus();
@@ -71,16 +53,15 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 	const isEnabled = isConnectorEnabled(connectorType);
 	const statusMessage = getConnectorStatusMessage(connectorType);
 
-	const reauthEndpoint = REAUTH_ENDPOINTS[connectorType];
-
 	const handleReauth = useCallback(
-		async (connectorId: number) => {
-			if (!searchSpaceId || !reauthEndpoint) return;
-			setReauthingId(connectorId);
+		async (connector: SearchSourceConnector) => {
+			const endpoint = getReauthEndpoint(connector);
+			if (!searchSpaceId || !endpoint) return;
+			setReauthingId(connector.id);
 			try {
 				const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
-				const url = new URL(`${backendUrl}${reauthEndpoint}`);
-				url.searchParams.set("connector_id", String(connectorId));
+				const url = new URL(`${backendUrl}${endpoint}`);
+				url.searchParams.set("connector_id", String(connector.id));
 				url.searchParams.set("space_id", String(searchSpaceId));
 				url.searchParams.set("return_url", window.location.pathname);
 				const response = await authenticatedFetch(url.toString());
@@ -102,7 +83,7 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 				setReauthingId(null);
 			}
 		},
-		[searchSpaceId, reauthEndpoint]
+		[searchSpaceId]
 	);
 
 	// Filter connectors to only show those of this type
@@ -127,16 +108,17 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 	return (
 		<div className="flex flex-col h-full">
 			{/* Header */}
-			<div className="px-6 sm:px-12 pt-8 sm:pt-10 pb-1 sm:pb-4 border-b border-border/50 bg-muted">
+			<div className="px-6 sm:px-12 pt-8 sm:pt-10 pb-1 sm:pb-4 bg-popover">
 				{/* Back button */}
-				<button
+				<Button
 					type="button"
+					variant="ghost"
 					onClick={onBack}
-					className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground mb-6 w-fit"
+					className="mb-6 h-auto w-fit gap-2 px-0 py-0 text-xs font-normal text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 				>
 					<ArrowLeft className="size-4" />
 					Back to connectors
-				</button>
+				</Button>
 
 				{/* Connector header */}
 				<div className="flex flex-col sm:flex-row items-start sm:items-center gap-4 mb-6">
@@ -149,20 +131,21 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 								{connectorTitle}
 							</h2>
 							<p className="text-xs sm:text-base text-muted-foreground mt-1">
-								{statusMessage || "Manage your connector settings and sync configuration"}
+								{statusMessage || "Manage your connected accounts"}
 							</p>
 						</div>
 					</div>
 					{/* Add Account Button with dashed border */}
-					<button
+					<Button
 						type="button"
+						variant="ghost"
 						onClick={onAddAccount}
 						disabled={isConnecting || !isEnabled}
 						className={cn(
-							"flex items-center justify-center gap-1.5 h-8 px-3 rounded-md border-2 border-dashed text-xs sm:text-sm transition-all duration-200 shrink-0 w-full sm:w-auto",
+							"h-8 w-full shrink-0 gap-1.5 rounded-md border-2 border-dashed px-3 text-xs transition-all duration-200 sm:w-auto sm:text-sm",
 							!isEnabled
 								? "border-border/30 opacity-50 cursor-not-allowed"
-								: "border-slate-400/20 dark:border-white/20 hover:bg-primary/5",
+								: "border-slate-400/20 dark:border-white/20 hover:bg-accent hover:text-accent-foreground",
 							isConnecting && "opacity-50 cursor-not-allowed"
 						)}
 					>
@@ -174,7 +157,7 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 							)}
 						</div>
 						<span className="text-xs sm:text-sm font-medium">{buttonText}</span>
-					</button>
+					</Button>
 				</div>
 			</div>
 
@@ -203,7 +186,12 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 					<div className="grid grid-cols-1 sm:grid-cols-2 gap-3">
 						{typeConnectors.map((connector) => {
 							const isIndexing = indexingConnectorIds.has(connector.id);
-							const isAuthExpired = !!reauthEndpoint && connector.config?.auth_expired === true;
+							const connectorReauthEndpoint = getReauthEndpoint(connector);
+							const isAuthExpired =
+								!!connectorReauthEndpoint && connector.config?.auth_expired === true;
+							const isLive =
+								LIVE_CONNECTOR_TYPES.has(connector.connector_type) ||
+								Boolean(connector.config?.server_config);
 
 							return (
 								<div
@@ -212,7 +200,7 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 										"flex items-center gap-4 p-4 rounded-xl transition-all",
 										isIndexing
 											? "bg-primary/5 border-0"
-											: "bg-slate-400/5 dark:bg-white/5 hover:bg-slate-400/10 dark:hover:bg-white/10 border border-border"
+											: "bg-slate-400/5 dark:bg-white/5 hover:bg-accent hover:text-accent-foreground border border-border"
 									)}
 								>
 									<div
@@ -234,21 +222,19 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 												<Spinner size="xs" />
 												Syncing
 											</p>
-										) : (
-											<p className="text-[10px] text-muted-foreground mt-1 whitespace-nowrap truncate">
-												{isIndexableConnector(connector.connector_type)
-													? connector.last_indexed_at
-														? `Last indexed: ${formatRelativeDate(connector.last_indexed_at)}`
-														: "Never indexed"
-													: "Active"}
+										) : !isLive ? (
+											<p className="text-[10px] mt-1 whitespace-nowrap truncate text-muted-foreground">
+												{connector.last_indexed_at
+													? `Last indexed: ${formatRelativeDate(connector.last_indexed_at)}`
+													: "Never indexed"}
 											</p>
-										)}
+										) : null}
 									</div>
 									{isAuthExpired ? (
 										<Button
 											size="sm"
-											className="h-8 text-[11px] px-3 rounded-lg font-medium bg-amber-600 hover:bg-amber-700 text-white border-0 shadow-xs shrink-0"
-											onClick={() => handleReauth(connector.id)}
+											className="h-8 text-[11px] px-3 font-medium bg-amber-600 hover:bg-amber-700 text-white border-0 shadow-xs shrink-0"
+											onClick={() => handleReauth(connector)}
 											disabled={reauthingId === connector.id}
 										>
 											<RefreshCw
@@ -256,11 +242,55 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 											/>
 											Re-authenticate
 										</Button>
+									) : isLive && onDisconnect ? (
+										confirmDisconnectId === connector.id ? (
+											<div className="flex items-center gap-1.5 shrink-0">
+												<Button
+													variant="destructive"
+													size="sm"
+													className="h-8 text-[11px] px-3 font-medium shadow-xs"
+													onClick={async () => {
+														setDisconnectingId(connector.id);
+														setConfirmDisconnectId(null);
+														try {
+															await onDisconnect(connector);
+														} finally {
+															setDisconnectingId(null);
+														}
+													}}
+													disabled={disconnectingId === connector.id}
+												>
+													{disconnectingId === connector.id ? (
+														<RefreshCw className="size-3.5 animate-spin" />
+													) : (
+														"Confirm"
+													)}
+												</Button>
+												<Button
+													variant="ghost"
+													size="sm"
+													className="h-8 text-[11px] px-2"
+													onClick={() => setConfirmDisconnectId(null)}
+													disabled={disconnectingId === connector.id}
+												>
+													Cancel
+												</Button>
+											</div>
+										) : (
+											<Button
+												variant="destructive"
+												size="sm"
+												className="h-8 text-[11px] px-3 font-medium shrink-0"
+												onClick={() => setConfirmDisconnectId(connector.id)}
+											>
+												Disconnect
+											</Button>
+										)
 									) : (
 										<Button
 											variant="secondary"
 											size="sm"
-											className="h-8 text-[11px] px-3 rounded-lg font-medium bg-white text-slate-700 hover:bg-slate-50 border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground dark:hover:bg-secondary/80 shrink-0"
+											className="h-8 text-[11px] px-3 font-medium bg-white text-slate-700 hover:bg-accent hover:text-accent-foreground border-0 shadow-xs dark:bg-secondary dark:text-secondary-foreground shrink-0"
 											onClick={() => onManage(connector)}
 										>
 											Manage
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
index 8982b16a89..5bffa0dfe9 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
@@ -7,6 +7,7 @@ import { useTranslations } from "next-intl";
 import { type FC, useCallback, useState } from "react";
 import { toast } from "sonner";
 import { createDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
+import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Label } from "@/components/ui/label";
 import { Spinner } from "@/components/ui/spinner";
@@ -216,14 +217,15 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
 		<div className="flex-1 flex flex-col min-h-0 overflow-hidden">
 			{/* Header */}
 			<div className="shrink-0 px-6 sm:px-12 pt-8 sm:pt-10">
-				<button
+				<Button
+					variant="ghost"
 					type="button"
 					onClick={onBack}
-					className="flex items-center gap-2 text-xs sm:text-sm text-muted-foreground hover:text-foreground mb-6 w-fit"
+					className="mb-6 h-auto w-fit justify-start gap-2 px-0 py-0 text-xs text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 				>
-					<ArrowLeft className="size-4" />
+					<ArrowLeft data-icon="inline-start" />
 					Back to connectors
-				</button>
+				</Button>
 
 				<div className="flex items-center gap-4 mb-6">
 					<div className="flex h-14 w-14 items-center justify-center rounded-xl border border-slate-400/30">
@@ -259,7 +261,7 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
 									tag: {
 										body: "h-7 relative bg-background border border-input hover:bg-background rounded-md font-medium text-xs ps-2 pe-7 flex",
 										closeButton:
-											"absolute -inset-y-px -end-px p-0 rounded-e-lg flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-muted-foreground/80 hover:text-foreground",
+											"absolute -inset-y-px -end-px p-0 rounded-e-lg flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-muted-foreground/80 hover:text-accent-foreground",
 									},
 								}}
 								activeTagIndex={activeTagIndex}
@@ -278,10 +280,10 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
 
 					{error && <div className="text-sm text-red-500 mt-2">{error}</div>}
 
-					<div className="flex items-start gap-3 rounded-lg border border-blue-200/50 bg-blue-50/50 dark:border-blue-500/20 dark:bg-blue-950/20 p-4 text-sm">
-						<Info className="size-4 mt-0.5 shrink-0 text-blue-600 dark:text-blue-400" />
-						<p className="text-muted-foreground">{t("chat_tip")}</p>
-					</div>
+					<Alert>
+						<Info />
+						<AlertDescription>{t("chat_tip")}</AlertDescription>
+					</Alert>
 
 					<div className="bg-muted/50 rounded-lg p-4 text-sm">
 						<h4 className="font-medium mb-2">{t("tips_title")}</h4>
@@ -323,7 +325,7 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
 			</div>
 
 			{/* Fixed Footer - Action buttons */}
-			<div className="shrink-0 flex items-center justify-between px-6 sm:px-12 py-6 bg-muted border-t border-border">
+			<div className="shrink-0 flex items-center justify-between px-6 sm:px-12 py-6 bg-popover">
 				<Button
 					variant="ghost"
 					onClick={onBack}
diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
index 3c1d64a6ae..88e99e67fb 100644
--- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx
+++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
@@ -1,7 +1,8 @@
 "use client";
 
-import { useAtomValue, useSetAtom } from "jotai";
+import { useAtomValue } from "jotai";
 import { AlertTriangle, Settings } from "lucide-react";
+import { useRouter } from "next/navigation";
 import {
 	createContext,
 	type FC,
@@ -16,7 +17,6 @@ import {
 	llmPreferencesAtom,
 } from "@/atoms/new-llm-config/new-llm-config-query.atoms";
 import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
-import { searchSpaceSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
 import { DocumentUploadTab } from "@/components/sources/DocumentUploadTab";
 import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
@@ -36,12 +36,14 @@ interface DocumentUploadDialogContextType {
 
 const DocumentUploadDialogContext = createContext<DocumentUploadDialogContextType | null>(null);
 
-export const useDocumentUploadDialog = () => {
+const NOOP_DIALOG: DocumentUploadDialogContextType = {
+	openDialog: () => {},
+	closeDialog: () => {},
+};
+
+export const useDocumentUploadDialog = (): DocumentUploadDialogContextType => {
 	const context = useContext(DocumentUploadDialogContext);
-	if (!context) {
-		throw new Error("useDocumentUploadDialog must be used within DocumentUploadDialogProvider");
-	}
-	return context;
+	return context ?? NOOP_DIALOG;
 };
 
 // Provider component
@@ -96,8 +98,8 @@ const DocumentUploadPopupContent: FC<{
 	isOpen: boolean;
 	onOpenChange: (open: boolean) => void;
 }> = ({ isOpen, onOpenChange }) => {
+	const router = useRouter();
 	const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
-	const setSearchSpaceSettingsDialog = useSetAtom(searchSpaceSettingsDialogAtom);
 	const { data: preferences = {}, isFetching: preferencesLoading } =
 		useAtomValue(llmPreferencesAtom);
 	const { data: globalConfigs = [], isFetching: globalConfigsLoading } =
@@ -131,10 +133,10 @@ const DocumentUploadPopupContent: FC<{
 				onPointerDownOutside={(e) => e.preventDefault()}
 				onInteractOutside={(e) => e.preventDefault()}
 				onEscapeKeyDown={(e) => e.preventDefault()}
-				className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-5 sm:[&>button]:top-8 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5"
+				className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden ring-0 [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-5 sm:[&>button]:top-8 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-accent [&>button]:hover:text-accent-foreground [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5"
 			>
 				<div className="flex-1 min-h-0 overflow-y-auto overscroll-contain">
-					<DialogHeader className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-6 sm:pt-8 pb-10">
+					<DialogHeader className="sticky top-0 z-20 bg-popover px-4 sm:px-6 pt-6 sm:pt-8 pb-10">
 						<DialogTitle className="text-xl sm:text-3xl font-semibold tracking-tight pr-8 sm:pr-0">
 							Upload Documents
 						</DialogTitle>
@@ -145,34 +147,30 @@ const DocumentUploadPopupContent: FC<{
 
 					<div className="px-4 sm:px-6 pb-4 sm:pb-6">
 						{!isLoading && !hasDocumentSummaryLLM ? (
-							<Alert
-								variant="destructive"
-								className="mb-4 bg-muted/50 rounded-xl border-destructive/30"
-							>
-								<AlertTriangle className="h-4 w-4" />
-								<AlertTitle>LLM Configuration Required</AlertTitle>
-								<AlertDescription className="mt-2">
-									<p className="mb-3">
-										{isAutoMode && !hasGlobalConfigs
-											? "Auto mode requires a global LLM configuration. Please add one in Settings"
-											: "A Document Summary LLM is required to process uploads, configure one in Settings"}
-									</p>
-									<Button
-										size="sm"
-										variant="outline"
-										onClick={() => {
-											onOpenChange(false);
-											setSearchSpaceSettingsDialog({
-												open: true,
-												initialTab: "models",
-											});
-										}}
-									>
-										<Settings className="mr-2 h-4 w-4" />
-										Go to Settings
-									</Button>
-								</AlertDescription>
-							</Alert>
+							<div className="mb-4">
+								<Alert variant="destructive">
+									<AlertTriangle />
+									<AlertTitle>LLM Configuration Required</AlertTitle>
+									<AlertDescription>
+										<p>
+											{isAutoMode && !hasGlobalConfigs
+												? "Auto mode requires a global LLM configuration. Please add one in Settings"
+												: "A Document Summary LLM is required to process uploads, configure one in Settings"}
+										</p>
+										<Button
+											size="sm"
+											variant="outline"
+											onClick={() => {
+												onOpenChange(false);
+												router.push(`/dashboard/${searchSpaceId}/search-space-settings/models`);
+											}}
+										>
+											<Settings className="mr-2 h-4 w-4" />
+											Go to Settings
+										</Button>
+									</AlertDescription>
+								</Alert>
+							</div>
 						) : (
 							<DocumentUploadTab searchSpaceId={searchSpaceId} onSuccess={handleSuccess} />
 						)}
diff --git a/surfsense_web/components/assistant-ui/edit-message-dialog.tsx b/surfsense_web/components/assistant-ui/edit-message-dialog.tsx
new file mode 100644
index 0000000000..807f16fe7a
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/edit-message-dialog.tsx
@@ -0,0 +1,106 @@
+"use client";
+
+/**
+ * Confirmation dialog shown when the user edits a message that has
+ * reversible downstream actions. Three buttons:
+ *
+ *   • "Revert all & resubmit"  — POST regenerate with revert_actions=true
+ *   • "Continue without revert" — POST regenerate with revert_actions=false
+ *   • "Cancel"                  — abort the edit entirely
+ *
+ * The dialog is auto-skipped when zero reversible downstream actions
+ * exist (the caller checks first via ``downstreamReversibleCount``).
+ */
+
+import { useEffect, useRef, useState } from "react";
+import {
+	AlertDialog,
+	AlertDialogCancel,
+	AlertDialogContent,
+	AlertDialogDescription,
+	AlertDialogFooter,
+	AlertDialogHeader,
+	AlertDialogTitle,
+} from "@/components/ui/alert-dialog";
+import { Button } from "@/components/ui/button";
+
+export type EditMessageDialogChoice = "revert" | "continue" | "cancel";
+
+export interface EditMessageDialogProps {
+	open: boolean;
+	onOpenChange: (open: boolean) => void;
+	downstreamReversibleCount: number;
+	downstreamTotalCount: number;
+	onChoose: (choice: EditMessageDialogChoice) => void | Promise<void>;
+}
+
+export function EditMessageDialog({
+	open,
+	onOpenChange,
+	downstreamReversibleCount,
+	downstreamTotalCount,
+	onChoose,
+}: EditMessageDialogProps) {
+	const [busy, setBusy] = useState<EditMessageDialogChoice | null>(null);
+
+	// The parent's ``handleEditDialogChoice`` calls
+	// ``setEditDialogState(null)`` BEFORE awaiting ``handleRegenerate``.
+	// That collapses the dialog (Radix unmounts it) while ``onChoose``
+	// is still awaiting the long-running stream. Without this guard,
+	// the ``finally { setBusy(null) }`` below ran after unmount and
+	// produced a "state update on unmounted component" dev warning.
+	const mountedRef = useRef(true);
+	useEffect(() => {
+		mountedRef.current = true;
+		return () => {
+			mountedRef.current = false;
+		};
+	}, []);
+
+	const handle = async (choice: EditMessageDialogChoice) => {
+		setBusy(choice);
+		try {
+			await onChoose(choice);
+		} finally {
+			if (mountedRef.current) {
+				setBusy(null);
+			}
+		}
+	};
+
+	return (
+		<AlertDialog open={open} onOpenChange={onOpenChange}>
+			<AlertDialogContent>
+				<AlertDialogHeader>
+					<AlertDialogTitle>Edit this message?</AlertDialogTitle>
+					<AlertDialogDescription>
+						This edit drops {downstreamTotalCount} downstream message
+						{downstreamTotalCount === 1 ? "" : "s"} from the thread. {downstreamReversibleCount}{" "}
+						action
+						{downstreamReversibleCount === 1 ? "" : "s"} (e.g. file writes, connector changes) can
+						be rolled back. Pick how to handle them before regenerating.
+					</AlertDialogDescription>
+				</AlertDialogHeader>
+
+				<div className="grid gap-2">
+					<Button variant="default" disabled={busy !== null} onClick={() => handle("revert")}>
+						{busy === "revert"
+							? "Reverting & resubmitting…"
+							: `Revert ${downstreamReversibleCount} action${
+									downstreamReversibleCount === 1 ? "" : "s"
+								} & resubmit`}
+					</Button>
+					<Button variant="outline" disabled={busy !== null} onClick={() => handle("continue")}>
+						{busy === "continue" ? "Resubmitting…" : "Continue without reverting"}
+					</Button>
+				</div>
+
+				<AlertDialogFooter className="sm:justify-start">
+					<AlertDialogCancel disabled={busy !== null} onClick={() => handle("cancel")}>
+						Cancel
+					</AlertDialogCancel>
+				</AlertDialogFooter>
+			</AlertDialogContent>
+		</AlertDialog>
+	);
+}
diff --git a/surfsense_web/components/assistant-ui/image.tsx b/surfsense_web/components/assistant-ui/image.tsx
index b29b6ddcee..a82b532cf1 100644
--- a/surfsense_web/components/assistant-ui/image.tsx
+++ b/surfsense_web/components/assistant-ui/image.tsx
@@ -4,8 +4,9 @@ import type { ImageMessagePartComponent } from "@assistant-ui/react";
 import { cva, type VariantProps } from "class-variance-authority";
 import { ImageIcon, ImageOffIcon } from "lucide-react";
 import NextImage from "next/image";
-import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react";
+import { memo, type PropsWithChildren, useEffect, useState } from "react";
 import { createPortal } from "react-dom";
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 const imageVariants = cva("aui-image-root relative overflow-hidden rounded-lg", {
@@ -44,8 +45,14 @@ function ImageRoot({ className, variant, size, children, ...props }: ImageRootPr
 	);
 }
 
-type ImagePreviewProps = Omit<React.ComponentProps<"img">, "children"> & {
+type ImagePreviewProps = Omit<
+	React.ComponentProps<"img">,
+	"children" | "height" | "onError" | "onLoad" | "src" | "width"
+> & {
 	containerClassName?: string;
+	onError?: React.ReactEventHandler<HTMLImageElement>;
+	onLoad?: React.ReactEventHandler<HTMLImageElement>;
+	src?: string;
 };
 
 function ImagePreview({
@@ -57,18 +64,17 @@ function ImagePreview({
 	src,
 	...props
 }: ImagePreviewProps) {
-	const imgRef = useRef<HTMLImageElement>(null);
 	const [loadedSrc, setLoadedSrc] = useState<string | undefined>(undefined);
 	const [errorSrc, setErrorSrc] = useState<string | undefined>(undefined);
+	const imageSrc = src ?? "";
 
-	const loaded = loadedSrc === src;
-	const error = errorSrc === src;
+	const loaded = imageSrc !== "" && loadedSrc === imageSrc;
+	const error = imageSrc === "" || errorSrc === imageSrc;
 
 	useEffect(() => {
-		if (typeof src === "string" && imgRef.current?.complete && imgRef.current.naturalWidth > 0) {
-			setLoadedSrc(src);
-		}
-	}, [src]);
+		setLoadedSrc((current) => (current === imageSrc ? current : undefined));
+		setErrorSrc((current) => (current === imageSrc ? current : undefined));
+	}, [imageSrc]);
 
 	return (
 		<div data-slot="image-preview" className={cn("relative min-h-32", containerClassName)}>
@@ -87,55 +93,22 @@ function ImagePreview({
 				>
 					<ImageOffIcon className="size-8 text-muted-foreground" />
 				</div>
-			) : isDataOrBlobUrl(src) ? (
-				// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
-				<img
-					ref={imgRef}
-					src={src}
-					alt={alt}
-					className={cn("block h-auto w-full object-contain", !loaded && "invisible", className)}
-					onLoad={(e) => {
-						if (typeof src === "string") setLoadedSrc(src);
-						onLoad?.(e);
-					}}
-					onError={(e) => {
-						if (typeof src === "string") setErrorSrc(src);
-						onError?.(e);
-					}}
-					{...props}
-				/>
 			) : (
-				// biome-ignore lint/performance/noImgElement: intentional for dynamic external URLs
-				// <img
-				// 	ref={imgRef}
-				// 	src={src}
-				// 	alt={alt}
-				// 	className={cn("block h-auto w-full object-contain", !loaded && "invisible", className)}
-				// 	onLoad={(e) => {
-				// 		if (typeof src === "string") setLoadedSrc(src);
-				// 		onLoad?.(e);
-				// 	}}
-				// 	onError={(e) => {
-				// 		if (typeof src === "string") setErrorSrc(src);
-				// 		onError?.(e);
-				// 	}}
-				// 	{...props}
-				// />
 				<NextImage
 					fill
-					src={src || ""}
+					src={imageSrc}
 					alt={alt}
 					sizes="(max-width: 768px) 100vw, (max-width: 1200px) 80vw, 60vw"
 					className={cn("block object-contain", !loaded && "invisible", className)}
-					onLoad={() => {
-						if (typeof src === "string") setLoadedSrc(src);
-						onLoad?.();
+					onLoad={(event) => {
+						setLoadedSrc(imageSrc);
+						onLoad?.(event);
 					}}
-					onError={() => {
-						if (typeof src === "string") setErrorSrc(src);
-						onError?.();
+					onError={(event) => {
+						setErrorSrc(imageSrc);
+						onError?.(event);
 					}}
-					unoptimized={false}
+					unoptimized={isDataOrBlobUrl(imageSrc)}
 					{...props}
 				/>
 			)}
@@ -196,59 +169,40 @@ function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) {
 
 	return (
 		<>
-			<button
+			<Button
 				type="button"
+				variant="ghost"
 				onClick={handleOpen}
-				className="aui-image-zoom-trigger cursor-zoom-in border-0 bg-transparent p-0 text-left"
+				className="aui-image-zoom-trigger h-auto cursor-zoom-in border-0 bg-transparent p-0 text-left hover:bg-transparent"
 				aria-label="Click to zoom image"
 			>
 				{children}
-			</button>
+			</Button>
 			{isMounted &&
 				isOpen &&
 				createPortal(
-					<button
+					<Button
 						type="button"
+						variant="ghost"
 						data-slot="image-zoom-overlay"
-						className="aui-image-zoom-overlay fade-in fixed inset-0 z-50 flex animate-in cursor-zoom-out items-center justify-center border-0 bg-black/80 p-0 duration-200"
+						className="aui-image-zoom-overlay fade-in fixed inset-0 z-50 h-auto w-auto animate-in cursor-zoom-out items-center justify-center rounded-none border-0 bg-black/80 p-0 duration-200 hover:bg-black/80 focus-visible:ring-0"
 						onClick={handleClose}
 						aria-label="Close zoomed image"
 					>
-						{/** biome-ignore lint/performance/noImgElement: <explanation> */}
-						{isDataOrBlobUrl(src) ? (
-							// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
-							<img
-								data-slot="image-zoom-content"
-								src={src}
-								alt={alt}
-								className="aui-image-zoom-content fade-in zoom-in-95 max-h-[90vh] max-w-[90vw] animate-in object-contain duration-200"
-								onClick={(e) => {
-									e.stopPropagation();
-									handleClose();
-								}}
-								onKeyDown={(e) => {
-									if (e.key === "Enter") {
-										e.stopPropagation();
-										handleClose();
-									}
-								}}
-							/>
-						) : (
-							<NextImage
-								data-slot="image-zoom-content"
-								fill
-								src={src}
-								alt={alt}
-								sizes="90vw"
-								className="aui-image-zoom-content fade-in zoom-in-95 object-contain duration-200"
-								onClick={(e) => {
-									e.stopPropagation();
-									handleClose();
-								}}
-								unoptimized={false}
-							/>
-						)}
-					</button>,
+						<NextImage
+							data-slot="image-zoom-content"
+							fill
+							src={src}
+							alt={alt}
+							sizes="90vw"
+							className="aui-image-zoom-content fade-in zoom-in-95 object-contain duration-200"
+							onClick={(e) => {
+								e.stopPropagation();
+								handleClose();
+							}}
+							unoptimized={isDataOrBlobUrl(src)}
+						/>
+					</Button>,
 					document.body
 				)}
 		</>
diff --git a/surfsense_web/components/assistant-ui/inline-citation.tsx b/surfsense_web/components/assistant-ui/inline-citation.tsx
index 0c99090e6c..a788c0ce6c 100644
--- a/surfsense_web/components/assistant-ui/inline-citation.tsx
+++ b/surfsense_web/components/assistant-ui/inline-citation.tsx
@@ -1,10 +1,39 @@
 "use client";
 
+import { useQuery } from "@tanstack/react-query";
+import { useSetAtom } from "jotai";
+import { ExternalLink, FileText } from "lucide-react";
+import dynamic from "next/dynamic";
 import type { FC } from "react";
 import { useState } from "react";
+import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
 import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context";
-import { SourceDetailPanel } from "@/components/new-chat/source-detail-panel";
+import { CitationPanelContent } from "@/components/citation-panel/citation-panel";
 import { Citation } from "@/components/tool-ui/citation";
+import { CitationHoverPopover } from "@/components/tool-ui/citation/citation-hover-popover";
+import { Button } from "@/components/ui/button";
+import {
+	Drawer,
+	DrawerContent,
+	DrawerHandle,
+	DrawerHeader,
+	DrawerTitle,
+} from "@/components/ui/drawer";
+import { Spinner } from "@/components/ui/spinner";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import { useMediaQuery } from "@/hooks/use-media-query";
+import { documentsApiService } from "@/lib/apis/documents-api.service";
+import { cacheKeys } from "@/lib/query-client/cache-keys";
+
+// Lazily load MarkdownViewer here to break the static import cycle:
+// `markdown-viewer.tsx` → `citation-renderer.tsx` → `inline-citation.tsx`
+// would otherwise pull `markdown-viewer.tsx` back in at module-init time.
+// Only `SurfsenseDocCitation` (popover body) ever renders this viewer, so
+// the lazy boundary is invisible to most call paths.
+const MarkdownViewer = dynamic(
+	() => import("@/components/markdown-viewer").then((m) => m.MarkdownViewer),
+	{ ssr: false, loading: () => <Spinner size="xs" /> }
+);
 
 interface InlineCitationProps {
 	chunkId: number;
@@ -12,44 +41,216 @@ interface InlineCitationProps {
 }
 
 /**
- * Inline citation for knowledge-base chunks (numeric chunk IDs).
- * Renders a clickable badge showing the actual chunk ID that opens the SourceDetailPanel.
+ * Inline citation badge for knowledge-base chunks (numeric chunk IDs) and
+ * Surfsense documentation chunks (`isDocsChunk`). Negative chunk IDs render as
+ * a static "doc" pill (anonymous/synthetic uploads).
+ *
+ * Numeric KB chunks: clicking opens the citation panel in the right
+ * sidebar (alongside the chat — does not replace it). The panel shows
+ * the cited chunk surrounded by adjacent chunks (via the API's
+ * `chunk_window`), with the cited one highlighted and an option to
+ * expand the window or jump into the full document via the editor panel.
+ *
+ * Surfsense docs chunks: rendered as a hover-controlled shadcn Popover that
+ * lazily fetches and previews the cited chunk inline, since those docs aren't
+ * indexed into the user's search space and have no tab to open.
  */
 export const InlineCitation: FC<InlineCitationProps> = ({ chunkId, isDocsChunk = false }) => {
-	const [isOpen, setIsOpen] = useState(false);
+	if (chunkId < 0) {
+		return (
+			<Tooltip>
+				<TooltipTrigger asChild>
+					<span
+						className="ml-0.5 inline-flex h-5 min-w-5 items-center justify-center gap-0.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline"
+						role="note"
+					>
+						<FileText className="size-3" />
+						doc
+					</span>
+				</TooltipTrigger>
+				<TooltipContent>Uploaded document</TooltipContent>
+			</Tooltip>
+		);
+	}
+
+	if (isDocsChunk) {
+		return <SurfsenseDocCitation chunkId={chunkId} />;
+	}
+
+	return <NumericChunkCitation chunkId={chunkId} />;
+};
+
+const NumericChunkCitation: FC<{ chunkId: number }> = ({ chunkId }) => {
+	const isTouchLike = useMediaQuery("(hover: none), (pointer: coarse)");
+	const openCitationPanel = useSetAtom(openCitationPanelAtom);
+	const [mobilePreviewOpen, setMobilePreviewOpen] = useState(false);
+
+	const handleClick = () => {
+		if (isTouchLike) {
+			setMobilePreviewOpen(true);
+			return;
+		}
+		openCitationPanel({ chunkId });
+	};
 
 	return (
-		<SourceDetailPanel
-			open={isOpen}
-			onOpenChange={setIsOpen}
-			chunkId={chunkId}
-			sourceType={isDocsChunk ? "SURFSENSE_DOCS" : ""}
-			title={isDocsChunk ? "Surfsense Documentation" : "Source"}
-			description=""
-			url=""
-			isDocsChunk={isDocsChunk}
-		>
-			<button
+		<>
+			<Button
 				type="button"
-				onClick={() => setIsOpen(true)}
-				className="ml-0.5 inline-flex h-5 min-w-5 cursor-pointer items-center justify-center rounded-md bg-muted/60 px-1.5 text-[11px] font-medium text-muted-foreground align-baseline shadow-sm transition-colors hover:bg-muted hover:text-foreground focus-visible:ring-ring focus-visible:ring-2 focus-visible:outline-none"
+				variant="ghost"
+				onClick={handleClick}
+				className="ml-0.5 inline-flex h-5 min-w-5 items-center justify-center gap-0.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline"
 				title={`View source chunk #${chunkId}`}
+				aria-label={`View cited chunk ${chunkId}`}
 			>
 				{chunkId}
-			</button>
-		</SourceDetailPanel>
+			</Button>
+			<Drawer
+				open={mobilePreviewOpen}
+				onOpenChange={setMobilePreviewOpen}
+				shouldScaleBackground={false}
+			>
+				<DrawerContent
+					className="h-[85vh] max-h-[85vh] z-80 overflow-hidden"
+					overlayClassName="z-80"
+				>
+					<DrawerHandle />
+					<DrawerHeader className="pb-0">
+						<DrawerTitle>Citation</DrawerTitle>
+					</DrawerHeader>
+					<div className="min-h-0 flex-1 flex flex-col overflow-hidden">
+						<CitationPanelContent chunkId={chunkId} showHeader={false} />
+					</div>
+				</DrawerContent>
+			</Drawer>
+		</>
 	);
 };
 
-function extractDomain(url: string): string {
-	try {
-		const hostname = new URL(url).hostname;
-		return hostname.replace(/^www\./, "");
-	} catch {
-		return url;
-	}
+const SurfsenseDocCitation: FC<{ chunkId: number }> = ({ chunkId }) => {
+	const isTouchLike = useMediaQuery("(hover: none), (pointer: coarse)");
+	const [mobilePreviewOpen, setMobilePreviewOpen] = useState(false);
+	const docQuery = useSurfsenseDocPreviewQuery(chunkId, mobilePreviewOpen);
+
+	const handleMobileClick = () => {
+		setMobilePreviewOpen(true);
+	};
+
+	return (
+		<>
+			<CitationHoverPopover
+				id={`doc-${chunkId}`}
+				contentClassName="w-96 max-w-[calc(100vw-2rem)] p-0"
+				align="start"
+				trigger={(hoverProps) => (
+					<Button
+						type="button"
+						variant="ghost"
+						size={null}
+						onClick={isTouchLike ? handleMobileClick : undefined}
+						className="ml-0.5 inline-flex h-5 min-w-5 items-center justify-center gap-0.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline"
+						aria-label={`Show Surfsense documentation chunk ${chunkId}`}
+						title="Surfsense documentation"
+						{...hoverProps}
+					>
+						<FileText className="size-3" />
+						doc
+					</Button>
+				)}
+			>
+				<SurfsenseDocPreview chunkId={chunkId} />
+			</CitationHoverPopover>
+			<Drawer
+				open={mobilePreviewOpen}
+				onOpenChange={setMobilePreviewOpen}
+				shouldScaleBackground={false}
+			>
+				<DrawerContent className="max-h-[85vh] z-80" overlayClassName="z-80">
+					<DrawerHandle />
+					<DrawerHeader className="pb-0">
+						<DrawerTitle>Surfsense documentation</DrawerTitle>
+					</DrawerHeader>
+					<SurfsenseDocPreviewContent
+						chunkId={chunkId}
+						query={docQuery}
+						contentClassName="max-h-[60vh]"
+					/>
+				</DrawerContent>
+			</Drawer>
+		</>
+	);
+};
+
+function useSurfsenseDocPreviewQuery(chunkId: number, enabled = true) {
+	return useQuery({
+		queryKey: cacheKeys.documents.byChunk(`doc-${chunkId}`),
+		queryFn: () => documentsApiService.getSurfsenseDocByChunk(chunkId),
+		staleTime: 5 * 60 * 1000,
+		enabled,
+	});
 }
 
+type SurfsenseDocPreviewQuery = ReturnType<typeof useSurfsenseDocPreviewQuery>;
+
+const SurfsenseDocPreview: FC<{ chunkId: number }> = ({ chunkId }) => {
+	const query = useSurfsenseDocPreviewQuery(chunkId);
+
+	return <SurfsenseDocPreviewContent chunkId={chunkId} query={query} />;
+};
+
+const SurfsenseDocPreviewContent: FC<{
+	chunkId: number;
+	query: SurfsenseDocPreviewQuery;
+	contentClassName?: string;
+}> = ({ chunkId, query, contentClassName = "max-h-72" }) => {
+	const { data, isLoading, error } = query;
+
+	const citedChunk = data?.chunks.find((c) => c.id === chunkId) ?? data?.chunks[0];
+
+	return (
+		<>
+			<div className="flex items-center justify-between gap-2 border-b px-3 py-2">
+				<div className="min-w-0">
+					<p className="truncate text-sm font-medium">{data?.title ?? "Surfsense documentation"}</p>
+					<p className="text-[11px] text-muted-foreground">Chunk #{chunkId}</p>
+				</div>
+				{data?.public_url && (
+					<a
+						href={data.public_url}
+						target="_blank"
+						rel="noopener noreferrer"
+						className="inline-flex shrink-0 items-center gap-1 rounded-md px-2 py-1 text-[11px] font-medium text-primary hover:bg-primary/10"
+					>
+						<ExternalLink className="size-3" />
+						Open
+					</a>
+				)}
+			</div>
+			<div className={`${contentClassName} overflow-auto px-3 py-2 text-sm`}>
+				{isLoading && (
+					<div className="flex items-center gap-2 py-4 text-muted-foreground">
+						<Spinner size="xs" />
+						<span className="text-xs">Loading…</span>
+					</div>
+				)}
+				{error && (
+					<p className="py-4 text-xs text-destructive">
+						{error instanceof Error ? error.message : "Failed to load chunk"}
+					</p>
+				)}
+				{!isLoading && !error && citedChunk?.content && (
+					<MarkdownViewer content={citedChunk.content} maxLength={1500} enableCitations />
+				)}
+				{!isLoading && !error && !citedChunk?.content && (
+					<p className="py-4 text-xs text-muted-foreground">No content available.</p>
+				)}
+			</div>
+		</>
+	);
+};
+
+import { tryGetHostname } from "@/lib/url";
+
 interface UrlCitationProps {
 	url: string;
 }
@@ -60,7 +261,7 @@ interface UrlCitationProps {
  * page title and snippet (extracted deterministically from web_search tool results).
  */
 export const UrlCitation: FC<UrlCitationProps> = ({ url }) => {
-	const domain = extractDomain(url);
+	const domain = tryGetHostname(url) ?? url;
 	const meta = useCitationMetadata(url);
 
 	return (
diff --git a/surfsense_web/components/assistant-ui/inline-mention-editor.tsx b/surfsense_web/components/assistant-ui/inline-mention-editor.tsx
index 2d55f4d20b..2c8ad62636 100644
--- a/surfsense_web/components/assistant-ui/inline-mention-editor.tsx
+++ b/surfsense_web/components/assistant-ui/inline-mention-editor.tsx
@@ -1,33 +1,67 @@
 "use client";
 
-import { X } from "lucide-react";
+import { Folder as FolderIcon, X as XIcon } from "lucide-react";
+import type { NodeEntry, TElement } from "platejs";
+import type { PlateElementProps } from "platejs/react";
 import {
-	createElement,
+	createPlatePlugin,
+	ParagraphPlugin,
+	Plate,
+	PlateContent,
+	usePlateEditor,
+} from "platejs/react";
+import {
+	createContext,
+	type FC,
 	forwardRef,
 	useCallback,
-	useEffect,
+	useContext,
 	useImperativeHandle,
+	useMemo,
 	useRef,
-	useState,
 } from "react";
-import ReactDOMServer from "react-dom/server";
+import { FOLDER_MENTION_DOCUMENT_TYPE } from "@/atoms/chat/mentioned-documents.atom";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
 import type { Document } from "@/contracts/types/document.types";
+import { getMentionDocKey } from "@/lib/chat/mention-doc-key";
 import { cn } from "@/lib/utils";
 
+export type MentionKind = "doc" | "folder";
+
 export interface MentionedDocument {
 	id: number;
 	title: string;
 	document_type?: string;
+	kind: MentionKind;
 }
 
+/**
+ * Input shape for inserting a chip. ``kind`` defaults to ``"doc"``.
+ * Folder chips default ``document_type`` to ``FOLDER_MENTION_DOCUMENT_TYPE``
+ * so the dedup key never collides with a doc chip sharing the same id.
+ */
+export type MentionChipInput = {
+	id: number;
+	title: string;
+	document_type?: string;
+	kind?: MentionKind;
+};
+
 export interface InlineMentionEditorRef {
 	focus: () => void;
 	clear: () => void;
 	setText: (text: string) => void;
 	getText: () => string;
 	getMentionedDocuments: () => MentionedDocument[];
-	insertDocumentChip: (doc: Pick<Document, "id" | "title" | "document_type">) => void;
+	insertMentionChip: (mention: MentionChipInput, options?: { removeTriggerText?: boolean }) => void;
+	/**
+	 * @deprecated Use ``insertMentionChip``. Kept for one transition
+	 * cycle so we don't break ad-hoc callers; prefer the new name.
+	 */
+	insertDocumentChip: (
+		doc: Pick<Document, "id" | "title" | "document_type">,
+		options?: { removeTriggerText?: boolean }
+	) => void;
 	removeDocumentChip: (docId: number, docType?: string) => void;
 	setDocumentChipStatus: (
 		docId: number,
@@ -49,42 +83,220 @@ interface InlineMentionEditorProps {
 	onKeyDown?: (e: React.KeyboardEvent) => void;
 	disabled?: boolean;
 	className?: string;
-	initialDocuments?: MentionedDocument[];
 	initialText?: string;
 }
 
-// Unique data attribute to identify chip elements
-const CHIP_DATA_ATTR = "data-mention-chip";
-const CHIP_ID_ATTR = "data-mention-id";
-const CHIP_DOCTYPE_ATTR = "data-mention-doctype";
-const CHIP_STATUS_ATTR = "data-mention-status";
+type MentionStatusKind = "pending" | "processing" | "ready" | "failed";
+type ComposerTextNode = { text: string };
+type MentionElementNode = {
+	type: "mention";
+	id: number;
+	title: string;
+	document_type?: string;
+	/** Discriminator; defaults to ``"doc"`` for legacy nodes. */
+	kind?: MentionKind;
+	statusLabel?: string | null;
+	statusKind?: MentionStatusKind;
+	children: [{ text: "" }];
+};
+type ComposerNode = ComposerTextNode | MentionElementNode;
+type ComposerParagraph = { type: "p"; children: ComposerNode[] };
+type ComposerValue = ComposerParagraph[];
+
+const MENTION_TYPE = "mention";
+const MENTION_CHIP_CLASSNAME =
+	"group inline-flex h-5 items-center gap-1 mx-0.5 rounded bg-primary/10 px-1 text-xs font-bold text-primary/60 select-none align-middle leading-none";
+const MENTION_CHIP_ICON_CLASSNAME = "flex items-center text-muted-foreground leading-none";
+const MENTION_CHIP_TITLE_CLASSNAME = "max-w-[120px] truncate leading-none";
+const COMPOSER_TEXT_METRICS_CLASSNAME = "text-sm leading-6";
+
+const EMPTY_VALUE: ComposerValue = [{ type: "p", children: [{ text: "" }] }];
 
 /**
- * Type guard to check if a node is a chip element
+ * Lets ``MentionElement`` reach the editor's chip-removal helper so
+ * the X button and Backspace go through the same call site.
  */
-function isChipElement(node: Node | null): node is HTMLSpanElement {
+type MentionEditorContextValue = {
+	removeChip: (docId: number, docType: string | undefined) => void;
+};
+const MentionEditorContext = createContext<MentionEditorContextValue | null>(null);
+
+const MentionElement: FC<PlateElementProps<MentionElementNode>> = ({
+	attributes,
+	children,
+	element,
+}) => {
+	const statusClass =
+		element.statusKind === "failed"
+			? "text-destructive"
+			: element.statusKind === "ready"
+				? "text-emerald-700"
+				: "text-amber-700";
+
+	const isFolder = element.kind === "folder";
+	const ctx = useContext(MentionEditorContext);
+
 	return (
-		node !== null &&
-		node.nodeType === Node.ELEMENT_NODE &&
-		(node as Element).hasAttribute(CHIP_DATA_ATTR)
+		<span {...attributes} className="inline-flex align-middle">
+			<span contentEditable={false} className={`${MENTION_CHIP_CLASSNAME} cursor-default`}>
+				<span className={MENTION_CHIP_ICON_CLASSNAME}>
+					<span className="relative flex h-3 w-3 items-center justify-center">
+						<span className="flex items-center justify-center transition-opacity group-hover:opacity-0">
+							{isFolder ? (
+								<FolderIcon className="h-3 w-3" />
+							) : (
+								getConnectorIcon(element.document_type ?? "UNKNOWN", "h-3 w-3")
+							)}
+						</span>
+						{ctx ? (
+							<button
+								type="button"
+								aria-label={`Remove mention ${element.title}`}
+								title={`Remove ${element.title}`}
+								onMouseDown={(e) => e.preventDefault()}
+								onClick={(e) => {
+									e.stopPropagation();
+									ctx.removeChip(element.id, element.document_type);
+								}}
+								className="absolute inset-0 flex items-center justify-center rounded-sm opacity-0 transition-opacity hover:text-primary focus-visible:opacity-100 focus-visible:outline-none group-hover:opacity-100"
+							>
+								<XIcon className="h-3 w-3" />
+							</button>
+						) : null}
+					</span>
+				</span>
+				<span className={MENTION_CHIP_TITLE_CLASSNAME} title={element.title}>
+					{element.title}
+				</span>
+				{element.statusLabel ? (
+					<span className={cn("text-[10px] font-semibold opacity-80", statusClass)}>
+						{element.statusLabel}
+					</span>
+				) : null}
+			</span>
+			{children}
+		</span>
 	);
+};
+
+const MentionPlugin = createPlatePlugin({
+	key: MENTION_TYPE,
+	node: {
+		isElement: true,
+		isInline: true,
+		isVoid: true,
+		type: MENTION_TYPE,
+		component: MentionElement,
+	},
+});
+
+function isMentionNode(node: ComposerNode): node is MentionElementNode {
+	return typeof node === "object" && "type" in node && node.type === MENTION_TYPE;
 }
 
-/**
- * Safely parse chip ID from element attribute
- */
-function getChipId(element: Element): number | null {
-	const idStr = element.getAttribute(CHIP_ID_ATTR);
-	if (!idStr) return null;
-	const id = parseInt(idStr, 10);
-	return Number.isNaN(id) ? null : id;
+function getTextNode(node: ComposerNode): ComposerTextNode | null {
+	if (typeof node === "object" && "text" in node && typeof node.text === "string") return node;
+	return null;
 }
 
-/**
- * Get chip document type from element attribute
- */
-function getChipDocType(element: Element): string {
-	return element.getAttribute(CHIP_DOCTYPE_ATTR) ?? "UNKNOWN";
+function toValueFromText(text: string): ComposerValue {
+	const lines = text.split("\n");
+	if (lines.length === 0) return EMPTY_VALUE;
+	return lines.map((line) => ({ type: "p", children: [{ text: line }] })) as ComposerValue;
+}
+
+function getPlainText(value: ComposerValue): string {
+	const lines = value.map((block) =>
+		block.children
+			.map((node) => {
+				if (isMentionNode(node)) return `@${node.title}`;
+				return getTextNode(node)?.text ?? "";
+			})
+			.join("")
+	);
+	return lines.join("\n").trim();
+}
+
+function getMentionedDocuments(value: ComposerValue): MentionedDocument[] {
+	const map = new Map<string, MentionedDocument>();
+	for (const block of value) {
+		for (const node of block.children) {
+			if (!isMentionNode(node)) continue;
+			const kind: MentionKind = node.kind ?? "doc";
+			const doc: MentionedDocument = {
+				id: node.id,
+				title: node.title,
+				document_type: node.document_type,
+				kind,
+			};
+			map.set(getMentionDocKey(doc), doc);
+		}
+	}
+	return Array.from(map.values());
+}
+
+type EditorSelection = {
+	anchor: { path: number[]; offset: number };
+	focus: { path: number[]; offset: number };
+} | null;
+
+function getCursorTextContext(value: ComposerValue, selection: EditorSelection) {
+	if (!selection || !selection.anchor || !selection.focus) return null;
+	if (
+		selection.anchor.path.length < 2 ||
+		selection.focus.path.length < 2 ||
+		selection.anchor.path[0] !== selection.focus.path[0] ||
+		selection.anchor.path[1] !== selection.focus.path[1]
+	) {
+		return null;
+	}
+
+	const block = value[selection.anchor.path[0]];
+	if (!block) return null;
+	const child = block.children[selection.anchor.path[1]];
+	const textNode = getTextNode(child);
+	if (!textNode) return null;
+
+	return {
+		blockIndex: selection.anchor.path[0],
+		childIndex: selection.anchor.path[1],
+		text: textNode.text,
+		cursor: selection.anchor.offset,
+	};
+}
+
+function scanActiveTrigger(text: string, cursor: number) {
+	let wordStart = 0;
+	for (let i = cursor - 1; i >= 0; i--) {
+		if (text[i] === " " || text[i] === "\n") {
+			wordStart = i + 1;
+			break;
+		}
+	}
+
+	let triggerChar: "@" | "/" | null = null;
+	let triggerIndex = -1;
+	for (let i = wordStart; i < cursor; i++) {
+		if (text[i] === "@" || text[i] === "/") {
+			triggerChar = text[i] as "@" | "/";
+			triggerIndex = i;
+			break;
+		}
+	}
+	if (!triggerChar || triggerIndex === -1) return null;
+
+	const query = text.slice(triggerIndex + 1, cursor);
+	if (query.startsWith(" ")) return null;
+	if (
+		triggerChar === "/" &&
+		triggerIndex > 0 &&
+		text[triggerIndex - 1] !== " " &&
+		text[triggerIndex - 1] !== "\n"
+	) {
+		return null;
+	}
+
+	return { triggerChar, query };
 }
 
 export const InlineMentionEditor = forwardRef<InlineMentionEditorRef, InlineMentionEditorProps>(
@@ -101,653 +313,349 @@ export const InlineMentionEditor = forwardRef<InlineMentionEditorRef, InlineMent
 			onKeyDown,
 			disabled = false,
 			className,
-			initialDocuments = [],
 			initialText,
 		},
 		ref
 	) => {
-		const editorRef = useRef<HTMLDivElement>(null);
-		const [isEmpty, setIsEmpty] = useState(true);
-		const [mentionedDocs, setMentionedDocs] = useState<Map<string, MentionedDocument>>(
-			() => new Map(initialDocuments.map((d) => [`${d.document_type ?? "UNKNOWN"}:${d.id}`, d]))
-		);
-		const isComposingRef = useRef(false);
-
-		// Sync initial documents
-		useEffect(() => {
-			if (initialDocuments.length > 0) {
-				setMentionedDocs(
-					new Map(initialDocuments.map((d) => [`${d.document_type ?? "UNKNOWN"}:${d.id}`, d]))
-				);
-			}
-		}, [initialDocuments]);
-
-		useEffect(() => {
-			if (!initialText || !editorRef.current) return;
-			editorRef.current.innerText = initialText;
-			editorRef.current.appendChild(document.createElement("br"));
-			editorRef.current.appendChild(document.createElement("br"));
-			setIsEmpty(false);
-			onChange?.(initialText, Array.from(mentionedDocs.values()));
-			editorRef.current.focus();
-			const sel = window.getSelection();
-			const range = document.createRange();
-			range.selectNodeContents(editorRef.current);
-			range.collapse(false);
-			sel?.removeAllRanges();
-			sel?.addRange(range);
-			const anchor = document.createElement("span");
-			range.insertNode(anchor);
-			anchor.scrollIntoView({ block: "end" });
-			anchor.remove();
-		}, [initialText]); // eslint-disable-line react-hooks/exhaustive-deps
-
-		// Focus at the end of the editor
+		const editableRef = useRef<HTMLDivElement | null>(null);
+		const editor = usePlateEditor({
+			readOnly: disabled,
+			plugins: [ParagraphPlugin, MentionPlugin],
+			value: initialText ? toValueFromText(initialText) : EMPTY_VALUE,
+		});
+
+		// Move the caret to end-of-doc and focus the editor. Falls back
+		// to DOM focus if Plate's API throws (transient unmount race).
 		const focusAtEnd = useCallback(() => {
-			if (!editorRef.current) return;
-			editorRef.current.focus();
-			const selection = window.getSelection();
-			const range = document.createRange();
-			range.selectNodeContents(editorRef.current);
-			range.collapse(false);
-			selection?.removeAllRanges();
-			selection?.addRange(range);
-		}, []);
-
-		// Get plain text content with inline mention tokens for chips.
-		// This preserves the original query structure sent to the backend/LLM.
-		const getText = useCallback((): string => {
-			if (!editorRef.current) return "";
-
-			const extractText = (node: Node): string => {
-				if (node.nodeType === Node.TEXT_NODE) {
-					return node.textContent ?? "";
-				}
-
-				if (node.nodeType === Node.ELEMENT_NODE) {
-					const element = node as Element;
-
-					// Preserve mention chips as inline @title tokens.
-					if (element.hasAttribute(CHIP_DATA_ATTR)) {
-						const title = element.querySelector("[data-mention-title='true']")?.textContent?.trim();
-						if (title) {
-							return `@${title}`;
-						}
-						return "";
-					}
-
-					let result = "";
-					for (const child of Array.from(element.childNodes)) {
-						result += extractText(child);
-					}
-					return result;
-				}
-
-				return "";
-			};
-
-			return extractText(editorRef.current).trim();
-		}, []);
-
-		// Get all mentioned documents
-		const getMentionedDocuments = useCallback((): MentionedDocument[] => {
-			return Array.from(mentionedDocs.values());
-		}, [mentionedDocs]);
-
-		// Create a chip element for a document
-		const createChipElement = useCallback(
-			(doc: MentionedDocument): HTMLSpanElement => {
-				const chip = document.createElement("span");
-				chip.setAttribute(CHIP_DATA_ATTR, "true");
-				chip.setAttribute(CHIP_ID_ATTR, String(doc.id));
-				chip.setAttribute(CHIP_DOCTYPE_ATTR, doc.document_type ?? "UNKNOWN");
-				chip.contentEditable = "false";
-				chip.className =
-					"inline-flex items-center gap-1 mx-0.5 px-1 py-0.5 rounded bg-primary/10 text-xs font-bold text-primary/60 select-none cursor-default";
-				chip.style.userSelect = "none";
-				chip.style.verticalAlign = "baseline";
-
-				// Container that swaps between icon and remove button on hover
-				const iconContainer = document.createElement("span");
-				iconContainer.className = "shrink-0 flex items-center size-3 relative";
-
-				const iconSpan = document.createElement("span");
-				iconSpan.className = "flex items-center text-muted-foreground";
-				iconSpan.innerHTML = ReactDOMServer.renderToString(
-					getConnectorIcon(doc.document_type ?? "UNKNOWN", "h-3 w-3")
-				);
-
-				const removeBtn = document.createElement("button");
-				removeBtn.type = "button";
-				removeBtn.className =
-					"size-3 items-center justify-center rounded-full text-muted-foreground transition-colors";
-				removeBtn.style.display = "none";
-				removeBtn.innerHTML = ReactDOMServer.renderToString(
-					createElement(X, { className: "h-3 w-3", strokeWidth: 2.5 })
-				);
-				removeBtn.onclick = (e) => {
-					e.preventDefault();
-					e.stopPropagation();
-					chip.remove();
-					const docKey = `${doc.document_type ?? "UNKNOWN"}:${doc.id}`;
-					setMentionedDocs((prev) => {
-						const next = new Map(prev);
-						next.delete(docKey);
-						return next;
-					});
-					onDocumentRemove?.(doc.id, doc.document_type);
-					focusAtEnd();
-				};
-
-				const titleSpan = document.createElement("span");
-				titleSpan.className = "max-w-[120px] truncate";
-				titleSpan.textContent = doc.title;
-				titleSpan.title = doc.title;
-				titleSpan.setAttribute("data-mention-title", "true");
-
-				const statusSpan = document.createElement("span");
-				statusSpan.setAttribute(CHIP_STATUS_ATTR, "true");
-				statusSpan.className = "text-[10px] font-semibold opacity-80 hidden";
-
-				const isTouchDevice = window.matchMedia("(hover: none)").matches;
-				if (isTouchDevice) {
-					// Mobile: icon on left, title, X on right
-					chip.appendChild(iconSpan);
-					chip.appendChild(titleSpan);
-					chip.appendChild(statusSpan);
-					removeBtn.style.display = "flex";
-					removeBtn.className += " ml-0.5";
-					chip.appendChild(removeBtn);
-				} else {
-					// Desktop: icon/X swap on hover in the same slot
-					iconContainer.appendChild(iconSpan);
-					iconContainer.appendChild(removeBtn);
-					chip.addEventListener("mouseenter", () => {
-						iconSpan.style.display = "none";
-						removeBtn.style.display = "flex";
-					});
-					chip.addEventListener("mouseleave", () => {
-						iconSpan.style.display = "";
-						removeBtn.style.display = "none";
-					});
-					chip.appendChild(iconContainer);
-					chip.appendChild(titleSpan);
-					chip.appendChild(statusSpan);
-				}
+			try {
+				editor.tf.select(editor.api.end([]));
+				editor.tf.focus();
+			} catch {
+				editableRef.current?.focus();
+			}
+		}, [editor]);
 
-				return chip;
-			},
-			[focusAtEnd, onDocumentRemove]
+		const getCurrentValue = useCallback(
+			() => (editor.children as ComposerValue) ?? EMPTY_VALUE,
+			[editor]
 		);
 
-		// Insert a document chip at the current cursor position
-		const insertDocumentChip = useCallback(
-			(doc: Pick<Document, "id" | "title" | "document_type">) => {
-				if (!editorRef.current) return;
+		const emitState = useCallback(
+			(nextValue: ComposerValue) => {
+				const text = getPlainText(nextValue);
+				const docs = getMentionedDocuments(nextValue);
+				onChange?.(text, docs);
 
-				// Validate required fields for type safety
-				if (typeof doc.id !== "number" || typeof doc.title !== "string") {
-					console.warn("[InlineMentionEditor] Invalid document passed to insertDocumentChip:", doc);
+				const cursorCtx = getCursorTextContext(nextValue, editor.selection);
+				if (!cursorCtx) {
+					onMentionClose?.();
+					onActionClose?.();
 					return;
 				}
 
-				const mentionDoc: MentionedDocument = {
-					id: doc.id,
-					title: doc.title,
-					document_type: doc.document_type,
-				};
+				const trigger = scanActiveTrigger(cursorCtx.text, cursorCtx.cursor);
+				if (!trigger) {
+					onMentionClose?.();
+					onActionClose?.();
+					return;
+				}
 
-				// Add to mentioned docs map using unique key
-				const docKey = `${doc.document_type ?? "UNKNOWN"}:${doc.id}`;
-				setMentionedDocs((prev) => new Map(prev).set(docKey, mentionDoc));
-
-				// Find and remove the @query text
-				const selection = window.getSelection();
-				if (!selection || selection.rangeCount === 0) {
-					// No selection, just append
-					const chip = createChipElement(mentionDoc);
-					editorRef.current.appendChild(chip);
-					editorRef.current.appendChild(document.createTextNode(" "));
-					focusAtEnd();
+				if (trigger.triggerChar === "@") {
+					onMentionTrigger?.(trigger.query);
+					onActionClose?.();
 					return;
 				}
 
-				// Find the @ symbol before the cursor and remove it along with any query text
-				const range = selection.getRangeAt(0);
-				const textNode = range.startContainer;
+				onActionTrigger?.(trigger.query);
+				onMentionClose?.();
+			},
+			[editor.selection, onActionClose, onActionTrigger, onChange, onMentionClose, onMentionTrigger]
+		);
 
-				if (textNode.nodeType === Node.TEXT_NODE) {
-					const text = textNode.textContent || "";
-					const cursorPos = range.startOffset;
+		const setValue = useCallback(
+			(nextValue: ComposerValue) => {
+				const tf = editor.tf as { setValue: (value: ComposerValue) => void };
+				tf.setValue(nextValue);
+				emitState(nextValue);
+			},
+			[editor, emitState]
+		);
 
-					// Find the @ symbol before cursor
-					let atIndex = -1;
-					for (let i = cursorPos - 1; i >= 0; i--) {
-						if (text[i] === "@") {
-							atIndex = i;
-							break;
-						}
-					}
+		// Insert chip + trailing space as a single ``insertNodes`` call.
+		// The chip is a void inline; ``select: true`` on it alone would
+		// land the caret inside its empty children (an unrenderable
+		// point). With the space as the last inserted node, the caret
+		// resolves to that text node and stays visible. The
+		// ``withoutNormalizing`` wrapper batches the optional trigger
+		// delete + insert into a single undo step.
+		const insertMentionChip = useCallback(
+			(mention: MentionChipInput, options?: { removeTriggerText?: boolean }) => {
+				if (typeof mention.id !== "number" || typeof mention.title !== "string") return;
+
+				const removeTriggerText = options?.removeTriggerText ?? true;
+				const kind: MentionKind = mention.kind ?? "doc";
+				const document_type =
+					mention.document_type ?? (kind === "folder" ? FOLDER_MENTION_DOCUMENT_TYPE : undefined);
+				const mentionNode: MentionElementNode = {
+					type: MENTION_TYPE,
+					id: mention.id,
+					title: mention.title,
+					document_type,
+					kind,
+					children: [{ text: "" }],
+				};
 
-					if (atIndex !== -1) {
-						// Remove @query and insert chip
-						const beforeAt = text.slice(0, atIndex);
-						const afterCursor = text.slice(cursorPos);
-
-						// Create chip
-						const chip = createChipElement(mentionDoc);
-
-						// Replace text node content
-						const parent = textNode.parentNode;
-						if (parent) {
-							const beforeNode = document.createTextNode(beforeAt);
-							const afterNode = document.createTextNode(` ${afterCursor}`);
-
-							parent.insertBefore(beforeNode, textNode);
-							parent.insertBefore(chip, textNode);
-							parent.insertBefore(afterNode, textNode);
-							parent.removeChild(textNode);
-
-							// Set cursor after the chip
-							const newRange = document.createRange();
-							newRange.setStart(afterNode, 1);
-							newRange.collapse(true);
-							selection.removeAllRanges();
-							selection.addRange(newRange);
+				editor.tf.withoutNormalizing(() => {
+					const selection = editor.selection;
+
+					// No active selection (focus moved to a picker) — snap
+					// to end-of-doc so the chip appends cleanly.
+					if (!selection) {
+						editor.tf.select(editor.api.end([]));
+					} else if (removeTriggerText) {
+						// Delete the in-progress "@query" so the chip stands in for it.
+						const cursorCtx = getCursorTextContext(getCurrentValue(), selection);
+						if (cursorCtx) {
+							const text = cursorCtx.text;
+							let triggerIndex = -1;
+							for (let i = cursorCtx.cursor - 1; i >= 0; i--) {
+								if (text[i] === "@") {
+									triggerIndex = i;
+									break;
+								}
+								if (text[i] === " " || text[i] === "\n") break;
+							}
+							if (triggerIndex >= 0 && triggerIndex < cursorCtx.cursor) {
+								const path = [cursorCtx.blockIndex, cursorCtx.childIndex];
+								editor.tf.delete({
+									at: {
+										anchor: { path, offset: triggerIndex },
+										focus: { path, offset: cursorCtx.cursor },
+									},
+								});
+							}
 						}
-					} else {
-						// No @ found, just insert at cursor
-						const chip = createChipElement(mentionDoc);
-						range.insertNode(chip);
-						range.setStartAfter(chip);
-						range.collapse(true);
-
-						// Add space after chip
-						const space = document.createTextNode(" ");
-						range.insertNode(space);
-						range.setStartAfter(space);
-						range.collapse(true);
 					}
-				} else {
-					// Not in a text node, append to editor
-					const chip = createChipElement(mentionDoc);
-					editorRef.current.appendChild(chip);
-					editorRef.current.appendChild(document.createTextNode(" "));
-					focusAtEnd();
-				}
 
-				// Update empty state
-				setIsEmpty(false);
+					editor.tf.insertNodes([mentionNode, { text: " " }] as unknown as TElement[], {
+						select: true,
+					});
+				});
+				editor.tf.focus();
+			},
+			[editor, getCurrentValue]
+		);
 
-				// Trigger onChange
-				if (onChange) {
-					setTimeout(() => {
-						onChange(getText(), getMentionedDocuments());
-					}, 0);
-				}
+		// Doc-only shim that routes through ``insertMentionChip``.
+		const insertDocumentChip = useCallback(
+			(
+				doc: Pick<Document, "id" | "title" | "document_type">,
+				options?: { removeTriggerText?: boolean }
+			) => {
+				insertMentionChip({ ...doc, kind: "doc" }, options);
 			},
-			[createChipElement, focusAtEnd, getText, getMentionedDocuments, onChange]
+			[insertMentionChip]
 		);
 
-		// Clear the editor
-		const clear = useCallback(() => {
-			if (editorRef.current) {
-				editorRef.current.innerHTML = "";
-				setIsEmpty(true);
-				setMentionedDocs(new Map());
-			}
-		}, []);
+		// Remove chip(s) matching (id, document_type). Iterates in
+		// descending path order so removing one entry can't invalidate
+		// later paths. Chips are deduped today, so this typically runs
+		// at most once.
+		const removeDocumentChip = useCallback(
+			(docId: number, docType?: string) => {
+				const match = (n: unknown) => {
+					if (!n || typeof n !== "object" || !("type" in n)) return false;
+					const node = n as MentionElementNode;
+					if (node.type !== MENTION_TYPE) return false;
+					if (node.id !== docId) return false;
+					return (node.document_type ?? "UNKNOWN") === (docType ?? "UNKNOWN");
+				};
 
-		// Replace editor content with plain text and place cursor at end
-		const setText = useCallback(
-			(text: string) => {
-				if (!editorRef.current) return;
-				editorRef.current.innerText = text;
-				const empty = text.length === 0;
-				setIsEmpty(empty);
-				onChange?.(text, Array.from(mentionedDocs.values()));
-				focusAtEnd();
+				const entries = Array.from(editor.api.nodes({ at: [], match })) as NodeEntry[];
+				if (entries.length === 0) return;
+				editor.tf.withoutNormalizing(() => {
+					for (const [, path] of entries.reverse()) {
+						editor.tf.removeNodes({ at: path });
+					}
+				});
+			},
+			[editor]
+		);
+
+		// Single removal call site for Backspace and the X button so the
+		// two can never diverge (e.g. one forgetting to notify the parent).
+		const removeChip = useCallback(
+			(docId: number, docType: string | undefined) => {
+				removeDocumentChip(docId, docType);
+				onDocumentRemove?.(docId, docType);
 			},
-			[focusAtEnd, onChange, mentionedDocs]
+			[onDocumentRemove, removeDocumentChip]
 		);
 
+		// Update chip status in place via ``tf.setNodes`` so the user's
+		// selection survives backend status events arriving mid-typing.
 		const setDocumentChipStatus = useCallback(
 			(
 				docId: number,
 				docType: string | undefined,
 				statusLabel: string | null,
-				statusKind: "pending" | "processing" | "ready" | "failed" = "pending"
+				statusKind: MentionStatusKind = "pending"
 			) => {
-				if (!editorRef.current) return;
+				const match = (n: unknown) => {
+					if (!n || typeof n !== "object" || !("type" in n)) return false;
+					const node = n as MentionElementNode;
+					if (node.type !== MENTION_TYPE) return false;
+					if (node.id !== docId) return false;
+					return (node.document_type ?? "UNKNOWN") === (docType ?? "UNKNOWN");
+				};
 
-				const chips = editorRef.current.querySelectorAll<HTMLSpanElement>(
-					`span[${CHIP_DATA_ATTR}="true"]`
+				editor.tf.setNodes(
+					{
+						statusLabel,
+						statusKind: statusLabel ? statusKind : undefined,
+					} as Partial<TElement>,
+					{ at: [], match }
 				);
-				for (const chip of chips) {
-					const chipId = getChipId(chip);
-					const chipType = getChipDocType(chip);
-					if (chipId !== docId) continue;
-					if ((docType ?? "UNKNOWN") !== chipType) continue;
-
-					const statusEl = chip.querySelector<HTMLSpanElement>(`span[${CHIP_STATUS_ATTR}="true"]`);
-					if (!statusEl) continue;
-
-					if (!statusLabel) {
-						statusEl.textContent = "";
-						statusEl.className = "text-[10px] font-semibold opacity-80 hidden";
-						continue;
-					}
-
-					const statusClass =
-						statusKind === "failed"
-							? "text-destructive"
-							: statusKind === "processing"
-								? "text-amber-700"
-								: statusKind === "ready"
-									? "text-emerald-700"
-									: "text-amber-700";
-					statusEl.textContent = statusLabel;
-					statusEl.className = `text-[10px] font-semibold opacity-80 ${statusClass}`;
-				}
 			},
-			[]
+			[editor]
 		);
 
-		const removeDocumentChip = useCallback(
-			(docId: number, docType?: string) => {
-				if (!editorRef.current) return;
-				const chipKey = `${docType ?? "UNKNOWN"}:${docId}`;
-				const chips = editorRef.current.querySelectorAll<HTMLSpanElement>(
-					`span[${CHIP_DATA_ATTR}="true"]`
-				);
-				for (const chip of chips) {
-					if (getChipId(chip) === docId && getChipDocType(chip) === (docType ?? "UNKNOWN")) {
-						chip.remove();
-						break;
-					}
-				}
-				setMentionedDocs((prev) => {
-					const next = new Map(prev);
-					next.delete(chipKey);
-					return next;
-				});
+		const clear = useCallback(() => {
+			setValue(EMPTY_VALUE);
+			// ``tf.setValue`` wipes the selection — refocus so the caret
+			// returns after Enter-to-submit.
+			requestAnimationFrame(focusAtEnd);
+		}, [focusAtEnd, setValue]);
 
-				const text = getText();
-				const empty = text.length === 0 && mentionedDocs.size <= 1;
-				setIsEmpty(empty);
+		const setText = useCallback(
+			(text: string) => {
+				setValue(toValueFromText(text));
+				requestAnimationFrame(focusAtEnd);
 			},
-			[getText, mentionedDocs.size]
+			[focusAtEnd, setValue]
 		);
 
-		// Expose methods via ref
-		useImperativeHandle(ref, () => ({
-			focus: () => editorRef.current?.focus(),
-			clear,
-			setText,
-			getText,
-			getMentionedDocuments,
-			insertDocumentChip,
-			removeDocumentChip,
-			setDocumentChipStatus,
-		}));
-
-		// Handle input changes
-		const handleInput = useCallback(() => {
-			if (!editorRef.current) return;
-
-			const text = getText();
-			const empty = text.length === 0 && mentionedDocs.size === 0;
-			setIsEmpty(empty);
-
-			// Check for @ mentions
-			const selection = window.getSelection();
-			let shouldTriggerMention = false;
-			let mentionQuery = "";
-
-			if (selection && selection.rangeCount > 0) {
-				const range = selection.getRangeAt(0);
-				const textNode = range.startContainer;
-
-				if (textNode.nodeType === Node.TEXT_NODE) {
-					const textContent = textNode.textContent || "";
-					const cursorPos = range.startOffset;
-
-					// Look for @ before cursor
-					let atIndex = -1;
-					for (let i = cursorPos - 1; i >= 0; i--) {
-						if (textContent[i] === "@") {
-							atIndex = i;
-							break;
-						}
-						// Stop if we hit a space (@ must be at word boundary)
-						if (textContent[i] === " " || textContent[i] === "\n") {
-							break;
-						}
-					}
-
-					if (atIndex !== -1) {
-						const query = textContent.slice(atIndex + 1, cursorPos);
-						// Only trigger if query doesn't start with space
-						if (!query.startsWith(" ")) {
-							shouldTriggerMention = true;
-							mentionQuery = query;
-						}
-					}
-				}
-			}
-
-			// Check for / actions (same pattern as @)
-			let shouldTriggerAction = false;
-			let actionQuery = "";
-
-			if (!shouldTriggerMention && selection && selection.rangeCount > 0) {
-				const range = selection.getRangeAt(0);
-				const textNode = range.startContainer;
-
-				if (textNode.nodeType === Node.TEXT_NODE) {
-					const textContent = textNode.textContent || "";
-					const cursorPos = range.startOffset;
-
-					let slashIndex = -1;
-					for (let i = cursorPos - 1; i >= 0; i--) {
-						if (textContent[i] === "/") {
-							slashIndex = i;
-							break;
-						}
-						if (textContent[i] === " " || textContent[i] === "\n") {
-							break;
-						}
-					}
+		const getText = useCallback(() => getPlainText(getCurrentValue()), [getCurrentValue]);
+		const getMentionedDocs = useCallback(
+			() => getMentionedDocuments(getCurrentValue()),
+			[getCurrentValue]
+		);
 
-					if (
-						slashIndex !== -1 &&
-						(slashIndex === 0 ||
-							textContent[slashIndex - 1] === " " ||
-							textContent[slashIndex - 1] === "\n")
-					) {
-						const query = textContent.slice(slashIndex + 1, cursorPos);
-						if (!query.startsWith(" ")) {
-							shouldTriggerAction = true;
-							actionQuery = query;
+		useImperativeHandle(
+			ref,
+			() => ({
+				// Preserve existing selection if any; otherwise seed one
+				// at end-of-doc so the contentEditable shows a caret.
+				focus: () => {
+					try {
+						if (!editor.selection) {
+							editor.tf.select(editor.api.end([]));
 						}
+						editor.tf.focus();
+					} catch {
+						editableRef.current?.focus();
 					}
-				}
-			}
-
-			// If no @ found before cursor, check if text contains @ at all
-			// If text is empty or doesn't contain @, close the mention
-			if (!shouldTriggerMention) {
-				if (text.length === 0 || !text.includes("@")) {
-					onMentionClose?.();
-				} else {
-					// Text contains @ but not before cursor, close mention
-					onMentionClose?.();
-				}
-			} else {
-				onMentionTrigger?.(mentionQuery);
-			}
-
-			if (!shouldTriggerAction) {
-				onActionClose?.();
-			} else {
-				onActionTrigger?.(actionQuery);
-			}
-
-			// Notify parent of change
-			onChange?.(text, Array.from(mentionedDocs.values()));
-		}, [
-			getText,
-			mentionedDocs,
-			onChange,
-			onMentionTrigger,
-			onMentionClose,
-			onActionTrigger,
-			onActionClose,
-		]);
+				},
+				clear,
+				setText,
+				getText,
+				getMentionedDocuments: getMentionedDocs,
+				insertMentionChip,
+				insertDocumentChip,
+				removeDocumentChip,
+				setDocumentChipStatus,
+			}),
+			[
+				clear,
+				editor,
+				getMentionedDocs,
+				getText,
+				insertMentionChip,
+				insertDocumentChip,
+				removeDocumentChip,
+				setDocumentChipStatus,
+				setText,
+			]
+		);
 
-		// Handle keydown
 		const handleKeyDown = useCallback(
 			(e: React.KeyboardEvent<HTMLDivElement>) => {
-				// Let parent handle navigation keys when mention popover is open
-				if (onKeyDown) {
-					onKeyDown(e);
-					if (e.defaultPrevented) return;
-				}
+				onKeyDown?.(e);
+				if (e.defaultPrevented) return;
 
-				// Handle Enter for submit (without shift)
 				if (e.key === "Enter" && !e.shiftKey) {
 					e.preventDefault();
 					onSubmit?.();
 					return;
 				}
 
-				// Handle backspace on chips
-				if (e.key === "Backspace") {
-					const selection = window.getSelection();
-					if (selection && selection.rangeCount > 0) {
-						const range = selection.getRangeAt(0);
-						if (range.collapsed) {
-							// Check if cursor is right after a chip
-							const node = range.startContainer;
-							const offset = range.startOffset;
-
-							if (node.nodeType === Node.TEXT_NODE && offset === 0) {
-								// Check previous sibling using type guard
-								const prevSibling = node.previousSibling;
-								if (isChipElement(prevSibling)) {
-									e.preventDefault();
-									const chipId = getChipId(prevSibling);
-									const chipDocType = getChipDocType(prevSibling);
-									if (chipId !== null) {
-										prevSibling.remove();
-										const chipKey = `${chipDocType}:${chipId}`;
-										setMentionedDocs((prev) => {
-											const next = new Map(prev);
-											next.delete(chipKey);
-											return next;
-										});
-										// Notify parent that a document was removed
-										onDocumentRemove?.(chipId, chipDocType);
-									}
-									return;
-								}
-								// Check if we're about to delete @ at the start
-								const textContent = node.textContent || "";
-								if (textContent.length > 0 && textContent[0] === "@") {
-									// Will delete @, close mention popover
-									setTimeout(() => {
-										onMentionClose?.();
-									}, 0);
-								}
-							} else if (node.nodeType === Node.TEXT_NODE && offset > 0) {
-								// Check if we're about to delete @
-								const textContent = node.textContent || "";
-								if (textContent[offset - 1] === "@") {
-									// Will delete @, close mention popover
-									setTimeout(() => {
-										onMentionClose?.();
-									}, 0);
-								}
-							} else if (node.nodeType === Node.ELEMENT_NODE && offset > 0) {
-								// Check if previous child is a chip using type guard
-								const prevChild = (node as Element).childNodes[offset - 1];
-								if (isChipElement(prevChild)) {
-									e.preventDefault();
-									const chipId = getChipId(prevChild);
-									const chipDocType = getChipDocType(prevChild);
-									if (chipId !== null) {
-										prevChild.remove();
-										const chipKey = `${chipDocType}:${chipId}`;
-										setMentionedDocs((prev) => {
-											const next = new Map(prev);
-											next.delete(chipKey);
-											return next;
-										});
-										// Notify parent that a document was removed
-										onDocumentRemove?.(chipId, chipDocType);
-									}
-								}
-							}
-						}
-					}
+				if (e.key !== "Backspace") return;
+				const selection = editor.selection;
+				if (!selection || !selection.anchor || !selection.focus) return;
+				if (
+					selection.anchor.path.length < 2 ||
+					selection.focus.path.length < 2 ||
+					selection.anchor.path[0] !== selection.focus.path[0]
+				) {
+					return;
 				}
+				if (selection.anchor.offset !== 0 || selection.focus.offset !== 0) return;
+
+				const value = getCurrentValue();
+				const block = value[selection.anchor.path[0]];
+				if (!block) return;
+				const childIndex = selection.anchor.path[1];
+				if (childIndex <= 0) return;
+				const prev = block.children[childIndex - 1];
+				if (!isMentionNode(prev)) return;
+
+				e.preventDefault();
+				removeChip(prev.id, prev.document_type);
 			},
-			[onKeyDown, onSubmit, onDocumentRemove, onMentionClose]
+			[editor.selection, getCurrentValue, onKeyDown, onSubmit, removeChip]
 		);
 
-		// Handle paste - strip formatting
-		const handlePaste = useCallback((e: React.ClipboardEvent) => {
-			e.preventDefault();
-			const text = e.clipboardData.getData("text/plain");
-			document.execCommand("insertText", false, text);
-		}, []);
-
-		// Handle composition (for IME input)
-		const handleCompositionStart = useCallback(() => {
-			isComposingRef.current = true;
-		}, []);
+		const editableProps = useMemo(
+			() => ({
+				placeholder,
+				onPaste: (e: React.ClipboardEvent<HTMLDivElement>) => {
+					e.preventDefault();
+					const text = e.clipboardData.getData("text/plain");
+					const tf = editor.tf as { insertText: (value: string) => void };
+					tf.insertText(text);
+				},
+				onKeyDown: handleKeyDown,
+			}),
+			[editor, handleKeyDown, placeholder]
+		);
 
-		const handleCompositionEnd = useCallback(() => {
-			isComposingRef.current = false;
-			handleInput();
-		}, [handleInput]);
+		const mentionEditorContextValue = useMemo<MentionEditorContextValue>(
+			() => ({ removeChip }),
+			[removeChip]
+		);
 
 		return (
 			<div className="relative w-full">
-				{/** biome-ignore lint/a11y/useSemanticElements: <not important> */}
-				<div
-					ref={editorRef}
-					contentEditable={!disabled}
-					suppressContentEditableWarning
-					tabIndex={disabled ? -1 : 0}
-					onInput={handleInput}
-					onKeyDown={handleKeyDown}
-					onPaste={handlePaste}
-					onCompositionStart={handleCompositionStart}
-					onCompositionEnd={handleCompositionEnd}
-					className={cn(
-						"min-h-[24px] max-h-32 overflow-y-auto",
-						"text-sm outline-none",
-						"whitespace-pre-wrap wrap-break-word",
-						disabled && "opacity-50 cursor-not-allowed",
-						className
-					)}
-					style={{ wordBreak: "break-word" }}
-					data-placeholder={placeholder}
-					aria-label="Message input with inline mentions"
-					role="textbox"
-					aria-multiline="true"
-				/>
-				{/* Placeholder with fade animation on change */}
-				{isEmpty && (
-					<div
-						key={placeholder}
-						className="absolute top-0 left-0 pointer-events-none text-muted-foreground text-sm animate-in fade-in duration-1000"
-						aria-hidden="true"
+				<MentionEditorContext.Provider value={mentionEditorContextValue}>
+					<Plate
+						editor={editor}
+						onChange={({ value }) => {
+							emitState(value as ComposerValue);
+						}}
 					>
-						{placeholder}
-					</div>
-				)}
+						<PlateContent
+							ref={editableRef}
+							readOnly={disabled}
+							{...editableProps}
+							className={cn(
+								"min-h-[24px] max-h-32 overflow-y-auto outline-none whitespace-pre-wrap wrap-break-word",
+								COMPOSER_TEXT_METRICS_CLASSNAME,
+								disabled && "opacity-50 cursor-not-allowed",
+								className
+							)}
+						/>
+					</Plate>
+				</MentionEditorContext.Provider>
 			</div>
 		);
 	}
diff --git a/surfsense_web/components/assistant-ui/markdown-code-block.tsx b/surfsense_web/components/assistant-ui/markdown-code-block.tsx
index 3c8d79ec9a..e6c735d1eb 100644
--- a/surfsense_web/components/assistant-ui/markdown-code-block.tsx
+++ b/surfsense_web/components/assistant-ui/markdown-code-block.tsx
@@ -7,7 +7,6 @@ import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
 import { materialDark, materialLight } from "react-syntax-highlighter/dist/esm/styles/prism";
 
 import { Button } from "@/components/ui/button";
-import { Skeleton } from "@/components/ui/skeleton";
 import { cn, copyToClipboard } from "@/lib/utils";
 
 type MarkdownCodeBlockProps = {
@@ -49,7 +48,7 @@ function MarkdownCodeBlockComponent({
 	}, [hasCopied]);
 
 	return (
-		<div className="mt-4 overflow-hidden rounded-2xl" style={{ background: "var(--syntax-bg)" }}>
+		<div className="mt-4 overflow-hidden rounded-md bg-accent">
 			<div className="flex items-center justify-between gap-4 px-4 py-2 font-semibold text-muted-foreground text-sm">
 				<span className="lowercase text-xs">{language}</span>
 				<Button
@@ -82,23 +81,3 @@ function MarkdownCodeBlockComponent({
 }
 
 export const MarkdownCodeBlock = memo(MarkdownCodeBlockComponent);
-
-export function MarkdownCodeBlockSkeleton() {
-	return (
-		<div
-			className="mt-4 overflow-hidden rounded-2xl border"
-			style={{ background: "var(--syntax-bg)" }}
-		>
-			<div className="flex items-center justify-between gap-4 border-b px-4 py-2">
-				<Skeleton className="h-3 w-16" />
-				<Skeleton className="h-8 w-8 rounded-md" />
-			</div>
-			<div className="space-y-2 p-4">
-				<Skeleton className="h-4 w-11/12" />
-				<Skeleton className="h-4 w-10/12" />
-				<Skeleton className="h-4 w-8/12" />
-				<Skeleton className="h-4 w-9/12" />
-			</div>
-		</div>
-	);
-}
diff --git a/surfsense_web/components/assistant-ui/markdown-text.tsx b/surfsense_web/components/assistant-ui/markdown-text.tsx
index c308f8c050..ee36e8499e 100644
--- a/surfsense_web/components/assistant-ui/markdown-text.tsx
+++ b/surfsense_web/components/assistant-ui/markdown-text.tsx
@@ -7,17 +7,21 @@ import {
 	unstable_memoizeMarkdownComponents as memoizeMarkdownComponents,
 	useIsMarkdownCodeBlock,
 } from "@assistant-ui/react-markdown";
-import { ExternalLinkIcon } from "lucide-react";
+import { useSetAtom } from "jotai";
+import { ExternalLinkIcon, FileIcon, Folder as FolderIcon } from "lucide-react";
 import dynamic from "next/dynamic";
+import { useParams } from "next/navigation";
 import { useTheme } from "next-themes";
-import { memo, type ReactNode } from "react";
+import { createContext, memo, type ReactNode, useCallback, useContext, useRef } from "react";
 import rehypeKatex from "rehype-katex";
 import remarkGfm from "remark-gfm";
 import remarkMath from "remark-math";
+import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { ImagePreview, ImageRoot, ImageZoom } from "@/components/assistant-ui/image";
+import { MentionChip } from "@/components/assistant-ui/mention-chip";
 import "katex/dist/katex.min.css";
-import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
-import { Skeleton } from "@/components/ui/skeleton";
+import { toast } from "sonner";
+import { processChildrenWithCitations } from "@/components/citations/citation-renderer";
 import {
 	Table,
 	TableBody,
@@ -26,65 +30,56 @@ import {
 	TableHeader,
 	TableRow,
 } from "@/components/ui/table";
+import { useElectronAPI } from "@/hooks/use-platform";
+import { documentsApiService } from "@/lib/apis/documents-api.service";
+import { getVirtualPathDisplay } from "@/lib/chat/virtual-path-display";
+import { type CitationUrlMap, preprocessCitationMarkdown } from "@/lib/citations/citation-parser";
+import { tryGetHostname } from "@/lib/url";
 import { cn } from "@/lib/utils";
 
-function MarkdownCodeBlockSkeleton() {
-	return (
-		<div
-			className="mt-4 overflow-hidden rounded-2xl border"
-			style={{ background: "var(--syntax-bg)" }}
-		>
-			<div className="flex items-center justify-between gap-4 border-b px-4 py-2">
-				<Skeleton className="h-3 w-16" />
-				<Skeleton className="h-8 w-8 rounded-md" />
-			</div>
-			<div className="space-y-2 p-4">
-				<Skeleton className="h-4 w-11/12" />
-				<Skeleton className="h-4 w-10/12" />
-				<Skeleton className="h-4 w-8/12" />
-				<Skeleton className="h-4 w-9/12" />
-			</div>
-		</div>
-	);
+function MarkdownCodeBlockLoading() {
+	return <div className="mt-4 h-32 overflow-hidden rounded-md bg-accent" />;
 }
 
 const LazyMarkdownCodeBlock = dynamic(
 	() => import("./markdown-code-block").then((mod) => mod.MarkdownCodeBlock),
 	{
-		loading: () => <MarkdownCodeBlockSkeleton />,
+		loading: () => <MarkdownCodeBlockLoading />,
 	}
 );
 
-// Storage for URL citations replaced during preprocess to avoid GFM autolink interference.
-// Populated in preprocessMarkdown, consumed in parseTextWithCitations.
-let _pendingUrlCitations = new Map<string, string>();
-let _urlCiteIdx = 0;
+// Per-render URL placeholder map propagated to component overrides via
+// React Context. Replaces the previous module-level `_pendingUrlCitations`
+// state, which was unsafe under concurrent renders / SSR.
+type CitationUrlMapRef = { current: CitationUrlMap };
+const EMPTY_URL_MAP: CitationUrlMap = new Map();
+const CitationUrlMapContext = createContext<CitationUrlMapRef>({ current: EMPTY_URL_MAP });
+
+function useCitationUrlMap(): CitationUrlMap {
+	return useContext(CitationUrlMapContext).current;
+}
 
 /**
  * Preprocess raw markdown before it reaches the remark/rehype pipeline.
  * - Replaces URL-based citations with safe placeholders (prevents GFM autolinks)
  * - Normalises LaTeX delimiters to dollar-sign syntax for remark-math
  */
-function preprocessMarkdown(content: string): string {
+function preprocessMarkdown(content: string, urlMapRef: CitationUrlMapRef): string {
 	// Replace URL-based citations with safe placeholders BEFORE markdown parsing.
 	// GFM autolinks would otherwise convert the https://... inside [citation:URL]
 	// into an <a> element, splitting the text and preventing our citation regex
 	// from matching the full pattern.
-	_pendingUrlCitations = new Map();
-	_urlCiteIdx = 0;
-	content = content.replace(
-		/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+)\s*\u200B?[\]】]/g,
-		(_, url) => {
-			const key = `urlcite${_urlCiteIdx++}`;
-			_pendingUrlCitations.set(key, url.trim());
-			return `[citation:${key}]`;
-		}
-	);
+	const { content: rewritten, urlMap } = preprocessCitationMarkdown(content);
+	urlMapRef.current = urlMap;
+	content = rewritten;
 
+	// All math forms are normalised to $$...$$ so we can disable single-dollar
+	// inline math in remark-math (otherwise currency like "$3,120.00 and $0.00"
+	// gets parsed as a LaTeX expression).
 	// 1. Block math: \[...\] → $$...$$
 	content = content.replace(/\\\[([\s\S]*?)\\\]/g, (_, inner) => `$$${inner}$$`);
-	// 2. Inline math: \(...\) → $...$
-	content = content.replace(/\\\(([\s\S]*?)\\\)/g, (_, inner) => `$${inner}$`);
+	// 2. Inline math: \(...\) → $$...$$
+	content = content.replace(/\\\(([\s\S]*?)\\\)/g, (_, inner) => `$$${inner}$$`);
 	// 3. Block: \begin{equation}...\end{equation} → $$...$$
 	content = content.replace(
 		/\\begin\{equation\}([\s\S]*?)\\end\{equation\}/g,
@@ -95,8 +90,11 @@ function preprocessMarkdown(content: string): string {
 		/\\begin\{displaymath\}([\s\S]*?)\\end\{displaymath\}/g,
 		(_, inner) => `$$${inner}$$`
 	);
-	// 5. Inline: \begin{math}...\end{math} → $...$
-	content = content.replace(/\\begin\{math\}([\s\S]*?)\\end\{math\}/g, (_, inner) => `$${inner}$`);
+	// 5. Inline: \begin{math}...\end{math} → $$...$$
+	content = content.replace(
+		/\\begin\{math\}([\s\S]*?)\\end\{math\}/g,
+		(_, inner) => `$$${inner}$$`
+	);
 	// 6. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
 	content = content.replace(/`(\${1,2})((?:(?!\1).)+)\1`/g, "$1$2$1");
 
@@ -106,126 +104,166 @@ function preprocessMarkdown(content: string): string {
 	return content;
 }
 
-// Matches [citation:...] with numeric IDs (incl. doc- prefix, comma-separated),
-// URL-based IDs from live web search, or urlciteN placeholders from preprocess.
-// Also matches Chinese brackets 【】 and handles zero-width spaces that LLM sometimes inserts.
-const CITATION_REGEX =
-	/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?\d+(?:\s*,\s*(?:doc-)?\d+)*)\s*\u200B?[\]】]/g;
+const MarkdownTextImpl = () => {
+	const urlMapRef = useRef<CitationUrlMap>(EMPTY_URL_MAP);
+	const preprocess = useCallback((content: string) => preprocessMarkdown(content, urlMapRef), []);
+	return (
+		<CitationUrlMapContext.Provider value={urlMapRef}>
+			<MarkdownTextPrimitive
+				smooth={false}
+				remarkPlugins={[remarkGfm, [remarkMath, { singleDollarTextMath: false }]]}
+				rehypePlugins={[rehypeKatex]}
+				className="aui-md"
+				components={defaultComponents}
+				preprocess={preprocess}
+			/>
+		</CitationUrlMapContext.Provider>
+	);
+};
 
-/**
- * Parses text and replaces [citation:XXX] patterns with citation components.
- * Supports:
- *  - Numeric chunk IDs: [citation:123]
- *  - Doc-prefixed IDs: [citation:doc-123]
- *  - Comma-separated IDs: [citation:4149, 4150, 4151]
- *  - URL-based citations from live search: [citation:https://example.com/page]
- */
-function parseTextWithCitations(text: string): ReactNode[] {
-	const parts: ReactNode[] = [];
-	let lastIndex = 0;
-	let match: RegExpExecArray | null;
-	let instanceIndex = 0;
+export const MarkdownText = memo(MarkdownTextImpl);
 
-	CITATION_REGEX.lastIndex = 0;
+// Canonical local-file virtual paths are mount-prefixed: /<mount>/<relative/path>
+const LOCAL_FILE_PATH_REGEX = /^\/[a-z0-9_-]+\/[^\s`]+(?:\/[^\s`]+)*$/;
 
-	match = CITATION_REGEX.exec(text);
-	while (match !== null) {
-		if (match.index > lastIndex) {
-			parts.push(text.substring(lastIndex, match.index));
-		}
+type AgentFilesystemMount = {
+	mount: string;
+	rootPath: string;
+};
 
-		const captured = match[1];
+function normalizeLocalVirtualPathForEditor(
+	candidatePath: string,
+	mounts: AgentFilesystemMount[]
+): string {
+	const normalizedCandidate = candidatePath.trim().replace(/\\/g, "/").replace(/\/+/g, "/");
+	if (!normalizedCandidate) {
+		return candidatePath;
+	}
+	const defaultMount = mounts[0]?.mount;
+	if (!defaultMount) {
+		return normalizedCandidate.startsWith("/")
+			? normalizedCandidate
+			: `/${normalizedCandidate.replace(/^\/+/, "")}`;
+	}
 
-		if (captured.startsWith("http://") || captured.startsWith("https://")) {
-			parts.push(<UrlCitation key={`citation-url-${instanceIndex}`} url={captured.trim()} />);
-			instanceIndex++;
-		} else if (captured.startsWith("urlcite")) {
-			const url = _pendingUrlCitations.get(captured);
-			if (url) {
-				parts.push(<UrlCitation key={`citation-url-${instanceIndex}`} url={url} />);
-			}
-			instanceIndex++;
-		} else {
-			const rawIds = captured.split(",").map((s) => s.trim());
-			for (const rawId of rawIds) {
-				const isDocsChunk = rawId.startsWith("doc-");
-				const chunkId = Number.parseInt(isDocsChunk ? rawId.slice(4) : rawId, 10);
-				parts.push(
-					<InlineCitation
-						key={`citation-${isDocsChunk ? "doc-" : ""}${chunkId}-${instanceIndex}`}
-						chunkId={chunkId}
-						isDocsChunk={isDocsChunk}
-					/>
-				);
-				instanceIndex++;
-			}
+	const mountNames = new Set(mounts.map((entry) => entry.mount));
+	if (normalizedCandidate.startsWith("/")) {
+		const relative = normalizedCandidate.replace(/^\/+/, "");
+		const [firstSegment] = relative.split("/", 1);
+		if (mountNames.has(firstSegment)) {
+			return `/${relative}`;
 		}
+		return `/${defaultMount}/${relative}`;
+	}
 
-		lastIndex = match.index + match[0].length;
-		match = CITATION_REGEX.exec(text);
+	const relative = normalizedCandidate.replace(/^\/+/, "");
+	const [firstSegment] = relative.split("/", 1);
+	if (mountNames.has(firstSegment)) {
+		return `/${relative}`;
 	}
+	return `/${defaultMount}/${relative}`;
+}
 
-	if (lastIndex < text.length) {
-		parts.push(text.substring(lastIndex));
+function isVirtualFilePathToken(value: string): boolean {
+	if (!LOCAL_FILE_PATH_REGEX.test(value) || value.startsWith("//")) {
+		return false;
 	}
+	const normalized = value.replace(/\/+$/, "");
+	const segments = normalized.split("/").filter(Boolean);
+	return segments.length >= 2;
+}
 
-	return parts.length > 0 ? parts : [text];
+function isStandaloneDocumentsPathText(node: ReactNode): string | null {
+	if (typeof node !== "string") return null;
+	const value = node.trim();
+	if (!value.startsWith("/documents/")) return null;
+	if (value.includes(" ")) return null;
+	const normalized = value.replace(/\/+$/, "");
+	const leaf = normalized.split("/").filter(Boolean).at(-1) ?? "";
+	if (!leaf || !leaf.includes(".")) return null;
+	return value;
 }
 
-const MarkdownTextImpl = () => {
-	return (
-		<MarkdownTextPrimitive
-			smooth={false}
-			remarkPlugins={[remarkGfm, remarkMath]}
-			rehypePlugins={[rehypeKatex]}
-			className="aui-md"
-			components={defaultComponents}
-			preprocess={preprocessMarkdown}
-		/>
-	);
-};
+function FilePathLink({ path, className }: { path: string; className?: string }) {
+	const openEditorPanel = useSetAtom(openEditorPanelAtom);
+	const params = useParams();
+	const electronAPI = useElectronAPI();
+	const searchSpaceIdParam = params?.search_space_id;
+	const parsedSearchSpaceId = Array.isArray(searchSpaceIdParam)
+		? Number(searchSpaceIdParam[0])
+		: Number(searchSpaceIdParam);
+	const resolvedSearchSpaceId = Number.isFinite(parsedSearchSpaceId)
+		? parsedSearchSpaceId
+		: undefined;
 
-export const MarkdownText = memo(MarkdownTextImpl);
+	const { displayName, isFolder } = getVirtualPathDisplay(path);
+	const icon = isFolder ? <FolderIcon className="size-3.5" /> : <FileIcon className="size-3.5" />;
 
-/**
- * Helper to process children and replace citation patterns with components
- */
-function processChildrenWithCitations(children: ReactNode): ReactNode {
-	if (typeof children === "string") {
-		const parsed = parseTextWithCitations(children);
-		return parsed.length === 1 && typeof parsed[0] === "string" ? children : parsed;
-	}
+	const handleClick = useCallback(
+		(event: React.MouseEvent<HTMLButtonElement>) => {
+			event.preventDefault();
+			event.stopPropagation();
+			void (async () => {
+				if (electronAPI) {
+					let resolvedLocalPath = path;
+					if (electronAPI.getAgentFilesystemMounts) {
+						try {
+							const mounts = (await electronAPI.getAgentFilesystemMounts(
+								resolvedSearchSpaceId
+							)) as AgentFilesystemMount[];
+							resolvedLocalPath = normalizeLocalVirtualPathForEditor(path, mounts);
+						} catch {
+							// Fall back to the raw path if mount lookup fails.
+						}
+					}
+					openEditorPanel({
+						kind: "local_file",
+						localFilePath: resolvedLocalPath,
+						title: resolvedLocalPath.split("/").pop() || resolvedLocalPath,
+						searchSpaceId: resolvedSearchSpaceId,
+					});
+					return;
+				}
 
-	if (Array.isArray(children)) {
-		return children.map((child) => {
-			if (typeof child === "string") {
-				const parsed = parseTextWithCitations(child);
-				return parsed.length === 1 && typeof parsed[0] === "string" ? (
-					child
-				) : (
-					<span key={child}>{parsed}</span>
-				);
-			}
-			return child;
-		});
-	}
+				if (!resolvedSearchSpaceId || !path.startsWith("/documents/")) return;
+				try {
+					const doc = await documentsApiService.getDocumentByVirtualPath({
+						search_space_id: resolvedSearchSpaceId,
+						virtual_path: path,
+					});
+					openEditorPanel({
+						kind: "document",
+						documentId: doc.id,
+						searchSpaceId: resolvedSearchSpaceId,
+						title: doc.title,
+					});
+				} catch {
+					toast.error("Document not found in knowledge base.");
+				}
+			})();
+		},
+		[electronAPI, openEditorPanel, path, resolvedSearchSpaceId]
+	);
 
-	return children;
-}
+	// Folders cannot open in the editor panel — keep them as visual chips.
+	const onClick = isFolder ? undefined : handleClick;
 
-function extractDomain(url: string): string {
-	try {
-		const parsed = new URL(url);
-		return parsed.hostname.replace(/^www\./, "");
-	} catch {
-		return "";
-	}
+	return (
+		<MentionChip
+			icon={icon}
+			label={displayName || path}
+			tooltip={path}
+			onClick={onClick}
+			ariaLabel={isFolder ? `Folder ${displayName}` : `Open ${displayName}`}
+			className={className}
+		/>
+	);
 }
 
 function MarkdownImage({ src, alt }: { src?: string; alt?: string }) {
 	if (!src) return null;
 
-	const domain = extractDomain(src);
+	const domain = tryGetHostname(src) ?? "";
 
 	return (
 		<div className="my-4 w-fit max-w-lg overflow-hidden rounded-2xl border bg-muted/30 select-none">
@@ -262,97 +300,132 @@ function MarkdownImage({ src, alt }: { src?: string; alt?: string }) {
 }
 
 const defaultComponents = memoizeMarkdownComponents({
-	h1: ({ className, children, ...props }) => (
-		<h1
-			className={cn(
-				"aui-md-h1 mb-8 scroll-m-20 font-extrabold text-4xl tracking-tight last:mb-0",
-				className
-			)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</h1>
-	),
-	h2: ({ className, children, ...props }) => (
-		<h2
-			className={cn(
-				"aui-md-h2 mt-8 mb-4 scroll-m-20 font-semibold text-3xl tracking-tight first:mt-0 last:mb-0",
-				className
-			)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</h2>
-	),
-	h3: ({ className, children, ...props }) => (
-		<h3
-			className={cn(
-				"aui-md-h3 mt-6 mb-4 scroll-m-20 font-semibold text-2xl tracking-tight first:mt-0 last:mb-0",
-				className
-			)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</h3>
-	),
-	h4: ({ className, children, ...props }) => (
-		<h4
-			className={cn(
-				"aui-md-h4 mt-6 mb-4 scroll-m-20 font-semibold text-xl tracking-tight first:mt-0 last:mb-0",
-				className
-			)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</h4>
-	),
-	h5: ({ className, children, ...props }) => (
-		<h5
-			className={cn("aui-md-h5 my-4 font-semibold text-lg first:mt-0 last:mb-0", className)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</h5>
-	),
-	h6: ({ className, children, ...props }) => (
-		<h6 className={cn("aui-md-h6 my-4 font-semibold first:mt-0 last:mb-0", className)} {...props}>
-			{processChildrenWithCitations(children)}
-		</h6>
-	),
-	p: ({ className, children, ...props }) => (
-		<p className={cn("aui-md-p mt-5 mb-5 leading-7 first:mt-0 last:mb-0", className)} {...props}>
-			{processChildrenWithCitations(children)}
-		</p>
-	),
-	a: ({ className, children, ...props }) => (
-		<a
-			className={cn("aui-md-a font-medium text-primary underline underline-offset-4", className)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</a>
-	),
-	blockquote: ({ className, children, ...props }) => (
-		<blockquote className={cn("aui-md-blockquote border-l-2 pl-6 italic", className)} {...props}>
-			{processChildrenWithCitations(children)}
-		</blockquote>
-	),
+	h1: function H1({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<h1
+				className={cn(
+					"aui-md-h1 mb-8 scroll-m-20 font-extrabold text-4xl tracking-tight last:mb-0",
+					className
+				)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</h1>
+		);
+	},
+	h2: function H2({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<h2
+				className={cn(
+					"aui-md-h2 mt-8 mb-4 scroll-m-20 font-semibold text-3xl tracking-tight first:mt-0 last:mb-0",
+					className
+				)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</h2>
+		);
+	},
+	h3: function H3({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<h3
+				className={cn(
+					"aui-md-h3 mt-6 mb-4 scroll-m-20 font-semibold text-2xl tracking-tight first:mt-0 last:mb-0",
+					className
+				)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</h3>
+		);
+	},
+	h4: function H4({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<h4
+				className={cn(
+					"aui-md-h4 mt-6 mb-4 scroll-m-20 font-semibold text-xl tracking-tight first:mt-0 last:mb-0",
+					className
+				)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</h4>
+		);
+	},
+	h5: function H5({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<h5
+				className={cn("aui-md-h5 my-4 font-semibold text-lg first:mt-0 last:mb-0", className)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</h5>
+		);
+	},
+	h6: function H6({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<h6 className={cn("aui-md-h6 my-4 font-semibold first:mt-0 last:mb-0", className)} {...props}>
+				{processChildrenWithCitations(children, urlMap)}
+			</h6>
+		);
+	},
+	p: function P({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		const standalonePath = isStandaloneDocumentsPathText(children);
+		return (
+			<p className={cn("aui-md-p mt-5 mb-5 leading-7 first:mt-0 last:mb-0", className)} {...props}>
+				{standalonePath ? (
+					<FilePathLink path={standalonePath} />
+				) : (
+					processChildrenWithCitations(children, urlMap)
+				)}
+			</p>
+		);
+	},
+	a: function A({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<a
+				className={cn("aui-md-a font-medium text-primary underline underline-offset-4", className)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</a>
+		);
+	},
+	blockquote: function Blockquote({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<blockquote className={cn("aui-md-blockquote border-l-2 pl-6 italic", className)} {...props}>
+				{processChildrenWithCitations(children, urlMap)}
+			</blockquote>
+		);
+	},
 	ul: ({ className, ...props }) => (
 		<ul className={cn("aui-md-ul my-5 ml-6 list-disc [&>li]:mt-2", className)} {...props} />
 	),
 	ol: ({ className, ...props }) => (
 		<ol className={cn("aui-md-ol my-5 ml-6 list-decimal [&>li]:mt-2", className)} {...props} />
 	),
-	li: ({ className, children, ...props }) => (
-		<li className={cn("aui-md-li", className)} {...props}>
-			{processChildrenWithCitations(children)}
-		</li>
-	),
+	li: function Li({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<li className={cn("aui-md-li", className)} {...props}>
+				{processChildrenWithCitations(children, urlMap)}
+			</li>
+		);
+	},
 	hr: ({ className, ...props }) => (
 		<hr className={cn("aui-md-hr my-5 border-b", className)} {...props} />
 	),
 	table: ({ className, ...props }) => (
-		<div className="aui-md-table-wrapper my-5 overflow-hidden rounded-2xl border">
+		<div className="aui-md-table-wrapper my-5 overflow-hidden rounded-md border">
 			<Table className={cn("aui-md-table", className)} {...props} />
 		</div>
 	),
@@ -362,28 +435,34 @@ const defaultComponents = memoizeMarkdownComponents({
 	tbody: ({ className, ...props }) => (
 		<TableBody className={cn("aui-md-tbody", className)} {...props} />
 	),
-	th: ({ className, children, ...props }) => (
-		<TableHead
-			className={cn(
-				"aui-md-th bg-muted/50 whitespace-normal [[align=center]]:text-center [[align=right]]:text-right",
-				className
-			)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</TableHead>
-	),
-	td: ({ className, children, ...props }) => (
-		<TableCell
-			className={cn(
-				"aui-md-td whitespace-normal [[align=center]]:text-center [[align=right]]:text-right",
-				className
-			)}
-			{...props}
-		>
-			{processChildrenWithCitations(children)}
-		</TableCell>
-	),
+	th: function Th({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<TableHead
+				className={cn(
+					"aui-md-th bg-muted/50 whitespace-normal [[align=center]]:text-center [[align=right]]:text-right",
+					className
+				)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</TableHead>
+		);
+	},
+	td: function Td({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<TableCell
+				className={cn(
+					"aui-md-td whitespace-normal [[align=center]]:text-center [[align=right]]:text-right",
+					className
+				)}
+				{...props}
+			>
+				{processChildrenWithCitations(children, urlMap)}
+			</TableCell>
+		);
+	},
 	tr: ({ className, ...props }) => <TableRow className={cn("aui-md-tr", className)} {...props} />,
 	sup: ({ className, ...props }) => (
 		<sup className={cn("aui-md-sup [&>a]:text-xs [&>a]:no-underline", className)} {...props} />
@@ -392,11 +471,38 @@ const defaultComponents = memoizeMarkdownComponents({
 	code: function Code({ className, children, ...props }) {
 		const isCodeBlock = useIsMarkdownCodeBlock();
 		const { resolvedTheme } = useTheme();
+		const electronAPI = useElectronAPI();
+		const language = /language-(\w+)/.exec(className || "")?.[1] ?? "text";
+		const codeString = String(children).replace(/\n$/, "");
+		const isWebLocalFileCodeBlock =
+			isCodeBlock &&
+			!electronAPI &&
+			isVirtualFilePathToken(codeString.trim()) &&
+			!codeString.trim().startsWith("//") &&
+			!codeString.includes("\n");
 		if (!isCodeBlock) {
+			const inlineValue = String(children ?? "").trim();
+			const normalizedInlinePath = inlineValue.replace(/\/+$/, "");
+			const leafSegment = normalizedInlinePath.split("/").filter(Boolean).at(-1) ?? "";
+			const isLikelyFolder =
+				inlineValue.endsWith("/") || !leafSegment || !leafSegment.includes(".");
+			const isLocalPath =
+				(isVirtualFilePathToken(inlineValue) &&
+					!inlineValue.startsWith("//") &&
+					!isLikelyFolder &&
+					!!electronAPI) ||
+				(isVirtualFilePathToken(inlineValue) &&
+					!inlineValue.startsWith("//") &&
+					!isLikelyFolder &&
+					!electronAPI &&
+					inlineValue.startsWith("/documents/"));
+			if (isLocalPath) {
+				return <FilePathLink path={inlineValue} className="text-[0.9em]" />;
+			}
 			return (
 				<code
 					className={cn(
-						"aui-md-inline-code rounded-md border bg-muted px-1.5 py-0.5 font-mono text-[0.9em] font-normal",
+						"aui-md-inline-code rounded-md bg-primary/10 px-1.5 py-0.5 font-mono text-[0.9em] font-normal text-primary/80",
 						className
 					)}
 					{...props}
@@ -405,8 +511,19 @@ const defaultComponents = memoizeMarkdownComponents({
 				</code>
 			);
 		}
-		const language = /language-(\w+)/.exec(className || "")?.[1] ?? "text";
-		const codeString = String(children).replace(/\n$/, "");
+		if (isWebLocalFileCodeBlock) {
+			return (
+				<code
+					className={cn(
+						"aui-md-inline-code rounded-md bg-primary/10 px-1.5 py-0.5 font-mono text-[0.9em] font-normal text-primary/80",
+						className
+					)}
+					{...props}
+				>
+					{codeString.trim()}
+				</code>
+			);
+		}
 		return (
 			<LazyMarkdownCodeBlock
 				className={className}
@@ -416,16 +533,22 @@ const defaultComponents = memoizeMarkdownComponents({
 			/>
 		);
 	},
-	strong: ({ className, children, ...props }) => (
-		<strong className={cn("aui-md-strong font-semibold", className)} {...props}>
-			{processChildrenWithCitations(children)}
-		</strong>
-	),
-	em: ({ className, children, ...props }) => (
-		<em className={cn("aui-md-em", className)} {...props}>
-			{processChildrenWithCitations(children)}
-		</em>
-	),
+	strong: function Strong({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<strong className={cn("aui-md-strong font-semibold", className)} {...props}>
+				{processChildrenWithCitations(children, urlMap)}
+			</strong>
+		);
+	},
+	em: function Em({ className, children, ...props }) {
+		const urlMap = useCitationUrlMap();
+		return (
+			<em className={cn("aui-md-em", className)} {...props}>
+				{processChildrenWithCitations(children, urlMap)}
+			</em>
+		);
+	},
 	img: ({ src, alt }) => (
 		<MarkdownImage src={typeof src === "string" ? src : undefined} alt={alt} />
 	),
diff --git a/surfsense_web/components/assistant-ui/mention-chip.tsx b/surfsense_web/components/assistant-ui/mention-chip.tsx
new file mode 100644
index 0000000000..7197fccdc0
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/mention-chip.tsx
@@ -0,0 +1,90 @@
+"use client";
+
+import type { MouseEventHandler, ReactNode } from "react";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import { cn } from "@/lib/utils";
+
+/**
+ * A single, minimal chip-button used in two places:
+ *
+ * 1. User-message mention chips (rendered for every `@`-mention the user
+ *    inserted in the composer).
+ * 2. AI-answer file/folder paths (rendered when the assistant emits
+ *    `/documents/.../file.xml` or `/<mount>/.../file.ext`).
+ *
+ * Both contexts want the same visual language: a compact, button-styled
+ * chip with an icon, a truncated label, and an optional tooltip. Sharing
+ * one component keeps the chat surface visually coherent and means a UX
+ * tweak (radius, hover, icon size) lands in both places at once.
+ *
+ * Styling rules (per shadcn skill):
+ * - Semantic tokens only (`border`, `bg-background`, `bg-accent`,
+ *   `text-foreground`, `text-muted-foreground`). No raw colors.
+ * - Layout via `gap-*`, never `space-x-*`.
+ * - `cn()` for conditional classes.
+ * - No manual `z-index` — the tooltip handles its own stacking.
+ */
+export interface MentionChipProps {
+	/**
+	 * Visual prefix. Keep this small (e.g. `size-3.5`); the chip controls
+	 * its own height and oversized icons will push the label out of place.
+	 */
+	icon: ReactNode;
+	/** Label shown inside the chip; truncated with `…` past the max width. */
+	label: string;
+	/**
+	 * Full title or path shown on hover. Omit to suppress the tooltip
+	 * entirely (e.g. when the label already conveys the full identity).
+	 */
+	tooltip?: ReactNode;
+	/**
+	 * When provided, the chip behaves like a button (focusable, hover
+	 * effect, pointer cursor). Omit for a purely decorative chip.
+	 */
+	onClick?: MouseEventHandler<HTMLButtonElement>;
+	disabled?: boolean;
+	className?: string;
+	/** Optional override for the accessible name; defaults to `label`. */
+	ariaLabel?: string;
+}
+
+export function MentionChip({
+	icon,
+	label,
+	tooltip,
+	onClick,
+	disabled,
+	className,
+	ariaLabel,
+}: MentionChipProps) {
+	const isInteractive = Boolean(onClick) && !disabled;
+
+	const chip = (
+		<button
+			type="button"
+			onClick={onClick}
+			disabled={disabled}
+			aria-label={ariaLabel ?? label}
+			className={cn(
+				"inline-flex h-5 items-center gap-1 rounded bg-primary/10 px-1 align-middle text-xs font-bold text-primary/60 leading-none focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring",
+				isInteractive ? "cursor-pointer" : "cursor-default",
+				disabled && "opacity-60",
+				className
+			)}
+		>
+			<span className="inline-flex shrink-0 text-muted-foreground">{icon}</span>
+			<span className="max-w-[120px] truncate leading-none">{label}</span>
+		</button>
+	);
+
+	if (!tooltip) return chip;
+
+	return (
+		<Tooltip delayDuration={600}>
+			<TooltipTrigger asChild>{chip}</TooltipTrigger>
+			<TooltipContent side="top" className="max-w-xs break-all">
+				{tooltip}
+			</TooltipContent>
+		</Tooltip>
+	);
+}
diff --git a/surfsense_web/components/assistant-ui/nested-scroll.tsx b/surfsense_web/components/assistant-ui/nested-scroll.tsx
new file mode 100644
index 0000000000..37c4790dfa
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/nested-scroll.tsx
@@ -0,0 +1,24 @@
+"use client";
+
+import { type ComponentPropsWithoutRef, forwardRef, type WheelEvent } from "react";
+
+export type NestedScrollProps = ComponentPropsWithoutRef<"div">;
+
+export const NestedScroll = forwardRef<HTMLDivElement, NestedScrollProps>(
+	({ onWheel, ...props }, ref) => {
+		const handleWheel = (event: WheelEvent<HTMLDivElement>) => {
+			const el = event.currentTarget;
+			const canScrollUp = el.scrollTop > 0;
+			const canScrollDown = el.scrollTop < el.scrollHeight - el.clientHeight - 1;
+			const goingUp = event.deltaY < 0;
+			const goingDown = event.deltaY > 0;
+			if ((goingUp && canScrollUp) || (goingDown && canScrollDown)) {
+				event.stopPropagation();
+			}
+			onWheel?.(event);
+		};
+		return <div ref={ref} onWheel={handleWheel} {...props} />;
+	}
+);
+
+NestedScroll.displayName = "NestedScroll";
diff --git a/surfsense_web/components/assistant-ui/reasoning-message-part.tsx b/surfsense_web/components/assistant-ui/reasoning-message-part.tsx
new file mode 100644
index 0000000000..72347e33cf
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/reasoning-message-part.tsx
@@ -0,0 +1,84 @@
+"use client";
+
+import type { ReasoningMessagePartComponent } from "@assistant-ui/react";
+import { ChevronRightIcon } from "lucide-react";
+import { useEffect, useMemo, useState } from "react";
+import { TextShimmerLoader } from "@/components/prompt-kit/loader";
+import { Button } from "@/components/ui/button";
+import { cn } from "@/lib/utils";
+
+/**
+ * Renders the structured `reasoning` part emitted by the backend stream
+ * (typed reasoning deltas from the chat model).
+ *
+ * Behaviour mirrors the existing `ThinkingStepsDisplay`:
+ *  - collapsed by default;
+ *  - auto-expanded while the part is still `running`;
+ *  - auto-collapsed once status flips to `complete`.
+ *
+ * The component is registered via the `Reasoning` slot on
+ * `MessagePrimitive.Parts` in `assistant-message.tsx` so it lives at the
+ * exact ordinal position of the reasoning block in the message content
+ * array (i.e. above the assistant text that follows it).
+ */
+export const ReasoningMessagePart: ReasoningMessagePartComponent = ({ text, status }) => {
+	const isRunning = status?.type === "running";
+	const [isOpen, setIsOpen] = useState(() => isRunning);
+
+	useEffect(() => {
+		if (isRunning) {
+			setIsOpen(true);
+		} else if (status?.type === "complete") {
+			setIsOpen(false);
+		}
+	}, [isRunning, status?.type]);
+
+	const headerLabel = useMemo(() => {
+		if (isRunning) return "Thinking";
+		if (status?.type === "incomplete") return "Thinking interrupted";
+		return "Thought";
+	}, [isRunning, status?.type]);
+
+	if (!text || text.length === 0) {
+		if (!isRunning) return null;
+	}
+
+	return (
+		<div className="mx-auto w-full max-w-(--thread-max-width) px-2 py-2">
+			<div className="rounded-lg">
+				<Button
+					variant="ghost"
+					type="button"
+					onClick={() => setIsOpen((prev) => !prev)}
+					className={cn(
+						"h-auto w-full justify-start gap-1.5 p-0 text-left text-sm font-normal transition-colors hover:bg-transparent",
+						"text-muted-foreground hover:text-accent-foreground"
+					)}
+				>
+					{isRunning ? (
+						<TextShimmerLoader text={headerLabel} size="sm" />
+					) : (
+						<span>{headerLabel}</span>
+					)}
+					<ChevronRightIcon
+						data-icon="inline-end"
+						className={cn("transition-transform duration-200", isOpen && "rotate-90")}
+					/>
+				</Button>
+
+				<div
+					className={cn(
+						"grid transition-[grid-template-rows] duration-300 ease-out",
+						isOpen ? "grid-rows-[1fr]" : "grid-rows-[0fr]"
+					)}
+				>
+					<div className="overflow-hidden">
+						<div className="mt-2 border-l border-muted-foreground/30 pl-3 text-sm leading-relaxed text-muted-foreground whitespace-pre-wrap wrap-break-word">
+							{text}
+						</div>
+					</div>
+				</div>
+			</div>
+		</div>
+	);
+};
diff --git a/surfsense_web/components/assistant-ui/revert-turn-button.tsx b/surfsense_web/components/assistant-ui/revert-turn-button.tsx
new file mode 100644
index 0000000000..7e733fd996
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/revert-turn-button.tsx
@@ -0,0 +1,231 @@
+"use client";
+
+/**
+ * "Revert turn" button rendered at the bottom of every completed
+ * assistant turn that has at least one reversible action.
+ *
+ * The button reads from the unified ``useAgentActionsQuery`` cache
+ * (the SAME react-query cache the agent-actions dialog and the inline
+ * Revert button consume) filtered by ``chat_turn_id``. It shows a
+ * confirmation dialog summarising "N reversible / M total" and, on
+ * confirm, calls ``POST /threads/{id}/revert-turn/{chat_turn_id}``.
+ *
+ * The route returns a per-action result list and never collapses the
+ * batch into a 4xx — so we render any failed/not_reversible rows inline
+ * with their messages.
+ */
+
+import { ActionBarMorePrimitive } from "@assistant-ui/react";
+import { useQueryClient } from "@tanstack/react-query";
+import { useAtomValue } from "jotai";
+import { CheckIcon, RotateCcw, XCircleIcon } from "lucide-react";
+import { useMemo, useState } from "react";
+import { toast } from "sonner";
+import { chatSessionStateAtom } from "@/atoms/chat/chat-session-state.atom";
+import {
+	AlertDialog,
+	AlertDialogAction,
+	AlertDialogCancel,
+	AlertDialogContent,
+	AlertDialogDescription,
+	AlertDialogFooter,
+	AlertDialogHeader,
+	AlertDialogTitle,
+	AlertDialogTrigger,
+} from "@/components/ui/alert-dialog";
+import { Button } from "@/components/ui/button";
+import { getToolDisplayName } from "@/contracts/enums/toolIcons";
+import {
+	applyRevertTurnResultsToCache,
+	useAgentActionsQuery,
+} from "@/hooks/use-agent-actions-query";
+import {
+	agentActionsApiService,
+	type RevertTurnActionResult,
+} from "@/lib/apis/agent-actions-api.service";
+import { AppError } from "@/lib/error";
+import { cn } from "@/lib/utils";
+
+interface RevertTurnButtonProps {
+	chatTurnId: string | null | undefined;
+	variant?: "button" | "menu-item";
+}
+
+export function RevertTurnButton({ chatTurnId, variant = "button" }: RevertTurnButtonProps) {
+	const session = useAtomValue(chatSessionStateAtom);
+	const threadId = session?.threadId ?? null;
+	const queryClient = useQueryClient();
+	const { findByChatTurnId } = useAgentActionsQuery(threadId);
+	const [isReverting, setIsReverting] = useState(false);
+	const [confirmOpen, setConfirmOpen] = useState(false);
+	const [resultsOpen, setResultsOpen] = useState(false);
+	const [results, setResults] = useState<RevertTurnActionResult[]>([]);
+
+	const actions = useMemo(() => findByChatTurnId(chatTurnId), [findByChatTurnId, chatTurnId]);
+
+	const reversibleCount = useMemo(
+		() =>
+			actions.filter(
+				(a) =>
+					a.reversible &&
+					(a.reverted_by_action_id === null || a.reverted_by_action_id === undefined) &&
+					!a.is_revert_action &&
+					(a.error === null || a.error === undefined)
+			).length,
+		[actions]
+	);
+	const totalCount = useMemo(() => actions.filter((a) => !a.is_revert_action).length, [actions]);
+
+	if (!chatTurnId) return null;
+	if (reversibleCount === 0) return null;
+	if (!threadId) return null;
+
+	const handleRevertTurn = async () => {
+		setIsReverting(true);
+		try {
+			const response = await agentActionsApiService.revertTurn(threadId, chatTurnId);
+			setResults(response.results);
+			const revertedEntries = response.results
+				.filter((r) => r.status === "reverted" || r.status === "already_reverted")
+				.map((r) => ({ id: r.action_id, newActionId: r.new_action_id ?? null }));
+			if (revertedEntries.length > 0) {
+				applyRevertTurnResultsToCache(queryClient, threadId, revertedEntries);
+			}
+			if (response.status === "ok") {
+				toast.success(
+					response.reverted === 1 ? "Reverted 1 action." : `Reverted ${response.reverted} actions.`
+				);
+			} else {
+				// Every "not undone" bucket counts as a failure for the
+				// user-facing summary. ``skipped`` rows are batch
+				// artefacts (revert rows themselves) and intentionally
+				// excluded from the failure tally.
+				const failureCount =
+					response.failed + response.not_reversible + (response.permission_denied ?? 0);
+				toast.warning(
+					`Reverted ${response.reverted} of ${response.total}. ${failureCount} could not be undone.`
+				);
+				setResultsOpen(true);
+			}
+		} catch (err) {
+			if (err instanceof AppError && err.status === 503) {
+				return;
+			}
+			const message =
+				err instanceof AppError
+					? err.message
+					: err instanceof Error
+						? err.message
+						: "Failed to revert turn.";
+			toast.error(message);
+		} finally {
+			setIsReverting(false);
+			setConfirmOpen(false);
+		}
+	};
+
+	return (
+		<>
+			<AlertDialog open={confirmOpen} onOpenChange={setConfirmOpen}>
+				{variant === "menu-item" ? (
+					<ActionBarMorePrimitive.Item
+						className="focus:bg-accent focus:text-accent-foreground relative flex cursor-default items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none"
+						onSelect={(e) => {
+							e.preventDefault();
+							setConfirmOpen(true);
+						}}
+					>
+						<RotateCcw className="size-3.5" />
+						<span>Revert turn</span>
+						<span className="ml-auto text-xs tabular-nums opacity-70">
+							{reversibleCount}/{totalCount}
+						</span>
+					</ActionBarMorePrimitive.Item>
+				) : (
+					<AlertDialogTrigger asChild>
+						<Button
+							size="sm"
+							variant="ghost"
+							className="text-muted-foreground hover:text-accent-foreground gap-1.5"
+							onClick={(e) => {
+								e.stopPropagation();
+								setConfirmOpen(true);
+							}}
+						>
+							<RotateCcw className="size-3.5" />
+							<span>Revert turn</span>
+							<span className="text-xs tabular-nums opacity-70">
+								{reversibleCount}/{totalCount}
+							</span>
+						</Button>
+					</AlertDialogTrigger>
+				)}
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Revert this turn?</AlertDialogTitle>
+						<AlertDialogDescription>
+							This will undo {reversibleCount} of {totalCount} action
+							{totalCount === 1 ? "" : "s"} from this turn in reverse order. The chat history and
+							any read-only actions are preserved. Some rows may not be reversible — partial success
+							is normal.
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel disabled={isReverting}>Cancel</AlertDialogCancel>
+						<AlertDialogAction
+							onClick={(e) => {
+								e.preventDefault();
+								handleRevertTurn();
+							}}
+							disabled={isReverting}
+						>
+							{isReverting ? "Reverting…" : "Revert turn"}
+						</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
+
+			<AlertDialog open={resultsOpen} onOpenChange={setResultsOpen}>
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Revert results</AlertDialogTitle>
+						<AlertDialogDescription>
+							Some actions could not be reverted. Review per-row outcomes below.
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<ul className="max-h-72 overflow-y-auto space-y-2 text-sm">
+						{results.map((r) => (
+							<RevertResultRow key={r.action_id} result={r} />
+						))}
+					</ul>
+					<AlertDialogFooter>
+						<AlertDialogAction onClick={() => setResultsOpen(false)}>Close</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
+		</>
+	);
+}
+
+function RevertResultRow({ result }: { result: RevertTurnActionResult }) {
+	const isOk = result.status === "reverted" || result.status === "already_reverted";
+	const Icon = isOk ? CheckIcon : XCircleIcon;
+	return (
+		<li className="flex items-start gap-2 rounded-md border bg-muted/30 px-3 py-2">
+			<Icon
+				className={cn("size-4 mt-0.5 shrink-0", isOk ? "text-emerald-500" : "text-destructive")}
+			/>
+			<div className="min-w-0 flex-1">
+				<p className="font-medium truncate">
+					{getToolDisplayName(result.tool_name)}{" "}
+					<span className="ml-1 text-xs text-muted-foreground">
+						{result.status.replace(/_/g, " ")}
+					</span>
+				</p>
+				{(result.message || result.error) && (
+					<p className="text-xs text-muted-foreground mt-0.5">{result.error ?? result.message}</p>
+				)}
+			</div>
+		</li>
+	);
+}
diff --git a/surfsense_web/components/assistant-ui/step-separator.tsx b/surfsense_web/components/assistant-ui/step-separator.tsx
new file mode 100644
index 0000000000..f591306617
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/step-separator.tsx
@@ -0,0 +1,27 @@
+"use client";
+
+import { makeAssistantDataUI } from "@assistant-ui/react";
+
+/**
+ * Renders a thin horizontal divider between model steps within a single
+ * assistant turn. The data part is pushed by `addStepSeparator` in
+ * `streaming-state.ts` whenever a `start-step` SSE event arrives after
+ * the message already has non-step content.
+ *
+ * Today the backend emits one `start-step` / `finish-step` pair per turn,
+ * so most messages won't contain a separator. The renderer is wired up so
+ * the planned per-model-step refactor (A2 follow-up) can light up without
+ * touching the persistence path.
+ */
+function StepSeparatorDataRenderer() {
+	return (
+		<div className="mx-auto my-3 w-full max-w-(--thread-max-width) px-2">
+			<div className="border-t border-border/60" />
+		</div>
+	);
+}
+
+export const StepSeparatorDataUI = makeAssistantDataUI({
+	name: "step-separator",
+	render: StepSeparatorDataRenderer,
+});
diff --git a/surfsense_web/components/assistant-ui/thinking-steps.tsx b/surfsense_web/components/assistant-ui/thinking-steps.tsx
deleted file mode 100644
index df1cef12ca..0000000000
--- a/surfsense_web/components/assistant-ui/thinking-steps.tsx
+++ /dev/null
@@ -1,175 +0,0 @@
-import { makeAssistantDataUI, useAuiState } from "@assistant-ui/react";
-import { ChevronRightIcon } from "lucide-react";
-import type { FC } from "react";
-import { useCallback, useEffect, useState } from "react";
-import { ChainOfThoughtItem } from "@/components/prompt-kit/chain-of-thought";
-import { TextShimmerLoader } from "@/components/prompt-kit/loader";
-import { cn } from "@/lib/utils";
-
-export interface ThinkingStep {
-	id: string;
-	title: string;
-	items: string[];
-	status: "pending" | "in_progress" | "completed";
-}
-
-/**
- * Chain of thought display component - single collapsible dropdown design
- */
-export const ThinkingStepsDisplay: FC<{ steps: ThinkingStep[]; isThreadRunning?: boolean }> = ({
-	steps,
-	isThreadRunning = true,
-}) => {
-	const getEffectiveStatus = useCallback(
-		(step: ThinkingStep): "pending" | "in_progress" | "completed" => {
-			if (step.status === "in_progress" && !isThreadRunning) {
-				return "completed";
-			}
-			return step.status;
-		},
-		[isThreadRunning]
-	);
-
-	const inProgressStep = steps.find((s) => getEffectiveStatus(s) === "in_progress");
-	const allCompleted =
-		steps.length > 0 &&
-		!isThreadRunning &&
-		steps.every((s) => getEffectiveStatus(s) === "completed");
-	const isProcessing = isThreadRunning && !allCompleted;
-	const [isOpen, setIsOpen] = useState(() => isProcessing);
-
-	useEffect(() => {
-		if (isProcessing) {
-			setIsOpen(true);
-			return;
-		}
-
-		if (allCompleted) {
-			setIsOpen(false);
-		}
-	}, [allCompleted, isProcessing]);
-
-	if (steps.length === 0) return null;
-
-	const getHeaderText = () => {
-		if (allCompleted) {
-			return "Reviewed";
-		}
-		if (inProgressStep) {
-			return inProgressStep.title;
-		}
-		if (isProcessing) {
-			return "Processing";
-		}
-		return "Reviewed";
-	};
-
-	return (
-		<div className="mx-auto w-full max-w-(--thread-max-width) px-2 py-2">
-			<div className="rounded-lg">
-				<button
-					type="button"
-					onClick={() => setIsOpen((prev) => !prev)}
-					className={cn(
-						"flex w-full items-center gap-1.5 text-left text-sm transition-colors",
-						"text-muted-foreground hover:text-foreground"
-					)}
-				>
-					{isProcessing ? (
-						<TextShimmerLoader text={getHeaderText()} size="sm" />
-					) : (
-						<span>{getHeaderText()}</span>
-					)}
-
-					<ChevronRightIcon
-						className={cn("size-4 transition-transform duration-200", isOpen && "rotate-90")}
-					/>
-				</button>
-
-				<div
-					className={cn(
-						"grid transition-[grid-template-rows] duration-300 ease-out",
-						isOpen ? "grid-rows-[1fr]" : "grid-rows-[0fr]"
-					)}
-				>
-					<div className="overflow-hidden">
-						<div className="mt-3 pl-1">
-							{steps.map((step, index) => {
-								const effectiveStatus = getEffectiveStatus(step);
-								const isLast = index === steps.length - 1;
-
-								return (
-									<div key={step.id} className="relative flex gap-3">
-										<div className="relative flex flex-col items-center w-2">
-											{!isLast && (
-												<div className="absolute left-1/2 top-[15px] -bottom-[7px] w-px -translate-x-1/2 bg-muted-foreground/30" />
-											)}
-											<div className="relative z-10 mt-[7px] flex shrink-0 items-center justify-center">
-												{effectiveStatus === "in_progress" ? (
-													<span className="relative flex size-2">
-														<span className="absolute inline-flex size-full animate-ping rounded-full bg-primary/60" />
-														<span className="relative inline-flex size-2 rounded-full bg-primary" />
-													</span>
-												) : (
-													<span className="size-2 rounded-full bg-muted-foreground/30" />
-												)}
-											</div>
-										</div>
-
-										<div className="flex-1 min-w-0 pb-4">
-											<div
-												className={cn(
-													"text-sm leading-5",
-													effectiveStatus === "in_progress" && "text-foreground font-medium",
-													effectiveStatus === "completed" && "text-muted-foreground",
-													effectiveStatus === "pending" && "text-muted-foreground/60"
-												)}
-											>
-												{step.title}
-											</div>
-
-											{step.items && step.items.length > 0 && (
-												<div className="mt-1 space-y-0.5">
-													{step.items.map((item) => (
-														<ChainOfThoughtItem key={`${step.id}-${item}`} className="text-xs">
-															{item}
-														</ChainOfThoughtItem>
-													))}
-												</div>
-											)}
-										</div>
-									</div>
-								);
-							})}
-						</div>
-					</div>
-				</div>
-			</div>
-		</div>
-	);
-};
-
-/**
- * assistant-ui data UI component that renders thinking steps from message content.
- * Registered globally via makeAssistantDataUI — renders inside MessagePrimitive.Parts
- * at the position of the data part in the content array.
- */
-function ThinkingStepsDataRenderer({ data }: { name: string; data: unknown }) {
-	const isThreadRunning = useAuiState(({ thread }) => thread.isRunning);
-	const isLastMessage = useAuiState(({ message }) => message?.isLast ?? false);
-	const isMessageStreaming = isThreadRunning && isLastMessage;
-
-	const steps = (data as { steps: ThinkingStep[] } | null)?.steps ?? [];
-	if (steps.length === 0) return null;
-
-	return (
-		<div className="mb-3 -mx-2 leading-normal">
-			<ThinkingStepsDisplay steps={steps} isThreadRunning={isMessageStreaming} />
-		</div>
-	);
-}
-
-export const ThinkingStepsDataUI = makeAssistantDataUI({
-	name: "thinking-steps",
-	render: ThinkingStepsDataRenderer,
-});
diff --git a/surfsense_web/components/assistant-ui/thread-list.tsx b/surfsense_web/components/assistant-ui/thread-list.tsx
deleted file mode 100644
index bca36c037d..0000000000
--- a/surfsense_web/components/assistant-ui/thread-list.tsx
+++ /dev/null
@@ -1,298 +0,0 @@
-"use client";
-
-import {
-	ArchiveIcon,
-	MessageSquareIcon,
-	MoreVerticalIcon,
-	PlusIcon,
-	RotateCcwIcon,
-	TrashIcon,
-} from "lucide-react";
-import { useRouter } from "next/navigation";
-import { memo, useCallback, useEffect, useMemo, useState } from "react";
-import { Button } from "@/components/ui/button";
-import {
-	DropdownMenu,
-	DropdownMenuContent,
-	DropdownMenuItem,
-	DropdownMenuSeparator,
-	DropdownMenuTrigger,
-} from "@/components/ui/dropdown-menu";
-import {
-	createThreadListManager,
-	type ThreadListItem,
-	type ThreadListState,
-} from "@/lib/chat/thread-persistence";
-import { cn } from "@/lib/utils";
-
-interface ThreadListProps {
-	searchSpaceId: number;
-	currentThreadId?: number;
-	className?: string;
-}
-
-export function ThreadList({ searchSpaceId, currentThreadId, className }: ThreadListProps) {
-	const router = useRouter();
-	const [state, setState] = useState<ThreadListState>({
-		threads: [],
-		archivedThreads: [],
-		isLoading: true,
-		error: null,
-	});
-	const [showArchived, setShowArchived] = useState(false);
-
-	// Create the thread list manager
-	const manager = useCallback(
-		() =>
-			createThreadListManager({
-				searchSpaceId,
-				currentThreadId: currentThreadId ?? null,
-				onThreadSwitch: (threadId) => {
-					router.push(`/dashboard/${searchSpaceId}/new-chat/${threadId}`);
-				},
-				onNewThread: (threadId) => {
-					router.push(`/dashboard/${searchSpaceId}/new-chat/${threadId}`);
-				},
-			}),
-		[searchSpaceId, currentThreadId, router]
-	);
-
-	// Load threads on mount and when searchSpaceId changes
-	const loadThreads = useCallback(async () => {
-		setState((prev) => ({ ...prev, isLoading: true }));
-		const newState = await manager().loadThreads();
-		setState(newState);
-	}, [manager]);
-
-	useEffect(() => {
-		loadThreads();
-	}, [loadThreads]);
-
-	// Handle new thread creation
-	const handleNewThread = async () => {
-		await manager().createNewThread();
-		await loadThreads();
-	};
-
-	// Handle thread actions
-	const handleArchive = async (threadId: number) => {
-		const success = await manager().archiveThread(threadId);
-		if (success) await loadThreads();
-	};
-
-	const handleUnarchive = async (threadId: number) => {
-		const success = await manager().unarchiveThread(threadId);
-		if (success) await loadThreads();
-	};
-
-	const handleDelete = async (threadId: number) => {
-		const success = await manager().deleteThread(threadId);
-		if (success) {
-			await loadThreads();
-			// If we deleted the current thread, redirect to new chat
-			if (threadId === currentThreadId) {
-				router.push(`/dashboard/${searchSpaceId}/new-chat`);
-			}
-		}
-	};
-
-	const handleSwitchToThread = (threadId: number) => {
-		manager().switchToThread(threadId);
-	};
-
-	const displayedThreads = showArchived ? state.archivedThreads : state.threads;
-
-	if (state.isLoading) {
-		return (
-			<div className={cn("flex h-full flex-col", className)}>
-				<div className="flex items-center justify-center p-4">
-					<span className="text-muted-foreground text-sm">Loading threads...</span>
-				</div>
-			</div>
-		);
-	}
-
-	if (state.error) {
-		return (
-			<div className={cn("flex h-full flex-col", className)}>
-				<div className="p-4 text-center">
-					<span className="text-destructive text-sm">{state.error}</span>
-					<Button variant="ghost" size="sm" className="mt-2" onClick={loadThreads}>
-						Retry
-					</Button>
-				</div>
-			</div>
-		);
-	}
-
-	return (
-		<div className={cn("flex h-full flex-col", className)}>
-			{/* Header with New Chat button */}
-			<div className="flex items-center justify-between border-b p-3">
-				<h2 className="font-semibold text-sm">Conversations</h2>
-				<Button
-					variant="ghost"
-					size="icon"
-					className="size-8"
-					onClick={handleNewThread}
-					title="New Chat"
-				>
-					<PlusIcon className="size-4" />
-				</Button>
-			</div>
-
-			{/* Tab toggle for active/archived */}
-			<div className="flex border-b">
-				<button
-					type="button"
-					onClick={() => setShowArchived(false)}
-					className={cn(
-						"flex-1 px-3 py-2 text-center text-xs font-medium transition-colors",
-						!showArchived
-							? "border-b-2 border-primary text-primary"
-							: "text-muted-foreground hover:text-foreground"
-					)}
-				>
-					Active ({state.threads.length})
-				</button>
-				<button
-					type="button"
-					onClick={() => setShowArchived(true)}
-					className={cn(
-						"flex-1 px-3 py-2 text-center text-xs font-medium transition-colors",
-						showArchived
-							? "border-b-2 border-primary text-primary"
-							: "text-muted-foreground hover:text-foreground"
-					)}
-				>
-					Archived ({state.archivedThreads.length})
-				</button>
-			</div>
-
-			{/* Thread list */}
-			<div className="flex-1 overflow-y-auto">
-				{displayedThreads.length === 0 ? (
-					<div className="flex flex-col items-center justify-center p-6 text-center">
-						<MessageSquareIcon className="mb-2 size-8 text-muted-foreground/50" />
-						<p className="text-muted-foreground text-sm">
-							{showArchived ? "No archived conversations" : "No conversations yet"}
-						</p>
-						{!showArchived && (
-							<Button variant="outline" size="sm" className="mt-3" onClick={handleNewThread}>
-								<PlusIcon className="mr-1 size-3" />
-								Start a conversation
-							</Button>
-						)}
-					</div>
-				) : (
-					<div className="space-y-1 p-2">
-						{displayedThreads.map((thread) => (
-							<ThreadListItemComponent
-								key={thread.id}
-								thread={thread}
-								isActive={thread.id === currentThreadId}
-								isArchived={showArchived}
-								onClick={() => handleSwitchToThread(thread.id)}
-								onArchive={() => handleArchive(thread.id)}
-								onUnarchive={() => handleUnarchive(thread.id)}
-								onDelete={() => handleDelete(thread.id)}
-							/>
-						))}
-					</div>
-				)}
-			</div>
-		</div>
-	);
-}
-
-interface ThreadListItemComponentProps {
-	thread: ThreadListItem;
-	isActive: boolean;
-	isArchived: boolean;
-	onClick: () => void;
-	onArchive: () => void;
-	onUnarchive: () => void;
-	onDelete: () => void;
-}
-
-const ThreadListItemComponent = memo(function ThreadListItemComponent({
-	thread,
-	isActive,
-	isArchived,
-	onClick,
-	onArchive,
-	onUnarchive,
-	onDelete,
-}: ThreadListItemComponentProps) {
-	const relativeTime = useMemo(
-		() => formatRelativeTime(new Date(thread.updatedAt)),
-		[thread.updatedAt]
-	);
-
-	return (
-		<button
-			type="button"
-			className={cn(
-				"group flex w-full items-center gap-2 rounded-lg px-3 py-2 transition-colors cursor-pointer text-left",
-				isActive ? "bg-accent text-accent-foreground" : "hover:bg-muted/50"
-			)}
-			onClick={onClick}
-		>
-			<MessageSquareIcon className="size-4 shrink-0 text-muted-foreground" />
-			<div className="flex-1 min-w-0">
-				<p className="truncate text-sm font-medium">{thread.title || "New Chat"}</p>
-				<p className="truncate text-xs text-muted-foreground">{relativeTime}</p>
-			</div>
-			<DropdownMenu>
-				<DropdownMenuTrigger asChild>
-					<Button
-						variant="ghost"
-						size="icon"
-						className="size-7 opacity-0 group-hover:opacity-100 transition-opacity"
-						onClick={(e) => e.stopPropagation()}
-					>
-						<MoreVerticalIcon className="size-4" />
-					</Button>
-				</DropdownMenuTrigger>
-				<DropdownMenuContent align="end">
-					{isArchived ? (
-						<DropdownMenuItem onClick={onUnarchive}>
-							<RotateCcwIcon className="mr-2 size-4" />
-							Unarchive
-						</DropdownMenuItem>
-					) : (
-						<DropdownMenuItem onClick={onArchive}>
-							<ArchiveIcon className="mr-2 size-4" />
-							Archive
-						</DropdownMenuItem>
-					)}
-					<DropdownMenuSeparator />
-					<DropdownMenuItem onClick={onDelete}>
-						<TrashIcon className="mr-2 size-4" />
-						Delete
-					</DropdownMenuItem>
-				</DropdownMenuContent>
-			</DropdownMenu>
-		</button>
-	);
-});
-
-/**
- * Format a date as relative time (e.g., "2 hours ago", "Yesterday")
- */
-function formatRelativeTime(date: Date): string {
-	const now = new Date();
-	const diffMs = now.getTime() - date.getTime();
-	const diffSecs = Math.floor(diffMs / 1000);
-	const diffMins = Math.floor(diffSecs / 60);
-	const diffHours = Math.floor(diffMins / 60);
-	const diffDays = Math.floor(diffHours / 24);
-
-	if (diffSecs < 60) return "Just now";
-	if (diffMins < 60) return `${diffMins} min${diffMins === 1 ? "" : "s"} ago`;
-	if (diffHours < 24) return `${diffHours} hour${diffHours === 1 ? "" : "s"} ago`;
-	if (diffDays === 1) return "Yesterday";
-	if (diffDays < 7) return `${diffDays} days ago`;
-
-	return date.toLocaleDateString();
-}
diff --git a/surfsense_web/components/assistant-ui/thread-scroll-to-bottom.tsx b/surfsense_web/components/assistant-ui/thread-scroll-to-bottom.tsx
deleted file mode 100644
index 394ba5d795..0000000000
--- a/surfsense_web/components/assistant-ui/thread-scroll-to-bottom.tsx
+++ /dev/null
@@ -1,18 +0,0 @@
-import { ThreadPrimitive } from "@assistant-ui/react";
-import { ArrowDownIcon } from "lucide-react";
-import type { FC } from "react";
-import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
-
-export const ThreadScrollToBottom: FC = () => {
-	return (
-		<ThreadPrimitive.ScrollToBottom asChild>
-			<TooltipIconButton
-				tooltip="Scroll to bottom"
-				variant="outline"
-				className="aui-thread-scroll-to-bottom -top-12 absolute z-10 self-center rounded-full p-4 disabled:invisible dark:bg-main-panel dark:hover:bg-accent"
-			>
-				<ArrowDownIcon />
-			</TooltipIconButton>
-		</ThreadPrimitive.ScrollToBottom>
-	);
-};
diff --git a/surfsense_web/components/assistant-ui/thread.tsx b/surfsense_web/components/assistant-ui/thread.tsx
index e1351cb180..6876ce23e8 100644
--- a/surfsense_web/components/assistant-ui/thread.tsx
+++ b/surfsense_web/components/assistant-ui/thread.tsx
@@ -5,17 +5,15 @@ import {
 	ThreadPrimitive,
 	useAui,
 	useAuiState,
-	useThreadViewportStore,
 } from "@assistant-ui/react";
 import { useAtom, useAtomValue, useSetAtom } from "jotai";
 import {
 	AlertCircle,
-	ArrowDownIcon,
 	ArrowUpIcon,
+	Camera,
 	ChevronDown,
-	ChevronUp,
+	ChevronRight,
 	Clipboard,
-	Dot,
 	Globe,
 	Plus,
 	Settings2,
@@ -28,8 +26,7 @@ import {
 import { AnimatePresence, motion } from "motion/react";
 import Image from "next/image";
 import { useParams } from "next/navigation";
-import { type FC, useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from "react";
-import { createPortal } from "react-dom";
+import { type FC, useCallback, useEffect, useMemo, useRef, useState } from "react";
 import {
 	agentToolsAtom,
 	disabledToolsAtom,
@@ -37,13 +34,18 @@ import {
 	toggleToolAtom,
 } from "@/atoms/agent-tools/agent-tools.atoms";
 import { chatSessionStateAtom } from "@/atoms/chat/chat-session-state.atom";
+import { currentThreadAtom } from "@/atoms/chat/current-thread.atom";
 import {
+	type MentionedDocumentInfo,
 	mentionedDocumentsAtom,
-	sidebarSelectedDocumentsAtom,
 } from "@/atoms/chat/mentioned-documents.atom";
+import { pendingUserImageDataUrlsAtom } from "@/atoms/chat/pending-user-images.atom";
+import {
+	clearPremiumAlertForThreadAtom,
+	premiumAlertByThreadAtom,
+} from "@/atoms/chat/premium-alert.atom";
 import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms";
 import { connectorsAtom } from "@/atoms/connectors/connector-query.atoms";
-import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms";
 import { membersAtom } from "@/atoms/members/members-query.atoms";
 import {
 	globalNewLLMConfigsAtom,
@@ -53,15 +55,16 @@ import {
 import { currentUserAtom } from "@/atoms/user/user-query.atoms";
 import { AssistantMessage } from "@/components/assistant-ui/assistant-message";
 import { ChatSessionStatus } from "@/components/assistant-ui/chat-session-status";
+import { ChatViewport } from "@/components/assistant-ui/chat-viewport";
 import { ConnectorIndicator } from "@/components/assistant-ui/connector-popup";
 import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
 import {
 	InlineMentionEditor,
 	type InlineMentionEditorRef,
+	type MentionedDocument,
 } from "@/components/assistant-ui/inline-mention-editor";
 import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
 import { UserMessage } from "@/components/assistant-ui/user-message";
-import { SLIDEOUT_PANEL_OPENED_EVENT } from "@/components/layout/ui/sidebar/SidebarSlideOutPanel";
 import {
 	DocumentMentionPicker,
 	type DocumentMentionPickerRef,
@@ -69,30 +72,43 @@ import {
 import { PromptPicker, type PromptPickerRef } from "@/components/new-chat/prompt-picker";
 import { Avatar, AvatarFallback, AvatarGroup } from "@/components/ui/avatar";
 import { Button } from "@/components/ui/button";
-import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
+import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
+import {
+	Drawer,
+	DrawerContent,
+	DrawerHandle,
+	DrawerHeader,
+	DrawerTitle,
+} from "@/components/ui/drawer";
 import {
 	DropdownMenu,
 	DropdownMenuContent,
 	DropdownMenuItem,
+	DropdownMenuPortal,
+	DropdownMenuSub,
+	DropdownMenuSubContent,
+	DropdownMenuSubTrigger,
 	DropdownMenuTrigger,
 } from "@/components/ui/dropdown-menu";
-import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { Skeleton } from "@/components/ui/skeleton";
 import { Switch } from "@/components/ui/switch";
-import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
 import {
 	CONNECTOR_ICON_TO_TYPES,
 	CONNECTOR_TOOL_ICON_PATHS,
+	getToolDisplayName,
 	getToolIcon,
 } from "@/contracts/enums/toolIcons";
-import type { Document } from "@/contracts/types/document.types";
 import { useBatchCommentsPreload } from "@/hooks/use-comments";
 import { useCommentsSync } from "@/hooks/use-comments-sync";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { useElectronAPI } from "@/hooks/use-platform";
+import { captureDisplayToPngDataUrl } from "@/lib/chat/display-media-capture";
+import { getMentionDocKey } from "@/lib/chat/mention-doc-key";
+import { SLIDEOUT_PANEL_OPENED_EVENT } from "@/lib/layout-events";
 import { cn } from "@/lib/utils";
 
-const COMPOSER_PLACEHOLDER = "Ask anything · Type / for prompts · Type @ to mention docs";
+const COMPOSER_PLACEHOLDER = "Ask anything, type / for prompts, type @ to mention docs";
 
 export const Thread: FC = () => {
 	return <ThreadContent />;
@@ -103,13 +119,16 @@ const ThreadContent: FC = () => {
 		<ThreadPrimitive.Root
 			className="aui-root aui-thread-root @container flex h-full min-h-0 flex-col bg-main-panel"
 			style={{
-				["--thread-max-width" as string]: "44rem",
+				["--thread-max-width" as string]: "42rem",
 			}}
 		>
-			<ThreadPrimitive.Viewport
-				turnAnchor="top"
-				className="aui-thread-viewport relative flex flex-1 min-h-0 flex-col overflow-y-auto px-4 pt-4"
-				style={{ scrollbarGutter: "stable" }}
+			<ChatViewport
+				footer={
+					<AuiIf condition={({ thread }) => !thread.isEmpty}>
+						<PremiumQuotaPinnedAlert />
+						<Composer />
+					</AuiIf>
+				}
 			>
 				<AuiIf condition={({ thread }) => thread.isEmpty}>
 					<ThreadWelcome />
@@ -122,70 +141,65 @@ const ThreadContent: FC = () => {
 						AssistantMessage,
 					}}
 				/>
-
-				<ThreadPrimitive.ViewportFooter
-					className="aui-thread-viewport-footer sticky bottom-0 z-10 mx-auto flex w-full max-w-(--thread-max-width) flex-col gap-4 overflow-visible rounded-t-3xl bg-main-panel pb-4 md:pb-6"
-					style={{ paddingBottom: "max(1rem, env(safe-area-inset-bottom))" }}
-				>
-					<ThreadScrollToBottom />
-					<AuiIf condition={({ thread }) => !thread.isEmpty}>
-						<div className="fade-in slide-in-from-bottom-4 animate-in duration-500 ease-out fill-mode-both">
-							<Composer />
-						</div>
-					</AuiIf>
-				</ThreadPrimitive.ViewportFooter>
-			</ThreadPrimitive.Viewport>
+			</ChatViewport>
 		</ThreadPrimitive.Root>
 	);
 };
 
-const ThreadScrollToBottom: FC = () => {
+const PremiumQuotaPinnedAlert: FC = () => {
+	const currentThreadState = useAtomValue(currentThreadAtom);
+	const alertsByThread = useAtomValue(premiumAlertByThreadAtom);
+	const clearPremiumAlertForThread = useSetAtom(clearPremiumAlertForThreadAtom);
+
+	const currentThreadId = currentThreadState?.id;
+	if (!currentThreadId) return null;
+
+	const alert = alertsByThread[currentThreadId];
+	if (!alert) return null;
+
 	return (
-		<ThreadPrimitive.ScrollToBottom asChild>
-			<TooltipIconButton
-				tooltip="Scroll to bottom"
-				variant="outline"
-				className="aui-thread-scroll-to-bottom -top-12 absolute z-10 self-center rounded-full p-4 disabled:invisible dark:bg-main-panel dark:hover:bg-accent"
-			>
-				<ArrowDownIcon />
-			</TooltipIconButton>
-		</ThreadPrimitive.ScrollToBottom>
+		<div className="mx-0 overflow-hidden rounded-2xl border-input bg-muted px-4 py-4 text-foreground select-none">
+			<div className="flex items-center gap-2">
+				<AlertCircle className="size-4 shrink-0 text-muted-foreground" />
+				<div className="min-w-0 flex-1">
+					<p className="text-sm">{alert.message}</p>
+				</div>
+				<Button
+					type="button"
+					variant="ghost"
+					size="icon"
+					className="size-6 text-muted-foreground hover:bg-transparent hover:text-accent-foreground"
+					aria-label="Dismiss premium quota alert"
+					onClick={() => clearPremiumAlertForThread(currentThreadId)}
+				>
+					<X className="size-4" />
+				</Button>
+			</div>
+		</div>
 	);
 };
 
 const getTimeBasedGreeting = (user?: { display_name?: string | null; email?: string }): string => {
 	const hour = new Date().getHours();
 
-	// Extract first name: prefer display_name, fall back to email extraction
 	let firstName: string | null = null;
-
 	if (user?.display_name?.trim()) {
-		// Use display_name if available and not empty
-		// Extract first name from display_name (take first word)
 		const nameParts = user.display_name.trim().split(/\s+/);
 		firstName = nameParts[0].charAt(0).toUpperCase() + nameParts[0].slice(1).toLowerCase();
 	} else if (user?.email) {
-		// Fall back to email extraction if display_name is not available
 		firstName =
 			user.email.split("@")[0].split(".")[0].charAt(0).toUpperCase() +
 			user.email.split("@")[0].split(".")[0].slice(1);
 	}
 
-	// Array of greeting variations for each time period
 	const morningGreetings = ["Good morning", "Fresh start today", "Morning", "Hey there"];
-
 	const afternoonGreetings = ["Good afternoon", "Afternoon", "Hey there", "Hi there"];
-
 	const eveningGreetings = ["Good evening", "Evening", "Hey there", "Hi there"];
-
 	const nightGreetings = ["Good night", "Evening", "Hey there", "Winding down"];
-
 	const lateNightGreetings = ["Still up", "Night owl mode", "Up past bedtime", "Hi there"];
 
-	// Select a random greeting based on time
 	let greeting: string;
 	if (hour < 5) {
-		// Late night: midnight to 5 AM
 		greeting = lateNightGreetings[Math.floor(Math.random() * lateNightGreetings.length)];
 	} else if (hour < 12) {
 		greeting = morningGreetings[Math.floor(Math.random() * morningGreetings.length)];
@@ -194,33 +208,23 @@ const getTimeBasedGreeting = (user?: { display_name?: string | null; email?: str
 	} else if (hour < 22) {
 		greeting = eveningGreetings[Math.floor(Math.random() * eveningGreetings.length)];
 	} else {
-		// Night: 10 PM to midnight
 		greeting = nightGreetings[Math.floor(Math.random() * nightGreetings.length)];
 	}
 
-	// Add personalization with first name if available
-	if (firstName) {
-		return `${greeting}, ${firstName}!`;
-	}
-
-	return `${greeting}!`;
+	return firstName ? `${greeting}, ${firstName}!` : `${greeting}!`;
 };
 
 const ThreadWelcome: FC = () => {
 	const { data: user } = useAtomValue(currentUserAtom);
-
-	// Memoize greeting so it doesn't change on re-renders (only on user change)
 	const greeting = useMemo(() => getTimeBasedGreeting(user), [user]);
 
 	return (
 		<div className="aui-thread-welcome-root mx-auto flex w-full max-w-(--thread-max-width) grow flex-col items-center px-4 relative">
-			{/* Greeting positioned above the composer */}
 			<div className="aui-thread-welcome-message absolute bottom-[calc(50%+5rem)] left-0 right-0 flex flex-col items-center text-center">
-				<h1 className="aui-thread-welcome-message-inner text-3xl md:text-5xl select-none">
+				<h1 className="aui-thread-welcome-message-inner text-3xl md:text-[2.625rem] select-none">
 					{greeting}
 				</h1>
 			</div>
-			{/* Composer - top edge fixed, expands downward only */}
 			<div className="w-full flex items-start justify-center absolute top-[calc(50%-3.5rem)] left-0 right-0">
 				<Composer />
 			</div>
@@ -238,57 +242,120 @@ const BANNER_CONNECTORS = [
 
 const BANNER_DISMISSED_KEY = "surfsense-connect-tools-banner-dismissed";
 
-const ConnectToolsBanner: FC<{ isThreadEmpty: boolean }> = ({ isThreadEmpty }) => {
+const ConnectToolsBanner: FC<{
+	isThreadEmpty: boolean;
+	onVisibleChange?: (visible: boolean) => void;
+}> = ({ isThreadEmpty, onVisibleChange }) => {
 	const { data: connectors } = useAtomValue(connectorsAtom);
 	const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom);
 	const [dismissed, setDismissed] = useState(() => {
 		if (typeof window === "undefined") return false;
 		return localStorage.getItem(BANNER_DISMISSED_KEY) === "true";
 	});
+	const [dismissRequested, setDismissRequested] = useState(false);
 
 	const hasConnectors = (connectors?.length ?? 0) > 0;
+	const isVisible = !dismissed && !hasConnectors && isThreadEmpty;
+	const shouldShowTray = isVisible && !dismissRequested;
 
-	if (dismissed || hasConnectors || !isThreadEmpty) return null;
+	useEffect(() => {
+		onVisibleChange?.(isVisible);
+	}, [isVisible, onVisibleChange]);
 
 	const handleDismiss = (e: React.MouseEvent) => {
 		e.stopPropagation();
-		setDismissed(true);
-		localStorage.setItem(BANNER_DISMISSED_KEY, "true");
+		setDismissRequested(true);
 	};
 
 	return (
-		<div className="border-t border-border/50">
-			<div className="flex w-full items-center gap-2.5 px-4 py-2.5">
-				<button
-					type="button"
-					className="flex flex-1 items-center gap-2.5 text-left cursor-pointer select-none"
-					onClick={() => setConnectorDialogOpen(true)}
+		<AnimatePresence
+			initial={false}
+			onExitComplete={() => {
+				if (!dismissRequested) return;
+				setDismissed(true);
+				localStorage.setItem(BANNER_DISMISSED_KEY, "true");
+			}}
+		>
+			{shouldShowTray ? (
+				<motion.div
+					key="connect-tools-tray"
+					initial={{ opacity: 0, y: -10 }}
+					animate={{ opacity: 1, y: 0 }}
+					exit={{ opacity: 0, y: -14 }}
+					transition={{ duration: 0.18, ease: "easeOut" }}
+					className="relative z-0 -mt-5 flex min-w-0 items-center gap-2 rounded-b-3xl border border-input bg-muted/40 px-4 pt-7 pb-3 shadow-sm shadow-black/5 dark:shadow-black/10"
 				>
-					<Unplug className="size-4 text-muted-foreground shrink-0" />
-					<span className="text-[13px] text-muted-foreground/80 flex-1">Connect your tools</span>
+					<Button
+						type="button"
+						variant="ghost"
+						size="sm"
+						className="h-7 min-w-0 cursor-pointer justify-start gap-2 rounded-md px-0 text-[13px] font-normal text-muted-foreground select-none hover:bg-transparent hover:text-foreground"
+						onClick={() => setConnectorDialogOpen(true)}
+					>
+						<Unplug className="size-4 shrink-0" />
+						<span className="truncate">Connect your tools</span>
+					</Button>
+					<div className="min-w-0 flex-1" />
 					<AvatarGroup className="shrink-0">
 						{BANNER_CONNECTORS.map(({ type }, i) => (
 							<Avatar
 								key={type}
-								className="size-6"
+								className="size-5"
 								style={{ zIndex: BANNER_CONNECTORS.length - i }}
 							>
-								<AvatarFallback className="bg-muted text-[10px]">
-									{getConnectorIcon(type, "size-3.5")}
+								<AvatarFallback className="bg-accent text-[10px]">
+									{getConnectorIcon(type, "size-3")}
 								</AvatarFallback>
 							</Avatar>
 						))}
 					</AvatarGroup>
-				</button>
-				<button
-					type="button"
-					onClick={handleDismiss}
-					className="shrink-0 ml-0.5 p-1.5 -mr-1 text-muted-foreground/40 hover:text-foreground transition-colors cursor-pointer"
-					aria-label="Dismiss"
+					<Button
+						type="button"
+						onClick={handleDismiss}
+						variant="ghost"
+						size="icon"
+						className="size-7 shrink-0 cursor-pointer rounded-md text-muted-foreground hover:bg-transparent hover:text-foreground"
+						aria-label="Dismiss"
+					>
+						<X className="size-3.5" />
+					</Button>
+				</motion.div>
+			) : null}
+		</AnimatePresence>
+	);
+};
+
+const PendingScreenImageStrip: FC = () => {
+	const [urls, setUrls] = useAtom(pendingUserImageDataUrlsAtom);
+	if (urls.length === 0) return null;
+	return (
+		<div className="mx-3 mt-2 flex flex-wrap gap-2">
+			{urls.map((url, index) => (
+				<div
+					key={url}
+					className="group relative h-14 w-14 shrink-0 overflow-hidden rounded-md border border-border/50 bg-muted"
 				>
-					<X className="size-3.5 text-muted-foreground" />
-				</button>
-			</div>
+					<Image
+						src={url}
+						alt="Pending screenshot preview"
+						fill
+						sizes="56px"
+						className="object-cover"
+						draggable={false}
+						unoptimized
+					/>
+					<Button
+						type="button"
+						onClick={() => setUrls((prev) => prev.filter((_, i) => i !== index))}
+						variant="ghost"
+						size="icon"
+						className="absolute right-0.5 top-0.5 size-5 rounded-full bg-background/90 text-muted-foreground shadow-sm transition-opacity hover:bg-background/90 hover:text-accent-foreground sm:opacity-0 sm:group-hover:opacity-100"
+						aria-label="Remove screenshot"
+					>
+						<X className="size-3" />
+					</Button>
+				</div>
+			))}
 		</div>
 	);
 };
@@ -305,21 +372,27 @@ const ClipboardChip: FC<{ text: string; onDismiss: () => void }> = ({ text, onDi
 				<span className="text-xs font-medium text-muted-foreground">From clipboard</span>
 				<div className="flex-1" />
 				{isLong && (
-					<button
+					<Button
 						type="button"
 						onClick={() => setExpanded((v) => !v)}
-						className="flex items-center text-muted-foreground hover:text-foreground transition-colors"
+						variant="ghost"
+						size="icon"
+						className="size-5 text-muted-foreground hover:bg-transparent hover:text-accent-foreground"
 					>
-						{expanded ? <ChevronUp className="size-3.5" /> : <ChevronDown className="size-3.5" />}
-					</button>
+						<ChevronDown
+							className={cn("size-3.5 transition-transform", expanded && "rotate-180")}
+						/>
+					</Button>
 				)}
-				<button
+				<Button
 					type="button"
 					onClick={onDismiss}
-					className="flex items-center text-muted-foreground hover:text-foreground transition-colors"
+					variant="ghost"
+					size="icon"
+					className="size-5 text-muted-foreground hover:bg-transparent hover:text-accent-foreground"
 				>
 					<X className="size-3.5" />
-				</button>
+				</Button>
 			</div>
 			<div className="px-3 pb-2">
 				<p className="text-xs text-foreground/80 whitespace-pre-wrap wrap-break-word leading-relaxed">
@@ -331,61 +404,20 @@ const ClipboardChip: FC<{ text: string; onDismiss: () => void }> = ({ text, onDi
 };
 
 const Composer: FC = () => {
-	// Document mention state (atoms persist across component remounts)
 	const [mentionedDocuments, setMentionedDocuments] = useAtom(mentionedDocumentsAtom);
-	const setSidebarDocs = useSetAtom(sidebarSelectedDocumentsAtom);
 	const [showDocumentPopover, setShowDocumentPopover] = useState(false);
 	const [showPromptPicker, setShowPromptPicker] = useState(false);
 	const [mentionQuery, setMentionQuery] = useState("");
 	const [actionQuery, setActionQuery] = useState("");
-	const [containerPos, setContainerPos] = useState({ bottom: "200px", left: "50%", top: "auto" });
 	const editorRef = useRef<InlineMentionEditorRef>(null);
-	const editorContainerRef = useRef<HTMLDivElement>(null);
-	const composerBoxRef = useRef<HTMLDivElement>(null);
+	const prevMentionedDocsRef = useRef<Map<string, MentionedDocumentInfo>>(new Map());
 	const documentPickerRef = useRef<DocumentMentionPickerRef>(null);
 	const promptPickerRef = useRef<PromptPickerRef>(null);
-	const viewportRef = useRef<Element | null>(null);
 	const { search_space_id, chat_id } = useParams();
 	const aui = useAui();
-	const threadViewportStore = useThreadViewportStore();
-	const hasAutoFocusedRef = useRef(false);
-	const submitCleanupRef = useRef<(() => void) | null>(null);
-
-	useEffect(() => {
-		return () => {
-			submitCleanupRef.current?.();
-		};
-	}, []);
-
-	// Store viewport element reference on mount
-	useEffect(() => {
-		viewportRef.current = document.querySelector(".aui-thread-viewport");
-	}, []);
-
-	// Compute picker positions using ResizeObserver to avoid layout reads during render
-	useLayoutEffect(() => {
-		if (!editorContainerRef.current) return;
-
-		const updatePosition = () => {
-			if (!editorContainerRef.current) return;
-			const rect = editorContainerRef.current.getBoundingClientRect();
-			const composerRect = composerBoxRef.current?.getBoundingClientRect();
-			setContainerPos({
-				bottom: `${window.innerHeight - rect.top + 8}px`,
-				left: `${rect.left}px`,
-				top: composerRect ? `${composerRect.bottom + 8}px` : "auto",
-			});
-		};
-
-		updatePosition();
-		const ro = new ResizeObserver(updatePosition);
-		ro.observe(editorContainerRef.current);
-		if (composerBoxRef.current) {
-			ro.observe(composerBoxRef.current);
-		}
-
-		return () => ro.disconnect();
-	}, []);
+	// Desktop-only auto-focus; on mobile, programmatic focus would
+	// summon the soft keyboard on every picker close / thread switch.
+	const isDesktop = useMediaQuery("(min-width: 640px)");
 
 	const electronAPI = useElectronAPI();
 	const [clipboardInitialText, setClipboardInitialText] = useState<string | undefined>();
@@ -393,7 +425,7 @@ const Composer: FC = () => {
 	useEffect(() => {
 		if (!electronAPI || clipboardLoadedRef.current) return;
 		clipboardLoadedRef.current = true;
-		electronAPI.getQuickAskText().then((text) => {
+		electronAPI.getQuickAskText().then((text: string) => {
 			if (text) {
 				setClipboardInitialText(text);
 			}
@@ -402,10 +434,10 @@ const Composer: FC = () => {
 
 	const isThreadEmpty = useAuiState(({ thread }) => thread.isEmpty);
 	const isThreadRunning = useAuiState(({ thread }) => thread.isRunning);
+	const [connectToolsTrayVisible, setConnectToolsTrayVisible] = useState(false);
 
 	const currentPlaceholder = COMPOSER_PLACEHOLDER;
 
-	// Live collaboration state
 	const { data: currentUser } = useAtomValue(currentUserAtom);
 	const { data: members } = useAtomValue(membersAtom);
 	const threadId = useMemo(() => {
@@ -419,13 +451,11 @@ const Composer: FC = () => {
 	const respondingToUserId = sessionState?.respondingToUserId ?? null;
 	const isBlockedByOtherUser = isAiResponding && respondingToUserId !== currentUser?.id;
 
-	// Sync comments for the entire thread via Zero (one subscription per thread)
+	// One Zero subscription per thread for comment sync.
 	useCommentsSync(threadId);
 
-	// Batch-prefetch comments for all assistant messages so individual useComments
-	// hooks never fire their own network requests (eliminates N+1 API calls).
-	// Return a primitive string from the selector so useSyncExternalStore can
-	// compare snapshots by value and avoid infinite re-render loops.
+	// Batch-prefetch assistant message comments to avoid N+1 fetches.
+	// Returns a primitive string so useSyncExternalStore can compare by value.
 	const assistantIdsKey = useAuiState(({ thread }) =>
 		thread.messages
 			.filter((m) => m.role === "assistant" && m.id?.startsWith("msg-"))
@@ -438,18 +468,17 @@ const Composer: FC = () => {
 	);
 	useBatchCommentsPreload(assistantDbMessageIds);
 
-	// Auto-focus editor on new chat page after mount
+	// Always-focused composer: refocus whenever no picker has taken
+	// over input. ``threadId`` is in the deps so the effect re-fires
+	// on thread switch (Composer instance is reused).
 	useEffect(() => {
-		if (isThreadEmpty && !hasAutoFocusedRef.current && editorRef.current) {
-			const timeoutId = setTimeout(() => {
-				editorRef.current?.focus();
-				hasAutoFocusedRef.current = true;
-			}, 100);
-			return () => clearTimeout(timeoutId);
-		}
-	}, [isThreadEmpty]);
+		if (!isDesktop) return;
+		if (showDocumentPopover || showPromptPicker) return;
+		void threadId;
+		editorRef.current?.focus();
+	}, [isDesktop, showDocumentPopover, showPromptPicker, threadId]);
 
-	// Close document picker when a slide-out panel (inbox, shared/private chats) opens
+	// Close document picker when a slide-out panel (inbox, etc.) opens.
 	useEffect(() => {
 		const handler = () => {
 			setShowDocumentPopover(false);
@@ -459,21 +488,41 @@ const Composer: FC = () => {
 		return () => window.removeEventListener(SLIDEOUT_PANEL_OPENED_EVENT, handler);
 	}, []);
 
-	// Sync editor text with assistant-ui composer runtime
+	// Sync editor text into assistant-ui's composer and mirror the chip
+	// atom from the editor's reported ``docs``. The editor is the
+	// single source of truth, so this catches every Plate deletion path
+	// (Backspace, X button, Cmd+Backspace, range-delete, cut,
+	// paste-over) without per-keybinding plumbing. The ``prev``
+	// short-circuit keeps pure-text keystrokes from churning the atom.
 	const handleEditorChange = useCallback(
-		(text: string) => {
+		(text: string, docs: MentionedDocument[]) => {
 			aui.composer().setText(text);
+			setMentionedDocuments((prev) => {
+				if (prev.length === docs.length) {
+					const editorKeys = new Set(docs.map((d) => getMentionDocKey(d)));
+					if (prev.every((d) => editorKeys.has(getMentionDocKey(d)))) {
+						return prev;
+					}
+				}
+				return docs.map<MentionedDocumentInfo>((d) => ({
+					id: d.id,
+					title: d.title,
+					// Atom requires a string; ``"UNKNOWN"`` matches the
+					// sentinel ``getMentionDocKey`` and the editor's
+					// match predicates use.
+					document_type: d.document_type ?? "UNKNOWN",
+					kind: d.kind,
+				}));
+			});
 		},
-		[aui]
+		[aui, setMentionedDocuments]
 	);
 
-	// Open document picker when @ mention is triggered
 	const handleMentionTrigger = useCallback((query: string) => {
 		setShowDocumentPopover(true);
 		setMentionQuery(query);
 	}, []);
 
-	// Close document picker and reset query
 	const handleMentionClose = useCallback(() => {
 		if (showDocumentPopover) {
 			setShowDocumentPopover(false);
@@ -481,13 +530,11 @@ const Composer: FC = () => {
 		}
 	}, [showDocumentPopover]);
 
-	// Open action picker when / is triggered
 	const handleActionTrigger = useCallback((query: string) => {
 		setShowPromptPicker(true);
 		setActionQuery(query);
 	}, []);
 
-	// Close action picker and reset query
 	const handleActionClose = useCallback(() => {
 		if (showPromptPicker) {
 			setShowPromptPicker(false);
@@ -531,7 +578,7 @@ const Composer: FC = () => {
 		[clipboardInitialText, electronAPI, aui]
 	);
 
-	// Keyboard navigation for document/action picker (arrow keys, Enter, Escape)
+	// Arrow / Enter / Escape navigation for the active picker.
 	const handleKeyDown = useCallback(
 		(e: React.KeyboardEvent) => {
 			if (showPromptPicker) {
@@ -584,80 +631,20 @@ const Composer: FC = () => {
 		[showDocumentPopover, showPromptPicker]
 	);
 
-	// Submit message (blocked during streaming, document picker open, or AI responding to another user)
 	const handleSubmit = useCallback(() => {
-		if (isThreadRunning || isBlockedByOtherUser) {
-			return;
-		}
-		if (!showDocumentPopover && !showPromptPicker) {
-			if (clipboardInitialText) {
-				const userText = editorRef.current?.getText() ?? "";
-				const combined = userText ? `${userText}\n\n${clipboardInitialText}` : clipboardInitialText;
-				aui.composer().setText(combined);
-				setClipboardInitialText(undefined);
-			}
-			aui.composer().send();
-			editorRef.current?.clear();
-			setMentionedDocuments([]);
-			setSidebarDocs([]);
-		}
 		if (isThreadRunning || isBlockedByOtherUser) return;
-		if (showDocumentPopover) return;
+		if (showDocumentPopover || showPromptPicker) return;
 
-		const viewportEl = viewportRef.current;
-		const heightBefore = viewportEl?.scrollHeight ?? 0;
+		if (clipboardInitialText) {
+			const userText = editorRef.current?.getText() ?? "";
+			const combined = userText ? `${userText}\n\n${clipboardInitialText}` : clipboardInitialText;
+			aui.composer().setText(combined);
+			setClipboardInitialText(undefined);
+		}
 
 		aui.composer().send();
 		editorRef.current?.clear();
 		setMentionedDocuments([]);
-		setSidebarDocs([]);
-
-		// With turnAnchor="top", ViewportSlack adds min-height to the last
-		// assistant message so that scrolling-to-bottom actually positions the
-		// user message at the TOP of the viewport. That slack height is
-		// calculated asynchronously (ResizeObserver → style → layout).
-		//
-		// We poll via rAF for ~2 s, re-scrolling whenever scrollHeight changes
-		// (user msg render → assistant placeholder → ViewportSlack min-height →
-		// first streamed content). Backup setTimeout calls cover cases where
-		// the batcher's 50 ms throttle delays the DOM update past the rAF.
-		const scrollToBottom = () =>
-			threadViewportStore.getState().scrollToBottom({ behavior: "instant" });
-
-		let lastHeight = heightBefore;
-		let frames = 0;
-		let cancelled = false;
-		const POLL_FRAMES = 120;
-
-		const pollAndScroll = () => {
-			if (cancelled) return;
-			const el = viewportRef.current;
-			if (el) {
-				const h = el.scrollHeight;
-				if (h !== lastHeight) {
-					lastHeight = h;
-					scrollToBottom();
-				}
-			}
-			if (++frames < POLL_FRAMES) {
-				requestAnimationFrame(pollAndScroll);
-			}
-		};
-		requestAnimationFrame(pollAndScroll);
-
-		const t1 = setTimeout(scrollToBottom, 100);
-		const t2 = setTimeout(scrollToBottom, 300);
-		const t3 = setTimeout(scrollToBottom, 600);
-
-		// Cleanup if component unmounts during the polling window. The ref is
-		// checked inside pollAndScroll; timeouts are cleared in the return below.
-		// Store cleanup fn so it can be called from a useEffect cleanup if needed.
-		submitCleanupRef.current = () => {
-			cancelled = true;
-			clearTimeout(t1);
-			clearTimeout(t2);
-			clearTimeout(t3);
-		};
 	}, [
 		showDocumentPopover,
 		showPromptPicker,
@@ -666,126 +653,142 @@ const Composer: FC = () => {
 		clipboardInitialText,
 		aui,
 		setMentionedDocuments,
-		setSidebarDocs,
-		threadViewportStore,
 	]);
 
 	const handleDocumentRemove = useCallback(
 		(docId: number, docType?: string) => {
-			setMentionedDocuments((prev) =>
-				prev.filter((doc) => !(doc.id === docId && doc.document_type === docType))
-			);
+			setMentionedDocuments((prev) => {
+				if (!docType) {
+					// Fallback when chip type is unavailable.
+					return prev.filter((doc) => doc.id !== docId);
+				}
+				const removedKey = getMentionDocKey({ id: docId, document_type: docType });
+				return prev.filter((doc) => getMentionDocKey(doc) !== removedKey);
+			});
 		},
 		[setMentionedDocuments]
 	);
 
-	const handleDocumentsMention = useCallback(
-		(documents: Pick<Document, "id" | "title" | "document_type">[]) => {
-			const existingKeys = new Set(mentionedDocuments.map((d) => `${d.document_type}:${d.id}`));
-			const newDocs = documents.filter(
-				(doc) => !existingKeys.has(`${doc.document_type}:${doc.id}`)
-			);
+	const handleDocumentsMention = useCallback((mentions: MentionedDocumentInfo[]) => {
+		const editorMentionedDocs = editorRef.current?.getMentionedDocuments() ?? [];
+		const editorDocKeys = new Set(editorMentionedDocs.map((doc) => getMentionDocKey(doc)));
 
-			for (const doc of newDocs) {
-				editorRef.current?.insertDocumentChip(doc);
-			}
+		for (const mention of mentions) {
+			const key = getMentionDocKey(mention);
+			if (editorDocKeys.has(key)) continue;
+			editorRef.current?.insertMentionChip(mention);
+			// Track within the loop so a duplicate-in-batch can't double-insert.
+			editorDocKeys.add(key);
+		}
 
-			setMentionedDocuments((prev) => {
-				const existingKeySet = new Set(prev.map((d) => `${d.document_type}:${d.id}`));
-				const uniqueNewDocs = documents.filter(
-					(doc) => !existingKeySet.has(`${doc.document_type}:${doc.id}`)
-				);
-				return [...prev, ...uniqueNewDocs];
-			});
+		// Atom is reconciled by ``handleEditorChange`` via the editor's
+		// onChange — no second write path here.
+		setMentionQuery("");
+	}, []);
 
-			setMentionQuery("");
-		},
-		[mentionedDocuments, setMentionedDocuments]
-	);
+	useEffect(() => {
+		const editor = editorRef.current;
+		const nextDocsMap = new Map(mentionedDocuments.map((doc) => [getMentionDocKey(doc), doc]));
+		const prevDocsMap = prevMentionedDocsRef.current;
+
+		if (!editor) {
+			prevMentionedDocsRef.current = nextDocsMap;
+			return;
+		}
+
+		const editorKeys = new Set(editor.getMentionedDocuments().map(getMentionDocKey));
+
+		for (const [key, doc] of nextDocsMap) {
+			if (prevDocsMap.has(key) || editorKeys.has(key)) continue;
+			editor.insertMentionChip(doc, { removeTriggerText: false });
+		}
+
+		for (const [key, doc] of prevDocsMap) {
+			if (!nextDocsMap.has(key)) {
+				editor.removeDocumentChip(doc.id, doc.document_type);
+			}
+		}
+
+		prevMentionedDocsRef.current = nextDocsMap;
+	}, [mentionedDocuments]);
 
 	return (
-		<ComposerPrimitive.Root
-			className="aui-composer-root relative flex w-full flex-col gap-2"
-			style={showPromptPicker && clipboardInitialText ? { marginBottom: 220 } : undefined}
-		>
+		<ComposerPrimitive.Root className="aui-composer-root relative flex w-full flex-col gap-2 rounded-2xl">
 			<ChatSessionStatus
 				isAiResponding={isAiResponding}
 				respondingToUserId={respondingToUserId}
 				currentUserId={currentUser?.id ?? null}
 				members={members ?? []}
 			/>
-			<div
-				ref={composerBoxRef}
-				className="aui-composer-attachment-dropzone flex w-full flex-col overflow-hidden rounded-2xl border-input bg-muted pt-2 outline-none transition-shadow"
-			>
-				{clipboardInitialText && (
-					<ClipboardChip
-						text={clipboardInitialText}
-						onDismiss={() => setClipboardInitialText(undefined)}
+			{showDocumentPopover && (
+				<div className="absolute bottom-full left-0 z-[9999] mb-2">
+					<DocumentMentionPicker
+						ref={documentPickerRef}
+						searchSpaceId={Number(search_space_id)}
+						onSelectionChange={handleDocumentsMention}
+						onDone={() => {
+							setShowDocumentPopover(false);
+							setMentionQuery("");
+						}}
+						initialSelectedDocuments={mentionedDocuments}
+						externalSearch={mentionQuery}
 					/>
-				)}
-				{/* Inline editor with @mention support */}
-				<div ref={editorContainerRef} className="aui-composer-input-wrapper px-4 pt-3 pb-6">
-					<InlineMentionEditor
-						ref={editorRef}
-						placeholder={currentPlaceholder}
-						onMentionTrigger={handleMentionTrigger}
-						onMentionClose={handleMentionClose}
-						onActionTrigger={handleActionTrigger}
-						onActionClose={handleActionClose}
-						onChange={handleEditorChange}
-						onDocumentRemove={handleDocumentRemove}
-						onSubmit={handleSubmit}
-						onKeyDown={handleKeyDown}
-						className="min-h-[24px]"
+				</div>
+			)}
+			{showPromptPicker && (
+				<div
+					className={cn(
+						"absolute left-0 z-[9999]",
+						clipboardInitialText ? "top-full mt-2" : "bottom-full mb-2"
+					)}
+				>
+					<PromptPicker
+						ref={promptPickerRef}
+						onSelect={clipboardInitialText ? handleQuickAskSelect : handleActionSelect}
+						onDone={() => {
+							setShowPromptPicker(false);
+							setActionQuery("");
+						}}
+						externalSearch={actionQuery}
 					/>
 				</div>
-				{/* Document picker popover (portal to body for proper z-index stacking) */}
-				{showDocumentPopover &&
-					typeof document !== "undefined" &&
-					createPortal(
-						<DocumentMentionPicker
-							ref={documentPickerRef}
-							searchSpaceId={Number(search_space_id)}
-							onSelectionChange={handleDocumentsMention}
-							onDone={() => {
-								setShowDocumentPopover(false);
-								setMentionQuery("");
-							}}
-							initialSelectedDocuments={mentionedDocuments}
-							externalSearch={mentionQuery}
-							containerStyle={{
-								bottom: containerPos.bottom,
-								left: containerPos.left,
-							}}
-						/>,
-						document.body
+			)}
+			<div className="flex w-full flex-col">
+				<div
+					className={cn(
+						"aui-composer-attachment-dropzone relative z-10 flex w-full flex-col overflow-hidden rounded-3xl border border-input bg-muted pt-2 shadow-sm shadow-black/5 outline-none transition-shadow dark:shadow-black/10",
+						connectToolsTrayVisible && "rounded-b-3xl shadow-none dark:shadow-none"
 					)}
-				{showPromptPicker &&
-					typeof document !== "undefined" &&
-					createPortal(
-						<PromptPicker
-							ref={promptPickerRef}
-							onSelect={clipboardInitialText ? handleQuickAskSelect : handleActionSelect}
-							onDone={() => {
-								setShowPromptPicker(false);
-								setActionQuery("");
-							}}
-							externalSearch={actionQuery}
-							containerStyle={{
-								position: "fixed",
-								...(clipboardInitialText
-									? { top: containerPos.top }
-									: { bottom: containerPos.bottom }),
-								left: containerPos.left,
-								zIndex: 50,
-							}}
-						/>,
-						document.body
+				>
+					<PendingScreenImageStrip />
+					{clipboardInitialText && (
+						<ClipboardChip
+							text={clipboardInitialText}
+							onDismiss={() => setClipboardInitialText(undefined)}
+						/>
 					)}
-				<ComposerAction isBlockedByOtherUser={isBlockedByOtherUser} />
-				<ConnectorIndicator showTrigger={false} />
-				<ConnectToolsBanner isThreadEmpty={isThreadEmpty} />
+					<div className="aui-composer-input-wrapper px-4 pt-3 pb-6">
+						<InlineMentionEditor
+							ref={editorRef}
+							placeholder={currentPlaceholder}
+							onMentionTrigger={handleMentionTrigger}
+							onMentionClose={handleMentionClose}
+							onActionTrigger={handleActionTrigger}
+							onActionClose={handleActionClose}
+							onChange={handleEditorChange}
+							onDocumentRemove={handleDocumentRemove}
+							onSubmit={handleSubmit}
+							onKeyDown={handleKeyDown}
+							className="min-h-[24px]"
+						/>
+					</div>
+					<ComposerAction isBlockedByOtherUser={isBlockedByOtherUser} />
+					<ConnectorIndicator showTrigger={false} />
+				</div>
+				<ConnectToolsBanner
+					isThreadEmpty={isThreadEmpty}
+					onVisibleChange={setConnectToolsTrayVisible}
+				/>
 			</div>
 		</ComposerPrimitive.Root>
 	);
@@ -797,35 +800,31 @@ interface ComposerActionProps {
 
 const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false }) => {
 	const mentionedDocuments = useAtomValue(mentionedDocumentsAtom);
-	const sidebarDocs = useAtomValue(sidebarSelectedDocumentsAtom);
-	const setDocumentsSidebarOpen = useSetAtom(documentsSidebarOpenAtom);
 	const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom);
 	const [toolsPopoverOpen, setToolsPopoverOpen] = useState(false);
+	const [openConnectorSubmenu, setOpenConnectorSubmenu] = useState<string | null>(null);
+	const [expandedConnectorGroups, setExpandedConnectorGroups] = useState<Set<string>>(
+		() => new Set()
+	);
 	const isDesktop = useMediaQuery("(min-width: 640px)");
 	const { openDialog: openUploadDialog } = useDocumentUploadDialog();
-	const [toolsScrollPos, setToolsScrollPos] = useState<"top" | "middle" | "bottom">("top");
-	const toolsRafRef = useRef<number>();
-	const handleToolsScroll = useCallback((e: React.UIEvent<HTMLDivElement>) => {
-		const el = e.currentTarget;
-		if (toolsRafRef.current) return;
-		toolsRafRef.current = requestAnimationFrame(() => {
-			const atTop = el.scrollTop <= 2;
-			const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2;
-			setToolsScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle");
-			toolsRafRef.current = undefined;
-		});
-	}, []);
-	useEffect(
-		() => () => {
-			if (toolsRafRef.current) cancelAnimationFrame(toolsRafRef.current);
-		},
-		[]
-	);
+	const pendingScreenImages = useAtomValue(pendingUserImageDataUrlsAtom);
+	const setPendingScreenImages = useSetAtom(pendingUserImageDataUrlsAtom);
+	const electronAPI = useElectronAPI();
+
 	const isComposerTextEmpty = useAuiState(({ composer }) => {
 		const text = composer.text?.trim() || "";
 		return text.length === 0;
 	});
-	const isComposerEmpty = isComposerTextEmpty && mentionedDocuments.length === 0;
+	const isComposerEmpty =
+		isComposerTextEmpty && mentionedDocuments.length === 0 && pendingScreenImages.length === 0;
+
+	const handleScreenCapture = useCallback(async () => {
+		const url = electronAPI?.captureFullScreen
+			? await electronAPI.captureFullScreen()
+			: await captureDisplayToPngDataUrl();
+		if (url) setPendingScreenImages((prev) => [...prev, url]);
+	}, [electronAPI, setPendingScreenImages]);
 
 	const { data: userConfigs } = useAtomValue(newLLMConfigsAtom);
 	const { data: globalConfigs } = useAtomValue(globalNewLLMConfigsAtom);
@@ -855,6 +854,17 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 		},
 		[disabledToolsSet, setDisabledTools]
 	);
+	const setConnectorGroupExpanded = useCallback((label: string, expanded: boolean) => {
+		setExpandedConnectorGroups((prev) => {
+			const next = new Set(prev);
+			if (expanded) {
+				next.add(label);
+			} else {
+				next.delete(label);
+			}
+			return next;
+		});
+	}, []);
 
 	const hasWebSearchTool = agentTools?.some((t) => t.name === "web_search") ?? false;
 	const isWebSearchEnabled = hasWebSearchTool && !disabledToolsSet.has("web_search");
@@ -896,6 +906,9 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 
 		return result;
 	}, [filteredTools, connectedTypes]);
+	const regularToolGroups = groupedTools.filter((g) => !g.connectorIcon && g.label !== "Other");
+	const connectorToolGroups = groupedTools.filter((g) => g.connectorIcon);
+	const otherToolGroup = groupedTools.find((g) => !g.connectorIcon && g.label === "Other");
 
 	useEffect(() => {
 		hydrateDisabled();
@@ -915,7 +928,7 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 	const isSendDisabled = isComposerEmpty || !hasModelConfigured || isBlockedByOtherUser;
 
 	return (
-		<div className="aui-composer-action-wrapper relative mx-3 mb-2 flex items-center justify-between">
+		<div className="aui-composer-action-wrapper relative mx-3 mb-3 flex items-center justify-between">
 			<div className="flex items-center gap-1">
 				{!isDesktop ? (
 					<>
@@ -924,298 +937,464 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 								<Button
 									variant="ghost"
 									size="icon"
-									className="size-[34px] rounded-full p-1 font-semibold text-xs hover:bg-muted-foreground/15 dark:border-muted-foreground/15 dark:hover:bg-muted-foreground/30"
-									aria-label="More actions"
+									className="h-9 w-9 rounded-full p-0 font-semibold text-xs text-muted-foreground transition-colors dark:border-muted-foreground/15 hover:bg-foreground/10 hover:text-foreground"
+									aria-label="Upload files, manage tools and more"
 									data-joyride="connector-icon"
 								>
-									<Plus className="size-4" />
+									<Plus className="size-5" />
 								</Button>
 							</DropdownMenuTrigger>
 							<DropdownMenuContent side="bottom" align="start" sideOffset={8}>
-								<DropdownMenuItem onSelect={() => setToolsPopoverOpen(true)}>
-									<Settings2 className="size-4" />
-									Manage Tools
-								</DropdownMenuItem>
 								<DropdownMenuItem onSelect={() => openUploadDialog()}>
 									<Upload className="size-4" />
 									Upload Files
 								</DropdownMenuItem>
+								{hasWebSearchTool && (
+									<DropdownMenuItem
+										onSelect={(event) => {
+											event.preventDefault();
+											toggleTool("web_search");
+										}}
+									>
+										<Globe className="size-4" />
+										<span className="flex-1">Web Search</span>
+										<Switch
+											checked={isWebSearchEnabled}
+											tabIndex={-1}
+											className="pointer-events-none h-4 w-7 shrink-0 border [&>span]:h-3 [&>span]:w-3 [&>span[data-state=checked]]:translate-x-3"
+										/>
+									</DropdownMenuItem>
+								)}
+								<DropdownMenuItem onSelect={() => setConnectorDialogOpen(true)}>
+									<Unplug className="size-4" />
+									Manage Connectors
+								</DropdownMenuItem>
+								<DropdownMenuItem onSelect={() => setToolsPopoverOpen(true)}>
+									<Settings2 className="size-4" />
+									Manage Tools
+								</DropdownMenuItem>
 							</DropdownMenuContent>
 						</DropdownMenu>
-						<Drawer open={toolsPopoverOpen} onOpenChange={setToolsPopoverOpen}>
-							<DrawerContent className="max-h-[60dvh]">
+						<Drawer
+							open={toolsPopoverOpen}
+							onOpenChange={setToolsPopoverOpen}
+							shouldScaleBackground={false}
+						>
+							<DrawerContent className="h-[85vh] max-h-[85vh] z-80" overlayClassName="z-80">
 								<DrawerHandle />
-								<div className="px-4 py-2">
-									<DrawerTitle className="text-sm font-medium">Manage Tools</DrawerTitle>
-								</div>
-								<div className="overflow-y-auto pb-6" onScroll={handleToolsScroll}>
-									{groupedTools
-										.filter((g) => !g.connectorIcon)
-										.map((group) => (
-											<div key={group.label}>
-												<div className="px-4 pt-3 pb-1 text-xs text-muted-foreground/80 font-medium select-none">
-													{group.label}
-												</div>
-												{group.tools.map((tool) => {
-													const isDisabled = disabledToolsSet.has(tool.name);
-													const ToolIcon = getToolIcon(tool.name);
-													return (
-														<div
-															key={tool.name}
-															className="flex w-full items-center gap-3 px-4 py-2 hover:bg-muted-foreground/10 transition-colors"
-														>
-															<ToolIcon className="size-4 shrink-0 text-muted-foreground" />
-															<span className="flex-1 min-w-0 text-sm font-medium truncate">
-																{formatToolName(tool.name)}
-															</span>
-															<Switch
-																checked={!isDisabled}
-																onCheckedChange={() => toggleTool(tool.name)}
-																className="shrink-0"
-															/>
-														</div>
-													);
-												})}
+								<DrawerHeader className="px-4 pb-3 pt-2">
+									<DrawerTitle className="flex items-center justify-center gap-2 text-base font-semibold">
+										Manage Tools
+									</DrawerTitle>
+								</DrawerHeader>
+								<div className="min-h-0 flex-1 overflow-y-auto scrollbar-thin pb-6">
+									{regularToolGroups.map((group) => (
+										<div key={group.label}>
+											<div className="px-4 pt-3 pb-1 text-xs text-muted-foreground/80 font-medium select-none">
+												{group.label}
 											</div>
-										))}
-									{groupedTools.some((g) => g.connectorIcon) && (
+											{group.tools.map((tool) => {
+												const isDisabled = disabledToolsSet.has(tool.name);
+												const ToolIcon = getToolIcon(tool.name);
+												return (
+													<div
+														key={tool.name}
+														className="flex w-full items-center gap-3 px-4 py-2 hover:bg-accent hover:text-accent-foreground transition-colors"
+													>
+														<ToolIcon className="size-4 shrink-0 text-muted-foreground" />
+														<span className="flex-1 min-w-0 text-sm font-medium truncate">
+															{formatToolName(tool.name)}
+														</span>
+														<Switch
+															checked={!isDisabled}
+															onCheckedChange={() => toggleTool(tool.name)}
+															className="shrink-0"
+														/>
+													</div>
+												);
+											})}
+										</div>
+									))}
+									{connectorToolGroups.length > 0 && (
 										<div>
 											<div className="px-4 pt-3 pb-1 text-xs text-muted-foreground/80 font-medium select-none">
 												Connector Actions
 											</div>
-											{groupedTools
-												.filter((g) => g.connectorIcon)
-												.map((group) => {
-													const iconKey = group.connectorIcon ?? "";
-													const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey];
-													const toolNames = group.tools.map((t) => t.name);
-													const allDisabled = toolNames.every((n) => disabledToolsSet.has(n));
-													return (
-														<div
-															key={group.label}
-															className="flex w-full items-center gap-3 px-4 py-2 hover:bg-muted-foreground/10 transition-colors"
-														>
-															{iconInfo ? (
-																<Image
-																	src={iconInfo.src}
-																	alt={iconInfo.alt}
-																	width={18}
-																	height={18}
-																	className="size-[18px] shrink-0 select-none pointer-events-none"
-																	draggable={false}
-																/>
-															) : (
-																<Wrench className="size-4 shrink-0 text-muted-foreground" />
-															)}
-															<span className="flex-1 min-w-0 text-sm font-medium truncate">
-																{group.label}
-															</span>
+											{connectorToolGroups.map((group) => {
+												const iconKey = group.connectorIcon ?? "";
+												const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey];
+												const toolNames = group.tools.map((t) => t.name);
+												const allDisabled = toolNames.every((n) => disabledToolsSet.has(n));
+												const isExpanded = expandedConnectorGroups.has(group.label);
+												return (
+													<Collapsible
+														key={group.label}
+														open={isExpanded}
+														onOpenChange={(open) => setConnectorGroupExpanded(group.label, open)}
+													>
+														<div className="flex w-full items-center gap-3 px-4 py-2 hover:bg-accent hover:text-accent-foreground transition-colors">
+															<CollapsibleTrigger asChild>
+																<button
+																	type="button"
+																	className="flex min-w-0 flex-1 items-center gap-3 text-left"
+																>
+																	{iconInfo ? (
+																		<Image
+																			src={iconInfo.src}
+																			alt={iconInfo.alt}
+																			width={18}
+																			height={18}
+																			className="size-[18px] shrink-0 select-none pointer-events-none"
+																			draggable={false}
+																		/>
+																	) : (
+																		<Wrench className="size-4 shrink-0 text-muted-foreground" />
+																	)}
+																	<span className="min-w-0 flex-1 truncate text-sm font-medium">
+																		{group.label}
+																	</span>
+																	{isExpanded ? (
+																		<ChevronDown className="size-4 shrink-0 text-muted-foreground" />
+																	) : (
+																		<ChevronRight className="size-4 shrink-0 text-muted-foreground" />
+																	)}
+																</button>
+															</CollapsibleTrigger>
 															<Switch
 																checked={!allDisabled}
 																onCheckedChange={() => toggleToolGroup(toolNames)}
 																className="shrink-0"
 															/>
 														</div>
-													);
-												})}
+														<CollapsibleContent className="pb-1">
+															{group.tools.map((tool) => {
+																const isDisabled = disabledToolsSet.has(tool.name);
+																return (
+																	<div
+																		key={tool.name}
+																		className={cn(
+																			"ml-8 flex items-center gap-3 px-4 py-1.5 rounded-md transition-colors",
+																			"hover:bg-accent hover:text-accent-foreground",
+																			!isDisabled && "text-primary"
+																		)}
+																	>
+																		<span className="min-w-0 flex-1 truncate text-sm">
+																			{formatToolName(tool.name)}
+																		</span>
+																		<Switch
+																			checked={!isDisabled}
+																			onCheckedChange={() => toggleTool(tool.name)}
+																			className="shrink-0"
+																		/>
+																	</div>
+																);
+															})}
+														</CollapsibleContent>
+													</Collapsible>
+												);
+											})}
+										</div>
+									)}
+									{otherToolGroup && (
+										<div>
+											<div className="px-4 pt-3 pb-1 text-xs text-muted-foreground/80 font-medium select-none">
+												{otherToolGroup.label}
+											</div>
+											{otherToolGroup.tools.map((tool) => {
+												const isDisabled = disabledToolsSet.has(tool.name);
+												const ToolIcon = getToolIcon(tool.name);
+												return (
+													<div
+														key={tool.name}
+														className="flex w-full items-center gap-3 px-4 py-2 hover:bg-accent hover:text-accent-foreground transition-colors"
+													>
+														<ToolIcon className="size-4 shrink-0 text-muted-foreground" />
+														<span className="flex-1 min-w-0 text-sm font-medium truncate">
+															{formatToolName(tool.name)}
+														</span>
+														<Switch
+															checked={!isDisabled}
+															onCheckedChange={() => toggleTool(tool.name)}
+															className="shrink-0"
+														/>
+													</div>
+												);
+											})}
 										</div>
 									)}
 									{!filteredTools?.length && (
-										<div className="px-4 py-6 text-center text-sm text-muted-foreground">
-											Loading tools...
+										<div className="px-4 pt-3 pb-2">
+											<Skeleton className="h-3 w-16 mb-2" />
+											{["t1", "t2", "t3", "t4"].map((k) => (
+												<div key={k} className="flex items-center gap-3 py-2">
+													<Skeleton className="size-4 rounded shrink-0" />
+													<Skeleton className="h-3.5 flex-1" />
+													<Skeleton className="h-5 w-9 rounded-full shrink-0" />
+												</div>
+											))}
+											<Skeleton className="h-3 w-24 mt-3 mb-2" />
+											{["c1", "c2", "c3"].map((k) => (
+												<div key={k} className="flex items-center gap-3 py-2">
+													<Skeleton className="size-4 rounded shrink-0" />
+													<Skeleton className="h-3.5 flex-1" />
+													<Skeleton className="h-5 w-9 rounded-full shrink-0" />
+												</div>
+											))}
 										</div>
 									)}
 								</div>
 							</DrawerContent>
 						</Drawer>
-						<Button
-							variant="ghost"
-							size="icon"
-							className="size-[34px] rounded-full p-1 font-semibold text-xs hover:bg-muted-foreground/15 dark:border-muted-foreground/15 dark:hover:bg-muted-foreground/30"
-							aria-label="Manage connectors"
-							onClick={() => setConnectorDialogOpen(true)}
-						>
-							<Unplug className="size-4" />
-						</Button>
 					</>
 				) : (
-					<Popover open={toolsPopoverOpen} onOpenChange={setToolsPopoverOpen}>
-						<PopoverTrigger asChild>
+					<DropdownMenu
+						onOpenChange={(open) => {
+							if (!open) {
+								setToolsPopoverOpen(false);
+								setOpenConnectorSubmenu(null);
+							}
+						}}
+					>
+						<DropdownMenuTrigger asChild>
 							<TooltipIconButton
-								tooltip="Manage tools"
+								tooltip="Upload files, manage tools and more"
 								side="bottom"
 								disableTooltip={toolsPopoverOpen}
 								variant="ghost"
 								size="icon"
-								className="size-[34px] rounded-full p-1 font-semibold text-xs hover:bg-muted-foreground/15 dark:border-muted-foreground/15 dark:hover:bg-muted-foreground/30"
-								aria-label="Manage tools"
+								className="h-9 w-9 rounded-full p-0 font-semibold text-xs text-muted-foreground transition-colors dark:border-muted-foreground/15 hover:bg-foreground/10 hover:text-foreground"
+								aria-label="Upload files, manage tools and more"
 								data-joyride="connector-icon"
 							>
-								<Settings2 className="size-4" />
+								<Plus className="size-5" />
 							</TooltipIconButton>
-						</PopoverTrigger>
-						<PopoverContent
+						</DropdownMenuTrigger>
+						<DropdownMenuContent
+							className="w-48"
 							side="bottom"
 							align="start"
-							sideOffset={12}
-							className="w-[calc(100vw-2rem)] max-w-56 sm:max-w-72 sm:w-72 p-0 select-none"
-							onOpenAutoFocus={(e) => e.preventDefault()}
+							sideOffset={8}
+							onCloseAutoFocus={(event) => event.preventDefault()}
 						>
-							<div className="sr-only">Manage Tools</div>
-							<div
-								className="max-h-48 sm:max-h-64 overflow-y-auto overscroll-none py-0.5 sm:py-1"
-								onScroll={handleToolsScroll}
-								style={{
-									maskImage: `linear-gradient(to bottom, ${toolsScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${toolsScrollPos === "bottom" ? "black" : "transparent"})`,
-									WebkitMaskImage: `linear-gradient(to bottom, ${toolsScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${toolsScrollPos === "bottom" ? "black" : "transparent"})`,
+							<DropdownMenuItem onSelect={() => openUploadDialog()}>
+								<Upload className="h-4 w-4" />
+								Upload Files
+							</DropdownMenuItem>
+							<DropdownMenuItem onSelect={() => void handleScreenCapture()}>
+								<Camera className="h-4 w-4" />
+								Take a screenshot
+							</DropdownMenuItem>
+							{hasWebSearchTool && (
+								<DropdownMenuItem
+									onSelect={(event) => {
+										event.preventDefault();
+										toggleTool("web_search");
+									}}
+									className={cn(
+										"hover:bg-accent hover:text-accent-foreground",
+										isWebSearchEnabled && "text-primary"
+									)}
+								>
+									<Globe className="h-4 w-4" />
+									<span className="flex-1 min-w-0 truncate">Web Search</span>
+									<Switch
+										checked={isWebSearchEnabled}
+										tabIndex={-1}
+										className="pointer-events-none h-4 w-7 shrink-0 border [&>span]:h-3 [&>span]:w-3 [&>span[data-state=checked]]:translate-x-3"
+									/>
+								</DropdownMenuItem>
+							)}
+							<DropdownMenuSub
+								open={toolsPopoverOpen}
+								onOpenChange={(open) => {
+									setToolsPopoverOpen(open);
+									if (!open) setOpenConnectorSubmenu(null);
 								}}
 							>
-								{groupedTools
-									.filter((g) => !g.connectorIcon)
-									.map((group) => (
-										<div key={group.label}>
-											<div className="px-2.5 sm:px-3 pt-2 pb-0.5 text-[10px] sm:text-xs text-muted-foreground/80 font-normal select-none">
-												{group.label}
+								<DropdownMenuSubTrigger>
+									<Settings2 className="h-4 w-4" />
+									Manage Tools
+								</DropdownMenuSubTrigger>
+								<DropdownMenuPortal>
+									<DropdownMenuSubContent
+										alignOffset={-192}
+										collisionPadding={8}
+										className="w-60 h-56 gap-1 overflow-y-auto overscroll-none"
+										onScroll={() => setOpenConnectorSubmenu(null)}
+									>
+										{regularToolGroups.map((group) => (
+											<div key={group.label}>
+												<div className="px-2 pt-1.5 pb-0.5 text-[10px] text-muted-foreground/80 font-normal select-none">
+													{group.label}
+												</div>
+												{group.tools.map((tool) => {
+													const isDisabled = disabledToolsSet.has(tool.name);
+													const ToolIcon = getToolIcon(tool.name);
+													return (
+														<DropdownMenuItem
+															key={tool.name}
+															onSelect={(e) => {
+																e.preventDefault();
+																toggleTool(tool.name);
+															}}
+															className={cn(
+																"mb-1 last:mb-0 transition-all",
+																"hover:bg-accent hover:text-accent-foreground",
+																!isDisabled && "text-primary"
+															)}
+														>
+															<ToolIcon className="h-4 w-4" />
+															<span className="flex-1 min-w-0 truncate">
+																{formatToolName(tool.name)}
+															</span>
+															<Switch
+																checked={!isDisabled}
+																tabIndex={-1}
+																className="pointer-events-none shrink-0 scale-[0.6]"
+															/>
+														</DropdownMenuItem>
+													);
+												})}
 											</div>
-											{group.tools.map((tool) => {
-												const isDisabled = disabledToolsSet.has(tool.name);
-												const ToolIcon = getToolIcon(tool.name);
-												const row = (
-													<div className="flex w-full items-center gap-2 sm:gap-3 px-2.5 sm:px-3 py-1 sm:py-1.5 hover:bg-muted-foreground/10 transition-colors">
-														<ToolIcon className="size-3.5 sm:size-4 shrink-0 text-muted-foreground" />
-														<span className="flex-1 min-w-0 text-xs sm:text-sm font-medium truncate">
-															{formatToolName(tool.name)}
-														</span>
-														<Switch
-															checked={!isDisabled}
-															onCheckedChange={() => toggleTool(tool.name)}
-															className="shrink-0 scale-[0.6] sm:scale-75"
-														/>
-													</div>
-												);
-												return (
-													<Tooltip key={tool.name}>
-														<TooltipTrigger asChild>{row}</TooltipTrigger>
-														<TooltipContent side="right" className="max-w-64 text-xs">
-															{tool.description}
-														</TooltipContent>
-													</Tooltip>
-												);
-											})}
-										</div>
-									))}
-								{groupedTools.some((g) => g.connectorIcon) && (
-									<div>
-										<div className="px-2.5 sm:px-3 pt-2 pb-0.5 text-[10px] sm:text-xs text-muted-foreground/80 font-normal select-none">
-											Connector Actions
-										</div>
-										{groupedTools
-											.filter((g) => g.connectorIcon)
-											.map((group) => {
-												const iconKey = group.connectorIcon ?? "";
-												const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey];
-												const toolNames = group.tools.map((t) => t.name);
-												const allDisabled = toolNames.every((n) => disabledToolsSet.has(n));
-												const groupDef = TOOL_GROUPS.find((g) => g.label === group.label);
-												const row = (
-													<div className="flex w-full items-center gap-2 sm:gap-3 px-2.5 sm:px-3 py-1 sm:py-1.5 hover:bg-muted-foreground/10 transition-colors">
-														{iconInfo ? (
-															<Image
-																src={iconInfo.src}
-																alt={iconInfo.alt}
-																width={16}
-																height={16}
-																className="size-3.5 sm:size-4 shrink-0 select-none pointer-events-none"
-																draggable={false}
+										))}
+										{connectorToolGroups.length > 0 && (
+											<div>
+												<div className="px-2 pt-1.5 pb-0.5 text-[10px] text-muted-foreground/80 font-normal select-none">
+													Connector Actions
+												</div>
+												{connectorToolGroups.map((group) => {
+													const iconKey = group.connectorIcon ?? "";
+													const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey];
+													const toolNames = group.tools.map((t) => t.name);
+													const allDisabled = toolNames.every((n) => disabledToolsSet.has(n));
+													return (
+														<DropdownMenuSub
+															key={group.label}
+															open={openConnectorSubmenu === group.label}
+															onOpenChange={(open) =>
+																setOpenConnectorSubmenu(open ? group.label : null)
+															}
+														>
+															<DropdownMenuSubTrigger
+																className={cn(
+																	"mb-1 last:mb-0 transition-all",
+																	"hover:bg-accent hover:text-accent-foreground",
+																	"gap-1 [&>svg:last-child]:ml-0",
+																	!allDisabled && "text-primary"
+																)}
+															>
+																{iconInfo ? (
+																	<Image
+																		src={iconInfo.src}
+																		alt={iconInfo.alt}
+																		width={16}
+																		height={16}
+																		className="h-4 w-4 shrink-0 select-none pointer-events-none"
+																		draggable={false}
+																	/>
+																) : (
+																	<Wrench className="h-4 w-4" />
+																)}
+																<span className="min-w-0 flex-1 truncate">{group.label}</span>
+																<Switch
+																	checked={!allDisabled}
+																	tabIndex={-1}
+																	onPointerDown={(event) => event.stopPropagation()}
+																	onClick={(event) => event.stopPropagation()}
+																	onCheckedChange={() => toggleToolGroup(toolNames)}
+																	className="shrink-0 scale-[0.6]"
+																/>
+															</DropdownMenuSubTrigger>
+															<DropdownMenuPortal>
+																<DropdownMenuSubContent
+																	collisionPadding={8}
+																	className="w-60 max-h-56 overflow-y-auto overscroll-none"
+																>
+																	{group.tools.map((tool) => {
+																		const isDisabled = disabledToolsSet.has(tool.name);
+																		return (
+																			<DropdownMenuItem
+																				key={tool.name}
+																				onSelect={(e) => {
+																					e.preventDefault();
+																					toggleTool(tool.name);
+																				}}
+																				className={cn(
+																					"mb-1 last:mb-0 transition-all",
+																					"hover:bg-accent hover:text-accent-foreground",
+																					!isDisabled && "text-primary"
+																				)}
+																			>
+																				<span className="min-w-0 flex-1 truncate">
+																					{formatToolName(tool.name)}
+																				</span>
+																				<Switch
+																					checked={!isDisabled}
+																					tabIndex={-1}
+																					className="pointer-events-none shrink-0 scale-[0.6]"
+																				/>
+																			</DropdownMenuItem>
+																		);
+																	})}
+																</DropdownMenuSubContent>
+															</DropdownMenuPortal>
+														</DropdownMenuSub>
+													);
+												})}
+											</div>
+										)}
+										{otherToolGroup && (
+											<div>
+												<div className="px-2 pt-1.5 pb-0.5 text-[10px] text-muted-foreground/80 font-normal select-none">
+													{otherToolGroup.label}
+												</div>
+												{otherToolGroup.tools.map((tool) => {
+													const isDisabled = disabledToolsSet.has(tool.name);
+													const ToolIcon = getToolIcon(tool.name);
+													return (
+														<DropdownMenuItem
+															key={tool.name}
+															onSelect={(e) => {
+																e.preventDefault();
+																toggleTool(tool.name);
+															}}
+															className={cn(
+																"mb-1 last:mb-0 transition-all",
+																"hover:bg-accent hover:text-accent-foreground",
+																!isDisabled && "text-primary"
+															)}
+														>
+															<ToolIcon className="h-4 w-4" />
+															<span className="flex-1 min-w-0 truncate">
+																{formatToolName(tool.name)}
+															</span>
+															<Switch
+																checked={!isDisabled}
+																tabIndex={-1}
+																className="pointer-events-none shrink-0 scale-[0.6]"
 															/>
-														) : (
-															<Wrench className="size-3.5 sm:size-4 shrink-0 text-muted-foreground" />
-														)}
-														<span className="flex-1 min-w-0 text-xs sm:text-sm font-medium truncate">
-															{group.label}
-														</span>
-														<Switch
-															checked={!allDisabled}
-															onCheckedChange={() => toggleToolGroup(toolNames)}
-															className="shrink-0 scale-[0.6] sm:scale-75"
-														/>
+														</DropdownMenuItem>
+													);
+												})}
+											</div>
+										)}
+										{!filteredTools?.length && (
+											<div className="px-2 pt-1.5 pb-1">
+												<Skeleton className="h-2 w-12 mb-1.5" />
+												{["dt1", "dt2", "dt3", "dt4"].map((k) => (
+													<div key={k} className="flex items-center gap-2 py-1">
+														<Skeleton className="h-4 w-4 rounded shrink-0" />
+														<Skeleton className="h-3 flex-1" />
+														<Skeleton className="h-4 w-8 rounded-full shrink-0" />
 													</div>
-												);
-												return (
-													<Tooltip key={group.label}>
-														<TooltipTrigger asChild>{row}</TooltipTrigger>
-														<TooltipContent side="right" className="max-w-72 text-xs">
-															{groupDef?.tooltip ??
-																group.tools.flatMap((t, i) =>
-																	i === 0
-																		? [t.description]
-																		: [<Dot key={i} className="inline h-4 w-4" />, t.description]
-																)}
-														</TooltipContent>
-													</Tooltip>
-												);
-											})}
-									</div>
-								)}
-								{!filteredTools?.length && (
-									<div className="px-3 py-4 text-center text-xs text-muted-foreground">
-										Loading tools...
-									</div>
-								)}
-							</div>
-						</PopoverContent>
-					</Popover>
-				)}
-				{hasWebSearchTool && (
-					<button
-						type="button"
-						aria-label={isWebSearchEnabled ? "Disable web search" : "Enable web search"}
-						aria-pressed={isWebSearchEnabled}
-						onClick={() => toggleTool("web_search")}
-						className={cn(
-							"rounded-full transition-[background-color,border-color,color] flex items-center gap-1 px-2 py-1 border h-8 select-none",
-							isWebSearchEnabled
-								? "bg-sky-500/15 border-sky-500/60 text-sky-500"
-								: "bg-transparent border-transparent text-muted-foreground hover:text-foreground"
-						)}
-					>
-						<motion.div
-							animate={{
-								rotate: isWebSearchEnabled ? 360 : 0,
-								scale: isWebSearchEnabled ? 1.1 : 1,
-							}}
-							whileHover={{
-								rotate: isWebSearchEnabled ? 360 : 15,
-								scale: 1.1,
-								transition: { type: "spring", stiffness: 300, damping: 10 },
-							}}
-							transition={{ type: "spring", stiffness: 260, damping: 25 }}
-						>
-							<Globe className="size-4" />
-						</motion.div>
-						<AnimatePresence>
-							{isWebSearchEnabled && (
-								<motion.span
-									initial={{ width: 0, opacity: 0 }}
-									animate={{ width: "auto", opacity: 1 }}
-									exit={{ width: 0, opacity: 0 }}
-									transition={{ duration: 0.2 }}
-									className="text-xs overflow-hidden whitespace-nowrap"
-								>
-									Search
-								</motion.span>
-							)}
-						</AnimatePresence>
-					</button>
-				)}
-				{sidebarDocs.length > 0 && (
-					<button
-						type="button"
-						onClick={() => setDocumentsSidebarOpen(true)}
-						className="rounded-full border border-border/60 bg-accent/50 px-2.5 py-1 text-xs font-medium text-foreground/80 transition-colors hover:bg-accent"
-					>
-						{sidebarDocs.length} {sidebarDocs.length === 1 ? "source" : "sources"} selected
-					</button>
+												))}
+											</div>
+										)}
+									</DropdownMenuSubContent>
+								</DropdownMenuPortal>
+							</DropdownMenuSub>
+						</DropdownMenuContent>
+					</DropdownMenu>
 				)}
 			</div>
 			{!hasModelConfigured && (
@@ -1234,7 +1413,7 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 									: !hasModelConfigured
 										? "Please select a model from the header to start chatting"
 										: isComposerEmpty
-											? "Enter a message to send"
+											? "Enter a message or add a screenshot to send"
 											: "Send message"
 							}
 							side="bottom"
@@ -1242,13 +1421,13 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 							variant="default"
 							size="icon"
 							className={cn(
-								"aui-composer-send size-8 rounded-full",
+								"aui-composer-send size-9 rounded-full",
 								isSendDisabled && "cursor-not-allowed opacity-50"
 							)}
 							aria-label="Send message"
 							disabled={isSendDisabled}
 						>
-							<ArrowUpIcon className="aui-composer-send-icon size-4" />
+							<ArrowUpIcon className="aui-composer-send-icon size-5" />
 						</TooltipIconButton>
 					</ComposerPrimitive.Send>
 				</AuiIf>
@@ -1259,10 +1438,10 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 							type="button"
 							variant="default"
 							size="icon"
-							className="aui-composer-cancel size-8 rounded-full"
+							className="aui-composer-cancel size-9 rounded-full"
 							aria-label="Stop generating"
 						>
-							<SquareIcon className="aui-composer-cancel-icon size-3 fill-current" />
+							<SquareIcon className="aui-composer-cancel-icon size-3.5 fill-current" />
 						</Button>
 					</ComposerPrimitive.Cancel>
 				</AuiIf>
@@ -1271,12 +1450,9 @@ const ComposerAction: FC<ComposerActionProps> = ({ isBlockedByOtherUser = false
 	);
 };
 
-/** Convert snake_case tool names to human-readable labels */
+/** Friendly tool name (delegates to ``getToolDisplayName``). */
 function formatToolName(name: string): string {
-	return name
-		.split("_")
-		.map((word) => word.charAt(0).toUpperCase() + word.slice(1))
-		.join(" ");
+	return getToolDisplayName(name);
 }
 
 interface ToolGroup {
@@ -1293,23 +1469,41 @@ const TOOL_GROUPS: ToolGroup[] = [
 	},
 	{
 		label: "Generate",
-		tools: ["generate_podcast", "generate_video_presentation", "generate_report", "generate_image"],
+		tools: [
+			"generate_podcast",
+			"generate_video_presentation",
+			"generate_report",
+			"generate_resume",
+			"generate_image",
+		],
 	},
 	{
 		label: "Memory",
-		tools: ["save_memory", "recall_memory"],
+		tools: ["update_memory"],
 	},
 	{
 		label: "Gmail",
-		tools: ["create_gmail_draft", "update_gmail_draft", "send_gmail_email", "trash_gmail_email"],
+		tools: [
+			"search_gmail",
+			"read_gmail_email",
+			"create_gmail_draft",
+			"update_gmail_draft",
+			"send_gmail_email",
+			"trash_gmail_email",
+		],
 		connectorIcon: "gmail",
-		tooltip: "Create drafts, update drafts, send emails, and trash emails in Gmail",
+		tooltip: "Search, read, draft, update, send, and trash emails in Gmail",
 	},
 	{
 		label: "Google Calendar",
-		tools: ["create_calendar_event", "update_calendar_event", "delete_calendar_event"],
+		tools: [
+			"search_calendar_events",
+			"create_calendar_event",
+			"update_calendar_event",
+			"delete_calendar_event",
+		],
 		connectorIcon: "google_calendar",
-		tooltip: "Create, update, and delete events in Google Calendar",
+		tooltip: "Search, create, update, and delete events in Google Calendar",
 	},
 	{
 		label: "Google Drive",
@@ -1353,6 +1547,24 @@ const TOOL_GROUPS: ToolGroup[] = [
 		connectorIcon: "confluence",
 		tooltip: "Create, update, and delete pages in Confluence",
 	},
+	{
+		label: "Discord",
+		tools: ["list_discord_channels", "read_discord_messages", "send_discord_message"],
+		connectorIcon: "discord",
+		tooltip: "List channels, read messages, and send messages in Discord",
+	},
+	{
+		label: "Microsoft Teams",
+		tools: ["list_teams_channels", "read_teams_messages", "send_teams_message"],
+		connectorIcon: "teams",
+		tooltip: "List channels, read messages, and send messages in Microsoft Teams",
+	},
+	{
+		label: "Luma",
+		tools: ["list_luma_events", "read_luma_event", "create_luma_event"],
+		connectorIcon: "luma",
+		tooltip: "List, read, and create events in Luma",
+	},
 ];
 
 const EditComposer: FC = () => {
diff --git a/surfsense_web/components/assistant-ui/token-usage-context.tsx b/surfsense_web/components/assistant-ui/token-usage-context.tsx
new file mode 100644
index 0000000000..dd80bcac3a
--- /dev/null
+++ b/surfsense_web/components/assistant-ui/token-usage-context.tsx
@@ -0,0 +1,110 @@
+"use client";
+
+import {
+	createContext,
+	type FC,
+	type ReactNode,
+	useCallback,
+	useContext,
+	useSyncExternalStore,
+} from "react";
+
+export interface TokenUsageData {
+	prompt_tokens: number;
+	completion_tokens: number;
+	total_tokens: number;
+	/**
+	 * Total provider USD cost for this assistant turn, in micro-USD
+	 * (1_000_000 = $1.00). Populated from LiteLLM's response_cost on
+	 * the backend. Optional because pre-cost-credits messages persisted
+	 * before the migration won't have it.
+	 */
+	cost_micros?: number;
+	usage?: Record<
+		string,
+		{
+			prompt_tokens: number;
+			completion_tokens: number;
+			total_tokens: number;
+			cost_micros?: number;
+		}
+	>;
+	model_breakdown?: Record<
+		string,
+		{
+			prompt_tokens: number;
+			completion_tokens: number;
+			total_tokens: number;
+			cost_micros?: number;
+		}
+	>;
+}
+
+type Listener = () => void;
+
+class TokenUsageStore {
+	private data = new Map<string, TokenUsageData>();
+	private listeners = new Set<Listener>();
+
+	get(messageId: string): TokenUsageData | undefined {
+		return this.data.get(messageId);
+	}
+
+	set(messageId: string, usage: TokenUsageData): void {
+		this.data.set(messageId, usage);
+		this.notify();
+	}
+
+	rename(oldId: string, newId: string): void {
+		const usage = this.data.get(oldId);
+		if (usage) {
+			this.data.delete(oldId);
+			this.data.set(newId, usage);
+			this.notify();
+		}
+	}
+
+	clear(): void {
+		this.data.clear();
+		this.notify();
+	}
+
+	subscribe = (listener: Listener): (() => void) => {
+		this.listeners.add(listener);
+		return () => this.listeners.delete(listener);
+	};
+
+	private notify(): void {
+		for (const l of this.listeners) l();
+	}
+}
+
+const TokenUsageContext = createContext<TokenUsageStore | null>(null);
+
+export const TokenUsageProvider: FC<{ store: TokenUsageStore; children: ReactNode }> = ({
+	store,
+	children,
+}) => <TokenUsageContext.Provider value={store}>{children}</TokenUsageContext.Provider>;
+
+export function useTokenUsageStore(): TokenUsageStore {
+	const store = useContext(TokenUsageContext);
+	if (!store) throw new Error("useTokenUsageStore must be used within TokenUsageProvider");
+	return store;
+}
+
+export function useTokenUsage(messageId: string | undefined): TokenUsageData | undefined {
+	const store = useContext(TokenUsageContext);
+	const getSnapshot = useCallback(
+		() => (store && messageId ? store.get(messageId) : undefined),
+		[store, messageId]
+	);
+	const subscribe = useCallback(
+		(onStoreChange: () => void) => (store ? store.subscribe(onStoreChange) : () => {}),
+		[store]
+	);
+	return useSyncExternalStore(subscribe, getSnapshot, getSnapshot);
+}
+
+export function createTokenUsageStore(): TokenUsageStore {
+	return new TokenUsageStore();
+}
diff --git a/surfsense_web/components/assistant-ui/tool-fallback.tsx b/surfsense_web/components/assistant-ui/tool-fallback.tsx
deleted file mode 100644
index b658dba6d8..0000000000
--- a/surfsense_web/components/assistant-ui/tool-fallback.tsx
+++ /dev/null
@@ -1,147 +0,0 @@
-import type { ToolCallMessagePartComponent } from "@assistant-ui/react";
-import { CheckIcon, ChevronDownIcon, ChevronUpIcon, XCircleIcon } from "lucide-react";
-import { useMemo, useState } from "react";
-import { getToolIcon } from "@/contracts/enums/toolIcons";
-import { cn } from "@/lib/utils";
-
-function formatToolName(name: string): string {
-	return name.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
-}
-
-export const ToolFallback: ToolCallMessagePartComponent = ({
-	toolName,
-	argsText,
-	result,
-	status,
-}) => {
-	const [isExpanded, setIsExpanded] = useState(false);
-
-	const isCancelled = status?.type === "incomplete" && status.reason === "cancelled";
-	const isError = status?.type === "incomplete" && status.reason === "error";
-	const isRunning = status?.type === "running" || status?.type === "requires-action";
-	const errorData = status?.type === "incomplete" ? status.error : undefined;
-	const serializedError = useMemo(
-		() => (errorData && typeof errorData !== "string" ? JSON.stringify(errorData) : null),
-		[errorData]
-	);
-
-	const serializedResult = useMemo(
-		() =>
-			result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null,
-		[result]
-	);
-
-	const cancelledReason =
-		isCancelled && status.error
-			? typeof status.error === "string"
-				? status.error
-				: serializedError
-			: null;
-	const errorReason =
-		isError && status.error
-			? typeof status.error === "string"
-				? status.error
-				: serializedError
-			: null;
-
-	const Icon = getToolIcon(toolName);
-	const displayName = formatToolName(toolName);
-
-	return (
-		<div
-			className={cn(
-				"my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 select-none",
-				isCancelled && "opacity-60",
-				isError && "border-destructive/20 bg-destructive/5"
-			)}
-		>
-			<button
-				type="button"
-				onClick={() => setIsExpanded((prev) => !prev)}
-				className="flex w-full items-center gap-3 px-5 py-4 text-left transition-colors hover:bg-muted/50 focus:outline-none focus-visible:outline-none"
-			>
-				<div
-					className={cn(
-						"flex size-8 shrink-0 items-center justify-center rounded-lg",
-						isError ? "bg-destructive/10" : isCancelled ? "bg-muted" : "bg-primary/10"
-					)}
-				>
-					{isError ? (
-						<XCircleIcon className="size-4 text-destructive" />
-					) : isCancelled ? (
-						<XCircleIcon className="size-4 text-muted-foreground" />
-					) : isRunning ? (
-						<Icon className="size-4 text-primary animate-pulse" />
-					) : (
-						<CheckIcon className="size-4 text-primary" />
-					)}
-				</div>
-
-				<div className="flex-1 min-w-0">
-					<p
-						className={cn(
-							"text-sm font-semibold",
-							isError
-								? "text-destructive"
-								: isCancelled
-									? "text-muted-foreground line-through"
-									: "text-foreground"
-						)}
-					>
-						{isRunning
-							? displayName
-							: isCancelled
-								? `Cancelled: ${displayName}`
-								: isError
-									? `Failed: ${displayName}`
-									: displayName}
-					</p>
-					{isRunning && <p className="text-xs text-muted-foreground mt-0.5">Running...</p>}
-					{cancelledReason && (
-						<p className="text-xs text-muted-foreground mt-0.5 truncate">{cancelledReason}</p>
-					)}
-					{errorReason && (
-						<p className="text-xs text-destructive/80 mt-0.5 truncate">{errorReason}</p>
-					)}
-				</div>
-
-				{!isRunning && (
-					<div className="shrink-0 text-muted-foreground">
-						{isExpanded ? (
-							<ChevronDownIcon className="size-4" />
-						) : (
-							<ChevronUpIcon className="size-4" />
-						)}
-					</div>
-				)}
-			</button>
-
-			{isExpanded && !isRunning && (
-				<>
-					<div className="mx-5 h-px bg-border/50" />
-					<div className="px-5 py-3 space-y-3">
-						{argsText && (
-							<div>
-								<p className="text-xs font-medium text-muted-foreground mb-1">Arguments</p>
-								<pre className="text-xs text-foreground/80 whitespace-pre-wrap break-all">
-									{argsText}
-								</pre>
-							</div>
-						)}
-						{!isCancelled && result !== undefined && (
-							<>
-								<div className="h-px bg-border/30" />
-								<div>
-									<p className="text-xs font-medium text-muted-foreground mb-1">Result</p>
-									<pre className="text-xs text-foreground/80 whitespace-pre-wrap break-all">
-										{typeof result === "string" ? result : serializedResult}
-									</pre>
-								</div>
-							</>
-						)}
-					</div>
-				</>
-			)}
-		</div>
-	);
-};
diff --git a/surfsense_web/components/assistant-ui/user-message.tsx b/surfsense_web/components/assistant-ui/user-message.tsx
index 34945c4725..d17788c718 100644
--- a/surfsense_web/components/assistant-ui/user-message.tsx
+++ b/surfsense_web/components/assistant-ui/user-message.tsx
@@ -1,11 +1,24 @@
-import { ActionBarPrimitive, AuiIf, MessagePrimitive, useAuiState } from "@assistant-ui/react";
-import { useAtomValue } from "jotai";
-import { CheckIcon, CopyIcon, FileText, Pen } from "lucide-react";
+import {
+	ActionBarPrimitive,
+	AuiIf,
+	MessagePrimitive,
+	useAuiState,
+	useMessagePartText,
+} from "@assistant-ui/react";
+import { useAtomValue, useSetAtom } from "jotai";
+import { CheckIcon, CopyIcon, Folder as FolderIcon, Pencil } from "lucide-react";
 import Image from "next/image";
-import { type FC, useState } from "react";
+import { useParams } from "next/navigation";
+import { type FC, useCallback, useState } from "react";
+import { toast } from "sonner";
 import { currentThreadAtom } from "@/atoms/chat/current-thread.atom";
 import { messageDocumentsMapAtom } from "@/atoms/chat/mentioned-documents.atom";
+import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
+import { MentionChip } from "@/components/assistant-ui/mention-chip";
 import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
+import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
+import { getMentionDocKey } from "@/lib/chat/mention-doc-key";
+import { parseMentionSegments } from "@/lib/chat/parse-mention-segments";
 
 interface AuthorMetadata {
 	displayName: string | null;
@@ -40,16 +53,76 @@ const UserAvatar: FC<AuthorMetadata> = ({ displayName, avatarUrl }) => {
 	}
 
 	return (
-		<div className="flex size-8 items-center justify-center rounded-full bg-primary/10 text-xs font-medium text-primary select-none">
+		<div className="flex size-8 items-center justify-center rounded-full bg-muted text-xs font-medium text-foreground select-none">
 			{initials}
 		</div>
 	);
 };
 
-export const UserMessage: FC = () => {
+const UserTextPart: FC = () => {
 	const messageId = useAuiState(({ message }) => message?.id);
+	const part = useMessagePartText();
+	const text = (part as { text?: string }).text ?? "";
 	const messageDocumentsMap = useAtomValue(messageDocumentsMapAtom);
-	const mentionedDocs = messageId ? messageDocumentsMap[messageId] : undefined;
+	const mentionedDocs = (messageId ? messageDocumentsMap[messageId] : undefined) ?? [];
+	const openEditorPanel = useSetAtom(openEditorPanelAtom);
+	const params = useParams();
+	const searchSpaceIdParam = params?.search_space_id;
+	const parsedSearchSpaceId = Array.isArray(searchSpaceIdParam)
+		? Number(searchSpaceIdParam[0])
+		: Number(searchSpaceIdParam);
+	const resolvedSearchSpaceId = Number.isFinite(parsedSearchSpaceId)
+		? parsedSearchSpaceId
+		: undefined;
+
+	const handleOpenDoc = useCallback(
+		(docId: number, title: string) => {
+			if (!resolvedSearchSpaceId) {
+				toast.error("Cannot open document outside a search space.");
+				return;
+			}
+			openEditorPanel({
+				kind: "document",
+				documentId: docId,
+				searchSpaceId: resolvedSearchSpaceId,
+				title,
+			});
+		},
+		[openEditorPanel, resolvedSearchSpaceId]
+	);
+
+	const segments = parseMentionSegments(text, mentionedDocs);
+
+	return (
+		<p style={{ whiteSpace: "pre-line" }} className="wrap-break-word">
+			{segments.map((segment) => {
+				if (segment.type === "text") {
+					return <span key={`txt-${segment.start}`}>{segment.value}</span>;
+				}
+				const isFolder = segment.doc.kind === "folder";
+				const icon = isFolder ? (
+					<FolderIcon className="size-3.5" />
+				) : (
+					getConnectorIcon(segment.doc.document_type ?? "UNKNOWN", "size-3.5")
+				);
+				return (
+					<MentionChip
+						key={`mention-${getMentionDocKey(segment.doc)}-${segment.start}`}
+						icon={icon}
+						label={segment.doc.title}
+						tooltip={isFolder ? `Folder: ${segment.doc.title}` : segment.doc.title}
+						onClick={isFolder ? undefined : () => handleOpenDoc(segment.doc.id, segment.doc.title)}
+						className="mx-0.5"
+					/>
+				);
+			})}
+		</p>
+	);
+};
+
+const userMessageParts = { Text: UserTextPart };
+
+export const UserMessage: FC = () => {
 	const metadata = useAuiState(({ message }) => message?.metadata);
 	const author = metadata?.custom?.author as AuthorMetadata | undefined;
 	const isSharedChat = useAtomValue(currentThreadAtom).visibility === "SEARCH_SPACE";
@@ -63,22 +136,8 @@ export const UserMessage: FC = () => {
 			<div className="col-start-2 min-w-0">
 				<div className="aui-user-message-content-wrapper flex items-end gap-2">
 					<div className="relative flex-1 min-w-0">
-						{mentionedDocs && mentionedDocs.length > 0 && (
-							<div className="flex flex-wrap items-end gap-2 mb-2 justify-end">
-								{mentionedDocs?.map((doc) => (
-									<span
-										key={`${doc.document_type}:${doc.id}`}
-										className="inline-flex items-center gap-1 px-2 py-0.5 rounded-full bg-primary/10 text-xs font-medium text-primary border border-primary/20"
-										title={doc.title}
-									>
-										<FileText className="size-3" />
-										<span className="max-w-[150px] truncate">{doc.title}</span>
-									</span>
-								))}
-							</div>
-						)}
-						<div className="aui-user-message-content wrap-break-word rounded-2xl bg-muted px-4 py-2.5 text-foreground">
-							<MessagePrimitive.Parts />
+						<div className="aui-user-message-content wrap-break-word rounded-xl bg-muted px-4 py-2.5 text-foreground">
+							<MessagePrimitive.Parts components={userMessageParts} />
 						</div>
 						<div className="absolute right-0 top-full mt-1 z-10 opacity-100 pointer-events-auto md:opacity-0 md:pointer-events-none md:transition-opacity md:duration-200 md:delay-300 md:group-hover/user-msg:opacity-100 md:group-hover/user-msg:delay-0 md:group-hover/user-msg:pointer-events-auto">
 							<UserActionBar />
@@ -136,7 +195,7 @@ const UserActionBar: FC = () => {
 			{canEdit && (
 				<ActionBarPrimitive.Edit asChild>
 					<TooltipIconButton tooltip="Edit" className="aui-user-action-edit">
-						<Pen />
+						<Pencil />
 					</TooltipIconButton>
 				</ActionBarPrimitive.Edit>
 			)}
diff --git a/surfsense_web/components/auth/sign-in-button.tsx b/surfsense_web/components/auth/sign-in-button.tsx
index b6e4b22a73..ff1782c282 100644
--- a/surfsense_web/components/auth/sign-in-button.tsx
+++ b/surfsense_web/components/auth/sign-in-button.tsx
@@ -1,6 +1,5 @@
 "use client";
 
-import { motion } from "motion/react";
 import Link from "next/link";
 import { AUTH_TYPE, isSSOAuth } from "@/lib/env-config";
 import { trackLoginAttempt } from "@/lib/posthog/events";
@@ -76,19 +75,17 @@ export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => {
 
 	if (isProxyLogin) {
 		return (
-			<motion.button
+			<button
 				type="button"
 				onClick={handleProxyLogin}
-				whileHover={{ scale: 1.02 }}
-				whileTap={{ scale: 0.98 }}
 				className={cn(
-					"flex items-center justify-center gap-2 font-semibold transition-all duration-200",
+					"flex items-center justify-center gap-2 transition-colors duration-200 disabled:cursor-not-allowed disabled:opacity-50",
 					getClassName()
 				)}
 			>
 				{isGoogleAuth && <GoogleLogo className="h-4 w-4" />}
 				<span>Sign In</span>
-			</motion.button>
+			</button>
 		);
 	}
 
diff --git a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
index 9638ac01c0..dee3e457c5 100644
--- a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
+++ b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { MoreHorizontal, PenLine, Trash2 } from "lucide-react";
+import { MoreHorizontal, Pencil, Trash2 } from "lucide-react";
 import { Button } from "@/components/ui/button";
 import {
 	DropdownMenu,
@@ -29,7 +29,7 @@ export function CommentActions({ canEdit, canDelete, onEdit, onDelete }: Comment
 			<DropdownMenuContent align="end">
 				{canEdit && (
 					<DropdownMenuItem onClick={onEdit}>
-						<PenLine className="mr-2 size-4" />
+						<Pencil className="mr-2 size-4" />
 						Edit
 					</DropdownMenuItem>
 				)}
diff --git a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx
index eb374ba495..40b48f0e98 100644
--- a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx
+++ b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx
@@ -1,11 +1,12 @@
 "use client";
 
 import { useAtomValue, useSetAtom } from "jotai";
-import { MessageSquare } from "lucide-react";
+import { MessageCircleReply } from "lucide-react";
 import { useEffect, useRef, useState } from "react";
 import { clearTargetCommentIdAtom, targetCommentIdAtom } from "@/atoms/chat/current-thread.atom";
 import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar";
 import { Button } from "@/components/ui/button";
+import { convertRenderedToDisplay } from "@/lib/comments/utils";
 import { cn } from "@/lib/utils";
 import { CommentComposer } from "../comment-composer/comment-composer";
 import { CommentActions } from "./comment-actions";
@@ -69,11 +70,6 @@ function formatTimestamp(dateString: string): string {
 	);
 }
 
-export function convertRenderedToDisplay(contentRendered: string): string {
-	// Convert @{DisplayName} format to @DisplayName for editing
-	return contentRendered.replace(/@\{([^}]+)\}/g, "@$1");
-}
-
 function renderMentions(content: string): React.ReactNode {
 	// Match @{DisplayName} format from backend
 	const mentionPattern = /@\{([^}]+)\}/g;
@@ -217,10 +213,10 @@ export function CommentItem({
 					<Button
 						variant="ghost"
 						size="sm"
-						className="mt-1 h-7 w-fit px-2 text-xs text-muted-foreground hover:text-foreground"
+						className="mt-1 h-7 w-fit px-2 text-xs text-muted-foreground hover:text-accent-foreground"
 						onClick={() => onReply(comment.id)}
 					>
-						<MessageSquare className="mr-1 size-3" />
+						<MessageCircleReply className="mr-1 size-3" />
 						Reply
 					</Button>
 				)}
diff --git a/surfsense_web/components/chat-comments/comment-panel/comment-panel.tsx b/surfsense_web/components/chat-comments/comment-panel/comment-panel.tsx
index 1226010084..ef569c81f1 100644
--- a/surfsense_web/components/chat-comments/comment-panel/comment-panel.tsx
+++ b/surfsense_web/components/chat-comments/comment-panel/comment-panel.tsx
@@ -1,5 +1,6 @@
 "use client";
 
+import { Separator } from "@/components/ui/separator";
 import { cn } from "@/lib/utils";
 import { CommentComposer } from "../comment-composer/comment-composer";
 import { CommentThread } from "../comment-thread/comment-thread";
@@ -85,7 +86,9 @@ export function CommentPanel({
 				</div>
 			)}
 
-			<div className={cn("p-3", isMobile && "fixed bottom-0 left-0 right-0 z-50 bg-card border-t")}>
+			{hasThreads && !isMobile ? <Separator className="bg-sidebar-border" /> : null}
+
+			<div className={cn("p-3", isMobile && "fixed bottom-0 left-0 right-0 z-50 bg-sidebar")}>
 				<CommentComposer
 					members={members}
 					membersLoading={membersLoading}
diff --git a/surfsense_web/components/chat-comments/comment-sheet/comment-sheet.tsx b/surfsense_web/components/chat-comments/comment-sheet/comment-sheet.tsx
index d483ab261d..054d77fa63 100644
--- a/surfsense_web/components/chat-comments/comment-sheet/comment-sheet.tsx
+++ b/surfsense_web/components/chat-comments/comment-sheet/comment-sheet.tsx
@@ -1,6 +1,5 @@
 "use client";
 
-import { MessageSquare } from "lucide-react";
 import {
 	Drawer,
 	DrawerContent,
@@ -29,8 +28,7 @@ export function CommentSheet({
 				<DrawerContent className="h-[85vh] max-h-[85vh] z-80" overlayClassName="z-80">
 					<DrawerHandle />
 					<DrawerHeader className="px-4 pb-3 pt-2">
-						<DrawerTitle className="flex items-center gap-2 text-base font-semibold">
-							<MessageSquare className="size-5" />
+						<DrawerTitle className="flex items-center justify-center gap-2 text-base font-semibold">
 							Comments
 							{commentCount > 0 && (
 								<span className="rounded-full bg-primary/10 px-2 py-0.5 text-xs font-medium text-primary">
@@ -56,7 +54,6 @@ export function CommentSheet({
 			>
 				<SheetHeader className="flex-shrink-0 px-4 py-4">
 					<SheetTitle className="flex items-center gap-2 text-base font-semibold">
-						<MessageSquare className="size-5" />
 						Comments
 						{commentCount > 0 && (
 							<span className="rounded-full bg-primary/10 px-2 py-0.5 text-xs font-medium text-primary">
diff --git a/surfsense_web/components/chat-comments/comment-thread/comment-thread.tsx b/surfsense_web/components/chat-comments/comment-thread/comment-thread.tsx
index e475311297..81407e2c6e 100644
--- a/surfsense_web/components/chat-comments/comment-thread/comment-thread.tsx
+++ b/surfsense_web/components/chat-comments/comment-thread/comment-thread.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { ChevronDown, ChevronRight, MessageSquare } from "lucide-react";
+import { ChevronDown, ChevronRight, MessageCircleReply } from "lucide-react";
 import { useState } from "react";
 import { Button } from "@/components/ui/button";
 import { CommentComposer } from "../comment-composer/comment-composer";
@@ -92,7 +92,7 @@ export function CommentThread({
 							<Button
 								variant="ghost"
 								size="sm"
-								className="h-6 px-2 text-xs text-muted-foreground hover:text-foreground"
+								className="h-6 px-2 text-xs text-muted-foreground hover:text-accent-foreground"
 								onClick={() => setIsRepliesExpanded((prev) => !prev)}
 							>
 								{isRepliesExpanded ? (
@@ -143,7 +143,7 @@ export function CommentThread({
 							</div>
 						) : (
 							<Button variant="ghost" size="sm" className="h-7 px-2 text-xs" onClick={handleReply}>
-								<MessageSquare className="mr-1 size-3" />
+								<MessageCircleReply className="mr-1 size-3" />
 								Reply
 							</Button>
 						)}
@@ -155,7 +155,7 @@ export function CommentThread({
 			{!hasReplies && !isReplyComposerOpen && (
 				<div className="ml-7 mt-1">
 					<Button variant="ghost" size="sm" className="h-7 px-2 text-xs" onClick={handleReply}>
-						<MessageSquare className="mr-1 size-3" />
+						<MessageCircleReply className="mr-1 size-3" />
 						Reply
 					</Button>
 				</div>
diff --git a/surfsense_web/components/chat-comments/member-mention-picker/member-mention-item.tsx b/surfsense_web/components/chat-comments/member-mention-picker/member-mention-item.tsx
index 0ea53eeb41..015d56c768 100644
--- a/surfsense_web/components/chat-comments/member-mention-picker/member-mention-item.tsx
+++ b/surfsense_web/components/chat-comments/member-mention-picker/member-mention-item.tsx
@@ -1,6 +1,7 @@
 "use client";
 
 import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar";
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 import type { MemberMentionItemProps } from "./types";
 
@@ -25,11 +26,14 @@ export function MemberMentionItem({
 	const displayName = member.displayName || member.email.split("@")[0];
 
 	return (
-		<button
+		<Button
+			variant="ghost"
 			type="button"
 			className={cn(
-				"flex w-full items-center gap-3 px-3 py-2 text-left transition-colors",
-				isHighlighted ? "bg-primary/15 text-accent-foreground" : "hover:bg-accent/50"
+				"h-auto w-full justify-start gap-3 rounded-none px-3 py-2 text-left transition-colors",
+				isHighlighted
+					? "bg-primary/15 text-accent-foreground"
+					: "hover:bg-accent hover:text-accent-foreground"
 			)}
 			onClick={() => onSelect(member)}
 			onMouseEnter={onMouseEnter}
@@ -44,6 +48,6 @@ export function MemberMentionItem({
 				<span className="truncate text-sm font-medium">{displayName}</span>
 				<span className="truncate text-xs text-muted-foreground">{member.email}</span>
 			</div>
-		</button>
+		</Button>
 	);
 }
diff --git a/surfsense_web/components/citation-panel/citation-panel.tsx b/surfsense_web/components/citation-panel/citation-panel.tsx
new file mode 100644
index 0000000000..890ac11acf
--- /dev/null
+++ b/surfsense_web/components/citation-panel/citation-panel.tsx
@@ -0,0 +1,196 @@
+"use client";
+
+import { useQuery } from "@tanstack/react-query";
+import { useSetAtom } from "jotai";
+import { XIcon } from "lucide-react";
+import type { FC } from "react";
+import { useEffect, useMemo, useRef } from "react";
+import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
+import { MarkdownViewer } from "@/components/markdown-viewer";
+import { Button } from "@/components/ui/button";
+import { Spinner } from "@/components/ui/spinner";
+import { documentsApiService } from "@/lib/apis/documents-api.service";
+
+const DEFAULT_CHUNK_WINDOW = 5;
+
+interface CitationPanelContentProps {
+	chunkId: number;
+	onClose?: () => void;
+	showHeader?: boolean;
+}
+
+/**
+ * Right-panel citation viewer. Shows the cited chunk surrounded by
+ * adjacent chunks (±N chunks via the API's `chunk_window` parameter),
+ * with the cited one visually highlighted and auto-scrolled into view.
+ * The user can jump to the full document via the editor panel.
+ */
+export const CitationPanelContent: FC<CitationPanelContentProps> = ({
+	chunkId,
+	onClose,
+	showHeader = true,
+}) => {
+	const openEditorPanel = useSetAtom(openEditorPanelAtom);
+
+	const chunkWindow = DEFAULT_CHUNK_WINDOW;
+
+	const { data, isLoading, error } = useQuery({
+		queryKey: ["citation-panel", chunkId, chunkWindow] as const,
+		queryFn: () =>
+			documentsApiService.getDocumentByChunk({
+				chunk_id: chunkId,
+				chunk_window: chunkWindow,
+			}),
+		staleTime: 5 * 60 * 1000,
+	});
+
+	const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]);
+
+	const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0;
+	const startIndex = data?.chunk_start_index ?? 0;
+	const hasMoreAbove = startIndex > 0;
+	const hasMoreBelow = data ? startIndex + data.chunks.length < totalChunks : false;
+
+	// Scroll the cited chunk into view inside the panel's scroll container
+	// (not the page). We anchor the scroll to the panel's scroll element
+	// so opening the citation doesn't yank the chat scroll on the left.
+	const scrollContainerRef = useRef<HTMLDivElement | null>(null);
+	const citedRef = useRef<HTMLDivElement | null>(null);
+	useEffect(() => {
+		if (!cited) return;
+		const id = requestAnimationFrame(() => {
+			const container = scrollContainerRef.current;
+			const target = citedRef.current;
+			if (!container || !target) return;
+			const containerRect = container.getBoundingClientRect();
+			const targetRect = target.getBoundingClientRect();
+			const offset = targetRect.top - containerRect.top + container.scrollTop;
+			container.scrollTo({
+				top: Math.max(0, offset - 16),
+				behavior: "smooth",
+			});
+		});
+		return () => cancelAnimationFrame(id);
+	}, [cited]);
+
+	const handleOpenFullDocument = () => {
+		if (!data) return;
+		openEditorPanel({
+			documentId: data.id,
+			searchSpaceId: data.search_space_id,
+			title: data.title,
+		});
+	};
+
+	return (
+		<>
+			<div className="shrink-0">
+				{showHeader && (
+					<div className="shrink-0 flex h-12 items-center justify-between px-3 border-b">
+						<h2 className="select-none text-lg font-semibold">Citation</h2>
+						<div className="flex items-center gap-1 shrink-0">
+							{onClose && (
+								<Button
+									variant="ghost"
+									size="icon"
+									onClick={onClose}
+									className="h-8 w-8 rounded-full shrink-0 text-muted-foreground hover:text-accent-foreground"
+								>
+									<XIcon className="h-4 w-4" />
+									<span className="sr-only">Close citation panel</span>
+								</Button>
+							)}
+						</div>
+					</div>
+				)}
+				<div className="grid h-10 grid-cols-[minmax(0,1fr)_auto] items-center gap-3 border-b px-4">
+					<div className="min-w-0 flex flex-1 items-center gap-2">
+						<p className="truncate text-sm text-muted-foreground">
+							{data?.title ?? (isLoading ? "Loading…" : `Chunk #${chunkId}`)}
+						</p>
+					</div>
+					<div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground">
+						{totalChunks > 0 && <span>{totalChunks} chunks</span>}
+						{!isLoading && !error && data && (
+							<Button
+								variant="default"
+								size="sm"
+								className="h-6 px-1.5 text-[11px]"
+								onClick={handleOpenFullDocument}
+							>
+								Open
+							</Button>
+						)}
+					</div>
+				</div>
+			</div>
+
+			<div ref={scrollContainerRef} className="flex-1 overflow-y-auto px-5 py-4">
+				{isLoading && (
+					<div className="flex min-h-full items-center justify-center text-muted-foreground">
+						<Spinner size="md" />
+					</div>
+				)}
+
+				{error && (
+					<div className="flex min-h-full items-center justify-center text-center">
+						<p className="text-sm text-destructive">
+							{error instanceof Error ? error.message : "Failed to load citation"}
+						</p>
+					</div>
+				)}
+
+				{!isLoading && !error && data && (
+					<>
+						{hasMoreAbove && (
+							<p className="mb-3 text-center text-[11px] text-muted-foreground">
+								… {startIndex} earlier chunk{startIndex === 1 ? "" : "s"} not shown
+							</p>
+						)}
+						<div className="space-y-3">
+							{data.chunks.map((chunk) => {
+								const isCited = chunk.id === chunkId;
+								return (
+									<div
+										key={chunk.id}
+										ref={isCited ? citedRef : null}
+										data-cited={isCited || undefined}
+										className={
+											isCited
+												? "rounded-md border-2 border-primary bg-accent px-4 py-3 shadow-sm"
+												: "rounded-md bg-accent px-4 py-3 opacity-70 transition-opacity hover:opacity-100"
+										}
+									>
+										<div className="mb-1.5 flex items-center justify-between">
+											<span
+												className={
+													isCited
+														? "text-[11px] text-muted-foreground"
+														: "text-[11px] font-medium text-muted-foreground"
+												}
+											>
+												Chunk #{chunk.id}
+											</span>
+											{isCited && (
+												<span className="text-[11px] font-semibold text-primary">Cited chunk</span>
+											)}
+										</div>
+										<div className="text-sm">
+											<MarkdownViewer content={chunk.content} enableCitations />
+										</div>
+									</div>
+								);
+							})}
+						</div>
+						{hasMoreBelow && (
+							<p className="mt-3 text-center text-[11px] text-muted-foreground">
+								… {totalChunks - (startIndex + data.chunks.length)} later chunk
+								{totalChunks - (startIndex + data.chunks.length) === 1 ? "" : "s"} not shown
+							</p>
+						)}
+					</>
+				)}
+			</div>
+		</>
+	);
+};
diff --git a/surfsense_web/components/citations/citation-renderer.tsx b/surfsense_web/components/citations/citation-renderer.tsx
new file mode 100644
index 0000000000..f2de4b27dc
--- /dev/null
+++ b/surfsense_web/components/citations/citation-renderer.tsx
@@ -0,0 +1,77 @@
+"use client";
+
+import type { ReactNode } from "react";
+import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
+import {
+	type CitationToken,
+	type CitationUrlMap,
+	parseTextWithCitations,
+} from "@/lib/citations/citation-parser";
+
+/**
+ * Render a single parsed citation token as JSX.
+ *
+ * `ordinalKey` should be a stable per-render counter so duplicate identical
+ * citations within the same parent don't collide on `key`. The previous
+ * implementation in `markdown-text.tsx` used the source string itself as
+ * the key, which produced React warnings when two segments rendered the
+ * same `[citation:N]` text.
+ */
+export function renderCitationToken(token: CitationToken, ordinalKey: number): ReactNode {
+	if (token.kind === "url") {
+		return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />;
+	}
+	return (
+		<InlineCitation
+			key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`}
+			chunkId={token.chunkId}
+			isDocsChunk={token.isDocsChunk}
+		/>
+	);
+}
+
+/**
+ * Walk a `ReactNode` (string, array, or arbitrary node) and replace any
+ * `[citation:...]` tokens inside string children with citation badges.
+ *
+ * Designed for use inside `Streamdown`/`react-markdown` `components`
+ * overrides where the renderer hands you `children`. Non-string children
+ * are returned untouched so block/phrasing structure is preserved.
+ */
+export function processChildrenWithCitations(
+	children: ReactNode,
+	urlMap: CitationUrlMap
+): ReactNode {
+	if (typeof children === "string") {
+		const segments = parseTextWithCitations(children, urlMap);
+		if (segments.length === 1 && typeof segments[0] === "string") {
+			return children;
+		}
+		let ordinal = 0;
+		return segments.map((segment) =>
+			typeof segment === "string" ? segment : renderCitationToken(segment, ordinal++)
+		);
+	}
+
+	if (Array.isArray(children)) {
+		let ordinal = 0;
+		return children.map((child, childIndex) => {
+			if (typeof child === "string") {
+				const segments = parseTextWithCitations(child, urlMap);
+				if (segments.length === 1 && typeof segments[0] === "string") {
+					return child;
+				}
+				return (
+					<span key={`citation-seg-${childIndex}`}>
+						{segments.map((segment) =>
+							typeof segment === "string" ? segment : renderCitationToken(segment, ordinal++)
+						)}
+					</span>
+				);
+			}
+			return child;
+		});
+	}
+
+	return children;
+}
diff --git a/surfsense_web/components/connectors/drive-folder-tree.tsx b/surfsense_web/components/connectors/drive-folder-tree.tsx
index 905d2b7ca6..dd9cd7323b 100644
--- a/surfsense_web/components/connectors/drive-folder-tree.tsx
+++ b/surfsense_web/components/connectors/drive-folder-tree.tsx
@@ -13,6 +13,7 @@ import {
 	Presentation,
 } from "lucide-react";
 import { useCallback, useEffect, useState } from "react";
+import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
 import { ScrollArea } from "@/components/ui/scroll-area";
 import { Spinner } from "@/components/ui/spinner";
@@ -261,15 +262,17 @@ export function DriveFolderTree({
 				<div
 					className={cn(
 						"flex items-center group gap-1 sm:gap-2 h-auto py-1 sm:py-2 px-1 sm:px-2 rounded-md",
-						isFolder && "hover:bg-accent cursor-pointer",
+						isFolder && "hover:bg-accent hover:text-accent-foreground cursor-pointer",
 						!isFolder && "cursor-default opacity-60",
-						isSelected && "bg-accent/50"
+						isSelected && "bg-accent text-accent-foreground"
 					)}
 				>
 					{isFolder ? (
-						<button
+						<Button
 							type="button"
-							className="flex items-center justify-center w-3 h-3 sm:w-4 sm:h-4 shrink-0 bg-transparent border-0 p-0 cursor-pointer"
+							variant="ghost"
+							size="icon"
+							className="h-3 w-3 shrink-0 cursor-pointer bg-transparent p-0 hover:bg-transparent sm:h-4 sm:w-4"
 							onClick={(e) => {
 								e.stopPropagation();
 								toggleFolder(item);
@@ -283,7 +286,7 @@ export function DriveFolderTree({
 							) : (
 								<ChevronRight className="h-3 w-3 sm:h-4 sm:w-4" />
 							)}
-						</button>
+						</Button>
 					) : (
 						<span className="w-3 h-3 sm:w-4 sm:h-4 shrink-0" />
 					)}
@@ -314,13 +317,14 @@ export function DriveFolderTree({
 					</div>
 
 					{isFolder ? (
-						<button
+						<Button
 							type="button"
-							className="truncate flex-1 text-left text-xs sm:text-sm min-w-0 bg-transparent border-0 p-0 cursor-pointer"
+							variant="ghost"
+							className="h-auto min-w-0 flex-1 justify-start truncate bg-transparent p-0 text-left text-xs font-normal hover:bg-transparent hover:text-inherit sm:text-sm"
 							onClick={() => toggleFolder(item)}
 						>
 							{item.name}
-						</button>
+						</Button>
 					) : (
 						<span className="truncate flex-1 text-left text-xs sm:text-sm min-w-0">
 							{item.name}
@@ -356,13 +360,14 @@ export function DriveFolderTree({
 								className="shrink-0 h-3.5 w-3.5 sm:h-4 sm:w-4 border-slate-400/20 dark:border-white/20"
 							/>
 							<HardDrive className="h-3 w-3 sm:h-4 sm:w-4 text-muted-foreground shrink-0" />
-							<button
+							<Button
 								type="button"
-								className="font-semibold truncate text-xs sm:text-sm cursor-pointer bg-transparent border-0 p-0 text-left"
+								variant="ghost"
+								className="h-auto min-w-0 justify-start truncate bg-transparent p-0 text-left text-xs font-semibold hover:bg-transparent hover:text-inherit sm:text-sm"
 								onClick={() => toggleFolderSelection("root", rootLabel)}
 							>
 								{rootLabel}
-							</button>
+							</Button>
 						</div>
 					</div>
 
diff --git a/surfsense_web/components/connectors/google-drive-folder-tree.tsx b/surfsense_web/components/connectors/google-drive-folder-tree.tsx
index 1584d05d0d..e47d1d1853 100644
--- a/surfsense_web/components/connectors/google-drive-folder-tree.tsx
+++ b/surfsense_web/components/connectors/google-drive-folder-tree.tsx
@@ -13,6 +13,7 @@ import {
 	Presentation,
 } from "lucide-react";
 import { useState } from "react";
+import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
 import { ScrollArea } from "@/components/ui/scroll-area";
 import { Spinner } from "@/components/ui/spinner";
@@ -237,15 +238,17 @@ export function GoogleDriveFolderTree({
 				<div
 					className={cn(
 						"flex items-center group gap-1 sm:gap-2 h-auto py-1 sm:py-2 px-1 sm:px-2 rounded-md",
-						isFolder && "hover:bg-accent cursor-pointer",
+						isFolder && "hover:bg-accent hover:text-accent-foreground cursor-pointer",
 						!isFolder && "cursor-default opacity-60",
-						isSelected && "bg-accent/50"
+						isSelected && "bg-accent text-accent-foreground"
 					)}
 				>
 					{isFolder ? (
-						<button
+						<Button
 							type="button"
-							className="flex items-center justify-center w-3 h-3 sm:w-4 sm:h-4 shrink-0 bg-transparent border-0 p-0 cursor-pointer"
+							variant="ghost"
+							size="icon"
+							className="h-3 w-3 shrink-0 cursor-pointer bg-transparent p-0 hover:bg-transparent sm:h-4 sm:w-4"
 							onClick={(e) => {
 								e.stopPropagation();
 								toggleFolder(item);
@@ -259,7 +262,7 @@ export function GoogleDriveFolderTree({
 							) : (
 								<ChevronRight className="h-3 w-3 sm:h-4 sm:w-4" />
 							)}
-						</button>
+						</Button>
 					) : (
 						<span className="w-3 h-3 sm:w-4 sm:h-4 shrink-0" />
 					)}
@@ -290,13 +293,14 @@ export function GoogleDriveFolderTree({
 					</div>
 
 					{isFolder ? (
-						<button
+						<Button
 							type="button"
-							className="truncate flex-1 text-left text-xs sm:text-sm min-w-0 bg-transparent border-0 p-0 cursor-pointer"
+							variant="ghost"
+							className="h-auto min-w-0 flex-1 cursor-pointer justify-start truncate bg-transparent p-0 text-left text-xs hover:bg-transparent sm:text-sm"
 							onClick={() => toggleFolder(item)}
 						>
 							{item.name}
-						</button>
+						</Button>
 					) : (
 						<span className="truncate flex-1 text-left text-xs sm:text-sm min-w-0">
 							{item.name}
@@ -332,13 +336,14 @@ export function GoogleDriveFolderTree({
 								className="shrink-0 h-3.5 w-3.5 sm:h-4 sm:w-4 border-slate-400/20 dark:border-white/20"
 							/>
 							<HardDrive className="h-3 w-3 sm:h-4 sm:w-4 text-muted-foreground shrink-0" />
-							<button
+							<Button
 								type="button"
-								className="font-semibold truncate text-xs sm:text-sm cursor-pointer bg-transparent border-0 p-0 text-left"
+								variant="ghost"
+								className="h-auto cursor-pointer truncate bg-transparent p-0 text-left text-xs font-semibold hover:bg-transparent sm:text-sm"
 								onClick={() => toggleFolderSelection("root", "My Drive")}
 							>
 								My Drive
-							</button>
+							</Button>
 						</div>
 					</div>
 
diff --git a/surfsense_web/components/contact/contact-form.tsx b/surfsense_web/components/contact/contact-form.tsx
index 2f934a0198..09b2c30a15 100644
--- a/surfsense_web/components/contact/contact-form.tsx
+++ b/surfsense_web/components/contact/contact-form.tsx
@@ -16,9 +16,9 @@ export function ContactFormGridWithDetails() {
 						<IconMailFilled className="h-6 w-6 text-blue-500" />
 					</FeatureIconContainer>
 				</div>
-				<h2 className="mt-9 bg-gradient-to-b from-neutral-800 to-neutral-900 bg-clip-text text-center text-xl font-bold text-transparent md:text-3xl lg:text-5xl dark:from-neutral-200 dark:to-neutral-300">
+				<h1 className="mt-9 bg-gradient-to-b from-neutral-800 to-neutral-900 bg-clip-text text-center text-xl font-bold text-transparent md:text-3xl lg:text-5xl dark:from-neutral-200 dark:to-neutral-300">
 					Contact
-				</h2>
+				</h1>
 				<p className="mt-8 max-w-lg text-center text-base text-neutral-600 dark:text-neutral-400">
 					We'd love to hear from you!
 				</p>
diff --git a/surfsense_web/components/container.tsx b/surfsense_web/components/container.tsx
new file mode 100644
index 0000000000..1fed862619
--- /dev/null
+++ b/surfsense_web/components/container.tsx
@@ -0,0 +1,11 @@
+import { cn } from "@/lib/utils";
+
+export function Container({
+	className,
+	children,
+}: {
+	className?: string;
+	children: React.ReactNode;
+}) {
+	return <div className={cn("mx-auto max-w-7xl px-4 sm:px-6 lg:px-8", className)}>{children}</div>;
+}
diff --git a/surfsense_web/components/desktop/shortcut-recorder.tsx b/surfsense_web/components/desktop/shortcut-recorder.tsx
index c872afaf11..cd49eac719 100644
--- a/surfsense_web/components/desktop/shortcut-recorder.tsx
+++ b/surfsense_web/components/desktop/shortcut-recorder.tsx
@@ -38,7 +38,7 @@ export function acceleratorToDisplay(accel: string): string[] {
 export const DEFAULT_SHORTCUTS = {
 	generalAssist: "CommandOrControl+Shift+S",
 	quickAsk: "CommandOrControl+Alt+S",
-	autocomplete: "CommandOrControl+Shift+Space",
+	screenshotAssist: "CommandOrControl+Shift+Space",
 };
 
 // ---------------------------------------------------------------------------
@@ -48,9 +48,9 @@ export const DEFAULT_SHORTCUTS = {
 export function Kbd({ keys, className }: { keys: string[]; className?: string }) {
 	return (
 		<span className={cn("inline-flex items-center gap-0.5", className)}>
-			{keys.map((key, i) => (
+			{keys.map((key) => (
 				<kbd
-					key={`${key}-${i}`}
+					key={key}
 					className={cn(
 						"inline-flex h-6 min-w-6 items-center justify-center rounded border bg-muted px-1 font-mono text-[11px] font-medium text-muted-foreground",
 						key.length > 3 && "px-1.5"
@@ -136,14 +136,15 @@ export function ShortcutRecorder({
 						<RotateCcw className="size-3" />
 					</Button>
 				)}
-				<button
+				<Button
+					variant="ghost"
 					ref={inputRef}
 					type="button"
 					onClick={() => setRecording(true)}
 					onKeyDown={handleKeyDown}
 					onBlur={() => setRecording(false)}
 					className={cn(
-						"flex h-7 items-center gap-0.5 rounded-md border px-2 transition-all focus:outline-none",
+						"h-7 justify-start gap-0.5 rounded-md border px-2 transition-all",
 						recording
 							? "border-primary bg-primary/5 ring-2 ring-primary/20"
 							: "border-input bg-muted/40 hover:bg-muted"
@@ -156,7 +157,7 @@ export function ShortcutRecorder({
 					) : (
 						<Kbd keys={displayKeys} />
 					)}
-				</button>
+				</Button>
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/components/document-viewer.tsx b/surfsense_web/components/document-viewer.tsx
index 0f283e5671..710a04ba3f 100644
--- a/surfsense_web/components/document-viewer.tsx
+++ b/surfsense_web/components/document-viewer.tsx
@@ -32,7 +32,7 @@ export function DocumentViewer({ title, content, trigger }: DocumentViewerProps)
 					<DialogTitle>{title}</DialogTitle>
 				</DialogHeader>
 				<div className="mt-4">
-					<MarkdownViewer content={content} />
+					<MarkdownViewer content={content} enableCitations />
 				</div>
 			</DialogContent>
 		</Dialog>
diff --git a/surfsense_web/components/documents/CreateFolderDialog.tsx b/surfsense_web/components/documents/CreateFolderDialog.tsx
index 55548146fd..5ecfebbe7c 100644
--- a/surfsense_web/components/documents/CreateFolderDialog.tsx
+++ b/surfsense_web/components/documents/CreateFolderDialog.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { useCallback, useEffect, useRef, useState } from "react";
+import { useCallback, useRef, useState } from "react";
 import { Button } from "@/components/ui/button";
 import {
 	Dialog,
@@ -29,12 +29,16 @@ export function CreateFolderDialog({
 	const [name, setName] = useState("");
 	const inputRef = useRef<HTMLInputElement>(null);
 
-	useEffect(() => {
-		if (open) {
-			setName("");
-			setTimeout(() => inputRef.current?.focus(), 0);
-		}
-	}, [open]);
+	const handleOpenChange = useCallback(
+		(next: boolean) => {
+			if (next) {
+				setName("");
+				setTimeout(() => inputRef.current?.focus(), 0);
+			}
+			onOpenChange(next);
+		},
+		[onOpenChange]
+	);
 
 	const handleSubmit = useCallback(
 		(e?: React.FormEvent) => {
@@ -50,7 +54,7 @@ export function CreateFolderDialog({
 	const isSubfolder = !!parentFolderName;
 
 	return (
-		<Dialog open={open} onOpenChange={onOpenChange}>
+		<Dialog open={open} onOpenChange={handleOpenChange}>
 			<DialogContent className="select-none max-w-[90vw] sm:max-w-sm p-4 sm:p-5 data-[state=open]:animate-none data-[state=closed]:animate-none">
 				<DialogHeader className="space-y-2 pb-2">
 					<div className="flex items-center gap-2 sm:gap-3">
diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx
index 04cec5f89b..0f3cd4a192 100644
--- a/surfsense_web/components/documents/DocumentNode.tsx
+++ b/surfsense_web/components/documents/DocumentNode.tsx
@@ -8,7 +8,7 @@ import {
 	History,
 	MoreHorizontal,
 	Move,
-	PenLine,
+	Pencil,
 	Trash2,
 } from "lucide-react";
 import React, { useCallback, useRef, useState } from "react";
@@ -82,11 +82,11 @@ export const DocumentNode = React.memo(function DocumentNode({
 	onContextMenuOpenChange,
 }: DocumentNodeProps) {
 	const statusState = doc.status?.state ?? "ready";
-	const isSelectable = statusState !== "pending" && statusState !== "processing";
-	const isEditable =
-		EDITABLE_DOCUMENT_TYPES.has(doc.document_type) &&
-		statusState !== "pending" &&
-		statusState !== "processing";
+	const isFailed = statusState === "failed";
+	const isProcessing = statusState === "pending" || statusState === "processing";
+	const isUnavailable = isProcessing || isFailed;
+	const isSelectable = !isUnavailable;
+	const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type) && !isUnavailable;
 
 	const handleCheckChange = useCallback(() => {
 		if (isSelectable) {
@@ -103,7 +103,6 @@ export const DocumentNode = React.memo(function DocumentNode({
 		[doc.id]
 	);
 
-	const isProcessing = statusState === "pending" || statusState === "processing";
 	const [dropdownOpen, setDropdownOpen] = useState(false);
 	const [exporting, setExporting] = useState<string | null>(null);
 	const [titleTooltipOpen, setTitleTooltipOpen] = useState(false);
@@ -139,24 +138,14 @@ export const DocumentNode = React.memo(function DocumentNode({
 	return (
 		<ContextMenu onOpenChange={onContextMenuOpenChange}>
 			<ContextMenuTrigger asChild>
-				{/* biome-ignore lint/a11y/useSemanticElements: contains nested interactive children (Checkbox) that render as <button>, making a semantic <button> wrapper invalid */}
 				<div
-					role="button"
-					tabIndex={0}
 					ref={attachRef}
 					className={cn(
-						"group flex h-8 w-full items-center gap-2.5 rounded-md px-1 text-sm hover:bg-accent/50 cursor-pointer select-none text-left",
-						isMentioned && "bg-accent/30",
+						"group flex h-8 w-full items-center gap-2.5 rounded-md px-1 text-sm hover:bg-accent hover:text-accent-foreground cursor-pointer select-none text-left",
+						isMentioned && "bg-accent text-accent-foreground",
 						isDragging && "opacity-40"
 					)}
 					style={{ paddingLeft: `${depth * 16 + 4}px` }}
-					onClick={handleCheckChange}
-					onKeyDown={(e) => {
-						if (e.key === "Enter" || e.key === " ") {
-							e.preventDefault();
-							handleCheckChange();
-						}
-					}}
 				>
 					{(() => {
 						if (statusState === "pending") {
@@ -213,9 +202,17 @@ export const DocumentNode = React.memo(function DocumentNode({
 						onOpenChange={handleTitleTooltipOpenChange}
 					>
 						<TooltipTrigger asChild>
-							<span ref={titleRef} className="flex-1 min-w-0 truncate">
-								{doc.title}
-							</span>
+							<Button
+								type="button"
+								variant="ghost"
+								aria-disabled={!isSelectable}
+								onClick={handleCheckChange}
+								className="h-full min-w-0 flex-1 justify-start bg-transparent px-0 py-0 text-left font-normal text-inherit hover:bg-transparent hover:text-inherit"
+							>
+								<span ref={titleRef} className="min-w-0 flex-1 truncate">
+									{doc.title}
+								</span>
+							</Button>
 						</TooltipTrigger>
 						<TooltipContent side="bottom" className="max-w-xs break-words">
 							{doc.title}
@@ -261,13 +258,13 @@ export const DocumentNode = React.memo(function DocumentNode({
 								className="w-40"
 								onClick={(e) => e.stopPropagation()}
 							>
-								<DropdownMenuItem onClick={() => onPreview(doc)} disabled={isProcessing}>
+								<DropdownMenuItem onClick={() => onPreview(doc)} disabled={isUnavailable}>
 									<Eye className="mr-2 h-4 w-4" />
 									Open
 								</DropdownMenuItem>
 								{isEditable && (
 									<DropdownMenuItem onClick={() => onEdit(doc)}>
-										<PenLine className="mr-2 h-4 w-4" />
+										<Pencil className="mr-2 h-4 w-4" />
 										Edit
 									</DropdownMenuItem>
 								)}
@@ -277,7 +274,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 								</DropdownMenuItem>
 								{onExport && (
 									<DropdownMenuSub>
-										<DropdownMenuSubTrigger disabled={isProcessing}>
+										<DropdownMenuSubTrigger disabled={isUnavailable}>
 											<Download className="mr-2 h-4 w-4" />
 											Export
 										</DropdownMenuSubTrigger>
@@ -287,7 +284,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 									</DropdownMenuSub>
 								)}
 								{onVersionHistory && isVersionableType(doc.document_type) && (
-									<DropdownMenuItem disabled={isProcessing} onClick={() => onVersionHistory(doc)}>
+									<DropdownMenuItem disabled={isUnavailable} onClick={() => onVersionHistory(doc)}>
 										<History className="mr-2 h-4 w-4" />
 										Versions
 									</DropdownMenuItem>
@@ -304,13 +301,13 @@ export const DocumentNode = React.memo(function DocumentNode({
 
 			{contextMenuOpen && (
 				<ContextMenuContent className="w-40" onClick={(e) => e.stopPropagation()}>
-					<ContextMenuItem onClick={() => onPreview(doc)} disabled={isProcessing}>
+					<ContextMenuItem onClick={() => onPreview(doc)} disabled={isUnavailable}>
 						<Eye className="mr-2 h-4 w-4" />
 						Open
 					</ContextMenuItem>
 					{isEditable && (
 						<ContextMenuItem onClick={() => onEdit(doc)}>
-							<PenLine className="mr-2 h-4 w-4" />
+							<Pencil className="mr-2 h-4 w-4" />
 							Edit
 						</ContextMenuItem>
 					)}
@@ -320,7 +317,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 					</ContextMenuItem>
 					{onExport && (
 						<ContextMenuSub>
-							<ContextMenuSubTrigger disabled={isProcessing}>
+							<ContextMenuSubTrigger disabled={isUnavailable}>
 								<Download className="mr-2 h-4 w-4" />
 								Export
 							</ContextMenuSubTrigger>
@@ -330,7 +327,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 						</ContextMenuSub>
 					)}
 					{onVersionHistory && isVersionableType(doc.document_type) && (
-						<ContextMenuItem disabled={isProcessing} onClick={() => onVersionHistory(doc)}>
+						<ContextMenuItem disabled={isUnavailable} onClick={() => onVersionHistory(doc)}>
 							<History className="mr-2 h-4 w-4" />
 							Versions
 						</ContextMenuItem>
diff --git a/surfsense_web/components/documents/DocumentTypeIcon.tsx b/surfsense_web/components/documents/DocumentTypeIcon.tsx
index 5c03d96faa..026fb46f15 100644
--- a/surfsense_web/components/documents/DocumentTypeIcon.tsx
+++ b/surfsense_web/components/documents/DocumentTypeIcon.tsx
@@ -4,53 +4,12 @@ import type React from "react";
 import { useEffect, useRef, useState } from "react";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
+import { getDocumentTypeLabel } from "@/lib/documents/document-type-labels";
 
 export function getDocumentTypeIcon(type: string, className?: string): React.ReactNode {
 	return getConnectorIcon(type, className);
 }
 
-export function getDocumentTypeLabel(type: string): string {
-	const labelMap: Record<string, string> = {
-		EXTENSION: "Extension",
-		CRAWLED_URL: "Web Page",
-		FILE: "File",
-		SLACK_CONNECTOR: "Slack",
-		TEAMS_CONNECTOR: "Microsoft Teams",
-		ONEDRIVE_FILE: "OneDrive",
-		DROPBOX_FILE: "Dropbox",
-		NOTION_CONNECTOR: "Notion",
-		YOUTUBE_VIDEO: "YouTube Video",
-		GITHUB_CONNECTOR: "GitHub",
-		LINEAR_CONNECTOR: "Linear",
-		DISCORD_CONNECTOR: "Discord",
-		JIRA_CONNECTOR: "Jira",
-		CONFLUENCE_CONNECTOR: "Confluence",
-		CLICKUP_CONNECTOR: "ClickUp",
-		GOOGLE_CALENDAR_CONNECTOR: "Google Calendar",
-		GOOGLE_GMAIL_CONNECTOR: "Gmail",
-		GOOGLE_DRIVE_FILE: "Google Drive",
-		AIRTABLE_CONNECTOR: "Airtable",
-		LUMA_CONNECTOR: "Luma",
-		ELASTICSEARCH_CONNECTOR: "Elasticsearch",
-		BOOKSTACK_CONNECTOR: "BookStack",
-		CIRCLEBACK: "Circleback",
-		OBSIDIAN_CONNECTOR: "Obsidian",
-		LOCAL_FOLDER_FILE: "Local Folder",
-		SURFSENSE_DOCS: "SurfSense Docs",
-		NOTE: "Note",
-		COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Composio Google Drive",
-		COMPOSIO_GMAIL_CONNECTOR: "Composio Gmail",
-		COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Composio Google Calendar",
-	};
-	return (
-		labelMap[type] ||
-		type
-			.split("_")
-			.map((word) => word.charAt(0) + word.slice(1).toLowerCase())
-			.join(" ")
-	);
-}
-
 export function DocumentTypeChip({ type, className }: { type: string; className?: string }) {
 	const icon = getDocumentTypeIcon(type, "h-4 w-4");
 	const fullLabel = getDocumentTypeLabel(type);
diff --git a/surfsense_web/components/documents/DocumentsFilters.tsx b/surfsense_web/components/documents/DocumentsFilters.tsx
index a795b61c7c..86e91ee42b 100644
--- a/surfsense_web/components/documents/DocumentsFilters.tsx
+++ b/surfsense_web/components/documents/DocumentsFilters.tsx
@@ -1,5 +1,6 @@
 "use client";
 
+import { IconBinaryTree, IconBinaryTreeFilled } from "@tabler/icons-react";
 import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react";
 import { useTranslations } from "next-intl";
 import React, { useCallback, useMemo, useRef, useState } from "react";
@@ -8,10 +9,13 @@ import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
 import { Input } from "@/components/ui/input";
 import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { Spinner } from "@/components/ui/spinner";
 import { ToggleGroup, ToggleGroupItem } from "@/components/ui/toggle-group";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import type { DocumentTypeEnum } from "@/contracts/types/document.types";
-import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon";
+import { getDocumentTypeLabel } from "@/lib/documents/document-type-labels";
+import { cn } from "@/lib/utils";
+import { getDocumentTypeIcon } from "./DocumentTypeIcon";
 
 export function DocumentsFilters({
 	typeCounts: typeCountsRecord,
@@ -20,6 +24,10 @@ export function DocumentsFilters({
 	onToggleType,
 	activeTypes,
 	onCreateFolder,
+	aiSortEnabled = false,
+	aiSortBusy = false,
+	onToggleAiSort,
+	onUploadClick,
 }: {
 	typeCounts: Partial<Record<DocumentTypeEnum, number>>;
 	onSearch: (v: string) => void;
@@ -27,12 +35,17 @@ export function DocumentsFilters({
 	onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
 	activeTypes: DocumentTypeEnum[];
 	onCreateFolder?: () => void;
+	aiSortEnabled?: boolean;
+	aiSortBusy?: boolean;
+	onToggleAiSort?: () => void;
+	onUploadClick?: () => void;
 }) {
 	const t = useTranslations("documents");
 	const id = React.useId();
 	const inputRef = useRef<HTMLInputElement>(null);
 
 	const { openDialog: openUploadDialog } = useDocumentUploadDialog();
+	const handleUpload = onUploadClick ?? openUploadDialog;
 
 	const [typeSearchQuery, setTypeSearchQuery] = useState("");
 	const [scrollPos, setScrollPos] = useState<"top" | "middle" | "bottom">("top");
@@ -64,37 +77,77 @@ export function DocumentsFilters({
 	return (
 		<div className="flex select-none">
 			<div className="flex items-center gap-2 w-full">
-				{/* Filter + New Folder Toggle Group */}
-				<ToggleGroup type="multiple" variant="outline" value={[]} className="overflow-visible">
+				{/* New Folder + AI Sort + Filter Toggle Group */}
+				<ToggleGroup type="multiple" value={[]} className="overflow-visible">
 					{onCreateFolder && (
 						<Tooltip>
 							<TooltipTrigger asChild>
 								<ToggleGroupItem
 									value="folder"
-									className="h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
+									className="h-8 w-8 shrink-0 border-0 bg-muted text-muted-foreground transition-colors hover:bg-accent hover:text-accent-foreground"
 									onClick={(e) => {
 										e.preventDefault();
 										onCreateFolder();
 									}}
 								>
-									<FolderPlus size={14} />
+									<FolderPlus size={13} />
 								</ToggleGroupItem>
 							</TooltipTrigger>
 							<TooltipContent>New folder</TooltipContent>
 						</Tooltip>
 					)}
 
+					{onToggleAiSort && (
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<ToggleGroupItem
+									value="ai-sort"
+									disabled={aiSortBusy}
+									className={cn(
+										"h-8 w-8 shrink-0 border-0 bg-muted transition-colors",
+										"relative before:absolute before:left-0 before:top-1/2 before:h-4 before:w-px before:-translate-y-1/2 before:bg-border/60 before:content-[''] dark:before:bg-white/10",
+										"disabled:pointer-events-none disabled:opacity-50",
+										aiSortEnabled
+											? "bg-accent text-accent-foreground hover:bg-accent"
+											: "text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+									)}
+									onClick={(e) => {
+										e.preventDefault();
+										onToggleAiSort();
+									}}
+									aria-label={aiSortEnabled ? "Disable AI sort" : "Enable AI sort"}
+									aria-pressed={aiSortEnabled}
+								>
+									{aiSortBusy ? (
+										<Spinner size="xs" />
+									) : aiSortEnabled ? (
+										<IconBinaryTreeFilled size={14} />
+									) : (
+										<IconBinaryTree size={14} />
+									)}
+								</ToggleGroupItem>
+							</TooltipTrigger>
+							<TooltipContent>
+								{aiSortBusy
+									? "AI sort in progress..."
+									: aiSortEnabled
+										? "AI sort active — click to disable"
+										: "Enable AI sort"}
+							</TooltipContent>
+						</Tooltip>
+					)}
+
 					<Popover>
 						<Tooltip>
 							<TooltipTrigger asChild>
 								<PopoverTrigger asChild>
 									<ToggleGroupItem
 										value="filter"
-										className="relative h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar overflow-visible"
+										className="relative h-8 w-8 shrink-0 overflow-visible border-0 bg-muted text-muted-foreground transition-colors before:absolute before:left-0 before:top-1/2 before:h-4 before:w-px before:-translate-y-1/2 before:bg-border/60 before:content-[''] hover:bg-accent hover:text-accent-foreground dark:before:bg-white/10"
 									>
-										<ListFilter size={14} />
+										<ListFilter size={13} />
 										{activeTypes.length > 0 && (
-											<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-sidebar-border text-[9px] font-medium text-sidebar-foreground">
+											<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-neutral-300 text-[9px] font-medium text-neutral-700 dark:bg-neutral-700 dark:text-neutral-200">
 												{activeTypes.length}
 											</span>
 										)}
@@ -136,7 +189,7 @@ export function DocumentsFilters({
 												aria-selected={activeTypes.includes(value)}
 												tabIndex={0}
 												key={value}
-												className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-neutral-200 dark:hover:bg-neutral-700 transition-colors cursor-pointer text-left"
+												className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-accent hover:text-accent-foreground transition-colors cursor-pointer text-left"
 												onClick={() => onToggleType(value, !activeTypes.includes(value))}
 												onKeyDown={(e) => {
 													if (e.key === "Enter" || e.key === " ") {
@@ -175,12 +228,12 @@ export function DocumentsFilters({
 				{/* Search Input */}
 				<div className="relative flex-1 min-w-0">
 					<div className="pointer-events-none absolute inset-y-0 left-0 flex items-center pl-3 text-muted-foreground">
-						<Search size={14} aria-hidden="true" />
+						<Search size={13} aria-hidden="true" />
 					</div>
 					<Input
 						id={`${id}-input`}
 						ref={inputRef}
-						className="peer h-9 w-full pl-9 pr-9 text-sm bg-sidebar border-border/60 select-none focus:select-text"
+						className="h-8 w-full select-none border-0 bg-muted pl-8 pr-7 text-sm shadow-none focus:select-text"
 						value={searchValue}
 						onChange={(e) => onSearch(e.target.value)}
 						placeholder="Search docs"
@@ -188,9 +241,11 @@ export function DocumentsFilters({
 						aria-label={t("filter_placeholder")}
 					/>
 					{Boolean(searchValue) && (
-						<button
+						<Button
 							type="button"
-							className="absolute inset-y-0 right-0 flex h-full w-9 items-center justify-center rounded-r-md text-muted-foreground hover:text-foreground transition-colors"
+							variant="ghost"
+							size="icon"
+							className="absolute right-1 top-1/2 h-5 w-5 -translate-y-1/2 rounded-sm text-muted-foreground transition-colors hover:bg-accent hover:text-accent-foreground"
 							aria-label="Clear filter"
 							onClick={() => {
 								onSearch("");
@@ -198,19 +253,19 @@ export function DocumentsFilters({
 							}}
 						>
 							<X size={14} strokeWidth={2} aria-hidden="true" />
-						</button>
+						</Button>
 					)}
 				</div>
 
 				{/* Upload Button */}
 				<Button
 					data-joyride="upload-button"
-					onClick={openUploadDialog}
+					onClick={handleUpload}
 					variant="outline"
 					size="sm"
-					className="h-9 shrink-0 gap-1.5 bg-white text-gray-700 border-white hover:bg-gray-50 dark:bg-white dark:text-gray-800 dark:hover:bg-gray-100"
+					className="h-8 shrink-0 gap-1.5 border-0 bg-white text-gray-700 shadow-none hover:bg-accent hover:text-accent-foreground dark:bg-white dark:text-gray-800"
 				>
-					<Upload size={14} />
+					<Upload size={13} />
 					<span>Upload</span>
 				</Button>
 			</div>
diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx
index 7f75f8abfc..2506713a46 100644
--- a/surfsense_web/components/documents/FolderNode.tsx
+++ b/surfsense_web/components/documents/FolderNode.tsx
@@ -4,6 +4,7 @@ import {
 	AlertCircle,
 	ChevronDown,
 	ChevronRight,
+	Download,
 	Eye,
 	EyeOff,
 	Folder,
@@ -11,7 +12,7 @@ import {
 	FolderPlus,
 	MoreHorizontal,
 	Move,
-	PenLine,
+	Pencil,
 	RefreshCw,
 	Trash2,
 } from "lucide-react";
@@ -80,6 +81,7 @@ interface FolderNodeProps {
 	isWatched?: boolean;
 	onRescan?: (folder: FolderDisplay) => void | Promise<void>;
 	onStopWatching?: (folder: FolderDisplay) => void;
+	onExportFolder?: (folder: FolderDisplay) => void;
 }
 
 function getDropZone(
@@ -120,6 +122,7 @@ export const FolderNode = React.memo(function FolderNode({
 	isWatched,
 	onRescan,
 	onStopWatching,
+	onExportFolder,
 }: FolderNodeProps) {
 	const [renameValue, setRenameValue] = useState(folder.name);
 	const inputRef = useRef<HTMLInputElement>(null);
@@ -257,7 +260,7 @@ export const FolderNode = React.memo(function FolderNode({
 					role="button"
 					tabIndex={0}
 					className={cn(
-						"group relative flex h-8 items-center gap-1 rounded-md px-1 text-sm hover:bg-accent/50 cursor-pointer select-none",
+						"group relative flex h-8 items-center gap-1 rounded-md px-1 text-sm hover:bg-accent hover:text-accent-foreground cursor-pointer select-none",
 						isExpanded && "font-medium",
 						isDragging && "opacity-40",
 						isOver && canDrop && dropZone === "middle" && "bg-accent ring-1 ring-primary/40",
@@ -396,7 +399,7 @@ export const FolderNode = React.memo(function FolderNode({
 										startRename();
 									}}
 								>
-									<PenLine className="mr-2 h-4 w-4" />
+									<Pencil className="mr-2 h-4 w-4" />
 									Rename
 								</DropdownMenuItem>
 								<DropdownMenuItem
@@ -408,6 +411,17 @@ export const FolderNode = React.memo(function FolderNode({
 									<Move className="mr-2 h-4 w-4" />
 									Move to...
 								</DropdownMenuItem>
+								{onExportFolder && (
+									<DropdownMenuItem
+										onClick={(e) => {
+											e.stopPropagation();
+											onExportFolder(folder);
+										}}
+									>
+										<Download className="mr-2 h-4 w-4" />
+										Export folder
+									</DropdownMenuItem>
+								)}
 								<DropdownMenuItem
 									onClick={(e) => {
 										e.stopPropagation();
@@ -442,13 +456,19 @@ export const FolderNode = React.memo(function FolderNode({
 						New subfolder
 					</ContextMenuItem>
 					<ContextMenuItem onClick={() => startRename()}>
-						<PenLine className="mr-2 h-4 w-4" />
+						<Pencil className="mr-2 h-4 w-4" />
 						Rename
 					</ContextMenuItem>
 					<ContextMenuItem onClick={() => onMove(folder)}>
 						<Move className="mr-2 h-4 w-4" />
 						Move to...
 					</ContextMenuItem>
+					{onExportFolder && (
+						<ContextMenuItem onClick={() => onExportFolder(folder)}>
+							<Download className="mr-2 h-4 w-4" />
+							Export folder
+						</ContextMenuItem>
+					)}
 					<ContextMenuItem onClick={() => onDelete(folder)}>
 						<Trash2 className="mr-2 h-4 w-4" />
 						Delete
diff --git a/surfsense_web/components/documents/FolderPickerDialog.tsx b/surfsense_web/components/documents/FolderPickerDialog.tsx
index 59e02f7262..13cff6aced 100644
--- a/surfsense_web/components/documents/FolderPickerDialog.tsx
+++ b/surfsense_web/components/documents/FolderPickerDialog.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { ChevronDown, ChevronRight, Folder, FolderOpen, Home } from "lucide-react";
-import { useCallback, useEffect, useMemo, useState } from "react";
+import { useCallback, useMemo, useState } from "react";
 import { Button } from "@/components/ui/button";
 import {
 	Dialog,
@@ -36,12 +36,16 @@ export function FolderPickerDialog({
 	const [selectedId, setSelectedId] = useState<number | null>(null);
 	const [expandedIds, setExpandedIds] = useState<Set<number>>(new Set());
 
-	useEffect(() => {
-		if (open) {
-			setSelectedId(null);
-			setExpandedIds(new Set());
-		}
-	}, [open]);
+	const handleOpenChange = useCallback(
+		(next: boolean) => {
+			if (next) {
+				setSelectedId(null);
+				setExpandedIds(new Set());
+			}
+			onOpenChange(next);
+		},
+		[onOpenChange]
+	);
 
 	const foldersByParent = useMemo(() => {
 		const map: Record<string, FolderDisplay[]> = {};
@@ -81,49 +85,54 @@ export function FolderPickerDialog({
 			const FolderIcon = isExpanded ? FolderOpen : Folder;
 
 			return [
-				<button
-					key={f.id}
-					type="button"
-					disabled={isDisabled}
-					className={cn(
-						"flex w-full items-center gap-1.5 rounded-md px-2 py-1.5 text-sm transition-colors",
-						isSelected && "bg-accent text-accent-foreground",
-						!isSelected && !isDisabled && "hover:bg-accent/50",
-						isDisabled && "cursor-not-allowed opacity-40"
-					)}
-					style={{ paddingLeft: `${depth * 16 + 8}px` }}
-					onClick={() => {
-						if (!isDisabled) setSelectedId(f.id);
-					}}
-				>
-					{hasChildren ? (
-						<button
+				<div key={f.id} className="relative w-full">
+					{hasChildren && (
+						<Button
 							type="button"
-							className="flex h-4 w-4 shrink-0 items-center justify-center"
+							variant="ghost"
+							size="icon"
+							disabled={isDisabled}
+							className="absolute top-1/2 z-10 size-4 -translate-y-1/2 p-0"
+							style={{ left: `${depth * 16 + 8}px` }}
+							aria-label={isExpanded ? `Collapse ${f.name}` : `Expand ${f.name}`}
 							onClick={(e) => {
 								e.stopPropagation();
 								toggleExpand(f.id);
 							}}
 						>
 							{isExpanded ? (
-								<ChevronDown className="h-3.5 w-3.5" />
+								<ChevronDown data-icon="inline-start" />
 							) : (
-								<ChevronRight className="h-3.5 w-3.5" />
+								<ChevronRight data-icon="inline-start" />
 							)}
-						</button>
-					) : (
-						<span className="h-4 w-4 shrink-0" />
+						</Button>
 					)}
-					<FolderIcon className="h-4 w-4 shrink-0 text-muted-foreground" />
-					<span className="truncate">{f.name}</span>
-				</button>,
+					<Button
+						type="button"
+						variant="ghost"
+						disabled={isDisabled}
+						className={cn(
+							"h-auto w-full justify-start gap-1.5 px-2 py-1.5 text-sm font-normal",
+							isSelected && "bg-accent text-accent-foreground",
+							isDisabled && "cursor-not-allowed opacity-40"
+						)}
+						style={{ paddingLeft: `${depth * 16 + 8}px` }}
+						onClick={() => {
+							if (!isDisabled) setSelectedId(f.id);
+						}}
+					>
+						<span className="size-4 shrink-0" />
+						<FolderIcon data-icon="inline-start" className="text-muted-foreground" />
+						<span className="truncate">{f.name}</span>
+					</Button>
+				</div>,
 				...(isExpanded ? renderPickerLevel(f.id, depth + 1) : []),
 			];
 		});
 	}
 
 	return (
-		<Dialog open={open} onOpenChange={onOpenChange}>
+		<Dialog open={open} onOpenChange={handleOpenChange}>
 			<DialogContent className="select-none max-w-[90vw] sm:max-w-sm p-4 sm:p-5 data-[state=open]:animate-none data-[state=closed]:animate-none">
 				<DialogHeader className="space-y-2 pb-2">
 					<div className="flex items-center gap-2 sm:gap-3">
@@ -139,19 +148,19 @@ export function FolderPickerDialog({
 				</DialogHeader>
 
 				<div className="max-h-[300px] overflow-y-auto rounded-md border p-1">
-					<button
+					<Button
 						type="button"
+						variant="ghost"
 						className={cn(
-							"flex w-full items-center gap-1.5 rounded-md px-2 py-1.5 text-sm transition-colors",
-							selectedId === null && "bg-accent text-accent-foreground",
-							selectedId !== null && "hover:bg-accent/50"
+							"h-auto w-full justify-start gap-1.5 px-2 py-1.5 text-sm font-normal",
+							selectedId === null && "bg-accent text-accent-foreground"
 						)}
 						onClick={() => setSelectedId(null)}
 					>
-						<span className="h-4 w-4 shrink-0" />
-						<Home className="h-4 w-4 shrink-0 text-muted-foreground" />
+						<span className="size-4 shrink-0" />
+						<Home data-icon="inline-start" className="text-muted-foreground" />
 						<span>Root</span>
-					</button>
+					</Button>
 					{renderPickerLevel(null, 1)}
 				</div>
 
diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx
index 6eb53da50c..eeeb1e779d 100644
--- a/surfsense_web/components/documents/FolderTreeView.tsx
+++ b/surfsense_web/components/documents/FolderTreeView.tsx
@@ -7,6 +7,7 @@ import { DndProvider } from "react-dnd";
 import { HTML5Backend } from "react-dnd-html5-backend";
 import { renamingFolderIdAtom } from "@/atoms/documents/folder.atoms";
 import type { DocumentTypeEnum } from "@/contracts/types/document.types";
+import { getMentionDocKey } from "@/lib/chat/mention-doc-key";
 import { DocumentNode, type DocumentNodeDoc } from "./DocumentNode";
 import { type FolderDisplay, FolderNode } from "./FolderNode";
 
@@ -17,7 +18,7 @@ interface FolderTreeViewProps {
 	documents: DocumentNodeDoc[];
 	expandedIds: Set<number>;
 	onToggleExpand: (folderId: number) => void;
-	mentionedDocIds: Set<number>;
+	mentionedDocKeys: Set<string>;
 	onToggleChatMention: (
 		doc: { id: number; title: string; document_type: string },
 		isMentioned: boolean
@@ -44,6 +45,7 @@ interface FolderTreeViewProps {
 	watchedFolderIds?: Set<number>;
 	onRescanFolder?: (folder: FolderDisplay) => void;
 	onStopWatchingFolder?: (folder: FolderDisplay) => void;
+	onExportFolder?: (folder: FolderDisplay) => void;
 }
 
 function groupBy<T>(items: T[], keyFn: (item: T) => string | number): Record<string | number, T[]> {
@@ -61,7 +63,7 @@ export function FolderTreeView({
 	documents,
 	expandedIds,
 	onToggleExpand,
-	mentionedDocIds,
+	mentionedDocKeys,
 	onToggleChatMention,
 	onToggleFolderSelect,
 	onRenameFolder,
@@ -81,6 +83,7 @@ export function FolderTreeView({
 	watchedFolderIds,
 	onRescanFolder,
 	onStopWatchingFolder,
+	onExportFolder,
 }: FolderTreeViewProps) {
 	const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]);
 
@@ -88,6 +91,8 @@ export function FolderTreeView({
 
 	const [openContextMenuId, setOpenContextMenuId] = useState<string | null>(null);
 
+	const [manuallyCollapsedAiIds, setManuallyCollapsedAiIds] = useState<Set<number>>(new Set());
+
 	// Single subscription for rename state — derived boolean passed to each FolderNode
 	const [renamingFolderId, setRenamingFolderId] = useAtom(renamingFolderIdAtom);
 	const handleStartRename = useCallback(
@@ -96,6 +101,38 @@ export function FolderTreeView({
 	);
 	const handleCancelRename = useCallback(() => setRenamingFolderId(null), [setRenamingFolderId]);
 
+	const aiSortFolderLevels = useMemo(() => {
+		const map = new Map<number, number>();
+		for (const f of folders) {
+			if (f.metadata?.ai_sort === true && typeof f.metadata?.ai_sort_level === "number") {
+				map.set(f.id, f.metadata.ai_sort_level as number);
+			}
+		}
+		return map;
+	}, [folders]);
+
+	const handleToggleExpand = useCallback(
+		(folderId: number) => {
+			const aiLevel = aiSortFolderLevels.get(folderId);
+			if (aiLevel !== undefined && aiLevel < 4) {
+				// AI-auto-expanded folder: only toggle the manual-collapse set.
+				// Calling onToggleExpand would add it to expandedIds and fight auto-expand.
+				setManuallyCollapsedAiIds((prev) => {
+					const next = new Set(prev);
+					if (next.has(folderId)) {
+						next.delete(folderId);
+					} else {
+						next.add(folderId);
+					}
+					return next;
+				});
+				return;
+			}
+			onToggleExpand(folderId);
+		},
+		[onToggleExpand, aiSortFolderLevels]
+	);
+
 	const effectiveActiveTypes = useMemo(() => {
 		if (
 			activeTypes.includes("FILE" as DocumentTypeEnum) &&
@@ -139,34 +176,25 @@ export function FolderTreeView({
 	}, [folders, docsByFolder, foldersByParent, effectiveActiveTypes, searchQuery]);
 
 	const folderSelectionStates = useMemo(() => {
+		// One folder = one chip. The checkbox now reflects whether the
+		// folder itself is mentioned, not whether every nested doc is —
+		// that reverses the old subtree-fanout semantics in
+		// ``DocumentsSidebar.handleToggleFolderSelect``. We keep the
+		// ``"all" | "some" | "none"`` tri-state on the type so the
+		// existing ``FolderNode`` UI (which renders an indeterminate
+		// glyph for ``"some"``) stays compatible, but only ``"all"``
+		// and ``"none"`` are used in practice.
 		const states: Record<number, FolderSelectionState> = {};
-		const isSelectable = (d: DocumentNodeDoc) =>
-			d.status?.state !== "pending" && d.status?.state !== "processing";
-
-		function compute(folderId: number): { selected: number; total: number } {
-			const directDocs = (docsByFolder[folderId] ?? []).filter(isSelectable);
-			let selected = directDocs.filter((d) => mentionedDocIds.has(d.id)).length;
-			let total = directDocs.length;
-
-			for (const child of foldersByParent[folderId] ?? []) {
-				const sub = compute(child.id);
-				selected += sub.selected;
-				total += sub.total;
-			}
-
-			if (total === 0) states[folderId] = "none";
-			else if (selected === total) states[folderId] = "all";
-			else if (selected > 0) states[folderId] = "some";
-			else states[folderId] = "none";
-
-			return { selected, total };
-		}
-
 		for (const f of folders) {
-			if (states[f.id] === undefined) compute(f.id);
+			const folderMentionKey = getMentionDocKey({
+				id: f.id,
+				document_type: "FOLDER",
+				kind: "folder",
+			});
+			states[f.id] = mentionedDocKeys.has(folderMentionKey) ? "all" : "none";
 		}
 		return states;
-	}, [folders, docsByFolder, foldersByParent, mentionedDocIds]);
+	}, [folders, mentionedDocKeys]);
 
 	const folderMap = useMemo(() => {
 		const map: Record<number, FolderDisplay> = {};
@@ -210,9 +238,14 @@ export function FolderTreeView({
 
 	function renderLevel(parentId: number | null, depth: number): React.ReactNode[] {
 		const key = parentId ?? "root";
-		const childFolders = (foldersByParent[key] ?? [])
-			.slice()
-			.sort((a, b) => a.position.localeCompare(b.position));
+		const childFolders = (foldersByParent[key] ?? []).slice().sort((a, b) => {
+			const aIsDate = a.metadata?.ai_sort === true && a.metadata?.ai_sort_level === 2;
+			const bIsDate = b.metadata?.ai_sort === true && b.metadata?.ai_sort_level === 2;
+			if (aIsDate && bIsDate) {
+				return b.name.localeCompare(a.name);
+			}
+			return a.position.localeCompare(b.position);
+		});
 		const visibleFolders = hasDescendantMatch
 			? childFolders.filter((f) => hasDescendantMatch[f.id])
 			: childFolders;
@@ -224,6 +257,32 @@ export function FolderTreeView({
 
 		const nodes: React.ReactNode[] = [];
 
+		if (parentId === null) {
+			const processingDocs = childDocs.filter((d) => {
+				const state = d.status?.state;
+				return state === "pending" || state === "processing";
+			});
+			for (const d of processingDocs) {
+				nodes.push(
+					<DocumentNode
+						key={`doc-${d.id}`}
+						doc={d}
+						depth={depth}
+						isMentioned={mentionedDocKeys.has(getMentionDocKey(d))}
+						onToggleChatMention={onToggleChatMention}
+						onPreview={onPreviewDocument}
+						onEdit={onEditDocument}
+						onDelete={onDeleteDocument}
+						onMove={onMoveDocument}
+						onExport={onExportDocument}
+						onVersionHistory={onVersionHistory}
+						contextMenuOpen={openContextMenuId === `doc-${d.id}`}
+						onContextMenuOpenChange={(open) => setOpenContextMenuId(open ? `doc-${d.id}` : null)}
+					/>
+				);
+			}
+		}
+
 		for (let i = 0; i < visibleFolders.length; i++) {
 			const f = visibleFolders[i];
 			const siblingPositions = {
@@ -231,8 +290,15 @@ export function FolderTreeView({
 				after: i < visibleFolders.length - 1 ? visibleFolders[i + 1].position : null,
 			};
 
-			const isAutoExpanded = !!searchQuery && !!hasDescendantMatch?.[f.id];
-			const isExpanded = expandedIds.has(f.id) || isAutoExpanded;
+			const isSearchAutoExpanded = !!searchQuery && !!hasDescendantMatch?.[f.id];
+			const isAiAutoExpandCandidate =
+				f.metadata?.ai_sort === true &&
+				typeof f.metadata?.ai_sort_level === "number" &&
+				(f.metadata.ai_sort_level as number) < 4;
+			const isManuallyCollapsed = manuallyCollapsedAiIds.has(f.id);
+			const isExpanded = isManuallyCollapsed
+				? isSearchAutoExpanded
+				: expandedIds.has(f.id) || isSearchAutoExpanded || isAiAutoExpandCandidate;
 
 			nodes.push(
 				<FolderNode
@@ -244,7 +310,7 @@ export function FolderTreeView({
 					selectionState={folderSelectionStates[f.id] ?? "none"}
 					processingState={folderProcessingStates[f.id] ?? "idle"}
 					onToggleSelect={onToggleFolderSelect}
-					onToggleExpand={onToggleExpand}
+					onToggleExpand={handleToggleExpand}
 					onRename={onRenameFolder}
 					onStartRename={handleStartRename}
 					onCancelRename={handleCancelRename}
@@ -259,6 +325,7 @@ export function FolderTreeView({
 					isWatched={watchedFolderIds?.has(f.id)}
 					onRescan={onRescanFolder}
 					onStopWatching={onStopWatchingFolder}
+					onExportFolder={onExportFolder}
 				/>
 			);
 
@@ -267,13 +334,21 @@ export function FolderTreeView({
 			}
 		}
 
-		for (const d of childDocs) {
+		const remainingDocs =
+			parentId === null
+				? childDocs.filter((d) => {
+						const state = d.status?.state;
+						return state !== "pending" && state !== "processing";
+					})
+				: childDocs;
+
+		for (const d of remainingDocs) {
 			nodes.push(
 				<DocumentNode
 					key={`doc-${d.id}`}
 					doc={d}
 					depth={depth}
-					isMentioned={mentionedDocIds.has(d.id)}
+					isMentioned={mentionedDocKeys.has(getMentionDocKey(d))}
 					onToggleChatMention={onToggleChatMention}
 					onPreview={onPreviewDocument}
 					onEdit={onEditDocument}
diff --git a/surfsense_web/components/documents/version-history.tsx b/surfsense_web/components/documents/version-history.tsx
index 27343dc6a8..498e89495b 100644
--- a/surfsense_web/components/documents/version-history.tsx
+++ b/surfsense_web/components/documents/version-history.tsx
@@ -29,7 +29,7 @@ export function isVersionableType(documentType: string) {
 }
 
 const DIALOG_CLASSES =
-	"select-none max-w-[900px] w-[95vw] md:w-[90vw] h-[90vh] md:h-[80vh] max-h-[640px] flex flex-col md:flex-row p-0 gap-0 overflow-hidden [--card:var(--background)] dark:[--card:oklch(0.205_0_0)] dark:[--background:oklch(0.205_0_0)]";
+	"select-none max-w-[900px] w-[95vw] md:w-[90vw] h-[90vh] md:h-[80vh] max-h-[640px] flex flex-col md:flex-row p-0 gap-0 overflow-hidden [--card:var(--popover)]";
 
 export function VersionHistoryButton({ documentId, documentType }: VersionHistoryProps) {
 	if (!isVersionableType(documentType)) return null;
@@ -177,15 +177,16 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) {
 				<div className="flex-1 overflow-y-auto p-2">
 					<div className="flex flex-col gap-0.5">
 						{versions.map((v) => (
-							<button
+							<Button
 								key={v.version_number}
 								type="button"
+								variant="ghost"
 								onClick={() => handleSelectVersion(v.version_number)}
 								className={cn(
-									"flex items-center gap-2 rounded-lg px-3 py-2.5 text-left transition-colors focus:outline-none focus-visible:outline-none w-full",
+									"h-auto w-full justify-start gap-2 rounded-lg px-3 py-2.5 text-left transition-colors focus:outline-none focus-visible:outline-none",
 									selectedVersion === v.version_number
 										? "bg-accent text-accent-foreground"
-										: "text-muted-foreground hover:bg-accent/50 hover:text-foreground"
+										: "text-muted-foreground hover:bg-accent hover:text-accent-foreground"
 								)}
 							>
 								<div className="flex-1 min-w-0 space-y-0.5">
@@ -197,7 +198,7 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) {
 									{v.title && <p className="text-xs text-muted-foreground truncate">{v.title}</p>}
 								</div>
 								<ChevronRight className="h-3.5 w-3.5 shrink-0 opacity-50" />
-							</button>
+							</Button>
 						))}
 					</div>
 				</div>
diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx
index 7c94356d89..3cb30cee6b 100644
--- a/surfsense_web/components/editor-panel/editor-panel.tsx
+++ b/surfsense_web/components/editor-panel/editor-panel.tsx
@@ -1,18 +1,31 @@
 "use client";
 
 import { useAtomValue, useSetAtom } from "jotai";
-import { Download, FileQuestionMark, FileText, Loader2, RefreshCw, XIcon } from "lucide-react";
+import {
+	Check,
+	Copy,
+	Download,
+	FileQuestionMark,
+	FileText,
+	Pencil,
+	RefreshCw,
+	XIcon,
+} from "lucide-react";
 import dynamic from "next/dynamic";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { toast } from "sonner";
 import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { VersionHistoryButton } from "@/components/documents/version-history";
+import { SourceCodeEditor } from "@/components/editor/source-code-editor";
 import { MarkdownViewer } from "@/components/markdown-viewer";
 import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
+import { Spinner } from "@/components/ui/spinner";
 import { useMediaQuery } from "@/hooks/use-media-query";
+import { useElectronAPI } from "@/hooks/use-platform";
 import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils";
+import { inferMonacoLanguageFromPath } from "@/lib/editor-language";
 
 const PlateEditor = dynamic(
 	() => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })),
@@ -32,6 +45,43 @@ interface EditorContent {
 }
 
 const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]);
+type EditorRenderMode = "rich_markdown" | "source_code";
+
+type AgentFilesystemMount = {
+	mount: string;
+	rootPath: string;
+};
+
+function normalizeLocalVirtualPathForEditor(
+	candidatePath: string,
+	mounts: AgentFilesystemMount[]
+): string {
+	const normalizedCandidate = candidatePath.trim().replace(/\\/g, "/").replace(/\/+/g, "/");
+	if (!normalizedCandidate) return candidatePath;
+	const defaultMount = mounts[0]?.mount;
+	if (!defaultMount) {
+		return normalizedCandidate.startsWith("/")
+			? normalizedCandidate
+			: `/${normalizedCandidate.replace(/^\/+/, "")}`;
+	}
+
+	const mountNames = new Set(mounts.map((entry) => entry.mount));
+	if (normalizedCandidate.startsWith("/")) {
+		const relative = normalizedCandidate.replace(/^\/+/, "");
+		const [firstSegment] = relative.split("/", 1);
+		if (mountNames.has(firstSegment)) {
+			return `/${relative}`;
+		}
+		return `/${defaultMount}/${relative}`;
+	}
+
+	const relative = normalizedCandidate.replace(/^\/+/, "");
+	const [firstSegment] = relative.split("/", 1);
+	if (mountNames.has(firstSegment)) {
+		return `/${relative}`;
+	}
+	return `/${defaultMount}/${relative}`;
+}
 
 function EditorPanelSkeleton() {
 	return (
@@ -54,27 +104,55 @@ function EditorPanelSkeleton() {
 }
 
 export function EditorPanelContent({
+	kind = "document",
 	documentId,
+	localFilePath,
 	searchSpaceId,
 	title,
 	onClose,
 }: {
-	documentId: number;
-	searchSpaceId: number;
+	kind?: "document" | "local_file";
+	documentId?: number;
+	localFilePath?: string;
+	searchSpaceId?: number;
 	title: string | null;
 	onClose?: () => void;
 }) {
+	const electronAPI = useElectronAPI();
 	const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null);
 	const [isLoading, setIsLoading] = useState(true);
 	const [error, setError] = useState<string | null>(null);
 	const [saving, setSaving] = useState(false);
 	const [downloading, setDownloading] = useState(false);
+	const [isEditing, setIsEditing] = useState(false);
 
 	const [editedMarkdown, setEditedMarkdown] = useState<string | null>(null);
+	const [localFileContent, setLocalFileContent] = useState("");
+	const [hasCopied, setHasCopied] = useState(false);
 	const markdownRef = useRef<string>("");
+	const copyResetTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);
 	const initialLoadDone = useRef(false);
 	const changeCountRef = useRef(0);
 	const [displayTitle, setDisplayTitle] = useState(title || "Untitled");
+	const isLocalFileMode = kind === "local_file";
+	const editorRenderMode: EditorRenderMode = isLocalFileMode ? "source_code" : "rich_markdown";
+
+	const resolveLocalVirtualPath = useCallback(
+		async (candidatePath: string): Promise<string> => {
+			if (!electronAPI?.getAgentFilesystemMounts) {
+				return candidatePath;
+			}
+			try {
+				const mounts = (await electronAPI.getAgentFilesystemMounts(
+					searchSpaceId
+				)) as AgentFilesystemMount[];
+				return normalizeLocalVirtualPathForEditor(candidatePath, mounts);
+			} catch {
+				return candidatePath;
+			}
+		},
+		[electronAPI, searchSpaceId]
+	);
 
 	const isLargeDocument = (editorDoc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD;
 
@@ -84,17 +162,52 @@ export function EditorPanelContent({
 		setError(null);
 		setEditorDoc(null);
 		setEditedMarkdown(null);
+		setLocalFileContent("");
+		setHasCopied(false);
+		setIsEditing(false);
 		initialLoadDone.current = false;
 		changeCountRef.current = 0;
 
 		const doFetch = async () => {
-			const token = getBearerToken();
-			if (!token) {
-				redirectToLogin();
-				return;
-			}
-
 			try {
+				if (isLocalFileMode) {
+					if (!localFilePath) {
+						throw new Error("Missing local file path");
+					}
+					if (!electronAPI?.readAgentLocalFileText) {
+						throw new Error("Local file editor is available only in desktop mode.");
+					}
+					const resolvedLocalPath = await resolveLocalVirtualPath(localFilePath);
+					const readResult = await electronAPI.readAgentLocalFileText(
+						resolvedLocalPath,
+						searchSpaceId
+					);
+					if (!readResult.ok) {
+						throw new Error(readResult.error || "Failed to read local file");
+					}
+					const inferredTitle = resolvedLocalPath.split("/").pop() || resolvedLocalPath;
+					const content: EditorContent = {
+						document_id: -1,
+						title: inferredTitle,
+						document_type: "NOTE",
+						source_markdown: readResult.content,
+					};
+					markdownRef.current = content.source_markdown;
+					setLocalFileContent(content.source_markdown);
+					setDisplayTitle(title || inferredTitle);
+					setEditorDoc(content);
+					initialLoadDone.current = true;
+					return;
+				}
+				if (!documentId || !searchSpaceId) {
+					throw new Error("Missing document context");
+				}
+				const token = getBearerToken();
+				if (!token) {
+					redirectToLogin();
+					return;
+				}
+
 				const url = new URL(
 					`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`
 				);
@@ -136,7 +249,23 @@ export function EditorPanelContent({
 
 		doFetch().catch(() => {});
 		return () => controller.abort();
-	}, [documentId, searchSpaceId, title]);
+	}, [
+		documentId,
+		electronAPI,
+		isLocalFileMode,
+		localFilePath,
+		resolveLocalVirtualPath,
+		searchSpaceId,
+		title,
+	]);
+
+	useEffect(() => {
+		return () => {
+			if (copyResetTimeoutRef.current) {
+				clearTimeout(copyResetTimeoutRef.current);
+			}
+		};
+	}, []);
 
 	const handleMarkdownChange = useCallback((md: string) => {
 		markdownRef.current = md;
@@ -146,68 +275,339 @@ export function EditorPanelContent({
 		setEditedMarkdown(md);
 	}, []);
 
-	const handleSave = useCallback(async () => {
-		const token = getBearerToken();
-		if (!token) {
-			toast.error("Please login to save");
-			redirectToLogin();
-			return;
+	const handleCopy = useCallback(async () => {
+		try {
+			const textToCopy = markdownRef.current ?? editorDoc?.source_markdown ?? "";
+			await navigator.clipboard.writeText(textToCopy);
+			setHasCopied(true);
+			if (copyResetTimeoutRef.current) {
+				clearTimeout(copyResetTimeoutRef.current);
+			}
+			copyResetTimeoutRef.current = setTimeout(() => {
+				setHasCopied(false);
+			}, 1400);
+		} catch (err) {
+			console.error("Error copying content:", err);
 		}
+	}, [editorDoc?.source_markdown]);
 
-		setSaving(true);
-		try {
-			const response = await authenticatedFetch(
-				`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/save`,
-				{
-					method: "POST",
-					headers: { "Content-Type": "application/json" },
-					body: JSON.stringify({ source_markdown: markdownRef.current }),
+	const handleSave = useCallback(
+		async (options?: { silent?: boolean }) => {
+			setSaving(true);
+			try {
+				if (isLocalFileMode) {
+					if (!localFilePath) {
+						throw new Error("Missing local file path");
+					}
+					if (!electronAPI?.writeAgentLocalFileText) {
+						throw new Error("Local file editor is available only in desktop mode.");
+					}
+					const resolvedLocalPath = await resolveLocalVirtualPath(localFilePath);
+					const contentToSave = markdownRef.current;
+					const writeResult = await electronAPI.writeAgentLocalFileText(
+						resolvedLocalPath,
+						contentToSave,
+						searchSpaceId
+					);
+					if (!writeResult.ok) {
+						throw new Error(writeResult.error || "Failed to save local file");
+					}
+					setEditorDoc((prev) => (prev ? { ...prev, source_markdown: contentToSave } : prev));
+					setEditedMarkdown(markdownRef.current === contentToSave ? null : markdownRef.current);
+					return true;
 				}
-			);
+				if (!searchSpaceId || !documentId) {
+					throw new Error("Missing document context");
+				}
+				const token = getBearerToken();
+				if (!token) {
+					toast.error("Please login to save");
+					redirectToLogin();
+					return;
+				}
+				const response = await authenticatedFetch(
+					`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/save`,
+					{
+						method: "POST",
+						headers: { "Content-Type": "application/json" },
+						body: JSON.stringify({ source_markdown: markdownRef.current }),
+					}
+				);
 
-			if (!response.ok) {
-				const errorData = await response
-					.json()
-					.catch(() => ({ detail: "Failed to save document" }));
-				throw new Error(errorData.detail || "Failed to save document");
-			}
+				if (!response.ok) {
+					const errorData = await response
+						.json()
+						.catch(() => ({ detail: "Failed to save document" }));
+					throw new Error(errorData.detail || "Failed to save document");
+				}
 
-			setEditorDoc((prev) => (prev ? { ...prev, source_markdown: markdownRef.current } : prev));
-			setEditedMarkdown(null);
-			toast.success("Document saved! Reindexing in background...");
-		} catch (err) {
-			console.error("Error saving document:", err);
-			toast.error(err instanceof Error ? err.message : "Failed to save document");
-		} finally {
-			setSaving(false);
-		}
-	}, [documentId, searchSpaceId]);
+				setEditorDoc((prev) => (prev ? { ...prev, source_markdown: markdownRef.current } : prev));
+				setEditedMarkdown(null);
+				if (!options?.silent) {
+					toast.success("Document saved! Reindexing in background...");
+				}
+				return true;
+			} catch (err) {
+				console.error("Error saving document:", err);
+				if (!options?.silent) {
+					toast.error(err instanceof Error ? err.message : "Failed to save document");
+				}
+				return false;
+			} finally {
+				setSaving(false);
+			}
+		},
+		[
+			documentId,
+			electronAPI,
+			isLocalFileMode,
+			localFilePath,
+			resolveLocalVirtualPath,
+			searchSpaceId,
+		]
+	);
 
 	const isEditableType = editorDoc
-		? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") && !isLargeDocument
+		? (editorRenderMode === "source_code" ||
+				EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "")) &&
+			!isLargeDocument
 		: false;
+	// Render through PlateEditor for editable doc types (FILE/NOTE).
+	// Everything else (large docs, non-editable types) falls back to the
+	// lightweight `MarkdownViewer` — Plate is heavy on multi-MB docs and
+	// non-editable types don't benefit from its editing UX.
+	const renderInPlateEditor = isEditableType;
+	const hasUnsavedChanges = editedMarkdown !== null;
+	const showDesktopHeader = !!onClose;
+	const showEditingActions = isEditableType && isEditing;
+	const localFileLanguage = inferMonacoLanguageFromPath(localFilePath);
+
+	const handleCancelEditing = useCallback(() => {
+		const savedContent = editorDoc?.source_markdown ?? "";
+		markdownRef.current = savedContent;
+		setLocalFileContent(savedContent);
+		setEditedMarkdown(null);
+		changeCountRef.current = 0;
+		setIsEditing(false);
+	}, [editorDoc?.source_markdown]);
+
+	const handleDownloadMarkdown = useCallback(async () => {
+		if (!searchSpaceId || !documentId) return;
+		setDownloading(true);
+		try {
+			const response = await authenticatedFetch(
+				`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
+				{ method: "GET" }
+			);
+			if (!response.ok) throw new Error("Download failed");
+			const blob = await response.blob();
+			const url = URL.createObjectURL(blob);
+			const a = document.createElement("a");
+			a.href = url;
+			const disposition = response.headers.get("content-disposition");
+			const match = disposition?.match(/filename="(.+)"/);
+			a.download = match?.[1] ?? `${editorDoc?.title || "document"}.md`;
+			document.body.appendChild(a);
+			a.click();
+			a.remove();
+			URL.revokeObjectURL(url);
+			toast.success("Download started");
+		} catch {
+			toast.error("Failed to download document");
+		} finally {
+			setDownloading(false);
+		}
+	}, [documentId, editorDoc?.title, searchSpaceId]);
+
+	const largeDocAlert = isLargeDocument && !isLocalFileMode && editorDoc && (
+		<Alert className="mb-4">
+			<FileText className="size-4" />
+			<AlertDescription className="flex items-center justify-between gap-4">
+				<span>
+					This document is too large for the editor (
+					{Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB,{" "}
+					{editorDoc.chunk_count ?? 0} chunks). Showing a preview below.
+				</span>
+				<Button
+					variant="outline"
+					size="sm"
+					className="relative shrink-0"
+					disabled={downloading}
+					onClick={handleDownloadMarkdown}
+				>
+					<span className={`flex items-center gap-1.5 ${downloading ? "opacity-0" : ""}`}>
+						<Download className="size-3.5" />
+						Download .md
+					</span>
+					{downloading && <Spinner size="sm" className="absolute" />}
+				</Button>
+			</AlertDescription>
+		</Alert>
+	);
 
 	return (
 		<>
-			<div className="flex items-center justify-between px-4 py-2 shrink-0 border-b">
-				<div className="flex-1 min-w-0">
-					<h2 className="text-sm font-semibold truncate">{displayTitle}</h2>
-					{isEditableType && editedMarkdown !== null && (
-						<p className="text-[10px] text-muted-foreground">Unsaved changes</p>
-					)}
+			{showDesktopHeader ? (
+				<div className="shrink-0">
+					<div className="shrink-0 flex h-12 items-center justify-between px-3 border-b">
+						<h2 className="select-none text-lg font-semibold">File</h2>
+						<div className="flex items-center gap-1 shrink-0">
+							<Button
+								variant="ghost"
+								size="icon"
+								onClick={onClose}
+								className="h-8 w-8 rounded-full shrink-0 text-muted-foreground hover:text-accent-foreground"
+							>
+								<XIcon className="h-4 w-4" />
+								<span className="sr-only">Close editor panel</span>
+							</Button>
+						</div>
+					</div>
+					<div className="grid h-10 grid-cols-[minmax(0,1fr)_auto] items-center gap-3 border-b px-4">
+						<div className="min-w-0 flex flex-1 items-center gap-2">
+							<p className="truncate text-sm text-muted-foreground">{displayTitle}</p>
+						</div>
+						<div className="flex items-center gap-1 shrink-0">
+							{showEditingActions ? (
+								<>
+									<Button
+										variant="ghost"
+										size="sm"
+										className="h-6 px-2 text-xs"
+										onClick={handleCancelEditing}
+										disabled={saving}
+									>
+										Cancel
+									</Button>
+									<Button
+										variant="secondary"
+										size="sm"
+										className="relative h-6 w-[56px] px-0 text-xs"
+										onClick={async () => {
+											const saveSucceeded = await handleSave({ silent: true });
+											if (saveSucceeded) setIsEditing(false);
+										}}
+										disabled={saving || !hasUnsavedChanges}
+									>
+										<span className={saving ? "opacity-0" : ""}>Save</span>
+										{saving && <Spinner size="xs" className="absolute" />}
+									</Button>
+								</>
+							) : (
+								<>
+									{!isLocalFileMode && editorDoc?.document_type && documentId && (
+										<VersionHistoryButton
+											documentId={documentId}
+											documentType={editorDoc.document_type}
+										/>
+									)}
+									<Button
+										variant="ghost"
+										size="icon"
+										className="size-6"
+										onClick={() => {
+											void handleCopy();
+										}}
+										disabled={isLoading || !editorDoc}
+									>
+										{hasCopied ? <Check className="size-3.5" /> : <Copy className="size-3.5" />}
+										<span className="sr-only">
+											{hasCopied ? "Copied file contents" : "Copy file contents"}
+										</span>
+									</Button>
+									{isEditableType && (
+										<Button
+											variant="ghost"
+											size="icon"
+											className="size-6"
+											onClick={() => {
+												changeCountRef.current = 0;
+												setEditedMarkdown(null);
+												setIsEditing(true);
+											}}
+										>
+											<Pencil className="size-3.5" />
+											<span className="sr-only">Edit document</span>
+										</Button>
+									)}
+								</>
+							)}
+						</div>
+					</div>
 				</div>
-				<div className="flex items-center gap-1 shrink-0">
-					{editorDoc?.document_type && (
-						<VersionHistoryButton documentId={documentId} documentType={editorDoc.document_type} />
-					)}
-					{onClose && (
-						<Button variant="ghost" size="icon" onClick={onClose} className="size-7 shrink-0">
-							<XIcon className="size-4" />
-							<span className="sr-only">Close editor panel</span>
-						</Button>
-					)}
+			) : (
+				<div className="flex h-14 items-center justify-between border-b px-4 shrink-0">
+					<div className="flex flex-1 min-w-0 items-center gap-2">
+						<h2 className="text-sm font-semibold truncate">{displayTitle}</h2>
+					</div>
+					<div className="flex items-center gap-1 shrink-0">
+						{showEditingActions ? (
+							<>
+								<Button
+									variant="ghost"
+									size="sm"
+									className="h-6 px-2 text-xs"
+									onClick={handleCancelEditing}
+									disabled={saving}
+								>
+									Cancel
+								</Button>
+								<Button
+									variant="secondary"
+									size="sm"
+									className="relative h-6 w-[56px] px-0 text-xs"
+									onClick={async () => {
+										const saveSucceeded = await handleSave({ silent: true });
+										if (saveSucceeded) setIsEditing(false);
+									}}
+									disabled={saving || !hasUnsavedChanges}
+								>
+									<span className={saving ? "opacity-0" : ""}>Save</span>
+									{saving && <Spinner size="xs" className="absolute" />}
+								</Button>
+							</>
+						) : (
+							<>
+								{!isLocalFileMode && editorDoc?.document_type && documentId && (
+									<VersionHistoryButton
+										documentId={documentId}
+										documentType={editorDoc.document_type}
+									/>
+								)}
+								<Button
+									variant="ghost"
+									size="icon"
+									className="size-6"
+									onClick={() => {
+										void handleCopy();
+									}}
+									disabled={isLoading || !editorDoc}
+								>
+									{hasCopied ? <Check className="size-3.5" /> : <Copy className="size-3.5" />}
+									<span className="sr-only">
+										{hasCopied ? "Copied file contents" : "Copy file contents"}
+									</span>
+								</Button>
+								{isEditableType && (
+									<Button
+										variant="ghost"
+										size="icon"
+										className="size-6"
+										onClick={() => {
+											changeCountRef.current = 0;
+											setEditedMarkdown(null);
+											setIsEditing(true);
+										}}
+									>
+										<Pencil className="size-3.5" />
+										<span className="sr-only">Edit document</span>
+									</Button>
+								)}
+							</>
+						)}
+					</div>
 				</div>
-			</div>
+			)}
 
 			<div className="flex-1 overflow-hidden">
 				{isLoading ? (
@@ -234,77 +634,58 @@ export function EditorPanelContent({
 							</p>
 						</div>
 					</div>
-				) : isLargeDocument ? (
+				) : editorRenderMode === "source_code" ? (
+					<div className="h-full overflow-hidden">
+						<SourceCodeEditor
+							path={localFilePath ?? "local-file.txt"}
+							language={localFileLanguage}
+							value={localFileContent}
+							onSave={() => {
+								void handleSave({ silent: true });
+							}}
+							readOnly={!isEditing}
+							onChange={(next) => {
+								markdownRef.current = next;
+								setLocalFileContent(next);
+								if (!initialLoadDone.current) return;
+								setEditedMarkdown(next === (editorDoc?.source_markdown ?? "") ? null : next);
+							}}
+						/>
+					</div>
+				) : isLargeDocument && !isLocalFileMode ? (
+					// Large doc — fast Streamdown preview + download CTA.
+					// Plate is heavy on multi-MB docs.
 					<div className="h-full overflow-y-auto px-5 py-4">
-						<Alert className="mb-4">
-							<FileText className="size-4" />
-							<AlertDescription className="flex items-center justify-between gap-4">
-								<span>
-									This document is too large for the editor (
-									{Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB,{" "}
-									{editorDoc.chunk_count ?? 0} chunks). Showing a preview below.
-								</span>
-								<Button
-									variant="outline"
-									size="sm"
-									className="shrink-0 gap-1.5"
-									disabled={downloading}
-									onClick={async () => {
-										setDownloading(true);
-										try {
-											const response = await authenticatedFetch(
-												`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
-												{ method: "GET" }
-											);
-											if (!response.ok) throw new Error("Download failed");
-											const blob = await response.blob();
-											const url = URL.createObjectURL(blob);
-											const a = document.createElement("a");
-											a.href = url;
-											const disposition = response.headers.get("content-disposition");
-											const match = disposition?.match(/filename="(.+)"/);
-											a.download = match?.[1] ?? `${editorDoc.title || "document"}.md`;
-											document.body.appendChild(a);
-											a.click();
-											a.remove();
-											URL.revokeObjectURL(url);
-											toast.success("Download started");
-										} catch {
-											toast.error("Failed to download document");
-										} finally {
-											setDownloading(false);
-										}
-									}}
-								>
-									{downloading ? (
-										<Loader2 className="size-3.5 animate-spin" />
-									) : (
-										<Download className="size-3.5" />
-									)}
-									{downloading ? "Preparing..." : "Download .md"}
-								</Button>
-							</AlertDescription>
-						</Alert>
-						<MarkdownViewer content={editorDoc.source_markdown} />
+						{largeDocAlert}
+						<MarkdownViewer content={editorDoc.source_markdown} enableCitations />
+					</div>
+				) : renderInPlateEditor ? (
+					// Editable doc (FILE/NOTE) — Plate editing UX.
+					<div className="flex h-full min-h-0 flex-col">
+						<div className="flex-1 min-h-0 overflow-hidden">
+							<PlateEditor
+								key={`${isLocalFileMode ? (localFilePath ?? "local-file") : documentId}-${isEditing ? "editing" : "viewing"}`}
+								preset="full"
+								markdown={editorDoc.source_markdown}
+								onMarkdownChange={handleMarkdownChange}
+								readOnly={!isEditing}
+								placeholder="Start writing..."
+								editorVariant="default"
+								allowModeToggle={false}
+								reserveToolbarSpace={isEditing}
+								defaultEditing={isEditing}
+								className="**:[[role=toolbar]]:bg-sidebar!"
+								// Render `[citation:N]` badges in view mode only.
+								// Edit mode keeps raw text so the user can edit/delete
+								// tokens directly. `local_file` never reaches this branch
+								// (handled by the source_code editor above).
+								enableCitations={!isEditing && !isLocalFileMode}
+							/>
+						</div>
 					</div>
-				) : isEditableType ? (
-					<PlateEditor
-						key={documentId}
-						preset="full"
-						markdown={editorDoc.source_markdown}
-						onMarkdownChange={handleMarkdownChange}
-						readOnly={false}
-						placeholder="Start writing..."
-						editorVariant="default"
-						onSave={handleSave}
-						hasUnsavedChanges={editedMarkdown !== null}
-						isSaving={saving}
-						defaultEditing={true}
-						className="[&_[role=toolbar]]:!bg-sidebar"
-					/>
 				) : (
 					<div className="h-full overflow-y-auto px-5 py-4">
-						<MarkdownViewer content={editorDoc.source_markdown} />
+						<MarkdownViewer content={editorDoc.source_markdown} enableCitations />
 					</div>
 				)}
 			</div>
@@ -324,13 +705,19 @@ function DesktopEditorPanel() {
 		return () => document.removeEventListener("keydown", handleKeyDown);
 	}, [closePanel]);
 
-	if (!panelState.isOpen || !panelState.documentId || !panelState.searchSpaceId) return null;
+	const hasTarget =
+		panelState.kind === "document"
+			? !!panelState.documentId && !!panelState.searchSpaceId
+			: !!panelState.localFilePath;
+	if (!panelState.isOpen || !hasTarget) return null;
 
 	return (
 		<div className="flex w-[50%] max-w-[700px] min-w-[380px] flex-col border-l bg-sidebar text-sidebar-foreground animate-in slide-in-from-right-4 duration-300 ease-out">
 			<EditorPanelContent
-				documentId={panelState.documentId}
-				searchSpaceId={panelState.searchSpaceId}
+				kind={panelState.kind}
+				documentId={panelState.documentId ?? undefined}
+				localFilePath={panelState.localFilePath ?? undefined}
+				searchSpaceId={panelState.searchSpaceId ?? undefined}
 				title={panelState.title}
 				onClose={closePanel}
 			/>
@@ -342,7 +729,13 @@ function MobileEditorDrawer() {
 	const panelState = useAtomValue(editorPanelAtom);
 	const closePanel = useSetAtom(closeEditorPanelAtom);
 
-	if (!panelState.documentId || !panelState.searchSpaceId) return null;
+	if (panelState.kind === "local_file") return null;
+
+	const hasTarget =
+		panelState.kind === "document"
+			? !!panelState.documentId && !!panelState.searchSpaceId
+			: !!panelState.localFilePath;
+	if (!hasTarget) return null;
 
 	return (
 		<Drawer
@@ -360,8 +753,10 @@ function MobileEditorDrawer() {
 				<DrawerTitle className="sr-only">{panelState.title || "Editor"}</DrawerTitle>
 				<div className="min-h-0 flex-1 flex flex-col overflow-hidden">
 					<EditorPanelContent
-						documentId={panelState.documentId}
-						searchSpaceId={panelState.searchSpaceId}
+						kind={panelState.kind}
+						documentId={panelState.documentId ?? undefined}
+						localFilePath={panelState.localFilePath ?? undefined}
+						searchSpaceId={panelState.searchSpaceId ?? undefined}
 						title={panelState.title}
 					/>
 				</div>
@@ -373,8 +768,13 @@ function MobileEditorDrawer() {
 export function EditorPanel() {
 	const panelState = useAtomValue(editorPanelAtom);
 	const isDesktop = useMediaQuery("(min-width: 1024px)");
+	const hasTarget =
+		panelState.kind === "document"
+			? !!panelState.documentId && !!panelState.searchSpaceId
+			: !!panelState.localFilePath;
 
-	if (!panelState.isOpen || !panelState.documentId) return null;
+	if (!panelState.isOpen || !hasTarget) return null;
+	if (!isDesktop && panelState.kind === "local_file") return null;
 
 	if (isDesktop) {
 		return <DesktopEditorPanel />;
@@ -386,8 +786,13 @@ export function EditorPanel() {
 export function MobileEditorPanel() {
 	const panelState = useAtomValue(editorPanelAtom);
 	const isDesktop = useMediaQuery("(min-width: 1024px)");
+	const hasTarget =
+		panelState.kind === "document"
+			? !!panelState.documentId && !!panelState.searchSpaceId
+			: !!panelState.localFilePath;
 
-	if (isDesktop || !panelState.isOpen || !panelState.documentId) return null;
+	if (isDesktop || !panelState.isOpen || !hasTarget || panelState.kind === "local_file")
+		return null;
 
 	return <MobileEditorDrawer />;
 }
diff --git a/surfsense_web/components/editor/editor-save-context.tsx b/surfsense_web/components/editor/editor-save-context.tsx
index d53a4adce5..b4b3935a41 100644
--- a/surfsense_web/components/editor/editor-save-context.tsx
+++ b/surfsense_web/components/editor/editor-save-context.tsx
@@ -11,12 +11,15 @@ interface EditorSaveContextValue {
 	isSaving: boolean;
 	/** Whether the user can toggle between editing and viewing modes */
 	canToggleMode: boolean;
+	/** Whether fixed-toolbar space should be reserved even when controls are hidden */
+	reserveToolbarSpace: boolean;
 }
 
 export const EditorSaveContext = createContext<EditorSaveContextValue>({
 	hasUnsavedChanges: false,
 	isSaving: false,
 	canToggleMode: false,
+	reserveToolbarSpace: false,
 });
 
 export function useEditorSave() {
diff --git a/surfsense_web/components/editor/plate-editor.tsx b/surfsense_web/components/editor/plate-editor.tsx
index 66e9a0e5e7..348d1006f8 100644
--- a/surfsense_web/components/editor/plate-editor.tsx
+++ b/surfsense_web/components/editor/plate-editor.tsx
@@ -3,14 +3,20 @@
 import { MarkdownPlugin, remarkMdx } from "@platejs/markdown";
 import { slateToHtml } from "@slate-serializers/html";
 import type { AnyPluginConfig, Descendant, Value } from "platejs";
-import { createPlatePlugin, Key, Plate, usePlateEditor } from "platejs/react";
+import { createPlatePlugin, Key, Plate, useEditorReadOnly, usePlateEditor } from "platejs/react";
 import { useEffect, useMemo, useRef } from "react";
 import remarkGfm from "remark-gfm";
 import remarkMath from "remark-math";
 import { EditorSaveContext } from "@/components/editor/editor-save-context";
+import { CitationKit, injectCitationNodes } from "@/components/editor/plugins/citation-kit";
 import { type EditorPreset, presetMap } from "@/components/editor/presets";
 import { escapeMdxExpressions } from "@/components/editor/utils/escape-mdx";
+import { safeDeserializeMarkdown } from "@/components/editor/utils/safe-deserialize";
 import { Editor, EditorContainer } from "@/components/ui/editor";
+import { preprocessCitationMarkdown } from "@/lib/citations/citation-parser";
+
+/** Live editor instance returned by `usePlateEditor`. */
+export type PlateEditorInstance = ReturnType<typeof usePlateEditor>;
 
 export interface PlateEditorProps {
 	/** Markdown string to load as initial content */
@@ -36,12 +42,16 @@ export interface PlateEditorProps {
 	editorVariant?: "default" | "demo" | "fullWidth" | "none";
 	/** Additional className for the container */
 	className?: string;
-	/** Save callback. When provided, ⌘+S / Ctrl+S shortcut is registered and save button appears. */
+	/** Save callback. When provided, ⌘+Shift+S / Ctrl+Shift+S shortcut is registered (avoiding the browser's ⌘+S / Ctrl+S "Save Page As" conflict) and a save button appears in the toolbar. */
 	onSave?: () => void;
 	/** Whether there are unsaved changes */
 	hasUnsavedChanges?: boolean;
 	/** Whether a save is in progress */
 	isSaving?: boolean;
+	/** Whether edit/view mode toggle UI should be available in toolbars. */
+	allowModeToggle?: boolean;
+	/** Reserve fixed-toolbar vertical space even when controls are hidden. */
+	reserveToolbarSpace?: boolean;
 	/** Start the editor in editing mode instead of viewing mode. Ignored when readOnly is true. */
 	defaultEditing?: boolean;
 	/**
@@ -58,6 +68,32 @@ export interface PlateEditorProps {
 	 * without modifying the core editor component.
 	 */
 	extraPlugins?: AnyPluginConfig[];
+	/**
+	 * Render `[citation:N]` and `[citation:URL]` tokens in the deserialized
+	 * markdown as interactive citation badges/popovers (mirrors chat). Only
+	 * meant for read-only views — when true, `onMarkdownChange` is suppressed
+	 * because the in-memory tree contains custom inline-void elements that
+	 * have no markdown serialize rule.
+	 */
+	enableCitations?: boolean;
+}
+
+function PlateEditorContent({
+	editorVariant,
+	placeholder,
+}: {
+	editorVariant: PlateEditorProps["editorVariant"];
+	placeholder?: string;
+}) {
+	const isReadOnly = useEditorReadOnly();
+
+	return (
+		<Editor
+			variant={editorVariant}
+			placeholder={isReadOnly ? undefined : placeholder}
+			className="min-h-full"
+		/>
+	);
 }
 
 export function PlateEditor({
@@ -73,9 +109,12 @@ export function PlateEditor({
 	onSave,
 	hasUnsavedChanges = false,
 	isSaving = false,
+	allowModeToggle = true,
+	reserveToolbarSpace = false,
 	defaultEditing = false,
 	preset = "full",
 	extraPlugins = [],
+	enableCitations = false,
 }: PlateEditorProps) {
 	const lastMarkdownRef = useRef(markdown);
 	const lastHtmlRef = useRef(html);
@@ -118,6 +157,8 @@ export function PlateEditor({
 			...(onSave ? [SaveShortcutPlugin] : []),
 			// Consumer-provided extra plugins
 			...extraPlugins,
+			// Citation void inline element (read-only document viewer).
+			...(enableCitations ? CitationKit : []),
 			MarkdownPlugin.configure({
 				options: {
 					remarkPlugins: [remarkGfm, remarkMath, remarkMdx],
@@ -127,8 +168,14 @@ export function PlateEditor({
 		value: html
 			? (editor) => editor.api.html.deserialize({ element: html }) as Value
 			: markdown
-				? (editor) =>
-						editor.getApi(MarkdownPlugin).markdown.deserialize(escapeMdxExpressions(markdown))
+				? (editor) => {
+						if (!enableCitations) {
+							return safeDeserializeMarkdown(editor, escapeMdxExpressions(markdown)) as Value;
+						}
+						const { content: rewritten, urlMap } = preprocessCitationMarkdown(markdown);
+						const value = safeDeserializeMarkdown(editor, escapeMdxExpressions(rewritten));
+						return injectCitationNodes(value, urlMap) as Value;
+					}
 				: undefined,
 	});
 
@@ -147,16 +194,21 @@ export function PlateEditor({
 	useEffect(() => {
 		if (!html && markdown !== undefined && markdown !== lastMarkdownRef.current) {
 			lastMarkdownRef.current = markdown;
-			const newValue = editor
-				.getApi(MarkdownPlugin)
-				.markdown.deserialize(escapeMdxExpressions(markdown));
+			let newValue: Descendant[];
+			if (enableCitations) {
+				const { content: rewritten, urlMap } = preprocessCitationMarkdown(markdown);
+				const deserialized = safeDeserializeMarkdown(editor, escapeMdxExpressions(rewritten));
+				newValue = injectCitationNodes(deserialized, urlMap);
+			} else {
+				newValue = safeDeserializeMarkdown(editor, escapeMdxExpressions(markdown));
+			}
 			editor.tf.reset();
-			editor.tf.setValue(newValue);
+			editor.tf.setValue(newValue as Value);
 		}
-	}, [html, markdown, editor]);
+	}, [html, markdown, editor, enableCitations]);
 
 	// When not forced read-only, the user can toggle between editing/viewing.
-	const canToggleMode = !readOnly;
+	const canToggleMode = !readOnly && allowModeToggle;
 
 	const contextProviderValue = useMemo(
 		() => ({
@@ -164,8 +216,9 @@ export function PlateEditor({
 			hasUnsavedChanges,
 			isSaving,
 			canToggleMode,
+			reserveToolbarSpace,
 		}),
-		[onSave, hasUnsavedChanges, isSaving, canToggleMode]
+		[onSave, hasUnsavedChanges, isSaving, canToggleMode, reserveToolbarSpace]
 	);
 
 	return (
@@ -177,6 +230,16 @@ export function PlateEditor({
 				// (initialized to true via usePlateEditor, toggled via ModeToolbarButton).
 				{...(readOnly ? { readOnly: true } : {})}
 				onChange={({ value }) => {
+					// View-only citation mode: skip serialization. The custom
+					// `citation` inline-void element has no markdown serialize
+					// rule, so emitting changes here would overwrite
+					// `lastMarkdownRef.current` (and downstream copy-to-clipboard
+					// state in EditorPanelContent) with a tree that loses every
+					// citation token. `enableCitations` is only ever set in
+					// read-only paths, so user input cannot reach this branch
+					// in practice — the guard exists for the initial Plate
+					// normalize emit.
+					if (enableCitations) return;
 					if (onHtmlChange && html) {
 						const serialized = slateToHtml(value as Descendant[]);
 						onHtmlChange(serialized);
@@ -188,7 +251,7 @@ export function PlateEditor({
 				}}
 			>
 				<EditorContainer variant={variant} className={className}>
-					<Editor variant={editorVariant} placeholder={placeholder} />
+					<PlateEditorContent editorVariant={editorVariant} placeholder={placeholder} />
 				</EditorContainer>
 			</Plate>
 		</EditorSaveContext.Provider>
diff --git a/surfsense_web/components/editor/plugins/citation-kit.tsx b/surfsense_web/components/editor/plugins/citation-kit.tsx
new file mode 100644
index 0000000000..97e8ec7230
--- /dev/null
+++ b/surfsense_web/components/editor/plugins/citation-kit.tsx
@@ -0,0 +1,199 @@
+"use client";
+
+import { type Descendant, KEYS } from "platejs";
+import { createPlatePlugin, type PlateElementProps } from "platejs/react";
+import type { FC } from "react";
+import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
+import {
+	CITATION_REGEX,
+	type CitationUrlMap,
+	parseTextWithCitations,
+} from "@/lib/citations/citation-parser";
+
+/**
+ * Plate inline-void node for one `[citation:...]` reference.
+ * Inline voids keep the citation chip atomic while preserving caret behavior
+ * around the surrounding text.
+ */
+export type CitationElementNode = {
+	type: "citation";
+	kind: "chunk" | "doc" | "url";
+	chunkId?: number;
+	url?: string;
+	/** Original literal token that produced this citation node. */
+	rawText: string;
+	children: [{ text: "" }];
+};
+
+const CITATION_TYPE = "citation";
+
+const CitationElement: FC<PlateElementProps<CitationElementNode>> = ({
+	attributes,
+	children,
+	element,
+}) => {
+	const isUrl = element.kind === "url";
+	return (
+		<span {...attributes} className="inline-flex align-baseline">
+			<span contentEditable={false}>
+				{isUrl && element.url ? (
+					<UrlCitation url={element.url} />
+				) : element.chunkId !== undefined ? (
+					<InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} />
+				) : null}
+			</span>
+			{children}
+		</span>
+	);
+};
+
+const CitationPlugin = createPlatePlugin({
+	key: CITATION_TYPE,
+	node: {
+		isElement: true,
+		isInline: true,
+		isVoid: true,
+		type: CITATION_TYPE,
+		component: CitationElement,
+	},
+});
+
+export const CitationKit = [CitationPlugin];
+
+// ---------------------------------------------------------------------------
+// Slate value transform
+// ---------------------------------------------------------------------------
+
+// Local structural shapes keep the recursive walker readable without forcing
+// Plate's broad Descendant union into narrower generic predicates.
+type SlateText = { text: string } & Record<string, unknown>;
+type SlateElement = { type?: string; children: Descendant[] } & Record<string, unknown>;
+
+function isText(node: Descendant): boolean {
+	return typeof (node as { text?: unknown }).text === "string";
+}
+
+function asText(node: Descendant): SlateText {
+	return node as unknown as SlateText;
+}
+
+function asElement(node: Descendant): SlateElement {
+	return node as unknown as SlateElement;
+}
+
+/**
+ * Subtrees that should keep citation tokens as text:
+ * - Code nodes preserve source text and reject inline void children.
+ * - Link nodes already render as anchors; citation chips are interactive
+ *   shadcn Button-based controls, so injecting them would nest interactions.
+ */
+const SKIP_SUBTREE_TYPES = new Set<string>([KEYS.codeBlock, "code_line", KEYS.link]);
+
+/**
+ * Preserve text marks such as bold and italic when splitting around citations.
+ */
+function copyMarks(textNode: SlateText): Record<string, unknown> {
+	const { text: _text, ...marks } = textNode;
+	return marks;
+}
+
+function makeCitationElement(
+	rawText: string,
+	segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
+): CitationElementNode {
+	if (segment.kind === "url") {
+		return {
+			type: CITATION_TYPE,
+			kind: "url",
+			url: segment.url,
+			rawText,
+			children: [{ text: "" }],
+		};
+	}
+	return {
+		type: CITATION_TYPE,
+		kind: segment.isDocsChunk ? "doc" : "chunk",
+		chunkId: segment.chunkId,
+		rawText,
+		children: [{ text: "" }],
+	};
+}
+
+/**
+ * Keep each original citation token on the generated node for diagnostics.
+ */
+function extractRawCitationMatches(text: string): string[] {
+	const matches: string[] = [];
+	CITATION_REGEX.lastIndex = 0;
+	let m: RegExpExecArray | null = CITATION_REGEX.exec(text);
+	while (m !== null) {
+		matches.push(m[0]);
+		m = CITATION_REGEX.exec(text);
+	}
+	return matches;
+}
+
+function transformTextNode(node: SlateText, urlMap: CitationUrlMap): Descendant[] {
+	const segments = parseTextWithCitations(node.text, urlMap);
+	if (segments.length === 1 && typeof segments[0] === "string") {
+		return [node as unknown as Descendant];
+	}
+
+	const marks = copyMarks(node);
+	const rawMatches = extractRawCitationMatches(node.text);
+	const out: Descendant[] = [];
+	let citationIdx = 0;
+	let pendingText: string | null = null;
+
+	const flushText = () => {
+		// Inline voids need text siblings, even at text boundaries.
+		out.push({ ...marks, text: pendingText ?? "" } as unknown as Descendant);
+		pendingText = null;
+	};
+
+	for (const segment of segments) {
+		if (typeof segment === "string") {
+			pendingText = (pendingText ?? "") + segment;
+		} else {
+			flushText();
+			const raw = rawMatches[citationIdx] ?? "";
+			out.push(makeCitationElement(raw, segment) as unknown as Descendant);
+			citationIdx += 1;
+			// Ensure a trailing text sibling if the citation ends the node.
+			pendingText = "";
+		}
+	}
+	flushText();
+
+	return out;
+}
+
+function transformChildren(children: Descendant[], urlMap: CitationUrlMap): Descendant[] {
+	const out: Descendant[] = [];
+	for (const child of children) {
+		if (isText(child)) {
+			out.push(...transformTextNode(asText(child), urlMap));
+			continue;
+		}
+		const elementChild = asElement(child);
+		const elementType = (elementChild.type ?? "") as string;
+		if (elementType && SKIP_SUBTREE_TYPES.has(elementType)) {
+			out.push(child);
+			continue;
+		}
+		out.push({
+			...elementChild,
+			children: transformChildren(elementChild.children, urlMap),
+		} as unknown as Descendant);
+	}
+	return out;
+}
+
+/**
+ * Replace citation tokens in a deserialized Slate tree with citation inline
+ * void nodes. URL placeholders from `preprocessCitationMarkdown` are resolved
+ * through `urlMap`; skipped subtrees are returned unchanged.
+ */
+export function injectCitationNodes(value: Descendant[], urlMap: CitationUrlMap): Descendant[] {
+	return transformChildren(value, urlMap);
+}
diff --git a/surfsense_web/components/editor/plugins/fixed-toolbar-kit.tsx b/surfsense_web/components/editor/plugins/fixed-toolbar-kit.tsx
index 85e0a08f28..346fe0378c 100644
--- a/surfsense_web/components/editor/plugins/fixed-toolbar-kit.tsx
+++ b/surfsense_web/components/editor/plugins/fixed-toolbar-kit.tsx
@@ -1,19 +1,39 @@
 "use client";
 
-import { createPlatePlugin } from "platejs/react";
+import { createPlatePlugin, useEditorReadOnly } from "platejs/react";
 
+import { useEditorSave } from "@/components/editor/editor-save-context";
 import { FixedToolbar } from "@/components/ui/fixed-toolbar";
 import { FixedToolbarButtons } from "@/components/ui/fixed-toolbar-buttons";
 
+function ConditionalFixedToolbar() {
+	const readOnly = useEditorReadOnly();
+	const { onSave, hasUnsavedChanges, canToggleMode, reserveToolbarSpace } = useEditorSave();
+
+	const hasVisibleControls =
+		!readOnly || canToggleMode || (!!onSave && hasUnsavedChanges && !readOnly);
+
+	if (!hasVisibleControls) {
+		if (!reserveToolbarSpace) return null;
+		return (
+			<FixedToolbar className="pointer-events-none opacity-0">
+				<div className="h-8 w-full" />
+			</FixedToolbar>
+		);
+	}
+
+	return (
+		<FixedToolbar>
+			<FixedToolbarButtons />
+		</FixedToolbar>
+	);
+}
+
 export const FixedToolbarKit = [
 	createPlatePlugin({
 		key: "fixed-toolbar",
 		render: {
-			beforeEditable: () => (
-				<FixedToolbar>
-					<FixedToolbarButtons />
-				</FixedToolbar>
-			),
+			beforeEditable: () => <ConditionalFixedToolbar />,
 		},
 	}),
 ];
diff --git a/surfsense_web/components/editor/presets.ts b/surfsense_web/components/editor/presets.ts
index 7800f7c7d3..c207b5e569 100644
--- a/surfsense_web/components/editor/presets.ts
+++ b/surfsense_web/components/editor/presets.ts
@@ -1,6 +1,7 @@
 "use client";
 
 import type { AnyPluginConfig } from "platejs";
+import { TrailingBlockPlugin } from "platejs";
 
 import { AutoformatKit } from "@/components/editor/plugins/autoformat-kit";
 import { BasicNodesKit } from "@/components/editor/plugins/basic-nodes-kit";
@@ -36,6 +37,7 @@ export const fullPreset: AnyPluginConfig[] = [
 	...FloatingToolbarKit,
 	...AutoformatKit,
 	...DndKit,
+	TrailingBlockPlugin,
 ];
 
 /**
@@ -48,8 +50,8 @@ export const minimalPreset: AnyPluginConfig[] = [
 	...ListKit,
 	...CodeBlockKit,
 	...LinkKit,
-	...FloatingToolbarKit,
 	...AutoformatKit,
+	TrailingBlockPlugin,
 ];
 
 /**
diff --git a/surfsense_web/components/editor/source-code-editor.tsx b/surfsense_web/components/editor/source-code-editor.tsx
new file mode 100644
index 0000000000..9102dffe9c
--- /dev/null
+++ b/surfsense_web/components/editor/source-code-editor.tsx
@@ -0,0 +1,162 @@
+"use client";
+
+import dynamic from "next/dynamic";
+import { useTheme } from "next-themes";
+import { useEffect, useRef } from "react";
+import { Spinner } from "@/components/ui/spinner";
+
+const MonacoEditor = dynamic(() => import("@monaco-editor/react"), {
+	ssr: false,
+});
+
+interface SourceCodeEditorProps {
+	value: string;
+	onChange: (next: string) => void;
+	path?: string;
+	language?: string;
+	readOnly?: boolean;
+	fontSize?: number;
+	onSave?: () => Promise<void> | void;
+}
+
+export function SourceCodeEditor({
+	value,
+	onChange,
+	path,
+	language = "plaintext",
+	readOnly = false,
+	fontSize = 12,
+	onSave,
+}: SourceCodeEditorProps) {
+	const { resolvedTheme } = useTheme();
+	const onSaveRef = useRef(onSave);
+	const monacoRef = useRef<any>(null);
+	const normalizedModelPath = (() => {
+		const raw = (path || "local-file.txt").trim();
+		const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`;
+		// Monaco model paths should be stable and POSIX-like across platforms.
+		return withLeadingSlash.replace(/\\/g, "/").replace(/\/{2,}/g, "/");
+	})();
+
+	useEffect(() => {
+		onSaveRef.current = onSave;
+	}, [onSave]);
+
+	const resolveCssColorToHex = (cssColorValue: string): string | null => {
+		if (typeof document === "undefined") return null;
+		const probe = document.createElement("div");
+		probe.style.color = cssColorValue;
+		probe.style.position = "absolute";
+		probe.style.pointerEvents = "none";
+		probe.style.opacity = "0";
+		document.body.appendChild(probe);
+		const computedColor = getComputedStyle(probe).color;
+		probe.remove();
+		const match = computedColor.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/i);
+		if (!match) return null;
+		const toHex = (value: string) => Number(value).toString(16).padStart(2, "0");
+		return `#${toHex(match[1])}${toHex(match[2])}${toHex(match[3])}`;
+	};
+
+	const applySidebarTheme = (monaco: any) => {
+		const isDark = resolvedTheme === "dark";
+		const themeName = isDark ? "surfsense-dark" : "surfsense-light";
+		const fallbackBg = isDark ? "#1e1e1e" : "#ffffff";
+		const sidebarBgHex = resolveCssColorToHex("var(--sidebar)") ?? fallbackBg;
+		monaco.editor.defineTheme(themeName, {
+			base: isDark ? "vs-dark" : "vs",
+			inherit: true,
+			rules: [],
+			colors: {
+				"editor.background": sidebarBgHex,
+				"editorGutter.background": sidebarBgHex,
+				"minimap.background": sidebarBgHex,
+				"editorLineNumber.background": sidebarBgHex,
+				"editor.lineHighlightBackground": "#00000000",
+			},
+		});
+		monaco.editor.setTheme(themeName);
+	};
+
+	useEffect(() => {
+		if (!monacoRef.current) return;
+		applySidebarTheme(monacoRef.current);
+	}, [resolvedTheme]);
+
+	const isManualSaveEnabled = !!onSave && !readOnly;
+
+	return (
+		<div className="h-full w-full overflow-hidden bg-sidebar [&_.monaco-editor]:!bg-sidebar [&_.monaco-editor_.margin]:!bg-sidebar [&_.monaco-editor_.monaco-editor-background]:!bg-sidebar [&_.monaco-editor-background]:!bg-sidebar [&_.monaco-scrollable-element_.scrollbar_.slider]:rounded-full [&_.monaco-scrollable-element_.scrollbar_.slider]:bg-foreground/25 [&_.monaco-scrollable-element_.scrollbar_.slider:hover]:bg-foreground/40">
+			<MonacoEditor
+				path={normalizedModelPath}
+				language={language}
+				value={value}
+				theme={resolvedTheme === "dark" ? "surfsense-dark" : "surfsense-light"}
+				onChange={(next) => onChange(next ?? "")}
+				loading={
+					<div className="flex h-full w-full items-center justify-center">
+						<Spinner size="md" className="text-muted-foreground" />
+					</div>
+				}
+				beforeMount={(monaco) => {
+					monacoRef.current = monaco;
+					applySidebarTheme(monaco);
+				}}
+				onMount={(editor, monaco) => {
+					monacoRef.current = monaco;
+					applySidebarTheme(monaco);
+					if (!isManualSaveEnabled) return;
+					editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
+						void onSaveRef.current?.();
+					});
+				}}
+				options={{
+					automaticLayout: true,
+					minimap: { enabled: false },
+					lineNumbers: "on",
+					lineNumbersMinChars: 4,
+					lineDecorationsWidth: 20,
+					glyphMargin: false,
+					folding: false,
+					overviewRulerLanes: 0,
+					hideCursorInOverviewRuler: true,
+					scrollBeyondLastLine: false,
+					renderLineHighlight: "none",
+					selectionHighlight: false,
+					occurrencesHighlight: "off",
+					quickSuggestions: false,
+					suggestOnTriggerCharacters: false,
+					acceptSuggestionOnEnter: "off",
+					parameterHints: { enabled: false },
+					wordBasedSuggestions: "off",
+					wordWrap: "off",
+					scrollbar: {
+						vertical: "auto",
+						horizontal: "auto",
+						verticalScrollbarSize: 8,
+						horizontalScrollbarSize: 8,
+						alwaysConsumeMouseWheel: false,
+					},
+					tabSize: 2,
+					insertSpaces: true,
+					fontSize,
+					fontFamily:
+						"ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, monospace",
+					renderWhitespace: "none",
+					renderValidationDecorations: "off",
+					colorDecorators: false,
+					codeLens: false,
+					hover: { enabled: false },
+					stickyScroll: { enabled: false },
+					unicodeHighlight: {
+						ambiguousCharacters: false,
+						invisibleCharacters: false,
+						nonBasicASCII: false,
+					},
+					smoothScrolling: true,
+					readOnly,
+				}}
+			/>
+		</div>
+	);
+}
diff --git a/surfsense_web/components/editor/utils/escape-mdx.ts b/surfsense_web/components/editor/utils/escape-mdx.ts
index 41e8c9d0a7..14839b9fc6 100644
--- a/surfsense_web/components/editor/utils/escape-mdx.ts
+++ b/surfsense_web/components/editor/utils/escape-mdx.ts
@@ -1,25 +1,58 @@
 // ---------------------------------------------------------------------------
-// MDX curly-brace escaping helper
+// MDX pre-processing helpers
 // ---------------------------------------------------------------------------
-// remarkMdx treats { } as JSX expression delimiters. Arbitrary markdown
-// (e.g. AI-generated reports) can contain curly braces that are NOT valid JS
-// expressions, which makes acorn throw "Could not parse expression".
-// We escape unescaped { and } *outside* of fenced code blocks and inline code
-// so remarkMdx treats them as literal characters while still parsing
-// <mark>, <u>, <kbd>, etc. tags correctly.
+// remarkMdx treats { } as JSX expression delimiters and does NOT support
+// HTML comments (<!-- -->). Arbitrary markdown from document conversions
+// (e.g. PDF-to-markdown via Azure/DocIntel) can contain constructs that
+// break the MDX parser. This module sanitises them before deserialization.
 // ---------------------------------------------------------------------------
 
-const FENCED_OR_INLINE_CODE = /(```[\s\S]*?```|`[^`\n]+`)/g;
+import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
 
-export function escapeMdxExpressions(md: string): string {
+// Strip HTML comments that MDX cannot parse.
+// PDF converters emit <!-- PageHeader="..." -->, <!-- PageBreak -->, etc.
+// MDX uses JSX-style comments and chokes on HTML comments, causing the
+// parser to stop at the first occurrence.
+// - <!-- PageBreak --> becomes a thematic break (---)
+// - All other HTML comments are removed
+function stripHtmlComments(md: string): string {
+	return md.replace(/<!--\s*PageBreak\s*-->/gi, "\n---\n").replace(/<!--[\s\S]*?-->/g, "");
+}
+
+// Convert <figure>...</figure> blocks to plain text blockquotes.
+// <figure> with arbitrary text content is not valid JSX, causing the MDX
+// parser to fail.
+function convertFigureBlocks(md: string): string {
+	return md.replace(/<figure[^>]*>([\s\S]*?)<\/figure>/gi, (_match, inner: string) => {
+		const trimmed = (inner as string).trim();
+		if (!trimmed) return "";
+		const quoted = trimmed
+			.split("\n")
+			.map((line) => `> ${line}`)
+			.join("\n");
+		return `\n${quoted}\n`;
+	});
+}
+
+// Escape unescaped { and } outside of fenced/inline code so remarkMdx
+// treats them as literal characters rather than JSX expression delimiters.
+function escapeCurlyBraces(md: string): string {
 	const parts = md.split(FENCED_OR_INLINE_CODE);
 
 	return parts
 		.map((part, i) => {
-			// Odd indices are code blocks / inline code – leave untouched
 			if (i % 2 === 1) return part;
-			// Escape { and } that are NOT already escaped (no preceding \)
 			return part.replace(/(?<!\\)\{/g, "\\{").replace(/(?<!\\)\}/g, "\\}");
 		})
 		.join("");
 }
+
+// Pre-process raw markdown so it can be safely parsed by the MDX-enabled
+// Plate editor. Applies all sanitisation steps in order.
+export function escapeMdxExpressions(md: string): string {
+	let result = md;
+	result = stripHtmlComments(result);
+	result = convertFigureBlocks(result);
+	result = escapeCurlyBraces(result);
+	return result;
+}
diff --git a/surfsense_web/components/editor/utils/safe-deserialize.ts b/surfsense_web/components/editor/utils/safe-deserialize.ts
new file mode 100644
index 0000000000..8f3e6275bf
--- /dev/null
+++ b/surfsense_web/components/editor/utils/safe-deserialize.ts
@@ -0,0 +1,61 @@
+// ---------------------------------------------------------------------------
+// Safe markdown deserialization for the Plate editor
+// ---------------------------------------------------------------------------
+// `remark-mdx` treats any HTML-like tag as JSX, so unbalanced inline HTML
+// (very common in GitHub READMEs, web-scraped pages, PDF conversions) makes
+// it throw "Expected a closing tag for `<a>`" and crash the editor.
+//
+// Per the MDX maintainers' guidance (mdx-js/mdx, ipikuka/next-mdx-remote-client
+// #14), MDX is the wrong format for untrusted markdown and the recommended
+// fix is to fall back to plain markdown parsing. `MarkdownPlugin.deserialize`
+// accepts a per-call `remarkPlugins` override, so we can:
+//
+//   1. Try with `remarkMdx` (rich MDX features, e.g. JSX-style components).
+//   2. On failure, retry without `remarkMdx` (lenient HTML, like GitHub).
+//   3. As a last resort, render the raw source in a paragraph so the user
+//      never sees a crashed editor.
+// ---------------------------------------------------------------------------
+
+import { MarkdownPlugin, remarkMdx } from "@platejs/markdown";
+import type { Descendant } from "platejs";
+import remarkGfm from "remark-gfm";
+import remarkMath from "remark-math";
+import type { PlateEditorInstance } from "@/components/editor/plate-editor";
+
+const STRICT_PLUGINS = [remarkGfm, remarkMath, remarkMdx];
+const LENIENT_PLUGINS = [remarkGfm, remarkMath];
+
+function plainTextFallback(markdown: string): Descendant[] {
+	return [
+		{
+			type: "p",
+			children: [{ text: markdown }],
+		} as unknown as Descendant,
+	];
+}
+
+/**
+ * Deserialize markdown into a Plate value, gracefully degrading when the
+ * MDX-strict parser rejects raw HTML. Always returns a renderable value;
+ * never throws.
+ */
+export function safeDeserializeMarkdown(
+	editor: PlateEditorInstance,
+	markdown: string
+): Descendant[] {
+	const api = editor.getApi(MarkdownPlugin).markdown;
+
+	try {
+		return api.deserialize(markdown, { remarkPlugins: STRICT_PLUGINS }) as Descendant[];
+	} catch (mdxError) {
+		if (process.env.NODE_ENV !== "production") {
+			console.warn("[plate-editor] MDX parse failed, retrying without remark-mdx:", mdxError);
+		}
+		try {
+			return api.deserialize(markdown, { remarkPlugins: LENIENT_PLUGINS }) as Descendant[];
+		} catch (fallbackError) {
+			console.error("[plate-editor] markdown deserialize failed:", fallbackError);
+			return plainTextFallback(markdown);
+		}
+	}
+}
diff --git a/surfsense_web/components/free-chat/anonymous-chat.tsx b/surfsense_web/components/free-chat/anonymous-chat.tsx
new file mode 100644
index 0000000000..0164c870b5
--- /dev/null
+++ b/surfsense_web/components/free-chat/anonymous-chat.tsx
@@ -0,0 +1,295 @@
+"use client";
+
+import { ArrowUp, Loader2, Square } from "lucide-react";
+import { useCallback, useEffect, useRef, useState } from "react";
+import { Button } from "@/components/ui/button";
+import type { AnonModel, AnonQuotaResponse } from "@/contracts/types/anonymous-chat.types";
+import { anonymousChatApiService } from "@/lib/apis/anonymous-chat-api.service";
+import { readSSEStream } from "@/lib/chat/streaming-state";
+import { trackAnonymousChatMessageSent } from "@/lib/posthog/events";
+import { cn } from "@/lib/utils";
+import { QuotaBar } from "./quota-bar";
+import { QuotaWarningBanner } from "./quota-warning-banner";
+
+interface Message {
+	id: string;
+	role: "user" | "assistant";
+	content: string;
+}
+
+interface AnonymousChatProps {
+	model: AnonModel;
+}
+
+export function AnonymousChat({ model }: AnonymousChatProps) {
+	const [messages, setMessages] = useState<Message[]>([]);
+	const [input, setInput] = useState("");
+	const [isStreaming, setIsStreaming] = useState(false);
+	const [quota, setQuota] = useState<AnonQuotaResponse | null>(null);
+	const abortRef = useRef<AbortController | null>(null);
+	const messagesEndRef = useRef<HTMLDivElement>(null);
+	const textareaRef = useRef<HTMLTextAreaElement>(null);
+	const modelSlug = model.seo_slug ?? model.model_name;
+
+	useEffect(() => {
+		anonymousChatApiService.getQuota().then(setQuota).catch(console.error);
+	}, []);
+
+	useEffect(() => {
+		if (messages.length === 0) return;
+		messagesEndRef.current?.scrollIntoView({ behavior: "smooth" });
+	}, [messages.length]);
+
+	const autoResizeTextarea = useCallback(() => {
+		const textarea = textareaRef.current;
+		if (textarea) {
+			textarea.style.height = "auto";
+			textarea.style.height = `${Math.min(textarea.scrollHeight, 200)}px`;
+		}
+	}, []);
+
+	const handleSubmit = useCallback(async () => {
+		const trimmed = input.trim();
+		if (!trimmed || isStreaming) return;
+		if (quota && quota.used >= quota.limit) return;
+
+		const userMsg: Message = { id: crypto.randomUUID(), role: "user", content: trimmed };
+		const assistantId = crypto.randomUUID();
+		const assistantMsg: Message = { id: assistantId, role: "assistant", content: "" };
+
+		setMessages((prev) => [...prev, userMsg, assistantMsg]);
+		setInput("");
+		setIsStreaming(true);
+
+		if (textareaRef.current) {
+			textareaRef.current.style.height = "auto";
+		}
+
+		trackAnonymousChatMessageSent({
+			modelSlug,
+			messageLength: trimmed.length,
+			surface: "free_model_page",
+		});
+
+		const controller = new AbortController();
+		abortRef.current = controller;
+
+		try {
+			const chatHistory = [...messages, userMsg].map((m) => ({
+				role: m.role,
+				content: m.content,
+			}));
+
+			const response = await fetch(
+				`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000"}/api/v1/public/anon-chat/stream`,
+				{
+					method: "POST",
+					headers: { "Content-Type": "application/json" },
+					credentials: "include",
+					body: JSON.stringify({
+						model_slug: modelSlug,
+						messages: chatHistory,
+					}),
+					signal: controller.signal,
+				}
+			);
+
+			if (!response.ok) {
+				if (response.status === 429) {
+					const errorData = await response.json();
+					setQuota({
+						used: errorData.detail?.used ?? quota?.limit ?? 1000000,
+						limit: errorData.detail?.limit ?? quota?.limit ?? 1000000,
+						remaining: 0,
+						status: "exceeded",
+						warning_threshold: quota?.warning_threshold ?? 800000,
+						captcha_required:
+							errorData.detail?.captcha_required ?? quota?.captcha_required ?? false,
+					});
+					setMessages((prev) => prev.filter((m) => m.id !== assistantId));
+					return;
+				}
+				const body = await response.text().catch(() => "");
+				const errorCode = response.status === 409 ? "THREAD_BUSY" : "SERVER_ERROR";
+				const message =
+					errorCode === "THREAD_BUSY"
+						? "A previous response is still stopping. Please try again in a moment."
+						: `Stream error: ${response.status}`;
+				throw Object.assign(new Error(body || message), { errorCode });
+			}
+
+			for await (const event of readSSEStream(response)) {
+				if (controller.signal.aborted) break;
+
+				if (event.type === "text-delta") {
+					setMessages((prev) =>
+						prev.map((m) => (m.id === assistantId ? { ...m, content: m.content + event.delta } : m))
+					);
+				} else if (event.type === "error") {
+					const message =
+						event.errorCode === "THREAD_BUSY"
+							? "A previous response is still stopping. Please try again in a moment."
+							: event.errorText;
+					setMessages((prev) =>
+						prev.map((m) => (m.id === assistantId ? { ...m, content: m.content || message } : m))
+					);
+				} else if ("type" in event && event.type === "data-token-usage") {
+					// After streaming completes, refresh quota
+					anonymousChatApiService.getQuota().then(setQuota).catch(console.error);
+				}
+			}
+		} catch (err) {
+			if (err instanceof DOMException && err.name === "AbortError") return;
+			console.error("Chat stream error:", err);
+			setMessages((prev) =>
+				prev.map((m) =>
+					m.id === assistantId && !m.content
+						? { ...m, content: "An error occurred. Please try again." }
+						: m
+				)
+			);
+		} finally {
+			setIsStreaming(false);
+			abortRef.current = null;
+			anonymousChatApiService.getQuota().then(setQuota).catch(console.error);
+		}
+	}, [input, isStreaming, messages, modelSlug, quota]);
+
+	const handleCancel = useCallback(() => {
+		abortRef.current?.abort();
+	}, []);
+
+	const handleKeyDown = (e: React.KeyboardEvent) => {
+		if (e.key === "Enter" && !e.shiftKey) {
+			e.preventDefault();
+			handleSubmit();
+		}
+	};
+
+	const isExceeded = quota ? quota.used >= quota.limit : false;
+
+	return (
+		<div className="flex flex-col h-[calc(100vh-8rem)] max-w-3xl mx-auto">
+			{quota && (
+				<QuotaWarningBanner
+					used={quota.used}
+					limit={quota.limit}
+					warningThreshold={quota.warning_threshold}
+					className="mb-3"
+				/>
+			)}
+
+			<div className="flex-1 overflow-y-auto space-y-4 pb-4 min-h-0">
+				{messages.length === 0 && (
+					<div className="flex flex-col items-center justify-center h-full text-center px-4">
+						<div className="rounded-full bg-linear-to-r from-purple-500/10 to-blue-500/10 p-4 mb-4">
+							<div className="h-10 w-10 rounded-full bg-linear-to-r from-purple-500 to-blue-500 flex items-center justify-center">
+								<span className="text-white text-lg font-bold">
+									{model.name.charAt(0).toUpperCase()}
+								</span>
+							</div>
+						</div>
+						<h2 className="text-xl font-semibold mb-2">{model.name}</h2>
+						{model.description && (
+							<p className="text-sm text-muted-foreground max-w-md">{model.description}</p>
+						)}
+						<p className="text-xs text-muted-foreground mt-4">
+							Free to use &middot; No login required &middot; Start typing below
+						</p>
+					</div>
+				)}
+
+				{messages.map((msg) => (
+					<div
+						key={msg.id}
+						className={cn("flex gap-3 px-4", msg.role === "user" ? "justify-end" : "justify-start")}
+					>
+						{msg.role === "assistant" && (
+							<div className="h-7 w-7 rounded-full bg-linear-to-r from-purple-500 to-blue-500 flex items-center justify-center shrink-0 mt-0.5">
+								<span className="text-white text-xs font-bold">
+									{model.name.charAt(0).toUpperCase()}
+								</span>
+							</div>
+						)}
+						<div
+							className={cn(
+								"rounded-2xl px-4 py-2.5 max-w-[80%] text-sm leading-relaxed",
+								msg.role === "user"
+									? "bg-primary text-primary-foreground"
+									: "bg-muted text-foreground"
+							)}
+						>
+							{msg.role === "assistant" && !msg.content && isStreaming ? (
+								<Loader2 className="h-4 w-4 animate-spin text-muted-foreground" />
+							) : (
+								<div className="whitespace-pre-wrap wrap-break-word">{msg.content}</div>
+							)}
+						</div>
+					</div>
+				))}
+				<div ref={messagesEndRef} />
+			</div>
+
+			<div className="border-t pt-3 pb-2 space-y-2">
+				{quota && (
+					<QuotaBar
+						used={quota.used}
+						limit={quota.limit}
+						warningThreshold={quota.warning_threshold}
+					/>
+				)}
+
+				<div className="relative">
+					<textarea
+						ref={textareaRef}
+						value={input}
+						onChange={(e) => {
+							setInput(e.target.value);
+							autoResizeTextarea();
+						}}
+						onKeyDown={handleKeyDown}
+						placeholder={
+							isExceeded
+								? "Token limit reached. Create a free account to continue."
+								: `Message ${model.name}...`
+						}
+						disabled={isExceeded}
+						rows={1}
+						className={cn(
+							"w-full resize-none rounded-xl border bg-background px-4 py-3 pr-12 text-sm",
+							"placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-ring",
+							"disabled:cursor-not-allowed disabled:opacity-50",
+							"min-h-[44px] max-h-[200px]"
+						)}
+					/>
+					{isStreaming ? (
+						<Button
+							type="button"
+							variant="ghost"
+							size="icon"
+							onClick={handleCancel}
+							className="absolute right-2 bottom-2 size-8 bg-foreground text-background hover:bg-foreground hover:text-background hover:opacity-80"
+						>
+							<Square className="h-3.5 w-3.5" fill="currentColor" />
+						</Button>
+					) : (
+						<Button
+							type="button"
+							variant="ghost"
+							size="icon"
+							onClick={handleSubmit}
+							disabled={!input.trim() || isExceeded}
+							className="absolute right-2 bottom-2 size-8 bg-foreground text-background hover:bg-foreground hover:text-background hover:opacity-80 disabled:opacity-40"
+						>
+							<ArrowUp className="h-4 w-4" />
+						</Button>
+					)}
+				</div>
+
+				<p className="text-center text-[10px] text-muted-foreground">
+					{model.name} via SurfSense &middot; Responses may be inaccurate
+				</p>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/components/free-chat/free-chat-page.tsx b/surfsense_web/components/free-chat/free-chat-page.tsx
new file mode 100644
index 0000000000..927eaef873
--- /dev/null
+++ b/surfsense_web/components/free-chat/free-chat-page.tsx
@@ -0,0 +1,509 @@
+"use client";
+
+import {
+	type AppendMessage,
+	AssistantRuntimeProvider,
+	type ThreadMessageLike,
+	useExternalStoreRuntime,
+} from "@assistant-ui/react";
+import { Turnstile, type TurnstileInstance } from "@marsidev/react-turnstile";
+import { ShieldCheck } from "lucide-react";
+import { useCallback, useEffect, useRef, useState } from "react";
+import { StepSeparatorDataUI } from "@/components/assistant-ui/step-separator";
+import {
+	createTokenUsageStore,
+	type TokenUsageData,
+	TokenUsageProvider,
+} from "@/components/assistant-ui/token-usage-context";
+import { useAnonymousMode } from "@/contexts/anonymous-mode";
+import { TimelineDataUI } from "@/features/chat-messages/timeline";
+import {
+	addStepSeparator,
+	addToolCall,
+	appendReasoning,
+	appendText,
+	appendToolInputDelta,
+	buildContentForUI,
+	type ContentPartsState,
+	endReasoning,
+	FrameBatchedUpdater,
+	readSSEStream,
+	type ThinkingStepData,
+	updateThinkingSteps,
+	updateToolCall,
+} from "@/lib/chat/streaming-state";
+import { BACKEND_URL } from "@/lib/env-config";
+import { trackAnonymousChatMessageSent } from "@/lib/posthog/events";
+import { FreeModelSelector } from "./free-model-selector";
+import { FreeThread } from "./free-thread";
+
+// Render all tool calls via ToolFallback; backend keeps persisted
+// payloads bounded by summarising / truncating outputs.
+const TOOLS_WITH_UI = "all" as const;
+const TURNSTILE_SITE_KEY = process.env.NEXT_PUBLIC_TURNSTILE_SITE_KEY ?? "";
+
+/** Try to parse a CAPTCHA_REQUIRED or CAPTCHA_INVALID code from a non-ok response. */
+function parseCaptchaError(status: number, body: string): string | null {
+	if (status !== 403) return null;
+	try {
+		const json = JSON.parse(body);
+		const code = json?.detail?.code ?? json?.error?.code;
+		if (code === "CAPTCHA_REQUIRED" || code === "CAPTCHA_INVALID") return code;
+	} catch {
+		/* not JSON */
+	}
+	return null;
+}
+
+function normalizeFreeChatErrorMessage(error: unknown): string {
+	if (!(error instanceof Error)) return "An unexpected error occurred";
+	const code = (error as Error & { errorCode?: string }).errorCode;
+	if (code === "THREAD_BUSY") {
+		return "A previous response is still stopping. Please try again in a moment.";
+	}
+	return error.message || "An unexpected error occurred";
+}
+
+function toFreeChatHttpError(status: number, body: string): Error & { errorCode?: string } {
+	let errorCode: string | undefined;
+	let message = body || `Server error: ${status}`;
+	try {
+		const parsed = JSON.parse(body) as Record<string, unknown>;
+		const detail =
+			typeof parsed.detail === "object" && parsed.detail !== null
+				? (parsed.detail as Record<string, unknown>)
+				: null;
+		errorCode =
+			(typeof detail?.error_code === "string" ? detail.error_code : undefined) ??
+			(typeof detail?.errorCode === "string" ? detail.errorCode : undefined) ??
+			(typeof parsed.error_code === "string" ? parsed.error_code : undefined) ??
+			(typeof parsed.errorCode === "string" ? parsed.errorCode : undefined);
+		message =
+			(typeof detail?.message === "string" ? detail.message : undefined) ??
+			(typeof parsed.message === "string" ? parsed.message : undefined) ??
+			(typeof parsed.detail === "string" ? parsed.detail : undefined) ??
+			message;
+	} catch {
+		// non-json response
+	}
+
+	if (!errorCode) {
+		if (status === 409) errorCode = "THREAD_BUSY";
+		else if (status === 429) errorCode = "RATE_LIMITED";
+		else if (status === 401 || status === 403) errorCode = "AUTH_EXPIRED";
+		else errorCode = "SERVER_ERROR";
+	}
+
+	return Object.assign(new Error(message), { errorCode });
+}
+
+export function FreeChatPage() {
+	const anonMode = useAnonymousMode();
+	const modelSlug = anonMode.isAnonymous ? anonMode.modelSlug : "";
+	const resetKey = anonMode.isAnonymous ? anonMode.resetKey : 0;
+
+	const [messages, setMessages] = useState<ThreadMessageLike[]>([]);
+	const [isRunning, setIsRunning] = useState(false);
+	const [tokenUsageStore] = useState(() => createTokenUsageStore());
+	const abortControllerRef = useRef<AbortController | null>(null);
+
+	// Turnstile CAPTCHA state
+	const [captchaRequired, setCaptchaRequired] = useState(false);
+	const turnstileRef = useRef<TurnstileInstance | null>(null);
+	const turnstileTokenRef = useRef<string | null>(null);
+	const pendingRetryRef = useRef<{
+		messageHistory: { role: string; content: string }[];
+		userMsgId: string;
+	} | null>(null);
+
+	useEffect(() => {
+		setMessages([]);
+		tokenUsageStore.clear();
+		if (abortControllerRef.current) {
+			abortControllerRef.current.abort();
+			abortControllerRef.current = null;
+		}
+		setIsRunning(false);
+		setCaptchaRequired(false);
+		turnstileTokenRef.current = null;
+		pendingRetryRef.current = null;
+	}, [resetKey, modelSlug, tokenUsageStore]);
+
+	const cancelRun = useCallback(() => {
+		if (abortControllerRef.current) {
+			abortControllerRef.current.abort();
+			abortControllerRef.current = null;
+		}
+		setIsRunning(false);
+	}, []);
+
+	/**
+	 * Core streaming logic shared by initial sends and CAPTCHA retries.
+	 * Returns "captcha" if the server demands a CAPTCHA, otherwise void.
+	 */
+	const doStream = useCallback(
+		async (
+			messageHistory: { role: string; content: string }[],
+			assistantMsgId: string,
+			signal: AbortSignal,
+			turnstileToken: string | null
+		): Promise<"captcha" | void> => {
+			const reqBody: Record<string, unknown> = {
+				model_slug: modelSlug,
+				messages: messageHistory,
+			};
+			if (turnstileToken) reqBody.turnstile_token = turnstileToken;
+
+			const response = await fetch(`${BACKEND_URL}/api/v1/public/anon-chat/stream`, {
+				method: "POST",
+				headers: { "Content-Type": "application/json" },
+				credentials: "include",
+				body: JSON.stringify(reqBody),
+				signal,
+			});
+
+			if (!response.ok) {
+				const body = await response.text().catch(() => "");
+				const captchaCode = parseCaptchaError(response.status, body);
+				if (captchaCode) return "captcha";
+				throw toFreeChatHttpError(response.status, body);
+			}
+
+			const currentThinkingSteps = new Map<string, ThinkingStepData>();
+			const batcher = new FrameBatchedUpdater();
+			const contentPartsState: ContentPartsState = {
+				contentParts: [],
+				currentTextPartIndex: -1,
+				currentReasoningPartIndex: -1,
+				toolCallIndices: new Map(),
+			};
+			const { toolCallIndices } = contentPartsState;
+
+			const flushMessages = () => {
+				setMessages((prev) =>
+					prev.map((m) =>
+						m.id === assistantMsgId
+							? { ...m, content: buildContentForUI(contentPartsState, TOOLS_WITH_UI) }
+							: m
+					)
+				);
+			};
+			const scheduleFlush = () => batcher.schedule(flushMessages);
+			const forceFlush = () => {
+				scheduleFlush();
+				batcher.flush();
+			};
+
+			try {
+				for await (const parsed of readSSEStream(response)) {
+					switch (parsed.type) {
+						case "text-delta":
+							appendText(contentPartsState, parsed.delta);
+							scheduleFlush();
+							break;
+
+						case "reasoning-delta":
+							appendReasoning(contentPartsState, parsed.delta);
+							scheduleFlush();
+							break;
+
+						case "reasoning-end":
+							endReasoning(contentPartsState);
+							scheduleFlush();
+							break;
+
+						case "start-step":
+							addStepSeparator(contentPartsState);
+							scheduleFlush();
+							break;
+
+						case "finish-step":
+							break;
+
+						case "tool-input-start":
+							addToolCall(
+								contentPartsState,
+								TOOLS_WITH_UI,
+								parsed.toolCallId,
+								parsed.toolName,
+								{},
+								false,
+								parsed.langchainToolCallId,
+								parsed.metadata
+							);
+							forceFlush();
+							break;
+
+						case "tool-input-delta":
+							appendToolInputDelta(contentPartsState, parsed.toolCallId, parsed.inputTextDelta);
+							scheduleFlush();
+							break;
+
+						case "tool-input-available": {
+							const finalArgsText = JSON.stringify(parsed.input ?? {}, null, 2);
+							if (toolCallIndices.has(parsed.toolCallId)) {
+								updateToolCall(contentPartsState, parsed.toolCallId, {
+									args: parsed.input || {},
+									argsText: finalArgsText,
+									langchainToolCallId: parsed.langchainToolCallId,
+									metadata: parsed.metadata,
+								});
+							} else {
+								addToolCall(
+									contentPartsState,
+									TOOLS_WITH_UI,
+									parsed.toolCallId,
+									parsed.toolName,
+									parsed.input || {},
+									false,
+									parsed.langchainToolCallId,
+									parsed.metadata
+								);
+								updateToolCall(contentPartsState, parsed.toolCallId, {
+									argsText: finalArgsText,
+								});
+							}
+							forceFlush();
+							break;
+						}
+
+						case "tool-output-available":
+							updateToolCall(contentPartsState, parsed.toolCallId, {
+								result: parsed.output,
+								langchainToolCallId: parsed.langchainToolCallId,
+								metadata: parsed.metadata,
+							});
+							forceFlush();
+							break;
+
+						case "data-thinking-step": {
+							const stepData = parsed.data as ThinkingStepData;
+							if (stepData?.id) {
+								currentThinkingSteps.set(stepData.id, stepData);
+								if (updateThinkingSteps(contentPartsState, currentThinkingSteps)) scheduleFlush();
+							}
+							break;
+						}
+
+						case "data-token-usage":
+							tokenUsageStore.set(assistantMsgId, parsed.data as TokenUsageData);
+							break;
+
+						case "error":
+							throw Object.assign(new Error(parsed.errorText || "Server error"), {
+								errorCode: parsed.errorCode,
+							});
+					}
+				}
+				batcher.flush();
+			} catch (err) {
+				batcher.dispose();
+				throw err;
+			}
+		},
+		[modelSlug, tokenUsageStore]
+	);
+
+	const onNew = useCallback(
+		async (message: AppendMessage) => {
+			let userQuery = "";
+			for (const part of message.content) {
+				if (part.type === "text") userQuery += part.text;
+			}
+			if (!userQuery.trim()) return;
+
+			trackAnonymousChatMessageSent({
+				modelSlug,
+				messageLength: userQuery.trim().length,
+				hasUploadedDoc: anonMode.isAnonymous && anonMode.uploadedDoc !== null ? true : false,
+				surface: "free_chat_page",
+			});
+
+			const userMsgId = `msg-user-${Date.now()}`;
+			setMessages((prev) => [
+				...prev,
+				{
+					id: userMsgId,
+					role: "user" as const,
+					content: [{ type: "text" as const, text: userQuery }],
+					createdAt: new Date(),
+				},
+			]);
+
+			setIsRunning(true);
+			const controller = new AbortController();
+			abortControllerRef.current = controller;
+
+			const assistantMsgId = `msg-assistant-${Date.now()}`;
+			setMessages((prev) => [
+				...prev,
+				{
+					id: assistantMsgId,
+					role: "assistant" as const,
+					content: [{ type: "text" as const, text: "" }],
+					createdAt: new Date(),
+				},
+			]);
+
+			const messageHistory = messages
+				.filter((m) => m.role === "user" || m.role === "assistant")
+				.map((m) => {
+					let text = "";
+					for (const part of m.content) {
+						if (typeof part === "object" && part.type === "text" && "text" in part) {
+							text += (part as { type: "text"; text: string }).text;
+						}
+					}
+					return { role: m.role as string, content: text };
+				})
+				.filter((m) => m.content.length > 0);
+			messageHistory.push({ role: "user", content: userQuery.trim() });
+
+			try {
+				const result = await doStream(
+					messageHistory,
+					assistantMsgId,
+					controller.signal,
+					turnstileTokenRef.current
+				);
+
+				// Consume the token after use regardless of outcome
+				turnstileTokenRef.current = null;
+
+				if (result === "captcha" && TURNSTILE_SITE_KEY) {
+					// Remove the empty assistant placeholder; keep the user message
+					setMessages((prev) => prev.filter((m) => m.id !== assistantMsgId));
+					pendingRetryRef.current = { messageHistory, userMsgId };
+					setCaptchaRequired(true);
+					setIsRunning(false);
+					abortControllerRef.current = null;
+					return;
+				}
+			} catch (error) {
+				if (error instanceof Error && error.name === "AbortError") return;
+				console.error("[FreeChatPage] Chat error:", error);
+				const errorText = normalizeFreeChatErrorMessage(error);
+				setMessages((prev) =>
+					prev.map((m) =>
+						m.id === assistantMsgId
+							? { ...m, content: [{ type: "text" as const, text: `Error: ${errorText}` }] }
+							: m
+					)
+				);
+			} finally {
+				setIsRunning(false);
+				abortControllerRef.current = null;
+			}
+		},
+		[messages, doStream]
+	);
+
+	/** Called when Turnstile resolves successfully. Stores the token and auto-retries. */
+	const handleTurnstileSuccess = useCallback(
+		async (token: string) => {
+			turnstileTokenRef.current = token;
+			setCaptchaRequired(false);
+
+			const pending = pendingRetryRef.current;
+			if (!pending) return;
+			pendingRetryRef.current = null;
+
+			setIsRunning(true);
+			const controller = new AbortController();
+			abortControllerRef.current = controller;
+
+			const assistantMsgId = `msg-assistant-${Date.now()}`;
+			setMessages((prev) => [
+				...prev,
+				{
+					id: assistantMsgId,
+					role: "assistant" as const,
+					content: [{ type: "text" as const, text: "" }],
+					createdAt: new Date(),
+				},
+			]);
+
+			try {
+				const result = await doStream(
+					pending.messageHistory,
+					assistantMsgId,
+					controller.signal,
+					token
+				);
+				turnstileTokenRef.current = null;
+
+				if (result === "captcha") {
+					setMessages((prev) => prev.filter((m) => m.id !== assistantMsgId));
+					pendingRetryRef.current = pending;
+					setCaptchaRequired(true);
+					turnstileRef.current?.reset();
+				}
+			} catch (error) {
+				if (error instanceof Error && error.name === "AbortError") return;
+				console.error("[FreeChatPage] Retry error:", error);
+				const errorText = normalizeFreeChatErrorMessage(error);
+				setMessages((prev) =>
+					prev.map((m) =>
+						m.id === assistantMsgId
+							? { ...m, content: [{ type: "text" as const, text: `Error: ${errorText}` }] }
+							: m
+					)
+				);
+			} finally {
+				setIsRunning(false);
+				abortControllerRef.current = null;
+			}
+		},
+		[doStream]
+	);
+
+	const convertMessage = useCallback(
+		(message: ThreadMessageLike): ThreadMessageLike => message,
+		[]
+	);
+
+	const runtime = useExternalStoreRuntime({
+		messages,
+		isRunning,
+		onNew,
+		convertMessage,
+		onCancel: cancelRun,
+	});
+
+	return (
+		<TokenUsageProvider store={tokenUsageStore}>
+			<AssistantRuntimeProvider runtime={runtime}>
+				<TimelineDataUI />
+				<StepSeparatorDataUI />
+				<div className="flex h-full flex-col overflow-hidden">
+					<div className="flex h-14 shrink-0 items-center justify-between border-b border-border/40 px-4">
+						<FreeModelSelector />
+					</div>
+
+					{captchaRequired && TURNSTILE_SITE_KEY && (
+						<div className="flex flex-col items-center gap-3 border-b border-border/40 bg-muted/30 py-4">
+							<div className="flex items-center gap-2 text-sm text-muted-foreground">
+								<ShieldCheck className="h-4 w-4" />
+								<span>Quick verification to continue chatting</span>
+							</div>
+							<Turnstile
+								ref={turnstileRef}
+								siteKey={TURNSTILE_SITE_KEY}
+								onSuccess={handleTurnstileSuccess}
+								onError={() => turnstileRef.current?.reset()}
+								onExpire={() => turnstileRef.current?.reset()}
+								options={{ theme: "auto", size: "normal" }}
+							/>
+						</div>
+					)}
+
+					<div className="flex flex-1 min-h-0 overflow-hidden">
+						<div className="flex-1 flex flex-col min-w-0">
+							<FreeThread />
+						</div>
+					</div>
+				</div>
+			</AssistantRuntimeProvider>
+		</TokenUsageProvider>
+	);
+}
diff --git a/surfsense_web/components/free-chat/free-composer.tsx b/surfsense_web/components/free-chat/free-composer.tsx
new file mode 100644
index 0000000000..943ace9aaf
--- /dev/null
+++ b/surfsense_web/components/free-chat/free-composer.tsx
@@ -0,0 +1,254 @@
+"use client";
+
+import { ComposerPrimitive, useAui, useAuiState } from "@assistant-ui/react";
+import { ArrowUpIcon, Globe, Paperclip, SquareIcon } from "lucide-react";
+import { type FC, useCallback, useRef, useState } from "react";
+import { toast } from "sonner";
+import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
+import { Button } from "@/components/ui/button";
+import { Switch } from "@/components/ui/switch";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import { useAnonymousMode } from "@/contexts/anonymous-mode";
+import { useLoginGate } from "@/contexts/login-gate";
+import { anonymousChatApiService } from "@/lib/apis/anonymous-chat-api.service";
+import { cn } from "@/lib/utils";
+
+const ANON_ALLOWED_EXTENSIONS = new Set([
+	".md",
+	".markdown",
+	".txt",
+	".text",
+	".json",
+	".jsonl",
+	".yaml",
+	".yml",
+	".toml",
+	".ini",
+	".cfg",
+	".conf",
+	".xml",
+	".css",
+	".scss",
+	".py",
+	".js",
+	".jsx",
+	".ts",
+	".tsx",
+	".java",
+	".kt",
+	".go",
+	".rs",
+	".rb",
+	".php",
+	".c",
+	".h",
+	".cpp",
+	".hpp",
+	".cs",
+	".swift",
+	".sh",
+	".sql",
+	".log",
+	".rst",
+	".tex",
+	".vue",
+	".svelte",
+	".astro",
+	".tf",
+	".proto",
+	".csv",
+	".tsv",
+	".html",
+	".htm",
+	".xhtml",
+]);
+
+const ACCEPT_EXTENSIONS = Array.from(ANON_ALLOWED_EXTENSIONS).join(",");
+
+export const FreeComposer: FC = () => {
+	const aui = useAui();
+	const isRunning = useAuiState(({ thread }) => thread.isRunning);
+	const { gate } = useLoginGate();
+	const anonMode = useAnonymousMode();
+	const [text, setText] = useState("");
+	const [webSearchEnabled, setWebSearchEnabled] = useState(true);
+	const fileInputRef = useRef<HTMLInputElement>(null);
+
+	const hasUploadedDoc = anonMode.isAnonymous && anonMode.uploadedDoc !== null;
+
+	const handleTextChange = useCallback(
+		(e: React.ChangeEvent<HTMLTextAreaElement>) => {
+			setText(e.target.value);
+			aui.composer().setText(e.target.value);
+		},
+		[aui]
+	);
+
+	const handleKeyDown = useCallback(
+		(e: React.KeyboardEvent<HTMLTextAreaElement>) => {
+			if (e.key === "/" && text === "") {
+				e.preventDefault();
+				gate("use saved prompts");
+				return;
+			}
+			if (e.key === "@") {
+				e.preventDefault();
+				gate("mention documents");
+				return;
+			}
+			if (e.key === "Enter" && !e.shiftKey) {
+				e.preventDefault();
+				if (text.trim()) {
+					aui.composer().send();
+					setText("");
+				}
+			}
+		},
+		[text, aui, gate]
+	);
+
+	const handleUploadClick = useCallback(() => {
+		if (hasUploadedDoc) {
+			gate("upload more documents");
+			return;
+		}
+		fileInputRef.current?.click();
+	}, [hasUploadedDoc, gate]);
+
+	const handleFileChange = useCallback(
+		async (e: React.ChangeEvent<HTMLInputElement>) => {
+			const file = e.target.files?.[0];
+			if (!file) return;
+			e.target.value = "";
+
+			const ext = `.${file.name.split(".").pop()?.toLowerCase()}`;
+			if (!ANON_ALLOWED_EXTENSIONS.has(ext)) {
+				gate("upload PDFs, Word documents, images, and more");
+				return;
+			}
+
+			try {
+				const result = await anonymousChatApiService.uploadDocument(file);
+				if (!result.ok) {
+					if (result.reason === "quota_exceeded") gate("upload more documents");
+					return;
+				}
+				const data = result.data;
+				if (anonMode.isAnonymous) {
+					anonMode.setUploadedDoc({
+						filename: data.filename,
+						sizeBytes: data.size_bytes,
+					});
+				}
+				toast.success(`Uploaded "${data.filename}"`);
+			} catch (err) {
+				console.error("Upload failed:", err);
+				toast.error(err instanceof Error ? err.message : "Upload failed");
+			}
+		},
+		[gate, anonMode]
+	);
+
+	return (
+		<ComposerPrimitive.Root className="aui-composer-root relative mx-auto flex w-full max-w-(--thread-max-width) flex-col rounded-2xl border border-border/40 bg-background shadow-xs transition-shadow focus-within:shadow-md dark:bg-neutral-900">
+			{hasUploadedDoc && anonMode.isAnonymous && (
+				<div className="flex items-center gap-2 px-3 pt-2">
+					<Paperclip className="size-3.5 text-muted-foreground" />
+					<span className="text-xs text-muted-foreground truncate">
+						{anonMode.uploadedDoc?.filename}
+					</span>
+					<span className="text-xs text-muted-foreground/60">(1/1)</span>
+				</div>
+			)}
+
+			<textarea
+				placeholder="Ask anything..."
+				value={text}
+				onChange={handleTextChange}
+				onKeyDown={handleKeyDown}
+				rows={1}
+				className={cn(
+					"w-full resize-none bg-transparent px-4 pt-3 pb-0 text-sm",
+					"placeholder:text-muted-foreground focus:outline-none",
+					"min-h-[44px] max-h-[200px]"
+				)}
+				style={{ fieldSizing: "content" } as React.CSSProperties}
+			/>
+
+			<div className="flex items-center justify-between gap-2 px-3 pb-2 pt-1">
+				<div className="flex items-center gap-2">
+					<input
+						ref={fileInputRef}
+						type="file"
+						accept={ACCEPT_EXTENSIONS}
+						className="hidden"
+						onChange={handleFileChange}
+					/>
+					<Tooltip>
+						<TooltipTrigger asChild>
+							<Button
+								type="button"
+								variant="ghost"
+								onClick={handleUploadClick}
+								className={cn(
+									"h-auto gap-1.5 rounded-md px-2 py-1 text-xs transition-colors",
+									"text-muted-foreground hover:text-accent-foreground hover:bg-accent",
+									hasUploadedDoc && "text-primary"
+								)}
+							>
+								<Paperclip className="size-3.5" />
+								{hasUploadedDoc ? "1/1" : "Upload"}
+							</Button>
+						</TooltipTrigger>
+						<TooltipContent>
+							{hasUploadedDoc
+								? "Document limit reached. Create an account for more."
+								: "Upload a document (text files only)"}
+						</TooltipContent>
+					</Tooltip>
+
+					<div className="h-4 w-px bg-border/60" />
+
+					<Tooltip>
+						<TooltipTrigger asChild>
+							<label
+								htmlFor="free-web-search-toggle"
+								className="flex items-center gap-1.5 cursor-pointer select-none rounded-md px-2 py-1 text-xs text-muted-foreground hover:text-accent-foreground hover:bg-accent transition-colors"
+							>
+								<Globe className="size-3.5" />
+								<span className="hidden sm:inline">Web</span>
+								<Switch
+									id="free-web-search-toggle"
+									checked={webSearchEnabled}
+									onCheckedChange={setWebSearchEnabled}
+									className="scale-75"
+								/>
+							</label>
+						</TooltipTrigger>
+						<TooltipContent>Toggle web search</TooltipContent>
+					</Tooltip>
+				</div>
+
+				<div className="flex items-center gap-1">
+					{!isRunning ? (
+						<ComposerPrimitive.Send asChild>
+							<TooltipIconButton tooltip="Send" variant="default" className="size-8 rounded-full">
+								<ArrowUpIcon />
+							</TooltipIconButton>
+						</ComposerPrimitive.Send>
+					) : (
+						<ComposerPrimitive.Cancel asChild>
+							<TooltipIconButton
+								tooltip="Cancel"
+								variant="destructive"
+								className="size-8 rounded-full"
+							>
+								<SquareIcon className="size-3.5" />
+							</TooltipIconButton>
+						</ComposerPrimitive.Cancel>
+					)}
+				</div>
+			</div>
+		</ComposerPrimitive.Root>
+	);
+};
diff --git a/surfsense_web/components/free-chat/free-model-selector.tsx b/surfsense_web/components/free-chat/free-model-selector.tsx
new file mode 100644
index 0000000000..b7ae60bd33
--- /dev/null
+++ b/surfsense_web/components/free-chat/free-model-selector.tsx
@@ -0,0 +1,208 @@
+"use client";
+
+import { Bot, Check, ChevronDown, Search } from "lucide-react";
+import { useRouter } from "next/navigation";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { useAnonymousMode } from "@/contexts/anonymous-mode";
+import type { AnonModel } from "@/contracts/types/anonymous-chat.types";
+import { anonymousChatApiService } from "@/lib/apis/anonymous-chat-api.service";
+import { getProviderIcon } from "@/lib/provider-icons";
+import { cn } from "@/lib/utils";
+
+export function FreeModelSelector({ className }: { className?: string }) {
+	const router = useRouter();
+	const anonMode = useAnonymousMode();
+	const currentSlug = anonMode.isAnonymous ? anonMode.modelSlug : "";
+
+	const [open, setOpen] = useState(false);
+	const [models, setModels] = useState<AnonModel[]>([]);
+	const [searchQuery, setSearchQuery] = useState("");
+	const [focusedIndex, setFocusedIndex] = useState(-1);
+	const searchInputRef = useRef<HTMLInputElement>(null);
+
+	useEffect(() => {
+		anonymousChatApiService.getModels().then(setModels).catch(console.error);
+	}, []);
+
+	const handleOpenChange = useCallback((next: boolean) => {
+		if (next) {
+			setSearchQuery("");
+			setFocusedIndex(-1);
+			requestAnimationFrame(() => searchInputRef.current?.focus());
+		}
+		setOpen(next);
+	}, []);
+
+	const currentModel = useMemo(
+		() => models.find((m) => m.seo_slug === currentSlug) ?? null,
+		[models, currentSlug]
+	);
+
+	const sortedModels = useMemo(
+		() => [...models].sort((a, b) => Number(a.is_premium) - Number(b.is_premium)),
+		[models]
+	);
+
+	const filteredModels = useMemo(() => {
+		if (!searchQuery.trim()) return sortedModels;
+		const q = searchQuery.toLowerCase();
+		return sortedModels.filter(
+			(m) =>
+				m.name.toLowerCase().includes(q) ||
+				m.model_name.toLowerCase().includes(q) ||
+				m.provider.toLowerCase().includes(q)
+		);
+	}, [sortedModels, searchQuery]);
+
+	const handleSelect = useCallback(
+		(model: AnonModel) => {
+			setOpen(false);
+			if (model.seo_slug === currentSlug) return;
+			if (anonMode.isAnonymous) {
+				anonMode.setModelSlug(model.seo_slug ?? "");
+				anonMode.resetChat();
+			}
+			router.replace(`/free/${model.seo_slug}`);
+		},
+		[currentSlug, anonMode, router]
+	);
+
+	const handleKeyDown = useCallback(
+		(e: React.KeyboardEvent<HTMLInputElement>) => {
+			const count = filteredModels.length;
+			if (count === 0) return;
+			switch (e.key) {
+				case "ArrowDown":
+					e.preventDefault();
+					setFocusedIndex((p) => (p < count - 1 ? p + 1 : 0));
+					break;
+				case "ArrowUp":
+					e.preventDefault();
+					setFocusedIndex((p) => (p > 0 ? p - 1 : count - 1));
+					break;
+				case "Enter":
+					e.preventDefault();
+					if (focusedIndex >= 0 && focusedIndex < count) {
+						handleSelect(filteredModels[focusedIndex]);
+					}
+					break;
+			}
+		},
+		[filteredModels, focusedIndex, handleSelect]
+	);
+
+	return (
+		<Popover open={open} onOpenChange={handleOpenChange}>
+			<PopoverTrigger asChild>
+				<Button
+					variant="ghost"
+					size="sm"
+					role="combobox"
+					aria-expanded={open}
+					className={cn(
+						"h-8 gap-2 px-3 text-sm bg-muted hover:bg-muted/80 border-0 select-none",
+						className
+					)}
+				>
+					{currentModel ? (
+						<>
+							{getProviderIcon(currentModel.provider, { className: "size-4" })}
+							<span className="max-w-[160px] truncate">{currentModel.name}</span>
+						</>
+					) : (
+						<>
+							<Bot className="size-4 text-muted-foreground" />
+							<span className="text-muted-foreground">Select Model</span>
+						</>
+					)}
+					<ChevronDown className="h-3.5 w-3.5 text-muted-foreground ml-1 shrink-0" />
+				</Button>
+			</PopoverTrigger>
+			<PopoverContent
+				className="w-[320px] p-0 rounded-lg shadow-lg overflow-hidden select-none"
+				align="start"
+				sideOffset={8}
+				onCloseAutoFocus={(e) => e.preventDefault()}
+			>
+				<div className="relative">
+					<Search className="absolute left-3 top-1/2 -translate-y-1/2 size-3.5 text-muted-foreground pointer-events-none" />
+					<input
+						ref={searchInputRef}
+						placeholder="Search models"
+						value={searchQuery}
+						onChange={(e) => setSearchQuery(e.target.value)}
+						onKeyDown={handleKeyDown}
+						className="w-full pl-8 pr-3 py-2.5 text-sm bg-transparent focus:outline-none placeholder:text-muted-foreground"
+					/>
+				</div>
+				<div className="overflow-y-auto max-h-[320px] py-1 space-y-0.5">
+					{filteredModels.length === 0 ? (
+						<div className="flex flex-col items-center justify-center gap-2 py-8 px-4">
+							<Search className="size-6 text-muted-foreground" />
+							<p className="text-sm text-muted-foreground">No models found</p>
+						</div>
+					) : (
+						filteredModels.map((model, index) => {
+							const isSelected = model.seo_slug === currentSlug;
+							const isFocused = focusedIndex === index;
+							return (
+								<div
+									key={model.id}
+									role="option"
+									tabIndex={0}
+									aria-selected={isSelected}
+									onClick={() => handleSelect(model)}
+									onKeyDown={(e) => {
+										if (e.key === "Enter" || e.key === " ") {
+											e.preventDefault();
+											handleSelect(model);
+										}
+									}}
+									onMouseEnter={() => setFocusedIndex(index)}
+									className={cn(
+										"group flex items-center gap-2.5 px-3 py-2 rounded-xl cursor-pointer",
+										"transition-colors duration-150 mx-2",
+										"hover:bg-accent hover:text-accent-foreground",
+										isFocused && "bg-accent text-accent-foreground",
+										isSelected && "bg-accent text-accent-foreground"
+									)}
+								>
+									<div className="shrink-0">
+										{getProviderIcon(model.provider, { className: "size-5" })}
+									</div>
+									<div className="flex-1 min-w-0">
+										<div className="flex items-center gap-1.5">
+											<span className="font-medium text-sm truncate">{model.name}</span>
+											{model.is_premium ? (
+												<Badge
+													variant="secondary"
+													className="text-[9px] px-1 py-0 h-3.5 bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300 border-0"
+												>
+													Premium
+												</Badge>
+											) : (
+												<Badge
+													variant="secondary"
+													className="text-[9px] px-1 py-0 h-3.5 bg-emerald-100 text-emerald-700 dark:bg-emerald-900/50 dark:text-emerald-300 border-0"
+												>
+													Free
+												</Badge>
+											)}
+										</div>
+										<span className="text-xs text-muted-foreground truncate block">
+											{model.model_name}
+										</span>
+									</div>
+									{isSelected && <Check className="size-4 text-primary shrink-0" />}
+								</div>
+							);
+						})
+					)}
+				</div>
+			</PopoverContent>
+		</Popover>
+	);
+}
diff --git a/surfsense_web/components/free-chat/free-right-panel.tsx b/surfsense_web/components/free-chat/free-right-panel.tsx
new file mode 100644
index 0000000000..f2b07815fa
--- /dev/null
+++ b/surfsense_web/components/free-chat/free-right-panel.tsx
@@ -0,0 +1,45 @@
+"use client";
+
+import { Lock } from "lucide-react";
+import Link from "next/link";
+import type { FC } from "react";
+import { Button } from "@/components/ui/button";
+
+interface GatedTabProps {
+	title: string;
+	description: string;
+}
+
+const GatedTab: FC<GatedTabProps> = ({ title, description }) => (
+	<div className="flex flex-col items-center justify-center gap-3 p-8 text-center">
+		<div className="rounded-full bg-muted p-3">
+			<Lock className="size-5 text-muted-foreground" />
+		</div>
+		<h3 className="text-sm font-medium">{title}</h3>
+		<p className="text-xs text-muted-foreground max-w-[200px]">{description}</p>
+		<Button size="sm" asChild>
+			<Link href="/register">Create Free Account</Link>
+		</Button>
+	</div>
+);
+
+export const ReportsGatedPlaceholder: FC = () => (
+	<GatedTab
+		title="Generate Reports"
+		description="Create a free account to generate structured reports from your conversations."
+	/>
+);
+
+export const EditorGatedPlaceholder: FC = () => (
+	<GatedTab
+		title="Document Editor"
+		description="Create a free account to use the AI-powered document editor."
+	/>
+);
+
+export const HitlGatedPlaceholder: FC = () => (
+	<GatedTab
+		title="Human-in-the-Loop Editing"
+		description="Create a free account to collaborate with AI on document edits."
+	/>
+);
diff --git a/surfsense_web/components/free-chat/free-thread.tsx b/surfsense_web/components/free-chat/free-thread.tsx
new file mode 100644
index 0000000000..933847b2b3
--- /dev/null
+++ b/surfsense_web/components/free-chat/free-thread.tsx
@@ -0,0 +1,55 @@
+"use client";
+
+import { AuiIf, ThreadPrimitive } from "@assistant-ui/react";
+import type { FC } from "react";
+import { AssistantMessage } from "@/components/assistant-ui/assistant-message";
+import { ChatViewport } from "@/components/assistant-ui/chat-viewport";
+import { EditComposer } from "@/components/assistant-ui/edit-composer";
+import { UserMessage } from "@/components/assistant-ui/user-message";
+import { FreeComposer } from "./free-composer";
+
+const FreeThreadWelcome: FC = () => {
+	return (
+		<div className="aui-thread-welcome-root mx-auto flex w-full max-w-(--thread-max-width) grow flex-col items-center px-4 relative">
+			<div className="aui-thread-welcome-message absolute bottom-[calc(50%+5rem)] left-0 right-0 flex flex-col items-center text-center">
+				<h1 className="aui-thread-welcome-message-inner text-3xl md:text-5xl select-none">
+					What can I help with?
+				</h1>
+			</div>
+			<div className="w-full flex items-start justify-center absolute top-[calc(50%-3.5rem)] left-0 right-0">
+				<FreeComposer />
+			</div>
+		</div>
+	);
+};
+
+export const FreeThread: FC = () => {
+	return (
+		<ThreadPrimitive.Root
+			className="aui-root aui-thread-root @container flex h-full min-h-0 flex-col bg-main-panel"
+			style={{
+				["--thread-max-width" as string]: "44rem",
+			}}
+		>
+			<ChatViewport
+				footer={
+					<AuiIf condition={({ thread }) => !thread.isEmpty}>
+						<FreeComposer />
+					</AuiIf>
+				}
+			>
+				<AuiIf condition={({ thread }) => thread.isEmpty}>
+					<FreeThreadWelcome />
+				</AuiIf>
+
+				<ThreadPrimitive.Messages
+					components={{
+						UserMessage,
+						EditComposer,
+						AssistantMessage,
+					}}
+				/>
+			</ChatViewport>
+		</ThreadPrimitive.Root>
+	);
+};
diff --git a/surfsense_web/components/free-chat/quota-bar.tsx b/surfsense_web/components/free-chat/quota-bar.tsx
new file mode 100644
index 0000000000..0693ef5398
--- /dev/null
+++ b/surfsense_web/components/free-chat/quota-bar.tsx
@@ -0,0 +1,57 @@
+"use client";
+
+import { OctagonAlert, Orbit } from "lucide-react";
+import Link from "next/link";
+import { Progress } from "@/components/ui/progress";
+import { cn } from "@/lib/utils";
+
+interface QuotaBarProps {
+	used: number;
+	limit: number;
+	warningThreshold: number;
+	className?: string;
+}
+
+export function QuotaBar({ used, limit, warningThreshold, className }: QuotaBarProps) {
+	const percentage = Math.min((used / limit) * 100, 100);
+	const remaining = Math.max(limit - used, 0);
+	const isWarning = used >= warningThreshold;
+	const isExceeded = used >= limit;
+
+	return (
+		<div className={cn("space-y-1.5", className)}>
+			<div className="flex justify-between items-center text-xs">
+				<span className="text-muted-foreground">
+					{used.toLocaleString()} / {limit.toLocaleString()} tokens
+				</span>
+				{isExceeded ? (
+					<span className="font-medium text-red-500">Limit reached</span>
+				) : isWarning ? (
+					<span className="font-medium text-amber-500 flex items-center gap-1">
+						<OctagonAlert className="h-3 w-3" />
+						{remaining.toLocaleString()} remaining
+					</span>
+				) : (
+					<span className="font-medium">{percentage.toFixed(0)}%</span>
+				)}
+			</div>
+			<Progress
+				value={percentage}
+				className={cn(
+					"h-1.5",
+					isExceeded && "[&>div]:bg-red-500",
+					isWarning && !isExceeded && "[&>div]:bg-amber-500"
+				)}
+			/>
+			{isExceeded && (
+				<Link
+					href="/register"
+					className="flex items-center justify-center gap-1.5 rounded-md bg-linear-to-r from-purple-600 to-blue-600 px-3 py-1.5 text-xs font-medium text-white transition-opacity hover:opacity-90"
+				>
+					<Orbit className="h-3 w-3" />
+					Create free account for 5M more tokens
+				</Link>
+			)}
+		</div>
+	);
+}
diff --git a/surfsense_web/components/free-chat/quota-warning-banner.tsx b/surfsense_web/components/free-chat/quota-warning-banner.tsx
new file mode 100644
index 0000000000..828e8006ee
--- /dev/null
+++ b/surfsense_web/components/free-chat/quota-warning-banner.tsx
@@ -0,0 +1,87 @@
+"use client";
+
+import { OctagonAlert, Orbit, X } from "lucide-react";
+import Link from "next/link";
+import { useState } from "react";
+import { Button } from "@/components/ui/button";
+import { cn } from "@/lib/utils";
+
+interface QuotaWarningBannerProps {
+	used: number;
+	limit: number;
+	warningThreshold: number;
+	className?: string;
+}
+
+export function QuotaWarningBanner({
+	used,
+	limit,
+	warningThreshold,
+	className,
+}: QuotaWarningBannerProps) {
+	const [dismissed, setDismissed] = useState(false);
+	const isWarning = used >= warningThreshold && used < limit;
+	const isExceeded = used >= limit;
+
+	if (dismissed || (!isWarning && !isExceeded)) return null;
+
+	if (isExceeded) {
+		return (
+			<div
+				className={cn(
+					"rounded-lg border border-red-200 bg-red-50 dark:border-red-800 dark:bg-red-950/50 p-4",
+					className
+				)}
+			>
+				<div className="flex items-start gap-3">
+					<OctagonAlert className="h-5 w-5 text-red-500 shrink-0 mt-0.5" />
+					<div className="flex-1 space-y-2">
+						<p className="text-sm font-medium text-red-800 dark:text-red-200">
+							Free token limit reached
+						</p>
+						<p className="text-xs text-red-600 dark:text-red-300">
+							You&apos;ve used all {limit.toLocaleString()} free tokens. Create a free account to
+							get $5 of premium credit and access to all models.
+						</p>
+						<Link
+							href="/register"
+							className="inline-flex items-center gap-1.5 rounded-md bg-linear-to-r from-purple-600 to-blue-600 px-4 py-2 text-sm font-medium text-white transition-opacity hover:opacity-90"
+						>
+							<Orbit className="h-4 w-4" />
+							Create Free Account
+						</Link>
+					</div>
+				</div>
+			</div>
+		);
+	}
+
+	return (
+		<div
+			className={cn(
+				"rounded-lg border border-amber-200 bg-amber-50 dark:border-amber-800 dark:bg-amber-950/50 p-3",
+				className
+			)}
+		>
+			<div className="flex items-center gap-3">
+				<OctagonAlert className="h-4 w-4 text-amber-500 shrink-0" />
+				<p className="flex-1 text-xs text-amber-700 dark:text-amber-300">
+					You&apos;ve used {used.toLocaleString()} of {limit.toLocaleString()} free tokens.{" "}
+					<Link href="/register" className="font-medium underline hover:no-underline">
+						Create an account
+					</Link>{" "}
+					for $5 of premium credit.
+				</p>
+				<Button
+					type="button"
+					variant="ghost"
+					size="icon"
+					onClick={() => setDismissed(true)}
+					className="size-6 text-amber-400 hover:bg-transparent hover:text-amber-600 dark:hover:text-amber-200"
+				>
+					<X className="h-4 w-4" />
+				</Button>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/components/hitl-edit-panel/hitl-edit-panel.tsx b/surfsense_web/components/hitl-edit-panel/hitl-edit-panel.tsx
deleted file mode 100644
index bd36431e95..0000000000
--- a/surfsense_web/components/hitl-edit-panel/hitl-edit-panel.tsx
+++ /dev/null
@@ -1,406 +0,0 @@
-"use client";
-
-import { format } from "date-fns";
-import { TagInput, type Tag as TagType } from "emblor";
-import { useAtomValue, useSetAtom } from "jotai";
-import { CalendarIcon, XIcon } from "lucide-react";
-import dynamic from "next/dynamic";
-import { useCallback, useEffect, useMemo, useRef, useState } from "react";
-import type { ExtraField } from "@/atoms/chat/hitl-edit-panel.atom";
-import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
-import { Button } from "@/components/ui/button";
-import { Calendar } from "@/components/ui/calendar";
-import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
-import { Input } from "@/components/ui/input";
-import { Label } from "@/components/ui/label";
-import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
-import { Skeleton } from "@/components/ui/skeleton";
-import { Textarea } from "@/components/ui/textarea";
-import { useMediaQuery } from "@/hooks/use-media-query";
-
-const PlateEditor = dynamic(
-	() => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })),
-	{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
-);
-
-function parseEmailsToTags(value: string): TagType[] {
-	if (!value.trim()) return [];
-	return value
-		.split(",")
-		.map((s) => s.trim())
-		.filter(Boolean)
-		.map((email, i) => ({ id: `${Date.now()}-${i}`, text: email }));
-}
-
-function tagsToEmailString(tags: TagType[]): string {
-	return tags.map((t) => t.text).join(", ");
-}
-
-function EmailsTagField({
-	id,
-	value,
-	onChange,
-	placeholder,
-}: {
-	id: string;
-	value: string;
-	onChange: (value: string) => void;
-	placeholder?: string;
-}) {
-	const [tags, setTags] = useState<TagType[]>(() => parseEmailsToTags(value));
-	const [activeTagIndex, setActiveTagIndex] = useState<number | null>(null);
-	const isInitialMount = useRef(true);
-	const onChangeRef = useRef(onChange);
-	onChangeRef.current = onChange;
-
-	useEffect(() => {
-		if (isInitialMount.current) {
-			isInitialMount.current = false;
-			return;
-		}
-		onChangeRef.current(tagsToEmailString(tags));
-	}, [tags]);
-
-	const handleSetTags = useCallback((newTags: TagType[] | ((prev: TagType[]) => TagType[])) => {
-		setTags((prev) => (typeof newTags === "function" ? newTags(prev) : newTags));
-	}, []);
-
-	const handleAddTag = useCallback(
-		(text: string) => {
-			const trimmed = text.trim();
-			if (!trimmed) return;
-			if (tags.some((tag) => tag.text === trimmed)) return;
-			const newTag: TagType = { id: Date.now().toString(), text: trimmed };
-			setTags((prev) => [...prev, newTag]);
-		},
-		[tags]
-	);
-
-	return (
-		<TagInput
-			id={id}
-			tags={tags}
-			setTags={handleSetTags}
-			placeholder={placeholder ?? "Add email"}
-			onAddTag={handleAddTag}
-			styleClasses={{
-				inlineTagsContainer:
-					"border border-input rounded-md bg-transparent shadow-xs transition-[color,box-shadow] outline-none focus-within:border-ring p-1 gap-1",
-				input:
-					"w-full min-w-[80px] focus-visible:outline-none shadow-none px-2 h-7 text-foreground placeholder:text-muted-foreground bg-transparent text-sm md:text-sm",
-				tag: {
-					body: "h-7 relative bg-accent dark:bg-muted/60 border-0 hover:bg-accent/80 dark:hover:bg-muted rounded-md font-medium text-xs text-foreground/80 ps-2 pe-7 flex",
-					closeButton:
-						"absolute -inset-y-px -end-px p-0 rounded-e-md flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-foreground hover:text-foreground",
-				},
-			}}
-			activeTagIndex={activeTagIndex}
-			setActiveTagIndex={setActiveTagIndex}
-		/>
-	);
-}
-
-function parseDateTimeValue(value: string): { date: Date | undefined; time: string } {
-	if (!value) return { date: undefined, time: "09:00" };
-	try {
-		const d = new Date(value);
-		if (Number.isNaN(d.getTime())) return { date: undefined, time: "09:00" };
-		return {
-			date: d,
-			time: format(d, "HH:mm"),
-		};
-	} catch {
-		return { date: undefined, time: "09:00" };
-	}
-}
-
-function buildLocalDateTimeString(date: Date | undefined, time: string): string {
-	if (!date) return "";
-	const [hours, minutes] = time.split(":").map(Number);
-	const combined = new Date(date);
-	combined.setHours(hours ?? 9, minutes ?? 0, 0, 0);
-	const y = combined.getFullYear();
-	const m = String(combined.getMonth() + 1).padStart(2, "0");
-	const d = String(combined.getDate()).padStart(2, "0");
-	const h = String(combined.getHours()).padStart(2, "0");
-	const min = String(combined.getMinutes()).padStart(2, "0");
-	return `${y}-${m}-${d}T${h}:${min}:00`;
-}
-
-function DateTimePickerField({
-	id,
-	value,
-	onChange,
-}: {
-	id: string;
-	value: string;
-	onChange: (value: string) => void;
-}) {
-	const parsed = useMemo(() => parseDateTimeValue(value), [value]);
-	const [selectedDate, setSelectedDate] = useState<Date | undefined>(parsed.date);
-	const [time, setTime] = useState(parsed.time);
-	const [open, setOpen] = useState(false);
-
-	const handleDateSelect = useCallback(
-		(day: Date | undefined) => {
-			setSelectedDate(day);
-			onChange(buildLocalDateTimeString(day, time));
-			setOpen(false);
-		},
-		[time, onChange]
-	);
-
-	const handleTimeChange = useCallback(
-		(e: React.ChangeEvent<HTMLInputElement>) => {
-			const newTime = e.target.value;
-			setTime(newTime);
-			onChange(buildLocalDateTimeString(selectedDate, newTime));
-		},
-		[selectedDate, onChange]
-	);
-
-	const displayLabel = selectedDate
-		? `${format(selectedDate, "MMM d, yyyy")} at ${time}`
-		: "Pick date & time";
-
-	return (
-		<div className="flex gap-2">
-			<Popover open={open} onOpenChange={setOpen}>
-				<PopoverTrigger asChild>
-					<button
-						id={id}
-						type="button"
-						className="flex-1 flex items-center gap-2 h-9 rounded-md border border-input bg-transparent px-3 py-1 text-sm shadow-xs transition-[color,box-shadow] outline-none focus-visible:border-ring"
-					>
-						<CalendarIcon className="size-3.5 text-muted-foreground shrink-0" />
-						<span className={selectedDate ? "text-foreground" : "text-muted-foreground"}>
-							{displayLabel}
-						</span>
-					</button>
-				</PopoverTrigger>
-				<PopoverContent className="w-auto p-0" align="start">
-					<Calendar
-						mode="single"
-						selected={selectedDate}
-						onSelect={handleDateSelect}
-						defaultMonth={selectedDate}
-					/>
-				</PopoverContent>
-			</Popover>
-			<Input
-				type="time"
-				value={time}
-				onChange={handleTimeChange}
-				className="w-[120px] text-sm shrink-0 appearance-none [&::-webkit-calendar-picker-indicator]:hidden [&::-webkit-calendar-picker-indicator]:appearance-none"
-			/>
-		</div>
-	);
-}
-
-export function HitlEditPanelContent({
-	title: initialTitle,
-	content: initialContent,
-	contentFormat,
-	extraFields,
-	onSave,
-	onClose,
-	showCloseButton = true,
-}: {
-	title: string;
-	content: string;
-	toolName: string;
-	contentFormat?: "markdown" | "html";
-	extraFields?: ExtraField[];
-	onSave: (title: string, content: string, extraFieldValues?: Record<string, string>) => void;
-	onClose?: () => void;
-	showCloseButton?: boolean;
-}) {
-	const [editedTitle, setEditedTitle] = useState(initialTitle);
-	const contentRef = useRef(initialContent);
-	const [isSaving, setIsSaving] = useState(false);
-	const [extraFieldValues, setExtraFieldValues] = useState<Record<string, string>>(() => {
-		if (!extraFields) return {};
-		const initial: Record<string, string> = {};
-		for (const field of extraFields) {
-			initial[field.key] = field.value;
-		}
-		return initial;
-	});
-
-	const handleContentChange = useCallback((content: string) => {
-		contentRef.current = content;
-	}, []);
-
-	const handleExtraFieldChange = useCallback((key: string, value: string) => {
-		setExtraFieldValues((prev) => ({ ...prev, [key]: value }));
-	}, []);
-
-	const handleSave = useCallback(() => {
-		if (!editedTitle.trim()) return;
-		setIsSaving(true);
-		const extras = extraFields && extraFields.length > 0 ? extraFieldValues : undefined;
-		onSave(editedTitle, contentRef.current, extras);
-		onClose?.();
-	}, [editedTitle, onSave, onClose, extraFields, extraFieldValues]);
-
-	return (
-		<>
-			<div className="flex items-center gap-2 px-4 py-2 shrink-0 border-b">
-				<input
-					value={editedTitle}
-					onChange={(e) => setEditedTitle(e.target.value)}
-					placeholder="Untitled"
-					className="flex-1 min-w-0 bg-transparent text-sm font-semibold text-foreground outline-none placeholder:text-muted-foreground"
-					aria-label="Page title"
-				/>
-				{onClose && showCloseButton && (
-					<Button variant="ghost" size="icon" onClick={onClose} className="size-7 shrink-0">
-						<XIcon className="size-4" />
-						<span className="sr-only">Close panel</span>
-					</Button>
-				)}
-			</div>
-
-			{extraFields && extraFields.length > 0 && (
-				<div className="flex flex-col gap-3 px-4 py-3 border-b">
-					{extraFields.map((field) => (
-						<div key={field.key} className="flex flex-col gap-1.5">
-							<Label
-								htmlFor={`extra-field-${field.key}`}
-								className="text-xs font-medium text-muted-foreground"
-							>
-								{field.label}
-							</Label>
-							{field.type === "emails" ? (
-								<EmailsTagField
-									id={`extra-field-${field.key}`}
-									value={extraFieldValues[field.key] ?? ""}
-									onChange={(v) => handleExtraFieldChange(field.key, v)}
-									placeholder={`Add ${field.label.toLowerCase()}`}
-								/>
-							) : field.type === "datetime-local" ? (
-								<DateTimePickerField
-									id={`extra-field-${field.key}`}
-									value={extraFieldValues[field.key] ?? ""}
-									onChange={(v) => handleExtraFieldChange(field.key, v)}
-								/>
-							) : field.type === "textarea" ? (
-								<Textarea
-									id={`extra-field-${field.key}`}
-									value={extraFieldValues[field.key] ?? ""}
-									onChange={(e) => handleExtraFieldChange(field.key, e.target.value)}
-									className="text-sm min-h-[60px]"
-								/>
-							) : (
-								<Input
-									id={`extra-field-${field.key}`}
-									type={field.type}
-									value={extraFieldValues[field.key] ?? ""}
-									onChange={(e) => handleExtraFieldChange(field.key, e.target.value)}
-									className="text-sm"
-								/>
-							)}
-						</div>
-					))}
-				</div>
-			)}
-
-			<div className="flex-1 overflow-hidden">
-				<PlateEditor
-					{...(contentFormat === "html"
-						? { html: initialContent, onHtmlChange: handleContentChange }
-						: { markdown: initialContent, onMarkdownChange: handleContentChange })}
-					readOnly={false}
-					preset="full"
-					placeholder="Start writing..."
-					editorVariant="default"
-					defaultEditing
-					onSave={handleSave}
-					hasUnsavedChanges
-					isSaving={isSaving}
-					className="[&_[role=toolbar]]:!bg-sidebar"
-				/>
-			</div>
-		</>
-	);
-}
-
-function DesktopHitlEditPanel() {
-	const panelState = useAtomValue(hitlEditPanelAtom);
-	const closePanel = useSetAtom(closeHitlEditPanelAtom);
-
-	if (!panelState.isOpen || !panelState.onSave) return null;
-
-	return (
-		<div className="flex w-[50%] max-w-[700px] min-w-[380px] flex-col border-l bg-sidebar text-sidebar-foreground animate-in slide-in-from-right-4 duration-300 ease-out">
-			<HitlEditPanelContent
-				title={panelState.title}
-				content={panelState.content}
-				toolName={panelState.toolName}
-				contentFormat={panelState.contentFormat}
-				extraFields={panelState.extraFields}
-				onSave={panelState.onSave}
-				onClose={closePanel}
-			/>
-		</div>
-	);
-}
-
-function MobileHitlEditDrawer() {
-	const panelState = useAtomValue(hitlEditPanelAtom);
-	const closePanel = useSetAtom(closeHitlEditPanelAtom);
-
-	if (!panelState.onSave) return null;
-
-	return (
-		<Drawer
-			open={panelState.isOpen}
-			onOpenChange={(open) => {
-				if (!open) closePanel();
-			}}
-			shouldScaleBackground={false}
-		>
-			<DrawerContent
-				className="h-[90vh] max-h-[90vh] z-80 bg-sidebar overflow-hidden"
-				overlayClassName="z-80"
-			>
-				<DrawerHandle />
-				<DrawerTitle className="sr-only">Edit {panelState.toolName}</DrawerTitle>
-				<div className="min-h-0 flex-1 flex flex-col overflow-hidden">
-					<HitlEditPanelContent
-						title={panelState.title}
-						content={panelState.content}
-						toolName={panelState.toolName}
-						contentFormat={panelState.contentFormat}
-						extraFields={panelState.extraFields}
-						onSave={panelState.onSave}
-						onClose={closePanel}
-						showCloseButton={false}
-					/>
-				</div>
-			</DrawerContent>
-		</Drawer>
-	);
-}
-
-export function HitlEditPanel() {
-	const panelState = useAtomValue(hitlEditPanelAtom);
-	const isDesktop = useMediaQuery("(min-width: 1024px)");
-
-	if (!panelState.isOpen) return null;
-
-	if (isDesktop) {
-		return <DesktopHitlEditPanel />;
-	}
-
-	return <MobileHitlEditDrawer />;
-}
-
-export function MobileHitlEditPanel() {
-	const panelState = useAtomValue(hitlEditPanelAtom);
-	const isDesktop = useMediaQuery("(min-width: 1024px)");
-
-	if (isDesktop || !panelState.isOpen) return null;
-
-	return <MobileHitlEditDrawer />;
-}
diff --git a/surfsense_web/components/homepage/auth-redirect.tsx b/surfsense_web/components/homepage/auth-redirect.tsx
new file mode 100644
index 0000000000..6697ab744f
--- /dev/null
+++ b/surfsense_web/components/homepage/auth-redirect.tsx
@@ -0,0 +1,17 @@
+"use client";
+
+import { useRouter } from "next/navigation";
+import { useEffect } from "react";
+import { getBearerToken } from "@/lib/auth-utils";
+
+export function AuthRedirect() {
+	const router = useRouter();
+
+	useEffect(() => {
+		if (getBearerToken()) {
+			router.replace("/dashboard");
+		}
+	}, [router]);
+
+	return null;
+}
diff --git a/surfsense_web/components/homepage/cta.tsx b/surfsense_web/components/homepage/cta.tsx
index a09d7354c2..7e21a7fe20 100644
--- a/surfsense_web/components/homepage/cta.tsx
+++ b/surfsense_web/components/homepage/cta.tsx
@@ -2,6 +2,7 @@
 import { IconMessageCircleQuestion } from "@tabler/icons-react";
 import Link from "next/link";
 import type React from "react";
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 export function CTAHomepage() {
@@ -22,15 +23,16 @@ export function CTAHomepage() {
 				</p>
 
 				<div className="flex items-start sm:items-center flex-col sm:flex-row sm:gap-4">
-					<Link href="/contact">
-						<button
-							type="button"
-							className="mt-8 flex space-x-2 items-center group text-base px-4 py-2 rounded-lg  text-black dark:text-white border border-neutral-200 dark:border-neutral-800 shadow-[0px_2px_0px_0px_rgba(255,255,255,0.3)_inset]"
-						>
+					<Button
+						asChild
+						variant="ghost"
+						className="mt-8 h-auto gap-2 rounded-lg border border-neutral-200 px-4 py-2 text-base text-black shadow-[0px_2px_0px_0px_rgba(255,255,255,0.3)_inset] dark:border-neutral-800 dark:text-white"
+					>
+						<Link href="/contact" className="group">
 							<span>Talk to us</span>
 							<IconMessageCircleQuestion className="text-black dark:text-white group-hover:translate-x-1 stroke-[1px] h-3 w-3 mt-0.5 transition-transform duration-200" />
-						</button>
-					</Link>
+						</Link>
+					</Button>
 				</div>
 			</div>
 			{/* <div className="border-t md:border-t-0 md:border-l border-dashed p-8 md:p-14">
diff --git a/surfsense_web/components/homepage/features-bento-grid.tsx b/surfsense_web/components/homepage/features-bento-grid.tsx
index 32cbe2582a..7406223de5 100644
--- a/surfsense_web/components/homepage/features-bento-grid.tsx
+++ b/surfsense_web/components/homepage/features-bento-grid.tsx
@@ -1,4 +1,10 @@
-import { IconMessage, IconMicrophone, IconSearch, IconUsers } from "@tabler/icons-react";
+import {
+	IconBinaryTree,
+	IconMessage,
+	IconMicrophone,
+	IconSearch,
+	IconUsers,
+} from "@tabler/icons-react";
 import Image from "next/image";
 import React from "react";
 import { BentoGrid, BentoGridItem } from "@/components/ui/bento-grid";
@@ -414,6 +420,295 @@ const AudioCommentIllustration = () => (
 	</div>
 );
 
+const AiSortIllustration = () => (
+	<div className="relative flex w-full h-full min-h-[6rem] items-center justify-center overflow-hidden rounded-xl bg-gradient-to-br from-emerald-50 via-teal-50 to-cyan-50 dark:from-emerald-950/20 dark:via-teal-950/20 dark:to-cyan-950/20 p-4">
+		<svg viewBox="0 0 400 200" className="w-full h-full" xmlns="http://www.w3.org/2000/svg">
+			<title>AI File Sorting illustration showing automatic folder organization</title>
+			{/* Scattered documents on the left */}
+			<g opacity="0.5">
+				<rect
+					x="20"
+					y="40"
+					width="35"
+					height="45"
+					rx="4"
+					className="fill-neutral-200 dark:fill-neutral-700"
+					transform="rotate(-8 37 62)"
+				/>
+				<rect
+					x="50"
+					y="80"
+					width="35"
+					height="45"
+					rx="4"
+					className="fill-neutral-200 dark:fill-neutral-700"
+					transform="rotate(5 67 102)"
+				/>
+				<rect
+					x="15"
+					y="110"
+					width="35"
+					height="45"
+					rx="4"
+					className="fill-neutral-200 dark:fill-neutral-700"
+					transform="rotate(-3 32 132)"
+				/>
+			</g>
+
+			{/* AI sparkle / magic in the center */}
+			<g transform="translate(140, 90)">
+				<path
+					d="M 0,-18 L 4,-6 L 16,-4 L 6,4 L 8,16 L 0,10 L -8,16 L -6,4 L -16,-4 L -4,-6 Z"
+					className="fill-emerald-500 dark:fill-emerald-400"
+					opacity="0.85"
+				>
+					<animateTransform
+						attributeName="transform"
+						type="rotate"
+						from="0"
+						to="360"
+						dur="10s"
+						repeatCount="indefinite"
+					/>
+				</path>
+				<circle cx="0" cy="0" r="3" className="fill-white dark:fill-emerald-200">
+					<animate attributeName="opacity" values="0.5;1;0.5" dur="2s" repeatCount="indefinite" />
+				</circle>
+			</g>
+
+			{/* Animated sorting arrows */}
+			<g
+				className="stroke-emerald-500 dark:stroke-emerald-400"
+				strokeWidth="2"
+				fill="none"
+				opacity="0.6"
+			>
+				<path d="M 100 70 Q 140 60, 180 50" strokeDasharray="4,4">
+					<animate
+						attributeName="stroke-dashoffset"
+						from="8"
+						to="0"
+						dur="1s"
+						repeatCount="indefinite"
+					/>
+				</path>
+				<path d="M 100 100 Q 140 100, 180 100" strokeDasharray="4,4">
+					<animate
+						attributeName="stroke-dashoffset"
+						from="8"
+						to="0"
+						dur="1s"
+						repeatCount="indefinite"
+					/>
+				</path>
+				<path d="M 100 130 Q 140 140, 180 150" strokeDasharray="4,4">
+					<animate
+						attributeName="stroke-dashoffset"
+						from="8"
+						to="0"
+						dur="1s"
+						repeatCount="indefinite"
+					/>
+				</path>
+			</g>
+
+			{/* Organized folder tree on the right */}
+			{/* Root folder */}
+			<g>
+				<rect
+					x="220"
+					y="30"
+					width="160"
+					height="28"
+					rx="6"
+					className="fill-white dark:fill-neutral-800"
+					opacity="0.9"
+				/>
+				<rect
+					x="228"
+					y="36"
+					width="16"
+					height="14"
+					rx="3"
+					className="fill-emerald-500 dark:fill-emerald-400"
+				/>
+				<line
+					x1="252"
+					y1="43"
+					x2="330"
+					y2="43"
+					className="stroke-neutral-400 dark:stroke-neutral-500"
+					strokeWidth="2.5"
+					strokeLinecap="round"
+				/>
+			</g>
+
+			{/* Subfolder 1 */}
+			<g>
+				<line
+					x1="240"
+					y1="58"
+					x2="240"
+					y2="76"
+					className="stroke-neutral-300 dark:stroke-neutral-600"
+					strokeWidth="1.5"
+				/>
+				<line
+					x1="240"
+					y1="76"
+					x2="250"
+					y2="76"
+					className="stroke-neutral-300 dark:stroke-neutral-600"
+					strokeWidth="1.5"
+				/>
+				<rect
+					x="250"
+					y="64"
+					width="130"
+					height="24"
+					rx="5"
+					className="fill-white dark:fill-neutral-800"
+					opacity="0.85"
+				/>
+				<rect
+					x="257"
+					y="70"
+					width="12"
+					height="11"
+					rx="2"
+					className="fill-teal-400 dark:fill-teal-500"
+				/>
+				<line
+					x1="276"
+					y1="76"
+					x2="340"
+					y2="76"
+					className="stroke-neutral-400 dark:stroke-neutral-500"
+					strokeWidth="2"
+					strokeLinecap="round"
+				/>
+			</g>
+
+			{/* Subfolder 2 */}
+			<g>
+				<line
+					x1="240"
+					y1="76"
+					x2="240"
+					y2="108"
+					className="stroke-neutral-300 dark:stroke-neutral-600"
+					strokeWidth="1.5"
+				/>
+				<line
+					x1="240"
+					y1="108"
+					x2="250"
+					y2="108"
+					className="stroke-neutral-300 dark:stroke-neutral-600"
+					strokeWidth="1.5"
+				/>
+				<rect
+					x="250"
+					y="96"
+					width="130"
+					height="24"
+					rx="5"
+					className="fill-white dark:fill-neutral-800"
+					opacity="0.85"
+				/>
+				<rect
+					x="257"
+					y="102"
+					width="12"
+					height="11"
+					rx="2"
+					className="fill-cyan-400 dark:fill-cyan-500"
+				/>
+				<line
+					x1="276"
+					y1="108"
+					x2="350"
+					y2="108"
+					className="stroke-neutral-400 dark:stroke-neutral-500"
+					strokeWidth="2"
+					strokeLinecap="round"
+				/>
+			</g>
+
+			{/* Subfolder 3 */}
+			<g>
+				<line
+					x1="240"
+					y1="108"
+					x2="240"
+					y2="140"
+					className="stroke-neutral-300 dark:stroke-neutral-600"
+					strokeWidth="1.5"
+				/>
+				<line
+					x1="240"
+					y1="140"
+					x2="250"
+					y2="140"
+					className="stroke-neutral-300 dark:stroke-neutral-600"
+					strokeWidth="1.5"
+				/>
+				<rect
+					x="250"
+					y="128"
+					width="130"
+					height="24"
+					rx="5"
+					className="fill-white dark:fill-neutral-800"
+					opacity="0.85"
+				/>
+				<rect
+					x="257"
+					y="134"
+					width="12"
+					height="11"
+					rx="2"
+					className="fill-emerald-400 dark:fill-emerald-500"
+				/>
+				<line
+					x1="276"
+					y1="140"
+					x2="325"
+					y2="140"
+					className="stroke-neutral-400 dark:stroke-neutral-500"
+					strokeWidth="2"
+					strokeLinecap="round"
+				/>
+			</g>
+
+			{/* Sparkle accents */}
+			<g className="opacity-60">
+				<circle cx="170" cy="45" r="2" className="fill-emerald-400">
+					<animate attributeName="opacity" values="0;1;0" dur="2s" repeatCount="indefinite" />
+				</circle>
+				<circle cx="190" cy="155" r="1.5" className="fill-teal-400">
+					<animate
+						attributeName="opacity"
+						values="0;1;0"
+						dur="2.5s"
+						begin="0.8s"
+						repeatCount="indefinite"
+					/>
+				</circle>
+				<circle cx="155" cy="120" r="1.5" className="fill-cyan-400">
+					<animate
+						attributeName="opacity"
+						values="0;1;0"
+						dur="3s"
+						begin="0.4s"
+						repeatCount="indefinite"
+					/>
+				</circle>
+			</g>
+		</svg>
+	</div>
+);
+
 const items = [
 	{
 		title: "Find, Ask, Act",
@@ -431,6 +726,14 @@ const items = [
 		className: "md:col-span-1",
 		icon: <IconUsers className="h-4 w-4 text-neutral-500" />,
 	},
+	{
+		title: "AI File Sorting",
+		description:
+			"Automatically organize documents into a smart folder hierarchy using AI-powered categorization by source, date, and topic.",
+		header: <AiSortIllustration />,
+		className: "md:col-span-1",
+		icon: <IconBinaryTree className="h-4 w-4 text-neutral-500" />,
+	},
 	{
 		title: "Collaborate Beyond Text",
 		description:
@@ -443,7 +746,7 @@ const items = [
 		title: "Context Where It Counts",
 		description: "Add comments directly to your chats and docs for clear, in-the-moment feedback.",
 		header: <AnnotationIllustration />,
-		className: "md:col-span-2",
+		className: "md:col-span-1",
 		icon: <IconMessage className="h-4 w-4 text-neutral-500" />,
 	},
 ];
diff --git a/surfsense_web/components/homepage/github-stars-badge.tsx b/surfsense_web/components/homepage/github-stars-badge.tsx
index da449635ee..e6771d22af 100644
--- a/surfsense_web/components/homepage/github-stars-badge.tsx
+++ b/surfsense_web/components/homepage/github-stars-badge.tsx
@@ -1,9 +1,11 @@
 "use client";
 
 import { IconBrandGithub } from "@tabler/icons-react";
+import { useQuery } from "@tanstack/react-query";
 import { motion, useMotionValue, useSpring } from "motion/react";
 import * as React from "react";
 import { Skeleton } from "@/components/ui/skeleton";
+import { cacheKeys } from "@/lib/query-client/cache-keys";
 import { cn } from "@/lib/utils";
 
 // ---------------------------------------------------------------------------
@@ -244,28 +246,18 @@ function NavbarGitHubStars({
 	href = "https://github.com/MODSetter/SurfSense",
 	className,
 }: NavbarGitHubStarsProps) {
-	const [stars, setStars] = React.useState(0);
-	const [isLoading, setIsLoading] = React.useState(true);
-
-	React.useEffect(() => {
-		const abortController = new AbortController();
-		fetch(`https://api.github.com/repos/${username}/${repo}`, {
-			signal: abortController.signal,
-		})
-			.then((res) => res.json())
-			.then((data) => {
-				if (data && typeof data.stargazers_count === "number") {
-					setStars(data.stargazers_count);
-				}
-			})
-			.catch((err) => {
-				if (err instanceof Error && err.name !== "AbortError") {
-					console.error("Error fetching stars:", err);
-				}
-			})
-			.finally(() => setIsLoading(false));
-		return () => abortController.abort();
-	}, [username, repo]);
+	const { data: stars = 0, isLoading } = useQuery({
+		queryKey: cacheKeys.github.repoStars(username, repo),
+		queryFn: async ({ signal }) => {
+			const res = await fetch(`https://api.github.com/repos/${username}/${repo}`, { signal });
+			const data = await res.json();
+			if (data && typeof data.stargazers_count === "number") {
+				return data.stargazers_count as number;
+			}
+			return 0;
+		},
+		staleTime: 5 * 60 * 1000,
+	});
 
 	return (
 		<a
diff --git a/surfsense_web/components/homepage/hero-section.tsx b/surfsense_web/components/homepage/hero-section.tsx
index 23c17aa3e1..d2bbd3079e 100644
--- a/surfsense_web/components/homepage/hero-section.tsx
+++ b/surfsense_web/components/homepage/hero-section.tsx
@@ -2,8 +2,9 @@
 import { ChevronDown, Download, Monitor } from "lucide-react";
 import { AnimatePresence, motion } from "motion/react";
 import Link from "next/link";
-import React, { memo, useCallback, useEffect, useMemo, useRef, useState } from "react";
+import React, { memo, useCallback, useEffect, useRef, useState } from "react";
 import Balancer from "react-wrap-balancer";
+import { Button } from "@/components/ui/button";
 import {
 	DropdownMenu,
 	DropdownMenuContent,
@@ -12,6 +13,11 @@ import {
 } from "@/components/ui/dropdown-menu";
 import { ExpandedMediaOverlay, useExpandedMedia } from "@/components/ui/expanded-gif-overlay";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import {
+	GITHUB_RELEASES_URL,
+	getAssetLabel,
+	usePrimaryDownload,
+} from "@/lib/desktop-download-utils";
 import { AUTH_TYPE, isSSOAuth } from "@/lib/env-config";
 import { trackLoginAttempt } from "@/lib/posthog/events";
 import { cn } from "@/lib/utils";
@@ -58,10 +64,10 @@ const TAB_ITEMS = [
 		featured: true,
 	},
 	{
-		title: "Extreme Assist",
+		title: "Screenshot Assist",
 		description:
-			"Get inline writing suggestions powered by your knowledge base as you type in any app.",
-		src: "/homepage/hero_tutorial/extreme_assist.mp4",
+			"Use a global shortcut to select a region on your screen and attach it to your chat message.",
+		src: "/homepage/hero_tutorial/screenshot_assist.mp4",
 		featured: true,
 	},
 	{
@@ -148,14 +154,14 @@ export function HeroSection() {
 				</h1>
 				<div className="mt-4 flex w-full flex-col items-start justify-between gap-4 md:mt-12 md:flex-row md:items-end md:gap-10">
 					<div>
-						<h2
+						<p
 							className={cn(
 								"relative mb-8 max-w-2xl text-left text-sm tracking-wide text-neutral-600 antialiased sm:text-base md:text-xl dark:text-neutral-400"
 							)}
 						>
-							An open source, privacy focused alternative to NotebookLM for teams with no data
-							limits.
-						</h2>
+							A free, open source NotebookLM alternative for teams with no data limits. Use ChatGPT,
+							Claude AI, and any AI model for free.
+						</p>
 
 						<div className="relative mb-4 flex w-full flex-col justify-center gap-y-2 sm:flex-row sm:justify-start sm:space-y-0 sm:space-x-4">
 							<DownloadButton />
@@ -183,190 +189,70 @@ function GetStartedButton() {
 
 	if (isProxyLogin) {
 		return (
-			<button
+			<Button
 				type="button"
+				variant="ghost"
 				onClick={handleProxyLogin}
-				whileHover="hover"
-				whileTap={{ scale: 0.98 }}
-				initial="idle"
-				className="group relative z-20 flex h-11 w-full cursor-pointer items-center justify-center gap-3 overflow-hidden rounded-xl bg-white px-6 py-2.5 text-sm font-semibold text-neutral-700 shadow-lg ring-1 ring-neutral-200/50 transition-shadow duration-300 hover:shadow-xl sm:w-56 dark:bg-neutral-900 dark:text-neutral-200 dark:ring-neutral-700/50"
-				variants={{
-					idle: { scale: 1, y: 0 },
-					hover: { scale: 1.02, y: -2 },
-				}}
+				className="h-14 w-full cursor-pointer gap-3 rounded-lg border border-white bg-white text-center text-base font-medium text-[#1f1f1f] shadow-sm transition duration-150 hover:bg-zinc-100 hover:text-[#1f1f1f] sm:w-56 dark:border-white"
 			>
-				{/* Animated gradient background on hover */}
-				<motion.div
-					className="absolute inset-0 bg-linear-to-r from-blue-50 via-green-50 to-yellow-50 dark:from-blue-950/30 dark:via-green-950/30 dark:to-yellow-950/30"
-					variants={{
-						idle: { opacity: 0 },
-						hover: { opacity: 1 },
-					}}
-					transition={{ duration: 0.3 }}
-				/>
-				{/* Show Google logo only for native Google OAuth, not SSO */}
-				{isGoogleAuth && (
-					<motion.div
-						className="relative"
-						variants={{
-							idle: { rotate: 0 },
-							hover: { rotate: [0, -8, 8, 0] },
-						}}
-						transition={{ duration: 0.4, ease: "easeInOut" }}
-					>
-						<GoogleLogo className="h-5 w-5" />
-					</motion.div>
-				)}
-				<span className="relative">{isGoogleAuth ? "Continue with Google" : "Sign In"}</span>
-			</motion.button>
+				{isGoogleAuth && <GoogleLogo className="h-5 w-5" />}
+				<span>{isGoogleAuth ? "Continue with Google" : "Sign In"}</span>
+			</Button>
 		);
 	}
 
 	return (
-		<Link
-			href="/login"
-			className="flex h-14 w-full items-center justify-center rounded-lg bg-black text-center text-base font-medium text-white shadow-sm ring-1 shadow-black/10 ring-black/10 transition duration-150 active:scale-98 sm:w-52 dark:bg-white dark:text-black"
+		<Button
+			asChild
+			variant="ghost"
+			className="h-14 w-full rounded-lg bg-black text-center text-base font-medium text-white shadow-sm ring-1 shadow-black/10 ring-black/10 transition duration-150 active:scale-98 hover:bg-black sm:w-52 dark:bg-white dark:text-black dark:hover:bg-white"
 		>
-			Get Started
-		</Link>
+			<Link href="/login">Get Started</Link>
+		</Button>
 	);
 }
 
-type OSInfo = {
-	os: "macOS" | "Windows" | "Linux";
-	arch: "arm64" | "x64";
-};
-
-function useUserOS(): OSInfo {
-	const [info, setInfo] = useState<OSInfo>({ os: "macOS", arch: "arm64" });
-	useEffect(() => {
-		const ua = navigator.userAgent;
-		let os: OSInfo["os"] = "macOS";
-		let arch: OSInfo["arch"] = "x64";
-
-		if (/Windows/i.test(ua)) {
-			os = "Windows";
-			arch = "x64";
-		} else if (/Linux/i.test(ua)) {
-			os = "Linux";
-			arch = "x64";
-		} else {
-			os = "macOS";
-			arch = /Mac/.test(ua) && !/Intel/.test(ua) ? "arm64" : "arm64";
-		}
-
-		const uaData = (navigator as Navigator & { userAgentData?: { architecture?: string } })
-			.userAgentData;
-		if (uaData?.architecture === "arm") arch = "arm64";
-		else if (uaData?.architecture === "x86") arch = "x64";
-
-		setInfo({ os, arch });
-	}, []);
-	return info;
-}
-
-interface ReleaseAsset {
-	name: string;
-	url: string;
-}
-
-function useLatestRelease() {
-	const [assets, setAssets] = useState<ReleaseAsset[]>([]);
-
-	useEffect(() => {
-		const controller = new AbortController();
-		fetch("https://api.github.com/repos/MODSetter/SurfSense/releases/latest", {
-			signal: controller.signal,
-		})
-			.then((r) => r.json())
-			.then((data) => {
-				if (data?.assets) {
-					setAssets(
-						data.assets
-							.filter((a: { name: string }) => /\.(exe|dmg|AppImage|deb)$/.test(a.name))
-							.map((a: { name: string; browser_download_url: string }) => ({
-								name: a.name,
-								url: a.browser_download_url,
-							}))
-					);
-				}
-			})
-			.catch(() => {});
-		return () => controller.abort();
-	}, []);
-
-	return assets;
-}
-
-const ASSET_LABELS: Record<string, string> = {
-	".exe": "Windows (exe)",
-	"-arm64.dmg": "macOS Apple Silicon (dmg)",
-	"-x64.dmg": "macOS Intel (dmg)",
-	"-arm64.zip": "macOS Apple Silicon (zip)",
-	"-x64.zip": "macOS Intel (zip)",
-	".AppImage": "Linux (AppImage)",
-	".deb": "Linux (deb)",
-};
-
-function getAssetLabel(name: string): string {
-	for (const [suffix, label] of Object.entries(ASSET_LABELS)) {
-		if (name.endsWith(suffix)) return label;
-	}
-	return name;
-}
-
 function DownloadButton() {
-	const { os, arch } = useUserOS();
-	const assets = useLatestRelease();
-
-	const { primary, alternatives } = useMemo(() => {
-		if (assets.length === 0) return { primary: null, alternatives: [] };
-
-		const matchers: Record<string, (n: string) => boolean> = {
-			Windows: (n) => n.endsWith(".exe"),
-			macOS: (n) => n.endsWith(`-${arch}.dmg`),
-			Linux: (n) => n.endsWith(".AppImage"),
-		};
-
-		const match = matchers[os];
-		const primary = assets.find((a) => match(a.name)) ?? null;
-		const alternatives = assets.filter((a) => a !== primary);
-		return { primary, alternatives };
-	}, [assets, os, arch]);
+	const { os, primary, alternatives } = usePrimaryDownload();
 
 	const fallbackUrl = GITHUB_RELEASES_URL;
 
 	if (!primary) {
 		return (
-			<a
-				href={fallbackUrl}
-				target="_blank"
-				rel="noopener noreferrer"
-				className="flex h-14 w-full items-center justify-center gap-2 rounded-lg border border-neutral-200 bg-white text-center text-base font-medium text-neutral-700 shadow-sm transition duration-150 active:scale-98 hover:bg-neutral-50 sm:w-auto sm:px-6 dark:border-neutral-700 dark:bg-neutral-900 dark:text-neutral-200 dark:hover:bg-neutral-800"
+			<Button
+				asChild
+				variant="ghost"
+				className="h-14 w-full gap-2 rounded-lg border border-neutral-200 bg-white text-center text-base font-medium text-neutral-700 shadow-sm transition duration-150 active:scale-98 hover:bg-neutral-50 sm:w-auto sm:px-6 dark:border-neutral-700 dark:bg-neutral-900 dark:text-neutral-200 dark:hover:bg-neutral-800"
 			>
-				<Download className="size-4" />
-				Download for {os}
-			</a>
+				<a href={fallbackUrl} target="_blank" rel="noopener noreferrer">
+					<Download className="size-4" />
+					Download for {os}
+				</a>
+			</Button>
 		);
 	}
 
 	return (
 		<div className="flex h-14 w-full items-stretch sm:w-auto">
-			<a
-				href={primary.url}
-				className="flex flex-1 items-center justify-center gap-2 rounded-l-lg border border-r-0 border-neutral-200 bg-white px-5 text-base font-medium text-neutral-700 shadow-sm transition duration-150 active:scale-[0.99] hover:bg-neutral-50 dark:border-neutral-700 dark:bg-neutral-900 dark:text-neutral-200 dark:hover:bg-neutral-800"
+			<Button
+				asChild
+				variant="ghost"
+				className="h-auto flex-1 gap-2 rounded-l-lg rounded-r-none border border-r-0 border-neutral-200 bg-white px-5 text-base font-medium text-neutral-700 shadow-sm transition duration-150 active:scale-[0.99] hover:bg-neutral-50 dark:border-neutral-700 dark:bg-neutral-900 dark:text-neutral-200 dark:hover:bg-neutral-800"
 			>
-				<Download className="size-4 shrink-0" />
-				Download for {os}
-			</a>
+				<a href={primary.url}>
+					<Download className="size-4 shrink-0" />
+					Download for {os}
+				</a>
+			</Button>
 			<DropdownMenu>
 				<DropdownMenuTrigger asChild>
-					<button
+					<Button
 						type="button"
-						className="flex items-center justify-center rounded-r-lg border border-neutral-200 bg-white px-2.5 text-neutral-500 shadow-sm transition duration-150 hover:bg-neutral-50 dark:border-neutral-700 dark:bg-neutral-900 dark:text-neutral-400 dark:hover:bg-neutral-800"
+						variant="ghost"
+						className="h-auto rounded-l-none rounded-r-lg border border-neutral-200 bg-white px-2.5 text-neutral-500 shadow-sm transition duration-150 hover:bg-neutral-50 dark:border-neutral-700 dark:bg-neutral-900 dark:text-neutral-400 dark:hover:bg-neutral-800"
 					>
 						<ChevronDown className="size-4" />
-					</button>
+					</Button>
 				</DropdownMenuTrigger>
 				<DropdownMenuContent align="end" className="w-64">
 					{alternatives.map((asset) => (
@@ -410,11 +296,12 @@ const BrowserWindow = () => {
 					<div className="no-visible-scrollbar flex min-w-0 shrink flex-row items-center justify-start gap-2 overflow-x-auto mask-l-from-98% py-0.5 pr-2 pl-2 md:pl-4">
 						{TAB_ITEMS.map((item, index) => (
 							<React.Fragment key={item.title}>
-								<button
+								<Button
 									type="button"
+									variant="ghost"
 									onClick={() => setSelectedIndex(index)}
 									className={cn(
-										"flex shrink-0 items-center gap-1.5 rounded-md px-2 py-1 text-xs transition duration-150 hover:bg-white sm:text-sm dark:hover:bg-neutral-950",
+										"h-auto shrink-0 gap-1.5 rounded-md px-2 py-1 text-xs transition duration-150 hover:bg-white sm:text-sm dark:hover:bg-neutral-950",
 										selectedIndex === index &&
 											!item.featured &&
 											"bg-white shadow ring-1 shadow-black/10 ring-black/10 dark:bg-neutral-900",
@@ -437,7 +324,7 @@ const BrowserWindow = () => {
 											<TooltipContent side="bottom">Desktop app only</TooltipContent>
 										</Tooltip>
 									)}
-								</button>
+								</Button>
 								{index !== TAB_ITEMS.length - 1 && (
 									<div className="h-4 w-px shrink-0 rounded-full bg-neutral-300 dark:bg-neutral-700" />
 								)}
@@ -480,13 +367,14 @@ const BrowserWindow = () => {
 									</p>
 								</div>
 							</div>
-							<button
+							<Button
 								type="button"
-								className="cursor-pointer bg-neutral-50 p-2 sm:p-3 dark:bg-neutral-950 w-full"
+								variant="ghost"
+								className="h-auto w-full cursor-pointer rounded-none bg-neutral-50 p-2 hover:bg-neutral-50 sm:p-3 dark:bg-neutral-950 dark:hover:bg-neutral-950"
 								onClick={open}
 							>
-								<TabVideo src={selectedItem.src} />
-							</button>
+								<TabVideo key={selectedItem.src} src={selectedItem.src} />
+							</Button>
 						</motion.div>
 					</AnimatePresence>
 				</div>
@@ -511,7 +399,7 @@ const TabVideo = memo(function TabVideo({ src }: { src: string }) {
 		if (!video) return;
 		video.currentTime = 0;
 		video.play().catch(() => {});
-	}, [src]);
+	}, []);
 
 	const handleCanPlay = useCallback(() => {
 		setHasLoaded(true);
@@ -536,5 +424,3 @@ const TabVideo = memo(function TabVideo({ src }: { src: string }) {
 		</div>
 	);
 });
-
-const GITHUB_RELEASES_URL = "https://github.com/MODSetter/SurfSense/releases/latest";
diff --git a/surfsense_web/components/homepage/integrations.tsx b/surfsense_web/components/homepage/integrations.tsx
index e4e28ede0c..83fff8604d 100644
--- a/surfsense_web/components/homepage/integrations.tsx
+++ b/surfsense_web/components/homepage/integrations.tsx
@@ -180,11 +180,11 @@ export default function ExternalIntegrations() {
 		>
 			{/* Heading */}
 			<div className="text-center mb-12 md:mb-16 relative z-20 px-4">
-				<h3 className="text-3xl sm:text-4xl md:text-5xl lg:text-6xl font-bold text-gray-900 dark:text-white leading-[1.1] tracking-tight">
+				<h2 className="text-3xl sm:text-4xl md:text-5xl lg:text-6xl font-bold text-gray-900 dark:text-white leading-[1.1] tracking-tight">
 					Integrate with your
 					<br />
 					team&apos;s most important tools
-				</h3>
+				</h2>
 			</div>
 
 			{/* Scrolling columns container — masked at edges so the page background shows through seamlessly */}
diff --git a/surfsense_web/components/homepage/navbar.tsx b/surfsense_web/components/homepage/navbar.tsx
index c62df226eb..6ffba848b5 100644
--- a/surfsense_web/components/homepage/navbar.tsx
+++ b/surfsense_web/components/homepage/navbar.tsx
@@ -7,13 +7,31 @@ import { SignInButton } from "@/components/auth/sign-in-button";
 import { NavbarGitHubStars } from "@/components/homepage/github-stars-badge";
 import { Logo } from "@/components/Logo";
 import { ThemeTogglerComponent } from "@/components/theme/theme-toggle";
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
+interface NavItem {
+	name: string;
+	link: string;
+}
+
 interface NavbarProps {
 	/** Override the scrolled-state background classes (desktop & mobile). */
 	scrolledBgClassName?: string;
 }
 
+interface DesktopNavProps {
+	navItems: NavItem[];
+	isScrolled: boolean;
+	scrolledBgClassName?: string;
+}
+
+interface MobileNavProps {
+	navItems: NavItem[];
+	isScrolled: boolean;
+	scrolledBgClassName?: string;
+}
+
 export const Navbar = ({ scrolledBgClassName }: NavbarProps = {}) => {
 	const [isScrolled, setIsScrolled] = useState(false);
 
@@ -47,7 +65,7 @@ export const Navbar = ({ scrolledBgClassName }: NavbarProps = {}) => {
 	);
 };
 
-const DesktopNav = ({ navItems, isScrolled, scrolledBgClassName }: any) => {
+const DesktopNav = ({ navItems, isScrolled, scrolledBgClassName }: DesktopNavProps) => {
 	const [hovered, setHovered] = useState<number | null>(null);
 	return (
 		<motion.div
@@ -70,12 +88,12 @@ const DesktopNav = ({ navItems, isScrolled, scrolledBgClassName }: any) => {
 				<span className="dark:text-white/90 text-gray-800 text-lg font-bold">SurfSense</span>
 			</Link>
 			<div className="hidden flex-1 flex-row items-center justify-center space-x-2 text-sm font-medium text-zinc-600 transition duration-200 hover:text-zinc-800 lg:flex lg:space-x-2">
-				{navItems.map((navItem: any, idx: number) => (
+				{navItems.map((navItem: NavItem, idx: number) => (
 					<Link
 						onMouseEnter={() => setHovered(idx)}
 						onMouseLeave={() => setHovered(null)}
 						className="relative px-4 py-2 text-neutral-600 dark:text-neutral-300"
-						key={`link=${idx}`}
+						key={navItem.link}
 						href={navItem.link}
 					>
 						{hovered === idx && (
@@ -97,7 +115,7 @@ const DesktopNav = ({ navItems, isScrolled, scrolledBgClassName }: any) => {
 	);
 };
 
-const MobileNav = ({ navItems, isScrolled, scrolledBgClassName }: any) => {
+const MobileNav = ({ navItems, isScrolled, scrolledBgClassName }: MobileNavProps) => {
 	const [open, setOpen] = useState(false);
 	const navRef = useRef<HTMLDivElement>(null);
 
@@ -139,10 +157,12 @@ const MobileNav = ({ navItems, isScrolled, scrolledBgClassName }: any) => {
 					<Logo className="h-8 w-8 rounded-md" disableLink />
 					<span className="dark:text-white/90 text-gray-800 text-lg font-bold">SurfSense</span>
 				</Link>
-				<button
+				<Button
 					type="button"
+					variant="ghost"
+					size="icon"
 					onClick={() => setOpen((prev) => !prev)}
-					className="relative z-50 flex items-center justify-center p-2 -mr-2 rounded-lg hover:bg-gray-100 dark:hover:bg-neutral-800 transition-colors touch-manipulation"
+					className="relative z-50 -mr-2 rounded-lg p-2 hover:bg-gray-100 dark:hover:bg-neutral-800 touch-manipulation"
 					aria-label={open ? "Close menu" : "Open menu"}
 				>
 					{open ? (
@@ -150,7 +170,7 @@ const MobileNav = ({ navItems, isScrolled, scrolledBgClassName }: any) => {
 					) : (
 						<IconMenu2 className="h-6 w-6 text-black dark:text-white" />
 					)}
-				</button>
+				</Button>
 			</div>
 
 			<AnimatePresence>
@@ -162,9 +182,9 @@ const MobileNav = ({ navItems, isScrolled, scrolledBgClassName }: any) => {
 						transition={{ duration: 0.2, ease: "easeOut" }}
 						className="absolute inset-x-0 top-full mt-1 z-20 flex w-full flex-col items-start justify-start gap-4 rounded-xl bg-white/90 backdrop-blur-xl border border-white/20 shadow-2xl px-4 py-6 dark:bg-neutral-950/90 dark:border-neutral-800/50"
 					>
-						{navItems.map((navItem: any, idx: number) => (
+						{navItems.map((navItem: NavItem) => (
 							<Link
-								key={`link=${idx}`}
+								key={navItem.link}
 								href={navItem.link}
 								className="relative text-neutral-600 dark:text-neutral-300"
 							>
diff --git a/surfsense_web/components/homepage/use-cases-grid.tsx b/surfsense_web/components/homepage/use-cases-grid.tsx
index 7aa272a5c3..b515677008 100644
--- a/surfsense_web/components/homepage/use-cases-grid.tsx
+++ b/surfsense_web/components/homepage/use-cases-grid.tsx
@@ -68,6 +68,7 @@ function UseCaseCard({
 				<div
 					role="button"
 					tabIndex={0}
+					aria-label={`Expand ${title}`}
 					className="cursor-pointer overflow-hidden bg-neutral-50 p-2 dark:bg-neutral-950"
 					onClick={open}
 					onKeyDown={(e) => {
diff --git a/surfsense_web/components/homepage/why-surfsense.tsx b/surfsense_web/components/homepage/why-surfsense.tsx
index c50c3604af..8eeaf98740 100644
--- a/surfsense_web/components/homepage/why-surfsense.tsx
+++ b/surfsense_web/components/homepage/why-surfsense.tsx
@@ -348,6 +348,11 @@ const comparisonRows: {
 		notebookLm: false,
 		surfSense: true,
 	},
+	{
+		feature: "AI File Sorting",
+		notebookLm: false,
+		surfSense: true,
+	},
 ];
 
 function ComparisonStrip() {
diff --git a/surfsense_web/components/icons/providers/github.svg b/surfsense_web/components/icons/providers/github.svg
new file mode 100644
index 0000000000..7a51b8e0ee
--- /dev/null
+++ b/surfsense_web/components/icons/providers/github.svg
@@ -0,0 +1 @@
+<svg fill="currentColor" fill-rule="evenodd" height="1em" style="flex:none;line-height:1" viewBox="0 0 24 24" width="1em" xmlns="http://www.w3.org/2000/svg"><title>Github</title><path d="M12 0c6.63 0 12 5.276 12 11.79-.001 5.067-3.29 9.567-8.175 11.187-.6.118-.825-.25-.825-.56 0-.398.015-1.665.015-3.242 0-1.105-.375-1.813-.81-2.181 2.67-.295 5.475-1.297 5.475-5.822 0-1.297-.465-2.344-1.23-3.169.12-.295.54-1.503-.12-3.125 0 0-1.005-.324-3.3 1.209a11.32 11.32 0 00-3-.398c-1.02 0-2.04.133-3 .398-2.295-1.518-3.3-1.209-3.3-1.209-.66 1.622-.24 2.83-.12 3.125-.765.825-1.23 1.887-1.23 3.169 0 4.51 2.79 5.527 5.46 5.822-.345.294-.66.81-.765 1.577-.69.31-2.415.81-3.495-.973-.225-.354-.9-1.223-1.845-1.209-1.005.015-.405.56.015.781.51.28 1.095 1.327 1.23 1.666.24.663 1.02 1.93 4.035 1.385 0 .988.015 1.916.015 2.196 0 .31-.225.664-.825.56C3.303 21.374-.003 16.867 0 11.791 0 5.276 5.37 0 12 0z"></path></svg>
\ No newline at end of file
diff --git a/surfsense_web/components/icons/providers/index.ts b/surfsense_web/components/icons/providers/index.ts
index 2afed7fa5c..aefa2a0539 100644
--- a/surfsense_web/components/icons/providers/index.ts
+++ b/surfsense_web/components/icons/providers/index.ts
@@ -10,6 +10,7 @@ export { default as DeepInfraIcon } from "./deepinfra.svg";
 export { default as DeepSeekIcon } from "./deepseek.svg";
 export { default as FireworksAiIcon } from "./fireworksai.svg";
 export { default as GeminiIcon } from "./gemini.svg";
+export { default as GitHubModelsIcon } from "./github.svg";
 export { default as GroqIcon } from "./groq.svg";
 export { default as HuggingFaceIcon } from "./huggingface.svg";
 export { default as MiniMaxIcon } from "./minimax.svg";
diff --git a/surfsense_web/components/inference-params-editor.tsx b/surfsense_web/components/inference-params-editor.tsx
index 3764b1dac7..6e9514cdca 100644
--- a/surfsense_web/components/inference-params-editor.tsx
+++ b/surfsense_web/components/inference-params-editor.tsx
@@ -76,7 +76,7 @@ export default function InferenceParamsEditor({ params, setParams }: InferencePa
 						<SelectTrigger id="param-key" className="w-full">
 							<SelectValue placeholder="Select parameter" />
 						</SelectTrigger>
-						<SelectContent className="bg-muted dark:border-neutral-700">
+						<SelectContent>
 							{PARAM_KEYS.map((key) => (
 								<SelectItem key={key} value={key}>
 									{key}
@@ -128,7 +128,10 @@ export default function InferenceParamsEditor({ params, setParams }: InferencePa
 						</thead>
 						<tbody className="divide-y divide-gray-200 bg-black dark:bg-black">
 							{Object.entries(params).map(([key, val]) => (
-								<tr key={key} className="hover:bg-gray-50 dark:hover:bg-gray-800 transition-colors">
+								<tr
+									key={key}
+									className="hover:bg-accent hover:text-accent-foreground transition-colors"
+								>
 									<td className="px-4 py-3 font-medium text-gray-900 dark:text-white">{key}</td>
 									<td className="px-4 py-3 text-gray-700 dark:text-gray-300">{val.toString()}</td>
 									<td className="px-4 py-3">
diff --git a/surfsense_web/components/layout/hooks/SidebarContext.tsx b/surfsense_web/components/layout/hooks/SidebarContext.tsx
index bfb5b5aebc..35f76929d7 100644
--- a/surfsense_web/components/layout/hooks/SidebarContext.tsx
+++ b/surfsense_web/components/layout/hooks/SidebarContext.tsx
@@ -6,7 +6,6 @@ interface SidebarContextValue {
 	isCollapsed: boolean;
 	setIsCollapsed: (collapsed: boolean) => void;
 	toggleCollapsed: () => void;
-	sidebarWidth: number;
 }
 
 const SidebarContext = createContext<SidebarContextValue | null>(null);
diff --git a/surfsense_web/components/layout/hooks/useSidebarResize.ts b/surfsense_web/components/layout/hooks/useSidebarResize.ts
index 887c86dce6..a13af16631 100644
--- a/surfsense_web/components/layout/hooks/useSidebarResize.ts
+++ b/surfsense_web/components/layout/hooks/useSidebarResize.ts
@@ -10,18 +10,36 @@ export const SIDEBAR_MAX_WIDTH = 480;
 
 interface UseSidebarResizeReturn {
 	sidebarWidth: number;
-	handleMouseDown: (e: React.MouseEvent) => void;
+	handlePointerDown: (e: React.PointerEvent<HTMLElement>) => void;
 	isDragging: boolean;
 }
 
+function setGlobalDragCursor(active: boolean) {
+	const html = document.documentElement;
+	const body = document.body;
+	if (active) {
+		html.style.cursor = "col-resize";
+		body.style.cursor = "col-resize";
+		html.style.userSelect = "none";
+		body.style.userSelect = "none";
+	} else {
+		html.style.cursor = "";
+		body.style.cursor = "";
+		html.style.userSelect = "";
+		body.style.userSelect = "";
+	}
+}
+
 export function useSidebarResize(defaultWidth = SIDEBAR_MIN_WIDTH): UseSidebarResizeReturn {
 	const [sidebarWidth, setSidebarWidth] = useState(defaultWidth);
 	const [isDragging, setIsDragging] = useState(false);
 
 	const startXRef = useRef(0);
 	const startWidthRef = useRef(defaultWidth);
+	const widthRef = useRef(defaultWidth);
+	const pointerIdRef = useRef<number | null>(null);
+	const captureTargetRef = useRef<HTMLElement | null>(null);
 
-	// Initialize from cookie on mount
 	useEffect(() => {
 		try {
 			const match = document.cookie.match(/(?:^|; )sidebar_width=([^;]+)/);
@@ -29,73 +47,91 @@ export function useSidebarResize(defaultWidth = SIDEBAR_MIN_WIDTH): UseSidebarRe
 				const parsed = Number(match[1]);
 				if (!Number.isNaN(parsed) && parsed >= SIDEBAR_MIN_WIDTH && parsed <= SIDEBAR_MAX_WIDTH) {
 					setSidebarWidth(parsed);
+					widthRef.current = parsed;
 				}
 			}
-		} catch {
-			// Ignore cookie read errors
-		}
+		} catch {}
 	}, []);
 
-	// Persist width to cookie
 	const persistWidth = useCallback((width: number) => {
 		try {
+			// biome-ignore lint/suspicious/noDocumentCookie: SSR-readable preference, not security-sensitive
 			document.cookie = `${SIDEBAR_WIDTH_COOKIE_NAME}=${width}; path=/; max-age=${SIDEBAR_WIDTH_COOKIE_MAX_AGE}`;
 		} catch {
 			// Ignore cookie write errors
 		}
 	}, []);
 
-	const handleMouseDown = useCallback(
-		(e: React.MouseEvent) => {
-			e.preventDefault();
-			startXRef.current = e.clientX;
-			startWidthRef.current = sidebarWidth;
-			setIsDragging(true);
+	const releaseCapture = useCallback(() => {
+		const target = captureTargetRef.current;
+		const pointerId = pointerIdRef.current;
+		if (target && pointerId !== null) {
+			try {
+				if (target.hasPointerCapture(pointerId)) {
+					target.releasePointerCapture(pointerId);
+				}
+			} catch {}
+		}
+		captureTargetRef.current = null;
+		pointerIdRef.current = null;
+	}, []);
 
-			document.body.style.cursor = "col-resize";
-			document.body.style.userSelect = "none";
-		},
-		[sidebarWidth]
-	);
+	const handlePointerDown = useCallback((e: React.PointerEvent<HTMLElement>) => {
+		if (e.pointerType === "mouse" && e.button !== 0) return;
+
+		e.preventDefault();
+		const target = e.currentTarget;
+		try {
+			target.setPointerCapture(e.pointerId);
+		} catch {}
+		captureTargetRef.current = target;
+		pointerIdRef.current = e.pointerId;
+		startXRef.current = e.clientX;
+		startWidthRef.current = widthRef.current;
+		setIsDragging(true);
+		setGlobalDragCursor(true);
+	}, []);
 
 	useEffect(() => {
 		if (!isDragging) return;
 
-		const handleMouseMove = (e: MouseEvent) => {
+		const handlePointerMove = (e: PointerEvent) => {
+			if (pointerIdRef.current !== null && e.pointerId !== pointerIdRef.current) return;
 			const delta = e.clientX - startXRef.current;
 			const newWidth = Math.min(
 				SIDEBAR_MAX_WIDTH,
 				Math.max(SIDEBAR_MIN_WIDTH, startWidthRef.current + delta)
 			);
-			setSidebarWidth(newWidth);
+			if (newWidth !== widthRef.current) {
+				widthRef.current = newWidth;
+				setSidebarWidth(newWidth);
+			}
 		};
 
-		const handleMouseUp = () => {
+		const stop = (e: PointerEvent) => {
+			if (pointerIdRef.current !== null && e.pointerId !== pointerIdRef.current) return;
+			releaseCapture();
 			setIsDragging(false);
-			document.body.style.cursor = "";
-			document.body.style.userSelect = "";
-
-			// Persist the final width
-			setSidebarWidth((currentWidth) => {
-				persistWidth(currentWidth);
-				return currentWidth;
-			});
+			setGlobalDragCursor(false);
+			persistWidth(widthRef.current);
 		};
 
-		document.addEventListener("mousemove", handleMouseMove);
-		document.addEventListener("mouseup", handleMouseUp);
+		window.addEventListener("pointermove", handlePointerMove);
+		window.addEventListener("pointerup", stop);
+		window.addEventListener("pointercancel", stop);
 
 		return () => {
-			document.removeEventListener("mousemove", handleMouseMove);
-			document.removeEventListener("mouseup", handleMouseUp);
-			document.body.style.cursor = "";
-			document.body.style.userSelect = "";
+			window.removeEventListener("pointermove", handlePointerMove);
+			window.removeEventListener("pointerup", stop);
+			window.removeEventListener("pointercancel", stop);
+			setGlobalDragCursor(false);
+			releaseCapture();
 		};
-	}, [isDragging, persistWidth]);
+	}, [isDragging, persistWidth, releaseCapture]);
 
 	return {
 		sidebarWidth,
-		handleMouseDown,
+		handlePointerDown,
 		isDragging,
 	};
 }
diff --git a/surfsense_web/components/layout/providers/FreeLayoutDataProvider.tsx b/surfsense_web/components/layout/providers/FreeLayoutDataProvider.tsx
new file mode 100644
index 0000000000..1c5d7af6eb
--- /dev/null
+++ b/surfsense_web/components/layout/providers/FreeLayoutDataProvider.tsx
@@ -0,0 +1,143 @@
+"use client";
+
+import { Inbox, LibraryBig } from "lucide-react";
+import { useRouter } from "next/navigation";
+import type { ReactNode } from "react";
+import { Fragment, useCallback, useEffect, useMemo, useState } from "react";
+import { useAnonymousMode } from "@/contexts/anonymous-mode";
+import { useLoginGate } from "@/contexts/login-gate";
+import { useAnnouncements } from "@/hooks/use-announcements";
+import { useIsMobile } from "@/hooks/use-mobile";
+import { anonymousChatApiService } from "@/lib/apis/anonymous-chat-api.service";
+import type { ChatItem, NavItem, PageUsage, SearchSpace } from "../types/layout.types";
+import { LayoutShell } from "../ui/shell";
+
+interface FreeLayoutDataProviderProps {
+	children: ReactNode;
+}
+
+const GUEST_SPACE: SearchSpace = {
+	id: 0,
+	name: "SurfSense Free",
+	description: "Free AI chat without login",
+	isOwner: false,
+	memberCount: 1,
+};
+
+export function FreeLayoutDataProvider({ children }: FreeLayoutDataProviderProps) {
+	const router = useRouter();
+	const { gate } = useLoginGate();
+	const anonMode = useAnonymousMode();
+	const isMobile = useIsMobile();
+	const { unreadCount: announcementUnreadCount } = useAnnouncements();
+	const [quota, setQuota] = useState<{ used: number; limit: number } | null>(null);
+	const [isDocsSidebarOpen, setIsDocsSidebarOpen] = useState(false);
+
+	// Keep docs sidebar closed on mobile; auto-open only on desktop after hydration
+	useEffect(() => {
+		setIsDocsSidebarOpen(!isMobile);
+	}, [isMobile]);
+
+	useEffect(() => {
+		anonymousChatApiService
+			.getQuota()
+			.then((q) => {
+				setQuota({ used: q.used, limit: q.limit });
+			})
+			.catch(() => {});
+	}, []);
+
+	const resetChat = useCallback(() => {
+		if (anonMode.isAnonymous) {
+			anonMode.resetChat();
+		}
+	}, [anonMode]);
+
+	const gatedAction = useCallback((feature: string) => () => gate(feature), [gate]);
+
+	const navItems: NavItem[] = useMemo(
+		() =>
+			(
+				[
+					{
+						title: "Inbox",
+						url: "#inbox",
+						icon: Inbox,
+						isActive: false,
+					},
+					isMobile
+						? {
+								title: "Documents",
+								url: "#documents",
+								icon: LibraryBig,
+								isActive: false,
+							}
+						: null,
+				] as (NavItem | null)[]
+			).filter((item): item is NavItem => item !== null),
+		[isMobile]
+	);
+
+	const pageUsage: PageUsage | undefined = quota
+		? { pagesUsed: quota.used, pagesLimit: quota.limit }
+		: undefined;
+
+	const handleChatSelect = useCallback((_chat: ChatItem) => gate("view chat history"), [gate]);
+
+	const handleNavItemClick = useCallback(
+		(item: NavItem) => {
+			if (item.title === "Inbox") gate("use the inbox");
+			else if (item.title === "Documents") setIsDocsSidebarOpen((v) => !v);
+		},
+		[gate]
+	);
+
+	const handleAnnouncements = useCallback(() => gate("see what's new"), [gate]);
+
+	const handleSearchSpaceSelect = useCallback(
+		(_id: number) => gate("switch search spaces"),
+		[gate]
+	);
+
+	return (
+		<LayoutShell
+			searchSpaces={[GUEST_SPACE]}
+			activeSearchSpaceId={0}
+			onSearchSpaceSelect={handleSearchSpaceSelect}
+			onSearchSpaceSettings={gatedAction("search space settings")}
+			onAddSearchSpace={gatedAction("create search spaces")}
+			searchSpace={GUEST_SPACE}
+			navItems={navItems}
+			onNavItemClick={handleNavItemClick}
+			chats={[]}
+			sharedChats={[]}
+			activeChatId={null}
+			onNewChat={resetChat}
+			onChatSelect={handleChatSelect}
+			onChatRename={gatedAction("rename chats")}
+			onChatDelete={gatedAction("delete chats")}
+			onChatArchive={gatedAction("archive chats")}
+			onViewAllSharedChats={gatedAction("view shared chats")}
+			onViewAllPrivateChats={gatedAction("view chat history")}
+			user={{
+				email: "Guest",
+				name: "Guest",
+			}}
+			onSettings={gatedAction("search space settings")}
+			onManageMembers={gatedAction("team management")}
+			onUserSettings={gatedAction("account settings")}
+			onAnnouncements={handleAnnouncements}
+			announcementUnreadCount={announcementUnreadCount}
+			onLogout={() => router.push("/register")}
+			pageUsage={pageUsage}
+			isChatPage
+			isLoadingChats={false}
+			documentsPanel={{
+				open: isDocsSidebarOpen,
+				onOpenChange: setIsDocsSidebarOpen,
+			}}
+		>
+			<Fragment>{children}</Fragment>
+		</LayoutShell>
+	);
+}
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
index 6184f164f9..209fa7b598 100644
--- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
+++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
@@ -2,7 +2,7 @@
 
 import { useQuery, useQueryClient } from "@tanstack/react-query";
 import { useAtom, useAtomValue, useSetAtom } from "jotai";
-import { AlertTriangle, Inbox, Megaphone, SquareLibrary } from "lucide-react";
+import { AlertTriangle, Inbox, LibraryBig } from "lucide-react";
 import { useParams, usePathname, useRouter } from "next/navigation";
 import { useTranslations } from "next-intl";
 import { useTheme } from "next-themes";
@@ -11,24 +11,14 @@ import { toast } from "sonner";
 import { currentThreadAtom, resetCurrentThreadAtom } from "@/atoms/chat/current-thread.atom";
 import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms";
 import { statusInboxItemsAtom } from "@/atoms/inbox/status-inbox.atom";
+import { announcementsDialogAtom } from "@/atoms/layout/dialogs.atom";
 import { rightPanelCollapsedAtom } from "@/atoms/layout/right-panel.atom";
 import { deleteSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms";
 import { searchSpacesAtom } from "@/atoms/search-spaces/search-space-query.atoms";
-import {
-	searchSpaceSettingsDialogAtom,
-	teamDialogAtom,
-	userSettingsDialogAtom,
-} from "@/atoms/settings/settings-dialog.atoms";
-import {
-	removeChatTabAtom,
-	resetTabsAtom,
-	syncChatTabAtom,
-	type Tab,
-} from "@/atoms/tabs/tabs.atom";
+import { removeChatTabAtom, syncChatTabAtom, type Tab } from "@/atoms/tabs/tabs.atom";
 import { currentUserAtom } from "@/atoms/user/user-query.atoms";
-import { SearchSpaceSettingsDialog } from "@/components/settings/search-space-settings-dialog";
-import { TeamDialog } from "@/components/settings/team-dialog";
-import { UserSettingsDialog } from "@/components/settings/user-settings-dialog";
+import { ActionLogDialog } from "@/components/agent-action-log/action-log-dialog";
+import { AnnouncementsDialog } from "@/components/announcements/AnnouncementsDialog";
 import {
 	AlertDialog,
 	AlertDialogAction,
@@ -104,7 +94,6 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 	const currentThreadState = useAtomValue(currentThreadAtom);
 	const resetCurrentThread = useSetAtom(resetCurrentThreadAtom);
 	const syncChatTab = useSetAtom(syncChatTabAtom);
-	const resetTabs = useSetAtom(resetTabsAtom);
 	const removeChatTab = useSetAtom(removeChatTabAtom);
 
 	// Key used to force-remount the page component (e.g. after deleting the active chat
@@ -131,16 +120,15 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 	});
 
 	// Unified slide-out panel state (only one can be open at a time)
-	type SlideoutPanel = "inbox" | "shared" | "private" | "announcements" | null;
+	type SlideoutPanel = "inbox" | "shared" | "private" | null;
 	const [activeSlideoutPanel, setActiveSlideoutPanel] = useState<SlideoutPanel>(null);
 
 	const isInboxSidebarOpen = activeSlideoutPanel === "inbox";
-	const isAnnouncementsSidebarOpen = activeSlideoutPanel === "announcements";
 
 	// Documents sidebar state (shared atom so Composer can toggle it)
 	const [isDocumentsSidebarOpen, setIsDocumentsSidebarOpen] = useAtom(documentsSidebarOpenAtom);
 	const [isDocumentsDocked, setIsDocumentsDocked] = useState(true);
-	const [isRightPanelCollapsed, setIsRightPanelCollapsed] = useAtom(rightPanelCollapsedAtom);
+	const setIsRightPanelCollapsed = useSetAtom(rightPanelCollapsedAtom);
 
 	// Open documents sidebar by default on desktop (docked mode)
 	const documentsInitialized = useRef(false);
@@ -251,16 +239,16 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 	const [isDeletingSearchSpace, setIsDeletingSearchSpace] = useState(false);
 	const [isLeavingSearchSpace, setIsLeavingSearchSpace] = useState(false);
 
-	// Reset transient slide-out panels and tabs when switching search spaces.
-	// Use a ref to skip the initial mount — only reset when the space actually changes.
+	// Reset transient slide-out panels when switching search spaces.
+	// Tabs intentionally persist across spaces — opening tabs from multiple
+	// search spaces is a supported flow (browser-tab semantics).
 	const prevSearchSpaceIdRef = useRef(searchSpaceId);
 	useEffect(() => {
 		if (prevSearchSpaceIdRef.current !== searchSpaceId) {
 			prevSearchSpaceIdRef.current = searchSpaceId;
 			setActiveSlideoutPanel(null);
-			resetTabs();
 		}
-	}, [searchSpaceId, resetTabs]);
+	}, [searchSpaceId]);
 
 	const searchSpaces: SearchSpace[] = useMemo(() => {
 		if (!searchSpacesData || !Array.isArray(searchSpacesData)) return [];
@@ -312,6 +300,7 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 			// Avoid overwriting live SSE-updated tab titles with fallback values.
 			title: chatId ? (thread?.title ?? undefined) : "New Chat",
 			chatUrl,
+			searchSpaceId: Number(searchSpaceId),
 		});
 	}, [currentChatId, searchSpaceId, threadsData?.threads, syncChatTab]);
 
@@ -346,6 +335,9 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 	}, [threadsData, searchSpaceId]);
 
 	// Navigation items
+	// Inbox is rendered explicitly below "New chat" in the sidebar (it is also
+	// surfaced in the icon rail's collapsed mode via this list). Announcements
+	// has been moved to the avatar dropdown and is no longer a nav item.
 	const navItems: NavItem[] = useMemo(
 		() =>
 			(
@@ -361,28 +353,13 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 						? {
 								title: "Documents",
 								url: "#documents",
-								icon: SquareLibrary,
+								icon: LibraryBig,
 								isActive: isDocumentsSidebarOpen,
 							}
 						: null,
-					{
-						title: "Announcements",
-						url: "#announcements",
-						icon: Megaphone,
-						isActive: isAnnouncementsSidebarOpen,
-						badge:
-							announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
-					},
 				] as (NavItem | null)[]
 			).filter((item): item is NavItem => item !== null),
-		[
-			isMobile,
-			isInboxSidebarOpen,
-			isDocumentsSidebarOpen,
-			totalUnreadCount,
-			isAnnouncementsSidebarOpen,
-			announcementUnreadCount,
-		]
+		[isMobile, isInboxSidebarOpen, isDocumentsSidebarOpen, totalUnreadCount]
 	);
 
 	// Handlers
@@ -397,19 +374,21 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 		setIsCreateSearchSpaceDialogOpen(true);
 	}, []);
 
-	const setUserSettingsDialog = useSetAtom(userSettingsDialogAtom);
-	const setSearchSpaceSettingsDialog = useSetAtom(searchSpaceSettingsDialogAtom);
-	const setTeamDialogOpen = useSetAtom(teamDialogAtom);
+	const setAnnouncementsDialog = useSetAtom(announcementsDialogAtom);
 
 	const handleUserSettings = useCallback(() => {
-		setUserSettingsDialog({ open: true, initialTab: "profile" });
-	}, [setUserSettingsDialog]);
+		router.push(`/dashboard/${searchSpaceId}/user-settings`);
+	}, [router, searchSpaceId]);
+
+	const handleAnnouncements = useCallback(() => {
+		setAnnouncementsDialog(true);
+	}, [setAnnouncementsDialog]);
 
 	const handleSearchSpaceSettings = useCallback(
-		(_space: SearchSpace) => {
-			setSearchSpaceSettingsDialog({ open: true, initialTab: "general" });
+		(space: SearchSpace) => {
+			router.push(`/dashboard/${space.id}/search-space-settings`);
 		},
-		[setSearchSpaceSettingsDialog]
+		[router]
 	);
 
 	const handleSearchSpaceDeleteClick = useCallback((space: SearchSpace) => {
@@ -518,10 +497,6 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 				}
 				return;
 			}
-			if (item.url === "#announcements") {
-				setActiveSlideoutPanel((prev) => (prev === "announcements" ? null : "announcements"));
-				return;
-			}
 			router.push(item.url);
 		},
 		[router, isMobile, isDocumentsSidebarOpen, setIsDocumentsSidebarOpen, setIsRightPanelCollapsed]
@@ -532,16 +507,14 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 		const isOutOfSync = currentThreadState.id !== null && !params?.chat_id;
 
 		if (isOutOfSync) {
-			// First sync Next.js router by navigating to the current chat's actual URL
-			// This updates the router's internal state to match the browser URL
 			resetCurrentThread();
-			router.replace(`/dashboard/${searchSpaceId}/new-chat/${currentThreadState.id}`);
-			// Allow router to sync, then navigate to fresh new-chat
-			setTimeout(() => {
-				router.push(`/dashboard/${searchSpaceId}/new-chat`);
-			}, 0);
+			// Immediately set the browser URL so the page remounts with a clean /new-chat path
+			window.history.replaceState(null, "", `/dashboard/${searchSpaceId}/new-chat`);
+			// Force-remount the page component to reset all React state synchronously
+			setChatResetKey((k) => k + 1);
+			// Sync Next.js router internals so useParams/usePathname stay correct going forward
+			router.replace(`/dashboard/${searchSpaceId}/new-chat`);
 		} else {
-			// Normal navigation - router is in sync
 			router.push(`/dashboard/${searchSpaceId}/new-chat`);
 		}
 	}, [router, searchSpaceId, currentThreadState.id, params?.chat_id, resetCurrentThread]);
@@ -587,12 +560,12 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 	);
 
 	const handleSettings = useCallback(() => {
-		setSearchSpaceSettingsDialog({ open: true, initialTab: "general" });
-	}, [setSearchSpaceSettingsDialog]);
+		router.push(`/dashboard/${searchSpaceId}/search-space-settings`);
+	}, [router, searchSpaceId]);
 
 	const handleManageMembers = useCallback(() => {
-		setTeamDialogOpen(true);
-	}, [setTeamDialogOpen]);
+		router.push(`/dashboard/${searchSpaceId}/team`);
+	}, [router, searchSpaceId]);
 
 	const handleLogout = useCallback(async () => {
 		try {
@@ -683,16 +656,17 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 		}
 	}, [chatToRename, newChatTitle, queryClient, searchSpaceId, tSidebar]);
 
-	// Page usage
-	const pageUsage = user
-		? {
-				pagesUsed: user.pages_used,
-				pagesLimit: user.pages_limit,
-			}
-		: undefined;
-
 	// Detect if we're on the chat page (needs overflow-hidden for chat's own scroll)
 	const isChatPage = pathname?.includes("/new-chat") ?? false;
+	const isUserSettingsPage = pathname?.includes("/user-settings") === true;
+	const isSearchSpaceSettingsPage = pathname?.includes("/search-space-settings") === true;
+	const isTeamPage = pathname?.endsWith("/team") === true;
+	const useWorkspacePanel =
+		pathname?.endsWith("/buy-more") === true ||
+		pathname?.endsWith("/more-pages") === true ||
+		isUserSettingsPage ||
+		isSearchSpaceSettingsPage ||
+		isTeamPage;
 
 	return (
 		<>
@@ -724,11 +698,21 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 				onSettings={handleSettings}
 				onManageMembers={handleManageMembers}
 				onUserSettings={handleUserSettings}
+				onAnnouncements={handleAnnouncements}
+				announcementUnreadCount={announcementUnreadCount}
 				onLogout={handleLogout}
-				pageUsage={pageUsage}
 				theme={theme}
 				setTheme={setTheme}
 				isChatPage={isChatPage}
+				useWorkspacePanel={useWorkspacePanel}
+				workspacePanelViewportClassName={
+					isUserSettingsPage || isSearchSpaceSettingsPage || isTeamPage
+						? "items-start justify-center px-6 py-8 md:px-10 md:py-10"
+						: undefined
+				}
+				workspacePanelContentClassName={
+					isUserSettingsPage || isSearchSpaceSettingsPage || isTeamPage ? "max-w-5xl" : undefined
+				}
 				isLoadingChats={isLoadingThreads}
 				activeSlideoutPanel={activeSlideoutPanel}
 				onSlideoutPanelChange={setActiveSlideoutPanel}
@@ -908,10 +892,10 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 				onOpenChange={setIsCreateSearchSpaceDialogOpen}
 			/>
 
-			{/* Settings Dialogs */}
-			<SearchSpaceSettingsDialog searchSpaceId={Number(searchSpaceId)} />
-			<UserSettingsDialog />
-			<TeamDialog searchSpaceId={Number(searchSpaceId)} />
+			<AnnouncementsDialog />
+
+			{/* Agent action log + revert dialog */}
+			<ActionLogDialog />
 		</>
 	);
 }
diff --git a/surfsense_web/components/layout/ui/header/Header.tsx b/surfsense_web/components/layout/ui/header/Header.tsx
index 2c8e41a42c..c6ccfddc6e 100644
--- a/surfsense_web/components/layout/ui/header/Header.tsx
+++ b/surfsense_web/components/layout/ui/header/Header.tsx
@@ -4,12 +4,11 @@ import { useAtomValue } from "jotai";
 import { usePathname } from "next/navigation";
 import { currentThreadAtom } from "@/atoms/chat/current-thread.atom";
 import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
-import { activeTabAtom, tabsAtom } from "@/atoms/tabs/tabs.atom";
+import { activeTabAtom } from "@/atoms/tabs/tabs.atom";
+import { ActionLogButton } from "@/components/agent-action-log/action-log-button";
 import { ChatHeader } from "@/components/new-chat/chat-header";
 import { ChatShareButton } from "@/components/new-chat/chat-share-button";
-import { useIsMobile } from "@/hooks/use-mobile";
 import type { ChatVisibility, ThreadRecord } from "@/lib/chat/thread-persistence";
-import { RightPanelExpandButton } from "../right-panel/RightPanel";
 
 interface HeaderProps {
 	mobileMenuTrigger?: React.ReactNode;
@@ -18,18 +17,26 @@ interface HeaderProps {
 export function Header({ mobileMenuTrigger }: HeaderProps) {
 	const pathname = usePathname();
 	const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
-	const isMobile = useIsMobile();
 	const activeTab = useAtomValue(activeTabAtom);
-	const tabs = useAtomValue(tabsAtom);
 
+	const isFreePage = pathname?.startsWith("/free") ?? false;
 	const isChatPage = pathname?.includes("/new-chat") ?? false;
 	const isDocumentTab = activeTab?.type === "document";
-	const hasTabBar = tabs.length > 1;
 
 	const currentThreadState = useAtomValue(currentThreadAtom);
 
 	const hasThread = isChatPage && !isDocumentTab && currentThreadState.id !== null;
 
+	// Free chat pages have their own header with model selector; only render mobile trigger
+	if (isFreePage) {
+		if (!mobileMenuTrigger) return null;
+		return (
+			<header className="sticky top-0 z-10 flex h-14 shrink-0 items-center gap-2 bg-main-panel/95 backdrop-blur supports-backdrop-filter:bg-main-panel/60 px-4">
+				{mobileMenuTrigger}
+			</header>
+		);
+	}
+
 	const threadForButton: ThreadRecord | null =
 		hasThread && currentThreadState.id !== null
 			? {
@@ -58,10 +65,10 @@ export function Header({ mobileMenuTrigger }: HeaderProps) {
 
 			{/* Right side - Actions */}
 			<div className="ml-auto flex items-center gap-2">
+				{hasThread && <ActionLogButton threadId={currentThreadState.id} />}
 				{hasThread && (
 					<ChatShareButton thread={threadForButton} onVisibilityChange={handleVisibilityChange} />
 				)}
-				{!isMobile && !hasTabBar && <RightPanelExpandButton />}
 			</div>
 		</header>
 	);
diff --git a/surfsense_web/components/layout/ui/icon-rail/IconRail.tsx b/surfsense_web/components/layout/ui/icon-rail/IconRail.tsx
index 062e8dcb7c..a47b00e5c9 100644
--- a/surfsense_web/components/layout/ui/icon-rail/IconRail.tsx
+++ b/surfsense_web/components/layout/ui/icon-rail/IconRail.tsx
@@ -1,11 +1,12 @@
 "use client";
 
-import { Plus } from "lucide-react";
+import { Plus, SquarePen } from "lucide-react";
 import { Button } from "@/components/ui/button";
 import { ScrollArea } from "@/components/ui/scroll-area";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { cn } from "@/lib/utils";
-import type { SearchSpace } from "../../types/layout.types";
+import type { NavItem, SearchSpace, User } from "../../types/layout.types";
+import { SidebarUserProfile } from "../sidebar/SidebarUserProfile";
 import { SearchSpaceAvatar } from "./SearchSpaceAvatar";
 
 interface IconRailProps {
@@ -15,6 +16,17 @@ interface IconRailProps {
 	onSearchSpaceDelete?: (searchSpace: SearchSpace) => void;
 	onSearchSpaceSettings?: (searchSpace: SearchSpace) => void;
 	onAddSearchSpace: () => void;
+	isSingleRailMode?: boolean;
+	onNewChat?: () => void;
+	navItems?: NavItem[];
+	onNavItemClick?: (item: NavItem) => void;
+	user: User;
+	onUserSettings?: () => void;
+	onAnnouncements?: () => void;
+	announcementUnreadCount?: number;
+	onLogout?: () => void;
+	theme?: string;
+	setTheme?: (theme: "light" | "dark" | "system") => void;
 	className?: string;
 }
 
@@ -25,11 +37,45 @@ export function IconRail({
 	onSearchSpaceDelete,
 	onSearchSpaceSettings,
 	onAddSearchSpace,
+	isSingleRailMode = false,
+	onNewChat,
+	navItems = [],
+	onNavItemClick,
+	user,
+	onUserSettings,
+	onAnnouncements,
+	announcementUnreadCount = 0,
+	onLogout,
+	theme,
+	setTheme,
 	className,
 }: IconRailProps) {
+	const actionItems = isSingleRailMode
+		? [
+				...(onNewChat
+					? [
+							{
+								key: "new-chat",
+								label: "New chat",
+								onClick: onNewChat,
+								icon: SquarePen,
+								isActive: false,
+							},
+						]
+					: []),
+				...navItems.map((item) => ({
+					key: item.url,
+					label: item.title,
+					onClick: () => onNavItemClick?.(item),
+					icon: item.icon,
+					isActive: !!item.isActive,
+				})),
+			]
+		: [];
+
 	return (
-		<div className={cn("flex h-full w-14 flex-col items-center", className)}>
-			<ScrollArea className="w-full">
+		<div className={cn("flex h-full w-14 min-h-0 flex-col items-center", className)}>
+			<ScrollArea className="w-full min-h-0 flex-1">
 				<div className="flex flex-col items-center gap-2 px-1.5 py-3">
 					{searchSpaces.map((searchSpace) => (
 						<SearchSpaceAvatar
@@ -63,8 +109,45 @@ export function IconRail({
 							Add search space
 						</TooltipContent>
 					</Tooltip>
+
+					{actionItems.length > 0 && (
+						<>
+							<div className="my-1 h-px w-8 bg-border/60" />
+							{actionItems.map(({ key, label, onClick, icon: Icon, isActive }) => (
+								<Tooltip key={key}>
+									<TooltipTrigger asChild>
+										<Button
+											variant="ghost"
+											size="icon"
+											onClick={onClick}
+											className={cn(
+												"h-10 w-10 rounded-lg",
+												isActive && "bg-accent text-accent-foreground"
+											)}
+										>
+											<Icon className="h-4 w-4" />
+											<span className="sr-only">{label}</span>
+										</Button>
+									</TooltipTrigger>
+									<TooltipContent side="right" sideOffset={8}>
+										{label}
+									</TooltipContent>
+								</Tooltip>
+							))}
+						</>
+					)}
 				</div>
 			</ScrollArea>
+			<SidebarUserProfile
+				user={user}
+				onUserSettings={onUserSettings}
+				onAnnouncements={onAnnouncements}
+				announcementUnreadCount={announcementUnreadCount}
+				onLogout={onLogout}
+				isCollapsed
+				theme={theme}
+				setTheme={setTheme}
+			/>
 		</div>
 	);
 }
diff --git a/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx b/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx
index 641e5f5afb..944536c8f7 100644
--- a/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx
+++ b/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx
@@ -3,18 +3,17 @@
 import { Settings, Trash2, Users } from "lucide-react";
 import { useTranslations } from "next-intl";
 import { useCallback, useRef, useState } from "react";
+import { Button } from "@/components/ui/button";
 import {
 	ContextMenu,
 	ContextMenuContent,
 	ContextMenuItem,
-	ContextMenuSeparator,
 	ContextMenuTrigger,
 } from "@/components/ui/context-menu";
 import {
 	DropdownMenu,
 	DropdownMenuContent,
 	DropdownMenuItem,
-	DropdownMenuSeparator,
 	DropdownMenuTrigger,
 } from "@/components/ui/dropdown-menu";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
@@ -122,14 +121,16 @@ export function SearchSpaceAvatar({
 	);
 
 	const avatarButton = (
-		<button
+		<Button
 			type="button"
+			variant="ghost"
+			size="icon"
 			onClick={onClick}
 			className={cn(
-				"relative flex items-center justify-center rounded-lg font-semibold text-white transition-all select-none",
-				"hover:opacity-90 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring",
+				"relative rounded-lg font-semibold text-white transition-all select-none",
+				"hover:text-white hover:opacity-90 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring",
 				sizeClasses,
-				isActive && "ring-2 ring-primary ring-offset-1 ring-offset-background"
+				isActive && "ring-2 ring-primary ring-offset-1 ring-offset-rail"
 			)}
 			style={{ backgroundColor: bgColor }}
 		>
@@ -138,15 +139,15 @@ export function SearchSpaceAvatar({
 			{isShared && (
 				<span
 					className={cn(
-						"absolute -top-1 -right-1 flex items-center justify-center rounded-full bg-blue-500 text-white shadow-sm",
+						"absolute -top-1 -right-1 flex items-center justify-center rounded-full bg-gray-800 text-white shadow-sm",
 						size === "sm" ? "h-3.5 w-3.5" : "h-4 w-4"
 					)}
 					title={tCommon("shared")}
 				>
-					<Users className={cn(size === "sm" ? "h-2 w-2" : "h-2.5 w-2.5")} />
+					<Users className={cn(size === "sm" ? "size-2" : "size-2.5")} />
 				</span>
 			)}
-		</button>
+		</Button>
 	);
 
 	const menuItems = (
@@ -157,7 +158,6 @@ export function SearchSpaceAvatar({
 					{tCommon("settings")}
 				</DropdownMenuItem>
 			)}
-			{onSettings && onDelete && <DropdownMenuSeparator />}
 			{onDelete && isOwner && (
 				<DropdownMenuItem onClick={onDelete}>
 					<Trash2 className="mr-2 h-4 w-4" />
@@ -190,7 +190,7 @@ export function SearchSpaceAvatar({
 							{avatarButton}
 						</div>
 					</DropdownMenuTrigger>
-					<DropdownMenuContent className="w-48">{menuItems}</DropdownMenuContent>
+					<DropdownMenuContent>{menuItems}</DropdownMenuContent>
 				</DropdownMenu>
 			);
 		}
@@ -208,14 +208,13 @@ export function SearchSpaceAvatar({
 						{tooltipContent}
 					</TooltipContent>
 				</Tooltip>
-				<ContextMenuContent className="w-48">
+				<ContextMenuContent>
 					{onSettings && (
 						<ContextMenuItem onClick={onSettings}>
 							<Settings className="mr-2 h-4 w-4" />
 							{tCommon("settings")}
 						</ContextMenuItem>
 					)}
-					{onSettings && onDelete && <ContextMenuSeparator />}
 					{onDelete && isOwner && (
 						<ContextMenuItem onClick={onDelete}>
 							<Trash2 className="mr-2 h-4 w-4" />
diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
index febae35d39..b379e58e38 100644
--- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
+++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
@@ -1,16 +1,18 @@
 "use client";
 
 import { useAtom, useAtomValue, useSetAtom } from "jotai";
-import { PanelRight, PanelRightClose } from "lucide-react";
+import { PanelRight } from "lucide-react";
 import dynamic from "next/dynamic";
-import { startTransition, useEffect } from "react";
-import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
+import { type MouseEvent, startTransition, useEffect } from "react";
 import { closeReportPanelAtom, reportPanelAtom } from "@/atoms/chat/report-panel.atom";
+import { citationPanelAtom, closeCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
 import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms";
 import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
 import { Button } from "@/components/ui/button";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl";
+import { cn } from "@/lib/utils";
 import { DocumentsSidebar } from "../sidebar";
 
 const EditorPanelContent = dynamic(
@@ -21,9 +23,17 @@ const EditorPanelContent = dynamic(
 	{ ssr: false, loading: () => null }
 );
 
+const CitationPanelContent = dynamic(
+	() =>
+		import("@/components/citation-panel/citation-panel").then((m) => ({
+			default: m.CitationPanelContent,
+		})),
+	{ ssr: false, loading: () => null }
+);
+
 const HitlEditPanelContent = dynamic(
 	() =>
-		import("@/components/hitl-edit-panel/hitl-edit-panel").then((m) => ({
+		import("@/features/chat-messages/hitl").then((m) => ({
 			default: m.HitlEditPanelContent,
 		})),
 	{ ssr: false, loading: () => null }
@@ -42,18 +52,86 @@ interface RightPanelProps {
 		open: boolean;
 		onOpenChange: (open: boolean) => void;
 	};
+	showCollapseButton?: boolean;
+	showTopBorder?: boolean;
+}
+
+function isKeyboardClick(event: MouseEvent) {
+	return event.detail === 0;
 }
 
 function CollapseButton({ onClick }: { onClick: () => void }) {
 	return (
 		<Tooltip>
 			<TooltipTrigger asChild>
-				<Button variant="ghost" size="icon" onClick={onClick} className="h-8 w-8 shrink-0">
-					<PanelRightClose className="h-4 w-4" />
+				<Button
+					variant="ghost"
+					size="icon"
+					tabIndex={-1}
+					onClick={(event) => {
+						if (isKeyboardClick(event)) return;
+						onClick();
+					}}
+					className="h-8 w-8 shrink-0 text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+				>
+					<PanelRight className="h-4 w-4" />
 					<span className="sr-only">Collapse panel</span>
 				</Button>
 			</TooltipTrigger>
-			<TooltipContent side="left">Collapse panel</TooltipContent>
+			<TooltipContent side="bottom">Collapse panel</TooltipContent>
+		</Tooltip>
+	);
+}
+
+interface RightPanelToggleButtonProps {
+	className?: string;
+	iconClassName?: string;
+	disabled?: boolean;
+}
+
+export function RightPanelToggleButton({
+	className,
+	iconClassName,
+	disabled = false,
+}: RightPanelToggleButtonProps) {
+	const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
+	const documentsOpen = useAtomValue(documentsSidebarOpenAtom);
+	const reportState = useAtomValue(reportPanelAtom);
+	const editorState = useAtomValue(editorPanelAtom);
+	const hitlEditState = useAtomValue(hitlEditPanelAtom);
+	const citationState = useAtomValue(citationPanelAtom);
+	const reportOpen = reportState.isOpen && !!reportState.reportId;
+	const editorOpen =
+		editorState.isOpen &&
+		(editorState.kind === "document" ? !!editorState.documentId : !!editorState.localFilePath);
+	const hitlEditOpen = hitlEditState.isOpen && !!hitlEditState.onSave;
+	const citationOpen = citationState.isOpen && citationState.chunkId != null;
+	const hasContent = documentsOpen || reportOpen || editorOpen || hitlEditOpen || citationOpen;
+	const label = collapsed ? "Expand panel" : "Collapse panel";
+
+	if (!hasContent) return null;
+
+	return (
+		<Tooltip>
+			<TooltipTrigger asChild>
+				<Button
+					variant="ghost"
+					size="icon"
+					disabled={disabled}
+					onClick={() => {
+						if (disabled) return;
+						startTransition(() => setCollapsed((value) => !value));
+					}}
+					className={cn(
+						"h-8 w-8 shrink-0 text-muted-foreground hover:bg-accent hover:text-accent-foreground",
+						className
+					)}
+				>
+					<PanelRight className={cn("h-4 w-4", iconClassName)} />
+					<span className="sr-only">{label}</span>
+				</Button>
+			</TooltipTrigger>
+			<TooltipContent side="bottom">{label}</TooltipContent>
 		</Tooltip>
 	);
 }
@@ -64,41 +142,42 @@ function CollapseButton({ onClick }: { onClick: () => void }) {
  * inside the Documents header.
  */
 export function RightPanelExpandButton() {
-	const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
+	const [collapsed] = useAtom(rightPanelCollapsedAtom);
 	const documentsOpen = useAtomValue(documentsSidebarOpenAtom);
 	const reportState = useAtomValue(reportPanelAtom);
 	const editorState = useAtomValue(editorPanelAtom);
 	const hitlEditState = useAtomValue(hitlEditPanelAtom);
+	const citationState = useAtomValue(citationPanelAtom);
 	const reportOpen = reportState.isOpen && !!reportState.reportId;
-	const editorOpen = editorState.isOpen && !!editorState.documentId;
+	const editorOpen =
+		editorState.isOpen &&
+		(editorState.kind === "document" ? !!editorState.documentId : !!editorState.localFilePath);
 	const hitlEditOpen = hitlEditState.isOpen && !!hitlEditState.onSave;
-	const hasContent = documentsOpen || reportOpen || editorOpen || hitlEditOpen;
+	const citationOpen = citationState.isOpen && citationState.chunkId != null;
+	const hasContent = documentsOpen || reportOpen || editorOpen || hitlEditOpen || citationOpen;
 
 	if (!collapsed || !hasContent) return null;
 
 	return (
 		<div className="flex shrink-0 items-center px-0.5">
-			<Tooltip>
-				<TooltipTrigger asChild>
-					<Button
-						variant="ghost"
-						size="icon"
-						onClick={() => startTransition(() => setCollapsed(false))}
-						className="h-8 w-8 shrink-0 -m-0.5"
-					>
-						<PanelRight className="h-4 w-4" />
-						<span className="sr-only">Expand panel</span>
-					</Button>
-				</TooltipTrigger>
-				<TooltipContent side="left">Expand panel</TooltipContent>
-			</Tooltip>
+			<RightPanelToggleButton className="-m-0.5" />
 		</div>
 	);
 }
 
-const PANEL_WIDTHS = { sources: 420, report: 640, editor: 640, "hitl-edit": 640 } as const;
+const PANEL_WIDTHS = {
+	sources: 420,
+	report: 640,
+	editor: 640,
+	"hitl-edit": 640,
+	citation: 560,
+} as const;
 
-export function RightPanel({ documentsPanel }: RightPanelProps) {
+export function RightPanel({
+	documentsPanel,
+	showCollapseButton = true,
+	showTopBorder = false,
+}: RightPanelProps) {
 	const [activeTab] = useAtom(rightPanelTabAtom);
 	const reportState = useAtomValue(reportPanelAtom);
 	const closeReport = useSetAtom(closeReportPanelAtom);
@@ -106,54 +185,85 @@ export function RightPanel({ documentsPanel }: RightPanelProps) {
 	const closeEditor = useSetAtom(closeEditorPanelAtom);
 	const hitlEditState = useAtomValue(hitlEditPanelAtom);
 	const closeHitlEdit = useSetAtom(closeHitlEditPanelAtom);
+	const citationState = useAtomValue(citationPanelAtom);
+	const closeCitation = useSetAtom(closeCitationPanelAtom);
 	const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
 
 	const documentsOpen = documentsPanel?.open ?? false;
 	const reportOpen = reportState.isOpen && !!reportState.reportId;
-	const editorOpen = editorState.isOpen && !!editorState.documentId;
+	const editorOpen =
+		editorState.isOpen &&
+		(editorState.kind === "document" ? !!editorState.documentId : !!editorState.localFilePath);
 	const hitlEditOpen = hitlEditState.isOpen && !!hitlEditState.onSave;
+	const citationOpen = citationState.isOpen && citationState.chunkId != null;
 
 	useEffect(() => {
-		if (!reportOpen && !editorOpen && !hitlEditOpen) return;
+		if (!reportOpen && !editorOpen && !hitlEditOpen && !citationOpen) return;
 		const handleKeyDown = (e: KeyboardEvent) => {
 			if (e.key === "Escape") {
 				if (hitlEditOpen) closeHitlEdit();
+				else if (citationOpen) closeCitation();
 				else if (editorOpen) closeEditor();
 				else if (reportOpen) closeReport();
 			}
 		};
 		document.addEventListener("keydown", handleKeyDown);
 		return () => document.removeEventListener("keydown", handleKeyDown);
-	}, [reportOpen, editorOpen, hitlEditOpen, closeReport, closeEditor, closeHitlEdit]);
+	}, [
+		reportOpen,
+		editorOpen,
+		hitlEditOpen,
+		citationOpen,
+		closeReport,
+		closeEditor,
+		closeHitlEdit,
+		closeCitation,
+	]);
 
-	const isVisible = (documentsOpen || reportOpen || editorOpen || hitlEditOpen) && !collapsed;
+	const isVisible =
+		(documentsOpen || reportOpen || editorOpen || hitlEditOpen || citationOpen) && !collapsed;
 
 	let effectiveTab = activeTab;
 	if (effectiveTab === "hitl-edit" && !hitlEditOpen) {
+		effectiveTab = citationOpen
+			? "citation"
+			: editorOpen
+				? "editor"
+				: reportOpen
+					? "report"
+					: "sources";
+	} else if (effectiveTab === "citation" && !citationOpen) {
 		effectiveTab = editorOpen ? "editor" : reportOpen ? "report" : "sources";
 	} else if (effectiveTab === "editor" && !editorOpen) {
-		effectiveTab = reportOpen ? "report" : "sources";
+		effectiveTab = citationOpen ? "citation" : reportOpen ? "report" : "sources";
 	} else if (effectiveTab === "report" && !reportOpen) {
-		effectiveTab = editorOpen ? "editor" : "sources";
+		effectiveTab = citationOpen ? "citation" : editorOpen ? "editor" : "sources";
 	} else if (effectiveTab === "sources" && !documentsOpen) {
 		effectiveTab = hitlEditOpen
 			? "hitl-edit"
-			: editorOpen
-				? "editor"
-				: reportOpen
-					? "report"
-					: "sources";
+			: citationOpen
+				? "citation"
+				: editorOpen
+					? "editor"
+					: reportOpen
+						? "report"
+						: "sources";
 	}
 
 	const targetWidth = PANEL_WIDTHS[effectiveTab];
-	const collapseButton = <CollapseButton onClick={() => setCollapsed(true)} />;
+	const collapseButton = showCollapseButton ? (
+		<CollapseButton onClick={() => setCollapsed(true)} />
+	) : null;
 
 	if (!isVisible) return null;
 
 	return (
 		<aside
 			style={{ width: targetWidth }}
-			className="flex h-full shrink-0 flex-col rounded-xl border bg-sidebar text-sidebar-foreground overflow-hidden transition-[width] duration-200 ease-out"
+			className={cn(
+				"flex h-full shrink-0 flex-col border-l bg-panel text-sidebar-foreground overflow-hidden transition-[width] duration-200 ease-out",
+				showTopBorder && "border-t"
+			)}
 		>
 			<div className="relative flex-1 min-h-0 overflow-hidden">
 				{effectiveTab === "sources" && documentsOpen && documentsPanel && (
@@ -179,8 +289,10 @@ export function RightPanel({ documentsPanel }: RightPanelProps) {
 				{effectiveTab === "editor" && editorOpen && (
 					<div className="h-full flex flex-col">
 						<EditorPanelContent
-							documentId={editorState.documentId as number}
-							searchSpaceId={editorState.searchSpaceId as number}
+							kind={editorState.kind}
+							documentId={editorState.documentId ?? undefined}
+							localFilePath={editorState.localFilePath ?? undefined}
+							searchSpaceId={editorState.searchSpaceId ?? undefined}
 							title={editorState.title}
 							onClose={closeEditor}
 						/>
@@ -199,6 +311,11 @@ export function RightPanel({ documentsPanel }: RightPanelProps) {
 						/>
 					</div>
 				)}
+				{effectiveTab === "citation" && citationOpen && citationState.chunkId != null && (
+					<div className="h-full flex flex-col">
+						<CitationPanelContent chunkId={citationState.chunkId} onClose={closeCitation} />
+					</div>
+				)}
 			</div>
 		</aside>
 	);
diff --git a/surfsense_web/components/layout/ui/shell/LayoutShell.tsx b/surfsense_web/components/layout/ui/shell/LayoutShell.tsx
index 346e3fa9e2..2f799929bd 100644
--- a/surfsense_web/components/layout/ui/shell/LayoutShell.tsx
+++ b/surfsense_web/components/layout/ui/shell/LayoutShell.tsx
@@ -2,31 +2,85 @@
 
 import { useAtomValue } from "jotai";
 import { AnimatePresence, motion } from "motion/react";
+import dynamic from "next/dynamic";
 import { useCallback, useMemo, useState } from "react";
 import { activeTabAtom, type Tab } from "@/atoms/tabs/tabs.atom";
+import { Logo } from "@/components/Logo";
+import { Spinner } from "@/components/ui/spinner";
 import { TooltipProvider } from "@/components/ui/tooltip";
 import type { InboxItem } from "@/hooks/use-inbox";
 import { useIsMobile } from "@/hooks/use-mobile";
+import { useElectronAPI } from "@/hooks/use-platform";
 import { cn } from "@/lib/utils";
 import { SidebarProvider, useSidebarState } from "../../hooks";
-import { useSidebarResize } from "../../hooks/useSidebarResize";
+import {
+	SIDEBAR_MAX_WIDTH,
+	SIDEBAR_MIN_WIDTH,
+	useSidebarResize,
+} from "../../hooks/useSidebarResize";
 import type { ChatItem, NavItem, PageUsage, SearchSpace, User } from "../../types/layout.types";
 import { Header } from "../header";
 import { IconRail } from "../icon-rail";
-import { RightPanel, RightPanelExpandButton } from "../right-panel/RightPanel";
+import {
+	RightPanel,
+	RightPanelExpandButton,
+	RightPanelToggleButton,
+} from "../right-panel/RightPanel";
 import {
 	AllPrivateChatsSidebarContent,
 	AllSharedChatsSidebarContent,
-	AnnouncementsSidebarContent,
 	DocumentsSidebar,
 	InboxSidebarContent,
 	MobileSidebar,
 	MobileSidebarTrigger,
 	Sidebar,
+	SidebarCollapseButton,
 } from "../sidebar";
 import { SidebarSlideOutPanel } from "../sidebar/SidebarSlideOutPanel";
-import { DocumentTabContent } from "../tabs/DocumentTabContent";
 import { TabBar } from "../tabs/TabBar";
+import { WorkspacePanel } from "./WorkspacePanel";
+
+const DocumentTabContent = dynamic(
+	() => import("../tabs/DocumentTabContent").then((m) => ({ default: m.DocumentTabContent })),
+	{
+		ssr: false,
+		loading: () => (
+			<div className="flex-1 flex items-center justify-center h-full">
+				<Spinner size="lg" />
+			</div>
+		),
+	}
+);
+
+function MacDesktopTitleBar({
+	isSidebarCollapsed,
+	onToggleSidebar,
+	disableRightPanelToggle = false,
+}: {
+	isSidebarCollapsed: boolean;
+	onToggleSidebar: () => void;
+	disableRightPanelToggle?: boolean;
+}) {
+	return (
+		<div className="flex h-9 shrink-0 items-center bg-rail px-2 [app-region:drag] [-webkit-app-region:drag]">
+			<div className="ml-[72px] flex h-full items-center [app-region:no-drag] [-webkit-app-region:no-drag]">
+				<SidebarCollapseButton
+					isCollapsed={isSidebarCollapsed}
+					onToggle={onToggleSidebar}
+					className="h-6 w-6 rounded-md"
+					iconClassName="h-3.5 w-3.5"
+				/>
+			</div>
+			<div className="ml-auto flex h-full items-center [app-region:no-drag] [-webkit-app-region:no-drag]">
+				<RightPanelToggleButton
+					disabled={disableRightPanelToggle}
+					className="h-6 w-6 rounded-md"
+					iconClassName="h-3.5 w-3.5"
+				/>
+			</div>
+		</div>
+	);
+}
 
 // Per-tab data source
 interface TabDataSource {
@@ -40,7 +94,7 @@ interface TabDataSource {
 	markAllAsRead: () => Promise<boolean>;
 }
 
-export type ActiveSlideoutPanel = "inbox" | "shared" | "private" | "announcements" | null;
+export type ActiveSlideoutPanel = "inbox" | "shared" | "private" | null;
 
 // Inbox-related props — per-tab data sources with independent loading/pagination
 interface InboxProps {
@@ -74,12 +128,17 @@ interface LayoutShellProps {
 	onSettings?: () => void;
 	onManageMembers?: () => void;
 	onUserSettings?: () => void;
+	onAnnouncements?: () => void;
+	announcementUnreadCount?: number;
 	onLogout?: () => void;
 	pageUsage?: PageUsage;
 	theme?: string;
 	setTheme?: (theme: "light" | "dark" | "system") => void;
 	defaultCollapsed?: boolean;
 	isChatPage?: boolean;
+	useWorkspacePanel?: boolean;
+	workspacePanelViewportClassName?: string;
+	workspacePanelContentClassName?: string;
 	children: React.ReactNode;
 	className?: string;
 	// Unified slide-out panel state
@@ -108,25 +167,31 @@ function MainContentPanel({
 	isChatPage,
 	onTabSwitch,
 	onNewChat,
+	showRightPanelExpandButton = true,
+	showTopBorder = false,
 	children,
 }: {
 	isChatPage: boolean;
 	onTabSwitch?: (tab: Tab) => void;
 	onNewChat?: () => void;
+	showRightPanelExpandButton?: boolean;
+	showTopBorder?: boolean;
 	children: React.ReactNode;
 }) {
 	const activeTab = useAtomValue(activeTabAtom);
 	const isDocumentTab = activeTab?.type === "document";
 
 	return (
-		<div className="relative flex flex-1 flex-col min-w-0">
+		<div
+			className={cn("relative isolate flex flex-1 flex-col min-w-0", showTopBorder && "border-t")}
+		>
 			<TabBar
 				onTabSwitch={onTabSwitch}
 				onNewChat={onNewChat}
-				rightActions={<RightPanelExpandButton />}
+				rightActions={showRightPanelExpandButton ? <RightPanelExpandButton /> : null}
 				className="min-w-0"
 			/>
-			<div className="relative flex flex-1 flex-col rounded-xl border bg-main-panel overflow-hidden min-w-0">
+			<div className="relative flex flex-1 flex-col bg-panel overflow-hidden min-w-0">
 				<Header />
 
 				{isDocumentTab && activeTab.documentId && activeTab.searchSpaceId ? (
@@ -148,6 +213,10 @@ function MainContentPanel({
 	);
 }
 
+function DesktopWorkspaceRegion({ children }: { children: React.ReactNode }) {
+	return <div className="flex h-full min-w-0 flex-1 -mr-2">{children}</div>;
+}
+
 export function LayoutShell({
 	searchSpaces,
 	activeSearchSpaceId,
@@ -172,12 +241,17 @@ export function LayoutShell({
 	onSettings,
 	onManageMembers,
 	onUserSettings,
+	onAnnouncements,
+	announcementUnreadCount = 0,
 	onLogout,
 	pageUsage,
 	theme,
 	setTheme,
 	defaultCollapsed = false,
 	isChatPage = false,
+	useWorkspacePanel = false,
+	workspacePanelViewportClassName,
+	workspacePanelContentClassName,
 	children,
 	className,
 	activeSlideoutPanel = null,
@@ -190,18 +264,20 @@ export function LayoutShell({
 	onTabSwitch,
 }: LayoutShellProps) {
 	const isMobile = useIsMobile();
+	const electronAPI = useElectronAPI();
+	const isMacDesktop = electronAPI?.versions.platform === "darwin";
 	const [mobileMenuOpen, setMobileMenuOpen] = useState(false);
 	const { isCollapsed, setIsCollapsed, toggleCollapsed } = useSidebarState(defaultCollapsed);
 	const {
 		sidebarWidth,
-		handleMouseDown: onResizeMouseDown,
+		handlePointerDown: onResizePointerDown,
 		isDragging: isResizing,
 	} = useSidebarResize();
 
 	// Memoize context value to prevent unnecessary re-renders
 	const sidebarContextValue = useMemo(
-		() => ({ isCollapsed, setIsCollapsed, toggleCollapsed, sidebarWidth }),
-		[isCollapsed, setIsCollapsed, toggleCollapsed, sidebarWidth]
+		() => ({ isCollapsed, setIsCollapsed, toggleCollapsed }),
+		[isCollapsed, setIsCollapsed, toggleCollapsed]
 	);
 
 	const closeSlideout = useCallback(
@@ -220,16 +296,14 @@ export function LayoutShell({
 				? "Shared Chats"
 				: activeSlideoutPanel === "private"
 					? "Private Chats"
-					: activeSlideoutPanel === "announcements"
-						? "Announcements"
-						: "Panel";
+					: "Panel";
 
 	// Mobile layout
 	if (isMobile) {
 		return (
 			<SidebarProvider value={sidebarContextValue}>
 				<TooltipProvider delayDuration={0}>
-					<div className={cn("flex h-screen w-full flex-col bg-main-panel", className)}>
+					<div className={cn("flex h-screen w-full flex-col bg-panel", className)}>
 						<Header
 							mobileMenuTrigger={<MobileSidebarTrigger onClick={() => setMobileMenuOpen(true)} />}
 						/>
@@ -260,6 +334,8 @@ export function LayoutShell({
 							onSettings={onSettings}
 							onManageMembers={onManageMembers}
 							onUserSettings={onUserSettings}
+							onAnnouncements={onAnnouncements}
+							announcementUnreadCount={announcementUnreadCount}
 							onLogout={onLogout}
 							pageUsage={pageUsage}
 							theme={theme}
@@ -267,9 +343,18 @@ export function LayoutShell({
 							isLoadingChats={isLoadingChats}
 						/>
 
-						<main className={cn("flex-1", isChatPage ? "overflow-hidden" : "overflow-auto")}>
-							{children}
-						</main>
+						{useWorkspacePanel ? (
+							<WorkspacePanel
+								viewportClassName={workspacePanelViewportClassName}
+								contentClassName={workspacePanelContentClassName}
+							>
+								{children}
+							</WorkspacePanel>
+						) : (
+							<main className={cn("flex-1", isChatPage ? "overflow-hidden" : "overflow-auto")}>
+								{children}
+							</main>
+						)}
 
 						{/* Mobile unified slide-out panel */}
 						<SidebarSlideOutPanel
@@ -296,21 +381,6 @@ export function LayoutShell({
 										/>
 									</motion.div>
 								)}
-								{activeSlideoutPanel === "announcements" && (
-									<motion.div
-										key="announcements"
-										className="h-full flex flex-col"
-										initial={{ opacity: 0 }}
-										animate={{ opacity: 1 }}
-										exit={{ opacity: 0 }}
-										transition={{ duration: 0.15 }}
-									>
-										<AnnouncementsSidebarContent
-											onOpenChange={(open) => closeSlideout(open)}
-											onCloseMobileSidebar={() => setMobileMenuOpen(false)}
-										/>
-									</motion.div>
-								)}
 								{activeSlideoutPanel === "shared" && allSharedChatsPanel && (
 									<motion.div
 										key="shared"
@@ -363,161 +433,200 @@ export function LayoutShell({
 	return (
 		<SidebarProvider value={sidebarContextValue}>
 			<TooltipProvider delayDuration={0}>
-				<div
-					className={cn("flex h-screen w-full gap-2 p-2 overflow-hidden bg-muted/40", className)}
-				>
-					<div className="hidden md:flex overflow-hidden">
-						<IconRail
-							searchSpaces={searchSpaces}
-							activeSearchSpaceId={activeSearchSpaceId}
-							onSearchSpaceSelect={onSearchSpaceSelect}
-							onSearchSpaceDelete={onSearchSpaceDelete}
-							onSearchSpaceSettings={onSearchSpaceSettings}
-							onAddSearchSpace={onAddSearchSpace}
+				<div className={cn("flex h-screen w-full flex-col overflow-hidden bg-rail", className)}>
+					{isMacDesktop ? (
+						<MacDesktopTitleBar
+							isSidebarCollapsed={isCollapsed}
+							onToggleSidebar={toggleCollapsed}
+							disableRightPanelToggle={useWorkspacePanel}
 						/>
-					</div>
-
-					{/* Sidebar + slide-out panels share one container; overflow visible so panels can overlay main content */}
-					<div
-						className={cn(
-							"relative hidden md:flex shrink-0 border bg-sidebar z-20 transition-[border-radius,border-color] duration-200",
-							anySlideOutOpen ? "rounded-l-xl border-r-0 delay-0" : "rounded-xl delay-150"
-						)}
-					>
-						<Sidebar
-							searchSpace={searchSpace}
-							isCollapsed={isCollapsed}
-							onToggleCollapse={toggleCollapsed}
-							navItems={navItems}
-							onNavItemClick={onNavItemClick}
-							chats={chats}
-							sharedChats={sharedChats}
-							activeChatId={activeChatId}
-							onNewChat={onNewChat}
-							onChatSelect={onChatSelect}
-							onChatRename={onChatRename}
-							onChatDelete={onChatDelete}
-							onChatArchive={onChatArchive}
-							onViewAllSharedChats={onViewAllSharedChats}
-							onViewAllPrivateChats={onViewAllPrivateChats}
-							isSharedChatsPanelOpen={activeSlideoutPanel === "shared"}
-							isPrivateChatsPanelOpen={activeSlideoutPanel === "private"}
-							user={user}
-							onSettings={onSettings}
-							onManageMembers={onManageMembers}
-							onUserSettings={onUserSettings}
-							onLogout={onLogout}
-							pageUsage={pageUsage}
-							theme={theme}
-							setTheme={setTheme}
+					) : null}
+					<div className="flex min-h-0 flex-1 w-full gap-2 px-2 py-0 overflow-hidden">
+						<div
 							className={cn(
-								"flex shrink-0 transition-[border-radius] duration-200",
-								anySlideOutOpen ? "rounded-l-xl delay-0" : "rounded-xl delay-150"
+								"hidden md:flex overflow-hidden -mr-2 pr-2 bg-rail",
+								!isMacDesktop && "border-r"
 							)}
-							isLoadingChats={isLoadingChats}
-							sidebarWidth={sidebarWidth}
-							isResizing={isResizing}
-						/>
-
-						{/* Unified slide-out panel — shell stays open, content cross-fades */}
-						<SidebarSlideOutPanel
-							open={anySlideOutOpen}
-							onOpenChange={closeSlideout}
-							ariaLabel={panelAriaLabel}
 						>
-							<AnimatePresence mode="popLayout" initial={false}>
-								{activeSlideoutPanel === "inbox" && inbox && (
-									<motion.div
-										key="inbox"
-										className="h-full flex flex-col"
-										initial={{ opacity: 0 }}
-										animate={{ opacity: 1 }}
-										exit={{ opacity: 0 }}
-										transition={{ duration: 0.15 }}
-									>
-										<InboxSidebarContent
-											onOpenChange={(open) => closeSlideout(open)}
-											comments={inbox.comments}
-											status={inbox.status}
-											totalUnreadCount={inbox.totalUnreadCount}
-										/>
-									</motion.div>
-								)}
-								{activeSlideoutPanel === "announcements" && (
-									<motion.div
-										key="announcements"
-										className="h-full flex flex-col"
-										initial={{ opacity: 0 }}
-										animate={{ opacity: 1 }}
-										exit={{ opacity: 0 }}
-										transition={{ duration: 0.15 }}
-									>
-										<AnnouncementsSidebarContent onOpenChange={(open) => closeSlideout(open)} />
-									</motion.div>
-								)}
-								{activeSlideoutPanel === "shared" && allSharedChatsPanel && (
-									<motion.div
-										key="shared"
-										className="h-full flex flex-col"
-										initial={{ opacity: 0 }}
-										animate={{ opacity: 1 }}
-										exit={{ opacity: 0 }}
-										transition={{ duration: 0.15 }}
-									>
-										<AllSharedChatsSidebarContent
-											onOpenChange={(open) => closeSlideout(open)}
-											searchSpaceId={allSharedChatsPanel.searchSpaceId}
-										/>
-									</motion.div>
-								)}
-								{activeSlideoutPanel === "private" && allPrivateChatsPanel && (
-									<motion.div
-										key="private"
-										className="h-full flex flex-col"
-										initial={{ opacity: 0 }}
-										animate={{ opacity: 1 }}
-										exit={{ opacity: 0 }}
-										transition={{ duration: 0.15 }}
-									>
-										<AllPrivateChatsSidebarContent
-											onOpenChange={(open) => closeSlideout(open)}
-											searchSpaceId={allPrivateChatsPanel.searchSpaceId}
-										/>
-									</motion.div>
-								)}
-							</AnimatePresence>
-						</SidebarSlideOutPanel>
-					</div>
+							<IconRail
+								searchSpaces={searchSpaces}
+								activeSearchSpaceId={activeSearchSpaceId}
+								onSearchSpaceSelect={onSearchSpaceSelect}
+								onSearchSpaceDelete={onSearchSpaceDelete}
+								onSearchSpaceSettings={onSearchSpaceSettings}
+								onAddSearchSpace={onAddSearchSpace}
+								isSingleRailMode={false}
+								user={user}
+								onUserSettings={onUserSettings}
+								onAnnouncements={onAnnouncements}
+								announcementUnreadCount={announcementUnreadCount}
+								onLogout={onLogout}
+								theme={theme}
+								setTheme={setTheme}
+							/>
+						</div>
 
-					{/* Resize handle — negative margins eat the flex gap so spacing stays unchanged */}
-					{!isCollapsed && (
+						{/* Sidebar + slide-out panels share one container; overflow visible so panels can overlay main content. Negative right margin closes the flex gap so the sidebar sits flush against the main panel, separated only by a border. */}
 						<div
-							role="slider"
-							aria-label="Resize sidebar"
-							aria-valuemin={0}
-							aria-valuemax={100}
-							aria-valuenow={50}
-							tabIndex={0}
-							onMouseDown={onResizeMouseDown}
-							className="hidden md:block h-full cursor-col-resize z-30 focus:outline-none"
-							style={{ width: 8, marginLeft: -8, marginRight: -8 }}
-						/>
-					)}
+							className={cn(
+								"relative hidden md:flex shrink-0 z-20 -mr-2 bg-panel",
+								isMacDesktop ? "rounded-tl-xl border-t border-r border-l" : "border-r"
+							)}
+						>
+							<Sidebar
+								searchSpace={searchSpace}
+								isCollapsed={isCollapsed}
+								onToggleCollapse={toggleCollapsed}
+								navItems={navItems}
+								onNavItemClick={onNavItemClick}
+								chats={chats}
+								sharedChats={sharedChats}
+								activeChatId={activeChatId}
+								onNewChat={onNewChat}
+								onChatSelect={onChatSelect}
+								onChatRename={onChatRename}
+								onChatDelete={onChatDelete}
+								onChatArchive={onChatArchive}
+								onViewAllSharedChats={onViewAllSharedChats}
+								onViewAllPrivateChats={onViewAllPrivateChats}
+								isSharedChatsPanelOpen={activeSlideoutPanel === "shared"}
+								isPrivateChatsPanelOpen={activeSlideoutPanel === "private"}
+								user={user}
+								onSettings={onSettings}
+								onManageMembers={onManageMembers}
+								onUserSettings={onUserSettings}
+								onAnnouncements={onAnnouncements}
+								announcementUnreadCount={announcementUnreadCount}
+								onLogout={onLogout}
+								pageUsage={pageUsage}
+								theme={theme}
+								setTheme={setTheme}
+								renderUserProfile={false}
+								renderCollapseButton={!isMacDesktop}
+								collapsedHeaderContent={
+									isMacDesktop ? (
+										<Logo disableLink priority className="h-7 w-7 rounded-md" />
+									) : undefined
+								}
+								className={cn("flex shrink-0", isMacDesktop && "rounded-tl-xl")}
+								isLoadingChats={isLoadingChats}
+								sidebarWidth={sidebarWidth}
+								isResizing={isResizing}
+							/>
 
-					{/* Main content panel */}
-					<MainContentPanel isChatPage={isChatPage} onTabSwitch={onTabSwitch} onNewChat={onNewChat}>
-						{children}
-					</MainContentPanel>
+							{!isCollapsed && (
+								<hr
+									aria-orientation="vertical"
+									aria-label="Resize sidebar"
+									aria-valuemin={SIDEBAR_MIN_WIDTH}
+									aria-valuemax={SIDEBAR_MAX_WIDTH}
+									aria-valuenow={sidebarWidth}
+									tabIndex={0}
+									onPointerDown={onResizePointerDown}
+									style={{ touchAction: "none" }}
+									className={cn(
+										"absolute top-0 right-0 h-full w-4 translate-x-1/2 z-50 m-0 border-0 bg-transparent p-0 select-none cursor-col-resize",
+										"after:content-[''] after:absolute after:inset-y-0 after:left-1/2 after:w-px after:-translate-x-1/2 after:bg-transparent hover:after:bg-border/80 after:transition-colors",
+										isResizing && "after:bg-border"
+									)}
+								/>
+							)}
 
-					{/* Right panel — tabbed Sources/Report (desktop only) */}
-					{documentsPanel && (
-						<RightPanel
-							documentsPanel={{
-								open: documentsPanel.open,
-								onOpenChange: documentsPanel.onOpenChange,
-							}}
-						/>
-					)}
+							{/* Unified slide-out panel — shell stays open, content cross-fades */}
+							<SidebarSlideOutPanel
+								open={anySlideOutOpen}
+								onOpenChange={closeSlideout}
+								ariaLabel={panelAriaLabel}
+							>
+								<AnimatePresence mode="popLayout" initial={false}>
+									{activeSlideoutPanel === "inbox" && inbox && (
+										<motion.div
+											key="inbox"
+											className="h-full flex flex-col"
+											initial={{ opacity: 0 }}
+											animate={{ opacity: 1 }}
+											exit={{ opacity: 0 }}
+											transition={{ duration: 0.15 }}
+										>
+											<InboxSidebarContent
+												onOpenChange={(open) => closeSlideout(open)}
+												comments={inbox.comments}
+												status={inbox.status}
+												totalUnreadCount={inbox.totalUnreadCount}
+											/>
+										</motion.div>
+									)}
+									{activeSlideoutPanel === "shared" && allSharedChatsPanel && (
+										<motion.div
+											key="shared"
+											className="h-full flex flex-col"
+											initial={{ opacity: 0 }}
+											animate={{ opacity: 1 }}
+											exit={{ opacity: 0 }}
+											transition={{ duration: 0.15 }}
+										>
+											<AllSharedChatsSidebarContent
+												onOpenChange={(open) => closeSlideout(open)}
+												searchSpaceId={allSharedChatsPanel.searchSpaceId}
+											/>
+										</motion.div>
+									)}
+									{activeSlideoutPanel === "private" && allPrivateChatsPanel && (
+										<motion.div
+											key="private"
+											className="h-full flex flex-col"
+											initial={{ opacity: 0 }}
+											animate={{ opacity: 1 }}
+											exit={{ opacity: 0 }}
+											transition={{ duration: 0.15 }}
+										>
+											<AllPrivateChatsSidebarContent
+												onOpenChange={(open) => closeSlideout(open)}
+												searchSpaceId={allPrivateChatsPanel.searchSpaceId}
+											/>
+										</motion.div>
+									)}
+								</AnimatePresence>
+							</SidebarSlideOutPanel>
+						</div>
+
+						<DesktopWorkspaceRegion>
+							{useWorkspacePanel ? (
+								<WorkspacePanel
+									className={isMacDesktop ? "border-t" : undefined}
+									viewportClassName={workspacePanelViewportClassName}
+									contentClassName={workspacePanelContentClassName}
+								>
+									{children}
+								</WorkspacePanel>
+							) : (
+								<>
+									{/* Main content panel */}
+									<MainContentPanel
+										isChatPage={isChatPage}
+										onTabSwitch={onTabSwitch}
+										onNewChat={onNewChat}
+										showRightPanelExpandButton={!isMacDesktop}
+										showTopBorder={isMacDesktop}
+									>
+										{children}
+									</MainContentPanel>
+
+									{/* Right panel — tabbed Sources/Report (desktop only) */}
+									{documentsPanel ? (
+										<RightPanel
+											documentsPanel={{
+												open: documentsPanel.open,
+												onOpenChange: documentsPanel.onOpenChange,
+											}}
+											showCollapseButton={!isMacDesktop}
+											showTopBorder={isMacDesktop}
+										/>
+									) : null}
+								</>
+							)}
+						</DesktopWorkspaceRegion>
+					</div>
 				</div>
 			</TooltipProvider>
 		</SidebarProvider>
diff --git a/surfsense_web/components/layout/ui/shell/WorkspacePanel.tsx b/surfsense_web/components/layout/ui/shell/WorkspacePanel.tsx
new file mode 100644
index 0000000000..e90240a5a3
--- /dev/null
+++ b/surfsense_web/components/layout/ui/shell/WorkspacePanel.tsx
@@ -0,0 +1,39 @@
+import type { ReactNode } from "react";
+import { cn } from "@/lib/utils";
+
+interface WorkspacePanelProps {
+	children: ReactNode;
+	className?: string;
+	viewportClassName?: string;
+	contentClassName?: string;
+}
+
+/**
+ * Full workspace area to the right of the left rail/sidebar.
+ * Use this when a route should own the whole workspace instead of rendering
+ * inside the normal TabBar/Header/main/right-panel chrome.
+ */
+export function WorkspacePanel({
+	children,
+	className,
+	viewportClassName,
+	contentClassName,
+}: WorkspacePanelProps) {
+	return (
+		<main
+			className={cn(
+				"relative isolate flex min-w-0 flex-1 flex-col overflow-hidden bg-panel",
+				className
+			)}
+		>
+			<div
+				className={cn(
+					"flex min-h-0 flex-1 items-center justify-center overflow-auto px-4 py-8",
+					viewportClassName
+				)}
+			>
+				<div className={cn("w-full max-w-md", contentClassName)}>{children}</div>
+			</div>
+		</main>
+	);
+}
diff --git a/surfsense_web/components/layout/ui/shell/index.ts b/surfsense_web/components/layout/ui/shell/index.ts
index d7d96a5747..e58da76c11 100644
--- a/surfsense_web/components/layout/ui/shell/index.ts
+++ b/surfsense_web/components/layout/ui/shell/index.ts
@@ -1 +1,2 @@
 export { LayoutShell } from "./LayoutShell";
+export { WorkspacePanel } from "./WorkspacePanel";
diff --git a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
index 3459fccf60..762a210a9e 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
@@ -1,14 +1,13 @@
 "use client";
 
 import { useQuery, useQueryClient } from "@tanstack/react-query";
-import { format } from "date-fns";
 import { useSetAtom } from "jotai";
 import {
 	ArchiveIcon,
 	ChevronLeft,
 	MessageCircleMore,
 	MoreHorizontal,
-	PenLine,
+	Pencil,
 	RotateCcwIcon,
 	Search,
 	Trash2,
@@ -49,6 +48,7 @@ import {
 	searchThreads,
 	updateThread,
 } from "@/lib/chat/thread-persistence";
+import { formatThreadTimestamp } from "@/lib/format-date";
 import { cn } from "@/lib/utils";
 import { SidebarSlideOutPanel } from "./SidebarSlideOutPanel";
 
@@ -252,37 +252,36 @@ export function AllPrivateChatsSidebarContent({
 
 	return (
 		<>
-			<div className="shrink-0 p-4 pb-2 space-y-3">
+			<div className="shrink-0 p-3 pb-1.5 space-y-2">
 				<div className="flex items-center gap-2">
 					{isMobile && (
 						<Button
 							variant="ghost"
 							size="icon"
-							className="h-8 w-8 rounded-full"
+							className="h-8 w-8 rounded-full text-muted-foreground hover:text-accent-foreground"
 							onClick={() => onOpenChange(false)}
 						>
-							<ChevronLeft className="h-4 w-4 text-muted-foreground" />
+							<ChevronLeft className="h-4 w-4" />
 							<span className="sr-only">{t("close") || "Close"}</span>
 						</Button>
 					)}
-					<User className="h-5 w-5 text-primary" />
 					<h2 className="text-lg font-semibold">{t("chats") || "Private Chats"}</h2>
 				</div>
 
 				<div className="relative">
-					<Search className="absolute left-3 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
+					<Search className="absolute left-2.5 top-1/2 -translate-y-1/2 h-3.5 w-3.5 text-muted-foreground" />
 					<Input
 						type="text"
 						placeholder={t("search_chats") || "Search chats..."}
 						value={searchQuery}
 						onChange={(e) => setSearchQuery(e.target.value)}
-						className="pl-9 pr-8 h-9"
+						className="h-8 border-0 bg-muted pl-8 pr-7 text-sm shadow-none"
 					/>
 					{searchQuery && (
 						<Button
 							variant="ghost"
 							size="icon"
-							className="absolute right-1 top-1/2 -translate-y-1/2 h-6 w-6"
+							className="absolute right-1 top-1/2 h-5 w-5 -translate-y-1/2 rounded-sm text-muted-foreground transition-colors hover:bg-accent hover:text-accent-foreground"
 							onClick={handleClearSearch}
 						>
 							<X className="h-3.5 w-3.5" />
@@ -296,23 +295,23 @@ export function AllPrivateChatsSidebarContent({
 				<Tabs
 					value={showArchived ? "archived" : "active"}
 					onValueChange={(value) => setShowArchived(value === "archived")}
-					className="shrink-0 mx-4 mt-2"
+					className="shrink-0 mx-3 mt-1.5"
 				>
 					<TabsList stretch showBottomBorder size="sm">
 						<TabsTrigger value="active">
 							<span className="inline-flex items-center gap-1.5">
-								<MessageCircleMore className="h-4 w-4" />
+								<MessageCircleMore className="h-3.5 w-3.5" />
 								<span>Active</span>
-								<span className="inline-flex items-center justify-center min-w-5 h-5 px-1.5 rounded-full bg-primary/20 text-muted-foreground text-xs font-medium">
+								<span className="inline-flex h-4.5 min-w-4.5 items-center justify-center rounded-full bg-primary/20 px-1 text-[10px] font-medium text-muted-foreground">
 									{activeCount}
 								</span>
 							</span>
 						</TabsTrigger>
 						<TabsTrigger value="archived">
 							<span className="inline-flex items-center gap-1.5">
-								<ArchiveIcon className="h-4 w-4" />
+								<ArchiveIcon className="h-3.5 w-3.5" />
 								<span>Archived</span>
-								<span className="inline-flex items-center justify-center min-w-5 h-5 px-1.5 rounded-full bg-primary/20 text-muted-foreground text-xs font-medium">
+								<span className="inline-flex h-4.5 min-w-4.5 items-center justify-center rounded-full bg-primary/20 px-1 text-[10px] font-medium text-muted-foreground">
 									{archivedCount}
 								</span>
 							</span>
@@ -321,7 +320,7 @@ export function AllPrivateChatsSidebarContent({
 				</Tabs>
 			)}
 
-			<div className="flex-1 overflow-y-auto overflow-x-hidden p-2">
+			<div className="flex-1 overflow-y-auto overflow-x-hidden p-1.5">
 				{isLoading ? (
 					<div className="space-y-1">
 						{[75, 90, 55, 80, 65, 85].map((titleWidth) => (
@@ -347,19 +346,11 @@ export function AllPrivateChatsSidebarContent({
 							const isActive = currentChatId === thread.id;
 
 							return (
-								<div
-									key={thread.id}
-									className={cn(
-										"sidebar-item-lazy group flex items-center gap-2 rounded-md px-2 py-1.5 text-sm",
-										"hover:bg-accent hover:text-accent-foreground",
-										"transition-colors cursor-pointer",
-										isActive && "bg-accent text-accent-foreground",
-										isBusy && "opacity-50 pointer-events-none"
-									)}
-								>
+								<div key={thread.id} className="group/item relative w-full">
 									{isMobile ? (
-										<button
+										<Button
 											type="button"
+											variant="ghost"
 											onClick={() => {
 												if (wasLongPress()) return;
 												handleThreadClick(thread.id);
@@ -371,114 +362,134 @@ export function AllPrivateChatsSidebarContent({
 											onTouchEnd={longPressHandlers.onTouchEnd}
 											onTouchMove={longPressHandlers.onTouchMove}
 											disabled={isBusy}
-											className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+											className={cn(
+												"h-auto w-full justify-start gap-2 overflow-hidden px-2 py-1.5 text-left font-normal",
+												"group-hover/item:bg-accent group-hover/item:text-accent-foreground",
+												"focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring",
+												isActive && "bg-accent text-accent-foreground",
+												isBusy && "opacity-50 pointer-events-none"
+											)}
 										>
 											<span className="truncate">{thread.title || "New Chat"}</span>
-										</button>
+										</Button>
 									) : (
 										<Tooltip delayDuration={600}>
 											<TooltipTrigger asChild>
-												<button
+												<Button
 													type="button"
+													variant="ghost"
 													onClick={() => handleThreadClick(thread.id)}
 													disabled={isBusy}
-													className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+													className={cn(
+														"h-auto w-full justify-start gap-2 overflow-hidden px-2 py-1.5 text-left font-normal",
+														"group-hover/item:bg-accent group-hover/item:text-accent-foreground",
+														"focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring",
+														isActive && "bg-accent text-accent-foreground",
+														isBusy && "opacity-50 pointer-events-none"
+													)}
 												>
 													<span className="truncate">{thread.title || "New Chat"}</span>
-												</button>
+												</Button>
 											</TooltipTrigger>
 											<TooltipContent side="bottom" align="start">
 												<p>
-													{t("updated") || "Updated"}:{" "}
-													{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
+													{t("updated") || "Updated"}: {formatThreadTimestamp(thread.updatedAt)}
 												</p>
 											</TooltipContent>
 										</Tooltip>
 									)}
 
-									<DropdownMenu
-										open={openDropdownId === thread.id}
-										onOpenChange={(isOpen) => setOpenDropdownId(isOpen ? thread.id : null)}
+									<div
+										className={cn(
+											"pointer-events-none absolute right-0 top-0 bottom-0 flex items-center rounded-r-md pl-6 pr-1",
+											isActive
+												? "bg-gradient-to-l from-accent from-60% to-transparent"
+												: "bg-gradient-to-l from-sidebar from-60% to-transparent group-hover/item:from-accent",
+											isMobile
+												? "opacity-0"
+												: openDropdownId === thread.id
+													? "opacity-100"
+													: "opacity-0 group-hover/item:opacity-100"
+										)}
 									>
-										<DropdownMenuTrigger asChild>
-											<Button
-												variant="ghost"
-												size="icon"
-												className={cn(
-													"h-6 w-6 shrink-0 hover:bg-transparent",
-													isMobile
-														? "opacity-0 pointer-events-none absolute"
-														: openDropdownId === thread.id
-															? "opacity-100"
-															: "md:opacity-0 md:group-hover:opacity-100 md:focus:opacity-100",
-													openDropdownId === thread.id && "bg-accent hover:bg-accent",
-													"transition-opacity"
-												)}
-												disabled={isBusy}
-											>
-												{isDeleting ? (
-													<Spinner size="xs" />
-												) : (
-													<MoreHorizontal className="h-3.5 w-3.5 text-muted-foreground" />
+										<DropdownMenu
+											open={openDropdownId === thread.id}
+											onOpenChange={(isOpen) => setOpenDropdownId(isOpen ? thread.id : null)}
+										>
+											<DropdownMenuTrigger asChild>
+												<Button
+													variant="ghost"
+													size="icon"
+													className={cn(
+														"pointer-events-auto h-6 w-6 hover:bg-transparent",
+														openDropdownId === thread.id && "bg-accent hover:bg-accent"
+													)}
+													disabled={isBusy}
+												>
+													{isDeleting ? (
+														<Spinner size="xs" />
+													) : (
+														<MoreHorizontal className="h-3.5 w-3.5 text-muted-foreground" />
+													)}
+													<span className="sr-only">{t("more_options") || "More options"}</span>
+												</Button>
+											</DropdownMenuTrigger>
+											<DropdownMenuContent align="end" className="w-40 z-80">
+												{!thread.archived && (
+													<DropdownMenuItem
+														onClick={() => handleStartRename(thread.id, thread.title || "New Chat")}
+													>
+														<Pencil className="mr-2 h-4 w-4" />
+														<span>{t("rename") || "Rename"}</span>
+													</DropdownMenuItem>
 												)}
-												<span className="sr-only">{t("more_options") || "More options"}</span>
-											</Button>
-										</DropdownMenuTrigger>
-										<DropdownMenuContent align="end" className="w-40 z-80">
-											{!thread.archived && (
 												<DropdownMenuItem
-													onClick={() => handleStartRename(thread.id, thread.title || "New Chat")}
+													onClick={() => handleToggleArchive(thread.id, thread.archived)}
+													disabled={isArchiving}
 												>
-													<PenLine className="mr-2 h-4 w-4" />
-													<span>{t("rename") || "Rename"}</span>
+													{thread.archived ? (
+														<>
+															<RotateCcwIcon className="mr-2 h-4 w-4" />
+															<span>{t("unarchive") || "Restore"}</span>
+														</>
+													) : (
+														<>
+															<ArchiveIcon className="mr-2 h-4 w-4" />
+															<span>{t("archive") || "Archive"}</span>
+														</>
+													)}
 												</DropdownMenuItem>
-											)}
-											<DropdownMenuItem
-												onClick={() => handleToggleArchive(thread.id, thread.archived)}
-												disabled={isArchiving}
-											>
-												{thread.archived ? (
-													<>
-														<RotateCcwIcon className="mr-2 h-4 w-4" />
-														<span>{t("unarchive") || "Restore"}</span>
-													</>
-												) : (
-													<>
-														<ArchiveIcon className="mr-2 h-4 w-4" />
-														<span>{t("archive") || "Archive"}</span>
-													</>
-												)}
-											</DropdownMenuItem>
-											<DropdownMenuItem onClick={() => handleDeleteThread(thread.id)}>
-												<Trash2 className="mr-2 h-4 w-4" />
-												<span>{t("delete") || "Delete"}</span>
-											</DropdownMenuItem>
-										</DropdownMenuContent>
-									</DropdownMenu>
+												<DropdownMenuItem onClick={() => handleDeleteThread(thread.id)}>
+													<Trash2 className="mr-2 h-4 w-4" />
+													<span>{t("delete") || "Delete"}</span>
+												</DropdownMenuItem>
+											</DropdownMenuContent>
+										</DropdownMenu>
+									</div>
 								</div>
 							);
 						})}
 					</div>
 				) : isSearchMode ? (
 					<div className="text-center py-8">
-						<Search className="h-12 w-12 mx-auto text-muted-foreground mb-3" />
-						<p className="text-sm text-muted-foreground">
+						<Search className="mx-auto mb-2.5 h-10 w-10 text-muted-foreground" />
+						<p className="text-xs text-muted-foreground">
 							{t("no_chats_found") || "No chats found"}
 						</p>
-						<p className="text-xs text-muted-foreground/70 mt-1">
+						<p className="mt-1 text-[11px] text-muted-foreground/70">
 							{t("try_different_search") || "Try a different search term"}
 						</p>
 					</div>
 				) : (
 					<div className="text-center py-8">
-						<User className="h-12 w-12 mx-auto text-muted-foreground mb-3" />
-						<p className="text-sm text-muted-foreground">
+						<User className="mx-auto mb-2.5 h-10 w-10 text-muted-foreground" />
+						<p className="text-xs text-muted-foreground">
 							{showArchived
 								? t("no_archived_chats") || "No archived chats"
 								: t("no_chats") || "No private chats"}
 						</p>
 						{!showArchived && (
-							<p className="text-xs text-muted-foreground/70 mt-1">
+							<p className="mt-1 text-[11px] text-muted-foreground/70">
 								{t("start_new_chat_hint") || "Start a new chat from the chat page"}
 							</p>
 						)}
diff --git a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
index 097d101216..3130dff928 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
@@ -1,14 +1,13 @@
 "use client";
 
 import { useQuery, useQueryClient } from "@tanstack/react-query";
-import { format } from "date-fns";
 import { useSetAtom } from "jotai";
 import {
 	ArchiveIcon,
 	ChevronLeft,
 	MessageCircleMore,
 	MoreHorizontal,
-	PenLine,
+	Pencil,
 	RotateCcwIcon,
 	Search,
 	Trash2,
@@ -49,6 +48,7 @@ import {
 	searchThreads,
 	updateThread,
 } from "@/lib/chat/thread-persistence";
+import { formatThreadTimestamp } from "@/lib/format-date";
 import { cn } from "@/lib/utils";
 import { SidebarSlideOutPanel } from "./SidebarSlideOutPanel";
 
@@ -251,37 +251,36 @@ export function AllSharedChatsSidebarContent({
 
 	return (
 		<>
-			<div className="shrink-0 p-4 pb-2 space-y-3">
+			<div className="shrink-0 p-3 pb-1.5 space-y-2">
 				<div className="flex items-center gap-2">
 					{isMobile && (
 						<Button
 							variant="ghost"
 							size="icon"
-							className="h-8 w-8 rounded-full"
+							className="h-8 w-8 rounded-full text-muted-foreground hover:text-accent-foreground"
 							onClick={() => onOpenChange(false)}
 						>
-							<ChevronLeft className="h-4 w-4 text-muted-foreground" />
+							<ChevronLeft className="h-4 w-4" />
 							<span className="sr-only">{t("close") || "Close"}</span>
 						</Button>
 					)}
-					<Users className="h-5 w-5 text-primary" />
 					<h2 className="text-lg font-semibold">{t("shared_chats") || "Shared Chats"}</h2>
 				</div>
 
 				<div className="relative">
-					<Search className="absolute left-3 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
+					<Search className="absolute left-2.5 top-1/2 -translate-y-1/2 h-3.5 w-3.5 text-muted-foreground" />
 					<Input
 						type="text"
 						placeholder={t("search_chats") || "Search chats..."}
 						value={searchQuery}
 						onChange={(e) => setSearchQuery(e.target.value)}
-						className="pl-9 pr-8 h-9"
+						className="h-8 border-0 bg-muted pl-8 pr-7 text-sm shadow-none"
 					/>
 					{searchQuery && (
 						<Button
 							variant="ghost"
 							size="icon"
-							className="absolute right-1 top-1/2 -translate-y-1/2 h-6 w-6"
+							className="absolute right-1 top-1/2 h-5 w-5 -translate-y-1/2 rounded-sm text-muted-foreground transition-colors hover:bg-accent hover:text-accent-foreground"
 							onClick={handleClearSearch}
 						>
 							<X className="h-3.5 w-3.5" />
@@ -295,23 +294,23 @@ export function AllSharedChatsSidebarContent({
 				<Tabs
 					value={showArchived ? "archived" : "active"}
 					onValueChange={(value) => setShowArchived(value === "archived")}
-					className="shrink-0 mx-4 mt-2"
+					className="shrink-0 mx-3 mt-1.5"
 				>
 					<TabsList stretch showBottomBorder size="sm">
 						<TabsTrigger value="active">
 							<span className="inline-flex items-center gap-1.5">
-								<MessageCircleMore className="h-4 w-4" />
+								<MessageCircleMore className="h-3.5 w-3.5" />
 								<span>Active</span>
-								<span className="inline-flex items-center justify-center min-w-5 h-5 px-1.5 rounded-full bg-primary/20 text-muted-foreground text-xs font-medium">
+								<span className="inline-flex h-4.5 min-w-4.5 items-center justify-center rounded-full bg-primary/20 px-1 text-[10px] font-medium text-muted-foreground">
 									{activeCount}
 								</span>
 							</span>
 						</TabsTrigger>
 						<TabsTrigger value="archived">
 							<span className="inline-flex items-center gap-1.5">
-								<ArchiveIcon className="h-4 w-4" />
+								<ArchiveIcon className="h-3.5 w-3.5" />
 								<span>Archived</span>
-								<span className="inline-flex items-center justify-center min-w-5 h-5 px-1.5 rounded-full bg-primary/20 text-muted-foreground text-xs font-medium">
+								<span className="inline-flex h-4.5 min-w-4.5 items-center justify-center rounded-full bg-primary/20 px-1 text-[10px] font-medium text-muted-foreground">
 									{archivedCount}
 								</span>
 							</span>
@@ -320,7 +319,7 @@ export function AllSharedChatsSidebarContent({
 				</Tabs>
 			)}
 
-			<div className="flex-1 overflow-y-auto overflow-x-hidden p-2">
+			<div className="flex-1 overflow-y-auto overflow-x-hidden p-1.5">
 				{isLoading ? (
 					<div className="space-y-1">
 						{[75, 90, 55, 80, 65, 85].map((titleWidth) => (
@@ -346,19 +345,11 @@ export function AllSharedChatsSidebarContent({
 							const isActive = currentChatId === thread.id;
 
 							return (
-								<div
-									key={thread.id}
-									className={cn(
-										"sidebar-item-lazy group flex items-center gap-2 rounded-md px-2 py-1.5 text-sm",
-										"hover:bg-accent hover:text-accent-foreground",
-										"transition-colors cursor-pointer",
-										isActive && "bg-accent text-accent-foreground",
-										isBusy && "opacity-50 pointer-events-none"
-									)}
-								>
+								<div key={thread.id} className="group/item relative w-full">
 									{isMobile ? (
-										<button
+										<Button
 											type="button"
+											variant="ghost"
 											onClick={() => {
 												if (wasLongPress()) return;
 												handleThreadClick(thread.id);
@@ -370,114 +361,134 @@ export function AllSharedChatsSidebarContent({
 											onTouchEnd={longPressHandlers.onTouchEnd}
 											onTouchMove={longPressHandlers.onTouchMove}
 											disabled={isBusy}
-											className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+											className={cn(
+												"h-auto w-full justify-start gap-2 overflow-hidden rounded-md px-2 py-1.5 text-sm text-left",
+												"group-hover/item:bg-accent group-hover/item:text-accent-foreground",
+												"focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring",
+												isActive && "bg-accent text-accent-foreground",
+												isBusy && "opacity-50 pointer-events-none"
+											)}
 										>
 											<span className="truncate">{thread.title || "New Chat"}</span>
-										</button>
+										</Button>
 									) : (
 										<Tooltip delayDuration={600}>
 											<TooltipTrigger asChild>
-												<button
+												<Button
 													type="button"
+													variant="ghost"
 													onClick={() => handleThreadClick(thread.id)}
 													disabled={isBusy}
-													className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+													className={cn(
+														"h-auto w-full justify-start gap-2 overflow-hidden rounded-md px-2 py-1.5 text-sm text-left",
+														"group-hover/item:bg-accent group-hover/item:text-accent-foreground",
+														"focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring",
+														isActive && "bg-accent text-accent-foreground",
+														isBusy && "opacity-50 pointer-events-none"
+													)}
 												>
 													<span className="truncate">{thread.title || "New Chat"}</span>
-												</button>
+												</Button>
 											</TooltipTrigger>
 											<TooltipContent side="bottom" align="start">
 												<p>
-													{t("updated") || "Updated"}:{" "}
-													{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
+													{t("updated") || "Updated"}: {formatThreadTimestamp(thread.updatedAt)}
 												</p>
 											</TooltipContent>
 										</Tooltip>
 									)}
 
-									<DropdownMenu
-										open={openDropdownId === thread.id}
-										onOpenChange={(isOpen) => setOpenDropdownId(isOpen ? thread.id : null)}
+									<div
+										className={cn(
+											"pointer-events-none absolute right-0 top-0 bottom-0 flex items-center rounded-r-md pl-6 pr-1",
+											isActive
+												? "bg-gradient-to-l from-accent from-60% to-transparent"
+												: "bg-gradient-to-l from-sidebar from-60% to-transparent group-hover/item:from-accent",
+											isMobile
+												? "opacity-0"
+												: openDropdownId === thread.id
+													? "opacity-100"
+													: "opacity-0 group-hover/item:opacity-100"
+										)}
 									>
-										<DropdownMenuTrigger asChild>
-											<Button
-												variant="ghost"
-												size="icon"
-												className={cn(
-													"h-6 w-6 shrink-0 hover:bg-transparent",
-													isMobile
-														? "opacity-0 pointer-events-none absolute"
-														: openDropdownId === thread.id
-															? "opacity-100"
-															: "md:opacity-0 md:group-hover:opacity-100 md:focus:opacity-100",
-													openDropdownId === thread.id && "bg-accent hover:bg-accent",
-													"transition-opacity"
-												)}
-												disabled={isBusy}
-											>
-												{isDeleting ? (
-													<Spinner size="xs" />
-												) : (
-													<MoreHorizontal className="h-3.5 w-3.5 text-muted-foreground" />
+										<DropdownMenu
+											open={openDropdownId === thread.id}
+											onOpenChange={(isOpen) => setOpenDropdownId(isOpen ? thread.id : null)}
+										>
+											<DropdownMenuTrigger asChild>
+												<Button
+													variant="ghost"
+													size="icon"
+													className={cn(
+														"pointer-events-auto h-6 w-6 hover:bg-transparent",
+														openDropdownId === thread.id && "bg-accent hover:bg-accent"
+													)}
+													disabled={isBusy}
+												>
+													{isDeleting ? (
+														<Spinner size="xs" />
+													) : (
+														<MoreHorizontal className="h-3.5 w-3.5 text-muted-foreground" />
+													)}
+													<span className="sr-only">{t("more_options") || "More options"}</span>
+												</Button>
+											</DropdownMenuTrigger>
+											<DropdownMenuContent align="end" className="w-40 z-80">
+												{!thread.archived && (
+													<DropdownMenuItem
+														onClick={() => handleStartRename(thread.id, thread.title || "New Chat")}
+													>
+														<Pencil className="mr-2 h-4 w-4" />
+														<span>{t("rename") || "Rename"}</span>
+													</DropdownMenuItem>
 												)}
-												<span className="sr-only">{t("more_options") || "More options"}</span>
-											</Button>
-										</DropdownMenuTrigger>
-										<DropdownMenuContent align="end" className="w-40 z-80">
-											{!thread.archived && (
 												<DropdownMenuItem
-													onClick={() => handleStartRename(thread.id, thread.title || "New Chat")}
+													onClick={() => handleToggleArchive(thread.id, thread.archived)}
+													disabled={isArchiving}
 												>
-													<PenLine className="mr-2 h-4 w-4" />
-													<span>{t("rename") || "Rename"}</span>
+													{thread.archived ? (
+														<>
+															<RotateCcwIcon className="mr-2 h-4 w-4" />
+															<span>{t("unarchive") || "Restore"}</span>
+														</>
+													) : (
+														<>
+															<ArchiveIcon className="mr-2 h-4 w-4" />
+															<span>{t("archive") || "Archive"}</span>
+														</>
+													)}
 												</DropdownMenuItem>
-											)}
-											<DropdownMenuItem
-												onClick={() => handleToggleArchive(thread.id, thread.archived)}
-												disabled={isArchiving}
-											>
-												{thread.archived ? (
-													<>
-														<RotateCcwIcon className="mr-2 h-4 w-4" />
-														<span>{t("unarchive") || "Restore"}</span>
-													</>
-												) : (
-													<>
-														<ArchiveIcon className="mr-2 h-4 w-4" />
-														<span>{t("archive") || "Archive"}</span>
-													</>
-												)}
-											</DropdownMenuItem>
-											<DropdownMenuItem onClick={() => handleDeleteThread(thread.id)}>
-												<Trash2 className="mr-2 h-4 w-4" />
-												<span>{t("delete") || "Delete"}</span>
-											</DropdownMenuItem>
-										</DropdownMenuContent>
-									</DropdownMenu>
+												<DropdownMenuItem onClick={() => handleDeleteThread(thread.id)}>
+													<Trash2 className="mr-2 h-4 w-4" />
+													<span>{t("delete") || "Delete"}</span>
+												</DropdownMenuItem>
+											</DropdownMenuContent>
+										</DropdownMenu>
+									</div>
 								</div>
 							);
 						})}
 					</div>
 				) : isSearchMode ? (
 					<div className="text-center py-8">
-						<Search className="h-12 w-12 mx-auto text-muted-foreground mb-3" />
-						<p className="text-sm text-muted-foreground">
+						<Search className="mx-auto mb-2.5 h-10 w-10 text-muted-foreground" />
+						<p className="text-xs text-muted-foreground">
 							{t("no_chats_found") || "No chats found"}
 						</p>
-						<p className="text-xs text-muted-foreground/70 mt-1">
+						<p className="mt-1 text-[11px] text-muted-foreground/70">
 							{t("try_different_search") || "Try a different search term"}
 						</p>
 					</div>
 				) : (
 					<div className="text-center py-8">
-						<Users className="h-12 w-12 mx-auto text-muted-foreground mb-3" />
-						<p className="text-sm text-muted-foreground">
+						<Users className="mx-auto mb-2.5 h-10 w-10 text-muted-foreground" />
+						<p className="text-xs text-muted-foreground">
 							{showArchived
 								? t("no_archived_chats") || "No archived chats"
 								: t("no_shared_chats") || "No shared chats"}
 						</p>
 						{!showArchived && (
-							<p className="text-xs text-muted-foreground/70 mt-1">
+							<p className="mt-1 text-[11px] text-muted-foreground/70">
 								Share a chat to collaborate with your team
 							</p>
 						)}
diff --git a/surfsense_web/components/layout/ui/sidebar/AnnouncementsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AnnouncementsSidebar.tsx
deleted file mode 100644
index 6a1233537f..0000000000
--- a/surfsense_web/components/layout/ui/sidebar/AnnouncementsSidebar.tsx
+++ /dev/null
@@ -1,84 +0,0 @@
-"use client";
-
-import { ChevronLeft } from "lucide-react";
-import { useEffect } from "react";
-import { AnnouncementCard } from "@/components/announcements/AnnouncementCard";
-import { AnnouncementsEmptyState } from "@/components/announcements/AnnouncementsEmptyState";
-import { Button } from "@/components/ui/button";
-import { useAnnouncements } from "@/hooks/use-announcements";
-import { useMediaQuery } from "@/hooks/use-media-query";
-import { SidebarSlideOutPanel } from "./SidebarSlideOutPanel";
-
-export interface AnnouncementsSidebarContentProps {
-	onOpenChange: (open: boolean) => void;
-	onCloseMobileSidebar?: () => void;
-}
-
-interface AnnouncementsSidebarProps extends AnnouncementsSidebarContentProps {
-	open: boolean;
-}
-
-export function AnnouncementsSidebarContent({
-	onOpenChange,
-	onCloseMobileSidebar,
-}: AnnouncementsSidebarContentProps) {
-	const isMobile = !useMediaQuery("(min-width: 640px)");
-	const { announcements, markAllRead } = useAnnouncements();
-
-	useEffect(() => {
-		markAllRead();
-	}, [markAllRead]);
-
-	return (
-		<div className="h-full flex flex-col">
-			<div className="shrink-0 p-4 pb-2 space-y-3">
-				<div className="flex items-center justify-between">
-					<div className="flex items-center gap-2">
-						{isMobile && (
-							<Button
-								variant="ghost"
-								size="icon"
-								className="h-8 w-8 rounded-full"
-								onClick={() => {
-									onOpenChange(false);
-									onCloseMobileSidebar?.();
-								}}
-							>
-								<ChevronLeft className="h-4 w-4 text-muted-foreground" />
-								<span className="sr-only">Close</span>
-							</Button>
-						)}
-						<h2 className="text-lg font-semibold">Announcements</h2>
-					</div>
-				</div>
-			</div>
-
-			<div className="flex-1 overflow-y-auto p-4">
-				{announcements.length === 0 ? (
-					<AnnouncementsEmptyState />
-				) : (
-					<div className="flex flex-col gap-4">
-						{announcements.map((announcement) => (
-							<AnnouncementCard key={announcement.id} announcement={announcement} />
-						))}
-					</div>
-				)}
-			</div>
-		</div>
-	);
-}
-
-export function AnnouncementsSidebar({
-	open,
-	onOpenChange,
-	onCloseMobileSidebar,
-}: AnnouncementsSidebarProps) {
-	return (
-		<SidebarSlideOutPanel open={open} onOpenChange={onOpenChange} ariaLabel="Announcements">
-			<AnnouncementsSidebarContent
-				onOpenChange={onOpenChange}
-				onCloseMobileSidebar={onCloseMobileSidebar}
-			/>
-		</SidebarSlideOutPanel>
-	);
-}
diff --git a/surfsense_web/components/layout/ui/sidebar/AuthenticatedPageUsageDisplay.tsx b/surfsense_web/components/layout/ui/sidebar/AuthenticatedPageUsageDisplay.tsx
new file mode 100644
index 0000000000..ad31d50bba
--- /dev/null
+++ b/surfsense_web/components/layout/ui/sidebar/AuthenticatedPageUsageDisplay.tsx
@@ -0,0 +1,15 @@
+"use client";
+
+import { useQuery } from "@rocicorp/zero/react";
+import { useIsAnonymous } from "@/contexts/anonymous-mode";
+import { queries } from "@/zero/queries";
+import { PageUsageDisplay } from "./PageUsageDisplay";
+
+export function AuthenticatedPageUsageDisplay() {
+	const isAnonymous = useIsAnonymous();
+	const [me] = useQuery(queries.user.me({}));
+
+	if (isAnonymous || !me) return null;
+
+	return <PageUsageDisplay pagesUsed={me.pagesUsed} pagesLimit={me.pagesLimit} />;
+}
diff --git a/surfsense_web/components/layout/ui/sidebar/ChatListItem.tsx b/surfsense_web/components/layout/ui/sidebar/ChatListItem.tsx
index 7f3089a899..ea4f946c2b 100644
--- a/surfsense_web/components/layout/ui/sidebar/ChatListItem.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/ChatListItem.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { ArchiveIcon, MoreHorizontal, PenLine, RotateCcwIcon, Trash2 } from "lucide-react";
+import { ArchiveIcon, MoreHorizontal, Pencil, RotateCcwIcon, Trash2 } from "lucide-react";
 import { useTranslations } from "next-intl";
 import { useCallback, useState } from "react";
 import { Button } from "@/components/ui/button";
@@ -56,19 +56,20 @@ export function ChatListItem({
 
 	return (
 		<div className="group/item relative w-full">
-			<button
+			<Button
 				type="button"
+				variant="ghost"
 				onClick={handleClick}
 				{...(isMobile ? longPressHandlers : {})}
 				className={cn(
-					"flex w-full items-center gap-2 overflow-hidden rounded-md p-2 text-sm text-left",
+					"h-auto w-full justify-start gap-2 overflow-hidden px-2 py-1.5 text-left font-normal",
 					"group-hover/item:bg-accent group-hover/item:text-accent-foreground",
 					"focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring",
 					isActive && "bg-accent text-accent-foreground"
 				)}
 			>
 				<span className="truncate">{animatedName}</span>
-			</button>
+			</Button>
 
 			{/* Actions dropdown - trigger hidden on mobile, long-press opens it instead */}
 			<div
@@ -106,7 +107,7 @@ export function ChatListItem({
 									onRename();
 								}}
 							>
-								<PenLine className="mr-2 h-4 w-4" />
+								<Pencil className="mr-2 h-4 w-4" />
 								<span>{t("rename") || "Rename"}</span>
 							</DropdownMenuItem>
 						)}
diff --git a/surfsense_web/components/layout/ui/sidebar/DesktopLocalTabContent.tsx b/surfsense_web/components/layout/ui/sidebar/DesktopLocalTabContent.tsx
new file mode 100644
index 0000000000..71d798e22f
--- /dev/null
+++ b/surfsense_web/components/layout/ui/sidebar/DesktopLocalTabContent.tsx
@@ -0,0 +1,211 @@
+"use client";
+
+import { useAtom } from "jotai";
+import { Folder, FolderPlus, Search, X } from "lucide-react";
+import { useCallback, useMemo, useRef, useState } from "react";
+import { localExpandedFolderKeysAtom } from "@/atoms/documents/folder.atoms";
+import { Button } from "@/components/ui/button";
+import {
+	DropdownMenu,
+	DropdownMenuContent,
+	DropdownMenuItem,
+	DropdownMenuLabel,
+	DropdownMenuTrigger,
+} from "@/components/ui/dropdown-menu";
+import { Input } from "@/components/ui/input";
+import { Separator } from "@/components/ui/separator";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import { useDebouncedValue } from "@/hooks/use-debounced-value";
+import { LocalFilesystemBrowser } from "./LocalFilesystemBrowser";
+
+const getFolderDisplayName = (rootPath: string): string =>
+	rootPath.split(/[\\/]/).at(-1) || rootPath;
+
+interface DesktopLocalTabContentProps {
+	localRootPaths: string[];
+	canAddMoreLocalRoots: boolean;
+	maxLocalFilesystemRoots: number;
+	searchSpaceId: number;
+	onPickFilesystemRoot: () => Promise<void> | void;
+	onRemoveFilesystemRoot: (rootPath: string) => Promise<void> | void;
+	onClearFilesystemRoots: () => Promise<void> | void;
+	onOpenLocalFile: (localFilePath: string) => void;
+	electronAvailable: boolean;
+}
+
+export function DesktopLocalTabContent({
+	localRootPaths,
+	canAddMoreLocalRoots,
+	maxLocalFilesystemRoots,
+	searchSpaceId,
+	onPickFilesystemRoot,
+	onRemoveFilesystemRoot,
+	onClearFilesystemRoots,
+	onOpenLocalFile,
+	electronAvailable,
+}: DesktopLocalTabContentProps) {
+	const [localSearch, setLocalSearch] = useState("");
+	const debouncedLocalSearch = useDebouncedValue(localSearch, 250);
+	const localSearchInputRef = useRef<HTMLInputElement>(null);
+	const [expandedFolderKeyMap, setExpandedFolderKeyMap] = useAtom(localExpandedFolderKeysAtom);
+	const expandedFolderKeys = useMemo(
+		() => new Set(expandedFolderKeyMap[searchSpaceId] ?? []),
+		[expandedFolderKeyMap, searchSpaceId]
+	);
+	const handleExpandedFolderKeysChange = useCallback(
+		(nextExpandedKeys: Set<string>) => {
+			setExpandedFolderKeyMap((prev) => ({
+				...prev,
+				[searchSpaceId]: Array.from(nextExpandedKeys),
+			}));
+		},
+		[searchSpaceId, setExpandedFolderKeyMap]
+	);
+
+	return (
+		<div className="flex min-h-0 flex-1 flex-col select-none">
+			<div className="mx-4 mt-4 mb-3">
+				<div className="flex h-7 w-full items-stretch rounded-lg border-0 bg-muted text-[11px] text-muted-foreground">
+					{localRootPaths.length > 0 ? (
+						<DropdownMenu>
+							<DropdownMenuTrigger asChild>
+								<Button
+									type="button"
+									variant="ghost"
+									size="sm"
+									className="min-w-0 flex-1 h-full justify-start gap-1 px-2 text-left text-[11px] text-muted-foreground"
+									title={localRootPaths.join("\n")}
+									aria-label="Manage selected folders"
+								>
+									<Folder className="size-3 shrink-0" />
+									<span className="truncate">
+										{localRootPaths.length === 1
+											? "1 folder selected"
+											: `${localRootPaths.length} folders selected`}
+									</span>
+								</Button>
+							</DropdownMenuTrigger>
+							<DropdownMenuContent align="start" className="w-56 select-none p-0.5">
+								<DropdownMenuLabel className="px-1.5 pt-1.5 pb-0.5 text-xs font-medium text-muted-foreground">
+									Selected folders
+								</DropdownMenuLabel>
+								{localRootPaths.map((rootPath) => (
+									<DropdownMenuItem
+										key={rootPath}
+										onSelect={(event) => event.preventDefault()}
+										className="group h-8 gap-1.5 px-1.5 text-sm text-foreground"
+									>
+										<Folder className="size-3.5 text-muted-foreground" />
+										<span className="min-w-0 flex-1 truncate">
+											{getFolderDisplayName(rootPath)}
+										</span>
+										<Button
+											type="button"
+											variant="ghost"
+											size="icon"
+											className="size-5 text-muted-foreground hover:text-accent-foreground"
+											onClick={(event) => {
+												event.stopPropagation();
+												void onRemoveFilesystemRoot(rootPath);
+											}}
+											aria-label={`Remove ${getFolderDisplayName(rootPath)}`}
+										>
+											<X className="size-3" />
+										</Button>
+									</DropdownMenuItem>
+								))}
+								<DropdownMenuItem
+									variant="destructive"
+									className="h-8 px-1.5 text-xs text-destructive focus:text-destructive"
+									onClick={() => {
+										void onClearFilesystemRoots();
+									}}
+								>
+									Clear all folders
+								</DropdownMenuItem>
+							</DropdownMenuContent>
+						</DropdownMenu>
+					) : (
+						<div
+							className="min-w-0 flex-1 flex items-center gap-1 px-2 transition-colors hover:bg-accent hover:text-accent-foreground"
+							title="No local folders selected"
+						>
+							<Folder className="size-3 shrink-0" />
+							<span className="truncate">No local folders selected</span>
+						</div>
+					)}
+					<Separator
+						orientation="vertical"
+						className="data-[orientation=vertical]:h-3 self-center bg-border/60 dark:bg-white/10"
+					/>
+					{electronAvailable ? (
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<span className="inline-flex">
+									<Button
+										type="button"
+										variant="ghost"
+										size="icon"
+										className="h-full w-8 text-muted-foreground hover:text-accent-foreground"
+										onClick={() => {
+											void onPickFilesystemRoot();
+										}}
+										disabled={!canAddMoreLocalRoots}
+										aria-label="Add folder"
+									>
+										<FolderPlus className="size-3.5" />
+									</Button>
+								</span>
+							</TooltipTrigger>
+							<TooltipContent side="top" className="text-xs">
+								{canAddMoreLocalRoots
+									? "Add folder"
+									: `You can add up to ${maxLocalFilesystemRoots} folders`}
+							</TooltipContent>
+						</Tooltip>
+					) : null}
+				</div>
+			</div>
+			<div className="mx-4 mb-2">
+				<div className="relative flex-1 min-w-0">
+					<div className="pointer-events-none absolute inset-y-0 left-0 flex items-center pl-3 text-muted-foreground">
+						<Search size={13} aria-hidden="true" />
+					</div>
+					<Input
+						ref={localSearchInputRef}
+						className="peer h-8 w-full border-0 bg-muted pl-8 pr-8 text-sm shadow-none select-none focus:select-text"
+						value={localSearch}
+						onChange={(e) => setLocalSearch(e.target.value)}
+						placeholder="Search local files"
+						type="text"
+						aria-label="Search local files"
+					/>
+					{Boolean(localSearch) && (
+						<Button
+							type="button"
+							variant="ghost"
+							size="icon"
+							className="absolute inset-y-0 right-0 h-full w-8 text-muted-foreground hover:text-accent-foreground"
+							aria-label="Clear local search"
+							onClick={() => {
+								setLocalSearch("");
+								localSearchInputRef.current?.focus();
+							}}
+						>
+							<X size={13} strokeWidth={2} aria-hidden="true" />
+						</Button>
+					)}
+				</div>
+			</div>
+			<LocalFilesystemBrowser
+				rootPaths={localRootPaths}
+				searchSpaceId={searchSpaceId}
+				active
+				searchQuery={debouncedLocalSearch.trim() || undefined}
+				onOpenFile={onOpenLocalFile}
+				expandedFolderKeys={expandedFolderKeys}
+				onExpandedFolderKeysChange={handleExpandedFolderKeysChange}
+			/>
+		</div>
+	);
+}
diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
index 8b3a119ae3..cdb757cb28 100644
--- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
@@ -2,19 +2,40 @@
 
 import { useQuery } from "@rocicorp/zero/react";
 import { useAtom, useAtomValue, useSetAtom } from "jotai";
-import { ChevronLeft, ChevronRight, FolderClock, Trash2, Unplug } from "lucide-react";
+import {
+	ChevronLeft,
+	ChevronRight,
+	FileText,
+	FolderClock,
+	Laptop,
+	Lock,
+	Paperclip,
+	Server,
+	Trash2,
+	Unplug,
+	Upload,
+	X,
+} from "lucide-react";
+import dynamic from "next/dynamic";
+import Link from "next/link";
 import { useParams } from "next/navigation";
 import { useTranslations } from "next-intl";
-import { useCallback, useEffect, useMemo, useState } from "react";
+import type React from "react";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { toast } from "sonner";
-import { sidebarSelectedDocumentsAtom } from "@/atoms/chat/mentioned-documents.atom";
+import { agentFlagsAtom } from "@/atoms/agent/agent-flags-query.atom";
+import { makeFolderMention, mentionedDocumentsAtom } from "@/atoms/chat/mentioned-documents.atom";
 import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms";
 import { connectorsAtom } from "@/atoms/connectors/connector-query.atoms";
 import { deleteDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
 import { expandedFolderIdsAtom } from "@/atoms/documents/folder.atoms";
 import { agentCreatedDocumentsAtom } from "@/atoms/documents/ui.atoms";
 import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
-import { rightPanelCollapsedAtom } from "@/atoms/layout/right-panel.atom";
+import {
+	folderWatchDialogOpenAtom,
+	folderWatchInitialFolderAtom,
+} from "@/atoms/folder-sync/folder-sync.atoms";
+import { searchSpacesAtom } from "@/atoms/search-spaces/search-space-query.atoms";
 import { CreateFolderDialog } from "@/components/documents/CreateFolderDialog";
 import type { DocumentNodeDoc } from "@/components/documents/DocumentNode";
 import { DocumentsFilters } from "@/components/documents/DocumentsFilters";
@@ -26,7 +47,6 @@ import { EXPORT_FILE_EXTENSIONS } from "@/components/shared/ExportMenuItems";
 import {
 	DEFAULT_EXCLUDE_PATTERNS,
 	FolderWatchDialog,
-	type SelectedFolder,
 } from "@/components/sources/FolderWatchDialog";
 import {
 	AlertDialog,
@@ -40,22 +60,79 @@ import {
 } from "@/components/ui/alert-dialog";
 import { Avatar, AvatarFallback, AvatarGroup } from "@/components/ui/avatar";
 import { Button } from "@/components/ui/button";
+import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
+import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
+import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import { useAnonymousMode, useIsAnonymous } from "@/contexts/anonymous-mode";
+import { useLoginGate } from "@/contexts/login-gate";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
 import type { DocumentTypeEnum } from "@/contracts/types/document.types";
 import { useDebouncedValue } from "@/hooks/use-debounced-value";
 import { useMediaQuery } from "@/hooks/use-media-query";
-import { useElectronAPI } from "@/hooks/use-platform";
+import { useElectronAPI, usePlatform } from "@/hooks/use-platform";
+import { anonymousChatApiService } from "@/lib/apis/anonymous-chat-api.service";
 import { documentsApiService } from "@/lib/apis/documents-api.service";
 import { foldersApiService } from "@/lib/apis/folders-api.service";
+import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service";
 import { authenticatedFetch } from "@/lib/auth-utils";
+import { getMentionDocKey } from "@/lib/chat/mention-doc-key";
 import { uploadFolderScan } from "@/lib/folder-sync-upload";
 import { getSupportedExtensionsSet } from "@/lib/supported-extensions";
 import { queries } from "@/zero/queries/index";
 import { SidebarSlideOutPanel } from "./SidebarSlideOutPanel";
 
+const DesktopLocalTabContent = dynamic(
+	() => import("./DesktopLocalTabContent").then((mod) => mod.DesktopLocalTabContent),
+	{ ssr: false }
+);
+
 const NON_DELETABLE_DOCUMENT_TYPES: readonly string[] = ["SURFSENSE_DOCS"];
+const LOCAL_FILESYSTEM_TRUST_KEY = "surfsense.local-filesystem-trust.v1";
+const MAX_LOCAL_FILESYSTEM_ROOTS = 10;
+
+function CloudDocumentsSkeleton() {
+	const rows = [
+		{ id: "row-1", widthClass: "w-44" },
+		{ id: "row-2", widthClass: "w-32" },
+		{ id: "row-3", widthClass: "w-32" },
+		{ id: "row-4", widthClass: "w-44" },
+		{ id: "row-5", widthClass: "w-32" },
+		{ id: "row-6", widthClass: "w-32" },
+		{ id: "row-7", widthClass: "w-44" },
+		{ id: "row-8", widthClass: "w-32" },
+	];
+
+	return (
+		<div className="flex-1 min-h-0 overflow-y-auto px-2 py-1">
+			<div className="space-y-1">
+				{rows.map((row) => (
+					<div key={row.id} className="flex h-8 items-center gap-2 px-2">
+						<Skeleton className="h-4 w-4 rounded-sm" />
+						<Skeleton className={`h-4 ${row.widthClass}`} />
+					</div>
+				))}
+			</div>
+		</div>
+	);
+}
+
+type FilesystemSettings = {
+	mode: "cloud" | "desktop_local_folder";
+	localRootPaths: string[];
+	updatedAt: string;
+};
+
+interface WatchedFolderEntry {
+	path: string;
+	name: string;
+	excludePatterns: string[];
+	fileExtensions: string[] | null;
+	rootFolderId: number | null;
+	searchSpaceId: number;
+	active: boolean;
+}
 
 const SHOWCASE_CONNECTORS = [
 	{ type: "GOOGLE_DRIVE_CONNECTOR", label: "Google Drive" },
@@ -80,33 +157,215 @@ interface DocumentsSidebarProps {
 	headerAction?: React.ReactNode;
 }
 
-export function DocumentsSidebar({
+export function DocumentsSidebar(props: DocumentsSidebarProps) {
+	const isAnonymous = useIsAnonymous();
+	const { isDesktop } = usePlatform();
+	if (isAnonymous) {
+		return <AnonymousDocumentsSidebar {...props} />;
+	}
+	return isDesktop ? (
+		<AuthenticatedDesktopDocumentsSidebar {...props} />
+	) : (
+		<AuthenticatedWebDocumentsSidebar {...props} />
+	);
+}
+
+function AuthenticatedDesktopDocumentsSidebar(props: DocumentsSidebarProps) {
+	return <AuthenticatedDocumentsSidebarBase {...props} desktopFeaturesEnabled />;
+}
+
+function AuthenticatedWebDocumentsSidebar(props: DocumentsSidebarProps) {
+	return <AuthenticatedDocumentsSidebarBase {...props} desktopFeaturesEnabled={false} />;
+}
+
+function AuthenticatedDocumentsSidebarBase({
 	open,
 	onOpenChange,
 	isDocked = false,
 	onDockedChange,
 	embedded = false,
 	headerAction,
-}: DocumentsSidebarProps) {
+	desktopFeaturesEnabled,
+}: DocumentsSidebarProps & { desktopFeaturesEnabled: boolean }) {
 	const t = useTranslations("documents");
 	const tSidebar = useTranslations("sidebar");
 	const params = useParams();
 	const isMobile = !useMediaQuery("(min-width: 640px)");
-	const electronAPI = useElectronAPI();
+	const platformElectronAPI = useElectronAPI();
+	const electronAPI = desktopFeaturesEnabled ? platformElectronAPI : null;
 	const searchSpaceId = Number(params.search_space_id);
 	const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom);
-	const setRightPanelCollapsed = useSetAtom(rightPanelCollapsedAtom);
 	const openEditorPanel = useSetAtom(openEditorPanelAtom);
+	const { data: agentFlags } = useAtomValue(agentFlagsAtom);
 	const { data: connectors } = useAtomValue(connectorsAtom);
 	const connectorCount = connectors?.length ?? 0;
 
 	const [search, setSearch] = useState("");
 	const debouncedSearch = useDebouncedValue(search, 250);
 	const [activeTypes, setActiveTypes] = useState<DocumentTypeEnum[]>([]);
+	const [filesystemSettings, setFilesystemSettings] = useState<FilesystemSettings | null>(null);
+	const [localTrustDialogOpen, setLocalTrustDialogOpen] = useState(false);
+	const [pendingLocalPath, setPendingLocalPath] = useState<string | null>(null);
 	const [watchedFolderIds, setWatchedFolderIds] = useState<Set<number>>(new Set());
-	const [folderWatchOpen, setFolderWatchOpen] = useState(false);
-	const [watchInitialFolder, setWatchInitialFolder] = useState<SelectedFolder | null>(null);
-	const isElectron = typeof window !== "undefined" && !!window.electronAPI;
+	const [folderWatchOpen, setFolderWatchOpen] = useAtom(folderWatchDialogOpenAtom);
+	const [watchInitialFolder, setWatchInitialFolder] = useAtom(folderWatchInitialFolderAtom);
+	const localFilesystemEnabled = agentFlags?.enable_desktop_local_filesystem === true;
+	const isElectron =
+		desktopFeaturesEnabled && typeof window !== "undefined" && !!window.electronAPI;
+
+	useEffect(() => {
+		if (!electronAPI?.getAgentFilesystemSettings) return;
+		let mounted = true;
+		electronAPI
+			.getAgentFilesystemSettings(searchSpaceId)
+			.then((settings: FilesystemSettings) => {
+				if (!mounted) return;
+				setFilesystemSettings(settings);
+			})
+			.catch(() => {
+				if (!mounted) return;
+				setFilesystemSettings({
+					mode: "cloud",
+					localRootPaths: [],
+					updatedAt: new Date().toISOString(),
+				});
+			});
+		return () => {
+			mounted = false;
+		};
+	}, [electronAPI, searchSpaceId]);
+
+	const hasLocalFilesystemTrust = useCallback(() => {
+		try {
+			return window.localStorage.getItem(LOCAL_FILESYSTEM_TRUST_KEY) === "true";
+		} catch {
+			return false;
+		}
+	}, []);
+
+	const localRootPaths = filesystemSettings?.localRootPaths ?? [];
+	const canAddMoreLocalRoots = localRootPaths.length < MAX_LOCAL_FILESYSTEM_ROOTS;
+
+	const applyLocalRootPath = useCallback(
+		async (path: string) => {
+			if (!electronAPI?.setAgentFilesystemSettings) return;
+			const nextLocalRootPaths = [path, ...localRootPaths]
+				.filter((rootPath, index, allPaths) => allPaths.indexOf(rootPath) === index)
+				.slice(0, MAX_LOCAL_FILESYSTEM_ROOTS);
+			if (nextLocalRootPaths.length === localRootPaths.length) return;
+			const updated = await electronAPI.setAgentFilesystemSettings(
+				{
+					mode: "desktop_local_folder",
+					localRootPaths: nextLocalRootPaths,
+				},
+				searchSpaceId
+			);
+			setFilesystemSettings(updated);
+		},
+		[electronAPI, localRootPaths, searchSpaceId]
+	);
+
+	const runPickLocalRoot = useCallback(async () => {
+		if (!electronAPI?.pickAgentFilesystemRoot) return;
+		const picked = await electronAPI.pickAgentFilesystemRoot();
+		if (!picked) return;
+		await applyLocalRootPath(picked);
+	}, [applyLocalRootPath, electronAPI]);
+
+	const handlePickFilesystemRoot = useCallback(async () => {
+		if (!canAddMoreLocalRoots) return;
+		if (hasLocalFilesystemTrust()) {
+			await runPickLocalRoot();
+			return;
+		}
+		if (!electronAPI?.pickAgentFilesystemRoot) return;
+		const picked = await electronAPI.pickAgentFilesystemRoot();
+		if (!picked) return;
+		setPendingLocalPath(picked);
+		setLocalTrustDialogOpen(true);
+	}, [canAddMoreLocalRoots, electronAPI, hasLocalFilesystemTrust, runPickLocalRoot]);
+
+	const handleRemoveFilesystemRoot = useCallback(
+		async (rootPathToRemove: string) => {
+			if (!electronAPI?.setAgentFilesystemSettings) return;
+			const updated = await electronAPI.setAgentFilesystemSettings(
+				{
+					mode: "desktop_local_folder",
+					localRootPaths: localRootPaths.filter((rootPath) => rootPath !== rootPathToRemove),
+				},
+				searchSpaceId
+			);
+			setFilesystemSettings(updated);
+		},
+		[electronAPI, localRootPaths, searchSpaceId]
+	);
+
+	const handleClearFilesystemRoots = useCallback(async () => {
+		if (!electronAPI?.setAgentFilesystemSettings) return;
+		const updated = await electronAPI.setAgentFilesystemSettings(
+			{
+				mode: "desktop_local_folder",
+				localRootPaths: [],
+			},
+			searchSpaceId
+		);
+		setFilesystemSettings(updated);
+	}, [electronAPI, searchSpaceId]);
+
+	const handleFilesystemTabChange = useCallback(
+		async (tab: "cloud" | "local") => {
+			if (!electronAPI?.setAgentFilesystemSettings) return;
+			const updated = await electronAPI.setAgentFilesystemSettings(
+				{
+					mode: tab === "cloud" ? "cloud" : "desktop_local_folder",
+				},
+				searchSpaceId
+			);
+			setFilesystemSettings(updated);
+		},
+		[electronAPI, searchSpaceId]
+	);
+
+	// AI File Sort state
+	const { data: searchSpaces, refetch: refetchSearchSpaces } = useAtomValue(searchSpacesAtom);
+	const activeSearchSpace = useMemo(
+		() => searchSpaces?.find((s) => s.id === searchSpaceId),
+		[searchSpaces, searchSpaceId]
+	);
+	const aiSortEnabled = activeSearchSpace?.ai_file_sort_enabled ?? false;
+	const [aiSortBusy, setAiSortBusy] = useState(false);
+	const [aiSortConfirmOpen, setAiSortConfirmOpen] = useState(false);
+
+	const handleToggleAiSort = useCallback(() => {
+		if (aiSortEnabled) {
+			// Disable: just update the setting, no confirmation needed
+			setAiSortBusy(true);
+			searchSpacesApiService
+				.updateSearchSpace({ id: searchSpaceId, data: { ai_file_sort_enabled: false } })
+				.then(() => {
+					refetchSearchSpaces();
+					toast.success("AI file sorting disabled");
+				})
+				.catch(() => toast.error("Failed to disable AI file sorting"))
+				.finally(() => setAiSortBusy(false));
+		} else {
+			setAiSortConfirmOpen(true);
+		}
+	}, [aiSortEnabled, searchSpaceId, refetchSearchSpaces]);
+
+	const handleConfirmEnableAiSort = useCallback(() => {
+		setAiSortConfirmOpen(false);
+		setAiSortBusy(true);
+		searchSpacesApiService
+			.updateSearchSpace({ id: searchSpaceId, data: { ai_file_sort_enabled: true } })
+			.then(() => searchSpacesApiService.triggerAiSort(searchSpaceId))
+			.then(() => {
+				refetchSearchSpaces();
+				toast.success("AI file sorting enabled — organizing your documents in the background");
+			})
+			.catch(() => toast.error("Failed to enable AI file sorting"))
+			.finally(() => setAiSortBusy(false));
+	}, [searchSpaceId, refetchSearchSpaces]);
 
 	const handleWatchLocalFolder = useCallback(async () => {
 		const api = window.electronAPI;
@@ -118,13 +377,13 @@ export function DocumentsSidebar({
 		const folderName = folderPath.split("/").pop() || folderPath.split("\\").pop() || folderPath;
 		setWatchInitialFolder({ path: folderPath, name: folderName });
 		setFolderWatchOpen(true);
-	}, []);
+	}, [setWatchInitialFolder, setFolderWatchOpen]);
 
 	const refreshWatchedIds = useCallback(async () => {
 		if (!electronAPI?.getWatchedFolders) return;
 		const api = electronAPI;
 
-		const folders = await api.getWatchedFolders();
+		const folders = (await api.getWatchedFolders()) as WatchedFolderEntry[];
 
 		if (folders.length === 0) {
 			try {
@@ -142,9 +401,11 @@ export function DocumentsSidebar({
 						active: true,
 					});
 				}
-				const recovered = await api.getWatchedFolders();
+				const recovered = (await api.getWatchedFolders()) as WatchedFolderEntry[];
 				const ids = new Set(
-					recovered.filter((f) => f.rootFolderId != null).map((f) => f.rootFolderId as number)
+					recovered
+						.filter((f: WatchedFolderEntry) => f.rootFolderId != null)
+						.map((f: WatchedFolderEntry) => f.rootFolderId as number)
 				);
 				setWatchedFolderIds(ids);
 				return;
@@ -154,7 +415,9 @@ export function DocumentsSidebar({
 		}
 
 		const ids = new Set(
-			folders.filter((f) => f.rootFolderId != null).map((f) => f.rootFolderId as number)
+			folders
+				.filter((f: WatchedFolderEntry) => f.rootFolderId != null)
+				.map((f: WatchedFolderEntry) => f.rootFolderId as number)
 		);
 		setWatchedFolderIds(ids);
 	}, [searchSpaceId, electronAPI]);
@@ -164,8 +427,11 @@ export function DocumentsSidebar({
 	}, [refreshWatchedIds]);
 	const { mutateAsync: deleteDocumentMutation } = useAtomValue(deleteDocumentMutationAtom);
 
-	const [sidebarDocs, setSidebarDocs] = useAtom(sidebarSelectedDocumentsAtom);
-	const mentionedDocIds = useMemo(() => new Set(sidebarDocs.map((d) => d.id)), [sidebarDocs]);
+	const [sidebarDocs, setSidebarDocs] = useAtom(mentionedDocumentsAtom);
+	const mentionedDocKeys = useMemo(
+		() => new Set(sidebarDocs.map((d) => getMentionDocKey(d))),
+		[sidebarDocs]
+	);
 
 	// Folder state
 	const [expandedFolderMap, setExpandedFolderMap] = useAtom(expandedFolderIdsAtom);
@@ -186,8 +452,8 @@ export function DocumentsSidebar({
 	);
 
 	// Zero queries for tree data
-	const [zeroFolders] = useQuery(queries.folders.bySpace({ searchSpaceId }));
-	const [zeroAllDocs] = useQuery(queries.documents.bySpace({ searchSpaceId }));
+	const [zeroFolders, zeroFoldersResult] = useQuery(queries.folders.bySpace({ searchSpaceId }));
+	const [zeroAllDocs, zeroAllDocsResult] = useQuery(queries.documents.bySpace({ searchSpaceId }));
 	const [agentCreatedDocs, setAgentCreatedDocs] = useAtom(agentCreatedDocumentsAtom);
 
 	const treeFolders: FolderDisplay[] = useMemo(
@@ -303,8 +569,10 @@ export function DocumentsSidebar({
 		async (folder: FolderDisplay) => {
 			if (!electronAPI) return;
 
-			const watchedFolders = await electronAPI.getWatchedFolders();
-			const matched = watchedFolders.find((wf) => wf.rootFolderId === folder.id);
+			const watchedFolders = (await electronAPI.getWatchedFolders()) as WatchedFolderEntry[];
+			const matched = watchedFolders.find(
+				(wf: WatchedFolderEntry) => wf.rootFolderId === folder.id
+			);
 			if (!matched) {
 				toast.error("This folder is not being watched");
 				return;
@@ -333,8 +601,10 @@ export function DocumentsSidebar({
 		async (folder: FolderDisplay) => {
 			if (!electronAPI) return;
 
-			const watchedFolders = await electronAPI.getWatchedFolders();
-			const matched = watchedFolders.find((wf) => wf.rootFolderId === folder.id);
+			const watchedFolders = (await electronAPI.getWatchedFolders()) as WatchedFolderEntry[];
+			const matched = watchedFolders.find(
+				(wf: WatchedFolderEntry) => wf.rootFolderId === folder.id
+			);
 			if (!matched) {
 				toast.error("This folder is not being watched");
 				return;
@@ -366,8 +636,10 @@ export function DocumentsSidebar({
 			if (!confirm(`Delete folder "${folder.name}" and all its contents?`)) return;
 			try {
 				if (electronAPI) {
-					const watchedFolders = await electronAPI.getWatchedFolders();
-					const matched = watchedFolders.find((wf) => wf.rootFolderId === folder.id);
+					const watchedFolders = (await electronAPI.getWatchedFolders()) as WatchedFolderEntry[];
+					const matched = watchedFolders.find(
+						(wf: WatchedFolderEntry) => wf.rootFolderId === folder.id
+					);
 					if (matched) {
 						await electronAPI.removeWatchedFolder(matched.path);
 					}
@@ -406,6 +678,110 @@ export function DocumentsSidebar({
 		setFolderPickerOpen(true);
 	}, []);
 
+	const isExportingKBRef = useRef(false);
+	const [exportWarningOpen, setExportWarningOpen] = useState(false);
+	const [exportWarningContext, setExportWarningContext] = useState<{
+		folder: FolderDisplay;
+		pendingCount: number;
+	} | null>(null);
+
+	const doExport = useCallback(async (url: string, downloadName: string) => {
+		const response = await authenticatedFetch(url, { method: "GET" });
+		if (!response.ok) {
+			const errorData = await response.json().catch(() => ({ detail: "Export failed" }));
+			throw new Error(errorData.detail || "Export failed");
+		}
+
+		const blob = await response.blob();
+		const blobUrl = URL.createObjectURL(blob);
+		const a = document.createElement("a");
+		a.href = blobUrl;
+		a.download = downloadName;
+		document.body.appendChild(a);
+		a.click();
+		document.body.removeChild(a);
+		URL.revokeObjectURL(blobUrl);
+	}, []);
+
+	const handleExportWarningConfirm = useCallback(async () => {
+		setExportWarningOpen(false);
+		const ctx = exportWarningContext;
+		if (!ctx?.folder) return;
+
+		isExportingKBRef.current = true;
+		try {
+			const safeName =
+				ctx.folder.name
+					.replace(/[^a-zA-Z0-9 _-]/g, "_")
+					.trim()
+					.slice(0, 80) || "folder";
+			await doExport(
+				`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${ctx.folder.id}`,
+				`${safeName}.zip`
+			);
+			toast.success(`Folder "${ctx.folder.name}" exported`);
+		} catch (err) {
+			console.error("Folder export failed:", err);
+			toast.error(err instanceof Error ? err.message : "Export failed");
+		} finally {
+			isExportingKBRef.current = false;
+		}
+		setExportWarningContext(null);
+	}, [exportWarningContext, searchSpaceId, doExport]);
+
+	const getPendingCountInSubtree = useCallback(
+		(folderId: number): number => {
+			const subtreeIds = new Set<number>();
+			function collect(id: number) {
+				subtreeIds.add(id);
+				for (const child of foldersByParent[String(id)] ?? []) {
+					collect(child.id);
+				}
+			}
+			collect(folderId);
+			return treeDocuments.filter(
+				(d) =>
+					subtreeIds.has(d.folderId ?? -1) &&
+					(d.status?.state === "pending" || d.status?.state === "processing")
+			).length;
+		},
+		[foldersByParent, treeDocuments]
+	);
+
+	const handleExportFolder = useCallback(
+		async (folder: FolderDisplay) => {
+			const folderPendingCount = getPendingCountInSubtree(folder.id);
+			if (folderPendingCount > 0) {
+				setExportWarningContext({
+					folder,
+					pendingCount: folderPendingCount,
+				});
+				setExportWarningOpen(true);
+				return;
+			}
+
+			isExportingKBRef.current = true;
+			try {
+				const safeName =
+					folder.name
+						.replace(/[^a-zA-Z0-9 _-]/g, "_")
+						.trim()
+						.slice(0, 80) || "folder";
+				await doExport(
+					`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`,
+					`${safeName}.zip`
+				);
+				toast.success(`Folder "${folder.name}" exported`);
+			} catch (err) {
+				console.error("Folder export failed:", err);
+				toast.error(err instanceof Error ? err.message : "Export failed");
+			} finally {
+				isExportingKBRef.current = false;
+			}
+		},
+		[searchSpaceId, getPendingCountInSubtree, doExport]
+	);
+
 	const handleExportDocument = useCallback(
 		async (doc: DocumentNodeDoc, format: string) => {
 			const safeTitle =
@@ -503,14 +879,20 @@ export function DocumentsSidebar({
 
 	const handleToggleChatMention = useCallback(
 		(doc: { id: number; title: string; document_type: string }, isMentioned: boolean) => {
+			const key = getMentionDocKey({ ...doc, kind: "doc" });
 			if (isMentioned) {
-				setSidebarDocs((prev) => prev.filter((d) => d.id !== doc.id));
+				setSidebarDocs((prev) => prev.filter((d) => getMentionDocKey(d) !== key));
 			} else {
 				setSidebarDocs((prev) => {
-					if (prev.some((d) => d.id === doc.id)) return prev;
+					if (prev.some((d) => getMentionDocKey(d) === key)) return prev;
 					return [
 						...prev,
-						{ id: doc.id, title: doc.title, document_type: doc.document_type as DocumentTypeEnum },
+						{
+							id: doc.id,
+							title: doc.title,
+							document_type: doc.document_type as DocumentTypeEnum,
+							kind: "doc",
+						},
 					];
 				});
 			}
@@ -520,39 +902,29 @@ export function DocumentsSidebar({
 
 	const handleToggleFolderSelect = useCallback(
 		(folderId: number, selectAll: boolean) => {
-			function collectSubtreeDocs(parentId: number): DocumentNodeDoc[] {
-				const directDocs = (treeDocuments ?? []).filter(
-					(d) =>
-						d.folderId === parentId &&
-						d.status?.state !== "pending" &&
-						d.status?.state !== "processing"
-				);
-				const childFolders = foldersByParent[String(parentId)] ?? [];
-				const descendantDocs = childFolders.flatMap((cf) => collectSubtreeDocs(cf.id));
-				return [...directDocs, ...descendantDocs];
-			}
-
-			const subtreeDocs = collectSubtreeDocs(folderId);
-			if (subtreeDocs.length === 0) return;
+			// One folder click = one folder-mention chip. The agent
+			// resolves the chip to its virtual path
+			// (``/documents/MyFolder/``) and walks it itself with
+			// ``ls`` / ``find_documents``. We deliberately don't
+			// fan out to per-doc chips anymore — the previous
+			// behaviour created N chips for one click and dropped
+			// nested folders entirely once selected, which the
+			// agent had no way to recover.
+			const folder = treeFolders.find((f) => f.id === folderId);
+			if (!folder) return;
+			const chip = makeFolderMention({ id: folder.id, name: folder.name });
+			const chipKey = getMentionDocKey(chip);
 
 			if (selectAll) {
 				setSidebarDocs((prev) => {
-					const existingIds = new Set(prev.map((d) => d.id));
-					const newDocs = subtreeDocs
-						.filter((d) => !existingIds.has(d.id))
-						.map((d) => ({
-							id: d.id,
-							title: d.title,
-							document_type: d.document_type as DocumentTypeEnum,
-						}));
-					return newDocs.length > 0 ? [...prev, ...newDocs] : prev;
+					const exists = prev.some((d) => getMentionDocKey(d) === chipKey);
+					return exists ? prev : [...prev, chip];
 				});
 			} else {
-				const idsToRemove = new Set(subtreeDocs.map((d) => d.id));
-				setSidebarDocs((prev) => prev.filter((d) => !idsToRemove.has(d.id)));
+				setSidebarDocs((prev) => prev.filter((d) => getMentionDocKey(d) !== chipKey));
 			}
 		},
-		[treeDocuments, foldersByParent, setSidebarDocs]
+		[treeFolders, setSidebarDocs]
 	);
 
 	const searchFilteredDocuments = useMemo(() => {
@@ -647,152 +1019,109 @@ export function DocumentsSidebar({
 
 	useEffect(() => {
 		const handleEscape = (e: KeyboardEvent) => {
-			if (e.key === "Escape" && open) {
-				if (isMobile) {
-					onOpenChange(false);
-				} else {
-					setRightPanelCollapsed(true);
-				}
+			if (e.key === "Escape" && open && isMobile) {
+				onOpenChange(false);
 			}
 		};
 		document.addEventListener("keydown", handleEscape);
 		return () => document.removeEventListener("keydown", handleEscape);
-	}, [open, onOpenChange, isMobile, setRightPanelCollapsed]);
-
-	const documentsContent = (
+	}, [open, onOpenChange, isMobile]);
+
+	const showFilesystemTabs =
+		!isMobile && !!electronAPI && !!filesystemSettings && localFilesystemEnabled;
+	const currentFilesystemTab =
+		localFilesystemEnabled && filesystemSettings?.mode === "desktop_local_folder"
+			? "local"
+			: "cloud";
+	const showCloudSkeleton =
+		currentFilesystemTab === "cloud" &&
+		(zeroFoldersResult.type !== "complete" || zeroAllDocsResult.type !== "complete");
+
+	const cloudContent = (
 		<>
-			<div className="shrink-0 flex h-14 items-center px-4">
-				<div className="flex w-full items-center justify-between">
-					<div className="flex items-center gap-2">
-						{isMobile && (
-							<Button
-								variant="ghost"
-								size="icon"
-								className="h-8 w-8 rounded-full"
-								onClick={() => onOpenChange(false)}
-							>
-								<ChevronLeft className="h-4 w-4 text-muted-foreground" />
-								<span className="sr-only">{tSidebar("close") || "Close"}</span>
-							</Button>
-						)}
-						<h2 className="select-none text-lg font-semibold">{t("title") || "Documents"}</h2>
-					</div>
-					<div className="flex items-center gap-1">
-						{!isMobile && onDockedChange && (
-							<Tooltip>
-								<TooltipTrigger asChild>
-									<Button
-										variant="ghost"
-										size="icon"
-										className="h-8 w-8 rounded-full"
-										onClick={() => {
-											if (isDocked) {
-												onDockedChange(false);
-												onOpenChange(false);
-											} else {
-												onDockedChange(true);
-											}
-										}}
-									>
-										{isDocked ? (
-											<ChevronLeft className="h-4 w-4 text-muted-foreground" />
-										) : (
-											<ChevronRight className="h-4 w-4 text-muted-foreground" />
-										)}
-										<span className="sr-only">{isDocked ? "Collapse panel" : "Expand panel"}</span>
-									</Button>
-								</TooltipTrigger>
-								<TooltipContent className="z-80">
-									{isDocked ? "Collapse panel" : "Expand panel"}
-								</TooltipContent>
-							</Tooltip>
-						)}
-						{headerAction}
-					</div>
-				</div>
-			</div>
-
 			{/* Connected tools strip */}
-			<div className="shrink-0 mx-4 mt-4 mb-4 flex select-none items-center gap-2 rounded-lg border bg-muted/50 transition-colors hover:bg-muted/80">
-				<button
-					type="button"
-					onClick={() => setConnectorDialogOpen(true)}
-					className="flex items-center gap-2 min-w-0 flex-1 text-left px-3 py-2"
-				>
-					<Unplug className="size-4 shrink-0 text-muted-foreground" />
-					<span className="truncate text-xs text-muted-foreground">
-						{connectorCount > 0 ? "Manage connectors" : "Connect your connectors"}
+			<Button
+				type="button"
+				variant="ghost"
+				size="sm"
+				onClick={() => setConnectorDialogOpen(true)}
+				className="shrink-0 mx-4 mt-6 mb-2.5 h-auto select-none justify-start gap-2 bg-muted px-3 py-1.5 text-xs text-muted-foreground"
+			>
+				<Unplug className="size-4 shrink-0" />
+				<span className="truncate">
+					{connectorCount > 0 ? "Manage connectors" : "Connect your connectors"}
+				</span>
+				{connectorCount > 0 && (
+					<span className="shrink-0 rounded-full bg-muted-foreground/15 px-1.5 py-0.5 text-[10px] font-medium text-muted-foreground">
+						{connectorCount}
 					</span>
-					{connectorCount > 0 && (
-						<span className="shrink-0 rounded-full bg-muted-foreground/15 px-1.5 py-0.5 text-[10px] font-medium text-muted-foreground">
-							{connectorCount}
-						</span>
-					)}
-					<AvatarGroup className="ml-auto shrink-0">
-						{connectorCount > 0 && connectors
-							? connectors.slice(0, isMobile ? 5 : 9).map((connector, i) => {
+				)}
+				<AvatarGroup className="ml-auto shrink-0">
+					{connectorCount > 0 && connectors
+						? connectors.slice(0, isMobile ? 5 : 9).map((connector, i) => {
+								const avatar = (
+									<Avatar
+										key={connector.id}
+										className="size-6"
+										style={{ zIndex: Math.max(9 - i, 1) }}
+									>
+										<AvatarFallback className="bg-muted text-[10px] text-muted-foreground">
+											{getConnectorIcon(connector.connector_type, "size-3.5")}
+										</AvatarFallback>
+									</Avatar>
+								);
+								if (isMobile) return avatar;
+								return (
+									<Tooltip key={connector.id}>
+										<TooltipTrigger asChild>{avatar}</TooltipTrigger>
+										<TooltipContent side="top" className="text-xs">
+											{connector.name}
+										</TooltipContent>
+									</Tooltip>
+								);
+							})
+						: (isMobile ? SHOWCASE_CONNECTORS.slice(0, 5) : SHOWCASE_CONNECTORS).map(
+								({ type, label }, i) => {
 									const avatar = (
 										<Avatar
-											key={connector.id}
+											key={type}
 											className="size-6"
-											style={{ zIndex: Math.max(9 - i, 1) }}
+											style={{ zIndex: SHOWCASE_CONNECTORS.length - i }}
 										>
-											<AvatarFallback className="bg-muted text-[10px]">
-												{getConnectorIcon(connector.connector_type, "size-3.5")}
+											<AvatarFallback className="bg-muted text-[10px] text-muted-foreground">
+												{getConnectorIcon(type, "size-3.5")}
 											</AvatarFallback>
 										</Avatar>
 									);
 									if (isMobile) return avatar;
 									return (
-										<Tooltip key={connector.id}>
+										<Tooltip key={type}>
 											<TooltipTrigger asChild>{avatar}</TooltipTrigger>
 											<TooltipContent side="top" className="text-xs">
-												{connector.name}
+												{label}
 											</TooltipContent>
 										</Tooltip>
 									);
-								})
-							: (isMobile ? SHOWCASE_CONNECTORS.slice(0, 5) : SHOWCASE_CONNECTORS).map(
-									({ type, label }, i) => {
-										const avatar = (
-											<Avatar
-												key={type}
-												className="size-6"
-												style={{ zIndex: SHOWCASE_CONNECTORS.length - i }}
-											>
-												<AvatarFallback className="bg-muted text-[10px]">
-													{getConnectorIcon(type, "size-3.5")}
-												</AvatarFallback>
-											</Avatar>
-										);
-										if (isMobile) return avatar;
-										return (
-											<Tooltip key={type}>
-												<TooltipTrigger asChild>{avatar}</TooltipTrigger>
-												<TooltipContent side="top" className="text-xs">
-													{label}
-												</TooltipContent>
-											</Tooltip>
-										);
-									}
-								)}
-					</AvatarGroup>
-				</button>
-			</div>
+								}
+							)}
+				</AvatarGroup>
+			</Button>
 
 			{isElectron && (
-				<button
+				<Button
 					type="button"
+					variant="ghost"
+					size="sm"
 					onClick={handleWatchLocalFolder}
-					className="shrink-0 mx-4 mb-4 flex select-none items-center gap-2 rounded-lg border bg-muted/50 px-3 py-2 transition-colors hover:bg-muted/80"
+					className="shrink-0 mx-4 mb-2.5 h-auto select-none justify-start gap-2 bg-muted px-3 py-1.5 text-xs text-muted-foreground"
 				>
-					<FolderClock className="size-4 shrink-0 text-muted-foreground" />
-					<span className="truncate text-xs text-muted-foreground">Watch local folder</span>
-				</button>
+					<FolderClock className="size-4 shrink-0" />
+					<span className="truncate">Watch local folder</span>
+				</Button>
 			)}
 
 			<div className="flex-1 min-h-0 pt-0 flex flex-col">
-				<div className="px-4 pb-2">
+				<div className="px-4 pb-1.5">
 					<DocumentsFilters
 						typeCounts={typeCounts}
 						onSearch={setSearch}
@@ -800,64 +1129,195 @@ export function DocumentsSidebar({
 						onToggleType={onToggleType}
 						activeTypes={activeTypes}
 						onCreateFolder={() => handleCreateFolder(null)}
+						aiSortEnabled={aiSortEnabled}
+						aiSortBusy={aiSortBusy}
+						onToggleAiSort={handleToggleAiSort}
 					/>
 				</div>
 
 				<div className="relative flex-1 min-h-0 overflow-auto">
 					{deletableSelectedIds.length > 0 && (
 						<div className="absolute inset-x-0 top-0 z-10 flex items-center justify-center px-4 py-1.5 animate-in fade-in duration-150 pointer-events-none">
-							<button
+							<Button
 								type="button"
+								variant="destructive"
+								size="sm"
 								onClick={() => setBulkDeleteConfirmOpen(true)}
-								className="pointer-events-auto flex items-center gap-1.5 px-3 py-1 rounded-md bg-destructive text-destructive-foreground shadow-lg text-xs font-medium hover:bg-destructive/90 transition-colors"
+								className="pointer-events-auto h-auto gap-1.5 px-3 py-1 text-xs shadow-lg"
 							>
 								<Trash2 size={12} />
 								Delete {deletableSelectedIds.length}{" "}
 								{deletableSelectedIds.length === 1 ? "item" : "items"}
-							</button>
+							</Button>
 						</div>
 					)}
 
-					<FolderTreeView
-						folders={treeFolders}
-						documents={searchFilteredDocuments}
-						expandedIds={expandedIds}
-						onToggleExpand={toggleFolderExpand}
-						mentionedDocIds={mentionedDocIds}
-						onToggleChatMention={handleToggleChatMention}
-						onToggleFolderSelect={handleToggleFolderSelect}
-						onRenameFolder={handleRenameFolder}
-						onDeleteFolder={handleDeleteFolder}
-						onMoveFolder={handleMoveFolder}
-						onCreateFolder={handleCreateFolder}
-						searchQuery={debouncedSearch.trim() || undefined}
-						onPreviewDocument={(doc) => {
-							openEditorPanel({
-								documentId: doc.id,
-								searchSpaceId,
-								title: doc.title,
-							});
-						}}
-						onEditDocument={(doc) => {
-							openEditorPanel({
-								documentId: doc.id,
-								searchSpaceId,
-								title: doc.title,
-							});
-						}}
-						onDeleteDocument={(doc) => handleDeleteDocument(doc.id)}
-						onMoveDocument={handleMoveDocument}
-						onExportDocument={handleExportDocument}
-						onVersionHistory={(doc) => setVersionDocId(doc.id)}
-						activeTypes={activeTypes}
-						onDropIntoFolder={handleDropIntoFolder}
-						onReorderFolder={handleReorderFolder}
-						watchedFolderIds={watchedFolderIds}
-						onRescanFolder={handleRescanFolder}
-						onStopWatchingFolder={handleStopWatching}
-					/>
+					{showCloudSkeleton ? (
+						<CloudDocumentsSkeleton />
+					) : (
+						<FolderTreeView
+							folders={treeFolders}
+							documents={searchFilteredDocuments}
+							expandedIds={expandedIds}
+							onToggleExpand={toggleFolderExpand}
+							mentionedDocKeys={mentionedDocKeys}
+							onToggleChatMention={handleToggleChatMention}
+							onToggleFolderSelect={handleToggleFolderSelect}
+							onRenameFolder={handleRenameFolder}
+							onDeleteFolder={handleDeleteFolder}
+							onMoveFolder={handleMoveFolder}
+							onCreateFolder={handleCreateFolder}
+							searchQuery={debouncedSearch.trim() || undefined}
+							onPreviewDocument={(doc) => {
+								openEditorPanel({
+									documentId: doc.id,
+									searchSpaceId,
+									title: doc.title,
+								});
+							}}
+							onEditDocument={(doc) => {
+								openEditorPanel({
+									documentId: doc.id,
+									searchSpaceId,
+									title: doc.title,
+								});
+							}}
+							onDeleteDocument={(doc) => handleDeleteDocument(doc.id)}
+							onMoveDocument={handleMoveDocument}
+							onExportDocument={handleExportDocument}
+							onVersionHistory={(doc) => setVersionDocId(doc.id)}
+							activeTypes={activeTypes}
+							onDropIntoFolder={handleDropIntoFolder}
+							onReorderFolder={handleReorderFolder}
+							watchedFolderIds={watchedFolderIds}
+							onRescanFolder={handleRescanFolder}
+							onStopWatchingFolder={handleStopWatching}
+							onExportFolder={handleExportFolder}
+						/>
+					)}
 				</div>
 			</div>
+		</>
+	);
+
+	const localContent = (
+		<DesktopLocalTabContent
+			localRootPaths={localRootPaths}
+			canAddMoreLocalRoots={canAddMoreLocalRoots}
+			maxLocalFilesystemRoots={MAX_LOCAL_FILESYSTEM_ROOTS}
+			searchSpaceId={searchSpaceId}
+			onPickFilesystemRoot={handlePickFilesystemRoot}
+			onRemoveFilesystemRoot={handleRemoveFilesystemRoot}
+			onClearFilesystemRoots={handleClearFilesystemRoots}
+			onOpenLocalFile={(localFilePath) => {
+				openEditorPanel({
+					kind: "local_file",
+					localFilePath,
+					title: localFilePath.split("/").pop() || localFilePath,
+					searchSpaceId,
+				});
+			}}
+			electronAvailable={!!electronAPI}
+		/>
+	);
+
+	const documentsContent = (
+		<>
+			<div className="shrink-0 flex h-12 items-center px-3 border-b">
+				<div className="flex w-full items-center justify-between">
+					<div className="flex items-center gap-3">
+						{isMobile && (
+							<Button
+								variant="ghost"
+								size="icon"
+								className="h-8 w-8 rounded-full text-muted-foreground hover:text-accent-foreground"
+								onClick={() => onOpenChange(false)}
+							>
+								<ChevronLeft className="h-4 w-4" />
+								<span className="sr-only">{tSidebar("close") || "Close"}</span>
+							</Button>
+						)}
+						<h2 className="select-none text-lg font-semibold">{t("title") || "Documents"}</h2>
+						{showFilesystemTabs && (
+							<Tabs
+								value={currentFilesystemTab}
+								onValueChange={(value) => {
+									void handleFilesystemTabChange(value === "local" ? "local" : "cloud");
+								}}
+							>
+								<TabsList className="h-6 gap-0 rounded-md bg-muted/60 p-0.5 select-none">
+									<TabsTrigger
+										value="cloud"
+										className="h-5 gap-1 px-1.5 text-[11px] select-none focus-visible:ring-0 focus-visible:ring-offset-0 data-[state=active]:bg-muted-foreground/25 data-[state=active]:text-foreground data-[state=active]:shadow-none"
+										title="Cloud"
+									>
+										<Server className="size-3 shrink-0" />
+										<span className="leading-none">Cloud</span>
+									</TabsTrigger>
+									<TabsTrigger
+										value="local"
+										className="h-5 gap-1 px-1.5 text-[11px] select-none focus-visible:ring-0 focus-visible:ring-offset-0 data-[state=active]:bg-muted-foreground/25 data-[state=active]:text-foreground data-[state=active]:shadow-none"
+										title="Local"
+									>
+										<Laptop className="size-3 shrink-0" />
+										<span className="leading-none">Local</span>
+									</TabsTrigger>
+								</TabsList>
+							</Tabs>
+						)}
+					</div>
+					<div className="flex items-center gap-1">
+						{!isMobile && onDockedChange && (
+							<Tooltip>
+								<TooltipTrigger asChild>
+									<Button
+										variant="ghost"
+										size="icon"
+										className="h-8 w-8 rounded-full text-muted-foreground hover:text-accent-foreground"
+										onClick={() => {
+											if (isDocked) {
+												onDockedChange(false);
+												onOpenChange(false);
+											} else {
+												onDockedChange(true);
+											}
+										}}
+									>
+										{isDocked ? (
+											<ChevronLeft className="h-4 w-4" />
+										) : (
+											<ChevronRight className="h-4 w-4" />
+										)}
+										<span className="sr-only">{isDocked ? "Collapse panel" : "Expand panel"}</span>
+									</Button>
+								</TooltipTrigger>
+								<TooltipContent className="z-80">
+									{isDocked ? "Collapse panel" : "Expand panel"}
+								</TooltipContent>
+							</Tooltip>
+						)}
+						{headerAction}
+					</div>
+				</div>
+			</div>
+			{showFilesystemTabs ? (
+				<Tabs
+					value={currentFilesystemTab}
+					onValueChange={(value) => {
+						void handleFilesystemTabChange(value === "local" ? "local" : "cloud");
+					}}
+					className="flex min-h-0 flex-1 flex-col"
+				>
+					<TabsContent value="cloud" className="mt-0 flex min-h-0 flex-1 flex-col">
+						{cloudContent}
+					</TabsContent>
+					<TabsContent value="local" className="mt-0 flex min-h-0 flex-1 flex-col">
+						{currentFilesystemTab === "local" ? localContent : null}
+					</TabsContent>
+				</Tabs>
+			) : (
+				cloudContent
+			)}
 
 			{versionDocId !== null && (
 				<VersionHistoryDialog
@@ -881,6 +1341,48 @@ export function DocumentsSidebar({
 					onSuccess={refreshWatchedIds}
 				/>
 			)}
+			<AlertDialog
+				open={localTrustDialogOpen}
+				onOpenChange={(nextOpen) => {
+					setLocalTrustDialogOpen(nextOpen);
+					if (!nextOpen) setPendingLocalPath(null);
+				}}
+			>
+				<AlertDialogContent className="sm:max-w-md select-none">
+					<AlertDialogHeader>
+						<AlertDialogTitle>Trust this workspace?</AlertDialogTitle>
+						<AlertDialogDescription>
+							Local mode can read and edit files inside the folders you select. Continue only if you
+							trust this workspace and its contents.
+						</AlertDialogDescription>
+						{pendingLocalPath && (
+							<AlertDialogDescription className="mt-1 whitespace-pre-wrap break-words font-mono text-xs">
+								Folder path: {pendingLocalPath}
+							</AlertDialogDescription>
+						)}
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel>Cancel</AlertDialogCancel>
+						<AlertDialogAction
+							onClick={async () => {
+								try {
+									window.localStorage.setItem(LOCAL_FILESYSTEM_TRUST_KEY, "true");
+								} catch {}
+								setLocalTrustDialogOpen(false);
+								const path = pendingLocalPath;
+								setPendingLocalPath(null);
+								if (path) {
+									await applyLocalRootPath(path);
+								} else {
+									await runPickLocalRoot();
+								}
+							}}
+						>
+							I trust this workspace
+						</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
 
 			<FolderPickerDialog
 				open={folderPickerOpen}
@@ -933,6 +1435,50 @@ export function DocumentsSidebar({
 					</AlertDialogFooter>
 				</AlertDialogContent>
 			</AlertDialog>
+
+			<AlertDialog
+				open={exportWarningOpen}
+				onOpenChange={(open) => {
+					if (!open) {
+						setExportWarningOpen(false);
+						setExportWarningContext(null);
+					}
+				}}
+			>
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Some documents are still processing</AlertDialogTitle>
+						<AlertDialogDescription>
+							{exportWarningContext?.pendingCount} document
+							{exportWarningContext?.pendingCount !== 1 ? "s are" : " is"} currently being processed
+							and will be excluded from the export. Do you want to continue?
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel>Cancel</AlertDialogCancel>
+						<AlertDialogAction onClick={handleExportWarningConfirm}>
+							Export anyway
+						</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
+
+			<AlertDialog open={aiSortConfirmOpen} onOpenChange={setAiSortConfirmOpen}>
+				<AlertDialogContent>
+					<AlertDialogHeader>
+						<AlertDialogTitle>Enable AI File Sorting?</AlertDialogTitle>
+						<AlertDialogDescription>
+							All documents in this search space will be organized into folders by connector type,
+							date, and AI-generated categories. New documents will also be sorted automatically.
+							You can disable this at any time.
+						</AlertDialogDescription>
+					</AlertDialogHeader>
+					<AlertDialogFooter>
+						<AlertDialogCancel>Cancel</AlertDialogCancel>
+						<AlertDialogAction onClick={handleConfirmEnableAiSort}>Enable</AlertDialogAction>
+					</AlertDialogFooter>
+				</AlertDialogContent>
+			</AlertDialog>
 		</>
 	);
 
@@ -966,3 +1512,426 @@ export function DocumentsSidebar({
 		</SidebarSlideOutPanel>
 	);
 }
+
+// ---------------------------------------------------------------------------
+// Anonymous Documents Sidebar
+// ---------------------------------------------------------------------------
+
+const ANON_ALLOWED_EXTENSIONS = new Set([
+	".md",
+	".markdown",
+	".txt",
+	".text",
+	".json",
+	".jsonl",
+	".yaml",
+	".yml",
+	".toml",
+	".ini",
+	".cfg",
+	".conf",
+	".xml",
+	".css",
+	".scss",
+	".py",
+	".js",
+	".jsx",
+	".ts",
+	".tsx",
+	".java",
+	".kt",
+	".go",
+	".rs",
+	".rb",
+	".php",
+	".c",
+	".h",
+	".cpp",
+	".hpp",
+	".cs",
+	".swift",
+	".sh",
+	".sql",
+	".log",
+	".rst",
+	".tex",
+	".vue",
+	".svelte",
+	".astro",
+	".tf",
+	".proto",
+	".csv",
+	".tsv",
+	".html",
+	".htm",
+	".xhtml",
+]);
+
+const ANON_ACCEPT = Array.from(ANON_ALLOWED_EXTENSIONS).join(",");
+
+function AnonymousDocumentsSidebar({
+	open,
+	onOpenChange,
+	isDocked = false,
+	onDockedChange,
+	embedded = false,
+	headerAction,
+}: DocumentsSidebarProps) {
+	const t = useTranslations("documents");
+	const tSidebar = useTranslations("sidebar");
+	const isMobile = !useMediaQuery("(min-width: 640px)");
+	const anonMode = useAnonymousMode();
+	const { gate } = useLoginGate();
+
+	const fileInputRef = useRef<HTMLInputElement>(null);
+	const [isUploading, setIsUploading] = useState(false);
+	const [search, setSearch] = useState("");
+
+	const [sidebarDocs, setSidebarDocs] = useAtom(mentionedDocumentsAtom);
+	const mentionedDocKeys = useMemo(
+		() => new Set(sidebarDocs.map((d) => getMentionDocKey(d))),
+		[sidebarDocs]
+	);
+
+	const handleToggleChatMention = useCallback(
+		(doc: { id: number; title: string; document_type: string }, isMentioned: boolean) => {
+			const key = getMentionDocKey({ ...doc, kind: "doc" });
+			if (isMentioned) {
+				setSidebarDocs((prev) => prev.filter((d) => getMentionDocKey(d) !== key));
+			} else {
+				setSidebarDocs((prev) => {
+					if (prev.some((d) => getMentionDocKey(d) === key)) return prev;
+					return [
+						...prev,
+						{
+							id: doc.id,
+							title: doc.title,
+							document_type: doc.document_type as DocumentTypeEnum,
+							kind: "doc",
+						},
+					];
+				});
+			}
+		},
+		[setSidebarDocs]
+	);
+
+	const uploadedDoc = anonMode.isAnonymous ? anonMode.uploadedDoc : null;
+	const hasDoc = uploadedDoc !== null;
+
+	const handleAnonUploadClick = useCallback(() => {
+		if (hasDoc) {
+			gate("upload more documents");
+			return;
+		}
+		fileInputRef.current?.click();
+	}, [hasDoc, gate]);
+
+	const handleFileChange = useCallback(
+		async (e: React.ChangeEvent<HTMLInputElement>) => {
+			const file = e.target.files?.[0];
+			if (!file) return;
+			e.target.value = "";
+
+			const ext = `.${file.name.split(".").pop()?.toLowerCase()}`;
+			if (!ANON_ALLOWED_EXTENSIONS.has(ext)) {
+				gate("upload PDFs, Word documents, images, and more");
+				return;
+			}
+
+			setIsUploading(true);
+			try {
+				const result = await anonymousChatApiService.uploadDocument(file);
+				if (!result.ok) {
+					if (result.reason === "quota_exceeded") gate("upload more documents");
+					return;
+				}
+				const data = result.data;
+				if (anonMode.isAnonymous) {
+					anonMode.setUploadedDoc({
+						filename: data.filename,
+						sizeBytes: data.size_bytes,
+					});
+				}
+				toast.success(`Uploaded "${data.filename}"`);
+			} catch (err) {
+				console.error("Upload failed:", err);
+				toast.error(err instanceof Error ? err.message : "Upload failed");
+			} finally {
+				setIsUploading(false);
+			}
+		},
+		[gate, anonMode]
+	);
+
+	const handleRemoveDoc = useCallback(() => {
+		if (anonMode.isAnonymous) {
+			anonMode.setUploadedDoc(null);
+		}
+	}, [anonMode]);
+
+	const treeDocuments: DocumentNodeDoc[] = useMemo(() => {
+		if (!anonMode.isAnonymous || !anonMode.uploadedDoc) return [];
+		return [
+			{
+				id: -1,
+				title: anonMode.uploadedDoc.filename,
+				document_type: "FILE",
+				folderId: null,
+				status: { state: "ready" } as { state: string; reason?: string | null },
+			},
+		];
+	}, [anonMode]);
+
+	const searchFilteredDocs = useMemo(() => {
+		const q = search.trim().toLowerCase();
+		if (!q) return treeDocuments;
+		return treeDocuments.filter((d) => d.title.toLowerCase().includes(q));
+	}, [treeDocuments, search]);
+
+	useEffect(() => {
+		const handleEscape = (e: KeyboardEvent) => {
+			if (e.key === "Escape" && open && isMobile) {
+				onOpenChange(false);
+			}
+		};
+		document.addEventListener("keydown", handleEscape);
+		return () => document.removeEventListener("keydown", handleEscape);
+	}, [open, onOpenChange, isMobile]);
+
+	const documentsContent = (
+		<>
+			<input
+				ref={fileInputRef}
+				type="file"
+				accept={ANON_ACCEPT}
+				className="hidden"
+				onChange={handleFileChange}
+				disabled={isUploading}
+			/>
+
+			{/* Header */}
+			<div className="shrink-0 flex h-12 items-center px-3 border-b">
+				<div className="flex w-full items-center justify-between">
+					<div className="flex items-center gap-2">
+						<h2 className="select-none text-base font-semibold">{t("title") || "Documents"}</h2>
+					</div>
+					<div className="flex items-center gap-1">
+						{isMobile && (
+							<Button
+								variant="ghost"
+								size="icon"
+								className="h-8 w-8 rounded-full text-muted-foreground hover:text-accent-foreground"
+								onClick={() => onOpenChange(false)}
+							>
+								<X className="h-4 w-4" />
+								<span className="sr-only">{tSidebar("close") || "Close"}</span>
+							</Button>
+						)}
+						{!isMobile && onDockedChange && (
+							<Tooltip>
+								<TooltipTrigger asChild>
+									<Button
+										variant="ghost"
+										size="icon"
+										className="h-8 w-8 rounded-full text-muted-foreground hover:text-accent-foreground"
+										onClick={() => {
+											if (isDocked) {
+												onDockedChange(false);
+												onOpenChange(false);
+											} else {
+												onDockedChange(true);
+											}
+										}}
+									>
+										{isDocked ? (
+											<ChevronLeft className="h-4 w-4" />
+										) : (
+											<ChevronRight className="h-4 w-4" />
+										)}
+										<span className="sr-only">{isDocked ? "Collapse panel" : "Expand panel"}</span>
+									</Button>
+								</TooltipTrigger>
+								<TooltipContent className="z-80">
+									{isDocked ? "Collapse panel" : "Expand panel"}
+								</TooltipContent>
+							</Tooltip>
+						)}
+						{headerAction}
+					</div>
+				</div>
+			</div>
+
+			{/* Connectors strip (gated) */}
+			<Button
+				type="button"
+				variant="ghost"
+				size="sm"
+				onClick={() => gate("connect your data sources")}
+				className="shrink-0 mx-4 mt-6 mb-2.5 h-auto select-none justify-start gap-2 border bg-muted/50 px-3 py-1.5 text-xs text-muted-foreground"
+			>
+				<Unplug className="size-4 shrink-0" />
+				<span className="truncate">Connect your connectors</span>
+				<AvatarGroup className="ml-auto shrink-0">
+					{(isMobile ? SHOWCASE_CONNECTORS.slice(0, 5) : SHOWCASE_CONNECTORS).map(
+						({ type, label }, i) => {
+							const avatar = (
+								<Avatar
+									key={type}
+									className="size-6"
+									style={{ zIndex: SHOWCASE_CONNECTORS.length - i }}
+								>
+									<AvatarFallback className="bg-muted text-[10px] text-muted-foreground">
+										{getConnectorIcon(type, "size-3.5")}
+									</AvatarFallback>
+								</Avatar>
+							);
+							if (isMobile) return avatar;
+							return (
+								<Tooltip key={type}>
+									<TooltipTrigger asChild>{avatar}</TooltipTrigger>
+									<TooltipContent side="top" className="text-xs">
+										{label}
+									</TooltipContent>
+								</Tooltip>
+							);
+						}
+					)}
+				</AvatarGroup>
+			</Button>
+
+			{/* Filters & upload */}
+			<div className="flex-1 min-h-0 pt-0 flex flex-col">
+				<div className="px-4 pb-1.5">
+					<DocumentsFilters
+						typeCounts={hasDoc ? { FILE: 1 } : {}}
+						onSearch={setSearch}
+						searchValue={search}
+						onToggleType={() => {}}
+						activeTypes={[]}
+						onCreateFolder={() => gate("create folders")}
+						aiSortEnabled={false}
+						onUploadClick={handleAnonUploadClick}
+					/>
+				</div>
+
+				<div className="relative flex-1 min-h-0 overflow-auto">
+					<FolderTreeView
+						folders={[]}
+						documents={searchFilteredDocs}
+						expandedIds={new Set()}
+						onToggleExpand={() => {}}
+						mentionedDocKeys={mentionedDocKeys}
+						onToggleChatMention={handleToggleChatMention}
+						onToggleFolderSelect={() => {}}
+						onRenameFolder={() => gate("rename folders")}
+						onDeleteFolder={() => gate("delete folders")}
+						onMoveFolder={() => gate("organize folders")}
+						onCreateFolder={() => gate("create folders")}
+						searchQuery={search.trim() || undefined}
+						onPreviewDocument={() => gate("preview documents")}
+						onEditDocument={() => gate("edit documents")}
+						onDeleteDocument={async () => {
+							handleRemoveDoc();
+							setSidebarDocs((prev) => prev.filter((d) => d.id !== -1));
+							return true;
+						}}
+						onMoveDocument={() => gate("organize documents")}
+						onExportDocument={() => gate("export documents")}
+						onVersionHistory={() => gate("view version history")}
+						activeTypes={[]}
+						onDropIntoFolder={async () => gate("organize documents")}
+						onReorderFolder={async () => gate("organize folders")}
+						watchedFolderIds={new Set()}
+						onRescanFolder={() => gate("watch local folders")}
+						onStopWatchingFolder={() => gate("watch local folders")}
+						onExportFolder={() => gate("export folders")}
+					/>
+
+					{!hasDoc && (
+						<div className="px-4 py-8 text-center">
+							<Button
+								type="button"
+								variant="ghost"
+								onClick={handleAnonUploadClick}
+								disabled={isUploading}
+								className="relative h-auto w-full border-2 border-dashed border-primary/30 px-4 py-6 text-sm text-primary hover:border-primary/60 hover:bg-primary/5 hover:text-primary cursor-pointer"
+							>
+								<span className={`flex items-center gap-2 ${isUploading ? "opacity-0" : ""}`}>
+									<Upload className="size-4" />
+									Upload a document
+								</span>
+								{isUploading && <Spinner size="sm" className="absolute" />}
+							</Button>
+							<p className="mt-2 text-[11px] text-muted-foreground leading-relaxed">
+								Text, code, CSV, and HTML files only. Create an account for PDFs, images, and 30+
+								connectors.
+							</p>
+						</div>
+					)}
+				</div>
+			</div>
+
+			{/* CTA footer */}
+			<div className="border-t p-4 space-y-3">
+				<div className="flex items-center gap-2 text-xs text-muted-foreground">
+					<Lock className="size-3.5 shrink-0" />
+					<span>Create an account to unlock:</span>
+				</div>
+				<ul className="space-y-1.5 text-xs text-muted-foreground pl-5">
+					<li className="flex items-center gap-1.5">
+						<Paperclip className="size-3 shrink-0" /> PDF, Word, images, audio uploads
+					</li>
+					<li className="flex items-center gap-1.5">
+						<FileText className="size-3 shrink-0" /> Unlimited documents
+					</li>
+				</ul>
+				<Button size="sm" className="w-full" asChild>
+					<Link href="/register">Create Free Account</Link>
+				</Button>
+			</div>
+		</>
+	);
+
+	if (embedded) {
+		return (
+			<div className="flex h-full flex-col bg-sidebar text-sidebar-foreground">
+				{documentsContent}
+			</div>
+		);
+	}
+
+	if (isDocked && open && !isMobile) {
+		return (
+			<aside
+				className="h-full w-[380px] shrink-0 bg-sidebar text-sidebar-foreground flex flex-col border-r"
+				aria-label={t("title") || "Documents"}
+			>
+				{documentsContent}
+			</aside>
+		);
+	}
+
+	if (isMobile) {
+		return (
+			<Drawer open={open} onOpenChange={onOpenChange}>
+				<DrawerContent className="max-h-[75vh] flex flex-col">
+					<DrawerTitle className="sr-only">{t("title") || "Documents"}</DrawerTitle>
+					<DrawerHandle />
+					<div className="flex-1 min-h-0 flex flex-col overflow-hidden">{documentsContent}</div>
+				</DrawerContent>
+			</Drawer>
+		);
+	}
+
+	return (
+		<SidebarSlideOutPanel
+			open={open}
+			onOpenChange={onOpenChange}
+			ariaLabel={t("title") || "Documents"}
+			width={380}
+		>
+			{documentsContent}
+		</SidebarSlideOutPanel>
+	);
+}
diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
index 371f3dc6d8..f757db70e7 100644
--- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
@@ -14,7 +14,7 @@ import {
 	Inbox,
 	LayoutGrid,
 	ListFilter,
-	MessageSquare,
+	MessageCircleReply,
 	Search,
 	X,
 } from "lucide-react";
@@ -22,8 +22,6 @@ import { useParams, useRouter } from "next/navigation";
 import { useTranslations } from "next-intl";
 import { useCallback, useDeferredValue, useEffect, useMemo, useRef, useState } from "react";
 import { setTargetCommentIdAtom } from "@/atoms/chat/current-thread.atom";
-import { convertRenderedToDisplay } from "@/components/chat-comments/comment-item/comment-item";
-import { getDocumentTypeLabel } from "@/components/documents/DocumentTypeIcon";
 import { Tabs, TabsList, TabsTrigger } from "@/components/ui/animated-tabs";
 import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar";
 import { Button } from "@/components/ui/button";
@@ -57,6 +55,8 @@ import { useDebouncedValue } from "@/hooks/use-debounced-value";
 import type { InboxItem } from "@/hooks/use-inbox";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { notificationsApiService } from "@/lib/apis/notifications-api.service";
+import { convertRenderedToDisplay } from "@/lib/comments/utils";
+import { getDocumentTypeLabel } from "@/lib/documents/document-type-labels";
 import { cacheKeys } from "@/lib/query-client/cache-keys";
 import { cn } from "@/lib/utils";
 import { SidebarSlideOutPanel } from "./SidebarSlideOutPanel";
@@ -178,7 +178,7 @@ export function InboxSidebarContent({
 	const [mounted, setMounted] = useState(false);
 	const [openDropdown, setOpenDropdown] = useState<"filter" | null>(null);
 	const [connectorScrollPos, setConnectorScrollPos] = useState<"top" | "middle" | "bottom">("top");
-	const connectorRafRef = useRef<number>();
+	const connectorRafRef = useRef<number | null>(null);
 	const handleConnectorScroll = useCallback((e: React.UIEvent<HTMLDivElement>) => {
 		const el = e.currentTarget;
 		if (connectorRafRef.current) return;
@@ -186,7 +186,7 @@ export function InboxSidebarContent({
 			const atTop = el.scrollTop <= 2;
 			const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2;
 			setConnectorScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle");
-			connectorRafRef.current = undefined;
+			connectorRafRef.current = null;
 		});
 	}, []);
 	useEffect(
@@ -528,17 +528,17 @@ export function InboxSidebarContent({
 
 	return (
 		<>
-			<div className="shrink-0 p-4 pb-2 space-y-3">
+			<div className="shrink-0 p-3 pb-1.5 space-y-2">
 				<div className="flex items-center justify-between">
 					<div className="flex items-center gap-2">
 						{isMobile && (
 							<Button
 								variant="ghost"
 								size="icon"
-								className="h-8 w-8 rounded-full"
+								className="h-8 w-8 rounded-full text-muted-foreground hover:text-accent-foreground"
 								onClick={() => onOpenChange(false)}
 							>
-								<ChevronLeft className="h-4 w-4 text-muted-foreground" />
+								<ChevronLeft className="h-4 w-4" />
 								<span className="sr-only">{t("close") || "Close"}</span>
 							</Button>
 						)}
@@ -561,7 +561,10 @@ export function InboxSidebarContent({
 									onOpenChange={setFilterDrawerOpen}
 									shouldScaleBackground={false}
 								>
-									<DrawerContent className="max-h-[70vh] z-80" overlayClassName="z-80">
+									<DrawerContent
+										className="max-h-[70vh] z-80 bg-popover text-popover-foreground"
+										overlayClassName="z-80"
+									>
 										<DrawerHandle />
 										<DrawerHeader className="px-4 pb-3 pt-2">
 											<DrawerTitle className="flex items-center gap-2 text-base font-semibold">
@@ -575,14 +578,15 @@ export function InboxSidebarContent({
 													{t("filter") || "Filter"}
 												</p>
 												<div className="space-y-1">
-													<button
+													<Button
 														type="button"
+														variant="ghost"
 														onClick={() => {
 															setActiveFilter("all");
 															setFilterDrawerOpen(false);
 														}}
 														className={cn(
-															"flex w-full items-center justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
+															"h-auto w-full justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
 															activeFilter === "all"
 																? "bg-primary/10 text-primary"
 																: "hover:bg-muted"
@@ -593,15 +597,16 @@ export function InboxSidebarContent({
 															<span>{t("all") || "All"}</span>
 														</span>
 														{activeFilter === "all" && <Check className="h-4 w-4" />}
-													</button>
-													<button
+													</Button>
+													<Button
 														type="button"
+														variant="ghost"
 														onClick={() => {
 															setActiveFilter("unread");
 															setFilterDrawerOpen(false);
 														}}
 														className={cn(
-															"flex w-full items-center justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
+															"h-auto w-full justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
 															activeFilter === "unread"
 																? "bg-primary/10 text-primary"
 																: "hover:bg-muted"
@@ -612,16 +617,17 @@ export function InboxSidebarContent({
 															<span>{t("unread") || "Unread"}</span>
 														</span>
 														{activeFilter === "unread" && <Check className="h-4 w-4" />}
-													</button>
+													</Button>
 													{activeTab === "status" && (
-														<button
+														<Button
 															type="button"
+															variant="ghost"
 															onClick={() => {
 																setActiveFilter("errors");
 																setFilterDrawerOpen(false);
 															}}
 															className={cn(
-																"flex w-full items-center justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
+																"h-auto w-full justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
 																activeFilter === "errors"
 																	? "bg-primary/10 text-primary"
 																	: "hover:bg-muted"
@@ -632,7 +638,7 @@ export function InboxSidebarContent({
 																<span>{t("errors_only") || "Errors only"}</span>
 															</span>
 															{activeFilter === "errors" && <Check className="h-4 w-4" />}
-														</button>
+														</Button>
 													)}
 												</div>
 											</div>
@@ -642,14 +648,15 @@ export function InboxSidebarContent({
 														{t("sources") || "Sources"}
 													</p>
 													<div className="space-y-1">
-														<button
+														<Button
 															type="button"
+															variant="ghost"
 															onClick={() => {
 																setSelectedSource(null);
 																setFilterDrawerOpen(false);
 															}}
 															className={cn(
-																"flex w-full items-center justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
+																"h-auto w-full justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
 																selectedSource === null
 																	? "bg-primary/10 text-primary"
 																	: "hover:bg-muted"
@@ -660,17 +667,18 @@ export function InboxSidebarContent({
 																<span>{t("all_sources") || "All sources"}</span>
 															</span>
 															{selectedSource === null && <Check className="h-4 w-4" />}
-														</button>
+														</Button>
 														{statusSourceOptions.map((source) => (
-															<button
+															<Button
 																key={source.key}
 																type="button"
+																variant="ghost"
 																onClick={() => {
 																	setSelectedSource(source.key);
 																	setFilterDrawerOpen(false);
 																}}
 																className={cn(
-																	"flex w-full items-center justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
+																	"h-auto w-full justify-between rounded-lg px-3 py-2.5 text-sm transition-colors",
 																	selectedSource === source.key
 																		? "bg-primary/10 text-primary"
 																		: "hover:bg-muted"
@@ -681,7 +689,7 @@ export function InboxSidebarContent({
 																	<span>{source.displayName}</span>
 																</span>
 																{selectedSource === source.key && <Check className="h-4 w-4" />}
-															</button>
+															</Button>
 														))}
 													</div>
 												</div>
@@ -811,19 +819,19 @@ export function InboxSidebarContent({
 				</div>
 
 				<div className="relative">
-					<Search className="absolute left-3 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
+					<Search className="absolute left-2.5 top-1/2 -translate-y-1/2 h-3.5 w-3.5 text-muted-foreground" />
 					<Input
 						type="text"
 						placeholder={t("search_inbox") || "Search inbox"}
 						value={searchQuery}
 						onChange={(e) => setSearchQuery(e.target.value)}
-						className="pl-9 pr-8 h-9"
+						className="h-8 border-0 bg-muted pl-8 pr-7 text-sm shadow-none"
 					/>
 					{searchQuery && (
 						<Button
 							variant="ghost"
 							size="icon"
-							className="absolute right-1 top-1/2 -translate-y-1/2 h-6 w-6"
+							className="absolute right-1 top-1/2 h-5 w-5 -translate-y-1/2 rounded-sm text-muted-foreground transition-colors hover:bg-accent hover:text-accent-foreground"
 							onClick={handleClearSearch}
 						>
 							<X className="h-3.5 w-3.5" />
@@ -842,23 +850,23 @@ export function InboxSidebarContent({
 						setActiveFilter("all");
 					}
 				}}
-				className="shrink-0 mx-4 mt-2"
+				className="shrink-0 mx-3 mt-1.5"
 			>
 				<TabsList stretch showBottomBorder size="sm">
 					<TabsTrigger value="comments">
 						<span className="inline-flex items-center gap-1.5">
-							<MessageSquare className="h-4 w-4" />
+							<MessageCircleReply className="h-3.5 w-3.5" />
 							<span>{t("comments") || "Comments"}</span>
-							<span className="inline-flex items-center justify-center min-w-5 h-5 px-1.5 rounded-full bg-primary/20 text-muted-foreground text-xs font-medium">
+							<span className="inline-flex h-4.5 min-w-4.5 items-center justify-center rounded-full bg-primary/20 px-1 text-[10px] font-medium text-muted-foreground">
 								{formatInboxCount(comments.unreadCount)}
 							</span>
 						</span>
 					</TabsTrigger>
 					<TabsTrigger value="status">
 						<span className="inline-flex items-center gap-1.5">
-							<History className="h-4 w-4" />
+							<History className="h-3.5 w-3.5" />
 							<span>{t("status") || "Status"}</span>
-							<span className="inline-flex items-center justify-center min-w-5 h-5 px-1.5 rounded-full bg-primary/20 text-muted-foreground text-xs font-medium">
+							<span className="inline-flex h-4.5 min-w-4.5 items-center justify-center rounded-full bg-primary/20 px-1 text-[10px] font-medium text-muted-foreground">
 								{formatInboxCount(status.unreadCount)}
 							</span>
 						</span>
@@ -922,11 +930,12 @@ export function InboxSidebarContent({
 									{activeTab === "status" ? (
 										<Tooltip delayDuration={600}>
 											<TooltipTrigger asChild>
-												<button
+												<Button
 													type="button"
+													variant="ghost"
 													onClick={() => handleItemClick(item)}
 													disabled={isMarkingAsRead}
-													className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
+													className="h-auto flex-1 justify-start gap-3 overflow-hidden bg-transparent p-0 text-left hover:bg-transparent"
 												>
 													<div className="shrink-0">{getStatusIcon(item)}</div>
 													<div className="flex-1 min-w-0 overflow-hidden">
@@ -942,7 +951,7 @@ export function InboxSidebarContent({
 															{convertRenderedToDisplay(item.message)}
 														</p>
 													</div>
-												</button>
+												</Button>
 											</TooltipTrigger>
 											<TooltipContent side="bottom" align="start" className="max-w-[250px]">
 												<p className="font-medium">{item.title}</p>
@@ -952,11 +961,12 @@ export function InboxSidebarContent({
 											</TooltipContent>
 										</Tooltip>
 									) : (
-										<button
+										<Button
 											type="button"
+											variant="ghost"
 											onClick={() => handleItemClick(item)}
 											disabled={isMarkingAsRead}
-											className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
+											className="h-auto flex-1 justify-start gap-3 overflow-hidden bg-transparent p-0 text-left hover:bg-transparent"
 										>
 											<div className="shrink-0">{getStatusIcon(item)}</div>
 											<div className="flex-1 min-w-0 overflow-hidden">
@@ -972,7 +982,7 @@ export function InboxSidebarContent({
 													{convertRenderedToDisplay(item.message)}
 												</p>
 											</div>
-										</button>
+										</Button>
 									)}
 
 									<div className="flex items-center justify-end gap-1.5 shrink-0 w-10">
@@ -1021,23 +1031,25 @@ export function InboxSidebarContent({
 					</div>
 				) : isSearchMode ? (
 					<div className="text-center py-8">
-						<Search className="h-12 w-12 mx-auto text-muted-foreground mb-3" />
-						<p className="text-sm text-muted-foreground">
+						<Search className="mx-auto mb-2.5 h-10 w-10 text-muted-foreground" />
+						<p className="text-xs text-muted-foreground">
 							{t("no_results_found") || "No results found"}
 						</p>
-						<p className="text-xs text-muted-foreground/70 mt-1">
+						<p className="mt-1 text-[11px] text-muted-foreground/70">
 							{t("try_different_search") || "Try a different search term"}
 						</p>
 					</div>
 				) : (
 					<div className="text-center py-8">
 						{activeTab === "comments" ? (
-							<MessageSquare className="h-12 w-12 mx-auto text-muted-foreground mb-3" />
+							<MessageCircleReply className="mx-auto mb-2.5 h-10 w-10 text-muted-foreground" />
 						) : (
-							<History className="h-12 w-12 mx-auto text-muted-foreground mb-3" />
+							<History className="mx-auto mb-2.5 h-10 w-10 text-muted-foreground" />
 						)}
-						<p className="text-sm text-muted-foreground">{getEmptyStateMessage().title}</p>
-						<p className="text-xs text-muted-foreground/70 mt-1">{getEmptyStateMessage().hint}</p>
+						<p className="text-xs text-muted-foreground">{getEmptyStateMessage().title}</p>
+						<p className="mt-1 text-[11px] text-muted-foreground/70">
+							{getEmptyStateMessage().hint}
+						</p>
 					</div>
 				)}
 			</div>
diff --git a/surfsense_web/components/layout/ui/sidebar/LocalFilesystemBrowser.tsx b/surfsense_web/components/layout/ui/sidebar/LocalFilesystemBrowser.tsx
new file mode 100644
index 0000000000..aa773f3e18
--- /dev/null
+++ b/surfsense_web/components/layout/ui/sidebar/LocalFilesystemBrowser.tsx
@@ -0,0 +1,586 @@
+"use client";
+
+import { ChevronDown, ChevronRight, FileText, Folder, FolderOpen } from "lucide-react";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { DEFAULT_EXCLUDE_PATTERNS } from "@/components/sources/FolderWatchDialog";
+import { Button } from "@/components/ui/button";
+import { Skeleton } from "@/components/ui/skeleton";
+import { Spinner } from "@/components/ui/spinner";
+import { useElectronAPI } from "@/hooks/use-platform";
+import { cn } from "@/lib/utils";
+
+interface LocalFilesystemBrowserProps {
+	rootPaths: string[];
+	searchSpaceId: number;
+	active?: boolean;
+	searchQuery?: string;
+	onOpenFile: (fullPath: string) => void;
+	expandedFolderKeys?: Set<string>;
+	onExpandedFolderKeysChange?: (nextExpandedKeys: Set<string>) => void;
+}
+
+interface LocalFolderFileEntry {
+	relativePath: string;
+	fullPath: string;
+	size: number;
+	mtimeMs: number;
+}
+
+type RootLoadState = {
+	loading: boolean;
+	error: string | null;
+	files: LocalFolderFileEntry[];
+};
+
+interface LocalFolderNode {
+	key: string;
+	name: string;
+	folders: Map<string, LocalFolderNode>;
+	files: LocalFolderFileEntry[];
+}
+
+type LocalRootMount = {
+	mount: string;
+	rootPath: string;
+};
+
+type MountLoadStatus = "idle" | "loading" | "complete" | "error";
+
+const LOCAL_OPENABLE_EXTENSIONS = [
+	".md",
+	".markdown",
+	".txt",
+	".json",
+	".yaml",
+	".yml",
+	".csv",
+	".tsv",
+	".xml",
+	".html",
+	".htm",
+	".css",
+	".scss",
+	".sass",
+	".sql",
+	".toml",
+	".ini",
+	".conf",
+	".log",
+	".py",
+	".js",
+	".jsx",
+	".mjs",
+	".cjs",
+	".ts",
+	".tsx",
+	".java",
+	".kt",
+	".kts",
+	".go",
+	".rs",
+	".rb",
+	".php",
+	".swift",
+	".r",
+	".lua",
+	".sh",
+	".bash",
+	".zsh",
+	".fish",
+	".env",
+	".mk",
+];
+
+const getFolderDisplayName = (rootPath: string): string =>
+	rootPath.split(/[\\/]/).at(-1) || rootPath;
+
+function createFolderNode(key: string, name: string): LocalFolderNode {
+	return {
+		key,
+		name,
+		folders: new Map(),
+		files: [],
+	};
+}
+
+function getFileName(pathValue: string): string {
+	return pathValue.split(/[\\/]/).at(-1) || pathValue;
+}
+
+function toVirtualPath(relativePath: string): string {
+	const normalized = relativePath.replace(/\\/g, "/").replace(/^\/+/, "");
+	return `/${normalized}`;
+}
+
+function normalizeRootPathForLookup(rootPath: string, isWindows: boolean): string {
+	const normalized = rootPath.replace(/\\/g, "/").replace(/\/+$/, "");
+	return isWindows ? normalized.toLowerCase() : normalized;
+}
+
+function toMountedVirtualPath(mount: string, relativePath: string): string {
+	return `/${mount}${toVirtualPath(relativePath)}`;
+}
+
+function getNormalizedExtension(pathValue: string): string {
+	const fileName = getFileName(pathValue).toLowerCase();
+	if (!fileName) return "";
+	if (fileName === "dockerfile" || fileName === "makefile") {
+		return `.${fileName}`;
+	}
+	const dotIndex = fileName.lastIndexOf(".");
+	if (dotIndex <= 0) return "";
+	return fileName.slice(dotIndex);
+}
+
+export function LocalFilesystemBrowser({
+	rootPaths,
+	searchSpaceId,
+	active = true,
+	searchQuery,
+	onOpenFile,
+	expandedFolderKeys,
+	onExpandedFolderKeysChange,
+}: LocalFilesystemBrowserProps) {
+	const electronAPI = useElectronAPI();
+	const [rootStateMap, setRootStateMap] = useState<Record<string, RootLoadState>>({});
+	const [internalExpandedFolderKeys, setInternalExpandedFolderKeys] = useState<Set<string>>(
+		new Set()
+	);
+	const [mountByRootKey, setMountByRootKey] = useState<Map<string, string>>(new Map());
+	const [mountStatus, setMountStatus] = useState<MountLoadStatus>("idle");
+	const [mountRefreshInFlight, setMountRefreshInFlight] = useState(false);
+	const [reloadNonceByRoot, setReloadNonceByRoot] = useState<Record<string, number>>({});
+	const lastLoadedSignatureByRootRef = useRef<Map<string, string>>(new Map());
+	const hasLoadedMountsOnceRef = useRef(false);
+	const hasResolvedAtLeastOneRootRef = useRef(false);
+	const openableExtensions = useMemo(() => new Set(LOCAL_OPENABLE_EXTENSIONS), []);
+	const isWindowsPlatform = electronAPI?.versions.platform === "win32";
+	const effectiveExpandedFolderKeys = expandedFolderKeys ?? internalExpandedFolderKeys;
+
+	useEffect(() => {
+		if (!active) return;
+		if (!electronAPI?.listAgentFilesystemFiles) {
+			for (const rootPath of rootPaths) {
+				setRootStateMap((prev) => ({
+					...prev,
+					[rootPath]: {
+						loading: false,
+						error: "Desktop app update required for local mode browsing.",
+						files: [],
+					},
+				}));
+			}
+			return;
+		}
+		const rootEntries = rootPaths.map((rootPath) => ({
+			rootPath,
+			rootKey: normalizeRootPathForLookup(rootPath, isWindowsPlatform),
+		}));
+		const activeRootKeys = new Set(rootEntries.map((entry) => entry.rootKey));
+		for (const key of Array.from(lastLoadedSignatureByRootRef.current.keys())) {
+			if (!activeRootKeys.has(key)) {
+				lastLoadedSignatureByRootRef.current.delete(key);
+			}
+		}
+		const rootsToReload = rootEntries.filter(({ rootKey }) => {
+			const nonce = reloadNonceByRoot[rootKey] ?? 0;
+			const signature = `${searchSpaceId}:${rootKey}:${nonce}`;
+			return lastLoadedSignatureByRootRef.current.get(rootKey) !== signature;
+		});
+		if (rootsToReload.length === 0) {
+			return;
+		}
+		for (const { rootKey } of rootsToReload) {
+			const nonce = reloadNonceByRoot[rootKey] ?? 0;
+			lastLoadedSignatureByRootRef.current.set(rootKey, `${searchSpaceId}:${rootKey}:${nonce}`);
+		}
+		let cancelled = false;
+
+		for (const { rootPath } of rootsToReload) {
+			setRootStateMap((prev) => ({
+				...prev,
+				[rootPath]: {
+					loading: true,
+					error: null,
+					files: prev[rootPath]?.files ?? [],
+				},
+			}));
+		}
+
+		void Promise.all(
+			rootsToReload.map(async ({ rootPath }) => {
+				try {
+					const files = (await electronAPI.listAgentFilesystemFiles({
+						rootPath,
+						searchSpaceId,
+						excludePatterns: DEFAULT_EXCLUDE_PATTERNS,
+					})) as LocalFolderFileEntry[];
+					if (cancelled) return;
+					setRootStateMap((prev) => ({
+						...prev,
+						[rootPath]: {
+							loading: false,
+							error: null,
+							files,
+						},
+					}));
+				} catch (error) {
+					if (cancelled) return;
+					setRootStateMap((prev) => ({
+						...prev,
+						[rootPath]: {
+							loading: false,
+							error: error instanceof Error ? error.message : "Failed to read folder",
+							files: [],
+						},
+					}));
+				}
+			})
+		);
+
+		return () => {
+			cancelled = true;
+		};
+	}, [active, electronAPI, isWindowsPlatform, reloadNonceByRoot, rootPaths, searchSpaceId]);
+
+	useEffect(() => {
+		if (active) return;
+		lastLoadedSignatureByRootRef.current.clear();
+	}, [active]);
+
+	useEffect(() => {
+		if (!electronAPI?.startAgentFilesystemTreeWatch) return;
+		if (!electronAPI?.stopAgentFilesystemTreeWatch) return;
+		if (!electronAPI?.onAgentFilesystemTreeDirty) return;
+		if (!active) return;
+		if (rootPaths.length === 0) {
+			void electronAPI.stopAgentFilesystemTreeWatch(searchSpaceId);
+			return;
+		}
+
+		const unsubscribe = electronAPI.onAgentFilesystemTreeDirty(
+			(event: {
+				searchSpaceId: number | null;
+				reason: "watcher_event" | "safety_poll";
+				rootPath: string;
+				changedPath: string | null;
+				timestamp: number;
+			}) => {
+				if ((event.searchSpaceId ?? null) !== (searchSpaceId ?? null)) {
+					return;
+				}
+				const eventRootKey = normalizeRootPathForLookup(event.rootPath, isWindowsPlatform);
+				const knownRootKeys = new Set(
+					rootPaths.map((rootPath) => normalizeRootPathForLookup(rootPath, isWindowsPlatform))
+				);
+				if (!knownRootKeys.has(eventRootKey)) {
+					setReloadNonceByRoot((prev) => {
+						const next = { ...prev };
+						for (const rootKey of knownRootKeys) {
+							next[rootKey] = (prev[rootKey] ?? 0) + 1;
+						}
+						return next;
+					});
+					return;
+				}
+				setReloadNonceByRoot((prev) => ({
+					...prev,
+					[eventRootKey]: (prev[eventRootKey] ?? 0) + 1,
+				}));
+			}
+		);
+		void electronAPI.startAgentFilesystemTreeWatch({
+			searchSpaceId,
+			rootPaths,
+			excludePatterns: DEFAULT_EXCLUDE_PATTERNS,
+		});
+
+		return () => {
+			unsubscribe();
+			void electronAPI.stopAgentFilesystemTreeWatch(searchSpaceId);
+		};
+	}, [active, electronAPI, isWindowsPlatform, rootPaths, searchSpaceId]);
+
+	useEffect(() => {
+		if (!electronAPI?.getAgentFilesystemMounts) {
+			setMountStatus("error");
+			setMountByRootKey(new Map());
+			return;
+		}
+		if (rootPaths.length === 0) {
+			setMountByRootKey(new Map());
+			setMountStatus("complete");
+			setMountRefreshInFlight(false);
+			hasLoadedMountsOnceRef.current = true;
+			return;
+		}
+		let cancelled = false;
+		const isInitialMountLoad = !hasLoadedMountsOnceRef.current;
+		if (isInitialMountLoad) {
+			setMountStatus("loading");
+		} else {
+			setMountRefreshInFlight(true);
+		}
+		void electronAPI
+			.getAgentFilesystemMounts(searchSpaceId)
+			.then((mounts: LocalRootMount[]) => {
+				if (cancelled) return;
+				const next = new Map<string, string>();
+				for (const entry of mounts) {
+					const normalizedRootKey = normalizeRootPathForLookup(entry.rootPath, isWindowsPlatform);
+					next.set(normalizedRootKey, entry.mount);
+				}
+				setMountByRootKey(next);
+				setMountStatus("complete");
+				hasLoadedMountsOnceRef.current = true;
+			})
+			.catch(() => {
+				if (cancelled) return;
+				if (isInitialMountLoad) {
+					setMountByRootKey(new Map());
+					setMountStatus("error");
+				}
+			})
+			.finally(() => {
+				if (cancelled) return;
+				setMountRefreshInFlight(false);
+			});
+		return () => {
+			cancelled = true;
+		};
+	}, [electronAPI, isWindowsPlatform, rootPaths, searchSpaceId]);
+
+	const treeByRoot = useMemo(() => {
+		const query = searchQuery?.trim().toLowerCase() ?? "";
+		const hasQuery = query.length > 0;
+
+		return rootPaths.map((rootPath) => {
+			const rootNode = createFolderNode(rootPath, getFolderDisplayName(rootPath));
+			const allFiles = rootStateMap[rootPath]?.files ?? [];
+			const files = hasQuery
+				? allFiles.filter((file) => {
+						const relativePath = file.relativePath.toLowerCase();
+						const fileName = getFileName(file.relativePath).toLowerCase();
+						return relativePath.includes(query) || fileName.includes(query);
+					})
+				: allFiles;
+			for (const file of files) {
+				const parts = file.relativePath.split(/[\\/]/).filter(Boolean);
+				let cursor = rootNode;
+				for (let i = 0; i < parts.length - 1; i++) {
+					const part = parts[i];
+					const folderKey = `${cursor.key}/${part}`;
+					if (!cursor.folders.has(part)) {
+						cursor.folders.set(part, createFolderNode(folderKey, part));
+					}
+					cursor = cursor.folders.get(part) as LocalFolderNode;
+				}
+				cursor.files.push(file);
+			}
+			return { rootPath, rootNode, matchCount: files.length, totalCount: allFiles.length };
+		});
+	}, [rootPaths, rootStateMap, searchQuery]);
+
+	const toggleFolder = useCallback(
+		(folderKey: string) => {
+			const update = (prev: Set<string>) => {
+				const next = new Set(prev);
+				if (next.has(folderKey)) {
+					next.delete(folderKey);
+				} else {
+					next.add(folderKey);
+				}
+				return next;
+			};
+			if (onExpandedFolderKeysChange) {
+				onExpandedFolderKeysChange(update(effectiveExpandedFolderKeys));
+				return;
+			}
+			setInternalExpandedFolderKeys(update);
+		},
+		[effectiveExpandedFolderKeys, onExpandedFolderKeysChange]
+	);
+
+	const renderFolder = useCallback(
+		(folder: LocalFolderNode, depth: number, mount: string) => {
+			const isExpanded = effectiveExpandedFolderKeys.has(folder.key);
+			const FolderIcon = isExpanded ? FolderOpen : Folder;
+			const childFolders = Array.from(folder.folders.values()).sort((a, b) =>
+				a.name.localeCompare(b.name)
+			);
+			const files = [...folder.files].sort((a, b) => a.relativePath.localeCompare(b.relativePath));
+			return (
+				<div key={folder.key} className="select-none">
+					<Button
+						type="button"
+						variant="ghost"
+						onClick={() => toggleFolder(folder.key)}
+						className="h-8 w-full justify-start gap-1.5 px-2 text-left text-sm font-normal hover:bg-accent hover:text-accent-foreground"
+						style={{ paddingInlineStart: `${depth * 12 + 8}px` }}
+						draggable={false}
+					>
+						{isExpanded ? (
+							<ChevronDown className="size-3.5 shrink-0 text-muted-foreground" />
+						) : (
+							<ChevronRight className="size-3.5 shrink-0 text-muted-foreground" />
+						)}
+						<FolderIcon className="size-3.5 shrink-0 text-muted-foreground" />
+						<span className="truncate">{folder.name}</span>
+					</Button>
+					{isExpanded && (
+						<>
+							{childFolders.map((childFolder) => renderFolder(childFolder, depth + 1, mount))}
+							{files.map((file) => {
+								const extension = getNormalizedExtension(file.relativePath);
+								const isOpenable = openableExtensions.has(extension);
+								return (
+									<Button
+										key={file.fullPath}
+										type="button"
+										variant="ghost"
+										onClick={
+											isOpenable
+												? () => onOpenFile(toMountedVirtualPath(mount, file.relativePath))
+												: undefined
+										}
+										className={cn(
+											"h-8 w-full justify-start gap-1.5 px-2 text-left text-sm font-normal",
+											isOpenable
+												? "hover:bg-accent hover:text-accent-foreground"
+												: "cursor-not-allowed opacity-60"
+										)}
+										style={{ paddingInlineStart: `${(depth + 1) * 12 + 22}px` }}
+										title={
+											isOpenable
+												? file.fullPath
+												: `${file.fullPath}\nThis file type cannot be opened in the editor.`
+										}
+										draggable={false}
+										disabled={!isOpenable}
+									>
+										<FileText className="size-3.5 shrink-0 text-muted-foreground" />
+										<span className="truncate">{getFileName(file.relativePath)}</span>
+									</Button>
+								);
+							})}
+						</>
+					)}
+				</div>
+			);
+		},
+		[effectiveExpandedFolderKeys, onOpenFile, openableExtensions, toggleFolder]
+	);
+
+	if (rootPaths.length === 0) {
+		return (
+			<div className="flex flex-1 flex-col items-center justify-center gap-2 px-4 py-10 text-center text-muted-foreground">
+				<p className="text-sm font-medium">No local folder selected</p>
+				<p className="text-xs text-muted-foreground/80">
+					Add a local folder above to browse files in desktop mode.
+				</p>
+			</div>
+		);
+	}
+
+	const allRootsLoaded = rootPaths.every((rootPath) => {
+		const state = rootStateMap[rootPath];
+		return !!state && !state.loading;
+	});
+	const mountsSettled = mountStatus === "complete" || mountStatus === "error";
+	if (allRootsLoaded && mountsSettled && rootPaths.length > 0) {
+		hasResolvedAtLeastOneRootRef.current = true;
+	}
+	const showInitialLoading =
+		!hasResolvedAtLeastOneRootRef.current && (!allRootsLoaded || !mountsSettled);
+
+	if (showInitialLoading) {
+		const rows = [
+			{ id: "local-row-1", widthClass: "w-44" },
+			{ id: "local-row-2", widthClass: "w-32" },
+			{ id: "local-row-3", widthClass: "w-32" },
+			{ id: "local-row-4", widthClass: "w-44" },
+			{ id: "local-row-5", widthClass: "w-32" },
+			{ id: "local-row-6", widthClass: "w-32" },
+			{ id: "local-row-7", widthClass: "w-44" },
+			{ id: "local-row-8", widthClass: "w-32" },
+		];
+
+		return (
+			<div className="flex-1 min-h-0 overflow-y-auto px-2 py-2">
+				<div className="space-y-1">
+					{rows.map((row) => (
+						<div key={row.id} className="flex h-8 items-center gap-2 px-2">
+							<Skeleton className="h-4 w-4 rounded-sm" />
+							<Skeleton className={`h-4 ${row.widthClass}`} />
+						</div>
+					))}
+				</div>
+			</div>
+		);
+	}
+
+	return (
+		<div className="flex-1 min-h-0 overflow-y-auto px-2 py-2">
+			{treeByRoot.map(({ rootPath, rootNode, matchCount, totalCount }) => {
+				const state = rootStateMap[rootPath];
+				const rootKey = normalizeRootPathForLookup(rootPath, isWindowsPlatform);
+				const mount = mountByRootKey.get(rootKey);
+				if (!state || state.loading) {
+					return (
+						<div key={rootPath} className="mb-1 px-3 py-2 text-xs text-muted-foreground/80">
+							<div className="flex items-center gap-2">
+								<Spinner className="size-3.5" />
+								<span>Loading {getFolderDisplayName(rootPath)}...</span>
+							</div>
+						</div>
+					);
+				}
+				if (state.error) {
+					return (
+						<div
+							key={rootPath}
+							className="rounded-md border border-destructive/20 bg-destructive/5 p-3"
+						>
+							<p className="text-sm font-medium text-destructive">Failed to load local folder</p>
+							<p className="mt-1 text-xs text-muted-foreground">{state.error}</p>
+						</div>
+					);
+				}
+				const isEmpty = totalCount === 0;
+				return (
+					<div key={rootPath} className="mb-1">
+						{mount ? renderFolder(rootNode, 0, mount) : null}
+						{!mount && (mountRefreshInFlight || mountStatus === "loading") && (
+							<div className="px-3 pb-2 text-xs text-muted-foreground/80">
+								<div className="flex items-center gap-2">
+									<Spinner className="size-3.5" />
+									<span>Loading {getFolderDisplayName(rootPath)}...</span>
+								</div>
+							</div>
+						)}
+						{!mount && mountStatus === "complete" && !mountRefreshInFlight && (
+							<div className="px-3 pb-2 text-xs text-muted-foreground/80">
+								Unable to resolve mounted root for this folder.
+							</div>
+						)}
+						{!mount && mountStatus === "error" && (
+							<div className="px-3 pb-2 text-xs text-muted-foreground/80">
+								Failed to resolve local folder mounts.
+							</div>
+						)}
+						{isEmpty && (
+							<div className="px-3 pb-2 text-xs text-muted-foreground/80">
+								No supported files found in this folder.
+							</div>
+						)}
+						{!isEmpty && matchCount === 0 && searchQuery && (
+							<div className="px-3 pb-2 text-xs text-muted-foreground/80">
+								No matching files in this folder.
+							</div>
+						)}
+					</div>
+				);
+			})}
+		</div>
+	);
+}
diff --git a/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx
index def40f24ff..83d423ace4 100644
--- a/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { PanelRightClose, Plus } from "lucide-react";
+import { PanelLeft, Plus } from "lucide-react";
 import { Button } from "@/components/ui/button";
 import { ScrollArea } from "@/components/ui/scroll-area";
 import { Sheet, SheetContent, SheetTitle } from "@/components/ui/sheet";
@@ -34,6 +34,8 @@ interface MobileSidebarProps {
 	onSettings?: () => void;
 	onManageMembers?: () => void;
 	onUserSettings?: () => void;
+	onAnnouncements?: () => void;
+	announcementUnreadCount?: number;
 	onLogout?: () => void;
 	pageUsage?: PageUsage;
 	theme?: string;
@@ -43,8 +45,13 @@ interface MobileSidebarProps {
 
 export function MobileSidebarTrigger({ onClick }: { onClick: () => void }) {
 	return (
-		<Button variant="ghost" size="icon" className="md:hidden h-8 w-8" onClick={onClick}>
-			<PanelRightClose className="h-5 w-5" />
+		<Button
+			variant="ghost"
+			size="icon"
+			onClick={onClick}
+			className="md:hidden h-8 w-8 shrink-0 text-muted-foreground hover:bg-transparent hover:text-muted-foreground"
+		>
+			<PanelLeft className="h-4 w-4" />
 			<span className="sr-only">Open menu</span>
 		</Button>
 	);
@@ -77,6 +84,8 @@ export function MobileSidebar({
 	onSettings,
 	onManageMembers,
 	onUserSettings,
+	onAnnouncements,
+	announcementUnreadCount = 0,
 	onLogout,
 	pageUsage,
 	theme,
@@ -99,11 +108,14 @@ export function MobileSidebar({
 
 	return (
 		<Sheet open={isOpen} onOpenChange={onOpenChange}>
-			<SheetContent side="left" className="w-[340px] p-0 flex flex-row gap-0 [&>button]:hidden">
+			<SheetContent
+				side="left"
+				className="w-[340px] p-0 flex flex-row gap-0 bg-panel [&>button]:hidden"
+			>
 				<SheetTitle className="sr-only">Navigation</SheetTitle>
 
 				{/* Vertical Search Spaces Rail - left side */}
-				<div className="flex h-full w-14 shrink-0 flex-col items-center bg-muted/40 border-r">
+				<div className="flex h-full w-14 shrink-0 flex-col items-center border-r bg-rail">
 					<ScrollArea className="w-full flex-1">
 						<div className="flex flex-col items-center gap-2 px-1.5 py-3">
 							{searchSpaces.map((space) => (
@@ -193,6 +205,16 @@ export function MobileSidebar({
 									}
 								: undefined
 						}
+						onAnnouncements={
+							onAnnouncements
+								? () => {
+										onOpenChange(false);
+										onAnnouncements();
+									}
+								: undefined
+						}
+						onNavigate={() => onOpenChange(false)}
+						announcementUnreadCount={announcementUnreadCount}
 						onLogout={onLogout}
 						pageUsage={pageUsage}
 						theme={theme}
diff --git a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx
index 0781a66c28..d3412ccfc3 100644
--- a/surfsense_web/components/layout/ui/sidebar/NavSection.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/NavSection.tsx
@@ -137,12 +137,8 @@ function CollapsedOverlay({ item }: { item: NavItem }) {
 
 export function NavSection({ items, onItemClick, isCollapsed = false }: NavSectionProps) {
 	return (
-		<div className={cn("flex flex-col gap-0.5 py-2", isCollapsed && "items-center")}>
+		<div className="flex flex-col gap-0.5 py-2">
 			{items.map((item) => {
-				const joyrideAttr =
-					item.title === "Inbox" || item.title.toLowerCase().includes("inbox")
-						? { "data-joyride": "inbox-sidebar" as const }
-						: {};
 				const { tooltip } = getStatusInfo(item.statusIndicator);
 
 				return (
@@ -159,12 +155,11 @@ export function NavSection({ items, onItemClick, isCollapsed = false }: NavSecti
 							<StatusIcon
 								status={item.statusIndicator}
 								FallbackIcon={item.icon}
-								className="h-4 w-4"
+								className="h-3.5 w-3.5"
 							/>
 						}
 						trailingContent={<StatusPill status={item.statusIndicator} />}
 						tooltipContent={tooltip}
-						buttonProps={joyrideAttr}
 					/>
 				);
 			})}
diff --git a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx
index 7dd46e4842..3d011b7620 100644
--- a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx
@@ -1,12 +1,6 @@
 "use client";
 
-import { useQuery } from "@tanstack/react-query";
-import { CreditCard, Zap } from "lucide-react";
-import Link from "next/link";
-import { useParams } from "next/navigation";
-import { Badge } from "@/components/ui/badge";
 import { Progress } from "@/components/ui/progress";
-import { stripeApiService } from "@/lib/apis/stripe-api.service";
 
 interface PageUsageDisplayProps {
 	pagesUsed: number;
@@ -14,50 +8,17 @@ interface PageUsageDisplayProps {
 }
 
 export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProps) {
-	const params = useParams();
-	const searchSpaceId = params?.search_space_id ?? "";
 	const usagePercentage = (pagesUsed / pagesLimit) * 100;
-	const { data: stripeStatus } = useQuery({
-		queryKey: ["stripe-status"],
-		queryFn: () => stripeApiService.getStatus(),
-	});
-	const pageBuyingEnabled = stripeStatus?.page_buying_enabled ?? true;
 
 	return (
-		<div className="px-3 py-3 border-t">
-			<div className="space-y-1.5">
-				<div className="flex justify-between items-center text-xs">
-					<span className="text-muted-foreground">
-						{pagesUsed.toLocaleString()} / {pagesLimit.toLocaleString()} pages
-					</span>
-					<span className="font-medium">{usagePercentage.toFixed(0)}%</span>
-				</div>
-				<Progress value={usagePercentage} className="h-1.5" />
-				<Link
-					href={`/dashboard/${searchSpaceId}/more-pages`}
-					className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
-				>
-					<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
-						<Zap className="h-3 w-3 shrink-0" />
-						Get Free Pages
-					</span>
-					<Badge className="h-4 rounded px-1 text-[10px] font-semibold leading-none bg-emerald-600 text-white border-transparent hover:bg-emerald-600">
-						FREE
-					</Badge>
-				</Link>
-				{pageBuyingEnabled && (
-					<Link
-						href={`/dashboard/${searchSpaceId}/buy-pages`}
-						className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
-					>
-						<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
-							<CreditCard className="h-3 w-3 shrink-0" />
-							Buy Pages
-						</span>
-						<span className="text-[10px] font-medium text-muted-foreground">$1/1k</span>
-					</Link>
-				)}
+		<div className="space-y-1.5">
+			<div className="flex justify-between items-center text-xs">
+				<span className="text-muted-foreground">
+					{pagesUsed.toLocaleString()} / {pagesLimit.toLocaleString()} pages
+				</span>
+				<span className="font-medium">{usagePercentage.toFixed(0)}%</span>
 			</div>
+			<Progress value={usagePercentage} className="h-1.5" />
 		</div>
 	);
 }
diff --git a/surfsense_web/components/layout/ui/sidebar/PremiumTokenUsageDisplay.tsx b/surfsense_web/components/layout/ui/sidebar/PremiumTokenUsageDisplay.tsx
new file mode 100644
index 0000000000..983672d0b6
--- /dev/null
+++ b/surfsense_web/components/layout/ui/sidebar/PremiumTokenUsageDisplay.tsx
@@ -0,0 +1,49 @@
+"use client";
+
+import { useQuery } from "@rocicorp/zero/react";
+import { Progress } from "@/components/ui/progress";
+import { useIsAnonymous } from "@/contexts/anonymous-mode";
+import { queries } from "@/zero/queries";
+
+/**
+ * Premium credit balance shown in the sidebar.
+ *
+ * Values come from Zero (live-replicated from Postgres) and are stored as
+ * integer micro-USD (1_000_000 == $1.00). We render in dollars because
+ * users top up at $1/pack and the credit gets debited at actual provider
+ * cost.
+ */
+export function PremiumTokenUsageDisplay() {
+	const isAnonymous = useIsAnonymous();
+	const [me] = useQuery(queries.user.me({}));
+
+	if (isAnonymous || !me) return null;
+
+	const usagePercentage = Math.min(
+		(me.premiumCreditMicrosUsed / Math.max(me.premiumCreditMicrosLimit, 1)) * 100,
+		100
+	);
+
+	const formatUsd = (micros: number) => {
+		const dollars = micros / 1_000_000;
+		if (dollars >= 100) return `$${dollars.toFixed(0)}`;
+		if (dollars >= 1) return `$${dollars.toFixed(2)}`;
+		// Sub-dollar balances need extra precision so the bar still tells the
+		// user what's left ("$0.04 of credit") instead of rounding to "$0".
+		if (dollars > 0) return `$${dollars.toFixed(3)}`;
+		return "$0";
+	};
+
+	return (
+		<div className="space-y-1.5">
+			<div className="flex justify-between items-center text-xs">
+				<span className="text-muted-foreground">
+					{formatUsd(me.premiumCreditMicrosUsed)} / {formatUsd(me.premiumCreditMicrosLimit)} of
+					credit
+				</span>
+				<span className="font-medium">{usagePercentage.toFixed(0)}%</span>
+			</div>
+			<Progress value={usagePercentage} className="h-1.5 [&>div]:bg-purple-500" />
+		</div>
+	);
+}
diff --git a/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx b/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx
index dc3d1178e6..e0cb3072a1 100644
--- a/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx
@@ -1,30 +1,65 @@
 "use client";
 
-import { PenSquare } from "lucide-react";
+import { CreditCard, Dot, SquarePen, Zap } from "lucide-react";
+import Link from "next/link";
+import { useParams } from "next/navigation";
 import { useTranslations } from "next-intl";
-import { useState } from "react";
+import { type ReactNode, useMemo, useState } from "react";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Progress } from "@/components/ui/progress";
 import { Skeleton } from "@/components/ui/skeleton";
+import { useIsAnonymous } from "@/contexts/anonymous-mode";
 import { cn } from "@/lib/utils";
 import { SIDEBAR_MIN_WIDTH } from "../../hooks/useSidebarResize";
 import type { ChatItem, NavItem, PageUsage, SearchSpace, User } from "../../types/layout.types";
+import { AuthenticatedPageUsageDisplay } from "./AuthenticatedPageUsageDisplay";
 import { ChatListItem } from "./ChatListItem";
 import { NavSection } from "./NavSection";
-import { PageUsageDisplay } from "./PageUsageDisplay";
+import { PremiumTokenUsageDisplay } from "./PremiumTokenUsageDisplay";
 import { SidebarButton } from "./SidebarButton";
 import { SidebarCollapseButton } from "./SidebarCollapseButton";
 import { SidebarHeader } from "./SidebarHeader";
 import { SidebarSection } from "./SidebarSection";
 import { SidebarUserProfile } from "./SidebarUserProfile";
 
-function ChatListItemSkeleton() {
+const CHAT_LIST_SKELETON_WIDTHS = ["w-[78%]", "w-[64%]", "w-[86%]", "w-[58%]", "w-[72%]"];
+
+function ChatListItemSkeleton({ widthClass }: { widthClass: string }) {
+	return (
+		<div className="group/item relative w-full">
+			<div className="flex h-[32px] w-full items-center rounded-md px-2 py-1.5">
+				<Skeleton className={cn("h-4 rounded", widthClass)} />
+			</div>
+		</div>
+	);
+}
+
+function ChatListSkeletonRows() {
 	return (
-		<div className="flex w-full items-center gap-2 rounded-md p-2">
-			<Skeleton className="h-4 w-4 shrink-0 rounded" />
-			<Skeleton className="h-4 w-full max-w-[180px]" />
+		<div className="flex flex-col gap-0.5">
+			{CHAT_LIST_SKELETON_WIDTHS.map((widthClass) => (
+				<ChatListItemSkeleton key={widthClass} widthClass={widthClass} />
+			))}
 		</div>
 	);
 }
 
+function CollapsedInboxIcon({ item }: { item: NavItem }) {
+	const Icon = item.icon;
+
+	return (
+		<span className="relative flex h-3.5 w-3.5 items-center justify-center">
+			<Icon className="h-3.5 w-3.5" />
+			{typeof item.badge === "string" ? (
+				<span className="absolute right-0 top-0 flex min-w-3.5 -translate-y-1/2 translate-x-1/2 items-center justify-center rounded-full bg-destructive px-1 text-[9px] font-medium leading-3 text-destructive-foreground">
+					{item.badge}
+				</span>
+			) : null}
+		</span>
+	);
+}
+
 interface SidebarProps {
 	searchSpace: SearchSpace | null;
 	isCollapsed?: boolean;
@@ -47,6 +82,9 @@ interface SidebarProps {
 	onSettings?: () => void;
 	onManageMembers?: () => void;
 	onUserSettings?: () => void;
+	onAnnouncements?: () => void;
+	onNavigate?: () => void;
+	announcementUnreadCount?: number;
 	onLogout?: () => void;
 	pageUsage?: PageUsage;
 	theme?: string;
@@ -56,6 +94,9 @@ interface SidebarProps {
 	disableTooltips?: boolean;
 	sidebarWidth?: number;
 	isResizing?: boolean;
+	renderUserProfile?: boolean;
+	renderCollapseButton?: boolean;
+	collapsedHeaderContent?: ReactNode;
 }
 
 export function Sidebar({
@@ -80,6 +121,9 @@ export function Sidebar({
 	onSettings,
 	onManageMembers,
 	onUserSettings,
+	onAnnouncements,
+	onNavigate,
+	announcementUnreadCount = 0,
 	onLogout,
 	pageUsage,
 	theme,
@@ -89,62 +133,117 @@ export function Sidebar({
 	disableTooltips = false,
 	sidebarWidth = SIDEBAR_MIN_WIDTH,
 	isResizing = false,
+	renderUserProfile = true,
+	renderCollapseButton = true,
+	collapsedHeaderContent,
 }: SidebarProps) {
 	const t = useTranslations("sidebar");
 	const [openDropdownChatId, setOpenDropdownChatId] = useState<number | null>(null);
 
+	// Inbox and Documents are rendered explicitly right below New Chat. Pull
+	// them out of the nav items list so they don't also appear in the bottom
+	// NavSection. Documents is only present in navItems on mobile.
+	const inboxItem = useMemo(() => navItems.find((item) => item.url === "#inbox"), [navItems]);
+	const documentsItem = useMemo(
+		() => navItems.find((item) => item.url === "#documents"),
+		[navItems]
+	);
+	const footerNavItems = useMemo(
+		() => navItems.filter((item) => item.url !== "#inbox" && item.url !== "#documents"),
+		[navItems]
+	);
+
+	const collapsedWidth = 51;
+
 	return (
 		<div
 			className={cn(
-				"relative flex h-full flex-col bg-sidebar text-sidebar-foreground overflow-hidden select-none",
-				isCollapsed ? "w-[60px] transition-[width] duration-200" : "",
-				!isCollapsed && !isResizing ? "transition-[width] duration-200" : "",
+				"relative flex h-full flex-col bg-panel text-sidebar-foreground overflow-hidden select-none",
+				!isResizing && "transition-[width] duration-200 ease-out",
 				className
 			)}
-			style={!isCollapsed ? { width: sidebarWidth } : undefined}
+			style={{ width: isCollapsed ? collapsedWidth : sidebarWidth }}
 		>
-			{/* Header - search space name or collapse button when collapsed */}
-			{isCollapsed ? (
-				<div className="flex h-14 shrink-0 items-center justify-center border-b">
-					<SidebarCollapseButton
-						isCollapsed={isCollapsed}
-						onToggle={onToggleCollapse ?? (() => {})}
-						disableTooltip={disableTooltips}
-					/>
-				</div>
-			) : (
-				<div className="flex h-14 shrink-0 items-center gap-0 px-1 border-b">
+			<div className="relative flex h-12 shrink-0 items-center gap-0 px-1 border-b">
+				<div
+					className={cn(
+						"min-w-0 overflow-hidden",
+						"transition-[max-width,opacity,margin-right] duration-200 ease-out",
+						isCollapsed ? "max-w-0 opacity-0 mr-0" : "max-w-[400px] flex-1 opacity-100"
+					)}
+					aria-hidden={isCollapsed}
+				>
 					<SidebarHeader
 						searchSpace={searchSpace}
-						isCollapsed={isCollapsed}
+						isCollapsed={false}
 						onSettings={onSettings}
 						onManageMembers={onManageMembers}
 					/>
-					<div className="shrink-0">
+				</div>
+				{collapsedHeaderContent ? (
+					<div
+						aria-hidden={!isCollapsed}
+						className={cn(
+							"pointer-events-none absolute inset-y-0 left-0 flex items-center justify-center transition-opacity duration-150",
+							isCollapsed ? "opacity-100 delay-150" : "opacity-0"
+						)}
+						style={{ width: collapsedWidth }}
+					>
+						{collapsedHeaderContent}
+					</div>
+				) : null}
+				{renderCollapseButton ? (
+					<div className={cn("shrink-0", isCollapsed && "mx-auto")}>
 						<SidebarCollapseButton
 							isCollapsed={isCollapsed}
 							onToggle={onToggleCollapse ?? (() => {})}
 							disableTooltip={disableTooltips}
 						/>
 					</div>
-				</div>
-			)}
+				) : null}
+			</div>
 
-			{/* New chat button */}
-			<div className={cn("flex flex-col gap-0.5 py-2", isCollapsed && "items-center")}>
+			<div className="flex flex-col gap-0.5 py-1.5">
 				<SidebarButton
-					icon={PenSquare}
+					icon={SquarePen}
 					label={t("new_chat")}
 					onClick={onNewChat}
 					isCollapsed={isCollapsed}
 				/>
+				{inboxItem && (
+					<SidebarButton
+						icon={inboxItem.icon}
+						label={inboxItem.title}
+						onClick={() => onNavItemClick?.(inboxItem)}
+						isCollapsed={isCollapsed}
+						isActive={inboxItem.isActive}
+						badge={inboxItem.badge}
+						collapsedIconNode={<CollapsedInboxIcon item={inboxItem} />}
+						tooltipContent={isCollapsed ? inboxItem.title : undefined}
+						buttonProps={
+							{
+								"data-joyride": "inbox-sidebar",
+							} as React.ButtonHTMLAttributes<HTMLButtonElement>
+						}
+					/>
+				)}
+				{documentsItem && (
+					<SidebarButton
+						icon={documentsItem.icon}
+						label={documentsItem.title}
+						onClick={() => onNavItemClick?.(documentsItem)}
+						isCollapsed={isCollapsed}
+						isActive={documentsItem.isActive}
+						tooltipContent={isCollapsed ? documentsItem.title : undefined}
+					/>
+				)}
 			</div>
 
 			{/* Chat sections - fills available space */}
 			{isCollapsed ? (
-				<div className="flex-1 w-[60px]" />
+				<div className="flex-1 w-full" />
 			) : (
-				<div className="flex-1 flex flex-col gap-1 py-2 w-full min-h-0 overflow-hidden">
+				<div className="flex-1 flex flex-col gap-1 pt-2 w-full min-h-0 overflow-hidden">
 					{/* Shared Chats Section - takes only space needed, max 50% */}
 					<SidebarSection
 						title={t("shared_chats")}
@@ -154,28 +253,23 @@ export function Sidebar({
 						alwaysShowAction={!disableTooltips && isSharedChatsPanelOpen}
 						action={
 							onViewAllSharedChats ? (
-								<button
+								<Button
 									type="button"
+									variant="ghost"
 									onClick={onViewAllSharedChats}
-									className="text-xs font-medium text-muted-foreground/60 hover:text-muted-foreground transition-colors whitespace-nowrap cursor-pointer bg-transparent border-none p-0 focus:outline-none"
+									className="h-auto cursor-pointer whitespace-nowrap bg-transparent p-0 text-xs font-medium text-muted-foreground/60 transition-colors hover:bg-transparent hover:text-muted-foreground"
 								>
 									{!disableTooltips && isSharedChatsPanelOpen ? t("hide") : t("show_all")}
-								</button>
+								</Button>
 							) : undefined
 						}
 					>
 						{isLoadingChats ? (
-							<div className="flex flex-col gap-0.5">
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-							</div>
+							<ChatListSkeletonRows />
 						) : sharedChats.length > 0 ? (
 							<div className="relative min-h-0 flex-1">
 								<div
-									className={`flex flex-col gap-0.5 max-h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${sharedChats.length > 4 ? "pb-8" : ""}`}
+									className={`flex flex-col gap-0.5 max-h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${sharedChats.length > 4 ? "pb-2" : ""}`}
 								>
 									{sharedChats.slice(0, 20).map((chat) => (
 										<ChatListItem
@@ -194,11 +288,11 @@ export function Sidebar({
 								</div>
 								{/* Gradient fade indicator when more than 4 items */}
 								{sharedChats.length > 4 && (
-									<div className="pointer-events-none absolute bottom-0 left-0 right-0 h-8 bg-gradient-to-t from-sidebar via-sidebar/90 to-transparent" />
+									<div className="pointer-events-none absolute bottom-0 left-0 right-0 h-5 bg-gradient-to-t from-sidebar/80 to-transparent" />
 								)}
 							</div>
 						) : (
-							<p className="px-2 py-1 text-xs text-muted-foreground">{t("no_shared_chats")}</p>
+							<p className="px-2 py-1 text-sm text-muted-foreground/60">{t("no_shared_chats")}</p>
 						)}
 					</SidebarSection>
 
@@ -210,28 +304,23 @@ export function Sidebar({
 						alwaysShowAction={!disableTooltips && isPrivateChatsPanelOpen}
 						action={
 							onViewAllPrivateChats ? (
-								<button
+								<Button
 									type="button"
+									variant="ghost"
 									onClick={onViewAllPrivateChats}
-									className="text-xs font-medium text-muted-foreground/60 hover:text-muted-foreground transition-colors whitespace-nowrap cursor-pointer bg-transparent border-none p-0 focus:outline-none"
+									className="h-auto cursor-pointer whitespace-nowrap bg-transparent p-0 text-xs font-medium text-muted-foreground/60 transition-colors hover:bg-transparent hover:text-muted-foreground"
 								>
 									{!disableTooltips && isPrivateChatsPanelOpen ? t("hide") : t("show_all")}
-								</button>
+								</Button>
 							) : undefined
 						}
 					>
 						{isLoadingChats ? (
-							<div className="flex flex-col gap-0.5">
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-								<ChatListItemSkeleton />
-							</div>
+							<ChatListSkeletonRows />
 						) : chats.length > 0 ? (
 							<div className="relative flex-1 min-h-0">
 								<div
-									className={`flex flex-col gap-0.5 h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${chats.length > 4 ? "pb-8" : ""}`}
+									className={`flex flex-col gap-0.5 h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${chats.length > 4 ? "pb-2" : ""}`}
 								>
 									{chats.slice(0, 20).map((chat) => (
 										<ChatListItem
@@ -250,11 +339,11 @@ export function Sidebar({
 								</div>
 								{/* Gradient fade indicator when more than 4 items */}
 								{chats.length > 4 && (
-									<div className="pointer-events-none absolute bottom-0 left-0 right-0 h-8 bg-gradient-to-t from-sidebar via-sidebar/90 to-transparent" />
+									<div className="pointer-events-none absolute bottom-0 left-0 right-0 h-5 bg-gradient-to-t from-sidebar/80 to-transparent" />
 								)}
 							</div>
 						) : (
-							<p className="px-2 py-1 text-xs text-muted-foreground">{t("no_chats")}</p>
+							<p className="px-2 py-1 text-sm text-muted-foreground/60">{t("no_chats")}</p>
 						)}
 					</SidebarSection>
 				</div>
@@ -263,22 +352,124 @@ export function Sidebar({
 			{/* Footer */}
 			<div className="mt-auto border-t">
 				{/* Platform navigation */}
-				{navItems.length > 0 && (
-					<NavSection items={navItems} onItemClick={onNavItemClick} isCollapsed={isCollapsed} />
-				)}
-
-				{pageUsage && !isCollapsed && (
-					<PageUsageDisplay pagesUsed={pageUsage.pagesUsed} pagesLimit={pageUsage.pagesLimit} />
+				{footerNavItems.length > 0 && (
+					<NavSection
+						items={footerNavItems}
+						onItemClick={onNavItemClick}
+						isCollapsed={isCollapsed}
+					/>
 				)}
 
-				<SidebarUserProfile
-					user={user}
-					onUserSettings={onUserSettings}
-					onLogout={onLogout}
+				<SidebarUsageFooter
+					pageUsage={pageUsage}
 					isCollapsed={isCollapsed}
-					theme={theme}
-					setTheme={setTheme}
+					hasNavSectionAbove={footerNavItems.length > 0}
+					onNavigate={onNavigate}
 				/>
+
+				{renderUserProfile && (
+					<SidebarUserProfile
+						user={user}
+						onUserSettings={onUserSettings}
+						onAnnouncements={onAnnouncements}
+						announcementUnreadCount={announcementUnreadCount}
+						onLogout={onLogout}
+						isCollapsed={isCollapsed}
+						theme={theme}
+						setTheme={setTheme}
+					/>
+				)}
+			</div>
+		</div>
+	);
+}
+
+function SidebarUsageFooter({
+	pageUsage,
+	isCollapsed,
+	hasNavSectionAbove = false,
+	onNavigate,
+}: {
+	pageUsage?: PageUsage;
+	isCollapsed: boolean;
+	hasNavSectionAbove?: boolean;
+	onNavigate?: () => void;
+}) {
+	const params = useParams();
+	const searchSpaceId = params?.search_space_id ?? "";
+	const isAnonymous = useIsAnonymous();
+
+	if (isCollapsed) return null;
+
+	const containerClass = cn("px-3 py-3 space-y-3", hasNavSectionAbove && "border-t");
+
+	if (isAnonymous) {
+		return (
+			<div className={containerClass}>
+				{pageUsage && (
+					<div className="space-y-1.5">
+						<div className="flex justify-between items-center text-xs">
+							<span className="text-muted-foreground">
+								{pageUsage.pagesUsed.toLocaleString()} / {pageUsage.pagesLimit.toLocaleString()}{" "}
+								tokens
+							</span>
+							<span className="font-medium">
+								{Math.min(
+									(pageUsage.pagesUsed / Math.max(pageUsage.pagesLimit, 1)) * 100,
+									100
+								).toFixed(0)}
+								%
+							</span>
+						</div>
+						<Progress
+							value={Math.min((pageUsage.pagesUsed / Math.max(pageUsage.pagesLimit, 1)) * 100, 100)}
+							className="h-1.5"
+						/>
+					</div>
+				)}
+				<Link
+					href="/register"
+					className="flex items-center justify-center gap-1.5 rounded-md bg-primary px-3 py-1.5 text-xs font-medium text-primary-foreground transition-opacity hover:opacity-90"
+				>
+					Create Free Account
+				</Link>
+			</div>
+		);
+	}
+
+	return (
+		<div className={containerClass}>
+			<PremiumTokenUsageDisplay />
+			<AuthenticatedPageUsageDisplay />
+			<div className="space-y-0.5">
+				<Link
+					href={`/dashboard/${searchSpaceId}/more-pages`}
+					onClick={onNavigate}
+					className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 transition-colors hover:bg-accent"
+				>
+					<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
+						<Zap className="h-3 w-3 shrink-0" />
+						Get Free Pages
+					</span>
+					<Badge className="h-4 rounded px-1 text-[10px] font-semibold leading-none bg-emerald-600 text-white border-transparent hover:bg-emerald-600">
+						FREE
+					</Badge>
+				</Link>
+				<Link
+					href={`/dashboard/${searchSpaceId}/buy-more`}
+					onClick={onNavigate}
+					className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 transition-colors hover:bg-accent"
+				>
+					<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
+						<CreditCard className="h-3 w-3 shrink-0" />
+						Buy More
+					</span>
+					<span className="flex items-center text-[10px] font-medium text-muted-foreground">
+						$1/1k
+						<Dot className="h-3 w-3" />
+						$1/1M
+					</span>
+				</Link>
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/components/layout/ui/sidebar/SidebarButton.tsx b/surfsense_web/components/layout/ui/sidebar/SidebarButton.tsx
index 4410edafab..cc7f83e1d8 100644
--- a/surfsense_web/components/layout/ui/sidebar/SidebarButton.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/SidebarButton.tsx
@@ -2,6 +2,7 @@
 
 import type { LucideIcon } from "lucide-react";
 import type React from "react";
+import { Button } from "@/components/ui/button";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { cn } from "@/lib/utils";
 
@@ -12,28 +13,18 @@ interface SidebarButtonProps {
 	isCollapsed?: boolean;
 	isActive?: boolean;
 	badge?: React.ReactNode;
-	/** Overlay in the top-right corner of the collapsed icon (e.g. status badge) */
 	collapsedOverlay?: React.ReactNode;
-	/** Custom icon node for expanded mode — overrides the default <Icon> rendering */
+	collapsedIconNode?: React.ReactNode;
 	expandedIconNode?: React.ReactNode;
-	/** Optional inline trailing content shown in expanded mode */
 	trailingContent?: React.ReactNode;
-	/** Optional tooltip content that replaces the default label tooltip */
 	tooltipContent?: React.ReactNode;
 	className?: string;
-	/** Extra attributes spread onto the inner <button> (e.g. data-joyride) */
 	buttonProps?: React.ButtonHTMLAttributes<HTMLButtonElement>;
 }
 
-const expandedClassName = cn(
-	"flex items-center gap-2 rounded-md mx-2 px-2 py-1.5 text-sm transition-colors text-left",
-	"hover:bg-accent hover:text-accent-foreground",
-	"focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring"
-);
-
-const collapsedClassName = cn(
-	"relative flex h-10 w-10 items-center justify-center rounded-md transition-colors",
-	"hover:bg-accent hover:text-accent-foreground",
+const baseClassName = cn(
+	"group/sidebar-button relative h-9 justify-start gap-0 rounded-md mx-2 px-2 text-sm text-left",
+	"transition-colors hover:bg-accent hover:text-accent-foreground",
 	"focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring"
 );
 
@@ -45,6 +36,7 @@ export function SidebarButton({
 	isActive = false,
 	badge,
 	collapsedOverlay,
+	collapsedIconNode,
 	expandedIconNode,
 	trailingContent,
 	tooltipContent,
@@ -53,53 +45,64 @@ export function SidebarButton({
 }: SidebarButtonProps) {
 	const activeClassName = "bg-accent text-accent-foreground";
 
-	if (isCollapsed) {
-		return (
-			<Tooltip>
-				<TooltipTrigger asChild>
-					<button
-						type="button"
-						onClick={onClick}
-						className={cn(collapsedClassName, isActive && activeClassName, className)}
-						{...buttonProps}
-					>
-						<Icon className="h-4 w-4" />
-						{collapsedOverlay}
-						<span className="sr-only">{label}</span>
-					</button>
-				</TooltipTrigger>
-				<TooltipContent side="right" className="max-w-xs">
-					{tooltipContent ?? (
-						<>
-							{label}
-							{typeof badge === "string" && ` (${badge})`}
-						</>
-					)}
-				</TooltipContent>
-			</Tooltip>
-		);
-	}
+	const iconNode = isCollapsed
+		? (collapsedIconNode ?? <Icon className="h-3.5 w-3.5" />)
+		: (expandedIconNode ?? <Icon className="h-3.5 w-3.5 shrink-0" />);
 
 	const button = (
-		<button
+		<Button
+			variant="ghost"
 			type="button"
 			onClick={onClick}
-			className={cn(expandedClassName, isActive && activeClassName, className)}
+			aria-label={isCollapsed ? label : undefined}
+			className={cn(baseClassName, isActive && activeClassName, className)}
 			{...buttonProps}
 		>
-			{expandedIconNode ?? <Icon className="h-4 w-4 shrink-0" />}
-			<span className="flex-1 truncate">{label}</span>
-			{trailingContent}
-			{badge && typeof badge !== "string" ? badge : null}
-			{badge && typeof badge === "string" ? (
-				<span className="inline-flex items-center justify-center min-w-4 h-4 px-1 rounded-full bg-red-500 text-white text-[10px] font-medium">
+			<span
+				className={cn(
+					"flex min-w-0 items-center translate-x-0.5 transition-transform duration-200 ease-out",
+					isCollapsed ? "shrink-0" : "flex-1"
+				)}
+			>
+				<span className="flex h-3.5 w-3.5 shrink-0 items-center justify-center">{iconNode}</span>
+
+				<span
+					className={cn(
+						"min-w-0 overflow-hidden whitespace-nowrap text-left",
+						"transition-[max-width,opacity,margin-left] duration-200 ease-out",
+						isCollapsed ? "max-w-0 opacity-0 ml-0" : "max-w-[260px] flex-1 opacity-100 ml-2"
+					)}
+				>
+					<span className="block truncate">{label}</span>
+				</span>
+			</span>
+
+			{!isCollapsed && trailingContent}
+			{!isCollapsed && badge && typeof badge !== "string" ? badge : null}
+			{!isCollapsed && badge && typeof badge === "string" ? (
+				<span className="ml-1 inline-flex items-center justify-center min-w-4 h-4 px-1 rounded-full bg-red-500 text-white text-[10px] font-medium">
 					{badge}
 				</span>
 			) : null}
-		</button>
+
+			{collapsedOverlay && (
+				<span
+					aria-hidden={!isCollapsed}
+					className={cn(
+						"pointer-events-none absolute inset-0 transition-opacity duration-150",
+						isCollapsed ? "opacity-100" : "opacity-0"
+					)}
+				>
+					{collapsedOverlay}
+				</span>
+			)}
+
+			<span className="sr-only">{label}</span>
+		</Button>
 	);
 
-	if (!tooltipContent) {
+	const renderTooltip = isCollapsed || !!tooltipContent;
+	if (!renderTooltip) {
 		return button;
 	}
 
@@ -107,7 +110,14 @@ export function SidebarButton({
 		<Tooltip>
 			<TooltipTrigger asChild>{button}</TooltipTrigger>
 			<TooltipContent side="right" className="max-w-xs">
-				{tooltipContent}
+				{isCollapsed
+					? (tooltipContent ?? (
+							<>
+								{label}
+								{typeof badge === "string" && ` (${badge})`}
+							</>
+						))
+					: tooltipContent}
 			</TooltipContent>
 		</Tooltip>
 	);
diff --git a/surfsense_web/components/layout/ui/sidebar/SidebarCollapseButton.tsx b/surfsense_web/components/layout/ui/sidebar/SidebarCollapseButton.tsx
index a01937cd65..b100b799a9 100644
--- a/surfsense_web/components/layout/ui/sidebar/SidebarCollapseButton.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/SidebarCollapseButton.tsx
@@ -1,29 +1,42 @@
 "use client";
 
-import { PanelLeft, PanelLeftClose } from "lucide-react";
+import { PanelLeft } from "lucide-react";
 import { useTranslations } from "next-intl";
 import { Button } from "@/components/ui/button";
 import { ShortcutKbd } from "@/components/ui/shortcut-kbd";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { usePlatformShortcut } from "@/hooks/use-platform-shortcut";
+import { cn } from "@/lib/utils";
 
 interface SidebarCollapseButtonProps {
 	isCollapsed: boolean;
 	onToggle: () => void;
 	disableTooltip?: boolean;
+	className?: string;
+	iconClassName?: string;
 }
 
 export function SidebarCollapseButton({
 	isCollapsed,
 	onToggle,
 	disableTooltip = false,
+	className,
+	iconClassName,
 }: SidebarCollapseButtonProps) {
 	const t = useTranslations("sidebar");
 	const { shortcutKeys } = usePlatformShortcut();
 
 	const button = (
-		<Button variant="ghost" size="icon" onClick={onToggle} className="h-8 w-8 shrink-0">
-			{isCollapsed ? <PanelLeft className="h-4 w-4" /> : <PanelLeftClose className="h-4 w-4" />}
+		<Button
+			variant="ghost"
+			size="icon"
+			onClick={onToggle}
+			className={cn(
+				"h-8 w-8 shrink-0 text-muted-foreground hover:bg-accent hover:text-accent-foreground",
+				className
+			)}
+		>
+			<PanelLeft className={cn("h-4 w-4", iconClassName)} />
 			<span className="sr-only">{isCollapsed ? t("expand_sidebar") : t("collapse_sidebar")}</span>
 		</Button>
 	);
@@ -35,7 +48,7 @@ export function SidebarCollapseButton({
 	return (
 		<Tooltip>
 			<TooltipTrigger asChild>{button}</TooltipTrigger>
-			<TooltipContent side={isCollapsed ? "right" : "bottom"}>
+			<TooltipContent side="bottom" avoidCollisions={false}>
 				<span className="flex items-center">
 					{isCollapsed ? t("expand_sidebar") : t("collapse_sidebar")}
 					<ShortcutKbd keys={shortcutKeys("Mod", "\\")} />
diff --git a/surfsense_web/components/layout/ui/sidebar/SidebarHeader.tsx b/surfsense_web/components/layout/ui/sidebar/SidebarHeader.tsx
index 2c8539dc8c..2a5761d6f2 100644
--- a/surfsense_web/components/layout/ui/sidebar/SidebarHeader.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/SidebarHeader.tsx
@@ -36,14 +36,14 @@ export function SidebarHeader({
 					<Button
 						variant="ghost"
 						className={cn(
-							"flex h-auto w-full items-center justify-between gap-1 overflow-hidden py-1.5 font-semibold",
+							"flex h-8 w-full items-center justify-between gap-1 overflow-hidden px-2 py-0.5 font-semibold",
 							isCollapsed && "w-10"
 						)}
 					>
-						<span className="truncate text-base">
+						<span className="truncate text-sm">
 							{searchSpace?.name ?? t("select_search_space")}
 						</span>
-						<ChevronsUpDown className="h-4 w-4 shrink-0 text-muted-foreground" />
+						<ChevronsUpDown className="h-3.5 w-3.5 shrink-0 text-muted-foreground" />
 					</Button>
 				</DropdownMenuTrigger>
 				<DropdownMenuContent align="start" className="w-48">
diff --git a/surfsense_web/components/layout/ui/sidebar/SidebarSection.tsx b/surfsense_web/components/layout/ui/sidebar/SidebarSection.tsx
index b8ea5e1ee2..99162dddfa 100644
--- a/surfsense_web/components/layout/ui/sidebar/SidebarSection.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/SidebarSection.tsx
@@ -39,8 +39,8 @@ export function SidebarSection({
 				className
 			)}
 		>
-			<div className="flex items-center group/section shrink-0 px-2 py-1.5">
-				<CollapsibleTrigger className="flex items-center gap-1.5 text-xs font-medium text-muted-foreground hover:text-foreground transition-colors min-w-0">
+			<div className="flex items-center group/section shrink-0 px-2 py-1">
+				<CollapsibleTrigger className="flex items-center gap-1 text-xs font-medium text-muted-foreground hover:text-accent-foreground transition-colors min-w-0">
 					<span className="truncate">{title}</span>
 					<ChevronRight
 						className={cn(
@@ -69,9 +69,7 @@ export function SidebarSection({
 			</div>
 
 			<CollapsibleContent className={cn("overflow-hidden flex-1 flex flex-col min-h-0")}>
-				<div className={cn("px-2 pb-2 flex-1 flex flex-col min-h-0 overflow-hidden")}>
-					{children}
-				</div>
+				<div className={cn("px-2 flex-1 flex flex-col min-h-0 overflow-hidden")}>{children}</div>
 			</CollapsibleContent>
 		</Collapsible>
 	);
diff --git a/surfsense_web/components/layout/ui/sidebar/SidebarSlideOutPanel.tsx b/surfsense_web/components/layout/ui/sidebar/SidebarSlideOutPanel.tsx
index ed54e65aab..3fa4dd5d34 100644
--- a/surfsense_web/components/layout/ui/sidebar/SidebarSlideOutPanel.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/SidebarSlideOutPanel.tsx
@@ -2,9 +2,8 @@
 
 import { AnimatePresence, motion } from "motion/react";
 import { useCallback, useEffect } from "react";
-import { useMediaQuery } from "@/hooks/use-media-query";
-
-export const SLIDEOUT_PANEL_OPENED_EVENT = "slideout-panel-opened";
+import { useIsMobile } from "@/hooks/use-mobile";
+import { SLIDEOUT_PANEL_OPENED_EVENT } from "@/lib/layout-events";
 
 interface SidebarSlideOutPanelProps {
 	open: boolean;
@@ -29,7 +28,7 @@ export function SidebarSlideOutPanel({
 	width = 360,
 	children,
 }: SidebarSlideOutPanelProps) {
-	const isMobile = !useMediaQuery("(min-width: 640px)");
+	const isMobile = useIsMobile();
 
 	useEffect(() => {
 		if (open) {
@@ -91,16 +90,16 @@ export function SidebarSlideOutPanel({
 
 					{/* Panel extending from sidebar's right edge, flush with the wrapper border */}
 					<motion.div
-						style={{ width, left: "100%", top: -1, bottom: -1 }}
-						initial={{ x: -width }}
-						animate={{ x: 0 }}
-						exit={{ x: -width }}
+						initial={{ width: 0 }}
+						animate={{ width }}
+						exit={{ width: 0 }}
 						transition={{ type: "tween", duration: 0.2, ease: [0.4, 0, 0.2, 1] }}
 						className="absolute z-20 overflow-hidden"
+						style={{ left: "100%", top: -1, bottom: -1 }}
 					>
 						<div
 							style={{ width }}
-							className="h-full bg-sidebar text-sidebar-foreground flex flex-col select-none border rounded-r-xl shadow-xl"
+							className="h-full bg-sidebar text-sidebar-foreground flex flex-col select-none border shadow-xl"
 							role="dialog"
 							aria-label={ariaLabel}
 						>
diff --git a/surfsense_web/components/layout/ui/sidebar/SidebarUserProfile.tsx b/surfsense_web/components/layout/ui/sidebar/SidebarUserProfile.tsx
index 6faa81f221..bc3b36efdc 100644
--- a/surfsense_web/components/layout/ui/sidebar/SidebarUserProfile.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/SidebarUserProfile.tsx
@@ -3,11 +3,13 @@
 import {
 	Check,
 	ChevronUp,
+	Download,
 	ExternalLink,
 	Info,
 	Languages,
-	Laptop,
 	LogOut,
+	Megaphone,
+	Monitor,
 	Moon,
 	Sun,
 	UserCog,
@@ -15,6 +17,7 @@ import {
 import Image from "next/image";
 import { useTranslations } from "next-intl";
 import { useState } from "react";
+import { Button } from "@/components/ui/button";
 import {
 	DropdownMenu,
 	DropdownMenuContent,
@@ -28,8 +31,14 @@ import {
 	DropdownMenuTrigger,
 } from "@/components/ui/dropdown-menu";
 import { Spinner } from "@/components/ui/spinner";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { useLocaleContext } from "@/contexts/LocaleContext";
+import { useMediaQuery } from "@/hooks/use-media-query";
+import { usePlatform } from "@/hooks/use-platform";
+import { GITHUB_RELEASES_URL, usePrimaryDownload } from "@/lib/desktop-download-utils";
 import { APP_VERSION } from "@/lib/env-config";
+import { trackDesktopDownloadClicked } from "@/lib/posthog/events";
+import { getUserAvatarColor, getUserInitials } from "@/lib/user-avatar";
 import { cn } from "@/lib/utils";
 import type { User } from "../../types/layout.types";
 
@@ -46,61 +55,31 @@ const LANGUAGES = [
 const THEMES = [
 	{ value: "light" as const, name: "Light", icon: Sun },
 	{ value: "dark" as const, name: "Dark", icon: Moon },
-	{ value: "system" as const, name: "System", icon: Laptop },
+	{ value: "system" as const, name: "System", icon: Monitor },
 ];
 
 const LEARN_MORE_LINKS = [
-	{ key: "documentation" as const, href: "https://surfsense.com/docs" },
+	{ key: "documentation" as const, href: "https://www.surfsense.com/docs" },
 	{ key: "github" as const, href: "https://github.com/MODSetter/SurfSense" },
 ];
 
 interface SidebarUserProfileProps {
 	user: User;
 	onUserSettings?: () => void;
+	onAnnouncements?: () => void;
+	announcementUnreadCount?: number;
 	onLogout?: () => void;
 	isCollapsed?: boolean;
 	theme?: string;
 	setTheme?: (theme: "light" | "dark" | "system") => void;
 }
 
-/**
- * Generates a consistent color based on email
- */
-function stringToColor(str: string): string {
-	let hash = 0;
-	for (let i = 0; i < str.length; i++) {
-		hash = str.charCodeAt(i) + ((hash << 5) - hash);
-	}
-	const colors = [
-		"#6366f1",
-		"#8b5cf6",
-		"#a855f7",
-		"#d946ef",
-		"#ec4899",
-		"#f43f5e",
-		"#ef4444",
-		"#f97316",
-		"#eab308",
-		"#84cc16",
-		"#22c55e",
-		"#14b8a6",
-		"#06b6d4",
-		"#0ea5e9",
-		"#3b82f6",
-	];
-	return colors[Math.abs(hash) % colors.length];
-}
-
-/**
- * Gets initials from email
- */
-function getInitials(email: string): string {
-	const name = email.split("@")[0];
-	const parts = name.split(/[._-]/);
-	if (parts.length >= 2) {
-		return (parts[0][0] + parts[1][0]).toUpperCase();
+function formatAnnouncementCount(count: number): string {
+	if (count <= 999) {
+		return count.toString();
 	}
-	return name.slice(0, 2).toUpperCase();
+	const thousands = Math.floor(count / 1000);
+	return `${thousands}k+`;
 }
 
 /**
@@ -110,19 +89,23 @@ function UserAvatar({
 	avatarUrl,
 	initials,
 	bgColor,
+	size = "sm",
 }: {
 	avatarUrl?: string;
 	initials: string;
 	bgColor: string;
+	size?: "sm" | "md";
 }) {
+	const sizeClass = size === "md" ? "h-10 w-10" : "h-8 w-8";
+
 	if (avatarUrl) {
 		return (
 			<Image
 				src={avatarUrl}
 				alt="User avatar"
-				width={32}
-				height={32}
-				className="h-8 w-8 shrink-0 rounded-lg object-cover select-none"
+				width={size === "md" ? 40 : 32}
+				height={size === "md" ? 40 : 32}
+				className={cn(sizeClass, "shrink-0 rounded-full object-cover select-none")}
 				referrerPolicy="no-referrer"
 				unoptimized
 			/>
@@ -131,7 +114,10 @@ function UserAvatar({
 
 	return (
 		<div
-			className="flex h-8 w-8 shrink-0 items-center justify-center rounded-lg text-xs font-semibold text-white select-none"
+			className={cn(
+				sizeClass,
+				"flex shrink-0 items-center justify-center rounded-full text-xs font-semibold text-white select-none"
+			)}
 			style={{ backgroundColor: bgColor }}
 		>
 			{initials}
@@ -142,6 +128,8 @@ function UserAvatar({
 export function SidebarUserProfile({
 	user,
 	onUserSettings,
+	onAnnouncements,
+	announcementUnreadCount = 0,
 	onLogout,
 	isCollapsed = false,
 	theme,
@@ -149,10 +137,16 @@ export function SidebarUserProfile({
 }: SidebarUserProfileProps) {
 	const t = useTranslations("sidebar");
 	const { locale, setLocale } = useLocaleContext();
+	const { isDesktop } = usePlatform();
+	const isDesktopViewport = useMediaQuery("(min-width: 768px)");
+	const { os, primary } = usePrimaryDownload();
 	const [isLoggingOut, setIsLoggingOut] = useState(false);
-	const bgColor = stringToColor(user.email);
-	const initials = getInitials(user.email);
+	const bgColor = getUserAvatarColor(user.email);
+	const initials = getUserInitials(user.email);
 	const displayName = user.name || user.email.split("@")[0];
+	const downloadUrl = primary?.url ?? GITHUB_RELEASES_URL;
+	const downloadLabel = t("download_for_os", { os });
+	const showDownloadCta = !isDesktop && isDesktopViewport;
 
 	const handleLanguageChange = (newLocale: "en" | "es" | "pt" | "hi" | "zh") => {
 		setLocale(newLocale);
@@ -175,64 +169,140 @@ export function SidebarUserProfile({
 	// Collapsed view - just show avatar with dropdown
 	if (isCollapsed) {
 		return (
-			<div className="border-t p-2">
-				<DropdownMenu>
-					<DropdownMenuTrigger asChild>
-						<button
-							type="button"
-							className={cn(
-								"flex h-10 w-full items-center justify-center rounded-md",
-								"hover:bg-accent transition-colors",
-								"focus:outline-none focus-visible:outline-none",
-								"data-[state=open]:bg-transparent"
-							)}
-						>
-							<UserAvatar avatarUrl={user.avatarUrl} initials={initials} bgColor={bgColor} />
-							<span className="sr-only">{displayName}</span>
-						</button>
-					</DropdownMenuTrigger>
-
-					<DropdownMenuContent className="w-48" side="right" align="center" sideOffset={8}>
-						<DropdownMenuLabel className="font-normal">
-							<div className="flex items-center gap-2">
-								<UserAvatar avatarUrl={user.avatarUrl} initials={initials} bgColor={bgColor} />
-								<div className="flex-1 min-w-0">
-									<p className="truncate text-sm font-medium">{displayName}</p>
-									<p className="truncate text-xs text-muted-foreground">{user.email}</p>
+			<div className="w-full border-t px-1.5 py-2">
+				<div className="flex flex-col items-center gap-2">
+					{showDownloadCta && (
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<Button
+									asChild
+									variant="ghost"
+									size="icon"
+									className="h-10 w-10 rounded-lg bg-muted hover:bg-accent"
+								>
+									<a
+										href={downloadUrl}
+										target="_blank"
+										rel="noopener noreferrer"
+										aria-label={downloadLabel}
+										onClick={() =>
+											trackDesktopDownloadClicked({ os, placement: "sidebar_collapsed" })
+										}
+									>
+										<Download className="h-4 w-4" strokeWidth={2.5} />
+									</a>
+								</Button>
+							</TooltipTrigger>
+							<TooltipContent side="right" sideOffset={8}>
+								{downloadLabel}
+							</TooltipContent>
+						</Tooltip>
+					)}
+					<DropdownMenu>
+						<DropdownMenuTrigger asChild>
+							<Button
+								type="button"
+								variant="ghost"
+								className={cn(
+									"h-10 w-10 rounded-full p-0",
+									"transition-opacity hover:bg-transparent hover:opacity-90",
+									"focus:outline-none focus-visible:outline-none",
+									"data-[state=open]:opacity-90"
+								)}
+							>
+								<UserAvatar
+									avatarUrl={user.avatarUrl}
+									initials={initials}
+									bgColor={bgColor}
+									size="md"
+								/>
+								<span className="sr-only">{displayName}</span>
+							</Button>
+						</DropdownMenuTrigger>
+
+						<DropdownMenuContent className="w-48" side="right" align="end" sideOffset={8}>
+							<DropdownMenuLabel className="font-normal">
+								<div className="flex items-center gap-2">
+									<UserAvatar avatarUrl={user.avatarUrl} initials={initials} bgColor={bgColor} />
+									<div className="flex-1 min-w-0">
+										<p className="truncate text-sm font-medium">{displayName}</p>
+										<p className="truncate text-xs text-muted-foreground">{user.email}</p>
+									</div>
 								</div>
-							</div>
-						</DropdownMenuLabel>
-
-						<DropdownMenuSeparator className="dark:bg-neutral-700" />
+							</DropdownMenuLabel>
+
+							<DropdownMenuSeparator />
+
+							<DropdownMenuItem onClick={onUserSettings}>
+								<UserCog className="h-4 w-4" />
+								{t("user_settings")}
+							</DropdownMenuItem>
+
+							{onAnnouncements && (
+								<DropdownMenuItem onClick={onAnnouncements}>
+									<Megaphone className="h-4 w-4" />
+									<span className="flex-1">What's New</span>
+									{announcementUnreadCount > 0 && (
+										<span className="inline-flex items-center justify-center min-w-4 h-4 px-1 rounded-full bg-red-500 text-white text-[10px] font-medium">
+											{formatAnnouncementCount(announcementUnreadCount)}
+										</span>
+									)}
+								</DropdownMenuItem>
+							)}
 
-						<DropdownMenuItem onClick={onUserSettings}>
-							<UserCog className="h-4 w-4" />
-							{t("user_settings")}
-						</DropdownMenuItem>
+							{setTheme && (
+								<DropdownMenuSub>
+									<DropdownMenuSubTrigger>
+										<Sun className="h-4 w-4" />
+										{t("theme")}
+									</DropdownMenuSubTrigger>
+									<DropdownMenuPortal>
+										<DropdownMenuSubContent className="gap-1">
+											{THEMES.map((themeOption) => {
+												const Icon = themeOption.icon;
+												const isSelected = theme === themeOption.value;
+												return (
+													<DropdownMenuItem
+														key={themeOption.value}
+														onClick={() => handleThemeChange(themeOption.value)}
+														className={cn(
+															"mb-1 last:mb-0 transition-all",
+															"hover:bg-accent hover:text-accent-foreground",
+															isSelected && "text-primary"
+														)}
+													>
+														<Icon className="h-4 w-4" />
+														<span className="flex-1">{t(themeOption.value)}</span>
+														{isSelected && <Check className="h-4 w-4 shrink-0" />}
+													</DropdownMenuItem>
+												);
+											})}
+										</DropdownMenuSubContent>
+									</DropdownMenuPortal>
+								</DropdownMenuSub>
+							)}
 
-						{setTheme && (
 							<DropdownMenuSub>
 								<DropdownMenuSubTrigger>
-									<Sun className="h-4 w-4" />
-									{t("theme")}
+									<Languages className="h-4 w-4" />
+									{t("language")}
 								</DropdownMenuSubTrigger>
 								<DropdownMenuPortal>
 									<DropdownMenuSubContent className="gap-1">
-										{THEMES.map((themeOption) => {
-											const Icon = themeOption.icon;
-											const isSelected = theme === themeOption.value;
+										{LANGUAGES.map((language) => {
+											const isSelected = locale === language.code;
 											return (
 												<DropdownMenuItem
-													key={themeOption.value}
-													onClick={() => handleThemeChange(themeOption.value)}
+													key={language.code}
+													onClick={() => handleLanguageChange(language.code)}
 													className={cn(
 														"mb-1 last:mb-0 transition-all",
-														"hover:bg-accent/50",
+														"hover:bg-accent hover:text-accent-foreground",
 														isSelected && "text-primary"
 													)}
 												>
-													<Icon className="h-4 w-4" />
-													<span className="flex-1">{t(themeOption.value)}</span>
+													<span className="mr-2">{language.flag}</span>
+													<span className="flex-1">{language.name}</span>
 													{isSelected && <Check className="h-4 w-4 shrink-0" />}
 												</DropdownMenuItem>
 											);
@@ -240,72 +310,52 @@ export function SidebarUserProfile({
 									</DropdownMenuSubContent>
 								</DropdownMenuPortal>
 							</DropdownMenuSub>
-						)}
 
-						<DropdownMenuSub>
-							<DropdownMenuSubTrigger>
-								<Languages className="h-4 w-4" />
-								{t("language")}
-							</DropdownMenuSubTrigger>
-							<DropdownMenuPortal>
-								<DropdownMenuSubContent className="gap-1">
-									{LANGUAGES.map((language) => {
-										const isSelected = locale === language.code;
-										return (
-											<DropdownMenuItem
-												key={language.code}
-												onClick={() => handleLanguageChange(language.code)}
-												className={cn(
-													"mb-1 last:mb-0 transition-all",
-													"hover:bg-accent/50",
-													isSelected && "text-primary"
-												)}
-											>
-												<span className="mr-2">{language.flag}</span>
-												<span className="flex-1">{language.name}</span>
-												{isSelected && <Check className="h-4 w-4 shrink-0" />}
+							<DropdownMenuSub>
+								<DropdownMenuSubTrigger>
+									<Info className="h-4 w-4" />
+									{t("learn_more")}
+								</DropdownMenuSubTrigger>
+								<DropdownMenuPortal>
+									<DropdownMenuSubContent className="min-w-[180px] gap-1">
+										{LEARN_MORE_LINKS.map((link) => (
+											<DropdownMenuItem key={link.key} asChild>
+												<a href={link.href} target="_blank" rel="noopener noreferrer">
+													<span className="flex-1">{t(link.key)}</span>
+													<ExternalLink className="h-3.5 w-3.5 shrink-0 text-muted-foreground" />
+												</a>
 											</DropdownMenuItem>
-										);
-									})}
-								</DropdownMenuSubContent>
-							</DropdownMenuPortal>
-						</DropdownMenuSub>
-
-						<DropdownMenuSub>
-							<DropdownMenuSubTrigger>
-								<Info className="h-4 w-4" />
-								{t("learn_more")}
-							</DropdownMenuSubTrigger>
-							<DropdownMenuPortal>
-								<DropdownMenuSubContent className="min-w-[180px] gap-1">
-									{LEARN_MORE_LINKS.map((link) => (
-										<DropdownMenuItem key={link.key} asChild>
-											<a href={link.href} target="_blank" rel="noopener noreferrer">
-												<span className="flex-1">{t(link.key)}</span>
-												<ExternalLink className="h-3.5 w-3.5 shrink-0 text-muted-foreground" />
-											</a>
-										</DropdownMenuItem>
-									))}
-									<DropdownMenuSeparator className="dark:bg-neutral-700" />
-									<p className="select-none px-2 py-1.5 text-xs text-muted-foreground/50">
-										v{APP_VERSION}
-									</p>
-								</DropdownMenuSubContent>
-							</DropdownMenuPortal>
-						</DropdownMenuSub>
-
-						<DropdownMenuSeparator className="dark:bg-neutral-700" />
+										))}
+										<DropdownMenuSeparator />
+										<p className="select-none px-2 py-1.5 text-xs text-muted-foreground/50">
+											v{APP_VERSION}
+										</p>
+									</DropdownMenuSubContent>
+								</DropdownMenuPortal>
+							</DropdownMenuSub>
 
-						<DropdownMenuItem onClick={handleLogout} disabled={isLoggingOut}>
-							{isLoggingOut ? (
-								<Spinner size="sm" className="mr-2" />
-							) : (
-								<LogOut className="h-4 w-4" />
+							{!isDesktop && (
+								<DropdownMenuItem asChild className="font-medium">
+									<a href={downloadUrl} target="_blank" rel="noopener noreferrer">
+										<Download className="h-4 w-4" strokeWidth={2.5} />
+										{downloadLabel}
+									</a>
+								</DropdownMenuItem>
 							)}
-							{isLoggingOut ? t("loggingOut") : t("logout")}
-						</DropdownMenuItem>
-					</DropdownMenuContent>
-				</DropdownMenu>
+
+							<DropdownMenuSeparator />
+
+							<DropdownMenuItem onClick={handleLogout} disabled={isLoggingOut}>
+								{isLoggingOut ? (
+									<Spinner size="sm" className="mr-2" />
+								) : (
+									<LogOut className="h-4 w-4" />
+								)}
+								{isLoggingOut ? t("loggingOut") : t("logout")}
+							</DropdownMenuItem>
+						</DropdownMenuContent>
+					</DropdownMenu>
+				</div>
 			</div>
 		);
 	}
@@ -313,12 +363,30 @@ export function SidebarUserProfile({
 	// Expanded view
 	return (
 		<div className="border-t">
+			{showDownloadCta && (
+				<Button
+					asChild
+					variant="ghost"
+					className="mx-2 mt-2 mb-1 h-10 w-[calc(100%-1rem)] justify-start gap-2 rounded-md bg-muted px-3 text-sm font-semibold hover:bg-accent"
+				>
+					<a
+						href={downloadUrl}
+						target="_blank"
+						rel="noopener noreferrer"
+						onClick={() => trackDesktopDownloadClicked({ os, placement: "sidebar_expanded" })}
+					>
+						<Download className="h-4 w-4" strokeWidth={2.5} />
+						{downloadLabel}
+					</a>
+				</Button>
+			)}
 			<DropdownMenu>
 				<DropdownMenuTrigger asChild>
-					<button
+					<Button
 						type="button"
+						variant="ghost"
 						className={cn(
-							"flex w-full items-center gap-2 px-2 py-3 text-left",
+							"h-auto w-full justify-start gap-2 rounded-none px-2 py-3 text-left",
 							"hover:bg-accent transition-colors",
 							"focus:outline-none focus-visible:outline-none",
 							"data-[state=open]:bg-transparent"
@@ -334,7 +402,7 @@ export function SidebarUserProfile({
 
 						{/* Chevron icon */}
 						<ChevronUp className="h-4 w-4 shrink-0 text-muted-foreground" />
-					</button>
+					</Button>
 				</DropdownMenuTrigger>
 
 				<DropdownMenuContent className="w-48" side="top" align="center" sideOffset={4}>
@@ -348,13 +416,25 @@ export function SidebarUserProfile({
 						</div>
 					</DropdownMenuLabel>
 
-					<DropdownMenuSeparator className="dark:bg-neutral-700" />
+					<DropdownMenuSeparator />
 
 					<DropdownMenuItem onClick={onUserSettings}>
 						<UserCog className="h-4 w-4" />
 						{t("user_settings")}
 					</DropdownMenuItem>
 
+					{onAnnouncements && (
+						<DropdownMenuItem onClick={onAnnouncements}>
+							<Megaphone className="h-4 w-4" />
+							<span className="flex-1">What's New</span>
+							{announcementUnreadCount > 0 && (
+								<span className="inline-flex items-center justify-center min-w-4 h-4 px-1 rounded-full bg-red-500 text-white text-[10px] font-medium">
+									{formatAnnouncementCount(announcementUnreadCount)}
+								</span>
+							)}
+						</DropdownMenuItem>
+					)}
+
 					{setTheme && (
 						<DropdownMenuSub>
 							<DropdownMenuSubTrigger>
@@ -372,7 +452,7 @@ export function SidebarUserProfile({
 												onClick={() => handleThemeChange(themeOption.value)}
 												className={cn(
 													"mb-1 last:mb-0 transition-all",
-													"hover:bg-accent/50",
+													"hover:bg-accent hover:text-accent-foreground",
 													isSelected && "text-primary"
 												)}
 											>
@@ -402,7 +482,7 @@ export function SidebarUserProfile({
 											onClick={() => handleLanguageChange(language.code)}
 											className={cn(
 												"mb-1 last:mb-0 transition-all",
-												"hover:bg-accent/50",
+												"hover:bg-accent hover:text-accent-foreground",
 												isSelected && "text-primary"
 											)}
 										>
@@ -431,7 +511,7 @@ export function SidebarUserProfile({
 										</a>
 									</DropdownMenuItem>
 								))}
-								<DropdownMenuSeparator className="dark:bg-neutral-700" />
+								<DropdownMenuSeparator />
 								<p className="select-none px-2 py-1.5 text-xs text-muted-foreground/50">
 									v{APP_VERSION}
 								</p>
@@ -439,7 +519,16 @@ export function SidebarUserProfile({
 						</DropdownMenuPortal>
 					</DropdownMenuSub>
 
-					<DropdownMenuSeparator className="dark:bg-neutral-700" />
+					{!isDesktop && (
+						<DropdownMenuItem asChild className="font-medium">
+							<a href={downloadUrl} target="_blank" rel="noopener noreferrer">
+								<Download className="h-4 w-4" strokeWidth={2.5} />
+								{downloadLabel}
+							</a>
+						</DropdownMenuItem>
+					)}
+
+					<DropdownMenuSeparator />
 
 					<DropdownMenuItem onClick={handleLogout} disabled={isLoggingOut}>
 						{isLoggingOut ? <Spinner size="sm" className="mr-2" /> : <LogOut className="h-4 w-4" />}
diff --git a/surfsense_web/components/layout/ui/sidebar/index.ts b/surfsense_web/components/layout/ui/sidebar/index.ts
index 33023c9e80..d72f86c8a1 100644
--- a/surfsense_web/components/layout/ui/sidebar/index.ts
+++ b/surfsense_web/components/layout/ui/sidebar/index.ts
@@ -1,6 +1,5 @@
 export { AllPrivateChatsSidebar, AllPrivateChatsSidebarContent } from "./AllPrivateChatsSidebar";
 export { AllSharedChatsSidebar, AllSharedChatsSidebarContent } from "./AllSharedChatsSidebar";
-export { AnnouncementsSidebar, AnnouncementsSidebarContent } from "./AnnouncementsSidebar";
 export { ChatListItem } from "./ChatListItem";
 export { DocumentsSidebar } from "./DocumentsSidebar";
 export { InboxSidebar, InboxSidebarContent } from "./InboxSidebar";
diff --git a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx
index 026f3afc30..7ad78be418 100644
--- a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx
+++ b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { Download, FileQuestionMark, FileText, Loader2, PenLine, RefreshCw } from "lucide-react";
+import { Download, FileQuestionMark, FileText, Pencil, RefreshCw } from "lucide-react";
 import { useRouter } from "next/navigation";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { toast } from "sonner";
@@ -8,6 +8,7 @@ import { PlateEditor } from "@/components/editor/plate-editor";
 import { MarkdownViewer } from "@/components/markdown-viewer";
 import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
+import { Spinner } from "@/components/ui/spinner";
 import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils";
 
 const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB
@@ -258,7 +259,7 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
 						onClick={() => setIsEditing(true)}
 						className="gap-1.5"
 					>
-						<PenLine className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -278,7 +279,7 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
 									<Button
 										variant="outline"
 										size="sm"
-										className="shrink-0 gap-1.5"
+										className="relative shrink-0"
 										disabled={downloading}
 										onClick={async () => {
 											setDownloading(true);
@@ -307,19 +308,18 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
 											}
 										}}
 									>
-										{downloading ? (
-											<Loader2 className="size-3.5 animate-spin" />
-										) : (
+										<span className={`flex items-center gap-1.5 ${downloading ? "opacity-0" : ""}`}>
 											<Download className="size-3.5" />
-										)}
-										{downloading ? "Preparing..." : "Download .md"}
+											Download .md
+										</span>
+										{downloading && <Spinner size="sm" className="absolute" />}
 									</Button>
 								</AlertDescription>
 							</Alert>
-							<MarkdownViewer content={doc.source_markdown} />
+							<MarkdownViewer content={doc.source_markdown} enableCitations />
 						</>
 					) : (
-						<MarkdownViewer content={doc.source_markdown} />
+						<MarkdownViewer content={doc.source_markdown} enableCitations />
 					)}
 				</div>
 			</div>
diff --git a/surfsense_web/components/layout/ui/tabs/TabBar.tsx b/surfsense_web/components/layout/ui/tabs/TabBar.tsx
index 8d0d986d3b..c5fb91f4d4 100644
--- a/surfsense_web/components/layout/ui/tabs/TabBar.tsx
+++ b/surfsense_web/components/layout/ui/tabs/TabBar.tsx
@@ -2,7 +2,7 @@
 
 import { useAtomValue, useSetAtom } from "jotai";
 import { Plus, X } from "lucide-react";
-import { useCallback, useEffect, useRef } from "react";
+import { Fragment, useCallback, useEffect, useRef, useState } from "react";
 import {
 	activeTabIdAtom,
 	closeTabAtom,
@@ -10,21 +10,57 @@ import {
 	type Tab,
 	tabsAtom,
 } from "@/atoms/tabs/tabs.atom";
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 interface TabBarProps {
 	onTabSwitch?: (tab: Tab) => void;
 	onNewChat?: () => void;
+	leftActions?: React.ReactNode;
 	rightActions?: React.ReactNode;
 	className?: string;
 }
 
-export function TabBar({ onTabSwitch, onNewChat, rightActions, className }: TabBarProps) {
+// Pure scroll-target calculation for the tab list.
+// - When the list shrinks (a tab was closed), do not move the scroll.
+// - When the list overflows after growing, snap to the right edge so the new tab is visible.
+function nextTabListScrollLeft(input: {
+	prevScrollWidth: number;
+	scrollWidth: number;
+	clientWidth: number;
+}) {
+	if (input.scrollWidth <= input.prevScrollWidth) return;
+	if (input.scrollWidth <= input.clientWidth) return;
+	return input.scrollWidth - input.clientWidth;
+}
+
+export function TabBar({
+	onTabSwitch,
+	onNewChat,
+	leftActions,
+	rightActions,
+	className,
+}: TabBarProps) {
 	const tabs = useAtomValue(tabsAtom);
 	const activeTabId = useAtomValue(activeTabIdAtom);
 	const switchTab = useSetAtom(switchTabAtom);
 	const closeTab = useSetAtom(closeTabAtom);
 	const scrollRef = useRef<HTMLDivElement>(null);
+	const [hoveredTabIndex, setHoveredTabIndex] = useState<number | null>(null);
+	const activeTabIndex = tabs.findIndex((tab) => tab.id === activeTabId);
+
+	const shouldHideSeparator = useCallback(
+		(separatorIndex: number) => {
+			// separatorIndex sits between tabs[separatorIndex - 1] and tabs[separatorIndex].
+			return (
+				hoveredTabIndex === separatorIndex - 1 ||
+				hoveredTabIndex === separatorIndex ||
+				activeTabIndex === separatorIndex - 1 ||
+				activeTabIndex === separatorIndex
+			);
+		},
+		[hoveredTabIndex, activeTabIndex]
+	);
 
 	const handleTabClick = useCallback(
 		(tab: Tab) => {
@@ -46,7 +82,67 @@ export function TabBar({ onTabSwitch, onNewChat, rightActions, className }: TabB
 		[closeTab, onTabSwitch]
 	);
 
-	// Keep active tab visible with minimal scroll shift.
+	// React to tab list growth via a MutationObserver so the scroll catches the
+	// moment a new tab is added to the DOM, not after activation lands.
+	// Also remaps vertical wheel motion to horizontal scroll.
+	useEffect(() => {
+		const el = scrollRef.current;
+		if (!el) return;
+
+		let prevScrollWidth = el.scrollWidth;
+		let frame: number | undefined;
+
+		const update = () => {
+			const left = nextTabListScrollLeft({
+				prevScrollWidth,
+				scrollWidth: el.scrollWidth,
+				clientWidth: el.clientWidth,
+			});
+			if (left !== undefined) {
+				el.scrollTo({ left, behavior: "smooth" });
+			}
+			prevScrollWidth = el.scrollWidth;
+		};
+
+		const schedule = () => {
+			if (frame !== undefined) cancelAnimationFrame(frame);
+			frame = requestAnimationFrame(() => {
+				frame = undefined;
+				update();
+			});
+		};
+
+		const onWheel = (e: WheelEvent) => {
+			if (Math.abs(e.deltaY) <= Math.abs(e.deltaX)) return;
+
+			const delta =
+				e.deltaMode === WheelEvent.DOM_DELTA_LINE
+					? e.deltaY * 16
+					: e.deltaMode === WheelEvent.DOM_DELTA_PAGE
+						? e.deltaY * el.clientWidth
+						: e.deltaY;
+			const maxScrollLeft = el.scrollWidth - el.clientWidth;
+			const nextScrollLeft = Math.min(maxScrollLeft, Math.max(0, el.scrollLeft + delta));
+
+			if (nextScrollLeft === el.scrollLeft) return;
+
+			el.scrollLeft = nextScrollLeft;
+			e.preventDefault();
+		};
+
+		el.addEventListener("wheel", onWheel, { passive: false });
+		const observer = new MutationObserver(schedule);
+		observer.observe(el, { childList: true });
+
+		return () => {
+			el.removeEventListener("wheel", onWheel);
+			observer.disconnect();
+			if (frame !== undefined) cancelAnimationFrame(frame);
+		};
+	}, []);
+
+	// When the user activates a tab that's currently off-screen (e.g. clicked
+	// from the sidebar), nudge the scroller minimally so the active tab is in view.
 	useEffect(() => {
 		if (!scrollRef.current || !activeTabId) return;
 		const scroller = scrollRef.current;
@@ -68,71 +164,98 @@ export function TabBar({ onTabSwitch, onNewChat, rightActions, className }: TabB
 		}
 	}, [activeTabId]);
 
-	// Only show tab bar when there's more than one tab
-	if (tabs.length <= 1) return null;
-
 	return (
-		<div className={cn("mb-2 flex h-9 items-center shrink-0 px-1 gap-0.5 select-none", className)}>
+		<div
+			className={cn(
+				"mb-0 flex h-12 items-center shrink-0 px-1 gap-0.5 select-none border-b bg-panel",
+				className
+			)}
+		>
+			{leftActions ? <div className="flex items-center gap-0.5 shrink-0">{leftActions}</div> : null}
 			<div
 				ref={scrollRef}
-				className="flex h-full items-center flex-1 gap-0.5 overflow-x-auto overflow-y-hidden scrollbar-hide [scrollbar-width:none] [-ms-overflow-style:none] [&::-webkit-scrollbar]:hidden py-1"
+				className="flex h-8 items-center flex-1 gap-0 pl-2 overflow-x-auto overflow-y-hidden scrollbar-hide [scrollbar-width:none] [-ms-overflow-style:none] [&::-webkit-scrollbar]:hidden py-0"
 			>
-				{tabs.map((tab) => {
+				{tabs.map((tab, index) => {
 					const isActive = tab.id === activeTabId;
 
 					return (
-						<button
-							key={tab.id}
-							type="button"
-							data-tab-id={tab.id}
-							onClick={() => handleTabClick(tab)}
-							className={cn(
-								"group relative flex h-full w-[150px] items-center px-3 min-h-0 overflow-hidden text-[13px] font-medium rounded-lg transition-all duration-150 shrink-0",
-								isActive
-									? "bg-muted/60 text-foreground"
-									: "bg-transparent text-muted-foreground hover:bg-muted/30 hover:text-foreground"
-							)}
-						>
-							<span className="block min-w-0 flex-1 truncate text-left group-hover:pr-5 group-focus-within:pr-5">
-								{tab.title}
-							</span>
-							{/* biome-ignore lint/a11y/useSemanticElements: cannot nest button inside button */}
-							<span
-								role="button"
-								tabIndex={0}
-								onClick={(e) => handleTabClose(e, tab.id)}
-								onKeyDown={(e) => {
-									if (e.key === "Enter" || e.key === " ") {
-										e.preventDefault();
-										handleTabClose(e as unknown as React.MouseEvent, tab.id);
-									}
-								}}
-								className={cn(
-									"absolute right-2 top-1/2 -translate-y-1/2 shrink-0 rounded-full p-0.5 transition-all duration-150 hover:bg-muted-foreground/15",
-									isActive
-										? "opacity-0 group-hover:opacity-70 group-focus-within:opacity-70 hover:opacity-100"
-										: "opacity-0 group-hover:opacity-60 group-focus-within:opacity-60 hover:opacity-100!"
-								)}
-							>
-								<X className="size-3" />
-							</span>
-						</button>
+						<Fragment key={tab.id}>
+							{index > 0 ? (
+								<div
+									aria-hidden="true"
+									className={cn(
+										"mx-1.5 h-4 w-px shrink-0 bg-muted-foreground/20 transition-opacity duration-150 dark:bg-muted-foreground/25",
+										shouldHideSeparator(index) && "opacity-0"
+									)}
+								/>
+							) : null}
+							<div data-tab-id={tab.id} className="group relative h-full w-[180px] shrink-0">
+								<Button
+									type="button"
+									variant="ghost"
+									onClick={() => handleTabClick(tab)}
+									onMouseEnter={() => setHoveredTabIndex(index)}
+									onMouseLeave={() => setHoveredTabIndex(null)}
+									className={cn(
+										"h-full w-full justify-start overflow-hidden px-3 text-left text-[13px] font-medium transition-colors duration-150",
+										isActive
+											? "bg-accent text-accent-foreground"
+											: "bg-transparent text-muted-foreground hover:bg-accent hover:text-accent-foreground group-hover:bg-accent group-hover:text-accent-foreground group-focus-within:bg-accent group-focus-within:text-accent-foreground"
+									)}
+								>
+									<span className="block min-w-0 flex-1 truncate text-left">{tab.title}</span>
+								</Button>
+								{/* Hover-only gradient + close overlay (sidebar pattern) — keeps pill width fixed and avoids ellipsis shift. */}
+								<div
+									className={cn(
+										"pointer-events-none absolute right-0 top-0 bottom-0 flex items-center rounded-r-md pl-8 pr-2 opacity-0 transition-opacity duration-150",
+										"group-hover:opacity-100 group-focus-within:opacity-100",
+										"bg-gradient-to-l from-accent from-60% to-transparent"
+									)}
+								>
+									<Button
+										type="button"
+										variant="ghost"
+										size="icon"
+										onClick={(e) => handleTabClose(e, tab.id)}
+										onMouseEnter={() => setHoveredTabIndex(index)}
+										onMouseLeave={() => setHoveredTabIndex(null)}
+										className="pointer-events-auto size-4 rounded-full p-0.5 hover:bg-accent hover:text-accent-foreground"
+									>
+										<X data-icon="inline-start" />
+									</Button>
+								</div>
+							</div>
+						</Fragment>
 					);
 				})}
-			</div>
-			<div className="flex items-center gap-0.5 shrink-0">
 				{onNewChat && (
-					<button
-						type="button"
-						onClick={onNewChat}
-						className="flex h-6 w-6 items-center justify-center shrink-0 rounded-md text-muted-foreground transition-all duration-150 hover:text-foreground hover:bg-muted/40"
-						title="New Chat"
+					<div
+						className={cn(
+							// Solid bg + soft left-fade so tabs scrolling underneath the
+							// + button get visually masked into the bar's background.
+							"sticky right-0 z-10 ml-3 flex h-full shrink-0 items-center bg-panel pl-3 pr-1",
+							"before:content-[''] before:absolute before:inset-y-0 before:-left-4 before:w-4 before:pointer-events-none",
+							"before:bg-gradient-to-r before:from-transparent before:to-panel"
+						)}
 					>
-						<Plus className="size-3.5" />
-					</button>
+						<Button
+							type="button"
+							variant="ghost"
+							size="icon"
+							onClick={onNewChat}
+							className="size-8 shrink-0 text-muted-foreground transition-all duration-150 hover:bg-accent hover:text-accent-foreground"
+							title="New Chat"
+						>
+							<Plus data-icon="inline-start" />
+						</Button>
+					</div>
 				)}
-				{rightActions}
 			</div>
+			{rightActions ? (
+				<div className="flex items-center gap-0.5 shrink-0 pr-2">{rightActions}</div>
+			) : null}
 		</div>
 	);
 }
diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx
index 5775fe0837..45bdc7be13 100644
--- a/surfsense_web/components/markdown-viewer.tsx
+++ b/surfsense_web/components/markdown-viewer.tsx
@@ -3,6 +3,9 @@ import { createMathPlugin } from "@streamdown/math";
 import { Streamdown, type StreamdownProps } from "streamdown";
 import "katex/dist/katex.min.css";
 import Image from "next/image";
+import { useMemo } from "react";
+import { processChildrenWithCitations } from "@/components/citations/citation-renderer";
+import { type CitationUrlMap, preprocessCitationMarkdown } from "@/lib/citations/citation-parser";
 import { cn } from "@/lib/utils";
 
 const code = createCodePlugin({
@@ -10,70 +13,45 @@ const code = createCodePlugin({
 });
 
 const math = createMathPlugin({
-	singleDollarTextMath: true,
+	// Keep currency from being parsed as math; real math is normalized below.
+	singleDollarTextMath: false,
 });
 
 interface MarkdownViewerProps {
 	content: string;
 	className?: string;
 	maxLength?: number;
+	/** Render citation tokens as interactive badges. */
+	enableCitations?: boolean;
 }
 
-/**
- * If the entire content is wrapped in a single ```markdown or ```md
- * code fence, strip the fence so the inner markdown renders properly.
- */
+const EMPTY_URL_MAP: CitationUrlMap = new Map();
+
+/** Strip a single outer markdown fence when the whole payload is fenced. */
 function stripOuterMarkdownFence(content: string): string {
 	const trimmed = content.trim();
-	// Match 3+ backtick fences (LLMs escalate to 4+ when content has triple-backtick blocks)
 	const match = trimmed.match(/^(`{3,})(?:markdown|md)?\s*\n([\s\S]+?)\n\1\s*$/);
 	return match ? match[2] : content;
 }
 
-/**
- * Convert all LaTeX delimiter styles to the double-dollar syntax
- * that Streamdown's @streamdown/math plugin understands.
- *
- * Streamdown math conventions (different from remark-math!):
- *   $$...$$  on the SAME line     → inline math
- *   $$\n...\n$$  on SEPARATE lines → block (display) math
- *
- * Conversions performed:
- *   \[...\]                              → $$\n ... \n$$  (block math)
- *   \(...\)                              → $$...$$        (inline math, same line)
- *   \begin{equation}...\end{equation}    → $$\n ... \n$$  (block math)
- *   \begin{displaymath}...\end{displaymath} → $$\n ... \n$$ (block math)
- *   \begin{math}...\end{math}            → $$...$$        (inline math, same line)
- *   `$$ … $$`                             → $$ … $$       (strip wrapping backtick code)
- *   `$ … $`                               → $ … $         (strip wrapping backtick code)
- *   $...$                                 → $$...$$        (normalise single-$ to double-$$)
- */
+/** Normalize common LaTeX delimiters to Streamdown's double-dollar syntax. */
 function convertLatexDelimiters(content: string): string {
-	// 1. Block math: \[...\] → $$\n...\n$$ (display math on separate lines)
 	content = content.replace(/\\\[([\s\S]*?)\\\]/g, (_, inner) => `\n$$\n${inner.trim()}\n$$\n`);
-	// 2. Inline math: \(...\) → $$...$$ (inline math on same line)
 	content = content.replace(/\\\(([\s\S]*?)\\\)/g, (_, inner) => `$$${inner.trim()}$$`);
-	// 3. Block: \begin{equation}...\end{equation} → $$\n...\n$$
 	content = content.replace(
 		/\\begin\{equation\}([\s\S]*?)\\end\{equation\}/g,
 		(_, inner) => `\n$$\n${inner.trim()}\n$$\n`
 	);
-	// 4. Block: \begin{displaymath}...\end{displaymath} → $$\n...\n$$
 	content = content.replace(
 		/\\begin\{displaymath\}([\s\S]*?)\\end\{displaymath\}/g,
 		(_, inner) => `\n$$\n${inner.trim()}\n$$\n`
 	);
-	// 5. Inline: \begin{math}...\end{math} → $$...$$
 	content = content.replace(
 		/\\begin\{math\}([\s\S]*?)\\end\{math\}/g,
 		(_, inner) => `$$${inner.trim()}$$`
 	);
-	// 6. Strip backtick wrapping around math: `$$...$$` → $$...$$ and `$...$` → $...$
 	content = content.replace(/`(\${1,2})((?:(?!\1).)+)\1`/g, "$1$2$1");
-	// 7. Normalise single-dollar $...$ to double-dollar $$...$$ so they render
-	//    reliably in Streamdown (single-$ has strict no-space rules that often fail).
-	//    We match $…$ where the content starts with a backslash (LaTeX command)
-	//    to avoid converting currency like $50.
+	// Only convert command-style single-dollar math, leaving currency alone.
 	content = content.replace(
 		/(?<!\$)\$(?!\$)(\\[a-zA-Z][\s\S]*?)(?<!\$)\$(?!\$)/g,
 		(_, inner) => `$$${inner.trim()}$$`
@@ -81,14 +59,39 @@ function convertLatexDelimiters(content: string): string {
 	return content;
 }
 
-export function MarkdownViewer({ content, className, maxLength }: MarkdownViewerProps) {
+export function MarkdownViewer({
+	content,
+	className,
+	maxLength,
+	enableCitations = false,
+}: MarkdownViewerProps) {
 	const isTruncated = maxLength != null && content.length > maxLength;
 	const displayContent = isTruncated ? content.slice(0, maxLength) : content;
-	const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(displayContent));
+
+	// Rewrite citation URLs before markdown autolinking can split them.
+	const { processedContent, urlMap } = useMemo(() => {
+		const stripped = stripOuterMarkdownFence(displayContent);
+		if (!enableCitations) {
+			return {
+				processedContent: convertLatexDelimiters(stripped),
+				urlMap: EMPTY_URL_MAP,
+			};
+		}
+		const { content: rewritten, urlMap: map } = preprocessCitationMarkdown(stripped);
+		return {
+			processedContent: convertLatexDelimiters(rewritten),
+			urlMap: map,
+		};
+	}, [displayContent, enableCitations]);
+
+	// Do not wrap anchors or code; citation buttons inside them would be invalid.
+	const wrap = (children: React.ReactNode): React.ReactNode =>
+		enableCitations ? processChildrenWithCitations(children, urlMap) : children;
+
 	const components: StreamdownProps["components"] = {
 		p: ({ children, ...props }) => (
 			<p className="my-2" {...props}>
-				{children}
+				{wrap(children)}
 			</p>
 		),
 		a: ({ children, ...props }) => (
@@ -101,55 +104,67 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer
 				{children}
 			</a>
 		),
-		li: ({ children, ...props }) => <li {...props}>{children}</li>,
+		li: ({ children, ...props }) => <li {...props}>{wrap(children)}</li>,
 		ul: ({ ...props }) => <ul className="list-disc pl-5 my-2" {...props} />,
 		ol: ({ ...props }) => <ol className="list-decimal pl-5 my-2" {...props} />,
 		h1: ({ children, ...props }) => (
 			<h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
-				{children}
+				{wrap(children)}
 			</h1>
 		),
 		h2: ({ children, ...props }) => (
 			<h2 className="text-xl font-bold mt-5 mb-2" {...props}>
-				{children}
+				{wrap(children)}
 			</h2>
 		),
 		h3: ({ children, ...props }) => (
 			<h3 className="text-lg font-bold mt-4 mb-2" {...props}>
-				{children}
+				{wrap(children)}
 			</h3>
 		),
 		h4: ({ children, ...props }) => (
 			<h4 className="text-base font-bold mt-3 mb-1" {...props}>
-				{children}
+				{wrap(children)}
 			</h4>
 		),
-		blockquote: ({ ...props }) => (
-			<blockquote className="border-l-4 border-muted pl-4 italic my-2" {...props} />
+		h5: ({ children, ...props }) => (
+			<h5 className="text-sm font-bold mt-3 mb-1" {...props}>
+				{wrap(children)}
+			</h5>
+		),
+		h6: ({ children, ...props }) => (
+			<h6 className="text-xs font-bold mt-3 mb-1" {...props}>
+				{wrap(children)}
+			</h6>
+		),
+		strong: ({ children, ...props }) => (
+			<strong className="font-semibold" {...props}>
+				{wrap(children)}
+			</strong>
+		),
+		em: ({ children, ...props }) => <em {...props}>{wrap(children)}</em>,
+		blockquote: ({ children, ...props }) => (
+			<blockquote className="border-l-4 border-muted pl-4 italic my-2" {...props}>
+				{wrap(children)}
+			</blockquote>
 		),
 		hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
 		img: ({ src, alt, width: _w, height: _h, ...props }) => {
-			const isDataOrUnknownUrl =
-				typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http"));
+			if (typeof src !== "string") return null;
 
-			return isDataOrUnknownUrl ? (
-				// eslint-disable-next-line @next/next/no-img-element
-				<img
-					className="max-w-full h-auto my-4 rounded"
-					alt={alt || "markdown image"}
-					src={src}
-					loading="lazy"
-					{...props}
-				/>
-			) : (
+			const width = typeof _w === "number" ? _w : Number(_w) || 800;
+			const height = typeof _h === "number" ? _h : Number(_h) || 600;
+			const shouldSkipOptimization = src.startsWith("data:");
+
+			return (
 				<Image
 					className="max-w-full h-auto my-4 rounded"
 					alt={alt || "markdown image"}
-					src={typeof src === "string" ? src : ""}
-					width={_w || 800}
-					height={_h || 600}
+					src={src}
+					width={width}
+					height={height}
 					sizes="(max-width: 768px) 100vw, (max-width: 1200px) 75vw, 60vw"
-					unoptimized={isDataOrUnknownUrl}
+					unoptimized={shouldSkipOptimization}
 					{...props}
 				/>
 			);
@@ -159,17 +174,21 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer
 				<table className="w-full divide-y divide-border" {...props} />
 			</div>
 		),
-		th: ({ ...props }) => (
+		th: ({ children, ...props }) => (
 			<th
 				className="px-4 py-2.5 text-left text-sm font-semibold text-muted-foreground/80 bg-muted/30 border-r border-border/40 last:border-r-0"
 				{...props}
-			/>
+			>
+				{wrap(children)}
+			</th>
 		),
-		td: ({ ...props }) => (
+		td: ({ children, ...props }) => (
 			<td
 				className="px-4 py-2.5 text-sm border-t border-r border-border/40 last:border-r-0"
 				{...props}
-			/>
+			>
+				{wrap(children)}
+			</td>
 		),
 	};
 
diff --git a/surfsense_web/components/new-chat/chat-header.tsx b/surfsense_web/components/new-chat/chat-header.tsx
index 0c5253c6cb..4716418ee9 100644
--- a/surfsense_web/components/new-chat/chat-header.tsx
+++ b/surfsense_web/components/new-chat/chat-header.tsx
@@ -44,21 +44,28 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 	const [isVisionGlobal, setIsVisionGlobal] = useState(false);
 	const [visionDialogMode, setVisionDialogMode] = useState<"create" | "edit" | "view">("view");
 
+	// Default provider for create dialogs
+	const [defaultLLMProvider, setDefaultLLMProvider] = useState<string | undefined>();
+	const [defaultImageProvider, setDefaultImageProvider] = useState<string | undefined>();
+	const [defaultVisionProvider, setDefaultVisionProvider] = useState<string | undefined>();
+
 	// LLM handlers
 	const handleEditLLMConfig = useCallback(
 		(config: NewLLMConfigPublic | GlobalNewLLMConfig, global: boolean) => {
 			setSelectedConfig(config);
 			setIsGlobal(global);
 			setDialogMode(global ? "view" : "edit");
+			setDefaultLLMProvider(undefined);
 			setDialogOpen(true);
 		},
 		[]
 	);
 
-	const handleAddNewLLM = useCallback(() => {
+	const handleAddNewLLM = useCallback((provider?: string) => {
 		setSelectedConfig(null);
 		setIsGlobal(false);
 		setDialogMode("create");
+		setDefaultLLMProvider(provider);
 		setDialogOpen(true);
 	}, []);
 
@@ -68,10 +75,11 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 	}, []);
 
 	// Image model handlers
-	const handleAddImageModel = useCallback(() => {
+	const handleAddImageModel = useCallback((provider?: string) => {
 		setSelectedImageConfig(null);
 		setIsImageGlobal(false);
 		setImageDialogMode("create");
+		setDefaultImageProvider(provider);
 		setImageDialogOpen(true);
 	}, []);
 
@@ -80,6 +88,7 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 			setSelectedImageConfig(config);
 			setIsImageGlobal(global);
 			setImageDialogMode(global ? "view" : "edit");
+			setDefaultImageProvider(undefined);
 			setImageDialogOpen(true);
 		},
 		[]
@@ -91,10 +100,11 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 	}, []);
 
 	// Vision model handlers
-	const handleAddVisionModel = useCallback(() => {
+	const handleAddVisionModel = useCallback((provider?: string) => {
 		setSelectedVisionConfig(null);
 		setIsVisionGlobal(false);
 		setVisionDialogMode("create");
+		setDefaultVisionProvider(provider);
 		setVisionDialogOpen(true);
 	}, []);
 
@@ -103,6 +113,7 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 			setSelectedVisionConfig(config);
 			setIsVisionGlobal(global);
 			setVisionDialogMode(global ? "view" : "edit");
+			setDefaultVisionProvider(undefined);
 			setVisionDialogOpen(true);
 		},
 		[]
@@ -131,6 +142,7 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 				isGlobal={isGlobal}
 				searchSpaceId={searchSpaceId}
 				mode={dialogMode}
+				defaultProvider={defaultLLMProvider}
 			/>
 			<ImageConfigDialog
 				open={imageDialogOpen}
@@ -139,6 +151,7 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 				isGlobal={isImageGlobal}
 				searchSpaceId={searchSpaceId}
 				mode={imageDialogMode}
+				defaultProvider={defaultImageProvider}
 			/>
 			<VisionConfigDialog
 				open={visionDialogOpen}
@@ -147,6 +160,7 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) {
 				isGlobal={isVisionGlobal}
 				searchSpaceId={searchSpaceId}
 				mode={visionDialogMode}
+				defaultProvider={defaultVisionProvider}
 			/>
 		</div>
 	);
diff --git a/surfsense_web/components/new-chat/chat-share-button.tsx b/surfsense_web/components/new-chat/chat-share-button.tsx
index 239745745b..101f73ade8 100644
--- a/surfsense_web/components/new-chat/chat-share-button.tsx
+++ b/surfsense_web/components/new-chat/chat-share-button.tsx
@@ -3,13 +3,13 @@
 import { useQuery, useQueryClient } from "@tanstack/react-query";
 import { useAtomValue, useSetAtom } from "jotai";
 import { Earth, User, Users } from "lucide-react";
+import { useRouter } from "next/navigation";
 
 import { useCallback, useState } from "react";
 import { toast } from "sonner";
 import { currentThreadAtom, setThreadVisibilityAtom } from "@/atoms/chat/current-thread.atom";
 import { myAccessAtom } from "@/atoms/members/members-query.atoms";
 import { createPublicChatSnapshotMutationAtom } from "@/atoms/public-chat-snapshots/public-chat-snapshots-mutation.atoms";
-import { searchSpaceSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
 import { Button } from "@/components/ui/button";
 import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
@@ -49,8 +49,8 @@ const visibilityOptions: {
 
 export function ChatShareButton({ thread, onVisibilityChange, className }: ChatShareButtonProps) {
 	const queryClient = useQueryClient();
+	const router = useRouter();
 	const [open, setOpen] = useState(false);
-	const setSearchSpaceSettingsDialog = useSetAtom(searchSpaceSettingsDialogAtom);
 
 	// Use Jotai atom for visibility (single source of truth)
 	const currentThreadState = useAtomValue(currentThreadAtom);
@@ -143,18 +143,19 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 			{hasPublicSnapshots && (
 				<Tooltip>
 					<TooltipTrigger asChild>
-						<button
+						<Button
 							type="button"
+							variant="ghost"
+							size="icon"
 							onClick={() =>
-								setSearchSpaceSettingsDialog({
-									open: true,
-									initialTab: "public-links",
-								})
+								router.push(
+									`/dashboard/${thread.search_space_id}/search-space-settings/public-links`
+								)
 							}
-							className="flex items-center justify-center h-8 w-8 rounded-md bg-muted/50 hover:bg-muted transition-colors"
+							className="size-8 bg-muted/50 hover:bg-accent hover:text-accent-foreground"
 						>
-							<Earth className="h-4 w-4 text-muted-foreground" />
-						</button>
+							<Earth data-icon="inline-start" className="text-muted-foreground" />
+						</Button>
 					</TooltipTrigger>
 					<TooltipContent>Manage public links</TooltipContent>
 				</Tooltip>
@@ -165,7 +166,7 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 					<Button
 						variant="outline"
 						size="icon"
-						className="h-8 w-8 md:w-auto md:px-3 md:gap-2 relative bg-muted hover:bg-muted/80 border-0 select-none"
+						className="h-8 w-8 md:w-auto md:px-3 md:gap-2 relative bg-muted hover:bg-accent hover:text-accent-foreground border-0 select-none"
 					>
 						<CurrentIcon className="h-4 w-4" />
 						<span className="hidden md:inline text-sm">{buttonLabel}</span>
@@ -173,7 +174,7 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 				</PopoverTrigger>
 
 				<PopoverContent
-					className="w-[280px] md:w-[320px] p-0 rounded-lg shadow-lg border-border/60 dark:bg-neutral-900 dark:border dark:border-white/5 select-none"
+					className="w-[280px] md:w-[320px] p-0 rounded-lg shadow-lg select-none"
 					align="end"
 					sideOffset={8}
 					onCloseAutoFocus={(e) => e.preventDefault()}
@@ -185,15 +186,14 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 							const Icon = option.icon;
 
 							return (
-								<button
+								<Button
 									type="button"
+									variant="ghost"
 									key={option.value}
 									onClick={() => handleVisibilityChange(option.value)}
 									className={cn(
-										"w-full flex items-center gap-2.5 px-2.5 py-2 rounded-md transition-all",
-										"hover:bg-accent/50 dark:hover:bg-white/10 cursor-pointer",
-										"focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2",
-										isSelected && "bg-accent/80 dark:bg-white/10"
+										"h-auto w-full justify-start gap-2.5 whitespace-normal px-2.5 py-2 font-normal",
+										isSelected && "bg-accent text-accent-foreground"
 									)}
 								>
 									<div
@@ -204,7 +204,7 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 									>
 										<Icon
 											className={cn(
-												"size-4 block",
+												"block",
 												isSelected ? "text-primary dark:text-white" : "text-muted-foreground"
 											)}
 										/>
@@ -224,29 +224,28 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 											{option.description}
 										</p>
 									</div>
-								</button>
+								</Button>
 							);
 						})}
 
 						{canCreatePublicLink && (
 							<>
 								{/* Divider */}
-								<div className="border-t border-border dark:border-white/5 my-1" />
+								<div className="border-t border-popover-border my-1" />
 
 								{/* Public Link Option */}
-								<button
+								<Button
 									type="button"
+									variant="ghost"
 									onClick={handleCreatePublicLink}
 									disabled={isCreatingSnapshot}
 									className={cn(
-										"w-full flex items-center gap-2.5 px-2.5 py-2 rounded-md transition-all",
-										"hover:bg-accent/50 dark:hover:bg-white/10 cursor-pointer",
-										"focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2",
-										"disabled:opacity-50 disabled:cursor-not-allowed"
+										"h-auto w-full justify-start gap-2.5 whitespace-normal px-2.5 py-2 font-normal",
+										"disabled:cursor-not-allowed"
 									)}
 								>
 									<div className="size-7 rounded-md shrink-0 grid place-items-center bg-muted dark:bg-white/5">
-										<Earth className="size-4 block text-muted-foreground" />
+										<Earth className="block text-muted-foreground" />
 									</div>
 									<div className="flex-1 text-left min-w-0">
 										<div className="flex items-center gap-1.5">
@@ -258,7 +257,7 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 											Creates a shareable snapshot of this chat
 										</p>
 									</div>
-								</button>
+								</Button>
 							</>
 						)}
 					</div>
diff --git a/surfsense_web/components/new-chat/document-mention-picker.tsx b/surfsense_web/components/new-chat/document-mention-picker.tsx
index 9c6521f31b..c3b9072665 100644
--- a/surfsense_web/components/new-chat/document-mention-picker.tsx
+++ b/surfsense_web/components/new-chat/document-mention-picker.tsx
@@ -1,6 +1,8 @@
 "use client";
 
+import { useQuery as useZeroQuery } from "@rocicorp/zero/react";
 import { keepPreviousData, useQuery } from "@tanstack/react-query";
+import { Folder as FolderIcon } from "lucide-react";
 import {
 	forwardRef,
 	useCallback,
@@ -11,11 +13,18 @@ import {
 	useRef,
 	useState,
 } from "react";
+import {
+	FOLDER_MENTION_DOCUMENT_TYPE,
+	type MentionedDocumentInfo,
+} from "@/atoms/chat/mentioned-documents.atom";
+import { Button } from "@/components/ui/button";
 import { Skeleton } from "@/components/ui/skeleton";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
 import type { Document, SearchDocumentTitlesResponse } from "@/contracts/types/document.types";
 import { documentsApiService } from "@/lib/apis/documents-api.service";
+import { getMentionDocKey } from "@/lib/chat/mention-doc-key";
 import { cn } from "@/lib/utils";
+import { queries } from "@/zero/queries";
 
 export interface DocumentMentionPickerRef {
 	selectHighlighted: () => void;
@@ -25,12 +34,10 @@ export interface DocumentMentionPickerRef {
 
 interface DocumentMentionPickerProps {
 	searchSpaceId: number;
-	onSelectionChange: (documents: Pick<Document, "id" | "title" | "document_type">[]) => void;
+	onSelectionChange: (mentions: MentionedDocumentInfo[]) => void;
 	onDone: () => void;
-	initialSelectedDocuments?: Pick<Document, "id" | "title" | "document_type">[];
+	initialSelectedDocuments?: MentionedDocumentInfo[];
 	externalSearch?: string;
-	/** Positioning styles for the container */
-	containerStyle?: React.CSSProperties;
 }
 
 const PAGE_SIZE = 20;
@@ -69,14 +76,7 @@ export const DocumentMentionPicker = forwardRef<
 	DocumentMentionPickerRef,
 	DocumentMentionPickerProps
 >(function DocumentMentionPicker(
-	{
-		searchSpaceId,
-		onSelectionChange,
-		onDone,
-		initialSelectedDocuments = [],
-		externalSearch = "",
-		containerStyle,
-	},
+	{ searchSpaceId, onSelectionChange, onDone, initialSelectedDocuments = [], externalSearch = "" },
 	ref
 ) {
 	// Debounced search value to minimize API calls and prevent race conditions
@@ -98,6 +98,11 @@ export const DocumentMentionPicker = forwardRef<
 	const [hasMore, setHasMore] = useState(false);
 	const [isLoadingMore, setIsLoadingMore] = useState(false);
 
+	// Folders for this search space — pulled from Zero so the picker
+	// stays consistent with the documents sidebar (same source of
+	// truth, automatic updates on rename/delete).
+	const [zeroFolders] = useZeroQuery(queries.folders.bySpace({ searchSpaceId }));
+
 	/**
 	 * Search Strategy:
 	 * - Single character (length === 1): Client-side filtering for instant results
@@ -276,21 +281,48 @@ export const DocumentMentionPicker = forwardRef<
 		[actualDocuments]
 	);
 
-	// Track selected documents with composite key (document_type:id) to prevent cross-type ID collisions
+	// Folder mention candidates filtered by the current search term.
+	// Single-char and server-search both use the same client filter
+	// — folder counts in a workspace are tiny compared to docs, so we
+	// don't need a paged endpoint. Empty search shows all folders.
+	const folderMentions: MentionedDocumentInfo[] = useMemo(() => {
+		const all = (zeroFolders ?? []).map((f) => ({
+			id: f.id,
+			title: f.name,
+			document_type: FOLDER_MENTION_DOCUMENT_TYPE,
+			kind: "folder" as const,
+		}));
+		if (!shouldSearch) return all;
+		const needle = (isSingleCharSearch ? deferredSearch : debouncedSearch).trim().toLowerCase();
+		if (!needle) return all;
+		return all.filter((f) => f.title.toLowerCase().includes(needle));
+	}, [zeroFolders, debouncedSearch, deferredSearch, isSingleCharSearch, shouldSearch]);
+
+	// Doc-shape entries reuse their ``document_type`` discriminator;
+	// folder entries lift the existing kind-aware key so the same
+	// matchers used by the chip atom apply unchanged.
 	const selectedKeys = useMemo(
-		() => new Set(initialSelectedDocuments.map((d) => `${d.document_type}:${d.id}`)),
+		() => new Set(initialSelectedDocuments.map((d) => getMentionDocKey(d))),
 		[initialSelectedDocuments]
 	);
 
-	// Exclude already-selected documents from keyboard navigation
-	const selectableDocuments = useMemo(
-		() => actualDocuments.filter((doc) => !selectedKeys.has(`${doc.document_type}:${doc.id}`)),
-		[actualDocuments, selectedKeys]
-	);
-
-	const handleSelectDocument = useCallback(
-		(doc: Pick<Document, "id" | "title" | "document_type">) => {
-			onSelectionChange([...initialSelectedDocuments, doc]);
+	// Combined navigation order: SurfSense docs -> User docs -> Folders.
+	// Mirrors the on-screen ordering so keyboard arrows match what the
+	// user sees.
+	const selectableMentions = useMemo<MentionedDocumentInfo[]>(() => {
+		const docs: MentionedDocumentInfo[] = actualDocuments.map((doc) => ({
+			id: doc.id,
+			title: doc.title,
+			document_type: doc.document_type,
+			kind: "doc" as const,
+		}));
+		const ordered = [...docs, ...folderMentions];
+		return ordered.filter((m) => !selectedKeys.has(getMentionDocKey(m)));
+	}, [actualDocuments, folderMentions, selectedKeys]);
+
+	const handleSelectMention = useCallback(
+		(mention: MentionedDocumentInfo) => {
+			onSelectionChange([...initialSelectedDocuments, mention]);
 			onDone();
 		},
 		[initialSelectedDocuments, onSelectionChange, onDone]
@@ -347,42 +379,42 @@ export const DocumentMentionPicker = forwardRef<
 		ref,
 		() => ({
 			selectHighlighted: () => {
-				if (selectableDocuments[highlightedIndex]) {
-					handleSelectDocument(selectableDocuments[highlightedIndex]);
+				if (selectableMentions[highlightedIndex]) {
+					handleSelectMention(selectableMentions[highlightedIndex]);
 				}
 			},
 			moveUp: () => {
 				shouldScrollRef.current = true;
-				setHighlightedIndex((prev) => (prev > 0 ? prev - 1 : selectableDocuments.length - 1));
+				setHighlightedIndex((prev) => (prev > 0 ? prev - 1 : selectableMentions.length - 1));
 			},
 			moveDown: () => {
 				shouldScrollRef.current = true;
-				setHighlightedIndex((prev) => (prev < selectableDocuments.length - 1 ? prev + 1 : 0));
+				setHighlightedIndex((prev) => (prev < selectableMentions.length - 1 ? prev + 1 : 0));
 			},
 		}),
-		[selectableDocuments, highlightedIndex, handleSelectDocument]
+		[selectableMentions, highlightedIndex, handleSelectMention]
 	);
 
 	// Keyboard navigation handler for arrow keys, Enter, and Escape
 	const handleKeyDown = useCallback(
 		(e: React.KeyboardEvent) => {
-			if (selectableDocuments.length === 0) return;
+			if (selectableMentions.length === 0) return;
 
 			switch (e.key) {
 				case "ArrowDown":
 					e.preventDefault();
 					shouldScrollRef.current = true;
-					setHighlightedIndex((prev) => (prev < selectableDocuments.length - 1 ? prev + 1 : 0));
+					setHighlightedIndex((prev) => (prev < selectableMentions.length - 1 ? prev + 1 : 0));
 					break;
 				case "ArrowUp":
 					e.preventDefault();
 					shouldScrollRef.current = true;
-					setHighlightedIndex((prev) => (prev > 0 ? prev - 1 : selectableDocuments.length - 1));
+					setHighlightedIndex((prev) => (prev > 0 ? prev - 1 : selectableMentions.length - 1));
 					break;
 				case "Enter":
 					e.preventDefault();
-					if (selectableDocuments[highlightedIndex]) {
-						handleSelectDocument(selectableDocuments[highlightedIndex]);
+					if (selectableMentions[highlightedIndex]) {
+						handleSelectMention(selectableMentions[highlightedIndex]);
 					}
 					break;
 				case "Escape":
@@ -391,22 +423,12 @@ export const DocumentMentionPicker = forwardRef<
 					break;
 			}
 		},
-		[selectableDocuments, highlightedIndex, handleSelectDocument, onDone]
+		[selectableMentions, highlightedIndex, handleSelectMention, onDone]
 	);
 
-	// Hide popup when there are no documents to display (regardless of fetch state)
-	// Search continues in background; popup reappears when results arrive
-	if (!actualLoading && actualDocuments.length === 0) {
-		return null;
-	}
-
 	return (
 		<div
-			className="fixed shadow-2xl rounded-lg border border-border dark:border-white/5 overflow-hidden bg-popover dark:bg-neutral-900 flex flex-col w-[280px] sm:w-[320px] select-none"
-			style={{
-				zIndex: 9999,
-				...containerStyle,
-			}}
+			className="shadow-2xl rounded-lg overflow-hidden bg-popover text-popover-foreground flex flex-col w-[280px] sm:w-[320px] select-none"
 			onKeyDown={handleKeyDown}
 			role="listbox"
 			tabIndex={-1}
@@ -439,7 +461,7 @@ export const DocumentMentionPicker = forwardRef<
 							</div>
 						))}
 					</div>
-				) : actualDocuments.length > 0 ? (
+				) : actualDocuments.length > 0 || folderMentions.length > 0 ? (
 					<div className="py-1 px-2">
 						{/* SurfSense Documentation */}
 						{surfsenseDocsList.length > 0 && (
@@ -448,15 +470,21 @@ export const DocumentMentionPicker = forwardRef<
 									SurfSense Docs
 								</div>
 								{surfsenseDocsList.map((doc) => {
-									const docKey = `${doc.document_type}:${doc.id}`;
+									const mention: MentionedDocumentInfo = {
+										id: doc.id,
+										title: doc.title,
+										document_type: doc.document_type,
+										kind: "doc",
+									};
+									const docKey = getMentionDocKey(mention);
 									const isAlreadySelected = selectedKeys.has(docKey);
-									const selectableIndex = selectableDocuments.findIndex(
-										(d) => d.document_type === doc.document_type && d.id === doc.id
+									const selectableIndex = selectableMentions.findIndex(
+										(m) => getMentionDocKey(m) === docKey
 									);
 									const isHighlighted = !isAlreadySelected && selectableIndex === highlightedIndex;
 
 									return (
-										<button
+										<Button
 											key={docKey}
 											ref={(el) => {
 												if (el && selectableIndex >= 0) {
@@ -464,7 +492,8 @@ export const DocumentMentionPicker = forwardRef<
 												}
 											}}
 											type="button"
-											onClick={() => !isAlreadySelected && handleSelectDocument(doc)}
+											variant="ghost"
+											onClick={() => !isAlreadySelected && handleSelectMention(mention)}
 											onMouseEnter={() => {
 												if (!isAlreadySelected && selectableIndex >= 0) {
 													setHighlightedIndex(selectableIndex);
@@ -472,9 +501,9 @@ export const DocumentMentionPicker = forwardRef<
 											}}
 											disabled={isAlreadySelected}
 											className={cn(
-												"w-full flex items-center gap-2 px-3 py-2 text-left transition-colors rounded-md",
+												"h-auto w-full justify-start gap-2 px-3 py-2 text-left transition-colors",
 												isAlreadySelected ? "opacity-50 cursor-not-allowed" : "cursor-pointer",
-												isHighlighted && "bg-accent"
+												isHighlighted && "bg-accent text-accent-foreground"
 											)}
 										>
 											<span className="shrink-0 text-muted-foreground text-sm">
@@ -483,7 +512,7 @@ export const DocumentMentionPicker = forwardRef<
 											<span className="flex-1 text-sm truncate" title={doc.title}>
 												{doc.title}
 											</span>
-										</button>
+										</Button>
 									);
 								})}
 							</>
@@ -493,21 +522,27 @@ export const DocumentMentionPicker = forwardRef<
 						{userDocsList.length > 0 && (
 							<>
 								{surfsenseDocsList.length > 0 && (
-									<div className="mx-2 my-4 border-t border-border dark:border-white/5" />
+									<div className="mx-2 my-4 border-t border-popover-border" />
 								)}
 								<div className="px-3 py-2 text-xs font-bold text-muted-foreground/55">
 									Your Documents
 								</div>
 								{userDocsList.map((doc) => {
-									const docKey = `${doc.document_type}:${doc.id}`;
+									const mention: MentionedDocumentInfo = {
+										id: doc.id,
+										title: doc.title,
+										document_type: doc.document_type,
+										kind: "doc",
+									};
+									const docKey = getMentionDocKey(mention);
 									const isAlreadySelected = selectedKeys.has(docKey);
-									const selectableIndex = selectableDocuments.findIndex(
-										(d) => d.document_type === doc.document_type && d.id === doc.id
+									const selectableIndex = selectableMentions.findIndex(
+										(m) => getMentionDocKey(m) === docKey
 									);
 									const isHighlighted = !isAlreadySelected && selectableIndex === highlightedIndex;
 
 									return (
-										<button
+										<Button
 											key={docKey}
 											ref={(el) => {
 												if (el && selectableIndex >= 0) {
@@ -515,7 +550,8 @@ export const DocumentMentionPicker = forwardRef<
 												}
 											}}
 											type="button"
-											onClick={() => !isAlreadySelected && handleSelectDocument(doc)}
+											variant="ghost"
+											onClick={() => !isAlreadySelected && handleSelectMention(mention)}
 											onMouseEnter={() => {
 												if (!isAlreadySelected && selectableIndex >= 0) {
 													setHighlightedIndex(selectableIndex);
@@ -523,9 +559,9 @@ export const DocumentMentionPicker = forwardRef<
 											}}
 											disabled={isAlreadySelected}
 											className={cn(
-												"w-full flex items-center gap-2 px-3 py-2 text-left transition-colors rounded-md",
+												"h-auto w-full justify-start gap-2 px-3 py-2 text-left transition-colors",
 												isAlreadySelected ? "opacity-50 cursor-not-allowed" : "cursor-pointer",
-												isHighlighted && "bg-accent"
+												isHighlighted && "bg-accent text-accent-foreground"
 											)}
 										>
 											<span className="shrink-0 text-muted-foreground text-sm">
@@ -534,7 +570,60 @@ export const DocumentMentionPicker = forwardRef<
 											<span className="flex-1 text-sm truncate" title={doc.title}>
 												{doc.title}
 											</span>
-										</button>
+										</Button>
+									);
+								})}
+							</>
+						)}
+
+						{/* Folders — single source of truth is Zero (same store
+						    that powers the documents sidebar). Selecting a
+						    folder inserts a folder chip whose path the agent
+						    can walk with ``ls`` / ``find_documents``. */}
+						{folderMentions.length > 0 && (
+							<>
+								{(surfsenseDocsList.length > 0 || userDocsList.length > 0) && (
+									<div className="mx-2 my-4 border-t border-popover-border" />
+								)}
+								<div className="px-3 py-2 text-xs font-bold text-muted-foreground/55">Folders</div>
+								{folderMentions.map((folder) => {
+									const folderKey = getMentionDocKey(folder);
+									const isAlreadySelected = selectedKeys.has(folderKey);
+									const selectableIndex = selectableMentions.findIndex(
+										(m) => getMentionDocKey(m) === folderKey
+									);
+									const isHighlighted = !isAlreadySelected && selectableIndex === highlightedIndex;
+
+									return (
+										<Button
+											key={folderKey}
+											ref={(el) => {
+												if (el && selectableIndex >= 0) {
+													itemRefs.current.set(selectableIndex, el);
+												}
+											}}
+											type="button"
+											variant="ghost"
+											onClick={() => !isAlreadySelected && handleSelectMention(folder)}
+											onMouseEnter={() => {
+												if (!isAlreadySelected && selectableIndex >= 0) {
+													setHighlightedIndex(selectableIndex);
+												}
+											}}
+											disabled={isAlreadySelected}
+											className={cn(
+												"h-auto w-full justify-start gap-2 px-3 py-2 text-left transition-colors",
+												isAlreadySelected ? "opacity-50 cursor-not-allowed" : "cursor-pointer",
+												isHighlighted && "bg-accent text-accent-foreground"
+											)}
+										>
+											<span className="shrink-0 text-muted-foreground text-sm">
+												<FolderIcon className="h-4 w-4" />
+											</span>
+											<span className="flex-1 text-sm truncate" title={folder.title}>
+												{folder.title}
+											</span>
+										</Button>
 									);
 								})}
 							</>
@@ -547,7 +636,11 @@ export const DocumentMentionPicker = forwardRef<
 							</div>
 						)}
 					</div>
-				) : null}
+				) : (
+					<div className="py-1 px-2">
+						<p className="px-3 py-2 text-xs text-muted-foreground">No matching documents</p>
+					</div>
+				)}
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/components/new-chat/model-selector.tsx b/surfsense_web/components/new-chat/model-selector.tsx
index ec5bf67602..0a096f5f82 100644
--- a/surfsense_web/components/new-chat/model-selector.tsx
+++ b/surfsense_web/components/new-chat/model-selector.tsx
@@ -1,9 +1,25 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import { Bot, Check, ChevronDown, Edit3, Eye, ImageIcon, Plus, Search, Zap } from "lucide-react";
-import { type UIEvent, useCallback, useMemo, useState } from "react";
+import {
+	Bot,
+	Check,
+	ChevronDown,
+	ChevronLeft,
+	ChevronRight,
+	ChevronUp,
+	ImageIcon,
+	Layers,
+	Pencil,
+	Plus,
+	ScanEye,
+	Search,
+	Zap,
+} from "lucide-react";
+import type React from "react";
+import { Fragment, useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { toast } from "sonner";
+import { pendingUserImageDataUrlsAtom } from "@/atoms/chat/pending-user-images.atom";
 import {
 	globalImageGenConfigsAtom,
 	imageGenConfigsAtom,
@@ -22,17 +38,16 @@ import {
 import { Badge } from "@/components/ui/badge";
 import { Button } from "@/components/ui/button";
 import {
-	Command,
-	CommandEmpty,
-	CommandGroup,
-	CommandInput,
-	CommandItem,
-	CommandList,
-	CommandSeparator,
-} from "@/components/ui/command";
+	Drawer,
+	DrawerContent,
+	DrawerHandle,
+	DrawerHeader,
+	DrawerTitle,
+	DrawerTrigger,
+} from "@/components/ui/drawer";
 import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
 import { Spinner } from "@/components/ui/spinner";
-import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import type {
 	GlobalImageGenConfig,
 	GlobalNewLLMConfig,
@@ -41,16 +56,283 @@ import type {
 	NewLLMConfigPublic,
 	VisionLLMConfig,
 } from "@/contracts/types/new-llm-config.types";
+import { useIsMobile } from "@/hooks/use-mobile";
 import { getProviderIcon } from "@/lib/provider-icons";
 import { cn } from "@/lib/utils";
 
+// ─── Helpers ────────────────────────────────────────────────────────
+
+const PROVIDER_NAMES: Record<string, string> = {
+	OPENAI: "OpenAI",
+	ANTHROPIC: "Anthropic",
+	GOOGLE: "Google",
+	AZURE: "Azure",
+	AZURE_OPENAI: "Azure OpenAI",
+	AWS_BEDROCK: "AWS Bedrock",
+	BEDROCK: "Bedrock",
+	DEEPSEEK: "DeepSeek",
+	MISTRAL: "Mistral",
+	COHERE: "Cohere",
+	GITHUB_MODELS: "GitHub Models",
+	GROQ: "Groq",
+	OLLAMA: "Ollama",
+	TOGETHER_AI: "Together AI",
+	FIREWORKS_AI: "Fireworks AI",
+	REPLICATE: "Replicate",
+	HUGGINGFACE: "HuggingFace",
+	PERPLEXITY: "Perplexity",
+	XAI: "xAI",
+	OPENROUTER: "OpenRouter",
+	CEREBRAS: "Cerebras",
+	SAMBANOVA: "SambaNova",
+	VERTEX_AI: "Vertex AI",
+	MINIMAX: "MiniMax",
+	MOONSHOT: "Moonshot",
+	ZHIPU: "Zhipu",
+	DEEPINFRA: "DeepInfra",
+	CLOUDFLARE: "Cloudflare",
+	DATABRICKS: "Databricks",
+	NSCALE: "NScale",
+	RECRAFT: "Recraft",
+	XINFERENCE: "XInference",
+	CUSTOM: "Custom",
+	AI21: "AI21",
+	ALIBABA_QWEN: "Qwen",
+	ANYSCALE: "Anyscale",
+	COMETAPI: "CometAPI",
+};
+
+// Provider keys valid per model type, matching backend enums
+// (LiteLLMProvider, ImageGenProvider, VisionProvider in db.py)
+const LLM_PROVIDER_KEYS: string[] = [
+	"OPENAI",
+	"ANTHROPIC",
+	"GOOGLE",
+	"AZURE_OPENAI",
+	"BEDROCK",
+	"VERTEX_AI",
+	"GROQ",
+	"DEEPSEEK",
+	"XAI",
+	"MISTRAL",
+	"COHERE",
+	"OPENROUTER",
+	"TOGETHER_AI",
+	"FIREWORKS_AI",
+	"REPLICATE",
+	"PERPLEXITY",
+	"OLLAMA",
+	"CEREBRAS",
+	"SAMBANOVA",
+	"DEEPINFRA",
+	"AI21",
+	"ALIBABA_QWEN",
+	"MOONSHOT",
+	"ZHIPU",
+	"MINIMAX",
+	"HUGGINGFACE",
+	"CLOUDFLARE",
+	"DATABRICKS",
+	"ANYSCALE",
+	"COMETAPI",
+	"GITHUB_MODELS",
+	"CUSTOM",
+];
+
+const IMAGE_PROVIDER_KEYS: string[] = [
+	"OPENAI",
+	"AZURE_OPENAI",
+	"GOOGLE",
+	"VERTEX_AI",
+	"BEDROCK",
+	"RECRAFT",
+	"OPENROUTER",
+	"XINFERENCE",
+	"NSCALE",
+];
+
+const VISION_PROVIDER_KEYS: string[] = [
+	"OPENAI",
+	"ANTHROPIC",
+	"GOOGLE",
+	"AZURE_OPENAI",
+	"VERTEX_AI",
+	"BEDROCK",
+	"XAI",
+	"OPENROUTER",
+	"OLLAMA",
+	"GROQ",
+	"TOGETHER_AI",
+	"FIREWORKS_AI",
+	"DEEPSEEK",
+	"MISTRAL",
+	"CUSTOM",
+];
+
+const PROVIDER_KEYS_BY_TAB: Record<string, string[]> = {
+	llm: LLM_PROVIDER_KEYS,
+	image: IMAGE_PROVIDER_KEYS,
+	vision: VISION_PROVIDER_KEYS,
+};
+
+function formatProviderName(provider: string): string {
+	const key = provider.toUpperCase();
+	return (
+		PROVIDER_NAMES[key] ??
+		provider.charAt(0).toUpperCase() + provider.slice(1).toLowerCase().replace(/_/g, " ")
+	);
+}
+
+function normalizeText(input: string): string {
+	return input
+		.normalize("NFD")
+		.replace(/\p{Diacritic}/gu, "")
+		.toLowerCase()
+		.replace(/[^a-z0-9]+/g, " ")
+		.trim();
+}
+
+interface ConfigBase {
+	id: number;
+	name: string;
+	model_name: string;
+	provider: string;
+}
+
+function filterAndScore<T extends ConfigBase>(
+	configs: T[],
+	selectedProvider: string,
+	searchQuery: string
+): T[] {
+	let result = configs;
+
+	if (selectedProvider !== "all") {
+		result = result.filter((c) => c.provider.toUpperCase() === selectedProvider);
+	}
+
+	if (!searchQuery.trim()) return result;
+
+	const normalized = normalizeText(searchQuery);
+	const tokens = normalized.split(/\s+/).filter(Boolean);
+
+	const scored = result.map((c) => {
+		const aggregate = normalizeText([c.name, c.model_name, c.provider].join(" "));
+		let score = 0;
+		if (aggregate.includes(normalized)) score += 5;
+		for (const token of tokens) {
+			if (aggregate.includes(token)) score += 1;
+		}
+		return { config: c, score };
+	});
+
+	return scored
+		.filter((s) => s.score > 0)
+		.sort((a, b) => b.score - a.score)
+		.map((s) => s.config);
+}
+
+interface DisplayItem {
+	config: ConfigBase & Record<string, unknown>;
+	isGlobal: boolean;
+	isAutoMode: boolean;
+}
+
+const TruncatedNameWithTooltip: React.FC<{
+	text: string;
+	className?: string;
+	enableTooltip: boolean;
+}> = ({ text, className, enableTooltip }) => {
+	const textRef = useRef<HTMLSpanElement>(null);
+	const openTimerRef = useRef<number | undefined>(undefined);
+	const [isTruncated, setIsTruncated] = useState(false);
+	const [open, setOpen] = useState(false);
+
+	const recalcTruncation = useCallback(() => {
+		const el = textRef.current;
+		if (!el) return;
+		setIsTruncated(el.scrollWidth > el.clientWidth + 1);
+	}, []);
+
+	useEffect(() => {
+		if (!enableTooltip) return;
+		const el = textRef.current;
+		if (!el) return;
+
+		const raf = requestAnimationFrame(recalcTruncation);
+		recalcTruncation();
+
+		const observer = new ResizeObserver(recalcTruncation);
+		observer.observe(el);
+		if (el.parentElement) observer.observe(el.parentElement);
+		window.addEventListener("resize", recalcTruncation);
+
+		return () => {
+			cancelAnimationFrame(raf);
+			observer.disconnect();
+			window.removeEventListener("resize", recalcTruncation);
+		};
+	}, [enableTooltip, recalcTruncation]);
+
+	useEffect(() => {
+		// Recompute when row text changes.
+		void text;
+		requestAnimationFrame(recalcTruncation);
+	}, [text, recalcTruncation]);
+
+	useEffect(
+		() => () => {
+			if (openTimerRef.current) window.clearTimeout(openTimerRef.current);
+		},
+		[]
+	);
+
+	if (!enableTooltip) {
+		return (
+			<span ref={textRef} className={cn("block max-w-full", className)}>
+				{text}
+			</span>
+		);
+	}
+
+	const handleOpenChange = (nextOpen: boolean) => {
+		if (openTimerRef.current) {
+			window.clearTimeout(openTimerRef.current);
+			openTimerRef.current = undefined;
+		}
+		if (!nextOpen) {
+			setOpen(false);
+			return;
+		}
+		if (!isTruncated) return;
+		openTimerRef.current = window.setTimeout(() => {
+			setOpen(true);
+			openTimerRef.current = undefined;
+		}, 220);
+	};
+
+	return (
+		<Tooltip open={open} onOpenChange={handleOpenChange}>
+			<TooltipTrigger asChild>
+				<span ref={textRef} className={cn("block max-w-full", className)}>
+					{text}
+				</span>
+			</TooltipTrigger>
+			<TooltipContent side="top" align="start">
+				{text}
+			</TooltipContent>
+		</Tooltip>
+	);
+};
+
+// ─── Component ──────────────────────────────────────────────────────
+
 interface ModelSelectorProps {
 	onEditLLM: (config: NewLLMConfigPublic | GlobalNewLLMConfig, isGlobal: boolean) => void;
-	onAddNewLLM: () => void;
+	onAddNewLLM: (provider?: string) => void;
 	onEditImage?: (config: ImageGenerationConfig | GlobalImageGenConfig, isGlobal: boolean) => void;
-	onAddNewImage?: () => void;
+	onAddNewImage?: (provider?: string) => void;
 	onEditVision?: (config: VisionLLMConfig | GlobalVisionLLMConfig, isGlobal: boolean) => void;
-	onAddNewVision?: () => void;
+	onAddNewVision?: (provider?: string) => void;
 	className?: string;
 }
 
@@ -65,42 +347,133 @@ export function ModelSelector({
 }: ModelSelectorProps) {
 	const [open, setOpen] = useState(false);
 	const [activeTab, setActiveTab] = useState<"llm" | "image" | "vision">("llm");
-	const [llmSearchQuery, setLlmSearchQuery] = useState("");
-	const [imageSearchQuery, setImageSearchQuery] = useState("");
-	const [visionSearchQuery, setVisionSearchQuery] = useState("");
-	const [llmScrollPos, setLlmScrollPos] = useState<"top" | "middle" | "bottom">("top");
-	const [imageScrollPos, setImageScrollPos] = useState<"top" | "middle" | "bottom">("top");
-	const [visionScrollPos, setVisionScrollPos] = useState<"top" | "middle" | "bottom">("top");
-	const handleListScroll = useCallback(
-		(setter: typeof setLlmScrollPos) => (e: UIEvent<HTMLDivElement>) => {
+	const [searchQuery, setSearchQuery] = useState("");
+	const [selectedProvider, setSelectedProvider] = useState<string>("all");
+	const [focusedIndex, setFocusedIndex] = useState(-1);
+	const [modelScrollPos, setModelScrollPos] = useState<"top" | "middle" | "bottom">("top");
+	const [sidebarScrollPos, setSidebarScrollPos] = useState<"top" | "middle" | "bottom">("top");
+	const providerSidebarRef = useRef<HTMLDivElement>(null);
+	const modelListRef = useRef<HTMLDivElement>(null);
+	const searchInputRef = useRef<HTMLInputElement>(null);
+	const isMobile = useIsMobile();
+
+	const handleOpenChange = useCallback(
+		(next: boolean) => {
+			if (next) {
+				setSearchQuery("");
+				setSelectedProvider("all");
+				if (!isMobile) {
+					requestAnimationFrame(() => searchInputRef.current?.focus());
+				}
+			}
+			setOpen(next);
+		},
+		[isMobile]
+	);
+
+	const handleTabChange = useCallback(
+		(next: "llm" | "image" | "vision") => {
+			setActiveTab(next);
+			setSelectedProvider("all");
+			setSearchQuery("");
+			setFocusedIndex(-1);
+			setModelScrollPos("top");
+			if (open && !isMobile) {
+				requestAnimationFrame(() => searchInputRef.current?.focus());
+			}
+		},
+		[open, isMobile]
+	);
+
+	const handleModelListScroll = useCallback((e: React.UIEvent<HTMLDivElement>) => {
+		const el = e.currentTarget;
+		const atTop = el.scrollTop <= 2;
+		const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2;
+		setModelScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle");
+	}, []);
+
+	const handleSidebarScroll = useCallback(
+		(e: React.UIEvent<HTMLDivElement>) => {
 			const el = e.currentTarget;
-			const atTop = el.scrollTop <= 2;
-			const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2;
-			setter(atTop ? "top" : atBottom ? "bottom" : "middle");
+			if (isMobile) {
+				const atStart = el.scrollLeft <= 2;
+				const atEnd = el.scrollWidth - el.scrollLeft - el.clientWidth <= 2;
+				setSidebarScrollPos(atStart ? "top" : atEnd ? "bottom" : "middle");
+			} else {
+				const atTop = el.scrollTop <= 2;
+				const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2;
+				setSidebarScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle");
+			}
 		},
-		[]
+		[isMobile]
 	);
 
-	// LLM data
+	const scrollProviderSidebar = useCallback(
+		(direction: "backward" | "forward") => {
+			const el = providerSidebarRef.current;
+			if (!el) return;
+			const delta = isMobile
+				? Math.max(56, Math.floor(el.clientWidth * 0.5))
+				: Math.max(44, Math.floor(el.clientHeight * 0.4));
+
+			if (isMobile) {
+				el.scrollBy({
+					left: direction === "backward" ? -delta : delta,
+					behavior: "smooth",
+				});
+				return;
+			}
+
+			el.scrollBy({
+				top: direction === "backward" ? -delta : delta,
+				behavior: "smooth",
+			});
+		},
+		[isMobile]
+	);
+
+	// Cmd/Ctrl+M shortcut (desktop only)
+	useEffect(() => {
+		if (isMobile) return;
+		const handler = (e: KeyboardEvent) => {
+			if ((e.metaKey || e.ctrlKey) && e.key === "m") {
+				e.preventDefault();
+				// setOpen((prev) => !prev);
+				handleOpenChange(!open);
+			}
+		};
+		document.addEventListener("keydown", handler);
+		return () => document.removeEventListener("keydown", handler);
+	}, [isMobile, open, handleOpenChange]);
+
+	// ─── Data ───
 	const { data: llmUserConfigs, isLoading: llmUserLoading } = useAtomValue(newLLMConfigsAtom);
 	const { data: llmGlobalConfigs, isLoading: llmGlobalLoading } =
 		useAtomValue(globalNewLLMConfigsAtom);
 	const { data: preferences, isLoading: prefsLoading } = useAtomValue(llmPreferencesAtom);
 	const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
 	const { mutateAsync: updatePreferences } = useAtomValue(updateLLMPreferencesMutationAtom);
-
-	// Image data
 	const { data: imageGlobalConfigs, isLoading: imageGlobalLoading } =
 		useAtomValue(globalImageGenConfigsAtom);
 	const { data: imageUserConfigs, isLoading: imageUserLoading } = useAtomValue(imageGenConfigsAtom);
-
-	// Vision data
 	const { data: visionGlobalConfigs, isLoading: visionGlobalLoading } = useAtomValue(
 		globalVisionLLMConfigsAtom
 	);
 	const { data: visionUserConfigs, isLoading: visionUserLoading } =
 		useAtomValue(visionLLMConfigsAtom);
 
+	// Pending image attachments on the composer. Used to surface an
+	// amber "No image" hint on chat models the catalog reports as
+	// non-vision (`supports_image_input=false`) when the next message
+	// will carry an image. The hint is purely advisory: selection,
+	// focus, and click handling are unaffected. The backend's safety
+	// net (`is_known_text_only_chat_model`) is the actual block, and
+	// it only fires when LiteLLM *explicitly* marks a model as
+	// text-only — so a model that's secretly capable but hasn't been
+	// annotated will still flow through to the provider.
+	const pendingUserImageUrls = useAtomValue(pendingUserImageDataUrlsAtom);
+	const hasPendingImages = pendingUserImageUrls.length > 0;
+
 	const isLoading =
 		llmUserLoading ||
 		llmGlobalLoading ||
@@ -110,133 +483,176 @@ export function ModelSelector({
 		visionGlobalLoading ||
 		visionUserLoading;
 
-	// ─── LLM current config ───
+	// ─── Current selected configs ───
 	const currentLLMConfig = useMemo(() => {
 		if (!preferences) return null;
-		const agentLlmId = preferences.agent_llm_id;
-		if (agentLlmId === null || agentLlmId === undefined) return null;
-		if (agentLlmId <= 0) {
-			return llmGlobalConfigs?.find((c) => c.id === agentLlmId) ?? null;
-		}
-		return llmUserConfigs?.find((c) => c.id === agentLlmId) ?? null;
+		const id = preferences.agent_llm_id;
+		if (id === null || id === undefined) return null;
+		if (id <= 0) return llmGlobalConfigs?.find((c) => c.id === id) ?? null;
+		return llmUserConfigs?.find((c) => c.id === id) ?? null;
 	}, [preferences, llmGlobalConfigs, llmUserConfigs]);
 
 	const isLLMAutoMode =
 		currentLLMConfig && "is_auto_mode" in currentLLMConfig && currentLLMConfig.is_auto_mode;
 
-	// ─── Image current config ───
 	const currentImageConfig = useMemo(() => {
 		if (!preferences) return null;
 		const id = preferences.image_generation_config_id;
 		if (id === null || id === undefined) return null;
-		const globalMatch = imageGlobalConfigs?.find((c) => c.id === id);
-		if (globalMatch) return globalMatch;
-		return imageUserConfigs?.find((c) => c.id === id) ?? null;
+		return (
+			imageGlobalConfigs?.find((c) => c.id === id) ??
+			imageUserConfigs?.find((c) => c.id === id) ??
+			null
+		);
 	}, [preferences, imageGlobalConfigs, imageUserConfigs]);
 
 	const isImageAutoMode =
 		currentImageConfig && "is_auto_mode" in currentImageConfig && currentImageConfig.is_auto_mode;
 
-	// ─── Vision current config ───
 	const currentVisionConfig = useMemo(() => {
 		if (!preferences) return null;
 		const id = preferences.vision_llm_config_id;
 		if (id === null || id === undefined) return null;
-		const globalMatch = visionGlobalConfigs?.find((c) => c.id === id);
-		if (globalMatch) return globalMatch;
-		return visionUserConfigs?.find((c) => c.id === id) ?? null;
-	}, [preferences, visionGlobalConfigs, visionUserConfigs]);
-
-	const isVisionAutoMode = useMemo(() => {
 		return (
-			currentVisionConfig &&
-			"is_auto_mode" in currentVisionConfig &&
-			currentVisionConfig.is_auto_mode
+			visionGlobalConfigs?.find((c) => c.id === id) ??
+			visionUserConfigs?.find((c) => c.id === id) ??
+			null
 		);
-	}, [currentVisionConfig]);
-
-	// ─── LLM filtering ───
-	const filteredLLMGlobal = useMemo(() => {
-		if (!llmGlobalConfigs) return [];
-		if (!llmSearchQuery) return llmGlobalConfigs;
-		const q = llmSearchQuery.toLowerCase();
-		return llmGlobalConfigs.filter(
-			(c) =>
-				c.name.toLowerCase().includes(q) ||
-				c.model_name.toLowerCase().includes(q) ||
-				c.provider.toLowerCase().includes(q)
-		);
-	}, [llmGlobalConfigs, llmSearchQuery]);
-
-	const filteredLLMUser = useMemo(() => {
-		if (!llmUserConfigs) return [];
-		if (!llmSearchQuery) return llmUserConfigs;
-		const q = llmSearchQuery.toLowerCase();
-		return llmUserConfigs.filter(
-			(c) =>
-				c.name.toLowerCase().includes(q) ||
-				c.model_name.toLowerCase().includes(q) ||
-				c.provider.toLowerCase().includes(q)
-		);
-	}, [llmUserConfigs, llmSearchQuery]);
-
-	const totalLLMModels = (llmGlobalConfigs?.length ?? 0) + (llmUserConfigs?.length ?? 0);
-
-	// ─── Image filtering ───
-	const filteredImageGlobal = useMemo(() => {
-		if (!imageGlobalConfigs) return [];
-		if (!imageSearchQuery) return imageGlobalConfigs;
-		const q = imageSearchQuery.toLowerCase();
-		return imageGlobalConfigs.filter(
-			(c) =>
-				c.name.toLowerCase().includes(q) ||
-				c.model_name.toLowerCase().includes(q) ||
-				c.provider.toLowerCase().includes(q)
-		);
-	}, [imageGlobalConfigs, imageSearchQuery]);
-
-	const filteredImageUser = useMemo(() => {
-		if (!imageUserConfigs) return [];
-		if (!imageSearchQuery) return imageUserConfigs;
-		const q = imageSearchQuery.toLowerCase();
-		return imageUserConfigs.filter(
-			(c) =>
-				c.name.toLowerCase().includes(q) ||
-				c.model_name.toLowerCase().includes(q) ||
-				c.provider.toLowerCase().includes(q)
-		);
-	}, [imageUserConfigs, imageSearchQuery]);
-
-	const totalImageModels = (imageGlobalConfigs?.length ?? 0) + (imageUserConfigs?.length ?? 0);
-
-	// ─── Vision filtering ───
-	const filteredVisionGlobal = useMemo(() => {
-		if (!visionGlobalConfigs) return [];
-		if (!visionSearchQuery) return visionGlobalConfigs;
-		const q = visionSearchQuery.toLowerCase();
-		return visionGlobalConfigs.filter(
-			(c) =>
-				c.name.toLowerCase().includes(q) ||
-				c.model_name.toLowerCase().includes(q) ||
-				c.provider.toLowerCase().includes(q)
-		);
-	}, [visionGlobalConfigs, visionSearchQuery]);
-
-	const filteredVisionUser = useMemo(() => {
-		if (!visionUserConfigs) return [];
-		if (!visionSearchQuery) return visionUserConfigs;
-		const q = visionSearchQuery.toLowerCase();
-		return visionUserConfigs.filter(
-			(c) =>
-				c.name.toLowerCase().includes(q) ||
-				c.model_name.toLowerCase().includes(q) ||
-				c.provider.toLowerCase().includes(q)
-		);
-	}, [visionUserConfigs, visionSearchQuery]);
+	}, [preferences, visionGlobalConfigs, visionUserConfigs]);
+
+	const isVisionAutoMode =
+		currentVisionConfig &&
+		"is_auto_mode" in currentVisionConfig &&
+		currentVisionConfig.is_auto_mode;
+
+	// ─── Filtered configs (separate global / user for section headers) ───
+	const filteredLLMGlobal = useMemo(
+		() => filterAndScore(llmGlobalConfigs ?? [], selectedProvider, searchQuery),
+		[llmGlobalConfigs, selectedProvider, searchQuery]
+	);
+	const filteredLLMUser = useMemo(
+		() => filterAndScore(llmUserConfigs ?? [], selectedProvider, searchQuery),
+		[llmUserConfigs, selectedProvider, searchQuery]
+	);
+	const filteredImageGlobal = useMemo(
+		() => filterAndScore(imageGlobalConfigs ?? [], selectedProvider, searchQuery),
+		[imageGlobalConfigs, selectedProvider, searchQuery]
+	);
+	const filteredImageUser = useMemo(
+		() => filterAndScore(imageUserConfigs ?? [], selectedProvider, searchQuery),
+		[imageUserConfigs, selectedProvider, searchQuery]
+	);
+	const filteredVisionGlobal = useMemo(
+		() => filterAndScore(visionGlobalConfigs ?? [], selectedProvider, searchQuery),
+		[visionGlobalConfigs, selectedProvider, searchQuery]
+	);
+	const filteredVisionUser = useMemo(
+		() => filterAndScore(visionUserConfigs ?? [], selectedProvider, searchQuery),
+		[visionUserConfigs, selectedProvider, searchQuery]
+	);
+
+	// Combined display list for keyboard navigation
+	const currentDisplayItems: DisplayItem[] = useMemo(() => {
+		const toItems = (configs: ConfigBase[], isGlobal: boolean): DisplayItem[] =>
+			configs.map((c) => ({
+				config: c as ConfigBase & Record<string, unknown>,
+				isGlobal,
+				isAutoMode:
+					isGlobal && "is_auto_mode" in c && !!(c as Record<string, unknown>).is_auto_mode,
+			}));
 
-	const totalVisionModels = (visionGlobalConfigs?.length ?? 0) + (visionUserConfigs?.length ?? 0);
+		const sortGlobalItems = (items: DisplayItem[]): DisplayItem[] =>
+			[...items].sort((a, b) => {
+				if (a.isAutoMode !== b.isAutoMode) return a.isAutoMode ? -1 : 1;
+				const aPremium = !!(a.config as Record<string, unknown>).is_premium;
+				const bPremium = !!(b.config as Record<string, unknown>).is_premium;
+				if (aPremium !== bPremium) return aPremium ? 1 : -1;
+				return 0;
+			});
 
-	// ─── Handlers ───
+		switch (activeTab) {
+			case "llm":
+				return [
+					...sortGlobalItems(toItems(filteredLLMGlobal, true)),
+					...toItems(filteredLLMUser, false),
+				];
+			case "image":
+				return [
+					...sortGlobalItems(toItems(filteredImageGlobal, true)),
+					...toItems(filteredImageUser, false),
+				];
+			case "vision":
+				return [
+					...sortGlobalItems(toItems(filteredVisionGlobal, true)),
+					...toItems(filteredVisionUser, false),
+				];
+		}
+	}, [
+		activeTab,
+		filteredLLMGlobal,
+		filteredLLMUser,
+		filteredImageGlobal,
+		filteredImageUser,
+		filteredVisionGlobal,
+		filteredVisionUser,
+	]);
+
+	// ─── Provider sidebar data ───
+	// Collect which providers actually have configured models for the active tab
+	const configuredProviderSet = useMemo(() => {
+		const configs =
+			activeTab === "llm"
+				? [...(llmGlobalConfigs ?? []), ...(llmUserConfigs ?? [])]
+				: activeTab === "image"
+					? [...(imageGlobalConfigs ?? []), ...(imageUserConfigs ?? [])]
+					: [...(visionGlobalConfigs ?? []), ...(visionUserConfigs ?? [])];
+		const set = new Set<string>();
+		for (const c of configs) {
+			if (c.provider) set.add(c.provider.toUpperCase());
+		}
+		return set;
+	}, [
+		activeTab,
+		llmGlobalConfigs,
+		llmUserConfigs,
+		imageGlobalConfigs,
+		imageUserConfigs,
+		visionGlobalConfigs,
+		visionUserConfigs,
+	]);
+
+	// Show only providers valid for the active tab; configured ones first
+	const activeProviders = useMemo(() => {
+		const tabKeys = PROVIDER_KEYS_BY_TAB[activeTab] ?? LLM_PROVIDER_KEYS;
+		const configured = tabKeys.filter((p) => configuredProviderSet.has(p));
+		const unconfigured = tabKeys.filter((p) => !configuredProviderSet.has(p));
+		return ["all", ...configured, ...unconfigured];
+	}, [activeTab, configuredProviderSet]);
+
+	const providerModelCounts = useMemo(() => {
+		const allConfigs =
+			activeTab === "llm"
+				? [...(llmGlobalConfigs ?? []), ...(llmUserConfigs ?? [])]
+				: activeTab === "image"
+					? [...(imageGlobalConfigs ?? []), ...(imageUserConfigs ?? [])]
+					: [...(visionGlobalConfigs ?? []), ...(visionUserConfigs ?? [])];
+		const counts: Record<string, number> = { all: allConfigs.length };
+		for (const c of allConfigs) {
+			const p = c.provider.toUpperCase();
+			counts[p] = (counts[p] || 0) + 1;
+		}
+		return counts;
+	}, [
+		activeTab,
+		llmGlobalConfigs,
+		llmUserConfigs,
+		imageGlobalConfigs,
+		imageUserConfigs,
+		visionGlobalConfigs,
+		visionUserConfigs,
+	]);
+
+	// ─── Selection handlers ───
 	const handleSelectLLM = useCallback(
 		async (config: NewLLMConfigPublic | GlobalNewLLMConfig) => {
 			if (currentLLMConfig?.id === config.id) {
@@ -254,23 +670,13 @@ export function ModelSelector({
 				});
 				toast.success(`Switched to ${config.name}`);
 				setOpen(false);
-			} catch (error) {
-				console.error("Failed to switch model:", error);
+			} catch {
 				toast.error("Failed to switch model");
 			}
 		},
 		[currentLLMConfig, searchSpaceId, updatePreferences]
 	);
 
-	const handleEditLLMConfig = useCallback(
-		(e: React.MouseEvent, config: NewLLMConfigPublic | GlobalNewLLMConfig, isGlobal: boolean) => {
-			e.stopPropagation();
-			onEditLLM(config, isGlobal);
-			setOpen(false);
-		},
-		[onEditLLM]
-	);
-
 	const handleSelectImage = useCallback(
 		async (configId: number) => {
 			if (currentImageConfig?.id === configId) {
@@ -319,646 +725,724 @@ export function ModelSelector({
 		[currentVisionConfig, searchSpaceId, updatePreferences]
 	);
 
-	return (
-		<Popover open={open} onOpenChange={setOpen}>
-			<PopoverTrigger asChild>
-				<Button
-					variant="ghost"
-					size="sm"
-					role="combobox"
-					aria-expanded={open}
+	const handleSelectItem = useCallback(
+		(item: DisplayItem) => {
+			switch (activeTab) {
+				case "llm":
+					handleSelectLLM(item.config as NewLLMConfigPublic | GlobalNewLLMConfig);
+					break;
+				case "image":
+					handleSelectImage(item.config.id);
+					break;
+				case "vision":
+					handleSelectVision(item.config.id);
+					break;
+			}
+		},
+		[activeTab, handleSelectLLM, handleSelectImage, handleSelectVision]
+	);
+
+	const handleEditItem = useCallback(
+		(e: React.MouseEvent, item: DisplayItem) => {
+			e.stopPropagation();
+			setOpen(false);
+			switch (activeTab) {
+				case "llm":
+					onEditLLM(item.config as NewLLMConfigPublic | GlobalNewLLMConfig, item.isGlobal);
+					break;
+				case "image":
+					onEditImage?.(item.config as ImageGenerationConfig | GlobalImageGenConfig, item.isGlobal);
+					break;
+				case "vision":
+					onEditVision?.(item.config as VisionLLMConfig | GlobalVisionLLMConfig, item.isGlobal);
+					break;
+			}
+		},
+		[activeTab, onEditLLM, onEditImage, onEditVision]
+	);
+
+	// ─── Keyboard navigation ───
+	// biome-ignore lint/correctness/useExhaustiveDependencies: searchQuery and selectedProvider are intentional triggers to reset focus
+	useEffect(() => {
+		setFocusedIndex(-1);
+	}, [searchQuery, selectedProvider]);
+
+	useEffect(() => {
+		if (focusedIndex < 0 || !modelListRef.current) return;
+		const items = modelListRef.current.querySelectorAll("[data-model-index]");
+		items[focusedIndex]?.scrollIntoView({
+			block: "nearest",
+			behavior: "smooth",
+		});
+	}, [focusedIndex]);
+
+	const handleKeyDown = useCallback(
+		(e: React.KeyboardEvent<HTMLInputElement>) => {
+			const count = currentDisplayItems.length;
+
+			// Arrow Left/Right cycle provider filters
+			if (e.key === "ArrowLeft" || e.key === "ArrowRight") {
+				e.preventDefault();
+				const providers = activeProviders;
+				const idx = providers.indexOf(selectedProvider);
+				let next: number;
+				if (e.key === "ArrowLeft") {
+					next = idx > 0 ? idx - 1 : providers.length - 1;
+				} else {
+					next = idx < providers.length - 1 ? idx + 1 : 0;
+				}
+				setSelectedProvider(providers[next]);
+				if (providerSidebarRef.current) {
+					const buttons = providerSidebarRef.current.querySelectorAll("button");
+					buttons[next]?.scrollIntoView({
+						block: "nearest",
+						inline: "nearest",
+						behavior: "smooth",
+					});
+				}
+				return;
+			}
+
+			if (count === 0) return;
+
+			switch (e.key) {
+				case "ArrowDown":
+					e.preventDefault();
+					setFocusedIndex((prev) => (prev < count - 1 ? prev + 1 : 0));
+					break;
+				case "ArrowUp":
+					e.preventDefault();
+					setFocusedIndex((prev) => (prev > 0 ? prev - 1 : count - 1));
+					break;
+				case "Enter":
+					e.preventDefault();
+					if (focusedIndex >= 0 && focusedIndex < count) {
+						handleSelectItem(currentDisplayItems[focusedIndex]);
+					}
+					break;
+				case "Home":
+					e.preventDefault();
+					setFocusedIndex(0);
+					break;
+				case "End":
+					e.preventDefault();
+					setFocusedIndex(count - 1);
+					break;
+			}
+		},
+		[currentDisplayItems, focusedIndex, activeProviders, selectedProvider, handleSelectItem]
+	);
+
+	// ─── Render: Provider sidebar ───
+	const renderProviderSidebar = () => {
+		const configuredCount = configuredProviderSet.size;
+
+		return (
+			<div
+				className={cn(
+					"shrink-0 border-popover-border flex relative",
+					isMobile ? "flex-row items-center border-b" : "flex-col w-10 border-r"
+				)}
+			>
+				{!isMobile && (
+					<div
+						className={cn(
+							"absolute top-0 left-0 right-0 z-10 h-5 flex items-center justify-center transition-all duration-200 ease-out",
+							sidebarScrollPos === "top"
+								? "opacity-0 -translate-y-1 pointer-events-none"
+								: "opacity-100 translate-y-0 pointer-events-auto"
+						)}
+					>
+						<Button
+							type="button"
+							variant="ghost"
+							aria-label="Scroll providers up"
+							onClick={() => scrollProviderSidebar("backward")}
+							className="h-4 w-4 rounded-sm p-0 text-muted-foreground/90 hover:bg-accent hover:text-accent-foreground"
+						>
+							<ChevronUp className="size-3" />
+						</Button>
+					</div>
+				)}
+				{isMobile && (
+					<div
+						className={cn(
+							"absolute left-0 top-0 bottom-0 z-10 w-5 flex items-center justify-center transition-all duration-200 ease-out pointer-events-none",
+							sidebarScrollPos === "top" ? "opacity-0 -translate-x-1" : "opacity-100 translate-x-0"
+						)}
+					>
+						<ChevronLeft className="size-3 text-muted-foreground" />
+					</div>
+				)}
+				<div
+					ref={providerSidebarRef}
+					onScroll={handleSidebarScroll}
 					className={cn(
-						"h-8 gap-2 px-3 text-sm bg-main-panel hover:bg-accent/50 dark:hover:bg-white/[0.06] border border-border/40 select-none",
-						className
+						isMobile
+							? "flex flex-row gap-0.5 px-1 py-1.5 overflow-x-auto [&::-webkit-scrollbar]:h-0 [&::-webkit-scrollbar-track]:bg-transparent"
+							: "flex flex-col gap-0.5 p-1 overflow-y-auto flex-1 [&::-webkit-scrollbar]:w-0 [&::-webkit-scrollbar-track]:bg-transparent"
 					)}
+					style={
+						isMobile
+							? {
+									maskImage: `linear-gradient(to right, ${sidebarScrollPos === "top" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${sidebarScrollPos === "bottom" ? "black" : "transparent"})`,
+									WebkitMaskImage: `linear-gradient(to right, ${sidebarScrollPos === "top" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${sidebarScrollPos === "bottom" ? "black" : "transparent"})`,
+								}
+							: {
+									maskImage: `linear-gradient(to bottom, ${sidebarScrollPos === "top" ? "black" : "transparent"}, black 32px, black calc(100% - 32px), ${sidebarScrollPos === "bottom" ? "black" : "transparent"})`,
+									WebkitMaskImage: `linear-gradient(to bottom, ${sidebarScrollPos === "top" ? "black" : "transparent"}, black 32px, black calc(100% - 32px), ${sidebarScrollPos === "bottom" ? "black" : "transparent"})`,
+								}
+					}
 				>
-					{isLoading ? (
-						<>
-							<Spinner size="sm" className="text-muted-foreground" />
-							<span className="text-muted-foreground hidden md:inline">Loading</span>
-						</>
-					) : (
-						<>
-							{/* LLM section */}
-							{currentLLMConfig ? (
-								<>
-									{getProviderIcon(currentLLMConfig.provider, {
-										isAutoMode: isLLMAutoMode ?? false,
-									})}
-									<span className="max-w-[100px] md:max-w-[120px] truncate hidden md:inline">
-										{currentLLMConfig.name}
-									</span>
-								</>
-							) : (
-								<>
-									<Bot className="size-4 text-muted-foreground" />
-									<span className="text-muted-foreground hidden md:inline">Select Model</span>
-								</>
-							)}
+					{activeProviders.map((provider, idx) => {
+						const isAll = provider === "all";
+						const isActive = selectedProvider === provider;
+						const count = providerModelCounts[provider] || 0;
+						const isConfigured = isAll || configuredProviderSet.has(provider);
 
-							{/* Divider */}
-							<div className="h-4 w-px bg-border/60 dark:bg-white/10 mx-0.5" />
+						// Separator between configured and unconfigured providers
+						// idx 0 is "all", configured run from 1..configuredCount, unconfigured start at configuredCount+1
+						const showSeparator = !isAll && idx === configuredCount + 1 && configuredCount > 0;
 
-							{/* Image section */}
-							{currentImageConfig ? (
-								<>
-									{getProviderIcon(currentImageConfig.provider, {
-										isAutoMode: isImageAutoMode ?? false,
-									})}
-									<span className="max-w-[80px] md:max-w-[100px] truncate hidden md:inline">
-										{currentImageConfig.name}
-									</span>
-								</>
-							) : (
-								<ImageIcon className="size-4 text-muted-foreground" />
-							)}
+						return (
+							<Fragment key={provider}>
+								{showSeparator &&
+									(isMobile ? (
+										<div className="w-px h-5 bg-popover-border shrink-0 self-center mx-0.5" />
+									) : (
+										<div className="h-px w-5 bg-popover-border mx-auto my-0.5" />
+									))}
+								<Tooltip>
+									<TooltipTrigger asChild>
+										<Button
+											type="button"
+											variant="ghost"
+											onClick={() => setSelectedProvider(provider)}
+											tabIndex={-1}
+											className={cn(
+												"relative h-auto rounded-md transition-all duration-150",
+												isMobile ? "p-2 shrink-0" : "p-1.5 w-full",
+												isActive
+													? "bg-primary/10 text-primary"
+													: isConfigured
+														? "hover:bg-accent text-muted-foreground hover:text-accent-foreground"
+														: "opacity-50 hover:opacity-80 hover:bg-accent/40 text-muted-foreground"
+											)}
+										>
+											{isAll ? (
+												<Layers className="size-4" />
+											) : (
+												getProviderIcon(provider, {
+													className: "size-4",
+												})
+											)}
+										</Button>
+									</TooltipTrigger>
+									<TooltipContent side={isMobile ? "bottom" : "right"}>
+										{isAll ? "All Models" : formatProviderName(provider)}
+										{isConfigured ? ` (${count})` : " (not configured)"}
+									</TooltipContent>
+								</Tooltip>
+							</Fragment>
+						);
+					})}
+				</div>
+				{!isMobile && (
+					<div
+						className={cn(
+							"absolute bottom-0 left-0 right-0 z-10 h-5 flex items-center justify-center transition-all duration-200 ease-out",
+							sidebarScrollPos === "bottom"
+								? "opacity-0 translate-y-1 pointer-events-none"
+								: "opacity-100 translate-y-0 pointer-events-auto"
+						)}
+					>
+						<Button
+							type="button"
+							variant="ghost"
+							aria-label="Scroll providers down"
+							onClick={() => scrollProviderSidebar("forward")}
+							className="h-4 w-4 rounded-sm p-0 text-muted-foreground/90 hover:bg-accent hover:text-accent-foreground"
+						>
+							<ChevronDown className="size-3" />
+						</Button>
+					</div>
+				)}
+				{isMobile && (
+					<div
+						className={cn(
+							"absolute right-0 top-0 bottom-0 z-10 w-5 flex items-center justify-center transition-all duration-200 ease-out pointer-events-none",
+							sidebarScrollPos === "bottom"
+								? "opacity-0 translate-x-1"
+								: "opacity-100 translate-x-0"
+						)}
+					>
+						<ChevronRight className="size-3 text-muted-foreground" />
+					</div>
+				)}
+			</div>
+		);
+	};
 
-							{/* Divider */}
-							<div className="h-4 w-px bg-border/60 dark:bg-white/10 mx-0.5" />
+	// ─── Render: Model card ───
+	const getSelectedId = () => {
+		switch (activeTab) {
+			case "llm":
+				return currentLLMConfig?.id;
+			case "image":
+				return currentImageConfig?.id;
+			case "vision":
+				return currentVisionConfig?.id;
+		}
+	};
 
-							{/* Vision section */}
-							{currentVisionConfig ? (
-								<>
-									{getProviderIcon(currentVisionConfig.provider, {
-										isAutoMode: isVisionAutoMode ?? false,
-									})}
-									<span className="max-w-[80px] md:max-w-[100px] truncate hidden md:inline">
-										{currentVisionConfig.name}
-									</span>
-								</>
-							) : (
-								<Eye className="size-4 text-muted-foreground" />
-							)}
-						</>
-					)}
-					<ChevronDown className="h-3.5 w-3.5 text-muted-foreground ml-1 shrink-0" />
-				</Button>
-			</PopoverTrigger>
+	const renderModelCard = (item: DisplayItem, index: number) => {
+		const { config, isAutoMode } = item;
+		const isSelected = getSelectedId() === config.id;
+		const isFocused = focusedIndex === index;
+		const hasCitations = "citations_enabled" in config && !!config.citations_enabled;
+		const hasPremiumStatus = "is_premium" in config && !isAutoMode;
+		const isPremium = hasPremiumStatus && !!(config as Record<string, unknown>).is_premium;
+		// Chat-tab only: surface an amber "No image" hint when the
+		// composer carries images and the catalog reports the model as
+		// non-vision. This is purely advisory — selection is *not*
+		// blocked. The backend's narrow safety net
+		// (`is_known_text_only_chat_model`) is the source of truth for
+		// rejecting image turns, and it only fires when LiteLLM
+		// explicitly marks the model as text-only. A model surfaced as
+		// `supports_image_input=false` here may still be capable in
+		// practice (unknown / unmapped LiteLLM entry), so we let the
+		// user pick it and the provider response decide.
+		const isImageIncompatibleChatModel =
+			activeTab === "llm" &&
+			hasPendingImages &&
+			"supports_image_input" in config &&
+			(config as Record<string, unknown>).supports_image_input === false;
 
-			<PopoverContent
-				className="w-[280px] md:w-[360px] p-0 rounded-lg shadow-lg bg-white border-border/60 dark:bg-neutral-900 dark:border dark:border-white/5 select-none"
-				align="start"
-				sideOffset={8}
+		return (
+			<div
+				key={`${activeTab}-${item.isGlobal ? "g" : "u"}-${config.id}`}
+				data-model-index={index}
+				role="option"
+				tabIndex={isMobile ? -1 : 0}
+				aria-selected={isSelected}
+				title={
+					isImageIncompatibleChatModel
+						? "This model is reported as text-only. You can still pick it; the provider may reject image turns."
+						: undefined
+				}
+				onClick={() => handleSelectItem(item)}
+				onKeyDown={
+					isMobile
+						? undefined
+						: (e) => {
+								if (e.key === "Enter" || e.key === " ") {
+									e.preventDefault();
+									handleSelectItem(item);
+								}
+							}
+				}
+				onMouseEnter={() => setFocusedIndex(index)}
+				className={cn(
+					"group flex items-center gap-2.5 px-3 py-2 rounded-xl",
+					"transition-colors duration-150 mx-2 cursor-pointer",
+					"hover:bg-accent hover:text-accent-foreground",
+					isFocused && "bg-accent text-accent-foreground",
+					isSelected && "bg-accent text-accent-foreground"
+				)}
 			>
-				<Tabs
-					value={activeTab}
-					onValueChange={(v) => setActiveTab(v as "llm" | "image" | "vision")}
-					className="w-full"
-				>
-					<div className="border-b border-border/80 dark:border-neutral-800">
-						<TabsList className="w-full grid grid-cols-3 rounded-none rounded-t-lg bg-transparent h-11 p-0 gap-0">
-							<TabsTrigger
-								value="llm"
-								className="gap-1.5 text-sm font-medium rounded-none text-muted-foreground transition-all duration-200 h-full bg-transparent data-[state=active]:bg-transparent shadow-none data-[state=active]:shadow-none border-b-[1.5px] border-transparent data-[state=active]:border-foreground dark:data-[state=active]:border-white data-[state=active]:text-foreground"
-							>
-								<Zap className="size-3.5" />
-								LLM
-							</TabsTrigger>
-							<TabsTrigger
-								value="image"
-								className="gap-1.5 text-sm font-medium rounded-none text-muted-foreground transition-all duration-200 h-full bg-transparent data-[state=active]:bg-transparent shadow-none data-[state=active]:shadow-none border-b-[1.5px] border-transparent data-[state=active]:border-foreground dark:data-[state=active]:border-white data-[state=active]:text-foreground"
+				{/* Provider icon */}
+				<div className="shrink-0">
+					{getProviderIcon(config.provider as string, {
+						isAutoMode,
+						className: "size-5",
+					})}
+				</div>
+
+				{/* Model info */}
+				<div className="flex-1 min-w-0">
+					<div className="flex items-center gap-1.5">
+						<TruncatedNameWithTooltip
+							text={config.name}
+							enableTooltip={!isMobile}
+							className="font-medium text-sm truncate"
+						/>
+						{isAutoMode && (
+							<Badge
+								variant="secondary"
+								className="text-[9px] px-1 py-0 h-3.5 bg-zinc-200 text-zinc-600 dark:bg-zinc-700 dark:text-zinc-300 border-0"
 							>
-								<ImageIcon className="size-3.5" />
-								Image
-							</TabsTrigger>
-							<TabsTrigger
-								value="vision"
-								className="gap-1.5 text-sm font-medium rounded-none text-muted-foreground transition-all duration-200 h-full bg-transparent data-[state=active]:bg-transparent shadow-none data-[state=active]:shadow-none border-b-[1.5px] border-transparent data-[state=active]:border-foreground dark:data-[state=active]:border-white data-[state=active]:text-foreground"
+								Recommended
+							</Badge>
+						)}
+						{isImageIncompatibleChatModel && (
+							<Badge
+								variant="secondary"
+								className="text-[9px] px-1 py-0 h-3.5 bg-amber-100 text-amber-700 dark:bg-amber-900/50 dark:text-amber-300 border-0"
 							>
-								<Eye className="size-3.5" />
-								Vision
-							</TabsTrigger>
-						</TabsList>
+								No image
+							</Badge>
+						)}
 					</div>
+					{isAutoMode ? (
+						<div className="flex items-center gap-1.5 mt-0.5">
+							<span className="text-xs text-muted-foreground truncate">Auto Mode</span>
+						</div>
+					) : (
+						(hasPremiumStatus || hasCitations) && (
+							<div className="flex items-center gap-1.5 mt-0.5">
+								{hasPremiumStatus && (
+									<Badge
+										variant="secondary"
+										className={cn(
+											"text-[10px] px-1.5 py-0.5 border-0",
+											isPremium
+												? "bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300"
+												: "bg-emerald-100 text-emerald-700 dark:bg-emerald-900/50 dark:text-emerald-300"
+										)}
+									>
+										{isPremium ? "Premium" : "Free"}
+									</Badge>
+								)}
+								{hasCitations && (
+									<Badge
+										variant="secondary"
+										className="text-[10px] px-1.5 py-0.5 border-0 bg-neutral-200 text-neutral-700 dark:bg-neutral-700 dark:text-neutral-200"
+									>
+										Citations
+									</Badge>
+								)}
+							</div>
+						)
+					)}
+				</div>
 
-					{/* ─── LLM Tab ─── */}
-					<TabsContent value="llm" className="mt-0">
-						<Command
-							shouldFilter={false}
-							className="rounded-none rounded-b-lg relative dark:bg-neutral-900 [&_[data-slot=command-input-wrapper]]:border-0 [&_[data-slot=command-input-wrapper]]:px-0 [&_[data-slot=command-input-wrapper]]:gap-2"
+				{/* Actions */}
+				<div className="flex items-center gap-1 shrink-0">
+					{!isAutoMode && (
+						<Button
+							variant="ghost"
+							size="icon"
+							className="size-7 rounded-md hover:bg-accent hover:text-accent-foreground opacity-0 group-hover:opacity-100 transition-opacity"
+							onClick={(e) => handleEditItem(e, item)}
 						>
-							{totalLLMModels > 3 && (
-								<div className="px-2 md:px-3 py-1.5 md:py-2">
-									<CommandInput
-										placeholder="Search models"
-										value={llmSearchQuery}
-										onValueChange={setLlmSearchQuery}
-										className="h-7 md:h-8 w-full text-xs md:text-sm border-0 bg-transparent focus:ring-0 placeholder:text-muted-foreground/60"
-									/>
-								</div>
-							)}
+							<Pencil className="size-3.5 text-muted-foreground" />
+						</Button>
+					)}
+					{isSelected && (
+						<div className="size-7 grid place-items-center shrink-0">
+							<Check className="size-4" />
+						</div>
+					)}
+				</div>
+			</div>
+		);
+	};
 
-							<CommandList
-								className="max-h-[300px] md:max-h-[400px] overflow-y-auto"
-								onScroll={handleListScroll(setLlmScrollPos)}
-								style={{
-									maskImage: `linear-gradient(to bottom, ${llmScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${llmScrollPos === "bottom" ? "black" : "transparent"})`,
-									WebkitMaskImage: `linear-gradient(to bottom, ${llmScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${llmScrollPos === "bottom" ? "black" : "transparent"})`,
-								}}
-							>
-								<CommandEmpty className="py-8 text-center">
-									<div className="flex flex-col items-center gap-2">
-										<Search className="size-8 text-muted-foreground" />
-										<p className="text-sm text-muted-foreground">No models found</p>
-										<p className="text-xs text-muted-foreground/60">Try a different search term</p>
-									</div>
-								</CommandEmpty>
-
-								{/* Global LLM Configs */}
-								{filteredLLMGlobal.length > 0 && (
-									<CommandGroup>
-										<div className="flex items-center gap-2 px-3 py-2 text-xs font-semibold text-muted-foreground tracking-wider">
-											Global Models
-										</div>
-										{filteredLLMGlobal.map((config) => {
-											const isSelected = currentLLMConfig?.id === config.id;
-											const isAutoMode = "is_auto_mode" in config && config.is_auto_mode;
-											return (
-												<CommandItem
-													key={`llm-g-${config.id}`}
-													value={`llm-g-${config.id}`}
-													onSelect={() => handleSelectLLM(config)}
-													className={cn(
-														"mx-2 rounded-lg mb-1 cursor-pointer group transition-all",
-														"hover:bg-accent/50 dark:hover:bg-white/[0.06]",
-														isSelected && "bg-accent/80 dark:bg-white/[0.06]",
-														isAutoMode && ""
-													)}
-												>
-													<div className="flex items-center justify-between w-full gap-2">
-														<div className="flex items-center gap-3 min-w-0 flex-1">
-															<div className="shrink-0">
-																{getProviderIcon(config.provider, { isAutoMode })}
-															</div>
-															<div className="min-w-0 flex-1">
-																<div className="flex items-center gap-2">
-																	<span className="font-medium truncate">{config.name}</span>
-																	{isAutoMode && (
-																		<Badge
-																			variant="secondary"
-																			className="text-[9px] px-1 py-0 h-3.5 bg-violet-800 text-white dark:bg-violet-800 dark:text-white border-0"
-																		>
-																			Recommended
-																		</Badge>
-																	)}
-																	{isSelected && (
-																		<Check className="size-3.5 text-primary shrink-0" />
-																	)}
-																</div>
-																<div className="flex items-center gap-1.5 mt-0.5">
-																	<span className="text-xs text-muted-foreground truncate">
-																		{isAutoMode ? "Auto Mode" : config.model_name}
-																	</span>
-																	{!isAutoMode && config.citations_enabled && (
-																		<Badge
-																			variant="outline"
-																			className="text-[9px] px-1 py-0 h-3.5 bg-primary/10 text-primary border-primary/20"
-																		>
-																			Citations
-																		</Badge>
-																	)}
-																</div>
-															</div>
-														</div>
-														{!isAutoMode && (
-															<Button
-																variant="ghost"
-																size="icon"
-																className="size-7 shrink-0 rounded-md hover:bg-muted opacity-0 group-hover:opacity-100 transition-opacity"
-																onClick={(e) => handleEditLLMConfig(e, config, true)}
-															>
-																<Edit3 className="size-3.5 text-muted-foreground" />
-															</Button>
-														)}
-													</div>
-												</CommandItem>
-											);
-										})}
-									</CommandGroup>
-								)}
+	// ─── Render: Full content ───
+	const renderContent = () => {
+		const globalItems = currentDisplayItems.filter((i) => i.isGlobal);
+		const userItems = currentDisplayItems.filter((i) => !i.isGlobal);
+		const globalStartIdx = 0;
+		const userStartIdx = globalItems.length;
 
-								{filteredLLMGlobal.length > 0 && filteredLLMUser.length > 0 && (
-									<CommandSeparator className="my-1 mx-4 bg-border/60" />
-								)}
+		const addHandler =
+			activeTab === "llm" ? onAddNewLLM : activeTab === "image" ? onAddNewImage : onAddNewVision;
+		const addLabel =
+			activeTab === "llm"
+				? "Add Model"
+				: activeTab === "image"
+					? "Add Image Model"
+					: "Add Vision Model";
 
-								{/* User LLM Configs */}
-								{filteredLLMUser.length > 0 && (
-									<CommandGroup>
-										<div className="flex items-center gap-2 px-3 py-2 text-xs font-semibold text-muted-foreground tracking-wider">
-											Your Configurations
-										</div>
-										{filteredLLMUser.map((config) => {
-											const isSelected = currentLLMConfig?.id === config.id;
-											return (
-												<CommandItem
-													key={`llm-u-${config.id}`}
-													value={`llm-u-${config.id}`}
-													onSelect={() => handleSelectLLM(config)}
-													className={cn(
-														"mx-2 rounded-lg mb-1 cursor-pointer group transition-all",
-														"hover:bg-accent/50 dark:hover:bg-white/[0.06]",
-														isSelected && "bg-accent/80 dark:bg-white/[0.06]"
-													)}
-												>
-													<div className="flex items-center justify-between w-full gap-2">
-														<div className="flex items-center gap-3 min-w-0 flex-1">
-															<div className="shrink-0">{getProviderIcon(config.provider)}</div>
-															<div className="min-w-0 flex-1">
-																<div className="flex items-center gap-2">
-																	<span className="font-medium truncate">{config.name}</span>
-																	{isSelected && (
-																		<Check className="size-3.5 text-primary shrink-0" />
-																	)}
-																</div>
-																<div className="flex items-center gap-1.5 mt-0.5">
-																	<span className="text-xs text-muted-foreground truncate">
-																		{config.model_name}
-																	</span>
-																	{config.citations_enabled && (
-																		<Badge
-																			variant="outline"
-																			className="text-[9px] px-1 py-0 h-3.5 bg-primary/10 text-primary border-primary/20"
-																		>
-																			Citations
-																		</Badge>
-																	)}
-																</div>
-															</div>
-														</div>
-														<Button
-															variant="ghost"
-															size="icon"
-															className="size-7 shrink-0 rounded-md hover:bg-muted opacity-0 group-hover:opacity-100 transition-opacity"
-															onClick={(e) => handleEditLLMConfig(e, config, false)}
-														>
-															<Edit3 className="size-3.5 text-muted-foreground" />
-														</Button>
-													</div>
-												</CommandItem>
-											);
-										})}
-									</CommandGroup>
+		return (
+			<div className="flex flex-col w-full overflow-hidden">
+				{/* Tab header */}
+				<div className="border-b border-popover-border">
+					<div className="w-full grid grid-cols-3 h-11">
+						{(
+							[
+								{
+									value: "llm" as const,
+									icon: Zap,
+									label: "LLM",
+								},
+								{
+									value: "image" as const,
+									icon: ImageIcon,
+									label: "Image",
+								},
+								{
+									value: "vision" as const,
+									icon: ScanEye,
+									label: "Vision",
+								},
+							] as const
+						).map(({ value, icon: Icon, label }) => (
+							<Button
+								key={value}
+								type="button"
+								variant="ghost"
+								// onClick={() => setActiveTab(value)}
+								onClick={() => handleTabChange(value)}
+								className={cn(
+									"h-auto rounded-none px-0 py-0 gap-1.5 text-sm font-medium transition-all duration-200 border-b-[1.5px] hover:bg-transparent",
+									activeTab === value
+										? "border-foreground dark:border-white text-foreground"
+										: "border-transparent text-muted-foreground hover:text-accent-foreground"
 								)}
-
-								{/* Add New LLM Config */}
-								<div className="p-2 bg-muted/20 dark:bg-neutral-900">
-									<Button
-										variant="ghost"
-										size="sm"
-										className="w-full justify-start gap-2 h-9 rounded-lg hover:bg-accent/50 dark:hover:bg-white/[0.06]"
-										onClick={() => {
-											setOpen(false);
-											onAddNewLLM();
-										}}
-									>
-										<Plus className="size-4 text-primary" />
-										<span className="text-sm font-medium">Add Model</span>
-									</Button>
-								</div>
-							</CommandList>
-						</Command>
-					</TabsContent>
-
-					{/* ─── Image Tab ─── */}
-					<TabsContent value="image" className="mt-0">
-						<Command
-							shouldFilter={false}
-							className="rounded-none rounded-b-lg dark:bg-neutral-900 [&_[data-slot=command-input-wrapper]]:border-0 [&_[data-slot=command-input-wrapper]]:px-0 [&_[data-slot=command-input-wrapper]]:gap-2"
-						>
-							{totalImageModels > 3 && (
-								<div className="px-2 md:px-3 py-1.5 md:py-2">
-									<CommandInput
-										placeholder="Search models"
-										value={imageSearchQuery}
-										onValueChange={setImageSearchQuery}
-										className="h-7 md:h-8 w-full text-xs md:text-sm border-0 bg-transparent focus:ring-0 placeholder:text-muted-foreground/60"
-									/>
-								</div>
-							)}
-							<CommandList
-								className="max-h-[300px] md:max-h-[400px] overflow-y-auto"
-								onScroll={handleListScroll(setImageScrollPos)}
-								style={{
-									maskImage: `linear-gradient(to bottom, ${imageScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${imageScrollPos === "bottom" ? "black" : "transparent"})`,
-									WebkitMaskImage: `linear-gradient(to bottom, ${imageScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${imageScrollPos === "bottom" ? "black" : "transparent"})`,
-								}}
 							>
-								<CommandEmpty className="py-8 text-center">
-									<div className="flex flex-col items-center gap-2">
-										<Search className="size-8 text-muted-foreground" />
-										<p className="text-sm text-muted-foreground">No image models found</p>
-										<p className="text-xs text-muted-foreground/60">Try a different search term</p>
-									</div>
-								</CommandEmpty>
-
-								{/* Global Image Configs */}
-								{filteredImageGlobal.length > 0 && (
-									<CommandGroup>
-										<div className="flex items-center gap-2 px-3 py-2 text-xs font-semibold text-muted-foreground tracking-wider">
-											Global Image Models
-										</div>
-										{filteredImageGlobal.map((config) => {
-											const isSelected = currentImageConfig?.id === config.id;
-											const isAuto = "is_auto_mode" in config && config.is_auto_mode;
-											return (
-												<CommandItem
-													key={`img-g-${config.id}`}
-													value={`img-g-${config.id}`}
-													onSelect={() => handleSelectImage(config.id)}
-													className={cn(
-														"mx-2 rounded-lg mb-1 cursor-pointer group transition-all hover:bg-accent/50 dark:hover:bg-white/[0.06]",
-														isSelected && "bg-accent/80 dark:bg-white/[0.06]",
-														isAuto && ""
-													)}
-												>
-													<div className="flex items-center gap-3 min-w-0 flex-1">
-														<div className="shrink-0">
-															{getProviderIcon(config.provider, { isAutoMode: isAuto })}
-														</div>
-														<div className="min-w-0 flex-1">
-															<div className="flex items-center gap-2">
-																<span className="font-medium truncate">{config.name}</span>
-																{isAuto && (
-																	<Badge
-																		variant="secondary"
-																		className="text-[9px] px-1 py-0 h-3.5 bg-violet-800 text-white dark:bg-violet-800 dark:text-white border-0"
-																	>
-																		Recommended
-																	</Badge>
-																)}
-																{isSelected && <Check className="size-3.5 text-primary shrink-0" />}
-															</div>
-															<span className="text-xs text-muted-foreground truncate block">
-																{isAuto ? "Auto Mode" : config.model_name}
-															</span>
-														</div>
-														{onEditImage && !isAuto && (
-															<Button
-																variant="ghost"
-																size="icon"
-																className="size-7 shrink-0 rounded-md hover:bg-muted opacity-0 group-hover:opacity-100 transition-opacity"
-																onClick={(e) => {
-																	e.stopPropagation();
-																	setOpen(false);
-																	onEditImage(config, true);
-																}}
-															>
-																<Edit3 className="size-3.5 text-muted-foreground" />
-															</Button>
-														)}
-													</div>
-												</CommandItem>
-											);
-										})}
-									</CommandGroup>
-								)}
+								<Icon className="size-3.5" />
+								{label}
+							</Button>
+						))}
+					</div>
+				</div>
 
-								{/* User Image Configs */}
-								{filteredImageUser.length > 0 && (
-									<>
-										{filteredImageGlobal.length > 0 && (
-											<CommandSeparator className="my-1 mx-4 bg-border/60" />
-										)}
-										<CommandGroup>
-											<div className="flex items-center gap-2 px-3 py-2 text-xs font-semibold text-muted-foreground tracking-wider">
-												Your Image Models
-											</div>
-											{filteredImageUser.map((config) => {
-												const isSelected = currentImageConfig?.id === config.id;
-												return (
-													<CommandItem
-														key={`img-u-${config.id}`}
-														value={`img-u-${config.id}`}
-														onSelect={() => handleSelectImage(config.id)}
-														className={cn(
-															"mx-2 rounded-lg mb-1 cursor-pointer group transition-all hover:bg-accent/50 dark:hover:bg-white/[0.06]",
-															isSelected && "bg-accent/80 dark:bg-white/[0.06]"
-														)}
-													>
-														<div className="flex items-center gap-3 min-w-0 flex-1">
-															<div className="shrink-0">{getProviderIcon(config.provider)}</div>
-															<div className="min-w-0 flex-1">
-																<div className="flex items-center gap-2">
-																	<span className="font-medium truncate">{config.name}</span>
-																	{isSelected && (
-																		<Check className="size-3.5 text-primary shrink-0" />
-																	)}
-																</div>
-																<span className="text-xs text-muted-foreground truncate block">
-																	{config.model_name}
-																</span>
-															</div>
-															{onEditImage && (
-																<Button
-																	variant="ghost"
-																	size="icon"
-																	className="h-7 w-7 shrink-0 opacity-0 group-hover:opacity-100 transition-opacity"
-																	onClick={(e) => {
-																		e.stopPropagation();
-																		setOpen(false);
-																		onEditImage(config, false);
-																	}}
-																>
-																	<Edit3 className="size-3.5 text-muted-foreground" />
-																</Button>
-															)}
-														</div>
-													</CommandItem>
-												);
-											})}
-										</CommandGroup>
-									</>
-								)}
+				{/* Two-pane layout */}
+				<div className={cn("flex", isMobile ? "flex-col h-[60vh]" : "flex-row h-[380px]")}>
+					{/* Provider sidebar */}
+					{renderProviderSidebar()}
 
-								{/* Add New Image Config */}
-								{onAddNewImage && (
-									<div className="p-2 bg-muted/20 dark:bg-neutral-900">
-										<Button
-											variant="ghost"
-											size="sm"
-											className="w-full justify-start gap-2 h-9 rounded-lg hover:bg-accent/50 dark:hover:bg-white/[0.06]"
-											onClick={() => {
-												setOpen(false);
-												onAddNewImage();
-											}}
-										>
-											<Plus className="size-4 text-primary" />
-											<span className="text-sm font-medium">Add Image Model</span>
-										</Button>
-									</div>
+					{/* Main content */}
+					<div className="flex flex-col min-w-0 min-h-0 flex-1 overflow-hidden">
+						{/* Search */}
+						<div className="relative">
+							<Search className="absolute left-3 top-1/2 -translate-y-1/2 size-3.5 text-muted-foreground/100 pointer-events-none" />
+							<input
+								ref={searchInputRef}
+								placeholder="Search models"
+								value={searchQuery}
+								onChange={(e) => setSearchQuery(e.target.value)}
+								onKeyDown={isMobile ? undefined : handleKeyDown}
+								role="combobox"
+								aria-expanded={true}
+								aria-controls="model-selector-list"
+								className={cn(
+									"w-full pl-8 pr-3 py-2.5 text-sm bg-transparent",
+									"focus:outline-none",
+									"placeholder:text-muted-foreground"
 								)}
-							</CommandList>
-						</Command>
-					</TabsContent>
-
-					{/* ─── Vision Tab ─── */}
-					<TabsContent value="vision" className="mt-0">
-						<Command
-							shouldFilter={false}
-							className="rounded-none rounded-b-lg dark:bg-neutral-900 [&_[data-slot=command-input-wrapper]]:border-0 [&_[data-slot=command-input-wrapper]]:px-0 [&_[data-slot=command-input-wrapper]]:gap-2"
+							/>
+						</div>
+
+						{/* Provider header when filtered */}
+						{selectedProvider !== "all" && (
+							<div className="flex items-center gap-2 px-3 py-1.5">
+								{getProviderIcon(selectedProvider, {
+									className: "size-4",
+								})}
+								<span className="text-sm font-medium">{formatProviderName(selectedProvider)}</span>
+								<span className="text-xs text-muted-foreground ml-auto">
+									{configuredProviderSet.has(selectedProvider)
+										? `${providerModelCounts[selectedProvider] || 0} models`
+										: "Not configured"}
+								</span>
+							</div>
+						)}
+
+						{/* Model list */}
+						<div
+							id="model-selector-list"
+							ref={modelListRef}
+							role="listbox"
+							className="overflow-y-auto flex-1 py-1 space-y-1 flex flex-col"
+							onScroll={handleModelListScroll}
+							style={{
+								maskImage: `linear-gradient(to bottom, ${modelScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${modelScrollPos === "bottom" ? "black" : "transparent"})`,
+								WebkitMaskImage: `linear-gradient(to bottom, ${modelScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${modelScrollPos === "bottom" ? "black" : "transparent"})`,
+							}}
 						>
-							{totalVisionModels > 3 && (
-								<div className="px-2 md:px-3 py-1.5 md:py-2">
-									<CommandInput
-										placeholder="Search vision models"
-										value={visionSearchQuery}
-										onValueChange={setVisionSearchQuery}
-										className="h-7 md:h-8 w-full text-xs md:text-sm border-0 bg-transparent focus:ring-0 placeholder:text-muted-foreground/60"
-									/>
+							{currentDisplayItems.length === 0 ? (
+								<div className="flex-1 flex flex-col items-center justify-center gap-3 px-4">
+									{selectedProvider !== "all" && !configuredProviderSet.has(selectedProvider) ? (
+										<>
+											<div className="opacity-40">
+												{getProviderIcon(selectedProvider, {
+													className: "size-10",
+												})}
+											</div>
+											<p className="text-sm font-medium text-muted-foreground">
+												No {formatProviderName(selectedProvider)} models configured
+											</p>
+											<p className="text-xs text-muted-foreground/60 text-center">
+												Add a model with this provider to get started
+											</p>
+											{addHandler && (
+												<Button
+													variant="secondary"
+													size="sm"
+													className="mt-1"
+													onClick={() => {
+														setOpen(false);
+														addHandler(selectedProvider !== "all" ? selectedProvider : undefined);
+													}}
+												>
+													{addLabel}
+												</Button>
+											)}
+										</>
+									) : searchQuery ? (
+										<>
+											<Search className="size-8 text-muted-foreground" />
+											<p className="text-sm text-muted-foreground">No models found</p>
+											<p className="text-xs text-muted-foreground/60">
+												Try a different search term
+											</p>
+										</>
+									) : (
+										<>
+											<p className="text-sm font-medium text-muted-foreground">
+												No models configured
+											</p>
+											<p className="text-xs text-muted-foreground/60 text-center">
+												Configure models in your search space settings
+											</p>
+										</>
+									)}
 								</div>
+							) : (
+								<>
+									{globalItems.length > 0 && (
+										<>
+											<div className="flex items-center gap-2 px-3 py-1.5 text-[12px] font-semibold text-muted-foreground tracking-wider">
+												Global Models
+											</div>
+											{globalItems.map((item, i) => renderModelCard(item, globalStartIdx + i))}
+										</>
+									)}
+									{globalItems.length > 0 && userItems.length > 0 && (
+										<div className="my-1.5 mx-4 h-px bg-popover-border" />
+									)}
+									{userItems.length > 0 && (
+										<>
+											<div className="flex items-center gap-2 px-3 py-1.5 text-[12px] font-semibold text-muted-foreground tracking-wider">
+												Your Configurations
+											</div>
+											{userItems.map((item, i) => renderModelCard(item, userStartIdx + i))}
+										</>
+									)}
+								</>
 							)}
-							<CommandList
-								className="max-h-[300px] md:max-h-[400px] overflow-y-auto"
-								onScroll={handleListScroll(setVisionScrollPos)}
-								style={{
-									maskImage: `linear-gradient(to bottom, ${visionScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${visionScrollPos === "bottom" ? "black" : "transparent"})`,
-									WebkitMaskImage: `linear-gradient(to bottom, ${visionScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${visionScrollPos === "bottom" ? "black" : "transparent"})`,
-								}}
-							>
-								<CommandEmpty className="py-8 text-center">
-									<div className="flex flex-col items-center gap-2">
-										<Search className="size-8 text-muted-foreground" />
-										<p className="text-sm text-muted-foreground">No vision models found</p>
-										<p className="text-xs text-muted-foreground/60">Try a different search term</p>
-									</div>
-								</CommandEmpty>
-
-								{filteredVisionGlobal.length > 0 && (
-									<CommandGroup>
-										<div className="flex items-center gap-2 px-3 py-2 text-xs font-semibold text-muted-foreground tracking-wider">
-											Global Vision Models
-										</div>
-										{filteredVisionGlobal.map((config) => {
-											const isSelected = currentVisionConfig?.id === config.id;
-											const isAuto = "is_auto_mode" in config && config.is_auto_mode;
-											return (
-												<CommandItem
-													key={`vis-g-${config.id}`}
-													value={`vis-g-${config.id}`}
-													onSelect={() => handleSelectVision(config.id)}
-													className={cn(
-														"mx-2 rounded-lg mb-1 cursor-pointer group transition-all hover:bg-accent/50 dark:hover:bg-white/[0.06]",
-														isSelected && "bg-accent/80 dark:bg-white/[0.06]"
-													)}
-												>
-													<div className="flex items-center gap-3 min-w-0 flex-1">
-														<div className="shrink-0">
-															{getProviderIcon(config.provider, { isAutoMode: isAuto })}
-														</div>
-														<div className="min-w-0 flex-1">
-															<div className="flex items-center gap-2">
-																<span className="font-medium truncate">{config.name}</span>
-																{isAuto && (
-																	<Badge
-																		variant="secondary"
-																		className="text-[9px] px-1 py-0 h-3.5 bg-violet-800 text-white dark:bg-violet-800 dark:text-white border-0"
-																	>
-																		Recommended
-																	</Badge>
-																)}
-																{isSelected && <Check className="size-3.5 text-primary shrink-0" />}
-															</div>
-															<span className="text-xs text-muted-foreground truncate block">
-																{isAuto ? "Auto Mode" : config.model_name}
-															</span>
-														</div>
-														{onEditVision && !isAuto && (
-															<Button
-																variant="ghost"
-																size="icon"
-																className="size-7 shrink-0 rounded-md hover:bg-muted opacity-0 group-hover:opacity-100 transition-opacity"
-																onClick={(e) => {
-																	e.stopPropagation();
-																	setOpen(false);
-																	onEditVision(config as VisionLLMConfig, true);
-																}}
-															>
-																<Edit3 className="size-3.5 text-muted-foreground" />
-															</Button>
-														)}
-													</div>
-												</CommandItem>
-											);
-										})}
-									</CommandGroup>
-								)}
+						</div>
 
-								{filteredVisionUser.length > 0 && (
-									<>
-										{filteredVisionGlobal.length > 0 && (
-											<CommandSeparator className="my-1 mx-4 bg-border/60" />
-										)}
-										<CommandGroup>
-											<div className="flex items-center gap-2 px-3 py-2 text-xs font-semibold text-muted-foreground tracking-wider">
-												Your Vision Models
-											</div>
-											{filteredVisionUser.map((config) => {
-												const isSelected = currentVisionConfig?.id === config.id;
-												return (
-													<CommandItem
-														key={`vis-u-${config.id}`}
-														value={`vis-u-${config.id}`}
-														onSelect={() => handleSelectVision(config.id)}
-														className={cn(
-															"mx-2 rounded-lg mb-1 cursor-pointer group transition-all hover:bg-accent/50 dark:hover:bg-white/[0.06]",
-															isSelected && "bg-accent/80 dark:bg-white/[0.06]"
-														)}
-													>
-														<div className="flex items-center gap-3 min-w-0 flex-1">
-															<div className="shrink-0">{getProviderIcon(config.provider)}</div>
-															<div className="min-w-0 flex-1">
-																<div className="flex items-center gap-2">
-																	<span className="font-medium truncate">{config.name}</span>
-																	{isSelected && (
-																		<Check className="size-3.5 text-primary shrink-0" />
-																	)}
-																</div>
-																<span className="text-xs text-muted-foreground truncate block">
-																	{config.model_name}
-																</span>
-															</div>
-															{onEditVision && (
-																<Button
-																	variant="ghost"
-																	size="icon"
-																	className="h-7 w-7 shrink-0 opacity-0 group-hover:opacity-100 transition-opacity"
-																	onClick={(e) => {
-																		e.stopPropagation();
-																		setOpen(false);
-																		onEditVision(config, false);
-																	}}
-																>
-																	<Edit3 className="size-3.5 text-muted-foreground" />
-																</Button>
-															)}
-														</div>
-													</CommandItem>
-												);
-											})}
-										</CommandGroup>
-									</>
-								)}
+						{/* Add model button */}
+						{addHandler && (
+							<div className="p-2">
+								<Button
+									variant="ghost"
+									size="sm"
+									className="w-full justify-start gap-2 h-9 rounded-lg hover:bg-accent hover:text-accent-foreground "
+									onClick={() => {
+										setOpen(false);
+										addHandler(selectedProvider !== "all" ? selectedProvider : undefined);
+									}}
+								>
+									<Plus className="size-4 text-primary" />
+									<span className="text-sm font-medium">{addLabel}</span>
+								</Button>
+							</div>
+						)}
+					</div>
+				</div>
+			</div>
+		);
+	};
 
-								{onAddNewVision && (
-									<div className="p-2 bg-muted/20 dark:bg-neutral-900">
-										<Button
-											variant="ghost"
-											size="sm"
-											className="w-full justify-start gap-2 h-9 rounded-lg hover:bg-accent/50 dark:hover:bg-white/[0.06]"
-											onClick={() => {
-												setOpen(false);
-												onAddNewVision();
-											}}
-										>
-											<Plus className="size-4 text-primary" />
-											<span className="text-sm font-medium">Add Vision Model</span>
-										</Button>
-									</div>
-								)}
-							</CommandList>
-						</Command>
-					</TabsContent>
-				</Tabs>
+	// ─── Trigger button ───
+	const triggerButton = (
+		<Button
+			variant="ghost"
+			size="sm"
+			role="combobox"
+			aria-expanded={open}
+			className={cn(
+				"h-8 gap-2 px-3 text-sm bg-muted shadow-xs hover:bg-accent hover:text-accent-foreground border-0 select-none",
+				className
+			)}
+		>
+			{isLoading ? (
+				<>
+					<Spinner size="sm" className="text-muted-foreground" />
+					<span className="text-muted-foreground hidden md:inline">Loading</span>
+				</>
+			) : (
+				<>
+					{/* LLM */}
+					{currentLLMConfig ? (
+						<>
+							{getProviderIcon(currentLLMConfig.provider, {
+								isAutoMode: isLLMAutoMode ?? false,
+							})}
+							<span className="max-w-[100px] md:max-w-[120px] truncate hidden md:inline">
+								{currentLLMConfig.name}
+							</span>
+						</>
+					) : (
+						<>
+							<Bot className="size-4 text-muted-foreground" />
+							<span className="text-muted-foreground hidden md:inline">Select Model</span>
+						</>
+					)}
+					<div className="h-4 w-px bg-border/60 dark:bg-white/10 mx-0.5" />
+					{/* Image */}
+					{currentImageConfig ? (
+						<>
+							{getProviderIcon(currentImageConfig.provider, {
+								isAutoMode: isImageAutoMode ?? false,
+							})}
+							<span className="max-w-[80px] md:max-w-[100px] truncate hidden md:inline">
+								{currentImageConfig.name}
+							</span>
+						</>
+					) : (
+						<ImageIcon className="size-4 text-muted-foreground" />
+					)}
+					<div className="h-4 w-px bg-border/60 dark:bg-white/10 mx-0.5" />
+					{/* Vision */}
+					{currentVisionConfig ? (
+						<>
+							{getProviderIcon(currentVisionConfig.provider, {
+								isAutoMode: isVisionAutoMode ?? false,
+							})}
+							<span className="max-w-[80px] md:max-w-[100px] truncate hidden md:inline">
+								{currentVisionConfig.name}
+							</span>
+						</>
+					) : (
+						<ScanEye className="size-4 text-muted-foreground" />
+					)}
+				</>
+			)}
+			<ChevronDown className="h-3.5 w-3.5 text-muted-foreground ml-1 shrink-0" />
+		</Button>
+	);
+
+	// ─── Shell: Drawer on mobile, Popover on desktop ───
+	if (isMobile) {
+		return (
+			<Drawer open={open} onOpenChange={handleOpenChange}>
+				<DrawerTrigger asChild>{triggerButton}</DrawerTrigger>
+				<DrawerContent className="max-h-[85vh]">
+					<DrawerHandle />
+					<DrawerHeader className="pb-0">
+						<DrawerTitle>Select Model</DrawerTitle>
+					</DrawerHeader>
+					<div className="flex-1 overflow-hidden">{renderContent()}</div>
+				</DrawerContent>
+			</Drawer>
+		);
+	}
+
+	return (
+		<Popover open={open} onOpenChange={handleOpenChange}>
+			<PopoverTrigger asChild>{triggerButton}</PopoverTrigger>
+			<PopoverContent
+				className="w-[300px] md:w-[380px] p-0 rounded-lg shadow-lg overflow-hidden select-none"
+				align="start"
+				sideOffset={8}
+				onCloseAutoFocus={(e) => e.preventDefault()}
+			>
+				{renderContent()}
 			</PopoverContent>
 		</Popover>
 	);
diff --git a/surfsense_web/components/new-chat/prompt-picker.tsx b/surfsense_web/components/new-chat/prompt-picker.tsx
index 3e6457b8c3..1cb9f80f5c 100644
--- a/surfsense_web/components/new-chat/prompt-picker.tsx
+++ b/surfsense_web/components/new-chat/prompt-picker.tsx
@@ -1,7 +1,8 @@
 "use client";
 
-import { useAtomValue, useSetAtom } from "jotai";
-import { Plus, Zap } from "lucide-react";
+import { useAtomValue } from "jotai";
+import { Plus, WandSparkles } from "lucide-react";
+import { useParams, useRouter } from "next/navigation";
 import {
 	forwardRef,
 	useCallback,
@@ -14,8 +15,8 @@ import {
 } from "react";
 
 import { promptsAtom } from "@/atoms/prompts/prompts-query.atoms";
-import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
-import { Spinner } from "@/components/ui/spinner";
+import { Button } from "@/components/ui/button";
+import { Skeleton } from "@/components/ui/skeleton";
 import { cn } from "@/lib/utils";
 
 export interface PromptPickerRef {
@@ -28,14 +29,14 @@ interface PromptPickerProps {
 	onSelect: (action: { name: string; prompt: string; mode: "transform" | "explore" }) => void;
 	onDone: () => void;
 	externalSearch?: string;
-	containerStyle?: React.CSSProperties;
 }
 
 export const PromptPicker = forwardRef<PromptPickerRef, PromptPickerProps>(function PromptPicker(
-	{ onSelect, onDone, externalSearch = "", containerStyle },
+	{ onSelect, onDone, externalSearch = "" },
 	ref
 ) {
-	const setUserSettingsDialog = useSetAtom(userSettingsDialogAtom);
+	const router = useRouter();
+	const params = useParams();
 	const { data: prompts, isLoading, isError } = useAtomValue(promptsAtom);
 	const [highlightedIndex, setHighlightedIndex] = useState(0);
 	const scrollContainerRef = useRef<HTMLDivElement>(null);
@@ -60,13 +61,26 @@ export const PromptPicker = forwardRef<PromptPickerRef, PromptPickerProps>(funct
 		}
 	}
 
+	const createPromptIndex = filtered.length;
+	const totalItems = filtered.length + 1;
+	const searchSpaceId = Array.isArray(params?.search_space_id)
+		? params.search_space_id[0]
+		: params?.search_space_id;
+
 	const handleSelect = useCallback(
 		(index: number) => {
+			if (index === createPromptIndex) {
+				onDone();
+				if (searchSpaceId) {
+					router.push(`/dashboard/${searchSpaceId}/user-settings/prompts`);
+				}
+				return;
+			}
 			const action = filtered[index];
 			if (!action) return;
 			onSelect({ name: action.name, prompt: action.prompt, mode: action.mode });
 		},
-		[filtered, onSelect]
+		[filtered, onSelect, createPromptIndex, onDone, router, searchSpaceId]
 	);
 
 	useEffect(() => {
@@ -93,69 +107,102 @@ export const PromptPicker = forwardRef<PromptPickerRef, PromptPickerProps>(funct
 		() => ({
 			selectHighlighted: () => handleSelect(highlightedIndex),
 			moveUp: () => {
-				if (filtered.length === 0) return;
 				shouldScrollRef.current = true;
-				setHighlightedIndex((prev) => (prev > 0 ? prev - 1 : filtered.length - 1));
+				setHighlightedIndex((prev) => (prev > 0 ? prev - 1 : totalItems - 1));
 			},
 			moveDown: () => {
-				if (filtered.length === 0) return;
 				shouldScrollRef.current = true;
-				setHighlightedIndex((prev) => (prev < filtered.length - 1 ? prev + 1 : 0));
+				setHighlightedIndex((prev) => (prev < totalItems - 1 ? prev + 1 : 0));
 			},
 		}),
-		[filtered.length, highlightedIndex, handleSelect]
+		[totalItems, highlightedIndex, handleSelect]
 	);
 
 	return (
-		<div
-			className="w-64 rounded-lg border bg-popover shadow-lg overflow-hidden"
-			style={containerStyle}
-		>
-			<div ref={scrollContainerRef} className="max-h-48 overflow-y-auto py-1">
+		<div className="shadow-2xl rounded-lg overflow-hidden bg-popover text-popover-foreground flex flex-col w-[280px] sm:w-[320px] select-none">
+			<div ref={scrollContainerRef} className="max-h-[180px] sm:max-h-[280px] overflow-y-auto">
 				{isLoading ? (
-					<div className="flex items-center justify-center py-3">
-						<Spinner className="size-4" />
+					<div className="py-1 px-2">
+						<div className="px-3 py-2">
+							<Skeleton className="h-[16px] w-24" />
+						</div>
+						{["a", "b", "c", "d", "e"].map((id, i) => (
+							<div
+								key={id}
+								className={cn(
+									"w-full flex items-center gap-2 px-3 py-2 text-left rounded-md",
+									i >= 3 && "hidden sm:flex"
+								)}
+							>
+								<span className="shrink-0">
+									<Skeleton className="h-4 w-4" />
+								</span>
+								<span className="flex-1 text-sm">
+									<Skeleton className="h-[20px]" style={{ width: `${60 + ((i * 7) % 30)}%` }} />
+								</span>
+							</div>
+						))}
 					</div>
 				) : isError ? (
-					<p className="px-3 py-2 text-xs text-destructive">Failed to load prompts</p>
+					<div className="py-1 px-2">
+						<p className="px-3 py-2 text-xs text-destructive">Failed to load prompts</p>
+					</div>
 				) : filtered.length === 0 ? (
-					<p className="px-3 py-2 text-xs text-muted-foreground">No matching prompts</p>
+					<div className="py-1 px-2">
+						<p className="px-3 py-2 text-xs text-muted-foreground">No matching prompts</p>
+					</div>
 				) : (
-					filtered.map((action, index) => (
-						<button
-							key={action.id}
+					<div className="py-1 px-2">
+						<div className="px-3 py-2 text-xs font-bold text-muted-foreground/55">
+							Saved Prompts
+						</div>
+						{filtered.map((action, index) => (
+							<Button
+								key={action.id}
+								ref={(el) => {
+									if (el) itemRefs.current.set(index, el);
+									else itemRefs.current.delete(index);
+								}}
+								type="button"
+								variant="ghost"
+								onClick={() => handleSelect(index)}
+								onMouseEnter={() => setHighlightedIndex(index)}
+								className={cn(
+									"h-auto w-full justify-start gap-2 rounded-md px-3 py-2 text-left text-sm font-normal transition-colors",
+									index === highlightedIndex && "bg-accent text-accent-foreground"
+								)}
+							>
+								<span className="shrink-0 text-muted-foreground">
+									<WandSparkles className="size-4" />
+								</span>
+								<span className="flex-1 text-sm truncate">{action.name}</span>
+							</Button>
+						))}
+
+						<div className="mx-2 my-1 border-t border-popover-border" />
+						<Button
 							ref={(el) => {
-								if (el) itemRefs.current.set(index, el);
-								else itemRefs.current.delete(index);
+								if (el) itemRefs.current.set(createPromptIndex, el);
+								else itemRefs.current.delete(createPromptIndex);
 							}}
 							type="button"
-							onClick={() => handleSelect(index)}
-							onMouseEnter={() => setHighlightedIndex(index)}
+							variant="ghost"
+							onClick={() => handleSelect(createPromptIndex)}
+							onMouseEnter={() => setHighlightedIndex(createPromptIndex)}
 							className={cn(
-								"flex w-full items-center gap-2 px-3 py-1.5 text-sm cursor-pointer",
-								index === highlightedIndex ? "bg-accent" : "hover:bg-accent/50"
+								"h-auto w-full justify-start gap-2 rounded-md px-3 py-2 text-left text-sm font-normal text-muted-foreground transition-colors",
+								highlightedIndex === createPromptIndex
+									? "bg-accent text-accent-foreground"
+									: "hover:text-accent-foreground hover:bg-accent"
 							)}
 						>
-							<span className="text-muted-foreground">
-								<Zap className="size-3.5" />
+							<span className="shrink-0">
+								<Plus className="size-4" />
 							</span>
-							<span className="truncate">{action.name}</span>
-						</button>
-					))
+							<span>Create prompt</span>
+						</Button>
+					</div>
 				)}
-
-				<div className="my-1 h-px bg-border mx-2" />
-				<button
-					type="button"
-					onClick={() => {
-						onDone();
-						setUserSettingsDialog({ open: true, initialTab: "prompts" });
-					}}
-					className="flex w-full items-center gap-2 px-3 py-1.5 text-sm text-muted-foreground hover:text-foreground hover:bg-accent/50 cursor-pointer"
-				>
-					<Plus className="size-3.5" />
-					<span>Create prompt</span>
-				</button>
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx
deleted file mode 100644
index ed6c7e4ac6..0000000000
--- a/surfsense_web/components/new-chat/source-detail-panel.tsx
+++ /dev/null
@@ -1,711 +0,0 @@
-"use client";
-
-import { useQuery } from "@tanstack/react-query";
-import {
-	BookOpen,
-	ChevronDown,
-	ChevronUp,
-	ExternalLink,
-	FileQuestionMark,
-	FileText,
-	Hash,
-	Loader2,
-	Sparkles,
-	X,
-} from "lucide-react";
-import { AnimatePresence, motion, useReducedMotion } from "motion/react";
-import { useTranslations } from "next-intl";
-import type React from "react";
-import { forwardRef, memo, type ReactNode, useCallback, useEffect, useRef, useState } from "react";
-import { createPortal } from "react-dom";
-import { MarkdownViewer } from "@/components/markdown-viewer";
-import { Badge } from "@/components/ui/badge";
-import { Button } from "@/components/ui/button";
-import { ScrollArea } from "@/components/ui/scroll-area";
-import { Spinner } from "@/components/ui/spinner";
-import type {
-	GetDocumentByChunkResponse,
-	GetSurfsenseDocsByChunkResponse,
-} from "@/contracts/types/document.types";
-import { documentsApiService } from "@/lib/apis/documents-api.service";
-import { cacheKeys } from "@/lib/query-client/cache-keys";
-import { cn } from "@/lib/utils";
-
-type DocumentData = GetDocumentByChunkResponse | GetSurfsenseDocsByChunkResponse;
-
-interface SourceDetailPanelProps {
-	open: boolean;
-	onOpenChange: (open: boolean) => void;
-	chunkId: number;
-	sourceType: string;
-	title: string;
-	description?: string;
-	url?: string;
-	children?: ReactNode;
-	isDocsChunk?: boolean;
-}
-
-const formatDocumentType = (type: string) => {
-	if (!type) return "";
-	return type
-		.split("_")
-		.map((word) => word.charAt(0) + word.slice(1).toLowerCase())
-		.join(" ");
-};
-
-// Chunk card component
-// For large documents (>30 chunks), we disable animation to prevent layout shifts
-// which break auto-scroll functionality
-interface ChunkCardProps {
-	chunk: { id: number; content: string };
-	localIndex: number;
-	chunkNumber: number;
-	totalChunks: number;
-	isCited: boolean;
-	isActive: boolean;
-	disableLayoutAnimation?: boolean;
-}
-
-const ChunkCard = memo(
-	forwardRef<HTMLDivElement, ChunkCardProps>(
-		({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => {
-			return (
-				<div
-					ref={ref}
-					data-chunk-index={localIndex}
-					className={cn(
-						"group relative rounded-2xl border-2 transition-all duration-300",
-						isCited
-							? "bg-linear-to-br from-primary/5 via-primary/10 to-primary/5 border-primary shadow-lg shadow-primary/10"
-							: "bg-card border-border/50 hover:border-border hover:shadow-md"
-					)}
-				>
-					{isCited && <div className="absolute inset-0 rounded-2xl bg-primary/5 blur-xl -z-10" />}
-
-					<div className="flex items-center justify-between px-5 py-4 border-b border-border/50">
-						<div className="flex items-center gap-3">
-							<div
-								className={cn(
-									"flex items-center justify-center w-8 h-8 rounded-full text-sm font-semibold transition-colors",
-									isCited
-										? "bg-primary text-primary-foreground"
-										: "bg-muted text-muted-foreground group-hover:bg-muted/80"
-								)}
-							>
-								{chunkNumber}
-							</div>
-							<span className="text-sm text-muted-foreground">
-								Chunk {chunkNumber} of {totalChunks}
-							</span>
-						</div>
-						{isCited && (
-							<Badge variant="default" className="gap-1.5 px-3 py-1">
-								<Sparkles className="h-3 w-3" />
-								Cited Source
-							</Badge>
-						)}
-					</div>
-
-					<div className="p-5 overflow-hidden">
-						<MarkdownViewer content={chunk.content} maxLength={100_000} />
-					</div>
-				</div>
-			);
-		}
-	)
-);
-ChunkCard.displayName = "ChunkCard";
-
-export function SourceDetailPanel({
-	open,
-	onOpenChange,
-	chunkId,
-	sourceType,
-	title,
-	description,
-	url,
-	children,
-	isDocsChunk = false,
-}: SourceDetailPanelProps) {
-	const t = useTranslations("dashboard");
-	const scrollAreaRef = useRef<HTMLDivElement>(null);
-	const hasScrolledRef = useRef(false); // Use ref to avoid stale closures
-	const [activeChunkIndex, setActiveChunkIndex] = useState<number | null>(null);
-	const [mounted, setMounted] = useState(false);
-	const [_hasScrolledToCited, setHasScrolledToCited] = useState(false);
-	const shouldReduceMotion = useReducedMotion();
-
-	useEffect(() => {
-		setMounted(true);
-	}, []);
-
-	const {
-		data: documentData,
-		isLoading: isDocumentByChunkFetching,
-		error: documentByChunkFetchingError,
-	} = useQuery<DocumentData>({
-		queryKey: isDocsChunk
-			? cacheKeys.documents.byChunk(`doc-${chunkId}`)
-			: cacheKeys.documents.byChunk(chunkId.toString()),
-		queryFn: async () => {
-			if (isDocsChunk) {
-				return documentsApiService.getSurfsenseDocByChunk(chunkId);
-			}
-			return documentsApiService.getDocumentByChunk({ chunk_id: chunkId, chunk_window: 5 });
-		},
-		enabled: !!chunkId && open,
-		staleTime: 5 * 60 * 1000,
-	});
-
-	const totalChunks =
-		documentData && "total_chunks" in documentData
-			? (documentData.total_chunks ?? documentData.chunks.length)
-			: (documentData?.chunks?.length ?? 0);
-	const [beforeChunks, setBeforeChunks] = useState<
-		Array<{ id: number; content: string; created_at: string }>
-	>([]);
-	const [afterChunks, setAfterChunks] = useState<
-		Array<{ id: number; content: string; created_at: string }>
-	>([]);
-	const [loadingBefore, setLoadingBefore] = useState(false);
-	const [loadingAfter, setLoadingAfter] = useState(false);
-
-	useEffect(() => {
-		setBeforeChunks([]);
-		setAfterChunks([]);
-	}, [chunkId, open]);
-
-	const chunkStartIndex =
-		documentData && "chunk_start_index" in documentData ? (documentData.chunk_start_index ?? 0) : 0;
-	const initialChunks = documentData?.chunks ?? [];
-	const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks];
-	const absoluteStart = chunkStartIndex - beforeChunks.length;
-	const absoluteEnd = chunkStartIndex + initialChunks.length + afterChunks.length;
-	const canLoadBefore = absoluteStart > 0;
-	const canLoadAfter = absoluteEnd < totalChunks;
-
-	const EXPAND_SIZE = 10;
-
-	const loadBefore = useCallback(async () => {
-		if (!documentData || !("search_space_id" in documentData) || !canLoadBefore) return;
-		setLoadingBefore(true);
-		try {
-			const count = Math.min(EXPAND_SIZE, absoluteStart);
-			const result = await documentsApiService.getDocumentChunks({
-				document_id: documentData.id,
-				page: 0,
-				page_size: count,
-				start_offset: absoluteStart - count,
-			});
-			const existingIds = new Set(allChunks.map((c) => c.id));
-			const newChunks = result.items
-				.filter((c) => !existingIds.has(c.id))
-				.map((c) => ({ id: c.id, content: c.content, created_at: c.created_at }));
-			setBeforeChunks((prev) => [...newChunks, ...prev]);
-		} catch (err) {
-			console.error("Failed to load earlier chunks:", err);
-		} finally {
-			setLoadingBefore(false);
-		}
-	}, [documentData, absoluteStart, canLoadBefore, allChunks]);
-
-	const loadAfter = useCallback(async () => {
-		if (!documentData || !("search_space_id" in documentData) || !canLoadAfter) return;
-		setLoadingAfter(true);
-		try {
-			const result = await documentsApiService.getDocumentChunks({
-				document_id: documentData.id,
-				page: 0,
-				page_size: EXPAND_SIZE,
-				start_offset: absoluteEnd,
-			});
-			const existingIds = new Set(allChunks.map((c) => c.id));
-			const newChunks = result.items
-				.filter((c) => !existingIds.has(c.id))
-				.map((c) => ({ id: c.id, content: c.content, created_at: c.created_at }));
-			setAfterChunks((prev) => [...prev, ...newChunks]);
-		} catch (err) {
-			console.error("Failed to load later chunks:", err);
-		} finally {
-			setLoadingAfter(false);
-		}
-	}, [documentData, absoluteEnd, canLoadAfter, allChunks]);
-
-	const isDirectRenderSource =
-		sourceType === "TAVILY_API" ||
-		sourceType === "LINKUP_API" ||
-		sourceType === "SEARXNG_API" ||
-		sourceType === "BAIDU_SEARCH_API";
-
-	const citedChunkIndex = allChunks.findIndex((chunk) => chunk.id === chunkId);
-
-	// Simple scroll function that scrolls to a chunk by index
-	const scrollToChunkByIndex = useCallback(
-		(chunkIndex: number, smooth = true) => {
-			const scrollContainer = scrollAreaRef.current;
-			if (!scrollContainer) return;
-
-			const viewport = scrollContainer.querySelector(
-				"[data-radix-scroll-area-viewport]"
-			) as HTMLElement | null;
-			if (!viewport) return;
-
-			const chunkElement = scrollContainer.querySelector(
-				`[data-chunk-index="${chunkIndex}"]`
-			) as HTMLElement | null;
-			if (!chunkElement) return;
-
-			// Get positions using getBoundingClientRect for accuracy
-			const viewportRect = viewport.getBoundingClientRect();
-			const chunkRect = chunkElement.getBoundingClientRect();
-
-			// Calculate where to scroll to center the chunk
-			const currentScrollTop = viewport.scrollTop;
-			const chunkTopRelativeToViewport = chunkRect.top - viewportRect.top + currentScrollTop;
-			const scrollTarget =
-				chunkTopRelativeToViewport - viewportRect.height / 2 + chunkRect.height / 2;
-
-			viewport.scrollTo({
-				top: Math.max(0, scrollTarget),
-				behavior: smooth && !shouldReduceMotion ? "smooth" : "auto",
-			});
-
-			setActiveChunkIndex(chunkIndex);
-		},
-		[shouldReduceMotion]
-	);
-
-	// Callback ref for the cited chunk - scrolls when the element mounts
-	const citedChunkRefCallback = useCallback(
-		(node: HTMLDivElement | null) => {
-			if (node && !hasScrolledRef.current && open) {
-				hasScrolledRef.current = true; // Mark immediately to prevent duplicate scrolls
-
-				// Store the node reference for the delayed scroll
-				const scrollToCitedChunk = () => {
-					const scrollContainer = scrollAreaRef.current;
-					if (!scrollContainer || !node.isConnected) return false;
-
-					const viewport = scrollContainer.querySelector(
-						"[data-radix-scroll-area-viewport]"
-					) as HTMLElement | null;
-					if (!viewport) return false;
-
-					// Get positions
-					const viewportRect = viewport.getBoundingClientRect();
-					const chunkRect = node.getBoundingClientRect();
-
-					// Calculate scroll position to center the chunk
-					const currentScrollTop = viewport.scrollTop;
-					const chunkTopRelativeToViewport = chunkRect.top - viewportRect.top + currentScrollTop;
-					const scrollTarget =
-						chunkTopRelativeToViewport - viewportRect.height / 2 + chunkRect.height / 2;
-
-					viewport.scrollTo({
-						top: Math.max(0, scrollTarget),
-						behavior: "auto", // Instant scroll for initial positioning
-					});
-
-					return true;
-				};
-
-				// Scroll multiple times with delays to handle progressive content rendering
-				// Each subsequent scroll will correct for any layout shifts
-				const scrollAttempts = [50, 150, 300, 600, 1000];
-
-				scrollAttempts.forEach((delay) => {
-					setTimeout(() => {
-						scrollToCitedChunk();
-					}, delay);
-				});
-
-				// After final attempt, mark state as scrolled
-				setTimeout(
-					() => {
-						setHasScrolledToCited(true);
-						setActiveChunkIndex(citedChunkIndex);
-					},
-					scrollAttempts[scrollAttempts.length - 1] + 50
-				);
-			}
-		},
-		[open, citedChunkIndex]
-	);
-
-	// Reset scroll state when panel closes
-	useEffect(() => {
-		if (!open) {
-			hasScrolledRef.current = false;
-			setHasScrolledToCited(false);
-			setActiveChunkIndex(null);
-		}
-	}, [open]);
-
-	// Handle escape key
-	useEffect(() => {
-		const handleEscape = (e: KeyboardEvent) => {
-			if (e.key === "Escape" && open) {
-				onOpenChange(false);
-			}
-		};
-		window.addEventListener("keydown", handleEscape);
-		return () => window.removeEventListener("keydown", handleEscape);
-	}, [open, onOpenChange]);
-
-	// Prevent body scroll when open
-	useEffect(() => {
-		if (open) {
-			document.body.style.overflow = "hidden";
-		} else {
-			document.body.style.overflow = "";
-		}
-		return () => {
-			document.body.style.overflow = "";
-		};
-	}, [open]);
-
-	const handleUrlClick = (e: React.MouseEvent, clickUrl: string) => {
-		e.preventDefault();
-		e.stopPropagation();
-		window.open(clickUrl, "_blank", "noopener,noreferrer");
-	};
-
-	const scrollToChunk = useCallback(
-		(index: number) => {
-			scrollToChunkByIndex(index, true);
-		},
-		[scrollToChunkByIndex]
-	);
-
-	const panelContent = (
-		<AnimatePresence mode="wait">
-			{open && (
-				<>
-					{/* Backdrop */}
-					<motion.div
-						key="backdrop"
-						initial={{ opacity: 0 }}
-						animate={{ opacity: 1 }}
-						exit={{ opacity: 0 }}
-						transition={{ duration: 0.2 }}
-						className="fixed inset-0 z-50 bg-black/60 backdrop-blur-sm"
-						onClick={() => onOpenChange(false)}
-					/>
-
-					{/* Panel */}
-					<motion.div
-						key="panel"
-						initial={shouldReduceMotion ? { opacity: 0 } : { opacity: 0, scale: 0.95, y: 20 }}
-						animate={{ opacity: 1, scale: 1, y: 0 }}
-						exit={shouldReduceMotion ? { opacity: 0 } : { opacity: 0, scale: 0.95, y: 20 }}
-						transition={{
-							type: "spring",
-							damping: 30,
-							stiffness: 300,
-						}}
-						className="fixed inset-3 sm:inset-6 md:inset-10 lg:inset-16 z-50 flex flex-col bg-background rounded-3xl shadow-2xl border overflow-hidden"
-					>
-						{/* Header */}
-						<motion.div
-							initial={{ opacity: 0, y: -10 }}
-							animate={{ opacity: 1, y: 0 }}
-							transition={{ delay: 0.1 }}
-							className="flex items-center justify-between px-6 py-5 border-b bg-linear-to-r from-muted/50 to-muted/30"
-						>
-							<div className="min-w-0 flex-1">
-								<h2 className="text-xl font-semibold truncate">
-									{documentData?.title || title || "Source Document"}
-								</h2>
-								<p className="text-sm text-muted-foreground mt-0.5">
-									{documentData && "document_type" in documentData
-										? formatDocumentType(documentData.document_type)
-										: sourceType && formatDocumentType(sourceType)}
-									{totalChunks > 0 && (
-										<span className="ml-2">
-											• {totalChunks} chunk{totalChunks !== 1 ? "s" : ""}
-											{allChunks.length < totalChunks && ` (showing ${allChunks.length})`}
-										</span>
-									)}
-								</p>
-							</div>
-							<div className="flex items-center gap-3 shrink-0">
-								{url && (
-									<Button
-										size="sm"
-										variant="outline"
-										onClick={(e) => handleUrlClick(e, url)}
-										className="hidden sm:flex gap-2 rounded-xl"
-									>
-										<ExternalLink className="h-4 w-4" />
-										Open Source
-									</Button>
-								)}
-								<Button
-									size="icon"
-									variant="ghost"
-									onClick={() => onOpenChange(false)}
-									className="h-8 w-8 rounded-full"
-								>
-									<X className="h-4 w-4" />
-									<span className="sr-only">Close</span>
-								</Button>
-							</div>
-						</motion.div>
-
-						{/* Loading State */}
-						{!isDirectRenderSource && isDocumentByChunkFetching && (
-							<div className="flex-1 flex items-center justify-center">
-								<motion.div
-									initial={{ opacity: 0, scale: 0.9 }}
-									animate={{ opacity: 1, scale: 1 }}
-									className="flex flex-col items-center gap-4"
-								>
-									<Spinner size="lg" />
-									<p className="text-sm text-muted-foreground font-medium">
-										{t("loading_document")}
-									</p>
-								</motion.div>
-							</div>
-						)}
-
-						{/* Error State */}
-						{!isDirectRenderSource && documentByChunkFetchingError && (
-							<div className="flex-1 flex items-center justify-center">
-								<motion.div
-									initial={{ opacity: 0, scale: 0.9 }}
-									animate={{ opacity: 1, scale: 1 }}
-									className="flex flex-col items-center gap-4 text-center px-6"
-								>
-									<div className="w-20 h-20 rounded-full bg-muted/50 flex items-center justify-center">
-										<FileQuestionMark className="h-10 w-10 text-muted-foreground" />
-									</div>
-									<div>
-										<p className="font-semibold text-foreground text-lg">Document unavailable</p>
-										<p className="text-sm text-muted-foreground mt-2 max-w-md">
-											{documentByChunkFetchingError.message ||
-												"An unexpected error occurred. Please try again."}
-										</p>
-									</div>
-									<Button variant="outline" onClick={() => onOpenChange(false)} className="mt-2">
-										Close Panel
-									</Button>
-								</motion.div>
-							</div>
-						)}
-
-						{/* Direct render for web search providers */}
-						{isDirectRenderSource && (
-							<ScrollArea className="flex-1">
-								<div className="p-6 max-w-3xl mx-auto">
-									{url && (
-										<Button
-											size="default"
-											variant="outline"
-											onClick={(e) => handleUrlClick(e, url)}
-											className="w-full mb-6 sm:hidden rounded-xl"
-										>
-											<ExternalLink className="mr-2 h-4 w-4" />
-											Open in Browser
-										</Button>
-									)}
-									<motion.div
-										initial={{ opacity: 0, y: 10 }}
-										animate={{ opacity: 1, y: 0 }}
-										className="p-6 bg-muted/50 rounded-2xl border"
-									>
-										<h3 className="text-base font-semibold mb-4 flex items-center gap-2">
-											<BookOpen className="h-4 w-4" />
-											Source Information
-										</h3>
-										<div className="text-sm text-muted-foreground mb-3 font-medium">
-											{title || "Untitled"}
-										</div>
-										<div className="text-sm text-foreground leading-relaxed">
-											{description || "No content available"}
-										</div>
-									</motion.div>
-								</div>
-							</ScrollArea>
-						)}
-
-						{/* API-fetched document content */}
-						{!isDirectRenderSource && documentData && (
-							<div className="flex-1 flex overflow-hidden">
-								{/* Chunk Navigation Sidebar */}
-								{allChunks.length > 1 && (
-									<motion.div
-										initial={{ opacity: 0, x: -20 }}
-										animate={{ opacity: 1, x: 0 }}
-										transition={{ delay: 0.2 }}
-										className="hidden lg:flex flex-col w-16 border-r bg-muted/10 overflow-hidden"
-									>
-										<ScrollArea className="flex-1 h-full">
-											<div className="p-2 pt-3 flex flex-col gap-1.5">
-												{allChunks.map((chunk, idx) => {
-													const absNum = absoluteStart + idx + 1;
-													const isCited = chunk.id === chunkId;
-													const isActive = activeChunkIndex === idx;
-													return (
-														<motion.button
-															key={chunk.id}
-															type="button"
-															onClick={() => scrollToChunk(idx)}
-															initial={{ opacity: 0, scale: 0.8 }}
-															animate={{ opacity: 1, scale: 1 }}
-															transition={{ delay: Math.min(idx * 0.02, 0.2) }}
-															className={cn(
-																"relative w-11 h-9 mx-auto rounded-lg text-xs font-semibold transition-all duration-200 flex items-center justify-center",
-																isCited
-																	? "bg-primary text-primary-foreground shadow-md"
-																	: isActive
-																		? "bg-muted text-foreground"
-																		: "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground"
-															)}
-															title={isCited ? `Chunk ${absNum} (Cited)` : `Chunk ${absNum}`}
-														>
-															{absNum}
-															{isCited && (
-																<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
-																	<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
-																</span>
-															)}
-														</motion.button>
-													);
-												})}
-											</div>
-										</ScrollArea>
-									</motion.div>
-								)}
-
-								{/* Main Content */}
-								<ScrollArea className="flex-1" ref={scrollAreaRef}>
-									<div className="p-6 lg:p-8 max-w-4xl mx-auto space-y-6">
-										{/* Document Metadata */}
-										{"document_metadata" in documentData &&
-											documentData.document_metadata &&
-											Object.keys(documentData.document_metadata).length > 0 && (
-												<motion.div
-													initial={{ opacity: 0, y: 10 }}
-													animate={{ opacity: 1, y: 0 }}
-													transition={{ delay: 0.1 }}
-													className="p-5 bg-muted/30 rounded-2xl border"
-												>
-													<h3 className="text-sm font-semibold mb-4 text-muted-foreground uppercase tracking-wider flex items-center gap-2">
-														<FileText className="h-4 w-4" />
-														Document Information
-													</h3>
-													<dl className="grid grid-cols-1 sm:grid-cols-2 gap-4 text-sm">
-														{Object.entries(documentData.document_metadata).map(([key, value]) => (
-															<div key={key} className="space-y-1">
-																<dt className="font-medium text-muted-foreground capitalize text-xs">
-																	{key.replace(/_/g, " ")}
-																</dt>
-																<dd className="text-foreground wrap-break-word">{String(value)}</dd>
-															</div>
-														))}
-													</dl>
-												</motion.div>
-											)}
-
-										{/* Chunks Header */}
-										<div className="flex items-center justify-between pt-2">
-											<h3 className="text-sm font-semibold text-muted-foreground uppercase tracking-wider flex items-center gap-2">
-												<Hash className="h-4 w-4" />
-												Chunks {absoluteStart + 1}–{absoluteEnd} of {totalChunks}
-											</h3>
-											{citedChunkIndex !== -1 && (
-												<Button
-													variant="ghost"
-													size="sm"
-													onClick={() => scrollToChunk(citedChunkIndex)}
-													className="gap-2 text-primary hover:text-primary"
-												>
-													<Sparkles className="h-3.5 w-3.5" />
-													Jump to cited
-												</Button>
-											)}
-										</div>
-
-										{/* Load Earlier */}
-										{canLoadBefore && (
-											<div className="flex items-center justify-center">
-												<Button
-													variant="outline"
-													size="sm"
-													onClick={loadBefore}
-													disabled={loadingBefore}
-													className="gap-2"
-												>
-													{loadingBefore ? (
-														<Loader2 className="h-3.5 w-3.5 animate-spin" />
-													) : (
-														<ChevronUp className="h-3.5 w-3.5" />
-													)}
-													{loadingBefore
-														? "Loading..."
-														: `Load ${Math.min(EXPAND_SIZE, absoluteStart)} earlier chunks`}
-												</Button>
-											</div>
-										)}
-
-										{/* Chunks */}
-										<div className="space-y-4">
-											{allChunks.map((chunk, idx) => {
-												const isCited = chunk.id === chunkId;
-												const chunkNumber = absoluteStart + idx + 1;
-												return (
-													<ChunkCard
-														key={chunk.id}
-														ref={isCited ? citedChunkRefCallback : undefined}
-														chunk={chunk}
-														localIndex={idx}
-														chunkNumber={chunkNumber}
-														totalChunks={totalChunks}
-														isCited={isCited}
-														isActive={activeChunkIndex === idx}
-														disableLayoutAnimation={allChunks.length > 30}
-													/>
-												);
-											})}
-										</div>
-
-										{/* Load Later */}
-										{canLoadAfter && (
-											<div className="flex items-center justify-center py-3">
-												<Button
-													variant="outline"
-													size="sm"
-													onClick={loadAfter}
-													disabled={loadingAfter}
-													className="gap-2"
-												>
-													{loadingAfter ? (
-														<Loader2 className="h-3.5 w-3.5 animate-spin" />
-													) : (
-														<ChevronDown className="h-3.5 w-3.5" />
-													)}
-													{loadingAfter
-														? "Loading..."
-														: `Load ${Math.min(EXPAND_SIZE, totalChunks - absoluteEnd)} later chunks`}
-												</Button>
-											</div>
-										)}
-									</div>
-								</ScrollArea>
-							</div>
-						)}
-					</motion.div>
-				</>
-			)}
-		</AnimatePresence>
-	);
-
-	if (!mounted) return <>{children}</>;
-
-	return (
-		<>
-			{children}
-			{createPortal(panelContent, globalThis.document.body)}
-		</>
-	);
-}
diff --git a/surfsense_web/components/onboarding-tour.tsx b/surfsense_web/components/onboarding-tour.tsx
index 178b6c97e8..a3c530ac05 100644
--- a/surfsense_web/components/onboarding-tour.tsx
+++ b/surfsense_web/components/onboarding-tour.tsx
@@ -9,6 +9,7 @@ import { createPortal } from "react-dom";
 import { connectorsAtom } from "@/atoms/connectors/connector-query.atoms";
 import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
 import { currentUserAtom } from "@/atoms/user/user-query.atoms";
+import { Button } from "@/components/ui/button";
 import { useIsMobile } from "@/hooks/use-mobile";
 import { useZeroDocumentTypeCounts } from "@/hooks/use-zero-document-type-counts";
 import { fetchThreads } from "@/lib/chat/thread-persistence";
@@ -178,10 +179,6 @@ function TourTooltip({
 	const isLastStep = stepIndex === totalSteps - 1;
 	const isFirstStep = stepIndex === 0;
 
-	const bgColor = isDarkMode ? "#27272a" : "#ffffff";
-	const textColor = isDarkMode ? "#ffffff" : "#18181b";
-	const mutedTextColor = isDarkMode ? "#a1a1aa" : "#71717a";
-
 	// Calculate pointer line position
 	const getPointerStyles = (): React.CSSProperties => {
 		const lineLength = 16;
@@ -224,68 +221,27 @@ function TourTooltip({
 	};
 
 	const renderPointer = () => {
-		const lineColor = isDarkMode ? "#27272a" : "#ffffff";
-
 		if (position.pointerPosition === "left") {
 			return (
 				<div style={getPointerStyles()}>
-					<div
-						style={{
-							width: 6,
-							height: 6,
-							borderRadius: "50%",
-							backgroundColor: lineColor,
-						}}
-					/>
-					<div
-						style={{
-							width: 16,
-							height: 2,
-							backgroundColor: lineColor,
-						}}
-					/>
+					<div className="size-1.5 rounded-full bg-main-panel" />
+					<div className="h-0.5 w-4 bg-main-panel" />
 				</div>
 			);
 		}
 		if (position.pointerPosition === "right") {
 			return (
 				<div style={getPointerStyles()}>
-					<div
-						style={{
-							width: 16,
-							height: 2,
-							backgroundColor: lineColor,
-						}}
-					/>
-					<div
-						style={{
-							width: 6,
-							height: 6,
-							borderRadius: "50%",
-							backgroundColor: lineColor,
-						}}
-					/>
+					<div className="h-0.5 w-4 bg-main-panel" />
+					<div className="size-1.5 rounded-full bg-main-panel" />
 				</div>
 			);
 		}
 		if (position.pointerPosition === "top") {
 			return (
 				<div style={getPointerStyles()}>
-					<div
-						style={{
-							width: 6,
-							height: 6,
-							borderRadius: "50%",
-							backgroundColor: lineColor,
-						}}
-					/>
-					<div
-						style={{
-							width: 2,
-							height: 16,
-							backgroundColor: lineColor,
-						}}
-					/>
+					<div className="size-1.5 rounded-full bg-main-panel" />
+					<div className="h-4 w-0.5 bg-main-panel" />
 				</div>
 			);
 		}
@@ -338,10 +294,8 @@ function TourTooltip({
 			{renderPointer()}
 
 			<div
-				className="relative rounded-lg p-4"
+				className="relative rounded-lg bg-main-panel p-4 text-foreground"
 				style={{
-					backgroundColor: bgColor,
-					color: textColor,
 					boxShadow: isDarkMode
 						? "0 25px 50px -12px rgba(0, 0, 0, 0.5)"
 						: "0 25px 50px -12px rgba(0, 0, 0, 0.15)",
@@ -355,12 +309,10 @@ function TourTooltip({
 					}}
 					onAnimationEnd={onAnimationEnd}
 				>
-					<h3 id="tour-title" className="text-sm font-semibold mb-1.5" style={{ color: textColor }}>
+					<h3 id="tour-title" className="mb-1.5 text-sm font-semibold">
 						{step.title}
 					</h3>
-					<p className="text-sm leading-relaxed" style={{ color: mutedTextColor }}>
-						{step.content}
-					</p>
+					<p className="text-sm leading-relaxed text-muted-foreground">{step.content}</p>
 				</div>
 
 				{/* Footer */}
@@ -371,42 +323,45 @@ function TourTooltip({
 					{/* Navigation buttons */}
 					<div className="flex items-center gap-3">
 						{!isFirstStep && (
-							<button
+							<Button
 								type="button"
+								variant="ghost"
+								size="sm"
 								onClick={(e) => {
 									e.stopPropagation();
 									onPrev();
 								}}
-								className="text-sm font-medium transition-opacity hover:opacity-80"
-								style={{ color: mutedTextColor }}
+								className="h-auto px-0 py-0 text-sm text-muted-foreground hover:bg-transparent hover:text-muted-foreground hover:opacity-80"
 							>
 								Back
-							</button>
+							</Button>
 						)}
 						{isFirstStep && (
-							<button
+							<Button
 								type="button"
+								variant="ghost"
+								size="sm"
 								onClick={(e) => {
 									e.stopPropagation();
 									onSkip();
 								}}
-								className="text-sm font-medium transition-opacity hover:opacity-80"
-								style={{ color: mutedTextColor }}
+								className="h-auto px-0 py-0 text-sm text-muted-foreground hover:bg-transparent hover:text-muted-foreground hover:opacity-80"
 							>
 								Skip
-							</button>
+							</Button>
 						)}
-						<button
+						<Button
 							type="button"
+							variant="ghost"
+							size="sm"
 							onClick={(e) => {
 								e.stopPropagation();
 								onNext();
 							}}
-							className="text-sm font-medium transition-opacity hover:opacity-80"
-							style={{ color: textColor }}
+							className="h-auto px-0 py-0 text-sm text-foreground hover:bg-transparent hover:text-foreground hover:opacity-80"
 						>
 							{isLastStep ? "Done" : "Next"}
-						</button>
+						</Button>
 					</div>
 				</div>
 			</div>
@@ -766,9 +721,10 @@ export function OnboardingTour() {
 			`}</style>
 			<div className="fixed inset-0 z-[99995]">
 				{/* Clickable backdrop to close */}
-				<button
+				<Button
 					type="button"
-					className="fixed inset-0 w-full h-full bg-transparent border-0 cursor-default"
+					variant="ghost"
+					className="fixed inset-0 h-full w-full rounded-none border-0 bg-transparent p-0 hover:bg-transparent cursor-default"
 					onClick={handleOverlayClick}
 					aria-label="Close tour"
 				/>
diff --git a/surfsense_web/components/pricing.tsx b/surfsense_web/components/pricing.tsx
index 9a8e29c327..1b0947e3a7 100644
--- a/surfsense_web/components/pricing.tsx
+++ b/surfsense_web/components/pricing.tsx
@@ -72,7 +72,7 @@ export function Pricing({
 	return (
 		<div className="container mx-auto py-20">
 			<div className="text-center space-y-4 mb-12">
-				<h2 className="text-4xl font-bold tracking-tight sm:text-5xl">{title}</h2>
+				<h1 className="text-4xl font-bold tracking-tight sm:text-5xl">{title}</h1>
 				<p className="text-muted-foreground text-lg whitespace-pre-line">{description}</p>
 			</div>
 
diff --git a/surfsense_web/components/pricing/pricing-section.tsx b/surfsense_web/components/pricing/pricing-section.tsx
index ceea176a93..b2eef27787 100644
--- a/surfsense_web/components/pricing/pricing-section.tsx
+++ b/surfsense_web/components/pricing/pricing-section.tsx
@@ -1,6 +1,11 @@
 "use client";
-
+import { IconPlus } from "@tabler/icons-react";
+import { AnimatePresence, motion } from "motion/react";
+import type React from "react";
+import { useEffect, useRef, useState } from "react";
 import { Pricing } from "@/components/pricing";
+import { Button } from "@/components/ui/button";
+import { cn } from "@/lib/utils";
 
 const demoPlans = [
 	{
@@ -8,11 +13,11 @@ const demoPlans = [
 		price: "0",
 		yearlyPrice: "0",
 		period: "",
-		billingText: "500 pages included",
+		billingText: "500 pages + $5 in premium credits included",
 		features: [
 			"Self Hostable",
 			"500 pages included to start",
-			"Earn up to 3,000+ bonus pages for free",
+			"$5 in premium credits for paid AI models and premium AI features",
 			"Includes access to OpenAI text, audio and image models",
 			"Realtime Collaborative Group Chats with teammates",
 			"Community support on Discord",
@@ -26,11 +31,12 @@ const demoPlans = [
 		name: "PAY AS YOU GO",
 		price: "1",
 		yearlyPrice: "1",
-		period: "1,000 pages",
+		period: "pack",
 		billingText: "No subscription, buy only when you need more",
 		features: [
 			"Everything in Free",
-			"Buy 1,000-page packs at $1 each",
+			"Buy 1,000-page packs or $1 in premium credits at $1 each",
+			"Use premium AI models like GPT-5.4, Claude Sonnet 4.6, Gemini 2.5 Pro & 100+ more via OpenRouter",
 			"Priority support on Discord",
 		],
 		description: "",
@@ -61,14 +67,285 @@ const demoPlans = [
 	},
 ];
 
-function PricingBasic() {
+interface FAQItem {
+	question: string;
+	answer: string;
+}
+
+interface FAQSection {
+	title: string;
+	items: FAQItem[];
+}
+
+const faqData: FAQSection[] = [
+	{
+		title: "Pages & Document Billing",
+		items: [
+			{
+				question: 'What exactly is a "page" in SurfSense?',
+				answer:
+					"A page is a simple billing unit that measures how much content you add to your knowledge base. For PDFs, one page equals one real PDF page. For other document types like Word, PowerPoint, and Excel files, pages are automatically estimated based on the file. Every file uses at least 1 page.",
+			},
+			{
+				question: "What are Basic and Premium processing modes?",
+				answer:
+					"When uploading documents, you can choose between two processing modes. Basic mode uses standard extraction and costs 1 page credit per page, great for most documents. Premium processing mode uses advanced extraction optimized for complex financial, medical, and legal documents with intricate tables, layouts, and formatting. It costs 10 page credits per page and does not use your premium AI credits.",
+			},
+			{
+				question: "How does the Pay As You Go plan work?",
+				answer:
+					"There's no monthly subscription. When you need more pages, simply purchase 1,000-page packs at $1 each. Purchased pages are added to your account immediately so you can keep indexing right away. You only pay when you actually need more.",
+			},
+			{
+				question: "What happens if I run out of pages?",
+				answer:
+					"SurfSense checks your remaining pages before processing each file. If you don't have enough, the upload is paused and you'll be notified. You can purchase additional page packs at any time to continue. For cloud connector syncs, a small overage may be allowed so your sync doesn't partially fail.",
+			},
+			{
+				question: "If I delete a document, do I get my pages back?",
+				answer:
+					"No. Deleting a document removes it from your knowledge base, but the pages it used are not refunded. Pages track your total usage over time, not how much is currently stored. So be mindful of what you index. Once pages are spent, they're spent even if you later remove the document.",
+			},
+		],
+	},
+	{
+		title: "File Types & Connectors",
+		items: [
+			{
+				question: "Which file types count toward my page limit?",
+				answer:
+					"Page limits only apply to document files that need processing, including PDFs, Word documents (DOC, DOCX, ODT, RTF), presentations (PPT, PPTX, ODP), spreadsheets (XLS, XLSX, ODS), ebooks (EPUB), and images (JPG, PNG, TIFF, WebP, BMP). Plain text files, code files, Markdown, CSV, TSV, HTML, audio, and video files do not consume any pages.",
+			},
+			{
+				question: "How are pages consumed?",
+				answer:
+					"Pages are deducted whenever a document file is successfully indexed into your knowledge base, whether through direct uploads or file-based connector syncs (Google Drive, OneDrive, Dropbox, Local Folder). In Basic mode, each page costs 1 page credit; in Premium mode, each page costs 10 page credits. SurfSense checks your remaining credits before processing and only charges you after the file is indexed. Duplicate documents are automatically detected and won't cost you extra pages.",
+			},
+			{
+				question: "Do connectors like Slack, Notion, or Gmail use pages?",
+				answer:
+					"No. Connectors that work with structured text data like Slack, Discord, Notion, Confluence, Jira, Linear, ClickUp, GitHub, Gmail, Google Calendar, Microsoft Teams, Airtable, Elasticsearch, Web Crawler, BookStack, Obsidian, and Luma do not use pages at all. Page limits only apply to file-based connectors that need document processing, such as Google Drive, OneDrive, Dropbox, and Local Folder syncs.",
+			},
+		],
+	},
+	{
+		title: "Premium Credits",
+		items: [
+			{
+				question: 'What are "premium credits"?',
+				answer:
+					"Premium credits are your USD balance for paid AI usage in SurfSense, including premium AI models like GPT-5.4, Claude Sonnet 4.6, and Gemini 2.5 Pro, plus premium AI features such as image generation, podcasts, and video presentations when they use paid models. Each request debits the actual USD provider cost, so cheaper and more expensive models bill proportionally.",
+			},
+			{
+				question: "How many premium credits do I get for free?",
+				answer:
+					"Every registered SurfSense account starts with $5 in premium credits at no cost. Anonymous users (no login) get 500,000 free tokens across free models before creating an account. Once your included premium credits run out, you can top up at any time.",
+			},
+			{
+				question: "How does buying premium credits work?",
+				answer:
+					"Premium credit top-ups are pay as you go, with no subscription. $1 buys $1 of credit, and your balance is spent at provider cost. Purchased credit is added to your account immediately. You can buy up to $100 at a time.",
+			},
+			{
+				question: "Are premium credits the same as page credits?",
+				answer:
+					"No. Page credits pay for document indexing and file-based connector processing. Premium credits pay for paid AI usage, such as premium model chats and premium AI generation features. Premium document processing mode sounds similar, but it consumes page credits, not premium credits.",
+			},
+			{
+				question: "What happens if I run out of premium credits?",
+				answer:
+					"When your premium credit balance runs low, you'll see a warning. Once you run out, paid model requests and premium AI features are paused until you top up. You can still use non-premium models and features that do not consume premium credits.",
+			},
+		],
+	},
+	{
+		title: "Self-Hosting",
+		items: [
+			{
+				question: "Can I self-host SurfSense with unlimited pages and credit?",
+				answer:
+					"Yes! When self-hosting, you have full control over your page and premium credit limits. The default self-hosted setup gives you effectively unlimited pages and premium credits, so you can index as much data and use as many AI queries as your infrastructure supports.",
+			},
+		],
+	},
+];
+
+const GridLineHorizontal = ({ className, offset }: { className?: string; offset?: string }) => {
+	return (
+		<div
+			style={
+				{
+					"--background": "#ffffff",
+					"--color": "rgba(0, 0, 0, 0.2)",
+					"--height": "1px",
+					"--width": "5px",
+					"--fade-stop": "90%",
+					"--offset": offset || "200px",
+					"--color-dark": "rgba(255, 255, 255, 0.2)",
+					maskComposite: "exclude",
+				} as React.CSSProperties
+			}
+			className={cn(
+				"[--background:var(--color-neutral-200)] [--color:var(--color-neutral-400)] dark:[--background:var(--color-neutral-800)] dark:[--color:var(--color-neutral-600)]",
+				"absolute left-[calc(var(--offset)/2*-1)] h-(--height) w-[calc(100%+var(--offset))]",
+				"bg-[linear-gradient(to_right,var(--color),var(--color)_50%,transparent_0,transparent)]",
+				"bg-size-[var(--width)_var(--height)]",
+				"[mask:linear-gradient(to_left,var(--background)_var(--fade-stop),transparent),linear-gradient(to_right,var(--background)_var(--fade-stop),transparent),linear-gradient(black,black)]",
+				"mask-exclude",
+				"z-30",
+				"dark:bg-[linear-gradient(to_right,var(--color-dark),var(--color-dark)_50%,transparent_0,transparent)]",
+				className
+			)}
+		/>
+	);
+};
+
+const GridLineVertical = ({ className, offset }: { className?: string; offset?: string }) => {
 	return (
-		<Pricing
-			plans={demoPlans}
-			title="SurfSense Pricing"
-			description="Start free with 500 pages and pay as you go."
+		<div
+			style={
+				{
+					"--background": "#ffffff",
+					"--color": "rgba(0, 0, 0, 0.2)",
+					"--height": "5px",
+					"--width": "1px",
+					"--fade-stop": "90%",
+					"--offset": offset || "150px",
+					"--color-dark": "rgba(255, 255, 255, 0.2)",
+					maskComposite: "exclude",
+				} as React.CSSProperties
+			}
+			className={cn(
+				"absolute top-[calc(var(--offset)/2*-1)] h-[calc(100%+var(--offset))] w-(--width)",
+				"bg-[linear-gradient(to_bottom,var(--color),var(--color)_50%,transparent_0,transparent)]",
+				"bg-size-[var(--width)_var(--height)]",
+				"[mask:linear-gradient(to_top,var(--background)_var(--fade-stop),transparent),linear-gradient(to_bottom,var(--background)_var(--fade-stop),transparent),linear-gradient(black,black)]",
+				"mask-exclude",
+				"z-30",
+				"dark:bg-[linear-gradient(to_bottom,var(--color-dark),var(--color-dark)_50%,transparent_0,transparent)]",
+				className
+			)}
 		/>
 	);
+};
+
+function PricingFAQ() {
+	const [activeId, setActiveId] = useState<string | null>(null);
+	const containerRef = useRef<HTMLDivElement>(null);
+
+	useEffect(() => {
+		function handleClickOutside(event: MouseEvent) {
+			if (containerRef.current && !containerRef.current.contains(event.target as Node)) {
+				setActiveId(null);
+			}
+		}
+
+		document.addEventListener("mousedown", handleClickOutside);
+		return () => document.removeEventListener("mousedown", handleClickOutside);
+	}, []);
+
+	const toggleQuestion = (id: string) => {
+		setActiveId(activeId === id ? null : id);
+	};
+
+	return (
+		<div className="mx-auto w-full max-w-4xl overflow-hidden px-4 py-20 md:px-8 md:py-32">
+			<div className="text-center">
+				<h2 className="text-4xl font-bold tracking-tight sm:text-5xl">
+					Frequently Asked Questions
+				</h2>
+				<p className="mx-auto mt-4 max-w-2xl text-lg text-muted-foreground">
+					Everything you need to know about SurfSense pages, premium credits, and billing.
+					Can&apos;t find what you need? Reach out at{" "}
+					<a href="mailto:rohan@surfsense.com" className="text-blue-500 underline">
+						rohan@surfsense.com
+					</a>
+				</p>
+			</div>
+
+			<div ref={containerRef} className="relative mt-16 flex w-full flex-col gap-12 px-4 md:px-8">
+				{faqData.map((section) => (
+					<div key={`${section.title}faq`}>
+						<h3 className="mb-6 text-lg font-medium text-neutral-800 dark:text-neutral-200">
+							{section.title}
+						</h3>
+						<div className="flex flex-col gap-3">
+							{section.items.map((item, index) => {
+								const id = `${section.title}-${index}`;
+								const isActive = activeId === id;
+
+								return (
+									<div
+										key={`${id}faq-item`}
+										className={cn(
+											"relative rounded-lg transition-all duration-200",
+											isActive
+												? "bg-white shadow-sm ring-1 shadow-black/10 ring-black/10 dark:bg-neutral-900 dark:shadow-white/5 dark:ring-white/10"
+												: "hover:bg-neutral-50 dark:hover:bg-neutral-900"
+										)}
+									>
+										{isActive && (
+											<div className="absolute inset-0">
+												<GridLineHorizontal className="-top-[2px]" offset="100px" />
+												<GridLineHorizontal className="-bottom-[2px]" offset="100px" />
+												<GridLineVertical className="-left-[2px]" offset="100px" />
+												<GridLineVertical className="-right-[2px] left-auto" offset="100px" />
+											</div>
+										)}
+										<Button
+											variant="ghost"
+											type="button"
+											onClick={() => toggleQuestion(id)}
+											className="h-auto w-full justify-between rounded-lg px-4 py-4 text-left hover:bg-transparent"
+										>
+											<span className="text-sm font-medium text-neutral-700 md:text-base dark:text-neutral-300">
+												{item.question}
+											</span>
+											<motion.div
+												animate={{ rotate: isActive ? 45 : 0 }}
+												transition={{ duration: 0.2 }}
+												className="ml-4 shrink-0"
+											>
+												<IconPlus className="size-5 text-neutral-500 dark:text-neutral-400" />
+											</motion.div>
+										</Button>
+										<AnimatePresence initial={false}>
+											{isActive && (
+												<motion.div
+													initial={{ height: 0, opacity: 0 }}
+													animate={{ height: "auto", opacity: 1 }}
+													exit={{ height: 0, opacity: 0 }}
+													transition={{ duration: 0.15, ease: "easeInOut" }}
+													className="relative"
+												>
+													<p className="max-w-[90%] px-4 pb-4 text-sm text-neutral-600 dark:text-neutral-400">
+														{item.answer}
+													</p>
+												</motion.div>
+											)}
+										</AnimatePresence>
+									</div>
+								);
+							})}
+						</div>
+					</div>
+				))}
+			</div>
+		</div>
+	);
+}
+
+function PricingBasic() {
+	return (
+		<>
+			<Pricing
+				plans={demoPlans}
+				title="SurfSense Pricing"
+				description="Start free with 500 pages & $5 in premium credits. Pay as you go."
+			/>
+			<PricingFAQ />
+		</>
+	);
 }
 
 export default PricingBasic;
diff --git a/surfsense_web/components/prompt-kit/chain-of-thought.tsx b/surfsense_web/components/prompt-kit/chain-of-thought.tsx
index ca9698b31a..3ed0b6677a 100644
--- a/surfsense_web/components/prompt-kit/chain-of-thought.tsx
+++ b/surfsense_web/components/prompt-kit/chain-of-thought.tsx
@@ -203,7 +203,7 @@ export const ChainOfThoughtTrigger: React.FC<ChainOfThoughtTriggerProps> = ({
 }) => (
 	<CollapsibleTrigger
 		className={cn(
-			"group text-muted-foreground hover:text-foreground flex cursor-pointer items-center justify-start gap-1 text-left text-sm transition-colors",
+			"group text-muted-foreground hover:text-accent-foreground flex cursor-pointer items-center justify-start gap-1 text-left text-sm transition-colors",
 			className
 		)}
 		{...props}
diff --git a/surfsense_web/components/providers/GlobalLoadingProvider.tsx b/surfsense_web/components/providers/GlobalLoadingProvider.tsx
index 08c8889543..c0fa8e4d37 100644
--- a/surfsense_web/components/providers/GlobalLoadingProvider.tsx
+++ b/surfsense_web/components/providers/GlobalLoadingProvider.tsx
@@ -34,7 +34,7 @@ export function GlobalLoadingProvider({ children }: { children: React.ReactNode
 			)}
 			aria-hidden={!isLoading}
 		>
-			<div className="flex min-h-screen flex-col items-center justify-center bg-background">
+			<div className="flex min-h-screen flex-col items-center justify-center bg-main-panel">
 				<div className="h-12 w-12 flex items-center justify-center">
 					<Spinner size="lg" className="text-muted-foreground" />
 				</div>
diff --git a/surfsense_web/components/providers/ZeroProvider.tsx b/surfsense_web/components/providers/ZeroProvider.tsx
index c97eac0726..5bb43db995 100644
--- a/surfsense_web/components/providers/ZeroProvider.tsx
+++ b/surfsense_web/components/providers/ZeroProvider.tsx
@@ -6,7 +6,7 @@ import {
 	ZeroProvider as ZeroReactProvider,
 } from "@rocicorp/zero/react";
 import { useAtomValue } from "jotai";
-import { useEffect, useRef } from "react";
+import { useEffect, useMemo, useRef } from "react";
 import { currentUserAtom } from "@/atoms/user/user-query.atoms";
 import { getBearerToken, handleUnauthorized, refreshAccessToken } from "@/lib/auth-utils";
 import { queries } from "@/zero/queries";
@@ -43,19 +43,30 @@ function ZeroAuthSync() {
 export function ZeroProvider({ children }: { children: React.ReactNode }) {
 	const { data: user } = useAtomValue(currentUserAtom);
 
-	const hasUser = !!user?.id;
-	const userID = hasUser ? String(user.id) : "anon";
-	const context = hasUser ? { userId: String(user.id) } : undefined;
+	const userId = user?.id;
+	const hasUser = !!userId;
+	const userID = hasUser ? String(userId) : "anon";
+	// getBearerToken() returns a string (a primitive), so it's safe to read
+	// on every render — reference equality holds as long as the token is
+	// unchanged, which keeps the memoized `opts` below stable.
 	const auth = hasUser ? getBearerToken() || undefined : undefined;
 
-	const opts = {
-		userID,
-		schema,
-		queries,
-		context,
-		cacheURL,
-		auth,
-	};
+	const context = useMemo(
+		() => (hasUser ? { userId: String(userId) } : undefined),
+		[hasUser, userId]
+	);
+
+	const opts = useMemo(
+		() => ({
+			userID,
+			schema,
+			queries,
+			context,
+			cacheURL,
+			auth,
+		}),
+		[userID, context, auth]
+	);
 
 	return (
 		<ZeroReactProvider {...opts}>
diff --git a/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx b/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx
index 4bb49c48d4..99a9ad0e9a 100644
--- a/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx
+++ b/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx
@@ -1,14 +1,23 @@
 "use client";
 
-import { Check, Copy, Dot, ExternalLink, MessageSquare, Trash2 } from "lucide-react";
-import { useCallback, useRef, useState } from "react";
+import { Copy, Dot, ExternalLink, MessageSquare, MoreHorizontal, Trash2 } from "lucide-react";
+import { useCallback, useState } from "react";
+import { toast } from "sonner";
 import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar";
 import { Badge } from "@/components/ui/badge";
 import { Button } from "@/components/ui/button";
 import { Card, CardContent } from "@/components/ui/card";
+import {
+	DropdownMenu,
+	DropdownMenuContent,
+	DropdownMenuItem,
+	DropdownMenuTrigger,
+} from "@/components/ui/dropdown-menu";
+import { Separator } from "@/components/ui/separator";
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip";
 import type { PublicChatSnapshotDetail } from "@/contracts/types/chat-threads.types";
 import { useMediaQuery } from "@/hooks/use-media-query";
+import { cn } from "@/lib/utils";
 
 function getInitials(name: string): string {
 	const parts = name.trim().split(/\s+/);
@@ -35,15 +44,12 @@ export function PublicChatSnapshotRow({
 	isDeleting = false,
 	memberMap,
 }: PublicChatSnapshotRowProps) {
-	const [copied, setCopied] = useState(false);
-	const copyTimeoutRef = useRef<ReturnType<typeof setTimeout>>(null);
+	const [dropdownOpen, setDropdownOpen] = useState(false);
 	const isDesktop = useMediaQuery("(min-width: 768px)");
 
 	const handleCopyClick = useCallback(() => {
 		onCopy(snapshot);
-		setCopied(true);
-		if (copyTimeoutRef.current) clearTimeout(copyTimeoutRef.current);
-		copyTimeoutRef.current = setTimeout(() => setCopied(false), 2000);
+		toast.success("Link copied to clipboard");
 	}, [onCopy, snapshot]);
 
 	const formattedDate = new Date(snapshot.created_at).toLocaleDateString(undefined, {
@@ -55,127 +61,100 @@ export function PublicChatSnapshotRow({
 	const member = snapshot.created_by_user_id ? memberMap.get(snapshot.created_by_user_id) : null;
 
 	return (
-		<Card className="group relative overflow-hidden transition-all duration-200 border-border/60 hover:shadow-md h-full">
-			<CardContent className="p-4 flex flex-col gap-3 h-full">
+		<Card className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full">
+			<CardContent className="p-4 flex flex-col gap-3 h-full min-h-32">
 				{/* Header: Title + Actions */}
-				<div className="relative">
-					<div className="min-w-0 pr-16 sm:pr-0 sm:group-hover:pr-16">
-						<h4
-							className="text-sm font-semibold tracking-tight truncate"
-							title={snapshot.thread_title}
-						>
-							{snapshot.thread_title}
-						</h4>
-					</div>
-					<div className="flex items-center gap-0.5 shrink-0 sm:hidden sm:group-hover:flex absolute right-0 top-0">
-						<TooltipProvider>
-							<Tooltip open={isDesktop ? undefined : false}>
-								<TooltipTrigger asChild>
-									<Button
-										variant="ghost"
-										size="icon"
-										asChild
-										className="h-7 w-7 text-muted-foreground hover:text-foreground"
-									>
-										<a href={snapshot.public_url} target="_blank" rel="noopener noreferrer">
-											<ExternalLink className="h-3 w-3" />
-										</a>
-									</Button>
-								</TooltipTrigger>
-								<TooltipContent>Open link</TooltipContent>
-							</Tooltip>
-						</TooltipProvider>
-						{canDelete && (
-							<TooltipProvider>
-								<Tooltip open={isDesktop ? undefined : false}>
-									<TooltipTrigger asChild>
-										<Button
-											variant="ghost"
-											size="icon"
-											onClick={() => onDelete(snapshot)}
-											disabled={isDeleting}
-											className="h-7 w-7 text-muted-foreground hover:text-destructive"
-										>
-											<Trash2 className="h-3 w-3" />
-										</Button>
-									</TooltipTrigger>
-									<TooltipContent>Delete</TooltipContent>
-								</Tooltip>
-							</TooltipProvider>
+				<div className="relative flex items-center">
+					<h4
+						className={cn(
+							"text-sm font-semibold tracking-tight truncate",
+							dropdownOpen ? "pr-8" : "sm:group-hover:pr-8"
 						)}
-					</div>
+						title={snapshot.thread_title}
+					>
+						{snapshot.thread_title}
+					</h4>
+					<DropdownMenu open={dropdownOpen} onOpenChange={setDropdownOpen}>
+						<DropdownMenuTrigger asChild>
+							<Button
+								variant="ghost"
+								size="icon"
+								className={cn(
+									"absolute right-0 h-6 w-6 shrink-0",
+									"hover:bg-accent",
+									dropdownOpen
+										? "opacity-100 bg-accent hover:bg-accent"
+										: "sm:opacity-0 sm:group-hover:opacity-100"
+								)}
+							>
+								<MoreHorizontal className="h-3.5 w-3.5 text-muted-foreground" />
+							</Button>
+						</DropdownMenuTrigger>
+						<DropdownMenuContent align="end" className="w-40">
+							<DropdownMenuItem onClick={handleCopyClick}>
+								<Copy className="mr-2 h-4 w-4" />
+								Copy link
+							</DropdownMenuItem>
+							<DropdownMenuItem asChild>
+								<a href={snapshot.public_url} target="_blank" rel="noopener noreferrer">
+									<ExternalLink className="mr-2 h-4 w-4" />
+									Open link
+								</a>
+							</DropdownMenuItem>
+							{canDelete && (
+								<DropdownMenuItem onClick={() => onDelete(snapshot)} disabled={isDeleting}>
+									<Trash2 className="mr-2 h-4 w-4" />
+									Delete
+								</DropdownMenuItem>
+							)}
+						</DropdownMenuContent>
+					</DropdownMenu>
 				</div>
 
 				{/* Message count badge */}
 				<div className="flex items-center gap-1.5">
 					<Badge
-						variant="outline"
-						className="text-[10px] px-1.5 py-0.5 border-muted-foreground/20 text-muted-foreground"
+						variant="secondary"
+						className="text-[10px] px-1.5 py-0.5 border-0 text-muted-foreground bg-muted"
 					>
 						<MessageSquare className="h-2.5 w-2.5 mr-1" />
 						{snapshot.message_count} messages
 					</Badge>
 				</div>
 
-				{/* Public URL – selectable fallback for manual copy */}
-				<div className="flex items-center gap-2 rounded-md border border-border/60 bg-muted/30 px-2.5 py-1.5">
-					<div className="min-w-0 flex-1 overflow-x-auto scrollbar-hide">
-						<p
-							className="text-[10px] font-mono text-muted-foreground whitespace-nowrap select-all cursor-text"
-							title={snapshot.public_url}
-						>
-							{snapshot.public_url}
-						</p>
-					</div>
-					<TooltipProvider>
-						<Tooltip open={isDesktop ? undefined : false}>
-							<TooltipTrigger asChild>
-								<Button
-									variant="ghost"
-									size="icon"
-									onClick={handleCopyClick}
-									className="h-6 w-6 shrink-0 text-muted-foreground hover:text-foreground"
-								>
-									{copied ? (
-										<Check className="h-3 w-3 text-green-500" />
-									) : (
-										<Copy className="h-3 w-3" />
-									)}
-								</Button>
-							</TooltipTrigger>
-							<TooltipContent>{copied ? "Copied!" : "Copy link"}</TooltipContent>
-						</Tooltip>
-					</TooltipProvider>
-				</div>
-
 				{/* Footer: Date + Creator */}
-				<div className="flex items-center gap-2 pt-2 border-t border-border/40 mt-auto">
-					<span className="text-[11px] text-muted-foreground/60">{formattedDate}</span>
-					{member && (
-						<>
-							<Dot className="h-4 w-4 text-muted-foreground/30" />
-							<TooltipProvider>
-								<Tooltip open={isDesktop ? undefined : false}>
-									<TooltipTrigger asChild>
-										<div className="flex items-center gap-1.5 cursor-default">
-											<Avatar className="size-4.5 shrink-0">
-												{member.avatarUrl && (
-													<AvatarImage src={member.avatarUrl} alt={member.name} />
-												)}
-												<AvatarFallback className="text-[9px]">
-													{getInitials(member.name)}
-												</AvatarFallback>
-											</Avatar>
-											<span className="text-[11px] text-muted-foreground/60 truncate max-w-[120px]">
-												{member.name}
-											</span>
-										</div>
-									</TooltipTrigger>
-									<TooltipContent side="bottom">{member.email || member.name}</TooltipContent>
-								</Tooltip>
-							</TooltipProvider>
-						</>
-					)}
+				<div className="mt-auto space-y-2">
+					<Separator className="bg-accent" />
+					<div className="flex items-center">
+						<span className="shrink-0 text-[11px] text-muted-foreground/60 whitespace-nowrap">
+							{formattedDate}
+						</span>
+						{member && (
+							<>
+								<Dot className="h-4 w-4 text-muted-foreground/30 shrink-0" />
+								<TooltipProvider>
+									<Tooltip open={isDesktop ? undefined : false}>
+										<TooltipTrigger asChild>
+											<div className="min-w-0 flex items-center gap-1.5 cursor-default">
+												<Avatar className="size-4.5 shrink-0">
+													{member.avatarUrl && (
+														<AvatarImage src={member.avatarUrl} alt={member.name} />
+													)}
+													<AvatarFallback className="text-[9px]">
+														{getInitials(member.name)}
+													</AvatarFallback>
+												</Avatar>
+												<span className="text-[11px] text-muted-foreground/60 truncate">
+													{member.name}
+												</span>
+											</div>
+										</TooltipTrigger>
+										<TooltipContent side="bottom">{member.email || member.name}</TooltipContent>
+									</Tooltip>
+								</TooltipProvider>
+							</>
+						)}
+					</div>
 				</div>
 			</CardContent>
 		</Card>
diff --git a/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-manager.tsx b/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-manager.tsx
index 5dd4ca17ad..3cf07c27a4 100644
--- a/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-manager.tsx
+++ b/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-manager.tsx
@@ -69,6 +69,7 @@ export function PublicChatSnapshotsManager({
 				});
 			} catch (error) {
 				console.error("Failed to delete snapshot:", error);
+				toast.error("Failed to delete snapshot");
 			} finally {
 				setDeletingId(undefined);
 			}
@@ -80,30 +81,26 @@ export function PublicChatSnapshotsManager({
 	if (isLoading) {
 		return (
 			<div className="space-y-4 md:space-y-5">
-				{/* Info alert skeleton */}
-				<Skeleton className="h-12 w-full rounded-lg" />
+				<Alert>
+					<Info />
+					<AlertDescription>
+						<div className="flex min-h-[1.625em] items-center">
+							<Skeleton className="h-4 w-60 bg-accent-foreground/15" />
+						</div>
+					</AlertDescription>
+				</Alert>
 
 				{/* Cards grid skeleton */}
 				<div className="grid gap-3 grid-cols-1 sm:grid-cols-2">
 					{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
-						<Card key={key} className="border-border/60">
-							<CardContent className="p-4 flex flex-col gap-3">
-								{/* Header: Title */}
-								<div className="flex items-start justify-between gap-2">
-									<Skeleton className="h-4 w-36 md:w-44" />
-								</div>
-								{/* Message count badge */}
-								<div className="flex items-center gap-1.5">
-									<Skeleton className="h-5 w-24 rounded-full" />
-								</div>
-								{/* URL skeleton */}
-								<Skeleton className="h-3 w-full rounded" />
-								{/* Footer: Date + Creator */}
-								<div className="flex items-center gap-2 pt-2 border-t border-border/40">
-									<Skeleton className="h-3 w-20" />
-									<Skeleton className="h-4 w-4 rounded-full" />
-									<Skeleton className="h-3 w-16" />
-								</div>
+						<Card
+							key={key}
+							className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full"
+						>
+							<CardContent className="p-4 flex flex-col gap-3 h-full min-h-32">
+								<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+								<Skeleton className="h-3 w-full bg-accent" />
+								<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
 							</CardContent>
 						</Card>
 					))}
@@ -127,8 +124,8 @@ export function PublicChatSnapshotsManager({
 	// Permission denied
 	if (!canView) {
 		return (
-			<Alert variant="destructive">
-				<Info className="h-4 w-4" />
+			<Alert>
+				<Info />
 				<AlertDescription>
 					You don't have permission to view public chat links in this search space.
 				</AlertDescription>
@@ -140,9 +137,9 @@ export function PublicChatSnapshotsManager({
 
 	return (
 		<div className="space-y-4 md:space-y-5">
-			<Alert className="bg-muted/50 py-3 md:py-4">
-				<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-				<AlertDescription className="text-xs md:text-sm">
+			<Alert>
+				<Info />
+				<AlertDescription>
 					Public chat links allow anyone with the URL to view a snapshot of a chat. These links do
 					not update when the original chat changes.
 				</AlertDescription>
diff --git a/surfsense_web/components/public-chat/public-chat-footer.tsx b/surfsense_web/components/public-chat/public-chat-footer.tsx
index e341a9a0c9..7d32633417 100644
--- a/surfsense_web/components/public-chat/public-chat-footer.tsx
+++ b/surfsense_web/components/public-chat/public-chat-footer.tsx
@@ -68,7 +68,7 @@ export function PublicChatFooter({ shareToken }: PublicChatFooterProps) {
 				size="lg"
 				onClick={handleCopyAndContinue}
 				disabled={isCloning}
-				className="gap-2 rounded-full px-6 shadow-lg transition-all duration-200 hover:scale-[1.02] hover:shadow-xl hover:brightness-110 hover:bg-primary"
+				className="gap-2 rounded-full px-6 shadow-lg transition-al select-none duration-200 hover:scale-[1.02] hover:shadow-xl hover:brightness-110 hover:bg-primary"
 			>
 				{isCloning ? <Spinner size="sm" /> : <Copy className="size-4" />}
 				Copy and continue this chat
diff --git a/surfsense_web/components/public-chat/public-chat-view.tsx b/surfsense_web/components/public-chat/public-chat-view.tsx
index f8dd6db5aa..356174280c 100644
--- a/surfsense_web/components/public-chat/public-chat-view.tsx
+++ b/surfsense_web/components/public-chat/public-chat-view.tsx
@@ -1,10 +1,11 @@
 "use client";
 
 import { AssistantRuntimeProvider } from "@assistant-ui/react";
-import { ThinkingStepsDataUI } from "@/components/assistant-ui/thinking-steps";
+import { StepSeparatorDataUI } from "@/components/assistant-ui/step-separator";
 import { Navbar } from "@/components/homepage/navbar";
 import { ReportPanel } from "@/components/report-panel/report-panel";
 import { Spinner } from "@/components/ui/spinner";
+import { TimelineDataUI } from "@/features/chat-messages/timeline";
 import { usePublicChat } from "@/hooks/use-public-chat";
 import { usePublicChatRuntime } from "@/hooks/use-public-chat-runtime";
 import { PublicChatFooter } from "./public-chat-footer";
@@ -40,7 +41,8 @@ export function PublicChatView({ shareToken }: PublicChatViewProps) {
 		<main className="min-h-screen bg-main-panel text-foreground overflow-x-hidden">
 			<Navbar scrolledBgClassName={navbarScrolledBg} />
 			<AssistantRuntimeProvider runtime={runtime}>
-				<ThinkingStepsDataUI />
+				<TimelineDataUI />
+				<StepSeparatorDataUI />
 				<div className="flex h-screen pt-16 overflow-hidden">
 					<div className="flex-1 flex flex-col min-w-0 overflow-hidden">
 						<PublicThread footer={<PublicChatFooter shareToken={shareToken} />} />
diff --git a/surfsense_web/components/public-chat/public-thread.tsx b/surfsense_web/components/public-chat/public-thread.tsx
index 1caaba2995..d35193cbe8 100644
--- a/surfsense_web/components/public-chat/public-thread.tsx
+++ b/surfsense_web/components/public-chat/public-thread.tsx
@@ -5,6 +5,7 @@ import {
 	AuiIf,
 	MessagePrimitive,
 	ThreadPrimitive,
+	type ToolCallMessagePartComponent,
 	useAuiState,
 } from "@assistant-ui/react";
 import { CheckIcon, CopyIcon } from "lucide-react";
@@ -13,11 +14,12 @@ import Image from "next/image";
 import { type FC, type ReactNode, useState } from "react";
 import { CitationMetadataProvider } from "@/components/assistant-ui/citation-metadata-context";
 import { MarkdownText } from "@/components/assistant-ui/markdown-text";
-import { ToolFallback } from "@/components/assistant-ui/tool-fallback";
+import { ReasoningMessagePart } from "@/components/assistant-ui/reasoning-message-part";
 import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
 import { GenerateImageToolUI } from "@/components/tool-ui/generate-image";
 import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast";
 import { GenerateReportToolUI } from "@/components/tool-ui/generate-report";
+import { GenerateResumeToolUI } from "@/components/tool-ui/generate-resume";
 
 const GenerateVideoPresentationToolUI = dynamic(
 	() =>
@@ -27,6 +29,8 @@ const GenerateVideoPresentationToolUI = dynamic(
 	{ ssr: false }
 );
 
+const NullToolUi: ToolCallMessagePartComponent = () => null;
+
 interface PublicThreadProps {
 	footer?: ReactNode;
 }
@@ -43,20 +47,21 @@ export const PublicThread: FC<PublicThreadProps> = ({ footer }) => {
 				["--thread-max-width" as string]: "44rem",
 			}}
 		>
-			<ThreadPrimitive.Viewport className="aui-thread-viewport relative flex flex-1 min-h-0 flex-col overflow-y-auto px-4 pt-4">
+			<ThreadPrimitive.Viewport
+				scrollToBottomOnInitialize
+				scrollToBottomOnThreadSwitch
+				className="aui-thread-viewport relative flex flex-1 min-h-0 flex-col overflow-y-auto px-4 pt-4 pb-6"
+			>
 				<ThreadPrimitive.Messages
 					components={{
 						UserMessage: PublicUserMessage,
 						AssistantMessage: PublicAssistantMessage,
 					}}
 				/>
-
-				{/* Spacer to ensure footer doesn't overlap last message */}
-				<div className="h-24" />
 			</ThreadPrimitive.Viewport>
 
 			{footer && (
-				<div className="sticky bottom-0 z-20 border-t bg-main-panel/95 backdrop-blur supports-backdrop-filter:bg-main-panel/60">
+				<div className="border-t bg-main-panel/95 backdrop-blur supports-backdrop-filter:bg-main-panel/60">
 					{footer}
 				</div>
 			)}
@@ -102,7 +107,7 @@ const UserAvatar: FC<AuthorMetadata & { hasError: boolean; onError: () => void }
 	}
 
 	return (
-		<div className="flex size-8 items-center justify-center rounded-full bg-primary/10 text-xs font-medium text-primary">
+		<div className="flex size-8 items-center justify-center rounded-full bg-muted text-xs font-medium text-foreground">
 			{initials}
 		</div>
 	);
@@ -119,7 +124,7 @@ const PublicUserMessage: FC = () => {
 		>
 			<div className="aui-user-message-content-wrapper col-start-2 min-w-0 flex items-end gap-2">
 				<div className="flex-1 min-w-0">
-					<div className="aui-user-message-content wrap-break-word rounded-2xl bg-muted px-4 py-2.5 text-foreground">
+					<div className="aui-user-message-content wrap-break-word rounded-xl bg-muted px-4 py-2.5 text-foreground">
 						<MessagePrimitive.Parts />
 					</div>
 				</div>
@@ -156,19 +161,17 @@ const PublicAssistantMessage: FC = () => {
 					<MessagePrimitive.Parts
 						components={{
 							Text: MarkdownText,
+							Reasoning: ReasoningMessagePart,
 							tools: {
 								by_name: {
 									generate_podcast: GeneratePodcastToolUI,
 									generate_report: GenerateReportToolUI,
+									generate_resume: GenerateResumeToolUI,
 									generate_video_presentation: GenerateVideoPresentationToolUI,
 									display_image: GenerateImageToolUI,
 									generate_image: GenerateImageToolUI,
-									web_search: () => null,
-									link_preview: () => null,
-									multi_link_preview: () => null,
-									scrape_webpage: () => null,
 								},
-								Fallback: ToolFallback,
+								Fallback: NullToolUi,
 							},
 						}}
 					/>
diff --git a/surfsense_web/components/report-panel/pdf-viewer.tsx b/surfsense_web/components/report-panel/pdf-viewer.tsx
new file mode 100644
index 0000000000..77d0f83a69
--- /dev/null
+++ b/surfsense_web/components/report-panel/pdf-viewer.tsx
@@ -0,0 +1,360 @@
+"use client";
+
+import { ZoomInIcon, ZoomOutIcon } from "lucide-react";
+import type { PDFDocumentProxy, RenderTask } from "pdfjs-dist";
+import * as pdfjsLib from "pdfjs-dist";
+import { type ReactNode, useCallback, useEffect, useRef, useState } from "react";
+import { Button } from "@/components/ui/button";
+import { Spinner } from "@/components/ui/spinner";
+import { getAuthHeaders } from "@/lib/auth-utils";
+
+pdfjsLib.GlobalWorkerOptions.workerSrc = new URL(
+	"pdfjs-dist/build/pdf.worker.min.mjs",
+	import.meta.url
+).toString();
+
+interface PdfViewerProps {
+	pdfUrl: string;
+	isPublic?: boolean;
+	/** Extra actions rendered on the right side of the zoom toolbar (e.g. download, version switcher) */
+	toolbarActions?: ReactNode;
+}
+
+interface PageDimensions {
+	width: number;
+	height: number;
+}
+
+const ZOOM_STEP = 0.15;
+const MIN_ZOOM = 0.5;
+const MAX_ZOOM = 3;
+const PAGE_GAP = 12;
+const SCROLL_DEBOUNCE_MS = 30;
+const BUFFER_PAGES = 1;
+
+export function PdfViewer({ pdfUrl, isPublic = false, toolbarActions }: PdfViewerProps) {
+	const [numPages, setNumPages] = useState(0);
+	const [scale, setScale] = useState(1);
+	const [loading, setLoading] = useState(true);
+	const [loadError, setLoadError] = useState<string | null>(null);
+
+	const scrollContainerRef = useRef<HTMLDivElement>(null);
+	const pdfDocRef = useRef<PDFDocumentProxy | null>(null);
+	const canvasRefs = useRef<Map<number, HTMLCanvasElement>>(new Map());
+	const renderTasksRef = useRef<Map<number, RenderTask>>(new Map());
+	const renderedScalesRef = useRef<Map<number, number>>(new Map());
+	const pageDimsRef = useRef<PageDimensions[]>([]);
+	const visiblePagesRef = useRef<Set<number>>(new Set());
+	const scrollTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+
+	const getScaledHeight = useCallback(
+		(pageIndex: number) => {
+			const dims = pageDimsRef.current[pageIndex];
+			return dims ? Math.floor(dims.height * scale) : 0;
+		},
+		[scale]
+	);
+
+	const getVisibleRange = useCallback(() => {
+		const container = scrollContainerRef.current;
+		if (!container || pageDimsRef.current.length === 0) return { first: 1, last: 1 };
+
+		const scrollTop = container.scrollTop;
+		const viewportHeight = container.clientHeight;
+		const scrollBottom = scrollTop + viewportHeight;
+
+		let cumTop = 16;
+		let first = 1;
+		let last = pageDimsRef.current.length;
+
+		for (let i = 0; i < pageDimsRef.current.length; i++) {
+			const pageHeight = getScaledHeight(i);
+			const pageBottom = cumTop + pageHeight;
+
+			if (pageBottom >= scrollTop && first === 1) {
+				first = i + 1;
+			}
+			if (cumTop > scrollBottom) {
+				last = i;
+				break;
+			}
+
+			cumTop = pageBottom + PAGE_GAP;
+		}
+
+		first = Math.max(1, first - BUFFER_PAGES);
+		last = Math.min(pageDimsRef.current.length, last + BUFFER_PAGES);
+
+		return { first, last };
+	}, [getScaledHeight]);
+
+	const renderPage = useCallback(async (pageNum: number, currentScale: number) => {
+		const pdf = pdfDocRef.current;
+		const canvas = canvasRefs.current.get(pageNum);
+		if (!pdf || !canvas) return;
+
+		if (renderedScalesRef.current.get(pageNum) === currentScale) return;
+
+		const existing = renderTasksRef.current.get(pageNum);
+		if (existing) {
+			existing.cancel();
+			renderTasksRef.current.delete(pageNum);
+		}
+
+		try {
+			const page = await pdf.getPage(pageNum);
+			const viewport = page.getViewport({ scale: currentScale });
+			const dpr = window.devicePixelRatio || 1;
+
+			canvas.width = Math.floor(viewport.width * dpr);
+			canvas.height = Math.floor(viewport.height * dpr);
+			canvas.style.width = `${Math.floor(viewport.width)}px`;
+			canvas.style.height = `${Math.floor(viewport.height)}px`;
+
+			const renderTask = page.render({
+				canvas,
+				viewport,
+				transform: dpr !== 1 ? [dpr, 0, 0, dpr, 0, 0] : undefined,
+			});
+
+			renderTasksRef.current.set(pageNum, renderTask);
+
+			await renderTask.promise;
+			renderTasksRef.current.delete(pageNum);
+			renderedScalesRef.current.set(pageNum, currentScale);
+			page.cleanup();
+		} catch (err: unknown) {
+			if (err instanceof Error && err.message?.includes("cancelled")) return;
+			console.error(`Failed to render page ${pageNum}:`, err);
+		}
+	}, []);
+
+	const cleanupPage = useCallback((pageNum: number) => {
+		const existing = renderTasksRef.current.get(pageNum);
+		if (existing) {
+			existing.cancel();
+			renderTasksRef.current.delete(pageNum);
+		}
+
+		const canvas = canvasRefs.current.get(pageNum);
+		if (canvas) {
+			const ctx = canvas.getContext("2d");
+			if (ctx) ctx.clearRect(0, 0, canvas.width, canvas.height);
+			canvas.width = 0;
+			canvas.height = 0;
+		}
+
+		renderedScalesRef.current.delete(pageNum);
+	}, []);
+
+	const renderVisiblePages = useCallback(() => {
+		if (!pdfDocRef.current || pageDimsRef.current.length === 0) return;
+
+		const { first, last } = getVisibleRange();
+		const newVisible = new Set<number>();
+
+		for (let i = first; i <= last; i++) {
+			newVisible.add(i);
+			renderPage(i, scale);
+		}
+
+		for (const pageNum of visiblePagesRef.current) {
+			if (!newVisible.has(pageNum)) {
+				cleanupPage(pageNum);
+			}
+		}
+
+		visiblePagesRef.current = newVisible;
+	}, [getVisibleRange, renderPage, cleanupPage, scale]);
+
+	useEffect(() => {
+		let cancelled = false;
+
+		const loadDocument = async () => {
+			setLoading(true);
+			setLoadError(null);
+			setNumPages(0);
+			pageDimsRef.current = [];
+
+			try {
+				const loadingTask = pdfjsLib.getDocument({
+					url: pdfUrl,
+					httpHeaders: getAuthHeaders(),
+				});
+
+				const pdf = await loadingTask.promise;
+				if (cancelled) {
+					pdf.destroy();
+					return;
+				}
+
+				const dims: PageDimensions[] = [];
+				for (let i = 1; i <= pdf.numPages; i++) {
+					const page = await pdf.getPage(i);
+					const viewport = page.getViewport({ scale: 1 });
+					dims.push({ width: viewport.width, height: viewport.height });
+					page.cleanup();
+				}
+
+				if (cancelled) {
+					pdf.destroy();
+					return;
+				}
+
+				pdfDocRef.current = pdf;
+				pageDimsRef.current = dims;
+				setNumPages(pdf.numPages);
+				setLoading(false);
+			} catch (err: unknown) {
+				if (cancelled) return;
+				const message = err instanceof Error ? err.message : "Failed to load PDF";
+				setLoadError(message);
+				setLoading(false);
+			}
+		};
+
+		loadDocument();
+
+		return () => {
+			cancelled = true;
+			for (const task of renderTasksRef.current.values()) {
+				task.cancel();
+			}
+			renderTasksRef.current.clear();
+			renderedScalesRef.current.clear();
+			visiblePagesRef.current.clear();
+			pdfDocRef.current?.destroy();
+			pdfDocRef.current = null;
+		};
+	}, [pdfUrl]);
+
+	useEffect(() => {
+		if (numPages === 0) return;
+
+		renderedScalesRef.current.clear();
+		visiblePagesRef.current.clear();
+
+		const frame = requestAnimationFrame(() => {
+			renderVisiblePages();
+		});
+
+		return () => cancelAnimationFrame(frame);
+	}, [numPages, renderVisiblePages]);
+
+	useEffect(() => {
+		const container = scrollContainerRef.current;
+		if (!container || numPages === 0) return;
+
+		const handleScroll = () => {
+			if (scrollTimerRef.current) clearTimeout(scrollTimerRef.current);
+			scrollTimerRef.current = setTimeout(() => {
+				renderVisiblePages();
+			}, SCROLL_DEBOUNCE_MS);
+		};
+
+		container.addEventListener("scroll", handleScroll, { passive: true });
+		return () => {
+			container.removeEventListener("scroll", handleScroll);
+			if (scrollTimerRef.current) clearTimeout(scrollTimerRef.current);
+		};
+	}, [numPages, renderVisiblePages]);
+
+	const setCanvasRef = useCallback((pageNum: number, el: HTMLCanvasElement | null) => {
+		if (el) {
+			canvasRefs.current.set(pageNum, el);
+		} else {
+			canvasRefs.current.delete(pageNum);
+		}
+	}, []);
+
+	const zoomIn = useCallback(() => {
+		setScale((prev) => Math.min(MAX_ZOOM, +(prev + ZOOM_STEP).toFixed(2)));
+	}, []);
+
+	const zoomOut = useCallback(() => {
+		setScale((prev) => Math.max(MIN_ZOOM, +(prev - ZOOM_STEP).toFixed(2)));
+	}, []);
+
+	if (loadError) {
+		return (
+			<div className="flex flex-col items-center justify-center h-full gap-3 p-6 text-center">
+				<p className="font-medium text-foreground">Failed to load PDF</p>
+				<p className="text-sm text-muted-foreground">{loadError}</p>
+			</div>
+		);
+	}
+
+	return (
+		<div className="flex flex-col h-full">
+			{numPages > 0 && (
+				<div
+					className={`flex items-center px-4 py-2 border-b shrink-0 select-none ${isPublic ? "bg-main-panel" : "bg-sidebar"}`}
+				>
+					<div className="flex-1" aria-hidden="true" />
+					<div className="flex items-center justify-center gap-2">
+						<Button
+							variant="ghost"
+							size="icon"
+							onClick={zoomOut}
+							disabled={scale <= MIN_ZOOM}
+							className="size-7"
+						>
+							<ZoomOutIcon className="size-4" />
+						</Button>
+						<span className="text-xs text-muted-foreground tabular-nums min-w-[40px] text-center">
+							{Math.round(scale * 100)}%
+						</span>
+						<Button
+							variant="ghost"
+							size="icon"
+							onClick={zoomIn}
+							disabled={scale >= MAX_ZOOM}
+							className="size-7"
+						>
+							<ZoomInIcon className="size-4" />
+						</Button>
+					</div>
+					<div className="flex flex-1 items-center justify-end gap-1">{toolbarActions}</div>
+				</div>
+			)}
+
+			<div
+				ref={scrollContainerRef}
+				className={`relative flex-1 overflow-auto ${isPublic ? "bg-main-panel" : "bg-sidebar"}`}
+			>
+				{loading ? (
+					<div
+						className={`absolute inset-0 flex items-center justify-center ${isPublic ? "text-foreground" : "text-sidebar-foreground"}`}
+					>
+						<Spinner size="md" />
+					</div>
+				) : (
+					<div className="flex flex-col items-center py-4" style={{ gap: `${PAGE_GAP}px` }}>
+						{pageDimsRef.current.map((dims, i) => {
+							const pageNum = i + 1;
+							const scaledWidth = Math.floor(dims.width * scale);
+							const scaledHeight = Math.floor(dims.height * scale);
+							return (
+								<div
+									key={pageNum}
+									className="relative shrink-0"
+									style={{ width: scaledWidth, height: scaledHeight }}
+								>
+									<canvas
+										ref={(el) => setCanvasRef(pageNum, el)}
+										className="shadow-lg absolute inset-0"
+									/>
+									{numPages > 1 && (
+										<span className="absolute bottom-2 right-3 text-[10px] tabular-nums text-white/80 bg-black/50 px-1.5 py-0.5 rounded pointer-events-none">
+											Page {pageNum}/{numPages}
+										</span>
+									)}
+								</div>
+							);
+						})}
+					</div>
+				)}
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/components/report-panel/report-panel.tsx b/surfsense_web/components/report-panel/report-panel.tsx
index c602a3f04c..e3b8b23544 100644
--- a/surfsense_web/components/report-panel/report-panel.tsx
+++ b/surfsense_web/components/report-panel/report-panel.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useAtomValue, useSetAtom } from "jotai";
-import { ChevronDownIcon, XIcon } from "lucide-react";
+import { Check, ChevronDownIcon, Copy, Download, Pencil, XIcon } from "lucide-react";
 import dynamic from "next/dynamic";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { toast } from "sonner";
@@ -18,14 +18,45 @@ import {
 	DropdownMenuItem,
 	DropdownMenuTrigger,
 } from "@/components/ui/dropdown-menu";
-import { Skeleton } from "@/components/ui/skeleton";
+import { Spinner } from "@/components/ui/spinner";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { baseApiService } from "@/lib/apis/base-api.service";
 import { authenticatedFetch } from "@/lib/auth-utils";
 
+function ReportPanelSkeleton() {
+	return (
+		<div className="space-y-6 p-6">
+			<div className="h-6 w-3/4 rounded-md bg-muted/60 animate-pulse" />
+			<div className="space-y-2.5">
+				<div className="h-3 w-full rounded-md bg-muted/60 animate-pulse" />
+				<div className="h-3 w-[95%] rounded-md bg-muted/60 animate-pulse [animation-delay:100ms]" />
+				<div className="h-3 w-[88%] rounded-md bg-muted/60 animate-pulse [animation-delay:200ms]" />
+				<div className="h-3 w-[60%] rounded-md bg-muted/60 animate-pulse [animation-delay:300ms]" />
+			</div>
+			<div className="h-5 w-2/5 rounded-md bg-muted/60 animate-pulse [animation-delay:400ms]" />
+			<div className="space-y-2.5">
+				<div className="h-3 w-full rounded-md bg-muted/60 animate-pulse [animation-delay:500ms]" />
+				<div className="h-3 w-[92%] rounded-md bg-muted/60 animate-pulse [animation-delay:600ms]" />
+				<div className="h-3 w-[97%] rounded-md bg-muted/60 animate-pulse [animation-delay:700ms]" />
+			</div>
+			<div className="h-5 w-1/3 rounded-md bg-muted/60 animate-pulse [animation-delay:800ms]" />
+			<div className="space-y-2.5">
+				<div className="h-3 w-[90%] rounded-md bg-muted/60 animate-pulse [animation-delay:900ms]" />
+				<div className="h-3 w-full rounded-md bg-muted/60 animate-pulse [animation-delay:1000ms]" />
+				<div className="h-3 w-[75%] rounded-md bg-muted/60 animate-pulse [animation-delay:1100ms]" />
+			</div>
+		</div>
+	);
+}
+
 const PlateEditor = dynamic(
 	() => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })),
-	{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
+	{ ssr: false, loading: () => <ReportPanelSkeleton /> }
+);
+
+const PdfViewer = dynamic(
+	() => import("@/components/report-panel/pdf-viewer").then((m) => ({ default: m.PdfViewer })),
+	{ ssr: false, loading: () => <ReportPanelSkeleton /> }
 );
 
 /**
@@ -43,6 +74,7 @@ const ReportContentResponseSchema = z.object({
 	id: z.number(),
 	title: z.string(),
 	content: z.string().nullish(),
+	content_type: z.string().default("markdown"),
 	report_metadata: z
 		.object({
 			status: z.enum(["ready", "failed"]).nullish(),
@@ -59,46 +91,6 @@ const ReportContentResponseSchema = z.object({
 type ReportContentResponse = z.infer<typeof ReportContentResponseSchema>;
 type VersionInfo = z.infer<typeof VersionInfoSchema>;
 
-/**
- * Shimmer loading skeleton for report panel
- */
-function ReportPanelSkeleton() {
-	return (
-		<div className="space-y-6 p-6">
-			{/* Title skeleton */}
-			<div className="h-6 w-3/4 rounded-md bg-muted/60 animate-pulse" />
-
-			{/* Paragraph 1 */}
-			<div className="space-y-2.5">
-				<div className="h-3 w-full rounded-md bg-muted/60 animate-pulse" />
-				<div className="h-3 w-[95%] rounded-md bg-muted/60 animate-pulse [animation-delay:100ms]" />
-				<div className="h-3 w-[88%] rounded-md bg-muted/60 animate-pulse [animation-delay:200ms]" />
-				<div className="h-3 w-[60%] rounded-md bg-muted/60 animate-pulse [animation-delay:300ms]" />
-			</div>
-
-			{/* Heading */}
-			<div className="h-5 w-2/5 rounded-md bg-muted/60 animate-pulse [animation-delay:400ms]" />
-
-			{/* Paragraph 2 */}
-			<div className="space-y-2.5">
-				<div className="h-3 w-full rounded-md bg-muted/60 animate-pulse [animation-delay:500ms]" />
-				<div className="h-3 w-[92%] rounded-md bg-muted/60 animate-pulse [animation-delay:600ms]" />
-				<div className="h-3 w-[97%] rounded-md bg-muted/60 animate-pulse [animation-delay:700ms]" />
-			</div>
-
-			{/* Heading */}
-			<div className="h-5 w-1/3 rounded-md bg-muted/60 animate-pulse [animation-delay:800ms]" />
-
-			{/* Paragraph 3 */}
-			<div className="space-y-2.5">
-				<div className="h-3 w-[90%] rounded-md bg-muted/60 animate-pulse [animation-delay:900ms]" />
-				<div className="h-3 w-full rounded-md bg-muted/60 animate-pulse [animation-delay:1000ms]" />
-				<div className="h-3 w-[75%] rounded-md bg-muted/60 animate-pulse [animation-delay:1100ms]" />
-			</div>
-		</div>
-	);
-}
-
 /**
  * Inner content component used by desktop panel, mobile drawer, and the layout right panel
  */
@@ -124,6 +116,7 @@ export function ReportPanelContent({
 	const [exporting, setExporting] = useState<string | null>(null);
 	const [saving, setSaving] = useState(false);
 	const copyTimerRef = useRef<ReturnType<typeof setTimeout> | undefined>(undefined);
+	const changeCountRef = useRef(0);
 
 	useEffect(() => {
 		return () => {
@@ -133,6 +126,7 @@ export function ReportPanelContent({
 
 	// Editor state — tracks the latest markdown from the Plate editor
 	const [editedMarkdown, setEditedMarkdown] = useState<string | null>(null);
+	const [isEditing, setIsEditing] = useState(false);
 
 	// Read-only when public (shareToken) OR shared (SEARCH_SPACE visibility)
 	const currentThreadState = useAtomValue(currentThreadAtom);
@@ -196,8 +190,22 @@ export function ReportPanelContent({
 	// Reset edited markdown when switching versions or reports
 	useEffect(() => {
 		setEditedMarkdown(null);
+		setIsEditing(false);
+		changeCountRef.current = 0;
 	}, [activeReportId]);
 
+	const handleReportMarkdownChange = useCallback(
+		(nextMarkdown: string) => {
+			if (!isEditing) return;
+			changeCountRef.current += 1;
+			// Plate may emit an initial normalize/serialize change on mount.
+			if (changeCountRef.current <= 1) return;
+			const savedMarkdown = reportContent?.content ?? "";
+			setEditedMarkdown(nextMarkdown === savedMarkdown ? null : nextMarkdown);
+		},
+		[isEditing, reportContent?.content]
+	);
+
 	// Copy markdown content (uses latest editor content)
 	const handleCopy = useCallback(async () => {
 		if (!currentMarkdown) return;
@@ -265,7 +273,7 @@ export function ReportPanelContent({
 
 	// Save edited report content
 	const handleSave = useCallback(async () => {
-		if (!currentMarkdown || !activeReportId) return;
+		if (!currentMarkdown || !activeReportId) return false;
 		setSaving(true);
 		try {
 			const response = await authenticatedFetch(
@@ -286,123 +294,247 @@ export function ReportPanelContent({
 			setReportContent((prev) => (prev ? { ...prev, content: currentMarkdown } : prev));
 			setEditedMarkdown(null);
 			toast.success("Report saved successfully");
+			return true;
 		} catch (err) {
 			console.error("Error saving report:", err);
 			toast.error(err instanceof Error ? err.message : "Failed to save report");
+			return false;
 		} finally {
 			setSaving(false);
 		}
 	}, [activeReportId, currentMarkdown]);
 
 	const activeVersionIndex = versions.findIndex((v) => v.id === activeReportId);
+	const isPublic = !!shareToken;
+	const isResume = reportContent?.content_type === "typst";
+	const showReportEditingTier = !isResume;
+	const hasUnsavedChanges = editedMarkdown !== null;
+	const showDesktopHeader = !!onClose;
 
-	return (
-		<>
-			{/* Action bar — always visible; buttons are disabled while loading */}
-			<div className="flex items-center justify-between px-4 py-2 shrink-0">
-				<div className="flex items-center gap-2">
-					{/* Copy button */}
+	const handleCancelEditing = useCallback(() => {
+		setEditedMarkdown(null);
+		changeCountRef.current = 0;
+		setIsEditing(false);
+	}, []);
+
+	const exportButton =
+		!isEditing &&
+		(isResume ? (
+			<Button
+				variant="ghost"
+				size="icon"
+				className="size-6"
+				onClick={() => handleExport("pdf")}
+				disabled={isLoading || !reportContent?.content || exporting !== null}
+			>
+				{exporting === "pdf" ? <Spinner size="xs" /> : <Download className="size-3.5" />}
+				<span className="sr-only">Download report</span>
+			</Button>
+		) : (
+			<DropdownMenu modal={insideDrawer ? false : undefined}>
+				<DropdownMenuTrigger asChild>
 					<Button
-						variant="outline"
-						size="sm"
-						onClick={handleCopy}
+						variant="ghost"
+						size="icon"
+						className="size-6"
 						disabled={isLoading || !reportContent?.content}
-						className="h-8 min-w-[80px] px-3.5 py-4 text-[15px] bg-sidebar select-none"
 					>
-						{copied ? "Copied" : "Copy"}
+						<Download className="size-3.5" />
+						<span className="sr-only">Export report</span>
 					</Button>
+				</DropdownMenuTrigger>
+				<DropdownMenuContent
+					align="end"
+					className={`min-w-[200px] select-none${insideDrawer ? " z-[100]" : ""}`}
+				>
+					<ExportDropdownItems
+						onExport={handleExport}
+						exporting={exporting}
+						showAllFormats={!shareToken}
+					/>
+				</DropdownMenuContent>
+			</DropdownMenu>
+		));
+
+	const versionSwitcher = !isEditing && versions.length > 1 && (
+		<DropdownMenu modal={insideDrawer ? false : undefined}>
+			<DropdownMenuTrigger asChild>
+				<Button variant="ghost" size="sm" className="h-6 gap-1 px-1.5 text-xs">
+					v{activeVersionIndex + 1}
+					<ChevronDownIcon className="size-3" />
+				</Button>
+			</DropdownMenuTrigger>
+			<DropdownMenuContent
+				align="end"
+				className={`min-w-[120px] select-none${insideDrawer ? " z-[100]" : ""}`}
+			>
+				{versions.map((v, i) => (
+					<DropdownMenuItem
+						key={v.id}
+						onClick={() => setActiveReportId(v.id)}
+						className={v.id === activeReportId ? "bg-accent font-medium" : ""}
+					>
+						Version {i + 1}
+					</DropdownMenuItem>
+				))}
+			</DropdownMenuContent>
+		</DropdownMenu>
+	);
+
+	const copyButton = !isEditing && showReportEditingTier && (
+		<Button
+			variant="ghost"
+			size="icon"
+			className="size-6"
+			onClick={() => {
+				void handleCopy();
+			}}
+			disabled={isLoading || !reportContent?.content}
+		>
+			{copied ? <Check className="size-3.5" /> : <Copy className="size-3.5" />}
+			<span className="sr-only">{copied ? "Copied report content" : "Copy report content"}</span>
+		</Button>
+	);
+
+	const editingActions =
+		showReportEditingTier &&
+		!isReadOnly &&
+		(isEditing ? (
+			<>
+				<Button
+					variant="ghost"
+					size="sm"
+					className="h-6 px-2 text-xs"
+					onClick={handleCancelEditing}
+					disabled={saving}
+				>
+					Cancel
+				</Button>
+				<Button
+					variant="secondary"
+					size="sm"
+					className="relative h-6 w-[56px] px-0 text-xs"
+					onClick={async () => {
+						const saveSucceeded = await handleSave();
+						if (saveSucceeded) setIsEditing(false);
+					}}
+					disabled={saving || !hasUnsavedChanges}
+				>
+					<span className={saving ? "opacity-0" : ""}>Save</span>
+					{saving && <Spinner size="xs" className="absolute" />}
+				</Button>
+			</>
+		) : (
+			<Button
+				variant="ghost"
+				size="icon"
+				className="size-6"
+				onClick={() => {
+					setEditedMarkdown(null);
+					changeCountRef.current = 0;
+					setIsEditing(true);
+				}}
+			>
+				<Pencil className="size-3.5" />
+				<span className="sr-only">Edit report</span>
+			</Button>
+		));
 
-					{/* Export dropdown */}
-					<DropdownMenu modal={insideDrawer ? false : undefined}>
-						<DropdownMenuTrigger asChild>
+	return (
+		<>
+			{showDesktopHeader ? (
+				<>
+					{/* Header — matches the Documents panel header pattern */}
+					<div className="shrink-0 flex h-12 items-center justify-between px-3 border-b">
+						<h2 className="select-none text-lg font-semibold">{isResume ? "Resume" : "Report"}</h2>
+						{onClose && (
 							<Button
-								variant="outline"
-								size="sm"
-								disabled={isLoading || !reportContent?.content}
-								className="h-8 px-3.5 py-4 text-[15px] gap-1.5 bg-sidebar select-none"
+								variant="ghost"
+								size="icon"
+								onClick={onClose}
+								className="h-8 w-8 rounded-full shrink-0 text-muted-foreground hover:text-accent-foreground"
 							>
-								Export
-								<ChevronDownIcon className="size-3" />
+								<XIcon className="h-4 w-4" />
+								<span className="sr-only">Close report panel</span>
 							</Button>
-						</DropdownMenuTrigger>
-						<DropdownMenuContent
-							align="start"
-							className={`min-w-[200px] select-none${insideDrawer ? " z-[100]" : ""}`}
-						>
-							<ExportDropdownItems
-								onExport={handleExport}
-								exporting={exporting}
-								showAllFormats={!shareToken}
-							/>
-						</DropdownMenuContent>
-					</DropdownMenu>
-
-					{/* Version switcher — only shown when multiple versions exist */}
-					{versions.length > 1 && (
-						<DropdownMenu modal={insideDrawer ? false : undefined}>
-							<DropdownMenuTrigger asChild>
-								<Button
-									variant="outline"
-									size="sm"
-									className="h-8 px-3.5 py-4 text-[15px] gap-1.5 bg-sidebar select-none"
-								>
-									v{activeVersionIndex + 1}
-									<ChevronDownIcon className="size-3" />
-								</Button>
-							</DropdownMenuTrigger>
-							<DropdownMenuContent
-								align="start"
-								className={`min-w-[120px] select-none${insideDrawer ? " z-[100]" : ""}`}
-							>
-								{versions.map((v, i) => (
-									<DropdownMenuItem
-										key={v.id}
-										onClick={() => setActiveReportId(v.id)}
-										className={v.id === activeReportId ? "bg-accent font-medium" : ""}
-									>
-										Version {i + 1}
-									</DropdownMenuItem>
-								))}
-							</DropdownMenuContent>
-						</DropdownMenu>
+						)}
+					</div>
+
+					{!isResume && (
+						<div className="flex h-10 items-center justify-between gap-2 border-b px-4 shrink-0">
+							<div className="min-w-0 flex-1">
+								<p className="truncate text-sm text-muted-foreground">
+									{reportContent?.title || title}
+								</p>
+							</div>
+							<div className="flex items-center gap-1 shrink-0">
+								{versionSwitcher}
+								{exportButton}
+								{copyButton}
+								{editingActions}
+							</div>
+						</div>
 					)}
-				</div>
-				{onClose && (
-					<Button variant="ghost" size="icon" onClick={onClose} className="size-7 shrink-0">
-						<XIcon className="size-4" />
-						<span className="sr-only">Close report panel</span>
-					</Button>
-				)}
-			</div>
+				</>
+			) : (
+				!isResume && (
+					<div className="flex h-14 items-center justify-between border-b px-4 shrink-0">
+						<div className="flex-1 min-w-0">
+							<h2 className="text-sm font-semibold truncate">{reportContent?.title || title}</h2>
+						</div>
+						<div className="flex items-center gap-1 shrink-0">
+							{versionSwitcher}
+							{exportButton}
+							{copyButton}
+							{editingActions}
+						</div>
+					</div>
+				)
+			)}
 
 			{/* Report content — skeleton/error/viewer/editor shown only in this area */}
 			<div className="flex-1 overflow-hidden">
 				{isLoading ? (
 					<ReportPanelSkeleton />
 				) : error || !reportContent ? (
-					<div className="flex flex-1 flex-col items-center justify-center gap-3 p-6 text-center">
+					<div className="flex flex-1 flex-col items-center justify-center gap-3 p-6 text-center select-none">
 						<div>
 							<p className="font-medium text-foreground">Failed to load report</p>
 							<p className="text-sm text-red-500 mt-1">{error || "An unknown error occurred"}</p>
 						</div>
 					</div>
+				) : reportContent.content_type === "typst" ? (
+					<PdfViewer
+						pdfUrl={`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}${shareToken ? `/api/v1/public/${shareToken}/reports/${activeReportId}/preview` : `/api/v1/reports/${activeReportId}/preview`}`}
+						isPublic={isPublic}
+						toolbarActions={
+							<>
+								{versionSwitcher}
+								{exportButton}
+							</>
+						}
+					/>
 				) : reportContent.content ? (
 					isReadOnly ? (
 						<div className="h-full overflow-y-auto px-5 py-4">
-							<MarkdownViewer content={reportContent.content} />
+							<MarkdownViewer content={reportContent.content} enableCitations />
 						</div>
 					) : (
 						<PlateEditor
+							key={`report-${activeReportId}-${isEditing ? "editing" : "viewing"}`}
 							preset="full"
 							markdown={reportContent.content}
-							onMarkdownChange={setEditedMarkdown}
-							readOnly={false}
+							onMarkdownChange={handleReportMarkdownChange}
+							readOnly={!isEditing}
 							placeholder="Report content..."
 							editorVariant="default"
-							onSave={handleSave}
-							hasUnsavedChanges={editedMarkdown !== null}
-							isSaving={saving}
+							allowModeToggle={false}
+							reserveToolbarSpace={isEditing}
+							defaultEditing={isEditing}
 							className="[&_[role=toolbar]]:!bg-sidebar"
+							// Show citation badges in view mode; raw `[citation:N]`
+							// text in edit mode so users can edit/delete tokens.
+							enableCitations={!isEditing}
 						/>
 					)
 				) : (
@@ -436,10 +568,12 @@ function DesktopReportPanel() {
 
 	if (!panelState.isOpen || !panelState.reportId) return null;
 
+	const isPublic = !!panelState.shareToken;
+
 	return (
 		<div
 			ref={panelRef}
-			className="flex w-[50%] max-w-[700px] min-w-[380px] flex-col border-l bg-sidebar text-sidebar-foreground animate-in slide-in-from-right-4 duration-300 ease-out"
+			className={`flex w-[50%] max-w-[700px] min-w-[380px] flex-col border-l animate-in slide-in-from-right-4 duration-300 ease-out ${isPublic ? "bg-main-panel text-foreground" : "bg-sidebar text-sidebar-foreground"}`}
 		>
 			<ReportPanelContent
 				reportId={panelState.reportId}
@@ -460,6 +594,8 @@ function MobileReportDrawer() {
 
 	if (!panelState.reportId) return null;
 
+	const isPublic = !!panelState.shareToken;
+
 	return (
 		<Drawer
 			open={panelState.isOpen}
@@ -469,7 +605,7 @@ function MobileReportDrawer() {
 			shouldScaleBackground={false}
 		>
 			<DrawerContent
-				className="h-[90vh] max-h-[90vh] z-80 bg-sidebar overflow-hidden"
+				className={`h-[90vh] max-h-[90vh] z-80 overflow-hidden ${isPublic ? "bg-main-panel" : "bg-sidebar"}`}
 				overlayClassName="z-80"
 			>
 				<DrawerHandle />
diff --git a/surfsense_web/components/search-space-form.tsx b/surfsense_web/components/search-space-form.tsx
index bcea48e3da..e87b365046 100644
--- a/surfsense_web/components/search-space-form.tsx
+++ b/surfsense_web/components/search-space-form.tsx
@@ -131,7 +131,7 @@ export function SearchSpaceForm({
 				>
 					<MoveLeftIcon
 						size={18}
-						className="text-muted-foreground group-hover:text-foreground transition-colors duration-200"
+						className="text-muted-foreground group-hover:text-accent-foreground transition-colors duration-200"
 					/>
 					<div className="absolute inset-0 rounded-full bg-gradient-to-r from-blue-500/10 to-purple-500/10 opacity-0 group-hover:opacity-100 transition-opacity duration-300" />
 				</Button>
diff --git a/surfsense_web/components/seo/breadcrumb-nav.tsx b/surfsense_web/components/seo/breadcrumb-nav.tsx
new file mode 100644
index 0000000000..824a695a52
--- /dev/null
+++ b/surfsense_web/components/seo/breadcrumb-nav.tsx
@@ -0,0 +1,47 @@
+import { ChevronRight } from "lucide-react";
+import Link from "next/link";
+import { BreadcrumbJsonLd } from "./json-ld";
+
+interface BreadcrumbItem {
+	name: string;
+	href: string;
+}
+
+interface BreadcrumbNavProps {
+	items: BreadcrumbItem[];
+	className?: string;
+}
+
+export function BreadcrumbNav({ items, className }: BreadcrumbNavProps) {
+	const jsonLdItems = items.map((item) => ({
+		name: item.name,
+		url: `https://www.surfsense.com${item.href}`,
+	}));
+
+	return (
+		<>
+			<BreadcrumbJsonLd items={jsonLdItems} />
+			<nav aria-label="Breadcrumb" className={className}>
+				<ol className="flex items-center gap-1.5 text-sm text-muted-foreground">
+					{items.map((item, index) => {
+						const isLast = index === items.length - 1;
+						return (
+							<li key={item.href} className="flex items-center gap-1.5">
+								{index > 0 && <ChevronRight className="h-3.5 w-3.5 shrink-0" aria-hidden />}
+								{isLast ? (
+									<span className="font-medium text-foreground" aria-current="page">
+										{item.name}
+									</span>
+								) : (
+									<Link href={item.href} className="transition-colors hover:text-foreground">
+										{item.name}
+									</Link>
+								)}
+							</li>
+						);
+					})}
+				</ol>
+			</nav>
+		</>
+	);
+}
diff --git a/surfsense_web/components/seo/json-ld.tsx b/surfsense_web/components/seo/json-ld.tsx
new file mode 100644
index 0000000000..03f3293c32
--- /dev/null
+++ b/surfsense_web/components/seo/json-ld.tsx
@@ -0,0 +1,176 @@
+interface JsonLdProps {
+	data: Record<string, unknown>;
+}
+
+export function JsonLd({ data }: JsonLdProps) {
+	return (
+		// biome-ignore lint/security/noDangerouslySetInnerHtml: JSON-LD structured data requires dangerouslySetInnerHTML for script injection
+		<script type="application/ld+json" dangerouslySetInnerHTML={{ __html: JSON.stringify(data) }} />
+	);
+}
+
+export function OrganizationJsonLd() {
+	return (
+		<JsonLd
+			data={{
+				"@context": "https://schema.org",
+				"@type": "Organization",
+				name: "SurfSense",
+				url: "https://www.surfsense.com",
+				logo: "https://www.surfsense.com/logo.png",
+				description:
+					"Open source NotebookLM alternative for teams with no data limits. Use ChatGPT, Claude AI, and any AI model for free.",
+				sameAs: ["https://github.com/MODSetter/SurfSense", "https://discord.gg/Cg2M4GUJ"],
+				contactPoint: {
+					"@type": "ContactPoint",
+					email: "rohan@surfsense.com",
+					contactType: "sales",
+				},
+			}}
+		/>
+	);
+}
+
+export function WebSiteJsonLd() {
+	return (
+		<JsonLd
+			data={{
+				"@context": "https://schema.org",
+				"@type": "WebSite",
+				name: "SurfSense",
+				url: "https://www.surfsense.com",
+				description:
+					"Open source NotebookLM alternative for teams with no data limits. Free ChatGPT, Claude AI, and any AI model.",
+				potentialAction: {
+					"@type": "SearchAction",
+					target: {
+						"@type": "EntryPoint",
+						urlTemplate: "https://www.surfsense.com/docs?search={search_term_string}",
+					},
+					"query-input": "required name=search_term_string",
+				},
+			}}
+		/>
+	);
+}
+
+export function SoftwareApplicationJsonLd() {
+	return (
+		<JsonLd
+			data={{
+				"@context": "https://schema.org",
+				"@type": "SoftwareApplication",
+				name: "SurfSense",
+				applicationCategory: "BusinessApplication",
+				operatingSystem: "Windows, macOS, Linux, Web",
+				offers: {
+					"@type": "Offer",
+					price: "0",
+					priceCurrency: "USD",
+					description: "Free plan with 500 pages included",
+				},
+				description:
+					"Open source NotebookLM alternative with free access to ChatGPT, Claude AI, and any model. Connect Slack, Google Drive, Notion, Confluence, GitHub, and dozens more data sources.",
+				url: "https://www.surfsense.com",
+				downloadUrl: "https://github.com/MODSetter/SurfSense/releases",
+				featureList: [
+					"Free access to ChatGPT, Claude AI, and any AI model",
+					"AI-powered semantic search across all connected tools",
+					"Federated search across Slack, Google Drive, Notion, Confluence, GitHub",
+					"No data limits with open source self-hosting",
+					"Real-time collaborative team chats",
+					"Document Q&A with citations",
+					"Report generation",
+					"Podcast and video generation from sources",
+					"Enterprise knowledge management",
+					"Self-hostable and privacy-focused",
+				],
+			}}
+		/>
+	);
+}
+
+export function ArticleJsonLd({
+	title,
+	description,
+	url,
+	datePublished,
+	dateModified,
+	author,
+	image,
+}: {
+	title: string;
+	description: string;
+	url: string;
+	datePublished: string;
+	dateModified?: string;
+	author: string;
+	image?: string;
+}) {
+	return (
+		<JsonLd
+			data={{
+				"@context": "https://schema.org",
+				"@type": "Article",
+				headline: title,
+				description,
+				url,
+				datePublished,
+				...(dateModified ? { dateModified } : {}),
+				author: {
+					"@type": "Organization",
+					name: author,
+				},
+				publisher: {
+					"@type": "Organization",
+					name: "SurfSense",
+					logo: {
+						"@type": "ImageObject",
+						url: "https://www.surfsense.com/logo.png",
+					},
+				},
+				image: image || "https://www.surfsense.com/og-image.png",
+				mainEntityOfPage: {
+					"@type": "WebPage",
+					"@id": url,
+				},
+			}}
+		/>
+	);
+}
+
+export function BreadcrumbJsonLd({ items }: { items: { name: string; url: string }[] }) {
+	return (
+		<JsonLd
+			data={{
+				"@context": "https://schema.org",
+				"@type": "BreadcrumbList",
+				itemListElement: items.map((item, index) => ({
+					"@type": "ListItem",
+					position: index + 1,
+					name: item.name,
+					item: item.url,
+				})),
+			}}
+		/>
+	);
+}
+
+export function FAQJsonLd({ questions }: { questions: { question: string; answer: string }[] }) {
+	return (
+		<JsonLd
+			data={{
+				"@context": "https://schema.org",
+				"@type": "FAQPage",
+				mainEntity: questions.map((q) => ({
+					"@type": "Question",
+					name: q.question,
+					acceptedAnswer: {
+						"@type": "Answer",
+						text: q.answer,
+					},
+				})),
+			}}
+		/>
+	);
+}
diff --git a/surfsense_web/components/settings/model-config-manager.tsx b/surfsense_web/components/settings/agent-model-manager.tsx
similarity index 62%
rename from surfsense_web/components/settings/model-config-manager.tsx
rename to surfsense_web/components/settings/agent-model-manager.tsx
index 0b89379beb..b0e13d3d74 100644
--- a/surfsense_web/components/settings/model-config-manager.tsx
+++ b/surfsense_web/components/settings/agent-model-manager.tsx
@@ -1,17 +1,7 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import {
-	AlertCircle,
-	Dot,
-	Edit3,
-	FileText,
-	Info,
-	MessageSquareQuote,
-	RefreshCw,
-	Trash2,
-	Wand2,
-} from "lucide-react";
+import { AlertCircle, Dot, FileText, Info, Pencil, RefreshCw, Trash2 } from "lucide-react";
 import { useMemo, useState } from "react";
 import { membersAtom, myAccessAtom } from "@/atoms/members/members-query.atoms";
 import { deleteNewLLMConfigMutationAtom } from "@/atoms/new-llm-config/new-llm-config-mutation.atoms";
@@ -35,6 +25,7 @@ import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar";
 import { Badge } from "@/components/ui/badge";
 import { Button } from "@/components/ui/button";
 import { Card, CardContent } from "@/components/ui/card";
+import { Separator } from "@/components/ui/separator";
 import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip";
@@ -43,7 +34,7 @@ import { useMediaQuery } from "@/hooks/use-media-query";
 import { getProviderIcon } from "@/lib/provider-icons";
 import { cn } from "@/lib/utils";
 
-interface ModelConfigManagerProps {
+interface AgentModelManagerProps {
 	searchSpaceId: number;
 }
 
@@ -55,7 +46,7 @@ function getInitials(name: string): string {
 	return name.slice(0, 2).toUpperCase();
 }
 
-export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
+export function AgentModelManager({ searchSpaceId }: AgentModelManagerProps) {
 	const isDesktop = useMediaQuery("(min-width: 768px)");
 	// Mutations
 	const { mutateAsync: deleteConfig, isPending: isDeleting } = useAtomValue(
@@ -140,7 +131,7 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 					<Button
 						variant="outline"
 						onClick={openNewDialog}
-						className="gap-2 bg-white text-black hover:bg-neutral-100 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+						className="gap-2 border-transparent bg-white text-[#1f1f1f] font-medium hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-transparent dark:bg-white dark:text-[#1f1f1f]"
 					>
 						Add Model
 					</Button>
@@ -162,42 +153,52 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 			{/* Read-only / Limited permissions notice */}
 			{access && !isLoading && isReadOnly && (
 				<div>
-					<Alert className="bg-muted/50 py-3 md:py-4">
-						<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-						<AlertDescription className="text-xs md:text-sm">
-							You have <span className="font-medium">read-only</span> access to LLM configurations.
-							Contact a space owner to request additional permissions.
+					<Alert>
+						<Info />
+						<AlertDescription>
+							<p>
+								You have <span className="font-medium">read-only</span> access to LLM
+								configurations. Contact a space owner to request additional permissions.
+							</p>
 						</AlertDescription>
 					</Alert>
 				</div>
 			)}
 			{access && !isLoading && !isReadOnly && (!canCreate || !canUpdate || !canDelete) && (
 				<div>
-					<Alert className="bg-muted/50 py-3 md:py-4">
-						<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-						<AlertDescription className="text-xs md:text-sm">
-							You can{" "}
-							{[canCreate && "create", canUpdate && "edit", canDelete && "delete"]
-								.filter(Boolean)
-								.join(" and ")}{" "}
-							configurations
-							{!canDelete && ", but cannot delete them"}.
+					<Alert>
+						<Info />
+						<AlertDescription>
+							<p>
+								You can{" "}
+								{[canCreate && "create", canUpdate && "edit", canDelete && "delete"]
+									.filter(Boolean)
+									.join(" and ")}{" "}
+								configurations
+								{!canDelete && ", but cannot delete them"}.
+							</p>
 						</AlertDescription>
 					</Alert>
 				</div>
 			)}
 
 			{/* Global Configs Info */}
-			{globalConfigs.length > 0 && (
-				<Alert className="bg-muted/50 py-3">
-					<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-					<AlertDescription className="text-xs md:text-sm">
-						<p>
-							<span className="font-medium">
-								{globalConfigs.length} global {globalConfigs.length === 1 ? "model" : "models"}
-							</span>{" "}
-							available from your administrator. Use the model selector to view and select them.
-						</p>
+			{(isLoading || globalConfigs.length > 0) && (
+				<Alert>
+					<Info />
+					<AlertDescription>
+						{isLoading ? (
+							<div className="flex min-h-[1.625em] items-center">
+								<Skeleton className="h-4 w-60 bg-accent-foreground/15" />
+							</div>
+						) : (
+							<p>
+								<span className="font-medium">
+									{globalConfigs.length} global {globalConfigs.length === 1 ? "model" : "models"}
+								</span>{" "}
+								available from your administrator.
+							</p>
+						)}
 					</AlertDescription>
 				</Alert>
 			)}
@@ -206,31 +207,11 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 			{isLoading && (
 				<div className="grid gap-3 grid-cols-1 sm:grid-cols-2 xl:grid-cols-3">
 					{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
-						<Card key={key} className="border-border/60">
-							<CardContent className="p-4 flex flex-col gap-3">
-								{/* Header */}
-								<div className="flex items-start justify-between gap-2">
-									<div className="space-y-1.5 flex-1 min-w-0">
-										<Skeleton className="h-4 w-28 md:w-32" />
-										<Skeleton className="h-3 w-40 md:w-48" />
-									</div>
-								</div>
-								{/* Provider + Model */}
-								<div className="flex items-center gap-2">
-									<Skeleton className="h-5 w-16 rounded-full" />
-									<Skeleton className="h-5 w-24 rounded-md" />
-								</div>
-								{/* Feature badges */}
-								<div className="flex items-center gap-1.5">
-									<Skeleton className="h-5 w-20 rounded-full" />
-									<Skeleton className="h-5 w-16 rounded-full" />
-								</div>
-								{/* Footer */}
-								<div className="flex items-center gap-2 pt-2 border-t border-border/40">
-									<Skeleton className="h-3 w-20" />
-									<Skeleton className="h-4 w-4 rounded-full" />
-									<Skeleton className="h-3 w-16" />
-								</div>
+						<Card key={key} className="border-accent bg-accent/20">
+							<CardContent className="p-4 flex flex-col gap-3 min-h-32">
+								<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+								<Skeleton className="h-3 w-full bg-accent" />
+								<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
 							</CardContent>
 						</Card>
 					))}
@@ -260,22 +241,27 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 
 								return (
 									<div key={config.id}>
-										<Card className="group relative overflow-hidden transition-all duration-200 border-border/60 hover:shadow-md h-full">
+										<Card className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full">
 											<CardContent className="p-4 flex flex-col gap-3 h-full">
-												{/* Header: Name + Actions */}
-												<div className="flex items-start justify-between gap-2">
-													<div className="min-w-0 flex-1">
-														<h4 className="text-sm font-semibold tracking-tight truncate">
-															{config.name}
-														</h4>
-														{config.description && (
-															<p className="text-[11px] text-muted-foreground/70 truncate mt-0.5">
-																{config.description}
-															</p>
-														)}
+												{/* Header: Icon + Name + Actions */}
+												<div className="flex items-center justify-between gap-2">
+													<div className="flex items-center gap-2.5 min-w-0 flex-1">
+														<div className="shrink-0">
+															{getProviderIcon(config.provider, { className: "size-4" })}
+														</div>
+														<div className="min-w-0 flex-1">
+															<h4 className="text-sm font-semibold tracking-tight truncate">
+																{config.name}
+															</h4>
+															{config.description && (
+																<p className="text-[11px] text-muted-foreground/70 truncate mt-0.5">
+																	{config.description}
+																</p>
+															)}
+														</div>
 													</div>
 													{(canUpdate || canDelete) && (
-														<div className="flex items-center gap-0.5 shrink-0 sm:opacity-0 sm:group-hover:opacity-100 transition-opacity duration-150">
+														<div className="flex items-center gap-1 shrink-0 sm:w-0 sm:overflow-hidden sm:group-hover:w-auto sm:opacity-0 sm:group-hover:opacity-100 transition-all duration-150">
 															{canUpdate && (
 																<TooltipProvider>
 																	<Tooltip open={isDesktop ? undefined : false}>
@@ -284,9 +270,9 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 																				variant="ghost"
 																				size="icon"
 																				onClick={() => openEditDialog(config)}
-																				className="h-7 w-7 text-muted-foreground hover:text-foreground"
+																				className="h-7 w-7 rounded-lg text-muted-foreground hover:text-accent-foreground"
 																			>
-																				<Edit3 className="h-3 w-3" />
+																				<Pencil className="h-3 w-3" />
 																			</Button>
 																		</TooltipTrigger>
 																		<TooltipContent>Edit</TooltipContent>
@@ -301,7 +287,7 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 																				variant="ghost"
 																				size="icon"
 																				onClick={() => setConfigToDelete(config)}
-																				className="h-7 w-7 text-muted-foreground hover:text-destructive"
+																				className="h-7 w-7 rounded-lg text-muted-foreground hover:text-destructive"
 																			>
 																				<Trash2 className="h-3 w-3" />
 																			</Button>
@@ -314,30 +300,21 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 													)}
 												</div>
 
-												{/* Provider + Model */}
-												<div className="flex items-center gap-2 flex-wrap">
-													{getProviderIcon(config.provider, { className: "size-3.5 shrink-0" })}
-													<code className="text-[11px] font-mono text-muted-foreground bg-muted/60 px-2 py-0.5 rounded-md truncate max-w-[160px]">
-														{config.model_name}
-													</code>
-												</div>
-
 												{/* Feature badges */}
 												<div className="flex items-center gap-1.5 flex-wrap">
 													{config.citations_enabled && (
 														<Badge
-															variant="outline"
-															className="text-[10px] px-1.5 py-0.5 border-emerald-500/30 text-emerald-700 dark:text-emerald-300 bg-emerald-500/5"
+															variant="secondary"
+															className="text-[10px] px-1.5 py-0.5 border-0 text-muted-foreground bg-muted"
 														>
-															<MessageSquareQuote className="h-2.5 w-2.5 mr-1" />
 															Citations
 														</Badge>
 													)}
 													{!config.use_default_system_instructions &&
 														config.system_instructions && (
 															<Badge
-																variant="outline"
-																className="text-[10px] px-1.5 py-0.5 border-blue-500/30 text-blue-700 dark:text-blue-300 bg-blue-500/5"
+																variant="secondary"
+																className="text-[10px] px-1.5 py-0.5 border-0 text-muted-foreground bg-muted"
 															>
 																<FileText className="h-2.5 w-2.5 mr-1" />
 																Custom
@@ -346,41 +323,44 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
 												</div>
 
 												{/* Footer: Date + Creator */}
-												<div className="flex items-center gap-2 pt-2 border-t border-border/40 mt-auto">
-													<span className="text-[11px] text-muted-foreground/60">
-														{new Date(config.created_at).toLocaleDateString(undefined, {
-															year: "numeric",
-															month: "short",
-															day: "numeric",
-														})}
-													</span>
-													{member && (
-														<>
-															<Dot className="h-4 w-4 text-muted-foreground/30" />
-															<TooltipProvider>
-																<Tooltip open={isDesktop ? undefined : false}>
-																	<TooltipTrigger asChild>
-																		<div className="flex items-center gap-1.5 cursor-default">
-																			<Avatar className="size-4.5 shrink-0">
-																				{member.avatarUrl && (
-																					<AvatarImage src={member.avatarUrl} alt={member.name} />
-																				)}
-																				<AvatarFallback className="text-[9px]">
-																					{getInitials(member.name)}
-																				</AvatarFallback>
-																			</Avatar>
-																			<span className="text-[11px] text-muted-foreground/60 truncate max-w-[120px]">
-																				{member.name}
-																			</span>
-																		</div>
-																	</TooltipTrigger>
-																	<TooltipContent side="bottom">
-																		{member.email || member.name}
-																	</TooltipContent>
-																</Tooltip>
-															</TooltipProvider>
-														</>
-													)}
+												<div className="mt-auto space-y-2">
+													<Separator className="bg-accent" />
+													<div className="flex items-center">
+														<span className="shrink-0 text-[11px] text-muted-foreground/60 whitespace-nowrap">
+															{new Date(config.created_at).toLocaleDateString(undefined, {
+																year: "numeric",
+																month: "short",
+																day: "numeric",
+															})}
+														</span>
+														{member && (
+															<>
+																<Dot className="h-4 w-4 text-muted-foreground/30 shrink-0" />
+																<TooltipProvider>
+																	<Tooltip open={isDesktop ? undefined : false}>
+																		<TooltipTrigger asChild>
+																			<div className="min-w-0 flex items-center gap-1.5 cursor-default">
+																				<Avatar className="size-4.5 shrink-0">
+																					{member.avatarUrl && (
+																						<AvatarImage src={member.avatarUrl} alt={member.name} />
+																					)}
+																					<AvatarFallback className="text-[9px]">
+																						{getInitials(member.name)}
+																					</AvatarFallback>
+																				</Avatar>
+																				<span className="text-[11px] text-muted-foreground/60 truncate">
+																					{member.name}
+																				</span>
+																			</div>
+																		</TooltipTrigger>
+																		<TooltipContent side="bottom">
+																			{member.email || member.name}
+																		</TooltipContent>
+																	</Tooltip>
+																</TooltipProvider>
+															</>
+														)}
+													</div>
 												</div>
 											</CardContent>
 										</Card>
diff --git a/surfsense_web/components/settings/buy-pages-content.tsx b/surfsense_web/components/settings/buy-pages-content.tsx
index 5e2c6f9dbc..82b8d8e2ab 100644
--- a/surfsense_web/components/settings/buy-pages-content.tsx
+++ b/surfsense_web/components/settings/buy-pages-content.tsx
@@ -72,44 +72,49 @@ export function BuyPagesContent() {
 			<div className="space-y-3">
 				{/* Stepper */}
 				<div className="flex items-center justify-center gap-3">
-					<button
+					<Button
 						type="button"
+						variant="ghost"
+						size="icon"
 						onClick={() => setQuantity((q) => Math.max(1, q - 1))}
 						disabled={quantity <= 1 || purchaseMutation.isPending}
-						className="flex h-8 w-8 items-center justify-center rounded-md border transition-colors hover:bg-muted disabled:opacity-40"
+						className="size-8 text-muted-foreground shadow-none transition-colors hover:bg-muted hover:text-white disabled:opacity-40"
 					>
 						<Minus className="h-3.5 w-3.5" />
-					</button>
+					</Button>
 					<span className="min-w-28 text-center text-lg font-semibold tabular-nums">
 						{totalPages.toLocaleString()}
 					</span>
-					<button
+					<Button
 						type="button"
+						variant="ghost"
+						size="icon"
 						onClick={() => setQuantity((q) => Math.min(100, q + 1))}
 						disabled={quantity >= 100 || purchaseMutation.isPending}
-						className="flex h-8 w-8 items-center justify-center rounded-md border transition-colors hover:bg-muted disabled:opacity-40"
+						className="size-8 text-muted-foreground shadow-none transition-colors hover:bg-muted hover:text-white disabled:opacity-40"
 					>
 						<Plus className="h-3.5 w-3.5" />
-					</button>
+					</Button>
 				</div>
 
 				{/* Quick-pick presets */}
 				<div className="flex flex-wrap justify-center gap-1.5">
 					{PRESET_MULTIPLIERS.map((m) => (
-						<button
+						<Button
 							key={m}
 							type="button"
+							variant="ghost"
 							onClick={() => setQuantity(m)}
 							disabled={purchaseMutation.isPending}
 							className={cn(
-								"rounded-md border px-2.5 py-1 text-xs font-medium tabular-nums transition-colors disabled:opacity-60",
+								"h-auto rounded-md px-2.5 py-1 text-xs font-medium tabular-nums transition-colors disabled:opacity-60",
 								quantity === m
-									? "border-emerald-500 bg-emerald-500/10 text-emerald-600 dark:text-emerald-400"
-									: "border-border hover:border-emerald-500/40 hover:bg-muted/40"
+									? "bg-accent text-accent-foreground"
+									: "text-muted-foreground hover:bg-accent hover:text-accent-foreground"
 							)}
 						>
 							{(m * PAGE_PACK_SIZE).toLocaleString()}
-						</button>
+						</Button>
 					))}
 				</div>
 
@@ -121,7 +126,7 @@ export function BuyPagesContent() {
 				</div>
 
 				<Button
-					className="w-full bg-emerald-600 text-white hover:bg-emerald-700"
+					className="w-full"
 					disabled={purchaseMutation.isPending || !hasValidSearchSpace}
 					onClick={handleBuyNow}
 				>
diff --git a/surfsense_web/components/settings/buy-tokens-content.tsx b/surfsense_web/components/settings/buy-tokens-content.tsx
new file mode 100644
index 0000000000..4b0605f28a
--- /dev/null
+++ b/surfsense_web/components/settings/buy-tokens-content.tsx
@@ -0,0 +1,180 @@
+"use client";
+
+import { useQuery as useZeroQuery } from "@rocicorp/zero/react";
+import { useMutation, useQuery } from "@tanstack/react-query";
+import { Minus, Plus } from "lucide-react";
+import { useParams } from "next/navigation";
+import { useState } from "react";
+import { toast } from "sonner";
+import { Button } from "@/components/ui/button";
+import { Progress } from "@/components/ui/progress";
+import { Spinner } from "@/components/ui/spinner";
+import { stripeApiService } from "@/lib/apis/stripe-api.service";
+import { AppError } from "@/lib/error";
+import { cn } from "@/lib/utils";
+import { queries } from "@/zero/queries";
+
+// One pack = $1.00 of credit, stored as 1_000_000 micro-USD on the
+// backend. Premium turns are debited at the actual provider cost
+// reported by LiteLLM, so $1 of credit always buys $1 of provider
+// usage at cost.
+const CREDIT_PER_PACK_MICROS = 1_000_000;
+const PRICE_PER_PACK_USD = 1;
+const PRESET_MULTIPLIERS = [1, 2, 5, 10, 25, 50] as const;
+
+const formatUsd = (micros: number, options?: { compact?: boolean }) => {
+	const dollars = micros / 1_000_000;
+	if (options?.compact && dollars >= 1) return `$${dollars.toFixed(2)}`;
+	if (dollars >= 100) return `$${dollars.toFixed(0)}`;
+	if (dollars >= 1) return `$${dollars.toFixed(2)}`;
+	if (dollars > 0) return `$${dollars.toFixed(3)}`;
+	return "$0";
+};
+
+export function BuyTokensContent() {
+	const params = useParams();
+	const searchSpaceId = Number(params?.search_space_id);
+	const [quantity, setQuantity] = useState(1);
+
+	// Server config flag: stays on REST, not per-user.
+	const { data: tokenStatus } = useQuery({
+		queryKey: ["token-status"],
+		queryFn: () => stripeApiService.getTokenStatus(),
+	});
+
+	// Live per-user balance via Zero.
+	const [me] = useZeroQuery(queries.user.me({}));
+
+	const purchaseMutation = useMutation({
+		mutationFn: stripeApiService.createTokenCheckoutSession,
+		onSuccess: (response) => {
+			window.location.assign(response.checkout_url);
+		},
+		onError: (error) => {
+			if (error instanceof AppError && error.message) {
+				toast.error(error.message);
+				return;
+			}
+			toast.error("Failed to start checkout. Please try again.");
+		},
+	});
+
+	const totalCreditMicros = quantity * CREDIT_PER_PACK_MICROS;
+	const totalPrice = quantity * PRICE_PER_PACK_USD;
+
+	if (tokenStatus && !tokenStatus.token_buying_enabled) {
+		return (
+			<div className="w-full space-y-3 text-center">
+				<h2 className="text-xl font-bold tracking-tight">Buy Premium Credit</h2>
+				<p className="text-sm text-muted-foreground">
+					Credit purchases are temporarily unavailable.
+				</p>
+			</div>
+		);
+	}
+
+	const used = me?.premiumCreditMicrosUsed ?? 0;
+	const limit = me?.premiumCreditMicrosLimit ?? 0;
+	// Mirrors the backend formula in stripe_routes.py (max(0, limit - used)).
+	const remaining = Math.max(0, limit - used);
+	const usagePercentage = me ? Math.min((used / Math.max(limit, 1)) * 100, 100) : 0;
+
+	return (
+		<div className="w-full space-y-5">
+			<div className="text-center">
+				<h2 className="text-xl font-bold tracking-tight">Buy Premium Credit</h2>
+				<p className="mt-1 text-sm text-muted-foreground">
+					$1 buys $1 of credit, billed at provider cost
+				</p>
+			</div>
+
+			{me && (
+				<div className="rounded-lg border bg-muted/20 p-3 space-y-1.5">
+					<div className="flex justify-between items-center text-xs">
+						<span className="text-muted-foreground">
+							{formatUsd(used)} / {formatUsd(limit)} of credit
+						</span>
+						<span className="font-medium">{usagePercentage.toFixed(0)}%</span>
+					</div>
+					<Progress value={usagePercentage} className="h-1.5 [&>div]:bg-purple-500" />
+					<p className="text-[11px] text-muted-foreground">
+						{formatUsd(remaining)} of credit remaining
+					</p>
+				</div>
+			)}
+
+			<div className="space-y-3">
+				<div className="flex items-center justify-center gap-3">
+					<Button
+						type="button"
+						variant="ghost"
+						size="icon"
+						onClick={() => setQuantity((q) => Math.max(1, q - 1))}
+						disabled={quantity <= 1 || purchaseMutation.isPending}
+						className="size-8 text-muted-foreground shadow-none transition-colors hover:bg-muted hover:text-white disabled:opacity-40"
+					>
+						<Minus className="h-3.5 w-3.5" />
+					</Button>
+					<span className="min-w-32 text-center text-lg font-semibold tabular-nums">
+						${(totalCreditMicros / 1_000_000).toFixed(0)} of credit
+					</span>
+					<Button
+						type="button"
+						variant="ghost"
+						size="icon"
+						onClick={() => setQuantity((q) => Math.min(100, q + 1))}
+						disabled={quantity >= 100 || purchaseMutation.isPending}
+						className="size-8 text-muted-foreground shadow-none transition-colors hover:bg-muted hover:text-white disabled:opacity-40"
+					>
+						<Plus className="h-3.5 w-3.5" />
+					</Button>
+				</div>
+
+				<div className="flex flex-wrap justify-center gap-1.5">
+					{PRESET_MULTIPLIERS.map((m) => (
+						<Button
+							key={m}
+							type="button"
+							variant="ghost"
+							onClick={() => setQuantity(m)}
+							disabled={purchaseMutation.isPending}
+							className={cn(
+								"h-auto rounded-md px-2.5 py-1 text-xs font-medium tabular-nums transition-colors disabled:opacity-60",
+								quantity === m
+									? "bg-accent text-accent-foreground"
+									: "text-muted-foreground hover:bg-accent hover:text-accent-foreground"
+							)}
+						>
+							${m}
+						</Button>
+					))}
+				</div>
+
+				<div className="flex items-center justify-between rounded-lg border bg-muted/30 px-3 py-2">
+					<span className="text-sm font-medium tabular-nums">
+						${(totalCreditMicros / 1_000_000).toFixed(0)} of credit
+					</span>
+					<span className="text-sm font-semibold tabular-nums">${totalPrice}</span>
+				</div>
+
+				<Button
+					className="w-full"
+					disabled={purchaseMutation.isPending}
+					onClick={() => purchaseMutation.mutate({ quantity, search_space_id: searchSpaceId })}
+				>
+					{purchaseMutation.isPending ? (
+						<>
+							<Spinner size="xs" />
+							Redirecting
+						</>
+					) : (
+						<>
+							Buy ${(totalCreditMicros / 1_000_000).toFixed(0)} of credit for ${totalPrice}
+						</>
+					)}
+				</Button>
+				<p className="text-center text-[11px] text-muted-foreground">Secure checkout via Stripe</p>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/components/settings/general-settings-manager.tsx b/surfsense_web/components/settings/general-settings-manager.tsx
index e83525982d..44177e52da 100644
--- a/surfsense_web/components/settings/general-settings-manager.tsx
+++ b/surfsense_web/components/settings/general-settings-manager.tsx
@@ -2,18 +2,16 @@
 
 import { useQuery } from "@tanstack/react-query";
 import { useAtomValue } from "jotai";
-import { Info } from "lucide-react";
 import { useTranslations } from "next-intl";
-import { useEffect, useState } from "react";
+import { useCallback, useEffect, useState } from "react";
 import { toast } from "sonner";
 import { updateSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms";
-import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
-import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
 import { Skeleton } from "@/components/ui/skeleton";
 import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service";
+import { authenticatedFetch } from "@/lib/auth-utils";
 import { cacheKeys } from "@/lib/query-client/cache-keys";
 import { Spinner } from "../ui/spinner";
 
@@ -40,14 +38,48 @@ export function GeneralSettingsManager({ searchSpaceId }: GeneralSettingsManager
 	const [name, setName] = useState("");
 	const [description, setDescription] = useState("");
 	const [saving, setSaving] = useState(false);
+	const [isExporting, setIsExporting] = useState(false);
+	const hasSearchSpace = !!searchSpace;
+	const searchSpaceName = searchSpace?.name;
+	const searchSpaceDescription = searchSpace?.description;
+
+	const handleExportKB = useCallback(async () => {
+		if (isExporting) return;
+		setIsExporting(true);
+		try {
+			const response = await authenticatedFetch(
+				`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`,
+				{ method: "GET" }
+			);
+			if (!response.ok) {
+				const errorData = await response.json().catch(() => ({ detail: "Export failed" }));
+				throw new Error(errorData.detail || "Export failed");
+			}
+			const blob = await response.blob();
+			const url = URL.createObjectURL(blob);
+			const a = document.createElement("a");
+			a.href = url;
+			a.download = "knowledge-base.zip";
+			document.body.appendChild(a);
+			a.click();
+			document.body.removeChild(a);
+			URL.revokeObjectURL(url);
+			toast.success("Knowledge base exported");
+		} catch (err) {
+			console.error("KB export failed:", err);
+			toast.error(err instanceof Error ? err.message : "Export failed");
+		} finally {
+			setIsExporting(false);
+		}
+	}, [searchSpaceId, isExporting]);
 
 	// Initialize state from fetched search space
 	useEffect(() => {
-		if (searchSpace) {
-			setName(searchSpace.name || "");
-			setDescription(searchSpace.description || "");
+		if (hasSearchSpace) {
+			setName(searchSpaceName || "");
+			setDescription(searchSpaceDescription || "");
 		}
-	}, [searchSpace?.name, searchSpace?.description]);
+	}, [hasSearchSpace, searchSpaceName, searchSpaceDescription]);
 
 	// Derive hasChanges during render
 	const hasChanges =
@@ -67,9 +99,9 @@ export function GeneralSettingsManager({ searchSpaceId }: GeneralSettingsManager
 			});
 
 			await fetchSearchSpace();
-		} catch (error: any) {
+		} catch (error: unknown) {
 			console.error("Error saving search space details:", error);
-			toast.error(error.message || "Failed to save search space details");
+			toast.error(error instanceof Error ? error.message : "Failed to save search space details");
 		} finally {
 			setSaving(false);
 		}
@@ -83,16 +115,10 @@ export function GeneralSettingsManager({ searchSpaceId }: GeneralSettingsManager
 	if (loading) {
 		return (
 			<div className="space-y-4 md:space-y-6">
-				<Card>
-					<CardHeader className="px-3 md:px-6 pt-3 md:pt-6 pb-2 md:pb-3">
-						<Skeleton className="h-5 md:h-6 w-36 md:w-48" />
-						<Skeleton className="h-3 md:h-4 w-full max-w-md mt-2" />
-					</CardHeader>
-					<CardContent className="space-y-3 md:space-y-4 px-3 md:px-6 pb-3 md:pb-6">
-						<Skeleton className="h-10 md:h-12 w-full" />
-						<Skeleton className="h-10 md:h-12 w-full" />
-					</CardContent>
-				</Card>
+				<div className="flex flex-col gap-6">
+					<Skeleton className="h-10 md:h-12 w-full" />
+					<Skeleton className="h-10 md:h-12 w-full" />
+				</div>
 			</div>
 		);
 	}
@@ -110,75 +136,67 @@ export function GeneralSettingsManager({ searchSpaceId }: GeneralSettingsManager
 
 	return (
 		<div className="space-y-4 md:space-y-6">
-			<Alert className="bg-muted/50 py-3 md:py-4">
-				<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-				<AlertDescription className="text-xs md:text-sm">
-					Update your search space name and description. These details help identify and organize
-					your workspace.
-				</AlertDescription>
-			</Alert>
-
-			{/* Search Space Details Card */}
-			<form onSubmit={onSubmit} className="space-y-4 md:space-y-6">
-				<Card>
-					<CardHeader className="px-3 md:px-6 pt-3 md:pt-6 pb-2 md:pb-3">
-						<CardTitle className="text-base md:text-lg">Search Space Details</CardTitle>
-						<CardDescription className="text-xs md:text-sm">
-							Manage the basic information for this search space.
-						</CardDescription>
-					</CardHeader>
-					<CardContent className="space-y-4 md:space-y-5 px-3 md:px-6 pb-3 md:pb-6">
-						<div className="space-y-1.5 md:space-y-2">
-							<Label htmlFor="search-space-name" className="text-sm md:text-base font-medium">
-								{t("general_name_label")}
-							</Label>
-							<Input
-								id="search-space-name"
-								placeholder={t("general_name_placeholder")}
-								value={name}
-								onChange={(e) => setName(e.target.value)}
-								className="text-sm md:text-base h-9 md:h-10"
-							/>
-							<p className="text-[10px] md:text-xs text-muted-foreground">
-								{t("general_name_description")}
-							</p>
-						</div>
-
-						<div className="space-y-1.5 md:space-y-2">
-							<Label
-								htmlFor="search-space-description"
-								className="text-sm md:text-base font-medium"
-							>
-								{t("general_description_label")}{" "}
-								<span className="text-muted-foreground font-normal">({tCommon("optional")})</span>
-							</Label>
-							<Input
-								id="search-space-description"
-								placeholder={t("general_description_placeholder")}
-								value={description}
-								onChange={(e) => setDescription(e.target.value)}
-								className="text-sm md:text-base h-9 md:h-10"
-							/>
-							<p className="text-[10px] md:text-xs text-muted-foreground">
-								{t("general_description_description")}
-							</p>
-						</div>
-					</CardContent>
-				</Card>
-
-				{/* Action Buttons */}
-				<div className="flex justify-end pt-3 md:pt-4">
+			<form onSubmit={onSubmit} className="space-y-6">
+				<div className="flex flex-col gap-6">
+					<div className="space-y-2">
+						<Label htmlFor="search-space-name">{t("general_name_label")}</Label>
+						<Input
+							id="search-space-name"
+							maxLength={100}
+							placeholder={t("general_name_placeholder")}
+							value={name}
+							onChange={(e) => setName(e.target.value)}
+						/>
+						<p className="text-xs text-muted-foreground">{t("general_name_description")}</p>
+					</div>
+
+					<div className="space-y-2">
+						<Label htmlFor="search-space-description">
+							{t("general_description_label")}{" "}
+							<span className="text-muted-foreground font-normal">({tCommon("optional")})</span>
+						</Label>
+						<Input
+							id="search-space-description"
+							placeholder={t("general_description_placeholder")}
+							value={description}
+							onChange={(e) => setDescription(e.target.value)}
+						/>
+						<p className="text-xs text-muted-foreground">{t("general_description_description")}</p>
+					</div>
+				</div>
+
+				<div className="flex justify-end">
 					<Button
 						type="submit"
 						variant="outline"
 						disabled={!hasChanges || saving || !name.trim()}
-						className="gap-2 bg-white text-black hover:bg-neutral-100 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+						className="relative gap-2 bg-white text-black hover:bg-accent hover:text-accent-foreground dark:bg-white dark:text-black"
 					>
-						{saving ? <Spinner size="sm" /> : null}
-						{saving ? t("general_saving") : t("general_save")}
+						<span className={saving ? "opacity-0" : ""}>{t("general_save")}</span>
+						{saving && <Spinner size="sm" className="absolute" />}
 					</Button>
 				</div>
 			</form>
+
+			<div className="border-t pt-6 flex flex-col gap-3 md:flex-row md:items-center md:justify-between">
+				<div className="space-y-1">
+					<Label>Export knowledge base</Label>
+					<p className="text-xs text-muted-foreground">
+						Download all documents in this search space as a ZIP of markdown files.
+					</p>
+				</div>
+				<Button
+					type="button"
+					variant="secondary"
+					size="sm"
+					disabled={isExporting}
+					onClick={handleExportKB}
+					className="relative w-fit shrink-0"
+				>
+					<span className={isExporting ? "opacity-0" : ""}>Export</span>
+					{isExporting && <Spinner size="sm" className="absolute" />}
+				</Button>
+			</div>
 		</div>
 	);
 }
diff --git a/surfsense_web/components/settings/image-model-manager.tsx b/surfsense_web/components/settings/image-model-manager.tsx
index 55dfe3d310..494f7aae99 100644
--- a/surfsense_web/components/settings/image-model-manager.tsx
+++ b/surfsense_web/components/settings/image-model-manager.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import { AlertCircle, Dot, Edit3, Info, RefreshCw, Trash2, Wand2 } from "lucide-react";
+import { AlertCircle, Dot, Info, Pencil, RefreshCw, Trash2 } from "lucide-react";
 import { useMemo, useState } from "react";
 import { deleteImageGenConfigMutationAtom } from "@/atoms/image-gen-config/image-gen-config-mutation.atoms";
 import {
@@ -22,8 +22,10 @@ import {
 	AlertDialogTitle,
 } from "@/components/ui/alert-dialog";
 import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar";
+import { Badge } from "@/components/ui/badge";
 import { Button } from "@/components/ui/button";
 import { Card, CardContent } from "@/components/ui/card";
+import { Separator } from "@/components/ui/separator";
 import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip";
@@ -116,8 +118,8 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) {
 
 	return (
 		<div className="space-y-4 md:space-y-6">
-			{/* Header */}
-			<div className="flex flex-col space-y-4 sm:flex-row sm:items-center sm:justify-between sm:space-y-0">
+			{/* Header actions */}
+			<div className="flex items-center justify-between">
 				<Button
 					variant="secondary"
 					size="sm"
@@ -132,7 +134,7 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) {
 					<Button
 						variant="outline"
 						onClick={openNewDialog}
-						className="gap-2 bg-white text-black hover:bg-neutral-100 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+						className="gap-2 border-transparent bg-white text-[#1f1f1f] font-medium hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-transparent dark:bg-white dark:text-[#1f1f1f]"
 					>
 						Add Image Model
 					</Button>
@@ -152,78 +154,159 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) {
 			{/* Read-only / Limited permissions notice */}
 			{access && !isLoading && isReadOnly && (
 				<div>
-					<Alert className="bg-muted/50 py-3 md:py-4">
-						<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-						<AlertDescription className="text-xs md:text-sm">
-							You have <span className="font-medium">read-only</span> access to image generation
-							configurations. Contact a space owner to request additional permissions.
+					<Alert>
+						<Info />
+						<AlertDescription>
+							<p>
+								You have <span className="font-medium">read-only</span> access to image generation
+								configurations. Contact a space owner to request additional permissions.
+							</p>
 						</AlertDescription>
 					</Alert>
 				</div>
 			)}
 			{access && !isLoading && !isReadOnly && (!canCreate || !canDelete) && (
 				<div>
-					<Alert className="bg-muted/50 py-3 md:py-4">
-						<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-						<AlertDescription className="text-xs md:text-sm">
-							You can{" "}
-							{[canCreate && "create and edit", canDelete && "delete"]
-								.filter(Boolean)
-								.join(" and ")}{" "}
-							image model configurations
-							{!canDelete && ", but cannot delete them"}.
+					<Alert>
+						<Info />
+						<AlertDescription>
+							<p>
+								You can{" "}
+								{[canCreate && "create and edit", canDelete && "delete"]
+									.filter(Boolean)
+									.join(" and ")}{" "}
+								image model configurations
+								{!canDelete && ", but cannot delete them"}.
+							</p>
 						</AlertDescription>
 					</Alert>
 				</div>
 			)}
 
 			{/* Global info */}
-			{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length > 0 && (
-				<Alert className="bg-muted/50 py-3">
-					<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-					<AlertDescription className="text-xs md:text-sm">
-						<p>
-							<span className="font-medium">
-								{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length}{" "}
-								global image{" "}
-								{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length === 1
-									? "model"
-									: "models"}
-							</span>{" "}
-							available from your administrator. Use the model selector to view and select them.
-						</p>
+			{(isLoading ||
+				globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length > 0) && (
+				<Alert>
+					<Info />
+					<AlertDescription>
+						{isLoading ? (
+							<div className="flex min-h-[1.625em] items-center">
+								<Skeleton className="h-4 w-60 bg-accent-foreground/15" />
+							</div>
+						) : (
+							<p>
+								<span className="font-medium">
+									{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length}{" "}
+									global image{" "}
+									{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length ===
+									1
+										? "model"
+										: "models"}
+								</span>{" "}
+								available from your administrator. {(() => {
+									const nonAuto = globalConfigs.filter(
+										(g) => !("is_auto_mode" in g && g.is_auto_mode)
+									);
+									const premium = nonAuto.filter(
+										(g) =>
+											"billing_tier" in g &&
+											(g as { billing_tier?: string }).billing_tier === "premium"
+									).length;
+									const free = nonAuto.length - premium;
+									if (premium > 0 && free > 0) {
+										return `${premium} premium, ${free} free.`;
+									}
+									if (premium > 0) {
+										return `All ${premium} premium — debits your shared credit pool.`;
+									}
+									return `All ${free} free.`;
+								})()}
+							</p>
+						)}
 					</AlertDescription>
 				</Alert>
 			)}
 
+			{/* Global Image Models — read-only cards with per-model Free/Premium
+			    badges. Mirrors the badge palette used by the chat role selector
+			    (`llm-role-manager.tsx`) so the meaning is consistent across
+			    every model-configuration surface (chat / image / vision). */}
+			{!isLoading &&
+				globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length > 0 && (
+					<div className="space-y-3">
+						<div className="grid gap-3 grid-cols-1 sm:grid-cols-2 xl:grid-cols-3">
+							{globalConfigs
+								.filter((g) => !("is_auto_mode" in g && g.is_auto_mode))
+								.map((cfg) => {
+									const billingTier =
+										("billing_tier" in cfg &&
+											typeof (cfg as { billing_tier?: string }).billing_tier === "string" &&
+											(cfg as { billing_tier?: string }).billing_tier) ||
+										"free";
+									const isPremium = billingTier === "premium";
+									return (
+										<Card
+											key={cfg.id}
+											className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full"
+										>
+											<CardContent className="p-4 flex flex-col gap-3 h-full">
+												<div className="flex items-center gap-2 min-w-0">
+													<div className="shrink-0">
+														{getProviderIcon(cfg.provider, { className: "size-4" })}
+													</div>
+													<div className="min-w-0 flex-1 flex items-center gap-1.5">
+														<h4 className="text-sm font-semibold tracking-tight truncate">
+															{cfg.name}
+														</h4>
+														{isPremium ? (
+															<Badge
+																variant="secondary"
+																className="text-[8px] md:text-[9px] shrink-0 bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300 border-0"
+															>
+																Premium
+															</Badge>
+														) : (
+															<Badge
+																variant="secondary"
+																className="text-[8px] md:text-[9px] shrink-0 bg-emerald-100 text-emerald-700 dark:bg-emerald-900/50 dark:text-emerald-300 border-0"
+															>
+																Free
+															</Badge>
+														)}
+													</div>
+												</div>
+												{cfg.description && (
+													<p className="text-[11px] text-muted-foreground/70 line-clamp-2">
+														{cfg.description}
+													</p>
+												)}
+												<div className="mt-auto space-y-2">
+													<Separator className="bg-accent" />
+													<div className="flex items-center">
+														<span className="text-[11px] text-muted-foreground/60 truncate">
+															{cfg.model_name}
+														</span>
+													</div>
+												</div>
+											</CardContent>
+										</Card>
+									);
+								})}
+						</div>
+					</div>
+				)}
+
 			{/* Loading Skeleton */}
 			{isLoading && (
 				<div className="space-y-4 md:space-y-6">
 					<div className="space-y-4">
-						<div className="flex items-center justify-between">
-							<Skeleton className="h-6 md:h-7 w-40 md:w-48" />
-							<Skeleton className="h-8 md:h-9 w-32 md:w-36 rounded-md" />
-						</div>
-
 						<div className="grid gap-3 grid-cols-1 sm:grid-cols-2 xl:grid-cols-3">
 							{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
-								<Card key={key} className="border-border/60">
-									<CardContent className="p-4 flex flex-col gap-3">
-										<div className="flex items-start justify-between gap-2">
-											<div className="space-y-1.5 flex-1 min-w-0">
-												<Skeleton className="h-4 w-28 md:w-32" />
-												<Skeleton className="h-3 w-40 md:w-48" />
-											</div>
-										</div>
-										<div className="flex items-center gap-2">
-											<Skeleton className="h-5 w-16 rounded-full" />
-											<Skeleton className="h-5 w-24 rounded-md" />
-										</div>
-										<div className="flex items-center gap-2 pt-2 border-t border-border/40">
-											<Skeleton className="h-3 w-20" />
-											<Skeleton className="h-4 w-4 rounded-full" />
-											<Skeleton className="h-3 w-16" />
-										</div>
+								<Card key={key} className="border-accent bg-accent/20">
+									<CardContent className="p-4 flex flex-col gap-3 min-h-32">
+										<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+										<Skeleton className="h-3 w-full bg-accent" />
+										<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
 									</CardContent>
 								</Card>
 							))}
@@ -253,22 +336,27 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) {
 
 								return (
 									<div key={config.id}>
-										<Card className="group relative overflow-hidden transition-all duration-200 border-border/60 hover:shadow-md h-full">
+										<Card className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full">
 											<CardContent className="p-4 flex flex-col gap-3 h-full">
-												{/* Header: Name + Actions */}
-												<div className="flex items-start justify-between gap-2">
-													<div className="min-w-0 flex-1">
-														<h4 className="text-sm font-semibold tracking-tight truncate">
-															{config.name}
-														</h4>
-														{config.description && (
-															<p className="text-[11px] text-muted-foreground/70 truncate mt-0.5">
-																{config.description}
-															</p>
-														)}
+												{/* Header: Icon + Name + Actions */}
+												<div className="flex items-center justify-between gap-2">
+													<div className="flex items-center gap-2.5 min-w-0 flex-1">
+														<div className="shrink-0">
+															{getProviderIcon(config.provider, { className: "size-4" })}
+														</div>
+														<div className="min-w-0 flex-1">
+															<h4 className="text-sm font-semibold tracking-tight truncate">
+																{config.name}
+															</h4>
+															{config.description && (
+																<p className="text-[11px] text-muted-foreground/70 truncate mt-0.5">
+																	{config.description}
+																</p>
+															)}
+														</div>
 													</div>
 													{(canUpdate || canDelete) && (
-														<div className="flex items-center gap-0.5 shrink-0 sm:opacity-0 sm:group-hover:opacity-100 transition-opacity duration-150">
+														<div className="flex items-center gap-1 shrink-0 sm:w-0 sm:overflow-hidden sm:group-hover:w-auto sm:opacity-0 sm:group-hover:opacity-100 transition-all duration-150">
 															{canUpdate && (
 																<TooltipProvider>
 																	<Tooltip open={isDesktop ? undefined : false}>
@@ -277,9 +365,9 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) {
 																				variant="ghost"
 																				size="icon"
 																				onClick={() => openEditDialog(config)}
-																				className="h-7 w-7 text-muted-foreground hover:text-foreground"
+																				className="h-7 w-7 rounded-lg text-muted-foreground hover:text-accent-foreground"
 																			>
-																				<Edit3 className="h-3 w-3" />
+																				<Pencil className="h-3 w-3" />
 																			</Button>
 																		</TooltipTrigger>
 																		<TooltipContent>Edit</TooltipContent>
@@ -294,7 +382,7 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) {
 																				variant="ghost"
 																				size="icon"
 																				onClick={() => setConfigToDelete(config)}
-																				className="h-7 w-7 text-muted-foreground hover:text-destructive"
+																				className="h-7 w-7 rounded-lg text-muted-foreground hover:text-destructive"
 																			>
 																				<Trash2 className="h-3 w-3" />
 																			</Button>
@@ -307,50 +395,45 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) {
 													)}
 												</div>
 
-												{/* Provider + Model */}
-												<div className="flex items-center gap-2 flex-wrap">
-													{getProviderIcon(config.provider, { className: "size-3.5 shrink-0" })}
-													<code className="text-[11px] font-mono text-muted-foreground bg-muted/60 px-2 py-0.5 rounded-md truncate max-w-[160px]">
-														{config.model_name}
-													</code>
-												</div>
-
 												{/* Footer: Date + Creator */}
-												<div className="flex items-center gap-2 pt-2 border-t border-border/40 mt-auto">
-													<span className="text-[11px] text-muted-foreground/60">
-														{new Date(config.created_at).toLocaleDateString(undefined, {
-															year: "numeric",
-															month: "short",
-															day: "numeric",
-														})}
-													</span>
-													{member && (
-														<>
-															<Dot className="h-4 w-4 text-muted-foreground/30" />
-															<TooltipProvider>
-																<Tooltip open={isDesktop ? undefined : false}>
-																	<TooltipTrigger asChild>
-																		<div className="flex items-center gap-1.5 cursor-default">
-																			<Avatar className="size-4.5 shrink-0">
-																				{member.avatarUrl && (
-																					<AvatarImage src={member.avatarUrl} alt={member.name} />
-																				)}
-																				<AvatarFallback className="text-[9px]">
-																					{getInitials(member.name)}
-																				</AvatarFallback>
-																			</Avatar>
-																			<span className="text-[11px] text-muted-foreground/60 truncate max-w-[120px]">
-																				{member.name}
-																			</span>
-																		</div>
-																	</TooltipTrigger>
-																	<TooltipContent side="bottom">
-																		{member.email || member.name}
-																	</TooltipContent>
-																</Tooltip>
-															</TooltipProvider>
-														</>
-													)}
+												<div className="mt-auto space-y-2">
+													<Separator className="bg-accent" />
+													<div className="flex items-center">
+														<span className="shrink-0 text-[11px] text-muted-foreground/60 whitespace-nowrap">
+															{new Date(config.created_at).toLocaleDateString(undefined, {
+																year: "numeric",
+																month: "short",
+																day: "numeric",
+															})}
+														</span>
+														{member && (
+															<>
+																<Dot className="h-4 w-4 text-muted-foreground/30 shrink-0" />
+																<TooltipProvider>
+																	<Tooltip open={isDesktop ? undefined : false}>
+																		<TooltipTrigger asChild>
+																			<div className="min-w-0 flex items-center gap-1.5 cursor-default">
+																				<Avatar className="size-4.5 shrink-0">
+																					{member.avatarUrl && (
+																						<AvatarImage src={member.avatarUrl} alt={member.name} />
+																					)}
+																					<AvatarFallback className="text-[9px]">
+																						{getInitials(member.name)}
+																					</AvatarFallback>
+																				</Avatar>
+																				<span className="text-[11px] text-muted-foreground/60 truncate">
+																					{member.name}
+																				</span>
+																			</div>
+																		</TooltipTrigger>
+																		<TooltipContent side="bottom">
+																			{member.email || member.name}
+																		</TooltipContent>
+																	</Tooltip>
+																</TooltipProvider>
+															</>
+														)}
+													</div>
 												</div>
 											</CardContent>
 										</Card>
diff --git a/surfsense_web/components/settings/llm-role-manager.tsx b/surfsense_web/components/settings/llm-role-manager.tsx
index 995159d581..17370b8f23 100644
--- a/surfsense_web/components/settings/llm-role-manager.tsx
+++ b/surfsense_web/components/settings/llm-role-manager.tsx
@@ -6,13 +6,12 @@ import {
 	Bot,
 	CircleCheck,
 	CircleDashed,
-	Eye,
 	FileText,
 	ImageIcon,
 	RefreshCw,
-	Shuffle,
+	ScanEye,
 } from "lucide-react";
-import { useCallback, useEffect, useRef, useState } from "react";
+import { useCallback, useState } from "react";
 import { toast } from "sonner";
 import {
 	globalImageGenConfigsAtom,
@@ -44,7 +43,6 @@ import {
 } from "@/components/ui/select";
 import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
-import { getProviderIcon } from "@/lib/provider-icons";
 import { cn } from "@/lib/utils";
 
 const ROLE_DESCRIPTIONS = {
@@ -76,11 +74,11 @@ const ROLE_DESCRIPTIONS = {
 		configType: "image" as const,
 	},
 	vision: {
-		icon: Eye,
+		icon: ScanEye,
 		title: "Vision LLM",
 		description: "Vision-capable model for screenshot analysis and context extraction",
-		color: "text-amber-600 dark:text-amber-400",
-		bgColor: "bg-amber-500/10",
+		color: "text-muted-foreground",
+		bgColor: "bg-muted",
 		prefKey: "vision_llm_config_id" as const,
 		configType: "vision" as const,
 	},
@@ -145,23 +143,6 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 	}));
 
 	const [savingRole, setSavingRole] = useState<string | null>(null);
-	const savingRef = useRef(false);
-
-	useEffect(() => {
-		if (!savingRef.current) {
-			setAssignments({
-				agent_llm_id: preferences.agent_llm_id ?? "",
-				document_summary_llm_id: preferences.document_summary_llm_id ?? "",
-				image_generation_config_id: preferences.image_generation_config_id ?? "",
-				vision_llm_config_id: preferences.vision_llm_config_id ?? "",
-			});
-		}
-	}, [
-		preferences?.agent_llm_id,
-		preferences?.document_summary_llm_id,
-		preferences?.image_generation_config_id,
-		preferences?.vision_llm_config_id,
-	]);
 
 	const handleRoleAssignment = useCallback(
 		async (prefKey: string, configId: string) => {
@@ -169,7 +150,6 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 
 			setAssignments((prev) => ({ ...prev, [prefKey]: value }));
 			setSavingRole(prefKey);
-			savingRef.current = true;
 
 			try {
 				await updatePreferences({
@@ -179,7 +159,6 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 				toast.success("Role assignment updated");
 			} finally {
 				setSavingRole(null);
-				savingRef.current = false;
 			}
 		},
 		[updatePreferences, searchSpaceId]
@@ -205,11 +184,6 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 		),
 	];
 
-	const isAssignmentComplete =
-		allLLMConfigs.some((c) => c.id === assignments.agent_llm_id) &&
-		allLLMConfigs.some((c) => c.id === assignments.document_summary_llm_id) &&
-		allImageConfigs.some((c) => c.id === assignments.image_generation_config_id);
-
 	const isLoading =
 		configsLoading ||
 		preferencesLoading ||
@@ -231,7 +205,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 	return (
 		<div className="space-y-5 md:space-y-6">
 			{/* Header actions */}
-			<div className="flex items-center justify-between">
+			<div className="flex items-center justify-start">
 				<Button
 					variant="secondary"
 					size="sm"
@@ -239,15 +213,9 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 					disabled={isLoading}
 					className="gap-2"
 				>
-					<RefreshCw className="h-3.5 w-3.5" />
+					<RefreshCw className={cn("h-3.5 w-3.5", isLoading && "animate-spin")} />
 					Refresh
 				</Button>
-				{isAssignmentComplete && !isLoading && !hasError && (
-					<Badge variant="outline" className="text-xs gap-1.5 text-muted-foreground">
-						<CircleCheck className="h-3 w-3" />
-						All roles assigned
-					</Badge>
-				)}
 			</div>
 
 			{/* Error Alert */}
@@ -268,35 +236,11 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 			{isLoading && (
 				<div className="grid gap-4 grid-cols-1 lg:grid-cols-2">
 					{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
-						<Card key={key} className="border-border/60">
-							<CardContent className="p-4 md:p-5 space-y-4">
-								{/* Header: icon + title + status */}
-								<div className="flex items-start justify-between gap-3">
-									<div className="flex items-center gap-3 min-w-0">
-										<Skeleton className="h-9 w-9 rounded-lg shrink-0" />
-										<div className="space-y-1.5 flex-1">
-											<Skeleton className="h-4 w-24 md:w-28" />
-											<Skeleton className="h-3 w-40 md:w-52" />
-										</div>
-									</div>
-									<Skeleton className="h-4 w-4 rounded-full shrink-0" />
-								</div>
-								{/* Label */}
-								<div className="space-y-1.5">
-									<Skeleton className="h-3 w-20" />
-									<Skeleton className="h-9 md:h-10 w-full rounded-md" />
-								</div>
-								{/* Summary block */}
-								<div className="rounded-lg border border-border/50 p-3 space-y-2">
-									<div className="flex items-center gap-2">
-										<Skeleton className="h-3.5 w-3.5 rounded shrink-0" />
-										<Skeleton className="h-3.5 w-28" />
-									</div>
-									<div className="flex items-center gap-1.5">
-										<Skeleton className="h-4 w-14 rounded-full" />
-										<Skeleton className="h-3 w-24" />
-									</div>
-								</div>
+						<Card key={key} className="border-accent bg-accent/20">
+							<CardContent className="p-4 flex flex-col gap-3 min-h-32">
+								<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+								<Skeleton className="h-3 w-full bg-accent" />
+								<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
 							</CardContent>
 						</Card>
 					))}
@@ -343,12 +287,10 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 
 						const assignedConfig = roleAllConfigs.find((config) => config.id === currentAssignment);
 						const isAssigned = !!assignedConfig;
-						const isAutoMode =
-							assignedConfig && "is_auto_mode" in assignedConfig && assignedConfig.is_auto_mode;
 
 						return (
 							<div key={key}>
-								<Card className="group relative overflow-hidden transition-all duration-200 border-border/60 hover:shadow-md h-full">
+								<Card className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full">
 									<CardContent className="p-4 md:p-5 space-y-4">
 										{/* Role Header */}
 										<div className="flex items-start justify-between gap-3">
@@ -389,7 +331,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 												<SelectTrigger className="w-full h-9 md:h-10 text-xs md:text-sm">
 													<SelectValue placeholder="Select a configuration" />
 												</SelectTrigger>
-												<SelectContent className="max-w-[calc(100vw-2rem)]">
+												<SelectContent className="max-w-[calc(100vw-2rem)] select-none">
 													<SelectItem
 														value="unassigned"
 														className="text-xs md:text-sm py-1.5 md:py-2"
@@ -405,35 +347,49 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 															</SelectLabel>
 															{roleGlobalConfigs.map((config) => {
 																const isAuto = "is_auto_mode" in config && config.is_auto_mode;
+																// Read billing_tier from the global config; default to "free"
+																// for legacy YAMLs / Auto stub. Premium gets a purple badge,
+																// free gets an emerald one — same palette as the chat
+																// model selector so the meaning is consistent across
+																// surfaces (issues E, H).
+																const billingTier =
+																	("billing_tier" in config &&
+																		typeof config.billing_tier === "string" &&
+																		config.billing_tier) ||
+																	"free";
+																const isPremium = billingTier === "premium";
 																return (
 																	<SelectItem
 																		key={config.id}
 																		value={config.id.toString()}
 																		className="text-xs md:text-sm py-1.5 md:py-2"
+																		textValue={config.name}
 																	>
 																		<div className="flex items-center gap-1 md:gap-1.5 flex-wrap min-w-0">
-																			{isAuto ? (
-																				<Shuffle className="size-3 md:size-3.5 shrink-0 text-muted-foreground" />
-																			) : (
-																				getProviderIcon(config.provider, {
-																					className: "size-3 md:size-3.5 shrink-0",
-																				})
-																			)}
 																			<span className="truncate text-xs md:text-sm">
 																				{config.name}
 																			</span>
-																			{!isAuto && (
-																				<span className="text-muted-foreground text-[10px] md:text-[11px] truncate">
-																					({config.model_name})
-																				</span>
-																			)}
-																			{isAuto && (
+																			{isAuto ? (
 																				<Badge
 																					variant="secondary"
-																					className="text-[8px] md:text-[9px] shrink-0 bg-violet-100 text-violet-700 dark:bg-violet-900/30 dark:text-violet-300"
+																					className="text-[8px] md:text-[9px] shrink-0 bg-zinc-200 text-zinc-600 dark:bg-zinc-700 dark:text-zinc-300 [[data-slot=select-trigger]_&]:hidden"
 																				>
 																					Recommended
 																				</Badge>
+																			) : isPremium ? (
+																				<Badge
+																					variant="secondary"
+																					className="text-[8px] md:text-[9px] shrink-0 bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300 border-0 [[data-slot=select-trigger]_&]:hidden"
+																				>
+																					Premium
+																				</Badge>
+																			) : (
+																				<Badge
+																					variant="secondary"
+																					className="text-[8px] md:text-[9px] shrink-0 bg-emerald-100 text-emerald-700 dark:bg-emerald-900/50 dark:text-emerald-300 border-0 [[data-slot=select-trigger]_&]:hidden"
+																				>
+																					Free
+																				</Badge>
 																			)}
 																		</div>
 																	</SelectItem>
@@ -455,15 +411,9 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 																	className="text-xs md:text-sm py-1.5 md:py-2"
 																>
 																	<div className="flex items-center gap-1 md:gap-1.5 flex-wrap min-w-0">
-																		{getProviderIcon(config.provider, {
-																			className: "size-3 md:size-3.5 shrink-0",
-																		})}
 																		<span className="truncate text-xs md:text-sm">
 																			{config.name}
 																		</span>
-																		<span className="text-muted-foreground text-[10px] md:text-[11px] truncate">
-																			({config.model_name})
-																		</span>
 																	</div>
 																</SelectItem>
 															))}
@@ -472,63 +422,6 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
 												</SelectContent>
 											</Select>
 										</div>
-
-										{/* Assigned Config Summary */}
-										{assignedConfig && (
-											<div
-												className={cn(
-													"rounded-lg p-3 border",
-													isAutoMode
-														? "bg-violet-50 dark:bg-violet-900/10 border-violet-200/50 dark:border-violet-800/30"
-														: "bg-muted/40 border-border/50"
-												)}
-											>
-												{isAutoMode ? (
-													<div className="flex items-center gap-2">
-														<Shuffle
-															className={cn(
-																"w-3.5 h-3.5 shrink-0 text-violet-600 dark:text-violet-400"
-															)}
-														/>
-														<div className="min-w-0">
-															<p className="text-xs font-medium text-violet-700 dark:text-violet-300">
-																Auto Mode
-															</p>
-															<p className="text-[10px] text-violet-600/70 dark:text-violet-400/70 mt-0.5">
-																Routes across all available providers
-															</p>
-														</div>
-													</div>
-												) : (
-													<div className="flex items-start gap-2">
-														<IconComponent className="w-3.5 h-3.5 shrink-0 mt-0.5 text-muted-foreground" />
-														<div className="min-w-0 flex-1">
-															<div className="flex items-center gap-1.5 flex-wrap">
-																<span className="text-xs font-medium">{assignedConfig.name}</span>
-																{"is_global" in assignedConfig && assignedConfig.is_global && (
-																	<Badge variant="secondary" className="text-[9px] px-1.5 py-0">
-																		🌐 Global
-																	</Badge>
-																)}
-															</div>
-															<div className="flex items-center gap-1.5 mt-1">
-																{getProviderIcon(assignedConfig.provider, {
-																	className: "size-3 shrink-0",
-																})}
-																<code className="text-[10px] text-muted-foreground font-mono truncate">
-																	{assignedConfig.model_name}
-																</code>
-															</div>
-															{assignedConfig.api_base && (
-																<p className="text-[10px] text-muted-foreground/60 mt-1 truncate">
-																	{assignedConfig.api_base}
-																</p>
-															)}
-														</div>
-													</div>
-												)}
-											</div>
-										)}
 									</CardContent>
 								</Card>
 							</div>
diff --git a/surfsense_web/components/settings/more-pages-content.tsx b/surfsense_web/components/settings/more-pages-content.tsx
index 29245775cb..e1b05f4d27 100644
--- a/surfsense_web/components/settings/more-pages-content.tsx
+++ b/surfsense_web/components/settings/more-pages-content.tsx
@@ -1,21 +1,14 @@
 "use client";
 
 import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
-import { Check, ExternalLink, Mail } from "lucide-react";
+import { Check, ExternalLink } from "lucide-react";
 import Link from "next/link";
 import { useParams } from "next/navigation";
-import { useEffect, useState } from "react";
+import { useEffect } from "react";
 import { toast } from "sonner";
 import { USER_QUERY_KEY } from "@/atoms/user/user-query.atoms";
 import { Button } from "@/components/ui/button";
 import { Card, CardContent } from "@/components/ui/card";
-import {
-	Dialog,
-	DialogContent,
-	DialogDescription,
-	DialogHeader,
-	DialogTitle,
-} from "@/components/ui/dialog";
 import { Separator } from "@/components/ui/separator";
 import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
@@ -33,7 +26,6 @@ export function MorePagesContent() {
 	const params = useParams();
 	const queryClient = useQueryClient();
 	const searchSpaceId = params?.search_space_id ?? "";
-	const [claimOpen, setClaimOpen] = useState(false);
 
 	useEffect(() => {
 		trackIncentivePageViewed();
@@ -78,48 +70,23 @@ export function MorePagesContent() {
 		<div className="w-full space-y-5">
 			<div className="text-center">
 				<h2 className="text-xl font-bold tracking-tight">Get Free Pages</h2>
-				<p className="mt-1 text-sm text-muted-foreground">
-					Claim your free page offer and earn bonus pages
-				</p>
+				<p className="mt-1 text-sm text-muted-foreground">Earn bonus pages by completing tasks</p>
 			</div>
 
-			{/* 3k free offer */}
-			<Card className="border-emerald-500/30 bg-emerald-500/5">
-				<CardContent className="flex items-center gap-3 p-4">
-					<div className="flex h-10 w-10 shrink-0 items-center justify-center rounded-full bg-emerald-600 text-white text-xs font-bold">
-						3k
-					</div>
-					<div className="min-w-0 flex-1">
-						<p className="text-sm font-semibold">Claim 3,000 Free Pages</p>
-						<p className="text-xs text-muted-foreground">
-							Limited offer. Schedule a meeting or email us to claim.
-						</p>
-					</div>
-					<Button
-						size="sm"
-						className="bg-emerald-600 text-white hover:bg-emerald-700"
-						onClick={() => setClaimOpen(true)}
-					>
-						Claim
-					</Button>
-				</CardContent>
-			</Card>
-
-			<Separator />
-
-			{/* Free tasks */}
 			<div className="space-y-2">
 				<h3 className="text-sm font-semibold">Earn Bonus Pages</h3>
 				{isLoading ? (
-					<Card>
-						<CardContent className="flex items-center gap-3 p-3">
-							<Skeleton className="h-8 w-8 rounded-full bg-muted" />
-							<div className="flex-1 space-y-2">
-								<Skeleton className="h-4 w-3/4 bg-muted" />
-							</div>
-							<Skeleton className="h-8 w-16 bg-muted" />
-						</CardContent>
-					</Card>
+					<div className="space-y-1.5">
+						{["github", "reddit", "discord"].map((task) => (
+							<Card key={task} className="bg-transparent">
+								<CardContent className="flex items-center gap-3 p-3">
+									<Skeleton className="h-8 w-8 rounded-full bg-muted" />
+									<Skeleton className="h-4 flex-1 bg-muted" />
+									<Skeleton className="h-8 w-16 bg-muted" />
+								</CardContent>
+							</Card>
+						))}
+					</div>
 				) : (
 					<div className="space-y-1.5">
 						{data?.tasks.map((task) => (
@@ -154,7 +121,7 @@ export function MorePagesContent() {
 										disabled={task.completed || completeMutation.isPending}
 										onClick={() => handleTaskClick(task)}
 										asChild={!task.completed}
-										className="text-muted-foreground hover:text-foreground"
+										className="text-muted-foreground hover:text-accent-foreground"
 									>
 										{task.completed ? (
 											<span>Done</span>
@@ -182,7 +149,6 @@ export function MorePagesContent() {
 
 			<Separator />
 
-			{/* Link to buy pages */}
 			<div className="text-center">
 				<p className="text-sm text-muted-foreground">Need more?</p>
 				{pageBuyingEnabled ? (
@@ -197,25 +163,6 @@ export function MorePagesContent() {
 					</p>
 				)}
 			</div>
-
-			{/* Claim 3k dialog */}
-			<Dialog open={claimOpen} onOpenChange={setClaimOpen}>
-				<DialogContent className="sm:max-w-md">
-					<DialogHeader>
-						<DialogTitle>Claim 3,000 Free Pages</DialogTitle>
-						<DialogDescription>
-							Send us an email to claim your free 3,000 pages. Include your account email and
-							primary usecase for free pages.
-						</DialogDescription>
-					</DialogHeader>
-					<Button asChild className="w-full gap-2">
-						<a href="mailto:rohan@surfsense.com?subject=Claim%203%2C000%20Free%20Pages&body=Hi%2C%20I'd%20like%20to%20claim%20the%203%2C000%20free%20pages%20offer.%0A%0AMy%20account%20email%3A%20">
-							<Mail className="h-4 w-4" />
-							rohan@surfsense.com
-						</a>
-					</Button>
-				</DialogContent>
-			</Dialog>
 		</div>
 	);
 }
diff --git a/surfsense_web/components/settings/more-pages-dialog.tsx b/surfsense_web/components/settings/more-pages-dialog.tsx
deleted file mode 100644
index 450079f36d..0000000000
--- a/surfsense_web/components/settings/more-pages-dialog.tsx
+++ /dev/null
@@ -1,24 +0,0 @@
-"use client";
-
-import { useAtom } from "jotai";
-import { morePagesDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
-import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog";
-import { MorePagesContent } from "./more-pages-content";
-
-export function MorePagesDialog() {
-	const [open, setOpen] = useAtom(morePagesDialogAtom);
-
-	return (
-		<Dialog open={open} onOpenChange={setOpen}>
-			<DialogContent
-				className="select-none max-w-md w-[95vw] max-h-[90vh] flex flex-col p-0 gap-0 overflow-hidden"
-				onOpenAutoFocus={(e) => e.preventDefault()}
-			>
-				<DialogTitle className="sr-only">Get More Pages</DialogTitle>
-				<div className="flex-1 overflow-y-auto p-6">
-					<MorePagesContent />
-				</div>
-			</DialogContent>
-		</Dialog>
-	);
-}
diff --git a/surfsense_web/components/settings/prompt-config-manager.tsx b/surfsense_web/components/settings/prompt-config-manager.tsx
index 66fda9e384..d04ee89dc2 100644
--- a/surfsense_web/components/settings/prompt-config-manager.tsx
+++ b/surfsense_web/components/settings/prompt-config-manager.tsx
@@ -4,9 +4,8 @@ import { useQuery } from "@tanstack/react-query";
 import { AlertTriangle, Info } from "lucide-react";
 import { useEffect, useState } from "react";
 import { toast } from "sonner";
-import { Alert, AlertDescription } from "@/components/ui/alert";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
-import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
 import { Label } from "@/components/ui/label";
 import { Skeleton } from "@/components/ui/skeleton";
 import { Textarea } from "@/components/ui/textarea";
@@ -32,13 +31,15 @@ export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps)
 
 	const [customInstructions, setCustomInstructions] = useState("");
 	const [saving, setSaving] = useState(false);
+	const hasSearchSpace = !!searchSpace;
+	const searchSpaceInstructions = searchSpace?.qna_custom_instructions;
 
 	// Initialize state from fetched search space
 	useEffect(() => {
-		if (searchSpace) {
-			setCustomInstructions(searchSpace.qna_custom_instructions || "");
+		if (hasSearchSpace) {
+			setCustomInstructions(searchSpaceInstructions || "");
 		}
-	}, [searchSpace?.qna_custom_instructions]);
+	}, [hasSearchSpace, searchSpaceInstructions]);
 
 	// Derive hasChanges during render
 	const hasChanges =
@@ -69,9 +70,9 @@ export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps)
 			toast.success("System instructions saved successfully");
 
 			await fetchSearchSpace();
-		} catch (error: any) {
+		} catch (error: unknown) {
 			console.error("Error saving system instructions:", error);
-			toast.error(error.message || "Failed to save system instructions");
+			toast.error(error instanceof Error ? error.message : "Failed to save system instructions");
 		} finally {
 			setSaving(false);
 		}
@@ -85,16 +86,16 @@ export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps)
 	if (loading) {
 		return (
 			<div className="space-y-4 md:space-y-6">
-				<Card>
-					<CardHeader className="px-3 md:px-6 pt-3 md:pt-6 pb-2 md:pb-3">
+				<div className="space-y-3 md:space-y-4">
+					<div className="space-y-2">
 						<Skeleton className="h-5 md:h-6 w-36 md:w-48" />
 						<Skeleton className="h-3 md:h-4 w-full max-w-md mt-2" />
-					</CardHeader>
-					<CardContent className="space-y-3 md:space-y-4 px-3 md:px-6 pb-3 md:pb-6">
+					</div>
+					<div className="space-y-3 md:space-y-4">
 						<Skeleton className="h-16 md:h-20 w-full" />
 						<Skeleton className="h-24 md:h-32 w-full" />
-					</CardContent>
-				</Card>
+					</div>
+				</div>
 			</div>
 		);
 	}
@@ -102,21 +103,19 @@ export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps)
 	return (
 		<div className="space-y-4 md:space-y-6">
 			{/* Work in Progress Notice */}
-			<Alert
-				variant="default"
-				className="bg-amber-50 dark:bg-amber-950/30 border-amber-300 dark:border-amber-700 py-3 md:py-4"
-			>
-				<AlertTriangle className="h-3 w-3 md:h-4 md:w-4 text-amber-600 dark:text-amber-500 shrink-0" />
-				<AlertDescription className="text-amber-800 dark:text-amber-300 text-xs md:text-sm">
-					<span className="font-semibold">Work in Progress:</span> This functionality is currently
-					under development and not yet connected to the backend. Your instructions will be saved
-					but won't affect AI behavior until the feature is fully implemented.
+			<Alert variant="warning">
+				<AlertTriangle />
+				<AlertTitle>Work in Progress</AlertTitle>
+				<AlertDescription>
+					This functionality is currently under development and not yet connected to the backend.
+					Your instructions will be saved but won't affect AI behavior until the feature is fully
+					implemented.
 				</AlertDescription>
 			</Alert>
 
-			<Alert className="bg-muted/50 py-3 md:py-4">
-				<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-				<AlertDescription className="text-xs md:text-sm">
+			<Alert>
+				<Info />
+				<AlertDescription>
 					System instructions apply to all AI interactions in this search space. They guide how the
 					AI responds, its tone, focus areas, and behavior patterns.
 				</AlertDescription>
@@ -124,15 +123,17 @@ export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps)
 
 			{/* System Instructions Card */}
 			<form onSubmit={onSubmit} className="space-y-4 md:space-y-6">
-				<Card>
-					<CardHeader className="px-3 md:px-6 pt-3 md:pt-6 pb-2 md:pb-3">
-						<CardTitle className="text-base md:text-lg">Custom System Instructions</CardTitle>
-						<CardDescription className="text-xs md:text-sm">
+				<div className="space-y-3 md:space-y-4">
+					<div className="space-y-1.5 md:space-y-2">
+						<h3 className="text-base md:text-lg font-semibold tracking-tight">
+							Custom System Instructions
+						</h3>
+						<p className="text-xs md:text-sm text-muted-foreground">
 							Provide specific guidelines for how you want the AI to respond. These instructions
 							will be applied to all answers in this search space.
-						</CardDescription>
-					</CardHeader>
-					<CardContent className="space-y-3 md:space-y-4 px-3 md:px-6 pb-3 md:pb-6">
+						</p>
+					</div>
+					<div className="space-y-3 md:space-y-4">
 						<div className="space-y-1.5 md:space-y-2">
 							<Label
 								htmlFor="custom-instructions-settings"
@@ -167,15 +168,15 @@ export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps)
 						</div>
 
 						{customInstructions.trim().length === 0 && (
-							<Alert className="py-2 md:py-3">
-								<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-								<AlertDescription className="text-xs md:text-sm">
+							<Alert>
+								<Info />
+								<AlertDescription>
 									No system instructions are currently set. The AI will use default behavior.
 								</AlertDescription>
 							</Alert>
 						)}
-					</CardContent>
-				</Card>
+					</div>
+				</div>
 
 				{/* Action Buttons */}
 				<div className="flex justify-end pt-3 md:pt-4">
@@ -183,7 +184,7 @@ export function PromptConfigManager({ searchSpaceId }: PromptConfigManagerProps)
 						type="submit"
 						variant="outline"
 						disabled={!hasChanges || saving}
-						className="gap-2 bg-white text-black hover:bg-neutral-100 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+						className="gap-2 bg-white text-black hover:bg-accent hover:text-accent-foreground dark:bg-white dark:text-black"
 					>
 						{saving ? <Spinner size="sm" /> : null}
 						{saving ? "Saving" : "Save Instructions"}
diff --git a/surfsense_web/components/settings/roles-manager.tsx b/surfsense_web/components/settings/roles-manager.tsx
index 5b30b5f600..ee32f6e698 100644
--- a/surfsense_web/components/settings/roles-manager.tsx
+++ b/surfsense_web/components/settings/roles-manager.tsx
@@ -4,20 +4,25 @@ import { useQuery } from "@tanstack/react-query";
 import { useAtomValue } from "jotai";
 import {
 	Bot,
-	Edit2,
+	ChevronRight,
+	Earth,
 	FileText,
-	Globe,
+	Image,
 	Logs,
 	type LucideIcon,
-	MessageCircle,
+	MessageCircleReply,
 	MessageSquare,
 	Mic,
 	MoreHorizontal,
-	Plug,
+	Pencil,
+	ScanEye,
 	Settings,
 	Shield,
+	SlidersHorizontal,
 	Trash2,
+	Unplug,
 	Users,
+	Video,
 } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
 import { toast } from "sonner";
@@ -47,7 +52,6 @@ import {
 	DialogDescription,
 	DialogHeader,
 	DialogTitle,
-	DialogTrigger,
 } from "@/components/ui/dialog";
 import {
 	DropdownMenu,
@@ -58,7 +62,6 @@ import {
 } from "@/components/ui/dropdown-menu";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
-import { ScrollArea } from "@/components/ui/scroll-area";
 import { Spinner } from "@/components/ui/spinner";
 import type { PermissionInfo } from "@/contracts/types/permissions.types";
 import type {
@@ -89,7 +92,7 @@ const CATEGORY_CONFIG: Record<
 	},
 	comments: {
 		label: "Comments",
-		icon: MessageCircle,
+		icon: MessageCircleReply,
 		description: "Add annotations to documents",
 		order: 3,
 	},
@@ -99,6 +102,24 @@ const CATEGORY_CONFIG: Record<
 		description: "Configure AI model settings",
 		order: 4,
 	},
+	image_generations: {
+		label: "Image Models",
+		icon: Image,
+		description: "Configure image generation model settings",
+		order: 4.1,
+	},
+	vision_configs: {
+		label: "Vision Models",
+		icon: ScanEye,
+		description: "Configure vision model settings",
+		order: 4.2,
+	},
+	video_presentations: {
+		label: "Video Presentations",
+		icon: Video,
+		description: "Generate and manage video presentations",
+		order: 4.3,
+	},
 	podcasts: {
 		label: "Podcasts",
 		icon: Mic,
@@ -106,8 +127,8 @@ const CATEGORY_CONFIG: Record<
 		order: 5,
 	},
 	connectors: {
-		label: "Integrations",
-		icon: Plug,
+		label: "Connectors",
+		icon: Unplug,
 		description: "Connect external data sources",
 		order: 6,
 	},
@@ -137,10 +158,16 @@ const CATEGORY_CONFIG: Record<
 	},
 	public_sharing: {
 		label: "Public Chat Sharing",
-		icon: Globe,
+		icon: Earth,
 		description: "Share chats publicly via links",
 		order: 11,
 	},
+	general: {
+		label: "General",
+		icon: SlidersHorizontal,
+		description: "General search space permissions",
+		order: 12,
+	},
 };
 
 const ACTION_LABELS: Record<string, string> = {
@@ -319,111 +346,17 @@ export function RolesManager({ searchSpaceId }: { searchSpaceId: number }) {
 	);
 }
 
-// ============ Role Permissions Display ============
-
-function RolePermissionsDialog({
-	permissions,
-	roleName,
-	children,
-}: {
-	permissions: string[];
-	roleName: string;
-	children: React.ReactNode;
-}) {
-	const isFullAccess = permissions.includes("*");
-
-	const grouped: Record<string, string[]> = {};
-	if (!isFullAccess) {
-		for (const perm of permissions) {
-			const [category, action] = perm.split(":");
-			if (!grouped[category]) grouped[category] = [];
-			grouped[category].push(action);
-		}
-	}
-
-	const sortedCategories = Object.keys(grouped).sort((a, b) => {
-		const orderA = CATEGORY_CONFIG[a]?.order ?? 99;
-		const orderB = CATEGORY_CONFIG[b]?.order ?? 99;
-		return orderA - orderB;
-	});
-
-	const categoryCount = sortedCategories.length;
-
-	return (
-		<Dialog>
-			<DialogTrigger asChild>{children}</DialogTrigger>
-			<DialogContent className="w-[92vw] max-w-md p-0 gap-0">
-				<DialogHeader className="p-4 md:p-5">
-					<DialogTitle className="text-base">{roleName} — Permissions</DialogTitle>
-					<DialogDescription className="text-xs">
-						{isFullAccess
-							? "This role has unrestricted access to all resources"
-							: `${permissions.length} permissions across ${categoryCount} categories`}
-					</DialogDescription>
-				</DialogHeader>
-				{isFullAccess ? (
-					<div className="flex items-center gap-3 px-4 md:px-5 py-6">
-						<div className="h-9 w-9 rounded-lg bg-muted/60 flex items-center justify-center shrink-0">
-							<Shield className="h-4 w-4 text-muted-foreground" />
-						</div>
-						<div>
-							<p className="text-sm font-medium">Full access</p>
-							<p className="text-xs text-muted-foreground">
-								All permissions granted across every category
-							</p>
-						</div>
-					</div>
-				) : (
-					<ScrollArea className="max-h-[55vh]">
-						<div className="divide-y divide-border/50">
-							{sortedCategories.map((category) => {
-								const actions = grouped[category];
-								const config = CATEGORY_CONFIG[category] || {
-									label: category,
-									icon: FileText,
-								};
-								const IconComponent = config.icon;
-								return (
-									<div
-										key={category}
-										className="flex items-center justify-between gap-3 px-4 md:px-5 py-2.5"
-									>
-										<div className="flex items-center gap-2 shrink-0">
-											<IconComponent className="h-3.5 w-3.5 text-muted-foreground" />
-											<span className="text-sm text-muted-foreground">{config.label}</span>
-										</div>
-										<div className="flex flex-wrap justify-end gap-1">
-											{actions.map((action) => (
-												<span
-													key={action}
-													className="px-1.5 py-0.5 rounded bg-muted text-muted-foreground text-[11px] font-medium"
-												>
-													{ACTION_LABELS[action] || action.replace(/_/g, " ")}
-												</span>
-											))}
-										</div>
-									</div>
-								);
-							})}
-						</div>
-					</ScrollArea>
-				)}
-			</DialogContent>
-		</Dialog>
-	);
-}
-
 function PermissionsBadge({ permissions }: { permissions: string[] }) {
 	if (permissions.includes("*")) {
 		return (
-			<div className="px-2.5 py-1 rounded-md bg-muted/50 border border-border/60 text-muted-foreground">
-				<span className="text-xs font-medium whitespace-nowrap">Full access</span>
+			<div className="rounded-md border-0 bg-muted px-1.5 py-0.5 text-muted-foreground">
+				<span className="text-[10px] font-medium whitespace-nowrap">Full access</span>
 			</div>
 		);
 	}
 	return (
-		<div className="px-2.5 py-1 rounded-md border border-border/60 bg-muted/50 text-muted-foreground">
-			<span className="text-xs font-medium whitespace-nowrap">
+		<div className="rounded-md border-0 bg-muted px-1.5 py-0.5 text-muted-foreground">
+			<span className="text-[10px] font-medium whitespace-nowrap">
 				{permissions.length} permissions
 			</span>
 		</div>
@@ -463,6 +396,7 @@ function RolesContent({
 }) {
 	const [showCreateRole, setShowCreateRole] = useState(false);
 	const [editingRoleId, setEditingRoleId] = useState<number | null>(null);
+	const [expandedRoleId, setExpandedRoleId] = useState<number | null>(null);
 
 	if (loading) {
 		return (
@@ -481,7 +415,7 @@ function RolesContent({
 					<Button
 						variant="outline"
 						onClick={() => setShowCreateRole(true)}
-						className="gap-2 bg-white text-black hover:bg-neutral-100 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+						className="gap-2 bg-white text-black hover:bg-accent hover:text-accent-foreground dark:bg-white dark:text-black"
 					>
 						Create Custom Role
 					</Button>
@@ -508,12 +442,35 @@ function RolesContent({
 			)}
 
 			<div className="space-y-3">
-				{roles.map((role) => (
-					<div key={role.id}>
-						<div className="w-full text-left relative flex items-center gap-4 rounded-lg border border-border/60 p-4 transition-colors hover:bg-muted/30">
-							<div className="flex-1 min-w-0">
-								<RolePermissionsDialog permissions={role.permissions} roleName={role.name}>
-									<button type="button" className="w-full text-left cursor-pointer">
+				{roles.map((role) => {
+					const isExpanded = expandedRoleId === role.id;
+					const isFullAccess = role.permissions.includes("*");
+
+					const grouped: Record<string, string[]> = {};
+					if (!isFullAccess) {
+						for (const perm of role.permissions) {
+							const [category, action] = perm.split(":");
+							if (!grouped[category]) grouped[category] = [];
+							grouped[category].push(action);
+						}
+					}
+					const sortedCategories = Object.keys(grouped).sort((a, b) => {
+						const orderA = CATEGORY_CONFIG[a]?.order ?? 99;
+						const orderB = CATEGORY_CONFIG[b]?.order ?? 99;
+						return orderA - orderB;
+					});
+
+					return (
+						<div key={role.id} className="rounded-lg border border-border/60 overflow-hidden">
+							<div className="group/role-header flex items-center gap-4 p-4 transition-colors hover:bg-accent hover:text-accent-foreground focus-within:bg-accent focus-within:text-accent-foreground">
+								<Button
+									type="button"
+									variant="ghost"
+									aria-expanded={isExpanded}
+									className="h-auto min-w-0 flex-1 justify-start gap-4 p-0 text-left font-normal hover:bg-transparent hover:text-inherit focus-visible:ring-0"
+									onClick={() => setExpandedRoleId(isExpanded ? null : role.id)}
+								>
+									<div className="flex-1 min-w-0 text-left">
 										<div className="flex items-center gap-2">
 											<span className="font-medium text-sm">{role.name}</span>
 											{role.is_system_role && (
@@ -532,67 +489,129 @@ function RolesContent({
 												{role.description}
 											</p>
 										)}
-									</button>
-								</RolePermissionsDialog>
-							</div>
+									</div>
 
-							<div className="shrink-0">
-								<PermissionsBadge permissions={role.permissions} />
+									<div className="shrink-0">
+										<PermissionsBadge permissions={role.permissions} />
+									</div>
+								</Button>
+
+								{!role.is_system_role && (
+									<div className="shrink-0">
+										<DropdownMenu>
+											<DropdownMenuTrigger asChild>
+												<Button variant="ghost" size="icon" className="h-8 w-8">
+													<MoreHorizontal className="h-4 w-4" />
+												</Button>
+											</DropdownMenuTrigger>
+											<DropdownMenuContent align="end" onCloseAutoFocus={(e) => e.preventDefault()}>
+												{canUpdate && (
+													<DropdownMenuItem onClick={() => setEditingRoleId(role.id)}>
+														<Pencil className="h-4 w-4 mr-2" />
+														Edit Role
+													</DropdownMenuItem>
+												)}
+												{canDelete && (
+													<>
+														<DropdownMenuSeparator />
+														<AlertDialog>
+															<AlertDialogTrigger asChild>
+																<DropdownMenuItem onSelect={(e) => e.preventDefault()}>
+																	<Trash2 className="h-4 w-4 mr-2" />
+																	Delete Role
+																</DropdownMenuItem>
+															</AlertDialogTrigger>
+															<AlertDialogContent>
+																<AlertDialogHeader>
+																	<AlertDialogTitle>Delete role?</AlertDialogTitle>
+																	<AlertDialogDescription>
+																		This will permanently delete the &quot;{role.name}&quot; role.
+																		Members with this role will lose their permissions.
+																	</AlertDialogDescription>
+																</AlertDialogHeader>
+																<AlertDialogFooter>
+																	<AlertDialogCancel>Cancel</AlertDialogCancel>
+																	<AlertDialogAction
+																		onClick={() => onDeleteRole(role.id)}
+																		className="bg-destructive text-destructive-foreground hover:bg-destructive/90"
+																	>
+																		Delete
+																	</AlertDialogAction>
+																</AlertDialogFooter>
+															</AlertDialogContent>
+														</AlertDialog>
+													</>
+												)}
+											</DropdownMenuContent>
+										</DropdownMenu>
+									</div>
+								)}
+
+								<Button
+									type="button"
+									variant="ghost"
+									size="icon"
+									aria-label={isExpanded ? `Collapse ${role.name}` : `Expand ${role.name}`}
+									aria-expanded={isExpanded}
+									className="size-6 shrink-0 p-1 hover:bg-transparent hover:text-inherit focus-visible:ring-0"
+									onClick={() => setExpandedRoleId(isExpanded ? null : role.id)}
+								>
+									<ChevronRight
+										className={cn(
+											"h-4 w-4 text-muted-foreground transition-transform duration-200",
+											isExpanded && "rotate-90"
+										)}
+									/>
+								</Button>
 							</div>
 
-							{!role.is_system_role && (
-								<div className="shrink-0" role="none">
-									<DropdownMenu>
-										<DropdownMenuTrigger asChild>
-											<Button variant="ghost" size="icon" className="h-8 w-8">
-												<MoreHorizontal className="h-4 w-4" />
-											</Button>
-										</DropdownMenuTrigger>
-										<DropdownMenuContent align="end" onCloseAutoFocus={(e) => e.preventDefault()}>
-											{canUpdate && (
-												<DropdownMenuItem onClick={() => setEditingRoleId(role.id)}>
-													<Edit2 className="h-4 w-4 mr-2" />
-													Edit Role
-												</DropdownMenuItem>
-											)}
-											{canDelete && (
-												<>
-													<DropdownMenuSeparator />
-													<AlertDialog>
-														<AlertDialogTrigger asChild>
-															<DropdownMenuItem onSelect={(e) => e.preventDefault()}>
-																<Trash2 className="h-4 w-4 mr-2" />
-																Delete Role
-															</DropdownMenuItem>
-														</AlertDialogTrigger>
-														<AlertDialogContent>
-															<AlertDialogHeader>
-																<AlertDialogTitle>Delete role?</AlertDialogTitle>
-																<AlertDialogDescription>
-																	This will permanently delete the &quot;{role.name}&quot; role.
-																	Members with this role will lose their permissions.
-																</AlertDialogDescription>
-															</AlertDialogHeader>
-															<AlertDialogFooter>
-																<AlertDialogCancel>Cancel</AlertDialogCancel>
-																<AlertDialogAction
-																	onClick={() => onDeleteRole(role.id)}
-																	className="bg-destructive text-destructive-foreground hover:bg-destructive/90"
+							{isExpanded && (
+								<div className="border-t border-border/40 px-4 py-3">
+									{isFullAccess ? (
+										<div className="flex items-center gap-3 py-2">
+											<Shield className="h-4 w-4 text-muted-foreground shrink-0" />
+											<p className="text-sm text-muted-foreground">
+												Full access — all permissions granted across every category
+											</p>
+										</div>
+									) : (
+										<div className="divide-y divide-border/30">
+											{sortedCategories.map((category) => {
+												const actions = grouped[category];
+												const config = CATEGORY_CONFIG[category] || {
+													label: category,
+													icon: FileText,
+												};
+												const IconComponent = config.icon;
+												return (
+													<div
+														key={category}
+														className="flex items-center justify-between gap-3 py-2.5"
+													>
+														<div className="flex items-center gap-2 shrink-0">
+															<IconComponent className="h-3.5 w-3.5 text-muted-foreground" />
+															<span className="text-sm text-muted-foreground">{config.label}</span>
+														</div>
+														<div className="flex flex-wrap justify-end gap-1">
+															{actions.map((action) => (
+																<span
+																	key={action}
+																	className="px-1.5 py-0.5 rounded bg-muted text-muted-foreground text-[11px] font-medium"
 																>
-																	Delete
-																</AlertDialogAction>
-															</AlertDialogFooter>
-														</AlertDialogContent>
-													</AlertDialog>
-												</>
-											)}
-										</DropdownMenuContent>
-									</DropdownMenu>
+																	{ACTION_LABELS[action] || action.replace(/_/g, " ")}
+																</span>
+															))}
+														</div>
+													</div>
+												);
+											})}
+										</div>
+									)}
 								</div>
 							)}
 						</div>
-					</div>
-				))}
+					);
+				})}
 			</div>
 		</div>
 	);
@@ -676,46 +695,44 @@ function PermissionsEditor({
 
 					return (
 						<div key={category} className="rounded-lg border border-border/60 overflow-hidden">
-							<button
-								type="button"
-								className="w-full flex items-center justify-between px-3 py-2.5 cursor-pointer hover:bg-muted/40 transition-colors"
-								onClick={() => toggleCategoryExpanded(category)}
-							>
-								<div className="flex items-center gap-2.5">
+							<div className="group/category-header flex items-center justify-between px-3 py-2.5 transition-colors hover:bg-accent hover:text-accent-foreground focus-within:bg-accent focus-within:text-accent-foreground">
+								<Button
+									type="button"
+									variant="ghost"
+									aria-expanded={isExpanded}
+									className="h-auto min-w-0 flex-1 justify-start gap-2.5 p-0 text-left font-normal hover:bg-transparent hover:text-inherit focus-visible:ring-0"
+									onClick={() => toggleCategoryExpanded(category)}
+								>
 									<IconComponent className="h-4 w-4 text-muted-foreground shrink-0" />
 									<span className="font-medium text-sm">{config.label}</span>
 									<span className="text-[11px] text-muted-foreground tabular-nums">
 										{stats.selected}/{stats.total}
 									</span>
-								</div>
+								</Button>
 								<div className="flex items-center gap-2">
 									<Checkbox
 										checked={stats.allSelected}
 										onCheckedChange={() => onToggleCategory(category)}
-										onClick={(e) => e.stopPropagation()}
 										aria-label={`Select all ${config.label} permissions`}
 									/>
-									<div
-										className={cn("transition-transform duration-200", isExpanded && "rotate-180")}
+									<Button
+										type="button"
+										variant="ghost"
+										size="icon"
+										aria-label={isExpanded ? `Collapse ${config.label}` : `Expand ${config.label}`}
+										aria-expanded={isExpanded}
+										className="size-6 p-1 hover:bg-transparent hover:text-inherit focus-visible:ring-0"
+										onClick={() => toggleCategoryExpanded(category)}
 									>
-										<svg
-											className="h-4 w-4 text-muted-foreground"
-											fill="none"
-											viewBox="0 0 24 24"
-											stroke="currentColor"
-											aria-hidden="true"
-										>
-											<title>Toggle</title>
-											<path
-												strokeLinecap="round"
-												strokeLinejoin="round"
-												strokeWidth={2}
-												d="M19 9l-7 7-7-7"
-											/>
-										</svg>
-									</div>
+										<ChevronRight
+											className={cn(
+												"h-4 w-4 text-muted-foreground transition-transform duration-200",
+												isExpanded && "rotate-90"
+											)}
+										/>
+									</Button>
 								</div>
-							</button>
+							</div>
 
 							{isExpanded && (
 								<div className="border-t border-border/60">
@@ -726,28 +743,34 @@ function PermissionsEditor({
 											const isSelected = selectedPermissions.includes(perm.value);
 
 											return (
-												<button
+												<div
 													key={perm.value}
-													type="button"
 													className={cn(
-														"w-full flex items-center justify-between gap-3 px-2.5 py-2 rounded-md cursor-pointer transition-colors",
-														isSelected ? "bg-muted/60 hover:bg-muted/80" : "hover:bg-muted/40"
+														"flex items-center justify-between gap-3 px-2.5 py-2 rounded-md transition-colors",
+														isSelected
+															? "bg-muted/60 hover:bg-accent hover:text-accent-foreground"
+															: "hover:bg-accent hover:text-accent-foreground"
 													)}
-													onClick={() => onTogglePermission(perm.value)}
 												>
-													<div className="flex-1 min-w-0 text-left">
-														<span className="text-sm font-medium">{actionLabel}</span>
-														<p className="text-xs text-muted-foreground truncate">
-															{perm.description}
-														</p>
-													</div>
+													<Button
+														type="button"
+														variant="ghost"
+														className="h-auto flex-1 min-w-0 justify-start p-0 text-left font-normal hover:bg-transparent hover:text-inherit focus-visible:ring-0"
+														onClick={() => onTogglePermission(perm.value)}
+													>
+														<span className="min-w-0">
+															<span className="block text-sm font-medium">{actionLabel}</span>
+															<span className="block text-xs text-muted-foreground truncate">
+																{perm.description}
+															</span>
+														</span>
+													</Button>
 													<Checkbox
 														checked={isSelected}
 														onCheckedChange={() => onTogglePermission(perm.value)}
-														onClick={(e) => e.stopPropagation()}
 														className="shrink-0"
 													/>
-												</button>
+												</div>
 											);
 										})}
 									</div>
@@ -858,12 +881,14 @@ function CreateRoleDialog({
 							<Label className="text-sm font-medium">Start from a template</Label>
 							<div className="grid grid-cols-3 gap-2">
 								{Object.entries(ROLE_PRESETS).map(([key, preset]) => (
-									<button
+									<Button
 										key={key}
 										type="button"
+										variant="outline"
 										onClick={() => applyPreset(key as keyof typeof ROLE_PRESETS)}
 										className={cn(
-											"p-3 rounded-lg border text-left transition-colors hover:bg-muted/40",
+											"h-auto p-3 whitespace-normal transition-colors hover:bg-accent hover:text-accent-foreground",
+											"flex items-center justify-center text-center sm:block sm:text-left",
 											selectedPermissions.length > 0 &&
 												preset.permissions.every((p) => selectedPermissions.includes(p))
 												? "border-foreground/30 bg-muted/40"
@@ -871,10 +896,10 @@ function CreateRoleDialog({
 										)}
 									>
 										<span className="font-medium text-sm">{preset.name}</span>
-										<p className="text-xs text-muted-foreground mt-0.5 line-clamp-2">
+										<p className="hidden sm:block text-xs text-muted-foreground mt-0.5 line-clamp-2">
 											{preset.description}
 										</p>
-									</button>
+									</Button>
 								))}
 							</div>
 						</div>
@@ -884,6 +909,7 @@ function CreateRoleDialog({
 								<Label htmlFor="role-name">Role Name *</Label>
 								<Input
 									id="role-name"
+									maxLength={100}
 									placeholder="e.g., Content Manager"
 									value={name}
 									onChange={(e) => setName(e.target.value)}
@@ -893,6 +919,7 @@ function CreateRoleDialog({
 								<Label htmlFor="role-description">Description</Label>
 								<Input
 									id="role-description"
+									maxLength={500}
 									placeholder="Brief description of this role"
 									value={description}
 									onChange={(e) => setDescription(e.target.value)}
@@ -1042,6 +1069,7 @@ function EditRoleDialog({
 								<Label htmlFor="edit-role-name">Role Name *</Label>
 								<Input
 									id="edit-role-name"
+									maxLength={100}
 									placeholder="e.g., Content Manager"
 									value={name}
 									onChange={(e) => setName(e.target.value)}
@@ -1051,6 +1079,7 @@ function EditRoleDialog({
 								<Label htmlFor="edit-role-description">Description</Label>
 								<Input
 									id="edit-role-description"
+									maxLength={500}
 									placeholder="Brief description of this role"
 									value={description}
 									onChange={(e) => setDescription(e.target.value)}
diff --git a/surfsense_web/components/settings/search-space-settings-dialog.tsx b/surfsense_web/components/settings/search-space-settings-dialog.tsx
deleted file mode 100644
index 34d28eb2ac..0000000000
--- a/surfsense_web/components/settings/search-space-settings-dialog.tsx
+++ /dev/null
@@ -1,117 +0,0 @@
-"use client";
-
-import { useAtom } from "jotai";
-import { Bot, Brain, Eye, FileText, Globe, ImageIcon, MessageSquare, Shield } from "lucide-react";
-import dynamic from "next/dynamic";
-import { useTranslations } from "next-intl";
-import type React from "react";
-import { searchSpaceSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
-import { SettingsDialog } from "@/components/settings/settings-dialog";
-
-const GeneralSettingsManager = dynamic(
-	() =>
-		import("@/components/settings/general-settings-manager").then((m) => ({
-			default: m.GeneralSettingsManager,
-		})),
-	{ ssr: false }
-);
-const ModelConfigManager = dynamic(
-	() =>
-		import("@/components/settings/model-config-manager").then((m) => ({
-			default: m.ModelConfigManager,
-		})),
-	{ ssr: false }
-);
-const LLMRoleManager = dynamic(
-	() =>
-		import("@/components/settings/llm-role-manager").then((m) => ({ default: m.LLMRoleManager })),
-	{ ssr: false }
-);
-const ImageModelManager = dynamic(
-	() =>
-		import("@/components/settings/image-model-manager").then((m) => ({
-			default: m.ImageModelManager,
-		})),
-	{ ssr: false }
-);
-const VisionModelManager = dynamic(
-	() =>
-		import("@/components/settings/vision-model-manager").then((m) => ({
-			default: m.VisionModelManager,
-		})),
-	{ ssr: false }
-);
-const RolesManager = dynamic(
-	() => import("@/components/settings/roles-manager").then((m) => ({ default: m.RolesManager })),
-	{ ssr: false }
-);
-const PromptConfigManager = dynamic(
-	() =>
-		import("@/components/settings/prompt-config-manager").then((m) => ({
-			default: m.PromptConfigManager,
-		})),
-	{ ssr: false }
-);
-const PublicChatSnapshotsManager = dynamic(
-	() =>
-		import("@/components/public-chat-snapshots/public-chat-snapshots-manager").then((m) => ({
-			default: m.PublicChatSnapshotsManager,
-		})),
-	{ ssr: false }
-);
-
-interface SearchSpaceSettingsDialogProps {
-	searchSpaceId: number;
-}
-
-export function SearchSpaceSettingsDialog({ searchSpaceId }: SearchSpaceSettingsDialogProps) {
-	const t = useTranslations("searchSpaceSettings");
-	const [state, setState] = useAtom(searchSpaceSettingsDialogAtom);
-
-	const navItems = [
-		{ value: "general", label: t("nav_general"), icon: <FileText className="h-4 w-4" /> },
-		{ value: "models", label: t("nav_agent_configs"), icon: <Bot className="h-4 w-4" /> },
-		{ value: "roles", label: t("nav_role_assignments"), icon: <Brain className="h-4 w-4" /> },
-		{
-			value: "image-models",
-			label: t("nav_image_models"),
-			icon: <ImageIcon className="h-4 w-4" />,
-		},
-		{
-			value: "vision-models",
-			label: t("nav_vision_models"),
-			icon: <Eye className="h-4 w-4" />,
-		},
-		{ value: "team-roles", label: t("nav_team_roles"), icon: <Shield className="h-4 w-4" /> },
-		{
-			value: "prompts",
-			label: t("nav_system_instructions"),
-			icon: <MessageSquare className="h-4 w-4" />,
-		},
-		{ value: "public-links", label: t("nav_public_links"), icon: <Globe className="h-4 w-4" /> },
-	];
-
-	const content: Record<string, React.ReactNode> = {
-		general: <GeneralSettingsManager searchSpaceId={searchSpaceId} />,
-		models: <ModelConfigManager searchSpaceId={searchSpaceId} />,
-		roles: <LLMRoleManager searchSpaceId={searchSpaceId} />,
-		"image-models": <ImageModelManager searchSpaceId={searchSpaceId} />,
-		"vision-models": <VisionModelManager searchSpaceId={searchSpaceId} />,
-		"team-roles": <RolesManager searchSpaceId={searchSpaceId} />,
-		prompts: <PromptConfigManager searchSpaceId={searchSpaceId} />,
-		"public-links": <PublicChatSnapshotsManager searchSpaceId={searchSpaceId} />,
-	};
-
-	return (
-		<SettingsDialog
-			open={state.open}
-			onOpenChange={(open) => setState((prev) => ({ ...prev, open }))}
-			title={t("title")}
-			navItems={navItems}
-			activeItem={state.initialTab}
-			onItemChange={(tab) => setState((prev) => ({ ...prev, initialTab: tab }))}
-		>
-			<div className="pt-4">{content[state.initialTab]}</div>
-		</SettingsDialog>
-	);
-}
diff --git a/surfsense_web/components/settings/settings-dialog.tsx b/surfsense_web/components/settings/settings-dialog.tsx
deleted file mode 100644
index 057d7643a9..0000000000
--- a/surfsense_web/components/settings/settings-dialog.tsx
+++ /dev/null
@@ -1,126 +0,0 @@
-"use client";
-
-import type * as React from "react";
-import { useCallback, useRef, useState } from "react";
-import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog";
-import { Separator } from "@/components/ui/separator";
-import { cn } from "@/lib/utils";
-
-interface NavItem {
-	value: string;
-	label: string;
-	icon: React.ReactNode;
-}
-
-interface SettingsDialogProps {
-	open: boolean;
-	onOpenChange: (open: boolean) => void;
-	title: string;
-	navItems: NavItem[];
-	activeItem: string;
-	onItemChange: (value: string) => void;
-	children: React.ReactNode;
-}
-
-export function SettingsDialog({
-	open,
-	onOpenChange,
-	title,
-	navItems,
-	activeItem,
-	onItemChange,
-	children,
-}: SettingsDialogProps) {
-	const activeRef = useRef<HTMLButtonElement>(null);
-	const [tabScrollPos, setTabScrollPos] = useState<"start" | "middle" | "end">("start");
-
-	const handleTabScroll = useCallback((e: React.UIEvent<HTMLDivElement>) => {
-		const el = e.currentTarget;
-		const atStart = el.scrollLeft <= 2;
-		const atEnd = el.scrollWidth - el.scrollLeft - el.clientWidth <= 2;
-		setTabScrollPos(atStart ? "start" : atEnd ? "end" : "middle");
-	}, []);
-
-	const handleItemChange = (value: string) => {
-		onItemChange(value);
-		activeRef.current?.scrollIntoView({ inline: "center", block: "nearest", behavior: "smooth" });
-	};
-
-	return (
-		<Dialog open={open} onOpenChange={onOpenChange}>
-			<DialogContent className="select-none max-w-[900px] w-[95vw] md:w-[90vw] h-[90vh] md:h-[80vh] max-h-[640px] flex flex-col md:flex-row p-0 gap-0 overflow-hidden [--card:var(--background)] dark:[--card:oklch(0.205_0_0)] dark:[--background:oklch(0.205_0_0)]">
-				<DialogTitle className="sr-only">{title}</DialogTitle>
-
-				{/* Desktop: Left sidebar */}
-				<nav className="hidden md:flex w-[220px] shrink-0 flex-col border-r border-border p-3 pt-6">
-					<div className="flex flex-col gap-0.5">
-						{navItems.map((item) => (
-							<button
-								key={item.value}
-								type="button"
-								onClick={() => onItemChange(item.value)}
-								className={cn(
-									"flex items-center gap-3 rounded-lg px-3 py-2.5 text-sm font-medium transition-colors text-left focus:outline-none focus-visible:outline-none",
-									activeItem === item.value
-										? "bg-accent text-accent-foreground"
-										: "text-muted-foreground hover:bg-accent/50 hover:text-foreground"
-								)}
-							>
-								{item.icon}
-								{item.label}
-							</button>
-						))}
-					</div>
-				</nav>
-
-				{/* Mobile: Top header + horizontal tabs */}
-				<div className="flex md:hidden flex-col shrink-0">
-					<div className="px-4 pt-4 pb-2">
-						<h2 className="text-base font-semibold">{title}</h2>
-					</div>
-					<div
-						className="overflow-x-auto scrollbar-hide border-b border-border"
-						onScroll={handleTabScroll}
-						style={{
-							maskImage: `linear-gradient(to right, ${tabScrollPos === "start" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${tabScrollPos === "end" ? "black" : "transparent"})`,
-							WebkitMaskImage: `linear-gradient(to right, ${tabScrollPos === "start" ? "black" : "transparent"}, black 24px, black calc(100% - 24px), ${tabScrollPos === "end" ? "black" : "transparent"})`,
-						}}
-					>
-						<div className="flex gap-1 px-4 pb-2">
-							{navItems.map((item) => (
-								<button
-									key={item.value}
-									ref={activeItem === item.value ? activeRef : undefined}
-									type="button"
-									onClick={() => handleItemChange(item.value)}
-									className={cn(
-										"flex items-center gap-2 whitespace-nowrap rounded-full px-3 py-1.5 text-xs font-medium transition-colors shrink-0 focus:outline-none focus-visible:outline-none",
-										activeItem === item.value
-											? "bg-accent text-accent-foreground"
-											: "text-muted-foreground hover:bg-accent/50 hover:text-foreground"
-									)}
-								>
-									{item.icon}
-									{item.label}
-								</button>
-							))}
-						</div>
-					</div>
-				</div>
-
-				{/* Content area */}
-				<div className="flex flex-1 flex-col overflow-hidden min-w-0">
-					<div className="hidden md:block px-8 pt-6 pb-2">
-						<h2 className="text-lg font-semibold">
-							{navItems.find((i) => i.value === activeItem)?.label ?? title}
-						</h2>
-						<Separator className="mt-4" />
-					</div>
-					<div className="flex-1 overflow-y-auto overflow-x-hidden">
-						<div className="px-4 md:px-8 pb-6 pt-4 md:pt-0 min-w-0">{children}</div>
-					</div>
-				</div>
-			</DialogContent>
-		</Dialog>
-	);
-}
diff --git a/surfsense_web/components/settings/team-dialog.tsx b/surfsense_web/components/settings/team-dialog.tsx
deleted file mode 100644
index a3a9623783..0000000000
--- a/surfsense_web/components/settings/team-dialog.tsx
+++ /dev/null
@@ -1,32 +0,0 @@
-"use client";
-
-import { useAtom } from "jotai";
-import { useTranslations } from "next-intl";
-import { TeamContent } from "@/app/dashboard/[search_space_id]/team/team-content";
-import { teamDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
-import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog";
-
-interface TeamDialogProps {
-	searchSpaceId: number;
-}
-
-export function TeamDialog({ searchSpaceId }: TeamDialogProps) {
-	const t = useTranslations("sidebar");
-	const [open, setOpen] = useAtom(teamDialogAtom);
-
-	return (
-		<Dialog open={open} onOpenChange={setOpen}>
-			<DialogContent
-				className="select-none max-w-[900px] w-[95vw] md:w-[90vw] h-[90vh] md:h-[80vh] max-h-[640px] flex flex-col p-0 gap-0 overflow-hidden [--card:var(--background)] dark:[--card:oklch(0.205_0_0)] dark:[--background:oklch(0.205_0_0)]"
-				onOpenAutoFocus={(e) => e.preventDefault()}
-			>
-				<DialogTitle className="sr-only">{t("manage_members")}</DialogTitle>
-				<div className="flex-1 overflow-y-auto overflow-x-hidden">
-					<div className="px-6 md:px-8 py-6 min-w-0">
-						<TeamContent searchSpaceId={searchSpaceId} />
-					</div>
-				</div>
-			</DialogContent>
-		</Dialog>
-	);
-}
diff --git a/surfsense_web/components/settings/team-memory-manager.tsx b/surfsense_web/components/settings/team-memory-manager.tsx
new file mode 100644
index 0000000000..9d3a40e466
--- /dev/null
+++ b/surfsense_web/components/settings/team-memory-manager.tsx
@@ -0,0 +1,299 @@
+"use client";
+
+import { useQuery, useQueryClient } from "@tanstack/react-query";
+import { useAtomValue } from "jotai";
+import { ArrowUp, ChevronDown, ClipboardCopy, Download, Info, Pencil } from "lucide-react";
+import { useEffect, useRef, useState } from "react";
+import { toast } from "sonner";
+import { z } from "zod";
+import { updateSearchSpaceMutationAtom } from "@/atoms/search-spaces/search-space-mutation.atoms";
+import { PlateEditor } from "@/components/editor/plate-editor";
+import { Alert, AlertDescription } from "@/components/ui/alert";
+import { Button } from "@/components/ui/button";
+import {
+	DropdownMenu,
+	DropdownMenuContent,
+	DropdownMenuItem,
+	DropdownMenuTrigger,
+} from "@/components/ui/dropdown-menu";
+import { Spinner } from "@/components/ui/spinner";
+import { baseApiService } from "@/lib/apis/base-api.service";
+import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service";
+import { cacheKeys } from "@/lib/query-client/cache-keys";
+
+const MEMORY_HARD_LIMIT = 25_000;
+
+const SearchSpaceSchema = z
+	.object({
+		shared_memory_md: z.string().optional().default(""),
+	})
+	.passthrough();
+
+interface TeamMemoryManagerProps {
+	searchSpaceId: number;
+}
+
+export function TeamMemoryManager({ searchSpaceId }: TeamMemoryManagerProps) {
+	const queryClient = useQueryClient();
+	const { data: searchSpace, isLoading: loading } = useQuery({
+		queryKey: cacheKeys.searchSpaces.detail(searchSpaceId.toString()),
+		queryFn: () => searchSpacesApiService.getSearchSpace({ id: searchSpaceId }),
+		enabled: !!searchSpaceId,
+	});
+
+	const { mutateAsync: updateSearchSpace } = useAtomValue(updateSearchSpaceMutationAtom);
+
+	const [saving, setSaving] = useState(false);
+	const [editQuery, setEditQuery] = useState("");
+	const [editing, setEditing] = useState(false);
+	const [showInput, setShowInput] = useState(false);
+	const textareaRef = useRef<HTMLInputElement>(null);
+	const inputContainerRef = useRef<HTMLDivElement>(null);
+
+	const memory = searchSpace?.shared_memory_md || "";
+
+	useEffect(() => {
+		if (!showInput) return;
+
+		const handlePointerDownOutside = (event: MouseEvent | TouchEvent) => {
+			const target = event.target;
+			if (!(target instanceof Node)) return;
+			if (inputContainerRef.current?.contains(target)) return;
+
+			setShowInput(false);
+		};
+
+		document.addEventListener("mousedown", handlePointerDownOutside);
+		document.addEventListener("touchstart", handlePointerDownOutside, { passive: true });
+
+		return () => {
+			document.removeEventListener("mousedown", handlePointerDownOutside);
+			document.removeEventListener("touchstart", handlePointerDownOutside);
+		};
+	}, [showInput]);
+
+	const handleClear = async () => {
+		try {
+			setSaving(true);
+			await updateSearchSpace({
+				id: searchSpaceId,
+				data: { shared_memory_md: "" },
+			});
+			toast.success("Team memory cleared");
+		} catch {
+			toast.error("Failed to clear team memory");
+		} finally {
+			setSaving(false);
+		}
+	};
+
+	const handleEdit = async () => {
+		const query = editQuery.trim();
+		if (!query) return;
+
+		try {
+			setEditing(true);
+			await baseApiService.post(
+				`/api/v1/searchspaces/${searchSpaceId}/memory/edit`,
+				SearchSpaceSchema,
+				{ body: { query } }
+			);
+			setEditQuery("");
+			setShowInput(false);
+			await queryClient.invalidateQueries({
+				queryKey: cacheKeys.searchSpaces.detail(searchSpaceId.toString()),
+			});
+			toast.success("Team memory updated");
+		} catch {
+			toast.error("Failed to edit team memory");
+		} finally {
+			setEditing(false);
+		}
+	};
+
+	const openInput = () => {
+		setShowInput(true);
+		requestAnimationFrame(() => textareaRef.current?.focus());
+	};
+
+	const handleDownload = () => {
+		if (!memory) return;
+		try {
+			const blob = new Blob([memory], { type: "text/markdown;charset=utf-8" });
+			const url = URL.createObjectURL(blob);
+			const a = document.createElement("a");
+			a.href = url;
+			a.download = "team-memory.md";
+			document.body.appendChild(a);
+			a.click();
+			document.body.removeChild(a);
+			URL.revokeObjectURL(url);
+		} catch {
+			toast.error("Failed to download team memory");
+		}
+	};
+
+	const handleCopyMarkdown = async () => {
+		if (!memory) return;
+		try {
+			await navigator.clipboard.writeText(memory);
+			toast.success("Copied to clipboard");
+		} catch {
+			toast.error("Failed to copy team memory");
+		}
+	};
+
+	const handleKeyDown = (e: React.KeyboardEvent<HTMLInputElement>) => {
+		if (e.key === "Enter" && !e.shiftKey) {
+			e.preventDefault();
+			handleEdit();
+		}
+	};
+
+	const displayMemory = memory.replace(/\(\d{4}-\d{2}-\d{2}\)\s*\[(fact|pref|instr)\]\s*/g, "");
+	const charCount = memory.length;
+
+	const getCounterColor = () => {
+		if (charCount > MEMORY_HARD_LIMIT) return "text-red-500";
+		if (charCount > 15_000) return "text-orange-500";
+		if (charCount > 10_000) return "text-yellow-500";
+		return "text-muted-foreground";
+	};
+
+	if (loading) {
+		return (
+			<div className="flex items-center justify-center py-12">
+				<Spinner size="md" className="text-muted-foreground" />
+			</div>
+		);
+	}
+
+	if (!memory) {
+		return (
+			<div className="flex flex-col items-center justify-center py-16 text-center">
+				<h3 className="text-base font-medium text-foreground">
+					What does SurfSense remember about your team?
+				</h3>
+				<p className="mt-2 max-w-sm text-sm text-muted-foreground">
+					Nothing yet. SurfSense picks up on team decisions and conventions as your team chats.
+				</p>
+			</div>
+		);
+	}
+
+	return (
+		<div className="space-y-4">
+			<Alert>
+				<Info />
+				<AlertDescription>
+					<p>
+						SurfSense uses this shared memory to provide team-wide context across all conversations
+						in this search space.
+					</p>
+				</AlertDescription>
+			</Alert>
+
+			<div className="relative h-[380px] rounded-lg border bg-background">
+				<div className="h-full overflow-y-auto scrollbar-thin">
+					<PlateEditor
+						markdown={displayMemory}
+						readOnly
+						preset="readonly"
+						variant="default"
+						editorVariant="none"
+						className="px-5 py-4 text-sm min-h-full"
+					/>
+				</div>
+
+				{showInput ? (
+					<div className="absolute bottom-3 inset-x-3 z-10">
+						<div
+							ref={inputContainerRef}
+							className="relative flex h-[54px] items-center gap-2 rounded-[9999px] border bg-muted/60 backdrop-blur-sm pl-4 pr-1 shadow-sm"
+						>
+							<input
+								ref={textareaRef}
+								type="text"
+								value={editQuery}
+								onChange={(e) => setEditQuery(e.target.value)}
+								onKeyDown={handleKeyDown}
+								placeholder="Tell SurfSense what to remember or forget about your team"
+								disabled={editing}
+								className="flex-1 bg-transparent text-sm outline-none placeholder:text-muted-foreground/70"
+							/>
+							<Button
+								type="button"
+								size="icon"
+								variant="ghost"
+								onClick={handleEdit}
+								disabled={editing || !editQuery.trim()}
+								className={`h-11 w-11 shrink-0 rounded-full ${
+									editing
+										? ""
+										: "bg-muted-foreground/15 hover:bg-accent hover:text-accent-foreground"
+								}`}
+							>
+								{editing ? (
+									<Spinner size="sm" />
+								) : (
+									<ArrowUp className="!h-5 !w-5 text-foreground" strokeWidth={2.25} />
+								)}
+							</Button>
+						</div>
+					</div>
+				) : (
+					<Button
+						type="button"
+						size="icon"
+						variant="secondary"
+						onClick={openInput}
+						className="absolute bottom-3 right-3 z-10 h-[54px] w-[54px] rounded-full border bg-muted/60 backdrop-blur-sm shadow-sm"
+					>
+						<Pencil className="!h-5 !w-5" />
+					</Button>
+				)}
+			</div>
+
+			<div className="flex items-center justify-between gap-2">
+				<span className={`text-xs shrink-0 ${getCounterColor()}`}>
+					{charCount.toLocaleString()} / {MEMORY_HARD_LIMIT.toLocaleString()}
+					<span className="hidden sm:inline"> characters</span>
+					<span className="sm:hidden"> chars</span>
+					{charCount > 15_000 && charCount <= MEMORY_HARD_LIMIT && " - Approaching limit"}
+					{charCount > MEMORY_HARD_LIMIT && " - Exceeds limit"}
+				</span>
+				<div className="flex items-center gap-1.5 sm:gap-2">
+					<Button
+						type="button"
+						variant="destructive"
+						size="sm"
+						className="text-xs sm:text-sm"
+						onClick={handleClear}
+						disabled={saving || editing || !memory}
+					>
+						<span className="hidden sm:inline">Reset Memory</span>
+						<span className="sm:hidden">Reset</span>
+					</Button>
+					<DropdownMenu>
+						<DropdownMenuTrigger asChild>
+							<Button type="button" variant="secondary" size="sm" disabled={!memory}>
+								Export
+								<ChevronDown className="h-3 w-3 opacity-60" />
+							</Button>
+						</DropdownMenuTrigger>
+						<DropdownMenuContent align="end">
+							<DropdownMenuItem onClick={handleCopyMarkdown}>
+								<ClipboardCopy className="h-4 w-4 mr-2" />
+								Copy as Markdown
+							</DropdownMenuItem>
+							<DropdownMenuItem onClick={handleDownload}>
+								<Download className="h-4 w-4 mr-2" />
+								Download as Markdown
+							</DropdownMenuItem>
+						</DropdownMenuContent>
+					</DropdownMenu>
+				</div>
+			</div>
+		</div>
+	);
+}
diff --git a/surfsense_web/components/settings/user-settings-dialog.tsx b/surfsense_web/components/settings/user-settings-dialog.tsx
deleted file mode 100644
index e755da1976..0000000000
--- a/surfsense_web/components/settings/user-settings-dialog.tsx
+++ /dev/null
@@ -1,109 +0,0 @@
-"use client";
-
-import { useAtom } from "jotai";
-import { Globe, KeyRound, Monitor, Receipt, Sparkles, User } from "lucide-react";
-import dynamic from "next/dynamic";
-import { useTranslations } from "next-intl";
-import { useMemo } from "react";
-import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
-import { SettingsDialog } from "@/components/settings/settings-dialog";
-import { usePlatform } from "@/hooks/use-platform";
-
-const ProfileContent = dynamic(
-	() =>
-		import("@/app/dashboard/[search_space_id]/user-settings/components/ProfileContent").then(
-			(m) => ({ default: m.ProfileContent })
-		),
-	{ ssr: false }
-);
-const ApiKeyContent = dynamic(
-	() =>
-		import("@/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent").then(
-			(m) => ({ default: m.ApiKeyContent })
-		),
-	{ ssr: false }
-);
-const PromptsContent = dynamic(
-	() =>
-		import("@/app/dashboard/[search_space_id]/user-settings/components/PromptsContent").then(
-			(m) => ({ default: m.PromptsContent })
-		),
-	{ ssr: false }
-);
-const CommunityPromptsContent = dynamic(
-	() =>
-		import(
-			"@/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent"
-		).then((m) => ({ default: m.CommunityPromptsContent })),
-	{ ssr: false }
-);
-const PurchaseHistoryContent = dynamic(
-	() =>
-		import(
-			"@/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent"
-		).then((m) => ({ default: m.PurchaseHistoryContent })),
-	{ ssr: false }
-);
-const DesktopContent = dynamic(
-	() =>
-		import("@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent").then(
-			(m) => ({ default: m.DesktopContent })
-		),
-	{ ssr: false }
-);
-
-export function UserSettingsDialog() {
-	const t = useTranslations("userSettings");
-	const [state, setState] = useAtom(userSettingsDialogAtom);
-	const { isDesktop } = usePlatform();
-
-	const navItems = useMemo(
-		() => [
-			{ value: "profile", label: t("profile_nav_label"), icon: <User className="h-4 w-4" /> },
-			{
-				value: "api-key",
-				label: t("api_key_nav_label"),
-				icon: <KeyRound className="h-4 w-4" />,
-			},
-			{
-				value: "prompts",
-				label: "My Prompts",
-				icon: <Sparkles className="h-4 w-4" />,
-			},
-			{
-				value: "community-prompts",
-				label: "Community Prompts",
-				icon: <Globe className="h-4 w-4" />,
-			},
-			{
-				value: "purchases",
-				label: "Purchase History",
-				icon: <Receipt className="h-4 w-4" />,
-			},
-			...(isDesktop
-				? [{ value: "desktop", label: "Desktop", icon: <Monitor className="h-4 w-4" /> }]
-				: []),
-		],
-		[t, isDesktop]
-	);
-
-	return (
-		<SettingsDialog
-			open={state.open}
-			onOpenChange={(open) => setState((prev) => ({ ...prev, open }))}
-			title={t("title")}
-			navItems={navItems}
-			activeItem={state.initialTab}
-			onItemChange={(tab) => setState((prev) => ({ ...prev, initialTab: tab }))}
-		>
-			<div className="pt-4">
-				{state.initialTab === "profile" && <ProfileContent />}
-				{state.initialTab === "api-key" && <ApiKeyContent />}
-				{state.initialTab === "prompts" && <PromptsContent />}
-				{state.initialTab === "community-prompts" && <CommunityPromptsContent />}
-				{state.initialTab === "purchases" && <PurchaseHistoryContent />}
-				{state.initialTab === "desktop" && <DesktopContent />}
-			</div>
-		</SettingsDialog>
-	);
-}
diff --git a/surfsense_web/components/settings/vision-model-manager.tsx b/surfsense_web/components/settings/vision-model-manager.tsx
index 31e6655cb9..31578b4f1b 100644
--- a/surfsense_web/components/settings/vision-model-manager.tsx
+++ b/surfsense_web/components/settings/vision-model-manager.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import { AlertCircle, Dot, Edit3, Info, RefreshCw, Trash2 } from "lucide-react";
+import { AlertCircle, Dot, Info, Pencil, RefreshCw, Trash2 } from "lucide-react";
 import { useMemo, useState } from "react";
 import { membersAtom, myAccessAtom } from "@/atoms/members/members-query.atoms";
 import { deleteVisionLLMConfigMutationAtom } from "@/atoms/vision-llm-config/vision-llm-config-mutation.atoms";
@@ -22,8 +22,10 @@ import {
 	AlertDialogTitle,
 } from "@/components/ui/alert-dialog";
 import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar";
+import { Badge } from "@/components/ui/badge";
 import { Button } from "@/components/ui/button";
 import { Card, CardContent } from "@/components/ui/card";
+import { Separator } from "@/components/ui/separator";
 import { Skeleton } from "@/components/ui/skeleton";
 import { Spinner } from "@/components/ui/spinner";
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip";
@@ -121,7 +123,7 @@ export function VisionModelManager({ searchSpaceId }: VisionModelManagerProps) {
 
 	return (
 		<div className="space-y-4 md:space-y-6">
-			<div className="flex flex-col space-y-4 sm:flex-row sm:items-center sm:justify-between sm:space-y-0">
+			<div className="flex items-center justify-between">
 				<Button
 					variant="secondary"
 					size="sm"
@@ -136,7 +138,7 @@ export function VisionModelManager({ searchSpaceId }: VisionModelManagerProps) {
 					<Button
 						variant="outline"
 						onClick={openNewDialog}
-						className="gap-2 bg-white text-black hover:bg-neutral-100 dark:bg-white dark:text-black dark:hover:bg-neutral-200"
+						className="gap-2 border-transparent bg-white text-[#1f1f1f] font-medium hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-transparent dark:bg-white dark:text-[#1f1f1f]"
 					>
 						Add Vision Model
 					</Button>
@@ -154,75 +156,157 @@ export function VisionModelManager({ searchSpaceId }: VisionModelManagerProps) {
 
 			{access && !isLoading && isReadOnly && (
 				<div>
-					<Alert className="bg-muted/50 py-3 md:py-4">
-						<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-						<AlertDescription className="text-xs md:text-sm">
-							You have <span className="font-medium">read-only</span> access to vision model
-							configurations. Contact a space owner to request additional permissions.
+					<Alert>
+						<Info />
+						<AlertDescription>
+							<p>
+								You have <span className="font-medium">read-only</span> access to vision model
+								configurations. Contact a space owner to request additional permissions.
+							</p>
 						</AlertDescription>
 					</Alert>
 				</div>
 			)}
 			{access && !isLoading && !isReadOnly && (!canCreate || !canDelete) && (
 				<div>
-					<Alert className="bg-muted/50 py-3 md:py-4">
-						<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-						<AlertDescription className="text-xs md:text-sm">
-							You can{" "}
-							{[canCreate && "create and edit", canDelete && "delete"]
-								.filter(Boolean)
-								.join(" and ")}{" "}
-							vision model configurations
-							{!canDelete && ", but cannot delete them"}.
+					<Alert>
+						<Info />
+						<AlertDescription>
+							<p>
+								You can{" "}
+								{[canCreate && "create and edit", canDelete && "delete"]
+									.filter(Boolean)
+									.join(" and ")}{" "}
+								vision model configurations
+								{!canDelete && ", but cannot delete them"}.
+							</p>
 						</AlertDescription>
 					</Alert>
 				</div>
 			)}
 
-			{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length > 0 && (
-				<Alert className="bg-muted/50 py-3">
-					<Info className="h-3 w-3 md:h-4 md:w-4 shrink-0" />
-					<AlertDescription className="text-xs md:text-sm">
-						<p>
-							<span className="font-medium">
-								{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length}{" "}
-								global vision{" "}
-								{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length === 1
-									? "model"
-									: "models"}
-							</span>{" "}
-							available from your administrator. Use the model selector to view and select them.
-						</p>
+			{(isLoading ||
+				globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length > 0) && (
+				<Alert>
+					<Info />
+					<AlertDescription>
+						{isLoading ? (
+							<div className="flex min-h-[1.625em] items-center">
+								<Skeleton className="h-4 w-60 bg-accent-foreground/15" />
+							</div>
+						) : (
+							<p>
+								<span className="font-medium">
+									{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length}{" "}
+									global vision{" "}
+									{globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length ===
+									1
+										? "model"
+										: "models"}
+								</span>{" "}
+								available from your administrator. {(() => {
+									const nonAuto = globalConfigs.filter(
+										(g) => !("is_auto_mode" in g && g.is_auto_mode)
+									);
+									const premium = nonAuto.filter(
+										(g) =>
+											"billing_tier" in g &&
+											(g as { billing_tier?: string }).billing_tier === "premium"
+									).length;
+									const free = nonAuto.length - premium;
+									if (premium > 0 && free > 0) {
+										return `${premium} premium, ${free} free.`;
+									}
+									if (premium > 0) {
+										return `All ${premium} premium — debits your shared credit pool.`;
+									}
+									return `All ${free} free.`;
+								})()}
+							</p>
+						)}
 					</AlertDescription>
 				</Alert>
 			)}
 
+			{/* Global Vision Models — read-only cards with per-model Free/Premium
+			    badges. Mirrors the badge palette used by the chat role selector
+			    (`llm-role-manager.tsx`) so the meaning is consistent across
+			    every model-configuration surface (chat / image / vision). */}
+			{!isLoading &&
+				globalConfigs.filter((g) => !("is_auto_mode" in g && g.is_auto_mode)).length > 0 && (
+					<div className="space-y-3">
+						<div className="grid gap-3 grid-cols-1 sm:grid-cols-2 xl:grid-cols-3">
+							{globalConfigs
+								.filter((g) => !("is_auto_mode" in g && g.is_auto_mode))
+								.map((cfg) => {
+									const billingTier =
+										("billing_tier" in cfg &&
+											typeof (cfg as { billing_tier?: string }).billing_tier === "string" &&
+											(cfg as { billing_tier?: string }).billing_tier) ||
+										"free";
+									const isPremium = billingTier === "premium";
+									return (
+										<Card
+											key={cfg.id}
+											className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full"
+										>
+											<CardContent className="p-4 flex flex-col gap-3 h-full">
+												<div className="flex items-center gap-2 min-w-0">
+													<div className="shrink-0">
+														{getProviderIcon(cfg.provider, { className: "size-4" })}
+													</div>
+													<div className="min-w-0 flex-1 flex items-center gap-1.5">
+														<h4 className="text-sm font-semibold tracking-tight truncate">
+															{cfg.name}
+														</h4>
+														{isPremium ? (
+															<Badge
+																variant="secondary"
+																className="text-[8px] md:text-[9px] shrink-0 bg-purple-100 text-purple-700 dark:bg-purple-900/50 dark:text-purple-300 border-0"
+															>
+																Premium
+															</Badge>
+														) : (
+															<Badge
+																variant="secondary"
+																className="text-[8px] md:text-[9px] shrink-0 bg-emerald-100 text-emerald-700 dark:bg-emerald-900/50 dark:text-emerald-300 border-0"
+															>
+																Free
+															</Badge>
+														)}
+													</div>
+												</div>
+												{cfg.description && (
+													<p className="text-[11px] text-muted-foreground/70 line-clamp-2">
+														{cfg.description}
+													</p>
+												)}
+												<div className="mt-auto space-y-2">
+													<Separator className="bg-accent" />
+													<div className="flex items-center">
+														<span className="text-[11px] text-muted-foreground/60 truncate">
+															{cfg.model_name}
+														</span>
+													</div>
+												</div>
+											</CardContent>
+										</Card>
+									);
+								})}
+						</div>
+					</div>
+				)}
+
 			{isLoading && (
 				<div className="space-y-4 md:space-y-6">
 					<div className="space-y-4">
-						<div className="flex items-center justify-between">
-							<Skeleton className="h-6 md:h-7 w-40 md:w-48" />
-							<Skeleton className="h-8 md:h-9 w-32 md:w-36 rounded-md" />
-						</div>
 						<div className="grid gap-3 grid-cols-1 sm:grid-cols-2 xl:grid-cols-3">
 							{["skeleton-a", "skeleton-b", "skeleton-c"].map((key) => (
-								<Card key={key} className="border-border/60">
-									<CardContent className="p-4 flex flex-col gap-3">
-										<div className="flex items-start justify-between gap-2">
-											<div className="space-y-1.5 flex-1 min-w-0">
-												<Skeleton className="h-4 w-28 md:w-32" />
-												<Skeleton className="h-3 w-40 md:w-48" />
-											</div>
-										</div>
-										<div className="flex items-center gap-2">
-											<Skeleton className="h-5 w-16 rounded-full" />
-											<Skeleton className="h-5 w-24 rounded-md" />
-										</div>
-										<div className="flex items-center gap-2 pt-2 border-t border-border/40">
-											<Skeleton className="h-3 w-20" />
-											<Skeleton className="h-4 w-4 rounded-full" />
-											<Skeleton className="h-3 w-16" />
-										</div>
+								<Card key={key} className="border-accent bg-accent/20">
+									<CardContent className="p-4 flex flex-col gap-3 min-h-32">
+										<Skeleton className="h-4 w-32 md:w-40 bg-accent" />
+										<Skeleton className="h-3 w-full bg-accent" />
+										<Skeleton className="h-3 w-24 md:w-28 bg-accent mt-auto" />
 									</CardContent>
 								</Card>
 							))}
@@ -251,21 +335,27 @@ export function VisionModelManager({ searchSpaceId }: VisionModelManagerProps) {
 
 								return (
 									<div key={config.id}>
-										<Card className="group relative overflow-hidden transition-all duration-200 border-border/60 hover:shadow-md h-full">
+										<Card className="group relative overflow-hidden transition-all duration-200 border-accent bg-accent/20 hover:shadow-md h-full">
 											<CardContent className="p-4 flex flex-col gap-3 h-full">
-												<div className="flex items-start justify-between gap-2">
-													<div className="min-w-0 flex-1">
-														<h4 className="text-sm font-semibold tracking-tight truncate">
-															{config.name}
-														</h4>
-														{config.description && (
-															<p className="text-[11px] text-muted-foreground/70 truncate mt-0.5">
-																{config.description}
-															</p>
-														)}
+												{/* Header: Icon + Name + Actions */}
+												<div className="flex items-center justify-between gap-2">
+													<div className="flex items-center gap-2.5 min-w-0 flex-1">
+														<div className="shrink-0">
+															{getProviderIcon(config.provider, { className: "size-4" })}
+														</div>
+														<div className="min-w-0 flex-1">
+															<h4 className="text-sm font-semibold tracking-tight truncate">
+																{config.name}
+															</h4>
+															{config.description && (
+																<p className="text-[11px] text-muted-foreground/70 truncate mt-0.5">
+																	{config.description}
+																</p>
+															)}
+														</div>
 													</div>
 													{(canUpdate || canDelete) && (
-														<div className="flex items-center gap-0.5 shrink-0 sm:opacity-0 sm:group-hover:opacity-100 transition-opacity duration-150">
+														<div className="flex items-center gap-1 shrink-0 sm:w-0 sm:overflow-hidden sm:group-hover:w-auto sm:opacity-0 sm:group-hover:opacity-100 transition-all duration-150">
 															{canUpdate && (
 																<TooltipProvider>
 																	<Tooltip open={isDesktop ? undefined : false}>
@@ -274,9 +364,9 @@ export function VisionModelManager({ searchSpaceId }: VisionModelManagerProps) {
 																				variant="ghost"
 																				size="icon"
 																				onClick={() => openEditDialog(config)}
-																				className="h-7 w-7 text-muted-foreground hover:text-foreground"
+																				className="h-6 w-6 text-muted-foreground hover:text-accent-foreground"
 																			>
-																				<Edit3 className="h-3 w-3" />
+																				<Pencil className="h-3 w-3" />
 																			</Button>
 																		</TooltipTrigger>
 																		<TooltipContent>Edit</TooltipContent>
@@ -291,7 +381,7 @@ export function VisionModelManager({ searchSpaceId }: VisionModelManagerProps) {
 																				variant="ghost"
 																				size="icon"
 																				onClick={() => setConfigToDelete(config)}
-																				className="h-7 w-7 text-muted-foreground hover:text-destructive"
+																				className="h-7 w-7 rounded-lg text-muted-foreground hover:text-destructive"
 																			>
 																				<Trash2 className="h-3 w-3" />
 																			</Button>
@@ -304,50 +394,45 @@ export function VisionModelManager({ searchSpaceId }: VisionModelManagerProps) {
 													)}
 												</div>
 
-												<div className="flex items-center gap-2 flex-wrap">
-													{getProviderIcon(config.provider, {
-														className: "size-3.5 shrink-0",
-													})}
-													<code className="text-[11px] font-mono text-muted-foreground bg-muted/60 px-2 py-0.5 rounded-md truncate max-w-[160px]">
-														{config.model_name}
-													</code>
-												</div>
-
-												<div className="flex items-center gap-2 pt-2 border-t border-border/40 mt-auto">
-													<span className="text-[11px] text-muted-foreground/60">
-														{new Date(config.created_at).toLocaleDateString(undefined, {
-															year: "numeric",
-															month: "short",
-															day: "numeric",
-														})}
-													</span>
-													{member && (
-														<>
-															<Dot className="h-4 w-4 text-muted-foreground/30" />
-															<TooltipProvider>
-																<Tooltip open={isDesktop ? undefined : false}>
-																	<TooltipTrigger asChild>
-																		<div className="flex items-center gap-1.5 cursor-default">
-																			<Avatar className="size-4.5 shrink-0">
-																				{member.avatarUrl && (
-																					<AvatarImage src={member.avatarUrl} alt={member.name} />
-																				)}
-																				<AvatarFallback className="text-[9px]">
-																					{getInitials(member.name)}
-																				</AvatarFallback>
-																			</Avatar>
-																			<span className="text-[11px] text-muted-foreground/60 truncate max-w-[120px]">
-																				{member.name}
-																			</span>
-																		</div>
-																	</TooltipTrigger>
-																	<TooltipContent side="bottom">
-																		{member.email || member.name}
-																	</TooltipContent>
-																</Tooltip>
-															</TooltipProvider>
-														</>
-													)}
+												{/* Footer: Date + Creator */}
+												<div className="mt-auto space-y-2">
+													<Separator className="bg-accent" />
+													<div className="flex items-center">
+														<span className="shrink-0 text-[11px] text-muted-foreground/60 whitespace-nowrap">
+															{new Date(config.created_at).toLocaleDateString(undefined, {
+																year: "numeric",
+																month: "short",
+																day: "numeric",
+															})}
+														</span>
+														{member && (
+															<>
+																<Dot className="h-4 w-4 text-muted-foreground/30 shrink-0" />
+																<TooltipProvider>
+																	<Tooltip open={isDesktop ? undefined : false}>
+																		<TooltipTrigger asChild>
+																			<div className="min-w-0 flex items-center gap-1.5 cursor-default">
+																				<Avatar className="size-4.5 shrink-0">
+																					{member.avatarUrl && (
+																						<AvatarImage src={member.avatarUrl} alt={member.name} />
+																					)}
+																					<AvatarFallback className="text-[9px]">
+																						{getInitials(member.name)}
+																					</AvatarFallback>
+																				</Avatar>
+																				<span className="text-[11px] text-muted-foreground/60 truncate">
+																					{member.name}
+																				</span>
+																			</div>
+																		</TooltipTrigger>
+																		<TooltipContent side="bottom">
+																			{member.email || member.name}
+																		</TooltipContent>
+																	</Tooltip>
+																</TooltipProvider>
+															</>
+														)}
+													</div>
 												</div>
 											</CardContent>
 										</Card>
diff --git a/surfsense_web/components/shared/ExportMenuItems.tsx b/surfsense_web/components/shared/ExportMenuItems.tsx
index 86bc12c427..3f90bed77b 100644
--- a/surfsense_web/components/shared/ExportMenuItems.tsx
+++ b/surfsense_web/components/shared/ExportMenuItems.tsx
@@ -24,18 +24,30 @@ interface ExportMenuItemsProps {
 	exporting: string | null;
 	/** Hide server-side formats (PDF, DOCX, etc.) — only show md */
 	showAllFormats?: boolean;
+	/** When true, only show PDF export (used for Typst-based resumes) */
+	pdfOnly?: boolean;
 }
 
 export function ExportDropdownItems({
 	onExport,
 	exporting,
 	showAllFormats = true,
+	pdfOnly = false,
 }: ExportMenuItemsProps) {
 	const handle = (format: string) => (e: React.MouseEvent) => {
 		e.stopPropagation();
 		onExport(format);
 	};
 
+	if (pdfOnly) {
+		return (
+			<DropdownMenuItem onClick={handle("pdf")} disabled={exporting !== null}>
+				{exporting === "pdf" && <Loader2 className="mr-2 h-3.5 w-3.5 animate-spin" />}
+				PDF (.pdf)
+			</DropdownMenuItem>
+		);
+	}
+
 	return (
 		<>
 			{showAllFormats && (
diff --git a/surfsense_web/components/shared/image-config-dialog.tsx b/surfsense_web/components/shared/image-config-dialog.tsx
index 2ae53ccca0..36d16081a4 100644
--- a/surfsense_web/components/shared/image-config-dialog.tsx
+++ b/surfsense_web/components/shared/image-config-dialog.tsx
@@ -48,6 +48,7 @@ interface ImageConfigDialogProps {
 	isGlobal: boolean;
 	searchSpaceId: number;
 	mode: "create" | "edit" | "view";
+	defaultProvider?: string;
 }
 
 const INITIAL_FORM = {
@@ -67,6 +68,7 @@ export function ImageConfigDialog({
 	isGlobal,
 	searchSpaceId,
 	mode,
+	defaultProvider,
 }: ImageConfigDialogProps) {
 	const [isSubmitting, setIsSubmitting] = useState(false);
 	const [formData, setFormData] = useState(INITIAL_FORM);
@@ -87,11 +89,11 @@ export function ImageConfigDialog({
 					api_version: config.api_version || "",
 				});
 			} else if (mode === "create") {
-				setFormData(INITIAL_FORM);
+				setFormData({ ...INITIAL_FORM, provider: defaultProvider ?? "" });
 			}
 			setScrollPos("top");
 		}
-	}, [open, mode, config, isGlobal]);
+	}, [open, mode, config, isGlobal, defaultProvider]);
 
 	const { mutateAsync: createConfig } = useAtomValue(createImageGenConfigMutationAtom);
 	const { mutateAsync: updateConfig } = useAtomValue(updateImageGenConfigMutationAtom);
@@ -296,7 +298,7 @@ export function ImageConfigDialog({
 								/>
 							</div>
 
-							<Separator />
+							<Separator className="bg-popover-border" />
 
 							<div className="space-y-2">
 								<Label className="text-sm font-medium">Provider *</Label>
diff --git a/surfsense_web/components/shared/llm-config-form.tsx b/surfsense_web/components/shared/llm-config-form.tsx
index 732bf971e7..06de4129b5 100644
--- a/surfsense_web/components/shared/llm-config-form.tsx
+++ b/surfsense_web/components/shared/llm-config-form.tsx
@@ -202,7 +202,7 @@ export function LLMConfigForm({
 											<SelectValue placeholder="Select a provider" />
 										</SelectTrigger>
 									</FormControl>
-									<SelectContent className="max-h-[300px] bg-muted dark:border-neutral-700">
+									<SelectContent className="max-h-[300px]">
 										{LLM_PROVIDERS.map((provider) => (
 											<SelectItem
 												key={provider.value}
@@ -251,7 +251,7 @@ export function LLMConfigForm({
 												role="combobox"
 												aria-expanded={modelComboboxOpen}
 												className={cn(
-													"w-full justify-between font-normal bg-transparent",
+													"w-full justify-between border-popover-border bg-transparent font-normal",
 													!field.value && "text-muted-foreground"
 												)}
 											>
@@ -260,10 +260,7 @@ export function LLMConfigForm({
 											</Button>
 										</FormControl>
 									</PopoverTrigger>
-									<PopoverContent
-										className="w-full p-0 bg-muted dark:border-neutral-700"
-										align="start"
-									>
+									<PopoverContent className="w-full p-0" align="start">
 										<Command shouldFilter={false} className="bg-transparent">
 											<CommandInput
 												placeholder={selectedProvider?.example || "Search model name"}
@@ -407,12 +404,13 @@ export function LLMConfigForm({
 				{/* Advanced Parameters */}
 				{showAdvanced && (
 					<>
-						<Separator />
+						<Separator className="bg-popover-border" />
 						<Collapsible open={advancedOpen} onOpenChange={setAdvancedOpen}>
 							<CollapsibleTrigger asChild>
-								<button
+								<Button
 									type="button"
-									className="flex w-full items-center justify-between py-2 text-xs sm:text-sm font-medium text-muted-foreground hover:text-foreground transition-colors"
+									variant="ghost"
+									className="h-auto w-full justify-between px-0 py-2 text-xs font-medium text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 								>
 									<span>Advanced Parameters</span>
 									<ChevronDown
@@ -421,7 +419,7 @@ export function LLMConfigForm({
 											advancedOpen && "rotate-180"
 										)}
 									/>
-								</button>
+								</Button>
 							</CollapsibleTrigger>
 							<CollapsibleContent className="space-y-4 pt-2">
 								<FormField
@@ -445,12 +443,13 @@ export function LLMConfigForm({
 				)}
 
 				{/* System Instructions & Citations Section */}
-				<Separator />
+				<Separator className="bg-popover-border" />
 				<Collapsible open={systemInstructionsOpen} onOpenChange={setSystemInstructionsOpen}>
 					<CollapsibleTrigger asChild>
-						<button
+						<Button
 							type="button"
-							className="flex w-full items-center justify-between py-2 text-xs sm:text-sm font-medium text-muted-foreground hover:text-foreground transition-colors"
+							variant="ghost"
+							className="h-auto w-full justify-between px-0 py-2 text-xs font-medium text-muted-foreground hover:bg-transparent hover:text-accent-foreground sm:text-sm"
 						>
 							<span>System Instructions</span>
 							<ChevronDown
@@ -459,7 +458,7 @@ export function LLMConfigForm({
 									systemInstructionsOpen && "rotate-180"
 								)}
 							/>
-						</button>
+						</Button>
 					</CollapsibleTrigger>
 					<CollapsibleContent className="space-y-4 pt-2">
 						{/* System Instructions */}
@@ -478,7 +477,7 @@ export function LLMConfigForm({
 												onClick={() =>
 													field.onChange(defaultInstructions.default_system_instructions)
 												}
-												className="h-7 text-[10px] sm:text-xs text-muted-foreground hover:text-foreground"
+												className="h-7 text-[10px] sm:text-xs text-muted-foreground hover:text-accent-foreground"
 											>
 												Reset to Default
 											</Button>
diff --git a/surfsense_web/components/shared/model-config-dialog.tsx b/surfsense_web/components/shared/model-config-dialog.tsx
index 4d2373b490..d4f57ff7d5 100644
--- a/surfsense_web/components/shared/model-config-dialog.tsx
+++ b/surfsense_web/components/shared/model-config-dialog.tsx
@@ -28,6 +28,7 @@ interface ModelConfigDialogProps {
 	isGlobal: boolean;
 	searchSpaceId: number;
 	mode: "create" | "edit" | "view";
+	defaultProvider?: string;
 }
 
 export function ModelConfigDialog({
@@ -37,6 +38,7 @@ export function ModelConfigDialog({
 	isGlobal,
 	searchSpaceId,
 	mode,
+	defaultProvider,
 }: ModelConfigDialogProps) {
 	const [isSubmitting, setIsSubmitting] = useState(false);
 	const [scrollPos, setScrollPos] = useState<"top" | "middle" | "bottom">("top");
@@ -194,10 +196,14 @@ export function ModelConfigDialog({
 
 					{mode === "create" ? (
 						<LLMConfigForm
+							key={defaultProvider ?? "no-provider"}
 							searchSpaceId={searchSpaceId}
 							onSubmit={handleSubmit}
 							mode="create"
 							formId="model-config-form"
+							initialData={
+								defaultProvider ? { provider: defaultProvider as LiteLLMProvider } : undefined
+							}
 						/>
 					) : isGlobal && config ? (
 						<div className="space-y-6">
diff --git a/surfsense_web/components/shared/vision-config-dialog.tsx b/surfsense_web/components/shared/vision-config-dialog.tsx
index 7332d3dcd8..2646f38422 100644
--- a/surfsense_web/components/shared/vision-config-dialog.tsx
+++ b/surfsense_web/components/shared/vision-config-dialog.tsx
@@ -49,6 +49,7 @@ interface VisionConfigDialogProps {
 	isGlobal: boolean;
 	searchSpaceId: number;
 	mode: "create" | "edit" | "view";
+	defaultProvider?: string;
 }
 
 const INITIAL_FORM = {
@@ -68,6 +69,7 @@ export function VisionConfigDialog({
 	isGlobal,
 	searchSpaceId,
 	mode,
+	defaultProvider,
 }: VisionConfigDialogProps) {
 	const [isSubmitting, setIsSubmitting] = useState(false);
 	const [formData, setFormData] = useState(INITIAL_FORM);
@@ -87,11 +89,11 @@ export function VisionConfigDialog({
 					api_version: (config as VisionLLMConfig).api_version || "",
 				});
 			} else if (mode === "create") {
-				setFormData(INITIAL_FORM);
+				setFormData({ ...INITIAL_FORM, provider: defaultProvider ?? "" });
 			}
 			setScrollPos("top");
 		}
-	}, [open, mode, config, isGlobal]);
+	}, [open, mode, config, isGlobal, defaultProvider]);
 
 	const { mutateAsync: createConfig } = useAtomValue(createVisionLLMConfigMutationAtom);
 	const { mutateAsync: updateConfig } = useAtomValue(updateVisionLLMConfigMutationAtom);
@@ -297,7 +299,7 @@ export function VisionConfigDialog({
 								/>
 							</div>
 
-							<Separator />
+							<Separator className="bg-popover-border" />
 
 							<div className="space-y-2">
 								<Label className="text-sm font-medium">Provider *</Label>
@@ -337,10 +339,7 @@ export function VisionConfigDialog({
 											<ChevronsUpDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
 										</Button>
 									</PopoverTrigger>
-									<PopoverContent
-										className="w-full p-0 bg-muted dark:border-neutral-700"
-										align="start"
-									>
+									<PopoverContent className="w-full p-0" align="start">
 										<Command shouldFilter={false} className="bg-transparent">
 											<CommandInput
 												placeholder={selectedProvider?.example || "Search model name"}
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx
index 124354a494..ce9a5bc74b 100644
--- a/surfsense_web/components/sources/DocumentUploadTab.tsx
+++ b/surfsense_web/components/sources/DocumentUploadTab.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useAtom } from "jotai";
-import { ChevronDown, Dot, File as FileIcon, FolderOpen, Upload, X } from "lucide-react";
+import { ChevronDown, Crown, Dot, File as FileIcon, FolderOpen, X, Zap } from "lucide-react";
 
 import { useTranslations } from "next-intl";
 import { type ChangeEvent, useCallback, useEffect, useMemo, useRef, useState } from "react";
@@ -25,7 +25,9 @@ import {
 import { Progress } from "@/components/ui/progress";
 import { Spinner } from "@/components/ui/spinner";
 import { Switch } from "@/components/ui/switch";
+import type { ProcessingMode } from "@/contracts/types/document.types";
 import { useElectronAPI } from "@/hooks/use-platform";
+import { documentsApiService } from "@/lib/apis/documents-api.service";
 import {
 	trackDocumentUploadFailure,
 	trackDocumentUploadStarted,
@@ -48,6 +50,80 @@ interface FileWithId {
 	file: File;
 }
 
+interface FolderEntry {
+	id: string;
+	file: File;
+	relativePath: string;
+}
+
+interface FolderUploadData {
+	folderName: string;
+	entries: FolderEntry[];
+}
+
+interface FolderTreeNode {
+	name: string;
+	isFolder: boolean;
+	size?: number;
+	children: FolderTreeNode[];
+}
+
+function buildFolderTree(entries: FolderEntry[]): FolderTreeNode[] {
+	const root: FolderTreeNode = { name: "", isFolder: true, children: [] };
+
+	for (const entry of entries) {
+		const parts = entry.relativePath.split("/");
+		let current = root;
+
+		for (let i = 0; i < parts.length - 1; i++) {
+			let child = current.children.find((c) => c.name === parts[i] && c.isFolder);
+			if (!child) {
+				child = { name: parts[i], isFolder: true, children: [] };
+				current.children.push(child);
+			}
+			current = child;
+		}
+
+		current.children.push({
+			name: parts[parts.length - 1],
+			isFolder: false,
+			size: entry.file.size,
+			children: [],
+		});
+	}
+
+	function sortNodes(node: FolderTreeNode) {
+		node.children.sort((a, b) => {
+			if (a.isFolder !== b.isFolder) return a.isFolder ? -1 : 1;
+			return a.name.localeCompare(b.name);
+		});
+		for (const child of node.children) sortNodes(child);
+	}
+	sortNodes(root);
+
+	return root.children;
+}
+
+function flattenTree(
+	nodes: FolderTreeNode[],
+	depth = 0,
+	parentPath = ""
+): { name: string; isFolder: boolean; depth: number; path: string; size?: number }[] {
+	const items: { name: string; isFolder: boolean; depth: number; path: string; size?: number }[] =
+		[];
+	for (const node of nodes) {
+		const path = parentPath ? `${parentPath}/${node.name}` : node.name;
+		items.push({ name: node.name, isFolder: node.isFolder, depth, path, size: node.size });
+		if (node.isFolder && node.children.length > 0) {
+			items.push(...flattenTree(node.children, depth + 1, path));
+		}
+	}
+	return items;
+}
+
+const FOLDER_BATCH_SIZE_BYTES = 20 * 1024 * 1024;
+const FOLDER_BATCH_MAX_FILES = 10;
+
 const MAX_FILE_SIZE_MB = 500;
 const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
 
@@ -64,11 +140,15 @@ export function DocumentUploadTab({
 	const [uploadProgress, setUploadProgress] = useState(0);
 	const [accordionValue, setAccordionValue] = useState<string>("");
 	const [shouldSummarize, setShouldSummarize] = useState(false);
+	const [useVisionLlm, setUseVisionLlm] = useState(false);
+	const [processingMode, setProcessingMode] = useState<ProcessingMode>("basic");
 	const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
 	const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
 	const fileInputRef = useRef<HTMLInputElement>(null);
 	const folderInputRef = useRef<HTMLInputElement>(null);
 	const progressIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
+	const [folderUpload, setFolderUpload] = useState<FolderUploadData | null>(null);
+	const [isFolderUploading, setIsFolderUploading] = useState(false);
 
 	useEffect(() => {
 		return () => {
@@ -105,6 +185,7 @@ export function DocumentUploadTab({
 			const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
 			if (valid.length === 0) return;
 
+			setFolderUpload(null);
 			setFiles((prev) => {
 				const newEntries = valid.map((f) => ({
 					id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
@@ -159,6 +240,7 @@ export function DocumentUploadTab({
 				file: new File([fd.data], fd.name, { type: fd.mimeType }),
 			})
 		);
+		setFolderUpload(null);
 		setFiles((prev) => [...prev, ...newFiles]);
 	}, [electronAPI, supportedExtensionsSet, t]);
 
@@ -167,18 +249,35 @@ export function DocumentUploadTab({
 			const fileList = e.target.files;
 			if (!fileList || fileList.length === 0) return;
 
-			const folderFiles = Array.from(fileList).filter((f) => {
-				const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
-				return ext !== "" && supportedExtensionsSet.has(ext);
-			});
+			const allFiles = Array.from(fileList);
+			const firstPath = allFiles[0]?.webkitRelativePath || "";
+			const folderName = firstPath.split("/")[0];
+
+			if (!folderName) {
+				addFiles(allFiles);
+				e.target.value = "";
+				return;
+			}
+
+			const entries: FolderEntry[] = allFiles
+				.filter((f) => {
+					const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
+					return ext !== "" && supportedExtensionsSet.has(ext);
+				})
+				.map((f) => ({
+					id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
+					file: f,
+					relativePath: f.webkitRelativePath.substring(folderName.length + 1),
+				}));
 
-			if (folderFiles.length === 0) {
+			if (entries.length === 0) {
 				toast.error(t("no_supported_files_in_folder"));
 				e.target.value = "";
 				return;
 			}
 
-			addFiles(folderFiles);
+			setFiles([]);
+			setFolderUpload({ folderName, entries });
 			e.target.value = "";
 		},
 		[addFiles, supportedExtensionsSet, t]
@@ -192,9 +291,18 @@ export function DocumentUploadTab({
 		return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`;
 	};
 
-	const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0);
+	const totalFileSize = folderUpload
+		? folderUpload.entries.reduce((total, entry) => total + entry.file.size, 0)
+		: files.reduce((total, entry) => total + entry.file.size, 0);
 
-	const hasContent = files.length > 0;
+	const fileCount = folderUpload ? folderUpload.entries.length : files.length;
+	const hasContent = files.length > 0 || folderUpload !== null;
+	const isAnyUploading = isUploading || isFolderUploading;
+
+	const folderTreeItems = useMemo(() => {
+		if (!folderUpload) return [];
+		return flattenTree(buildFolderTree(folderUpload.entries));
+	}, [folderUpload]);
 
 	const handleAccordionChange = useCallback(
 		(value: string) => {
@@ -204,7 +312,96 @@ export function DocumentUploadTab({
 		[onAccordionStateChange]
 	);
 
+	const handleFolderUpload = async () => {
+		if (!folderUpload) return;
+
+		setUploadProgress(0);
+		setIsFolderUploading(true);
+		const total = folderUpload.entries.length;
+		trackDocumentUploadStarted(Number(searchSpaceId), total, totalFileSize);
+
+		try {
+			const batches: FolderEntry[][] = [];
+			let currentBatch: FolderEntry[] = [];
+			let currentSize = 0;
+
+			for (const entry of folderUpload.entries) {
+				const size = entry.file.size;
+
+				if (size >= FOLDER_BATCH_SIZE_BYTES) {
+					if (currentBatch.length > 0) {
+						batches.push(currentBatch);
+						currentBatch = [];
+						currentSize = 0;
+					}
+					batches.push([entry]);
+					continue;
+				}
+
+				if (
+					currentBatch.length >= FOLDER_BATCH_MAX_FILES ||
+					currentSize + size > FOLDER_BATCH_SIZE_BYTES
+				) {
+					batches.push(currentBatch);
+					currentBatch = [];
+					currentSize = 0;
+				}
+
+				currentBatch.push(entry);
+				currentSize += size;
+			}
+
+			if (currentBatch.length > 0) {
+				batches.push(currentBatch);
+			}
+
+			let rootFolderId: number | null = null;
+			let uploaded = 0;
+
+			for (const batch of batches) {
+				const result = await documentsApiService.folderUploadFiles(
+					batch.map((e) => e.file),
+					{
+						folder_name: folderUpload.folderName,
+						search_space_id: Number(searchSpaceId),
+						relative_paths: batch.map((e) => e.relativePath),
+						root_folder_id: rootFolderId,
+						enable_summary: shouldSummarize,
+						use_vision_llm: useVisionLlm,
+						processing_mode: processingMode,
+					}
+				);
+
+				if (result.root_folder_id && !rootFolderId) {
+					rootFolderId = result.root_folder_id;
+				}
+
+				uploaded += batch.length;
+				setUploadProgress(Math.round((uploaded / total) * 100));
+			}
+
+			trackDocumentUploadSuccess(Number(searchSpaceId), total);
+			toast(t("upload_initiated"), { description: t("upload_initiated_desc") });
+			setFolderUpload(null);
+			onSuccess?.();
+		} catch (error) {
+			const message = error instanceof Error ? error.message : "Upload failed";
+			trackDocumentUploadFailure(Number(searchSpaceId), message);
+			toast(t("upload_error"), {
+				description: `${t("upload_error_desc")}: ${message}`,
+			});
+		} finally {
+			setIsFolderUploading(false);
+			setUploadProgress(0);
+		}
+	};
+
 	const handleUpload = async () => {
+		if (folderUpload) {
+			await handleFolderUpload();
+			return;
+		}
+
 		setUploadProgress(0);
 		trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize);
 
@@ -218,6 +415,8 @@ export function DocumentUploadTab({
 				files: rawFiles,
 				search_space_id: Number(searchSpaceId),
 				should_summarize: shouldSummarize,
+				use_vision_llm: useVisionLlm,
+				processing_mode: processingMode,
 			},
 			{
 				onSuccess: () => {
@@ -252,7 +451,7 @@ export function DocumentUploadTab({
 						<Button
 							variant="ghost"
 							size="sm"
-							className={`text-xs gap-1 bg-neutral-700/50 hover:bg-neutral-600/50 ${sizeClass} ${widthClass}`}
+							className={`text-xs gap-1 bg-neutral-700/50 hover:bg-accent hover:text-accent-foreground ${sizeClass} ${widthClass}`}
 						>
 							Browse
 							<ChevronDown className="h-3 w-3 opacity-60" />
@@ -282,7 +481,7 @@ export function DocumentUploadTab({
 					<Button
 						variant="ghost"
 						size="sm"
-						className={`text-xs gap-1 bg-neutral-700/50 hover:bg-neutral-600/50 ${sizeClass} ${widthClass}`}
+						className={`text-xs gap-1 bg-neutral-700/50 hover:bg-accent hover:text-accent-foreground ${sizeClass} ${widthClass}`}
 					>
 						Browse
 						<ChevronDown className="h-3 w-3 opacity-60" />
@@ -330,39 +529,49 @@ export function DocumentUploadTab({
 			<div className="sm:hidden">
 				{hasContent ? (
 					isElectron ? (
-						<div className="w-full">{renderBrowseButton({ compact: true, fullWidth: true })}</div>
+						<div className="flex w-full justify-center">
+							{renderBrowseButton({ compact: true })}
+						</div>
 					) : (
-						<button
+						<Button
 							type="button"
-							className="w-full text-xs h-8 flex items-center justify-center gap-1.5 rounded-md border border-dashed border-muted-foreground/30 text-muted-foreground hover:text-foreground hover:border-foreground/50 transition-colors"
+							variant="ghost"
+							className="h-8 w-full gap-1.5 rounded-md border border-dashed border-muted-foreground/30 px-0 text-xs text-muted-foreground transition-colors hover:border-foreground/50 hover:bg-transparent hover:text-accent-foreground"
 							onClick={() => fileInputRef.current?.click()}
 						>
 							Add more files
-						</button>
+						</Button>
 					)
 				) : (
-					<button
-						type="button"
-						className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
-						onClick={() => {
-							if (!isElectron) fileInputRef.current?.click();
-						}}
-					>
-						<Upload className="h-10 w-10 text-muted-foreground" />
-						<div className="text-center space-y-1.5">
-							<p className="text-base font-medium">
-								{isElectron ? "Select files or folder" : "Tap to select files or folder"}
-							</p>
-							<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
-						</div>
+					<div className="flex w-full flex-col items-center gap-4 bg-transparent px-4 py-12 select-none">
+						{isElectron ? (
+							<div className="flex w-full flex-col items-center gap-4">
+								<div className="text-center space-y-1.5">
+									<p className="text-base font-medium">{t("select_files_or_folder")}</p>
+									<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
+								</div>
+							</div>
+						) : (
+							<Button
+								type="button"
+								variant="ghost"
+								className="h-auto w-full flex-col gap-4 whitespace-normal bg-transparent p-0 text-foreground hover:bg-transparent hover:text-foreground"
+								onClick={() => fileInputRef.current?.click()}
+							>
+								<div className="text-center space-y-1.5">
+									<p className="text-base font-medium">{t("tap_select_files_or_folder")}</p>
+									<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
+								</div>
+							</Button>
+						)}
 						<fieldset
-							className="w-full mt-1 border-none p-0 m-0"
+							className="mt-1 flex w-full justify-center border-none p-0 m-0"
 							onClick={(e) => e.stopPropagation()}
 							onKeyDown={(e) => e.stopPropagation()}
 						>
-							{renderBrowseButton({ fullWidth: true })}
+							{renderBrowseButton()}
 						</fieldset>
-					</button>
+					</div>
 				)}
 			</div>
 
@@ -373,7 +582,6 @@ export function DocumentUploadTab({
 			>
 				{hasContent ? (
 					<div className="flex items-center gap-3">
-						<Upload className="h-4 w-4 text-muted-foreground shrink-0" />
 						<span className="text-xs text-muted-foreground flex-1 truncate">
 							{isDragActive ? t("drop_files") : t("drag_drop_more")}
 						</span>
@@ -383,12 +591,10 @@ export function DocumentUploadTab({
 					<div className="relative">
 						{isDragActive && (
 							<div className="absolute inset-0 flex flex-col items-center justify-center gap-2">
-								<Upload className="h-8 w-8 text-primary" />
 								<p className="text-sm font-medium text-primary">{t("drop_files")}</p>
 							</div>
 						)}
 						<div className={`flex flex-col items-center gap-2 ${isDragActive ? "invisible" : ""}`}>
-							<Upload className="h-8 w-8 text-muted-foreground" />
 							<p className="text-sm font-medium">{t("drag_drop")}</p>
 							<p className="text-xs text-muted-foreground">{t("file_size_limit")}</p>
 							<div className="mt-1">{renderBrowseButton()}</div>
@@ -398,55 +604,92 @@ export function DocumentUploadTab({
 			</div>
 
 			{/* FILES SELECTED */}
-			{files.length > 0 && (
+			{hasContent && (
 				<div className="rounded-lg border border-border p-3 space-y-2">
 					<div className="flex items-center justify-between">
 						<p className="text-sm font-medium">
-							{t("selected_files", { count: files.length })}
-							<Dot className="inline h-4 w-4" />
-							{formatFileSize(totalFileSize)}
+							{folderUpload ? (
+								<>
+									<FolderOpen className="inline h-4 w-4 mr-1 -mt-0.5" />
+									{folderUpload.folderName}
+									<Dot className="inline h-4 w-4" />
+									{folderUpload.entries.length}{" "}
+									{folderUpload.entries.length === 1 ? "file" : "files"}
+									<Dot className="inline h-4 w-4" />
+									{formatFileSize(totalFileSize)}
+								</>
+							) : (
+								<>
+									{t("selected_files", { count: files.length })}
+									<Dot className="inline h-4 w-4" />
+									{formatFileSize(totalFileSize)}
+								</>
+							)}
 						</p>
 						<Button
 							variant="ghost"
 							size="sm"
-							className="h-7 text-xs text-muted-foreground hover:text-foreground"
-							onClick={() => setFiles([])}
-							disabled={isUploading}
+							className="h-7 text-xs text-muted-foreground hover:text-accent-foreground"
+							onClick={() => {
+								setFiles([]);
+								setFolderUpload(null);
+							}}
+							disabled={isAnyUploading}
 						>
 							{t("clear_all")}
 						</Button>
 					</div>
 
 					<div className="max-h-[160px] sm:max-h-[200px] overflow-y-auto -mx-1">
-						{files.map((entry) => (
-							<div
-								key={entry.id}
-								className="flex items-center gap-2 py-1.5 px-2 rounded-md hover:bg-slate-400/5 dark:hover:bg-white/5 group"
-							>
-								<span className="text-[10px] font-medium uppercase leading-none bg-muted px-1.5 py-0.5 rounded text-muted-foreground shrink-0">
-									{entry.file.name.split(".").pop() || "?"}
-								</span>
-								<span className="text-sm truncate flex-1 min-w-0">{entry.file.name}</span>
-								<span className="text-xs text-muted-foreground shrink-0">
-									{formatFileSize(entry.file.size)}
-								</span>
-								<Button
-									variant="ghost"
-									size="icon"
-									className="h-6 w-6 shrink-0"
-									onClick={() => setFiles((prev) => prev.filter((e) => e.id !== entry.id))}
-									disabled={isUploading}
-								>
-									<X className="h-3 w-3" />
-								</Button>
-							</div>
-						))}
+						{folderUpload
+							? folderTreeItems.map((item) => (
+									<div
+										key={item.path}
+										className="flex items-center gap-1.5 py-0.5 px-2"
+										style={{ paddingLeft: `${item.depth * 16 + 8}px` }}
+									>
+										{item.isFolder ? (
+											<FolderOpen className="h-3.5 w-3.5 text-blue-400 shrink-0" />
+										) : (
+											<FileIcon className="h-3.5 w-3.5 text-muted-foreground shrink-0" />
+										)}
+										<span className="text-sm truncate flex-1 min-w-0">{item.name}</span>
+										{!item.isFolder && item.size != null && (
+											<span className="text-xs text-muted-foreground shrink-0">
+												{formatFileSize(item.size)}
+											</span>
+										)}
+									</div>
+								))
+							: files.map((entry) => (
+									<div
+										key={entry.id}
+										className="flex items-center gap-2 py-1.5 px-2 rounded-md hover:bg-accent hover:text-accent-foreground group"
+									>
+										<span className="text-[10px] font-medium uppercase leading-none bg-muted px-1.5 py-0.5 rounded text-muted-foreground shrink-0">
+											{entry.file.name.split(".").pop() || "?"}
+										</span>
+										<span className="text-sm truncate flex-1 min-w-0">{entry.file.name}</span>
+										<span className="text-xs text-muted-foreground shrink-0">
+											{formatFileSize(entry.file.size)}
+										</span>
+										<Button
+											variant="ghost"
+											size="icon"
+											className="h-6 w-6 shrink-0"
+											onClick={() => setFiles((prev) => prev.filter((e) => e.id !== entry.id))}
+											disabled={isAnyUploading}
+										>
+											<X className="h-3 w-3" />
+										</Button>
+									</div>
+								))}
 					</div>
 
-					{isUploading && (
+					{isAnyUploading && (
 						<div className="space-y-1">
 							<div className="flex items-center justify-between text-xs">
-								<span>{t("uploading_files")}</span>
+								<span>{folderUpload ? t("uploading_folder") : t("uploading_files")}</span>
 								<span>{Math.round(uploadProgress)}%</span>
 							</div>
 							<Progress value={uploadProgress} className="h-1.5" />
@@ -463,21 +706,69 @@ export function DocumentUploadTab({
 						<Switch checked={shouldSummarize} onCheckedChange={setShouldSummarize} />
 					</div>
 
+					<div className={toggleRowClass}>
+						<div className="space-y-0.5">
+							<p className="font-medium text-sm">Enable Vision LLM</p>
+							<p className="text-xs text-muted-foreground">
+								Describes images using AI vision (costly, slower)
+							</p>
+						</div>
+						<Switch checked={useVisionLlm} onCheckedChange={setUseVisionLlm} />
+					</div>
+
+					<div className="space-y-1.5">
+						<p className="font-medium text-sm px-1">{t("processing_mode")}</p>
+						<div className="grid grid-cols-2 gap-2">
+							<Button
+								type="button"
+								onClick={() => setProcessingMode("basic")}
+								variant="ghost"
+								className={`h-auto w-full items-start justify-start whitespace-normal rounded-lg border p-3 text-left transition-colors hover:bg-transparent hover:text-foreground ${
+									processingMode === "basic"
+										? "border-primary bg-primary/5"
+										: "border-border hover:border-muted-foreground/50"
+								}`}
+							>
+								<Zap
+									className={`h-4 w-4 mt-0.5 shrink-0 ${processingMode === "basic" ? "text-primary" : "text-muted-foreground"}`}
+								/>
+								<div className="space-y-0.5 min-w-0">
+									<p className="font-medium text-sm">{t("basic_mode")}</p>
+									<p className="text-xs text-muted-foreground">{t("basic_mode_desc")}</p>
+								</div>
+							</Button>
+							<Button
+								type="button"
+								onClick={() => setProcessingMode("premium")}
+								variant="ghost"
+								className={`h-auto w-full items-start justify-start whitespace-normal rounded-lg border p-3 text-left transition-colors hover:bg-transparent hover:text-foreground ${
+									processingMode === "premium"
+										? "border-amber-500 bg-amber-500/5"
+										: "border-border hover:border-muted-foreground/50"
+								}`}
+							>
+								<Crown
+									className={`h-4 w-4 mt-0.5 shrink-0 ${processingMode === "premium" ? "text-amber-500" : "text-muted-foreground"}`}
+								/>
+								<div className="space-y-0.5 min-w-0">
+									<p className="font-medium text-sm">{t("premium_mode")}</p>
+									<p className="text-xs text-muted-foreground">{t("premium_mode_desc")}</p>
+								</div>
+							</Button>
+						</div>
+					</div>
+
 					<Button
-						className="w-full"
+						className="w-full relative"
 						onClick={handleUpload}
-						disabled={isUploading || files.length === 0}
+						disabled={isAnyUploading || fileCount === 0}
 					>
-						{isUploading ? (
-							<span className="flex items-center gap-2">
-								<Spinner size="sm" />
-								{t("uploading")}
-							</span>
-						) : (
-							<span className="flex items-center gap-2">
-								{t("upload_button", { count: files.length })}
-							</span>
-						)}
+						<span className={isAnyUploading ? "opacity-0" : ""}>
+							{folderUpload
+								? t("upload_folder_button", { count: fileCount })
+								: t("upload_button", { count: fileCount })}
+						</span>
+						{isAnyUploading && <Spinner size="sm" className="absolute" />}
 					</Button>
 				</div>
 			)}
diff --git a/surfsense_web/components/sources/FolderWatchDialog.tsx b/surfsense_web/components/sources/FolderWatchDialog.tsx
index 3590279cef..4cc566ef6c 100644
--- a/surfsense_web/components/sources/FolderWatchDialog.tsx
+++ b/surfsense_web/components/sources/FolderWatchDialog.tsx
@@ -164,7 +164,7 @@ export function FolderWatchDialog({
 
 	return (
 		<Dialog open={open} onOpenChange={handleOpenChange}>
-			<DialogContent className="sm:max-w-md select-none p-0 gap-0 overflow-hidden bg-muted dark:bg-muted border border-border [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10">
+			<DialogContent className="sm:max-w-md select-none p-0 gap-0 overflow-hidden [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-accent [&>button]:hover:text-accent-foreground">
 				<DialogHeader className="px-4 sm:px-6 pt-5 sm:pt-6 pb-3">
 					<DialogTitle className="text-lg sm:text-xl font-semibold tracking-tight">
 						Watch Local Folder
@@ -194,13 +194,14 @@ export function FolderWatchDialog({
 							</Button>
 						</div>
 					) : (
-						<button
+						<Button
+							variant="ghost"
 							type="button"
 							onClick={handleSelectFolder}
-							className="flex flex-1 w-full items-center justify-center gap-2 rounded-lg border-2 border-dashed border-muted-foreground/30 text-sm text-muted-foreground transition-colors hover:border-foreground/50 hover:text-foreground"
+							className="h-auto flex-1 w-full gap-2 rounded-lg border-2 border-dashed border-muted-foreground/30 bg-transparent text-sm text-muted-foreground transition-colors hover:border-foreground/50 hover:bg-transparent hover:text-accent-foreground"
 						>
 							Browse for a folder
-						</button>
+						</Button>
 					)}
 
 					{selectedFolder && (
diff --git a/surfsense_web/components/theme/theme-toggle.tsx b/surfsense_web/components/theme/theme-toggle.tsx
index a59c5248b3..9985e863af 100644
--- a/surfsense_web/components/theme/theme-toggle.tsx
+++ b/surfsense_web/components/theme/theme-toggle.tsx
@@ -4,6 +4,7 @@ import { motion } from "motion/react";
 import { useTheme } from "next-themes";
 import { useCallback, useEffect, useId, useState } from "react";
 
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 // ///////////////////////////////////////////////////////////////////////////
@@ -586,7 +587,7 @@ export const useThemeToggle = ({
 	}, []);
 
 	const toggleTheme = useCallback(() => {
-		setIsDark(!isDark);
+		setIsDark((prev) => !prev);
 
 		const animation = createAnimation(variant, start, blur, gifUrl);
 
@@ -604,7 +605,7 @@ export const useThemeToggle = ({
 		}
 
 		document.startViewTransition(switchTheme);
-	}, [theme, setTheme, variant, start, blur, gifUrl, updateStyles, isDark]);
+	}, [theme, setTheme, variant, start, blur, gifUrl, updateStyles]);
 
 	const setCrazyLightTheme = useCallback(() => {
 		setIsDark(false);
@@ -706,10 +707,12 @@ export const ThemeToggleButton = ({
 	const clipPathId = `theme-toggle-clip-${clipId}`;
 
 	return (
-		<button
+		<Button
+			variant="ghost"
+			size="icon"
 			type="button"
 			className={cn(
-				"size-10 cursor-pointer rounded-full p-2 transition-all duration-300 active:scale-95 bg-transparent",
+				"size-10 cursor-pointer rounded-full bg-transparent p-2 transition-all duration-300 hover:bg-transparent active:scale-95",
 				isDark ? "text-white" : "text-black",
 				className
 			)}
@@ -720,6 +723,7 @@ export const ThemeToggleButton = ({
 			<svg
 				xmlns="http://www.w3.org/2000/svg"
 				aria-hidden="true"
+				className="size-full"
 				fill="currentColor"
 				strokeLinecap="round"
 				viewBox="0 0 32 32"
@@ -759,7 +763,7 @@ export const ThemeToggleButton = ({
 					</motion.g>
 				</g>
 			</svg>
-		</button>
+		</Button>
 	);
 };
 
diff --git a/surfsense_web/components/tool-ui/citation/citation-hover-popover.tsx b/surfsense_web/components/tool-ui/citation/citation-hover-popover.tsx
new file mode 100644
index 0000000000..9c85e0f5b2
--- /dev/null
+++ b/surfsense_web/components/tool-ui/citation/citation-hover-popover.tsx
@@ -0,0 +1,148 @@
+"use client";
+
+import type { ComponentProps, HTMLAttributes, ReactElement, ReactNode } from "react";
+import { useCallback, useEffect, useRef, useSyncExternalStore } from "react";
+import { useMediaQuery } from "@/hooks/use-media-query";
+import { Popover, PopoverContent, PopoverTrigger } from "./_adapter";
+
+type PopoverContentProps = ComponentProps<typeof PopoverContent>;
+
+export type CitationHoverTriggerProps = Pick<
+	HTMLAttributes<HTMLElement>,
+	"onBlur" | "onFocus" | "onPointerEnter" | "onPointerLeave"
+>;
+
+interface CitationHoverPopoverProps {
+	id: string;
+	trigger: (props: CitationHoverTriggerProps) => ReactElement;
+	children: ReactNode;
+	contentClassName?: string;
+	side?: PopoverContentProps["side"];
+	align?: PopoverContentProps["align"];
+	sideOffset?: PopoverContentProps["sideOffset"];
+	onContentClick?: PopoverContentProps["onClick"];
+}
+
+const OPEN_DELAY_MS = 80;
+const CLOSE_DELAY_MS = 120;
+
+let activeCitationId: string | null = null;
+const listeners = new Set<() => void>();
+
+function subscribe(listener: () => void) {
+	listeners.add(listener);
+	return () => {
+		listeners.delete(listener);
+	};
+}
+
+function getSnapshot() {
+	return activeCitationId;
+}
+
+function setActiveCitationId(id: string | null) {
+	if (activeCitationId === id) return;
+	activeCitationId = id;
+	for (const listener of listeners) {
+		listener();
+	}
+}
+
+function useCitationHoverState(id: string) {
+	const activeId = useSyncExternalStore(subscribe, getSnapshot, getSnapshot);
+	const openTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+	const closeTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+
+	const clearTimers = useCallback(() => {
+		if (openTimerRef.current) {
+			clearTimeout(openTimerRef.current);
+			openTimerRef.current = null;
+		}
+		if (closeTimerRef.current) {
+			clearTimeout(closeTimerRef.current);
+			closeTimerRef.current = null;
+		}
+	}, []);
+
+	const open = activeId === id;
+
+	const scheduleOpen = useCallback(() => {
+		clearTimers();
+		openTimerRef.current = setTimeout(() => {
+			setActiveCitationId(id);
+			openTimerRef.current = null;
+		}, OPEN_DELAY_MS);
+	}, [clearTimers, id]);
+
+	const scheduleClose = useCallback(() => {
+		clearTimers();
+		closeTimerRef.current = setTimeout(() => {
+			if (activeCitationId === id) {
+				setActiveCitationId(null);
+			}
+			closeTimerRef.current = null;
+		}, CLOSE_DELAY_MS);
+	}, [clearTimers, id]);
+
+	const handleOpenChange = useCallback(
+		(nextOpen: boolean) => {
+			clearTimers();
+			setActiveCitationId(nextOpen ? id : null);
+		},
+		[clearTimers, id]
+	);
+
+	useEffect(() => {
+		return () => {
+			clearTimers();
+			if (activeCitationId === id) {
+				setActiveCitationId(null);
+			}
+		};
+	}, [clearTimers, id]);
+
+	return { open, scheduleOpen, scheduleClose, handleOpenChange };
+}
+
+export function CitationHoverPopover({
+	id,
+	trigger,
+	children,
+	contentClassName,
+	side = "top",
+	align = "start",
+	sideOffset = 6,
+	onContentClick,
+}: CitationHoverPopoverProps) {
+	const isTouchLike = useMediaQuery("(hover: none), (pointer: coarse)");
+	const { open, scheduleOpen, scheduleClose, handleOpenChange } = useCitationHoverState(id);
+	const hoverProps = {
+		onPointerEnter: scheduleOpen,
+		onPointerLeave: scheduleClose,
+		onFocus: scheduleOpen,
+		onBlur: scheduleClose,
+	} satisfies CitationHoverTriggerProps;
+
+	if (isTouchLike) {
+		return trigger({});
+	}
+
+	return (
+		<Popover open={open} onOpenChange={handleOpenChange}>
+			<PopoverTrigger asChild>{trigger(hoverProps)}</PopoverTrigger>
+			<PopoverContent
+				side={side}
+				align={align}
+				sideOffset={sideOffset}
+				className={contentClassName}
+				onPointerEnter={scheduleOpen}
+				onPointerLeave={scheduleClose}
+				onOpenAutoFocus={(event) => event.preventDefault()}
+				onCloseAutoFocus={(event) => event.preventDefault()}
+				onClick={onContentClick}
+			>
+				{children}
+			</PopoverContent>
+		</Popover>
+	);
+}
diff --git a/surfsense_web/components/tool-ui/citation/citation-list.tsx b/surfsense_web/components/tool-ui/citation/citation-list.tsx
deleted file mode 100644
index bbe869a095..0000000000
--- a/surfsense_web/components/tool-ui/citation/citation-list.tsx
+++ /dev/null
@@ -1,396 +0,0 @@
-"use client";
-
-import type { LucideIcon } from "lucide-react";
-import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react";
-import NextImage from "next/image";
-import * as React from "react";
-import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/media";
-import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter";
-import { Citation } from "./citation";
-import type { CitationType, CitationVariant, SerializableCitation } from "./schema";
-
-const TYPE_ICONS: Record<CitationType, LucideIcon> = {
-	webpage: Globe,
-	document: FileText,
-	article: Newspaper,
-	api: Database,
-	code: Code2,
-	other: File,
-};
-
-function useHoverPopover(delay = 100) {
-	const [open, setOpen] = React.useState(false);
-	const timeoutRef = React.useRef<ReturnType<typeof setTimeout> | null>(null);
-	const containerRef = React.useRef<HTMLDivElement>(null);
-
-	const handleMouseEnter = React.useCallback(() => {
-		if (timeoutRef.current) clearTimeout(timeoutRef.current);
-		timeoutRef.current = setTimeout(() => setOpen(true), delay);
-	}, [delay]);
-
-	const handleMouseLeave = React.useCallback(() => {
-		if (timeoutRef.current) clearTimeout(timeoutRef.current);
-		timeoutRef.current = setTimeout(() => setOpen(false), delay);
-	}, [delay]);
-
-	const handleFocus = React.useCallback(() => {
-		if (timeoutRef.current) clearTimeout(timeoutRef.current);
-		setOpen(true);
-	}, []);
-
-	const handleBlur = React.useCallback(
-		(e: React.FocusEvent) => {
-			const relatedTarget = e.relatedTarget as HTMLElement | null;
-			if (containerRef.current?.contains(relatedTarget)) {
-				return;
-			}
-			if (relatedTarget?.closest("[data-radix-popper-content-wrapper]")) {
-				return;
-			}
-			if (timeoutRef.current) clearTimeout(timeoutRef.current);
-			timeoutRef.current = setTimeout(() => setOpen(false), delay);
-		},
-		[delay]
-	);
-
-	React.useEffect(() => {
-		return () => {
-			if (timeoutRef.current) clearTimeout(timeoutRef.current);
-		};
-	}, []);
-
-	return {
-		open,
-		setOpen,
-		containerRef,
-		handleMouseEnter,
-		handleMouseLeave,
-		handleFocus,
-		handleBlur,
-	};
-}
-
-export interface CitationListProps {
-	id: string;
-	citations: SerializableCitation[];
-	variant?: CitationVariant;
-	maxVisible?: number;
-	className?: string;
-	onNavigate?: (href: string, citation: SerializableCitation) => void;
-}
-
-export function CitationList(props: CitationListProps) {
-	const { id, citations, variant = "default", maxVisible, className, onNavigate } = props;
-
-	const shouldTruncate = maxVisible !== undefined && citations.length > maxVisible;
-	const visibleCitations = shouldTruncate ? citations.slice(0, maxVisible) : citations;
-	const overflowCitations = shouldTruncate ? citations.slice(maxVisible) : [];
-	const overflowCount = overflowCitations.length;
-
-	const wrapperClass =
-		variant === "inline" ? "flex flex-wrap items-center gap-1.5" : "flex flex-col gap-2";
-
-	// Stacked variant: overlapping favicons with popover
-	if (variant === "stacked") {
-		return (
-			<StackedCitations
-				id={id}
-				citations={citations}
-				className={className}
-				onNavigate={onNavigate}
-			/>
-		);
-	}
-
-	if (variant === "default") {
-		return (
-			<div
-				className={cn("isolate flex flex-col gap-4", className)}
-				data-tool-ui-id={id}
-				data-slot="citation-list"
-			>
-				{visibleCitations.map((citation) => (
-					<Citation key={citation.id} {...citation} variant="default" onNavigate={onNavigate} />
-				))}
-				{shouldTruncate && (
-					<OverflowIndicator
-						citations={overflowCitations}
-						count={overflowCount}
-						variant="default"
-						onNavigate={onNavigate}
-					/>
-				)}
-			</div>
-		);
-	}
-
-	return (
-		<div
-			className={cn("isolate", wrapperClass, className)}
-			data-tool-ui-id={id}
-			data-slot="citation-list"
-		>
-			{visibleCitations.map((citation) => (
-				<Citation key={citation.id} {...citation} variant={variant} onNavigate={onNavigate} />
-			))}
-			{shouldTruncate && (
-				<OverflowIndicator
-					citations={overflowCitations}
-					count={overflowCount}
-					variant={variant}
-					onNavigate={onNavigate}
-				/>
-			)}
-		</div>
-	);
-}
-
-interface OverflowIndicatorProps {
-	citations: SerializableCitation[];
-	count: number;
-	variant: CitationVariant;
-	onNavigate?: (href: string, citation: SerializableCitation) => void;
-}
-
-function OverflowIndicator({ citations, count, variant, onNavigate }: OverflowIndicatorProps) {
-	const { open, handleMouseEnter, handleMouseLeave } = useHoverPopover();
-
-	const handleClick = (citation: SerializableCitation) => {
-		const href = resolveSafeNavigationHref(citation.href);
-		if (!href) return;
-		if (onNavigate) {
-			onNavigate(href, citation);
-		} else {
-			openSafeNavigationHref(href);
-		}
-	};
-
-	const popoverContent = (
-		<div className="flex max-h-72 flex-col overflow-y-auto">
-			{citations.map((citation) => (
-				<OverflowItem key={citation.id} citation={citation} onClick={() => handleClick(citation)} />
-			))}
-		</div>
-	);
-
-	if (variant === "inline") {
-		return (
-			<Popover open={open}>
-				<PopoverTrigger asChild>
-					<button
-						type="button"
-						onMouseEnter={handleMouseEnter}
-						onMouseLeave={handleMouseLeave}
-						className={cn(
-							"inline-flex items-center gap-1 rounded-md px-2 py-1",
-							"bg-muted/60 text-sm tabular-nums",
-							"transition-colors duration-150",
-							"hover:bg-muted",
-							"focus-visible:ring-ring focus-visible:ring-2 focus-visible:outline-none"
-						)}
-					>
-						<span className="text-muted-foreground">+{count} more</span>
-					</button>
-				</PopoverTrigger>
-				<PopoverContent
-					side="top"
-					align="start"
-					className="w-80 p-1"
-					onMouseEnter={handleMouseEnter}
-					onMouseLeave={handleMouseLeave}
-					onOpenAutoFocus={(e) => e.preventDefault()}
-				>
-					{popoverContent}
-				</PopoverContent>
-			</Popover>
-		);
-	}
-
-	// Default variant
-	return (
-		<Popover open={open}>
-			<PopoverTrigger asChild>
-				<button
-					type="button"
-					onMouseEnter={handleMouseEnter}
-					onMouseLeave={handleMouseLeave}
-					className={cn(
-						"flex items-center justify-center rounded-xl px-4 py-3",
-						"border-border bg-card border border-dashed",
-						"transition-colors duration-150",
-						"hover:border-foreground/25 hover:bg-muted/50",
-						"focus-visible:ring-ring focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:outline-none"
-					)}
-				>
-					<span className="text-muted-foreground text-sm tabular-nums">+{count} more sources</span>
-				</button>
-			</PopoverTrigger>
-			<PopoverContent
-				side="bottom"
-				align="start"
-				className="w-80 p-1"
-				onMouseEnter={handleMouseEnter}
-				onMouseLeave={handleMouseLeave}
-				onOpenAutoFocus={(e) => e.preventDefault()}
-			>
-				{popoverContent}
-			</PopoverContent>
-		</Popover>
-	);
-}
-
-interface OverflowItemProps {
-	citation: SerializableCitation;
-	onClick: () => void;
-}
-
-function OverflowItem({ citation, onClick }: OverflowItemProps) {
-	const TypeIcon = TYPE_ICONS[citation.type ?? "webpage"] ?? Globe;
-
-	return (
-		<button
-			type="button"
-			onClick={onClick}
-			className="group hover:bg-muted focus-visible:bg-muted flex w-full cursor-pointer items-center gap-2.5 rounded-md px-2 py-2 text-left transition-colors focus-visible:outline-none"
-		>
-			{citation.favicon ? (
-				<NextImage
-					src={citation.favicon}
-					alt=""
-					aria-hidden="true"
-					width={18}
-					height={18}
-					className="size-4.5 rounded-full object-cover"
-					unoptimized={true}
-				/>
-			) : (
-				<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
-			)}
-			<div className="min-w-0 flex-1">
-				<p className="group-hover:decoration-foreground/30 truncate text-sm font-medium group-hover:underline group-hover:underline-offset-2">
-					{citation.title}
-				</p>
-				<p className="text-muted-foreground truncate text-xs">{citation.domain}</p>
-			</div>
-			<ExternalLink className="text-muted-foreground mt-0.5 size-3.5 shrink-0 self-start opacity-0 transition-opacity group-hover:opacity-100" />
-		</button>
-	);
-}
-
-interface StackedCitationsProps {
-	id: string;
-	citations: SerializableCitation[];
-	className?: string;
-	onNavigate?: (href: string, citation: SerializableCitation) => void;
-}
-
-function StackedCitations({ id, citations, className, onNavigate }: StackedCitationsProps) {
-	const { open, setOpen, containerRef, handleMouseEnter, handleMouseLeave, handleBlur } =
-		useHoverPopover();
-	const maxIcons = 4;
-	const visibleCitations = citations.slice(0, maxIcons);
-	const remainingCount = Math.max(0, citations.length - maxIcons);
-
-	const handleClick = (citation: SerializableCitation) => {
-		const href = resolveSafeNavigationHref(citation.href);
-		if (!href) return;
-		if (onNavigate) {
-			onNavigate(href, citation);
-		} else {
-			openSafeNavigationHref(href);
-		}
-	};
-
-	return (
-		// biome-ignore lint/a11y/noStaticElementInteractions: blur boundary for popover focus management
-		<div ref={containerRef} onBlur={handleBlur} className="inline-flex">
-			<Popover open={open}>
-				<PopoverTrigger asChild>
-					<button
-						type="button"
-						data-tool-ui-id={id}
-						data-slot="citation-list"
-						onMouseEnter={handleMouseEnter}
-						onMouseLeave={handleMouseLeave}
-						onKeyDown={(e) => {
-							if (e.key === "Enter" || e.key === " ") {
-								e.preventDefault();
-								setOpen(true);
-							}
-						}}
-						className={cn(
-							"isolate inline-flex cursor-pointer items-center gap-2 rounded-lg px-3 py-2",
-							"bg-muted/40 outline-none",
-							"transition-colors duration-150",
-							"hover:bg-muted/70",
-							"focus-visible:ring-ring focus-visible:ring-2",
-							className
-						)}
-					>
-						<div className="flex items-center">
-							{visibleCitations.map((citation, index) => {
-								const TypeIcon = TYPE_ICONS[citation.type ?? "webpage"] ?? Globe;
-								return (
-									<div
-										key={citation.id}
-										className={cn(
-											"border-border bg-background dark:border-foreground/20 relative flex size-6 items-center justify-center rounded-full border shadow-xs",
-											index > 0 && "-ml-2"
-										)}
-										style={{ zIndex: maxIcons - index }}
-									>
-										{citation.favicon ? (
-											<NextImage
-												src={citation.favicon}
-												alt=""
-												aria-hidden="true"
-												width={18}
-												height={18}
-												className="size-4.5 rounded-full object-cover"
-												unoptimized={true}
-											/>
-										) : (
-											<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
-										)}
-									</div>
-								);
-							})}
-							{remainingCount > 0 && (
-								<div
-									className="border-border bg-background dark:border-foreground/20 relative -ml-2 flex size-6 items-center justify-center rounded-full border shadow-xs"
-									style={{ zIndex: 0 }}
-								>
-									<span className="text-muted-foreground text-[10px] font-medium tracking-tight">
-										•••
-									</span>
-								</div>
-							)}
-						</div>
-						<span className="text-muted-foreground text-sm tabular-nums">
-							{citations.length} source{citations.length !== 1 && "s"}
-						</span>
-					</button>
-				</PopoverTrigger>
-				<PopoverContent
-					side="bottom"
-					align="start"
-					className="w-80 p-1"
-					onMouseEnter={handleMouseEnter}
-					onMouseLeave={handleMouseLeave}
-					onBlur={handleBlur}
-					onEscapeKeyDown={() => setOpen(false)}
-				>
-					<div className="flex max-h-72 flex-col overflow-y-auto">
-						{citations.map((citation) => (
-							<OverflowItem
-								key={citation.id}
-								citation={citation}
-								onClick={() => handleClick(citation)}
-							/>
-						))}
-					</div>
-				</PopoverContent>
-			</Popover>
-		</div>
-	);
-}
diff --git a/surfsense_web/components/tool-ui/citation/citation.tsx b/surfsense_web/components/tool-ui/citation/citation.tsx
index c60034a0a7..c2f2c5e701 100644
--- a/surfsense_web/components/tool-ui/citation/citation.tsx
+++ b/surfsense_web/components/tool-ui/citation/citation.tsx
@@ -1,33 +1,17 @@
 "use client";
 
-import type { LucideIcon } from "lucide-react";
-import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react";
+import { ExternalLink, Globe } from "lucide-react";
 import NextImage from "next/image";
-import * as React from "react";
+import { Button } from "@/components/ui/button";
+import { tryGetHostname } from "@/lib/url";
 import { openSafeNavigationHref, sanitizeHref } from "../shared/media";
-import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter";
-import type { CitationType, CitationVariant, SerializableCitation } from "./schema";
+import { cn } from "./_adapter";
+import { CitationHoverPopover } from "./citation-hover-popover";
+import type { CitationVariant, SerializableCitation } from "./schema";
+import { TYPE_ICONS } from "./type-icons";
 
 const FALLBACK_LOCALE = "en-US";
 
-const TYPE_ICONS: Record<CitationType, LucideIcon> = {
-	webpage: Globe,
-	document: FileText,
-	article: Newspaper,
-	api: Database,
-	code: Code2,
-	other: File,
-};
-
-function extractDomain(url: string): string | undefined {
-	try {
-		const urlObj = new URL(url);
-		return urlObj.hostname.replace(/^www\./, "");
-	} catch {
-		return undefined;
-	}
-}
-
 function formatDate(isoString: string, locale: string): string {
 	try {
 		const date = new Date(isoString);
@@ -40,29 +24,6 @@ function formatDate(isoString: string, locale: string): string {
 	}
 }
 
-function useHoverPopover(delay = 100) {
-	const [open, setOpen] = React.useState(false);
-	const timeoutRef = React.useRef<ReturnType<typeof setTimeout> | null>(null);
-
-	const handleMouseEnter = React.useCallback(() => {
-		if (timeoutRef.current) clearTimeout(timeoutRef.current);
-		timeoutRef.current = setTimeout(() => setOpen(true), delay);
-	}, [delay]);
-
-	const handleMouseLeave = React.useCallback(() => {
-		if (timeoutRef.current) clearTimeout(timeoutRef.current);
-		timeoutRef.current = setTimeout(() => setOpen(false), delay);
-	}, [delay]);
-
-	React.useEffect(() => {
-		return () => {
-			if (timeoutRef.current) clearTimeout(timeoutRef.current);
-		};
-	}, []);
-
-	return { open, setOpen, handleMouseEnter, handleMouseLeave };
-}
-
 export interface CitationProps extends SerializableCitation {
 	variant?: CitationVariant;
 	className?: string;
@@ -87,7 +48,7 @@ export function Citation(props: CitationProps) {
 
 	const locale = providedLocale ?? FALLBACK_LOCALE;
 	const sanitizedHref = sanitizeHref(rawHref);
-	const domain = providedDomain ?? extractDomain(rawHref);
+	const domain = providedDomain ?? tryGetHostname(rawHref);
 
 	const citationData: SerializableCitation = {
 		...serializable,
@@ -107,13 +68,6 @@ export function Citation(props: CitationProps) {
 		}
 	};
 
-	const handleKeyDown = (e: React.KeyboardEvent) => {
-		if (sanitizedHref && (e.key === "Enter" || e.key === " ")) {
-			e.preventDefault();
-			handleClick();
-		}
-	};
-
 	const iconElement = favicon ? (
 		<NextImage
 			src={favicon}
@@ -128,61 +82,95 @@ export function Citation(props: CitationProps) {
 		<TypeIcon className="size-3.5 shrink-0 opacity-60" aria-hidden="true" />
 	);
 
-	const { open, handleMouseEnter, handleMouseLeave } = useHoverPopover();
-
 	// Inline variant: compact chip with hover popover
 	if (variant === "inline") {
 		return (
-			<Popover open={open}>
-				<PopoverTrigger asChild>
-					<button
+			<CitationHoverPopover
+				id={id}
+				contentClassName="w-72 cursor-pointer overflow-hidden p-0"
+				onContentClick={handleClick}
+				trigger={(hoverProps) => (
+					<Button
+						variant="ghost"
 						type="button"
 						aria-label={title}
 						data-tool-ui-id={id}
 						data-slot="citation"
 						onClick={handleClick}
-						onMouseEnter={handleMouseEnter}
-						onMouseLeave={handleMouseLeave}
+						{...hoverProps}
 						className={cn(
-							"inline-flex cursor-pointer items-center gap-1.5 rounded-md px-2 py-1",
-							"bg-muted/60 text-sm outline-none",
-							"transition-colors duration-150",
-							"hover:bg-muted",
-							"focus-visible:ring-ring focus-visible:ring-2",
+							"ml-0.5 inline-flex h-5 min-w-5 cursor-pointer items-center justify-center gap-1.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline",
 							className
 						)}
 					>
 						{iconElement}
-						<span className="text-muted-foreground">{domain}</span>
-					</button>
-				</PopoverTrigger>
-				<PopoverContent
-					side="top"
-					align="start"
-					className="w-72 cursor-pointer p-0"
-					onMouseEnter={handleMouseEnter}
-					onMouseLeave={handleMouseLeave}
-					onOpenAutoFocus={(e) => e.preventDefault()}
-					onCloseAutoFocus={(e) => e.preventDefault()}
-					onClick={handleClick}
-				>
-					<div className="hover:bg-muted/50 flex flex-col gap-2 p-3 transition-colors">
-						<div className="flex items-start gap-2">
-							{iconElement}
-							<span className="text-muted-foreground text-xs">{domain}</span>
-						</div>
-						<p className="text-sm leading-snug font-medium">{title}</p>
-						{snippet && (
-							<p className="text-muted-foreground line-clamp-2 text-xs leading-relaxed">
-								{snippet}
-							</p>
-						)}
+						<span>{domain}</span>
+					</Button>
+				)}
+			>
+				<div className="flex flex-col gap-2 p-3">
+					<div className="flex items-start gap-2">
+						{iconElement}
+						<span className="text-muted-foreground text-xs">{domain}</span>
 					</div>
-				</PopoverContent>
-			</Popover>
+					<p className="text-sm leading-snug font-medium">{title}</p>
+					{snippet && (
+						<p className="text-muted-foreground line-clamp-2 text-xs leading-relaxed">{snippet}</p>
+					)}
+				</div>
+			</CitationHoverPopover>
 		);
 	}
 
+	const cardClassName = cn(
+		"group @container relative isolate flex w-full min-w-0 flex-col overflow-hidden rounded-xl",
+		"border-border bg-card border text-sm shadow-xs",
+		"transition-colors duration-150",
+		sanitizedHref && [
+			"cursor-pointer no-underline",
+			"hover:border-foreground/25",
+			"focus-visible:ring-ring focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:outline-none",
+		]
+	);
+
+	const cardContent = (
+		<div className="flex flex-col gap-2 p-4">
+			<div className="text-muted-foreground flex min-w-0 items-center justify-between gap-1.5 text-xs">
+				<div className="flex min-w-0 items-center gap-1.5">
+					{iconElement}
+					<span className="truncate font-medium">{domain}</span>
+					{(author || publishedAt) && (
+						<span className="opacity-70">
+							<span className="opacity-60"> — </span>
+							{author}
+							{author && publishedAt && ", "}
+							{publishedAt && (
+								<time dateTime={publishedAt} className="tabular-nums">
+									{formatDate(publishedAt, locale)}
+								</time>
+							)}
+						</span>
+					)}
+				</div>
+				{sanitizedHref && (
+					<ExternalLink className="size-3.5 shrink-0 opacity-0 transition-opacity group-hover:opacity-100" />
+				)}
+			</div>
+
+			<h3 className="text-foreground text-[15px] leading-snug font-medium text-pretty">
+				<span className="group-hover:decoration-foreground/30 line-clamp-2 group-hover:underline group-hover:underline-offset-2">
+					{title}
+				</span>
+			</h3>
+
+			{snippet && (
+				<p className="text-muted-foreground text-[13px] leading-relaxed text-pretty">
+					<span className="line-clamp-3">{snippet}</span>
+				</p>
+			)}
+		</div>
+	);
+
 	// Default variant: full card
 	return (
 		<article
@@ -191,59 +179,22 @@ export function Citation(props: CitationProps) {
 			data-tool-ui-id={id}
 			data-slot="citation"
 		>
-			{/* biome-ignore lint/a11y/noStaticElementInteractions: div receives role="link" conditionally when href is present */}
-			<div
-				className={cn(
-					"group @container relative isolate flex w-full min-w-0 flex-col overflow-hidden rounded-xl",
-					"border-border bg-card border text-sm shadow-xs",
-					"transition-colors duration-150",
-					sanitizedHref && [
-						"cursor-pointer",
-						"hover:border-foreground/25",
-						"focus-visible:ring-ring focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:outline-none",
-					]
-				)}
-				onClick={sanitizedHref ? handleClick : undefined}
-				role={sanitizedHref ? "link" : undefined}
-				tabIndex={sanitizedHref ? 0 : undefined}
-				onKeyDown={sanitizedHref ? handleKeyDown : undefined}
-			>
-				<div className="flex flex-col gap-2 p-4">
-					<div className="text-muted-foreground flex min-w-0 items-center justify-between gap-1.5 text-xs">
-						<div className="flex min-w-0 items-center gap-1.5">
-							{iconElement}
-							<span className="truncate font-medium">{domain}</span>
-							{(author || publishedAt) && (
-								<span className="opacity-70">
-									<span className="opacity-60"> — </span>
-									{author}
-									{author && publishedAt && ", "}
-									{publishedAt && (
-										<time dateTime={publishedAt} className="tabular-nums">
-											{formatDate(publishedAt, locale)}
-										</time>
-									)}
-								</span>
-							)}
-						</div>
-						{sanitizedHref && (
-							<ExternalLink className="size-3.5 shrink-0 opacity-0 transition-opacity group-hover:opacity-100" />
-						)}
-					</div>
-
-					<h3 className="text-foreground text-[15px] leading-snug font-medium text-pretty">
-						<span className="group-hover:decoration-foreground/30 line-clamp-2 group-hover:underline group-hover:underline-offset-2">
-							{title}
-						</span>
-					</h3>
-
-					{snippet && (
-						<p className="text-muted-foreground text-[13px] leading-relaxed text-pretty">
-							<span className="line-clamp-3">{snippet}</span>
-						</p>
-					)}
-				</div>
-			</div>
+			{sanitizedHref ? (
+				<a
+					href={sanitizedHref}
+					target="_blank"
+					rel="noopener noreferrer"
+					className={cardClassName}
+					onClick={(event) => {
+						event.preventDefault();
+						handleClick();
+					}}
+				>
+					{cardContent}
+				</a>
+			) : (
+				<div className={cardClassName}>{cardContent}</div>
+			)}
 		</article>
 	);
 }
diff --git a/surfsense_web/components/tool-ui/citation/index.ts b/surfsense_web/components/tool-ui/citation/index.ts
index 2abcc5746f..8a64159031 100644
--- a/surfsense_web/components/tool-ui/citation/index.ts
+++ b/surfsense_web/components/tool-ui/citation/index.ts
@@ -1,7 +1,5 @@
 export type { CitationProps } from "./citation";
 export { Citation } from "./citation";
-export type { CitationListProps } from "./citation-list";
-export { CitationList } from "./citation-list";
 export type {
 	CitationType,
 	CitationVariant,
diff --git a/surfsense_web/components/tool-ui/citation/type-icons.ts b/surfsense_web/components/tool-ui/citation/type-icons.ts
new file mode 100644
index 0000000000..81bcf8a000
--- /dev/null
+++ b/surfsense_web/components/tool-ui/citation/type-icons.ts
@@ -0,0 +1,12 @@
+import type { LucideIcon } from "lucide-react";
+import { Code2, Database, File, FileText, Globe, Newspaper } from "lucide-react";
+import type { CitationType } from "./schema";
+
+export const TYPE_ICONS: Record<CitationType, LucideIcon> = {
+	webpage: Globe,
+	document: FileText,
+	article: Newspaper,
+	api: Database,
+	code: Code2,
+	other: File,
+};
diff --git a/surfsense_web/components/tool-ui/confluence/create-confluence-page.tsx b/surfsense_web/components/tool-ui/confluence/create-confluence-page.tsx
index d22f317b85..d5add7b8be 100644
--- a/surfsense_web/components/tool-ui/confluence/create-confluence-page.tsx
+++ b/surfsense_web/components/tool-ui/confluence/create-confluence-page.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -15,7 +14,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface ConfluenceAccount {
 	id: number;
@@ -30,25 +35,11 @@ interface ConfluenceSpace {
 	name: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		accounts?: ConfluenceAccount[];
-		spaces?: ConfluenceSpace[];
-		error?: string;
-	};
-}
+type CreateConfluencePageInterruptContext = {
+	accounts?: ConfluenceAccount[];
+	spaces?: ConfluenceSpace[];
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -76,21 +67,12 @@ interface InsufficientPermissionsResult {
 }
 
 type CreateConfluencePageResult =
-	| InterruptResult
+	| InterruptResult<CreateConfluencePageInterruptContext>
 	| SuccessResult
 	| ErrorResult
 	| AuthErrorResult
 	| InsufficientPermissionsResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -124,12 +106,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: { title: string; content?: string; space_id?: string };
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<CreateConfluencePageInterruptContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -247,7 +225,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -464,18 +442,16 @@ export const CreateConfluencePageToolUI = ({
 	{ title: string; content?: string; space_id?: string },
 	CreateConfluencePageResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<CreateConfluencePageInterruptContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/confluence/delete-confluence-page.tsx b/surfsense_web/components/tool-ui/confluence/delete-confluence-page.tsx
index 258f259f09..68ecca6a4a 100644
--- a/surfsense_web/components/tool-ui/confluence/delete-confluence-page.tsx
+++ b/surfsense_web/components/tool-ui/confluence/delete-confluence-page.tsx
@@ -6,39 +6,26 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{
+type DeleteConfluencePageInterruptContext = {
+	account?: {
+		id: number;
 		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		account?: {
-			id: number;
-			name: string;
-			base_url: string;
-			auth_expired?: boolean;
-		};
-		page?: {
-			page_id: string;
-			page_title: string;
-			space_id: string;
-			connector_id?: number;
-			document_id?: number;
-			indexed_at?: string;
-		};
-		error?: string;
+		base_url: string;
+		auth_expired?: boolean;
 	};
-}
+	page?: {
+		page_id: string;
+		page_title: string;
+		space_id: string;
+		connector_id?: number;
+		document_id?: number;
+		indexed_at?: string;
+	};
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -77,7 +64,7 @@ interface InsufficientPermissionsResult {
 }
 
 type DeleteConfluencePageResult =
-	| InterruptResult
+	| InterruptResult<DeleteConfluencePageInterruptContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
@@ -85,15 +72,6 @@ type DeleteConfluencePageResult =
 	| AuthErrorResult
 	| InsufficientPermissionsResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -145,12 +123,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<DeleteConfluencePageInterruptContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -402,18 +376,15 @@ export const DeleteConfluencePageToolUI = ({
 	{ page_title_or_id: string; delete_from_kb?: boolean },
 	DeleteConfluencePageResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<DeleteConfluencePageInterruptContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/confluence/update-confluence-page.tsx b/surfsense_web/components/tool-ui/confluence/update-confluence-page.tsx
index 8bc0772a41..436d695326 100644
--- a/surfsense_web/components/tool-ui/confluence/update-confluence-page.tsx
+++ b/surfsense_web/components/tool-ui/confluence/update-confluence-page.tsx
@@ -2,46 +2,37 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
-
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
+
+type UpdateConfluencePageInterruptContext = {
+	account?: {
+		id: number;
 		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		account?: {
-			id: number;
-			name: string;
-			base_url: string;
-			auth_expired?: boolean;
-		};
-		page?: {
-			page_id: string;
-			page_title: string;
-			space_id: string;
-			body: string;
-			version: number;
-			document_id: number;
-			indexed_at?: string;
-		};
-		error?: string;
+		base_url: string;
+		auth_expired?: boolean;
 	};
-}
+	page?: {
+		page_id: string;
+		page_title: string;
+		space_id: string;
+		body: string;
+		version: number;
+		document_id: number;
+		indexed_at?: string;
+	};
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -74,22 +65,13 @@ interface InsufficientPermissionsResult {
 }
 
 type UpdateConfluencePageResult =
-	| InterruptResult
+	| InterruptResult<UpdateConfluencePageInterruptContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| AuthErrorResult
 	| InsufficientPermissionsResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -136,12 +118,8 @@ function ApprovalCard({
 		new_title?: string;
 		new_content?: string;
 	};
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<UpdateConfluencePageInterruptContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 
@@ -266,7 +244,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -502,18 +480,16 @@ export const UpdateConfluencePageToolUI = ({
 	},
 	UpdateConfluencePageResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<UpdateConfluencePageInterruptContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/dropbox/create-file.tsx b/surfsense_web/components/tool-ui/dropbox/create-file.tsx
index ac45f1f5b9..d810f88c26 100644
--- a/surfsense_web/components/tool-ui/dropbox/create-file.tsx
+++ b/surfsense_web/components/tool-ui/dropbox/create-file.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, FileIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, FileIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -15,7 +14,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface DropboxAccount {
 	id: number;
@@ -29,22 +34,12 @@ interface SupportedType {
 	label: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{ name: string; args: Record<string, unknown> }>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		accounts?: DropboxAccount[];
-		parent_folders?: Record<number, Array<{ folder_path: string; name: string }>>;
-		supported_types?: SupportedType[];
-		error?: string;
-	};
-}
+type DropboxCreateFileContext = {
+	accounts?: DropboxAccount[];
+	parent_folders?: Record<number, Array<{ folder_path: string; name: string }>>;
+	supported_types?: SupportedType[];
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -65,16 +60,11 @@ interface AuthErrorResult {
 	connector_type?: string;
 }
 
-type CreateDropboxFileResult = InterruptResult | SuccessResult | ErrorResult | AuthErrorResult;
-
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
+type CreateDropboxFileResult =
+	| InterruptResult<DropboxCreateFileContext>
+	| SuccessResult
+	| ErrorResult
+	| AuthErrorResult;
 
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
@@ -100,12 +90,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: { name: string; file_type?: string; content?: string };
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<DropboxCreateFileContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -241,7 +227,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -455,17 +441,14 @@ export const CreateDropboxFileToolUI = ({
 	{ name: string; file_type?: string; content?: string },
 	CreateDropboxFileResult
 >) => {
+	const { dispatch } = useHitlDecision();
 	if (!result) return null;
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<DropboxCreateFileContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/dropbox/trash-file.tsx b/surfsense_web/components/tool-ui/dropbox/trash-file.tsx
index 0b38777c12..57dff9d388 100644
--- a/surfsense_web/components/tool-ui/dropbox/trash-file.tsx
+++ b/surfsense_web/components/tool-ui/dropbox/trash-file.tsx
@@ -6,7 +6,8 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
 interface DropboxAccount {
 	id: number;
@@ -22,14 +23,11 @@ interface DropboxFile {
 	document_id?: number;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{ name: string; args: Record<string, unknown> }>;
-	review_configs: Array<{ action_name: string; allowed_decisions: Array<"approve" | "reject"> }>;
-	context?: { account?: DropboxAccount; file?: DropboxFile; error?: string };
-}
+type DropboxTrashFileContext = {
+	account?: DropboxAccount;
+	file?: DropboxFile;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -52,20 +50,12 @@ interface AuthErrorResult {
 }
 
 type DeleteDropboxFileResult =
-	| InterruptResult
+	| InterruptResult<DropboxTrashFileContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -95,12 +85,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<DropboxTrashFileContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -308,16 +294,13 @@ export const DeleteDropboxFileToolUI = ({
 	{ file_name: string; delete_from_kb?: boolean },
 	DeleteDropboxFileResult
 >) => {
+	const { dispatch } = useHitlDecision();
 	if (!result) return null;
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<DropboxTrashFileContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/generate-podcast.tsx b/surfsense_web/components/tool-ui/generate-podcast.tsx
index 02f53efadd..b690530a1b 100644
--- a/surfsense_web/components/tool-ui/generate-podcast.tsx
+++ b/surfsense_web/components/tool-ui/generate-podcast.tsx
@@ -264,7 +264,7 @@ function PodcastPlayer({
 					<div className="mx-5 h-px bg-border/50" />
 					<Accordion type="single" collapsible className="px-5">
 						<AccordionItem value="transcript" className="border-b-0">
-							<AccordionTrigger className="py-3 text-xs sm:text-sm font-medium text-muted-foreground hover:text-foreground hover:no-underline">
+							<AccordionTrigger className="py-3 text-xs sm:text-sm font-medium text-muted-foreground hover:text-accent-foreground hover:no-underline">
 								View transcript
 							</AccordionTrigger>
 							<AccordionContent className="pb-0">
@@ -416,9 +416,19 @@ export const GeneratePodcastToolUI = ({
 		return <PodcastErrorState title={title} error={result.error || "Generation failed"} />;
 	}
 
-	// Already generating - show simple warning, don't create another poller
-	// The FIRST tool call will display the podcast when ready
-	// (new: "generating", legacy: "already_generating")
+	// Pending/generating rows have a stable podcast_id, so the card can poll
+	// independently while the chat stream finishes.
+	if (
+		(result.status === "pending" ||
+			result.status === "generating" ||
+			result.status === "processing") &&
+		result.podcast_id
+	) {
+		return <PodcastStatusPoller podcastId={result.podcast_id} title={result.title || title} />;
+	}
+
+	// Legacy duplicate/no-ID result - show a simple warning, don't create
+	// another poller. The first tool call will display the podcast when ready.
 	if (result.status === "generating" || result.status === "already_generating") {
 		return (
 			<div className="my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 select-none">
@@ -432,11 +442,6 @@ export const GeneratePodcastToolUI = ({
 		);
 	}
 
-	// Pending - poll for completion (new: "pending" with podcast_id)
-	if (result.status === "pending" && result.podcast_id) {
-		return <PodcastStatusPoller podcastId={result.podcast_id} title={result.title || title} />;
-	}
-
 	// Ready with podcast_id (new: "ready", legacy: "success")
 	if ((result.status === "ready" || result.status === "success") && result.podcast_id) {
 		return <PodcastPlayer podcastId={result.podcast_id} title={result.title || title} />;
diff --git a/surfsense_web/components/tool-ui/generate-report.tsx b/surfsense_web/components/tool-ui/generate-report.tsx
index 7606a8723c..2e21dd4e7a 100644
--- a/surfsense_web/components/tool-ui/generate-report.tsx
+++ b/surfsense_web/components/tool-ui/generate-report.tsx
@@ -9,6 +9,7 @@ import { z } from "zod";
 import { openReportPanelAtom, reportPanelAtom } from "@/atoms/chat/report-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
+import { Button } from "@/components/ui/button";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { baseApiService } from "@/lib/apis/base-api.service";
 
@@ -96,8 +97,12 @@ function ReportErrorState({ title, error }: { title: string; error: string }) {
 			</div>
 			<div className="mx-5 h-px bg-border/50" />
 			<div className="px-5 py-4">
-				<p className="text-sm font-medium text-foreground line-clamp-2">{title}</p>
-				<p className="text-sm text-muted-foreground mt-1">{error}</p>
+				{title && title !== "Report" && (
+					<p className="text-sm font-medium text-foreground line-clamp-2">{title}</p>
+				)}
+				<p className={`text-sm text-muted-foreground${title && title !== "Report" ? " mt-1" : ""}`}>
+					{error}
+				</p>
 			</div>
 		</div>
 	);
@@ -133,10 +138,9 @@ function ReportCard({
 	const autoOpenedRef = useRef(false);
 	const [metadata, setMetadata] = useState<{
 		title: string;
-		wordCount: number | null;
 		versionLabel: string | null;
 		content: string | null;
-	}>({ title, wordCount: wordCount ?? null, versionLabel: null, content: null });
+	}>({ title, versionLabel: null, content: null });
 	const [isLoading, setIsLoading] = useState(true);
 	const [error, setError] = useState<string | null>(null);
 
@@ -165,10 +169,8 @@ function ReportCard({
 							}
 						}
 						const resolvedTitle = parsed.data.title || title;
-						const resolvedWordCount = parsed.data.report_metadata?.word_count ?? wordCount ?? null;
 						setMetadata({
 							title: resolvedTitle,
-							wordCount: resolvedWordCount,
 							versionLabel,
 							content: parsed.data.content ?? null,
 						});
@@ -178,7 +180,7 @@ function ReportCard({
 							openPanel({
 								reportId,
 								title: resolvedTitle,
-								wordCount: resolvedWordCount ?? undefined,
+								wordCount: parsed.data.report_metadata?.word_count ?? wordCount ?? undefined,
 								shareToken,
 							});
 						}
@@ -206,7 +208,6 @@ function ReportCard({
 		openPanel({
 			reportId,
 			title: metadata.title,
-			wordCount: metadata.wordCount ?? undefined,
 			shareToken,
 		});
 	};
@@ -215,18 +216,11 @@ function ReportCard({
 		<div
 			className={`my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 transition-[box-shadow] duration-300 ${isActive ? "ring-1 ring-primary/50" : ""}`}
 		>
-			{/* biome-ignore lint/a11y/useSemanticElements: can't use <button> here because PlateEditor renders nested <button> elements (e.g. CopyButton) */}
-			<div
-				role="button"
-				tabIndex={0}
+			<Button
+				variant="ghost"
+				type="button"
 				onClick={handleOpen}
-				onKeyDown={(e) => {
-					if (e.key === "Enter" || e.key === " ") {
-						e.preventDefault();
-						handleOpen();
-					}
-				}}
-				className="w-full text-left transition-colors hover:bg-muted/50 focus:outline-none focus-visible:outline-none cursor-pointer"
+				className="h-auto w-full flex-col items-stretch justify-start gap-0 rounded-2xl p-0 text-left font-normal whitespace-normal transition-colors hover:bg-accent hover:text-accent-foreground focus:outline-none focus-visible:outline-none cursor-pointer"
 			>
 				<div className="px-5 pt-5 pb-4 select-none">
 					<p className="text-sm font-semibold text-foreground line-clamp-2">
@@ -237,10 +231,8 @@ function ReportCard({
 							<span className="inline-block h-3 w-24 rounded bg-muted/60 animate-pulse" />
 						) : (
 							<>
-								{metadata.wordCount != null && `${metadata.wordCount.toLocaleString()} words`}
-								{metadata.wordCount != null && metadata.versionLabel && (
-									<Dot className="inline size-4" />
-								)}
+								Markdown
+								{metadata.versionLabel && <Dot className="inline size-4" />}
 								{metadata.versionLabel}
 							</>
 						)}
@@ -272,7 +264,7 @@ function ReportCard({
 						<p className="text-sm text-muted-foreground italic">No content available</p>
 					)}
 				</div>
-			</div>
+			</Button>
 		</div>
 	);
 }
diff --git a/surfsense_web/components/tool-ui/generate-resume.tsx b/surfsense_web/components/tool-ui/generate-resume.tsx
new file mode 100644
index 0000000000..448e8c531e
--- /dev/null
+++ b/surfsense_web/components/tool-ui/generate-resume.tsx
@@ -0,0 +1,385 @@
+"use client";
+
+import type { ToolCallMessagePartProps } from "@assistant-ui/react";
+import { useAtomValue, useSetAtom } from "jotai";
+import { Dot } from "lucide-react";
+import { useParams, usePathname } from "next/navigation";
+import * as pdfjsLib from "pdfjs-dist";
+import { useCallback, useEffect, useRef, useState } from "react";
+import { z } from "zod";
+import { openReportPanelAtom, reportPanelAtom } from "@/atoms/chat/report-panel.atom";
+import { TextShimmerLoader } from "@/components/prompt-kit/loader";
+import { Button } from "@/components/ui/button";
+import { useMediaQuery } from "@/hooks/use-media-query";
+import { baseApiService } from "@/lib/apis/base-api.service";
+import { getAuthHeaders } from "@/lib/auth-utils";
+
+pdfjsLib.GlobalWorkerOptions.workerSrc = new URL(
+	"pdfjs-dist/build/pdf.worker.min.mjs",
+	import.meta.url
+).toString();
+
+const GenerateResumeArgsSchema = z.object({
+	user_info: z.string(),
+	user_instructions: z.string().nullish(),
+	parent_report_id: z.number().nullish(),
+	max_pages: z.number().int().min(1).max(5).optional(),
+});
+
+const GenerateResumeResultSchema = z.object({
+	status: z.enum(["ready", "failed"]),
+	report_id: z.number().nullish(),
+	title: z.string().nullish(),
+	content_type: z.string().nullish(),
+	message: z.string().nullish(),
+	error: z.string().nullish(),
+});
+
+const ResumeVersionsResponseSchema = z.object({
+	id: z.number(),
+	versions: z
+		.array(
+			z.object({
+				id: z.number(),
+				created_at: z.string().nullish(),
+			})
+		)
+		.nullish(),
+});
+
+type GenerateResumeArgs = z.infer<typeof GenerateResumeArgsSchema>;
+type GenerateResumeResult = z.infer<typeof GenerateResumeResultSchema>;
+
+function ResumeGeneratingState() {
+	return (
+		<div className="my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 select-none">
+			<div className="px-5 pt-5 pb-4">
+				<div className="flex items-center gap-2">
+					<p className="text-sm font-semibold text-foreground">Resume</p>
+				</div>
+				<TextShimmerLoader text="Crafting your resume" size="sm" />
+			</div>
+			<div className="mx-5 h-px bg-border/50" />
+			<div className="px-5 pt-3 pb-4">
+				<div className="h-[7rem] space-y-2">
+					<div className="h-3 w-full rounded bg-muted/60 animate-pulse" />
+					<div className="h-3 w-[92%] rounded bg-muted/60 animate-pulse [animation-delay:100ms]" />
+					<div className="h-3 w-[75%] rounded bg-muted/60 animate-pulse [animation-delay:200ms]" />
+					<div className="h-3 w-[85%] rounded bg-muted/60 animate-pulse [animation-delay:300ms]" />
+					<div className="h-3 w-[60%] rounded bg-muted/60 animate-pulse [animation-delay:400ms]" />
+				</div>
+			</div>
+		</div>
+	);
+}
+
+function ResumeErrorState({ title, error }: { title: string; error: string }) {
+	return (
+		<div className="my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 select-none">
+			<div className="px-5 pt-5 pb-4">
+				<div className="flex items-center gap-2">
+					<p className="text-sm font-semibold text-destructive">Resume Generation Failed</p>
+				</div>
+			</div>
+			<div className="mx-5 h-px bg-border/50" />
+			<div className="px-5 py-4">
+				{title && title !== "Resume" && (
+					<p className="text-sm font-medium text-foreground line-clamp-2">{title}</p>
+				)}
+				<p className={`text-sm text-muted-foreground${title && title !== "Resume" ? " mt-1" : ""}`}>
+					{error}
+				</p>
+			</div>
+		</div>
+	);
+}
+
+function ResumeCancelledState() {
+	return (
+		<div className="my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 select-none">
+			<div className="px-5 pt-5 pb-4">
+				<div className="flex items-center gap-2">
+					<p className="text-sm font-semibold text-muted-foreground">Resume Cancelled</p>
+				</div>
+				<p className="text-xs text-muted-foreground mt-0.5">Resume generation was cancelled</p>
+			</div>
+		</div>
+	);
+}
+
+function ThumbnailSkeleton() {
+	return (
+		<div className="h-[7rem] space-y-2">
+			<div className="h-3 w-full rounded bg-muted/60 animate-pulse" />
+			<div className="h-3 w-[92%] rounded bg-muted/60 animate-pulse [animation-delay:100ms]" />
+			<div className="h-3 w-[75%] rounded bg-muted/60 animate-pulse [animation-delay:200ms]" />
+			<div className="h-3 w-[85%] rounded bg-muted/60 animate-pulse [animation-delay:300ms]" />
+			<div className="h-3 w-[60%] rounded bg-muted/60 animate-pulse [animation-delay:400ms]" />
+		</div>
+	);
+}
+
+function PdfThumbnail({
+	pdfUrl,
+	onLoad,
+	onError,
+}: {
+	pdfUrl: string;
+	onLoad: () => void;
+	onError: () => void;
+}) {
+	const wrapperRef = useRef<HTMLDivElement>(null);
+	const canvasRef = useRef<HTMLCanvasElement>(null);
+	const [ready, setReady] = useState(false);
+
+	useEffect(() => {
+		let cancelled = false;
+
+		const renderThumbnail = async () => {
+			try {
+				const loadingTask = pdfjsLib.getDocument({
+					url: pdfUrl,
+					httpHeaders: getAuthHeaders(),
+				});
+
+				const pdf = await loadingTask.promise;
+				if (cancelled) {
+					pdf.destroy();
+					return;
+				}
+
+				const page = await pdf.getPage(1);
+				if (cancelled) {
+					pdf.destroy();
+					return;
+				}
+
+				const canvas = canvasRef.current;
+				if (!canvas) {
+					pdf.destroy();
+					return;
+				}
+
+				const containerWidth = wrapperRef.current?.clientWidth || 400;
+				const unscaledViewport = page.getViewport({ scale: 1 });
+				const fitScale = containerWidth / unscaledViewport.width;
+				const viewport = page.getViewport({ scale: fitScale });
+				const dpr = window.devicePixelRatio || 1;
+
+				canvas.width = Math.ceil(viewport.width * dpr);
+				canvas.height = Math.ceil(viewport.height * dpr);
+
+				await page.render({
+					canvas,
+					viewport,
+					transform: dpr !== 1 ? [dpr, 0, 0, dpr, 0, 0] : undefined,
+				}).promise;
+
+				if (!cancelled) {
+					setReady(true);
+					onLoad();
+				}
+
+				pdf.destroy();
+			} catch {
+				if (!cancelled) onError();
+			}
+		};
+
+		renderThumbnail();
+		return () => {
+			cancelled = true;
+		};
+	}, [pdfUrl, onLoad, onError]);
+
+	return (
+		<div ref={wrapperRef}>
+			<canvas ref={canvasRef} className={ready ? "w-full h-auto" : "hidden"} />
+		</div>
+	);
+}
+
+function ResumeCard({
+	reportId,
+	title,
+	shareToken,
+	autoOpen = false,
+}: {
+	reportId: number;
+	title: string;
+	shareToken?: string | null;
+	autoOpen?: boolean;
+}) {
+	const openPanel = useSetAtom(openReportPanelAtom);
+	const panelState = useAtomValue(reportPanelAtom);
+	const isDesktop = useMediaQuery("(min-width: 768px)");
+	const autoOpenedRef = useRef(false);
+	const [pdfUrl, setPdfUrl] = useState<string | null>(null);
+	const [thumbState, setThumbState] = useState<"loading" | "ready" | "error">("loading");
+	const [versionLabel, setVersionLabel] = useState<string | null>(null);
+
+	useEffect(() => {
+		const previewPath = shareToken
+			? `/api/v1/public/${shareToken}/reports/${reportId}/preview`
+			: `/api/v1/reports/${reportId}/preview`;
+		setPdfUrl(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}${previewPath}`);
+
+		if (autoOpen && isDesktop && !autoOpenedRef.current) {
+			autoOpenedRef.current = true;
+			openPanel({
+				reportId,
+				title,
+				shareToken,
+				contentType: "typst",
+			});
+		}
+	}, [reportId, title, shareToken, autoOpen, isDesktop, openPanel]);
+
+	useEffect(() => {
+		let cancelled = false;
+		const fetchVersions = async () => {
+			try {
+				const url = shareToken
+					? `/api/v1/public/${shareToken}/reports/${reportId}/content`
+					: `/api/v1/reports/${reportId}/content`;
+				const rawData = await baseApiService.get<unknown>(url);
+				if (cancelled) return;
+				const parsed = ResumeVersionsResponseSchema.safeParse(rawData);
+				if (parsed.success) {
+					const versions = parsed.data.versions;
+					if (versions && versions.length > 1) {
+						const idx = versions.findIndex((v) => v.id === reportId);
+						if (idx >= 0) {
+							setVersionLabel(`version ${idx + 1}`);
+						}
+					}
+				}
+			} catch {
+				// silently ignore — version label is non-critical
+			}
+		};
+		fetchVersions();
+		return () => {
+			cancelled = true;
+		};
+	}, [reportId, shareToken]);
+
+	const onThumbLoad = useCallback(() => setThumbState("ready"), []);
+	const onThumbError = useCallback(() => setThumbState("error"), []);
+
+	const isActive = panelState.isOpen && panelState.reportId === reportId;
+
+	const handleOpen = () => {
+		openPanel({
+			reportId,
+			title,
+			shareToken,
+			contentType: "typst",
+		});
+	};
+
+	return (
+		<div
+			className={`my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 transition-[box-shadow] duration-300 ${isActive ? "ring-1 ring-primary/50" : ""}`}
+		>
+			<Button
+				variant="ghost"
+				type="button"
+				onClick={handleOpen}
+				className="h-auto w-full flex-col items-stretch justify-start gap-0 rounded-2xl p-0 text-left font-normal whitespace-normal transition-colors hover:bg-accent hover:text-accent-foreground focus:outline-none focus-visible:outline-none cursor-pointer select-none"
+			>
+				<div className="px-5 pt-5 pb-4">
+					<p className="text-sm font-semibold text-foreground line-clamp-2">{title}</p>
+					<p className="text-xs text-muted-foreground mt-0.5">
+						PDF
+						{versionLabel && <Dot className="inline size-4" />}
+						{versionLabel}
+					</p>
+				</div>
+
+				<div className="mx-5 h-px bg-border/50" />
+
+				<div className="px-5 pt-3 pb-4">
+					{thumbState === "loading" && <ThumbnailSkeleton />}
+					{thumbState === "error" && (
+						<p className="text-sm text-muted-foreground">Preview unavailable</p>
+					)}
+					{pdfUrl && (
+						<div
+							className={`max-h-[7rem] overflow-hidden pointer-events-none mix-blend-multiply dark:mix-blend-screen ${thumbState !== "ready" ? "hidden" : ""}`}
+							style={{
+								maskImage: "linear-gradient(to bottom, black 50%, transparent 100%)",
+								WebkitMaskImage: "linear-gradient(to bottom, black 50%, transparent 100%)",
+							}}
+						>
+							<div className="dark:invert dark:hue-rotate-180">
+								<PdfThumbnail pdfUrl={pdfUrl} onLoad={onThumbLoad} onError={onThumbError} />
+							</div>
+						</div>
+					)}
+				</div>
+			</Button>
+		</div>
+	);
+}
+
+export const GenerateResumeToolUI = ({
+	result,
+	status,
+}: ToolCallMessagePartProps<GenerateResumeArgs, GenerateResumeResult>) => {
+	const params = useParams();
+	const pathname = usePathname();
+	const isPublicRoute = pathname?.startsWith("/public/");
+	const shareToken = isPublicRoute && typeof params?.token === "string" ? params.token : null;
+
+	const sawRunningRef = useRef(false);
+	if (status.type === "running" || status.type === "requires-action") {
+		sawRunningRef.current = true;
+	}
+
+	if (status.type === "running" || status.type === "requires-action") {
+		return <ResumeGeneratingState />;
+	}
+
+	if (status.type === "incomplete") {
+		if (status.reason === "cancelled") {
+			return <ResumeCancelledState />;
+		}
+		if (status.reason === "error") {
+			return (
+				<ResumeErrorState
+					title="Resume"
+					error={typeof status.error === "string" ? status.error : "An error occurred"}
+				/>
+			);
+		}
+	}
+
+	if (!result) {
+		return <ResumeGeneratingState />;
+	}
+
+	if (result.status === "failed") {
+		return (
+			<ResumeErrorState
+				title={result.title || "Resume"}
+				error={
+					result.error || "Resume generation failed. Please try again or rephrase your request."
+				}
+			/>
+		);
+	}
+
+	if (result.status === "ready" && result.report_id) {
+		return (
+			<ResumeCard
+				reportId={result.report_id}
+				title={result.title || "Resume"}
+				shareToken={shareToken}
+				autoOpen={sawRunningRef.current}
+			/>
+		);
+	}
+
+	return <ResumeErrorState title="Resume" error="Missing report ID" />;
+};
diff --git a/surfsense_web/components/tool-ui/gmail/create-draft.tsx b/surfsense_web/components/tool-ui/gmail/create-draft.tsx
index 97e93fecb9..0c0f3a71c1 100644
--- a/surfsense_web/components/tool-ui/gmail/create-draft.tsx
+++ b/surfsense_web/components/tool-ui/gmail/create-draft.tsx
@@ -2,10 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen, UserIcon, UsersIcon } from "lucide-react";
+import { CornerDownLeftIcon, Pencil, UserIcon, UsersIcon } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import type { ExtraField } from "@/atoms/chat/hitl-edit-panel.atom";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -16,7 +14,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { ExtraField, HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface GmailAccount {
 	id: number;
@@ -25,23 +29,10 @@ interface GmailAccount {
 	auth_expired?: boolean;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		accounts?: GmailAccount[];
-		error?: string;
-	};
-}
+type GmailCreateDraftContext = {
+	accounts?: GmailAccount[];
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -68,21 +59,12 @@ interface InsufficientPermissionsResult {
 }
 
 type CreateGmailDraftResult =
-	| InterruptResult
+	| InterruptResult<GmailCreateDraftContext>
 	| SuccessResult
 	| ErrorResult
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -116,12 +98,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: { to: string; subject: string; body: string; cc?: string; bcc?: string };
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<GmailCreateDraftContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -275,7 +253,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -473,18 +451,16 @@ export const CreateGmailDraftToolUI = ({
 	{ to: string; subject: string; body: string; cc?: string; bcc?: string },
 	CreateGmailDraftResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<GmailCreateDraftContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/gmail/send-email.tsx b/surfsense_web/components/tool-ui/gmail/send-email.tsx
index e0e1ffa6c7..defd795bf8 100644
--- a/surfsense_web/components/tool-ui/gmail/send-email.tsx
+++ b/surfsense_web/components/tool-ui/gmail/send-email.tsx
@@ -2,10 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, MailIcon, Pen, UserIcon, UsersIcon } from "lucide-react";
+import { CornerDownLeftIcon, MailIcon, Pencil, UserIcon, UsersIcon } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import type { ExtraField } from "@/atoms/chat/hitl-edit-panel.atom";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -16,7 +14,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { ExtraField, HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface GmailAccount {
 	id: number;
@@ -25,23 +29,10 @@ interface GmailAccount {
 	auth_expired?: boolean;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		accounts?: GmailAccount[];
-		error?: string;
-	};
-}
+type GmailSendEmailContext = {
+	accounts?: GmailAccount[];
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -67,21 +58,12 @@ interface InsufficientPermissionsResult {
 }
 
 type SendGmailEmailResult =
-	| InterruptResult
+	| InterruptResult<GmailSendEmailContext>
 	| SuccessResult
 	| ErrorResult
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -115,12 +97,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: { to: string; subject: string; body: string; cc?: string; bcc?: string };
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<GmailSendEmailContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -274,7 +252,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -471,18 +449,16 @@ export const SendGmailEmailToolUI = ({
 	{ to: string; subject: string; body: string; cc?: string; bcc?: string },
 	SendGmailEmailResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<GmailSendEmailContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/gmail/trash-email.tsx b/surfsense_web/components/tool-ui/gmail/trash-email.tsx
index f1640f3fcf..d2b4c71983 100644
--- a/surfsense_web/components/tool-ui/gmail/trash-email.tsx
+++ b/surfsense_web/components/tool-ui/gmail/trash-email.tsx
@@ -6,7 +6,8 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
 interface GmailAccount {
 	id: number;
@@ -25,24 +26,11 @@ interface GmailMessage {
 	document_id: number;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "reject">;
-	}>;
-	context?: {
-		account?: GmailAccount;
-		email?: GmailMessage;
-		error?: string;
-	};
-}
+type GmailTrashEmailContext = {
+	account?: GmailAccount;
+	email?: GmailMessage;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -74,22 +62,13 @@ interface InsufficientPermissionsResult {
 }
 
 type TrashGmailEmailResult =
-	| InterruptResult
+	| InterruptResult<GmailTrashEmailContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -134,12 +113,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<GmailTrashEmailContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -385,18 +360,15 @@ export const TrashGmailEmailToolUI = ({
 	{ email_subject_or_id: string; delete_from_kb?: boolean },
 	TrashGmailEmailResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<GmailTrashEmailContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/gmail/update-draft.tsx b/surfsense_web/components/tool-ui/gmail/update-draft.tsx
index b3b0d2d212..1a05dc972f 100644
--- a/surfsense_web/components/tool-ui/gmail/update-draft.tsx
+++ b/surfsense_web/components/tool-ui/gmail/update-draft.tsx
@@ -2,14 +2,18 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, MailIcon, Pen, UserIcon, UsersIcon } from "lucide-react";
+import { CornerDownLeftIcon, MailIcon, Pencil, UserIcon, UsersIcon } from "lucide-react";
 import { useCallback, useEffect, useState } from "react";
-import type { ExtraField } from "@/atoms/chat/hitl-edit-panel.atom";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { ExtraField, HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface GmailAccount {
 	id: number;
@@ -28,26 +32,13 @@ interface GmailMessage {
 	document_id: number;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		account?: GmailAccount;
-		email?: GmailMessage;
-		draft_id?: string;
-		existing_body?: string;
-		error?: string;
-	};
-}
+type GmailUpdateDraftContext = {
+	account?: GmailAccount;
+	email?: GmailMessage;
+	draft_id?: string;
+	existing_body?: string;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -78,22 +69,13 @@ interface InsufficientPermissionsResult {
 }
 
 type UpdateGmailDraftResult =
-	| InterruptResult
+	| InterruptResult<GmailUpdateDraftContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -143,12 +125,8 @@ function ApprovalCard({
 		cc?: string;
 		bcc?: string;
 	};
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<GmailUpdateDraftContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -307,7 +285,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -522,20 +500,16 @@ export const UpdateGmailDraftToolUI = ({
 	},
 	UpdateGmailDraftResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", {
-							detail: { decisions: [decision] },
-						})
-					);
-				}}
+				interruptData={result as InterruptResult<GmailUpdateDraftContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/google-calendar/create-event.tsx b/surfsense_web/components/tool-ui/google-calendar/create-event.tsx
index 37e5028e7f..4835c5694f 100644
--- a/surfsense_web/components/tool-ui/google-calendar/create-event.tsx
+++ b/surfsense_web/components/tool-ui/google-calendar/create-event.tsx
@@ -2,10 +2,15 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { ClockIcon, CornerDownLeftIcon, GlobeIcon, MapPinIcon, Pen, UsersIcon } from "lucide-react";
+import {
+	ClockIcon,
+	CornerDownLeftIcon,
+	GlobeIcon,
+	MapPinIcon,
+	Pencil,
+	UsersIcon,
+} from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import type { ExtraField } from "@/atoms/chat/hitl-edit-panel.atom";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -16,7 +21,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { ExtraField, HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface GoogleCalendarAccount {
 	id: number;
@@ -30,25 +41,12 @@ interface CalendarEntry {
 	primary?: boolean;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		accounts?: GoogleCalendarAccount[];
-		calendars?: CalendarEntry[];
-		timezone?: string;
-		error?: string;
-	};
-}
+type CalendarCreateEventContext = {
+	accounts?: GoogleCalendarAccount[];
+	calendars?: CalendarEntry[];
+	timezone?: string;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -75,21 +73,12 @@ interface InsufficientPermissionsResult {
 }
 
 type CreateCalendarEventResult =
-	| InterruptResult
+	| InterruptResult<CalendarCreateEventContext>
 	| SuccessResult
 	| ErrorResult
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -141,12 +130,8 @@ function ApprovalCard({
 		location?: string;
 		attendees?: string[];
 	};
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<CalendarCreateEventContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -356,7 +341,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -620,18 +605,16 @@ export const CreateCalendarEventToolUI = ({
 	},
 	CreateCalendarEventResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<CalendarCreateEventContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/google-calendar/delete-event.tsx b/surfsense_web/components/tool-ui/google-calendar/delete-event.tsx
index 039c0e3b0d..5c5cdea636 100644
--- a/surfsense_web/components/tool-ui/google-calendar/delete-event.tsx
+++ b/surfsense_web/components/tool-ui/google-calendar/delete-event.tsx
@@ -6,7 +6,8 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
 interface GoogleCalendarAccount {
 	id: number;
@@ -27,24 +28,11 @@ interface CalendarEvent {
 	indexed_at?: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "reject">;
-	}>;
-	context?: {
-		account?: GoogleCalendarAccount;
-		event?: CalendarEvent;
-		error?: string;
-	};
-}
+type CalendarDeleteEventContext = {
+	account?: GoogleCalendarAccount;
+	event?: CalendarEvent;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -83,7 +71,7 @@ interface InsufficientPermissionsResult {
 }
 
 type DeleteCalendarEventResult =
-	| InterruptResult
+	| InterruptResult<CalendarDeleteEventContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
@@ -91,15 +79,6 @@ type DeleteCalendarEventResult =
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -162,12 +141,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<CalendarDeleteEventContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -437,18 +412,15 @@ export const DeleteCalendarEventToolUI = ({
 	{ event_title_or_id: string; delete_from_kb?: boolean },
 	DeleteCalendarEventResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<CalendarDeleteEventContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/google-calendar/update-event.tsx b/surfsense_web/components/tool-ui/google-calendar/update-event.tsx
index a2304b179f..fcab5091ec 100644
--- a/surfsense_web/components/tool-ui/google-calendar/update-event.tsx
+++ b/surfsense_web/components/tool-ui/google-calendar/update-event.tsx
@@ -7,16 +7,20 @@ import {
 	ClockIcon,
 	CornerDownLeftIcon,
 	MapPinIcon,
-	Pen,
+	Pencil,
 	UsersIcon,
 } from "lucide-react";
 import { useCallback, useEffect, useState } from "react";
-import type { ExtraField } from "@/atoms/chat/hitl-edit-panel.atom";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { ExtraField, HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface GoogleCalendarAccount {
 	id: number;
@@ -37,24 +41,11 @@ interface CalendarEvent {
 	indexed_at?: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		account?: GoogleCalendarAccount;
-		event?: CalendarEvent;
-		error?: string;
-	};
-}
+type CalendarUpdateEventContext = {
+	account?: GoogleCalendarAccount;
+	event?: CalendarEvent;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -86,22 +77,13 @@ interface InsufficientPermissionsResult {
 }
 
 type UpdateCalendarEventResult =
-	| InterruptResult
+	| InterruptResult<CalendarUpdateEventContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -163,12 +145,8 @@ function ApprovalCard({
 		new_location?: string;
 		new_attendees?: string[];
 	};
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<CalendarUpdateEventContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const actionArgs = interruptData.action_requests[0]?.args ?? {};
@@ -439,7 +417,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -686,18 +664,16 @@ export const UpdateCalendarEventToolUI = ({
 	},
 	UpdateCalendarEventResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<CalendarUpdateEventContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/google-drive/create-file.tsx b/surfsense_web/components/tool-ui/google-drive/create-file.tsx
index af263032da..a38abea041 100644
--- a/surfsense_web/components/tool-ui/google-drive/create-file.tsx
+++ b/surfsense_web/components/tool-ui/google-drive/create-file.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, FileIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, FileIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -15,7 +14,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface GoogleDriveAccount {
 	id: number;
@@ -23,25 +28,12 @@ interface GoogleDriveAccount {
 	auth_expired?: boolean;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		accounts?: GoogleDriveAccount[];
-		supported_types?: string[];
-		parent_folders?: Record<number, Array<{ folder_id: string; name: string }>>;
-		error?: string;
-	};
-}
+type DriveCreateFileContext = {
+	accounts?: GoogleDriveAccount[];
+	supported_types?: string[];
+	parent_folders?: Record<number, Array<{ folder_id: string; name: string }>>;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -69,21 +61,12 @@ interface AuthErrorResult {
 }
 
 type CreateGoogleDriveFileResult =
-	| InterruptResult
+	| InterruptResult<DriveCreateFileContext>
 	| SuccessResult
 	| ErrorResult
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -122,12 +105,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: { name: string; file_type: string; content?: string };
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<DriveCreateFileContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -264,7 +243,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -499,18 +478,15 @@ export const CreateGoogleDriveFileToolUI = ({
 	{ name: string; file_type: string; content?: string },
 	CreateGoogleDriveFileResult
 >) => {
+	const { dispatch } = useHitlDecision();
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<DriveCreateFileContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/google-drive/trash-file.tsx b/surfsense_web/components/tool-ui/google-drive/trash-file.tsx
index f5ac11a3c8..bcce546feb 100644
--- a/surfsense_web/components/tool-ui/google-drive/trash-file.tsx
+++ b/surfsense_web/components/tool-ui/google-drive/trash-file.tsx
@@ -6,7 +6,8 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
 interface GoogleDriveAccount {
 	id: number;
@@ -21,24 +22,11 @@ interface GoogleDriveFile {
 	web_view_link: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "reject">;
-	}>;
-	context?: {
-		account?: GoogleDriveAccount;
-		file?: GoogleDriveFile;
-		error?: string;
-	};
-}
+type DriveTrashFileContext = {
+	account?: GoogleDriveAccount;
+	file?: GoogleDriveFile;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -77,7 +65,7 @@ interface AuthErrorResult {
 }
 
 type DeleteGoogleDriveFileResult =
-	| InterruptResult
+	| InterruptResult<DriveTrashFileContext>
 	| SuccessResult
 	| WarningResult
 	| ErrorResult
@@ -85,15 +73,6 @@ type DeleteGoogleDriveFileResult =
 	| InsufficientPermissionsResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -151,12 +130,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<DriveTrashFileContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -416,18 +391,14 @@ export const DeleteGoogleDriveFileToolUI = ({
 	{ file_name: string; delete_from_kb?: boolean },
 	DeleteGoogleDriveFileResult
 >) => {
+	const { dispatch } = useHitlDecision();
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<DriveTrashFileContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/image/index.tsx b/surfsense_web/components/tool-ui/image/index.tsx
index 9c39f49287..0536ede66f 100644
--- a/surfsense_web/components/tool-ui/image/index.tsx
+++ b/surfsense_web/components/tool-ui/image/index.tsx
@@ -288,7 +288,7 @@ export function Image({
 							alt={alt}
 							width={0}
 							height={0}
-							sizes="100vw"
+							sizes={`(max-width: ${maxWidth}) 100vw, ${maxWidth}`}
 							loading="eager"
 							className={cn(
 								"w-full h-auto transition-transform duration-300",
@@ -307,7 +307,7 @@ export function Image({
 							src={src}
 							alt={alt}
 							fill
-							sizes="(max-width: 512px) 100vw, 512px"
+							sizes={`(max-width: ${maxWidth}) 100vw, ${maxWidth}`}
 							className={cn(
 								"transition-transform duration-300",
 								fit === "cover" ? "object-cover" : "object-contain",
diff --git a/surfsense_web/components/tool-ui/index.ts b/surfsense_web/components/tool-ui/index.ts
index 6371554ae8..4d885a38ce 100644
--- a/surfsense_web/components/tool-ui/index.ts
+++ b/surfsense_web/components/tool-ui/index.ts
@@ -51,17 +51,11 @@ export {
 	SandboxExecuteToolUI,
 } from "./sandbox-execute";
 export {
-	type MemoryItem,
-	type RecallMemoryArgs,
-	RecallMemoryArgsSchema,
-	type RecallMemoryResult,
-	RecallMemoryResultSchema,
-	RecallMemoryToolUI,
-	type SaveMemoryArgs,
-	SaveMemoryArgsSchema,
-	type SaveMemoryResult,
-	SaveMemoryResultSchema,
-	SaveMemoryToolUI,
+	type UpdateMemoryArgs,
+	UpdateMemoryArgsSchema,
+	type UpdateMemoryResult,
+	UpdateMemoryResultSchema,
+	UpdateMemoryToolUI,
 } from "./user-memory";
 export { GenerateVideoPresentationToolUI } from "./video-presentation";
 export { type WriteTodosData, WriteTodosSchema, WriteTodosToolUI } from "./write-todos";
diff --git a/surfsense_web/components/tool-ui/jira/create-jira-issue.tsx b/surfsense_web/components/tool-ui/jira/create-jira-issue.tsx
index cabb0d8400..1e91114012 100644
--- a/surfsense_web/components/tool-ui/jira/create-jira-issue.tsx
+++ b/surfsense_web/components/tool-ui/jira/create-jira-issue.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -15,7 +14,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface JiraAccount {
 	id: number;
@@ -40,27 +45,13 @@ interface JiraPriority {
 	name: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		accounts?: JiraAccount[];
-		projects?: JiraProject[];
-		issue_types?: JiraIssueType[];
-		priorities?: JiraPriority[];
-		error?: string;
-	};
-}
+type CreateJiraIssueInterruptContext = {
+	accounts?: JiraAccount[];
+	projects?: JiraProject[];
+	issue_types?: JiraIssueType[];
+	priorities?: JiraPriority[];
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -88,21 +79,12 @@ interface InsufficientPermissionsResult {
 }
 
 type CreateJiraIssueResult =
-	| InterruptResult
+	| InterruptResult<CreateJiraIssueInterruptContext>
 	| SuccessResult
 	| ErrorResult
 	| AuthErrorResult
 	| InsufficientPermissionsResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -142,12 +124,8 @@ function ApprovalCard({
 		description?: string;
 		priority?: string;
 	};
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<CreateJiraIssueInterruptContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -282,7 +260,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -549,18 +527,16 @@ export const CreateJiraIssueToolUI = ({
 	},
 	CreateJiraIssueResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<CreateJiraIssueInterruptContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/jira/delete-jira-issue.tsx b/surfsense_web/components/tool-ui/jira/delete-jira-issue.tsx
index 2cad2be461..20044b1b68 100644
--- a/surfsense_web/components/tool-ui/jira/delete-jira-issue.tsx
+++ b/surfsense_web/components/tool-ui/jira/delete-jira-issue.tsx
@@ -6,7 +6,8 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
 interface JiraAccount {
 	id: number;
@@ -23,25 +24,11 @@ interface JiraIssue {
 	document_id?: number;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		account?: JiraAccount;
-		issue?: JiraIssue;
-		error?: string;
-	};
-}
+type DeleteJiraIssueInterruptContext = {
+	account?: JiraAccount;
+	issue?: JiraIssue;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -79,7 +66,7 @@ interface InsufficientPermissionsResult {
 }
 
 type DeleteJiraIssueResult =
-	| InterruptResult
+	| InterruptResult<DeleteJiraIssueInterruptContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
@@ -87,15 +74,6 @@ type DeleteJiraIssueResult =
 	| AuthErrorResult
 	| InsufficientPermissionsResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -147,12 +125,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<DeleteJiraIssueInterruptContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -399,18 +373,15 @@ export const DeleteJiraIssueToolUI = ({
 	{ issue_title_or_key: string; delete_from_kb?: boolean },
 	DeleteJiraIssueResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<DeleteJiraIssueInterruptContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/jira/update-jira-issue.tsx b/surfsense_web/components/tool-ui/jira/update-jira-issue.tsx
index 4132fad067..030f16af84 100644
--- a/surfsense_web/components/tool-ui/jira/update-jira-issue.tsx
+++ b/surfsense_web/components/tool-ui/jira/update-jira-issue.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Badge } from "@/components/ui/badge";
@@ -16,7 +15,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface JiraIssue {
 	issue_id: string;
@@ -43,26 +48,12 @@ interface JiraPriority {
 	name: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		account?: JiraAccount;
-		issue?: JiraIssue;
-		priorities?: JiraPriority[];
-		error?: string;
-	};
-}
+type UpdateJiraIssueInterruptContext = {
+	account?: JiraAccount;
+	issue?: JiraIssue;
+	priorities?: JiraPriority[];
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -95,22 +86,13 @@ interface InsufficientPermissionsResult {
 }
 
 type UpdateJiraIssueResult =
-	| InterruptResult
+	| InterruptResult<UpdateJiraIssueInterruptContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| AuthErrorResult
 	| InsufficientPermissionsResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -158,12 +140,8 @@ function ApprovalCard({
 		new_description?: string;
 		new_priority?: string;
 	};
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<UpdateJiraIssueInterruptContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 
@@ -298,7 +276,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -563,18 +541,16 @@ export const UpdateJiraIssueToolUI = ({
 	},
 	UpdateJiraIssueResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<UpdateJiraIssueInterruptContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/linear/create-linear-issue.tsx b/surfsense_web/components/tool-ui/linear/create-linear-issue.tsx
index 1dec11c601..0970cd124d 100644
--- a/surfsense_web/components/tool-ui/linear/create-linear-issue.tsx
+++ b/surfsense_web/components/tool-ui/linear/create-linear-issue.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Badge } from "@/components/ui/badge";
@@ -17,7 +16,13 @@ import {
 	SelectValue,
 } from "@/components/ui/select";
 import { ToggleGroup, ToggleGroupItem } from "@/components/ui/toggle-group";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface LinearLabel {
 	id: string;
@@ -64,24 +69,10 @@ interface LinearWorkspace {
 	auth_expired?: boolean;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		workspaces?: LinearWorkspace[];
-		error?: string;
-	};
-}
+type LinearCreateIssueContext = {
+	workspaces?: LinearWorkspace[];
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -103,16 +94,11 @@ interface AuthErrorResult {
 	connector_type: string;
 }
 
-type CreateLinearIssueResult = InterruptResult | SuccessResult | ErrorResult | AuthErrorResult;
-
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
+type CreateLinearIssueResult =
+	| InterruptResult<LinearCreateIssueContext>
+	| SuccessResult
+	| ErrorResult
+	| AuthErrorResult;
 
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
@@ -138,12 +124,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: { title: string; description?: string };
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<LinearCreateIssueContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -290,7 +272,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -609,18 +591,16 @@ export const CreateLinearIssueToolUI = ({
 	args,
 	result,
 }: ToolCallMessagePartProps<{ title: string; description?: string }, CreateLinearIssueResult>) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<LinearCreateIssueContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/linear/delete-linear-issue.tsx b/surfsense_web/components/tool-ui/linear/delete-linear-issue.tsx
index 18790f5606..de04a5fa42 100644
--- a/surfsense_web/components/tool-ui/linear/delete-linear-issue.tsx
+++ b/surfsense_web/components/tool-ui/linear/delete-linear-issue.tsx
@@ -6,34 +6,21 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
-
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{
-		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		workspace?: { id: number; organization_name: string };
-		issue?: {
-			id: string;
-			identifier: string;
-			title: string;
-			state?: string;
-			document_id?: number;
-			indexed_at?: string;
-		};
-		error?: string;
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
+
+type LinearDeleteIssueContext = {
+	workspace?: { id: number; organization_name: string };
+	issue?: {
+		id: string;
+		identifier: string;
+		title: string;
+		state?: string;
+		document_id?: number;
+		indexed_at?: string;
 	};
-}
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -65,22 +52,13 @@ interface AuthErrorResult {
 }
 
 type DeleteLinearIssueResult =
-	| InterruptResult
+	| InterruptResult<LinearDeleteIssueContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| WarningResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -123,12 +101,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<LinearDeleteIssueContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -366,18 +340,15 @@ export const DeleteLinearIssueToolUI = ({
 	{ issue_ref: string; delete_from_kb?: boolean },
 	DeleteLinearIssueResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<LinearDeleteIssueContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/linear/update-linear-issue.tsx b/surfsense_web/components/tool-ui/linear/update-linear-issue.tsx
index f15f042240..c24023e343 100644
--- a/surfsense_web/components/tool-ui/linear/update-linear-issue.tsx
+++ b/surfsense_web/components/tool-ui/linear/update-linear-issue.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Badge } from "@/components/ui/badge";
@@ -17,7 +16,13 @@ import {
 	SelectValue,
 } from "@/components/ui/select";
 import { ToggleGroup, ToggleGroupItem } from "@/components/ui/toggle-group";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface LinearLabel {
 	id: string;
@@ -45,46 +50,32 @@ interface LinearPriority {
 	label: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
+type LinearUpdateIssueContext = {
+	workspace?: { id: number; organization_name: string };
+	priorities?: LinearPriority[];
+	issue?: {
+		id: string;
+		identifier: string;
+		title: string;
+		description?: string;
+		priority: number;
+		url: string;
+		current_state?: LinearState;
+		current_assignee?: { id: string; name: string; email: string } | null;
+		current_labels?: LinearLabel[];
+		team_id: string;
+		document_id: number;
+	};
+	team?: {
+		id: string;
 		name: string;
-		args: Record<string, unknown>;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	context?: {
-		workspace?: { id: number; organization_name: string };
-		priorities?: LinearPriority[];
-		issue?: {
-			id: string;
-			identifier: string;
-			title: string;
-			description?: string;
-			priority: number;
-			url: string;
-			current_state?: LinearState;
-			current_assignee?: { id: string; name: string; email: string } | null;
-			current_labels?: LinearLabel[];
-			team_id: string;
-			document_id: number;
-		};
-		team?: {
-			id: string;
-			name: string;
-			key: string;
-			states: LinearState[];
-			members: LinearMember[];
-			labels: LinearLabel[];
-		};
-		error?: string;
+		key: string;
+		states: LinearState[];
+		members: LinearMember[];
+		labels: LinearLabel[];
 	};
-}
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -111,21 +102,12 @@ interface AuthErrorResult {
 }
 
 type UpdateLinearIssueResult =
-	| InterruptResult
+	| InterruptResult<LinearUpdateIssueContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -167,12 +149,8 @@ function ApprovalCard({
 		new_priority?: number;
 		new_label_names?: string[];
 	};
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<LinearUpdateIssueContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 
@@ -357,7 +335,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -752,18 +730,16 @@ export const UpdateLinearIssueToolUI = ({
 	},
 	UpdateLinearIssueResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<LinearUpdateIssueContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/notion/create-notion-page.tsx b/surfsense_web/components/tool-ui/notion/create-notion-page.tsx
index 52ac363ee2..6d783ab307 100644
--- a/surfsense_web/components/tool-ui/notion/create-notion-page.tsx
+++ b/surfsense_web/components/tool-ui/notion/create-notion-page.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -15,43 +14,33 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
-
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
+
+type NotionCreatePageContext = {
+	accounts?: Array<{
+		id: number;
 		name: string;
-		args: Record<string, unknown>;
-		description?: string;
+		workspace_id: string | null;
+		workspace_name: string;
+		workspace_icon: string;
+		auth_expired?: boolean;
 	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	message?: string;
-	context?: {
-		accounts?: Array<{
-			id: number;
-			name: string;
-			workspace_id: string | null;
-			workspace_name: string;
-			workspace_icon: string;
-			auth_expired?: boolean;
-		}>;
-		parent_pages?: Record<
-			number,
-			Array<{
-				page_id: string;
-				title: string;
-				document_id: number;
-			}>
-		>;
-		error?: string;
-	};
-}
+	parent_pages?: Record<
+		number,
+		Array<{
+			page_id: string;
+			title: string;
+			document_id: number;
+		}>
+	>;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -75,16 +64,11 @@ interface AuthErrorResult {
 	connector_type: string;
 }
 
-type CreateNotionPageResult = InterruptResult | SuccessResult | ErrorResult | AuthErrorResult;
-
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
+type CreateNotionPageResult =
+	| InterruptResult<NotionCreatePageContext>
+	| SuccessResult
+	| ErrorResult
+	| AuthErrorResult;
 
 function isAuthErrorResult(result: unknown): result is AuthErrorResult {
 	return (
@@ -110,12 +94,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: Record<string, unknown>;
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<NotionCreatePageContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -242,7 +222,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -449,19 +429,16 @@ export const CreateNotionPageToolUI = ({
 	args,
 	result,
 }: ToolCallMessagePartProps<{ title: string; content: string }, CreateNotionPageResult>) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<NotionCreatePageContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/notion/delete-notion-page.tsx b/surfsense_web/components/tool-ui/notion/delete-notion-page.tsx
index 792a3e8c40..0f54a96c92 100644
--- a/surfsense_web/components/tool-ui/notion/delete-notion-page.tsx
+++ b/surfsense_web/components/tool-ui/notion/delete-notion-page.tsx
@@ -6,38 +6,23 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{
+type NotionDeletePageContext = {
+	account?: {
+		id: number;
 		name: string;
-		args: Record<string, unknown>;
-		description?: string;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "reject">;
-	}>;
-	interrupt_type?: string;
-	message?: string;
-	context?: {
-		account?: {
-			id: number;
-			name: string;
-			workspace_id: string | null;
-			workspace_name: string;
-			workspace_icon: string;
-		};
-		page_id?: string;
-		current_title?: string;
-		document_id?: number;
-		indexed_at?: string;
-		error?: string;
+		workspace_id: string | null;
+		workspace_name: string;
+		workspace_icon: string;
 	};
-}
+	page_id?: string;
+	current_title?: string;
+	document_id?: number;
+	indexed_at?: string;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -73,22 +58,13 @@ interface AuthErrorResult {
 }
 
 type DeleteNotionPageResult =
-	| InterruptResult
+	| InterruptResult<NotionDeletePageContext>
 	| SuccessResult
 	| ErrorResult
 	| InfoResult
 	| WarningResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -131,12 +107,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<NotionDeletePageContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -378,18 +350,15 @@ export const DeleteNotionPageToolUI = ({
 	{ page_title: string; delete_from_kb?: boolean },
 	DeleteNotionPageResult
 >) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<NotionDeletePageContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/notion/update-notion-page.tsx b/surfsense_web/components/tool-ui/notion/update-notion-page.tsx
index 5b7ce7daae..ef40c31991 100644
--- a/surfsense_web/components/tool-ui/notion/update-notion-page.tsx
+++ b/surfsense_web/components/tool-ui/notion/update-notion-page.tsx
@@ -2,44 +2,33 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
-
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
+
+type NotionUpdatePageContext = {
+	account?: {
+		id: number;
 		name: string;
-		args: Record<string, unknown>;
-		description?: string;
-	}>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	interrupt_type?: string;
-	message?: string;
-	context?: {
-		account?: {
-			id: number;
-			name: string;
-			workspace_id: string | null;
-			workspace_name: string;
-			workspace_icon: string;
-		};
-		page_id?: string;
-		current_title?: string;
-		document_id?: number;
-		indexed_at?: string;
-		error?: string;
+		workspace_id: string | null;
+		workspace_name: string;
+		workspace_icon: string;
 	};
-}
+	page_id?: string;
+	current_title?: string;
+	document_id?: number;
+	indexed_at?: string;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -69,21 +58,12 @@ interface AuthErrorResult {
 }
 
 type UpdateNotionPageResult =
-	| InterruptResult
+	| InterruptResult<NotionUpdatePageContext>
 	| SuccessResult
 	| ErrorResult
 	| InfoResult
 	| AuthErrorResult;
 
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
-
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -117,12 +97,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: Record<string, unknown>;
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<NotionUpdatePageContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -223,7 +199,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -399,19 +375,16 @@ export const UpdateNotionPageToolUI = ({
 	args,
 	result,
 }: ToolCallMessagePartProps<{ page_title: string; content: string }, UpdateNotionPageResult>) => {
+	const { dispatch } = useHitlDecision();
+
 	if (!result) return null;
 
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					const event = new CustomEvent("hitl-decision", {
-						detail: { decisions: [decision] },
-					});
-					window.dispatchEvent(event);
-				}}
+				interruptData={result as InterruptResult<NotionUpdatePageContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/onedrive/create-file.tsx b/surfsense_web/components/tool-ui/onedrive/create-file.tsx
index 244b894dc7..c5f4db3656 100644
--- a/surfsense_web/components/tool-ui/onedrive/create-file.tsx
+++ b/surfsense_web/components/tool-ui/onedrive/create-file.tsx
@@ -2,9 +2,8 @@
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
 import { useSetAtom } from "jotai";
-import { CornerDownLeftIcon, FileIcon, Pen } from "lucide-react";
+import { CornerDownLeftIcon, FileIcon, Pencil } from "lucide-react";
 import { useCallback, useEffect, useMemo, useState } from "react";
-import { openHitlEditPanelAtom } from "@/atoms/chat/hitl-edit-panel.atom";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
@@ -15,7 +14,13 @@ import {
 	SelectTrigger,
 	SelectValue,
 } from "@/components/ui/select";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import {
+	isInterruptResult,
+	openHitlEditPanelAtom,
+	useHitlDecision,
+	useHitlPhase,
+} from "@/features/chat-messages/hitl";
 
 interface OneDriveAccount {
 	id: number;
@@ -24,21 +29,11 @@ interface OneDriveAccount {
 	auth_expired?: boolean;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject" | "edit";
-	__completed__?: boolean;
-	action_requests: Array<{ name: string; args: Record<string, unknown> }>;
-	review_configs: Array<{
-		action_name: string;
-		allowed_decisions: Array<"approve" | "edit" | "reject">;
-	}>;
-	context?: {
-		accounts?: OneDriveAccount[];
-		parent_folders?: Record<number, Array<{ folder_id: string; name: string }>>;
-		error?: string;
-	};
-}
+type OneDriveCreateFileContext = {
+	accounts?: OneDriveAccount[];
+	parent_folders?: Record<number, Array<{ folder_id: string; name: string }>>;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -59,16 +54,11 @@ interface AuthErrorResult {
 	connector_type?: string;
 }
 
-type CreateOneDriveFileResult = InterruptResult | SuccessResult | ErrorResult | AuthErrorResult;
-
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
+type CreateOneDriveFileResult =
+	| InterruptResult<OneDriveCreateFileContext>
+	| SuccessResult
+	| ErrorResult
+	| AuthErrorResult;
 
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
@@ -94,12 +84,8 @@ function ApprovalCard({
 	onDecision,
 }: {
 	args: { name: string; content?: string };
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject" | "edit";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<OneDriveCreateFileContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [isPanelOpen, setIsPanelOpen] = useState(false);
@@ -226,7 +212,7 @@ function ApprovalCard({
 							});
 						}}
 					>
-						<Pen className="size-3.5" />
+						<Pencil className="size-3.5" />
 						Edit
 					</Button>
 				)}
@@ -434,17 +420,14 @@ export const CreateOneDriveFileToolUI = ({
 	args,
 	result,
 }: ToolCallMessagePartProps<{ name: string; content?: string }, CreateOneDriveFileResult>) => {
+	const { dispatch } = useHitlDecision();
 	if (!result) return null;
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
 				args={args}
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<OneDriveCreateFileContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/onedrive/trash-file.tsx b/surfsense_web/components/tool-ui/onedrive/trash-file.tsx
index b5d2e3d678..2643e103d5 100644
--- a/surfsense_web/components/tool-ui/onedrive/trash-file.tsx
+++ b/surfsense_web/components/tool-ui/onedrive/trash-file.tsx
@@ -6,7 +6,8 @@ import { useCallback, useEffect, useState } from "react";
 import { TextShimmerLoader } from "@/components/prompt-kit/loader";
 import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
-import { useHitlPhase } from "@/hooks/use-hitl-phase";
+import type { HitlDecision, InterruptResult } from "@/features/chat-messages/hitl";
+import { isInterruptResult, useHitlDecision, useHitlPhase } from "@/features/chat-messages/hitl";
 
 interface OneDriveAccount {
 	id: number;
@@ -22,14 +23,11 @@ interface OneDriveFile {
 	web_url?: string;
 }
 
-interface InterruptResult {
-	__interrupt__: true;
-	__decided__?: "approve" | "reject";
-	__completed__?: boolean;
-	action_requests: Array<{ name: string; args: Record<string, unknown> }>;
-	review_configs: Array<{ action_name: string; allowed_decisions: Array<"approve" | "reject"> }>;
-	context?: { account?: OneDriveAccount; file?: OneDriveFile; error?: string };
-}
+type OneDriveTrashFileContext = {
+	account?: OneDriveAccount;
+	file?: OneDriveFile;
+	error?: string;
+};
 
 interface SuccessResult {
 	status: "success";
@@ -52,20 +50,11 @@ interface AuthErrorResult {
 }
 
 type DeleteOneDriveFileResult =
-	| InterruptResult
+	| InterruptResult<OneDriveTrashFileContext>
 	| SuccessResult
 	| ErrorResult
 	| NotFoundResult
 	| AuthErrorResult;
-
-function isInterruptResult(result: unknown): result is InterruptResult {
-	return (
-		typeof result === "object" &&
-		result !== null &&
-		"__interrupt__" in result &&
-		(result as InterruptResult).__interrupt__ === true
-	);
-}
 function isErrorResult(result: unknown): result is ErrorResult {
 	return (
 		typeof result === "object" &&
@@ -95,12 +84,8 @@ function ApprovalCard({
 	interruptData,
 	onDecision,
 }: {
-	interruptData: InterruptResult;
-	onDecision: (decision: {
-		type: "approve" | "reject";
-		message?: string;
-		edited_action?: { name: string; args: Record<string, unknown> };
-	}) => void;
+	interruptData: InterruptResult<OneDriveTrashFileContext>;
+	onDecision: (decision: HitlDecision) => void;
 }) {
 	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
 	const [deleteFromKb, setDeleteFromKb] = useState(false);
@@ -311,16 +296,13 @@ export const DeleteOneDriveFileToolUI = ({
 	{ file_name: string; delete_from_kb?: boolean },
 	DeleteOneDriveFileResult
 >) => {
+	const { dispatch } = useHitlDecision();
 	if (!result) return null;
 	if (isInterruptResult(result)) {
 		return (
 			<ApprovalCard
-				interruptData={result}
-				onDecision={(decision) => {
-					window.dispatchEvent(
-						new CustomEvent("hitl-decision", { detail: { decisions: [decision] } })
-					);
-				}}
+				interruptData={result as InterruptResult<OneDriveTrashFileContext>}
+				onDecision={(decision) => dispatch([decision])}
 			/>
 		);
 	}
diff --git a/surfsense_web/components/tool-ui/plan/plan.tsx b/surfsense_web/components/tool-ui/plan/plan.tsx
index 37a94fb9f0..4405cfc353 100644
--- a/surfsense_web/components/tool-ui/plan/plan.tsx
+++ b/surfsense_web/components/tool-ui/plan/plan.tsx
@@ -195,7 +195,7 @@ export const Plan: FC<PlanProps> = ({
 							<Button
 								variant="ghost"
 								size="sm"
-								className="w-full mt-2 text-xs text-muted-foreground hover:text-foreground"
+								className="w-full mt-2 text-xs text-muted-foreground hover:text-accent-foreground"
 							>
 								{isExpanded
 									? "Show less"
diff --git a/surfsense_web/components/tool-ui/sandbox-execute.tsx b/surfsense_web/components/tool-ui/sandbox-execute.tsx
index f54b035e36..3d309332e6 100644
--- a/surfsense_web/components/tool-ui/sandbox-execute.tsx
+++ b/surfsense_web/components/tool-ui/sandbox-execute.tsx
@@ -295,7 +295,7 @@ function ExecuteCompleted({
 			<Collapsible open={open} onOpenChange={setOpen}>
 				<CollapsibleTrigger
 					className={cn(
-						"flex w-full items-center gap-2 rounded-xl border bg-card px-4 py-2.5 text-left transition-colors hover:bg-accent/50",
+						"flex w-full items-center gap-2 rounded-xl border bg-card px-4 py-2.5 text-left transition-colors hover:bg-accent hover:text-accent-foreground",
 						open && "rounded-b-none border-b-0",
 						parsed.isError && "border-destructive/20"
 					)}
diff --git a/surfsense_web/components/tool-ui/user-memory.tsx b/surfsense_web/components/tool-ui/user-memory.tsx
index e232bdcc7a..f7c446806e 100644
--- a/surfsense_web/components/tool-ui/user-memory.tsx
+++ b/surfsense_web/components/tool-ui/user-memory.tsx
@@ -1,100 +1,38 @@
 "use client";
 
 import type { ToolCallMessagePartProps } from "@assistant-ui/react";
-import { BrainIcon, CheckIcon, Loader2Icon, SearchIcon, XIcon } from "lucide-react";
+import { AlertTriangleIcon, BrainIcon, CheckIcon, Loader2Icon, XIcon } from "lucide-react";
 import { z } from "zod";
 
 // ============================================================================
-// Zod Schemas for save_memory tool
+// Zod Schemas for update_memory tool
 // ============================================================================
 
-const SaveMemoryArgsSchema = z.object({
-	content: z.string(),
-	category: z.string().default("fact"),
+const UpdateMemoryArgsSchema = z.object({
+	updated_memory: z.string(),
 });
 
-const SaveMemoryResultSchema = z.object({
+const UpdateMemoryResultSchema = z.object({
 	status: z.enum(["saved", "error"]),
-	memory_id: z.number().nullish(),
-	memory_text: z.string().nullish(),
-	category: z.string().nullish(),
 	message: z.string().nullish(),
-	error: z.string().nullish(),
+	warning: z.string().nullish(),
 });
 
-type SaveMemoryArgs = z.infer<typeof SaveMemoryArgsSchema>;
-type SaveMemoryResult = z.infer<typeof SaveMemoryResultSchema>;
+type UpdateMemoryArgs = z.infer<typeof UpdateMemoryArgsSchema>;
+type UpdateMemoryResult = z.infer<typeof UpdateMemoryResultSchema>;
 
 // ============================================================================
-// Zod Schemas for recall_memory tool
+// Update Memory Tool UI
 // ============================================================================
 
-const RecallMemoryArgsSchema = z.object({
-	query: z.string().nullish(),
-	category: z.string().nullish(),
-	top_k: z.number().default(5),
-});
-
-const MemoryItemSchema = z.object({
-	id: z.number(),
-	memory_text: z.string(),
-	category: z.string(),
-	updated_at: z.string().nullish(),
-});
-
-const RecallMemoryResultSchema = z.object({
-	status: z.enum(["success", "error"]),
-	count: z.number().nullish(),
-	memories: z.array(MemoryItemSchema).nullish(),
-	formatted_context: z.string().nullish(),
-	error: z.string().nullish(),
-});
-
-type RecallMemoryArgs = z.infer<typeof RecallMemoryArgsSchema>;
-type RecallMemoryResult = z.infer<typeof RecallMemoryResultSchema>;
-type MemoryItem = z.infer<typeof MemoryItemSchema>;
-
-// ============================================================================
-// Category badge colors
-// ============================================================================
-
-const categoryColors: Record<string, string> = {
-	preference: "bg-blue-500/10 text-blue-600 dark:text-blue-400",
-	fact: "bg-green-500/10 text-green-600 dark:text-green-400",
-	instruction: "bg-purple-500/10 text-purple-600 dark:text-purple-400",
-	context: "bg-orange-500/10 text-orange-600 dark:text-orange-400",
-};
-
-function CategoryBadge({ category }: { category: string }) {
-	const colorClass = categoryColors[category] || "bg-gray-500/10 text-gray-600 dark:text-gray-400";
-	return (
-		<span
-			className={`inline-flex items-center rounded-full px-2 py-0.5 text-xs font-medium ${colorClass}`}
-		>
-			{category}
-		</span>
-	);
-}
-
-// ============================================================================
-// Save Memory Tool UI
-// ============================================================================
-
-export const SaveMemoryToolUI = ({
-	args,
+export const UpdateMemoryToolUI = ({
 	result,
 	status,
-}: ToolCallMessagePartProps<SaveMemoryArgs, SaveMemoryResult>) => {
+}: ToolCallMessagePartProps<UpdateMemoryArgs, UpdateMemoryResult>) => {
 	const isRunning = status.type === "running" || status.type === "requires-action";
 	const isComplete = status.type === "complete";
 	const isError = result?.status === "error";
 
-	// Parse args safely
-	const parsedArgs = SaveMemoryArgsSchema.safeParse(args);
-	const content = parsedArgs.success ? parsedArgs.data.content : "";
-	const category = parsedArgs.success ? parsedArgs.data.category : "fact";
-
-	// Loading state
 	if (isRunning) {
 		return (
 			<div className="my-3 flex items-center gap-3 rounded-lg border bg-card/60 px-4 py-3">
@@ -102,13 +40,12 @@ export const SaveMemoryToolUI = ({
 					<Loader2Icon className="size-4 animate-spin text-primary" />
 				</div>
 				<div className="flex-1">
-					<span className="text-sm text-muted-foreground">Saving to memory...</span>
+					<span className="text-sm text-muted-foreground">Updating memory...</span>
 				</div>
 			</div>
 		);
 	}
 
-	// Error state
 	if (isError) {
 		return (
 			<div className="my-3 flex items-center gap-3 rounded-lg border border-destructive/20 bg-destructive/5 px-4 py-3">
@@ -116,14 +53,13 @@ export const SaveMemoryToolUI = ({
 					<XIcon className="size-4 text-destructive" />
 				</div>
 				<div className="flex-1">
-					<span className="text-sm text-destructive">Failed to save memory</span>
-					{result?.error && <p className="mt-1 text-xs text-destructive/70">{result.error}</p>}
+					<span className="text-sm text-destructive">Failed to update memory</span>
+					{result?.message && <p className="mt-1 text-xs text-destructive/70">{result.message}</p>}
 				</div>
 			</div>
 		);
 	}
 
-	// Success state
 	if (isComplete && result?.status === "saved") {
 		return (
 			<div className="my-3 flex items-center gap-3 rounded-lg border border-primary/20 bg-primary/5 px-4 py-3">
@@ -133,138 +69,19 @@ export const SaveMemoryToolUI = ({
 				<div className="flex-1 min-w-0">
 					<div className="flex items-center gap-2">
 						<CheckIcon className="size-3 text-green-500 shrink-0" />
-						<span className="text-sm font-medium text-foreground">Memory saved</span>
-						<CategoryBadge category={category} />
-					</div>
-					<p className="mt-1 truncate text-sm text-muted-foreground">{content}</p>
-				</div>
-			</div>
-		);
-	}
-
-	// Default/incomplete state - show what's being saved
-	if (content) {
-		return (
-			<div className="my-3 flex items-center gap-3 rounded-lg border bg-card/60 px-4 py-3">
-				<div className="flex size-8 items-center justify-center rounded-full bg-muted">
-					<BrainIcon className="size-4 text-muted-foreground" />
-				</div>
-				<div className="flex-1 min-w-0">
-					<div className="flex items-center gap-2">
-						<span className="text-sm text-muted-foreground">Saving memory</span>
-						<CategoryBadge category={category} />
-					</div>
-					<p className="mt-1 truncate text-sm text-muted-foreground">{content}</p>
-				</div>
-			</div>
-		);
-	}
-
-	return null;
-};
-
-// ============================================================================
-// Recall Memory Tool UI
-// ============================================================================
-
-export const RecallMemoryToolUI = ({
-	args,
-	result,
-	status,
-}: ToolCallMessagePartProps<RecallMemoryArgs, RecallMemoryResult>) => {
-	const isRunning = status.type === "running" || status.type === "requires-action";
-	const isComplete = status.type === "complete";
-	const isError = result?.status === "error";
-
-	// Parse args safely
-	const parsedArgs = RecallMemoryArgsSchema.safeParse(args);
-	const query = parsedArgs.success ? parsedArgs.data.query : null;
-
-	// Loading state
-	if (isRunning) {
-		return (
-			<div className="my-3 flex items-center gap-3 rounded-lg border bg-card/60 px-4 py-3">
-				<div className="flex size-8 items-center justify-center rounded-full bg-primary/10">
-					<Loader2Icon className="size-4 animate-spin text-primary" />
-				</div>
-				<div className="flex-1">
-					<span className="text-sm text-muted-foreground">
-						{query ? `Searching memories for "${query}"...` : "Recalling memories..."}
-					</span>
-				</div>
-			</div>
-		);
-	}
-
-	// Error state
-	if (isError) {
-		return (
-			<div className="my-3 flex items-center gap-3 rounded-lg border border-destructive/20 bg-destructive/5 px-4 py-3">
-				<div className="flex size-8 items-center justify-center rounded-full bg-destructive/10">
-					<XIcon className="size-4 text-destructive" />
-				</div>
-				<div className="flex-1">
-					<span className="text-sm text-destructive">Failed to recall memories</span>
-					{result?.error && <p className="mt-1 text-xs text-destructive/70">{result.error}</p>}
-				</div>
-			</div>
-		);
-	}
-
-	// Success state with memories
-	if (isComplete && result?.status === "success") {
-		const memories = result.memories || [];
-		const count = result.count || 0;
-
-		if (count === 0) {
-			return (
-				<div className="my-3 flex items-center gap-3 rounded-lg border bg-card/60 px-4 py-3">
-					<div className="flex size-8 items-center justify-center rounded-full bg-muted">
-						<SearchIcon className="size-4 text-muted-foreground" />
+						<span className="text-sm font-medium text-foreground">Memory updated</span>
 					</div>
-					<span className="text-sm text-muted-foreground">No memories found</span>
-				</div>
-			);
-		}
-
-		return (
-			<div className="my-3 rounded-lg border bg-card/60 px-4 py-3">
-				<div className="flex items-center gap-2 mb-2">
-					<BrainIcon className="size-4 text-primary" />
-					<span className="text-sm font-medium text-foreground">
-						Recalled {count} {count === 1 ? "memory" : "memories"}
-					</span>
-				</div>
-				<div className="space-y-2">
-					{memories.slice(0, 5).map((memory: MemoryItem) => (
-						<div
-							key={memory.id}
-							className="flex items-start gap-2 rounded-md bg-muted/50 px-3 py-2"
-						>
-							<CategoryBadge category={memory.category} />
-							<span className="text-sm text-muted-foreground flex-1">{memory.memory_text}</span>
+					{result.warning && (
+						<div className="mt-1.5 flex items-start gap-1.5">
+							<AlertTriangleIcon className="size-3 text-yellow-500 shrink-0 mt-0.5" />
+							<p className="text-xs text-yellow-600 dark:text-yellow-400">{result.warning}</p>
 						</div>
-					))}
-					{memories.length > 5 && (
-						<p className="text-xs text-muted-foreground">...and {memories.length - 5} more</p>
 					)}
 				</div>
 			</div>
 		);
 	}
 
-	// Default/incomplete state
-	if (query) {
-		return (
-			<div className="my-3 flex items-center gap-3 rounded-lg border bg-card/60 px-4 py-3">
-				<div className="flex size-8 items-center justify-center rounded-full bg-muted">
-					<SearchIcon className="size-4 text-muted-foreground" />
-				</div>
-				<span className="text-sm text-muted-foreground">Searching memories for "{query}"</span>
-			</div>
-		);
-	}
-
 	return null;
 };
 
@@ -273,13 +90,8 @@ export const RecallMemoryToolUI = ({
 // ============================================================================
 
 export {
-	SaveMemoryArgsSchema,
-	SaveMemoryResultSchema,
-	RecallMemoryArgsSchema,
-	RecallMemoryResultSchema,
-	type SaveMemoryArgs,
-	type SaveMemoryResult,
-	type RecallMemoryArgs,
-	type RecallMemoryResult,
-	type MemoryItem,
+	UpdateMemoryArgsSchema,
+	UpdateMemoryResultSchema,
+	type UpdateMemoryArgs,
+	type UpdateMemoryResult,
 };
diff --git a/surfsense_web/components/ui/alert-dialog.tsx b/surfsense_web/components/ui/alert-dialog.tsx
index ec0d3cbd50..4db90020f5 100644
--- a/surfsense_web/components/ui/alert-dialog.tsx
+++ b/surfsense_web/components/ui/alert-dialog.tsx
@@ -45,7 +45,7 @@ function AlertDialogContent({
 			<AlertDialogPrimitive.Content
 				data-slot="alert-dialog-content"
 				className={cn(
-					"bg-background dark:bg-neutral-900 data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 fixed top-[50%] left-[50%] z-50 grid w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 rounded-xl p-6 shadow-2xl duration-200 sm:max-w-lg",
+					"bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 fixed top-[50%] left-[50%] z-50 grid w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 rounded-xl p-6 shadow-2xl duration-200 sm:max-w-lg select-none",
 					className
 				)}
 				{...props}
diff --git a/surfsense_web/components/ui/alert.tsx b/surfsense_web/components/ui/alert.tsx
index 450b60285f..2d777db67b 100644
--- a/surfsense_web/components/ui/alert.tsx
+++ b/surfsense_web/components/ui/alert.tsx
@@ -8,9 +8,11 @@ const alertVariants = cva(
 	{
 		variants: {
 			variant: {
-				default: "bg-card text-card-foreground",
+				default: "border-popover-border bg-accent text-accent-foreground",
+				warning:
+					"border-highlight/40 bg-highlight/10 text-foreground *:data-[slot=alert-description]:text-muted-foreground [&>svg]:text-highlight",
 				destructive:
-					"bg-card text-destructive *:data-[slot=alert-description]:text-destructive/90 [&>svg]:text-current",
+					"border-destructive/30 bg-destructive/10 text-destructive *:data-[slot=alert-description]:text-destructive/90 [&>svg]:text-current",
 			},
 		},
 		defaultVariants: {
diff --git a/surfsense_web/components/ui/calendar.tsx b/surfsense_web/components/ui/calendar.tsx
index e3e1be301b..a048bbd7bb 100644
--- a/surfsense_web/components/ui/calendar.tsx
+++ b/surfsense_web/components/ui/calendar.tsx
@@ -24,7 +24,7 @@ function Calendar({
 		<DayPicker
 			showOutsideDays={showOutsideDays}
 			className={cn(
-				"bg-background group/calendar p-3 [--cell-size:--spacing(8)] [[data-slot=card-content]_&]:bg-transparent [[data-slot=popover-content]_&]:bg-transparent",
+				"bg-background group/calendar rounded-md border border-popover-border p-3 [--cell-size:--spacing(8)] [[data-slot=card-content]_&]:bg-transparent [[data-slot=popover-content]_&]:bg-transparent",
 				String.raw`rtl:**:[.rdp-button\_next>svg]:rotate-180`,
 				String.raw`rtl:**:[.rdp-button\_previous>svg]:rotate-180`,
 				className
@@ -61,7 +61,7 @@ function Calendar({
 					defaultClassNames.dropdowns
 				),
 				dropdown_root: cn(
-					"relative has-focus:border-ring border border-input shadow-xs has-focus:ring-ring/50 has-focus:ring-[3px] rounded-md",
+					"relative has-focus:border-ring border border-popover-border shadow-xs has-focus:ring-ring/50 has-focus:ring-[3px] rounded-md",
 					defaultClassNames.dropdown_root
 				),
 				dropdown: cn("absolute inset-0 opacity-0", defaultClassNames.dropdown),
diff --git a/surfsense_web/components/ui/callout-node.tsx b/surfsense_web/components/ui/callout-node.tsx
index 0b5213b43b..0c8901a1dc 100644
--- a/surfsense_web/components/ui/callout-node.tsx
+++ b/surfsense_web/components/ui/callout-node.tsx
@@ -6,6 +6,7 @@ import type { TCalloutElement } from "platejs";
 import { PlateElement, type PlateElementProps, useEditorPlugin } from "platejs/react";
 import * as React from "react";
 
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 const calloutVariants = cva("my-1 flex w-full items-start gap-2 rounded-lg border p-4", {
@@ -24,7 +25,10 @@ const calloutVariants = cva("my-1 flex w-full items-start gap-2 rounded-lg borde
 	},
 });
 
-const calloutIcons: Record<string, string> = {
+const variantCycle = ["info", "warning", "error", "success", "note", "tip"] as const;
+type CalloutVariant = (typeof variantCycle)[number];
+
+const calloutIcons: Record<CalloutVariant, string> = {
 	info: "💡",
 	warning: "⚠️",
 	error: "🚨",
@@ -33,13 +37,13 @@ const calloutIcons: Record<string, string> = {
 	tip: "💜",
 };
 
-const variantCycle = ["info", "warning", "error", "success", "note", "tip"] as const;
-
 export function CalloutElement({ children, ...props }: PlateElementProps<TCalloutElement>) {
 	const { editor } = useEditorPlugin(CalloutPlugin);
 	const element = props.element;
-	const variant = element.variant || "info";
-	const icon = element.icon || calloutIcons[variant] || "💡";
+	const variant = variantCycle.includes(element.variant as CalloutVariant)
+		? (element.variant as CalloutVariant)
+		: "info";
+	const icon = element.icon || calloutIcons[variant];
 
 	const cycleVariant = React.useCallback(() => {
 		const currentIndex = variantCycle.indexOf(variant as (typeof variantCycle)[number]);
@@ -56,19 +60,17 @@ export function CalloutElement({ children, ...props }: PlateElementProps<TCallou
 	}, [editor, variant, props.path]);
 
 	return (
-		<PlateElement
-			{...props}
-			className={cn(calloutVariants({ variant: variant as any }), props.className)}
-		>
-			<button
-				className="mt-0.5 shrink-0 cursor-pointer select-none text-lg leading-none"
+		<PlateElement {...props} className={cn(calloutVariants({ variant }), props.className)}>
+			<Button
+				variant="ghost"
+				className="mt-0.5 h-auto shrink-0 cursor-pointer select-none p-0 text-lg leading-none hover:bg-transparent"
 				contentEditable={false}
 				onClick={cycleVariant}
 				type="button"
 				aria-label="Change callout type"
 			>
 				{icon}
-			</button>
+			</Button>
 			<div className="min-w-0 flex-1">{children}</div>
 		</PlateElement>
 	);
diff --git a/surfsense_web/components/ui/command.tsx b/surfsense_web/components/ui/command.tsx
index 203d8b32cb..034dbab5eb 100644
--- a/surfsense_web/components/ui/command.tsx
+++ b/surfsense_web/components/ui/command.tsx
@@ -118,7 +118,7 @@ function CommandSeparator({
 	return (
 		<CommandPrimitive.Separator
 			data-slot="command-separator"
-			className={cn("bg-border -mx-1 h-px", className)}
+			className={cn("bg-popover-border -mx-1 h-px", className)}
 			{...props}
 		/>
 	);
diff --git a/surfsense_web/components/ui/context-menu.tsx b/surfsense_web/components/ui/context-menu.tsx
index f3d4063e26..8fa7c6d1a3 100644
--- a/surfsense_web/components/ui/context-menu.tsx
+++ b/surfsense_web/components/ui/context-menu.tsx
@@ -66,7 +66,7 @@ function ContextMenuSubContent({
 		<ContextMenuPrimitive.SubContent
 			data-slot="context-menu-sub-content"
 			className={cn(
-				"bg-muted text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 min-w-[8rem] overflow-hidden rounded-md border dark:border-neutral-700 p-1 shadow-lg",
+				"bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 min-w-[8rem] overflow-hidden rounded-md p-1 shadow-lg",
 				className
 			)}
 			{...props}
@@ -83,7 +83,7 @@ function ContextMenuContent({
 			<ContextMenuPrimitive.Content
 				data-slot="context-menu-content"
 				className={cn(
-					"bg-muted text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 min-w-[8rem] overflow-hidden rounded-md border dark:border-neutral-700 p-1 shadow-md",
+					"bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 min-w-[8rem] overflow-hidden rounded-md p-1 shadow-md",
 					className
 				)}
 				{...props}
@@ -107,7 +107,7 @@ function ContextMenuItem({
 			data-inset={inset}
 			data-variant={variant}
 			className={cn(
-				"focus:bg-accent focus:text-accent-foreground dark:focus:bg-neutral-700 relative flex cursor-pointer items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground",
+				"focus:bg-accent focus:text-accent-foreground relative flex cursor-pointer items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground",
 				className
 			)}
 			{...props}
@@ -189,7 +189,7 @@ function ContextMenuSeparator({
 	return (
 		<ContextMenuPrimitive.Separator
 			data-slot="context-menu-separator"
-			className={cn("bg-border dark:bg-neutral-700 -mx-1 my-1 h-px", className)}
+			className={cn("bg-popover-border -mx-1 my-1 h-px", className)}
 			{...props}
 		/>
 	);
diff --git a/surfsense_web/components/ui/dialog.tsx b/surfsense_web/components/ui/dialog.tsx
index 5b2ca36e1c..90c8bf5383 100644
--- a/surfsense_web/components/ui/dialog.tsx
+++ b/surfsense_web/components/ui/dialog.tsx
@@ -38,13 +38,13 @@ const DialogContent = React.forwardRef<
 		<DialogPrimitive.Content
 			ref={ref}
 			className={cn(
-				"fixed left-[50%] top-[50%] z-50 grid w-full max-w-lg translate-x-[-50%] translate-y-[-50%] gap-4 bg-background dark:bg-neutral-900 p-6 shadow-2xl duration-200 data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 rounded-xl focus:outline-none focus:ring-0 focus-visible:outline-none focus-visible:ring-0",
+				"fixed left-[50%] top-[50%] z-50 grid w-full max-w-lg translate-x-[-50%] translate-y-[-50%] gap-4 bg-popover text-popover-foreground p-6 shadow-2xl duration-200 data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 rounded-xl focus:outline-none focus:ring-0 focus-visible:outline-none focus-visible:ring-0 select-none",
 				className
 			)}
 			{...props}
 		>
 			{children}
-			<DialogPrimitive.Close className="absolute right-4 top-4 z-50 h-8 w-8 rounded-full inline-flex items-center justify-center text-muted-foreground transition-colors hover:text-foreground hover:bg-accent focus:outline-none disabled:pointer-events-none">
+			<DialogPrimitive.Close className="absolute right-4 top-4 z-50 h-8 w-8 rounded-full inline-flex items-center justify-center text-muted-foreground transition-colors hover:text-accent-foreground hover:bg-accent focus:outline-none disabled:pointer-events-none">
 				<X className="h-4 w-4" />
 				<span className="sr-only">Close</span>
 			</DialogPrimitive.Close>
diff --git a/surfsense_web/components/ui/dropdown-menu.tsx b/surfsense_web/components/ui/dropdown-menu.tsx
index 78ddad710f..421bcfcd29 100644
--- a/surfsense_web/components/ui/dropdown-menu.tsx
+++ b/surfsense_web/components/ui/dropdown-menu.tsx
@@ -33,7 +33,7 @@ function DropdownMenuContent({
 				data-slot="dropdown-menu-content"
 				sideOffset={sideOffset}
 				className={cn(
-					"bg-muted text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 max-h-(--radix-dropdown-menu-content-available-height) min-w-[8rem] origin-(--radix-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border dark:border-neutral-700 p-1 shadow-md",
+					"bg-popover text-popover-foreground border border-popover-border data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 max-h-(--radix-dropdown-menu-content-available-height) min-w-[8rem] origin-(--radix-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md p-1 shadow-md",
 					className
 				)}
 				{...props}
@@ -61,7 +61,7 @@ function DropdownMenuItem({
 			data-inset={inset}
 			data-variant={variant}
 			className={cn(
-				"focus:bg-neutral-200 focus:text-accent-foreground dark:focus:bg-neutral-700 [&_svg:not([class*='text-'])]:text-muted-foreground relative flex cursor-default items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
+				"focus:bg-accent focus:text-accent-foreground [&_svg:not([class*='text-'])]:text-muted-foreground relative flex cursor-default items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
 				className
 			)}
 			{...props}
@@ -79,7 +79,7 @@ function DropdownMenuCheckboxItem({
 		<DropdownMenuPrimitive.CheckboxItem
 			data-slot="dropdown-menu-checkbox-item"
 			className={cn(
-				"focus:bg-neutral-200 focus:text-accent-foreground dark:focus:bg-neutral-700 relative flex cursor-default items-center gap-2 rounded-sm py-1.5 pr-2 pl-8 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
+				"focus:bg-accent focus:text-accent-foreground relative flex cursor-default items-center gap-2 rounded-sm py-1.5 pr-2 pl-8 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
 				className
 			)}
 			checked={checked}
@@ -110,7 +110,7 @@ function DropdownMenuRadioItem({
 		<DropdownMenuPrimitive.RadioItem
 			data-slot="dropdown-menu-radio-item"
 			className={cn(
-				"focus:bg-neutral-200 focus:text-accent-foreground dark:focus:bg-neutral-700 relative flex cursor-default items-center gap-2 rounded-sm py-1.5 pr-2 pl-8 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
+				"focus:bg-accent focus:text-accent-foreground relative flex cursor-default items-center gap-2 rounded-sm py-1.5 pr-2 pl-8 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
 				className
 			)}
 			{...props}
@@ -149,7 +149,7 @@ function DropdownMenuSeparator({
 	return (
 		<DropdownMenuPrimitive.Separator
 			data-slot="dropdown-menu-separator"
-			className={cn("bg-border mx-2 my-1 h-px", className)}
+			className={cn("bg-popover-border mx-2 my-1 h-px", className)}
 			{...props}
 		/>
 	);
@@ -182,7 +182,7 @@ function DropdownMenuSubTrigger({
 			data-slot="dropdown-menu-sub-trigger"
 			data-inset={inset}
 			className={cn(
-				"focus:bg-neutral-200 focus:text-accent-foreground dark:focus:bg-neutral-700 data-[state=open]:bg-neutral-200 data-[state=open]:text-accent-foreground dark:data-[state=open]:bg-neutral-700 [&_svg:not([class*='text-'])]:text-muted-foreground flex cursor-default items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
+				"focus:bg-accent focus:text-accent-foreground data-[state=open]:bg-accent data-[state=open]:text-accent-foreground [&_svg:not([class*='text-'])]:text-muted-foreground flex cursor-default items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
 				className
 			)}
 			{...props}
@@ -201,7 +201,7 @@ function DropdownMenuSubContent({
 		<DropdownMenuPrimitive.SubContent
 			data-slot="dropdown-menu-sub-content"
 			className={cn(
-				"bg-muted text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 min-w-[8rem] origin-(--radix-dropdown-menu-content-transform-origin) overflow-hidden rounded-md border dark:border-neutral-700 p-1 shadow-lg",
+				"bg-popover text-popover-foreground border border-popover-border data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 min-w-[8rem] origin-(--radix-dropdown-menu-content-transform-origin) overflow-hidden rounded-md p-1 shadow-lg",
 				className
 			)}
 			{...props}
diff --git a/surfsense_web/components/ui/editor.tsx b/surfsense_web/components/ui/editor.tsx
index 0a9cb5b766..92d6545559 100644
--- a/surfsense_web/components/ui/editor.tsx
+++ b/surfsense_web/components/ui/editor.tsx
@@ -54,8 +54,8 @@ const editorVariants = cva(
 	cn(
 		"group/editor",
 		"relative w-full cursor-text select-text overflow-x-hidden whitespace-pre-wrap break-words",
-		"rounded-md ring-offset-background focus-visible:outline-none",
-		"**:data-slate-placeholder:!top-1/2 **:data-slate-placeholder:-translate-y-1/2 placeholder:text-muted-foreground/80 **:data-slate-placeholder:text-muted-foreground/80 **:data-slate-placeholder:opacity-100!",
+		"rounded-none ring-offset-background focus-visible:outline-none",
+		"placeholder:text-muted-foreground/80 **:data-slate-placeholder:text-muted-foreground/80 **:data-slate-placeholder:py-1",
 		"[&_strong]:font-bold"
 	),
 	{
diff --git a/surfsense_web/components/ui/equation-node.tsx b/surfsense_web/components/ui/equation-node.tsx
index e708c32385..54024dc527 100644
--- a/surfsense_web/components/ui/equation-node.tsx
+++ b/surfsense_web/components/ui/equation-node.tsx
@@ -6,6 +6,7 @@ import type { TEquationElement } from "platejs";
 import { PlateElement, type PlateElementProps, useSelected } from "platejs/react";
 import * as React from "react";
 
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 export function EquationElement({ children, ...props }: PlateElementProps<TEquationElement>) {
@@ -74,20 +75,18 @@ export function EquationElement({ children, ...props }: PlateElementProps<TEquat
 						{...inputProps}
 					/>
 					<div className="mt-1 flex justify-end gap-1">
-						<button
-							className="rounded px-2 py-1 text-xs text-muted-foreground hover:bg-accent"
+						<Button
+							variant="ghost"
+							size="sm"
+							className="h-auto px-2 py-1 text-xs text-muted-foreground"
 							onClick={onDismiss}
 							type="button"
 						>
 							Cancel
-						</button>
-						<button
-							className="rounded bg-primary px-2 py-1 text-xs text-primary-foreground hover:bg-primary/90"
-							onClick={onSubmit}
-							type="button"
-						>
+						</Button>
+						<Button size="sm" className="h-auto px-2 py-1 text-xs" onClick={onSubmit} type="button">
 							Done
-						</button>
+						</Button>
 					</div>
 				</div>
 			)}
@@ -151,7 +150,7 @@ export function InlineEquationElement({ children, ...props }: PlateElementProps<
 
 			{isEditing && (
 				<span
-					className="absolute z-50 mt-1 rounded-md border bg-popover p-2 shadow-md"
+					className="absolute z-50 mt-1 rounded-md bg-popover p-2 shadow-md"
 					contentEditable={false}
 				>
 					<textarea
@@ -162,20 +161,23 @@ export function InlineEquationElement({ children, ...props }: PlateElementProps<
 						{...inputProps}
 					/>
 					<span className="mt-1 flex justify-end gap-1">
-						<button
-							className="rounded px-2 py-0.5 text-xs text-muted-foreground hover:bg-accent"
+						<Button
+							variant="ghost"
+							size="sm"
+							className="h-auto px-2 py-0.5 text-xs text-muted-foreground"
 							onClick={onDismiss}
 							type="button"
 						>
 							Cancel
-						</button>
-						<button
-							className="rounded bg-primary px-2 py-0.5 text-xs text-primary-foreground hover:bg-primary/90"
+						</Button>
+						<Button
+							size="sm"
+							className="h-auto px-2 py-0.5 text-xs"
 							onClick={onSubmit}
 							type="button"
 						>
 							Done
-						</button>
+						</Button>
 					</span>
 				</span>
 			)}
diff --git a/surfsense_web/components/ui/fixed-toolbar.tsx b/surfsense_web/components/ui/fixed-toolbar.tsx
index 40677e777d..540088ea5e 100644
--- a/surfsense_web/components/ui/fixed-toolbar.tsx
+++ b/surfsense_web/components/ui/fixed-toolbar.tsx
@@ -14,7 +14,7 @@ export function FixedToolbar({
 	return (
 		<Toolbar
 			className={cn(
-				"scrollbar-hide sticky top-0 left-0 z-10 w-full justify-between overflow-x-auto border-b bg-background/95 p-1 backdrop-blur supports-backdrop-filter:bg-background/60",
+				"scrollbar-hide absolute top-0 left-0 z-40 w-full justify-between overflow-x-auto border-b bg-background p-1",
 				className
 			)}
 			{...props}
diff --git a/surfsense_web/components/ui/floating-toolbar.tsx b/surfsense_web/components/ui/floating-toolbar.tsx
index 6349194744..33560573a0 100644
--- a/surfsense_web/components/ui/floating-toolbar.tsx
+++ b/surfsense_web/components/ui/floating-toolbar.tsx
@@ -65,8 +65,9 @@ export function FloatingToolbar({
 				{...rootProps}
 				ref={ref}
 				className={cn(
-					"scrollbar-hide absolute z-50 overflow-x-auto whitespace-nowrap rounded-md border bg-popover p-1 opacity-100 shadow-md print:hidden",
+					"scrollbar-hide absolute z-50 overflow-x-auto whitespace-nowrap rounded-md bg-muted p-1 opacity-100 shadow-md print:hidden",
 					"max-w-[80vw]",
+					"[&_button:hover]:bg-neutral-200 dark:[&_button:hover]:bg-neutral-700",
 					className
 				)}
 			>
diff --git a/surfsense_web/components/ui/heading-node.tsx b/surfsense_web/components/ui/heading-node.tsx
index 11bdcf0af9..0474718118 100644
--- a/surfsense_web/components/ui/heading-node.tsx
+++ b/surfsense_web/components/ui/heading-node.tsx
@@ -6,7 +6,7 @@ import type { PlateElementProps } from "platejs/react";
 import { PlateElement } from "platejs/react";
 import * as React from "react";
 
-const headingVariants = cva("relative mb-1", {
+const headingVariants = cva("relative mb-1 first:mt-0", {
 	variants: {
 		variant: {
 			h1: "mt-[1.6em] pb-1 font-bold font-heading text-4xl",
diff --git a/surfsense_web/components/ui/hero-carousel.tsx b/surfsense_web/components/ui/hero-carousel.tsx
index c93fe59793..53c1febac3 100644
--- a/surfsense_web/components/ui/hero-carousel.tsx
+++ b/surfsense_web/components/ui/hero-carousel.tsx
@@ -3,7 +3,8 @@
 import { ChevronLeft, ChevronRight } from "lucide-react";
 import { AnimatePresence, motion } from "motion/react";
 import { useCallback, useEffect, useRef, useState } from "react";
-import { ExpandedGifOverlay, useExpandedGif } from "@/components/ui/expanded-gif-overlay";
+import { Button } from "@/components/ui/button";
+import { ExpandedMediaOverlay, useExpandedMedia } from "@/components/ui/expanded-gif-overlay";
 
 const carouselItems = [
 	{
@@ -70,7 +71,7 @@ function HeroCarouselCard({
 	src: string;
 	onExpandedChange?: (expanded: boolean) => void;
 }) {
-	const { expanded, open, close } = useExpandedGif();
+	const { expanded, open, close } = useExpandedMedia();
 	const videoRef = useRef<HTMLVideoElement>(null);
 	const [hasLoaded, setHasLoaded] = useState(false);
 
@@ -118,6 +119,7 @@ function HeroCarouselCard({
 				<div
 					role="button"
 					tabIndex={0}
+					aria-label={`Expand ${title}`}
 					className="cursor-pointer bg-neutral-50 p-2 sm:p-3 dark:bg-neutral-950"
 					onClick={open}
 					onKeyDown={(e) => {
@@ -146,7 +148,7 @@ function HeroCarouselCard({
 			</div>
 
 			<AnimatePresence>
-				{expanded && <ExpandedGifOverlay src={src} alt={title} onClose={close} />}
+				{expanded && <ExpandedMediaOverlay src={src} alt={title} onClose={close} />}
 			</AnimatePresence>
 		</>
 	);
@@ -203,22 +205,26 @@ function HeroCarousel() {
 			</div>
 
 			<div className="relative z-5 mt-4 flex items-center justify-center gap-2">
-				<button
+				<Button
 					type="button"
+					variant="ghost"
+					size="icon"
 					onClick={() => !isGifExpanded && goToPrev()}
-					className="flex size-11 items-center justify-center rounded-full border border-neutral-200 bg-white text-neutral-700 shadow-sm transition-colors hover:bg-neutral-100 touch-manipulation dark:border-neutral-700 dark:bg-neutral-800 dark:text-neutral-200 dark:hover:bg-neutral-700"
+					className="size-11 rounded-full border border-neutral-200 bg-white text-neutral-700 shadow-sm hover:bg-neutral-100 touch-manipulation dark:border-neutral-700 dark:bg-neutral-800 dark:text-neutral-200 dark:hover:bg-neutral-700"
 					aria-label="Previous slide"
 				>
 					<ChevronLeft className="size-5" />
-				</button>
+				</Button>
 
 				<div className="flex items-center">
-					{carouselItems.map((_, i) => (
-						<button
-							key={`dot_${i}`}
+					{carouselItems.map((carouselItem, i) => (
+						<Button
+							key={carouselItem.src}
 							type="button"
+							variant="ghost"
+							size="icon"
 							onClick={() => !isGifExpanded && goTo(i)}
-							className="flex h-11 min-w-[28px] items-center justify-center touch-manipulation"
+							className="h-11 min-w-[28px] bg-transparent p-0 hover:bg-transparent touch-manipulation"
 							aria-label={`Go to slide ${i + 1}`}
 						>
 							<span
@@ -228,18 +234,20 @@ function HeroCarousel() {
 										: "w-2.5 bg-neutral-300 hover:bg-neutral-400 dark:bg-neutral-600 dark:hover:bg-neutral-500"
 								}`}
 							/>
-						</button>
+						</Button>
 					))}
 				</div>
 
-				<button
+				<Button
 					type="button"
+					variant="ghost"
+					size="icon"
 					onClick={() => !isGifExpanded && goToNext()}
-					className="flex size-11 items-center justify-center rounded-full border border-neutral-200 bg-white text-neutral-700 shadow-sm transition-colors hover:bg-neutral-100 touch-manipulation dark:border-neutral-700 dark:bg-neutral-800 dark:text-neutral-200 dark:hover:bg-neutral-700"
+					className="size-11 rounded-full border border-neutral-200 bg-white text-neutral-700 shadow-sm hover:bg-neutral-100 touch-manipulation dark:border-neutral-700 dark:bg-neutral-800 dark:text-neutral-200 dark:hover:bg-neutral-700"
 					aria-label="Next slide"
 				>
 					<ChevronRight className="size-5" />
-				</button>
+				</Button>
 			</div>
 		</div>
 	);
diff --git a/surfsense_web/components/ui/inline-combobox.tsx b/surfsense_web/components/ui/inline-combobox.tsx
index 941c1c7db7..2c45457b13 100644
--- a/surfsense_web/components/ui/inline-combobox.tsx
+++ b/surfsense_web/components/ui/inline-combobox.tsx
@@ -280,7 +280,7 @@ const InlineComboboxContent: typeof ComboboxPopover = ({ className, ...props })
 		<Portal>
 			<ComboboxPopover
 				className={cn(
-					"z-500 max-h-[288px] w-[300px] overflow-y-auto rounded-md bg-popover shadow-md",
+					"z-500 max-h-[288px] w-[300px] overflow-y-auto overscroll-none rounded-md bg-popover shadow-md",
 					className
 				)}
 				onKeyDownCapture={handleKeyDown}
@@ -299,7 +299,7 @@ const comboboxItemVariants = cva(
 		variants: {
 			interactive: {
 				false: "",
-				true: "cursor-pointer transition-colors hover:bg-accent hover:text-accent-foreground data-[active-item=true]:bg-accent data-[active-item=true]:text-accent-foreground dark:hover:bg-neutral-700 dark:data-[active-item=true]:bg-neutral-700",
+				true: "cursor-pointer transition-colors hover:bg-accent hover:text-accent-foreground data-[active-item=true]:bg-accent data-[active-item=true]:text-accent-foreground",
 			},
 		},
 	}
diff --git a/surfsense_web/components/ui/input.tsx b/surfsense_web/components/ui/input.tsx
index d2b9edec36..549a71affb 100644
--- a/surfsense_web/components/ui/input.tsx
+++ b/surfsense_web/components/ui/input.tsx
@@ -8,7 +8,7 @@ function Input({ className, type, ...props }: React.ComponentProps<"input">) {
 			type={type}
 			data-slot="input"
 			className={cn(
-				"border-input file:text-foreground placeholder:text-muted-foreground selection:bg-primary selection:text-primary-foreground flex h-9 w-full min-w-0 rounded-md border bg-transparent px-3 py-1 text-base shadow-xs transition-[color,box-shadow] outline-none file:inline-flex file:h-7 file:border-0 file:bg-transparent file:text-sm file:font-medium disabled:pointer-events-none disabled:cursor-not-allowed disabled:opacity-50 md:text-sm",
+				"border-popover-border file:text-foreground placeholder:text-muted-foreground selection:bg-primary selection:text-primary-foreground flex h-9 w-full min-w-0 rounded-md border bg-transparent px-3 py-1 text-base shadow-xs transition-[color,box-shadow] outline-none file:inline-flex file:h-7 file:border-0 file:bg-transparent file:text-sm file:font-medium disabled:pointer-events-none disabled:cursor-not-allowed disabled:opacity-50 md:text-sm",
 				"focus-visible:border-ring",
 				"aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive",
 				className
diff --git a/surfsense_web/components/ui/link-toolbar.tsx b/surfsense_web/components/ui/link-toolbar.tsx
index 5779d91578..462c716f08 100644
--- a/surfsense_web/components/ui/link-toolbar.tsx
+++ b/surfsense_web/components/ui/link-toolbar.tsx
@@ -22,11 +22,11 @@ import {
 } from "platejs/react";
 import * as React from "react";
 
-import { buttonVariants } from "@/components/ui/button";
+import { Button } from "@/components/ui/button";
 import { Separator } from "@/components/ui/separator";
 
 const popoverVariants = cva(
-	"z-50 w-auto rounded-md border bg-popover p-1 text-popover-foreground shadow-md outline-hidden"
+	"z-50 w-auto rounded-md bg-popover p-1 text-popover-foreground shadow-md outline-hidden"
 );
 
 const inputVariants = cva(
@@ -116,13 +116,9 @@ export function LinkFloatingToolbar({ state }: { state?: LinkFloatingToolbarStat
 		input
 	) : (
 		<div className="box-content flex items-center">
-			<button
-				className={buttonVariants({ size: "sm", variant: "ghost" })}
-				type="button"
-				{...editButtonProps}
-			>
+			<Button size="sm" variant="ghost" type="button" {...editButtonProps}>
 				Edit link
-			</button>
+			</Button>
 
 			<Separator orientation="vertical" />
 
@@ -130,16 +126,9 @@ export function LinkFloatingToolbar({ state }: { state?: LinkFloatingToolbarStat
 
 			<Separator orientation="vertical" />
 
-			<button
-				className={buttonVariants({
-					size: "sm",
-					variant: "ghost",
-				})}
-				type="button"
-				{...unlinkButtonProps}
-			>
+			<Button size="sm" variant="ghost" type="button" {...unlinkButtonProps}>
 				<Unlink width={18} />
-			</button>
+			</Button>
 		</div>
 	);
 
@@ -158,29 +147,22 @@ export function LinkFloatingToolbar({ state }: { state?: LinkFloatingToolbarStat
 
 function LinkOpenButton() {
 	const editor = useEditorRef();
-	const selection = useEditorSelection();
-
-	// biome-ignore lint/correctness/useExhaustiveDependencies: selection triggers recalculation of link attributes
-	const attributes = React.useMemo(() => {
-		const entry = editor.api.node<TLinkElement>({
-			match: { type: editor.getType(KEYS.link) },
-		});
-		if (!entry) {
-			return {};
-		}
-		const [element] = entry;
-		return getLinkAttributes(editor, element);
-	}, [editor, selection]);
+	useEditorSelection();
+
+	const entry = editor.api.node<TLinkElement>({
+		match: { type: editor.getType(KEYS.link) },
+	});
+	const attributes = entry ? getLinkAttributes(editor, entry[0]) : {};
+	const href = typeof attributes.href === "string" ? attributes.href : undefined;
 
 	return (
-		// biome-ignore lint/a11y/noStaticElementInteractions: <a> with spread attributes has dynamic href
-		// biome-ignore lint/a11y/useAriaPropsSupportedByRole: aria-label needed for icon-only link
-		<a
-			{...attributes}
-			className={buttonVariants({
-				size: "sm",
-				variant: "ghost",
-			})}
+		<Button
+			size="sm"
+			variant="ghost"
+			type="button"
+			onClick={() => {
+				if (href) window.open(href, "_blank", "noopener,noreferrer");
+			}}
 			onMouseOver={(e) => {
 				e.stopPropagation();
 			}}
@@ -188,10 +170,9 @@ function LinkOpenButton() {
 				e.stopPropagation();
 			}}
 			aria-label="Open link in a new tab"
-			target="_blank"
-			rel="noopener noreferrer"
+			disabled={!href}
 		>
 			<ExternalLink width={18} />
-		</a>
+		</Button>
 	);
 }
diff --git a/surfsense_web/components/ui/mode-toolbar-button.tsx b/surfsense_web/components/ui/mode-toolbar-button.tsx
index 37231991ff..394eaf97ca 100644
--- a/surfsense_web/components/ui/mode-toolbar-button.tsx
+++ b/surfsense_web/components/ui/mode-toolbar-button.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { BookOpenIcon, PenLineIcon } from "lucide-react";
+import { BookOpenIcon, Pencil } from "lucide-react";
 import { usePlateState } from "platejs/react";
 
 import { ToolbarButton } from "./toolbar";
@@ -13,7 +13,7 @@ export function ModeToolbarButton() {
 			tooltip={readOnly ? "Click to edit" : "Click to view"}
 			onClick={() => setReadOnly(!readOnly)}
 		>
-			{readOnly ? <BookOpenIcon /> : <PenLineIcon />}
+			{readOnly ? <BookOpenIcon /> : <Pencil />}
 		</ToolbarButton>
 	);
 }
diff --git a/surfsense_web/components/ui/popover.tsx b/surfsense_web/components/ui/popover.tsx
index 2dd8d280b5..0d2759543a 100644
--- a/surfsense_web/components/ui/popover.tsx
+++ b/surfsense_web/components/ui/popover.tsx
@@ -26,7 +26,7 @@ function PopoverContent({
 				align={align}
 				sideOffset={sideOffset}
 				className={cn(
-					"bg-muted text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 w-72 rounded-md border dark:border-neutral-700 p-4 shadow-md outline-hidden",
+					"bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 w-72 rounded-md p-4 shadow-md outline-hidden",
 					className
 				)}
 				{...props}
diff --git a/surfsense_web/components/ui/select.tsx b/surfsense_web/components/ui/select.tsx
index 5bde69af25..cf22bf6a32 100644
--- a/surfsense_web/components/ui/select.tsx
+++ b/surfsense_web/components/ui/select.tsx
@@ -27,7 +27,7 @@ function SelectTrigger({
 		<SelectPrimitive.Trigger
 			data-slot="select-trigger"
 			className={cn(
-				"border-input data-[placeholder]:text-muted-foreground [&_svg:not([class*='text-'])]:text-muted-foreground aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive flex h-9 w-fit items-center justify-between gap-2 rounded-md border bg-transparent px-3 py-2 text-sm whitespace-nowrap shadow-xs transition-[color,box-shadow] outline-none focus-visible:ring-0 focus-visible:ring-offset-0 disabled:cursor-not-allowed disabled:opacity-50 *:data-[slot=select-value]:line-clamp-1 *:data-[slot=select-value]:flex *:data-[slot=select-value]:items-center *:data-[slot=select-value]:gap-2 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
+				"border-popover-border data-[placeholder]:text-muted-foreground [&_svg:not([class*='text-'])]:text-muted-foreground aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive flex h-9 w-fit items-center justify-between gap-2 rounded-md border bg-transparent px-3 py-2 text-sm whitespace-nowrap shadow-xs transition-[color,box-shadow] outline-none focus-visible:ring-0 focus-visible:ring-offset-0 disabled:cursor-not-allowed disabled:opacity-50 *:data-[slot=select-value]:line-clamp-1 *:data-[slot=select-value]:flex *:data-[slot=select-value]:items-center *:data-[slot=select-value]:gap-2 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
 				className
 			)}
 			{...props}
@@ -51,7 +51,7 @@ function SelectContent({
 			<SelectPrimitive.Content
 				data-slot="select-content"
 				className={cn(
-					"bg-muted text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 relative z-50 max-h-96 min-w-[8rem] overflow-hidden rounded-md border dark:border-neutral-700 shadow-md",
+					"bg-popover text-popover-foreground border border-popover-border data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 relative z-50 max-h-96 min-w-[8rem] overflow-hidden rounded-md shadow-md",
 					position === "popper" &&
 						"data-[side=bottom]:translate-y-1 data-[side=left]:-translate-x-1 data-[side=right]:translate-x-1 data-[side=top]:-translate-y-1",
 					className
@@ -97,7 +97,7 @@ function SelectItem({
 		<SelectPrimitive.Item
 			data-slot="select-item"
 			className={cn(
-				"focus:bg-accent focus:text-accent-foreground dark:focus:bg-neutral-700 hover:bg-accent dark:hover:bg-neutral-700 [&_svg:not([class*='text-'])]:text-muted-foreground relative flex w-full cursor-default items-center gap-2 rounded-sm py-1.5 px-2 text-sm outline-hidden select-none transition-all data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 *:[span]:last:flex *:[span]:last:items-center *:[span]:last:gap-2 data-[highlighted]:bg-accent dark:data-[highlighted]:bg-neutral-700",
+				"focus:bg-accent focus:text-accent-foreground hover:bg-accent hover:text-accent-foreground [&_svg:not([class*='text-'])]:text-muted-foreground relative flex w-full cursor-default items-center gap-2 rounded-sm py-1.5 px-2 text-sm outline-hidden select-none transition-colors data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 *:[span]:last:flex *:[span]:last:items-center *:[span]:last:gap-2 data-[highlighted]:bg-accent data-[highlighted]:text-accent-foreground",
 				className
 			)}
 			{...props}
@@ -121,7 +121,7 @@ function SelectSeparator({
 	return (
 		<SelectPrimitive.Separator
 			data-slot="select-separator"
-			className={cn("bg-border dark:bg-neutral-700 pointer-events-none -mx-1 my-1 h-px", className)}
+			className={cn("bg-popover-border pointer-events-none -mx-1 my-1 h-px", className)}
 			{...props}
 		/>
 	);
diff --git a/surfsense_web/components/ui/sheet.tsx b/surfsense_web/components/ui/sheet.tsx
index 650e854032..6ba22350e4 100644
--- a/surfsense_web/components/ui/sheet.tsx
+++ b/surfsense_web/components/ui/sheet.tsx
@@ -54,21 +54,21 @@ function SheetContent({
 			<SheetPrimitive.Content
 				data-slot="sheet-content"
 				className={cn(
-					"bg-background data-[state=open]:animate-in data-[state=closed]:animate-out fixed z-50 flex flex-col gap-4 shadow-lg transition ease-in-out data-[state=closed]:duration-300 data-[state=open]:duration-500",
+					"bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out fixed z-50 flex flex-col gap-4 shadow-lg transition ease-in-out data-[state=closed]:duration-300 data-[state=open]:duration-500",
 					side === "right" &&
-						"data-[state=closed]:slide-out-to-right data-[state=open]:slide-in-from-right inset-y-0 right-0 h-full w-3/4 border-l sm:max-w-sm",
+						"data-[state=closed]:slide-out-to-right data-[state=open]:slide-in-from-right inset-y-0 right-0 h-full w-3/4 sm:max-w-sm",
 					side === "left" &&
-						"data-[state=closed]:slide-out-to-left data-[state=open]:slide-in-from-left inset-y-0 left-0 h-full w-3/4 border-r sm:max-w-sm",
+						"data-[state=closed]:slide-out-to-left data-[state=open]:slide-in-from-left inset-y-0 left-0 h-full w-3/4 sm:max-w-sm",
 					side === "top" &&
-						"data-[state=closed]:slide-out-to-top data-[state=open]:slide-in-from-top inset-x-0 top-0 h-auto border-b",
+						"data-[state=closed]:slide-out-to-top data-[state=open]:slide-in-from-top inset-x-0 top-0 h-auto",
 					side === "bottom" &&
-						"data-[state=closed]:slide-out-to-bottom data-[state=open]:slide-in-from-bottom inset-x-0 bottom-0 h-auto border-t",
+						"data-[state=closed]:slide-out-to-bottom data-[state=open]:slide-in-from-bottom inset-x-0 bottom-0 h-auto",
 					className
 				)}
 				{...props}
 			>
 				{children}
-				<SheetPrimitive.Close className="ring-offset-background focus:ring-ring data-[state=open]:bg-secondary absolute top-4 right-4 rounded-xs opacity-70 transition-opacity hover:opacity-100 focus:ring-2 focus:ring-offset-2 focus:outline-hidden disabled:pointer-events-none">
+				<SheetPrimitive.Close className="ring-offset-background focus:ring-ring absolute top-4 right-4 inline-flex h-8 w-8 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-accent hover:text-accent-foreground focus:ring-2 focus:ring-offset-2 focus:outline-hidden disabled:pointer-events-none">
 					<XIcon className="size-4" />
 					<span className="sr-only">Close</span>
 				</SheetPrimitive.Close>
diff --git a/surfsense_web/components/ui/sidebar.tsx b/surfsense_web/components/ui/sidebar.tsx
index 7399a30aa5..a5592bf9c6 100644
--- a/surfsense_web/components/ui/sidebar.tsx
+++ b/surfsense_web/components/ui/sidebar.tsx
@@ -263,11 +263,12 @@ function SidebarTrigger({ className, onClick, ...props }: React.ComponentProps<t
 	);
 }
 
-function SidebarRail({ className, ...props }: React.ComponentProps<"button">) {
+function SidebarRail({ className, ...props }: React.ComponentProps<typeof Button>) {
 	const { toggleSidebar } = useSidebar();
 
 	return (
-		<button
+		<Button
+			variant="ghost"
 			data-sidebar="rail"
 			data-slot="sidebar-rail"
 			aria-label="Toggle Sidebar"
@@ -275,7 +276,7 @@ function SidebarRail({ className, ...props }: React.ComponentProps<"button">) {
 			onClick={toggleSidebar}
 			title="Toggle Sidebar"
 			className={cn(
-				"hover:after:bg-sidebar-border absolute inset-y-0 z-20 hidden w-4 -translate-x-1/2 transition-all ease-linear group-data-[side=left]:-right-4 group-data-[side=right]:left-0 after:absolute after:inset-y-0 after:left-1/2 after:w-[2px] sm:flex",
+				"hover:after:bg-sidebar-border absolute inset-y-0 z-20 hidden h-auto w-4 -translate-x-1/2 rounded-none p-0 transition-all ease-linear hover:bg-transparent group-data-[side=left]:-right-4 group-data-[side=right]:left-0 after:absolute after:inset-y-0 after:left-1/2 after:w-[2px] sm:flex",
 				"in-data-[side=left]:cursor-w-resize in-data-[side=right]:cursor-e-resize",
 				"[[data-side=left][data-state=collapsed]_&]:cursor-e-resize [[data-side=right][data-state=collapsed]_&]:cursor-w-resize",
 				"hover:group-data-[collapsible=offcanvas]:bg-sidebar group-data-[collapsible=offcanvas]:translate-x-0 group-data-[collapsible=offcanvas]:after:left-full",
diff --git a/surfsense_web/components/ui/switch.tsx b/surfsense_web/components/ui/switch.tsx
index de2c35fc0e..2ebd7bfd85 100644
--- a/surfsense_web/components/ui/switch.tsx
+++ b/surfsense_web/components/ui/switch.tsx
@@ -19,7 +19,7 @@ const Switch = React.forwardRef<
 	>
 		<SwitchPrimitives.Thumb
 			className={cn(
-				"pointer-events-none block h-5 w-5 rounded-full bg-background shadow-lg ring-0 transition-transform data-[state=checked]:translate-x-5 data-[state=checked]:bg-background data-[state=unchecked]:translate-x-0 data-[state=unchecked]:bg-muted-foreground/40"
+				"pointer-events-none block h-5 w-5 rounded-full bg-background shadow-lg ring-0 transition-transform data-[state=checked]:translate-x-5 data-[state=checked]:bg-[var(--panel)] data-[state=unchecked]:translate-x-0 data-[state=unchecked]:bg-muted-foreground/40"
 			)}
 		/>
 	</SwitchPrimitives.Root>
diff --git a/surfsense_web/components/ui/table-node.tsx b/surfsense_web/components/ui/table-node.tsx
index c0f1d6ea8a..06d1826e92 100644
--- a/surfsense_web/components/ui/table-node.tsx
+++ b/surfsense_web/components/ui/table-node.tsx
@@ -123,7 +123,7 @@ function TableFloatingToolbar({ children, ...props }: React.ComponentProps<typeo
 				{...props}
 			>
 				<Toolbar
-					className="scrollbar-hide flex w-auto max-w-[80vw] flex-row overflow-x-auto rounded-md border bg-popover p-1 shadow-md print:hidden"
+					className="scrollbar-hide flex w-auto max-w-[80vw] flex-row overflow-x-auto rounded-md bg-popover p-1 shadow-md print:hidden"
 					contentEditable={false}
 				>
 					<ToolbarGroup>
diff --git a/surfsense_web/components/ui/textarea.tsx b/surfsense_web/components/ui/textarea.tsx
index 8a2691aab8..8ec113c9e0 100644
--- a/surfsense_web/components/ui/textarea.tsx
+++ b/surfsense_web/components/ui/textarea.tsx
@@ -7,7 +7,7 @@ function Textarea({ className, ...props }: React.ComponentProps<"textarea">) {
 		<textarea
 			data-slot="textarea"
 			className={cn(
-				"border-input placeholder:text-muted-foreground focus-visible:border-ring focus-visible:ring-ring/50 aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive dark:bg-input/30 flex field-sizing-content min-h-16 w-full rounded-md border bg-transparent px-3 py-2 text-base shadow-xs transition-[color,box-shadow] outline-none focus-visible:ring-[3px] disabled:cursor-not-allowed disabled:opacity-50 md:text-sm",
+				"border-popover-border placeholder:text-muted-foreground focus-visible:border-ring focus-visible:ring-ring/50 aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive dark:bg-input/30 flex field-sizing-content min-h-16 w-full rounded-md border bg-transparent px-3 py-2 text-base shadow-xs transition-[color,box-shadow] outline-none focus-visible:ring-[3px] disabled:cursor-not-allowed disabled:opacity-50 md:text-sm",
 				className
 			)}
 			{...props}
diff --git a/surfsense_web/components/ui/toggle-node.tsx b/surfsense_web/components/ui/toggle-node.tsx
index 4d5d617873..f56632200f 100644
--- a/surfsense_web/components/ui/toggle-node.tsx
+++ b/surfsense_web/components/ui/toggle-node.tsx
@@ -3,8 +3,8 @@
 import { useToggleButton, useToggleButtonState } from "@platejs/toggle/react";
 import { ChevronRightIcon } from "lucide-react";
 import { PlateElement, type PlateElementProps } from "platejs/react";
-import * as React from "react";
 
+import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 
 export function ToggleElement({ children, ...props }: PlateElementProps) {
@@ -14,7 +14,9 @@ export function ToggleElement({ children, ...props }: PlateElementProps) {
 
 	return (
 		<PlateElement {...props} className="relative py-1 pl-6">
-			<button
+			<Button
+				variant="ghost"
+				size="icon"
 				className={cn(
 					"absolute top-1.5 left-0 flex size-6 cursor-pointer select-none items-center justify-center rounded-sm text-muted-foreground transition-colors hover:bg-accent hover:text-accent-foreground"
 				)}
@@ -25,7 +27,7 @@ export function ToggleElement({ children, ...props }: PlateElementProps) {
 				<ChevronRightIcon
 					className={cn("size-4 transition-transform duration-200", open && "rotate-90")}
 				/>
-			</button>
+			</Button>
 			<div>{children}</div>
 		</PlateElement>
 	);
diff --git a/surfsense_web/components/ui/toolbar.tsx b/surfsense_web/components/ui/toolbar.tsx
index b030f5c6d7..35ad1352c0 100644
--- a/surfsense_web/components/ui/toolbar.tsx
+++ b/surfsense_web/components/ui/toolbar.tsx
@@ -6,6 +6,7 @@ import { cva, type VariantProps } from "class-variance-authority";
 import { ChevronDown } from "lucide-react";
 import * as React from "react";
 
+import { Button } from "@/components/ui/button";
 import {
 	DropdownMenuLabel,
 	DropdownMenuRadioGroup,
@@ -219,7 +220,8 @@ export function ToolbarSplitButtonSecondary({
 	...props
 }: React.ComponentPropsWithoutRef<"span"> & VariantProps<typeof dropdownArrowVariants>) {
 	return (
-		<button
+		<Button
+			variant="ghost"
 			type="button"
 			className={cn(
 				dropdownArrowVariants({
@@ -233,7 +235,7 @@ export function ToolbarSplitButtonSecondary({
 			{...(props as React.ComponentPropsWithoutRef<"button">)}
 		>
 			<ChevronDown className="size-3.5 text-muted-foreground" data-icon />
-		</button>
+		</Button>
 	);
 }
 
@@ -337,8 +339,7 @@ export function ToolbarMenuGroup({
 			<DropdownMenuSeparator
 				className={cn(
 					"hidden",
-					"mb-0 mx-2 shrink-0 peer-has-[[role=menuitem]]/menu-group:block peer-has-[[role=menuitemradio]]/menu-group:block peer-has-[[role=option]]/menu-group:block",
-					"dark:bg-neutral-700"
+					"mb-0 mx-2 shrink-0 peer-has-[[role=menuitem]]/menu-group:block peer-has-[[role=menuitemradio]]/menu-group:block peer-has-[[role=option]]/menu-group:block"
 				)}
 			/>
 
diff --git a/surfsense_web/components/ui/tooltip.tsx b/surfsense_web/components/ui/tooltip.tsx
index bcf1c72f88..4253c50d48 100644
--- a/surfsense_web/components/ui/tooltip.tsx
+++ b/surfsense_web/components/ui/tooltip.tsx
@@ -6,20 +6,19 @@ import { useEffect, useState } from "react";
 
 import { cn } from "@/lib/utils";
 
-const MOBILE_BREAKPOINT = 768;
-
-function useIsTouchDevice() {
-	const [isTouch, setIsTouch] = useState(false);
+function useCanHover() {
+	const [canHover, setCanHover] = useState(false);
 
 	useEffect(() => {
-		const mql = window.matchMedia(`(max-width: ${MOBILE_BREAKPOINT - 1}px)`);
-		const update = () => setIsTouch(mql.matches);
+		// Hover-capable pointers are a better cross-platform signal than viewport width.
+		const mql = window.matchMedia("(hover: hover) and (pointer: fine)");
+		const update = () => setCanHover(mql.matches);
 		update();
 		mql.addEventListener("change", update);
 		return () => mql.removeEventListener("change", update);
 	}, []);
 
-	return isTouch;
+	return canHover;
 }
 
 function TooltipProvider({
@@ -42,14 +41,14 @@ function Tooltip({
 	onOpenChange,
 	...props
 }: React.ComponentProps<typeof TooltipPrimitive.Root>) {
-	const isMobile = useIsTouchDevice();
+	const canHover = useCanHover();
 
 	return (
 		<TooltipProvider>
 			<TooltipPrimitive.Root
 				data-slot="tooltip"
-				open={isMobile ? false : open}
-				onOpenChange={isMobile ? undefined : onOpenChange}
+				open={canHover ? open : false}
+				onOpenChange={canHover ? onOpenChange : undefined}
 				{...props}
 			/>
 		</TooltipProvider>
@@ -72,7 +71,7 @@ function TooltipContent({
 				data-slot="tooltip-content"
 				sideOffset={sideOffset}
 				className={cn(
-					"bg-black text-white font-medium shadow-xl px-3 py-1.5 dark:bg-zinc-800 dark:text-zinc-50 border-none animate-in fade-in-0 zoom-in-95 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 w-fit rounded-md text-xs text-pretty pointer-events-none select-none",
+					"bg-neutral-950 text-white font-medium px-3 py-1.5 dark:bg-neutral-950 dark:text-white border-none animate-in fade-in-0 zoom-in-95 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 w-fit rounded-md text-xs text-pretty pointer-events-none select-none",
 					className
 				)}
 				{...props}
diff --git a/surfsense_web/content/docs/connectors/baidu-search.mdx b/surfsense_web/content/docs/connectors/baidu-search.mdx
new file mode 100644
index 0000000000..56d048d5bd
--- /dev/null
+++ b/surfsense_web/content/docs/connectors/baidu-search.mdx
@@ -0,0 +1,121 @@
+---
+title: Baidu Search
+description: Search the Chinese web with Baidu AI Search in SurfSense
+---
+
+# Baidu Search Integration Setup Guide
+
+This guide walks you through connecting Baidu AI Search to SurfSense for Chinese web search and AI-powered research.
+
+## How it works
+
+The Baidu Search connector uses Baidu AI Search through Qianfan AppBuilder's intelligent search generation API. It is a live search connector: SurfSense queries Baidu when the assistant needs current web results instead of periodically indexing content into your knowledge base.
+
+- Baidu Search is best for Simplified Chinese queries and China-focused web content.
+- Results are merged with SurfSense's other configured web search engines.
+- The connector returns Baidu references as sources that can be cited in chat responses.
+
+---
+
+## Authorization
+
+<Callout type="info" title="API Key Required">
+You need a Baidu Qianfan AppBuilder API key to use this connector. The key is encrypted and stored securely by SurfSense.
+</Callout>
+
+### Step 1: Get Your Baidu AI Search API Key
+
+1. Open the [Baidu AI Search product page](https://cloud.baidu.com/product/ai-search.html) and sign in with your Baidu Cloud account.
+2. Open Qianfan AppBuilder or the AI Search console from Baidu Cloud.
+3. Create or select an application that has access to Baidu AI Search.
+4. Generate an API key for the application.
+5. Copy the API key. SurfSense uses it as the `BAIDU_API_KEY` connector setting.
+
+<Callout type="warn">
+Keep this key private. Do not paste it into chat messages, issue reports, screenshots, or public repositories.
+</Callout>
+
+---
+
+## Connecting to SurfSense
+
+1. Navigate to **Connectors** → **Add Connector** → **Baidu Search**.
+2. Fill in the required fields:
+
+| Field | Description | Example |
+|-------|-------------|---------|
+| **Connector Name** | A friendly name to identify this connector | `Baidu Search` |
+| **Baidu AppBuilder API Key** | Your Qianfan AppBuilder API key | `bce-v3/...` |
+
+3. Click **Connect** to save the connector.
+4. Ask a current Chinese web query in chat, such as `今天中国人工智能行业有什么重要新闻？`.
+
+### Optional Advanced Settings
+
+SurfSense stores advanced Baidu options in the connector config. If your deployment exposes these fields, use the following values:
+
+| Setting | Description | Default |
+|---------|-------------|---------|
+| `BAIDU_MODEL` | The model Baidu AI Search uses for answer generation | `ernie-3.5-8k` |
+| `BAIDU_SEARCH_SOURCE` | Baidu search source version | `baidu_search_v2` |
+| `BAIDU_ENABLE_DEEP_SEARCH` | Enables Baidu's deeper search mode when supported by your account | `false` |
+
+SurfSense calls Baidu's intelligent search generation endpoint:
+
+```text
+POST https://qianfan.baidubce.com/v2/ai_search/chat/completions
+```
+
+For request and response details, see Baidu's [intelligent search generation API documentation](https://cloud.baidu.com/doc/qianfan/s/Omh4su4s0).
+
+---
+
+## When to Use Baidu Search
+
+| Use Case | Why Baidu Search Helps |
+|----------|------------------------|
+| Chinese news and current events | Better coverage for China-focused sources |
+| Chinese company, product, or policy research | More local web results than global search engines alone |
+| Mandarin-language fact finding | Native Chinese search and summarization behavior |
+| Cross-checking web search | Adds another source alongside SearXNG, Tavily, or Linkup |
+
+<Callout type="info" title="Live Search Connector">
+Baidu Search does not create indexed documents in your knowledge base. It runs when the assistant calls web search, then returns live sources for that answer.
+</Callout>
+
+---
+
+## Troubleshooting
+
+**No Baidu results appear**
+
+- Confirm the Baidu Search connector is active in the current search space.
+- Try a Chinese query with clear search intent, for example `百度智能云千帆 AppBuilder 最新功能`.
+- Check whether other web search engines are returning results. If none are, review the general [Web Search](/docs/how-to/web-search) setup.
+
+**Authentication failed**
+
+- Verify that the API key was copied from Qianfan AppBuilder, not another Baidu Cloud product.
+- Regenerate the API key if it was rotated, expired, or copied with extra whitespace.
+- Make sure the related application has access to Baidu AI Search.
+
+**Requests time out**
+
+- Baidu AI Search can take longer than ordinary keyword search because it performs search and summarization.
+- Retry with a narrower query.
+- If you self-host SurfSense, verify that the backend container can reach `qianfan.baidubce.com`.
+
+**Results are not relevant**
+
+- Use Chinese keywords for China-focused topics.
+- Include entity names, dates, or locations in the query.
+- Compare with SearXNG or another configured live search connector for broader coverage.
+
+---
+
+## Verification Checklist
+
+- The Baidu Search connector appears in your connector list.
+- A Chinese current-events query triggers web search in chat.
+- Chat responses include Baidu-backed sources with titles and URLs.
+- Invalid API keys fail without breaking other configured search engines.
diff --git a/surfsense_web/content/docs/connectors/index.mdx b/surfsense_web/content/docs/connectors/index.mdx
index e3d06aa3cf..9b8fa5f931 100644
--- a/surfsense_web/content/docs/connectors/index.mdx
+++ b/surfsense_web/content/docs/connectors/index.mdx
@@ -83,6 +83,11 @@ Connect SurfSense to your favorite tools and services. Browse the available inte
     description="Connect your GitHub repositories to SurfSense"
     href="/docs/connectors/github"
   />
+  <Card
+    title="Baidu Search"
+    description="Search the Chinese web with Baidu AI Search"
+    href="/docs/connectors/baidu-search"
+  />
   <Card
     title="Luma"
     description="Connect your Luma events to SurfSense"
@@ -105,7 +110,7 @@ Connect SurfSense to your favorite tools and services. Browse the available inte
   />
   <Card
     title="Obsidian"
-    description="Connect your Obsidian vault to SurfSense"
+    description="Sync your Obsidian vault using the SurfSense plugin"
     href="/docs/connectors/obsidian"
   />
   <Card
diff --git a/surfsense_web/content/docs/connectors/meta.json b/surfsense_web/content/docs/connectors/meta.json
index 3a3b6e039d..d616515750 100644
--- a/surfsense_web/content/docs/connectors/meta.json
+++ b/surfsense_web/content/docs/connectors/meta.json
@@ -17,6 +17,7 @@
 		"airtable",
 		"clickup",
 		"github",
+		"baidu-search",
 		"luma",
 		"circleback",
 		"elasticsearch",
diff --git a/surfsense_web/content/docs/connectors/obsidian.mdx b/surfsense_web/content/docs/connectors/obsidian.mdx
index c8475c97fa..19c6a34c68 100644
--- a/surfsense_web/content/docs/connectors/obsidian.mdx
+++ b/surfsense_web/content/docs/connectors/obsidian.mdx
@@ -1,143 +1,73 @@
 ---
 title: Obsidian
-description: Connect your Obsidian vault to SurfSense
+description: Sync your Obsidian vault with the SurfSense plugin
 ---
 
-# Obsidian Integration Setup Guide
+# Obsidian Plugin Setup Guide
 
-This guide walks you through connecting your Obsidian vault to SurfSense for note search and AI-powered insights.
-
-<Callout type="warn">
-    This connector requires direct file system access and only works with self-hosted SurfSense installations.
-</Callout>
+SurfSense integrates with Obsidian through the SurfSense Obsidian plugin.
 
 ## How it works
 
-The Obsidian connector scans your local Obsidian vault directory and indexes all Markdown files. It preserves your note structure and extracts metadata from YAML frontmatter.
+The plugin runs inside your Obsidian app and pushes note updates to SurfSense over HTTPS.
+This works for cloud and self-hosted deployments, including desktop and mobile clients.
 
-- For follow-up indexing runs, the connector uses content hashing to skip unchanged files for faster sync.
-- Indexing should be configured to run periodically, so updates should appear in your search results within minutes.
-
----
-
-## What Gets Indexed
+## What gets indexed
 
 | Content Type | Description |
 |--------------|-------------|
-| Markdown Files | All `.md` files in your vault |
-| Frontmatter | YAML metadata (title, tags, aliases, dates) |
-| Wiki Links | Links between notes (`[[note]]`) |
-| Inline Tags | Tags throughout your notes (`#tag`) |
-| Note Content | Full content with intelligent chunking |
-
-<Callout type="warn">
-    Binary files and attachments are not indexed by default. Enable "Include Attachments" to index embedded files.
-</Callout>
+| Markdown files | Note content (`.md`) |
+| Frontmatter | YAML metadata like title, tags, aliases, dates |
+| Wiki links | Linked notes (`[[note]]`) |
+| Tags | Inline and frontmatter tags |
+| Vault metadata | Vault and path metadata used for deep links and sync state |
 
----
+## Quick start
 
-## Quick Start (Local Installation)
+1. Open **Connectors** in SurfSense and choose **Obsidian**.
+2. Install the plugin (recommended via BRAT) using the steps below.
+3. In Obsidian, open **Settings → SurfSense**.  
+4. Paste your SurfSense API token from the user settings section.
+5. Paste your Server URL in the plugin setting: either your SurfSense main domain (if `/api/v1` rewrites are enabled) or your direct backend URL.
+6. Choose the Search Space in the plugin, then the first sync should run automatically.
+7. Confirm the connector appears as **Obsidian - &lt;vault&gt;** in SurfSense.
 
-1. Navigate to **Connectors** → **Add Connector** → **Obsidian**
-2. Enter your vault path: `/Users/yourname/Documents/MyVault`
-3. Enter a vault name (e.g., `Personal Notes`)
-4. Click **Connect Obsidian**
+## Install via BRAT (recommended)
 
-<Callout type="info">
-    Find your vault path: In Obsidian, right-click any note → "Reveal in Finder" (macOS) or "Show in Explorer" (Windows).
-</Callout>
+1. In Obsidian, open **Settings → Community plugins** and install **[BRAT](obsidian://show-plugin?id=obsidian42-brat)**.
+2. Open BRAT settings and click **Add beta plugin** button.
+3. Paste the repository: `https://github.com/MODSetter/SurfSense/`.
+4. Select the latest plugin version, then click "Add plugin".
+5. Open **Settings → SurfSense** to finish setup.
 
-<Callout type="info" title="Periodic Sync">
-Enable periodic sync to automatically re-index notes when content changes. Available frequencies: Every 5 minutes, 15 minutes, hourly, every 6 hours, daily, or weekly.
-</Callout>
-
----
+## Migrating from the legacy connector
 
-## Docker Setup
+If you previously used the legacy Obsidian connector architecture, migrate to the plugin flow:
 
-For Docker deployments, you need to mount your Obsidian vault as a volume.
+1. Delete the old legacy Obsidian connector from SurfSense.
+2. Install and configure the SurfSense Obsidian plugin using the quick start above.
+3. Run the first plugin sync and verify the new **Obsidian - &lt;vault&gt;** connector is active.
 
-### Step 1: Update docker-compose.yml
-
-Add your vault as a volume mount to the SurfSense backend service:
-
-```yaml
-services:
-  surfsense:
-    # ... other config
-    volumes:
-      - /path/to/your/obsidian/vault:/app/obsidian_vaults/my-vault:ro
-```
-
-<Callout type="info">
-    The `:ro` flag mounts the vault as read-only, which is recommended for security.
+<Callout type="warn">
+    Deleting the legacy connector also deletes all documents that were indexed by that connector. Always finish and verify plugin sync before deleting the old connector.
 </Callout>
 
-### Step 2: Configure the Connector
-
-Use the **container path** (not your local path) when setting up the connector:
-
-| Your Local Path | Container Path (use this) |
-|-----------------|---------------------------|
-| `/Users/john/Documents/MyVault` | `/app/obsidian_vaults/my-vault` |
-| `C:\Users\john\Documents\MyVault` | `/app/obsidian_vaults/my-vault` |
-
-### Example: Multiple Vaults
-
-```yaml
-volumes:
-  - /Users/john/Documents/PersonalNotes:/app/obsidian_vaults/personal:ro
-  - /Users/john/Documents/WorkNotes:/app/obsidian_vaults/work:ro
-```
-
-Then create separate connectors for each vault using `/app/obsidian_vaults/personal` and `/app/obsidian_vaults/work`.
-
----
-
-## Connector Configuration
-
-| Field | Description | Required |
-|-------|-------------|----------|
-| **Connector Name** | A friendly name to identify this connector | Yes |
-| **Vault Path** | Absolute path to your vault (container path for Docker) | Yes |
-| **Vault Name** | Display name for your vault in search results | Yes |
-| **Exclude Folders** | Comma-separated folder names to skip | No |
-| **Include Attachments** | Index embedded files (images, PDFs) | No |
-
----
-
-## Recommended Exclusions
-
-Common folders to exclude from indexing:
-
-| Folder | Reason |
-|--------|--------|
-| `.obsidian` | Obsidian config files (always exclude) |
-| `.trash` | Obsidian's trash folder |
-| `templates` | Template files you don't want searchable |
-| `daily-notes` | If you want to exclude daily notes |
-| `attachments` | If not using "Include Attachments" |
-
-Default exclusions: `.obsidian,.trash`
-
----
-
 ## Troubleshooting
 
-**Vault not found / Permission denied**
-- Verify the path exists and is accessible
-- For Docker: ensure the volume is mounted correctly in `docker-compose.yml`
-- Check file permissions: SurfSense needs read access to the vault directory
+**Plugin connects but no files appear**
+- Verify the plugin is pointed to the correct Search Space.
+- Trigger a manual sync from the plugin settings.
+- Confirm your API token is valid and not expired.
 
-**No notes indexed**
-- Ensure your vault contains `.md` files
-- Check that notes aren't in excluded folders
-- Verify the path points to the vault root (contains `.obsidian` folder)
+**Self-hosted URL issues**
+- Use a public or LAN backend URL that your Obsidian device can reach.
+- If your instance is behind TLS, ensure the URL/certificate is valid for the device running Obsidian.
 
-**Changes not appearing**
-- Wait for the next sync cycle, or manually trigger re-indexing
-- For Docker: restart the container if you modified volume mounts
+**Unauthorized / 401 errors**
+- Regenerate and paste a fresh API token from SurfSense.
+- Ensure the token belongs to the same account and workspace you are syncing into.
 
-**Docker: "path not found" error**
-- Use the container path (`/app/obsidian_vaults/...`), not your local path
-- Verify the volume mount in `docker-compose.yml` matches
+**Cannot reach server URL**
+- Check that the backend URL is reachable from the Obsidian device.
+- For self-hosted setups, verify firewall and reverse proxy rules.
+- Avoid using localhost unless SurfSense and Obsidian run on the same machine.
diff --git a/surfsense_web/content/docs/docker-installation/docker-compose.mdx b/surfsense_web/content/docs/docker-installation/docker-compose.mdx
index bd7f579d00..0155969cd1 100644
--- a/surfsense_web/content/docs/docker-installation/docker-compose.mdx
+++ b/surfsense_web/content/docs/docker-installation/docker-compose.mdx
@@ -71,7 +71,7 @@ Defaults work out of the box. Change `ZERO_ADMIN_PASSWORD` for security in produ
 | `ZERO_UPSTREAM_DB` | PostgreSQL connection URL for replication (must be a direct connection, not via pgbouncer) | *(built from DB_* vars)* |
 | `ZERO_CVR_DB` | PostgreSQL connection URL for client view records | *(built from DB_* vars)* |
 | `ZERO_CHANGE_DB` | PostgreSQL connection URL for replication log entries | *(built from DB_* vars)* |
-| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116) | `zero_publication` |
+| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116, verified by the `migrations` service before `zero-cache` starts) | `zero_publication` |
 | `ZERO_NUM_SYNC_WORKERS` | Number of view-sync worker processes. Must be ≤ connection pool sizes | `4` |
 | `ZERO_UPSTREAM_MAX_CONNS` | Max connections to upstream PostgreSQL for mutations | `20` |
 | `ZERO_CVR_MAX_CONNS` | Max connections to the CVR database | `30` |
@@ -150,7 +150,9 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http
 | Service | Description |
 |---------|-------------|
 | `db` | PostgreSQL with pgvector extension |
+| `migrations` | Short-lived: runs `alembic upgrade head` and verifies `zero_publication`, then exits |
 | `redis` | Message broker for Celery |
+| `searxng` | Local privacy-respecting search backend |
 | `backend` | FastAPI application server |
 | `celery_worker` | Background task processing (document indexing, etc.) |
 | `celery_beat` | Periodic task scheduler (connector sync) |
@@ -159,7 +161,42 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http
 
 All services start automatically with `docker compose up -d`.
 
-The backend includes a health check. Dependent services (workers, frontend) wait until the API is fully ready before starting. You can monitor startup progress with `docker compose ps` (look for `(health: starting)` → `(healthy)`).
+### How startup ordering works
+
+Schema migrations run as a dedicated `migrations` service that exits 0 on
+success and non-zero on failure. Every other backend-image service gates on
+it via `condition: service_completed_successfully`:
+
+```text
+db (healthy) ──▶ migrations (alembic upgrade head + verify zero_publication)
+                     │
+                     ├── exit 0 ─▶ backend ──▶ frontend
+                     │            celery_worker
+                     │            celery_beat
+                     │            zero-cache ──▶ frontend
+                     │
+                     └── exit ≠ 0 ─▶ compose halts the rest of the stack
+```
+
+This guarantees `zero-cache` only starts after `zero_publication` exists in
+Postgres. Before this design, a silent migration failure would leave
+`zero-cache` crash-looping with `Unknown or invalid publications. Specified:
+[zero_publication]. Found: []`.
+
+### Readiness vs liveness
+
+The backend exposes two endpoints:
+
+- `GET /health` — lightweight liveness probe (always returns 200 if the
+  process is up).
+- `GET /ready` — readiness probe that confirms `zero_publication` exists.
+  Returns 503 if not. The compose `backend.healthcheck` uses `/ready` so the
+  container only reports `healthy` once the schema is actually usable by
+  zero-cache.
+
+You can also monitor startup progress with `docker compose ps` (look for
+`(health: starting)` → `(healthy)`). The install script polls these states
+automatically and times out after 5 minutes if the stack does not converge.
 
 ---
 
@@ -188,6 +225,90 @@ docker compose down -v
 
 - **Ports already in use**: Change the relevant `*_PORT` variable in `.env` and restart.
 - **Permission errors on Linux**: You may need to prefix `docker` commands with `sudo`.
-- **Zero-cache not starting**: Check `docker compose logs zero-cache`. Ensure PostgreSQL has `wal_level=logical` (configured automatically by the bundled `postgresql.conf`).
 - **Real-time updates not working**: Open DevTools → Console and check for WebSocket errors. Verify `NEXT_PUBLIC_ZERO_CACHE_URL` matches the running zero-cache address.
 - **Line ending issues on Windows**: Run `git config --global core.autocrlf true` before cloning.
+
+### Migration service exited non-zero
+
+The `migrations` service exits non-zero in two cases:
+
+1. `alembic upgrade head` failed (timeout or SQL error).
+2. `alembic` succeeded but `zero_publication` is still missing from
+   `pg_publication`.
+
+Inspect the logs and the alembic state:
+
+```bash
+docker compose logs migrations
+docker compose exec db psql -U surfsense -d surfsense \
+  -c 'SELECT * FROM alembic_version;'
+docker compose exec db psql -U surfsense -d surfsense \
+  -c 'SELECT pubname FROM pg_publication;'
+```
+
+The default migration timeout is 900 seconds. Slow disks (Windows / WSL2)
+may need more — set `MIGRATION_TIMEOUT` in `.env` to increase it.
+
+### Zero-cache stuck on `Unknown or invalid publications`
+
+Symptom (in `docker compose logs zero-cache`):
+
+```text
+Error: Unknown or invalid publications. Specified: [zero_publication]. Found: []
+```
+
+This means `zero-cache` started before `zero_publication` was created. With
+the current compose files this should be impossible — the `migrations`
+service blocks `zero-cache` from starting. If you see it, your stack
+predates the fix or you brought up `zero-cache` manually with `docker
+compose up zero-cache` before the migrations service ran.
+
+Recovery:
+
+```bash
+docker compose down
+docker volume rm surfsense-zero-cache   # wipe half-built SQLite replica
+docker compose up -d                    # migrations runs first, then zero-cache
+```
+
+The install script (`install.ps1` / `install.sh`) detects this case
+automatically: if it finds a `surfsense-zero-cache` volume from a previous
+install with no matching `surfsense-zero-init` volume, it removes the stale
+volume before bringing the stack up.
+
+### Zero-cache crashes with `_zero.tableMetadata` errors
+
+This indicates a half-initialized SQLite replica left behind by a previous
+crash. The `migrations` service writes a marker file on a shared volume
+(`surfsense-zero-init`) when the publication oid changes; zero-cache wipes
+its replica and re-syncs on next start. If the marker mechanism somehow did
+not trigger, run the recovery one-liner above.
+
+### Ensuring `wal_level = logical`
+
+Logical replication is required by zero-cache. The bundled
+`docker/postgresql.conf` sets `wal_level = logical` automatically. If you
+swap in your own config or use a managed Postgres, confirm with:
+
+```bash
+docker compose exec db psql -U surfsense -d surfsense \
+  -c "SHOW wal_level;"
+```
+
+### Using `docker-compose.deps-only.yml`
+
+`docker-compose.deps-only.yml` runs only the dependencies (Postgres, Redis,
+SearXNG, zero-cache) on Docker while the backend and frontend run on the
+host. Because there is no backend container in this stack, there is no
+`migrations` service either, and you must run alembic on the host **before**
+bringing the stack up:
+
+```bash
+cd surfsense_backend
+uv run alembic upgrade head
+cd ../docker
+docker compose -f docker-compose.deps-only.yml up -d
+```
+
+If you skip the alembic step, `zero-cache` will crash-loop with `Unknown or
+invalid publications. Specified: [zero_publication]`.
diff --git a/surfsense_web/content/docs/how-to/meta.json b/surfsense_web/content/docs/how-to/meta.json
index 477fcafc46..329b7172e7 100644
--- a/surfsense_web/content/docs/how-to/meta.json
+++ b/surfsense_web/content/docs/how-to/meta.json
@@ -1,6 +1,6 @@
 {
 	"title": "How to",
-	"pages": ["zero-sync", "realtime-collaboration", "web-search"],
+	"pages": ["zero-sync", "realtime-collaboration", "web-search", "ollama"],
 	"icon": "Compass",
 	"defaultOpen": false
 }
diff --git a/surfsense_web/content/docs/how-to/ollama.mdx b/surfsense_web/content/docs/how-to/ollama.mdx
new file mode 100644
index 0000000000..48b2317056
--- /dev/null
+++ b/surfsense_web/content/docs/how-to/ollama.mdx
@@ -0,0 +1,90 @@
+---
+title: Connect Ollama
+description: Simple setup guide for using Ollama with SurfSense across local, Docker, remote, and cloud setups
+---
+
+# Connect Ollama
+
+Use this page to choose the correct **API Base URL** when adding an Ollama provider in SurfSense.
+
+## 1) Pick your API Base URL
+
+| Ollama location | SurfSense location | API Base URL |
+|---|---|---|
+| Same machine | No Docker | `http://localhost:11434` |
+| Host machine (macOS/Windows) | Docker Desktop | `http://host.docker.internal:11434` |
+| Host machine (Linux) | Docker Compose | `http://host.docker.internal:11434` |
+| Same Docker Compose stack | Docker Compose | `http://ollama:11434` |
+| Another machine in your network | Any | `http://<lan-ip>:11434` |
+| Public Ollama endpoint / proxy / cloud | Any | `http(s)://<your-domain-or-endpoint>` |
+
+If SurfSense runs in Docker, do not use `localhost` unless Ollama is in the same container.
+
+## 2) Add Ollama in SurfSense
+
+Go to **Search Space Settings -> Agent Models -> Add Model** and set:
+
+- Provider: `OLLAMA`
+- Model name: your model tag, for example `llama3.2` or `qwen3:8b`
+- API Base URL: from the table above
+- API key:
+  - local/self-hosted Ollama: any non-empty value
+  - Ollama cloud/proxied auth: real key or token required by that endpoint
+
+Save. SurfSense validates the connection immediately.
+
+## 3) Common setups
+
+### A) SurfSense in Docker Desktop, Ollama on your host
+
+Use:
+
+```text
+http://host.docker.internal:11434
+```
+
+### B) Ollama as a service in the same Compose
+
+Use API Base URL:
+
+```text
+http://ollama:11434
+```
+
+Minimal service example:
+
+```yaml
+ollama:
+  image: ollama/ollama:latest
+  volumes:
+    - ollama_data:/root/.ollama
+  ports:
+    - "11434:11434"
+```
+
+### C) Ollama on another machine
+
+Ollama binds to `127.0.0.1` by default. Make it reachable on the network:
+
+- Set `OLLAMA_HOST=0.0.0.0:11434` on the machine/service running Ollama
+- Open firewall port `11434`
+- Use `http://<lan-ip>:11434` in SurfSense's API Base URL
+
+## 4) Quick troubleshooting
+
+| Error | Cause | Fix |
+|---|---|---|
+| `Cannot connect to host localhost:11434` | Wrong URL from Dockerized backend | Use `host.docker.internal` or `ollama` |
+| `Cannot connect to host <lan-ip>:11434` | Ollama not exposed on network or firewall blocked | Set `OLLAMA_HOST=0.0.0.0:11434`, allow port 11434 |
+| URL starts with `/%20http://...` | Leading space in URL | Re-enter API Base URL without spaces |
+| `model not found` | Model not pulled on Ollama | Run `ollama pull <model>` |
+
+If needed, test from the backend container using the same host you put in **API Base URL**:
+
+```bash
+docker compose exec backend curl -v <YOUR_API_BASE_URL>/api/tags
+```
+
+## See also
+
+- [Docker Installation](/docs/docker-installation/docker-compose)
\ No newline at end of file
diff --git a/surfsense_web/content/docs/how-to/web-search.mdx b/surfsense_web/content/docs/how-to/web-search.mdx
index edcd28522e..cbea11d369 100644
--- a/surfsense_web/content/docs/how-to/web-search.mdx
+++ b/surfsense_web/content/docs/how-to/web-search.mdx
@@ -7,6 +7,8 @@ description: How SurfSense web search works and how to configure it for producti
 
 SurfSense uses [SearXNG](https://docs.searxng.org/) as a bundled meta-search engine to provide web search across all search spaces. SearXNG aggregates results from multiple search engines (Google, DuckDuckGo, Brave, Bing, and more) without requiring any API keys.
 
+You can also add live search connectors such as Baidu Search, Tavily, and Linkup to a search space. When those connectors are active, SurfSense queries them in parallel with SearXNG and merges the results before passing sources to the assistant.
+
 ## How It Works
 
 When a user triggers a web search in SurfSense:
@@ -14,10 +16,25 @@ When a user triggers a web search in SurfSense:
 1. The backend sends a query to the bundled SearXNG instance via its JSON API
 2. SearXNG fans out the query to all enabled search engines simultaneously
 3. Results are aggregated, deduplicated, and ranked by engine weight
-4. The backend receives merged results and presents them to the user
+4. If the current search space has live search connectors, the backend queries them in parallel
+5. The backend deduplicates the merged results and presents them to the user
 
 SearXNG runs as a Docker container alongside the backend. It is never exposed to the internet. Only the backend communicates with it over the internal Docker network.
 
+## Live Search Connectors
+
+Live search connectors are optional API-backed search providers configured per search space. They are useful when you need a specialized index, authenticated search API, or stronger regional coverage.
+
+| Connector | Best For | Setup |
+|-----------|----------|-------|
+| Baidu Search | Chinese web search and China-focused current information | [Baidu Search connector](/docs/connectors/baidu-search) |
+| Tavily | General web research through Tavily's search API | Add the Tavily connector from the Connectors dashboard |
+| Linkup | General web search through Linkup's search API | Add the Linkup connector from the Connectors dashboard |
+
+<Callout type="info" title="Search Space Scoped">
+Live search connectors only run for the search space where they are configured. They do not replace SearXNG globally.
+</Callout>
+
 ## Docker Setup
 
 SearXNG is included in both `docker-compose.yml` and `docker-compose.dev.yml` and works out of the box with no configuration needed.
diff --git a/surfsense_web/content/docs/how-to/zero-sync.mdx b/surfsense_web/content/docs/how-to/zero-sync.mdx
index e764d384a9..1a7762f239 100644
--- a/surfsense_web/content/docs/how-to/zero-sync.mdx
+++ b/surfsense_web/content/docs/how-to/zero-sync.mdx
@@ -43,10 +43,31 @@ zero-cache is included in the Docker Compose setup. The key environment variable
 
 ### Manual / Local Development
 
-If running the frontend outside Docker (e.g., `pnpm dev`), you need:
-
-1. A running zero-cache instance pointing at your PostgreSQL database
-2. `NEXT_PUBLIC_ZERO_CACHE_URL` set in your `.env` file (default: `http://localhost:4848`)
+If running the frontend outside Docker (e.g. `pnpm dev`), you need:
+
+1. **A running zero-cache instance** pointing at your PostgreSQL database. The easiest path is the official Docker image:
+
+    ```bash
+    docker run -d --name surfsense-zero-cache \
+      -p 4848:4848 \
+      --add-host=host.docker.internal:host-gateway \
+      -e ZERO_UPSTREAM_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" \
+      -e ZERO_CVR_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" \
+      -e ZERO_CHANGE_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" \
+      -e ZERO_REPLICA_FILE=/data/zero.db \
+      -e ZERO_ADMIN_PASSWORD=surfsense-zero-admin \
+      -e ZERO_APP_PUBLICATIONS=zero_publication \
+      -e ZERO_QUERY_URL="http://host.docker.internal:3000/api/zero/query" \
+      -e ZERO_MUTATE_URL="http://host.docker.internal:3000/api/zero/mutate" \
+      -v surfsense-zero-cache:/data \
+      rocicorp/zero:1.4.0
+    ```
+
+    Run `uv run alembic upgrade head` from `surfsense_backend/` **before** starting this container so the `zero_publication` exists.
+2. **`NEXT_PUBLIC_ZERO_CACHE_URL`** set in `surfsense_web/.env` (default: `http://localhost:4848`).
+3. **`wal_level = logical`** in your PostgreSQL config (see [Manual Installation → Configure PostgreSQL for Zero Sync](/docs/manual-installation#3-configure-postgresql-for-zero-sync)).
+
+For the full manual setup walkthrough, see the [Manual Installation guide](/docs/manual-installation).
 
 ### Custom Domain / Reverse Proxy
 
diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx
index f9eed94879..599cb6238b 100644
--- a/surfsense_web/content/docs/manual-installation.mdx
+++ b/surfsense_web/content/docs/manual-installation.mdx
@@ -15,9 +15,11 @@ Before beginning the manual installation, ensure you have the following installe
 ### Required Software
 - **Python 3.12+** - Backend runtime environment
 - **Node.js 20+** - Frontend runtime environment  
-- **PostgreSQL 14+** - Database server
+- **PostgreSQL 14+** - Database server (must be configured with `wal_level = logical` for [Zero real-time sync](/docs/how-to/zero-sync))
 - **PGVector** - PostgreSQL extension for vector similarity search
 - **Redis** - Message broker for Celery task queue
+- **Zero-cache** - Rocicorp Zero real-time sync server (run via Docker; see [Zero-Cache Setup](#zero-cache-setup) below)
+- **Docker** - Required to run zero-cache (the simplest way; the Postgres + Redis can be installed natively)
 - **Git** - Version control (to clone the repository)
 
 ### Required Services & API Keys
@@ -212,7 +214,82 @@ uv sync
 uv sync
 ```
 
-### 3. Start Redis Server
+### 3. Configure PostgreSQL for Zero Sync
+
+SurfSense uses [Rocicorp Zero](https://zero.rocicorp.dev/) for real-time data synchronization (notifications, document status, chat comments, indexing progress). Zero replicates data from PostgreSQL via **logical replication**, which requires a one-time PostgreSQL configuration change.
+
+Edit your `postgresql.conf` (typical locations: `/etc/postgresql/<version>/main/postgresql.conf` on Linux, `/usr/local/var/postgres/postgresql.conf` on macOS via Homebrew, `C:\Program Files\PostgreSQL\<version>\data\postgresql.conf` on Windows) and set:
+
+```ini
+wal_level = logical
+max_replication_slots = 10
+max_wal_senders = 10
+```
+
+Then restart PostgreSQL:
+
+**Linux:**
+
+```bash
+sudo systemctl restart postgresql
+```
+
+**macOS (Homebrew):**
+
+```bash
+brew services restart postgresql
+```
+
+**Windows (PowerShell, replace `17` with your major version):**
+
+```powershell
+Restart-Service postgresql-x64-17
+```
+
+Verify the change:
+
+```bash
+psql -U postgres -d surfsense -c "SHOW wal_level;"
+# Should return: logical
+```
+
+**Managed databases (RDS, Supabase, Cloud SQL, etc.):** Enable logical replication via your provider's parameter group (e.g. `rds.logical_replication=1` on RDS) and grant your database user the `REPLICATION` privilege:
+
+```sql
+ALTER USER surfsense WITH REPLICATION;
+GRANT CREATE ON DATABASE surfsense TO surfsense;
+```
+
+### 4. Run Database Migrations
+
+Before starting the backend, run Alembic migrations. This creates the schema **and** the `zero_publication` that zero-cache needs to start. Skipping this step will cause zero-cache to crash-loop with `Unknown or invalid publications. Specified: [zero_publication]`.
+
+**If using uv:**
+
+```bash
+# From surfsense_backend/
+uv run alembic upgrade head
+```
+
+**If using pip/venv:**
+
+```bash
+# Activate virtual environment first
+source .venv/bin/activate  # Linux/macOS
+# OR
+.venv\Scripts\activate     # Windows
+
+alembic upgrade head
+```
+
+Verify the publication was created:
+
+```bash
+psql -U postgres -d surfsense -c "SELECT pubname FROM pg_publication;"
+# Should include: zero_publication
+```
+
+### 5. Start Redis Server
 
 Redis is required for Celery task queue. Start the Redis server:
 
@@ -271,7 +348,7 @@ redis-cli ping
 # Should return: PONG
 ```
 
-### 4. Start Celery Worker
+### 6. Start Celery Worker
 
 In a new terminal window, start the Celery worker to handle background tasks:
 
@@ -316,7 +393,7 @@ celery -A celery_worker.celery_app flower --port=5555
 
 Access Flower at [http://localhost:5555](http://localhost:5555) to monitor your Celery tasks.
 
-### 5. Start Celery Beat (Scheduler)
+### 7. Start Celery Beat (Scheduler)
 
 In another new terminal window, start Celery Beat to enable periodic tasks (like scheduled connector indexing):
 
@@ -347,7 +424,7 @@ celery -A celery_worker.celery_app beat --loglevel=info
 
 **Important**: Celery Beat is required for the periodic indexing functionality to work. Without it, scheduled connector tasks won't run automatically. The schedule interval can be configured using the `SCHEDULE_CHECKER_INTERVAL` environment variable.
 
-### 6. Run the Backend
+### 8. Run the Backend
 
 Start the backend server:
 
@@ -378,6 +455,92 @@ python main.py --reload
 
 If everything is set up correctly, you should see output indicating the server is running on `http://localhost:8000`.
 
+## Zero-Cache Setup
+
+**zero-cache** is the Rocicorp Zero server that sits between PostgreSQL and the browser. It streams real-time updates (notifications, document indexing status, chat comments, collaboration indicators) to all connected clients via WebSocket. The frontend connects to it on startup — without zero-cache running, you will not see live updates and many parts of the UI will sit on stale data.
+
+For an overview of how Zero works and the list of synced tables, see the [Real-Time Sync with Zero](/docs/how-to/zero-sync) guide.
+
+### 1. Run Zero-Cache via Docker
+
+The simplest way to run zero-cache is the official Docker image. Open a new terminal:
+
+**Linux/macOS:**
+
+```bash
+docker run -d --name surfsense-zero-cache \
+  -p 4848:4848 \
+  --add-host=host.docker.internal:host-gateway \
+  -e ZERO_UPSTREAM_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" \
+  -e ZERO_CVR_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" \
+  -e ZERO_CHANGE_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" \
+  -e ZERO_REPLICA_FILE=/data/zero.db \
+  -e ZERO_ADMIN_PASSWORD=surfsense-zero-admin \
+  -e ZERO_APP_PUBLICATIONS=zero_publication \
+  -e ZERO_NUM_SYNC_WORKERS=4 \
+  -e ZERO_UPSTREAM_MAX_CONNS=20 \
+  -e ZERO_CVR_MAX_CONNS=30 \
+  -e ZERO_QUERY_URL="http://host.docker.internal:3000/api/zero/query" \
+  -e ZERO_MUTATE_URL="http://host.docker.internal:3000/api/zero/mutate" \
+  -v surfsense-zero-cache:/data \
+  rocicorp/zero:1.4.0
+```
+
+**Windows (PowerShell):**
+
+```powershell
+docker run -d --name surfsense-zero-cache `
+  -p 4848:4848 `
+  --add-host=host.docker.internal:host-gateway `
+  -e ZERO_UPSTREAM_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" `
+  -e ZERO_CVR_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" `
+  -e ZERO_CHANGE_DB="postgresql://postgres:postgres@host.docker.internal:5432/surfsense?sslmode=disable" `
+  -e ZERO_REPLICA_FILE=/data/zero.db `
+  -e ZERO_ADMIN_PASSWORD=surfsense-zero-admin `
+  -e ZERO_APP_PUBLICATIONS=zero_publication `
+  -e ZERO_NUM_SYNC_WORKERS=4 `
+  -e ZERO_UPSTREAM_MAX_CONNS=20 `
+  -e ZERO_CVR_MAX_CONNS=30 `
+  -e ZERO_QUERY_URL="http://host.docker.internal:3000/api/zero/query" `
+  -e ZERO_MUTATE_URL="http://host.docker.internal:3000/api/zero/mutate" `
+  -v surfsense-zero-cache:/data `
+  rocicorp/zero:1.4.0
+```
+
+**Adjustments to make for your setup:**
+
+- Replace `postgres:postgres` in the connection URLs with your actual `DB_USER:DB_PASSWORD`.
+- On Linux without Docker Desktop, `host.docker.internal` may not resolve. Either keep the `--add-host=host.docker.internal:host-gateway` flag (Docker 20.10+) or replace `host.docker.internal` with your host's IP / `--network=host` + `localhost`.
+- For production / custom domains, set `ZERO_QUERY_URL` and `ZERO_MUTATE_URL` to your public frontend URL (e.g. `https://app.yourdomain.com/api/zero/query`).
+
+### 2. Verify Zero-Cache
+
+Confirm zero-cache is healthy:
+
+```bash
+curl http://localhost:4848/keepalive
+# Should return HTTP 200
+```
+
+Tail the logs to confirm initial replication completed without errors:
+
+```bash
+docker logs -f surfsense-zero-cache
+```
+
+### Alternative: Use `docker-compose.deps-only.yml`
+
+If you would rather have Docker manage Postgres, Redis, SearXNG, and zero-cache together (while still running the backend and frontend natively), the repository ships a deps-only compose file. **Run alembic migrations on the host first** so `zero_publication` exists before zero-cache starts:
+
+```bash
+cd surfsense_backend
+uv run alembic upgrade head
+cd ../docker
+docker compose -f docker-compose.deps-only.yml up -d
+```
+
+The deps-only stack exposes zero-cache on port `4848` (default) — keep `NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:4848` in your `surfsense_web/.env`.
+
 ## Frontend Setup
 
 ### 1. Environment Configuration
@@ -510,9 +673,10 @@ Load the extension in your browser's developer mode and configure it with your S
 To verify your installation:
 
 1. Open your browser and navigate to `http://localhost:3000`
-2. Sign in with your Google account
+2. Sign in with your Google account (or local credentials if `AUTH_TYPE=LOCAL`)
 3. Create a search space and try uploading a document
-4. Test the chat functionality with your uploaded content
+4. Watch the upload status update live without refreshing — this confirms zero-cache is wired up correctly
+5. Test the chat functionality with your uploaded content
 
 ## Troubleshooting
 
@@ -522,6 +686,11 @@ To verify your installation:
 - **Authentication Problems**: Check your Google OAuth configuration and ensure redirect URIs are set correctly
 - **LLM Errors**: Confirm your LLM API keys are valid and the selected models are accessible
 - **File Upload Failures**: Validate your ETL service API key (Unstructured.io or LlamaCloud) or ensure Docling is properly configured
+- **Real-time updates not working / stale UI**: Verify zero-cache is running (`curl http://localhost:4848/keepalive` returns 200). Open browser DevTools → Console and look for WebSocket errors. Confirm `NEXT_PUBLIC_ZERO_CACHE_URL` in `surfsense_web/.env` matches the running zero-cache address.
+- **Zero-cache stuck on `Unknown or invalid publications. Specified: [zero_publication]`**: You skipped (or never ran) `uv run alembic upgrade head` from `surfsense_backend/`. Run it, then restart the zero-cache container with `docker restart surfsense-zero-cache`.
+- **Zero-cache crashes with `_zero.tableMetadata` errors**: A previous run left a half-built SQLite replica behind. Stop the container, remove the volume, and start fresh: `docker rm -f surfsense-zero-cache && docker volume rm surfsense-zero-cache && docker run -d ...` (re-run the command from [Zero-Cache Setup](#zero-cache-setup)).
+- **`wal_level` is not set to `logical`**: zero-cache requires logical replication. Set `wal_level = logical` in `postgresql.conf`, restart PostgreSQL, and verify with `SHOW wal_level;` in psql.
+- **Backend `/ready` returns 503**: The readiness probe verifies `zero_publication` exists. Run `uv run alembic upgrade head` to create it.
 - **Windows-specific**: If you encounter path issues, ensure you're using the correct path separator (`\` instead of `/`)
 - **macOS-specific**: If you encounter permission issues, you may need to use `sudo` for some installation commands
 
diff --git a/surfsense_web/contexts/anonymous-mode.tsx b/surfsense_web/contexts/anonymous-mode.tsx
new file mode 100644
index 0000000000..eaf0996c33
--- /dev/null
+++ b/surfsense_web/contexts/anonymous-mode.tsx
@@ -0,0 +1,74 @@
+"use client";
+
+import { createContext, type ReactNode, useContext, useEffect, useMemo, useState } from "react";
+import { anonymousChatApiService } from "@/lib/apis/anonymous-chat-api.service";
+
+export interface AnonymousModeContextValue {
+	isAnonymous: true;
+	modelSlug: string;
+	setModelSlug: (slug: string) => void;
+	uploadedDoc: { filename: string; sizeBytes: number } | null;
+	setUploadedDoc: (doc: { filename: string; sizeBytes: number } | null) => void;
+	resetKey: number;
+	resetChat: () => void;
+}
+
+interface AuthenticatedContextValue {
+	isAnonymous: false;
+}
+
+type ContextValue = AnonymousModeContextValue | AuthenticatedContextValue;
+
+const DEFAULT_VALUE: AuthenticatedContextValue = { isAnonymous: false };
+
+const AnonymousModeContext = createContext<ContextValue>(DEFAULT_VALUE);
+
+export function AnonymousModeProvider({
+	initialModelSlug,
+	children,
+}: {
+	initialModelSlug: string;
+	children: ReactNode;
+}) {
+	const [modelSlug, setModelSlug] = useState(initialModelSlug);
+	const [uploadedDoc, setUploadedDoc] = useState<{ filename: string; sizeBytes: number } | null>(
+		null
+	);
+	const [resetKey, setResetKey] = useState(0);
+
+	const resetChat = () => setResetKey((k) => k + 1);
+
+	useEffect(() => {
+		anonymousChatApiService
+			.getDocument()
+			.then((doc) => {
+				if (doc) {
+					setUploadedDoc({ filename: doc.filename, sizeBytes: doc.size_bytes });
+				}
+			})
+			.catch(() => {});
+	}, []);
+
+	const value = useMemo<AnonymousModeContextValue>(
+		() => ({
+			isAnonymous: true,
+			modelSlug,
+			setModelSlug,
+			uploadedDoc,
+			setUploadedDoc,
+			resetKey,
+			resetChat,
+		}),
+		[modelSlug, uploadedDoc, resetKey]
+	);
+
+	return <AnonymousModeContext.Provider value={value}>{children}</AnonymousModeContext.Provider>;
+}
+
+export function useAnonymousMode(): ContextValue {
+	return useContext(AnonymousModeContext);
+}
+
+export function useIsAnonymous(): boolean {
+	return useContext(AnonymousModeContext).isAnonymous;
+}
diff --git a/surfsense_web/contexts/login-gate.tsx b/surfsense_web/contexts/login-gate.tsx
new file mode 100644
index 0000000000..f72cb3a42e
--- /dev/null
+++ b/surfsense_web/contexts/login-gate.tsx
@@ -0,0 +1,84 @@
+"use client";
+
+import Link from "next/link";
+import { createContext, type ReactNode, useCallback, useContext, useState } from "react";
+import { Button } from "@/components/ui/button";
+import {
+	Dialog,
+	DialogContent,
+	DialogDescription,
+	DialogFooter,
+	DialogHeader,
+	DialogTitle,
+} from "@/components/ui/dialog";
+import { useIsAnonymous } from "./anonymous-mode";
+
+interface LoginGateContextValue {
+	gate: (feature: string) => void;
+}
+
+const LoginGateContext = createContext<LoginGateContextValue>({
+	gate: () => {},
+});
+
+export function LoginGateProvider({ children }: { children: ReactNode }) {
+	const isAnonymous = useIsAnonymous();
+	const [feature, setFeature] = useState<string | null>(null);
+
+	const gate = useCallback(
+		(feat: string) => {
+			if (isAnonymous) {
+				setFeature(feat);
+			}
+		},
+		[isAnonymous]
+	);
+
+	const close = () => setFeature(null);
+
+	return (
+		<LoginGateContext.Provider value={{ gate }}>
+			{children}
+			<Dialog open={feature !== null} onOpenChange={(open) => !open && close()}>
+				<DialogContent className="sm:max-w-md">
+					<DialogHeader>
+						<DialogTitle>Create a free account to {feature}</DialogTitle>
+						<DialogDescription>
+							Get $5 of premium credit, save chat history, upload documents, use all AI tools, and
+							connect 30+ integrations.
+						</DialogDescription>
+					</DialogHeader>
+					<DialogFooter className="flex flex-col gap-2 sm:flex-row">
+						<Button asChild>
+							<Link href="/register">Create Free Account</Link>
+						</Button>
+						<Button variant="outline" asChild>
+							<Link href="/login">Log In</Link>
+						</Button>
+					</DialogFooter>
+				</DialogContent>
+			</Dialog>
+		</LoginGateContext.Provider>
+	);
+}
+
+export function useLoginGate(): LoginGateContextValue {
+	return useContext(LoginGateContext);
+}
+
+/**
+ * Returns a click handler that triggers the login gate when anonymous,
+ * or calls the original handler when authenticated.
+ */
+export function useGatedHandler(handler: (() => void) | undefined, feature: string): () => void {
+	const { gate } = useLoginGate();
+	const isAnonymous = useIsAnonymous();
+
+	return useCallback(() => {
+		if (isAnonymous) {
+			gate(feature);
+		} else {
+			handler?.();
+		}
+	}, [isAnonymous, gate, feature, handler]);
+}
diff --git a/surfsense_web/contracts/enums/toolIcons.tsx b/surfsense_web/contracts/enums/toolIcons.tsx
index 8e5d1e4523..bb87be0ba8 100644
--- a/surfsense_web/contracts/enums/toolIcons.tsx
+++ b/surfsense_web/contracts/enums/toolIcons.tsx
@@ -1,32 +1,223 @@
 import {
 	BookOpen,
 	Brain,
+	Calendar,
+	FileEdit,
+	FilePlus,
 	FileText,
+	FileUser,
+	FileX,
 	Film,
+	FolderPlus,
+	FolderTree,
+	FolderX,
 	Globe,
 	ImageIcon,
+	ListTodo,
 	type LucideIcon,
+	Mail,
+	MessagesSquare,
+	Move,
+	Plus,
 	Podcast,
 	ScanLine,
+	Search,
+	SearchCheck,
+	Send,
+	Trash2,
 	Wrench,
 } from "lucide-react";
 
+/**
+ * Every tool now renders a card via ``ToolFallback``. The icon map is
+ * keyed on the canonical backend tool name (registered in
+ * ``surfsense_backend/app/agents/new_chat/tools/registry.py``); unknown
+ * names fall back to the generic ``Wrench`` icon so the card still
+ * communicates "this is a tool call".
+ */
 const TOOL_ICONS: Record<string, LucideIcon> = {
+	// Generators
 	generate_podcast: Podcast,
 	generate_video_presentation: Film,
 	generate_report: FileText,
+	generate_resume: FileUser,
 	generate_image: ImageIcon,
+	display_image: ImageIcon,
+	// Web / search
 	scrape_webpage: ScanLine,
 	web_search: Globe,
 	search_surfsense_docs: BookOpen,
-	save_memory: Brain,
-	recall_memory: Brain,
+	// Memory
+	update_memory: Brain,
+	// Filesystem (built-in deepagent + middleware)
+	read_file: FileText,
+	write_file: FilePlus,
+	edit_file: FileEdit,
+	move_file: Move,
+	rm: FileX,
+	rmdir: FolderX,
+	mkdir: FolderPlus,
+	ls: FolderTree,
+	write_todos: ListTodo,
+	// Calendar
+	search_calendar_events: Search,
+	create_calendar_event: Calendar,
+	update_calendar_event: Calendar,
+	delete_calendar_event: Calendar,
+	// Gmail
+	search_gmail: Search,
+	read_gmail_email: Mail,
+	create_gmail_draft: Mail,
+	update_gmail_draft: FileEdit,
+	send_gmail_email: Send,
+	trash_gmail_email: Trash2,
+	// Notion / Confluence pages
+	create_notion_page: FilePlus,
+	update_notion_page: FileEdit,
+	delete_notion_page: FileX,
+	create_confluence_page: FilePlus,
+	update_confluence_page: FileEdit,
+	delete_confluence_page: FileX,
+	// Linear / Jira issues
+	create_linear_issue: Plus,
+	update_linear_issue: FileEdit,
+	delete_linear_issue: Trash2,
+	create_jira_issue: Plus,
+	update_jira_issue: FileEdit,
+	delete_jira_issue: Trash2,
+	// Drive-like file connectors
+	create_google_drive_file: FilePlus,
+	delete_google_drive_file: FileX,
+	create_dropbox_file: FilePlus,
+	delete_dropbox_file: FileX,
+	create_onedrive_file: FilePlus,
+	delete_onedrive_file: FileX,
+	// Chat connectors
+	list_discord_channels: MessagesSquare,
+	read_discord_messages: MessagesSquare,
+	send_discord_message: Send,
+	list_teams_channels: MessagesSquare,
+	read_teams_messages: MessagesSquare,
+	send_teams_message: Send,
+	// Luma
+	list_luma_events: Calendar,
+	read_luma_event: Calendar,
+	create_luma_event: Calendar,
+	// Misc
+	get_connected_accounts: SearchCheck,
+	execute: Wrench,
+	execute_code: Wrench,
 };
 
 export function getToolIcon(name: string): LucideIcon {
 	return TOOL_ICONS[name] ?? Wrench;
 }
 
+/**
+ * Friendly display names for tools shown in the chat UI.
+ *
+ * Most users aren't engineers; they shouldn't see raw unix-style
+ * identifiers like ``rm`` / ``rmdir`` / ``ls`` / ``grep`` / ``glob`` or
+ * snake_cased function names. The map below renders each tool with
+ * plain English wording (verb + object) so non-technical users
+ * understand what the agent is doing at a glance.
+ *
+ * Unmapped tool names fall back to a snake_case-to-Title-Case
+ * conversion via :func:`getToolDisplayName`.
+ */
+const TOOL_DISPLAY_NAMES: Record<string, string> = {
+	// Filesystem / knowledge base
+	read_file: "Read file",
+	write_file: "Write file",
+	edit_file: "Edit file",
+	move_file: "Move file",
+	rm: "Delete file",
+	rmdir: "Delete folder",
+	mkdir: "Create folder",
+	ls: "List files",
+	glob: "Find files",
+	grep: "Search in files",
+	write_todos: "Plan tasks",
+	save_document: "Save document",
+	// Generators
+	generate_podcast: "Generate podcast",
+	generate_video_presentation: "Generate video presentation",
+	generate_report: "Generate report",
+	generate_resume: "Generate resume",
+	generate_image: "Generate image",
+	display_image: "Show image",
+	// Web / search
+	scrape_webpage: "Read webpage",
+	web_search: "Search the web",
+	search_surfsense_docs: "Search knowledge base",
+	// Memory
+	update_memory: "Update memory",
+	// Calendar
+	search_calendar_events: "Search calendar",
+	create_calendar_event: "Create event",
+	update_calendar_event: "Update event",
+	delete_calendar_event: "Delete event",
+	// Gmail
+	search_gmail: "Search Gmail",
+	read_gmail_email: "Read email",
+	create_gmail_draft: "Draft email",
+	update_gmail_draft: "Update draft",
+	send_gmail_email: "Send email",
+	trash_gmail_email: "Move email to trash",
+	// Notion
+	create_notion_page: "Create Notion page",
+	update_notion_page: "Update Notion page",
+	delete_notion_page: "Delete Notion page",
+	// Confluence
+	create_confluence_page: "Create Confluence page",
+	update_confluence_page: "Update Confluence page",
+	delete_confluence_page: "Delete Confluence page",
+	// Linear
+	create_linear_issue: "Create Linear issue",
+	update_linear_issue: "Update Linear issue",
+	delete_linear_issue: "Delete Linear issue",
+	// Jira
+	create_jira_issue: "Create Jira issue",
+	update_jira_issue: "Update Jira issue",
+	delete_jira_issue: "Delete Jira issue",
+	// Drive-like file connectors
+	create_google_drive_file: "Create Google Drive file",
+	delete_google_drive_file: "Delete Google Drive file",
+	create_dropbox_file: "Create Dropbox file",
+	delete_dropbox_file: "Delete Dropbox file",
+	create_onedrive_file: "Create OneDrive file",
+	delete_onedrive_file: "Delete OneDrive file",
+	// Discord
+	list_discord_channels: "List Discord channels",
+	read_discord_messages: "Read Discord messages",
+	send_discord_message: "Send Discord message",
+	// Teams
+	list_teams_channels: "List Teams channels",
+	read_teams_messages: "Read Teams messages",
+	send_teams_message: "Send Teams message",
+	// Luma
+	list_luma_events: "List Luma events",
+	read_luma_event: "Read Luma event",
+	create_luma_event: "Create Luma event",
+	// Misc
+	get_connected_accounts: "Discover connected accounts",
+	execute: "Run command",
+	execute_code: "Run code",
+};
+
+/**
+ * Format a tool's canonical (snake_case) name for display in the chat UI.
+ *
+ * Looks up :data:`TOOL_DISPLAY_NAMES` first; falls back to a
+ * snake_case-to-Title-Case rewrite for tools that don't have a curated
+ * label (e.g. dynamically registered MCP tools).
+ */
+export function getToolDisplayName(name: string): string {
+	const friendly = TOOL_DISPLAY_NAMES[name];
+	if (friendly) return friendly;
+	return name.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
+}
+
 export const CONNECTOR_TOOL_ICON_PATHS: Record<string, { src: string; alt: string }> = {
 	gmail: { src: "/connectors/google-gmail.svg", alt: "Gmail" },
 	google_calendar: { src: "/connectors/google-calendar.svg", alt: "Google Calendar" },
@@ -37,6 +228,9 @@ export const CONNECTOR_TOOL_ICON_PATHS: Record<string, { src: string; alt: strin
 	linear: { src: "/connectors/linear.svg", alt: "Linear" },
 	jira: { src: "/connectors/jira.svg", alt: "Jira" },
 	confluence: { src: "/connectors/confluence.svg", alt: "Confluence" },
+	discord: { src: "/connectors/discord.svg", alt: "Discord" },
+	teams: { src: "/connectors/microsoft-teams.svg", alt: "Microsoft Teams" },
+	luma: { src: "/connectors/luma.svg", alt: "Luma" },
 };
 
 export const CONNECTOR_ICON_TO_TYPES: Record<string, string[]> = {
@@ -49,4 +243,7 @@ export const CONNECTOR_ICON_TO_TYPES: Record<string, string[]> = {
 	linear: ["LINEAR_CONNECTOR"],
 	jira: ["JIRA_CONNECTOR"],
 	confluence: ["CONFLUENCE_CONNECTOR"],
+	discord: ["DISCORD_CONNECTOR"],
+	teams: ["TEAMS_CONNECTOR"],
+	luma: ["LUMA_CONNECTOR"],
 };
diff --git a/surfsense_web/contracts/types/anonymous-chat.types.ts b/surfsense_web/contracts/types/anonymous-chat.types.ts
new file mode 100644
index 0000000000..864810d8ed
--- /dev/null
+++ b/surfsense_web/contracts/types/anonymous-chat.types.ts
@@ -0,0 +1,49 @@
+import { z } from "zod";
+
+export const anonModel = z.object({
+	id: z.number(),
+	name: z.string(),
+	description: z.string().nullable().optional(),
+	provider: z.string(),
+	model_name: z.string(),
+	billing_tier: z.string().default("free"),
+	is_premium: z.boolean().default(false),
+	seo_slug: z.string().nullable().optional(),
+	seo_enabled: z.boolean().default(false),
+	seo_title: z.string().nullable().optional(),
+	seo_description: z.string().nullable().optional(),
+	quota_reserve_tokens: z.number().nullable().optional(),
+});
+
+export const getAnonModelsResponse = z.array(anonModel);
+
+export const getAnonModelResponse = anonModel;
+
+export const anonQuotaResponse = z.object({
+	used: z.number(),
+	limit: z.number(),
+	remaining: z.number(),
+	status: z.string(),
+	warning_threshold: z.number(),
+	captcha_required: z.boolean().default(false),
+});
+
+export const anonChatRequest = z.object({
+	model_slug: z.string().max(100),
+	messages: z
+		.array(
+			z.object({
+				role: z.enum(["system", "user", "assistant"]),
+				content: z.string(),
+			})
+		)
+		.min(1),
+	disabled_tools: z.array(z.string()).optional(),
+	turnstile_token: z.string().optional(),
+});
+
+export type AnonModel = z.infer<typeof anonModel>;
+export type GetAnonModelsResponse = z.infer<typeof getAnonModelsResponse>;
+export type GetAnonModelResponse = z.infer<typeof getAnonModelResponse>;
+export type AnonQuotaResponse = z.infer<typeof anonQuotaResponse>;
+export type AnonChatRequest = z.infer<typeof anonChatRequest>;
diff --git a/surfsense_web/contracts/types/chat-messages.types.ts b/surfsense_web/contracts/types/chat-messages.types.ts
index 0859f9f3b8..ef16bb3666 100644
--- a/surfsense_web/contracts/types/chat-messages.types.ts
+++ b/surfsense_web/contracts/types/chat-messages.types.ts
@@ -1,7 +1,13 @@
 import { z } from "zod";
 
 /**
- * Raw message from database (real-time sync)
+ * Raw message from database (real-time sync).
+ *
+ * ``turn_id`` is included so consumers (e.g. ``convertToThreadMessage``)
+ * can populate ``metadata.custom.chatTurnId`` on the
+ * ``ThreadMessageLike`` even after the live-collab Zero re-sync. The
+ * inline Revert button's ``(chat_turn_id, tool_name, position)``
+ * fallback in tool-fallback.tsx depends on it.
  */
 export const rawMessage = z.object({
 	id: z.number(),
@@ -10,6 +16,7 @@ export const rawMessage = z.object({
 	content: z.unknown(),
 	author_id: z.string().nullable(),
 	created_at: z.string(),
+	turn_id: z.string().nullable().optional(),
 });
 
 export type RawMessage = z.infer<typeof rawMessage>;
diff --git a/surfsense_web/contracts/types/connector.types.ts b/surfsense_web/contracts/types/connector.types.ts
index b83e05dccb..61d5ffc94a 100644
--- a/surfsense_web/contracts/types/connector.types.ts
+++ b/surfsense_web/contracts/types/connector.types.ts
@@ -44,6 +44,7 @@ export const searchSourceConnector = z.object({
 	last_indexed_at: z.string().nullable(),
 	config: z.record(z.string(), z.any()),
 	enable_summary: z.boolean().default(false),
+	enable_vision_llm: z.boolean().default(false),
 	periodic_indexing_enabled: z.boolean(),
 	indexing_frequency_minutes: z.number().nullable(),
 	next_scheduled_at: z.string().nullable(),
@@ -98,6 +99,7 @@ export const createConnectorRequest = z.object({
 		last_indexed_at: true,
 		config: true,
 		enable_summary: true,
+		enable_vision_llm: true,
 		periodic_indexing_enabled: true,
 		indexing_frequency_minutes: true,
 		next_scheduled_at: true,
@@ -123,6 +125,7 @@ export const updateConnectorRequest = z.object({
 			last_indexed_at: true,
 			config: true,
 			enable_summary: true,
+			enable_vision_llm: true,
 			periodic_indexing_enabled: true,
 			indexing_frequency_minutes: true,
 			next_scheduled_at: true,
diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts
index a8a1a11d43..7b87845683 100644
--- a/surfsense_web/contracts/types/document.types.ts
+++ b/surfsense_web/contracts/types/document.types.ts
@@ -88,6 +88,7 @@ export const surfsenseDocsDocument = z.object({
 	id: z.number(),
 	title: z.string(),
 	source: z.string(),
+	public_url: z.string(),
 	content: z.string(),
 });
 
@@ -144,10 +145,14 @@ export const createDocumentResponse = z.object({
 /**
  * Upload documents
  */
+export const processingModeEnum = z.enum(["basic", "premium"]);
+
 export const uploadDocumentRequest = z.object({
 	files: z.array(z.instanceof(File)),
 	search_space_id: z.number(),
 	should_summarize: z.boolean().default(false),
+	use_vision_llm: z.boolean().default(false),
+	processing_mode: processingModeEnum.default("basic"),
 });
 
 export const uploadDocumentResponse = z.object({
@@ -361,3 +366,4 @@ export type GetSurfsenseDocsResponse = z.infer<typeof getSurfsenseDocsResponse>;
 export type GetDocumentChunksRequest = z.infer<typeof getDocumentChunksRequest>;
 export type GetDocumentChunksResponse = z.infer<typeof getDocumentChunksResponse>;
 export type ChunkRead = z.infer<typeof chunkRead>;
+export type ProcessingMode = z.infer<typeof processingModeEnum>;
diff --git a/surfsense_web/contracts/types/log.types.ts b/surfsense_web/contracts/types/log.types.ts
index eb9f7fe6c1..95aa33319e 100644
--- a/surfsense_web/contracts/types/log.types.ts
+++ b/surfsense_web/contracts/types/log.types.ts
@@ -117,6 +117,8 @@ export const getLogSummaryResponse = logSummary;
 export type Log = z.infer<typeof log>;
 export type LogLevelEnum = z.infer<typeof logLevelEnum>;
 export type LogStatusEnum = z.infer<typeof logStatusEnum>;
+export type LogLevel = LogLevelEnum;
+export type LogStatus = LogStatusEnum;
 export type LogFilters = z.infer<typeof logFilters>;
 export type CreateLogRequest = z.infer<typeof createLogRequest>;
 export type CreateLogResponse = z.infer<typeof createLogResponse>;
diff --git a/surfsense_web/contracts/types/new-llm-config.types.ts b/surfsense_web/contracts/types/new-llm-config.types.ts
index 6bef94bac5..b52b98ae48 100644
--- a/surfsense_web/contracts/types/new-llm-config.types.ts
+++ b/surfsense_web/contracts/types/new-llm-config.types.ts
@@ -65,6 +65,13 @@ export const newLLMConfig = z.object({
 	created_at: z.string(),
 	search_space_id: z.number(),
 	user_id: z.string(),
+
+	// Capability flag — derived server-side at the route boundary from
+	// LiteLLM's authoritative model map. There is no DB column. Default
+	// `true` is the conservative-allow stance for unknown / unmapped
+	// BYOK rows; the streaming-task safety net is the only place a
+	// `false` actually blocks a request.
+	supports_image_input: z.boolean().default(true),
 });
 
 /**
@@ -74,11 +81,16 @@ export const newLLMConfigPublic = newLLMConfig.omit({ api_key: true });
 
 /**
  * Create NewLLMConfig
+ *
+ * `supports_image_input` is omitted because it is derived server-side
+ * from LiteLLM's model map at read time — there is no DB column to
+ * persist a client-supplied value into.
  */
 export const createNewLLMConfigRequest = newLLMConfig.omit({
 	id: true,
 	created_at: true,
 	user_id: true,
+	supports_image_input: true,
 });
 
 export const createNewLLMConfigResponse = newLLMConfig;
@@ -114,6 +126,8 @@ export const updateNewLLMConfigRequest = z.object({
 			created_at: true,
 			search_space_id: true,
 			user_id: true,
+			// Derived server-side; not part of the writable surface.
+			supports_image_input: true,
 		})
 		.partial(),
 });
@@ -162,6 +176,26 @@ export const globalNewLLMConfig = z.object({
 
 	is_global: z.literal(true),
 	is_auto_mode: z.boolean().optional().default(false), // True only for Auto mode (ID 0)
+
+	// Token quota and billing policy
+	billing_tier: z.string().default("free"),
+	is_premium: z.boolean().default(false),
+	anonymous_enabled: z.boolean().default(false),
+	seo_enabled: z.boolean().default(false),
+	seo_slug: z.string().nullable().optional(),
+	seo_title: z.string().nullable().optional(),
+	seo_description: z.string().nullable().optional(),
+	quota_reserve_tokens: z.number().nullable().optional(),
+	// Capability flag — true when the model can accept image inputs.
+	// Resolved server-side (OpenRouter dynamic configs use the OR
+	// `architecture.input_modalities` field; YAML / BYOK use LiteLLM's
+	// authoritative `supports_vision` map). The chat selector renders
+	// an amber "No image" hint when this is false and there are
+	// pending image attachments, but does not block selection — the
+	// backend safety net only rejects when LiteLLM *explicitly* marks
+	// the model as text-only, so unknown / new models still flow
+	// through. Default `true` matches that conservative-allow stance.
+	supports_image_input: z.boolean().default(true),
 });
 
 export const getGlobalNewLLMConfigsResponse = z.array(globalNewLLMConfig);
@@ -248,6 +282,11 @@ export const globalImageGenConfig = z.object({
 	litellm_params: z.record(z.string(), z.any()).nullable().optional(),
 	is_global: z.literal(true),
 	is_auto_mode: z.boolean().optional().default(false),
+	billing_tier: z.string().default("free"),
+	// Mirrors `globalNewLLMConfig.is_premium` so the new-chat selector's
+	// Free/Premium badge logic lights up automatically for image-gen too.
+	is_premium: z.boolean().default(false),
+	quota_reserve_micros: z.number().nullable().optional(),
 });
 
 export const getGlobalImageGenConfigsResponse = z.array(globalImageGenConfig);
@@ -328,6 +367,13 @@ export const globalVisionLLMConfig = z.object({
 	litellm_params: z.record(z.string(), z.any()).nullable().optional(),
 	is_global: z.literal(true),
 	is_auto_mode: z.boolean().optional().default(false),
+	billing_tier: z.string().default("free"),
+	// Mirrors `globalNewLLMConfig.is_premium` so the new-chat selector's
+	// Free/Premium badge logic lights up automatically for vision too.
+	is_premium: z.boolean().default(false),
+	quota_reserve_tokens: z.number().nullable().optional(),
+	input_cost_per_token: z.number().nullable().optional(),
+	output_cost_per_token: z.number().nullable().optional(),
 });
 
 export const getGlobalVisionLLMConfigsResponse = z.array(globalVisionLLMConfig);
diff --git a/surfsense_web/contracts/types/search-space.types.ts b/surfsense_web/contracts/types/search-space.types.ts
index 8a0a2fb4cd..7449f82b16 100644
--- a/surfsense_web/contracts/types/search-space.types.ts
+++ b/surfsense_web/contracts/types/search-space.types.ts
@@ -9,6 +9,8 @@ export const searchSpace = z.object({
 	user_id: z.string(),
 	citations_enabled: z.boolean(),
 	qna_custom_instructions: z.string().nullable(),
+	shared_memory_md: z.string().nullable().optional(),
+	ai_file_sort_enabled: z.boolean().optional().default(false),
 	member_count: z.number(),
 	is_owner: z.boolean(),
 });
@@ -54,6 +56,8 @@ export const updateSearchSpaceRequest = z.object({
 			description: true,
 			citations_enabled: true,
 			qna_custom_instructions: true,
+			shared_memory_md: true,
+			ai_file_sort_enabled: true,
 		})
 		.partial(),
 });
diff --git a/surfsense_web/contracts/types/stripe.types.ts b/surfsense_web/contracts/types/stripe.types.ts
index c7a6bf387c..35ec0cb17f 100644
--- a/surfsense_web/contracts/types/stripe.types.ts
+++ b/surfsense_web/contracts/types/stripe.types.ts
@@ -32,9 +32,70 @@ export const getPagePurchasesResponse = z.object({
 	purchases: z.array(pagePurchase),
 });
 
+// Premium credit purchases
+export const createTokenCheckoutSessionRequest = z.object({
+	quantity: z.number().int().min(1).max(100),
+	search_space_id: z.number().int().min(1),
+});
+
+export const createTokenCheckoutSessionResponse = z.object({
+	checkout_url: z.string(),
+});
+
+// Premium credit balance + purchase records.
+//
+// The unit is integer micro-USD (1_000_000 == $1.00). The schema names
+// kept the ``Token`` prefix for API back-compat with pinned clients;
+// the field names below are authoritative.
+export const tokenStripeStatusResponse = z.object({
+	token_buying_enabled: z.boolean(),
+	premium_credit_micros_used: z.number().default(0),
+	premium_credit_micros_limit: z.number().default(0),
+	premium_credit_micros_remaining: z.number().default(0),
+});
+
+export const tokenPurchaseStatusEnum = pagePurchaseStatusEnum;
+
+export const tokenPurchase = z.object({
+	id: z.uuid(),
+	stripe_checkout_session_id: z.string(),
+	stripe_payment_intent_id: z.string().nullable(),
+	quantity: z.number(),
+	credit_micros_granted: z.number(),
+	amount_total: z.number().nullable(),
+	currency: z.string().nullable(),
+	status: tokenPurchaseStatusEnum,
+	completed_at: z.string().nullable(),
+	created_at: z.string(),
+});
+
+export const getTokenPurchasesResponse = z.object({
+	purchases: z.array(tokenPurchase),
+});
+
+// Response from /stripe/finalize-checkout. Either page or token fields
+// are populated depending on purchase_type.
+export const finalizeCheckoutResponse = z.object({
+	purchase_type: z.enum(["page_packs", "premium_tokens"]),
+	status: pagePurchaseStatusEnum,
+	pages_limit: z.number().nullable().optional(),
+	pages_used: z.number().nullable().optional(),
+	pages_granted: z.number().nullable().optional(),
+	premium_credit_micros_limit: z.number().nullable().optional(),
+	premium_credit_micros_used: z.number().nullable().optional(),
+	premium_credit_micros_granted: z.number().nullable().optional(),
+});
+
 export type PagePurchaseStatus = z.infer<typeof pagePurchaseStatusEnum>;
 export type CreateCheckoutSessionRequest = z.infer<typeof createCheckoutSessionRequest>;
 export type CreateCheckoutSessionResponse = z.infer<typeof createCheckoutSessionResponse>;
 export type StripeStatusResponse = z.infer<typeof stripeStatusResponse>;
 export type PagePurchase = z.infer<typeof pagePurchase>;
 export type GetPagePurchasesResponse = z.infer<typeof getPagePurchasesResponse>;
+export type CreateTokenCheckoutSessionRequest = z.infer<typeof createTokenCheckoutSessionRequest>;
+export type CreateTokenCheckoutSessionResponse = z.infer<typeof createTokenCheckoutSessionResponse>;
+export type TokenStripeStatusResponse = z.infer<typeof tokenStripeStatusResponse>;
+export type TokenPurchaseStatus = z.infer<typeof tokenPurchaseStatusEnum>;
+export type TokenPurchase = z.infer<typeof tokenPurchase>;
+export type GetTokenPurchasesResponse = z.infer<typeof getTokenPurchasesResponse>;
+export type FinalizeCheckoutResponse = z.infer<typeof finalizeCheckoutResponse>;
diff --git a/surfsense_web/features/chat-messages/hitl/approval-cards/doom-loop-approval.tsx b/surfsense_web/features/chat-messages/hitl/approval-cards/doom-loop-approval.tsx
new file mode 100644
index 0000000000..1b7cf645d5
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/approval-cards/doom-loop-approval.tsx
@@ -0,0 +1,191 @@
+"use client";
+
+import { CornerDownLeftIcon, OctagonAlert } from "lucide-react";
+import { useCallback, useEffect, useMemo } from "react";
+import { TextShimmerLoader } from "@/components/prompt-kit/loader";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Separator } from "@/components/ui/separator";
+import type { HitlDecision, InterruptResult, PerToolApprovalCard } from "../types";
+import { isInterruptResult } from "../types";
+import { useHitlDecision } from "../use-hitl-decision";
+import { useHitlPhase } from "../use-hitl-phase";
+
+/**
+ * Specialized HITL card for ``DoomLoopMiddleware`` interrupts. The
+ * backend signals these by setting ``context.permission === "doom_loop"``
+ * on the ``permission_ask`` interrupt.
+ *
+ * The card replaces the generic "approve/reject" framing with a
+ * "continue/stop" affordance that better matches the user's mental
+ * model: the agent is stuck repeating itself, not asking permission
+ * for a destructive action.
+ */
+function DoomLoopCardView({
+	toolName,
+	args,
+	interruptData,
+	onDecision,
+}: {
+	toolName: string;
+	args: Record<string, unknown>;
+	interruptData: InterruptResult;
+	onDecision: (decision: HitlDecision) => void;
+}) {
+	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
+
+	const context = (interruptData.context ?? {}) as Record<string, unknown>;
+	const threshold = typeof context.threshold === "number" ? context.threshold : 3;
+	const stuckTool = (typeof context.tool === "string" && context.tool) || toolName;
+	const recentSignatures = Array.isArray(context.recent_signatures)
+		? (context.recent_signatures as string[])
+		: [];
+	const displayName = stuckTool.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
+
+	const argPreview = useMemo(() => {
+		if (!args || Object.keys(args).length === 0) return null;
+		try {
+			const json = JSON.stringify(args, null, 2);
+			return json.length > 600 ? `${json.slice(0, 600)}…` : json;
+		} catch {
+			return null;
+		}
+	}, [args]);
+
+	const handleContinue = useCallback(() => {
+		if (phase !== "pending") return;
+		setProcessing();
+		onDecision({ type: "approve" });
+	}, [phase, setProcessing, onDecision]);
+
+	const handleStop = useCallback(() => {
+		if (phase !== "pending") return;
+		setRejected();
+		onDecision({ type: "reject", message: "Doom loop: user requested stop." });
+	}, [phase, setRejected, onDecision]);
+
+	useEffect(() => {
+		const handler = (e: KeyboardEvent) => {
+			if (phase !== "pending") return;
+			if (e.key === "Enter" && !e.shiftKey && !e.ctrlKey && !e.metaKey) {
+				e.preventDefault();
+				handleStop();
+			}
+		};
+		window.addEventListener("keydown", handler);
+		return () => window.removeEventListener("keydown", handler);
+	}, [phase, handleStop]);
+
+	const isResolved = phase === "complete" || phase === "rejected";
+
+	return (
+		<Alert variant={phase === "rejected" ? "default" : "destructive"} className="my-4 max-w-lg">
+			<OctagonAlert className="size-4" />
+			<AlertTitle className="flex items-center gap-2">
+				<span>
+					{phase === "rejected"
+						? "Stopped"
+						: phase === "processing"
+							? "Continuing…"
+							: phase === "complete"
+								? "Continued"
+								: "I might be stuck"}
+				</span>
+				{!isResolved && (
+					<Badge variant="outline" className="font-mono text-[10px]">
+						doom-loop
+					</Badge>
+				)}
+			</AlertTitle>
+			<AlertDescription className="flex flex-col gap-3">
+				{phase === "processing" ? (
+					<TextShimmerLoader text="Resuming…" size="sm" />
+				) : phase === "rejected" ? (
+					<p className="text-xs">
+						I stopped retrying <span className="font-medium">{displayName}</span> as you asked.
+					</p>
+				) : phase === "complete" ? (
+					<p className="text-xs">
+						Continuing to call <span className="font-medium">{displayName}</span> as you asked.
+					</p>
+				) : (
+					<p className="text-xs">
+						I called <span className="font-medium">{displayName}</span> {threshold} times in a row
+						with similar arguments. Should I keep going or stop and rethink?
+					</p>
+				)}
+
+				{argPreview && phase === "pending" && (
+					<>
+						<Separator />
+						<div className="flex flex-col gap-1">
+							<p className="text-[10px] font-medium uppercase tracking-wide text-muted-foreground">
+								Last arguments
+							</p>
+							<pre className="max-h-32 overflow-auto rounded-md bg-muted/50 p-2 text-[11px] text-foreground/80">
+								{argPreview}
+							</pre>
+						</div>
+					</>
+				)}
+
+				{recentSignatures.length > 0 && phase === "pending" && (
+					<details className="text-[11px] text-muted-foreground">
+						<summary className="cursor-pointer select-none">
+							Show repeated signatures ({recentSignatures.length})
+						</summary>
+						<ul className="mt-1 ml-4 list-disc">
+							{recentSignatures.map((sig) => (
+								<li key={sig} className="font-mono break-all">
+									{sig}
+								</li>
+							))}
+						</ul>
+					</details>
+				)}
+
+				{phase === "pending" && (
+					<div className="flex items-center gap-2">
+						<Button size="sm" variant="outline" className="rounded-lg gap-1.5" onClick={handleStop}>
+							Stop and rethink
+							<CornerDownLeftIcon className="size-3 opacity-60" />
+						</Button>
+						<Button size="sm" variant="ghost" onClick={handleContinue}>
+							Continue anyway
+						</Button>
+					</div>
+				)}
+			</AlertDescription>
+		</Alert>
+	);
+}
+
+/**
+ * Discriminator: returns true iff the result is a ``permission_ask``
+ * interrupt with ``context.permission === "doom_loop"``. The fallback
+ * uses this BEFORE mounting an approval card to choose between
+ * ``DoomLoopApproval`` and ``GenericHitlApproval``.
+ */
+export function isDoomLoopInterrupt(result: unknown): boolean {
+	if (!isInterruptResult(result)) return false;
+	const ctx = (result.context ?? {}) as Record<string, unknown>;
+	return ctx.permission === "doom_loop";
+}
+
+/**
+ * Specialized doom-loop approval mounted by ``FallbackToolBody`` when
+ * ``isDoomLoopInterrupt(result)`` is true. Caller is responsible for
+ * the discrimination; this card receives a known ``InterruptResult``.
+ */
+export const DoomLoopApproval: PerToolApprovalCard = ({ toolName, args, result }) => {
+	const { dispatch } = useHitlDecision();
+	return (
+		<DoomLoopCardView
+			toolName={toolName}
+			args={args}
+			interruptData={result}
+			onDecision={(decision) => dispatch([decision])}
+		/>
+	);
+};
diff --git a/surfsense_web/features/chat-messages/hitl/approval-cards/generic-approval.tsx b/surfsense_web/features/chat-messages/hitl/approval-cards/generic-approval.tsx
new file mode 100644
index 0000000000..a5be7eef17
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/approval-cards/generic-approval.tsx
@@ -0,0 +1,253 @@
+"use client";
+
+import { CornerDownLeftIcon, Pencil } from "lucide-react";
+import { useCallback, useEffect, useMemo, useState } from "react";
+import { TextShimmerLoader } from "@/components/prompt-kit/loader";
+import { Button } from "@/components/ui/button";
+import { Input } from "@/components/ui/input";
+import { Textarea } from "@/components/ui/textarea";
+import { getToolDisplayName } from "@/contracts/enums/toolIcons";
+import type { HitlDecision, InterruptResult, PerToolApprovalCard } from "../types";
+import { useHitlDecision } from "../use-hitl-decision";
+import { useHitlPhase } from "../use-hitl-phase";
+
+function ParamEditor({
+	params,
+	onChange,
+	disabled,
+}: {
+	params: Record<string, unknown>;
+	onChange: (updated: Record<string, unknown>) => void;
+	disabled: boolean;
+}) {
+	const entries = Object.entries(params);
+	if (entries.length === 0) return null;
+
+	return (
+		<div className="space-y-2">
+			{entries.map(([key, value]) => {
+				const strValue = value == null ? "" : String(value);
+				const isLong = strValue.length > 120;
+				const fieldId = `hitl-param-${key}`;
+
+				return (
+					<div key={key} className="space-y-1">
+						<label htmlFor={fieldId} className="text-xs font-medium text-muted-foreground">
+							{key}
+						</label>
+						{isLong ? (
+							<Textarea
+								id={fieldId}
+								value={strValue}
+								disabled={disabled}
+								rows={3}
+								onChange={(e) => onChange({ ...params, [key]: e.target.value })}
+								className="text-xs"
+							/>
+						) : (
+							<Input
+								id={fieldId}
+								value={strValue}
+								disabled={disabled}
+								onChange={(e) => onChange({ ...params, [key]: e.target.value })}
+								className="text-xs"
+							/>
+						)}
+					</div>
+				);
+			})}
+		</div>
+	);
+}
+
+function GenericApprovalCardView({
+	toolName,
+	args,
+	interruptData,
+	onDecision,
+}: {
+	toolName: string;
+	args: Record<string, unknown>;
+	interruptData: InterruptResult;
+	onDecision: (decision: HitlDecision) => void;
+}) {
+	const { phase, setProcessing, setRejected } = useHitlPhase(interruptData);
+	const [editedParams, setEditedParams] = useState<Record<string, unknown>>(args);
+	const [isEditing, setIsEditing] = useState(false);
+
+	const displayName = getToolDisplayName(toolName);
+
+	const mcpServer = interruptData.context?.mcp_server as string | undefined;
+	const toolDescription = interruptData.context?.tool_description as string | undefined;
+
+	const reviewConfig = interruptData.review_configs?.[0];
+	const allowedDecisions = reviewConfig?.allowed_decisions ?? ["approve", "reject"];
+	const canEdit = allowedDecisions.includes("edit");
+	const canApproveAlways = allowedDecisions.includes("approve_always");
+
+	const hasChanged = useMemo(() => {
+		return JSON.stringify(editedParams) !== JSON.stringify(args);
+	}, [editedParams, args]);
+
+	const handleApprove = useCallback(() => {
+		if (phase !== "pending") return;
+		const isEdited = isEditing && hasChanged;
+		setProcessing();
+		onDecision({
+			type: isEdited ? "edit" : "approve",
+			edited_action: isEdited
+				? { name: interruptData.action_requests[0]?.name ?? toolName, args: editedParams }
+				: undefined,
+		});
+	}, [
+		phase,
+		setProcessing,
+		isEditing,
+		hasChanged,
+		onDecision,
+		interruptData,
+		toolName,
+		editedParams,
+	]);
+
+	const handleApproveAlways = useCallback(() => {
+		if (phase !== "pending" || !canApproveAlways) return;
+		setProcessing();
+		onDecision({ type: "approve_always" });
+	}, [phase, setProcessing, onDecision, canApproveAlways]);
+
+	useEffect(() => {
+		const handler = (e: KeyboardEvent) => {
+			if (e.key === "Enter" && !e.shiftKey && !e.ctrlKey && !e.metaKey && phase === "pending") {
+				handleApprove();
+			}
+		};
+		window.addEventListener("keydown", handler);
+		return () => window.removeEventListener("keydown", handler);
+	}, [handleApprove, phase]);
+
+	return (
+		<div className="my-4 max-w-lg overflow-hidden rounded-2xl border bg-muted/30 transition-[box-shadow] duration-300">
+			<div className="flex items-start justify-between px-5 pt-5 pb-4 select-none">
+				<div>
+					<p className="text-sm font-semibold text-foreground">
+						{phase === "rejected"
+							? `${displayName} — Rejected`
+							: phase === "processing" || phase === "complete"
+								? `${displayName} — Approved`
+								: displayName}
+					</p>
+					{phase === "processing" ? (
+						<TextShimmerLoader text="Executing..." size="sm" />
+					) : phase === "complete" ? (
+						<p className="text-xs text-muted-foreground mt-0.5">Action completed</p>
+					) : phase === "rejected" ? (
+						<p className="text-xs text-muted-foreground mt-0.5">Action was cancelled</p>
+					) : (
+						<p className="text-xs text-muted-foreground mt-0.5">
+							Requires your approval to proceed
+						</p>
+					)}
+					{mcpServer && (
+						<p className="text-[10px] text-muted-foreground/70 mt-1">
+							via <span className="font-medium">{mcpServer}</span>
+						</p>
+					)}
+				</div>
+				{phase === "pending" && canEdit && !isEditing && (
+					<Button
+						size="sm"
+						variant="ghost"
+						className="rounded-lg text-muted-foreground -mt-1 -mr-2"
+						onClick={() => setIsEditing(true)}
+					>
+						<Pencil className="size-3.5" />
+						Edit
+					</Button>
+				)}
+			</div>
+
+			{toolDescription && phase === "pending" && (
+				<>
+					<div className="mx-5 h-px bg-border/50" />
+					<div className="px-5 py-3">
+						<p className="text-xs text-muted-foreground">{toolDescription}</p>
+					</div>
+				</>
+			)}
+
+			{Object.keys(args).length > 0 && (
+				<>
+					<div className="mx-5 h-px bg-border/50" />
+					<div className="px-5 py-4 space-y-2">
+						<p className="text-xs font-medium text-muted-foreground">Inputs</p>
+						{phase === "pending" && isEditing ? (
+							<ParamEditor
+								params={editedParams}
+								onChange={setEditedParams}
+								disabled={phase !== "pending"}
+							/>
+						) : (
+							<pre className="text-xs text-foreground/80 whitespace-pre-wrap break-all bg-muted/50 rounded-lg p-3">
+								{JSON.stringify(args, null, 2)}
+							</pre>
+						)}
+					</div>
+				</>
+			)}
+
+			{phase === "pending" && (
+				<>
+					<div className="mx-5 h-px bg-border/50" />
+					<div className="px-5 py-4 flex items-center gap-2 select-none">
+						{allowedDecisions.includes("approve") && (
+							<Button size="sm" className="rounded-lg gap-1.5" onClick={handleApprove}>
+								{isEditing && hasChanged ? "Approve with edits" : "Approve"}
+								<CornerDownLeftIcon className="size-3 opacity-60" />
+							</Button>
+						)}
+						{canApproveAlways && (
+							<Button size="sm" className="rounded-lg" onClick={handleApproveAlways}>
+								Always Allow
+							</Button>
+						)}
+						{allowedDecisions.includes("reject") && (
+							<Button
+								size="sm"
+								variant="ghost"
+								className="rounded-lg text-muted-foreground"
+								onClick={() => {
+									setRejected();
+									onDecision({ type: "reject", message: "User rejected the action." });
+								}}
+							>
+								Reject
+							</Button>
+						)}
+					</div>
+				</>
+			)}
+		</div>
+	);
+}
+
+/**
+ * Default approval card mounted by ``FallbackToolBody`` for unknown HITL
+ * tools. Per-tool integrations may also import and compose this card on
+ * top of their own framing.
+ *
+ * Caller (the fallback) is responsible for the ``isInterruptResult``
+ * guard; this card receives a known ``InterruptResult`` and skips the
+ * defensive runtime check.
+ */
+export const GenericHitlApproval: PerToolApprovalCard = ({ toolName, args, result }) => {
+	const { dispatch } = useHitlDecision();
+	return (
+		<GenericApprovalCardView
+			toolName={toolName}
+			args={args}
+			interruptData={result}
+			onDecision={(decision) => dispatch([decision])}
+		/>
+	);
+};
diff --git a/surfsense_web/features/chat-messages/hitl/approval-cards/index.ts b/surfsense_web/features/chat-messages/hitl/approval-cards/index.ts
new file mode 100644
index 0000000000..bb85ccccbd
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/approval-cards/index.ts
@@ -0,0 +1,2 @@
+export { DoomLoopApproval, isDoomLoopInterrupt } from "./doom-loop-approval";
+export { GenericHitlApproval } from "./generic-approval";
diff --git a/surfsense_web/features/chat-messages/hitl/approval/approval-context.tsx b/surfsense_web/features/chat-messages/hitl/approval/approval-context.tsx
new file mode 100644
index 0000000000..c407639684
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/approval/approval-context.tsx
@@ -0,0 +1,28 @@
+"use client";
+
+import { createContext, useContext } from "react";
+import type { HitlDecision } from "../types";
+
+/**
+ * Decisions are keyed by step index (not toolCallId) because the
+ * resume protocol is positional — backend pairs ``decisions[i]`` with
+ * ``action_requests[i]``. ``stage`` always targets the active step,
+ * so per-tool bodies stay tcId-agnostic.
+ */
+export interface HitlApprovalAPI {
+	total: number;
+	currentStep: number;
+	decisions: ReadonlyArray<HitlDecision | undefined>;
+	stage: (decision: HitlDecision) => void;
+	next: () => void;
+	prev: () => void;
+	goToStep: (i: number) => void;
+	canAdvance: boolean;
+	canSubmit: boolean;
+}
+
+export const HitlApprovalContext = createContext<HitlApprovalAPI | null>(null);
+
+export function useHitlApproval(): HitlApprovalAPI | null {
+	return useContext(HitlApprovalContext);
+}
diff --git a/surfsense_web/features/chat-messages/hitl/approval/hitl-approval-card.tsx b/surfsense_web/features/chat-messages/hitl/approval/hitl-approval-card.tsx
new file mode 100644
index 0000000000..8aa8c74993
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/approval/hitl-approval-card.tsx
@@ -0,0 +1,267 @@
+"use client";
+
+import { ChevronLeftIcon, ChevronRightIcon } from "lucide-react";
+import { type FC, useCallback, useMemo, useState } from "react";
+import { Button } from "@/components/ui/button";
+import { getToolDisplayName } from "@/contracts/enums/toolIcons";
+import {
+	FallbackToolBody,
+	getToolComponent,
+	type TimelineToolProps,
+} from "@/features/chat-messages/timeline/tool-registry";
+import type {
+	HitlDecision,
+	InterruptActionRequest,
+	InterruptResult,
+	InterruptReviewConfig,
+} from "../types";
+import { type HitlApprovalAPI, HitlApprovalContext } from "./approval-context";
+import type { PendingInterruptState } from "./pending-interrupt-context";
+
+/**
+ * Narrow the bundle interrupt to the active step so per-tool bodies
+ * see the same single-action shape they're written against. Mirrors
+ * any staged decision onto ``__decided__`` (and edited args onto
+ * ``args``) so revisiting a decided step via Prev shows the past
+ * choice instead of pristine Approve/Reject buttons.
+ */
+function sliceForStep(
+	interruptData: Record<string, unknown>,
+	action: InterruptActionRequest,
+	reviewConfig: InterruptReviewConfig | undefined,
+	stagedDecision: HitlDecision | undefined
+): InterruptResult {
+	const baseAction =
+		stagedDecision?.type === "edit" && stagedDecision.edited_action
+			? { ...action, args: stagedDecision.edited_action.args }
+			: action;
+
+	const sliced: InterruptResult = {
+		...(interruptData as Partial<InterruptResult>),
+		__interrupt__: true,
+		action_requests: [baseAction],
+		review_configs: reviewConfig ? [reviewConfig] : [],
+	} as InterruptResult;
+
+	if (stagedDecision) {
+		(sliced as unknown as Record<string, unknown>).__decided__ = stagedDecision.type;
+	}
+
+	return sliced;
+}
+
+/**
+ * Single chrome for every HITL approval flow. Branches on
+ * ``action_requests.length``: 1 → per-tool body alone with auto-
+ * submit on first decision; ≥2 → per-tool body + inline pager +
+ * Submit-decisions (fires only once every step has a decision).
+ * Decisions are positional to match the resume protocol.
+ */
+export const HitlApprovalCard: FC<{
+	pendingInterrupt: PendingInterruptState;
+	onSubmit: (decisions: HitlDecision[]) => void;
+}> = ({ pendingInterrupt, onSubmit }) => {
+	const interruptData = pendingInterrupt.interruptData as InterruptResult & Record<string, unknown>;
+	const actionRequests = (interruptData.action_requests ?? []) as InterruptActionRequest[];
+	const reviewConfigs = (interruptData.review_configs ?? []) as InterruptReviewConfig[];
+	const total = actionRequests.length;
+	const isMulti = total >= 2;
+
+	const [currentStep, setCurrentStep] = useState(0);
+	const [decisions, setDecisions] = useState<(HitlDecision | undefined)[]>(() =>
+		Array.from({ length: total }, () => undefined)
+	);
+
+	// Reset on a new interrupt-request while still mounted (rapid
+	// back-to-back resumes), otherwise stale decisions would leak.
+	const [prevActionsRef, setPrevActionsRef] = useState(actionRequests);
+	if (prevActionsRef !== actionRequests) {
+		setPrevActionsRef(actionRequests);
+		setCurrentStep(0);
+		setDecisions(Array.from({ length: total }, () => undefined));
+	}
+
+	const submitFromDecisions = useCallback(
+		(next: (HitlDecision | undefined)[]) => {
+			if (next.length !== total) return;
+			if (next.some((d) => d === undefined)) return;
+			onSubmit(next as HitlDecision[]);
+		},
+		[onSubmit, total]
+	);
+
+	const stage = useCallback(
+		(decision: HitlDecision) => {
+			// Compute next array outside the setter so the side effect
+			// (auto-submit / step advance) runs once under StrictMode.
+			const updated = decisions.slice();
+			updated[currentStep] = decision;
+			setDecisions(updated);
+
+			if (!isMulti) {
+				submitFromDecisions(updated);
+				return;
+			}
+
+			// Skip to the next undecided step rather than +1 so users
+			// who jumped via Prev don't get pulled back to a decided
+			// step.
+			let target = currentStep;
+			for (let i = currentStep + 1; i < updated.length; i++) {
+				if (updated[i] === undefined) {
+					target = i;
+					break;
+				}
+			}
+			if (target !== currentStep) setCurrentStep(target);
+		},
+		[currentStep, decisions, isMulti, submitFromDecisions]
+	);
+
+	const next = useCallback(() => {
+		setCurrentStep((s) => Math.min(s + 1, Math.max(0, total - 1)));
+	}, [total]);
+	const prev = useCallback(() => {
+		setCurrentStep((s) => Math.max(s - 1, 0));
+	}, []);
+	const goToStep = useCallback(
+		(i: number) => {
+			if (i < 0 || i >= total) return;
+			setCurrentStep(i);
+		},
+		[total]
+	);
+	const submit = useCallback(() => {
+		submitFromDecisions(decisions);
+	}, [decisions, submitFromDecisions]);
+
+	const stagedCount = useMemo(() => decisions.filter((d) => d !== undefined).length, [decisions]);
+	const canSubmit = stagedCount === total && total > 0;
+	const canAdvance = decisions[currentStep] !== undefined;
+
+	const api = useMemo<HitlApprovalAPI>(
+		() => ({
+			total,
+			currentStep,
+			decisions,
+			stage,
+			next,
+			prev,
+			goToStep,
+			canAdvance,
+			canSubmit,
+		}),
+		[total, currentStep, decisions, stage, next, prev, goToStep, canAdvance, canSubmit]
+	);
+
+	if (total === 0) return null;
+
+	const action = actionRequests[currentStep];
+	const reviewConfig = reviewConfigs[currentStep];
+	const stagedDecision = decisions[currentStep];
+	const sliced = sliceForStep(interruptData, action, reviewConfig, stagedDecision);
+
+	const Body = getToolComponent(action.name) ?? FallbackToolBody;
+	const bodyProps: TimelineToolProps = {
+		// Per-step key remounts the body on navigation so per-tool
+		// internal state (useHitlPhase, edit drafts) doesn't bleed
+		// between steps.
+		toolCallId: pendingInterrupt.bundleToolCallIds[currentStep] ?? `step-${currentStep}`,
+		toolName: action.name,
+		args: (sliced.action_requests[0]?.args ?? {}) as Record<string, unknown>,
+		argsText: undefined,
+		result: sliced,
+		langchainToolCallId: undefined,
+		status: stagedDecision ? "completed" : "running",
+	};
+
+	return (
+		<HitlApprovalContext.Provider value={api}>
+			<div className="space-y-2">
+				<div key={`approval-step-${currentStep}`}>
+					<Body {...bodyProps} />
+				</div>
+				{isMulti && (
+					<PagerBar
+						currentStep={currentStep}
+						total={total}
+						stagedCount={stagedCount}
+						canAdvance={canAdvance}
+						canSubmit={canSubmit}
+						actionName={action.name}
+						onPrev={prev}
+						onNext={next}
+						onSubmit={submit}
+					/>
+				)}
+			</div>
+		</HitlApprovalContext.Provider>
+	);
+};
+
+const PagerBar: FC<{
+	currentStep: number;
+	total: number;
+	stagedCount: number;
+	canAdvance: boolean;
+	canSubmit: boolean;
+	actionName: string;
+	onPrev: () => void;
+	onNext: () => void;
+	onSubmit: () => void;
+}> = ({
+	currentStep,
+	total,
+	stagedCount,
+	canAdvance,
+	canSubmit,
+	actionName,
+	onPrev,
+	onNext,
+	onSubmit,
+}) => (
+	<div className="flex items-center gap-2 rounded-md border border-border bg-muted/40 px-2 py-1.5 text-sm">
+		<Button
+			type="button"
+			size="sm"
+			variant="outline"
+			onClick={onPrev}
+			disabled={currentStep === 0}
+			aria-label="Previous approval"
+		>
+			<ChevronLeftIcon className="h-4 w-4" />
+		</Button>
+		<span className="font-medium tabular-nums">
+			{currentStep + 1} / {total}
+		</span>
+		<span className="text-muted-foreground">·</span>
+		<span className="text-muted-foreground">
+			{stagedCount} of {total} decided
+		</span>
+		<Button
+			type="button"
+			size="sm"
+			variant="outline"
+			onClick={onNext}
+			disabled={!canAdvance || currentStep >= total - 1}
+			aria-label="Next approval"
+			title={!canAdvance ? "Decide on this action first" : undefined}
+		>
+			<ChevronRightIcon className="h-4 w-4" />
+		</Button>
+		<span className="ml-2 truncate text-xs text-muted-foreground" title={actionName}>
+			{getToolDisplayName(actionName)}
+		</span>
+		<div className="ml-auto">
+			<Button
+				type="button"
+				size="sm"
+				onClick={onSubmit}
+				disabled={!canSubmit}
+				title={canSubmit ? "Submit decisions" : "Decide every action first"}
+			>
+				Submit decisions
+			</Button>
+		</div>
+	</div>
+);
diff --git a/surfsense_web/features/chat-messages/hitl/approval/index.ts b/surfsense_web/features/chat-messages/hitl/approval/index.ts
new file mode 100644
index 0000000000..cfe6ba0809
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/approval/index.ts
@@ -0,0 +1,9 @@
+export type { HitlApprovalAPI } from "./approval-context";
+export { HitlApprovalContext, useHitlApproval } from "./approval-context";
+export { HitlApprovalCard } from "./hitl-approval-card";
+export {
+	PendingInterruptProvider,
+	type PendingInterruptState,
+	type PendingInterruptValue,
+	usePendingInterrupt,
+} from "./pending-interrupt-context";
diff --git a/surfsense_web/features/chat-messages/hitl/approval/pending-interrupt-context.tsx b/surfsense_web/features/chat-messages/hitl/approval/pending-interrupt-context.tsx
new file mode 100644
index 0000000000..9e4153ff5c
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/approval/pending-interrupt-context.tsx
@@ -0,0 +1,57 @@
+"use client";
+
+import { createContext, type ReactNode, useContext } from "react";
+import type { HitlDecision } from "../types";
+
+/** One in-flight HITL interrupt (one paused subagent). */
+export interface PendingInterruptState {
+	/** Stable id keyed by the parent ``tool_call_id`` stamped on the interrupt. */
+	interruptId: string;
+	threadId: number;
+	assistantMsgId: string;
+	interruptData: Record<string, unknown>;
+	bundleToolCallIds: string[];
+}
+
+export interface PendingInterruptValue {
+	/**
+	 * Every paused subagent for the current turn, in the order the SSE stream
+	 * delivered them — which matches ``state.interrupts`` traversal on the
+	 * backend, which is the order ``slice_decisions_by_tool_call`` consumes.
+	 */
+	pendingInterrupts: PendingInterruptState[];
+	/**
+	 * Stage one card's decisions. The orchestrator (page-level) batches across
+	 * cards and dispatches the resume only once every pending interrupt has
+	 * submitted, so the backend slicer sees a single concatenated decisions
+	 * list whose total matches the parent state's pending action count.
+	 */
+	onSubmit: (interruptId: string, decisions: HitlDecision[]) => void;
+}
+
+const PendingInterruptContext = createContext<PendingInterruptValue | null>(null);
+
+/**
+ * Bridges page-level interrupt state to the Timeline, which is mounted
+ * by assistant-ui and can't be prop-drilled. Mount once at the chat
+ * page root.
+ */
+export function PendingInterruptProvider({
+	pendingInterrupts,
+	onSubmit,
+	children,
+}: {
+	pendingInterrupts: PendingInterruptState[];
+	onSubmit: (interruptId: string, decisions: HitlDecision[]) => void;
+	children: ReactNode;
+}) {
+	return (
+		<PendingInterruptContext.Provider value={{ pendingInterrupts, onSubmit }}>
+			{children}
+		</PendingInterruptContext.Provider>
+	);
+}
+
+export function usePendingInterrupt(): PendingInterruptValue | null {
+	return useContext(PendingInterruptContext);
+}
diff --git a/surfsense_web/atoms/chat/hitl-edit-panel.atom.ts b/surfsense_web/features/chat-messages/hitl/edit-panel/edit-panel.atom.ts
similarity index 100%
rename from surfsense_web/atoms/chat/hitl-edit-panel.atom.ts
rename to surfsense_web/features/chat-messages/hitl/edit-panel/edit-panel.atom.ts
diff --git a/surfsense_web/features/chat-messages/hitl/edit-panel/edit-panel.tsx b/surfsense_web/features/chat-messages/hitl/edit-panel/edit-panel.tsx
new file mode 100644
index 0000000000..98ee2a85ad
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/edit-panel/edit-panel.tsx
@@ -0,0 +1,203 @@
+"use client";
+
+import { useAtomValue, useSetAtom } from "jotai";
+import { XIcon } from "lucide-react";
+import dynamic from "next/dynamic";
+import { useCallback, useRef, useState } from "react";
+import { Button } from "@/components/ui/button";
+import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
+import { Skeleton } from "@/components/ui/skeleton";
+import { useMediaQuery } from "@/hooks/use-media-query";
+import { closeHitlEditPanelAtom, type ExtraField, hitlEditPanelAtom } from "./edit-panel.atom";
+import { ExtraFieldsSection } from "./fields";
+
+const PlateEditor = dynamic(
+	() => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })),
+	{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
+);
+
+/**
+ * The actual editable form. Controlled by atom data via the
+ * Desktop/Mobile shells below; isolated from layout so the same form
+ * renders identically in either container.
+ */
+export function HitlEditPanelContent({
+	title: initialTitle,
+	content: initialContent,
+	contentFormat,
+	extraFields,
+	onSave,
+	onClose,
+	showCloseButton = true,
+}: {
+	title: string;
+	content: string;
+	toolName: string;
+	contentFormat?: "markdown" | "html";
+	extraFields?: ExtraField[];
+	onSave: (title: string, content: string, extraFieldValues?: Record<string, string>) => void;
+	onClose?: () => void;
+	showCloseButton?: boolean;
+}) {
+	const [editedTitle, setEditedTitle] = useState(initialTitle);
+	const contentRef = useRef(initialContent);
+	const [isSaving, setIsSaving] = useState(false);
+	const [extraFieldValues, setExtraFieldValues] = useState<Record<string, string>>(() => {
+		if (!extraFields) return {};
+		const initial: Record<string, string> = {};
+		for (const field of extraFields) {
+			initial[field.key] = field.value;
+		}
+		return initial;
+	});
+
+	const handleContentChange = useCallback((content: string) => {
+		contentRef.current = content;
+	}, []);
+
+	const handleExtraFieldChange = useCallback((key: string, value: string) => {
+		setExtraFieldValues((prev) => ({ ...prev, [key]: value }));
+	}, []);
+
+	const handleSave = useCallback(() => {
+		if (!editedTitle.trim()) return;
+		setIsSaving(true);
+		const extras = extraFields && extraFields.length > 0 ? extraFieldValues : undefined;
+		onSave(editedTitle, contentRef.current, extras);
+		onClose?.();
+	}, [editedTitle, onSave, onClose, extraFields, extraFieldValues]);
+
+	return (
+		<>
+			<div className="flex items-center gap-2 px-4 py-2 shrink-0 border-b">
+				<input
+					value={editedTitle}
+					onChange={(e) => setEditedTitle(e.target.value)}
+					placeholder="Untitled"
+					className="flex-1 min-w-0 bg-transparent text-sm font-semibold text-foreground outline-none placeholder:text-muted-foreground"
+					aria-label="Page title"
+				/>
+				{onClose && showCloseButton && (
+					<Button variant="ghost" size="icon" onClick={onClose} className="size-7 shrink-0">
+						<XIcon className="size-4" />
+						<span className="sr-only">Close panel</span>
+					</Button>
+				)}
+			</div>
+
+			{extraFields && extraFields.length > 0 && (
+				<ExtraFieldsSection
+					fields={extraFields}
+					values={extraFieldValues}
+					onFieldChange={handleExtraFieldChange}
+				/>
+			)}
+
+			<div className="flex-1 overflow-hidden">
+				<PlateEditor
+					{...(contentFormat === "html"
+						? { html: initialContent, onHtmlChange: handleContentChange }
+						: { markdown: initialContent, onMarkdownChange: handleContentChange })}
+					readOnly={false}
+					preset="full"
+					placeholder="Start writing..."
+					editorVariant="default"
+					defaultEditing
+					onSave={handleSave}
+					hasUnsavedChanges
+					isSaving={isSaving}
+					className="[&_[role=toolbar]]:!bg-sidebar"
+				/>
+			</div>
+		</>
+	);
+}
+
+function DesktopHitlEditPanel() {
+	const panelState = useAtomValue(hitlEditPanelAtom);
+	const closePanel = useSetAtom(closeHitlEditPanelAtom);
+
+	if (!panelState.isOpen || !panelState.onSave) return null;
+
+	return (
+		<div className="flex w-[50%] max-w-[700px] min-w-[380px] flex-col border-l bg-sidebar text-sidebar-foreground animate-in slide-in-from-right-4 duration-300 ease-out">
+			<HitlEditPanelContent
+				title={panelState.title}
+				content={panelState.content}
+				toolName={panelState.toolName}
+				contentFormat={panelState.contentFormat}
+				extraFields={panelState.extraFields}
+				onSave={panelState.onSave}
+				onClose={closePanel}
+			/>
+		</div>
+	);
+}
+
+function MobileHitlEditDrawer() {
+	const panelState = useAtomValue(hitlEditPanelAtom);
+	const closePanel = useSetAtom(closeHitlEditPanelAtom);
+
+	if (!panelState.onSave) return null;
+
+	return (
+		<Drawer
+			open={panelState.isOpen}
+			onOpenChange={(open) => {
+				if (!open) closePanel();
+			}}
+			shouldScaleBackground={false}
+		>
+			<DrawerContent
+				className="h-[90vh] max-h-[90vh] z-80 bg-sidebar overflow-hidden"
+				overlayClassName="z-80"
+			>
+				<DrawerHandle />
+				<DrawerTitle className="sr-only">Edit {panelState.toolName}</DrawerTitle>
+				<div className="min-h-0 flex-1 flex flex-col overflow-hidden">
+					<HitlEditPanelContent
+						title={panelState.title}
+						content={panelState.content}
+						toolName={panelState.toolName}
+						contentFormat={panelState.contentFormat}
+						extraFields={panelState.extraFields}
+						onSave={panelState.onSave}
+						onClose={closePanel}
+						showCloseButton={false}
+					/>
+				</div>
+			</DrawerContent>
+		</Drawer>
+	);
+}
+
+/**
+ * Entry point mounted by the right-panel layout. Renders the desktop
+ * panel on lg+ and the mobile drawer below; both share state via the
+ * ``hitlEditPanelAtom``.
+ */
+export function HitlEditPanel() {
+	const panelState = useAtomValue(hitlEditPanelAtom);
+	const isDesktop = useMediaQuery("(min-width: 1024px)");
+
+	if (!panelState.isOpen) return null;
+
+	if (isDesktop) {
+		return <DesktopHitlEditPanel />;
+	}
+
+	return <MobileHitlEditDrawer />;
+}
+
+/**
+ * Entry point mounted by chat pages so the mobile drawer can render
+ * outside the desktop right-panel container.
+ */
+export function MobileHitlEditPanel() {
+	const panelState = useAtomValue(hitlEditPanelAtom);
+	const isDesktop = useMediaQuery("(min-width: 1024px)");
+
+	if (isDesktop || !panelState.isOpen) return null;
+
+	return <MobileHitlEditDrawer />;
+}
diff --git a/surfsense_web/features/chat-messages/hitl/edit-panel/fields/calendar-field.tsx b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/calendar-field.tsx
new file mode 100644
index 0000000000..534e9a2d59
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/calendar-field.tsx
@@ -0,0 +1,114 @@
+"use client";
+
+import { format } from "date-fns";
+import { CalendarIcon } from "lucide-react";
+import type React from "react";
+import { useCallback, useMemo, useState } from "react";
+import { Button } from "@/components/ui/button";
+import { Calendar } from "@/components/ui/calendar";
+import { Input } from "@/components/ui/input";
+import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+
+function parseDateTimeValue(value: string): { date: Date | undefined; time: string } {
+	if (!value) return { date: undefined, time: "09:00" };
+	try {
+		const d = new Date(value);
+		if (Number.isNaN(d.getTime())) return { date: undefined, time: "09:00" };
+		return {
+			date: d,
+			time: format(d, "HH:mm"),
+		};
+	} catch {
+		return { date: undefined, time: "09:00" };
+	}
+}
+
+function buildLocalDateTimeString(date: Date | undefined, time: string): string {
+	if (!date) return "";
+	const [hours, minutes] = time.split(":").map(Number);
+	const combined = new Date(date);
+	combined.setHours(hours ?? 9, minutes ?? 0, 0, 0);
+	const y = combined.getFullYear();
+	const m = String(combined.getMonth() + 1).padStart(2, "0");
+	const d = String(combined.getDate()).padStart(2, "0");
+	const h = String(combined.getHours()).padStart(2, "0");
+	const min = String(combined.getMinutes()).padStart(2, "0");
+	return `${y}-${m}-${d}T${h}:${min}:00`;
+}
+
+/**
+ * Calendar popover + 24h time input. Emits a local ISO-like string
+ * (``YYYY-MM-DDThh:mm:00``) on every change. Value is parsed back into
+ * date + time on every render so the picker stays in sync with
+ * controlled props.
+ */
+export function DateTimePickerField({
+	id,
+	value,
+	onChange,
+}: {
+	id: string;
+	value: string;
+	onChange: (value: string) => void;
+}) {
+	const parsed = useMemo(() => parseDateTimeValue(value), [value]);
+	const [selectedDate, setSelectedDate] = useState<Date | undefined>(parsed.date);
+	const [time, setTime] = useState(parsed.time);
+	const [open, setOpen] = useState(false);
+
+	const handleDateSelect = useCallback(
+		(day: Date | undefined) => {
+			setSelectedDate(day);
+			onChange(buildLocalDateTimeString(day, time));
+			setOpen(false);
+		},
+		[time, onChange]
+	);
+
+	const handleTimeChange = useCallback(
+		(e: React.ChangeEvent<HTMLInputElement>) => {
+			const newTime = e.target.value;
+			setTime(newTime);
+			onChange(buildLocalDateTimeString(selectedDate, newTime));
+		},
+		[selectedDate, onChange]
+	);
+
+	const displayLabel = selectedDate
+		? `${format(selectedDate, "MMM d, yyyy")} at ${time}`
+		: "Pick date & time";
+
+	return (
+		<div className="flex gap-2">
+			<Popover open={open} onOpenChange={setOpen}>
+				<PopoverTrigger asChild>
+					<Button
+						variant="outline"
+						id={id}
+						type="button"
+						className="h-9 flex-1 justify-start gap-2 bg-transparent px-3 py-1 text-sm"
+					>
+						<CalendarIcon data-icon="inline-start" className="text-muted-foreground" />
+						<span className={selectedDate ? "text-foreground" : "text-muted-foreground"}>
+							{displayLabel}
+						</span>
+					</Button>
+				</PopoverTrigger>
+				<PopoverContent className="w-auto p-0" align="start">
+					<Calendar
+						mode="single"
+						selected={selectedDate}
+						onSelect={handleDateSelect}
+						defaultMonth={selectedDate}
+					/>
+				</PopoverContent>
+			</Popover>
+			<Input
+				type="time"
+				value={time}
+				onChange={handleTimeChange}
+				className="w-[120px] text-sm shrink-0 appearance-none [&::-webkit-calendar-picker-indicator]:hidden [&::-webkit-calendar-picker-indicator]:appearance-none"
+			/>
+		</div>
+	);
+}
diff --git a/surfsense_web/features/chat-messages/hitl/edit-panel/fields/email-tags-field.tsx b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/email-tags-field.tsx
new file mode 100644
index 0000000000..693eaac340
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/email-tags-field.tsx
@@ -0,0 +1,86 @@
+"use client";
+
+import { TagInput, type Tag as TagType } from "emblor";
+import { useCallback, useEffect, useRef, useState } from "react";
+
+function parseEmailsToTags(value: string): TagType[] {
+	if (!value.trim()) return [];
+	return value
+		.split(",")
+		.map((s) => s.trim())
+		.filter(Boolean)
+		.map((email, i) => ({ id: `${Date.now()}-${i}`, text: email }));
+}
+
+function tagsToEmailString(tags: TagType[]): string {
+	return tags.map((t) => t.text).join(", ");
+}
+
+/**
+ * Comma-separated email field rendered as a tag input. Internal tag
+ * state is the source of truth; comma-string is propagated to the
+ * caller via ``onChange`` whenever tags change (skipping the initial
+ * mount to avoid spurious updates).
+ */
+export function EmailsTagField({
+	id,
+	value,
+	onChange,
+	placeholder,
+}: {
+	id: string;
+	value: string;
+	onChange: (value: string) => void;
+	placeholder?: string;
+}) {
+	const [tags, setTags] = useState<TagType[]>(() => parseEmailsToTags(value));
+	const [activeTagIndex, setActiveTagIndex] = useState<number | null>(null);
+	const isInitialMount = useRef(true);
+	const onChangeRef = useRef(onChange);
+	onChangeRef.current = onChange;
+
+	useEffect(() => {
+		if (isInitialMount.current) {
+			isInitialMount.current = false;
+			return;
+		}
+		onChangeRef.current(tagsToEmailString(tags));
+	}, [tags]);
+
+	const handleSetTags = useCallback((newTags: TagType[] | ((prev: TagType[]) => TagType[])) => {
+		setTags((prev) => (typeof newTags === "function" ? newTags(prev) : newTags));
+	}, []);
+
+	const handleAddTag = useCallback((text: string) => {
+		const trimmed = text.trim();
+		if (!trimmed) return;
+		setTags((prev) => {
+			if (prev.some((tag) => tag.text === trimmed)) return prev;
+			const newTag: TagType = { id: Date.now().toString(), text: trimmed };
+			return [...prev, newTag];
+		});
+	}, []);
+
+	return (
+		<TagInput
+			id={id}
+			tags={tags}
+			setTags={handleSetTags}
+			placeholder={placeholder ?? "Add email"}
+			onAddTag={handleAddTag}
+			styleClasses={{
+				inlineTagsContainer:
+					"border border-input rounded-md bg-transparent shadow-xs transition-[color,box-shadow] outline-none focus-within:border-ring p-1 gap-1",
+				input:
+					"w-full min-w-[80px] focus-visible:outline-none shadow-none px-2 h-7 text-foreground placeholder:text-muted-foreground bg-transparent text-sm md:text-sm",
+				tag: {
+					body: "h-7 relative bg-accent dark:bg-muted/60 border-0 hover:bg-accent/80 dark:hover:bg-muted rounded-md font-medium text-xs text-foreground/80 ps-2 pe-7 flex",
+					closeButton:
+						"absolute -inset-y-px -end-px p-0 rounded-e-md flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-foreground hover:text-foreground",
+				},
+			}}
+			activeTagIndex={activeTagIndex}
+			setActiveTagIndex={setActiveTagIndex}
+		/>
+	);
+}
diff --git a/surfsense_web/features/chat-messages/hitl/edit-panel/fields/extra-fields.tsx b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/extra-fields.tsx
new file mode 100644
index 0000000000..4770d2bff6
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/extra-fields.tsx
@@ -0,0 +1,74 @@
+"use client";
+
+import { Input } from "@/components/ui/input";
+import { Label } from "@/components/ui/label";
+import { Textarea } from "@/components/ui/textarea";
+import type { ExtraField } from "../edit-panel.atom";
+import { DateTimePickerField } from "./calendar-field";
+import { EmailsTagField } from "./email-tags-field";
+
+/**
+ * Renders ``ExtraField[]`` as a labelled vertical stack. Picks the
+ * input control from ``field.type``; unknown types fall back to a
+ * plain ``<Input type={field.type} />`` (covers "text" and "email").
+ *
+ * Pure presentational component — owns no state, just maps values to
+ * controls and propagates changes through ``onFieldChange(key, value)``.
+ */
+export function ExtraFieldsSection({
+	fields,
+	values,
+	onFieldChange,
+}: {
+	fields: ExtraField[];
+	values: Record<string, string>;
+	onFieldChange: (key: string, value: string) => void;
+}) {
+	if (fields.length === 0) return null;
+
+	return (
+		<div className="flex flex-col gap-3 px-4 py-3 border-b">
+			{fields.map((field) => {
+				const fieldId = `extra-field-${field.key}`;
+				const currentValue = values[field.key] ?? "";
+
+				return (
+					<div key={field.key} className="flex flex-col gap-1.5">
+						<Label htmlFor={fieldId} className="text-xs font-medium text-muted-foreground">
+							{field.label}
+						</Label>
+						{field.type === "emails" ? (
+							<EmailsTagField
+								id={fieldId}
+								value={currentValue}
+								onChange={(v) => onFieldChange(field.key, v)}
+								placeholder={`Add ${field.label.toLowerCase()}`}
+							/>
+						) : field.type === "datetime-local" ? (
+							<DateTimePickerField
+								id={fieldId}
+								value={currentValue}
+								onChange={(v) => onFieldChange(field.key, v)}
+							/>
+						) : field.type === "textarea" ? (
+							<Textarea
+								id={fieldId}
+								value={currentValue}
+								onChange={(e) => onFieldChange(field.key, e.target.value)}
+								className="text-sm min-h-[60px]"
+							/>
+						) : (
+							<Input
+								id={fieldId}
+								type={field.type}
+								value={currentValue}
+								onChange={(e) => onFieldChange(field.key, e.target.value)}
+								className="text-sm"
+							/>
+						)}
+					</div>
+				);
+			})}
+		</div>
+	);
+}
diff --git a/surfsense_web/features/chat-messages/hitl/edit-panel/fields/index.ts b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/index.ts
new file mode 100644
index 0000000000..dba18fc479
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/edit-panel/fields/index.ts
@@ -0,0 +1,3 @@
+export { DateTimePickerField } from "./calendar-field";
+export { EmailsTagField } from "./email-tags-field";
+export { ExtraFieldsSection } from "./extra-fields";
diff --git a/surfsense_web/features/chat-messages/hitl/edit-panel/index.ts b/surfsense_web/features/chat-messages/hitl/edit-panel/index.ts
new file mode 100644
index 0000000000..141bccdab2
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/edit-panel/index.ts
@@ -0,0 +1,7 @@
+export { HitlEditPanel, HitlEditPanelContent, MobileHitlEditPanel } from "./edit-panel";
+export type { ExtraField } from "./edit-panel.atom";
+export {
+	closeHitlEditPanelAtom,
+	hitlEditPanelAtom,
+	openHitlEditPanelAtom,
+} from "./edit-panel.atom";
diff --git a/surfsense_web/features/chat-messages/hitl/index.ts b/surfsense_web/features/chat-messages/hitl/index.ts
new file mode 100644
index 0000000000..50cc8ad694
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/index.ts
@@ -0,0 +1,31 @@
+export {
+	type HitlApprovalAPI,
+	HitlApprovalCard,
+	PendingInterruptProvider,
+	type PendingInterruptState,
+	type PendingInterruptValue,
+	useHitlApproval,
+	usePendingInterrupt,
+} from "./approval";
+export { DoomLoopApproval, GenericHitlApproval, isDoomLoopInterrupt } from "./approval-cards";
+export {
+	closeHitlEditPanelAtom,
+	type ExtraField,
+	HitlEditPanel,
+	HitlEditPanelContent,
+	hitlEditPanelAtom,
+	MobileHitlEditPanel,
+	openHitlEditPanelAtom,
+} from "./edit-panel";
+export type {
+	HitlDecision,
+	HitlPhase,
+	InterruptActionRequest,
+	InterruptResult,
+	InterruptReviewConfig,
+	PerToolApprovalCard,
+	PerToolApprovalCardProps,
+} from "./types";
+export { isInterruptResult } from "./types";
+export { useHitlDecision } from "./use-hitl-decision";
+export { useHitlPhase } from "./use-hitl-phase";
diff --git a/surfsense_web/features/chat-messages/hitl/types.ts b/surfsense_web/features/chat-messages/hitl/types.ts
new file mode 100644
index 0000000000..76af439f17
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/types.ts
@@ -0,0 +1,59 @@
+import type { ReactNode } from "react";
+
+export interface InterruptActionRequest {
+	name: string;
+	args: Record<string, unknown>;
+}
+
+export interface InterruptReviewConfig {
+	action_name: string;
+	allowed_decisions: Array<"approve" | "edit" | "reject" | "approve_always">;
+}
+
+export interface InterruptResult<C extends Record<string, unknown> = Record<string, unknown>> {
+	__interrupt__: true;
+	__decided__?: "approve" | "reject" | "edit" | "approve_always";
+	__completed__?: boolean;
+	action_requests: InterruptActionRequest[];
+	review_configs: InterruptReviewConfig[];
+	interrupt_type?: string;
+	context?: C;
+	message?: string;
+}
+
+export function isInterruptResult(result: unknown): result is InterruptResult {
+	return (
+		typeof result === "object" &&
+		result !== null &&
+		"__interrupt__" in result &&
+		(result as InterruptResult).__interrupt__ === true
+	);
+}
+
+export interface HitlDecision {
+	type: "approve" | "reject" | "edit" | "approve_always";
+	message?: string;
+	edited_action?: {
+		name: string;
+		args: Record<string, unknown>;
+	};
+}
+
+export type HitlPhase = "pending" | "processing" | "complete" | "rejected";
+
+export interface PerToolApprovalCardProps {
+	toolName: string;
+	toolCallId: string;
+	args: Record<string, unknown>;
+	result: InterruptResult;
+}
+
+/**
+ * Type signature for per-tool fallback approval cards (e.g.
+ * ``GenericHitlApproval``, ``DoomLoopApproval``) mounted by
+ * ``FallbackToolBody`` for unregistered HITL tools.
+ *
+ * Distinct from ``HitlApprovalCard`` (the high-level multi/single
+ * chrome) — this is the per-tool body that the chrome wraps.
+ */
+export type PerToolApprovalCard = (props: PerToolApprovalCardProps) => ReactNode;
diff --git a/surfsense_web/features/chat-messages/hitl/use-hitl-decision.ts b/surfsense_web/features/chat-messages/hitl/use-hitl-decision.ts
new file mode 100644
index 0000000000..f35be46a3f
--- /dev/null
+++ b/surfsense_web/features/chat-messages/hitl/use-hitl-decision.ts
@@ -0,0 +1,32 @@
+import { useCallback } from "react";
+import { useHitlApproval } from "./approval/approval-context";
+import type { HitlDecision } from "./types";
+
+/**
+ * Per-tool components always call ``dispatch([decision])``. We route
+ * through ``HitlApprovalContext`` when mounted inside an approval
+ * card (so multi-approval can stage and pager-navigate), and fall
+ * back to the ``hitl-decision`` window event for standalone callers.
+ */
+export function useHitlDecision() {
+	const approval = useHitlApproval();
+
+	const dispatch = useCallback(
+		(decisions: HitlDecision[]) => {
+			if (approval && decisions.length > 0) {
+				if (decisions.length > 1 && process.env.NODE_ENV !== "production") {
+					console.warn(
+						"[hitl] dispatch received %d decisions inside an approval card; only [0] will be staged",
+						decisions.length
+					);
+				}
+				approval.stage(decisions[0]);
+				return;
+			}
+			window.dispatchEvent(new CustomEvent("hitl-decision", { detail: { decisions } }));
+		},
+		[approval]
+	);
+
+	return { dispatch };
+}
diff --git a/surfsense_web/hooks/use-hitl-phase.ts b/surfsense_web/features/chat-messages/hitl/use-hitl-phase.ts
similarity index 66%
rename from surfsense_web/hooks/use-hitl-phase.ts
rename to surfsense_web/features/chat-messages/hitl/use-hitl-phase.ts
index 2921dc221d..e2dfb3bbaf 100644
--- a/surfsense_web/hooks/use-hitl-phase.ts
+++ b/surfsense_web/features/chat-messages/hitl/use-hitl-phase.ts
@@ -1,6 +1,5 @@
 import { useEffect, useRef, useState } from "react";
-
-export type HitlPhase = "pending" | "processing" | "complete" | "rejected";
+import type { HitlPhase } from "./types";
 
 interface HitlInterruptLike {
 	__decided__?: string | null;
@@ -11,13 +10,20 @@ const MINIMUM_SHIMMER_MS = 500;
 const FALLBACK_TIMEOUT_MS = 30_000;
 
 /**
- * State machine for HITL approval card phases.
+ * Local UI state machine for a HITL approval card.
+ *
+ * Phase transitions:
+ *   pending    → user has not yet decided (show approve/edit/reject buttons)
+ *   processing → user clicked; awaiting backend confirmation (shimmer)
+ *   complete   → backend acknowledged via __completed__ (or fallback timeout)
+ *   rejected   → user explicitly rejected (terminal state, no backend wait)
+ *
+ * Initial phase is derived from the current ``__decided__`` /
+ * ``__completed__`` markers on the result, so cards rehydrate
+ * correctly from persisted history.
  *
- * Phases:
- *   pending    – waiting for user decision (show buttons)
- *   processing – user approved/edited, waiting for backend (shimmer)
- *   complete   – backend responded with __completed__ (done text)
- *   rejected   – user rejected (cancelled text)
+ * NOT shared across cards. Each approval card calls ``useHitlPhase``
+ * once with its own interrupt result.
  */
 export function useHitlPhase(interruptData: HitlInterruptLike): {
 	phase: HitlPhase;
@@ -32,7 +38,6 @@ export function useHitlPhase(interruptData: HitlInterruptLike): {
 
 	const shimmerStartRef = useRef<number | null>(null);
 
-	// processing → complete when __completed__ arrives (with min shimmer duration)
 	useEffect(() => {
 		if (phase !== "processing") return;
 		if (!interruptData.__completed__) return;
@@ -44,7 +49,6 @@ export function useHitlPhase(interruptData: HitlInterruptLike): {
 		return () => clearTimeout(timer);
 	}, [phase, interruptData.__completed__]);
 
-	// Fallback: processing → complete after 30s even if __completed__ never arrives
 	useEffect(() => {
 		if (phase !== "processing") return;
 		const fallback = setTimeout(() => setPhase("complete"), FALLBACK_TIMEOUT_MS);
diff --git a/surfsense_web/features/chat-messages/timeline/build-timeline.ts b/surfsense_web/features/chat-messages/timeline/build-timeline.ts
new file mode 100644
index 0000000000..d4365f211f
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/build-timeline.ts
@@ -0,0 +1,249 @@
+import type { ItemStatus, ReasoningItem, TimelineItem, ToolCallItem } from "./types";
+
+/**
+ * Structural shape of the relay's ``data-thinking-step`` payload.
+ * Declared here (not imported) so the builder stays free of the
+ * legacy ``thinking-steps.tsx`` dependency.
+ */
+export interface ThinkingStepInput {
+	id: string;
+	title: string;
+	items: string[];
+	status: "pending" | "in_progress" | "completed";
+	metadata?: Record<string, unknown>;
+}
+
+/** Narrowed tool-call shape; the assistant-ui content type is wider. */
+interface ToolCallPart {
+	type: "tool-call";
+	toolCallId: string;
+	toolName: string;
+	args?: Record<string, unknown>;
+	argsText?: string;
+	result?: unknown;
+	langchainToolCallId?: string;
+	metadata?: Record<string, unknown>;
+}
+
+function isToolCallPart(part: unknown): part is ToolCallPart {
+	if (!part || typeof part !== "object") return false;
+	const o = part as { type?: unknown; toolCallId?: unknown; toolName?: unknown };
+	return (
+		o.type === "tool-call" && typeof o.toolCallId === "string" && typeof o.toolName === "string"
+	);
+}
+
+function asNonEmptyString(v: unknown): string | undefined {
+	return typeof v === "string" && v.trim().length > 0 ? v.trim() : undefined;
+}
+
+/**
+ * True iff THIS tool-call is the actual interrupt request (carries an
+ * ``action_requests[]``), not just a parent ``task`` wrapper that
+ * inherited the propagated ``__interrupt__`` flag. Pending requests
+ * are hidden so ``HitlApprovalCard`` owns the pending UX; the
+ * ``length > 0`` guard keeps parent task wrappers visible so their
+ * children stay indented under the delegation span.
+ */
+function isPendingHitlInterrupt(result: unknown): boolean {
+	if (typeof result !== "object" || result === null) return false;
+	const r = result as {
+		__interrupt__?: unknown;
+		__decided__?: unknown;
+		action_requests?: unknown;
+	};
+	return (
+		r.__interrupt__ === true &&
+		r.__decided__ === undefined &&
+		Array.isArray(r.action_requests) &&
+		r.action_requests.length > 0
+	);
+}
+
+/**
+ * Stable interrupt signal across pre/post decision: the resume flow
+ * spreads the original result and only adds ``__decided__``, so
+ * ``__interrupt__`` alone is the right key.
+ */
+function hasInterruptMarker(result: unknown): boolean {
+	if (typeof result !== "object" || result === null) return false;
+	return (result as { __interrupt__?: unknown }).__interrupt__ === true;
+}
+
+interface ToolCallSlim {
+	toolName: string;
+	toolCallId: string;
+	result?: unknown;
+	spanId?: string;
+}
+
+/**
+ * During the live-resume window the in-memory message holds BOTH the
+ * OLD interrupt-frame parts AND the freshly-streamed resume parts in
+ * a new ``task`` scope. Without this filter we'd render both until
+ * the next reload (where ``reconcileInterruptedAssistantMessages``
+ * folds the OLD row into the resume row upstream).
+ *
+ * A tool-call is "interrupt-affected" when it either carries
+ * ``__interrupt__`` directly or sits in a span that contains one. An
+ * affected call is superseded iff a later same-name call in a
+ * different scope exists. The conservative branch (no successor)
+ * preserves rejects that ended the run with no replacement.
+ */
+function collectSupersededToolCallIds(content: readonly unknown[]): Set<string> {
+	const slims: ToolCallSlim[] = [];
+	for (const part of content) {
+		if (!isToolCallPart(part)) continue;
+		slims.push({
+			toolName: part.toolName,
+			toolCallId: part.toolCallId,
+			result: part.result,
+			spanId: asNonEmptyString(part.metadata?.spanId),
+		});
+	}
+
+	const interruptedSpans = new Set<string>();
+	for (const tc of slims) {
+		if (!hasInterruptMarker(tc.result)) continue;
+		if (tc.spanId) interruptedSpans.add(tc.spanId);
+	}
+
+	const superseded = new Set<string>();
+	for (let i = 0; i < slims.length; i++) {
+		const tc = slims[i];
+		const inInterruptedSpan = tc.spanId !== undefined && interruptedSpans.has(tc.spanId);
+		const isDirectInterrupt = hasInterruptMarker(tc.result);
+		if (!inInterruptedSpan && !isDirectInterrupt) continue;
+
+		for (let j = i + 1; j < slims.length; j++) {
+			// Both-undefined counts as different scopes so standalone
+			// HITL tools (no delegation) get caught.
+			const sameSpan = tc.spanId !== undefined && slims[j].spanId === tc.spanId;
+			if (slims[j].toolName === tc.toolName && !sameSpan) {
+				superseded.add(tc.toolCallId);
+				break;
+			}
+		}
+	}
+
+	return superseded;
+}
+
+/**
+ * Coarse status for orphan tool-calls (no joined thinking step). The
+ * per-tool body picks its own visual state from ``result``; this
+ * only feeds the chrome dot/header.
+ */
+function deriveToolCallStatus(result: unknown): ItemStatus {
+	if (!result) return "running";
+	if (typeof result === "object" && result !== null) {
+		const r = result as { __interrupt__?: unknown; __decided__?: unknown };
+		if (r.__interrupt__ === true && r.__decided__ === "reject") return "cancelled";
+	}
+	return "completed";
+}
+
+function mapStepStatus(status: ThinkingStepInput["status"]): ItemStatus {
+	if (status === "in_progress") return "running";
+	return status;
+}
+
+/**
+ * Pure builder: thinking steps + message content → ``TimelineItem[]``.
+ * Joins tool-calls to thinking steps via ``metadata.thinkingStepId``,
+ * appends unjoined tool-calls as orphans, drops superseded
+ * interrupt-frame parts and pending HITL requests (those are owned
+ * by ``HitlApprovalCard``). ``result`` is forwarded verbatim so
+ * per-tool bodies can discriminate.
+ */
+export function buildTimeline(
+	thinkingSteps: readonly ThinkingStepInput[],
+	content: readonly unknown[] | undefined
+): TimelineItem[] {
+	const toolByStepId = new Map<string, ToolCallPart>();
+	const supersededStepIds = new Set<string>();
+	const consumedToolCallIds = new Set<string>();
+	const superseded = content ? collectSupersededToolCallIds(content) : new Set<string>();
+
+	if (content) {
+		for (const part of content) {
+			if (!isToolCallPart(part)) continue;
+			const tid = asNonEmptyString(part.metadata?.thinkingStepId);
+			if (superseded.has(part.toolCallId)) {
+				if (tid) supersededStepIds.add(tid);
+				continue;
+			}
+			if (tid) toolByStepId.set(tid, part);
+		}
+	}
+
+	const items: TimelineItem[] = [];
+
+	for (const step of thinkingSteps) {
+		// Drop the step alongside its superseded tool-call, otherwise
+		// it'd render as an orphan reasoning row with the OLD title.
+		if (supersededStepIds.has(step.id)) continue;
+
+		const stepSpanId = asNonEmptyString(step.metadata?.spanId);
+		const joined = toolByStepId.get(step.id);
+
+		if (joined && isPendingHitlInterrupt(joined.result)) {
+			consumedToolCallIds.add(joined.toolCallId);
+			continue;
+		}
+
+		if (joined) {
+			consumedToolCallIds.add(joined.toolCallId);
+			const item: ToolCallItem = {
+				kind: "tool-call",
+				id: step.id,
+				status: mapStepStatus(step.status),
+				items: step.items.length > 0 ? step.items : undefined,
+				spanId: stepSpanId ?? asNonEmptyString(joined.metadata?.spanId),
+				toolName: joined.toolName,
+				toolCallId: joined.toolCallId,
+				args: joined.args ?? {},
+				argsText: joined.argsText,
+				result: joined.result,
+				langchainToolCallId: joined.langchainToolCallId,
+				thinkingStepId: step.id,
+			};
+			items.push(item);
+			continue;
+		}
+
+		const reasoning: ReasoningItem = {
+			kind: "reasoning",
+			id: step.id,
+			status: mapStepStatus(step.status),
+			items: step.items.length > 0 ? step.items : undefined,
+			spanId: stepSpanId,
+			title: step.title,
+		};
+		items.push(reasoning);
+	}
+
+	if (content) {
+		for (const part of content) {
+			if (!isToolCallPart(part)) continue;
+			if (consumedToolCallIds.has(part.toolCallId)) continue;
+			if (superseded.has(part.toolCallId)) continue;
+			if (isPendingHitlInterrupt(part.result)) continue;
+			const orphan: ToolCallItem = {
+				kind: "tool-call",
+				id: part.toolCallId,
+				status: deriveToolCallStatus(part.result),
+				spanId: asNonEmptyString(part.metadata?.spanId),
+				toolName: part.toolName,
+				toolCallId: part.toolCallId,
+				args: part.args ?? {},
+				argsText: part.argsText,
+				result: part.result,
+				langchainToolCallId: part.langchainToolCallId,
+			};
+			items.push(orphan);
+		}
+	}
+
+	return items;
+}
diff --git a/surfsense_web/features/chat-messages/timeline/data-renderer.tsx b/surfsense_web/features/chat-messages/timeline/data-renderer.tsx
new file mode 100644
index 0000000000..fb3dda047d
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/data-renderer.tsx
@@ -0,0 +1,56 @@
+"use client";
+
+import { makeAssistantDataUI, useAuiState } from "@assistant-ui/react";
+import { useMemo } from "react";
+import { PendingInterruptProvider, usePendingInterrupt } from "@/features/chat-messages/hitl";
+import { buildTimeline, type ThinkingStepInput } from "./build-timeline";
+import { Timeline } from "./timeline";
+
+const noopSubmit = () => {};
+
+/**
+ * assistant-ui data UI for the ``thinking-steps`` data-part.
+ *
+ * Re-scopes the global ``PendingInterruptProvider`` per message: approval
+ * cards only mount under the assistant message that owns the interrupt
+ * (otherwise every message in scrollback would render its own cards).
+ */
+function TimelineDataRenderer({ data }: { name: string; data: unknown }) {
+	const isThreadRunning = useAuiState(({ thread }) => thread.isRunning);
+	const isLastMessage = useAuiState(({ message }) => message?.isLast ?? false);
+	const isMessageStreaming = isThreadRunning && isLastMessage;
+	const content = useAuiState(({ message }) => message?.content);
+	const messageId = useAuiState(({ message }) => message?.id);
+	const pendingValue = usePendingInterrupt();
+	const pendingForThisMessage = useMemo(
+		() => (pendingValue?.pendingInterrupts ?? []).filter((p) => p.assistantMsgId === messageId),
+		[pendingValue?.pendingInterrupts, messageId]
+	);
+	const onSubmit = pendingValue?.onSubmit ?? noopSubmit;
+
+	const steps = useMemo<ThinkingStepInput[]>(
+		() => (data as { steps: ThinkingStepInput[] } | null)?.steps ?? [],
+		[data]
+	);
+
+	const items = useMemo(
+		() => buildTimeline(steps, Array.isArray(content) ? content : undefined),
+		[steps, content]
+	);
+
+	if (items.length === 0 && pendingForThisMessage.length === 0) return null;
+
+	return (
+		<div className="mb-3 -mx-2 leading-normal">
+			<PendingInterruptProvider pendingInterrupts={pendingForThisMessage} onSubmit={onSubmit}>
+				<Timeline items={items} isThreadRunning={isMessageStreaming} />
+			</PendingInterruptProvider>
+		</div>
+	);
+}
+
+/** Registers under ``thinking-steps`` so consumers swap the import only. */
+export const TimelineDataUI = makeAssistantDataUI({
+	name: "thinking-steps",
+	render: TimelineDataRenderer,
+});
diff --git a/surfsense_web/features/chat-messages/timeline/grouping.ts b/surfsense_web/features/chat-messages/timeline/grouping.ts
new file mode 100644
index 0000000000..478ec4b7bf
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/grouping.ts
@@ -0,0 +1,47 @@
+import type { TimelineGroup, TimelineItem } from "./types";
+
+/**
+ * Group delegated child items under their owning ``task`` parent.
+ *
+ * Backend invariant: ``metadata.spanId`` is set only while a ``task``
+ * tool is open, so every non-task item with ``spanId = X`` shares it
+ * with the ``task`` that owns the span. We promote that task to the
+ * group header.
+ *
+ * The owner-missing branch defends against the live-resume window
+ * where the OLD ``task`` wrapper can be superseded while its
+ * children briefly survive — without it, grouping would promote
+ * the first orphan child to parent and visually nest its siblings
+ * under it.
+ */
+export function groupItems(items: readonly TimelineItem[]): TimelineGroup[] {
+	const spanOwners = new Set<string>();
+	for (const item of items) {
+		if (item.kind === "tool-call" && item.toolName === "task" && item.spanId) {
+			spanOwners.add(item.spanId);
+		}
+	}
+
+	const groups: TimelineGroup[] = [];
+	const spanParent = new Map<string, TimelineGroup>();
+
+	for (const item of items) {
+		const sid = item.spanId;
+		if (!sid || !spanOwners.has(sid)) {
+			groups.push({ parent: item, children: [] });
+			continue;
+		}
+
+		const existing = spanParent.get(sid);
+		if (existing) {
+			existing.children.push(item);
+			continue;
+		}
+
+		const group: TimelineGroup = { parent: item, children: [] };
+		groups.push(group);
+		spanParent.set(sid, group);
+	}
+
+	return groups;
+}
diff --git a/surfsense_web/features/chat-messages/timeline/index.ts b/surfsense_web/features/chat-messages/timeline/index.ts
new file mode 100644
index 0000000000..5e731acd31
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/index.ts
@@ -0,0 +1,16 @@
+/**
+ * Public surface of the ``timeline/`` slice.
+ *
+ * Consumers (assistant-message, public-thread, free-chat-page, etc.)
+ * import ONLY from this barrel. Internal modules — ``items/``,
+ * ``tool-registry/``, ``timeline-group-row``, ``build-timeline``,
+ * ``grouping``, ``subagent-rename`` — are intentionally NOT
+ * re-exported. Adding consumers? Talk to the architecture doc first
+ * (see §6 layering rules).
+ */
+
+export type { ThinkingStepInput } from "./build-timeline";
+export { TimelineDataUI } from "./data-renderer";
+export { Timeline } from "./timeline";
+export type { TimelineToolComponent, TimelineToolProps } from "./tool-registry/types";
+export type { ItemStatus, ReasoningItem, TimelineGroup, TimelineItem, ToolCallItem } from "./types";
diff --git a/surfsense_web/features/chat-messages/timeline/items/index.ts b/surfsense_web/features/chat-messages/timeline/items/index.ts
new file mode 100644
index 0000000000..7b1817b61a
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/items/index.ts
@@ -0,0 +1,3 @@
+export { ItemHeader } from "./item-header";
+export { ReasoningItem } from "./reasoning-item";
+export { ToolCallItem } from "./tool-call-item";
diff --git a/surfsense_web/features/chat-messages/timeline/items/item-header.tsx b/surfsense_web/features/chat-messages/timeline/items/item-header.tsx
new file mode 100644
index 0000000000..1926559860
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/items/item-header.tsx
@@ -0,0 +1,52 @@
+import type { FC } from "react";
+import { ChainOfThoughtItem } from "@/components/prompt-kit/chain-of-thought";
+import { cn } from "@/lib/utils";
+import type { ItemStatus } from "../types";
+
+/**
+ * The title row + sub-bullets shared by every timeline item kind. The
+ * timeline's chrome (status dot, indent, vertical line) renders to the
+ * left; this fills the right column.
+ *
+ * Status-aware text styling matches the legacy ``StepBody`` semantics:
+ *   running   → emphasised (font-medium foreground)
+ *   completed → muted
+ *   pending   → muted/60
+ *   error     → destructive
+ *   cancelled → strikethrough muted
+ *
+ * Sub-bullets render via ``ChainOfThoughtItem`` (reused from
+ * ``components/prompt-kit/chain-of-thought``) — same component the
+ * legacy ``StepBody`` used.
+ */
+export const ItemHeader: FC<{
+	title: string;
+	status: ItemStatus;
+	items?: readonly string[];
+	itemKey: string;
+}> = ({ title, status, items, itemKey }) => (
+	<div className="min-w-0">
+		<div
+			className={cn(
+				"text-sm leading-5",
+				status === "running" && "text-foreground font-medium",
+				status === "completed" && "text-muted-foreground",
+				status === "pending" && "text-muted-foreground/60",
+				status === "error" && "text-destructive",
+				status === "cancelled" && "text-muted-foreground line-through"
+			)}
+		>
+			{title}
+		</div>
+
+		{items && items.length > 0 && (
+			<div className="mt-1 space-y-0.5">
+				{items.map((item) => (
+					<ChainOfThoughtItem key={`${itemKey}-${item}`} className="text-xs">
+						{item}
+					</ChainOfThoughtItem>
+				))}
+			</div>
+		)}
+	</div>
+);
diff --git a/surfsense_web/features/chat-messages/timeline/items/reasoning-item.tsx b/surfsense_web/features/chat-messages/timeline/items/reasoning-item.tsx
new file mode 100644
index 0000000000..3198d3375a
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/items/reasoning-item.tsx
@@ -0,0 +1,15 @@
+import type { FC } from "react";
+import type { ReasoningItem as ReasoningItemModel } from "../types";
+import { ItemHeader } from "./item-header";
+
+/**
+ * Renders a ``kind: "reasoning"`` row — pure agent narration with no
+ * tool component beneath it. Just the shared header.
+ *
+ * Native ``<think>`` blocks (model-level reasoning) are NOT rendered
+ * here — they live in the body via assistant-ui's ``Reasoning``
+ * component.
+ */
+export const ReasoningItem: FC<{ item: ReasoningItemModel }> = ({ item }) => (
+	<ItemHeader title={item.title} status={item.status} items={item.items} itemKey={item.id} />
+);
diff --git a/surfsense_web/features/chat-messages/timeline/items/tool-call-item.tsx b/surfsense_web/features/chat-messages/timeline/items/tool-call-item.tsx
new file mode 100644
index 0000000000..4f96c8bd5a
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/items/tool-call-item.tsx
@@ -0,0 +1,26 @@
+"use client";
+
+import type { FC } from "react";
+import { getToolDisplayName } from "@/contracts/enums/toolIcons";
+import { resolveItemTitle } from "../subagent-rename";
+import { adaptItemToProps, FallbackToolBody, getToolComponent } from "../tool-registry";
+import type { ToolCallItem as ToolCallItemModel } from "../types";
+import { ItemHeader } from "./item-header";
+
+/**
+ * Renders a tool-call row. Pending HITL interrupts are filtered
+ * upstream in ``buildTimeline`` (owned by ``HitlApprovalCard``); this
+ * component only sees running / completed / errored / decided rows.
+ */
+export const ToolCallItem: FC<{ item: ToolCallItemModel }> = ({ item }) => {
+	const title = resolveItemTitle(item, getToolDisplayName);
+	const Body = getToolComponent(item.toolName) ?? FallbackToolBody;
+	const props = adaptItemToProps(item);
+
+	return (
+		<>
+			<ItemHeader title={title} status={item.status} items={item.items} itemKey={item.id} />
+			<Body {...props} />
+		</>
+	);
+};
diff --git a/surfsense_web/features/chat-messages/timeline/subagent-rename.ts b/surfsense_web/features/chat-messages/timeline/subagent-rename.ts
new file mode 100644
index 0000000000..87accd8d79
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/subagent-rename.ts
@@ -0,0 +1,47 @@
+import type { TimelineItem, ToolCallItem } from "./types";
+
+function asNonEmptyString(v: unknown): string | undefined {
+	return typeof v === "string" && v.trim().length > 0 ? v.trim() : undefined;
+}
+
+/**
+ * Title-case a subagent identifier:
+ *   "notion"        → "Notion"
+ *   "doc_research"  → "Doc Research"
+ *   "ux-review"     → "Ux Review"
+ */
+export function titleCaseSubagent(raw: string): string {
+	return raw
+		.split(/[\s_-]+/)
+		.filter(Boolean)
+		.map((part) => part.charAt(0).toUpperCase() + part.slice(1))
+		.join(" ");
+}
+
+/**
+ * Display title for a tool-call item. For the ``task`` delegation
+ * primitive, substitute ``args.subagent_type`` (e.g. "Notion" instead
+ * of the generic "Task" label). Returns ``undefined`` if no rename
+ * applies — caller falls back to ``getToolDisplayName(toolName)``.
+ */
+export function resolveSubagentTitle(item: ToolCallItem): string | undefined {
+	if (item.toolName !== "task") return undefined;
+	const subagent = asNonEmptyString(item.args?.subagent_type);
+	return subagent ? titleCaseSubagent(subagent) : undefined;
+}
+
+/**
+ * Unified title resolver for any timeline item. Reasoning items use
+ * their own ``title``; tool-call items try the subagent rename first,
+ * then fall back to the resolver passed in (typically
+ * ``getToolDisplayName``).
+ *
+ * Pure: no React, no I/O. Trivially testable.
+ */
+export function resolveItemTitle(
+	item: TimelineItem,
+	getToolDisplayName: (toolName: string) => string
+): string {
+	if (item.kind === "reasoning") return item.title;
+	return resolveSubagentTitle(item) ?? getToolDisplayName(item.toolName);
+}
diff --git a/surfsense_web/features/chat-messages/timeline/timeline-group-row.tsx b/surfsense_web/features/chat-messages/timeline/timeline-group-row.tsx
new file mode 100644
index 0000000000..4c33e52bdd
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/timeline-group-row.tsx
@@ -0,0 +1,68 @@
+"use client";
+
+import type { FC } from "react";
+import { cn } from "@/lib/utils";
+import { ReasoningItem, ToolCallItem } from "./items";
+import type { ItemStatus, TimelineGroup, TimelineItem } from "./types";
+
+function renderItem(item: TimelineItem) {
+	if (item.kind === "reasoning") return <ReasoningItem item={item} />;
+	return <ToolCallItem item={item} />;
+}
+
+/**
+ * Single group row in the timeline tree: status dot + connector line in
+ * the gutter, parent item content + indented children in the body.
+ *
+ * The connector line overshoots by ~15px to land on the next group's
+ * dot center; the line passes BEHIND any indented children (whose
+ * column has no dot of its own) for a clean tree look.
+ */
+export const TimelineGroupRow: FC<{
+	group: TimelineGroup;
+	parentStatus: ItemStatus;
+	showParentLine: boolean;
+}> = ({ group, parentStatus, showParentLine }) => {
+	const hasChildren = group.children.length > 0;
+
+	return (
+		<div className="relative flex gap-3">
+			<div className="relative flex flex-col items-center w-2 self-stretch">
+				{showParentLine && (
+					<div className="absolute left-1/2 top-[15px] -bottom-[15px] w-px -translate-x-1/2 bg-muted-foreground/30" />
+				)}
+				<div className="relative z-10 mt-[7px] flex shrink-0 items-center justify-center">
+					{parentStatus === "running" ? (
+						<span className="relative flex size-2">
+							<span className="absolute inline-flex size-full animate-ping rounded-full bg-primary/60" />
+							<span className="relative inline-flex size-2 rounded-full bg-primary" />
+						</span>
+					) : (
+						<span
+							className={cn(
+								"size-2 rounded-full",
+								parentStatus === "error"
+									? "bg-destructive"
+									: parentStatus === "cancelled"
+										? "bg-muted-foreground/40"
+										: "bg-muted-foreground/30"
+							)}
+						/>
+					)}
+				</div>
+			</div>
+
+			<div className="flex-1 min-w-0 pb-4">
+				{renderItem(group.parent)}
+
+				{hasChildren && (
+					<div className="mt-2 ml-3 space-y-2">
+						{group.children.map((child) => (
+							<div key={child.id}>{renderItem(child)}</div>
+						))}
+					</div>
+				)}
+			</div>
+		</div>
+	);
+};
diff --git a/surfsense_web/features/chat-messages/timeline/timeline.tsx b/surfsense_web/features/chat-messages/timeline/timeline.tsx
new file mode 100644
index 0000000000..c0074722ee
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/timeline.tsx
@@ -0,0 +1,158 @@
+"use client";
+
+import { ChevronRightIcon } from "lucide-react";
+import { type FC, useEffect, useMemo, useState } from "react";
+import { TextShimmerLoader } from "@/components/prompt-kit/loader";
+import { Button } from "@/components/ui/button";
+import { getToolDisplayName } from "@/contracts/enums/toolIcons";
+import { HitlApprovalCard, usePendingInterrupt } from "@/features/chat-messages/hitl";
+import { cn } from "@/lib/utils";
+import { groupItems } from "./grouping";
+import { resolveItemTitle } from "./subagent-rename";
+import { TimelineGroupRow } from "./timeline-group-row";
+import type { ItemStatus, TimelineItem } from "./types";
+
+/**
+ * Force a stale "running" to read as "completed" once the thread
+ * stops, so the chrome doesn't keep pulsing forever after a stream
+ * is aborted or disconnected.
+ */
+function effectiveStatus(status: ItemStatus, isThreadRunning: boolean): ItemStatus {
+	if (status === "running" && !isThreadRunning) return "completed";
+	return status;
+}
+
+/**
+ * The "process" surface in the body | timeline split. Pure consumer
+ * of ``TimelineItem[]`` — owns the collapsible chrome and tree
+ * indent only. Pending HITL interrupts mount ``HitlApprovalCard`` at
+ * the bottom; the card owns its own decision/pager state.
+ */
+export const Timeline: FC<{
+	items: readonly TimelineItem[];
+	isThreadRunning?: boolean;
+}> = ({ items, isThreadRunning = true }) => {
+	const pendingValue = usePendingInterrupt();
+	const pendingInterrupts = pendingValue?.pendingInterrupts ?? [];
+	const onSubmit = pendingValue?.onSubmit;
+	const hasPending = pendingInterrupts.length > 0;
+
+	// Apply the override here so downstream (grouping, headers, dots)
+	// sees the corrected status without threading a callback. Keeps
+	// ``buildTimeline`` pure.
+	const effectiveItems = useMemo<TimelineItem[]>(
+		() =>
+			items.map((it) => ({
+				...it,
+				status: effectiveStatus(it.status, isThreadRunning),
+			})),
+		[items, isThreadRunning]
+	);
+
+	const inProgressItem = useMemo(
+		() => effectiveItems.find((it) => it.status === "running"),
+		[effectiveItems]
+	);
+	const inProgressTitle = useMemo(
+		() => (inProgressItem ? resolveItemTitle(inProgressItem, getToolDisplayName) : undefined),
+		[inProgressItem]
+	);
+
+	// "Settled" includes cancelled/errored, not just completed —
+	// rejecting an interrupt leaves items in ``cancelled`` and the
+	// timeline still needs to auto-collapse.
+	const allSettled = useMemo(
+		() =>
+			effectiveItems.length > 0 &&
+			!isThreadRunning &&
+			!hasPending &&
+			effectiveItems.every(
+				(it) => it.status === "completed" || it.status === "cancelled" || it.status === "error"
+			),
+		[effectiveItems, isThreadRunning, hasPending]
+	);
+	const isProcessing = (isThreadRunning || hasPending) && !allSettled;
+
+	const [isOpen, setIsOpen] = useState(() => isProcessing);
+	useEffect(() => {
+		if (isProcessing) {
+			setIsOpen(true);
+			return;
+		}
+		if (allSettled) {
+			setIsOpen(false);
+		}
+	}, [allSettled, isProcessing]);
+
+	const groups = useMemo(() => groupItems(effectiveItems), [effectiveItems]);
+
+	if (effectiveItems.length === 0 && !hasPending) return null;
+
+	const headerText = (() => {
+		if (allSettled) return "Reviewed";
+		if (hasPending) return "Awaiting your decision";
+		if (inProgressTitle) return inProgressTitle;
+		if (isProcessing) return "Processing";
+		return "Reviewed";
+	})();
+
+	return (
+		<div className="mx-auto w-full max-w-(--thread-max-width) px-2 py-2">
+			<div className="rounded-lg">
+				<Button
+					variant="ghost"
+					type="button"
+					onClick={() => setIsOpen((prev) => !prev)}
+					className={cn(
+						"h-auto w-full justify-start gap-1.5 p-0 text-left text-sm font-normal transition-colors hover:bg-transparent",
+						"text-muted-foreground hover:text-accent-foreground"
+					)}
+				>
+					{isProcessing ? (
+						<TextShimmerLoader text={headerText} size="sm" />
+					) : (
+						<span>{headerText}</span>
+					)}
+					<ChevronRightIcon
+						data-icon="inline-end"
+						className={cn("transition-transform duration-200", isOpen && "rotate-90")}
+					/>
+				</Button>
+
+				<div
+					className={cn(
+						"grid transition-[grid-template-rows] duration-300 ease-out",
+						isOpen ? "grid-rows-[1fr]" : "grid-rows-[0fr]"
+					)}
+				>
+					<div className="overflow-hidden">
+						<div className="mt-3 pl-1">
+							{groups.map((group, idx) => {
+								const showLine = idx < groups.length - 1 || hasPending;
+								return (
+									<TimelineGroupRow
+										key={group.parent.id}
+										group={group}
+										parentStatus={group.parent.status}
+										showParentLine={showLine}
+									/>
+								);
+							})}
+							{hasPending && onSubmit && (
+								<div className="pl-5 space-y-3">
+									{pendingInterrupts.map((pi) => (
+										<HitlApprovalCard
+											key={pi.interruptId}
+											pendingInterrupt={pi}
+											onSubmit={(decisions) => onSubmit(pi.interruptId, decisions)}
+										/>
+									))}
+								</div>
+							)}
+						</div>
+					</div>
+				</div>
+			</div>
+		</div>
+	);
+};
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/adapt-props.ts b/surfsense_web/features/chat-messages/timeline/tool-registry/adapt-props.ts
new file mode 100644
index 0000000000..ff9671372b
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/adapt-props.ts
@@ -0,0 +1,23 @@
+import type { ToolCallItem } from "../types";
+import type { TimelineToolProps } from "./types";
+
+/**
+ * Lossless mapping ``ToolCallItem → TimelineToolProps``. Pure;
+ * extracts only the fields tool components actually consume.
+ *
+ * ``id``, ``kind``, ``items``, ``spanId``, ``thinkingStepId`` are
+ * intentionally dropped — they're timeline-internal concerns (React
+ * key, dispatch, indentation, back-correlation) that tool components
+ * have no reason to see.
+ */
+export function adaptItemToProps(item: ToolCallItem): TimelineToolProps {
+	return {
+		toolCallId: item.toolCallId,
+		toolName: item.toolName,
+		args: item.args,
+		argsText: item.argsText,
+		result: item.result,
+		langchainToolCallId: item.langchainToolCallId,
+		status: item.status,
+	};
+}
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/default-fallback-card.tsx b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/default-fallback-card.tsx
new file mode 100644
index 0000000000..57165c05fc
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/default-fallback-card.tsx
@@ -0,0 +1,217 @@
+"use client";
+
+import { CheckIcon, ChevronDownIcon, XCircleIcon } from "lucide-react";
+import { useEffect, useMemo, useState } from "react";
+import { NestedScroll } from "@/components/assistant-ui/nested-scroll";
+import { Badge } from "@/components/ui/badge";
+import { Button } from "@/components/ui/button";
+import { Card } from "@/components/ui/card";
+import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
+import { Separator } from "@/components/ui/separator";
+import { Spinner } from "@/components/ui/spinner";
+import { getToolDisplayName } from "@/contracts/enums/toolIcons";
+import { cn } from "@/lib/utils";
+import type { TimelineToolComponent } from "../types";
+import { ToolCardRevertButton } from "./revert-button";
+
+/**
+ * Best-effort error/cancellation reason from a tool result. Used as
+ * the card subtitle when ``status`` is "error" or "cancelled". Returns
+ * ``null`` if no usable text can be extracted.
+ *
+ * Tries: plain string → ``result.error`` → ``result.message`` →
+ * stringified result. Per-tool components own richer error UIs; this
+ * is the generic fallback's coarse summary.
+ */
+function deriveResultMessage(result: unknown): string | null {
+	if (result == null) return null;
+	if (typeof result === "string") return result;
+	if (typeof result !== "object") return null;
+	const r = result as { error?: unknown; message?: unknown };
+	if (typeof r.error === "string") return r.error;
+	if (typeof r.message === "string") return r.message;
+	try {
+		return JSON.stringify(result);
+	} catch {
+		return null;
+	}
+}
+
+/**
+ * Compact tool-call card. Used by ``FallbackToolBody`` for unregistered
+ * tools whose result is not an HITL interrupt.
+ *
+ * shadcn composition note: ``Card`` is used as a visual frame WITHOUT
+ * ``CardHeader``/``CardContent`` — the full composition's ``p-6``
+ * doesn't fit a compact collapsible header that IS the trigger.
+ *
+ * Per-card expansion auto-syncs to ``isRunning`` (auto-expand on
+ * stream start, auto-collapse on completion); manual toggle takes over
+ * once streaming ends.
+ */
+export const DefaultFallbackCard: TimelineToolComponent = ({
+	toolCallId,
+	toolName,
+	argsText,
+	result,
+	status,
+	langchainToolCallId,
+}) => {
+	const isCancelled = status === "cancelled";
+	const isError = status === "error";
+	const isRunning = status === "running";
+
+	const [isExpanded, setIsExpanded] = useState(isRunning);
+	useEffect(() => {
+		setIsExpanded(isRunning);
+	}, [isRunning]);
+
+	const serializedResult = useMemo(
+		() =>
+			result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null,
+		[result]
+	);
+
+	const subtitle = useMemo(
+		() => (isError || isCancelled ? deriveResultMessage(result) : null),
+		[isError, isCancelled, result]
+	);
+
+	const displayName = getToolDisplayName(toolName);
+
+	return (
+		<Card
+			className={cn(
+				"my-4 max-w-lg overflow-hidden",
+				isCancelled && "opacity-60",
+				isError && "border-destructive/30"
+			)}
+		>
+			<Collapsible
+				className="group"
+				open={isExpanded}
+				onOpenChange={(next) => {
+					if (isRunning) return;
+					setIsExpanded(next);
+				}}
+			>
+				<div className="flex items-stretch transition-colors hover:bg-accent hover:text-accent-foreground">
+					<CollapsibleTrigger asChild>
+						<Button
+							variant="ghost"
+							type="button"
+							className={cn(
+								"h-auto flex-1 min-w-0 justify-start gap-3 rounded-none py-4 pl-5 pr-2 text-left font-normal hover:bg-transparent",
+								"focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-inset",
+								"disabled:cursor-default"
+							)}
+						>
+							<div
+								className={cn(
+									"flex size-8 shrink-0 items-center justify-center rounded-lg",
+									isError ? "bg-destructive/10" : isCancelled ? "bg-muted" : "bg-primary/10"
+								)}
+							>
+								{isError ? (
+									<XCircleIcon className="size-4 text-destructive" />
+								) : isCancelled ? (
+									<XCircleIcon className="size-4 text-muted-foreground" />
+								) : isRunning ? (
+									<Spinner size="sm" className="text-primary" />
+								) : (
+									<CheckIcon className="size-4 text-primary" />
+								)}
+							</div>
+
+							<div className="flex flex-1 min-w-0 flex-col gap-0.5">
+								<div className="flex items-center gap-2">
+									<p
+										className={cn(
+											"text-sm font-semibold truncate",
+											isCancelled && "text-muted-foreground line-through",
+											isError && "text-destructive"
+										)}
+									>
+										{displayName}
+									</p>
+									{isRunning && <Badge variant="secondary">Running</Badge>}
+									{isError && <Badge variant="destructive">Failed</Badge>}
+									{isCancelled && <Badge variant="outline">Cancelled</Badge>}
+								</div>
+								{subtitle && (
+									<p
+										className={cn(
+											"text-xs truncate",
+											isError ? "text-destructive/80" : "text-muted-foreground"
+										)}
+									>
+										{subtitle}
+									</p>
+								)}
+							</div>
+						</Button>
+					</CollapsibleTrigger>
+
+					<div className="flex shrink-0 items-center gap-2 pl-2 pr-5">
+						<ToolCardRevertButton
+							toolCallId={toolCallId}
+							toolName={toolName}
+							langchainToolCallId={langchainToolCallId}
+						/>
+						<CollapsibleTrigger asChild>
+							<Button
+								type="button"
+								variant="ghost"
+								size="icon"
+								aria-label={isExpanded ? "Collapse details" : "Expand details"}
+								className="size-7 shrink-0"
+							>
+								<ChevronDownIcon
+									className={cn(
+										"size-4 transition-transform duration-200",
+										"group-data-[state=open]:rotate-180"
+									)}
+								/>
+							</Button>
+						</CollapsibleTrigger>
+					</div>
+				</div>
+
+				<CollapsibleContent>
+					<Separator />
+					<div className="flex flex-col gap-3 px-5 py-3">
+						{(argsText || isRunning) && (
+							<div className="flex flex-col gap-1 min-w-0">
+								<p className="text-xs font-medium text-muted-foreground">Inputs</p>
+								<NestedScroll className="max-h-48 overflow-auto rounded-md bg-muted/40">
+									{argsText ? (
+										<pre className="px-3 py-2 text-xs text-foreground/80 whitespace-pre-wrap break-all font-mono">
+											{argsText}
+										</pre>
+									) : (
+										<p className="px-3 py-2 text-xs italic text-muted-foreground">
+											Waiting for input…
+										</p>
+									)}
+								</NestedScroll>
+							</div>
+						)}
+						{!isCancelled && result !== undefined && (
+							<>
+								<Separator />
+								<div className="flex flex-col gap-1 min-w-0">
+									<p className="text-xs font-medium text-muted-foreground">Result</p>
+									<NestedScroll className="max-h-64 overflow-auto rounded-md bg-muted/40">
+										<pre className="px-3 py-2 text-xs text-foreground/80 whitespace-pre-wrap break-all font-mono">
+											{typeof result === "string" ? result : serializedResult}
+										</pre>
+									</NestedScroll>
+								</div>
+							</>
+						)}
+					</div>
+				</CollapsibleContent>
+			</Collapsible>
+		</Card>
+	);
+};
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/fallback-tool-body.tsx b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/fallback-tool-body.tsx
new file mode 100644
index 0000000000..562bbfc10a
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/fallback-tool-body.tsx
@@ -0,0 +1,41 @@
+"use client";
+
+import {
+	DoomLoopApproval,
+	GenericHitlApproval,
+	type InterruptResult,
+	isDoomLoopInterrupt,
+	isInterruptResult,
+} from "@/features/chat-messages/hitl";
+import type { TimelineToolComponent } from "../types";
+import { DefaultFallbackCard } from "./default-fallback-card";
+
+/**
+ * Mounted by the timeline for any tool name not in the registry. The
+ * fallback owns the inner discrimination between HITL approval cards
+ * and the default visual card:
+ *
+ *   isInterruptResult(result) ─┬─ isDoomLoopInterrupt → DoomLoopApproval
+ *                              └─ otherwise           → GenericHitlApproval
+ *   else                                              → DefaultFallbackCard
+ *
+ * This is the ONLY place ``isInterruptResult`` is checked for unknown
+ * tools. Per-tool components in ``components/tool-ui/*`` perform their
+ * own internal discrimination over richer result shapes; the fallback
+ * only knows the two top-level branches.
+ */
+export const FallbackToolBody: TimelineToolComponent = (props) => {
+	if (isInterruptResult(props.result)) {
+		const approvalProps = {
+			toolCallId: props.toolCallId,
+			toolName: props.toolName,
+			args: props.args,
+			result: props.result as InterruptResult,
+		};
+		if (isDoomLoopInterrupt(props.result)) {
+			return <DoomLoopApproval {...approvalProps} />;
+		}
+		return <GenericHitlApproval {...approvalProps} />;
+	}
+	return <DefaultFallbackCard {...props} />;
+};
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/index.ts b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/index.ts
new file mode 100644
index 0000000000..0188b45c86
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/index.ts
@@ -0,0 +1 @@
+export { FallbackToolBody } from "./fallback-tool-body";
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/revert-button.tsx b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/revert-button.tsx
new file mode 100644
index 0000000000..6b050523fc
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/revert-button.tsx
@@ -0,0 +1,137 @@
+"use client";
+
+import { useQueryClient } from "@tanstack/react-query";
+import { RotateCcw } from "lucide-react";
+import { useState } from "react";
+import { toast } from "sonner";
+import {
+	AlertDialog,
+	AlertDialogAction,
+	AlertDialogCancel,
+	AlertDialogContent,
+	AlertDialogDescription,
+	AlertDialogFooter,
+	AlertDialogHeader,
+	AlertDialogTitle,
+	AlertDialogTrigger,
+} from "@/components/ui/alert-dialog";
+import { Button } from "@/components/ui/button";
+import { Spinner } from "@/components/ui/spinner";
+import { getToolDisplayName } from "@/contracts/enums/toolIcons";
+import { markActionRevertedInCache } from "@/hooks/use-agent-actions-query";
+import { agentActionsApiService } from "@/lib/apis/agent-actions-api.service";
+import { AppError } from "@/lib/error";
+import { useToolAction } from "./use-tool-action";
+
+/**
+ * Inline Revert button rendered on a default-fallback tool card when
+ * the matching ``AgentActionLog`` row is reversible and hasn't been
+ * reverted yet.
+ *
+ * Renders ``null`` (silent) in any of these cases:
+ *  - no matching action row (still streaming, or never logged)
+ *  - action not reversible
+ *  - already reverted (``reverted_by_action_id`` set)
+ *  - this card IS itself a revert action
+ *  - tool errored
+ *  - no thread context
+ *
+ * 503 from the revert API means the deployment has revert gated off;
+ * we hide the failure silently rather than nag the user. Other errors
+ * surface as toasts.
+ */
+export function ToolCardRevertButton({
+	toolCallId,
+	toolName,
+	langchainToolCallId,
+}: {
+	toolCallId: string;
+	toolName: string;
+	langchainToolCallId?: string;
+}) {
+	const queryClient = useQueryClient();
+	const { threadId, action } = useToolAction({
+		toolCallId,
+		toolName,
+		langchainToolCallId,
+	});
+
+	const [isReverting, setIsReverting] = useState(false);
+	const [confirmOpen, setConfirmOpen] = useState(false);
+
+	if (!action) return null;
+	if (!action.reversible) return null;
+	if (action.reverted_by_action_id !== null && action.reverted_by_action_id !== undefined)
+		return null;
+	if (action.is_revert_action) return null;
+	if (action.error !== null && action.error !== undefined) return null;
+	if (!threadId) return null;
+
+	const handleRevert = async () => {
+		setIsReverting(true);
+		try {
+			const response = await agentActionsApiService.revert(threadId, action.id);
+			markActionRevertedInCache(queryClient, threadId, action.id, response.new_action_id ?? null);
+			toast.success(response.message || "Action reverted.");
+		} catch (err) {
+			if (err instanceof AppError && err.status === 503) {
+				return;
+			}
+			const message =
+				err instanceof AppError
+					? err.message
+					: err instanceof Error
+						? err.message
+						: "Failed to revert action.";
+			toast.error(message);
+		} finally {
+			setIsReverting(false);
+			setConfirmOpen(false);
+		}
+	};
+
+	return (
+		<AlertDialog open={confirmOpen} onOpenChange={setConfirmOpen}>
+			<AlertDialogTrigger asChild>
+				<Button
+					size="sm"
+					variant="outline"
+					className="gap-1.5"
+					onClick={(e) => {
+						e.stopPropagation();
+						setConfirmOpen(true);
+					}}
+					disabled={isReverting}
+				>
+					{isReverting ? <Spinner size="xs" /> : <RotateCcw data-icon="inline-start" />}
+					Revert
+				</Button>
+			</AlertDialogTrigger>
+			<AlertDialogContent>
+				<AlertDialogHeader>
+					<AlertDialogTitle>Revert this action?</AlertDialogTitle>
+					<AlertDialogDescription>
+						This will undo{" "}
+						<span className="font-medium">{getToolDisplayName(action.tool_name)}</span> and add a
+						new entry to the history. Your chat is preserved — only the changes the agent made to
+						your knowledge base or connected apps will be rolled back where possible.
+					</AlertDialogDescription>
+				</AlertDialogHeader>
+				<AlertDialogFooter>
+					<AlertDialogCancel disabled={isReverting}>Cancel</AlertDialogCancel>
+					<AlertDialogAction
+						onClick={(e) => {
+							e.preventDefault();
+							handleRevert();
+						}}
+						disabled={isReverting}
+						className="gap-1.5"
+					>
+						{isReverting && <Spinner size="xs" />}
+						Revert
+					</AlertDialogAction>
+				</AlertDialogFooter>
+			</AlertDialogContent>
+		</AlertDialog>
+	);
+}
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/use-tool-action.ts b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/use-tool-action.ts
new file mode 100644
index 0000000000..cd5265a255
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/fallback/use-tool-action.ts
@@ -0,0 +1,90 @@
+"use client";
+
+import { useAuiState } from "@assistant-ui/react";
+import { useAtomValue } from "jotai";
+import { useMemo } from "react";
+import { chatSessionStateAtom } from "@/atoms/chat/chat-session-state.atom";
+import { useAgentActionsQuery } from "@/hooks/use-agent-actions-query";
+
+/**
+ * Resolve the ``AgentActionLog`` row for a given tool-call card. Tries
+ * three lookup strategies, in priority order, against the unified
+ * ``useAgentActionsQuery`` cache (the same react-query cache the
+ * agent-actions dialog consumes — keeps the card and the dialog in
+ * lockstep across reload, navigation, live stream, post-stream
+ * reversibility flips, and explicit revert clicks).
+ *
+ * **Tier 1+2 — direct id match (O(1) Map):**
+ *   - ``a.tool_call_id === toolCallId`` — hits when the model streamed
+ *     ``tool_call_chunks`` so the card id matches the LangChain id.
+ *   - ``a.tool_call_id === langchainToolCallId`` — synthetic card id
+ *     is ``call_<run_id>`` and the LangChain id was backfilled by
+ *     ``tool-output-available``.
+ *
+ * **Tier 3 — position-within-turn fallback:** only kicks in when the
+ *   card has a synthetic ``call_<run_id>`` id AND no
+ *   ``langchainToolCallId`` was ever backfilled (tool emitted as a
+ *   single non-chunked payload AND streaming pre-dated the
+ *   ``on_tool_end`` backfill, e.g. older threads).
+ *
+ * Returns ``null`` if no row matches OR if there's no thread context.
+ *
+ * Performance note: ``useAuiState`` returns a PRIMITIVE
+ * (``positionInTurn`` is a number; ``chatTurnId`` is a string) so the
+ * hook's ``Object.is`` short-circuit prevents re-renders on every
+ * text-delta of every other part in the same message during streaming.
+ * (See Vercel React rule ``rerender-defer-reads``.)
+ */
+export function useToolAction({
+	toolCallId,
+	toolName,
+	langchainToolCallId,
+}: {
+	toolCallId: string;
+	toolName: string;
+	langchainToolCallId?: string;
+}) {
+	const session = useAtomValue(chatSessionStateAtom);
+	const threadId = session?.threadId ?? null;
+	const { findByToolCallId, findByChatTurnAndTool } = useAgentActionsQuery(threadId);
+
+	const chatTurnId = useAuiState(({ message }) => {
+		const meta = message?.metadata as { custom?: { chatTurnId?: string } } | undefined;
+		return meta?.custom?.chatTurnId ?? null;
+	});
+	const positionInTurn = useAuiState(({ message }) => {
+		const content = message?.content;
+		if (!Array.isArray(content)) return -1;
+		let n = -1;
+		for (const part of content) {
+			if (
+				part &&
+				typeof part === "object" &&
+				(part as { type?: string }).type === "tool-call" &&
+				(part as { toolName?: string }).toolName === toolName
+			) {
+				n += 1;
+				if ((part as { toolCallId?: string }).toolCallId === toolCallId) return n;
+			}
+		}
+		return -1;
+	});
+
+	const action = useMemo(() => {
+		const direct = findByToolCallId(toolCallId) ?? findByToolCallId(langchainToolCallId);
+		if (direct) return direct;
+		if (!chatTurnId || positionInTurn < 0) return null;
+		const turnSameTool = findByChatTurnAndTool(chatTurnId, toolName);
+		return turnSameTool[positionInTurn] ?? null;
+	}, [
+		findByToolCallId,
+		findByChatTurnAndTool,
+		toolCallId,
+		langchainToolCallId,
+		chatTurnId,
+		toolName,
+		positionInTurn,
+	]);
+
+	return { threadId, action };
+}
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/index.ts b/surfsense_web/features/chat-messages/timeline/tool-registry/index.ts
new file mode 100644
index 0000000000..212d1a7ea9
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/index.ts
@@ -0,0 +1,4 @@
+export { adaptItemToProps } from "./adapt-props";
+export { FallbackToolBody } from "./fallback";
+export { getToolComponent, TIMELINE_TOOL_NAMES } from "./registry";
+export type { TimelineToolComponent, TimelineToolProps } from "./types";
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/registry.ts b/surfsense_web/features/chat-messages/timeline/tool-registry/registry.ts
new file mode 100644
index 0000000000..8acc6b4fa3
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/registry.ts
@@ -0,0 +1,229 @@
+"use client";
+
+import dynamic from "next/dynamic";
+import type { TimelineToolComponent } from "./types";
+
+// Dynamic imports keep the per-tool UI bundles out of the main chunk —
+// each component only loads when an assistant turn references it. Mirrors
+// the existing ``components/assistant-ui/assistant-message.tsx`` pattern.
+//
+// Phase A note: the imported components are still typed as
+// ``ToolCallMessagePartComponent`` from assistant-ui; the cast at the
+// bottom of this file bridges the contract until the cutover commit
+// retypes them to ``TimelineToolComponent``. The cast is a structural
+// no-op — every consumed prop overlaps.
+
+const UpdateMemoryToolUI = dynamic(
+	() => import("@/components/tool-ui/user-memory").then((m) => ({ default: m.UpdateMemoryToolUI })),
+	{ ssr: false }
+);
+const SandboxExecuteToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/sandbox-execute").then((m) => ({
+			default: m.SandboxExecuteToolUI,
+		})),
+	{ ssr: false }
+);
+const CreateNotionPageToolUI = dynamic(
+	() => import("@/components/tool-ui/notion").then((m) => ({ default: m.CreateNotionPageToolUI })),
+	{ ssr: false }
+);
+const UpdateNotionPageToolUI = dynamic(
+	() => import("@/components/tool-ui/notion").then((m) => ({ default: m.UpdateNotionPageToolUI })),
+	{ ssr: false }
+);
+const DeleteNotionPageToolUI = dynamic(
+	() => import("@/components/tool-ui/notion").then((m) => ({ default: m.DeleteNotionPageToolUI })),
+	{ ssr: false }
+);
+const CreateLinearIssueToolUI = dynamic(
+	() => import("@/components/tool-ui/linear").then((m) => ({ default: m.CreateLinearIssueToolUI })),
+	{ ssr: false }
+);
+const UpdateLinearIssueToolUI = dynamic(
+	() => import("@/components/tool-ui/linear").then((m) => ({ default: m.UpdateLinearIssueToolUI })),
+	{ ssr: false }
+);
+const DeleteLinearIssueToolUI = dynamic(
+	() => import("@/components/tool-ui/linear").then((m) => ({ default: m.DeleteLinearIssueToolUI })),
+	{ ssr: false }
+);
+const CreateGoogleDriveFileToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/google-drive").then((m) => ({
+			default: m.CreateGoogleDriveFileToolUI,
+		})),
+	{ ssr: false }
+);
+const DeleteGoogleDriveFileToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/google-drive").then((m) => ({
+			default: m.DeleteGoogleDriveFileToolUI,
+		})),
+	{ ssr: false }
+);
+const CreateOneDriveFileToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/onedrive").then((m) => ({ default: m.CreateOneDriveFileToolUI })),
+	{ ssr: false }
+);
+const DeleteOneDriveFileToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/onedrive").then((m) => ({ default: m.DeleteOneDriveFileToolUI })),
+	{ ssr: false }
+);
+const CreateDropboxFileToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/dropbox").then((m) => ({ default: m.CreateDropboxFileToolUI })),
+	{ ssr: false }
+);
+const DeleteDropboxFileToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/dropbox").then((m) => ({ default: m.DeleteDropboxFileToolUI })),
+	{ ssr: false }
+);
+const CreateCalendarEventToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/google-calendar").then((m) => ({
+			default: m.CreateCalendarEventToolUI,
+		})),
+	{ ssr: false }
+);
+const UpdateCalendarEventToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/google-calendar").then((m) => ({
+			default: m.UpdateCalendarEventToolUI,
+		})),
+	{ ssr: false }
+);
+const DeleteCalendarEventToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/google-calendar").then((m) => ({
+			default: m.DeleteCalendarEventToolUI,
+		})),
+	{ ssr: false }
+);
+const CreateGmailDraftToolUI = dynamic(
+	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.CreateGmailDraftToolUI })),
+	{ ssr: false }
+);
+const UpdateGmailDraftToolUI = dynamic(
+	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.UpdateGmailDraftToolUI })),
+	{ ssr: false }
+);
+const SendGmailEmailToolUI = dynamic(
+	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.SendGmailEmailToolUI })),
+	{ ssr: false }
+);
+const TrashGmailEmailToolUI = dynamic(
+	() => import("@/components/tool-ui/gmail").then((m) => ({ default: m.TrashGmailEmailToolUI })),
+	{ ssr: false }
+);
+const CreateJiraIssueToolUI = dynamic(
+	() => import("@/components/tool-ui/jira").then((m) => ({ default: m.CreateJiraIssueToolUI })),
+	{ ssr: false }
+);
+const UpdateJiraIssueToolUI = dynamic(
+	() => import("@/components/tool-ui/jira").then((m) => ({ default: m.UpdateJiraIssueToolUI })),
+	{ ssr: false }
+);
+const DeleteJiraIssueToolUI = dynamic(
+	() => import("@/components/tool-ui/jira").then((m) => ({ default: m.DeleteJiraIssueToolUI })),
+	{ ssr: false }
+);
+const CreateConfluencePageToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/confluence").then((m) => ({
+			default: m.CreateConfluencePageToolUI,
+		})),
+	{ ssr: false }
+);
+const UpdateConfluencePageToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/confluence").then((m) => ({
+			default: m.UpdateConfluencePageToolUI,
+		})),
+	{ ssr: false }
+);
+const DeleteConfluencePageToolUI = dynamic(
+	() =>
+		import("@/components/tool-ui/confluence").then((m) => ({
+			default: m.DeleteConfluencePageToolUI,
+		})),
+	{ ssr: false }
+);
+
+/**
+ * Headers-only tools — the timeline shows their ``ItemHeader`` (title +
+ * sub-bullets) but mounts no tool body beneath. Two reasons to use
+ * this:
+ *  - **Structural primitives** (``task``): the row IS the parent of a
+ *    delegation span; its job is to label the group. Children render
+ *    as their own indented entries.
+ *  - **Suppressed connectors** (``web_search``, ``link_preview``,
+ *    ``multi_link_preview``, ``scrape_webpage``): citations they
+ *    produce render inline in markdown; a separate card would be
+ *    redundant noise.
+ */
+const NullTimelineBody: TimelineToolComponent = () => null;
+
+/**
+ * The timeline's tool-name → component map. Mounted by
+ * ``timeline/items/tool-call-item.tsx`` via ``getToolComponent(name)``.
+ *
+ * Includes only "process" tools (connector CRUD, sandbox execute,
+ * memory updates) and the 4 invisible tools mapped to a null component.
+ * Deliverables (``generate_report``, ``generate_resume``,
+ * ``generate_podcast``, ``generate_video_presentation``,
+ * ``display_image``, ``generate_image``) live in ``BODY_TOOLS`` in
+ * ``assistant-message.tsx`` — they're product, not process.
+ *
+ * Tools NOT in this map fall through to ``FallbackToolBody`` (which
+ * itself dispatches between HITL approval cards and
+ * ``DefaultFallbackCard`` based on result discrimination).
+ */
+const TOOLS_BY_NAME = {
+	task: NullTimelineBody,
+	update_memory: UpdateMemoryToolUI,
+	execute: SandboxExecuteToolUI,
+	execute_code: SandboxExecuteToolUI,
+	create_notion_page: CreateNotionPageToolUI,
+	update_notion_page: UpdateNotionPageToolUI,
+	delete_notion_page: DeleteNotionPageToolUI,
+	create_linear_issue: CreateLinearIssueToolUI,
+	update_linear_issue: UpdateLinearIssueToolUI,
+	delete_linear_issue: DeleteLinearIssueToolUI,
+	create_google_drive_file: CreateGoogleDriveFileToolUI,
+	delete_google_drive_file: DeleteGoogleDriveFileToolUI,
+	create_onedrive_file: CreateOneDriveFileToolUI,
+	delete_onedrive_file: DeleteOneDriveFileToolUI,
+	create_dropbox_file: CreateDropboxFileToolUI,
+	delete_dropbox_file: DeleteDropboxFileToolUI,
+	create_calendar_event: CreateCalendarEventToolUI,
+	update_calendar_event: UpdateCalendarEventToolUI,
+	delete_calendar_event: DeleteCalendarEventToolUI,
+	create_gmail_draft: CreateGmailDraftToolUI,
+	update_gmail_draft: UpdateGmailDraftToolUI,
+	send_gmail_email: SendGmailEmailToolUI,
+	trash_gmail_email: TrashGmailEmailToolUI,
+	create_jira_issue: CreateJiraIssueToolUI,
+	update_jira_issue: UpdateJiraIssueToolUI,
+	delete_jira_issue: DeleteJiraIssueToolUI,
+	create_confluence_page: CreateConfluencePageToolUI,
+	update_confluence_page: UpdateConfluencePageToolUI,
+	delete_confluence_page: DeleteConfluencePageToolUI,
+	web_search: NullTimelineBody,
+	link_preview: NullTimelineBody,
+	multi_link_preview: NullTimelineBody,
+	scrape_webpage: NullTimelineBody,
+} as unknown as Record<string, TimelineToolComponent>;
+
+/**
+ * Lookup a tool component by name. Returns ``undefined`` for unknown
+ * tools so the caller can mount ``FallbackToolBody`` instead.
+ */
+export function getToolComponent(toolName: string): TimelineToolComponent | undefined {
+	return TOOLS_BY_NAME[toolName];
+}
+
+export const TIMELINE_TOOL_NAMES = Object.keys(TOOLS_BY_NAME) as readonly string[];
diff --git a/surfsense_web/features/chat-messages/timeline/tool-registry/types.ts b/surfsense_web/features/chat-messages/timeline/tool-registry/types.ts
new file mode 100644
index 0000000000..8483d67c34
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/tool-registry/types.ts
@@ -0,0 +1,37 @@
+import type { ReactNode } from "react";
+import type { ItemStatus } from "../types";
+
+/**
+ * The exact prop subset the timeline supplies when mounting a tool
+ * component. A strict subset of assistant-ui's
+ * ``ToolCallMessagePartProps`` — only the fields we actually have when
+ * rendering manually from a ``ToolCallItem``.
+ *
+ * Notably absent vs. assistant-ui:
+ * - ``addResult`` / ``resume`` (runtime-only, not available to us)
+ * - The complex ``status: ToolCallMessagePartState["status"]`` object
+ *   (replaced by our simple ``ItemStatus`` enum)
+ * - ``messageId`` and other parent-message context (not needed by any
+ *   of the 15 HITL-aware tool-ui components today)
+ */
+export interface TimelineToolProps {
+	toolCallId: string;
+	toolName: string;
+	args: Record<string, unknown>;
+	argsText?: string;
+	result?: unknown;
+	langchainToolCallId?: string;
+	status: ItemStatus;
+}
+
+/**
+ * Contract for every tool component mounted by the timeline. The 15
+ * existing HITL-aware ``components/tool-ui/*`` files retype to this
+ * during the cutover commit (a mechanical rename from
+ * ``ToolCallMessagePartComponent`` → ``TimelineToolComponent``).
+ *
+ * Components are expected to perform internal discrimination on
+ * ``result`` to pick a view (interrupt → approval card; success →
+ * result card; etc.) — see §2.2 of the architecture doc.
+ */
+export type TimelineToolComponent = (props: TimelineToolProps) => ReactNode;
diff --git a/surfsense_web/features/chat-messages/timeline/types.ts b/surfsense_web/features/chat-messages/timeline/types.ts
new file mode 100644
index 0000000000..d32496b5e8
--- /dev/null
+++ b/surfsense_web/features/chat-messages/timeline/types.ts
@@ -0,0 +1,84 @@
+/**
+ * Coarse status used by the timeline's chrome (the colored dot, the
+ * "in progress" pulse). NOT consulted by per-tool components — those
+ * own their own visual state machines (e.g. ``useHitlPhase`` for HITL
+ * approval cards).
+ *
+ * - ``pending``    — known but not started yet (rare; usually only seen briefly during streaming)
+ * - ``running``    — currently executing (assistant-ui ``in_progress``)
+ * - ``completed``  — finished without error
+ * - ``cancelled``  — user rejected (HITL ``__decided__: "reject"``)
+ * - ``error``      — threw or returned an error result
+ */
+export type ItemStatus = "pending" | "running" | "completed" | "cancelled" | "error";
+
+interface BaseItem {
+	/**
+	 * Stable React key for the timeline. When a thinking-step row is joined
+	 * with a tool-call part (via ``metadata.thinkingStepId``), this is the
+	 * thinking-step ID — preserves identity across rehydration. For
+	 * tool-calls with no joined step, this is the ``toolCallId``.
+	 */
+	id: string;
+	status: ItemStatus;
+	/**
+	 * Optional sub-bullets shown beneath the row's title. Forwarded
+	 * verbatim from ``ThinkingStep.items`` when the timeline item was
+	 * built from a thinking-step row.
+	 */
+	items?: string[];
+	/**
+	 * Groups items into the delegation tree. All items emitted while a
+	 * delegating ``task`` is open carry the same ``spanId``; the ``task``
+	 * step itself owns the span (see ``grouping.ts``).
+	 */
+	spanId?: string;
+}
+
+/**
+ * Pure agent narration (e.g. "Reviewing the request", "Planning"). NOT
+ * a model-level ``<think>`` block — those are rendered in the BODY by
+ * the assistant-ui ``Reasoning`` component. This kind covers thinking-
+ * step rows that are NOT linked to a tool call.
+ */
+export interface ReasoningItem extends BaseItem {
+	kind: "reasoning";
+	title: string;
+}
+
+/**
+ * A tool invocation. Per-tool components (mounted by the timeline's
+ * tool-registry) discriminate the ``result`` shape internally to pick
+ * a view (interrupt → approval card; success → result card; etc.).
+ *
+ * The timeline does NOT inspect ``result`` beyond deriving ``status``.
+ */
+export interface ToolCallItem extends BaseItem {
+	kind: "tool-call";
+	toolName: string;
+	/** The actual tool-call ID — passed to per-tool components (e.g. for the Revert button). */
+	toolCallId: string;
+	args: Record<string, unknown>;
+	argsText?: string;
+	result?: unknown;
+	langchainToolCallId?: string;
+	/**
+	 * Set when the tool-call was joined with a thinking-step row via
+	 * ``metadata.thinkingStepId``. In that case ``id`` is the
+	 * thinking-step ID, not the ``toolCallId``.
+	 */
+	thinkingStepId?: string;
+}
+
+export type TimelineItem = ReasoningItem | ToolCallItem;
+
+/**
+ * The output shape of the grouping pass. Each group is a parent item
+ * (typically a delegating ``task`` tool-call) plus the items emitted
+ * inside its span. Items with no ``spanId`` become parents with no
+ * children.
+ */
+export interface TimelineGroup {
+	parent: TimelineItem;
+	children: TimelineItem[];
+}
diff --git a/surfsense_web/hooks/use-agent-actions-query.ts b/surfsense_web/hooks/use-agent-actions-query.ts
new file mode 100644
index 0000000000..1e2f45a8a8
--- /dev/null
+++ b/surfsense_web/hooks/use-agent-actions-query.ts
@@ -0,0 +1,395 @@
+"use client";
+
+import { type QueryClient, useQuery } from "@tanstack/react-query";
+import { useCallback, useEffect, useMemo, useRef } from "react";
+import {
+	type AgentAction,
+	type AgentActionListResponse,
+	agentActionsApiService,
+} from "@/lib/apis/agent-actions-api.service";
+
+// =============================================================================
+// DIAGNOSTIC LOGGING — gated behind a single switch. Flip ``RevertDebug``
+// to ``true`` to trace the full SSE → cache → card → button pipeline in
+// the browser console. Off by default so we don't spam production. The
+// infrastructure stays in place because the underlying id-mismatch
+// failure mode is rare-but-real and surfaces only at runtime.
+// =============================================================================
+const RevertDebug = false;
+const dbg = (...args: unknown[]) => {
+	if (RevertDebug && typeof window !== "undefined") {
+		// eslint-disable-next-line no-console
+		console.log("[RevertDebug]", ...args);
+	}
+};
+
+/**
+ * Unified store for ``AgentActionLog`` rows scoped to one thread.
+ *
+ * Replaces the previous SSE side-channel atom mess
+ * (``agentActionByLcIdAtom`` / ``agentActionByToolCallIdAtom`` /
+ * ``agentActionsByChatTurnIdAtom``) and the standalone hydration hook.
+ * One react-query cache entry is now the single source of truth for:
+ *
+ * * the inline Revert button on every tool-call card
+ * * the per-turn "Revert turn" button under each assistant message
+ * * the edit-from-position pre-flight that decides whether to show
+ *   the confirmation dialog
+ * * the agent-actions dialog
+ *
+ * The cache is hydrated by ``GET /threads/{id}/actions`` (sized to
+ * 200, the server max) and updated incrementally by helpers that turn
+ * SSE events / revert RPC responses into ``setQueryData`` mutations.
+ * That keeps the card and the dialog in lockstep on every code path —
+ * page reload, navigation, live stream, post-stream reversibility flip,
+ * and explicit revert clicks.
+ */
+
+export const ACTION_LOG_PAGE_SIZE = 200;
+
+/** Stable react-query key for the per-thread action list. */
+export function agentActionsQueryKey(threadId: number | null) {
+	return threadId !== null
+		? (["agent-actions", threadId] as const)
+		: (["agent-actions", "none"] as const);
+}
+
+/** Subset of the SSE ``data-action-log`` payload we care about. */
+export interface ActionLogSseEvent {
+	id: number;
+	lc_tool_call_id: string | null;
+	chat_turn_id: string | null;
+	tool_name: string;
+	reversible: boolean;
+	reverse_descriptor_present: boolean;
+	error: boolean;
+	created_at: string | null;
+}
+
+/**
+ * Append or upsert a freshly-emitted ``AgentActionLog`` row into the
+ * thread-scoped query cache.
+ *
+ * The SSE payload is a strict subset of ``AgentAction``; missing
+ * fields (``args``, ``reverse_descriptor``, ``user_id``) are filled
+ * with ``null`` placeholders. The next refetch (dialog open, user
+ * focus, route stale) backfills them — but the inline Revert button
+ * only reads the fields the SSE payload carries, so it lights up
+ * immediately.
+ */
+export function applyActionLogSse(
+	queryClient: QueryClient,
+	threadId: number,
+	searchSpaceId: number,
+	event: ActionLogSseEvent
+): void {
+	dbg("applyActionLogSse: incoming SSE event", {
+		threadId,
+		searchSpaceId,
+		event,
+	});
+	queryClient.setQueryData<AgentActionListResponse>(agentActionsQueryKey(threadId), (prev) => {
+		const placeholder: AgentAction = {
+			id: event.id,
+			thread_id: threadId,
+			user_id: null,
+			search_space_id: searchSpaceId,
+			tool_name: event.tool_name,
+			args: null,
+			result_id: null,
+			reversible: event.reversible,
+			reverse_descriptor: event.reverse_descriptor_present ? {} : null,
+			error: event.error ? {} : null,
+			reverse_of: null,
+			reverted_by_action_id: null,
+			is_revert_action: false,
+			tool_call_id: event.lc_tool_call_id,
+			chat_turn_id: event.chat_turn_id,
+			created_at: event.created_at ?? new Date().toISOString(),
+		};
+		if (!prev) {
+			return {
+				items: [placeholder],
+				total: 1,
+				page: 0,
+				page_size: ACTION_LOG_PAGE_SIZE,
+				has_more: false,
+			};
+		}
+		const existingIdx = prev.items.findIndex((a) => a.id === event.id);
+		if (existingIdx >= 0) {
+			const merged = [...prev.items];
+			const existing = merged[existingIdx];
+			if (existing) {
+				merged[existingIdx] = {
+					...existing,
+					reversible: event.reversible,
+					tool_call_id: event.lc_tool_call_id ?? existing.tool_call_id,
+					chat_turn_id: event.chat_turn_id ?? existing.chat_turn_id,
+				};
+			}
+			dbg("applyActionLogSse: merged into existing entry", {
+				id: event.id,
+				tool_call_id: merged[existingIdx]?.tool_call_id,
+				reversible: merged[existingIdx]?.reversible,
+			});
+			return { ...prev, items: merged };
+		}
+		dbg("applyActionLogSse: appended new placeholder", {
+			id: event.id,
+			tool_call_id: placeholder.tool_call_id,
+			tool_name: placeholder.tool_name,
+			reversible: placeholder.reversible,
+			cacheSizeAfter: prev.items.length + 1,
+		});
+		// REST returns newest-first — keep that ordering when
+		// the server eventually refetches by prepending.
+		return {
+			...prev,
+			items: [placeholder, ...prev.items],
+			total: prev.total + 1,
+		};
+	});
+}
+
+/**
+ * Apply a post-SAVEPOINT reversibility flip
+ * (``data-action-log-updated`` SSE event) to the cache.
+ */
+export function applyActionLogUpdatedSse(
+	queryClient: QueryClient,
+	threadId: number,
+	id: number,
+	reversible: boolean
+): void {
+	dbg("applyActionLogUpdatedSse: reversibility flip", {
+		threadId,
+		id,
+		reversible,
+	});
+	queryClient.setQueryData<AgentActionListResponse>(agentActionsQueryKey(threadId), (prev) => {
+		if (!prev) {
+			dbg("applyActionLogUpdatedSse: NO prev cache for thread; flip dropped", {
+				threadId,
+				id,
+			});
+			return prev;
+		}
+		let mutated = false;
+		const items = prev.items.map((a) => {
+			if (a.id !== id) return a;
+			mutated = true;
+			return { ...a, reversible };
+		});
+		if (!mutated) {
+			dbg("applyActionLogUpdatedSse: id not in cache; flip dropped", {
+				threadId,
+				id,
+				cacheSize: prev.items.length,
+				cacheIds: prev.items.map((a) => a.id),
+			});
+		}
+		return mutated ? { ...prev, items } : prev;
+	});
+}
+
+/**
+ * Optimistically mark ``id`` as reverted.
+ *
+ * Used by the inline / per-turn Revert button immediately after the
+ * server returns success so the UI flips to "Reverted" without
+ * waiting for a refetch. ``newActionId`` is the id of the new
+ * ``is_revert_action`` row the server inserted; pass ``null`` if the
+ * server didn't return it.
+ */
+export function markActionRevertedInCache(
+	queryClient: QueryClient,
+	threadId: number,
+	id: number,
+	newActionId: number | null
+): void {
+	queryClient.setQueryData<AgentActionListResponse>(agentActionsQueryKey(threadId), (prev) => {
+		if (!prev) return prev;
+		let mutated = false;
+		const items = prev.items.map((a) => {
+			if (a.id !== id) return a;
+			mutated = true;
+			// ``-1`` is a sentinel meaning "we know it was reverted
+			// but the server didn't tell us the new row's id".
+			return {
+				...a,
+				reverted_by_action_id: newActionId ?? -1,
+			};
+		});
+		return mutated ? { ...prev, items } : prev;
+	});
+}
+
+/**
+ * Apply a batch of revert results (per-turn revert response) to the
+ * cache. Anything in the ``reverted`` / ``already_reverted`` buckets
+ * gets its ``reverted_by_action_id`` set; other rows are left alone.
+ */
+export function applyRevertTurnResultsToCache(
+	queryClient: QueryClient,
+	threadId: number,
+	entries: Array<{ id: number; newActionId: number | null }>
+): void {
+	if (entries.length === 0) return;
+	queryClient.setQueryData<AgentActionListResponse>(agentActionsQueryKey(threadId), (prev) => {
+		if (!prev) return prev;
+		const lookup = new Map(entries.map((e) => [e.id, e.newActionId]));
+		let mutated = false;
+		const items = prev.items.map((a) => {
+			if (!lookup.has(a.id)) return a;
+			mutated = true;
+			const newActionId = lookup.get(a.id) ?? null;
+			return { ...a, reverted_by_action_id: newActionId ?? -1 };
+		});
+		return mutated ? { ...prev, items } : prev;
+	});
+}
+
+/**
+ * Read-side hook used by the card, the turn button, the dialog, and
+ * the edit-from-position pre-flight.
+ *
+ * Returns the raw query state plus convenience selectors so consumers
+ * don't reach into ``data.items`` directly. ``enabled`` is the only
+ * knob — pass ``false`` to keep the query dormant when the consumer
+ * doesn't yet have a thread id.
+ */
+export function useAgentActionsQuery(threadId: number | null, options: { enabled?: boolean } = {}) {
+	const enabled = (options.enabled ?? true) && threadId !== null;
+	const query = useQuery({
+		queryKey: agentActionsQueryKey(threadId),
+		queryFn: async () => {
+			dbg("useAgentActionsQuery: REST fetch START", {
+				threadId,
+				pageSize: ACTION_LOG_PAGE_SIZE,
+			});
+			const res = await agentActionsApiService.listForThread(threadId as number, {
+				page: 0,
+				pageSize: ACTION_LOG_PAGE_SIZE,
+			});
+			dbg("useAgentActionsQuery: REST fetch DONE", {
+				threadId,
+				total: res.total,
+				returned: res.items.length,
+				items: res.items.map((a) => ({
+					id: a.id,
+					tool_name: a.tool_name,
+					tool_call_id: a.tool_call_id,
+					reversible: a.reversible,
+					reverted_by_action_id: a.reverted_by_action_id,
+					is_revert_action: a.is_revert_action,
+				})),
+			});
+			return res;
+		},
+		enabled,
+		staleTime: 15 * 1000,
+	});
+
+	const items = useMemo(() => query.data?.items ?? [], [query.data]);
+
+	// Index ``items`` once per change so the lookups below are O(1)
+	// instead of O(N) per card per render. With the cache sized to 200
+	// rows and many tool cards visible at once, the unindexed scan was
+	// the hottest path on every assistant text-delta. (Vercel React
+	// rule ``js-index-maps`` / ``js-set-map-lookups``.)
+	const byToolCallId = useMemo(() => {
+		const m = new Map<string, AgentAction>();
+		for (const a of items) {
+			if (a.tool_call_id) m.set(a.tool_call_id, a);
+		}
+		return m;
+	}, [items]);
+
+	// Pre-grouped + pre-sorted (oldest-first, the order the agent
+	// actually executed them in) so the (chat_turn_id, tool_name,
+	// position) fallback in ``tool-fallback.tsx`` is also O(1) per
+	// card. Excludes ``is_revert_action`` rows so the position index
+	// matches the agent's original execution order.
+	const byTurnAndTool = useMemo(() => {
+		const m = new Map<string, AgentAction[]>();
+		for (const a of items) {
+			if (!a.chat_turn_id || a.is_revert_action) continue;
+			const key = `${a.chat_turn_id}::${a.tool_name}`;
+			const bucket = m.get(key);
+			if (bucket) bucket.push(a);
+			else m.set(key, [a]);
+		}
+		for (const bucket of m.values()) {
+			bucket.sort((a, b) => new Date(a.created_at).getTime() - new Date(b.created_at).getTime());
+		}
+		return m;
+	}, [items]);
+
+	// Snapshot the cache shape when its size changes — easiest way to
+	// spot when the cache is empty or stale at the moment a card
+	// mounts. Tracked on a ref so we don't re-run the diff on
+	// reference-equal cache reads.
+	const lastSnapshotRef = useRef<{ threadId: number | null; size: number } | null>(null);
+	useEffect(() => {
+		const last = lastSnapshotRef.current;
+		if (!last || last.threadId !== threadId || last.size !== items.length) {
+			dbg("useAgentActionsQuery: cache snapshot", {
+				threadId,
+				enabled,
+				itemCount: items.length,
+				itemKeys: items.slice(0, 8).map((a) => ({
+					id: a.id,
+					tool_name: a.tool_name,
+					tool_call_id: a.tool_call_id,
+					chat_turn_id: a.chat_turn_id,
+					reversible: a.reversible,
+				})),
+			});
+			lastSnapshotRef.current = { threadId, size: items.length };
+		}
+	}, [threadId, enabled, items]);
+
+	const findByToolCallId = useCallback(
+		(toolCallId: string | null | undefined): AgentAction | null => {
+			if (!toolCallId) return null;
+			const found = byToolCallId.get(toolCallId) ?? null;
+			if (!found && items.length > 0) {
+				dbg("findByToolCallId: MISS", {
+					queriedToolCallId: toolCallId,
+					itemCount: items.length,
+					availableToolCallIds: Array.from(byToolCallId.keys()),
+				});
+			}
+			return found;
+		},
+		[byToolCallId, items.length]
+	);
+
+	const findByChatTurnId = useCallback(
+		(chatTurnId: string | null | undefined): AgentAction[] => {
+			if (!chatTurnId) return [];
+			// Per-turn aggregation is uncommon enough (only the
+			// "Revert turn" button uses it) that re-scanning is fine;
+			// indexing it would just bloat memory.
+			return items.filter((a) => a.chat_turn_id === chatTurnId);
+		},
+		[items]
+	);
+
+	const findByChatTurnAndTool = useCallback(
+		(chatTurnId: string | null | undefined, toolName: string | null | undefined): AgentAction[] => {
+			if (!chatTurnId || !toolName) return [];
+			return byTurnAndTool.get(`${chatTurnId}::${toolName}`) ?? [];
+		},
+		[byTurnAndTool]
+	);
+
+	return {
+		...query,
+		items,
+		findByToolCallId,
+		findByChatTurnId,
+		findByChatTurnAndTool,
+	};
+}
diff --git a/surfsense_web/hooks/use-logs.ts b/surfsense_web/hooks/use-logs.ts
index 0cf5975b65..b646b078df 100644
--- a/surfsense_web/hooks/use-logs.ts
+++ b/surfsense_web/hooks/use-logs.ts
@@ -1,55 +1,17 @@
 "use client";
 import { useQuery } from "@tanstack/react-query";
 import { useCallback, useMemo } from "react";
+import type { LogFilters } from "@/contracts/types/log.types";
 import { logsApiService } from "@/lib/apis/logs-api.service";
 import { cacheKeys } from "@/lib/query-client/cache-keys";
 
-export type LogLevel = "DEBUG" | "INFO" | "WARNING" | "ERROR" | "CRITICAL";
-export type LogStatus = "IN_PROGRESS" | "SUCCESS" | "FAILED";
-
-export interface Log {
-	id: number;
-	level: LogLevel;
-	status: LogStatus;
-	message: string;
-	source?: string;
-	log_metadata?: Record<string, unknown>;
-	created_at: string;
-	search_space_id: number;
-}
-
-export interface LogFilters {
-	search_space_id?: number;
-	level?: LogLevel;
-	status?: LogStatus;
-	source?: string;
-	start_date?: string;
-	end_date?: string;
-}
-
-export interface LogSummary {
-	total_logs: number;
-	time_window_hours: number;
-	by_status: Record<string, number>;
-	by_level: Record<string, number>;
-	by_source: Record<string, number>;
-	active_tasks: Array<{
-		id: number;
-		task_name: string;
-		message: string;
-		started_at: string;
-		source?: string;
-		document_id?: number;
-	}>;
-	recent_failures: Array<{
-		id: number;
-		task_name: string;
-		message: string;
-		failed_at: string;
-		source?: string;
-		error_details?: string;
-	}>;
-}
+export type {
+	Log,
+	LogFilters,
+	LogLevel,
+	LogStatus,
+	LogSummary,
+} from "@/contracts/types/log.types";
 
 export function useLogs(searchSpaceId?: number, filters: LogFilters = {}) {
 	const filtersKey = JSON.stringify(filters);
diff --git a/surfsense_web/hooks/use-messages-sync.ts b/surfsense_web/hooks/use-messages-sync.ts
index ddbe8a7575..5ccda23a5d 100644
--- a/surfsense_web/hooks/use-messages-sync.ts
+++ b/surfsense_web/hooks/use-messages-sync.ts
@@ -31,6 +31,14 @@ export function useMessagesSync(
 			content: msg.content,
 			author_id: msg.authorId ?? null,
 			created_at: new Date(msg.createdAt).toISOString(),
+			// Forward the per-turn correlation id so post-stream Zero
+			// re-syncs preserve ``metadata.custom.chatTurnId`` on the
+			// converted ``ThreadMessageLike``. Without this the inline
+			// Revert button's ``(chat_turn_id, tool_name, position)``
+			// fallback breaks the moment Zero overwrites the messages
+			// state after a live stream completes (see
+			// ``handleSyncedMessagesUpdate`` in the chat page).
+			turn_id: msg.turnId ?? null,
 		}));
 
 		onMessagesUpdateRef.current(mapped);
diff --git a/surfsense_web/hooks/use-typewriter.ts b/surfsense_web/hooks/use-typewriter.ts
index 1e1ce8b83f..54d33e08da 100644
--- a/surfsense_web/hooks/use-typewriter.ts
+++ b/surfsense_web/hooks/use-typewriter.ts
@@ -27,7 +27,6 @@ export function useTypewriter(text: string, speed = 35, skipFor = "New Chat"): s
 		}
 
 		let i = 0;
-		setDisplayed("");
 		intervalRef.current = setInterval(() => {
 			i++;
 			setDisplayed(text.slice(0, i));
diff --git a/surfsense_web/instrumentation-client.ts b/surfsense_web/instrumentation-client.ts
index dff2e9bfe5..3ae97fc0b5 100644
--- a/surfsense_web/instrumentation-client.ts
+++ b/surfsense_web/instrumentation-client.ts
@@ -1,18 +1,65 @@
 import posthog from "posthog-js";
 
-function initPostHog() {
+/**
+ * PostHog initialisation for the Next.js renderer.
+ *
+ * The same bundle ships in two contexts:
+ *   1. A normal browser session on surfsense.com  -> platform = "web"
+ *   2. The Electron desktop app (renders the Next app from localhost)
+ *      -> platform = "desktop"
+ *
+ * When running inside Electron we also seed `posthog-js` with the main
+ * process's machine distinctId so that events fired from both the renderer
+ * (e.g. `chat_message_sent`, page views) and the Electron main process
+ * (e.g. `desktop_quick_ask_opened`) share a single PostHog person before
+ * login, and can be merged into the authenticated user afterwards.
+ */
+
+function isElectron(): boolean {
+	return typeof window !== "undefined" && !!window.electronAPI;
+}
+
+function currentPlatform(): "desktop" | "web" {
+	return isElectron() ? "desktop" : "web";
+}
+
+async function resolveBootstrapDistinctId(): Promise<string | undefined> {
+	if (!isElectron() || !window.electronAPI?.getAnalyticsContext) return undefined;
+	try {
+		const ctx = await window.electronAPI.getAnalyticsContext();
+		return ctx?.machineId || ctx?.distinctId || undefined;
+	} catch {
+		return undefined;
+	}
+}
+
+async function initPostHog() {
 	try {
 		if (!process.env.NEXT_PUBLIC_POSTHOG_KEY) return;
 
+		const platform = currentPlatform();
+		const bootstrapDistinctId = await resolveBootstrapDistinctId();
+
 		posthog.init(process.env.NEXT_PUBLIC_POSTHOG_KEY, {
 			api_host: "https://assets.surfsense.com",
 			ui_host: "https://us.posthog.com",
 			defaults: "2026-01-30",
 			capture_pageview: "history_change",
 			capture_pageleave: true,
+			...(bootstrapDistinctId
+				? {
+						bootstrap: {
+							distinctID: bootstrapDistinctId,
+							isIdentifiedID: false,
+						},
+					}
+				: {}),
 			before_send: (event) => {
 				if (event?.properties) {
-					event.properties.platform = "web";
+					event.properties.platform = platform;
+					if (platform === "desktop") {
+						event.properties.is_desktop = true;
+					}
 
 					const params = new URLSearchParams(window.location.search);
 					const ref = params.get("ref");
@@ -30,9 +77,14 @@ function initPostHog() {
 
 					event.properties.$set = {
 						...event.properties.$set,
-						platform: "web",
+						platform,
 						last_seen_at: new Date().toISOString(),
 					};
+
+					event.properties.$set_once = {
+						...event.properties.$set_once,
+						first_seen_platform: platform,
+					};
 				}
 				return event;
 			},
@@ -51,8 +103,12 @@ if (typeof window !== "undefined") {
 	window.posthog = posthog;
 
 	if ("requestIdleCallback" in window) {
-		requestIdleCallback(initPostHog);
+		requestIdleCallback(() => {
+			void initPostHog();
+		});
 	} else {
-		setTimeout(initPostHog, 3500);
+		setTimeout(() => {
+			void initPostHog();
+		}, 3500);
 	}
 }
diff --git a/surfsense_web/lib/agent-filesystem.ts b/surfsense_web/lib/agent-filesystem.ts
new file mode 100644
index 0000000000..5f8066d27a
--- /dev/null
+++ b/surfsense_web/lib/agent-filesystem.ts
@@ -0,0 +1,72 @@
+export type AgentFilesystemMode = "cloud" | "desktop_local_folder";
+export type ClientPlatform = "web" | "desktop";
+
+export interface AgentFilesystemMountSelection {
+	mount_id: string;
+	root_path: string;
+}
+
+export interface AgentFilesystemSelection {
+	filesystem_mode: AgentFilesystemMode;
+	client_platform: ClientPlatform;
+	local_filesystem_mounts?: AgentFilesystemMountSelection[];
+}
+
+export interface AgentFilesystemSelectionOptions {
+	localFilesystemEnabled: boolean;
+}
+
+const DEFAULT_SELECTION: AgentFilesystemSelection = {
+	filesystem_mode: "cloud",
+	client_platform: "web",
+};
+
+export function getClientPlatform(): ClientPlatform {
+	if (typeof window === "undefined") return "web";
+	return window.electronAPI ? "desktop" : "web";
+}
+
+export async function getAgentFilesystemSelection(
+	searchSpaceId?: number | null,
+	options?: AgentFilesystemSelectionOptions
+): Promise<AgentFilesystemSelection> {
+	const platform = getClientPlatform();
+	if (
+		platform !== "desktop" ||
+		!options?.localFilesystemEnabled ||
+		!window.electronAPI?.getAgentFilesystemSettings
+	) {
+		return { ...DEFAULT_SELECTION, client_platform: platform };
+	}
+	try {
+		const settings = await window.electronAPI.getAgentFilesystemSettings(searchSpaceId);
+		if (settings.mode === "desktop_local_folder") {
+			const mounts = await window.electronAPI.getAgentFilesystemMounts?.(searchSpaceId);
+			const localFilesystemMounts =
+				mounts?.map((entry) => ({
+					mount_id: entry.mount,
+					root_path: entry.rootPath,
+				})) ?? [];
+			if (localFilesystemMounts.length === 0) {
+				return {
+					filesystem_mode: "cloud",
+					client_platform: "desktop",
+				};
+			}
+			return {
+				filesystem_mode: "desktop_local_folder",
+				client_platform: "desktop",
+				local_filesystem_mounts: localFilesystemMounts,
+			};
+		}
+		return {
+			filesystem_mode: "cloud",
+			client_platform: "desktop",
+		};
+	} catch {
+		return {
+			filesystem_mode: "cloud",
+			client_platform: "desktop",
+		};
+	}
+}
diff --git a/surfsense_web/lib/announcements/announcements-data.ts b/surfsense_web/lib/announcements/announcements-data.ts
index bde8b3f43c..ce44ec5395 100644
--- a/surfsense_web/lib/announcements/announcements-data.ts
+++ b/surfsense_web/lib/announcements/announcements-data.ts
@@ -15,8 +15,8 @@ import type { Announcement } from "@/contracts/types/announcement.types";
 export const announcements: Announcement[] = [
 	{
 		id: "announcement-1",
-		title: "Introducing Announcements",
-		description: "All major announcements will be posted here.",
+		title: "Introducing What's New",
+		description: "All major product updates will be posted here.",
 		category: "feature",
 		date: "2026-02-17T00:00:00Z",
 		startTime: "2026-02-17T00:00:00Z",
diff --git a/surfsense_web/lib/apis/agent-actions-api.service.ts b/surfsense_web/lib/apis/agent-actions-api.service.ts
new file mode 100644
index 0000000000..6634a11f71
--- /dev/null
+++ b/surfsense_web/lib/apis/agent-actions-api.service.ts
@@ -0,0 +1,120 @@
+import { z } from "zod";
+import { baseApiService } from "./base-api.service";
+
+const AgentActionReadSchema = z.object({
+	id: z.number(),
+	thread_id: z.number(),
+	user_id: z.string().nullable(),
+	search_space_id: z.number(),
+	tool_name: z.string(),
+	args: z.record(z.string(), z.unknown()).nullable(),
+	result_id: z.string().nullable(),
+	reversible: z.boolean(),
+	reverse_descriptor: z.record(z.string(), z.unknown()).nullable(),
+	error: z.record(z.string(), z.unknown()).nullable(),
+	reverse_of: z.number().nullable(),
+	reverted_by_action_id: z.number().nullable(),
+	is_revert_action: z.boolean(),
+	// Correlation ids added in migration 135. The LangChain
+	// ``tool_call_id`` joins this row to the chat tool card via the
+	// ``data-action-log.lc_tool_call_id`` SSE event, and
+	// ``chat_turn_id`` keys the per-turn revert endpoint.
+	tool_call_id: z.string().nullable().optional(),
+	chat_turn_id: z.string().nullable().optional(),
+	created_at: z.string(),
+});
+
+export type AgentAction = z.infer<typeof AgentActionReadSchema>;
+
+const AgentActionListResponseSchema = z.object({
+	items: z.array(AgentActionReadSchema),
+	total: z.number(),
+	page: z.number(),
+	page_size: z.number(),
+	has_more: z.boolean(),
+});
+
+export type AgentActionListResponse = z.infer<typeof AgentActionListResponseSchema>;
+
+const RevertResponseSchema = z.object({
+	status: z.literal("ok"),
+	message: z.string(),
+	new_action_id: z.number().nullable().optional(),
+});
+
+export type RevertResponse = z.infer<typeof RevertResponseSchema>;
+
+// Per-turn batch revert. The route never returns whole-batch 4xx;
+// partial success is the common case and surfaced as
+// ``status === "partial"`` with a per-action result list.
+const RevertTurnActionResultSchema = z.object({
+	action_id: z.number(),
+	tool_name: z.string(),
+	status: z.enum([
+		"reverted",
+		"already_reverted",
+		"not_reversible",
+		"permission_denied",
+		"failed",
+		"skipped",
+	]),
+	message: z.string().nullable().optional(),
+	new_action_id: z.number().nullable().optional(),
+	error: z.string().nullable().optional(),
+});
+
+export type RevertTurnActionResult = z.infer<typeof RevertTurnActionResultSchema>;
+
+const RevertTurnResponseSchema = z.object({
+	status: z.enum(["ok", "partial"]),
+	chat_turn_id: z.string(),
+	total: z.number(),
+	reverted: z.number(),
+	already_reverted: z.number(),
+	not_reversible: z.number(),
+	// ``permission_denied`` and ``skipped`` are first-class counters so
+	// ``total === reverted + already_reverted +
+	// not_reversible + permission_denied + failed + skipped`` always
+	// holds. ``.default(0)`` keeps the schema backwards-compatible
+	// with older deployments that haven't shipped the response model
+	// update yet.
+	permission_denied: z.number().default(0),
+	failed: z.number(),
+	skipped: z.number().default(0),
+	results: z.array(RevertTurnActionResultSchema),
+});
+
+export type RevertTurnResponse = z.infer<typeof RevertTurnResponseSchema>;
+
+class AgentActionsApiService {
+	listForThread = async (
+		threadId: number,
+		opts: { page?: number; pageSize?: number } = {}
+	): Promise<AgentActionListResponse> => {
+		const params = new URLSearchParams();
+		params.set("page", String(opts.page ?? 0));
+		params.set("page_size", String(opts.pageSize ?? 50));
+		return baseApiService.get(
+			`/api/v1/threads/${threadId}/actions?${params.toString()}`,
+			AgentActionListResponseSchema
+		);
+	};
+
+	revert = async (threadId: number, actionId: number): Promise<RevertResponse> => {
+		return baseApiService.post(
+			`/api/v1/threads/${threadId}/revert/${actionId}`,
+			RevertResponseSchema,
+			{ body: {} }
+		);
+	};
+
+	revertTurn = async (threadId: number, chatTurnId: string): Promise<RevertTurnResponse> => {
+		return baseApiService.post(
+			`/api/v1/threads/${threadId}/revert-turn/${encodeURIComponent(chatTurnId)}`,
+			RevertTurnResponseSchema,
+			{ body: {} }
+		);
+	};
+}
+
+export const agentActionsApiService = new AgentActionsApiService();
diff --git a/surfsense_web/lib/apis/agent-flags-api.service.ts b/surfsense_web/lib/apis/agent-flags-api.service.ts
new file mode 100644
index 0000000000..534810c0e5
--- /dev/null
+++ b/surfsense_web/lib/apis/agent-flags-api.service.ts
@@ -0,0 +1,42 @@
+import { z } from "zod";
+import { baseApiService } from "./base-api.service";
+
+const AgentFeatureFlagsSchema = z.object({
+	disable_new_agent_stack: z.boolean(),
+
+	enable_context_editing: z.boolean(),
+	enable_compaction_v2: z.boolean(),
+	enable_retry_after: z.boolean(),
+	enable_model_fallback: z.boolean(),
+	enable_model_call_limit: z.boolean(),
+	enable_tool_call_limit: z.boolean(),
+	enable_tool_call_repair: z.boolean(),
+	enable_doom_loop: z.boolean(),
+
+	enable_permission: z.boolean(),
+	enable_busy_mutex: z.boolean(),
+	enable_llm_tool_selector: z.boolean(),
+
+	enable_skills: z.boolean(),
+	enable_specialized_subagents: z.boolean(),
+	enable_kb_planner_runnable: z.boolean(),
+
+	enable_action_log: z.boolean(),
+	enable_revert_route: z.boolean(),
+
+	enable_plugin_loader: z.boolean(),
+
+	enable_otel: z.boolean(),
+
+	enable_desktop_local_filesystem: z.boolean(),
+});
+
+export type AgentFeatureFlags = z.infer<typeof AgentFeatureFlagsSchema>;
+
+class AgentFlagsApiService {
+	get = async (): Promise<AgentFeatureFlags> => {
+		return baseApiService.get(`/api/v1/agent/flags`, AgentFeatureFlagsSchema);
+	};
+}
+
+export const agentFlagsApiService = new AgentFlagsApiService();
diff --git a/surfsense_web/lib/apis/agent-permissions-api.service.ts b/surfsense_web/lib/apis/agent-permissions-api.service.ts
new file mode 100644
index 0000000000..6927c55d01
--- /dev/null
+++ b/surfsense_web/lib/apis/agent-permissions-api.service.ts
@@ -0,0 +1,90 @@
+import { z } from "zod";
+import { ValidationError } from "@/lib/error";
+import { baseApiService } from "./base-api.service";
+
+const ActionEnum = z.enum(["allow", "deny", "ask"]);
+export type AgentPermissionAction = z.infer<typeof ActionEnum>;
+
+const AgentPermissionRuleSchema = z.object({
+	id: z.number(),
+	search_space_id: z.number(),
+	user_id: z.string().nullable(),
+	thread_id: z.number().nullable(),
+	permission: z.string(),
+	pattern: z.string(),
+	action: ActionEnum,
+	created_at: z.string(),
+});
+
+export type AgentPermissionRule = z.infer<typeof AgentPermissionRuleSchema>;
+
+const AgentPermissionRuleListSchema = z.array(AgentPermissionRuleSchema);
+
+const AgentPermissionRuleCreateSchema = z.object({
+	permission: z
+		.string()
+		.min(1, "Permission is required")
+		.max(255)
+		.regex(/^[a-zA-Z0-9_:.\-*]+$/, "Use letters, digits, '.', '_', ':', '-', or '*' wildcards."),
+	pattern: z.string().min(1).max(255).default("*"),
+	action: ActionEnum,
+	user_id: z.string().nullable().optional(),
+	thread_id: z.number().nullable().optional(),
+});
+
+export type AgentPermissionRuleCreate = z.infer<typeof AgentPermissionRuleCreateSchema>;
+
+const AgentPermissionRuleUpdateSchema = z.object({
+	pattern: z.string().min(1).max(255).optional(),
+	action: ActionEnum.optional(),
+});
+
+export type AgentPermissionRuleUpdate = z.infer<typeof AgentPermissionRuleUpdateSchema>;
+
+class AgentPermissionsApiService {
+	list = async (searchSpaceId: number): Promise<AgentPermissionRule[]> => {
+		return baseApiService.get(
+			`/api/v1/searchspaces/${searchSpaceId}/agent/permissions/rules`,
+			AgentPermissionRuleListSchema
+		);
+	};
+
+	create = async (
+		searchSpaceId: number,
+		payload: AgentPermissionRuleCreate
+	): Promise<AgentPermissionRule> => {
+		const parsed = AgentPermissionRuleCreateSchema.safeParse(payload);
+		if (!parsed.success) {
+			throw new ValidationError(parsed.error.issues.map((i) => i.message).join(", "));
+		}
+		return baseApiService.post(
+			`/api/v1/searchspaces/${searchSpaceId}/agent/permissions/rules`,
+			AgentPermissionRuleSchema,
+			{ body: parsed.data }
+		);
+	};
+
+	update = async (
+		searchSpaceId: number,
+		ruleId: number,
+		payload: AgentPermissionRuleUpdate
+	): Promise<AgentPermissionRule> => {
+		const parsed = AgentPermissionRuleUpdateSchema.safeParse(payload);
+		if (!parsed.success) {
+			throw new ValidationError(parsed.error.issues.map((i) => i.message).join(", "));
+		}
+		return baseApiService.patch(
+			`/api/v1/searchspaces/${searchSpaceId}/agent/permissions/rules/${ruleId}`,
+			AgentPermissionRuleSchema,
+			{ body: parsed.data }
+		);
+	};
+
+	remove = async (searchSpaceId: number, ruleId: number): Promise<void> => {
+		await baseApiService.delete(
+			`/api/v1/searchspaces/${searchSpaceId}/agent/permissions/rules/${ruleId}`
+		);
+	};
+}
+
+export const agentPermissionsApiService = new AgentPermissionsApiService();
diff --git a/surfsense_web/lib/apis/anonymous-chat-api.service.ts b/surfsense_web/lib/apis/anonymous-chat-api.service.ts
new file mode 100644
index 0000000000..843576a50b
--- /dev/null
+++ b/surfsense_web/lib/apis/anonymous-chat-api.service.ts
@@ -0,0 +1,105 @@
+import {
+	type AnonChatRequest,
+	type AnonModel,
+	type AnonQuotaResponse,
+	anonChatRequest,
+	anonQuotaResponse,
+	getAnonModelResponse,
+	getAnonModelsResponse,
+} from "@/contracts/types/anonymous-chat.types";
+import { BACKEND_URL } from "../env-config";
+import { ValidationError } from "../error";
+
+const BASE = "/api/v1/public/anon-chat";
+
+export type AnonUploadResult =
+	| { ok: true; data: { filename: string; size_bytes: number } }
+	| { ok: false; reason: "quota_exceeded" };
+
+class AnonymousChatApiService {
+	private baseUrl: string;
+
+	constructor(baseUrl: string) {
+		this.baseUrl = baseUrl;
+	}
+
+	private fullUrl(path: string): string {
+		return `${this.baseUrl}${BASE}${path}`;
+	}
+
+	getModels = async (): Promise<AnonModel[]> => {
+		const res = await fetch(this.fullUrl("/models"), { credentials: "include" });
+		if (!res.ok) throw new Error(`Failed to fetch models: ${res.status}`);
+		const data = await res.json();
+		const parsed = getAnonModelsResponse.safeParse(data);
+		if (!parsed.success) console.error("Invalid anon models response:", parsed.error);
+		return data;
+	};
+
+	getModel = async (slug: string): Promise<AnonModel> => {
+		const res = await fetch(this.fullUrl(`/models/${encodeURIComponent(slug)}`), {
+			credentials: "include",
+		});
+		if (!res.ok) {
+			if (res.status === 404) throw new Error("Model not found");
+			throw new Error(`Failed to fetch model: ${res.status}`);
+		}
+		const data = await res.json();
+		const parsed = getAnonModelResponse.safeParse(data);
+		if (!parsed.success) console.error("Invalid anon model response:", parsed.error);
+		return data;
+	};
+
+	getQuota = async (): Promise<AnonQuotaResponse> => {
+		const res = await fetch(this.fullUrl("/quota"), { credentials: "include" });
+		if (!res.ok) throw new Error(`Failed to fetch quota: ${res.status}`);
+		const data = await res.json();
+		const parsed = anonQuotaResponse.safeParse(data);
+		if (!parsed.success) console.error("Invalid anon quota response:", parsed.error);
+		return data;
+	};
+
+	streamChat = async (request: AnonChatRequest): Promise<Response> => {
+		const validated = anonChatRequest.safeParse(request);
+		if (!validated.success) {
+			throw new ValidationError(
+				`Invalid request: ${validated.error.issues.map((i) => i.message).join(", ")}`
+			);
+		}
+
+		return fetch(this.fullUrl("/stream"), {
+			method: "POST",
+			headers: { "Content-Type": "application/json" },
+			credentials: "include",
+			body: JSON.stringify(validated.data),
+		});
+	};
+
+	uploadDocument = async (file: File): Promise<AnonUploadResult> => {
+		const formData = new FormData();
+		formData.append("file", file);
+		const res = await fetch(this.fullUrl("/upload"), {
+			method: "POST",
+			credentials: "include",
+			body: formData,
+		});
+		if (res.status === 409) {
+			return { ok: false, reason: "quota_exceeded" };
+		}
+		if (!res.ok) {
+			const body = await res.json().catch(() => ({}));
+			throw new Error(body.detail || `Upload failed: ${res.status}`);
+		}
+		const data = await res.json();
+		return { ok: true, data };
+	};
+
+	getDocument = async (): Promise<{ filename: string; size_bytes: number } | null> => {
+		const res = await fetch(this.fullUrl("/document"), { credentials: "include" });
+		if (res.status === 404) return null;
+		if (!res.ok) throw new Error(`Failed to fetch document: ${res.status}`);
+		return res.json();
+	};
+}
+
+export const anonymousChatApiService = new AnonymousChatApiService(BACKEND_URL);
diff --git a/surfsense_web/lib/apis/base-api.service.ts b/surfsense_web/lib/apis/base-api.service.ts
index a0e2cda675..752c607ad3 100644
--- a/surfsense_web/lib/apis/base-api.service.ts
+++ b/surfsense_web/lib/apis/base-api.service.ts
@@ -1,6 +1,14 @@
 import type { ZodType } from "zod";
+import { getClientPlatform } from "../agent-filesystem";
 import { getBearerToken, handleUnauthorized, refreshAccessToken } from "../auth-utils";
-import { AppError, AuthenticationError, AuthorizationError, NotFoundError } from "../error";
+import {
+	AbortedError,
+	AppError,
+	AuthenticationError,
+	AuthorizationError,
+	NetworkError,
+	NotFoundError,
+} from "../error";
 
 enum ResponseType {
 	JSON = "json",
@@ -66,11 +74,11 @@ class BaseApiService {
 			 * ----------
 			 */
 			const defaultOptions: RequestOptions = {
-				headers: this.bearerToken
-					? {
-							Authorization: `Bearer ${this.bearerToken}`,
-						}
-					: {},
+				headers: {
+					Authorization: `Bearer ${this.bearerToken || ""}`,
+					"X-SurfSense-Client-Platform":
+						typeof window === "undefined" ? "web" : getClientPlatform(),
+				},
 				method: "GET",
 				responseType: ResponseType.JSON,
 			};
@@ -138,12 +146,21 @@ class BaseApiService {
 					throw new AppError("Failed to parse response", response.status, response.statusText);
 				}
 
+				// Extract structured fields from new envelope or legacy shape
+				const envelope = typeof data === "object" && data?.error;
+				const errorMessage: string =
+					envelope?.message ??
+					(typeof data === "object" && typeof data?.detail === "string" ? data.detail : "");
+				const errorCode: string | undefined = envelope?.code;
+				const requestId: string | undefined =
+					envelope?.request_id ?? response.headers.get("X-Request-ID") ?? undefined;
+				const reportUrl: string | undefined = envelope?.report_url;
+
 				// Handle 401 - try to refresh token first (only once)
 				if (response.status === 401) {
 					if (!options?._isRetry) {
 						const newToken = await refreshAccessToken();
 						if (newToken) {
-							// Retry the request with the new token
 							return this.request(url, responseSchema, {
 								...mergedOptions,
 								headers: {
@@ -154,34 +171,37 @@ class BaseApiService {
 							} as RequestOptions & { responseType?: R });
 						}
 					}
-					// Refresh failed or retry failed, redirect to login
 					handleUnauthorized();
 					throw new AuthenticationError(
-						typeof data === "object" && "detail" in data
-							? data.detail
-							: "You are not authenticated. Please login again.",
+						errorMessage || "You are not authenticated. Please login again.",
 						response.status,
 						response.statusText
 					);
 				}
 
-				// For fastapi errors response
-				if (typeof data === "object" && "detail" in data) {
-					throw new AppError(data.detail, response.status, response.statusText);
-				}
-
+				// Map status to typed error
 				switch (response.status) {
 					case 403:
 						throw new AuthorizationError(
-							"You don't have permission to access this resource.",
+							errorMessage || "You don't have permission to access this resource.",
 							response.status,
 							response.statusText
 						);
 					case 404:
-						throw new NotFoundError("Resource not found", response.status, response.statusText);
-					//  Add more cases as needed
+						throw new NotFoundError(
+							errorMessage || "Resource not found",
+							response.status,
+							response.statusText
+						);
 					default:
-						throw new AppError("Something went wrong", response.status, response.statusText);
+						throw new AppError(
+							errorMessage || "Something went wrong",
+							response.status,
+							response.statusText,
+							errorCode,
+							requestId,
+							reportUrl
+						);
 				}
 			}
 
@@ -232,6 +252,16 @@ class BaseApiService {
 
 			return data;
 		} catch (error) {
+			// Normalize browser-level fetch failures before anything else
+			if (error instanceof DOMException && error.name === "AbortError") {
+				throw new AbortedError();
+			}
+			if (error instanceof TypeError && !(error instanceof AppError)) {
+				throw new NetworkError(
+					"Unable to connect to the server. Check your internet connection and try again."
+				);
+			}
+
 			console.error("Request failed:", JSON.stringify(error));
 			if (!(error instanceof AuthenticationError)) {
 				import("posthog-js")
@@ -242,11 +272,12 @@ class BaseApiService {
 							...(error instanceof AppError && {
 								status_code: error.status,
 								status_text: error.statusText,
+								error_code: error.code,
+								request_id: error.requestId,
 							}),
 						});
 					})
 					.catch(() => {
-						// PostHog is not available in the current environment
 						console.error("Failed to capture exception in PostHog");
 					});
 			}
diff --git a/surfsense_web/lib/apis/connectors-api.service.ts b/surfsense_web/lib/apis/connectors-api.service.ts
index abd16c7a7d..4b6d69883e 100644
--- a/surfsense_web/lib/apis/connectors-api.service.ts
+++ b/surfsense_web/lib/apis/connectors-api.service.ts
@@ -404,6 +404,26 @@ class ConnectorsApiService {
 			listDiscordChannelsResponse
 		);
 	};
+
+	/** Live stats for the Obsidian connector tile. */
+	getObsidianStats = async (vaultId: string): Promise<ObsidianStats> => {
+		return baseApiService.get<ObsidianStats>(
+			`/api/v1/obsidian/stats?vault_id=${encodeURIComponent(vaultId)}`
+		);
+	};
+
+	/** Revoke a previously-trusted MCP tool so the next call asks again. */
+	untrustMCPTool = async (connectorId: number, toolName: string): Promise<void> => {
+		await baseApiService.post(`/api/v1/connectors/mcp/${connectorId}/untrust-tool`, undefined, {
+			body: { tool_name: toolName },
+		});
+	};
+}
+
+export interface ObsidianStats {
+	vault_id: string;
+	files_synced: number;
+	last_sync_at: string | null;
 }
 
 export type { SlackChannel, DiscordChannel };
diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts
index 584f2e212e..630c88d163 100644
--- a/surfsense_web/lib/apis/documents-api.service.ts
+++ b/surfsense_web/lib/apis/documents-api.service.ts
@@ -5,6 +5,7 @@ import {
 	type DeleteDocumentRequest,
 	deleteDocumentRequest,
 	deleteDocumentResponse,
+	documentTitleRead,
 	type GetDocumentByChunkRequest,
 	type GetDocumentChunksRequest,
 	type GetDocumentRequest,
@@ -127,7 +128,8 @@ class DocumentsApiService {
 			throw new ValidationError(`Invalid request: ${errorMessage}`);
 		}
 
-		const { files, search_space_id, should_summarize } = parsedRequest.data;
+		const { files, search_space_id, should_summarize, use_vision_llm, processing_mode } =
+			parsedRequest.data;
 		const UPLOAD_BATCH_SIZE = 5;
 
 		const batches: File[][] = [];
@@ -146,6 +148,8 @@ class DocumentsApiService {
 			for (const file of batch) formData.append("files", file);
 			formData.append("search_space_id", String(search_space_id));
 			formData.append("should_summarize", String(should_summarize));
+			formData.append("use_vision_llm", String(use_vision_llm));
+			formData.append("processing_mode", processing_mode);
 
 			const controller = new AbortController();
 			const timeoutId = setTimeout(() => controller.abort(), 120_000);
@@ -266,6 +270,17 @@ class DocumentsApiService {
 		);
 	};
 
+	getDocumentByVirtualPath = async (request: { search_space_id: number; virtual_path: string }) => {
+		const params = new URLSearchParams({
+			search_space_id: String(request.search_space_id),
+			virtual_path: request.virtual_path,
+		});
+		return baseApiService.get(
+			`/api/v1/documents/by-virtual-path?${params.toString()}`,
+			documentTitleRead
+		);
+	};
+
 	/**
 	 * Get document type counts
 	 */
@@ -442,6 +457,8 @@ class DocumentsApiService {
 			relative_paths: string[];
 			root_folder_id?: number | null;
 			enable_summary?: boolean;
+			use_vision_llm?: boolean;
+			processing_mode?: "basic" | "premium";
 		},
 		signal?: AbortSignal
 	): Promise<{ message: string; status: string; root_folder_id: number; file_count: number }> => {
@@ -456,6 +473,8 @@ class DocumentsApiService {
 			formData.append("root_folder_id", String(metadata.root_folder_id));
 		}
 		formData.append("enable_summary", String(metadata.enable_summary ?? false));
+		formData.append("use_vision_llm", String(metadata.use_vision_llm ?? false));
+		formData.append("processing_mode", metadata.processing_mode ?? "basic");
 
 		const totalSize = files.reduce((acc, f) => acc + f.size, 0);
 		const timeoutMs = Math.min(Math.max((totalSize / (1024 * 1024)) * 5000, 30_000), 600_000);
diff --git a/surfsense_web/lib/apis/search-spaces-api.service.ts b/surfsense_web/lib/apis/search-spaces-api.service.ts
index 3e2006e46d..e593245f8d 100644
--- a/surfsense_web/lib/apis/search-spaces-api.service.ts
+++ b/surfsense_web/lib/apis/search-spaces-api.service.ts
@@ -1,3 +1,4 @@
+import { z } from "zod";
 import {
 	type CreateSearchSpaceRequest,
 	createSearchSpaceRequest,
@@ -117,6 +118,17 @@ class SearchSpacesApiService {
 		return baseApiService.delete(`/api/v1/searchspaces/${request.id}`, deleteSearchSpaceResponse);
 	};
 
+	/**
+	 * Trigger AI file sorting for all documents in a search space
+	 */
+	triggerAiSort = async (searchSpaceId: number) => {
+		return baseApiService.post(
+			`/api/v1/searchspaces/${searchSpaceId}/ai-sort`,
+			z.object({ message: z.string() }),
+			{}
+		);
+	};
+
 	/**
 	 * Leave a search space (remove own membership)
 	 * This is used by non-owners to leave a shared search space
diff --git a/surfsense_web/lib/apis/stripe-api.service.ts b/surfsense_web/lib/apis/stripe-api.service.ts
index e22cfec89a..f119fbf6a1 100644
--- a/surfsense_web/lib/apis/stripe-api.service.ts
+++ b/surfsense_web/lib/apis/stripe-api.service.ts
@@ -1,11 +1,20 @@
 import {
 	type CreateCheckoutSessionRequest,
 	type CreateCheckoutSessionResponse,
+	type CreateTokenCheckoutSessionRequest,
+	type CreateTokenCheckoutSessionResponse,
 	createCheckoutSessionResponse,
+	createTokenCheckoutSessionResponse,
+	type FinalizeCheckoutResponse,
+	finalizeCheckoutResponse,
 	type GetPagePurchasesResponse,
+	type GetTokenPurchasesResponse,
 	getPagePurchasesResponse,
+	getTokenPurchasesResponse,
 	type StripeStatusResponse,
 	stripeStatusResponse,
+	type TokenStripeStatusResponse,
+	tokenStripeStatusResponse,
 } from "@/contracts/types/stripe.types";
 import { baseApiService } from "./base-api.service";
 
@@ -29,6 +38,38 @@ class StripeApiService {
 	getStatus = async (): Promise<StripeStatusResponse> => {
 		return baseApiService.get("/api/v1/stripe/status", stripeStatusResponse);
 	};
+
+	createTokenCheckoutSession = async (
+		request: CreateTokenCheckoutSessionRequest
+	): Promise<CreateTokenCheckoutSessionResponse> => {
+		return baseApiService.post(
+			"/api/v1/stripe/create-token-checkout-session",
+			createTokenCheckoutSessionResponse,
+			{ body: request }
+		);
+	};
+
+	getTokenStatus = async (): Promise<TokenStripeStatusResponse> => {
+		return baseApiService.get("/api/v1/stripe/token-status", tokenStripeStatusResponse);
+	};
+
+	getTokenPurchases = async (): Promise<GetTokenPurchasesResponse> => {
+		return baseApiService.get("/api/v1/stripe/token-purchases", getTokenPurchasesResponse);
+	};
+
+	/**
+	 * Synchronously fulfil a checkout session from the success page.
+	 *
+	 * Solves the race where the user lands on /purchase-success before
+	 * Stripe's checkout.session.completed webhook arrives. Idempotent —
+	 * safe to call concurrently with the webhook.
+	 */
+	finalizeCheckout = async (sessionId: string): Promise<FinalizeCheckoutResponse> => {
+		return baseApiService.get(
+			`/api/v1/stripe/finalize-checkout?session_id=${encodeURIComponent(sessionId)}`,
+			finalizeCheckoutResponse
+		);
+	};
 }
 
 export const stripeApiService = new StripeApiService();
diff --git a/surfsense_web/lib/auth-utils.ts b/surfsense_web/lib/auth-utils.ts
index 10bd31bdf2..d7bff10a08 100644
--- a/surfsense_web/lib/auth-utils.ts
+++ b/surfsense_web/lib/auth-utils.ts
@@ -18,6 +18,7 @@ const PUBLIC_ROUTE_PREFIXES = [
 	"/desktop/login",
 	"/docs",
 	"/public",
+	"/free",
 	"/invite",
 	"/contact",
 	"/pricing",
@@ -67,9 +68,9 @@ export function handleUnauthorized(): void {
 	if (isPublicRoute(pathname)) return;
 
 	const currentPath = pathname + window.location.search + window.location.hash;
-	const excludedPaths = ["/auth", "/"];
+	const excludedPaths = ["/auth", "/auth/callback", "/"];
 	if (!excludedPaths.includes(pathname)) {
-		localStorage.setItem(REDIRECT_PATH_KEY, currentPath);
+		setRedirectPath(currentPath);
 	}
 
 	// Redirect through oauth2-proxy /oauth2/sign_in. The dedicated auth subdomain
@@ -79,6 +80,14 @@ export function handleUnauthorized(): void {
 	window.location.href = `${oauthProxyUrl}/oauth2/sign_in?rd=${encodeURIComponent(rd)}`;
 }
 
+/**
+ * Stores the path to redirect to after successful authentication.
+ */
+export function setRedirectPath(path: string): void {
+	if (typeof window === "undefined") return;
+	localStorage.setItem(REDIRECT_PATH_KEY, path);
+}
+
 /**
  * Gets the stored redirect path and clears it from storage
  * Call this after successful login to redirect the user back
@@ -270,7 +279,7 @@ export function redirectToLogin(): void {
 	// Don't save auth-related paths or home page
 	const excludedPaths = ["/auth", "/", "/login", "/register"];
 	if (!excludedPaths.includes(window.location.pathname)) {
-		localStorage.setItem(REDIRECT_PATH_KEY, currentPath);
+		setRedirectPath(currentPath);
 	}
 
 	window.location.href = getLoginPath();
diff --git a/surfsense_web/lib/blog-faq.ts b/surfsense_web/lib/blog-faq.ts
new file mode 100644
index 0000000000..d1572612e1
--- /dev/null
+++ b/surfsense_web/lib/blog-faq.ts
@@ -0,0 +1,54 @@
+import { promises as fs } from "node:fs";
+import path from "node:path";
+
+export interface FaqEntry {
+	question: string;
+	answer: string;
+}
+
+/**
+ * Extracts FAQ items from a blog post MDX file by parsing the `## FAQ` section.
+ *
+ * The FAQ section is bounded by `## FAQ` and the next H2 heading. Each H3 inside
+ * is treated as a question, with the body until the next H3 (or the end of the
+ * FAQ section) as its answer. Common Markdown decorations (links, bold, inline
+ * code) are stripped so the JSON-LD output contains plain text suitable for
+ * Google's FAQ rich-result eligibility checks.
+ *
+ * Returns an empty array when the post has no FAQ section, or when the file
+ * cannot be read (e.g. for posts that do not yet exist on disk).
+ */
+export async function extractFaqFromBlogPost(slug: string): Promise<FaqEntry[]> {
+	try {
+		const filepath = path.join(process.cwd(), "blog", "content", `${slug}.mdx`);
+		const content = await fs.readFile(filepath, "utf-8");
+		return extractFaqFromContent(content);
+	} catch {
+		return [];
+	}
+}
+
+export function extractFaqFromContent(content: string): FaqEntry[] {
+	const faqHeading = content.match(/^##\s+FAQ\s*$/m);
+	if (!faqHeading || faqHeading.index === undefined) return [];
+
+	const afterFaq = content.slice(faqHeading.index + faqHeading[0].length);
+	const nextH2 = afterFaq.match(/^##\s+/m);
+	const faqBody = nextH2 ? afterFaq.slice(0, nextH2.index) : afterFaq;
+
+	const blocks = faqBody.split(/^###\s+/m).slice(1);
+
+	return blocks
+		.map((block) => {
+			const newlineIdx = block.indexOf("\n");
+			const question = (newlineIdx === -1 ? block : block.slice(0, newlineIdx)).trim();
+			const answer = (newlineIdx === -1 ? "" : block.slice(newlineIdx + 1))
+				.trim()
+				.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
+				.replace(/\*\*([^*]+)\*\*/g, "$1")
+				.replace(/`([^`]+)`/g, "$1")
+				.replace(/\s+/g, " ");
+			return { question, answer };
+		})
+		.filter((item) => item.question.length > 0 && item.answer.length > 0);
+}
diff --git a/surfsense_web/lib/chat/chat-error-classifier.ts b/surfsense_web/lib/chat/chat-error-classifier.ts
new file mode 100644
index 0000000000..1c67d59a15
--- /dev/null
+++ b/surfsense_web/lib/chat/chat-error-classifier.ts
@@ -0,0 +1,305 @@
+export type ChatFlow = "new" | "resume" | "regenerate";
+
+export type ChatErrorKind =
+	| "premium_quota_exhausted"
+	| "thread_busy"
+	| "send_failed_pre_accept"
+	| "auth_expired"
+	| "rate_limited"
+	| "network_offline"
+	| "stream_interrupted"
+	| "stream_parse_error"
+	| "tool_execution_error"
+	| "persist_message_failed"
+	| "server_error"
+	| "unknown";
+
+export type ChatErrorChannel = "pinned_inline" | "toast" | "silent";
+export type ChatTelemetryEvent = "chat_blocked" | "chat_error";
+export type ChatErrorSeverity = "info" | "warn" | "error";
+
+export interface NormalizedChatError {
+	kind: ChatErrorKind;
+	channel: ChatErrorChannel;
+	severity: ChatErrorSeverity;
+	telemetryEvent: ChatTelemetryEvent;
+	isExpected: boolean;
+	userMessage: string;
+	assistantMessage?: string;
+	rawMessage?: string;
+	errorCode?: string;
+	details?: Record<string, unknown>;
+}
+
+export interface RawChatErrorInput {
+	error: unknown;
+	flow: ChatFlow;
+	context?: {
+		searchSpaceId?: number;
+		threadId?: number | null;
+	};
+}
+
+export const PREMIUM_QUOTA_ASSISTANT_MESSAGE =
+	"I can’t continue with the current premium model because your premium credit is exhausted. Switch to a free model or top up your credit to continue.";
+
+function getErrorMessage(error: unknown): string {
+	if (error instanceof Error) return error.message;
+	if (typeof error === "string") return error;
+	try {
+		return JSON.stringify(error);
+	} catch {
+		return "Unknown error";
+	}
+}
+
+function getErrorCode(
+	error: unknown,
+	parsedJson: Record<string, unknown> | null
+): string | undefined {
+	if (error instanceof Error) {
+		const withCode = error as Error & { errorCode?: string; code?: string };
+		if (withCode.errorCode) return withCode.errorCode;
+		if (withCode.code) return withCode.code;
+	}
+
+	if (typeof error === "object" && error !== null) {
+		const withCode = error as { errorCode?: unknown };
+		if (typeof withCode.errorCode === "string" && withCode.errorCode) {
+			return withCode.errorCode;
+		}
+	}
+
+	if (parsedJson) {
+		const topLevelCode = parsedJson.errorCode;
+		if (typeof topLevelCode === "string" && topLevelCode) {
+			return topLevelCode;
+		}
+	}
+
+	return undefined;
+}
+
+function parseEmbeddedJson(text: string): Record<string, unknown> | null {
+	const candidates = [text];
+	const firstBraceIdx = text.indexOf("{");
+	if (firstBraceIdx >= 0) {
+		candidates.push(text.slice(firstBraceIdx));
+	}
+	for (const candidate of candidates) {
+		try {
+			const parsed = JSON.parse(candidate);
+			if (typeof parsed === "object" && parsed !== null) {
+				return parsed as Record<string, unknown>;
+			}
+		} catch {
+			// noop
+		}
+	}
+	return null;
+}
+
+function inferProviderErrorType(parsedJson: Record<string, unknown> | null): string | undefined {
+	if (!parsedJson) return undefined;
+	const topLevelType = parsedJson.type;
+	if (typeof topLevelType === "string" && topLevelType) return topLevelType;
+	const nestedError = parsedJson.error;
+	if (typeof nestedError === "object" && nestedError !== null) {
+		const nestedType = (nestedError as Record<string, unknown>).type;
+		if (typeof nestedType === "string" && nestedType) return nestedType;
+	}
+	return undefined;
+}
+
+export function classifyChatError(input: RawChatErrorInput): NormalizedChatError {
+	const { error } = input;
+	const rawMessage = getErrorMessage(error);
+	const parsedJson = parseEmbeddedJson(rawMessage);
+	const errorCode = getErrorCode(error, parsedJson);
+	const providerErrorType = inferProviderErrorType(parsedJson);
+	const providerTypeNormalized = providerErrorType?.toLowerCase() ?? "";
+	const errorName = error instanceof Error ? error.name : undefined;
+
+	if (errorName === "AbortError") {
+		return {
+			kind: "stream_interrupted",
+			channel: "silent",
+			severity: "info",
+			telemetryEvent: "chat_error",
+			isExpected: true,
+			userMessage: "Request canceled.",
+			rawMessage,
+			errorCode,
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "PREMIUM_QUOTA_EXHAUSTED") {
+		return {
+			kind: "premium_quota_exhausted",
+			channel: "pinned_inline",
+			severity: "info",
+			telemetryEvent: "chat_blocked",
+			isExpected: true,
+			userMessage: "Buy more tokens to continue with this model, or switch to a free model.",
+			assistantMessage: PREMIUM_QUOTA_ASSISTANT_MESSAGE,
+			rawMessage,
+			errorCode: errorCode ?? "PREMIUM_QUOTA_EXHAUSTED",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "TURN_CANCELLING") {
+		return {
+			kind: "thread_busy",
+			channel: "toast",
+			severity: "info",
+			telemetryEvent: "chat_blocked",
+			isExpected: true,
+			userMessage: "A previous response is still stopping. Please try again in a moment.",
+			rawMessage,
+			errorCode: errorCode ?? "TURN_CANCELLING",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "THREAD_BUSY") {
+		return {
+			kind: "thread_busy",
+			channel: "toast",
+			severity: "warn",
+			telemetryEvent: "chat_blocked",
+			isExpected: true,
+			userMessage:
+				"Another response is still finishing for this thread. Please try again in a moment.",
+			rawMessage,
+			errorCode: errorCode ?? "THREAD_BUSY",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "SEND_FAILED_PRE_ACCEPT") {
+		return {
+			kind: "send_failed_pre_accept",
+			channel: "toast",
+			severity: "warn",
+			telemetryEvent: "chat_blocked",
+			isExpected: true,
+			userMessage: "Message not sent. Please retry.",
+			rawMessage,
+			errorCode: errorCode ?? "SEND_FAILED_PRE_ACCEPT",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "AUTH_EXPIRED" || errorCode === "UNAUTHORIZED") {
+		return {
+			kind: "auth_expired",
+			channel: "toast",
+			severity: "warn",
+			telemetryEvent: "chat_error",
+			isExpected: true,
+			userMessage: "Your session expired. Please sign in again.",
+			rawMessage,
+			errorCode: errorCode ?? "AUTH_EXPIRED",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "RATE_LIMITED" || providerTypeNormalized === "rate_limit_error") {
+		return {
+			kind: "rate_limited",
+			channel: "toast",
+			severity: "warn",
+			telemetryEvent: "chat_blocked",
+			isExpected: true,
+			userMessage:
+				"This model is temporarily rate-limited. Please try again in a few seconds or switch models.",
+			rawMessage,
+			errorCode: errorCode ?? "RATE_LIMITED",
+			details: { flow: input.flow, providerErrorType },
+		};
+	}
+
+	if (errorCode === "NETWORK_ERROR") {
+		return {
+			kind: "network_offline",
+			channel: "toast",
+			severity: "warn",
+			telemetryEvent: "chat_error",
+			isExpected: true,
+			userMessage: "Connection issue. Please try again.",
+			rawMessage,
+			errorCode: errorCode ?? "NETWORK_ERROR",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "STREAM_PARSE_ERROR") {
+		return {
+			kind: "stream_parse_error",
+			channel: "toast",
+			severity: "error",
+			telemetryEvent: "chat_error",
+			isExpected: false,
+			userMessage: "We hit a response formatting issue. Please try again.",
+			rawMessage,
+			errorCode: errorCode ?? "STREAM_PARSE_ERROR",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "TOOL_EXECUTION_ERROR") {
+		return {
+			kind: "tool_execution_error",
+			channel: "toast",
+			severity: "error",
+			telemetryEvent: "chat_error",
+			isExpected: false,
+			userMessage: "A tool failed while processing your request. Please try again.",
+			rawMessage,
+			errorCode: errorCode ?? "TOOL_EXECUTION_ERROR",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "PERSIST_MESSAGE_FAILED") {
+		return {
+			kind: "persist_message_failed",
+			channel: "toast",
+			severity: "error",
+			telemetryEvent: "chat_error",
+			isExpected: false,
+			userMessage: "Response generated, but saving failed. Please retry once.",
+			rawMessage,
+			errorCode: errorCode ?? "PERSIST_MESSAGE_FAILED",
+			details: { flow: input.flow },
+		};
+	}
+
+	if (errorCode === "SERVER_ERROR") {
+		return {
+			kind: "server_error",
+			channel: "toast",
+			severity: "error",
+			telemetryEvent: "chat_error",
+			isExpected: false,
+			userMessage: "We couldn’t complete this response right now. Please try again.",
+			rawMessage,
+			errorCode: errorCode ?? "SERVER_ERROR",
+			details: { flow: input.flow, providerErrorType },
+		};
+	}
+
+	return {
+		kind: "unknown",
+		channel: "toast",
+		severity: "error",
+		telemetryEvent: "chat_error",
+		isExpected: false,
+		userMessage: "We couldn’t complete this response right now. Please try again.",
+		rawMessage,
+		errorCode,
+		details: { flow: input.flow, providerErrorType },
+	};
+}
diff --git a/surfsense_web/lib/chat/chat-request-errors.ts b/surfsense_web/lib/chat/chat-request-errors.ts
new file mode 100644
index 0000000000..e0dfb3cc46
--- /dev/null
+++ b/surfsense_web/lib/chat/chat-request-errors.ts
@@ -0,0 +1,110 @@
+export async function toHttpResponseError(
+	response: Response
+): Promise<Error & { errorCode?: string; retryAfterMs?: number }> {
+	const statusDefaultCode =
+		response.status === 409
+			? "THREAD_BUSY"
+			: response.status === 429
+				? "RATE_LIMITED"
+				: response.status === 401 || response.status === 403
+					? "AUTH_EXPIRED"
+					: "SERVER_ERROR";
+
+	let rawBody = "";
+	try {
+		rawBody = await response.text();
+	} catch {
+		// noop
+	}
+
+	let parsedBody: Record<string, unknown> | null = null;
+	if (rawBody) {
+		try {
+			const parsed = JSON.parse(rawBody);
+			if (typeof parsed === "object" && parsed !== null) {
+				parsedBody = parsed as Record<string, unknown>;
+			}
+		} catch {
+			// noop
+		}
+	}
+
+	const detail = parsedBody?.detail;
+	const detailObject =
+		typeof detail === "object" && detail !== null ? (detail as Record<string, unknown>) : null;
+	const detailMessage = typeof detail === "string" ? detail : undefined;
+	const topLevelMessage =
+		typeof parsedBody?.message === "string" ? (parsedBody.message as string) : undefined;
+	const detailNestedMessage =
+		typeof detailObject?.message === "string" ? (detailObject.message as string) : undefined;
+
+	const topLevelCode =
+		typeof parsedBody?.errorCode === "string"
+			? parsedBody.errorCode
+			: typeof parsedBody?.error_code === "string"
+				? parsedBody.error_code
+				: undefined;
+	const detailCode =
+		typeof detailObject?.errorCode === "string"
+			? detailObject.errorCode
+			: typeof detailObject?.error_code === "string"
+				? detailObject.error_code
+				: undefined;
+
+	const errorCode = detailCode ?? topLevelCode ?? statusDefaultCode;
+
+	const detailRetryAfterMs =
+		typeof detailObject?.retry_after_ms === "number"
+			? detailObject.retry_after_ms
+			: typeof detailObject?.retryAfterMs === "number"
+				? detailObject.retryAfterMs
+				: undefined;
+	const topRetryAfterMs =
+		typeof parsedBody?.retry_after_ms === "number"
+			? parsedBody.retry_after_ms
+			: typeof parsedBody?.retryAfterMs === "number"
+				? parsedBody.retryAfterMs
+				: undefined;
+	const headerRetryAfterMsRaw = response.headers.get("retry-after-ms");
+	const headerRetryAfterMs = headerRetryAfterMsRaw ? Number.parseFloat(headerRetryAfterMsRaw) : NaN;
+	const retryAfterHeader = response.headers.get("retry-after");
+	const retryAfterSeconds = retryAfterHeader ? Number.parseFloat(retryAfterHeader) : NaN;
+	const retryAfterMsFromHeader = Number.isFinite(headerRetryAfterMs)
+		? Math.max(0, Math.round(headerRetryAfterMs))
+		: Number.isFinite(retryAfterSeconds)
+			? Math.max(0, Math.round(retryAfterSeconds * 1000))
+			: undefined;
+	const retryAfterMs = detailRetryAfterMs ?? topRetryAfterMs ?? retryAfterMsFromHeader ?? undefined;
+	const message =
+		detailNestedMessage ?? detailMessage ?? topLevelMessage ?? `Backend error: ${response.status}`;
+
+	return Object.assign(new Error(message), { errorCode, retryAfterMs });
+}
+
+export function tagPreAcceptSendFailure(error: unknown): unknown {
+	if (error instanceof Error) {
+		const withCode = error as Error & { errorCode?: string; code?: string };
+		const existingCode = withCode.errorCode ?? withCode.code;
+		const passthroughCodes = new Set([
+			"PREMIUM_QUOTA_EXHAUSTED",
+			"THREAD_BUSY",
+			"TURN_CANCELLING",
+			"AUTH_EXPIRED",
+			"UNAUTHORIZED",
+			"RATE_LIMITED",
+			"NETWORK_ERROR",
+			"STREAM_PARSE_ERROR",
+			"TOOL_EXECUTION_ERROR",
+			"PERSIST_MESSAGE_FAILED",
+			"SERVER_ERROR",
+		]);
+		if (existingCode && passthroughCodes.has(existingCode)) {
+			return Object.assign(error, { errorCode: existingCode });
+		}
+		return Object.assign(error, { errorCode: "SEND_FAILED_PRE_ACCEPT" });
+	}
+
+	return Object.assign(new Error("Failed to send message before stream acceptance"), {
+		errorCode: "SEND_FAILED_PRE_ACCEPT",
+	});
+}
diff --git a/surfsense_web/lib/chat/display-media-capture.ts b/surfsense_web/lib/chat/display-media-capture.ts
new file mode 100644
index 0000000000..c2fb69aae3
--- /dev/null
+++ b/surfsense_web/lib/chat/display-media-capture.ts
@@ -0,0 +1,120 @@
+/** `getDisplayMedia` → single PNG frame (data URL). */
+function getImageCaptureCtor():
+	| (new (
+			track: MediaStreamTrack
+	  ) => { grabFrame: () => Promise<ImageBitmap> })
+	| undefined {
+	if (typeof window === "undefined") return undefined;
+	const IC = (
+		window as unknown as {
+			ImageCapture?: new (track: MediaStreamTrack) => { grabFrame: () => Promise<ImageBitmap> };
+		}
+	).ImageCapture;
+	return typeof IC === "function" ? IC : undefined;
+}
+
+function stopAllTracks(stream: MediaStream): void {
+	for (const t of stream.getTracks()) {
+		t.stop();
+	}
+}
+
+async function captureTrackToPngDataUrl(
+	track: MediaStreamTrack,
+	stream: MediaStream
+): Promise<string | null> {
+	const ImageCtor = getImageCaptureCtor();
+	if (ImageCtor !== undefined) {
+		try {
+			const ic = new ImageCtor(track);
+			const bitmap = await ic.grabFrame();
+			try {
+				const canvas = document.createElement("canvas");
+				canvas.width = bitmap.width;
+				canvas.height = bitmap.height;
+				const ctx = canvas.getContext("2d");
+				if (!ctx) {
+					stopAllTracks(stream);
+					return null;
+				}
+				ctx.drawImage(bitmap, 0, 0);
+				stopAllTracks(stream);
+				return canvas.toDataURL("image/png");
+			} finally {
+				if ("close" in bitmap && typeof bitmap.close === "function") {
+					bitmap.close();
+				}
+			}
+		} catch {
+			/* fall through to <video> */
+		}
+	}
+
+	const videoEl = document.createElement("video");
+	videoEl.srcObject = stream;
+	videoEl.muted = true;
+	const haveCurrentData = 2;
+	const dataReady = new Promise<void>((resolve) => {
+		if (videoEl.readyState >= haveCurrentData) {
+			resolve();
+			return;
+		}
+		videoEl.addEventListener("loadeddata", () => resolve(), { once: true });
+	});
+	await videoEl.play();
+	await Promise.race([
+		dataReady,
+		new Promise<void>((resolve) => {
+			setTimeout(resolve, 500);
+		}),
+	]);
+	const w = videoEl.videoWidth;
+	const h = videoEl.videoHeight;
+	if (!w || !h) {
+		stopAllTracks(stream);
+		return null;
+	}
+	const canvas = document.createElement("canvas");
+	canvas.width = w;
+	canvas.height = h;
+	const ctx = canvas.getContext("2d");
+	if (!ctx) {
+		stopAllTracks(stream);
+		return null;
+	}
+	ctx.drawImage(videoEl, 0, 0);
+	stopAllTracks(stream);
+	return canvas.toDataURL("image/png");
+}
+
+export async function captureDisplayToPngDataUrl(): Promise<string | null> {
+	if (typeof navigator === "undefined" || !navigator.mediaDevices?.getDisplayMedia) {
+		return null;
+	}
+	let stream: MediaStream | null = null;
+	try {
+		stream = await navigator.mediaDevices.getDisplayMedia({
+			video: { frameRate: { ideal: 1, max: 5 } },
+			audio: false,
+			selfBrowserSurface: "exclude",
+		} as Parameters<MediaDevices["getDisplayMedia"]>[0]);
+
+		const track = stream.getVideoTracks()[0];
+		if (!track) {
+			stopAllTracks(stream);
+			return null;
+		}
+
+		const dataUrl = await captureTrackToPngDataUrl(track, stream);
+		stream = null;
+		return dataUrl;
+	} catch (e) {
+		if (typeof process !== "undefined" && process.env?.NODE_ENV !== "production") {
+			console.warn("[captureDisplayToPngDataUrl]", e);
+		}
+		if (stream) {
+			stopAllTracks(stream);
+		}
+		return null;
+	}
+}
diff --git a/surfsense_web/lib/chat/mention-doc-key.ts b/surfsense_web/lib/chat/mention-doc-key.ts
new file mode 100644
index 0000000000..5c0bd62543
--- /dev/null
+++ b/surfsense_web/lib/chat/mention-doc-key.ts
@@ -0,0 +1,18 @@
+type MentionKeyInput = {
+	id: number;
+	document_type?: string | null;
+	kind?: "doc" | "folder";
+};
+
+/**
+ * Build a stable dedup key for a mention chip.
+ *
+ * The ``kind:document_type:id`` shape prevents a document and a folder
+ * with the same integer id from colliding in the chip array (folders
+ * use the ``FOLDER`` sentinel ``document_type``; the ``kind`` prefix
+ * is the belt-and-braces guard).
+ */
+export function getMentionDocKey(doc: MentionKeyInput): string {
+	const kind = doc.kind ?? "doc";
+	return `${kind}:${doc.document_type ?? "UNKNOWN"}:${doc.id}`;
+}
diff --git a/surfsense_web/lib/chat/message-utils.ts b/surfsense_web/lib/chat/message-utils.ts
index 7c0da03c48..935dda5394 100644
--- a/surfsense_web/lib/chat/message-utils.ts
+++ b/surfsense_web/lib/chat/message-utils.ts
@@ -1,9 +1,187 @@
 import type { ThreadMessageLike } from "@assistant-ui/react";
 import type { MessageRecord } from "./thread-persistence";
 
+/** Minimal shape used by the interrupt/resume reconciler. */
+interface AbortableMessage {
+	id: number;
+	role: string;
+	content: unknown;
+	turn_id?: string | null;
+}
+
+function isAssistant(msg: AbortableMessage): boolean {
+	return msg.role.toLowerCase() === "assistant";
+}
+
+/** True when the row carries at least one tool-call with ``state: "aborted"``. */
+function hasAbortedToolCall(msg: AbortableMessage): boolean {
+	if (!isAssistant(msg) || !Array.isArray(msg.content)) return false;
+	for (const part of msg.content) {
+		if (typeof part !== "object" || part === null) continue;
+		if ((part as { type?: string }).type !== "tool-call") continue;
+		if ((part as { state?: unknown }).state === "aborted") return true;
+	}
+	return false;
+}
+
+/**
+ * True when EVERY tool-call on the row is aborted. The row is then a
+ * frozen interrupt frame with no salvageable activity — safe to drop
+ * outright.
+ */
+function isFullyAbortedAssistantMessage(msg: AbortableMessage): boolean {
+	if (!isAssistant(msg) || !Array.isArray(msg.content)) return false;
+	let hasToolCalls = false;
+	for (const part of msg.content) {
+		if (typeof part !== "object" || part === null) continue;
+		if ((part as { type?: string }).type !== "tool-call") continue;
+		hasToolCalls = true;
+		if ((part as { state?: unknown }).state !== "aborted") return false;
+	}
+	return hasToolCalls;
+}
+
+/**
+ * Locate the resume row that supersedes ``messages[idx]``. The
+ * ``stream_resume_chat`` flow allocates a fresh ``turn_id`` so we
+ * can't pair on it; conversational adjacency (assistant → assistant
+ * with no user row between) is the unique signature. Skips already-
+ * dropped indices so chained interrupt-resumes still pair cleanly.
+ */
+function findResumeSuccessorIdx<T extends AbortableMessage>(
+	messages: readonly T[],
+	idx: number,
+	dropped: ReadonlySet<number>
+): number | null {
+	for (let i = idx + 1; i < messages.length; i++) {
+		if (dropped.has(i)) continue;
+		const role = messages[i].role.toLowerCase();
+		if (role === "user") return null;
+		if (role === "assistant") return i;
+	}
+	return null;
+}
+
+/** Read ``data.steps`` from either ``data-thinking-steps`` (modern) or ``thinking-steps`` (legacy). */
+function extractStepsFromPart(part: unknown): unknown[] | null {
+	if (typeof part !== "object" || part === null) return null;
+	const p = part as { type?: unknown; data?: unknown; steps?: unknown };
+	if (p.type === "data-thinking-steps") {
+		const data = p.data as { steps?: unknown } | undefined;
+		return Array.isArray(data?.steps) ? data.steps : [];
+	}
+	if (p.type === "thinking-steps") {
+		return Array.isArray(p.steps) ? (p.steps as unknown[]) : [];
+	}
+	return null;
+}
+
+/** Split a content array into (combined steps, all other parts in order). */
+function partitionContent(content: unknown): { steps: unknown[]; others: unknown[] } {
+	if (!Array.isArray(content)) return { steps: [], others: [] };
+	const steps: unknown[] = [];
+	const others: unknown[] = [];
+	for (const part of content) {
+		const partSteps = extractStepsFromPart(part);
+		if (partSteps !== null) {
+			steps.push(...partSteps);
+			continue;
+		}
+		others.push(part);
+	}
+	return { steps, others };
+}
+
+/**
+ * Fold an interrupt-frame row's content into its resume successor so
+ * the user sees one assistant turn instead of two stacked bubbles.
+ * Successor's metadata wins (id, created_at, turn_id, token_usage,
+ * author) — that's the row the per-turn revert button keys to.
+ *
+ * Order: combined ``data-thinking-steps`` (older steps then newer) at
+ * index 0, followed by older's other parts in order, then newer's. The
+ * older row's aborted ``task`` wrapper is preserved so the rejected
+ * attempt remains visible alongside the successful retry; both spans
+ * survive and ``groupItems`` renders them as sibling task branches in
+ * one timeline.
+ */
+function mergeInterruptedIntoResume<T extends AbortableMessage>(older: T, newer: T): T {
+	const olderParts = partitionContent(older.content);
+	const newerParts = partitionContent(newer.content);
+
+	const mergedSteps = [...olderParts.steps, ...newerParts.steps];
+	const mergedContent: unknown[] = [];
+	if (mergedSteps.length > 0) {
+		mergedContent.push({ type: "data-thinking-steps", data: { steps: mergedSteps } });
+	}
+	mergedContent.push(...olderParts.others, ...newerParts.others);
+
+	return { ...newer, content: mergedContent };
+}
+
+/**
+ * Reconcile interrupt-frame and resume rows so the UI shows one
+ * assistant turn per user turn even when the backend persists them as
+ * separate ``new_chat_messages`` rows.
+ *
+ * Two cases, both keyed on conversational adjacency (assistant →
+ * assistant with no user row between):
+ *
+ *  - **Fully aborted older row** (every tool-call ``state: "aborted"``,
+ *    no salvageable activity) → drop the older row.
+ *  - **Partially aborted older row** (mixed completed + aborted, e.g.
+ *    inner subagent tools ran before the interrupt) → fold its content
+ *    into the successor. Successor metadata wins.
+ *
+ * Never-resumed aborts (user navigated away mid-decision) survive so
+ * the user still sees what happened.
+ *
+ * Pure: returns a new array with new merged objects when needed.
+ * Caller passes messages in chronological order.
+ */
+export function reconcileInterruptedAssistantMessages<T extends AbortableMessage>(
+	messages: readonly T[]
+): T[] {
+	const dropped = new Set<number>();
+	const mergeInto = new Map<number, number[]>();
+
+	for (let i = 0; i < messages.length; i++) {
+		if (dropped.has(i)) continue;
+		const msg = messages[i];
+		if (!hasAbortedToolCall(msg)) continue;
+
+		const successorIdx = findResumeSuccessorIdx(messages, i, dropped);
+		if (successorIdx === null) continue;
+
+		dropped.add(i);
+		if (!isFullyAbortedAssistantMessage(msg)) {
+			const list = mergeInto.get(successorIdx) ?? [];
+			list.push(i);
+			mergeInto.set(successorIdx, list);
+		}
+	}
+
+	const result: T[] = [];
+	for (let i = 0; i < messages.length; i++) {
+		if (dropped.has(i)) continue;
+		const olderIdxs = mergeInto.get(i);
+		if (olderIdxs && olderIdxs.length > 0) {
+			let merged = messages[i];
+			for (const olderIdx of olderIdxs) {
+				merged = mergeInterruptedIntoResume(messages[olderIdx], merged);
+			}
+			result.push(merged);
+			continue;
+		}
+		result.push(messages[i]);
+	}
+	return result;
+}
+
 /**
- * Convert backend message to assistant-ui ThreadMessageLike format.
- * Migrates legacy `thinking-steps` parts to `data-thinking-steps` (assistant-ui data parts).
+ * Convert a backend ``MessageRecord`` to assistant-ui's
+ * ``ThreadMessageLike``. Also migrates legacy ``thinking-steps`` parts
+ * to ``data-thinking-steps``.
  */
 export function convertToThreadMessage(msg: MessageRecord): ThreadMessageLike {
 	let content: ThreadMessageLike["content"];
@@ -24,9 +202,10 @@ export function convertToThreadMessage(msg: MessageRecord): ThreadMessageLike {
 					"type" in part &&
 					(part as { type: string }).type === "thinking-steps"
 				) {
+					const steps = (part as unknown as { steps?: unknown[] }).steps;
 					return {
 						type: "data-thinking-steps",
-						data: { steps: (part as { steps: unknown[] }).steps ?? [] },
+						data: { steps: Array.isArray(steps) ? steps : [] },
 					};
 				}
 				return part;
@@ -39,16 +218,23 @@ export function convertToThreadMessage(msg: MessageRecord): ThreadMessageLike {
 		content = [{ type: "text", text: String(msg.content) }];
 	}
 
-	const metadata = msg.author_id
-		? {
-				custom: {
-					author: {
-						displayName: msg.author_display_name ?? null,
-						avatarUrl: msg.author_avatar_url ?? null,
+	const metadata =
+		msg.author_id || msg.token_usage || msg.turn_id
+			? {
+					custom: {
+						...(msg.author_id && {
+							author: {
+								displayName: msg.author_display_name ?? null,
+								avatarUrl: msg.author_avatar_url ?? null,
+							},
+						}),
+						...(msg.token_usage && { usage: msg.token_usage }),
+						// Surfaced for the assistant footer's per-turn
+						// "Revert turn" button. Null on legacy rows.
+						...(msg.turn_id && { chatTurnId: msg.turn_id }),
 					},
-				},
-			}
-		: undefined;
+				}
+			: undefined;
 
 	return {
 		id: `msg-${msg.id}`,
diff --git a/surfsense_web/lib/chat/parse-mention-segments.ts b/surfsense_web/lib/chat/parse-mention-segments.ts
new file mode 100644
index 0000000000..b9cf59792b
--- /dev/null
+++ b/surfsense_web/lib/chat/parse-mention-segments.ts
@@ -0,0 +1,54 @@
+import type { MentionedDocumentInfo } from "@/atoms/chat/mentioned-documents.atom";
+
+export type MentionSegment =
+	| { type: "text"; value: string; start: number }
+	| { type: "mention"; doc: MentionedDocumentInfo; start: number };
+
+/**
+ * Tokenizes a user message into text and `@mention` segments.
+ *
+ * Pure: no React, no DOM, no side effects. Safe to unit-test and reuse.
+ *
+ * Mentions are matched greedily by longest title first so that a longer title
+ * (e.g. `@Project Roadmap`) is never shadowed by a shorter prefix
+ * (e.g. `@Project`).
+ */
+export function parseMentionSegments(
+	text: string,
+	docs: ReadonlyArray<MentionedDocumentInfo>
+): MentionSegment[] {
+	if (text.length === 0) return [];
+	if (docs.length === 0) return [{ type: "text", value: text, start: 0 }];
+
+	const tokens = docs
+		.map((doc) => ({ doc, token: `@${doc.title}` }))
+		.sort((a, b) => b.token.length - a.token.length);
+
+	const segments: MentionSegment[] = [];
+	let i = 0;
+	let buffer = "";
+	let bufferStart = 0;
+
+	while (i < text.length) {
+		const tokenMatch = tokens.find(({ token }) => text.startsWith(token, i));
+		if (tokenMatch) {
+			if (buffer) {
+				segments.push({ type: "text", value: buffer, start: bufferStart });
+				buffer = "";
+			}
+			segments.push({ type: "mention", doc: tokenMatch.doc, start: i });
+			i += tokenMatch.token.length;
+			bufferStart = i;
+			continue;
+		}
+		if (!buffer) bufferStart = i;
+		buffer += text[i];
+		i += 1;
+	}
+
+	if (buffer) {
+		segments.push({ type: "text", value: buffer, start: bufferStart });
+	}
+
+	return segments;
+}
diff --git a/surfsense_web/lib/chat/stream-flush.ts b/surfsense_web/lib/chat/stream-flush.ts
new file mode 100644
index 0000000000..6d13c9237d
--- /dev/null
+++ b/surfsense_web/lib/chat/stream-flush.ts
@@ -0,0 +1,19 @@
+import { FrameBatchedUpdater } from "@/lib/chat/streaming-state";
+
+export function createStreamFlushHelpers(flushMessages: () => void): {
+	batcher: FrameBatchedUpdater;
+	scheduleFlush: () => void;
+	forceFlush: () => void;
+} {
+	const batcher = new FrameBatchedUpdater();
+	const scheduleFlush = () => batcher.schedule(flushMessages);
+	// Force-flush helper: ``batcher.flush()`` is a no-op when
+	// ``dirty=false`` (e.g. a tool starts before any text streamed).
+	// ``scheduleFlush(); batcher.flush()`` sets the dirty bit first so
+	// terminal events render promptly without the throttle delay.
+	const forceFlush = () => {
+		scheduleFlush();
+		batcher.flush();
+	};
+	return { batcher, scheduleFlush, forceFlush };
+}
diff --git a/surfsense_web/lib/chat/stream-pipeline.ts b/surfsense_web/lib/chat/stream-pipeline.ts
new file mode 100644
index 0000000000..a0f90afeb3
--- /dev/null
+++ b/surfsense_web/lib/chat/stream-pipeline.ts
@@ -0,0 +1,204 @@
+import {
+	addStepSeparator,
+	addToolCall,
+	appendReasoning,
+	appendText,
+	appendToolInputDelta,
+	type ContentPartsState,
+	endReasoning,
+	readSSEStream,
+	type SSEEvent,
+	type ThinkingStepData,
+	type ToolUIGate,
+	updateThinkingSteps,
+	updateToolCall,
+} from "@/lib/chat/streaming-state";
+
+export type SharedStreamEventContext = {
+	contentPartsState: ContentPartsState;
+	toolsWithUI: ToolUIGate;
+	currentThinkingSteps: Map<string, ThinkingStepData>;
+	scheduleFlush: () => void;
+	forceFlush: () => void;
+	onTokenUsage?: (data: Extract<SSEEvent, { type: "data-token-usage" }>["data"]) => void;
+	onTurnStatus?: (data: Extract<SSEEvent, { type: "data-turn-status" }>["data"]) => void;
+	onToolOutputAvailable?: (
+		event: Extract<SSEEvent, { type: "tool-output-available" }>,
+		context: {
+			contentPartsState: ContentPartsState;
+			toolCallIndices: Map<string, number>;
+		}
+	) => void;
+};
+
+/**
+ * After a tool produces output, mark any previously-decided interrupt tool
+ * calls as completed so the ApprovalCard can transition from shimmer to done.
+ */
+export function markInterruptsCompleted(
+	contentParts: Array<{ type: string; result?: unknown }>
+): void {
+	for (const part of contentParts) {
+		if (
+			part.type === "tool-call" &&
+			typeof part.result === "object" &&
+			part.result !== null &&
+			(part.result as Record<string, unknown>).__interrupt__ === true &&
+			(part.result as Record<string, unknown>).__decided__ &&
+			!(part.result as Record<string, unknown>).__completed__
+		) {
+			part.result = { ...(part.result as Record<string, unknown>), __completed__: true };
+		}
+	}
+}
+
+export function hasPersistableContent(
+	contentParts: ContentPartsState["contentParts"],
+	toolsWithUI: ToolUIGate
+) {
+	return contentParts.some(
+		(part) =>
+			(part.type === "text" && part.text.length > 0) ||
+			(part.type === "reasoning" && part.text.length > 0) ||
+			(part.type === "tool-call" && (toolsWithUI === "all" || toolsWithUI.has(part.toolName)))
+	);
+}
+
+function toStreamTerminalError(
+	event: Extract<SSEEvent, { type: "error" }>
+): Error & { errorCode?: string } {
+	return Object.assign(new Error(event.errorText || "Server error"), {
+		errorCode: event.errorCode,
+	});
+}
+
+export function processSharedStreamEvent(
+	parsed: SSEEvent,
+	context: SharedStreamEventContext
+): boolean {
+	const { contentPartsState, toolsWithUI, currentThinkingSteps, scheduleFlush, forceFlush } =
+		context;
+	const { contentParts, toolCallIndices } = contentPartsState;
+
+	switch (parsed.type) {
+		case "text-delta":
+			appendText(contentPartsState, parsed.delta);
+			scheduleFlush();
+			return true;
+
+		case "reasoning-delta":
+			appendReasoning(contentPartsState, parsed.delta);
+			scheduleFlush();
+			return true;
+
+		case "reasoning-end":
+			endReasoning(contentPartsState);
+			scheduleFlush();
+			return true;
+
+		case "start-step":
+			addStepSeparator(contentPartsState);
+			scheduleFlush();
+			return true;
+
+		case "finish-step":
+			return true;
+
+		case "tool-input-start":
+			addToolCall(
+				contentPartsState,
+				toolsWithUI,
+				parsed.toolCallId,
+				parsed.toolName,
+				{},
+				false,
+				parsed.langchainToolCallId,
+				parsed.metadata
+			);
+			forceFlush();
+			return true;
+
+		case "tool-input-delta":
+			// High-frequency event: deltas can fire dozens of times per call,
+			// so use throttled scheduleFlush (NOT forceFlush) to coalesce.
+			appendToolInputDelta(contentPartsState, parsed.toolCallId, parsed.inputTextDelta);
+			scheduleFlush();
+			return true;
+
+		case "tool-input-available": {
+			const finalArgsText = JSON.stringify(parsed.input ?? {}, null, 2);
+			if (toolCallIndices.has(parsed.toolCallId)) {
+				updateToolCall(contentPartsState, parsed.toolCallId, {
+					args: parsed.input || {},
+					argsText: finalArgsText,
+					langchainToolCallId: parsed.langchainToolCallId,
+					metadata: parsed.metadata,
+				});
+			} else {
+				addToolCall(
+					contentPartsState,
+					toolsWithUI,
+					parsed.toolCallId,
+					parsed.toolName,
+					parsed.input || {},
+					false,
+					parsed.langchainToolCallId,
+					parsed.metadata
+				);
+				// addToolCall doesn't accept argsText today; backfill via
+				// updateToolCall so the new card renders pretty-printed JSON.
+				updateToolCall(contentPartsState, parsed.toolCallId, {
+					argsText: finalArgsText,
+				});
+			}
+			forceFlush();
+			return true;
+		}
+
+		case "tool-output-available":
+			updateToolCall(contentPartsState, parsed.toolCallId, {
+				result: parsed.output,
+				langchainToolCallId: parsed.langchainToolCallId,
+				metadata: parsed.metadata,
+			});
+			markInterruptsCompleted(contentParts);
+			context.onToolOutputAvailable?.(parsed, { contentPartsState, toolCallIndices });
+			forceFlush();
+			return true;
+
+		case "data-thinking-step": {
+			const stepData = parsed.data as ThinkingStepData;
+			if (stepData?.id) {
+				currentThinkingSteps.set(stepData.id, stepData);
+				const didUpdate = updateThinkingSteps(contentPartsState, currentThinkingSteps);
+				if (didUpdate) {
+					scheduleFlush();
+				}
+			}
+			return true;
+		}
+
+		case "data-token-usage":
+			context.onTokenUsage?.(parsed.data);
+			return true;
+
+		case "data-turn-status":
+			context.onTurnStatus?.(parsed.data);
+			return true;
+
+		case "error":
+			throw toStreamTerminalError(parsed);
+
+		default:
+			return false;
+	}
+}
+
+export async function consumeSseEvents(
+	response: Response,
+	onEvent: (event: SSEEvent) => void | Promise<void>
+): Promise<void> {
+	for await (const parsed of readSSEStream(response)) {
+		await onEvent(parsed);
+	}
+}
diff --git a/surfsense_web/lib/chat/stream-side-effects.ts b/surfsense_web/lib/chat/stream-side-effects.ts
new file mode 100644
index 0000000000..cdc2f74c63
--- /dev/null
+++ b/surfsense_web/lib/chat/stream-side-effects.ts
@@ -0,0 +1,59 @@
+import type { ThreadMessageLike } from "@assistant-ui/react";
+
+/**
+ * When a streamed message is persisted, the backend returns the durable
+ * turn_id; merge it into assistant-ui metadata for turn-scoped actions.
+ */
+export function mergeChatTurnIdIntoMessage(
+	msg: ThreadMessageLike,
+	turnId: string | null | undefined
+): ThreadMessageLike {
+	if (!turnId) return msg;
+	const existingMeta = (msg.metadata ?? {}) as { custom?: Record<string, unknown> };
+	const existingCustom = existingMeta.custom ?? {};
+	if ((existingCustom as { chatTurnId?: string }).chatTurnId === turnId) return msg;
+	return {
+		...msg,
+		metadata: {
+			...existingMeta,
+			custom: { ...existingCustom, chatTurnId: turnId },
+		},
+	};
+}
+
+export function readStreamedChatTurnId(data: unknown): string | null {
+	if (typeof data !== "object" || data === null) return null;
+	const value = (data as { chat_turn_id?: unknown }).chat_turn_id;
+	return typeof value === "string" && value.length > 0 ? value : null;
+}
+
+/**
+ * Parse the payload of `data-user-message-id` / `data-assistant-message-id`
+ * SSE events emitted by `stream_new_chat` and `stream_resume_chat` after
+ * `persist_user_turn` / `persist_assistant_shell` resolve a canonical
+ * `new_chat_messages.id`. Mirrors {@link readStreamedChatTurnId}.
+ *
+ * Returns `null` when the payload is malformed (missing or non-numeric
+ * `message_id`); callers should treat this as "ignore the event" so a
+ * malformed BE payload never overwrites the optimistic id with a bogus
+ * value.
+ */
+export function readStreamedMessageId(
+	data: unknown
+): { messageId: number; turnId: string | null } | null {
+	if (typeof data !== "object" || data === null) return null;
+	const obj = data as { message_id?: unknown; turn_id?: unknown };
+	if (typeof obj.message_id !== "number" || !Number.isFinite(obj.message_id)) {
+		return null;
+	}
+	const turnId = typeof obj.turn_id === "string" && obj.turn_id.length > 0 ? obj.turn_id : null;
+	return { messageId: obj.message_id, turnId };
+}
+
+export function applyTurnIdToAssistantMessageList(
+	messages: ThreadMessageLike[],
+	assistantMsgId: string,
+	turnId: string
+): ThreadMessageLike[] {
+	return messages.map((m) => (m.id === assistantMsgId ? mergeChatTurnIdIntoMessage(m, turnId) : m));
+}
diff --git a/surfsense_web/lib/chat/streaming-state.ts b/surfsense_web/lib/chat/streaming-state.ts
index 2f612a5528..d9fb2ac99f 100644
--- a/surfsense_web/lib/chat/streaming-state.ts
+++ b/surfsense_web/lib/chat/streaming-state.ts
@@ -5,26 +5,86 @@ export interface ThinkingStepData {
 	title: string;
 	status: "pending" | "in_progress" | "completed";
 	items: string[];
+	/**
+	 * Optional relay fields from ``data-thinking-step`` when present on the wire
+	 * (e.g. ``spanId``). Populated in a later slice; equality helpers ignore until wired.
+	 */
+	metadata?: Record<string, unknown>;
 }
 
 export type ContentPart =
 	| { type: "text"; text: string }
+	| { type: "reasoning"; text: string }
 	| {
 			type: "tool-call";
 			toolCallId: string;
 			toolName: string;
 			args: Record<string, unknown>;
 			result?: unknown;
+			/**
+			 * Live / finalized JSON text for the tool's input arguments.
+			 *
+			 * - During streaming: accumulated partial JSON text from
+			 *   ``tool-input-delta`` events (may be invalid JSON
+			 *   mid-stream). assistant-ui's argsText parser tolerates
+			 *   invalid JSON gracefully (changelog 0.7.32 / 0.7.78).
+			 * - On completion (``tool-input-available``): replaced with
+			 *   ``JSON.stringify(input, null, 2)`` so the post-stream
+			 *   card renders pretty-printed JSON instead of the
+			 *   model's possibly-fragmented formatting.
+			 *
+			 * Per assistant-ui ``ThreadMessageLike`` precedence
+			 * (changelog 0.11.6 ``d318c83``), when ``argsText`` is
+			 * supplied it wins over ``JSON.stringify(args)``.
+			 */
+			argsText?: string;
+			/**
+			 * Authoritative LangChain ``tool_call.id`` propagated by the backend
+			 * via ``langchainToolCallId`` on tool-input-start/available and
+			 * tool-output-available events. Used to join a card to the
+			 * matching ``AgentActionLog`` row exposed by
+			 * ``GET /threads/{id}/actions`` and the streamed
+			 * ``data-action-log`` events.
+			 */
+			langchainToolCallId?: string;
+			/**
+			 * Relay correlation from tool SSE (e.g. ``spanId``, ``thinkingStepId``).
+			 * Merged by ``mergeToolPartMetadata`` when events carry ``metadata``.
+			 */
+			metadata?: Record<string, unknown>;
 	  }
 	| {
 			type: "data-thinking-steps";
 			data: { steps: ThinkingStepData[] };
+	  }
+	| {
+			/**
+			 * Between-step separator. Pushed by `addStepSeparator` when
+			 * a `start-step` SSE event arrives AFTER the message already
+			 * has non-step content. Rendered by `StepSeparatorDataUI`
+			 * (see assistant-ui/step-separator.tsx).
+			 */
+			type: "data-step-separator";
+			data: { stepIndex: number };
 	  };
 
 export interface ContentPartsState {
 	contentParts: ContentPart[];
 	currentTextPartIndex: number;
+	currentReasoningPartIndex: number;
 	toolCallIndices: Map<string, number>;
+	/**
+	 * Set by the resume flow's rehydration to suppress
+	 * ``data-step-separator`` for the rest of this turn. Without it,
+	 * the resume stream's first ``start-step`` fires
+	 * ``addStepSeparator`` while rehydrated OLD content already makes
+	 * ``hasContent`` true → a divider lands between OLD and NEW
+	 * content with no semantic value (OLD content is folded by
+	 * ``buildTimeline`` + ``reconcileInterruptedAssistantMessages``,
+	 * persisted state carries no separator, so the line vanishes on
+	 * reload).
+	 */
+	suppressStepSeparators?: boolean;
 }
 
 function areThinkingStepsEqual(current: ThinkingStepData[], next: ThinkingStepData[]): boolean {
@@ -74,6 +134,9 @@ export function updateThinkingSteps(
 		if (state.currentTextPartIndex >= 0) {
 			state.currentTextPartIndex += 1;
 		}
+		if (state.currentReasoningPartIndex >= 0) {
+			state.currentReasoningPartIndex += 1;
+		}
 		for (const [id, idx] of state.toolCallIndices) {
 			state.toolCallIndices.set(id, idx + 1);
 		}
@@ -131,6 +194,12 @@ export class FrameBatchedUpdater {
 }
 
 export function appendText(state: ContentPartsState, delta: string): void {
+	// First text delta after a reasoning block: close the reasoning so
+	// the assistant-ui renderer treats them as separate parts (the
+	// reasoning block collapses; the answer streams below).
+	if (state.currentReasoningPartIndex >= 0) {
+		state.currentReasoningPartIndex = -1;
+	}
 	if (
 		state.currentTextPartIndex >= 0 &&
 		state.contentParts[state.currentTextPartIndex]?.type === "text"
@@ -143,46 +212,207 @@ export function appendText(state: ContentPartsState, delta: string): void {
 	}
 }
 
+export function appendReasoning(state: ContentPartsState, delta: string): void {
+	// Symmetric to appendText: open a fresh reasoning block on first
+	// delta, then accumulate into it. ``endReasoning`` simply closes
+	// the active block; subsequent reasoning deltas would open a new
+	// one (matching ``text-start/end`` semantics on the wire).
+	if (state.currentTextPartIndex >= 0) {
+		state.currentTextPartIndex = -1;
+	}
+	if (
+		state.currentReasoningPartIndex >= 0 &&
+		state.contentParts[state.currentReasoningPartIndex]?.type === "reasoning"
+	) {
+		(
+			state.contentParts[state.currentReasoningPartIndex] as {
+				type: "reasoning";
+				text: string;
+			}
+		).text += delta;
+	} else {
+		state.contentParts.push({ type: "reasoning", text: delta });
+		state.currentReasoningPartIndex = state.contentParts.length - 1;
+	}
+}
+
+export function endReasoning(state: ContentPartsState): void {
+	state.currentReasoningPartIndex = -1;
+}
+
+export function addStepSeparator(state: ContentPartsState): void {
+	// Push a divider between consecutive model steps within a single
+	// assistant turn. We only emit it when the message already has
+	// non-step content (so the FIRST step of a turn doesn't
+	// generate a leading separator) and when the previous part isn't
+	// itself a separator (defensive against duplicate `start-step`
+	// events). Also skipped during a resume turn (see
+	// ``suppressStepSeparators`` on ``ContentPartsState``).
+	if (state.suppressStepSeparators) return;
+	const hasContent = state.contentParts.some(
+		(p) => p.type === "text" || p.type === "reasoning" || p.type === "tool-call"
+	);
+	if (!hasContent) return;
+	const last = state.contentParts[state.contentParts.length - 1];
+	if (last && last.type === "data-step-separator") return;
+
+	const stepIndex = state.contentParts.filter((p) => p.type === "data-step-separator").length;
+	state.contentParts.push({ type: "data-step-separator", data: { stepIndex } });
+	state.currentTextPartIndex = -1;
+	state.currentReasoningPartIndex = -1;
+}
+
+/**
+ * Allowlist of tool names that should produce a UI tool card. The
+ * sentinel ``"all"`` matches every tool — we dropped the legacy
+ * ``BASE_TOOLS_WITH_UI`` gate so that ALL tool calls render via the
+ * generic ``ToolFallback``. The backend's ``format_thinking_step``
+ * summarisation and the defensive ``result_length``-only default for
+ * unknown tools keep persisted message JSON from ballooning.
+ */
+export type ToolUIGate = Set<string> | "all";
+
+function _toolPasses(gate: ToolUIGate, toolName: string): boolean {
+	return gate === "all" || gate.has(toolName);
+}
+
+/**
+ * Shallow-merge relay ``metadata`` into a tool-call part (SSE → content part).
+ * Keys already set on ``into`` are left unchanged so chunk vs canonical tool
+ * events cannot reorder or overwrite ``spanId`` / ``thinkingStepId``.
+ * Matches server ``AssistantContentBuilder`` merge semantics.
+ */
+function mergeToolPartMetadata(
+	into: Record<string, unknown>,
+	incoming: Record<string, unknown> | undefined
+): void {
+	if (!incoming) return;
+	for (const [k, v] of Object.entries(incoming)) {
+		if (k === "__proto__" || k === "constructor") continue;
+		if (!(k in into)) into[k] = v;
+	}
+}
+
 export function addToolCall(
 	state: ContentPartsState,
-	toolsWithUI: Set<string>,
+	toolsWithUI: ToolUIGate,
 	toolCallId: string,
 	toolName: string,
-	args: Record<string, unknown>
+	args: Record<string, unknown>,
+	force = false,
+	langchainToolCallId?: string,
+	metadata?: Record<string, unknown>
 ): void {
-	if (toolsWithUI.has(toolName)) {
+	if (force || _toolPasses(toolsWithUI, toolName)) {
+		const relayMeta: Record<string, unknown> = {};
+		mergeToolPartMetadata(relayMeta, metadata);
 		state.contentParts.push({
 			type: "tool-call",
 			toolCallId,
 			toolName,
 			args,
+			...(langchainToolCallId ? { langchainToolCallId } : {}),
+			...(Object.keys(relayMeta).length > 0 ? { metadata: relayMeta } : {}),
 		});
 		state.toolCallIndices.set(toolCallId, state.contentParts.length - 1);
 		state.currentTextPartIndex = -1;
+		state.currentReasoningPartIndex = -1;
 	}
 }
 
+/**
+ * Reverse-lookup helper used by the SSE ``data-action-log`` handler:
+ * given the LangChain ``tool_call.id`` (set on the content part as
+ * ``langchainToolCallId``), return the synthetic ``toolCallId`` that
+ * the chat tool card uses (``call_<run-id>``). Returns ``null`` when no
+ * matching tool card has been seen yet — the action is still recorded
+ * in the LC-id-keyed atom so the card can pick it up when it eventually
+ * arrives.
+ */
+export function findToolCallIdByLcId(
+	state: ContentPartsState,
+	lcToolCallId: string
+): string | null {
+	for (const part of state.contentParts) {
+		if (part.type === "tool-call" && part.langchainToolCallId === lcToolCallId) {
+			return part.toolCallId;
+		}
+	}
+	return null;
+}
+
 export function updateToolCall(
 	state: ContentPartsState,
 	toolCallId: string,
-	update: { args?: Record<string, unknown>; result?: unknown }
+	update: {
+		args?: Record<string, unknown>;
+		argsText?: string;
+		result?: unknown;
+		langchainToolCallId?: string;
+		metadata?: Record<string, unknown>;
+	}
 ): void {
 	const index = state.toolCallIndices.get(toolCallId);
 	if (index !== undefined && state.contentParts[index]?.type === "tool-call") {
 		const tc = state.contentParts[index] as ContentPart & { type: "tool-call" };
 		if (update.args) tc.args = update.args;
+		// ``!== undefined`` (NOT a truthy check): an explicit empty
+		// string CAN clear, and a finalization with
+		// ``JSON.stringify({}, null, 2) === "{}"`` (truthy but
+		// represents an empty-input call) still applies.
+		if (update.argsText !== undefined) tc.argsText = update.argsText;
 		if (update.result !== undefined) tc.result = update.result;
+		// Only backfill langchainToolCallId if not already set — the
+		// authoritative ``on_tool_end`` value should override an earlier
+		// best-effort match, but a NULL late-arriving value should not
+		// blow away a known good early one.
+		if (update.langchainToolCallId && !tc.langchainToolCallId) {
+			tc.langchainToolCallId = update.langchainToolCallId;
+		}
+		if (update.metadata && Object.keys(update.metadata).length > 0) {
+			const md = (tc.metadata ?? {}) as Record<string, unknown>;
+			mergeToolPartMetadata(md, update.metadata);
+			tc.metadata = md;
+		}
 	}
 }
 
+/**
+ * Append a streamed args-delta chunk to the active tool call's
+ * ``argsText``. No-ops when no card has been registered yet for the
+ * given ``toolCallId`` (the matching ``tool-input-start`` either lost
+ * the wire race or this id never had a card — either way the deltas
+ * have nowhere safe to land).
+ */
+export function appendToolInputDelta(
+	state: ContentPartsState,
+	toolCallId: string,
+	delta: string
+): void {
+	const idx = state.toolCallIndices.get(toolCallId);
+	if (idx === undefined) return;
+	const tc = state.contentParts[idx];
+	if (tc?.type !== "tool-call") return;
+	tc.argsText = (tc.argsText ?? "") + delta;
+}
+
+function _hasInterruptResult(part: ContentPart): boolean {
+	if (part.type !== "tool-call") return false;
+	const r = (part as { result?: unknown }).result;
+	return typeof r === "object" && r !== null && "__interrupt__" in r;
+}
+
 export function buildContentForUI(
 	state: ContentPartsState,
-	toolsWithUI: Set<string>
+	toolsWithUI: ToolUIGate
 ): ThreadMessageLike["content"] {
 	const filtered = state.contentParts.filter((part) => {
 		if (part.type === "text") return part.text.length > 0;
-		if (part.type === "tool-call") return toolsWithUI.has(part.toolName);
+		if (part.type === "reasoning") return part.text.length > 0;
+		if (part.type === "tool-call")
+			return _toolPasses(toolsWithUI, part.toolName) || _hasInterruptResult(part);
 		if (part.type === "data-thinking-steps") return true;
+		if (part.type === "data-step-separator") return true;
 		return false;
 	});
 	return filtered.length > 0
@@ -192,17 +422,28 @@ export function buildContentForUI(
 
 export function buildContentForPersistence(
 	state: ContentPartsState,
-	toolsWithUI: Set<string>
+	toolsWithUI: ToolUIGate
 ): unknown[] {
 	const parts: unknown[] = [];
 
 	for (const part of state.contentParts) {
 		if (part.type === "text" && part.text.length > 0) {
 			parts.push(part);
-		} else if (part.type === "tool-call" && toolsWithUI.has(part.toolName)) {
+		} else if (part.type === "reasoning" && part.text.length > 0) {
+			// Persist reasoning blocks so a chat reload re-renders the
+			// collapsed thinking section instead of
+			// silently dropping it (mirrors the data-thinking-steps
+			// branch above).
+			parts.push(part);
+		} else if (
+			part.type === "tool-call" &&
+			(_toolPasses(toolsWithUI, part.toolName) || _hasInterruptResult(part))
+		) {
 			parts.push(part);
 		} else if (part.type === "data-thinking-steps") {
 			parts.push(part);
+		} else if (part.type === "data-step-separator") {
+			parts.push(part);
 		}
 	}
 
@@ -210,24 +451,202 @@ export function buildContentForPersistence(
 }
 
 export type SSEEvent =
-	| { type: "text-delta"; delta: string }
-	| { type: "tool-input-start"; toolCallId: string; toolName: string }
+	| { type: "start"; messageId?: string }
+	| { type: "finish" }
+	| { type: "start-step" }
+	| { type: "finish-step" }
+	| { type: "text-start"; id: string }
+	| { type: "text-delta"; id?: string; delta: string }
+	| { type: "text-end"; id: string }
+	| { type: "reasoning-start"; id: string }
+	| { type: "reasoning-delta"; id?: string; delta: string }
+	| { type: "reasoning-end"; id: string }
+	| {
+			type: "tool-input-start";
+			toolCallId: string;
+			toolName: string;
+			/** Authoritative LangChain ``tool_call.id``. Optional. */
+			langchainToolCallId?: string;
+			/** Optional JSON object from tool SSE (same keys as persisted tool-call metadata). */
+			metadata?: Record<string, unknown>;
+	  }
+	| {
+			/**
+			 * Live tool-call argument delta. Concatenated into
+			 * ``argsText`` on the matching ``tool-call`` content part
+			 * by ``appendToolInputDelta``. Some providers emit
+			 * ``tool-input-available`` without prior deltas.
+			 */
+			type: "tool-input-delta";
+			toolCallId: string;
+			inputTextDelta: string;
+	  }
 	| {
 			type: "tool-input-available";
 			toolCallId: string;
 			toolName: string;
 			input: Record<string, unknown>;
+			langchainToolCallId?: string;
+			metadata?: Record<string, unknown>;
 	  }
 	| {
 			type: "tool-output-available";
 			toolCallId: string;
 			output: Record<string, unknown>;
+			/** Authoritative LangChain ``tool_call.id`` extracted from
+			 * ``ToolMessage.tool_call_id`` at on_tool_end. Backfills cards
+			 * that didn't get the id at tool-input-start time. */
+			langchainToolCallId?: string;
+			metadata?: Record<string, unknown>;
 	  }
 	| { type: "data-thinking-step"; data: ThinkingStepData }
 	| { type: "data-thread-title-update"; data: { threadId: number; title: string } }
 	| { type: "data-interrupt-request"; data: Record<string, unknown> }
 	| { type: "data-documents-updated"; data: Record<string, unknown> }
-	| { type: "error"; errorText: string };
+	| {
+			/**
+			 * A freshly committed AgentActionLog row. Frontend stores
+			 * this in a Map keyed off ``lc_tool_call_id`` so the chat
+			 * tool card can light up its Revert button.
+			 */
+			type: "data-action-log";
+			data: {
+				id: number;
+				lc_tool_call_id: string | null;
+				chat_turn_id: string | null;
+				tool_name: string;
+				reversible: boolean;
+				reverse_descriptor_present: boolean;
+				created_at: string | null;
+				error: boolean;
+			};
+	  }
+	| {
+			/**
+			 * Reversibility flipped (filesystem op SAVEPOINT committed;
+			 * cf. ``kb_persistence._dispatch_reversibility_update``).
+			 */
+			type: "data-action-log-updated";
+			data: { id: number; reversible: boolean };
+	  }
+	| {
+			/**
+			 * Emitted at the start of every stream so the frontend can
+			 * stamp the per-turn correlation id onto the in-flight
+			 * assistant message and replay it via
+			 * ``appendMessage``. Pure-text turns never produce
+			 * action-log events; this event guarantees the frontend
+			 * always learns the turn id.
+			 */
+			type: "data-turn-info";
+			data: { chat_turn_id: string };
+	  }
+	| {
+			/**
+			 * Emitted by ``stream_new_chat`` AFTER ``data-turn-info`` /
+			 * ``data-turn-status`` and BEFORE any LLM streaming events,
+			 * once ``persist_user_turn`` has resolved the canonical
+			 * ``new_chat_messages.id`` for the user-side row of the
+			 * current turn. The frontend renames its optimistic
+			 * ``msg-user-XXX`` placeholder id to ``msg-{message_id}``
+			 * so DB-id-gated UI (comments, edit-from-this-message)
+			 * unlocks immediately. Not emitted by ``stream_resume_chat``
+			 * (resume reuses the original turn's user message).
+			 */
+			type: "data-user-message-id";
+			data: { message_id: number; turn_id: string };
+	  }
+	| {
+			/**
+			 * Emitted by ``stream_new_chat`` AND ``stream_resume_chat``
+			 * AFTER ``data-turn-info`` / ``data-turn-status`` and BEFORE
+			 * any LLM streaming events, once ``persist_assistant_shell``
+			 * has resolved the canonical ``new_chat_messages.id`` for
+			 * the assistant-side row of the current turn. The frontend
+			 * renames its optimistic ``msg-assistant-XXX`` placeholder
+			 * id, migrates the local ``tokenUsageStore`` and
+			 * ``pendingInterrupts`` entries, and binds the running
+			 * mutable ``assistantMsgId`` closure variable to the
+			 * canonical id for the rest of the stream.
+			 */
+			type: "data-assistant-message-id";
+			data: { message_id: number; turn_id: string };
+	  }
+	| {
+			/**
+			 * Best-effort revert pass that ran BEFORE this regeneration.
+			 * Per-action results are forwarded to the UI so the user
+			 * can see which downstream actions were rolled
+			 * back vs which couldn't be undone.
+			 */
+			type: "data-revert-results";
+			data: {
+				status: "ok" | "partial";
+				chat_turn_ids: string[];
+				total: number;
+				reverted: number;
+				already_reverted: number;
+				not_reversible: number;
+				/**
+				 * ``permission_denied`` and ``skipped`` are first-class
+				 * counters so the response invariant
+				 * ``total === sum(counters)`` always holds. Optional
+				 * for forward compatibility with older backends; the
+				 * frontend treats missing values as ``0``.
+				 */
+				permission_denied?: number;
+				failed: number;
+				skipped?: number;
+				results: Array<{
+					action_id: number;
+					tool_name: string;
+					status:
+						| "reverted"
+						| "already_reverted"
+						| "not_reversible"
+						| "permission_denied"
+						| "failed"
+						| "skipped";
+					message?: string | null;
+					new_action_id?: number | null;
+					error?: string | null;
+				}>;
+			};
+	  }
+	| {
+			type: "data-turn-status";
+			data: {
+				status: "idle" | "busy" | "cancelling";
+				retry_after_ms?: number;
+				retry_after_at?: number;
+			};
+	  }
+	| {
+			type: "data-token-usage";
+			data: {
+				usage: Record<
+					string,
+					{
+						prompt_tokens: number;
+						completion_tokens: number;
+						total_tokens: number;
+						cost_micros?: number;
+					}
+				>;
+				prompt_tokens: number;
+				completion_tokens: number;
+				total_tokens: number;
+				cost_micros?: number;
+				call_details: Array<{
+					model: string;
+					prompt_tokens: number;
+					completion_tokens: number;
+					total_tokens: number;
+					cost_micros?: number;
+				}>;
+			};
+	  }
+	| { type: "error"; errorText: string; errorCode?: string };
 
 /**
  * Async generator that reads an SSE stream and yields parsed JSON objects.
diff --git a/surfsense_web/lib/chat/thread-persistence.ts b/surfsense_web/lib/chat/thread-persistence.ts
index 08c08ba78f..4cf9b8e68f 100644
--- a/surfsense_web/lib/chat/thread-persistence.ts
+++ b/surfsense_web/lib/chat/thread-persistence.ts
@@ -26,6 +26,27 @@ export interface ThreadRecord {
 	has_comments?: boolean;
 }
 
+export interface TokenUsageSummary {
+	prompt_tokens: number;
+	completion_tokens: number;
+	total_tokens: number;
+	/**
+	 * Total provider USD cost for this assistant turn, in micro-USD
+	 * (1_000_000 = $1.00). Optional because rows persisted before the
+	 * cost-credits migration won't have it.
+	 */
+	cost_micros?: number;
+	model_breakdown?: Record<
+		string,
+		{
+			prompt_tokens: number;
+			completion_tokens: number;
+			total_tokens: number;
+			cost_micros?: number;
+		}
+	> | null;
+}
+
 export interface MessageRecord {
 	id: number;
 	thread_id: number;
@@ -35,6 +56,12 @@ export interface MessageRecord {
 	author_id?: string | null;
 	author_display_name?: string | null;
 	author_avatar_url?: string | null;
+	token_usage?: TokenUsageSummary | null;
+	// Per-turn correlation id from ``configurable.turn_id`` at streaming
+	// time (added in migration 136). Used by the per-turn revert
+	// endpoint and edit-from-arbitrary-position. Nullable on legacy
+	// rows that predate the column.
+	turn_id?: string | null;
 }
 
 export interface ThreadListResponse {
@@ -111,11 +138,32 @@ export async function getThreadMessages(threadId: number): Promise<ThreadHistory
 }
 
 /**
- * Append a message to a thread
+ * Append a message to a thread.
+ *
+ * ``turn_id`` is the per-turn correlation id streamed by the backend
+ * via ``data-turn-info``. Persisting it lets later edits locate the
+ * matching LangGraph checkpoint without HumanMessage scanning. Older
+ * callers can still omit it for back-compat.
+ *
+ * @deprecated Replaced by the SSE-based message ID handshake. The
+ * streaming generator (`stream_new_chat` / `stream_resume_chat`) now
+ * persists both the user and assistant rows server-side via
+ * `persist_user_turn` / `persist_assistant_shell` and emits
+ * `data-user-message-id` / `data-assistant-message-id` SSE events so
+ * the UI renames its optimistic IDs in real time. The only remaining
+ * caller is `persistAssistantErrorMessage` (pre-stream error fallback
+ * for requests the server never accepted — the server has nothing to
+ * persist in that case). After the legacy route is removed in a
+ * follow-up PR this function will be deleted entirely.
  */
 export async function appendMessage(
 	threadId: number,
-	message: { role: "user" | "assistant" | "system"; content: unknown }
+	message: {
+		role: "user" | "assistant" | "system";
+		content: unknown;
+		token_usage?: unknown;
+		turn_id?: string | null;
+	}
 ): Promise<MessageRecord> {
 	return baseApiService.post<MessageRecord>(`/api/v1/threads/${threadId}/messages`, undefined, {
 		body: message,
@@ -183,104 +231,3 @@ export function getRegenerateUrl(threadId: number): string {
 	const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
 	return `${backendUrl}/api/v1/threads/${threadId}/regenerate`;
 }
-
-// =============================================================================
-// Thread List Manager (for thread list sidebar)
-// =============================================================================
-
-export interface ThreadListAdapterConfig {
-	searchSpaceId: number;
-	currentThreadId: number | null;
-	onThreadSwitch: (threadId: number) => void;
-	onNewThread: (threadId: number) => void;
-}
-
-export interface ThreadListState {
-	threads: ThreadListItem[];
-	archivedThreads: ThreadListItem[];
-	isLoading: boolean;
-	error: string | null;
-}
-
-/**
- * Creates a thread list management object.
- * This provides methods to manage the thread list for the sidebar.
- */
-export function createThreadListManager(config: ThreadListAdapterConfig) {
-	return {
-		async loadThreads(): Promise<ThreadListState> {
-			try {
-				const response = await fetchThreads(config.searchSpaceId);
-				return {
-					threads: response.threads,
-					archivedThreads: response.archived_threads,
-					isLoading: false,
-					error: null,
-				};
-			} catch (error) {
-				console.error("[ThreadListManager] Failed to load threads:", error);
-				return {
-					threads: [],
-					archivedThreads: [],
-					isLoading: false,
-					error: error instanceof Error ? error.message : "Failed to load threads",
-				};
-			}
-		},
-
-		async createNewThread(title = "New Chat"): Promise<number | null> {
-			try {
-				const thread = await createThread(config.searchSpaceId, title);
-				config.onNewThread(thread.id);
-				return thread.id;
-			} catch (error) {
-				console.error("[ThreadListManager] Failed to create thread:", error);
-				return null;
-			}
-		},
-
-		switchToThread(threadId: number) {
-			config.onThreadSwitch(threadId);
-		},
-
-		async renameThread(threadId: number, newTitle: string): Promise<boolean> {
-			try {
-				await updateThread(threadId, { title: newTitle });
-				return true;
-			} catch (error) {
-				console.error("[ThreadListManager] Failed to rename thread:", error);
-				return false;
-			}
-		},
-
-		async archiveThread(threadId: number): Promise<boolean> {
-			try {
-				await updateThread(threadId, { archived: true });
-				return true;
-			} catch (error) {
-				console.error("[ThreadListManager] Failed to archive thread:", error);
-				return false;
-			}
-		},
-
-		async unarchiveThread(threadId: number): Promise<boolean> {
-			try {
-				await updateThread(threadId, { archived: false });
-				return true;
-			} catch (error) {
-				console.error("[ThreadListManager] Failed to unarchive thread:", error);
-				return false;
-			}
-		},
-
-		async deleteThread(threadId: number): Promise<boolean> {
-			try {
-				await deleteThread(threadId);
-				return true;
-			} catch (error) {
-				console.error("[ThreadListManager] Failed to delete thread:", error);
-				return false;
-			}
-		},
-	};
-}
diff --git a/surfsense_web/lib/chat/user-turn-api-parts.ts b/surfsense_web/lib/chat/user-turn-api-parts.ts
new file mode 100644
index 0000000000..5e063492fb
--- /dev/null
+++ b/surfsense_web/lib/chat/user-turn-api-parts.ts
@@ -0,0 +1,56 @@
+import type { AppendMessage } from "@assistant-ui/react";
+
+const MAX_IMAGES = 4;
+
+export type NewChatUserImagePayload = {
+	media_type: "image/png" | "image/jpeg" | "image/webp";
+	data: string;
+};
+
+function dataUrlToPayload(dataUrl: string): NewChatUserImagePayload | null {
+	const m = /^data:(image\/(?:png|jpeg|webp|jpg));base64,([\s\S]+)$/i.exec(dataUrl.trim());
+	if (!m) return null;
+	let media = m[1].toLowerCase() as string;
+	if (media === "image/jpg") media = "image/jpeg";
+	if (media !== "image/png" && media !== "image/jpeg" && media !== "image/webp") return null;
+	const data = m[2].replace(/\s/g, "");
+	if (!data) return null;
+	return { media_type: media as NewChatUserImagePayload["media_type"], data };
+}
+
+function collectImageDataUrlsFromParts(parts: AppendMessage["content"]): string[] {
+	const out: string[] = [];
+	for (const part of parts) {
+		if (typeof part !== "object" || part === null || !("type" in part)) continue;
+		if (part.type !== "image") continue;
+		const img = "image" in part && typeof part.image === "string" ? part.image : null;
+		if (img && dataUrlToPayload(img)) out.push(img);
+	}
+	return out;
+}
+
+export function extractUserTurnForNewChatApi(
+	message: AppendMessage,
+	extraDataUrls: readonly string[]
+): { userQuery: string; userImages: NewChatUserImagePayload[] } {
+	let userQuery = "";
+	for (const part of message.content) {
+		if (part.type === "text") {
+			userQuery += part.text;
+		}
+	}
+
+	const merged = [...extraDataUrls, ...collectImageDataUrlsFromParts(message.content)];
+	const payloads: NewChatUserImagePayload[] = [];
+	const seen = new Set<string>();
+	for (const url of merged) {
+		const p = dataUrlToPayload(url);
+		if (!p) continue;
+		if (seen.has(p.data)) continue;
+		seen.add(p.data);
+		payloads.push(p);
+		if (payloads.length >= MAX_IMAGES) break;
+	}
+
+	return { userQuery, userImages: payloads };
+}
diff --git a/surfsense_web/lib/chat/virtual-path-display.ts b/surfsense_web/lib/chat/virtual-path-display.ts
new file mode 100644
index 0000000000..792a06394b
--- /dev/null
+++ b/surfsense_web/lib/chat/virtual-path-display.ts
@@ -0,0 +1,55 @@
+/**
+ * Pure helpers for turning agent-facing virtual paths into human-friendly
+ * chip labels.
+ *
+ * Why this lives in `lib` and not in the UI component:
+ * - Pure function = trivial to unit-test (no React, no DOM).
+ * - Used in two render sites today (the user-message chip and the AI-answer
+ *   `MentionChip`) and likely more (history search, share-card previews, etc).
+ *   Centralising the rules keeps the agent's path encoding and the UI's
+ *   decoding from drifting apart.
+ *
+ * The agent emits paths under `/documents/...` with two encoding rules
+ * applied by `surfsense_backend/app/agents/new_chat/path_resolver.py`:
+ *
+ * 1. Every basename ends with `.xml` (so the LLM treats KB documents as XML
+ *    files). Display layer strips this — users think of the underlying
+ *    filename, not the LLM's wrapper.
+ * 2. Title collisions get a ` (<doc_id>).xml` disambiguation suffix.
+ *    Display layer strips the parenthesised id since it's an implementation
+ *    detail, not user-facing identity.
+ */
+
+const XML_EXTENSION_RE = /\.xml$/i;
+const DOC_ID_DISAMBIG_RE = /\s\(\d+\)$/;
+
+export interface VirtualPathDisplay {
+	/** Human-friendly leaf name with `.xml` and ` (<doc_id>)` suffixes stripped. */
+	displayName: string;
+	/** Whether the path points to a folder (trailing slash) rather than a file. */
+	isFolder: boolean;
+}
+
+/**
+ * Decode a virtual path into the label that should appear in chip UI.
+ *
+ * Folder detection uses the trailing-slash convention the agent already
+ * follows in `<priority_documents>` and `KnowledgeTreeMiddleware`. Falls
+ * back to the raw path if nothing else can be extracted (defensive — the
+ * caller will at least show *something*).
+ */
+export function getVirtualPathDisplay(path: string): VirtualPathDisplay {
+	const trimmed = (path ?? "").trim();
+	if (!trimmed) return { displayName: "", isFolder: false };
+
+	const isFolder = trimmed.endsWith("/");
+	const normalized = trimmed.replace(/\/+$/, "");
+	const segments = normalized.split("/").filter(Boolean);
+	const leaf = segments.at(-1);
+	if (!leaf) return { displayName: trimmed, isFolder };
+
+	const withoutXml = leaf.replace(XML_EXTENSION_RE, "");
+	const displayName = withoutXml.replace(DOC_ID_DISAMBIG_RE, "");
+
+	return { displayName: displayName || leaf, isFolder };
+}
diff --git a/surfsense_web/lib/citations/citation-parser.ts b/surfsense_web/lib/citations/citation-parser.ts
new file mode 100644
index 0000000000..533c644c24
--- /dev/null
+++ b/surfsense_web/lib/citations/citation-parser.ts
@@ -0,0 +1,130 @@
+// Pure citation parsing for `[citation:...]` tokens emitted by SurfSense
+// agents. No React imports — consumed by both the React renderer
+// (markdown surfaces) and the Plate value transform (document viewer).
+//
+// The same logic previously lived inline in
+// `components/assistant-ui/markdown-text.tsx` with module-level mutable
+// state. This module exposes a per-call URL map so multiple concurrent
+// renderers / SSR contexts can't race each other.
+
+import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
+
+/**
+ * Matches `[citation:...]` with numeric IDs (incl. negative, doc- prefix,
+ * comma-separated), URL-based IDs from live web search, or `urlciteN`
+ * placeholders produced by `preprocessCitationMarkdown`.
+ *
+ * Also matches Chinese brackets 【】 and zero-width spaces that LLMs
+ * sometimes emit.
+ */
+export const CITATION_REGEX =
+	/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
+
+/** A single parsed citation reference. */
+export type CitationToken =
+	| { kind: "url"; url: string }
+	| { kind: "chunk"; chunkId: number; isDocsChunk: boolean };
+
+/** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
+export type ParsedSegment = string | CitationToken;
+
+/** Per-call URL placeholder map; key is `urlciteN`, value is the original URL. */
+export type CitationUrlMap = Map<string, string>;
+
+/** Result of preprocessing raw markdown for downstream parsing. */
+export interface PreprocessedCitations {
+	/** Markdown with `[citation:URL]` tokens rewritten to `[citation:urlciteN]`. */
+	content: string;
+	/** Lookup table to recover the original URL from each placeholder. */
+	urlMap: CitationUrlMap;
+}
+
+/** Pattern matching only URL-form citations (used during preprocessing). */
+const URL_CITATION_REGEX = /[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+)\s*\u200B?[\]】]/g;
+
+/**
+ * Replace `[citation:URL]` tokens with `[citation:urlciteN]` placeholders so
+ * GFM autolinks don't split the URL out of the brackets during markdown
+ * parsing. Returns both the rewritten content and a map for later lookup.
+ *
+ * Code-fence aware: skips fenced (``` ``` ```) and inline (`` ` ``) code
+ * regions so citation-shaped strings inside example code remain literal.
+ *
+ * Known limitations: `~~~` fences, 4-space indented code, and LaTeX math
+ * blocks are not skipped. Citation tokens inside those regions are rare in
+ * practice; documented in the plan.
+ */
+export function preprocessCitationMarkdown(content: string): PreprocessedCitations {
+	const urlMap: CitationUrlMap = new Map();
+	let counter = 0;
+
+	// Splitting on a regex with one capture group puts code regions at odd
+	// indexes (matched delimiters) and the surrounding text at even indexes.
+	// Only transform the even-indexed parts.
+	const parts = content.split(FENCED_OR_INLINE_CODE);
+	const transformed = parts.map((part, index) => {
+		if (index % 2 === 1) return part;
+		return part.replace(URL_CITATION_REGEX, (_match, url: string) => {
+			const key = `urlcite${counter++}`;
+			urlMap.set(key, url.trim());
+			return `[citation:${key}]`;
+		});
+	});
+
+	return { content: transformed.join(""), urlMap };
+}
+
+/**
+ * Parse a string into an array of plain text segments and citation tokens.
+ *
+ * Pure data — no React. The renderer module is responsible for mapping
+ * tokens to JSX. Negative chunk IDs are forwarded as-is so the consumer
+ * can decide how to render anonymous documents.
+ */
+export function parseTextWithCitations(text: string, urlMap: CitationUrlMap): ParsedSegment[] {
+	const segments: ParsedSegment[] = [];
+	let lastIndex = 0;
+	let match: RegExpExecArray | null;
+
+	CITATION_REGEX.lastIndex = 0;
+	match = CITATION_REGEX.exec(text);
+	while (match !== null) {
+		if (match.index > lastIndex) {
+			segments.push(text.substring(lastIndex, match.index));
+		}
+
+		const captured = match[1];
+
+		if (captured.startsWith("http://") || captured.startsWith("https://")) {
+			segments.push({ kind: "url", url: captured.trim() });
+		} else if (captured.startsWith("urlcite")) {
+			const url = urlMap.get(captured);
+			if (url) {
+				segments.push({ kind: "url", url });
+			}
+		} else {
+			const rawIds = captured.split(",").map((s) => s.trim());
+			for (const rawId of rawIds) {
+				const isDocsChunk = rawId.startsWith("doc-");
+				const chunkId = Number.parseInt(isDocsChunk ? rawId.slice(4) : rawId, 10);
+				if (!Number.isNaN(chunkId)) {
+					segments.push({ kind: "chunk", chunkId, isDocsChunk });
+				}
+			}
+		}
+
+		lastIndex = match.index + match[0].length;
+		match = CITATION_REGEX.exec(text);
+	}
+
+	if (lastIndex < text.length) {
+		segments.push(text.substring(lastIndex));
+	}
+
+	return segments.length > 0 ? segments : [text];
+}
+
+/** Type guard for the citation branch of `ParsedSegment`. */
+export function isCitationToken(segment: ParsedSegment): segment is CitationToken {
+	return typeof segment !== "string";
+}
diff --git a/surfsense_web/lib/comments/utils.ts b/surfsense_web/lib/comments/utils.ts
new file mode 100644
index 0000000000..acbcee5065
--- /dev/null
+++ b/surfsense_web/lib/comments/utils.ts
@@ -0,0 +1,4 @@
+export function convertRenderedToDisplay(contentRendered: string): string {
+	// Convert @{DisplayName} format to @DisplayName for editing
+	return contentRendered.replace(/@\{([^}]+)\}/g, "@$1");
+}
diff --git a/surfsense_web/lib/desktop-download-utils.ts b/surfsense_web/lib/desktop-download-utils.ts
new file mode 100644
index 0000000000..d4e2d4e68f
--- /dev/null
+++ b/surfsense_web/lib/desktop-download-utils.ts
@@ -0,0 +1,108 @@
+import { useEffect, useMemo, useState } from "react";
+
+export type OSInfo = {
+	os: "macOS" | "Windows" | "Linux";
+	arch: "arm64" | "x64";
+};
+
+export function useUserOS(): OSInfo {
+	const [info, setInfo] = useState<OSInfo>({ os: "macOS", arch: "arm64" });
+	useEffect(() => {
+		const ua = navigator.userAgent;
+		let os: OSInfo["os"] = "macOS";
+		let arch: OSInfo["arch"] = "x64";
+
+		if (/Windows/i.test(ua)) {
+			os = "Windows";
+			arch = "x64";
+		} else if (/Linux/i.test(ua)) {
+			os = "Linux";
+			arch = "x64";
+		} else {
+			os = "macOS";
+			arch = /Mac/.test(ua) && !/Intel/.test(ua) ? "arm64" : "arm64";
+		}
+
+		const uaData = (navigator as Navigator & { userAgentData?: { architecture?: string } })
+			.userAgentData;
+		if (uaData?.architecture === "arm") arch = "arm64";
+		else if (uaData?.architecture === "x86") arch = "x64";
+
+		setInfo({ os, arch });
+	}, []);
+	return info;
+}
+
+export interface ReleaseAsset {
+	name: string;
+	url: string;
+}
+
+export function useLatestRelease() {
+	const [assets, setAssets] = useState<ReleaseAsset[]>([]);
+
+	useEffect(() => {
+		const controller = new AbortController();
+		fetch("https://api.github.com/repos/MODSetter/SurfSense/releases/latest", {
+			signal: controller.signal,
+		})
+			.then((r) => r.json())
+			.then((data) => {
+				if (data?.assets) {
+					setAssets(
+						data.assets
+							.filter((a: { name: string }) => /\.(exe|dmg|AppImage|deb)$/.test(a.name))
+							.map((a: { name: string; browser_download_url: string }) => ({
+								name: a.name,
+								url: a.browser_download_url,
+							}))
+					);
+				}
+			})
+			.catch(() => {});
+		return () => controller.abort();
+	}, []);
+
+	return assets;
+}
+
+export const ASSET_LABELS: Record<string, string> = {
+	".exe": "Windows (exe)",
+	"-arm64.dmg": "macOS Apple Silicon (dmg)",
+	"-x64.dmg": "macOS Intel (dmg)",
+	"-arm64.zip": "macOS Apple Silicon (zip)",
+	"-x64.zip": "macOS Intel (zip)",
+	".AppImage": "Linux (AppImage)",
+	".deb": "Linux (deb)",
+};
+
+export function getAssetLabel(name: string): string {
+	for (const [suffix, label] of Object.entries(ASSET_LABELS)) {
+		if (name.endsWith(suffix)) return label;
+	}
+	return name;
+}
+
+export const GITHUB_RELEASES_URL = "https://github.com/MODSetter/SurfSense/releases/latest";
+
+export function usePrimaryDownload() {
+	const { os, arch } = useUserOS();
+	const assets = useLatestRelease();
+
+	const { primary, alternatives } = useMemo(() => {
+		if (assets.length === 0) return { primary: null, alternatives: [] };
+
+		const matchers: Record<string, (n: string) => boolean> = {
+			Windows: (n) => n.endsWith(".exe"),
+			macOS: (n) => n.endsWith(`-${arch}.dmg`),
+			Linux: (n) => n.endsWith(".AppImage"),
+		};
+
+		const match = matchers[os];
+		const primary = assets.find((a) => match(a.name)) ?? null;
+		const alternatives = assets.filter((a) => a !== primary);
+		return { primary, alternatives };
+	}, [assets, os, arch]);
+
+	return { os, arch, assets, primary, alternatives };
+}
diff --git a/surfsense_web/lib/documents/document-type-labels.ts b/surfsense_web/lib/documents/document-type-labels.ts
new file mode 100644
index 0000000000..8449618863
--- /dev/null
+++ b/surfsense_web/lib/documents/document-type-labels.ts
@@ -0,0 +1,41 @@
+export function getDocumentTypeLabel(type: string): string {
+	const labelMap: Record<string, string> = {
+		EXTENSION: "Extension",
+		CRAWLED_URL: "Web Page",
+		FILE: "File",
+		SLACK_CONNECTOR: "Slack",
+		TEAMS_CONNECTOR: "Microsoft Teams",
+		ONEDRIVE_FILE: "OneDrive",
+		DROPBOX_FILE: "Dropbox",
+		NOTION_CONNECTOR: "Notion",
+		YOUTUBE_VIDEO: "YouTube Video",
+		GITHUB_CONNECTOR: "GitHub",
+		LINEAR_CONNECTOR: "Linear",
+		DISCORD_CONNECTOR: "Discord",
+		JIRA_CONNECTOR: "Jira",
+		CONFLUENCE_CONNECTOR: "Confluence",
+		CLICKUP_CONNECTOR: "ClickUp",
+		GOOGLE_CALENDAR_CONNECTOR: "Google Calendar",
+		GOOGLE_GMAIL_CONNECTOR: "Gmail",
+		GOOGLE_DRIVE_FILE: "Google Drive",
+		AIRTABLE_CONNECTOR: "Airtable",
+		LUMA_CONNECTOR: "Luma",
+		ELASTICSEARCH_CONNECTOR: "Elasticsearch",
+		BOOKSTACK_CONNECTOR: "BookStack",
+		CIRCLEBACK: "Circleback",
+		OBSIDIAN_CONNECTOR: "Obsidian",
+		LOCAL_FOLDER_FILE: "Local Folder",
+		SURFSENSE_DOCS: "SurfSense Docs",
+		NOTE: "Note",
+		COMPOSIO_GOOGLE_DRIVE_CONNECTOR: "Composio Google Drive",
+		COMPOSIO_GMAIL_CONNECTOR: "Composio Gmail",
+		COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: "Composio Google Calendar",
+	};
+	return (
+		labelMap[type] ||
+		type
+			.split("_")
+			.map((word) => word.charAt(0) + word.slice(1).toLowerCase())
+			.join(" ")
+	);
+}
diff --git a/surfsense_web/lib/editor-language.ts b/surfsense_web/lib/editor-language.ts
new file mode 100644
index 0000000000..17227c15d9
--- /dev/null
+++ b/surfsense_web/lib/editor-language.ts
@@ -0,0 +1,34 @@
+const EXTENSION_TO_MONACO_LANGUAGE: Record<string, string> = {
+	css: "css",
+	csv: "plaintext",
+	cjs: "javascript",
+	html: "html",
+	htm: "html",
+	ini: "ini",
+	js: "javascript",
+	json: "json",
+	markdown: "markdown",
+	md: "markdown",
+	mjs: "javascript",
+	py: "python",
+	sql: "sql",
+	toml: "plaintext",
+	ts: "typescript",
+	tsx: "typescript",
+	xml: "xml",
+	yaml: "yaml",
+	yml: "yaml",
+};
+
+export function inferMonacoLanguageFromPath(filePath: string | null | undefined): string {
+	if (!filePath) return "plaintext";
+
+	const fileName = filePath.split("/").pop() ?? filePath;
+	const extensionIndex = fileName.lastIndexOf(".");
+	if (extensionIndex <= 0 || extensionIndex === fileName.length - 1) {
+		return "plaintext";
+	}
+
+	const extension = fileName.slice(extensionIndex + 1).toLowerCase();
+	return EXTENSION_TO_MONACO_LANGUAGE[extension] ?? "plaintext";
+}
diff --git a/surfsense_web/lib/error-toast.ts b/surfsense_web/lib/error-toast.ts
new file mode 100644
index 0000000000..55be6b5442
--- /dev/null
+++ b/surfsense_web/lib/error-toast.ts
@@ -0,0 +1,72 @@
+import { toast } from "sonner";
+import { AbortedError, AppError, AuthenticationError, SURFSENSE_ISSUES_URL } from "./error";
+
+/**
+ * Build a GitHub issue URL pre-filled with diagnostic context.
+ * Avoids including PII — only structural metadata.
+ */
+export function buildIssueUrl(error: unknown): string {
+	const params = new URLSearchParams();
+
+	const lines: string[] = ["## Bug Report", "", "**Describe what happened:**", "", ""];
+
+	if (error instanceof AppError) {
+		lines.push("## Diagnostics (auto-filled)", "");
+		if (error.code) lines.push(`- **Error code:** \`${error.code}\``);
+		if (error.requestId) lines.push(`- **Request ID:** \`${error.requestId}\``);
+		if (error.status) lines.push(`- **HTTP status:** ${error.status}`);
+		lines.push(`- **Message:** ${error.message}`);
+	} else if (error instanceof Error) {
+		lines.push("## Diagnostics (auto-filled)", "");
+		lines.push(`- **Error:** ${error.message}`);
+	}
+
+	lines.push(`- **Timestamp:** ${new Date().toISOString()}`);
+	lines.push(
+		`- **Page:** \`${typeof window !== "undefined" ? window.location.pathname : "unknown"}\``
+	);
+	lines.push(
+		`- **User Agent:** \`${typeof navigator !== "undefined" ? navigator.userAgent : "unknown"}\``
+	);
+
+	params.set("body", lines.join("\n"));
+	params.set("labels", "bug");
+
+	return `${SURFSENSE_ISSUES_URL}/new?${params.toString()}`;
+}
+
+/**
+ * Show a standardized error toast with a "Report Issue" action.
+ *
+ * Suppressed for:
+ * - AbortedError (user-initiated cancellation)
+ * - AuthenticationError (handled by redirect)
+ */
+export function showErrorToast(error: unknown, fallbackMessage?: string) {
+	if (error instanceof AbortedError || error instanceof AuthenticationError) {
+		return;
+	}
+
+	const message =
+		error instanceof AppError
+			? error.message
+			: error instanceof Error
+				? error.message
+				: (fallbackMessage ?? "An unexpected error occurred.");
+
+	const code = error instanceof AppError ? error.code : undefined;
+	const requestId = error instanceof AppError ? error.requestId : undefined;
+
+	const descParts: string[] = [];
+	if (code) descParts.push(`Error: ${code}`);
+	if (requestId) descParts.push(`ID: ${requestId}`);
+
+	toast.error(message, {
+		description: descParts.length > 0 ? descParts.join(" | ") : undefined,
+		duration: 8000,
+		action: {
+			label: "Report Issue",
+			onClick: () => window.open(buildIssueUrl(error), "_blank"),
+		},
+	});
+}
diff --git a/surfsense_web/lib/error.ts b/surfsense_web/lib/error.ts
index c8d8283c89..10cb3706eb 100644
--- a/surfsense_web/lib/error.ts
+++ b/surfsense_web/lib/error.ts
@@ -1,40 +1,61 @@
+export const SURFSENSE_ISSUES_URL = "https://github.com/MODSetter/SurfSense/issues";
+
 export class AppError extends Error {
 	status?: number;
 	statusText?: string;
-	constructor(message: string, status?: number, statusText?: string) {
+	code?: string;
+	requestId?: string;
+	reportUrl?: string;
+	constructor(
+		message: string,
+		status?: number,
+		statusText?: string,
+		code?: string,
+		requestId?: string,
+		reportUrl?: string
+	) {
 		super(message);
-		this.name = this.constructor.name; // User friendly
+		this.name = this.constructor.name;
 		this.status = status;
-		this.statusText = statusText; // Dev friendly
+		this.statusText = statusText;
+		this.code = code;
+		this.requestId = requestId;
+		this.reportUrl = reportUrl ?? SURFSENSE_ISSUES_URL;
 	}
 }
 
 export class NetworkError extends AppError {
 	constructor(message: string, status?: number, statusText?: string) {
-		super(message, status, statusText);
+		super(message, status, statusText, "NETWORK_ERROR");
+	}
+}
+
+export class AbortedError extends AppError {
+	constructor(message = "Request was cancelled.") {
+		super(message, undefined, undefined, "REQUEST_ABORTED");
 	}
 }
 
 export class ValidationError extends AppError {
 	constructor(message: string, status?: number, statusText?: string) {
-		super(message, status, statusText);
+		super(message, status, statusText, "VALIDATION_ERROR");
 	}
 }
 
 export class AuthenticationError extends AppError {
 	constructor(message: string, status?: number, statusText?: string) {
-		super(message, status, statusText);
+		super(message, status, statusText, "UNAUTHORIZED");
 	}
 }
 
 export class AuthorizationError extends AppError {
 	constructor(message: string, status?: number, statusText?: string) {
-		super(message, status, statusText);
+		super(message, status, statusText, "FORBIDDEN");
 	}
 }
 
 export class NotFoundError extends AppError {
 	constructor(message: string, status?: number, statusText?: string) {
-		super(message, status, statusText);
+		super(message, status, statusText, "NOT_FOUND");
 	}
 }
diff --git a/surfsense_web/lib/folder-sync-upload.ts b/surfsense_web/lib/folder-sync-upload.ts
index 7a9810d763..46ee7230a5 100644
--- a/surfsense_web/lib/folder-sync-upload.ts
+++ b/surfsense_web/lib/folder-sync-upload.ts
@@ -17,6 +17,7 @@ export interface FolderSyncParams {
 	excludePatterns: string[];
 	fileExtensions: string[];
 	enableSummary: boolean;
+	processingMode?: "basic" | "premium";
 	rootFolderId?: number | null;
 	onProgress?: (progress: FolderSyncProgress) => void;
 	signal?: AbortSignal;
@@ -62,6 +63,7 @@ async function uploadBatchesWithConcurrency(
 		searchSpaceId: number;
 		rootFolderId: number | null;
 		enableSummary: boolean;
+		processingMode?: "basic" | "premium";
 		signal?: AbortSignal;
 		onBatchComplete?: (filesInBatch: number) => void;
 	}
@@ -99,6 +101,7 @@ async function uploadBatchesWithConcurrency(
 						relative_paths: batch.map((e) => e.relativePath),
 						root_folder_id: resolvedRootFolderId,
 						enable_summary: params.enableSummary,
+						processing_mode: params.processingMode,
 					},
 					params.signal
 				);
@@ -145,6 +148,7 @@ export async function uploadFolderScan(params: FolderSyncParams): Promise<number
 		excludePatterns,
 		fileExtensions,
 		enableSummary,
+		processingMode,
 		signal,
 	} = params;
 	let rootFolderId = params.rootFolderId ?? null;
@@ -190,6 +194,7 @@ export async function uploadFolderScan(params: FolderSyncParams): Promise<number
 			searchSpaceId,
 			rootFolderId: rootFolderId ?? null,
 			enableSummary,
+			processingMode,
 			signal,
 			onBatchComplete: (count) => {
 				uploaded += count;
diff --git a/surfsense_web/lib/format-date.ts b/surfsense_web/lib/format-date.ts
index ee60d113db..9decd34027 100644
--- a/surfsense_web/lib/format-date.ts
+++ b/surfsense_web/lib/format-date.ts
@@ -22,3 +22,11 @@ export function formatRelativeDate(dateString: string): string {
 	if (daysAgo < 7) return `${daysAgo}d ago`;
 	return format(date, "MMM d, yyyy");
 }
+
+/**
+ * Format a thread's last-updated timestamp for the chats sidebars.
+ * Example: "Mar 23, 2026 at 4:30 PM"
+ */
+export function formatThreadTimestamp(dateString: string): string {
+	return format(new Date(dateString), "MMM d, yyyy 'at' h:mm a");
+}
diff --git a/surfsense_web/lib/layout-events.ts b/surfsense_web/lib/layout-events.ts
new file mode 100644
index 0000000000..45c52f7a4e
--- /dev/null
+++ b/surfsense_web/lib/layout-events.ts
@@ -0,0 +1 @@
+export const SLIDEOUT_PANEL_OPENED_EVENT = "slideout-panel-opened";
diff --git a/surfsense_web/lib/markdown/code-regions.ts b/surfsense_web/lib/markdown/code-regions.ts
new file mode 100644
index 0000000000..336a87acbf
--- /dev/null
+++ b/surfsense_web/lib/markdown/code-regions.ts
@@ -0,0 +1,8 @@
+// Matches fenced (```...```) and inline (`...`) code regions. Used by MDX
+// escaping and citation preprocessing — single source of truth so future
+// edits stay in sync.
+//
+// String.split() with this capturing pattern places non-code parts at even
+// indexes and matched code regions at odd indexes — preserve odd-indexed
+// segments verbatim when transforming markdown.
+export const FENCED_OR_INLINE_CODE = /(```[\s\S]*?```|`[^`\n]+`)/g;
diff --git a/surfsense_web/lib/posthog/events.ts b/surfsense_web/lib/posthog/events.ts
index 53aaa71b91..127685fe8d 100644
--- a/surfsense_web/lib/posthog/events.ts
+++ b/surfsense_web/lib/posthog/events.ts
@@ -1,4 +1,6 @@
 import posthog from "posthog-js";
+import { getConnectorTelemetryMeta } from "@/components/assistant-ui/connector-popup/constants/connector-constants";
+import type { ChatErrorKind, ChatErrorSeverity, ChatFlow } from "@/lib/chat/chat-error-classifier";
 
 /**
  * PostHog Analytics Event Definitions
@@ -13,8 +15,8 @@ import posthog from "posthog-js";
  * - auth: Authentication events
  * - search_space: Search space management
  * - document: Document management
- * - chat: Chat and messaging
- * - connector: External connector events
+ * - chat: Chat and messaging (authenticated + anonymous)
+ * - connector: External connector events (all lifecycle stages)
  * - contact: Contact form events
  * - settings: Settings changes
  * - marketing: Marketing/referral tracking
@@ -28,6 +30,17 @@ function safeCapture(event: string, properties?: Record<string, unknown>) {
 	}
 }
 
+/**
+ * Drop undefined values so PostHog doesn't log `"foo": undefined` noise.
+ */
+function compact<T extends Record<string, unknown>>(obj: T): Record<string, unknown> {
+	const out: Record<string, unknown> = {};
+	for (const [k, v] of Object.entries(obj)) {
+		if (v !== undefined) out[k] = v;
+	}
+	return out;
+}
+
 // ============================================
 // AUTH EVENTS
 // ============================================
@@ -127,6 +140,77 @@ export function trackChatError(searchSpaceId: number, chatId: number, error?: st
 	});
 }
 
+export interface ChatFailureTelemetry {
+	flow: ChatFlow;
+	kind: ChatErrorKind;
+	error_code?: string;
+	severity: ChatErrorSeverity;
+	is_expected: boolean;
+	message?: string;
+}
+
+export function trackChatBlocked(
+	searchSpaceId: number,
+	chatId: number | null,
+	payload: ChatFailureTelemetry
+) {
+	safeCapture(
+		"chat_blocked",
+		compact({
+			search_space_id: searchSpaceId,
+			chat_id: chatId ?? undefined,
+			flow: payload.flow,
+			kind: payload.kind,
+			error_code: payload.error_code,
+			severity: payload.severity,
+			is_expected: payload.is_expected,
+			message: payload.message,
+		})
+	);
+}
+
+export function trackChatErrorDetailed(
+	searchSpaceId: number,
+	chatId: number | null,
+	payload: ChatFailureTelemetry
+) {
+	safeCapture(
+		"chat_error",
+		compact({
+			search_space_id: searchSpaceId,
+			chat_id: chatId ?? undefined,
+			flow: payload.flow,
+			kind: payload.kind,
+			error_code: payload.error_code,
+			severity: payload.severity,
+			is_expected: payload.is_expected,
+			message: payload.message,
+		})
+	);
+}
+
+/**
+ * Track a message sent from the unauthenticated "free" / anonymous chat
+ * flow. This is intentionally a separate event from `chat_message_sent`
+ * so WAU / retention queries on the authenticated event stay clean while
+ * still giving us visibility into top-of-funnel usage on /free/*.
+ */
+export function trackAnonymousChatMessageSent(options: {
+	modelSlug: string;
+	messageLength?: number;
+	hasUploadedDoc?: boolean;
+	webSearchEnabled?: boolean;
+	surface?: "free_chat_page" | "free_model_page";
+}) {
+	safeCapture("anonymous_chat_message_sent", {
+		model_slug: options.modelSlug,
+		message_length: options.messageLength,
+		has_uploaded_doc: options.hasUploadedDoc ?? false,
+		web_search_enabled: options.webSearchEnabled,
+		surface: options.surface,
+	});
+}
+
 // ============================================
 // DOCUMENT EVENTS
 // ============================================
@@ -179,37 +263,88 @@ export function trackYouTubeImport(searchSpaceId: number, url: string) {
 }
 
 // ============================================
-// CONNECTOR EVENTS
+// CONNECTOR EVENTS (generic lifecycle dispatcher)
 // ============================================
+//
+// All connector events go through `trackConnectorEvent`. The connector's
+// human-readable title and its group (oauth/composio/crawler/other) are
+// auto-attached from the shared registry in `connector-constants.ts`, so
+// adding a new connector to that list is the only change required for it
+// to show up correctly in PostHog dashboards.
+
+export type ConnectorEventStage =
+	| "setup_started"
+	| "setup_success"
+	| "setup_failure"
+	| "oauth_initiated"
+	| "connected"
+	| "deleted"
+	| "synced";
+
+export interface ConnectorEventOptions {
+	searchSpaceId?: number | null;
+	connectorId?: number | null;
+	/** Source of the action (e.g. "oauth_callback", "non_oauth_form", "webcrawler_quick_add"). */
+	source?: string;
+	/** Free-form error message for failure events. */
+	error?: string;
+	/** Extra properties specific to the stage (e.g. frequency_minutes for sync events). */
+	extra?: Record<string, unknown>;
+}
 
-export function trackConnectorSetupStarted(searchSpaceId: number, connectorType: string) {
-	safeCapture("connector_setup_started", {
-		search_space_id: searchSpaceId,
-		connector_type: connectorType,
+/**
+ * Generic connector lifecycle tracker. Every connector analytics event
+ * should funnel through here so the enrichment stays consistent.
+ */
+export function trackConnectorEvent(
+	stage: ConnectorEventStage,
+	connectorType: string,
+	options: ConnectorEventOptions = {}
+) {
+	const meta = getConnectorTelemetryMeta(connectorType);
+	safeCapture(`connector_${stage}`, {
+		...compact({
+			search_space_id: options.searchSpaceId ?? undefined,
+			connector_id: options.connectorId ?? undefined,
+			source: options.source,
+			error: options.error,
+		}),
+		connector_type: meta.connector_type,
+		connector_title: meta.connector_title,
+		connector_group: meta.connector_group,
+		is_oauth: meta.is_oauth,
+		...(options.extra ?? {}),
 	});
 }
 
+// ---- Convenience wrappers kept for backward compatibility ----
+
+export function trackConnectorSetupStarted(
+	searchSpaceId: number,
+	connectorType: string,
+	source?: string
+) {
+	trackConnectorEvent("setup_started", connectorType, { searchSpaceId, source });
+}
+
 export function trackConnectorSetupSuccess(
 	searchSpaceId: number,
 	connectorType: string,
 	connectorId: number
 ) {
-	safeCapture("connector_setup_success", {
-		search_space_id: searchSpaceId,
-		connector_type: connectorType,
-		connector_id: connectorId,
-	});
+	trackConnectorEvent("setup_success", connectorType, { searchSpaceId, connectorId });
 }
 
 export function trackConnectorSetupFailure(
-	searchSpaceId: number,
+	searchSpaceId: number | null | undefined,
 	connectorType: string,
-	error?: string
+	error?: string,
+	source?: string
 ) {
-	safeCapture("connector_setup_failure", {
-		search_space_id: searchSpaceId,
-		connector_type: connectorType,
+	trackConnectorEvent("setup_failure", connectorType, {
+		searchSpaceId: searchSpaceId ?? undefined,
 		error,
+		source,
 	});
 }
 
@@ -218,11 +353,7 @@ export function trackConnectorDeleted(
 	connectorType: string,
 	connectorId: number
 ) {
-	safeCapture("connector_deleted", {
-		search_space_id: searchSpaceId,
-		connector_type: connectorType,
-		connector_id: connectorId,
-	});
+	trackConnectorEvent("deleted", connectorType, { searchSpaceId, connectorId });
 }
 
 export function trackConnectorSynced(
@@ -230,11 +361,7 @@ export function trackConnectorSynced(
 	connectorType: string,
 	connectorId: number
 ) {
-	safeCapture("connector_synced", {
-		search_space_id: searchSpaceId,
-		connector_type: connectorType,
-		connector_id: connectorId,
-	});
+	trackConnectorEvent("synced", connectorType, { searchSpaceId, connectorId });
 }
 
 // ============================================
@@ -274,6 +401,16 @@ export function trackSourcesTabViewed(searchSpaceId: number, tab: string) {
 	});
 }
 
+export function trackDesktopDownloadClicked(options: {
+	os: string;
+	placement: "sidebar_collapsed" | "sidebar_expanded";
+}) {
+	safeCapture("desktop_download_clicked", {
+		os: options.os,
+		placement: options.placement,
+	});
+}
+
 // ============================================
 // SEARCH SPACE INVITE EVENTS
 // ============================================
@@ -345,10 +482,9 @@ export function trackConnectorConnected(
 	connectorType: string,
 	connectorId?: number
 ) {
-	safeCapture("connector_connected", {
-		search_space_id: searchSpaceId,
-		connector_type: connectorType,
-		connector_id: connectorId,
+	trackConnectorEvent("connected", connectorType, {
+		searchSpaceId,
+		connectorId: connectorId ?? undefined,
 	});
 }
 
@@ -467,8 +603,13 @@ export function trackReferralLanding(refCode: string, landingUrl: string) {
 // ============================================
 
 /**
- * Identify a user for PostHog analytics
- * Call this after successful authentication
+ * Identify a user for PostHog analytics.
+ * Call this after successful authentication.
+ *
+ * In the Electron desktop app the same call is mirrored into the
+ * main-process PostHog client so desktop-only events (e.g.
+ * `desktop_quick_ask_opened`, `desktop_autocomplete_accepted`) are
+ * attributed to the logged-in user rather than an anonymous machine ID.
  */
 export function identifyUser(userId: string, properties?: Record<string, unknown>) {
 	try {
@@ -476,10 +617,19 @@ export function identifyUser(userId: string, properties?: Record<string, unknown
 	} catch {
 		// Silently ignore – ad-blockers may break posthog
 	}
+
+	try {
+		if (typeof window !== "undefined" && window.electronAPI?.analyticsIdentify) {
+			void window.electronAPI.analyticsIdentify(userId, properties);
+		}
+	} catch {
+		// IPC errors must never break the app
+	}
 }
 
 /**
- * Reset user identity (call on logout)
+ * Reset user identity (call on logout). Mirrors the reset into the
+ * Electron main process when running inside the desktop app.
  */
 export function resetUser() {
 	try {
@@ -487,4 +637,12 @@ export function resetUser() {
 	} catch {
 		// Silently ignore – ad-blockers may break posthog
 	}
+
+	try {
+		if (typeof window !== "undefined" && window.electronAPI?.analyticsReset) {
+			void window.electronAPI.analyticsReset();
+		}
+	} catch {
+		// IPC errors must never break the app
+	}
 }
diff --git a/surfsense_web/lib/provider-icons.tsx b/surfsense_web/lib/provider-icons.tsx
index d017d9aa2c..e63c5eb2fd 100644
--- a/surfsense_web/lib/provider-icons.tsx
+++ b/surfsense_web/lib/provider-icons.tsx
@@ -13,6 +13,7 @@ import {
 	DeepSeekIcon,
 	FireworksAiIcon,
 	GeminiIcon,
+	GitHubModelsIcon,
 	GroqIcon,
 	HuggingFaceIcon,
 	MiniMaxIcon,
@@ -82,6 +83,8 @@ export function getProviderIcon(
 			return <FireworksAiIcon className={cn(className)} />;
 		case "GOOGLE":
 			return <GeminiIcon className={cn(className)} />;
+		case "GITHUB_MODELS":
+			return <GitHubModelsIcon className={cn(className)} />;
 		case "GROQ":
 			return <GroqIcon className={cn(className)} />;
 		case "HUGGINGFACE":
diff --git a/surfsense_web/lib/query-client/cache-keys.ts b/surfsense_web/lib/query-client/cache-keys.ts
index 10aba7ef4c..ce45ee1433 100644
--- a/surfsense_web/lib/query-client/cache-keys.ts
+++ b/surfsense_web/lib/query-client/cache-keys.ts
@@ -3,6 +3,19 @@ import type { GetDocumentsRequest } from "@/contracts/types/document.types";
 import type { GetLogsRequest } from "@/contracts/types/log.types";
 import type { GetSearchSpacesRequest } from "@/contracts/types/search-space.types";
 
+/**
+ * Convert an object to a stable array of [key, value] pairs sorted by key.
+ * This ensures cache keys are order-independent (avoiding Object.values order-dependency).
+ * Filters out undefined values.
+ */
+function stableEntries(obj: Record<string, unknown> | null | undefined): unknown[] {
+	if (!obj) return [];
+	return Object.entries(obj)
+		.filter(([, v]) => v !== undefined)
+		.sort(([a], [b]) => a.localeCompare(b))
+		.flat();
+}
+
 export const cacheKeys = {
 	// New chat threads (assistant-ui)
 	threads: {
@@ -13,9 +26,9 @@ export const cacheKeys = {
 	},
 	documents: {
 		globalQueryParams: (queries: GetDocumentsRequest["queryParams"]) =>
-			["documents", ...(queries ? Object.values(queries) : [])] as const,
+			["documents", ...stableEntries(queries)] as const,
 		withQueryParams: (queries: GetDocumentsRequest["queryParams"]) =>
-			["documents-with-queries", ...(queries ? Object.values(queries) : [])] as const,
+			["documents-with-queries", ...stableEntries(queries)] as const,
 		document: (documentId: string) => ["document", documentId] as const,
 		byChunk: (chunkId: string) => ["documents", "by-chunk", chunkId] as const,
 	},
@@ -24,7 +37,7 @@ export const cacheKeys = {
 		detail: (logId: number | string) => ["logs", "detail", logId] as const,
 		summary: (searchSpaceId?: number | string) => ["logs", "summary", searchSpaceId] as const,
 		withQueryParams: (queries: GetLogsRequest["queryParams"]) =>
-			["logs", "with-query-params", ...(queries ? Object.values(queries) : [])] as const,
+			["logs", "with-query-params", ...stableEntries(queries)] as const,
 	},
 	newLLMConfigs: {
 		all: (searchSpaceId: number) => ["new-llm-configs", searchSpaceId] as const,
@@ -51,7 +64,7 @@ export const cacheKeys = {
 	searchSpaces: {
 		all: ["search-spaces"] as const,
 		withQueryParams: (queries: GetSearchSpacesRequest["queryParams"]) =>
-			["search-spaces", ...(queries ? Object.values(queries) : [])] as const,
+			["search-spaces", ...stableEntries(queries)] as const,
 		detail: (searchSpaceId: string) => ["search-spaces", searchSpaceId] as const,
 	},
 	user: {
@@ -78,7 +91,7 @@ export const cacheKeys = {
 	connectors: {
 		all: (searchSpaceId: string) => ["connectors", searchSpaceId] as const,
 		withQueryParams: (queries: GetConnectorsRequest["queryParams"]) =>
-			["connectors", ...(queries ? Object.values(queries) : [])] as const,
+			["connectors", ...stableEntries(queries)] as const,
 		byId: (connectorId: string) => ["connector", connectorId] as const,
 		index: () => ["connector", "index"] as const,
 		googleDrive: {
@@ -92,6 +105,10 @@ export const cacheKeys = {
 	publicChat: {
 		byToken: (shareToken: string) => ["public-chat", shareToken] as const,
 	},
+	github: {
+		repoStars: (username: string, repo: string) =>
+			["github", "repo-stars", username, repo] as const,
+	},
 	publicChatSnapshots: {
 		all: ["public-chat-snapshots"] as const,
 		bySearchSpace: (searchSpaceId: number) =>
diff --git a/surfsense_web/lib/query-client/client.ts b/surfsense_web/lib/query-client/client.ts
index 0dcc2ef039..7b9bf4bda5 100644
--- a/surfsense_web/lib/query-client/client.ts
+++ b/surfsense_web/lib/query-client/client.ts
@@ -1,4 +1,5 @@
-import { QueryClient } from "@tanstack/react-query";
+import { MutationCache, QueryCache, QueryClient } from "@tanstack/react-query";
+import { showErrorToast } from "../error-toast";
 
 export const queryClient = new QueryClient({
 	defaultOptions: {
@@ -7,4 +8,16 @@ export const queryClient = new QueryClient({
 			refetchOnWindowFocus: false,
 		},
 	},
+	queryCache: new QueryCache({
+		onError: (error, query) => {
+			if (query.meta?.suppressGlobalErrorToast) return;
+			showErrorToast(error);
+		},
+	}),
+	mutationCache: new MutationCache({
+		onError: (error, _variables, _context, mutation) => {
+			if (mutation.meta?.suppressGlobalErrorToast) return;
+			showErrorToast(error);
+		},
+	}),
 });
diff --git a/surfsense_web/lib/source.ts b/surfsense_web/lib/source.ts
index 162cca57a3..b94f990abb 100644
--- a/surfsense_web/lib/source.ts
+++ b/surfsense_web/lib/source.ts
@@ -1,12 +1,39 @@
 import { loader } from "fumadocs-core/source";
-import { icons } from "lucide-react";
+import {
+	BookOpen,
+	ClipboardCheck,
+	Compass,
+	Container,
+	Download,
+	FlaskConical,
+	Heart,
+	Unplug,
+	Wrench,
+} from "lucide-react";
 import { createElement } from "react";
 import { docs } from "@/.source/server";
 
+/** Explicit whitelist of Lucide icons used in docs frontmatter / meta.json.
+ * Importing the full `icons` barrel would pull every Lucide icon (~1 400 SVGs)
+ * into the docs bundle even though only a handful are referenced. Add new icons
+ * here as docs pages are added.
+ */
+const DOCS_ICONS: Record<string, React.ComponentType> = {
+	BookOpen,
+	ClipboardCheck,
+	Compass,
+	Container,
+	Download,
+	FlaskConical,
+	Heart,
+	Unplug,
+	Wrench,
+};
+
 export const source = loader({
 	baseUrl: "/docs",
 	source: docs.toFumadocsSource(),
 	icon(icon) {
-		if (icon && icon in icons) return createElement(icons[icon as keyof typeof icons]);
+		if (icon && icon in DOCS_ICONS) return createElement(DOCS_ICONS[icon]);
 	},
 });
diff --git a/surfsense_web/lib/url.ts b/surfsense_web/lib/url.ts
new file mode 100644
index 0000000000..7bb0488d0f
--- /dev/null
+++ b/surfsense_web/lib/url.ts
@@ -0,0 +1,14 @@
+/**
+ * Extract a normalized hostname from a URL. Strips a leading `www.`.
+ * Returns `undefined` if the input is not a parseable URL.
+ *
+ * This is the canonical replacement for the four previously-duplicated
+ * `extractDomain` helpers that had subtly different error fallbacks.
+ */
+export function tryGetHostname(url: string): string | undefined {
+	try {
+		return new URL(url).hostname.replace(/^www\./, "");
+	} catch {
+		return undefined;
+	}
+}
diff --git a/surfsense_web/lib/user-avatar.ts b/surfsense_web/lib/user-avatar.ts
new file mode 100644
index 0000000000..66a972362a
--- /dev/null
+++ b/surfsense_web/lib/user-avatar.ts
@@ -0,0 +1,34 @@
+const USER_AVATAR_COLORS = [
+	"#6366f1",
+	"#8b5cf6",
+	"#a855f7",
+	"#d946ef",
+	"#ec4899",
+	"#f43f5e",
+	"#ef4444",
+	"#f97316",
+	"#eab308",
+	"#84cc16",
+	"#22c55e",
+	"#14b8a6",
+	"#06b6d4",
+	"#0ea5e9",
+	"#3b82f6",
+];
+
+export function getUserAvatarColor(email: string): string {
+	let hash = 0;
+	for (let i = 0; i < email.length; i++) {
+		hash = email.charCodeAt(i) + ((hash << 5) - hash);
+	}
+	return USER_AVATAR_COLORS[Math.abs(hash) % USER_AVATAR_COLORS.length];
+}
+
+export function getUserInitials(email: string): string {
+	const name = email.split("@")[0];
+	const parts = name.split(/[._-]/);
+	if (parts.length >= 2 && parts[0]?.[0] && parts[1]?.[0]) {
+		return (parts[0][0] + parts[1][0]).toUpperCase();
+	}
+	return name.slice(0, 2).toUpperCase();
+}
diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json
index a3a4e8853f..97ae051017 100644
--- a/surfsense_web/messages/en.json
+++ b/surfsense_web/messages/en.json
@@ -123,7 +123,6 @@
 		"api_key_nav_description": "Manage your API access token",
 		"api_key_title": "API Key",
 		"api_key_description": "Use this key to authenticate API requests",
-		"api_key_warning_title": "Keep it secret",
 		"api_key_warning_description": "Your API key grants full access to your account. Never share it publicly or commit it to version control.",
 		"your_api_key": "Your API Key",
 		"copied": "Copied!",
@@ -396,7 +395,16 @@
 		"supported_file_types": "Supported File Types",
 		"file_too_large": "File Too Large",
 		"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
-		"no_supported_files_in_folder": "No supported file types found in the selected folder."
+		"no_supported_files_in_folder": "No supported file types found in the selected folder.",
+		"uploading_folder": "Uploading folder…",
+		"upload_folder_button": "Upload Folder ({count} {count, plural, one {file} other {files}})",
+		"select_files_or_folder": "Select files or folder",
+		"tap_select_files_or_folder": "Tap to select files or folder",
+		"processing_mode": "Processing Mode",
+		"basic_mode": "Basic",
+		"basic_mode_desc": "Standard processing (1 credit per page)",
+		"premium_mode": "Premium",
+		"premium_mode_desc": "Enhanced for financial, medical & legal docs (10 credits per page)"
 	},
 	"add_webpage": {
 		"title": "Add Webpages for Crawling",
@@ -693,6 +701,7 @@
 		"learn_more": "Learn more",
 		"documentation": "Documentation",
 		"github": "GitHub",
+		"download_for_os": "Download for {os}",
 		"inbox": "Inbox",
 		"search_inbox": "Search inbox",
 		"mark_all_read": "Mark all as read",
@@ -732,8 +741,8 @@
 		"back_to_app": "Back to app",
 		"nav_general": "General",
 		"nav_general_desc": "Name, description & basic info",
-		"nav_agent_configs": "Agent Configs",
-		"nav_agent_configs_desc": "Models with prompts & citations",
+		"nav_agent_models": "Agent Models",
+		"nav_agent_models_desc": "Models with prompts & citations",
 		"nav_role_assignments": "Role Assignments",
 		"nav_role_assignments_desc": "Assign configs to agent roles",
 		"nav_image_models": "Image Models",
@@ -780,7 +789,7 @@
 	"homepage": {
 		"hero_title_part1": "The AI Workspace",
 		"hero_title_part2": "Built for Teams",
-		"hero_description": "Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team.",
+		"hero_description": "An open source, privacy focused alternative to NotebookLM for teams with no data limits.",
 		"cta_start_trial": "Start Free Trial",
 		"cta_explore": "Explore",
 		"integrations_title": "Integrations",
diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json
index fa620e2713..2ac168d0a5 100644
--- a/surfsense_web/messages/es.json
+++ b/surfsense_web/messages/es.json
@@ -123,7 +123,6 @@
 		"api_key_nav_description": "Administra tu token de acceso a la API",
 		"api_key_title": "Clave API",
 		"api_key_description": "Usa esta clave para autenticar las solicitudes de la API",
-		"api_key_warning_title": "Mantenla en secreto",
 		"api_key_warning_description": "Tu clave API otorga acceso completo a tu cuenta. Nunca la compartas públicamente ni la incluyas en el control de versiones.",
 		"your_api_key": "Tu clave API",
 		"copied": "¡Copiado!",
@@ -396,7 +395,16 @@
 		"supported_file_types": "Tipos de archivo soportados",
 		"file_too_large": "Archivo demasiado grande",
 		"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
-		"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada."
+		"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.",
+		"uploading_folder": "Subiendo carpeta…",
+		"upload_folder_button": "Subir carpeta ({count} {count, plural, one {archivo} other {archivos}})",
+		"select_files_or_folder": "Seleccionar archivos o carpeta",
+		"tap_select_files_or_folder": "Toca para seleccionar archivos o carpeta",
+		"processing_mode": "Modo de procesamiento",
+		"basic_mode": "Básico",
+		"basic_mode_desc": "Procesamiento estándar (1 crédito por página)",
+		"premium_mode": "Premium",
+		"premium_mode_desc": "Mejorado para documentos financieros, médicos y legales (10 créditos por página)"
 	},
 	"add_webpage": {
 		"title": "Agregar páginas web para rastreo",
@@ -693,6 +701,7 @@
 		"learn_more": "Más información",
 		"documentation": "Documentación",
 		"github": "GitHub",
+		"download_for_os": "Descargar para {os}",
 		"inbox": "Bandeja de entrada",
 		"search_inbox": "Buscar en bandeja de entrada",
 		"mark_all_read": "Marcar todo como leído",
@@ -732,8 +741,8 @@
 		"back_to_app": "Volver a la app",
 		"nav_general": "General",
 		"nav_general_desc": "Nombre, descripción e información básica",
-		"nav_agent_configs": "Configuraciones de agente",
-		"nav_agent_configs_desc": "Modelos LLM con prompts y citas",
+		"nav_agent_models": "Modelos de agente",
+		"nav_agent_models_desc": "Modelos LLM con prompts y citas",
 		"nav_role_assignments": "Asignaciones de roles",
 		"nav_role_assignments_desc": "Asignar configuraciones a roles de agente",
 		"nav_image_models": "Modelos de imagen",
diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json
index faeb4cb942..2bfa9a59f0 100644
--- a/surfsense_web/messages/hi.json
+++ b/surfsense_web/messages/hi.json
@@ -123,7 +123,6 @@
 		"api_key_nav_description": "अपना API एक्सेस टोकन प्रबंधित करें",
 		"api_key_title": "API कुंजी",
 		"api_key_description": "API अनुरोधों को प्रमाणित करने के लिए इस कुंजी का उपयोग करें",
-		"api_key_warning_title": "इसे गुप्त रखें",
 		"api_key_warning_description": "आपकी API कुंजी आपके खाते तक पूर्ण पहुंच प्रदान करती है। इसे कभी सार्वजनिक रूप से साझा न करें या संस्करण नियंत्रण में शामिल न करें।",
 		"your_api_key": "आपकी API कुंजी",
 		"copied": "कॉपी किया गया!",
@@ -396,7 +395,16 @@
 		"supported_file_types": "समर्थित फ़ाइल प्रकार",
 		"file_too_large": "फ़ाइल बहुत बड़ी है",
 		"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
-		"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।"
+		"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।",
+		"uploading_folder": "फ़ोल्डर अपलोड हो रहा है…",
+		"upload_folder_button": "फ़ोल्डर अपलोड करें ({count} {count, plural, one {फ़ाइल} other {फ़ाइलें}})",
+		"select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनें",
+		"tap_select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनने के लिए टैप करें",
+		"processing_mode": "प्रोसेसिंग मोड",
+		"basic_mode": "बेसिक",
+		"basic_mode_desc": "मानक प्रोसेसिंग (1 क्रेडिट प्रति पेज)",
+		"premium_mode": "प्रीमियम",
+		"premium_mode_desc": "वित्तीय, चिकित्सा और कानूनी दस्तावेज़ों के लिए उन्नत (10 क्रेडिट प्रति पेज)"
 	},
 	"add_webpage": {
 		"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
@@ -693,6 +701,7 @@
 		"learn_more": "और जानें",
 		"documentation": "दस्तावेज़ीकरण",
 		"github": "GitHub",
+		"download_for_os": "{os} के लिए डाउनलोड करें",
 		"inbox": "इनबॉक्स",
 		"search_inbox": "इनबॉक्स में खोजें",
 		"mark_all_read": "सभी पढ़ा हुआ चिह्नित करें",
@@ -732,8 +741,8 @@
 		"back_to_app": "ऐप पर वापस जाएं",
 		"nav_general": "सामान्य",
 		"nav_general_desc": "नाम, विवरण और बुनियादी जानकारी",
-		"nav_agent_configs": "एजेंट कॉन्फ़िगरेशन",
-		"nav_agent_configs_desc": "प्रॉम्प्ट और उद्धरण के साथ LLM मॉडल",
+		"nav_agent_models": "एजेंट मॉडल",
+		"nav_agent_models_desc": "प्रॉम्प्ट और उद्धरण के साथ LLM मॉडल",
 		"nav_role_assignments": "भूमिका असाइनमेंट",
 		"nav_role_assignments_desc": "एजेंट भूमिकाओं को कॉन्फ़िगरेशन असाइन करें",
 		"nav_image_models": "इमेज मॉडल",
diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json
index 0bed7c6ccf..0299596890 100644
--- a/surfsense_web/messages/pt.json
+++ b/surfsense_web/messages/pt.json
@@ -123,7 +123,6 @@
 		"api_key_nav_description": "Gerencie seu token de acesso à API",
 		"api_key_title": "Chave API",
 		"api_key_description": "Use esta chave para autenticar solicitações da API",
-		"api_key_warning_title": "Mantenha em segredo",
 		"api_key_warning_description": "Sua chave API concede acesso total à sua conta. Nunca a compartilhe publicamente nem a inclua no controle de versão.",
 		"your_api_key": "Sua chave API",
 		"copied": "Copiado!",
@@ -396,7 +395,16 @@
 		"supported_file_types": "Tipos de arquivo suportados",
 		"file_too_large": "Arquivo muito grande",
 		"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
-		"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada."
+		"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.",
+		"uploading_folder": "Enviando pasta…",
+		"upload_folder_button": "Enviar pasta ({count} {count, plural, one {arquivo} other {arquivos}})",
+		"select_files_or_folder": "Selecionar arquivos ou pasta",
+		"tap_select_files_or_folder": "Toque para selecionar arquivos ou pasta",
+		"processing_mode": "Modo de processamento",
+		"basic_mode": "Básico",
+		"basic_mode_desc": "Processamento padrão (1 crédito por página)",
+		"premium_mode": "Premium",
+		"premium_mode_desc": "Aprimorado para documentos financeiros, médicos e jurídicos (10 créditos por página)"
 	},
 	"add_webpage": {
 		"title": "Adicionar páginas web para rastreamento",
@@ -693,6 +701,7 @@
 		"learn_more": "Saiba mais",
 		"documentation": "Documentação",
 		"github": "GitHub",
+		"download_for_os": "Baixar para {os}",
 		"inbox": "Caixa de entrada",
 		"search_inbox": "Pesquisar caixa de entrada",
 		"mark_all_read": "Marcar tudo como lido",
@@ -732,8 +741,8 @@
 		"back_to_app": "Voltar ao app",
 		"nav_general": "Geral",
 		"nav_general_desc": "Nome, descrição e informações básicas",
-		"nav_agent_configs": "Configurações do agente",
-		"nav_agent_configs_desc": "Modelos LLM com prompts e citações",
+		"nav_agent_models": "Modelos do agente",
+		"nav_agent_models_desc": "Modelos LLM com prompts e citações",
 		"nav_role_assignments": "Atribuições de funções",
 		"nav_role_assignments_desc": "Atribuir configurações a funções do agente",
 		"nav_image_models": "Modelos de imagem",
diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json
index 0d4f7e1c90..6f5b2cafb9 100644
--- a/surfsense_web/messages/zh.json
+++ b/surfsense_web/messages/zh.json
@@ -108,7 +108,6 @@
 		"api_key_nav_description": "管理您的API访问令牌",
 		"api_key_title": "API密钥",
 		"api_key_description": "使用此密钥验证API请求",
-		"api_key_warning_title": "请保密",
 		"api_key_warning_description": "您的API密钥可以完全访问您的账户。请勿公开分享或提交到版本控制。",
 		"your_api_key": "您的API密钥",
 		"copied": "已复制！",
@@ -380,7 +379,16 @@
 		"supported_file_types": "支持的文件类型",
 		"file_too_large": "文件过大",
 		"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
-		"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。"
+		"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。",
+		"uploading_folder": "正在上传文件夹…",
+		"upload_folder_button": "上传文件夹（{count}个文件）",
+		"select_files_or_folder": "选择文件或文件夹",
+		"tap_select_files_or_folder": "点击选择文件或文件夹",
+		"processing_mode": "处理模式",
+		"basic_mode": "基础",
+		"basic_mode_desc": "标准处理（每页 1 积分）",
+		"premium_mode": "高级",
+		"premium_mode_desc": "增强处理财务、医疗和法律文档（每页 10 积分）"
 	},
 	"add_webpage": {
 		"title": "添加网页爬取",
@@ -677,6 +685,7 @@
 		"learn_more": "了解更多",
 		"documentation": "文档",
 		"github": "GitHub",
+		"download_for_os": "下载 {os} 版本",
 		"inbox": "收件箱",
 		"search_inbox": "搜索收件箱",
 		"mark_all_read": "全部标记为已读",
@@ -716,8 +725,8 @@
 		"back_to_app": "返回应用",
 		"nav_general": "常规",
 		"nav_general_desc": "名称、描述和基本信息",
-		"nav_agent_configs": "代理配置",
-		"nav_agent_configs_desc": "LLM 模型配置提示词和引用",
+		"nav_agent_models": "代理模型",
+		"nav_agent_models_desc": "LLM 模型配置提示词和引用",
 		"nav_role_assignments": "角色分配",
 		"nav_role_assignments_desc": "为代理角色分配配置",
 		"nav_image_models": "图像模型",
diff --git a/surfsense_web/next.config.ts b/surfsense_web/next.config.ts
index 5414d548d7..81f349f26d 100644
--- a/surfsense_web/next.config.ts
+++ b/surfsense_web/next.config.ts
@@ -18,11 +18,24 @@ const nextConfig: NextConfig = {
 	},
 	images: {
 		remotePatterns: [
+			{
+				protocol: "http",
+				hostname: "localhost",
+				port: "8000",
+				pathname: "/api/v1/image-generations/**",
+			},
 			{
 				protocol: "https",
 				hostname: "**",
 			},
 		],
+		// Allow remote SVGs (e.g. README badges from img.shields.io, trendshift.io,
+		// etc.) which are otherwise blocked by next/image. The CSP below sandboxes
+		// the SVG and forbids any embedded scripts, which is the mitigation
+		// recommended by Vercel's NEXTJS_SAFE_SVG_IMAGES conformance rule.
+		dangerouslyAllowSVG: true,
+		contentDispositionType: "attachment",
+		contentSecurityPolicy: "default-src 'self'; script-src 'none'; sandbox;",
 	},
 	experimental: {
 		optimizePackageImports: [
diff --git a/surfsense_web/package.json b/surfsense_web/package.json
index 8bfe03af65..640d5c207c 100644
--- a/surfsense_web/package.json
+++ b/surfsense_web/package.json
@@ -1,7 +1,8 @@
 {
 	"name": "surfsense_web",
-	"version": "0.0.15",
+	"version": "0.0.24",
 	"private": true,
+	"packageManager": "pnpm@10.26.0",
 	"description": "SurfSense Frontend",
 	"scripts": {
 		"dev": "next dev --turbopack",
@@ -18,7 +19,14 @@
 		"db:migrate": "drizzle-kit migrate",
 		"db:push": "drizzle-kit push",
 		"db:studio": "drizzle-kit studio",
-		"format:fix": "npx @biomejs/biome check --fix"
+		"format:fix": "npx @biomejs/biome check --fix",
+		"test:e2e": "playwright test",
+		"test:e2e:prod": "cross-env CI=1 playwright test",
+		"test:e2e:ui": "playwright test --ui",
+		"test:e2e:headed": "playwright test --headed",
+		"test:e2e:debug": "playwright test --debug",
+		"test:e2e:report": "playwright show-report",
+		"test:e2e:install": "playwright install --with-deps chromium"
 	},
 	"dependencies": {
 		"@ai-sdk/react": "^1.2.12",
@@ -27,6 +35,8 @@
 		"@assistant-ui/react-markdown": "^0.12.6",
 		"@babel/standalone": "^7.29.2",
 		"@hookform/resolvers": "^5.2.2",
+		"@marsidev/react-turnstile": "^1.5.0",
+		"@monaco-editor/react": "^4.7.0",
 		"@number-flow/react": "^0.5.10",
 		"@platejs/autoformat": "^52.0.11",
 		"@platejs/basic-nodes": "^52.0.11",
@@ -71,7 +81,7 @@
 		"@remotion/media": "^4.0.438",
 		"@remotion/player": "^4.0.438",
 		"@remotion/web-renderer": "^4.0.438",
-		"@rocicorp/zero": "^0.26.2",
+		"@rocicorp/zero": "1.4.0",
 		"@slate-serializers/html": "^2.2.3",
 		"@streamdown/code": "^1.0.2",
 		"@streamdown/math": "^1.0.2",
@@ -97,6 +107,7 @@
 		"fumadocs-core": "^16.3.1",
 		"fumadocs-mdx": "^14.2.1",
 		"fumadocs-ui": "^16.3.1",
+		"fuzzy-search": "^3.2.1",
 		"geist": "^1.4.2",
 		"jotai": "^2.15.1",
 		"jotai-tanstack-query": "^0.11.0",
@@ -104,10 +115,12 @@
 		"lenis": "^1.3.17",
 		"lowlight": "^3.3.0",
 		"lucide-react": "^0.577.0",
+		"monaco-editor": "^0.55.1",
 		"motion": "^12.23.22",
 		"next": "^16.1.0",
 		"next-intl": "^4.6.1",
 		"next-themes": "^0.4.6",
+		"pdfjs-dist": "^5.6.205",
 		"pg": "^8.16.3",
 		"platejs": "^52.0.17",
 		"postgres": "^3.4.7",
@@ -144,6 +157,7 @@
 	"devDependencies": {
 		"@biomejs/biome": "2.4.6",
 		"@eslint/eslintrc": "^3.3.1",
+		"@playwright/test": "^1.59.1",
 		"@svgr/webpack": "^8.1.0",
 		"@tailwindcss/postcss": "^4.1.11",
 		"@tailwindcss/typography": "^0.5.16",
diff --git a/surfsense_web/playwright.config.ts b/surfsense_web/playwright.config.ts
new file mode 100644
index 0000000000..ef066a9be8
--- /dev/null
+++ b/surfsense_web/playwright.config.ts
@@ -0,0 +1,73 @@
+import { defineConfig, devices } from "@playwright/test";
+
+const PORT = process.env.PORT || "3000";
+const BACKEND_PORT = process.env.BACKEND_PORT || "8000";
+const baseURL = process.env.PLAYWRIGHT_BASE_URL || `http://localhost:${PORT}`;
+
+process.env.PLAYWRIGHT_TEST_EMAIL ??= "e2e-test@surfsense.net";
+process.env.PLAYWRIGHT_TEST_PASSWORD ??= "E2eTestPassword123!";
+process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL ??= `http://localhost:${BACKEND_PORT}`;
+process.env.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE ??= "LOCAL";
+
+/**
+ * Playwright configuration for SurfSense web E2E tests.
+ *
+ * Tests live under `tests/` and are NEVER bundled into the production Next.js
+ * build (`.next/standalone/`) or the Electron desktop build, because:
+ *   - This file and `tests/` are listed in `.dockerignore`.
+ *   - `electron-builder.yml` only ships `.next/standalone/`, not source files.
+ *   - `@playwright/test` is a `devDependency`, so production `pnpm install`
+ *     with `--prod` skips it entirely.
+ *
+ * @see https://playwright.dev/docs/test-configuration
+ */
+export default defineConfig({
+	testDir: "./tests",
+	timeout: 30_000,
+	expect: { timeout: 15_000 },
+	fullyParallel: true,
+	forbidOnly: !!process.env.CI,
+	retries: process.env.CI ? 1 : 0,
+	workers: 1,
+	reporter: process.env.CI
+		? [["html", { open: "never" }], ["github"], ["list"]]
+		: [["html", { open: "on-failure" }], ["list"]],
+	use: {
+		baseURL,
+		trace: "on-first-retry",
+		screenshot: "only-on-failure",
+		video: process.env.CI ? "off" : "retain-on-failure",
+		extraHTTPHeaders: {
+			"x-playwright-test": "true",
+		},
+	},
+	projects: [
+		{
+			name: "setup",
+			testMatch: /.*\.setup\.ts/,
+		},
+		{
+			name: "chromium",
+			dependencies: ["setup"],
+			use: {
+				...devices["Desktop Chrome"],
+				storageState: "playwright/.auth/user.json",
+			},
+		},
+	],
+	webServer: process.env.PLAYWRIGHT_NO_WEB_SERVER
+		? undefined
+		: {
+				// Local stays on webpack dev (Turbopack caused stale-lock panics in E2E).
+				command: process.env.CI ? "pnpm build && pnpm start" : "pnpm exec next dev",
+				url: `http://localhost:${PORT}`,
+				reuseExistingServer: !process.env.CI,
+				timeout: process.env.CI ? 300_000 : 180_000,
+				stdout: "pipe",
+				stderr: "pipe",
+				env: {
+					NEXT_PUBLIC_FASTAPI_BACKEND_URL: process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL,
+					NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: process.env.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE,
+				},
+			},
+});
diff --git a/surfsense_web/pnpm-lock.yaml b/surfsense_web/pnpm-lock.yaml
index c54226e779..8602feb8d1 100644
--- a/surfsense_web/pnpm-lock.yaml
+++ b/surfsense_web/pnpm-lock.yaml
@@ -26,6 +26,12 @@ importers:
       '@hookform/resolvers':
         specifier: ^5.2.2
         version: 5.2.2(react-hook-form@7.71.2(react@19.2.4))
+      '@marsidev/react-turnstile':
+        specifier: ^1.5.0
+        version: 1.5.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+      '@monaco-editor/react':
+        specifier: ^4.7.0
+        version: 4.7.0(monaco-editor@0.55.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       '@number-flow/react':
         specifier: ^0.5.10
         version: 0.5.14(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
@@ -159,8 +165,8 @@ importers:
         specifier: ^4.0.438
         version: 4.0.438(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       '@rocicorp/zero':
-        specifier: ^0.26.2
-        version: 0.26.2(@opentelemetry/core@2.6.0(@opentelemetry/api@1.9.0))
+        specifier: 1.4.0
+        version: 1.4.0(@opentelemetry/core@2.7.1(@opentelemetry/api@1.9.0))
       '@slate-serializers/html':
         specifier: ^2.2.3
         version: 2.2.3
@@ -229,16 +235,19 @@ importers:
         version: 3.2.0
       fumadocs-core:
         specifier: ^16.3.1
-        version: 16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6)
+        version: 16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6)
       fumadocs-mdx:
         specifier: ^14.2.1
-        version: 14.2.8(@types/mdast@4.0.4)(@types/mdx@2.0.13)(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(vite@7.3.1(@types/node@20.19.33)(jiti@2.6.1)(lightningcss@1.31.1)(tsx@4.21.0))
+        version: 14.2.8(@types/mdast@4.0.4)(@types/mdx@2.0.13)(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(vite@7.3.1(@types/node@20.19.33)(jiti@2.6.1)(lightningcss@1.31.1)(tsx@4.21.0))
       fumadocs-ui:
         specifier: ^16.3.1
-        version: 16.6.5(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(tailwindcss@4.2.1)
+        version: 16.6.5(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(tailwindcss@4.2.1)
+      fuzzy-search:
+        specifier: ^3.2.1
+        version: 3.2.1
       geist:
         specifier: ^1.4.2
-        version: 1.7.0(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))
+        version: 1.7.0(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))
       jotai:
         specifier: ^2.15.1
         version: 2.18.0(@babel/core@7.29.0)(@babel/template@7.28.6)(@types/react@19.2.14)(react@19.2.4)
@@ -257,18 +266,24 @@ importers:
       lucide-react:
         specifier: ^0.577.0
         version: 0.577.0(react@19.2.4)
+      monaco-editor:
+        specifier: ^0.55.1
+        version: 0.55.1
       motion:
         specifier: ^12.23.22
         version: 12.34.3(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       next:
         specifier: ^16.1.0
-        version: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+        version: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       next-intl:
         specifier: ^4.6.1
-        version: 4.8.3(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(typescript@5.9.3)
+        version: 4.8.3(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(typescript@5.9.3)
       next-themes:
         specifier: ^0.4.6
         version: 0.4.6(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+      pdfjs-dist:
+        specifier: ^5.6.205
+        version: 5.6.205
       pg:
         specifier: ^8.16.3
         version: 8.18.0
@@ -372,6 +387,9 @@ importers:
       '@eslint/eslintrc':
         specifier: ^3.3.1
         version: 3.3.3
+      '@playwright/test':
+        specifier: ^1.59.1
+        version: 1.59.1
       '@svgr/webpack':
         specifier: ^8.1.0
         version: 8.1.0(typescript@5.9.3)
@@ -1079,10 +1097,6 @@ packages:
     peerDependencies:
       '@babel/core': ^7.0.0-0
 
-  '@babel/runtime@7.28.6':
-    resolution: {integrity: sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==}
-    engines: {node: '>=6.9.0'}
-
   '@babel/runtime@7.29.2':
     resolution: {integrity: sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g==}
     engines: {node: '>=6.9.0'}
@@ -1172,8 +1186,8 @@ packages:
   '@date-fns/tz@1.4.1':
     resolution: {integrity: sha512-P5LUNhtbj6YfI3iJjw5EL9eUAG6OitD0W3fWQcpQjDRc/QIsL0tRNuO1PcDvPccWL1fSTXXdE1ds+l95DV/OFA==}
 
-  '@dotenvx/dotenvx@1.57.2':
-    resolution: {integrity: sha512-lv9+UZPnl/KOvShepevLWm3+/wc1It5kgO5Q580evnvOFMZcgKVEYFwxlL7Ohl9my1yjTsWo28N3PJYUEO8wFQ==}
+  '@dotenvx/dotenvx@1.65.0':
+    resolution: {integrity: sha512-v4FA/Lw3pTEloLxBqTOaYDX6MNo0Jo7lGBsPZhwnJBqRJp0AzQg1ZZNxrFsh6HVC6QWeWrfIKLn0y2eyIXaVDg==}
     hasBin: true
 
   '@drdgvhbh/postgres-error-codes@0.0.6':
@@ -1182,9 +1196,9 @@ packages:
   '@drizzle-team/brocli@0.10.2':
     resolution: {integrity: sha512-z33Il7l5dKjUgGULTqBsQBQwckHh5AbIuxhdsIxDDiZAzBOrZO6q9ogcWC65kU382AfynTfgNumVcNIjuIua6w==}
 
-  '@ecies/ciphers@0.2.5':
-    resolution: {integrity: sha512-GalEZH4JgOMHYYcYmVqnFirFsjZHeoGMDt9IxEnM9F7GRUUyUksJ7Ou53L83WHJq3RWKD3AcBpo0iQh0oMpf8A==}
-    engines: {bun: '>=1', deno: '>=2', node: '>=16'}
+  '@ecies/ciphers@0.2.6':
+    resolution: {integrity: sha512-patgsRPKGkhhoBjETV4XxD0En4ui5fbX0hzayqI3M8tvNMGUoUvmyYAIWwlxBc1KX5cturfqByYdj5bYGRpN9g==}
+    engines: {bun: '>=1', deno: '>=2.7.10', node: '>=16'}
     peerDependencies:
       '@noble/ciphers': ^1.0.0
 
@@ -1766,8 +1780,8 @@ packages:
     resolution: {integrity: sha512-Iq8QQQ/7X3Sac15oB6p0FmUg/klxQvXLeileoqrTRGJYLV+/9tubbr9ipz0GKHjmXVsgFPo/+W+2cA8eNcR+XA==}
     engines: {node: '>=12.10.0'}
 
-  '@grpc/proto-loader@0.8.0':
-    resolution: {integrity: sha512-rc1hOQtjIWGxcxpb9aHAfLpIctjEnsDehj0DAiVfBlmT84uvR0uUtN2hEi/ecvWVjXUGf5qPF4qEgiLOx1YIMQ==}
+  '@grpc/proto-loader@0.8.1':
+    resolution: {integrity: sha512-wtF6h+DY6M3YaDBPAmvuuA6jV8Sif9MjtOI5euKFWRgCDl5PeDpPsHR9u2l6St5ceY8AZgoNDww5+HvEsXFsGg==}
     engines: {node: '>=6'}
     hasBin: true
 
@@ -1951,6 +1965,12 @@ packages:
   '@juggle/resize-observer@3.4.0':
     resolution: {integrity: sha512-dfLbk+PwWvFzSxwk3n5ySL0hfBog779o8h68wK/7/APo/7cgyWp5jcXockbxdk5kFRkbeXWm4Fbi9FrdN381sA==}
 
+  '@marsidev/react-turnstile@1.5.0':
+    resolution: {integrity: sha512-Ph6mcj8u9WBDsBO7s9jKPsyRDz1sBPBJwrk+Ngx09vFInvKsQ6U6kW5amEcGq4dHOreB6DgFrOJk7/fy318YlQ==}
+    peerDependencies:
+      react: ^17.0.2 || ^18.0.0 || ^19.0
+      react-dom: ^17.0.2 || ^18.0.0 || ^19.0
+
   '@mdx-js/mdx@3.1.1':
     resolution: {integrity: sha512-f6ZO2ifpwAQIpzGWaBQT2TXxPv6z3RBzQKpVftEWN78Vl/YweF1uwussDx8ECAXVtr3Rs89fKyG9YlzUs9DyGQ==}
 
@@ -1969,6 +1989,86 @@ packages:
     peerDependencies:
       mediabunny: ^1.0.0
 
+  '@monaco-editor/loader@1.7.0':
+    resolution: {integrity: sha512-gIwR1HrJrrx+vfyOhYmCZ0/JcWqG5kbfG7+d3f/C1LXk2EvzAbHSg3MQ5lO2sMlo9izoAZ04shohfKLVT6crVA==}
+
+  '@monaco-editor/react@4.7.0':
+    resolution: {integrity: sha512-cyzXQCtO47ydzxpQtCGSQGOC8Gk3ZUeBXFAxD+CWXYFo5OqZyZUonFl0DwUlTyAfRHntBfw2p3w4s9R6oe1eCA==}
+    peerDependencies:
+      monaco-editor: '>= 0.25.0 < 1'
+      react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
+      react-dom: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
+
+  '@napi-rs/canvas-android-arm64@0.1.97':
+    resolution: {integrity: sha512-V1c/WVw+NzH8vk7ZK/O8/nyBSCQimU8sfMsB/9qeSvdkGKNU7+mxy/bIF0gTgeBFmHpj30S4E9WHMSrxXGQuVQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [android]
+
+  '@napi-rs/canvas-darwin-arm64@0.1.97':
+    resolution: {integrity: sha512-ok+SCEF4YejcxuJ9Rm+WWunHHpf2HmiPxfz6z1a/NFQECGXtsY7A4B8XocK1LmT1D7P174MzwPF9Wy3AUAwEPw==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@napi-rs/canvas-darwin-x64@0.1.97':
+    resolution: {integrity: sha512-PUP6e6/UGlclUvAQNnuXCcnkpdUou6VYZfQOQxExLp86epOylmiwLkqXIvpFmjoTEDmPmXrI+coL/9EFU1gKPA==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [darwin]
+
+  '@napi-rs/canvas-linux-arm-gnueabihf@0.1.97':
+    resolution: {integrity: sha512-XyXH2L/cic8eTNtbrXCcvqHtMX/nEOxN18+7rMrAM2XtLYC/EB5s0wnO1FsLMWmK+04ZSLN9FBGipo7kpIkcOw==}
+    engines: {node: '>= 10'}
+    cpu: [arm]
+    os: [linux]
+
+  '@napi-rs/canvas-linux-arm64-gnu@0.1.97':
+    resolution: {integrity: sha512-Kuq/M3djq0K8ktgz6nPlK7Ne5d4uWeDxPpyKWOjWDK2RIOhHVtLtyLiJw2fuldw7Vn4mhw05EZXCEr4Q76rs9w==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [linux]
+
+  '@napi-rs/canvas-linux-arm64-musl@0.1.97':
+    resolution: {integrity: sha512-kKmSkQVnWeqg7qdsiXvYxKhAFuHz3tkBjW/zyQv5YKUPhotpaVhpBGv5LqCngzyuRV85SXoe+OFj+Tv0a0QXkQ==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [linux]
+
+  '@napi-rs/canvas-linux-riscv64-gnu@0.1.97':
+    resolution: {integrity: sha512-Jc7I3A51jnEOIAXeLsN/M/+Z28LUeakcsXs07FLq9prXc0eYOtVwsDEv913Gr+06IRo34gJJVgT0TXvmz+N2VA==}
+    engines: {node: '>= 10'}
+    cpu: [riscv64]
+    os: [linux]
+
+  '@napi-rs/canvas-linux-x64-gnu@0.1.97':
+    resolution: {integrity: sha512-iDUBe7AilfuBSRbSa8/IGX38Mf+iCSBqoVKLSQ5XaY2JLOaqz1TVyPFEyIck7wT6mRQhQt5sN6ogfjIDfi74tg==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [linux]
+
+  '@napi-rs/canvas-linux-x64-musl@0.1.97':
+    resolution: {integrity: sha512-AKLFd/v0Z5fvgqBDqhvqtAdx+fHMJ5t9JcUNKq4FIZ5WH+iegGm8HPdj00NFlCSnm83Fp3Ln8I2f7uq1aIiWaA==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [linux]
+
+  '@napi-rs/canvas-win32-arm64-msvc@0.1.97':
+    resolution: {integrity: sha512-u883Yr6A6fO7Vpsy9YE4FVCIxzzo5sO+7pIUjjoDLjS3vQaNMkVzx5bdIpEL+ob+gU88WDK4VcxYMZ6nmnoX9A==}
+    engines: {node: '>= 10'}
+    cpu: [arm64]
+    os: [win32]
+
+  '@napi-rs/canvas-win32-x64-msvc@0.1.97':
+    resolution: {integrity: sha512-sWtD2EE3fV0IzN+iiQUqr/Q1SwqWhs2O1FKItFlxtdDkikpEj5g7DKQpY3x55H/MAOnL8iomnlk3mcEeGiUMoQ==}
+    engines: {node: '>= 10'}
+    cpu: [x64]
+    os: [win32]
+
+  '@napi-rs/canvas@0.1.97':
+    resolution: {integrity: sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==}
+    engines: {node: '>= 10'}
+
   '@napi-rs/wasm-runtime@0.2.12':
     resolution: {integrity: sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==}
 
@@ -2072,6 +2172,10 @@ packages:
     resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==}
     engines: {node: '>=8.0.0'}
 
+  '@opentelemetry/api@1.9.1':
+    resolution: {integrity: sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==}
+    engines: {node: '>=8.0.0'}
+
   '@opentelemetry/auto-instrumentations-node@0.62.2':
     resolution: {integrity: sha512-Ipe6X7ddrCiRsuewyTU83IvKiSFT4piqmv9z8Ovg1E7v98pdTj1pUE6sDrHV50zl7/ypd+cONBgt+EYSZu4u9Q==}
     engines: {node: ^18.19.0 || >=20.6.0}
@@ -2085,8 +2189,8 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
-  '@opentelemetry/context-async-hooks@2.6.0':
-    resolution: {integrity: sha512-L8UyDwqpTcbkIK5cgwDRDYDoEhQoj8wp8BwsO19w3LB1Z41yEQm2VJyNfAi9DrLP/YTqXqWpKHyZfR9/tFYo1Q==}
+  '@opentelemetry/context-async-hooks@2.7.1':
+    resolution: {integrity: sha512-OPFBYuXEn1E4ja3Y6eeA7O+ZnLBNcXTV5Cgsn1VaqBZ6hC5FnpZPLBNme1LJY8ZtF4aOujPKFoeWN4ik487KuQ==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
@@ -2103,14 +2207,14 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
-  '@opentelemetry/core@2.5.1':
-    resolution: {integrity: sha512-Dwlc+3HAZqpgTYq0MUyZABjFkcrKTePwuiFVLjahGD8cx3enqihmpAmdgNFO1R4m/sIe5afjJrA25Prqy4NXlA==}
+  '@opentelemetry/core@2.6.0':
+    resolution: {integrity: sha512-HLM1v2cbZ4TgYN6KEOj+Bbj8rAKriOdkF9Ed3tG25FoprSiQl7kYc+RRT6fUZGOvx0oMi5U67GoFdT+XUn8zEg==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
-  '@opentelemetry/core@2.6.0':
-    resolution: {integrity: sha512-HLM1v2cbZ4TgYN6KEOj+Bbj8rAKriOdkF9Ed3tG25FoprSiQl7kYc+RRT6fUZGOvx0oMi5U67GoFdT+XUn8zEg==}
+  '@opentelemetry/core@2.7.1':
+    resolution: {integrity: sha512-QAqIj32AtK6+pEVNG7EOVxHdE06RP+FM5qpiEJ4RtDcFIqKUZHYhl7/7UY5efhwmwNAg7j8QbJVBLxMerc0+gw==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
@@ -2475,8 +2579,8 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
-  '@opentelemetry/redis-common@0.38.2':
-    resolution: {integrity: sha512-1BCcU93iwSRZvDAgwUxC/DV4T/406SkMfxGqu5ojc3AvNI+I9GhV7v0J1HljsczuuhcnFLYqD5VmwVXfCGHzxA==}
+  '@opentelemetry/redis-common@0.38.3':
+    resolution: {integrity: sha512-VCghU1JYs/4gP6Gqf/xro9MEsZ7LrMv2uONVsaESKL38ZOB9BqnI98FfS23wjMnHlpuE+TTaWSoAVNpTwYXzjw==}
     engines: {node: ^18.19.0 || >=20.6.0}
 
   '@opentelemetry/resource-detector-alibaba-cloud@0.31.11':
@@ -2485,8 +2589,8 @@ packages:
     peerDependencies:
       '@opentelemetry/api': ^1.0.0
 
-  '@opentelemetry/resource-detector-aws@2.13.0':
-    resolution: {integrity: sha512-ZPCn7gZhGqUYUoD+RCHIlayoHBMaJaEjfqlgz2EPKoXJ4y7Ru7CUm+Tm3yJVMKF92cN9xUQR0j5KALyF0fg9aw==}
+  '@opentelemetry/resource-detector-aws@2.17.0':
+    resolution: {integrity: sha512-IpeUtIdKW5PQsO2Ylfvxqr5ZawoObKigzQtEbT96NxSs7gej8Fp4LQZuooGAGu5H+qAI8/wjCZWLlJk2FT1Uzg==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': ^1.0.0
@@ -2521,14 +2625,14 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.3.0 <1.10.0'
 
-  '@opentelemetry/resources@2.5.1':
-    resolution: {integrity: sha512-BViBCdE/GuXRlp9k7nS1w6wJvY5fnFX5XvuEtWsTAOQFIO89Eru7lGW3WbfbxtCuZ/GbrJfAziXG0w0dpxL7eQ==}
+  '@opentelemetry/resources@2.6.0':
+    resolution: {integrity: sha512-D4y/+OGe3JSuYUCBxtH5T9DSAWNcvCb/nQWIga8HNtXTVPQn59j0nTBAgaAXxUVBDl40mG3Tc76b46wPlZaiJQ==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': '>=1.3.0 <1.10.0'
 
-  '@opentelemetry/resources@2.6.0':
-    resolution: {integrity: sha512-D4y/+OGe3JSuYUCBxtH5T9DSAWNcvCb/nQWIga8HNtXTVPQn59j0nTBAgaAXxUVBDl40mG3Tc76b46wPlZaiJQ==}
+  '@opentelemetry/resources@2.7.1':
+    resolution: {integrity: sha512-DeT6KKolmC4e/dRQvMQ/RwlnzhaqeiFOXY5ngoOPJ07GgVVKxZOg9EcrNZb5aTzUn+iCrJldAgOfQm1O/QfPAQ==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': '>=1.3.0 <1.10.0'
@@ -2557,6 +2661,12 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.9.0 <1.10.0'
 
+  '@opentelemetry/sdk-metrics@2.7.1':
+    resolution: {integrity: sha512-MpDJdkiFDs3Pm1RHO3KByuZbuBdJEXEAkiC0+yJdsZGVCdf1RpHR6n+LHDcS7ffmfrt5kVCzJSCfm4z2C7v0uQ==}
+    engines: {node: ^18.19.0 || >=20.6.0}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.9.0 <1.10.0'
+
   '@opentelemetry/sdk-node@0.203.0':
     resolution: {integrity: sha512-zRMvrZGhGVMvAbbjiNQW3eKzW/073dlrSiAKPVWmkoQzah9wfynpVPeL55f9fVIm0GaBxTLcPeukWGy0/Wj7KQ==}
     engines: {node: ^18.19.0 || >=20.6.0}
@@ -2575,8 +2685,8 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.3.0 <1.10.0'
 
-  '@opentelemetry/sdk-trace-base@2.6.0':
-    resolution: {integrity: sha512-g/OZVkqlxllgFM7qMKqbPV9c1DUPhQ7d4n3pgZFcrnrNft9eJXZM2TNHTPYREJBrtNdRytYyvwjgL5geDKl3EQ==}
+  '@opentelemetry/sdk-trace-base@2.7.1':
+    resolution: {integrity: sha512-NAYIlsF8MPUsKqJMiDQJTMPOmlbawC1Iz/omMLygZ1C9am8fTKYjTaI+OZM+WTY3t3Glo0wnOg/6/pac6RGPPw==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': '>=1.3.0 <1.10.0'
@@ -2587,8 +2697,8 @@ packages:
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
 
-  '@opentelemetry/sdk-trace-node@2.6.0':
-    resolution: {integrity: sha512-YhswtasmsbIGEFvLGvR9p/y3PVRTfFf+mgY8van4Ygpnv4sA3vooAjvh+qAn9PNWxs4/IwGGqiQS0PPsaRJ0vQ==}
+  '@opentelemetry/sdk-trace-node@2.7.1':
+    resolution: {integrity: sha512-pCpQxU68lV+I9s9svqMyVu5iHdDDUnqUpSxqwyCU8A9ejEsSnMPCbearwsUO4yk08ZJzAIUCFuReMdVQvHrdvg==}
     engines: {node: ^18.19.0 || >=20.6.0}
     peerDependencies:
       '@opentelemetry/api': '>=1.0.0 <1.10.0'
@@ -2597,6 +2707,10 @@ packages:
     resolution: {integrity: sha512-R5R9tb2AXs2IRLNKLBJDynhkfmx7mX0vi8NkhZb3gUkPWHn6HXk5J8iQ/dql0U3ApfWym4kXXmBDRGO+oeOfjg==}
     engines: {node: '>=14'}
 
+  '@opentelemetry/semantic-conventions@1.40.0':
+    resolution: {integrity: sha512-cifvXDhcqMwwTlTK04GBNeIe7yyo28Mfby85QXFe1Yk8nmi36Ab/5UQwptOx84SsoGNRg+EVSjwzfSZMy6pmlw==}
+    engines: {node: '>=14'}
+
   '@opentelemetry/sql-common@0.41.2':
     resolution: {integrity: sha512-4mhWm3Z8z+i508zQJ7r6Xi7y4mmoJpdvH0fZPFRkWrdp5fq7hhZ2HhYokEOLkfqSMgPR4Z9EyB3DBkbKGOqZiQ==}
     engines: {node: ^18.19.0 || >=20.6.0}
@@ -2607,6 +2721,120 @@ packages:
     resolution: {integrity: sha512-a61ljmRVVyG5MC/698C8/FfFDw5a8LOIvyOLW5fztgUXqUpc1jOfQzOitSCbge657OgXXThmY3Tk8fpiDb4UcA==}
     engines: {node: '>= 20.0.0'}
 
+  '@oxfmt/binding-android-arm-eabi@0.45.0':
+    resolution: {integrity: sha512-A/UMxFob1fefCuMeGxQBulGfFE38g2Gm23ynr3u6b+b7fY7/ajGbNsa3ikMIkGMLJW/TRoQaMoP1kME7S+815w==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm]
+    os: [android]
+
+  '@oxfmt/binding-android-arm64@0.45.0':
+    resolution: {integrity: sha512-L63z4uZmHjgvvqvMJD7mwff8aSBkM0+X4uFr6l6U5t6+Qc9DCLVZWIunJ7Gm4fn4zHPdSq6FFQnhu9yqqobxIg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [android]
+
+  '@oxfmt/binding-darwin-arm64@0.45.0':
+    resolution: {integrity: sha512-UV34dd623FzqT+outIGndsCA/RBB+qgB3XVQhgmmJ9PJwa37NzPC9qzgKeOhPKxVk2HW+JKldQrVL54zs4Noww==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@oxfmt/binding-darwin-x64@0.45.0':
+    resolution: {integrity: sha512-pMNJv0CMa1pDefVPeNbuQxibh8ITpWDFEhMC/IBB9Zlu76EbgzYwrzI4Cb11mqX2+rIYN70UTrh3z06TM59ptQ==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [darwin]
+
+  '@oxfmt/binding-freebsd-x64@0.45.0':
+    resolution: {integrity: sha512-xTcRoxbbo61sW2+ZRPeH+vp/o9G8gkdhiVumFU+TpneiPm14c79l6GFlxPXlCE9bNWikigbsrvJw46zCVAQFfg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [freebsd]
+
+  '@oxfmt/binding-linux-arm-gnueabihf@0.45.0':
+    resolution: {integrity: sha512-hWL8Hdni+3U1mPFx1UtWeGp3tNb6EhBAUHRMbKUxVkOp3WwoJbpVO2bfUVbS4PfpledviXXNHSTl1veTa6FhkQ==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm]
+    os: [linux]
+
+  '@oxfmt/binding-linux-arm-musleabihf@0.45.0':
+    resolution: {integrity: sha512-6Blt/0OBT7vvfQpqYuYbpbFLPqSiaYpEJzUUWhinPEuADypDbtV1+LdjM0vYBNGPvnj85ex7lTerEX6JGcPt9w==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm]
+    os: [linux]
+
+  '@oxfmt/binding-linux-arm64-gnu@0.45.0':
+    resolution: {integrity: sha512-jLjoLfe+hGfjhA8hNBSdw85yCA8ePKq7ME4T+g6P9caQXvmt6IhE2X7iVjnVdkmYUWEzZrxlh4p6RkDmAMJY/A==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [linux]
+
+  '@oxfmt/binding-linux-arm64-musl@0.45.0':
+    resolution: {integrity: sha512-XQKXZIKYJC3GQJ8FnD3iMntpw69Wd9kDDK/Xt79p6xnFYlGGxSNv2vIBvRTDg5CKByWFWWZLCRDOXoP/m6YN4g==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [linux]
+
+  '@oxfmt/binding-linux-ppc64-gnu@0.45.0':
+    resolution: {integrity: sha512-+g5RiG+xOkdrCWkKodv407nTvMq4vYM18Uox2MhZBm/YoqFxxJpWKsloskFFG5NU13HGPw1wzYjjOVcyd9moCA==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [ppc64]
+    os: [linux]
+
+  '@oxfmt/binding-linux-riscv64-gnu@0.45.0':
+    resolution: {integrity: sha512-V7dXKoSyEbWAkkSF4JJNtF+NJZDmJoSarSoP30WCsB3X636Rehd3CvxBj49FIJxEBFWhvcUjGSHVeU8Erck1bQ==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [riscv64]
+    os: [linux]
+
+  '@oxfmt/binding-linux-riscv64-musl@0.45.0':
+    resolution: {integrity: sha512-Vdelft1sAEYojVGgcODEFXSWYQYlIvoyIGWebKCuUibd1tvS1TjTx413xG2ZLuHpYj45CkN/ztMLMX6jrgqpgg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [riscv64]
+    os: [linux]
+
+  '@oxfmt/binding-linux-s390x-gnu@0.45.0':
+    resolution: {integrity: sha512-RR7xKgNpqwENnK0aYCGYg0JycY2n93J0reNjHyes+I9Gq52dH95x+CBlnlAQHCPfz6FGnKA9HirgUl14WO6o7w==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [s390x]
+    os: [linux]
+
+  '@oxfmt/binding-linux-x64-gnu@0.45.0':
+    resolution: {integrity: sha512-U/QQ0+BQNSHxjuXR/utvXnQ50Vu5kUuqEomZvQ1/3mhgbBiMc2WU9q5kZ5WwLp3gnFIx9ibkveoRSe2EZubkqg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [linux]
+
+  '@oxfmt/binding-linux-x64-musl@0.45.0':
+    resolution: {integrity: sha512-o5TLOUCF0RWQjsIS06yVC+kFgp092/yLe6qBGSUvtnmTVw9gxjpdQSXc3VN5Cnive4K11HNstEZF8ROKHfDFSw==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [linux]
+
+  '@oxfmt/binding-openharmony-arm64@0.45.0':
+    resolution: {integrity: sha512-RnGcV3HgPuOjsGx/k9oyRNKmOp+NBLGzZTdPDYbc19r7NGeYPplnUU/BfU35bX2Y/O4ejvHxcfkvW2WoYL/gsg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [openharmony]
+
+  '@oxfmt/binding-win32-arm64-msvc@0.45.0':
+    resolution: {integrity: sha512-v3Vj7iKKsUFwt9w5hsqIIoErKVoENC6LoqfDlteOQ5QMDCXihlqLoxpmviUhXnNncg4zV6U9BPwlBbwa+qm4wg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [win32]
+
+  '@oxfmt/binding-win32-ia32-msvc@0.45.0':
+    resolution: {integrity: sha512-N8yotPBX6ph0H3toF4AEpdCeVPrdcSetj+8eGiZGsrLsng3bs/Q5HPu4bbSxip5GBPx5hGbGHrZwH4+rcrjhHA==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [ia32]
+    os: [win32]
+
+  '@oxfmt/binding-win32-x64-msvc@0.45.0':
+    resolution: {integrity: sha512-w5MMTRCK1dpQeRA+HHqXQXyN33DlG/N2LOYxJmaT4fJjcmZrbNnqw7SmIk7I2/a2493PPLZ+2E/Ar6t2iKVMug==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [win32]
+
   '@parcel/watcher-android-arm64@2.5.6':
     resolution: {integrity: sha512-YQxSS34tPF/6ZG7r/Ih9xy+kP/WwediEUsqmtf0cuCV5TPPKw/PQHRhueUo6JdeFJaqV3pyjm0GdYjZotbRt/A==}
     engines: {node: '>= 10.0.0'}
@@ -2828,6 +3056,11 @@ packages:
       react: '>=18.0.0'
       react-dom: '>=18.0.0'
 
+  '@playwright/test@1.59.1':
+    resolution: {integrity: sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==}
+    engines: {node: '>=18'}
+    hasBin: true
+
   '@postgresql-typed/oids@0.2.0':
     resolution: {integrity: sha512-jh1nIP/nmtlZkj1t0cO2NC2lFHg/fXQhtRFsL70Rh/5ELp5fqxH/calwPVTkS8gPae1k/PTqQYbU23E+Q2q0rg==}
     engines: {node: '>=16', pnpm: '>=8.6.0'}
@@ -2857,6 +3090,9 @@ packages:
   '@protobufjs/codegen@2.0.4':
     resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==}
 
+  '@protobufjs/codegen@2.0.5':
+    resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==}
+
   '@protobufjs/eventemitter@1.1.0':
     resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==}
 
@@ -2869,6 +3105,9 @@ packages:
   '@protobufjs/inquire@1.1.0':
     resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==}
 
+  '@protobufjs/inquire@1.1.1':
+    resolution: {integrity: sha512-mnzgDV26ueAvk7rsbt9L7bE0SuAoqyuys/sMMrmVcN5x9VsxpcG3rqAUSgDyLp0UZlmNfIbQ4fHfCtreVBk8Ew==}
+
   '@protobufjs/path@1.1.2':
     resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
 
@@ -2878,6 +3117,9 @@ packages:
   '@protobufjs/utf8@1.1.0':
     resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==}
 
+  '@protobufjs/utf8@1.1.1':
+    resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==}
+
   '@radix-ui/number@1.1.1':
     resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==}
 
@@ -3925,23 +4167,26 @@ packages:
     resolution: {integrity: sha512-TfjMTQp9cNNqNtHFfa+XHEGdA7NnmDRu+ZJH4YF3dso0Xk/b9DMhg/sl+b6CR4ThFZArXXDsG1j8Mwl34wcOZQ==}
     engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
 
-  '@rocicorp/zero-sqlite3@1.0.15':
-    resolution: {integrity: sha512-o4ezzTgNQvOM23X/mBRN4LAo0i7XkIDAL3Obg7FaJLFTuwJyYwqE45aqk9vg8/0mlBw2U+DEUGLP1q/Pf+VgYQ==}
-    engines: {bun: '>=1.1.0', node: 20.x || 22.x || 23.x || 24.x}
+  '@rocicorp/zero-sqlite3@1.0.18':
+    resolution: {integrity: sha512-JwHcCijxKj94NDij5UDCJsGHo/D8z4j5De/5zphQ+NctQ4TWr9Zx7L+Q1JBfie4ewVS82Ingu+QKbIwWvdNFXg==}
+    engines: {bun: '>=1.1.0', node: 20.x || 22.x || 23.x || 24.x || 25.x}
     hasBin: true
 
-  '@rocicorp/zero@0.26.2':
-    resolution: {integrity: sha512-fq67gwxvV3rWx/QeKTXu6ab2CLqR7O9GrcZPbI5bCBxHgRB2S+VSxEvcbEtYDjyZvI3GWmisa5UdsYN3H2vYsA==}
+  '@rocicorp/zero@1.4.0':
+    resolution: {integrity: sha512-BRgdF64JWNgIsHG4Fajgjr5ms0HBTdmZUWoJy09KE3TNwMo0Rmz1r1fte1MMH1zY4witcUJsFhGj4aHLsZAfTA==}
     engines: {node: '>=22'}
     hasBin: true
     peerDependencies:
       '@op-engineering/op-sqlite': '>=15'
       expo-sqlite: '>=15'
+      kysely: ^0.28.16
     peerDependenciesMeta:
       '@op-engineering/op-sqlite':
         optional: true
       expo-sqlite:
         optional: true
+      kysely:
+        optional: true
 
   '@rollup/rollup-android-arm-eabi@4.59.0':
     resolution: {integrity: sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==}
@@ -4287,8 +4532,8 @@ packages:
     peerDependencies:
       react: '>= 16'
 
-  '@tabler/icons@3.37.1':
-    resolution: {integrity: sha512-neLCWkuyNHEPXCyYu6nbN4S3g/59BTa4qyITAugYVpq1YzYNDOZooW7/vRWH98ZItXAudxdKU8muFT7y1PqzuA==}
+  '@tabler/icons@3.41.1':
+    resolution: {integrity: sha512-OaRnVbRmH2nHtFeg+RmMJ/7m2oBIF9XCJAUD5gQnMrpK9f05ydj8MZrAf3NZQqOXyxGN1UBL0D5IKLLEUfr74Q==}
 
   '@tailwindcss/node@4.2.1':
     resolution: {integrity: sha512-jlx6sLk4EOwO6hHe1oCGm1Q4AN/s0rSrTTPBGPM0/RQ6Uylwq17FuU8IeJJKEjtc6K6O07zsvP+gDO6MMWo7pg==}
@@ -4729,6 +4974,7 @@ packages:
   '@xmldom/xmldom@0.8.11':
     resolution: {integrity: sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw==}
     engines: {node: '>=10.0.0'}
+    deprecated: this version has critical issues, please update to the latest version
 
   abstract-logging@2.0.1:
     resolution: {integrity: sha512-2BjRTZxTPvheOvGbBslFSYOUkr+SjPtOnrLP33f+VIWLzezQpZcqVg7ja3L4dBXmzzgwT+a029jRx5PCi3JuiA==}
@@ -4781,8 +5027,8 @@ packages:
   ajv@6.14.0:
     resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==}
 
-  ajv@8.18.0:
-    resolution: {integrity: sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==}
+  ajv@8.20.0:
+    resolution: {integrity: sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==}
 
   ansi-regex@5.0.1:
     resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
@@ -5108,8 +5354,8 @@ packages:
     resolution: {integrity: sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==}
     engines: {node: '>= 12'}
 
-  compare-utf8@0.1.1:
-    resolution: {integrity: sha512-bND8Irz+KrF96w4Tkm1m8u5q8iE2fnvP196sHGy7XNrGNXlhyl07VnsCRYrXgEhhf/lM7hyCKRnMeh8Icis4Sw==}
+  compare-utf8@0.2.0:
+    resolution: {integrity: sha512-5BjVs4Be9AwMm4le3Qb3crp44OAn+uuJhvrjPWA8Es/DfVPq5lSc8JaRg/clrrHOMCU+CdDfv/9eBSPh6OOiUQ==}
 
   compute-scroll-into-view@3.1.1:
     resolution: {integrity: sha512-VRhuHOLoKYOy4UbilLbUzbYg93XLjv2PncJC50EuTWPA3gaja1UjBsUP/D/9/juV3vQFr6XBEzn9KCAHdUvOHw==}
@@ -5254,8 +5500,8 @@ packages:
     resolution: {integrity: sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==}
     engines: {node: '>= 0.4'}
 
-  defu@6.1.4:
-    resolution: {integrity: sha512-mEQCMmwJu317oSz8CwdIOdwf3xMif1ttiM8LTufzc3g6kR+9Pe236twL8j3IYT1F7GfRgGcW6MWxzZjLIkuHIg==}
+  defu@6.1.7:
+    resolution: {integrity: sha512-7z22QmUWiQ/2d0KkdYmANbRUVABpZ9SNYyH5vx6PZ+nE5bcC0l7uFvEfHlyld/HcGBFTL536ClDt3DEcSlEJAQ==}
 
   dequal@2.0.3:
     resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==}
@@ -5298,6 +5544,9 @@ packages:
     resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
     engines: {node: '>= 4'}
 
+  dompurify@3.2.7:
+    resolution: {integrity: sha512-WhL/YuveyGXJaerVlMYGWhvQswa7myDG17P7Vu65EWC05o8vfeNbvNf4d/BOvH99+ZW+LlQsc1GDKMa1vNK6dw==}
+
   dompurify@3.3.1:
     resolution: {integrity: sha512-qkdCKzLNtrgPFP1Vo+98FRzJnBRGe4ffyCea9IwHB1fyxPOeNTHpLKYGd4Uk9xvNoH0ZoOjwZxNptyMwqrId1Q==}
 
@@ -5695,8 +5944,8 @@ packages:
   fast-json-stable-stringify@2.1.0:
     resolution: {integrity: sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==}
 
-  fast-json-stringify@6.3.0:
-    resolution: {integrity: sha512-oRCntNDY/329HJPlmdNLIdogNtt6Vyjb1WuT01Soss3slIdyUp8kAcDU3saQTOquEK8KFVfwIIF7FebxUAu+yA==}
+  fast-json-stringify@6.4.0:
+    resolution: {integrity: sha512-ibRCQ0GZKJIQ+P3Et1h0LhPgp3PMTYk0MH8O+kW3lNYsvmaQww5Nn3f1jf73Q0jR1Yz3a1CDP4/NZD3vOajWJQ==}
 
   fast-levenshtein@2.0.6:
     resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==}
@@ -5704,14 +5953,14 @@ packages:
   fast-querystring@1.1.2:
     resolution: {integrity: sha512-g6KuKWmFXc0fID8WWH0jit4g0AGBoJhCkJMb1RmbsSEUNvQ+ZC8D6CUZ+GtF8nMzSPXnhiePyyqqipzNNEnHjg==}
 
-  fast-uri@3.1.0:
-    resolution: {integrity: sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==}
+  fast-uri@3.1.2:
+    resolution: {integrity: sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==}
 
   fastify-plugin@5.1.0:
     resolution: {integrity: sha512-FAIDA8eovSt5qcDgcBvDuX/v0Cjz0ohGhENZ/wpc3y+oZCY2afZ9Baqql3g/lC+OHRnciQol4ww7tuthOb9idw==}
 
-  fastify@5.8.4:
-    resolution: {integrity: sha512-sa42J1xylbBAYUWALSBoyXKPDUvM3OoNOibIefA+Oha57FryXKKCZarA1iDntOCWp3O35voZLuDg2mdODXtPzQ==}
+  fastify@5.8.5:
+    resolution: {integrity: sha512-Yqptv59pQzPgQUSIm87hMqHJmdkb1+GPxdE6vW6FRyVE9G86mt7rOghitiU4JHRaTyDUk9pfeKmDeu70lAwM4Q==}
 
   fastq@1.20.1:
     resolution: {integrity: sha512-GGToxJ/w1x32s/D2EKND7kTil4n8OVk/9mycTc4VDza13lOvpUZTGX3mFSCtV9ksdGBVzvsyAVLM6mHFThxXxw==}
@@ -5746,8 +5995,8 @@ packages:
     resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==}
     engines: {node: '>=8'}
 
-  find-my-way@9.5.0:
-    resolution: {integrity: sha512-VW2RfnmscZO5KgBY5XVyKREMW5nMZcxDy+buTOsL+zIPnBlbKm+00sgzoQzq1EVh4aALZLfKdwv6atBGcjvjrQ==}
+  find-my-way@9.6.0:
+    resolution: {integrity: sha512-Zf4Xve4RymLl7NgaavNebZ01joJ8MfVerOG43wy7SHLO+r+K0C6d/SE0BiR7AV5V1VOCFlOP7ecdo+I4qmiHrQ==}
     engines: {node: '>=20'}
 
   find-replace@5.0.2:
@@ -5805,6 +6054,11 @@ packages:
   fs-constants@1.0.0:
     resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==}
 
+  fsevents@2.3.2:
+    resolution: {integrity: sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==}
+    engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
+    os: [darwin]
+
   fsevents@2.3.3:
     resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==}
     engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
@@ -5924,6 +6178,9 @@ packages:
   functions-have-names@1.2.3:
     resolution: {integrity: sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ==}
 
+  fuzzy-search@3.2.1:
+    resolution: {integrity: sha512-vAcPiyomt1ioKAsAL2uxSABHJ4Ju/e4UeDM+g1OlR0vV4YhLGMNsdLNvZTpEDY4JCSt0E4hASCNM5t2ETtsbyg==}
+
   gaxios@6.7.1:
     resolution: {integrity: sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==}
     engines: {node: '>=14'}
@@ -6032,6 +6289,10 @@ packages:
     resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
     engines: {node: '>= 0.4'}
 
+  hasown@2.0.3:
+    resolution: {integrity: sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg==}
+    engines: {node: '>= 0.4'}
+
   hast-util-from-dom@5.0.1:
     resolution: {integrity: sha512-N+LqofjR2zuzTjCPzyDUdSshy4Ma6li7p/c3pA78uTwzFgENbgbUrm2ugwsOdcjI1muO+o6Dgzp9p8WHtn/39Q==}
 
@@ -6183,8 +6444,8 @@ packages:
   intl-messageformat@11.1.2:
     resolution: {integrity: sha512-ucSrQmZGAxfiBHfBRXW/k7UC8MaGFlEj4Ry1tKiDcmgwQm1y3EDl40u+4VNHYomxJQMJi9NEI3riDRlth96jKg==}
 
-  ipaddr.js@2.3.0:
-    resolution: {integrity: sha512-Zv/pA+ciVFbCSBBjGfaKUya/CcGmUHzTydLMaTwrUUEM2DIEO3iZvueGxmacvmN50fGpGVKeTXpb2LcYQxeVdg==}
+  ipaddr.js@2.4.0:
+    resolution: {integrity: sha512-9VGk3HGanVE6JoZXHiCpnGy5X0jYDnN4EA4lntFPj+1vIWlFhIylq2CrrCOJH9EAhc5CYhq18F2Av2tgoAPsYQ==}
     engines: {node: '>= 10'}
 
   is-alphabetical@1.0.4:
@@ -6233,6 +6494,10 @@ packages:
     resolution: {integrity: sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==}
     engines: {node: '>= 0.4'}
 
+  is-core-module@2.16.2:
+    resolution: {integrity: sha512-evOr8xfXKxE6qSR0hSXL2r3sd7ALj8+7jQEUvPYcm5sgZFdJ+AYzT6yNmJenvIYQBgIGwfwz08sL8zoL7yq2BA==}
+    engines: {node: '>= 0.4'}
+
   is-data-view@1.0.2:
     resolution: {integrity: sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw==}
     engines: {node: '>= 0.4'}
@@ -6672,6 +6937,11 @@ packages:
   markdown-table@3.0.4:
     resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==}
 
+  marked@14.0.0:
+    resolution: {integrity: sha512-uIj4+faQ+MgHgwUW1l2PsPglZLOLOT1uErt06dAPtx2kjteLAkbsd/0FiYg/MGS+i7ZKLb7w2WClxHkzOOuryQ==}
+    engines: {node: '>= 18'}
+    hasBin: true
+
   marked@15.0.12:
     resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
     engines: {node: '>= 18'}
@@ -6892,6 +7162,9 @@ packages:
   module-details-from-path@1.0.4:
     resolution: {integrity: sha512-EGWKgxALGMgzvxYF1UyGTy0HXX/2vHLkw6+NvDKW2jypWbHpjQuj4UMcqQWXHERJhVGKikolT06G3bcKe4fi7w==}
 
+  monaco-editor@0.55.1:
+    resolution: {integrity: sha512-jz4x+TJNFHwHtwuV9vA9rMujcZRb0CEilTEwG2rRSpe/A7Jdkuj8xPKttCgOh+v/lkHy7HsZ64oj+q3xoAFl9A==}
+
   motion-dom@12.34.3:
     resolution: {integrity: sha512-sYgFe+pR9aIM7o4fhs2aXtOI+oqlUd33N9Yoxcgo1Fv7M20sRkHtCmzE/VRNIcq7uNJ+qio+Xubt1FXH3pQ+eQ==}
 
@@ -6924,8 +7197,8 @@ packages:
     engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1}
     hasBin: true
 
-  nanoid@5.1.6:
-    resolution: {integrity: sha512-c7+7RQ+dMB5dPwwCp4ee1/iV/q2P6aK1mTZcfr1BTuVlyW9hJYiMPybJCcnBlQtuSmTIWNeazm/zqNoZSSElBg==}
+  nanoid@5.1.11:
+    resolution: {integrity: sha512-v+KEsUv2ps74PaSKv0gHTxTCgMXOIfBEbaqa6w6ISIGC7ZsvHN4N9oJ8d4cmf0n5oTzQz2SLmThbQWhjd/8eKg==}
     engines: {node: ^18 || >=20}
     hasBin: true
 
@@ -6992,8 +7265,8 @@ packages:
   no-case@3.0.4:
     resolution: {integrity: sha512-fgAN3jGAh+RoxUGZHTSOLJIqUc2wmoBwGR4tbpNAKmmovFoWq0OdRkb0VkldReO2a2iBT/OEulG9XSUc10r3zg==}
 
-  node-abi@3.89.0:
-    resolution: {integrity: sha512-6u9UwL0HlAl21+agMN3YAMXcKByMqwGx+pq+P76vii5f7hTPtKDp08/H9py6DY+cfDw7kQNTGEj/rly3IgbNQA==}
+  node-abi@3.92.0:
+    resolution: {integrity: sha512-KdHvFWZjEKDf0cakgFjebl371GPsISX2oZHcuyKqM7DtogIsHrqKeLTo8wBHxaXRAQlY2PsPlZmfo+9ZCxEREQ==}
     engines: {node: '>=10'}
 
   node-addon-api@7.1.1:
@@ -7012,6 +7285,9 @@ packages:
       encoding:
         optional: true
 
+  node-readable-to-web-readable-stream@0.4.2:
+    resolution: {integrity: sha512-/cMZNI34v//jUTrI+UIo4ieHAB5EZRY/+7OmXZgBxaWBMcW2tGdceIw06RFxWxrKZ5Jp3sI2i5TsRo+CBhtVLQ==}
+
   node-releases@2.0.27:
     resolution: {integrity: sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==}
 
@@ -7101,6 +7377,11 @@ packages:
     resolution: {integrity: sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==}
     engines: {node: '>= 0.4'}
 
+  oxfmt@0.45.0:
+    resolution: {integrity: sha512-0o/COoN9fY50bjVeM7PQsNgbhndKurBIeTIcspW033OumksjJJmIVDKjAk5HMwU/GHTxSOdGDdhJ6BRzGPmsHg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    hasBin: true
+
   p-limit@3.1.0:
     resolution: {integrity: sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==}
     engines: {node: '>=10'}
@@ -7153,6 +7434,10 @@ packages:
     resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==}
     engines: {node: '>=8'}
 
+  pdfjs-dist@5.6.205:
+    resolution: {integrity: sha512-tlUj+2IDa7G1SbvBNN74UHRLJybZDWYom+k6p5KIZl7huBvsA4APi6mKL+zCxd3tLjN5hOOEE9Tv7VdzO88pfg==}
+    engines: {node: '>=20.19.0 || >=22.13.0 || >=24'}
+
   performance-now@2.1.0:
     resolution: {integrity: sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow==}
 
@@ -7178,6 +7463,9 @@ packages:
   pg-protocol@1.11.0:
     resolution: {integrity: sha512-pfsxk2M9M3BuGgDOfuy37VNRRX3jmKgMjcvAcWqNDpZSf4cUmv8HSOl5ViRQFsfARFn0KuUQTgLxVMbNq5NW3g==}
 
+  pg-protocol@1.13.0:
+    resolution: {integrity: sha512-zzdvXfS6v89r6v7OcFCHfHlyG/wvry1ALxZo4LqgUoy7W9xhBDMaqOuMiF3qEV45VqsN6rdlcehHrfDtlCPc8w==}
+
   pg-types@2.2.0:
     resolution: {integrity: sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==}
     engines: {node: '>=4'}
@@ -7205,6 +7493,10 @@ packages:
     resolution: {integrity: sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==}
     engines: {node: '>=12'}
 
+  picomatch@4.0.4:
+    resolution: {integrity: sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==}
+    engines: {node: '>=12'}
+
   pino-abstract-transport@3.0.0:
     resolution: {integrity: sha512-wlfUczU+n7Hy/Ha5j9a/gZNy7We5+cXp8YL+X+PG8S0KXxw7n/JXA3c46Y0zQznIJ83URJiwy7Lh56WLokNuxg==}
 
@@ -7221,6 +7513,16 @@ packages:
       react: '>=18.0.0'
       react-dom: '>=18.0.0'
 
+  playwright-core@1.59.1:
+    resolution: {integrity: sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==}
+    engines: {node: '>=18'}
+    hasBin: true
+
+  playwright@1.59.1:
+    resolution: {integrity: sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==}
+    engines: {node: '>=18'}
+    hasBin: true
+
   po-parser@2.1.1:
     resolution: {integrity: sha512-ECF4zHLbUItpUgE3OTtLKlPjeBN+fKEczj2zYjDfCGOzicNs0GK3Vg2IoAYwx7LH/XYw43fZQP6xnZ4TkNxSLQ==}
 
@@ -7291,11 +7593,6 @@ packages:
     resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==}
     engines: {node: '>= 0.8.0'}
 
-  prettier@3.8.1:
-    resolution: {integrity: sha512-UOnG6LftzbdaHZcKoPFtOcCKztrQ57WkHDeRD9t/PTQtmT0NHSeWWepj6pS0z/N7+08BHFDQVUrfmfMRcZwbMg==}
-    engines: {node: '>=14'}
-    hasBin: true
-
   prismjs@1.27.0:
     resolution: {integrity: sha512-t13BGPUlFDR7wRB5kQDG4jjl7XeuH6jbJGt11JHPL96qwsEHNX2+68tFXqc1/k+/jALsbSWJKUOT/hcYAZ5LkA==}
     engines: {node: '>=6'}
@@ -7326,8 +7623,12 @@ packages:
   property-information@7.1.0:
     resolution: {integrity: sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==}
 
-  protobufjs@7.5.4:
-    resolution: {integrity: sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg==}
+  protobufjs@7.5.5:
+    resolution: {integrity: sha512-3wY1AxV+VBNW8Yypfd1yQY9pXnqTAN+KwQxL8iYm3/BjKYMNg4i0owhEe26PWDOMaIrzeeF98Lqd5NGz4omiIg==}
+    engines: {node: '>=12.0.0'}
+
+  protobufjs@7.5.7:
+    resolution: {integrity: sha512-NGnrxS/nLKUo5nkbVQxlC71sB4hdfImdYIbFeSCidxtwATx0AHRPcANSLd0q5Bb2BkoSWo2iisQhGg5/r+ihbA==}
     engines: {node: '>=12.0.0'}
 
   proxy-compare@2.6.0:
@@ -7548,6 +7849,9 @@ packages:
     resolution: {integrity: sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==}
     engines: {node: '>= 12.13.0'}
 
+  real-require@1.0.0:
+    resolution: {integrity: sha512-P4nbQYQfePJxRSmY+v/KINxVucm4NF3p3s7pJveMTtom52FR4YGltUQLB8idDXwDDWW+eYrWDFbuzUnjoWHF7g==}
+
   recma-build-jsx@1.0.0:
     resolution: {integrity: sha512-8GtdyqaBcDfva+GUKDr3nev3VpKAhup1+RvkMvUxURHpW7QyIvk9F5wz7Vzo06CEMSilw6uArgRqhpiUcWp8ew==}
 
@@ -7672,6 +7976,11 @@ packages:
     engines: {node: '>= 0.4'}
     hasBin: true
 
+  resolve@1.22.12:
+    resolution: {integrity: sha512-TyeJ1zif53BPfHootBGwPRYT1RUt6oGWsaQr8UyZW/eAm9bKoijtvruSDEmZHm92CwS9nj7/fWttqPCgzep8CA==}
+    engines: {node: '>= 0.4'}
+    hasBin: true
+
   resolve@2.0.0-next.6:
     resolution: {integrity: sha512-3JmVl5hMGtJ3kMmB3zi3DL25KfkCEyy3Tw7Gmw7z5w8M9WlwoPFnIvwChzu1+cF3iaK3sp18hhPz8ANeimdJfA==}
     engines: {node: '>= 0.4'}
@@ -7703,6 +8012,9 @@ packages:
   safe-buffer@5.1.2:
     resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==}
 
+  safe-buffer@5.2.1:
+    resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
+
   safe-push-apply@1.0.0:
     resolution: {integrity: sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==}
     engines: {node: '>= 0.4'}
@@ -7711,8 +8023,8 @@ packages:
     resolution: {integrity: sha512-x/+Cz4YrimQxQccJf5mKEbIa1NzeCRNI5Ecl/ekmlYaampdNLPalVyIcCZNNH3MvmqBugV5TMYZXv0ljslUlaw==}
     engines: {node: '>= 0.4'}
 
-  safe-regex2@5.1.0:
-    resolution: {integrity: sha512-pNHAuBW7TrcleFHsxBr5QMi/Iyp0ENjUKz7GCcX1UO7cMh+NmVK6HxQckNL1tJp1XAJVjG6B8OKIPqodqj9rtw==}
+  safe-regex2@5.1.1:
+    resolution: {integrity: sha512-mOSBvHGDZMuIEZMdOz/aCEYDCv0E7nfcNsIhUF+/P+xC7Hyf3FkvymqgPbg9D1EdSGu+uKbJgy09K/RKKc7kJA==}
     hasBin: true
 
   safe-stable-stringify@2.5.0:
@@ -7740,6 +8052,11 @@ packages:
     engines: {node: '>=10'}
     hasBin: true
 
+  semver@7.8.0:
+    resolution: {integrity: sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==}
+    engines: {node: '>=10'}
+    hasBin: true
+
   server-only@0.0.1:
     resolution: {integrity: sha512-qepMx2JxAa5jjfzxG79yPPq+8BuFToHd1hm7kI+Z4zAq1ftQiP7HcxMhDDItrbtwVeLg/cY2JnKnrcFkmiswNA==}
 
@@ -7868,6 +8185,9 @@ packages:
   stable-hash@0.0.5:
     resolution: {integrity: sha512-+L3ccpzibovGXFK+Ap/f8LOS0ahMrHTf3xu7mMLSpEGU0EO9ucaysSylKo9eRDFNhWve/y275iPmIZ4z39a9iA==}
 
+  state-local@1.0.7:
+    resolution: {integrity: sha512-HTEHMNieakEnoe33shBYcZ7NX83ACUjCu8c40iOGEZsngj9zRnkqS9j1pqQPXwobB0ZcVTk27REb7COQ0UR59w==}
+
   stop-iteration-iterator@1.1.0:
     resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==}
     engines: {node: '>= 0.4'}
@@ -7914,6 +8234,9 @@ packages:
   string_decoder@1.1.1:
     resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==}
 
+  string_decoder@1.3.0:
+    resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
+
   stringify-entities@4.0.4:
     resolution: {integrity: sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==}
 
@@ -7987,6 +8310,9 @@ packages:
   tailwind-merge@3.5.0:
     resolution: {integrity: sha512-I8K9wewnVDkL1NTGoqWmVEIlUcB9gFriAEkXkfCjX5ib8ezGxtR3xD7iZIxrfArjEsH7F1CHD4RFUtxefdqV/A==}
 
+  tailwind-merge@3.6.0:
+    resolution: {integrity: sha512-uxL7qAVQriqRQPAyK3pj66VqskWqoZ37PW94jwOTwNfq/z9oyu1V+eqrZqtR2+fCiXdYOZe/Modt8GtvqNzu+w==}
+
   tailwind-scrollbar-hide@4.0.0:
     resolution: {integrity: sha512-gobtvVcThB2Dxhy0EeYSS1RKQJ5baDFkamkhwBvzvevwX6L4XQfpZ3me9s25Ss1ecFVT5jPYJ50n+7xTBJG9WQ==}
     peerDependencies:
@@ -8014,8 +8340,8 @@ packages:
   text-segmentation@1.0.3:
     resolution: {integrity: sha512-iOiPUo/BGnZ6+54OsWxZidGCsdU8YbE4PSpdPinp7DeMtUJNJBoJ/ouUSTJjHkh1KntHaltHl/gDs2FC4i5+Nw==}
 
-  thread-stream@4.0.0:
-    resolution: {integrity: sha512-4iMVL6HAINXWf1ZKZjIPcz5wYaOdPhtO8ATvZ+Xqp3BTdaqtAwQkNmKORqcIo5YkQqGXq5cwfswDwMqqQNrpJA==}
+  thread-stream@4.1.0:
+    resolution: {integrity: sha512-Bw6h2iBDt16v6iHLChBIoVYU8CBo9GPsW8TG7h1hRVhqKhIkH6N8qkxNSmiOZTKsCLPbtWG4ViWLkU6KeKXpig==}
     engines: {node: '>=20'}
 
   throttleit@2.1.0:
@@ -8039,6 +8365,10 @@ packages:
     resolution: {integrity: sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==}
     engines: {node: '>=12.0.0'}
 
+  tinypool@2.1.0:
+    resolution: {integrity: sha512-Pugqs6M0m7Lv1I7FtxN4aoyToKg1C4tu+/381vH35y8oENM/Ai7f7C4StcoK4/+BSw9ebcS8jRiVrORFKCALLw==}
+    engines: {node: ^20.0.0 || >=22.0.0}
+
   to-regex-range@5.0.1:
     resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==}
     engines: {node: '>=8.0'}
@@ -8287,10 +8617,12 @@ packages:
 
   uuid@8.3.2:
     resolution: {integrity: sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==}
+    deprecated: uuid@10 and below is no longer supported.  For ESM codebases, update to uuid@latest.  For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028).
     hasBin: true
 
   uuid@9.0.1:
     resolution: {integrity: sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==}
+    deprecated: uuid@10 and below is no longer supported.  For ESM codebases, update to uuid@latest.  For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028).
     hasBin: true
 
   vaul@1.1.2:
@@ -8436,6 +8768,14 @@ packages:
     resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
     engines: {node: '>=10'}
 
+  yocto-spinner@1.2.0:
+    resolution: {integrity: sha512-Yw0hUB6UA3o4YUgKy3oSe9a4cxoaZ9sBfYDw+JSxo6Id0KoJGoxzPA24qqUXYKBWABs/zDSGTz9kww7t3F0XGw==}
+    engines: {node: '>=18.19'}
+
+  yoctocolors@2.1.2:
+    resolution: {integrity: sha512-CzhO+pFNo8ajLM2d2IW/R93ipy99LWjtwblvC1RsoSUMZgyLbYFr221TnSNT7GjGdYui6P459mw9JH/g/zW2ug==}
+    engines: {node: '>=18'}
+
   zod-to-json-schema@3.25.1:
     resolution: {integrity: sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA==}
     peerDependencies:
@@ -9292,8 +9632,6 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
-  '@babel/runtime@7.28.6': {}
-
   '@babel/runtime@7.29.2': {}
 
   '@babel/standalone@7.29.2': {}
@@ -9368,23 +9706,24 @@ snapshots:
 
   '@date-fns/tz@1.4.1': {}
 
-  '@dotenvx/dotenvx@1.57.2':
+  '@dotenvx/dotenvx@1.65.0':
     dependencies:
       commander: 11.1.0
       dotenv: 17.3.1
       eciesjs: 0.4.18
       execa: 5.1.1
-      fdir: 6.5.0(picomatch@4.0.3)
+      fdir: 6.5.0(picomatch@4.0.4)
       ignore: 5.3.2
       object-treeify: 1.1.33
-      picomatch: 4.0.3
+      picomatch: 4.0.4
       which: 4.0.0
+      yocto-spinner: 1.2.0
 
   '@drdgvhbh/postgres-error-codes@0.0.6': {}
 
   '@drizzle-team/brocli@0.10.2': {}
 
-  '@ecies/ciphers@0.2.5(@noble/ciphers@1.3.0)':
+  '@ecies/ciphers@0.2.6(@noble/ciphers@1.3.0)':
     dependencies:
       '@noble/ciphers': 1.3.0
 
@@ -9687,9 +10026,9 @@ snapshots:
 
   '@fastify/ajv-compiler@4.0.5':
     dependencies:
-      ajv: 8.18.0
-      ajv-formats: 3.0.1(ajv@8.18.0)
-      fast-uri: 3.1.0
+      ajv: 8.20.0
+      ajv-formats: 3.0.1(ajv@8.20.0)
+      fast-uri: 3.1.2
 
   '@fastify/cors@10.1.0':
     dependencies:
@@ -9700,7 +10039,7 @@ snapshots:
 
   '@fastify/fast-json-stringify-compiler@5.0.3':
     dependencies:
-      fast-json-stringify: 6.3.0
+      fast-json-stringify: 6.4.0
 
   '@fastify/forwarded@3.0.1': {}
 
@@ -9711,7 +10050,7 @@ snapshots:
   '@fastify/proxy-addr@5.1.0':
     dependencies:
       '@fastify/forwarded': 3.0.1
-      ipaddr.js: 2.3.0
+      ipaddr.js: 2.4.0
 
   '@fastify/websocket@11.2.0':
     dependencies:
@@ -9784,14 +10123,14 @@ snapshots:
 
   '@grpc/grpc-js@1.14.3':
     dependencies:
-      '@grpc/proto-loader': 0.8.0
+      '@grpc/proto-loader': 0.8.1
       '@js-sdsl/ordered-map': 4.4.2
 
-  '@grpc/proto-loader@0.8.0':
+  '@grpc/proto-loader@0.8.1':
     dependencies:
       lodash.camelcase: 4.3.0
       long: 5.3.2
-      protobufjs: 7.5.4
+      protobufjs: 7.5.7
       yargs: 17.7.2
 
   '@hookform/resolvers@5.2.2(react-hook-form@7.71.2(react@19.2.4))':
@@ -9930,6 +10269,11 @@ snapshots:
 
   '@juggle/resize-observer@3.4.0': {}
 
+  '@marsidev/react-turnstile@1.5.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
+    dependencies:
+      react: 19.2.4
+      react-dom: 19.2.4(react@19.2.4)
+
   '@mdx-js/mdx@3.1.1':
     dependencies:
       '@types/estree': 1.0.8
@@ -9972,6 +10316,65 @@ snapshots:
     dependencies:
       mediabunny: 1.39.2
 
+  '@monaco-editor/loader@1.7.0':
+    dependencies:
+      state-local: 1.0.7
+
+  '@monaco-editor/react@4.7.0(monaco-editor@0.55.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
+    dependencies:
+      '@monaco-editor/loader': 1.7.0
+      monaco-editor: 0.55.1
+      react: 19.2.4
+      react-dom: 19.2.4(react@19.2.4)
+
+  '@napi-rs/canvas-android-arm64@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-darwin-arm64@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-darwin-x64@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-linux-arm-gnueabihf@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-linux-arm64-gnu@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-linux-arm64-musl@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-linux-riscv64-gnu@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-linux-x64-gnu@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-linux-x64-musl@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-win32-arm64-msvc@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas-win32-x64-msvc@0.1.97':
+    optional: true
+
+  '@napi-rs/canvas@0.1.97':
+    optionalDependencies:
+      '@napi-rs/canvas-android-arm64': 0.1.97
+      '@napi-rs/canvas-darwin-arm64': 0.1.97
+      '@napi-rs/canvas-darwin-x64': 0.1.97
+      '@napi-rs/canvas-linux-arm-gnueabihf': 0.1.97
+      '@napi-rs/canvas-linux-arm64-gnu': 0.1.97
+      '@napi-rs/canvas-linux-arm64-musl': 0.1.97
+      '@napi-rs/canvas-linux-riscv64-gnu': 0.1.97
+      '@napi-rs/canvas-linux-x64-gnu': 0.1.97
+      '@napi-rs/canvas-linux-x64-musl': 0.1.97
+      '@napi-rs/canvas-win32-arm64-msvc': 0.1.97
+      '@napi-rs/canvas-win32-x64-msvc': 0.1.97
+    optional: true
+
   '@napi-rs/wasm-runtime@0.2.12':
     dependencies:
       '@emnapi/core': 1.8.1
@@ -10040,7 +10443,7 @@ snapshots:
 
   '@opentelemetry/api-logs@0.203.0':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
 
   '@opentelemetry/api-logs@0.208.0':
     dependencies:
@@ -10048,108 +10451,115 @@ snapshots:
 
   '@opentelemetry/api@1.9.0': {}
 
-  '@opentelemetry/auto-instrumentations-node@0.62.2(@opentelemetry/api@1.9.0)(@opentelemetry/core@2.6.0(@opentelemetry/api@1.9.0))':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.6.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-amqplib': 0.50.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-aws-lambda': 0.54.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-aws-sdk': 0.58.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-bunyan': 0.49.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-cassandra-driver': 0.49.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-connect': 0.47.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-cucumber': 0.19.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-dataloader': 0.21.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-dns': 0.47.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-express': 0.52.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-fastify': 0.48.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-fs': 0.23.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-generic-pool': 0.47.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-graphql': 0.51.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-grpc': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-hapi': 0.50.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-http': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-ioredis': 0.51.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-kafkajs': 0.13.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-knex': 0.48.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-koa': 0.51.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-lru-memoizer': 0.48.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-memcached': 0.47.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-mongodb': 0.56.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-mongoose': 0.50.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-mysql': 0.49.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-mysql2': 0.50.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-nestjs-core': 0.49.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-net': 0.47.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-oracledb': 0.29.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-pg': 0.56.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-pino': 0.50.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-redis': 0.52.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-restify': 0.49.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-router': 0.48.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-runtime-node': 0.17.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-socket.io': 0.50.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-tedious': 0.22.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-undici': 0.14.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation-winston': 0.48.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resource-detector-alibaba-cloud': 0.31.11(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resource-detector-aws': 2.13.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resource-detector-azure': 0.10.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resource-detector-container': 0.7.11(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resource-detector-gcp': 0.37.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-node': 0.203.0(@opentelemetry/api@1.9.0)
+  '@opentelemetry/api@1.9.1': {}
+
+  '@opentelemetry/auto-instrumentations-node@0.62.2(@opentelemetry/api@1.9.1)(@opentelemetry/core@2.7.1(@opentelemetry/api@1.9.0))':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-amqplib': 0.50.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-aws-lambda': 0.54.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-aws-sdk': 0.58.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-bunyan': 0.49.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-cassandra-driver': 0.49.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-connect': 0.47.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-cucumber': 0.19.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-dataloader': 0.21.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-dns': 0.47.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-express': 0.52.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-fastify': 0.48.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-fs': 0.23.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-generic-pool': 0.47.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-graphql': 0.51.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-grpc': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-hapi': 0.50.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-http': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-ioredis': 0.51.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-kafkajs': 0.13.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-knex': 0.48.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-koa': 0.51.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-lru-memoizer': 0.48.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-memcached': 0.47.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-mongodb': 0.56.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-mongoose': 0.50.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-mysql': 0.49.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-mysql2': 0.50.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-nestjs-core': 0.49.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-net': 0.47.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-oracledb': 0.29.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-pg': 0.56.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-pino': 0.50.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-redis': 0.52.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-restify': 0.49.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-router': 0.48.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-runtime-node': 0.17.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-socket.io': 0.50.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-tedious': 0.22.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-undici': 0.14.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation-winston': 0.48.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resource-detector-alibaba-cloud': 0.31.11(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resource-detector-aws': 2.17.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resource-detector-azure': 0.10.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resource-detector-container': 0.7.11(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resource-detector-gcp': 0.37.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-node': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - encoding
       - supports-color
 
-  '@opentelemetry/context-async-hooks@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/context-async-hooks@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
 
-  '@opentelemetry/context-async-hooks@2.6.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/context-async-hooks@2.7.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
 
-  '@opentelemetry/core@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/core@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/semantic-conventions': 1.40.0
 
   '@opentelemetry/core@2.2.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/semantic-conventions': 1.39.0
 
-  '@opentelemetry/core@2.5.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/core@2.6.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/semantic-conventions': 1.39.0
 
-  '@opentelemetry/core@2.6.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/core@2.7.1(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/semantic-conventions': 1.40.0
+
+  '@opentelemetry/core@2.7.1(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/semantic-conventions': 1.40.0
 
-  '@opentelemetry/exporter-logs-otlp-grpc@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/exporter-logs-otlp-grpc@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
       '@grpc/grpc-js': 1.14.3
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-grpc-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-grpc-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/exporter-logs-otlp-http@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/exporter-logs-otlp-http@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.1)
 
   '@opentelemetry/exporter-logs-otlp-http@0.208.0(@opentelemetry/api@1.9.0)':
     dependencies:
@@ -10160,446 +10570,446 @@ snapshots:
       '@opentelemetry/otlp-transformer': 0.208.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/sdk-logs': 0.208.0(@opentelemetry/api@1.9.0)
 
-  '@opentelemetry/exporter-logs-otlp-proto@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/exporter-logs-otlp-proto@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/exporter-metrics-otlp-grpc@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/exporter-metrics-otlp-grpc@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
       '@grpc/grpc-js': 1.14.3
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-grpc-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.0)
-
-  '@opentelemetry/exporter-metrics-otlp-http@0.203.0(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.0)
-
-  '@opentelemetry/exporter-metrics-otlp-proto@0.203.0(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.0)
-
-  '@opentelemetry/exporter-prometheus@0.203.0(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.0)
-
-  '@opentelemetry/exporter-trace-otlp-grpc@0.203.0(@opentelemetry/api@1.9.0)':
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-grpc-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/exporter-metrics-otlp-http@0.203.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/exporter-metrics-otlp-proto@0.203.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/exporter-prometheus@0.203.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/exporter-trace-otlp-grpc@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
       '@grpc/grpc-js': 1.14.3
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-grpc-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
-
-  '@opentelemetry/exporter-trace-otlp-http@0.203.0(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
-
-  '@opentelemetry/exporter-trace-otlp-proto@0.203.0(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
-
-  '@opentelemetry/exporter-zipkin@2.0.1(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
-
-  '@opentelemetry/instrumentation-amqplib@0.50.0(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-grpc-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/exporter-trace-otlp-http@0.203.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/exporter-trace-otlp-proto@0.203.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/exporter-zipkin@2.0.1(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
+
+  '@opentelemetry/instrumentation-amqplib@0.50.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-aws-lambda@0.54.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-aws-lambda@0.54.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       '@types/aws-lambda': 8.10.152
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-aws-sdk@0.58.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-aws-sdk@0.58.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-bunyan@0.49.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-bunyan@0.49.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
       '@types/bunyan': 1.8.11
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-cassandra-driver@0.49.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-cassandra-driver@0.49.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-connect@0.47.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-connect@0.47.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       '@types/connect': 3.4.38
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-cucumber@0.19.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-cucumber@0.19.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-dataloader@0.21.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-dataloader@0.21.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-dns@0.47.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-dns@0.47.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-express@0.52.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-express@0.52.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-fastify@0.48.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-fastify@0.48.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-fs@0.23.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-fs@0.23.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-generic-pool@0.47.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-generic-pool@0.47.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-graphql@0.51.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-graphql@0.51.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-grpc@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-grpc@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-hapi@0.50.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-hapi@0.50.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-http@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-http@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       forwarded-parse: 2.1.2
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-ioredis@0.51.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-ioredis@0.51.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/redis-common': 0.38.2
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/redis-common': 0.38.3
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-kafkajs@0.13.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-kafkajs@0.13.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-knex@0.48.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-knex@0.48.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-koa@0.51.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-koa@0.51.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-lru-memoizer@0.48.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-lru-memoizer@0.48.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-memcached@0.47.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-memcached@0.47.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       '@types/memcached': 2.2.10
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-mongodb@0.56.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-mongodb@0.56.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-mongoose@0.50.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-mongoose@0.50.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-mysql2@0.50.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-mysql2@0.50.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
-      '@opentelemetry/sql-common': 0.41.2(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
+      '@opentelemetry/sql-common': 0.41.2(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-mysql@0.49.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-mysql@0.49.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       '@types/mysql': 2.15.27
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-nestjs-core@0.49.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-nestjs-core@0.49.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-net@0.47.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-net@0.47.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-oracledb@0.29.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-oracledb@0.29.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       '@types/oracledb': 6.5.2
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-pg@0.56.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-pg@0.56.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
-      '@opentelemetry/sql-common': 0.41.2(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
+      '@opentelemetry/sql-common': 0.41.2(@opentelemetry/api@1.9.1)
       '@types/pg': 8.15.5
       '@types/pg-pool': 2.0.6
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-pino@0.50.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-pino@0.50.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-redis@0.52.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-redis@0.52.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/redis-common': 0.38.2
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/redis-common': 0.38.3
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-restify@0.49.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-restify@0.49.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-router@0.48.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-router@0.48.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-runtime-node@0.17.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-runtime-node@0.17.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-socket.io@0.50.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-socket.io@0.50.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-tedious@0.22.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-tedious@0.22.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       '@types/tedious': 4.0.14
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-undici@0.14.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-undici@0.14.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation-winston@0.48.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation-winston@0.48.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/instrumentation@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/instrumentation@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
       import-in-the-middle: 1.15.0
       require-in-the-middle: 7.5.2
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/otlp-exporter-base@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/otlp-exporter-base@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
 
   '@opentelemetry/otlp-exporter-base@0.208.0(@opentelemetry/api@1.9.0)':
     dependencies:
@@ -10607,24 +11017,24 @@ snapshots:
       '@opentelemetry/core': 2.2.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/otlp-transformer': 0.208.0(@opentelemetry/api@1.9.0)
 
-  '@opentelemetry/otlp-grpc-exporter-base@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/otlp-grpc-exporter-base@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
       '@grpc/grpc-js': 1.14.3
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-exporter-base': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/otlp-transformer': 0.203.0(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/otlp-transformer@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/otlp-transformer@0.203.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
-      protobufjs: 7.5.4
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
+      protobufjs: 7.5.7
 
   '@opentelemetry/otlp-transformer@0.208.0(@opentelemetry/api@1.9.0)':
     dependencies:
@@ -10635,62 +11045,62 @@ snapshots:
       '@opentelemetry/sdk-logs': 0.208.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/sdk-metrics': 2.2.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/sdk-trace-base': 2.2.0(@opentelemetry/api@1.9.0)
-      protobufjs: 7.5.4
+      protobufjs: 7.5.5
 
-  '@opentelemetry/propagator-b3@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/propagator-b3@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/propagator-jaeger@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/propagator-jaeger@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/redis-common@0.38.2': {}
+  '@opentelemetry/redis-common@0.38.3': {}
 
-  '@opentelemetry/resource-detector-alibaba-cloud@0.31.11(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/resource-detector-alibaba-cloud@0.31.11(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/resource-detector-aws@2.13.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/resource-detector-aws@2.17.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
 
-  '@opentelemetry/resource-detector-azure@0.10.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/resource-detector-azure@0.10.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
 
-  '@opentelemetry/resource-detector-container@0.7.11(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/resource-detector-container@0.7.11(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/resource-detector-gcp@0.37.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/resource-detector-gcp@0.37.0(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
       gcp-metadata: 6.1.1
     transitivePeerDependencies:
       - encoding
       - supports-color
 
-  '@opentelemetry/resources@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/resources@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
 
   '@opentelemetry/resources@2.2.0(@opentelemetry/api@1.9.0)':
     dependencies:
@@ -10698,24 +11108,24 @@ snapshots:
       '@opentelemetry/core': 2.2.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/semantic-conventions': 1.39.0
 
-  '@opentelemetry/resources@2.5.1(@opentelemetry/api@1.9.0)':
-    dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
-
   '@opentelemetry/resources@2.6.0(@opentelemetry/api@1.9.0)':
     dependencies:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/core': 2.6.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/semantic-conventions': 1.39.0
 
-  '@opentelemetry/sdk-logs@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/resources@2.7.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
+
+  '@opentelemetry/sdk-logs@0.203.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
 
   '@opentelemetry/sdk-logs@0.208.0(@opentelemetry/api@1.9.0)':
     dependencies:
@@ -10724,11 +11134,11 @@ snapshots:
       '@opentelemetry/core': 2.2.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/resources': 2.2.0(@opentelemetry/api@1.9.0)
 
-  '@opentelemetry/sdk-metrics@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/sdk-metrics@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
 
   '@opentelemetry/sdk-metrics@2.2.0(@opentelemetry/api@1.9.0)':
     dependencies:
@@ -10736,40 +11146,46 @@ snapshots:
       '@opentelemetry/core': 2.2.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/resources': 2.2.0(@opentelemetry/api@1.9.0)
 
-  '@opentelemetry/sdk-node@0.203.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/sdk-metrics@2.7.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
+
+  '@opentelemetry/sdk-node@0.203.0(@opentelemetry/api@1.9.1)':
+    dependencies:
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-logs-otlp-grpc': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-logs-otlp-http': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-logs-otlp-proto': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-metrics-otlp-grpc': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-metrics-otlp-proto': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-prometheus': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-trace-otlp-grpc': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-trace-otlp-http': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-trace-otlp-proto': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/exporter-zipkin': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/propagator-b3': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/propagator-jaeger': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-node': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-logs-otlp-grpc': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-logs-otlp-http': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-logs-otlp-proto': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-metrics-otlp-grpc': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-metrics-otlp-proto': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-prometheus': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-trace-otlp-grpc': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-trace-otlp-http': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-trace-otlp-proto': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/exporter-zipkin': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/instrumentation': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/propagator-b3': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/propagator-jaeger': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-logs': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-metrics': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-node': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
     transitivePeerDependencies:
       - supports-color
 
-  '@opentelemetry/sdk-trace-base@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/sdk-trace-base@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
 
   '@opentelemetry/sdk-trace-base@2.2.0(@opentelemetry/api@1.9.0)':
     dependencies:
@@ -10778,36 +11194,95 @@ snapshots:
       '@opentelemetry/resources': 2.2.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/semantic-conventions': 1.39.0
 
-  '@opentelemetry/sdk-trace-base@2.6.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.6.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.6.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/semantic-conventions': 1.39.0
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/semantic-conventions': 1.40.0
 
-  '@opentelemetry/sdk-trace-node@2.0.1(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/sdk-trace-node@2.0.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/context-async-hooks': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/context-async-hooks': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/core': 2.0.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.0.1(@opentelemetry/api@1.9.1)
 
-  '@opentelemetry/sdk-trace-node@2.6.0(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/sdk-trace-node@2.7.1(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/context-async-hooks': 2.6.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/core': 2.6.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-base': 2.6.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/context-async-hooks': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-base': 2.7.1(@opentelemetry/api@1.9.1)
 
   '@opentelemetry/semantic-conventions@1.39.0': {}
 
-  '@opentelemetry/sql-common@0.41.2(@opentelemetry/api@1.9.0)':
+  '@opentelemetry/semantic-conventions@1.40.0': {}
+
+  '@opentelemetry/sql-common@0.41.2(@opentelemetry/api@1.9.1)':
     dependencies:
-      '@opentelemetry/api': 1.9.0
-      '@opentelemetry/core': 2.5.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/api': 1.9.1
+      '@opentelemetry/core': 2.7.1(@opentelemetry/api@1.9.1)
 
   '@orama/orama@3.1.18': {}
 
+  '@oxfmt/binding-android-arm-eabi@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-android-arm64@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-darwin-arm64@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-darwin-x64@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-freebsd-x64@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-arm-gnueabihf@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-arm-musleabihf@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-arm64-gnu@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-arm64-musl@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-ppc64-gnu@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-riscv64-gnu@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-riscv64-musl@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-s390x-gnu@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-x64-gnu@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-linux-x64-musl@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-openharmony-arm64@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-win32-arm64-msvc@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-win32-ia32-msvc@0.45.0':
+    optional: true
+
+  '@oxfmt/binding-win32-x64-msvc@0.45.0':
+    optional: true
+
   '@parcel/watcher-android-arm64@2.5.6':
     optional: true
 
@@ -10919,7 +11394,7 @@ snapshots:
       jotai-optics: 0.4.0(jotai@2.8.4(@types/react@19.2.14)(react@19.2.4))(optics-ts@2.4.1)
       jotai-x: 2.3.3(@types/react@19.2.14)(jotai@2.8.4(@types/react@19.2.14)(react@19.2.4))(react@19.2.4)
       lodash: 4.17.23
-      nanoid: 5.1.6
+      nanoid: 5.1.7
       optics-ts: 2.4.1
       react: 19.2.4
       react-compiler-runtime: 1.0.0(react@19.2.4)
@@ -11075,6 +11550,10 @@ snapshots:
       - scheduler
       - use-sync-external-store
 
+  '@playwright/test@1.59.1':
+    dependencies:
+      playwright: 1.59.1
+
   '@postgresql-typed/oids@0.2.0': {}
 
   '@posthog/core@1.23.1':
@@ -11096,6 +11575,8 @@ snapshots:
 
   '@protobufjs/codegen@2.0.4': {}
 
+  '@protobufjs/codegen@2.0.5': {}
+
   '@protobufjs/eventemitter@1.1.0': {}
 
   '@protobufjs/fetch@1.1.0':
@@ -11107,21 +11588,25 @@ snapshots:
 
   '@protobufjs/inquire@1.1.0': {}
 
+  '@protobufjs/inquire@1.1.1': {}
+
   '@protobufjs/path@1.1.2': {}
 
   '@protobufjs/pool@1.1.0': {}
 
   '@protobufjs/utf8@1.1.0': {}
 
+  '@protobufjs/utf8@1.1.1': {}
+
   '@radix-ui/number@1.1.1': {}
 
   '@radix-ui/primitive@1.0.0':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
 
   '@radix-ui/primitive@1.0.1':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
 
   '@radix-ui/primitive@1.1.3': {}
 
@@ -11255,12 +11740,12 @@ snapshots:
 
   '@radix-ui/react-compose-refs@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
 
   '@radix-ui/react-compose-refs@1.0.1(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
     optionalDependencies:
       '@types/react': 19.2.14
@@ -11287,12 +11772,12 @@ snapshots:
 
   '@radix-ui/react-context@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
 
   '@radix-ui/react-context@1.0.1(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
     optionalDependencies:
       '@types/react': 19.2.14
@@ -11311,7 +11796,7 @@ snapshots:
 
   '@radix-ui/react-dialog@1.0.0(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/primitive': 1.0.0
       '@radix-ui/react-compose-refs': 1.0.0(react@19.2.4)
       '@radix-ui/react-context': 1.0.0(react@19.2.4)
@@ -11333,7 +11818,7 @@ snapshots:
 
   '@radix-ui/react-dialog@1.0.4(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/primitive': 1.0.1
       '@radix-ui/react-compose-refs': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       '@radix-ui/react-context': 1.0.1(@types/react@19.2.14)(react@19.2.4)
@@ -11384,7 +11869,7 @@ snapshots:
 
   '@radix-ui/react-dismissable-layer@1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/primitive': 1.0.0
       '@radix-ui/react-compose-refs': 1.0.0(react@19.2.4)
       '@radix-ui/react-primitive': 1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
@@ -11395,7 +11880,7 @@ snapshots:
 
   '@radix-ui/react-dismissable-layer@1.0.4(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/primitive': 1.0.1
       '@radix-ui/react-compose-refs': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       '@radix-ui/react-primitive': 1.0.3(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
@@ -11437,12 +11922,12 @@ snapshots:
 
   '@radix-ui/react-focus-guards@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
 
   '@radix-ui/react-focus-guards@1.0.1(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
     optionalDependencies:
       '@types/react': 19.2.14
@@ -11455,7 +11940,7 @@ snapshots:
 
   '@radix-ui/react-focus-scope@1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-compose-refs': 1.0.0(react@19.2.4)
       '@radix-ui/react-primitive': 1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       '@radix-ui/react-use-callback-ref': 1.0.0(react@19.2.4)
@@ -11464,7 +11949,7 @@ snapshots:
 
   '@radix-ui/react-focus-scope@1.0.3(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-compose-refs': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       '@radix-ui/react-primitive': 1.0.3(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       '@radix-ui/react-use-callback-ref': 1.0.1(@types/react@19.2.14)(react@19.2.4)
@@ -11518,13 +12003,13 @@ snapshots:
 
   '@radix-ui/react-id@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-use-layout-effect': 1.0.0(react@19.2.4)
       react: 19.2.4
 
   '@radix-ui/react-id@1.0.1(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-use-layout-effect': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       react: 19.2.4
     optionalDependencies:
@@ -11700,14 +12185,14 @@ snapshots:
 
   '@radix-ui/react-portal@1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-primitive': 1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       react: 19.2.4
       react-dom: 19.2.4(react@19.2.4)
 
   '@radix-ui/react-portal@1.0.3(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-primitive': 1.0.3(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       react: 19.2.4
       react-dom: 19.2.4(react@19.2.4)
@@ -11727,7 +12212,7 @@ snapshots:
 
   '@radix-ui/react-presence@1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-compose-refs': 1.0.0(react@19.2.4)
       '@radix-ui/react-use-layout-effect': 1.0.0(react@19.2.4)
       react: 19.2.4
@@ -11735,7 +12220,7 @@ snapshots:
 
   '@radix-ui/react-presence@1.0.1(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-compose-refs': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       '@radix-ui/react-use-layout-effect': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       react: 19.2.4
@@ -11756,14 +12241,14 @@ snapshots:
 
   '@radix-ui/react-primitive@1.0.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-slot': 1.0.0(react@19.2.4)
       react: 19.2.4
       react-dom: 19.2.4(react@19.2.4)
 
   '@radix-ui/react-primitive@1.0.3(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-slot': 1.0.2(@types/react@19.2.14)(react@19.2.4)
       react: 19.2.4
       react-dom: 19.2.4(react@19.2.4)
@@ -11919,13 +12404,13 @@ snapshots:
 
   '@radix-ui/react-slot@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-compose-refs': 1.0.0(react@19.2.4)
       react: 19.2.4
 
   '@radix-ui/react-slot@1.0.2(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-compose-refs': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       react: 19.2.4
     optionalDependencies:
@@ -12059,12 +12544,12 @@ snapshots:
 
   '@radix-ui/react-use-callback-ref@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
 
   '@radix-ui/react-use-callback-ref@1.0.1(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
     optionalDependencies:
       '@types/react': 19.2.14
@@ -12077,13 +12562,13 @@ snapshots:
 
   '@radix-ui/react-use-controllable-state@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-use-callback-ref': 1.0.0(react@19.2.4)
       react: 19.2.4
 
   '@radix-ui/react-use-controllable-state@1.0.1(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-use-callback-ref': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       react: 19.2.4
     optionalDependencies:
@@ -12106,13 +12591,13 @@ snapshots:
 
   '@radix-ui/react-use-escape-keydown@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-use-callback-ref': 1.0.0(react@19.2.4)
       react: 19.2.4
 
   '@radix-ui/react-use-escape-keydown@1.0.3(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       '@radix-ui/react-use-callback-ref': 1.0.1(@types/react@19.2.14)(react@19.2.4)
       react: 19.2.4
     optionalDependencies:
@@ -12134,12 +12619,12 @@ snapshots:
 
   '@radix-ui/react-use-layout-effect@1.0.0(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
 
   '@radix-ui/react-use-layout-effect@1.0.1(@types/react@19.2.14)(react@19.2.4)':
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       react: 19.2.4
     optionalDependencies:
       '@types/react': 19.2.14
@@ -12222,59 +12707,58 @@ snapshots:
 
   '@rocicorp/resolver@1.0.2': {}
 
-  '@rocicorp/zero-sqlite3@1.0.15':
+  '@rocicorp/zero-sqlite3@1.0.18':
     dependencies:
       bindings: 1.5.0
       prebuild-install: 7.1.3
 
-  '@rocicorp/zero@0.26.2(@opentelemetry/core@2.6.0(@opentelemetry/api@1.9.0))':
+  '@rocicorp/zero@1.4.0(@opentelemetry/core@2.7.1(@opentelemetry/api@1.9.0))':
     dependencies:
       '@badrap/valita': 0.3.11
       '@databases/escape-identifier': 1.0.3
       '@databases/sql': 3.3.0
-      '@dotenvx/dotenvx': 1.57.2
+      '@dotenvx/dotenvx': 1.65.0
       '@drdgvhbh/postgres-error-codes': 0.0.6
       '@fastify/cors': 10.1.0
       '@fastify/websocket': 11.2.0
       '@google-cloud/precise-date': 4.0.0
-      '@opentelemetry/api': 1.9.0
+      '@opentelemetry/api': 1.9.1
       '@opentelemetry/api-logs': 0.203.0
-      '@opentelemetry/auto-instrumentations-node': 0.62.2(@opentelemetry/api@1.9.0)(@opentelemetry/core@2.6.0(@opentelemetry/api@1.9.0))
-      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-metrics': 2.2.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-node': 0.203.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/sdk-trace-node': 2.6.0(@opentelemetry/api@1.9.0)
+      '@opentelemetry/auto-instrumentations-node': 0.62.2(@opentelemetry/api@1.9.1)(@opentelemetry/core@2.7.1(@opentelemetry/api@1.9.0))
+      '@opentelemetry/exporter-metrics-otlp-http': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/resources': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-metrics': 2.7.1(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-node': 0.203.0(@opentelemetry/api@1.9.1)
+      '@opentelemetry/sdk-trace-node': 2.7.1(@opentelemetry/api@1.9.1)
       '@postgresql-typed/oids': 0.2.0
       '@rocicorp/lock': 1.0.4
       '@rocicorp/logger': 5.4.0
       '@rocicorp/resolver': 1.0.2
-      '@rocicorp/zero-sqlite3': 1.0.15
+      '@rocicorp/zero-sqlite3': 1.0.18
       '@standard-schema/spec': 1.1.0
       '@types/basic-auth': 1.1.8
       '@types/ws': 8.18.1
       basic-auth: 2.0.1
-      chalk: 5.6.2
       chalk-template: 1.1.2
       chokidar: 4.0.3
       cloudevents: 10.0.0
       command-line-args: 6.0.2
       command-line-usage: 7.0.4
-      compare-utf8: 0.1.1
-      defu: 6.1.4
+      compare-utf8: 0.2.0
+      defu: 6.1.7
       eventemitter3: 5.0.4
-      fastify: 5.8.4
+      fastify: 5.8.5
       is-in-subnet: 4.0.1
       jose: 5.10.0
       js-xxhash: 4.0.0
       json-custom-numbers: 3.1.1
       kasi: 1.1.2
-      nanoid: 5.1.6
+      nanoid: 5.1.11
+      oxfmt: 0.45.0
       parse-prometheus-text-format: 1.1.1
       pg-format: pg-format-fix@1.0.5
       postgres: 3.4.7
-      prettier: 3.8.1
-      semver: 7.7.4
+      semver: 7.8.0
       tsx: 4.21.0
       url-pattern: 1.0.3
       urlpattern-polyfill: 10.1.0
@@ -12608,10 +13092,10 @@ snapshots:
 
   '@tabler/icons-react@3.37.1(react@19.2.4)':
     dependencies:
-      '@tabler/icons': 3.37.1
+      '@tabler/icons': 3.41.1
       react: 19.2.4
 
-  '@tabler/icons@3.37.1': {}
+  '@tabler/icons@3.41.1': {}
 
   '@tailwindcss/node@4.2.1':
     dependencies:
@@ -12834,7 +13318,7 @@ snapshots:
   '@types/pg@8.15.5':
     dependencies:
       '@types/node': 20.19.33
-      pg-protocol: 1.11.0
+      pg-protocol: 1.13.0
       pg-types: 2.2.0
 
   '@types/pg@8.16.0':
@@ -13080,13 +13564,13 @@ snapshots:
     optionalDependencies:
       react: 19.2.4
 
-  ajv-formats@2.1.1(ajv@8.18.0):
+  ajv-formats@2.1.1(ajv@8.20.0):
     optionalDependencies:
-      ajv: 8.18.0
+      ajv: 8.20.0
 
-  ajv-formats@3.0.1(ajv@8.18.0):
+  ajv-formats@3.0.1(ajv@8.20.0):
     optionalDependencies:
-      ajv: 8.18.0
+      ajv: 8.20.0
 
   ajv@6.14.0:
     dependencies:
@@ -13095,10 +13579,10 @@ snapshots:
       json-schema-traverse: 0.4.1
       uri-js: 4.4.1
 
-  ajv@8.18.0:
+  ajv@8.20.0:
     dependencies:
       fast-deep-equal: 3.1.3
-      fast-uri: 3.1.0
+      fast-uri: 3.1.2
       json-schema-traverse: 1.0.0
       require-from-string: 2.0.2
 
@@ -13384,8 +13868,8 @@ snapshots:
 
   cloudevents@10.0.0:
     dependencies:
-      ajv: 8.18.0
-      ajv-formats: 2.1.1(ajv@8.18.0)
+      ajv: 8.20.0
+      ajv-formats: 2.1.1(ajv@8.20.0)
       json-bigint: 1.0.0
       process: 0.11.10
       util: 0.12.5
@@ -13445,7 +13929,7 @@ snapshots:
 
   commander@8.3.0: {}
 
-  compare-utf8@0.1.1: {}
+  compare-utf8@0.2.0: {}
 
   compute-scroll-into-view@3.1.1: {}
 
@@ -13586,7 +14070,7 @@ snapshots:
       has-property-descriptors: 1.0.2
       object-keys: 1.1.1
 
-  defu@6.1.4: {}
+  defu@6.1.7: {}
 
   dequal@2.0.3: {}
 
@@ -13633,6 +14117,10 @@ snapshots:
     dependencies:
       domelementtype: 2.3.0
 
+  dompurify@3.2.7:
+    optionalDependencies:
+      '@types/trusted-types': 2.0.7
+
   dompurify@3.3.1:
     optionalDependencies:
       '@types/trusted-types': 2.0.7
@@ -13682,7 +14170,7 @@ snapshots:
 
   eciesjs@0.4.18:
     dependencies:
-      '@ecies/ciphers': 0.2.5(@noble/ciphers@1.3.0)
+      '@ecies/ciphers': 0.2.6(@noble/ciphers@1.3.0)
       '@noble/ciphers': 1.3.0
       '@noble/curves': 1.9.7
       '@noble/hashes': 1.8.0
@@ -13700,7 +14188,7 @@ snapshots:
       react: 19.2.4
       react-dom: 19.2.4(react@19.2.4)
       react-easy-sort: 1.8.0(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
-      tailwind-merge: 3.5.0
+      tailwind-merge: 3.6.0
     transitivePeerDependencies:
       - '@types/react'
       - '@types/react-dom'
@@ -14206,12 +14694,12 @@ snapshots:
 
   fast-json-stable-stringify@2.1.0: {}
 
-  fast-json-stringify@6.3.0:
+  fast-json-stringify@6.4.0:
     dependencies:
       '@fastify/merge-json-schemas': 0.2.1
-      ajv: 8.18.0
-      ajv-formats: 3.0.1(ajv@8.18.0)
-      fast-uri: 3.1.0
+      ajv: 8.20.0
+      ajv-formats: 3.0.1(ajv@8.20.0)
+      fast-uri: 3.1.2
       json-schema-ref-resolver: 3.0.0
       rfdc: 1.4.1
 
@@ -14221,11 +14709,11 @@ snapshots:
     dependencies:
       fast-decode-uri-component: 1.0.1
 
-  fast-uri@3.1.0: {}
+  fast-uri@3.1.2: {}
 
   fastify-plugin@5.1.0: {}
 
-  fastify@5.8.4:
+  fastify@5.8.5:
     dependencies:
       '@fastify/ajv-compiler': 4.0.5
       '@fastify/error': 4.2.0
@@ -14233,14 +14721,14 @@ snapshots:
       '@fastify/proxy-addr': 5.1.0
       abstract-logging: 2.0.1
       avvio: 9.2.0
-      fast-json-stringify: 6.3.0
-      find-my-way: 9.5.0
+      fast-json-stringify: 6.4.0
+      find-my-way: 9.6.0
       light-my-request: 6.6.0
       pino: 10.3.1
       process-warning: 5.0.0
       rfdc: 1.4.1
       secure-json-parse: 4.1.0
-      semver: 7.7.4
+      semver: 7.8.0
       toad-cache: 3.7.0
 
   fastq@1.20.1:
@@ -14255,6 +14743,10 @@ snapshots:
     optionalDependencies:
       picomatch: 4.0.3
 
+  fdir@6.5.0(picomatch@4.0.4):
+    optionalDependencies:
+      picomatch: 4.0.4
+
   fflate@0.4.8: {}
 
   file-entry-cache@8.0.0:
@@ -14271,11 +14763,11 @@ snapshots:
     dependencies:
       to-regex-range: 5.0.1
 
-  find-my-way@9.5.0:
+  find-my-way@9.6.0:
     dependencies:
       fast-deep-equal: 3.1.3
       fast-querystring: 1.1.2
-      safe-regex2: 5.1.0
+      safe-regex2: 5.1.1
 
   find-replace@5.0.2: {}
 
@@ -14316,10 +14808,13 @@ snapshots:
 
   fs-constants@1.0.0: {}
 
+  fsevents@2.3.2:
+    optional: true
+
   fsevents@2.3.3:
     optional: true
 
-  fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6):
+  fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6):
     dependencies:
       '@formatjs/intl-localematcher': 0.8.1
       '@orama/orama': 3.1.18
@@ -14351,21 +14846,21 @@ snapshots:
       '@types/mdast': 4.0.4
       '@types/react': 19.2.14
       lucide-react: 0.577.0(react@19.2.4)
-      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       react: 19.2.4
       react-dom: 19.2.4(react@19.2.4)
       zod: 4.3.6
     transitivePeerDependencies:
       - supports-color
 
-  fumadocs-mdx@14.2.8(@types/mdast@4.0.4)(@types/mdx@2.0.13)(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(vite@7.3.1(@types/node@20.19.33)(jiti@2.6.1)(lightningcss@1.31.1)(tsx@4.21.0)):
+  fumadocs-mdx@14.2.8(@types/mdast@4.0.4)(@types/mdx@2.0.13)(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(vite@7.3.1(@types/node@20.19.33)(jiti@2.6.1)(lightningcss@1.31.1)(tsx@4.21.0)):
     dependencies:
       '@mdx-js/mdx': 3.1.1
       '@standard-schema/spec': 1.1.0
       chokidar: 5.0.0
       esbuild: 0.27.3
       estree-util-value-to-estree: 3.5.0
-      fumadocs-core: 16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6)
+      fumadocs-core: 16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6)
       js-yaml: 4.1.1
       mdast-util-mdx: 3.0.0
       mdast-util-to-markdown: 2.1.2
@@ -14382,13 +14877,13 @@ snapshots:
       '@types/mdast': 4.0.4
       '@types/mdx': 2.0.13
       '@types/react': 19.2.14
-      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       react: 19.2.4
       vite: 7.3.1(@types/node@20.19.33)(jiti@2.6.1)(lightningcss@1.31.1)(tsx@4.21.0)
     transitivePeerDependencies:
       - supports-color
 
-  fumadocs-ui@16.6.5(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(tailwindcss@4.2.1):
+  fumadocs-ui@16.6.5(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(fumadocs-core@16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(tailwindcss@4.2.1):
     dependencies:
       '@fumadocs/tailwind': 0.0.2(tailwindcss@4.2.1)
       '@radix-ui/react-accordion': 1.2.12(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
@@ -14402,7 +14897,7 @@ snapshots:
       '@radix-ui/react-slot': 1.2.4(@types/react@19.2.14)(react@19.2.4)
       '@radix-ui/react-tabs': 1.1.13(@types/react-dom@19.2.3(@types/react@19.2.14))(@types/react@19.2.14)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       class-variance-authority: 0.7.1
-      fumadocs-core: 16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6)
+      fumadocs-core: 16.6.5(@mdx-js/mdx@3.1.1)(@types/estree-jsx@1.0.5)(@types/hast@3.0.4)(@types/mdast@4.0.4)(@types/react@19.2.14)(lucide-react@0.577.0(react@19.2.4))(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(zod@4.3.6)
       lucide-react: 0.570.0(react@19.2.4)
       motion: 12.34.3(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       next-themes: 0.4.6(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
@@ -14416,7 +14911,7 @@ snapshots:
       unist-util-visit: 5.1.0
     optionalDependencies:
       '@types/react': 19.2.14
-      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
     transitivePeerDependencies:
       - '@emotion/is-prop-valid'
       - '@types/react-dom'
@@ -14435,6 +14930,8 @@ snapshots:
 
   functions-have-names@1.2.3: {}
 
+  fuzzy-search@3.2.1: {}
+
   gaxios@6.7.1:
     dependencies:
       extend: 3.0.2
@@ -14455,9 +14952,9 @@ snapshots:
       - encoding
       - supports-color
 
-  geist@1.7.0(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)):
+  geist@1.7.0(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)):
     dependencies:
-      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
 
   generator-function@2.0.1: {}
 
@@ -14544,6 +15041,10 @@ snapshots:
     dependencies:
       function-bind: 1.1.2
 
+  hasown@2.0.3:
+    dependencies:
+      function-bind: 1.1.2
+
   hast-util-from-dom@5.0.1:
     dependencies:
       '@types/hast': 3.0.4
@@ -14797,7 +15298,7 @@ snapshots:
       '@formatjs/icu-messageformat-parser': 3.5.1
       tslib: 2.8.1
 
-  ipaddr.js@2.3.0: {}
+  ipaddr.js@2.4.0: {}
 
   is-alphabetical@1.0.4: {}
 
@@ -14853,6 +15354,10 @@ snapshots:
     dependencies:
       hasown: 2.0.2
 
+  is-core-module@2.16.2:
+    dependencies:
+      hasown: 2.0.3
+
   is-data-view@1.0.2:
     dependencies:
       call-bound: 1.0.4
@@ -15210,6 +15715,8 @@ snapshots:
 
   markdown-table@3.0.4: {}
 
+  marked@14.0.0: {}
+
   marked@15.0.12: {}
 
   marked@17.0.3: {}
@@ -15705,6 +16212,11 @@ snapshots:
 
   module-details-from-path@1.0.4: {}
 
+  monaco-editor@0.55.1:
+    dependencies:
+      dompurify: 3.2.7
+      marked: 14.0.0
+
   motion-dom@12.34.3:
     dependencies:
       motion-utils: 12.29.2
@@ -15725,7 +16237,7 @@ snapshots:
 
   nanoid@3.3.11: {}
 
-  nanoid@5.1.6: {}
+  nanoid@5.1.11: {}
 
   nanoid@5.1.7: {}
 
@@ -15739,14 +16251,14 @@ snapshots:
 
   next-intl-swc-plugin-extractor@4.8.3: {}
 
-  next-intl@4.8.3(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(typescript@5.9.3):
+  next-intl@4.8.3(next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(react@19.2.4)(typescript@5.9.3):
     dependencies:
       '@formatjs/intl-localematcher': 0.8.1
       '@parcel/watcher': 2.5.6
       '@swc/core': 1.15.13
       icu-minify: 4.8.3
       negotiator: 1.0.0
-      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
+      next: 16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
       next-intl-swc-plugin-extractor: 4.8.3
       po-parser: 2.1.1
       react: 19.2.4
@@ -15761,7 +16273,7 @@ snapshots:
       react: 19.2.4
       react-dom: 19.2.4(react@19.2.4)
 
-  next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4):
+  next@16.1.6(@babel/core@7.29.0)(@opentelemetry/api@1.9.0)(@playwright/test@1.59.1)(react-dom@19.2.4(react@19.2.4))(react@19.2.4):
     dependencies:
       '@next/env': 16.1.6
       '@swc/helpers': 0.5.15
@@ -15781,6 +16293,7 @@ snapshots:
       '@next/swc-win32-arm64-msvc': 16.1.6
       '@next/swc-win32-x64-msvc': 16.1.6
       '@opentelemetry/api': 1.9.0
+      '@playwright/test': 1.59.1
       sharp: 0.34.5
     transitivePeerDependencies:
       - '@babel/core'
@@ -15791,9 +16304,9 @@ snapshots:
       lower-case: 2.0.2
       tslib: 2.8.1
 
-  node-abi@3.89.0:
+  node-abi@3.92.0:
     dependencies:
-      semver: 7.7.4
+      semver: 7.8.0
 
   node-addon-api@7.1.1: {}
 
@@ -15808,6 +16321,9 @@ snapshots:
     dependencies:
       whatwg-url: 5.0.0
 
+  node-readable-to-web-readable-stream@0.4.2:
+    optional: true
+
   node-releases@2.0.27: {}
 
   npm-run-path@4.0.1:
@@ -15910,6 +16426,30 @@ snapshots:
       object-keys: 1.1.1
       safe-push-apply: 1.0.0
 
+  oxfmt@0.45.0:
+    dependencies:
+      tinypool: 2.1.0
+    optionalDependencies:
+      '@oxfmt/binding-android-arm-eabi': 0.45.0
+      '@oxfmt/binding-android-arm64': 0.45.0
+      '@oxfmt/binding-darwin-arm64': 0.45.0
+      '@oxfmt/binding-darwin-x64': 0.45.0
+      '@oxfmt/binding-freebsd-x64': 0.45.0
+      '@oxfmt/binding-linux-arm-gnueabihf': 0.45.0
+      '@oxfmt/binding-linux-arm-musleabihf': 0.45.0
+      '@oxfmt/binding-linux-arm64-gnu': 0.45.0
+      '@oxfmt/binding-linux-arm64-musl': 0.45.0
+      '@oxfmt/binding-linux-ppc64-gnu': 0.45.0
+      '@oxfmt/binding-linux-riscv64-gnu': 0.45.0
+      '@oxfmt/binding-linux-riscv64-musl': 0.45.0
+      '@oxfmt/binding-linux-s390x-gnu': 0.45.0
+      '@oxfmt/binding-linux-x64-gnu': 0.45.0
+      '@oxfmt/binding-linux-x64-musl': 0.45.0
+      '@oxfmt/binding-openharmony-arm64': 0.45.0
+      '@oxfmt/binding-win32-arm64-msvc': 0.45.0
+      '@oxfmt/binding-win32-ia32-msvc': 0.45.0
+      '@oxfmt/binding-win32-x64-msvc': 0.45.0
+
   p-limit@3.1.0:
     dependencies:
       yocto-queue: 0.1.0
@@ -15970,6 +16510,11 @@ snapshots:
 
   path-type@4.0.0: {}
 
+  pdfjs-dist@5.6.205:
+    optionalDependencies:
+      '@napi-rs/canvas': 0.1.97
+      node-readable-to-web-readable-stream: 0.4.2
+
   performance-now@2.1.0: {}
 
   pg-cloudflare@1.3.0:
@@ -15987,6 +16532,8 @@ snapshots:
 
   pg-protocol@1.11.0: {}
 
+  pg-protocol@1.13.0: {}
+
   pg-types@2.2.0:
     dependencies:
       pg-int8: 1.0.1
@@ -16015,6 +16562,8 @@ snapshots:
 
   picomatch@4.0.3: {}
 
+  picomatch@4.0.4: {}
+
   pino-abstract-transport@3.0.0:
     dependencies:
       split2: 4.2.0
@@ -16033,7 +16582,7 @@ snapshots:
       real-require: 0.2.0
       safe-stable-stringify: 2.5.0
       sonic-boom: 4.2.1
-      thread-stream: 4.0.0
+      thread-stream: 4.1.0
 
   platejs@52.0.17(@types/react@19.2.14)(immer@10.2.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(scheduler@0.27.0)(use-sync-external-store@1.6.0(react@19.2.4)):
     dependencies:
@@ -16053,6 +16602,14 @@ snapshots:
       - scheduler
       - use-sync-external-store
 
+  playwright-core@1.59.1: {}
+
+  playwright@1.59.1:
+    dependencies:
+      playwright-core: 1.59.1
+    optionalDependencies:
+      fsevents: 2.3.2
+
   po-parser@2.1.1: {}
 
   possible-typed-array-names@1.1.0: {}
@@ -16098,7 +16655,7 @@ snapshots:
       '@opentelemetry/api': 1.9.0
       '@opentelemetry/api-logs': 0.208.0
       '@opentelemetry/exporter-logs-otlp-http': 0.208.0(@opentelemetry/api@1.9.0)
-      '@opentelemetry/resources': 2.5.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/resources': 2.6.0(@opentelemetry/api@1.9.0)
       '@opentelemetry/sdk-logs': 0.208.0(@opentelemetry/api@1.9.0)
       '@posthog/core': 1.23.1
       '@posthog/types': 1.352.1
@@ -16130,7 +16687,7 @@ snapshots:
       minimist: 1.2.8
       mkdirp-classic: 0.5.3
       napi-build-utils: 2.0.0
-      node-abi: 3.89.0
+      node-abi: 3.92.0
       pump: 3.0.4
       rc: 1.2.8
       simple-get: 4.0.1
@@ -16139,8 +16696,6 @@ snapshots:
 
   prelude-ls@1.2.1: {}
 
-  prettier@3.8.1: {}
-
   prismjs@1.27.0: {}
 
   prismjs@1.30.0: {}
@@ -16165,7 +16720,7 @@ snapshots:
 
   property-information@7.1.0: {}
 
-  protobufjs@7.5.4:
+  protobufjs@7.5.5:
     dependencies:
       '@protobufjs/aspromise': 1.1.2
       '@protobufjs/base64': 1.1.2
@@ -16180,6 +16735,21 @@ snapshots:
       '@types/node': 20.19.33
       long: 5.3.2
 
+  protobufjs@7.5.7:
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+      '@protobufjs/base64': 1.1.2
+      '@protobufjs/codegen': 2.0.5
+      '@protobufjs/eventemitter': 1.1.0
+      '@protobufjs/fetch': 1.1.0
+      '@protobufjs/float': 1.0.2
+      '@protobufjs/inquire': 1.1.1
+      '@protobufjs/path': 1.1.2
+      '@protobufjs/pool': 1.1.0
+      '@protobufjs/utf8': 1.1.1
+      '@types/node': 20.19.33
+      long: 5.3.2
+
   proxy-compare@2.6.0: {}
 
   pump@3.0.4:
@@ -16402,7 +16972,7 @@ snapshots:
 
   react-syntax-highlighter@15.6.6(react@19.2.4):
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
       highlight.js: 10.7.3
       highlightjs-vue: 1.0.0
       lowlight: 1.20.0
@@ -16447,7 +17017,7 @@ snapshots:
   readable-stream@3.6.2:
     dependencies:
       inherits: 2.0.4
-      string_decoder: 1.1.1
+      string_decoder: 1.3.0
       util-deprecate: 1.0.2
 
   readdirp@4.1.2: {}
@@ -16456,6 +17026,8 @@ snapshots:
 
   real-require@0.2.0: {}
 
+  real-require@1.0.0: {}
+
   recma-build-jsx@1.0.0:
     dependencies:
       '@types/estree': 1.0.8
@@ -16487,7 +17059,7 @@ snapshots:
 
   redux@4.2.1:
     dependencies:
-      '@babel/runtime': 7.28.6
+      '@babel/runtime': 7.29.2
 
   reflect.getprototypeof@1.0.10:
     dependencies:
@@ -16653,7 +17225,7 @@ snapshots:
     dependencies:
       debug: 4.4.3
       module-details-from-path: 1.0.4
-      resolve: 1.22.11
+      resolve: 1.22.12
     transitivePeerDependencies:
       - supports-color
 
@@ -16667,6 +17239,13 @@ snapshots:
       path-parse: 1.0.7
       supports-preserve-symlinks-flag: 1.0.0
 
+  resolve@1.22.12:
+    dependencies:
+      es-errors: 1.3.0
+      is-core-module: 2.16.2
+      path-parse: 1.0.7
+      supports-preserve-symlinks-flag: 1.0.0
+
   resolve@2.0.0-next.6:
     dependencies:
       es-errors: 1.3.0
@@ -16727,6 +17306,8 @@ snapshots:
 
   safe-buffer@5.1.2: {}
 
+  safe-buffer@5.2.1: {}
+
   safe-push-apply@1.0.0:
     dependencies:
       es-errors: 1.3.0
@@ -16738,7 +17319,7 @@ snapshots:
       es-errors: 1.3.0
       is-regex: 1.2.1
 
-  safe-regex2@5.1.0:
+  safe-regex2@5.1.1:
     dependencies:
       ret: 0.5.0
 
@@ -16758,6 +17339,8 @@ snapshots:
 
   semver@7.7.4: {}
 
+  semver@7.8.0: {}
+
   server-only@0.0.1: {}
 
   set-cookie-parser@2.7.2: {}
@@ -16950,6 +17533,8 @@ snapshots:
 
   stable-hash@0.0.5: {}
 
+  state-local@1.0.7: {}
+
   stop-iteration-iterator@1.1.0:
     dependencies:
       es-errors: 1.3.0
@@ -17041,6 +17626,10 @@ snapshots:
     dependencies:
       safe-buffer: 5.1.2
 
+  string_decoder@1.3.0:
+    dependencies:
+      safe-buffer: 5.2.1
+
   stringify-entities@4.0.4:
     dependencies:
       character-entities-html4: 2.1.0
@@ -17106,6 +17695,8 @@ snapshots:
 
   tailwind-merge@3.5.0: {}
 
+  tailwind-merge@3.6.0: {}
+
   tailwind-scrollbar-hide@4.0.0(tailwindcss@4.2.1):
     dependencies:
       tailwindcss: 4.2.1
@@ -17137,9 +17728,9 @@ snapshots:
     dependencies:
       utrie: 1.0.2
 
-  thread-stream@4.0.0:
+  thread-stream@4.1.0:
     dependencies:
-      real-require: 0.2.0
+      real-require: 1.0.0
 
   throttleit@2.1.0: {}
 
@@ -17156,6 +17747,8 @@ snapshots:
       fdir: 6.5.0(picomatch@4.0.3)
       picomatch: 4.0.3
 
+  tinypool@2.1.0: {}
+
   to-regex-range@5.0.1:
     dependencies:
       is-number: 7.0.0
@@ -17198,7 +17791,7 @@ snapshots:
 
   tunnel-agent@0.6.0:
     dependencies:
-      safe-buffer: 5.1.2
+      safe-buffer: 5.2.1
 
   type-check@0.4.0:
     dependencies:
@@ -17565,6 +18158,12 @@ snapshots:
 
   yocto-queue@0.1.0: {}
 
+  yocto-spinner@1.2.0:
+    dependencies:
+      yoctocolors: 2.1.2
+
+  yoctocolors@2.1.2: {}
+
   zod-to-json-schema@3.25.1(zod@4.3.6):
     dependencies:
       zod: 4.3.6
diff --git a/surfsense_web/pnpm-workspace.yaml b/surfsense_web/pnpm-workspace.yaml
new file mode 100644
index 0000000000..b33ba7657d
--- /dev/null
+++ b/surfsense_web/pnpm-workspace.yaml
@@ -0,0 +1,14 @@
+packages:
+  - "."
+
+allowBuilds:
+  "@parcel/watcher": true
+  "@rocicorp/zero-sqlite3": true
+  "@swc/core": true
+  core-js: true
+  esbuild: true
+  protobufjs: true
+  sharp: true
+  unrs-resolver: true
+
+minimumReleaseAge: 10080
\ No newline at end of file
diff --git a/surfsense_web/public/homepage/hero_tutorial/extreme_assist.gif b/surfsense_web/public/homepage/hero_tutorial/screenshot_assist.gif
similarity index 100%
rename from surfsense_web/public/homepage/hero_tutorial/extreme_assist.gif
rename to surfsense_web/public/homepage/hero_tutorial/screenshot_assist.gif
diff --git a/surfsense_web/public/homepage/hero_tutorial/extreme_assist.mp4 b/surfsense_web/public/homepage/hero_tutorial/screenshot_assist.mp4
similarity index 100%
rename from surfsense_web/public/homepage/hero_tutorial/extreme_assist.mp4
rename to surfsense_web/public/homepage/hero_tutorial/screenshot_assist.mp4
diff --git a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png
new file mode 100644
index 0000000000..68ef7934eb
Binary files /dev/null and b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png differ
diff --git a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-02-architecture-diagram.png b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-02-architecture-diagram.png
new file mode 100644
index 0000000000..13bf60204d
Binary files /dev/null and b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-02-architecture-diagram.png differ
diff --git a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-dark.png b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-dark.png
new file mode 100644
index 0000000000..ebe443d6dd
Binary files /dev/null and b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-dark.png differ
diff --git a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-light.png b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-light.png
new file mode 100644
index 0000000000..187e514f4c
Binary files /dev/null and b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-light.png differ
diff --git a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-dark.png b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-dark.png
new file mode 100644
index 0000000000..79717ffcb6
Binary files /dev/null and b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-dark.png differ
diff --git a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-light.png b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-light.png
new file mode 100644
index 0000000000..0d6943a9cd
Binary files /dev/null and b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-light.png differ
diff --git a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-05-decision-tree.png b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-05-decision-tree.png
new file mode 100644
index 0000000000..3c083e6dda
Binary files /dev/null and b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-05-decision-tree.png differ
diff --git a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png
new file mode 100644
index 0000000000..b7e4c88802
Binary files /dev/null and b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png differ
diff --git a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-02-three-tier-pyramid.png b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-02-three-tier-pyramid.png
new file mode 100644
index 0000000000..3bf0be0b8f
Binary files /dev/null and b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-02-three-tier-pyramid.png differ
diff --git a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-03-what-providers-log-conceptual.png b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-03-what-providers-log-conceptual.png
new file mode 100644
index 0000000000..33c4e24e5d
Binary files /dev/null and b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-03-what-providers-log-conceptual.png differ
diff --git a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-04-decision-tree-by-use-case.png b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-04-decision-tree-by-use-case.png
new file mode 100644
index 0000000000..ce1917068a
Binary files /dev/null and b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-04-decision-tree-by-use-case.png differ
diff --git a/surfsense_web/public/logo.png b/surfsense_web/public/logo.png
new file mode 100644
index 0000000000..dbda551dae
Binary files /dev/null and b/surfsense_web/public/logo.png differ
diff --git a/surfsense_web/public/og-image-twitter.png b/surfsense_web/public/og-image-twitter.png
new file mode 100644
index 0000000000..0de947ab56
Binary files /dev/null and b/surfsense_web/public/og-image-twitter.png differ
diff --git a/surfsense_web/public/og-image.png b/surfsense_web/public/og-image.png
index b28e33e0f2..0de947ab56 100644
Binary files a/surfsense_web/public/og-image.png and b/surfsense_web/public/og-image.png differ
diff --git a/surfsense_web/source.config.ts b/surfsense_web/source.config.ts
index c10ce3d909..7ba4b89b57 100644
--- a/surfsense_web/source.config.ts
+++ b/surfsense_web/source.config.ts
@@ -16,6 +16,25 @@ export const changelog = defineDocs({
 	},
 });
 
+export const blog = defineDocs({
+	dir: "blog/content",
+	docs: {
+		schema: frontmatterSchema.extend({
+			date: z.string(),
+			image: z.string().optional(),
+			author: z.string().default("SurfSense Team"),
+			authorAvatar: z.string().optional(),
+			tags: z.array(z.string()).optional(),
+			// Pin this post into the featured section above the archive grid.
+			// Multiple posts can be featured at once; ordering within the
+			// featured section follows `featured_order` ascending and falls
+			// back to `date` descending.
+			featured: z.boolean().optional().default(false),
+			featured_order: z.number().optional(),
+		}),
+	},
+});
+
 export default defineConfig({
 	lastModifiedTime: "git",
 	mdxOptions: {
diff --git a/surfsense_web/tests/README.md b/surfsense_web/tests/README.md
new file mode 100644
index 0000000000..89aab1f9ba
--- /dev/null
+++ b/surfsense_web/tests/README.md
@@ -0,0 +1,154 @@
+# Playwright E2E Suite
+
+End-to-end tests for the full SurfSense stack (Next.js + FastAPI +
+Celery + Postgres + Redis). Designed to scale from one connector
+(Composio Drive in Phase 1) to every connector + manual file upload
+without rewriting the harness.
+
+## How the deterministic harness works
+
+There are **three layers of defense** against accidental real-world
+calls. None of them touch production code.
+
+1. `surfsense_backend/tests/e2e/run_backend.py` and `run_celery.py` are
+   separate entrypoints (not used by `python main.py`). They hijack
+   `sys.modules["composio"]` BEFORE importing the app, swap in strict
+   fakes for `langchain_litellm`/`langchain_openai`, and mount the
+   `X-E2E-Scenario` middleware.
+2. The fakes themselves are **strict**: every class implements
+   `__getattr__` that raises `NotImplementedError` on unknown surface.
+   Adding a new SDK call site without updating the fake fails CI loudly.
+3. CI sets `HTTPS_PROXY=http://127.0.0.1:1` plus sentinel API keys
+   (`COMPOSIO_API_KEY=e2e-deny-real-call-sentinel`). Any leaked outbound
+   HTTP call fails before reaching the network.
+
+## Running locally
+
+The recommended flow runs only Postgres and Redis in Docker, and the backend
++ Celery worker on the host. The E2E entrypoints `setdefault` every backend
+variable they need, so no `.env` file is required on a fresh checkout.
+
+### One-time setup
+
+From `surfsense_web/`:
+
+```bash
+pnpm install
+pnpm exec playwright install --with-deps chromium
+```
+
+### Each run
+
+**1. Bring up Postgres + Redis** from the repo root:
+
+```bash
+docker compose -f docker/docker-compose.deps-only.yml up -d db redis
+```
+
+**2. Start the backend** in `surfsense_backend/`, terminal A:
+
+```bash
+uv sync
+uv run alembic upgrade head
+uv run python tests/e2e/run_backend.py
+```
+
+**3. Start the Celery worker** in `surfsense_backend/`, terminal B:
+
+```bash
+uv run python tests/e2e/run_celery.py
+```
+
+**4. Register the Playwright user**:
+
+```bash
+curl -X POST http://localhost:8000/auth/register \
+  -H "Content-Type: application/json" \
+  -d '{"email":"e2e-test@surfsense.net","password":"E2eTestPassword123!"}'
+```
+
+**5. Run Playwright** from `surfsense_web/`, terminal C:
+
+```bash
+pnpm test:e2e             # dev server (fast iteration)
+pnpm test:e2e:headed      # show the browser
+pnpm test:e2e:ui          # Playwright UI mode
+pnpm test:e2e:debug       # Playwright Inspector
+pnpm test:e2e:prod        # build + start (matches CI exactly)
+pnpm test:e2e:report      # open the last HTML report
+```
+
+`playwright.config.ts` and the backend run scripts share defaults, so the
+above works without exporting any env vars. Override
+`PLAYWRIGHT_TEST_EMAIL`, `PLAYWRIGHT_TEST_PASSWORD`, or
+`NEXT_PUBLIC_FASTAPI_BACKEND_URL` only when pointing tests at a different
+stack.
+
+To debug a single journey:
+
+```bash
+pnpm test:e2e:headed connectors/composio/drive/journey.spec.ts
+```
+
+### Hermetic alternative (matches CI)
+
+To reproduce the CI environment exactly: backend and Celery in containers
+with L3 egress denied, replace steps 1–3 with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
+```
+
+Then run steps 4 (curl register) and 5 (`pnpm test:e2e:prod`) as above. Tear
+down with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans
+```
+
+This builds the ~9 GB e2e backend image, so the deps-only flow is faster for
+day-to-day work.
+
+## Adding a new connector
+
+The directory tree is designed so a new connector lives mostly inside
+its own folder. E2E is scoped to **one user expectation per connector**:
+the smallest browser journey that proves the user-visible outcome works.
+Follow this checklist:
+
+1. **Backend fake.** Add a new file under
+   `surfsense_backend/tests/e2e/fakes/<sdk>_module.py` mirroring
+   `composio_module.py`. Use `__getattr__` to raise on unknown surface.
+2. **Hijack.** Wire the new module into `run_backend.py` and
+   `run_celery.py` with `sys.modules["<sdk>"] = <fake>`.
+3. **Backend tests.** Put edge cases in backend tests, not Playwright:
+   OAuth state validation in unit tests, and route/error branches in
+   `surfsense_backend/tests/integration/<connector>/`.
+4. **Fixtures.** Drop a fixture file into `tests/fixtures/connectors/`
+   that returns a pre-connected connector row.
+5. **Journey spec.** Create exactly one
+   `tests/connectors/<vendor>/<service>/journey.spec.ts` for the user
+   expectation. For indexable connectors this usually means
+   connect -> select scope -> index -> assert canary content. For
+   connection-only connectors this means connect -> assert connected badge.
+6. **Update this README's directory diagram.**
+
+Do not add separate Playwright specs for expired OAuth state, duplicate
+connectors, auth-expired classification, or route config persistence.
+Those belong in backend unit/integration tests such as
+`surfsense_backend/tests/unit/utils/test_oauth_security.py` and
+`surfsense_backend/tests/integration/composio/`.
+
+## Why API-driven?
+
+Journey specs prefer a thin browser assertion followed by API-driven
+configuration/indexing because:
+
+- It keeps tests **deterministic** (no waiting on UI animation,
+  React hydration, or Next.js compile time).
+- It exercises the **same backend code path** the UI eventually calls.
+- The expensive E2E assertion stays focused on what only E2E can prove:
+  the cross-process seam from connector -> Celery -> indexing -> DB.
+
+UI-only tests live under `helpers/ui/` for future Phase 2 work
+(folder-tree drag-and-drop, indexing options switches, etc.).
diff --git a/surfsense_web/tests/auth.setup.ts b/surfsense_web/tests/auth.setup.ts
new file mode 100644
index 0000000000..a33a81b3c8
--- /dev/null
+++ b/surfsense_web/tests/auth.setup.ts
@@ -0,0 +1,32 @@
+import path from "node:path";
+import { expect, test as setup } from "@playwright/test";
+import { acquireTestToken } from "./helpers/api/auth";
+
+/**
+ * One-time authentication setup. Acquires a bearer token for the seeded
+ * e2e user (rate-limit-free /__e2e__/auth/token first, /auth/jwt/login
+ * fallback) and persists it via localStorage so every test in the
+ * chromium project starts already authenticated.
+ */
+
+const authFile = path.join(__dirname, "..", "playwright", ".auth", "user.json");
+
+const STORAGE_KEY = "surfsense_bearer_token";
+
+setup("authenticate", async ({ page, request }) => {
+	const access_token = await acquireTestToken(request);
+	expect(access_token, "Failed to acquire e2e bearer token").toBeTruthy();
+
+	await page.addInitScript(
+		({ key, token }) => {
+			localStorage.setItem(key, token);
+		},
+		{ key: STORAGE_KEY, token: access_token }
+	);
+
+	// Use a public page so the init script can write localStorage without
+	// racing the dashboard auth redirect.
+	await page.goto("/login", { waitUntil: "domcontentloaded" });
+
+	await page.context().storageState({ path: authFile });
+});
diff --git a/surfsense_web/tests/connectors/clickup/journey.spec.ts b/surfsense_web/tests/connectors/clickup/journey.spec.ts
new file mode 100644
index 0000000000..5d40b96090
--- /dev/null
+++ b/surfsense_web/tests/connectors/clickup/journey.spec.ts
@@ -0,0 +1,85 @@
+import { expect, clickupWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndexExpectDisabled } from "../../helpers/api/connectors";
+import { listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_CLICKUP_TASKS } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+
+/**
+ * Proves ClickUp MCP OAuth -> live MCP tool discovery/call -> chat.
+ *
+ * ClickUp is live-tool only: the public indexing route returns
+ * indexing_started=false and chat should call ClickUp MCP tools.
+ */
+test.describe("ClickUp connector journey", () => {
+	test("user connects ClickUp and chats through live MCP tools with indexing disabled", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		clickupConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		expect(clickupConnector.connector_type).toBe("CLICKUP_CONNECTOR");
+		expect(clickupConnector.is_indexable).toBe(false);
+		expect(clickupConnector.config._token_encrypted).toBe(true);
+		expect(clickupConnector.config.mcp_service).toBe("clickup");
+		expect(clickupConnector.config.server_config).toMatchObject({
+			transport: "streamable-http",
+			url: "https://mcp.clickup.com/mcp",
+		});
+		expect(clickupConnector.config.mcp_oauth).toMatchObject({
+			client_id: "fake-clickup-mcp-client-id",
+			token_endpoint: "https://mcp.clickup.com/token",
+		});
+		expect(
+			(clickupConnector.config.mcp_oauth as Record<string, unknown>).access_token
+		).toBeTruthy();
+		expect(clickupConnector.config.access_token).toBeUndefined();
+		expect(clickupConnector.config.refresh_token).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+		await expect(connectorDialog.getByText("ClickUp")).toBeVisible();
+
+		const beforeDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeDocs).toHaveLength(0);
+
+		const disabledIndex = await triggerIndexExpectDisabled(
+			request,
+			apiToken,
+			clickupConnector.id,
+			searchSpace.id
+		);
+		expect(disabledIndex.message ?? "").toContain("real-time agent tools");
+		expect(disabledIndex.message ?? "").toContain("background indexing is disabled");
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my ClickUp task titled "${FAKE_CLICKUP_TASKS.canary.name}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface ClickUp canary token from live MCP tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.clickupCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("clickup_search");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === clickupConnector.id);
+		expect(refreshed?.connector_type).toBe("CLICKUP_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(false);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/connectors/composio/calendar/journey.spec.ts b/surfsense_web/tests/connectors/composio/calendar/journey.spec.ts
new file mode 100644
index 0000000000..66749349e5
--- /dev/null
+++ b/surfsense_web/tests/connectors/composio/calendar/journey.spec.ts
@@ -0,0 +1,58 @@
+import { expect, composioCalendarWithChatTest as test } from "../../../fixtures";
+import { streamChatToCompletion } from "../../../helpers/api/chat";
+import { listConnectors } from "../../../helpers/api/connectors";
+import { listDocuments } from "../../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_CALENDAR_EVENTS } from "../../../helpers/canary";
+import { openConnectorPopup } from "../../../helpers/ui/connector-popup";
+
+/**
+ * Proves the Calendar wiring from OAuth fixture -> live Calendar tools -> chat.
+ *
+ * Composio Calendar is a live connector: it should not index documents into
+ * the knowledge base. Chat should call Calendar tools against the connected
+ * Composio account when the user asks about events.
+ */
+test.describe("Composio Calendar journey", () => {
+	test("user connects Calendar and chats through live Calendar tools with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		composioCalendarConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const beforeChatDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeChatDocs).toHaveLength(0);
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query:
+				`What Calendar event mentions ${CANARY_TOKENS.calendarCanary} next week? ` +
+				`The event title is "${FAKE_CALENDAR_EVENTS.canary.summary}".`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface Calendar canary token from live tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.calendarCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("search_calendar_events");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === composioCalendarConnector.id);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterChatDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterChatDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/connectors/composio/drive/README.md b/surfsense_web/tests/connectors/composio/drive/README.md
new file mode 100644
index 0000000000..3e5d6f5c56
--- /dev/null
+++ b/surfsense_web/tests/connectors/composio/drive/README.md
@@ -0,0 +1,56 @@
+# Composio Google Drive — E2E
+
+Phase 1 Playwright coverage for the Composio Drive connector.
+
+## Journey in this folder
+
+| File | User expectation |
+| --- | --- |
+| `journey.spec.ts` | "I connect Google Drive, choose a file, wait for indexing, and SurfSense contains that file's content." |
+
+## What "passes" actually proves
+
+- The dashboard and connector dialog render for the authenticated user.
+- The Composio Drive connector fixture can complete the happy OAuth setup.
+- The selected Drive file config can be persisted.
+- Pipeline service summarizes/embeds/chunks an indexed file end-to-end.
+- Celery worker is reachable from the FastAPI process (queue + broker).
+- `Document.content` contains the Drive canary token after indexing.
+
+## Edge cases tested elsewhere
+
+Playwright does not own backend edge cases. They are cheaper and easier
+to localize in pytest:
+
+- OAuth state freshness/tamper/malformed validation:
+  `surfsense_backend/tests/unit/utils/test_oauth_security.py`
+- OAuth denied callback and duplicate/reconnection branch:
+  `surfsense_backend/tests/integration/composio/test_oauth_callback.py`
+- Folder listing, selected file config persistence, and auth-expired
+  classification:
+  `surfsense_backend/tests/integration/composio/test_drive_folders_route.py`
+
+## What "passes" does NOT prove
+
+- Real Composio.dev integration (mocked).
+- Real LLM summarization quality (`FakeListChatModel`).
+- Real embedding semantics (constant 0.1 vectors).
+
+These are intentional. Phase 1's deal is "the user-visible Drive
+journey crosses the connector/indexing seams". Phase 2 can add opt-in
+"live LLM" smoke tests under a separate workflow and a separate budget.
+
+## Adding a fourth Composio toolkit (e.g. Slack)
+
+1. Add fixture data to
+   `surfsense_backend/tests/e2e/fakes/fixtures/<toolkit>_*.json`.
+2. Extend `_Tools.execute()` in
+   `surfsense_backend/tests/e2e/fakes/composio_module.py` to handle the
+   new toolkit's tool slugs (`SLACK_FETCH_CONVERSATIONS`, etc.).
+3. Add the toolkit to `_AuthConfigs.list()`.
+4. Drop a sibling folder `tests/connectors/composio/<toolkit>/` with
+   one `journey.spec.ts` that matches the user's expectation for that
+   toolkit.
+
+The fixtures in `tests/fixtures/connectors/composio-drive.fixture.ts`
+are the template — copy + change `toolkit_id`.
diff --git a/surfsense_web/tests/connectors/composio/drive/journey.spec.ts b/surfsense_web/tests/connectors/composio/drive/journey.spec.ts
new file mode 100644
index 0000000000..195c8b8e87
--- /dev/null
+++ b/surfsense_web/tests/connectors/composio/drive/journey.spec.ts
@@ -0,0 +1,140 @@
+import { expect, composioDriveWithChatTest as test } from "../../../fixtures";
+import { streamChatToCompletion } from "../../../helpers/api/chat";
+import {
+	listConnectors,
+	triggerIndex,
+	updateConnectorConfig,
+} from "../../../helpers/api/connectors";
+import { getEditorContent, listDocuments } from "../../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_DRIVE_FILES } from "../../../helpers/canary";
+import { openConnectorPopup } from "../../../helpers/ui/connector-popup";
+import { waitForDocumentByTitle, waitForIndexingComplete } from "../../../helpers/waits/indexing";
+
+/**
+ * Proves the Drive wiring from OAuth fixture -> selection persistence ->
+ * indexing -> stored source_markdown -> editor-content retrieval -> chat.
+ *
+ * UI-driven file selection and LLM/embedding quality are
+ * covered by later phases or narrower tests.
+ */
+test.describe("Composio Drive journey", () => {
+	test("user connects Drive, selects a file, indexes it, and chats with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		composioDriveConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(240_000); // worker cold-start + Docling + summarize + embed + chunk
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const selectedFiles = [
+			{
+				id: FAKE_DRIVE_FILES.canary.id,
+				name: FAKE_DRIVE_FILES.canary.name,
+				mimeType: FAKE_DRIVE_FILES.canary.mimeType,
+			},
+			{
+				id: FAKE_DRIVE_FILES.pdfComposio.id,
+				name: FAKE_DRIVE_FILES.pdfComposio.name,
+				mimeType: FAKE_DRIVE_FILES.pdfComposio.mimeType,
+			},
+		];
+		const indexingOptions = {
+			max_files_per_folder: 10,
+			incremental_sync: false,
+			include_subfolders: false,
+		};
+
+		await updateConnectorConfig(request, apiToken, composioDriveConnector.id, {
+			...composioDriveConnector.config,
+			selected_folders: [],
+			selected_files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await triggerIndex(request, apiToken, composioDriveConnector.id, searchSpace.id, {
+			files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await waitForIndexingComplete(request, apiToken, composioDriveConnector.id, searchSpace.id, {
+			timeoutMs: 240_000,
+			intervalMs: 1_500,
+			minDocuments: 2,
+		});
+
+		await waitForDocumentByTitle(request, apiToken, searchSpace.id, FAKE_DRIVE_FILES.canary.name, {
+			timeoutMs: 30_000,
+		});
+		await waitForDocumentByTitle(
+			request,
+			apiToken,
+			searchSpace.id,
+			FAKE_DRIVE_FILES.pdfComposio.name,
+			{ timeoutMs: 60_000 }
+		);
+
+		const docs = await listDocuments(request, apiToken, searchSpace.id);
+		const canaryDoc = docs.find((d) => d.title === FAKE_DRIVE_FILES.canary.name);
+		const pdfDoc = docs.find((d) => d.title === FAKE_DRIVE_FILES.pdfComposio.name);
+
+		expect(canaryDoc, "canary document must exist after indexing").toBeDefined();
+		if (!canaryDoc) throw new Error("unreachable: canaryDoc asserted defined above");
+		expect(canaryDoc.document_type).toBe("GOOGLE_DRIVE_FILE");
+		expect(pdfDoc, "Composio Drive PDF document must exist after indexing").toBeDefined();
+		if (!pdfDoc) throw new Error("unreachable: pdfDoc asserted defined above");
+		expect(pdfDoc.document_type).toBe("GOOGLE_DRIVE_FILE");
+
+		// content holds the LLM summary; the raw file body lives in source_markdown.
+		// editor-content is the same endpoint the UI hits when opening a document.
+		const editor = await getEditorContent(request, apiToken, searchSpace.id, canaryDoc.id);
+		expect(
+			editor.source_markdown,
+			`canary token ${CANARY_TOKENS.driveCanaryFile} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${editor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.driveCanaryFile);
+		expect(editor.document_type).toBe("GOOGLE_DRIVE_FILE");
+		expect(editor.chunk_count).toBeGreaterThan(0);
+
+		const pdfEditor = await getEditorContent(request, apiToken, searchSpace.id, pdfDoc.id);
+		expect(
+			pdfEditor.source_markdown,
+			`PDF canary token ${CANARY_TOKENS.composioDrivePdfCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${pdfEditor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.composioDrivePdfCanary);
+		expect(pdfEditor.document_type).toBe("GOOGLE_DRIVE_FILE");
+		expect(pdfEditor.chunk_count).toBeGreaterThan(0);
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === composioDriveConnector.id);
+		expect(refreshed?.last_indexed_at).not.toBeNull();
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-canary.txt Drive file?",
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface canary token after indexing; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.driveCanaryFile);
+
+		const pdfChat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-composio-canary.pdf Drive file?",
+		});
+		expect(
+			pdfChat.assistantText,
+			`chat agent should surface Composio Drive PDF canary token after indexing; got: ${pdfChat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.composioDrivePdfCanary);
+	});
+});
diff --git a/surfsense_web/tests/connectors/composio/gmail/journey.spec.ts b/surfsense_web/tests/connectors/composio/gmail/journey.spec.ts
new file mode 100644
index 0000000000..fbca52e58f
--- /dev/null
+++ b/surfsense_web/tests/connectors/composio/gmail/journey.spec.ts
@@ -0,0 +1,57 @@
+import { expect, composioGmailWithChatTest as test } from "../../../fixtures";
+import { streamChatToCompletion } from "../../../helpers/api/chat";
+import { listConnectors } from "../../../helpers/api/connectors";
+import { listDocuments } from "../../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_GMAIL_MESSAGES } from "../../../helpers/canary";
+import { openConnectorPopup } from "../../../helpers/ui/connector-popup";
+
+/**
+ * Proves the Gmail wiring from OAuth fixture -> live Gmail tools -> chat.
+ *
+ * Gmail is a live connector: it should not index documents into the
+ * knowledge base. Chat should call Gmail tools against the connected
+ * Composio account when the user asks about email.
+ */
+test.describe("Composio Gmail journey", () => {
+	test("user connects Gmail and chats through live Gmail tools with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		composioGmailConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const beforeChatDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeChatDocs).toHaveLength(0);
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my Gmail message titled "${FAKE_GMAIL_MESSAGES.canary.subject}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface Gmail canary token from live tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.gmailCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("search_gmail");
+		expect(eventText).toContain("read_gmail_email");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === composioGmailConnector.id);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterChatDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterChatDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/connectors/confluence/journey.spec.ts b/surfsense_web/tests/connectors/confluence/journey.spec.ts
new file mode 100644
index 0000000000..bcd33095d4
--- /dev/null
+++ b/surfsense_web/tests/connectors/confluence/journey.spec.ts
@@ -0,0 +1,97 @@
+import { expect, confluenceWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndex } from "../../helpers/api/connectors";
+import { getEditorContent, listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_CONFLUENCE_PAGES } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+import { waitForDocumentByTitle, waitForIndexingComplete } from "../../helpers/waits/indexing";
+
+/**
+ * Proves Confluence OAuth -> indexed Confluence pages -> stored source_markdown -> chat.
+ *
+ * The external Atlassian provider is faked at the OAuth token/resource and
+ * Confluence page-fetch boundaries; SurfSense's own add/callback routes,
+ * encrypted config storage, indexing endpoint, indexing pipeline, and chat
+ * retrieval remain real.
+ */
+test.describe("Confluence connector journey", () => {
+	test("user connects Confluence through OAuth, indexes a page, and chats with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		confluenceConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(180_000); // worker cold-start + summarize + embed + chunk
+
+		expect(confluenceConnector.connector_type).toBe("CONFLUENCE_CONNECTOR");
+		expect(confluenceConnector.is_indexable).toBe(true);
+		expect(confluenceConnector.config._token_encrypted).toBe(true);
+		expect(confluenceConnector.config.cloud_id).toBe(FAKE_CONFLUENCE_PAGES.canary.cloudId);
+		expect(confluenceConnector.config.base_url).toBe(FAKE_CONFLUENCE_PAGES.canary.baseUrl);
+		expect(confluenceConnector.config.access_token).toBeTruthy();
+		expect(confluenceConnector.config.CONFLUENCE_BASE_URL).toBeUndefined();
+		expect(confluenceConnector.config.CONFLUENCE_EMAIL).toBeUndefined();
+		expect(confluenceConnector.config.CONFLUENCE_API_TOKEN).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+		await connectorDialog.getByPlaceholder("Search").fill("Confluence");
+		await expect(connectorDialog.getByText("Confluence", { exact: true })).toBeVisible();
+
+		await triggerIndex(request, apiToken, confluenceConnector.id, searchSpace.id, {});
+
+		await waitForIndexingComplete(request, apiToken, confluenceConnector.id, searchSpace.id, {
+			timeoutMs: 150_000,
+			intervalMs: 1_500,
+			minDocuments: 1,
+		});
+
+		await waitForDocumentByTitle(
+			request,
+			apiToken,
+			searchSpace.id,
+			FAKE_CONFLUENCE_PAGES.canary.title,
+			{
+				timeoutMs: 30_000,
+			}
+		);
+
+		const docs = await listDocuments(request, apiToken, searchSpace.id);
+		const canaryDoc = docs.find((d) => d.title === FAKE_CONFLUENCE_PAGES.canary.title);
+
+		expect(canaryDoc, "Confluence canary document must exist after indexing").toBeDefined();
+		if (!canaryDoc) throw new Error("unreachable: canaryDoc asserted defined above");
+		expect(canaryDoc.document_type).toBe("CONFLUENCE_CONNECTOR");
+
+		const editor = await getEditorContent(request, apiToken, searchSpace.id, canaryDoc.id);
+		expect(
+			editor.source_markdown,
+			`canary token ${CANARY_TOKENS.confluenceCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${editor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.confluenceCanary);
+		expect(editor.document_type).toBe("CONFLUENCE_CONNECTOR");
+		expect(editor.chunk_count).toBeGreaterThan(0);
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === confluenceConnector.id);
+		expect(refreshed?.connector_type).toBe("CONFLUENCE_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(true);
+		expect(refreshed?.last_indexed_at).not.toBeNull();
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my Confluence page titled "${FAKE_CONFLUENCE_PAGES.canary.title}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface Confluence canary token after indexing; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.confluenceCanary);
+	});
+});
diff --git a/surfsense_web/tests/connectors/dropbox/journey.spec.ts b/surfsense_web/tests/connectors/dropbox/journey.spec.ts
new file mode 100644
index 0000000000..27616b2b90
--- /dev/null
+++ b/surfsense_web/tests/connectors/dropbox/journey.spec.ts
@@ -0,0 +1,145 @@
+import { expect, nativeDropboxWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndex, updateConnectorConfig } from "../../helpers/api/connectors";
+import { getEditorContent, listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_DROPBOX_FILES } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+import { waitForDocumentByTitle, waitForIndexingComplete } from "../../helpers/waits/indexing";
+
+/**
+ * Proves the native Dropbox wiring from Dropbox OAuth fixture -> selection
+ * persistence -> Dropbox API indexing -> stored source_markdown -> chat.
+ *
+ * The folder picker is intentionally bypassed; this journey exercises the
+ * persisted config and indexing contract the picker ultimately feeds.
+ */
+test.describe("Native Dropbox journey", () => {
+	test("user connects Dropbox, selects a file, indexes it, and chats with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		nativeDropboxConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(240_000); // worker cold-start + Docling + summarize + embed + chunk
+
+		expect(nativeDropboxConnector.connector_type).toBe("DROPBOX_CONNECTOR");
+		expect(nativeDropboxConnector.is_indexable).toBe(true);
+		expect(nativeDropboxConnector.config._token_encrypted).toBe(true);
+		expect(nativeDropboxConnector.config.composio_connected_account_id).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const selectedFiles = [
+			{
+				id: FAKE_DROPBOX_FILES.canary.id,
+				name: FAKE_DROPBOX_FILES.canary.name,
+				mimeType: FAKE_DROPBOX_FILES.canary.mimeType,
+			},
+			{
+				id: FAKE_DROPBOX_FILES.pdf.id,
+				name: FAKE_DROPBOX_FILES.pdf.name,
+				mimeType: FAKE_DROPBOX_FILES.pdf.mimeType,
+			},
+		];
+		// Keep the shared Drive-style body shape. Dropbox currently defaults
+		// these internally because the indexer expects max_files/use_delta_sync.
+		const indexingOptions = {
+			max_files_per_folder: 10,
+			incremental_sync: false,
+			include_subfolders: false,
+		};
+
+		await updateConnectorConfig(request, apiToken, nativeDropboxConnector.id, {
+			...nativeDropboxConnector.config,
+			selected_folders: [],
+			selected_files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await triggerIndex(request, apiToken, nativeDropboxConnector.id, searchSpace.id, {
+			files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await waitForIndexingComplete(request, apiToken, nativeDropboxConnector.id, searchSpace.id, {
+			timeoutMs: 240_000,
+			intervalMs: 1_500,
+			minDocuments: 2,
+		});
+
+		await waitForDocumentByTitle(
+			request,
+			apiToken,
+			searchSpace.id,
+			FAKE_DROPBOX_FILES.canary.name,
+			{ timeoutMs: 30_000 }
+		);
+		await waitForDocumentByTitle(request, apiToken, searchSpace.id, FAKE_DROPBOX_FILES.pdf.name, {
+			timeoutMs: 60_000,
+		});
+
+		const docs = await listDocuments(request, apiToken, searchSpace.id);
+		const canaryDoc = docs.find((d) => d.title === FAKE_DROPBOX_FILES.canary.name);
+		const pdfDoc = docs.find((d) => d.title === FAKE_DROPBOX_FILES.pdf.name);
+
+		expect(canaryDoc, "Dropbox canary document must exist after indexing").toBeDefined();
+		if (!canaryDoc) throw new Error("unreachable: canaryDoc asserted defined above");
+		expect(canaryDoc.document_type).toBe("DROPBOX_FILE");
+		expect(pdfDoc, "Dropbox PDF document must exist after indexing").toBeDefined();
+		if (!pdfDoc) throw new Error("unreachable: pdfDoc asserted defined above");
+		expect(pdfDoc.document_type).toBe("DROPBOX_FILE");
+
+		const editor = await getEditorContent(request, apiToken, searchSpace.id, canaryDoc.id);
+		expect(
+			editor.source_markdown,
+			`canary token ${CANARY_TOKENS.dropboxCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${editor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.dropboxCanary);
+		expect(editor.document_type).toBe("DROPBOX_FILE");
+		expect(editor.chunk_count).toBeGreaterThan(0);
+
+		const pdfEditor = await getEditorContent(request, apiToken, searchSpace.id, pdfDoc.id);
+		expect(
+			pdfEditor.source_markdown,
+			`PDF canary token ${CANARY_TOKENS.dropboxPdfCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${pdfEditor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.dropboxPdfCanary);
+		expect(pdfEditor.document_type).toBe("DROPBOX_FILE");
+		expect(pdfEditor.chunk_count).toBeGreaterThan(0);
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === nativeDropboxConnector.id);
+		expect(refreshed?.connector_type).toBe("DROPBOX_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(true);
+		expect(refreshed?.last_indexed_at).not.toBeNull();
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-dropbox-canary.txt Dropbox file?",
+		});
+		expect(
+			chat.assistantText,
+			"chat agent should surface Dropbox canary token after indexing; " +
+				`got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.dropboxCanary);
+
+		const pdfChat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-dropbox-canary.pdf Dropbox file?",
+		});
+		expect(
+			pdfChat.assistantText,
+			"chat agent should surface Dropbox PDF canary token after indexing; " +
+				`got: ${pdfChat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.dropboxPdfCanary);
+	});
+});
diff --git a/surfsense_web/tests/connectors/google/calendar/journey.spec.ts b/surfsense_web/tests/connectors/google/calendar/journey.spec.ts
new file mode 100644
index 0000000000..94cba01550
--- /dev/null
+++ b/surfsense_web/tests/connectors/google/calendar/journey.spec.ts
@@ -0,0 +1,74 @@
+import { expect, nativeCalendarWithChatTest as test } from "../../../fixtures";
+import { streamChatToCompletion } from "../../../helpers/api/chat";
+import { listConnectors, triggerIndexExpectDisabled } from "../../../helpers/api/connectors";
+import { listDocuments } from "../../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_CALENDAR_EVENTS } from "../../../helpers/canary";
+import { openConnectorPopup } from "../../../helpers/ui/connector-popup";
+
+/**
+ * Proves the native Calendar wiring from Google OAuth fixture -> live Calendar
+ * tools -> chat.
+ *
+ * Native Calendar is live-tool only: the public indexing route returns
+ * indexing_started=false and chat should call Calendar tools.
+ */
+test.describe("Native Google Calendar journey", () => {
+	test("user connects native Calendar and chats through live Calendar tools with indexing disabled", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		nativeCalendarConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		expect(nativeCalendarConnector.connector_type).toBe("GOOGLE_CALENDAR_CONNECTOR");
+		expect(nativeCalendarConnector.is_indexable).toBe(false);
+		expect(nativeCalendarConnector.config._token_encrypted).toBe(true);
+		expect(nativeCalendarConnector.config.composio_connected_account_id).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const beforeDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeDocs).toHaveLength(0);
+
+		const disabledIndex = await triggerIndexExpectDisabled(
+			request,
+			apiToken,
+			nativeCalendarConnector.id,
+			searchSpace.id
+		);
+		expect(disabledIndex.message ?? "").toContain("real-time agent tools");
+		expect(disabledIndex.message ?? "").toContain("background indexing is disabled");
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query:
+				`What Calendar event mentions ${CANARY_TOKENS.calendarCanary} next week? ` +
+				`The event title is "${FAKE_CALENDAR_EVENTS.canary.summary}".`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface native Calendar canary token from live tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.calendarCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("search_calendar_events");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === nativeCalendarConnector.id);
+		expect(refreshed?.connector_type).toBe("GOOGLE_CALENDAR_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(false);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/connectors/google/drive/journey.spec.ts b/surfsense_web/tests/connectors/google/drive/journey.spec.ts
new file mode 100644
index 0000000000..258939c36a
--- /dev/null
+++ b/surfsense_web/tests/connectors/google/drive/journey.spec.ts
@@ -0,0 +1,145 @@
+import { expect, nativeDriveWithChatTest as test } from "../../../fixtures";
+import { streamChatToCompletion } from "../../../helpers/api/chat";
+import {
+	listConnectors,
+	triggerIndex,
+	updateConnectorConfig,
+} from "../../../helpers/api/connectors";
+import { getEditorContent, listDocuments } from "../../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_DRIVE_FILES } from "../../../helpers/canary";
+import { openConnectorPopup } from "../../../helpers/ui/connector-popup";
+import { waitForDocumentByTitle, waitForIndexingComplete } from "../../../helpers/waits/indexing";
+
+/**
+ * Proves the native Drive wiring from Google OAuth fixture -> selection
+ * persistence -> native Drive API indexing -> stored source_markdown -> chat.
+ *
+ * Google Picker is intentionally bypassed; this journey exercises the
+ * persisted config and indexing contract the Picker ultimately feeds.
+ */
+test.describe("Native Google Drive journey", () => {
+	test("user connects native Drive, selects a file, indexes it, and chats with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		nativeDriveConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(240_000); // worker cold-start + Docling + summarize + embed + chunk
+
+		expect(nativeDriveConnector.connector_type).toBe("GOOGLE_DRIVE_CONNECTOR");
+		expect(nativeDriveConnector.is_indexable).toBe(true);
+		expect(nativeDriveConnector.config._token_encrypted).toBe(true);
+		expect(nativeDriveConnector.config.composio_connected_account_id).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const selectedFiles = [
+			{
+				id: FAKE_DRIVE_FILES.canary.id,
+				name: FAKE_DRIVE_FILES.canary.name,
+				mimeType: FAKE_DRIVE_FILES.canary.mimeType,
+			},
+			{
+				id: FAKE_DRIVE_FILES.pdfNative.id,
+				name: FAKE_DRIVE_FILES.pdfNative.name,
+				mimeType: FAKE_DRIVE_FILES.pdfNative.mimeType,
+			},
+		];
+		const indexingOptions = {
+			max_files_per_folder: 10,
+			incremental_sync: false,
+			include_subfolders: false,
+		};
+
+		await updateConnectorConfig(request, apiToken, nativeDriveConnector.id, {
+			...nativeDriveConnector.config,
+			selected_folders: [],
+			selected_files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await triggerIndex(request, apiToken, nativeDriveConnector.id, searchSpace.id, {
+			files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await waitForIndexingComplete(request, apiToken, nativeDriveConnector.id, searchSpace.id, {
+			timeoutMs: 240_000,
+			intervalMs: 1_500,
+			minDocuments: 2,
+		});
+
+		await waitForDocumentByTitle(request, apiToken, searchSpace.id, FAKE_DRIVE_FILES.canary.name, {
+			timeoutMs: 30_000,
+		});
+		await waitForDocumentByTitle(
+			request,
+			apiToken,
+			searchSpace.id,
+			FAKE_DRIVE_FILES.pdfNative.name,
+			{ timeoutMs: 60_000 }
+		);
+
+		const docs = await listDocuments(request, apiToken, searchSpace.id);
+		const canaryDoc = docs.find((d) => d.title === FAKE_DRIVE_FILES.canary.name);
+		const pdfDoc = docs.find((d) => d.title === FAKE_DRIVE_FILES.pdfNative.name);
+
+		expect(canaryDoc, "native Drive canary document must exist after indexing").toBeDefined();
+		if (!canaryDoc) throw new Error("unreachable: canaryDoc asserted defined above");
+		expect(canaryDoc.document_type).toBe("GOOGLE_DRIVE_FILE");
+		expect(pdfDoc, "native Drive PDF document must exist after indexing").toBeDefined();
+		if (!pdfDoc) throw new Error("unreachable: pdfDoc asserted defined above");
+		expect(pdfDoc.document_type).toBe("GOOGLE_DRIVE_FILE");
+
+		const editor = await getEditorContent(request, apiToken, searchSpace.id, canaryDoc.id);
+		expect(
+			editor.source_markdown,
+			`canary token ${CANARY_TOKENS.driveCanaryFile} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${editor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.driveCanaryFile);
+		expect(editor.document_type).toBe("GOOGLE_DRIVE_FILE");
+		expect(editor.chunk_count).toBeGreaterThan(0);
+
+		const pdfEditor = await getEditorContent(request, apiToken, searchSpace.id, pdfDoc.id);
+		expect(
+			pdfEditor.source_markdown,
+			`PDF canary token ${CANARY_TOKENS.drivePdfCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${pdfEditor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.drivePdfCanary);
+		expect(pdfEditor.document_type).toBe("GOOGLE_DRIVE_FILE");
+		expect(pdfEditor.chunk_count).toBeGreaterThan(0);
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === nativeDriveConnector.id);
+		expect(refreshed?.connector_type).toBe("GOOGLE_DRIVE_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(true);
+		expect(refreshed?.last_indexed_at).not.toBeNull();
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-canary.txt native Drive file?",
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface native Drive canary token after indexing; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.driveCanaryFile);
+
+		const pdfChat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-canary.pdf native Drive file?",
+		});
+		expect(
+			pdfChat.assistantText,
+			`chat agent should surface native Drive PDF canary token after indexing; got: ${pdfChat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.drivePdfCanary);
+	});
+});
diff --git a/surfsense_web/tests/connectors/google/gmail/journey.spec.ts b/surfsense_web/tests/connectors/google/gmail/journey.spec.ts
new file mode 100644
index 0000000000..6f61f6b821
--- /dev/null
+++ b/surfsense_web/tests/connectors/google/gmail/journey.spec.ts
@@ -0,0 +1,73 @@
+import { expect, nativeGmailWithChatTest as test } from "../../../fixtures";
+import { streamChatToCompletion } from "../../../helpers/api/chat";
+import { listConnectors, triggerIndexExpectDisabled } from "../../../helpers/api/connectors";
+import { listDocuments } from "../../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_GMAIL_MESSAGES } from "../../../helpers/canary";
+import { openConnectorPopup } from "../../../helpers/ui/connector-popup";
+
+/**
+ * Proves the native Gmail wiring from Google OAuth fixture -> live Gmail
+ * tools -> chat.
+ *
+ * Native Gmail is currently live-tool only: the public indexing route
+ * returns indexing_started=false and chat should call Gmail tools.
+ */
+test.describe("Native Google Gmail journey", () => {
+	test("user connects native Gmail and chats through live Gmail tools with indexing disabled", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		nativeGmailConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		expect(nativeGmailConnector.connector_type).toBe("GOOGLE_GMAIL_CONNECTOR");
+		expect(nativeGmailConnector.is_indexable).toBe(false);
+		expect(nativeGmailConnector.config._token_encrypted).toBe(true);
+		expect(nativeGmailConnector.config.composio_connected_account_id).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const beforeDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeDocs).toHaveLength(0);
+
+		const disabledIndex = await triggerIndexExpectDisabled(
+			request,
+			apiToken,
+			nativeGmailConnector.id,
+			searchSpace.id
+		);
+		expect(disabledIndex.message ?? "").toContain("real-time agent tools");
+		expect(disabledIndex.message ?? "").toContain("background indexing is disabled");
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my Gmail message titled "${FAKE_GMAIL_MESSAGES.canary.subject}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface native Gmail canary token from live tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.gmailCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("search_gmail");
+		expect(eventText).toContain("read_gmail_email");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === nativeGmailConnector.id);
+		expect(refreshed?.connector_type).toBe("GOOGLE_GMAIL_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(false);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/connectors/jira/journey.spec.ts b/surfsense_web/tests/connectors/jira/journey.spec.ts
new file mode 100644
index 0000000000..c62d9d41e5
--- /dev/null
+++ b/surfsense_web/tests/connectors/jira/journey.spec.ts
@@ -0,0 +1,84 @@
+import { expect, jiraWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndexExpectDisabled } from "../../helpers/api/connectors";
+import { listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_JIRA_ISSUES } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+
+/**
+ * Proves Jira MCP OAuth -> live MCP tool discovery/call -> chat.
+ *
+ * Jira is live-tool only: the public indexing route returns
+ * indexing_started=false and chat should call Jira MCP tools.
+ */
+test.describe("Jira connector journey", () => {
+	test("user connects Jira and chats through live MCP tools with indexing disabled", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		jiraConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		expect(jiraConnector.connector_type).toBe("JIRA_CONNECTOR");
+		expect(jiraConnector.is_indexable).toBe(false);
+		expect(jiraConnector.config._token_encrypted).toBe(true);
+		expect(jiraConnector.config.mcp_service).toBe("jira");
+		expect(jiraConnector.config.server_config).toMatchObject({
+			transport: "streamable-http",
+			url: "https://mcp.atlassian.com/v1/mcp",
+		});
+		expect(jiraConnector.config.mcp_oauth).toMatchObject({
+			client_id: "fake-jira-mcp-client-id",
+			token_endpoint: "https://cf.mcp.atlassian.com/v1/token",
+		});
+		expect((jiraConnector.config.mcp_oauth as Record<string, unknown>).access_token).toBeTruthy();
+		expect(jiraConnector.config.access_token).toBeUndefined();
+		expect(jiraConnector.config.refresh_token).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+		await connectorDialog.getByPlaceholder("Search").fill("Jira");
+		await expect(connectorDialog.getByText("Jira", { exact: true })).toBeVisible();
+
+		const beforeDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeDocs).toHaveLength(0);
+
+		const disabledIndex = await triggerIndexExpectDisabled(
+			request,
+			apiToken,
+			jiraConnector.id,
+			searchSpace.id
+		);
+		expect(disabledIndex.message ?? "").toContain("real-time agent tools");
+		expect(disabledIndex.message ?? "").toContain("background indexing is disabled");
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my Jira issue titled "${FAKE_JIRA_ISSUES.canary.summary}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface Jira canary token from live MCP tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.jiraCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("searchJiraIssuesUsingJql");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === jiraConnector.id);
+		expect(refreshed?.connector_type).toBe("JIRA_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(false);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/connectors/linear/journey.spec.ts b/surfsense_web/tests/connectors/linear/journey.spec.ts
new file mode 100644
index 0000000000..a1ca7fbc87
--- /dev/null
+++ b/surfsense_web/tests/connectors/linear/journey.spec.ts
@@ -0,0 +1,82 @@
+import { expect, linearWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndexExpectDisabled } from "../../helpers/api/connectors";
+import { listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_LINEAR_ISSUES } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+
+/**
+ * Proves Linear MCP OAuth -> live MCP tool discovery/call -> chat.
+ *
+ * Linear is live-tool only: the public indexing route returns
+ * indexing_started=false and chat should call Linear MCP tools.
+ */
+test.describe("Linear connector journey", () => {
+	test("user connects Linear and chats through live MCP tools with indexing disabled", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		linearConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		expect(linearConnector.connector_type).toBe("LINEAR_CONNECTOR");
+		expect(linearConnector.is_indexable).toBe(false);
+		expect(linearConnector.config._token_encrypted).toBe(true);
+		expect(linearConnector.config.mcp_service).toBe("linear");
+		expect(linearConnector.config.server_config).toMatchObject({
+			transport: "streamable-http",
+			url: "https://mcp.linear.app/mcp",
+		});
+		expect(linearConnector.config.mcp_oauth).toMatchObject({
+			client_id: "fake-linear-mcp-client-id",
+			token_endpoint: "https://mcp.linear.app/token",
+		});
+		expect((linearConnector.config.mcp_oauth as Record<string, unknown>).access_token).toBeTruthy();
+		expect(linearConnector.config.access_token).toBeUndefined();
+		expect(linearConnector.config.refresh_token).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const beforeDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeDocs).toHaveLength(0);
+
+		const disabledIndex = await triggerIndexExpectDisabled(
+			request,
+			apiToken,
+			linearConnector.id,
+			searchSpace.id
+		);
+		expect(disabledIndex.message ?? "").toContain("real-time agent tools");
+		expect(disabledIndex.message ?? "").toContain("background indexing is disabled");
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my Linear issue titled "${FAKE_LINEAR_ISSUES.canary.title}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface Linear canary token from live MCP tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.linearCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("list_issues");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === linearConnector.id);
+		expect(refreshed?.connector_type).toBe("LINEAR_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(false);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/connectors/notion/journey.spec.ts b/surfsense_web/tests/connectors/notion/journey.spec.ts
new file mode 100644
index 0000000000..948353011e
--- /dev/null
+++ b/surfsense_web/tests/connectors/notion/journey.spec.ts
@@ -0,0 +1,93 @@
+import { expect, notionWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndex } from "../../helpers/api/connectors";
+import { getEditorContent, listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_NOTION_PAGES } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+import { waitForDocumentByTitle, waitForIndexingComplete } from "../../helpers/waits/indexing";
+
+/**
+ * Proves Notion OAuth -> indexed Notion API pages -> stored source_markdown -> chat.
+ *
+ * The external Notion provider is faked at the OAuth token/API boundary;
+ * SurfSense's own add/callback routes, encrypted config storage, connector
+ * indexing endpoint, indexing pipeline, and chat retrieval remain real.
+ */
+test.describe("Notion connector journey", () => {
+	test("user connects Notion through OAuth, indexes a page, and chats with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		notionConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(180_000); // worker cold-start + summarize + embed + chunk
+
+		expect(notionConnector.connector_type).toBe("NOTION_CONNECTOR");
+		expect(notionConnector.is_indexable).toBe(true);
+		expect(notionConnector.config._token_encrypted).toBe(true);
+		expect(notionConnector.config.NOTION_INTEGRATION_TOKEN).toBeUndefined();
+		expect(notionConnector.config.access_token).toBeTruthy();
+		expect(notionConnector.config.workspace_id).toBe(FAKE_NOTION_PAGES.canary.workspaceId);
+		expect(notionConnector.config.workspace_name).toBe(FAKE_NOTION_PAGES.canary.workspaceName);
+		expect(notionConnector.config.bot_id).toBe(FAKE_NOTION_PAGES.canary.botId);
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		await triggerIndex(request, apiToken, notionConnector.id, searchSpace.id, {});
+
+		await waitForIndexingComplete(request, apiToken, notionConnector.id, searchSpace.id, {
+			timeoutMs: 150_000,
+			intervalMs: 1_500,
+			minDocuments: 1,
+		});
+
+		await waitForDocumentByTitle(
+			request,
+			apiToken,
+			searchSpace.id,
+			FAKE_NOTION_PAGES.canary.title,
+			{
+				timeoutMs: 30_000,
+			}
+		);
+
+		const docs = await listDocuments(request, apiToken, searchSpace.id);
+		const canaryDoc = docs.find((d) => d.title === FAKE_NOTION_PAGES.canary.title);
+
+		expect(canaryDoc, "Notion canary document must exist after indexing").toBeDefined();
+		if (!canaryDoc) throw new Error("unreachable: canaryDoc asserted defined above");
+		expect(canaryDoc.document_type).toBe("NOTION_CONNECTOR");
+
+		const editor = await getEditorContent(request, apiToken, searchSpace.id, canaryDoc.id);
+		expect(
+			editor.source_markdown,
+			`canary token ${CANARY_TOKENS.notionCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${editor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.notionCanary);
+		expect(editor.document_type).toBe("NOTION_CONNECTOR");
+		expect(editor.chunk_count).toBeGreaterThan(0);
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === notionConnector.id);
+		expect(refreshed?.connector_type).toBe("NOTION_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(true);
+		expect(refreshed?.last_indexed_at).not.toBeNull();
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my Notion page titled "${FAKE_NOTION_PAGES.canary.title}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface Notion canary token after indexing; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.notionCanary);
+	});
+});
diff --git a/surfsense_web/tests/connectors/onedrive/journey.spec.ts b/surfsense_web/tests/connectors/onedrive/journey.spec.ts
new file mode 100644
index 0000000000..5ec0414159
--- /dev/null
+++ b/surfsense_web/tests/connectors/onedrive/journey.spec.ts
@@ -0,0 +1,145 @@
+import { expect, nativeOneDriveWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndex, updateConnectorConfig } from "../../helpers/api/connectors";
+import { getEditorContent, listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_ONEDRIVE_FILES } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+import { waitForDocumentByTitle, waitForIndexingComplete } from "../../helpers/waits/indexing";
+
+/**
+ * Proves the native OneDrive wiring from Microsoft OAuth fixture -> selection
+ * persistence -> OneDrive Graph API indexing -> stored source_markdown -> chat.
+ *
+ * The folder picker is intentionally bypassed; this journey exercises the
+ * persisted config and indexing contract the picker ultimately feeds.
+ */
+test.describe("Native OneDrive journey", () => {
+	test("user connects OneDrive, selects a file, indexes it, and chats with the canary token", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		nativeOneDriveConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(240_000); // worker cold-start + Docling + summarize + embed + chunk
+
+		expect(nativeOneDriveConnector.connector_type).toBe("ONEDRIVE_CONNECTOR");
+		expect(nativeOneDriveConnector.is_indexable).toBe(true);
+		expect(nativeOneDriveConnector.config._token_encrypted).toBe(true);
+		expect(nativeOneDriveConnector.config.composio_connected_account_id).toBeUndefined();
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+
+		const selectedFiles = [
+			{
+				id: FAKE_ONEDRIVE_FILES.canary.id,
+				name: FAKE_ONEDRIVE_FILES.canary.name,
+				mimeType: FAKE_ONEDRIVE_FILES.canary.mimeType,
+			},
+			{
+				id: FAKE_ONEDRIVE_FILES.pdf.id,
+				name: FAKE_ONEDRIVE_FILES.pdf.name,
+				mimeType: FAKE_ONEDRIVE_FILES.pdf.mimeType,
+			},
+		];
+		// Keep the shared Drive-style body shape. OneDrive currently defaults
+		// these internally because the indexer expects max_files/use_delta_sync.
+		const indexingOptions = {
+			max_files_per_folder: 10,
+			incremental_sync: false,
+			include_subfolders: false,
+		};
+
+		await updateConnectorConfig(request, apiToken, nativeOneDriveConnector.id, {
+			...nativeOneDriveConnector.config,
+			selected_folders: [],
+			selected_files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await triggerIndex(request, apiToken, nativeOneDriveConnector.id, searchSpace.id, {
+			files: selectedFiles,
+			indexing_options: indexingOptions,
+		});
+
+		await waitForIndexingComplete(request, apiToken, nativeOneDriveConnector.id, searchSpace.id, {
+			timeoutMs: 240_000,
+			intervalMs: 1_500,
+			minDocuments: 2,
+		});
+
+		await waitForDocumentByTitle(
+			request,
+			apiToken,
+			searchSpace.id,
+			FAKE_ONEDRIVE_FILES.canary.name,
+			{ timeoutMs: 30_000 }
+		);
+		await waitForDocumentByTitle(request, apiToken, searchSpace.id, FAKE_ONEDRIVE_FILES.pdf.name, {
+			timeoutMs: 60_000,
+		});
+
+		const docs = await listDocuments(request, apiToken, searchSpace.id);
+		const canaryDoc = docs.find((d) => d.title === FAKE_ONEDRIVE_FILES.canary.name);
+		const pdfDoc = docs.find((d) => d.title === FAKE_ONEDRIVE_FILES.pdf.name);
+
+		expect(canaryDoc, "OneDrive canary document must exist after indexing").toBeDefined();
+		if (!canaryDoc) throw new Error("unreachable: canaryDoc asserted defined above");
+		expect(canaryDoc.document_type).toBe("ONEDRIVE_FILE");
+		expect(pdfDoc, "OneDrive PDF document must exist after indexing").toBeDefined();
+		if (!pdfDoc) throw new Error("unreachable: pdfDoc asserted defined above");
+		expect(pdfDoc.document_type).toBe("ONEDRIVE_FILE");
+
+		const editor = await getEditorContent(request, apiToken, searchSpace.id, canaryDoc.id);
+		expect(
+			editor.source_markdown,
+			`canary token ${CANARY_TOKENS.onedriveCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${editor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.onedriveCanary);
+		expect(editor.document_type).toBe("ONEDRIVE_FILE");
+		expect(editor.chunk_count).toBeGreaterThan(0);
+
+		const pdfEditor = await getEditorContent(request, apiToken, searchSpace.id, pdfDoc.id);
+		expect(
+			pdfEditor.source_markdown,
+			`PDF canary token ${CANARY_TOKENS.onedrivePdfCanary} should appear in editor source_markdown; ` +
+				`got first 200 chars: ${pdfEditor.source_markdown.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.onedrivePdfCanary);
+		expect(pdfEditor.document_type).toBe("ONEDRIVE_FILE");
+		expect(pdfEditor.chunk_count).toBeGreaterThan(0);
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === nativeOneDriveConnector.id);
+		expect(refreshed?.connector_type).toBe("ONEDRIVE_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(true);
+		expect(refreshed?.last_indexed_at).not.toBeNull();
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-onedrive-canary.txt OneDrive file?",
+		});
+		expect(
+			chat.assistantText,
+			"chat agent should surface OneDrive canary token after indexing; " +
+				`got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.onedriveCanary);
+
+		const pdfChat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: "What is in my e2e-onedrive-canary.pdf OneDrive file?",
+		});
+		expect(
+			pdfChat.assistantText,
+			"chat agent should surface OneDrive PDF canary token after indexing; " +
+				`got: ${pdfChat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.onedrivePdfCanary);
+	});
+});
diff --git a/surfsense_web/tests/connectors/slack/journey.spec.ts b/surfsense_web/tests/connectors/slack/journey.spec.ts
new file mode 100644
index 0000000000..ae773befc1
--- /dev/null
+++ b/surfsense_web/tests/connectors/slack/journey.spec.ts
@@ -0,0 +1,86 @@
+import { expect, slackWithChatTest as test } from "../../fixtures";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { listConnectors, triggerIndexExpectDisabled } from "../../helpers/api/connectors";
+import { listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS, FAKE_SLACK_CHANNELS } from "../../helpers/canary";
+import { openConnectorPopup } from "../../helpers/ui/connector-popup";
+
+/**
+ * Proves Slack MCP OAuth -> live MCP tool discovery/call -> chat.
+ *
+ * Slack is live-tool only: the public indexing route returns
+ * indexing_started=false and chat should call Slack MCP tools.
+ */
+test.describe("Slack connector journey", () => {
+	test("user connects Slack and chats through live MCP tools with indexing disabled", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		slackConnector,
+		chatThread,
+	}) => {
+		test.setTimeout(90_000); // worker cold-start + live tool chat
+
+		expect(slackConnector.connector_type).toBe("SLACK_CONNECTOR");
+		expect(slackConnector.is_indexable).toBe(false);
+		expect(slackConnector.config._token_encrypted).toBe(true);
+		expect(slackConnector.config.mcp_service).toBe("slack");
+		expect(slackConnector.config.server_config).toMatchObject({
+			transport: "streamable-http",
+			url: "https://mcp.slack.com/mcp",
+		});
+		expect(slackConnector.config.mcp_oauth).toMatchObject({
+			client_id: "fake-slack-mcp-client-id",
+			token_endpoint: "https://slack.com/api/oauth.v2.user.access",
+		});
+		expect((slackConnector.config.mcp_oauth as Record<string, unknown>).access_token).toBeTruthy();
+		expect(slackConnector.config.access_token).toBeUndefined();
+		expect(slackConnector.config.refresh_token).toBeUndefined();
+		expect(slackConnector.config.team_id).toBe(FAKE_SLACK_CHANNELS.canary.teamId);
+		expect(slackConnector.config.team_name).toBe(FAKE_SLACK_CHANNELS.canary.teamName);
+
+		await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+			waitUntil: "domcontentloaded",
+		});
+		await openConnectorPopup(page);
+		const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+		await expect(connectorDialog).toBeVisible();
+		await connectorDialog.getByPlaceholder("Search").fill("Slack");
+		await expect(connectorDialog.getByText("Slack", { exact: true })).toBeVisible();
+
+		const beforeDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(beforeDocs).toHaveLength(0);
+
+		const disabledIndex = await triggerIndexExpectDisabled(
+			request,
+			apiToken,
+			slackConnector.id,
+			searchSpace.id
+		);
+		expect(disabledIndex.message ?? "").toContain("real-time agent tools");
+		expect(disabledIndex.message ?? "").toContain("background indexing is disabled");
+
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: chatThread.id,
+			query: `What is in my Slack channel "${FAKE_SLACK_CHANNELS.canary.name}"?`,
+		});
+		expect(
+			chat.assistantText,
+			`chat agent should surface Slack canary token from live MCP tools; got: ${chat.assistantText.slice(0, 200)}`
+		).toContain(CANARY_TOKENS.slackCanary);
+
+		const eventText = JSON.stringify(chat.events);
+		expect(eventText).toContain("slack_search_channels");
+
+		const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
+		const refreshed = refreshedConnectors.find((c) => c.id === slackConnector.id);
+		expect(refreshed?.connector_type).toBe("SLACK_CONNECTOR");
+		expect(refreshed?.is_indexable).toBe(false);
+		expect(refreshed?.last_indexed_at).toBeNull();
+
+		const afterDocs = await listDocuments(request, apiToken, searchSpace.id);
+		expect(afterDocs).toHaveLength(0);
+	});
+});
diff --git a/surfsense_web/tests/documents/file-upload/fixtures/canary.md b/surfsense_web/tests/documents/file-upload/fixtures/canary.md
new file mode 100644
index 0000000000..896a2a86f8
--- /dev/null
+++ b/surfsense_web/tests/documents/file-upload/fixtures/canary.md
@@ -0,0 +1,5 @@
+# Manual Upload Markdown Canary
+
+This markdown file proves the manual upload journey can index plaintext content.
+
+Canary token: E2E-MANUAL-UPLOAD-MD-CANARY-7f3a
diff --git a/surfsense_web/tests/documents/file-upload/fixtures/canary.pdf b/surfsense_web/tests/documents/file-upload/fixtures/canary.pdf
new file mode 100644
index 0000000000..562515f54c
Binary files /dev/null and b/surfsense_web/tests/documents/file-upload/fixtures/canary.pdf differ
diff --git a/surfsense_web/tests/documents/file-upload/journey.spec.ts b/surfsense_web/tests/documents/file-upload/journey.spec.ts
new file mode 100644
index 0000000000..711963bf0e
--- /dev/null
+++ b/surfsense_web/tests/documents/file-upload/journey.spec.ts
@@ -0,0 +1,128 @@
+import path from "node:path";
+import type { APIRequestContext, Page } from "@playwright/test";
+import { expect, manualUploadWithChatTest as test } from "../../fixtures";
+import type { ChatThreadRow } from "../../fixtures/chat-thread.fixture";
+import { streamChatToCompletion } from "../../helpers/api/chat";
+import { getEditorContent, listDocuments } from "../../helpers/api/documents";
+import { CANARY_TOKENS } from "../../helpers/canary";
+import { waitForDocumentByTitle } from "../../helpers/waits/indexing";
+
+type UploadFixture = {
+	path: string;
+	name: string;
+	canary: string;
+	chatQuery: string;
+};
+
+type SearchSpace = {
+	id: number;
+};
+
+const MD_FILE: UploadFixture = {
+	path: path.join(__dirname, "fixtures", "canary.md"),
+	name: "canary.md",
+	canary: CANARY_TOKENS.manualUploadMdCanary,
+	chatQuery: "What is in my uploaded canary.md markdown file?",
+};
+
+const PDF_FILE: UploadFixture = {
+	path: path.join(__dirname, "fixtures", "canary.pdf"),
+	name: "canary.pdf",
+	canary: CANARY_TOKENS.manualUploadPdfCanary,
+	chatQuery: "What is in my uploaded canary.pdf file?",
+};
+
+async function uploadAndAssert({
+	page,
+	request,
+	apiToken,
+	searchSpace,
+	chatThread,
+	file,
+}: {
+	page: Page;
+	request: APIRequestContext;
+	apiToken: string;
+	searchSpace: SearchSpace;
+	chatThread: ChatThreadRow;
+	file: UploadFixture;
+}) {
+	await page.goto(`/dashboard/${searchSpace.id}/new-chat`, {
+		waitUntil: "domcontentloaded",
+	});
+
+	await page.getByRole("button", { name: "Upload" }).click();
+	const dialog = page.getByRole("dialog", { name: "Upload Documents" });
+	await expect(dialog).toBeVisible();
+
+	await dialog.locator('input[type="file"]').first().setInputFiles(file.path);
+	await dialog.getByRole("button", { name: /Upload 1 file/i }).click();
+
+	await expect(page.getByText(/Upload Task Initiated/i)).toBeVisible({
+		timeout: 15_000,
+	});
+
+	await waitForDocumentByTitle(request, apiToken, searchSpace.id, file.name, {
+		timeoutMs: 200_000,
+	});
+
+	const docs = await listDocuments(request, apiToken, searchSpace.id);
+	const uploaded = docs.find((d) => d.title === file.name);
+	expect(uploaded, `${file.name} must exist after indexing`).toBeDefined();
+	if (!uploaded) throw new Error("unreachable: uploaded asserted defined above");
+	expect(uploaded.document_type).toBe("FILE");
+
+	const editor = await getEditorContent(request, apiToken, searchSpace.id, uploaded.id);
+	expect(editor.source_markdown).toContain(file.canary);
+	expect(editor.chunk_count).toBeGreaterThan(0);
+
+	const chat = await streamChatToCompletion(request, apiToken, {
+		searchSpaceId: searchSpace.id,
+		threadId: chatThread.id,
+		query: file.chatQuery,
+	});
+	expect(
+		chat.assistantText,
+		`chat agent should surface manual upload canary after indexing; got: ${chat.assistantText.slice(0, 200)}`
+	).toContain(file.canary);
+}
+
+test.describe("Manual file upload journey", () => {
+	test("user uploads a markdown file (PLAINTEXT branch)", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		chatThread,
+	}) => {
+		test.setTimeout(180_000);
+
+		await uploadAndAssert({
+			page,
+			request,
+			apiToken,
+			searchSpace,
+			chatThread,
+			file: MD_FILE,
+		});
+	});
+
+	test("user uploads a PDF (DOCUMENT branch)", async ({
+		page,
+		request,
+		apiToken,
+		searchSpace,
+		chatThread,
+	}) => {
+		test.setTimeout(180_000);
+
+		await uploadAndAssert({
+			page,
+			request,
+			apiToken,
+			searchSpace,
+			chatThread,
+			file: PDF_FILE,
+		});
+	});
+});
diff --git a/surfsense_web/tests/fixtures/chat-thread.fixture.ts b/surfsense_web/tests/fixtures/chat-thread.fixture.ts
new file mode 100644
index 0000000000..81cf3b5c32
--- /dev/null
+++ b/surfsense_web/tests/fixtures/chat-thread.fixture.ts
@@ -0,0 +1,42 @@
+import type { APIRequestContext } from "@playwright/test";
+import { authHeaders, BACKEND_URL } from "../helpers/api/auth";
+import type { SearchSpaceFixtures } from "./search-space.fixture";
+
+export type ChatThreadRow = {
+	id: number;
+	title: string;
+	search_space_id: number;
+	visibility: string;
+	created_by_id: string | null;
+	created_at: string;
+	updated_at: string;
+};
+
+export type ChatThreadFixtures = {
+	chatThread: ChatThreadRow;
+};
+
+type ChatThreadFixtureArgs = SearchSpaceFixtures & {
+	request: APIRequestContext;
+};
+
+export const chatThreadFixtures = {
+	chatThread: async (
+		{ request, apiToken, searchSpace }: ChatThreadFixtureArgs,
+		use: (thread: ChatThreadRow) => Promise<void>
+	) => {
+		const response = await request.post(`${BACKEND_URL}/api/v1/threads`, {
+			headers: authHeaders(apiToken),
+			data: {
+				title: "e2e-drive-journey",
+				search_space_id: searchSpace.id,
+				visibility: "PRIVATE",
+			},
+		});
+		if (!response.ok()) {
+			throw new Error(`create chat thread failed (${response.status()}): ${await response.text()}`);
+		}
+
+		await use((await response.json()) as ChatThreadRow);
+	},
+};
diff --git a/surfsense_web/tests/fixtures/connectors/clickup.fixture.ts b/surfsense_web/tests/fixtures/connectors/clickup.fixture.ts
new file mode 100644
index 0000000000..52f94fc5a3
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/clickup.fixture.ts
@@ -0,0 +1,28 @@
+import { type ConnectorRow, deleteConnector, runClickupOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type ClickupFixtures = {
+	/**
+	 * A pre-connected ClickUp connector inside the fixture's `searchSpace`.
+	 * OAuth and MCP tool calls use E2E ClickUp fakes and are cleaned up
+	 * automatically after the test.
+	 */
+	clickupConnector: ConnectorRow;
+};
+
+export const clickupFixtures = searchSpaceFixtures.extend<ClickupFixtures>({
+	clickupConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runClickupOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"clickupConnector fixture: OAuth completed but no CLICKUP_CONNECTOR was created. " +
+					"Check the backend log — the ClickUp MCP fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/composio-calendar.fixture.ts b/surfsense_web/tests/fixtures/connectors/composio-calendar.fixture.ts
new file mode 100644
index 0000000000..6636924953
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/composio-calendar.fixture.ts
@@ -0,0 +1,33 @@
+import { type ConnectorRow, deleteConnector, runComposioOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type ComposioCalendarFixtures = {
+	/**
+	 * A pre-connected Composio Google Calendar connector inside the
+	 * fixture's `searchSpace`. OAuth uses the strict fake and is cleaned
+	 * up automatically after the test.
+	 */
+	composioCalendarConnector: ConnectorRow;
+};
+
+export const composioCalendarFixtures = searchSpaceFixtures.extend<ComposioCalendarFixtures>({
+	composioCalendarConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runComposioOAuth(
+			request,
+			apiToken,
+			searchSpace.id,
+			"googlecalendar"
+		);
+		if (!connector) {
+			throw new Error(
+				"composioCalendarConnector fixture: OAuth completed but no connector was created. " +
+					"Check the backend log — the strict Composio fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/composio-drive.fixture.ts b/surfsense_web/tests/fixtures/connectors/composio-drive.fixture.ts
new file mode 100644
index 0000000000..470469cfd2
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/composio-drive.fixture.ts
@@ -0,0 +1,28 @@
+import { type ConnectorRow, deleteConnector, runComposioOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type ComposioDriveFixtures = {
+	/**
+	 * A pre-connected Composio Google Drive connector inside the
+	 * fixture's `searchSpace`. OAuth happens against the strict fake
+	 * (no real network). Cleaned up automatically after the test.
+	 */
+	composioDriveConnector: ConnectorRow;
+};
+
+export const composioDriveFixtures = searchSpaceFixtures.extend<ComposioDriveFixtures>({
+	composioDriveConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runComposioOAuth(request, apiToken, searchSpace.id, "googledrive");
+		if (!connector) {
+			throw new Error(
+				"composioDriveConnector fixture: OAuth completed but no connector was created. " +
+					"Check the backend log — the strict Composio fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/composio-gmail.fixture.ts b/surfsense_web/tests/fixtures/connectors/composio-gmail.fixture.ts
new file mode 100644
index 0000000000..ab2cf6f004
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/composio-gmail.fixture.ts
@@ -0,0 +1,28 @@
+import { type ConnectorRow, deleteConnector, runComposioOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type ComposioGmailFixtures = {
+	/**
+	 * A pre-connected Composio Gmail connector inside the fixture's
+	 * `searchSpace`. OAuth happens against the strict fake (no real
+	 * network). Cleaned up automatically after the test.
+	 */
+	composioGmailConnector: ConnectorRow;
+};
+
+export const composioGmailFixtures = searchSpaceFixtures.extend<ComposioGmailFixtures>({
+	composioGmailConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runComposioOAuth(request, apiToken, searchSpace.id, "gmail");
+		if (!connector) {
+			throw new Error(
+				"composioGmailConnector fixture: OAuth completed but no connector was created. " +
+					"Check the backend log — the strict Composio fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/confluence.fixture.ts b/surfsense_web/tests/fixtures/connectors/confluence.fixture.ts
new file mode 100644
index 0000000000..bea950279d
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/confluence.fixture.ts
@@ -0,0 +1,32 @@
+import {
+	type ConnectorRow,
+	deleteConnector,
+	runConfluenceOAuth,
+} from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type ConfluenceFixtures = {
+	/**
+	 * A pre-connected Confluence connector inside the fixture's `searchSpace`.
+	 * OAuth uses E2E Atlassian fakes and is cleaned up automatically after
+	 * the test.
+	 */
+	confluenceConnector: ConnectorRow;
+};
+
+export const confluenceFixtures = searchSpaceFixtures.extend<ConfluenceFixtures>({
+	confluenceConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runConfluenceOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"confluenceConnector fixture: OAuth completed but no CONFLUENCE_CONNECTOR was created. " +
+					"Check the backend log — the Confluence OAuth fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/jira.fixture.ts b/surfsense_web/tests/fixtures/connectors/jira.fixture.ts
new file mode 100644
index 0000000000..4d1252e2ee
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/jira.fixture.ts
@@ -0,0 +1,28 @@
+import { type ConnectorRow, deleteConnector, runJiraOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type JiraFixtures = {
+	/**
+	 * A pre-connected Jira connector inside the fixture's `searchSpace`.
+	 * OAuth and MCP tool calls use E2E Jira fakes and are cleaned up
+	 * automatically after the test.
+	 */
+	jiraConnector: ConnectorRow;
+};
+
+export const jiraFixtures = searchSpaceFixtures.extend<JiraFixtures>({
+	jiraConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runJiraOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"jiraConnector fixture: OAuth completed but no JIRA_CONNECTOR was created. " +
+					"Check the backend log — the Jira MCP fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/linear.fixture.ts b/surfsense_web/tests/fixtures/connectors/linear.fixture.ts
new file mode 100644
index 0000000000..b7d716f28a
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/linear.fixture.ts
@@ -0,0 +1,28 @@
+import { type ConnectorRow, deleteConnector, runLinearOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type LinearFixtures = {
+	/**
+	 * A pre-connected Linear connector inside the fixture's `searchSpace`.
+	 * OAuth and MCP tool calls use E2E Linear fakes and are cleaned up
+	 * automatically after the test.
+	 */
+	linearConnector: ConnectorRow;
+};
+
+export const linearFixtures = searchSpaceFixtures.extend<LinearFixtures>({
+	linearConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runLinearOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"linearConnector fixture: OAuth completed but no LINEAR_CONNECTOR was created. " +
+					"Check the backend log — the Linear MCP fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/native-calendar.fixture.ts b/surfsense_web/tests/fixtures/connectors/native-calendar.fixture.ts
new file mode 100644
index 0000000000..be2ed4fb12
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/native-calendar.fixture.ts
@@ -0,0 +1,32 @@
+import {
+	type ConnectorRow,
+	deleteConnector,
+	runNativeGoogleCalendarOAuth,
+} from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type NativeCalendarFixtures = {
+	/**
+	 * A pre-connected native Google Calendar connector inside the fixture's
+	 * `searchSpace`. OAuth uses E2E native Google fakes and is cleaned up
+	 * automatically after the test.
+	 */
+	nativeCalendarConnector: ConnectorRow;
+};
+
+export const nativeCalendarFixtures = searchSpaceFixtures.extend<NativeCalendarFixtures>({
+	nativeCalendarConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runNativeGoogleCalendarOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"nativeCalendarConnector fixture: OAuth completed but no GOOGLE_CALENDAR_CONNECTOR was created. " +
+					"Check the backend log — the native Google fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/native-drive.fixture.ts b/surfsense_web/tests/fixtures/connectors/native-drive.fixture.ts
new file mode 100644
index 0000000000..35b4b5ddc0
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/native-drive.fixture.ts
@@ -0,0 +1,32 @@
+import {
+	type ConnectorRow,
+	deleteConnector,
+	runNativeGoogleDriveOAuth,
+} from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type NativeDriveFixtures = {
+	/**
+	 * A pre-connected native Google Drive connector inside the fixture's
+	 * `searchSpace`. OAuth uses E2E native Google fakes and is cleaned up
+	 * automatically after the test.
+	 */
+	nativeDriveConnector: ConnectorRow;
+};
+
+export const nativeDriveFixtures = searchSpaceFixtures.extend<NativeDriveFixtures>({
+	nativeDriveConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runNativeGoogleDriveOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"nativeDriveConnector fixture: OAuth completed but no GOOGLE_DRIVE_CONNECTOR was created. " +
+					"Check the backend log — the native Google fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/native-dropbox.fixture.ts b/surfsense_web/tests/fixtures/connectors/native-dropbox.fixture.ts
new file mode 100644
index 0000000000..aa6945acf4
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/native-dropbox.fixture.ts
@@ -0,0 +1,32 @@
+import {
+	type ConnectorRow,
+	deleteConnector,
+	runNativeDropboxOAuth,
+} from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type NativeDropboxFixtures = {
+	/**
+	 * A pre-connected native Dropbox connector inside the fixture's
+	 * `searchSpace`. OAuth uses E2E Dropbox fakes and is cleaned up
+	 * automatically after the test.
+	 */
+	nativeDropboxConnector: ConnectorRow;
+};
+
+export const nativeDropboxFixtures = searchSpaceFixtures.extend<NativeDropboxFixtures>({
+	nativeDropboxConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runNativeDropboxOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"nativeDropboxConnector fixture: OAuth completed but no DROPBOX_CONNECTOR was created. " +
+					"Check the backend log — the Dropbox fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/native-gmail.fixture.ts b/surfsense_web/tests/fixtures/connectors/native-gmail.fixture.ts
new file mode 100644
index 0000000000..73275decf2
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/native-gmail.fixture.ts
@@ -0,0 +1,32 @@
+import {
+	type ConnectorRow,
+	deleteConnector,
+	runNativeGoogleGmailOAuth,
+} from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type NativeGmailFixtures = {
+	/**
+	 * A pre-connected native Google Gmail connector inside the fixture's
+	 * `searchSpace`. OAuth uses E2E native Google fakes and is cleaned up
+	 * automatically after the test.
+	 */
+	nativeGmailConnector: ConnectorRow;
+};
+
+export const nativeGmailFixtures = searchSpaceFixtures.extend<NativeGmailFixtures>({
+	nativeGmailConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runNativeGoogleGmailOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"nativeGmailConnector fixture: OAuth completed but no GOOGLE_GMAIL_CONNECTOR was created. " +
+					"Check the backend log — the native Google fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/native-onedrive.fixture.ts b/surfsense_web/tests/fixtures/connectors/native-onedrive.fixture.ts
new file mode 100644
index 0000000000..bdcd9ec869
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/native-onedrive.fixture.ts
@@ -0,0 +1,32 @@
+import {
+	type ConnectorRow,
+	deleteConnector,
+	runNativeOneDriveOAuth,
+} from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type NativeOneDriveFixtures = {
+	/**
+	 * A pre-connected native Microsoft OneDrive connector inside the
+	 * fixture's `searchSpace`. OAuth uses E2E Microsoft Graph fakes and is
+	 * cleaned up automatically after the test.
+	 */
+	nativeOneDriveConnector: ConnectorRow;
+};
+
+export const nativeOneDriveFixtures = searchSpaceFixtures.extend<NativeOneDriveFixtures>({
+	nativeOneDriveConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runNativeOneDriveOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"nativeOneDriveConnector fixture: OAuth completed but no ONEDRIVE_CONNECTOR was created. " +
+					"Check the backend log — the OneDrive fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/notion.fixture.ts b/surfsense_web/tests/fixtures/connectors/notion.fixture.ts
new file mode 100644
index 0000000000..d0032663c8
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/notion.fixture.ts
@@ -0,0 +1,27 @@
+import { type ConnectorRow, deleteConnector, runNotionOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type NotionFixtures = {
+	/**
+	 * A pre-connected Notion connector inside the fixture's `searchSpace`.
+	 * OAuth uses E2E Notion fakes and is cleaned up automatically after the test.
+	 */
+	notionConnector: ConnectorRow;
+};
+
+export const notionFixtures = searchSpaceFixtures.extend<NotionFixtures>({
+	notionConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runNotionOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"notionConnector fixture: OAuth completed but no NOTION_CONNECTOR was created. " +
+					"Check the backend log — the Notion fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/connectors/slack.fixture.ts b/surfsense_web/tests/fixtures/connectors/slack.fixture.ts
new file mode 100644
index 0000000000..898c2013bf
--- /dev/null
+++ b/surfsense_web/tests/fixtures/connectors/slack.fixture.ts
@@ -0,0 +1,28 @@
+import { type ConnectorRow, deleteConnector, runSlackOAuth } from "../../helpers/api/connectors";
+import { searchSpaceFixtures } from "../search-space.fixture";
+
+export type SlackFixtures = {
+	/**
+	 * A pre-connected Slack connector inside the fixture's `searchSpace`.
+	 * OAuth and MCP tool calls use E2E Slack fakes and are cleaned up
+	 * automatically after the test.
+	 */
+	slackConnector: ConnectorRow;
+};
+
+export const slackFixtures = searchSpaceFixtures.extend<SlackFixtures>({
+	slackConnector: async ({ request, apiToken, searchSpace }, use) => {
+		const { connector } = await runSlackOAuth(request, apiToken, searchSpace.id);
+		if (!connector) {
+			throw new Error(
+				"slackConnector fixture: OAuth completed but no SLACK_CONNECTOR was created. " +
+					"Check the backend log — the Slack MCP fake likely rejected an unmodelled call."
+			);
+		}
+		try {
+			await use(connector);
+		} finally {
+			await deleteConnector(request, apiToken, connector.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/fixtures/index.ts b/surfsense_web/tests/fixtures/index.ts
new file mode 100644
index 0000000000..605e8492aa
--- /dev/null
+++ b/surfsense_web/tests/fixtures/index.ts
@@ -0,0 +1,155 @@
+/**
+ * Central export of all Playwright fixtures used across the E2E suite.
+ *
+ * Specs import `test` and `expect` from here instead of `@playwright/test`
+ * directly so that adding a new fixture (e.g. for a new connector) is a
+ * one-line change for every spec that needs it.
+ *
+ * Inheritance chain:
+ *   base (@playwright/test)
+ *     └─ searchSpaceFixtures   — apiToken, searchSpace
+ *         ├─ composioDriveFixtures — composioDriveConnector
+ *         │   └─ composioDriveWithChatTest — chatThread
+ *         └─ composioGmailFixtures — composioGmailConnector
+ *             └─ composioGmailWithChatTest — chatThread
+ *         └─ composioCalendarFixtures — composioCalendarConnector
+ *             └─ composioCalendarWithChatTest — chatThread
+ *         └─ nativeDriveFixtures — nativeDriveConnector
+ *             └─ nativeDriveWithChatTest — chatThread
+ *         └─ nativeGmailFixtures — nativeGmailConnector
+ *             └─ nativeGmailWithChatTest — chatThread
+ *         └─ nativeCalendarFixtures — nativeCalendarConnector
+ *             └─ nativeCalendarWithChatTest — chatThread
+ *         └─ nativeOneDriveFixtures — nativeOneDriveConnector
+ *             └─ nativeOneDriveWithChatTest — chatThread
+ *         └─ nativeDropboxFixtures — nativeDropboxConnector
+ *             └─ nativeDropboxWithChatTest — chatThread
+ *         └─ notionFixtures — notionConnector
+ *             └─ notionWithChatTest — chatThread
+ *         └─ confluenceFixtures — confluenceConnector
+ *             └─ confluenceWithChatTest — chatThread
+ *         └─ linearFixtures — linearConnector
+ *             └─ linearWithChatTest — chatThread
+ *         └─ jiraFixtures — jiraConnector
+ *             └─ jiraWithChatTest — chatThread
+ *         └─ clickupFixtures — clickupConnector
+ *             └─ clickupWithChatTest — chatThread
+ *         └─ slackFixtures — slackConnector
+ *             └─ slackWithChatTest — chatThread
+ *         └─ manualUploadWithChatTest — chatThread (no connector; uses base search space)
+ *
+ * To add a new connector (Gmail, Slack, manual upload, etc.):
+ *   1. Add a fixture file under `fixtures/connectors/<name>.fixture.ts`.
+ *   2. Re-export it here under a new typed `test` if the new fixture
+ *      doesn't compose cleanly into the existing chain.
+ */
+export { expect } from "@playwright/test";
+export { chatThreadFixtures } from "./chat-thread.fixture";
+export { clickupFixtures } from "./connectors/clickup.fixture";
+export { composioCalendarFixtures } from "./connectors/composio-calendar.fixture";
+export { composioDriveFixtures } from "./connectors/composio-drive.fixture";
+export { composioGmailFixtures } from "./connectors/composio-gmail.fixture";
+export { confluenceFixtures } from "./connectors/confluence.fixture";
+export { jiraFixtures } from "./connectors/jira.fixture";
+export { linearFixtures } from "./connectors/linear.fixture";
+export { nativeCalendarFixtures } from "./connectors/native-calendar.fixture";
+export { nativeDriveFixtures } from "./connectors/native-drive.fixture";
+export { nativeDropboxFixtures } from "./connectors/native-dropbox.fixture";
+export { nativeGmailFixtures } from "./connectors/native-gmail.fixture";
+export { nativeOneDriveFixtures } from "./connectors/native-onedrive.fixture";
+export { notionFixtures } from "./connectors/notion.fixture";
+export { slackFixtures } from "./connectors/slack.fixture";
+export { searchSpaceFixtures } from "./search-space.fixture";
+
+import { type ChatThreadFixtures, chatThreadFixtures } from "./chat-thread.fixture";
+import { clickupFixtures } from "./connectors/clickup.fixture";
+import { composioCalendarFixtures } from "./connectors/composio-calendar.fixture";
+import { composioDriveFixtures } from "./connectors/composio-drive.fixture";
+import { composioGmailFixtures } from "./connectors/composio-gmail.fixture";
+import { confluenceFixtures } from "./connectors/confluence.fixture";
+import { jiraFixtures } from "./connectors/jira.fixture";
+import { linearFixtures } from "./connectors/linear.fixture";
+import { nativeCalendarFixtures } from "./connectors/native-calendar.fixture";
+import { nativeDriveFixtures } from "./connectors/native-drive.fixture";
+import { nativeDropboxFixtures } from "./connectors/native-dropbox.fixture";
+import { nativeGmailFixtures } from "./connectors/native-gmail.fixture";
+import { nativeOneDriveFixtures } from "./connectors/native-onedrive.fixture";
+import { notionFixtures } from "./connectors/notion.fixture";
+import { slackFixtures } from "./connectors/slack.fixture";
+import { searchSpaceFixtures } from "./search-space.fixture";
+
+/** Default `test` for specs that just need auth + a clean search space. */
+export const test = searchSpaceFixtures;
+/** `test` for specs that also need a pre-connected Composio Drive connector. */
+export const composioDriveTest = composioDriveFixtures;
+/** `test` for Drive specs that also need a chat thread. */
+export const composioDriveWithChatTest =
+	composioDriveFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected Composio Gmail connector. */
+export const composioGmailTest = composioGmailFixtures;
+/** `test` for Gmail specs that also need a chat thread. */
+export const composioGmailWithChatTest =
+	composioGmailFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected Composio Calendar connector. */
+export const composioCalendarTest = composioCalendarFixtures;
+/** `test` for Calendar specs that also need a chat thread. */
+export const composioCalendarWithChatTest =
+	composioCalendarFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected native Google Drive connector. */
+export const nativeDriveTest = nativeDriveFixtures;
+/** `test` for native Drive specs that also need a chat thread. */
+export const nativeDriveWithChatTest =
+	nativeDriveFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected native Gmail connector. */
+export const nativeGmailTest = nativeGmailFixtures;
+/** `test` for native Gmail specs that also need a chat thread. */
+export const nativeGmailWithChatTest =
+	nativeGmailFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected native Calendar connector. */
+export const nativeCalendarTest = nativeCalendarFixtures;
+/** `test` for native Calendar specs that also need a chat thread. */
+export const nativeCalendarWithChatTest =
+	nativeCalendarFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected native OneDrive connector. */
+export const nativeOneDriveTest = nativeOneDriveFixtures;
+/** `test` for native OneDrive specs that also need a chat thread. */
+export const nativeOneDriveWithChatTest =
+	nativeOneDriveFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected native Dropbox connector. */
+export const nativeDropboxTest = nativeDropboxFixtures;
+/** `test` for native Dropbox specs that also need a chat thread. */
+export const nativeDropboxWithChatTest =
+	nativeDropboxFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected Notion connector. */
+export const notionTest = notionFixtures;
+/** `test` for Notion specs that also need a chat thread. */
+export const notionWithChatTest = notionFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected Confluence connector. */
+export const confluenceTest = confluenceFixtures;
+/** `test` for Confluence specs that also need a chat thread. */
+export const confluenceWithChatTest =
+	confluenceFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected Linear connector. */
+export const linearTest = linearFixtures;
+/** `test` for Linear specs that also need a chat thread. */
+export const linearWithChatTest = linearFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected Jira connector. */
+export const jiraTest = jiraFixtures;
+/** `test` for Jira specs that also need a chat thread. */
+export const jiraWithChatTest = jiraFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected ClickUp connector. */
+export const clickupTest = clickupFixtures;
+/** `test` for ClickUp specs that also need a chat thread. */
+export const clickupWithChatTest = clickupFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/** `test` for specs that also need a pre-connected Slack connector. */
+export const slackTest = slackFixtures;
+/** `test` for Slack specs that also need a chat thread. */
+export const slackWithChatTest = slackFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
+/**
+ * `test` for manual upload specs that also need a chat thread.
+ *
+ * Manual upload has no connector fixture — the user uploads files directly via
+ * the Documents-sidebar UI — so this composes chat onto the bare search-space.
+ */
+export const manualUploadWithChatTest =
+	searchSpaceFixtures.extend<ChatThreadFixtures>(chatThreadFixtures);
diff --git a/surfsense_web/tests/fixtures/search-space.fixture.ts b/surfsense_web/tests/fixtures/search-space.fixture.ts
new file mode 100644
index 0000000000..62958caf4d
--- /dev/null
+++ b/surfsense_web/tests/fixtures/search-space.fixture.ts
@@ -0,0 +1,86 @@
+import fs from "node:fs";
+import path from "node:path";
+import { test as base } from "@playwright/test";
+import { acquireTestToken } from "../helpers/api/auth";
+import {
+	createSearchSpace,
+	deleteSearchSpace,
+	type SearchSpaceRow,
+} from "../helpers/api/search-spaces";
+import { uniqueSearchSpaceName } from "../helpers/canary";
+
+export type SearchSpaceFixtures = {
+	/**
+	 * Bearer token for the seeded test user. Worker-scoped so we only
+	 * log in once per worker (logins are cheap, but caching is cheaper).
+	 */
+	apiToken: string;
+	/**
+	 * A fresh, named search space for the current test. Cleaned up
+	 * automatically after the test finishes.
+	 */
+	searchSpace: SearchSpaceRow;
+};
+
+const STORAGE_KEY = "surfsense_bearer_token";
+
+// Reuse the token written by tests/auth.setup.ts; on cache miss we
+// mint a fresh one via /__e2e__/auth/token (rate-limit-free).
+const AUTH_STATE_PATH = path.join(__dirname, "..", "..", "playwright", ".auth", "user.json");
+
+function loadCachedBearerToken(): string | null {
+	try {
+		const raw = fs.readFileSync(AUTH_STATE_PATH, "utf8");
+		const parsed = JSON.parse(raw) as {
+			origins?: Array<{
+				origin?: string;
+				localStorage?: Array<{ name?: string; value?: string }>;
+			}>;
+		};
+		for (const origin of parsed.origins ?? []) {
+			for (const entry of origin.localStorage ?? []) {
+				if (entry.name === STORAGE_KEY && entry.value) {
+					return entry.value;
+				}
+			}
+		}
+	} catch {
+		// Fall back to a fresh login.
+	}
+	return null;
+}
+
+export const searchSpaceFixtures = base.extend<SearchSpaceFixtures, { apiTokenWorker: string }>({
+	apiTokenWorker: [
+		async ({ playwright }, use) => {
+			const cached = loadCachedBearerToken();
+			if (cached) {
+				await use(cached);
+				return;
+			}
+			const ctx = await playwright.request.newContext();
+			try {
+				const token = await acquireTestToken(ctx);
+				await use(token);
+			} finally {
+				await ctx.dispose();
+			}
+		},
+		{ scope: "worker" },
+	],
+	apiToken: async ({ apiTokenWorker }, use) => {
+		await use(apiTokenWorker);
+	},
+	searchSpace: async ({ request, apiToken }, use) => {
+		const space = await createSearchSpace(
+			request,
+			apiToken,
+			uniqueSearchSpaceName("composio-drive-e2e")
+		);
+		try {
+			await use(space);
+		} finally {
+			await deleteSearchSpace(request, apiToken, space.id);
+		}
+	},
+});
diff --git a/surfsense_web/tests/helpers/api/auth.ts b/surfsense_web/tests/helpers/api/auth.ts
new file mode 100644
index 0000000000..6492b09ba8
--- /dev/null
+++ b/surfsense_web/tests/helpers/api/auth.ts
@@ -0,0 +1,99 @@
+import type { APIRequestContext } from "@playwright/test";
+
+/**
+ * Direct backend auth helper. Uses the same /auth/jwt/login endpoint the
+ * UI uses; mirrors lib/apis/auth-api.service.ts.
+ *
+ * Returns a bearer token specs can attach to API calls when they don't
+ * want to go through the browser. The browser-side auth (localStorage)
+ * is set up separately by tests/auth.setup.ts.
+ */
+
+export const BACKEND_URL = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
+
+const TEST_USER_EMAIL = process.env.PLAYWRIGHT_TEST_EMAIL || "e2e-test@surfsense.net";
+const TEST_USER_PASSWORD = process.env.PLAYWRIGHT_TEST_PASSWORD || "E2eTestPassword123!";
+const E2E_MINT_SECRET = process.env.E2E_MINT_SECRET || "local-e2e-mint-secret-not-for-production";
+
+/**
+ * Mints a JWT for the seeded e2e user via the test-only endpoint mounted
+ * by surfsense_backend/tests/e2e/run_backend.py. Bypasses the production
+ * /auth/jwt/login rate limit (5/min/IP), so it's safe to call from any
+ * worker / retry. Returns 404 from the backend when the endpoint isn't
+ * mounted (i.e. someone is pointing the suite at a non-e2e backend).
+ */
+export async function mintTestToken(
+	request: APIRequestContext,
+	email: string = TEST_USER_EMAIL
+): Promise<string> {
+	const response = await request.post(`${BACKEND_URL}/__e2e__/auth/token`, {
+		data: { email },
+		headers: {
+			"Content-Type": "application/json",
+			"X-E2E-Mint-Secret": E2E_MINT_SECRET,
+		},
+	});
+	if (!response.ok()) {
+		throw new Error(
+			`Mint token at ${BACKEND_URL}/__e2e__/auth/token failed (${response.status()}): ${await response.text()}`
+		);
+	}
+	const { access_token } = (await response.json()) as { access_token: string };
+	if (!access_token) {
+		throw new Error("Mint response missing access_token");
+	}
+	return access_token;
+}
+
+export async function loginAsTestUser(request: APIRequestContext): Promise<string> {
+	const response = await request.post(`${BACKEND_URL}/auth/jwt/login`, {
+		form: {
+			username: TEST_USER_EMAIL,
+			password: TEST_USER_PASSWORD,
+			grant_type: "password",
+		},
+		headers: { "Content-Type": "application/x-www-form-urlencoded" },
+	});
+
+	if (!response.ok()) {
+		throw new Error(
+			`Login to ${BACKEND_URL}/auth/jwt/login failed (${response.status()}): ${await response.text()}`
+		);
+	}
+
+	const { access_token } = (await response.json()) as { access_token: string };
+	if (!access_token) {
+		throw new Error("Backend response missing access_token");
+	}
+	return access_token;
+}
+
+/**
+ * Get a bearer token by trying the rate-limit-free mint endpoint first
+ * and falling back to /auth/jwt/login if the e2e endpoint isn't mounted
+ * (e.g. running against a non-e2e backend in local dev).
+ */
+export async function acquireTestToken(request: APIRequestContext): Promise<string> {
+	try {
+		return await mintTestToken(request);
+	} catch (err) {
+		const msg = err instanceof Error ? err.message : String(err);
+		if (msg.includes("(404)") || msg.includes("(405)")) {
+			return loginAsTestUser(request);
+		}
+		throw err;
+	}
+}
+
+/**
+ * Standard auth headers for backend API calls. Optionally injects an
+ * X-E2E-Scenario header that the test-only ScenarioMiddleware in
+ * surfsense_backend/tests/e2e/run_backend.py reads to flip fake behavior.
+ */
+export function authHeaders(token: string, extra?: Record<string, string>): Record<string, string> {
+	return {
+		Authorization: `Bearer ${token}`,
+		"Content-Type": "application/json",
+		...(extra ?? {}),
+	};
+}
diff --git a/surfsense_web/tests/helpers/api/chat.ts b/surfsense_web/tests/helpers/api/chat.ts
new file mode 100644
index 0000000000..f5e1f92bb0
--- /dev/null
+++ b/surfsense_web/tests/helpers/api/chat.ts
@@ -0,0 +1,67 @@
+import type { APIRequestContext } from "@playwright/test";
+import { authHeaders, BACKEND_URL } from "./auth";
+
+export type ChatStreamEvent = {
+	type: string;
+	payload: unknown;
+};
+
+export type ChatStreamResult = {
+	assistantText: string;
+	events: ChatStreamEvent[];
+};
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+	return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+
+export async function streamChatToCompletion(
+	request: APIRequestContext,
+	token: string,
+	args: { searchSpaceId: number; threadId: number; query: string }
+): Promise<ChatStreamResult> {
+	const response = await request.post(`${BACKEND_URL}/api/v1/new_chat`, {
+		headers: authHeaders(token),
+		data: {
+			chat_id: args.threadId,
+			search_space_id: args.searchSpaceId,
+			user_query: args.query,
+		},
+	});
+	if (!response.ok()) {
+		throw new Error(
+			`streamChatToCompletion failed (${response.status()}): ${await response.text()}`
+		);
+	}
+
+	const body = await response.text();
+	let assistantText = "";
+	let sawDone = false;
+	const events: ChatStreamEvent[] = [];
+
+	for (const rawFrame of body.split("\n\n")) {
+		const frame = rawFrame.trim();
+		if (!frame) continue;
+		if (!frame.startsWith("data: ")) continue;
+
+		const payloadText = frame.slice("data: ".length);
+		if (payloadText === "[DONE]") {
+			sawDone = true;
+			events.push({ type: "done", payload: "[DONE]" });
+			break;
+		}
+
+		const payload = JSON.parse(payloadText) as unknown;
+		const type = isRecord(payload) && typeof payload.type === "string" ? payload.type : "unknown";
+		if (type === "text-delta" && isRecord(payload) && typeof payload.delta === "string") {
+			assistantText += payload.delta;
+		}
+		events.push({ type, payload });
+	}
+
+	if (!sawDone) {
+		throw new Error(`Chat stream did not finish with [DONE]. Body: ${body.slice(0, 500)}`);
+	}
+
+	return { assistantText, events };
+}
diff --git a/surfsense_web/tests/helpers/api/connectors.ts b/surfsense_web/tests/helpers/api/connectors.ts
new file mode 100644
index 0000000000..051b026e0e
--- /dev/null
+++ b/surfsense_web/tests/helpers/api/connectors.ts
@@ -0,0 +1,761 @@
+import type { APIRequestContext } from "@playwright/test";
+import { authHeaders, BACKEND_URL } from "./auth";
+
+export type ConnectorRow = {
+	id: number;
+	name: string;
+	connector_type: string;
+	config: Record<string, unknown>;
+	last_indexed_at: string | null;
+	is_indexable: boolean;
+};
+
+export async function listConnectors(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<ConnectorRow[]> {
+	const response = await request.get(
+		`${BACKEND_URL}/api/v1/search-source-connectors?search_space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!response.ok()) {
+		throw new Error(`listConnectors failed (${response.status()}): ${await response.text()}`);
+	}
+	const data = await response.json();
+	return Array.isArray(data) ? data : (data?.items ?? []);
+}
+
+export async function getConnector(
+	request: APIRequestContext,
+	token: string,
+	connectorId: number
+): Promise<ConnectorRow> {
+	const response = await request.get(
+		`${BACKEND_URL}/api/v1/search-source-connectors/${connectorId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!response.ok()) {
+		throw new Error(
+			`getConnector(${connectorId}) failed (${response.status()}): ${await response.text()}`
+		);
+	}
+	return (await response.json()) as ConnectorRow;
+}
+
+export async function updateConnectorConfig(
+	request: APIRequestContext,
+	token: string,
+	connectorId: number,
+	config: Record<string, unknown>
+): Promise<ConnectorRow> {
+	const response = await request.put(
+		`${BACKEND_URL}/api/v1/search-source-connectors/${connectorId}`,
+		{ headers: authHeaders(token), data: { config } }
+	);
+	if (!response.ok()) {
+		throw new Error(
+			`updateConnectorConfig(${connectorId}) failed (${response.status()}): ${await response.text()}`
+		);
+	}
+	return (await response.json()) as ConnectorRow;
+}
+
+export async function deleteConnector(
+	request: APIRequestContext,
+	token: string,
+	connectorId: number
+): Promise<void> {
+	const response = await request.delete(
+		`${BACKEND_URL}/api/v1/search-source-connectors/${connectorId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!response.ok() && response.status() !== 404) {
+		throw new Error(
+			`deleteConnector(${connectorId}) failed (${response.status()}): ${await response.text()}`
+		);
+	}
+}
+
+export async function listComposioDriveFolders(
+	request: APIRequestContext,
+	token: string,
+	connectorId: number,
+	parentId?: string
+): Promise<{ items: Array<Record<string, unknown>> }> {
+	const url = parentId
+		? `${BACKEND_URL}/api/v1/connectors/${connectorId}/composio-drive/folders?parent_id=${encodeURIComponent(parentId)}`
+		: `${BACKEND_URL}/api/v1/connectors/${connectorId}/composio-drive/folders`;
+	const response = await request.get(url, {
+		headers: authHeaders(token),
+	});
+	if (!response.ok()) {
+		throw new Error(
+			`listComposioDriveFolders(${connectorId}) failed (${response.status()}): ${await response.text()}`
+		);
+	}
+	return (await response.json()) as { items: Array<Record<string, unknown>> };
+}
+
+export type IndexBody = {
+	folders?: Array<{ id: string; name: string; mimeType: string }>;
+	files?: Array<{ id: string; name: string; mimeType: string }>;
+	indexing_options?: {
+		max_files_per_folder?: number;
+		incremental_sync?: boolean;
+		include_subfolders?: boolean;
+	};
+};
+
+export async function triggerIndex(
+	request: APIRequestContext,
+	token: string,
+	connectorId: number,
+	searchSpaceId: number,
+	body: IndexBody
+): Promise<{ ok: true }> {
+	const response = await request.post(
+		`${BACKEND_URL}/api/v1/search-source-connectors/${connectorId}/index?search_space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token), data: body }
+	);
+	if (!response.ok()) {
+		throw new Error(
+			`triggerIndex(${connectorId}) failed (${response.status()}): ${await response.text()}`
+		);
+	}
+	return { ok: true };
+}
+
+export async function triggerIndexExpectDisabled(
+	request: APIRequestContext,
+	token: string,
+	connectorId: number,
+	searchSpaceId: number,
+	body: IndexBody = {}
+): Promise<{ indexing_started: false; message?: string }> {
+	const response = await request.post(
+		`${BACKEND_URL}/api/v1/search-source-connectors/${connectorId}/index?search_space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token), data: body }
+	);
+	if (!response.ok()) {
+		throw new Error(
+			`triggerIndexExpectDisabled(${connectorId}) failed (${response.status()}): ${await response.text()}`
+		);
+	}
+	const payload = (await response.json()) as { indexing_started?: boolean; message?: string };
+	if (payload.indexing_started !== false) {
+		throw new Error(
+			`triggerIndexExpectDisabled(${connectorId}) expected indexing_started=false, got ${JSON.stringify(payload)}`
+		);
+	}
+	return { indexing_started: false, message: payload.message };
+}
+
+/**
+ * Drives the OAuth flow for a Composio toolkit programmatically.
+ *
+ * Steps mirror what the UI does (see use-connector-dialog.ts):
+ *   1) GET /api/v1/auth/composio/connector/add?space_id=&toolkit_id=  -> { auth_url }
+ *   2) Follow the auth_url (which the E2E fake makes same-origin so it
+ *      lands on the callback directly with ?connectedAccountId=...).
+ *   3) Backend creates the connector and redirects to the frontend
+ *      success page.
+ *
+ * Returns the newly-created (or reconnected) connector row.
+ */
+export async function runComposioOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number,
+	toolkitId: "googledrive" | "gmail" | "googlecalendar" = "googledrive"
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	// Step 1: kick off OAuth, get auth_url.
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/composio/connector/add?space_id=${searchSpaceId}&toolkit_id=${toolkitId}`,
+		{
+			headers: authHeaders(token),
+		}
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`composio initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("composio initiate response missing auth_url");
+	}
+
+	// Step 2: follow the auth_url. The fake makes this same-origin and
+	// pointing at the callback. Use maxRedirects=0 so we can inspect
+	// the final redirect target manually.
+	const callbackResp = await request.get(auth_url, {
+		headers: authHeaders(token),
+		maxRedirects: 0,
+		failOnStatusCode: false,
+	});
+
+	const location = callbackResp.headers().location ?? auth_url;
+
+	// Step 3: look up the resulting connector (if any).
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const composioType =
+		toolkitId === "googledrive"
+			? "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"
+			: toolkitId === "gmail"
+				? "COMPOSIO_GMAIL_CONNECTOR"
+				: "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR";
+	const connector = connectors.find((c) => c.connector_type === composioType) ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the native Google Drive OAuth flow programmatically.
+ *
+ * The E2E backend patches Google OAuth so the returned auth_url points
+ * straight back to the backend callback with a deterministic code/state.
+ */
+export async function runNativeGoogleDriveOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/google/drive/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`native Google Drive initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("native Google Drive initiate response missing auth_url");
+	}
+
+	const callbackResp = await request.get(auth_url, {
+		headers: authHeaders(token),
+		maxRedirects: 0,
+		failOnStatusCode: false,
+	});
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "GOOGLE_DRIVE_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the native Microsoft OneDrive OAuth flow programmatically.
+ *
+ * The OneDrive authorization URL is off-origin, so the helper extracts the
+ * signed state and calls the backend callback directly. The E2E backend fakes
+ * Microsoft's token and profile HTTP responses inside that callback.
+ */
+export async function runNativeOneDriveOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/onedrive/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`native OneDrive initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("native OneDrive initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`native OneDrive auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/onedrive/connector/callback?code=fake-onedrive-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	if (callbackResp.status() >= 400) {
+		throw new Error(
+			`native OneDrive callback failed (${callbackResp.status()}): ${await callbackResp.text()}`
+		);
+	}
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "ONEDRIVE_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the native Dropbox OAuth flow programmatically.
+ *
+ * The Dropbox authorization URL is off-origin, so the helper extracts the
+ * signed state and calls the backend callback directly. The E2E backend fakes
+ * Dropbox's token and account HTTP responses inside that callback.
+ */
+export async function runNativeDropboxOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/dropbox/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`native Dropbox initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("native Dropbox initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`native Dropbox auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/dropbox/connector/callback?code=fake-dropbox-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	if (callbackResp.status() >= 400) {
+		throw new Error(
+			`native Dropbox callback failed (${callbackResp.status()}): ${await callbackResp.text()}`
+		);
+	}
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "DROPBOX_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the native Google Gmail OAuth flow programmatically.
+ *
+ * The E2E backend patches Google OAuth so the returned auth_url points
+ * straight back to the backend callback with a deterministic code/state.
+ */
+export async function runNativeGoogleGmailOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/google/gmail/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`native Google Gmail initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("native Google Gmail initiate response missing auth_url");
+	}
+
+	const callbackResp = await request.get(auth_url, {
+		headers: authHeaders(token),
+		maxRedirects: 0,
+		failOnStatusCode: false,
+	});
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "GOOGLE_GMAIL_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the native Google Calendar OAuth flow programmatically.
+ *
+ * The E2E backend patches Google OAuth so the returned auth_url points
+ * straight back to the backend callback with a deterministic code/state.
+ */
+export async function runNativeGoogleCalendarOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/google/calendar/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`native Google Calendar initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("native Google Calendar initiate response missing auth_url");
+	}
+
+	const callbackResp = await request.get(auth_url, {
+		headers: authHeaders(token),
+		maxRedirects: 0,
+		failOnStatusCode: false,
+	});
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector =
+		connectors.find((c) => c.connector_type === "GOOGLE_CALENDAR_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the Notion OAuth flow programmatically.
+ *
+ * The E2E backend keeps SurfSense's OAuth add/callback routes real and
+ * patches only Notion's external token endpoint. Notion's authorization
+ * URL stays off-origin, so this helper extracts the signed state and calls
+ * the backend callback directly with the deterministic fake code.
+ */
+export async function runNotionOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/notion/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`Notion initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("Notion initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`Notion auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/notion/connector/callback?code=fake-notion-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "NOTION_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the Confluence OAuth flow programmatically.
+ *
+ * The E2E backend keeps SurfSense's OAuth add/callback routes real and
+ * patches only Atlassian's external token/resource endpoints.
+ */
+export async function runConfluenceOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/confluence/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`Confluence initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("Confluence initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`Confluence auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/confluence/connector/callback?code=fake-confluence-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "CONFLUENCE_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the Linear MCP OAuth flow programmatically.
+ *
+ * The E2E backend keeps SurfSense's generic MCP OAuth routes real and
+ * patches Linear's external discovery/DCR/token/MCP tool boundaries.
+ */
+export async function runLinearOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/linear/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`Linear MCP initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("Linear MCP initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`Linear MCP auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/linear/connector/callback?code=fake-linear-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "LINEAR_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the Jira MCP OAuth flow programmatically.
+ *
+ * The E2E backend keeps SurfSense's generic MCP OAuth routes real and
+ * patches Jira's external discovery/DCR/token/MCP tool boundaries.
+ */
+export async function runJiraOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/jira/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`Jira MCP initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("Jira MCP initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`Jira MCP auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/jira/connector/callback?code=fake-jira-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "JIRA_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the ClickUp MCP OAuth flow programmatically.
+ *
+ * The E2E backend keeps SurfSense's generic MCP OAuth routes real and
+ * patches ClickUp's external discovery/DCR/token/MCP tool boundaries.
+ */
+export async function runClickupOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/clickup/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`ClickUp MCP initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("ClickUp MCP initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`ClickUp MCP auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/clickup/connector/callback?code=fake-clickup-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "CLICKUP_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
+
+/**
+ * Drives the Slack MCP OAuth flow programmatically.
+ *
+ * The E2E backend keeps SurfSense's generic MCP OAuth routes real and
+ * patches Slack's external discovery/token/MCP tool boundaries.
+ */
+export async function runSlackOAuth(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number
+): Promise<{
+	authUrl: string;
+	finalUrl: string;
+	connector: ConnectorRow | null;
+}> {
+	const initiateResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/slack/connector/add?space_id=${searchSpaceId}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!initiateResp.ok()) {
+		throw new Error(
+			`Slack MCP initiate failed (${initiateResp.status()}): ${await initiateResp.text()}`
+		);
+	}
+	const { auth_url } = (await initiateResp.json()) as { auth_url: string };
+	if (!auth_url) {
+		throw new Error("Slack MCP initiate response missing auth_url");
+	}
+
+	const state = new URL(auth_url).searchParams.get("state");
+	if (!state) {
+		throw new Error(`Slack MCP auth_url missing state: ${auth_url}`);
+	}
+
+	const callbackResp = await request.get(
+		`${BACKEND_URL}/api/v1/auth/mcp/slack/connector/callback?code=fake-slack-oauth-code&state=${encodeURIComponent(state)}`,
+		{
+			headers: authHeaders(token),
+			maxRedirects: 0,
+			failOnStatusCode: false,
+		}
+	);
+	if (callbackResp.status() >= 400) {
+		throw new Error(
+			`Slack MCP callback failed (${callbackResp.status()}): ${await callbackResp.text()}`
+		);
+	}
+	const location = callbackResp.headers().location ?? auth_url;
+
+	const connectors = await listConnectors(request, token, searchSpaceId);
+	const connector = connectors.find((c) => c.connector_type === "SLACK_CONNECTOR") ?? null;
+
+	return { authUrl: auth_url, finalUrl: location, connector };
+}
diff --git a/surfsense_web/tests/helpers/api/documents.ts b/surfsense_web/tests/helpers/api/documents.ts
new file mode 100644
index 0000000000..92055602b1
--- /dev/null
+++ b/surfsense_web/tests/helpers/api/documents.ts
@@ -0,0 +1,64 @@
+import type { APIRequestContext } from "@playwright/test";
+import { authHeaders, BACKEND_URL } from "./auth";
+
+export type DocumentRow = {
+	id: number;
+	title: string;
+	content: string;
+	document_type: string;
+	status: { state?: string } | string;
+};
+
+type Paginated<T> = {
+	items?: T[];
+	total?: number;
+};
+
+export async function listDocuments(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number,
+	limit = 100
+): Promise<DocumentRow[]> {
+	const response = await request.get(
+		`${BACKEND_URL}/api/v1/documents?search_space_id=${searchSpaceId}&limit=${limit}`,
+		{ headers: authHeaders(token) }
+	);
+	if (!response.ok()) {
+		throw new Error(`listDocuments failed (${response.status()}): ${await response.text()}`);
+	}
+	const body = (await response.json()) as Paginated<DocumentRow> | DocumentRow[];
+	return Array.isArray(body) ? body : (body.items ?? []);
+}
+
+export function isDocumentReady(doc: DocumentRow): boolean {
+	const state = typeof doc.status === "string" ? doc.status : doc.status?.state;
+	return state === "ready" || state === "READY";
+}
+
+export type EditorContent = {
+	document_id: number;
+	title: string;
+	document_type: string;
+	source_markdown: string;
+	content_size_bytes: number;
+	chunk_count: number;
+	truncated: boolean;
+};
+
+// Same endpoint the UI hits when a user opens a document in the dashboard.
+export async function getEditorContent(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number,
+	documentId: number
+): Promise<EditorContent> {
+	const response = await request.get(
+		`${BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
+		{ headers: authHeaders(token) }
+	);
+	if (!response.ok()) {
+		throw new Error(`getEditorContent failed (${response.status()}): ${await response.text()}`);
+	}
+	return (await response.json()) as EditorContent;
+}
diff --git a/surfsense_web/tests/helpers/api/search-spaces.ts b/surfsense_web/tests/helpers/api/search-spaces.ts
new file mode 100644
index 0000000000..06274790e7
--- /dev/null
+++ b/surfsense_web/tests/helpers/api/search-spaces.ts
@@ -0,0 +1,40 @@
+import type { APIRequestContext } from "@playwright/test";
+import { authHeaders, BACKEND_URL } from "./auth";
+
+export type SearchSpaceRow = {
+	id: number;
+	name: string;
+	description: string | null;
+};
+
+export async function createSearchSpace(
+	request: APIRequestContext,
+	token: string,
+	name: string,
+	description = "E2E test search space"
+): Promise<SearchSpaceRow> {
+	const response = await request.post(`${BACKEND_URL}/api/v1/searchspaces`, {
+		headers: authHeaders(token),
+		data: { name, description },
+	});
+	if (!response.ok()) {
+		throw new Error(`createSearchSpace failed (${response.status()}): ${await response.text()}`);
+	}
+	return (await response.json()) as SearchSpaceRow;
+}
+
+export async function deleteSearchSpace(
+	request: APIRequestContext,
+	token: string,
+	id: number
+): Promise<void> {
+	const response = await request.delete(`${BACKEND_URL}/api/v1/searchspaces/${id}`, {
+		headers: authHeaders(token),
+	});
+	if (!response.ok() && response.status() !== 404) {
+		// 404 is acceptable: the test may have already deleted the space.
+		throw new Error(
+			`deleteSearchSpace(${id}) failed (${response.status()}): ${await response.text()}`
+		);
+	}
+}
diff --git a/surfsense_web/tests/helpers/canary.ts b/surfsense_web/tests/helpers/canary.ts
new file mode 100644
index 0000000000..fe35ed191f
--- /dev/null
+++ b/surfsense_web/tests/helpers/canary.ts
@@ -0,0 +1,229 @@
+import { randomUUID } from "node:crypto";
+
+/**
+ * Canary tokens & deterministic test data.
+ *
+ * Embedded by the backend Composio fake into fake Drive file contents
+ * (see surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json).
+ * Specs assert these strings appear in the resulting Document rows to
+ * prove the indexing pipeline ran end-to-end.
+ *
+ * Each token is a stable string keyed by file id so multi-test runs
+ * remain deterministic and the resulting Document.content is greppable
+ * in failure traces.
+ */
+export const CANARY_TOKENS = {
+	driveCanaryFile: "SURFSENSE_E2E_CANARY_TOKEN_DRIVE_001",
+	drivePdfCanary: "SURFSENSE_E2E_CANARY_TOKEN_DRIVE_PDF_001",
+	composioDrivePdfCanary: "SURFSENSE_E2E_CANARY_TOKEN_COMPOSIO_DRIVE_PDF_001",
+	driveReadme: "SURFSENSE_E2E_README_MARKER",
+	driveBudget: "SURFSENSE_E2E_BUDGET_MARKER",
+	driveRoadmap: "SURFSENSE_E2E_ROADMAP_MARKER",
+	driveArchive: "SURFSENSE_E2E_ARCHIVE_MARKER",
+	gmailCanary: "SURFSENSE_E2E_CANARY_TOKEN_GMAIL_001",
+	calendarCanary: "SURFSENSE_E2E_CANARY_TOKEN_CALENDAR_001",
+	onedriveCanary: "SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_001",
+	onedrivePdfCanary: "SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_PDF_001",
+	dropboxCanary: "SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_001",
+	dropboxPdfCanary: "SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_PDF_001",
+	notionCanary: "SURFSENSE_E2E_CANARY_TOKEN_NOTION_001",
+	confluenceCanary: "SURFSENSE_E2E_CANARY_TOKEN_CONFLUENCE_001",
+	linearCanary: "SURFSENSE_E2E_CANARY_TOKEN_LINEAR_001",
+	jiraCanary: "SURFSENSE_E2E_CANARY_TOKEN_JIRA_001",
+	slackCanary: "SURFSENSE_E2E_CANARY_TOKEN_SLACK_001",
+	clickupCanary: "SURFSENSE_E2E_CANARY_TOKEN_CLICKUP_001",
+	manualUploadMdCanary: "E2E-MANUAL-UPLOAD-MD-CANARY-7f3a",
+	manualUploadPdfCanary: "E2E-MANUAL-UPLOAD-PDF-CANARY-9d2b",
+} as const;
+
+/**
+ * Fake Drive file IDs that match what the backend fake returns from
+ * GOOGLEDRIVE_LIST_FILES. Keep in sync with drive_files.json.
+ */
+export const FAKE_DRIVE_FILES = {
+	canary: { id: "fake-file-canary", name: "e2e-canary.txt", mimeType: "text/plain" },
+	pdfNative: {
+		id: "fake-file-pdf-native",
+		name: "e2e-canary.pdf",
+		mimeType: "application/pdf",
+	},
+	pdfComposio: {
+		id: "fake-file-pdf-composio",
+		name: "e2e-composio-canary.pdf",
+		mimeType: "application/pdf",
+	},
+	readme: { id: "fake-file-readme", name: "README.md", mimeType: "text/markdown" },
+	budget: { id: "fake-file-budget", name: "Q1-Budget.csv", mimeType: "text/csv" },
+} as const;
+
+export const FAKE_DRIVE_FOLDERS = {
+	projects: {
+		id: "fake-folder-projects",
+		name: "Projects",
+		mimeType: "application/vnd.google-apps.folder",
+	},
+	archive: {
+		id: "fake-folder-archive",
+		name: "Archive",
+		mimeType: "application/vnd.google-apps.folder",
+	},
+} as const;
+
+/**
+ * Fake OneDrive file IDs that match the Microsoft Graph-shaped backend
+ * fake in onedrive_files.json.
+ */
+export const FAKE_ONEDRIVE_FILES = {
+	canary: {
+		id: "fake-onedrive-canary",
+		name: "e2e-onedrive-canary.txt",
+		mimeType: "text/plain",
+	},
+	pdf: {
+		id: "fake-onedrive-pdf-canary",
+		name: "e2e-onedrive-canary.pdf",
+		mimeType: "application/pdf",
+	},
+} as const;
+
+/**
+ * Fake Dropbox file paths that match the Dropbox-shaped backend fake in
+ * dropbox_files.json. Dropbox selected-file indexing accepts path_lower via id.
+ */
+export const FAKE_DROPBOX_FILES = {
+	canary: {
+		id: "/e2e-dropbox-canary.txt",
+		name: "e2e-dropbox-canary.txt",
+		mimeType: "text/plain",
+	},
+	pdf: {
+		id: "/e2e-dropbox-canary.pdf",
+		name: "e2e-dropbox-canary.pdf",
+		mimeType: "application/pdf",
+	},
+} as const;
+
+/**
+ * Fake Gmail message IDs that match what the backend fake returns from
+ * GMAIL_FETCH_EMAILS / GMAIL_FETCH_MESSAGE_BY_MESSAGE_ID.
+ */
+export const FAKE_GMAIL_MESSAGES = {
+	canary: {
+		id: "fake-msg-canary-001",
+		threadId: "fake-thread-canary-001",
+		subject: "E2E Canary Email",
+		from: "sender@surfsense.example",
+		to: "e2e-fake@surfsense.example",
+	},
+	planning: {
+		id: "fake-msg-planning-001",
+		threadId: "fake-thread-planning-001",
+		subject: "E2E Planning Notes",
+		from: "planner@surfsense.example",
+		to: "e2e-fake@surfsense.example",
+	},
+} as const;
+
+/**
+ * Fake Calendar event IDs that match what the backend fake returns from
+ * GOOGLECALENDAR_EVENTS_LIST.
+ */
+export const FAKE_CALENDAR_EVENTS = {
+	canary: {
+		id: "fake-calendar-event-canary-001",
+		summary: "E2E Canary Calendar Event",
+		location: "SurfSense E2E Room",
+	},
+	planning: {
+		id: "fake-calendar-event-planning-001",
+		summary: "E2E Planning Sync",
+		location: "SurfSense Planning Room",
+	},
+} as const;
+
+/**
+ * Fake Notion page IDs that match what the backend fake returns from
+ * notion_client.AsyncClient.search.
+ */
+export const FAKE_NOTION_PAGES = {
+	canary: {
+		id: "fake-notion-page-canary-001",
+		title: "E2E Canary Notion Page",
+		workspaceId: "fake-notion-workspace-001",
+		workspaceName: "SurfSense E2E Notion Workspace",
+		botId: "fake-notion-bot-001",
+	},
+} as const;
+
+/**
+ * Fake Confluence page IDs that match what the backend fake returns from
+ * ConfluenceHistoryConnector.get_pages_by_date_range.
+ */
+export const FAKE_CONFLUENCE_PAGES = {
+	canary: {
+		id: "fake-confluence-page-canary-001",
+		title: "E2E Canary Confluence Page",
+		spaceId: "fake-confluence-space-001",
+		cloudId: "fake-confluence-cloud-001",
+		baseUrl: "https://surfsense-e2e-confluence.atlassian.net",
+	},
+} as const;
+
+/**
+ * Fake Linear issue IDs that match what the backend MCP fake returns from
+ * list_issues / get_issue.
+ */
+export const FAKE_LINEAR_ISSUES = {
+	canary: {
+		id: "fake-linear-issue-canary-001",
+		identifier: "E2E-101",
+		title: "E2E Canary Linear Issue",
+		organizationName: "SurfSense E2E Linear Org",
+		organizationUrlKey: "surfsense-e2e",
+	},
+} as const;
+
+/**
+ * Fake Jira issue IDs that match what the backend MCP fake returns from
+ * searchJiraIssuesUsingJql.
+ */
+export const FAKE_JIRA_ISSUES = {
+	canary: {
+		id: "fake-jira-issue-canary-001",
+		key: "E2E-101",
+		summary: "E2E Canary Jira Issue",
+		cloudId: "fake-jira-cloud-001",
+		siteUrl: "https://surfsense-e2e.atlassian.net",
+	},
+} as const;
+
+/**
+ * Fake Slack channel IDs that match what the backend MCP fake returns from
+ * slack_search_channels.
+ */
+export const FAKE_SLACK_CHANNELS = {
+	canary: {
+		id: "C_FAKE_SLACK_CANARY",
+		name: "slack-e2e-canary",
+		teamId: "T_FAKE_SLACK_TEAM",
+		teamName: "SurfSense E2E Slack Workspace",
+	},
+} as const;
+
+/**
+ * Fake ClickUp task IDs that match what the backend MCP fake returns from
+ * clickup_search / clickup_get_task.
+ */
+export const FAKE_CLICKUP_TASKS = {
+	canary: {
+		id: "fake-clickup-task-canary-001",
+		name: "E2E Canary ClickUp Task",
+		workspaceId: "fake-clickup-workspace-001",
+		workspaceName: "SurfSense E2E ClickUp Workspace",
+	},
+} as const;
+
+/** Generate a unique-per-run search space name. Keeps parallel tests isolated. */
+export function uniqueSearchSpaceName(prefix = "e2e"): string {
+	return `${prefix}-${randomUUID().slice(0, 8)}`;
+}
diff --git a/surfsense_web/tests/helpers/mocks/composio-oauth.ts b/surfsense_web/tests/helpers/mocks/composio-oauth.ts
new file mode 100644
index 0000000000..6924a47150
--- /dev/null
+++ b/surfsense_web/tests/helpers/mocks/composio-oauth.ts
@@ -0,0 +1,25 @@
+import type { Page } from "@playwright/test";
+
+/**
+ * Frontend route mock for the Composio OAuth redirect.
+ *
+ * In normal E2E runs we DON'T need this: the backend Composio fake
+ * returns a same-origin auth_url that lands directly on our callback,
+ * so the browser never navigates to composio.dev.
+ *
+ * Reserved here for future negative tests that intentionally exercise
+ * a tampered/external auth_url (e.g. validating that the frontend
+ * doesn't blindly follow off-origin redirects).
+ */
+export async function mockComposioOAuthRedirect(
+	page: Page,
+	options: { rewriteTo: string }
+): Promise<void> {
+	await page.route(/composio\.dev/, async (route) => {
+		await route.fulfill({
+			status: 302,
+			headers: { Location: options.rewriteTo },
+			body: "",
+		});
+	});
+}
diff --git a/surfsense_web/tests/helpers/ui/composio-drive-config.ts b/surfsense_web/tests/helpers/ui/composio-drive-config.ts
new file mode 100644
index 0000000000..6fc404f301
--- /dev/null
+++ b/surfsense_web/tests/helpers/ui/composio-drive-config.ts
@@ -0,0 +1,14 @@
+import type { Page } from "@playwright/test";
+
+/**
+ * Placeholder for the Composio Drive configuration view (folder
+ * selector + indexing options) rendered by ConnectorEditView.
+ *
+ * Phase 1 specs drive folder/file selection through the API helper
+ * (`updateConnectorConfig`) and `triggerIndex` for determinism. UI-
+ * level interaction with the folder tree is a Phase 2 task; this
+ * module is reserved for those selectors.
+ */
+export async function reservedForPhaseTwo(_page: Page): Promise<void> {
+	// Intentionally empty. See README in tests/connectors/composio/drive/.
+}
diff --git a/surfsense_web/tests/helpers/ui/connector-popup.ts b/surfsense_web/tests/helpers/ui/connector-popup.ts
new file mode 100644
index 0000000000..19e2710645
--- /dev/null
+++ b/surfsense_web/tests/helpers/ui/connector-popup.ts
@@ -0,0 +1,34 @@
+import type { Page } from "@playwright/test";
+import { expect } from "@playwright/test";
+
+/**
+ * Page-object-style helpers for the connector dialog rendered by
+ * components/assistant-ui/connector-popup.tsx.
+ *
+ * Kept minimal in Phase 1: most spec interactions go through API
+ * fixtures for determinism. UI-driven coverage of every connector card
+ * is a Phase 2 task and will use this helper as the entry point.
+ */
+
+export async function openConnectorPopup(page: Page): Promise<void> {
+	// Label depends on whether the user already has connectors.
+	const trigger = page
+		.getByRole("button", { name: "Manage connectors" })
+		.or(page.getByRole("button", { name: "Connect your connectors" }))
+		.first();
+
+	// Long timeout absorbs Next.js dev cold-compile of the new-chat route.
+	await expect(trigger).toBeVisible({ timeout: 60_000 });
+	await trigger.click();
+
+	await expect(page.getByRole("dialog", { name: "Manage Connectors" })).toBeVisible();
+}
+
+export async function clickComposioDriveCard(page: Page): Promise<void> {
+	const composioDriveCard = page.getByText("Search your Drive files via Composio");
+	await composioDriveCard.scrollIntoViewIfNeeded();
+	const card = composioDriveCard
+		.locator("xpath=ancestor::*[self::article or self::div][1]")
+		.first();
+	await card.getByRole("button", { name: "Connect" }).click();
+}
diff --git a/surfsense_web/tests/helpers/ui/connector-status.ts b/surfsense_web/tests/helpers/ui/connector-status.ts
new file mode 100644
index 0000000000..49c617dad7
--- /dev/null
+++ b/surfsense_web/tests/helpers/ui/connector-status.ts
@@ -0,0 +1,10 @@
+import type { Page } from "@playwright/test";
+
+/**
+ * Selectors for connector status indicators (last_indexed_at badge,
+ * indexing spinner, auth-expired banner). Reserved for Phase 2 UI-level
+ * assertions; Phase 1 specs assert these via the API.
+ */
+export async function reservedForPhaseTwo(_page: Page): Promise<void> {
+	// Intentionally empty.
+}
diff --git a/surfsense_web/tests/helpers/ui/dashboard.ts b/surfsense_web/tests/helpers/ui/dashboard.ts
new file mode 100644
index 0000000000..df4c678234
--- /dev/null
+++ b/surfsense_web/tests/helpers/ui/dashboard.ts
@@ -0,0 +1,21 @@
+import type { Page } from "@playwright/test";
+import { expect } from "@playwright/test";
+
+/**
+ * Navigation helpers for dashboard routes. Centralized so that future
+ * route changes only require an update in one place.
+ */
+
+export function newChatUrl(searchSpaceId: number): string {
+	return `/dashboard/${searchSpaceId}/new-chat`;
+}
+
+export function connectorsCallbackUrl(searchSpaceId: number): string {
+	return `/dashboard/${searchSpaceId}/connectors/callback`;
+}
+
+export async function gotoNewChat(page: Page, searchSpaceId: number): Promise<void> {
+	const target = newChatUrl(searchSpaceId);
+	await page.goto(target, { waitUntil: "domcontentloaded" });
+	await expect(page).toHaveURL((url) => url.pathname === target);
+}
diff --git a/surfsense_web/tests/helpers/waits/indexing.ts b/surfsense_web/tests/helpers/waits/indexing.ts
new file mode 100644
index 0000000000..d60ab38e0e
--- /dev/null
+++ b/surfsense_web/tests/helpers/waits/indexing.ts
@@ -0,0 +1,75 @@
+import type { APIRequestContext } from "@playwright/test";
+import { getConnector } from "../api/connectors";
+import { isDocumentReady, listDocuments } from "../api/documents";
+
+/**
+ * Polls the backend until a connector finishes indexing OR the deadline
+ * passes. Replaces `waitForTimeout` (which is a Playwright anti-pattern)
+ * with deterministic polling on real signals.
+ */
+export async function waitForIndexingComplete(
+	request: APIRequestContext,
+	token: string,
+	connectorId: number,
+	searchSpaceId: number,
+	options: { timeoutMs?: number; intervalMs?: number; minDocuments?: number } = {}
+): Promise<void> {
+	const timeoutMs = options.timeoutMs ?? 60_000;
+	const intervalMs = options.intervalMs ?? 1_000;
+	const minDocuments = options.minDocuments ?? 1;
+
+	const startedAt = Date.now();
+	let lastState = "unknown";
+
+	while (Date.now() - startedAt < timeoutMs) {
+		const connector = await getConnector(request, token, connectorId);
+		const docs = await listDocuments(request, token, searchSpaceId);
+		const readyDocs = docs.filter(isDocumentReady);
+
+		const connectorIndexed = connector.last_indexed_at !== null;
+		const enoughReady = readyDocs.length >= minDocuments;
+
+		if (connectorIndexed && enoughReady) {
+			return;
+		}
+
+		lastState = `last_indexed_at=${connector.last_indexed_at} ready_docs=${readyDocs.length}/${minDocuments}`;
+		await new Promise((resolve) => setTimeout(resolve, intervalMs));
+	}
+
+	throw new Error(
+		`waitForIndexingComplete: timed out after ${timeoutMs}ms waiting for ` +
+			`connector ${connectorId} in space ${searchSpaceId}. Last observed: ${lastState}`
+	);
+}
+
+/**
+ * Polls until the given document title appears in the search space with
+ * status=ready. Useful when a spec wants to assert on a specific file
+ * by name rather than count.
+ */
+export async function waitForDocumentByTitle(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number,
+	title: string,
+	options: { timeoutMs?: number; intervalMs?: number } = {}
+): Promise<void> {
+	const timeoutMs = options.timeoutMs ?? 60_000;
+	const intervalMs = options.intervalMs ?? 1_000;
+	const startedAt = Date.now();
+
+	while (Date.now() - startedAt < timeoutMs) {
+		const docs = await listDocuments(request, token, searchSpaceId);
+		const match = docs.find((d) => d.title === title && isDocumentReady(d));
+		if (match) {
+			return;
+		}
+		await new Promise((resolve) => setTimeout(resolve, intervalMs));
+	}
+
+	throw new Error(
+		`waitForDocumentByTitle: timed out after ${timeoutMs}ms waiting for ` +
+			`title=${JSON.stringify(title)} in space ${searchSpaceId}.`
+	);
+}
diff --git a/surfsense_web/tests/smoke/chat-stream.spec.ts b/surfsense_web/tests/smoke/chat-stream.spec.ts
new file mode 100644
index 0000000000..20fe67f430
--- /dev/null
+++ b/surfsense_web/tests/smoke/chat-stream.spec.ts
@@ -0,0 +1,32 @@
+import { expect, test } from "../fixtures";
+import { authHeaders, BACKEND_URL } from "../helpers/api/auth";
+import { streamChatToCompletion } from "../helpers/api/chat";
+
+test.describe("Smoke", () => {
+	test("chat stream completes for an unrelated query", async ({
+		request,
+		apiToken,
+		searchSpace,
+	}) => {
+		const threadResponse = await request.post(`${BACKEND_URL}/api/v1/threads`, {
+			headers: authHeaders(apiToken),
+			data: {
+				title: "e2e-chat-stream-smoke",
+				search_space_id: searchSpace.id,
+				visibility: "PRIVATE",
+			},
+		});
+		expect(threadResponse.ok()).toBeTruthy();
+
+		const thread = (await threadResponse.json()) as { id: number };
+		const chat = await streamChatToCompletion(request, apiToken, {
+			searchSpaceId: searchSpace.id,
+			threadId: thread.id,
+			query: "E2E_NO_RELEVANT_CONTENT_SMOKE",
+		});
+
+		expect(chat.events.some((event) => event.type === "done")).toBeTruthy();
+		expect(chat.events.some((event) => event.type === "text-delta")).toBeTruthy();
+		expect(chat.assistantText).toContain("No relevant indexed content found.");
+	});
+});
diff --git a/surfsense_web/tests/smoke/dashboard.spec.ts b/surfsense_web/tests/smoke/dashboard.spec.ts
new file mode 100644
index 0000000000..a6249ddfad
--- /dev/null
+++ b/surfsense_web/tests/smoke/dashboard.spec.ts
@@ -0,0 +1,21 @@
+import { expect, test } from "@playwright/test";
+
+/**
+ * Tracer-bullet smoke test: proves the entire E2E pipeline is wired up.
+ *
+ * Verifies:
+ *   - Web server is reachable
+ *   - tests/auth.setup.ts ran and stored a valid bearer token
+ *   - Dashboard route renders for an authenticated user
+ *
+ * Keep minimal. Connector- or feature-specific behaviour belongs under
+ * tests/connectors/ or tests/<feature>/.
+ */
+test.describe("Smoke", () => {
+	test("dashboard loads for authenticated user", async ({ page }) => {
+		await page.goto("/dashboard");
+
+		// Sidebar is <aside> (role=complementary); its visibility implies redirect + auth fetch.
+		await expect(page.getByRole("complementary").first()).toBeVisible({ timeout: 60_000 });
+	});
+});
diff --git a/surfsense_web/types/fuzzy-search.d.ts b/surfsense_web/types/fuzzy-search.d.ts
new file mode 100644
index 0000000000..425a2c237d
--- /dev/null
+++ b/surfsense_web/types/fuzzy-search.d.ts
@@ -0,0 +1,13 @@
+declare module "fuzzy-search" {
+	interface FuzzySearchOptions {
+		caseSensitive?: boolean;
+		sort?: boolean;
+	}
+
+	class FuzzySearch<T> {
+		constructor(haystack: T[], keys?: string[], options?: FuzzySearchOptions);
+		search(needle: string): T[];
+	}
+
+	export default FuzzySearch;
+}
diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts
index 004aefcd5f..f25d43f5e6 100644
--- a/surfsense_web/types/window.d.ts
+++ b/surfsense_web/types/window.d.ts
@@ -41,6 +41,48 @@ interface FolderFileEntry {
 	mtimeMs: number;
 }
 
+type AgentFilesystemMode = "cloud" | "desktop_local_folder";
+
+interface AgentFilesystemSettings {
+	mode: AgentFilesystemMode;
+	localRootPaths: string[];
+	updatedAt: string;
+}
+
+interface AgentFilesystemMount {
+	mount: string;
+	rootPath: string;
+}
+
+interface AgentFilesystemListOptions {
+	rootPath: string;
+	searchSpaceId?: number | null;
+	excludePatterns?: string[] | null;
+	fileExtensions?: string[] | null;
+}
+
+interface AgentFilesystemTreeWatchOptions {
+	searchSpaceId?: number | null;
+	rootPaths: string[];
+	excludePatterns?: string[] | null;
+	fileExtensions?: string[] | null;
+}
+
+interface AgentFilesystemTreeDirtyEvent {
+	searchSpaceId: number | null;
+	reason: "watcher_event" | "safety_poll";
+	rootPath: string;
+	changedPath: string | null;
+	timestamp: number;
+}
+
+interface LocalTextFileResult {
+	ok: boolean;
+	path: string;
+	content?: string;
+	error?: string;
+}
+
 interface ElectronAPI {
 	versions: {
 		electron: string;
@@ -51,6 +93,7 @@ interface ElectronAPI {
 	openExternal: (url: string) => void;
 	getAppVersion: () => Promise<string>;
 	onDeepLink: (callback: (url: string) => void) => () => void;
+	onChatScreenCapture: (callback: (dataUrl: string) => void) => () => void;
 	getQuickAskText: () => Promise<string>;
 	setQuickAskMode: (mode: string) => Promise<void>;
 	getQuickAskMode: () => Promise<string>;
@@ -62,20 +105,8 @@ interface ElectronAPI {
 	}>;
 	requestAccessibility: () => Promise<void>;
 	requestScreenRecording: () => Promise<void>;
+	captureFullScreen: () => Promise<string | null>;
 	restartApp: () => Promise<void>;
-	// Autocomplete
-	onAutocompleteContext: (
-		callback: (data: {
-			screenshot: string;
-			searchSpaceId?: string;
-			appName?: string;
-			windowTitle?: string;
-		}) => void
-	) => () => void;
-	acceptSuggestion: (text: string) => Promise<void>;
-	dismissSuggestion: () => Promise<void>;
-	setAutocompleteEnabled: (enabled: boolean) => Promise<void>;
-	getAutocompleteEnabled: () => Promise<boolean>;
 	// Folder sync
 	selectFolder: () => Promise<string | null>;
 	addWatchedFolder: (config: WatchedFolderConfig) => Promise<WatchedFolderConfig[]>;
@@ -94,17 +125,73 @@ interface ElectronAPI {
 	// Browse files/folders via native dialogs
 	browseFiles: () => Promise<string[] | null>;
 	readLocalFiles: (paths: string[]) => Promise<LocalFileData[]>;
+	readAgentLocalFileText: (
+		virtualPath: string,
+		searchSpaceId?: number | null
+	) => Promise<LocalTextFileResult>;
+	writeAgentLocalFileText: (
+		virtualPath: string,
+		content: string,
+		searchSpaceId?: number | null
+	) => Promise<LocalTextFileResult>;
 	// Auth token sync across windows
 	getAuthTokens: () => Promise<{ bearer: string; refresh: string } | null>;
 	setAuthTokens: (bearer: string, refresh: string) => Promise<void>;
 	// Keyboard shortcut configuration
-	getShortcuts: () => Promise<{ generalAssist: string; quickAsk: string; autocomplete: string }>;
+	getShortcuts: () => Promise<{
+		generalAssist: string;
+		quickAsk: string;
+		screenshotAssist: string;
+	}>;
 	setShortcuts: (
-		config: Partial<{ generalAssist: string; quickAsk: string; autocomplete: string }>
-	) => Promise<{ generalAssist: string; quickAsk: string; autocomplete: string }>;
+		config: Partial<{ generalAssist: string; quickAsk: string; screenshotAssist: string }>
+	) => Promise<{
+		generalAssist: string;
+		quickAsk: string;
+		screenshotAssist: string;
+	}>;
+	// Launch on system startup
+	getAutoLaunch: () => Promise<{
+		enabled: boolean;
+		openAsHidden: boolean;
+		supported: boolean;
+	}>;
+	setAutoLaunch: (
+		enabled: boolean,
+		openAsHidden?: boolean
+	) => Promise<{ enabled: boolean; openAsHidden: boolean; supported: boolean }>;
 	// Active search space
 	getActiveSearchSpace: () => Promise<string | null>;
 	setActiveSearchSpace: (id: string) => Promise<void>;
+	// Analytics bridge (PostHog mirror into the Electron main process)
+	analyticsIdentify: (userId: string, properties?: Record<string, unknown>) => Promise<void>;
+	analyticsReset: () => Promise<void>;
+	analyticsCapture: (event: string, properties?: Record<string, unknown>) => Promise<void>;
+	getAnalyticsContext: () => Promise<{
+		distinctId: string;
+		machineId: string;
+		appVersion: string;
+		platform: string;
+	}>;
+	// Agent filesystem mode
+	getAgentFilesystemSettings: (searchSpaceId?: number | null) => Promise<AgentFilesystemSettings>;
+	getAgentFilesystemMounts: (searchSpaceId?: number | null) => Promise<AgentFilesystemMount[]>;
+	listAgentFilesystemFiles: (options: AgentFilesystemListOptions) => Promise<FolderFileEntry[]>;
+	startAgentFilesystemTreeWatch: (
+		options: AgentFilesystemTreeWatchOptions
+	) => Promise<{ ok: true }>;
+	stopAgentFilesystemTreeWatch: (searchSpaceId?: number | null) => Promise<{ ok: true }>;
+	onAgentFilesystemTreeDirty: (
+		callback: (data: AgentFilesystemTreeDirtyEvent) => void
+	) => () => void;
+	setAgentFilesystemSettings: (
+		settings: {
+			mode?: AgentFilesystemMode;
+			localRootPaths?: string[] | null;
+		},
+		searchSpaceId?: number | null
+	) => Promise<AgentFilesystemSettings>;
+	pickAgentFilesystemRoot: () => Promise<string | null>;
 }
 
 declare global {
diff --git a/surfsense_web/zero/queries/index.ts b/surfsense_web/zero/queries/index.ts
index bc332114e7..fbf1bd76e3 100644
--- a/surfsense_web/zero/queries/index.ts
+++ b/surfsense_web/zero/queries/index.ts
@@ -3,6 +3,7 @@ import { chatSessionQueries, commentQueries, messageQueries } from "./chat";
 import { connectorQueries, documentQueries } from "./documents";
 import { folderQueries } from "./folders";
 import { notificationQueries } from "./inbox";
+import { userQueries } from "./user";
 
 export const queries = defineQueries({
 	notifications: notificationQueries,
@@ -12,4 +13,5 @@ export const queries = defineQueries({
 	messages: messageQueries,
 	comments: commentQueries,
 	chatSession: chatSessionQueries,
+	user: userQueries,
 });
diff --git a/surfsense_web/zero/queries/user.ts b/surfsense_web/zero/queries/user.ts
new file mode 100644
index 0000000000..30e71a482b
--- /dev/null
+++ b/surfsense_web/zero/queries/user.ts
@@ -0,0 +1,11 @@
+import { defineQuery } from "@rocicorp/zero";
+import { z } from "zod";
+import { zql } from "../schema/index";
+
+export const userQueries = {
+	me: defineQuery(z.object({}), ({ ctx }) => {
+		const userId = ctx?.userId;
+		if (!userId) return zql.user.where("id", "__none__").one();
+		return zql.user.where("id", userId).one();
+	}),
+};
diff --git a/surfsense_web/zero/schema/chat.ts b/surfsense_web/zero/schema/chat.ts
index 0293059fd7..fb3d7651e6 100644
--- a/surfsense_web/zero/schema/chat.ts
+++ b/surfsense_web/zero/schema/chat.ts
@@ -8,6 +8,13 @@ export const newChatMessageTable = table("new_chat_messages")
 		threadId: number().from("thread_id"),
 		authorId: string().optional().from("author_id"),
 		createdAt: number().from("created_at"),
+		// Per-turn correlation id sourced from ``configurable.turn_id``
+		// at streaming time. Required by the inline Revert button's
+		// (chat_turn_id, tool_name, position) fallback in tool-fallback.tsx
+		// — without it the live-collab Zero sync would clobber the
+		// metadata we set during streaming and the button would vanish
+		// the moment Zero re-syncs after the stream finishes.
+		turnId: string().optional().from("turn_id"),
 	})
 	.primaryKey("id");
 
diff --git a/surfsense_web/zero/schema/index.ts b/surfsense_web/zero/schema/index.ts
index bba561580a..3cca0f24a2 100644
--- a/surfsense_web/zero/schema/index.ts
+++ b/surfsense_web/zero/schema/index.ts
@@ -3,6 +3,7 @@ import { chatCommentTable, chatSessionStateTable, newChatMessageTable } from "./
 import { documentTable, searchSourceConnectorTable } from "./documents";
 import { folderTable } from "./folders";
 import { notificationTable } from "./inbox";
+import { userTable } from "./user";
 
 const chatCommentRelationships = relationships(chatCommentTable, ({ one }) => ({
 	message: one({
@@ -34,6 +35,7 @@ export const schema = createSchema({
 		newChatMessageTable,
 		chatCommentTable,
 		chatSessionStateTable,
+		userTable,
 	],
 	relationships: [chatCommentRelationships, newChatMessageRelationships],
 });
diff --git a/surfsense_web/zero/schema/user.ts b/surfsense_web/zero/schema/user.ts
new file mode 100644
index 0000000000..f483fa9b43
--- /dev/null
+++ b/surfsense_web/zero/schema/user.ts
@@ -0,0 +1,20 @@
+import { number, string, table } from "@rocicorp/zero";
+
+/**
+ * Live-meter slice of the ``user`` table replicated through Zero.
+ *
+ * ``premiumCreditMicrosLimit`` / ``premiumCreditMicrosUsed`` are stored
+ * as integer micro-USD (1_000_000 == $1.00). UI consumers divide by 1M
+ * when displaying. Sensitive fields (email, hashed_password, oauth, etc.)
+ * are intentionally omitted via the Postgres column-list publication so
+ * they never enter WAL replication.
+ */
+export const userTable = table("user")
+	.columns({
+		id: string(),
+		pagesLimit: number().from("pages_limit"),
+		pagesUsed: number().from("pages_used"),
+		premiumCreditMicrosLimit: number().from("premium_credit_micros_limit"),
+		premiumCreditMicrosUsed: number().from("premium_credit_micros_used"),
+	})
+	.primaryKey("id");
diff --git a/versions.json b/versions.json
new file mode 100644
index 0000000000..9a3c3429d1
--- /dev/null
+++ b/versions.json
@@ -0,0 +1,3 @@
+{
+	"0.1.0": "1.5.4"
+}